From 918b03b3b01412bdd1b0f1a5afb90a3c0522a96b Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 26 Jan 2024 09:25:07 +0100 Subject: [PATCH 0001/1571] integrate tokio-epoll-uring as alternative VirtualFile IO engine (#5824) --- .github/workflows/build_and_test.yml | 23 +- Cargo.lock | 43 +- Cargo.toml | 1 + pageserver/Cargo.toml | 1 + pageserver/ctl/src/layer_map_analyzer.rs | 4 +- pageserver/ctl/src/layers.rs | 4 +- pageserver/ctl/src/main.rs | 2 +- pageserver/src/bin/pageserver.rs | 2 +- pageserver/src/config.rs | 26 ++ pageserver/src/lib.rs | 1 + pageserver/src/metrics.rs | 15 + pageserver/src/tenant/block_io.rs | 22 +- pageserver/src/tenant/ephemeral_file.rs | 18 +- .../src/tenant/storage_layer/delta_layer.rs | 4 +- .../src/tenant/storage_layer/image_layer.rs | 18 +- pageserver/src/virtual_file.rs | 428 +++++++++++++++--- pageserver/src/virtual_file/io_engine.rs | 114 +++++ pageserver/src/virtual_file/open_options.rs | 138 ++++++ scripts/flaky_tests.py | 14 +- test_runner/fixtures/neon_fixtures.py | 12 + test_runner/fixtures/parametrize.py | 22 +- 21 files changed, 794 insertions(+), 118 deletions(-) create mode 100644 pageserver/src/virtual_file/io_engine.rs create mode 100644 pageserver/src/virtual_file/open_options.rs diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 643d24696d..7445501f00 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -203,7 +203,11 @@ jobs: runs-on: [ self-hosted, gen3, large ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} - options: --init + # Raise locked memory limit for tokio-epoll-uring. + # On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12), + # io_uring will account the memory of the CQ and SQ as locked. 
+ # More details: https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391 + options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 strategy: fail-fast: false matrix: @@ -358,7 +362,9 @@ jobs: - name: Run rust tests run: | - ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES + for io_engine in std-fs tokio-epoll-uring ; do + NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES + done # Run separate tests for real S3 export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty @@ -436,8 +442,8 @@ jobs: runs-on: [ self-hosted, gen3, large ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} - # Default shared memory is 64mb - options: --init --shm-size=512mb + # for changed limits, see comments on `options:` earlier in this file + options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 strategy: fail-fast: false matrix: @@ -465,6 +471,7 @@ jobs: TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} CHECK_ONDISK_DATA_COMPATIBILITY: nonempty BUILD_TAG: ${{ needs.tag.outputs.build-tag }} + PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring - name: Merge and upload coverage data if: matrix.build_type == 'debug' && matrix.pg_version == 'v14' @@ -475,12 +482,13 @@ jobs: runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} - # Default shared memory is 64mb - options: --init --shm-size=512mb + # for changed limits, see comments on `options:` earlier in this file + options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') strategy: fail-fast: false matrix: + # the amount of groups (N) should be reflected in `extra_params: --splits N ...` pytest_split_group: [ 1, 2, 3, 4 ] build_type: [ release ] steps: @@ -494,11 +502,12 @@ jobs: test_selection: performance run_in_parallel: false save_perf_report: ${{ github.ref_name == 'main' }} - extra_params: --splits ${{ strategy.job-total }} --group ${{ matrix.pytest_split_group }} + extra_params: --splits 4 --group ${{ matrix.pytest_split_group }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}" + PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring # XXX: no coverage data handling here, since benchmarks are run on release builds, # while coverage is currently collected for the debug ones diff --git a/Cargo.lock b/Cargo.lock index f0e8b6a0ed..6e91363de8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2564,6 +2564,16 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "io-uring" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "460648e47a07a43110fbfa2e0b14afb2be920093c31e5dccc50e49568e099762" +dependencies = [ + "bitflags 1.3.2", + "libc", +] + [[package]] name = "ipnet" version = "2.9.0" @@ -3362,6 +3372,7 @@ dependencies = [ "tenant_size_model", "thiserror", "tokio", + "tokio-epoll-uring", "tokio-io-timeout", "tokio-postgres", "tokio-stream", @@ -5383,18 +5394,18 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.40" +version = "1.0.47" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "978c9a314bd8dc99be594bc3c175faaa9794be04a5a5e153caba6915336cebac" +checksum = "97a802ec30afc17eee47b2855fc72e0c4cd62be9b4efe6591edde0ec5bd68d8f" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.40" +version = "1.0.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f" +checksum = "6bb623b56e39ab7dcd4b1b98bb6c8f8d907ed255b18de254088016b27a8ee19b" dependencies = [ "proc-macro2", "quote", @@ -5518,6 +5529,21 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "tokio-epoll-uring" +version = "0.1.0" +source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#0dd3a2f8bf3239d34a19719ef1a74146c093126f" +dependencies = [ + "futures", + "once_cell", + "scopeguard", + "thiserror", + "tokio", + "tokio-util", + "tracing", + "uring-common", +] + [[package]] name = "tokio-io-timeout" version = "1.2.0" @@ -6027,6 +6053,15 @@ dependencies = [ "webpki-roots 0.23.1", ] +[[package]] +name = "uring-common" +version = "0.1.0" +source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#0dd3a2f8bf3239d34a19719ef1a74146c093126f" +dependencies = [ + "io-uring", + "libc", +] + [[package]] name = "url" version = "2.3.1" diff --git a/Cargo.toml b/Cargo.toml index eefd1cb114..8afab02b15 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -151,6 +151,7 @@ test-context = "0.1" thiserror = "1.0" tls-listener = { version = "0.7", features = ["rustls", "hyper-h1"] } tokio = { version = "1.17", features = ["macros"] } +tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" } tokio-io-timeout = "1.2.0" tokio-postgres-rustls = "0.10.0" tokio-rustls = "0.24" diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 980fbab22e..e44501d1ed 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -61,6 +61,7 @@ sync_wrapper.workspace = true tokio-tar.workspace = true thiserror.workspace = true tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] } +tokio-epoll-uring.workspace = true tokio-io-timeout.workspace = true tokio-postgres.workspace = true tokio-stream.workspace = true diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs index 15d4eb09e0..eb5c3f15cf 100644 --- a/pageserver/ctl/src/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -18,7 +18,7 @@ use pageserver::tenant::block_io::FileBlockReader; use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection}; use pageserver::tenant::storage_layer::delta_layer::{Summary, DELTA_KEY_SIZE}; use pageserver::tenant::storage_layer::range_overlaps; -use pageserver::virtual_file::VirtualFile; +use pageserver::virtual_file::{self, VirtualFile}; use utils::{bin_ser::BeSer, lsn::Lsn}; @@ -142,7 +142,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> { let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); // Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree. 
- pageserver::virtual_file::init(10); + pageserver::virtual_file::init(10, virtual_file::IoEngineKind::StdFs); pageserver::page_cache::init(100); let mut total_delta_layers = 0usize; diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index ebf4a4bec3..dbbcfedac0 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -59,7 +59,7 @@ pub(crate) enum LayerCmd { async fn read_delta_file(path: impl AsRef, ctx: &RequestContext) -> Result<()> { let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path"); - virtual_file::init(10); + virtual_file::init(10, virtual_file::IoEngineKind::StdFs); page_cache::init(100); let file = FileBlockReader::new(VirtualFile::open(path).await?); let summary_blk = file.read_blk(0, ctx).await?; @@ -187,7 +187,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { new_tenant_id, new_timeline_id, } => { - pageserver::virtual_file::init(10); + pageserver::virtual_file::init(10, virtual_file::IoEngineKind::StdFs); pageserver::page_cache::init(100); let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index fb42d6d2f1..3c90933fe9 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -123,7 +123,7 @@ fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> { async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> { // Basic initialization of things that don't change after startup - virtual_file::init(10); + virtual_file::init(10, virtual_file::IoEngineKind::StdFs); page_cache::init(100); let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); dump_layerfile_from_path(path, true, &ctx).await diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 15e3359c06..84de76e55e 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -130,7 +130,7 @@ fn main() -> anyhow::Result<()> { let scenario = failpoint_support::init(); // Basic initialization of things that don't change after startup - virtual_file::init(conf.max_file_descriptors); + virtual_file::init(conf.max_file_descriptors, conf.virtual_file_io_engine); page_cache::init(conf.page_cache_size); start_pageserver(launch_ts, conf).context("Failed to start pageserver")?; diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 52277d7f24..1989bef817 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -36,6 +36,7 @@ use crate::tenant::config::TenantConfOpt; use crate::tenant::{ TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME, }; +use crate::virtual_file; use crate::{ IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX, @@ -43,6 +44,8 @@ use crate::{ use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP; +use self::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE; + pub mod defaults { use crate::tenant::config::defaults::*; use const_format::formatcp; @@ -79,6 +82,8 @@ pub mod defaults { pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; + pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs"; + /// /// Default built-in configuration file. 
/// @@ -114,6 +119,8 @@ pub mod defaults { #ingest_batch_size = {DEFAULT_INGEST_BATCH_SIZE} +#virtual_file_io_engine = '{DEFAULT_VIRTUAL_FILE_IO_ENGINE}' + [tenant_config] #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} @@ -247,6 +254,8 @@ pub struct PageServerConf { /// Maximum number of WAL records to be ingested and committed at the same time pub ingest_batch_size: u64, + + pub virtual_file_io_engine: virtual_file::IoEngineKind, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -331,6 +340,8 @@ struct PageServerConfigBuilder { secondary_download_concurrency: BuilderValue, ingest_batch_size: BuilderValue, + + virtual_file_io_engine: BuilderValue, } impl Default for PageServerConfigBuilder { @@ -406,6 +417,8 @@ impl Default for PageServerConfigBuilder { secondary_download_concurrency: Set(DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY), ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE), + + virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()), } } } @@ -562,6 +575,10 @@ impl PageServerConfigBuilder { self.ingest_batch_size = BuilderValue::Set(ingest_batch_size) } + pub fn virtual_file_io_engine(&mut self, value: virtual_file::IoEngineKind) { + self.virtual_file_io_engine = BuilderValue::Set(value); + } + pub fn build(self) -> anyhow::Result { let concurrent_tenant_warmup = self .concurrent_tenant_warmup @@ -669,6 +686,9 @@ impl PageServerConfigBuilder { ingest_batch_size: self .ingest_batch_size .ok_or(anyhow!("missing ingest_batch_size"))?, + virtual_file_io_engine: self + .virtual_file_io_engine + .ok_or(anyhow!("missing virtual_file_io_engine"))?, }) } } @@ -920,6 +940,9 @@ impl PageServerConf { builder.secondary_download_concurrency(parse_toml_u64(key, item)? as usize) }, "ingest_batch_size" => builder.ingest_batch_size(parse_toml_u64(key, item)?), + "virtual_file_io_engine" => { + builder.virtual_file_io_engine(parse_toml_from_str("virtual_file_io_engine", item)?) 
+ } _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -993,6 +1016,7 @@ impl PageServerConf { heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY, secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, + virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), } } } @@ -1225,6 +1249,7 @@ background_task_maximum_delay = '334 s' heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY, secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, + virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), }, "Correct defaults should be used when no config values are provided" ); @@ -1288,6 +1313,7 @@ background_task_maximum_delay = '334 s' heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY, secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, ingest_batch_size: 100, + virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), }, "Should be able to parse all basic config values correctly" ); diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 26070e0cc1..bcde1166b7 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -1,3 +1,4 @@ +#![recursion_limit = "300"] #![deny(clippy::undocumented_unsafe_blocks)] mod auth; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 993685db6e..2cfa77f1c5 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -932,6 +932,7 @@ pub(crate) static STORAGE_IO_SIZE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +#[cfg(not(test))] pub(crate) mod virtual_file_descriptor_cache { use super::*; @@ -951,6 +952,20 @@ pub(crate) mod virtual_file_descriptor_cache { // ``` } +#[cfg(not(test))] +pub(crate) mod virtual_file_io_engine { + use super::*; + + pub(crate) static KIND: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_virtual_file_io_engine_kind", + "The configured io engine for VirtualFile", + &["kind"], + ) + .unwrap() + }); +} + #[derive(Debug)] struct GlobalAndPerTimelineHistogram { global: Histogram, diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index 0617017528..1b6bccc120 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -5,10 +5,10 @@ use super::ephemeral_file::EphemeralFile; use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner}; use crate::context::RequestContext; -use crate::page_cache::{self, PageReadGuard, ReadBufResult, PAGE_SZ}; +use crate::page_cache::{self, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ}; use crate::virtual_file::VirtualFile; use bytes::Bytes; -use std::ops::{Deref, DerefMut}; +use std::ops::Deref; /// This is implemented by anything that can read 8 kB (PAGE_SZ) /// blocks, using the page cache @@ -39,6 +39,8 @@ pub enum BlockLease<'a> { EphemeralFileMutableTail(&'a [u8; PAGE_SZ]), #[cfg(test)] Arc(std::sync::Arc<[u8; PAGE_SZ]>), + #[cfg(test)] + Vec(Vec), } impl From> for BlockLease<'static> { @@ -63,6 +65,10 @@ impl<'a> Deref for BlockLease<'a> { BlockLease::EphemeralFileMutableTail(v) => v, #[cfg(test)] BlockLease::Arc(v) => v.deref(), + #[cfg(test)] + BlockLease::Vec(v) => { + TryFrom::try_from(&v[..]).expect("caller must ensure that v has PAGE_SZ") + } } } } @@ -169,10 +175,14 @@ impl FileBlockReader { } /// Read a page from the underlying file 
into given buffer. - async fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), std::io::Error> { + async fn fill_buffer( + &self, + buf: PageWriteGuard<'static>, + blkno: u32, + ) -> Result, std::io::Error> { assert!(buf.len() == PAGE_SZ); self.file - .read_exact_at(buf, blkno as u64 * PAGE_SZ as u64) + .read_exact_at_page(buf, blkno as u64 * PAGE_SZ as u64) .await } /// Read a block. @@ -196,9 +206,9 @@ impl FileBlockReader { ) })? { ReadBufResult::Found(guard) => Ok(guard.into()), - ReadBufResult::NotFound(mut write_guard) => { + ReadBufResult::NotFound(write_guard) => { // Read the page from disk into the buffer - self.fill_buffer(write_guard.deref_mut(), blknum).await?; + let write_guard = self.fill_buffer(write_guard, blknum).await?; Ok(write_guard.mark_valid().into()) } } diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 591eacd104..6b8cd77d78 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -5,11 +5,11 @@ use crate::config::PageServerConf; use crate::context::RequestContext; use crate::page_cache::{self, PAGE_SZ}; use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader}; -use crate::virtual_file::VirtualFile; +use crate::virtual_file::{self, VirtualFile}; use camino::Utf8PathBuf; use pageserver_api::shard::TenantShardId; use std::cmp::min; -use std::fs::OpenOptions; + use std::io::{self, ErrorKind}; use std::ops::DerefMut; use std::sync::atomic::AtomicU64; @@ -47,7 +47,10 @@ impl EphemeralFile { let file = VirtualFile::open_with_options( &filename, - OpenOptions::new().read(true).write(true).create(true), + virtual_file::OpenOptions::new() + .read(true) + .write(true) + .create(true), ) .await?; @@ -89,11 +92,10 @@ impl EphemeralFile { page_cache::ReadBufResult::Found(guard) => { return Ok(BlockLease::PageReadGuard(guard)) } - page_cache::ReadBufResult::NotFound(mut write_guard) => { - let buf: &mut [u8] = write_guard.deref_mut(); - debug_assert_eq!(buf.len(), PAGE_SZ); - self.file - .read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64) + page_cache::ReadBufResult::NotFound(write_guard) => { + let write_guard = self + .file + .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64) .await?; let read_guard = write_guard.mark_valid(); return Ok(BlockLease::PageReadGuard(read_guard)); diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 4ded6d6a8d..3a445ef71e 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -36,7 +36,7 @@ use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, Fi use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; use crate::tenant::Timeline; -use crate::virtual_file::VirtualFile; +use crate::virtual_file::{self, VirtualFile}; use crate::{walrecord, TEMP_FILE_SUFFIX}; use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; use anyhow::{bail, ensure, Context, Result}; @@ -649,7 +649,7 @@ impl DeltaLayer { { let file = VirtualFile::open_with_options( path, - &*std::fs::OpenOptions::new().read(true).write(true), + virtual_file::OpenOptions::new().read(true).write(true), ) .await .with_context(|| format!("Failed to open file '{}'", path))?; diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs 
b/pageserver/src/tenant/storage_layer/image_layer.rs index f03c7642eb..c62e6aed51 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -34,7 +34,7 @@ use crate::tenant::storage_layer::{ LayerAccessStats, ValueReconstructResult, ValueReconstructState, }; use crate::tenant::Timeline; -use crate::virtual_file::VirtualFile; +use crate::virtual_file::{self, VirtualFile}; use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; use anyhow::{bail, ensure, Context, Result}; use bytes::Bytes; @@ -327,7 +327,7 @@ impl ImageLayer { { let file = VirtualFile::open_with_options( path, - &*std::fs::OpenOptions::new().read(true).write(true), + virtual_file::OpenOptions::new().read(true).write(true), ) .await .with_context(|| format!("Failed to open file '{}'", path))?; @@ -492,11 +492,15 @@ impl ImageLayerWriterInner { }, ); info!("new image layer {path}"); - let mut file = VirtualFile::open_with_options( - &path, - std::fs::OpenOptions::new().write(true).create_new(true), - ) - .await?; + let mut file = { + VirtualFile::open_with_options( + &path, + virtual_file::OpenOptions::new() + .write(true) + .create_new(true), + ) + .await? + }; // make room for the header block file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?; let blob_writer = BlobWriter::new(file, PAGE_SZ as u64); diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 06f58b5c52..d200a4ba5e 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -11,18 +11,28 @@ //! src/backend/storage/file/fd.c //! use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC}; + +use crate::page_cache::PageWriteGuard; use crate::tenant::TENANTS_SEGMENT_NAME; use camino::{Utf8Path, Utf8PathBuf}; use once_cell::sync::OnceCell; use pageserver_api::shard::TenantShardId; -use std::fs::{self, File, OpenOptions}; +use std::fs::{self, File}; use std::io::{Error, ErrorKind, Seek, SeekFrom}; +use tokio_epoll_uring::IoBufMut; + +use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd}; use std::os::unix::fs::FileExt; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; use tokio::time::Instant; use utils::fs_ext; +mod io_engine; +mod open_options; +pub use io_engine::IoEngineKind; +pub(crate) use open_options::*; + /// /// A virtual file descriptor. You can use this just like std::fs::File, but internally /// the underlying file is closed if the system is low on file descriptors, @@ -106,7 +116,38 @@ struct SlotInner { tag: u64, /// the underlying file - file: Option, + file: Option, +} + +/// Impl of [`tokio_epoll_uring::IoBuf`] and [`tokio_epoll_uring::IoBufMut`] for [`PageWriteGuard`]. +struct PageWriteGuardBuf { + page: PageWriteGuard<'static>, + init_up_to: usize, +} +// Safety: the [`PageWriteGuard`] gives us exclusive ownership of the page cache slot, +// and the location remains stable even if [`Self`] or the [`PageWriteGuard`] is moved. +unsafe impl tokio_epoll_uring::IoBuf for PageWriteGuardBuf { + fn stable_ptr(&self) -> *const u8 { + self.page.as_ptr() + } + fn bytes_init(&self) -> usize { + self.init_up_to + } + fn bytes_total(&self) -> usize { + self.page.len() + } +} +// Safety: see above, plus: the ownership of [`PageWriteGuard`] means exclusive access, +// hence it's safe to hand out the `stable_mut_ptr()`. 
+unsafe impl tokio_epoll_uring::IoBufMut for PageWriteGuardBuf { + fn stable_mut_ptr(&mut self) -> *mut u8 { + self.page.as_mut_ptr() + } + + unsafe fn set_init(&mut self, pos: usize) { + assert!(pos <= self.page.len()); + self.init_up_to = pos; + } } impl OpenFiles { @@ -274,6 +315,10 @@ macro_rules! with_file { let $ident = $this.lock_file().await?; observe_duration!($op, $($body)*) }}; + ($this:expr, $op:expr, | mut $ident:ident | $($body:tt)*) => {{ + let mut $ident = $this.lock_file().await?; + observe_duration!($op, $($body)*) + }}; } impl VirtualFile { @@ -326,7 +371,9 @@ impl VirtualFile { // NB: there is also StorageIoOperation::OpenAfterReplace which is for the case // where our caller doesn't get to use the returned VirtualFile before its // slot gets re-used by someone else. - let file = observe_duration!(StorageIoOperation::Open, open_options.open(path))?; + let file = observe_duration!(StorageIoOperation::Open, { + open_options.open(path.as_std_path()).await? + }); // Strip all options other than read and write. // @@ -395,15 +442,13 @@ impl VirtualFile { /// Call File::sync_all() on the underlying File. pub async fn sync_all(&self) -> Result<(), Error> { - with_file!(self, StorageIoOperation::Fsync, |file| file - .as_ref() - .sync_all()) + with_file!(self, StorageIoOperation::Fsync, |file_guard| file_guard + .with_std_file(|std_file| std_file.sync_all())) } pub async fn metadata(&self) -> Result { - with_file!(self, StorageIoOperation::Metadata, |file| file - .as_ref() - .metadata()) + with_file!(self, StorageIoOperation::Metadata, |file_guard| file_guard + .with_std_file(|std_file| std_file.metadata())) } /// Helper function internal to `VirtualFile` that looks up the underlying File, @@ -412,7 +457,7 @@ impl VirtualFile { /// /// We are doing it via a macro as Rust doesn't support async closures that /// take on parameters with lifetimes. - async fn lock_file(&self) -> Result, Error> { + async fn lock_file(&self) -> Result { let open_files = get_open_files(); let mut handle_guard = { @@ -458,10 +503,9 @@ impl VirtualFile { // NB: we use StorageIoOperation::OpenAferReplace for this to distinguish this // case from StorageIoOperation::Open. This helps with identifying thrashing // of the virtual file descriptor cache. - let file = observe_duration!( - StorageIoOperation::OpenAfterReplace, - self.open_options.open(&self.path) - )?; + let file = observe_duration!(StorageIoOperation::OpenAfterReplace, { + self.open_options.open(self.path.as_std_path()).await? + }); // Store the File in the slot and update the handle in the VirtualFile // to point to it. @@ -486,9 +530,8 @@ impl VirtualFile { self.pos = offset; } SeekFrom::End(offset) => { - self.pos = with_file!(self, StorageIoOperation::Seek, |file| file - .as_ref() - .seek(SeekFrom::End(offset)))? + self.pos = with_file!(self, StorageIoOperation::Seek, |mut file_guard| file_guard + .with_std_file_mut(|std_file| std_file.seek(SeekFrom::End(offset))))? 
} SeekFrom::Current(offset) => { let pos = self.pos as i128 + offset as i128; @@ -507,25 +550,28 @@ impl VirtualFile { Ok(self.pos) } - // Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135 - pub async fn read_exact_at(&self, mut buf: &mut [u8], mut offset: u64) -> Result<(), Error> { - while !buf.is_empty() { - match self.read_at(buf, offset).await { - Ok(0) => { - return Err(Error::new( - std::io::ErrorKind::UnexpectedEof, - "failed to fill whole buffer", - )) - } - Ok(n) => { - buf = &mut buf[n..]; - offset += n as u64; - } - Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} - Err(e) => return Err(e), - } - } - Ok(()) + pub async fn read_exact_at(&self, buf: B, offset: u64) -> Result + where + B: IoBufMut + Send, + { + let (buf, res) = + read_exact_at_impl(buf, offset, |buf, offset| self.read_at(buf, offset)).await; + res.map(|()| buf) + } + + /// Like [`Self::read_exact_at`] but for [`PageWriteGuard`]. + pub async fn read_exact_at_page( + &self, + page: PageWriteGuard<'static>, + offset: u64, + ) -> Result, Error> { + let buf = PageWriteGuardBuf { + page, + init_up_to: 0, + }; + let res = self.read_exact_at(buf, offset).await; + res.map(|PageWriteGuardBuf { page, .. }| page) + .map_err(|e| Error::new(ErrorKind::Other, e)) } // Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#219-235 @@ -575,22 +621,35 @@ impl VirtualFile { Ok(n) } - pub async fn read_at(&self, buf: &mut [u8], offset: u64) -> Result { - let result = with_file!(self, StorageIoOperation::Read, |file| file - .as_ref() - .read_at(buf, offset)); - if let Ok(size) = result { - STORAGE_IO_SIZE - .with_label_values(&["read", &self.tenant_id, &self.shard_id, &self.timeline_id]) - .add(size as i64); - } - result + pub(crate) async fn read_at(&self, buf: B, offset: u64) -> (B, Result) + where + B: tokio_epoll_uring::BoundedBufMut + Send, + { + let file_guard = match self.lock_file().await { + Ok(file_guard) => file_guard, + Err(e) => return (buf, Err(e)), + }; + + observe_duration!(StorageIoOperation::Read, { + let ((_file_guard, buf), res) = io_engine::get().read_at(file_guard, offset, buf).await; + if let Ok(size) = res { + STORAGE_IO_SIZE + .with_label_values(&[ + "read", + &self.tenant_id, + &self.shard_id, + &self.timeline_id, + ]) + .add(size as i64); + } + (buf, res) + }) } async fn write_at(&self, buf: &[u8], offset: u64) -> Result { - let result = with_file!(self, StorageIoOperation::Write, |file| file - .as_ref() - .write_at(buf, offset)); + let result = with_file!(self, StorageIoOperation::Write, |file_guard| { + file_guard.with_std_file(|std_file| std_file.write_at(buf, offset)) + }); if let Ok(size) = result { STORAGE_IO_SIZE .with_label_values(&["write", &self.tenant_id, &self.shard_id, &self.timeline_id]) @@ -600,18 +659,241 @@ impl VirtualFile { } } -struct FileGuard<'a> { - slot_guard: RwLockReadGuard<'a, SlotInner>, +// Adapted from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135 +pub async fn read_exact_at_impl( + buf: B, + mut offset: u64, + mut read_at: F, +) -> (B, std::io::Result<()>) +where + B: IoBufMut + Send, + F: FnMut(tokio_epoll_uring::Slice, u64) -> Fut, + Fut: std::future::Future, std::io::Result)>, +{ + use tokio_epoll_uring::BoundedBuf; + let mut buf: tokio_epoll_uring::Slice = buf.slice_full(); // includes all the uninitialized memory + while buf.bytes_total() != 0 { + let res; + (buf, res) = read_at(buf, offset).await; + match res { + Ok(0) => break, + Ok(n) => { + buf = buf.slice(n..); + offset += n as u64; + 
} + Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} + Err(e) => return (buf.into_inner(), Err(e)), + } + } + // NB: don't use `buf.is_empty()` here; it is from the + // `impl Deref for Slice { Target = [u8] }`; the the &[u8] + // returned by it only covers the initialized portion of `buf`. + // Whereas we're interested in ensuring that we filled the entire + // buffer that the user passed in. + if buf.bytes_total() != 0 { + ( + buf.into_inner(), + Err(std::io::Error::new( + std::io::ErrorKind::UnexpectedEof, + "failed to fill whole buffer", + )), + ) + } else { + assert_eq!(buf.len(), buf.bytes_total()); + (buf.into_inner(), Ok(())) + } } -impl<'a> AsRef for FileGuard<'a> { - fn as_ref(&self) -> &File { +#[cfg(test)] +mod test_read_exact_at_impl { + + use std::{collections::VecDeque, sync::Arc}; + + use tokio_epoll_uring::{BoundedBuf, BoundedBufMut}; + + use super::read_exact_at_impl; + + struct Expectation { + offset: u64, + bytes_total: usize, + result: std::io::Result>, + } + struct MockReadAt { + expectations: VecDeque, + } + + impl MockReadAt { + async fn read_at( + &mut self, + mut buf: tokio_epoll_uring::Slice>, + offset: u64, + ) -> (tokio_epoll_uring::Slice>, std::io::Result) { + let exp = self + .expectations + .pop_front() + .expect("read_at called but we have no expectations left"); + assert_eq!(exp.offset, offset); + assert_eq!(exp.bytes_total, buf.bytes_total()); + match exp.result { + Ok(bytes) => { + assert!(bytes.len() <= buf.bytes_total()); + buf.put_slice(&bytes); + (buf, Ok(bytes.len())) + } + Err(e) => (buf, Err(e)), + } + } + } + + impl Drop for MockReadAt { + fn drop(&mut self) { + assert_eq!(self.expectations.len(), 0); + } + } + + #[tokio::test] + async fn test_basic() { + let buf = Vec::with_capacity(5); + let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { + expectations: VecDeque::from(vec![Expectation { + offset: 0, + bytes_total: 5, + result: Ok(vec![b'a', b'b', b'c', b'd', b'e']), + }]), + })); + let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { + let mock_read_at = Arc::clone(&mock_read_at); + async move { mock_read_at.lock().await.read_at(buf, offset).await } + }) + .await; + assert!(res.is_ok()); + assert_eq!(buf, vec![b'a', b'b', b'c', b'd', b'e']); + } + + #[tokio::test] + async fn test_empty_buf_issues_no_syscall() { + let buf = Vec::new(); + let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { + expectations: VecDeque::new(), + })); + let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { + let mock_read_at = Arc::clone(&mock_read_at); + async move { mock_read_at.lock().await.read_at(buf, offset).await } + }) + .await; + assert!(res.is_ok()); + } + + #[tokio::test] + async fn test_two_read_at_calls_needed_until_buf_filled() { + let buf = Vec::with_capacity(4); + let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { + expectations: VecDeque::from(vec![ + Expectation { + offset: 0, + bytes_total: 4, + result: Ok(vec![b'a', b'b']), + }, + Expectation { + offset: 2, + bytes_total: 2, + result: Ok(vec![b'c', b'd']), + }, + ]), + })); + let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { + let mock_read_at = Arc::clone(&mock_read_at); + async move { mock_read_at.lock().await.read_at(buf, offset).await } + }) + .await; + assert!(res.is_ok()); + assert_eq!(buf, vec![b'a', b'b', b'c', b'd']); + } + + #[tokio::test] + async fn test_eof_before_buffer_full() { + let buf = Vec::with_capacity(3); + let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { + expectations: 
VecDeque::from(vec![ + Expectation { + offset: 0, + bytes_total: 3, + result: Ok(vec![b'a']), + }, + Expectation { + offset: 1, + bytes_total: 2, + result: Ok(vec![b'b']), + }, + Expectation { + offset: 2, + bytes_total: 1, + result: Ok(vec![]), + }, + ]), + })); + let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { + let mock_read_at = Arc::clone(&mock_read_at); + async move { mock_read_at.lock().await.read_at(buf, offset).await } + }) + .await; + let Err(err) = res else { + panic!("should return an error"); + }; + assert_eq!(err.kind(), std::io::ErrorKind::UnexpectedEof); + assert_eq!(format!("{err}"), "failed to fill whole buffer"); + // buffer contents on error are unspecified + } +} + +struct FileGuard { + slot_guard: RwLockReadGuard<'static, SlotInner>, +} + +impl AsRef for FileGuard { + fn as_ref(&self) -> &OwnedFd { // This unwrap is safe because we only create `FileGuard`s // if we know that the file is Some. self.slot_guard.file.as_ref().unwrap() } } +impl FileGuard { + /// Soft deprecation: we'll move VirtualFile to async APIs and remove this function eventually. + fn with_std_file(&self, with: F) -> R + where + F: FnOnce(&File) -> R, + { + // SAFETY: + // - lifetime of the fd: `file` doesn't outlive the OwnedFd stored in `self`. + // - `&` usage below: `self` is `&`, hence Rust typesystem guarantees there are is no `&mut` + let file = unsafe { File::from_raw_fd(self.as_ref().as_raw_fd()) }; + let res = with(&file); + let _ = file.into_raw_fd(); + res + } + /// Soft deprecation: we'll move VirtualFile to async APIs and remove this function eventually. + fn with_std_file_mut(&mut self, with: F) -> R + where + F: FnOnce(&mut File) -> R, + { + // SAFETY: + // - lifetime of the fd: `file` doesn't outlive the OwnedFd stored in `self`. + // - &mut usage below: `self` is `&mut`, hence this call is the only task/thread that has control over the underlying fd + let mut file = unsafe { File::from_raw_fd(self.as_ref().as_raw_fd()) }; + let res = with(&mut file); + let _ = file.into_raw_fd(); + res + } +} + +impl tokio_epoll_uring::IoFd for FileGuard { + unsafe fn as_fd(&self) -> RawFd { + let owned_fd: &OwnedFd = self.as_ref(); + owned_fd.as_raw_fd() + } +} + #[cfg(test)] impl VirtualFile { pub(crate) async fn read_blk( @@ -619,16 +901,19 @@ impl VirtualFile { blknum: u32, ) -> Result, std::io::Error> { use crate::page_cache::PAGE_SZ; - let mut buf = [0; PAGE_SZ]; - self.read_exact_at(&mut buf, blknum as u64 * (PAGE_SZ as u64)) + let buf = vec![0; PAGE_SZ]; + let buf = self + .read_exact_at(buf, blknum as u64 * (PAGE_SZ as u64)) .await?; - Ok(std::sync::Arc::new(buf).into()) + Ok(crate::tenant::block_io::BlockLease::Vec(buf)) } async fn read_to_end(&mut self, buf: &mut Vec) -> Result<(), Error> { + let mut tmp = vec![0; 128]; loop { - let mut tmp = [0; 128]; - match self.read_at(&mut tmp, self.pos).await { + let res; + (tmp, res) = self.read_at(tmp, self.pos).await; + match res { Ok(0) => return Ok(()), Ok(n) => { self.pos += n as u64; @@ -704,10 +989,12 @@ impl OpenFiles { /// Initialize the virtual file module. This must be called once at page /// server startup. 
/// -pub fn init(num_slots: usize) { +#[cfg(not(test))] +pub fn init(num_slots: usize, engine: IoEngineKind) { if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() { panic!("virtual_file::init called twice"); } + io_engine::init(engine); crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64); } @@ -752,10 +1039,10 @@ mod tests { } impl MaybeVirtualFile { - async fn read_exact_at(&self, buf: &mut [u8], offset: u64) -> Result<(), Error> { + async fn read_exact_at(&self, mut buf: Vec, offset: u64) -> Result, Error> { match self { MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(buf, offset).await, - MaybeVirtualFile::File(file) => file.read_exact_at(buf, offset), + MaybeVirtualFile::File(file) => file.read_exact_at(&mut buf, offset).map(|()| buf), } } async fn write_all_at(&self, buf: &[u8], offset: u64) -> Result<(), Error> { @@ -797,14 +1084,14 @@ mod tests { // Helper function to slurp a portion of a file into a string async fn read_string_at(&mut self, pos: u64, len: usize) -> Result { - let mut buf = vec![0; len]; - self.read_exact_at(&mut buf, pos).await?; + let buf = vec![0; len]; + let buf = self.read_exact_at(buf, pos).await?; Ok(String::from_utf8(buf).unwrap()) } } #[tokio::test] - async fn test_virtual_files() -> Result<(), Error> { + async fn test_virtual_files() -> anyhow::Result<()> { // The real work is done in the test_files() helper function. This // allows us to run the same set of tests against a native File, and // VirtualFile. We trust the native Files and wouldn't need to test them, @@ -820,14 +1107,17 @@ mod tests { } #[tokio::test] - async fn test_physical_files() -> Result<(), Error> { + async fn test_physical_files() -> anyhow::Result<()> { test_files("physical_files", |path, open_options| async move { - Ok(MaybeVirtualFile::File(open_options.open(path)?)) + Ok(MaybeVirtualFile::File({ + let owned_fd = open_options.open(path.as_std_path()).await?; + File::from(owned_fd) + })) }) .await } - async fn test_files(testname: &str, openfunc: OF) -> Result<(), Error> + async fn test_files(testname: &str, openfunc: OF) -> anyhow::Result<()> where OF: Fn(Utf8PathBuf, OpenOptions) -> FT, FT: Future>, @@ -971,11 +1261,11 @@ mod tests { for _threadno in 0..THREADS { let files = files.clone(); let hdl = rt.spawn(async move { - let mut buf = [0u8; SIZE]; + let mut buf = vec![0u8; SIZE]; let mut rng = rand::rngs::OsRng; for _ in 1..1000 { let f = &files[rng.gen_range(0..files.len())]; - f.read_exact_at(&mut buf, 0).await.unwrap(); + buf = f.read_exact_at(buf, 0).await.unwrap(); assert!(buf == SAMPLE); } }); diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs new file mode 100644 index 0000000000..f7b46fe653 --- /dev/null +++ b/pageserver/src/virtual_file/io_engine.rs @@ -0,0 +1,114 @@ +//! [`super::VirtualFile`] supports different IO engines. +//! +//! The [`IoEngineKind`] enum identifies them. +//! +//! The choice of IO engine is global. +//! Initialize using [`init`]. +//! +//! Then use [`get`] and [`super::OpenOptions`]. 
+ +#[derive( + Copy, + Clone, + PartialEq, + Eq, + Hash, + strum_macros::EnumString, + strum_macros::Display, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, + Debug, +)] +#[strum(serialize_all = "kebab-case")] +pub enum IoEngineKind { + StdFs, + #[cfg(target_os = "linux")] + TokioEpollUring, +} + +static IO_ENGINE: once_cell::sync::OnceCell = once_cell::sync::OnceCell::new(); + +#[cfg(not(test))] +pub(super) fn init(engine: IoEngineKind) { + if IO_ENGINE.set(engine).is_err() { + panic!("called twice"); + } + crate::metrics::virtual_file_io_engine::KIND + .with_label_values(&[&format!("{engine}")]) + .set(1); +} + +pub(super) fn get() -> &'static IoEngineKind { + #[cfg(test)] + { + let env_var_name = "NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE"; + IO_ENGINE.get_or_init(|| match std::env::var(env_var_name) { + Ok(v) => match v.parse::() { + Ok(engine_kind) => engine_kind, + Err(e) => { + panic!("invalid VirtualFile io engine for env var {env_var_name}: {e:#}: {v:?}") + } + }, + Err(std::env::VarError::NotPresent) => { + crate::config::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE + .parse() + .unwrap() + } + Err(std::env::VarError::NotUnicode(_)) => { + panic!("env var {env_var_name} is not unicode"); + } + }) + } + #[cfg(not(test))] + IO_ENGINE.get().unwrap() +} + +use std::os::unix::prelude::FileExt; + +use super::FileGuard; + +impl IoEngineKind { + pub(super) async fn read_at( + &self, + file_guard: FileGuard, + offset: u64, + mut buf: B, + ) -> ((FileGuard, B), std::io::Result) + where + B: tokio_epoll_uring::BoundedBufMut + Send, + { + match self { + IoEngineKind::StdFs => { + // SAFETY: `dst` only lives at most as long as this match arm, during which buf remains valid memory. + let dst = unsafe { + std::slice::from_raw_parts_mut(buf.stable_mut_ptr(), buf.bytes_total()) + }; + let res = file_guard.with_std_file(|std_file| std_file.read_at(dst, offset)); + if let Ok(nbytes) = &res { + assert!(*nbytes <= buf.bytes_total()); + // SAFETY: see above assertion + unsafe { + buf.set_init(*nbytes); + } + } + #[allow(dropping_references)] + drop(dst); + ((file_guard, buf), res) + } + #[cfg(target_os = "linux")] + IoEngineKind::TokioEpollUring => { + let system = tokio_epoll_uring::thread_local_system().await; + let (resources, res) = system.read(file_guard, offset, buf).await; + ( + resources, + res.map_err(|e| match e { + tokio_epoll_uring::Error::Op(e) => e, + tokio_epoll_uring::Error::System(system) => { + std::io::Error::new(std::io::ErrorKind::Other, system) + } + }), + ) + } + } + } +} diff --git a/pageserver/src/virtual_file/open_options.rs b/pageserver/src/virtual_file/open_options.rs new file mode 100644 index 0000000000..1e5ffe15cc --- /dev/null +++ b/pageserver/src/virtual_file/open_options.rs @@ -0,0 +1,138 @@ +//! 
Enum-dispatch to the `OpenOptions` type of the respective [`super::IoEngineKind`]; + +use super::IoEngineKind; +use std::{os::fd::OwnedFd, path::Path}; + +#[derive(Debug, Clone)] +pub enum OpenOptions { + StdFs(std::fs::OpenOptions), + #[cfg(target_os = "linux")] + TokioEpollUring(tokio_epoll_uring::ops::open_at::OpenOptions), +} + +impl Default for OpenOptions { + fn default() -> Self { + match super::io_engine::get() { + IoEngineKind::StdFs => Self::StdFs(std::fs::OpenOptions::new()), + #[cfg(target_os = "linux")] + IoEngineKind::TokioEpollUring => { + Self::TokioEpollUring(tokio_epoll_uring::ops::open_at::OpenOptions::new()) + } + } + } +} + +impl OpenOptions { + pub fn new() -> OpenOptions { + Self::default() + } + + pub fn read(&mut self, read: bool) -> &mut OpenOptions { + match self { + OpenOptions::StdFs(x) => { + let _ = x.read(read); + } + #[cfg(target_os = "linux")] + OpenOptions::TokioEpollUring(x) => { + let _ = x.read(read); + } + } + self + } + + pub fn write(&mut self, write: bool) -> &mut OpenOptions { + match self { + OpenOptions::StdFs(x) => { + let _ = x.write(write); + } + #[cfg(target_os = "linux")] + OpenOptions::TokioEpollUring(x) => { + let _ = x.write(write); + } + } + self + } + + pub fn create(&mut self, create: bool) -> &mut OpenOptions { + match self { + OpenOptions::StdFs(x) => { + let _ = x.create(create); + } + #[cfg(target_os = "linux")] + OpenOptions::TokioEpollUring(x) => { + let _ = x.create(create); + } + } + self + } + + pub fn create_new(&mut self, create_new: bool) -> &mut OpenOptions { + match self { + OpenOptions::StdFs(x) => { + let _ = x.create_new(create_new); + } + #[cfg(target_os = "linux")] + OpenOptions::TokioEpollUring(x) => { + let _ = x.create_new(create_new); + } + } + self + } + + pub fn truncate(&mut self, truncate: bool) -> &mut OpenOptions { + match self { + OpenOptions::StdFs(x) => { + let _ = x.truncate(truncate); + } + #[cfg(target_os = "linux")] + OpenOptions::TokioEpollUring(x) => { + let _ = x.truncate(truncate); + } + } + self + } + + pub(in crate::virtual_file) async fn open(&self, path: &Path) -> std::io::Result { + match self { + OpenOptions::StdFs(x) => x.open(path).map(|file| file.into()), + #[cfg(target_os = "linux")] + OpenOptions::TokioEpollUring(x) => { + let system = tokio_epoll_uring::thread_local_system().await; + system.open(path, x).await.map_err(|e| match e { + tokio_epoll_uring::Error::Op(e) => e, + tokio_epoll_uring::Error::System(system) => { + std::io::Error::new(std::io::ErrorKind::Other, system) + } + }) + } + } + } +} + +impl std::os::unix::prelude::OpenOptionsExt for OpenOptions { + fn mode(&mut self, mode: u32) -> &mut OpenOptions { + match self { + OpenOptions::StdFs(x) => { + let _ = x.mode(mode); + } + #[cfg(target_os = "linux")] + OpenOptions::TokioEpollUring(x) => { + let _ = x.mode(mode); + } + } + self + } + + fn custom_flags(&mut self, flags: i32) -> &mut OpenOptions { + match self { + OpenOptions::StdFs(x) => { + let _ = x.custom_flags(flags); + } + #[cfg(target_os = "linux")] + OpenOptions::TokioEpollUring(x) => { + let _ = x.custom_flags(flags); + } + } + self + } +} diff --git a/scripts/flaky_tests.py b/scripts/flaky_tests.py index b07e4bea9b..61a97f520d 100755 --- a/scripts/flaky_tests.py +++ b/scripts/flaky_tests.py @@ -3,6 +3,7 @@ import argparse import json import logging +import os from collections import defaultdict from typing import DefaultDict, Dict @@ -45,6 +46,15 @@ def main(args: argparse.Namespace): logging.error("cannot fetch flaky tests from the DB due to an error", exc) rows 
= [] + # If a test run has non-default PAGESERVER_VIRTUAL_FILE_IO_ENGINE (i.e. not empty, not std-fs), + # use it to parametrize test name along with build_type and pg_version + # + # See test_runner/fixtures/parametrize.py for details + if (io_engine := os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in ("", "std-fs"): + pageserver_virtual_file_io_engine_parameter = f"-{io_engine}" + else: + pageserver_virtual_file_io_engine_parameter = "" + for row in rows: # We don't want to automatically rerun tests in a performance suite if row["parent_suite"] != "test_runner.regress": @@ -53,10 +63,10 @@ def main(args: argparse.Namespace): if row["name"].endswith("]"): parametrized_test = row["name"].replace( "[", - f"[{build_type}-pg{pg_version}-", + f"[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}-", ) else: - parametrized_test = f"{row['name']}[{build_type}-pg{pg_version}]" + parametrized_test = f"{row['name']}[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}]" res[row["parent_suite"]][row["suite"]][parametrized_test] = True diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index fd5e77671b..142c97d5c3 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -446,6 +446,7 @@ class NeonEnvBuilder: preserve_database_files: bool = False, initial_tenant: Optional[TenantId] = None, initial_timeline: Optional[TimelineId] = None, + pageserver_virtual_file_io_engine: Optional[str] = None, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override @@ -481,6 +482,8 @@ class NeonEnvBuilder: self.config_init_force: Optional[str] = None self.top_output_dir = top_output_dir + self.pageserver_virtual_file_io_engine: Optional[str] = pageserver_virtual_file_io_engine + assert test_name.startswith( "test_" ), "Unexpectedly instantiated from outside a test function" @@ -995,6 +998,8 @@ class NeonEnv: self, config.auth_enabled ) + self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine + # Create a config file corresponding to the options cfg: Dict[str, Any] = { "default_tenant_id": str(self.initial_tenant), @@ -1026,6 +1031,9 @@ class NeonEnv: "pg_auth_type": pg_auth_type, "http_auth_type": http_auth_type, } + if self.pageserver_virtual_file_io_engine is not None: + ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine + # Create a corresponding NeonPageserver object self.pageservers.append( NeonPageserver( @@ -1191,6 +1199,7 @@ def _shared_simple_env( neon_binpath: Path, pg_distrib_dir: Path, pg_version: PgVersion, + pageserver_virtual_file_io_engine: str, ) -> Iterator[NeonEnv]: """ # Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES @@ -1220,6 +1229,7 @@ def _shared_simple_env( preserve_database_files=pytestconfig.getoption("--preserve-database-files"), test_name=request.node.name, test_output_dir=test_output_dir, + pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine, ) as builder: env = builder.init_start() @@ -1258,6 +1268,7 @@ def neon_env_builder( request: FixtureRequest, test_overlay_dir: Path, top_output_dir: Path, + pageserver_virtual_file_io_engine: str, ) -> Iterator[NeonEnvBuilder]: """ Fixture to create a Neon environment for test. 
@@ -1287,6 +1298,7 @@ def neon_env_builder( broker=default_broker, run_id=run_id, preserve_database_files=pytestconfig.getoption("--preserve-database-files"), + pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine, test_name=request.node.name, test_output_dir=test_output_dir, test_overlay_dir=test_overlay_dir, diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py index 53350138dd..d8ac92abb6 100644 --- a/test_runner/fixtures/parametrize.py +++ b/test_runner/fixtures/parametrize.py @@ -8,7 +8,7 @@ from _pytest.python import Metafunc from fixtures.pg_version import PgVersion """ -Dynamically parametrize tests by Postgres version and build type (debug/release/remote) +Dynamically parametrize tests by Postgres version, build type (debug/release/remote), and possibly by other parameters """ @@ -31,11 +31,12 @@ def build_type(request: FixtureRequest) -> Optional[str]: return None -def pytest_generate_tests(metafunc: Metafunc): - # Do not parametrize performance tests yet, we need to prepare grafana charts first - if "test_runner/performance" in metafunc.definition._nodeid: - return +@pytest.fixture(scope="function", autouse=True) +def pageserver_virtual_file_io_engine(request: FixtureRequest) -> Optional[str]: + return None + +def pytest_generate_tests(metafunc: Metafunc): if (v := os.environ.get("DEFAULT_PG_VERSION")) is None: pg_versions = [version for version in PgVersion if version != PgVersion.NOT_SET] else: @@ -46,5 +47,12 @@ def pytest_generate_tests(metafunc: Metafunc): else: build_types = [bt.lower()] - metafunc.parametrize("build_type", build_types) - metafunc.parametrize("pg_version", pg_versions, ids=map(lambda v: f"pg{v}", pg_versions)) + # Do not parametrize performance tests yet by Postgres version or build type, we need to prepare grafana charts first + if "test_runner/performance" not in metafunc.definition._nodeid: + metafunc.parametrize("build_type", build_types) + metafunc.parametrize("pg_version", pg_versions, ids=map(lambda v: f"pg{v}", pg_versions)) + + # A hacky way to parametrize tests only for `pageserver_virtual_file_io_engine=tokio-epoll-uring` + # And do not change test name for default `pageserver_virtual_file_io_engine=std-fs` to keep tests statistics + if (io_engine := os.environ.get("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in ("", "std-fs"): + metafunc.parametrize("pageserver_virtual_file_io_engine", [io_engine]) From 12e9b2a909d2382567b79fa4b27a879f0009b5ff Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Fri, 26 Jan 2024 10:56:11 +0100 Subject: [PATCH 0002/1571] Update plv8 (#6465) --- Dockerfile.compute-node | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index a5c1f3157d..2414c089dc 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -144,30 +144,23 @@ RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouti FROM build-deps AS plv8-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ARG PG_VERSION RUN apt update && \ apt install -y ninja-build python3-dev libncurses5 binutils clang -RUN case "${PG_VERSION}" in \ - "v14" | "v15") \ - export PLV8_VERSION=3.1.5 \ - export PLV8_CHECKSUM=1e108d5df639e4c189e1c5bdfa2432a521c126ca89e7e5a969d46899ca7bf106 \ - ;; \ - "v16") \ - export PLV8_VERSION=3.1.8 \ - export PLV8_CHECKSUM=92b10c7db39afdae97ff748c9ec54713826af222c459084ad002571b79eb3f49 \ - ;; \ - *) \ - echo "Export the valid PG_VERSION variable" && exit 1 \ - 
;; \ - esac && \ - wget https://github.com/plv8/plv8/archive/refs/tags/v${PLV8_VERSION}.tar.gz -O plv8.tar.gz && \ - echo "${PLV8_CHECKSUM} plv8.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \ + echo "7096c3290928561f0d4901b7a52794295dc47f6303102fae3f8e42dd575ad97d plv8.tar.gz" | sha256sum --check && \ mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \ + # generate and copy upgrade scripts + mkdir -p upgrade && ./generate_upgrade.sh 3.1.10 && \ + cp upgrade/* /usr/local/pgsql/share/extension/ && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \ rm -rf /plv8-* && \ find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \ + # don't break computes with installed old version of plv8 + cd /usr/local/pgsql/lib/ && \ + ln -s plv8-3.1.10.so plv8-3.1.5.so && \ + ln -s plv8-3.1.10.so plv8-3.1.8.so && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plcoffee.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plls.control From 26c55b025582e6fbe373a2c5cb489127a2014c62 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 26 Jan 2024 12:39:20 +0000 Subject: [PATCH 0003/1571] Compute: fix rdkit extension build (#6488) ## Problem `rdkit` extension build started to fail because of the changed checksum of the Comic Neue font: ``` Downloading https://fonts.google.com/download?family=Comic%20Neue... CMake Error at Code/cmake/Modules/RDKitUtils.cmake:257 (MESSAGE): The md5 checksum for /rdkit-src/Code/GraphMol/MolDraw2D/Comic_Neue.zip is incorrect; expected: 850b0df852f1cda4970887b540f8f333, found: b7fd0df73ad4637504432d72a0accb8f ``` https://github.com/neondatabase/neon/actions/runs/7666530536/job/20895534826 Ref https://neondb.slack.com/archives/C059ZC138NR/p1706265392422469 ## Summary of changes - Disable comic fonts for `rdkit` extension --- Dockerfile.compute-node | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 2414c089dc..299c4097e8 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -52,7 +52,7 @@ RUN cd postgres && \ # We need to grant EXECUTE on pg_stat_statements_reset() to neon_superuser. # In vanilla postgres this function is limited to Postgres role superuser. # In neon we have neon_superuser role that is not a superuser but replaces superuser in some cases. - # We could add the additional grant statements to the postgres repository but it would be hard to maintain, + # We could add the additional grant statements to the postgres repository but it would be hard to maintain, # whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork, # so we do it here. 
old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"; \ @@ -63,14 +63,14 @@ RUN cd postgres && \ echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \ fi; \ done; \ - # the second loop is for pg_stat_statement extension versions >= 1.7, + # the second loop is for pg_stat_statement extension versions >= 1.7, # where pg_stat_statement_reset() got 3 additional arguments for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \ filename=$(basename "$file"); \ if ! echo "$old_list" | grep -q -F "$filename"; then \ echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \ fi; \ - done + done ######################################################################################### # @@ -546,6 +546,7 @@ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar. -D PostgreSQL_TYPE_INCLUDE_DIR=`pg_config --includedir-server` \ -D PostgreSQL_LIBRARY_DIR=`pg_config --libdir` \ -D RDK_INSTALL_INTREE=OFF \ + -D RDK_INSTALL_COMIC_FONTS=OFF \ -D CMAKE_BUILD_TYPE=Release \ . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ From 5b34d5f561f4565dd22dd5e9be58a9844c6a5476 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 26 Jan 2024 13:40:03 +0000 Subject: [PATCH 0004/1571] pageserver: add vectored get latency histogram (#6461) This patch introduces a new set of grafana metrics for a histogram: pageserver_get_vectored_seconds_bucket{task_kind="Compaction|PageRequestHandler"}. While it has a `task_kind` label, only compaction and SLRU fetches are tracked. This reduces the increase in cardinality to 24. The metric should allow us to isolate performance regressions while the vectorized get is being implemented. Once the implementation is complete, it'll also allow us to quantify the improvements. --- pageserver/src/metrics.rs | 37 +++++++++++++++++++++++++++++++ pageserver/src/tenant/timeline.rs | 4 ++++ 2 files changed, 41 insertions(+) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 2cfa77f1c5..9b3679e3c2 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -150,6 +150,43 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +pub(crate) struct GetVectoredLatency { + map: EnumMap>, +} + +impl GetVectoredLatency { + // Only these task types perform vectored gets. Filter all other tasks out to reduce total + // cardinality of the metric. 
+ const TRACKED_TASK_KINDS: [TaskKind; 2] = [TaskKind::Compaction, TaskKind::PageRequestHandler]; + + pub(crate) fn for_task_kind(&self, task_kind: TaskKind) -> Option<&Histogram> { + self.map[task_kind].as_ref() + } +} + +pub(crate) static GET_VECTORED_LATENCY: Lazy = Lazy::new(|| { + let inner = register_histogram_vec!( + "pageserver_get_vectored_seconds", + "Time spent in get_vectored", + &["task_kind"], + CRITICAL_OP_BUCKETS.into(), + ) + .expect("failed to define a metric"); + + GetVectoredLatency { + map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| { + let task_kind = ::from_usize(task_kind_idx); + + if GetVectoredLatency::TRACKED_TASK_KINDS.contains(&task_kind) { + let task_kind = task_kind.into(); + Some(inner.with_label_values(&[task_kind])) + } else { + None + } + })), + } +}); + pub(crate) struct PageCacheMetricsForTaskKind { pub read_accesses_materialized_page: IntCounter, pub read_accesses_immutable: IntCounter, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index c21fe94d01..70c6ee2042 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -678,6 +678,10 @@ impl Timeline { return Err(GetVectoredError::Oversized(key_count)); } + let _timer = crate::metrics::GET_VECTORED_LATENCY + .for_task_kind(ctx.task_kind()) + .map(|t| t.start_timer()); + let mut values = BTreeMap::new(); for range in key_ranges { let mut key = range.start; From 55b7cde665294e4dfcfd0898c26f42c6c6b88d57 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 26 Jan 2024 14:40:47 +0000 Subject: [PATCH 0005/1571] tests: add basic coverage for sharding (#6380) ## Problem The support for sharding in the pageserver was written before https://github.com/neondatabase/neon/pull/6205 landed, so when it landed we couldn't directly test sharding. ## Summary of changes - Add `test_sharding_smoke` which tests the basics of creating a sharding tenant, creating a timeline within it, checking that data within it is distributed. - Add modes to pg_regress tests for running with 4 shards as well as with 1. --- pageserver/src/walingest.rs | 18 +++- test_runner/fixtures/workload.py | 13 ++- .../regress/test_pageserver_restart.py | 24 ++++-- test_runner/regress/test_pg_regress.py | 53 ++++++++---- test_runner/regress/test_sharding.py | 85 +++++++++++++++++++ 5 files changed, 170 insertions(+), 23 deletions(-) create mode 100644 test_runner/regress/test_sharding.py diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 3183608862..5a6f9a590f 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1033,7 +1033,23 @@ impl WalIngest { // Copy content debug!("copying rel {} to {}, {} blocks", src_rel, dst_rel, nblocks); for blknum in 0..nblocks { - debug!("copying block {} from {} to {}", blknum, src_rel, dst_rel); + // Sharding: + // - src and dst are always on the same shard, because they differ only by dbNode, and + // dbNode is not included in the hash inputs for sharding. + // - This WAL command is replayed on all shards, but each shard only copies the blocks + // that belong to it. 
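+            // (Since dst_rel differs from src_rel only by dbNode, the corresponding dst block
+            // is local exactly when the src block is, so checking src_key alone is sufficient.)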
+ let src_key = rel_block_to_key(src_rel, blknum); + if !self.shard.is_key_local(&src_key) { + debug!( + "Skipping non-local key {} during XLOG_DBASE_CREATE", + src_key + ); + continue; + } + debug!( + "copying block {} from {} ({}) to {}", + blknum, src_rel, src_key, dst_rel + ); let content = modification .tline diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py index 30def1194d..f29a6cbf3c 100644 --- a/test_runner/fixtures/workload.py +++ b/test_runner/fixtures/workload.py @@ -21,12 +21,21 @@ class Workload: - reads, checking we get the right data (`validate`) """ - def __init__(self, env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId): + def __init__( + self, + env: NeonEnv, + tenant_id: TenantId, + timeline_id: TimelineId, + branch_name: Optional[str] = None, + ): self.env = env self.tenant_id = tenant_id self.timeline_id = timeline_id self.table = "foo" + # By default, use the default branch name for initial tenant in NeonEnv + self.branch_name = branch_name or "main" + self.expect_rows = 0 self.churn_cursor = 0 @@ -35,7 +44,7 @@ class Workload: def endpoint(self, pageserver_id: Optional[int] = None) -> Endpoint: if self._endpoint is None: self._endpoint = self.env.endpoints.create( - "main", + self.branch_name, tenant_id=self.tenant_id, pageserver_id=pageserver_id, endpoint_id="ep-workload", diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index c4499196b5..753898f747 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -1,4 +1,6 @@ +import random from contextlib import closing +from typing import Optional import pytest from fixtures.log_helper import log @@ -141,18 +143,24 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): # Test that repeatedly kills and restarts the page server, while the # safekeeper and compute node keep running. @pytest.mark.timeout(540) -def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder, build_type: str): +@pytest.mark.parametrize("shard_count", [None, 4]) +def test_pageserver_chaos( + neon_env_builder: NeonEnvBuilder, build_type: str, shard_count: Optional[int] +): if build_type == "debug": pytest.skip("times out in debug builds") neon_env_builder.enable_pageserver_remote_storage(s3_storage()) neon_env_builder.enable_scrub_on_exit() + if shard_count is not None: + neon_env_builder.num_pageservers = shard_count - env = neon_env_builder.init_start() + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) # these can happen, if we shutdown at a good time. to be fixed as part of #5172. message = ".*duplicated L1 layer layer=.*" - env.pageserver.allowed_errors.append(message) + for ps in env.pageservers: + ps.allowed_errors.append(message) # Use a tiny checkpoint distance, to create a lot of layers quickly. # That allows us to stress the compaction and layer flushing logic more. @@ -192,13 +200,19 @@ def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder, build_type: str): log.info(f"shared_buffers is {row[0]}, table size {row[1]}") assert int(row[0]) < int(row[1]) + # We run "random" kills using a fixed seed, to improve reproducibility if a test + # failure is related to a particular order of operations. 
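+    # When shard_count is set, the environment runs multiple pageservers, and each kill
+    # below picks its victim among all of them via rng.choice(env.pageservers).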
+ seed = 0xDEADBEEF + rng = random.Random(seed) + # Update the whole table, then immediately kill and restart the pageserver for i in range(1, 15): endpoint.safe_psql("UPDATE foo set updates = updates + 1") # This kills the pageserver immediately, to simulate a crash - env.pageserver.stop(immediate=True) - env.pageserver.start() + to_kill = rng.choice(env.pageservers) + to_kill.stop(immediate=True) + to_kill.start() # Check that all the updates are visible num_updates = endpoint.safe_psql("SELECT sum(updates) FROM foo")[0][0] diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index f26d04e2f3..e4219ec7a6 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -2,25 +2,40 @@ # This file runs pg_regress-based tests. # from pathlib import Path +from typing import Optional -from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content +import pytest +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + check_restored_datadir_content, +) +from fixtures.remote_storage import s3_storage # Run the main PostgreSQL regression tests, in src/test/regress. # +@pytest.mark.parametrize("shard_count", [None, 4]) def test_pg_regress( - neon_simple_env: NeonEnv, + neon_env_builder: NeonEnvBuilder, test_output_dir: Path, pg_bin, capsys, base_dir: Path, pg_distrib_dir: Path, + shard_count: Optional[int], ): - env = neon_simple_env + """ + :param shard_count: if None, create an unsharded tenant. Otherwise create a tenant with this + many shards. + """ + if shard_count is not None: + neon_env_builder.num_pageservers = shard_count + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.enable_scrub_on_exit() + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) - env.neon_cli.create_branch("test_pg_regress", "empty") # Connect to postgres and create a database called "regression". - endpoint = env.endpoints.create_start("test_pg_regress") + endpoint = env.endpoints.create_start("main") endpoint.safe_psql("CREATE DATABASE regression") # Create some local directories for pg_regress to run in. @@ -61,22 +76,25 @@ def test_pg_regress( # Run the PostgreSQL "isolation" tests, in src/test/isolation. # +@pytest.mark.parametrize("shard_count", [None, 4]) def test_isolation( - neon_simple_env: NeonEnv, + neon_env_builder: NeonEnvBuilder, test_output_dir: Path, pg_bin, capsys, base_dir: Path, pg_distrib_dir: Path, + shard_count: Optional[int], ): - env = neon_simple_env + if shard_count is not None: + neon_env_builder.num_pageservers = shard_count + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.enable_scrub_on_exit() + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) - env.neon_cli.create_branch("test_isolation", "empty") # Connect to postgres and create a database called "regression". # isolation tests use prepared transactions, so enable them - endpoint = env.endpoints.create_start( - "test_isolation", config_lines=["max_prepared_transactions=100"] - ) + endpoint = env.endpoints.create_start("main", config_lines=["max_prepared_transactions=100"]) endpoint.safe_psql("CREATE DATABASE isolation_regression") # Create some local directories for pg_isolation_regress to run in. @@ -114,19 +132,24 @@ def test_isolation( # Run extra Neon-specific pg_regress-based tests. The tests and their # schedule file are in the sql_regress/ directory. 
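+# As in the tests above, shard_count=None exercises an unsharded tenant, while shard_count=4
+# creates a four-shard tenant backed by S3-compatible remote storage (scrubbed on exit).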
+@pytest.mark.parametrize("shard_count", [None, 4]) def test_sql_regress( - neon_simple_env: NeonEnv, + neon_env_builder: NeonEnvBuilder, test_output_dir: Path, pg_bin, capsys, base_dir: Path, pg_distrib_dir: Path, + shard_count: Optional[int], ): - env = neon_simple_env + if shard_count is not None: + neon_env_builder.num_pageservers = shard_count + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.enable_scrub_on_exit() + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) - env.neon_cli.create_branch("test_sql_regress", "empty") # Connect to postgres and create a database called "regression". - endpoint = env.endpoints.create_start("test_sql_regress") + endpoint = env.endpoints.create_start("main") endpoint.safe_psql("CREATE DATABASE regression") # Create some local directories for pg_regress to run in. diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py new file mode 100644 index 0000000000..c16bfc2ec6 --- /dev/null +++ b/test_runner/regress/test_sharding.py @@ -0,0 +1,85 @@ +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnvBuilder, +) +from fixtures.remote_storage import s3_storage +from fixtures.types import TimelineId +from fixtures.workload import Workload + + +def test_sharding_smoke( + neon_env_builder: NeonEnvBuilder, +): + """ + Test the basic lifecycle of a sharded tenant: + - ingested data gets split up + - page service reads + - timeline creation and deletion + - splits + """ + + shard_count = 4 + neon_env_builder.num_pageservers = shard_count + + # 1MiB stripes: enable getting some meaningful data distribution without + # writing large quantities of data in this test. The stripe size is given + # in number of 8KiB pages. + stripe_size = 128 + + # Use S3-compatible remote storage so that we can scrub: this test validates + # that the scrubber doesn't barf when it sees a sharded tenant. 
+ neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.enable_scrub_on_exit() + + neon_env_builder.preserve_database_files = True + + env = neon_env_builder.init_start( + initial_tenant_shard_count=shard_count, initial_tenant_shard_stripe_size=stripe_size + ) + tenant_id = env.initial_tenant + + pageservers = dict((int(p.id), p) for p in env.pageservers) + shards = env.attachment_service.locate(tenant_id) + + def get_sizes(): + sizes = {} + for shard in shards: + node_id = int(shard["node_id"]) + pageserver = pageservers[node_id] + sizes[node_id] = pageserver.http_client().tenant_status(shard["shard_id"])[ + "current_physical_size" + ] + log.info(f"sizes = {sizes}") + return sizes + + # Test that timeline creation works on a sharded tenant + timeline_b = env.neon_cli.create_branch("branch_b", tenant_id=tenant_id) + + # Test that we can write data to a sharded tenant + workload = Workload(env, tenant_id, timeline_b, branch_name="branch_b") + workload.init() + + sizes_before = get_sizes() + workload.write_rows(256) + + # Test that we can read data back from a sharded tenant + workload.validate() + + # Validate that the data is spread across pageservers + sizes_after = get_sizes() + # Our sizes increased when we wrote data + assert sum(sizes_after.values()) > sum(sizes_before.values()) + # That increase is present on all shards + assert all(sizes_after[ps.id] > sizes_before[ps.id] for ps in env.pageservers) + + # Validate that timeline list API works properly on all shards + for shard in shards: + node_id = int(shard["node_id"]) + pageserver = pageservers[node_id] + timelines = set( + TimelineId(tl["timeline_id"]) + for tl in pageserver.http_client().timeline_list(shard["shard_id"]) + ) + assert timelines == {env.initial_timeline, timeline_b} + + # TODO: test timeline deletion and tenant deletion (depends on change in attachment_service) From 4c245b0f5abda55083bd9e4a87375185cc1f3528 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 26 Jan 2024 16:12:49 +0000 Subject: [PATCH 0006/1571] update_build_tools_image.yml: Push build-tools image to Docker Hub (#6481) ## Problem - `docker.io/neondatabase/build-tools:pinned` image is frequently outdated on Docker Hub because there's no automated way to update it. - `update_build_tools_image.yml` workflow contains legacy roll-back logic, which is not required anymore because it updates only a single image. 
## Summary of changes - Make `update_build_tools_image.yml` workflow push images to both ECR and Docker Hub - Remove unneeded roll-back logic --- .../workflows/build_and_push_docker_image.yml | 25 +++- .../workflows/update_build_tools_image.yml | 122 +++++------------- 2 files changed, 53 insertions(+), 94 deletions(-) diff --git a/.github/workflows/build_and_push_docker_image.yml b/.github/workflows/build_and_push_docker_image.yml index e401b2f418..892e21114b 100644 --- a/.github/workflows/build_and_push_docker_image.yml +++ b/.github/workflows/build_and_push_docker_image.yml @@ -69,7 +69,15 @@ jobs: run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - name: Kaniko build - run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 + run: | + /kaniko/executor \ + --reproducible \ + --snapshotMode=redo \ + --skip-unused-stages \ + --dockerfile ${{ inputs.dockerfile-path }} \ + --cache=true \ + --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \ + --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 kaniko-arm: if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true' @@ -85,7 +93,15 @@ jobs: run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - name: Kaniko build - run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64 + run: | + /kaniko/executor \ + --reproducible \ + --snapshotMode=redo \ + --skip-unused-stages \ + --dockerfile ${{ inputs.dockerfile-path }} \ + --cache=true \ + --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \ + --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64 manifest: if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true' @@ -99,7 +115,10 @@ jobs: steps: - name: Create manifest - run: docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64 + run: | + docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} \ + --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 \ + --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64 - name: Push manifest run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} diff --git a/.github/workflows/update_build_tools_image.yml 
b/.github/workflows/update_build_tools_image.yml index 88bab797b7..900724fc60 100644 --- a/.github/workflows/update_build_tools_image.yml +++ b/.github/workflows/update_build_tools_image.yml @@ -20,111 +20,51 @@ defaults: run: shell: bash -euo pipefail {0} -env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} - permissions: {} jobs: tag-image: runs-on: [ self-hosted, gen3, small ] - container: golang:1.19-bullseye env: - IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools - FROM_TAG: ${{ inputs.from-tag }} - TO_TAG: ${{ inputs.to-tag }} - outputs: - next-digest-buildtools: ${{ steps.next-digest.outputs.next-digest-buildtools }} - prev-digest-buildtools: ${{ steps.prev-digest.outputs.prev-digest-buildtools }} - - steps: - - name: Install Crane & ECR helper - run: | - go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1 - go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1 - - - name: Configure ECR login - run: | - mkdir /github/home/.docker/ - echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json - - - name: Get source image digest - id: next-digest - run: | - NEXT_DIGEST=$(crane digest ${IMAGE}:${FROM_TAG} || true) - if [ -z "${NEXT_DIGEST}" ]; then - echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist" - exit 1 - fi - - echo "Current ${IMAGE}@${FROM_TAG} image is ${IMAGE}@${NEXT_DIGEST}" - echo "next-digest-buildtools=$NEXT_DIGEST" >> $GITHUB_OUTPUT - - - name: Get destination image digest (if already exists) - id: prev-digest - run: | - PREV_DIGEST=$(crane digest ${IMAGE}:${TO_TAG} || true) - if [ -z "${PREV_DIGEST}" ]; then - echo >&2 "Image ${IMAGE}:${TO_TAG} does not exist (it's ok)" - else - echo >&2 "Current ${IMAGE}@${TO_TAG} image is ${IMAGE}@${PREV_DIGEST}" - - echo "prev-digest-buildtools=$PREV_DIGEST" >> $GITHUB_OUTPUT - fi - - - name: Tag image - run: | - crane tag "${IMAGE}:${FROM_TAG}" "${TO_TAG}" - - rollback-tag-image: - needs: tag-image - if: ${{ !success() }} - - runs-on: [ self-hosted, gen3, small ] - container: golang:1.19-bullseye - - env: - IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools + ECR_IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools + DOCKER_HUB_IMAGE: docker.io/neondatabase/build-tools FROM_TAG: ${{ inputs.from-tag }} TO_TAG: ${{ inputs.to-tag }} steps: - - name: Install Crane & ECR helper + # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings + # The default value is ~/.docker + - name: Set custom docker config directory run: | - go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1 - go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1 + mkdir -p .docker-custom + echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV - - name: Configure ECR login + - uses: docker/login-action@v2 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + + - uses: docker/login-action@v2 + with: + registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com + username: ${{ secrets.AWS_ACCESS_KEY_DEV }} + password: ${{ secrets.AWS_SECRET_KEY_DEV }} + + - uses: actions/setup-go@v5 + with: + go-version: '1.21' + + - name: Install crane run: | - mkdir 
/github/home/.docker/ - echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json + go install github.com/google/go-containerregistry/cmd/crane@a0658aa1d0cc7a7f1bcc4a3af9155335b6943f40 # v0.18.0 - - name: Restore previous tag if needed + - name: Copy images run: | - NEXT_DIGEST="${{ needs.tag-image.outputs.next-digest-buildtools }}" - PREV_DIGEST="${{ needs.tag-image.outputs.prev-digest-buildtools }}" + crane copy "${ECR_IMAGE}:${FROM_TAG}" "${ECR_IMAGE}:${TO_TAG}" + crane copy "${ECR_IMAGE}:${FROM_TAG}" "${DOCKER_HUB_IMAGE}:${TO_TAG}" - if [ -z "${NEXT_DIGEST}" ]; then - echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist, nothing to rollback" - exit 0 - fi - - if [ -z "${PREV_DIGEST}" ]; then - # I guess we should delete the tag here/untag the image, but crane does not support it - # - https://github.com/google/go-containerregistry/issues/999 - - echo >&2 "Image ${IMAGE}:${TO_TAG} did not exist, but it was created by the job, no need to rollback" - - exit 0 - fi - - CURRENT_DIGEST=$(crane digest "${IMAGE}:${TO_TAG}") - if [ "${CURRENT_DIGEST}" == "${NEXT_DIGEST}" ]; then - crane tag "${IMAGE}@${PREV_DIGEST}" "${TO_TAG}" - - echo >&2 "Successfully restored ${TO_TAG} tag from ${IMAGE}@${CURRENT_DIGEST} to ${IMAGE}@${PREV_DIGEST}" - else - echo >&2 "Image ${IMAGE}:${TO_TAG}@${CURRENT_DIGEST} is not required to be restored" - fi + - name: Remove custom docker config directory + if: always() + run: | + rm -rf .docker-custom From dcc7610ad67c4a1d9f00c884a044f33c0b4d1de0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 26 Jan 2024 17:43:56 +0100 Subject: [PATCH 0007/1571] Do backoff::retry in s3 timetravel test (#6493) The top level retries weren't enough, probably because we do so many network requests. Fine grained retries ensure that there is higher potential for the entire test to succeed. To demonstrate this, consider the following example: let's assume that each request has 5% chance of failing and we do 10 requests. Then chances of success without any retries is 0.95^10 = 0.6. With 3 top level retries it is 1-0.4^3 = 0.936. With 3 fine grained retries it is (1-0.05^3)^10 = 0.9988 (roundings implicit). So chances of failure are 6.4% for the top level retry vs 0.12% for the fine grained retry. Follow-up of #6155 --- libs/remote_storage/tests/test_real_s3.rs | 62 ++++++++++++++++++----- 1 file changed, 49 insertions(+), 13 deletions(-) diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index 9e1b989e4d..679be66bf7 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -1,4 +1,5 @@ use std::env; +use std::fmt::{Debug, Display}; use std::num::NonZeroUsize; use std::ops::ControlFlow; use std::sync::Arc; @@ -8,6 +9,7 @@ use std::{collections::HashSet, time::SystemTime}; use crate::common::{download_to_vec, upload_stream}; use anyhow::Context; use camino::Utf8Path; +use futures_util::Future; use remote_storage::{ GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config, }; @@ -22,6 +24,7 @@ mod common; mod tests_s3; use common::{cleanup, ensure_logging_ready, upload_remote_data, upload_simple_remote_data}; +use utils::backoff; const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE"; @@ -39,6 +42,25 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: // to take the time from S3 response headers. 
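+    // The individual S3 calls in this test are wrapped in the `retry` helper defined below,
+    // so transient request failures are retried per call rather than by re-running the whole
+    // test (see the commit message above for the probability argument).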
const WAIT_TIME: Duration = Duration::from_millis(3_000); + async fn retry(op: O) -> Result + where + E: Display + Debug + 'static, + O: FnMut() -> F, + F: Future>, + { + let warn_threshold = 3; + let max_retries = 10; + backoff::retry( + op, + |_e| false, + warn_threshold, + max_retries, + "test retry", + backoff::Cancel::new(CancellationToken::new(), || unreachable!()), + ) + .await + } + async fn time_point() -> SystemTime { tokio::time::sleep(WAIT_TIME).await; let ret = SystemTime::now(); @@ -47,8 +69,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: } async fn list_files(client: &Arc) -> anyhow::Result> { - Ok(client - .list_files(None) + Ok(retry(|| client.list_files(None)) .await .context("list root files failure")? .into_iter() @@ -64,16 +85,23 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str())) .with_context(|| "RemotePath conversion")?; - let (data, len) = upload_stream("remote blob data1".as_bytes().into()); - ctx.client.upload(data, len, &path1, None).await?; + retry(|| { + let (data, len) = upload_stream("remote blob data1".as_bytes().into()); + ctx.client.upload(data, len, &path1, None) + }) + .await?; let t0_files = list_files(&ctx.client).await?; let t0 = time_point().await; println!("at t0: {t0_files:?}"); let old_data = "remote blob data2"; - let (data, len) = upload_stream(old_data.as_bytes().into()); - ctx.client.upload(data, len, &path2, None).await?; + + retry(|| { + let (data, len) = upload_stream(old_data.as_bytes().into()); + ctx.client.upload(data, len, &path2, None) + }) + .await?; let t1_files = list_files(&ctx.client).await?; let t1 = time_point().await; @@ -81,7 +109,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: // A little check to ensure that our clock is not too far off from the S3 clock { - let dl = ctx.client.download(&path2).await?; + let dl = retry(|| ctx.client.download(&path2)).await?; let last_modified = dl.last_modified.unwrap(); let half_wt = WAIT_TIME.mul_f32(0.5); let t0_hwt = t0 + half_wt; @@ -92,15 +120,21 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: } } - let (data, len) = upload_stream("remote blob data3".as_bytes().into()); - ctx.client.upload(data, len, &path3, None).await?; + retry(|| { + let (data, len) = upload_stream("remote blob data3".as_bytes().into()); + ctx.client.upload(data, len, &path3, None) + }) + .await?; let new_data = "new remote blob data2"; - let (data, len) = upload_stream(new_data.as_bytes().into()); - ctx.client.upload(data, len, &path2, None).await?; - ctx.client.delete(&path1).await?; + retry(|| { + let (data, len) = upload_stream(new_data.as_bytes().into()); + ctx.client.upload(data, len, &path2, None) + }) + .await?; + retry(|| ctx.client.delete(&path1)).await?; let t2_files = list_files(&ctx.client).await?; let t2 = time_point().await; println!("at t2: {t2_files:?}"); @@ -137,7 +171,9 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: assert_eq!(t0_files, t0_files_recovered); // cleanup - ctx.client.delete_objects(&[path1, path2, path3]).await?; + + let paths = &[path1, path2, path3]; + retry(|| ctx.client.delete_objects(paths)).await?; Ok(()) } From 58f6cb649e42ff9f2fb82efda8dec7dd3f947434 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 26 Jan 2024 17:20:44 +0000 Subject: [PATCH 0008/1571] control_plane: database persistence for 
attachment_service (#6468) ## Problem Spun off from https://github.com/neondatabase/neon/pull/6394 -- this PR is just the persistence parts and the changes that enable it to work nicely ## Summary of changes - Revert #6444 and #6450 - In neon_local, start a vanilla postgres instance for the attachment service to use. - Adopt `diesel` crate for database access in attachment service. This uses raw SQL migrations as the source of truth for the schema, so it's a soft dependency: we can switch libraries pretty easily. - Rewrite persistence.rs to use postgres (via diesel) instead of JSON. - Preserve JSON read+write at startup and shutdown: this enables using the JSON format in compatibility tests, so that we don't have to commit to our DB schema yet. - In neon_local, run database creation + migrations before starting attachment service - Run the initial reconciliation in Service::spawn in the background, so that the pageserver + attachment service don't get stuck waiting for each other to start, when restarting both together in a test. --- Cargo.lock | 81 ++- control_plane/Cargo.toml | 2 + control_plane/attachment_service/Cargo.toml | 3 +- .../attachment_service/migrations/.keep | 0 .../down.sql | 6 + .../up.sql | 36 ++ .../down.sql | 1 + .../up.sql | 12 + .../2024-01-07-212945_create_nodes/down.sql | 1 + .../2024-01-07-212945_create_nodes/up.sql | 10 + control_plane/attachment_service/src/http.rs | 86 ++- control_plane/attachment_service/src/lib.rs | 1 + control_plane/attachment_service/src/main.rs | 46 +- control_plane/attachment_service/src/node.rs | 13 + .../attachment_service/src/persistence.rs | 526 +++++++++++------- .../attachment_service/src/schema.rs | 27 + .../attachment_service/src/service.rs | 294 ++++++---- control_plane/src/attachment_service.rs | 300 ++++++++-- control_plane/src/bin/neon_local.rs | 48 +- control_plane/src/local_env.rs | 6 +- control_plane/src/pageserver.rs | 27 +- diesel.toml | 9 + libs/utils/src/crashsafe.rs | 44 +- pageserver/src/virtual_file.rs | 7 +- test_runner/fixtures/neon_fixtures.py | 38 +- test_runner/regress/test_compatibility.py | 7 + .../regress/test_pageserver_generations.py | 3 +- workspace_hack/Cargo.toml | 5 +- 28 files changed, 1168 insertions(+), 471 deletions(-) create mode 100644 control_plane/attachment_service/migrations/.keep create mode 100644 control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/down.sql create mode 100644 control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/up.sql create mode 100644 control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/down.sql create mode 100644 control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql create mode 100644 control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/down.sql create mode 100644 control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/up.sql create mode 100644 control_plane/attachment_service/src/schema.rs create mode 100644 diesel.toml diff --git a/Cargo.lock b/Cargo.lock index 6e91363de8..f0bcfb762a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -278,6 +278,7 @@ dependencies = [ "camino", "clap", "control_plane", + "diesel", "futures", "git-version", "hyper", @@ -286,7 +287,6 @@ dependencies = [ "pageserver_client", "postgres_backend", "postgres_connection", - "scopeguard", "serde", "serde_json", "thiserror", @@ -1328,6 +1328,8 @@ dependencies = [ "clap", "comfy-table", "compute_api", + "diesel", + "diesel_migrations", "futures", 
"git-version", "hex", @@ -1638,6 +1640,52 @@ dependencies = [ "rusticata-macros", ] +[[package]] +name = "diesel" +version = "2.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62c6fcf842f17f8c78ecf7c81d75c5ce84436b41ee07e03f490fbb5f5a8731d8" +dependencies = [ + "bitflags 2.4.1", + "byteorder", + "diesel_derives", + "itoa", + "pq-sys", + "serde_json", +] + +[[package]] +name = "diesel_derives" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef8337737574f55a468005a83499da720f20c65586241ffea339db9ecdfd2b44" +dependencies = [ + "diesel_table_macro_syntax", + "proc-macro2", + "quote", + "syn 2.0.32", +] + +[[package]] +name = "diesel_migrations" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6036b3f0120c5961381b570ee20a02432d7e2d27ea60de9578799cf9156914ac" +dependencies = [ + "diesel", + "migrations_internals", + "migrations_macros", +] + +[[package]] +name = "diesel_table_macro_syntax" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc5557efc453706fed5e4fa85006fe9817c224c3f480a34c7e5959fd700921c5" +dependencies = [ + "syn 2.0.32", +] + [[package]] name = "digest" version = "0.10.7" @@ -2787,6 +2835,27 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "migrations_internals" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f23f71580015254b020e856feac3df5878c2c7a8812297edd6c0a485ac9dada" +dependencies = [ + "serde", + "toml", +] + +[[package]] +name = "migrations_macros" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cce3325ac70e67bbab5bd837a31cae01f1a6db64e0e744a33cb03a543469ef08" +dependencies = [ + "migrations_internals", + "proc-macro2", + "quote", +] + [[package]] name = "mime" version = "0.3.17" @@ -3795,6 +3864,15 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +[[package]] +name = "pq-sys" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31c0052426df997c0cbd30789eb44ca097e3541717a7b8fa36b1c464ee7edebd" +dependencies = [ + "vcpkg", +] + [[package]] name = "pq_proto" version = "0.1.0" @@ -6623,6 +6701,7 @@ dependencies = [ "clap", "clap_builder", "crossbeam-utils", + "diesel", "either", "fail", "futures-channel", diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 75e5dcb7f8..09c171f1d3 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -10,6 +10,8 @@ async-trait.workspace = true camino.workspace = true clap.workspace = true comfy-table.workspace = true +diesel = { version = "2.1.4", features = ["postgres"]} +diesel_migrations = { version = "2.1.0", features = ["postgres"]} futures.workspace = true git-version.workspace = true nix.workspace = true diff --git a/control_plane/attachment_service/Cargo.toml b/control_plane/attachment_service/Cargo.toml index 743dd806c4..6fc21810bc 100644 --- a/control_plane/attachment_service/Cargo.toml +++ b/control_plane/attachment_service/Cargo.toml @@ -14,7 +14,6 @@ hyper.workspace = true pageserver_api.workspace = true pageserver_client.workspace = true postgres_connection.workspace = true -scopeguard.workspace = true serde.workspace = true serde_json.workspace = true thiserror.workspace = true @@ -26,6 +25,8 @@ tracing.workspace = true # a parsing function 
when loading pageservers from neon_local LocalEnv postgres_backend.workspace = true +diesel = { version = "2.1.4", features = ["serde_json", "postgres"] } + utils = { path = "../../libs/utils/" } metrics = { path = "../../libs/metrics/" } control_plane = { path = ".." } diff --git a/control_plane/attachment_service/migrations/.keep b/control_plane/attachment_service/migrations/.keep new file mode 100644 index 0000000000..e69de29bb2 diff --git a/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/down.sql b/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/down.sql new file mode 100644 index 0000000000..a9f5260911 --- /dev/null +++ b/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/down.sql @@ -0,0 +1,6 @@ +-- This file was automatically created by Diesel to setup helper functions +-- and other internal bookkeeping. This file is safe to edit, any future +-- changes will be added to existing projects as new migrations. + +DROP FUNCTION IF EXISTS diesel_manage_updated_at(_tbl regclass); +DROP FUNCTION IF EXISTS diesel_set_updated_at(); diff --git a/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/up.sql b/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/up.sql new file mode 100644 index 0000000000..d68895b1a7 --- /dev/null +++ b/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/up.sql @@ -0,0 +1,36 @@ +-- This file was automatically created by Diesel to setup helper functions +-- and other internal bookkeeping. This file is safe to edit, any future +-- changes will be added to existing projects as new migrations. + + + + +-- Sets up a trigger for the given table to automatically set a column called +-- `updated_at` whenever the row is modified (unless `updated_at` was included +-- in the modified columns) +-- +-- # Example +-- +-- ```sql +-- CREATE TABLE users (id SERIAL PRIMARY KEY, updated_at TIMESTAMP NOT NULL DEFAULT NOW()); +-- +-- SELECT diesel_manage_updated_at('users'); +-- ``` +CREATE OR REPLACE FUNCTION diesel_manage_updated_at(_tbl regclass) RETURNS VOID AS $$ +BEGIN + EXECUTE format('CREATE TRIGGER set_updated_at BEFORE UPDATE ON %s + FOR EACH ROW EXECUTE PROCEDURE diesel_set_updated_at()', _tbl); +END; +$$ LANGUAGE plpgsql; + +CREATE OR REPLACE FUNCTION diesel_set_updated_at() RETURNS trigger AS $$ +BEGIN + IF ( + NEW IS DISTINCT FROM OLD AND + NEW.updated_at IS NOT DISTINCT FROM OLD.updated_at + ) THEN + NEW.updated_at := current_timestamp; + END IF; + RETURN NEW; +END; +$$ LANGUAGE plpgsql; diff --git a/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/down.sql b/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/down.sql new file mode 100644 index 0000000000..b875b91c00 --- /dev/null +++ b/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/down.sql @@ -0,0 +1 @@ +DROP TABLE tenant_shards; diff --git a/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql b/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql new file mode 100644 index 0000000000..585dbc79a0 --- /dev/null +++ b/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql @@ -0,0 +1,12 @@ +CREATE TABLE tenant_shards ( + tenant_id VARCHAR NOT NULL, + shard_number INTEGER NOT NULL, + shard_count INTEGER NOT NULL, + PRIMARY KEY(tenant_id, 
shard_number, shard_count), + shard_stripe_size INTEGER NOT NULL, + generation INTEGER NOT NULL, + generation_pageserver BIGINT NOT NULL, + placement_policy VARCHAR NOT NULL, + -- config is JSON encoded, opaque to the database. + config TEXT NOT NULL +); \ No newline at end of file diff --git a/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/down.sql b/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/down.sql new file mode 100644 index 0000000000..ec303bc8cf --- /dev/null +++ b/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/down.sql @@ -0,0 +1 @@ +DROP TABLE nodes; diff --git a/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/up.sql b/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/up.sql new file mode 100644 index 0000000000..9be0880fa4 --- /dev/null +++ b/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/up.sql @@ -0,0 +1,10 @@ +CREATE TABLE nodes ( + node_id BIGINT PRIMARY KEY NOT NULL, + + scheduling_policy VARCHAR NOT NULL, + + listen_http_addr VARCHAR NOT NULL, + listen_http_port INTEGER NOT NULL, + listen_pg_addr VARCHAR NOT NULL, + listen_pg_port INTEGER NOT NULL +); \ No newline at end of file diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs index 30f6dd66ee..81f21a8e7a 100644 --- a/control_plane/attachment_service/src/http.rs +++ b/control_plane/attachment_service/src/http.rs @@ -1,5 +1,5 @@ use crate::reconciler::ReconcileError; -use crate::service::Service; +use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT}; use hyper::{Body, Request, Response}; use hyper::{StatusCode, Uri}; use pageserver_api::models::{TenantCreateRequest, TimelineCreateRequest}; @@ -104,34 +104,34 @@ async fn handle_inspect(mut req: Request) -> Result, ApiErr json_response(StatusCode::OK, state.service.inspect(inspect_req)) } -async fn handle_tenant_create(mut req: Request) -> Result, ApiError> { +async fn handle_tenant_create( + service: Arc, + mut req: Request, +) -> Result, ApiError> { let create_req = json_request::(&mut req).await?; - let state = get_state(&req); - json_response( - StatusCode::OK, - state.service.tenant_create(create_req).await?, - ) + json_response(StatusCode::OK, service.tenant_create(create_req).await?) } -async fn handle_tenant_timeline_create(mut req: Request) -> Result, ApiError> { +async fn handle_tenant_timeline_create( + service: Arc, + mut req: Request, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; let create_req = json_request::(&mut req).await?; - - let state = get_state(&req); json_response( StatusCode::OK, - state - .service + service .tenant_timeline_create(tenant_id, create_req) .await?, ) } -async fn handle_tenant_locate(req: Request) -> Result, ApiError> { +async fn handle_tenant_locate( + service: Arc, + req: Request, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; - let state = get_state(&req); - - json_response(StatusCode::OK, state.service.tenant_locate(tenant_id)?) + json_response(StatusCode::OK, service.tenant_locate(tenant_id)?) } async fn handle_node_register(mut req: Request) -> Result, ApiError> { @@ -154,14 +154,15 @@ async fn handle_node_configure(mut req: Request) -> Result, json_response(StatusCode::OK, state.service.node_configure(config_req)?) 
} -async fn handle_tenant_shard_migrate(mut req: Request) -> Result, ApiError> { +async fn handle_tenant_shard_migrate( + service: Arc, + mut req: Request, +) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?; let migrate_req = json_request::(&mut req).await?; - let state = get_state(&req); json_response( StatusCode::OK, - state - .service + service .tenant_shard_migrate(tenant_shard_id, migrate_req) .await?, ) @@ -178,6 +179,35 @@ impl From for ApiError { } } +/// Common wrapper for request handlers that call into Service and will operate on tenants: they must only +/// be allowed to run if Service has finished its initial reconciliation. +async fn tenant_service_handler(request: Request, handler: H) -> R::Output +where + R: std::future::Future, ApiError>> + Send + 'static, + H: FnOnce(Arc, Request) -> R + Send + Sync + 'static, +{ + let state = get_state(&request); + let service = state.service.clone(); + + let startup_complete = service.startup_complete.clone(); + if tokio::time::timeout(STARTUP_RECONCILE_TIMEOUT, startup_complete.wait()) + .await + .is_err() + { + // This shouldn't happen: it is the responsibilty of [`Service::startup_reconcile`] to use appropriate + // timeouts around its remote calls, to bound its runtime. + return Err(ApiError::Timeout( + "Timed out waiting for service readiness".into(), + )); + } + + request_span( + request, + |request| async move { handler(service, request).await }, + ) + .await +} + pub fn make_router( service: Arc, auth: Option>, @@ -205,14 +235,20 @@ pub fn make_router( .put("/node/:node_id/config", |r| { request_span(r, handle_node_configure) }) - .post("/tenant", |r| request_span(r, handle_tenant_create)) - .post("/tenant/:tenant_id/timeline", |r| { - request_span(r, handle_tenant_timeline_create) + .post("/v1/tenant", |r| { + tenant_service_handler(r, handle_tenant_create) + }) + .post("/v1/tenant/:tenant_id/timeline", |r| { + tenant_service_handler(r, handle_tenant_timeline_create) }) .get("/tenant/:tenant_id/locate", |r| { - request_span(r, handle_tenant_locate) + tenant_service_handler(r, handle_tenant_locate) }) .put("/tenant/:tenant_shard_id/migrate", |r| { - request_span(r, handle_tenant_shard_migrate) + tenant_service_handler(r, handle_tenant_shard_migrate) }) + // Path aliases for tests_forward_compatibility + // TODO: remove these in future PR + .post("/re-attach", |r| request_span(r, handle_re_attach)) + .post("/validate", |r| request_span(r, handle_validate)) } diff --git a/control_plane/attachment_service/src/lib.rs b/control_plane/attachment_service/src/lib.rs index e4ca9aa304..082afb4157 100644 --- a/control_plane/attachment_service/src/lib.rs +++ b/control_plane/attachment_service/src/lib.rs @@ -7,6 +7,7 @@ mod node; pub mod persistence; mod reconciler; mod scheduler; +mod schema; pub mod service; mod tenant_state; diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs index 38e51b9a9e..05a3895dfa 100644 --- a/control_plane/attachment_service/src/main.rs +++ b/control_plane/attachment_service/src/main.rs @@ -12,9 +12,9 @@ use camino::Utf8PathBuf; use clap::Parser; use metrics::launch_timestamp::LaunchTimestamp; use std::sync::Arc; +use tokio::signal::unix::SignalKind; use utils::auth::{JwtAuth, SwappableJwtAuth}; use utils::logging::{self, LogFormat}; -use utils::signals::{ShutdownSignals, Signal}; use utils::{project_build_tag, project_git_version, tcp_listener}; @@ -40,6 +40,10 @@ struct Cli { /// Path to the .json file 
to store state (will be created if it doesn't exist) #[arg(short, long)] path: Utf8PathBuf, + + /// URL to connect to postgres, like postgresql://localhost:1234/attachment_service + #[arg(long)] + database_url: String, } #[tokio::main] @@ -66,9 +70,14 @@ async fn main() -> anyhow::Result<()> { jwt_token: args.jwt_token, }; - let persistence = Arc::new(Persistence::spawn(&args.path).await); + let json_path = if args.path.as_os_str().is_empty() { + None + } else { + Some(args.path) + }; + let persistence = Arc::new(Persistence::new(args.database_url, json_path.clone())); - let service = Service::spawn(config, persistence).await?; + let service = Service::spawn(config, persistence.clone()).await?; let http_listener = tcp_listener::bind(args.listen)?; @@ -81,20 +90,31 @@ async fn main() -> anyhow::Result<()> { let router = make_router(service, auth) .build() .map_err(|err| anyhow!(err))?; - let service = utils::http::RouterService::new(router).unwrap(); - let server = hyper::Server::from_tcp(http_listener)?.serve(service); + let router_service = utils::http::RouterService::new(router).unwrap(); + let server = hyper::Server::from_tcp(http_listener)?.serve(router_service); tracing::info!("Serving on {0}", args.listen); tokio::task::spawn(server); - ShutdownSignals::handle(|signal| match signal { - Signal::Interrupt | Signal::Terminate | Signal::Quit => { - tracing::info!("Got {}. Terminating", signal.name()); - // We're just a test helper: no graceful shutdown. - std::process::exit(0); - } - })?; + // Wait until we receive a signal + let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt())?; + let mut sigquit = tokio::signal::unix::signal(SignalKind::quit())?; + let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate())?; + tokio::select! { + _ = sigint.recv() => {}, + _ = sigterm.recv() => {}, + _ = sigquit.recv() => {}, + } + tracing::info!("Terminating on signal"); - Ok(()) + if json_path.is_some() { + // Write out a JSON dump on shutdown: this is used in compat tests to avoid passing + // full postgres dumps around. 
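+        // A failure to write the dump is only logged; the process still exits with status 0 below.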
+ if let Err(e) = persistence.write_tenants_json().await { + tracing::error!("Failed to write JSON on shutdown: {e}") + } + } + + std::process::exit(0); } diff --git a/control_plane/attachment_service/src/node.rs b/control_plane/attachment_service/src/node.rs index efd3f8f49b..47f61702d8 100644 --- a/control_plane/attachment_service/src/node.rs +++ b/control_plane/attachment_service/src/node.rs @@ -1,6 +1,8 @@ use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy}; use utils::id::NodeId; +use crate::persistence::NodePersistence; + #[derive(Clone)] pub(crate) struct Node { pub(crate) id: NodeId, @@ -34,4 +36,15 @@ impl Node { NodeSchedulingPolicy::Pause => false, } } + + pub(crate) fn to_persistent(&self) -> NodePersistence { + NodePersistence { + node_id: self.id.0 as i64, + scheduling_policy: self.scheduling.into(), + listen_http_addr: self.listen_http_addr.clone(), + listen_http_port: self.listen_http_port as i32, + listen_pg_addr: self.listen_pg_addr.clone(), + listen_pg_port: self.listen_pg_port as i32, + } + } } diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs index e944a2e9ed..b27bd2bf2e 100644 --- a/control_plane/attachment_service/src/persistence.rs +++ b/control_plane/attachment_service/src/persistence.rs @@ -1,182 +1,161 @@ -use std::{collections::HashMap, str::FromStr}; +use std::collections::HashMap; +use std::str::FromStr; -use camino::{Utf8Path, Utf8PathBuf}; -use control_plane::{ - attachment_service::{NodeAvailability, NodeSchedulingPolicy}, - local_env::LocalEnv, -}; -use pageserver_api::{ - models::TenantConfig, - shard::{ShardCount, ShardNumber, TenantShardId}, -}; +use camino::Utf8Path; +use camino::Utf8PathBuf; +use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy}; +use diesel::pg::PgConnection; +use diesel::prelude::*; +use diesel::Connection; +use pageserver_api::models::TenantConfig; +use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId}; use postgres_connection::parse_host_port; use serde::{Deserialize, Serialize}; -use tracing::info; -use utils::{ - generation::Generation, - id::{NodeId, TenantId}, -}; +use utils::generation::Generation; +use utils::id::{NodeId, TenantId}; -use crate::{node::Node, PlacementPolicy}; +use crate::node::Node; +use crate::PlacementPolicy; -/// Placeholder for storage. This will be replaced with a database client. +/// ## What do we store? +/// +/// The attachment service does not store most of its state durably. +/// +/// The essential things to store durably are: +/// - generation numbers, as these must always advance monotonically to ensure data safety. +/// - Tenant's PlacementPolicy and TenantConfig, as the source of truth for these is something external. +/// - Node's scheduling policies, as the source of truth for these is something external. +/// +/// Other things we store durably as an implementation detail: +/// - Node's host/port: this could be avoided it we made nodes emit a self-registering heartbeat, +/// but it is operationally simpler to make this service the authority for which nodes +/// it talks to. +/// +/// ## Performance/efficiency +/// +/// The attachment service does not go via the database for most things: there are +/// a couple of places where we must, and where efficiency matters: +/// - Incrementing generation numbers: the Reconciler has to wait for this to complete +/// before it can attach a tenant, so this acts as a bound on how fast things like +/// failover can happen. 
+/// - Pageserver re-attach: we will increment many shards' generations when this happens, +/// so it is important to avoid e.g. issuing O(N) queries. +/// +/// Database calls relating to nodes have low performance requirements, as they are very rarely +/// updated, and reads of nodes are always from memory, not the database. We only require that +/// we can UPDATE a node's scheduling mode reasonably quickly to mark a bad node offline. pub struct Persistence { - inner: std::sync::Mutex, -} - -struct Inner { - state: PersistentState, - write_queue_tx: tokio::sync::mpsc::UnboundedSender, + database_url: String, + + // In test environments, we support loading+saving a JSON file. This is temporary, for the benefit of + // test_compatibility.py, so that we don't have to commit to making the database contents fully backward/forward + // compatible just yet. + json_path: Option, } +/// Legacy format, for use in JSON compat objects in test environment #[derive(Serialize, Deserialize)] -struct PersistentState { +struct JsonPersistence { tenants: HashMap, } -struct PendingWrite { - bytes: Vec, - done_tx: tokio::sync::oneshot::Sender<()>, +#[derive(thiserror::Error, Debug)] +pub(crate) enum DatabaseError { + #[error(transparent)] + Query(#[from] diesel::result::Error), + #[error(transparent)] + Connection(#[from] diesel::result::ConnectionError), + #[error("Logical error: {0}")] + Logical(String), } -impl PersistentState { - async fn load(path: &Utf8Path) -> anyhow::Result { - let bytes = tokio::fs::read(path).await?; - let mut decoded = serde_json::from_slice::(&bytes)?; - - for (tenant_id, tenant) in &mut decoded.tenants { - // Backward compat: an old attachments.json from before PR #6251, replace - // empty strings with proper defaults. - if tenant.tenant_id.is_empty() { - tenant.tenant_id = format!("{}", tenant_id); - tenant.config = serde_json::to_string(&TenantConfig::default())?; - tenant.placement_policy = serde_json::to_string(&PlacementPolicy::default())?; - } - } - - Ok(decoded) - } - - async fn load_or_new(path: &Utf8Path) -> Self { - match Self::load(path).await { - Ok(s) => { - tracing::info!("Loaded state file at {}", path); - s - } - Err(e) - if e.downcast_ref::() - .map(|e| e.kind() == std::io::ErrorKind::NotFound) - .unwrap_or(false) => - { - tracing::info!("Will create state file at {}", path); - Self { - tenants: HashMap::new(), - } - } - Err(e) => { - panic!("Failed to load state from '{}': {e:#} (maybe your .neon/ dir was written by an older version?)", path) - } - } - } -} +pub(crate) type DatabaseResult = Result; impl Persistence { - pub async fn spawn(path: &Utf8Path) -> Self { - let (tx, rx) = tokio::sync::mpsc::unbounded_channel(); - let state = PersistentState::load_or_new(path).await; - tokio::spawn(Self::writer_task(rx, path.to_owned())); + pub fn new(database_url: String, json_path: Option) -> Self { Self { - inner: std::sync::Mutex::new(Inner { - state, - write_queue_tx: tx, - }), + database_url, + json_path, } } - async fn writer_task( - mut rx: tokio::sync::mpsc::UnboundedReceiver, - path: Utf8PathBuf, - ) { - scopeguard::defer! 
{ - info!("persistence writer task exiting"); - }; - loop { - match rx.recv().await { - Some(write) => { - tokio::task::spawn_blocking({ - let path = path.clone(); - move || { - let tmp_path = - utils::crashsafe::path_with_suffix_extension(&path, "___new"); - utils::crashsafe::overwrite(&path, &tmp_path, &write.bytes) - } - }) - .await - .expect("spawn_blocking") - .expect("write file"); - let _ = write.done_tx.send(()); // receiver may lose interest any time - } - None => { - return; - } - } - } - } - - /// Perform a modification on our [`PersistentState`]. - /// Return a future that completes once our modification has been persisted. - /// The output of the future is the return value of the `txn`` closure. - async fn mutating_transaction(&self, txn: F) -> R + /// Call the provided function in a tokio blocking thread, with a Diesel database connection. + async fn with_conn(&self, func: F) -> DatabaseResult where - F: FnOnce(&mut PersistentState) -> R, + F: Fn(&mut PgConnection) -> DatabaseResult + Send + 'static, + R: Send + 'static, { - let (ret, done_rx) = { - let mut inner = self.inner.lock().unwrap(); - let ret = txn(&mut inner.state); - let (done_tx, done_rx) = tokio::sync::oneshot::channel(); - let write = PendingWrite { - bytes: serde_json::to_vec(&inner.state).expect("Serialization error"), - done_tx, - }; - inner - .write_queue_tx - .send(write) - .expect("writer task always outlives self"); - (ret, done_rx) - }; - // the write task can go away once we start .await'ing - let _: () = done_rx.await.expect("writer task dead, check logs"); - ret + let database_url = self.database_url.clone(); + tokio::task::spawn_blocking(move || -> DatabaseResult { + // TODO: connection pooling, such as via diesel::r2d2 + let mut conn = PgConnection::establish(&database_url)?; + func(&mut conn) + }) + .await + .expect("Task panic") } - /// When registering a node, persist it so that on next start we will be able to - /// iterate over known nodes to synchronize their tenant shard states with our observed state. - pub(crate) async fn insert_node(&self, _node: &Node) -> anyhow::Result<()> { - // TODO: node persitence will come with database backend - Ok(()) + /// When a node is first registered, persist it before using it for anything + pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> { + let np = node.to_persistent(); + self.with_conn(move |conn| -> DatabaseResult<()> { + diesel::insert_into(crate::schema::nodes::table) + .values(&np) + .execute(conn)?; + Ok(()) + }) + .await } - /// At startup, we populate the service's list of nodes, and use this list to call into - /// each node to do an initial reconciliation of the state of the world with our in-memory - /// observed state. - pub(crate) async fn list_nodes(&self) -> anyhow::Result> { - let env = LocalEnv::load_config()?; - // TODO: node persitence will come with database backend + /// At startup, populate the list of nodes which our shards may be placed on + pub(crate) async fn list_nodes(&self) -> DatabaseResult> { + let nodes: Vec = self + .with_conn(move |conn| -> DatabaseResult<_> { + Ok(crate::schema::nodes::table + .load::(conn)? + .into_iter() + .map(|n| Node { + id: NodeId(n.node_id as u64), + // At startup we consider a node offline until proven otherwise. 
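+                        // (availability is not persisted in the database; it is re-discovered at runtime)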
+ availability: NodeAvailability::Offline, + scheduling: NodeSchedulingPolicy::from_str(&n.scheduling_policy) + .expect("Bad scheduling policy in DB"), + listen_http_addr: n.listen_http_addr, + listen_http_port: n.listen_http_port as u16, + listen_pg_addr: n.listen_pg_addr, + listen_pg_port: n.listen_pg_port as u16, + }) + .collect::>()) + }) + .await?; - // XXX hack: enable test_backward_compatibility to work by populating our list of + if nodes.is_empty() { + return self.list_nodes_local_env().await; + } + + tracing::info!("list_nodes: loaded {} nodes", nodes.len()); + + Ok(nodes) + } + + /// Shim for automated compatibility tests: load nodes from LocalEnv instead of database + pub(crate) async fn list_nodes_local_env(&self) -> DatabaseResult> { + // Enable test_backward_compatibility to work by populating our list of // nodes from LocalEnv when it is not present in persistent storage. Otherwise at // first startup in the compat test, we may have shards but no nodes. - let mut result = Vec::new(); + use control_plane::local_env::LocalEnv; + let env = LocalEnv::load_config().map_err(|e| DatabaseError::Logical(format!("{e}")))?; tracing::info!( - "Loaded {} pageserver nodes from LocalEnv", + "Loading {} pageserver nodes from LocalEnv", env.pageservers.len() ); + let mut nodes = Vec::new(); for ps_conf in env.pageservers { let (pg_host, pg_port) = parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr"); let (http_host, http_port) = parse_host_port(&ps_conf.listen_http_addr) .expect("Unable to parse listen_http_addr"); - result.push(Node { + let node = Node { id: ps_conf.id, listen_pg_addr: pg_host.to_string(), listen_pg_port: pg_port.unwrap_or(5432), @@ -184,16 +163,96 @@ impl Persistence { listen_http_port: http_port.unwrap_or(80), availability: NodeAvailability::Active, scheduling: NodeSchedulingPolicy::Active, - }); + }; + + // Synchronize database with what we learn from LocalEnv + self.insert_node(&node).await?; + + nodes.push(node); } - Ok(result) + Ok(nodes) } - /// At startup, we populate our map of tenant shards from persistent storage. - pub(crate) async fn list_tenant_shards(&self) -> anyhow::Result> { - let inner = self.inner.lock().unwrap(); - Ok(inner.state.tenants.values().cloned().collect()) + /// At startup, load the high level state for shards, such as their config + policy. This will + /// be enriched at runtime with state discovered on pageservers. + pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult> { + let loaded = self + .with_conn(move |conn| -> DatabaseResult<_> { + Ok(crate::schema::tenant_shards::table.load::(conn)?) + }) + .await?; + + if loaded.is_empty() { + if let Some(path) = &self.json_path { + if tokio::fs::try_exists(path) + .await + .map_err(|e| DatabaseError::Logical(format!("Error stat'ing JSON file: {e}")))? 
+ { + tracing::info!("Importing from legacy JSON format at {path}"); + return self.list_tenant_shards_json(path).await; + } + } + } + Ok(loaded) + } + + /// Shim for automated compatibility tests: load tenants from a JSON file instead of database + pub(crate) async fn list_tenant_shards_json( + &self, + path: &Utf8Path, + ) -> DatabaseResult> { + let bytes = tokio::fs::read(path) + .await + .map_err(|e| DatabaseError::Logical(format!("Failed to load JSON: {e}")))?; + + let mut decoded = serde_json::from_slice::(&bytes) + .map_err(|e| DatabaseError::Logical(format!("Deserialization error: {e}")))?; + for (tenant_id, tenant) in &mut decoded.tenants { + // Backward compat: an old attachments.json from before PR #6251, replace + // empty strings with proper defaults. + if tenant.tenant_id.is_empty() { + tenant.tenant_id = tenant_id.to_string(); + tenant.config = serde_json::to_string(&TenantConfig::default()) + .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?; + tenant.placement_policy = serde_json::to_string(&PlacementPolicy::default()) + .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?; + } + } + + let tenants: Vec = decoded.tenants.into_values().collect(); + + // Synchronize database with what is in the JSON file + self.insert_tenant_shards(tenants.clone()).await?; + + Ok(tenants) + } + + /// For use in testing environments, where we dump out JSON on shutdown. + pub async fn write_tenants_json(&self) -> anyhow::Result<()> { + let Some(path) = &self.json_path else { + anyhow::bail!("Cannot write JSON if path isn't set (test environment bug)"); + }; + tracing::info!("Writing state to {path}..."); + let tenants = self.list_tenant_shards().await?; + let mut tenants_map = HashMap::new(); + for tsp in tenants { + let tenant_shard_id = TenantShardId { + tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?, + shard_number: ShardNumber(tsp.shard_number as u8), + shard_count: ShardCount(tsp.shard_count as u8), + }; + + tenants_map.insert(tenant_shard_id, tsp); + } + let json = serde_json::to_string(&JsonPersistence { + tenants: tenants_map, + })?; + + tokio::fs::write(path, &json).await?; + tracing::info!("Wrote {} bytes to {path}...", json.len()); + + Ok(()) } /// Tenants must be persisted before we schedule them for the first time. This enables us @@ -201,22 +260,79 @@ impl Persistence { pub(crate) async fn insert_tenant_shards( &self, shards: Vec, - ) -> anyhow::Result<()> { - self.mutating_transaction(|locked| { - for shard in shards { - let tenant_shard_id = TenantShardId { - tenant_id: TenantId::from_str(shard.tenant_id.as_str())?, - shard_number: ShardNumber(shard.shard_number as u8), - shard_count: ShardCount(shard.shard_count as u8), - }; - - locked.tenants.insert(tenant_shard_id, shard); - } + ) -> DatabaseResult<()> { + use crate::schema::tenant_shards::dsl::*; + self.with_conn(move |conn| -> DatabaseResult<()> { + conn.transaction(|conn| -> QueryResult<()> { + for tenant in &shards { + diesel::insert_into(tenant_shards) + .values(tenant) + .execute(conn)?; + } + Ok(()) + })?; Ok(()) }) .await } + /// Ordering: call this _after_ deleting the tenant on pageservers, but _before_ dropping state for + /// the tenant from memory on this server. 
+ #[allow(unused)] + pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> { + use crate::schema::tenant_shards::dsl::*; + self.with_conn(move |conn| -> DatabaseResult<()> { + diesel::delete(tenant_shards) + .filter(tenant_id.eq(del_tenant_id.to_string())) + .execute(conn)?; + + Ok(()) + }) + .await + } + + /// When a tenant invokes the /re-attach API, this function is responsible for doing an efficient + /// batched increment of the generations of all tenants whose generation_pageserver is equal to + /// the node that called /re-attach. + #[tracing::instrument(skip_all, fields(node_id))] + pub(crate) async fn re_attach( + &self, + node_id: NodeId, + ) -> DatabaseResult> { + use crate::schema::tenant_shards::dsl::*; + let updated = self + .with_conn(move |conn| { + let rows_updated = diesel::update(tenant_shards) + .filter(generation_pageserver.eq(node_id.0 as i64)) + .set(generation.eq(generation + 1)) + .execute(conn)?; + + tracing::info!("Incremented {} tenants' generations", rows_updated); + + // TODO: UPDATE+SELECT in one query + + let updated = tenant_shards + .filter(generation_pageserver.eq(node_id.0 as i64)) + .select(TenantShardPersistence::as_select()) + .load(conn)?; + Ok(updated) + }) + .await?; + + let mut result = HashMap::new(); + for tsp in updated { + let tenant_shard_id = TenantShardId { + tenant_id: TenantId::from_str(tsp.tenant_id.as_str()) + .map_err(|e| DatabaseError::Logical(format!("Malformed tenant id: {e}")))?, + shard_number: ShardNumber(tsp.shard_number as u8), + shard_count: ShardCount(tsp.shard_count as u8), + }; + result.insert(tenant_shard_id, Generation::new(tsp.generation as u32)); + } + + Ok(result) + } + /// Reconciler calls this immediately before attaching to a new pageserver, to acquire a unique, monotonically /// advancing generation number. We also store the NodeId for which the generation was issued, so that in /// [`Self::re_attach`] we can do a bulk UPDATE on the generations for that node. 
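A note on the `// TODO: UPDATE+SELECT in one query` in `re_attach` above: Diesel's `RETURNING` support, which `increment_generation` below already uses for the single-row case via `get_result`, should cover the bulk case as well and keep re-attach at a single round trip per node, in line with the O(N)-queries concern in the module comment. A hedged sketch of how the closure body might collapse into one statement (not part of this patch; it assumes the same `tenant_shards` DSL imports as the surrounding code):

    // Sketch only: have the UPDATE return the rows it touched, so the
    // follow-up SELECT becomes unnecessary. `get_results` is the multi-row
    // analogue of the `get_result` call used by `increment_generation`.
    let updated: Vec<TenantShardPersistence> = diesel::update(tenant_shards)
        .filter(generation_pageserver.eq(node_id.0 as i64))
        .set(generation.eq(generation + 1))
        .returning(TenantShardPersistence::as_returning())
        .get_results(conn)?;
    Ok(updated)

The separate TODO in `with_conn` about connection pooling (for example via `diesel::r2d2`) is orthogonal: a pool would change how `conn` is obtained, not the shape of this query.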
@@ -225,47 +341,46 @@ impl Persistence { tenant_shard_id: TenantShardId, node_id: NodeId, ) -> anyhow::Result { - self.mutating_transaction(|locked| { - let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) else { - anyhow::bail!("Tried to increment generation of unknown shard"); - }; + use crate::schema::tenant_shards::dsl::*; + let updated = self + .with_conn(move |conn| { + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.0 as i32)) + .set(( + generation.eq(generation + 1), + generation_pageserver.eq(node_id.0 as i64), + )) + // TODO: only returning() the generation column + .returning(TenantShardPersistence::as_returning()) + .get_result(conn)?; - shard.generation += 1; - shard.generation_pageserver = Some(node_id); + Ok(updated) + }) + .await?; - let gen = Generation::new(shard.generation); - Ok(gen) - }) - .await + Ok(Generation::new(updated.generation as u32)) } pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> { - self.mutating_transaction(|locked| { - let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) else { - anyhow::bail!("Tried to increment generation of unknown shard"); - }; - shard.generation_pageserver = None; - shard.placement_policy = serde_json::to_string(&PlacementPolicy::Detached).unwrap(); - Ok(()) - }) - .await - } + use crate::schema::tenant_shards::dsl::*; + self.with_conn(move |conn| { + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.0 as i32)) + .set(( + generation_pageserver.eq(i64::MAX), + placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()), + )) + .execute(conn)?; - pub(crate) async fn re_attach( - &self, - node_id: NodeId, - ) -> anyhow::Result> { - self.mutating_transaction(|locked| { - let mut result = HashMap::new(); - for (tenant_shard_id, shard) in locked.tenants.iter_mut() { - if shard.generation_pageserver == Some(node_id) { - shard.generation += 1; - result.insert(*tenant_shard_id, Generation::new(shard.generation)); - } - } - Ok(result) + Ok(updated) }) - .await + .await?; + + Ok(()) } // TODO: when we start shard splitting, we must durably mark the tenant so that @@ -285,7 +400,8 @@ impl Persistence { } /// Parts of [`crate::tenant_state::TenantState`] that are stored durably -#[derive(Serialize, Deserialize, Clone)] +#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone)] +#[diesel(table_name = crate::schema::tenant_shards)] pub(crate) struct TenantShardPersistence { #[serde(default)] pub(crate) tenant_id: String, @@ -296,16 +412,28 @@ pub(crate) struct TenantShardPersistence { #[serde(default)] pub(crate) shard_stripe_size: i32, - // Currently attached pageserver - #[serde(rename = "pageserver")] - pub(crate) generation_pageserver: Option, - // Latest generation number: next time we attach, increment this // and use the incremented number when attaching - pub(crate) generation: u32, + pub(crate) generation: i32, + + // Currently attached pageserver + #[serde(rename = "pageserver")] + pub(crate) generation_pageserver: i64, #[serde(default)] pub(crate) placement_policy: String, #[serde(default)] pub(crate) config: String, } + +/// Parts of [`crate::node::Node`] that are stored durably 
+#[derive(Serialize, Deserialize, Queryable, Selectable, Insertable)] +#[diesel(table_name = crate::schema::nodes)] +pub(crate) struct NodePersistence { + pub(crate) node_id: i64, + pub(crate) scheduling_policy: String, + pub(crate) listen_http_addr: String, + pub(crate) listen_http_port: i32, + pub(crate) listen_pg_addr: String, + pub(crate) listen_pg_port: i32, +} diff --git a/control_plane/attachment_service/src/schema.rs b/control_plane/attachment_service/src/schema.rs new file mode 100644 index 0000000000..de80fc8f64 --- /dev/null +++ b/control_plane/attachment_service/src/schema.rs @@ -0,0 +1,27 @@ +// @generated automatically by Diesel CLI. + +diesel::table! { + nodes (node_id) { + node_id -> Int8, + scheduling_policy -> Varchar, + listen_http_addr -> Varchar, + listen_http_port -> Int4, + listen_pg_addr -> Varchar, + listen_pg_port -> Int4, + } +} + +diesel::table! { + tenant_shards (tenant_id, shard_number, shard_count) { + tenant_id -> Varchar, + shard_number -> Int4, + shard_count -> Int4, + shard_stripe_size -> Int4, + generation -> Int4, + generation_pageserver -> Int8, + placement_policy -> Varchar, + config -> Text, + } +} + +diesel::allow_tables_to_appear_in_same_query!(nodes, tenant_shards,); diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index c9ed07ae5f..ec56dc8ad4 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -11,6 +11,7 @@ use control_plane::attachment_service::{ TenantCreateResponseShard, TenantLocateResponse, TenantLocateResponseShard, TenantShardMigrateRequest, TenantShardMigrateResponse, }; +use diesel::result::DatabaseErrorKind; use hyper::StatusCode; use pageserver_api::{ control_api::{ @@ -26,6 +27,7 @@ use pageserver_api::{ }; use pageserver_client::mgmt_api; use utils::{ + completion::Barrier, generation::Generation, http::error::ApiError, id::{NodeId, TenantId}, @@ -35,7 +37,7 @@ use utils::{ use crate::{ compute_hook::ComputeHook, node::Node, - persistence::{Persistence, TenantShardPersistence}, + persistence::{DatabaseError, Persistence, TenantShardPersistence}, scheduler::Scheduler, tenant_state::{ IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError, @@ -46,6 +48,10 @@ use crate::{ const RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); +/// How long [`Service::startup_reconcile`] is allowed to take before it should give +/// up on unresponsive pageservers and proceed. +pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); + // Top level state available to all HTTP handlers struct ServiceState { tenants: BTreeMap, @@ -79,10 +85,27 @@ pub struct Config { pub jwt_token: Option, } +impl From for ApiError { + fn from(err: DatabaseError) -> ApiError { + match err { + DatabaseError::Query(e) => ApiError::InternalServerError(e.into()), + // FIXME: ApiError doesn't have an Unavailable variant, but ShuttingDown maps to 503. + DatabaseError::Connection(_e) => ApiError::ShuttingDown, + DatabaseError::Logical(reason) => { + ApiError::InternalServerError(anyhow::anyhow!(reason)) + } + } + } +} + pub struct Service { inner: Arc>, config: Config, persistence: Arc, + + /// This waits for initial reconciliation with pageservers to complete. Until this barrier + /// passes, it isn't safe to do any actions that mutate tenants. 
+ pub(crate) startup_complete: Barrier, } impl From for ApiError { @@ -96,77 +119,32 @@ impl From for ApiError { } impl Service { - pub async fn spawn(config: Config, persistence: Arc) -> anyhow::Result> { - let (result_tx, mut result_rx) = tokio::sync::mpsc::unbounded_channel(); - - tracing::info!("Loading nodes from database..."); - let mut nodes = persistence.list_nodes().await?; - tracing::info!("Loaded {} nodes from database.", nodes.len()); - - tracing::info!("Loading shards from database..."); - let tenant_shard_persistence = persistence.list_tenant_shards().await?; - tracing::info!( - "Loaded {} shards from database.", - tenant_shard_persistence.len() - ); - - let mut tenants = BTreeMap::new(); - - for tsp in tenant_shard_persistence { - let tenant_shard_id = TenantShardId { - tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?, - shard_number: ShardNumber(tsp.shard_number as u8), - shard_count: ShardCount(tsp.shard_count as u8), - }; - let shard_identity = if tsp.shard_count == 0 { - ShardIdentity::unsharded() - } else { - ShardIdentity::new( - ShardNumber(tsp.shard_number as u8), - ShardCount(tsp.shard_count as u8), - ShardStripeSize(tsp.shard_stripe_size as u32), - )? - }; - let new_tenant = TenantState { - tenant_shard_id, - shard: shard_identity, - sequence: Sequence::initial(), - // Note that we load generation, but don't care about generation_pageserver. We will either end up finding - // our existing attached location and it will match generation_pageserver, or we will attach somewhere new - // and update generation_pageserver in the process. - generation: Generation::new(tsp.generation), - policy: serde_json::from_str(&tsp.placement_policy).unwrap(), - intent: IntentState::new(), - observed: ObservedState::new(), - config: serde_json::from_str(&tsp.config).unwrap(), - reconciler: None, - waiter: Arc::new(SeqWait::new(Sequence::initial())), - error_waiter: Arc::new(SeqWait::new(Sequence::initial())), - last_error: Arc::default(), - }; - - tenants.insert(tenant_shard_id, new_tenant); - } + pub fn get_config(&self) -> &Config { + &self.config + } + /// TODO: don't allow other API calls until this is done, don't start doing any background housekeeping + /// until this is done. + async fn startup_reconcile(&self) { // For all tenant shards, a vector of observed states on nodes (where None means // indeterminate, same as in [`ObservedStateLocation`]) let mut observed = HashMap::new(); + let nodes = { + let locked = self.inner.read().unwrap(); + locked.nodes.clone() + }; + // TODO: issue these requests concurrently - for node in &mut nodes { - let client = mgmt_api::Client::new(node.base_url(), config.jwt_token.as_deref()); + for node in nodes.values() { + let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref()); tracing::info!("Scanning shards on node {}...", node.id); match client.list_location_config().await { Err(e) => { tracing::warn!("Could not contact pageserver {} ({e})", node.id); - // TODO: be more tolerant, apply a generous 5-10 second timeout - // TODO: setting a node to Offline is a dramatic thing to do, and can - // prevent neon_local from starting up (it starts this service before - // any pageservers are running). It may make sense to give nodes - // a Pending state to accomodate this situation, and allow (but deprioritize) - // scheduling on Pending nodes. 
- //node.availability = NodeAvailability::Offline; + // TODO: be more tolerant, apply a generous 5-10 second timeout with retries, in case + // pageserver is being restarted at the same time as we are } Ok(listing) => { tracing::info!( @@ -174,7 +152,6 @@ impl Service { listing.tenant_shards.len(), node.id ); - node.availability = NodeAvailability::Active; for (tenant_shard_id, conf_opt) in listing.tenant_shards { observed.insert(tenant_shard_id, (node.id, conf_opt)); @@ -186,41 +163,46 @@ impl Service { let mut cleanup = Vec::new(); // Populate intent and observed states for all tenants, based on reported state on pageservers - for (tenant_shard_id, (node_id, observed_loc)) in observed { - let Some(tenant_state) = tenants.get_mut(&tenant_shard_id) else { - cleanup.push((tenant_shard_id, node_id)); - continue; - }; + let shard_count = { + let mut locked = self.inner.write().unwrap(); + for (tenant_shard_id, (node_id, observed_loc)) in observed { + let Some(tenant_state) = locked.tenants.get_mut(&tenant_shard_id) else { + cleanup.push((tenant_shard_id, node_id)); + continue; + }; - tenant_state - .observed - .locations - .insert(node_id, ObservedStateLocation { conf: observed_loc }); - } - - // State of nodes is now frozen, transform to a HashMap. - let mut nodes: HashMap = nodes.into_iter().map(|n| (n.id, n)).collect(); - - // Populate each tenant's intent state - let mut scheduler = Scheduler::new(&tenants, &nodes); - for (tenant_shard_id, tenant_state) in tenants.iter_mut() { - tenant_state.intent_from_observed(); - if let Err(e) = tenant_state.schedule(&mut scheduler) { - // Non-fatal error: we are unable to properly schedule the tenant, perhaps because - // not enough pageservers are available. The tenant may well still be available - // to clients. - tracing::error!("Failed to schedule tenant {tenant_shard_id} at startup: {e}"); + tenant_state + .observed + .locations + .insert(node_id, ObservedStateLocation { conf: observed_loc }); } - } + + // Populate each tenant's intent state + let mut scheduler = Scheduler::new(&locked.tenants, &nodes); + for (tenant_shard_id, tenant_state) in locked.tenants.iter_mut() { + tenant_state.intent_from_observed(); + if let Err(e) = tenant_state.schedule(&mut scheduler) { + // Non-fatal error: we are unable to properly schedule the tenant, perhaps because + // not enough pageservers are available. The tenant may well still be available + // to clients. + tracing::error!("Failed to schedule tenant {tenant_shard_id} at startup: {e}"); + } + } + + locked.tenants.len() + }; + + // TODO: if any tenant's intent now differs from its loaded generation_pageserver, we should clear that + // generation_pageserver in the database. // Clean up any tenants that were found on pageservers but are not known to us. for (tenant_shard_id, node_id) in cleanup { // A node reported a tenant_shard_id which is unknown to us: detach it. let node = nodes - .get_mut(&node_id) + .get(&node_id) .expect("Always exists: only known nodes are scanned"); - let client = mgmt_api::Client::new(node.base_url(), config.jwt_token.as_deref()); + let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref()); match client .location_config( tenant_shard_id, @@ -252,13 +234,80 @@ impl Service { } } - let shard_count = tenants.len(); + // Finally, now that the service is up and running, launch reconcile operations for any tenants + // which require it: under normal circumstances this should only include tenants that were in some + // transient state before we restarted. 
+ let reconcile_tasks = self.reconcile_all(); + tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)"); + } + + pub async fn spawn(config: Config, persistence: Arc) -> anyhow::Result> { + let (result_tx, mut result_rx) = tokio::sync::mpsc::unbounded_channel(); + + tracing::info!("Loading nodes from database..."); + let nodes = persistence.list_nodes().await?; + let nodes: HashMap = nodes.into_iter().map(|n| (n.id, n)).collect(); + tracing::info!("Loaded {} nodes from database.", nodes.len()); + + tracing::info!("Loading shards from database..."); + let tenant_shard_persistence = persistence.list_tenant_shards().await?; + tracing::info!( + "Loaded {} shards from database.", + tenant_shard_persistence.len() + ); + + let mut tenants = BTreeMap::new(); + + for tsp in tenant_shard_persistence { + let tenant_shard_id = TenantShardId { + tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?, + shard_number: ShardNumber(tsp.shard_number as u8), + shard_count: ShardCount(tsp.shard_count as u8), + }; + let shard_identity = if tsp.shard_count == 0 { + ShardIdentity::unsharded() + } else { + ShardIdentity::new( + ShardNumber(tsp.shard_number as u8), + ShardCount(tsp.shard_count as u8), + ShardStripeSize(tsp.shard_stripe_size as u32), + )? + }; + + // We will populate intent properly later in [`Self::startup_reconcile`], initially populate + // it with what we can infer: the node for which a generation was most recently issued. + let mut intent = IntentState::new(); + if tsp.generation_pageserver != i64::MAX { + intent.attached = Some(NodeId(tsp.generation_pageserver as u64)) + } + + let new_tenant = TenantState { + tenant_shard_id, + shard: shard_identity, + sequence: Sequence::initial(), + generation: Generation::new(tsp.generation as u32), + policy: serde_json::from_str(&tsp.placement_policy).unwrap(), + intent, + observed: ObservedState::new(), + config: serde_json::from_str(&tsp.config).unwrap(), + reconciler: None, + waiter: Arc::new(SeqWait::new(Sequence::initial())), + error_waiter: Arc::new(SeqWait::new(Sequence::initial())), + last_error: Arc::default(), + }; + + tenants.insert(tenant_shard_id, new_tenant); + } + + let (startup_completion, startup_complete) = utils::completion::channel(); + let this = Arc::new(Self { inner: Arc::new(std::sync::RwLock::new(ServiceState::new( result_tx, nodes, tenants, ))), config, persistence, + startup_complete, }); let result_task_this = this.clone(); @@ -316,11 +365,13 @@ impl Service { } }); - // Finally, now that the service is up and running, launch reconcile operations for any tenants - // which require it: under normal circumstances this should only include tenants that were in some - // transient state before we restarted. 
- let reconcile_tasks = this.reconcile_all(); - tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)"); + let startup_reconcile_this = this.clone(); + tokio::task::spawn(async move { + // Block the [`Service::startup_complete`] barrier until we're done + let _completion = startup_completion; + + startup_reconcile_this.startup_reconcile().await + }); Ok(this) } @@ -336,7 +387,6 @@ impl Service { let locked = self.inner.write().unwrap(); !locked.tenants.contains_key(&attach_req.tenant_shard_id) }; - if insert { let tsp = TenantShardPersistence { tenant_id: attach_req.tenant_shard_id.tenant_id.to_string(), @@ -344,22 +394,39 @@ impl Service { shard_count: attach_req.tenant_shard_id.shard_count.0 as i32, shard_stripe_size: 0, generation: 0, - generation_pageserver: None, + generation_pageserver: i64::MAX, placement_policy: serde_json::to_string(&PlacementPolicy::default()).unwrap(), config: serde_json::to_string(&TenantConfig::default()).unwrap(), }; - self.persistence.insert_tenant_shards(vec![tsp]).await?; + match self.persistence.insert_tenant_shards(vec![tsp]).await { + Err(e) => match e { + DatabaseError::Query(diesel::result::Error::DatabaseError( + DatabaseErrorKind::UniqueViolation, + _, + )) => { + tracing::info!( + "Raced with another request to insert tenant {}", + attach_req.tenant_shard_id + ) + } + _ => return Err(e.into()), + }, + Ok(()) => { + tracing::info!("Inserted shard {} in database", attach_req.tenant_shard_id); - let mut locked = self.inner.write().unwrap(); - locked.tenants.insert( - attach_req.tenant_shard_id, - TenantState::new( - attach_req.tenant_shard_id, - ShardIdentity::unsharded(), - PlacementPolicy::Single, - ), - ); + let mut locked = self.inner.write().unwrap(); + locked.tenants.insert( + attach_req.tenant_shard_id, + TenantState::new( + attach_req.tenant_shard_id, + ShardIdentity::unsharded(), + PlacementPolicy::Single, + ), + ); + tracing::info!("Inserted shard {} in memory", attach_req.tenant_shard_id); + } + } } let new_generation = if let Some(req_node_id) = attach_req.node_id { @@ -506,6 +573,14 @@ impl Service { id: req_tenant.id, valid, }); + } else { + // After tenant deletion, we may approve any validation. This avoids + // spurious warnings on the pageserver if it has pending LSN updates + // at the point a deletion happens. 
+ response.tenants.push(ValidateResponseTenant { + id: req_tenant.id, + valid: true, + }); } } response @@ -561,7 +636,7 @@ impl Service { shard_count: tenant_shard_id.shard_count.0 as i32, shard_stripe_size: create_req.shard_parameters.stripe_size.0 as i32, generation: 0, - generation_pageserver: None, + generation_pageserver: i64::MAX, placement_policy: serde_json::to_string(&placement_policy).unwrap(), config: serde_json::to_string(&create_req.config).unwrap(), }) @@ -967,10 +1042,7 @@ impl Service { availability: NodeAvailability::Active, }; // TODO: idempotency if the node already exists in the database - self.persistence - .insert_node(&new_node) - .await - .map_err(ApiError::InternalServerError)?; + self.persistence.insert_node(&new_node).await?; let mut locked = self.inner.write().unwrap(); let mut new_nodes = (*locked.nodes).clone(); diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/attachment_service.rs index 2d43c46270..6602aa9a73 100644 --- a/control_plane/src/attachment_service.rs +++ b/control_plane/src/attachment_service.rs @@ -1,5 +1,11 @@ use crate::{background_process, local_env::LocalEnv}; -use camino::Utf8PathBuf; +use camino::{Utf8Path, Utf8PathBuf}; +use diesel::{ + backend::Backend, + query_builder::{AstPass, QueryFragment, QueryId}, + Connection, PgConnection, QueryResult, RunQueryDsl, +}; +use diesel_migrations::{HarnessWithOutput, MigrationHarness}; use hyper::Method; use pageserver_api::{ models::{ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo}, @@ -7,9 +13,9 @@ use pageserver_api::{ }; use pageserver_client::mgmt_api::ResponseErrorMessageExt; use postgres_backend::AuthType; -use postgres_connection::parse_host_port; use serde::{de::DeserializeOwned, Deserialize, Serialize}; -use std::{path::PathBuf, str::FromStr}; +use std::{env, str::FromStr}; +use tokio::process::Command; use tracing::instrument; use utils::{ auth::{Claims, Scope}, @@ -19,14 +25,17 @@ use utils::{ pub struct AttachmentService { env: LocalEnv, listen: String, - path: PathBuf, + path: Utf8PathBuf, jwt_token: Option, public_key_path: Option, + postgres_port: u16, client: reqwest::Client, } const COMMAND: &str = "attachment_service"; +const ATTACHMENT_SERVICE_POSTGRES_VERSION: u32 = 16; + #[derive(Serialize, Deserialize)] pub struct AttachHookRequest { pub tenant_shard_id: TenantShardId, @@ -169,7 +178,9 @@ pub struct TenantShardMigrateResponse {} impl AttachmentService { pub fn from_env(env: &LocalEnv) -> Self { - let path = env.base_data_dir.join("attachments.json"); + let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone()) + .unwrap() + .join("attachments.json"); // Makes no sense to construct this if pageservers aren't going to use it: assume // pageservers have control plane API set @@ -181,6 +192,13 @@ impl AttachmentService { listen_url.port().unwrap() ); + // Convention: NeonEnv in python tests reserves the next port after the control_plane_api + // port, for use by our captive postgres. + let postgres_port = listen_url + .port() + .expect("Control plane API setting should always have a port") + + 1; + // Assume all pageservers have symmetric auth configuration: this service // expects to use one JWT token to talk to all of them. 
let ps_conf = env @@ -209,6 +227,7 @@ impl AttachmentService { listen, jwt_token, public_key_path, + postgres_port, client: reqwest::ClientBuilder::new() .build() .expect("Failed to construct http client"), @@ -220,13 +239,214 @@ impl AttachmentService { .expect("non-Unicode path") } - pub async fn start(&self) -> anyhow::Result<()> { - let path_str = self.path.to_string_lossy(); + /// PIDFile for the postgres instance used to store attachment service state + fn postgres_pid_file(&self) -> Utf8PathBuf { + Utf8PathBuf::from_path_buf( + self.env + .base_data_dir + .join("attachment_service_postgres.pid"), + ) + .expect("non-Unicode path") + } - let mut args = vec!["-l", &self.listen, "-p", &path_str] - .into_iter() - .map(|s| s.to_string()) - .collect::>(); + /// In order to access database migrations, we need to find the Neon source tree + async fn find_source_root(&self) -> anyhow::Result { + // We assume that either prd or our binary is in the source tree. The former is usually + // true for automated test runners, the latter is usually true for developer workstations. Often + // both are true, which is fine. + let candidate_start_points = [ + // Current working directory + Utf8PathBuf::from_path_buf(std::env::current_dir()?).unwrap(), + // Directory containing the binary we're running inside + Utf8PathBuf::from_path_buf(env::current_exe()?.parent().unwrap().to_owned()).unwrap(), + ]; + + // For each candidate start point, search through ancestors looking for a neon.git source tree root + for start_point in &candidate_start_points { + // Start from the build dir: assumes we are running out of a built neon source tree + for path in start_point.ancestors() { + // A crude approximation: the root of the source tree is whatever contains a "control_plane" + // subdirectory. + let control_plane = path.join("control_plane"); + if tokio::fs::try_exists(&control_plane).await? { + return Ok(path.to_owned()); + } + } + } + + // Fall-through + Err(anyhow::anyhow!( + "Could not find control_plane src dir, after searching ancestors of {candidate_start_points:?}" + )) + } + + /// Find the directory containing postgres binaries, such as `initdb` and `pg_ctl` + /// + /// This usually uses ATTACHMENT_SERVICE_POSTGRES_VERSION of postgres, but will fall back + /// to other versions if that one isn't found. Some automated tests create circumstances + /// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`. + pub async fn get_pg_bin_dir(&self) -> anyhow::Result { + let prefer_versions = [ATTACHMENT_SERVICE_POSTGRES_VERSION, 15, 14]; + + for v in prefer_versions { + let path = Utf8PathBuf::from_path_buf(self.env.pg_bin_dir(v)?).unwrap(); + if tokio::fs::try_exists(&path).await? { + return Ok(path); + } + } + + // Fall through + anyhow::bail!( + "Postgres binaries not found in {}", + self.env.pg_distrib_dir.display() + ); + } + + /// Readiness check for our postgres process + async fn pg_isready(&self, pg_bin_dir: &Utf8Path) -> anyhow::Result { + let bin_path = pg_bin_dir.join("pg_isready"); + let args = ["-h", "localhost", "-p", &format!("{}", self.postgres_port)]; + let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?; + + Ok(exitcode.success()) + } + + /// Create our database if it doesn't exist, and run migrations. + /// + /// This function is equivalent to the `diesel setup` command in the diesel CLI. 
We implement + /// the same steps by hand to avoid imposing a dependency on installing diesel-cli for developers + /// who just want to run `cargo neon_local` without knowing about diesel. + /// + /// Returns the database url + pub async fn setup_database(&self) -> anyhow::Result { + let database_url = format!( + "postgresql://localhost:{}/attachment_service", + self.postgres_port + ); + println!("Running attachment service database setup..."); + fn change_database_of_url(database_url: &str, default_database: &str) -> (String, String) { + let base = ::url::Url::parse(database_url).unwrap(); + let database = base.path_segments().unwrap().last().unwrap().to_owned(); + let mut new_url = base.join(default_database).unwrap(); + new_url.set_query(base.query()); + (database, new_url.into()) + } + + #[derive(Debug, Clone)] + pub struct CreateDatabaseStatement { + db_name: String, + } + + impl CreateDatabaseStatement { + pub fn new(db_name: &str) -> Self { + CreateDatabaseStatement { + db_name: db_name.to_owned(), + } + } + } + + impl QueryFragment for CreateDatabaseStatement { + fn walk_ast<'b>(&'b self, mut out: AstPass<'_, 'b, DB>) -> QueryResult<()> { + out.push_sql("CREATE DATABASE "); + out.push_identifier(&self.db_name)?; + Ok(()) + } + } + + impl RunQueryDsl for CreateDatabaseStatement {} + + impl QueryId for CreateDatabaseStatement { + type QueryId = (); + + const HAS_STATIC_QUERY_ID: bool = false; + } + if PgConnection::establish(&database_url).is_err() { + let (database, postgres_url) = change_database_of_url(&database_url, "postgres"); + println!("Creating database: {database}"); + let mut conn = PgConnection::establish(&postgres_url)?; + CreateDatabaseStatement::new(&database).execute(&mut conn)?; + } + let mut conn = PgConnection::establish(&database_url)?; + + let migrations_dir = self + .find_source_root() + .await? + .join("control_plane/attachment_service/migrations"); + + let migrations = diesel_migrations::FileBasedMigrations::from_path(migrations_dir)?; + println!("Running migrations in {}", migrations.path().display()); + HarnessWithOutput::write_to_stdout(&mut conn) + .run_pending_migrations(migrations) + .map(|_| ()) + .map_err(|e| anyhow::anyhow!(e))?; + + println!("Migrations complete"); + + Ok(database_url) + } + + pub async fn start(&self) -> anyhow::Result<()> { + // Start a vanilla Postgres process used by the attachment service for persistence. + let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone()) + .unwrap() + .join("attachment_service_db"); + let pg_bin_dir = self.get_pg_bin_dir().await?; + let pg_log_path = pg_data_path.join("postgres.log"); + + if !tokio::fs::try_exists(&pg_data_path).await? 
{ + // Initialize empty database + let initdb_path = pg_bin_dir.join("initdb"); + let mut child = Command::new(&initdb_path) + .args(["-D", pg_data_path.as_ref()]) + .spawn() + .expect("Failed to spawn initdb"); + let status = child.wait().await?; + if !status.success() { + anyhow::bail!("initdb failed with status {status}"); + } + + tokio::fs::write( + &pg_data_path.join("postgresql.conf"), + format!("port = {}", self.postgres_port), + ) + .await?; + }; + + println!("Starting attachment service database..."); + let db_start_args = [ + "-w", + "-D", + pg_data_path.as_ref(), + "-l", + pg_log_path.as_ref(), + "start", + ]; + + background_process::start_process( + "attachment_service_db", + &self.env.base_data_dir, + pg_bin_dir.join("pg_ctl").as_std_path(), + db_start_args, + [], + background_process::InitialPidFile::Create(self.postgres_pid_file()), + || self.pg_isready(&pg_bin_dir), + ) + .await?; + + // Run migrations on every startup, in case something changed. + let database_url = self.setup_database().await?; + + let mut args = vec![ + "-l", + &self.listen, + "-p", + self.path.as_ref(), + "--database-url", + &database_url, + ] + .into_iter() + .map(|s| s.to_string()) + .collect::>(); if let Some(jwt_token) = &self.jwt_token { args.push(format!("--jwt-token={jwt_token}")); } @@ -235,7 +455,7 @@ impl AttachmentService { args.push(format!("--public-key={public_key_path}")); } - let result = background_process::start_process( + background_process::start_process( COMMAND, &self.env.base_data_dir, &self.env.attachment_service_bin(), @@ -252,30 +472,46 @@ impl AttachmentService { } }, ) - .await; + .await?; - // TODO: shouldn't we bail if we fail to spawn the process? - for ps_conf in &self.env.pageservers { - let (pg_host, pg_port) = - parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr"); - let (http_host, http_port) = parse_host_port(&ps_conf.listen_http_addr) - .expect("Unable to parse listen_http_addr"); - self.node_register(NodeRegisterRequest { - node_id: ps_conf.id, - listen_pg_addr: pg_host.to_string(), - listen_pg_port: pg_port.unwrap_or(5432), - listen_http_addr: http_host.to_string(), - listen_http_port: http_port.unwrap_or(80), - }) + Ok(()) + } + + pub async fn stop(&self, immediate: bool) -> anyhow::Result<()> { + background_process::stop_process(immediate, COMMAND, &self.pid_file())?; + + let pg_data_path = self.env.base_data_dir.join("attachment_service_db"); + let pg_bin_dir = self.get_pg_bin_dir().await?; + + println!("Stopping attachment service database..."); + let pg_stop_args = ["-D", &pg_data_path.to_string_lossy(), "stop"]; + let stop_status = Command::new(pg_bin_dir.join("pg_ctl")) + .args(pg_stop_args) + .spawn()? + .wait() .await?; + if !stop_status.success() { + let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"]; + let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl")) + .args(pg_status_args) + .spawn()? + .wait() + .await?; + + // pg_ctl status returns this exit code if postgres is not running: in this case it is + // fine that stop failed. Otherwise it is an error that stop failed. 
+ const PG_STATUS_NOT_RUNNING: i32 = 3; + if Some(PG_STATUS_NOT_RUNNING) == status_exitcode.code() { + println!("Attachment service data base is already stopped"); + return Ok(()); + } else { + anyhow::bail!("Failed to stop attachment service database: {stop_status}") + } } - result + Ok(()) } - pub fn stop(&self, immediate: bool) -> anyhow::Result<()> { - background_process::stop_process(immediate, COMMAND, &self.pid_file()) - } /// Simple HTTP request wrapper for calling into attachment service async fn dispatch( &self, @@ -357,7 +593,7 @@ impl AttachmentService { &self, req: TenantCreateRequest, ) -> anyhow::Result { - self.dispatch(Method::POST, "tenant".to_string(), Some(req)) + self.dispatch(Method::POST, "v1/tenant".to_string(), Some(req)) .await } @@ -414,7 +650,7 @@ impl AttachmentService { ) -> anyhow::Result { self.dispatch( Method::POST, - format!("tenant/{tenant_id}/timeline"), + format!("v1/tenant/{tenant_id}/timeline"), Some(req), ) .await diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 279c47398f..a5242e3dc7 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -135,7 +135,7 @@ fn main() -> Result<()> { "tenant" => rt.block_on(handle_tenant(sub_args, &mut env)), "timeline" => rt.block_on(handle_timeline(sub_args, &mut env)), "start" => rt.block_on(handle_start_all(sub_args, &env)), - "stop" => handle_stop_all(sub_args, &env), + "stop" => rt.block_on(handle_stop_all(sub_args, &env)), "pageserver" => rt.block_on(handle_pageserver(sub_args, &env)), "attachment_service" => rt.block_on(handle_attachment_service(sub_args, &env)), "safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)), @@ -1056,8 +1056,9 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result Result<()> { match sub_match.subcommand() { Some(("start", subcommand_args)) => { + let register = subcommand_args.get_one::("register").unwrap_or(&true); if let Err(e) = get_pageserver(env, subcommand_args)? - .start(&pageserver_config_overrides(subcommand_args)) + .start(&pageserver_config_overrides(subcommand_args), *register) .await { eprintln!("pageserver start failed: {e}"); @@ -1086,24 +1087,7 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> } if let Err(e) = pageserver - .start(&pageserver_config_overrides(subcommand_args)) - .await - { - eprintln!("pageserver start failed: {e}"); - exit(1); - } - } - - Some(("migrate", subcommand_args)) => { - let pageserver = get_pageserver(env, subcommand_args)?; - //TODO what shutdown strategy should we use here? 
- if let Err(e) = pageserver.stop(false) { - eprintln!("pageserver stop failed: {}", e); - exit(1); - } - - if let Err(e) = pageserver - .start(&pageserver_config_overrides(subcommand_args)) + .start(&pageserver_config_overrides(subcommand_args), false) .await { eprintln!("pageserver start failed: {e}"); @@ -1161,7 +1145,7 @@ async fn handle_attachment_service( .map(|s| s.as_str()) == Some("immediate"); - if let Err(e) = svc.stop(immediate) { + if let Err(e) = svc.stop(immediate).await { eprintln!("stop failed: {}", e); exit(1); } @@ -1257,7 +1241,7 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> let attachment_service = AttachmentService::from_env(env); if let Err(e) = attachment_service.start().await { eprintln!("attachment_service start failed: {:#}", e); - try_stop_all(env, true); + try_stop_all(env, true).await; exit(1); } } @@ -1265,11 +1249,11 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> for ps_conf in &env.pageservers { let pageserver = PageServerNode::from_env(env, ps_conf); if let Err(e) = pageserver - .start(&pageserver_config_overrides(sub_match)) + .start(&pageserver_config_overrides(sub_match), true) .await { eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e); - try_stop_all(env, true); + try_stop_all(env, true).await; exit(1); } } @@ -1278,23 +1262,23 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> let safekeeper = SafekeeperNode::from_env(env, node); if let Err(e) = safekeeper.start(vec![]).await { eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e); - try_stop_all(env, false); + try_stop_all(env, false).await; exit(1); } } Ok(()) } -fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { +async fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { let immediate = sub_match.get_one::("stop-mode").map(|s| s.as_str()) == Some("immediate"); - try_stop_all(env, immediate); + try_stop_all(env, immediate).await; Ok(()) } -fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { +async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { // Stop all endpoints match ComputeControlPlane::load(env.clone()) { Ok(cplane) => { @@ -1329,7 +1313,7 @@ fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { if env.control_plane_api.is_some() { let attachment_service = AttachmentService::from_env(env); - if let Err(e) = attachment_service.stop(immediate) { + if let Err(e) = attachment_service.stop(immediate).await { eprintln!("attachment service stop failed: {e:#}"); } } @@ -1549,7 +1533,11 @@ fn cli() -> Command { .subcommand(Command::new("status")) .subcommand(Command::new("start") .about("Start local pageserver") - .arg(pageserver_config_args.clone()) + .arg(pageserver_config_args.clone()).arg(Arg::new("register") + .long("register") + .default_value("true").required(false) + .value_parser(value_parser!(bool)) + .value_name("register")) ) .subcommand(Command::new("stop") .about("Stop local pageserver") diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 4460fdd3a6..aefef47da7 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -223,7 +223,11 @@ impl LocalEnv { } pub fn attachment_service_bin(&self) -> PathBuf { - self.neon_distrib_dir.join("attachment_service") + // Irrespective of configuration, attachment service binary is always + // run from the same location as neon_local. 
This means that for compatibility + // tests that run old pageserver/safekeeper, they still run latest attachment service. + let neon_local_bin_dir = env::current_exe().unwrap().parent().unwrap().to_owned(); + neon_local_bin_dir.join("attachment_service") } pub fn safekeeper_bin(&self) -> PathBuf { diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 1db21c9a37..540d1185a2 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -30,6 +30,7 @@ use utils::{ lsn::Lsn, }; +use crate::attachment_service::{AttachmentService, NodeRegisterRequest}; use crate::local_env::PageServerConf; use crate::{background_process, local_env::LocalEnv}; @@ -161,8 +162,8 @@ impl PageServerNode { .expect("non-Unicode path") } - pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> { - self.start_node(config_overrides, false).await + pub async fn start(&self, config_overrides: &[&str], register: bool) -> anyhow::Result<()> { + self.start_node(config_overrides, false, register).await } fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> { @@ -207,6 +208,7 @@ impl PageServerNode { &self, config_overrides: &[&str], update_config: bool, + register: bool, ) -> anyhow::Result<()> { // TODO: using a thread here because start_process() is not async but we need to call check_status() let datadir = self.repo_path(); @@ -244,7 +246,26 @@ impl PageServerNode { } }, ) - .await + .await?; + + if register { + let attachment_service = AttachmentService::from_env(&self.env); + let (pg_host, pg_port) = + parse_host_port(&self.conf.listen_pg_addr).expect("Unable to parse listen_pg_addr"); + let (http_host, http_port) = parse_host_port(&self.conf.listen_http_addr) + .expect("Unable to parse listen_http_addr"); + attachment_service + .node_register(NodeRegisterRequest { + node_id: self.conf.id, + listen_pg_addr: pg_host.to_string(), + listen_pg_port: pg_port.unwrap_or(5432), + listen_http_addr: http_host.to_string(), + listen_http_port: http_port.unwrap_or(80), + }) + .await?; + } + + Ok(()) } fn pageserver_basic_args<'a>( diff --git a/diesel.toml b/diesel.toml new file mode 100644 index 0000000000..30ed4444d7 --- /dev/null +++ b/diesel.toml @@ -0,0 +1,9 @@ +# For documentation on how to configure this file, +# see https://diesel.rs/guides/configuring-diesel-cli + +[print_schema] +file = "control_plane/attachment_service/src/schema.rs" +custom_type_derives = ["diesel::query_builder::QueryId"] + +[migrations_directory] +dir = "control_plane/attachment_service/migrations" diff --git a/libs/utils/src/crashsafe.rs b/libs/utils/src/crashsafe.rs index 0c6855d17b..b089af4a02 100644 --- a/libs/utils/src/crashsafe.rs +++ b/libs/utils/src/crashsafe.rs @@ -1,7 +1,7 @@ use std::{ borrow::Cow, fs::{self, File}, - io::{self, Write}, + io, }; use camino::{Utf8Path, Utf8PathBuf}; @@ -112,48 +112,6 @@ pub async fn fsync_async(path: impl AsRef) -> Result<(), std::io::Erro tokio::fs::File::open(path.as_ref()).await?.sync_all().await } -/// Writes a file to the specified `final_path` in a crash safe fasion -/// -/// The file is first written to the specified tmp_path, and in a second -/// step, the tmp path is renamed to the final path. As renames are -/// atomic, a crash during the write operation will never leave behind a -/// partially written file. -/// -/// NB: an async variant of this code exists in Pageserver's VirtualFile. 
-pub fn overwrite( - final_path: &Utf8Path, - tmp_path: &Utf8Path, - content: &[u8], -) -> std::io::Result<()> { - let Some(final_path_parent) = final_path.parent() else { - return Err(std::io::Error::from_raw_os_error( - nix::errno::Errno::EINVAL as i32, - )); - }; - std::fs::remove_file(tmp_path).or_else(crate::fs_ext::ignore_not_found)?; - let mut file = std::fs::OpenOptions::new() - .write(true) - // Use `create_new` so that, if we race with ourselves or something else, - // we bail out instead of causing damage. - .create_new(true) - .open(tmp_path)?; - file.write_all(content)?; - file.sync_all()?; - drop(file); // before the rename, that's important! - // renames are atomic - std::fs::rename(tmp_path, final_path)?; - // Only open final path parent dirfd now, so that this operation only - // ever holds one VirtualFile fd at a time. That's important because - // the current `find_victim_slot` impl might pick the same slot for both - // VirtualFile., and it eventually does a blocking write lock instead of - // try_lock. - let final_parent_dirfd = std::fs::OpenOptions::new() - .read(true) - .open(final_path_parent)?; - final_parent_dirfd.sync_all()?; - Ok(()) -} - #[cfg(test)] mod tests { diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index d200a4ba5e..066f06c88f 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -403,7 +403,12 @@ impl VirtualFile { Ok(vfile) } - /// Async & [`VirtualFile`]-enabled version of [`::utils::crashsafe::overwrite`]. + /// Writes a file to the specified `final_path` in a crash safe fasion + /// + /// The file is first written to the specified tmp_path, and in a second + /// step, the tmp path is renamed to the final path. As renames are + /// atomic, a crash during the write operation will never leave behind a + /// partially written file. pub async fn crashsafe_overwrite( final_path: &Utf8Path, tmp_path: &Utf8Path, diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 142c97d5c3..bbabfeedf6 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2,6 +2,7 @@ from __future__ import annotations import abc import asyncio +import concurrent.futures import filecmp import json import os @@ -993,6 +994,11 @@ class NeonEnv: self.initial_timeline = config.initial_timeline attachment_service_port = self.port_distributor.get_port() + # Reserve the next port after attachment service for use by its postgres: this + # will assert out if the next port wasn't free. 
+ attachment_service_pg_port = self.port_distributor.get_port() + assert attachment_service_pg_port == attachment_service_port + 1 + self.control_plane_api: str = f"http://127.0.0.1:{attachment_service_port}" self.attachment_service: NeonAttachmentService = NeonAttachmentService( self, config.auth_enabled @@ -1071,16 +1077,27 @@ class NeonEnv: self.neon_cli.init(cfg, force=config.config_init_force) def start(self): - # Start up broker, pageserver and all safekeepers - self.broker.try_start() - + # Attachment service starts first, so that pageserver /re-attach calls don't + # bounce through retries on startup self.attachment_service.start() - for pageserver in self.pageservers: - pageserver.start() + # Start up broker, pageserver and all safekeepers + futs = [] + with concurrent.futures.ThreadPoolExecutor( + max_workers=2 + len(self.pageservers) + len(self.safekeepers) + ) as executor: + futs.append( + executor.submit(lambda: self.broker.try_start() or None) + ) # The `or None` is for the linter - for safekeeper in self.safekeepers: - safekeeper.start() + for pageserver in self.pageservers: + futs.append(executor.submit(lambda ps=pageserver: ps.start())) + + for safekeeper in self.safekeepers: + futs.append(executor.submit(lambda sk=safekeeper: sk.start())) + + for f in futs: + f.result() def stop(self, immediate=False, ps_assert_metric_no_errors=False): """ @@ -1652,8 +1669,10 @@ class NeonCli(AbstractNeonCli): id: int, overrides: Tuple[str, ...] = (), extra_env_vars: Optional[Dict[str, str]] = None, + register: bool = True, ) -> "subprocess.CompletedProcess[str]": - start_args = ["pageserver", "start", f"--id={id}", *overrides] + register_str = "true" if register else "false" + start_args = ["pageserver", "start", f"--id={id}", *overrides, f"--register={register_str}"] storage = self.env.pageserver_remote_storage append_pageserver_param_overrides( params_to_update=start_args, @@ -2080,6 +2099,7 @@ class NeonPageserver(PgProtocol): self, overrides: Tuple[str, ...] = (), extra_env_vars: Optional[Dict[str, str]] = None, + register: bool = True, ) -> "NeonPageserver": """ Start the page server. @@ -2089,7 +2109,7 @@ class NeonPageserver(PgProtocol): assert self.running is False self.env.neon_cli.pageserver_start( - self.id, overrides=overrides, extra_env_vars=extra_env_vars + self.id, overrides=overrides, extra_env_vars=extra_env_vars, register=register ) self.running = True return self diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 1a1425f069..d5d70951be 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -138,6 +138,7 @@ def test_create_snapshot( for sk in env.safekeepers: sk.stop() env.pageserver.stop() + env.attachment_service.stop() # Directory `compatibility_snapshot_dir` is uploaded to S3 in a workflow, keep the name in sync with it compatibility_snapshot_dir = ( @@ -226,11 +227,17 @@ def test_forward_compatibility( try: neon_env_builder.num_safekeepers = 3 + neon_local_binpath = neon_env_builder.neon_binpath env = neon_env_builder.from_repo_dir( compatibility_snapshot_dir / "repo", neon_binpath=compatibility_neon_bin, pg_distrib_dir=compatibility_postgres_distrib_dir, ) + + # Use current neon_local even though we're using old binaries for + # everything else: our test code is written for latest CLI args. 
+ env.neon_local_binpath = neon_local_binpath + neon_env_builder.start() check_neon_works( diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 63f6130af5..725ed63d1c 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -499,7 +499,8 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): # and serve clients. env.pageserver.stop() # Non-immediate: implicitly checking that shutdown doesn't hang waiting for CP env.pageserver.start( - overrides=("--pageserver-config-override=control_plane_emergency_mode=true",) + overrides=("--pageserver-config-override=control_plane_emergency_mode=true",), + register=False, ) # The pageserver should provide service to clients diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index b72e0f3c26..9d0f9bfcee 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -29,6 +29,7 @@ chrono = { version = "0.4", default-features = false, features = ["clock", "serd clap = { version = "4", features = ["derive", "string"] } clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] } crossbeam-utils = { version = "0.8" } +diesel = { version = "2", features = ["postgres", "serde_json"] } either = { version = "1" } fail = { version = "0.5", default-features = false, features = ["failpoints"] } futures-channel = { version = "0.3", features = ["sink"] } @@ -108,8 +109,10 @@ regex-automata = { version = "0.4", default-features = false, features = ["dfa-o regex-syntax = { version = "0.8" } serde = { version = "1", features = ["alloc", "derive"] } syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full", "visit"] } -syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "full", "visit", "visit-mut"] } +syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } time-macros = { version = "0.2", default-features = false, features = ["formatting", "parsing", "serde"] } +toml_datetime = { version = "0.6", default-features = false, features = ["serde"] } +toml_edit = { version = "0.19", features = ["serde"] } zstd = { version = "0.13" } zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] } zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] } From 3a36a0a2272dac6e3e2774f6e6b2a8e326d8df6c Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 26 Jan 2024 19:23:53 +0100 Subject: [PATCH 0009/1571] fix(test suite): some tests leak child processes (#6497) --- control_plane/src/endpoint.rs | 19 +++++++++++++++++-- test_runner/regress/test_import.py | 2 ++ test_runner/regress/test_neon_local_cli.py | 2 ++ 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index d3b0366d31..dcad22b992 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -438,7 +438,7 @@ impl Endpoint { } fn wait_for_compute_ctl_to_exit(&self, send_sigterm: bool) -> Result<()> { - // TODO use background_process::stop_process instead + // TODO use background_process::stop_process instead: https://github.com/neondatabase/neon/pull/6482 let pidfile_path = self.endpoint_path().join("compute_ctl.pid"); let pid: u32 = 
std::fs::read_to_string(pidfile_path)?.parse()?; let pid = nix::unistd::Pid::from_raw(pid as i32); @@ -583,9 +583,21 @@ impl Endpoint { } let child = cmd.spawn()?; + // set up a scopeguard to kill & wait for the child in case we panic or bail below + let child = scopeguard::guard(child, |mut child| { + println!("SIGKILL & wait the started process"); + (|| { + // TODO: use another signal that can be caught by the child so it can clean up any children it spawned + child.kill().context("SIGKILL child")?; + child.wait().context("wait() for child process")?; + anyhow::Ok(()) + })() + .with_context(|| format!("scopeguard kill&wait child {child:?}")) + .unwrap(); + }); // Write down the pid so we can wait for it when we want to stop - // TODO use background_process::start_process instead + // TODO use background_process::start_process instead: https://github.com/neondatabase/neon/pull/6482 let pid = child.id(); let pidfile_path = self.endpoint_path().join("compute_ctl.pid"); std::fs::write(pidfile_path, pid.to_string())?; @@ -634,6 +646,9 @@ impl Endpoint { std::thread::sleep(ATTEMPT_INTERVAL); } + // disarm the scopeguard, let the child outlive this function (and neon_local invoction) + drop(scopeguard::ScopeGuard::into_inner(child)); + Ok(()) } diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index faedf5d944..3519cbbaab 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -163,6 +163,8 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build endpoint = env.endpoints.create_start(endpoint_id, tenant_id=tenant) assert endpoint.safe_psql("select count(*) from t") == [(300000,)] + vanilla_pg.stop() + def test_import_from_pageserver_small( pg_bin: PgBin, neon_env_builder: NeonEnvBuilder, test_output_dir: Path diff --git a/test_runner/regress/test_neon_local_cli.py b/test_runner/regress/test_neon_local_cli.py index 46b72fbca5..8edba49b8a 100644 --- a/test_runner/regress/test_neon_local_cli.py +++ b/test_runner/regress/test_neon_local_cli.py @@ -59,3 +59,5 @@ def test_neon_two_primary_endpoints_fail( env.neon_cli.endpoint_stop("ep1") # ep1 is stopped so create ep2 will succeed env.neon_cli.endpoint_start("ep2") + # cleanup + env.neon_cli.endpoint_stop("ep2") From e34166a28fdd2b20b7d84c254a75c3a7819fe5b7 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 26 Jan 2024 22:48:34 +0100 Subject: [PATCH 0010/1571] CI: switch back to std-fs io engine for soak time before next release (#6492) PR #5824 introduced the concept of io engines in pageserver and implemented `tokio-epoll-uring` in addition to our current method, `std-fs`. We used `tokio-epoll-uring` in CI for a day to get more exposure to the code. Now it's time to switch CI back so that we test with `std-fs` as well, because that's what we're (still) using in production. 
--- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 7445501f00..84edc4fbc9 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -471,7 +471,7 @@ jobs: TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} CHECK_ONDISK_DATA_COMPATIBILITY: nonempty BUILD_TAG: ${{ needs.tag.outputs.build-tag }} - PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring + PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs - name: Merge and upload coverage data if: matrix.build_type == 'debug' && matrix.pg_version == 'v14' From 734755eaca42e9a70da0764a380c0e5b2447325e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Sat, 27 Jan 2024 05:16:11 +0100 Subject: [PATCH 0011/1571] Enable nextest retries for the arm build (#6496) Also make the NEXTEST_RETRIES declaration more local. Requested in https://github.com/neondatabase/neon/pull/6493#issuecomment-1912110202 --- .github/workflows/build_and_test.yml | 3 ++- .github/workflows/neon_extra_builds.yml | 14 ++++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 84edc4fbc9..12ed70c372 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -21,7 +21,6 @@ env: COPT: '-Werror' AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} - NEXTEST_RETRIES: 3 # A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix E2E_CONCURRENCY_GROUP: ${{ github.repository }}-${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} @@ -361,6 +360,8 @@ jobs: ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests - name: Run rust tests + env: + NEXTEST_RETRIES: 3 run: | for io_engine in std-fs tokio-epoll-uring ; do NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index c6c2b7386a..f8fb62d3f8 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -124,12 +124,12 @@ jobs: # Hence keeping target/ (and general cache size) smaller BUILD_TYPE: release CARGO_FEATURES: --features testing - CARGO_FLAGS: --locked --release + CARGO_FLAGS: --release AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned options: --init steps: @@ -210,18 +210,20 @@ jobs: - name: Run cargo build run: | - mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests + mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests - name: Run cargo test + env: + NEXTEST_RETRIES: 3 run: | - cargo test $CARGO_FLAGS $CARGO_FEATURES + cargo nextest run $CARGO_FEATURES # Run separate tests for real S3 export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests export REMOTE_STORAGE_S3_REGION=eu-central-1 # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - cargo test $CARGO_FLAGS --package 
remote_storage --test test_real_s3 + cargo nextest run --package remote_storage --test test_real_s3 # Run separate tests for real Azure Blob Storage # XXX: replace region with `eu-central-1`-like region @@ -231,7 +233,7 @@ jobs: export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}" export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}" # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure + cargo nextest run --package remote_storage --test test_real_azure check-codestyle-rust-arm: timeout-minutes: 90 From 3a8243043234500d3f4cd64270150e3295c9d167 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Sun, 28 Jan 2024 00:15:11 +0100 Subject: [PATCH 0012/1571] fixup(#6492): also switch the benchmarks that runs on merge-to-main back to std-fs (#6501) --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 12ed70c372..147d5cae2d 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -508,7 +508,7 @@ jobs: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}" - PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring + PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs # XXX: no coverage data handling here, since benchmarks are run on release builds, # while coverage is currently collected for the debug ones From 8253cf1931a53128a9a4f5fdd71add6f90dd2a60 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Sun, 28 Jan 2024 22:27:14 +0100 Subject: [PATCH 0013/1571] proxy: Relax endpoint check (#6503) ## Problem http-over-sql allowes host to be in format api.aws.... however it's not the case for the websocket flow. ## Summary of changes Relax endpoint check for the ws serverless connections. 
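As a rough sketch of the relaxed check described above (an illustration only, not the proxy's actual types: `EndpointId`, the real error enum, and the TLS config are replaced by plain `String`s and a `HashSet`, and the hostnames are made up), the serverless driver's shared `api.*` hostname now maps to "no endpoint" rather than an error:

```rust
use std::collections::HashSet;

// First SNI label used by the serverless driver (see `SERVERLESS_DRIVER_SNI` in the diff below).
const SERVERLESS_DRIVER_SNI: &str = "api";

// Split the SNI into `<subdomain>.<common name>`, reject unknown common names,
// and treat the serverless driver hostname as "no endpoint" instead of an error.
fn endpoint_from_sni(sni: &str, common_names: &HashSet<String>) -> Result<Option<String>, String> {
    let (subdomain, common_name) = sni
        .split_once('.')
        .ok_or_else(|| format!("unknown common name: {sni}"))?;
    if !common_names.contains(common_name) {
        return Err(format!("unknown common name: {common_name}"));
    }
    if subdomain == SERVERLESS_DRIVER_SNI {
        // Websocket/serverless connections may arrive on the shared `api.*` host,
        // so there is no per-endpoint subdomain to extract here.
        return Ok(None);
    }
    Ok(Some(subdomain.to_owned()))
}

fn main() {
    // Hypothetical common name, for illustration only.
    let common_names: HashSet<String> = ["example-domain.tld".to_owned()].into();
    assert_eq!(
        endpoint_from_sni("ep-foo-123.example-domain.tld", &common_names).unwrap(),
        Some("ep-foo-123".to_owned())
    );
    assert_eq!(
        endpoint_from_sni("api.example-domain.tld", &common_names).unwrap(),
        None
    );
}
```

A `None` from the SNI can then fall back to the endpoint passed via connection options, as in the `credentials.rs` hunk below.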
--- proxy/src/auth/credentials.rs | 20 ++++++++++++-------- proxy/src/serverless.rs | 2 ++ proxy/src/serverless/sql_over_http.rs | 11 ++++------- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index bdb79f2517..5bf7667a1f 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -2,7 +2,8 @@ use crate::{ auth::password_hack::parse_endpoint_param, context::RequestMonitoring, error::UserFacingError, - metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::NeonOptions, EndpointId, RoleName, + metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::NeonOptions, serverless::SERVERLESS_DRIVER_SNI, + EndpointId, RoleName, }; use itertools::Itertools; use pq_proto::StartupMessageParams; @@ -54,10 +55,10 @@ impl ComputeUserInfoMaybeEndpoint { } } -pub fn endpoint_sni<'a>( - sni: &'a str, +pub fn endpoint_sni( + sni: &str, common_names: &HashSet, -) -> Result<&'a str, ComputeUserInfoParseError> { +) -> Result, ComputeUserInfoParseError> { let Some((subdomain, common_name)) = sni.split_once('.') else { return Err(ComputeUserInfoParseError::UnknownCommonName { cn: sni.into() }); }; @@ -66,7 +67,10 @@ pub fn endpoint_sni<'a>( cn: common_name.into(), }); } - Ok(subdomain) + if subdomain == SERVERLESS_DRIVER_SNI { + return Ok(None); + } + Ok(Some(EndpointId::from(subdomain))) } impl ComputeUserInfoMaybeEndpoint { @@ -85,7 +89,6 @@ impl ComputeUserInfoMaybeEndpoint { // record the values if we have them ctx.set_application(params.get("application_name").map(SmolStr::from)); ctx.set_user(user.clone()); - ctx.set_endpoint_id(sni.map(EndpointId::from)); // Project name might be passed via PG's command-line options. let endpoint_option = params @@ -103,7 +106,7 @@ impl ComputeUserInfoMaybeEndpoint { let endpoint_from_domain = if let Some(sni_str) = sni { if let Some(cn) = common_names { - Some(EndpointId::from(endpoint_sni(sni_str, cn)?)) + endpoint_sni(sni_str, cn)? } else { None } @@ -117,12 +120,13 @@ impl ComputeUserInfoMaybeEndpoint { Some(Err(InconsistentProjectNames { domain, option })) } // Invariant: project name may not contain certain characters. 
- (a, b) => a.or(b).map(|name| match project_name_valid(&name) { + (a, b) => a.or(b).map(|name| match project_name_valid(name.as_ref()) { false => Err(MalformedProjectName(name)), true => Ok(name), }), } .transpose()?; + ctx.set_endpoint_id(endpoint.clone()); info!(%user, project = endpoint.as_deref(), "credentials"); if sni.is_some() { diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index 8af008394a..dfef4ccdfa 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -41,6 +41,8 @@ use tokio_util::sync::CancellationToken; use tracing::{error, info, info_span, warn, Instrument}; use utils::http::{error::ApiError, json::json_response}; +pub const SERVERLESS_DRIVER_SNI: &str = "api"; + pub async fn task_main( config: &'static ProxyConfig, ws_listener: TcpListener, diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index f108ab34ab..1e2ddaa2ff 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -1,6 +1,7 @@ use std::sync::Arc; use anyhow::bail; +use anyhow::Context; use futures::pin_mut; use futures::StreamExt; use hyper::body::HttpBody; @@ -35,11 +36,11 @@ use crate::config::TlsConfig; use crate::context::RequestMonitoring; use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE; use crate::proxy::NeonOptions; -use crate::EndpointId; use crate::RoleName; use super::conn_pool::ConnInfo; use super::conn_pool::GlobalConnPool; +use super::SERVERLESS_DRIVER_SNI; #[derive(serde::Deserialize)] struct QueryData { @@ -61,7 +62,6 @@ enum Payload { const MAX_RESPONSE_SIZE: usize = 10 * 1024 * 1024; // 10 MiB const MAX_REQUEST_SIZE: u64 = 10 * 1024 * 1024; // 10 MiB -const SERVERLESS_DRIVER_SNI_HOSTNAME_FIRST_PART: &str = "api"; static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output"); static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode"); @@ -188,9 +188,7 @@ fn get_conn_info( } } - let endpoint = endpoint_sni(hostname, &tls.common_names)?; - - let endpoint: EndpointId = endpoint.into(); + let endpoint = endpoint_sni(hostname, &tls.common_names)?.context("malformed endpoint")?; ctx.set_endpoint_id(Some(endpoint.clone())); let pairs = connection_url.query_pairs(); @@ -227,8 +225,7 @@ fn check_matches(sni_hostname: &str, hostname: &str) -> Result Date: Mon, 29 Jan 2024 07:39:16 +0200 Subject: [PATCH 0014/1571] Fix calculation of maximal multixact in ingest_multixact_create_record (#6502) ## Problem See https://neondb.slack.com/archives/C06F5UJH601/p1706373716661439 ## Summary of changes Use None instead of 0 as initial accumulator value for calculating maximal multixact XID. ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik Co-authored-by: Heikki Linnakangas --- pageserver/src/walingest.rs | 18 ++++++++++++------ test_runner/regress/test_next_xid.py | 12 +++++++++++- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 5a6f9a590f..93d1dcab35 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1363,16 +1363,22 @@ impl WalIngest { self.checkpoint.nextMultiOffset = xlrec.moff + xlrec.nmembers; self.checkpoint_modified = true; } - let max_mbr_xid = xlrec.members.iter().fold(0u32, |acc, mbr| { - if mbr.xid.wrapping_sub(acc) as i32 > 0 { - mbr.xid + let max_mbr_xid = xlrec.members.iter().fold(None, |acc, mbr| { + if let Some(max_xid) = acc { + if mbr.xid.wrapping_sub(max_xid) as i32 > 0 { + Some(mbr.xid) + } else { + acc + } } else { - acc + Some(mbr.xid) } }); - if self.checkpoint.update_next_xid(max_mbr_xid) { - self.checkpoint_modified = true; + if let Some(max_xid) = max_mbr_xid { + if self.checkpoint.update_next_xid(max_xid) { + self.checkpoint_modified = true; + } } Ok(()) } diff --git a/test_runner/regress/test_next_xid.py b/test_runner/regress/test_next_xid.py index da2580dbf9..e880445c4d 100644 --- a/test_runner/regress/test_next_xid.py +++ b/test_runner/regress/test_next_xid.py @@ -203,6 +203,16 @@ def test_import_at_2bil( $$; """ ) + + # Also create a multi-XID with members past the 2 billion mark + conn2 = endpoint.connect() + cur2 = conn2.cursor() + cur.execute("INSERT INTO t VALUES ('x')") + cur.execute("BEGIN; select * from t WHERE t = 'x' FOR SHARE;") + cur2.execute("BEGIN; select * from t WHERE t = 'x' FOR SHARE;") + cur.execute("COMMIT") + cur2.execute("COMMIT") + # A checkpoint writes a WAL record with xl_xid=0. Many other WAL # records would have the same effect. cur.execute("checkpoint") @@ -217,4 +227,4 @@ def test_import_at_2bil( conn = endpoint.connect() cur = conn.cursor() cur.execute("SELECT count(*) from t") - assert cur.fetchone() == (10000 + 1,) + assert cur.fetchone() == (10000 + 1 + 1,) From 511e730cc0be4295161332d5ee5da8148cf915f5 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 29 Jan 2024 07:26:20 +0000 Subject: [PATCH 0015/1571] hll experiment (#6312) ## Problem Measuring cardinality using logs is expensive and slow. ## Summary of changes Implement a pre-aggregated HyperLogLog-based cardinality estimate. HyperLogLog estimates the cardinality of a set by using the probability that the uniform hash of a value will have a run of n 0s at the end is `1/2^n`, therefore, having observed a run of `n` 0s suggests we have measured `2^n` distinct values. By using multiple shards, we can use the harmonic mean to get a more accurate estimate. We record this into a Prometheus time-series. HyperLogLog counts can be merged by taking the `max` of each shard. 
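For intuition, here is a toy, self-contained version of that estimator (an illustrative sketch only: `ToyHll`, the `SHARDS` constant, and the use of the standard library hasher instead of xxh3 are stand-ins, and the Prometheus plumbing and atomic shards are omitted, but the shard/zero-run arithmetic mirrors the new `HyperLogLog::record`):

```rust
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

const SHARDS: usize = 32; // must be a power of two

struct ToyHll {
    shards: [u8; SHARDS],
}

impl ToyHll {
    fn new() -> Self {
        Self { shards: [0; SHARDS] }
    }

    /// Low bits of the hash pick a shard; the shard keeps the longest zero run
    /// (plus one) observed in the remaining bits.
    fn measure<T: Hash>(&mut self, item: &T) {
        let mut h = DefaultHasher::new(); // stand-in hasher; the patch uses xxh3
        item.hash(&mut h);
        let hash = h.finish();
        let p = SHARDS.ilog2();
        let j = (hash & (SHARDS as u64 - 1)) as usize;
        let rho = ((hash >> p).leading_zeros() + 1 - p) as u8;
        self.shards[j] = self.shards[j].max(rho);
    }

    /// Merging two counts is a per-shard max, which is what `max by (hll_shard)`
    /// does on the exported time-series.
    fn merge(&mut self, other: &ToyHll) {
        for (a, b) in self.shards.iter_mut().zip(other.shards.iter()) {
            *a = (*a).max(*b);
        }
    }

    /// Harmonic mean of 2^-shard, scaled by alpha (0.697 for 32 shards, as in the tests below).
    fn estimate(&self) -> f64 {
        let sum: f64 = self.shards.iter().map(|&r| 2f64.powi(-(r as i32))).sum();
        0.697 * (SHARDS * SHARDS) as f64 / sum
    }
}

fn main() {
    let (mut a, mut b) = (ToyHll::new(), ToyHll::new());
    (0..500u32).for_each(|i| a.measure(&i));
    (250..750u32).for_each(|i| b.measure(&i));
    a.merge(&b);
    // True distinct count is 750; the estimate is approximate (roughly ±18% for 32 shards).
    println!("estimated distinct values: {:.0}", a.estimate());
}
```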
We can apply a `max_over_time` in order to find the estimate of cardinality of distinct values over time --- Cargo.lock | 20 ++ Cargo.toml | 1 + libs/metrics/Cargo.toml | 5 + libs/metrics/src/hll.rs | 523 ++++++++++++++++++++++++++++++++++++++ libs/metrics/src/lib.rs | 2 + proxy/src/context.rs | 5 + proxy/src/metrics.rs | 19 +- workspace_hack/Cargo.toml | 4 +- 8 files changed, 571 insertions(+), 8 deletions(-) create mode 100644 libs/metrics/src/hll.rs diff --git a/Cargo.lock b/Cargo.lock index f0bcfb762a..a669fef314 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2736,6 +2736,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "libm" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" + [[package]] name = "linux-raw-sys" version = "0.1.4" @@ -2832,6 +2838,9 @@ dependencies = [ "libc", "once_cell", "prometheus", + "rand 0.8.5", + "rand_distr", + "twox-hash", "workspace_hack", ] @@ -3057,6 +3066,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" dependencies = [ "autocfg", + "libm", ] [[package]] @@ -4171,6 +4181,16 @@ dependencies = [ "getrandom 0.2.11", ] +[[package]] +name = "rand_distr" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31" +dependencies = [ + "num-traits", + "rand 0.8.5", +] + [[package]] name = "rand_hc" version = "0.2.0" diff --git a/Cargo.toml b/Cargo.toml index 8afab02b15..29618ca328 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -165,6 +165,7 @@ tracing = "0.1" tracing-error = "0.2.0" tracing-opentelemetry = "0.20.0" tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] } +twox-hash = { version = "1.6.3", default-features = false } url = "2.2" uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] } walkdir = "2.3.2" diff --git a/libs/metrics/Cargo.toml b/libs/metrics/Cargo.toml index d4323ae766..a547d492df 100644 --- a/libs/metrics/Cargo.toml +++ b/libs/metrics/Cargo.toml @@ -9,5 +9,10 @@ prometheus.workspace = true libc.workspace = true once_cell.workspace = true chrono.workspace = true +twox-hash.workspace = true workspace_hack.workspace = true + +[dev-dependencies] +rand = "0.8" +rand_distr = "0.4.3" diff --git a/libs/metrics/src/hll.rs b/libs/metrics/src/hll.rs new file mode 100644 index 0000000000..46a623b0e2 --- /dev/null +++ b/libs/metrics/src/hll.rs @@ -0,0 +1,523 @@ +//! HyperLogLog is an algorithm for the count-distinct problem, +//! approximating the number of distinct elements in a multiset. +//! Calculating the exact cardinality of the distinct elements +//! of a multiset requires an amount of memory proportional to +//! the cardinality, which is impractical for very large data sets. +//! Probabilistic cardinality estimators, such as the HyperLogLog algorithm, +//! use significantly less memory than this, but can only approximate the cardinality. + +use std::{ + collections::HashMap, + hash::{BuildHasher, BuildHasherDefault, Hash, Hasher}, + sync::{atomic::AtomicU8, Arc, RwLock}, +}; + +use prometheus::{ + core::{self, Describer}, + proto, Opts, +}; +use twox_hash::xxh3; + +/// Create an [`HyperLogLogVec`] and registers to default registry. +#[macro_export(local_inner_macros)] +macro_rules! 
register_hll_vec { + ($N:literal, $OPTS:expr, $LABELS_NAMES:expr $(,)?) => {{ + let hll_vec = $crate::HyperLogLogVec::<$N>::new($OPTS, $LABELS_NAMES).unwrap(); + $crate::register(Box::new(hll_vec.clone())).map(|_| hll_vec) + }}; + + ($N:literal, $NAME:expr, $HELP:expr, $LABELS_NAMES:expr $(,)?) => {{ + $crate::register_hll_vec!($N, $crate::opts!($NAME, $HELP), $LABELS_NAMES) + }}; +} + +/// Create an [`HyperLogLog`] and registers to default registry. +#[macro_export(local_inner_macros)] +macro_rules! register_hll { + ($N:literal, $OPTS:expr $(,)?) => {{ + let hll = $crate::HyperLogLog::<$N>::with_opts($OPTS).unwrap(); + $crate::register(Box::new(hll.clone())).map(|_| hll) + }}; + + ($N:literal, $NAME:expr, $HELP:expr $(,)?) => {{ + $crate::register_hll!($N, $crate::opts!($NAME, $HELP), $LABELS_NAMES) + }}; +} + +/// HLL is a probabilistic cardinality measure. +/// +/// How to use this time-series for a metric name `my_metrics_total_hll`: +/// +/// ```promql +/// # harmonic mean +/// 1 / ( +/// sum ( +/// 2 ^ -( +/// # HLL merge operation +/// max (my_metrics_total_hll{}) by (hll_shard, other_labels...) +/// ) +/// ) without (hll_shard) +/// ) +/// * alpha +/// * shards_count +/// * shards_count +/// ``` +/// +/// If you want an estimate over time, you can use the following query: +/// +/// ```promql +/// # harmonic mean +/// 1 / ( +/// sum ( +/// 2 ^ -( +/// # HLL merge operation +/// max ( +/// max_over_time(my_metrics_total_hll{}[$__rate_interval]) +/// ) by (hll_shard, other_labels...) +/// ) +/// ) without (hll_shard) +/// ) +/// * alpha +/// * shards_count +/// * shards_count +/// ``` +/// +/// In the case of low cardinality, you might want to use the linear counting approximation: +/// +/// ```promql +/// # LinearCounting(m, V) = m log (m / V) +/// shards_count * ln(shards_count / +/// # calculate V = how many shards contain a 0 +/// count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard) +/// ) +/// ``` +/// +/// See for estimates on alpha +#[derive(Clone)] +pub struct HyperLogLogVec { + core: Arc>, +} + +struct HyperLogLogVecCore { + pub children: RwLock, BuildHasherDefault>>, + pub desc: core::Desc, + pub opts: Opts, +} + +impl core::Collector for HyperLogLogVec { + fn desc(&self) -> Vec<&core::Desc> { + vec![&self.core.desc] + } + + fn collect(&self) -> Vec { + let mut m = proto::MetricFamily::default(); + m.set_name(self.core.desc.fq_name.clone()); + m.set_help(self.core.desc.help.clone()); + m.set_field_type(proto::MetricType::GAUGE); + + let mut metrics = Vec::new(); + for child in self.core.children.read().unwrap().values() { + child.core.collect_into(&mut metrics); + } + m.set_metric(metrics); + + vec![m] + } +} + +impl HyperLogLogVec { + /// Create a new [`HyperLogLogVec`] based on the provided + /// [`Opts`] and partitioned by the given label names. At least one label name must be + /// provided. + pub fn new(opts: Opts, label_names: &[&str]) -> prometheus::Result { + assert!(N.is_power_of_two()); + let variable_names = label_names.iter().map(|s| (*s).to_owned()).collect(); + let opts = opts.variable_labels(variable_names); + + let desc = opts.describe()?; + let v = HyperLogLogVecCore { + children: RwLock::new(HashMap::default()), + desc, + opts, + }; + + Ok(Self { core: Arc::new(v) }) + } + + /// `get_metric_with_label_values` returns the [`HyperLogLog
<N>
`] for the given slice + /// of label values (same order as the VariableLabels in Desc). If that combination of + /// label values is accessed for the first time, a new [`HyperLogLog
<N>
`] is created. + /// + /// An error is returned if the number of label values is not the same as the + /// number of VariableLabels in Desc. + pub fn get_metric_with_label_values( + &self, + vals: &[&str], + ) -> prometheus::Result> { + self.core.get_metric_with_label_values(vals) + } + + /// `with_label_values` works as `get_metric_with_label_values`, but panics if an error + /// occurs. + pub fn with_label_values(&self, vals: &[&str]) -> HyperLogLog { + self.get_metric_with_label_values(vals).unwrap() + } +} + +impl HyperLogLogVecCore { + pub fn get_metric_with_label_values( + &self, + vals: &[&str], + ) -> prometheus::Result> { + let h = self.hash_label_values(vals)?; + + if let Some(metric) = self.children.read().unwrap().get(&h).cloned() { + return Ok(metric); + } + + self.get_or_create_metric(h, vals) + } + + pub(crate) fn hash_label_values(&self, vals: &[&str]) -> prometheus::Result { + if vals.len() != self.desc.variable_labels.len() { + return Err(prometheus::Error::InconsistentCardinality { + expect: self.desc.variable_labels.len(), + got: vals.len(), + }); + } + + let mut h = xxh3::Hash64::default(); + for val in vals { + h.write(val.as_bytes()); + } + + Ok(h.finish()) + } + + fn get_or_create_metric( + &self, + hash: u64, + label_values: &[&str], + ) -> prometheus::Result> { + let mut children = self.children.write().unwrap(); + // Check exist first. + if let Some(metric) = children.get(&hash).cloned() { + return Ok(metric); + } + + let metric = HyperLogLog::with_opts_and_label_values(&self.opts, label_values)?; + children.insert(hash, metric.clone()); + Ok(metric) + } +} + +/// HLL is a probabilistic cardinality measure. +/// +/// How to use this time-series for a metric name `my_metrics_total_hll`: +/// +/// ```promql +/// # harmonic mean +/// 1 / ( +/// sum ( +/// 2 ^ -( +/// # HLL merge operation +/// max (my_metrics_total_hll{}) by (hll_shard, other_labels...) +/// ) +/// ) without (hll_shard) +/// ) +/// * alpha +/// * shards_count +/// * shards_count +/// ``` +/// +/// If you want an estimate over time, you can use the following query: +/// +/// ```promql +/// # harmonic mean +/// 1 / ( +/// sum ( +/// 2 ^ -( +/// # HLL merge operation +/// max ( +/// max_over_time(my_metrics_total_hll{}[$__rate_interval]) +/// ) by (hll_shard, other_labels...) +/// ) +/// ) without (hll_shard) +/// ) +/// * alpha +/// * shards_count +/// * shards_count +/// ``` +/// +/// In the case of low cardinality, you might want to use the linear counting approximation: +/// +/// ```promql +/// # LinearCounting(m, V) = m log (m / V) +/// shards_count * ln(shards_count / +/// # calculate V = how many shards contain a 0 +/// count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard) +/// ) +/// ``` +/// +/// See for estimates on alpha +#[derive(Clone)] +pub struct HyperLogLog { + core: Arc>, +} + +impl HyperLogLog { + /// Create a [`HyperLogLog`] with the `name` and `help` arguments. + pub fn new, S2: Into>(name: S1, help: S2) -> prometheus::Result { + assert!(N.is_power_of_two()); + let opts = Opts::new(name, help); + Self::with_opts(opts) + } + + /// Create a [`HyperLogLog`] with the `opts` options. 
+ pub fn with_opts(opts: Opts) -> prometheus::Result { + Self::with_opts_and_label_values(&opts, &[]) + } + + fn with_opts_and_label_values(opts: &Opts, label_values: &[&str]) -> prometheus::Result { + let desc = opts.describe()?; + let labels = make_label_pairs(&desc, label_values)?; + + let v = HyperLogLogCore { + shards: [0; N].map(AtomicU8::new), + desc, + labels, + }; + Ok(Self { core: Arc::new(v) }) + } + + pub fn measure(&self, item: &impl Hash) { + // changing the hasher will break compatibility with previous measurements. + self.record(BuildHasherDefault::::default().hash_one(item)); + } + + fn record(&self, hash: u64) { + let p = N.ilog2() as u8; + let j = hash & (N as u64 - 1); + let rho = (hash >> p).leading_zeros() as u8 + 1 - p; + self.core.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed); + } +} + +struct HyperLogLogCore { + shards: [AtomicU8; N], + desc: core::Desc, + labels: Vec, +} + +impl core::Collector for HyperLogLog { + fn desc(&self) -> Vec<&core::Desc> { + vec![&self.core.desc] + } + + fn collect(&self) -> Vec { + let mut m = proto::MetricFamily::default(); + m.set_name(self.core.desc.fq_name.clone()); + m.set_help(self.core.desc.help.clone()); + m.set_field_type(proto::MetricType::GAUGE); + + let mut metrics = Vec::new(); + self.core.collect_into(&mut metrics); + m.set_metric(metrics); + + vec![m] + } +} + +impl HyperLogLogCore { + fn collect_into(&self, metrics: &mut Vec) { + self.shards.iter().enumerate().for_each(|(i, x)| { + let mut shard_label = proto::LabelPair::default(); + shard_label.set_name("hll_shard".to_owned()); + shard_label.set_value(format!("{i}")); + + // We reset the counter to 0 so we can perform a cardinality measure over any time slice in prometheus. + + // This seems like it would be a race condition, + // but HLL is not impacted by a write in one shard happening in between. + // This is because in PromQL we will be implementing a harmonic mean of all buckets. + // we will also merge samples in a time series using `max by (hll_shard)`. + + // TODO: maybe we shouldn't reset this on every collect, instead, only after a time window. + // this would mean that a dev port-forwarding the metrics url won't break the sampling. 
+ let v = x.swap(0, std::sync::atomic::Ordering::Relaxed); + + let mut m = proto::Metric::default(); + let mut c = proto::Gauge::default(); + c.set_value(v as f64); + m.set_gauge(c); + + let mut labels = Vec::with_capacity(self.labels.len() + 1); + labels.extend_from_slice(&self.labels); + labels.push(shard_label); + + m.set_label(labels); + metrics.push(m); + }) + } +} + +fn make_label_pairs( + desc: &core::Desc, + label_values: &[&str], +) -> prometheus::Result> { + if desc.variable_labels.len() != label_values.len() { + return Err(prometheus::Error::InconsistentCardinality { + expect: desc.variable_labels.len(), + got: label_values.len(), + }); + } + + let total_len = desc.variable_labels.len() + desc.const_label_pairs.len(); + if total_len == 0 { + return Ok(vec![]); + } + + if desc.variable_labels.is_empty() { + return Ok(desc.const_label_pairs.clone()); + } + + let mut label_pairs = Vec::with_capacity(total_len); + for (i, n) in desc.variable_labels.iter().enumerate() { + let mut label_pair = proto::LabelPair::default(); + label_pair.set_name(n.clone()); + label_pair.set_value(label_values[i].to_owned()); + label_pairs.push(label_pair); + } + + for label_pair in &desc.const_label_pairs { + label_pairs.push(label_pair.clone()); + } + label_pairs.sort(); + Ok(label_pairs) +} + +#[cfg(test)] +mod tests { + use std::collections::HashSet; + + use prometheus::{proto, Opts}; + use rand::{rngs::StdRng, Rng, SeedableRng}; + use rand_distr::{Distribution, Zipf}; + + use crate::HyperLogLogVec; + + fn collect(hll: &HyperLogLogVec<32>) -> Vec { + let mut metrics = vec![]; + hll.core + .children + .read() + .unwrap() + .values() + .for_each(|c| c.core.collect_into(&mut metrics)); + metrics + } + fn get_cardinality(metrics: &[proto::Metric], filter: impl Fn(&proto::Metric) -> bool) -> f64 { + let mut buckets = [0.0; 32]; + for metric in metrics.chunks_exact(32) { + if filter(&metric[0]) { + for (i, m) in metric.iter().enumerate() { + buckets[i] = f64::max(buckets[i], m.get_gauge().get_value()); + } + } + } + + buckets + .into_iter() + .map(|f| 2.0f64.powf(-f)) + .sum::() + .recip() + * 0.697 + * 32.0 + * 32.0 + } + + fn test_cardinality(n: usize, dist: impl Distribution) -> ([usize; 3], [f64; 3]) { + let hll = HyperLogLogVec::<32>::new(Opts::new("foo", "bar"), &["x"]).unwrap(); + + let mut iter = StdRng::seed_from_u64(0x2024_0112).sample_iter(dist); + let mut set_a = HashSet::new(); + let mut set_b = HashSet::new(); + + for x in iter.by_ref().take(n) { + set_a.insert(x.to_bits()); + hll.with_label_values(&["a"]).measure(&x.to_bits()); + } + for x in iter.by_ref().take(n) { + set_b.insert(x.to_bits()); + hll.with_label_values(&["b"]).measure(&x.to_bits()); + } + let merge = &set_a | &set_b; + + let metrics = collect(&hll); + let len = get_cardinality(&metrics, |_| true); + let len_a = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "a"); + let len_b = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "b"); + + ([merge.len(), set_a.len(), set_b.len()], [len, len_a, len_b]) + } + + #[test] + fn test_cardinality_small() { + let (actual, estimate) = test_cardinality(100, Zipf::new(100, 1.2f64).unwrap()); + + assert_eq!(actual, [46, 30, 32]); + assert!(51.3 < estimate[0] && estimate[0] < 51.4); + assert!(44.0 < estimate[1] && estimate[1] < 44.1); + assert!(39.0 < estimate[2] && estimate[2] < 39.1); + } + + #[test] + fn test_cardinality_medium() { + let (actual, estimate) = test_cardinality(10000, Zipf::new(10000, 1.2f64).unwrap()); + + assert_eq!(actual, [2529, 1618, 1629]); + 
assert!(2309.1 < estimate[0] && estimate[0] < 2309.2); + assert!(1566.6 < estimate[1] && estimate[1] < 1566.7); + assert!(1629.5 < estimate[2] && estimate[2] < 1629.6); + } + + #[test] + fn test_cardinality_large() { + let (actual, estimate) = test_cardinality(1_000_000, Zipf::new(1_000_000, 1.2f64).unwrap()); + + assert_eq!(actual, [129077, 79579, 79630]); + assert!(126067.2 < estimate[0] && estimate[0] < 126067.3); + assert!(83076.8 < estimate[1] && estimate[1] < 83076.9); + assert!(64251.2 < estimate[2] && estimate[2] < 64251.3); + } + + #[test] + fn test_cardinality_small2() { + let (actual, estimate) = test_cardinality(100, Zipf::new(200, 0.8f64).unwrap()); + + assert_eq!(actual, [92, 58, 60]); + assert!(116.1 < estimate[0] && estimate[0] < 116.2); + assert!(81.7 < estimate[1] && estimate[1] < 81.8); + assert!(69.3 < estimate[2] && estimate[2] < 69.4); + } + + #[test] + fn test_cardinality_medium2() { + let (actual, estimate) = test_cardinality(10000, Zipf::new(20000, 0.8f64).unwrap()); + + assert_eq!(actual, [8201, 5131, 5051]); + assert!(6846.4 < estimate[0] && estimate[0] < 6846.5); + assert!(5239.1 < estimate[1] && estimate[1] < 5239.2); + assert!(4292.8 < estimate[2] && estimate[2] < 4292.9); + } + + #[test] + fn test_cardinality_large2() { + let (actual, estimate) = test_cardinality(1_000_000, Zipf::new(2_000_000, 0.8f64).unwrap()); + + assert_eq!(actual, [777847, 482069, 482246]); + assert!(699437.4 < estimate[0] && estimate[0] < 699437.5); + assert!(374948.9 < estimate[1] && estimate[1] < 374949.0); + assert!(434609.7 < estimate[2] && estimate[2] < 434609.8); + } +} diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index d09ba11344..cb9914e5de 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -28,7 +28,9 @@ use prometheus::{Registry, Result}; pub mod launch_timestamp; mod wrappers; pub use wrappers::{CountedReader, CountedWriter}; +mod hll; pub mod metric_vec_duration; +pub use hll::{HyperLogLog, HyperLogLogVec}; pub type UIntGauge = GenericGauge; pub type UIntGaugeVec = GenericGaugeVec; diff --git a/proxy/src/context.rs b/proxy/src/context.rs index 9e2ea10031..ed2ed5e367 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -91,6 +91,11 @@ impl RequestMonitoring { pub fn set_endpoint_id(&mut self, endpoint_id: Option) { self.endpoint_id = endpoint_id.or_else(|| self.endpoint_id.clone()); + if let Some(ep) = &self.endpoint_id { + crate::metrics::CONNECTING_ENDPOINTS + .with_label_values(&[self.protocol]) + .measure(&ep); + } } pub fn set_application(&mut self, app: Option) { diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 6e4cbb3f3a..c7d566f645 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -1,10 +1,7 @@ use ::metrics::{ - exponential_buckets, register_int_counter_pair_vec, register_int_counter_vec, - IntCounterPairVec, IntCounterVec, -}; -use prometheus::{ - register_histogram, register_histogram_vec, register_int_gauge_vec, Histogram, HistogramVec, - IntGaugeVec, + exponential_buckets, register_histogram, register_histogram_vec, register_hll_vec, + register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge_vec, Histogram, + HistogramVec, HyperLogLogVec, IntCounterPairVec, IntCounterVec, IntGaugeVec, }; use once_cell::sync::Lazy; @@ -236,3 +233,13 @@ pub const fn bool_to_str(x: bool) -> &'static str { "false" } } + +pub static CONNECTING_ENDPOINTS: Lazy> = Lazy::new(|| { + register_hll_vec!( + 32, + "proxy_connecting_endpoints", + "HLL approximate cardinality of endpoints that are 
connecting", + &["protocol"], + ) + .unwrap() +}); diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 9d0f9bfcee..c29f8b422f 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -51,7 +51,7 @@ memchr = { version = "2" } nom = { version = "7" } num-bigint = { version = "0.4" } num-integer = { version = "0.1", features = ["i128"] } -num-traits = { version = "0.2", features = ["i128"] } +num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs", default-features = false, features = ["zstd"] } prost = { version = "0.11" } @@ -100,7 +100,7 @@ memchr = { version = "2" } nom = { version = "7" } num-bigint = { version = "0.4" } num-integer = { version = "0.1", features = ["i128"] } -num-traits = { version = "0.2", features = ["i128"] } +num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs", default-features = false, features = ["zstd"] } prost = { version = "0.11" } From 1e9a50bca8ee5887998d59c80a91516508985797 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 29 Jan 2024 10:38:40 +0200 Subject: [PATCH 0016/1571] disk_usage_eviction_task: cleanup summaries (#6490) This is the "partial revert" of #6384. The summaries turned out to be expensive due to naive vec usage, but also inconclusive because of the additional context required. In addition to removing summary traces, small refactoring is done. --- pageserver/src/disk_usage_eviction_task.rs | 352 +++++++-------------- 1 file changed, 120 insertions(+), 232 deletions(-) diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 800e52bb51..1f0525b045 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -97,23 +97,86 @@ pub enum EvictionOrder { /// Order the layers to be evicted by how recently they have been accessed relatively within /// the set of resident layers of a tenant. - /// - /// This strategy will evict layers more fairly but is untested. RelativeAccessed { - #[serde(default)] + /// Determines if the tenant with most layers should lose first. + /// + /// Having this enabled is currently the only reasonable option, because the order in which + /// we read tenants is deterministic. If we find the need to use this as `false`, we need + /// to ensure nondeterminism by adding in a random number to break the + /// `relative_last_activity==0.0` ties. + #[serde(default = "default_highest_layer_count_loses_first")] highest_layer_count_loses_first: bool, }, } +fn default_highest_layer_count_loses_first() -> bool { + true +} + impl EvictionOrder { - /// Return true, if with [`Self::RelativeAccessed`] order the tenants with the highest layer - /// counts should be the first ones to have their layers evicted. - fn highest_layer_count_loses_first(&self) -> bool { + fn sort(&self, candidates: &mut [(MinResidentSizePartition, EvictionCandidate)]) { + use EvictionOrder::*; + match self { - EvictionOrder::AbsoluteAccessed => false, - EvictionOrder::RelativeAccessed { + AbsoluteAccessed => { + candidates.sort_unstable_by_key(|(partition, candidate)| { + (*partition, candidate.last_activity_ts) + }); + } + RelativeAccessed { .. 
} => candidates.sort_unstable_by_key(|(partition, candidate)| { + (*partition, candidate.relative_last_activity) + }), + } + } + + /// Called to fill in the [`EvictionCandidate::relative_last_activity`] while iterating tenants + /// layers in **most** recently used order. + fn relative_last_activity(&self, total: usize, index: usize) -> finite_f32::FiniteF32 { + use EvictionOrder::*; + + match self { + AbsoluteAccessed => finite_f32::FiniteF32::ZERO, + RelativeAccessed { highest_layer_count_loses_first, - } => *highest_layer_count_loses_first, + } => { + // keeping the -1 or not decides if every tenant should lose their least recently accessed + // layer OR if this should happen in the order of having highest layer count: + let fudge = if *highest_layer_count_loses_first { + // relative_last_activity vs. tenant layer count: + // - 0.1..=1.0 (10 layers) + // - 0.01..=1.0 (100 layers) + // - 0.001..=1.0 (1000 layers) + // + // leading to evicting less of the smallest tenants. + 0 + } else { + // use full 0.0..=1.0 range, which means even the smallest tenants could always lose a + // layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could + // be that less than 10k layer evictions is enough, so we would not need to evict from + // all tenants. + // + // as the tenant ordering is now deterministic this could hit the same tenants + // disproportionetly on multiple invocations. alternative could be to remember how many + // layers did we evict last time from this tenant, and inject that as an additional + // fudge here. + 1 + }; + + let total = total.checked_sub(fudge).filter(|&x| x > 1).unwrap_or(1); + let divider = total as f32; + + // most recently used is always (total - 0) / divider == 1.0 + // least recently used depends on the fudge: + // - (total - 1) - (total - 1) / total => 0 / total + // - total - (total - 1) / total => 1 / total + let distance = (total - index) as f32; + + finite_f32::FiniteF32::try_from_normalized(distance / divider) + .unwrap_or_else(|val| { + tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={index}, total={total}: {val}"); + finite_f32::FiniteF32::ZERO + }) + } } } } @@ -389,52 +452,6 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( let selection = select_victims(&candidates, usage_pre); - let mut candidates = candidates; - - let selection = if matches!(eviction_order, EvictionOrder::RelativeAccessed { .. }) { - // we currently have the layers ordered by AbsoluteAccessed so that we can get the summary - // for comparison here. this is a temporary measure to develop alternatives. 
- use std::fmt::Write; - - let mut summary_buf = String::with_capacity(256); - - { - let absolute_summary = candidates - .iter() - .take(selection.amount) - .map(|(_, candidate)| candidate) - .collect::(); - - write!(summary_buf, "{absolute_summary}").expect("string grows"); - - info!("absolute accessed selection summary: {summary_buf}"); - } - - candidates.sort_unstable_by_key(|(partition, candidate)| { - (*partition, candidate.relative_last_activity) - }); - - let selection = select_victims(&candidates, usage_pre); - - { - summary_buf.clear(); - - let relative_summary = candidates - .iter() - .take(selection.amount) - .map(|(_, candidate)| candidate) - .collect::(); - - write!(summary_buf, "{relative_summary}").expect("string grows"); - - info!("relative accessed selection summary: {summary_buf}"); - } - - selection - } else { - selection - }; - let (evicted_amount, usage_planned) = selection.into_amount_and_planned(); // phase2: evict layers @@ -835,54 +852,12 @@ async fn collect_eviction_candidates( .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts)); let mut cumsum: i128 = 0; - // keeping the -1 or not decides if every tenant should lose their least recently accessed - // layer OR if this should happen in the order of having highest layer count: - let fudge = if eviction_order.highest_layer_count_loses_first() { - // relative_age vs. tenant layer count: - // - 0.1..=1.0 (10 layers) - // - 0.01..=1.0 (100 layers) - // - 0.001..=1.0 (1000 layers) - // - // leading to evicting less of the smallest tenants. - 0 - } else { - // use full 0.0..=1.0 range, which means even the smallest tenants could always lose a - // layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could - // be that less than 10k layer evictions is enough, so we would not need to evict from - // all tenants. - // - // as the tenant ordering is now deterministic this could hit the same tenants - // disproportionetly on multiple invocations. alternative could be to remember how many - // layers did we evict last time from this tenant, and inject that as an additional - // fudge here. - 1 - }; - - let total = tenant_candidates - .len() - .checked_sub(fudge) - .filter(|&x| x > 0) - // support 0 or 1 resident layer tenants as well - .unwrap_or(1); - let divider = total as f32; + let total = tenant_candidates.len(); for (i, mut candidate) in tenant_candidates.into_iter().enumerate() { // as we iterate this reverse sorted list, the most recently accessed layer will always // be 1.0; this is for us to evict it last. - candidate.relative_last_activity = if matches!( - eviction_order, - EvictionOrder::RelativeAccessed { .. } - ) { - // another possibility: use buckets, like (256.0 * relative_last_activity) as u8 or - // similarly for u16. unsure how it would help. 
- finite_f32::FiniteF32::try_from_normalized((total - i) as f32 / divider) - .unwrap_or_else(|val| { - tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={i}, total={total}: {val}"); - finite_f32::FiniteF32::ZERO - }) - } else { - finite_f32::FiniteF32::ZERO - }; + candidate.relative_last_activity = eviction_order.relative_last_activity(total, i); let partition = if cumsum > min_resident_size as i128 { MinResidentSizePartition::Above @@ -927,10 +902,7 @@ async fn collect_eviction_candidates( debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below, "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first"); - // always behave as if AbsoluteAccessed was selected. if RelativeAccessed is in use, we - // will sort later by candidate.relative_last_activity to get compare evictions. - candidates - .sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts)); + eviction_order.sort(&mut candidates); Ok(EvictionCandidates::Finished(candidates)) } @@ -1070,6 +1042,12 @@ pub(crate) mod finite_f32 { } } + impl From for f32 { + fn from(value: FiniteF32) -> f32 { + value.0 + } + } + impl FiniteF32 { pub const ZERO: FiniteF32 = FiniteF32(0.0); @@ -1082,136 +1060,9 @@ pub(crate) mod finite_f32 { Err(value) } } - } -} -mod summary { - use super::finite_f32::FiniteF32; - use super::{EvictionCandidate, LayerCount}; - use pageserver_api::shard::TenantShardId; - use std::collections::{BTreeMap, HashMap}; - use std::time::SystemTime; - - #[derive(Debug, Default)] - pub(super) struct EvictionSummary { - evicted_per_tenant: HashMap, - total: LayerCount, - - last_absolute: Option, - last_relative: Option, - } - - impl<'a> FromIterator<&'a EvictionCandidate> for EvictionSummary { - fn from_iter>(iter: T) -> Self { - let mut summary = EvictionSummary::default(); - for item in iter { - let counts = summary - .evicted_per_tenant - .entry(*item.layer.get_tenant_shard_id()) - .or_default(); - - let sz = item.layer.get_file_size(); - - counts.file_sizes += sz; - counts.count += 1; - - summary.total.file_sizes += sz; - summary.total.count += 1; - - summary.last_absolute = Some(item.last_activity_ts); - summary.last_relative = Some(item.relative_last_activity); - } - - summary - } - } - - struct SiBytesAmount(u64); - - impl std::fmt::Display for SiBytesAmount { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - if self.0 < 1024 { - return write!(f, "{}B", self.0); - } - - let mut tmp = self.0; - let mut ch = 0; - let suffixes = b"KMGTPE"; - - while tmp > 1024 * 1024 && ch < suffixes.len() - 1 { - tmp /= 1024; - ch += 1; - } - - let ch = suffixes[ch] as char; - - write!(f, "{:.1}{ch}iB", tmp as f64 / 1024.0) - } - } - - impl std::fmt::Display for EvictionSummary { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - // wasteful, but it's for testing - - let mut sorted: BTreeMap> = BTreeMap::new(); - - for (tenant_shard_id, count) in &self.evicted_per_tenant { - sorted - .entry(count.count) - .or_default() - .push((*tenant_shard_id, count.file_sizes)); - } - - let total_file_sizes = SiBytesAmount(self.total.file_sizes); - - writeln!( - f, - "selected {} layers of {total_file_sizes} up to ({:?}, {:.2?}):", - self.total.count, self.last_absolute, self.last_relative, - )?; - - for (count, per_tenant) in sorted.iter().rev().take(10) { - write!(f, "- {count} layers: ")?; - - if per_tenant.len() < 3 { - for (i, (tenant_shard_id, bytes)) in 
per_tenant.iter().enumerate() { - if i > 0 { - write!(f, ", ")?; - } - let bytes = SiBytesAmount(*bytes); - write!(f, "{tenant_shard_id} ({bytes})")?; - } - } else { - let num_tenants = per_tenant.len(); - let total_bytes = per_tenant.iter().map(|(_id, bytes)| bytes).sum::(); - let total_bytes = SiBytesAmount(total_bytes); - let layers = num_tenants * count; - - write!( - f, - "{num_tenants} tenants {total_bytes} in total {layers} layers", - )?; - } - - writeln!(f)?; - } - - if sorted.len() > 10 { - let (rem_count, rem_bytes) = sorted - .iter() - .rev() - .map(|(count, per_tenant)| { - ( - count, - per_tenant.iter().map(|(_id, bytes)| bytes).sum::(), - ) - }) - .fold((0, 0), |acc, next| (acc.0 + next.0, acc.1 + next.1)); - let rem_bytes = SiBytesAmount(rem_bytes); - writeln!(f, "- rest of tenants ({}) not shown ({rem_count} layers or {:.1}%, {rem_bytes} or {:.1}% bytes)", sorted.len() - 10, 100.0 * rem_count as f64 / self.total.count as f64, 100.0 * rem_bytes.0 as f64 / self.total.file_sizes as f64)?; - } - - Ok(()) + pub fn into_inner(self) -> f32 { + self.into() } } } @@ -1336,3 +1187,40 @@ mod filesystem_level_usage { assert!(!usage.has_pressure()); } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn relative_equal_bounds() { + let order = EvictionOrder::RelativeAccessed { + highest_layer_count_loses_first: false, + }; + + let len = 10; + let v = (0..len) + .map(|i| order.relative_last_activity(len, i).into_inner()) + .collect::>(); + + assert_eq!(v.first(), Some(&1.0)); + assert_eq!(v.last(), Some(&0.0)); + assert!(v.windows(2).all(|slice| slice[0] > slice[1])); + } + + #[test] + fn relative_spare_bounds() { + let order = EvictionOrder::RelativeAccessed { + highest_layer_count_loses_first: true, + }; + + let len = 10; + let v = (0..len) + .map(|i| order.relative_last_activity(len, i).into_inner()) + .collect::>(); + + assert_eq!(v.first(), Some(&1.0)); + assert_eq!(v.last(), Some(&0.1)); + assert!(v.windows(2).all(|slice| slice[0] > slice[1])); + } +} From 0c7b89235c3cd396077afd6ec01ef74cb7e87e77 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Mon, 29 Jan 2024 09:47:12 +0000 Subject: [PATCH 0017/1571] pageserver: add range layer map search implementation (#6469) ## Problem There's no efficient way of querying the layer map for a range. ## Summary of changes Introduce a range query for the layer map (`LayerMap::range_search`). There's two broad steps to it: 1. Find all coverage changes for layers that intersect the queried range (see `LayerCoverage::range_overlaps`). The slightly tricky part is dealing with the start of the range. We can either be aligned with a layer or not and we need to treat these cases differently. 2. Iterate over the coverage changes and collect the result. For this we use a two pointer approach: the trailing pointer tracks the start of the current range (current location in the key space) and the forward pointer tracks the next coverage change. Plugging the range search into the read path is deferred to a future PR. ## Performance I adapted the layer map benchmarks on a local branch. Range searches are between 2x and 2.5x slower than point searches. That's in line with what I expected since we query thelayer map twice. Since `Timeline::get` will proxy to `Timeline::get_vectored` we can special case the one element layer map range search at that point. 
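For intuition, here is a self-contained toy version of that two-pointer sweep (plain `i128` keys and `Option<u32>` layer ids stand in for `Key`, the coverage iterators, and `SearchResult`, and the delta/image distinction is ignored; the real `RangeSearchCollector` is in the diff below). The trailing pointer is `start`, the forward pointer walks the sorted coverage changes, and keys before the first change or between layers end up in the "not found" bucket:

```rust
use std::ops::Range;

/// Toy model: `changes` are (key, covering layer id or None) pairs, sorted by key
/// and already restricted to layers overlapping the query, much like what
/// `range_overlaps` yields for the real coverage. Returns (sub-range, layer) pairs;
/// `None` means the sub-range is not covered.
fn sweep_range(
    key_range: Range<i128>,
    changes: &[(i128, Option<u32>)],
) -> Vec<(Range<i128>, Option<u32>)> {
    let mut result = Vec::new();
    let mut start = key_range.start; // trailing pointer: start of the current sub-range
    let mut current: Option<u32> = None; // coverage in effect for [start, next change)

    for &(at, layer) in changes {
        // forward pointer: the next coverage change, clamped to the queried range
        let at = at.clamp(key_range.start, key_range.end);
        if at > start {
            result.push((start..at, current));
            start = at;
        }
        current = layer;
    }
    if start < key_range.end {
        result.push((start..key_range.end, current));
    }
    result
}

fn main() {
    // Layer 1 covers keys 10..20, layer 2 covers 30..40; query the range 5..35.
    let changes = [(10, Some(1)), (20, None), (30, Some(2))];
    for (range, layer) in sweep_range(5..35, &changes) {
        match layer {
            Some(id) => println!("{range:?} -> layer {id}"), // 10..20 -> 1, 30..35 -> 2
            None => println!("{range:?} -> not found"),      // 5..10 and 20..30
        }
    }
}
```

The real collector does the same walk, except that each emitted sub-range also chooses between the current delta and image coverage via `select_layer`.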
--- pageserver/src/tenant/layer_map.rs | 362 +++++++++++++++++- .../src/tenant/layer_map/layer_coverage.rs | 36 ++ .../src/tenant/storage_layer/layer_desc.rs | 6 +- 3 files changed, 399 insertions(+), 5 deletions(-) diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index c31d401e84..bb52e586d1 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -51,7 +51,10 @@ use crate::keyspace::KeyPartitioning; use crate::repository::Key; use crate::tenant::storage_layer::InMemoryLayer; use anyhow::Result; -use std::collections::VecDeque; +use pageserver_api::keyspace::KeySpaceAccum; +use std::cmp::Ordering; +use std::collections::{BTreeMap, VecDeque}; +use std::iter::Peekable; use std::ops::Range; use std::sync::Arc; use utils::lsn::Lsn; @@ -144,11 +147,221 @@ impl Drop for BatchedUpdates<'_> { } /// Return value of LayerMap::search +#[derive(Eq, PartialEq, Debug)] pub struct SearchResult { pub layer: Arc, pub lsn_floor: Lsn, } +pub struct OrderedSearchResult(SearchResult); + +impl Ord for OrderedSearchResult { + fn cmp(&self, other: &Self) -> Ordering { + self.0.lsn_floor.cmp(&other.0.lsn_floor) + } +} + +impl PartialOrd for OrderedSearchResult { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl PartialEq for OrderedSearchResult { + fn eq(&self, other: &Self) -> bool { + self.0.lsn_floor == other.0.lsn_floor + } +} + +impl Eq for OrderedSearchResult {} + +pub struct RangeSearchResult { + pub found: BTreeMap, + pub not_found: KeySpaceAccum, +} + +impl RangeSearchResult { + fn new() -> Self { + Self { + found: BTreeMap::new(), + not_found: KeySpaceAccum::new(), + } + } +} + +/// Collector for results of range search queries on the LayerMap. +/// It should be provided with two iterators for the delta and image coverage +/// that contain all the changes for layers which intersect the range. +struct RangeSearchCollector +where + Iter: Iterator>)>, +{ + delta_coverage: Peekable, + image_coverage: Peekable, + key_range: Range, + end_lsn: Lsn, + + current_delta: Option>, + current_image: Option>, + + result: RangeSearchResult, +} + +#[derive(Debug)] +enum NextLayerType { + Delta(i128), + Image(i128), + Both(i128), +} + +impl NextLayerType { + fn next_change_at_key(&self) -> Key { + match self { + NextLayerType::Delta(at) => Key::from_i128(*at), + NextLayerType::Image(at) => Key::from_i128(*at), + NextLayerType::Both(at) => Key::from_i128(*at), + } + } +} + +impl RangeSearchCollector +where + Iter: Iterator>)>, +{ + fn new( + key_range: Range, + end_lsn: Lsn, + delta_coverage: Iter, + image_coverage: Iter, + ) -> Self { + Self { + delta_coverage: delta_coverage.peekable(), + image_coverage: image_coverage.peekable(), + key_range, + end_lsn, + current_delta: None, + current_image: None, + result: RangeSearchResult::new(), + } + } + + /// Run the collector. Collection is implemented via a two pointer algorithm. + /// One pointer tracks the start of the current range and the other tracks + /// the beginning of the next range which will overlap with the next change + /// in coverage across both image and delta. 
+ fn collect(mut self) -> RangeSearchResult { + let next_layer_type = self.choose_next_layer_type(); + let mut current_range_start = match next_layer_type { + None => { + // No changes for the range + self.pad_range(self.key_range.clone()); + return self.result; + } + Some(layer_type) if self.key_range.end <= layer_type.next_change_at_key() => { + // Changes only after the end of the range + self.pad_range(self.key_range.clone()); + return self.result; + } + Some(layer_type) => { + // Changes for the range exist. Record anything before the first + // coverage change as not found. + let coverage_start = layer_type.next_change_at_key(); + let range_before = self.key_range.start..coverage_start; + self.pad_range(range_before); + + self.advance(&layer_type); + coverage_start + } + }; + + while current_range_start < self.key_range.end { + let next_layer_type = self.choose_next_layer_type(); + match next_layer_type { + Some(t) => { + let current_range_end = t.next_change_at_key(); + self.add_range(current_range_start..current_range_end); + current_range_start = current_range_end; + + self.advance(&t); + } + None => { + self.add_range(current_range_start..self.key_range.end); + current_range_start = self.key_range.end; + } + } + } + + self.result + } + + /// Mark a range as not found (i.e. no layers intersect it) + fn pad_range(&mut self, key_range: Range) { + if !key_range.is_empty() { + self.result.not_found.add_range(key_range); + } + } + + /// Select the appropiate layer for the given range and update + /// the collector. + fn add_range(&mut self, covered_range: Range) { + let selected = LayerMap::select_layer( + self.current_delta.clone(), + self.current_image.clone(), + self.end_lsn, + ); + + match selected { + Some(search_result) => self + .result + .found + .entry(OrderedSearchResult(search_result)) + .or_default() + .add_range(covered_range), + None => self.pad_range(covered_range), + } + } + + /// Move to the next coverage change. + fn advance(&mut self, layer_type: &NextLayerType) { + match layer_type { + NextLayerType::Delta(_) => { + let (_, layer) = self.delta_coverage.next().unwrap(); + self.current_delta = layer; + } + NextLayerType::Image(_) => { + let (_, layer) = self.image_coverage.next().unwrap(); + self.current_image = layer; + } + NextLayerType::Both(_) => { + let (_, image_layer) = self.image_coverage.next().unwrap(); + let (_, delta_layer) = self.delta_coverage.next().unwrap(); + + self.current_image = image_layer; + self.current_delta = delta_layer; + } + } + } + + /// Pick the next coverage change: the one at the lesser key or both if they're alligned. 
+ fn choose_next_layer_type(&mut self) -> Option { + let next_delta_at = self.delta_coverage.peek().map(|(key, _)| key); + let next_image_at = self.image_coverage.peek().map(|(key, _)| key); + + match (next_delta_at, next_image_at) { + (None, None) => None, + (Some(next_delta_at), None) => Some(NextLayerType::Delta(*next_delta_at)), + (None, Some(next_image_at)) => Some(NextLayerType::Image(*next_image_at)), + (Some(next_delta_at), Some(next_image_at)) if next_image_at < next_delta_at => { + Some(NextLayerType::Image(*next_image_at)) + } + (Some(next_delta_at), Some(next_image_at)) if next_delta_at < next_image_at => { + Some(NextLayerType::Delta(*next_delta_at)) + } + (Some(next_delta_at), Some(_)) => Some(NextLayerType::Both(*next_delta_at)), + } + } +} + impl LayerMap { /// /// Find the latest layer (by lsn.end) that covers the given @@ -186,7 +399,18 @@ impl LayerMap { let latest_delta = version.delta_coverage.query(key.to_i128()); let latest_image = version.image_coverage.query(key.to_i128()); - match (latest_delta, latest_image) { + Self::select_layer(latest_delta, latest_image, end_lsn) + } + + fn select_layer( + delta_layer: Option>, + image_layer: Option>, + end_lsn: Lsn, + ) -> Option { + assert!(delta_layer.as_ref().map_or(true, |l| l.is_delta())); + assert!(image_layer.as_ref().map_or(true, |l| !l.is_delta())); + + match (delta_layer, image_layer) { (None, None) => None, (None, Some(image)) => { let lsn_floor = image.get_lsn_range().start; @@ -223,6 +447,17 @@ impl LayerMap { } } + pub fn range_search(&self, key_range: Range, end_lsn: Lsn) -> Option { + let version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?; + + let raw_range = key_range.start.to_i128()..key_range.end.to_i128(); + let delta_changes = version.delta_coverage.range_overlaps(&raw_range); + let image_changes = version.image_coverage.range_overlaps(&raw_range); + + let collector = RangeSearchCollector::new(key_range, end_lsn, delta_changes, image_changes); + Some(collector.collect()) + } + /// Start a batch of updates, applied on drop pub fn batch_update(&mut self) -> BatchedUpdates<'_> { BatchedUpdates { layer_map: self } @@ -631,3 +866,126 @@ impl LayerMap { Ok(()) } } + +#[cfg(test)] +mod tests { + use super::*; + + #[derive(Clone)] + struct LayerDesc { + key_range: Range, + lsn_range: Range, + is_delta: bool, + } + + fn create_layer_map(layers: Vec) -> LayerMap { + let mut layer_map = LayerMap::default(); + + for layer in layers { + layer_map.insert_historic_noflush(PersistentLayerDesc::new_test( + layer.key_range, + layer.lsn_range, + layer.is_delta, + )); + } + + layer_map.flush_updates(); + layer_map + } + + fn assert_range_search_result_eq(lhs: RangeSearchResult, rhs: RangeSearchResult) { + assert_eq!(lhs.not_found.to_keyspace(), rhs.not_found.to_keyspace()); + let lhs: Vec<_> = lhs + .found + .into_iter() + .map(|(search_result, accum)| (search_result.0, accum.to_keyspace())) + .collect(); + let rhs: Vec<_> = rhs + .found + .into_iter() + .map(|(search_result, accum)| (search_result.0, accum.to_keyspace())) + .collect(); + + assert_eq!(lhs, rhs); + } + + fn brute_force_range_search( + layer_map: &LayerMap, + key_range: Range, + end_lsn: Lsn, + ) -> RangeSearchResult { + let mut range_search_result = RangeSearchResult::new(); + + let mut key = key_range.start; + while key != key_range.end { + let res = layer_map.search(key, end_lsn); + match res { + Some(res) => { + range_search_result + .found + .entry(OrderedSearchResult(res)) + .or_default() + .add_key(key); + } + None => { + 
range_search_result.not_found.add_key(key); + } + } + + key = key.next(); + } + + range_search_result + } + + #[test] + fn ranged_search_on_empty_layer_map() { + let layer_map = LayerMap::default(); + let range = Key::from_i128(100)..Key::from_i128(200); + + let res = layer_map.range_search(range, Lsn(100)); + assert!(res.is_none()); + } + + #[test] + fn ranged_search() { + let layers = vec![ + LayerDesc { + key_range: Key::from_i128(15)..Key::from_i128(50), + lsn_range: Lsn(0)..Lsn(5), + is_delta: false, + }, + LayerDesc { + key_range: Key::from_i128(10)..Key::from_i128(20), + lsn_range: Lsn(5)..Lsn(20), + is_delta: true, + }, + LayerDesc { + key_range: Key::from_i128(15)..Key::from_i128(25), + lsn_range: Lsn(20)..Lsn(30), + is_delta: true, + }, + LayerDesc { + key_range: Key::from_i128(35)..Key::from_i128(40), + lsn_range: Lsn(25)..Lsn(35), + is_delta: true, + }, + LayerDesc { + key_range: Key::from_i128(35)..Key::from_i128(40), + lsn_range: Lsn(35)..Lsn(40), + is_delta: false, + }, + ]; + + let layer_map = create_layer_map(layers.clone()); + for start in 0..60 { + for end in (start + 1)..60 { + let range = Key::from_i128(start)..Key::from_i128(end); + let result = layer_map.range_search(range.clone(), Lsn(100)).unwrap(); + let expected = brute_force_range_search(&layer_map, range, Lsn(100)); + + assert_range_search_result_eq(result, expected); + } + } + } +} diff --git a/pageserver/src/tenant/layer_map/layer_coverage.rs b/pageserver/src/tenant/layer_map/layer_coverage.rs index 1d9101d3d1..cf0085c071 100644 --- a/pageserver/src/tenant/layer_map/layer_coverage.rs +++ b/pageserver/src/tenant/layer_map/layer_coverage.rs @@ -129,6 +129,42 @@ impl LayerCoverage { .map(|(k, v)| (*k, v.as_ref().map(|x| x.1.clone()))) } + /// Returns an iterator which includes all coverage changes for layers that intersect + /// with the provided range. + pub fn range_overlaps( + &self, + key_range: &Range, + ) -> impl Iterator)> + '_ + where + Value: Eq, + { + let first_change = self.query(key_range.start); + match first_change { + Some(change) => { + // If the start of the range is covered, we have to deal with two cases: + // 1. Start of the range is aligned with the start of a layer. + // In this case the return of `self.range` will contain the layer which aligns with the start of the key range. + // We advance said iterator to avoid duplicating the first change. + // 2. Start of the range is not aligned with the start of a layer. 
+ let range = key_range.start..key_range.end; + let mut range_coverage = self.range(range).peekable(); + if range_coverage + .peek() + .is_some_and(|c| c.1.as_ref() == Some(&change)) + { + range_coverage.next(); + } + itertools::Either::Left( + std::iter::once((key_range.start, Some(change))).chain(range_coverage), + ) + } + None => { + let range = key_range.start..key_range.end; + let coverage = self.range(range); + itertools::Either::Right(coverage) + } + } + } /// O(1) clone pub fn clone(&self) -> Self { Self { diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs index bf24407fc5..fa78e9fdb2 100644 --- a/pageserver/src/tenant/storage_layer/layer_desc.rs +++ b/pageserver/src/tenant/storage_layer/layer_desc.rs @@ -55,13 +55,13 @@ impl PersistentLayerDesc { } #[cfg(test)] - pub fn new_test(key_range: Range) -> Self { + pub fn new_test(key_range: Range, lsn_range: Range, is_delta: bool) -> Self { Self { tenant_shard_id: TenantShardId::unsharded(TenantId::generate()), timeline_id: TimelineId::generate(), key_range, - lsn_range: Lsn(0)..Lsn(1), - is_delta: false, + lsn_range, + is_delta, file_size: 0, } } From b04a6acd6caa3ef29225ec75d442a93d640bd350 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 29 Jan 2024 13:31:56 +0000 Subject: [PATCH 0018/1571] docker: add attachment_service binary (#6506) ## Problem Creating sharded tenants will require an instance of the sharding service -- the initial goal is to deploy one of these in a staging region (https://github.com/neondatabase/cloud/issues/9718). It will run as a kubernetes container, similar to the storage broker, so needs to be built into the container image. ## Summary of changes Add `attachment_service` binary to container image --- Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Dockerfile b/Dockerfile index 5d5fde4f14..bb926643dc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -53,6 +53,7 @@ RUN set -e \ --bin pagectl \ --bin safekeeper \ --bin storage_broker \ + --bin attachment_service \ --bin proxy \ --bin neon_local \ --locked --release \ @@ -80,6 +81,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/attachment_service /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin From 6a85a06e1b7528ea365371ca80b3c6162ddf8610 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 29 Jan 2024 16:16:37 +0000 Subject: [PATCH 0019/1571] Compute: build rdkit without freetype support (#6495) ## Problem `rdkit` extension is built with `RDK_BUILD_FREETYPE_SUPPORT=ON` (by default), which requires a bunch of additional dependencies, but the support of freetype fonts isn't required for Postgres. 
With `RDK_BUILD_FREETYPE_SUPPORT=ON`: ``` ldd /usr/local/pgsql/lib/rdkit.so linux-vdso.so.1 (0x0000ffff82ea8000) libfreetype.so.6 => /usr/lib/aarch64-linux-gnu/libfreetype.so.6 (0x0000ffff825e5000) libboost_serialization.so.1.74.0 => /usr/lib/aarch64-linux-gnu/libboost_serialization.so.1.74.0 (0x0000ffff82590000) libpthread.so.0 => /lib/aarch64-linux-gnu/libpthread.so.0 (0x0000ffff8255f000) libstdc++.so.6 => /usr/lib/aarch64-linux-gnu/libstdc++.so.6 (0x0000ffff82387000) libm.so.6 => /lib/aarch64-linux-gnu/libm.so.6 (0x0000ffff822dc000) libgcc_s.so.1 => /lib/aarch64-linux-gnu/libgcc_s.so.1 (0x0000ffff822b8000) libc.so.6 => /lib/aarch64-linux-gnu/libc.so.6 (0x0000ffff82144000) libpng16.so.16 => /usr/lib/aarch64-linux-gnu/libpng16.so.16 (0x0000ffff820fd000) libz.so.1 => /lib/aarch64-linux-gnu/libz.so.1 (0x0000ffff820d3000) libbrotlidec.so.1 => /usr/lib/aarch64-linux-gnu/libbrotlidec.so.1 (0x0000ffff820b8000) /lib/ld-linux-aarch64.so.1 (0x0000ffff82e78000) libbrotlicommon.so.1 => /usr/lib/aarch64-linux-gnu/libbrotlicommon.so.1 (0x0000ffff82087000) ``` With `RDK_BUILD_FREETYPE_SUPPORT=OFF`: ``` ldd /usr/local/pgsql/lib/rdkit.so linux-vdso.so.1 (0x0000ffffbba75000) libboost_serialization.so.1.74.0 => /usr/lib/aarch64-linux-gnu/libboost_serialization.so.1.74.0 (0x0000ffffbb259000) libpthread.so.0 => /lib/aarch64-linux-gnu/libpthread.so.0 (0x0000ffffbb228000) libstdc++.so.6 => /usr/lib/aarch64-linux-gnu/libstdc++.so.6 (0x0000ffffbb050000) libm.so.6 => /lib/aarch64-linux-gnu/libm.so.6 (0x0000ffffbafa5000) libgcc_s.so.1 => /lib/aarch64-linux-gnu/libgcc_s.so.1 (0x0000ffffbaf81000) libc.so.6 => /lib/aarch64-linux-gnu/libc.so.6 (0x0000ffffbae0d000) /lib/ld-linux-aarch64.so.1 (0x0000ffffbba45000) ``` ## Summary of changes - Build `rdkit` with `RDK_BUILD_FREETYPE_SUPPORT=OFF` - Remove extra dependencies from the Compute image --- Dockerfile.compute-node | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 299c4097e8..d91c7cfd72 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -520,8 +520,7 @@ RUN apt-get update && \ libboost-regex1.74-dev \ libboost-serialization1.74-dev \ libboost-system1.74-dev \ - libeigen3-dev \ - libfreetype6-dev + libeigen3-dev ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \ @@ -547,6 +546,7 @@ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar. -D PostgreSQL_LIBRARY_DIR=`pg_config --libdir` \ -D RDK_INSTALL_INTREE=OFF \ -D RDK_INSTALL_COMIC_FONTS=OFF \ + -D RDK_BUILD_FREETYPE_SUPPORT=OFF \ -D CMAKE_BUILD_TYPE=Release \ . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -901,7 +901,7 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb # libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS # libxml2, libxslt1.1 for xml2 # libzstd1 for zstd -# libboost*, libfreetype6, and zlib1g for rdkit +# libboost* for rdkit # ca-certificates for communicating with s3 by compute_ctl RUN apt update && \ apt install --no-install-recommends -y \ @@ -914,7 +914,6 @@ RUN apt update && \ libboost-serialization1.74.0 \ libboost-system1.74.0 \ libossp-uuid16 \ - libfreetype6 \ libgeos-c1v5 \ libgdal28 \ libproj19 \ @@ -926,7 +925,6 @@ RUN apt update && \ libcurl4-openssl-dev \ locales \ procps \ - zlib1g \ ca-certificates && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 From b844c6f0c754f0994182f8c367a50a01e4b7e023 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 29 Jan 2024 17:59:26 +0100 Subject: [PATCH 0020/1571] Do pagination in list_object_versions call (#6500) ## Problem The tenants we want to recover might have tens of thousands of keys, or more. At that point, the AWS API returns a paginated response. ## Summary of changes Support paginated responses for `list_object_versions` requests. Follow-up of #6155, part of https://github.com/neondatabase/cloud/issues/8233 --- libs/remote_storage/src/s3_bucket.rs | 97 ++++++++++++++++++---------- 1 file changed, 62 insertions(+), 35 deletions(-) diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 4909b8522b..83f3015eab 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -646,7 +646,7 @@ impl RemoteStorage for S3Bucket { let timestamp = DateTime::from(timestamp); let done_if_after = DateTime::from(done_if_after); - tracing::trace!("Target time: {timestamp:?}, done_if_after {done_if_after:?}"); + tracing::info!("Target time: {timestamp:?}, done_if_after {done_if_after:?}"); // get the passed prefix or if it is not set use prefix_in_bucket value let prefix = prefix @@ -657,40 +657,67 @@ impl RemoteStorage for S3Bucket { let max_retries = 10; let is_permanent = |_e: &_| false; - let list = backoff::retry( - || async { - Ok(self - .client - .list_object_versions() - .bucket(self.bucket_name.clone()) - .set_prefix(prefix.clone()) - .send() - .await?) - }, - is_permanent, - warn_threshold, - max_retries, - "listing object versions for time_travel_recover", - backoff::Cancel::new(cancel.clone(), || anyhow!("Cancelled")), - ) - .await?; + let mut key_marker = None; + let mut version_id_marker = None; + let mut versions_and_deletes = Vec::new(); - if list.is_truncated().unwrap_or_default() { - anyhow::bail!("Received truncated ListObjectVersions response for prefix={prefix:?}"); + loop { + let response = backoff::retry( + || async { + Ok(self + .client + .list_object_versions() + .bucket(self.bucket_name.clone()) + .set_prefix(prefix.clone()) + .set_key_marker(key_marker.clone()) + .set_version_id_marker(version_id_marker.clone()) + .send() + .await?) 
+ }, + is_permanent, + warn_threshold, + max_retries, + "listing object versions for time_travel_recover", + backoff::Cancel::new(cancel.clone(), || anyhow!("Cancelled")), + ) + .await?; + + tracing::trace!( + " Got List response version_id_marker={:?}, key_marker={:?}", + response.version_id_marker, + response.key_marker + ); + let versions = response.versions.unwrap_or_default(); + let delete_markers = response.delete_markers.unwrap_or_default(); + let new_versions = versions.into_iter().map(VerOrDelete::Version); + let new_deletes = delete_markers.into_iter().map(VerOrDelete::DeleteMarker); + let new_versions_and_deletes = new_versions.chain(new_deletes); + versions_and_deletes.extend(new_versions_and_deletes); + fn none_if_empty(v: Option) -> Option { + v.filter(|v| !v.is_empty()) + } + version_id_marker = none_if_empty(response.next_version_id_marker); + key_marker = none_if_empty(response.next_key_marker); + if version_id_marker.is_none() { + // The final response is not supposed to be truncated + if response.is_truncated.unwrap_or_default() { + anyhow::bail!( + "Received truncated ListObjectVersions response for prefix={prefix:?}" + ); + } + break; + } } - let mut versions_deletes = list - .versions() - .iter() - .map(VerOrDelete::Version) - .chain(list.delete_markers().iter().map(VerOrDelete::DeleteMarker)) - .collect::>(); + // Work on the list of references instead of the objects directly, + // otherwise we get lifetime errors in the sort_by_key call below. + let mut versions_and_deletes = versions_and_deletes.iter().collect::>(); - versions_deletes.sort_by_key(|vd| (vd.key(), vd.last_modified())); + versions_and_deletes.sort_by_key(|vd| (vd.key(), vd.last_modified())); let mut vds_for_key = HashMap::<_, Vec<_>>::new(); - for vd in versions_deletes { + for vd in &versions_and_deletes { let last_modified = vd.last_modified(); let version_id = vd.version_id(); let key = vd.key(); @@ -811,25 +838,25 @@ fn start_measuring_requests( }) } -enum VerOrDelete<'a> { - Version(&'a ObjectVersion), - DeleteMarker(&'a DeleteMarkerEntry), +enum VerOrDelete { + Version(ObjectVersion), + DeleteMarker(DeleteMarkerEntry), } -impl<'a> VerOrDelete<'a> { - fn last_modified(&self) -> Option<&'a DateTime> { +impl VerOrDelete { + fn last_modified(&self) -> Option<&DateTime> { match self { VerOrDelete::Version(v) => v.last_modified(), VerOrDelete::DeleteMarker(v) => v.last_modified(), } } - fn version_id(&self) -> Option<&'a str> { + fn version_id(&self) -> Option<&str> { match self { VerOrDelete::Version(v) => v.version_id(), VerOrDelete::DeleteMarker(v) => v.version_id(), } } - fn key(&self) -> Option<&'a str> { + fn key(&self) -> Option<&str> { match self { VerOrDelete::Version(v) => v.key(), VerOrDelete::DeleteMarker(v) => v.key(), From ec8dcc223167aad145cc8b70cc3ac6801f0ed79c Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 29 Jan 2024 17:38:03 +0000 Subject: [PATCH 0021/1571] flatten proxy flow (#6447) ## Problem Taking my ideas from https://github.com/neondatabase/neon/pull/6283 and doing a bit less radical changes. smaller commits. Proxy flow was quite deeply nested, which makes adding more interesting error handling quite tricky. ## Summary of changes I recommend reviewing commit by commit. 1. move handshake logic into a separate file 2. move passthrough logic into a separate file 3. no longer accept a closure in CancelMap session logic 4. Remove connect_to_db, copy logic into handle_client 5. flatten auth_and_wake_compute in authenticate 6. 
record info for link auth --- proxy/src/auth/backend.rs | 26 +- proxy/src/auth/backend/link.rs | 6 + proxy/src/auth/credentials.rs | 8 +- proxy/src/bin/pg_sni_router.rs | 2 +- proxy/src/cancellation.rs | 93 +++---- proxy/src/context.rs | 12 +- proxy/src/proxy.rs | 351 ++++++-------------------- proxy/src/proxy/handshake.rs | 96 +++++++ proxy/src/proxy/passthrough.rs | 57 +++++ proxy/src/serverless.rs | 2 +- proxy/src/serverless/sql_over_http.rs | 2 +- proxy/src/serverless/websocket.rs | 2 +- 12 files changed, 297 insertions(+), 360 deletions(-) create mode 100644 proxy/src/proxy/handshake.rs create mode 100644 proxy/src/proxy/passthrough.rs diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index b1634906c9..4b8ebae86f 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -190,7 +190,10 @@ async fn auth_quirks( Err(info) => { let res = hacks::password_hack_no_authentication(info, client, &mut ctx.latency_timer) .await?; - ctx.set_endpoint_id(Some(res.info.endpoint.clone())); + + ctx.set_endpoint_id(res.info.endpoint.clone()); + tracing::Span::current().record("ep", &tracing::field::display(&res.info.endpoint)); + (res.info, Some(res.keys)) } Ok(info) => (info, None), @@ -271,19 +274,12 @@ async fn authenticate_with_secret( classic::authenticate(info, client, config, &mut ctx.latency_timer, secret).await } -/// Authenticate the user and then wake a compute (or retrieve an existing compute session from cache) -/// only if authentication was successfuly. -async fn auth_and_wake_compute( +/// wake a compute (or retrieve an existing compute session from cache) +async fn wake_compute( ctx: &mut RequestMonitoring, api: &impl console::Api, - user_info: ComputeUserInfoMaybeEndpoint, - client: &mut stream::PqStream>, - allow_cleartext: bool, - config: &'static AuthenticationConfig, + compute_credentials: ComputeCredentials, ) -> auth::Result<(CachedNodeInfo, ComputeUserInfo)> { - let compute_credentials = - auth_quirks(ctx, api, user_info, client, allow_cleartext, config).await?; - let mut num_retries = 0; let mut node = loop { let wake_res = api.wake_compute(ctx, &compute_credentials.info).await; @@ -358,16 +354,16 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> { "performing authentication using the console" ); - let (cache_info, user_info) = - auth_and_wake_compute(ctx, &*api, user_info, client, allow_cleartext, config) - .await?; + let compute_credentials = + auth_quirks(ctx, &*api, user_info, client, allow_cleartext, config).await?; + let (cache_info, user_info) = wake_compute(ctx, &*api, compute_credentials).await?; (cache_info, BackendType::Console(api, user_info)) } // NOTE: this auth backend doesn't use client credentials. 
Link(url) => { info!("performing link authentication"); - let node_info = link::authenticate(&url, client).await?; + let node_info = link::authenticate(ctx, &url, client).await?; ( CachedNodeInfo::new_uncached(node_info), diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index a7ddd257b3..d8ae362c03 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -1,6 +1,7 @@ use crate::{ auth, compute, console::{self, provider::NodeInfo}, + context::RequestMonitoring, error::UserFacingError, stream::PqStream, waiters, @@ -54,6 +55,7 @@ pub fn new_psql_session_id() -> String { } pub(super) async fn authenticate( + ctx: &mut RequestMonitoring, link_uri: &reqwest::Url, client: &mut PqStream, ) -> auth::Result { @@ -94,6 +96,10 @@ pub(super) async fn authenticate( .dbname(&db_info.dbname) .user(&db_info.user); + ctx.set_user(db_info.user.into()); + ctx.set_project(db_info.aux.clone()); + tracing::Span::current().record("ep", &tracing::field::display(&db_info.aux.endpoint_id)); + // Backwards compatibility. pg_sni_proxy uses "--" in domain names // while direct connections do not. Once we migrate to pg_sni_proxy // everywhere, we can remove this. diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 5bf7667a1f..875baaec47 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -126,7 +126,11 @@ impl ComputeUserInfoMaybeEndpoint { }), } .transpose()?; - ctx.set_endpoint_id(endpoint.clone()); + + if let Some(ep) = &endpoint { + ctx.set_endpoint_id(ep.clone()); + tracing::Span::current().record("ep", &tracing::field::display(ep)); + } info!(%user, project = endpoint.as_deref(), "credentials"); if sni.is_some() { @@ -150,7 +154,7 @@ impl ComputeUserInfoMaybeEndpoint { Ok(Self { user, - endpoint_id: endpoint.map(EndpointId::from), + endpoint_id: endpoint, options, }) } diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 1edbc1e7e7..471be7af25 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -272,5 +272,5 @@ async fn handle_client( let client = tokio::net::TcpStream::connect(destination).await?; let metrics_aux: MetricsAuxInfo = Default::default(); - proxy::proxy::proxy_pass(ctx, tls_stream, client, metrics_aux).await + proxy::proxy::passthrough::proxy_pass(ctx, tls_stream, client, metrics_aux).await } diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index a5eb3544b4..d4ee657144 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -1,7 +1,7 @@ -use anyhow::{bail, Context}; +use anyhow::Context; use dashmap::DashMap; use pq_proto::CancelKeyData; -use std::net::SocketAddr; +use std::{net::SocketAddr, sync::Arc}; use tokio::net::TcpStream; use tokio_postgres::{CancelToken, NoTls}; use tracing::info; @@ -25,39 +25,31 @@ impl CancelMap { } /// Run async action within an ephemeral session identified by [`CancelKeyData`]. - pub async fn with_session<'a, F, R, V>(&'a self, f: F) -> anyhow::Result - where - F: FnOnce(Session<'a>) -> R, - R: std::future::Future>, - { + pub fn get_session(self: Arc) -> Session { // HACK: We'd rather get the real backend_pid but tokio_postgres doesn't // expose it and we don't want to do another roundtrip to query // for it. The client will be able to notice that this is not the // actual backend_pid, but backend_pid is not used for anything // so it doesn't matter. 
- let key = rand::random(); + let key = loop { + let key = rand::random(); - // Random key collisions are unlikely to happen here, but they're still possible, - // which is why we have to take care not to rewrite an existing key. - match self.0.entry(key) { - dashmap::mapref::entry::Entry::Occupied(_) => { - bail!("query cancellation key already exists: {key}") + // Random key collisions are unlikely to happen here, but they're still possible, + // which is why we have to take care not to rewrite an existing key. + match self.0.entry(key) { + dashmap::mapref::entry::Entry::Occupied(_) => continue, + dashmap::mapref::entry::Entry::Vacant(e) => { + e.insert(None); + } } - dashmap::mapref::entry::Entry::Vacant(e) => { - e.insert(None); - } - } - - // This will guarantee that the session gets dropped - // as soon as the future is finished. - scopeguard::defer! { - self.0.remove(&key); - info!("dropped query cancellation key {key}"); - } + break key; + }; info!("registered new query cancellation key {key}"); - let session = Session::new(key, self); - f(session).await + Session { + key, + cancel_map: self, + } } #[cfg(test)] @@ -98,23 +90,17 @@ impl CancelClosure { } /// Helper for registering query cancellation tokens. -pub struct Session<'a> { +pub struct Session { /// The user-facing key identifying this session. key: CancelKeyData, /// The [`CancelMap`] this session belongs to. - cancel_map: &'a CancelMap, + cancel_map: Arc, } -impl<'a> Session<'a> { - fn new(key: CancelKeyData, cancel_map: &'a CancelMap) -> Self { - Self { key, cancel_map } - } -} - -impl Session<'_> { +impl Session { /// Store the cancel token for the given session. /// This enables query cancellation in `crate::proxy::prepare_client_connection`. - pub fn enable_query_cancellation(self, cancel_closure: CancelClosure) -> CancelKeyData { + pub fn enable_query_cancellation(&self, cancel_closure: CancelClosure) -> CancelKeyData { info!("enabling query cancellation for this session"); self.cancel_map.0.insert(self.key, Some(cancel_closure)); @@ -122,37 +108,26 @@ impl Session<'_> { } } +impl Drop for Session { + fn drop(&mut self) { + self.cancel_map.0.remove(&self.key); + info!("dropped query cancellation key {}", &self.key); + } +} + #[cfg(test)] mod tests { use super::*; - use once_cell::sync::Lazy; #[tokio::test] async fn check_session_drop() -> anyhow::Result<()> { - static CANCEL_MAP: Lazy = Lazy::new(Default::default); - - let (tx, rx) = tokio::sync::oneshot::channel(); - let task = tokio::spawn(CANCEL_MAP.with_session(|session| async move { - assert!(CANCEL_MAP.contains(&session)); - - tx.send(()).expect("failed to send"); - futures::future::pending::<()>().await; // sleep forever - - Ok(()) - })); - - // Wait until the task has been spawned. - rx.await.context("failed to hear from the task")?; - - // Drop the session's entry by cancelling the task. - task.abort(); - let error = task.await.expect_err("task should have failed"); - if !error.is_cancelled() { - anyhow::bail!(error); - } + let cancel_map: Arc = Default::default(); + let session = cancel_map.clone().get_session(); + assert!(cancel_map.contains(&session)); + drop(session); // Check that the session has been dropped. 
- assert!(CANCEL_MAP.is_empty()); + assert!(cancel_map.is_empty()); Ok(()) } diff --git a/proxy/src/context.rs b/proxy/src/context.rs index ed2ed5e367..e2b0294cd3 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -89,13 +89,11 @@ impl RequestMonitoring { self.project = Some(x.project_id); } - pub fn set_endpoint_id(&mut self, endpoint_id: Option) { - self.endpoint_id = endpoint_id.or_else(|| self.endpoint_id.clone()); - if let Some(ep) = &self.endpoint_id { - crate::metrics::CONNECTING_ENDPOINTS - .with_label_values(&[self.protocol]) - .measure(&ep); - } + pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) { + crate::metrics::CONNECTING_ENDPOINTS + .with_label_values(&[self.protocol]) + .measure(&endpoint_id); + self.endpoint_id = Some(endpoint_id); } pub fn set_application(&mut self, app: Option) { diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 087cc7f7a9..4aa1f3590d 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -2,37 +2,34 @@ mod tests; pub mod connect_compute; +pub mod handshake; +pub mod passthrough; pub mod retry; use crate::{ auth, cancellation::{self, CancelMap}, compute, - config::{AuthenticationConfig, ProxyConfig, TlsConfig}, - console::messages::MetricsAuxInfo, + config::{ProxyConfig, TlsConfig}, context::RequestMonitoring, - metrics::{ - NUM_BYTES_PROXIED_COUNTER, NUM_BYTES_PROXIED_PER_CLIENT_COUNTER, - NUM_CLIENT_CONNECTION_GAUGE, NUM_CONNECTION_REQUESTS_GAUGE, - }, + metrics::{NUM_CLIENT_CONNECTION_GAUGE, NUM_CONNECTION_REQUESTS_GAUGE}, protocol2::WithClientIp, + proxy::{handshake::handshake, passthrough::proxy_pass}, rate_limiter::EndpointRateLimiter, stream::{PqStream, Stream}, - usage_metrics::{Ids, USAGE_METRICS}, EndpointCacheKey, }; use anyhow::{bail, Context}; use futures::TryFutureExt; use itertools::Itertools; use once_cell::sync::OnceCell; -use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams}; +use pq_proto::{BeMessage as Be, StartupMessageParams}; use regex::Regex; use smol_str::{format_smolstr, SmolStr}; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; use tokio_util::sync::CancellationToken; use tracing::{error, info, info_span, Instrument}; -use utils::measured_stream::MeasuredStream; use self::connect_compute::{connect_to_compute, TcpMechanism}; @@ -80,6 +77,13 @@ pub async fn task_main( let cancel_map = Arc::clone(&cancel_map); let endpoint_rate_limiter = endpoint_rate_limiter.clone(); + let session_span = info_span!( + "handle_client", + ?session_id, + peer_addr = tracing::field::Empty, + ep = tracing::field::Empty, + ); + connections.spawn( async move { info!("accepted postgres client connection"); @@ -103,22 +107,18 @@ pub async fn task_main( handle_client( config, &mut ctx, - &cancel_map, + cancel_map, socket, ClientMode::Tcp, endpoint_rate_limiter, ) .await } - .instrument(info_span!( - "handle_client", - ?session_id, - peer_addr = tracing::field::Empty - )) .unwrap_or_else(move |e| { // Acknowledge that the task has finished with an error. 
- error!(?session_id, "per-client task finished with an error: {e:#}"); - }), + error!("per-client task finished with an error: {e:#}"); + }) + .instrument(session_span), ); } @@ -171,7 +171,7 @@ impl ClientMode { pub async fn handle_client( config: &'static ProxyConfig, ctx: &mut RequestMonitoring, - cancel_map: &CancelMap, + cancel_map: Arc, stream: S, mode: ClientMode, endpoint_rate_limiter: Arc, @@ -192,138 +192,88 @@ pub async fn handle_client( let tls = config.tls_config.as_ref(); let pause = ctx.latency_timer.pause(); - let do_handshake = handshake(stream, mode.handshake_tls(tls), cancel_map); + let do_handshake = handshake(stream, mode.handshake_tls(tls), &cancel_map); let (mut stream, params) = match do_handshake.await? { Some(x) => x, None => return Ok(()), // it's a cancellation request }; drop(pause); + let hostname = mode.hostname(stream.get_ref()); + + let common_names = tls.map(|tls| &tls.common_names); + // Extract credentials which we're going to use for auth. - let user_info = { - let hostname = mode.hostname(stream.get_ref()); + let result = config + .auth_backend + .as_ref() + .map(|_| auth::ComputeUserInfoMaybeEndpoint::parse(ctx, ¶ms, hostname, common_names)) + .transpose(); - let common_names = tls.map(|tls| &tls.common_names); - let result = config - .auth_backend - .as_ref() - .map(|_| { - auth::ComputeUserInfoMaybeEndpoint::parse(ctx, ¶ms, hostname, common_names) - }) - .transpose(); + let user_info = match result { + Ok(user_info) => user_info, + Err(e) => stream.throw_error(e).await?, + }; - match result { - Ok(user_info) => user_info, - Err(e) => stream.throw_error(e).await?, + // check rate limit + if let Some(ep) = user_info.get_endpoint() { + if !endpoint_rate_limiter.check(ep) { + return stream + .throw_error(auth::AuthError::too_many_connections()) + .await; + } + } + + let user = user_info.get_user().to_owned(); + let (mut node_info, user_info) = match user_info + .authenticate( + ctx, + &mut stream, + mode.allow_cleartext(), + &config.authentication_config, + ) + .await + { + Ok(auth_result) => auth_result, + Err(e) => { + let db = params.get("database"); + let app = params.get("application_name"); + let params_span = tracing::info_span!("", ?user, ?db, ?app); + + return stream.throw_error(e).instrument(params_span).await; } }; - ctx.set_endpoint_id(user_info.get_endpoint()); + node_info.allow_self_signed_compute = mode.allow_self_signed_compute(config); - let client = Client::new( - stream, - user_info, - ¶ms, - mode.allow_self_signed_compute(config), - endpoint_rate_limiter, - ); - cancel_map - .with_session(|session| { - client.connect_to_db(ctx, session, mode, &config.authentication_config) - }) - .await -} + let aux = node_info.aux.clone(); + let mut node = connect_to_compute( + ctx, + &TcpMechanism { params: ¶ms }, + node_info, + &user_info, + ) + .or_else(|e| stream.throw_error(e)) + .await?; -/// Establish a (most probably, secure) connection with the client. -/// For better testing experience, `stream` can be any object satisfying the traits. -/// It's easier to work with owned `stream` here as we need to upgrade it to TLS; -/// we also take an extra care of propagating only the select handshake errors to client. 
-#[tracing::instrument(skip_all)] -async fn handshake( - stream: S, - mut tls: Option<&TlsConfig>, - cancel_map: &CancelMap, -) -> anyhow::Result>, StartupMessageParams)>> { - // Client may try upgrading to each protocol only once - let (mut tried_ssl, mut tried_gss) = (false, false); + let session = cancel_map.get_session(); + prepare_client_connection(&node, &session, &mut stream).await?; - let mut stream = PqStream::new(Stream::from_raw(stream)); - loop { - let msg = stream.read_startup_packet().await?; - info!("received {msg:?}"); + // Before proxy passing, forward to compute whatever data is left in the + // PqStream input buffer. Normally there is none, but our serverless npm + // driver in pipeline mode sends startup, password and first query + // immediately after opening the connection. + let (stream, read_buf) = stream.into_inner(); + node.stream.write_all(&read_buf).await?; - use FeStartupPacket::*; - match msg { - SslRequest => match stream.get_ref() { - Stream::Raw { .. } if !tried_ssl => { - tried_ssl = true; - - // We can't perform TLS handshake without a config - let enc = tls.is_some(); - stream.write_message(&Be::EncryptionResponse(enc)).await?; - if let Some(tls) = tls.take() { - // Upgrade raw stream into a secure TLS-backed stream. - // NOTE: We've consumed `tls`; this fact will be used later. - - let (raw, read_buf) = stream.into_inner(); - // TODO: Normally, client doesn't send any data before - // server says TLS handshake is ok and read_buf is empy. - // However, you could imagine pipelining of postgres - // SSLRequest + TLS ClientHello in one hunk similar to - // pipelining in our node js driver. We should probably - // support that by chaining read_buf with the stream. - if !read_buf.is_empty() { - bail!("data is sent before server replied with EncryptionResponse"); - } - let tls_stream = raw.upgrade(tls.to_server_config()).await?; - - let (_, tls_server_end_point) = tls - .cert_resolver - .resolve(tls_stream.get_ref().1.server_name()) - .context("missing certificate")?; - - stream = PqStream::new(Stream::Tls { - tls: Box::new(tls_stream), - tls_server_end_point, - }); - } - } - _ => bail!(ERR_PROTO_VIOLATION), - }, - GssEncRequest => match stream.get_ref() { - Stream::Raw { .. } if !tried_gss => { - tried_gss = true; - - // Currently, we don't support GSSAPI - stream.write_message(&Be::EncryptionResponse(false)).await?; - } - _ => bail!(ERR_PROTO_VIOLATION), - }, - StartupMessage { params, .. } => { - // Check that the config has been consumed during upgrade - // OR we didn't provide it at all (for dev purposes). - if tls.is_some() { - stream.throw_error_str(ERR_INSECURE_CONNECTION).await?; - } - - info!(session_type = "normal", "successful handshake"); - break Ok(Some((stream, params))); - } - CancelRequest(cancel_key_data) => { - cancel_map.cancel_session(cancel_key_data).await?; - - info!(session_type = "cancellation", "successful handshake"); - break Ok(None); - } - } - } + proxy_pass(ctx, stream, node.stream, aux).await } /// Finish client connection initialization: confirm auth success, send params, etc. #[tracing::instrument(skip_all)] async fn prepare_client_connection( node: &compute::PostgresConnection, - session: cancellation::Session<'_>, + session: &cancellation::Session, stream: &mut PqStream, ) -> anyhow::Result<()> { // Register compute's query cancellation token and produce a new, unique one. @@ -349,151 +299,6 @@ async fn prepare_client_connection( Ok(()) } -/// Forward bytes in both directions (client <-> compute). 
-#[tracing::instrument(skip_all)] -pub async fn proxy_pass( - ctx: &mut RequestMonitoring, - client: impl AsyncRead + AsyncWrite + Unpin, - compute: impl AsyncRead + AsyncWrite + Unpin, - aux: MetricsAuxInfo, -) -> anyhow::Result<()> { - ctx.set_success(); - ctx.log(); - - let usage = USAGE_METRICS.register(Ids { - endpoint_id: aux.endpoint_id.clone(), - branch_id: aux.branch_id.clone(), - }); - - let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]); - let m_sent2 = NUM_BYTES_PROXIED_PER_CLIENT_COUNTER.with_label_values(&aux.traffic_labels("tx")); - let mut client = MeasuredStream::new( - client, - |_| {}, - |cnt| { - // Number of bytes we sent to the client (outbound). - m_sent.inc_by(cnt as u64); - m_sent2.inc_by(cnt as u64); - usage.record_egress(cnt as u64); - }, - ); - - let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["rx"]); - let m_recv2 = NUM_BYTES_PROXIED_PER_CLIENT_COUNTER.with_label_values(&aux.traffic_labels("rx")); - let mut compute = MeasuredStream::new( - compute, - |_| {}, - |cnt| { - // Number of bytes the client sent to the compute node (inbound). - m_recv.inc_by(cnt as u64); - m_recv2.inc_by(cnt as u64); - }, - ); - - // Starting from here we only proxy the client's traffic. - info!("performing the proxy pass..."); - let _ = tokio::io::copy_bidirectional(&mut client, &mut compute).await?; - - Ok(()) -} - -/// Thin connection context. -struct Client<'a, S> { - /// The underlying libpq protocol stream. - stream: PqStream>, - /// Client credentials that we care about. - user_info: auth::BackendType<'a, auth::ComputeUserInfoMaybeEndpoint>, - /// KV-dictionary with PostgreSQL connection params. - params: &'a StartupMessageParams, - /// Allow self-signed certificates (for testing). - allow_self_signed_compute: bool, - /// Rate limiter for endpoints - endpoint_rate_limiter: Arc, -} - -impl<'a, S> Client<'a, S> { - /// Construct a new connection context. - fn new( - stream: PqStream>, - user_info: auth::BackendType<'a, auth::ComputeUserInfoMaybeEndpoint>, - params: &'a StartupMessageParams, - allow_self_signed_compute: bool, - endpoint_rate_limiter: Arc, - ) -> Self { - Self { - stream, - user_info, - params, - allow_self_signed_compute, - endpoint_rate_limiter, - } - } -} - -impl Client<'_, S> { - /// Let the client authenticate and connect to the designated compute node. - // Instrumentation logs endpoint name everywhere. Doesn't work for link - // auth; strictly speaking we don't know endpoint name in its case. 
- #[tracing::instrument(name = "", fields(ep = %self.user_info.get_endpoint().unwrap_or_default()), skip_all)] - async fn connect_to_db( - self, - ctx: &mut RequestMonitoring, - session: cancellation::Session<'_>, - mode: ClientMode, - config: &'static AuthenticationConfig, - ) -> anyhow::Result<()> { - let Self { - mut stream, - user_info, - params, - allow_self_signed_compute, - endpoint_rate_limiter, - } = self; - - // check rate limit - if let Some(ep) = user_info.get_endpoint() { - if !endpoint_rate_limiter.check(ep) { - return stream - .throw_error(auth::AuthError::too_many_connections()) - .await; - } - } - - let user = user_info.get_user().to_owned(); - let auth_result = match user_info - .authenticate(ctx, &mut stream, mode.allow_cleartext(), config) - .await - { - Ok(auth_result) => auth_result, - Err(e) => { - let db = params.get("database"); - let app = params.get("application_name"); - let params_span = tracing::info_span!("", ?user, ?db, ?app); - - return stream.throw_error(e).instrument(params_span).await; - } - }; - - let (mut node_info, user_info) = auth_result; - - node_info.allow_self_signed_compute = allow_self_signed_compute; - - let aux = node_info.aux.clone(); - let mut node = connect_to_compute(ctx, &TcpMechanism { params }, node_info, &user_info) - .or_else(|e| stream.throw_error(e)) - .await?; - - prepare_client_connection(&node, session, &mut stream).await?; - // Before proxy passing, forward to compute whatever data is left in the - // PqStream input buffer. Normally there is none, but our serverless npm - // driver in pipeline mode sends startup, password and first query - // immediately after opening the connection. - let (stream, read_buf) = stream.into_inner(); - node.stream.write_all(&read_buf).await?; - proxy_pass(ctx, stream, node.stream, aux).await - } -} - #[derive(Debug, Clone, PartialEq, Eq, Default)] pub struct NeonOptions(Vec<(SmolStr, SmolStr)>); diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs new file mode 100644 index 0000000000..1ad8da20d7 --- /dev/null +++ b/proxy/src/proxy/handshake.rs @@ -0,0 +1,96 @@ +use anyhow::{bail, Context}; +use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams}; +use tokio::io::{AsyncRead, AsyncWrite}; +use tracing::info; + +use crate::{ + cancellation::CancelMap, + config::TlsConfig, + proxy::{ERR_INSECURE_CONNECTION, ERR_PROTO_VIOLATION}, + stream::{PqStream, Stream}, +}; + +/// Establish a (most probably, secure) connection with the client. +/// For better testing experience, `stream` can be any object satisfying the traits. +/// It's easier to work with owned `stream` here as we need to upgrade it to TLS; +/// we also take an extra care of propagating only the select handshake errors to client. +#[tracing::instrument(skip_all)] +pub async fn handshake( + stream: S, + mut tls: Option<&TlsConfig>, + cancel_map: &CancelMap, +) -> anyhow::Result>, StartupMessageParams)>> { + // Client may try upgrading to each protocol only once + let (mut tried_ssl, mut tried_gss) = (false, false); + + let mut stream = PqStream::new(Stream::from_raw(stream)); + loop { + let msg = stream.read_startup_packet().await?; + info!("received {msg:?}"); + + use FeStartupPacket::*; + match msg { + SslRequest => match stream.get_ref() { + Stream::Raw { .. 
} if !tried_ssl => { + tried_ssl = true; + + // We can't perform TLS handshake without a config + let enc = tls.is_some(); + stream.write_message(&Be::EncryptionResponse(enc)).await?; + if let Some(tls) = tls.take() { + // Upgrade raw stream into a secure TLS-backed stream. + // NOTE: We've consumed `tls`; this fact will be used later. + + let (raw, read_buf) = stream.into_inner(); + // TODO: Normally, client doesn't send any data before + // server says TLS handshake is ok and read_buf is empy. + // However, you could imagine pipelining of postgres + // SSLRequest + TLS ClientHello in one hunk similar to + // pipelining in our node js driver. We should probably + // support that by chaining read_buf with the stream. + if !read_buf.is_empty() { + bail!("data is sent before server replied with EncryptionResponse"); + } + let tls_stream = raw.upgrade(tls.to_server_config()).await?; + + let (_, tls_server_end_point) = tls + .cert_resolver + .resolve(tls_stream.get_ref().1.server_name()) + .context("missing certificate")?; + + stream = PqStream::new(Stream::Tls { + tls: Box::new(tls_stream), + tls_server_end_point, + }); + } + } + _ => bail!(ERR_PROTO_VIOLATION), + }, + GssEncRequest => match stream.get_ref() { + Stream::Raw { .. } if !tried_gss => { + tried_gss = true; + + // Currently, we don't support GSSAPI + stream.write_message(&Be::EncryptionResponse(false)).await?; + } + _ => bail!(ERR_PROTO_VIOLATION), + }, + StartupMessage { params, .. } => { + // Check that the config has been consumed during upgrade + // OR we didn't provide it at all (for dev purposes). + if tls.is_some() { + stream.throw_error_str(ERR_INSECURE_CONNECTION).await?; + } + + info!(session_type = "normal", "successful handshake"); + break Ok(Some((stream, params))); + } + CancelRequest(cancel_key_data) => { + cancel_map.cancel_session(cancel_key_data).await?; + + info!(session_type = "cancellation", "successful handshake"); + break Ok(None); + } + } + } +} diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs new file mode 100644 index 0000000000..d6f097d72d --- /dev/null +++ b/proxy/src/proxy/passthrough.rs @@ -0,0 +1,57 @@ +use crate::{ + console::messages::MetricsAuxInfo, + context::RequestMonitoring, + metrics::{NUM_BYTES_PROXIED_COUNTER, NUM_BYTES_PROXIED_PER_CLIENT_COUNTER}, + usage_metrics::{Ids, USAGE_METRICS}, +}; +use tokio::io::{AsyncRead, AsyncWrite}; +use tracing::info; +use utils::measured_stream::MeasuredStream; + +/// Forward bytes in both directions (client <-> compute). +#[tracing::instrument(skip_all)] +pub async fn proxy_pass( + ctx: &mut RequestMonitoring, + client: impl AsyncRead + AsyncWrite + Unpin, + compute: impl AsyncRead + AsyncWrite + Unpin, + aux: MetricsAuxInfo, +) -> anyhow::Result<()> { + ctx.set_success(); + ctx.log(); + + let usage = USAGE_METRICS.register(Ids { + endpoint_id: aux.endpoint_id.clone(), + branch_id: aux.branch_id.clone(), + }); + + let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]); + let m_sent2 = NUM_BYTES_PROXIED_PER_CLIENT_COUNTER.with_label_values(&aux.traffic_labels("tx")); + let mut client = MeasuredStream::new( + client, + |_| {}, + |cnt| { + // Number of bytes we sent to the client (outbound). 
+ m_sent.inc_by(cnt as u64); + m_sent2.inc_by(cnt as u64); + usage.record_egress(cnt as u64); + }, + ); + + let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["rx"]); + let m_recv2 = NUM_BYTES_PROXIED_PER_CLIENT_COUNTER.with_label_values(&aux.traffic_labels("rx")); + let mut compute = MeasuredStream::new( + compute, + |_| {}, + |cnt| { + // Number of bytes the client sent to the compute node (inbound). + m_recv.inc_by(cnt as u64); + m_recv2.inc_by(cnt as u64); + }, + ); + + // Starting from here we only proxy the client's traffic. + info!("performing the proxy pass..."); + let _ = tokio::io::copy_bidirectional(&mut client, &mut compute).await?; + + Ok(()) +} diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index dfef4ccdfa..a2eb7e62cc 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -230,7 +230,7 @@ async fn request_handler( config, &mut ctx, websocket, - &cancel_map, + cancel_map, host, endpoint_rate_limiter, ) diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 1e2ddaa2ff..27c2134221 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -189,7 +189,7 @@ fn get_conn_info( } let endpoint = endpoint_sni(hostname, &tls.common_names)?.context("malformed endpoint")?; - ctx.set_endpoint_id(Some(endpoint.clone())); + ctx.set_endpoint_id(endpoint.clone()); let pairs = connection_url.query_pairs(); diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index a6529c920a..f68b35010a 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -133,7 +133,7 @@ pub async fn serve_websocket( config: &'static ProxyConfig, ctx: &mut RequestMonitoring, websocket: HyperWebsocket, - cancel_map: &CancelMap, + cancel_map: Arc, hostname: Option, endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { From 2ff1a5cecd96503b840f29f4228da0b34409eae8 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Mon, 29 Jan 2024 18:20:57 +0000 Subject: [PATCH 0022/1571] Patch safekeeper control file on HTTP request (#6455) Closes #6397 --- safekeeper/src/http/routes.rs | 26 +++++++- safekeeper/src/lib.rs | 1 + safekeeper/src/patch_control_file.rs | 85 ++++++++++++++++++++++++ safekeeper/src/timeline.rs | 14 ++++ test_runner/fixtures/neon_fixtures.py | 18 +++++ test_runner/regress/test_wal_acceptor.py | 48 +++++++++++++ 6 files changed, 191 insertions(+), 1 deletion(-) create mode 100644 safekeeper/src/patch_control_file.rs diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 919b6b2982..a0c0c7ca4c 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -28,7 +28,7 @@ use crate::safekeeper::Term; use crate::safekeeper::{ServerInfo, TermLsn}; use crate::send_wal::WalSenderState; use crate::timeline::PeerInfo; -use crate::{copy_timeline, debug_dump, pull_timeline}; +use crate::{copy_timeline, debug_dump, patch_control_file, pull_timeline}; use crate::timelines_global_map::TimelineDeleteForceResult; use crate::GlobalTimelines; @@ -465,6 +465,26 @@ async fn dump_debug_handler(mut request: Request) -> Result Ok(response) } +async fn patch_control_file_handler( + mut request: Request, +) -> Result, ApiError> { + check_permission(&request, None)?; + + let ttid = TenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "timeline_id")?, + ); + + let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?; + + let patch_request: 
patch_control_file::Request = json_request(&mut request).await?; + let response = patch_control_file::handle_request(tli, patch_request) + .await + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, response) +} + /// Safekeeper http router. pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder { let mut router = endpoint::make_router(); @@ -526,6 +546,10 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder "/v1/tenant/:tenant_id/timeline/:source_timeline_id/copy", |r| request_span(r, timeline_copy_handler), ) + .patch( + "/v1/tenant/:tenant_id/timeline/:timeline_id/control_file", + |r| request_span(r, patch_control_file_handler), + ) // for tests .post("/v1/record_safekeeper_info/:tenant_id/:timeline_id", |r| { request_span(r, record_safekeeper_info) diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index f18a1ec22d..27b80fcbe8 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -22,6 +22,7 @@ pub mod handler; pub mod http; pub mod json_ctrl; pub mod metrics; +pub mod patch_control_file; pub mod pull_timeline; pub mod receive_wal; pub mod recovery; diff --git a/safekeeper/src/patch_control_file.rs b/safekeeper/src/patch_control_file.rs new file mode 100644 index 0000000000..2136d1b5f7 --- /dev/null +++ b/safekeeper/src/patch_control_file.rs @@ -0,0 +1,85 @@ +use std::sync::Arc; + +use serde::{Deserialize, Serialize}; +use serde_json::Value; +use tracing::info; + +use crate::{state::TimelinePersistentState, timeline::Timeline}; + +#[derive(Deserialize, Debug, Clone)] +pub struct Request { + /// JSON object with fields to update + pub updates: serde_json::Value, + /// List of fields to apply + pub apply_fields: Vec, +} + +#[derive(Serialize)] +pub struct Response { + pub old_control_file: TimelinePersistentState, + pub new_control_file: TimelinePersistentState, +} + +/// Patch control file with given request. Will update the persistent state using +/// fields from the request and persist the new state on disk. 
+pub async fn handle_request(tli: Arc, request: Request) -> anyhow::Result { + let response = tli + .map_control_file(|state| { + let old_control_file = state.clone(); + let new_control_file = state_apply_diff(&old_control_file, &request)?; + + info!( + "patching control file, old: {:?}, new: {:?}, patch: {:?}", + old_control_file, new_control_file, request + ); + *state = new_control_file.clone(); + + Ok(Response { + old_control_file, + new_control_file, + }) + }) + .await?; + + Ok(response) +} + +fn state_apply_diff( + state: &TimelinePersistentState, + request: &Request, +) -> anyhow::Result { + let mut json_value = serde_json::to_value(state)?; + + if let Value::Object(a) = &mut json_value { + if let Value::Object(b) = &request.updates { + json_apply_diff(a, b, &request.apply_fields)?; + } else { + anyhow::bail!("request.updates is not a json object") + } + } else { + anyhow::bail!("TimelinePersistentState is not a json object") + } + + let new_state: TimelinePersistentState = serde_json::from_value(json_value)?; + Ok(new_state) +} + +fn json_apply_diff( + object: &mut serde_json::Map, + updates: &serde_json::Map, + apply_keys: &Vec, +) -> anyhow::Result<()> { + for key in apply_keys { + if let Some(new_value) = updates.get(key) { + if let Some(existing_value) = object.get_mut(key) { + *existing_value = new_value.clone(); + } else { + anyhow::bail!("key not found in original object: {}", key); + } + } else { + anyhow::bail!("key not found in request.updates: {}", key); + } + } + + Ok(()) +} diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index ec7dd7d89b..730a80a583 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -901,6 +901,20 @@ impl Timeline { file_open, } } + + /// Apply a function to the control file state and persist it. + pub async fn map_control_file( + &self, + f: impl FnOnce(&mut TimelinePersistentState) -> Result, + ) -> Result { + let mut state = self.write_shared_state().await; + let mut persistent_state = state.sk.state.start_change(); + // If f returns error, we abort the change and don't persist anything. + let res = f(&mut persistent_state)?; + // If persisting fails, we abort the change and return error. + state.sk.state.finish_change(&persistent_state).await?; + Ok(res) + } } /// Deletes directory and it's contents. Returns false if directory does not exist. 
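The request body for the new `PATCH /v1/tenant/:tenant_id/timeline/:timeline_id/control_file` endpoint follows directly from the `Request` struct above: a JSON object with `updates` and `apply_fields`, where only the fields listed in `apply_fields` are copied from `updates` into the persisted control file. A rough sketch of that body (values here are illustrative only, mirroring the regression test below):

```
// Illustrative sketch, not part of the patch: builds the JSON body the
// control_file endpoint accepts. `updates` holds candidate values;
// `apply_fields` selects which of them are actually applied.
use serde_json::json;

fn example_patch_control_file_body() -> serde_json::Value {
    json!({
        "updates": { "timeline_start_lsn": "0/1" },
        "apply_fields": ["timeline_start_lsn"]
    })
}
```

The Python fixture and regression test that follow exercise the same request via `SafekeeperHttpClient.patch_control_file`.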
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index bbabfeedf6..804685589f 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3443,6 +3443,24 @@ class SafekeeperHttpClient(requests.Session): assert isinstance(res_json, dict) return res_json + def patch_control_file( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + patch: Dict[str, Any], + ) -> Dict[str, Any]: + res = self.patch( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/control_file", + json={ + "updates": patch, + "apply_fields": list(patch.keys()), + }, + ) + res.raise_for_status() + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + def pull_timeline(self, body: Dict[str, Any]) -> Dict[str, Any]: res = self.post(f"http://localhost:{self.port}/v1/pull_timeline", json=body) res.raise_for_status() diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 2f8e69165e..dab446fcfd 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1946,3 +1946,51 @@ def test_timeline_copy(neon_env_builder: NeonEnvBuilder, insert_rows: int): assert orig_digest == new_digest # TODO: test timelines can start after copy + + +def test_patch_control_file(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 1 + env = neon_env_builder.init_start() + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + endpoint = env.endpoints.create_start("main") + # initialize safekeeper + endpoint.safe_psql("create table t(key int, value text)") + + # update control file + res = ( + env.safekeepers[0] + .http_client() + .patch_control_file( + tenant_id, + timeline_id, + { + "timeline_start_lsn": "0/1", + }, + ) + ) + + timeline_start_lsn_before = res["old_control_file"]["timeline_start_lsn"] + timeline_start_lsn_after = res["new_control_file"]["timeline_start_lsn"] + + log.info(f"patch_control_file response: {res}") + log.info( + f"updated control file timeline_start_lsn, before {timeline_start_lsn_before}, after {timeline_start_lsn_after}" + ) + + assert timeline_start_lsn_after == "0/1" + env.safekeepers[0].stop().start() + + # wait/check that safekeeper is alive + endpoint.safe_psql("insert into t values (1, 'payload')") + + # check that timeline_start_lsn is updated + res = ( + env.safekeepers[0] + .http_client() + .debug_dump({"dump_control_file": "true", "timeline_id": str(timeline_id)}) + ) + log.info(f"dump_control_file response: {res}") + assert res["timelines"][0]["control_file"]["timeline_start_lsn"] == "0/1" From 8e4da52069456c68b350bd4dee205aa49c40170c Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 30 Jan 2024 09:29:45 +0000 Subject: [PATCH 0023/1571] Compute: pgvector 0.6.0 (#6517) Update pgvector extension from 0.5.1 to 0.6.0 --- Dockerfile.compute-node | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index d91c7cfd72..d96b9f99c8 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -241,8 +241,8 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz - FROM build-deps AS vector-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.1.tar.gz -O pgvector.tar.gz && \ - echo "cc7a8e034a96e30a819911ac79d32f6bc47bdd1aa2de4d7d4904e26b83209dc8 pgvector.tar.gz" | 
sha256sum --check && \ +RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.6.0.tar.gz -O pgvector.tar.gz && \ + echo "b0cf4ba1ab016335ac8fb1cada0d2106235889a194fffeece217c5bda90b2f19 pgvector.tar.gz" | sha256sum --check && \ mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ From c70bf9150fdb5c25342b060ac445902e123796bb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 30 Jan 2024 10:46:49 +0000 Subject: [PATCH 0024/1571] build(deps): bump aiohttp from 3.9.0 to 3.9.2 (#6518) --- poetry.lock | 157 +++++++++++++++++++++++++------------------------ pyproject.toml | 2 +- 2 files changed, 80 insertions(+), 79 deletions(-) diff --git a/poetry.lock b/poetry.lock index 1644b2b299..2904e2872e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2,87 +2,87 @@ [[package]] name = "aiohttp" -version = "3.9.0" +version = "3.9.2" description = "Async http client/server framework (asyncio)" optional = false python-versions = ">=3.8" files = [ - {file = "aiohttp-3.9.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:6896b8416be9ada4d22cd359d7cb98955576ce863eadad5596b7cdfbf3e17c6c"}, - {file = "aiohttp-3.9.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1736d87dad8ef46a8ec9cddd349fa9f7bd3a064c47dd6469c0d6763d3d49a4fc"}, - {file = "aiohttp-3.9.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8c9e5f4d7208cda1a2bb600e29069eecf857e6980d0ccc922ccf9d1372c16f4b"}, - {file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8488519aa05e636c5997719fe543c8daf19f538f4fa044f3ce94bee608817cff"}, - {file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5ab16c254e2312efeb799bc3c06897f65a133b38b69682bf75d1f1ee1a9c43a9"}, - {file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7a94bde005a8f926d0fa38b88092a03dea4b4875a61fbcd9ac6f4351df1b57cd"}, - {file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b777c9286b6c6a94f50ddb3a6e730deec327e9e2256cb08b5530db0f7d40fd8"}, - {file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:571760ad7736b34d05597a1fd38cbc7d47f7b65deb722cb8e86fd827404d1f6b"}, - {file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:deac0a32aec29608eb25d730f4bc5a261a65b6c48ded1ed861d2a1852577c932"}, - {file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:4ee1b4152bc3190cc40ddd6a14715e3004944263ea208229ab4c297712aa3075"}, - {file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:3607375053df58ed6f23903aa10cf3112b1240e8c799d243bbad0f7be0666986"}, - {file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:65b0a70a25456d329a5e1426702dde67be0fb7a4ead718005ba2ca582d023a94"}, - {file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:5a2eb5311a37fe105aa35f62f75a078537e1a9e4e1d78c86ec9893a3c97d7a30"}, - {file = "aiohttp-3.9.0-cp310-cp310-win32.whl", hash = "sha256:2cbc14a13fb6b42d344e4f27746a4b03a2cb0c1c3c5b932b0d6ad8881aa390e3"}, - {file = "aiohttp-3.9.0-cp310-cp310-win_amd64.whl", hash = "sha256:ac9669990e2016d644ba8ae4758688534aabde8dbbc81f9af129c3f5f01ca9cd"}, 
- {file = "aiohttp-3.9.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:f8e05f5163528962ce1d1806fce763ab893b1c5b7ace0a3538cd81a90622f844"}, - {file = "aiohttp-3.9.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4afa8f71dba3a5a2e1e1282a51cba7341ae76585345c43d8f0e624882b622218"}, - {file = "aiohttp-3.9.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f929f4c9b9a00f3e6cc0587abb95ab9c05681f8b14e0fe1daecfa83ea90f8318"}, - {file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:28185e36a78d247c55e9fbea2332d16aefa14c5276a582ce7a896231c6b1c208"}, - {file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a486ddf57ab98b6d19ad36458b9f09e6022de0381674fe00228ca7b741aacb2f"}, - {file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:70e851f596c00f40a2f00a46126c95c2e04e146015af05a9da3e4867cfc55911"}, - {file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c5b7bf8fe4d39886adc34311a233a2e01bc10eb4e842220235ed1de57541a896"}, - {file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c67a51ea415192c2e53e4e048c78bab82d21955b4281d297f517707dc836bf3d"}, - {file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:694df243f394629bcae2d8ed94c589a181e8ba8604159e6e45e7b22e58291113"}, - {file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:3dd8119752dd30dd7bca7d4bc2a92a59be6a003e4e5c2cf7e248b89751b8f4b7"}, - {file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:eb6dfd52063186ac97b4caa25764cdbcdb4b10d97f5c5f66b0fa95052e744eb7"}, - {file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:d97c3e286d0ac9af6223bc132dc4bad6540b37c8d6c0a15fe1e70fb34f9ec411"}, - {file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:816f4db40555026e4cdda604a1088577c1fb957d02f3f1292e0221353403f192"}, - {file = "aiohttp-3.9.0-cp311-cp311-win32.whl", hash = "sha256:3abf0551874fecf95f93b58f25ef4fc9a250669a2257753f38f8f592db85ddea"}, - {file = "aiohttp-3.9.0-cp311-cp311-win_amd64.whl", hash = "sha256:e18d92c3e9e22553a73e33784fcb0ed484c9874e9a3e96c16a8d6a1e74a0217b"}, - {file = "aiohttp-3.9.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:99ae01fb13a618b9942376df77a1f50c20a281390dad3c56a6ec2942e266220d"}, - {file = "aiohttp-3.9.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:05857848da443c8c12110d99285d499b4e84d59918a21132e45c3f0804876994"}, - {file = "aiohttp-3.9.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:317719d7f824eba55857fe0729363af58e27c066c731bc62cd97bc9c3d9c7ea4"}, - {file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1e3b3c107ccb0e537f309f719994a55621acd2c8fdf6d5ce5152aed788fb940"}, - {file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:45820ddbb276113ead8d4907a7802adb77548087ff5465d5c554f9aa3928ae7d"}, - {file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:05a183f1978802588711aed0dea31e697d760ce9055292db9dc1604daa9a8ded"}, - {file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:51a4cd44788ea0b5e6bb8fa704597af3a30be75503a7ed1098bc5b8ffdf6c982"}, - {file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", 
hash = "sha256:673343fbc0c1ac44d0d2640addc56e97a052504beacd7ade0dc5e76d3a4c16e8"}, - {file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7e8a3b79b6d186a9c99761fd4a5e8dd575a48d96021f220ac5b5fa856e5dd029"}, - {file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:6777a390e41e78e7c45dab43a4a0196c55c3b8c30eebe017b152939372a83253"}, - {file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:7ae5f99a32c53731c93ac3075abd3e1e5cfbe72fc3eaac4c27c9dd64ba3b19fe"}, - {file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:f1e4f254e9c35d8965d377e065c4a8a55d396fe87c8e7e8429bcfdeeb229bfb3"}, - {file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:11ca808f9a6b63485059f5f6e164ef7ec826483c1212a44f268b3653c91237d8"}, - {file = "aiohttp-3.9.0-cp312-cp312-win32.whl", hash = "sha256:de3cc86f4ea8b4c34a6e43a7306c40c1275e52bfa9748d869c6b7d54aa6dad80"}, - {file = "aiohttp-3.9.0-cp312-cp312-win_amd64.whl", hash = "sha256:ca4fddf84ac7d8a7d0866664936f93318ff01ee33e32381a115b19fb5a4d1202"}, - {file = "aiohttp-3.9.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:f09960b5bb1017d16c0f9e9f7fc42160a5a49fa1e87a175fd4a2b1a1833ea0af"}, - {file = "aiohttp-3.9.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8303531e2c17b1a494ffaeba48f2da655fe932c4e9a2626c8718403c83e5dd2b"}, - {file = "aiohttp-3.9.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4790e44f46a4aa07b64504089def5744d3b6780468c4ec3a1a36eb7f2cae9814"}, - {file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1d7edf74a36de0e5ca50787e83a77cf352f5504eb0ffa3f07000a911ba353fb"}, - {file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:94697c7293199c2a2551e3e3e18438b4cba293e79c6bc2319f5fd652fccb7456"}, - {file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a1b66dbb8a7d5f50e9e2ea3804b01e766308331d0cac76eb30c563ac89c95985"}, - {file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9623cfd9e85b76b83ef88519d98326d4731f8d71869867e47a0b979ffec61c73"}, - {file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f32c86dc967ab8c719fd229ce71917caad13cc1e8356ee997bf02c5b368799bf"}, - {file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:f50b4663c3e0262c3a361faf440761fbef60ccdde5fe8545689a4b3a3c149fb4"}, - {file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:dcf71c55ec853826cd70eadb2b6ac62ec577416442ca1e0a97ad875a1b3a0305"}, - {file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:42fe4fd9f0dfcc7be4248c162d8056f1d51a04c60e53366b0098d1267c4c9da8"}, - {file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:76a86a9989ebf82ee61e06e2bab408aec4ea367dc6da35145c3352b60a112d11"}, - {file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:f9e09a1c83521d770d170b3801eea19b89f41ccaa61d53026ed111cb6f088887"}, - {file = "aiohttp-3.9.0-cp38-cp38-win32.whl", hash = "sha256:a00ce44c21612d185c5275c5cba4bab8d7c1590f248638b667ed8a782fa8cd6f"}, - {file = "aiohttp-3.9.0-cp38-cp38-win_amd64.whl", hash = "sha256:d5b9345ab92ebe6003ae11d8092ce822a0242146e6fa270889b9ba965457ca40"}, - {file = "aiohttp-3.9.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:98d21092bf2637c5fa724a428a69e8f5955f2182bff61f8036827cf6ce1157bf"}, - {file = 
"aiohttp-3.9.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:35a68cd63ca6aaef5707888f17a70c36efe62b099a4e853d33dc2e9872125be8"}, - {file = "aiohttp-3.9.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3d7f6235c7475658acfc1769d968e07ab585c79f6ca438ddfecaa9a08006aee2"}, - {file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:db04d1de548f7a62d1dd7e7cdf7c22893ee168e22701895067a28a8ed51b3735"}, - {file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:536b01513d67d10baf6f71c72decdf492fb7433c5f2f133e9a9087379d4b6f31"}, - {file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:87c8b0a6487e8109427ccf638580865b54e2e3db4a6e0e11c02639231b41fc0f"}, - {file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7276fe0017664414fdc3618fca411630405f1aaf0cc3be69def650eb50441787"}, - {file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:23170247ef89ffa842a02bbfdc425028574d9e010611659abeb24d890bc53bb8"}, - {file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b1a2ea8252cacc7fd51df5a56d7a2bb1986ed39be9397b51a08015727dfb69bd"}, - {file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:2d71abc15ff7047412ef26bf812dfc8d0d1020d664617f4913df2df469f26b76"}, - {file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:2d820162c8c2bdbe97d328cd4f417c955ca370027dce593345e437b2e9ffdc4d"}, - {file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:2779f5e7c70f7b421915fd47db332c81de365678180a9f3ab404088f87ba5ff9"}, - {file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:366bc870d7ac61726f32a489fbe3d1d8876e87506870be66b01aeb84389e967e"}, - {file = "aiohttp-3.9.0-cp39-cp39-win32.whl", hash = "sha256:1df43596b826022b14998f0460926ce261544fedefe0d2f653e1b20f49e96454"}, - {file = "aiohttp-3.9.0-cp39-cp39-win_amd64.whl", hash = "sha256:9c196b30f1b1aa3363a69dd69079ae9bec96c2965c4707eaa6914ba099fb7d4f"}, - {file = "aiohttp-3.9.0.tar.gz", hash = "sha256:09f23292d29135025e19e8ff4f0a68df078fe4ee013bca0105b2e803989de92d"}, + {file = "aiohttp-3.9.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:772fbe371788e61c58d6d3d904268e48a594ba866804d08c995ad71b144f94cb"}, + {file = "aiohttp-3.9.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:edd4f1af2253f227ae311ab3d403d0c506c9b4410c7fc8d9573dec6d9740369f"}, + {file = "aiohttp-3.9.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cfee9287778399fdef6f8a11c9e425e1cb13cc9920fd3a3df8f122500978292b"}, + {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3cc158466f6a980a6095ee55174d1de5730ad7dec251be655d9a6a9dd7ea1ff9"}, + {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:54ec82f45d57c9a65a1ead3953b51c704f9587440e6682f689da97f3e8defa35"}, + {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:abeb813a18eb387f0d835ef51f88568540ad0325807a77a6e501fed4610f864e"}, + {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc91d07280d7d169f3a0f9179d8babd0ee05c79d4d891447629ff0d7d8089ec2"}, + {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:b65e861f4bebfb660f7f0f40fa3eb9f2ab9af10647d05dac824390e7af8f75b7"}, + {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:04fd8ffd2be73d42bcf55fd78cde7958eeee6d4d8f73c3846b7cba491ecdb570"}, + {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:3d8d962b439a859b3ded9a1e111a4615357b01620a546bc601f25b0211f2da81"}, + {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:8ceb658afd12b27552597cf9a65d9807d58aef45adbb58616cdd5ad4c258c39e"}, + {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:0e4ee4df741670560b1bc393672035418bf9063718fee05e1796bf867e995fad"}, + {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:2dec87a556f300d3211decf018bfd263424f0690fcca00de94a837949fbcea02"}, + {file = "aiohttp-3.9.2-cp310-cp310-win32.whl", hash = "sha256:3e1a800f988ce7c4917f34096f81585a73dbf65b5c39618b37926b1238cf9bc4"}, + {file = "aiohttp-3.9.2-cp310-cp310-win_amd64.whl", hash = "sha256:ea510718a41b95c236c992b89fdfc3d04cc7ca60281f93aaada497c2b4e05c46"}, + {file = "aiohttp-3.9.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6aaa6f99256dd1b5756a50891a20f0d252bd7bdb0854c5d440edab4495c9f973"}, + {file = "aiohttp-3.9.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a27d8c70ad87bcfce2e97488652075a9bdd5b70093f50b10ae051dfe5e6baf37"}, + {file = "aiohttp-3.9.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:54287bcb74d21715ac8382e9de146d9442b5f133d9babb7e5d9e453faadd005e"}, + {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5bb3d05569aa83011fcb346b5266e00b04180105fcacc63743fc2e4a1862a891"}, + {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c8534e7d69bb8e8d134fe2be9890d1b863518582f30c9874ed7ed12e48abe3c4"}, + {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4bd9d5b989d57b41e4ff56ab250c5ddf259f32db17159cce630fd543376bd96b"}, + {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa6904088e6642609981f919ba775838ebf7df7fe64998b1a954fb411ffb4663"}, + {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bda42eb410be91b349fb4ee3a23a30ee301c391e503996a638d05659d76ea4c2"}, + {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:193cc1ccd69d819562cc7f345c815a6fc51d223b2ef22f23c1a0f67a88de9a72"}, + {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:b9f1cb839b621f84a5b006848e336cf1496688059d2408e617af33e3470ba204"}, + {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:d22a0931848b8c7a023c695fa2057c6aaac19085f257d48baa24455e67df97ec"}, + {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:4112d8ba61fbd0abd5d43a9cb312214565b446d926e282a6d7da3f5a5aa71d36"}, + {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c4ad4241b52bb2eb7a4d2bde060d31c2b255b8c6597dd8deac2f039168d14fd7"}, + {file = "aiohttp-3.9.2-cp311-cp311-win32.whl", hash = "sha256:ee2661a3f5b529f4fc8a8ffee9f736ae054adfb353a0d2f78218be90617194b3"}, + {file = "aiohttp-3.9.2-cp311-cp311-win_amd64.whl", hash = "sha256:4deae2c165a5db1ed97df2868ef31ca3cc999988812e82386d22937d9d6fed52"}, + {file = "aiohttp-3.9.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:6f4cdba12539215aaecf3c310ce9d067b0081a0795dd8a8805fdb67a65c0572a"}, + {file = 
"aiohttp-3.9.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:84e843b33d5460a5c501c05539809ff3aee07436296ff9fbc4d327e32aa3a326"}, + {file = "aiohttp-3.9.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8008d0f451d66140a5aa1c17e3eedc9d56e14207568cd42072c9d6b92bf19b52"}, + {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:61c47ab8ef629793c086378b1df93d18438612d3ed60dca76c3422f4fbafa792"}, + {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bc71f748e12284312f140eaa6599a520389273174b42c345d13c7e07792f4f57"}, + {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a1c3a4d0ab2f75f22ec80bca62385db2e8810ee12efa8c9e92efea45c1849133"}, + {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9a87aa0b13bbee025faa59fa58861303c2b064b9855d4c0e45ec70182bbeba1b"}, + {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e2cc0d04688b9f4a7854c56c18aa7af9e5b0a87a28f934e2e596ba7e14783192"}, + {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:1956e3ac376b1711c1533266dec4efd485f821d84c13ce1217d53e42c9e65f08"}, + {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:114da29f39eccd71b93a0fcacff178749a5c3559009b4a4498c2c173a6d74dff"}, + {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:3f17999ae3927d8a9a823a1283b201344a0627272f92d4f3e3a4efe276972fe8"}, + {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:f31df6a32217a34ae2f813b152a6f348154f948c83213b690e59d9e84020925c"}, + {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:7a75307ffe31329928a8d47eae0692192327c599113d41b278d4c12b54e1bd11"}, + {file = "aiohttp-3.9.2-cp312-cp312-win32.whl", hash = "sha256:972b63d589ff8f305463593050a31b5ce91638918da38139b9d8deaba9e0fed7"}, + {file = "aiohttp-3.9.2-cp312-cp312-win_amd64.whl", hash = "sha256:200dc0246f0cb5405c80d18ac905c8350179c063ea1587580e3335bfc243ba6a"}, + {file = "aiohttp-3.9.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:158564d0d1020e0d3fe919a81d97aadad35171e13e7b425b244ad4337fc6793a"}, + {file = "aiohttp-3.9.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:da1346cd0ccb395f0ed16b113ebb626fa43b7b07fd7344fce33e7a4f04a8897a"}, + {file = "aiohttp-3.9.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:eaa9256de26ea0334ffa25f1913ae15a51e35c529a1ed9af8e6286dd44312554"}, + {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1543e7fb00214fb4ccead42e6a7d86f3bb7c34751ec7c605cca7388e525fd0b4"}, + {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:186e94570433a004e05f31f632726ae0f2c9dee4762a9ce915769ce9c0a23d89"}, + {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d52d20832ac1560f4510d68e7ba8befbc801a2b77df12bd0cd2bcf3b049e52a4"}, + {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c45e4e815ac6af3b72ca2bde9b608d2571737bb1e2d42299fc1ffdf60f6f9a1"}, + {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aa906b9bdfd4a7972dd0628dbbd6413d2062df5b431194486a78f0d2ae87bd55"}, + {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = 
"sha256:68bbee9e17d66f17bb0010aa15a22c6eb28583edcc8b3212e2b8e3f77f3ebe2a"}, + {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:4c189b64bd6d9a403a1a3f86a3ab3acbc3dc41a68f73a268a4f683f89a4dec1f"}, + {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:8a7876f794523123bca6d44bfecd89c9fec9ec897a25f3dd202ee7fc5c6525b7"}, + {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:d23fba734e3dd7b1d679b9473129cd52e4ec0e65a4512b488981a56420e708db"}, + {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b141753be581fab842a25cb319f79536d19c2a51995d7d8b29ee290169868eab"}, + {file = "aiohttp-3.9.2-cp38-cp38-win32.whl", hash = "sha256:103daf41ff3b53ba6fa09ad410793e2e76c9d0269151812e5aba4b9dd674a7e8"}, + {file = "aiohttp-3.9.2-cp38-cp38-win_amd64.whl", hash = "sha256:328918a6c2835861ff7afa8c6d2c70c35fdaf996205d5932351bdd952f33fa2f"}, + {file = "aiohttp-3.9.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5264d7327c9464786f74e4ec9342afbbb6ee70dfbb2ec9e3dfce7a54c8043aa3"}, + {file = "aiohttp-3.9.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:07205ae0015e05c78b3288c1517afa000823a678a41594b3fdc870878d645305"}, + {file = "aiohttp-3.9.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ae0a1e638cffc3ec4d4784b8b4fd1cf28968febc4bd2718ffa25b99b96a741bd"}, + {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d43302a30ba1166325974858e6ef31727a23bdd12db40e725bec0f759abce505"}, + {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:16a967685907003765855999af11a79b24e70b34dc710f77a38d21cd9fc4f5fe"}, + {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6fa3ee92cd441d5c2d07ca88d7a9cef50f7ec975f0117cd0c62018022a184308"}, + {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b500c5ad9c07639d48615a770f49618130e61be36608fc9bc2d9bae31732b8f"}, + {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c07327b368745b1ce2393ae9e1aafed7073d9199e1dcba14e035cc646c7941bf"}, + {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:cc7d6502c23a0ec109687bf31909b3fb7b196faf198f8cff68c81b49eb316ea9"}, + {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:07be2be7071723c3509ab5c08108d3a74f2181d4964e869f2504aaab68f8d3e8"}, + {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:122468f6fee5fcbe67cb07014a08c195b3d4c41ff71e7b5160a7bcc41d585a5f"}, + {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:00a9abcea793c81e7f8778ca195a1714a64f6d7436c4c0bb168ad2a212627000"}, + {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:7a9825fdd64ecac5c670234d80bb52bdcaa4139d1f839165f548208b3779c6c6"}, + {file = "aiohttp-3.9.2-cp39-cp39-win32.whl", hash = "sha256:5422cd9a4a00f24c7244e1b15aa9b87935c85fb6a00c8ac9b2527b38627a9211"}, + {file = "aiohttp-3.9.2-cp39-cp39-win_amd64.whl", hash = "sha256:7d579dcd5d82a86a46f725458418458fa43686f6a7b252f2966d359033ffc8ab"}, + {file = "aiohttp-3.9.2.tar.gz", hash = "sha256:b0ad0a5e86ce73f5368a164c10ada10504bf91869c05ab75d982c6048217fbf7"}, ] [package.dependencies] @@ -2043,6 +2043,7 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = 
"PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -2668,4 +2669,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "9cf2734cafd5b6963165d398f1b24621193d5284d0bc7cc26a720a014f523860" +content-hash = "e99954cbbfef8dcc5e13cea7103c87657639a192f2372983bdb8c5d624c2e447" diff --git a/pyproject.toml b/pyproject.toml index 24e075b489..8ddaf0cdfb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,7 @@ psutil = "^5.9.4" types-psutil = "^5.9.5.12" types-toml = "^0.10.8.6" pytest-httpserver = "^1.0.8" -aiohttp = "3.9.0" +aiohttp = "3.9.2" pytest-rerunfailures = "^13.0" types-pytest-lazy-fixture = "^0.6.3.3" pytest-split = "^0.8.1" From e3cb715e8ab43f1bea1df53d38dceecc90697132 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 30 Jan 2024 15:07:58 +0200 Subject: [PATCH 0025/1571] fix: capture initdb stderr, discard others (#6524) When using spawn + wait_with_output instead of std::process::Command::output or tokio::process::Command::output we must configure the redirection. Fixes: #6523 by discarding the stdout completely, we only care about stderr if any. --- pageserver/src/tenant.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 7bb5881aab..7a9fef43d2 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3778,6 +3778,11 @@ async fn run_initdb( .env_clear() .env("LD_LIBRARY_PATH", &initdb_lib_dir) .env("DYLD_LIBRARY_PATH", &initdb_lib_dir) + .stdin(std::process::Stdio::null()) + // stdout invocation produces the same output every time, we don't need it + .stdout(std::process::Stdio::null()) + // we would be interested in the stderr output, if there was any + .stderr(std::process::Stdio::piped()) .spawn()?; // Ideally we'd select here with the cancellation token, but the problem is that From 79137a089f81c8a844bd1ae80b99f1908f4b3cf9 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 30 Jan 2024 14:10:48 +0100 Subject: [PATCH 0026/1571] fix(#6366): pageserver: incorrect log level for Tenant not found during basebackup (#6400) Before this patch, when requesting basebackup for a not-found tenant or timeline, we'd emit an ERROR-level log entry with a huge stack trace. See #6366 "Details" section for an example With this patch, we log at INFO level and only a single line. 
Example: ``` 2024-01-19T14:16:11.479800Z INFO page_service_conn_main{peer_addr=127.0.0.1:43448}: query handler for 'basebackup d69a536d529a68fcf85bc070030cdf4b 035484e9c28d8d0138a492caadd03ffd 0/2204340 --gzip' entity not found: Tenant d69a536d529a68fcf85bc070030cdf4b not found 2024-01-19T14:19:35.807819Z INFO page_service_conn_main{peer_addr=127.0.0.1:48862}: query handler for 'basebackup d69a536d529a68fcf85bc070030cdf4a 035484e9c28d8d0138a492caadd03ffd 0/2204340 --gzip' entity not found: Timeline d69a536d529a68fcf85bc070030cdf4a/035484e9c28d8d0138a492caadd03ffd was not found ``` fixes https://github.com/neondatabase/neon/issues/6366 Changes ------- - Change `handle_basebackup_request` to return a `QueryError` - The new `impl From for QueryError` is needed so the `?` at `wait_lsn()` call in `handle_basebackup_request` works again. It's duplicating `impl From for PageStreamError`. - Remove hard-to-spot conversion of `handle_basebackup_request` return value to anyhow::Result (the place where I replaced `anyhow::Ok` with `Result::<(), QueryError>::Ok(())` - Add forgotten distinguished handling for "Tenant not found" case in `impl From for QueryError` This was not at all pleasant, and I find it very hard to follow the various error conversions. It took me a while to spot the hard-to-spot `anyhow::Ok` thing above. It would have been caught by the compiler if we weren't auto-converting `anyhow::Error` into `QueryError::Other`. We should move away from that, in my opinion, instead forcing each `.context()` site to become `.context().map_err(QueryError::Other)`. But that's for a future PR. --- pageserver/src/page_service.rs | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index a8a3487b4e..65191334a6 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -368,6 +368,16 @@ impl From for PageStreamError { } } +impl From for QueryError { + fn from(value: WaitLsnError) -> Self { + match value { + e @ WaitLsnError::Timeout(_) => Self::Other(anyhow::Error::new(e)), + WaitLsnError::Shutdown => Self::Shutdown, + WaitLsnError::BadState => Self::Reconnect, + } + } +} + impl PageServerHandler { pub fn new( conf: &'static PageServerConf, @@ -1139,7 +1149,7 @@ impl PageServerHandler { full_backup: bool, gzip: bool, ctx: RequestContext, - ) -> anyhow::Result<()> + ) -> Result<(), QueryError> where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { @@ -1404,7 +1414,7 @@ where ) .await?; pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; - anyhow::Ok(()) + Result::<(), QueryError>::Ok(()) }, ) .await?; @@ -1678,6 +1688,7 @@ impl From for QueryError { | GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => { QueryError::Shutdown } + e @ GetActiveTenantError::NotFound(_) => QueryError::NotFound(format!("{e}").into()), e => QueryError::Other(anyhow::anyhow!(e)), } } From 08532231ee39a45b8d2254c24e1c409a2c6950a4 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 24 Jan 2024 10:39:53 +0300 Subject: [PATCH 0027/1571] Fix find_end_of_wal busy loop. It hanged if file size is less than of a normal segment. Normally that doesn't happen, but it might in case of crash during segment init. We're going to fix that half initialized segment by durably renaming it after cooking, so this fix won't be needed, but better avoid busy loop anyway. 
fixes https://github.com/neondatabase/neon/issues/6401 --- libs/postgres_ffi/src/xlog_utils.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 56ce9c901e..a863fad269 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -207,10 +207,16 @@ pub fn find_end_of_wal( let seg_offs = curr_lsn.segment_offset(wal_seg_size); segment.seek(SeekFrom::Start(seg_offs as u64))?; // loop inside segment - loop { + while curr_lsn.segment_number(wal_seg_size) == segno { let bytes_read = segment.read(&mut buf)?; if bytes_read == 0 { - break; // EOF + debug!( + "find_end_of_wal reached end at {:?}, EOF in segment {:?} at offset {}", + result, + seg_file_path, + curr_lsn.segment_offset(wal_seg_size) + ); + return Ok(result); } curr_lsn += bytes_read as u64; decoder.feed_bytes(&buf[0..bytes_read]); From bc684e9d3bcc9285a8ad6d47651fb90bcb47886c Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 26 Jan 2024 16:51:41 +0300 Subject: [PATCH 0028/1571] Make WAL segment init atomic. Since fdatasync is used for flushing WAL, changing file size is unsafe. Make segment creation atomic by using tmp file + rename to avoid using partially initialized segments. fixes https://github.com/neondatabase/neon/issues/6402 --- libs/utils/src/crashsafe.rs | 49 +++++++++++++++++++ safekeeper/src/control_file.rs | 33 ++----------- safekeeper/src/wal_storage.rs | 39 ++++++++++----- .../regress/test_wal_acceptor_async.py | 36 ++++++++++++++ 4 files changed, 116 insertions(+), 41 deletions(-) diff --git a/libs/utils/src/crashsafe.rs b/libs/utils/src/crashsafe.rs index b089af4a02..1c72e9cae9 100644 --- a/libs/utils/src/crashsafe.rs +++ b/libs/utils/src/crashsafe.rs @@ -112,6 +112,55 @@ pub async fn fsync_async(path: impl AsRef) -> Result<(), std::io::Erro tokio::fs::File::open(path.as_ref()).await?.sync_all().await } +pub async fn fsync_async_opt( + path: impl AsRef, + do_fsync: bool, +) -> Result<(), std::io::Error> { + if do_fsync { + fsync_async(path.as_ref()).await?; + } + Ok(()) +} + +/// Like postgres' durable_rename, renames file issuing fsyncs do make it +/// durable. After return, file and rename are guaranteed to be persisted. +/// +/// Unlike postgres, it only does fsyncs to 1) file to be renamed to make +/// contents durable; 2) its directory entry to make rename durable 3) again to +/// already renamed file, which is not required by standards but postgres does +/// it, let's stick to that. Postgres additionally fsyncs newpath *before* +/// rename if it exists to ensure that at least one of the files survives, but +/// current callers don't need that. +/// +/// virtual_file.rs has similar code, but it doesn't use vfs. +/// +/// Useful links: +/// +/// +pub async fn durable_rename( + old_path: impl AsRef, + new_path: impl AsRef, + do_fsync: bool, +) -> io::Result<()> { + // first fsync the file + fsync_async_opt(old_path.as_ref(), do_fsync).await?; + + // Time to do the real deal. + tokio::fs::rename(old_path.as_ref(), new_path.as_ref()).await?; + + // Postgres'ish fsync of renamed file. 
+ fsync_async_opt(new_path.as_ref(), do_fsync).await?; + + // Now fsync the parent + let parent = match new_path.as_ref().parent() { + Some(p) => p, + None => Utf8Path::new("./"), // assume current dir if there is no parent + }; + fsync_async_opt(parent, do_fsync).await?; + + Ok(()) +} + #[cfg(test)] mod tests { diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index f1daddd7c3..c39c1dbf28 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -3,8 +3,9 @@ use anyhow::{bail, ensure, Context, Result}; use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; use camino::Utf8PathBuf; -use tokio::fs::{self, File}; +use tokio::fs::File; use tokio::io::AsyncWriteExt; +use utils::crashsafe::durable_rename; use std::io::Read; use std::ops::Deref; @@ -203,35 +204,8 @@ impl Storage for FileStorage { ) })?; - // fsync the file - if !self.conf.no_sync { - control_partial.sync_all().await.with_context(|| { - format!( - "failed to sync partial control file at {}", - control_partial_path - ) - })?; - } - let control_path = self.timeline_dir.join(CONTROL_FILE_NAME); - - // rename should be atomic - fs::rename(&control_partial_path, &control_path).await?; - // this sync is not required by any standard but postgres does this (see durable_rename) - if !self.conf.no_sync { - let new_f = File::open(&control_path).await?; - new_f - .sync_all() - .await - .with_context(|| format!("failed to sync control file at: {}", &control_path))?; - - // fsync the directory (linux specific) - let tli_dir = File::open(&self.timeline_dir).await?; - tli_dir - .sync_all() - .await - .context("failed to sync control file directory")?; - } + durable_rename(&control_partial_path, &control_path, !self.conf.no_sync).await?; // update internal state self.state = s.clone(); @@ -249,6 +223,7 @@ mod test { use super::*; use crate::SafeKeeperConf; use anyhow::Result; + use tokio::fs; use utils::{id::TenantTimelineId, lsn::Lsn}; fn stub_conf() -> SafeKeeperConf { diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index ed6190042a..8bbd95e9e8 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -21,6 +21,7 @@ use tokio::fs::{self, remove_file, File, OpenOptions}; use tokio::io::{AsyncRead, AsyncWriteExt}; use tokio::io::{AsyncReadExt, AsyncSeekExt}; use tracing::*; +use utils::crashsafe::durable_rename; use crate::metrics::{time_io_closure, WalStorageMetrics, REMOVED_WAL_SEGMENTS}; use crate::state::TimelinePersistentState; @@ -196,15 +197,6 @@ impl PhysicalStorage { Ok(()) } - /// Call fsync if config requires so. - async fn fsync_file(&mut self, file: &File) -> Result<()> { - if !self.conf.no_sync { - self.metrics - .observe_flush_seconds(time_io_closure(file.sync_all()).await?); - } - Ok(()) - } - /// Open or create WAL segment file. Caller must call seek to the wanted position. /// Returns `file` and `is_partial`. async fn open_or_create(&mut self, segno: XLogSegNo) -> Result<(File, bool)> { @@ -223,15 +215,33 @@ impl PhysicalStorage { Ok((file, true)) } else { // Create and fill new partial file + // + // We're using fdatasync during WAL writing, so file size must not + // change; to this end it is filled with zeros here. To avoid using + // half initialized segment, first bake it under tmp filename and + // then rename. 
+ let tmp_path = self.timeline_dir.join("waltmp"); let mut file = OpenOptions::new() .create(true) .write(true) - .open(&wal_file_partial_path) + .open(&tmp_path) .await - .with_context(|| format!("Failed to open log file {:?}", &wal_file_path))?; + .with_context(|| format!("Failed to open tmp wal file {:?}", &tmp_path))?; write_zeroes(&mut file, self.wal_seg_size).await?; - self.fsync_file(&file).await?; + + // Note: this doesn't get into observe_flush_seconds metric. But + // segment init should be separate metric, if any. + if let Err(e) = + durable_rename(&tmp_path, &wal_file_partial_path, !self.conf.no_sync).await + { + // Probably rename succeeded, but fsync of it failed. Remove + // the file then to avoid using it. + remove_file(wal_file_partial_path) + .await + .or_else(utils::fs_ext::ignore_not_found)?; + return Err(e.into()); + } Ok((file, true)) } } @@ -718,6 +728,11 @@ const ZERO_BLOCK: &[u8] = &[0u8; XLOG_BLCKSZ]; /// Helper for filling file with zeroes. async fn write_zeroes(file: &mut File, mut count: usize) -> Result<()> { + fail::fail_point!("sk-write-zeroes", |_| { + info!("write_zeroes hit failpoint"); + Err(anyhow::anyhow!("failpoint: sk-write-zeroes")) + }); + while count >= XLOG_BLCKSZ { file.write_all(ZERO_BLOCK).await?; count -= XLOG_BLCKSZ; diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index 77d67cd63a..720633189e 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -515,6 +515,42 @@ def test_recovery_uncommitted(neon_env_builder: NeonEnvBuilder): asyncio.run(run_recovery_uncommitted(env)) +async def run_segment_init_failure(env: NeonEnv): + env.neon_cli.create_branch("test_segment_init_failure") + ep = env.endpoints.create_start("test_segment_init_failure") + ep.safe_psql("create table t(key int, value text)") + ep.safe_psql("insert into t select generate_series(1, 100), 'payload'") + + sk = env.safekeepers[0] + sk_http = sk.http_client() + sk_http.configure_failpoints([("sk-write-zeroes", "return")]) + conn = await ep.connect_async() + ep.safe_psql("select pg_switch_wal()") # jump to the segment boundary + # next insertion should hang until failpoint is disabled. + asyncio.create_task(conn.execute("insert into t select generate_series(1,1), 'payload'")) + sleep_sec = 2 + await asyncio.sleep(sleep_sec) + # also restart ep at segment boundary to make test more interesting + ep.stop() + # it must still be not finished + # assert not bg_query.done() + # Without segment rename during init (#6402) previous statement created + # partially initialized 16MB segment, so sk restart also triggers #6401. + sk.stop().start() + ep = env.endpoints.create_start("test_segment_init_failure") + ep.safe_psql("insert into t select generate_series(1,1), 'payload'") # should be ok now + + +# Test (injected) failure during WAL segment init. 
+# https://github.com/neondatabase/neon/issues/6401 +# https://github.com/neondatabase/neon/issues/6402 +def test_segment_init_failure(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 1 + env = neon_env_builder.init_start() + + asyncio.run(run_segment_init_failure(env)) + + @dataclass class RaceConditionTest: iteration: int From 6928a34f59fc9e7eeb2df4339c5fd323a2d3a492 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 30 Jan 2024 16:57:27 +0100 Subject: [PATCH 0029/1571] S3 DR: Large prefix improvements (#6515) ## Problem PR #6500 has removed the limiting by number of versions/deletions for time travel calls. We never get informed about how many versions there are, and thus the call would just hang without any indication of progress. ## Summary of changes We improve the pageserver's behaviour with large prefixes, i.e. those with many keys, removed or currently still available. * Add a hard limit of 100k versions/deletions. For the reasoning see https://github.com/neondatabase/cloud/issues/8233#issuecomment-1915021625 , but TLDR it will roughly support tenants of 2 TiB size, of course depending on general write activity and duration of the s3 retention window. The goal is to have a limit at all so that the process doesn't accumulate increasing numbers of versions until an eventual crash. * Lower the RAM footprint for the `VerOrDelete` datastructure. This means we now don't cache a lot of redundant metadata in RAM like the owner ID. The top level datastructure's footprint goes down from 264 bytes to 80 (but it contains strings that are not counted in there). Follow-up of #6500, part of https://github.com/neondatabase/cloud/issues/8233 --------- Co-authored-by: Joonas Koivunen --- libs/remote_storage/src/s3_bucket.rs | 141 +++++++++++++++++---------- 1 file changed, 92 insertions(+), 49 deletions(-) diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 83f3015eab..e615a1ce7e 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -687,12 +687,19 @@ impl RemoteStorage for S3Bucket { response.version_id_marker, response.key_marker ); - let versions = response.versions.unwrap_or_default(); - let delete_markers = response.delete_markers.unwrap_or_default(); - let new_versions = versions.into_iter().map(VerOrDelete::Version); - let new_deletes = delete_markers.into_iter().map(VerOrDelete::DeleteMarker); - let new_versions_and_deletes = new_versions.chain(new_deletes); - versions_and_deletes.extend(new_versions_and_deletes); + let versions = response + .versions + .unwrap_or_default() + .into_iter() + .map(VerOrDelete::from_version); + let deletes = response + .delete_markers + .unwrap_or_default() + .into_iter() + .map(VerOrDelete::from_delete_marker); + itertools::process_results(versions.chain(deletes), |n_vds| { + versions_and_deletes.extend(n_vds) + })?; fn none_if_empty(v: Option) -> Option { v.filter(|v| !v.is_empty()) } @@ -707,52 +714,51 @@ impl RemoteStorage for S3Bucket { } break; } + // Limit the number of versions deletions, mostly so that we don't + // keep requesting forever if the list is too long, as we'd put the + // list in RAM. + // Building a list of 100k entries that reaches the limit roughly takes + // 40 seconds, and roughly corresponds to tenants of 2 TiB physical size. 
+ const COMPLEXITY_LIMIT: usize = 100_000; + if versions_and_deletes.len() >= COMPLEXITY_LIMIT { + anyhow::bail!( + "Limit for number of versions/deletions exceeded for prefix={prefix:?}" + ); + } } // Work on the list of references instead of the objects directly, // otherwise we get lifetime errors in the sort_by_key call below. let mut versions_and_deletes = versions_and_deletes.iter().collect::>(); - versions_and_deletes.sort_by_key(|vd| (vd.key(), vd.last_modified())); + versions_and_deletes.sort_by_key(|vd| (&vd.key, &vd.last_modified)); let mut vds_for_key = HashMap::<_, Vec<_>>::new(); for vd in &versions_and_deletes { - let last_modified = vd.last_modified(); - let version_id = vd.version_id(); - let key = vd.key(); - let (Some(last_modified), Some(version_id), Some(key)) = - (last_modified, version_id, key) - else { - anyhow::bail!( - "One (or more) of last_modified, key, and id is None. \ - Is versioning enabled in the bucket? last_modified={:?} key={:?} version_id={:?}", - last_modified, key, version_id, - ); - }; + let VerOrDelete { + version_id, key, .. + } = &vd; if version_id == "null" { anyhow::bail!("Received ListVersions response for key={key} with version_id='null', \ indicating either disabled versioning, or legacy objects with null version id values"); } tracing::trace!( - "Parsing version key={key} version_id={version_id} is_delete={}", - matches!(vd, VerOrDelete::DeleteMarker(_)) + "Parsing version key={key} version_id={version_id} kind={:?}", + vd.kind ); - vds_for_key - .entry(key) - .or_default() - .push((vd, last_modified, version_id)); + vds_for_key.entry(key).or_default().push(vd); } for (key, versions) in vds_for_key { - let (last_vd, last_last_modified, _version_id) = versions.last().unwrap(); - if last_last_modified > &&done_if_after { + let last_vd = versions.last().unwrap(); + if last_vd.last_modified > done_if_after { tracing::trace!("Key {key} has version later than done_if_after, skipping"); continue; } // the version we want to restore to. let version_to_restore_to = - match versions.binary_search_by_key(×tamp, |tpl| *tpl.1) { + match versions.binary_search_by_key(×tamp, |tpl| tpl.last_modified) { Ok(v) => v, Err(e) => e, }; @@ -770,7 +776,11 @@ impl RemoteStorage for S3Bucket { do_delete = true; } else { match &versions[version_to_restore_to - 1] { - (VerOrDelete::Version(_), _last_modified, version_id) => { + VerOrDelete { + kind: VerOrDeleteKind::Version, + version_id, + .. + } => { tracing::trace!("Copying old version {version_id} for {key}..."); // Restore the state to the last version by copying let source_id = @@ -795,13 +805,16 @@ impl RemoteStorage for S3Bucket { ) .await?; } - (VerOrDelete::DeleteMarker(_), _last_modified, _version_id) => { + VerOrDelete { + kind: VerOrDeleteKind::DeleteMarker, + .. 
+ } => { do_delete = true; } } }; if do_delete { - if matches!(last_vd, VerOrDelete::DeleteMarker(_)) { + if matches!(last_vd.kind, VerOrDeleteKind::DeleteMarker) { // Key has since been deleted (but there was some history), no need to do anything tracing::trace!("Key {key} already deleted, skipping."); } else { @@ -838,29 +851,59 @@ fn start_measuring_requests( }) } -enum VerOrDelete { - Version(ObjectVersion), - DeleteMarker(DeleteMarkerEntry), +// Save RAM and only store the needed data instead of the entire ObjectVersion/DeleteMarkerEntry +struct VerOrDelete { + kind: VerOrDeleteKind, + last_modified: DateTime, + version_id: String, + key: String, +} + +#[derive(Debug)] +enum VerOrDeleteKind { + Version, + DeleteMarker, } impl VerOrDelete { - fn last_modified(&self) -> Option<&DateTime> { - match self { - VerOrDelete::Version(v) => v.last_modified(), - VerOrDelete::DeleteMarker(v) => v.last_modified(), - } + fn with_kind( + kind: VerOrDeleteKind, + last_modified: Option, + version_id: Option, + key: Option, + ) -> anyhow::Result { + let lvk = (last_modified, version_id, key); + let (Some(last_modified), Some(version_id), Some(key)) = lvk else { + anyhow::bail!( + "One (or more) of last_modified, key, and id is None. \ + Is versioning enabled in the bucket? last_modified={:?}, version_id={:?}, key={:?}", + lvk.0, + lvk.1, + lvk.2, + ); + }; + Ok(Self { + kind, + last_modified, + version_id, + key, + }) } - fn version_id(&self) -> Option<&str> { - match self { - VerOrDelete::Version(v) => v.version_id(), - VerOrDelete::DeleteMarker(v) => v.version_id(), - } + fn from_version(v: ObjectVersion) -> anyhow::Result { + Self::with_kind( + VerOrDeleteKind::Version, + v.last_modified, + v.version_id, + v.key, + ) } - fn key(&self) -> Option<&str> { - match self { - VerOrDelete::Version(v) => v.key(), - VerOrDelete::DeleteMarker(v) => v.key(), - } + fn from_delete_marker(v: DeleteMarkerEntry) -> anyhow::Result { + Self::with_kind( + VerOrDeleteKind::DeleteMarker, + v.last_modified, + v.version_id, + v.key, + ) } } From 3c3ee8f3e88075b2008c725d204424cb2f542d6b Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 30 Jan 2024 17:33:24 +0000 Subject: [PATCH 0030/1571] Compute: add compatibility patch for pgvector (#6527) ## Problem `pgvector` requires a patch to work well with Neon (a patch created by @hlinnaka) ## Summary of changes - Apply the patch to `pgvector` --- .dockerignore | 23 +++++++++-------- Dockerfile.compute-node | 3 +++ patches/pgvector.patch | 56 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+), 11 deletions(-) create mode 100644 patches/pgvector.patch diff --git a/.dockerignore b/.dockerignore index ae0ad8fd77..29abdc37aa 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,27 +1,28 @@ * -!rust-toolchain.toml -!Cargo.toml +# Files !Cargo.lock +!Cargo.toml !Makefile +!rust-toolchain.toml +!scripts/combine_control_files.py +!scripts/ninstall.sh +!vm-cgconfig.conf +# Directories !.cargo/ !.config/ -!control_plane/ !compute_tools/ +!control_plane/ !libs/ +!neon_local/ !pageserver/ +!patches/ !pgxn/ !proxy/ -!safekeeper/ !s3_scrubber/ +!safekeeper/ !storage_broker/ !trace/ -!vendor/postgres-v14/ -!vendor/postgres-v15/ -!vendor/postgres-v16/ +!vendor/postgres-*/ !workspace_hack/ -!neon_local/ -!scripts/ninstall.sh -!scripts/combine_control_files.py -!vm-cgconfig.conf diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index d96b9f99c8..b13225172d 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -241,9 +241,12 @@ RUN wget 
https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz - FROM build-deps AS vector-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY patches/pgvector.patch /pgvector.patch + RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.6.0.tar.gz -O pgvector.tar.gz && \ echo "b0cf4ba1ab016335ac8fb1cada0d2106235889a194fffeece217c5bda90b2f19 pgvector.tar.gz" | sha256sum --check && \ mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \ + patch -p1 < /pgvector.patch && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control diff --git a/patches/pgvector.patch b/patches/pgvector.patch new file mode 100644 index 0000000000..c429f272fc --- /dev/null +++ b/patches/pgvector.patch @@ -0,0 +1,56 @@ +From 5518a806a70e7f40d5054a762ccda7d5e6b0d31c Mon Sep 17 00:00:00 2001 +From: Heikki Linnakangas +Date: Tue, 30 Jan 2024 14:33:00 +0200 +Subject: [PATCH] Make v0.6.0 work with Neon + +Now that the WAL-logging happens as a separate step at the end of the +build, we need a few neon-specific hints to make it work. +--- + src/hnswbuild.c | 28 ++++++++++++++++++++++++++++ + 1 file changed, 28 insertions(+) + +diff --git a/src/hnswbuild.c b/src/hnswbuild.c +index 680789ba9044900eac9321844ee2a808a4a2ed12..41c5b709bcb2367ac8b8c498788ecac4c1148b74 100644 +--- a/src/hnswbuild.c ++++ b/src/hnswbuild.c +@@ -1089,13 +1089,41 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, + SeedRandom(42); + #endif + ++#ifdef NEON_SMGR ++ smgr_start_unlogged_build(index->rd_smgr); ++#endif ++ + InitBuildState(buildstate, heap, index, indexInfo, forkNum); + + BuildGraph(buildstate, forkNum); + ++#ifdef NEON_SMGR ++ smgr_finish_unlogged_build_phase_1(index->rd_smgr); ++#endif ++ + if (RelationNeedsWAL(index)) ++ { + log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocks(index), true); + ++#ifdef NEON_SMGR ++ { ++#if PG_VERSION_NUM >= 160000 ++ RelFileLocator rlocator = index->rd_smgr->smgr_rlocator.locator; ++#else ++ RelFileNode rlocator = index->rd_smgr->smgr_rnode.node; ++#endif ++ ++ SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator, ++ MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index)); ++ SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM); ++ } ++#endif ++ } ++ ++#ifdef NEON_SMGR ++ smgr_end_unlogged_build(index->rd_smgr); ++#endif ++ + FreeBuildState(buildstate); + } From e8c9a51273636d3af3969f8c6de3a9de1e8c0c2b Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Tue, 30 Jan 2024 22:32:33 -0800 Subject: [PATCH 0031/1571] Allow creating subscriptions as neon_superuser (#6484) ## Problem We currently can't create subscriptions in PG14 and PG15 because only superusers can, and PG16 requires adding roles to pg_create_subscription. ## Summary of changes I added changes to PG14 and PG15 that allow neon_superuser to bypass the superuser requirement. For PG16, I didn't do that but added a migration that adds neon_superuser to pg_create_subscription. Also added a test to make sure it works. 
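For context, a hedged sketch of what this enables (not the committed test; the DSNs below are placeholders): a role that is a member of neon_superuser can now run CREATE SUBSCRIPTION itself.

```python
# Sketch only: the DSNs are placeholders for the publisher and subscriber endpoints.
import psycopg2

sub_conn = psycopg2.connect("host=<subscriber> dbname=neondb user=mr_whiskers password=cat")
sub_conn.autocommit = True  # CREATE SUBSCRIPTION must run outside a transaction block
pub_dsn = "host=<publisher> port=<port> dbname=neondb user=mr_whiskers password=cat"
with sub_conn.cursor() as cur:
    # Before this change, only a real superuser could do this on v14/v15;
    # on v16 the new migration grants pg_create_subscription to neon_superuser.
    cur.execute(f"CREATE SUBSCRIPTION sub CONNECTION '{pub_dsn}' PUBLICATION pub")
sub_conn.close()
```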
--- compute_tools/src/spec.rs | 8 ++++ test_runner/fixtures/neon_fixtures.py | 11 +++++ test_runner/regress/test_migrations.py | 6 +-- test_runner/regress/test_neon_superuser.py | 55 +++++++++++++++++++--- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/revisions.json | 4 +- 7 files changed, 75 insertions(+), 13 deletions(-) diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index e87dc0b732..2b1bff75fe 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -758,6 +758,14 @@ BEGIN END LOOP; END $$; "#, + r#" +DO $$ +BEGIN + IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN + EXECUTE 'GRANT pg_create_subscription TO neon_superuser'; + END IF; +END +$$;"#, ]; let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration"; diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 804685589f..0f79df74ba 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3062,6 +3062,17 @@ class Endpoint(PgProtocol): return self + def edit_hba(self, hba: List[str]): + """Prepend hba lines into pg_hba.conf file.""" + with open(os.path.join(self.pg_data_dir_path(), "pg_hba.conf"), "r+") as conf_file: + data = conf_file.read() + conf_file.seek(0) + conf_file.write("\n".join(hba) + "\n") + conf_file.write(data) + + if self.running: + self.safe_psql("SELECT pg_reload_conf()") + def reconfigure(self, pageserver_id: Optional[int] = None): assert self.endpoint_id is not None self.env.neon_cli.endpoint_reconfigure(self.endpoint_id, self.tenant_id, pageserver_id) diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py index 121fa91f66..dee22f9b48 100644 --- a/test_runner/regress/test_migrations.py +++ b/test_runner/regress/test_migrations.py @@ -18,11 +18,11 @@ def test_migrations(neon_simple_env: NeonEnv): with endpoint.cursor() as cur: cur.execute("SELECT id FROM neon_migration.migration_id") migration_id = cur.fetchall() - assert migration_id[0][0] == 2 + assert migration_id[0][0] == 3 with open(log_path, "r") as log_file: logs = log_file.read() - assert "INFO handle_migrations: Ran 2 migrations" in logs + assert "INFO handle_migrations: Ran 3 migrations" in logs endpoint.stop() endpoint.start() @@ -30,7 +30,7 @@ def test_migrations(neon_simple_env: NeonEnv): with endpoint.cursor() as cur: cur.execute("SELECT id FROM neon_migration.migration_id") migration_id = cur.fetchall() - assert migration_id[0][0] == 2 + assert migration_id[0][0] == 3 with open(log_path, "r") as log_file: logs = log_file.read() diff --git a/test_runner/regress/test_neon_superuser.py b/test_runner/regress/test_neon_superuser.py index 6be7c114cb..8b9eb1d9c4 100644 --- a/test_runner/regress/test_neon_superuser.py +++ b/test_runner/regress/test_neon_superuser.py @@ -1,26 +1,44 @@ import time +from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv from fixtures.pg_version import PgVersion def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion): env = neon_simple_env - env.neon_cli.create_branch("test_neon_superuser", "empty") - endpoint = env.endpoints.create("test_neon_superuser") - endpoint.respec(skip_pg_catalog_updates=False, features=["migrations"]) - endpoint.start() + env.neon_cli.create_branch("test_neon_superuser_publisher", "empty") + pub = env.endpoints.create("test_neon_superuser_publisher") + + env.neon_cli.create_branch("test_neon_superuser_subscriber") + sub = 
env.endpoints.create("test_neon_superuser_subscriber") + + pub.respec(skip_pg_catalog_updates=False, features=["migrations"]) + pub.start() + + sub.respec(skip_pg_catalog_updates=False, features=["migrations"]) + sub.start() time.sleep(1) # Sleep to let migrations run - with endpoint.cursor() as cur: + with pub.cursor() as cur: cur.execute( "CREATE ROLE mr_whiskers WITH PASSWORD 'cat' LOGIN INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser" ) cur.execute("CREATE DATABASE neondb WITH OWNER mr_whiskers") cur.execute("GRANT ALL PRIVILEGES ON DATABASE neondb TO neon_superuser") - with endpoint.cursor(dbname="neondb", user="mr_whiskers", password="cat") as cur: + # If we don't do this, creating the subscription will fail later on PG16 + pub.edit_hba(["host all mr_whiskers 0.0.0.0/0 md5"]) + + with sub.cursor() as cur: + cur.execute( + "CREATE ROLE mr_whiskers WITH PASSWORD 'cat' LOGIN INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser" + ) + cur.execute("CREATE DATABASE neondb WITH OWNER mr_whiskers") + cur.execute("GRANT ALL PRIVILEGES ON DATABASE neondb TO neon_superuser") + + with pub.cursor(dbname="neondb", user="mr_whiskers", password="cat") as cur: cur.execute("SELECT pg_has_role('mr_whiskers', 'neon_superuser', 'member')") assert cur.fetchall()[0][0] cur.execute("SELECT pg_has_role('mr_whiskers', 'neon_superuser', 'usage')") @@ -32,3 +50,28 @@ def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion): cur.execute("CREATE PUBLICATION pub FOR ALL TABLES") cur.execute("CREATE ROLE definitely_not_a_superuser WITH PASSWORD 'nope'") + cur.execute("CREATE DATABASE definitely_a_database") + cur.execute("CREATE TABLE t (a int)") + cur.execute("INSERT INTO t VALUES (10), (20)") + cur.execute("SELECT * from t") + res = cur.fetchall() + assert [r[0] for r in res] == [10, 20] + + with sub.cursor(dbname="neondb", user="mr_whiskers", password="cat") as cur: + cur.execute("CREATE TABLE t (a int)") + + pub_conn = f"host=localhost port={pub.pg_port} dbname=neondb user=mr_whiskers password=cat" + query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_conn}' PUBLICATION pub" + log.info(f"Creating subscription: {query}") + cur.execute(query) + + with pub.cursor(dbname="neondb", user="mr_whiskers", password="cat") as pcur: + pcur.execute("INSERT INTO t VALUES (30), (40)") + + time.sleep(1) # Give the change time to propagate + + cur.execute("SELECT * FROM t") + res = cur.fetchall() + log.info(res) + assert len(res) == 4 + assert [r[0] for r in res] == [10, 20, 30, 40] diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 11e970fe2b..3de48ce3d9 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 11e970fe2be56804f0a786ec5fc8141ffefa4ca7 +Subproject commit 3de48ce3d9c1f4fac1cdc7029487f8db9e537eac diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 731b4d1609..b089a8a02c 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 731b4d1609d6db1c953755810a41e0e67ea3db7b +Subproject commit b089a8a02c9f6f4379883fddb33cf10a3aa0b14f diff --git a/vendor/revisions.json b/vendor/revisions.json index c7b33f8c8a..1211155b7d 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { "postgres-v16": "cf302768b2890569956641e0e5ba112ae1445351", - "postgres-v15": "731b4d1609d6db1c953755810a41e0e67ea3db7b", - "postgres-v14": "11e970fe2be56804f0a786ec5fc8141ffefa4ca7" + "postgres-v15": "b089a8a02c9f6f4379883fddb33cf10a3aa0b14f", + "postgres-v14":
"3de48ce3d9c1f4fac1cdc7029487f8db9e537eac" } From e10a7ee3915c036bafd5dee5b57f7d02eed46b29 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 31 Jan 2024 09:17:32 +0200 Subject: [PATCH 0032/1571] Prevent too frequent reconnects in case of race condition errors returned by PS (tenant not found) (#6522) ## Problem See https://neondb.slack.com/archives/C04DGM6SMTM/p1706531433057289 ## Summary of changes 1. Do not reset the reconnect delay back to the minimum until at least the maximal interval value (1 second) has elapsed since the last connection attempt. 2. Record the time of the last connection attempt after the attempt completes, so that the connect time itself is excluded from the interval measurement. As a result, a backend should not perform more than 4 reconnect attempts per second. Note, however, that the backoff is applied locally in each backend, so if there are many active backends the overall connection (and thus error) rate may be much higher. --------- Co-authored-by: Konstantin Knizhnik --- pgxn/neon/libpagestore.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 0eb1acbfb0..a3543bca78 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -328,18 +328,14 @@ pageserver_connect(shardno_t shard_no, int elevel) now = GetCurrentTimestamp(); us_since_last_connect = now - last_connect_time; - if (us_since_last_connect < delay_us) + if (us_since_last_connect < MAX_RECONNECT_INTERVAL_USEC) { - pg_usleep(delay_us - us_since_last_connect); + pg_usleep(delay_us); delay_us *= 2; - if (delay_us > MAX_RECONNECT_INTERVAL_USEC) - delay_us = MAX_RECONNECT_INTERVAL_USEC; - last_connect_time = GetCurrentTimestamp(); } else { delay_us = MIN_RECONNECT_INTERVAL_USEC; - last_connect_time = now; } /* @@ -366,6 +362,7 @@ pageserver_connect(shardno_t shard_no, int elevel) values[n] = NULL; n++; conn = PQconnectdbParams(keywords, values, 1); + last_connect_time = GetCurrentTimestamp(); if (PQstatus(conn) == CONNECTION_BAD) { From 4010adf653252306a4ce9227b87bf9a23e9d155c Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 31 Jan 2024 12:23:06 +0000 Subject: [PATCH 0033/1571] control_plane/attachment_service: complete APIs (#6394) Depends on: https://github.com/neondatabase/neon/pull/6468 ## Problem The sharding service will be used as a "virtual pageserver" by the control plane -- so it needs the set of pageserver APIs that the control plane uses, and to present them under identical URLs, including prefix (/v1). ## Summary of changes - Add missing APIs: - Tenant deletion - Timeline deletion - Node list (used in test now, later in tools) - `/location_config` API (for migrating tenants into the sharding service) - Rework attachment service URLs: - `/v1` prefix is used for pageserver-compatible APIs - `/upcall/v1` prefix is used for APIs that are called by the pageserver (re-attach and validate) - `/debug/v1` prefix is used for endpoints that are for testing - `/control/v1` prefix is used for new sharding service APIs that do not mimic a pageserver API, such as registering and configuring nodes. - Add test_sharding_service.
The sharding service already had some collateral coverage from its use in general tests, but this is the first dedicated testing for it. --- Cargo.lock | 1 - control_plane/attachment_service/Cargo.toml | 4 - control_plane/attachment_service/src/http.rs | 200 ++++++++- .../attachment_service/src/persistence.rs | 41 -- .../attachment_service/src/service.rs | 422 ++++++++++++++++-- control_plane/src/attachment_service.rs | 38 +- control_plane/src/bin/neon_local.rs | 2 +- libs/pageserver_api/src/models.rs | 13 + pageserver/client/src/mgmt_api.rs | 64 +++ pageserver/src/http/openapi_spec.yml | 25 ++ pageserver/src/http/routes.rs | 20 +- test_runner/fixtures/neon_fixtures.py | 80 +++- test_runner/regress/test_sharding_service.py | 272 +++++++++++ 13 files changed, 1059 insertions(+), 123 deletions(-) create mode 100644 test_runner/regress/test_sharding_service.py diff --git a/Cargo.lock b/Cargo.lock index a669fef314..e14196350b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -285,7 +285,6 @@ dependencies = [ "metrics", "pageserver_api", "pageserver_client", - "postgres_backend", "postgres_connection", "serde", "serde_json", diff --git a/control_plane/attachment_service/Cargo.toml b/control_plane/attachment_service/Cargo.toml index 6fc21810bc..210a898747 100644 --- a/control_plane/attachment_service/Cargo.toml +++ b/control_plane/attachment_service/Cargo.toml @@ -21,10 +21,6 @@ tokio.workspace = true tokio-util.workspace = true tracing.workspace = true -# TODO: remove this after DB persistence is added, it is only used for -# a parsing function when loading pageservers from neon_local LocalEnv -postgres_backend.workspace = true - diesel = { version = "2.1.4", features = ["serde_json", "postgres"] } utils = { path = "../../libs/utils/" } diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs index 81f21a8e7a..aa8c73c493 100644 --- a/control_plane/attachment_service/src/http.rs +++ b/control_plane/attachment_service/src/http.rs @@ -2,13 +2,17 @@ use crate::reconciler::ReconcileError; use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT}; use hyper::{Body, Request, Response}; use hyper::{StatusCode, Uri}; -use pageserver_api::models::{TenantCreateRequest, TimelineCreateRequest}; +use pageserver_api::models::{ + TenantCreateRequest, TenantLocationConfigRequest, TimelineCreateRequest, +}; use pageserver_api::shard::TenantShardId; +use pageserver_client::mgmt_api; use std::sync::Arc; +use std::time::{Duration, Instant}; use utils::auth::SwappableJwtAuth; use utils::http::endpoint::{auth_middleware, request_span}; use utils::http::request::parse_request_param; -use utils::id::TenantId; +use utils::id::{TenantId, TimelineId}; use utils::{ http::{ @@ -112,6 +116,78 @@ async fn handle_tenant_create( json_response(StatusCode::OK, service.tenant_create(create_req).await?) } +// For tenant and timeline deletions, which both implement an "initially return 202, then 404 once +// we're done" semantic, we wrap with a retry loop to expose a simpler API upstream. This avoids +// needing to track a "deleting" state for tenants. +async fn deletion_wrapper(service: Arc, f: F) -> Result, ApiError> +where + R: std::future::Future> + Send + 'static, + F: Fn(Arc) -> R + Send + Sync + 'static, +{ + let started_at = Instant::now(); + // To keep deletion reasonably snappy for small tenants, initially check after 1 second if deletion + // completed. + let mut retry_period = Duration::from_secs(1); + // On subsequent retries, wait longer. 
+ let max_retry_period = Duration::from_secs(5); + // Enable callers with a 30 second request timeout to reliably get a response + let max_wait = Duration::from_secs(25); + + loop { + let status = f(service.clone()).await?; + match status { + StatusCode::ACCEPTED => { + tracing::info!("Deletion accepted, waiting to try again..."); + tokio::time::sleep(retry_period).await; + retry_period = max_retry_period; + } + StatusCode::NOT_FOUND => { + tracing::info!("Deletion complete"); + return json_response(StatusCode::OK, ()); + } + _ => { + tracing::warn!("Unexpected status {status}"); + return json_response(status, ()); + } + } + + let now = Instant::now(); + if now + retry_period > started_at + max_wait { + tracing::info!("Deletion timed out waiting for 404"); + // REQUEST_TIMEOUT would be more appropriate, but CONFLICT is already part of + // the pageserver's swagger definition for this endpoint, and has the same desired + // effect of causing the control plane to retry later. + return json_response(StatusCode::CONFLICT, ()); + } + } +} + +async fn handle_tenant_location_config( + service: Arc, + mut req: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let config_req = json_request::(&mut req).await?; + json_response( + StatusCode::OK, + service + .tenant_location_config(tenant_id, config_req) + .await?, + ) +} + +async fn handle_tenant_delete( + service: Arc, + req: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + + deletion_wrapper(service, move |service| async move { + service.tenant_delete(tenant_id).await + }) + .await +} + async fn handle_tenant_timeline_create( service: Arc, mut req: Request, @@ -126,6 +202,63 @@ async fn handle_tenant_timeline_create( ) } +async fn handle_tenant_timeline_delete( + service: Arc, + req: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + + deletion_wrapper(service, move |service| async move { + service.tenant_timeline_delete(tenant_id, timeline_id).await + }) + .await +} + +async fn handle_tenant_timeline_passthrough( + service: Arc, + req: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + + let Some(path) = req.uri().path_and_query() else { + // This should never happen, our request router only calls us if there is a path + return Err(ApiError::BadRequest(anyhow::anyhow!("Missing path"))); + }; + + tracing::info!("Proxying request for tenant {} ({})", tenant_id, path); + + // Find the node that holds shard zero + let (base_url, tenant_shard_id) = service.tenant_shard0_baseurl(tenant_id)?; + + // Callers will always pass an unsharded tenant ID. Before proxying, we must + // rewrite this to a shard-aware shard zero ID. + let path = format!("{}", path); + let tenant_str = tenant_id.to_string(); + let tenant_shard_str = format!("{}", tenant_shard_id); + let path = path.replace(&tenant_str, &tenant_shard_str); + + let client = mgmt_api::Client::new(base_url, service.get_config().jwt_token.as_deref()); + let resp = client.get_raw(path).await.map_err(|_e| + // FIXME: give APiError a proper Unavailable variant. We return 503 here because + // if we can't successfully send a request to the pageserver, we aren't available. 
+ ApiError::ShuttingDown)?; + + // We have a reqwest::Response, would like an http::Response + let mut builder = hyper::Response::builder() + .status(resp.status()) + .version(resp.version()); + for (k, v) in resp.headers() { + builder = builder.header(k, v); + } + + let response = builder + .body(Body::wrap_stream(resp.bytes_stream())) + .map_err(|e| ApiError::InternalServerError(e.into()))?; + + Ok(response) +} + async fn handle_tenant_locate( service: Arc, req: Request, @@ -141,6 +274,11 @@ async fn handle_node_register(mut req: Request) -> Result, json_response(StatusCode::OK, ()) } +async fn handle_node_list(req: Request) -> Result, ApiError> { + let state = get_state(&req); + json_response(StatusCode::OK, state.service.node_list().await?) +} + async fn handle_node_configure(mut req: Request) -> Result, ApiError> { let node_id: NodeId = parse_request_param(&req, "node_id")?; let config_req = json_request::(&mut req).await?; @@ -226,26 +364,64 @@ pub fn make_router( router .data(Arc::new(HttpState::new(service, auth))) + // Non-prefixed generic endpoints (status, metrics) .get("/status", |r| request_span(r, handle_status)) - .post("/re-attach", |r| request_span(r, handle_re_attach)) - .post("/validate", |r| request_span(r, handle_validate)) - .post("/attach-hook", |r| request_span(r, handle_attach_hook)) - .post("/inspect", |r| request_span(r, handle_inspect)) - .post("/node", |r| request_span(r, handle_node_register)) - .put("/node/:node_id/config", |r| { + // Upcalls for the pageserver: point the pageserver's `control_plane_api` config to this prefix + .post("/upcall/v1/re-attach", |r| { + request_span(r, handle_re_attach) + }) + .post("/upcall/v1/validate", |r| request_span(r, handle_validate)) + // Test/dev/debug endpoints + .post("/debug/v1/attach-hook", |r| { + request_span(r, handle_attach_hook) + }) + .post("/debug/v1/inspect", |r| request_span(r, handle_inspect)) + .get("/control/v1/tenant/:tenant_id/locate", |r| { + tenant_service_handler(r, handle_tenant_locate) + }) + // Node operations + .post("/control/v1/node", |r| { + request_span(r, handle_node_register) + }) + .get("/control/v1/node", |r| request_span(r, handle_node_list)) + .put("/control/v1/node/:node_id/config", |r| { request_span(r, handle_node_configure) }) + // Tenant Shard operations + .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| { + tenant_service_handler(r, handle_tenant_shard_migrate) + }) + // Tenant operations + // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into + // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
.post("/v1/tenant", |r| { tenant_service_handler(r, handle_tenant_create) }) + .delete("/v1/tenant/:tenant_id", |r| { + tenant_service_handler(r, handle_tenant_delete) + }) + .put("/v1/tenant/:tenant_id/location_config", |r| { + tenant_service_handler(r, handle_tenant_location_config) + }) + // Tenant Shard operations (low level/maintenance) + .put("/tenant/:tenant_shard_id/migrate", |r| { + tenant_service_handler(r, handle_tenant_shard_migrate) + }) + // Timeline operations + .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| { + tenant_service_handler(r, handle_tenant_timeline_delete) + }) .post("/v1/tenant/:tenant_id/timeline", |r| { tenant_service_handler(r, handle_tenant_timeline_create) }) - .get("/tenant/:tenant_id/locate", |r| { - tenant_service_handler(r, handle_tenant_locate) + // Tenant detail GET passthrough to shard zero + .get("/v1/tenant/:tenant_id*", |r| { + tenant_service_handler(r, handle_tenant_timeline_passthrough) }) - .put("/tenant/:tenant_shard_id/migrate", |r| { - tenant_service_handler(r, handle_tenant_shard_migrate) + // Timeline GET passthrough to shard zero. Note that the `*` in the URL is a wildcard: any future + // timeline GET APIs will be implicitly included. + .get("/v1/tenant/:tenant_id/timeline*", |r| { + tenant_service_handler(r, handle_tenant_timeline_passthrough) }) // Path aliases for tests_forward_compatibility // TODO: remove these in future PR diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs index b27bd2bf2e..574441c409 100644 --- a/control_plane/attachment_service/src/persistence.rs +++ b/control_plane/attachment_service/src/persistence.rs @@ -9,7 +9,6 @@ use diesel::prelude::*; use diesel::Connection; use pageserver_api::models::TenantConfig; use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId}; -use postgres_connection::parse_host_port; use serde::{Deserialize, Serialize}; use utils::generation::Generation; use utils::id::{NodeId, TenantId}; @@ -129,51 +128,11 @@ impl Persistence { }) .await?; - if nodes.is_empty() { - return self.list_nodes_local_env().await; - } - tracing::info!("list_nodes: loaded {} nodes", nodes.len()); Ok(nodes) } - /// Shim for automated compatibility tests: load nodes from LocalEnv instead of database - pub(crate) async fn list_nodes_local_env(&self) -> DatabaseResult> { - // Enable test_backward_compatibility to work by populating our list of - // nodes from LocalEnv when it is not present in persistent storage. Otherwise at - // first startup in the compat test, we may have shards but no nodes. 
- use control_plane::local_env::LocalEnv; - let env = LocalEnv::load_config().map_err(|e| DatabaseError::Logical(format!("{e}")))?; - tracing::info!( - "Loading {} pageserver nodes from LocalEnv", - env.pageservers.len() - ); - let mut nodes = Vec::new(); - for ps_conf in env.pageservers { - let (pg_host, pg_port) = - parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr"); - let (http_host, http_port) = parse_host_port(&ps_conf.listen_http_addr) - .expect("Unable to parse listen_http_addr"); - let node = Node { - id: ps_conf.id, - listen_pg_addr: pg_host.to_string(), - listen_pg_port: pg_port.unwrap_or(5432), - listen_http_addr: http_host.to_string(), - listen_http_port: http_port.unwrap_or(80), - availability: NodeAvailability::Active, - scheduling: NodeSchedulingPolicy::Active, - }; - - // Synchronize database with what we learn from LocalEnv - self.insert_node(&node).await?; - - nodes.push(node); - } - - Ok(nodes) - } - /// At startup, load the high level state for shards, such as their config + policy. This will /// be enriched at runtime with state discovered on pageservers. pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult> { diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index ec56dc8ad4..8c6a348515 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -21,6 +21,7 @@ use pageserver_api::{ models, models::{ LocationConfig, LocationConfigMode, ShardParameters, TenantConfig, TenantCreateRequest, + TenantLocationConfigRequest, TenantLocationConfigResponse, TenantShardLocation, TimelineCreateRequest, TimelineInfo, }, shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId}, @@ -30,14 +31,14 @@ use utils::{ completion::Barrier, generation::Generation, http::error::ApiError, - id::{NodeId, TenantId}, + id::{NodeId, TenantId, TimelineId}, seqwait::SeqWait, }; use crate::{ compute_hook::ComputeHook, node::Node, - persistence::{DatabaseError, Persistence, TenantShardPersistence}, + persistence::{DatabaseError, NodePersistence, Persistence, TenantShardPersistence}, scheduler::Scheduler, tenant_state::{ IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError, @@ -635,7 +636,7 @@ impl Service { shard_number: tenant_shard_id.shard_number.0 as i32, shard_count: tenant_shard_id.shard_count.0 as i32, shard_stripe_size: create_req.shard_parameters.stripe_size.0 as i32, - generation: 0, + generation: create_req.generation.map(|g| g as i32).unwrap_or(0), generation_pageserver: i64::MAX, placement_policy: serde_json::to_string(&placement_policy).unwrap(), config: serde_json::to_string(&create_req.config).unwrap(), @@ -677,6 +678,7 @@ impl Service { })?; response_shards.push(TenantCreateResponseShard { + shard_id: tenant_shard_id, node_id: entry .get() .intent @@ -709,6 +711,7 @@ impl Service { })?; response_shards.push(TenantCreateResponseShard { + shard_id: tenant_shard_id, node_id: state .intent .attached @@ -742,14 +745,257 @@ impl Service { (waiters, response_shards) }; - let deadline = Instant::now().checked_add(Duration::from_secs(5)).unwrap(); + self.await_waiters(waiters).await?; + + Ok(TenantCreateResponse { + shards: response_shards, + }) + } + + /// Helper for functions that reconcile a number of shards, and would like to do a timeout-bounded + /// wait for reconciliation to complete before responding. 
+ async fn await_waiters( + &self, + waiters: Vec, + ) -> Result<(), ReconcileWaitError> { + let deadline = Instant::now().checked_add(Duration::from_secs(30)).unwrap(); for waiter in waiters { let timeout = deadline.duration_since(Instant::now()); waiter.wait_timeout(timeout).await?; } - Ok(TenantCreateResponse { - shards: response_shards, - }) + + Ok(()) + } + + /// This API is used by the cloud control plane to do coarse-grained control of tenants: + /// - Call with mode Attached* to upsert the tenant. + /// - Call with mode Detached to switch to PolicyMode::Detached + /// + /// In future, calling with mode Secondary may switch to a detach-lite mode in which a tenant only has + /// secondary locations. + pub(crate) async fn tenant_location_config( + &self, + tenant_id: TenantId, + req: TenantLocationConfigRequest, + ) -> Result { + if req.tenant_id.shard_count.0 > 1 { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "This API is for importing single-sharded or unsharded tenants" + ))); + } + + let mut waiters = Vec::new(); + let mut result = TenantLocationConfigResponse { shards: Vec::new() }; + let maybe_create = { + let mut locked = self.inner.write().unwrap(); + let result_tx = locked.result_tx.clone(); + let compute_hook = locked.compute_hook.clone(); + let pageservers = locked.nodes.clone(); + + let mut scheduler = Scheduler::new(&locked.tenants, &locked.nodes); + + // Maybe we have existing shards + let mut create = true; + for (shard_id, shard) in locked + .tenants + .range_mut(TenantShardId::tenant_range(tenant_id)) + { + // Saw an existing shard: this is not a creation + create = false; + + // Note that for existing tenants we do _not_ respect the generation in the request: this is likely + // to be stale. Once a tenant is created in this service, our view of generation is authoritative, and + // callers' generations may be ignored. This represents a one-way migration of tenants from the outer + // cloud control plane into this service. + + // Use location config mode as an indicator of policy: if they ask for + // attached we go to default HA attached mode. If they ask for secondary + // we go to secondary-only mode. If they ask for detached we detach. + match req.config.mode { + LocationConfigMode::Detached => { + shard.policy = PlacementPolicy::Detached; + } + LocationConfigMode::Secondary => { + // TODO: implement secondary-only mode. + todo!(); + } + LocationConfigMode::AttachedMulti + | LocationConfigMode::AttachedSingle + | LocationConfigMode::AttachedStale => { + // TODO: persistence for changes in policy + if pageservers.len() > 1 { + shard.policy = PlacementPolicy::Double(1) + } else { + // Convenience for dev/test: if we just have one pageserver, import + // tenants into Single mode so that scheduling will succeed. 
+ shard.policy = PlacementPolicy::Single + } + } + } + + shard.schedule(&mut scheduler)?; + + let maybe_waiter = shard.maybe_reconcile( + result_tx.clone(), + &pageservers, + &compute_hook, + &self.config, + &self.persistence, + ); + if let Some(waiter) = maybe_waiter { + waiters.push(waiter); + } + + if let Some(node_id) = shard.intent.attached { + result.shards.push(TenantShardLocation { + shard_id: *shard_id, + node_id, + }) + } + } + + if create { + // Validate request mode + match req.config.mode { + LocationConfigMode::Detached | LocationConfigMode::Secondary => { + // When using this API to onboard an existing tenant to this service, it must start in + // an attached state, because we need the request to come with a generation + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Imported tenant must be in attached mode" + ))); + } + + LocationConfigMode::AttachedMulti + | LocationConfigMode::AttachedSingle + | LocationConfigMode::AttachedStale => { + // Pass + } + } + + // Validate request generation + let Some(generation) = req.config.generation else { + // We can only import attached tenants, because we need the request to come with a generation + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Generation is mandatory when importing tenant" + ))); + }; + + // Synthesize a creation request + Some(TenantCreateRequest { + new_tenant_id: TenantShardId::unsharded(tenant_id), + generation: Some(generation), + shard_parameters: ShardParameters { + // Must preserve the incoming shard_count do distinguish unsharded (0) + // from single-sharded (1): this distinction appears in the S3 keys of the tenant. + count: req.tenant_id.shard_count, + // We only import un-sharded or single-sharded tenants, so stripe + // size can be made up arbitrarily here. + stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE, + }, + config: req.config.tenant_conf, + }) + } else { + None + } + }; + + if let Some(create_req) = maybe_create { + let create_resp = self.tenant_create(create_req).await?; + result.shards = create_resp + .shards + .into_iter() + .map(|s| TenantShardLocation { + node_id: s.node_id, + shard_id: s.shard_id, + }) + .collect(); + } else { + // This was an update, wait for reconciliation + self.await_waiters(waiters).await?; + } + + Ok(result) + } + + pub(crate) async fn tenant_delete(&self, tenant_id: TenantId) -> Result { + // TODO: refactor into helper + let targets = { + let locked = self.inner.read().unwrap(); + let mut targets = Vec::new(); + + for (tenant_shard_id, shard) in + locked.tenants.range(TenantShardId::tenant_range(tenant_id)) + { + let node_id = shard.intent.attached.ok_or_else(|| { + ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled")) + })?; + let node = locked + .nodes + .get(&node_id) + .expect("Pageservers may not be deleted while referenced"); + + targets.push((*tenant_shard_id, node.clone())); + } + targets + }; + + // TODO: error out if the tenant is not attached anywhere. + + // Phase 1: delete on the pageservers + let mut any_pending = false; + for (tenant_shard_id, node) in targets { + let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref()); + // TODO: this, like many other places, requires proper retry handling for 503, timeout: those should not + // surface immediately as an error to our caller. 
+ let status = client.tenant_delete(tenant_shard_id).await.map_err(|e| { + ApiError::InternalServerError(anyhow::anyhow!( + "Error deleting shard {tenant_shard_id} on node {}: {e}", + node.id + )) + })?; + tracing::info!( + "Shard {tenant_shard_id} on node {}, delete returned {}", + node.id, + status + ); + if status == StatusCode::ACCEPTED { + any_pending = true; + } + } + + if any_pending { + // Caller should call us again later. When we eventually see 404s from + // all the shards, we may proceed to delete our records of the tenant. + tracing::info!( + "Tenant {} has some shards pending deletion, returning 202", + tenant_id + ); + return Ok(StatusCode::ACCEPTED); + } + + // Fall through: deletion of the tenant on pageservers is complete, we may proceed to drop + // our in-memory state and database state. + + // Ordering: we delete persistent state first: if we then + // crash, we will drop the in-memory state. + + // Drop persistent state. + self.persistence.delete_tenant(tenant_id).await?; + + // Drop in-memory state + { + let mut locked = self.inner.write().unwrap(); + locked + .tenants + .retain(|tenant_shard_id, _shard| tenant_shard_id.tenant_id != tenant_id); + tracing::info!( + "Deleted tenant {tenant_id}, now have {} tenants", + locked.tenants.len() + ); + }; + + // Success is represented as 404, to imitate the existing pageserver deletion API + Ok(StatusCode::NOT_FOUND) } pub(crate) async fn tenant_timeline_create( @@ -759,25 +1005,15 @@ impl Service { ) -> Result { let mut timeline_info = None; - let ensure_waiters = { - let locked = self.inner.write().unwrap(); - tracing::info!( - "Creating timeline {}/{}, have {} pageservers", - tenant_id, - create_req.new_timeline_id, - locked.nodes.len() - ); + tracing::info!( + "Creating timeline {}/{}", + tenant_id, + create_req.new_timeline_id, + ); - self.ensure_attached(locked, tenant_id) - .map_err(ApiError::InternalServerError)? 
- }; - - let deadline = Instant::now().checked_add(Duration::from_secs(5)).unwrap(); - for waiter in ensure_waiters { - let timeout = deadline.duration_since(Instant::now()); - waiter.wait_timeout(timeout).await?; - } + self.ensure_attached_wait(tenant_id).await?; + // TODO: refuse to do this if shard splitting is in progress let targets = { let locked = self.inner.read().unwrap(); let mut targets = Vec::new(); @@ -848,6 +1084,111 @@ impl Service { Ok(timeline_info.expect("targets cannot be empty")) } + pub(crate) async fn tenant_timeline_delete( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Result { + tracing::info!("Deleting timeline {}/{}", tenant_id, timeline_id,); + + self.ensure_attached_wait(tenant_id).await?; + + // TODO: refuse to do this if shard splitting is in progress + let targets = { + let locked = self.inner.read().unwrap(); + let mut targets = Vec::new(); + + for (tenant_shard_id, shard) in + locked.tenants.range(TenantShardId::tenant_range(tenant_id)) + { + let node_id = shard.intent.attached.ok_or_else(|| { + ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled")) + })?; + let node = locked + .nodes + .get(&node_id) + .expect("Pageservers may not be deleted while referenced"); + + targets.push((*tenant_shard_id, node.clone())); + } + targets + }; + + if targets.is_empty() { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant not found").into(), + )); + } + + // TODO: call into shards concurrently + let mut any_pending = false; + for (tenant_shard_id, node) in targets { + let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref()); + + tracing::info!( + "Deleting timeline on shard {}/{}, attached to node {}", + tenant_shard_id, + timeline_id, + node.id + ); + + let status = client + .timeline_delete(tenant_shard_id, timeline_id) + .await + .map_err(|e| { + ApiError::InternalServerError(anyhow::anyhow!( + "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {}: {e}", + node.id + )) + })?; + + if status == StatusCode::ACCEPTED { + any_pending = true; + } + } + + if any_pending { + Ok(StatusCode::ACCEPTED) + } else { + Ok(StatusCode::NOT_FOUND) + } + } + + /// When you need to send an HTTP request to the pageserver that holds shard0 of a tenant, this + /// function looks it up and returns the url. If the tenant isn't found, returns Err(ApiError::NotFound) + pub(crate) fn tenant_shard0_baseurl( + &self, + tenant_id: TenantId, + ) -> Result<(String, TenantShardId), ApiError> { + let locked = self.inner.read().unwrap(); + let Some((tenant_shard_id, shard)) = locked + .tenants + .range(TenantShardId::tenant_range(tenant_id)) + .next() + else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant {tenant_id} not found").into(), + )); + }; + + // TODO: should use the ID last published to compute_hook, rather than the intent: the intent might + // point to somewhere we haven't attached yet. 
+ let Some(node_id) = shard.intent.attached else { + return Err(ApiError::Conflict( + "Cannot call timeline API on non-attached tenant".to_string(), + )); + }; + + let Some(node) = locked.nodes.get(&node_id) else { + // This should never happen + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Shard refers to nonexistent node" + ))); + }; + + Ok((node.base_url(), *tenant_shard_id)) + } + pub(crate) fn tenant_locate( &self, tenant_id: TenantId, @@ -993,6 +1334,20 @@ impl Service { Ok(TenantShardMigrateResponse {}) } + pub(crate) async fn node_list(&self) -> Result, ApiError> { + // It is convenient to avoid taking the big lock and converting Node to a serializable + // structure, by fetching from storage instead of reading in-memory state. + let nodes = self + .persistence + .list_nodes() + .await? + .into_iter() + .map(|n| n.to_persistent()) + .collect(); + + Ok(nodes) + } + pub(crate) async fn node_register( &self, register_req: NodeRegisterRequest, @@ -1166,7 +1521,7 @@ impl Service { /// Helper for methods that will try and call pageserver APIs for /// a tenant, such as timeline CRUD: they cannot proceed unless the tenant /// is attached somewhere. - fn ensure_attached( + fn ensure_attached_schedule( &self, mut locked: std::sync::RwLockWriteGuard<'_, ServiceState>, tenant_id: TenantId, @@ -1196,6 +1551,23 @@ impl Service { Ok(waiters) } + async fn ensure_attached_wait(&self, tenant_id: TenantId) -> Result<(), ApiError> { + let ensure_waiters = { + let locked = self.inner.write().unwrap(); + + self.ensure_attached_schedule(locked, tenant_id) + .map_err(ApiError::InternalServerError)? + }; + + let deadline = Instant::now().checked_add(Duration::from_secs(5)).unwrap(); + for waiter in ensure_waiters { + let timeout = deadline.duration_since(Instant::now()); + waiter.wait_timeout(timeout).await?; + } + + Ok(()) + } + /// Check all tenants for pending reconciliation work, and reconcile those in need /// /// Returns how many reconciliation tasks were started diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/attachment_service.rs index 6602aa9a73..7816d0953b 100644 --- a/control_plane/src/attachment_service.rs +++ b/control_plane/src/attachment_service.rs @@ -17,6 +17,7 @@ use serde::{de::DeserializeOwned, Deserialize, Serialize}; use std::{env, str::FromStr}; use tokio::process::Command; use tracing::instrument; +use url::Url; use utils::{ auth::{Claims, Scope}, id::{NodeId, TenantId}, @@ -59,6 +60,7 @@ pub struct InspectResponse { #[derive(Serialize, Deserialize)] pub struct TenantCreateResponseShard { + pub shard_id: TenantShardId, pub node_id: NodeId, pub generation: u32, } @@ -523,13 +525,15 @@ impl AttachmentService { RQ: Serialize + Sized, RS: DeserializeOwned + Sized, { - let url = self - .env - .control_plane_api - .clone() - .unwrap() - .join(&path) - .unwrap(); + // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out + // for general purpose API access. 
+ let listen_url = self.env.control_plane_api.clone().unwrap(); + let url = Url::from_str(&format!( + "http://{}:{}/{path}", + listen_url.host_str().unwrap(), + listen_url.port().unwrap() + )) + .unwrap(); let mut builder = self.client.request(method, url); if let Some(body) = body { @@ -566,7 +570,7 @@ impl AttachmentService { let response = self .dispatch::<_, AttachHookResponse>( Method::POST, - "attach-hook".to_string(), + "debug/v1/attach-hook".to_string(), Some(request), ) .await?; @@ -582,7 +586,11 @@ impl AttachmentService { let request = InspectRequest { tenant_shard_id }; let response = self - .dispatch::<_, InspectResponse>(Method::POST, "inspect".to_string(), Some(request)) + .dispatch::<_, InspectResponse>( + Method::POST, + "debug/v1/inspect".to_string(), + Some(request), + ) .await?; Ok(response.attachment) @@ -599,8 +607,12 @@ impl AttachmentService { #[instrument(skip(self))] pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result { - self.dispatch::<(), _>(Method::GET, format!("tenant/{tenant_id}/locate"), None) - .await + self.dispatch::<(), _>( + Method::GET, + format!("control/v1/tenant/{tenant_id}/locate"), + None, + ) + .await } #[instrument(skip(self))] @@ -622,7 +634,7 @@ impl AttachmentService { #[instrument(skip_all, fields(node_id=%req.node_id))] pub async fn node_register(&self, req: NodeRegisterRequest) -> anyhow::Result<()> { - self.dispatch::<_, ()>(Method::POST, "node".to_string(), Some(req)) + self.dispatch::<_, ()>(Method::POST, "control/v1/node".to_string(), Some(req)) .await } @@ -630,7 +642,7 @@ impl AttachmentService { pub async fn node_configure(&self, req: NodeConfigureRequest) -> anyhow::Result<()> { self.dispatch::<_, ()>( Method::PUT, - format!("node/{}/config", req.node_id), + format!("control/v1/node/{}/config", req.node_id), Some(req), ) .await diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index a5242e3dc7..d5abda729f 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -51,7 +51,7 @@ project_git_version!(GIT_VERSION); const DEFAULT_PG_VERSION: &str = "15"; -const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/"; +const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/"; fn default_conf(num_pageservers: u16) -> String { let mut template = format!( diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 86d2c2a7ca..d885553cc7 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -364,6 +364,19 @@ pub struct TenantLocationConfigRequest { pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it } +#[derive(Serialize, Deserialize, Debug)] +#[serde(deny_unknown_fields)] +pub struct TenantShardLocation { + pub shard_id: TenantShardId, + pub node_id: NodeId, +} + +#[derive(Serialize, Deserialize, Debug)] +#[serde(deny_unknown_fields)] +pub struct TenantLocationConfigResponse { + pub shards: Vec, +} + #[derive(Serialize, Deserialize, Debug)] #[serde(deny_unknown_fields)] pub struct TenantConfigRequest { diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 077c3909e1..91b9afa026 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -69,6 +69,25 @@ impl Client { resp.json().await.map_err(Error::ReceiveBody) } + /// Get an arbitrary path and returning a streaming Response. 
This function is suitable + /// for pass-through/proxy use cases where we don't care what the response content looks + /// like. + /// + /// Use/add one of the properly typed methods below if you know you aren't proxying, and + /// know what kind of response you expect. + pub async fn get_raw(&self, path: String) -> Result { + debug_assert!(path.starts_with('/')); + let uri = format!("{}{}", self.mgmt_api_endpoint, path); + + let req = self.client.request(Method::GET, uri); + let req = if let Some(value) = &self.authorization_header { + req.header(reqwest::header::AUTHORIZATION, value) + } else { + req + }; + req.send().await.map_err(Error::ReceiveBody) + } + pub async fn tenant_details( &self, tenant_shard_id: TenantShardId, @@ -171,6 +190,25 @@ impl Client { .map_err(Error::ReceiveBody) } + /// The tenant deletion API can return 202 if deletion is incomplete, or + /// 404 if it is complete. Callers are responsible for checking the status + /// code and retrying. Error codes other than 404 will return Err(). + pub async fn tenant_delete(&self, tenant_shard_id: TenantShardId) -> Result { + let uri = format!("{}/v1/tenant/{tenant_shard_id}", self.mgmt_api_endpoint); + + match self.request(Method::DELETE, &uri, ()).await { + Err(Error::ApiError(status_code, msg)) => { + if status_code == StatusCode::NOT_FOUND { + Ok(StatusCode::NOT_FOUND) + } else { + Err(Error::ApiError(status_code, msg)) + } + } + Err(e) => Err(e), + Ok(response) => Ok(response.status()), + } + } + pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> { let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint); self.request(Method::PUT, &uri, req).await?; @@ -234,6 +272,32 @@ impl Client { .map_err(Error::ReceiveBody) } + /// The timeline deletion API can return 202 if deletion is incomplete, or + /// 404 if it is complete. Callers are responsible for checking the status + /// code and retrying. Error codes other than 404 will return Err(). + pub async fn timeline_delete( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}", + self.mgmt_api_endpoint + ); + + match self.request(Method::DELETE, &uri, ()).await { + Err(Error::ApiError(status_code, msg)) => { + if status_code == StatusCode::NOT_FOUND { + Ok(StatusCode::NOT_FOUND) + } else { + Err(Error::ApiError(status_code, msg)) + } + } + Err(e) => Err(e), + Ok(response) => Ok(response.status()), + } + } + pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> { let uri = format!( "{}/v1/tenant/{}/reset", diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index a49eef8bb9..676a63937d 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -674,6 +674,10 @@ paths: responses: "200": description: Tenant is now in requested state + content: + application/json: + schema: + $ref: "#/components/schemas/TenantLocationConfigResponse" "503": description: Tenant's state cannot be changed right now. Wait a few seconds and retry. content: @@ -1426,6 +1430,27 @@ components: $ref: '#/components/schemas/SecondaryConfig' tenant_conf: $ref: '#/components/schemas/TenantConfig' + TenantLocationConfigResponse: + type: object + required: + - shards + properties: + shards: + description: Pageservers where this tenant's shards are attached. Not populated for secondary locations.
+ type: array + items: + $ref: "#/components/schemas/TenantShardLocation" + TenantShardLocation: + type: object + required: + - node_id + - shard_id + properties: + node_id: + description: Pageserver node ID where this shard is attached + type: integer + shard_id: + description: Tenant shard ID of the shard + type: string SecondaryConfig: type: object properties: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index aa56806246..c025a25ef1 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -17,6 +17,8 @@ use metrics::launch_timestamp::LaunchTimestamp; use pageserver_api::models::LocationConfigListResponse; use pageserver_api::models::ShardParameters; use pageserver_api::models::TenantDetails; +use pageserver_api::models::TenantLocationConfigResponse; +use pageserver_api::models::TenantShardLocation; use pageserver_api::models::TenantState; use pageserver_api::models::{ DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest, @@ -1356,7 +1358,7 @@ async fn put_tenant_location_config_handler( let location_conf = LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?; - state + let attached = state .tenant_manager .upsert_location( tenant_shard_id, @@ -1365,7 +1367,8 @@ async fn put_tenant_location_config_handler( tenant::SpawnMode::Normal, &ctx, ) - .await?; + .await? + .is_some(); if let Some(_flush_ms) = flush { match state @@ -1384,7 +1387,18 @@ async fn put_tenant_location_config_handler( tracing::info!("No flush requested when configuring"); } - json_response(StatusCode::OK, ()) + // This API returns a vector of pageservers where the tenant is attached: this is + // primarily for use in the sharding service. For compatibility, we also return this + // when called directly on a pageserver, but the payload is always zero or one shards. + let mut response = TenantLocationConfigResponse { shards: Vec::new() }; + if attached { + response.shards.push(TenantShardLocation { + shard_id: tenant_shard_id, + node_id: state.conf.id, + }) + } + + json_response(StatusCode::OK, response) } async fn list_location_config_handler( diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 0f79df74ba..5be7551a1e 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -993,13 +993,20 @@ class NeonEnv: self.initial_tenant = config.initial_tenant self.initial_timeline = config.initial_timeline - attachment_service_port = self.port_distributor.get_port() - # Reserve the next port after attachment service for use by its postgres: this - # will assert out if the next port wasn't free. - attachment_service_pg_port = self.port_distributor.get_port() - assert attachment_service_pg_port == attachment_service_port + 1 + # Find two adjacent ports for attachment service and its postgres DB. This + # loop would eventually throw from get_port() if we run out of ports (extremely + # unlikely): usually we find two adjacent free ports on the first iteration.
+ while True: + self.attachment_service_port = self.port_distributor.get_port() + attachment_service_pg_port = self.port_distributor.get_port() + if attachment_service_pg_port == self.attachment_service_port + 1: + break + + # The URL for the pageserver to use as its control_plane_api config + self.control_plane_api: str = f"http://127.0.0.1:{self.attachment_service_port}/upcall/v1" + # The base URL of the attachment service + self.attachment_service_api: str = f"http://127.0.0.1:{self.attachment_service_port}" - self.control_plane_api: str = f"http://127.0.0.1:{attachment_service_port}" self.attachment_service: NeonAttachmentService = NeonAttachmentService( self, config.auth_enabled ) @@ -1914,6 +1921,14 @@ class NeonAttachmentService: self.running = False return self + def pageserver_api(self) -> PageserverHttpClient: + """ + The attachment service implements a subset of the pageserver REST API, for mapping + per-tenant actions into per-shard actions (e.g. timeline creation). Tests should invoke those + functions via the HttpClient, as an implicit check that these APIs remain compatible. + """ + return PageserverHttpClient(self.env.attachment_service_port, lambda: True) + def request(self, method, *args, **kwargs) -> requests.Response: kwargs["headers"] = self.headers() return requests.request(method, *args, **kwargs) @@ -1931,7 +1946,7 @@ class NeonAttachmentService: ) -> int: response = self.request( "POST", - f"{self.env.control_plane_api}/attach-hook", + f"{self.env.attachment_service_api}/debug/v1/attach-hook", json={"tenant_shard_id": str(tenant_shard_id), "node_id": pageserver_id}, headers=self.headers(), ) @@ -1943,7 +1958,7 @@ class NeonAttachmentService: def attach_hook_drop(self, tenant_shard_id: Union[TenantId, TenantShardId]): response = self.request( "POST", - f"{self.env.control_plane_api}/attach-hook", + f"{self.env.attachment_service_api}/debug/v1/attach-hook", json={"tenant_shard_id": str(tenant_shard_id), "node_id": None}, headers=self.headers(), ) @@ -1955,7 +1970,7 @@ class NeonAttachmentService: """ response = self.request( "POST", - f"{self.env.control_plane_api}/inspect", + f"{self.env.attachment_service_api}/debug/v1/inspect", json={"tenant_shard_id": str(tenant_shard_id)}, headers=self.headers(), ) @@ -1976,7 +1991,27 @@ class NeonAttachmentService: } log.info(f"node_register({body})") self.request( - "POST", f"{self.env.control_plane_api}/node", json=body, headers=self.headers() + "POST", + f"{self.env.attachment_service_api}/control/v1/node", + json=body, + headers=self.headers(), + ).raise_for_status() + + def node_list(self): + response = self.request( + "GET", f"{self.env.attachment_service_api}/control/v1/node", headers=self.headers() + ) + response.raise_for_status() + return response.json() + + def node_configure(self, node_id, body: dict[str, Any]): + log.info(f"node_configure({node_id}, {body})") + body["node_id"] = node_id + self.request( + "PUT", + f"{self.env.attachment_service_api}/control/v1/node/{node_id}/config", + json=body, + headers=self.headers(), ).raise_for_status() def tenant_create( @@ -1986,6 +2021,9 @@ class NeonAttachmentService: shard_stripe_size: Optional[int] = None, tenant_config: Optional[Dict[Any, Any]] = None, ): + """ + Use this rather than pageserver_api() when you need to include shard parameters + """ body: Dict[str, Any] = {"new_tenant_id": str(tenant_id)} if shard_count is not None: @@ -1999,21 +2037,17 @@ class NeonAttachmentService: for k, v in tenant_config.items(): body[k] = v - response = self.request("POST", 
f"{self.env.control_plane_api}/tenant", json=body) + response = self.request("POST", f"{self.env.attachment_service_api}/v1/tenant", json=body) response.raise_for_status() log.info(f"tenant_create success: {response.json()}") - def tenant_timeline_create(self, tenant_id: TenantId, timeline_id: TimelineId): - body: Dict[str, Any] = {"new_timeline_id": str(timeline_id)} - - response = self.request( - "POST", f"{self.env.control_plane_api}/tenant/{tenant_id}/timeline", json=body - ) - response.raise_for_status() - log.info(f"tenant_timeline_create success: {response.json()}") - def locate(self, tenant_id: TenantId) -> list[dict[str, Any]]: - response = self.request("GET", f"{self.env.control_plane_api}/tenant/{tenant_id}/locate") + """ + :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr: str, "listen_http_port: int} + """ + response = self.request( + "GET", f"{self.env.attachment_service_api}/control/v1/tenant/{tenant_id}/locate" + ) response.raise_for_status() body = response.json() shards: list[dict[str, Any]] = body["shards"] @@ -2022,7 +2056,7 @@ class NeonAttachmentService: def tenant_shard_split(self, tenant_id: TenantId, shard_count: int) -> list[TenantShardId]: response = self.request( "PUT", - f"{self.env.control_plane_api}/tenant/{tenant_id}/shard_split", + f"{self.env.attachment_service_api}/control/v1/tenant/{tenant_id}/shard_split", json={"new_shard_count": shard_count}, ) response.raise_for_status() @@ -2034,7 +2068,7 @@ class NeonAttachmentService: def tenant_shard_migrate(self, tenant_shard_id: TenantShardId, dest_ps_id: int): response = self.request( "PUT", - f"{self.env.control_plane_api}/tenant/{tenant_shard_id}/migrate", + f"{self.env.attachment_service_api}/control/v1/tenant/{tenant_shard_id}/migrate", json={"tenant_shard_id": str(tenant_shard_id), "node_id": dest_ps_id}, ) response.raise_for_status() diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py new file mode 100644 index 0000000000..3b2c9334db --- /dev/null +++ b/test_runner/regress/test_sharding_service.py @@ -0,0 +1,272 @@ +import time +from collections import defaultdict + +from fixtures.neon_fixtures import ( + NeonEnvBuilder, +) +from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pageserver.utils import tenant_delete_wait_completed, timeline_delete_wait_completed +from fixtures.pg_version import PgVersion +from fixtures.types import TenantId, TimelineId +from fixtures.utils import wait_until + + +def test_sharding_service_smoke( + neon_env_builder: NeonEnvBuilder, +): + """ + Test the basic lifecycle of a sharding service: + - Restarting + - Restarting a pageserver + - Creating and deleting tenants and timelines + - Marking a pageserver offline + """ + + neon_env_builder.num_pageservers = 3 + env = neon_env_builder.init_configs() + + # Start services by hand so that we can skip a pageserver (this will start + register later) + env.broker.try_start() + env.attachment_service.start() + env.pageservers[0].start() + env.pageservers[1].start() + for sk in env.safekeepers: + sk.start() + + # The pageservers we started should have registered with the sharding service on startup + nodes = env.attachment_service.node_list() + assert len(nodes) == 2 + assert set(n["node_id"] for n in nodes) == {env.pageservers[0].id, env.pageservers[1].id} + + # Starting an additional pageserver should register successfully + env.pageservers[2].start() + nodes = env.attachment_service.node_list() + 
assert len(nodes) == 3 + assert set(n["node_id"] for n in nodes) == {ps.id for ps in env.pageservers} + + # Use a multiple of pageservers to get a nice even number of shards on each one + tenant_shard_count = len(env.pageservers) * 4 + tenant_count = len(env.pageservers) * 2 + shards_per_tenant = tenant_shard_count // tenant_count + tenant_ids = set(TenantId.generate() for i in range(0, tenant_count)) + + # Creating several tenants should spread out across the pageservers + for tid in tenant_ids: + env.neon_cli.create_tenant(tid, shard_count=shards_per_tenant) + + def get_node_shard_counts(): + counts: defaultdict[str, int] = defaultdict(int) + for tid in tenant_ids: + for shard in env.attachment_service.locate(tid): + counts[shard["node_id"]] += 1 + return counts + + for node_id, count in get_node_shard_counts().items(): + # we used a multiple of pageservers for the total shard count, + # so expect an equal number on all pageservers + assert count == tenant_shard_count / len( + env.pageservers + ), f"Node {node_id} has bad count {count}" + + # Creating and deleting timelines should work, using identical API to pageserver + timeline_crud_tenant = next(iter(tenant_ids)) + timeline_id = TimelineId.generate() + env.attachment_service.pageserver_api().timeline_create( + pg_version=PgVersion.NOT_SET, tenant_id=timeline_crud_tenant, new_timeline_id=timeline_id + ) + timelines = env.attachment_service.pageserver_api().timeline_list(timeline_crud_tenant) + assert len(timelines) == 2 + assert timeline_id in set(TimelineId(t["timeline_id"]) for t in timelines) + # virtual_ps_http.timeline_delete(tenant_id=timeline_crud_tenant, timeline_id=timeline_id) + timeline_delete_wait_completed( + env.attachment_service.pageserver_api(), timeline_crud_tenant, timeline_id + ) + timelines = env.attachment_service.pageserver_api().timeline_list(timeline_crud_tenant) + assert len(timelines) == 1 + assert timeline_id not in set(TimelineId(t["timeline_id"]) for t in timelines) + + # Marking a pageserver offline should migrate tenants away from it. + env.attachment_service.node_configure(env.pageservers[0].id, {"availability": "Offline"}) + + def node_evacuated(node_id: int): + counts = get_node_shard_counts() + assert counts[node_id] == 0 + + wait_until(10, 1, lambda: node_evacuated(env.pageservers[0].id)) + + # Marking pageserver active should not migrate anything to it + # immediately + env.attachment_service.node_configure(env.pageservers[0].id, {"availability": "Active"}) + time.sleep(1) + assert get_node_shard_counts()[env.pageservers[0].id] == 0 + + # Delete all the tenants + for tid in tenant_ids: + tenant_delete_wait_completed(env.attachment_service.pageserver_api(), tid, 10) + + # Set a scheduling policy on one node, create all the tenants, observe + # that the scheduling policy is respected.
+ env.attachment_service.node_configure(env.pageservers[1].id, {"scheduling": "Draining"}) + + # Create some fresh tenants + tenant_ids = set(TenantId.generate() for i in range(0, tenant_count)) + for tid in tenant_ids: + env.neon_cli.create_tenant(tid, shard_count=shards_per_tenant) + + counts = get_node_shard_counts() + # Nothing should have been scheduled on the node in Draining + assert counts[env.pageservers[1].id] == 0 + assert counts[env.pageservers[0].id] == tenant_shard_count // 2 + assert counts[env.pageservers[2].id] == tenant_shard_count // 2 + + +def test_sharding_service_passthrough( + neon_env_builder: NeonEnvBuilder, +): + """ + For simple timeline/tenant GET APIs that don't require coordination across + shards, the sharding service implements a proxy to shard zero. This test + calls those APIs. + """ + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_start() + + # We will talk to attachment service as if it was a pageserver, using the pageserver + # HTTP client + client = PageserverHttpClient(env.attachment_service_port, lambda: True) + timelines = client.timeline_list(tenant_id=env.initial_tenant) + assert len(timelines) == 1 + + +def test_sharding_service_restart(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + tenant_a = env.initial_tenant + tenant_b = TenantId.generate() + env.attachment_service.tenant_create(tenant_b) + env.pageserver.tenant_detach(tenant_a) + + # TODO: extend this test to use multiple pageservers, and check that locations don't move around + # on restart. + + # Attachment service restart + env.attachment_service.stop() + env.attachment_service.start() + + observed = set(TenantId(tenant["id"]) for tenant in env.pageserver.http_client().tenant_list()) + + # Tenant A should still be attached + assert tenant_a not in observed + + # Tenant B should remain detached + assert tenant_b in observed + + # Pageserver restart + env.pageserver.stop() + env.pageserver.start() + + # Same assertions as above: restarting either service should not perturb things + observed = set(TenantId(tenant["id"]) for tenant in env.pageserver.http_client().tenant_list()) + assert tenant_a not in observed + assert tenant_b in observed + + +def test_sharding_service_onboarding( + neon_env_builder: NeonEnvBuilder, +): + """ + We onboard tenants to the sharding service by treating it as a 'virtual pageserver' + which provides the /location_config API. This is similar to creating a tenant, + but imports the generation number. 
+ """ + + neon_env_builder.num_pageservers = 2 + + # Start services by hand so that we can skip registration on one of the pageservers + env = neon_env_builder.init_configs() + env.broker.try_start() + env.attachment_service.start() + + # This is the pageserver where we'll initially create the tenant + env.pageservers[0].start(register=False) + origin_ps = env.pageservers[0] + + # This is the pageserver managed by the sharding service, where the tenant + # will be attached after onboarding + env.pageservers[1].start(register=True) + dest_ps = env.pageservers[1] + virtual_ps_http = PageserverHttpClient(env.attachment_service_port, lambda: True) + + for sk in env.safekeepers: + sk.start() + + # Create a tenant directly via pageserver HTTP API, skipping the attachment service + tenant_id = TenantId.generate() + generation = 123 + origin_ps.http_client().tenant_create(tenant_id, generation=generation) + + # As if doing a live migration, first configure origin into stale mode + origin_ps.http_client().tenant_location_conf( + tenant_id, + { + "mode": "AttachedStale", + "secondary_conf": None, + "tenant_conf": {}, + "generation": generation, + }, + ) + + # Call into attachment service to onboard the tenant + generation += 1 + virtual_ps_http.tenant_location_conf( + tenant_id, + { + "mode": "AttachedMulti", + "secondary_conf": None, + "tenant_conf": {}, + "generation": generation, + }, + ) + + # As if doing a live migration, detach the original pageserver + origin_ps.http_client().tenant_location_conf( + tenant_id, + { + "mode": "Detached", + "secondary_conf": None, + "tenant_conf": {}, + "generation": None, + }, + ) + + # As if doing a live migration, call into the attachment service to + # set it to AttachedSingle: this is a no-op, but we test it because the + # cloud control plane may call this for symmetry with live migration to + # an individual pageserver + virtual_ps_http.tenant_location_conf( + tenant_id, + { + "mode": "AttachedSingle", + "secondary_conf": None, + "tenant_conf": {}, + "generation": generation, + }, + ) + + # We should see the tenant is now attached to the pageserver managed + # by the sharding service + origin_tenants = origin_ps.http_client().tenant_list() + assert len(origin_tenants) == 0 + dest_tenants = dest_ps.http_client().tenant_list() + assert len(dest_tenants) == 1 + assert TenantId(dest_tenants[0]["id"]) == tenant_id + + # sharding service advances generation by 1 when it first attaches + assert dest_tenants[0]["generation"] == generation + 1 + + # The onboarded tenant should survive a restart of sharding service + env.attachment_service.stop() + env.attachment_service.start() + + # The onboarded tenant should surviev a restart of pageserver + dest_ps.stop() + dest_ps.start() From c7b02ce8ec1c6e64782438cdc35700f19ca93219 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 31 Jan 2024 13:51:11 +0000 Subject: [PATCH 0034/1571] proxy: use jemalloc (#6531) ## Summary of changes Experiment with jemalloc in proxy --- Cargo.lock | 33 +++++++++++++ Cargo.toml | 2 + proxy/Cargo.toml | 2 + proxy/src/bin/proxy.rs | 10 ++++ proxy/src/jemalloc.rs | 100 ++++++++++++++++++++++++++++++++++++++ proxy/src/lib.rs | 1 + workspace_hack/Cargo.toml | 4 +- 7 files changed, 150 insertions(+), 2 deletions(-) create mode 100644 proxy/src/jemalloc.rs diff --git a/Cargo.lock b/Cargo.lock index e14196350b..28ec84be1f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4080,6 +4080,8 @@ dependencies = [ "sync_wrapper", "task-local-extensions", "thiserror", + "tikv-jemalloc-ctl", + 
"tikv-jemallocator", "tls-listener", "tokio", "tokio-postgres", @@ -5530,6 +5532,37 @@ dependencies = [ "ordered-float 2.10.1", ] +[[package]] +name = "tikv-jemalloc-ctl" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "619bfed27d807b54f7f776b9430d4f8060e66ee138a28632ca898584d462c31c" +dependencies = [ + "libc", + "paste", + "tikv-jemalloc-sys", +] + +[[package]] +name = "tikv-jemalloc-sys" +version = "0.5.4+5.3.0-patched" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9402443cb8fd499b6f327e40565234ff34dbda27460c5b47db0db77443dd85d1" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "tikv-jemallocator" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "965fe0c26be5c56c94e38ba547249074803efd52adfb66de62107d95aab3eaca" +dependencies = [ + "libc", + "tikv-jemalloc-sys", +] + [[package]] name = "time" version = "0.3.21" diff --git a/Cargo.toml b/Cargo.toml index 29618ca328..26cf604a91 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -149,6 +149,8 @@ tar = "0.4" task-local-extensions = "0.1.4" test-context = "0.1" thiserror = "1.0" +tikv-jemallocator = "0.5" +tikv-jemalloc-ctl = "0.5" tls-listener = { version = "0.7", features = ["rustls", "hyper-h1"] } tokio = { version = "1.17", features = ["macros"] } tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" } diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index f075c718a7..79abe639ed 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -62,6 +62,8 @@ socket2.workspace = true sync_wrapper.workspace = true task-local-extensions.workspace = true thiserror.workspace = true +tikv-jemallocator.workspace = true +tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] } tls-listener.workspace = true tokio-postgres.workspace = true tokio-rustls.workspace = true diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index ba113a89eb..3960b080be 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -32,6 +32,9 @@ project_build_tag!(BUILD_TAG); use clap::{Parser, ValueEnum}; +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + #[derive(Clone, Debug, ValueEnum)] enum AuthBackend { Console, @@ -187,6 +190,13 @@ async fn main() -> anyhow::Result<()> { info!("Build_tag: {BUILD_TAG}"); ::metrics::set_build_info_metric(GIT_VERSION, BUILD_TAG); + match proxy::jemalloc::MetricRecorder::new(prometheus::default_registry()) { + Ok(t) => { + t.start(); + } + Err(e) => tracing::error!(error = ?e, "could not start jemalloc metrics loop"), + } + let args = ProxyCliArgs::parse(); let config = build_config(&args)?; diff --git a/proxy/src/jemalloc.rs b/proxy/src/jemalloc.rs new file mode 100644 index 0000000000..ed20798d56 --- /dev/null +++ b/proxy/src/jemalloc.rs @@ -0,0 +1,100 @@ +use std::time::Duration; + +use metrics::IntGauge; +use prometheus::{register_int_gauge_with_registry, Registry}; +use tikv_jemalloc_ctl::{config, epoch, epoch_mib, stats, version}; + +pub struct MetricRecorder { + epoch: epoch_mib, + active: stats::active_mib, + active_gauge: IntGauge, + allocated: stats::allocated_mib, + allocated_gauge: IntGauge, + mapped: stats::mapped_mib, + mapped_gauge: IntGauge, + metadata: stats::metadata_mib, + metadata_gauge: IntGauge, + resident: stats::resident_mib, + resident_gauge: IntGauge, + retained: stats::retained_mib, + retained_gauge: IntGauge, +} + +impl MetricRecorder { + pub fn new(registry: 
&Registry) -> Result { + tracing::info!( + config = config::malloc_conf::read()?, + version = version::read()?, + "starting jemalloc recorder" + ); + + Ok(Self { + epoch: epoch::mib()?, + active: stats::active::mib()?, + active_gauge: register_int_gauge_with_registry!( + "jemalloc_active_bytes", + "Total number of bytes in active pages allocated by the process", + registry + )?, + allocated: stats::allocated::mib()?, + allocated_gauge: register_int_gauge_with_registry!( + "jemalloc_allocated_bytes", + "Total number of bytes allocated by the process", + registry + )?, + mapped: stats::mapped::mib()?, + mapped_gauge: register_int_gauge_with_registry!( + "jemalloc_mapped_bytes", + "Total number of bytes in active extents mapped by the allocator", + registry + )?, + metadata: stats::metadata::mib()?, + metadata_gauge: register_int_gauge_with_registry!( + "jemalloc_metadata_bytes", + "Total number of bytes dedicated to jemalloc metadata", + registry + )?, + resident: stats::resident::mib()?, + resident_gauge: register_int_gauge_with_registry!( + "jemalloc_resident_bytes", + "Total number of bytes in physically resident data pages mapped by the allocator", + registry + )?, + retained: stats::retained::mib()?, + retained_gauge: register_int_gauge_with_registry!( + "jemalloc_retained_bytes", + "Total number of bytes in virtual memory mappings that were retained rather than being returned to the operating system", + registry + )?, + }) + } + + fn _poll(&self) -> Result<(), anyhow::Error> { + self.epoch.advance()?; + self.active_gauge.set(self.active.read()? as i64); + self.allocated_gauge.set(self.allocated.read()? as i64); + self.mapped_gauge.set(self.mapped.read()? as i64); + self.metadata_gauge.set(self.metadata.read()? as i64); + self.resident_gauge.set(self.resident.read()? as i64); + self.retained_gauge.set(self.retained.read()? 
as i64); + Ok(()) + } + + #[inline] + pub fn poll(&self) { + if let Err(error) = self._poll() { + tracing::warn!(%error, "Failed to poll jemalloc stats"); + } + } + + pub fn start(self) -> tokio::task::JoinHandle<()> { + tokio::task::spawn(async move { + let mut interval = tokio::time::interval(Duration::from_secs(15)); + interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); + loop { + self.poll(); + interval.tick().await; + } + }) + } +} diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index a9e4a38302..db6256d611 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -16,6 +16,7 @@ pub mod console; pub mod context; pub mod error; pub mod http; +pub mod jemalloc; pub mod logging; pub mod metrics; pub mod parse; diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index c29f8b422f..8fd49956cc 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -45,7 +45,7 @@ hmac = { version = "0.12", default-features = false, features = ["reset"] } hyper = { version = "0.14", features = ["full"] } indexmap = { version = "1", default-features = false, features = ["std"] } itertools = { version = "0.10" } -libc = { version = "0.2", features = ["extra_traits"] } +libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } nom = { version = "7" } @@ -94,7 +94,7 @@ getrandom = { version = "0.2", default-features = false, features = ["std"] } hashbrown = { version = "0.14", default-features = false, features = ["raw"] } indexmap = { version = "1", default-features = false, features = ["std"] } itertools = { version = "0.10" } -libc = { version = "0.2", features = ["extra_traits"] } +libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } nom = { version = "7" } From 47380be12d8f8e4b004c7ef0c3833de161f8ab37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 31 Jan 2024 15:30:19 +0100 Subject: [PATCH 0035/1571] Remove version param from get_lsn_by_timestamp (#6551) This removes the last remnants of the version param added by #5608 , concluding the transition plan laid out in https://github.com/neondatabase/cloud/pull/7553#discussion_r1370473911 . It follows PR https://github.com/neondatabase/cloud/pull/9202, which we now assume has been deployed to all environments. 
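For reference, callers now pass just the timestamp; a minimal sketch of the simplified fixture call, based on the updated `test_lsn_mapping.py` below (the surrounding variables are assumed to come from that test):

```python
# Sketch only: `client`, `tenant_id`, `timeline_id`, and `probe_timestamp`
# are assumed to be provided by the enclosing test (see test_lsn_mapping.py below).
result = client.timeline_get_lsn_by_timestamp(
    tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z"
)
assert result["kind"] == "past"  # e.g. for a timestamp in the unreachable past
```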
Full history: * https://github.com/neondatabase/neon/pull/5608 * https://github.com/neondatabase/cloud/pull/7553 * https://github.com/neondatabase/neon/pull/6178 * https://github.com/neondatabase/cloud/pull/9202 --- pageserver/src/http/openapi_spec.yml | 6 ------ test_runner/fixtures/pageserver/http.py | 7 +------ test_runner/regress/test_lsn_mapping.py | 2 +- 3 files changed, 2 insertions(+), 13 deletions(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 676a63937d..e2a2865145 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -419,12 +419,6 @@ paths: type: string format: date-time description: A timestamp to get the LSN - - name: version - in: query - required: false - schema: - type: integer - description: The version of the endpoint to use responses: "200": description: OK diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 340cc9e9e3..65675aebe1 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -549,17 +549,12 @@ class PageserverHttpClient(requests.Session): tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, timestamp, - version: Optional[int] = None, ): log.info( f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}" ) - if version is None: - version_str = "" - else: - version_str = f"&version={version}" res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp}{version_str}", + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp}", ) self.verbose_error(res) res_json = res.json() diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index 65d6d7a9fd..9788e8c0d7 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -109,7 +109,7 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): # Timestamp is in the unreachable past probe_timestamp = tbl[0][1] - timedelta(hours=10) result = client.timeline_get_lsn_by_timestamp( - tenant_id, timeline_id_child, f"{probe_timestamp.isoformat()}Z", 2 + tenant_id, timeline_id_child, f"{probe_timestamp.isoformat()}Z" ) assert result["kind"] == "past" # make sure that we return the minimum lsn here at the start of the range From 799db161d3d352947b08c64fd5f26c6331fc89a1 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 31 Jan 2024 17:37:25 +0200 Subject: [PATCH 0036/1571] tests: support for running on single pg version, use in one place (#6525) Some tests which are unit test alike do not need to run on different pg versions. Logging test is one of them which I found for unrelated reasons. 
Co-authored-by: Alexander Bayandin --- test_runner/fixtures/pg_version.py | 9 ++++++++- test_runner/regress/test_logging.py | 2 ++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/test_runner/fixtures/pg_version.py b/test_runner/fixtures/pg_version.py index 657718da00..941889a2f5 100644 --- a/test_runner/fixtures/pg_version.py +++ b/test_runner/fixtures/pg_version.py @@ -52,7 +52,7 @@ class PgVersion(str, enum.Enum): return None -DEFAULT_VERSION: PgVersion = PgVersion.V14 +DEFAULT_VERSION: PgVersion = PgVersion.V15 def skip_on_postgres(version: PgVersion, reason: str): @@ -78,6 +78,13 @@ def pytest_addoption(parser: Parser): ) +def run_only_on_default_postgres(reason: str): + return pytest.mark.skipif( + PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION)) is not DEFAULT_VERSION, + reason=reason, + ) + + def pytest_configure(config: Config): if config.getoption("--pg-version"): raise Exception("--pg-version is deprecated, use DEFAULT_PG_VERSION env var instead") diff --git a/test_runner/regress/test_logging.py b/test_runner/regress/test_logging.py index d559be0a8f..d62b5e531c 100644 --- a/test_runner/regress/test_logging.py +++ b/test_runner/regress/test_logging.py @@ -3,10 +3,12 @@ import uuid import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.pg_version import run_only_on_default_postgres from fixtures.utils import wait_until @pytest.mark.parametrize("level", ["trace", "debug", "info", "warn", "error"]) +@run_only_on_default_postgres("it does not use any postgres functionality") def test_logging_event_count(neon_env_builder: NeonEnvBuilder, level: str): # self-test: make sure the event is logged (i.e., our testing endpoint works) log_expected = { From 2bfc831c60181d2abaa16a55d45c7b3d8b988eef Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 31 Jan 2024 17:02:41 +0000 Subject: [PATCH 0037/1571] control_plane/attachment_service: make --path optional (#6545) ## Problem The `--path` argument is only used in testing, for compat tests that use a JSON snapshot of state rather than the postgres database. In regular deployments, it should be omitted (currently one has to specify `--path ""`) ## Summary of changes Make `--path` optional. 
--- control_plane/attachment_service/src/main.rs | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs index 05a3895dfa..7c716a9f53 100644 --- a/control_plane/attachment_service/src/main.rs +++ b/control_plane/attachment_service/src/main.rs @@ -39,7 +39,7 @@ struct Cli { /// Path to the .json file to store state (will be created if it doesn't exist) #[arg(short, long)] - path: Utf8PathBuf, + path: Option, /// URL to connect to postgres, like postgresql://localhost:1234/attachment_service #[arg(long)] @@ -62,7 +62,7 @@ async fn main() -> anyhow::Result<()> { GIT_VERSION, launch_ts.to_string(), BUILD_TAG, - args.path, + args.path.as_ref().unwrap_or(&Utf8PathBuf::from("")), args.listen ); @@ -70,11 +70,7 @@ async fn main() -> anyhow::Result<()> { jwt_token: args.jwt_token, }; - let json_path = if args.path.as_os_str().is_empty() { - None - } else { - Some(args.path) - }; + let json_path = args.path; let persistence = Arc::new(Persistence::new(args.database_url, json_path.clone())); let service = Service::spawn(config, persistence.clone()).await?; From 9a9d9beaeef393aa3ad8ba5b7700adfaab857126 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 31 Jan 2024 21:39:18 +0200 Subject: [PATCH 0038/1571] Download SLRU segments on demand (#6151) ## Problem See https://github.com/neondatabase/cloud/issues/8673 ## Summary of changes Download missed SLRU segments from page server ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik Co-authored-by: Heikki Linnakangas --- control_plane/src/pageserver.rs | 10 ++ libs/pageserver_api/src/models.rs | 50 ++++++++ libs/pageserver_api/src/reltag.rs | 4 +- pageserver/client/src/page_service.rs | 3 +- pageserver/src/basebackup.rs | 38 +++--- pageserver/src/metrics.rs | 4 +- pageserver/src/page_service.rs | 41 +++++- pageserver/src/pgdatadir_mapping.rs | 23 +++- pageserver/src/tenant.rs | 1 + pageserver/src/tenant/config.rs | 12 ++ pageserver/src/tenant/timeline.rs | 7 + pgxn/neon/pagestore_client.h | 25 ++++ pgxn/neon/pagestore_smgr.c | 120 ++++++++++++++++++ test_runner/fixtures/neon_fixtures.py | 9 ++ test_runner/performance/test_lazy_startup.py | 111 ++++++++++++++++ .../regress/test_attach_tenant_config.py | 1 + trace/src/main.rs | 1 + vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 6 +- 21 files changed, 442 insertions(+), 30 deletions(-) create mode 100644 test_runner/performance/test_lazy_startup.py diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 540d1185a2..a1b0ba4252 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -395,6 +395,11 @@ impl PageServerNode { .transpose() .context("Failed to parse 'gc_feedback' as bool")?, heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()), + lazy_slru_download: settings + .remove("lazy_slru_download") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'lazy_slru_download' as bool")?, }; if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") @@ -495,6 +500,11 @@ impl PageServerNode { .transpose() .context("Failed to parse 'gc_feedback' as bool")?, heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()), + lazy_slru_download: settings + .remove("lazy_slru_download") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'lazy_slru_download' as bool")?, } }; diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index d885553cc7..a7598f9fda 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -8,6 +8,7 @@ use std::{ }; use byteorder::{BigEndian, ReadBytesExt}; +use postgres_ffi::BLCKSZ; use serde::{Deserialize, Serialize}; use serde_with::serde_as; use strum_macros; @@ -271,6 +272,7 @@ pub struct TenantConfig { pub evictions_low_residence_duration_metric_threshold: Option, pub gc_feedback: Option, pub heatmap_period: Option, + pub lazy_slru_download: Option, } #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] @@ -646,6 +648,7 @@ pub enum PagestreamFeMessage { Nblocks(PagestreamNblocksRequest), GetPage(PagestreamGetPageRequest), DbSize(PagestreamDbSizeRequest), + GetSlruSegment(PagestreamGetSlruSegmentRequest), } // Wrapped in libpq CopyData @@ -656,6 +659,7 @@ pub enum PagestreamBeMessage { GetPage(PagestreamGetPageResponse), Error(PagestreamErrorResponse), DbSize(PagestreamDbSizeResponse), + GetSlruSegment(PagestreamGetSlruSegmentResponse), } // Keep in sync with `pagestore_client.h` @@ -666,6 +670,7 @@ enum PagestreamBeMessageTag { GetPage = 102, Error = 103, DbSize = 104, + GetSlruSegment = 105, } impl TryFrom for PagestreamBeMessageTag { type Error = u8; @@ -676,6 +681,7 @@ impl TryFrom for PagestreamBeMessageTag { 102 => Ok(PagestreamBeMessageTag::GetPage), 103 => 
Ok(PagestreamBeMessageTag::Error), 104 => Ok(PagestreamBeMessageTag::DbSize), + 105 => Ok(PagestreamBeMessageTag::GetSlruSegment), _ => Err(value), } } @@ -710,6 +716,14 @@ pub struct PagestreamDbSizeRequest { pub dbnode: u32, } +#[derive(Debug, PartialEq, Eq)] +pub struct PagestreamGetSlruSegmentRequest { + pub latest: bool, + pub lsn: Lsn, + pub kind: u8, + pub segno: u32, +} + #[derive(Debug)] pub struct PagestreamExistsResponse { pub exists: bool, @@ -725,6 +739,11 @@ pub struct PagestreamGetPageResponse { pub page: Bytes, } +#[derive(Debug)] +pub struct PagestreamGetSlruSegmentResponse { + pub segment: Bytes, +} + #[derive(Debug)] pub struct PagestreamErrorResponse { pub message: String, @@ -788,6 +807,14 @@ impl PagestreamFeMessage { bytes.put_u64(req.lsn.0); bytes.put_u32(req.dbnode); } + + Self::GetSlruSegment(req) => { + bytes.put_u8(4); + bytes.put_u8(u8::from(req.latest)); + bytes.put_u64(req.lsn.0); + bytes.put_u8(req.kind); + bytes.put_u32(req.segno); + } } bytes.into() @@ -838,6 +865,14 @@ impl PagestreamFeMessage { lsn: Lsn::from(body.read_u64::()?), dbnode: body.read_u32::()?, })), + 4 => Ok(PagestreamFeMessage::GetSlruSegment( + PagestreamGetSlruSegmentRequest { + latest: body.read_u8()? != 0, + lsn: Lsn::from(body.read_u64::()?), + kind: body.read_u8()?, + segno: body.read_u32::()?, + }, + )), _ => bail!("unknown smgr message tag: {:?}", msg_tag), } } @@ -873,6 +908,12 @@ impl PagestreamBeMessage { bytes.put_u8(Tag::DbSize as u8); bytes.put_i64(resp.db_size); } + + Self::GetSlruSegment(resp) => { + bytes.put_u8(Tag::GetSlruSegment as u8); + bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32); + bytes.put(&resp.segment[..]); + } } bytes.into() @@ -913,6 +954,14 @@ impl PagestreamBeMessage { let db_size = buf.read_i64::()?; Self::DbSize(PagestreamDbSizeResponse { db_size }) } + Tag::GetSlruSegment => { + let n_blocks = buf.read_u32::()?; + let mut segment = vec![0; n_blocks as usize * BLCKSZ as usize]; + buf.read_exact(&mut segment)?; + Self::GetSlruSegment(PagestreamGetSlruSegmentResponse { + segment: segment.into(), + }) + } }; let remaining = buf.into_inner(); if !remaining.is_empty() { @@ -931,6 +980,7 @@ impl PagestreamBeMessage { Self::GetPage(_) => "GetPage", Self::Error(_) => "Error", Self::DbSize(_) => "DbSize", + Self::GetSlruSegment(_) => "GetSlruSegment", } } } diff --git a/libs/pageserver_api/src/reltag.rs b/libs/pageserver_api/src/reltag.rs index 3f37af600d..8eb848a514 100644 --- a/libs/pageserver_api/src/reltag.rs +++ b/libs/pageserver_api/src/reltag.rs @@ -123,9 +123,11 @@ impl RelTag { PartialOrd, Ord, strum_macros::EnumIter, + strum_macros::FromRepr, )] +#[repr(u8)] pub enum SlruKind { - Clog, + Clog = 0, MultiXactMembers, MultiXactOffsets, } diff --git a/pageserver/client/src/page_service.rs b/pageserver/client/src/page_service.rs index ff542670f1..49175b3b90 100644 --- a/pageserver/client/src/page_service.rs +++ b/pageserver/client/src/page_service.rs @@ -156,7 +156,8 @@ impl PagestreamClient { PagestreamBeMessage::Error(e) => anyhow::bail!("Error: {:?}", e), PagestreamBeMessage::Exists(_) | PagestreamBeMessage::Nblocks(_) - | PagestreamBeMessage::DbSize(_) => { + | PagestreamBeMessage::DbSize(_) + | PagestreamBeMessage::GetSlruSegment(_) => { anyhow::bail!( "unexpected be message kind in response to getpage request: {}", msg.kind() diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 009deff0aa..7edfab75d4 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -222,6 +222,8 @@ where async fn 
send_tarball(mut self) -> anyhow::Result<()> { // TODO include checksum + let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup; + // Create pgdata subdirs structure for dir in PGDATA_SUBDIRS.iter() { let header = new_tar_header_dir(dir)?; @@ -248,29 +250,29 @@ where .context("could not add config file to basebackup tarball")?; } } - - // Gather non-relational files from object storage pages. - let slru_partitions = self - .timeline - .get_slru_keyspace(Version::Lsn(self.lsn), self.ctx) - .await? - .partition(Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64); - - let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar); - - for part in slru_partitions.parts { - let blocks = self + if !lazy_slru_download { + // Gather non-relational files from object storage pages. + let slru_partitions = self .timeline - .get_vectored(&part.ranges, self.lsn, self.ctx) - .await?; + .get_slru_keyspace(Version::Lsn(self.lsn), self.ctx) + .await? + .partition(Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64); - for (key, block) in blocks { - slru_builder.add_block(&key, block?).await?; + let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar); + + for part in slru_partitions.parts { + let blocks = self + .timeline + .get_vectored(&part.ranges, self.lsn, self.ctx) + .await?; + + for (key, block) in blocks { + slru_builder.add_block(&key, block?).await?; + } } + slru_builder.finish().await?; } - slru_builder.finish().await?; - let mut min_restart_lsn: Lsn = Lsn::MAX; // Create tablespace directories for ((spcnode, dbnode), has_relmap_file) in diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 9b3679e3c2..ed204cb48c 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1043,6 +1043,7 @@ pub enum SmgrQueryType { GetRelSize, GetPageAtLsn, GetDbSize, + GetSlruSegment, } #[derive(Debug)] @@ -1159,11 +1160,12 @@ mod smgr_query_time_tests { #[test] fn op_label_name() { use super::SmgrQueryType::*; - let expect: [(super::SmgrQueryType, &'static str); 4] = [ + let expect: [(super::SmgrQueryType, &'static str); 5] = [ (GetRelExists, "get_rel_exists"), (GetRelSize, "get_rel_size"), (GetPageAtLsn, "get_page_at_lsn"), (GetDbSize, "get_db_size"), + (GetSlruSegment, "get_slru_segment"), ]; for (op, expect) in expect { let actual: &'static str = op.into(); diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 65191334a6..754c021c88 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -22,7 +22,8 @@ use pageserver_api::models::{ PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse, PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse, PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse, - PagestreamNblocksRequest, PagestreamNblocksResponse, + PagestreamGetSlruSegmentRequest, PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest, + PagestreamNblocksResponse, }; use pageserver_api::shard::ShardIndex; use pageserver_api::shard::{ShardCount, ShardNumber}; @@ -74,8 +75,8 @@ use crate::tenant::GetTimelineError; use crate::tenant::PageReconstructError; use crate::tenant::Timeline; use crate::trace::Tracer; - use pageserver_api::key::rel_block_to_key; +use pageserver_api::reltag::SlruKind; use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; use postgres_ffi::BLCKSZ; @@ -647,6 +648,15 @@ impl PageServerHandler { span, ) } + PagestreamFeMessage::GetSlruSegment(req) => { + let span = 
tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.lsn); + ( + self.handle_get_slru_segment_request(tenant_id, timeline_id, &req, &ctx) + .instrument(span.clone()) + .await, + span, + ) + } }; match response { @@ -1137,6 +1147,33 @@ impl PageServerHandler { })) } + async fn handle_get_slru_segment_request( + &mut self, + tenant_id: TenantId, + timeline_id: TimelineId, + req: &PagestreamGetSlruSegmentRequest, + ctx: &RequestContext, + ) -> Result { + let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?; + + let _timer = timeline + .query_metrics + .start_timer(metrics::SmgrQueryType::GetSlruSegment); + + let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); + let lsn = + Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) + .await?; + + let kind = SlruKind::from_repr(req.kind) + .ok_or(PageStreamError::BadRequest("invalid SLRU kind".into()))?; + let segment = timeline.get_slru_segment(kind, req.segno, lsn, ctx).await?; + + Ok(PagestreamBeMessage::GetSlruSegment( + PagestreamGetSlruSegmentResponse { segment }, + )) + } + #[allow(clippy::too_many_arguments)] #[instrument(skip_all, fields(?lsn, ?prev_lsn, %full_backup))] async fn handle_basebackup_request( diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index b65fe1eddd..a36785a69f 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -12,7 +12,7 @@ use crate::keyspace::{KeySpace, KeySpaceAccum}; use crate::repository::*; use crate::walrecord::NeonWalRecord; use anyhow::{ensure, Context}; -use bytes::{Buf, Bytes}; +use bytes::{Buf, Bytes, BytesMut}; use pageserver_api::key::{ dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key, @@ -321,6 +321,27 @@ impl Timeline { } } + /// Get the whole SLRU segment + pub(crate) async fn get_slru_segment( + &self, + kind: SlruKind, + segno: u32, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result { + let n_blocks = self + .get_slru_segment_size(kind, segno, Version::Lsn(lsn), ctx) + .await?; + let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize); + for blkno in 0..n_blocks { + let block = self + .get_slru_page_at_lsn(kind, segno, blkno, lsn, ctx) + .await?; + segment.extend_from_slice(&block[..BLCKSZ as usize]); + } + Ok(segment.freeze()) + } + /// Look up given SLRU page version. pub(crate) async fn get_slru_page_at_lsn( &self, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 7a9fef43d2..681fd296ae 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3903,6 +3903,7 @@ pub(crate) mod harness { ), gc_feedback: Some(tenant_conf.gc_feedback), heatmap_period: Some(tenant_conf.heatmap_period), + lazy_slru_download: Some(tenant_conf.lazy_slru_download), } } } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index c44164c12d..63bd56cf5f 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -345,6 +345,9 @@ pub struct TenantConf { /// may be disabled if a Tenant will not have secondary locations: only secondary /// locations will use the heatmap uploaded by attached locations. 
pub heatmap_period: Duration, + + /// If true then SLRU segments are dowloaded on demand, if false SLRU segments are included in basebackup + pub lazy_slru_download: bool, } /// Same as TenantConf, but this struct preserves the information about @@ -430,6 +433,10 @@ pub struct TenantConfOpt { #[serde(with = "humantime_serde")] #[serde(default)] pub heatmap_period: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub lazy_slru_download: Option, } impl TenantConfOpt { @@ -475,6 +482,9 @@ impl TenantConfOpt { .unwrap_or(global_conf.evictions_low_residence_duration_metric_threshold), gc_feedback: self.gc_feedback.unwrap_or(global_conf.gc_feedback), heatmap_period: self.heatmap_period.unwrap_or(global_conf.heatmap_period), + lazy_slru_download: self + .lazy_slru_download + .unwrap_or(global_conf.lazy_slru_download), } } } @@ -513,6 +523,7 @@ impl Default for TenantConf { .expect("cannot parse default evictions_low_residence_duration_metric_threshold"), gc_feedback: false, heatmap_period: Duration::ZERO, + lazy_slru_download: false, } } } @@ -584,6 +595,7 @@ impl From for models::TenantConfig { .map(humantime), gc_feedback: value.gc_feedback, heatmap_period: value.heatmap_period.map(humantime), + lazy_slru_download: value.lazy_slru_download, } } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 70c6ee2042..fc908ad299 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1287,6 +1287,13 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10; // Private functions impl Timeline { + pub fn get_lazy_slru_download(&self) -> bool { + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + tenant_conf + .lazy_slru_download + .unwrap_or(self.conf.default_tenant_conf.lazy_slru_download) + } + fn get_checkpoint_distance(&self) -> u64 { let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; tenant_conf diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 8c02f357bc..2889ffacae 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -15,6 +15,7 @@ #include "neon_pgversioncompat.h" +#include "access/slru.h" #include "access/xlogdefs.h" #include RELFILEINFO_HDR #include "lib/stringinfo.h" @@ -34,6 +35,7 @@ typedef enum T_NeonNblocksRequest, T_NeonGetPageRequest, T_NeonDbSizeRequest, + T_NeonGetSlruSegmentRequest, /* pagestore -> pagestore_client */ T_NeonExistsResponse = 100, @@ -41,6 +43,7 @@ typedef enum T_NeonGetPageResponse, T_NeonErrorResponse, T_NeonDbSizeResponse, + T_NeonGetSlruSegmentResponse, } NeonMessageTag; /* base struct for c-style inheritance */ @@ -59,6 +62,13 @@ typedef struct (errmsg(NEON_TAG "[shard %d] " fmt, shard_no, ##__VA_ARGS__), \ errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0))) +/* SLRUs downloadable from page server */ +typedef enum { + SLRU_CLOG, + SLRU_MULTIXACT_MEMBERS, + SLRU_MULTIXACT_OFFSETS +} SlruKind; + /* * supertype of all the Neon*Request structs below * @@ -101,6 +111,13 @@ typedef struct BlockNumber blkno; } NeonGetPageRequest; +typedef struct +{ + NeonRequest req; + SlruKind kind; + int segno; +} NeonGetSlruSegmentRequest; + /* supertype of all the Neon*Response structs below */ typedef struct { @@ -140,6 +157,14 @@ typedef struct * message */ } NeonErrorResponse; +typedef struct +{ + NeonMessageTag tag; + int n_blocks; + char data[BLCKSZ * SLRU_PAGES_PER_SEGMENT]; +} NeonGetSlruSegmentResponse; + + extern StringInfoData nm_pack_request(NeonRequest 
*msg); extern NeonResponse *nm_unpack_response(StringInfo s); extern char *nm_to_string(NeonMessage *msg); diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 1fa802e6f4..63e8b8dc1f 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -1043,12 +1043,25 @@ nm_pack_request(NeonRequest *msg) break; } + case T_NeonGetSlruSegmentRequest: + { + NeonGetSlruSegmentRequest *msg_req = (NeonGetSlruSegmentRequest *) msg; + + pq_sendbyte(&s, msg_req->req.latest); + pq_sendint64(&s, msg_req->req.lsn); + pq_sendbyte(&s, msg_req->kind); + pq_sendint32(&s, msg_req->segno); + + break; + } + /* pagestore -> pagestore_client. We never need to create these. */ case T_NeonExistsResponse: case T_NeonNblocksResponse: case T_NeonGetPageResponse: case T_NeonErrorResponse: case T_NeonDbSizeResponse: + case T_NeonGetSlruSegmentResponse: default: neon_log(ERROR, "unexpected neon message tag 0x%02x", msg->tag); break; @@ -1135,6 +1148,20 @@ nm_unpack_response(StringInfo s) break; } + case T_NeonGetSlruSegmentResponse: + { + NeonGetSlruSegmentResponse *msg_resp; + int n_blocks = pq_getmsgint(s, 4); + msg_resp = palloc(sizeof(NeonGetSlruSegmentResponse)); + msg_resp->tag = tag; + msg_resp->n_blocks = n_blocks; + memcpy(msg_resp->data, pq_getmsgbytes(s, n_blocks * BLCKSZ), n_blocks * BLCKSZ); + pq_getmsgend(s); + + resp = (NeonResponse *) msg_resp; + break; + } + /* * pagestore_client -> pagestore * @@ -1144,6 +1171,7 @@ nm_unpack_response(StringInfo s) case T_NeonNblocksRequest: case T_NeonGetPageRequest: case T_NeonDbSizeRequest: + case T_NeonGetSlruSegmentRequest: default: neon_log(ERROR, "unexpected neon message tag 0x%02x", tag); break; @@ -1213,7 +1241,18 @@ nm_to_string(NeonMessage *msg) appendStringInfoChar(&s, '}'); break; } + case T_NeonGetSlruSegmentRequest: + { + NeonGetSlruSegmentRequest *msg_req = (NeonGetSlruSegmentRequest *) msg; + appendStringInfoString(&s, "{\"type\": \"NeonGetSlruSegmentRequest\""); + appendStringInfo(&s, ", \"kind\": %u", msg_req->kind); + appendStringInfo(&s, ", \"segno\": %u", msg_req->segno); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); + appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfoChar(&s, '}'); + break; + } /* pagestore -> pagestore_client */ case T_NeonExistsResponse: { @@ -1267,6 +1306,17 @@ nm_to_string(NeonMessage *msg) msg_resp->db_size); appendStringInfoChar(&s, '}'); + break; + } + case T_NeonGetSlruSegmentResponse: + { + NeonGetSlruSegmentResponse *msg_resp = (NeonGetSlruSegmentResponse *) msg; + + appendStringInfoString(&s, "{\"type\": \"NeonGetSlruSegmentResponse\""); + appendStringInfo(&s, ", \"n_blocks\": %u}", + msg_resp->n_blocks); + appendStringInfoChar(&s, '}'); + break; } @@ -2739,6 +2789,74 @@ neon_end_unlogged_build(SMgrRelation reln) unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; } +#define STRPREFIX(str, prefix) (strncmp(str, prefix, strlen(prefix)) == 0) + +static int +neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buffer) +{ + XLogRecPtr request_lsn; + /* + * GetRedoStartLsn() returns LSN of basebackup. + * We need to download SLRU segments only once after node startup, + * then SLRUs are maintained locally. 
+ */ + request_lsn = GetRedoStartLsn(); + request_lsn = nm_adjust_lsn(request_lsn); + SlruKind kind; + + if (STRPREFIX(path, "pg_xact")) + kind = SLRU_CLOG; + else if (STRPREFIX(path, "pg_multixact/members")) + kind = SLRU_MULTIXACT_MEMBERS; + else if (STRPREFIX(path, "pg_multixact/offsets")) + kind = SLRU_MULTIXACT_OFFSETS; + else + return -1; + + NeonResponse *resp; + NeonGetSlruSegmentRequest request = { + .req.tag = T_NeonGetSlruSegmentRequest, + .req.latest = false, + .req.lsn = request_lsn, + + .kind = kind, + .segno = segno + }; + int n_blocks; + shardno_t shard_no = 0; /* All SLRUs are at shard 0 */ + do + { + while (!page_server->send(shard_no, &request.req) || !page_server->flush(shard_no)); + consume_prefetch_responses(); + resp = page_server->receive(shard_no); + } while (resp == NULL); + + switch (resp->tag) + { + case T_NeonGetSlruSegmentResponse: + n_blocks = ((NeonGetSlruSegmentResponse *) resp)->n_blocks; + memcpy(buffer, ((NeonGetSlruSegmentResponse *) resp)->data, n_blocks*BLCKSZ); + break; + + case T_NeonErrorResponse: + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg(NEON_TAG "could not read SLRU %d segment %d at lsn %X/%08X", + kind, + segno, + LSN_FORMAT_ARGS(request_lsn)), + errdetail("page server returned error: %s", + ((NeonErrorResponse *) resp)->message))); + break; + + default: + neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + } + pfree(resp); + + return n_blocks; +} + static void AtEOXact_neon(XactEvent event, void *arg) { @@ -2797,6 +2915,8 @@ static const struct f_smgr neon_smgr = .smgr_start_unlogged_build = neon_start_unlogged_build, .smgr_finish_unlogged_build_phase_1 = neon_finish_unlogged_build_phase_1, .smgr_end_unlogged_build = neon_end_unlogged_build, + + .smgr_read_slru_segment = neon_read_slru_segment, }; const f_smgr * diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 5be7551a1e..e2a2291dbc 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3980,8 +3980,17 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint # list files we're going to compare assert endpoint.pgdata_dir pgdata_files = list_files_to_compare(Path(endpoint.pgdata_dir)) + restored_files = list_files_to_compare(restored_dir_path) + if pgdata_files != restored_files: + # filter pg_xact and multixact files which are downloaded on demand + pgdata_files = [ + f + for f in pgdata_files + if not f.startswith("pg_xact") and not f.startswith("pg_multixact") + ] + # check that file sets are equal assert pgdata_files == restored_files diff --git a/test_runner/performance/test_lazy_startup.py b/test_runner/performance/test_lazy_startup.py new file mode 100644 index 0000000000..1a431e272e --- /dev/null +++ b/test_runner/performance/test_lazy_startup.py @@ -0,0 +1,111 @@ +import pytest +import requests +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.neon_fixtures import NeonEnvBuilder + + +# Start and measure duration with huge SLRU segments. +# This test is similar to test_startup_simple, but it creates huge number of transactions +# and records containing this XIDs. Autovacuum is disable for the table to prevent CLOG truncation. +# +# This test runs pretty quickly and can be informative when used in combination +# with emulated network delay. Some useful delay commands: +# +# 1. Add 2msec delay to all localhost traffic +# `sudo tc qdisc add dev lo root handle 1:0 netem delay 2msec` +# +# 2. 
Test that it works (you should see 4ms ping) +# `ping localhost` +# +# 3. Revert back to normal +# `sudo tc qdisc del dev lo root netem` +# +# NOTE this test might not represent the real startup time because the basebackup +# for a large database might be larger if there's a lof of transaction metadata, +# or safekeepers might need more syncing, or there might be more operations to +# apply during config step, like more users, databases, or extensions. By default +# we load extensions 'neon,pg_stat_statements,timescaledb,pg_cron', but in this +# test we only load neon. +@pytest.mark.timeout(1000) +def test_lazy_startup(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + lazy_tenant, _ = env.neon_cli.create_tenant( + conf={ + "lazy_slru_download": "true", + } + ) + eager_tenant, _ = env.neon_cli.create_tenant( + conf={ + "lazy_slru_download": "false", + } + ) + tenants = [lazy_tenant, eager_tenant] + slru = "lazy" + for tenant in tenants: + endpoint = env.endpoints.create_start("main", tenant_id=tenant) + endpoint.safe_psql("CREATE TABLE t (pk integer PRIMARY KEY, x integer)") + endpoint.safe_psql("ALTER TABLE t SET (autovacuum_enabled = false)") + endpoint.safe_psql("INSERT INTO t VALUES (1, 0)") + endpoint.safe_psql( + """ + CREATE PROCEDURE updating() as + $$ + DECLARE + i integer; + BEGIN + FOR i IN 1..10000000 LOOP + UPDATE t SET x = x + 1 WHERE pk=1; + COMMIT; + END LOOP; + END + $$ LANGUAGE plpgsql + """ + ) + endpoint.safe_psql("SET statement_timeout=0") + endpoint.safe_psql("call updating()") + + endpoint.stop() + + # We do two iterations so we can see if the second startup is faster. It should + # be because the compute node should already be configured with roles, databases, + # extensions, etc from the first run. 
+ for i in range(2): + # Start + with zenbenchmark.record_duration(f"{slru}_{i}_start"): + endpoint.start() + + with zenbenchmark.record_duration(f"{slru}_{i}_select"): + sum = endpoint.safe_psql("select sum(x) from t")[0][0] + assert sum == 10000000 + + # Get metrics + metrics = requests.get(f"http://localhost:{endpoint.http_port}/metrics.json").json() + durations = { + "wait_for_spec_ms": f"{slru}_{i}_wait_for_spec", + "sync_safekeepers_ms": f"{slru}_{i}_sync_safekeepers", + "sync_sk_check_ms": f"{slru}_{i}_sync_sk_check", + "basebackup_ms": f"{slru}_{i}_basebackup", + "start_postgres_ms": f"{slru}_{i}_start_postgres", + "config_ms": f"{slru}_{i}_config", + "total_startup_ms": f"{slru}_{i}_total_startup", + } + for key, name in durations.items(): + value = metrics[key] + zenbenchmark.record(name, value, "ms", report=MetricReport.LOWER_IS_BETTER) + + basebackup_bytes = metrics["basebackup_bytes"] + zenbenchmark.record( + f"{slru}_{i}_basebackup_bytes", + basebackup_bytes, + "bytes", + report=MetricReport.LOWER_IS_BETTER, + ) + + # Stop so we can restart + endpoint.stop() + + # Imitate optimizations that console would do for the second start + endpoint.respec(skip_pg_catalog_updates=True) + slru = "eager" diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index ed389b1aa2..7cdc314658 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -173,6 +173,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "image_creation_threshold": 7, "pitr_interval": "1m", "lagging_wal_timeout": "23m", + "lazy_slru_download": True, "max_lsn_wal_lag": 230000, "min_resident_size_override": 23, "trace_read_requests": True, diff --git a/trace/src/main.rs b/trace/src/main.rs index ddd970e95d..4605c124e9 100644 --- a/trace/src/main.rs +++ b/trace/src/main.rs @@ -60,6 +60,7 @@ fn analyze_trace(mut reader: R) { match msg { PagestreamFeMessage::Exists(_) => {} PagestreamFeMessage::Nblocks(_) => {} + PagestreamFeMessage::GetSlruSegment(_) => {} PagestreamFeMessage::GetPage(req) => { total += 1; diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 3de48ce3d9..be7a65fe67 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 3de48ce3d9c1f4fac1cdc7029487f8db9e537eac +Subproject commit be7a65fe67dc81d85bbcbebb13e00d94715f4b88 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index b089a8a02c..81e16cd537 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit b089a8a02c9f6f4379883fddb33cf10a3aa0b14f +Subproject commit 81e16cd537053f49e175d4a08ab7c8aec3d9b535 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index cf302768b2..f7ea954989 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit cf302768b2890569956641e0e5ba112ae1445351 +Subproject commit f7ea954989a2e7901f858779cff55259f203479a diff --git a/vendor/revisions.json b/vendor/revisions.json index 1211155b7d..80699839ba 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "postgres-v16": "cf302768b2890569956641e0e5ba112ae1445351", - "postgres-v15": "b089a8a02c9f6f4379883fddb33cf10a3aa0b14f", - "postgres-v14": "3de48ce3d9c1f4fac1cdc7029487f8db9e537eac" + "postgres-v16": "f7ea954989a2e7901f858779cff55259f203479a", + "postgres-v15": "81e16cd537053f49e175d4a08ab7c8aec3d9b535", + "postgres-v14": "be7a65fe67dc81d85bbcbebb13e00d94715f4b88" } From 66719d7eaf333ef6e18dac742fe1e0a77ec2601d Mon Sep 
17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 31 Jan 2024 22:52:00 +0200 Subject: [PATCH 0039/1571] logging: fix span usage (#6549) Fixes some duplication due to extra or misconfigured `#[instrument]`, while filling in the `timeline_id` to delete timeline flow calls. --- pageserver/src/tenant.rs | 1 + pageserver/src/tenant/delete.rs | 6 +++++- pageserver/src/tenant/timeline.rs | 2 +- pageserver/src/tenant/timeline/delete.rs | 4 +++- 4 files changed, 10 insertions(+), 3 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 681fd296ae..0543de931f 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1020,6 +1020,7 @@ impl Tenant { Some(remote_timeline_client), self.deletion_queue_client.clone(), ) + .instrument(tracing::info_span!("timeline_delete", %timeline_id)) .await .context("resume_deletion") .map_err(LoadLocalTimelineError::ResumeDeletion)?; diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index 97de0cdcf9..0dbaa3ec93 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -136,7 +136,11 @@ async fn schedule_ordered_timeline_deletions( let mut already_running_deletions = vec![]; for (timeline_id, _) in sorted.into_iter().rev() { - if let Err(e) = DeleteTimelineFlow::run(tenant, timeline_id, true).await { + let span = tracing::info_span!("timeline_delete", %timeline_id); + let res = DeleteTimelineFlow::run(tenant, timeline_id, true) + .instrument(span) + .await; + if let Err(e) = res { match e { DeleteTimelineError::NotFound => { // Timeline deletion finished after call to clone above but before call diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index fc908ad299..874603b81b 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2792,12 +2792,12 @@ impl Timeline { } /// Flush one frozen in-memory layer to disk, as a new delta layer. - #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id, layer=%frozen_layer))] async fn flush_frozen_layer( self: &Arc, frozen_layer: Arc, ctx: &RequestContext, ) -> Result<(), FlushLayerError> { + span::debug_assert_current_span_has_tenant_and_timeline_id(); // As a special case, when we have just imported an image into the repository, // instead of writing out a L0 delta layer, we directly write out image layer // files instead. This is possible as long as *all* the data imported into the diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index be873181d9..88d7ce61dd 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -356,12 +356,14 @@ impl DeleteTimelineFlow { // NB: If this fails half-way through, and is retried, the retry will go through // all the same steps again. Make sure the code here is idempotent, and don't // error out if some of the shutdown tasks have already been completed! 
- #[instrument(skip(tenant), fields(tenant_id=%tenant.tenant_shard_id.tenant_id, shard_id=%tenant.tenant_shard_id.shard_slug()))] + #[instrument(skip_all, fields(%inplace))] pub async fn run( tenant: &Arc, timeline_id: TimelineId, inplace: bool, ) -> Result<(), DeleteTimelineError> { + super::debug_assert_current_span_has_tenant_and_timeline_id(); + let (timeline, mut guard) = Self::prepare(tenant, timeline_id)?; guard.mark_in_progress()?; From 3d5fab127ad4bd64034d7f9f8a5e94a30818013d Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 1 Feb 2024 00:15:58 +0200 Subject: [PATCH 0040/1571] rewrite Gate impl for better observability (#6542) changes: - two messages instead of message every second when gate was closing - replace the gate name string by using a pointer - slow GateGuards are likely to log who they were (see example) example found in regress tests: --- libs/utils/src/sync/gate.rs | 227 ++++++++++++++++++---------- pageserver/src/tenant.rs | 9 +- pageserver/src/tenant/mgr.rs | 1 + pageserver/src/tenant/secondary.rs | 2 +- pageserver/src/tenant/timeline.rs | 5 +- test_runner/regress/test_tenants.py | 5 - 6 files changed, 162 insertions(+), 87 deletions(-) diff --git a/libs/utils/src/sync/gate.rs b/libs/utils/src/sync/gate.rs index abc3842da8..c34176af57 100644 --- a/libs/utils/src/sync/gate.rs +++ b/libs/utils/src/sync/gate.rs @@ -1,4 +1,10 @@ -use std::{sync::Arc, time::Duration}; +use std::{ + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, + time::Duration, +}; /// Gates are a concurrency helper, primarily used for implementing safe shutdown. /// @@ -6,62 +12,70 @@ use std::{sync::Arc, time::Duration}; /// the resource calls `close()` when they want to ensure that all holders of guards /// have released them, and that no future guards will be issued. pub struct Gate { - /// Each caller of enter() takes one unit from the semaphore. In close(), we - /// take all the units to ensure all GateGuards are destroyed. - sem: Arc, - - /// For observability only: a name that will be used to log warnings if a particular - /// gate is holding up shutdown - name: String, + inner: Arc, } impl std::fmt::Debug for Gate { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "Gate<{}>", self.name) + f.debug_struct("Gate") + // use this for identification + .field("ptr", &Arc::as_ptr(&self.inner)) + .field("inner", &self.inner) + .finish() + } +} + +struct GateInner { + sem: tokio::sync::Semaphore, + closing: std::sync::atomic::AtomicBool, +} + +impl std::fmt::Debug for GateInner { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let avail = self.sem.available_permits(); + + let guards = u32::try_from(avail) + .ok() + // the sem only supports 32-bit ish amount, but lets play it safe + .and_then(|x| Gate::MAX_UNITS.checked_sub(x)); + + let closing = self.closing.load(Ordering::Relaxed); + + if let Some(guards) = guards { + f.debug_struct("Gate") + .field("remaining_guards", &guards) + .field("closing", &closing) + .finish() + } else { + f.debug_struct("Gate") + .field("avail_permits", &avail) + .field("closing", &closing) + .finish() + } } } /// RAII guard for a [`Gate`]: as long as this exists, calls to [`Gate::close`] will /// not complete. 
#[derive(Debug)] -pub struct GateGuard(tokio::sync::OwnedSemaphorePermit); +pub struct GateGuard { + // Record the span where the gate was entered, so that we can identify who was blocking Gate::close + span_at_enter: tracing::Span, + gate: Arc, +} -/// Observability helper: every `warn_period`, emit a log warning that we're still waiting on this gate -async fn warn_if_stuck( - fut: Fut, - name: &str, - warn_period: std::time::Duration, -) -> ::Output { - let started = std::time::Instant::now(); - - let mut fut = std::pin::pin!(fut); - - let mut warned = false; - let ret = loop { - match tokio::time::timeout(warn_period, &mut fut).await { - Ok(ret) => break ret, - Err(_) => { - tracing::warn!( - gate = name, - elapsed_ms = started.elapsed().as_millis(), - "still waiting, taking longer than expected..." - ); - warned = true; - } +impl Drop for GateGuard { + fn drop(&mut self) { + if self.gate.closing.load(Ordering::Relaxed) { + self.span_at_enter.in_scope( + || tracing::info!(gate = ?Arc::as_ptr(&self.gate), "kept the gate from closing"), + ); } - }; - // If we emitted a warning for slowness, also emit a message when we complete, so that - // someone debugging a shutdown can know for sure whether we have moved past this operation. - if warned { - tracing::info!( - gate = name, - elapsed_ms = started.elapsed().as_millis(), - "completed, after taking longer than expected" - ) + // when the permit was acquired, it was forgotten to allow us to manage it's lifecycle + // manually, so "return" the permit now. + self.gate.sem.add_permits(1); } - - ret } #[derive(Debug)] @@ -69,15 +83,19 @@ pub enum GateError { GateClosed, } -impl Gate { - const MAX_UNITS: u32 = u32::MAX; - - pub fn new(name: String) -> Self { +impl Default for Gate { + fn default() -> Self { Self { - sem: Arc::new(tokio::sync::Semaphore::new(Self::MAX_UNITS as usize)), - name, + inner: Arc::new(GateInner { + sem: tokio::sync::Semaphore::new(Self::MAX_UNITS as usize), + closing: AtomicBool::new(false), + }), } } +} + +impl Gate { + const MAX_UNITS: u32 = u32::MAX; /// Acquire a guard that will prevent close() calls from completing. If close() /// was already called, this will return an error which should be interpreted @@ -88,11 +106,23 @@ impl Gate { /// to avoid blocking close() indefinitely: typically types that contain a Gate will /// also contain a CancellationToken. pub fn enter(&self) -> Result { - self.sem - .clone() - .try_acquire_owned() - .map(GateGuard) - .map_err(|_| GateError::GateClosed) + let permit = self + .inner + .sem + .try_acquire() + .map_err(|_| GateError::GateClosed)?; + + // we now have the permit, let's disable the normal raii functionality and leave + // "returning" the permit to our GateGuard::drop. + // + // this is done to avoid the need for multiple Arcs (one for semaphore, next for other + // fields). + permit.forget(); + + Ok(GateGuard { + span_at_enter: tracing::Span::current(), + gate: self.inner.clone(), + }) } /// Types with a shutdown() method and a gate should call this method at the @@ -102,48 +132,88 @@ impl Gate { /// important that the holders of such guards are respecting a CancellationToken which has /// been cancelled before entering this function. 
pub async fn close(&self) { - warn_if_stuck(self.do_close(), &self.name, Duration::from_millis(1000)).await + let started_at = std::time::Instant::now(); + let mut do_close = std::pin::pin!(self.do_close()); + + let nag_after = Duration::from_secs(1); + + let Err(_timeout) = tokio::time::timeout(nag_after, &mut do_close).await else { + return; + }; + + tracing::info!( + gate = ?self.as_ptr(), + elapsed_ms = started_at.elapsed().as_millis(), + "closing is taking longer than expected" + ); + + // close operation is not trying to be cancellation safe as pageserver does not need it. + // + // note: "closing" is not checked in Gate::enter -- it exists just for observability, + // dropping of GateGuard after this will log who they were. + self.inner.closing.store(true, Ordering::Relaxed); + + do_close.await; + + tracing::info!( + gate = ?self.as_ptr(), + elapsed_ms = started_at.elapsed().as_millis(), + "close completed" + ); + } + + /// Used as an identity of a gate. This identity will be resolved to something useful when + /// it's actually closed in a hopefully sensible `tracing::Span` which will describe it even + /// more. + /// + /// `GateGuard::drop` also logs this pointer when it has realized it has been keeping the gate + /// open for too long. + fn as_ptr(&self) -> *const GateInner { + Arc::as_ptr(&self.inner) } /// Check if [`Self::close()`] has finished waiting for all [`Self::enter()`] users to finish. This /// is usually analoguous for "Did shutdown finish?" for types that include a Gate, whereas checking /// the CancellationToken on such types is analogous to "Did shutdown start?" pub fn close_complete(&self) -> bool { - self.sem.is_closed() + self.inner.sem.is_closed() } + #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(gate = ?self.as_ptr()))] async fn do_close(&self) { - tracing::debug!(gate = self.name, "Closing Gate..."); - match self.sem.acquire_many(Self::MAX_UNITS).await { - Ok(_units) => { + tracing::debug!("Closing Gate..."); + + match self.inner.sem.acquire_many(Self::MAX_UNITS).await { + Ok(_permit) => { // While holding all units, close the semaphore. All subsequent calls to enter() will fail. - self.sem.close(); + self.inner.sem.close(); } - Err(_) => { + Err(_closed) => { // Semaphore closed: we are the only function that can do this, so it indicates a double-call. // This is legal. Timeline::shutdown for example is not protected from being called more than // once. 
- tracing::debug!(gate = self.name, "Double close") + tracing::debug!("Double close") } } - tracing::debug!(gate = self.name, "Closed Gate.") + tracing::debug!("Closed Gate.") } } #[cfg(test)] mod tests { - use futures::FutureExt; - use super::*; #[tokio::test] - async fn test_idle_gate() { - // Having taken no gates, we should not be blocked in close - let gate = Gate::new("test".to_string()); + async fn close_unused() { + // Having taken no guards, we should not be blocked in close + let gate = Gate::default(); gate.close().await; + } + #[tokio::test] + async fn close_idle() { // If a guard is dropped before entering, close should not be blocked - let gate = Gate::new("test".to_string()); + let gate = Gate::default(); let guard = gate.enter().unwrap(); drop(guard); gate.close().await; @@ -152,25 +222,30 @@ mod tests { gate.enter().expect_err("enter should fail after close"); } - #[tokio::test] - async fn test_busy_gate() { - let gate = Gate::new("test".to_string()); + #[tokio::test(start_paused = true)] + async fn close_busy_gate() { + let gate = Gate::default(); + let forever = Duration::from_secs(24 * 7 * 365); - let guard = gate.enter().unwrap(); + let guard = + tracing::info_span!("i am holding back the gate").in_scope(|| gate.enter().unwrap()); let mut close_fut = std::pin::pin!(gate.close()); - // Close should be blocked - assert!(close_fut.as_mut().now_or_never().is_none()); + // Close should be waiting for guards to drop + tokio::time::timeout(forever, &mut close_fut) + .await + .unwrap_err(); // Attempting to enter() should fail, even though close isn't done yet. gate.enter() .expect_err("enter should fail after entering close"); + // this will now log, which we cannot verify except manually drop(guard); // Guard is gone, close should finish - assert!(close_fut.as_mut().now_or_never().is_some()); + close_fut.await; // Attempting to enter() is still forbidden gate.enter().expect_err("enter should fail finishing close"); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 0543de931f..ebf6eb56b1 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2094,7 +2094,10 @@ impl Tenant { let timelines = self.timelines.lock().unwrap(); timelines.values().for_each(|timeline| { let timeline = Arc::clone(timeline); - let span = Span::current(); + let timeline_id = timeline.timeline_id; + + let span = + tracing::info_span!("timeline_shutdown", %timeline_id, ?freeze_and_flush); js.spawn(async move { if freeze_and_flush { timeline.flush_and_shutdown().instrument(span).await @@ -2694,7 +2697,7 @@ impl Tenant { activate_now_sem: tokio::sync::Semaphore::new(0), delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())), cancel: CancellationToken::default(), - gate: Gate::new(format!("Tenant<{tenant_shard_id}>")), + gate: Gate::default(), } } @@ -5227,7 +5230,7 @@ mod tests { let raw_tline = tline.raw_timeline().unwrap(); raw_tline .shutdown() - .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id)) + .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id, timeline_id=%TIMELINE_ID)) .await; std::mem::forget(tline); } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 32535e0134..949db3c543 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1311,6 +1311,7 @@ impl TenantManager { tenant_shard_id: TenantShardId, activation_timeout: Duration, ) -> Result<(), DeleteTenantError> { + super::span::debug_assert_current_span_has_tenant_id(); // We 
acquire a SlotGuard during this function to protect against concurrent // changes while the ::prepare phase of DeleteTenantFlow executes, but then // have to return the Tenant to the map while the background deletion runs. diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index d00d901be6..4269e1dec1 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -112,7 +112,7 @@ impl SecondaryTenant { // on shutdown we walk the tenants and fire their // individual cancellations? cancel: CancellationToken::new(), - gate: Gate::new(format!("SecondaryTenant {tenant_shard_id}")), + gate: Gate::default(), shard_identity, tenant_conf: std::sync::Mutex::new(tenant_conf), diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 874603b81b..db739f1033 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1060,7 +1060,6 @@ impl Timeline { /// also to remote storage. This method can easily take multiple seconds for a busy timeline. /// /// While we are flushing, we continue to accept read I/O. - #[instrument(skip_all, fields(timeline_id=%self.timeline_id))] pub(crate) async fn flush_and_shutdown(&self) { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -1109,6 +1108,8 @@ impl Timeline { /// Shut down immediately, without waiting for any open layers to flush to disk. This is a subset of /// the graceful [`Timeline::flush_and_shutdown`] function. pub(crate) async fn shutdown(&self) { + span::debug_assert_current_span_has_tenant_and_timeline_id(); + // Signal any subscribers to our cancellation token to drop out tracing::debug!("Cancelling CancellationToken"); self.cancel.cancel(); @@ -1502,7 +1503,7 @@ impl Timeline { delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTimelineFlow::default())), cancel, - gate: Gate::new(format!("Timeline<{tenant_shard_id}/{timeline_id}>")), + gate: Gate::default(), compaction_lock: tokio::sync::Mutex::default(), gc_lock: tokio::sync::Mutex::default(), diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 5164bda470..ba391a69d8 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -376,11 +376,6 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder): # so we allow it to log at WARN, even if it is occasionally a false positive. env.pageserver.allowed_errors.append(".*failed to freeze and flush.*") - # When we shut down a tenant during a timeline creation, initdb is not cancelled, we wait - # for it to complete (since https://github.com/neondatabase/neon/pull/6451). This means - # that shutdown can be delayed by >=1s on debug builds where initdb takes a long time to run. - env.pageserver.allowed_errors.append(".*still waiting, taking longer than expected... gate.*") - def create_bg(delay_ms): time.sleep(delay_ms / 1000.0) try: From 271133d960ba305128d1327fd5f466aba93e49ac Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Wed, 31 Jan 2024 23:16:56 +0100 Subject: [PATCH 0041/1571] Proxy: reduce number of get role secret calls (#6557) ## Problem Right now if get_role_secret response wasn't cached (e.g. cache already reached max size) it will send the second (exactly the same request). ## Summary of changes Avoid needless request. 
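For readers skimming the log, the new caller pattern, reduced to a hedged sketch with simplified stand-in types (the real signature returns `(CachedAllowedIps, Option<CachedRoleSecret>)`, as in the diff below):

```rust
// Hedged sketch of the call pattern; plain structs stand in for the real
// cached types from proxy/src/console/provider.rs.
struct AllowedIps(Vec<String>);
struct RoleSecret(String);

// One upstream call that can opportunistically return the secret as well.
fn get_allowed_ips_and_secret() -> (AllowedIps, Option<RoleSecret>) {
    (AllowedIps(vec!["10.0.0.0/8".into()]), Some(RoleSecret("scram".into())))
}

// The separate request we now only issue as a fallback.
fn get_role_secret() -> RoleSecret {
    RoleSecret("scram".into())
}

fn main() {
    let (_allowed_ips, maybe_secret) = get_allowed_ips_and_secret();
    // Fall back to the dedicated request only when the combined call could
    // not supply the secret (e.g. the allowed-IPs answer came from cache).
    let secret = match maybe_secret {
        Some(secret) => secret,
        None => get_role_secret(),
    };
    let _ = secret;
}
```

Returning an `Option` for the secret keeps the cached allowed-IPs path unchanged while letting a fresh console response carry the secret along, so no second identical request is needed in that case.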
--- proxy/src/auth/backend.rs | 25 ++++++++++++++----------- proxy/src/console/provider.rs | 12 ++++++------ proxy/src/console/provider/mock.rs | 13 ++++++++----- proxy/src/console/provider/neon.rs | 11 +++++++---- proxy/src/proxy/tests.rs | 7 +++++-- proxy/src/serverless/conn_pool.rs | 2 +- 6 files changed, 41 insertions(+), 29 deletions(-) diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 4b8ebae86f..144c9dcff5 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -9,7 +9,7 @@ use crate::auth::credentials::check_peer_addr_is_in_list; use crate::auth::validate_password_and_exchange; use crate::cache::Cached; use crate::console::errors::GetAuthInfoError; -use crate::console::provider::ConsoleBackend; +use crate::console::provider::{CachedRoleSecret, ConsoleBackend}; use crate::console::AuthSecret; use crate::context::RequestMonitoring; use crate::proxy::connect_compute::handle_try_wake; @@ -34,8 +34,6 @@ use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{error, info, warn}; -use super::IpPattern; - /// This type serves two purposes: /// /// * When `T` is `()`, it's just a regular auth backend selector @@ -56,7 +54,9 @@ pub enum BackendType<'a, T> { pub trait TestBackend: Send + Sync + 'static { fn wake_compute(&self) -> Result; - fn get_allowed_ips(&self) -> Result, console::errors::GetAuthInfoError>; + fn get_allowed_ips_and_secret( + &self, + ) -> Result<(CachedAllowedIps, Option), console::errors::GetAuthInfoError>; } impl std::fmt::Display for BackendType<'_, ()> { @@ -200,13 +200,16 @@ async fn auth_quirks( }; info!("fetching user's authentication info"); - let allowed_ips = api.get_allowed_ips(ctx, &info).await?; + let (allowed_ips, maybe_secret) = api.get_allowed_ips_and_secret(ctx, &info).await?; // check allowed list if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) { return Err(auth::AuthError::ip_address_not_allowed()); } - let cached_secret = api.get_role_secret(ctx, &info).await?; + let cached_secret = match maybe_secret { + Some(secret) => secret, + None => api.get_role_secret(ctx, &info).await?, + }; let secret = cached_secret.value.clone().unwrap_or_else(|| { // If we don't have an authentication secret, we mock one to @@ -382,16 +385,16 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> { } impl BackendType<'_, ComputeUserInfo> { - pub async fn get_allowed_ips( + pub async fn get_allowed_ips_and_secret( &self, ctx: &mut RequestMonitoring, - ) -> Result { + ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { use BackendType::*; match self { - Console(api, user_info) => api.get_allowed_ips(ctx, user_info).await, - Link(_) => Ok(Cached::new_uncached(Arc::new(vec![]))), + Console(api, user_info) => api.get_allowed_ips_and_secret(ctx, user_info).await, + Link(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), #[cfg(test)] - Test(x) => Ok(Cached::new_uncached(Arc::new(x.get_allowed_ips()?))), + Test(x) => x.get_allowed_ips_and_secret(), } } diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index a6dfbd79db..ff84db7738 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -250,11 +250,11 @@ pub trait Api { user_info: &ComputeUserInfo, ) -> Result; - async fn get_allowed_ips( + async fn get_allowed_ips_and_secret( &self, ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, - ) -> Result; + ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError>; /// Wake up the compute node and return the corresponding connection 
info. async fn wake_compute( @@ -288,16 +288,16 @@ impl Api for ConsoleBackend { } } - async fn get_allowed_ips( + async fn get_allowed_ips_and_secret( &self, ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, - ) -> Result { + ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError> { use ConsoleBackend::*; match self { - Console(api) => api.get_allowed_ips(ctx, user_info).await, + Console(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, #[cfg(feature = "testing")] - Postgres(api) => api.get_allowed_ips(ctx, user_info).await, + Postgres(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, } } diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs index 55f395a403..79a04f255d 100644 --- a/proxy/src/console/provider/mock.rs +++ b/proxy/src/console/provider/mock.rs @@ -157,14 +157,17 @@ impl super::Api for Api { )) } - async fn get_allowed_ips( + async fn get_allowed_ips_and_secret( &self, _ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, - ) -> Result { - Ok(Cached::new_uncached(Arc::new( - self.do_get_auth_info(user_info).await?.allowed_ips, - ))) + ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { + Ok(( + Cached::new_uncached(Arc::new( + self.do_get_auth_info(user_info).await?.allowed_ips, + )), + None, + )) } #[tracing::instrument(skip_all)] diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 33618faed8..f22c6d2322 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -194,17 +194,17 @@ impl super::Api for Api { Ok(Cached::new_uncached(auth_info.secret)) } - async fn get_allowed_ips( + async fn get_allowed_ips_and_secret( &self, ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, - ) -> Result { + ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { let ep = &user_info.endpoint; if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(ep) { ALLOWED_IPS_BY_CACHE_OUTCOME .with_label_values(&["hit"]) .inc(); - return Ok(allowed_ips); + return Ok((allowed_ips, None)); } ALLOWED_IPS_BY_CACHE_OUTCOME .with_label_values(&["miss"]) @@ -223,7 +223,10 @@ impl super::Api for Api { .project_info .insert_allowed_ips(&project_id, ep, allowed_ips.clone()); } - Ok(Cached::new_uncached(allowed_ips)) + Ok(( + Cached::new_uncached(allowed_ips), + Some(Cached::new_uncached(auth_info.secret)), + )) } #[tracing::instrument(skip_all)] diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index a552a857b9..1f57d343c4 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -6,8 +6,8 @@ use super::connect_compute::ConnectMechanism; use super::retry::ShouldRetry; use super::*; use crate::auth::backend::{ComputeUserInfo, TestBackend}; -use crate::auth::IpPattern; use crate::config::CertResolver; +use crate::console::provider::{CachedAllowedIps, CachedRoleSecret}; use crate::console::{self, CachedNodeInfo, NodeInfo}; use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT}; use crate::{auth, http, sasl, scram}; @@ -471,7 +471,10 @@ impl TestBackend for TestConnectMechanism { } } - fn get_allowed_ips(&self) -> Result, console::errors::GetAuthInfoError> { + fn get_allowed_ips_and_secret( + &self, + ) -> Result<(CachedAllowedIps, Option), console::errors::GetAuthInfoError> + { unimplemented!("not used in tests") } } diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 5a7279ae63..312fa2b36f 100644 --- a/proxy/src/serverless/conn_pool.rs +++ 
b/proxy/src/serverless/conn_pool.rs @@ -540,7 +540,7 @@ async fn connect_to_compute( .map(|_| conn_info.user_info.clone()); if !config.disable_ip_check_for_http { - let allowed_ips = backend.get_allowed_ips(ctx).await?; + let (allowed_ips, _) = backend.get_allowed_ips_and_secret(ctx).await?; if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) { return Err(auth::AuthError::ip_address_not_allowed().into()); } From 0ac1e71524cf3d6e6623b8933fd5264500b359ab Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 31 Jan 2024 23:54:54 +0100 Subject: [PATCH 0042/1571] update tokio-epoll-uring (#6558) to pull in fixes for https://github.com/neondatabase/tokio-epoll-uring/issues/37 --- Cargo.lock | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 28ec84be1f..73bef9c96b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2811,6 +2811,15 @@ version = "2.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167" +[[package]] +name = "memoffset" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5de893c32cde5f383baa4c04c5d6dbdd735cfd4a794b0debdb2bb1b421da5ff4" +dependencies = [ + "autocfg", +] + [[package]] name = "memoffset" version = "0.8.0" @@ -2943,6 +2952,19 @@ dependencies = [ "libc", ] +[[package]] +name = "nix" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "598beaf3cc6fdd9a5dfb1630c2800c7acd31df7aaf0f565796fba2b53ca1af1b" +dependencies = [ + "bitflags 1.3.2", + "cfg-if", + "libc", + "memoffset 0.7.1", + "pin-utils", +] + [[package]] name = "nix" version = "0.27.1" @@ -5662,9 +5684,10 @@ dependencies = [ [[package]] name = "tokio-epoll-uring" version = "0.1.0" -source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#0dd3a2f8bf3239d34a19719ef1a74146c093126f" +source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#0e1af4ccddf2f01805cfc9eaefa97ee13c04b52d" dependencies = [ "futures", + "nix 0.26.4", "once_cell", "scopeguard", "thiserror", @@ -6186,7 +6209,7 @@ dependencies = [ [[package]] name = "uring-common" version = "0.1.0" -source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#0dd3a2f8bf3239d34a19719ef1a74146c093126f" +source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#0e1af4ccddf2f01805cfc9eaefa97ee13c04b52d" dependencies = [ "io-uring", "libc", From e82625b77dd65253d6ef3860586a8efd68931b71 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 1 Feb 2024 00:25:57 +0100 Subject: [PATCH 0043/1571] refactor(pageserver main): signal handling (#6554) This refactoring makes it easier to experimentally replace BACKGROUND_RUNTIME with a single-threaded runtime. Found this useful [during benchmarking](https://github.com/neondatabase/neon/pull/6555). 
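The waiting pattern itself, as a standalone hedged sketch (the real code wraps the blocking wait in `spawn_blocking` on `BACKGROUND_RUNTIME` and then performs immediate or graceful shutdown; the sketch only needs the `signal-hook` crate):

```rust
// Hedged standalone sketch of the signal wait; requires the signal-hook crate.
// The real code runs this on BACKGROUND_RUNTIME via spawn_blocking.
use signal_hook::consts::{SIGINT, SIGQUIT, SIGTERM};
use signal_hook::iterator::Signals;

fn main() {
    let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT]).unwrap();
    // forever() yields signals as they arrive; next() blocks until the first one.
    let signal = signals
        .forever()
        .next()
        .expect("forever() never returns None unless explicitly closed");
    match signal {
        SIGQUIT => println!("immediate shutdown"),
        SIGINT | SIGTERM => println!("graceful shutdown"),
        _ => unreachable!(),
    }
}
```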
--- pageserver/src/bin/pageserver.rs | 62 +++++++++++++++++--------------- 1 file changed, 34 insertions(+), 28 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 84de76e55e..eaddcb4607 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -33,12 +33,10 @@ use pageserver::{ use postgres_backend::AuthType; use utils::failpoint_support; use utils::logging::TracingErrorLayerEnablement; -use utils::signals::ShutdownSignals; use utils::{ auth::{JwtAuth, SwappableJwtAuth}, logging, project_build_tag, project_git_version, sentry_init::init_sentry, - signals::Signal, tcp_listener, }; @@ -656,34 +654,42 @@ fn start_pageserver( let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard()); // All started up! Now just sit and wait for shutdown signal. - ShutdownSignals::handle(|signal| match signal { - Signal::Quit => { - info!( - "Got {}. Terminating in immediate shutdown mode", - signal.name() - ); - std::process::exit(111); - } + { + use signal_hook::consts::*; + let signal_handler = BACKGROUND_RUNTIME.spawn_blocking(move || { + let mut signals = + signal_hook::iterator::Signals::new([SIGINT, SIGTERM, SIGQUIT]).unwrap(); + return signals + .forever() + .next() + .expect("forever() never returns None unless explicitly closed"); + }); + let signal = BACKGROUND_RUNTIME + .block_on(signal_handler) + .expect("join error"); + match signal { + SIGQUIT => { + info!("Got signal {signal}. Terminating in immediate shutdown mode",); + std::process::exit(111); + } + SIGINT | SIGTERM => { + info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",); - Signal::Interrupt | Signal::Terminate => { - info!( - "Got {}. Terminating gracefully in fast shutdown mode", - signal.name() - ); - - // This cancels the `shutdown_pageserver` cancellation tree. - // Right now that tree doesn't reach very far, and `task_mgr` is used instead. - // The plan is to change that over time. - shutdown_pageserver.take(); - let bg_remote_storage = remote_storage.clone(); - let bg_deletion_queue = deletion_queue.clone(); - BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver( - bg_remote_storage.map(|_| bg_deletion_queue), - 0, - )); - unreachable!() + // This cancels the `shutdown_pageserver` cancellation tree. + // Right now that tree doesn't reach very far, and `task_mgr` is used instead. + // The plan is to change that over time. + shutdown_pageserver.take(); + let bg_remote_storage = remote_storage.clone(); + let bg_deletion_queue = deletion_queue.clone(); + BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver( + bg_remote_storage.map(|_| bg_deletion_queue), + 0, + )); + unreachable!() + } + _ => unreachable!(), } - }) + } } fn create_remote_storage_client( From 4c173456dcf78e7f60f272d994b6a03d99f490c3 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 1 Feb 2024 00:29:48 +0100 Subject: [PATCH 0044/1571] pagebench: fix percentiles reporting (#6547) Before this patch, pagebench was always showing the same value. 
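The shape of the bug and of the fix, as a hedged sketch with plain arrays standing in for the real percentile data:

```rust
// Hedged sketch: plain arrays stand in for the real latency histogram data.
fn main() {
    let percentile_labels = [90.0, 99.0, 99.9];
    let latencies_ms = [12, 45, 120];

    // Before: every label was paired with element 0, so the report showed
    // the same latency for every percentile.
    for p in percentile_labels {
        println!("p{p}: {} ms (buggy: always index 0)", latencies_ms[0]);
    }

    // After: zip pairs each label with its own value.
    for (p, v) in percentile_labels.iter().zip(&latencies_ms) {
        println!("p{p}: {v} ms");
    }
}
```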
refs https://github.com/neondatabase/neon/issues/6509 --- pageserver/pagebench/src/util/request_stats.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pageserver/pagebench/src/util/request_stats.rs b/pageserver/pagebench/src/util/request_stats.rs index 5ecf1cbf24..4aa6950782 100644 --- a/pageserver/pagebench/src/util/request_stats.rs +++ b/pageserver/pagebench/src/util/request_stats.rs @@ -66,13 +66,10 @@ impl serde::Serialize for LatencyPercentiles { { use serde::ser::SerializeMap; let mut ser = serializer.serialize_map(Some(LATENCY_PERCENTILES.len()))?; - for p in LATENCY_PERCENTILES { + for (p, v) in LATENCY_PERCENTILES.iter().zip(&self.latency_percentiles) { ser.serialize_entry( &format!("p{p}"), - &format!( - "{}", - &humantime::format_duration(self.latency_percentiles[0]) - ), + &format!("{}", humantime::format_duration(*v)), )?; } ser.end() From 221531c9db03d1c6766d6c655603584f9980d5a0 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 1 Feb 2024 10:35:18 +0000 Subject: [PATCH 0045/1571] pageserver: lift ancestor timeline logic from read path (#6543) When the read path needs to follow a key into the ancestor timeline, it needs to wait for said ancestor to become active and aware of it's branching lsn. The logic is lifted into a separate function with it's own new error type. This is done because the vectored read path needs the same logic. It's also the reason for the newly introduced error type. When we'll switch the read path to proxy into `get_vectored`, we can remove the duplicated variants from `PageReconstructError`. --- pageserver/src/tenant/timeline.rs | 141 +++++++++++++++++++----------- 1 file changed, 88 insertions(+), 53 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index db739f1033..168e565edb 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -457,6 +457,21 @@ pub(crate) enum GetVectoredError { InvalidLsn(Lsn), } +#[derive(thiserror::Error, Debug)] +pub(crate) enum GetReadyAncestorError { + #[error("ancestor timeline {0} is being stopped")] + AncestorStopping(TimelineId), + + #[error("Ancestor LSN wait error: {0}")] + AncestorLsnTimeout(#[from] WaitLsnError), + + #[error("Cancelled")] + Cancelled, + + #[error(transparent)] + Other(#[from] anyhow::Error), +} + #[derive(Clone, Copy)] pub enum LogicalSizeCalculationCause { Initial, @@ -535,6 +550,18 @@ impl From for CreateImageLayersError { } } +impl From for PageReconstructError { + fn from(e: GetReadyAncestorError) -> Self { + use GetReadyAncestorError::*; + match e { + AncestorStopping(tid) => PageReconstructError::AncestorStopping(tid), + AncestorLsnTimeout(wait_err) => PageReconstructError::AncestorLsnTimeout(wait_err), + Cancelled => PageReconstructError::Cancelled, + Other(other) => PageReconstructError::Other(other), + } + } +} + /// Public interface functions impl Timeline { /// Get the LSN where this branch was created @@ -2400,60 +2427,8 @@ impl Timeline { timeline.ancestor_lsn, cont_lsn ); - let ancestor = match timeline.get_ancestor_timeline() { - Ok(timeline) => timeline, - Err(e) => return Err(PageReconstructError::from(e)), - }; - // It's possible that the ancestor timeline isn't active yet, or - // is active but hasn't yet caught up to the branch point. Wait - // for it. - // - // This cannot happen while the pageserver is running normally, - // because you cannot create a branch from a point that isn't - // present in the pageserver yet. 
However, we don't wait for the - // branch point to be uploaded to cloud storage before creating - // a branch. I.e., the branch LSN need not be remote consistent - // for the branching operation to succeed. - // - // Hence, if we try to load a tenant in such a state where - // 1. the existence of the branch was persisted (in IndexPart and/or locally) - // 2. but the ancestor state is behind branch_lsn because it was not yet persisted - // then we will need to wait for the ancestor timeline to - // re-stream WAL up to branch_lsn before we access it. - // - // How can a tenant get in such a state? - // - ungraceful pageserver process exit - // - detach+attach => this is a bug, https://github.com/neondatabase/neon/issues/4219 - // - // NB: this could be avoided by requiring - // branch_lsn >= remote_consistent_lsn - // during branch creation. - match ancestor.wait_to_become_active(ctx).await { - Ok(()) => {} - Err(TimelineState::Stopping) => { - return Err(PageReconstructError::AncestorStopping(ancestor.timeline_id)); - } - Err(state) => { - return Err(PageReconstructError::Other(anyhow::anyhow!( - "Timeline {} will not become active. Current state: {:?}", - ancestor.timeline_id, - &state, - ))); - } - } - ancestor - .wait_lsn(timeline.ancestor_lsn, ctx) - .await - .map_err(|e| match e { - e @ WaitLsnError::Timeout(_) => PageReconstructError::AncestorLsnTimeout(e), - WaitLsnError::Shutdown => PageReconstructError::Cancelled, - e @ WaitLsnError::BadState => { - PageReconstructError::Other(anyhow::anyhow!(e)) - } - })?; - - timeline_owned = ancestor; + timeline_owned = timeline.get_ready_ancestor_timeline(ctx).await?; timeline = &*timeline_owned; prev_lsn = Lsn(u64::MAX); continue 'outer; @@ -2583,6 +2558,66 @@ impl Timeline { Some((lsn, img)) } + async fn get_ready_ancestor_timeline( + &self, + ctx: &RequestContext, + ) -> Result, GetReadyAncestorError> { + let ancestor = match self.get_ancestor_timeline() { + Ok(timeline) => timeline, + Err(e) => return Err(GetReadyAncestorError::from(e)), + }; + + // It's possible that the ancestor timeline isn't active yet, or + // is active but hasn't yet caught up to the branch point. Wait + // for it. + // + // This cannot happen while the pageserver is running normally, + // because you cannot create a branch from a point that isn't + // present in the pageserver yet. However, we don't wait for the + // branch point to be uploaded to cloud storage before creating + // a branch. I.e., the branch LSN need not be remote consistent + // for the branching operation to succeed. + // + // Hence, if we try to load a tenant in such a state where + // 1. the existence of the branch was persisted (in IndexPart and/or locally) + // 2. but the ancestor state is behind branch_lsn because it was not yet persisted + // then we will need to wait for the ancestor timeline to + // re-stream WAL up to branch_lsn before we access it. + // + // How can a tenant get in such a state? + // - ungraceful pageserver process exit + // - detach+attach => this is a bug, https://github.com/neondatabase/neon/issues/4219 + // + // NB: this could be avoided by requiring + // branch_lsn >= remote_consistent_lsn + // during branch creation. + match ancestor.wait_to_become_active(ctx).await { + Ok(()) => {} + Err(TimelineState::Stopping) => { + return Err(GetReadyAncestorError::AncestorStopping( + ancestor.timeline_id, + )); + } + Err(state) => { + return Err(GetReadyAncestorError::Other(anyhow::anyhow!( + "Timeline {} will not become active. 
Current state: {:?}", + ancestor.timeline_id, + &state, + ))); + } + } + ancestor + .wait_lsn(self.ancestor_lsn, ctx) + .await + .map_err(|e| match e { + e @ WaitLsnError::Timeout(_) => GetReadyAncestorError::AncestorLsnTimeout(e), + WaitLsnError::Shutdown => GetReadyAncestorError::Cancelled, + e @ WaitLsnError::BadState => GetReadyAncestorError::Other(anyhow::anyhow!(e)), + })?; + + Ok(ancestor) + } + fn get_ancestor_timeline(&self) -> anyhow::Result> { let ancestor = self.ancestor_timeline.as_ref().with_context(|| { format!( From d2c410c748fa58e9a0dc6821185d389b828dc1f2 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 1 Feb 2024 13:14:35 +0000 Subject: [PATCH 0046/1571] pageserver_api: remove overlaps from KeySpace (#6544) This commit adds a function to `KeySpace` which updates a key key space by removing all overlaps with a second key space. This can involve splitting or removing of existing ranges. The implementation is not particularly efficient: O(M * N * log(N)) where N is the number of ranges in the current key space and M is the number of ranges in the key space we are checking against. In practice, this shouldn't matter much since, in the short term, the only caller of this function will be the vectored read path and the number of key spaces invovled will be small. This follows from the upper bound placed on the number of keys accepted by the vectored read path. A couple other small utility functions are added. They'll be used by the vectored search path as well. --- libs/pageserver_api/src/keyspace.rs | 194 +++++++++++++++++++++++++++- 1 file changed, 188 insertions(+), 6 deletions(-) diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index 2316acb616..396c801606 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -63,16 +63,84 @@ impl KeySpace { KeyPartitioning { parts } } + /// Update the keyspace such that it doesn't contain any range + /// that is overlapping with `other`. This can involve splitting or + /// removing of existing ranges. + pub fn remove_overlapping_with(&mut self, other: &KeySpace) { + let (self_start, self_end) = match (self.start(), self.end()) { + (Some(start), Some(end)) => (start, end), + _ => { + // self is empty + return; + } + }; + + // Key spaces are sorted by definition, so skip ahead to the first + // potentially intersecting range. Similarly, ignore ranges that start + // after the current keyspace ends. + let other_ranges = other + .ranges + .iter() + .skip_while(|range| self_start >= range.end) + .take_while(|range| self_end > range.start); + + for range in other_ranges { + while let Some(overlap_at) = self.overlaps_at(range) { + let overlapped = self.ranges[overlap_at].clone(); + + if overlapped.start < range.start && overlapped.end <= range.end { + // Higher part of the range is completely overlapped. + self.ranges[overlap_at].end = range.start; + } + if overlapped.start >= range.start && overlapped.end > range.end { + // Lower part of the range is completely overlapped. + self.ranges[overlap_at].start = range.end; + } + if overlapped.start < range.start && overlapped.end > range.end { + // Middle part of the range is overlapped. 
+ self.ranges[overlap_at].end = range.start; + self.ranges + .insert(overlap_at + 1, range.end..overlapped.end); + } + if overlapped.start >= range.start && overlapped.end <= range.end { + // Whole range is overlapped + self.ranges.remove(overlap_at); + } + } + } + } + + pub fn start(&self) -> Option { + self.ranges.first().map(|range| range.start) + } + + pub fn end(&self) -> Option { + self.ranges.last().map(|range| range.end) + } + + #[allow(unused)] + pub fn total_size(&self) -> usize { + self.ranges + .iter() + .map(|range| key_range_size(range) as usize) + .sum() + } + + fn overlaps_at(&self, range: &Range) -> Option { + match self.ranges.binary_search_by_key(&range.end, |r| r.start) { + Ok(0) => None, + Err(0) => None, + Ok(index) if self.ranges[index - 1].end > range.start => Some(index - 1), + Err(index) if self.ranges[index - 1].end > range.start => Some(index - 1), + _ => None, + } + } + /// /// Check if key space contains overlapping range /// pub fn overlaps(&self, range: &Range) -> bool { - match self.ranges.binary_search_by_key(&range.end, |r| r.start) { - Ok(0) => false, - Err(0) => false, - Ok(index) => self.ranges[index - 1].end > range.start, - Err(index) => self.ranges[index - 1].end > range.start, - } + self.overlaps_at(range).is_some() } } @@ -441,4 +509,118 @@ mod tests { // xxxxxxxxxxx assert!(ks.overlaps(&kr(0..30))); // XXXXX This fails currently! } + + #[test] + fn test_remove_full_overlapps() { + let mut key_space1 = KeySpace { + ranges: vec![ + Key::from_i128(1)..Key::from_i128(4), + Key::from_i128(5)..Key::from_i128(8), + Key::from_i128(10)..Key::from_i128(12), + ], + }; + let key_space2 = KeySpace { + ranges: vec![ + Key::from_i128(2)..Key::from_i128(3), + Key::from_i128(6)..Key::from_i128(7), + Key::from_i128(11)..Key::from_i128(13), + ], + }; + key_space1.remove_overlapping_with(&key_space2); + assert_eq!( + key_space1.ranges, + vec![ + Key::from_i128(1)..Key::from_i128(2), + Key::from_i128(3)..Key::from_i128(4), + Key::from_i128(5)..Key::from_i128(6), + Key::from_i128(7)..Key::from_i128(8), + Key::from_i128(10)..Key::from_i128(11) + ] + ); + } + + #[test] + fn test_remove_partial_overlaps() { + // Test partial ovelaps + let mut key_space1 = KeySpace { + ranges: vec![ + Key::from_i128(1)..Key::from_i128(5), + Key::from_i128(7)..Key::from_i128(10), + Key::from_i128(12)..Key::from_i128(15), + ], + }; + let key_space2 = KeySpace { + ranges: vec![ + Key::from_i128(3)..Key::from_i128(6), + Key::from_i128(8)..Key::from_i128(11), + Key::from_i128(14)..Key::from_i128(17), + ], + }; + key_space1.remove_overlapping_with(&key_space2); + assert_eq!( + key_space1.ranges, + vec![ + Key::from_i128(1)..Key::from_i128(3), + Key::from_i128(7)..Key::from_i128(8), + Key::from_i128(12)..Key::from_i128(14), + ] + ); + } + + #[test] + fn test_remove_no_overlaps() { + let mut key_space1 = KeySpace { + ranges: vec![ + Key::from_i128(1)..Key::from_i128(5), + Key::from_i128(7)..Key::from_i128(10), + Key::from_i128(12)..Key::from_i128(15), + ], + }; + let key_space2 = KeySpace { + ranges: vec![ + Key::from_i128(6)..Key::from_i128(7), + Key::from_i128(11)..Key::from_i128(12), + Key::from_i128(15)..Key::from_i128(17), + ], + }; + key_space1.remove_overlapping_with(&key_space2); + assert_eq!( + key_space1.ranges, + vec![ + Key::from_i128(1)..Key::from_i128(5), + Key::from_i128(7)..Key::from_i128(10), + Key::from_i128(12)..Key::from_i128(15), + ] + ); + } + + #[test] + fn test_remove_one_range_overlaps_multiple() { + let mut key_space1 = KeySpace { + ranges: vec![ + 
Key::from_i128(1)..Key::from_i128(3), + Key::from_i128(3)..Key::from_i128(6), + Key::from_i128(6)..Key::from_i128(10), + Key::from_i128(12)..Key::from_i128(15), + Key::from_i128(17)..Key::from_i128(20), + Key::from_i128(20)..Key::from_i128(30), + Key::from_i128(30)..Key::from_i128(40), + ], + }; + let key_space2 = KeySpace { + ranges: vec![Key::from_i128(9)..Key::from_i128(19)], + }; + key_space1.remove_overlapping_with(&key_space2); + assert_eq!( + key_space1.ranges, + vec![ + Key::from_i128(1)..Key::from_i128(3), + Key::from_i128(3)..Key::from_i128(6), + Key::from_i128(6)..Key::from_i128(9), + Key::from_i128(19)..Key::from_i128(20), + Key::from_i128(20)..Key::from_i128(30), + Key::from_i128(30)..Key::from_i128(40), + ] + ); + } } From fa52cd575e8bec9cd791f933ca80c498b17be7fa Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 1 Feb 2024 13:36:55 +0000 Subject: [PATCH 0047/1571] Remove old tests results and old coverage collection (#6376) ## Problem We have switched to new test results and new coverage results, so no need to collect these data in old formats. ## Summary of changes - Remove "Upload coverage report" for old coverage report - Remove "Store Allure test stat in the DB" for old test results format --- .../actions/allure-report-generate/action.yml | 17 --- .github/workflows/build_and_test.yml | 24 +--- scripts/ingest_regress_test_result.py | 118 ------------------ 3 files changed, 1 insertion(+), 158 deletions(-) delete mode 100644 scripts/ingest_regress_test_result.py diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index abdbba802e..a33adf8bdd 100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -179,23 +179,6 @@ runs: aws s3 rm "s3://${BUCKET}/${LOCK_FILE}" fi - - name: Store Allure test stat in the DB - if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }} - shell: bash -euxo pipefail {0} - env: - COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} - REPORT_JSON_URL: ${{ steps.generate-report.outputs.report-json-url }} - run: | - export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR} - - ./scripts/pysync - - poetry run python3 scripts/ingest_regress_test_result.py \ - --revision ${COMMIT_SHA} \ - --reference ${GITHUB_REF} \ - --build-type unified \ - --ingest ${WORKDIR}/report/data/suites.json - - name: Store Allure test stat in the DB (new) if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }} shell: bash -euxo pipefail {0} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 147d5cae2d..201c77f138 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -531,7 +531,6 @@ jobs: with: store-test-results-into-db: true env: - REGRESS_TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }} REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} - uses: actions/github-script@v6 @@ -609,17 +608,6 @@ jobs: --input-objects=/tmp/coverage/binaries.list \ --format=lcov - - name: Upload coverage report - id: upload-coverage-report - env: - BUCKET: neon-github-public-dev - COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} - run: | - aws s3 cp --only-show-errors --recursive /tmp/coverage/report s3://${BUCKET}/code-coverage/${COMMIT_SHA} - - REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/index.html - echo "report-url=${REPORT_URL}" >> 
$GITHUB_OUTPUT - - name: Build coverage report NEW id: upload-coverage-report-new env: @@ -656,21 +644,11 @@ jobs: - uses: actions/github-script@v6 env: - REPORT_URL: ${{ steps.upload-coverage-report.outputs.report-url }} REPORT_URL_NEW: ${{ steps.upload-coverage-report-new.outputs.report-url }} COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} with: script: | - const { REPORT_URL, REPORT_URL_NEW, COMMIT_SHA } = process.env - - await github.rest.repos.createCommitStatus({ - owner: context.repo.owner, - repo: context.repo.repo, - sha: `${COMMIT_SHA}`, - state: 'success', - target_url: `${REPORT_URL}`, - context: 'Code coverage report', - }) + const { REPORT_URL_NEW, COMMIT_SHA } = process.env await github.rest.repos.createCommitStatus({ owner: context.repo.owner, diff --git a/scripts/ingest_regress_test_result.py b/scripts/ingest_regress_test_result.py deleted file mode 100644 index 39c1c02941..0000000000 --- a/scripts/ingest_regress_test_result.py +++ /dev/null @@ -1,118 +0,0 @@ -#!/usr/bin/env python3 -import argparse -import logging -import os -import re -import sys -from contextlib import contextmanager -from pathlib import Path - -import backoff -import psycopg2 - -CREATE_TABLE = """ -CREATE TABLE IF NOT EXISTS regress_test_results ( - id SERIAL PRIMARY KEY, - reference CHAR(255), - revision CHAR(40), - build_type CHAR(16), - data JSONB -) -""" - - -def err(msg): - print(f"error: {msg}") - sys.exit(1) - - -@contextmanager -def get_connection_cursor(): - connstr = os.getenv("DATABASE_URL") - if not connstr: - err("DATABASE_URL environment variable is not set") - - @backoff.on_exception(backoff.expo, psycopg2.OperationalError, max_time=150) - def connect(connstr): - conn = psycopg2.connect(connstr, connect_timeout=30) - conn.autocommit = True - return conn - - conn = connect(connstr) - try: - with conn.cursor() as cur: - yield cur - finally: - if conn is not None: - conn.close() - - -def create_table(cur): - cur.execute(CREATE_TABLE) - - -def ingest_regress_test_result( - cursor, reference: str, revision: str, build_type: str, data_file: Path -): - data = data_file.read_text() - # In the JSON report we can have lines related to LazyFixture with escaped double-quote - # It's hard to insert them into jsonb field as is, so replace \" with ' to make it easier for us - # - # "" -> "" - data = re.sub(r'("")', r"\g<1>'\g<2>'\g<3>", data) - values = ( - reference, - revision, - build_type, - data, - ) - cursor.execute( - """ - INSERT INTO regress_test_results ( - reference, - revision, - build_type, - data - ) VALUES (%s, %s, %s, %s) - """, - values, - ) - - -def main(): - parser = argparse.ArgumentParser( - description="Regress test result uploader. 
\ - Database connection string should be provided via DATABASE_URL environment variable", - ) - parser.add_argument("--initdb", action="store_true", help="Initialuze database") - parser.add_argument( - "--reference", type=str, required=True, help="git reference, for example refs/heads/main" - ) - parser.add_argument("--revision", type=str, required=True, help="git revision") - parser.add_argument( - "--build-type", type=str, required=True, help="build type: release, debug or remote" - ) - parser.add_argument( - "--ingest", type=Path, required=True, help="Path to regress test result file" - ) - - args = parser.parse_args() - with get_connection_cursor() as cur: - if args.initdb: - create_table(cur) - - if not args.ingest.exists(): - err(f"ingest path {args.ingest} does not exist") - - ingest_regress_test_result( - cur, - reference=args.reference, - revision=args.revision, - build_type=args.build_type, - data_file=args.ingest, - ) - - -if __name__ == "__main__": - logging.getLogger("backoff").addHandler(logging.StreamHandler()) - main() From 39be2b0108cad883340de461c2ff9c2ec7612b31 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 1 Feb 2024 17:34:48 +0000 Subject: [PATCH 0048/1571] Makefile: set PQ_LIB_DIR to avoid linkage with system libpq (#6538) ## Problem Initially spotted on macOS. When building `attachment_service`, it might get linked with system `libpq`: ``` $ otool -L target/debug/attachment_service target/debug/attachment_service: /opt/homebrew/opt/libpq/lib/libpq.5.dylib (compatibility version 5.0.0, current version 5.16.0) /System/Library/Frameworks/Security.framework/Versions/A/Security (compatibility version 1.0.0, current version 61040.61.1) /System/Library/Frameworks/CoreFoundation.framework/Versions/A/CoreFoundation (compatibility version 150.0.0, current version 2202.0.0) /usr/lib/libiconv.2.dylib (compatibility version 7.0.0, current version 7.0.0) /usr/lib/libSystem.B.dylib (compatibility version 1.0.0, current version 1336.61.1) ``` After this PR: ``` $ otool -L target/debug/attachment_service target/debug/attachment_service: /Users/bayandin/work/neon/pg_install/v16/lib/libpq.5.dylib (compatibility version 5.0.0, current version 5.16.0) /System/Library/Frameworks/Security.framework/Versions/A/Security (compatibility version 1.0.0, current version 61040.61.1) /System/Library/Frameworks/CoreFoundation.framework/Versions/A/CoreFoundation (compatibility version 150.0.0, current version 2202.0.0) /usr/lib/libiconv.2.dylib (compatibility version 7.0.0, current version 7.0.0) /usr/lib/libSystem.B.dylib (compatibility version 1.0.0, current version 1336.61.1) ``` ## Summary of changes - Set `PQ_LIB_DIR` to bundled Postgres 16 lib dir --- Makefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 004ca3fbcf..5bed4cb9fc 100644 --- a/Makefile +++ b/Makefile @@ -51,6 +51,8 @@ CARGO_BUILD_FLAGS += $(filter -j1,$(MAKEFLAGS)) CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+) # Force cargo not to print progress bar CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1 +# Set PQ_LIB_DIR to make sure `attachment_service` get linked with bundled libpq (through diesel) +CARGO_CMD_PREFIX += PQ_LIB_DIR=$(POSTGRES_INSTALL_DIR)/v16/lib # # Top level Makefile to build Neon and PostgreSQL @@ -174,10 +176,10 @@ neon-pg-ext-clean-%: # Build walproposer as a static library. walproposer source code is located # in the pgxn/neon directory. 
-# +# # We also need to include libpgport.a and libpgcommon.a, because walproposer # uses some functions from those libraries. -# +# # Some object files are removed from libpgport.a and libpgcommon.a because # they depend on openssl and other libraries that are not included in our # Rust build. From 527cdbc010a40d2f297bdc6771586fa1ff9a3863 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 1 Feb 2024 21:18:07 +0100 Subject: [PATCH 0049/1571] Don't require AWS access keys for S3 pytests (#6556) Don't require AWS access keys (AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY) for S3 usage in the pytests, and also allow AWS_PROFILE to be passed. One of the two methods is required however. This allows local development like: ``` aws sso login --profile dev export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty REMOTE_STORAGE_S3_REGION=eu-central-1 REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests AWS_PROFILE=dev cargo build_testing && RUST_BACKTRACE=1 ./scripts/pytest -k debug-pg16 test_runner/regress/test_tenant_delete.py::test_tenant_delete_smoke ``` related earlier PR for the cargo unit tests of the `remote_storage` crate: #6202 --------- Co-authored-by: Alexander Bayandin --- control_plane/src/background_process.rs | 4 ++- libs/remote_storage/src/lib.rs | 7 ++++- test_runner/fixtures/remote_storage.py | 35 +++++++++++++++---------- 3 files changed, 30 insertions(+), 16 deletions(-) diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index 3ffb8734d0..364cc01c39 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -256,7 +256,9 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command { for env_key in [ "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", - "AWS_SESSION_TOKEN", + "AWS_PROFILE", + // HOME is needed in combination with `AWS_PROFILE` to pick up the SSO sessions. + "HOME", "AZURE_STORAGE_ACCOUNT", "AZURE_STORAGE_ACCESS_KEY", ] { diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index bf9c51ad1a..38a8784fe2 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -434,7 +434,12 @@ impl GenericRemoteStorage { Self::LocalFs(LocalFs::new(root.clone())?) } RemoteStorageKind::AwsS3(s3_config) => { - info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}'", + // The profile and access key id are only printed here for debugging purposes, + // their values don't indicate the eventually taken choice for auth. 
+ let profile = std::env::var("AWS_PROFILE").unwrap_or_else(|_| "".into()); + let access_key_id = + std::env::var("AWS_ACCESS_KEY_ID").unwrap_or_else(|_| "".into()); + info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}', profile: {profile}, access_key_id: {access_key_id}", s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint); Self::AwsS3(Arc::new(S3Bucket::new(s3_config)?)) } diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index c0c2383feb..4a692688e0 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -160,8 +160,9 @@ class LocalFsStorage: class S3Storage: bucket_name: str bucket_region: str - access_key: str - secret_key: str + access_key: Optional[str] + secret_key: Optional[str] + aws_profile: Optional[str] prefix_in_bucket: str client: S3Client cleanup: bool @@ -170,10 +171,18 @@ class S3Storage: endpoint: Optional[str] = None def access_env_vars(self) -> Dict[str, str]: - return { - "AWS_ACCESS_KEY_ID": self.access_key, - "AWS_SECRET_ACCESS_KEY": self.secret_key, - } + if self.aws_profile is not None: + return { + "AWS_PROFILE": self.aws_profile, + } + if self.access_key is not None and self.secret_key is not None: + return { + "AWS_ACCESS_KEY_ID": self.access_key, + "AWS_SECRET_ACCESS_KEY": self.secret_key, + } + raise RuntimeError( + "Either AWS_PROFILE or (AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY) have to be set for S3Storage" + ) def to_string(self) -> str: return json.dumps( @@ -308,6 +317,7 @@ class RemoteStorageKind(str, enum.Enum): bucket_region=mock_region, access_key=access_key, secret_key=secret_key, + aws_profile=None, prefix_in_bucket="", client=client, cleanup=False, @@ -317,12 +327,11 @@ class RemoteStorageKind(str, enum.Enum): assert self == RemoteStorageKind.REAL_S3 env_access_key = os.getenv("AWS_ACCESS_KEY_ID") - assert env_access_key, "no aws access key provided" env_secret_key = os.getenv("AWS_SECRET_ACCESS_KEY") - assert env_secret_key, "no aws access key provided" - - # session token is needed for local runs with sso auth - session_token = os.getenv("AWS_SESSION_TOKEN") + env_profile = os.getenv("AWS_PROFILE") + assert ( + env_access_key and env_secret_key + ) or env_profile, "need to specify either access key and secret access key or profile" bucket_name = bucket_name or os.getenv("REMOTE_STORAGE_S3_BUCKET") assert bucket_name is not None, "no remote storage bucket name provided" @@ -334,9 +343,6 @@ class RemoteStorageKind(str, enum.Enum): client = boto3.client( "s3", region_name=bucket_region, - aws_access_key_id=env_access_key, - aws_secret_access_key=env_secret_key, - aws_session_token=session_token, ) return S3Storage( @@ -344,6 +350,7 @@ class RemoteStorageKind(str, enum.Enum): bucket_region=bucket_region, access_key=env_access_key, secret_key=env_secret_key, + aws_profile=env_profile, prefix_in_bucket=prefix_in_bucket, client=client, cleanup=True, From 35250800312310052353ed138ac388c36d417970 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 1 Feb 2024 22:48:31 +0200 Subject: [PATCH 0050/1571] Fix pgvector 0.6.0 with Neon. (#6571) The previous patch was broken. rd_smgr as not open yet, need to use RelationGetSmgr() to access it. 
--- patches/pgvector.patch | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/patches/pgvector.patch b/patches/pgvector.patch index c429f272fc..cc1ca2e3a6 100644 --- a/patches/pgvector.patch +++ b/patches/pgvector.patch @@ -1,7 +1,7 @@ -From 5518a806a70e7f40d5054a762ccda7d5e6b0d31c Mon Sep 17 00:00:00 2001 +From de3dd0cd034d2bcc12b456171ce163bdc1f4cb65 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas -Date: Tue, 30 Jan 2024 14:33:00 +0200 -Subject: [PATCH] Make v0.6.0 work with Neon +Date: Thu, 1 Feb 2024 17:42:31 +0200 +Subject: [PATCH 1/1] Make v0.6.0 work with Neon Now that the WAL-logging happens as a separate step at the end of the build, we need a few neon-specific hints to make it work. @@ -10,35 +10,35 @@ build, we need a few neon-specific hints to make it work. 1 file changed, 28 insertions(+) diff --git a/src/hnswbuild.c b/src/hnswbuild.c -index 680789ba9044900eac9321844ee2a808a4a2ed12..41c5b709bcb2367ac8b8c498788ecac4c1148b74 100644 +index 680789b..bfa657a 100644 --- a/src/hnswbuild.c +++ b/src/hnswbuild.c @@ -1089,13 +1089,41 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, SeedRandom(42); #endif - + +#ifdef NEON_SMGR -+ smgr_start_unlogged_build(index->rd_smgr); ++ smgr_start_unlogged_build(RelationGetSmgr(index)); +#endif + InitBuildState(buildstate, heap, index, indexInfo, forkNum); - + BuildGraph(buildstate, forkNum); - + +#ifdef NEON_SMGR -+ smgr_finish_unlogged_build_phase_1(index->rd_smgr); ++ smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index)); +#endif + if (RelationNeedsWAL(index)) + { log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocks(index), true); - + +#ifdef NEON_SMGR + { +#if PG_VERSION_NUM >= 160000 -+ RelFileLocator rlocator = index->rd_smgr->smgr_rlocator.locator; ++ RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator; +#else -+ RelFileNode rlocator = index->rd_smgr->smgr_rnode.node; ++ RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node; +#endif + + SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator, @@ -49,8 +49,12 @@ index 680789ba9044900eac9321844ee2a808a4a2ed12..41c5b709bcb2367ac8b8c498788ecac4 + } + +#ifdef NEON_SMGR -+ smgr_end_unlogged_build(index->rd_smgr); ++ smgr_end_unlogged_build(RelationGetSmgr(index)); +#endif + FreeBuildState(buildstate); } + +-- +2.39.2 + From be3038890136922f9d51f2befcf620804f5f19cf Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Thu, 1 Feb 2024 11:50:04 -0900 Subject: [PATCH 0051/1571] Add retry to fetching basebackup (#6537) ## Problem Currently we have no retry mechanism for fetching basebackup. If there's an unstable connection, starting compute will just fail. ## Summary of changes Adds an exponential backoff with 7 retries to get the basebackup. --- compute_tools/src/compute.rs | 30 +++++++++++++++++++++- test_runner/regress/test_bad_connection.py | 8 +++--- 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 07e0abe6ff..1976299e93 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -319,7 +319,7 @@ impl ComputeNode { // Get basebackup from the libpq connection to pageserver using `connstr` and // unarchive it to `pgdata` directory overriding all its previous content. 
#[instrument(skip_all, fields(%lsn))] - fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> { + fn try_get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> { let spec = compute_state.pspec.as_ref().expect("spec must be set"); let start_time = Instant::now(); @@ -390,6 +390,34 @@ impl ComputeNode { Ok(()) } + // Gets the basebackup in a retry loop + #[instrument(skip_all, fields(%lsn))] + pub fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> { + let mut retry_period_ms = 500; + let mut attempts = 0; + let max_attempts = 5; + loop { + let result = self.try_get_basebackup(compute_state, lsn); + match result { + Ok(_) => { + return result; + } + Err(ref e) if attempts < max_attempts => { + warn!( + "Failed to get basebackup: {} (attempt {}/{})", + e, attempts, max_attempts + ); + std::thread::sleep(std::time::Duration::from_millis(retry_period_ms)); + retry_period_ms *= 2; + } + Err(_) => { + return result; + } + } + attempts += 1; + } + } + pub async fn check_safekeepers_synced_async( &self, compute_state: &ComputeState, diff --git a/test_runner/regress/test_bad_connection.py b/test_runner/regress/test_bad_connection.py index ba0624c730..c808fa0f54 100644 --- a/test_runner/regress/test_bad_connection.py +++ b/test_runner/regress/test_bad_connection.py @@ -9,14 +9,14 @@ def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() env.pageserver.allowed_errors.append(".*simulated connection error.*") + # Enable failpoint before starting everything else up so that we exercise the retry + # on fetching basebackup pageserver_http = env.pageserver.http_client() + pageserver_http.configure_failpoints(("simulated-bad-compute-connection", "50%return(15)")) + env.neon_cli.create_branch("test_compute_pageserver_connection_stress") endpoint = env.endpoints.create_start("test_compute_pageserver_connection_stress") - # Enable failpoint after starting everything else up so that loading initial - # basebackup doesn't fail - pageserver_http.configure_failpoints(("simulated-bad-compute-connection", "50%return(15)")) - pg_conn = endpoint.connect() cur = pg_conn.cursor() From 7a70ef991f071002474061530e799d3a8785fa4f Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 1 Feb 2024 21:59:40 +0100 Subject: [PATCH 0052/1571] feat(walredo): various observability improvements (#6573) - log when we start walredo process - include tenant shard id in walredo argv - dump some basic walredo state in tenant details api - more suitable walredo process launch histogram buckets - avoid duplicate tracing labels in walredo launch spans --- Cargo.lock | 1 + libs/pageserver_api/Cargo.toml | 1 + libs/pageserver_api/src/models.rs | 8 ++++++++ pageserver/src/http/routes.rs | 1 + pageserver/src/metrics.rs | 9 ++++++++- pageserver/src/tenant.rs | 13 +++++++++++++ pageserver/src/walredo.rs | 31 ++++++++++++++++++++++++++++--- 7 files changed, 60 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 73bef9c96b..ee6aa9e613 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3494,6 +3494,7 @@ dependencies = [ "bincode", "byteorder", "bytes", + "chrono", "const_format", "enum-map", "hex", diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index 96c6c10d3e..902af21965 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -20,6 +20,7 @@ strum_macros.workspace = true hex.workspace = true thiserror.workspace = true 
humantime-serde.workspace = true +chrono.workspace = true workspace_hack.workspace = true diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index a7598f9fda..5a638df9cc 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -454,6 +454,8 @@ pub struct TenantDetails { #[serde(flatten)] pub tenant_info: TenantInfo, + pub walredo: Option, + pub timelines: Vec, } @@ -641,6 +643,12 @@ pub struct TimelineGcRequest { pub gc_horizon: Option, } +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct WalRedoManagerStatus { + pub last_redo_at: Option>, + pub pid: Option, +} + // Wrapped in libpq CopyData #[derive(PartialEq, Eq, Debug)] pub enum PagestreamFeMessage { diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index c025a25ef1..9d062c50f2 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -959,6 +959,7 @@ async fn tenant_status( attachment_status: state.attachment_status(), generation: tenant.generation().into(), }, + walredo: tenant.wal_redo_manager_status(), timelines: tenant.list_timeline_ids(), }) } diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index ed204cb48c..489ec58e62 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1651,11 +1651,18 @@ pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy = Lazy::new(|| { .unwrap() }); +#[rustfmt::skip] pub(crate) static WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wal_redo_process_launch_duration", "Histogram of the duration of successful WalRedoProcess::launch calls", - redo_histogram_time_buckets!(), + vec![ + 0.0002, 0.0004, 0.0006, 0.0008, 0.0010, + 0.0020, 0.0040, 0.0060, 0.0080, 0.0100, + 0.0200, 0.0400, 0.0600, 0.0800, 0.1000, + 0.2000, 0.4000, 0.6000, 0.8000, 1.0000, + 1.5000, 2.0000, 2.5000, 3.0000, 4.0000, 10.0000 + ], ) .expect("failed to define a metric") }); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index ebf6eb56b1..58af80238d 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -20,6 +20,7 @@ use futures::FutureExt; use futures::StreamExt; use pageserver_api::models; use pageserver_api::models::TimelineState; +use pageserver_api::models::WalRedoManagerStatus; use pageserver_api::shard::ShardIdentity; use pageserver_api::shard::TenantShardId; use remote_storage::DownloadError; @@ -364,6 +365,14 @@ impl WalRedoManager { } } } + + pub(crate) fn status(&self) -> Option { + match self { + WalRedoManager::Prod(m) => m.status(), + #[cfg(test)] + WalRedoManager::Test(_) => None, + } + } } #[derive(Debug, thiserror::Error, PartialEq, Eq)] @@ -1957,6 +1966,10 @@ impl Tenant { self.generation } + pub(crate) fn wal_redo_manager_status(&self) -> Option { + self.walredo_mgr.status() + } + /// Changes tenant status to active, unless shutdown was already requested. 
///
/// `background_jobs_can_start` is an optional barrier set to a value during pageserver startup
diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index cfb8052cf1..793bcc1f00 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -22,6 +22,7 @@ use anyhow::Context;
use byteorder::{ByteOrder, LittleEndian};
use bytes::{BufMut, Bytes, BytesMut};
use nix::poll::*;
+use pageserver_api::models::WalRedoManagerStatus;
use pageserver_api::shard::TenantShardId;
use serde::Serialize;
use std::collections::VecDeque;
@@ -179,6 +180,20 @@ impl PostgresRedoManager {
 )
 }
 }
+
+ pub(crate) fn status(&self) -> Option<WalRedoManagerStatus> {
+ Some(WalRedoManagerStatus {
+ last_redo_at: {
+ let at = *self.last_redo_at.lock().unwrap();
+ at.and_then(|at| {
+ let age = at.elapsed();
+ // map any chrono errors silently to None here
+ chrono::Utc::now().checked_sub_signed(chrono::Duration::from_std(age).ok()?)
+ })
+ },
+ pid: self.redo_process.read().unwrap().as_ref().map(|p| p.id()),
+ })
+ }
}

impl PostgresRedoManager {
@@ -243,8 +258,7 @@ impl PostgresRedoManager {
let mut proc_guard = self.redo_process.write().unwrap();
match &*proc_guard {
None => {
- let timer =
- WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.start_timer();
+ let start = Instant::now();
let proc = Arc::new(
WalRedoProcess::launch(
self.conf,
@@ -253,7 +267,14 @@ impl PostgresRedoManager {
)
.context("launch walredo process")?,
);
- timer.observe_duration();
+ let duration = start.elapsed();
+ WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM
+ .observe(duration.as_secs_f64());
+ info!(
+ duration_ms = duration.as_millis(),
+ pid = proc.id(),
+ "launched walredo process"
+ );
*proc_guard = Some(Arc::clone(&proc));
proc
}
@@ -669,7 +690,11 @@ impl WalRedoProcess {
// Start postgres itself
let child = Command::new(pg_bin_dir_path.join("postgres"))
+ // the first arg must be --wal-redo so the child process enters into walredo mode
.arg("--wal-redo")
+ // the child doesn't process this arg, but having it in the argv helps identify the
+ // walredo process for a particular tenant when debugging a pageserver
+ .args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
.stdin(Stdio::piped())
.stderr(Stdio::piped())
.stdout(Stdio::piped())

From 1be5e564ceac53456a4479bd8a8dc4c1af7c2b28 Mon Sep 17 00:00:00 2001
From: Christian Schwarz
Date: Thu, 1 Feb 2024 22:38:34 +0100
Subject: [PATCH 0053/1571] feat(walredo): use posix_spawn by moving close_fds() work to walredo C code (#6574)

The rust stdlib uses the efficient `posix_spawn` by default. However, before this PR, pageserver used `pre_exec()` in our `close_fds()` ext trait. This PR moves the work that `close_fds()` did to the walredo C code.

I verified manually using `gdb` that we're now forking out the walredo process using `posix_spawn`.
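The point above is easier to see with a minimal, self-contained sketch (not pageserver code; /bin/true and the no-op closure are placeholders): registering any `pre_exec` hook takes the Rust stdlib off its `posix_spawn` fast path and forces a classic fork+exec, while a builder that only configures stdio stays eligible for `posix_spawn`.

    // Minimal sketch, not pageserver code: any `pre_exec` hook disables the
    // stdlib's posix_spawn fast path and forces fork+exec.
    use std::os::unix::process::CommandExt;
    use std::process::{Command, Stdio};

    fn spawn_with_pre_exec() -> std::io::Result<std::process::Child> {
        let mut cmd = Command::new("/bin/true"); // placeholder binary
        // SAFETY: the closure must be async-signal-safe (see `man signal-safety`).
        unsafe {
            cmd.pre_exec(|| Ok(()));
        }
        cmd.spawn() // fork+exec, because a pre_exec hook is registered
    }

    fn spawn_plain() -> std::io::Result<std::process::Child> {
        Command::new("/bin/true")
            .stdin(Stdio::null())
            .stdout(Stdio::null())
            .spawn() // eligible for the posix_spawn fast path
    }

Moving the fd-closing into the walredo C code removes the need for the unsafe hook at the spawn site, which is what makes the fast path available again.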
refs https://github.com/neondatabase/neon/issues/6565 --- Cargo.lock | 11 ------- Cargo.toml | 1 - pageserver/Cargo.toml | 1 - pageserver/src/walredo.rs | 53 +++++---------------------------- pgxn/neon_walredo/walredoproc.c | 33 ++++++++++++++++++++ 5 files changed, 41 insertions(+), 58 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ee6aa9e613..ea5a29a142 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1144,16 +1144,6 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" -[[package]] -name = "close_fds" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bc416f33de9d59e79e57560f450d21ff8393adcf1cdfc3e6d8fb93d5f88a2ed" -dependencies = [ - "cfg-if", - "libc", -] - [[package]] name = "colorchoice" version = "1.0.0" @@ -3418,7 +3408,6 @@ dependencies = [ "camino-tempfile", "chrono", "clap", - "close_fds", "const_format", "consumption_metrics", "crc32c", diff --git a/Cargo.toml b/Cargo.toml index 26cf604a91..d3006985ab 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -64,7 +64,6 @@ camino = "1.1.6" cfg-if = "1.0.0" chrono = { version = "0.4", default-features = false, features = ["clock"] } clap = { version = "4.0", features = ["derive"] } -close_fds = "0.3.2" comfy-table = "6.1" const_format = "0.2" crc32c = "0.6" diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index e44501d1ed..95d558bb7b 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -21,7 +21,6 @@ camino.workspace = true camino-tempfile.workspace = true chrono = { workspace = true, features = ["serde"] } clap = { workspace = true, features = ["string"] } -close_fds.workspace = true const_format.workspace = true consumption_metrics.workspace = true crc32c.workspace = true diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 793bcc1f00..5bc897b730 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -30,7 +30,6 @@ use std::io; use std::io::prelude::*; use std::ops::{Deref, DerefMut}; use std::os::unix::io::AsRawFd; -use std::os::unix::prelude::CommandExt; use std::process::Stdio; use std::process::{Child, ChildStdin, ChildStdout, Command}; use std::sync::{Arc, Mutex, MutexGuard, RwLock}; @@ -628,40 +627,6 @@ impl PostgresRedoManager { } } -/// -/// Command with ability not to give all file descriptors to child process -/// -trait CloseFileDescriptors: CommandExt { - /// - /// Close file descriptors (other than stdin, stdout, stderr) in child process - /// - fn close_fds(&mut self) -> &mut Command; -} - -impl CloseFileDescriptors for C { - fn close_fds(&mut self) -> &mut Command { - // SAFETY: Code executed inside pre_exec should have async-signal-safety, - // which means it should be safe to execute inside a signal handler. - // The precise meaning depends on platform. See `man signal-safety` - // for the linux definition. - // - // The set_fds_cloexec_threadsafe function is documented to be - // async-signal-safe. - // - // Aside from this function, the rest of the code is re-entrant and - // doesn't make any syscalls. We're just passing constants. - // - // NOTE: It's easy to indirectly cause a malloc or lock a mutex, - // which is not async-signal-safe. Be careful. 
- unsafe { - self.pre_exec(move || { - close_fds::set_fds_cloexec_threadsafe(3, &[]); - Ok(()) - }) - } - } -} - struct WalRedoProcess { #[allow(dead_code)] conf: &'static PageServerConf, @@ -701,16 +666,14 @@ impl WalRedoProcess { .env_clear() .env("LD_LIBRARY_PATH", &pg_lib_dir_path) .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) - // The redo process is not trusted, and runs in seccomp mode that - // doesn't allow it to open any files. We have to also make sure it - // doesn't inherit any file descriptors from the pageserver, that - // would allow an attacker to read any files that happen to be open - // in the pageserver. - // - // The Rust standard library makes sure to mark any file descriptors with - // as close-on-exec by default, but that's not enough, since we use - // libraries that directly call libc open without setting that flag. - .close_fds() + // NB: The redo process is not trusted after we sent it the first + // walredo work. Before that, it is trusted. Specifically, we trust + // it to + // 1. close all file descriptors except stdin, stdout, stderr because + // pageserver might not be 100% diligent in setting FD_CLOEXEC on all + // the files it opens, and + // 2. to use seccomp to sandbox itself before processing the first + // walredo request. .spawn_no_leak_child(tenant_shard_id) .context("spawn process")?; WAL_REDO_PROCESS_COUNTERS.started.inc(); diff --git a/pgxn/neon_walredo/walredoproc.c b/pgxn/neon_walredo/walredoproc.c index bdc50b0aa9..7ca4fe93df 100644 --- a/pgxn/neon_walredo/walredoproc.c +++ b/pgxn/neon_walredo/walredoproc.c @@ -140,9 +140,42 @@ static XLogReaderState *reader_state; #define TRACE DEBUG5 #ifdef HAVE_LIBSECCOMP + + +/* + * https://man7.org/linux/man-pages/man2/close_range.2.html + * + * The `close_range` syscall is available as of Linux 5.9. + * + * The `close_range` libc wrapper is only available in glibc >= 2.34. + * Debian Bullseye ships a libc package based on glibc 2.31. + * => write the wrapper ourselves, using the syscall number from the kernel headers. + * + * If the Linux uAPI headers don't define the system call number, + * fail the build deliberately rather than ifdef'ing it to ENOSYS. + * We prefer a compile time over a runtime error for walredo. + */ +#include +#include +#include +int close_range(unsigned int start_fd, unsigned int count, unsigned int flags) { + return syscall(__NR_close_range, start_fd, count, flags); +} + static void enter_seccomp_mode(void) { + + /* + * The pageserver process relies on us to close all the file descriptors + * it potentially leaked to us, _before_ we start processing potentially dangerous + * wal records. See the comment in the Rust code that launches this process. + */ + int err; + if (err = close_range(3, ~0U, 0)) { + ereport(FATAL, (errcode(ERRCODE_SYSTEM_ERROR), errmsg("seccomp: could not close files >= fd 3"))); + } + PgSeccompRule syscalls[] = { /* Hard requirements */ From 350865392cd5bb38eb2a7ff6f45b36f11ac8a911 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 2 Feb 2024 01:35:31 +0200 Subject: [PATCH 0054/1571] Print checkpoint key contents with "pagectl print-layer-file" (#6541) This was very useful in debugging the bugs fixed in #6410 and #6502. There's a lot more we could do. This only adds the printing to delta layers, not image layers, for example, and it might be useful to print details of more record types. But this is a good start. 
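For readers skimming the raw hunk that follows, the shape of the change is roughly this (condensed from the diff below; identifiers follow the patch, the surrounding dump loop and error handling are elided):

    // While dumping each DeltaEntry, special-case the CHECKPOINT key and decode
    // its image into a readable CheckPoint struct instead of a generic summary.
    use pageserver_api::key::CHECKPOINT_KEY;
    use postgres_ffi::CheckPoint;

    if key == CHECKPOINT_KEY {
        let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
        match Value::des(&buf)? {
            Value::Image(img) => println!("  CHECKPOINT: {:?}", CheckPoint::decode(&img)?),
            Value::WalRecord(_) => println!("  unexpected walrecord value for checkpoint key"),
        }
    }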
--- .../src/tenant/storage_layer/delta_layer.rs | 23 +++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 3a445ef71e..ec031d6089 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -884,7 +884,7 @@ impl DeltaLayerInner { let keys = self.load_keys(ctx).await?; - async fn dump_blob(val: ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result { + async fn dump_blob(val: &ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result { let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?; let val = Value::des(&buf)?; let desc = match val { @@ -906,13 +906,32 @@ impl DeltaLayerInner { for entry in keys { let DeltaEntry { key, lsn, val, .. } = entry; - let desc = match dump_blob(val, ctx).await { + let desc = match dump_blob(&val, ctx).await { Ok(desc) => desc, Err(err) => { format!("ERROR: {err}") } }; println!(" key {key} at {lsn}: {desc}"); + + // Print more details about CHECKPOINT records. Would be nice to print details + // of many other record types too, but these are particularly interesting, as + // have a lot of special processing for them in walingest.rs. + use pageserver_api::key::CHECKPOINT_KEY; + use postgres_ffi::CheckPoint; + if key == CHECKPOINT_KEY { + let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?; + let val = Value::des(&buf)?; + match val { + Value::Image(img) => { + let checkpoint = CheckPoint::decode(&img)?; + println!(" CHECKPOINT: {:?}", checkpoint); + } + Value::WalRecord(_rec) => { + println!(" unexpected walrecord value for checkpoint key"); + } + } + } } Ok(()) From 23f58145edbedd2908df1e674e680bc5c9c4f326 Mon Sep 17 00:00:00 2001 From: Andreas Scherbaum Date: Fri, 2 Feb 2024 11:22:32 +0100 Subject: [PATCH 0055/1571] Update wording for better readability (#6559) Update wording, add spaces in commandline arguments Co-authored-by: Andreas Scherbaum --- CONTRIBUTING.md | 2 +- README.md | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b318c295a3..7e177693fa 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -20,7 +20,7 @@ ln -s ../../pre-commit.py .git/hooks/pre-commit This will run following checks on staged files before each commit: - `rustfmt` -- checks for python files, see [obligatory checks](/docs/sourcetree.md#obligatory-checks). +- checks for Python files, see [obligatory checks](/docs/sourcetree.md#obligatory-checks). There is also a separate script `./run_clippy.sh` that runs `cargo clippy` on the whole project and `./scripts/reformat` that runs all formatting tools to ensure the project is up to date. diff --git a/README.md b/README.md index 98af1edee6..a0b368fb94 100644 --- a/README.md +++ b/README.md @@ -14,8 +14,8 @@ Alternatively, compile and run the project [locally](#running-local-installation A Neon installation consists of compute nodes and the Neon storage engine. Compute nodes are stateless PostgreSQL nodes backed by the Neon storage engine. The Neon storage engine consists of two major components: -- Pageserver. Scalable storage backend for the compute nodes. -- Safekeepers. The safekeepers form a redundant WAL service that received WAL from the compute node, and stores it durably until it has been processed by the pageserver and uploaded to cloud storage. +- Pageserver: Scalable storage backend for the compute nodes. 
+- Safekeepers: The safekeepers form a redundant WAL service that received WAL from the compute node, and stores it durably until it has been processed by the pageserver and uploaded to cloud storage. See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more information. @@ -81,9 +81,9 @@ The project uses [rust toolchain file](./rust-toolchain.toml) to define the vers This file is automatically picked up by [`rustup`](https://rust-lang.github.io/rustup/overrides.html#the-toolchain-file) that installs (if absent) and uses the toolchain version pinned in the file. -rustup users who want to build with another toolchain can use [`rustup override`](https://rust-lang.github.io/rustup/overrides.html#directory-overrides) command to set a specific toolchain for the project's directory. +rustup users who want to build with another toolchain can use the [`rustup override`](https://rust-lang.github.io/rustup/overrides.html#directory-overrides) command to set a specific toolchain for the project's directory. -non-rustup users most probably are not getting the same toolchain automatically from the file, so are responsible to manually verify their toolchain matches the version in the file. +non-rustup users most probably are not getting the same toolchain automatically from the file, so are responsible to manually verify that their toolchain matches the version in the file. Newer rustc versions most probably will work fine, yet older ones might not be supported due to some new features used by the project or the crates. #### Building on Linux @@ -124,7 +124,7 @@ make -j`sysctl -n hw.logicalcpu` -s To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `pg_install/bin` and `pg_install/lib`, respectively. To run the integration tests or Python scripts (not required to use the code), install -Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (requires [poetry>=1.3](https://python-poetry.org/)) in the project directory. +Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.3](https://python-poetry.org/)) in the project directory. #### Running neon database @@ -166,7 +166,7 @@ Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55432/postgres' 2. Now, it is possible to connect to postgres and run some queries: ```text -> psql -p55432 -h 127.0.0.1 -U cloud_admin postgres +> psql -p 55432 -h 127.0.0.1 -U cloud_admin postgres postgres=# CREATE TABLE t(key int primary key, value text); CREATE TABLE postgres=# insert into t values(1,1); @@ -205,7 +205,7 @@ Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55434/postgres' # this new postgres instance will have all the data from 'main' postgres, # but all modifications would not affect data in original postgres -> psql -p55434 -h 127.0.0.1 -U cloud_admin postgres +> psql -p 55434 -h 127.0.0.1 -U cloud_admin postgres postgres=# select * from t; key | value -----+------- @@ -216,7 +216,7 @@ postgres=# insert into t values(2,2); INSERT 0 1 # check that the new change doesn't affect the 'main' postgres -> psql -p55432 -h 127.0.0.1 -U cloud_admin postgres +> psql -p 55432 -h 127.0.0.1 -U cloud_admin postgres postgres=# select * from t; key | value -----+------- @@ -224,7 +224,7 @@ postgres=# select * from t; (1 row) ``` -4. If you want to run tests afterward (see below), you must stop all the running of the pageserver, safekeeper, and postgres instances +4. 
If you want to run tests afterwards (see below), you must stop all the running pageserver, safekeeper, and postgres instances you have just started. You can terminate them all with one command: ```sh > cargo neon stop @@ -243,7 +243,7 @@ CARGO_BUILD_FLAGS="--features=testing" make ``` By default, this runs both debug and release modes, and all supported postgres versions. When -testing locally, it is convenient to run just run one set of permutations, like this: +testing locally, it is convenient to run just one set of permutations, like this: ```sh DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest From 24e916d37fbea209229caf5b3cbc3cd639d1ab63 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 2 Feb 2024 10:35:09 +0000 Subject: [PATCH 0056/1571] pageserver: fix a syntax error in swagger (#6566) A description was written as a follow-on to a section line, rather than in the proper `description:` part. This caused swagger parsers to rightly reject it. --- pageserver/src/http/openapi_spec.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index e2a2865145..3694385cab 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -1443,7 +1443,8 @@ components: node_id: description: Pageserver node ID where this shard is attached type: integer - shard_id: Tenant shard ID of the shard + shard_id: + description: Tenant shard ID of the shard type: string SecondaryConfig: type: object From 30c9e145d79b3a6e989e824b5cfd9c47a79a6dcc Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 2 Feb 2024 10:51:20 +0000 Subject: [PATCH 0057/1571] check-macos-build: switch job to macos-14 (M1) (#6539) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem - GitHub made available `macos-14` runners, and they run on M1 processors[0] - The price is the same as Intel-based runners — "macOS | 3 or 4 (M1 or Intel) | $0.08"[1], but runners on Apple Silicon should be significantly faster than their Intel counterparts. - Most developers who use macOS use Apple Silicon-based Macs nowadays. - [0] https://github.blog/changelog/2024-01-30-github-actions-introducing-the-new-m1-macos-runner-available-to-open-source/ - [1] https://docs.github.com/en/billing/managing-billing-for-github-actions/about-billing-for-github-actions#per-minute-rates ## Summary of changes - Run `check-macos-build` on `macos-14` --- .github/actionlint.yml | 2 ++ .github/workflows/neon_extra_builds.yml | 12 ++++++------ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 362480f256..cb36e2eee6 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -4,6 +4,8 @@ self-hosted-runner: - dev - gen3 - large + # Remove `macos-14` from the list after https://github.com/rhysd/actionlint/pull/392 is merged. 
+ - macos-14 - small - us-east-2 config-variables: diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index f8fb62d3f8..c90ef60074 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -26,7 +26,7 @@ jobs: contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || github.ref_name == 'main' timeout-minutes: 90 - runs-on: macos-latest + runs-on: macos-14 env: # Use release build only, to have less debug info around @@ -60,21 +60,21 @@ jobs: uses: actions/cache@v3 with: path: pg_install/v14 - key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Cache postgres v15 build id: cache_pg_15 uses: actions/cache@v3 with: path: pg_install/v15 - key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Cache postgres v16 build id: cache_pg_16 uses: actions/cache@v3 with: path: pg_install/v16 - key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Set extra env for macOS run: | @@ -89,7 +89,7 @@ jobs: !~/.cargo/registry/src ~/.cargo/git target - key: v1-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust + key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust - name: Build postgres v14 if: steps.cache_pg_14.outputs.cache-hit != 'true' @@ -110,7 +110,7 @@ jobs: run: make walproposer-lib -j$(sysctl -n hw.ncpu) - name: Run cargo build - run: cargo build --all --release + run: PQ_LIB_DIR=$(pwd)/pg_install/v16/lib cargo build --all --release - name: Check that no warnings are produced run: ./run_clippy.sh From 4133d14a7785a4ac4a1847ebda1dcf22992b906a Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 2 Feb 2024 11:49:11 +0000 Subject: [PATCH 0058/1571] Compute: pgbouncer 1.22.0 (#6582) ## Problem Update pgbouncer from 1.21 (and patches[0][1]) to 1.22 (which includes these patches) - [0] https://github.com/pgbouncer/pgbouncer/pull/972 - [1] https://github.com/pgbouncer/pgbouncer/pull/998 ## Summary of changes - Build pgbouncer 1.22.0 for neonVMs from upstream --- vm-image-spec.yaml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index bbe80ceeb1..16ceb06617 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -174,11 +174,10 @@ build: | libtool \ pkg-config - # Note, we use pgbouncer from neondatabase/pgbouncer fork, which could contain extra commits. 
# Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc) - ENV PGBOUNCER_TAG pgbouncer_1_21_0-neon-1 + ENV PGBOUNCER_TAG pgbouncer_1_22_0 RUN set -e \ - && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/neondatabase/pgbouncer.git pgbouncer \ + && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \ && cd pgbouncer \ && ./autogen.sh \ && LDFLAGS=-static ./configure --prefix=/usr/local/pgbouncer --without-openssl \ From 0856fe6676e7cf8d928c0da5a6036e58b360b00b Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 2 Feb 2024 12:28:48 +0000 Subject: [PATCH 0059/1571] proxy: remove per client bytes (#5466) ## Problem Follow up to #5461 In my memory usage/fragmentation measurements, these metrics came up as a large source of small allocations. The replacement metric has been in use for a long time now so I think it's good to finally remove this. Per-endpoint data is still tracked elsewhere ## Summary of changes remove the per-client bytes metrics --- proxy/src/console/messages.rs | 25 ------------------------- proxy/src/metrics.rs | 9 --------- proxy/src/proxy/passthrough.rs | 6 +----- 3 files changed, 1 insertion(+), 39 deletions(-) diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs index 6ef9bcf4eb..4e5920436f 100644 --- a/proxy/src/console/messages.rs +++ b/proxy/src/console/messages.rs @@ -100,31 +100,6 @@ pub struct MetricsAuxInfo { pub branch_id: BranchId, } -impl MetricsAuxInfo { - /// Definitions of labels for traffic metric. - pub const TRAFFIC_LABELS: &'static [&'static str] = &[ - // Received (rx) / sent (tx). - "direction", - // ID of a project. - "project_id", - // ID of an endpoint within a project. - "endpoint_id", - // ID of a branch within a project (snapshot). - "branch_id", - ]; - - /// Values of labels for traffic metric. - // TODO: add more type safety (validate arity & positions). 
- pub fn traffic_labels(&self, direction: &'static str) -> [&str; 4] { - [ - direction, - &self.project_id, - &self.endpoint_id, - &self.branch_id, - ] - } -} - #[cfg(test)] mod tests { use super::*; diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index c7d566f645..fa663d8ff6 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -208,15 +208,6 @@ pub static NUM_WAKEUP_FAILURES: Lazy = Lazy::new(|| { .unwrap() }); -pub static NUM_BYTES_PROXIED_PER_CLIENT_COUNTER: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_io_bytes_per_client", - "Number of bytes sent/received between client and backend.", - crate::console::messages::MetricsAuxInfo::TRAFFIC_LABELS, - ) - .unwrap() -}); - pub static NUM_BYTES_PROXIED_COUNTER: Lazy = Lazy::new(|| { register_int_counter_vec!( "proxy_io_bytes", diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index d6f097d72d..53e0c3c8f3 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -1,7 +1,7 @@ use crate::{ console::messages::MetricsAuxInfo, context::RequestMonitoring, - metrics::{NUM_BYTES_PROXIED_COUNTER, NUM_BYTES_PROXIED_PER_CLIENT_COUNTER}, + metrics::NUM_BYTES_PROXIED_COUNTER, usage_metrics::{Ids, USAGE_METRICS}, }; use tokio::io::{AsyncRead, AsyncWrite}; @@ -25,27 +25,23 @@ pub async fn proxy_pass( }); let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]); - let m_sent2 = NUM_BYTES_PROXIED_PER_CLIENT_COUNTER.with_label_values(&aux.traffic_labels("tx")); let mut client = MeasuredStream::new( client, |_| {}, |cnt| { // Number of bytes we sent to the client (outbound). m_sent.inc_by(cnt as u64); - m_sent2.inc_by(cnt as u64); usage.record_egress(cnt as u64); }, ); let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["rx"]); - let m_recv2 = NUM_BYTES_PROXIED_PER_CLIENT_COUNTER.with_label_values(&aux.traffic_labels("rx")); let mut compute = MeasuredStream::new( compute, |_| {}, |cnt| { // Number of bytes the client sent to the compute node (inbound). m_recv.inc_by(cnt as u64); - m_recv2.inc_by(cnt as u64); }, ); From 48b05b7c503e3871d34f413211695fc5a2250a54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 2 Feb 2024 14:52:12 +0100 Subject: [PATCH 0060/1571] Add a time_travel_remote_storage http endpoint (#6533) Adds an endpoint to the pageserver to S3-recover an entire tenant to a specific given timestamp. Required input parameters: * `travel_to`: the target timestamp to recover the S3 state to * `done_if_after`: a timestamp that marks the beginning of the recovery process. retries of the query should keep this value constant. it *must* be after `travel_to`, and also after any changes we want to revert, and must represent a point in time before the endpoint is being called, all of these time points in terms of the time source used by S3. these criteria need to hold even in the face of clock differences, so I recommend waiting a specific amount of time, then taking `done_if_after`, then waiting some amount of time again, and only then issuing the request. Also important to note: the timestamps in S3 work at second accuracy, so one needs to add generous waits before and after for the process to work smoothly (at least 2-3 seconds). We ignore the added test for the mocked S3 for now due to a limitation in moto: https://github.com/getmoto/moto/issues/7300 . 
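The timestamp discipline described above is easier to see as code. A minimal client-side sketch, assuming a 4-second safety margin and a caller-provided `issue_request` callback (both are illustrative, not part of this API):

    use std::thread::sleep;
    use std::time::{Duration, SystemTime};

    // Sketch of the recommended calling pattern: `travel_to` points at the state
    // to recover to, `done_if_after` is captured strictly later, with generous
    // margins because S3 object timestamps have only second granularity.
    fn run_recovery(travel_to: SystemTime, issue_request: impl Fn(SystemTime, SystemTime)) {
        sleep(Duration::from_secs(4)); // margin after the changes being reverted
        let done_if_after = SystemTime::now(); // retries must reuse this exact value
        sleep(Duration::from_secs(4)); // margin before the endpoint is called
        assert!(travel_to < done_if_after);
        // e.g. PUT /v1/tenant/{tenant_shard_id}/time_travel_remote_storage
        issue_request(travel_to, done_if_after);
    }

The test added in this patch follows the same pattern, sleeping 4 seconds on either side of the captured timestamps before calling `tenant_time_travel_remote_storage`.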
Part of https://github.com/neondatabase/cloud/issues/8233 --- libs/remote_storage/src/azure_blob.rs | 7 +- libs/remote_storage/src/lib.rs | 43 ++++++- libs/remote_storage/src/local_fs.rs | 8 +- libs/remote_storage/src/s3_bucket.rs | 54 ++++---- libs/remote_storage/src/simulate_failures.rs | 7 +- pageserver/src/http/openapi_spec.yml | 58 +++++++++ pageserver/src/http/routes.rs | 79 ++++++++++++ pageserver/src/tenant/mgr.rs | 11 ++ .../src/tenant/remote_timeline_client.rs | 5 + .../tenant/remote_timeline_client/upload.rs | 46 ++++++- test_runner/fixtures/pageserver/http.py | 15 +++ test_runner/fixtures/pageserver/utils.py | 27 +++- test_runner/regress/test_s3_restore.py | 121 ++++++++++++++++++ 13 files changed, 445 insertions(+), 36 deletions(-) create mode 100644 test_runner/regress/test_s3_restore.py diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index abab32470b..57c57a2b70 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -28,6 +28,7 @@ use tokio_util::sync::CancellationToken; use tracing::debug; use crate::s3_bucket::RequestKind; +use crate::TimeTravelError; use crate::{ AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata, @@ -379,12 +380,10 @@ impl RemoteStorage for AzureBlobStorage { _timestamp: SystemTime, _done_if_after: SystemTime, _cancel: CancellationToken, - ) -> anyhow::Result<()> { + ) -> Result<(), TimeTravelError> { // TODO use Azure point in time recovery feature for this // https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview - Err(anyhow::anyhow!( - "time travel recovery for azure blob storage is not implemented" - )) + Err(TimeTravelError::Unimplemented) } } diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 38a8784fe2..4aeaee70b1 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -219,7 +219,7 @@ pub trait RemoteStorage: Send + Sync + 'static { timestamp: SystemTime, done_if_after: SystemTime, cancel: CancellationToken, - ) -> anyhow::Result<()>; + ) -> Result<(), TimeTravelError>; } pub type DownloadStream = Pin> + Unpin + Send + Sync>>; @@ -269,6 +269,45 @@ impl std::fmt::Display for DownloadError { impl std::error::Error for DownloadError {} +#[derive(Debug)] +pub enum TimeTravelError { + /// Validation or other error happened due to user input. + BadInput(anyhow::Error), + /// The used remote storage does not have time travel recovery implemented + Unimplemented, + /// The number of versions/deletion markers is above our limit. + TooManyVersions, + /// A cancellation token aborted the process, typically during + /// request closure or process shutdown. 
+ Cancelled, + /// Other errors + Other(anyhow::Error), +} + +impl std::fmt::Display for TimeTravelError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + TimeTravelError::BadInput(e) => { + write!( + f, + "Failed to time travel recover a prefix due to user input: {e}" + ) + } + TimeTravelError::Unimplemented => write!( + f, + "time travel recovery is not implemented for the current storage backend" + ), + TimeTravelError::Cancelled => write!(f, "Cancelled, shutting down"), + TimeTravelError::TooManyVersions => { + write!(f, "Number of versions/delete markers above limit") + } + TimeTravelError::Other(e) => write!(f, "Failed to time travel recover a prefix: {e:?}"), + } + } +} + +impl std::error::Error for TimeTravelError {} + /// Every storage, currently supported. /// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics. #[derive(Clone)] @@ -404,7 +443,7 @@ impl GenericRemoteStorage> { timestamp: SystemTime, done_if_after: SystemTime, cancel: CancellationToken, - ) -> anyhow::Result<()> { + ) -> Result<(), TimeTravelError> { match self { Self::LocalFs(s) => { s.time_travel_recover(prefix, timestamp, done_if_after, cancel) diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 34a6658a69..d47fa75b37 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -18,7 +18,9 @@ use tokio_util::{io::ReaderStream, sync::CancellationToken}; use tracing::*; use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty}; -use crate::{Download, DownloadError, DownloadStream, Listing, ListingMode, RemotePath}; +use crate::{ + Download, DownloadError, DownloadStream, Listing, ListingMode, RemotePath, TimeTravelError, +}; use super::{RemoteStorage, StorageMetadata}; @@ -430,8 +432,8 @@ impl RemoteStorage for LocalFs { _timestamp: SystemTime, _done_if_after: SystemTime, _cancel: CancellationToken, - ) -> anyhow::Result<()> { - unimplemented!() + ) -> Result<(), TimeTravelError> { + Err(TimeTravelError::Unimplemented) } } diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index e615a1ce7e..4d6564cba6 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -46,7 +46,7 @@ use utils::backoff; use super::StorageMetadata; use crate::{ ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, - S3Config, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR, + S3Config, TimeTravelError, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR, }; pub(super) mod metrics; @@ -639,14 +639,14 @@ impl RemoteStorage for S3Bucket { timestamp: SystemTime, done_if_after: SystemTime, cancel: CancellationToken, - ) -> anyhow::Result<()> { + ) -> Result<(), TimeTravelError> { let kind = RequestKind::TimeTravel; let _guard = self.permit(kind).await; let timestamp = DateTime::from(timestamp); let done_if_after = DateTime::from(done_if_after); - tracing::info!("Target time: {timestamp:?}, done_if_after {done_if_after:?}"); + tracing::trace!("Target time: {timestamp:?}, done_if_after {done_if_after:?}"); // get the passed prefix or if it is not set use prefix_in_bucket value let prefix = prefix @@ -664,21 +664,21 @@ impl RemoteStorage for S3Bucket { loop { let response = backoff::retry( || async { - Ok(self - .client + self.client .list_object_versions() .bucket(self.bucket_name.clone()) .set_prefix(prefix.clone()) .set_key_marker(key_marker.clone()) 
.set_version_id_marker(version_id_marker.clone()) .send() - .await?) + .await + .map_err(|e| TimeTravelError::Other(e.into())) }, is_permanent, warn_threshold, max_retries, "listing object versions for time_travel_recover", - backoff::Cancel::new(cancel.clone(), || anyhow!("Cancelled")), + backoff::Cancel::new(cancel.clone(), || TimeTravelError::Cancelled), ) .await?; @@ -699,7 +699,8 @@ impl RemoteStorage for S3Bucket { .map(VerOrDelete::from_delete_marker); itertools::process_results(versions.chain(deletes), |n_vds| { versions_and_deletes.extend(n_vds) - })?; + }) + .map_err(TimeTravelError::Other)?; fn none_if_empty(v: Option) -> Option { v.filter(|v| !v.is_empty()) } @@ -708,9 +709,9 @@ impl RemoteStorage for S3Bucket { if version_id_marker.is_none() { // The final response is not supposed to be truncated if response.is_truncated.unwrap_or_default() { - anyhow::bail!( + return Err(TimeTravelError::Other(anyhow::anyhow!( "Received truncated ListObjectVersions response for prefix={prefix:?}" - ); + ))); } break; } @@ -721,12 +722,15 @@ impl RemoteStorage for S3Bucket { // 40 seconds, and roughly corresponds to tenants of 2 TiB physical size. const COMPLEXITY_LIMIT: usize = 100_000; if versions_and_deletes.len() >= COMPLEXITY_LIMIT { - anyhow::bail!( - "Limit for number of versions/deletions exceeded for prefix={prefix:?}" - ); + return Err(TimeTravelError::TooManyVersions); } } + tracing::info!( + "Built list for time travel with {} versions and deletions", + versions_and_deletes.len() + ); + // Work on the list of references instead of the objects directly, // otherwise we get lifetime errors in the sort_by_key call below. let mut versions_and_deletes = versions_and_deletes.iter().collect::>(); @@ -740,8 +744,8 @@ impl RemoteStorage for S3Bucket { version_id, key, .. } = &vd; if version_id == "null" { - anyhow::bail!("Received ListVersions response for key={key} with version_id='null', \ - indicating either disabled versioning, or legacy objects with null version id values"); + return Err(TimeTravelError::Other(anyhow!("Received ListVersions response for key={key} with version_id='null', \ + indicating either disabled versioning, or legacy objects with null version id values"))); } tracing::trace!( "Parsing version key={key} version_id={version_id} kind={:?}", @@ -788,22 +792,23 @@ impl RemoteStorage for S3Bucket { backoff::retry( || async { - Ok(self - .client + self.client .copy_object() .bucket(self.bucket_name.clone()) .key(key) .copy_source(&source_id) .send() - .await?) 
+ .await + .map_err(|e| TimeTravelError::Other(e.into())) }, is_permanent, warn_threshold, max_retries, - "listing object versions for time_travel_recover", - backoff::Cancel::new(cancel.clone(), || anyhow!("Cancelled")), + "copying object version for time_travel_recover", + backoff::Cancel::new(cancel.clone(), || TimeTravelError::Cancelled), ) .await?; + tracing::info!(%version_id, %key, "Copied old version in S3"); } VerOrDelete { kind: VerOrDeleteKind::DeleteMarker, @@ -820,8 +825,13 @@ impl RemoteStorage for S3Bucket { } else { tracing::trace!("Deleting {key}..."); - let oid = ObjectIdentifier::builder().key(key.to_owned()).build()?; - self.delete_oids(kind, &[oid]).await?; + let oid = ObjectIdentifier::builder() + .key(key.to_owned()) + .build() + .map_err(|e| TimeTravelError::Other(anyhow::Error::new(e)))?; + self.delete_oids(kind, &[oid]) + .await + .map_err(TimeTravelError::Other)?; } } } diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index fc4c4b315b..ee9792232a 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -11,7 +11,7 @@ use tokio_util::sync::CancellationToken; use crate::{ Download, DownloadError, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorage, - StorageMetadata, + StorageMetadata, TimeTravelError, }; pub struct UnreliableWrapper { @@ -191,8 +191,9 @@ impl RemoteStorage for UnreliableWrapper { timestamp: SystemTime, done_if_after: SystemTime, cancel: CancellationToken, - ) -> anyhow::Result<()> { - self.attempt(RemoteOp::TimeTravelRecover(prefix.map(|p| p.to_owned())))?; + ) -> Result<(), TimeTravelError> { + self.attempt(RemoteOp::TimeTravelRecover(prefix.map(|p| p.to_owned()))) + .map_err(|e| TimeTravelError::Other(anyhow::Error::new(e)))?; self.inner .time_travel_recover(prefix, timestamp, done_if_after, cancel) .await diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 3694385cab..a6fe7c67e1 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -178,6 +178,64 @@ paths: schema: $ref: "#/components/schemas/ServiceUnavailableError" + /v1/tenant/{tenant_id}/time_travel_remote_storage: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + - name: travel_to + in: query + required: true + schema: + type: string + format: date-time + - name: done_if_after + in: query + required: true + schema: + type: string + format: date-time + put: + description: Time travel the tenant's remote storage + responses: + "200": + description: OK + content: + application/json: + schema: + type: string + "400": + description: Error when no tenant id found in path or invalid timestamp + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + "401": + description: Unauthorized Error + content: + application/json: + schema: + $ref: "#/components/schemas/UnauthorizedError" + "403": + description: Forbidden Error + content: + application/json: + schema: + $ref: "#/components/schemas/ForbiddenError" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + "503": + description: Temporarily unavailable, please retry. 
+ content: + application/json: + schema: + $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/timeline: parameters: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 9d062c50f2..88c36e8595 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -26,6 +26,7 @@ use pageserver_api::models::{ }; use pageserver_api::shard::TenantShardId; use remote_storage::GenericRemoteStorage; +use remote_storage::TimeTravelError; use tenant_size_model::{SizeResult, StorageModel}; use tokio_util::sync::CancellationToken; use tracing::*; @@ -47,6 +48,7 @@ use crate::tenant::mgr::{ TenantSlotError, TenantSlotUpsertError, TenantStateError, }; use crate::tenant::mgr::{TenantSlot, UpsertLocationError}; +use crate::tenant::remote_timeline_client; use crate::tenant::secondary::SecondaryController; use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; @@ -1424,6 +1426,79 @@ async fn list_location_config_handler( json_response(StatusCode::OK, result) } +// Do a time travel recovery on the given tenant/tenant shard. Tenant needs to be detached +// (from all pageservers) as it invalidates consistency assumptions. +async fn tenant_time_travel_remote_storage_handler( + request: Request, + cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let timestamp_raw = must_get_query_param(&request, "travel_to")?; + let timestamp = humantime::parse_rfc3339(×tamp_raw) + .with_context(|| format!("Invalid time for travel_to: {timestamp_raw:?}")) + .map_err(ApiError::BadRequest)?; + + let done_if_after_raw = must_get_query_param(&request, "done_if_after")?; + let done_if_after = humantime::parse_rfc3339(&done_if_after_raw) + .with_context(|| format!("Invalid time for done_if_after: {done_if_after_raw:?}")) + .map_err(ApiError::BadRequest)?; + + // This is just a sanity check to fend off naive wrong usages of the API: + // the tenant needs to be detached *everywhere* + let state = get_state(&request); + let we_manage_tenant = state.tenant_manager.manages_tenant_shard(tenant_shard_id); + if we_manage_tenant { + return Err(ApiError::BadRequest(anyhow!( + "Tenant {tenant_shard_id} is already attached at this pageserver" + ))); + } + + let Some(storage) = state.remote_storage.as_ref() else { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "remote storage not configured, cannot run time travel" + ))); + }; + + if timestamp > done_if_after { + return Err(ApiError::BadRequest(anyhow!( + "The done_if_after timestamp comes before the timestamp to recover to" + ))); + } + + tracing::info!("Issuing time travel request internally. 
timestamp={timestamp_raw}, done_if_after={done_if_after_raw}"); + + remote_timeline_client::upload::time_travel_recover_tenant( + storage, + &tenant_shard_id, + timestamp, + done_if_after, + &cancel, + ) + .await + .map_err(|e| match e { + TimeTravelError::BadInput(e) => { + warn!("bad input error: {e}"); + ApiError::BadRequest(anyhow!("bad input error")) + } + TimeTravelError::Unimplemented => { + ApiError::BadRequest(anyhow!("unimplemented for the configured remote storage")) + } + TimeTravelError::Cancelled => ApiError::InternalServerError(anyhow!("cancelled")), + TimeTravelError::TooManyVersions => { + ApiError::InternalServerError(anyhow!("too many versions in remote storage")) + } + TimeTravelError::Other(e) => { + warn!("internal error: {e}"); + ApiError::InternalServerError(anyhow!("internal error")) + } + })?; + + json_response(StatusCode::OK, ()) +} + /// Testing helper to transition a tenant to [`crate::tenant::TenantState::Broken`]. async fn handle_tenant_break( r: Request, @@ -1969,6 +2044,10 @@ pub fn make_router( .get("/v1/location_config", |r| { api_handler(r, list_location_config_handler) }) + .put( + "/v1/tenant/:tenant_shard_id/time_travel_remote_storage", + |r| api_handler(r, tenant_time_travel_remote_storage_handler), + ) .get("/v1/tenant/:tenant_shard_id/timeline", |r| { api_handler(r, timeline_list_handler) }) diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 949db3c543..64fd709386 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -898,6 +898,17 @@ impl TenantManager { } } + /// Whether the `TenantManager` is responsible for the tenant shard + pub(crate) fn manages_tenant_shard(&self, tenant_shard_id: TenantShardId) -> bool { + let locked = self.tenants.read().unwrap(); + + let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read) + .ok() + .flatten(); + + peek_slot.is_some() + } + #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))] pub(crate) async fn upsert_location( &self, diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 80ff5c9a2d..2e429ee9bc 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1719,6 +1719,11 @@ pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath { RemotePath::from_string(&path).expect("Failed to construct path") } +fn remote_timelines_path_unsharded(tenant_id: &TenantId) -> RemotePath { + let path = format!("tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}"); + RemotePath::from_string(&path).expect("Failed to construct path") +} + pub fn remote_timeline_path( tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index 58d95f75c2..76df9ba5c4 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -5,9 +5,11 @@ use camino::Utf8Path; use fail::fail_point; use pageserver_api::shard::TenantShardId; use std::io::{ErrorKind, SeekFrom}; +use std::time::SystemTime; use tokio::fs::{self, File}; use tokio::io::AsyncSeekExt; use tokio_util::sync::CancellationToken; +use utils::backoff; use super::Generation; use crate::{ @@ -17,7 +19,7 @@ use crate::{ remote_initdb_preserved_archive_path, remote_path, upload_cancellable, }, }; -use 
remote_storage::GenericRemoteStorage; +use remote_storage::{GenericRemoteStorage, TimeTravelError}; use utils::id::{TenantId, TimelineId}; use super::index::LayerFileMetadata; @@ -157,3 +159,45 @@ pub(crate) async fn preserve_initdb_archive( .await .with_context(|| format!("backing up initdb archive for '{tenant_id} / {timeline_id}'")) } + +pub(crate) async fn time_travel_recover_tenant( + storage: &GenericRemoteStorage, + tenant_shard_id: &TenantShardId, + timestamp: SystemTime, + done_if_after: SystemTime, + cancel: &CancellationToken, +) -> Result<(), TimeTravelError> { + let warn_after = 3; + let max_attempts = 10; + let mut prefixes = Vec::with_capacity(2); + if tenant_shard_id.is_zero() { + // Also recover the unsharded prefix for a shard of zero: + // - if the tenant is totally unsharded, the unsharded prefix contains all the data + // - if the tenant is sharded, we still want to recover the initdb data, but we only + // want to do it once, so let's do it on the 0 shard + let timelines_path_unsharded = + super::remote_timelines_path_unsharded(&tenant_shard_id.tenant_id); + prefixes.push(timelines_path_unsharded); + } + if !tenant_shard_id.is_unsharded() { + // If the tenant is sharded, we need to recover the sharded prefix + let timelines_path = super::remote_timelines_path(tenant_shard_id); + prefixes.push(timelines_path); + } + for prefix in &prefixes { + backoff::retry( + || async { + storage + .time_travel_recover(Some(prefix), timestamp, done_if_after, cancel.clone()) + .await + }, + |e| !matches!(e, TimeTravelError::Other(_)), + warn_after, + max_attempts, + "time travel recovery of tenant prefix", + backoff::Cancel::new(cancel.clone(), || TimeTravelError::Cancelled), + ) + .await?; + } + Ok(()) +} diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 65675aebe1..1a8765d830 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -4,6 +4,7 @@ import json import time from collections import defaultdict from dataclasses import dataclass +from datetime import datetime from typing import Any, Dict, List, Optional, Set, Tuple, Union import requests @@ -389,6 +390,20 @@ class PageserverHttpClient(requests.Session): ) return res.text + def tenant_time_travel_remote_storage( + self, + tenant_id: Union[TenantId, TenantShardId], + timestamp: datetime, + done_if_after: datetime, + ): + """ + Issues a request to perform time travel operations on the remote storage + """ + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/time_travel_remote_storage?travel_to={timestamp.isoformat()}Z&done_if_after={done_if_after.isoformat()}Z" + ) + self.verbose_error(res) + def timeline_list( self, tenant_id: Union[TenantId, TenantShardId], diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 6b2651e447..4cfdee6e01 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -1,7 +1,11 @@ import time from typing import Any, Dict, List, Optional, Union -from mypy_boto3_s3.type_defs import ListObjectsV2OutputTypeDef, ObjectTypeDef +from mypy_boto3_s3.type_defs import ( + EmptyResponseMetadataTypeDef, + ListObjectsV2OutputTypeDef, + ObjectTypeDef, +) from fixtures.log_helper import log from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient @@ -346,6 +350,27 @@ def list_prefix( return response +def enable_remote_storage_versioning( + remote: RemoteStorage, +) -> 
EmptyResponseMetadataTypeDef: + """ + Enable S3 versioning for the remote storage + """ + # local_fs has no + assert isinstance(remote, S3Storage), "localfs is currently not supported" + assert remote.client is not None + + # Note that this doesnt use pagination, so list is not guaranteed to be exhaustive. + response = remote.client.put_bucket_versioning( + Bucket=remote.bucket_name, + VersioningConfiguration={ + "MFADelete": "Disabled", + "Status": "Enabled", + }, + ) + return response + + def wait_tenant_status_404( pageserver_http: PageserverHttpClient, tenant_id: TenantId, diff --git a/test_runner/regress/test_s3_restore.py b/test_runner/regress/test_s3_restore.py new file mode 100644 index 0000000000..188d8a3b33 --- /dev/null +++ b/test_runner/regress/test_s3_restore.py @@ -0,0 +1,121 @@ +import time +from datetime import datetime, timezone + +import pytest +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + PgBin, +) +from fixtures.pageserver.utils import ( + MANY_SMALL_LAYERS_TENANT_CONFIG, + assert_prefix_empty, + enable_remote_storage_versioning, + poll_for_remote_storage_iterations, + tenant_delete_wait_completed, + wait_for_upload, +) +from fixtures.remote_storage import RemoteStorageKind, s3_storage +from fixtures.types import Lsn +from fixtures.utils import run_pg_bench_small + + +def test_tenant_s3_restore( + neon_env_builder: NeonEnvBuilder, + pg_bin: PgBin, +): + remote_storage_kind = s3_storage() + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + + # Mock S3 doesn't have versioning enabled by default, enable it + # (also do it before there is any writes to the bucket) + if remote_storage_kind == RemoteStorageKind.MOCK_S3: + remote_storage = neon_env_builder.pageserver_remote_storage + assert remote_storage, "remote storage not configured" + enable_remote_storage_versioning(remote_storage) + pytest.skip("moto doesn't support self-copy: https://github.com/getmoto/moto/issues/7300") + + env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) + env.pageserver.allowed_errors.extend( + [ + # The deletion queue will complain when it encounters simulated S3 errors + ".*deletion executor: DeleteObjects request failed.*", + # lucky race with stopping from flushing a layer we fail to schedule any uploads + ".*layer flush task.+: could not flush frozen layer: update_metadata_file", + ] + ) + + ps_http = env.pageserver.http_client() + + tenant_id = env.initial_tenant + + # Default tenant and the one we created + assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1 + + # create two timelines one being the parent of another, both with non-trivial data + parent = None + last_flush_lsns = [] + + for timeline in ["first", "second"]: + timeline_id = env.neon_cli.create_branch( + timeline, tenant_id=tenant_id, ancestor_branch_name=parent + ) + with env.endpoints.create_start(timeline, tenant_id=tenant_id) as endpoint: + run_pg_bench_small(pg_bin, endpoint.connstr()) + endpoint.safe_psql(f"CREATE TABLE created_{timeline}(id integer);") + last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + last_flush_lsns.append(last_flush_lsn) + ps_http.timeline_checkpoint(tenant_id, timeline_id) + wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn) + parent = timeline + + # These sleeps are important because they fend off differences in clocks between us and S3 + time.sleep(4) + ts_before_deletion = datetime.now(tz=timezone.utc).replace(tzinfo=None) + time.sleep(4) + + assert ( + 
ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1 + ), "tenant removed before we deletion was issued" + iterations = poll_for_remote_storage_iterations(remote_storage_kind) + tenant_delete_wait_completed(ps_http, tenant_id, iterations) + ps_http.deletion_queue_flush(execute=True) + assert ( + ps_http.get_metric_value("pageserver_tenant_manager_slots") == 0 + ), "tenant removed before we deletion was issued" + env.attachment_service.attach_hook_drop(tenant_id) + + tenant_path = env.pageserver.tenant_dir(tenant_id) + assert not tenant_path.exists() + + assert_prefix_empty( + neon_env_builder.pageserver_remote_storage, + prefix="/".join( + ( + "tenants", + str(tenant_id), + ) + ), + ) + + time.sleep(4) + ts_after_deletion = datetime.now(tz=timezone.utc).replace(tzinfo=None) + time.sleep(4) + + ps_http.tenant_time_travel_remote_storage( + tenant_id, timestamp=ts_before_deletion, done_if_after=ts_after_deletion + ) + + generation = env.attachment_service.attach_hook_issue(tenant_id, env.pageserver.id) + + ps_http.tenant_attach(tenant_id, generation=generation) + env.pageserver.quiesce_tenants() + + for i, timeline in enumerate(["first", "second"]): + with env.endpoints.create_start(timeline, tenant_id=tenant_id) as endpoint: + endpoint.safe_psql(f"SELECT * FROM created_{timeline};") + last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + expected_last_flush_lsn = last_flush_lsns[i] + # There might be some activity that advances the lsn so we can't use a strict equality check + assert last_flush_lsn >= expected_last_flush_lsn, "last_flush_lsn too old" + + assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1 From 56171cbe8c2b81ba2b949a5ec39c11991fb5e47a Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 2 Feb 2024 14:14:42 +0000 Subject: [PATCH 0061/1571] pageserver: more permissive activation timeout when testing (#6564) ## Problem The 5 second activation timeout is appropriate for production environments, where we want to give a prompt response to the cloud control plane, and if we fail it will retry the call. In tests however, we don't want every call to e.g. timeline create to have to come with a retry wrapper. This issue has always been there, but it is more apparent in sharding tests that concurrently attach several tenant shards. Closes: https://github.com/neondatabase/neon/issues/6563 ## Summary of changes When `testing` feature is enabled, make `ACTIVE_TENANT_TIMEOUT` 30 seconds instead of 5 seconds. --- pageserver/src/http/routes.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 88c36e8595..57ee746726 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -79,8 +79,14 @@ use utils::{ // For APIs that require an Active tenant, how long should we block waiting for that state? // This is not functionally necessary (clients will retry), but avoids generating a lot of // failed API calls while tenants are activating. +#[cfg(not(feature = "testing"))] const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000); +// Tests run on slow/oversubscribed nodes, and may need to wait much longer for tenants to +// finish attaching, if calls to remote storage are slow. 
+#[cfg(feature = "testing")] +const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000); + pub struct State { conf: &'static PageServerConf, tenant_manager: Arc, From 46fb1a90cee74aba8c66317deb18d634756ccfa7 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 2 Feb 2024 15:52:03 +0000 Subject: [PATCH 0062/1571] pageserver: avoid calculating/sending logical sizes on shard !=0 (#6567) ## Problem Sharded tenants only maintain accurate relation sizes on shard 0. Therefore logical size can only be calculated on shard 0. Fortunately it is also only _needed_ on shard 0, to provide Safekeeper feedback and to send consumption metrics. Closes: #6307 ## Summary of changes - Send 0 for logical size to safekeepers on shards !=0 - Skip logical size warmup task on shards !=0 - Skip imitate_layer_accesses on shards !=0 --- pageserver/src/tenant/timeline.rs | 91 +++++++++++-------- .../src/tenant/timeline/eviction_task.rs | 7 ++ .../src/tenant/timeline/logical_size.rs | 8 ++ .../walreceiver/walreceiver_connection.rs | 22 +++-- 4 files changed, 84 insertions(+), 44 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 168e565edb..e779f6f32e 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -124,7 +124,7 @@ pub(super) enum FlushLoopState { /// Wrapper for key range to provide reverse ordering by range length for BinaryHeap #[derive(Debug, Clone, PartialEq, Eq)] -pub struct Hole { +pub(crate) struct Hole { key_range: Range, coverage_size: usize, } @@ -565,19 +565,19 @@ impl From for PageReconstructError { /// Public interface functions impl Timeline { /// Get the LSN where this branch was created - pub fn get_ancestor_lsn(&self) -> Lsn { + pub(crate) fn get_ancestor_lsn(&self) -> Lsn { self.ancestor_lsn } /// Get the ancestor's timeline id - pub fn get_ancestor_timeline_id(&self) -> Option { + pub(crate) fn get_ancestor_timeline_id(&self) -> Option { self.ancestor_timeline .as_ref() .map(|ancestor| ancestor.timeline_id) } /// Lock and get timeline's GC cutoff - pub fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard { + pub(crate) fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard { self.latest_gc_cutoff_lsn.read() } @@ -733,27 +733,27 @@ impl Timeline { } /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev. - pub fn get_last_record_lsn(&self) -> Lsn { + pub(crate) fn get_last_record_lsn(&self) -> Lsn { self.last_record_lsn.load().last } - pub fn get_prev_record_lsn(&self) -> Lsn { + pub(crate) fn get_prev_record_lsn(&self) -> Lsn { self.last_record_lsn.load().prev } /// Atomically get both last and prev. - pub fn get_last_record_rlsn(&self) -> RecordLsn { + pub(crate) fn get_last_record_rlsn(&self) -> RecordLsn { self.last_record_lsn.load() } - pub fn get_disk_consistent_lsn(&self) -> Lsn { + pub(crate) fn get_disk_consistent_lsn(&self) -> Lsn { self.disk_consistent_lsn.load() } /// remote_consistent_lsn from the perspective of the tenant's current generation, /// not validated with control plane yet. /// See [`Self::get_remote_consistent_lsn_visible`]. - pub fn get_remote_consistent_lsn_projected(&self) -> Option { + pub(crate) fn get_remote_consistent_lsn_projected(&self) -> Option { if let Some(remote_client) = &self.remote_client { remote_client.remote_consistent_lsn_projected() } else { @@ -764,7 +764,7 @@ impl Timeline { /// remote_consistent_lsn which the tenant is guaranteed not to go backward from, /// i.e. 
a value of remote_consistent_lsn_projected which has undergone /// generation validation in the deletion queue. - pub fn get_remote_consistent_lsn_visible(&self) -> Option { + pub(crate) fn get_remote_consistent_lsn_visible(&self) -> Option { if let Some(remote_client) = &self.remote_client { remote_client.remote_consistent_lsn_visible() } else { @@ -775,7 +775,7 @@ impl Timeline { /// The sum of the file size of all historic layers in the layer map. /// This method makes no distinction between local and remote layers. /// Hence, the result **does not represent local filesystem usage**. - pub async fn layer_size_sum(&self) -> u64 { + pub(crate) async fn layer_size_sum(&self) -> u64 { let guard = self.layers.read().await; let layer_map = guard.layer_map(); let mut size = 0; @@ -785,7 +785,7 @@ impl Timeline { size } - pub fn resident_physical_size(&self) -> u64 { + pub(crate) fn resident_physical_size(&self) -> u64 { self.metrics.resident_physical_size_get() } @@ -861,7 +861,7 @@ impl Timeline { } /// Check that it is valid to request operations with that lsn. - pub fn check_lsn_is_in_scope( + pub(crate) fn check_lsn_is_in_scope( &self, lsn: Lsn, latest_gc_cutoff_lsn: &RcuReadGuard, @@ -877,7 +877,7 @@ impl Timeline { /// Flush to disk all data that was written with the put_* functions #[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))] - pub async fn freeze_and_flush(&self) -> anyhow::Result<()> { + pub(crate) async fn freeze_and_flush(&self) -> anyhow::Result<()> { self.freeze_inmem_layer(false).await; self.flush_frozen_layers_and_wait().await } @@ -1021,7 +1021,7 @@ impl Timeline { } /// Mutate the timeline with a [`TimelineWriter`]. - pub async fn writer(&self) -> TimelineWriter<'_> { + pub(crate) async fn writer(&self) -> TimelineWriter<'_> { TimelineWriter { tl: self, _write_guard: self.write_lock.lock().await, @@ -1033,7 +1033,7 @@ impl Timeline { /// /// Also flush after a period of time without new data -- it helps /// safekeepers to regard pageserver as caught up and suspend activity. - pub async fn check_checkpoint_distance(self: &Arc) -> anyhow::Result<()> { + pub(crate) async fn check_checkpoint_distance(self: &Arc) -> anyhow::Result<()> { let last_lsn = self.get_last_record_lsn(); let open_layer_size = { let guard = self.layers.read().await; @@ -1071,13 +1071,16 @@ impl Timeline { Ok(()) } - pub fn activate( + pub(crate) fn activate( self: &Arc, broker_client: BrokerClientChannel, background_jobs_can_start: Option<&completion::Barrier>, ctx: &RequestContext, ) { - self.spawn_initial_logical_size_computation_task(ctx); + if self.tenant_shard_id.is_zero() { + // Logical size is only maintained accurately on shard zero. 
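+ // Shards other than zero skip the warmup task; for them, get_current_logical_size()
+ // returns an approximate zero instead (see the change to that method further down in this diff).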
+ self.spawn_initial_logical_size_computation_task(ctx); + } self.launch_wal_receiver(ctx, broker_client); self.set_state(TimelineState::Active); self.launch_eviction_task(background_jobs_can_start); @@ -1172,7 +1175,7 @@ impl Timeline { self.gate.close().await; } - pub fn set_state(&self, new_state: TimelineState) { + pub(crate) fn set_state(&self, new_state: TimelineState) { match (self.current_state(), new_state) { (equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => { info!("Ignoring new state, equal to the existing one: {equal_state_2:?}"); @@ -1192,7 +1195,7 @@ impl Timeline { } } - pub fn set_broken(&self, reason: String) { + pub(crate) fn set_broken(&self, reason: String) { let backtrace_str: String = format!("{}", std::backtrace::Backtrace::force_capture()); let broken_state = TimelineState::Broken { reason, @@ -1206,27 +1209,27 @@ impl Timeline { self.cancel.cancel(); } - pub fn current_state(&self) -> TimelineState { + pub(crate) fn current_state(&self) -> TimelineState { self.state.borrow().clone() } - pub fn is_broken(&self) -> bool { + pub(crate) fn is_broken(&self) -> bool { matches!(&*self.state.borrow(), TimelineState::Broken { .. }) } - pub fn is_active(&self) -> bool { + pub(crate) fn is_active(&self) -> bool { self.current_state() == TimelineState::Active } - pub fn is_stopping(&self) -> bool { + pub(crate) fn is_stopping(&self) -> bool { self.current_state() == TimelineState::Stopping } - pub fn subscribe_for_state_updates(&self) -> watch::Receiver { + pub(crate) fn subscribe_for_state_updates(&self) -> watch::Receiver { self.state.subscribe() } - pub async fn wait_to_become_active( + pub(crate) async fn wait_to_become_active( &self, _ctx: &RequestContext, // Prepare for use by cancellation ) -> Result<(), TimelineState> { @@ -1251,7 +1254,7 @@ impl Timeline { } } - pub async fn layer_map_info(&self, reset: LayerAccessStatsReset) -> LayerMapInfo { + pub(crate) async fn layer_map_info(&self, reset: LayerAccessStatsReset) -> LayerMapInfo { let guard = self.layers.read().await; let layer_map = guard.layer_map(); let mut in_memory_layers = Vec::with_capacity(layer_map.frozen_layers.len() + 1); @@ -1275,7 +1278,10 @@ impl Timeline { } #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))] - pub async fn download_layer(&self, layer_file_name: &str) -> anyhow::Result> { + pub(crate) async fn download_layer( + &self, + layer_file_name: &str, + ) -> anyhow::Result> { let Some(layer) = self.find_layer(layer_file_name).await else { return Ok(None); }; @@ -1292,7 +1298,7 @@ impl Timeline { /// Evict just one layer. /// /// Returns `Ok(None)` in the case where the layer could not be found by its `layer_file_name`. 
- pub async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result> { + pub(crate) async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result> { let _gate = self .gate .enter() @@ -1315,7 +1321,7 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10; // Private functions impl Timeline { - pub fn get_lazy_slru_download(&self) -> bool { + pub(crate) fn get_lazy_slru_download(&self) -> bool { let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; tenant_conf .lazy_slru_download @@ -1852,6 +1858,12 @@ impl Timeline { priority: GetLogicalSizePriority, ctx: &RequestContext, ) -> logical_size::CurrentLogicalSize { + if !self.tenant_shard_id.is_zero() { + // Logical size is only accurately maintained on shard zero: when called elsewhere, for example + // when HTTP API is serving a GET for timeline zero, return zero + return logical_size::CurrentLogicalSize::Approximate(logical_size::Approximate::zero()); + } + let current_size = self.current_logical_size.current_size(); debug!("Current size: {current_size:?}"); @@ -2094,7 +2106,7 @@ impl Timeline { .expect("only this task sets it"); } - pub fn spawn_ondemand_logical_size_calculation( + pub(crate) fn spawn_ondemand_logical_size_calculation( self: &Arc, lsn: Lsn, cause: LogicalSizeCalculationCause, @@ -2140,6 +2152,9 @@ impl Timeline { ctx: &RequestContext, ) -> Result { span::debug_assert_current_span_has_tenant_and_timeline_id(); + // We should never be calculating logical sizes on shard !=0, because these shards do not have + // accurate relation sizes, and they do not emit consumption metrics. + debug_assert!(self.tenant_shard_id.is_zero()); let _guard = self.gate.enter(); @@ -2173,7 +2188,7 @@ impl Timeline { /// # Cancel-Safety /// /// This method is cancellation-safe. - pub async fn calculate_logical_size( + async fn calculate_logical_size( &self, up_to_lsn: Lsn, cause: LogicalSizeCalculationCause, @@ -3422,7 +3437,7 @@ enum DurationRecorder { } impl DurationRecorder { - pub fn till_now(&self) -> DurationRecorder { + fn till_now(&self) -> DurationRecorder { match self { DurationRecorder::NotStarted => { panic!("must only call on recorded measurements") @@ -3433,7 +3448,7 @@ impl DurationRecorder { } } } - pub fn into_recorded(self) -> Option { + fn into_recorded(self) -> Option { match self { DurationRecorder::NotStarted => None, DurationRecorder::Recorded(recorded, _) => Some(recorded), @@ -4633,7 +4648,9 @@ impl Timeline { } } - pub fn get_download_all_remote_layers_task_info(&self) -> Option { + pub(crate) fn get_download_all_remote_layers_task_info( + &self, + ) -> Option { self.download_all_remote_layers_task_info .read() .unwrap() @@ -4729,7 +4746,7 @@ fn layer_traversal_error(msg: String, path: Vec) -> PageRecon // TODO Currently, Deref is used to allow easy access to read methods from this trait. // This is probably considered a bad practice in Rust and should be fixed eventually, // but will cause large code changes. -pub struct TimelineWriter<'a> { +pub(crate) struct TimelineWriter<'a> { tl: &'a Timeline, _write_guard: tokio::sync::MutexGuard<'a, ()>, } @@ -4747,7 +4764,7 @@ impl<'a> TimelineWriter<'a> { /// /// This will implicitly extend the relation, if the page is beyond the /// current end-of-file. 
- pub async fn put( + pub(crate) async fn put( &self, key: Key, lsn: Lsn, diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 01a5bfc32b..9bdd52e809 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -319,6 +319,13 @@ impl Timeline { cancel: &CancellationToken, ctx: &RequestContext, ) -> ControlFlow<()> { + if !self.tenant_shard_id.is_zero() { + // Shards !=0 do not maintain accurate relation sizes, and do not need to calculate logical size + // for consumption metrics (consumption metrics are only sent from shard 0). We may therefore + // skip imitating logical size accesses for eviction purposes. + return ControlFlow::Continue(()); + } + let mut state = self.eviction_task_timeline_state.lock().await; // Only do the imitate_layer accesses approximately as often as the threshold. A little diff --git a/pageserver/src/tenant/timeline/logical_size.rs b/pageserver/src/tenant/timeline/logical_size.rs index 03bc59ea38..8f9ca0e29f 100644 --- a/pageserver/src/tenant/timeline/logical_size.rs +++ b/pageserver/src/tenant/timeline/logical_size.rs @@ -101,6 +101,14 @@ impl From<&Exact> for u64 { } } +impl Approximate { + /// For use in situations where we don't have a sane logical size value but need + /// to return something, e.g. in HTTP API on shard >0 of a sharded tenant. + pub(crate) fn zero() -> Self { + Self(0) + } +} + impl CurrentLogicalSize { pub(crate) fn size_dont_care_about_accuracy(&self) -> u64 { match self { diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index e398d683e5..73eb42bb30 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -426,13 +426,21 @@ pub(super) async fn handle_walreceiver_connection( // Send the replication feedback message. // Regular standby_status_update fields are put into this message. - let current_timeline_size = timeline - .get_current_logical_size( - crate::tenant::timeline::GetLogicalSizePriority::User, - &ctx, - ) - // FIXME: https://github.com/neondatabase/neon/issues/5963 - .size_dont_care_about_accuracy(); + let current_timeline_size = if timeline.tenant_shard_id.is_zero() { + timeline + .get_current_logical_size( + crate::tenant::timeline::GetLogicalSizePriority::User, + &ctx, + ) + // FIXME: https://github.com/neondatabase/neon/issues/5963 + .size_dont_care_about_accuracy() + } else { + // Non-zero shards send zero for logical size. The safekeeper will ignore + // this number. This is because in a sharded tenant, only shard zero maintains + // accurate logical size. + 0 + }; + let status_update = PageserverFeedback { current_timeline_size, last_received_lsn, From 6506fd14c45bf4fd685e8ba25cbd609502537155 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 2 Feb 2024 16:07:35 +0000 Subject: [PATCH 0063/1571] proxy: more refactors (#6526) ## Problem not really any problem, just some drive-by changes ## Summary of changes 1. move wake compute 2. move json processing 3. move handle_try_wake 4. move test backend to api provider 5. reduce wake-compute concerns 6. 
remove duplicate wake-compute loop --- proxy/src/auth/backend.rs | 113 +++---- proxy/src/auth/backend/classic.rs | 2 +- proxy/src/auth/flow.rs | 2 +- proxy/src/bin/proxy.rs | 26 +- proxy/src/console/provider.rs | 23 +- proxy/src/console/provider/neon.rs | 1 - proxy/src/proxy.rs | 1 + proxy/src/proxy/connect_compute.rs | 118 ++----- proxy/src/proxy/tests.rs | 16 +- proxy/src/proxy/wake_compute.rs | 95 ++++++ proxy/src/serverless.rs | 1 + proxy/src/serverless/json.rs | 448 ++++++++++++++++++++++++++ proxy/src/serverless/sql_over_http.rs | 447 +------------------------ 13 files changed, 649 insertions(+), 644 deletions(-) create mode 100644 proxy/src/proxy/wake_compute.rs create mode 100644 proxy/src/serverless/json.rs diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 144c9dcff5..236567163e 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -12,8 +12,7 @@ use crate::console::errors::GetAuthInfoError; use crate::console::provider::{CachedRoleSecret, ConsoleBackend}; use crate::console::AuthSecret; use crate::context::RequestMonitoring; -use crate::proxy::connect_compute::handle_try_wake; -use crate::proxy::retry::retry_after; +use crate::proxy::wake_compute::wake_compute; use crate::proxy::NeonOptions; use crate::stream::Stream; use crate::{ @@ -28,11 +27,26 @@ use crate::{ }; use crate::{scram, EndpointCacheKey, EndpointId, RoleName}; use futures::TryFutureExt; -use std::borrow::Cow; -use std::ops::ControlFlow; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::{error, info, warn}; +use tracing::info; + +/// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality +pub enum MaybeOwned<'a, T> { + Owned(T), + Borrowed(&'a T), +} + +impl std::ops::Deref for MaybeOwned<'_, T> { + type Target = T; + + fn deref(&self) -> &Self::Target { + match self { + MaybeOwned::Owned(t) => t, + MaybeOwned::Borrowed(t) => t, + } + } +} /// This type serves two purposes: /// @@ -44,12 +58,9 @@ use tracing::{error, info, warn}; /// backends which require them for the authentication process. pub enum BackendType<'a, T> { /// Cloud API (V2). - Console(Cow<'a, ConsoleBackend>, T), + Console(MaybeOwned<'a, ConsoleBackend>, T), /// Authentication via a web browser. - Link(Cow<'a, url::ApiUrl>), - #[cfg(test)] - /// Test backend. 
- Test(&'a dyn TestBackend), + Link(MaybeOwned<'a, url::ApiUrl>), } pub trait TestBackend: Send + Sync + 'static { @@ -67,14 +78,14 @@ impl std::fmt::Display for BackendType<'_, ()> { ConsoleBackend::Console(endpoint) => { fmt.debug_tuple("Console").field(&endpoint.url()).finish() } - #[cfg(feature = "testing")] + #[cfg(any(test, feature = "testing"))] ConsoleBackend::Postgres(endpoint) => { fmt.debug_tuple("Postgres").field(&endpoint.url()).finish() } + #[cfg(test)] + ConsoleBackend::Test(_) => fmt.debug_tuple("Test").finish(), }, Link(url) => fmt.debug_tuple("Link").field(&url.as_str()).finish(), - #[cfg(test)] - Test(_) => fmt.debug_tuple("Test").finish(), } } } @@ -85,10 +96,8 @@ impl BackendType<'_, T> { pub fn as_ref(&self) -> BackendType<'_, &T> { use BackendType::*; match self { - Console(c, x) => Console(Cow::Borrowed(c), x), - Link(c) => Link(Cow::Borrowed(c)), - #[cfg(test)] - Test(x) => Test(*x), + Console(c, x) => Console(MaybeOwned::Borrowed(c), x), + Link(c) => Link(MaybeOwned::Borrowed(c)), } } } @@ -102,8 +111,6 @@ impl<'a, T> BackendType<'a, T> { match self { Console(c, x) => Console(c, f(x)), Link(c) => Link(c), - #[cfg(test)] - Test(x) => Test(x), } } } @@ -116,8 +123,6 @@ impl<'a, T, E> BackendType<'a, Result> { match self { Console(c, x) => x.map(|x| Console(c, x)), Link(c) => Ok(Link(c)), - #[cfg(test)] - Test(x) => Ok(Test(x)), } } } @@ -147,7 +152,7 @@ impl ComputeUserInfo { } pub enum ComputeCredentialKeys { - #[cfg(feature = "testing")] + #[cfg(any(test, feature = "testing"))] Password(Vec), AuthKeys(AuthKeys), } @@ -277,42 +282,6 @@ async fn authenticate_with_secret( classic::authenticate(info, client, config, &mut ctx.latency_timer, secret).await } -/// wake a compute (or retrieve an existing compute session from cache) -async fn wake_compute( - ctx: &mut RequestMonitoring, - api: &impl console::Api, - compute_credentials: ComputeCredentials, -) -> auth::Result<(CachedNodeInfo, ComputeUserInfo)> { - let mut num_retries = 0; - let mut node = loop { - let wake_res = api.wake_compute(ctx, &compute_credentials.info).await; - match handle_try_wake(wake_res, num_retries) { - Err(e) => { - error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node"); - return Err(e.into()); - } - Ok(ControlFlow::Continue(e)) => { - warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node"); - } - Ok(ControlFlow::Break(n)) => break n, - } - - let wait_duration = retry_after(num_retries); - num_retries += 1; - tokio::time::sleep(wait_duration).await; - }; - - ctx.set_project(node.aux.clone()); - - match compute_credentials.keys { - #[cfg(feature = "testing")] - ComputeCredentialKeys::Password(password) => node.config.password(password), - ComputeCredentialKeys::AuthKeys(auth_keys) => node.config.auth_keys(auth_keys), - }; - - Ok((node, compute_credentials.info)) -} - impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> { /// Get compute endpoint name from the credentials. 
pub fn get_endpoint(&self) -> Option { @@ -321,8 +290,6 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> { match self { Console(_, user_info) => user_info.endpoint_id.clone(), Link(_) => Some("link".into()), - #[cfg(test)] - Test(_) => Some("test".into()), } } @@ -333,8 +300,6 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> { match self { Console(_, user_info) => &user_info.user, Link(_) => "link", - #[cfg(test)] - Test(_) => "test", } } @@ -359,8 +324,20 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> { let compute_credentials = auth_quirks(ctx, &*api, user_info, client, allow_cleartext, config).await?; - let (cache_info, user_info) = wake_compute(ctx, &*api, compute_credentials).await?; - (cache_info, BackendType::Console(api, user_info)) + + let mut num_retries = 0; + let mut node = + wake_compute(&mut num_retries, ctx, &api, &compute_credentials.info).await?; + + ctx.set_project(node.aux.clone()); + + match compute_credentials.keys { + #[cfg(any(test, feature = "testing"))] + ComputeCredentialKeys::Password(password) => node.config.password(password), + ComputeCredentialKeys::AuthKeys(auth_keys) => node.config.auth_keys(auth_keys), + }; + + (node, BackendType::Console(api, compute_credentials.info)) } // NOTE: this auth backend doesn't use client credentials. Link(url) => { @@ -373,10 +350,6 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> { BackendType::Link(url), ) } - #[cfg(test)] - Test(_) => { - unreachable!("this function should never be called in the test backend") - } }; info!("user successfully authenticated"); @@ -393,8 +366,6 @@ impl BackendType<'_, ComputeUserInfo> { match self { Console(api, user_info) => api.get_allowed_ips_and_secret(ctx, user_info).await, Link(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), - #[cfg(test)] - Test(x) => x.get_allowed_ips_and_secret(), } } @@ -409,8 +380,6 @@ impl BackendType<'_, ComputeUserInfo> { match self { Console(api, user_info) => api.wake_compute(ctx, user_info).map_ok(Some).await, Link(_) => Ok(None), - #[cfg(test)] - Test(x) => x.wake_compute().map(Some), } } } diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index 358b335b88..384063ceae 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -20,7 +20,7 @@ pub(super) async fn authenticate( ) -> auth::Result> { let flow = AuthFlow::new(client); let scram_keys = match secret { - #[cfg(feature = "testing")] + #[cfg(any(test, feature = "testing"))] AuthSecret::Md5(_) => { info!("auth endpoint chooses MD5"); return Err(auth::AuthError::bad_auth_method("MD5")); diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 3151a77263..077178d107 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -172,7 +172,7 @@ pub(super) fn validate_password_and_exchange( secret: AuthSecret, ) -> super::Result> { match secret { - #[cfg(feature = "testing")] + #[cfg(any(test, feature = "testing"))] AuthSecret::Md5(_) => { // test only Ok(sasl::Outcome::Success(ComputeCredentialKeys::Password( diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 3960b080be..3bbb87808d 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -1,5 +1,6 @@ use futures::future::Either; use proxy::auth; +use proxy::auth::backend::MaybeOwned; use proxy::config::AuthenticationConfig; use proxy::config::CacheOptions; use proxy::config::HttpConfig; @@ -17,9 +18,9 @@ use proxy::usage_metrics; use anyhow::bail; use proxy::config::{self, ProxyConfig}; use 
proxy::serverless; +use std::net::SocketAddr; use std::pin::pin; use std::sync::Arc; -use std::{borrow::Cow, net::SocketAddr}; use tokio::net::TcpListener; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; @@ -259,18 +260,13 @@ async fn main() -> anyhow::Result<()> { } if let auth::BackendType::Console(api, _) = &config.auth_backend { - match &**api { - proxy::console::provider::ConsoleBackend::Console(api) => { - let cache = api.caches.project_info.clone(); - if let Some(url) = args.redis_notifications { - info!("Starting redis notifications listener ({url})"); - maintenance_tasks - .spawn(notifications::task_main(url.to_owned(), cache.clone())); - } - maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); + if let proxy::console::provider::ConsoleBackend::Console(api) = &**api { + let cache = api.caches.project_info.clone(); + if let Some(url) = args.redis_notifications { + info!("Starting redis notifications listener ({url})"); + maintenance_tasks.spawn(notifications::task_main(url.to_owned(), cache.clone())); } - #[cfg(feature = "testing")] - proxy::console::provider::ConsoleBackend::Postgres(_) => {} + maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); } } @@ -369,18 +365,18 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { let api = console::provider::neon::Api::new(endpoint, caches, locks); let api = console::provider::ConsoleBackend::Console(api); - auth::BackendType::Console(Cow::Owned(api), ()) + auth::BackendType::Console(MaybeOwned::Owned(api), ()) } #[cfg(feature = "testing")] AuthBackend::Postgres => { let url = args.auth_endpoint.parse()?; let api = console::provider::mock::Api::new(url); let api = console::provider::ConsoleBackend::Postgres(api); - auth::BackendType::Console(Cow::Owned(api), ()) + auth::BackendType::Console(MaybeOwned::Owned(api), ()) } AuthBackend::Link => { let url = args.uri.parse()?; - auth::BackendType::Link(Cow::Owned(url)) + auth::BackendType::Link(MaybeOwned::Owned(url)) } }; let http_config = HttpConfig { diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index ff84db7738..c53d929470 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -1,4 +1,4 @@ -#[cfg(feature = "testing")] +#[cfg(any(test, feature = "testing"))] pub mod mock; pub mod neon; @@ -199,7 +199,7 @@ pub mod errors { /// Auth secret which is managed by the cloud. #[derive(Clone, Eq, PartialEq, Debug)] pub enum AuthSecret { - #[cfg(feature = "testing")] + #[cfg(any(test, feature = "testing"))] /// Md5 hash of user's password. Md5([u8; 16]), @@ -264,13 +264,16 @@ pub trait Api { ) -> Result; } -#[derive(Clone)] +#[non_exhaustive] pub enum ConsoleBackend { /// Current Cloud API (V2). Console(neon::Api), /// Local mock of Cloud API (V2). 
- #[cfg(feature = "testing")] + #[cfg(any(test, feature = "testing"))] Postgres(mock::Api), + /// Internal testing + #[cfg(test)] + Test(Box), } #[async_trait] @@ -283,8 +286,10 @@ impl Api for ConsoleBackend { use ConsoleBackend::*; match self { Console(api) => api.get_role_secret(ctx, user_info).await, - #[cfg(feature = "testing")] + #[cfg(any(test, feature = "testing"))] Postgres(api) => api.get_role_secret(ctx, user_info).await, + #[cfg(test)] + Test(_) => unreachable!("this function should never be called in the test backend"), } } @@ -296,8 +301,10 @@ impl Api for ConsoleBackend { use ConsoleBackend::*; match self { Console(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, - #[cfg(feature = "testing")] + #[cfg(any(test, feature = "testing"))] Postgres(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, + #[cfg(test)] + Test(api) => api.get_allowed_ips_and_secret(), } } @@ -310,8 +317,10 @@ impl Api for ConsoleBackend { match self { Console(api) => api.wake_compute(ctx, user_info).await, - #[cfg(feature = "testing")] + #[cfg(any(test, feature = "testing"))] Postgres(api) => api.wake_compute(ctx, user_info).await, + #[cfg(test)] + Test(api) => api.wake_compute(), } } } diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index f22c6d2322..0785419790 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -19,7 +19,6 @@ use tokio::time::Instant; use tokio_postgres::config::SslMode; use tracing::{error, info, info_span, warn, Instrument}; -#[derive(Clone)] pub struct Api { endpoint: http::Endpoint, pub caches: &'static ApiCaches, diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 4aa1f3590d..b68fb26e42 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -5,6 +5,7 @@ pub mod connect_compute; pub mod handshake; pub mod passthrough; pub mod retry; +pub mod wake_compute; use crate::{ auth, diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 8bbe88aa51..58c59dba36 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -1,15 +1,16 @@ use crate::{ auth, compute::{self, PostgresConnection}, - console::{self, errors::WakeComputeError, Api}, + console::{self, errors::WakeComputeError}, context::RequestMonitoring, - metrics::{bool_to_str, NUM_CONNECTION_FAILURES, NUM_WAKEUP_FAILURES}, - proxy::retry::{retry_after, ShouldRetry}, + metrics::NUM_CONNECTION_FAILURES, + proxy::{ + retry::{retry_after, ShouldRetry}, + wake_compute::wake_compute, + }, }; use async_trait::async_trait; -use hyper::StatusCode; use pq_proto::StartupMessageParams; -use std::ops::ControlFlow; use tokio::time; use tracing::{error, info, warn}; @@ -88,39 +89,6 @@ impl ConnectMechanism for TcpMechanism<'_> { } } -fn report_error(e: &WakeComputeError, retry: bool) { - use crate::console::errors::ApiError; - let retry = bool_to_str(retry); - let kind = match e { - WakeComputeError::BadComputeAddress(_) => "bad_compute_address", - WakeComputeError::ApiError(ApiError::Transport(_)) => "api_transport_error", - WakeComputeError::ApiError(ApiError::Console { - status: StatusCode::LOCKED, - ref text, - }) if text.contains("written data quota exceeded") - || text.contains("the limit for current plan reached") => - { - "quota_exceeded" - } - WakeComputeError::ApiError(ApiError::Console { - status: StatusCode::LOCKED, - .. - }) => "api_console_locked", - WakeComputeError::ApiError(ApiError::Console { - status: StatusCode::BAD_REQUEST, - .. 
- }) => "api_console_bad_request", - WakeComputeError::ApiError(ApiError::Console { status, .. }) - if status.is_server_error() => - { - "api_console_other_server_error" - } - WakeComputeError::ApiError(ApiError::Console { .. }) => "api_console_other_error", - WakeComputeError::TimeoutError => "timeout_error", - }; - NUM_WAKEUP_FAILURES.with_label_values(&[retry, kind]).inc(); -} - /// Try to connect to the compute node, retrying if necessary. /// This function might update `node_info`, so we take it by `&mut`. #[tracing::instrument(skip_all)] @@ -137,7 +105,7 @@ where mechanism.update_connect_config(&mut node_info.config); // try once - let (config, err) = match mechanism + let err = match mechanism .connect_once(ctx, &node_info, CONNECT_TIMEOUT) .await { @@ -145,51 +113,27 @@ where ctx.latency_timer.success(); return Ok(res); } - Err(e) => { - error!(error = ?e, "could not connect to compute node"); - (invalidate_cache(node_info), e) - } + Err(e) => e, }; - ctx.latency_timer.cache_miss(); + error!(error = ?err, "could not connect to compute node"); let mut num_retries = 1; - // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node - info!("compute node's state has likely changed; requesting a wake-up"); - let node_info = loop { - let wake_res = match user_info { - auth::BackendType::Console(api, user_info) => api.wake_compute(ctx, user_info).await, - // nothing to do? - auth::BackendType::Link(_) => return Err(err.into()), - // test backend - #[cfg(test)] - auth::BackendType::Test(x) => x.wake_compute(), - }; + match user_info { + auth::BackendType::Console(api, info) => { + // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node + info!("compute node's state has likely changed; requesting a wake-up"); - match handle_try_wake(wake_res, num_retries) { - Err(e) => { - error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node"); - report_error(&e, false); - return Err(e.into()); - } - // failed to wake up but we can continue to retry - Ok(ControlFlow::Continue(e)) => { - report_error(&e, true); - warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node"); - } - // successfully woke up a compute node and can break the wakeup loop - Ok(ControlFlow::Break(mut node_info)) => { - node_info.config.reuse_password(&config); - mechanism.update_connect_config(&mut node_info.config); - break node_info; - } + ctx.latency_timer.cache_miss(); + let config = invalidate_cache(node_info); + node_info = wake_compute(&mut num_retries, ctx, api, info).await?; + + node_info.config.reuse_password(&config); + mechanism.update_connect_config(&mut node_info.config); } - - let wait_duration = retry_after(num_retries); - num_retries += 1; - - time::sleep(wait_duration).await; + // nothing to do? + auth::BackendType::Link(_) => {} }; // now that we have a new node, try connect to it repeatedly. @@ -221,23 +165,3 @@ where time::sleep(wait_duration).await; } } - -/// Attempts to wake up the compute node. -/// * Returns Ok(Continue(e)) if there was an error waking but retries are acceptable -/// * Returns Ok(Break(node)) if the wakeup succeeded -/// * Returns Err(e) if there was an error -pub fn handle_try_wake( - result: Result, - num_retries: u32, -) -> Result, WakeComputeError> { - match result { - Err(err) => match &err { - WakeComputeError::ApiError(api) if api.should_retry(num_retries) => { - Ok(ControlFlow::Continue(err)) - } - _ => Err(err), - }, - // Ready to try again. 
- Ok(new) => Ok(ControlFlow::Break(new)), - } -} diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 1f57d343c4..2000774224 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -5,9 +5,9 @@ mod mitm; use super::connect_compute::ConnectMechanism; use super::retry::ShouldRetry; use super::*; -use crate::auth::backend::{ComputeUserInfo, TestBackend}; +use crate::auth::backend::{ComputeUserInfo, MaybeOwned, TestBackend}; use crate::config::CertResolver; -use crate::console::provider::{CachedAllowedIps, CachedRoleSecret}; +use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend}; use crate::console::{self, CachedNodeInfo, NodeInfo}; use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT}; use crate::{auth, http, sasl, scram}; @@ -371,6 +371,7 @@ enum ConnectAction { Fail, } +#[derive(Clone)] struct TestConnectMechanism { counter: Arc>, sequence: Vec, @@ -490,9 +491,16 @@ fn helper_create_cached_node_info() -> CachedNodeInfo { fn helper_create_connect_info( mechanism: &TestConnectMechanism, -) -> (CachedNodeInfo, auth::BackendType<'_, ComputeUserInfo>) { +) -> (CachedNodeInfo, auth::BackendType<'static, ComputeUserInfo>) { let cache = helper_create_cached_node_info(); - let user_info = auth::BackendType::Test(mechanism); + let user_info = auth::BackendType::Console( + MaybeOwned::Owned(ConsoleBackend::Test(Box::new(mechanism.clone()))), + ComputeUserInfo { + endpoint: "endpoint".into(), + user: "user".into(), + options: NeonOptions::parse_options_raw(""), + }, + ); (cache, user_info) } diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs new file mode 100644 index 0000000000..925727bdab --- /dev/null +++ b/proxy/src/proxy/wake_compute.rs @@ -0,0 +1,95 @@ +use crate::auth::backend::ComputeUserInfo; +use crate::console::{ + errors::WakeComputeError, + provider::{CachedNodeInfo, ConsoleBackend}, + Api, +}; +use crate::context::RequestMonitoring; +use crate::metrics::{bool_to_str, NUM_WAKEUP_FAILURES}; +use crate::proxy::retry::retry_after; +use hyper::StatusCode; +use std::ops::ControlFlow; +use tracing::{error, warn}; + +use super::retry::ShouldRetry; + +/// wake a compute (or retrieve an existing compute session from cache) +pub async fn wake_compute( + num_retries: &mut u32, + ctx: &mut RequestMonitoring, + api: &ConsoleBackend, + info: &ComputeUserInfo, +) -> Result { + loop { + let wake_res = api.wake_compute(ctx, info).await; + match handle_try_wake(wake_res, *num_retries) { + Err(e) => { + error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node"); + report_error(&e, false); + return Err(e); + } + Ok(ControlFlow::Continue(e)) => { + warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node"); + report_error(&e, true); + } + Ok(ControlFlow::Break(n)) => return Ok(n), + } + + let wait_duration = retry_after(*num_retries); + *num_retries += 1; + tokio::time::sleep(wait_duration).await; + } +} + +/// Attempts to wake up the compute node. +/// * Returns Ok(Continue(e)) if there was an error waking but retries are acceptable +/// * Returns Ok(Break(node)) if the wakeup succeeded +/// * Returns Err(e) if there was an error +pub fn handle_try_wake( + result: Result, + num_retries: u32, +) -> Result, WakeComputeError> { + match result { + Err(err) => match &err { + WakeComputeError::ApiError(api) if api.should_retry(num_retries) => { + Ok(ControlFlow::Continue(err)) + } + _ => Err(err), + }, + // Ready to try again. 
+ Ok(new) => Ok(ControlFlow::Break(new)), + } +} + +fn report_error(e: &WakeComputeError, retry: bool) { + use crate::console::errors::ApiError; + let retry = bool_to_str(retry); + let kind = match e { + WakeComputeError::BadComputeAddress(_) => "bad_compute_address", + WakeComputeError::ApiError(ApiError::Transport(_)) => "api_transport_error", + WakeComputeError::ApiError(ApiError::Console { + status: StatusCode::LOCKED, + ref text, + }) if text.contains("written data quota exceeded") + || text.contains("the limit for current plan reached") => + { + "quota_exceeded" + } + WakeComputeError::ApiError(ApiError::Console { + status: StatusCode::LOCKED, + .. + }) => "api_console_locked", + WakeComputeError::ApiError(ApiError::Console { + status: StatusCode::BAD_REQUEST, + .. + }) => "api_console_bad_request", + WakeComputeError::ApiError(ApiError::Console { status, .. }) + if status.is_server_error() => + { + "api_console_other_server_error" + } + WakeComputeError::ApiError(ApiError::Console { .. }) => "api_console_other_error", + WakeComputeError::TimeoutError => "timeout_error", + }; + NUM_WAKEUP_FAILURES.with_label_values(&[retry, kind]).inc(); +} diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index a2eb7e62cc..7ff93b23b8 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -3,6 +3,7 @@ //! Handles both SQL over HTTP and SQL over Websockets. mod conn_pool; +mod json; mod sql_over_http; mod websocket; diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs new file mode 100644 index 0000000000..05835b23ce --- /dev/null +++ b/proxy/src/serverless/json.rs @@ -0,0 +1,448 @@ +use serde_json::Map; +use serde_json::Value; +use tokio_postgres::types::Kind; +use tokio_postgres::types::Type; +use tokio_postgres::Row; + +// +// Convert json non-string types to strings, so that they can be passed to Postgres +// as parameters. +// +pub fn json_to_pg_text(json: Vec) -> Vec> { + json.iter() + .map(|value| { + match value { + // special care for nulls + Value::Null => None, + + // convert to text with escaping + v @ (Value::Bool(_) | Value::Number(_) | Value::Object(_)) => Some(v.to_string()), + + // avoid escaping here, as we pass this as a parameter + Value::String(s) => Some(s.to_string()), + + // special care for arrays + Value::Array(_) => json_array_to_pg_array(value), + } + }) + .collect() +} + +// +// Serialize a JSON array to a Postgres array. Contrary to the strings in the params +// in the array we need to escape the strings. Postgres is okay with arrays of form +// '{1,"2",3}'::int[], so we don't check that array holds values of the same type, leaving +// it for Postgres to check. 
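+// For instance, the JSON value [true, null, "a", [1, 2]] is rendered as the array
+// literal {true,NULL,"a",{1,2}} (see the unit tests at the bottom of this file).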
+// +// Example of the same escaping in node-postgres: packages/pg/lib/utils.js +// +fn json_array_to_pg_array(value: &Value) -> Option { + match value { + // special care for nulls + Value::Null => None, + + // convert to text with escaping + // here string needs to be escaped, as it is part of the array + v @ (Value::Bool(_) | Value::Number(_) | Value::String(_)) => Some(v.to_string()), + v @ Value::Object(_) => json_array_to_pg_array(&Value::String(v.to_string())), + + // recurse into array + Value::Array(arr) => { + let vals = arr + .iter() + .map(json_array_to_pg_array) + .map(|v| v.unwrap_or_else(|| "NULL".to_string())) + .collect::>() + .join(","); + + Some(format!("{{{}}}", vals)) + } + } +} + +// +// Convert postgres row with text-encoded values to JSON object +// +pub fn pg_text_row_to_json( + row: &Row, + columns: &[Type], + raw_output: bool, + array_mode: bool, +) -> Result { + let iter = row + .columns() + .iter() + .zip(columns) + .enumerate() + .map(|(i, (column, typ))| { + let name = column.name(); + let pg_value = row.as_text(i)?; + let json_value = if raw_output { + match pg_value { + Some(v) => Value::String(v.to_string()), + None => Value::Null, + } + } else { + pg_text_to_json(pg_value, typ)? + }; + Ok((name.to_string(), json_value)) + }); + + if array_mode { + // drop keys and aggregate into array + let arr = iter + .map(|r| r.map(|(_key, val)| val)) + .collect::, anyhow::Error>>()?; + Ok(Value::Array(arr)) + } else { + let obj = iter.collect::, anyhow::Error>>()?; + Ok(Value::Object(obj)) + } +} + +// +// Convert postgres text-encoded value to JSON value +// +fn pg_text_to_json(pg_value: Option<&str>, pg_type: &Type) -> Result { + if let Some(val) = pg_value { + if let Kind::Array(elem_type) = pg_type.kind() { + return pg_array_parse(val, elem_type); + } + + match *pg_type { + Type::BOOL => Ok(Value::Bool(val == "t")), + Type::INT2 | Type::INT4 => { + let val = val.parse::()?; + Ok(Value::Number(serde_json::Number::from(val))) + } + Type::FLOAT4 | Type::FLOAT8 => { + let fval = val.parse::()?; + let num = serde_json::Number::from_f64(fval); + if let Some(num) = num { + Ok(Value::Number(num)) + } else { + // Pass Nan, Inf, -Inf as strings + // JS JSON.stringify() does converts them to null, but we + // want to preserve them, so we pass them as strings + Ok(Value::String(val.to_string())) + } + } + Type::JSON | Type::JSONB => Ok(serde_json::from_str(val)?), + _ => Ok(Value::String(val.to_string())), + } + } else { + Ok(Value::Null) + } +} + +// +// Parse postgres array into JSON array. +// +// This is a bit involved because we need to handle nested arrays and quoted +// values. Unlike postgres we don't check that all nested arrays have the same +// dimensions, we just return them as is. 
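+// For instance, '{1,2,3}' with an INT4 element type parses to [1, 2, 3], and
+// '{{t,f},{f,t}}' with a BOOL element type parses to [[true, false], [false, true]];
+// both cases are exercised by the unit tests at the bottom of this file.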
+// +fn pg_array_parse(pg_array: &str, elem_type: &Type) -> Result { + _pg_array_parse(pg_array, elem_type, false).map(|(v, _)| v) +} + +fn _pg_array_parse( + pg_array: &str, + elem_type: &Type, + nested: bool, +) -> Result<(Value, usize), anyhow::Error> { + let mut pg_array_chr = pg_array.char_indices(); + let mut level = 0; + let mut quote = false; + let mut entries: Vec = Vec::new(); + let mut entry = String::new(); + + // skip bounds decoration + if let Some('[') = pg_array.chars().next() { + for (_, c) in pg_array_chr.by_ref() { + if c == '=' { + break; + } + } + } + + fn push_checked( + entry: &mut String, + entries: &mut Vec, + elem_type: &Type, + ) -> Result<(), anyhow::Error> { + if !entry.is_empty() { + // While in usual postgres response we get nulls as None and everything else + // as Some(&str), in arrays we get NULL as unquoted 'NULL' string (while + // string with value 'NULL' will be represented by '"NULL"'). So catch NULLs + // here while we have quotation info and convert them to None. + if entry == "NULL" { + entries.push(pg_text_to_json(None, elem_type)?); + } else { + entries.push(pg_text_to_json(Some(entry), elem_type)?); + } + entry.clear(); + } + + Ok(()) + } + + while let Some((mut i, mut c)) = pg_array_chr.next() { + let mut escaped = false; + + if c == '\\' { + escaped = true; + (i, c) = pg_array_chr.next().unwrap(); + } + + match c { + '{' if !quote => { + level += 1; + if level > 1 { + let (res, off) = _pg_array_parse(&pg_array[i..], elem_type, true)?; + entries.push(res); + for _ in 0..off - 1 { + pg_array_chr.next(); + } + } + } + '}' if !quote => { + level -= 1; + if level == 0 { + push_checked(&mut entry, &mut entries, elem_type)?; + if nested { + return Ok((Value::Array(entries), i)); + } + } + } + '"' if !escaped => { + if quote { + // end of quoted string, so push it manually without any checks + // for emptiness or nulls + entries.push(pg_text_to_json(Some(&entry), elem_type)?); + entry.clear(); + } + quote = !quote; + } + ',' if !quote => { + push_checked(&mut entry, &mut entries, elem_type)?; + } + _ => { + entry.push(c); + } + } + } + + if level != 0 { + return Err(anyhow::anyhow!("unbalanced array")); + } + + Ok((Value::Array(entries), 0)) +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + #[test] + fn test_atomic_types_to_pg_params() { + let json = vec![Value::Bool(true), Value::Bool(false)]; + let pg_params = json_to_pg_text(json); + assert_eq!( + pg_params, + vec![Some("true".to_owned()), Some("false".to_owned())] + ); + + let json = vec![Value::Number(serde_json::Number::from(42))]; + let pg_params = json_to_pg_text(json); + assert_eq!(pg_params, vec![Some("42".to_owned())]); + + let json = vec![Value::String("foo\"".to_string())]; + let pg_params = json_to_pg_text(json); + assert_eq!(pg_params, vec![Some("foo\"".to_owned())]); + + let json = vec![Value::Null]; + let pg_params = json_to_pg_text(json); + assert_eq!(pg_params, vec![None]); + } + + #[test] + fn test_json_array_to_pg_array() { + // atoms and escaping + let json = "[true, false, null, \"NULL\", 42, \"foo\", \"bar\\\"-\\\\\"]"; + let json: Value = serde_json::from_str(json).unwrap(); + let pg_params = json_to_pg_text(vec![json]); + assert_eq!( + pg_params, + vec![Some( + "{true,false,NULL,\"NULL\",42,\"foo\",\"bar\\\"-\\\\\"}".to_owned() + )] + ); + + // nested arrays + let json = "[[true, false], [null, 42], [\"foo\", \"bar\\\"-\\\\\"]]"; + let json: Value = serde_json::from_str(json).unwrap(); + let pg_params = json_to_pg_text(vec![json]); + assert_eq!( + 
pg_params, + vec![Some( + "{{true,false},{NULL,42},{\"foo\",\"bar\\\"-\\\\\"}}".to_owned() + )] + ); + // array of objects + let json = r#"[{"foo": 1},{"bar": 2}]"#; + let json: Value = serde_json::from_str(json).unwrap(); + let pg_params = json_to_pg_text(vec![json]); + assert_eq!( + pg_params, + vec![Some(r#"{"{\"foo\":1}","{\"bar\":2}"}"#.to_owned())] + ); + } + + #[test] + fn test_atomic_types_parse() { + assert_eq!( + pg_text_to_json(Some("foo"), &Type::TEXT).unwrap(), + json!("foo") + ); + assert_eq!(pg_text_to_json(None, &Type::TEXT).unwrap(), json!(null)); + assert_eq!(pg_text_to_json(Some("42"), &Type::INT4).unwrap(), json!(42)); + assert_eq!(pg_text_to_json(Some("42"), &Type::INT2).unwrap(), json!(42)); + assert_eq!( + pg_text_to_json(Some("42"), &Type::INT8).unwrap(), + json!("42") + ); + assert_eq!( + pg_text_to_json(Some("42.42"), &Type::FLOAT8).unwrap(), + json!(42.42) + ); + assert_eq!( + pg_text_to_json(Some("42.42"), &Type::FLOAT4).unwrap(), + json!(42.42) + ); + assert_eq!( + pg_text_to_json(Some("NaN"), &Type::FLOAT4).unwrap(), + json!("NaN") + ); + assert_eq!( + pg_text_to_json(Some("Infinity"), &Type::FLOAT4).unwrap(), + json!("Infinity") + ); + assert_eq!( + pg_text_to_json(Some("-Infinity"), &Type::FLOAT4).unwrap(), + json!("-Infinity") + ); + + let json: Value = + serde_json::from_str("{\"s\":\"str\",\"n\":42,\"f\":4.2,\"a\":[null,3,\"a\"]}") + .unwrap(); + assert_eq!( + pg_text_to_json( + Some(r#"{"s":"str","n":42,"f":4.2,"a":[null,3,"a"]}"#), + &Type::JSONB + ) + .unwrap(), + json + ); + } + + #[test] + fn test_pg_array_parse_text() { + fn pt(pg_arr: &str) -> Value { + pg_array_parse(pg_arr, &Type::TEXT).unwrap() + } + assert_eq!( + pt(r#"{"aa\"\\\,a",cha,"bbbb"}"#), + json!(["aa\"\\,a", "cha", "bbbb"]) + ); + assert_eq!( + pt(r#"{{"foo","bar"},{"bee","bop"}}"#), + json!([["foo", "bar"], ["bee", "bop"]]) + ); + assert_eq!( + pt(r#"{{{{"foo",NULL,"bop",bup}}}}"#), + json!([[[["foo", null, "bop", "bup"]]]]) + ); + assert_eq!( + pt(r#"{{"1",2,3},{4,NULL,6},{NULL,NULL,NULL}}"#), + json!([["1", "2", "3"], ["4", null, "6"], [null, null, null]]) + ); + } + + #[test] + fn test_pg_array_parse_bool() { + fn pb(pg_arr: &str) -> Value { + pg_array_parse(pg_arr, &Type::BOOL).unwrap() + } + assert_eq!(pb(r#"{t,f,t}"#), json!([true, false, true])); + assert_eq!(pb(r#"{{t,f,t}}"#), json!([[true, false, true]])); + assert_eq!( + pb(r#"{{t,f},{f,t}}"#), + json!([[true, false], [false, true]]) + ); + assert_eq!( + pb(r#"{{t,NULL},{NULL,f}}"#), + json!([[true, null], [null, false]]) + ); + } + + #[test] + fn test_pg_array_parse_numbers() { + fn pn(pg_arr: &str, ty: &Type) -> Value { + pg_array_parse(pg_arr, ty).unwrap() + } + assert_eq!(pn(r#"{1,2,3}"#, &Type::INT4), json!([1, 2, 3])); + assert_eq!(pn(r#"{1,2,3}"#, &Type::INT2), json!([1, 2, 3])); + assert_eq!(pn(r#"{1,2,3}"#, &Type::INT8), json!(["1", "2", "3"])); + assert_eq!(pn(r#"{1,2,3}"#, &Type::FLOAT4), json!([1.0, 2.0, 3.0])); + assert_eq!(pn(r#"{1,2,3}"#, &Type::FLOAT8), json!([1.0, 2.0, 3.0])); + assert_eq!( + pn(r#"{1.1,2.2,3.3}"#, &Type::FLOAT4), + json!([1.1, 2.2, 3.3]) + ); + assert_eq!( + pn(r#"{1.1,2.2,3.3}"#, &Type::FLOAT8), + json!([1.1, 2.2, 3.3]) + ); + assert_eq!( + pn(r#"{NaN,Infinity,-Infinity}"#, &Type::FLOAT4), + json!(["NaN", "Infinity", "-Infinity"]) + ); + assert_eq!( + pn(r#"{NaN,Infinity,-Infinity}"#, &Type::FLOAT8), + json!(["NaN", "Infinity", "-Infinity"]) + ); + } + + #[test] + fn test_pg_array_with_decoration() { + fn p(pg_arr: &str) -> Value { + pg_array_parse(pg_arr, &Type::INT2).unwrap() + } + 
assert_eq!( + p(r#"[1:1][-2:-1][3:5]={{{1,2,3},{4,5,6}}}"#), + json!([[[1, 2, 3], [4, 5, 6]]]) + ); + } + + #[test] + fn test_pg_array_parse_json() { + fn pt(pg_arr: &str) -> Value { + pg_array_parse(pg_arr, &Type::JSONB).unwrap() + } + assert_eq!(pt(r#"{"{}"}"#), json!([{}])); + assert_eq!( + pt(r#"{"{\"foo\": 1, \"bar\": 2}"}"#), + json!([{"foo": 1, "bar": 2}]) + ); + assert_eq!( + pt(r#"{"{\"foo\": 1}", "{\"bar\": 2}"}"#), + json!([{"foo": 1}, {"bar": 2}]) + ); + assert_eq!( + pt(r#"{{"{\"foo\": 1}", "{\"bar\": 2}"}}"#), + json!([[{"foo": 1}, {"bar": 2}]]) + ); + } +} diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 27c2134221..96bf39c915 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -12,16 +12,12 @@ use hyper::Response; use hyper::StatusCode; use hyper::{Body, HeaderMap, Request}; use serde_json::json; -use serde_json::Map; use serde_json::Value; use tokio_postgres::error::DbError; use tokio_postgres::error::ErrorPosition; -use tokio_postgres::types::Kind; -use tokio_postgres::types::Type; use tokio_postgres::GenericClient; use tokio_postgres::IsolationLevel; use tokio_postgres::ReadyForQueryStatus; -use tokio_postgres::Row; use tokio_postgres::Transaction; use tracing::error; use tracing::instrument; @@ -40,6 +36,7 @@ use crate::RoleName; use super::conn_pool::ConnInfo; use super::conn_pool::GlobalConnPool; +use super::json::{json_to_pg_text, pg_text_row_to_json}; use super::SERVERLESS_DRIVER_SNI; #[derive(serde::Deserialize)] @@ -72,62 +69,6 @@ static TXN_DEFERRABLE: HeaderName = HeaderName::from_static("neon-batch-deferrab static HEADER_VALUE_TRUE: HeaderValue = HeaderValue::from_static("true"); -// -// Convert json non-string types to strings, so that they can be passed to Postgres -// as parameters. -// -fn json_to_pg_text(json: Vec) -> Vec> { - json.iter() - .map(|value| { - match value { - // special care for nulls - Value::Null => None, - - // convert to text with escaping - v @ (Value::Bool(_) | Value::Number(_) | Value::Object(_)) => Some(v.to_string()), - - // avoid escaping here, as we pass this as a parameter - Value::String(s) => Some(s.to_string()), - - // special care for arrays - Value::Array(_) => json_array_to_pg_array(value), - } - }) - .collect() -} - -// -// Serialize a JSON array to a Postgres array. Contrary to the strings in the params -// in the array we need to escape the strings. Postgres is okay with arrays of form -// '{1,"2",3}'::int[], so we don't check that array holds values of the same type, leaving -// it for Postgres to check. 
-// -// Example of the same escaping in node-postgres: packages/pg/lib/utils.js -// -fn json_array_to_pg_array(value: &Value) -> Option { - match value { - // special care for nulls - Value::Null => None, - - // convert to text with escaping - // here string needs to be escaped, as it is part of the array - v @ (Value::Bool(_) | Value::Number(_) | Value::String(_)) => Some(v.to_string()), - v @ Value::Object(_) => json_array_to_pg_array(&Value::String(v.to_string())), - - // recurse into array - Value::Array(arr) => { - let vals = arr - .iter() - .map(json_array_to_pg_array) - .map(|v| v.unwrap_or_else(|| "NULL".to_string())) - .collect::>() - .join(","); - - Some(format!("{{{}}}", vals)) - } - } -} - fn get_conn_info( ctx: &mut RequestMonitoring, headers: &HeaderMap, @@ -611,389 +552,3 @@ async fn query_to_json( }), )) } - -// -// Convert postgres row with text-encoded values to JSON object -// -pub fn pg_text_row_to_json( - row: &Row, - columns: &[Type], - raw_output: bool, - array_mode: bool, -) -> Result { - let iter = row - .columns() - .iter() - .zip(columns) - .enumerate() - .map(|(i, (column, typ))| { - let name = column.name(); - let pg_value = row.as_text(i)?; - let json_value = if raw_output { - match pg_value { - Some(v) => Value::String(v.to_string()), - None => Value::Null, - } - } else { - pg_text_to_json(pg_value, typ)? - }; - Ok((name.to_string(), json_value)) - }); - - if array_mode { - // drop keys and aggregate into array - let arr = iter - .map(|r| r.map(|(_key, val)| val)) - .collect::, anyhow::Error>>()?; - Ok(Value::Array(arr)) - } else { - let obj = iter.collect::, anyhow::Error>>()?; - Ok(Value::Object(obj)) - } -} - -// -// Convert postgres text-encoded value to JSON value -// -pub fn pg_text_to_json(pg_value: Option<&str>, pg_type: &Type) -> Result { - if let Some(val) = pg_value { - if let Kind::Array(elem_type) = pg_type.kind() { - return pg_array_parse(val, elem_type); - } - - match *pg_type { - Type::BOOL => Ok(Value::Bool(val == "t")), - Type::INT2 | Type::INT4 => { - let val = val.parse::()?; - Ok(Value::Number(serde_json::Number::from(val))) - } - Type::FLOAT4 | Type::FLOAT8 => { - let fval = val.parse::()?; - let num = serde_json::Number::from_f64(fval); - if let Some(num) = num { - Ok(Value::Number(num)) - } else { - // Pass Nan, Inf, -Inf as strings - // JS JSON.stringify() does converts them to null, but we - // want to preserve them, so we pass them as strings - Ok(Value::String(val.to_string())) - } - } - Type::JSON | Type::JSONB => Ok(serde_json::from_str(val)?), - _ => Ok(Value::String(val.to_string())), - } - } else { - Ok(Value::Null) - } -} - -// -// Parse postgres array into JSON array. -// -// This is a bit involved because we need to handle nested arrays and quoted -// values. Unlike postgres we don't check that all nested arrays have the same -// dimensions, we just return them as is. 
-// -fn pg_array_parse(pg_array: &str, elem_type: &Type) -> Result { - _pg_array_parse(pg_array, elem_type, false).map(|(v, _)| v) -} - -fn _pg_array_parse( - pg_array: &str, - elem_type: &Type, - nested: bool, -) -> Result<(Value, usize), anyhow::Error> { - let mut pg_array_chr = pg_array.char_indices(); - let mut level = 0; - let mut quote = false; - let mut entries: Vec = Vec::new(); - let mut entry = String::new(); - - // skip bounds decoration - if let Some('[') = pg_array.chars().next() { - for (_, c) in pg_array_chr.by_ref() { - if c == '=' { - break; - } - } - } - - fn push_checked( - entry: &mut String, - entries: &mut Vec, - elem_type: &Type, - ) -> Result<(), anyhow::Error> { - if !entry.is_empty() { - // While in usual postgres response we get nulls as None and everything else - // as Some(&str), in arrays we get NULL as unquoted 'NULL' string (while - // string with value 'NULL' will be represented by '"NULL"'). So catch NULLs - // here while we have quotation info and convert them to None. - if entry == "NULL" { - entries.push(pg_text_to_json(None, elem_type)?); - } else { - entries.push(pg_text_to_json(Some(entry), elem_type)?); - } - entry.clear(); - } - - Ok(()) - } - - while let Some((mut i, mut c)) = pg_array_chr.next() { - let mut escaped = false; - - if c == '\\' { - escaped = true; - (i, c) = pg_array_chr.next().unwrap(); - } - - match c { - '{' if !quote => { - level += 1; - if level > 1 { - let (res, off) = _pg_array_parse(&pg_array[i..], elem_type, true)?; - entries.push(res); - for _ in 0..off - 1 { - pg_array_chr.next(); - } - } - } - '}' if !quote => { - level -= 1; - if level == 0 { - push_checked(&mut entry, &mut entries, elem_type)?; - if nested { - return Ok((Value::Array(entries), i)); - } - } - } - '"' if !escaped => { - if quote { - // end of quoted string, so push it manually without any checks - // for emptiness or nulls - entries.push(pg_text_to_json(Some(&entry), elem_type)?); - entry.clear(); - } - quote = !quote; - } - ',' if !quote => { - push_checked(&mut entry, &mut entries, elem_type)?; - } - _ => { - entry.push(c); - } - } - } - - if level != 0 { - return Err(anyhow::anyhow!("unbalanced array")); - } - - Ok((Value::Array(entries), 0)) -} - -#[cfg(test)] -mod tests { - use super::*; - use serde_json::json; - - #[test] - fn test_atomic_types_to_pg_params() { - let json = vec![Value::Bool(true), Value::Bool(false)]; - let pg_params = json_to_pg_text(json); - assert_eq!( - pg_params, - vec![Some("true".to_owned()), Some("false".to_owned())] - ); - - let json = vec![Value::Number(serde_json::Number::from(42))]; - let pg_params = json_to_pg_text(json); - assert_eq!(pg_params, vec![Some("42".to_owned())]); - - let json = vec![Value::String("foo\"".to_string())]; - let pg_params = json_to_pg_text(json); - assert_eq!(pg_params, vec![Some("foo\"".to_owned())]); - - let json = vec![Value::Null]; - let pg_params = json_to_pg_text(json); - assert_eq!(pg_params, vec![None]); - } - - #[test] - fn test_json_array_to_pg_array() { - // atoms and escaping - let json = "[true, false, null, \"NULL\", 42, \"foo\", \"bar\\\"-\\\\\"]"; - let json: Value = serde_json::from_str(json).unwrap(); - let pg_params = json_to_pg_text(vec![json]); - assert_eq!( - pg_params, - vec![Some( - "{true,false,NULL,\"NULL\",42,\"foo\",\"bar\\\"-\\\\\"}".to_owned() - )] - ); - - // nested arrays - let json = "[[true, false], [null, 42], [\"foo\", \"bar\\\"-\\\\\"]]"; - let json: Value = serde_json::from_str(json).unwrap(); - let pg_params = json_to_pg_text(vec![json]); - assert_eq!( - 
pg_params, - vec![Some( - "{{true,false},{NULL,42},{\"foo\",\"bar\\\"-\\\\\"}}".to_owned() - )] - ); - // array of objects - let json = r#"[{"foo": 1},{"bar": 2}]"#; - let json: Value = serde_json::from_str(json).unwrap(); - let pg_params = json_to_pg_text(vec![json]); - assert_eq!( - pg_params, - vec![Some(r#"{"{\"foo\":1}","{\"bar\":2}"}"#.to_owned())] - ); - } - - #[test] - fn test_atomic_types_parse() { - assert_eq!( - pg_text_to_json(Some("foo"), &Type::TEXT).unwrap(), - json!("foo") - ); - assert_eq!(pg_text_to_json(None, &Type::TEXT).unwrap(), json!(null)); - assert_eq!(pg_text_to_json(Some("42"), &Type::INT4).unwrap(), json!(42)); - assert_eq!(pg_text_to_json(Some("42"), &Type::INT2).unwrap(), json!(42)); - assert_eq!( - pg_text_to_json(Some("42"), &Type::INT8).unwrap(), - json!("42") - ); - assert_eq!( - pg_text_to_json(Some("42.42"), &Type::FLOAT8).unwrap(), - json!(42.42) - ); - assert_eq!( - pg_text_to_json(Some("42.42"), &Type::FLOAT4).unwrap(), - json!(42.42) - ); - assert_eq!( - pg_text_to_json(Some("NaN"), &Type::FLOAT4).unwrap(), - json!("NaN") - ); - assert_eq!( - pg_text_to_json(Some("Infinity"), &Type::FLOAT4).unwrap(), - json!("Infinity") - ); - assert_eq!( - pg_text_to_json(Some("-Infinity"), &Type::FLOAT4).unwrap(), - json!("-Infinity") - ); - - let json: Value = - serde_json::from_str("{\"s\":\"str\",\"n\":42,\"f\":4.2,\"a\":[null,3,\"a\"]}") - .unwrap(); - assert_eq!( - pg_text_to_json( - Some(r#"{"s":"str","n":42,"f":4.2,"a":[null,3,"a"]}"#), - &Type::JSONB - ) - .unwrap(), - json - ); - } - - #[test] - fn test_pg_array_parse_text() { - fn pt(pg_arr: &str) -> Value { - pg_array_parse(pg_arr, &Type::TEXT).unwrap() - } - assert_eq!( - pt(r#"{"aa\"\\\,a",cha,"bbbb"}"#), - json!(["aa\"\\,a", "cha", "bbbb"]) - ); - assert_eq!( - pt(r#"{{"foo","bar"},{"bee","bop"}}"#), - json!([["foo", "bar"], ["bee", "bop"]]) - ); - assert_eq!( - pt(r#"{{{{"foo",NULL,"bop",bup}}}}"#), - json!([[[["foo", null, "bop", "bup"]]]]) - ); - assert_eq!( - pt(r#"{{"1",2,3},{4,NULL,6},{NULL,NULL,NULL}}"#), - json!([["1", "2", "3"], ["4", null, "6"], [null, null, null]]) - ); - } - - #[test] - fn test_pg_array_parse_bool() { - fn pb(pg_arr: &str) -> Value { - pg_array_parse(pg_arr, &Type::BOOL).unwrap() - } - assert_eq!(pb(r#"{t,f,t}"#), json!([true, false, true])); - assert_eq!(pb(r#"{{t,f,t}}"#), json!([[true, false, true]])); - assert_eq!( - pb(r#"{{t,f},{f,t}}"#), - json!([[true, false], [false, true]]) - ); - assert_eq!( - pb(r#"{{t,NULL},{NULL,f}}"#), - json!([[true, null], [null, false]]) - ); - } - - #[test] - fn test_pg_array_parse_numbers() { - fn pn(pg_arr: &str, ty: &Type) -> Value { - pg_array_parse(pg_arr, ty).unwrap() - } - assert_eq!(pn(r#"{1,2,3}"#, &Type::INT4), json!([1, 2, 3])); - assert_eq!(pn(r#"{1,2,3}"#, &Type::INT2), json!([1, 2, 3])); - assert_eq!(pn(r#"{1,2,3}"#, &Type::INT8), json!(["1", "2", "3"])); - assert_eq!(pn(r#"{1,2,3}"#, &Type::FLOAT4), json!([1.0, 2.0, 3.0])); - assert_eq!(pn(r#"{1,2,3}"#, &Type::FLOAT8), json!([1.0, 2.0, 3.0])); - assert_eq!( - pn(r#"{1.1,2.2,3.3}"#, &Type::FLOAT4), - json!([1.1, 2.2, 3.3]) - ); - assert_eq!( - pn(r#"{1.1,2.2,3.3}"#, &Type::FLOAT8), - json!([1.1, 2.2, 3.3]) - ); - assert_eq!( - pn(r#"{NaN,Infinity,-Infinity}"#, &Type::FLOAT4), - json!(["NaN", "Infinity", "-Infinity"]) - ); - assert_eq!( - pn(r#"{NaN,Infinity,-Infinity}"#, &Type::FLOAT8), - json!(["NaN", "Infinity", "-Infinity"]) - ); - } - - #[test] - fn test_pg_array_with_decoration() { - fn p(pg_arr: &str) -> Value { - pg_array_parse(pg_arr, &Type::INT2).unwrap() - } - 
assert_eq!( - p(r#"[1:1][-2:-1][3:5]={{{1,2,3},{4,5,6}}}"#), - json!([[[1, 2, 3], [4, 5, 6]]]) - ); - } - #[test] - fn test_pg_array_parse_json() { - fn pt(pg_arr: &str) -> Value { - pg_array_parse(pg_arr, &Type::JSONB).unwrap() - } - assert_eq!(pt(r#"{"{}"}"#), json!([{}])); - assert_eq!( - pt(r#"{"{\"foo\": 1, \"bar\": 2}"}"#), - json!([{"foo": 1, "bar": 2}]) - ); - assert_eq!( - pt(r#"{"{\"foo\": 1}", "{\"bar\": 2}"}"#), - json!([{"foo": 1}, {"bar": 2}]) - ); - assert_eq!( - pt(r#"{{"{\"foo\": 1}", "{\"bar\": 2}"}}"#), - json!([[{"foo": 1}, {"bar": 2}]]) - ); - } -} From 7e2436695decac52fd0fc5eec11441d0a7e8d407 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 2 Feb 2024 16:57:11 +0000 Subject: [PATCH 0064/1571] storage controller: use AWS Secrets Manager for database URL, etc (#6585) ## Problem Passing secrets in via CLI/environment is awkward when using helm for deployment, and not ideal for security (secrets may show up in ps, /proc). We can bypass these issues by simply connecting directly to the AWS Secrets Manager service at runtime. ## Summary of changes - Add dependency on aws-sdk-secretsmanager - Update other aws dependencies to latest, to match transitive dependency versions - Add `Secrets` type in attachment service, using AWS SDK to load if secrets are not provided on the command line. --- Cargo.lock | 242 ++++++++++--------- Cargo.toml | 11 +- control_plane/attachment_service/Cargo.toml | 2 + control_plane/attachment_service/src/main.rs | 110 ++++++++- libs/utils/src/auth.rs | 4 + workspace_hack/Cargo.toml | 2 +- 6 files changed, 249 insertions(+), 122 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ea5a29a142..90991ab0a4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -275,6 +275,8 @@ name = "attachment_service" version = "0.1.0" dependencies = [ "anyhow", + "aws-config", + "aws-sdk-secretsmanager", "camino", "clap", "control_plane", @@ -304,12 +306,11 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "aws-config" -version = "1.0.1" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80c950a809d39bc9480207cb1cfc879ace88ea7e3a4392a8e9999e45d6e5692e" +checksum = "8b30c39ebe61f75d1b3785362b1586b41991873c9ab3e317a9181c246fb71d82" dependencies = [ "aws-credential-types", - "aws-http", "aws-runtime", "aws-sdk-sso", "aws-sdk-ssooidc", @@ -324,7 +325,7 @@ dependencies = [ "bytes", "fastrand 2.0.0", "hex", - "http", + "http 0.2.9", "hyper", "ring 0.17.6", "time", @@ -335,9 +336,9 @@ dependencies = [ [[package]] name = "aws-credential-types" -version = "1.0.1" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c1317e1a3514b103cf7d5828bbab3b4d30f56bd22d684f8568bc51b6cfbbb1c" +checksum = "33cc49dcdd31c8b6e79850a179af4c367669150c7ac0135f176c61bec81a70f7" dependencies = [ "aws-smithy-async", "aws-smithy-runtime-api", @@ -345,30 +346,13 @@ dependencies = [ "zeroize", ] -[[package]] -name = "aws-http" -version = "0.60.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "361c4310fdce94328cc2d1ca0c8a48c13f43009c61d3367585685a50ca8c66b6" -dependencies = [ - "aws-smithy-runtime-api", - "aws-smithy-types", - "aws-types", - "bytes", - "http", - "http-body", - "pin-project-lite", - "tracing", -] - [[package]] name = "aws-runtime" -version = "1.0.1" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ed7ef604a15fd0d4d9e43701295161ea6b504b63c44990ead352afea2bc15e9" +checksum = 
"eb031bff99877c26c28895766f7bb8484a05e24547e370768d6cc9db514662aa" dependencies = [ "aws-credential-types", - "aws-http", "aws-sigv4", "aws-smithy-async", "aws-smithy-eventstream", @@ -376,21 +360,23 @@ dependencies = [ "aws-smithy-runtime-api", "aws-smithy-types", "aws-types", + "bytes", "fastrand 2.0.0", - "http", + "http 0.2.9", + "http-body", "percent-encoding", + "pin-project-lite", "tracing", "uuid", ] [[package]] name = "aws-sdk-s3" -version = "1.4.0" +version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9dcafc2fe52cc30b2d56685e2fa6a879ba50d79704594852112337a472ddbd24" +checksum = "951f7730f51a2155c711c85c79f337fbc02a577fa99d2a0a8059acfce5392113" dependencies = [ "aws-credential-types", - "aws-http", "aws-runtime", "aws-sigv4", "aws-smithy-async", @@ -404,23 +390,22 @@ dependencies = [ "aws-smithy-xml", "aws-types", "bytes", - "http", + "http 0.2.9", "http-body", "once_cell", "percent-encoding", - "regex", + "regex-lite", "tracing", "url", ] [[package]] -name = "aws-sdk-sso" -version = "1.3.0" +name = "aws-sdk-secretsmanager" +version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0619ab97a5ca8982e7de073cdc66f93e5f6a1b05afc09e696bec1cb3607cd4df" +checksum = "0a0b64e61e7d632d9df90a2e0f32630c68c24960cab1d27d848718180af883d3" dependencies = [ "aws-credential-types", - "aws-http", "aws-runtime", "aws-smithy-async", "aws-smithy-http", @@ -430,19 +415,42 @@ dependencies = [ "aws-smithy-types", "aws-types", "bytes", - "http", - "regex", + "fastrand 2.0.0", + "http 0.2.9", + "once_cell", + "regex-lite", + "tracing", +] + +[[package]] +name = "aws-sdk-sso" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f486420a66caad72635bc2ce0ff6581646e0d32df02aa39dc983bfe794955a5b" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "http 0.2.9", + "once_cell", + "regex-lite", "tracing", ] [[package]] name = "aws-sdk-ssooidc" -version = "1.3.0" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f04b9f5474cc0f35d829510b2ec8c21e352309b46bf9633c5a81fb9321e9b1c7" +checksum = "39ddccf01d82fce9b4a15c8ae8608211ee7db8ed13a70b514bbfe41df3d24841" dependencies = [ "aws-credential-types", - "aws-http", "aws-runtime", "aws-smithy-async", "aws-smithy-http", @@ -452,19 +460,19 @@ dependencies = [ "aws-smithy-types", "aws-types", "bytes", - "http", - "regex", + "http 0.2.9", + "once_cell", + "regex-lite", "tracing", ] [[package]] name = "aws-sdk-sts" -version = "1.3.0" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5700da387716ccfc30b27f44b008f457e1baca5b0f05b6b95455778005e3432a" +checksum = "1a591f8c7e6a621a501b2b5d2e88e1697fcb6274264523a6ad4d5959889a41ce" dependencies = [ "aws-credential-types", - "aws-http", "aws-runtime", "aws-smithy-async", "aws-smithy-http", @@ -475,16 +483,17 @@ dependencies = [ "aws-smithy-types", "aws-smithy-xml", "aws-types", - "http", - "regex", + "http 0.2.9", + "once_cell", + "regex-lite", "tracing", ] [[package]] name = "aws-sigv4" -version = "1.0.1" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "380adcc8134ad8bbdfeb2ace7626a869914ee266322965276cbc54066186d236" +checksum = "c371c6b0ac54d4605eb6f016624fb5c7c2925d315fdf600ac1bf21b19d5f1742" 
dependencies = [ "aws-credential-types", "aws-smithy-eventstream", @@ -496,11 +505,11 @@ dependencies = [ "form_urlencoded", "hex", "hmac", - "http", + "http 0.2.9", + "http 1.0.0", "once_cell", "p256", "percent-encoding", - "regex", "ring 0.17.6", "sha2", "subtle", @@ -511,9 +520,9 @@ dependencies = [ [[package]] name = "aws-smithy-async" -version = "1.0.2" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e37ca17d25fe1e210b6d4bdf59b81caebfe99f986201a1228cb5061233b4b13" +checksum = "72ee2d09cce0ef3ae526679b522835d63e75fb427aca5413cd371e490d52dcc6" dependencies = [ "futures-util", "pin-project-lite", @@ -522,9 +531,9 @@ dependencies = [ [[package]] name = "aws-smithy-checksums" -version = "0.60.0" +version = "0.60.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5a373ec01aede3dd066ec018c1bc4e8f5dd11b2c11c59c8eef1a5c68101f397" +checksum = "be2acd1b9c6ae5859999250ed5a62423aedc5cf69045b844432de15fa2f31f2b" dependencies = [ "aws-smithy-http", "aws-smithy-types", @@ -532,7 +541,7 @@ dependencies = [ "crc32c", "crc32fast", "hex", - "http", + "http 0.2.9", "http-body", "md-5", "pin-project-lite", @@ -543,9 +552,9 @@ dependencies = [ [[package]] name = "aws-smithy-eventstream" -version = "0.60.0" +version = "0.60.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c669e1e5fc0d79561bf7a122b118bd50c898758354fe2c53eb8f2d31507cbc3" +checksum = "e6363078f927f612b970edf9d1903ef5cef9a64d1e8423525ebb1f0a1633c858" dependencies = [ "aws-smithy-types", "bytes", @@ -554,9 +563,9 @@ dependencies = [ [[package]] name = "aws-smithy-http" -version = "0.60.0" +version = "0.60.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b1de8aee22f67de467b2e3d0dd0fb30859dc53f579a63bd5381766b987db644" +checksum = "dab56aea3cd9e1101a0a999447fb346afb680ab1406cebc44b32346e25b4117d" dependencies = [ "aws-smithy-eventstream", "aws-smithy-runtime-api", @@ -564,7 +573,7 @@ dependencies = [ "bytes", "bytes-utils", "futures-core", - "http", + "http 0.2.9", "http-body", "once_cell", "percent-encoding", @@ -575,18 +584,18 @@ dependencies = [ [[package]] name = "aws-smithy-json" -version = "0.60.0" +version = "0.60.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a46dd338dc9576d6a6a5b5a19bd678dcad018ececee11cf28ecd7588bd1a55c" +checksum = "fd3898ca6518f9215f62678870064398f00031912390efd03f1f6ef56d83aa8e" dependencies = [ "aws-smithy-types", ] [[package]] name = "aws-smithy-query" -version = "0.60.0" +version = "0.60.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "feb5b8c7a86d4b6399169670723b7e6f21a39fc833a30f5c5a2f997608178129" +checksum = "bda4b1dfc9810e35fba8a620e900522cd1bd4f9578c446e82f49d1ce41d2e9f9" dependencies = [ "aws-smithy-types", "urlencoding", @@ -594,9 +603,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.0.2" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "273479291efc55e7b0bce985b139d86b6031adb8e50f65c1f712f20ba38f6388" +checksum = "fafdab38f40ad7816e7da5dec279400dd505160780083759f01441af1bbb10ea" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -605,7 +614,7 @@ dependencies = [ "bytes", "fastrand 2.0.0", "h2", - "http", + "http 0.2.9", "http-body", "hyper", "hyper-rustls", @@ -619,14 +628,14 @@ dependencies = [ [[package]] name = "aws-smithy-runtime-api" -version = "1.0.2" +version = "1.1.4" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6cebff0d977b6b6feed2fd07db52aac58ba3ccaf26cdd49f1af4add5061bef9" +checksum = "c18276dd28852f34b3bf501f4f3719781f4999a51c7bff1a5c6dc8c4529adc29" dependencies = [ "aws-smithy-async", "aws-smithy-types", "bytes", - "http", + "http 0.2.9", "pin-project-lite", "tokio", "tracing", @@ -635,15 +644,15 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.0.2" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7f48b3f27ddb40ab19892a5abda331f403e3cb877965e4e51171447807104af" +checksum = "bb3e134004170d3303718baa2a4eb4ca64ee0a1c0a7041dca31b38be0fb414f3" dependencies = [ "base64-simd", "bytes", "bytes-utils", "futures-core", - "http", + "http 0.2.9", "http-body", "itoa", "num-integer", @@ -658,24 +667,24 @@ dependencies = [ [[package]] name = "aws-smithy-xml" -version = "0.60.0" +version = "0.60.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ec40d74a67fd395bc3f6b4ccbdf1543672622d905ef3f979689aea5b730cb95" +checksum = "8604a11b25e9ecaf32f9aa56b9fe253c5e2f606a3477f0071e96d3155a5ed218" dependencies = [ "xmlparser", ] [[package]] name = "aws-types" -version = "1.0.1" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8403fc56b1f3761e8efe45771ddc1165e47ec3417c68e68a4519b5cb030159ca" +checksum = "789bbe008e65636fe1b6dbbb374c40c8960d1232b96af5ff4aec349f9c4accf4" dependencies = [ "aws-credential-types", "aws-smithy-async", "aws-smithy-runtime-api", "aws-smithy-types", - "http", + "http 0.2.9", "rustc_version", "tracing", ] @@ -692,7 +701,7 @@ dependencies = [ "bitflags 1.3.2", "bytes", "futures-util", - "http", + "http 0.2.9", "http-body", "hyper", "itoa", @@ -724,7 +733,7 @@ dependencies = [ "async-trait", "bytes", "futures-util", - "http", + "http 0.2.9", "http-body", "mime", "rustversion", @@ -2003,9 +2012,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "955518d47e09b25bbebc7a18df10b81f0c766eaf4c4f1cccef2fca5f2a4fb5f2" +checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" dependencies = [ "futures-core", "futures-sink", @@ -2013,9 +2022,9 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c" +checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" [[package]] name = "futures-executor" @@ -2030,9 +2039,9 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964" +checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" [[package]] name = "futures-lite" @@ -2051,9 +2060,9 @@ dependencies = [ [[package]] name = "futures-macro" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" +checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", @@ -2062,15 +2071,15 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.28" +version = "0.3.30" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "f43be4fe21a13b9781a69afa4985b0f6ee0e1afab2c6f454a8cf30e2b2237b6e" +checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" [[package]] name = "futures-task" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76d3d132be6c0e6aa1534069c705a74a5997a356c0dc2f86a47765e5617c5b65" +checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" [[package]] name = "futures-timer" @@ -2080,9 +2089,9 @@ checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c" [[package]] name = "futures-util" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533" +checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" dependencies = [ "futures-channel", "futures-core", @@ -2186,7 +2195,7 @@ dependencies = [ "futures-core", "futures-sink", "futures-util", - "http", + "http 0.2.9", "indexmap 2.0.1", "slab", "tokio", @@ -2337,6 +2346,17 @@ dependencies = [ "itoa", ] +[[package]] +name = "http" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b32afd38673a8016f7c9ae69e5af41a58f81b1d31689040f2f1959594ce194ea" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + [[package]] name = "http-body" version = "0.4.5" @@ -2344,7 +2364,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d5f38f16d184e36f2408a55281cd658ecbd3ca05cce6d6510a176eca393e26d1" dependencies = [ "bytes", - "http", + "http 0.2.9", "pin-project-lite", ] @@ -2407,7 +2427,7 @@ dependencies = [ "futures-core", "futures-util", "h2", - "http", + "http 0.2.9", "http-body", "httparse", "httpdate", @@ -2426,7 +2446,7 @@ version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0646026eb1b3eea4cd9ba47912ea5ce9cc07713d105b1a14698f4e6433d348b7" dependencies = [ - "http", + "http 0.2.9", "hyper", "log", "rustls", @@ -3108,7 +3128,7 @@ dependencies = [ "base64 0.13.1", "chrono", "getrandom 0.2.11", - "http", + "http 0.2.9", "rand 0.8.5", "serde", "serde_json", @@ -3210,7 +3230,7 @@ checksum = "c7594ec0e11d8e33faf03530a4c49af7064ebba81c1480e01be67d90b356508b" dependencies = [ "async-trait", "bytes", - "http", + "http 0.2.9", "opentelemetry_api", "reqwest", ] @@ -3223,7 +3243,7 @@ checksum = "7e5e5a5c4135864099f3faafbe939eb4d7f9b80ebf68a8448da961b32a7c1275" dependencies = [ "async-trait", "futures-core", - "http", + "http 0.2.9", "opentelemetry-http", "opentelemetry-proto", "opentelemetry-semantic-conventions", @@ -4323,6 +4343,12 @@ dependencies = [ "regex-syntax 0.8.2", ] +[[package]] +name = "regex-lite" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30b661b2f27137bdbc16f00eda72866a92bb28af1753ffbd56744fb6e2e9cd8e" + [[package]] name = "regex-syntax" version = "0.6.29" @@ -4392,7 +4418,7 @@ dependencies = [ "futures-core", "futures-util", "h2", - "http", + "http 0.2.9", "http-body", "hyper", "hyper-rustls", @@ -4433,7 +4459,7 @@ checksum = "4531c89d50effe1fac90d095c8b133c20c5c714204feee0bfc3fd158e784209d" dependencies = [ "anyhow", "async-trait", - "http", + "http 0.2.9", "reqwest", "serde", "task-local-extensions", @@ -4451,7 +4477,7 @@ dependencies = [ "chrono", "futures", "getrandom 0.2.11", - "http", + "http 0.2.9", "hyper", "parking_lot 0.11.2", "reqwest", @@ 
-4538,7 +4564,7 @@ version = "3.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "496c1d3718081c45ba9c31fbfc07417900aa96f4070ff90dc29961836b7a9945" dependencies = [ - "http", + "http 0.2.9", "hyper", "lazy_static", "percent-encoding", @@ -5868,7 +5894,7 @@ dependencies = [ "futures-core", "futures-util", "h2", - "http", + "http 0.2.9", "http-body", "hyper", "hyper-timeout", @@ -6083,7 +6109,7 @@ dependencies = [ "byteorder", "bytes", "data-encoding", - "http", + "http 0.2.9", "httparse", "log", "rand 0.8.5", diff --git a/Cargo.toml b/Cargo.toml index d3006985ab..0cfe522ff9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -48,11 +48,12 @@ azure_storage_blobs = "0.18" flate2 = "1.0.26" async-stream = "0.3" async-trait = "0.1" -aws-config = { version = "1.0", default-features = false, features=["rustls"] } -aws-sdk-s3 = "1.0" -aws-smithy-async = { version = "1.0", default-features = false, features=["rt-tokio"] } -aws-smithy-types = "1.0" -aws-credential-types = "1.0" +aws-config = { version = "1.1.4", default-features = false, features=["rustls"] } +aws-sdk-s3 = "1.14" +aws-sdk-secretsmanager = { version = "1.14.0" } +aws-smithy-async = { version = "1.1.4", default-features = false, features=["rt-tokio"] } +aws-smithy-types = "1.1.4" +aws-credential-types = "1.1.4" axum = { version = "0.6.20", features = ["ws"] } base64 = "0.13.0" bincode = "1.3" diff --git a/control_plane/attachment_service/Cargo.toml b/control_plane/attachment_service/Cargo.toml index 210a898747..1d3831eea0 100644 --- a/control_plane/attachment_service/Cargo.toml +++ b/control_plane/attachment_service/Cargo.toml @@ -6,6 +6,8 @@ license.workspace = true [dependencies] anyhow.workspace = true +aws-config.workspace = true +aws-sdk-secretsmanager.workspace = true camino.workspace = true clap.workspace = true futures.workspace = true diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs index 7c716a9f53..ed65437ba2 100644 --- a/control_plane/attachment_service/src/main.rs +++ b/control_plane/attachment_service/src/main.rs @@ -8,6 +8,7 @@ use anyhow::anyhow; use attachment_service::http::make_router; use attachment_service::persistence::Persistence; use attachment_service::service::{Config, Service}; +use aws_config::{self, BehaviorVersion, Region}; use camino::Utf8PathBuf; use clap::Parser; use metrics::launch_timestamp::LaunchTimestamp; @@ -46,6 +47,100 @@ struct Cli { database_url: String, } +/// Secrets may either be provided on the command line (for testing), or loaded from AWS SecretManager: this +/// type encapsulates the logic to decide which and do the loading. 
+struct Secrets { + database_url: String, + public_key: Option, + jwt_token: Option, +} + +impl Secrets { + const DATABASE_URL_SECRET: &'static str = "rds-neon-storage-controller-url"; + const JWT_TOKEN_SECRET: &'static str = "neon-storage-controller-pageserver-jwt-token"; + const PUBLIC_KEY_SECRET: &'static str = "neon-storage-controller-public-key"; + + async fn load(args: &Cli) -> anyhow::Result { + if args.database_url.is_empty() { + Self::load_aws_sm().await + } else { + Self::load_cli(args) + } + } + + async fn load_aws_sm() -> anyhow::Result { + let Ok(region) = std::env::var("AWS_REGION") else { + anyhow::bail!("AWS_REGION is not set, cannot load secrets automatically: either set this, or use CLI args to supply secrets"); + }; + let config = aws_config::defaults(BehaviorVersion::v2023_11_09()) + .region(Region::new(region.clone())) + .load() + .await; + + let asm = aws_sdk_secretsmanager::Client::new(&config); + + let Some(database_url) = asm + .get_secret_value() + .secret_id(Self::DATABASE_URL_SECRET) + .send() + .await? + .secret_string() + .map(str::to_string) + else { + anyhow::bail!( + "Database URL secret not found at {region}/{}", + Self::DATABASE_URL_SECRET + ) + }; + + let jwt_token = asm + .get_secret_value() + .secret_id(Self::JWT_TOKEN_SECRET) + .send() + .await? + .secret_string() + .map(str::to_string); + if jwt_token.is_none() { + tracing::warn!("No pageserver JWT token set: this will only work if authentication is disabled on the pageserver"); + } + + let public_key = asm + .get_secret_value() + .secret_id(Self::PUBLIC_KEY_SECRET) + .send() + .await? + .secret_string() + .map(str::to_string); + let public_key = match public_key { + Some(key) => Some(JwtAuth::from_key(key)?), + None => { + tracing::warn!( + "No public key set: inccoming HTTP requests will not be authenticated" + ); + None + } + }; + + Ok(Self { + database_url, + public_key, + jwt_token, + }) + } + + fn load_cli(args: &Cli) -> anyhow::Result { + let public_key = match &args.public_key { + None => None, + Some(key_path) => Some(JwtAuth::from_key_path(key_path)?), + }; + Ok(Self { + database_url: args.database_url.clone(), + public_key, + jwt_token: args.jwt_token.clone(), + }) + } +} + #[tokio::main] async fn main() -> anyhow::Result<()> { let launch_ts = Box::leak(Box::new(LaunchTimestamp::generate())); @@ -66,23 +161,22 @@ async fn main() -> anyhow::Result<()> { args.listen ); + let secrets = Secrets::load(&args).await?; + let config = Config { - jwt_token: args.jwt_token, + jwt_token: secrets.jwt_token, }; let json_path = args.path; - let persistence = Arc::new(Persistence::new(args.database_url, json_path.clone())); + let persistence = Arc::new(Persistence::new(secrets.database_url, json_path.clone())); let service = Service::spawn(config, persistence.clone()).await?; let http_listener = tcp_listener::bind(args.listen)?; - let auth = if let Some(public_key_path) = &args.public_key { - let jwt_auth = JwtAuth::from_key_path(public_key_path)?; - Some(Arc::new(SwappableJwtAuth::new(jwt_auth))) - } else { - None - }; + let auth = secrets + .public_key + .map(|jwt_auth| Arc::new(SwappableJwtAuth::new(jwt_auth))); let router = make_router(service, auth) .build() .map_err(|err| anyhow!(err))?; diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index 66b1f6e866..15c3f2af1b 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -127,6 +127,10 @@ impl JwtAuth { Ok(Self::new(decoding_keys)) } + pub fn from_key(key: String) -> Result { + 
Ok(Self::new(vec![DecodingKey::from_ed_pem(key.as_bytes())?])) + } + /// Attempt to decode the token with the internal decoding keys. /// /// The function tries the stored decoding keys in succession, diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 8fd49956cc..f58b912a77 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -15,7 +15,7 @@ publish = false [dependencies] anyhow = { version = "1", features = ["backtrace"] } aws-config = { version = "1", default-features = false, features = ["rustls", "sso"] } -aws-runtime = { version = "1", default-features = false, features = ["event-stream", "sigv4a"] } +aws-runtime = { version = "1", default-features = false, features = ["event-stream", "http-02x", "sigv4a"] } aws-sigv4 = { version = "1", features = ["http0-compat", "sign-eventstream", "sigv4a"] } aws-smithy-async = { version = "1", default-features = false, features = ["rt-tokio"] } aws-smithy-http = { version = "0.60", default-features = false, features = ["event-stream"] } From caf868e27481017f19e19d70b4d84495eeb7d07c Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 2 Feb 2024 19:46:47 +0200 Subject: [PATCH 0065/1571] test: assert we eventually free space (#6536) in `test_statvfs_pressure_{usage,min_avail_bytes}` we now race against initial logical size calculation on-demand downloading the layers. first wait out the initial logical sizes, then change the final asserts to be "eventual", which is not great but it is faster than failing and retrying. this issue seems to happen only in debug mode tests. Fixes: #6510 --- test_runner/fixtures/pageserver/http.py | 13 ++++++++ .../regress/test_disk_usage_eviction.py | 31 ++++++++++++++---- test_runner/regress/test_timeline_size.py | 32 +++---------------- 3 files changed, 43 insertions(+), 33 deletions(-) diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 1a8765d830..92e5027a9f 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -831,3 +831,16 @@ class PageserverHttpClient(requests.Session): self.put( f"http://localhost:{self.port}/v1/deletion_queue/flush?execute={'true' if execute else 'false'}" ).raise_for_status() + + def timeline_wait_logical_size(self, tenant_id: TenantId, timeline_id: TimelineId) -> int: + detail = self.timeline_detail( + tenant_id, + timeline_id, + include_non_incremental_logical_size=True, + force_await_initial_logical_size=True, + ) + current_logical_size = detail["current_logical_size"] + non_incremental = detail["current_logical_size_non_incremental"] + assert current_logical_size == non_incremental + assert isinstance(current_logical_size, int) + return current_logical_size diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 6a4f0edbea..dcbf8a5025 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -155,6 +155,15 @@ class EvictionEnv: mock_behavior, eviction_order: EvictionOrder, ): + """ + Starts pageserver up with mocked statvfs setup. The startup is + problematic because of dueling initial logical size calculations + requiring layers and disk usage based task evicting. + + Returns after initial logical sizes are complete, but the phase of disk + usage eviction task is unknown; it might need to run one more iteration + before assertions can be made. 
+ """ disk_usage_config = { "period": period, "max_usage_pct": max_usage_pct, @@ -183,9 +192,15 @@ class EvictionEnv: ), ) + # we now do initial logical size calculation on startup, which on debug builds can fight with disk usage based eviction + for tenant_id, timeline_id in self.timelines: + pageserver_http = self.neon_env.get_tenant_pageserver(tenant_id).http_client() + pageserver_http.timeline_wait_logical_size(tenant_id, timeline_id) + def statvfs_called(): assert pageserver.log_contains(".*running mocked statvfs.*") + # we most likely have already completed multiple runs wait_until(10, 1, statvfs_called) @@ -789,9 +804,11 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv): wait_until(10, 1, relieved_log_message) - post_eviction_total_size, _, _ = env.timelines_du(env.pageserver) + def less_than_max_usage_pct(): + post_eviction_total_size, _, _ = env.timelines_du(env.pageserver) + assert post_eviction_total_size < 0.33 * total_size, "we requested max 33% usage" - assert post_eviction_total_size <= 0.33 * total_size, "we requested max 33% usage" + wait_until(2, 2, less_than_max_usage_pct) def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv): @@ -831,11 +848,13 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv): wait_until(10, 1, relieved_log_message) - post_eviction_total_size, _, _ = env.timelines_du(env.pageserver) + def more_than_min_avail_bytes_freed(): + post_eviction_total_size, _, _ = env.timelines_du(env.pageserver) + assert ( + total_size - post_eviction_total_size >= min_avail_bytes + ), f"we requested at least {min_avail_bytes} worth of free space" - assert ( - total_size - post_eviction_total_size >= min_avail_bytes - ), "we requested at least min_avail_bytes worth of free space" + wait_until(2, 2, more_than_min_avail_bytes_freed) def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv): diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 4c5cb32caa..303aabb58d 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -20,7 +20,7 @@ from fixtures.neon_fixtures import ( VanillaPostgres, wait_for_last_flush_lsn, ) -from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient +from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import ( assert_tenant_state, timeline_delete_wait_completed, @@ -40,7 +40,7 @@ def test_timeline_size(neon_simple_env: NeonEnv): new_timeline_id = env.neon_cli.create_branch("test_timeline_size", "empty") client = env.pageserver.http_client() - wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id) + client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id) endpoint_main = env.endpoints.create_start("test_timeline_size") log.info("postgres is running on 'test_timeline_size' branch") @@ -73,7 +73,7 @@ def test_timeline_size_createdropdb(neon_simple_env: NeonEnv): new_timeline_id = env.neon_cli.create_branch("test_timeline_size_createdropdb", "empty") client = env.pageserver.http_client() - wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id) + client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id) timeline_details = client.timeline_detail( env.initial_tenant, new_timeline_id, include_non_incremental_logical_size=True ) @@ -153,7 +153,7 @@ def test_timeline_size_quota_on_startup(neon_env_builder: NeonEnvBuilder): client = 
env.pageserver.http_client() new_timeline_id = env.neon_cli.create_branch("test_timeline_size_quota_on_startup") - wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id) + client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id) endpoint_main = env.endpoints.create( "test_timeline_size_quota_on_startup", @@ -219,7 +219,7 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder): client = env.pageserver.http_client() new_timeline_id = env.neon_cli.create_branch("test_timeline_size_quota") - wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id) + client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id) endpoint_main = env.endpoints.create( "test_timeline_size_quota", @@ -715,28 +715,6 @@ def assert_physical_size_invariants(sizes: TimelinePhysicalSizeValues): # XXX would be nice to assert layer file physical storage utilization here as well, but we can only do that for LocalFS -# Timeline logical size initialization is an asynchronous background task that runs once, -# try a few times to ensure it's activated properly -def wait_for_timeline_size_init( - client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId -): - for i in range(10): - timeline_details = client.timeline_detail( - tenant, timeline, include_non_incremental_logical_size=True - ) - current_logical_size = timeline_details["current_logical_size"] - non_incremental = timeline_details["current_logical_size_non_incremental"] - if current_logical_size == non_incremental: - return - log.info( - f"waiting for current_logical_size of a timeline to be calculated, iteration {i}: {current_logical_size} vs {non_incremental}" - ) - time.sleep(1) - raise Exception( - f"timed out while waiting for current_logical_size of a timeline to reach its non-incremental value, details: {timeline_details}" - ) - - def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): """ Tenants warmuping up opportunistically will wait for one another's logical size calculations to complete From 2e5eab69c6161bfbf380df355f1ab195171d8601 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 2 Feb 2024 18:20:18 +0000 Subject: [PATCH 0066/1571] tests: remove test_gc_cutoff (#6587) This test became flaky when postgres retry handling was fixed to use backoff delays -- each iteration in this test's loop was taking much longer because pgbench doesn't fail until postgres has given up on retrying to the pageserver. We are just removing it, because the condition it tests is no longer risky: we reload all metadata from remote storage on restart, so crashing directly between making local changes and doing remote uploads isn't interesting any more. 
Closes: https://github.com/neondatabase/neon/issues/2856 Closes: https://github.com/neondatabase/neon/issues/5329 --- pageserver/src/tenant/timeline.rs | 4 --- test_runner/regress/test_gc_cutoff.py | 47 --------------------------- 2 files changed, 51 deletions(-) delete mode 100644 test_runner/regress/test_gc_cutoff.py diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index e779f6f32e..0ffe0b6418 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4388,10 +4388,6 @@ impl Timeline { guard.finish_gc_timeline(&gc_layers); - if result.layers_removed != 0 { - fail_point!("after-timeline-gc-removed-layers"); - } - #[cfg(feature = "testing")] { result.doomed_layers = gc_layers; diff --git a/test_runner/regress/test_gc_cutoff.py b/test_runner/regress/test_gc_cutoff.py deleted file mode 100644 index 284a8c3563..0000000000 --- a/test_runner/regress/test_gc_cutoff.py +++ /dev/null @@ -1,47 +0,0 @@ -import subprocess - -import pytest -from fixtures.neon_fixtures import NeonEnvBuilder, PgBin - - -# Test gc_cutoff -# -# This test sets fail point at the end of GC, and checks that pageserver -# normally restarts after it. Also, there should be GC ERRORs in the log, -# but the fixture checks the log for any unexpected ERRORs after every -# test anyway, so it doesn't need any special attention here. -@pytest.mark.timeout(600) -def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): - env = neon_env_builder.init_start( - initial_tenant_conf={ - "gc_period": "10 s", - "gc_horizon": f"{1024 ** 2}", - "checkpoint_distance": f"{1024 ** 2}", - "compaction_period": "5 s", - # set PITR interval to be small, so we can do GC - "pitr_interval": "1 s", - "compaction_threshold": "3", - "image_creation_threshold": "2", - } - ) - - pageserver_http = env.pageserver.http_client() - - # Use aggressive GC and checkpoint settings, so that we also exercise GC during the test - tenant_id = env.initial_tenant - endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) - connstr = endpoint.connstr(options="-csynchronous_commit=off") - pg_bin.run_capture(["pgbench", "-i", "-s10", connstr]) - - pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit")) - - # Because this test does a rapid series of restarts of the same node, it's possible that - # we are restarted again before we can clean up deletion lists form the previous generation, - # resulting in a subsequent startup logging a warning. - env.pageserver.allowed_errors.append(".*Dropping stale deletions for tenant.*") - - for _ in range(5): - with pytest.raises(subprocess.SubprocessError): - pg_bin.run_capture(["pgbench", "-P1", "-N", "-c5", "-T500", "-Mprepared", connstr]) - env.pageserver.stop() - env.pageserver.start(extra_env_vars={"FAILPOINTS": "after-timeline-gc-removed-layers=exit"}) From 0b91edb943169ad7804fe337ed3d2a5f64f93b98 Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Fri, 2 Feb 2024 19:36:31 +0100 Subject: [PATCH 0067/1571] Revert pgvector 0.6.0 (#6592) It doesn't work in our VMs. 
Need more time to investigate --- .dockerignore | 25 +++++++++-------- Dockerfile.compute-node | 7 ++--- patches/pgvector.patch | 60 ----------------------------------------- 3 files changed, 14 insertions(+), 78 deletions(-) delete mode 100644 patches/pgvector.patch diff --git a/.dockerignore b/.dockerignore index 29abdc37aa..ae0ad8fd77 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,28 +1,27 @@ * -# Files -!Cargo.lock -!Cargo.toml -!Makefile !rust-toolchain.toml -!scripts/combine_control_files.py -!scripts/ninstall.sh -!vm-cgconfig.conf +!Cargo.toml +!Cargo.lock +!Makefile -# Directories !.cargo/ !.config/ -!compute_tools/ !control_plane/ +!compute_tools/ !libs/ -!neon_local/ !pageserver/ -!patches/ !pgxn/ !proxy/ -!s3_scrubber/ !safekeeper/ +!s3_scrubber/ !storage_broker/ !trace/ -!vendor/postgres-*/ +!vendor/postgres-v14/ +!vendor/postgres-v15/ +!vendor/postgres-v16/ !workspace_hack/ +!neon_local/ +!scripts/ninstall.sh +!scripts/combine_control_files.py +!vm-cgconfig.conf diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index b13225172d..d91c7cfd72 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -241,12 +241,9 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz - FROM build-deps AS vector-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY patches/pgvector.patch /pgvector.patch - -RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.6.0.tar.gz -O pgvector.tar.gz && \ - echo "b0cf4ba1ab016335ac8fb1cada0d2106235889a194fffeece217c5bda90b2f19 pgvector.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.1.tar.gz -O pgvector.tar.gz && \ + echo "cc7a8e034a96e30a819911ac79d32f6bc47bdd1aa2de4d7d4904e26b83209dc8 pgvector.tar.gz" | sha256sum --check && \ mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \ - patch -p1 < /pgvector.patch && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control diff --git a/patches/pgvector.patch b/patches/pgvector.patch deleted file mode 100644 index cc1ca2e3a6..0000000000 --- a/patches/pgvector.patch +++ /dev/null @@ -1,60 +0,0 @@ -From de3dd0cd034d2bcc12b456171ce163bdc1f4cb65 Mon Sep 17 00:00:00 2001 -From: Heikki Linnakangas -Date: Thu, 1 Feb 2024 17:42:31 +0200 -Subject: [PATCH 1/1] Make v0.6.0 work with Neon - -Now that the WAL-logging happens as a separate step at the end of the -build, we need a few neon-specific hints to make it work. 
---- - src/hnswbuild.c | 28 ++++++++++++++++++++++++++++ - 1 file changed, 28 insertions(+) - -diff --git a/src/hnswbuild.c b/src/hnswbuild.c -index 680789b..bfa657a 100644 ---- a/src/hnswbuild.c -+++ b/src/hnswbuild.c -@@ -1089,13 +1089,41 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, - SeedRandom(42); - #endif - -+#ifdef NEON_SMGR -+ smgr_start_unlogged_build(RelationGetSmgr(index)); -+#endif -+ - InitBuildState(buildstate, heap, index, indexInfo, forkNum); - - BuildGraph(buildstate, forkNum); - -+#ifdef NEON_SMGR -+ smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index)); -+#endif -+ - if (RelationNeedsWAL(index)) -+ { - log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocks(index), true); - -+#ifdef NEON_SMGR -+ { -+#if PG_VERSION_NUM >= 160000 -+ RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator; -+#else -+ RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node; -+#endif -+ -+ SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator, -+ MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index)); -+ SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM); -+ } -+#endif -+ } -+ -+#ifdef NEON_SMGR -+ smgr_end_unlogged_build(RelationGetSmgr(index)); -+#endif -+ - FreeBuildState(buildstate); - } - --- -2.39.2 - From 786e9cf75ba482e67b7e7e0626fac21b1696c761 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 2 Feb 2024 19:22:03 +0000 Subject: [PATCH 0068/1571] control_plane: implement HTTP compute hook for attachment service (#6471) ## Problem When we change which physical pageservers a tenant is attached to, we must update the control plane so that it can update computes. This will be done via an HTTP hook, as described in https://www.notion.so/neondatabase/Sharding-Service-Control-Plane-interface-6de56dd310a043bfa5c2f5564fa98365#1fe185a35d6d41f0a54279ac1a41bc94 ## Summary of changes - Optional CLI args `--control-plane-jwt-token` and `-compute-hook-url` are added. If these are set, then we will use this HTTP endpoint, instead of trying to use neon_local LocalEnv to update compute configuration. - Implement an HTTP-driven version of ComputeHook that calls into the configured URL - Notify for all tenants on startup, to ensure that we don't miss notifications if we crash partway through a change, and carry a `pending_compute_notification` flag at runtime to allow notifications to fail without risking never sending the update. - Add a test for all this One might wonder: why not do a "forever" retry for compute hook notifications, rather than carrying a flag on the shard to call reconcile() again later. The reason is that we will later limit concurreny of reconciles, when dealing with larger numbers of shards, and if reconcile is stuck waiting for the control plane to accept a notification request, it could jam up the whole system and prevent us making other changes. Anyway: from the perspective of the outside world, we _do_ retry forever, but we don't retry forever within a given Reconciler lifetime. The `pending_compute_notification` logic is predicated on later adding a background task that just calls `Service::reconcile_all` on a schedule to make sure that anything+everything that can fail a Reconciler::reconcile call will eventually be retried. 
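As a rough illustration of the retry strategy described above (record the failure on the shard and let a later reconciliation pass retry, rather than blocking the reconciler), here is a minimal, self-contained Rust sketch. The names `ShardState`, `notify_compute` and `reconcile` are hypothetical stand-ins for the real service types; only the `pending_compute_notification` flag mirrors the field introduced by this patch.

```rust
// Minimal sketch of the "remember the failure, retry on the next pass"
// pattern; not the actual attachment service code.
struct ShardState {
    attached_to: Option<u64>,           // pageserver node id, if any
    pending_compute_notification: bool, // set when a notify attempt failed
}

// Stand-in for the HTTP compute hook call, which may fail transiently.
fn notify_compute(_node: u64) -> Result<(), &'static str> {
    Err("control plane unavailable")
}

fn reconcile(shard: &mut ShardState) {
    if let Some(node) = shard.attached_to {
        match notify_compute(node) {
            Ok(()) => shard.pending_compute_notification = false,
            Err(e) => {
                // Don't retry forever here: record the failure so a later
                // reconcile_all()-style pass picks it up again.
                eprintln!("compute notification failed: {e}");
                shard.pending_compute_notification = true;
            }
        }
    }
}

fn main() {
    let mut shard = ShardState {
        attached_to: Some(1),
        pending_compute_notification: false,
    };
    reconcile(&mut shard);
    assert!(shard.pending_compute_notification);
}
```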
--- Cargo.lock | 1 + control_plane/attachment_service/Cargo.toml | 1 + .../attachment_service/src/compute_hook.rs | 286 +++++++++++++++--- control_plane/attachment_service/src/main.rs | 34 ++- .../attachment_service/src/reconciler.rs | 63 +++- .../attachment_service/src/service.rs | 86 +++++- .../attachment_service/src/tenant_state.rs | 60 ++++ control_plane/src/attachment_service.rs | 6 + control_plane/src/bin/neon_local.rs | 2 +- control_plane/src/endpoint.rs | 34 ++- control_plane/src/local_env.rs | 7 +- test_runner/fixtures/neon_fixtures.py | 9 +- test_runner/regress/test_sharding_service.py | 101 ++++++- 13 files changed, 600 insertions(+), 90 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 90991ab0a4..02450709d1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -288,6 +288,7 @@ dependencies = [ "pageserver_api", "pageserver_client", "postgres_connection", + "reqwest", "serde", "serde_json", "thiserror", diff --git a/control_plane/attachment_service/Cargo.toml b/control_plane/attachment_service/Cargo.toml index 1d3831eea0..d3c62d74d2 100644 --- a/control_plane/attachment_service/Cargo.toml +++ b/control_plane/attachment_service/Cargo.toml @@ -16,6 +16,7 @@ hyper.workspace = true pageserver_api.workspace = true pageserver_client.workspace = true postgres_connection.workspace = true +reqwest.workspace = true serde.workspace = true serde_json.workspace = true thiserror.workspace = true diff --git a/control_plane/attachment_service/src/compute_hook.rs b/control_plane/attachment_service/src/compute_hook.rs index 02617cd065..9c1185f259 100644 --- a/control_plane/attachment_service/src/compute_hook.rs +++ b/control_plane/attachment_service/src/compute_hook.rs @@ -1,24 +1,76 @@ -use std::collections::HashMap; +use std::{collections::HashMap, time::Duration}; -use control_plane::endpoint::ComputeControlPlane; +use control_plane::endpoint::{ComputeControlPlane, EndpointStatus}; use control_plane::local_env::LocalEnv; -use pageserver_api::shard::{ShardCount, ShardIndex, TenantShardId}; +use hyper::{Method, StatusCode}; +use pageserver_api::shard::{ShardCount, ShardIndex, ShardNumber, TenantShardId}; use postgres_connection::parse_host_port; -use utils::id::{NodeId, TenantId}; +use serde::{Deserialize, Serialize}; +use tokio_util::sync::CancellationToken; +use utils::{ + backoff::{self}, + id::{NodeId, TenantId}, +}; + +use crate::service::Config; + +const BUSY_DELAY: Duration = Duration::from_secs(1); +const SLOWDOWN_DELAY: Duration = Duration::from_secs(5); + +pub(crate) const API_CONCURRENCY: usize = 32; pub(super) struct ComputeHookTenant { shards: Vec<(ShardIndex, NodeId)>, } +#[derive(Serialize, Deserialize, Debug)] +struct ComputeHookNotifyRequestShard { + node_id: NodeId, + shard_number: ShardNumber, +} + +/// Request body that we send to the control plane to notify it of where a tenant is attached +#[derive(Serialize, Deserialize, Debug)] +struct ComputeHookNotifyRequest { + tenant_id: TenantId, + shards: Vec, +} + +/// Error type for attempts to call into the control plane compute notification hook +#[derive(thiserror::Error, Debug)] +pub(crate) enum NotifyError { + // Request was not send successfully, e.g. transport error + #[error("Sending request: {0}")] + Request(#[from] reqwest::Error), + // Request could not be serviced right now due to ongoing Operation in control plane, but should be possible soon. 
+ #[error("Control plane tenant busy")] + Busy, + // Explicit 429 response asking us to retry less frequently + #[error("Control plane overloaded")] + SlowDown, + // A 503 response indicates the control plane can't handle the request right now + #[error("Control plane unavailable (status {0})")] + Unavailable(StatusCode), + // API returned unexpected non-success status. We will retry, but log a warning. + #[error("Control plane returned unexpected status {0}")] + Unexpected(StatusCode), + // We shutdown while sending + #[error("Shutting down")] + ShuttingDown, + // A response indicates we will never succeed, such as 400 or 404 + #[error("Non-retryable error {0}")] + Fatal(StatusCode), +} + impl ComputeHookTenant { - pub(super) async fn maybe_reconfigure(&mut self, tenant_id: TenantId) -> anyhow::Result<()> { + async fn maybe_reconfigure(&mut self, tenant_id: TenantId) -> Option { // Find the highest shard count and drop any shards that aren't // for that shard count. let shard_count = self.shards.iter().map(|(k, _v)| k.shard_count).max(); let Some(shard_count) = shard_count else { // No shards, nothing to do. tracing::info!("ComputeHookTenant::maybe_reconfigure: no shards"); - return Ok(()); + return None; }; self.shards.retain(|(k, _v)| k.shard_count == shard_count); @@ -26,38 +78,18 @@ impl ComputeHookTenant { .sort_by_key(|(shard, _node_id)| shard.shard_number); if self.shards.len() == shard_count.0 as usize || shard_count == ShardCount(0) { - // We have pageservers for all the shards: proceed to reconfigure compute - let env = match LocalEnv::load_config() { - Ok(e) => e, - Err(e) => { - tracing::warn!( - "Couldn't load neon_local config, skipping compute update ({e})" - ); - return Ok(()); - } - }; - let cplane = ComputeControlPlane::load(env.clone()) - .expect("Error loading compute control plane"); - - let compute_pageservers = self - .shards - .iter() - .map(|(_shard, node_id)| { - let ps_conf = env - .get_pageserver_conf(*node_id) - .expect("Unknown pageserver"); - let (pg_host, pg_port) = parse_host_port(&ps_conf.listen_pg_addr) - .expect("Unable to parse listen_pg_addr"); - (pg_host, pg_port.unwrap_or(5432)) - }) - .collect::>(); - - for (endpoint_name, endpoint) in &cplane.endpoints { - if endpoint.tenant_id == tenant_id && endpoint.status() == "running" { - tracing::info!("🔁 Reconfiguring endpoint {}", endpoint_name,); - endpoint.reconfigure(compute_pageservers.clone()).await?; - } - } + // We have pageservers for all the shards: emit a configuration update + return Some(ComputeHookNotifyRequest { + tenant_id, + shards: self + .shards + .iter() + .map(|(shard, node_id)| ComputeHookNotifyRequestShard { + shard_number: shard.shard_number, + node_id: *node_id, + }) + .collect(), + }); } else { tracing::info!( "ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})", @@ -66,7 +98,7 @@ impl ComputeHookTenant { ); } - Ok(()) + None } } @@ -74,22 +106,171 @@ impl ComputeHookTenant { /// mapping. It aggregates updates for the shards in a tenant, and when appropriate reconfigures /// the compute connection string. 
pub(super) struct ComputeHook { + config: Config, state: tokio::sync::Mutex>, + authorization_header: Option, } impl ComputeHook { - pub(super) fn new() -> Self { + pub(super) fn new(config: Config) -> Self { + let authorization_header = config + .control_plane_jwt_token + .clone() + .map(|jwt| format!("Bearer {}", jwt)); + Self { state: Default::default(), + config, + authorization_header, } } + /// For test environments: use neon_local's LocalEnv to update compute + async fn do_notify_local( + &self, + reconfigure_request: ComputeHookNotifyRequest, + ) -> anyhow::Result<()> { + let env = match LocalEnv::load_config() { + Ok(e) => e, + Err(e) => { + tracing::warn!("Couldn't load neon_local config, skipping compute update ({e})"); + return Ok(()); + } + }; + let cplane = + ComputeControlPlane::load(env.clone()).expect("Error loading compute control plane"); + let ComputeHookNotifyRequest { tenant_id, shards } = reconfigure_request; + + let compute_pageservers = shards + .into_iter() + .map(|shard| { + let ps_conf = env + .get_pageserver_conf(shard.node_id) + .expect("Unknown pageserver"); + let (pg_host, pg_port) = parse_host_port(&ps_conf.listen_pg_addr) + .expect("Unable to parse listen_pg_addr"); + (pg_host, pg_port.unwrap_or(5432)) + }) + .collect::>(); + + for (endpoint_name, endpoint) in &cplane.endpoints { + if endpoint.tenant_id == tenant_id && endpoint.status() == EndpointStatus::Running { + tracing::info!("🔁 Reconfiguring endpoint {}", endpoint_name,); + endpoint.reconfigure(compute_pageservers.clone()).await?; + } + } + + Ok(()) + } + + async fn do_notify_iteration( + &self, + client: &reqwest::Client, + url: &String, + reconfigure_request: &ComputeHookNotifyRequest, + cancel: &CancellationToken, + ) -> Result<(), NotifyError> { + let req = client.request(Method::POST, url); + let req = if let Some(value) = &self.authorization_header { + req.header(reqwest::header::AUTHORIZATION, value) + } else { + req + }; + + tracing::debug!( + "Sending notify request to {} ({:?})", + url, + reconfigure_request + ); + let send_result = req.json(&reconfigure_request).send().await; + let response = match send_result { + Ok(r) => r, + Err(e) => return Err(e.into()), + }; + + // Treat all 2xx responses as success + if response.status() >= StatusCode::OK && response.status() < StatusCode::MULTIPLE_CHOICES { + if response.status() != StatusCode::OK { + // Non-200 2xx response: it doesn't make sense to retry, but this is unexpected, so + // log a warning. + tracing::warn!( + "Unexpected 2xx response code {} from control plane", + response.status() + ); + } + + return Ok(()); + } + + // Error response codes + match response.status() { + StatusCode::TOO_MANY_REQUESTS => { + // TODO: 429 handling should be global: set some state visible to other requests + // so that they will delay before starting, rather than all notifications trying + // once before backing off. 
+ tokio::time::timeout(SLOWDOWN_DELAY, cancel.cancelled()) + .await + .ok(); + Err(NotifyError::SlowDown) + } + StatusCode::LOCKED => { + // Delay our retry if busy: the usual fast exponential backoff in backoff::retry + // is not appropriate + tokio::time::timeout(BUSY_DELAY, cancel.cancelled()) + .await + .ok(); + Err(NotifyError::Busy) + } + StatusCode::SERVICE_UNAVAILABLE + | StatusCode::GATEWAY_TIMEOUT + | StatusCode::BAD_GATEWAY => Err(NotifyError::Unavailable(response.status())), + StatusCode::BAD_REQUEST | StatusCode::UNAUTHORIZED | StatusCode::FORBIDDEN => { + Err(NotifyError::Fatal(response.status())) + } + _ => Err(NotifyError::Unexpected(response.status())), + } + } + + async fn do_notify( + &self, + url: &String, + reconfigure_request: ComputeHookNotifyRequest, + cancel: &CancellationToken, + ) -> Result<(), NotifyError> { + let client = reqwest::Client::new(); + backoff::retry( + || self.do_notify_iteration(&client, url, &reconfigure_request, cancel), + |e| matches!(e, NotifyError::Fatal(_)), + 3, + 10, + "Send compute notification", + backoff::Cancel::new(cancel.clone(), || NotifyError::ShuttingDown), + ) + .await + } + + /// Call this to notify the compute (postgres) tier of new pageservers to use + /// for a tenant. notify() is called by each shard individually, and this function + /// will decide whether an update to the tenant is sent. An update is sent on the + /// condition that: + /// - We know a pageserver for every shard. + /// - All the shards have the same shard_count (i.e. we are not mid-split) + /// + /// Cancellation token enables callers to drop out, e.g. if calling from a Reconciler + /// that is cancelled. + /// + /// This function is fallible, including in the case that the control plane is transiently + /// unavailable. A limited number of retries are done internally to efficiently hide short unavailability + /// periods, but we don't retry forever. The **caller** is responsible for handling failures and + /// ensuring that they eventually call again to ensure that the compute is eventually notified of + /// the proper pageserver nodes for a tenant. + #[tracing::instrument(skip_all, fields(tenant_shard_id, node_id))] pub(super) async fn notify( &self, tenant_shard_id: TenantShardId, node_id: NodeId, - ) -> anyhow::Result<()> { - tracing::info!("ComputeHook::notify: {}->{}", tenant_shard_id, node_id); + cancel: &CancellationToken, + ) -> Result<(), NotifyError> { let mut locked = self.state.lock().await; let entry = locked .entry(tenant_shard_id.tenant_id) @@ -111,6 +292,25 @@ impl ComputeHook { entry.shards.push((shard_index, node_id)); } - entry.maybe_reconfigure(tenant_shard_id.tenant_id).await + let reconfigure_request = entry.maybe_reconfigure(tenant_shard_id.tenant_id).await; + let Some(reconfigure_request) = reconfigure_request else { + // The tenant doesn't yet have pageservers for all its shards: we won't notify anything + // until it does. + tracing::debug!("Tenant isn't yet ready to emit a notification",); + return Ok(()); + }; + + if let Some(notify_url) = &self.config.compute_hook_url { + self.do_notify(notify_url, reconfigure_request, cancel) + .await + } else { + self.do_notify_local(reconfigure_request) + .await + .map_err(|e| { + // This path is for testing only, so munge the error into our prod-style error type. 
+ tracing::error!("Local notification hook failed: {e}"); + NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR) + }) + } } } diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs index ed65437ba2..eda9c7aad6 100644 --- a/control_plane/attachment_service/src/main.rs +++ b/control_plane/attachment_service/src/main.rs @@ -35,9 +35,18 @@ struct Cli { public_key: Option, /// Token for authenticating this service with the pageservers it controls - #[arg(short, long)] + #[arg(long)] jwt_token: Option, + /// Token for authenticating this service with the control plane, when calling + /// the compute notification endpoint + #[arg(long)] + control_plane_jwt_token: Option, + + /// URL to control plane compute notification endpoint + #[arg(long)] + compute_hook_url: Option, + /// Path to the .json file to store state (will be created if it doesn't exist) #[arg(short, long)] path: Option, @@ -53,11 +62,15 @@ struct Secrets { database_url: String, public_key: Option, jwt_token: Option, + control_plane_jwt_token: Option, } impl Secrets { const DATABASE_URL_SECRET: &'static str = "rds-neon-storage-controller-url"; - const JWT_TOKEN_SECRET: &'static str = "neon-storage-controller-pageserver-jwt-token"; + const PAGESERVER_JWT_TOKEN_SECRET: &'static str = + "neon-storage-controller-pageserver-jwt-token"; + const CONTROL_PLANE_JWT_TOKEN_SECRET: &'static str = + "neon-storage-controller-control-plane-jwt-token"; const PUBLIC_KEY_SECRET: &'static str = "neon-storage-controller-public-key"; async fn load(args: &Cli) -> anyhow::Result { @@ -95,7 +108,7 @@ impl Secrets { let jwt_token = asm .get_secret_value() - .secret_id(Self::JWT_TOKEN_SECRET) + .secret_id(Self::PAGESERVER_JWT_TOKEN_SECRET) .send() .await? .secret_string() @@ -104,6 +117,17 @@ impl Secrets { tracing::warn!("No pageserver JWT token set: this will only work if authentication is disabled on the pageserver"); } + let control_plane_jwt_token = asm + .get_secret_value() + .secret_id(Self::CONTROL_PLANE_JWT_TOKEN_SECRET) + .send() + .await? 
+ .secret_string() + .map(str::to_string); + if jwt_token.is_none() { + tracing::warn!("No control plane JWT token set: this will only work if authentication is disabled on the pageserver"); + } + let public_key = asm .get_secret_value() .secret_id(Self::PUBLIC_KEY_SECRET) @@ -125,6 +149,7 @@ impl Secrets { database_url, public_key, jwt_token, + control_plane_jwt_token, }) } @@ -137,6 +162,7 @@ impl Secrets { database_url: args.database_url.clone(), public_key, jwt_token: args.jwt_token.clone(), + control_plane_jwt_token: args.control_plane_jwt_token.clone(), }) } } @@ -165,6 +191,8 @@ async fn main() -> anyhow::Result<()> { let config = Config { jwt_token: secrets.jwt_token, + control_plane_jwt_token: secrets.control_plane_jwt_token, + compute_hook_url: args.compute_hook_url, }; let json_path = args.path; diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs index d7f4c0406a..776e1f9d1e 100644 --- a/control_plane/attachment_service/src/reconciler.rs +++ b/control_plane/attachment_service/src/reconciler.rs @@ -14,7 +14,7 @@ use utils::generation::Generation; use utils::id::{NodeId, TimelineId}; use utils::lsn::Lsn; -use crate::compute_hook::ComputeHook; +use crate::compute_hook::{ComputeHook, NotifyError}; use crate::node::Node; use crate::tenant_state::{IntentState, ObservedState, ObservedStateLocation}; @@ -37,9 +37,15 @@ pub(super) struct Reconciler { pub(crate) pageservers: Arc>, /// A hook to notify the running postgres instances when we change the location - /// of a tenant + /// of a tenant. Use this via [`Self::compute_notify`] to update our failure flag + /// and guarantee eventual retries. pub(crate) compute_hook: Arc, + /// To avoid stalling if the cloud control plane is unavailable, we may proceed + /// past failures in [`ComputeHook::notify`], but we _must_ remember that we failed + /// so that we can set [`crate::tenant_state::TenantState::pending_compute_notification`] to ensure a later retry. + pub(crate) compute_notify_failure: bool, + /// A means to abort background reconciliation: it is essential to /// call this when something changes in the original TenantState that /// will make this reconciliation impossible or unnecessary, for @@ -52,7 +58,9 @@ pub(super) struct Reconciler { } #[derive(thiserror::Error, Debug)] -pub enum ReconcileError { +pub(crate) enum ReconcileError { + #[error(transparent)] + Notify(#[from] NotifyError), #[error(transparent)] Other(#[from] anyhow::Error), } @@ -317,9 +325,19 @@ impl Reconciler { } tracing::info!("🔁 Notifying compute to use pageserver {}", dest_ps_id); - self.compute_hook - .notify(self.tenant_shard_id, dest_ps_id) - .await?; + + // During a live migration it is unhelpful to proceed if we couldn't notify compute: if we detach + // the origin without notifying compute, we will render the tenant unavailable. + while let Err(e) = self.compute_notify().await { + match e { + NotifyError::Fatal(_) => return Err(anyhow::anyhow!(e)), + _ => { + tracing::warn!( + "Live migration blocked by compute notification error, retrying: {e}" + ); + } + } + } // Downgrade the origin to secondary. If the tenant's policy is PlacementPolicy::Single, then // this location will be deleted in the general case reconciliation that runs after this. 
@@ -400,15 +418,7 @@ impl Reconciler { wanted_conf.generation = self.generation.into(); tracing::info!("Observed configuration requires update."); self.location_config(node_id, wanted_conf, None).await?; - if let Err(e) = self - .compute_hook - .notify(self.tenant_shard_id, node_id) - .await - { - tracing::warn!( - "Failed to notify compute of newly attached pageserver {node_id}: {e}" - ); - } + self.compute_notify().await?; } } } @@ -461,6 +471,29 @@ impl Reconciler { Ok(()) } + + pub(crate) async fn compute_notify(&mut self) -> Result<(), NotifyError> { + // Whenever a particular Reconciler emits a notification, it is always notifying for the intended + // destination. + if let Some(node_id) = self.intent.attached { + let result = self + .compute_hook + .notify(self.tenant_shard_id, node_id, &self.cancel) + .await; + if let Err(e) = &result { + // It is up to the caller whether they want to drop out on this error, but they don't have to: + // in general we should avoid letting unavailability of the cloud control plane stop us from + // making progress. + tracing::warn!("Failed to notify compute of attached pageserver {node_id}: {e}"); + // Set this flag so that in our ReconcileResult we will set the flag on the shard that it + // needs to retry at some point. + self.compute_notify_failure = true; + } + result + } else { + Ok(()) + } + } } pub(crate) fn attached_location_conf( diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index 8c6a348515..6f0e3ebb74 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -12,6 +12,7 @@ use control_plane::attachment_service::{ TenantShardMigrateRequest, TenantShardMigrateResponse, }; use diesel::result::DatabaseErrorKind; +use futures::StreamExt; use hyper::StatusCode; use pageserver_api::{ control_api::{ @@ -27,6 +28,7 @@ use pageserver_api::{ shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId}, }; use pageserver_client::mgmt_api; +use tokio_util::sync::CancellationToken; use utils::{ completion::Barrier, generation::Generation, @@ -36,7 +38,7 @@ use utils::{ }; use crate::{ - compute_hook::ComputeHook, + compute_hook::{self, ComputeHook}, node::Node, persistence::{DatabaseError, NodePersistence, Persistence, TenantShardPersistence}, scheduler::Scheduler, @@ -66,6 +68,7 @@ struct ServiceState { impl ServiceState { fn new( + config: Config, result_tx: tokio::sync::mpsc::UnboundedSender, nodes: HashMap, tenants: BTreeMap, @@ -73,7 +76,7 @@ impl ServiceState { Self { tenants, nodes: Arc::new(nodes), - compute_hook: Arc::new(ComputeHook::new()), + compute_hook: Arc::new(ComputeHook::new(config)), result_tx, } } @@ -82,8 +85,17 @@ impl ServiceState { #[derive(Clone)] pub struct Config { // All pageservers managed by one instance of this service must have - // the same public key. + // the same public key. This JWT token will be used to authenticate + // this service to the pageservers it manages. pub jwt_token: Option, + + // This JWT token will be used to authenticate this service to the control plane. + pub control_plane_jwt_token: Option, + + /// Where the compute hook should send notifications of pageserver attachment locations + /// (this URL points to the control plane in prod). If this is None, the compute hook will + /// assume it is running in a test environment and try to update neon_local. 
+ pub compute_hook_url: Option, } impl From for ApiError { @@ -163,6 +175,8 @@ impl Service { let mut cleanup = Vec::new(); + let mut compute_notifications = Vec::new(); + // Populate intent and observed states for all tenants, based on reported state on pageservers let shard_count = { let mut locked = self.inner.write().unwrap(); @@ -187,6 +201,13 @@ impl Service { // not enough pageservers are available. The tenant may well still be available // to clients. tracing::error!("Failed to schedule tenant {tenant_shard_id} at startup: {e}"); + } else { + // If we're both intending and observed to be attached at a particular node, we will + // emit a compute notification for this. In the case where our observed state does not + // yet match our intent, we will eventually reconcile, and that will emit a compute notification. + if let Some(attached_at) = tenant_state.stably_attached() { + compute_notifications.push((*tenant_shard_id, attached_at)); + } } } @@ -235,10 +256,57 @@ impl Service { } } + // Emit compute hook notifications for all tenants which are already stably attached. Other tenants + // will emit compute hook notifications when they reconcile. + // + // Ordering: we must complete these notification attempts before doing any other reconciliation for the + // tenants named here, because otherwise our calls to notify() might race with more recent values + // generated by reconciliation. + + // Compute notify is fallible. If it fails here, do not delay overall startup: set the + // flag on these shards that they have a pending notification. + let compute_hook = self.inner.read().unwrap().compute_hook.clone(); + + // Construct an async stream of futures to invoke the compute notify function: we do this + // in order to subsequently use .buffered() on the stream to execute with bounded parallelism. + let stream = futures::stream::iter(compute_notifications.into_iter()) + .map(|(tenant_shard_id, node_id)| { + let compute_hook = compute_hook.clone(); + async move { + // TODO: give Service a cancellation token for clean shutdown + let cancel = CancellationToken::new(); + if let Err(e) = compute_hook.notify(tenant_shard_id, node_id, &cancel).await { + tracing::error!( + tenant_shard_id=%tenant_shard_id, + node_id=%node_id, + "Failed to notify compute on startup for shard: {e}" + ); + Some(tenant_shard_id) + } else { + None + } + } + }) + .buffered(compute_hook::API_CONCURRENCY); + let notify_results = stream.collect::>().await; + + // Update tenant state for any that failed to do their initial compute notify, so that they'll retry later. + { + let mut locked = self.inner.write().unwrap(); + for tenant_shard_id in notify_results.into_iter().flatten() { + if let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) { + shard.pending_compute_notification = true; + } + } + } + // Finally, now that the service is up and running, launch reconcile operations for any tenants // which require it: under normal circumstances this should only include tenants that were in some - // transient state before we restarted. + // transient state before we restarted, or any tenants whose compute hooks failed above. let reconcile_tasks = self.reconcile_all(); + // We will not wait for these reconciliation tasks to run here: we're now done with startup and + // normal operations may proceed. 
+ tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)"); } @@ -295,6 +363,7 @@ impl Service { waiter: Arc::new(SeqWait::new(Sequence::initial())), error_waiter: Arc::new(SeqWait::new(Sequence::initial())), last_error: Arc::default(), + pending_compute_notification: false, }; tenants.insert(tenant_shard_id, new_tenant); @@ -304,7 +373,10 @@ impl Service { let this = Arc::new(Self { inner: Arc::new(std::sync::RwLock::new(ServiceState::new( - result_tx, nodes, tenants, + config.clone(), + result_tx, + nodes, + tenants, ))), config, persistence, @@ -330,6 +402,10 @@ impl Service { // needed, but it is used to handle out-of-band updates via. e.g. test hook. tenant.generation = std::cmp::max(tenant.generation, result.generation); + // If the reconciler signals that it failed to notify compute, set this state on + // the shard so that a future [`TenantState::maybe_reconcile`] will try again. + tenant.pending_compute_notification = result.pending_compute_notification; + match result.result { Ok(()) => { for (node_id, loc) in &result.observed.locations { diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs index 5290197d84..a358e1ff7b 100644 --- a/control_plane/attachment_service/src/tenant_state.rs +++ b/control_plane/attachment_service/src/tenant_state.rs @@ -71,6 +71,12 @@ pub(crate) struct TenantState { /// TODO: generalize to an array of recent events /// TOOD: use a ArcSwap instead of mutex for faster reads? pub(crate) last_error: std::sync::Arc>, + + /// If we have a pending compute notification that for some reason we weren't able to send, + /// set this to true. If this is set, calls to [`Self::maybe_reconcile`] will run a task to retry + /// sending it. This is the mechanism by which compute notifications are included in the scope + /// of state that we publish externally in an eventually consistent way. + pub(crate) pending_compute_notification: bool, } #[derive(Default, Clone, Debug)] @@ -164,6 +170,9 @@ pub(crate) struct ReconcileResult { pub(crate) tenant_shard_id: TenantShardId, pub(crate) generation: Generation, pub(crate) observed: ObservedState, + + /// Set [`TenantState::pending_compute_notification`] from this flag + pub(crate) pending_compute_notification: bool, } impl IntentState { @@ -226,6 +235,7 @@ impl TenantState { waiter: Arc::new(SeqWait::new(Sequence(0))), error_waiter: Arc::new(SeqWait::new(Sequence(0))), last_error: Arc::default(), + pending_compute_notification: false, } } @@ -333,6 +343,38 @@ impl TenantState { Ok(()) } + /// Query whether the tenant's observed state for attached node matches its intent state, and if so, + /// yield the node ID. This is appropriate for emitting compute hook notifications: we are checking that + /// the node in question is not only where we intend to attach, but that the tenant is indeed already attached there. + /// + /// Reconciliation may still be needed for other aspects of state such as secondaries (see [`Self::dirty`]): this + /// funciton should not be used to decide whether to reconcile. 
+ pub(crate) fn stably_attached(&self) -> Option { + if let Some(attach_intent) = self.intent.attached { + match self.observed.locations.get(&attach_intent) { + Some(loc) => match &loc.conf { + Some(conf) => match conf.mode { + LocationConfigMode::AttachedMulti + | LocationConfigMode::AttachedSingle + | LocationConfigMode::AttachedStale => { + // Our intent and observed state agree that this node is in an attached state. + Some(attach_intent) + } + // Our observed config is not an attached state + _ => None, + }, + // Our observed state is None, i.e. in flux + None => None, + }, + // We have no observed state for this node + None => None, + } + } else { + // Our intent is not to attach + None + } + } + fn dirty(&self) -> bool { if let Some(node_id) = self.intent.attached { let wanted_conf = attached_location_conf(self.generation, &self.shard, &self.config); @@ -354,6 +396,12 @@ impl TenantState { } } + // Even if there is no pageserver work to be done, if we have a pending notification to computes, + // wake up a reconciler to send it. + if self.pending_compute_notification { + return true; + } + false } @@ -415,11 +463,13 @@ impl TenantState { service_config: service_config.clone(), cancel: cancel.clone(), persistence: persistence.clone(), + compute_notify_failure: false, }; let reconcile_seq = self.sequence; tracing::info!("Spawning Reconciler for sequence {}", self.sequence); + let must_notify = self.pending_compute_notification; let join_handle = tokio::task::spawn(async move { // Wait for any previous reconcile task to complete before we start if let Some(old_handle) = old_handle { @@ -438,7 +488,16 @@ impl TenantState { return; } + // Attempt to make observed state match intent state let result = reconciler.reconcile().await; + + // If we know we had a pending compute notification from some previous action, send a notification irrespective + // of whether the above reconcile() did any work + if result.is_ok() && must_notify { + // If this fails we will send the need to retry in [`ReconcileResult::pending_compute_notification`] + reconciler.compute_notify().await.ok(); + } + result_tx .send(ReconcileResult { sequence: reconcile_seq, @@ -446,6 +505,7 @@ impl TenantState { tenant_shard_id: reconciler.tenant_shard_id, generation: reconciler.generation, observed: reconciler.observed, + pending_compute_notification: reconciler.compute_notify_failure, }) .ok(); }); diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/attachment_service.rs index 7816d0953b..140e5c4e34 100644 --- a/control_plane/src/attachment_service.rs +++ b/control_plane/src/attachment_service.rs @@ -457,6 +457,12 @@ impl AttachmentService { args.push(format!("--public-key={public_key_path}")); } + if let Some(control_plane_compute_hook_api) = &self.env.control_plane_compute_hook_api { + args.push(format!( + "--compute-hook-url={control_plane_compute_hook_api}" + )); + } + background_process::start_process( COMMAND, &self.env.base_data_dir, diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index d5abda729f..e56007dd20 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -795,7 +795,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re &endpoint.timeline_id.to_string(), branch_name, lsn_str.as_str(), - endpoint.status(), + &format!("{}", endpoint.status()), ]); } diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index dcad22b992..b19a6a1a18 100644 --- 
a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -184,7 +184,7 @@ impl ComputeControlPlane { v.tenant_id == tenant_id && v.timeline_id == timeline_id && v.mode == mode - && v.status() != "stopped" + && v.status() != EndpointStatus::Stopped }); if let Some((key, _)) = duplicates.next() { @@ -223,6 +223,26 @@ pub struct Endpoint { features: Vec, } +#[derive(PartialEq, Eq)] +pub enum EndpointStatus { + Running, + Stopped, + Crashed, + RunningNoPidfile, +} + +impl std::fmt::Display for EndpointStatus { + fn fmt(&self, writer: &mut std::fmt::Formatter) -> std::fmt::Result { + let s = match self { + Self::Running => "running", + Self::Stopped => "stopped", + Self::Crashed => "crashed", + Self::RunningNoPidfile => "running, no pidfile", + }; + write!(writer, "{}", s) + } +} + impl Endpoint { fn from_dir_entry(entry: std::fs::DirEntry, env: &LocalEnv) -> Result { if !entry.file_type()?.is_dir() { @@ -380,16 +400,16 @@ impl Endpoint { self.endpoint_path().join("pgdata") } - pub fn status(&self) -> &str { + pub fn status(&self) -> EndpointStatus { let timeout = Duration::from_millis(300); let has_pidfile = self.pgdata().join("postmaster.pid").exists(); let can_connect = TcpStream::connect_timeout(&self.pg_address, timeout).is_ok(); match (has_pidfile, can_connect) { - (true, true) => "running", - (false, false) => "stopped", - (true, false) => "crashed", - (false, true) => "running, no pidfile", + (true, true) => EndpointStatus::Running, + (false, false) => EndpointStatus::Stopped, + (true, false) => EndpointStatus::Crashed, + (false, true) => EndpointStatus::RunningNoPidfile, } } @@ -481,7 +501,7 @@ impl Endpoint { remote_ext_config: Option<&String>, shard_stripe_size: usize, ) -> Result<()> { - if self.status() == "running" { + if self.status() == EndpointStatus::Running { anyhow::bail!("The endpoint is already running"); } diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index aefef47da7..786ea6d098 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -72,11 +72,16 @@ pub struct LocalEnv { #[serde(default)] pub safekeepers: Vec, - // Control plane location: if None, we will not run attachment_service. If set, this will + // Control plane upcall API for pageserver: if None, we will not run attachment_service. If set, this will // be propagated into each pageserver's configuration. #[serde(default)] pub control_plane_api: Option, + // Control plane upcall API for attachment service. If set, this will be propagated into the + // attachment service's configuration. + #[serde(default)] + pub control_plane_compute_hook_api: Option, + /// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user. 
#[serde(default)] // A `HashMap>` would be more appropriate here, diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index e2a2291dbc..1e15ebe5a0 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -482,6 +482,7 @@ class NeonEnvBuilder: self.overlay_mounts_created_by_us: List[Tuple[str, Path]] = [] self.config_init_force: Optional[str] = None self.top_output_dir = top_output_dir + self.control_plane_compute_hook_api: Optional[str] = None self.pageserver_virtual_file_io_engine: Optional[str] = pageserver_virtual_file_io_engine @@ -1007,6 +1008,9 @@ class NeonEnv: # The base URL of the attachment service self.attachment_service_api: str = f"http://127.0.0.1:{self.attachment_service_port}" + # For testing this with a fake HTTP server, enable passing through a URL from config + self.control_plane_compute_hook_api = config.control_plane_compute_hook_api + self.attachment_service: NeonAttachmentService = NeonAttachmentService( self, config.auth_enabled ) @@ -1026,6 +1030,9 @@ class NeonEnv: if self.control_plane_api is not None: cfg["control_plane_api"] = self.control_plane_api + if self.control_plane_compute_hook_api is not None: + cfg["control_plane_compute_hook_api"] = self.control_plane_compute_hook_api + # Create config for pageserver http_auth_type = "NeonJWT" if config.auth_enabled else "Trust" pg_auth_type = "NeonJWT" if config.auth_enabled else "Trust" @@ -1904,7 +1911,7 @@ class Pagectl(AbstractNeonCli): class NeonAttachmentService: - def __init__(self, env: NeonEnv, auth_enabled): + def __init__(self, env: NeonEnv, auth_enabled: bool): self.env = env self.running = False self.auth_enabled = auth_enabled diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py index 3b2c9334db..346df708de 100644 --- a/test_runner/regress/test_sharding_service.py +++ b/test_runner/regress/test_sharding_service.py @@ -1,14 +1,24 @@ import time from collections import defaultdict -from fixtures.neon_fixtures import ( - NeonEnvBuilder, -) +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import tenant_delete_wait_completed, timeline_delete_wait_completed from fixtures.pg_version import PgVersion from fixtures.types import TenantId, TimelineId from fixtures.utils import wait_until +from pytest_httpserver import HTTPServer +from werkzeug.wrappers.request import Request +from werkzeug.wrappers.response import Response + + +def get_node_shard_counts(env: NeonEnv, tenant_ids): + counts: defaultdict[str, int] = defaultdict(int) + for tid in tenant_ids: + for shard in env.attachment_service.locate(tid): + counts[shard["node_id"]] += 1 + return counts def test_sharding_service_smoke( @@ -54,14 +64,7 @@ def test_sharding_service_smoke( for tid in tenant_ids: env.neon_cli.create_tenant(tid, shard_count=shards_per_tenant) - def get_node_shard_counts(): - counts: defaultdict[str, int] = defaultdict(int) - for tid in tenant_ids: - for shard in env.attachment_service.locate(tid): - counts[shard["node_id"]] += 1 - return counts - - for node_id, count in get_node_shard_counts().items(): + for node_id, count in get_node_shard_counts(env, tenant_ids).items(): # we used a multiple of pagservers for the total shard count, # so expect equal number on all pageservers assert count == tenant_shard_count / len( @@ -89,7 +92,7 @@ def test_sharding_service_smoke( 
env.attachment_service.node_configure(env.pageservers[0].id, {"availability": "Offline"}) def node_evacuated(node_id: int): - counts = get_node_shard_counts() + counts = get_node_shard_counts(env, tenant_ids) assert counts[node_id] == 0 wait_until(10, 1, lambda: node_evacuated(env.pageservers[0].id)) @@ -98,7 +101,7 @@ def test_sharding_service_smoke( # immediately env.attachment_service.node_configure(env.pageservers[0].id, {"availability": "Active"}) time.sleep(1) - assert get_node_shard_counts()[env.pageservers[0].id] == 0 + assert get_node_shard_counts(env, tenant_ids)[env.pageservers[0].id] == 0 # Delete all the tenants for tid in tenant_ids: @@ -113,7 +116,7 @@ def test_sharding_service_smoke( for tid in tenant_ids: env.neon_cli.create_tenant(tid, shard_count=shards_per_tenant) - counts = get_node_shard_counts() + counts = get_node_shard_counts(env, tenant_ids) # Nothing should have been scheduled on the node in Draining assert counts[env.pageservers[1].id] == 0 assert counts[env.pageservers[0].id] == tenant_shard_count // 2 @@ -270,3 +273,73 @@ def test_sharding_service_onboarding( # The onboarded tenant should surviev a restart of pageserver dest_ps.stop() dest_ps.start() + + +def test_sharding_service_compute_hook( + httpserver: HTTPServer, + neon_env_builder: NeonEnvBuilder, + httpserver_listen_address, +): + """ + Test that the sharding service calls out to the configured HTTP endpoint on attachment changes + """ + + # We will run two pageserver to migrate and check that the attachment service sends notifications + # when migrating. + neon_env_builder.num_pageservers = 2 + (host, port) = httpserver_listen_address + neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify" + + # Set up fake HTTP notify endpoint + notifications = [] + + def handler(request: Request): + log.info(f"Notify request: {request}") + notifications.append(request.json) + return Response(status=200) + + httpserver.expect_request("/notify", method="POST").respond_with_handler(handler) + + # Start running + env = neon_env_builder.init_start() + + # We will to an unclean migration, which will result in deletion queue warnings + env.pageservers[0].allowed_errors.append(".*Dropped remote consistent LSN updates for tenant.*") + + # Initial notification from tenant creation + assert len(notifications) == 1 + expect = { + "tenant_id": str(env.initial_tenant), + "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}], + } + + env.attachment_service.node_configure(env.pageservers[0].id, {"availability": "Offline"}) + + def node_evacuated(node_id: int): + counts = get_node_shard_counts(env, [env.initial_tenant]) + assert counts[node_id] == 0 + + wait_until(10, 1, lambda: node_evacuated(env.pageservers[0].id)) + + # Additional notification from migration + log.info(f"notifications: {notifications}") + expect = { + "tenant_id": str(env.initial_tenant), + "shards": [{"node_id": int(env.pageservers[1].id), "shard_number": 0}], + } + + def received_migration_notification(): + assert len(notifications) == 2 + assert notifications[1] == expect + + wait_until(20, 0.25, received_migration_notification) + + # When we restart, we should re-emit notifications for all tenants + env.attachment_service.stop() + env.attachment_service.start() + + def received_restart_notification(): + assert len(notifications) == 3 + assert notifications[1] == expect + + wait_until(10, 1, received_restart_notification) From c9876b099397c7b990a7d359dcc0fa3b9dade926 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas 
Date: Fri, 2 Feb 2024 21:49:11 +0200 Subject: [PATCH 0069/1571] Fix double-free bug in walredo process. (#6534) At the end of ApplyRecord(), we called pfree on the decoded record, if it was "oversized". However, we had alread linked it to the "decode queue" list in XLogReaderState. If we later called XLogBeginRead(), it called ResetDecoder and tried to free the same record again. The conditions to hit this are: - a large WAL record (larger than aboue 64 kB I think, per DEFAULT_DECODE_BUFFER_SIZE), and - another WAL record processed by the same WAL redo process after the large one. I think the reason we haven't seen this earlier is that you don't get WAL records that large that are sent to the WAL redo process, except when logical replication is enabled. Logical replication adds data to the WAL records, making them larger. To fix, allocate the buffer ourselves, and don't link it to the decode queue. Alternatively, we could perhaps have just removed the pfree(), but frankly I'm a bit scared about the whole queue thing. --- pgxn/neon_walredo/walredoproc.c | 48 +++++++--------- .../regress/test_logical_replication.py | 57 +++++++++++++++++++ 2 files changed, 78 insertions(+), 27 deletions(-) diff --git a/pgxn/neon_walredo/walredoproc.c b/pgxn/neon_walredo/walredoproc.c index 7ca4fe93df..6ca0b2a274 100644 --- a/pgxn/neon_walredo/walredoproc.c +++ b/pgxn/neon_walredo/walredoproc.c @@ -804,6 +804,9 @@ ApplyRecord(StringInfo input_message) ErrorContextCallback errcallback; #if PG_VERSION_NUM >= 150000 DecodedXLogRecord *decoded; +#define STATIC_DECODEBUF_SIZE (64 * 1024) + static char *static_decodebuf = NULL; + size_t required_space; #endif /* @@ -833,7 +836,19 @@ ApplyRecord(StringInfo input_message) XLogBeginRead(reader_state, lsn); #if PG_VERSION_NUM >= 150000 - decoded = (DecodedXLogRecord *) XLogReadRecordAlloc(reader_state, record->xl_tot_len, true); + /* + * For reasonably small records, reuse a fixed size buffer to reduce + * palloc overhead. + */ + required_space = DecodeXLogRecordRequiredSpace(record->xl_tot_len); + if (required_space <= STATIC_DECODEBUF_SIZE) + { + if (static_decodebuf == NULL) + static_decodebuf = MemoryContextAlloc(TopMemoryContext, STATIC_DECODEBUF_SIZE); + decoded = (DecodedXLogRecord *) static_decodebuf; + } + else + decoded = palloc(required_space); if (!DecodeXLogRecord(reader_state, decoded, record, lsn, &errormsg)) elog(ERROR, "failed to decode WAL record: %s", errormsg); @@ -842,37 +857,15 @@ ApplyRecord(StringInfo input_message) /* Record the location of the next record. */ decoded->next_lsn = reader_state->NextRecPtr; - /* - * If it's in the decode buffer, mark the decode buffer space as - * occupied. - */ - if (!decoded->oversized) - { - /* The new decode buffer head must be MAXALIGNed. */ - Assert(decoded->size == MAXALIGN(decoded->size)); - if ((char *) decoded == reader_state->decode_buffer) - reader_state->decode_buffer_tail = reader_state->decode_buffer + decoded->size; - else - reader_state->decode_buffer_tail += decoded->size; - } - - /* Insert it into the queue of decoded records. 
*/ - Assert(reader_state->decode_queue_tail != decoded); - if (reader_state->decode_queue_tail) - reader_state->decode_queue_tail->next = decoded; - reader_state->decode_queue_tail = decoded; - if (!reader_state->decode_queue_head) - reader_state->decode_queue_head = decoded; - /* * Update the pointers to the beginning and one-past-the-end of this * record, again for the benefit of historical code that expected the * decoder to track this rather than accessing these fields of the record * itself. */ - reader_state->record = reader_state->decode_queue_head; - reader_state->ReadRecPtr = reader_state->record->lsn; - reader_state->EndRecPtr = reader_state->record->next_lsn; + reader_state->record = decoded; + reader_state->ReadRecPtr = decoded->lsn; + reader_state->EndRecPtr = decoded->next_lsn; } #else /* @@ -912,8 +905,9 @@ ApplyRecord(StringInfo input_message) elog(TRACE, "applied WAL record with LSN %X/%X", (uint32) (lsn >> 32), (uint32) lsn); + #if PG_VERSION_NUM >= 150000 - if (decoded && decoded->oversized) + if ((char *) decoded != static_decodebuf) pfree(decoded); #endif } diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index 51e358e60d..059ddf79ec 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -1,4 +1,6 @@ import time +from random import choice +from string import ascii_lowercase import pytest from fixtures.log_helper import log @@ -11,6 +13,10 @@ from fixtures.types import Lsn from fixtures.utils import query_scalar +def random_string(n: int): + return "".join([choice(ascii_lowercase) for _ in range(n)]) + + def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env @@ -238,6 +244,57 @@ def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg): ) == endpoint.safe_psql("select sum(somedata) from replication_example") +# Test that WAL redo works for fairly large records. +# +# See https://github.com/neondatabase/neon/pull/6534. That wasn't a +# logical replication bug as such, but without logical replication, +# records passed ot the WAL redo process are never large enough to hit +# the bug. +def test_large_records(neon_simple_env: NeonEnv, vanilla_pg): + env = neon_simple_env + + env.neon_cli.create_branch("init") + endpoint = env.endpoints.create_start("init") + + cur = endpoint.connect().cursor() + cur.execute("CREATE TABLE reptbl(id int, largeval text);") + cur.execute("alter table reptbl replica identity full") + cur.execute("create publication pub1 for table reptbl") + + # now start subscriber + vanilla_pg.start() + vanilla_pg.safe_psql("CREATE TABLE reptbl(id int, largeval text);") + + log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}") + connstr = endpoint.connstr().replace("'", "''") + vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") + + # Test simple insert, update, delete. 
But with very large values + value = random_string(10_000_000) + cur.execute(f"INSERT INTO reptbl VALUES (1, '{value}')") + logical_replication_sync(vanilla_pg, endpoint) + assert vanilla_pg.safe_psql("select id, largeval from reptbl") == [(1, value)] + + # Test delete, and reinsert another value + cur.execute("DELETE FROM reptbl WHERE id = 1") + cur.execute(f"INSERT INTO reptbl VALUES (2, '{value}')") + logical_replication_sync(vanilla_pg, endpoint) + assert vanilla_pg.safe_psql("select id, largeval from reptbl") == [(2, value)] + + value = random_string(10_000_000) + cur.execute(f"UPDATE reptbl SET largeval='{value}'") + logical_replication_sync(vanilla_pg, endpoint) + assert vanilla_pg.safe_psql("select id, largeval from reptbl") == [(2, value)] + + endpoint.stop() + endpoint.start() + cur = endpoint.connect().cursor() + value = random_string(10_000_000) + cur.execute(f"UPDATE reptbl SET largeval='{value}'") + logical_replication_sync(vanilla_pg, endpoint) + assert vanilla_pg.safe_psql("select id, largeval from reptbl") == [(2, value)] + + # # Check that slots are not inherited in brnach # From 2fd8e24c8ff300dc9e640c8765a0311307871e7d Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Fri, 2 Feb 2024 12:32:40 -0900 Subject: [PATCH 0070/1571] Switch sleeps to wait_until (#6575) ## Problem I didn't know about `wait_until` and was relying on `sleep` to wait for stuff. This caused some tests to be flaky. https://github.com/neondatabase/neon/issues/6561 ## Summary of changes Switch to `wait_until`, this should make it tests less flaky --- test_runner/fixtures/neon_fixtures.py | 14 ++++++++++++++ test_runner/regress/test_migrations.py | 12 ++++++++---- test_runner/regress/test_neon_superuser.py | 19 ++++++++++--------- 3 files changed, 32 insertions(+), 13 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 1e15ebe5a0..5ce2fca820 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3130,6 +3130,20 @@ class Endpoint(PgProtocol): log.info(json.dumps(dict(data_dict, **kwargs))) json.dump(dict(data_dict, **kwargs), file, indent=4) + # Please note: if you didn't respec this endpoint to have the `migrations` + # feature, this function will probably fail because neon_migration.migration_id + # won't exist. This is temporary - soon we'll get rid of the feature flag and + # migrations will be enabled for everyone. 
+ def wait_for_migrations(self): + with self.cursor() as cur: + + def check_migrations_done(): + cur.execute("SELECT id FROM neon_migration.migration_id") + migration_id = cur.fetchall()[0][0] + assert migration_id != 0 + + wait_until(20, 0.5, check_migrations_done) + # Mock the extension part of spec passed from control plane for local testing # endpooint.rs adds content of this file as a part of the spec.json def create_remote_extension_spec(self, spec: dict[str, Any]): diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py index dee22f9b48..30dd54a8c1 100644 --- a/test_runner/regress/test_migrations.py +++ b/test_runner/regress/test_migrations.py @@ -13,12 +13,14 @@ def test_migrations(neon_simple_env: NeonEnv): endpoint.respec(skip_pg_catalog_updates=False, features=["migrations"]) endpoint.start() - time.sleep(1) # Sleep to let migrations run + endpoint.wait_for_migrations() + + num_migrations = 3 with endpoint.cursor() as cur: cur.execute("SELECT id FROM neon_migration.migration_id") migration_id = cur.fetchall() - assert migration_id[0][0] == 3 + assert migration_id[0][0] == num_migrations with open(log_path, "r") as log_file: logs = log_file.read() @@ -26,11 +28,13 @@ def test_migrations(neon_simple_env: NeonEnv): endpoint.stop() endpoint.start() - time.sleep(1) # Sleep to let migrations run + # We don't have a good way of knowing that the migrations code path finished executing + # in compute_ctl in the case that no migrations are being run + time.sleep(1) with endpoint.cursor() as cur: cur.execute("SELECT id FROM neon_migration.migration_id") migration_id = cur.fetchall() - assert migration_id[0][0] == 3 + assert migration_id[0][0] == num_migrations with open(log_path, "r") as log_file: logs = log_file.read() diff --git a/test_runner/regress/test_neon_superuser.py b/test_runner/regress/test_neon_superuser.py index 8b9eb1d9c4..eff2cadabf 100644 --- a/test_runner/regress/test_neon_superuser.py +++ b/test_runner/regress/test_neon_superuser.py @@ -1,8 +1,7 @@ -import time - from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv from fixtures.pg_version import PgVersion +from fixtures.utils import wait_until def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion): @@ -19,7 +18,8 @@ def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion): sub.respec(skip_pg_catalog_updates=False, features=["migrations"]) sub.start() - time.sleep(1) # Sleep to let migrations run + pub.wait_for_migrations() + sub.wait_for_migrations() with pub.cursor() as cur: cur.execute( @@ -68,10 +68,11 @@ def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion): with pub.cursor(dbname="neondb", user="mr_whiskers", password="cat") as pcur: pcur.execute("INSERT INTO t VALUES (30), (40)") - time.sleep(1) # Give the change time to propagate + def check_that_changes_propagated(): + cur.execute("SELECT * FROM t") + res = cur.fetchall() + log.info(res) + assert len(res) == 4 + assert [r[0] for r in res] == [10, 20, 30, 40] - cur.execute("SELECT * FROM t") - res = cur.fetchall() - log.info(res) - assert len(res) == 4 - assert [r[0] for r in res] == [10, 20, 30, 40] + wait_until(10, 0.5, check_that_changes_propagated) From f2aa96f003e4ea59acc5161d7ee708f233dc13db Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Fri, 2 Feb 2024 21:41:55 +0000 Subject: [PATCH 0071/1571] Console split RFC (#1997) [Rendered](https://github.com/neondatabase/neon/blob/rfc-console-split/docs/rfcs/017-console-split.md) 
Co-authored-by: Stas Kelvich --- docs/rfcs/017-console-split.md | 420 +++++++++++++++++++++++++++++++++ 1 file changed, 420 insertions(+) create mode 100644 docs/rfcs/017-console-split.md diff --git a/docs/rfcs/017-console-split.md b/docs/rfcs/017-console-split.md new file mode 100644 index 0000000000..8036920610 --- /dev/null +++ b/docs/rfcs/017-console-split.md @@ -0,0 +1,420 @@ +# Splitting cloud console + +Created on 17.06.2022 + +## Summary + +Currently we have `cloud` repository that contains code implementing public API for our clients as well as code for managing storage and internal infrastructure services. We can split everything user-related from everything storage-related to make it easier to test and maintain. + +This RFC proposes to introduce a new control-plane service with HTTP API. The overall architecture will look like this: + +```markup +. x + external area x internal area + (our clients) x (our services) + x + x ┌───────────────────────┐ + x ┌───────────────┐ > ┌─────────────────────┐ │ Storage (EC2) │ + x │ console db │ > │ control-plane db │ │ │ + x └───────────────┘ > └─────────────────────┘ │ - safekeepers │ + x ▲ > ▲ │ - pageservers │ + x │ > │ │ │ +┌──────────────────┐ x ┌───────┴───────┐ > │ │ Dependencies │ +│ browser UI ├──►│ │ > ┌──────────┴──────────┐ │ │ +└──────────────────┘ x │ │ > │ │ │ - etcd │ + x │ console ├───────►│ control-plane ├────►│ - S3 │ +┌──────────────────┐ x │ │ > │ (deployed in k8s) │ │ - more? │ +│public API clients├──►│ │ > │ │ │ │ +└──────────────────┘ x └───────┬───────┘ > └──────────┬──────────┘ └───────────────────────┘ + x │ > ▲ │ ▲ + x │ > │ │ │ + x ┌───────┴───────┐ > │ │ ┌───────────┴───────────┐ + x │ dependencies │ > │ │ │ │ + x │- analytics │ > │ └───────────────►│ computes │ + x │- auth │ > │ │ (deployed in k8s) │ + x │- billing │ > │ │ │ + x └───────────────┘ > │ └───────────────────────┘ + x > │ ▲ + x > ┌─────┴───────────────┐ │ +┌──────────────────┐ x > │ │ │ +│ │ x > │ proxy ├─────────────────┘ +│ postgres ├───────────────────────────►│ (deployed in k8s) │ +│ users │ x > │ │ +│ │ x > └─────────────────────┘ +└──────────────────┘ x > + > + > + closed-source > open-source + > + > +``` + +Notes: + +- diagram is simplified in the less-important places +- directed arrows are strict and mean that connections in the reverse direction are forbidden + +This split is quite complex and this RFC proposes several smaller steps to achieve the larger goal: + +1. Start by refactoring the console code, the goal is to have console and control-plane code in the different directories without dependencies on each other. +2. Do similar refactoring for tables in the console database, remove queries selecting data from both console and control-plane; move control-plane tables to a separate database. +3. Implement control-plane HTTP API serving on a separate TCP port; make all console→control-plane calls to go through that HTTP API. +4. Move control-plane source code to the neon repo; start control-plane as a separate service. + +## Motivation + +These are the two most important problems we want to solve: + +- Publish open-source implementation of all our cloud/storage features +- Make a unified control-plane that is used in all cloud (serverless) and local (tests) setups + +Right now we have some closed-source code in the cloud repo. That code contains implementation for running Neon computes in k8s and without that code it’s impossible to automatically scale PostgreSQL computes. 
That means that we don’t have an open-source serverless PostgreSQL at the moment. + +After splitting and open-sourcing control-plane service we will have source code and Docker images for all storage services. That control-plane service should have HTTP API for creating and managing tenants (including all our storage features), while proxy will listen for incoming connections and create computes on-demand. + +Improving our test suite is an important task, but requires a lot of prerequisites and may require a separate RFC. Possible implementation of that is described in the section [Next steps](#next-steps). + +Another piece of motivation can be a better involvement of storage development team into a control-plane. By splitting control-plane from the console, it can be more convenient to test and develop control-plane with paying less attention to “business” features, such as user management, billing and analytics. + +For example, console currently requires authentication providers such as GitHub OAuth to work at all, as well as nodejs to be able to build it locally. It will be more convenient to build and run it locally without these requirements. + +## Proposed implementation + +### Current state of things + +Let’s start with defining the current state of things at the moment of this proposal. We have three repositories containing source code: + +- open-source `postgres` — our fork of postgres +- open-source `neon` — our main repository for storage source code +- closed-source `cloud` — mostly console backend and UI frontend + +This proposal aims not to change anything at the existing code in `neon` and `postgres` repositories, but to create control-plane service and move it’s source code from `cloud` to the `neon` repository. That means that we need to split code in `cloud` repo only, and will consider only this repository for exploring its source code. + +Let’s look at the miscellaneous things in the `cloud` repo which are NOT part of the console application, i.e. NOT the Go source code that is compiled to the `./console` binary. 
There we have:
+
+- command-line tools, such as cloudbench, neonadmin
+- markdown documentation
+- cloud operations scripts (helm, terraform, ansible)
+- configs and other things
+- e2e python tests
+- incidents playbooks
+- UI frontend
+- Make build scripts, code generation scripts
+- database migrations
+- swagger definitions
+
+Let's also take a look at what we have in the console source code, which is the service we'd like to split:
+
+- API Servers
+  - Public API v2
+  - Management API v2
+  - Public API v1
+  - Admin API v1 (same port as Public API v1)
+  - Management API v1
+- Workers
+  - Monitor Compute Activity
+  - Watch Failed Operations
+  - Availability Checker
+  - Business Metrics Collector
+- Internal Services
+  - Auth Middleware, UserIsAdmin, Cookies
+  - Cable Websocket Server
+  - Admin Services
+  - Global Settings, Operations, Pageservers, Platforms, Projects, Safekeepers, Users
+  - Authenticate Proxy
+  - API Keys
+  - App Controller, serving UI HTML
+  - Auth Controller
+  - Branches
+  - Projects
+  - Psql Connect + Passwordless login
+  - Users
+  - Cloud Metrics
+  - User Metrics
+  - Invites
+  - Pageserver/Safekeeper management
+  - Operations, k8s/docker/common logic
+  - Platforms, Regions
+  - Project State
+  - Projects Roles, SCRAM
+  - Global Settings
+- Other things
+  - segment analytics integration
+  - sentry integration
+  - other common utilities packages
+
+### Drawing the splitting line
+
+The most challenging and most important thing is to define the line that will split the new control-plane service from the existing cloud service. If we don't get it right, then we can end up having a lot more issues without many benefits.
+
+We propose to define that line as follows:
+
+- everything user-related stays in the console service
+- everything storage-related should be in the control-plane service
+- something that falls in between should be decided where to go, but most likely should stay in the console service
+- some similar parts should be in both services, such as admin/management/db_migrations
+
+We call user-related all requests that can be connected to some user. The general idea is to not have any user_id in the control-plane service and to operate exclusively on tenant_id+timeline_id, the same way as the existing storage services work now (compute, safekeeper, pageserver).
+
+Storage-related things can be defined as doing any of the following:
+
+- using k8s API
+- doing requests to any of the storage services (proxy, compute, safekeeper, pageserver, etc.)
+- tracking current status of tenants/timelines, managing lifetime of computes + +Based on that idea, we can say that new control-plane service should have the following components: + +- single HTTP API for everything + - Create and manage tenants and timelines + - Manage global settings and storage configuration (regions, platforms, safekeepers, pageservers) + - Admin API for storage health inspection and debugging +- Workers + - Monitor Compute Activity + - Watch Failed Operations + - Availability Checker +- Internal Services + - Admin Services + - Global Settings, Operations, Pageservers, Platforms, Tenants, Safekeepers + - Authenticate Proxy + - Branches + - Psql Connect + - Cloud Metrics + - Pageserver/Safekeeper management + - Operations, k8s/docker/common logic + - Platforms, Regions + - Tenant State + - Compute Roles, SCRAM + - Global Settings + +--- + +And other components should probably stay in the console service: + +- API Servers (no changes here) + - Public API v2 + - Management API v2 + - Public API v1 + - Admin API v1 (same port as Public API v1) + - Management API v1 +- Workers + - Business Metrics Collector +- Internal Services + - Auth Middleware, UserIsAdmin, Cookies + - Cable Websocket Server + - Admin Services + - Users admin stays the same + - Other admin services can redirect requests to the control-plane + - API Keys + - App Controller, serving UI HTML + - Auth Controller + - Projects + - User Metrics + - Invites + - Users + - Passwordless login +- Other things + - segment analytics integration + - sentry integration + - other common utilities packages + +There are also miscellaneous things that are useful for all kinds of services. So we can say that these things can be in both services: + +- markdown documentation +- e2e python tests +- make build scripts, code generation scripts +- database migrations +- swagger definitions + +The single entrypoint to the storage should be control-plane API. After we define that API, we can have code-generated implementation for the client and for the server. The general idea is to move code implementing storage components from the console to the API implementation inside the new control-plane service. + +After the code is moved to the new service, we can fill the created void by making API calls to the new service: + +- authorization of the client +- mapping user_id + project_id to the tenant_id +- calling the control-plane API + +### control-plane API + +Currently we have the following projects API in the console: + +``` +GET /projects/{project_id} +PATCH /projects/{project_id} +POST /projects/{project_id}/branches +GET /projects/{project_id}/databases +POST /projects/{project_id}/databases +GET /projects/{project_id}/databases/{database_id} +PUT /projects/{project_id}/databases/{database_id} +DELETE /projects/{project_id}/databases/{database_id} +POST /projects/{project_id}/delete +GET /projects/{project_id}/issue_token +GET /projects/{project_id}/operations +GET /projects/{project_id}/operations/{operation_id} +POST /projects/{project_id}/query +GET /projects/{project_id}/roles +POST /projects/{project_id}/roles +GET /projects/{project_id}/roles/{role_name} +DELETE /projects/{project_id}/roles/{role_name} +POST /projects/{project_id}/roles/{role_name}/reset_password +POST /projects/{project_id}/start +POST /projects/{project_id}/stop +POST /psql_session/{psql_session_id} +``` + +It looks fine and we probably already have clients relying on it. So we should not change it, at least for now. 
But most of these endpoints (if not all) are related to storage, and they suggest what the control-plane API should look like:
+
+```
+GET /tenants/{tenant_id}
+PATCH /tenants/{tenant_id}
+POST /tenants/{tenant_id}/branches
+GET /tenants/{tenant_id}/databases
+POST /tenants/{tenant_id}/databases
+GET /tenants/{tenant_id}/databases/{database_id}
+PUT /tenants/{tenant_id}/databases/{database_id}
+DELETE /tenants/{tenant_id}/databases/{database_id}
+POST /tenants/{tenant_id}/delete
+GET /tenants/{tenant_id}/issue_token
+GET /tenants/{tenant_id}/operations
+GET /tenants/{tenant_id}/operations/{operation_id}
+POST /tenants/{tenant_id}/query
+GET /tenants/{tenant_id}/roles
+POST /tenants/{tenant_id}/roles
+GET /tenants/{tenant_id}/roles/{role_name}
+DELETE /tenants/{tenant_id}/roles/{role_name}
+POST /tenants/{tenant_id}/roles/{role_name}/reset_password
+POST /tenants/{tenant_id}/start
+POST /tenants/{tenant_id}/stop
+POST /psql_session/{psql_session_id}
+```
+
+One option here is to use gRPC instead of HTTP, which has some useful features, but there are some strong points in favor of plain HTTP:
+
+- an HTTP API is easier for clients to use
+- we already have HTTP APIs in the pageserver/safekeeper/console
+- we probably want the control-plane API to be similar to the console API that is available in the cloud
+
+### Getting updates from the storage
+
+There are valid cases when we would like to know what has changed in the storage. For example, the console might want to know when a user has queried and started a compute, and when that compute was later scaled to zero, in order to know how much the user should pay for the service. Another example is to get info about reaching the disk space limits. Yet another example is to do analytics, such as how many users had at least one active project in a month.
+
+All of the above cases can happen without using the console, just by accessing a compute through the proxy.
+
+To solve this, we can have a log of events occurring in the storage (event logs). That is very similar to the operations table we have right now, the only difference is that events are immutable and we cannot change them after saving them to the database. For example, we might want to have events for the following activities:
+
+- We finished processing some HTTP API query, such as resetting the password
+- We changed some state, such as started or stopped a compute
+- Operation is created
+- Operation is started for the first time
+- Operation is failed for the first time
+- Operation is finished
+
+Once we save these events to the database, we can create an HTTP API to subscribe to these events. That API can look like this:
+
+```
+GET /events/
+
+{
+  "events": [...],
+  "next_cursor": 123
+}
+```
+
+It should be possible to replay event logs from some point in time, to get the state of almost anything in the storage services. That means that if we maintain some state in the control-plane database and have a reason to mirror the same state in the console database, we can do so by polling events from the control-plane API and updating the console database according to the events. A rough sketch of such a consumer is included below.
+
+### Next steps
+
+After implementing the control-plane HTTP API and starting the control-plane as a separate service, we might want to exploit the benefits of the new architecture, such as reorganizing test infrastructure. Possible options are listed in the [Next steps](#next-steps-1).
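+### Example: consuming the event log
+
+As a minimal sketch of the event-log API above (assuming a `?cursor=N` query parameter and the JSON response shape shown earlier; the exact API is not final), a consumer such as the console could poll the endpoint and apply each event to its own database:
+
+```python
+import time
+
+import requests
+
+
+def follow_events(base_url: str, start_cursor: int = 0):
+    """Poll the (hypothetical) control-plane event-log endpoint, yielding events in order."""
+    cursor = start_cursor
+    while True:
+        resp = requests.get(f"{base_url}/events/", params={"cursor": cursor}, timeout=30)
+        resp.raise_for_status()
+        body = resp.json()
+        for event in body["events"]:
+            yield event  # e.g. "compute started", "operation finished", ...
+        cursor = body["next_cursor"]  # persist this cursor to resume after a restart
+        if not body["events"]:
+            time.sleep(1)  # nothing new yet; back off before polling again
+```
+
+The cursor is what makes the log replayable: a consumer that remembers its last `next_cursor` can rebuild any derived state after a restart, which is the property the "Getting updates from the storage" section relies on.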
+ +## Non Goals + +RFC doesn’t cover the actual cloud deployment scripts and schemas, such as terraform, ansible, k8s yaml’s and so on. + +## Impacted components + +Mostly console, but can also affect some storage service. + +## Scalability + +We should support starting several instances of the new control-plane service at the same time. + +At the same time, it should be possible to use only single instance of control-plane, which can be useful for local tests. + +## Security implications + +New control-plane service is an internal service, so no external requests can reach it. But at the same time, it contains API to do absolutely anything with any of the tenants. That means that bad internal actor can potentially read and write all of the tenants. To make this safer, we can have one of these: + +- Simple option is to protect all requests with a single private key, so that no one can make requests without having that one key. +- Another option is to have a separate token for every tenant and store these tokens in another secure place. This way it’s harder to access all tenants at once, because they have the different tokens. + +## Alternative implementation + +There was an idea to create a k8s operator for managing storage services and computes, but author of this RFC is not really familiar with it. + +Regarding less alternative ideas, there are another options for the name of the new control-plane service: + +- storage-ctl +- cloud +- cloud-ctl + +## Pros/cons of proposed approaches (TODO) + +Pros: + +- All storage features are completely open-source +- Better tests coverage, less difference between cloud and local setups +- Easier to develop storage and cloud features, because there is no need to setup console for that +- Easier to deploy storage-only services to the any cloud + +Cons: + +- All storage features are completely open-source +- Distributed services mean more code to connect different services and potential network issues +- Console needs to have a dependency on storage API, there can be complications with developing new feature in a branch +- More code to JOIN data from different services (console and control-plane) + +## Definition of Done + +We have a new control-plane service running in the k8s. Source code for that control-plane service is located in the open-source neon repo. + +## Next steps + +After we’ve reached DoD, we can make further improvements. + +First thing that can benefit from the split is local testing. The same control-plane service can implement starting computes as a local processes instead of k8s deployments. If it will also support starting pageservers/safekeepers/proxy for the local setup, then it can completely replace `./neon_local` binary, which is currently used for testing. The local testing environment can look like this: + +``` +┌─────────────────────┐ ┌───────────────────────┐ +│ │ │ Storage (local) │ +│ control-plane db │ │ │ +│ (local process) │ │ - safekeepers │ +│ │ │ - pageservers │ +└──────────▲──────────┘ │ │ + │ │ Dependencies │ +┌──────────┴──────────┐ │ │ +│ │ │ - etcd │ +│ control-plane ├────►│ - S3 │ +│ (local process) │ │ - more? 
│ +│ │ │ │ +└──────────┬──────────┘ └───────────────────────┘ + ▲ │ ▲ + │ │ │ + │ │ ┌───────────┴───────────┐ + │ │ │ │ + │ └───────────────►│ computes │ + │ │ (local processes) │ + │ │ │ +┌──────┴──────────────┐ └───────────────────────┘ +│ │ ▲ +│ proxy │ │ +│ (local process) ├─────────────────┘ +│ │ +└─────────────────────┘ +``` + +The key thing here is that control-plane local service have the same API and almost the same implementation as the one deployed in the k8s. This allows to run the same e2e tests against both cloud and local setups. + +For the python test_runner tests everything can stay mostly the same. To do that, we just need to replace `./neon_local` cli commands with API calls to the control-plane. + +The benefit here will be in having fast local tests that are really close to our cloud setup. Bugs in k8s queries are still cannot be found when running computes as a local processes, but it should be really easy to start k8s locally (for example in k3s) and run the same tests with control-plane connected to the local k8s. + +Talking about console and UI tests, after the split there should be a way to test these without spinning up all the storage locally. New control-plane service has a well-defined API, allowing us to mock it. This way we can create UI tests to verify the right calls are issued after specific UI interactions and verify that we render correct messages when API returns errors. \ No newline at end of file From d820d64e382f052ba92a736557da47728be8aa90 Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Fri, 2 Feb 2024 14:39:20 -0800 Subject: [PATCH 0072/1571] Bump vm-builder v0.21.0 -> v0.23.2 (#6480) Relevant changes were all from v0.23.0: - neondatabase/autoscaling#724 - neondatabase/autoscaling#726 - neondatabase/autoscaling#732 Co-authored-by: Alexander Bayandin --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 201c77f138..2d7edf2e22 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -872,7 +872,7 @@ jobs: run: shell: sh -eu {0} env: - VM_BUILDER_VERSION: v0.21.0 + VM_BUILDER_VERSION: v0.23.2 steps: - name: Checkout From 0ac2606c8ac0b09859ce6b6a32e9e97066de0130 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 2 Feb 2024 23:45:57 +0100 Subject: [PATCH 0073/1571] S3 restore test: Use a workaround to enable moto's self-copy support (#6594) While working on https://github.com/getmoto/moto/pull/7303 I discovered that if you enable bucket encryption, moto allows self-copies. So we can un-ignore the test. I tried it out locally, it works great. 
Followup of #6533, part of https://github.com/neondatabase/cloud/issues/8233 --- test_runner/fixtures/pageserver/utils.py | 18 +++++++++++++++++- test_runner/regress/test_s3_restore.py | 2 -- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 4cfdee6e01..c2281ae25a 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -356,10 +356,26 @@ def enable_remote_storage_versioning( """ Enable S3 versioning for the remote storage """ - # local_fs has no + # local_fs has no support for versioning assert isinstance(remote, S3Storage), "localfs is currently not supported" assert remote.client is not None + # The SDK supports enabling versioning on normal S3 as well but we don't want to change + # these settings from a test in a live bucket (also, our access isn't enough nor should it be) + assert not remote.real, "Enabling storage versioning only supported on Mock S3" + + # Workaround to enable self-copy until upstream bug is fixed: https://github.com/getmoto/moto/issues/7300 + remote.client.put_bucket_encryption( + Bucket=remote.bucket_name, + ServerSideEncryptionConfiguration={ + "Rules": [ + { + "ApplyServerSideEncryptionByDefault": {"SSEAlgorithm": "AES256"}, + "BucketKeyEnabled": False, + }, + ] + }, + ) # Note that this doesnt use pagination, so list is not guaranteed to be exhaustive. response = remote.client.put_bucket_versioning( Bucket=remote.bucket_name, diff --git a/test_runner/regress/test_s3_restore.py b/test_runner/regress/test_s3_restore.py index 188d8a3b33..aaa33f0bcb 100644 --- a/test_runner/regress/test_s3_restore.py +++ b/test_runner/regress/test_s3_restore.py @@ -1,7 +1,6 @@ import time from datetime import datetime, timezone -import pytest from fixtures.neon_fixtures import ( NeonEnvBuilder, PgBin, @@ -32,7 +31,6 @@ def test_tenant_s3_restore( remote_storage = neon_env_builder.pageserver_remote_storage assert remote_storage, "remote storage not configured" enable_remote_storage_versioning(remote_storage) - pytest.skip("moto doesn't support self-copy: https://github.com/getmoto/moto/issues/7300") env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) env.pageserver.allowed_errors.extend( From 3d1b08496a066a1784b179bfee6cb41b6ac56aeb Mon Sep 17 00:00:00 2001 From: Clarence Date: Sat, 3 Feb 2024 01:59:39 +0100 Subject: [PATCH 0074/1571] Update words in docs for better readability (#6600) ## Problem Found typos while reading the docs ## Summary of changes Fixed the typos found --- docs/rfcs/018-storage-messaging-2.md | 6 +++--- docs/rfcs/019-tenant-timeline-lifecycles.md | 4 ++-- docs/rfcs/020-pageserver-s3-coordination.md | 12 ++++++------ docs/rfcs/022-pageserver-delete-from-s3.md | 18 +++++++++--------- ...he-state-of-pageserver-tenant-relocation.md | 4 ++-- docs/rfcs/024-extension-loading.md | 2 +- docs/rfcs/025-generation-numbers.md | 8 ++++---- docs/rfcs/026-pageserver-s3-mvcc.md | 12 ++++++------ ...-consistent-layer-map-through-index-part.md | 16 ++++++++-------- docs/rfcs/028-pageserver-migration.md | 2 +- .../029-pageserver-wal-disaster-recovery.md | 4 ++-- docs/rfcs/030-vectored-timeline-get.md | 2 +- 12 files changed, 45 insertions(+), 45 deletions(-) diff --git a/docs/rfcs/018-storage-messaging-2.md b/docs/rfcs/018-storage-messaging-2.md index 364f62dd2e..2419dd5fc5 100644 --- a/docs/rfcs/018-storage-messaging-2.md +++ b/docs/rfcs/018-storage-messaging-2.md @@ -78,7 +78,7 @@ with grpc 
streams and tokio mpsc channels. The implementation description is at It is just 500 lines of code and core functionality is complete. 1-1 pub sub gives about 120k received messages per second; having multiple subscribers in -different connecitons quickly scales to 1 million received messages per second. +different connections quickly scales to 1 million received messages per second. I had concerns about many concurrent streams in singe connection, but 2^20 subscribers still work (though eat memory, with 10 publishers 20GB are consumed; in this implementation each publisher holds full copy of all subscribers). There @@ -95,12 +95,12 @@ other members, with best-effort this is simple. ### Security implications Communication happens in a private network that is not exposed to users; -additionaly we can add auth to the broker. +additionally we can add auth to the broker. ## Alternative: get existing pub-sub We could take some existing pub sub solution, e.g. RabbitMQ, Redis. But in this -case IMV simplicity of our own outweights external dependency costs (RabbitMQ is +case IMV simplicity of our own outweighs external dependency costs (RabbitMQ is much more complicated and needs VM; Redis Rust client maintenance is not ideal...). Also note that projects like CockroachDB and TiDB are based on gRPC as well. diff --git a/docs/rfcs/019-tenant-timeline-lifecycles.md b/docs/rfcs/019-tenant-timeline-lifecycles.md index 2734bf17b9..558b5335e7 100644 --- a/docs/rfcs/019-tenant-timeline-lifecycles.md +++ b/docs/rfcs/019-tenant-timeline-lifecycles.md @@ -74,7 +74,7 @@ TenantMaintenanceGuard: Like ActiveTenantGuard, but can be held even when the tenant is not in Active state. Used for operations like attach/detach. Perhaps allow only one such guard on a Tenant at a time. -Similarly for Timelines. We don't currentl have a "state" on Timeline, but I think +Similarly for Timelines. We don't currently have a "state" on Timeline, but I think we need at least two states: Active and Stopping. The Stopping state is used at deletion, to prevent new TimelineActiveGuards from appearing, while you wait for existing TimelineActiveGuards to die out. @@ -85,7 +85,7 @@ have a TenantActiveGuard, and the tenant's state changes from Active to Stopping, the is_shutdown_requested() function should return true, and shutdown_watcher() future should return. -This signaling doesn't neessarily need to cover all cases. For example, if you +This signaling doesn't necessarily need to cover all cases. For example, if you have a block of code in spawn_blocking(), it might be acceptable if is_shutdown_requested() doesn't return true even though the tenant is in Stopping state, as long as the code finishes reasonably fast. diff --git a/docs/rfcs/020-pageserver-s3-coordination.md b/docs/rfcs/020-pageserver-s3-coordination.md index 5e2912ba99..90ba3a6f4d 100644 --- a/docs/rfcs/020-pageserver-s3-coordination.md +++ b/docs/rfcs/020-pageserver-s3-coordination.md @@ -37,7 +37,7 @@ sequenceDiagram ``` At this point it is not possible to restore from index, it contains L2 which -is no longer available in s3 and doesnt contain L3 added by compaction by the +is no longer available in s3 and doesn't contain L3 added by compaction by the first pageserver. So if any of the pageservers restart initial sync will fail (or in on-demand world it will fail a bit later during page request from missing layer) @@ -74,7 +74,7 @@ One possible solution for relocation case is to orchestrate background jobs from outside. 
The oracle who runs migration can turn off background jobs on PS1 before migration and then run migration -> enable them on PS2. The problem comes if migration fails. In this case in order to resume background jobs -oracle needs to guarantee that PS2 doesnt run background jobs and if it doesnt +oracle needs to guarantee that PS2 doesn't run background jobs and if it doesn't respond then PS1 is stuck unable to run compaction/gc. This cannot be solved without human ensuring that no upload from PS2 can happen. In order to be able to resolve this automatically CAS is required on S3 side so pageserver can @@ -128,7 +128,7 @@ During discussion it seems that we converged on the approach consisting of: whether we need to apply change to the index state or not. - Responsibility for running background jobs is assigned externally. Pageserver keeps locally persistent flag for each tenant that indicates whether this - pageserver is considered as primary one or not. TODO what happends if we + pageserver is considered as primary one or not. TODO what happens if we crash and cannot start for some extended period of time? Control plane can assign ownership to some other pageserver. Pageserver needs some way to check if its still the blessed one. Maybe by explicit request to control plane on @@ -138,7 +138,7 @@ Requirement for deterministic layer generation was considered overly strict because of two reasons: - It can limit possible optimizations e g when pageserver wants to reshuffle - some data locally and doesnt want to coordinate this + some data locally and doesn't want to coordinate this - The deterministic algorithm itself can change so during deployments for some time there will be two different version running at the same time which can cause non determinism @@ -164,7 +164,7 @@ sequenceDiagram CP->>PS1: Yes deactivate CP PS1->>S3: Fetch PS1 index. - note over PS1: Continue operations, start backround jobs + note over PS1: Continue operations, start background jobs note over PS1,PS2: PS1 starts up and still and is not a leader anymore PS1->>CP: Am I still the leader for Tenant X? CP->>PS1: No @@ -203,7 +203,7 @@ sequenceDiagram ### Eviction When two pageservers operate on a tenant for extended period of time follower -doesnt perform write operations in s3. When layer is evicted follower relies +doesn't perform write operations in s3. When layer is evicted follower relies on updates from primary to get info about layers it needs to cover range for evicted layer. diff --git a/docs/rfcs/022-pageserver-delete-from-s3.md b/docs/rfcs/022-pageserver-delete-from-s3.md index 260e549670..c237a3edb8 100644 --- a/docs/rfcs/022-pageserver-delete-from-s3.md +++ b/docs/rfcs/022-pageserver-delete-from-s3.md @@ -4,7 +4,7 @@ Created on 08.03.23 ## Motivation -Currently we dont delete pageserver part of the data from s3 when project is deleted. (The same is true for safekeepers, but this outside of the scope of this RFC). +Currently we don't delete pageserver part of the data from s3 when project is deleted. (The same is true for safekeepers, but this outside of the scope of this RFC). This RFC aims to spin a discussion to come to a robust deletion solution that wont put us in into a corner for features like postponed deletion (when we keep data for user to be able to restore a project if it was deleted by accident) @@ -75,9 +75,9 @@ Remote one is needed for cases when pageserver is lost during deletion so other Why local mark file is needed? 
-If we dont have one, we have two choices, delete local data before deleting the remote part or do that after. +If we don't have one, we have two choices, delete local data before deleting the remote part or do that after. -If we delete local data before remote then during restart pageserver wont pick up remote tenant at all because nothing is available locally (pageserver looks for remote conuterparts of locally available tenants). +If we delete local data before remote then during restart pageserver wont pick up remote tenant at all because nothing is available locally (pageserver looks for remote counterparts of locally available tenants). If we delete local data after remote then at the end of the sequence when remote mark file is deleted if pageserver restart happens then the state is the same to situation when pageserver just missing data on remote without knowing the fact that this data is intended to be deleted. In this case the current behavior is upload everything local-only to remote. @@ -145,7 +145,7 @@ sequenceDiagram CP->>PS: Retry delete tenant PS->>CP: Not modified else Mark is missing - note over PS: Continue to operate the tenant as if deletion didnt happen + note over PS: Continue to operate the tenant as if deletion didn't happen note over CP: Eventually console should
retry delete request @@ -168,7 +168,7 @@ sequenceDiagram PS->>CP: True ``` -Similar sequence applies when both local and remote marks were persisted but Control Plane still didnt receive a response. +Similar sequence applies when both local and remote marks were persisted but Control Plane still didn't receive a response. If pageserver crashes after both mark files were deleted then it will reply to control plane status poll request with 404 which should be treated by control plane as success. @@ -187,7 +187,7 @@ If pageseserver is lost then the deleted tenant should be attached to different ##### Restrictions for tenant that is in progress of being deleted -I propose to add another state to tenant/timeline - PendingDelete. This state shouldnt allow executing any operations aside from polling the deletion status. +I propose to add another state to tenant/timeline - PendingDelete. This state shouldn't allow executing any operations aside from polling the deletion status. #### Summary @@ -237,7 +237,7 @@ New branch gets created PS1 starts up (is it possible or we just recycle it?) PS1 is unaware of the new branch. It can either fall back to s3 ls, or ask control plane. -So here comes the dependency of storage on control plane. During restart storage needs to know which timelines are valid for operation. If there is nothing on s3 that can answer that question storage neeeds to ask control plane. +So here comes the dependency of storage on control plane. During restart storage needs to know which timelines are valid for operation. If there is nothing on s3 that can answer that question storage needs to ask control plane. ### Summary @@ -250,7 +250,7 @@ Cons: Pros: -- Easier to reason about if you dont have to account for pageserver restarts +- Easier to reason about if you don't have to account for pageserver restarts ### Extra notes @@ -262,7 +262,7 @@ Delayed deletion can be done with both approaches. As discussed with Anna (@step After discussion in comments I see that we settled on two options (though a bit different from ones described in rfc). First one is the same - pageserver owns as much as possible. The second option is that pageserver owns markers thing, but actual deletion happens in control plane by repeatedly calling ls + delete. -To my mind the only benefit of the latter approach is possible code reuse between safekeepers and pageservers. Otherwise poking around integrating s3 library into control plane, configuring shared knowledge abouth paths in s3 - are the downsides. Another downside of relying on control plane is the testing process. Control plane resides in different repository so it is quite hard to test pageserver related changes there. e2e test suite there doesnt support shutting down pageservers, which are separate docker containers there instead of just processes. +To my mind the only benefit of the latter approach is possible code reuse between safekeepers and pageservers. Otherwise poking around integrating s3 library into control plane, configuring shared knowledge about paths in s3 - are the downsides. Another downside of relying on control plane is the testing process. Control plane resides in different repository so it is quite hard to test pageserver related changes there. e2e test suite there doesn't support shutting down pageservers, which are separate docker containers there instead of just processes. With pageserver owning everything we still give the retry logic to control plane but its easier to duplicate if needed compared to sharing inner s3 workings. 
We will have needed tests for retry logic in neon repo. diff --git a/docs/rfcs/023-the-state-of-pageserver-tenant-relocation.md b/docs/rfcs/023-the-state-of-pageserver-tenant-relocation.md index 836c91fb25..97e62bf8c6 100644 --- a/docs/rfcs/023-the-state-of-pageserver-tenant-relocation.md +++ b/docs/rfcs/023-the-state-of-pageserver-tenant-relocation.md @@ -75,7 +75,7 @@ sequenceDiagram ``` At this point it is not possible to restore the state from index, it contains L2 which -is no longer available in s3 and doesnt contain L3 added by compaction by the +is no longer available in s3 and doesn't contain L3 added by compaction by the first pageserver. So if any of the pageservers restart, initial sync will fail (or in on-demand world it will fail a bit later during page request from missing layer) @@ -171,7 +171,7 @@ sequenceDiagram Another problem is a possibility of concurrent branch creation calls. -I e during migration create_branch can be called on old pageserver and newly created branch wont be seen on new pageserver. Prior art includes prototyping an approach of trying to mirror such branches, but currently it lost its importance, because now attach is fast because we dont need to download all data, and additionally to the best of my knowledge of control plane internals (cc @ololobus to confirm) operations on one project are executed sequentially, so it is not possible to have such case. So branch create operation will be executed only when relocation is completed. As a safety measure we can forbid branch creation for tenants that are in readonly remote state. +I e during migration create_branch can be called on old pageserver and newly created branch wont be seen on new pageserver. Prior art includes prototyping an approach of trying to mirror such branches, but currently it lost its importance, because now attach is fast because we don't need to download all data, and additionally to the best of my knowledge of control plane internals (cc @ololobus to confirm) operations on one project are executed sequentially, so it is not possible to have such case. So branch create operation will be executed only when relocation is completed. As a safety measure we can forbid branch creation for tenants that are in readonly remote state. ## Simplistic approach diff --git a/docs/rfcs/024-extension-loading.md b/docs/rfcs/024-extension-loading.md index 26ba4f7927..7e243b23e3 100644 --- a/docs/rfcs/024-extension-loading.md +++ b/docs/rfcs/024-extension-loading.md @@ -55,7 +55,7 @@ When PostgreSQL requests a file, `compute_ctl` downloads it. PostgreSQL requests files in the following cases: - When loading a preload library set in `local_preload_libraries` - When explicitly loading a library with `LOAD` -- Wnen creating extension with `CREATE EXTENSION` (download sql scripts, (optional) extension data files and (optional) library files))) +- When creating extension with `CREATE EXTENSION` (download sql scripts, (optional) extension data files and (optional) library files))) #### Summary diff --git a/docs/rfcs/025-generation-numbers.md b/docs/rfcs/025-generation-numbers.md index 6a0131c66a..dfc8529d2d 100644 --- a/docs/rfcs/025-generation-numbers.md +++ b/docs/rfcs/025-generation-numbers.md @@ -26,7 +26,7 @@ plane guarantee prevents robust response to failures, as if a pageserver is unre we may not detach from it. The mechanism in this RFC fixes this, by making it safe to attach to a new, different pageserver even if an unresponsive pageserver may be running. 
-Futher, lack of safety during split-brain conditions blocks two important features where occasional +Further lack of safety during split-brain conditions blocks two important features where occasional split-brain conditions are part of the design assumptions: - seamless tenant migration ([RFC PR](https://github.com/neondatabase/neon/pull/5029)) @@ -490,11 +490,11 @@ The above makes it safe for control plane to change the assignment of tenant to pageserver in control plane while a timeline creation is ongoing. The reason is that the creation request against the new assigned pageserver uses a new generation number. However, care must be taken by control plane -to ensure that a "timeline creation successul" response from some pageserver +to ensure that a "timeline creation successful" response from some pageserver is checked for the pageserver's generation for that timeline's tenant still being the latest. If it is not the latest, the response does not constitute a successful timeline creation. It is acceptable to discard such responses, the scrubber will clean up the S3 state. -It is better to issue a timelien deletion request to the stale attachment. +It is better to issue a timeline deletion request to the stale attachment. #### Timeline Deletion @@ -633,7 +633,7 @@ As outlined in the Part 1 on correctness, it is critical that deletions are only executed once the key is not referenced anywhere in S3. This property is obviously upheld by the scheme above. -#### We Accept Object Leakage In Acceptable Circumcstances +#### We Accept Object Leakage In Acceptable Circumstances If we crash in the flow above between (2) and (3), we lose track of unreferenced object. Further, enqueuing a single to the persistent queue may not be durable immediately to amortize cost of flush to disk. diff --git a/docs/rfcs/026-pageserver-s3-mvcc.md b/docs/rfcs/026-pageserver-s3-mvcc.md index 2a8c925781..473d5a2bd0 100644 --- a/docs/rfcs/026-pageserver-s3-mvcc.md +++ b/docs/rfcs/026-pageserver-s3-mvcc.md @@ -162,7 +162,7 @@ struct Tenant { ... txns: HashMap, - // the most recently started txn's id; only most recently sarted can win + // the most recently started txn's id; only most recently started can win next_winner_txn: Option, } struct Transaction { @@ -186,7 +186,7 @@ A transaction T in state Committed has subsequent transactions that may or may n So, for garbage collection, we need to assess transactions in state Committed and RejectAcknowledged: -- Commited: delete objects on the deadlist. +- Committed: delete objects on the deadlist. - We don’t need a LIST request here, the deadlist is sufficient. So, it’s really cheap. - This is **not true MVCC garbage collection**; by deleting the objects on Committed transaction T ’s deadlist, we might delete data referenced by other transactions that were concurrent with T, i.e., they started while T was still open. However, the fact that T is committed means that the other transactions are RejectPending or RejectAcknowledged, so, they don’t matter. Pageservers executing these doomed RejectPending transactions must handle 404 for GETs gracefully, e.g., by trying to commit txn so they observe the rejection they’re destined to get anyways. 404’s for RejectAcknowledged is handled below. - RejectAcknowledged: delete all objects created in that txn, and discard deadlists. @@ -242,15 +242,15 @@ If a pageserver is unresponsive from Control Plane’s / Compute’s perspective At this point, availability is restored and user pain relieved. 
-What’s left is to somehow close the doomed transaction of the unresponsive pageserver, so that it beomes RejectAcknowledged, and GC can make progress. Since S3 is cheap, we can afford to wait a really long time here, especially if we put a soft bound on the amount of data a transaction may produce before it must commit. Procedure: +What’s left is to somehow close the doomed transaction of the unresponsive pageserver, so that it becomes RejectAcknowledged, and GC can make progress. Since S3 is cheap, we can afford to wait a really long time here, especially if we put a soft bound on the amount of data a transaction may produce before it must commit. Procedure: 1. Ensure the unresponsive pageserver is taken out of rotation for new attachments. That probably should happen as part of the routine above. 2. Make a human operator investigate decide what to do (next morning, NO ONCALL ALERT): 1. Inspect the instance, investigate logs, understand root cause. 2. Try to re-establish connectivity between pageserver and Control Plane so that pageserver can retry commits, get rejected, ack rejection ⇒ enable GC. - 3. Use below procedure to decomission pageserver. + 3. Use below procedure to decommission pageserver. -### Decomissioning A Pageserver (Dead or Alive-but-Unrespsonive) +### Decommissioning A Pageserver (Dead or Alive-but-Unresponsive) The solution, enabled by this proposal: @@ -310,7 +310,7 @@ Issues that we discussed: 1. In abstract terms, this proposal provides a linearized history for a given S3 prefix. 2. In concrete terms, this proposal provides a linearized history per tenant. 3. There can be multiple writers at a given time, but only one of them will win to become part of the linearized history. -4. ************************************************************************************Alternative ideas mentioned during meetings that should be turned into a written prospoal like this one:************************************************************************************ +4. ************************************************************************************Alternative ideas mentioned during meetings that should be turned into a written proposal like this one:************************************************************************************ 1. @Dmitry Rodionov : having linearized storage of index_part.json in some database that allows serializable transactions / atomic compare-and-swap PUT 2. @Dmitry Rodionov : 3. @Stas : something like this scheme, but somehow find a way to equate attachment duration with transaction duration, without losing work if pageserver dies months after attachment. diff --git a/docs/rfcs/027-crash-consistent-layer-map-through-index-part.md b/docs/rfcs/027-crash-consistent-layer-map-through-index-part.md index 2c6b46eabe..e18b7c16c9 100644 --- a/docs/rfcs/027-crash-consistent-layer-map-through-index-part.md +++ b/docs/rfcs/027-crash-consistent-layer-map-through-index-part.md @@ -54,7 +54,7 @@ If the compaction algorithm doesn't change between the two compaction runs, is d *However*: 1. the file size of the overwritten L1s may not be identical, and 2. the bit pattern of the overwritten L1s may not be identical, and, -3. in the future, we may want to make the compaction code non-determinstic, influenced by past access patterns, or otherwise change it, resulting in L1 overwrites with a different set of delta records than before the overwrite +3. 
in the future, we may want to make the compaction code non-deterministic, influenced by past access patterns, or otherwise change it, resulting in L1 overwrites with a different set of delta records than before the overwrite The items above are a problem for the [split-brain protection RFC](https://github.com/neondatabase/neon/pull/4919) because it assumes that layer files in S3 are only ever deleted, but never replaced (overPUTted). @@ -63,7 +63,7 @@ But node B based its world view on the version of node A's `index_part.json` fro That earlier `index_part.json`` contained the file size of the pre-overwrite L1. If the overwritten L1 has a different file size, node B will refuse to read data from the overwritten L1. Effectively, the data in the L1 has become inaccessible to node B. -If node B already uploaded an index part itself, all subsequent attachments will use node B's index part, and run into the same probem. +If node B already uploaded an index part itself, all subsequent attachments will use node B's index part, and run into the same problem. If we ever introduce checksums instead of checking just the file size, then a mismatching bit pattern (2) will cause similar problems. @@ -121,7 +121,7 @@ Multi-object changes that previously created and removed files in timeline dir a * atomic `index_part.json` update in S3, as per guarantee that S3 PUT is atomic * local timeline dir state: * irrelevant for layer map content => irrelevant for atomic updates / crash consistency - * if we crash after index part PUT, local layer files will be used, so, no on-demand downloads neede for them + * if we crash after index part PUT, local layer files will be used, so, no on-demand downloads needed for them * if we crash before index part PUT, local layer files will be deleted ## Trade-Offs @@ -140,7 +140,7 @@ Assuming upload queue allows for unlimited queue depth (that's what it does toda * wal ingest: currently unbounded * L0 => L1 compaction: CPU time proportional to `O(sum(L0 size))` and upload work proportional to `O()` * Compaction threshold is 10 L0s and each L0 can be up to 256M in size. Target size for L1 is 128M. - * In practive, most L0s are tiny due to 10minute `DEFAULT_CHECKPOINT_TIMEOUT`. + * In practice, most L0s are tiny due to 10minute `DEFAULT_CHECKPOINT_TIMEOUT`. * image layer generation: CPU time `O(sum(input data))` + upload work `O(sum(new image layer size))` * I have no intuition how expensive / long-running it is in reality. * gc: `update_gc_info`` work (not substantial, AFAIK) @@ -158,7 +158,7 @@ Pageserver crashes are very rare ; it would likely be acceptable to re-do the lo However, regular pageserver restart happen frequently, e.g., during weekly deploys. In general, pageserver restart faces the problem of tenants that "take too long" to shut down. -They are a problem because other tenants that shut down quickly are unavailble while we wait for the slow tenants to shut down. +They are a problem because other tenants that shut down quickly are unavailable while we wait for the slow tenants to shut down. We currently allot 10 seconds for graceful shutdown until we SIGKILL the pageserver process (as per `pageserver.service` unit file). A longer budget would expose tenants that are done early to a longer downtime. A short budget would risk throwing away more work that'd have to be re-done after restart. 
@@ -236,7 +236,7 @@ tenants/$tenant/timelines/$timeline/$key_and_lsn_range tenants/$tenant/timelines/$timeline/$layer_file_id-$key_and_lsn_range ``` -To guarantee uniqueness, the unqiue number is a sequence number, stored in `index_part.json`. +To guarantee uniqueness, the unique number is a sequence number, stored in `index_part.json`. This alternative does not solve atomic layer map updates. In our crash-during-compaction scenario above, the compaction run after the crash will not overwrite the L1s, but write/PUT new files with new sequence numbers. @@ -246,11 +246,11 @@ We'd need to write a deduplication pass that checks if perfectly overlapping lay However, this alternative is appealing because it systematically prevents overwrites at a lower level than this RFC. So, this alternative is sufficient for the needs of the split-brain safety RFC (immutable layer files locally and in S3). -But it doesn't solve the problems with crash-during-compaction outlined earlier in this RFC, and in fact, makes it much more accute. +But it doesn't solve the problems with crash-during-compaction outlined earlier in this RFC, and in fact, makes it much more acute. The proposed design in this RFC addresses both. So, if this alternative sounds appealing, we should implement the proposal in this RFC first, then implement this alternative on top. -That way, we avoid a phase where the crash-during-compaction problem is accute. +That way, we avoid a phase where the crash-during-compaction problem is acute. ## Related issues diff --git a/docs/rfcs/028-pageserver-migration.md b/docs/rfcs/028-pageserver-migration.md index f708f641aa..17ef9aef52 100644 --- a/docs/rfcs/028-pageserver-migration.md +++ b/docs/rfcs/028-pageserver-migration.md @@ -596,4 +596,4 @@ pageservers are updated to be aware of it. As well as simplifying implementation, putting heatmaps in S3 will be useful for future analytics purposes -- gathering aggregated statistics on activity -pattersn across many tenants may be done directly from data in S3. +patterns across many tenants may be done directly from data in S3. diff --git a/docs/rfcs/029-pageserver-wal-disaster-recovery.md b/docs/rfcs/029-pageserver-wal-disaster-recovery.md index 15ebd72bfe..229e40100e 100644 --- a/docs/rfcs/029-pageserver-wal-disaster-recovery.md +++ b/docs/rfcs/029-pageserver-wal-disaster-recovery.md @@ -147,7 +147,7 @@ Separating corrupt writes from non-corrupt ones is a hard problem in general, and if the application was involved in making the corrupt write, a recovery would also involve the application. Therefore, corruption that has made it into the WAL is outside of the scope of this feature. However, the WAL replay can be -issued to right before the point in time where the corruption occured. Then the +issued to right before the point in time where the corruption occurred. Then the data loss is isolated to post-corruption writes only. ## Impacted components (e.g. pageserver, safekeeper, console, etc) @@ -161,7 +161,7 @@ limits and billing we apply to existing timelines. ## Proposed implementation -The first problem to keep in mind is the reproducability of `initdb`. +The first problem to keep in mind is the reproducibility of `initdb`. So an initial step would be to upload `initdb` snapshots to S3. 
After that, we'd have the endpoint spawn a background process which diff --git a/docs/rfcs/030-vectored-timeline-get.md b/docs/rfcs/030-vectored-timeline-get.md index d4017471b7..093a964f38 100644 --- a/docs/rfcs/030-vectored-timeline-get.md +++ b/docs/rfcs/030-vectored-timeline-get.md @@ -69,7 +69,7 @@ However, unlike above, an ideal solution will * This means, read each `DiskBtree` page at most once. * Facilitate merging of the reads we issue to the OS and eventually NVMe. -Each of these items above represents a signficant amount of work. +Each of these items above represents a significant amount of work. ## Performance From aac8eb2c364e4386674b9d9e99a09e3f38fe31a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Sat, 3 Feb 2024 02:16:20 +0100 Subject: [PATCH 0075/1571] Minor logging improvements (#6593) * log when `lsn_by_timestamp` finished together with its result * add back logging of the layer name as suggested in https://github.com/neondatabase/neon/pull/6549#discussion_r1475756808 --- pageserver/src/http/routes.rs | 11 +++++++++-- pageserver/src/tenant/timeline.rs | 1 + 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 57ee746726..5735489742 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -682,7 +682,7 @@ async fn get_lsn_by_timestamp_handler( let result = timeline .find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx) .await?; - #[derive(serde::Serialize)] + #[derive(serde::Serialize, Debug)] struct Result { lsn: Lsn, kind: &'static str, @@ -693,7 +693,14 @@ async fn get_lsn_by_timestamp_handler( LsnForTimestamp::Past(lsn) => (lsn, "past"), LsnForTimestamp::NoData(lsn) => (lsn, "nodata"), }; - json_response(StatusCode::OK, Result { lsn, kind }) + let result = Result { lsn, kind }; + tracing::info!( + lsn=?result.lsn, + kind=%result.kind, + timestamp=%timestamp_raw, + "lsn_by_timestamp finished" + ); + json_response(StatusCode::OK, result) } async fn get_timestamp_of_lsn_handler( diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 0ffe0b6418..0ba3fe728a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2843,6 +2843,7 @@ impl Timeline { } /// Flush one frozen in-memory layer to disk, as a new delta layer. 
+ #[instrument(skip_all, fields(layer=%frozen_layer))] async fn flush_frozen_layer( self: &Arc, frozen_layer: Arc, From c96aead5029a7d4d2cc026f2d05c0c6286af612a Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 2 Feb 2024 22:37:43 +0200 Subject: [PATCH 0076/1571] Reorganize .dockerignore Author: Alexander Bayandin --- .dockerignore | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/.dockerignore b/.dockerignore index ae0ad8fd77..8b378b5dab 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,27 +1,27 @@ * -!rust-toolchain.toml -!Cargo.toml +# Files !Cargo.lock +!Cargo.toml !Makefile +!rust-toolchain.toml +!scripts/combine_control_files.py +!scripts/ninstall.sh +!vm-cgconfig.conf +# Directories !.cargo/ !.config/ -!control_plane/ !compute_tools/ +!control_plane/ !libs/ +!neon_local/ !pageserver/ !pgxn/ !proxy/ -!safekeeper/ !s3_scrubber/ +!safekeeper/ !storage_broker/ !trace/ -!vendor/postgres-v14/ -!vendor/postgres-v15/ -!vendor/postgres-v16/ +!vendor/postgres-*/ !workspace_hack/ -!neon_local/ -!scripts/ninstall.sh -!scripts/combine_control_files.py -!vm-cgconfig.conf From 647b85fc15a31861608dfe767b625ce889471359 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 2 Feb 2024 22:28:45 +0200 Subject: [PATCH 0077/1571] Update pgvector to v0.6.0, third attempt This includes a compatibility patch that is needed because pgvector now skips WAL-logging during the index build, and WAL-logs the index only in one go at the end. That's how GIN, GiST and SP-GIST index builds work in core PostgreSQL too, but we need some Neon-specific calls to mark the beginning and end of those build phases. pgvector is the first index AM that does that with parallel workers, so I had to modify those functions in the Neon extension to be aware of parallel workers. Only the leader needs to create the underlying file and perform the WAL-logging. (In principle, the parallel workers could participate in the WAL-logging too, but pgvector doesn't do that. This will need some further work if that changes). The previous attempt at this (#6592) missed that parallel workers needed those changes, and segfaulted in parallel build that spilled to disk. Testing ------- We don't have a place for regression tests of extensions at the moment. I tested this manually with the following script: ``` CREATE EXTENSION IF NOT EXISTS vector; DROP TABLE IF EXISTS tst; CREATE TABLE tst (i serial, v vector(3)); INSERT INTO tst (v) SELECT ARRAY[random(), random(), random()] FROM generate_series(1, 15000) g; -- Serial build, in memory ALTER TABLE tst SET (parallel_workers=0); SET maintenance_work_mem='50 MB'; CREATE INDEX idx ON tst USING hnsw (v vector_l2_ops); -- Test that the index works. (The table contents are random, and the -- search is approximate anyway, so we cannot check the exact values. 
-- For now, just eyeball that they look reasonable) set enable_seqscan=off; explain SELECT * FROM tst ORDER BY v <-> ARRAY[0, 0, 0]::vector LIMIT 5; SELECT * FROM tst ORDER BY v <-> ARRAY[0, 0, 0]::vector LIMIT 5; DROP INDEX idx; -- Serial build, spills to on disk ALTER TABLE tst SET (parallel_workers=0); SET maintenance_work_mem='5 MB'; CREATE INDEX idx ON tst USING hnsw (v vector_l2_ops); SELECT * FROM tst ORDER BY v <-> ARRAY[0, 0, 0]::vector LIMIT 5; DROP INDEX idx; -- Parallel build, in memory ALTER TABLE tst SET (parallel_workers=4); SET maintenance_work_mem='50 MB'; CREATE INDEX idx ON tst USING hnsw (v vector_l2_ops); SELECT * FROM tst ORDER BY v <-> ARRAY[0, 0, 0]::vector LIMIT 5; DROP INDEX idx; -- Parallel build, spills to disk ALTER TABLE tst SET (parallel_workers=4); SET maintenance_work_mem='5 MB'; CREATE INDEX idx ON tst USING hnsw (v vector_l2_ops); SELECT * FROM tst ORDER BY v <-> ARRAY[0, 0, 0]::vector LIMIT 5; DROP INDEX idx; ``` --- .dockerignore | 1 + Dockerfile.compute-node | 7 +++- patches/pgvector.patch | 78 ++++++++++++++++++++++++++++++++++++++ pgxn/neon/pagestore_smgr.c | 19 +++++++++- 4 files changed, 101 insertions(+), 4 deletions(-) create mode 100644 patches/pgvector.patch diff --git a/.dockerignore b/.dockerignore index 8b378b5dab..29abdc37aa 100644 --- a/.dockerignore +++ b/.dockerignore @@ -17,6 +17,7 @@ !libs/ !neon_local/ !pageserver/ +!patches/ !pgxn/ !proxy/ !s3_scrubber/ diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index d91c7cfd72..b13225172d 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -241,9 +241,12 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz - FROM build-deps AS vector-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.1.tar.gz -O pgvector.tar.gz && \ - echo "cc7a8e034a96e30a819911ac79d32f6bc47bdd1aa2de4d7d4904e26b83209dc8 pgvector.tar.gz" | sha256sum --check && \ +COPY patches/pgvector.patch /pgvector.patch + +RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.6.0.tar.gz -O pgvector.tar.gz && \ + echo "b0cf4ba1ab016335ac8fb1cada0d2106235889a194fffeece217c5bda90b2f19 pgvector.tar.gz" | sha256sum --check && \ mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \ + patch -p1 < /pgvector.patch && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control diff --git a/patches/pgvector.patch b/patches/pgvector.patch new file mode 100644 index 0000000000..84ac6644c5 --- /dev/null +++ b/patches/pgvector.patch @@ -0,0 +1,78 @@ +From 0b0194a57bd0f3598bd57dbedd0df3932330169d Mon Sep 17 00:00:00 2001 +From: Heikki Linnakangas +Date: Fri, 2 Feb 2024 22:26:45 +0200 +Subject: [PATCH 1/1] Make v0.6.0 work with Neon + +Now that the WAL-logging happens as a separate step at the end of the +build, we need a few neon-specific hints to make it work. 
+--- + src/hnswbuild.c | 36 ++++++++++++++++++++++++++++++++++++ + 1 file changed, 36 insertions(+) + +diff --git a/src/hnswbuild.c b/src/hnswbuild.c +index 680789b..ec54dea 100644 +--- a/src/hnswbuild.c ++++ b/src/hnswbuild.c +@@ -840,9 +840,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc) + + hnswarea = shm_toc_lookup(toc, PARALLEL_KEY_HNSW_AREA, false); + ++#ifdef NEON_SMGR ++ smgr_start_unlogged_build(RelationGetSmgr(indexRel)); ++#endif ++ + /* Perform inserts */ + HnswParallelScanAndInsert(heapRel, indexRel, hnswshared, hnswarea, false); + ++#ifdef NEON_SMGR ++ smgr_finish_unlogged_build_phase_1(RelationGetSmgr(indexRel)); ++#endif ++ + /* Close relations within worker */ + index_close(indexRel, indexLockmode); + table_close(heapRel, heapLockmode); +@@ -1089,13 +1097,41 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, + SeedRandom(42); + #endif + ++#ifdef NEON_SMGR ++ smgr_start_unlogged_build(RelationGetSmgr(index)); ++#endif ++ + InitBuildState(buildstate, heap, index, indexInfo, forkNum); + + BuildGraph(buildstate, forkNum); + ++#ifdef NEON_SMGR ++ smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index)); ++#endif ++ + if (RelationNeedsWAL(index)) ++ { + log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocks(index), true); + ++#ifdef NEON_SMGR ++ { ++#if PG_VERSION_NUM >= 160000 ++ RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator; ++#else ++ RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node; ++#endif ++ ++ SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator, ++ MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index)); ++ SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM); ++ } ++#endif ++ } ++ ++#ifdef NEON_SMGR ++ smgr_end_unlogged_build(RelationGetSmgr(index)); ++#endif ++ + FreeBuildState(buildstate); + } + +-- +2.39.2 + diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 63e8b8dc1f..f54c86702f 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -45,6 +45,7 @@ */ #include "postgres.h" +#include "access/parallel.h" #include "access/xact.h" #include "access/xlog.h" #include "access/xlogdefs.h" @@ -2712,10 +2713,14 @@ neon_start_unlogged_build(SMgrRelation reln) reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED; /* + * Create the local file. In a parallel build, the leader is expected to + * call this first and do it. + * * FIXME: should we pass isRedo true to create the tablespace dir if it * doesn't exist? Is it needed? */ - mdcreate(reln, MAIN_FORKNUM, false); + if (!IsParallelWorker()) + mdcreate(reln, MAIN_FORKNUM, false); } /* @@ -2739,7 +2744,17 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln) Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1); Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED); - unlogged_build_phase = UNLOGGED_BUILD_PHASE_2; + /* + * In a parallel build, (only) the leader process performs the 2nd + * phase. + */ + if (IsParallelWorker()) + { + unlogged_build_rel = NULL; + unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; + } + else + unlogged_build_phase = UNLOGGED_BUILD_PHASE_2; } /* From 9dd69194d48b46e3f32b2cb9ce688a35669d48ec Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Sun, 4 Feb 2024 00:15:59 +0200 Subject: [PATCH 0078/1571] refactor(proxy): std::io::Write for BytesMut exists (#6606) Replace TODO with an existing implementation via `BufMut::writer``. 
--- proxy/src/context/parquet.rs | 48 ++++++++++++++---------------------- 1 file changed, 18 insertions(+), 30 deletions(-) diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 1e9e723938..e920d7be01 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -1,7 +1,7 @@ use std::{sync::Arc, time::SystemTime}; use anyhow::Context; -use bytes::BytesMut; +use bytes::{buf::Writer, BufMut, BytesMut}; use chrono::{Datelike, Timelike}; use futures::{Stream, StreamExt}; use parquet::{ @@ -192,8 +192,9 @@ async fn worker_inner( let mut rows = Vec::with_capacity(config.rows_per_group); let schema = rows.as_slice().schema()?; - let file = BytesWriter::default(); - let mut w = SerializedFileWriter::new(file, schema.clone(), config.propeties.clone())?; + let buffer = BytesMut::new(); + let w = buffer.writer(); + let mut w = SerializedFileWriter::new(w, schema.clone(), config.propeties.clone())?; let mut last_upload = time::Instant::now(); @@ -221,20 +222,23 @@ async fn worker_inner( } if !w.flushed_row_groups().is_empty() { - let _: BytesWriter = upload_parquet(w, len, &storage).await?; + let _: Writer = upload_parquet(w, len, &storage).await?; } Ok(()) } -async fn flush_rows( +async fn flush_rows( rows: Vec, - mut w: SerializedFileWriter, + mut w: SerializedFileWriter, ) -> anyhow::Result<( Vec, - SerializedFileWriter, + SerializedFileWriter, RowGroupMetaDataPtr, -)> { +)> +where + W: std::io::Write + Send + 'static, +{ let span = Span::current(); let (mut rows, w, rg_meta) = tokio::task::spawn_blocking(move || { let _enter = span.enter(); @@ -258,10 +262,10 @@ async fn flush_rows( } async fn upload_parquet( - w: SerializedFileWriter, + w: SerializedFileWriter>, len: i64, storage: &GenericRemoteStorage, -) -> anyhow::Result { +) -> anyhow::Result> { let len_uncompressed = w .flushed_row_groups() .iter() @@ -270,11 +274,12 @@ async fn upload_parquet( // I don't know how compute intensive this is, although it probably isn't much... better be safe than sorry. // finish method only available on the fork: https://github.com/apache/arrow-rs/issues/5253 - let (mut file, metadata) = tokio::task::spawn_blocking(move || w.finish()) + let (writer, metadata) = tokio::task::spawn_blocking(move || w.finish()) .await .unwrap()?; - let data = file.buf.split().freeze(); + let mut buffer = writer.into_inner(); + let data = buffer.split().freeze(); let compression = len as f64 / len_uncompressed as f64; let size = data.len(); @@ -315,24 +320,7 @@ async fn upload_parquet( .await .context("request_data_upload")?; - Ok(file) -} - -// why doesn't BytesMut impl io::Write? 
-#[derive(Default)] -struct BytesWriter { - buf: BytesMut, -} - -impl std::io::Write for BytesWriter { - fn write(&mut self, buf: &[u8]) -> std::io::Result { - self.buf.extend_from_slice(buf); - Ok(buf.len()) - } - - fn flush(&mut self) -> std::io::Result<()> { - Ok(()) - } + Ok(buffer.writer()) } #[cfg(test)] From 09519c1773724fbceec1257d4e495aa20f901afc Mon Sep 17 00:00:00 2001 From: Clarence Date: Sun, 4 Feb 2024 20:33:38 +0100 Subject: [PATCH 0079/1571] chore: update wording in docs to improve readability (#6607) ## Problem Found typos while reading the docs ## Summary of changes Fixed the typos found --- docs/docker.md | 4 ++-- docs/pageserver-storage.md | 2 +- docs/pageserver-thread-mgmt.md | 2 +- docs/pageserver-walredo.md | 2 +- docs/synthetic-size.md | 4 ++-- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/docker.md b/docs/docker.md index 9761cc4346..cbf68be3a7 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -21,7 +21,7 @@ We build all images after a successful `release` tests run and push automaticall ## Docker Compose example -You can see a [docker compose](https://docs.docker.com/compose/) example to create a neon cluster in [/docker-compose/docker-compose.yml](/docker-compose/docker-compose.yml). It creates the following conatainers. +You can see a [docker compose](https://docs.docker.com/compose/) example to create a neon cluster in [/docker-compose/docker-compose.yml](/docker-compose/docker-compose.yml). It creates the following containers. - pageserver x 1 - safekeeper x 3 @@ -38,7 +38,7 @@ You can specify version of neon cluster using following environment values. - TAG: the tag version of [docker image](https://registry.hub.docker.com/r/neondatabase/neon/tags) (default is latest), which is tagged in [CI test](/.github/workflows/build_and_test.yml) ``` $ cd docker-compose/ -$ docker-compose down # remove the conainers if exists +$ docker-compose down # remove the containers if exists $ PG_VERSION=15 TAG=2937 docker-compose up --build -d # You can specify the postgres and image version Creating network "dockercompose_default" with the default driver Creating docker-compose_storage_broker_1 ... done diff --git a/docs/pageserver-storage.md b/docs/pageserver-storage.md index 77e7ff35bc..9902f6b930 100644 --- a/docs/pageserver-storage.md +++ b/docs/pageserver-storage.md @@ -64,7 +64,7 @@ Storage. The LayerMap tracks what layers exist in a timeline. -Currently, the layer map is just a resizeable array (Vec). On a GetPage@LSN or +Currently, the layer map is just a resizable array (Vec). On a GetPage@LSN or other read request, the layer map scans through the array to find the right layer that contains the data for the requested page. The read-code in LayeredTimeline is aware of the ancestor, and returns data from the ancestor timeline if it's diff --git a/docs/pageserver-thread-mgmt.md b/docs/pageserver-thread-mgmt.md index c911d2c53d..5d862415eb 100644 --- a/docs/pageserver-thread-mgmt.md +++ b/docs/pageserver-thread-mgmt.md @@ -22,7 +22,7 @@ timeline to shutdown. It will also wait for them to finish. A task registered in the task registry can check if it has been requested to shut down, by calling `is_shutdown_requested()`. There's -also a `shudown_watcher()` Future that can be used with `tokio::select!` +also a `shutdown_watcher()` Future that can be used with `tokio::select!` or similar, to wake up on shutdown. 
diff --git a/docs/pageserver-walredo.md b/docs/pageserver-walredo.md index 1de9c177cc..7b366ff616 100644 --- a/docs/pageserver-walredo.md +++ b/docs/pageserver-walredo.md @@ -74,4 +74,4 @@ somewhat wasteful, but because most WAL records only affect one page, the overhead is acceptable. The WAL redo always happens for one particular page. If the WAL record -coantains changes to other pages, they are ignored. +contains changes to other pages, they are ignored. diff --git a/docs/synthetic-size.md b/docs/synthetic-size.md index 407d7b525a..3acb4e18cb 100644 --- a/docs/synthetic-size.md +++ b/docs/synthetic-size.md @@ -21,7 +21,7 @@ implementation where we keep more data than we would need to, do not change the synthetic size or incur any costs to the user. The synthetic size is calculated for the whole project. It is not -straighforward to attribute size to individual branches. See "What is +straightforward to attribute size to individual branches. See "What is the size of an individual branch?" for discussion on those difficulties. @@ -248,7 +248,7 @@ and truncate the WAL. Synthetic size is calculated for the whole project, and includes all branches. There is no such thing as the size of a branch, because it -is not straighforward to attribute the parts of size to individual +is not straightforward to attribute the parts of size to individual branches. ## Example: attributing size to branches From 7e8529bec127aa13f5f4a819a24495c0a8e18aea Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Sun, 4 Feb 2024 23:27:07 +0100 Subject: [PATCH 0080/1571] Revert "Update pgvector to v0.6.0, third attempt" (#6610) The issue is still unsolved because of shmem size in VMs. Need to figure it out before applying this patch. For more details: ``` ERROR: could not resize shared memory segment "/PostgreSQL.2892504480" to 16774205952 bytes: No space left on device ``` As an example, the same issue in community pgvector/pgvector#453. --- .dockerignore | 1 - Dockerfile.compute-node | 7 +--- patches/pgvector.patch | 78 -------------------------------------- pgxn/neon/pagestore_smgr.c | 19 +--------- 4 files changed, 4 insertions(+), 101 deletions(-) delete mode 100644 patches/pgvector.patch diff --git a/.dockerignore b/.dockerignore index 29abdc37aa..8b378b5dab 100644 --- a/.dockerignore +++ b/.dockerignore @@ -17,7 +17,6 @@ !libs/ !neon_local/ !pageserver/ -!patches/ !pgxn/ !proxy/ !s3_scrubber/ diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index b13225172d..d91c7cfd72 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -241,12 +241,9 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz - FROM build-deps AS vector-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY patches/pgvector.patch /pgvector.patch - -RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.6.0.tar.gz -O pgvector.tar.gz && \ - echo "b0cf4ba1ab016335ac8fb1cada0d2106235889a194fffeece217c5bda90b2f19 pgvector.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.1.tar.gz -O pgvector.tar.gz && \ + echo "cc7a8e034a96e30a819911ac79d32f6bc47bdd1aa2de4d7d4904e26b83209dc8 pgvector.tar.gz" | sha256sum --check && \ mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . 
&& \ - patch -p1 < /pgvector.patch && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control diff --git a/patches/pgvector.patch b/patches/pgvector.patch deleted file mode 100644 index 84ac6644c5..0000000000 --- a/patches/pgvector.patch +++ /dev/null @@ -1,78 +0,0 @@ -From 0b0194a57bd0f3598bd57dbedd0df3932330169d Mon Sep 17 00:00:00 2001 -From: Heikki Linnakangas -Date: Fri, 2 Feb 2024 22:26:45 +0200 -Subject: [PATCH 1/1] Make v0.6.0 work with Neon - -Now that the WAL-logging happens as a separate step at the end of the -build, we need a few neon-specific hints to make it work. ---- - src/hnswbuild.c | 36 ++++++++++++++++++++++++++++++++++++ - 1 file changed, 36 insertions(+) - -diff --git a/src/hnswbuild.c b/src/hnswbuild.c -index 680789b..ec54dea 100644 ---- a/src/hnswbuild.c -+++ b/src/hnswbuild.c -@@ -840,9 +840,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc) - - hnswarea = shm_toc_lookup(toc, PARALLEL_KEY_HNSW_AREA, false); - -+#ifdef NEON_SMGR -+ smgr_start_unlogged_build(RelationGetSmgr(indexRel)); -+#endif -+ - /* Perform inserts */ - HnswParallelScanAndInsert(heapRel, indexRel, hnswshared, hnswarea, false); - -+#ifdef NEON_SMGR -+ smgr_finish_unlogged_build_phase_1(RelationGetSmgr(indexRel)); -+#endif -+ - /* Close relations within worker */ - index_close(indexRel, indexLockmode); - table_close(heapRel, heapLockmode); -@@ -1089,13 +1097,41 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, - SeedRandom(42); - #endif - -+#ifdef NEON_SMGR -+ smgr_start_unlogged_build(RelationGetSmgr(index)); -+#endif -+ - InitBuildState(buildstate, heap, index, indexInfo, forkNum); - - BuildGraph(buildstate, forkNum); - -+#ifdef NEON_SMGR -+ smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index)); -+#endif -+ - if (RelationNeedsWAL(index)) -+ { - log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocks(index), true); - -+#ifdef NEON_SMGR -+ { -+#if PG_VERSION_NUM >= 160000 -+ RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator; -+#else -+ RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node; -+#endif -+ -+ SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator, -+ MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index)); -+ SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM); -+ } -+#endif -+ } -+ -+#ifdef NEON_SMGR -+ smgr_end_unlogged_build(RelationGetSmgr(index)); -+#endif -+ - FreeBuildState(buildstate); - } - --- -2.39.2 - diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index f54c86702f..63e8b8dc1f 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -45,7 +45,6 @@ */ #include "postgres.h" -#include "access/parallel.h" #include "access/xact.h" #include "access/xlog.h" #include "access/xlogdefs.h" @@ -2713,14 +2712,10 @@ neon_start_unlogged_build(SMgrRelation reln) reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED; /* - * Create the local file. In a parallel build, the leader is expected to - * call this first and do it. - * * FIXME: should we pass isRedo true to create the tablespace dir if it * doesn't exist? Is it needed? 
*/ - if (!IsParallelWorker()) - mdcreate(reln, MAIN_FORKNUM, false); + mdcreate(reln, MAIN_FORKNUM, false); } /* @@ -2744,17 +2739,7 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln) Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1); Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED); - /* - * In a parallel build, (only) the leader process performs the 2nd - * phase. - */ - if (IsParallelWorker()) - { - unlogged_build_rel = NULL; - unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; - } - else - unlogged_build_phase = UNLOGGED_BUILD_PHASE_2; + unlogged_build_phase = UNLOGGED_BUILD_PHASE_2; } /* From 70f646ffe2fe9829316f1ed02a5a1529bc296fd6 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 5 Feb 2024 09:34:03 +0200 Subject: [PATCH 0081/1571] More logging fixes (#6584) I was on-call this week, these would had made me understand more/faster of the system: - move stray attaching start logging inside the span it starts, add generation - log ancestor timeline_id or bootstrapping in the beginning of timeline creation --- pageserver/src/http/routes.rs | 6 +++++ pageserver/src/tenant.rs | 28 ++++++++++------------- pageserver/src/tenant/config.rs | 4 ++-- pageserver/src/tenant/mgr.rs | 7 ------ test_runner/regress/test_timeline_size.py | 2 +- 5 files changed, 21 insertions(+), 26 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 5735489742..b97e272c86 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -489,6 +489,12 @@ async fn timeline_create_handler( tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + if let Some(ancestor_id) = request_data.ancestor_timeline_id.as_ref() { + tracing::info!(%ancestor_id, "starting to branch"); + } else { + tracing::info!("bootstrapping"); + } + match tenant.create_timeline( new_timeline_id, request_data.ancestor_timeline_id.map(TimelineId::from), diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 58af80238d..dd4f9107f9 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -205,7 +205,7 @@ impl AttachedTenantConf { match &location_conf.mode { LocationMode::Attached(attach_conf) => Ok(Self { tenant_conf: location_conf.tenant_conf, - location: attach_conf.clone(), + location: *attach_conf, }), LocationMode::Secondary(_) => { anyhow::bail!("Attempted to construct AttachedTenantConf from a LocationConf in secondary mode") @@ -625,6 +625,9 @@ impl Tenant { deletion_queue_client, } = resources; + let attach_mode = attached_conf.location.attach_mode; + let generation = attached_conf.location.generation; + let tenant = Arc::new(Tenant::new( TenantState::Attaching, conf, @@ -654,6 +657,12 @@ impl Tenant { "attach tenant", false, async move { + + info!( + ?attach_mode, + "Attaching tenant" + ); + let _gate_guard = attach_gate_guard; // Is this tenant being spawned as part of process startup? 
@@ -865,7 +874,7 @@ impl Tenant { Ok(()) } .instrument({ - let span = tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()); + let span = tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), gen=?generation); span.follows_from(Span::current()); span }), @@ -2354,12 +2363,7 @@ impl Tenant { } pub(crate) fn get_attach_mode(&self) -> AttachmentMode { - self.tenant_conf - .read() - .unwrap() - .location - .attach_mode - .clone() + self.tenant_conf.read().unwrap().location.attach_mode } /// For API access: generate a LocationConfig equivalent to the one that would be used to @@ -3225,8 +3229,6 @@ impl Tenant { .context("branch initial metadata upload")?; } - info!("branched timeline {dst_id} from {src_id} at {start_lsn}"); - Ok(new_timeline) } @@ -3444,12 +3446,6 @@ impl Tenant { // All done! let timeline = raw_timeline.finish_creation()?; - info!( - "created root timeline {} timeline.lsn {}", - timeline_id, - timeline.get_last_record_lsn() - ); - Ok(timeline) } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 63bd56cf5f..563887088d 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -51,7 +51,7 @@ pub mod defaults { pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; } -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)] pub(crate) enum AttachmentMode { /// Our generation is current as far as we know, and as far as we know we are the only attached /// pageserver. This is the "normal" attachment mode. @@ -66,7 +66,7 @@ pub(crate) enum AttachmentMode { Stale, } -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)] pub(crate) struct AttachedLocationConfig { pub(crate) generation: Generation, pub(crate) attach_mode: AttachmentMode, diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 64fd709386..de0b636d47 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -607,13 +607,6 @@ pub(crate) fn tenant_spawn( "Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}" ); - info!( - tenant_id = %tenant_shard_id.tenant_id, - shard_id = %tenant_shard_id.shard_slug(), - generation = ?location_conf.location.generation, - attach_mode = ?location_conf.location.attach_mode, - "Attaching tenant" - ); let tenant = match Tenant::spawn( conf, tenant_shard_id, diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 303aabb58d..cd7203bba6 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -883,7 +883,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): # Deletion itself won't complete due to our failpoint: Tenant::shutdown can't complete while calculating # logical size is paused in a failpoint. 
So instead we will use a log observation to check that # on-demand activation was triggered by the tenant deletion - log_match = f".*attach{{tenant_id={delete_tenant_id} shard_id=0000}}: Activating tenant \\(on-demand\\).*" + log_match = f".*attach{{tenant_id={delete_tenant_id} shard_id=0000 gen=[0-9a-f]+}}: Activating tenant \\(on-demand\\).*" def activated_on_demand(): assert env.pageserver.log_contains(log_match) is not None From df7bee7cfaba8f2129fd9ea88976da5d079684a5 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 3 Feb 2024 00:02:33 +0200 Subject: [PATCH 0082/1571] Fix compilation with recent glibc headers with close_range(2). MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I was getting an error: /home/heikki/git-sandbox/neon//pgxn/neon_walredo/walredoproc.c:161:5: error: conflicting types for ‘close_range’; have ‘int(unsigned int, unsigned int, unsigned int)’ 161 | int close_range(unsigned int start_fd, unsigned int count, unsigned int flags) { | ^~~~~~~~~~~ In file included from /usr/include/x86_64-linux-gnu/bits/sigstksz.h:24, from /usr/include/signal.h:328, from /home/heikki/git-sandbox/neon//pgxn/neon_walredo/walredoproc.c:50: /usr/include/unistd.h:1208:12: note: previous declaration of ‘close_range’ with type ‘int(unsigned int, unsigned int, int)’ 1208 | extern int close_range (unsigned int __fd, unsigned int __max_fd, | ^~~~~~~~~~~ The discrepancy is in the 3rd argument. Apparently in the glibc wrapper it's signed. As a quick fix, rename our close_range() function, the one that calls syscall() directly, to avoid the clash with the glibc wrapper. In the long term, an autoconf test would be nice, and some equivalent on macOS, see issue #6580. --- pgxn/neon_walredo/walredoproc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pgxn/neon_walredo/walredoproc.c b/pgxn/neon_walredo/walredoproc.c index 6ca0b2a274..1fdd3801c6 100644 --- a/pgxn/neon_walredo/walredoproc.c +++ b/pgxn/neon_walredo/walredoproc.c @@ -158,7 +158,10 @@ static XLogReaderState *reader_state; #include #include #include -int close_range(unsigned int start_fd, unsigned int count, unsigned int flags) { + +static int +close_range_syscall(unsigned int start_fd, unsigned int count, unsigned int flags) +{ return syscall(__NR_close_range, start_fd, count, flags); } @@ -172,7 +175,7 @@ enter_seccomp_mode(void) * wal records. See the comment in the Rust code that launches this process. */ int err; - if (err = close_range(3, ~0U, 0)) { + if (err = close_range_syscall(3, ~0U, 0)) { ereport(FATAL, (errcode(ERRCODE_SYSTEM_ERROR), errmsg("seccomp: could not close files >= fd 3"))); } From 56cf3604395125b9283ba643cfbb98efd926ff49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 5 Feb 2024 10:53:37 +0100 Subject: [PATCH 0083/1571] Don't preserve temp files on creation errors of delta layers (#6612) There is currently no cleanup done after a delta layer creation error, so delta layers can accumulate. The problem gets worse as the operation gets retried and delta layers accumulate on the disk. Therefore, delete them from disk (if something has been written to disk). 
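In pseudocode terms the fix is: run the fallible finish step, and if it errors, remove whatever was already written before propagating the error. A minimal standalone sketch of that pattern (illustrative names, not the actual `DeltaLayerWriter` API):

```
use std::fs;
use std::io::Write;
use std::path::Path;

/// Sketch of the cleanup-on-error pattern: if the fallible "finish" step
/// fails, remove the partially written temp file instead of leaving it on
/// disk, then return the original error.
fn finish_or_cleanup(temp_path: &Path, payload: &[u8]) -> std::io::Result<()> {
    let result = (|| -> std::io::Result<()> {
        let mut file = fs::File::create(temp_path)?;
        file.write_all(payload)?;
        file.sync_all() // stand-in for the fallible "finish" step
    })();

    if result.is_err() {
        // Best-effort cleanup; the original error is still returned below.
        if let Err(e) = fs::remove_file(temp_path) {
            eprintln!("failed to clean up {}: {e}", temp_path.display());
        }
    }
    result
}
```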
--- pageserver/src/tenant/storage_layer/delta_layer.rs | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index ec031d6089..2a51884c0b 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -609,7 +609,19 @@ impl DeltaLayerWriter { key_end: Key, timeline: &Arc, ) -> anyhow::Result { - self.inner.take().unwrap().finish(key_end, timeline).await + let inner = self.inner.take().unwrap(); + let temp_path = inner.path.clone(); + let result = inner.finish(key_end, timeline).await; + // The delta layer files can sometimes be really large. Clean them up. + if result.is_err() { + tracing::warn!( + "Cleaning up temporary delta file {temp_path} after error during writing" + ); + if let Err(e) = std::fs::remove_file(&temp_path) { + tracing::warn!("Error cleaning up temporary delta layer file {temp_path}: {e:?}") + } + } + result } } From 01c57ec547cb701f2253c8c445931644cc9f60b9 Mon Sep 17 00:00:00 2001 From: Abhijeet Patil Date: Mon, 5 Feb 2024 10:08:20 +0000 Subject: [PATCH 0084/1571] Removed Uploading of perf result to git repo 'zenith-perf-data' (#6590) ## Problem We were archiving the perf benchmark results to - neon DB - git repo `zenith-perf-data` Because the perf batch ran in parallel, uploading results to the `zenith-perf-data` git repo resulted in merge conflicts, which made the run flaky and, as a side effect, caused the build to fail. The problem is described in https://github.com/neondatabase/neon/issues/5160 ## Summary of changes Since the results in the git repo were never consumed, the upload was redundant; this PR removes the uploading of perf results to the git repo. The shell script `generate_and_push_perf_report.sh` was using a Python script [git-upload](https://github.com/neondatabase/neon/compare/remove-perf-benchmark-git-upload?expand=1#diff-c6d938e7f060e487367d9dc8055245c82b51a73c1f97956111a495a8a86e9a33) and [scripts/generate_perf_report_page.py](https://github.com/neondatabase/neon/pull/6590/files#diff-81af2147e72d07e4cf8ee4395632596d805d6168ba75c71cab58db2659956ef8), which are not used anywhere else in the repo, so they are cleaned up as well. ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
## Checklist before merging - [ ] Do not forget to reformat the commit message to not include the above checklist --- scripts/generate_and_push_perf_report.sh | 14 -- scripts/generate_perf_report_page.py | 219 ----------------------- scripts/git-upload | 170 ------------------ 3 files changed, 403 deletions(-) delete mode 100755 scripts/generate_perf_report_page.py delete mode 100755 scripts/git-upload diff --git a/scripts/generate_and_push_perf_report.sh b/scripts/generate_and_push_perf_report.sh index 9e03302b0f..178c570b13 100755 --- a/scripts/generate_and_push_perf_report.sh +++ b/scripts/generate_and_push_perf_report.sh @@ -8,17 +8,3 @@ SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) echo "Uploading perf report to neon pg" # ingest per test results data into neon backed postgres running in staging to build grafana reports on that data DATABASE_URL="$PERF_TEST_RESULT_CONNSTR" poetry run python "$SCRIPT_DIR"/ingest_perf_test_result.py --ingest "$REPORT_FROM" - -# Activate poetry's venv. Needed because git upload does not run in a project dir (it uses tmp to store the repository) -# so the problem occurs because poetry cannot find pyproject.toml in temp dir created by git upload -# shellcheck source=/dev/null -. "$(poetry env info --path)"/bin/activate - -echo "Uploading perf result to zenith-perf-data" -scripts/git-upload \ - --repo=https://"$VIP_VAP_ACCESS_TOKEN"@github.com/neondatabase/zenith-perf-data.git \ - --message="add performance test result for $GITHUB_SHA neon revision" \ - --branch=master \ - copy "$REPORT_FROM" "data/$REPORT_TO" `# COPY FROM TO_RELATIVE`\ - --merge \ - --run-cmd "python $SCRIPT_DIR/generate_perf_report_page.py --input-dir data/$REPORT_TO --out reports/$REPORT_TO.html" diff --git a/scripts/generate_perf_report_page.py b/scripts/generate_perf_report_page.py deleted file mode 100755 index b5b49bb600..0000000000 --- a/scripts/generate_perf_report_page.py +++ /dev/null @@ -1,219 +0,0 @@ -#!/usr/bin/env python3 -import argparse -import json -from dataclasses import dataclass -from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, cast - -from jinja2 import Template - -# skip 'input' columns. 
They are included in the header and just blow the table -EXCLUDE_COLUMNS = frozenset( - { - "scale", - "duration", - "number_of_clients", - "number_of_threads", - "init_start_timestamp", - "init_end_timestamp", - "run_start_timestamp", - "run_end_timestamp", - } -) - -KEY_EXCLUDE_FIELDS = frozenset( - { - "init_start_timestamp", - "init_end_timestamp", - "run_start_timestamp", - "run_end_timestamp", - } -) -NEGATIVE_COLOR = "negative" -POSITIVE_COLOR = "positive" -EPS = 1e-6 - - -@dataclass -class SuitRun: - revision: str - values: Dict[str, Any] - - -@dataclass -class SuitRuns: - platform: str - suit: str - common_columns: List[Tuple[str, str]] - value_columns: List[str] - runs: List[SuitRun] - - -@dataclass -class RowValue: - value: str - color: str - ratio: str - - -def get_columns(values: List[Dict[Any, Any]]) -> Tuple[List[Tuple[str, str]], List[str]]: - value_columns = [] - common_columns = [] - for item in values: - if item["name"] in KEY_EXCLUDE_FIELDS: - continue - if item["report"] != "test_param": - value_columns.append(cast(str, item["name"])) - else: - common_columns.append((cast(str, item["name"]), cast(str, item["value"]))) - value_columns.sort() - common_columns.sort(key=lambda x: x[0]) # sort by name - return common_columns, value_columns - - -def format_ratio(ratio: float, report: str) -> Tuple[str, str]: - color = "" - sign = "+" if ratio > 0 else "" - if abs(ratio) < 0.05: - return f" ({sign}{ratio:.2f})", color - - if report not in {"test_param", "higher_is_better", "lower_is_better"}: - raise ValueError(f"Unknown report type: {report}") - - if report == "test_param": - return f"{ratio:.2f}", color - - if ratio > 0: - if report == "higher_is_better": - color = POSITIVE_COLOR - elif report == "lower_is_better": - color = NEGATIVE_COLOR - elif ratio < 0: - if report == "higher_is_better": - color = NEGATIVE_COLOR - elif report == "lower_is_better": - color = POSITIVE_COLOR - - return f" ({sign}{ratio:.2f})", color - - -def extract_value(name: str, suit_run: SuitRun) -> Optional[Dict[str, Any]]: - for item in suit_run.values["data"]: - if item["name"] == name: - return cast(Dict[str, Any], item) - return None - - -def get_row_values( - columns: List[str], run_result: SuitRun, prev_result: Optional[SuitRun] -) -> List[RowValue]: - row_values = [] - for column in columns: - current_value = extract_value(column, run_result) - if current_value is None: - # should never happen - raise ValueError(f"{column} not found in {run_result.values}") - - value = current_value["value"] - if isinstance(value, float): - value = f"{value:.2f}" - - if prev_result is None: - row_values.append(RowValue(value, "", "")) - continue - - prev_value = extract_value(column, prev_result) - if prev_value is None: - # this might happen when new metric is added and there is no value for it in previous run - # let this be here, TODO add proper handling when this actually happens - raise ValueError(f"{column} not found in previous result") - # adding `EPS` to each term to avoid ZeroDivisionError when the denominator is zero - ratio = (float(value) + EPS) / (float(prev_value["value"]) + EPS) - 1 - ratio_display, color = format_ratio(ratio, current_value["report"]) - row_values.append(RowValue(value, color, ratio_display)) - return row_values - - -@dataclass -class SuiteRunTableRow: - revision: str - values: List[RowValue] - - -def prepare_rows_from_runs(value_columns: List[str], runs: List[SuitRun]) -> List[SuiteRunTableRow]: - rows = [] - prev_run = None - for run in runs: - rows.append( - 
SuiteRunTableRow( - revision=run.revision, values=get_row_values(value_columns, run, prev_run) - ) - ) - prev_run = run - - return rows - - -def main(args: argparse.Namespace) -> None: - input_dir = Path(args.input_dir) - grouped_runs: Dict[str, SuitRuns] = {} - # we have files in form: _.json - # fill them in the hashmap so we have grouped items for the - # same run configuration (scale, duration etc.) ordered by counter. - for item in sorted(input_dir.iterdir(), key=lambda x: int(x.name.split("_")[0])): - run_data = json.loads(item.read_text()) - revision = run_data["revision"] - - for suit_result in run_data["result"]: - key = "{}{}".format(run_data["platform"], suit_result["suit"]) - # pack total duration as a synthetic value - total_duration = suit_result["total_duration"] - suit_result["data"].append( - { - "name": "total_duration", - "value": total_duration, - "unit": "s", - "report": "lower_is_better", - } - ) - common_columns, value_columns = get_columns(suit_result["data"]) - - grouped_runs.setdefault( - key, - SuitRuns( - platform=run_data["platform"], - suit=suit_result["suit"], - common_columns=common_columns, - value_columns=value_columns, - runs=[], - ), - ) - - grouped_runs[key].runs.append(SuitRun(revision=revision, values=suit_result)) - context = {} - for result in grouped_runs.values(): - suit = result.suit - context[suit] = { - "common_columns": result.common_columns, - "value_columns": result.value_columns, - "platform": result.platform, - # reverse the order so newest results are on top of the table - "rows": reversed(prepare_rows_from_runs(result.value_columns, result.runs)), - } - - template = Template((Path(__file__).parent / "perf_report_template.html").read_text()) - - Path(args.out).write_text(template.render(context=context)) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--input-dir", - dest="input_dir", - required=True, - help="Directory with jsons generated by the test suite", - ) - parser.add_argument("--out", required=True, help="Output html file path") - args = parser.parse_args() - main(args) diff --git a/scripts/git-upload b/scripts/git-upload deleted file mode 100755 index d56c0f8e94..0000000000 --- a/scripts/git-upload +++ /dev/null @@ -1,170 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import os -import shlex -import shutil -import subprocess -import sys -import textwrap -from contextlib import contextmanager -from distutils.dir_util import copy_tree -from pathlib import Path -from tempfile import TemporaryDirectory -from typing import Optional - - -def absolute_path(path): - return Path(path).resolve() - - -def relative_path(path): - path = Path(path) - if path.is_absolute(): - raise Exception(f'path `{path}` must be relative!') - return path - - -@contextmanager -def chdir(cwd: Path): - old = os.getcwd() - os.chdir(cwd) - try: - yield cwd - finally: - os.chdir(old) - - -def run(cmd, *args, **kwargs): - print('$', ' '.join(cmd)) - subprocess.check_call(cmd, *args, **kwargs) - - -class GitRepo: - def __init__(self, url, branch: Optional[str] = None): - self.url = url - self.cwd = TemporaryDirectory() - self.branch = branch - - args = [ - 'git', - 'clone', - '--single-branch', - ] - if self.branch: - args.extend(['--branch', self.branch]) - - subprocess.check_call([ - *args, - str(url), - self.cwd.name, - ]) - - def is_dirty(self): - res = subprocess.check_output(['git', 'status', '--porcelain'], text=True).strip() - return bool(res) - - def update(self, message, action, branch=None): - with 
chdir(self.cwd.name): - if not branch: - cmd = ['git', 'branch', '--show-current'] - branch = subprocess.check_output(cmd, text=True).strip() - - # Run action in repo's directory - action() - - run(['git', 'add', '.']) - - if not self.is_dirty(): - print('No changes detected, quitting') - return - - git_with_user = [ - 'git', - '-c', - 'user.name=vipvap', - '-c', - 'user.email=vipvap@zenith.tech', - ] - run(git_with_user + [ - 'commit', - '--author="vipvap "', - f'--message={message}', - ]) - - for _ in range(5): - try: - run(['git', 'fetch', 'origin', branch]) - run(git_with_user + ['rebase', f'origin/{branch}']) - run(['git', 'push', 'origin', branch]) - return - - except subprocess.CalledProcessError as e: - print(f'failed to update branch `{branch}`: {e}', file=sys.stderr) - - raise Exception(f'failed to update branch `{branch}`') - - -def do_copy(args): - src = args.src - dst = args.dst - - if args.forbid_overwrite and dst.exists(): - raise FileExistsError(f"File exists: '{dst}'") - - if src.is_dir(): - if not args.merge: - shutil.rmtree(dst, ignore_errors=True) - # distutils is deprecated, but this is a temporary workaround before python version bump - # here we need dir_exists_ok=True from shutil.copytree which is available in python 3.8+ - copy_tree(str(src), str(dst)) - else: - shutil.copy(src, dst) - - if args.run_cmd: - run(shlex.split(args.run_cmd)) - - -def main(): - parser = argparse.ArgumentParser(description='Git upload tool') - parser.add_argument('--repo', type=str, metavar='URL', required=True, help='git repo url') - parser.add_argument('--message', type=str, metavar='TEXT', help='commit message') - parser.add_argument('--branch', type=str, metavar='TEXT', help='target git repo branch') - - commands = parser.add_subparsers(title='commands', dest='subparser_name') - - p_copy = commands.add_parser( - 'copy', - help='copy file into the repo', - formatter_class=argparse.RawTextHelpFormatter, - ) - p_copy.add_argument('src', type=absolute_path, help='source path') - p_copy.add_argument('dst', type=relative_path, help='relative dest path') - p_copy.add_argument('--forbid-overwrite', action='store_true', help='do not allow overwrites') - p_copy.add_argument( - '--merge', - action='store_true', - help='when copying a directory do not delete existing data, but add new files') - p_copy.add_argument('--run-cmd', - help=textwrap.dedent('''\ - run arbitrary cmd on top of copied files, - example usage is static content generation - based on current repository state\ - ''')) - - args = parser.parse_args() - - commands = { - 'copy': do_copy, - } - - action = commands.get(args.subparser_name) - if action: - message = args.message or 'update' - GitRepo(args.repo, args.branch).update(message, lambda: action(args)) - else: - parser.print_usage() - - -if __name__ == '__main__': - main() From db89b13aaa45266227b89884490c11e10abb8054 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 5 Feb 2024 14:10:08 +0200 Subject: [PATCH 0085/1571] fix: use the shared constant download buffer size (#6620) Noticed that we had forgotten to use `remote_timeline_client.rs::BUFFER_SIZE` in one instance. 
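The pattern being restored is a single shared constant for buffer capacity rather than a repeated literal; a minimal sketch with an illustrative value (the real constant lives in `remote_timeline_client.rs`):

```
use tokio::io::{AsyncWriteExt, BufWriter};

// A single shared constant instead of scattering `8 * 1024`-style literals
// around call sites. The value here is illustrative, not the real one.
const BUFFER_SIZE: usize = 32 * 1024;

async fn write_buffered(file: tokio::fs::File, data: &[u8]) -> std::io::Result<()> {
    let mut writer = BufWriter::with_capacity(BUFFER_SIZE, file);
    writer.write_all(data).await?;
    writer.flush().await?;
    Ok(())
}
```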
--- pageserver/src/tenant/remote_timeline_client/download.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 4309c683e2..b84b5ca33b 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -471,7 +471,7 @@ pub(crate) async fn download_initdb_tar_zst( Err(other) => Err(other)?, }; let mut download = tokio_util::io::StreamReader::new(download.download_stream); - let mut writer = tokio::io::BufWriter::with_capacity(8 * 1024, file); + let mut writer = tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, file); // TODO: this consumption of the response body should be subject to timeout + cancellation, but // not without thinking carefully about how to recover safely from cancelling a write to From 5e8deca26862f190e6f38b31ccea5f0a22c36c69 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 5 Feb 2024 14:49:35 +0200 Subject: [PATCH 0086/1571] metrics: remove broken tenants (#6586) Before tenant migration it made sense to leak broken tenants in the metrics until restart. Nowdays it makes less sense because on cancellations we set the tenant broken. The set metric still allows filterable alerting. Fixes: #6507 --- pageserver/src/tenant.rs | 45 +++++++++++------------ test_runner/fixtures/metrics.py | 2 +- test_runner/regress/test_tenant_detach.py | 39 +++++--------------- 3 files changed, 32 insertions(+), 54 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index dd4f9107f9..b801347c06 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -67,7 +67,9 @@ use crate::deletion_queue::DeletionQueueError; use crate::import_datadir; use crate::is_uninit_mark; use crate::metrics::TENANT; -use crate::metrics::{remove_tenant_metrics, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC}; +use crate::metrics::{ + remove_tenant_metrics, BROKEN_TENANTS_SET, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, +}; use crate::repository::GcResult; use crate::task_mgr; use crate::task_mgr::TaskKind; @@ -2637,9 +2639,16 @@ impl Tenant { let (state, mut rx) = watch::channel(state); tokio::spawn(async move { - // Strings for metric labels + // reflect tenant state in metrics: + // - global per tenant state: TENANT_STATE_METRIC + // - "set" of broken tenants: BROKEN_TENANTS_SET + // + // set of broken tenants should not have zero counts so that it remains accessible for + // alerting. + let tid = tenant_shard_id.to_string(); - let shard_id_str = format!("{}", tenant_shard_id.shard_slug()); + let shard_id = tenant_shard_id.shard_slug().to_string(); + let set_key = &[tid.as_str(), shard_id.as_str()][..]; fn inspect_state(state: &TenantState) -> ([&'static str; 1], bool) { ([state.into()], matches!(state, TenantState::Broken { .. })) @@ -2648,21 +2657,13 @@ impl Tenant { let mut tuple = inspect_state(&rx.borrow_and_update()); let is_broken = tuple.1; - let mut counted_broken = if !is_broken { - // the tenant might be ignored and reloaded, so first remove any previous set - // element. it most likely has already been scraped, as these are manual operations - // right now. most likely we will add it back very soon. 
- drop( - crate::metrics::BROKEN_TENANTS_SET.remove_label_values(&[&tid, &shard_id_str]), - ); - false - } else { + let mut counted_broken = if is_broken { // add the id to the set right away, there should not be any updates on the channel - // after - crate::metrics::BROKEN_TENANTS_SET - .with_label_values(&[&tid, &shard_id_str]) - .set(1); + // after before tenant is removed, if ever + BROKEN_TENANTS_SET.with_label_values(set_key).set(1); true + } else { + false }; loop { @@ -2671,10 +2672,9 @@ impl Tenant { current.inc(); if rx.changed().await.is_err() { - // tenant has been dropped; decrement the counter because a tenant with that - // state is no longer in tenant map, but allow any broken set item to exist - // still. + // tenant has been dropped current.dec(); + drop(BROKEN_TENANTS_SET.remove_label_values(set_key)); break; } @@ -2684,10 +2684,9 @@ impl Tenant { let is_broken = tuple.1; if is_broken && !counted_broken { counted_broken = true; - // insert the tenant_id (back) into the set - crate::metrics::BROKEN_TENANTS_SET - .with_label_values(&[&tid, &shard_id_str]) - .inc(); + // insert the tenant_id (back) into the set while avoiding needless counter + // access + BROKEN_TENANTS_SET.with_label_values(set_key).set(1); } } }); diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 7c489bda67..ef41774289 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -96,5 +96,5 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( "pageserver_evictions_total", "pageserver_evictions_with_low_residence_duration_total", *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, - # pageserver_broken_tenants_count is a leaked "metric" which is "cleared" on restart or reload + # "pageserver_broken_tenants_count" -- used only for broken ) diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 8d5ef4e3c4..4752699abb 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -742,8 +742,6 @@ def ensure_test_data(data_id: int, data: str, endpoint: Endpoint): def test_metrics_while_ignoring_broken_tenant_and_reloading( neon_env_builder: NeonEnvBuilder, ): - neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - env = neon_env_builder.init_start() client = env.pageserver.http_client() @@ -761,56 +759,37 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading( client.tenant_break(env.initial_tenant) - found_broken = False - active, broken, broken_set = ([], [], []) - for _ in range(10): + def found_broken(): m = client.get_metrics() active = m.query_all("pageserver_tenant_states_count", {"state": "Active"}) broken = m.query_all("pageserver_tenant_states_count", {"state": "Broken"}) broken_set = m.query_all( "pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)} ) - found_broken = only_int(active) == 0 and only_int(broken) == 1 and only_int(broken_set) == 1 + assert only_int(active) == 0 and only_int(broken) == 1 and only_int(broken_set) == 1 - if found_broken: - break - log.info(f"active: {active}, broken: {broken}, broken_set: {broken_set}") - time.sleep(0.5) - assert ( - found_broken - ), f"tenant shows up as broken; active={active}, broken={broken}, broken_set={broken_set}" + wait_until(10, 0.5, found_broken) client.tenant_ignore(env.initial_tenant) - found_broken = False - broken, broken_set = ([], []) - for _ in range(10): + def found_cleaned_up(): m = client.get_metrics() broken = 
m.query_all("pageserver_tenant_states_count", {"state": "Broken"}) broken_set = m.query_all( "pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)} ) - found_broken = only_int(broken) == 0 and only_int(broken_set) == 1 + assert only_int(broken) == 0 and len(broken_set) == 0 - if found_broken: - break - time.sleep(0.5) - assert found_broken, f"broken should still be in set, but it is not in the tenant state count: broken={broken}, broken_set={broken_set}" + wait_until(10, 0.5, found_cleaned_up) env.pageserver.tenant_load(env.initial_tenant) - found_active = False - active, broken_set = ([], []) - for _ in range(10): + def found_active(): m = client.get_metrics() active = m.query_all("pageserver_tenant_states_count", {"state": "Active"}) broken_set = m.query_all( "pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)} ) - found_active = only_int(active) == 1 and len(broken_set) == 0 + assert only_int(active) == 1 and len(broken_set) == 0 - if found_active: - break - time.sleep(0.5) - - assert found_active, f"reloaded tenant should be active, and broken tenant set item removed: active={active}, broken_set={broken_set}" + wait_until(10, 0.5, found_active) From 74c5e3d9b877ae006c0c5c4b4ea176ed36f647c1 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 5 Feb 2024 14:27:25 +0000 Subject: [PATCH 0087/1571] use string interner for project cache (#6578) ## Problem Running some memory profiling with high concurrent request rate shows seemingly some memory fragmentation. ## Summary of changes Eventually, we will want to separate global memory (caches) from local memory (per connection handshake and per passthrough). Using a string interner for project info cache helps reduce some of the fragmentation of the global cache by having a single heap dedicated to project strings, and not scattering them throughout all a requests. At the same time, the interned key is 4 bytes vs the 24 bytes that `SmolStr` offers. Important: we should only store verified strings in the interner because there's no way to remove them afterwards. Good for caching responses from console. 
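The core idea can be shown with `lasso` directly: each distinct string is stored once in a shared arena, and callers hold a small `Copy` key instead of an owned string. A minimal standalone sketch, assuming `lasso = { version = "0.7", features = ["multi-threaded"] }` and made-up endpoint names:

```
use lasso::{Spur, ThreadedRodeo};

fn main() {
    // One shared arena; the proxy wraps this in typed handles
    // (EndpointIdInt, ProjectIdInt, ...), here we use lasso directly.
    let interner = ThreadedRodeo::default();

    // Interning the same string twice yields the same key.
    let a: Spur = interner.get_or_intern("ep-example-endpoint-123456");
    let b: Spur = interner.get_or_intern("ep-example-endpoint-123456");
    assert_eq!(a, b);

    // Keys are Copy and only 4 bytes, vs ~24 bytes for a small-string type,
    // so they are cheap to store in per-project sets and maps.
    assert_eq!(std::mem::size_of::<Spur>(), 4);

    // Resolve back to &str when the actual text is needed.
    assert_eq!(interner.resolve(&a), "ep-example-endpoint-123456");

    // Lookup without inserting: important because interned strings can
    // never be removed, so only verified values should be interned.
    assert!(interner.get("ep-never-seen").is_none());
}
```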
--- Cargo.lock | 13 ++ Cargo.toml | 1 + proxy/Cargo.toml | 2 + proxy/src/cache/project_info.rs | 84 ++++++----- proxy/src/intern.rs | 237 +++++++++++++++++++++++++++++++ proxy/src/lib.rs | 1 + proxy/src/redis/notifications.rs | 31 ++-- workspace_hack/Cargo.toml | 5 +- 8 files changed, 321 insertions(+), 53 deletions(-) create mode 100644 proxy/src/intern.rs diff --git a/Cargo.lock b/Cargo.lock index 02450709d1..c16331636a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2718,6 +2718,16 @@ dependencies = [ "libc", ] +[[package]] +name = "lasso" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4644821e1c3d7a560fe13d842d13f587c07348a1a05d3a797152d41c90c56df2" +dependencies = [ + "dashmap", + "hashbrown 0.13.2", +] + [[package]] name = "lazy_static" version = "1.4.0" @@ -4075,6 +4085,7 @@ dependencies = [ "hyper-tungstenite", "ipnet", "itertools", + "lasso", "md5", "metrics", "native-tls", @@ -4091,6 +4102,7 @@ dependencies = [ "pq_proto", "prometheus", "rand 0.8.5", + "rand_distr", "rcgen", "redis", "regex", @@ -6803,6 +6815,7 @@ dependencies = [ "futures-sink", "futures-util", "getrandom 0.2.11", + "hashbrown 0.13.2", "hashbrown 0.14.0", "hex", "hmac", diff --git a/Cargo.toml b/Cargo.toml index 0cfe522ff9..271edee742 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -95,6 +95,7 @@ inotify = "0.10.2" ipnet = "2.9.0" itertools = "0.10" jsonwebtoken = "9" +lasso = "0.7" libc = "0.2" md5 = "0.7.0" memoffset = "0.8" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 79abe639ed..1247f08ee6 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -31,6 +31,7 @@ hyper-tungstenite.workspace = true hyper.workspace = true ipnet.workspace = true itertools.workspace = true +lasso = { workspace = true, features = ["multi-threaded"] } md5.workspace = true metrics.workspace = true once_cell.workspace = true @@ -92,3 +93,4 @@ rcgen.workspace = true rstest.workspace = true tokio-postgres-rustls.workspace = true walkdir.workspace = true +rand_distr = "0.4" diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index 6f37868a8c..62015312a9 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -12,15 +12,18 @@ use tokio::time::Instant; use tracing::{debug, info}; use crate::{ - auth::IpPattern, config::ProjectInfoCacheOptions, console::AuthSecret, EndpointId, ProjectId, - RoleName, + auth::IpPattern, + config::ProjectInfoCacheOptions, + console::AuthSecret, + intern::{EndpointIdInt, ProjectIdInt, RoleNameInt}, + EndpointId, ProjectId, RoleName, }; use super::{Cache, Cached}; pub trait ProjectInfoCache { - fn invalidate_allowed_ips_for_project(&self, project_id: &ProjectId); - fn invalidate_role_secret_for_project(&self, project_id: &ProjectId, role_name: &RoleName); + fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt); + fn invalidate_role_secret_for_project(&self, project_id: ProjectIdInt, role_name: RoleNameInt); fn enable_ttl(&self); fn disable_ttl(&self); } @@ -47,7 +50,7 @@ impl From for Entry { #[derive(Default)] struct EndpointInfo { - secret: std::collections::HashMap>>, + secret: std::collections::HashMap>>, allowed_ips: Option>>>, } @@ -60,11 +63,11 @@ impl EndpointInfo { } pub fn get_role_secret( &self, - role_name: &RoleName, + role_name: RoleNameInt, valid_since: Instant, ignore_cache_since: Option, ) -> Option<(Option, bool)> { - if let Some(secret) = self.secret.get(role_name) { + if let Some(secret) = self.secret.get(&role_name) { if valid_since < secret.created_at { return Some(( 
secret.value.clone(), @@ -93,8 +96,8 @@ impl EndpointInfo { pub fn invalidate_allowed_ips(&mut self) { self.allowed_ips = None; } - pub fn invalidate_role_secret(&mut self, role_name: &RoleName) { - self.secret.remove(role_name); + pub fn invalidate_role_secret(&mut self, role_name: RoleNameInt) { + self.secret.remove(&role_name); } } @@ -106,9 +109,9 @@ impl EndpointInfo { /// One may ask, why the data is stored per project, when on the user request there is only data about the endpoint available? /// On the cplane side updates are done per project (or per branch), so it's easier to invalidate the whole project cache. pub struct ProjectInfoCacheImpl { - cache: DashMap, + cache: DashMap, - project2ep: DashMap>, + project2ep: DashMap>, config: ProjectInfoCacheOptions, start_time: Instant, @@ -116,11 +119,11 @@ pub struct ProjectInfoCacheImpl { } impl ProjectInfoCache for ProjectInfoCacheImpl { - fn invalidate_allowed_ips_for_project(&self, project_id: &ProjectId) { + fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt) { info!("invalidating allowed ips for project `{}`", project_id); let endpoints = self .project2ep - .get(project_id) + .get(&project_id) .map(|kv| kv.value().clone()) .unwrap_or_default(); for endpoint_id in endpoints { @@ -129,14 +132,14 @@ impl ProjectInfoCache for ProjectInfoCacheImpl { } } } - fn invalidate_role_secret_for_project(&self, project_id: &ProjectId, role_name: &RoleName) { + fn invalidate_role_secret_for_project(&self, project_id: ProjectIdInt, role_name: RoleNameInt) { info!( "invalidating role secret for project_id `{}` and role_name `{}`", - project_id, role_name + project_id, role_name, ); let endpoints = self .project2ep - .get(project_id) + .get(&project_id) .map(|kv| kv.value().clone()) .unwrap_or_default(); for endpoint_id in endpoints { @@ -173,15 +176,17 @@ impl ProjectInfoCacheImpl { endpoint_id: &EndpointId, role_name: &RoleName, ) -> Option>> { + let endpoint_id = EndpointIdInt::get(endpoint_id)?; + let role_name = RoleNameInt::get(role_name)?; let (valid_since, ignore_cache_since) = self.get_cache_times(); - let endpoint_info = self.cache.get(endpoint_id)?; + let endpoint_info = self.cache.get(&endpoint_id)?; let (value, ignore_cache) = endpoint_info.get_role_secret(role_name, valid_since, ignore_cache_since)?; if !ignore_cache { let cached = Cached { token: Some(( self, - CachedLookupInfo::new_role_secret(endpoint_id.clone(), role_name.clone()), + CachedLookupInfo::new_role_secret(endpoint_id, role_name), )), value, }; @@ -193,13 +198,14 @@ impl ProjectInfoCacheImpl { &self, endpoint_id: &EndpointId, ) -> Option>>> { + let endpoint_id = EndpointIdInt::get(endpoint_id)?; let (valid_since, ignore_cache_since) = self.get_cache_times(); - let endpoint_info = self.cache.get(endpoint_id)?; + let endpoint_info = self.cache.get(&endpoint_id)?; let value = endpoint_info.get_allowed_ips(valid_since, ignore_cache_since); let (value, ignore_cache) = value?; if !ignore_cache { let cached = Cached { - token: Some((self, CachedLookupInfo::new_allowed_ips(endpoint_id.clone()))), + token: Some((self, CachedLookupInfo::new_allowed_ips(endpoint_id))), value, }; return Some(cached); @@ -213,14 +219,17 @@ impl ProjectInfoCacheImpl { role_name: &RoleName, secret: Option, ) { + let project_id = ProjectIdInt::from(project_id); + let endpoint_id = EndpointIdInt::from(endpoint_id); + let role_name = RoleNameInt::from(role_name); if self.cache.len() >= self.config.size { // If there are too many entries, wait until the next gc cycle. 
return; } - self.inser_project2endpoint(project_id, endpoint_id); - let mut entry = self.cache.entry(endpoint_id.clone()).or_default(); + self.insert_project2endpoint(project_id, endpoint_id); + let mut entry = self.cache.entry(endpoint_id).or_default(); if entry.secret.len() < self.config.max_roles { - entry.secret.insert(role_name.clone(), secret.into()); + entry.secret.insert(role_name, secret.into()); } } pub fn insert_allowed_ips( @@ -229,22 +238,21 @@ impl ProjectInfoCacheImpl { endpoint_id: &EndpointId, allowed_ips: Arc>, ) { + let project_id = ProjectIdInt::from(project_id); + let endpoint_id = EndpointIdInt::from(endpoint_id); if self.cache.len() >= self.config.size { // If there are too many entries, wait until the next gc cycle. return; } - self.inser_project2endpoint(project_id, endpoint_id); - self.cache - .entry(endpoint_id.clone()) - .or_default() - .allowed_ips = Some(allowed_ips.into()); + self.insert_project2endpoint(project_id, endpoint_id); + self.cache.entry(endpoint_id).or_default().allowed_ips = Some(allowed_ips.into()); } - fn inser_project2endpoint(&self, project_id: &ProjectId, endpoint_id: &EndpointId) { - if let Some(mut endpoints) = self.project2ep.get_mut(project_id) { - endpoints.insert(endpoint_id.clone()); + fn insert_project2endpoint(&self, project_id: ProjectIdInt, endpoint_id: EndpointIdInt) { + if let Some(mut endpoints) = self.project2ep.get_mut(&project_id) { + endpoints.insert(endpoint_id); } else { self.project2ep - .insert(project_id.clone(), HashSet::from([endpoint_id.clone()])); + .insert(project_id, HashSet::from([endpoint_id])); } } fn get_cache_times(&self) -> (Instant, Option) { @@ -300,18 +308,18 @@ impl ProjectInfoCacheImpl { /// This is used to invalidate cache entries. pub struct CachedLookupInfo { /// Search by this key. - endpoint_id: EndpointId, + endpoint_id: EndpointIdInt, lookup_type: LookupType, } impl CachedLookupInfo { - pub(self) fn new_role_secret(endpoint_id: EndpointId, role_name: RoleName) -> Self { + pub(self) fn new_role_secret(endpoint_id: EndpointIdInt, role_name: RoleNameInt) -> Self { Self { endpoint_id, lookup_type: LookupType::RoleSecret(role_name), } } - pub(self) fn new_allowed_ips(endpoint_id: EndpointId) -> Self { + pub(self) fn new_allowed_ips(endpoint_id: EndpointIdInt) -> Self { Self { endpoint_id, lookup_type: LookupType::AllowedIps, @@ -320,7 +328,7 @@ impl CachedLookupInfo { } enum LookupType { - RoleSecret(RoleName), + RoleSecret(RoleNameInt), AllowedIps, } @@ -335,7 +343,7 @@ impl Cache for ProjectInfoCacheImpl { match &key.lookup_type { LookupType::RoleSecret(role_name) => { if let Some(mut endpoint_info) = self.cache.get_mut(&key.endpoint_id) { - endpoint_info.invalidate_role_secret(role_name); + endpoint_info.invalidate_role_secret(*role_name); } } LookupType::AllowedIps => { @@ -457,7 +465,7 @@ mod tests { assert_eq!(cached.value, secret2); // The only way to invalidate this value is to invalidate via the api. 
- cache.invalidate_role_secret_for_project(&project_id, &user2); + cache.invalidate_role_secret_for_project((&project_id).into(), (&user2).into()); assert!(cache.get_role_secret(&endpoint_id, &user2).is_none()); let cached = cache.get_allowed_ips(&endpoint_id).unwrap(); diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs new file mode 100644 index 0000000000..a6519bdff9 --- /dev/null +++ b/proxy/src/intern.rs @@ -0,0 +1,237 @@ +use std::{ + hash::BuildHasherDefault, marker::PhantomData, num::NonZeroUsize, ops::Index, sync::OnceLock, +}; + +use lasso::{Capacity, MemoryLimits, Spur, ThreadedRodeo}; +use rustc_hash::FxHasher; + +use crate::{BranchId, EndpointId, ProjectId, RoleName}; + +pub trait InternId: Sized + 'static { + fn get_interner() -> &'static StringInterner; +} + +pub struct StringInterner { + inner: ThreadedRodeo>, + _id: PhantomData, +} + +#[derive(PartialEq, Debug, Clone, Copy, Eq, Hash)] +pub struct InternedString { + inner: Spur, + _id: PhantomData, +} + +impl std::fmt::Display for InternedString { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.as_str().fmt(f) + } +} + +impl InternedString { + pub fn as_str(&self) -> &'static str { + Id::get_interner().inner.resolve(&self.inner) + } + pub fn get(s: &str) -> Option { + Id::get_interner().get(s) + } +} + +impl AsRef for InternedString { + fn as_ref(&self) -> &str { + self.as_str() + } +} + +impl std::ops::Deref for InternedString { + type Target = str; + fn deref(&self) -> &str { + self.as_str() + } +} + +impl<'de, Id: InternId> serde::de::Deserialize<'de> for InternedString { + fn deserialize>(d: D) -> Result { + struct Visitor(PhantomData); + impl<'de, Id: InternId> serde::de::Visitor<'de> for Visitor { + type Value = InternedString; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + formatter.write_str("a string") + } + + fn visit_str(self, v: &str) -> Result + where + E: serde::de::Error, + { + Ok(Id::get_interner().get_or_intern(v)) + } + } + d.deserialize_str(Visitor::(PhantomData)) + } +} + +impl serde::Serialize for InternedString { + fn serialize(&self, s: S) -> Result { + self.as_str().serialize(s) + } +} + +impl StringInterner { + pub fn new() -> Self { + StringInterner { + inner: ThreadedRodeo::with_capacity_memory_limits_and_hasher( + Capacity::new(2500, NonZeroUsize::new(1 << 16).unwrap()), + // unbounded + MemoryLimits::for_memory_usage(usize::MAX), + BuildHasherDefault::::default(), + ), + _id: PhantomData, + } + } + + pub fn is_empty(&self) -> bool { + self.inner.is_empty() + } + + pub fn len(&self) -> usize { + self.inner.len() + } + + pub fn current_memory_usage(&self) -> usize { + self.inner.current_memory_usage() + } + + pub fn get_or_intern(&self, s: &str) -> InternedString { + InternedString { + inner: self.inner.get_or_intern(s), + _id: PhantomData, + } + } + + pub fn get(&self, s: &str) -> Option> { + Some(InternedString { + inner: self.inner.get(s)?, + _id: PhantomData, + }) + } +} + +impl Index> for StringInterner { + type Output = str; + + fn index(&self, index: InternedString) -> &Self::Output { + self.inner.resolve(&index.inner) + } +} + +impl Default for StringInterner { + fn default() -> Self { + Self::new() + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub struct RoleNameTag; +impl InternId for RoleNameTag { + fn get_interner() -> &'static StringInterner { + pub static ROLE_NAMES: OnceLock> = OnceLock::new(); + ROLE_NAMES.get_or_init(Default::default) + } +} +pub type RoleNameInt = InternedString; +impl 
From<&RoleName> for RoleNameInt { + fn from(value: &RoleName) -> Self { + RoleNameTag::get_interner().get_or_intern(value) + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub struct EndpointIdTag; +impl InternId for EndpointIdTag { + fn get_interner() -> &'static StringInterner { + pub static ROLE_NAMES: OnceLock> = OnceLock::new(); + ROLE_NAMES.get_or_init(Default::default) + } +} +pub type EndpointIdInt = InternedString; +impl From<&EndpointId> for EndpointIdInt { + fn from(value: &EndpointId) -> Self { + EndpointIdTag::get_interner().get_or_intern(value) + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub struct BranchIdTag; +impl InternId for BranchIdTag { + fn get_interner() -> &'static StringInterner { + pub static ROLE_NAMES: OnceLock> = OnceLock::new(); + ROLE_NAMES.get_or_init(Default::default) + } +} +pub type BranchIdInt = InternedString; +impl From<&BranchId> for BranchIdInt { + fn from(value: &BranchId) -> Self { + BranchIdTag::get_interner().get_or_intern(value) + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub struct ProjectIdTag; +impl InternId for ProjectIdTag { + fn get_interner() -> &'static StringInterner { + pub static ROLE_NAMES: OnceLock> = OnceLock::new(); + ROLE_NAMES.get_or_init(Default::default) + } +} +pub type ProjectIdInt = InternedString; +impl From<&ProjectId> for ProjectIdInt { + fn from(value: &ProjectId) -> Self { + ProjectIdTag::get_interner().get_or_intern(value) + } +} + +#[cfg(test)] +mod tests { + use std::sync::OnceLock; + + use crate::intern::StringInterner; + + use super::InternId; + + struct MyId; + impl InternId for MyId { + fn get_interner() -> &'static StringInterner { + pub static ROLE_NAMES: OnceLock> = OnceLock::new(); + ROLE_NAMES.get_or_init(Default::default) + } + } + + #[test] + fn push_many_strings() { + use rand::{rngs::StdRng, Rng, SeedableRng}; + use rand_distr::Zipf; + + let endpoint_dist = Zipf::new(500000, 0.8).unwrap(); + let endpoints = StdRng::seed_from_u64(272488357).sample_iter(endpoint_dist); + + let interner = MyId::get_interner(); + + const N: usize = 100_000; + let mut verify = Vec::with_capacity(N); + for endpoint in endpoints.take(N) { + let endpoint = format!("ep-string-interning-{endpoint}"); + let key = interner.get_or_intern(&endpoint); + verify.push((endpoint, key)); + } + + for (s, key) in verify { + assert_eq!(interner[key], s); + } + + // 2031616/59861 = 34 bytes per string + assert_eq!(interner.len(), 59_861); + // will have other overhead for the internal hashmaps that are not accounted for. 
+ assert_eq!(interner.current_memory_usage(), 2_031_616); + } +} diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index db6256d611..da7c7f3ed2 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -16,6 +16,7 @@ pub mod console; pub mod context; pub mod error; pub mod http; +pub mod intern; pub mod jemalloc; pub mod logging; pub mod metrics; diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 9cd70b109b..158884aa17 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -4,7 +4,10 @@ use futures::StreamExt; use redis::aio::PubSub; use serde::Deserialize; -use crate::{cache::project_info::ProjectInfoCache, ProjectId, RoleName}; +use crate::{ + cache::project_info::ProjectInfoCache, + intern::{ProjectIdInt, RoleNameInt}, +}; const CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; const RECONNECT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(20); @@ -45,12 +48,12 @@ enum Notification { } #[derive(Clone, Debug, Deserialize, Eq, PartialEq)] struct AllowedIpsUpdate { - project_id: ProjectId, + project_id: ProjectIdInt, } #[derive(Clone, Debug, Deserialize, Eq, PartialEq)] struct PasswordUpdate { - project_id: ProjectId, - role_name: RoleName, + project_id: ProjectIdInt, + role_name: RoleNameInt, } fn deserialize_json_string<'de, D, T>(deserializer: D) -> Result where @@ -65,11 +68,11 @@ fn invalidate_cache(cache: Arc, msg: Notification) { use Notification::*; match msg { AllowedIpsUpdate { allowed_ips_update } => { - cache.invalidate_allowed_ips_for_project(&allowed_ips_update.project_id) + cache.invalidate_allowed_ips_for_project(allowed_ips_update.project_id) } PasswordUpdate { password_update } => cache.invalidate_role_secret_for_project( - &password_update.project_id, - &password_update.role_name, + password_update.project_id, + password_update.role_name, ), } } @@ -141,12 +144,14 @@ where #[cfg(test)] mod tests { + use crate::{ProjectId, RoleName}; + use super::*; use serde_json::json; #[test] fn parse_allowed_ips() -> anyhow::Result<()> { - let project_id = "new_project".to_string(); + let project_id: ProjectId = "new_project".into(); let data = format!("{{\"project_id\": \"{project_id}\"}}"); let text = json!({ "type": "message", @@ -161,7 +166,7 @@ mod tests { result, Notification::AllowedIpsUpdate { allowed_ips_update: AllowedIpsUpdate { - project_id: project_id.into() + project_id: (&project_id).into() } } ); @@ -171,8 +176,8 @@ mod tests { #[test] fn parse_password_updated() -> anyhow::Result<()> { - let project_id = "new_project".to_string(); - let role_name = "new_role".to_string(); + let project_id: ProjectId = "new_project".into(); + let role_name: RoleName = "new_role".into(); let data = format!("{{\"project_id\": \"{project_id}\", \"role_name\": \"{role_name}\"}}"); let text = json!({ "type": "message", @@ -187,8 +192,8 @@ mod tests { result, Notification::PasswordUpdate { password_update: PasswordUpdate { - project_id: project_id.into(), - role_name: role_name.into() + project_id: (&project_id).into(), + role_name: (&role_name).into(), } } ); diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index f58b912a77..74464dd4c8 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -39,7 +39,8 @@ futures-io = { version = "0.3" } futures-sink = { version = "0.3" } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } getrandom = { version = "0.2", default-features = false, features = ["std"] } -hashbrown = { version = "0.14", default-features = false, 
features = ["raw"] } +hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", default-features = false, features = ["raw"] } +hashbrown-594e8ee84c453af0 = { package = "hashbrown", version = "0.13", features = ["raw"] } hex = { version = "0.4", features = ["serde"] } hmac = { version = "0.12", default-features = false, features = ["reset"] } hyper = { version = "0.14", features = ["full"] } @@ -91,7 +92,7 @@ cc = { version = "1", default-features = false, features = ["parallel"] } chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] } either = { version = "1" } getrandom = { version = "0.2", default-features = false, features = ["std"] } -hashbrown = { version = "0.14", default-features = false, features = ["raw"] } +hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", default-features = false, features = ["raw"] } indexmap = { version = "1", default-features = false, features = ["std"] } itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits", "use_std"] } From cb7c89332f25c652fa7dd06a9be7d984f8cc3989 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 5 Feb 2024 14:29:05 +0000 Subject: [PATCH 0088/1571] control_plane: fix tenant GET, clean up endpoints (#6553) Cleanups from https://github.com/neondatabase/neon/pull/6394 - There was a rogue `*` breaking the `GET /tenant/:tenant_id`, which passes through to shard zero - There was a duplicate migrate endpoint - There are un-prefixed API endpoints that were only needed for compat tests and can now be removed. --- control_plane/attachment_service/src/http.rs | 10 +--------- test_runner/regress/test_sharding_service.py | 7 +++++++ 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs index aa8c73c493..049e66fddf 100644 --- a/control_plane/attachment_service/src/http.rs +++ b/control_plane/attachment_service/src/http.rs @@ -403,10 +403,6 @@ pub fn make_router( .put("/v1/tenant/:tenant_id/location_config", |r| { tenant_service_handler(r, handle_tenant_location_config) }) - // Tenant Shard operations (low level/maintenance) - .put("/tenant/:tenant_shard_id/migrate", |r| { - tenant_service_handler(r, handle_tenant_shard_migrate) - }) // Timeline operations .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| { tenant_service_handler(r, handle_tenant_timeline_delete) @@ -415,7 +411,7 @@ pub fn make_router( tenant_service_handler(r, handle_tenant_timeline_create) }) // Tenant detail GET passthrough to shard zero - .get("/v1/tenant/:tenant_id*", |r| { + .get("/v1/tenant/:tenant_id", |r| { tenant_service_handler(r, handle_tenant_timeline_passthrough) }) // Timeline GET passthrough to shard zero. 
Note that the `*` in the URL is a wildcard: any future @@ -423,8 +419,4 @@ pub fn make_router( .get("/v1/tenant/:tenant_id/timeline*", |r| { tenant_service_handler(r, handle_tenant_timeline_passthrough) }) - // Path aliases for tests_forward_compatibility - // TODO: remove these in future PR - .post("/re-attach", |r| request_span(r, handle_re_attach)) - .post("/validate", |r| request_span(r, handle_validate)) } diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py index 346df708de..5c70378ab0 100644 --- a/test_runner/regress/test_sharding_service.py +++ b/test_runner/regress/test_sharding_service.py @@ -140,6 +140,13 @@ def test_sharding_service_passthrough( timelines = client.timeline_list(tenant_id=env.initial_tenant) assert len(timelines) == 1 + status = client.tenant_status(env.initial_tenant) + assert TenantId(status["id"]) == env.initial_tenant + assert set(TimelineId(t) for t in status["timelines"]) == { + env.initial_timeline, + } + assert status["state"]["slug"] == "Active" + def test_sharding_service_restart(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() From 8e114bd6101dee117e1125ea68dfbdbbc59c965f Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 5 Feb 2024 19:31:55 +0000 Subject: [PATCH 0089/1571] control_plane/attachment_service: make --database-url optional (#6636) ## Problem This change was left out of #6585 accidentally -- just forgot to push the very last version of my branch. Now that we can load database url from Secrets Manager, we don't always need it on the CLI any more. We should let the user omit it instead of passing `--database-url ""` ## Summary of changes - Make `--database-url` optional --- control_plane/attachment_service/src/main.rs | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs index eda9c7aad6..37b06c4090 100644 --- a/control_plane/attachment_service/src/main.rs +++ b/control_plane/attachment_service/src/main.rs @@ -53,7 +53,7 @@ struct Cli { /// URL to connect to postgres, like postgresql://localhost:1234/attachment_service #[arg(long)] - database_url: String, + database_url: Option, } /// Secrets may either be provided on the command line (for testing), or loaded from AWS SecretManager: this @@ -74,10 +74,9 @@ impl Secrets { const PUBLIC_KEY_SECRET: &'static str = "neon-storage-controller-public-key"; async fn load(args: &Cli) -> anyhow::Result { - if args.database_url.is_empty() { - Self::load_aws_sm().await - } else { - Self::load_cli(args) + match &args.database_url { + Some(url) => Self::load_cli(url, args), + None => Self::load_aws_sm().await, } } @@ -153,13 +152,13 @@ impl Secrets { }) } - fn load_cli(args: &Cli) -> anyhow::Result { + fn load_cli(database_url: &str, args: &Cli) -> anyhow::Result { let public_key = match &args.public_key { None => None, Some(key_path) => Some(JwtAuth::from_key_path(key_path)?), }; Ok(Self { - database_url: args.database_url.clone(), + database_url: database_url.to_owned(), public_key, jwt_token: args.jwt_token.clone(), control_plane_jwt_token: args.control_plane_jwt_token.clone(), From 947165788dc2447b17b8cd163568d10b8c4ddeaa Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 6 Feb 2024 09:39:06 +0200 Subject: [PATCH 0090/1571] refactor: needless cancellation token cloning (#6618) The solution we ended up for `backoff::retry` requires always cloning of cancellation tokens even though there is just `.await`. 
Fix that, and also turn the return type into `Option>` avoiding the need for the `E::cancelled()` fn passed in. Cc: #6096 --- .../attachment_service/src/compute_hook.rs | 4 +- libs/remote_storage/src/azure_blob.rs | 2 +- libs/remote_storage/src/lib.rs | 4 +- libs/remote_storage/src/local_fs.rs | 2 +- libs/remote_storage/src/s3_bucket.rs | 14 ++- libs/remote_storage/src/simulate_failures.rs | 2 +- libs/remote_storage/tests/test_real_s3.rs | 11 ++- libs/utils/src/backoff.rs | 92 ++++++++----------- pageserver/src/consumption_metrics/upload.rs | 56 +++++------ pageserver/src/control_plane_client.rs | 35 ++----- pageserver/src/deletion_queue/deleter.rs | 4 +- pageserver/src/tenant.rs | 8 +- pageserver/src/tenant/delete.rs | 8 +- .../src/tenant/remote_timeline_client.rs | 16 +++- .../tenant/remote_timeline_client/download.rs | 19 ++-- .../tenant/remote_timeline_client/upload.rs | 8 +- pageserver/src/tenant/secondary/downloader.rs | 8 +- .../src/tenant/secondary/heatmap_uploader.rs | 4 +- proxy/src/context/parquet.rs | 4 +- safekeeper/src/wal_backup.rs | 9 +- 20 files changed, 156 insertions(+), 154 deletions(-) diff --git a/control_plane/attachment_service/src/compute_hook.rs b/control_plane/attachment_service/src/compute_hook.rs index 9c1185f259..4ca26431ca 100644 --- a/control_plane/attachment_service/src/compute_hook.rs +++ b/control_plane/attachment_service/src/compute_hook.rs @@ -244,9 +244,11 @@ impl ComputeHook { 3, 10, "Send compute notification", - backoff::Cancel::new(cancel.clone(), || NotifyError::ShuttingDown), + cancel, ) .await + .ok_or_else(|| NotifyError::ShuttingDown) + .and_then(|x| x) } /// Call this to notify the compute (postgres) tier of new pageservers to use diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 57c57a2b70..c6d5224706 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -379,7 +379,7 @@ impl RemoteStorage for AzureBlobStorage { _prefix: Option<&RemotePath>, _timestamp: SystemTime, _done_if_after: SystemTime, - _cancel: CancellationToken, + _cancel: &CancellationToken, ) -> Result<(), TimeTravelError> { // TODO use Azure point in time recovery feature for this // https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 4aeaee70b1..e64b1de6f9 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -218,7 +218,7 @@ pub trait RemoteStorage: Send + Sync + 'static { prefix: Option<&RemotePath>, timestamp: SystemTime, done_if_after: SystemTime, - cancel: CancellationToken, + cancel: &CancellationToken, ) -> Result<(), TimeTravelError>; } @@ -442,7 +442,7 @@ impl GenericRemoteStorage> { prefix: Option<&RemotePath>, timestamp: SystemTime, done_if_after: SystemTime, - cancel: CancellationToken, + cancel: &CancellationToken, ) -> Result<(), TimeTravelError> { match self { Self::LocalFs(s) => { diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index d47fa75b37..36ec15e1b1 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -431,7 +431,7 @@ impl RemoteStorage for LocalFs { _prefix: Option<&RemotePath>, _timestamp: SystemTime, _done_if_after: SystemTime, - _cancel: CancellationToken, + _cancel: &CancellationToken, ) -> Result<(), TimeTravelError> { Err(TimeTravelError::Unimplemented) } diff --git a/libs/remote_storage/src/s3_bucket.rs 
b/libs/remote_storage/src/s3_bucket.rs index 4d6564cba6..c9ad9ef225 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -638,7 +638,7 @@ impl RemoteStorage for S3Bucket { prefix: Option<&RemotePath>, timestamp: SystemTime, done_if_after: SystemTime, - cancel: CancellationToken, + cancel: &CancellationToken, ) -> Result<(), TimeTravelError> { let kind = RequestKind::TimeTravel; let _guard = self.permit(kind).await; @@ -678,9 +678,11 @@ impl RemoteStorage for S3Bucket { warn_threshold, max_retries, "listing object versions for time_travel_recover", - backoff::Cancel::new(cancel.clone(), || TimeTravelError::Cancelled), + cancel, ) - .await?; + .await + .ok_or_else(|| TimeTravelError::Cancelled) + .and_then(|x| x)?; tracing::trace!( " Got List response version_id_marker={:?}, key_marker={:?}", @@ -805,9 +807,11 @@ impl RemoteStorage for S3Bucket { warn_threshold, max_retries, "copying object version for time_travel_recover", - backoff::Cancel::new(cancel.clone(), || TimeTravelError::Cancelled), + cancel, ) - .await?; + .await + .ok_or_else(|| TimeTravelError::Cancelled) + .and_then(|x| x)?; tracing::info!(%version_id, %key, "Copied old version in S3"); } VerOrDelete { diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index ee9792232a..82d5a61fda 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -190,7 +190,7 @@ impl RemoteStorage for UnreliableWrapper { prefix: Option<&RemotePath>, timestamp: SystemTime, done_if_after: SystemTime, - cancel: CancellationToken, + cancel: &CancellationToken, ) -> Result<(), TimeTravelError> { self.attempt(RemoteOp::TimeTravelRecover(prefix.map(|p| p.to_owned()))) .map_err(|e| TimeTravelError::Other(anyhow::Error::new(e)))?; diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index 679be66bf7..fc52dabc36 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -56,9 +56,10 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: warn_threshold, max_retries, "test retry", - backoff::Cancel::new(CancellationToken::new(), || unreachable!()), + &CancellationToken::new(), ) .await + .expect("never cancelled") } async fn time_point() -> SystemTime { @@ -76,6 +77,8 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: .collect::>()) } + let cancel = CancellationToken::new(); + let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str())) .with_context(|| "RemotePath conversion")?; @@ -142,7 +145,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: // No changes after recovery to t2 (no-op) let t_final = time_point().await; ctx.client - .time_travel_recover(None, t2, t_final, CancellationToken::new()) + .time_travel_recover(None, t2, t_final, &cancel) .await?; let t2_files_recovered = list_files(&ctx.client).await?; println!("after recovery to t2: {t2_files_recovered:?}"); @@ -153,7 +156,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: // after recovery to t1: path1 is back, path2 has the old content let t_final = time_point().await; ctx.client - .time_travel_recover(None, t1, t_final, CancellationToken::new()) + .time_travel_recover(None, t1, t_final, &cancel) .await?; let t1_files_recovered = list_files(&ctx.client).await?; println!("after recovery to t1: 
{t1_files_recovered:?}"); @@ -164,7 +167,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: // after recovery to t0: everything is gone except for path1 let t_final = time_point().await; ctx.client - .time_travel_recover(None, t0, t_final, CancellationToken::new()) + .time_travel_recover(None, t0, t_final, &cancel) .await?; let t0_files_recovered = list_files(&ctx.client).await?; println!("after recovery to t0: {t0_files_recovered:?}"); diff --git a/libs/utils/src/backoff.rs b/libs/utils/src/backoff.rs index d50ad39585..096c7e5854 100644 --- a/libs/utils/src/backoff.rs +++ b/libs/utils/src/backoff.rs @@ -37,69 +37,53 @@ pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_sec } } -/// Configure cancellation for a retried operation: when to cancel (the token), and -/// what kind of error to return on cancellation -pub struct Cancel -where - E: Display + Debug + 'static, - CF: Fn() -> E, -{ - token: CancellationToken, - on_cancel: CF, -} - -impl Cancel -where - E: Display + Debug + 'static, - CF: Fn() -> E, -{ - pub fn new(token: CancellationToken, on_cancel: CF) -> Self { - Self { token, on_cancel } - } -} - -/// retries passed operation until one of the following conditions are met: -/// Encountered error is considered as permanent (non-retryable) -/// Retries have been exhausted. -/// `is_permanent` closure should be used to provide distinction between permanent/non-permanent errors -/// When attempts cross `warn_threshold` function starts to emit log warnings. +/// Retries passed operation until one of the following conditions are met: +/// - encountered error is considered as permanent (non-retryable) +/// - retries have been exhausted +/// - cancellation token has been cancelled +/// +/// `is_permanent` closure should be used to provide distinction between permanent/non-permanent +/// errors. When attempts cross `warn_threshold` function starts to emit log warnings. /// `description` argument is added to log messages. Its value should identify the `op` is doing -/// `cancel` argument is required: any time we are looping on retry, we should be using a CancellationToken -/// to drop out promptly on shutdown. -pub async fn retry( +/// `cancel` cancels new attempts and the backoff sleep. +/// +/// If attempts fail, they are being logged with `{:#}` which works for anyhow, but does not work +/// for any other error type. Final failed attempt is logged with `{:?}`. +/// +/// Returns `None` if cancellation was noticed during backoff or the terminal result. +pub async fn retry( mut op: O, is_permanent: impl Fn(&E) -> bool, warn_threshold: u32, max_retries: u32, description: &str, - cancel: Cancel, -) -> Result + cancel: &CancellationToken, +) -> Option> where // Not std::error::Error because anyhow::Error doesnt implement it. // For context see https://github.com/dtolnay/anyhow/issues/63 E: Display + Debug + 'static, O: FnMut() -> F, F: Future>, - CF: Fn() -> E, { let mut attempts = 0; loop { - if cancel.token.is_cancelled() { - return Err((cancel.on_cancel)()); + if cancel.is_cancelled() { + return None; } let result = op().await; - match result { + match &result { Ok(_) => { if attempts > 0 { tracing::info!("{description} succeeded after {attempts} retries"); } - return result; + return Some(result); } // These are "permanent" errors that should not be retried. 
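            // A minimal sketch of a typical `is_permanent` closure, assuming the caller's
            // error type is `reqwest::Error` (any `Display + Debug` error type works):
            //
            //     |e: &reqwest::Error| e.status().map_or(false, |s| s.is_client_error())
            //
            // i.e. 4xx responses are treated as permanent, everything else as transient.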
- Err(ref e) if is_permanent(e) => { - return result; + Err(e) if is_permanent(e) => { + return Some(result); } // Assume that any other failure might be transient, and the operation might // succeed if we just keep trying. @@ -109,12 +93,12 @@ where Err(err) if attempts < max_retries => { tracing::warn!("{description} failed, will retry (attempt {attempts}): {err:#}"); } - Err(ref err) => { + Err(err) => { // Operation failed `max_attempts` times. Time to give up. tracing::warn!( "{description} still failed after {attempts} retries, giving up: {err:?}" ); - return result; + return Some(result); } } // sleep and retry @@ -122,7 +106,7 @@ where attempts, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, - &cancel.token, + cancel, ) .await; attempts += 1; @@ -131,11 +115,9 @@ where #[cfg(test)] mod tests { - use std::io; - - use tokio::sync::Mutex; - use super::*; + use std::io; + use tokio::sync::Mutex; #[test] fn backoff_defaults_produce_growing_backoff_sequence() { @@ -166,7 +148,7 @@ mod tests { #[tokio::test(start_paused = true)] async fn retry_always_error() { let count = Mutex::new(0); - let err_result = retry( + retry( || async { *count.lock().await += 1; Result::<(), io::Error>::Err(io::Error::from(io::ErrorKind::Other)) @@ -175,11 +157,11 @@ mod tests { 1, 1, "work", - Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }), + &CancellationToken::new(), ) - .await; - - assert!(err_result.is_err()); + .await + .expect("not cancelled") + .expect_err("it can only fail"); assert_eq!(*count.lock().await, 2); } @@ -201,10 +183,11 @@ mod tests { 2, 2, "work", - Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }), + &CancellationToken::new(), ) .await - .unwrap(); + .expect("not cancelled") + .expect("success on second try"); } #[tokio::test(start_paused = true)] @@ -224,10 +207,11 @@ mod tests { 2, 2, "work", - Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }), + &CancellationToken::new(), ) .await - .unwrap_err(); + .expect("was not cancellation") + .expect_err("it was permanent error"); assert_eq!(*count.lock().await, 1); } diff --git a/pageserver/src/consumption_metrics/upload.rs b/pageserver/src/consumption_metrics/upload.rs index 322ed95cc8..6b840a3136 100644 --- a/pageserver/src/consumption_metrics/upload.rs +++ b/pageserver/src/consumption_metrics/upload.rs @@ -262,35 +262,33 @@ async fn upload( ) -> Result<(), UploadError> { let warn_after = 3; let max_attempts = 10; + + // this is used only with tests so far + let last_value = if is_last { "true" } else { "false" }; + let res = utils::backoff::retry( - move || { - let body = body.clone(); - async move { - let res = client - .post(metric_collection_endpoint.clone()) - .header(reqwest::header::CONTENT_TYPE, "application/json") - .header( - LAST_IN_BATCH.clone(), - if is_last { "true" } else { "false" }, - ) - .body(body) - .send() - .await; + || async { + let res = client + .post(metric_collection_endpoint.clone()) + .header(reqwest::header::CONTENT_TYPE, "application/json") + .header(LAST_IN_BATCH.clone(), last_value) + .body(body.clone()) + .send() + .await; - let res = res.and_then(|res| res.error_for_status()); + let res = res.and_then(|res| res.error_for_status()); - // 10 redirects are normally allowed, so we don't need worry about 3xx - match res { - Ok(_response) => Ok(()), - Err(e) => { - let status = e.status().filter(|s| s.is_client_error()); - if let Some(status) = status { - // rejection used to be a thing when the server could reject a - // whole 
batch of metrics if one metric was bad. - Err(UploadError::Rejected(status)) - } else { - Err(UploadError::Reqwest(e)) - } + // 10 redirects are normally allowed, so we don't need worry about 3xx + match res { + Ok(_response) => Ok(()), + Err(e) => { + let status = e.status().filter(|s| s.is_client_error()); + if let Some(status) = status { + // rejection used to be a thing when the server could reject a + // whole batch of metrics if one metric was bad. + Err(UploadError::Rejected(status)) + } else { + Err(UploadError::Reqwest(e)) } } } @@ -299,9 +297,11 @@ async fn upload( warn_after, max_attempts, "upload consumption_metrics", - utils::backoff::Cancel::new(cancel.clone(), || UploadError::Cancelled), + cancel, ) - .await; + .await + .ok_or_else(|| UploadError::Cancelled) + .and_then(|x| x); match &res { Ok(_) => {} diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs index 950791ea48..61c7d03408 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/control_plane_client.rs @@ -82,46 +82,29 @@ impl ControlPlaneClient { R: Serialize, T: DeserializeOwned, { - #[derive(thiserror::Error, Debug)] - enum RemoteAttemptError { - #[error("shutdown")] - Shutdown, - #[error("remote: {0}")] - Remote(reqwest::Error), - } - - match backoff::retry( + let res = backoff::retry( || async { let response = self .http_client .post(url.clone()) .json(&request) .send() - .await - .map_err(RemoteAttemptError::Remote)?; + .await?; - response - .error_for_status_ref() - .map_err(RemoteAttemptError::Remote)?; - response - .json::() - .await - .map_err(RemoteAttemptError::Remote) + response.error_for_status_ref()?; + response.json::().await }, |_| false, 3, u32::MAX, "calling control plane generation validation API", - backoff::Cancel::new(self.cancel.clone(), || RemoteAttemptError::Shutdown), + &self.cancel, ) .await - { - Err(RemoteAttemptError::Shutdown) => Err(RetryForeverError::ShuttingDown), - Err(RemoteAttemptError::Remote(_)) => { - panic!("We retry forever, this should never be reached"); - } - Ok(r) => Ok(r), - } + .ok_or(RetryForeverError::ShuttingDown)? 
+ .expect("We retry forever, this should never be reached"); + + Ok(res) } } diff --git a/pageserver/src/deletion_queue/deleter.rs b/pageserver/src/deletion_queue/deleter.rs index 57421b1547..a75c73f2b1 100644 --- a/pageserver/src/deletion_queue/deleter.rs +++ b/pageserver/src/deletion_queue/deleter.rs @@ -77,9 +77,11 @@ impl Deleter { 3, 10, "executing deletion batch", - backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Shutting down")), + &self.cancel, ) .await + .ok_or_else(|| anyhow::anyhow!("Shutting down")) + .and_then(|x| x) } /// Block until everything in accumulator has been executed diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index b801347c06..624c3e365f 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3294,11 +3294,11 @@ impl Tenant { 3, u32::MAX, "persist_initdb_tar_zst", - backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Cancelled")), + &self.cancel, ) - .await?; - - Ok(()) + .await + .ok_or_else(|| anyhow::anyhow!("Cancelled")) + .and_then(|x| x) } /// - run initdb to init temporary instance and get bootstrap data diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index 0dbaa3ec93..7c35914b61 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -91,9 +91,11 @@ async fn create_remote_delete_mark( FAILED_UPLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, "mark_upload", - backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")), + cancel, ) .await + .ok_or_else(|| anyhow::anyhow!("Cancelled")) + .and_then(|x| x) .context("mark_upload")?; Ok(()) @@ -187,9 +189,11 @@ async fn remove_tenant_remote_delete_mark( FAILED_UPLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, "remove_tenant_remote_delete_mark", - backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")), + cancel, ) .await + .ok_or_else(|| anyhow::anyhow!("Cancelled")) + .and_then(|x| x) .context("remove_tenant_remote_delete_mark")?; } Ok(()) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 2e429ee9bc..831a073d17 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1046,9 +1046,11 @@ impl RemoteTimelineClient { // when executed as part of tenant deletion this happens in the background 2, "persist_index_part_with_deleted_flag", - backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Cancelled")), + &self.cancel, ) - .await?; + .await + .ok_or_else(|| anyhow::anyhow!("Cancelled")) + .and_then(|x| x)?; // all good, disarm the guard and mark as success ScopeGuard::into_inner(undo_deleted_at); @@ -1083,9 +1085,11 @@ impl RemoteTimelineClient { FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, "preserve_initdb_tar_zst", - backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled!")), + &cancel.clone(), ) .await + .ok_or_else(|| anyhow::anyhow!("Cancellled")) + .and_then(|x| x) .context("backing up initdb archive")?; Ok(()) } @@ -1141,6 +1145,8 @@ impl RemoteTimelineClient { // taking the burden of listing all the layers that we already know we should delete. 
self.deletion_queue_client.flush_immediate().await?; + let cancel = shutdown_token(); + let remaining = backoff::retry( || async { self.storage_impl @@ -1151,9 +1157,11 @@ impl RemoteTimelineClient { FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, "list_prefixes", - backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled!")), + &cancel, ) .await + .ok_or_else(|| anyhow::anyhow!("Cancelled!")) + .and_then(|x| x) .context("list prefixes")?; // We will delete the current index_part object last, since it acts as a deletion diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index b84b5ca33b..2c50726b43 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -76,7 +76,6 @@ pub async fn download_layer_file<'a>( // If pageserver crashes the temp file will be deleted on startup and re-downloaded. let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION); - let cancel_inner = cancel.clone(); let (mut destination_file, bytes_amount) = download_retry( || async { let destination_file = tokio::fs::File::create(&temp_file_path) @@ -87,7 +86,7 @@ pub async fn download_layer_file<'a>( // Cancellation safety: it is safe to cancel this future, because it isn't writing to a local // file: the write to local file doesn't start until after the request header is returned // and we start draining the body stream below - let download = download_cancellable(&cancel_inner, storage.download(&remote_path)) + let download = download_cancellable(cancel, storage.download(&remote_path)) .await .with_context(|| { format!( @@ -107,7 +106,7 @@ pub async fn download_layer_file<'a>( // we will imminiently try and write to again. let bytes_amount: u64 = match timeout_cancellable( DOWNLOAD_TIMEOUT, - &cancel_inner, + cancel, tokio::io::copy_buf(&mut reader, &mut destination_file), ) .await @@ -386,9 +385,11 @@ pub(super) async fn download_index_part( FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, "listing index_part files", - backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")), + &cancel, ) .await + .ok_or_else(|| anyhow::anyhow!("Cancelled")) + .and_then(|x| x) .map_err(DownloadError::Other)?; // General case logic for which index to use: the latest index whose generation @@ -510,7 +511,7 @@ pub(crate) async fn download_initdb_tar_zst( /// Helper function to handle retries for a download operation. /// -/// Remote operations can fail due to rate limits (IAM, S3), spurious network +/// Remote operations can fail due to rate limits (S3), spurious network /// problems, or other external reasons. Retry FAILED_DOWNLOAD_RETRIES times, /// with backoff. 
/// @@ -530,9 +531,11 @@ where FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, description, - backoff::Cancel::new(cancel.clone(), || DownloadError::Cancelled), + cancel, ) .await + .ok_or_else(|| DownloadError::Cancelled) + .and_then(|x| x) } async fn download_retry_forever( @@ -550,7 +553,9 @@ where FAILED_DOWNLOAD_WARN_THRESHOLD, u32::MAX, description, - backoff::Cancel::new(cancel, || DownloadError::Cancelled), + &cancel, ) .await + .ok_or_else(|| DownloadError::Cancelled) + .and_then(|x| x) } diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index 76df9ba5c4..e8ba1d3d6e 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -188,16 +188,18 @@ pub(crate) async fn time_travel_recover_tenant( backoff::retry( || async { storage - .time_travel_recover(Some(prefix), timestamp, done_if_after, cancel.clone()) + .time_travel_recover(Some(prefix), timestamp, done_if_after, cancel) .await }, |e| !matches!(e, TimeTravelError::Other(_)), warn_after, max_attempts, "time travel recovery of tenant prefix", - backoff::Cancel::new(cancel.clone(), || TimeTravelError::Cancelled), + cancel, ) - .await?; + .await + .ok_or_else(|| TimeTravelError::Cancelled) + .and_then(|x| x)?; } Ok(()) } diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 702c0b1ec1..55af4f9f2b 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -537,11 +537,11 @@ impl<'a> TenantDownloader<'a> { FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, "download heatmap", - backoff::Cancel::new(self.secondary_state.cancel.clone(), || { - UpdateError::Cancelled - }), + &self.secondary_state.cancel, ) - .await?; + .await + .ok_or_else(|| UpdateError::Cancelled) + .and_then(|x| x)?; SECONDARY_MODE.download_heatmap.inc(); diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs index df865658a4..fff29b2487 100644 --- a/pageserver/src/tenant/secondary/heatmap_uploader.rs +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -426,9 +426,11 @@ async fn upload_tenant_heatmap( 3, u32::MAX, "Uploading heatmap", - backoff::Cancel::new(tenant_cancel.clone(), || anyhow::anyhow!("Shutting down")), + &tenant_cancel, ) .await + .ok_or_else(|| anyhow::anyhow!("Shutting down")) + .and_then(|x| x) { if tenant_cancel.is_cancelled() { return Err(UploadHeatmapError::Cancelled); diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index e920d7be01..8510c5c586 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -315,9 +315,11 @@ async fn upload_parquet( FAILED_UPLOAD_MAX_RETRIES, "request_data_upload", // we don't want cancellation to interrupt here, so we make a dummy cancel token - backoff::Cancel::new(CancellationToken::new(), || anyhow::anyhow!("Cancelled")), + &CancellationToken::new(), ) .await + .ok_or_else(|| anyhow::anyhow!("Cancelled")) + .and_then(|x| x) .context("request_data_upload")?; Ok(buffer.writer()) diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index c47381351d..df99244770 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -558,16 +558,17 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> { backoff::retry( || async { let files = storage.list_files(Some(&remote_path)).await?; 
- storage.delete_objects(&files).await?; - Ok(()) + storage.delete_objects(&files).await }, |_| false, 3, 10, "executing WAL segments deletion batch", - backoff::Cancel::new(token, || anyhow::anyhow!("canceled")), + &token, ) - .await?; + .await + .ok_or_else(|| anyhow::anyhow!("canceled")) + .and_then(|x| x)?; Ok(()) } From e196d974cc585341ee38f8fd6b54c257a3ad78a4 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 6 Feb 2024 10:34:16 +0100 Subject: [PATCH 0091/1571] pagebench: actually implement `--num_clients` (#6640) Will need this to validate per-tenant throttling in https://github.com/neondatabase/neon/issues/5899 --- .../pagebench/src/cmd/getpage_latest_lsn.rs | 139 ++++++++++-------- 1 file changed, 78 insertions(+), 61 deletions(-) diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index 400b5476b7..aa809d8d26 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -79,6 +79,12 @@ impl KeyRange { } } +#[derive(PartialEq, Eq, Hash, Copy, Clone)] +struct WorkerId { + timeline: TenantTimelineId, + num_client: usize, // from 0..args.num_clients +} + #[derive(serde::Serialize)] struct Output { total: request_stats::Output, @@ -206,7 +212,7 @@ async fn main_impl( let live_stats = Arc::new(LiveStats::default()); - let num_client_tasks = timelines.len(); + let num_client_tasks = args.num_clients.get() * timelines.len(); let num_live_stats_dump = 1; let num_work_sender_tasks = 1; let num_main_impl = 1; @@ -235,19 +241,25 @@ async fn main_impl( let cancel = CancellationToken::new(); - let mut work_senders: HashMap = HashMap::new(); + let mut work_senders: HashMap = HashMap::new(); let mut tasks = Vec::new(); - for tl in &timelines { - let (sender, receiver) = tokio::sync::mpsc::channel(10); // TODO: not sure what the implications of this are - work_senders.insert(*tl, sender); - tasks.push(tokio::spawn(client( - args, - *tl, - Arc::clone(&start_work_barrier), - receiver, - Arc::clone(&live_stats), - cancel.clone(), - ))); + for timeline in timelines.iter().cloned() { + for num_client in 0..args.num_clients.get() { + let (sender, receiver) = tokio::sync::mpsc::channel(10); // TODO: not sure what the implications of this are + let worker_id = WorkerId { + timeline, + num_client, + }; + work_senders.insert(worker_id, sender); + tasks.push(tokio::spawn(client( + args, + worker_id, + Arc::clone(&start_work_barrier), + receiver, + Arc::clone(&live_stats), + cancel.clone(), + ))); + } } let work_sender: Pin>> = { @@ -271,7 +283,10 @@ async fn main_impl( let (rel_tag, block_no) = key_to_rel_block(key).expect("we filter non-rel-block keys out above"); ( - r.timeline, + WorkerId { + timeline: r.timeline, + num_client: rng.gen_range(0..args.num_clients.get()), + }, PagestreamGetPageRequest { latest: rng.gen_bool(args.req_latest_probability), lsn: r.timeline_lsn, @@ -289,56 +304,54 @@ async fn main_impl( }), Some(rps_limit) => Box::pin(async move { let period = Duration::from_secs_f64(1.0 / (rps_limit as f64)); - let make_timeline_task: &dyn Fn( - TenantTimelineId, - ) - -> Pin>> = &|timeline| { - let sender = work_senders.get(&timeline).unwrap(); - let ranges: Vec = all_ranges - .iter() - .filter(|r| r.timeline == timeline) - .cloned() - .collect(); - let weights = rand::distributions::weighted::WeightedIndex::new( - ranges.iter().map(|v| v.len()), - ) - .unwrap(); + let make_task: &dyn Fn(WorkerId) -> Pin>> = + &|worker_id| { + let sender = 
work_senders.get(&worker_id).unwrap(); + let ranges: Vec = all_ranges + .iter() + .filter(|r| r.timeline == worker_id.timeline) + .cloned() + .collect(); + let weights = rand::distributions::weighted::WeightedIndex::new( + ranges.iter().map(|v| v.len()), + ) + .unwrap(); - let cancel = cancel.clone(); - Box::pin(async move { - let mut ticker = tokio::time::interval(period); - ticker.set_missed_tick_behavior( - /* TODO review this choice */ - tokio::time::MissedTickBehavior::Burst, - ); - while !cancel.is_cancelled() { - ticker.tick().await; - let req = { - let mut rng = rand::thread_rng(); - let r = &ranges[weights.sample(&mut rng)]; - let key: i128 = rng.gen_range(r.start..r.end); - let key = Key::from_i128(key); - assert!(is_rel_block_key(&key)); - let (rel_tag, block_no) = key_to_rel_block(key) - .expect("we filter non-rel-block keys out above"); - PagestreamGetPageRequest { - latest: rng.gen_bool(args.req_latest_probability), - lsn: r.timeline_lsn, - rel: rel_tag, - blkno: block_no, + let cancel = cancel.clone(); + Box::pin(async move { + let mut ticker = tokio::time::interval(period); + ticker.set_missed_tick_behavior( + /* TODO review this choice */ + tokio::time::MissedTickBehavior::Burst, + ); + while !cancel.is_cancelled() { + ticker.tick().await; + let req = { + let mut rng = rand::thread_rng(); + let r = &ranges[weights.sample(&mut rng)]; + let key: i128 = rng.gen_range(r.start..r.end); + let key = Key::from_i128(key); + assert!(is_rel_block_key(&key)); + let (rel_tag, block_no) = key_to_rel_block(key) + .expect("we filter non-rel-block keys out above"); + PagestreamGetPageRequest { + latest: rng.gen_bool(args.req_latest_probability), + lsn: r.timeline_lsn, + rel: rel_tag, + blkno: block_no, + } + }; + if sender.send(req).await.is_err() { + assert!( + cancel.is_cancelled(), + "client has gone away unexpectedly" + ); } - }; - if sender.send(req).await.is_err() { - assert!(cancel.is_cancelled(), "client has gone away unexpectedly"); } - } - }) - }; + }) + }; - let tasks: Vec<_> = work_senders - .keys() - .map(|tl| make_timeline_task(*tl)) - .collect(); + let tasks: Vec<_> = work_senders.keys().map(|tl| make_task(*tl)).collect(); start_work_barrier.wait().await; @@ -390,12 +403,16 @@ async fn main_impl( #[instrument(skip_all)] async fn client( args: &'static Args, - timeline: TenantTimelineId, + id: WorkerId, start_work_barrier: Arc, mut work: tokio::sync::mpsc::Receiver, live_stats: Arc, cancel: CancellationToken, ) { + let WorkerId { + timeline, + num_client: _, + } = id; let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone()) .await .unwrap(); From edcde05c1cdf75f9bd5f0669b95ef61946d25549 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 6 Feb 2024 10:44:49 +0100 Subject: [PATCH 0092/1571] refactor(walredo): split up the massive `walredo.rs` (#6583) Part of https://github.com/neondatabase/neon/issues/6581 --- pageserver/src/walredo.rs | 825 +----------------- pageserver/src/walredo/apply_neon.rs | 235 +++++ pageserver/src/walredo/process.rs | 406 +++++++++ .../src/walredo/process/no_leak_child.rs | 126 +++ pageserver/src/walredo/process/protocol.rs | 57 ++ 5 files changed, 848 insertions(+), 801 deletions(-) create mode 100644 pageserver/src/walredo/apply_neon.rs create mode 100644 pageserver/src/walredo/process.rs create mode 100644 pageserver/src/walredo/process/no_leak_child.rs create mode 100644 pageserver/src/walredo/process/protocol.rs diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 
5bc897b730..773e5fc051 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -17,71 +17,30 @@ //! records. It achieves it by dropping privileges before replaying //! any WAL records, so that even if an attacker hijacks the Postgres //! process, he cannot escape out of it. -//! -use anyhow::Context; -use byteorder::{ByteOrder, LittleEndian}; -use bytes::{BufMut, Bytes, BytesMut}; -use nix::poll::*; -use pageserver_api::models::WalRedoManagerStatus; -use pageserver_api::shard::TenantShardId; -use serde::Serialize; -use std::collections::VecDeque; -use std::io; -use std::io::prelude::*; -use std::ops::{Deref, DerefMut}; -use std::os::unix::io::AsRawFd; -use std::process::Stdio; -use std::process::{Child, ChildStdin, ChildStdout, Command}; -use std::sync::{Arc, Mutex, MutexGuard, RwLock}; -use std::time::Duration; -use std::time::Instant; -use tracing::*; -use utils::{bin_ser::BeSer, lsn::Lsn, nonblock::set_nonblock}; -#[cfg(feature = "testing")] -use std::sync::atomic::{AtomicUsize, Ordering}; +/// Process lifecycle and abstracction for the IPC protocol. +mod process; + +/// Code to apply [`NeonWalRecord`]s. +mod apply_neon; use crate::config::PageServerConf; use crate::metrics::{ - WalRedoKillCause, WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_COUNTERS, - WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM, - WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME, + WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, + WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_TIME, }; use crate::repository::Key; use crate::walrecord::NeonWalRecord; - -use pageserver_api::key::{key_to_rel_block, key_to_slru_block}; -use pageserver_api::reltag::{RelTag, SlruKind}; -use postgres_ffi::pg_constants; -use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM; -use postgres_ffi::v14::nonrelfile_utils::{ - mx_offset_to_flags_bitshift, mx_offset_to_flags_offset, mx_offset_to_member_offset, - transaction_id_set_status, -}; -use postgres_ffi::BLCKSZ; - -/// -/// `RelTag` + block number (`blknum`) gives us a unique id of the page in the cluster. -/// -/// In Postgres `BufferTag` structure is used for exactly the same purpose. -/// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/buf_internals.h#L91). -/// -#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Serialize)] -pub(crate) struct BufferTag { - pub rel: RelTag, - pub blknum: u32, -} - -struct ProcessInput { - stdin: ChildStdin, - n_requests: usize, -} - -struct ProcessOutput { - stdout: ChildStdout, - pending_responses: VecDeque>, - n_processed_responses: usize, -} +use anyhow::Context; +use bytes::{Bytes, BytesMut}; +use pageserver_api::key::key_to_rel_block; +use pageserver_api::models::WalRedoManagerStatus; +use pageserver_api::shard::TenantShardId; +use std::sync::{Arc, RwLock}; +use std::time::Duration; +use std::time::Instant; +use tracing::*; +use utils::lsn::Lsn; /// /// This is the real implementation that uses a Postgres process to @@ -94,22 +53,7 @@ pub struct PostgresRedoManager { tenant_shard_id: TenantShardId, conf: &'static PageServerConf, last_redo_at: std::sync::Mutex>, - redo_process: RwLock>>, -} - -/// Can this request be served by neon redo functions -/// or we need to pass it to wal-redo postgres process? -fn can_apply_in_neon(rec: &NeonWalRecord) -> bool { - // Currently, we don't have bespoken Rust code to replay any - // Postgres WAL records. But everything else is handled in neon. 
- #[allow(clippy::match_like_matches_macro)] - match rec { - NeonWalRecord::Postgres { - will_init: _, - rec: _, - } => false, - _ => true, - } + redo_process: RwLock>>, } /// @@ -139,10 +83,10 @@ impl PostgresRedoManager { let base_img_lsn = base_img.as_ref().map(|p| p.0).unwrap_or(Lsn::INVALID); let mut img = base_img.map(|p| p.1); - let mut batch_neon = can_apply_in_neon(&records[0].1); + let mut batch_neon = apply_neon::can_apply_in_neon(&records[0].1); let mut batch_start = 0; for (i, record) in records.iter().enumerate().skip(1) { - let rec_neon = can_apply_in_neon(&record.1); + let rec_neon = apply_neon::can_apply_in_neon(&record.1); if rec_neon != batch_neon { let result = if batch_neon { @@ -248,7 +192,7 @@ impl PostgresRedoManager { let mut n_attempts = 0u32; loop { // launch the WAL redo process on first use - let proc: Arc = { + let proc: Arc = { let proc_guard = self.redo_process.read().unwrap(); match &*proc_guard { None => { @@ -259,7 +203,7 @@ impl PostgresRedoManager { None => { let start = Instant::now(); let proc = Arc::new( - WalRedoProcess::launch( + process::WalRedoProcess::launch( self.conf, self.tenant_shard_id, pg_version, @@ -287,9 +231,8 @@ impl PostgresRedoManager { let started_at = std::time::Instant::now(); // Relational WAL records are applied using wal-redo-postgres - let buf_tag = BufferTag { rel, blknum }; let result = proc - .apply_wal_records(buf_tag, &base_img, records, wal_redo_timeout) + .apply_wal_records(rel, blknum, &base_img, records, wal_redo_timeout) .context("apply_wal_records"); let duration = started_at.elapsed(); @@ -416,732 +359,12 @@ impl PostgresRedoManager { _record_lsn: Lsn, record: &NeonWalRecord, ) -> anyhow::Result<()> { - match record { - NeonWalRecord::Postgres { - will_init: _, - rec: _, - } => { - anyhow::bail!("tried to pass postgres wal record to neon WAL redo"); - } - NeonWalRecord::ClearVisibilityMapFlags { - new_heap_blkno, - old_heap_blkno, - flags, - } => { - // sanity check that this is modifying the correct relation - let (rel, blknum) = key_to_rel_block(key).context("invalid record")?; - assert!( - rel.forknum == VISIBILITYMAP_FORKNUM, - "ClearVisibilityMapFlags record on unexpected rel {}", - rel - ); - if let Some(heap_blkno) = *new_heap_blkno { - // Calculate the VM block and offset that corresponds to the heap block. - let map_block = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blkno); - let map_byte = pg_constants::HEAPBLK_TO_MAPBYTE(heap_blkno); - let map_offset = pg_constants::HEAPBLK_TO_OFFSET(heap_blkno); - - // Check that we're modifying the correct VM block. - assert!(map_block == blknum); - - // equivalent to PageGetContents(page) - let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..]; - - map[map_byte as usize] &= !(flags << map_offset); - } - - // Repeat for 'old_heap_blkno', if any - if let Some(heap_blkno) = *old_heap_blkno { - let map_block = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blkno); - let map_byte = pg_constants::HEAPBLK_TO_MAPBYTE(heap_blkno); - let map_offset = pg_constants::HEAPBLK_TO_OFFSET(heap_blkno); - - assert!(map_block == blknum); - - let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..]; - - map[map_byte as usize] &= !(flags << map_offset); - } - } - // Non-relational WAL records are handled here, with custom code that has the - // same effects as the corresponding Postgres WAL redo function. 
- NeonWalRecord::ClogSetCommitted { xids, timestamp } => { - let (slru_kind, segno, blknum) = - key_to_slru_block(key).context("invalid record")?; - assert_eq!( - slru_kind, - SlruKind::Clog, - "ClogSetCommitted record with unexpected key {}", - key - ); - for &xid in xids { - let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE; - let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; - let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - - // Check that we're modifying the correct CLOG block. - assert!( - segno == expected_segno, - "ClogSetCommitted record for XID {} with unexpected key {}", - xid, - key - ); - assert!( - blknum == expected_blknum, - "ClogSetCommitted record for XID {} with unexpected key {}", - xid, - key - ); - - transaction_id_set_status( - xid, - pg_constants::TRANSACTION_STATUS_COMMITTED, - page, - ); - } - - // Append the timestamp - if page.len() == BLCKSZ as usize + 8 { - page.truncate(BLCKSZ as usize); - } - if page.len() == BLCKSZ as usize { - page.extend_from_slice(×tamp.to_be_bytes()); - } else { - warn!( - "CLOG blk {} in seg {} has invalid size {}", - blknum, - segno, - page.len() - ); - } - } - NeonWalRecord::ClogSetAborted { xids } => { - let (slru_kind, segno, blknum) = - key_to_slru_block(key).context("invalid record")?; - assert_eq!( - slru_kind, - SlruKind::Clog, - "ClogSetAborted record with unexpected key {}", - key - ); - for &xid in xids { - let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE; - let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; - let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - - // Check that we're modifying the correct CLOG block. - assert!( - segno == expected_segno, - "ClogSetAborted record for XID {} with unexpected key {}", - xid, - key - ); - assert!( - blknum == expected_blknum, - "ClogSetAborted record for XID {} with unexpected key {}", - xid, - key - ); - - transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_ABORTED, page); - } - } - NeonWalRecord::MultixactOffsetCreate { mid, moff } => { - let (slru_kind, segno, blknum) = - key_to_slru_block(key).context("invalid record")?; - assert_eq!( - slru_kind, - SlruKind::MultiXactOffsets, - "MultixactOffsetCreate record with unexpected key {}", - key - ); - // Compute the block and offset to modify. - // See RecordNewMultiXact in PostgreSQL sources. - let pageno = mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32; - let entryno = mid % pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32; - let offset = (entryno * 4) as usize; - - // Check that we're modifying the correct multixact-offsets block. - let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; - let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - assert!( - segno == expected_segno, - "MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}", - mid, - key - ); - assert!( - blknum == expected_blknum, - "MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}", - mid, - key - ); - - LittleEndian::write_u32(&mut page[offset..offset + 4], *moff); - } - NeonWalRecord::MultixactMembersCreate { moff, members } => { - let (slru_kind, segno, blknum) = - key_to_slru_block(key).context("invalid record")?; - assert_eq!( - slru_kind, - SlruKind::MultiXactMembers, - "MultixactMembersCreate record with unexpected key {}", - key - ); - for (i, member) in members.iter().enumerate() { - let offset = moff + i as u32; - - // Compute the block and offset to modify. 
- // See RecordNewMultiXact in PostgreSQL sources. - let pageno = offset / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32; - let memberoff = mx_offset_to_member_offset(offset); - let flagsoff = mx_offset_to_flags_offset(offset); - let bshift = mx_offset_to_flags_bitshift(offset); - - // Check that we're modifying the correct multixact-members block. - let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; - let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - assert!( - segno == expected_segno, - "MultiXactMembersCreate record for offset {} with unexpected key {}", - moff, - key - ); - assert!( - blknum == expected_blknum, - "MultiXactMembersCreate record for offset {} with unexpected key {}", - moff, - key - ); - - let mut flagsval = LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]); - flagsval &= !(((1 << pg_constants::MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift); - flagsval |= member.status << bshift; - LittleEndian::write_u32(&mut page[flagsoff..flagsoff + 4], flagsval); - LittleEndian::write_u32(&mut page[memberoff..memberoff + 4], member.xid); - } - } - } + apply_neon::apply_in_neon(record, key, page)?; Ok(()) } } -struct WalRedoProcess { - #[allow(dead_code)] - conf: &'static PageServerConf, - tenant_shard_id: TenantShardId, - // Some() on construction, only becomes None on Drop. - child: Option, - stdout: Mutex, - stdin: Mutex, - /// Counter to separate same sized walredo inputs failing at the same millisecond. - #[cfg(feature = "testing")] - dump_sequence: AtomicUsize, -} - -impl WalRedoProcess { - // - // Start postgres binary in special WAL redo mode. - // - #[instrument(skip_all,fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), pg_version=pg_version))] - fn launch( - conf: &'static PageServerConf, - tenant_shard_id: TenantShardId, - pg_version: u32, - ) -> anyhow::Result { - let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible. - let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?; - - // Start postgres itself - let child = Command::new(pg_bin_dir_path.join("postgres")) - // the first arg must be --wal-redo so the child process enters into walredo mode - .arg("--wal-redo") - // the child doesn't process this arg, but, having it in the argv helps indentify the - // walredo process for a particular tenant when debugging a pagserver - .args(["--tenant-shard-id", &format!("{tenant_shard_id}")]) - .stdin(Stdio::piped()) - .stderr(Stdio::piped()) - .stdout(Stdio::piped()) - .env_clear() - .env("LD_LIBRARY_PATH", &pg_lib_dir_path) - .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) - // NB: The redo process is not trusted after we sent it the first - // walredo work. Before that, it is trusted. Specifically, we trust - // it to - // 1. close all file descriptors except stdin, stdout, stderr because - // pageserver might not be 100% diligent in setting FD_CLOEXEC on all - // the files it opens, and - // 2. to use seccomp to sandbox itself before processing the first - // walredo request. 
- .spawn_no_leak_child(tenant_shard_id) - .context("spawn process")?; - WAL_REDO_PROCESS_COUNTERS.started.inc(); - let mut child = scopeguard::guard(child, |child| { - error!("killing wal-redo-postgres process due to a problem during launch"); - child.kill_and_wait(WalRedoKillCause::Startup); - }); - - let stdin = child.stdin.take().unwrap(); - let stdout = child.stdout.take().unwrap(); - let stderr = child.stderr.take().unwrap(); - let stderr = tokio::process::ChildStderr::from_std(stderr) - .context("convert to tokio::ChildStderr")?; - macro_rules! set_nonblock_or_log_err { - ($file:ident) => {{ - let res = set_nonblock($file.as_raw_fd()); - if let Err(e) = &res { - error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed"); - } - res - }}; - } - set_nonblock_or_log_err!(stdin)?; - set_nonblock_or_log_err!(stdout)?; - - // all fallible operations post-spawn are complete, so get rid of the guard - let child = scopeguard::ScopeGuard::into_inner(child); - - tokio::spawn( - async move { - scopeguard::defer! { - debug!("wal-redo-postgres stderr_logger_task finished"); - crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc(); - } - debug!("wal-redo-postgres stderr_logger_task started"); - crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc(); - - use tokio::io::AsyncBufReadExt; - let mut stderr_lines = tokio::io::BufReader::new(stderr); - let mut buf = Vec::new(); - let res = loop { - buf.clear(); - // TODO we don't trust the process to cap its stderr length. - // Currently it can do unbounded Vec allocation. - match stderr_lines.read_until(b'\n', &mut buf).await { - Ok(0) => break Ok(()), // eof - Ok(num_bytes) => { - let output = String::from_utf8_lossy(&buf[..num_bytes]); - error!(%output, "received output"); - } - Err(e) => { - break Err(e); - } - } - }; - match res { - Ok(()) => (), - Err(e) => { - error!(error=?e, "failed to read from walredo stderr"); - } - } - }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version)) - ); - - Ok(Self { - conf, - tenant_shard_id, - child: Some(child), - stdin: Mutex::new(ProcessInput { - stdin, - n_requests: 0, - }), - stdout: Mutex::new(ProcessOutput { - stdout, - pending_responses: VecDeque::new(), - n_processed_responses: 0, - }), - #[cfg(feature = "testing")] - dump_sequence: AtomicUsize::default(), - }) - } - - fn id(&self) -> u32 { - self.child - .as_ref() - .expect("must not call this during Drop") - .id() - } - - // Apply given WAL records ('records') over an old page image. Returns - // new page image. - // - #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))] - fn apply_wal_records( - &self, - tag: BufferTag, - base_img: &Option, - records: &[(Lsn, NeonWalRecord)], - wal_redo_timeout: Duration, - ) -> anyhow::Result { - let input = self.stdin.lock().unwrap(); - - // Serialize all the messages to send the WAL redo process first. - // - // This could be problematic if there are millions of records to replay, - // but in practice the number of records is usually so small that it doesn't - // matter, and it's better to keep this code simple. - // - // Most requests start with a before-image with BLCKSZ bytes, followed by - // by some other WAL records. Start with a buffer that can hold that - // comfortably. 
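        // For reference, the request written into `writebuf` below is a sequence of
        // length-prefixed messages (see the build_*_msg helpers and
        // pgxn/neon_walredo/walredoproc.c for the protocol):
        //   'B': begin redo for one BufferTag (rel + blknum)
        //   'P': push the 8192-byte base page image, if there is one
        //   'A': apply a single WAL record at its end-LSN (repeated per record)
        //   'G': get the reconstructed page back on stdout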
- let mut writebuf: Vec = Vec::with_capacity((BLCKSZ as usize) * 3); - build_begin_redo_for_block_msg(tag, &mut writebuf); - if let Some(img) = base_img { - build_push_page_msg(tag, img, &mut writebuf); - } - for (lsn, rec) in records.iter() { - if let NeonWalRecord::Postgres { - will_init: _, - rec: postgres_rec, - } = rec - { - build_apply_record_msg(*lsn, postgres_rec, &mut writebuf); - } else { - anyhow::bail!("tried to pass neon wal record to postgres WAL redo"); - } - } - build_get_page_msg(tag, &mut writebuf); - WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64); - - let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout); - - if res.is_err() { - // not all of these can be caused by this particular input, however these are so rare - // in tests so capture all. - self.record_and_log(&writebuf); - } - - res - } - - fn apply_wal_records0( - &self, - writebuf: &[u8], - input: MutexGuard, - wal_redo_timeout: Duration, - ) -> anyhow::Result { - let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small. - let mut nwrite = 0usize; - - while nwrite < writebuf.len() { - let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)]; - let n = loop { - match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) { - Err(nix::errno::Errno::EINTR) => continue, - res => break res, - } - }?; - - if n == 0 { - anyhow::bail!("WAL redo timed out"); - } - - // If 'stdin' is writeable, do write. - let in_revents = stdin_pollfds[0].revents().unwrap(); - if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() { - nwrite += proc.stdin.write(&writebuf[nwrite..])?; - } - if in_revents.contains(PollFlags::POLLHUP) { - // We still have more data to write, but the process closed the pipe. - anyhow::bail!("WAL redo process closed its stdin unexpectedly"); - } - } - let request_no = proc.n_requests; - proc.n_requests += 1; - drop(proc); - - // To improve walredo performance we separate sending requests and receiving - // responses. Them are protected by different mutexes (output and input). - // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process - // then there is not warranty that T1 will first granted output mutex lock. - // To address this issue we maintain number of sent requests, number of processed - // responses and ring buffer with pending responses. After sending response - // (under input mutex), threads remembers request number. Then it releases - // input mutex, locks output mutex and fetch in ring buffer all responses until - // its stored request number. The it takes correspondent element from - // pending responses ring buffer and truncate all empty elements from the front, - // advancing processed responses number. - - let mut output = self.stdout.lock().unwrap(); - let n_processed_responses = output.n_processed_responses; - while n_processed_responses + output.pending_responses.len() <= request_no { - // We expect the WAL redo process to respond with an 8k page image. We read it - // into this buffer. - let mut resultbuf = vec![0; BLCKSZ.into()]; - let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far - while nresult < BLCKSZ.into() { - let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)]; - // We do two things simultaneously: reading response from stdout - // and forward any logging information that the child writes to its stderr to the page server's log. 
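                // The idiom below: poll() is retried on EINTR (a signal interrupted the
                // syscall), a return value of 0 means wal_redo_timeout elapsed, and any
                // other error is propagated; only then are the revents inspected.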
- let n = loop { - match nix::poll::poll( - &mut stdout_pollfds[..], - wal_redo_timeout.as_millis() as i32, - ) { - Err(nix::errno::Errno::EINTR) => continue, - res => break res, - } - }?; - - if n == 0 { - anyhow::bail!("WAL redo timed out"); - } - - // If we have some data in stdout, read it to the result buffer. - let out_revents = stdout_pollfds[0].revents().unwrap(); - if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() { - nresult += output.stdout.read(&mut resultbuf[nresult..])?; - } - if out_revents.contains(PollFlags::POLLHUP) { - anyhow::bail!("WAL redo process closed its stdout unexpectedly"); - } - } - output - .pending_responses - .push_back(Some(Bytes::from(resultbuf))); - } - // Replace our request's response with None in `pending_responses`. - // Then make space in the ring buffer by clearing out any seqence of contiguous - // `None`'s from the front of `pending_responses`. - // NB: We can't pop_front() because other requests' responses because another - // requester might have grabbed the output mutex before us: - // T1: grab input mutex - // T1: send request_no 23 - // T1: release input mutex - // T2: grab input mutex - // T2: send request_no 24 - // T2: release input mutex - // T2: grab output mutex - // T2: n_processed_responses + output.pending_responses.len() <= request_no - // 23 0 24 - // T2: enters poll loop that reads stdout - // T2: put response for 23 into pending_responses - // T2: put response for 24 into pending_resposnes - // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back - // T2: takes its response_24 - // pending_responses now looks like this: Front Some(response_23) None Back - // T2: does the while loop below - // pending_responses now looks like this: Front Some(response_23) None Back - // T2: releases output mutex - // T1: grabs output mutex - // T1: n_processed_responses + output.pending_responses.len() > request_no - // 23 2 23 - // T1: skips poll loop that reads stdout - // T1: takes its response_23 - // pending_responses now looks like this: Front None None Back - // T2: does the while loop below - // pending_responses now looks like this: Front Back - // n_processed_responses now has value 25 - let res = output.pending_responses[request_no - n_processed_responses] - .take() - .expect("we own this request_no, nobody else is supposed to take it"); - while let Some(front) = output.pending_responses.front() { - if front.is_none() { - output.pending_responses.pop_front(); - output.n_processed_responses += 1; - } else { - break; - } - } - Ok(res) - } - - #[cfg(feature = "testing")] - fn record_and_log(&self, writebuf: &[u8]) { - let millis = std::time::SystemTime::now() - .duration_since(std::time::SystemTime::UNIX_EPOCH) - .unwrap() - .as_millis(); - - let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed); - - // these files will be collected to an allure report - let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len()); - - let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename); - - let res = std::fs::OpenOptions::new() - .write(true) - .create_new(true) - .read(true) - .open(path) - .and_then(|mut f| f.write_all(writebuf)); - - // trip up allowed_errors - if let Err(e) = res { - tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}"); - } else { - tracing::error!(filename, "erroring walredo input saved"); - } - } - - #[cfg(not(feature = "testing"))] - fn record_and_log(&self, _: &[u8]) {} 
-} - -impl Drop for WalRedoProcess { - fn drop(&mut self) { - self.child - .take() - .expect("we only do this once") - .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop); - // no way to wait for stderr_logger_task from Drop because that is async only - } -} - -/// Wrapper type around `std::process::Child` which guarantees that the child -/// will be killed and waited-for by this process before being dropped. -struct NoLeakChild { - tenant_id: TenantShardId, - child: Option, -} - -impl Deref for NoLeakChild { - type Target = Child; - - fn deref(&self) -> &Self::Target { - self.child.as_ref().expect("must not use from drop") - } -} - -impl DerefMut for NoLeakChild { - fn deref_mut(&mut self) -> &mut Self::Target { - self.child.as_mut().expect("must not use from drop") - } -} - -impl NoLeakChild { - fn spawn(tenant_id: TenantShardId, command: &mut Command) -> io::Result { - let child = command.spawn()?; - Ok(NoLeakChild { - tenant_id, - child: Some(child), - }) - } - - fn kill_and_wait(mut self, cause: WalRedoKillCause) { - let child = match self.child.take() { - Some(child) => child, - None => return, - }; - Self::kill_and_wait_impl(child, cause); - } - - #[instrument(skip_all, fields(pid=child.id(), ?cause))] - fn kill_and_wait_impl(mut child: Child, cause: WalRedoKillCause) { - scopeguard::defer! { - WAL_REDO_PROCESS_COUNTERS.killed_by_cause[cause].inc(); - } - let res = child.kill(); - if let Err(e) = res { - // This branch is very unlikely because: - // - We (= pageserver) spawned this process successfully, so, we're allowed to kill it. - // - This is the only place that calls .kill() - // - We consume `self`, so, .kill() can't be called twice. - // - If the process exited by itself or was killed by someone else, - // .kill() will still succeed because we haven't wait()'ed yet. - // - // So, if we arrive here, we have really no idea what happened, - // whether the PID stored in self.child is still valid, etc. - // If this function were fallible, we'd return an error, but - // since it isn't, all we can do is log an error and proceed - // with the wait(). - error!(error = %e, "failed to SIGKILL; subsequent wait() might fail or wait for wrong process"); - } - - match child.wait() { - Ok(exit_status) => { - info!(exit_status = %exit_status, "wait successful"); - } - Err(e) => { - error!(error = %e, "wait error; might leak the child process; it will show as zombie (defunct)"); - } - } - } -} - -impl Drop for NoLeakChild { - fn drop(&mut self) { - let child = match self.child.take() { - Some(child) => child, - None => return, - }; - let tenant_shard_id = self.tenant_id; - // Offload the kill+wait of the child process into the background. - // If someone stops the runtime, we'll leak the child process. - // We can ignore that case because we only stop the runtime on pageserver exit. - tokio::runtime::Handle::current().spawn(async move { - tokio::task::spawn_blocking(move || { - // Intentionally don't inherit the tracing context from whoever is dropping us. - // This thread here is going to outlive of our dropper. 
- let span = tracing::info_span!( - "walredo", - tenant_id = %tenant_shard_id.tenant_id, - shard_id = %tenant_shard_id.shard_slug() - ); - let _entered = span.enter(); - Self::kill_and_wait_impl(child, WalRedoKillCause::NoLeakChildDrop); - }) - .await - }); - } -} - -trait NoLeakChildCommandExt { - fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result; -} - -impl NoLeakChildCommandExt for Command { - fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result { - NoLeakChild::spawn(tenant_id, self) - } -} - -// Functions for constructing messages to send to the postgres WAL redo -// process. See pgxn/neon_walredo/walredoproc.c for -// explanation of the protocol. - -fn build_begin_redo_for_block_msg(tag: BufferTag, buf: &mut Vec) { - let len = 4 + 1 + 4 * 4; - - buf.put_u8(b'B'); - buf.put_u32(len as u32); - - tag.ser_into(buf) - .expect("serialize BufferTag should always succeed"); -} - -fn build_push_page_msg(tag: BufferTag, base_img: &[u8], buf: &mut Vec) { - assert!(base_img.len() == 8192); - - let len = 4 + 1 + 4 * 4 + base_img.len(); - - buf.put_u8(b'P'); - buf.put_u32(len as u32); - tag.ser_into(buf) - .expect("serialize BufferTag should always succeed"); - buf.put(base_img); -} - -fn build_apply_record_msg(endlsn: Lsn, rec: &[u8], buf: &mut Vec) { - let len = 4 + 8 + rec.len(); - - buf.put_u8(b'A'); - buf.put_u32(len as u32); - buf.put_u64(endlsn.0); - buf.put(rec); -} - -fn build_get_page_msg(tag: BufferTag, buf: &mut Vec) { - let len = 4 + 1 + 4 * 4; - - buf.put_u8(b'G'); - buf.put_u32(len as u32); - tag.ser_into(buf) - .expect("serialize BufferTag should always succeed"); -} - #[cfg(test)] mod tests { use super::PostgresRedoManager; diff --git a/pageserver/src/walredo/apply_neon.rs b/pageserver/src/walredo/apply_neon.rs new file mode 100644 index 0000000000..52899349c4 --- /dev/null +++ b/pageserver/src/walredo/apply_neon.rs @@ -0,0 +1,235 @@ +use crate::walrecord::NeonWalRecord; +use anyhow::Context; +use byteorder::{ByteOrder, LittleEndian}; +use bytes::BytesMut; +use pageserver_api::key::{key_to_rel_block, key_to_slru_block, Key}; +use pageserver_api::reltag::SlruKind; +use postgres_ffi::pg_constants; +use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM; +use postgres_ffi::v14::nonrelfile_utils::{ + mx_offset_to_flags_bitshift, mx_offset_to_flags_offset, mx_offset_to_member_offset, + transaction_id_set_status, +}; +use postgres_ffi::BLCKSZ; +use tracing::*; + +/// Can this request be served by neon redo functions +/// or we need to pass it to wal-redo postgres process? +pub(crate) fn can_apply_in_neon(rec: &NeonWalRecord) -> bool { + // Currently, we don't have bespoken Rust code to replay any + // Postgres WAL records. But everything else is handled in neon. 
+ #[allow(clippy::match_like_matches_macro)] + match rec { + NeonWalRecord::Postgres { + will_init: _, + rec: _, + } => false, + _ => true, + } +} + +pub(crate) fn apply_in_neon( + record: &NeonWalRecord, + key: Key, + page: &mut BytesMut, +) -> Result<(), anyhow::Error> { + match record { + NeonWalRecord::Postgres { + will_init: _, + rec: _, + } => { + anyhow::bail!("tried to pass postgres wal record to neon WAL redo"); + } + NeonWalRecord::ClearVisibilityMapFlags { + new_heap_blkno, + old_heap_blkno, + flags, + } => { + // sanity check that this is modifying the correct relation + let (rel, blknum) = key_to_rel_block(key).context("invalid record")?; + assert!( + rel.forknum == VISIBILITYMAP_FORKNUM, + "ClearVisibilityMapFlags record on unexpected rel {}", + rel + ); + if let Some(heap_blkno) = *new_heap_blkno { + // Calculate the VM block and offset that corresponds to the heap block. + let map_block = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blkno); + let map_byte = pg_constants::HEAPBLK_TO_MAPBYTE(heap_blkno); + let map_offset = pg_constants::HEAPBLK_TO_OFFSET(heap_blkno); + + // Check that we're modifying the correct VM block. + assert!(map_block == blknum); + + // equivalent to PageGetContents(page) + let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..]; + + map[map_byte as usize] &= !(flags << map_offset); + } + + // Repeat for 'old_heap_blkno', if any + if let Some(heap_blkno) = *old_heap_blkno { + let map_block = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blkno); + let map_byte = pg_constants::HEAPBLK_TO_MAPBYTE(heap_blkno); + let map_offset = pg_constants::HEAPBLK_TO_OFFSET(heap_blkno); + + assert!(map_block == blknum); + + let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..]; + + map[map_byte as usize] &= !(flags << map_offset); + } + } + // Non-relational WAL records are handled here, with custom code that has the + // same effects as the corresponding Postgres WAL redo function. + NeonWalRecord::ClogSetCommitted { xids, timestamp } => { + let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?; + assert_eq!( + slru_kind, + SlruKind::Clog, + "ClogSetCommitted record with unexpected key {}", + key + ); + for &xid in xids { + let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE; + let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; + let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; + + // Check that we're modifying the correct CLOG block. 
+ assert!( + segno == expected_segno, + "ClogSetCommitted record for XID {} with unexpected key {}", + xid, + key + ); + assert!( + blknum == expected_blknum, + "ClogSetCommitted record for XID {} with unexpected key {}", + xid, + key + ); + + transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_COMMITTED, page); + } + + // Append the timestamp + if page.len() == BLCKSZ as usize + 8 { + page.truncate(BLCKSZ as usize); + } + if page.len() == BLCKSZ as usize { + page.extend_from_slice(×tamp.to_be_bytes()); + } else { + warn!( + "CLOG blk {} in seg {} has invalid size {}", + blknum, + segno, + page.len() + ); + } + } + NeonWalRecord::ClogSetAborted { xids } => { + let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?; + assert_eq!( + slru_kind, + SlruKind::Clog, + "ClogSetAborted record with unexpected key {}", + key + ); + for &xid in xids { + let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE; + let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; + let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; + + // Check that we're modifying the correct CLOG block. + assert!( + segno == expected_segno, + "ClogSetAborted record for XID {} with unexpected key {}", + xid, + key + ); + assert!( + blknum == expected_blknum, + "ClogSetAborted record for XID {} with unexpected key {}", + xid, + key + ); + + transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_ABORTED, page); + } + } + NeonWalRecord::MultixactOffsetCreate { mid, moff } => { + let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?; + assert_eq!( + slru_kind, + SlruKind::MultiXactOffsets, + "MultixactOffsetCreate record with unexpected key {}", + key + ); + // Compute the block and offset to modify. + // See RecordNewMultiXact in PostgreSQL sources. + let pageno = mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32; + let entryno = mid % pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32; + let offset = (entryno * 4) as usize; + + // Check that we're modifying the correct multixact-offsets block. + let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; + let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; + assert!( + segno == expected_segno, + "MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}", + mid, + key + ); + assert!( + blknum == expected_blknum, + "MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}", + mid, + key + ); + + LittleEndian::write_u32(&mut page[offset..offset + 4], *moff); + } + NeonWalRecord::MultixactMembersCreate { moff, members } => { + let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?; + assert_eq!( + slru_kind, + SlruKind::MultiXactMembers, + "MultixactMembersCreate record with unexpected key {}", + key + ); + for (i, member) in members.iter().enumerate() { + let offset = moff + i as u32; + + // Compute the block and offset to modify. + // See RecordNewMultiXact in PostgreSQL sources. + let pageno = offset / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32; + let memberoff = mx_offset_to_member_offset(offset); + let flagsoff = mx_offset_to_flags_offset(offset); + let bshift = mx_offset_to_flags_bitshift(offset); + + // Check that we're modifying the correct multixact-members block. 
+ let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; + let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; + assert!( + segno == expected_segno, + "MultiXactMembersCreate record for offset {} with unexpected key {}", + moff, + key + ); + assert!( + blknum == expected_blknum, + "MultiXactMembersCreate record for offset {} with unexpected key {}", + moff, + key + ); + + let mut flagsval = LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]); + flagsval &= !(((1 << pg_constants::MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift); + flagsval |= member.status << bshift; + LittleEndian::write_u32(&mut page[flagsoff..flagsoff + 4], flagsval); + LittleEndian::write_u32(&mut page[memberoff..memberoff + 4], member.xid); + } + } + } + Ok(()) +} diff --git a/pageserver/src/walredo/process.rs b/pageserver/src/walredo/process.rs new file mode 100644 index 0000000000..85db3b4a4a --- /dev/null +++ b/pageserver/src/walredo/process.rs @@ -0,0 +1,406 @@ +use self::no_leak_child::NoLeakChild; +use crate::{ + config::PageServerConf, + metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER}, + walrecord::NeonWalRecord, +}; +use anyhow::Context; +use bytes::Bytes; +use nix::poll::{PollFd, PollFlags}; +use pageserver_api::{reltag::RelTag, shard::TenantShardId}; +use postgres_ffi::BLCKSZ; +use std::os::fd::AsRawFd; +#[cfg(feature = "testing")] +use std::sync::atomic::AtomicUsize; +use std::{ + collections::VecDeque, + io::{Read, Write}, + process::{ChildStdin, ChildStdout, Command, Stdio}, + sync::{Mutex, MutexGuard}, + time::Duration, +}; +use tracing::{debug, error, instrument, Instrument}; +use utils::{lsn::Lsn, nonblock::set_nonblock}; + +mod no_leak_child; +/// The IPC protocol that pageserver and walredo process speak over their shared pipe. +mod protocol; + +pub struct WalRedoProcess { + #[allow(dead_code)] + conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, + // Some() on construction, only becomes None on Drop. + child: Option, + stdout: Mutex, + stdin: Mutex, + /// Counter to separate same sized walredo inputs failing at the same millisecond. + #[cfg(feature = "testing")] + dump_sequence: AtomicUsize, +} + +struct ProcessInput { + stdin: ChildStdin, + n_requests: usize, +} + +struct ProcessOutput { + stdout: ChildStdout, + pending_responses: VecDeque>, + n_processed_responses: usize, +} + +impl WalRedoProcess { + // + // Start postgres binary in special WAL redo mode. + // + #[instrument(skip_all,fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), pg_version=pg_version))] + pub(crate) fn launch( + conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, + pg_version: u32, + ) -> anyhow::Result { + let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible. 
+ let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?; + + use no_leak_child::NoLeakChildCommandExt; + // Start postgres itself + let child = Command::new(pg_bin_dir_path.join("postgres")) + // the first arg must be --wal-redo so the child process enters into walredo mode + .arg("--wal-redo") + // the child doesn't process this arg, but, having it in the argv helps indentify the + // walredo process for a particular tenant when debugging a pagserver + .args(["--tenant-shard-id", &format!("{tenant_shard_id}")]) + .stdin(Stdio::piped()) + .stderr(Stdio::piped()) + .stdout(Stdio::piped()) + .env_clear() + .env("LD_LIBRARY_PATH", &pg_lib_dir_path) + .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) + // NB: The redo process is not trusted after we sent it the first + // walredo work. Before that, it is trusted. Specifically, we trust + // it to + // 1. close all file descriptors except stdin, stdout, stderr because + // pageserver might not be 100% diligent in setting FD_CLOEXEC on all + // the files it opens, and + // 2. to use seccomp to sandbox itself before processing the first + // walredo request. + .spawn_no_leak_child(tenant_shard_id) + .context("spawn process")?; + WAL_REDO_PROCESS_COUNTERS.started.inc(); + let mut child = scopeguard::guard(child, |child| { + error!("killing wal-redo-postgres process due to a problem during launch"); + child.kill_and_wait(WalRedoKillCause::Startup); + }); + + let stdin = child.stdin.take().unwrap(); + let stdout = child.stdout.take().unwrap(); + let stderr = child.stderr.take().unwrap(); + let stderr = tokio::process::ChildStderr::from_std(stderr) + .context("convert to tokio::ChildStderr")?; + macro_rules! set_nonblock_or_log_err { + ($file:ident) => {{ + let res = set_nonblock($file.as_raw_fd()); + if let Err(e) = &res { + error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed"); + } + res + }}; + } + set_nonblock_or_log_err!(stdin)?; + set_nonblock_or_log_err!(stdout)?; + + // all fallible operations post-spawn are complete, so get rid of the guard + let child = scopeguard::ScopeGuard::into_inner(child); + + tokio::spawn( + async move { + scopeguard::defer! { + debug!("wal-redo-postgres stderr_logger_task finished"); + crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc(); + } + debug!("wal-redo-postgres stderr_logger_task started"); + crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc(); + + use tokio::io::AsyncBufReadExt; + let mut stderr_lines = tokio::io::BufReader::new(stderr); + let mut buf = Vec::new(); + let res = loop { + buf.clear(); + // TODO we don't trust the process to cap its stderr length. + // Currently it can do unbounded Vec allocation. 
+ match stderr_lines.read_until(b'\n', &mut buf).await { + Ok(0) => break Ok(()), // eof + Ok(num_bytes) => { + let output = String::from_utf8_lossy(&buf[..num_bytes]); + error!(%output, "received output"); + } + Err(e) => { + break Err(e); + } + } + }; + match res { + Ok(()) => (), + Err(e) => { + error!(error=?e, "failed to read from walredo stderr"); + } + } + }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version)) + ); + + Ok(Self { + conf, + tenant_shard_id, + child: Some(child), + stdin: Mutex::new(ProcessInput { + stdin, + n_requests: 0, + }), + stdout: Mutex::new(ProcessOutput { + stdout, + pending_responses: VecDeque::new(), + n_processed_responses: 0, + }), + #[cfg(feature = "testing")] + dump_sequence: AtomicUsize::default(), + }) + } + + pub(crate) fn id(&self) -> u32 { + self.child + .as_ref() + .expect("must not call this during Drop") + .id() + } + + // Apply given WAL records ('records') over an old page image. Returns + // new page image. + // + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))] + pub(crate) fn apply_wal_records( + &self, + rel: RelTag, + blknum: u32, + base_img: &Option, + records: &[(Lsn, NeonWalRecord)], + wal_redo_timeout: Duration, + ) -> anyhow::Result { + let tag = protocol::BufferTag { rel, blknum }; + let input = self.stdin.lock().unwrap(); + + // Serialize all the messages to send the WAL redo process first. + // + // This could be problematic if there are millions of records to replay, + // but in practice the number of records is usually so small that it doesn't + // matter, and it's better to keep this code simple. + // + // Most requests start with a before-image with BLCKSZ bytes, followed by + // by some other WAL records. Start with a buffer that can hold that + // comfortably. + let mut writebuf: Vec = Vec::with_capacity((BLCKSZ as usize) * 3); + protocol::build_begin_redo_for_block_msg(tag, &mut writebuf); + if let Some(img) = base_img { + protocol::build_push_page_msg(tag, img, &mut writebuf); + } + for (lsn, rec) in records.iter() { + if let NeonWalRecord::Postgres { + will_init: _, + rec: postgres_rec, + } = rec + { + protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf); + } else { + anyhow::bail!("tried to pass neon wal record to postgres WAL redo"); + } + } + protocol::build_get_page_msg(tag, &mut writebuf); + WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64); + + let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout); + + if res.is_err() { + // not all of these can be caused by this particular input, however these are so rare + // in tests so capture all. + self.record_and_log(&writebuf); + } + + res + } + + fn apply_wal_records0( + &self, + writebuf: &[u8], + input: MutexGuard, + wal_redo_timeout: Duration, + ) -> anyhow::Result { + let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small. + let mut nwrite = 0usize; + + while nwrite < writebuf.len() { + let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)]; + let n = loop { + match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) { + Err(nix::errno::Errno::EINTR) => continue, + res => break res, + } + }?; + + if n == 0 { + anyhow::bail!("WAL redo timed out"); + } + + // If 'stdin' is writeable, do write. 
+ let in_revents = stdin_pollfds[0].revents().unwrap(); + if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() { + nwrite += proc.stdin.write(&writebuf[nwrite..])?; + } + if in_revents.contains(PollFlags::POLLHUP) { + // We still have more data to write, but the process closed the pipe. + anyhow::bail!("WAL redo process closed its stdin unexpectedly"); + } + } + let request_no = proc.n_requests; + proc.n_requests += 1; + drop(proc); + + // To improve walredo performance we separate sending requests and receiving + // responses. Them are protected by different mutexes (output and input). + // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process + // then there is not warranty that T1 will first granted output mutex lock. + // To address this issue we maintain number of sent requests, number of processed + // responses and ring buffer with pending responses. After sending response + // (under input mutex), threads remembers request number. Then it releases + // input mutex, locks output mutex and fetch in ring buffer all responses until + // its stored request number. The it takes correspondent element from + // pending responses ring buffer and truncate all empty elements from the front, + // advancing processed responses number. + + let mut output = self.stdout.lock().unwrap(); + let n_processed_responses = output.n_processed_responses; + while n_processed_responses + output.pending_responses.len() <= request_no { + // We expect the WAL redo process to respond with an 8k page image. We read it + // into this buffer. + let mut resultbuf = vec![0; BLCKSZ.into()]; + let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far + while nresult < BLCKSZ.into() { + let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)]; + // We do two things simultaneously: reading response from stdout + // and forward any logging information that the child writes to its stderr to the page server's log. + let n = loop { + match nix::poll::poll( + &mut stdout_pollfds[..], + wal_redo_timeout.as_millis() as i32, + ) { + Err(nix::errno::Errno::EINTR) => continue, + res => break res, + } + }?; + + if n == 0 { + anyhow::bail!("WAL redo timed out"); + } + + // If we have some data in stdout, read it to the result buffer. + let out_revents = stdout_pollfds[0].revents().unwrap(); + if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() { + nresult += output.stdout.read(&mut resultbuf[nresult..])?; + } + if out_revents.contains(PollFlags::POLLHUP) { + anyhow::bail!("WAL redo process closed its stdout unexpectedly"); + } + } + output + .pending_responses + .push_back(Some(Bytes::from(resultbuf))); + } + // Replace our request's response with None in `pending_responses`. + // Then make space in the ring buffer by clearing out any seqence of contiguous + // `None`'s from the front of `pending_responses`. 
+ // NB: We can't pop_front() because other requests' responses because another + // requester might have grabbed the output mutex before us: + // T1: grab input mutex + // T1: send request_no 23 + // T1: release input mutex + // T2: grab input mutex + // T2: send request_no 24 + // T2: release input mutex + // T2: grab output mutex + // T2: n_processed_responses + output.pending_responses.len() <= request_no + // 23 0 24 + // T2: enters poll loop that reads stdout + // T2: put response for 23 into pending_responses + // T2: put response for 24 into pending_resposnes + // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back + // T2: takes its response_24 + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: does the while loop below + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: releases output mutex + // T1: grabs output mutex + // T1: n_processed_responses + output.pending_responses.len() > request_no + // 23 2 23 + // T1: skips poll loop that reads stdout + // T1: takes its response_23 + // pending_responses now looks like this: Front None None Back + // T2: does the while loop below + // pending_responses now looks like this: Front Back + // n_processed_responses now has value 25 + let res = output.pending_responses[request_no - n_processed_responses] + .take() + .expect("we own this request_no, nobody else is supposed to take it"); + while let Some(front) = output.pending_responses.front() { + if front.is_none() { + output.pending_responses.pop_front(); + output.n_processed_responses += 1; + } else { + break; + } + } + Ok(res) + } + + #[cfg(feature = "testing")] + fn record_and_log(&self, writebuf: &[u8]) { + use std::sync::atomic::Ordering; + + let millis = std::time::SystemTime::now() + .duration_since(std::time::SystemTime::UNIX_EPOCH) + .unwrap() + .as_millis(); + + let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed); + + // these files will be collected to an allure report + let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len()); + + let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename); + + let res = std::fs::OpenOptions::new() + .write(true) + .create_new(true) + .read(true) + .open(path) + .and_then(|mut f| f.write_all(writebuf)); + + // trip up allowed_errors + if let Err(e) = res { + tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}"); + } else { + tracing::error!(filename, "erroring walredo input saved"); + } + } + + #[cfg(not(feature = "testing"))] + fn record_and_log(&self, _: &[u8]) {} +} + +impl Drop for WalRedoProcess { + fn drop(&mut self) { + self.child + .take() + .expect("we only do this once") + .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop); + // no way to wait for stderr_logger_task from Drop because that is async only + } +} diff --git a/pageserver/src/walredo/process/no_leak_child.rs b/pageserver/src/walredo/process/no_leak_child.rs new file mode 100644 index 0000000000..ca016408e6 --- /dev/null +++ b/pageserver/src/walredo/process/no_leak_child.rs @@ -0,0 +1,126 @@ +use tracing; +use tracing::error; +use tracing::info; +use tracing::instrument; + +use crate::metrics::WalRedoKillCause; +use crate::metrics::WAL_REDO_PROCESS_COUNTERS; + +use std::io; +use std::process::Command; + +use std::ops::DerefMut; + +use std::ops::Deref; + +use std::process::Child; + +use pageserver_api::shard::TenantShardId; + +/// Wrapper type around `std::process::Child` 
which guarantees that the child +/// will be killed and waited-for by this process before being dropped. +pub(crate) struct NoLeakChild { + pub(crate) tenant_id: TenantShardId, + pub(crate) child: Option, +} + +impl Deref for NoLeakChild { + type Target = Child; + + fn deref(&self) -> &Self::Target { + self.child.as_ref().expect("must not use from drop") + } +} + +impl DerefMut for NoLeakChild { + fn deref_mut(&mut self) -> &mut Self::Target { + self.child.as_mut().expect("must not use from drop") + } +} + +impl NoLeakChild { + pub(crate) fn spawn(tenant_id: TenantShardId, command: &mut Command) -> io::Result { + let child = command.spawn()?; + Ok(NoLeakChild { + tenant_id, + child: Some(child), + }) + } + + pub(crate) fn kill_and_wait(mut self, cause: WalRedoKillCause) { + let child = match self.child.take() { + Some(child) => child, + None => return, + }; + Self::kill_and_wait_impl(child, cause); + } + + #[instrument(skip_all, fields(pid=child.id(), ?cause))] + pub(crate) fn kill_and_wait_impl(mut child: Child, cause: WalRedoKillCause) { + scopeguard::defer! { + WAL_REDO_PROCESS_COUNTERS.killed_by_cause[cause].inc(); + } + let res = child.kill(); + if let Err(e) = res { + // This branch is very unlikely because: + // - We (= pageserver) spawned this process successfully, so, we're allowed to kill it. + // - This is the only place that calls .kill() + // - We consume `self`, so, .kill() can't be called twice. + // - If the process exited by itself or was killed by someone else, + // .kill() will still succeed because we haven't wait()'ed yet. + // + // So, if we arrive here, we have really no idea what happened, + // whether the PID stored in self.child is still valid, etc. + // If this function were fallible, we'd return an error, but + // since it isn't, all we can do is log an error and proceed + // with the wait(). + error!(error = %e, "failed to SIGKILL; subsequent wait() might fail or wait for wrong process"); + } + + match child.wait() { + Ok(exit_status) => { + info!(exit_status = %exit_status, "wait successful"); + } + Err(e) => { + error!(error = %e, "wait error; might leak the child process; it will show as zombie (defunct)"); + } + } + } +} + +impl Drop for NoLeakChild { + fn drop(&mut self) { + let child = match self.child.take() { + Some(child) => child, + None => return, + }; + let tenant_shard_id = self.tenant_id; + // Offload the kill+wait of the child process into the background. + // If someone stops the runtime, we'll leak the child process. + // We can ignore that case because we only stop the runtime on pageserver exit. + tokio::runtime::Handle::current().spawn(async move { + tokio::task::spawn_blocking(move || { + // Intentionally don't inherit the tracing context from whoever is dropping us. + // This thread here is going to outlive of our dropper. 
+ let span = tracing::info_span!( + "walredo", + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug() + ); + let _entered = span.enter(); + Self::kill_and_wait_impl(child, WalRedoKillCause::NoLeakChildDrop); + }) + .await + }); + } +} + +pub(crate) trait NoLeakChildCommandExt { + fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result; +} + +impl NoLeakChildCommandExt for Command { + fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result { + NoLeakChild::spawn(tenant_id, self) + } +} diff --git a/pageserver/src/walredo/process/protocol.rs b/pageserver/src/walredo/process/protocol.rs new file mode 100644 index 0000000000..b703344cc8 --- /dev/null +++ b/pageserver/src/walredo/process/protocol.rs @@ -0,0 +1,57 @@ +use bytes::BufMut; +use pageserver_api::reltag::RelTag; +use serde::Serialize; +use utils::bin_ser::BeSer; +use utils::lsn::Lsn; + +/// +/// `RelTag` + block number (`blknum`) gives us a unique id of the page in the cluster. +/// +/// In Postgres `BufferTag` structure is used for exactly the same purpose. +/// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/buf_internals.h#L91). +/// +#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Serialize)] +pub(crate) struct BufferTag { + pub rel: RelTag, + pub blknum: u32, +} + +pub(crate) fn build_begin_redo_for_block_msg(tag: BufferTag, buf: &mut Vec) { + let len = 4 + 1 + 4 * 4; + + buf.put_u8(b'B'); + buf.put_u32(len as u32); + + tag.ser_into(buf) + .expect("serialize BufferTag should always succeed"); +} + +pub(crate) fn build_push_page_msg(tag: BufferTag, base_img: &[u8], buf: &mut Vec) { + assert!(base_img.len() == 8192); + + let len = 4 + 1 + 4 * 4 + base_img.len(); + + buf.put_u8(b'P'); + buf.put_u32(len as u32); + tag.ser_into(buf) + .expect("serialize BufferTag should always succeed"); + buf.put(base_img); +} + +pub(crate) fn build_apply_record_msg(endlsn: Lsn, rec: &[u8], buf: &mut Vec) { + let len = 4 + 8 + rec.len(); + + buf.put_u8(b'A'); + buf.put_u32(len as u32); + buf.put_u64(endlsn.0); + buf.put(rec); +} + +pub(crate) fn build_get_page_msg(tag: BufferTag, buf: &mut Vec) { + let len = 4 + 1 + 4 * 4; + + buf.put_u8(b'G'); + buf.put_u32(len as u32); + tag.ser_into(buf) + .expect("serialize BufferTag should always succeed"); +} From 431f4234d43f3fe42fbda441e601a89d2421b52e Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 6 Feb 2024 10:07:10 +0000 Subject: [PATCH 0093/1571] storage controller: embed database migrations in binary (#6637) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem We don't have a neat way to carry around migration .sql files during deploy, and in any case would prefer to avoid depending on diesel CLI to deploy. ## Summary of changes - Use `diesel_migrations` crate to embed migrations in our binary - Run migrations on startup - Drop the diesel dependency in the `neon_local` binary, as the attachment_service binary just needs the database to exist. Do database creation with a simple `createdb`. 
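For reference, a minimal standalone sketch of the embed-and-run pattern described in the summary above (assuming `diesel`/`diesel_migrations` 2.x, an `anyhow` error type, and a `./migrations` directory at the crate root; the actual change below additionally routes harness output to stdout):

```rust
use diesel::{Connection, PgConnection};
use diesel_migrations::{embed_migrations, EmbeddedMigrations, MigrationHarness};

// Compiled into the binary at build time; no .sql files need to ship with the deploy.
pub const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations");

fn migration_run(database_url: &str) -> anyhow::Result<()> {
    let mut conn = PgConnection::establish(database_url)?;
    // Applies any migrations not yet recorded in the __diesel_schema_migrations table.
    conn.run_pending_migrations(MIGRATIONS)
        .map_err(|e| anyhow::anyhow!(e))?;
    Ok(())
}
```
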
Co-authored-by: Arpad Müller --- Cargo.lock | 1 + control_plane/attachment_service/Cargo.toml | 1 + control_plane/attachment_service/src/main.rs | 24 +++++++++++++++++++- 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index c16331636a..b2b2777408 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -281,6 +281,7 @@ dependencies = [ "clap", "control_plane", "diesel", + "diesel_migrations", "futures", "git-version", "hyper", diff --git a/control_plane/attachment_service/Cargo.toml b/control_plane/attachment_service/Cargo.toml index d3c62d74d2..3a65153c41 100644 --- a/control_plane/attachment_service/Cargo.toml +++ b/control_plane/attachment_service/Cargo.toml @@ -25,6 +25,7 @@ tokio-util.workspace = true tracing.workspace = true diesel = { version = "2.1.4", features = ["serde_json", "postgres"] } +diesel_migrations = { version = "2.1.0" } utils = { path = "../../libs/utils/" } metrics = { path = "../../libs/metrics/" } diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs index 37b06c4090..7ac5918244 100644 --- a/control_plane/attachment_service/src/main.rs +++ b/control_plane/attachment_service/src/main.rs @@ -4,13 +4,14 @@ /// This enables running & testing pageservers without a full-blown /// deployment of the Neon cloud platform. /// -use anyhow::anyhow; +use anyhow::{anyhow, Context}; use attachment_service::http::make_router; use attachment_service::persistence::Persistence; use attachment_service::service::{Config, Service}; use aws_config::{self, BehaviorVersion, Region}; use camino::Utf8PathBuf; use clap::Parser; +use diesel::Connection; use metrics::launch_timestamp::LaunchTimestamp; use std::sync::Arc; use tokio::signal::unix::SignalKind; @@ -22,6 +23,9 @@ use utils::{project_build_tag, project_git_version, tcp_listener}; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); +use diesel_migrations::{embed_migrations, EmbeddedMigrations}; +pub const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations"); + #[derive(Parser)] #[command(author, version, about, long_about = None)] #[command(arg_required_else_help(true))] @@ -166,6 +170,19 @@ impl Secrets { } } +async fn migration_run(database_url: &str) -> anyhow::Result<()> { + use diesel::PgConnection; + use diesel_migrations::{HarnessWithOutput, MigrationHarness}; + let mut conn = PgConnection::establish(database_url)?; + + HarnessWithOutput::write_to_stdout(&mut conn) + .run_pending_migrations(MIGRATIONS) + .map(|_| ()) + .map_err(|e| anyhow::anyhow!(e))?; + + Ok(()) +} + #[tokio::main] async fn main() -> anyhow::Result<()> { let launch_ts = Box::leak(Box::new(LaunchTimestamp::generate())); @@ -194,6 +211,11 @@ async fn main() -> anyhow::Result<()> { compute_hook_url: args.compute_hook_url, }; + // After loading secrets & config, but before starting anything else, apply database migrations + migration_run(&secrets.database_url) + .await + .context("Running database migrations")?; + let json_path = args.path; let persistence = Arc::new(Persistence::new(secrets.database_url, json_path.clone())); From 53743991decd9f1d13fd5063a8e840a38cbda383 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 6 Feb 2024 13:34:13 +0200 Subject: [PATCH 0094/1571] uploader: avoid cloning vecs just to get Bytes (#6645) Fix cloning the serialized heatmap on every attempt by just turning it into `bytes::Bytes` before clone so it will be a refcounted instead of refcounting a vec clone later on. 
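As a minimal standalone sketch of the pattern (assuming only the `bytes` crate): serialize once, hand the buffer to `Bytes` once, and let every retry clone the refcounted handle instead of the buffer.

```rust
use bytes::Bytes;

fn main() {
    // Serialize once into a Vec<u8>...
    let serialized: Vec<u8> = br#"{"heatmap":"example"}"#.to_vec();
    // ...then give the allocation to `Bytes` exactly once.
    let payload = Bytes::from(serialized);

    // Retries can now clone cheaply: `Bytes::clone` bumps a reference count
    // and never copies the underlying buffer.
    for attempt in 0..3 {
        let body = payload.clone();
        println!("attempt {attempt}: would upload {} bytes", body.len());
    }
}
```
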
Also fixes one cancellation token cloning I had missed in #6618. Cc: #6096 --- .../src/tenant/secondary/heatmap_uploader.rs | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs index fff29b2487..806e3fb0e8 100644 --- a/pageserver/src/tenant/secondary/heatmap_uploader.rs +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -371,8 +371,6 @@ async fn upload_tenant_heatmap( }; let timelines = tenant.timelines.lock().unwrap().clone(); - let tenant_cancel = tenant.cancel.clone(); - // Ensure that Tenant::shutdown waits for any upload in flight: this is needed because otherwise // when we delete a tenant, we might race with an upload in flight and end up leaving a heatmap behind // in remote storage. @@ -401,6 +399,7 @@ async fn upload_tenant_heatmap( // Serialize the heatmap let bytes = serde_json::to_vec(&heatmap).map_err(|e| anyhow::anyhow!(e))?; + let bytes = bytes::Bytes::from(bytes); let size = bytes.len(); // Drop out early if nothing changed since our last upload @@ -411,13 +410,12 @@ async fn upload_tenant_heatmap( let path = remote_heatmap_path(tenant.get_tenant_shard_id()); - // Write the heatmap. + let cancel = &tenant.cancel; + tracing::debug!("Uploading {size} byte heatmap to {path}"); if let Err(e) = backoff::retry( || async { - let bytes = futures::stream::once(futures::future::ready(Ok(bytes::Bytes::from( - bytes.clone(), - )))); + let bytes = futures::stream::once(futures::future::ready(Ok(bytes.clone()))); remote_storage .upload_storage_object(bytes, size, &path) .await @@ -426,13 +424,13 @@ async fn upload_tenant_heatmap( 3, u32::MAX, "Uploading heatmap", - &tenant_cancel, + cancel, ) .await .ok_or_else(|| anyhow::anyhow!("Shutting down")) .and_then(|x| x) { - if tenant_cancel.is_cancelled() { + if cancel.is_cancelled() { return Err(UploadHeatmapError::Cancelled); } else { return Err(e.into()); From 0de46fd6f265e1ef0d27b0ab0f51fb7da2e52705 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 6 Feb 2024 13:04:15 +0100 Subject: [PATCH 0095/1571] heavier_once_cell: switch to tokio::sync::RwLock (#6589) Using the RwLock reduces contention on the hot path. Co-authored-by: Joonas Koivunen --- libs/utils/src/sync/heavier_once_cell.rs | 153 ++++++++++++++----- pageserver/src/tenant/storage_layer/layer.rs | 24 +-- pageserver/src/tenant/timeline.rs | 2 +- 3 files changed, 127 insertions(+), 52 deletions(-) diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs index 0ccaf4e716..f733d107f1 100644 --- a/libs/utils/src/sync/heavier_once_cell.rs +++ b/libs/utils/src/sync/heavier_once_cell.rs @@ -1,6 +1,6 @@ use std::sync::{ atomic::{AtomicUsize, Ordering}, - Arc, Mutex, MutexGuard, + Arc, }; use tokio::sync::Semaphore; @@ -12,7 +12,7 @@ use tokio::sync::Semaphore; /// /// [`OwnedSemaphorePermit`]: tokio::sync::OwnedSemaphorePermit pub struct OnceCell { - inner: Mutex>, + inner: tokio::sync::RwLock>, initializers: AtomicUsize, } @@ -50,7 +50,7 @@ impl OnceCell { let sem = Semaphore::new(1); sem.close(); Self { - inner: Mutex::new(Inner { + inner: tokio::sync::RwLock::new(Inner { init_semaphore: Arc::new(sem), value: Some(value), }), @@ -61,18 +61,18 @@ impl OnceCell { /// Returns a guard to an existing initialized value, or uniquely initializes the value before /// returning the guard. /// - /// Initializing might wait on any existing [`Guard::take_and_deinit`] deinitialization. 
+ /// Initializing might wait on any existing [`GuardMut::take_and_deinit`] deinitialization. /// /// Initialization is panic-safe and cancellation-safe. - pub async fn get_or_init(&self, factory: F) -> Result, E> + pub async fn get_mut_or_init(&self, factory: F) -> Result, E> where F: FnOnce(InitPermit) -> Fut, Fut: std::future::Future>, { let sem = { - let guard = self.inner.lock().unwrap(); + let guard = self.inner.write().await; if guard.value.is_some() { - return Ok(Guard(guard)); + return Ok(GuardMut(guard)); } guard.init_semaphore.clone() }; @@ -88,29 +88,72 @@ impl OnceCell { let permit = InitPermit(permit); let (value, _permit) = factory(permit).await?; - let guard = self.inner.lock().unwrap(); + let guard = self.inner.write().await; Ok(Self::set0(value, guard)) } Err(_closed) => { - let guard = self.inner.lock().unwrap(); + let guard = self.inner.write().await; assert!( guard.value.is_some(), "semaphore got closed, must be initialized" ); - return Ok(Guard(guard)); + return Ok(GuardMut(guard)); } } } - /// Assuming a permit is held after previous call to [`Guard::take_and_deinit`], it can be used + /// Returns a guard to an existing initialized value, or uniquely initializes the value before + /// returning the guard. + /// + /// Initialization is panic-safe and cancellation-safe. + pub async fn get_or_init(&self, factory: F) -> Result, E> + where + F: FnOnce(InitPermit) -> Fut, + Fut: std::future::Future>, + { + let sem = { + let guard = self.inner.read().await; + if guard.value.is_some() { + return Ok(GuardRef(guard)); + } + guard.init_semaphore.clone() + }; + + let permit = { + // increment the count for the duration of queued + let _guard = CountWaitingInitializers::start(self); + sem.acquire_owned().await + }; + + match permit { + Ok(permit) => { + let permit = InitPermit(permit); + let (value, _permit) = factory(permit).await?; + + let guard = self.inner.write().await; + + Ok(Self::set0(value, guard).downgrade()) + } + Err(_closed) => { + let guard = self.inner.read().await; + assert!( + guard.value.is_some(), + "semaphore got closed, must be initialized" + ); + return Ok(GuardRef(guard)); + } + } + } + + /// Assuming a permit is held after previous call to [`GuardMut::take_and_deinit`], it can be used /// to complete initializing the inner value. /// /// # Panics /// /// If the inner has already been initialized. - pub fn set(&self, value: T, _permit: InitPermit) -> Guard<'_, T> { - let guard = self.inner.lock().unwrap(); + pub async fn set(&self, value: T, _permit: InitPermit) -> GuardMut<'_, T> { + let guard = self.inner.write().await; // cannot assert that this permit is for self.inner.semaphore, but we can assert it cannot // give more permits right now. @@ -122,21 +165,31 @@ impl OnceCell { Self::set0(value, guard) } - fn set0(value: T, mut guard: std::sync::MutexGuard<'_, Inner>) -> Guard<'_, T> { + fn set0(value: T, mut guard: tokio::sync::RwLockWriteGuard<'_, Inner>) -> GuardMut<'_, T> { if guard.value.is_some() { drop(guard); unreachable!("we won permit, must not be initialized"); } guard.value = Some(value); guard.init_semaphore.close(); - Guard(guard) + GuardMut(guard) } /// Returns a guard to an existing initialized value, if any. - pub fn get(&self) -> Option> { - let guard = self.inner.lock().unwrap(); + pub async fn get_mut(&self) -> Option> { + let guard = self.inner.write().await; if guard.value.is_some() { - Some(Guard(guard)) + Some(GuardMut(guard)) + } else { + None + } + } + + /// Returns a guard to an existing initialized value, if any. 
+ pub async fn get(&self) -> Option> { + let guard = self.inner.read().await; + if guard.value.is_some() { + Some(GuardRef(guard)) } else { None } @@ -168,9 +221,9 @@ impl<'a, T> Drop for CountWaitingInitializers<'a, T> { /// Uninteresting guard object to allow short-lived access to inspect or clone the held, /// initialized value. #[derive(Debug)] -pub struct Guard<'a, T>(MutexGuard<'a, Inner>); +pub struct GuardMut<'a, T>(tokio::sync::RwLockWriteGuard<'a, Inner>); -impl std::ops::Deref for Guard<'_, T> { +impl std::ops::Deref for GuardMut<'_, T> { type Target = T; fn deref(&self) -> &Self::Target { @@ -181,7 +234,7 @@ impl std::ops::Deref for Guard<'_, T> { } } -impl std::ops::DerefMut for Guard<'_, T> { +impl std::ops::DerefMut for GuardMut<'_, T> { fn deref_mut(&mut self) -> &mut Self::Target { self.0 .value @@ -190,7 +243,7 @@ impl std::ops::DerefMut for Guard<'_, T> { } } -impl<'a, T> Guard<'a, T> { +impl<'a, T> GuardMut<'a, T> { /// Take the current value, and a new permit for it's deinitialization. /// /// The permit will be on a semaphore part of the new internal value, and any following @@ -208,6 +261,24 @@ impl<'a, T> Guard<'a, T> { .map(|v| (v, InitPermit(permit))) .expect("guard is not created unless value has been initialized") } + + pub fn downgrade(self) -> GuardRef<'a, T> { + GuardRef(self.0.downgrade()) + } +} + +#[derive(Debug)] +pub struct GuardRef<'a, T>(tokio::sync::RwLockReadGuard<'a, Inner>); + +impl std::ops::Deref for GuardRef<'_, T> { + type Target = T; + + fn deref(&self) -> &Self::Target { + self.0 + .value + .as_ref() + .expect("guard is not created unless value has been initialized") + } } /// Type held by OnceCell (de)initializing task. @@ -248,7 +319,7 @@ mod tests { barrier.wait().await; let won = { let g = cell - .get_or_init(|permit| { + .get_mut_or_init(|permit| { counters.factory_got_to_run.fetch_add(1, Ordering::Relaxed); async { counters.future_polled.fetch_add(1, Ordering::Relaxed); @@ -295,7 +366,11 @@ mod tests { let cell = cell.clone(); let deinitialization_started = deinitialization_started.clone(); async move { - let (answer, _permit) = cell.get().expect("initialized to value").take_and_deinit(); + let (answer, _permit) = cell + .get_mut() + .await + .expect("initialized to value") + .take_and_deinit(); assert_eq!(answer, initial); deinitialization_started.wait().await; @@ -306,7 +381,7 @@ mod tests { deinitialization_started.wait().await; let started_at = tokio::time::Instant::now(); - cell.get_or_init(|permit| async { Ok::<_, Infallible>((reinit, permit)) }) + cell.get_mut_or_init(|permit| async { Ok::<_, Infallible>((reinit, permit)) }) .await .unwrap(); @@ -318,21 +393,21 @@ mod tests { jh.await.unwrap(); - assert_eq!(*cell.get().unwrap(), reinit); + assert_eq!(*cell.get_mut().await.unwrap(), reinit); } - #[test] - fn reinit_with_deinit_permit() { + #[tokio::test] + async fn reinit_with_deinit_permit() { let cell = Arc::new(OnceCell::new(42)); - let (mol, permit) = cell.get().unwrap().take_and_deinit(); - cell.set(5, permit); - assert_eq!(*cell.get().unwrap(), 5); + let (mol, permit) = cell.get_mut().await.unwrap().take_and_deinit(); + cell.set(5, permit).await; + assert_eq!(*cell.get_mut().await.unwrap(), 5); - let (five, permit) = cell.get().unwrap().take_and_deinit(); + let (five, permit) = cell.get_mut().await.unwrap().take_and_deinit(); assert_eq!(5, five); - cell.set(mol, permit); - assert_eq!(*cell.get().unwrap(), 42); + cell.set(mol, permit).await; + assert_eq!(*cell.get_mut().await.unwrap(), 42); } #[tokio::test] @@ -340,13 +415,13 
@@ mod tests { let cell = OnceCell::default(); for _ in 0..10 { - cell.get_or_init(|_permit| async { Err("whatever error") }) + cell.get_mut_or_init(|_permit| async { Err("whatever error") }) .await .unwrap_err(); } let g = cell - .get_or_init(|permit| async { Ok::<_, Infallible>(("finally success", permit)) }) + .get_mut_or_init(|permit| async { Ok::<_, Infallible>(("finally success", permit)) }) .await .unwrap(); assert_eq!(*g, "finally success"); @@ -358,7 +433,7 @@ mod tests { let barrier = tokio::sync::Barrier::new(2); - let initializer = cell.get_or_init(|permit| async { + let initializer = cell.get_mut_or_init(|permit| async { barrier.wait().await; futures::future::pending::<()>().await; @@ -372,10 +447,10 @@ mod tests { // now initializer is dropped - assert!(cell.get().is_none()); + assert!(cell.get_mut().await.is_none()); let g = cell - .get_or_init(|permit| async { Ok::<_, Infallible>(("now initialized", permit)) }) + .get_mut_or_init(|permit| async { Ok::<_, Infallible>(("now initialized", permit)) }) .await .unwrap(); assert_eq!(*g, "now initialized"); diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 12af866810..1f337adf53 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -299,8 +299,8 @@ impl Layer { }) } - pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo { - self.0.info(reset) + pub(crate) async fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo { + self.0.info(reset).await } pub(crate) fn access_stats(&self) -> &LayerAccessStats { @@ -611,10 +611,10 @@ impl LayerInner { let mut rx = self.status.subscribe(); let strong = { - match self.inner.get() { + match self.inner.get_mut().await { Some(mut either) => { self.wanted_evicted.store(true, Ordering::Relaxed); - either.downgrade() + ResidentOrWantedEvicted::downgrade(&mut either) } None => return Err(EvictionError::NotFound), } @@ -640,7 +640,7 @@ impl LayerInner { // use however late (compared to the initial expressing of wanted) as the // "outcome" now LAYER_IMPL_METRICS.inc_broadcast_lagged(); - match self.inner.get() { + match self.inner.get_mut().await { Some(_) => Err(EvictionError::Downloaded), None => Ok(()), } @@ -758,7 +758,7 @@ impl LayerInner { // use the already held initialization permit because it is impossible to hit the // below paths anymore essentially limiting the max loop iterations to 2. let (value, init_permit) = download(init_permit).await?; - let mut guard = self.inner.set(value, init_permit); + let mut guard = self.inner.set(value, init_permit).await; let (strong, _upgraded) = guard .get_and_upgrade() .expect("init creates strong reference, we held the init permit"); @@ -766,7 +766,7 @@ impl LayerInner { } let (weak, permit) = { - let mut locked = self.inner.get_or_init(download).await?; + let mut locked = self.inner.get_mut_or_init(download).await?; if let Some((strong, upgraded)) = locked.get_and_upgrade() { if upgraded { @@ -986,12 +986,12 @@ impl LayerInner { } } - fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo { + async fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo { let layer_file_name = self.desc.filename().file_name(); // this is not accurate: we could have the file locally but there was a cancellation // and now we are not in sync, or we are currently downloading it. 
- let remote = self.inner.get().is_none(); + let remote = self.inner.get_mut().await.is_none(); let access_stats = self.access_stats.as_api_model(reset); @@ -1050,7 +1050,7 @@ impl LayerInner { LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone); return; }; - match this.evict_blocking(version) { + match tokio::runtime::Handle::current().block_on(this.evict_blocking(version)) { Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(), Err(reason) => LAYER_IMPL_METRICS.inc_eviction_cancelled(reason), } @@ -1058,7 +1058,7 @@ impl LayerInner { } } - fn evict_blocking(&self, only_version: usize) -> Result<(), EvictionCancelled> { + async fn evict_blocking(&self, only_version: usize) -> Result<(), EvictionCancelled> { // deleted or detached timeline, don't do anything. let Some(timeline) = self.timeline.upgrade() else { return Err(EvictionCancelled::TimelineGone); @@ -1067,7 +1067,7 @@ impl LayerInner { // to avoid starting a new download while we evict, keep holding on to the // permit. let _permit = { - let maybe_downloaded = self.inner.get(); + let maybe_downloaded = self.inner.get_mut().await; let (_weak, permit) = match maybe_downloaded { Some(mut guard) => { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 0ba3fe728a..50ffc4d265 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1268,7 +1268,7 @@ impl Timeline { let mut historic_layers = Vec::new(); for historic_layer in layer_map.iter_historic_layers() { let historic_layer = guard.get_from_desc(&historic_layer); - historic_layers.push(historic_layer.info(reset)); + historic_layers.push(historic_layer.info(reset).await); } LayerMapInfo { From dae56ef60ca33643b3d80b4d2497fb6902620db0 Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Tue, 6 Feb 2024 13:15:42 +0100 Subject: [PATCH 0096/1571] Do not suspend compute if there is an active logical replication subscription. (#6570) ## Problem the idea is to keep compute up and running if there are any active logical replication subscriptions. ### Rationale Rationale: - The Write-Ahead Logging (WAL) files, which contain the data changes, will need to be retained on the publisher side until the subscriber is able to connect again and apply these changes. This could potentially lead to increased disk usage on the publisher - and we do not want to disrupt the source - I think it is more pain for our customer to resolve storage issues on the source than to pay for the compute at the target. - Upon resuming the compute resources, the subscriber will start consuming and applying the changes from the retained WAL files. The time taken to catch up will depend on the volume of changes and the configured vCPUs. we can avoid explaining complex situations where we lag behind (in extreme cases we could lag behind hours, days or even months) - I think an important use case for logical replication from a source is a one-time migration or release upgrade. In this case the customer would not mind if we are not suspended for the duration of the migration. 
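In code terms the check stays small; a hedged standalone sketch of it (assuming the synchronous `postgres` crate rather than the compute_tools wiring shown in the diff below):

```rust
use postgres::Client;

/// True if at least one logical replication subscription currently has a live
/// apply worker. `pid is not null` filters out read-only computes and
/// subscriptions that only exist on branches.
fn has_active_subscription(client: &mut Client) -> Result<bool, postgres::Error> {
    let row = client.query_one(
        "select count(*) from pg_stat_subscription where pid is not null",
        &[],
    )?;
    let count: i64 = row.get("count");
    Ok(count > 0)
}
```
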
We need to document this in the release notes and the documentation in the context of logical replication where Neon is the target (subscriber) ### See internal discussion here https://neondb.slack.com/archives/C04DGM6SMTM/p1706793400746539?thread_ts=1706792628.701279&cid=C04DGM6SMTM --- compute_tools/src/monitor.rs | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index f09bd02664..872a3f7750 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -138,6 +138,34 @@ fn watch_compute_activity(compute: &ComputeNode) { } } // + // Don't suspend compute if there is an active logical replication subscription + // + // `where pid is not null` – to filter out read only computes and subscription on branches + // + let logical_subscriptions_query = + "select count(*) from pg_stat_subscription where pid is not null;"; + match cli.query_one(logical_subscriptions_query, &[]) { + Ok(row) => match row.try_get::<&str, i64>("count") { + Ok(num_subscribers) => { + if num_subscribers > 0 { + compute.update_last_active(Some(Utc::now())); + continue; + } + } + Err(e) => { + warn!("failed to parse `pg_stat_subscription` count: {:?}", e); + continue; + } + }, + Err(e) => { + warn!( + "failed to get list of active logical replication subscriptions: {:?}", + e + ); + continue; + } + } + // // Do not suspend compute if autovacuum is running // let autovacuum_count_query = "select count(*) from pg_stat_activity where backend_type = 'autovacuum worker'"; From 62978433176ca6a9679baea769aa751c48fa037d Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 6 Feb 2024 12:49:41 +0000 Subject: [PATCH 0097/1571] tests: flakiness fixes in pageserver tests (#6632) Fix several test flakes: - test_sharding_service_smoke had log failures on "Dropped LSN updates" - test_emergency_mode had log failures on a deletion queue shutdown check, where the check was incorrect because it was expecting channel receiver to stay alive after cancellation token was fired. - test_secondary_mode_eviction had racing heatmap uploads because the test was using a live migration hook to set up locations, where that migration was itself uploading heatmaps and generally making the situation more complex than it needed to be. These are the failure modes that I saw when spot checking the last few failures of each test. This will mostly/completely address #6511, but I'll leave that ticket open for a couple days and then check if either of the tests named in that ticket are flaky. Related #6511 --- pageserver/src/deletion_queue.rs | 6 ++-- test_runner/fixtures/neon_fixtures.py | 3 +- .../regress/test_disk_usage_eviction.py | 30 ++++++++++--------- test_runner/regress/test_sharding_service.py | 5 ++++ 4 files changed, 27 insertions(+), 17 deletions(-) diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 6a820e1bdc..da1da9331a 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -700,8 +700,6 @@ impl DeletionQueue { } pub async fn shutdown(&mut self, timeout: Duration) { - self.cancel.cancel(); - match tokio::time::timeout(timeout, self.client.flush()).await { Ok(Ok(())) => { tracing::info!("Deletion queue flushed successfully on shutdown") @@ -715,6 +713,10 @@ impl DeletionQueue { tracing::warn!("Timed out flushing deletion queue on shutdown") } } + + // We only cancel _after_ flushing: otherwise we would be shutting down the + // components that do the flush. 
+ self.cancel.cancel(); } } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 5ce2fca820..bf7c6ccc14 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1162,7 +1162,8 @@ class NeonEnv: to the attachment service. """ meta = self.attachment_service.inspect(tenant_id) - assert meta is not None, f"{tenant_id} attachment location not found" + if meta is None: + return None pageserver_id = meta[1] return self.get_pageserver(pageserver_id) diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index dcbf8a5025..061c57c88b 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -17,7 +17,7 @@ from fixtures.neon_fixtures import ( from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import wait_for_upload_queue_empty from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId +from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import wait_until GLOBAL_LRU_LOG_LINE = "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy" @@ -194,8 +194,10 @@ class EvictionEnv: # we now do initial logical size calculation on startup, which on debug builds can fight with disk usage based eviction for tenant_id, timeline_id in self.timelines: - pageserver_http = self.neon_env.get_tenant_pageserver(tenant_id).http_client() - pageserver_http.timeline_wait_logical_size(tenant_id, timeline_id) + tenant_ps = self.neon_env.get_tenant_pageserver(tenant_id) + # Pageserver may be none if we are currently not attached anywhere, e.g. during secondary eviction test + if tenant_ps is not None: + tenant_ps.http_client().timeline_wait_logical_size(tenant_id, timeline_id) def statvfs_called(): assert pageserver.log_contains(".*running mocked statvfs.*") @@ -864,18 +866,18 @@ def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv): # Set up a situation where one pageserver _only_ has secondary locations on it, # so that when we release space we are sure it is via secondary locations. - - log.info("Setting up secondary location...") - ps_attached = env.neon_env.pageservers[0] + log.info("Setting up secondary locations...") ps_secondary = env.neon_env.pageservers[1] for tenant_id in tenant_ids: - # Migrate all attached tenants to the same pageserver, so that all the secondaries - # will run on the other pageserver. This is necessary because when we create tenants, - # they are spread over pageservers by default. 
- env.neon_env.attachment_service.tenant_shard_migrate( - TenantShardId(tenant_id, 0, 0), ps_attached.id - ) + # Find where it is attached + pageserver = env.neon_env.get_tenant_pageserver(tenant_id) + pageserver.http_client().tenant_heatmap_upload(tenant_id) + # Detach it + pageserver.tenant_detach(tenant_id) + + # Create a secondary mode location for the tenant, all tenants on one pageserver that will only + # contain secondary locations: this is the one where we will exercise disk usage eviction ps_secondary.tenant_location_configure( tenant_id, { @@ -887,8 +889,8 @@ def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv): readback_conf = ps_secondary.read_tenant_location_conf(tenant_id) log.info(f"Read back conf: {readback_conf}") - # Request secondary location to download all layers that the attached location has - ps_attached.http_client().tenant_heatmap_upload(tenant_id) + # Request secondary location to download all layers that the attached location indicated + # in its heatmap ps_secondary.http_client().tenant_secondary_download(tenant_id) # Configure the secondary pageserver to have a phony small disk size diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py index 5c70378ab0..ee57fcb2cf 100644 --- a/test_runner/regress/test_sharding_service.py +++ b/test_runner/regress/test_sharding_service.py @@ -35,6 +35,11 @@ def test_sharding_service_smoke( neon_env_builder.num_pageservers = 3 env = neon_env_builder.init_configs() + for pageserver in env.pageservers: + # This test detaches tenants during migration, which can race with deletion queue operations, + # during detach we only do an advisory flush, we don't wait for it. + pageserver.allowed_errors.extend([".*Dropped remote consistent LSN updates.*"]) + # Start services by hand so that we can skip a pageserver (this will start + register later) env.broker.try_start() env.attachment_service.start() From 27a3c9ecbe8fd09f35bbe534c0628831f29d0a1f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 6 Feb 2024 13:15:07 +0000 Subject: [PATCH 0098/1571] build(deps): bump cryptography from 41.0.6 to 42.0.0 (#6643) --- poetry.lock | 65 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 28 deletions(-) diff --git a/poetry.lock b/poetry.lock index 2904e2872e..e18cd4a74d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -836,47 +836,56 @@ files = [ [[package]] name = "cryptography" -version = "41.0.6" +version = "42.0.0" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." 
optional = false python-versions = ">=3.7" files = [ - {file = "cryptography-41.0.6-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:0f27acb55a4e77b9be8d550d762b0513ef3fc658cd3eb15110ebbcbd626db12c"}, - {file = "cryptography-41.0.6-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:ae236bb8760c1e55b7a39b6d4d32d2279bc6c7c8500b7d5a13b6fb9fc97be35b"}, - {file = "cryptography-41.0.6-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afda76d84b053923c27ede5edc1ed7d53e3c9f475ebaf63c68e69f1403c405a8"}, - {file = "cryptography-41.0.6-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:da46e2b5df770070412c46f87bac0849b8d685c5f2679771de277a422c7d0b86"}, - {file = "cryptography-41.0.6-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:ff369dd19e8fe0528b02e8df9f2aeb2479f89b1270d90f96a63500afe9af5cae"}, - {file = "cryptography-41.0.6-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:b648fe2a45e426aaee684ddca2632f62ec4613ef362f4d681a9a6283d10e079d"}, - {file = "cryptography-41.0.6-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:5daeb18e7886a358064a68dbcaf441c036cbdb7da52ae744e7b9207b04d3908c"}, - {file = "cryptography-41.0.6-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:068bc551698c234742c40049e46840843f3d98ad7ce265fd2bd4ec0d11306596"}, - {file = "cryptography-41.0.6-cp37-abi3-win32.whl", hash = "sha256:2132d5865eea673fe6712c2ed5fb4fa49dba10768bb4cc798345748380ee3660"}, - {file = "cryptography-41.0.6-cp37-abi3-win_amd64.whl", hash = "sha256:48783b7e2bef51224020efb61b42704207dde583d7e371ef8fc2a5fb6c0aabc7"}, - {file = "cryptography-41.0.6-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:8efb2af8d4ba9dbc9c9dd8f04d19a7abb5b49eab1f3694e7b5a16a5fc2856f5c"}, - {file = "cryptography-41.0.6-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c5a550dc7a3b50b116323e3d376241829fd326ac47bc195e04eb33a8170902a9"}, - {file = "cryptography-41.0.6-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:85abd057699b98fce40b41737afb234fef05c67e116f6f3650782c10862c43da"}, - {file = "cryptography-41.0.6-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:f39812f70fc5c71a15aa3c97b2bbe213c3f2a460b79bd21c40d033bb34a9bf36"}, - {file = "cryptography-41.0.6-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:742ae5e9a2310e9dade7932f9576606836ed174da3c7d26bc3d3ab4bd49b9f65"}, - {file = "cryptography-41.0.6-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:35f3f288e83c3f6f10752467c48919a7a94b7d88cc00b0668372a0d2ad4f8ead"}, - {file = "cryptography-41.0.6-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4d03186af98b1c01a4eda396b137f29e4e3fb0173e30f885e27acec8823c1b09"}, - {file = "cryptography-41.0.6-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:b27a7fd4229abef715e064269d98a7e2909ebf92eb6912a9603c7e14c181928c"}, - {file = "cryptography-41.0.6-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:398ae1fc711b5eb78e977daa3cbf47cec20f2c08c5da129b7a296055fbb22aed"}, - {file = "cryptography-41.0.6-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:7e00fb556bda398b99b0da289ce7053639d33b572847181d6483ad89835115f6"}, - {file = "cryptography-41.0.6-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:60e746b11b937911dc70d164060d28d273e31853bb359e2b2033c9e93e6f3c43"}, - {file = "cryptography-41.0.6-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:3288acccef021e3c3c10d58933f44e8602cf04dba96d9796d70d537bb2f4bbc4"}, - {file = "cryptography-41.0.6.tar.gz", hash = 
"sha256:422e3e31d63743855e43e5a6fcc8b4acab860f560f9321b0ee6269cc7ed70cc3"}, + {file = "cryptography-42.0.0-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:c640b0ef54138fde761ec99a6c7dc4ce05e80420262c20fa239e694ca371d434"}, + {file = "cryptography-42.0.0-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:678cfa0d1e72ef41d48993a7be75a76b0725d29b820ff3cfd606a5b2b33fda01"}, + {file = "cryptography-42.0.0-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:146e971e92a6dd042214b537a726c9750496128453146ab0ee8971a0299dc9bd"}, + {file = "cryptography-42.0.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87086eae86a700307b544625e3ba11cc600c3c0ef8ab97b0fda0705d6db3d4e3"}, + {file = "cryptography-42.0.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:0a68bfcf57a6887818307600c3c0ebc3f62fbb6ccad2240aa21887cda1f8df1b"}, + {file = "cryptography-42.0.0-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:5a217bca51f3b91971400890905a9323ad805838ca3fa1e202a01844f485ee87"}, + {file = "cryptography-42.0.0-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:ca20550bb590db16223eb9ccc5852335b48b8f597e2f6f0878bbfd9e7314eb17"}, + {file = "cryptography-42.0.0-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:33588310b5c886dfb87dba5f013b8d27df7ffd31dc753775342a1e5ab139e59d"}, + {file = "cryptography-42.0.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:9515ea7f596c8092fdc9902627e51b23a75daa2c7815ed5aa8cf4f07469212ec"}, + {file = "cryptography-42.0.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:35cf6ed4c38f054478a9df14f03c1169bb14bd98f0b1705751079b25e1cb58bc"}, + {file = "cryptography-42.0.0-cp37-abi3-win32.whl", hash = "sha256:8814722cffcfd1fbd91edd9f3451b88a8f26a5fd41b28c1c9193949d1c689dc4"}, + {file = "cryptography-42.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:a2a8d873667e4fd2f34aedab02ba500b824692c6542e017075a2efc38f60a4c0"}, + {file = "cryptography-42.0.0-cp39-abi3-macosx_10_12_universal2.whl", hash = "sha256:8fedec73d590fd30c4e3f0d0f4bc961aeca8390c72f3eaa1a0874d180e868ddf"}, + {file = "cryptography-42.0.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:be41b0c7366e5549265adf2145135dca107718fa44b6e418dc7499cfff6b4689"}, + {file = "cryptography-42.0.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ca482ea80626048975360c8e62be3ceb0f11803180b73163acd24bf014133a0"}, + {file = "cryptography-42.0.0-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:c58115384bdcfe9c7f644c72f10f6f42bed7cf59f7b52fe1bf7ae0a622b3a139"}, + {file = "cryptography-42.0.0-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:56ce0c106d5c3fec1038c3cca3d55ac320a5be1b44bf15116732d0bc716979a2"}, + {file = "cryptography-42.0.0-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:324721d93b998cb7367f1e6897370644751e5580ff9b370c0a50dc60a2003513"}, + {file = "cryptography-42.0.0-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:d97aae66b7de41cdf5b12087b5509e4e9805ed6f562406dfcf60e8481a9a28f8"}, + {file = "cryptography-42.0.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:85f759ed59ffd1d0baad296e72780aa62ff8a71f94dc1ab340386a1207d0ea81"}, + {file = "cryptography-42.0.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:206aaf42e031b93f86ad60f9f5d9da1b09164f25488238ac1dc488334eb5e221"}, + {file = "cryptography-42.0.0-cp39-abi3-win32.whl", hash = "sha256:74f18a4c8ca04134d2052a140322002fef535c99cdbc2a6afc18a8024d5c9d5b"}, + {file = "cryptography-42.0.0-cp39-abi3-win_amd64.whl", hash = 
"sha256:14e4b909373bc5bf1095311fa0f7fcabf2d1a160ca13f1e9e467be1ac4cbdf94"}, + {file = "cryptography-42.0.0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:3005166a39b70c8b94455fdbe78d87a444da31ff70de3331cdec2c568cf25b7e"}, + {file = "cryptography-42.0.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:be14b31eb3a293fc6e6aa2807c8a3224c71426f7c4e3639ccf1a2f3ffd6df8c3"}, + {file = "cryptography-42.0.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:bd7cf7a8d9f34cc67220f1195884151426ce616fdc8285df9054bfa10135925f"}, + {file = "cryptography-42.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:c310767268d88803b653fffe6d6f2f17bb9d49ffceb8d70aed50ad45ea49ab08"}, + {file = "cryptography-42.0.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:bdce70e562c69bb089523e75ef1d9625b7417c6297a76ac27b1b8b1eb51b7d0f"}, + {file = "cryptography-42.0.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:e9326ca78111e4c645f7e49cbce4ed2f3f85e17b61a563328c85a5208cf34440"}, + {file = "cryptography-42.0.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:69fd009a325cad6fbfd5b04c711a4da563c6c4854fc4c9544bff3088387c77c0"}, + {file = "cryptography-42.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:988b738f56c665366b1e4bfd9045c3efae89ee366ca3839cd5af53eaa1401bce"}, + {file = "cryptography-42.0.0.tar.gz", hash = "sha256:6cf9b76d6e93c62114bd19485e5cb003115c134cf9ce91f8ac924c44f8c8c3f4"}, ] [package.dependencies] -cffi = ">=1.12" +cffi = {version = ">=1.12", markers = "platform_python_implementation != \"PyPy\""} [package.extras] docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=1.1.1)"] -docstest = ["pyenchant (>=1.6.11)", "sphinxcontrib-spelling (>=4.0.1)", "twine (>=1.12.0)"] +docstest = ["pyenchant (>=1.6.11)", "readme-renderer", "sphinxcontrib-spelling (>=4.0.1)"] nox = ["nox"] -pep8test = ["black", "check-sdist", "mypy", "ruff"] +pep8test = ["check-sdist", "click", "mypy", "ruff"] sdist = ["build"] ssh = ["bcrypt (>=3.1.5)"] -test = ["pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"] +test = ["certifi", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"] test-randomorder = ["pytest-randomly"] [[package]] From 53a3ed0a7e26ddba5a6a70b2a5176ee7d5491283 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 6 Feb 2024 15:43:33 +0100 Subject: [PATCH 0099/1571] debug_assert presence of `shard_id` tracing field (#6572) also: fixes https://github.com/neondatabase/neon/issues/6638 --- libs/utils/src/tracing_span_assert.rs | 51 +++++++------- pageserver/src/http/routes.rs | 10 +-- pageserver/src/lib.rs | 1 + pageserver/src/page_service.rs | 68 +++++++++++++------ pageserver/src/pgdatadir_mapping.rs | 3 +- pageserver/src/span.rs | 43 ++++++++++++ pageserver/src/tenant.rs | 16 +++-- pageserver/src/tenant/mgr.rs | 22 +++--- .../src/tenant/remote_timeline_client.rs | 1 + .../tenant/remote_timeline_client/download.rs | 2 +- pageserver/src/tenant/span.rs | 17 ----- pageserver/src/tenant/storage_layer/layer.rs | 3 + pageserver/src/tenant/timeline.rs | 8 +-- pageserver/src/tenant/timeline/span.rs | 19 ------ pageserver/src/walredo.rs | 9 +++ pageserver/src/walredo/process.rs | 4 +- 16 files changed, 165 insertions(+), 112 deletions(-) create mode 100644 pageserver/src/span.rs delete mode 100644 pageserver/src/tenant/span.rs diff --git a/libs/utils/src/tracing_span_assert.rs b/libs/utils/src/tracing_span_assert.rs index db17f7d8cd..d24c81ad0b 100644 --- a/libs/utils/src/tracing_span_assert.rs +++ 
b/libs/utils/src/tracing_span_assert.rs @@ -20,13 +20,13 @@ //! //! // Then, in the main code: //! -//! let span = tracing::info_span!("TestSpan", test_id = 1); +//! let span = tracing::info_span!("TestSpan", tenant_id = 1); //! let _guard = span.enter(); //! //! // ... down the call stack //! -//! use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor}; -//! let extractor = MultiNameExtractor::new("TestExtractor", ["test", "test_id"]); +//! use utils::tracing_span_assert::{check_fields_present, ConstExtractor}; +//! let extractor = ConstExtractor::new("tenant_id"); //! if let Err(missing) = check_fields_present!([&extractor]) { //! // if you copypaste this to a custom assert method, remember to add #[track_caller] //! // to get the "user" code location for the panic. @@ -45,27 +45,26 @@ pub enum ExtractionResult { } pub trait Extractor: Send + Sync + std::fmt::Debug { - fn name(&self) -> &str; + fn id(&self) -> &str; fn extract(&self, fields: &tracing::field::FieldSet) -> ExtractionResult; } #[derive(Debug)] -pub struct MultiNameExtractor { - name: &'static str, - field_names: [&'static str; L], +pub struct ConstExtractor { + field_name: &'static str, } -impl MultiNameExtractor { - pub fn new(name: &'static str, field_names: [&'static str; L]) -> MultiNameExtractor { - MultiNameExtractor { name, field_names } +impl ConstExtractor { + pub const fn new(field_name: &'static str) -> ConstExtractor { + ConstExtractor { field_name } } } -impl Extractor for MultiNameExtractor { - fn name(&self) -> &str { - self.name +impl Extractor for ConstExtractor { + fn id(&self) -> &str { + self.field_name } fn extract(&self, fields: &tracing::field::FieldSet) -> ExtractionResult { - if fields.iter().any(|f| self.field_names.contains(&f.name())) { + if fields.iter().any(|f| f.name() == self.field_name) { ExtractionResult::Present } else { ExtractionResult::Absent @@ -203,19 +202,19 @@ mod tests { } impl<'a> fmt::Debug for MemoryIdentity<'a> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{:p}: {}", self.as_ptr(), self.0.name()) + write!(f, "{:p}: {}", self.as_ptr(), self.0.id()) } } struct Setup { _current_thread_subscriber_guard: tracing::subscriber::DefaultGuard, - tenant_extractor: MultiNameExtractor<2>, - timeline_extractor: MultiNameExtractor<2>, + tenant_extractor: ConstExtractor, + timeline_extractor: ConstExtractor, } fn setup_current_thread() -> Setup { - let tenant_extractor = MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"]); - let timeline_extractor = MultiNameExtractor::new("TimelineId", ["timeline_id", "timeline"]); + let tenant_extractor = ConstExtractor::new("tenant_id"); + let timeline_extractor = ConstExtractor::new("timeline_id"); let registry = tracing_subscriber::registry() .with(tracing_subscriber::fmt::layer()) @@ -343,12 +342,12 @@ mod tests { let span = tracing::info_span!("foo", e = "some value"); let _guard = span.enter(); - let extractor = MultiNameExtractor::new("E", ["e"]); + let extractor = ConstExtractor::new("e"); let res = check_fields_present0([&extractor]); assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}"); // similarly for a not found key - let extractor = MultiNameExtractor::new("F", ["foobar"]); + let extractor = ConstExtractor::new("foobar"); let res = check_fields_present0([&extractor]); assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}"); } @@ -368,16 +367,14 @@ mod tests { // normally this would work, but without any tracing-subscriber configured, both // check_field_present find 
nothing let _guard = subspan.enter(); - let extractors: [&dyn Extractor; 2] = [ - &MultiNameExtractor::new("E", ["e"]), - &MultiNameExtractor::new("F", ["f"]), - ]; + let extractors: [&dyn Extractor; 2] = + [&ConstExtractor::new("e"), &ConstExtractor::new("f")]; let res = check_fields_present0(extractors); assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}"); // similarly for a not found key - let extractor = MultiNameExtractor::new("G", ["g"]); + let extractor = ConstExtractor::new("g"); let res = check_fields_present0([&extractor]); assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}"); } @@ -410,7 +407,7 @@ mod tests { let span = tracing::info_span!("foo", e = "some value"); let _guard = span.enter(); - let extractors: [&dyn Extractor; 1] = [&MultiNameExtractor::new("E", ["e"])]; + let extractors: [&dyn Extractor; 1] = [&ConstExtractor::new("e")]; if span.is_disabled() { // the tests are running single threaded, or we got lucky and no other tests subscriber diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index b97e272c86..792089ebe7 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -535,7 +535,7 @@ async fn timeline_create_handler( } .instrument(info_span!("timeline_create", tenant_id = %tenant_shard_id.tenant_id, - shard = %tenant_shard_id.shard_slug(), + shard_id = %tenant_shard_id.shard_slug(), timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version)) .await } @@ -831,7 +831,7 @@ async fn timeline_delete_handler( } })?; tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; - tenant.delete_timeline(timeline_id).instrument(info_span!("timeline_delete", tenant_id=%tenant_shard_id.tenant_id, shard=%tenant_shard_id.shard_slug(), %timeline_id)) + tenant.delete_timeline(timeline_id).instrument(info_span!("timeline_delete", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id)) .await?; json_response(StatusCode::ACCEPTED, ()) @@ -856,7 +856,7 @@ async fn tenant_detach_handler( detach_ignored.unwrap_or(false), &state.deletion_queue_client, ) - .instrument(info_span!("tenant_detach", %tenant_id)) + .instrument(info_span!("tenant_detach", %tenant_id, shard_id=%tenant_shard_id.shard_slug())) .await?; json_response(StatusCode::OK, ()) @@ -1007,7 +1007,7 @@ async fn tenant_delete_handler( .delete_tenant(tenant_shard_id, ACTIVE_TENANT_TIMEOUT) .instrument(info_span!("tenant_delete_handler", tenant_id = %tenant_shard_id.tenant_id, - shard = %tenant_shard_id.shard_slug() + shard_id = %tenant_shard_id.shard_slug() )) .await?; @@ -1363,7 +1363,7 @@ async fn put_tenant_location_config_handler( mgr::detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client) .instrument(info_span!("tenant_detach", tenant_id = %tenant_shard_id.tenant_id, - shard = %tenant_shard_id.shard_slug() + shard_id = %tenant_shard_id.shard_slug() )) .await { diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index bcde1166b7..c3f35142ec 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -17,6 +17,7 @@ pub mod page_cache; pub mod page_service; pub mod pgdatadir_mapping; pub mod repository; +pub mod span; pub(crate) mod statvfs; pub mod task_mgr; pub mod tenant; diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 754c021c88..6fc38a76d4 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -63,9 +63,10 @@ use crate::import_datadir::import_wal_from_tar; use crate::metrics; use 
crate::metrics::LIVE_CONNECTIONS_COUNT; use crate::pgdatadir_mapping::Version; +use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; use crate::task_mgr; use crate::task_mgr::TaskKind; -use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::mgr; use crate::tenant::mgr::get_active_tenant_with_timeout; use crate::tenant::mgr::GetActiveTenantError; @@ -549,7 +550,7 @@ impl PageServerHandler { where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { - debug_assert_current_span_has_tenant_and_timeline_id(); + debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); let tenant = mgr::get_active_tenant_with_timeout( tenant_id, @@ -631,6 +632,7 @@ impl PageServerHandler { ) } PagestreamFeMessage::GetPage(req) => { + // shard_id is filled in by the handler let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn); ( self.handle_get_page_at_lsn_request(tenant_id, timeline_id, &req, &ctx) @@ -719,7 +721,7 @@ impl PageServerHandler { where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { - debug_assert_current_span_has_tenant_and_timeline_id(); + debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); // Create empty timeline info!("creating new timeline"); @@ -772,7 +774,7 @@ impl PageServerHandler { Ok(()) } - #[instrument(skip_all, fields(%start_lsn, %end_lsn))] + #[instrument(skip_all, fields(shard_id, %start_lsn, %end_lsn))] async fn handle_import_wal( &self, pgb: &mut PostgresBackend, @@ -785,8 +787,6 @@ impl PageServerHandler { where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { - debug_assert_current_span_has_tenant_and_timeline_id(); - let timeline = self .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero) .await?; @@ -893,6 +893,7 @@ impl PageServerHandler { Ok(lsn) } + #[instrument(skip_all, fields(shard_id))] async fn handle_get_rel_exists_request( &mut self, tenant_id: TenantId, @@ -919,6 +920,7 @@ impl PageServerHandler { })) } + #[instrument(skip_all, fields(shard_id))] async fn handle_get_nblocks_request( &mut self, tenant_id: TenantId, @@ -946,6 +948,7 @@ impl PageServerHandler { })) } + #[instrument(skip_all, fields(shard_id))] async fn handle_db_size_request( &mut self, tenant_id: TenantId, @@ -1096,6 +1099,7 @@ impl PageServerHandler { } } + #[instrument(skip_all, fields(shard_id))] async fn handle_get_page_at_lsn_request( &mut self, tenant_id: TenantId, @@ -1129,6 +1133,9 @@ impl PageServerHandler { } }; + // load_timeline_for_page sets shard_id, but get_cached_timeline_for_page doesn't + set_tracing_field_shard_id(timeline); + let _timer = timeline .query_metrics .start_timer(metrics::SmgrQueryType::GetPageAtLsn); @@ -1147,6 +1154,7 @@ impl PageServerHandler { })) } + #[instrument(skip_all, fields(shard_id))] async fn handle_get_slru_segment_request( &mut self, tenant_id: TenantId, @@ -1175,7 +1183,7 @@ impl PageServerHandler { } #[allow(clippy::too_many_arguments)] - #[instrument(skip_all, fields(?lsn, ?prev_lsn, %full_backup))] + #[instrument(skip_all, fields(shard_id, ?lsn, ?prev_lsn, %full_backup))] async fn handle_basebackup_request( &mut self, pgb: &mut PostgresBackend, @@ -1190,8 +1198,6 @@ impl PageServerHandler { where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { - debug_assert_current_span_has_tenant_and_timeline_id(); - let started = std::time::Instant::now(); // check that the timeline exists @@ -1313,6 
+1319,7 @@ impl PageServerHandler { .await .map_err(GetActiveTimelineError::Tenant)?; let timeline = tenant.get_timeline(timeline_id, true)?; + set_tracing_field_shard_id(&timeline); Ok(timeline) } } @@ -1477,21 +1484,29 @@ where .record("timeline_id", field::display(timeline_id)); self.check_permission(Some(tenant_id))?; - let timeline = self - .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero) - .await?; + async { + let timeline = self + .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero) + .await?; - let end_of_timeline = timeline.get_last_record_rlsn(); + let end_of_timeline = timeline.get_last_record_rlsn(); - pgb.write_message_noflush(&BeMessage::RowDescription(&[ - RowDescriptor::text_col(b"prev_lsn"), - RowDescriptor::text_col(b"last_lsn"), - ]))? - .write_message_noflush(&BeMessage::DataRow(&[ - Some(end_of_timeline.prev.to_string().as_bytes()), - Some(end_of_timeline.last.to_string().as_bytes()), - ]))? - .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + pgb.write_message_noflush(&BeMessage::RowDescription(&[ + RowDescriptor::text_col(b"prev_lsn"), + RowDescriptor::text_col(b"last_lsn"), + ]))? + .write_message_noflush(&BeMessage::DataRow(&[ + Some(end_of_timeline.prev.to_string().as_bytes()), + Some(end_of_timeline.last.to_string().as_bytes()), + ]))? + .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + anyhow::Ok(()) + } + .instrument(info_span!( + "handle_get_last_record_lsn", + shard_id = tracing::field::Empty + )) + .await?; } // same as basebackup, but result includes relational data as well else if query_string.starts_with("fullbackup ") { @@ -1748,3 +1763,12 @@ impl From for QueryError { } } } + +fn set_tracing_field_shard_id(timeline: &Timeline) { + debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); + tracing::Span::current().record( + "shard_id", + tracing::field::display(timeline.tenant_shard_id.shard_slug()), + ); + debug_assert_current_span_has_tenant_and_timeline_id(); +} diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index a36785a69f..f1d18c0146 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -10,6 +10,7 @@ use super::tenant::{PageReconstructError, Timeline}; use crate::context::RequestContext; use crate::keyspace::{KeySpace, KeySpaceAccum}; use crate::repository::*; +use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; use crate::walrecord::NeonWalRecord; use anyhow::{ensure, Context}; use bytes::{Buf, Bytes, BytesMut}; @@ -699,7 +700,7 @@ impl Timeline { lsn: Lsn, ctx: &RequestContext, ) -> Result { - crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id(); + debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); // Fetch list of database dirs and iterate them let buf = self.get(DBDIR_KEY, lsn, ctx).await?; diff --git a/pageserver/src/span.rs b/pageserver/src/span.rs new file mode 100644 index 0000000000..91fee50514 --- /dev/null +++ b/pageserver/src/span.rs @@ -0,0 +1,43 @@ +use utils::tracing_span_assert::check_fields_present; + +mod extractors { + use utils::tracing_span_assert::ConstExtractor; + + pub(super) const TENANT_ID: ConstExtractor = ConstExtractor::new("tenant_id"); + pub(super) const SHARD_ID: ConstExtractor = ConstExtractor::new("shard_id"); + pub(super) const TIMELINE_ID: ConstExtractor = ConstExtractor::new("timeline_id"); +} + +#[track_caller] +pub(crate) fn debug_assert_current_span_has_tenant_id() { + if 
cfg!(debug_assertions) { + if let Err(missing) = check_fields_present!([&extractors::TENANT_ID, &extractors::SHARD_ID]) + { + panic!("missing extractors: {missing:?}") + } + } +} + +#[track_caller] +pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() { + if cfg!(debug_assertions) { + if let Err(missing) = check_fields_present!([ + &extractors::TENANT_ID, + &extractors::SHARD_ID, + &extractors::TIMELINE_ID, + ]) { + panic!("missing extractors: {missing:?}") + } + } +} + +#[track_caller] +pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id() { + if cfg!(debug_assertions) { + if let Err(missing) = + check_fields_present!([&extractors::TENANT_ID, &extractors::TIMELINE_ID,]) + { + panic!("missing extractors: {missing:?}") + } + } +} diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 624c3e365f..fe85cf9753 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -100,6 +100,7 @@ use std::sync::Arc; use std::sync::{Mutex, RwLock}; use std::time::{Duration, Instant}; +use crate::span; use crate::tenant::timeline::delete::DeleteTimelineFlow; use crate::tenant::timeline::uninit::cleanup_timeline_directory; use crate::virtual_file::VirtualFile; @@ -150,7 +151,6 @@ pub mod block_io; pub mod disk_btree; pub(crate) mod ephemeral_file; pub mod layer_map; -mod span; pub mod metadata; mod par_fsync; @@ -168,7 +168,7 @@ pub(crate) mod timeline; pub mod size; -pub(crate) use timeline::span::debug_assert_current_span_has_tenant_and_timeline_id; +pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline}; // re-export for use in remote_timeline_client.rs @@ -3998,6 +3998,10 @@ pub(crate) mod harness { }) } + pub fn span(&self) -> tracing::Span { + info_span!("TenantHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()) + } + pub async fn load(&self) -> (Arc, RequestContext) { let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); ( @@ -4602,7 +4606,7 @@ mod tests { // so that all uploads finish & we can call harness.load() below again tenant .shutdown(Default::default(), true) - .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_shard_id)) + .instrument(harness.span()) .await .ok() .unwrap(); @@ -4643,7 +4647,7 @@ mod tests { // so that all uploads finish & we can call harness.load() below again tenant .shutdown(Default::default(), true) - .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_shard_id)) + .instrument(harness.span()) .await .ok() .unwrap(); @@ -4705,7 +4709,7 @@ mod tests { // so that all uploads finish & we can call harness.try_load() below again tenant .shutdown(Default::default(), true) - .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_shard_id)) + .instrument(harness.span()) .await .ok() .unwrap(); @@ -5238,7 +5242,7 @@ mod tests { let raw_tline = tline.raw_timeline().unwrap(); raw_tline .shutdown() - .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id, timeline_id=%TIMELINE_ID)) + .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id, shard_id=%raw_tline.tenant_shard_id.shard_slug(), timeline_id=%TIMELINE_ID)) .await; std::mem::forget(tline); } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index de0b636d47..5ec910ca3e 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -684,7 +684,7 @@ async fn 
shutdown_all_tenants0(tenants: &std::sync::RwLock) { // going to log too many lines debug!("tenant successfully stopped"); } - .instrument(info_span!("shutdown", tenant_id=%tenant_shard_id.tenant_id, shard=%tenant_shard_id.shard_slug())), + .instrument(info_span!("shutdown", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())), ); total_attached += 1; @@ -1720,6 +1720,7 @@ pub(crate) async fn ignore_tenant( ignore_tenant0(conf, &TENANTS, tenant_id).await } +#[instrument(skip_all, fields(shard_id))] async fn ignore_tenant0( conf: &'static PageServerConf, tenants: &std::sync::RwLock, @@ -1727,6 +1728,10 @@ async fn ignore_tenant0( ) -> Result<(), TenantStateError> { // This is a legacy API (replaced by `/location_conf`). It does not support sharding let tenant_shard_id = TenantShardId::unsharded(tenant_id); + tracing::Span::current().record( + "shard_id", + tracing::field::display(tenant_shard_id.shard_slug()), + ); remove_tenant_from_memory(tenants, tenant_shard_id, async { let ignore_mark_file = conf.tenant_ignore_mark_file_path(&tenant_shard_id); @@ -2122,7 +2127,7 @@ fn tenant_map_acquire_slot_impl( METRICS.tenant_slot_writes.inc(); let mut locked = tenants.write().unwrap(); - let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard = %tenant_shard_id.shard_slug()); + let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()); let _guard = span.enter(); let m = match &mut *locked { @@ -2358,7 +2363,7 @@ pub(crate) async fn immediate_gc( mod tests { use std::collections::BTreeMap; use std::sync::Arc; - use tracing::{info_span, Instrument}; + use tracing::Instrument; use crate::tenant::mgr::TenantSlot; @@ -2369,17 +2374,16 @@ mod tests { // Test that if an InProgress tenant is in the map during shutdown, the shutdown will gracefully // wait for it to complete before proceeding. 
- let (t, _ctx) = TenantHarness::create("shutdown_awaits_in_progress_tenant") - .unwrap() - .load() - .await; + let h = TenantHarness::create("shutdown_awaits_in_progress_tenant").unwrap(); + let (t, _ctx) = h.load().await; // harness loads it to active, which is forced and nothing is running on the tenant let id = t.tenant_shard_id(); // tenant harness configures the logging and we cannot escape it - let _e = info_span!("testing", tenant_id = %id).entered(); + let span = h.span(); + let _e = span.enter(); let tenants = BTreeMap::from([(id, TenantSlot::Attached(t.clone()))]); let tenants = Arc::new(std::sync::RwLock::new(TenantsMap::Open(tenants))); @@ -2400,7 +2404,7 @@ mod tests { }; super::remove_tenant_from_memory(&tenants, id, cleanup).await } - .instrument(info_span!("foobar", tenant_id = %id)) + .instrument(h.span()) }); // now the long cleanup should be in place, with the stopping state diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 831a073d17..152c9a2b7d 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1952,6 +1952,7 @@ mod tests { tracing::info_span!( "test", tenant_id = %self.harness.tenant_shard_id.tenant_id, + shard_id = %self.harness.tenant_shard_id.shard_slug(), timeline_id = %TIMELINE_ID ) } diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 2c50726b43..6c1125746b 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -17,11 +17,11 @@ use utils::timeout::timeout_cancellable; use utils::{backoff, crashsafe}; use crate::config::PageServerConf; +use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::{ download_cancellable, remote_layer_path, remote_timelines_path, DOWNLOAD_TIMEOUT, }; use crate::tenant::storage_layer::LayerFileName; -use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::Generation; use crate::virtual_file::on_fatal_io_error; use crate::TEMP_FILE_SUFFIX; diff --git a/pageserver/src/tenant/span.rs b/pageserver/src/tenant/span.rs deleted file mode 100644 index 04e92f4096..0000000000 --- a/pageserver/src/tenant/span.rs +++ /dev/null @@ -1,17 +0,0 @@ -#[cfg(debug_assertions)] -use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor}; - -#[cfg(not(debug_assertions))] -pub(crate) fn debug_assert_current_span_has_tenant_id() {} - -#[cfg(debug_assertions)] -pub(crate) static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy> = - once_cell::sync::Lazy::new(|| MultiNameExtractor::new("TenantId", ["tenant_id"])); - -#[cfg(debug_assertions)] -#[track_caller] -pub(crate) fn debug_assert_current_span_has_tenant_id() { - if let Err(missing) = check_fields_present!([&*TENANT_ID_EXTRACTOR]) { - panic!("missing extractors: {missing:?}") - } -} diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 1f337adf53..52c0f8abdc 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -15,6 +15,7 @@ use utils::sync::heavier_once_cell; use crate::config::PageServerConf; use crate::context::RequestContext; use crate::repository::Key; +use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use 
crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline}; use super::delta_layer::{self, DeltaEntry}; @@ -836,6 +837,8 @@ impl LayerInner { timeline: Arc, permit: heavier_once_cell::InitPermit, ) -> Result { + debug_assert_current_span_has_tenant_and_timeline_id(); + let task_name = format!("download layer {}", self); let (tx, rx) = tokio::sync::oneshot::channel(); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 50ffc4d265..43aa178ab5 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1138,7 +1138,7 @@ impl Timeline { /// Shut down immediately, without waiting for any open layers to flush to disk. This is a subset of /// the graceful [`Timeline::flush_and_shutdown`] function. pub(crate) async fn shutdown(&self) { - span::debug_assert_current_span_has_tenant_and_timeline_id(); + debug_assert_current_span_has_tenant_and_timeline_id(); // Signal any subscribers to our cancellation token to drop out tracing::debug!("Cancelling CancellationToken"); @@ -1964,7 +1964,7 @@ impl Timeline { .await; Ok(()) } - .instrument(info_span!(parent: None, "initial_size_calculation", tenant_id=%self.tenant_shard_id.tenant_id, timeline_id=%self.timeline_id)), + .instrument(info_span!(parent: None, "initial_size_calculation", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id)), ); } @@ -2151,7 +2151,7 @@ impl Timeline { cause: LogicalSizeCalculationCause, ctx: &RequestContext, ) -> Result { - span::debug_assert_current_span_has_tenant_and_timeline_id(); + crate::span::debug_assert_current_span_has_tenant_and_timeline_id(); // We should never be calculating logical sizes on shard !=0, because these shards do not have // accurate relation sizes, and they do not emit consumption metrics. debug_assert!(self.tenant_shard_id.is_zero()); @@ -2849,7 +2849,7 @@ impl Timeline { frozen_layer: Arc, ctx: &RequestContext, ) -> Result<(), FlushLayerError> { - span::debug_assert_current_span_has_tenant_and_timeline_id(); + debug_assert_current_span_has_tenant_and_timeline_id(); // As a special case, when we have just imported an image into the repository, // instead of writing out a L0 delta layer, we directly write out image layer // files instead. 
This is possible as long as *all* the data imported into the diff --git a/pageserver/src/tenant/timeline/span.rs b/pageserver/src/tenant/timeline/span.rs index 3b580c9d1b..8b13789179 100644 --- a/pageserver/src/tenant/timeline/span.rs +++ b/pageserver/src/tenant/timeline/span.rs @@ -1,20 +1 @@ -#[cfg(debug_assertions)] -use utils::tracing_span_assert::{check_fields_present, Extractor, MultiNameExtractor}; -#[cfg(not(debug_assertions))] -pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {} - -#[cfg(debug_assertions)] -#[track_caller] -pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() { - static TIMELINE_ID_EXTRACTOR: once_cell::sync::Lazy> = - once_cell::sync::Lazy::new(|| MultiNameExtractor::new("TimelineId", ["timeline_id"])); - - let fields: [&dyn Extractor; 2] = [ - &*crate::tenant::span::TENANT_ID_EXTRACTOR, - &*TIMELINE_ID_EXTRACTOR, - ]; - if let Err(missing) = check_fields_present!(fields) { - panic!("missing extractors: {missing:?}") - } -} diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 773e5fc051..98a6a0bb6c 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -373,6 +373,7 @@ mod tests { use bytes::Bytes; use pageserver_api::shard::TenantShardId; use std::str::FromStr; + use tracing::Instrument; use utils::{id::TenantId, lsn::Lsn}; #[tokio::test] @@ -397,6 +398,7 @@ mod tests { short_records(), 14, ) + .instrument(h.span()) .await .unwrap(); @@ -424,6 +426,7 @@ mod tests { short_records(), 14, ) + .instrument(h.span()) .await .unwrap(); @@ -444,6 +447,7 @@ mod tests { short_records(), 16, /* 16 currently produces stderr output on startup, which adds a nice extra edge */ ) + .instrument(h.span()) .await .unwrap_err(); } @@ -472,6 +476,7 @@ mod tests { // underscored because unused, except for removal at drop _repo_dir: camino_tempfile::Utf8TempDir, manager: PostgresRedoManager, + tenant_shard_id: TenantShardId, } impl RedoHarness { @@ -488,7 +493,11 @@ mod tests { Ok(RedoHarness { _repo_dir: repo_dir, manager, + tenant_shard_id, }) } + fn span(&self) -> tracing::Span { + tracing::info_span!("RedoHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()) + } } } diff --git a/pageserver/src/walredo/process.rs b/pageserver/src/walredo/process.rs index 85db3b4a4a..bcbb263663 100644 --- a/pageserver/src/walredo/process.rs +++ b/pageserver/src/walredo/process.rs @@ -54,12 +54,14 @@ impl WalRedoProcess { // // Start postgres binary in special WAL redo mode. // - #[instrument(skip_all,fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), pg_version=pg_version))] + #[instrument(skip_all,fields(pg_version=pg_version))] pub(crate) fn launch( conf: &'static PageServerConf, tenant_shard_id: TenantShardId, pg_version: u32, ) -> anyhow::Result { + crate::span::debug_assert_current_span_has_tenant_id(); + let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible. 
let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?; From d7b29aace7eec730af45e7f12fbe5620545b48aa Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 6 Feb 2024 16:20:02 +0100 Subject: [PATCH 0100/1571] refactor(walredo): don't create WalRedoManager for broken tenants (#6597) When we'll later introduce a global pool of pre-spawned walredo processes (https://github.com/neondatabase/neon/issues/6581), this refactoring avoids plumbing through the reference to the pool to all the places where we create a broken tenant. Builds atop the refactoring in #6583 --- pageserver/src/tenant.rs | 18 +++++++----------- pageserver/src/tenant/tasks.rs | 4 +++- pageserver/src/tenant/timeline.rs | 9 ++++++--- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index fe85cf9753..f704f8c0dd 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -278,7 +278,7 @@ pub struct Tenant { // with timelines, which in turn may cause dropping replication connection, expiration of wait_for_lsn // timeout... gc_cs: tokio::sync::Mutex<()>, - walredo_mgr: Arc, + walredo_mgr: Option>, // provides access to timeline data sitting in the remote storage pub(crate) remote_storage: Option, @@ -635,7 +635,7 @@ impl Tenant { conf, attached_conf, shard_identity, - wal_redo_manager, + Some(wal_redo_manager), tenant_shard_id, remote_storage.clone(), deletion_queue_client, @@ -1195,10 +1195,6 @@ impl Tenant { tenant_shard_id: TenantShardId, reason: String, ) -> Arc { - let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new( - conf, - tenant_shard_id, - ))); Arc::new(Tenant::new( TenantState::Broken { reason, @@ -1209,7 +1205,7 @@ impl Tenant { // Shard identity isn't meaningful for a broken tenant: it's just a placeholder // to occupy the slot for this TenantShardId. ShardIdentity::broken(tenant_shard_id.shard_number, tenant_shard_id.shard_count), - wal_redo_manager, + None, tenant_shard_id, None, DeletionQueueClient::broken(), @@ -1978,7 +1974,7 @@ impl Tenant { } pub(crate) fn wal_redo_manager_status(&self) -> Option { - self.walredo_mgr.status() + self.walredo_mgr.as_ref().and_then(|mgr| mgr.status()) } /// Changes tenant status to active, unless shutdown was already requested. @@ -2613,7 +2609,7 @@ impl Tenant { self.tenant_shard_id, self.generation, self.shard_identity, - Arc::clone(&self.walredo_mgr), + self.walredo_mgr.as_ref().map(Arc::clone), resources, pg_version, state, @@ -2631,7 +2627,7 @@ impl Tenant { conf: &'static PageServerConf, attached_conf: AttachedTenantConf, shard_identity: ShardIdentity, - walredo_mgr: Arc, + walredo_mgr: Option>, tenant_shard_id: TenantShardId, remote_storage: Option, deletion_queue_client: DeletionQueueClient, @@ -4055,7 +4051,7 @@ pub(crate) mod harness { .unwrap(), // This is a legacy/test code path: sharding isn't supported here. ShardIdentity::unsharded(), - walredo_mgr, + Some(walredo_mgr), self.tenant_shard_id, Some(self.remote_storage.clone()), self.deletion_queue.new_client(), diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 5f39c46a84..950cc46e71 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -199,7 +199,9 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { // Perhaps we did no work and the walredo process has been idle for some time: // give it a chance to shut down to avoid leaving walredo process running indefinitely. 
-        tenant.walredo_mgr.maybe_quiesce(period * 10);
+        if let Some(walredo_mgr) = &tenant.walredo_mgr {
+            walredo_mgr.maybe_quiesce(period * 10);
+        }

         // Sleep
         if tokio::time::timeout(sleep_duration, cancel.cancelled())
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 43aa178ab5..735b8003b4 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -215,8 +215,8 @@ pub struct Timeline {
     // Atomic would be more appropriate here.
     last_freeze_ts: RwLock,

-    // WAL redo manager
-    walredo_mgr: Arc,
+    // WAL redo manager. `None` only for broken tenants.
+    walredo_mgr: Option>,

     /// Remote storage client.
     /// See [`remote_timeline_client`](super::remote_timeline_client) module comment for details.
@@ -1427,7 +1427,7 @@ impl Timeline {
         tenant_shard_id: TenantShardId,
         generation: Generation,
         shard_identity: ShardIdentity,
-        walredo_mgr: Arc,
+        walredo_mgr: Option>,
         resources: TimelineResources,
         pg_version: u32,
         state: TimelineState,
@@ -4457,6 +4457,9 @@ impl Timeline {
         let img = match self
             .walredo_mgr
+            .as_ref()
+            .context("timeline has no walredo manager")
+            .map_err(PageReconstructError::WalRedo)?
             .request_redo(key, request_lsn, data.img, data.records, self.pg_version)
             .await
             .context("reconstruct a page image")

From bb9272116816690f806b8932af037a8b69e10aa2 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen
Date: Tue, 6 Feb 2024 17:53:04 +0200
Subject: [PATCH 0101/1571] build: migrate check-style-rust to small runners (#6588)

We have more small runners than large runners, and often a shortage of large
runners. Migrate `check-style-rust` to run on small runners.
---
 .github/workflows/build_and_test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 2d7edf2e22..9fe9636d67 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -132,7 +132,7 @@ jobs:

   check-codestyle-rust:
     needs: [ check-permissions, build-buildtools-image ]
-    runs-on: [ self-hosted, gen3, large ]
+    runs-on: [ self-hosted, gen3, small ]
     container:
       image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
       options: --init

From e65f0fe874aa4762d5d4702349647677ea2c352e Mon Sep 17 00:00:00 2001
From: Alexander Bayandin
Date: Tue, 6 Feb 2024 17:00:55 +0000
Subject: [PATCH 0102/1571] CI(benchmarks): make job split consistent across reruns (#6614)

## Problem

We've got several issues with the current `benchmarks` job setup:
- The `benchmark_durations.json` file (that we generate at runtime to split
  tests into several jobs[0]) is not consistent between these jobs (and not
  consistent at all with the file if we rerun the job), i.e. the test
  selection for each job can be different, which could result in missed tests
  in a test run.
- `scripts/benchmark_durations` doesn't fetch all tests from the database (it
  doesn't expect any extra directories inside `test_runner/performance`)
- For some reason, the current split into 4 groups ends up with the 4th group
  having no tests to run, which fails the job[1]

- [0] https://github.com/neondatabase/neon/pull/4683
- [1] https://github.com/neondatabase/neon/issues/6629

## Summary of changes

- Generate the `benchmark_durations.json` file once before we start the
  `benchmarks` jobs (this makes it consistent across the jobs) and pass the
  file content through a GitHub Actions input (this makes it consistent for
  reruns)
- `scripts/benchmark_durations`: fix the SQL query for getting all required
  tests
- Split benchmarks into 5 jobs instead of 4 jobs.
---
 .../actions/run-python-test-set/action.yml |   6 +-
 .github/workflows/build_and_test.yml       |  39 ++++-
 scripts/benchmark_durations.py             | 133 +++++++++---------
 3 files changed, 111 insertions(+), 67 deletions(-)

diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml
index 8dfa6c465f..7a88e4f73b 100644
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -44,6 +44,10 @@ inputs:
     description: 'Postgres version to use for tests'
     required: false
     default: 'v14'
+  benchmark_durations:
+    description: 'benchmark durations JSON'
+    required: false
+    default: '{}'

 runs:
   using: "composite"
@@ -160,7 +164,7 @@ runs:
        # We use pytest-split plugin to run benchmarks in parallel on different CI runners
        if [ "${TEST_SELECTION}" = "test_runner/performance" ] && [ "${{ inputs.build_type }}" != "remote" ]; then
          mkdir -p $TEST_OUTPUT
-          poetry run ./scripts/benchmark_durations.py "${TEST_RESULT_CONNSTR}" --days 10 --output "$TEST_OUTPUT/benchmark_durations.json"
+          echo '${{ inputs.benchmark_durations || '{}' }}' > $TEST_OUTPUT/benchmark_durations.json
          EXTRA_PARAMS="--durations-path $TEST_OUTPUT/benchmark_durations.json $EXTRA_PARAMS"
        fi

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 9fe9636d67..066f4a21eb 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -478,8 +478,40 @@ jobs:
         if: matrix.build_type == 'debug' && matrix.pg_version == 'v14'
         uses: ./.github/actions/save-coverage-data

+  get-benchmarks-durations:
+    outputs:
+      json: ${{ steps.get-benchmark-durations.outputs.json }}
+    needs: [ check-permissions, build-buildtools-image ]
+    runs-on: [ self-hosted, gen3, small ]
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      options: --init
+    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+
+      - name: Cache poetry deps
+        uses: actions/cache@v3
+        with:
+          path: ~/.cache/pypoetry/virtualenvs
+          key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
+
+      - name: Install Python deps
+        run: ./scripts/pysync
+
+      - name: get benchmark durations
+        id: get-benchmark-durations
+        env:
+          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
+        run: |
+          poetry run ./scripts/benchmark_durations.py "${TEST_RESULT_CONNSTR}" \
+            --days 10 \
+            --output /tmp/benchmark_durations.json
+          echo "json=$(jq --compact-output '.'
/tmp/benchmark_durations.json)" >> $GITHUB_OUTPUT + benchmarks: - needs: [ check-permissions, build-neon, build-buildtools-image ] + needs: [ check-permissions, build-neon, build-buildtools-image, get-benchmarks-durations ] runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} @@ -490,7 +522,7 @@ jobs: fail-fast: false matrix: # the amount of groups (N) should be reflected in `extra_params: --splits N ...` - pytest_split_group: [ 1, 2, 3, 4 ] + pytest_split_group: [ 1, 2, 3, 4, 5 ] build_type: [ release ] steps: - name: Checkout @@ -503,7 +535,8 @@ jobs: test_selection: performance run_in_parallel: false save_perf_report: ${{ github.ref_name == 'main' }} - extra_params: --splits 4 --group ${{ matrix.pytest_split_group }} + extra_params: --splits 5 --group ${{ matrix.pytest_split_group }} + benchmark_durations: ${{ needs.get-benchmarks-durations.outputs.json }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" diff --git a/scripts/benchmark_durations.py b/scripts/benchmark_durations.py index 7f05d72a03..01f34a1b96 100755 --- a/scripts/benchmark_durations.py +++ b/scripts/benchmark_durations.py @@ -20,7 +20,7 @@ BENCHMARKS_DURATION_QUERY = """ FROM results WHERE started_at > CURRENT_DATE - INTERVAL '%s' day - AND parent_suite = 'test_runner.performance' + AND starts_with(parent_suite, 'test_runner.performance') AND status = 'passed' GROUP BY parent_suite, suite, name @@ -31,68 +31,75 @@ BENCHMARKS_DURATION_QUERY = """ # the total duration varies from 8 to 40 minutes. # We use some pre-collected durations as a fallback to have a better distribution. FALLBACK_DURATION = { - "test_runner/performance/test_branch_creation.py::test_branch_creation_heavy_write[20]": 62.144, - "test_runner/performance/test_branch_creation.py::test_branch_creation_many[1024]": 90.941, - "test_runner/performance/test_branch_creation.py::test_branch_creation_many_relations": 26.053, - "test_runner/performance/test_branching.py::test_compare_child_and_root_pgbench_perf": 25.67, - "test_runner/performance/test_branching.py::test_compare_child_and_root_read_perf": 14.497, - "test_runner/performance/test_branching.py::test_compare_child_and_root_write_perf": 18.852, - "test_runner/performance/test_bulk_insert.py::test_bulk_insert[neon]": 26.572, - "test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]": 6.259, - "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[10]": 21.206, - "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[1]": 3.474, - "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[5]": 11.262, - "test_runner/performance/test_bulk_update.py::test_bulk_update[100]": 94.225, - "test_runner/performance/test_bulk_update.py::test_bulk_update[10]": 68.159, - "test_runner/performance/test_bulk_update.py::test_bulk_update[50]": 76.719, - "test_runner/performance/test_compaction.py::test_compaction": 110.222, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[neon-5-10-100]": 10.743, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[vanilla-5-10-100]": 16.541, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[neon-5-10-100]": 11.109, - 
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[vanilla-5-10-100]": 18.121, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[neon-5-10-100]": 11.3, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[vanilla-5-10-100]": 16.086, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-10]": 12.024, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-1]": 11.14, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-10]": 10.375, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-1]": 10.075, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[neon-5-10-100]": 11.147, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[vanilla-5-10-100]": 16.321, - "test_runner/performance/test_copy.py::test_copy[neon]": 16.579, - "test_runner/performance/test_copy.py::test_copy[vanilla]": 10.094, - "test_runner/performance/test_gc_feedback.py::test_gc_feedback": 590.157, - "test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 14.102, - "test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 8.677, - "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 31.079, - "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[vanilla-1]": 38.119, - "test_runner/performance/test_layer_map.py::test_layer_map": 24.784, - "test_runner/performance/test_logical_replication.py::test_logical_replication": 117.707, - "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[neon]": 21.194, - "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[vanilla]": 59.068, - "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[neon]": 73.235, - "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[vanilla]": 82.586, - "test_runner/performance/test_perf_pgbench.py::test_pgbench[neon-45-10]": 106.536, - "test_runner/performance/test_perf_pgbench.py::test_pgbench[vanilla-45-10]": 98.753, - "test_runner/performance/test_random_writes.py::test_random_writes[neon]": 6.975, - "test_runner/performance/test_random_writes.py::test_random_writes[vanilla]": 3.69, - "test_runner/performance/test_seqscans.py::test_seqscans[neon-100000-100-0]": 3.529, - "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-0]": 64.522, - "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-4]": 40.964, - "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-100000-100-0]": 0.55, - "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-0]": 12.189, - "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-4]": 13.899, - "test_runner/performance/test_startup.py::test_startup_simple": 2.51, - "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_off-10-5-5]": 527.245, - "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_on-10-5-5]": 583.46, - 
"test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[vanilla-10-5-5]": 113.653, - "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_off-1000]": 233.728, - "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_on-1000]": 419.093, - "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[vanilla-1000]": 982.461, - "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_off-45-100]": 116.522, - "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_on-45-100]": 115.583, - "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[vanilla-45-100]": 155.282, - "test_runner/performance/test_write_amplification.py::test_write_amplification[neon]": 26.704, - "test_runner/performance/test_write_amplification.py::test_write_amplification[vanilla]": 16.088, + "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[1-13-30]": 400.15, + "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[1-6-30]": 372.521, + "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[10-13-30]": 420.017, + "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[10-6-30]": 373.769, + "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[100-13-30]": 678.742, + "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[100-6-30]": 512.135, + "test_runner/performance/test_branch_creation.py::test_branch_creation_heavy_write[20]": 58.036, + "test_runner/performance/test_branch_creation.py::test_branch_creation_many_relations": 22.104, + "test_runner/performance/test_branch_creation.py::test_branch_creation_many[1024]": 126.073, + "test_runner/performance/test_branching.py::test_compare_child_and_root_pgbench_perf": 25.759, + "test_runner/performance/test_branching.py::test_compare_child_and_root_read_perf": 6.885, + "test_runner/performance/test_branching.py::test_compare_child_and_root_write_perf": 8.758, + "test_runner/performance/test_bulk_insert.py::test_bulk_insert[neon]": 18.275, + "test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]": 9.533, + "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[1]": 12.09, + "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[10]": 35.145, + "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[5]": 22.28, + "test_runner/performance/test_bulk_update.py::test_bulk_update[10]": 66.353, + "test_runner/performance/test_bulk_update.py::test_bulk_update[100]": 75.487, + "test_runner/performance/test_bulk_update.py::test_bulk_update[50]": 54.142, + "test_runner/performance/test_compaction.py::test_compaction": 110.715, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[neon-5-10-100]": 11.68, + 
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[vanilla-5-10-100]": 16.384, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[neon-5-10-100]": 11.315, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[vanilla-5-10-100]": 18.783, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[neon-5-10-100]": 11.647, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[vanilla-5-10-100]": 17.04, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-1]": 11.01, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-10]": 11.902, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-1]": 10.077, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-10]": 10.4, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[neon-5-10-100]": 11.33, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[vanilla-5-10-100]": 16.434, + "test_runner/performance/test_copy.py::test_copy[neon]": 13.817, + "test_runner/performance/test_copy.py::test_copy[vanilla]": 11.736, + "test_runner/performance/test_gc_feedback.py::test_gc_feedback": 575.735, + "test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 14.868, + "test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 14.393, + "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 20.588, + "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[vanilla-1]": 30.849, + "test_runner/performance/test_layer_map.py::test_layer_map": 39.378, + "test_runner/performance/test_lazy_startup.py::test_lazy_startup": 2848.938, + "test_runner/performance/test_logical_replication.py::test_logical_replication": 120.952, + "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[neon]": 35.552, + "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[vanilla]": 66.762, + "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[neon]": 85.177, + "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[vanilla]": 92.12, + "test_runner/performance/test_perf_pgbench.py::test_pgbench[neon-45-10]": 107.009, + "test_runner/performance/test_perf_pgbench.py::test_pgbench[vanilla-45-10]": 99.582, + "test_runner/performance/test_random_writes.py::test_random_writes[neon]": 4.737, + "test_runner/performance/test_random_writes.py::test_random_writes[vanilla]": 2.686, + "test_runner/performance/test_seqscans.py::test_seqscans[neon-100000-100-0]": 3.271, + "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-0]": 50.719, + "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-4]": 15.992, + "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-100000-100-0]": 0.566, + "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-0]": 13.542, + "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-4]": 13.35, + 
"test_runner/performance/test_startup.py::test_startup_simple": 13.043, + "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_off-10-5-5]": 194.841, + "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_on-10-5-5]": 286.667, + "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[vanilla-10-5-5]": 85.577, + "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_off-1000]": 297.626, + "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_on-1000]": 646.187, + "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[vanilla-1000]": 989.776, + "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_off-45-100]": 125.638, + "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_on-45-100]": 123.554, + "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[vanilla-45-100]": 190.083, + "test_runner/performance/test_write_amplification.py::test_write_amplification[neon]": 21.016, + "test_runner/performance/test_write_amplification.py::test_write_amplification[vanilla]": 23.028, } From dc811d19231273ff9ce3e235d34c45c0fd0d443a Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 6 Feb 2024 20:37:35 +0200 Subject: [PATCH 0103/1571] Add a span to 'create_neon_superuser' for better OpenTelemetry traces (#6644) create_neon_superuser runs the first queries in the database after cold start. Traces suggest that those first queries can make up a significant fraction of the cold start time. Make it more visible by adding an explict tracing span to it; currently you just have to deduce it by looking at the time spent in the parent 'apply_config' span subtracted by all the other child spans. --- compute_tools/src/compute.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 1976299e93..098e06cca9 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -207,6 +207,7 @@ fn maybe_cgexec(cmd: &str) -> Command { /// Create special neon_superuser role, that's a slightly nerfed version of a real superuser /// that we give to customers +#[instrument(skip_all)] fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()> { let roles = spec .cluster From 4f57dc6cc6ac69d9d342b8eb566237907dcff85b Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 6 Feb 2024 19:08:39 +0000 Subject: [PATCH 0104/1571] control_plane/attachment_service: take public key as value (#6651) It's awkward to point to a file when doing some kinds of ad-hoc deployment (like right now, when I'm hacking a helm chart having not quite hooked up secrets properly yet). We take all the rest of the secrets as CLI args directly, so let's do the same for public key. 
--- control_plane/attachment_service/src/main.rs | 6 ++-- control_plane/src/attachment_service.rs | 31 ++++++++++++++++---- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs index 7ac5918244..bc8a8786c2 100644 --- a/control_plane/attachment_service/src/main.rs +++ b/control_plane/attachment_service/src/main.rs @@ -34,9 +34,9 @@ struct Cli { #[arg(short, long)] listen: std::net::SocketAddr, - /// Path to public key for JWT authentication of clients + /// Public key for JWT authentication of clients #[arg(long)] - public_key: Option, + public_key: Option, /// Token for authenticating this service with the pageservers it controls #[arg(long)] @@ -159,7 +159,7 @@ impl Secrets { fn load_cli(database_url: &str, args: &Cli) -> anyhow::Result { let public_key = match &args.public_key { None => None, - Some(key_path) => Some(JwtAuth::from_key_path(key_path)?), + Some(key) => Some(JwtAuth::from_key(key.clone()).context("Loading public key")?), }; Ok(Self { database_url: database_url.to_owned(), diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/attachment_service.rs index 140e5c4e34..a3f832036c 100644 --- a/control_plane/src/attachment_service.rs +++ b/control_plane/src/attachment_service.rs @@ -28,7 +28,7 @@ pub struct AttachmentService { listen: String, path: Utf8PathBuf, jwt_token: Option, - public_key_path: Option, + public_key: Option, postgres_port: u16, client: reqwest::Client, } @@ -207,7 +207,7 @@ impl AttachmentService { .pageservers .first() .expect("Config is validated to contain at least one pageserver"); - let (jwt_token, public_key_path) = match ps_conf.http_auth_type { + let (jwt_token, public_key) = match ps_conf.http_auth_type { AuthType::Trust => (None, None), AuthType::NeonJWT => { let jwt_token = env @@ -219,7 +219,26 @@ impl AttachmentService { let public_key_path = camino::Utf8PathBuf::try_from(env.base_data_dir.join("auth_public_key.pem")) .unwrap(); - (Some(jwt_token), Some(public_key_path)) + + // This service takes keys as a string rather than as a path to a file/dir: read the key into memory. + let public_key = if std::fs::metadata(&public_key_path) + .expect("Can't stat public key") + .is_dir() + { + // Our config may specify a directory: this is for the pageserver's ability to handle multiple + // keys. We only use one key at a time, so, arbitrarily load the first one in the directory. 
+ let mut dir = + std::fs::read_dir(&public_key_path).expect("Can't readdir public key path"); + let dent = dir + .next() + .expect("Empty key dir") + .expect("Error reading key dir"); + + std::fs::read_to_string(dent.path()).expect("Can't read public key") + } else { + std::fs::read_to_string(&public_key_path).expect("Can't read public key") + }; + (Some(jwt_token), Some(public_key)) } }; @@ -228,7 +247,7 @@ impl AttachmentService { path, listen, jwt_token, - public_key_path, + public_key, postgres_port, client: reqwest::ClientBuilder::new() .build() @@ -453,8 +472,8 @@ impl AttachmentService { args.push(format!("--jwt-token={jwt_token}")); } - if let Some(public_key_path) = &self.public_key_path { - args.push(format!("--public-key={public_key_path}")); + if let Some(public_key) = &self.public_key { + args.push(format!("--public-key=\"{public_key}\"")); } if let Some(control_plane_compute_hook_api) = &self.env.control_plane_compute_hook_api { From f4cc7cae1412c14e49a795dc6a8d0ca21413affd Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 6 Feb 2024 20:30:43 +0000 Subject: [PATCH 0105/1571] CI(build-tools): Update Python from 3.9.2 to 3.9.18 (#6615) ## Problem We use an outdated version of Python (3.9.2) ## Summary of changes - Update Python to the latest patch version (3.9.18) - Unify the usage of python caches where possible --- .github/actions/allure-report-generate/action.yml | 6 ++++++ .github/actions/run-python-test-set/action.yml | 3 +-- .github/workflows/build_and_test.yml | 3 +-- .github/workflows/pg_clients.yml | 3 +-- Dockerfile.buildtools | 2 +- 5 files changed, 10 insertions(+), 7 deletions(-) diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index a33adf8bdd..f474dd3444 100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -179,6 +179,12 @@ runs: aws s3 rm "s3://${BUCKET}/${LOCK_FILE}" fi + - name: Cache poetry deps + uses: actions/cache@v3 + with: + path: ~/.cache/pypoetry/virtualenvs + key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} + - name: Store Allure test stat in the DB (new) if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }} shell: bash -euxo pipefail {0} diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 7a88e4f73b..8852a28da9 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -86,11 +86,10 @@ runs: fetch-depth: 1 - name: Cache poetry deps - id: cache_poetry uses: actions/cache@v3 with: path: ~/.cache/pypoetry/virtualenvs - key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} + key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} - name: Install Python deps shell: bash -euxo pipefail {0} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 066f4a21eb..f12f020634 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -112,11 +112,10 @@ jobs: fetch-depth: 1 - name: Cache poetry deps - id: cache_poetry uses: actions/cache@v3 with: path: ~/.cache/pypoetry/virtualenvs - key: v1-codestyle-python-deps-${{ hashFiles('poetry.lock') }} + key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} - name: Install Python deps run: ./scripts/pysync diff --git a/.github/workflows/pg_clients.yml b/.github/workflows/pg_clients.yml index 
224b7b4a6d..28016cadb1 100644 --- a/.github/workflows/pg_clients.yml +++ b/.github/workflows/pg_clients.yml @@ -38,11 +38,10 @@ jobs: uses: snok/install-poetry@v1 - name: Cache poetry deps - id: cache_poetry uses: actions/cache@v3 with: path: ~/.cache/pypoetry/virtualenvs - key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} + key: v2-${{ runner.os }}-python-deps-ubunutu-latest-${{ hashFiles('poetry.lock') }} - name: Install Python deps shell: bash -euxo pipefail {0} diff --git a/Dockerfile.buildtools b/Dockerfile.buildtools index 213aed1679..220e995d64 100644 --- a/Dockerfile.buildtools +++ b/Dockerfile.buildtools @@ -111,7 +111,7 @@ USER nonroot:nonroot WORKDIR /home/nonroot # Python -ENV PYTHON_VERSION=3.9.2 \ +ENV PYTHON_VERSION=3.9.18 \ PYENV_ROOT=/home/nonroot/.pyenv \ PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH RUN set -e \ From 9f75da7c0ac483e612b7382b0b050588c5587584 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 7 Feb 2024 00:31:26 +0000 Subject: [PATCH 0106/1571] test_lazy_startup: fix statement_timeout setting (#6654) ## Problem Test `test_lazy_startup` is flaky[0], sometimes (pretty frequently) it fails with `canceling statement due to statement timeout`. - [0] https://neon-github-public-dev.s3.amazonaws.com/reports/main/7803316870/index.html#suites/355b1a7a5b1e740b23ea53728913b4fa/7263782d30986c50/history ## Summary of changes - Fix setting `statement_timeout` setting by reusing a connection for all queries. - Also fix label (`lazy`, `eager`) assignment - Split `test_lazy_startup` into two, by `slru` laziness and make tests smaller --- test_runner/performance/test_lazy_startup.py | 143 +++++++++---------- 1 file changed, 69 insertions(+), 74 deletions(-) diff --git a/test_runner/performance/test_lazy_startup.py b/test_runner/performance/test_lazy_startup.py index 1a431e272e..e929bd4d05 100644 --- a/test_runner/performance/test_lazy_startup.py +++ b/test_runner/performance/test_lazy_startup.py @@ -26,86 +26,81 @@ from fixtures.neon_fixtures import NeonEnvBuilder # apply during config step, like more users, databases, or extensions. By default # we load extensions 'neon,pg_stat_statements,timescaledb,pg_cron', but in this # test we only load neon. 
-@pytest.mark.timeout(1000) -def test_lazy_startup(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker): +@pytest.mark.timeout(1800) +@pytest.mark.parametrize("slru", ["lazy", "eager"]) +def test_lazy_startup(slru: str, neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker): neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - lazy_tenant, _ = env.neon_cli.create_tenant( + lazy_slru_download = "true" if slru == "lazy" else "false" + tenant, _ = env.neon_cli.create_tenant( conf={ - "lazy_slru_download": "true", + "lazy_slru_download": lazy_slru_download, } ) - eager_tenant, _ = env.neon_cli.create_tenant( - conf={ - "lazy_slru_download": "false", - } - ) - tenants = [lazy_tenant, eager_tenant] - slru = "lazy" - for tenant in tenants: - endpoint = env.endpoints.create_start("main", tenant_id=tenant) - endpoint.safe_psql("CREATE TABLE t (pk integer PRIMARY KEY, x integer)") - endpoint.safe_psql("ALTER TABLE t SET (autovacuum_enabled = false)") - endpoint.safe_psql("INSERT INTO t VALUES (1, 0)") - endpoint.safe_psql( - """ - CREATE PROCEDURE updating() as - $$ - DECLARE - i integer; - BEGIN - FOR i IN 1..10000000 LOOP - UPDATE t SET x = x + 1 WHERE pk=1; - COMMIT; - END LOOP; - END - $$ LANGUAGE plpgsql - """ - ) - endpoint.safe_psql("SET statement_timeout=0") - endpoint.safe_psql("call updating()") + endpoint = env.endpoints.create_start("main", tenant_id=tenant) + with endpoint.cursor() as cur: + cur.execute("CREATE TABLE t (pk integer PRIMARY KEY, x integer)") + cur.execute("ALTER TABLE t SET (autovacuum_enabled = false)") + cur.execute("INSERT INTO t VALUES (1, 0)") + cur.execute( + """ + CREATE PROCEDURE updating() as + $$ + DECLARE + i integer; + BEGIN + FOR i IN 1..1000000 LOOP + UPDATE t SET x = x + 1 WHERE pk=1; + COMMIT; + END LOOP; + END + $$ LANGUAGE plpgsql + """ + ) + cur.execute("SET statement_timeout=0") + cur.execute("call updating()") + + endpoint.stop() + + # We do two iterations so we can see if the second startup is faster. It should + # be because the compute node should already be configured with roles, databases, + # extensions, etc from the first run. + for i in range(2): + # Start + with zenbenchmark.record_duration(f"{slru}_{i}_start"): + endpoint.start() + + with zenbenchmark.record_duration(f"{slru}_{i}_select"): + sum = endpoint.safe_psql("select sum(x) from t")[0][0] + assert sum == 1000000 + + # Get metrics + metrics = requests.get(f"http://localhost:{endpoint.http_port}/metrics.json").json() + durations = { + "wait_for_spec_ms": f"{slru}_{i}_wait_for_spec", + "sync_safekeepers_ms": f"{slru}_{i}_sync_safekeepers", + "sync_sk_check_ms": f"{slru}_{i}_sync_sk_check", + "basebackup_ms": f"{slru}_{i}_basebackup", + "start_postgres_ms": f"{slru}_{i}_start_postgres", + "config_ms": f"{slru}_{i}_config", + "total_startup_ms": f"{slru}_{i}_total_startup", + } + for key, name in durations.items(): + value = metrics[key] + zenbenchmark.record(name, value, "ms", report=MetricReport.LOWER_IS_BETTER) + + basebackup_bytes = metrics["basebackup_bytes"] + zenbenchmark.record( + f"{slru}_{i}_basebackup_bytes", + basebackup_bytes, + "bytes", + report=MetricReport.LOWER_IS_BETTER, + ) + + # Stop so we can restart endpoint.stop() - # We do two iterations so we can see if the second startup is faster. It should - # be because the compute node should already be configured with roles, databases, - # extensions, etc from the first run. 
- for i in range(2): - # Start - with zenbenchmark.record_duration(f"{slru}_{i}_start"): - endpoint.start() - - with zenbenchmark.record_duration(f"{slru}_{i}_select"): - sum = endpoint.safe_psql("select sum(x) from t")[0][0] - assert sum == 10000000 - - # Get metrics - metrics = requests.get(f"http://localhost:{endpoint.http_port}/metrics.json").json() - durations = { - "wait_for_spec_ms": f"{slru}_{i}_wait_for_spec", - "sync_safekeepers_ms": f"{slru}_{i}_sync_safekeepers", - "sync_sk_check_ms": f"{slru}_{i}_sync_sk_check", - "basebackup_ms": f"{slru}_{i}_basebackup", - "start_postgres_ms": f"{slru}_{i}_start_postgres", - "config_ms": f"{slru}_{i}_config", - "total_startup_ms": f"{slru}_{i}_total_startup", - } - for key, name in durations.items(): - value = metrics[key] - zenbenchmark.record(name, value, "ms", report=MetricReport.LOWER_IS_BETTER) - - basebackup_bytes = metrics["basebackup_bytes"] - zenbenchmark.record( - f"{slru}_{i}_basebackup_bytes", - basebackup_bytes, - "bytes", - report=MetricReport.LOWER_IS_BETTER, - ) - - # Stop so we can restart - endpoint.stop() - - # Imitate optimizations that console would do for the second start - endpoint.respec(skip_pg_catalog_updates=True) - slru = "eager" + # Imitate optimizations that console would do for the second start + endpoint.respec(skip_pg_catalog_updates=True) From f3d7d2380566948d5bf7250c32c1e11ef5099ab3 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 7 Feb 2024 08:47:19 +0200 Subject: [PATCH 0107/1571] Some small WAL records can write a lot of data to KV storage, so perform checkpoint check more frequently (#6639) ## Problem See https://neondb.slack.com/archives/C04DGM6SMTM/p1707149618314539?thread_ts=1707081520.140049&cid=C04DGM6SMTM ## Summary of changes Perform checkpoint check after processing `ingest_batch_size` (default 100) WAL records. ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik --- .../walreceiver/walreceiver_connection.rs | 17 +++++ test_runner/regress/test_layer_bloating.py | 66 +++++++++++++++++++ 2 files changed, 83 insertions(+) create mode 100644 test_runner/regress/test_layer_bloating.py diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 73eb42bb30..9cb53f46d1 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -343,6 +343,23 @@ pub(super) async fn handle_walreceiver_connection( modification.commit(&ctx).await?; uncommitted_records = 0; filtered_records = 0; + + // + // We should check checkpoint distance after appending each ingest_batch_size bytes because otherwise + // layer size can become much larger than `checkpoint_distance`. + // It can append because wal-sender is sending WAL using 125kb chucks and some WAL records can cause writing large + // amount of data to key-value storage. So performing this check only after processing + // all WAL records in the chunk, can cause huge L0 layer files. 
+ // + timeline + .check_checkpoint_distance() + .await + .with_context(|| { + format!( + "Failed to check checkpoint distance for timeline {}", + timeline.timeline_id + ) + })?; } } diff --git a/test_runner/regress/test_layer_bloating.py b/test_runner/regress/test_layer_bloating.py new file mode 100644 index 0000000000..70b115ad61 --- /dev/null +++ b/test_runner/regress/test_layer_bloating.py @@ -0,0 +1,66 @@ +import os +import time + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnv, + logical_replication_sync, +) +from fixtures.pg_version import PgVersion + + +def test_layer_bloating(neon_simple_env: NeonEnv, vanilla_pg): + env = neon_simple_env + + if env.pg_version != PgVersion.V16: + pytest.skip("pg_log_standby_snapshot() function is available only in PG16") + + timeline = env.neon_cli.create_branch("test_logical_replication", "empty") + endpoint = env.endpoints.create_start( + "test_logical_replication", config_lines=["log_statement=all"] + ) + + log.info("postgres is running on 'test_logical_replication' branch") + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + # create table... + cur.execute("create table t(pk integer primary key)") + cur.execute("create publication pub1 for table t") + # Create slot to hold WAL + cur.execute("select pg_create_logical_replication_slot('my_slot', 'pgoutput')") + + # now start subscriber + vanilla_pg.start() + vanilla_pg.safe_psql("create table t(pk integer primary key)") + + connstr = endpoint.connstr().replace("'", "''") + log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}") + vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") + + cur.execute( + """create or replace function create_snapshots(n integer) returns void as $$ + declare + i integer; + begin + for i in 1..n loop + perform pg_log_standby_snapshot(); + end loop; + end; $$ language plpgsql""" + ) + cur.execute("set statement_timeout=0") + cur.execute("select create_snapshots(10000)") + # Wait logical replication to sync + logical_replication_sync(vanilla_pg, endpoint) + time.sleep(10) + + # Check layer file sizes + timeline_path = "{}/tenants/{}/timelines/{}/".format( + env.pageserver.workdir, env.initial_tenant, timeline + ) + log.info(f"Check {timeline_path}") + for filename in os.listdir(timeline_path): + if filename.startswith("00000"): + log.info(f"layer {filename} size is {os.path.getsize(timeline_path + filename)}") + assert os.path.getsize(timeline_path + filename) < 512_000_000 From f7516df6c155162aa2d935adadf95524379e0a58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 7 Feb 2024 12:56:53 +0100 Subject: [PATCH 0108/1571] Pass timestamp as a datetime (#6656) This saves some repetition. I did this in #6533 for `tenant_time_travel_remote_storage` already. 
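
For illustration only (not part of the diff below), this is the call-site shape the change enables; `client`, `tenant_id`, and `timeline_id` stand in for the usual `PageserverHttpClient` fixture objects:

```python
from datetime import datetime, timedelta

from fixtures.log_helper import log

# The fixture now appends the `f"{timestamp.isoformat()}Z"` formatting itself,
# so callers simply pass a naive UTC datetime instead of a pre-formatted string.
probe_timestamp = datetime.utcnow() - timedelta(minutes=5)
result = client.timeline_get_lsn_by_timestamp(tenant_id, timeline_id, probe_timestamp)
log.info(f"kind={result['kind']} lsn={result['lsn']}")
```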
--- test_runner/fixtures/pageserver/http.py | 4 ++-- test_runner/regress/test_lsn_mapping.py | 16 ++++------------ 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 92e5027a9f..adea9ca764 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -563,13 +563,13 @@ class PageserverHttpClient(requests.Session): self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, - timestamp, + timestamp: datetime, ): log.info( f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}" ) res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp}", + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp.isoformat()}Z", ) self.verbose_error(res) res_json = res.json() diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index 9788e8c0d7..50d7c74af0 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -64,18 +64,14 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): # Check edge cases # Timestamp is in the future probe_timestamp = tbl[-1][1] + timedelta(hours=1) - result = client.timeline_get_lsn_by_timestamp( - tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z" - ) + result = client.timeline_get_lsn_by_timestamp(tenant_id, timeline_id, probe_timestamp) assert result["kind"] == "future" # make sure that we return a well advanced lsn here assert Lsn(result["lsn"]) > start_lsn # Timestamp is in the unreachable past probe_timestamp = tbl[0][1] - timedelta(hours=10) - result = client.timeline_get_lsn_by_timestamp( - tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z" - ) + result = client.timeline_get_lsn_by_timestamp(tenant_id, timeline_id, probe_timestamp) assert result["kind"] == "past" # make sure that we return the minimum lsn here at the start of the range assert Lsn(result["lsn"]) < start_lsn @@ -83,9 +79,7 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): # Probe a bunch of timestamps in the valid range for i in range(1, len(tbl), 100): probe_timestamp = tbl[i][1] - result = client.timeline_get_lsn_by_timestamp( - tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z" - ) + result = client.timeline_get_lsn_by_timestamp(tenant_id, timeline_id, probe_timestamp) assert result["kind"] not in ["past", "nodata"] lsn = result["lsn"] # Call get_lsn_by_timestamp to get the LSN @@ -108,9 +102,7 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): # Timestamp is in the unreachable past probe_timestamp = tbl[0][1] - timedelta(hours=10) - result = client.timeline_get_lsn_by_timestamp( - tenant_id, timeline_id_child, f"{probe_timestamp.isoformat()}Z" - ) + result = client.timeline_get_lsn_by_timestamp(tenant_id, timeline_id_child, probe_timestamp) assert result["kind"] == "past" # make sure that we return the minimum lsn here at the start of the range assert Lsn(result["lsn"]) >= last_flush_lsn From 3d4fe205ba260c6cd878bf8d0c19623d45920e4f Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 7 Feb 2024 13:08:09 +0000 Subject: [PATCH 0109/1571] control_plane/attachment_service: database connection pool (#6622) ## Problem This is mainly to limit our concurrency, rather than to speed up requests (I was doing some sanity checks on performance of the service with 
thousands of shards) ## Summary of changes - Enable the `diesel:r2d2` feature, which provides an async connection pool - Acquire a connection before entering spawn_blocking for a database transaction (recall that diesel's interface is sync) - Set a connection pool size of 99 to fit within default postgres limit (100) - Also set the tokio blocking thread count to accomodate the same number of blocking tasks (the only thing we use spawn_blocking for is database calls). --- Cargo.lock | 23 +++++++++++ control_plane/attachment_service/Cargo.toml | 3 +- control_plane/attachment_service/src/main.rs | 15 ++++++- .../attachment_service/src/persistence.rs | 41 ++++++++++++++----- .../attachment_service/src/service.rs | 4 +- workspace_hack/Cargo.toml | 3 +- 6 files changed, 74 insertions(+), 15 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b2b2777408..a25725f90d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -289,6 +289,7 @@ dependencies = [ "pageserver_api", "pageserver_client", "postgres_connection", + "r2d2", "reqwest", "serde", "serde_json", @@ -1651,6 +1652,7 @@ dependencies = [ "diesel_derives", "itoa", "pq-sys", + "r2d2", "serde_json", ] @@ -4166,6 +4168,17 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "r2d2" +version = "0.8.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51de85fb3fb6524929c8a2eb85e6b6d363de4e8c48f9e2c2eac4944abc181c93" +dependencies = [ + "log", + "parking_lot 0.12.1", + "scheduled-thread-pool", +] + [[package]] name = "rand" version = "0.7.3" @@ -4879,6 +4892,15 @@ dependencies = [ "windows-sys 0.42.0", ] +[[package]] +name = "scheduled-thread-pool" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3cbc66816425a074528352f5789333ecff06ca41b36b0b0efdfbb29edc391a19" +dependencies = [ + "parking_lot 0.12.1", +] + [[package]] name = "scopeguard" version = "1.1.0" @@ -6807,6 +6829,7 @@ dependencies = [ "clap_builder", "crossbeam-utils", "diesel", + "diesel_derives", "either", "fail", "futures-channel", diff --git a/control_plane/attachment_service/Cargo.toml b/control_plane/attachment_service/Cargo.toml index 3a65153c41..0b93211dbc 100644 --- a/control_plane/attachment_service/Cargo.toml +++ b/control_plane/attachment_service/Cargo.toml @@ -24,8 +24,9 @@ tokio.workspace = true tokio-util.workspace = true tracing.workspace = true -diesel = { version = "2.1.4", features = ["serde_json", "postgres"] } +diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] } diesel_migrations = { version = "2.1.0" } +r2d2 = { version = "0.8.10" } utils = { path = "../../libs/utils/" } metrics = { path = "../../libs/metrics/" } diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs index bc8a8786c2..7229a2517b 100644 --- a/control_plane/attachment_service/src/main.rs +++ b/control_plane/attachment_service/src/main.rs @@ -170,6 +170,7 @@ impl Secrets { } } +/// Execute the diesel migrations that are built into this binary async fn migration_run(database_url: &str) -> anyhow::Result<()> { use diesel::PgConnection; use diesel_migrations::{HarnessWithOutput, MigrationHarness}; @@ -183,8 +184,18 @@ async fn migration_run(database_url: &str) -> anyhow::Result<()> { Ok(()) } -#[tokio::main] -async fn main() -> anyhow::Result<()> { +fn main() -> anyhow::Result<()> { + tokio::runtime::Builder::new_current_thread() + // We use spawn_blocking for database operations, so require approximately + // as many blocking threads as we will open 
database connections. + .max_blocking_threads(Persistence::MAX_CONNECTIONS as usize) + .enable_all() + .build() + .unwrap() + .block_on(async_main()) +} + +async fn async_main() -> anyhow::Result<()> { let launch_ts = Box::leak(Box::new(LaunchTimestamp::generate())); logging::init( diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs index 574441c409..db487bcec6 100644 --- a/control_plane/attachment_service/src/persistence.rs +++ b/control_plane/attachment_service/src/persistence.rs @@ -1,5 +1,6 @@ use std::collections::HashMap; use std::str::FromStr; +use std::time::Duration; use camino::Utf8Path; use camino::Utf8PathBuf; @@ -44,7 +45,7 @@ use crate::PlacementPolicy; /// updated, and reads of nodes are always from memory, not the database. We only require that /// we can UPDATE a node's scheduling mode reasonably quickly to mark a bad node offline. pub struct Persistence { - database_url: String, + connection_pool: diesel::r2d2::Pool>, // In test environments, we support loading+saving a JSON file. This is temporary, for the benefit of // test_compatibility.py, so that we don't have to commit to making the database contents fully backward/forward @@ -64,6 +65,8 @@ pub(crate) enum DatabaseError { Query(#[from] diesel::result::Error), #[error(transparent)] Connection(#[from] diesel::result::ConnectionError), + #[error(transparent)] + ConnectionPool(#[from] r2d2::Error), #[error("Logical error: {0}")] Logical(String), } @@ -71,9 +74,31 @@ pub(crate) enum DatabaseError { pub(crate) type DatabaseResult = Result; impl Persistence { + // The default postgres connection limit is 100. We use up to 99, to leave one free for a human admin under + // normal circumstances. This assumes we have exclusive use of the database cluster to which we connect. + pub const MAX_CONNECTIONS: u32 = 99; + + // We don't want to keep a lot of connections alive: close them down promptly if they aren't being used. + const IDLE_CONNECTION_TIMEOUT: Duration = Duration::from_secs(10); + const MAX_CONNECTION_LIFETIME: Duration = Duration::from_secs(60); + pub fn new(database_url: String, json_path: Option) -> Self { + let manager = diesel::r2d2::ConnectionManager::::new(database_url); + + // We will use a connection pool: this is primarily to _limit_ our connection count, rather than to optimize time + // to execute queries (database queries are not generally on latency-sensitive paths). 
+ let connection_pool = diesel::r2d2::Pool::builder() + .max_size(Self::MAX_CONNECTIONS) + .max_lifetime(Some(Self::MAX_CONNECTION_LIFETIME)) + .idle_timeout(Some(Self::IDLE_CONNECTION_TIMEOUT)) + // Always keep at least one connection ready to go + .min_idle(Some(1)) + .test_on_check_out(true) + .build(manager) + .expect("Could not build connection pool"); + Self { - database_url, + connection_pool, json_path, } } @@ -84,14 +109,10 @@ impl Persistence { F: Fn(&mut PgConnection) -> DatabaseResult + Send + 'static, R: Send + 'static, { - let database_url = self.database_url.clone(); - tokio::task::spawn_blocking(move || -> DatabaseResult { - // TODO: connection pooling, such as via diesel::r2d2 - let mut conn = PgConnection::establish(&database_url)?; - func(&mut conn) - }) - .await - .expect("Task panic") + let mut conn = self.connection_pool.get()?; + tokio::task::spawn_blocking(move || -> DatabaseResult { func(&mut conn) }) + .await + .expect("Task panic") } /// When a node is first registered, persist it before using it for anything diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index 6f0e3ebb74..febee1aa0d 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -103,7 +103,9 @@ impl From for ApiError { match err { DatabaseError::Query(e) => ApiError::InternalServerError(e.into()), // FIXME: ApiError doesn't have an Unavailable variant, but ShuttingDown maps to 503. - DatabaseError::Connection(_e) => ApiError::ShuttingDown, + DatabaseError::Connection(_) | DatabaseError::ConnectionPool(_) => { + ApiError::ShuttingDown + } DatabaseError::Logical(reason) => { ApiError::InternalServerError(anyhow::anyhow!(reason)) } diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 74464dd4c8..70b238913d 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -29,7 +29,7 @@ chrono = { version = "0.4", default-features = false, features = ["clock", "serd clap = { version = "4", features = ["derive", "string"] } clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] } crossbeam-utils = { version = "0.8" } -diesel = { version = "2", features = ["postgres", "serde_json"] } +diesel = { version = "2", features = ["postgres", "r2d2", "serde_json"] } either = { version = "1" } fail = { version = "0.5", default-features = false, features = ["failpoints"] } futures-channel = { version = "0.3", features = ["sink"] } @@ -90,6 +90,7 @@ anyhow = { version = "1", features = ["backtrace"] } bytes = { version = "1", features = ["serde"] } cc = { version = "1", default-features = false, features = ["parallel"] } chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] } +diesel_derives = { version = "2", features = ["32-column-tables", "postgres", "r2d2", "with-deprecated"] } either = { version = "1" } getrandom = { version = "0.2", default-features = false, features = ["std"] } hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", default-features = false, features = ["raw"] } From 090a789408e4bd95656132248bdbcbdba0fd3c4a Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 7 Feb 2024 13:24:10 +0000 Subject: [PATCH 0110/1571] storage controller: use PUT instead of POST (#6659) This was a typo, the server expects PUT. 
--- control_plane/attachment_service/src/compute_hook.rs | 2 +- test_runner/regress/test_sharding_service.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/control_plane/attachment_service/src/compute_hook.rs b/control_plane/attachment_service/src/compute_hook.rs index 4ca26431ca..0d3610aafa 100644 --- a/control_plane/attachment_service/src/compute_hook.rs +++ b/control_plane/attachment_service/src/compute_hook.rs @@ -170,7 +170,7 @@ impl ComputeHook { reconfigure_request: &ComputeHookNotifyRequest, cancel: &CancellationToken, ) -> Result<(), NotifyError> { - let req = client.request(Method::POST, url); + let req = client.request(Method::PUT, url); let req = if let Some(value) = &self.authorization_header { req.header(reqwest::header::AUTHORIZATION, value) } else { diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py index ee57fcb2cf..fd811a9d02 100644 --- a/test_runner/regress/test_sharding_service.py +++ b/test_runner/regress/test_sharding_service.py @@ -310,7 +310,7 @@ def test_sharding_service_compute_hook( notifications.append(request.json) return Response(status=200) - httpserver.expect_request("/notify", method="POST").respond_with_handler(handler) + httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler) # Start running env = neon_env_builder.init_start() From 75f1a01d4aba488012c9fd86b56b6dcf46726c92 Mon Sep 17 00:00:00 2001 From: Abhijeet Patil Date: Wed, 7 Feb 2024 16:14:10 +0000 Subject: [PATCH 0111/1571] Optimise e2e run (#6513) ## Problem We have finite amount of runners and intermediate results are often wanted before a PR is ready for merging. Currently all PRs get e2e tests run and this creates a lot of throwaway e2e results which may or may not get to start or complete before a new push. ## Summary of changes 1. Skip e2e test when PR is in draft mode 2. Run e2e when PR status changes from draft to ready for review (change this to having its trigger in below PR and update results of build and test) 3. Abstract e2e test in a Separate workflow and call it from the main workflow for the e2e test 5. Add a label, if that label is present run e2e test in draft (run-e2e-test-in-draft) 6. Auto add a label(approve to ci) so that all the external contributors PR , e2e run in draft 7. Document the new label changes and the above behaviour Draft PR : https://github.com/neondatabase/neon/actions/runs/7729128470 Ready To Review : https://github.com/neondatabase/neon/actions/runs/7733779916 Draft PR with label : https://github.com/neondatabase/neon/actions/runs/7725691012/job/21062432342 and https://github.com/neondatabase/neon/actions/runs/7733854028 ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Alexander Bayandin --- .github/workflows/approved-for-ci-run.yml | 1 + .github/workflows/build_and_test.yml | 48 +-------- .github/workflows/trigger-e2e-tests.yml | 118 ++++++++++++++++++++++ CONTRIBUTING.md | 3 + 4 files changed, 126 insertions(+), 44 deletions(-) create mode 100644 .github/workflows/trigger-e2e-tests.yml diff --git a/.github/workflows/approved-for-ci-run.yml b/.github/workflows/approved-for-ci-run.yml index 5b21011b83..ae2f173b47 100644 --- a/.github/workflows/approved-for-ci-run.yml +++ b/.github/workflows/approved-for-ci-run.yml @@ -93,6 +93,7 @@ jobs: --body-file "body.md" \ --head "${BRANCH}" \ --base "main" \ + --label "run-e2e-tests-in-draft" \ --draft fi diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index f12f020634..078916e1ea 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -22,7 +22,7 @@ env: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} # A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix - E2E_CONCURRENCY_GROUP: ${{ github.repository }}-${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} + E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} jobs: check-permissions: @@ -692,50 +692,10 @@ jobs: }) trigger-e2e-tests: + if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' }} needs: [ check-permissions, promote-images, tag ] - runs-on: [ self-hosted, gen3, small ] - container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned - options: --init - steps: - - name: Set PR's status to pending and request a remote CI test - run: | - # For pull requests, GH Actions set "github.sha" variable to point at a fake merge commit - # but we need to use a real sha of a latest commit in the PR's branch for the e2e job, - # to place a job run status update later. 
- COMMIT_SHA=${{ github.event.pull_request.head.sha }} - # For non-PR kinds of runs, the above will produce an empty variable, pick the original sha value for those - COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}} - - REMOTE_REPO="${{ github.repository_owner }}/cloud" - - curl -f -X POST \ - https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \ - -H "Accept: application/vnd.github.v3+json" \ - --user "${{ secrets.CI_ACCESS_TOKEN }}" \ - --data \ - "{ - \"state\": \"pending\", - \"context\": \"neon-cloud-e2e\", - \"description\": \"[$REMOTE_REPO] Remote CI job is about to start\" - }" - - curl -f -X POST \ - https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \ - -H "Accept: application/vnd.github.v3+json" \ - --user "${{ secrets.CI_ACCESS_TOKEN }}" \ - --data \ - "{ - \"ref\": \"main\", - \"inputs\": { - \"ci_job_name\": \"neon-cloud-e2e\", - \"commit_hash\": \"$COMMIT_SHA\", - \"remote_repo\": \"${{ github.repository }}\", - \"storage_image_tag\": \"${{ needs.tag.outputs.build-tag }}\", - \"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\", - \"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\" - } - }" + uses: ./.github/workflows/trigger-e2e-tests.yml + secrets: inherit neon-image: needs: [ check-permissions, build-buildtools-image, tag ] diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml new file mode 100644 index 0000000000..2776033805 --- /dev/null +++ b/.github/workflows/trigger-e2e-tests.yml @@ -0,0 +1,118 @@ +name: Trigger E2E Tests + +on: + pull_request: + types: + - ready_for_review + workflow_call: + +defaults: + run: + shell: bash -euxo pipefail {0} + +env: + # A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix + E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} + +jobs: + cancel-previous-e2e-tests: + if: github.event_name == 'pull_request' + runs-on: ubuntu-latest + + steps: + - name: Cancel previous e2e-tests runs for this PR + env: + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} + run: | + gh workflow --repo neondatabase/cloud \ + run cancel-previous-in-concurrency-group.yml \ + --field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}" + + tag: + runs-on: [ ubuntu-latest ] + outputs: + build-tag: ${{ steps.build-tag.outputs.tag }} + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Get build tag + env: + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} + CURRENT_BRANCH: ${{ github.head_ref || github.ref_name }} + CURRENT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} + run: | + if [[ "$GITHUB_REF_NAME" == "main" ]]; then + echo "tag=$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT + elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + echo "tag=release-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT + else + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" + BUILD_AND_TEST_RUN_ID=$(gh run list -b $CURRENT_BRANCH -c $CURRENT_SHA -w 'Build and Test' -L 1 --json databaseId --jq '.[].databaseId') + echo "tag=$BUILD_AND_TEST_RUN_ID" | tee -a $GITHUB_OUTPUT + fi + id: build-tag + + trigger-e2e-tests: + needs: [ tag ] + runs-on: [ self-hosted, gen3, small ] + env: + TAG: ${{ needs.tag.outputs.build-tag }} 
+ container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned + options: --init + steps: + - name: check if ecr image are present + run: | + for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do + OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text) + if [ "$OUTPUT" == "" ]; then + echo "$REPO with image tag $TAG not found" >> $GITHUB_OUTPUT + exit 1 + fi + done + + - name: Set PR's status to pending and request a remote CI test + run: | + # For pull requests, GH Actions set "github.sha" variable to point at a fake merge commit + # but we need to use a real sha of a latest commit in the PR's branch for the e2e job, + # to place a job run status update later. + COMMIT_SHA=${{ github.event.pull_request.head.sha }} + # For non-PR kinds of runs, the above will produce an empty variable, pick the original sha value for those + COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}} + + REMOTE_REPO="${{ github.repository_owner }}/cloud" + + curl -f -X POST \ + https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \ + -H "Accept: application/vnd.github.v3+json" \ + --user "${{ secrets.CI_ACCESS_TOKEN }}" \ + --data \ + "{ + \"state\": \"pending\", + \"context\": \"neon-cloud-e2e\", + \"description\": \"[$REMOTE_REPO] Remote CI job is about to start\" + }" + + curl -f -X POST \ + https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \ + -H "Accept: application/vnd.github.v3+json" \ + --user "${{ secrets.CI_ACCESS_TOKEN }}" \ + --data \ + "{ + \"ref\": \"main\", + \"inputs\": { + \"ci_job_name\": \"neon-cloud-e2e\", + \"commit_hash\": \"$COMMIT_SHA\", + \"remote_repo\": \"${{ github.repository }}\", + \"storage_image_tag\": \"${TAG}\", + \"compute_image_tag\": \"${TAG}\", + \"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\" + } + }" + \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7e177693fa..2e447fba47 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -54,6 +54,9 @@ _An instruction for maintainers_ - If and only if it looks **safe** (i.e. it doesn't contain any malicious code which could expose secrets or harm the CI), then: - Press the "Approve and run" button in GitHub UI - Add the `approved-for-ci-run` label to the PR + - Currently draft PR will skip e2e test (only for internal contributors). After turning the PR 'Ready to Review' CI will trigger e2e test + - Add `run-e2e-tests-in-draft` label to run e2e test in draft PR (override above behaviour) + - The `approved-for-ci-run` workflow will add `run-e2e-tests-in-draft` automatically to run e2e test for external contributors Repeat all steps after any change to the PR. 
- When the changes are ready to get merged — merge the original PR (not the internal one) From 7b49e5e5c334bc8d07232f385d08e370ba85fb5a Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Wed, 7 Feb 2024 07:55:55 -0900 Subject: [PATCH 0112/1571] Remove compute migrations feature flag (#6653) --- compute_tools/src/compute.rs | 11 +++++------ libs/compute_api/src/spec.rs | 3 --- test_runner/fixtures/neon_fixtures.py | 5 +---- test_runner/regress/test_migrations.py | 2 +- test_runner/regress/test_neon_superuser.py | 4 ++-- 5 files changed, 9 insertions(+), 16 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 098e06cca9..0ca1a47fbf 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -773,12 +773,11 @@ impl ComputeNode { // 'Close' connection drop(client); - if self.has_feature(ComputeFeature::Migrations) { - thread::spawn(move || { - let mut client = Client::connect(connstr.as_str(), NoTls)?; - handle_migrations(&mut client) - }); - } + // Run migrations separately to not hold up cold starts + thread::spawn(move || { + let mut client = Client::connect(connstr.as_str(), NoTls)?; + handle_migrations(&mut client) + }); Ok(()) } diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 5361d14004..13ac18e0c5 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -90,9 +90,6 @@ pub enum ComputeFeature { /// track short-lived connections as user activity. ActivityMonitorExperimental, - /// Enable running migrations - Migrations, - /// This is a special feature flag that is used to represent unknown feature flags. /// Basically all unknown to enum flags are represented as this one. See unit test /// `parse_unknown_features()` for more details. diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index bf7c6ccc14..4491655aeb 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3131,10 +3131,7 @@ class Endpoint(PgProtocol): log.info(json.dumps(dict(data_dict, **kwargs))) json.dump(dict(data_dict, **kwargs), file, indent=4) - # Please note: if you didn't respec this endpoint to have the `migrations` - # feature, this function will probably fail because neon_migration.migration_id - # won't exist. This is temporary - soon we'll get rid of the feature flag and - # migrations will be enabled for everyone. 
+ # Please note: Migrations only run if pg_skip_catalog_updates is false def wait_for_migrations(self): with self.cursor() as cur: diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py index 30dd54a8c1..8954810451 100644 --- a/test_runner/regress/test_migrations.py +++ b/test_runner/regress/test_migrations.py @@ -10,7 +10,7 @@ def test_migrations(neon_simple_env: NeonEnv): endpoint = env.endpoints.create("test_migrations") log_path = endpoint.endpoint_path() / "compute.log" - endpoint.respec(skip_pg_catalog_updates=False, features=["migrations"]) + endpoint.respec(skip_pg_catalog_updates=False) endpoint.start() endpoint.wait_for_migrations() diff --git a/test_runner/regress/test_neon_superuser.py b/test_runner/regress/test_neon_superuser.py index eff2cadabf..34f1e64b34 100644 --- a/test_runner/regress/test_neon_superuser.py +++ b/test_runner/regress/test_neon_superuser.py @@ -12,10 +12,10 @@ def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion): env.neon_cli.create_branch("test_neon_superuser_subscriber") sub = env.endpoints.create("test_neon_superuser_subscriber") - pub.respec(skip_pg_catalog_updates=False, features=["migrations"]) + pub.respec(skip_pg_catalog_updates=False) pub.start() - sub.respec(skip_pg_catalog_updates=False, features=["migrations"]) + sub.respec(skip_pg_catalog_updates=False) sub.start() pub.wait_for_migrations() From 51f9385b1bd60f3152a580332ba4b19ec131f89a Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 7 Feb 2024 18:47:55 +0100 Subject: [PATCH 0113/1571] live-reconfigurable virtual_file::IoEngine (#6552) This PR adds an API to live-reconfigure the VirtualFile io engine. It also adds a flag to `pagebench get-page-latest-lsn`, which is where I found this functionality to be useful: it helps compare the io engines in a benchmark without re-compiling a release build, which took ~50s on the i3en.3xlarge where I was doing the benchmark. Switching the IO engine is completely safe at runtime. 
--- libs/pageserver_api/src/models.rs | 21 +++ pageserver/client/src/mgmt_api.rs | 12 ++ pageserver/ctl/src/layer_map_analyzer.rs | 2 +- pageserver/ctl/src/layers.rs | 4 +- pageserver/ctl/src/main.rs | 2 +- .../pagebench/src/cmd/getpage_latest_lsn.rs | 8 ++ pageserver/src/http/routes.rs | 10 ++ pageserver/src/virtual_file.rs | 5 +- pageserver/src/virtual_file/io_engine.rs | 130 +++++++++++------- pageserver/src/virtual_file/open_options.rs | 7 +- 10 files changed, 144 insertions(+), 57 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 5a638df9cc..c08cacb822 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -649,6 +649,27 @@ pub struct WalRedoManagerStatus { pub pid: Option, } +pub mod virtual_file { + #[derive( + Copy, + Clone, + PartialEq, + Eq, + Hash, + strum_macros::EnumString, + strum_macros::Display, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, + Debug, + )] + #[strum(serialize_all = "kebab-case")] + pub enum IoEngineKind { + StdFs, + #[cfg(target_os = "linux")] + TokioEpollUring, + } +} + // Wrapped in libpq CopyData #[derive(PartialEq, Eq, Debug)] pub enum PagestreamFeMessage { diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 91b9afa026..8abe58e1a2 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -339,4 +339,16 @@ impl Client { .await .map_err(Error::ReceiveBody) } + + pub async fn put_io_engine( + &self, + engine: &pageserver_api::models::virtual_file::IoEngineKind, + ) -> Result<()> { + let uri = format!("{}/v1/io_engine", self.mgmt_api_endpoint); + self.request(Method::PUT, uri, engine) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } } diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs index eb5c3f15cf..42c4e9ff48 100644 --- a/pageserver/ctl/src/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -142,7 +142,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> { let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); // Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree. 
- pageserver::virtual_file::init(10, virtual_file::IoEngineKind::StdFs); + pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); pageserver::page_cache::init(100); let mut total_delta_layers = 0usize; diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index dbbcfedac0..27efa6d028 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -59,7 +59,7 @@ pub(crate) enum LayerCmd { async fn read_delta_file(path: impl AsRef, ctx: &RequestContext) -> Result<()> { let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path"); - virtual_file::init(10, virtual_file::IoEngineKind::StdFs); + virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); page_cache::init(100); let file = FileBlockReader::new(VirtualFile::open(path).await?); let summary_blk = file.read_blk(0, ctx).await?; @@ -187,7 +187,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { new_tenant_id, new_timeline_id, } => { - pageserver::virtual_file::init(10, virtual_file::IoEngineKind::StdFs); + pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); pageserver::page_cache::init(100); let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index 3c90933fe9..e73d961e36 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -123,7 +123,7 @@ fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> { async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> { // Basic initialization of things that don't change after startup - virtual_file::init(10, virtual_file::IoEngineKind::StdFs); + virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); page_cache::init(100); let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); dump_layerfile_from_path(path, true, &ctx).await diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index aa809d8d26..647f571e59 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -51,6 +51,10 @@ pub(crate) struct Args { /// It doesn't get invalidated if the keyspace changes under the hood, e.g., due to new ingested data or compaction. #[clap(long)] keyspace_cache: Option, + /// Before starting the benchmark, live-reconfigure the pageserver to use the given + /// [`pageserver_api::models::virtual_file::IoEngineKind`]. + #[clap(long)] + set_io_engine: Option, targets: Option>, } @@ -109,6 +113,10 @@ async fn main_impl( args.pageserver_jwt.as_deref(), )); + if let Some(engine_str) = &args.set_io_engine { + mgmt_api_client.put_io_engine(engine_str).await?; + } + // discover targets let timelines: Vec = crate::util::cli::targets::discover( &mgmt_api_client, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 792089ebe7..ebcb27fa08 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1908,6 +1908,15 @@ async fn post_tracing_event_handler( json_response(StatusCode::OK, ()) } +async fn put_io_engine_handler( + mut r: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let kind: crate::virtual_file::IoEngineKind = json_request(&mut r).await?; + crate::virtual_file::io_engine::set(kind); + json_response(StatusCode::OK, ()) +} + /// Common functionality of all the HTTP API handlers. 
/// /// - Adds a tracing span to each request (by `request_span`) @@ -2165,5 +2174,6 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/keyspace", |r| testing_api_handler("read out the keyspace", r, timeline_collect_keyspace), ) + .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler)) .any(handler_404)) } diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 066f06c88f..059a6596d3 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -28,9 +28,10 @@ use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; use tokio::time::Instant; use utils::fs_ext; -mod io_engine; +pub use pageserver_api::models::virtual_file as api; +pub(crate) mod io_engine; mod open_options; -pub use io_engine::IoEngineKind; +pub(crate) use io_engine::IoEngineKind; pub(crate) use open_options::*; /// diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index f7b46fe653..892affa326 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -7,67 +7,100 @@ //! //! Then use [`get`] and [`super::OpenOptions`]. -#[derive( - Copy, - Clone, - PartialEq, - Eq, - Hash, - strum_macros::EnumString, - strum_macros::Display, - serde_with::DeserializeFromStr, - serde_with::SerializeDisplay, - Debug, -)] -#[strum(serialize_all = "kebab-case")] -pub enum IoEngineKind { +pub(crate) use super::api::IoEngineKind; +#[derive(Clone, Copy)] +#[repr(u8)] +pub(crate) enum IoEngine { + NotSet, StdFs, #[cfg(target_os = "linux")] TokioEpollUring, } -static IO_ENGINE: once_cell::sync::OnceCell = once_cell::sync::OnceCell::new(); - -#[cfg(not(test))] -pub(super) fn init(engine: IoEngineKind) { - if IO_ENGINE.set(engine).is_err() { - panic!("called twice"); +impl From for IoEngine { + fn from(value: IoEngineKind) -> Self { + match value { + IoEngineKind::StdFs => IoEngine::StdFs, + #[cfg(target_os = "linux")] + IoEngineKind::TokioEpollUring => IoEngine::TokioEpollUring, + } } - crate::metrics::virtual_file_io_engine::KIND - .with_label_values(&[&format!("{engine}")]) - .set(1); } -pub(super) fn get() -> &'static IoEngineKind { - #[cfg(test)] - { - let env_var_name = "NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE"; - IO_ENGINE.get_or_init(|| match std::env::var(env_var_name) { - Ok(v) => match v.parse::() { - Ok(engine_kind) => engine_kind, - Err(e) => { - panic!("invalid VirtualFile io engine for env var {env_var_name}: {e:#}: {v:?}") - } - }, - Err(std::env::VarError::NotPresent) => { - crate::config::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE - .parse() - .unwrap() - } - Err(std::env::VarError::NotUnicode(_)) => { - panic!("env var {env_var_name} is not unicode"); - } +impl TryFrom for IoEngine { + type Error = u8; + + fn try_from(value: u8) -> Result { + Ok(match value { + v if v == (IoEngine::NotSet as u8) => IoEngine::NotSet, + v if v == (IoEngine::StdFs as u8) => IoEngine::StdFs, + #[cfg(target_os = "linux")] + v if v == (IoEngine::TokioEpollUring as u8) => IoEngine::TokioEpollUring, + x => return Err(x), }) } - #[cfg(not(test))] - IO_ENGINE.get().unwrap() } -use std::os::unix::prelude::FileExt; +static IO_ENGINE: AtomicU8 = AtomicU8::new(IoEngine::NotSet as u8); + +pub(crate) fn set(engine_kind: IoEngineKind) { + let engine: IoEngine = engine_kind.into(); + IO_ENGINE.store(engine as u8, std::sync::atomic::Ordering::Relaxed); + #[cfg(not(test))] + { + let metric = &crate::metrics::virtual_file_io_engine::KIND; + metric.reset(); + metric + 
.with_label_values(&[&format!("{engine_kind}")]) + .set(1); + } +} + +#[cfg(not(test))] +pub(super) fn init(engine_kind: IoEngineKind) { + set(engine_kind); +} + +pub(super) fn get() -> IoEngine { + let cur = IoEngine::try_from(IO_ENGINE.load(Ordering::Relaxed)).unwrap(); + if cfg!(test) { + let env_var_name = "NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE"; + match cur { + IoEngine::NotSet => { + let kind = match std::env::var(env_var_name) { + Ok(v) => match v.parse::() { + Ok(engine_kind) => engine_kind, + Err(e) => { + panic!("invalid VirtualFile io engine for env var {env_var_name}: {e:#}: {v:?}") + } + }, + Err(std::env::VarError::NotPresent) => { + crate::config::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE + .parse() + .unwrap() + } + Err(std::env::VarError::NotUnicode(_)) => { + panic!("env var {env_var_name} is not unicode"); + } + }; + self::set(kind); + self::get() + } + x => x, + } + } else { + cur + } +} + +use std::{ + os::unix::prelude::FileExt, + sync::atomic::{AtomicU8, Ordering}, +}; use super::FileGuard; -impl IoEngineKind { +impl IoEngine { pub(super) async fn read_at( &self, file_guard: FileGuard, @@ -78,7 +111,8 @@ impl IoEngineKind { B: tokio_epoll_uring::BoundedBufMut + Send, { match self { - IoEngineKind::StdFs => { + IoEngine::NotSet => panic!("not initialized"), + IoEngine::StdFs => { // SAFETY: `dst` only lives at most as long as this match arm, during which buf remains valid memory. let dst = unsafe { std::slice::from_raw_parts_mut(buf.stable_mut_ptr(), buf.bytes_total()) @@ -96,7 +130,7 @@ impl IoEngineKind { ((file_guard, buf), res) } #[cfg(target_os = "linux")] - IoEngineKind::TokioEpollUring => { + IoEngine::TokioEpollUring => { let system = tokio_epoll_uring::thread_local_system().await; let (resources, res) = system.read(file_guard, offset, buf).await; ( diff --git a/pageserver/src/virtual_file/open_options.rs b/pageserver/src/virtual_file/open_options.rs index 1e5ffe15cc..f75edb0bac 100644 --- a/pageserver/src/virtual_file/open_options.rs +++ b/pageserver/src/virtual_file/open_options.rs @@ -1,6 +1,6 @@ //! 
Enum-dispatch to the `OpenOptions` type of the respective [`super::IoEngineKind`]; -use super::IoEngineKind; +use super::io_engine::IoEngine; use std::{os::fd::OwnedFd, path::Path}; #[derive(Debug, Clone)] @@ -13,9 +13,10 @@ pub enum OpenOptions { impl Default for OpenOptions { fn default() -> Self { match super::io_engine::get() { - IoEngineKind::StdFs => Self::StdFs(std::fs::OpenOptions::new()), + IoEngine::NotSet => panic!("io engine not set"), + IoEngine::StdFs => Self::StdFs(std::fs::OpenOptions::new()), #[cfg(target_os = "linux")] - IoEngineKind::TokioEpollUring => { + IoEngine::TokioEpollUring => { Self::TokioEpollUring(tokio_epoll_uring::ops::open_at::OpenOptions::new()) } } From 2e9b1f7aaf61d5886f312628d4fb54a1526317f2 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 6 Feb 2024 14:34:20 -0600 Subject: [PATCH 0114/1571] Update Postgres 14 to 14.11 --- vendor/postgres-v14 | 2 +- vendor/revisions.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index be7a65fe67..018fb05201 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit be7a65fe67dc81d85bbcbebb13e00d94715f4b88 +Subproject commit 018fb052011081dc2733d3118d12e5c36df6eba1 diff --git a/vendor/revisions.json b/vendor/revisions.json index 80699839ba..c2f9244116 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { "postgres-v16": "f7ea954989a2e7901f858779cff55259f203479a", "postgres-v15": "81e16cd537053f49e175d4a08ab7c8aec3d9b535", - "postgres-v14": "be7a65fe67dc81d85bbcbebb13e00d94715f4b88" + "postgres-v14": "018fb052011081dc2733d3118d12e5c36df6eba1" } From 5541244dc4736208e802dd60d6f9861392d9b743 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 6 Feb 2024 14:35:37 -0600 Subject: [PATCH 0115/1571] Update Postgres 15 to 15.6 --- vendor/postgres-v15 | 2 +- vendor/revisions.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 81e16cd537..6ee78a3c29 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 81e16cd537053f49e175d4a08ab7c8aec3d9b535 +Subproject commit 6ee78a3c29e33cafd85ba09568b6b5eb031d29b9 diff --git a/vendor/revisions.json b/vendor/revisions.json index c2f9244116..c7076231e5 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { "postgres-v16": "f7ea954989a2e7901f858779cff55259f203479a", - "postgres-v15": "81e16cd537053f49e175d4a08ab7c8aec3d9b535", + "postgres-v15": "6ee78a3c29e33cafd85ba09568b6b5eb031d29b9", "postgres-v14": "018fb052011081dc2733d3118d12e5c36df6eba1" } From 128fae70548f06ebc8ac44c38576c993ae6cba52 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 6 Feb 2024 14:37:21 -0600 Subject: [PATCH 0116/1571] Update Postgres 16 to 16.2 --- libs/walproposer/src/walproposer.rs | 7 +++++-- vendor/postgres-v16 | 2 +- vendor/revisions.json | 2 +- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs index 7251545792..8ab8fb1a07 100644 --- a/libs/walproposer/src/walproposer.rs +++ b/libs/walproposer/src/walproposer.rs @@ -453,9 +453,12 @@ mod tests { event_mask: 0, }), expected_messages: vec![ - // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160001, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 }) 
+ // TODO: When updating Postgres versions, this test will cause + // problems. Postgres version in message needs updating. + // + // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160002, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 }) vec![ - 103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110, 147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147, 188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1, diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index f7ea954989..550cdd26d4 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit f7ea954989a2e7901f858779cff55259f203479a +Subproject commit 550cdd26d445afdd26b15aa93c8c2f3dc52f8361 diff --git a/vendor/revisions.json b/vendor/revisions.json index c7076231e5..91ebb8cb34 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "postgres-v16": "f7ea954989a2e7901f858779cff55259f203479a", + "postgres-v16": "550cdd26d445afdd26b15aa93c8c2f3dc52f8361", "postgres-v15": "6ee78a3c29e33cafd85ba09568b6b5eb031d29b9", "postgres-v14": "018fb052011081dc2733d3118d12e5c36df6eba1" } From 3bd2a4fd56803b0aabb87e9076872ceff0147a77 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 7 Feb 2024 19:14:18 +0000 Subject: [PATCH 0117/1571] control_plane: avoid feedback loop with /location_config if compute hook fails. (#6668) ## Problem The existing behavior isn't exactly incorrect, but is operationally risky: if the control plane compute hook breaks, then all the control plane operations trying to call /location_config will end up retrying forever, which could put more load on the system. ## Summary of changes - Treat 404s as fatal errors to do fewer retries: a 404 either indicates we have the wrong URL, or some control plane bug is failing to recognize our tenant ID as existing. 
- Do not return an error on reconcilation errors in a non-creating /location_config response: this allows the control plane to finish its Operation (and we will eventually retry the compute notification later) --- control_plane/attachment_service/src/compute_hook.rs | 2 +- control_plane/attachment_service/src/service.rs | 10 +++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/control_plane/attachment_service/src/compute_hook.rs b/control_plane/attachment_service/src/compute_hook.rs index 0d3610aafa..5bd1b6bf09 100644 --- a/control_plane/attachment_service/src/compute_hook.rs +++ b/control_plane/attachment_service/src/compute_hook.rs @@ -240,7 +240,7 @@ impl ComputeHook { let client = reqwest::Client::new(); backoff::retry( || self.do_notify_iteration(&client, url, &reconfigure_request, cancel), - |e| matches!(e, NotifyError::Fatal(_)), + |e| matches!(e, NotifyError::Fatal(_) | NotifyError::Unexpected(_)), 3, 10, "Send compute notification", diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index febee1aa0d..1db1906df8 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -989,7 +989,15 @@ impl Service { .collect(); } else { // This was an update, wait for reconciliation - self.await_waiters(waiters).await?; + if let Err(e) = self.await_waiters(waiters).await { + // Do not treat a reconcile error as fatal: we have already applied any requested + // Intent changes, and the reconcile can fail for external reasons like unavailable + // compute notification API. In these cases, it is important that we do not + // cause the cloud control plane to retry forever on this API. + tracing::warn!( + "Failed to reconcile after /location_config: {e}, returning success anyway" + ); + } } Ok(result) From c561ad4e2e900409141e8c6c9963bab90288fd12 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 7 Feb 2024 20:39:52 +0100 Subject: [PATCH 0118/1571] feat: expose locked memory in pageserver `/metrics` (#6669) context: https://github.com/neondatabase/neon/issues/6667 --- Cargo.lock | 3 ++ Cargo.toml | 1 + libs/metrics/Cargo.toml | 3 ++ libs/metrics/src/lib.rs | 2 + libs/metrics/src/more_process_metrics.rs | 54 ++++++++++++++++++++++++ pageserver/src/bin/pageserver.rs | 2 + 6 files changed, 65 insertions(+) create mode 100644 libs/metrics/src/more_process_metrics.rs diff --git a/Cargo.lock b/Cargo.lock index a25725f90d..bf1ecfa89d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2869,6 +2869,7 @@ dependencies = [ "chrono", "libc", "once_cell", + "procfs", "prometheus", "rand 0.8.5", "rand_distr", @@ -3986,6 +3987,8 @@ checksum = "b1de8dacb0873f77e6aefc6d71e044761fcc68060290f5b1089fcdf84626bb69" dependencies = [ "bitflags 1.3.2", "byteorder", + "chrono", + "flate2", "hex", "lazy_static", "rustix 0.36.16", diff --git a/Cargo.toml b/Cargo.toml index 271edee742..6a2c3fa563 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -113,6 +113,7 @@ parquet = { version = "49.0.0", default-features = false, features = ["zstd"] } parquet_derive = "49.0.0" pbkdf2 = { version = "0.12.1", features = ["simple", "std"] } pin-project-lite = "0.2" +procfs = "0.14" prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency prost = "0.11" rand = "0.8" diff --git a/libs/metrics/Cargo.toml b/libs/metrics/Cargo.toml index a547d492df..f6a49a0166 100644 --- a/libs/metrics/Cargo.toml +++ b/libs/metrics/Cargo.toml @@ -13,6 +13,9 @@ twox-hash.workspace 
= true workspace_hack.workspace = true +[target.'cfg(target_os = "linux")'.dependencies] +procfs.workspace = true + [dev-dependencies] rand = "0.8" rand_distr = "0.4.3" diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index cb9914e5de..b57fd9f33b 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -31,6 +31,8 @@ pub use wrappers::{CountedReader, CountedWriter}; mod hll; pub mod metric_vec_duration; pub use hll::{HyperLogLog, HyperLogLogVec}; +#[cfg(target_os = "linux")] +pub mod more_process_metrics; pub type UIntGauge = GenericGauge; pub type UIntGaugeVec = GenericGaugeVec; diff --git a/libs/metrics/src/more_process_metrics.rs b/libs/metrics/src/more_process_metrics.rs new file mode 100644 index 0000000000..920724fdec --- /dev/null +++ b/libs/metrics/src/more_process_metrics.rs @@ -0,0 +1,54 @@ +//! process metrics that the [`::prometheus`] crate doesn't provide. + +// This module has heavy inspiration from the prometheus crate's `process_collector.rs`. + +use crate::UIntGauge; + +pub struct Collector { + descs: Vec, + vmlck: crate::UIntGauge, +} + +const NMETRICS: usize = 1; + +impl prometheus::core::Collector for Collector { + fn desc(&self) -> Vec<&prometheus::core::Desc> { + self.descs.iter().collect() + } + + fn collect(&self) -> Vec { + let Ok(myself) = procfs::process::Process::myself() else { + return vec![]; + }; + let mut mfs = Vec::with_capacity(NMETRICS); + if let Ok(status) = myself.status() { + if let Some(vmlck) = status.vmlck { + self.vmlck.set(vmlck); + mfs.extend(self.vmlck.collect()) + } + } + mfs + } +} + +impl Collector { + pub fn new() -> Self { + let mut descs = Vec::new(); + + let vmlck = + UIntGauge::new("libmetrics_process_status_vmlck", "/proc/self/status vmlck").unwrap(); + descs.extend( + prometheus::core::Collector::desc(&vmlck) + .into_iter() + .cloned(), + ); + + Self { descs, vmlck } + } +} + +impl Default for Collector { + fn default() -> Self { + Self::new() + } +} diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index eaddcb4607..7a93830c14 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -272,6 +272,8 @@ fn start_pageserver( ); set_build_info_metric(GIT_VERSION, BUILD_TAG); set_launch_timestamp_metric(launch_ts); + #[cfg(target_os = "linux")] + metrics::register_internal(Box::new(metrics::more_process_metrics::Collector::new())).unwrap(); pageserver::preinitialize_metrics(); // If any failpoints were set from FAILPOINTS environment variable, From 9a017778a9f89d5adfb6869a883ee2532dcaf13a Mon Sep 17 00:00:00 2001 From: Andreas Scherbaum Date: Thu, 8 Feb 2024 00:48:31 +0100 Subject: [PATCH 0119/1571] Update copyright notice, set it to current year (#6671) ## Problem Copyright notice is outdated ## Summary of changes Replace the initial year `2022` with `2022 - 2024`, after brief discussion with Stas about the format Co-authored-by: Andreas Scherbaum --- NOTICE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NOTICE b/NOTICE index c13dc2f0b3..52fc751c41 100644 --- a/NOTICE +++ b/NOTICE @@ -1,5 +1,5 @@ Neon -Copyright 2022 Neon Inc. +Copyright 2022 - 2024 Neon Inc. The PostgreSQL submodules in vendor/ are licensed under the PostgreSQL license. See vendor/postgres-vX/COPYRIGHT for details. 
From c52495774d5151db63059515a524621660236f75 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 8 Feb 2024 00:58:54 +0100 Subject: [PATCH 0120/1571] tokio-epoll-uring: expose its metrics in pageserver's `/metrics` (#6672) context: https://github.com/neondatabase/neon/issues/6667 --- Cargo.lock | 4 +- pageserver/src/bin/pageserver.rs | 4 ++ pageserver/src/metrics.rs | 66 ++++++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bf1ecfa89d..30e233ecc1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5739,7 +5739,7 @@ dependencies = [ [[package]] name = "tokio-epoll-uring" version = "0.1.0" -source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#0e1af4ccddf2f01805cfc9eaefa97ee13c04b52d" +source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#d6a1c93442fb6b3a5bec490204961134e54925dc" dependencies = [ "futures", "nix 0.26.4", @@ -6264,7 +6264,7 @@ dependencies = [ [[package]] name = "uring-common" version = "0.1.0" -source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#0e1af4ccddf2f01805cfc9eaefa97ee13c04b52d" +source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#d6a1c93442fb6b3a5bec490204961134e54925dc" dependencies = [ "io-uring", "libc", diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 7a93830c14..2f172bd384 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -274,6 +274,10 @@ fn start_pageserver( set_launch_timestamp_metric(launch_ts); #[cfg(target_os = "linux")] metrics::register_internal(Box::new(metrics::more_process_metrics::Collector::new())).unwrap(); + metrics::register_internal(Box::new( + pageserver::metrics::tokio_epoll_uring::Collector::new(), + )) + .unwrap(); pageserver::preinitialize_metrics(); // If any failpoints were set from FAILPOINTS environment variable, diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 489ec58e62..98c98ef6e7 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -2400,6 +2400,72 @@ impl>, O, E> Future for MeasuredRemoteOp { } } +pub mod tokio_epoll_uring { + use metrics::UIntGauge; + + pub struct Collector { + descs: Vec, + systems_created: UIntGauge, + systems_destroyed: UIntGauge, + } + + const NMETRICS: usize = 2; + + impl metrics::core::Collector for Collector { + fn desc(&self) -> Vec<&metrics::core::Desc> { + self.descs.iter().collect() + } + + fn collect(&self) -> Vec { + let mut mfs = Vec::with_capacity(NMETRICS); + let tokio_epoll_uring::metrics::Metrics { + systems_created, + systems_destroyed, + } = tokio_epoll_uring::metrics::global(); + self.systems_created.set(systems_created); + mfs.extend(self.systems_created.collect()); + self.systems_destroyed.set(systems_destroyed); + mfs.extend(self.systems_destroyed.collect()); + mfs + } + } + + impl Collector { + #[allow(clippy::new_without_default)] + pub fn new() -> Self { + let mut descs = Vec::new(); + + let systems_created = UIntGauge::new( + "pageserver_tokio_epoll_uring_systems_created", + "counter of tokio-epoll-uring systems that were created", + ) + .unwrap(); + descs.extend( + metrics::core::Collector::desc(&systems_created) + .into_iter() + .cloned(), + ); + + let systems_destroyed = UIntGauge::new( + "pageserver_tokio_epoll_uring_systems_destroyed", + "counter of tokio-epoll-uring systems that were destroyed", + ) + .unwrap(); + descs.extend( + metrics::core::Collector::desc(&systems_destroyed) + 
.into_iter() + .cloned(), + ); + + Self { + descs, + systems_created, + systems_destroyed, + } + } + } +} + pub fn preinitialize_metrics() { // Python tests need these and on some we do alerting. // From c63e3e7e84c2dd9c9792619cc4fee15b07cfe7d7 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Thu, 8 Feb 2024 12:57:05 +0100 Subject: [PATCH 0121/1571] Proxy: improve http-pool (#6577) ## Problem The password check logic for the sql-over-http is a bit non-intuitive. ## Summary of changes 1. Perform scram auth using the same logic as for websocket cleartext password. 2. Split establish connection logic and connection pool. 3. Parallelize param parsing logic with authentication + wake compute. 4. Limit the total number of clients --- Cargo.lock | 1 + proxy/Cargo.toml | 1 + proxy/src/auth/backend.rs | 12 + proxy/src/auth/flow.rs | 2 +- proxy/src/bin/proxy.rs | 5 + proxy/src/console/provider/neon.rs | 2 + proxy/src/context.rs | 4 + proxy/src/metrics.rs | 44 +- proxy/src/proxy/connect_compute.rs | 22 +- proxy/src/proxy/tests.rs | 3 + proxy/src/serverless.rs | 41 +- proxy/src/serverless/backend.rs | 157 +++++ proxy/src/serverless/conn_pool.rs | 797 +++++++++++++------------- proxy/src/serverless/json.rs | 28 +- proxy/src/serverless/sql_over_http.rs | 92 ++- test_runner/regress/test_proxy.py | 20 +- 16 files changed, 753 insertions(+), 478 deletions(-) create mode 100644 proxy/src/serverless/backend.rs diff --git a/Cargo.lock b/Cargo.lock index 30e233ecc1..c0c319cd89 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4079,6 +4079,7 @@ dependencies = [ "clap", "consumption_metrics", "dashmap", + "env_logger", "futures", "git-version", "hashbrown 0.13.2", diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 1247f08ee6..83cab381b3 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -19,6 +19,7 @@ chrono.workspace = true clap.workspace = true consumption_metrics.workspace = true dashmap.workspace = true +env_logger.workspace = true futures.workspace = true git-version.workspace = true hashbrown.workspace = true diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 236567163e..fa2782bee3 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -68,6 +68,7 @@ pub trait TestBackend: Send + Sync + 'static { fn get_allowed_ips_and_secret( &self, ) -> Result<(CachedAllowedIps, Option), console::errors::GetAuthInfoError>; + fn get_role_secret(&self) -> Result; } impl std::fmt::Display for BackendType<'_, ()> { @@ -358,6 +359,17 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> { } impl BackendType<'_, ComputeUserInfo> { + pub async fn get_role_secret( + &self, + ctx: &mut RequestMonitoring, + ) -> Result { + use BackendType::*; + match self { + Console(api, user_info) => api.get_role_secret(ctx, user_info).await, + Link(_) => Ok(Cached::new_uncached(None)), + } + } + pub async fn get_allowed_ips_and_secret( &self, ctx: &mut RequestMonitoring, diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 077178d107..c2783e236c 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -167,7 +167,7 @@ impl AuthFlow<'_, S, Scram<'_>> { } } -pub(super) fn validate_password_and_exchange( +pub(crate) fn validate_password_and_exchange( password: &[u8], secret: AuthSecret, ) -> super::Result> { diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 3bbb87808d..6974f1a274 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -165,6 +165,10 @@ struct SqlOverHttpArgs { #[clap(long, 
default_value_t = 20)] sql_over_http_pool_max_conns_per_endpoint: usize, + /// How many connections to pool for each endpoint. Excess connections are discarded + #[clap(long, default_value_t = 20000)] + sql_over_http_pool_max_total_conns: usize, + /// How long pooled connections should remain idle for before closing #[clap(long, default_value = "5m", value_parser = humantime::parse_duration)] sql_over_http_idle_timeout: tokio::time::Duration, @@ -387,6 +391,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { pool_shards: args.sql_over_http.sql_over_http_pool_shards, idle_timeout: args.sql_over_http.sql_over_http_idle_timeout, opt_in: args.sql_over_http.sql_over_http_pool_opt_in, + max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns, }, }; let authentication_config = AuthenticationConfig { diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 0785419790..71b34cb676 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -188,6 +188,7 @@ impl super::Api for Api { ep, Arc::new(auth_info.allowed_ips), ); + ctx.set_project_id(project_id); } // When we just got a secret, we don't need to invalidate it. Ok(Cached::new_uncached(auth_info.secret)) @@ -221,6 +222,7 @@ impl super::Api for Api { self.caches .project_info .insert_allowed_ips(&project_id, ep, allowed_ips.clone()); + ctx.set_project_id(project_id); } Ok(( Cached::new_uncached(allowed_ips), diff --git a/proxy/src/context.rs b/proxy/src/context.rs index e2b0294cd3..fe204534b7 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -89,6 +89,10 @@ impl RequestMonitoring { self.project = Some(x.project_id); } + pub fn set_project_id(&mut self, project_id: ProjectId) { + self.project = Some(project_id); + } + pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) { crate::metrics::CONNECTING_ENDPOINTS .with_label_values(&[self.protocol]) diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index fa663d8ff6..e2d96a9c27 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -1,8 +1,10 @@ use ::metrics::{ exponential_buckets, register_histogram, register_histogram_vec, register_hll_vec, - register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge_vec, Histogram, - HistogramVec, HyperLogLogVec, IntCounterPairVec, IntCounterVec, IntGaugeVec, + register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, + register_int_gauge_vec, Histogram, HistogramVec, HyperLogLogVec, IntCounterPairVec, + IntCounterVec, IntGauge, IntGaugeVec, }; +use metrics::{register_int_counter_pair, IntCounterPair}; use once_cell::sync::Lazy; use tokio::time; @@ -112,6 +114,44 @@ pub static ALLOWED_IPS_NUMBER: Lazy = Lazy::new(|| { .unwrap() }); +pub static HTTP_CONTENT_LENGTH: Lazy = Lazy::new(|| { + register_histogram!( + "proxy_http_conn_content_length_bytes", + "Time it took for proxy to establish a connection to the compute endpoint", + // largest bucket = 3^16 * 0.05ms = 2.15s + exponential_buckets(8.0, 2.0, 20).unwrap() + ) + .unwrap() +}); + +pub static GC_LATENCY: Lazy = Lazy::new(|| { + register_histogram!( + "proxy_http_pool_reclaimation_lag_seconds", + "Time it takes to reclaim unused connection pools", + // 1us -> 65ms + exponential_buckets(1e-6, 2.0, 16).unwrap(), + ) + .unwrap() +}); + +pub static ENDPOINT_POOLS: Lazy = Lazy::new(|| { + register_int_counter_pair!( + "proxy_http_pool_endpoints_registered_total", + "Number of endpoints we have registered pools for", + 
"proxy_http_pool_endpoints_unregistered_total", + "Number of endpoints we have unregistered pools for", + ) + .unwrap() +}); + +pub static NUM_OPEN_CLIENTS_IN_HTTP_POOL: Lazy = Lazy::new(|| { + register_int_gauge!( + "proxy_http_pool_opened_connections", + "Number of opened connections to a database.", + ) + .unwrap() +}); + #[derive(Clone)] pub struct LatencyTimer { // time since the stopwatch was started diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 58c59dba36..b9346aa743 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -34,21 +34,6 @@ pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> compute::ConnCfg node_info.invalidate().config } -/// Try to connect to the compute node once. -#[tracing::instrument(name = "connect_once", fields(pid = tracing::field::Empty), skip_all)] -async fn connect_to_compute_once( - ctx: &mut RequestMonitoring, - node_info: &console::CachedNodeInfo, - timeout: time::Duration, -) -> Result { - let allow_self_signed_compute = node_info.allow_self_signed_compute; - - node_info - .config - .connect(ctx, allow_self_signed_compute, timeout) - .await -} - #[async_trait] pub trait ConnectMechanism { type Connection; @@ -75,13 +60,18 @@ impl ConnectMechanism for TcpMechanism<'_> { type ConnectError = compute::ConnectionError; type Error = compute::ConnectionError; + #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] async fn connect_once( &self, ctx: &mut RequestMonitoring, node_info: &console::CachedNodeInfo, timeout: time::Duration, ) -> Result { - connect_to_compute_once(ctx, node_info, timeout).await + let allow_self_signed_compute = node_info.allow_self_signed_compute; + node_info + .config + .connect(ctx, allow_self_signed_compute, timeout) + .await } fn update_connect_config(&self, config: &mut compute::ConnCfg) { diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 2000774224..656cabac75 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -478,6 +478,9 @@ impl TestBackend for TestConnectMechanism { { unimplemented!("not used in tests") } + fn get_role_secret(&self) -> Result { + unimplemented!("not used in tests") + } } fn helper_create_cached_node_info() -> CachedNodeInfo { diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index 7ff93b23b8..58aa925a6a 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -2,6 +2,7 @@ //! //! Handles both SQL over HTTP and SQL over Websockets. 
+mod backend; mod conn_pool; mod json; mod sql_over_http; @@ -18,11 +19,11 @@ pub use reqwest_middleware::{ClientWithMiddleware, Error}; pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; use tokio_util::task::TaskTracker; -use crate::config::TlsConfig; use crate::context::RequestMonitoring; use crate::metrics::NUM_CLIENT_CONNECTION_GAUGE; use crate::protocol2::{ProxyProtocolAccept, WithClientIp}; use crate::rate_limiter::EndpointRateLimiter; +use crate::serverless::backend::PoolingBackend; use crate::{cancellation::CancelMap, config::ProxyConfig}; use futures::StreamExt; use hyper::{ @@ -54,12 +55,13 @@ pub async fn task_main( info!("websocket server has shut down"); } - let conn_pool = conn_pool::GlobalConnPool::new(config); - - let conn_pool2 = Arc::clone(&conn_pool); - tokio::spawn(async move { - conn_pool2.gc_worker(StdRng::from_entropy()).await; - }); + let conn_pool = conn_pool::GlobalConnPool::new(&config.http_config); + { + let conn_pool = Arc::clone(&conn_pool); + tokio::spawn(async move { + conn_pool.gc_worker(StdRng::from_entropy()).await; + }); + } // shutdown the connection pool tokio::spawn({ @@ -73,6 +75,11 @@ pub async fn task_main( } }); + let backend = Arc::new(PoolingBackend { + pool: Arc::clone(&conn_pool), + config, + }); + let tls_config = match config.tls_config.as_ref() { Some(config) => config, None => { @@ -106,7 +113,7 @@ pub async fn task_main( let client_addr = io.client_addr(); let remote_addr = io.inner.remote_addr(); let sni_name = tls.server_name().map(|s| s.to_string()); - let conn_pool = conn_pool.clone(); + let backend = backend.clone(); let ws_connections = ws_connections.clone(); let endpoint_rate_limiter = endpoint_rate_limiter.clone(); @@ -119,7 +126,7 @@ pub async fn task_main( Ok(MetricService::new(hyper::service::service_fn( move |req: Request| { let sni_name = sni_name.clone(); - let conn_pool = conn_pool.clone(); + let backend = backend.clone(); let ws_connections = ws_connections.clone(); let endpoint_rate_limiter = endpoint_rate_limiter.clone(); @@ -130,8 +137,7 @@ pub async fn task_main( request_handler( req, config, - tls_config, - conn_pool, + backend, ws_connections, cancel_map, session_id, @@ -200,8 +206,7 @@ where async fn request_handler( mut request: Request, config: &'static ProxyConfig, - tls: &'static TlsConfig, - conn_pool: Arc, + backend: Arc, ws_connections: TaskTracker, cancel_map: Arc, session_id: uuid::Uuid, @@ -248,15 +253,7 @@ async fn request_handler( } else if request.uri().path() == "/sql" && request.method() == Method::POST { let mut ctx = RequestMonitoring::new(session_id, peer_addr, "http", &config.region); - sql_over_http::handle( - tls, - &config.http_config, - &mut ctx, - request, - sni_hostname, - conn_pool, - ) - .await + sql_over_http::handle(config, &mut ctx, request, sni_hostname, backend).await } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS { Response::builder() .header("Allow", "OPTIONS, POST") diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs new file mode 100644 index 0000000000..466a74f0ea --- /dev/null +++ b/proxy/src/serverless/backend.rs @@ -0,0 +1,157 @@ +use std::{sync::Arc, time::Duration}; + +use anyhow::Context; +use async_trait::async_trait; +use tracing::info; + +use crate::{ + auth::{backend::ComputeCredentialKeys, check_peer_addr_is_in_list, AuthError}, + compute, + config::ProxyConfig, + console::CachedNodeInfo, + context::RequestMonitoring, + proxy::connect_compute::ConnectMechanism, +}; + +use 
super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool, APP_NAME}; + +pub struct PoolingBackend { + pub pool: Arc>, + pub config: &'static ProxyConfig, +} + +impl PoolingBackend { + pub async fn authenticate( + &self, + ctx: &mut RequestMonitoring, + conn_info: &ConnInfo, + ) -> Result { + let user_info = conn_info.user_info.clone(); + let backend = self.config.auth_backend.as_ref().map(|_| user_info.clone()); + let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?; + if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) { + return Err(AuthError::ip_address_not_allowed()); + } + let cached_secret = match maybe_secret { + Some(secret) => secret, + None => backend.get_role_secret(ctx).await?, + }; + + let secret = match cached_secret.value.clone() { + Some(secret) => secret, + None => { + // If we don't have an authentication secret, for the http flow we can just return an error. + info!("authentication info not found"); + return Err(AuthError::auth_failed(&*user_info.user)); + } + }; + let auth_outcome = + crate::auth::validate_password_and_exchange(conn_info.password.as_bytes(), secret)?; + match auth_outcome { + crate::sasl::Outcome::Success(key) => Ok(key), + crate::sasl::Outcome::Failure(reason) => { + info!("auth backend failed with an error: {reason}"); + Err(AuthError::auth_failed(&*conn_info.user_info.user)) + } + } + } + + // Wake up the destination if needed. Code here is a bit involved because + // we reuse the code from the usual proxy and we need to prepare few structures + // that this code expects. + #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] + pub async fn connect_to_compute( + &self, + ctx: &mut RequestMonitoring, + conn_info: ConnInfo, + keys: ComputeCredentialKeys, + force_new: bool, + ) -> anyhow::Result> { + let maybe_client = if !force_new { + info!("pool: looking for an existing connection"); + self.pool.get(ctx, &conn_info).await? + } else { + info!("pool: pool is disabled"); + None + }; + + if let Some(client) = maybe_client { + return Ok(client); + } + let conn_id = uuid::Uuid::new_v4(); + info!(%conn_id, "pool: opening a new connection '{conn_info}'"); + ctx.set_application(Some(APP_NAME)); + let backend = self + .config + .auth_backend + .as_ref() + .map(|_| conn_info.user_info.clone()); + + let mut node_info = backend + .wake_compute(ctx) + .await? 
+ .context("missing cache entry from wake_compute")?; + + match keys { + #[cfg(any(test, feature = "testing"))] + ComputeCredentialKeys::Password(password) => node_info.config.password(password), + ComputeCredentialKeys::AuthKeys(auth_keys) => node_info.config.auth_keys(auth_keys), + }; + + ctx.set_project(node_info.aux.clone()); + + crate::proxy::connect_compute::connect_to_compute( + ctx, + &TokioMechanism { + conn_id, + conn_info, + pool: self.pool.clone(), + }, + node_info, + &backend, + ) + .await + } +} + +struct TokioMechanism { + pool: Arc>, + conn_info: ConnInfo, + conn_id: uuid::Uuid, +} + +#[async_trait] +impl ConnectMechanism for TokioMechanism { + type Connection = Client; + type ConnectError = tokio_postgres::Error; + type Error = anyhow::Error; + + async fn connect_once( + &self, + ctx: &mut RequestMonitoring, + node_info: &CachedNodeInfo, + timeout: Duration, + ) -> Result { + let mut config = (*node_info.config).clone(); + let config = config + .user(&self.conn_info.user_info.user) + .password(&*self.conn_info.password) + .dbname(&self.conn_info.dbname) + .connect_timeout(timeout); + + let (client, connection) = config.connect(tokio_postgres::NoTls).await?; + + tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id())); + Ok(poll_client( + self.pool.clone(), + ctx, + self.conn_info.clone(), + client, + connection, + self.conn_id, + node_info.aux.clone(), + )) + } + + fn update_connect_config(&self, _config: &mut compute::ConnCfg) {} +} diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 312fa2b36f..a7b2c532d2 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -1,15 +1,7 @@ -use anyhow::Context; -use async_trait::async_trait; use dashmap::DashMap; use futures::{future::poll_fn, Future}; -use metrics::{register_int_counter_pair, IntCounterPair, IntCounterPairGuard}; -use once_cell::sync::Lazy; +use metrics::IntCounterPairGuard; use parking_lot::RwLock; -use pbkdf2::{ - password_hash::{PasswordHashString, PasswordHasher, PasswordVerifier, SaltString}, - Params, Pbkdf2, -}; -use prometheus::{exponential_buckets, register_histogram, Histogram}; use rand::Rng; use smol_str::SmolStr; use std::{collections::HashMap, pin::pin, sync::Arc, sync::Weak, time::Duration}; @@ -21,19 +13,17 @@ use std::{ ops::Deref, sync::atomic::{self, AtomicUsize}, }; -use tokio::time::{self, Instant}; -use tokio_postgres::{AsyncMessage, ReadyForQueryStatus}; +use tokio::time::Instant; +use tokio_postgres::tls::NoTlsStream; +use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket}; +use crate::console::messages::MetricsAuxInfo; +use crate::metrics::{ENDPOINT_POOLS, GC_LATENCY, NUM_OPEN_CLIENTS_IN_HTTP_POOL}; +use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; use crate::{ - auth::{self, backend::ComputeUserInfo, check_peer_addr_is_in_list}, - console::{self, messages::MetricsAuxInfo}, - context::RequestMonitoring, - metrics::NUM_DB_CONNECTIONS_GAUGE, - proxy::connect_compute::ConnectMechanism, - usage_metrics::{Ids, MetricCounter, USAGE_METRICS}, + auth::backend::ComputeUserInfo, context::RequestMonitoring, metrics::NUM_DB_CONNECTIONS_GAUGE, DbName, EndpointCacheKey, RoleName, }; -use crate::{compute, config}; use tracing::{debug, error, warn, Span}; use tracing::{info, info_span, Instrument}; @@ -72,39 +62,51 @@ impl fmt::Display for ConnInfo { } } -struct ConnPoolEntry { - conn: ClientInner, +struct ConnPoolEntry { + conn: ClientInner, _last_access: std::time::Instant, } // 
Per-endpoint connection pool, (dbname, username) -> DbUserConnPool // Number of open connections is limited by the `max_conns_per_endpoint`. -pub struct EndpointConnPool { - pools: HashMap<(DbName, RoleName), DbUserConnPool>, +pub struct EndpointConnPool { + pools: HashMap<(DbName, RoleName), DbUserConnPool>, total_conns: usize, max_conns: usize, _guard: IntCounterPairGuard, + global_connections_count: Arc, + global_pool_size_max_conns: usize, } -impl EndpointConnPool { - fn get_conn_entry(&mut self, db_user: (DbName, RoleName)) -> Option { +impl EndpointConnPool { + fn get_conn_entry(&mut self, db_user: (DbName, RoleName)) -> Option> { let Self { - pools, total_conns, .. + pools, + total_conns, + global_connections_count, + .. } = self; - pools - .get_mut(&db_user) - .and_then(|pool_entries| pool_entries.get_conn_entry(total_conns)) + pools.get_mut(&db_user).and_then(|pool_entries| { + pool_entries.get_conn_entry(total_conns, global_connections_count.clone()) + }) } fn remove_client(&mut self, db_user: (DbName, RoleName), conn_id: uuid::Uuid) -> bool { let Self { - pools, total_conns, .. + pools, + total_conns, + global_connections_count, + .. } = self; if let Some(pool) = pools.get_mut(&db_user) { let old_len = pool.conns.len(); pool.conns.retain(|conn| conn.conn.conn_id != conn_id); let new_len = pool.conns.len(); let removed = old_len - new_len; + if removed > 0 { + global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); + NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(removed as i64); + } *total_conns -= removed; removed > 0 } else { @@ -112,13 +114,27 @@ impl EndpointConnPool { } } - fn put(pool: &RwLock, conn_info: &ConnInfo, client: ClientInner) -> anyhow::Result<()> { + fn put( + pool: &RwLock, + conn_info: &ConnInfo, + client: ClientInner, + ) -> anyhow::Result<()> { let conn_id = client.conn_id; - if client.inner.is_closed() { + if client.is_closed() { info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed"); return Ok(()); } + let global_max_conn = pool.read().global_pool_size_max_conns; + if pool + .read() + .global_connections_count + .load(atomic::Ordering::Relaxed) + >= global_max_conn + { + info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full"); + return Ok(()); + } // return connection to the pool let mut returned = false; @@ -127,18 +143,19 @@ impl EndpointConnPool { let mut pool = pool.write(); if pool.total_conns < pool.max_conns { - // we create this db-user entry in get, so it should not be None - if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) { - pool_entries.conns.push(ConnPoolEntry { - conn: client, - _last_access: std::time::Instant::now(), - }); + let pool_entries = pool.pools.entry(conn_info.db_and_user()).or_default(); + pool_entries.conns.push(ConnPoolEntry { + conn: client, + _last_access: std::time::Instant::now(), + }); - returned = true; - per_db_size = pool_entries.conns.len(); + returned = true; + per_db_size = pool_entries.conns.len(); - pool.total_conns += 1; - } + pool.total_conns += 1; + pool.global_connections_count + .fetch_add(1, atomic::Ordering::Relaxed); + NUM_OPEN_CLIENTS_IN_HTTP_POOL.inc(); } pool.total_conns @@ -155,49 +172,61 @@ impl EndpointConnPool { } } -/// 4096 is the number of rounds that SCRAM-SHA-256 recommends. -/// It's not the 600,000 that OWASP recommends... but our passwords are high entropy anyway. -/// -/// Still takes 1.4ms to hash on my hardware. 
-/// We don't want to ruin the latency improvements of using the pool by making password verification take too long -const PARAMS: Params = Params { - rounds: 4096, - output_length: 32, -}; - -#[derive(Default)] -pub struct DbUserConnPool { - conns: Vec, - password_hash: Option, +impl Drop for EndpointConnPool { + fn drop(&mut self) { + if self.total_conns > 0 { + self.global_connections_count + .fetch_sub(self.total_conns, atomic::Ordering::Relaxed); + NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(self.total_conns as i64); + } + } } -impl DbUserConnPool { - fn clear_closed_clients(&mut self, conns: &mut usize) { +pub struct DbUserConnPool { + conns: Vec>, +} + +impl Default for DbUserConnPool { + fn default() -> Self { + Self { conns: Vec::new() } + } +} + +impl DbUserConnPool { + fn clear_closed_clients(&mut self, conns: &mut usize) -> usize { let old_len = self.conns.len(); - self.conns.retain(|conn| !conn.conn.inner.is_closed()); + self.conns.retain(|conn| !conn.conn.is_closed()); let new_len = self.conns.len(); let removed = old_len - new_len; *conns -= removed; + removed } - fn get_conn_entry(&mut self, conns: &mut usize) -> Option { - self.clear_closed_clients(conns); + fn get_conn_entry( + &mut self, + conns: &mut usize, + global_connections_count: Arc, + ) -> Option> { + let mut removed = self.clear_closed_clients(conns); let conn = self.conns.pop(); if conn.is_some() { *conns -= 1; + removed += 1; } + global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); + NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(removed as i64); conn } } -pub struct GlobalConnPool { +pub struct GlobalConnPool { // endpoint -> per-endpoint connection pool // // That should be a fairly conteded map, so return reference to the per-endpoint // pool as early as possible and release the lock. - global_pool: DashMap>>, + global_pool: DashMap>>>, /// Number of endpoint-connection pools /// @@ -206,7 +235,10 @@ pub struct GlobalConnPool { /// It's only used for diagnostics. global_pool_size: AtomicUsize, - proxy_config: &'static crate::config::ProxyConfig, + /// Total number of connections in the pool + global_connections_count: Arc, + + config: &'static crate::config::HttpConfig, } #[derive(Debug, Clone, Copy)] @@ -224,45 +256,39 @@ pub struct GlobalConnPoolOptions { pub idle_timeout: Duration, pub opt_in: bool, + + // Total number of connections in the pool. 
+ pub max_total_conns: usize, } -pub static GC_LATENCY: Lazy = Lazy::new(|| { - register_histogram!( - "proxy_http_pool_reclaimation_lag_seconds", - "Time it takes to reclaim unused connection pools", - // 1us -> 65ms - exponential_buckets(1e-6, 2.0, 16).unwrap(), - ) - .unwrap() -}); - -pub static ENDPOINT_POOLS: Lazy = Lazy::new(|| { - register_int_counter_pair!( - "proxy_http_pool_endpoints_registered_total", - "Number of endpoints we have registered pools for", - "proxy_http_pool_endpoints_unregistered_total", - "Number of endpoints we have unregistered pools for", - ) - .unwrap() -}); - -impl GlobalConnPool { - pub fn new(config: &'static crate::config::ProxyConfig) -> Arc { - let shards = config.http_config.pool_options.pool_shards; +impl GlobalConnPool { + pub fn new(config: &'static crate::config::HttpConfig) -> Arc { + let shards = config.pool_options.pool_shards; Arc::new(Self { global_pool: DashMap::with_shard_amount(shards), global_pool_size: AtomicUsize::new(0), - proxy_config: config, + config, + global_connections_count: Arc::new(AtomicUsize::new(0)), }) } + #[cfg(test)] + pub fn get_global_connections_count(&self) -> usize { + self.global_connections_count + .load(atomic::Ordering::Relaxed) + } + + pub fn get_idle_timeout(&self) -> Duration { + self.config.pool_options.idle_timeout + } + pub fn shutdown(&self) { // drops all strong references to endpoint-pools self.global_pool.clear(); } pub async fn gc_worker(&self, mut rng: impl Rng) { - let epoch = self.proxy_config.http_config.pool_options.gc_epoch; + let epoch = self.config.pool_options.gc_epoch; let mut interval = tokio::time::interval(epoch / (self.global_pool.shards().len()) as u32); loop { interval.tick().await; @@ -280,6 +306,7 @@ impl GlobalConnPool { let timer = GC_LATENCY.start_timer(); let current_len = shard.len(); + let mut clients_removed = 0; shard.retain(|endpoint, x| { // if the current endpoint pool is unique (no other strong or weak references) // then it is currently not in use by any connections. @@ -289,9 +316,9 @@ impl GlobalConnPool { } = pool.get_mut(); // ensure that closed clients are removed - pools - .iter_mut() - .for_each(|(_, db_pool)| db_pool.clear_closed_clients(total_conns)); + pools.iter_mut().for_each(|(_, db_pool)| { + clients_removed += db_pool.clear_closed_clients(total_conns); + }); // we only remove this pool if it has no active connections if *total_conns == 0 { @@ -302,10 +329,20 @@ impl GlobalConnPool { true }); + let new_len = shard.len(); drop(shard); timer.observe_duration(); + // Do logging outside of the lock. + if clients_removed > 0 { + let size = self + .global_connections_count + .fetch_sub(clients_removed, atomic::Ordering::Relaxed) + - clients_removed; + NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(clients_removed as i64); + info!("pool: performed global pool gc. 
removed {clients_removed} clients, total number of clients in pool is {size}"); + } let removed = current_len - new_len; if removed > 0 { @@ -320,61 +357,24 @@ impl GlobalConnPool { pub async fn get( self: &Arc, ctx: &mut RequestMonitoring, - conn_info: ConnInfo, - force_new: bool, - ) -> anyhow::Result { - let mut client: Option = None; + conn_info: &ConnInfo, + ) -> anyhow::Result>> { + let mut client: Option> = None; - let mut hash_valid = false; - let mut endpoint_pool = Weak::new(); - if !force_new { - let pool = self.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key()); - endpoint_pool = Arc::downgrade(&pool); - let mut hash = None; - - // find a pool entry by (dbname, username) if exists - { - let pool = pool.read(); - if let Some(pool_entries) = pool.pools.get(&conn_info.db_and_user()) { - if !pool_entries.conns.is_empty() { - hash = pool_entries.password_hash.clone(); - } - } - } - - // a connection exists in the pool, verify the password hash - if let Some(hash) = hash { - let pw = conn_info.password.clone(); - let validate = tokio::task::spawn_blocking(move || { - Pbkdf2.verify_password(pw.as_bytes(), &hash.password_hash()) - }) - .await?; - - // if the hash is invalid, don't error - // we will continue with the regular connection flow - if validate.is_ok() { - hash_valid = true; - if let Some(entry) = pool.write().get_conn_entry(conn_info.db_and_user()) { - client = Some(entry.conn) - } - } - } + let endpoint_pool = self.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key()); + if let Some(entry) = endpoint_pool + .write() + .get_conn_entry(conn_info.db_and_user()) + { + client = Some(entry.conn) } + let endpoint_pool = Arc::downgrade(&endpoint_pool); // ok return cached connection if found and establish a new one otherwise - let new_client = if let Some(client) = client { - ctx.set_project(client.aux.clone()); - if client.inner.is_closed() { - let conn_id = uuid::Uuid::new_v4(); - info!(%conn_id, "pool: cached connection '{conn_info}' is closed, opening a new one"); - connect_to_compute( - self.proxy_config, - ctx, - &conn_info, - conn_id, - endpoint_pool.clone(), - ) - .await + if let Some(client) = client { + if client.is_closed() { + info!("pool: cached connection '{conn_info}' is closed, opening a new one"); + return Ok(None); } else { info!("pool: reusing connection '{conn_info}'"); client.session.send(ctx.session_id)?; @@ -384,67 +384,16 @@ impl GlobalConnPool { ); ctx.latency_timer.pool_hit(); ctx.latency_timer.success(); - return Ok(Client::new(client, conn_info, endpoint_pool).await); + return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool))); } - } else { - let conn_id = uuid::Uuid::new_v4(); - info!(%conn_id, "pool: opening a new connection '{conn_info}'"); - connect_to_compute( - self.proxy_config, - ctx, - &conn_info, - conn_id, - endpoint_pool.clone(), - ) - .await - }; - if let Ok(client) = &new_client { - tracing::Span::current().record( - "pid", - &tracing::field::display(client.inner.get_process_id()), - ); } - - match &new_client { - // clear the hash. 
it's no longer valid - // TODO: update tokio-postgres fork to allow access to this error kind directly - Err(err) - if hash_valid && err.to_string().contains("password authentication failed") => - { - let pool = self.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key()); - let mut pool = pool.write(); - if let Some(entry) = pool.pools.get_mut(&conn_info.db_and_user()) { - entry.password_hash = None; - } - } - // new password is valid and we should insert/update it - Ok(_) if !force_new && !hash_valid => { - let pw = conn_info.password.clone(); - let new_hash = tokio::task::spawn_blocking(move || { - let salt = SaltString::generate(rand::rngs::OsRng); - Pbkdf2 - .hash_password_customized(pw.as_bytes(), None, None, PARAMS, &salt) - .map(|s| s.serialize()) - }) - .await??; - - let pool = self.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key()); - let mut pool = pool.write(); - pool.pools - .entry(conn_info.db_and_user()) - .or_default() - .password_hash = Some(new_hash); - } - _ => {} - } - let new_client = new_client?; - Ok(Client::new(new_client, conn_info, endpoint_pool).await) + Ok(None) } fn get_or_create_endpoint_pool( - &self, + self: &Arc, endpoint: &EndpointCacheKey, - ) -> Arc> { + ) -> Arc>> { // fast path if let Some(pool) = self.global_pool.get(endpoint) { return pool.clone(); @@ -454,12 +403,10 @@ impl GlobalConnPool { let new_pool = Arc::new(RwLock::new(EndpointConnPool { pools: HashMap::new(), total_conns: 0, - max_conns: self - .proxy_config - .http_config - .pool_options - .max_conns_per_endpoint, + max_conns: self.config.pool_options.max_conns_per_endpoint, _guard: ENDPOINT_POOLS.guard(), + global_connections_count: self.global_connections_count.clone(), + global_pool_size_max_conns: self.config.pool_options.max_total_conns, })); // find or create a pool for this endpoint @@ -488,196 +435,128 @@ impl GlobalConnPool { } } -struct TokioMechanism<'a> { - pool: Weak>, - conn_info: &'a ConnInfo, - conn_id: uuid::Uuid, - idle: Duration, -} - -#[async_trait] -impl ConnectMechanism for TokioMechanism<'_> { - type Connection = ClientInner; - type ConnectError = tokio_postgres::Error; - type Error = anyhow::Error; - - async fn connect_once( - &self, - ctx: &mut RequestMonitoring, - node_info: &console::CachedNodeInfo, - timeout: time::Duration, - ) -> Result { - connect_to_compute_once( - ctx, - node_info, - self.conn_info, - timeout, - self.conn_id, - self.pool.clone(), - self.idle, - ) - .await - } - - fn update_connect_config(&self, _config: &mut compute::ConnCfg) {} -} - -// Wake up the destination if needed. Code here is a bit involved because -// we reuse the code from the usual proxy and we need to prepare few structures -// that this code expects. -#[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] -async fn connect_to_compute( - config: &config::ProxyConfig, +pub fn poll_client( + global_pool: Arc>, ctx: &mut RequestMonitoring, - conn_info: &ConnInfo, + conn_info: ConnInfo, + client: C, + mut connection: tokio_postgres::Connection, conn_id: uuid::Uuid, - pool: Weak>, -) -> anyhow::Result { - ctx.set_application(Some(APP_NAME)); - let backend = config - .auth_backend - .as_ref() - .map(|_| conn_info.user_info.clone()); - - if !config.disable_ip_check_for_http { - let (allowed_ips, _) = backend.get_allowed_ips_and_secret(ctx).await?; - if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) { - return Err(auth::AuthError::ip_address_not_allowed().into()); - } - } - let node_info = backend - .wake_compute(ctx) - .await? 
- .context("missing cache entry from wake_compute")?; - - ctx.set_project(node_info.aux.clone()); - - crate::proxy::connect_compute::connect_to_compute( - ctx, - &TokioMechanism { - conn_id, - conn_info, - pool, - idle: config.http_config.pool_options.idle_timeout, - }, - node_info, - &backend, - ) - .await -} - -async fn connect_to_compute_once( - ctx: &mut RequestMonitoring, - node_info: &console::CachedNodeInfo, - conn_info: &ConnInfo, - timeout: time::Duration, - conn_id: uuid::Uuid, - pool: Weak>, - idle: Duration, -) -> Result { - let mut config = (*node_info.config).clone(); - let mut session = ctx.session_id; - - let (client, mut connection) = config - .user(&conn_info.user_info.user) - .password(&*conn_info.password) - .dbname(&conn_info.dbname) - .connect_timeout(timeout) - .connect(tokio_postgres::NoTls) - .await?; - + aux: MetricsAuxInfo, +) -> Client { let conn_gauge = NUM_DB_CONNECTIONS_GAUGE .with_label_values(&[ctx.protocol]) .guard(); - - tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id())); - - let (tx, mut rx) = tokio::sync::watch::channel(session); + let mut session_id = ctx.session_id; + let (tx, mut rx) = tokio::sync::watch::channel(session_id); let span = info_span!(parent: None, "connection", %conn_id); span.in_scope(|| { - info!(%conn_info, %session, "new connection"); + info!(%conn_info, %session_id, "new connection"); }); + let pool = + Arc::downgrade(&global_pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key())); + let pool_clone = pool.clone(); let db_user = conn_info.db_and_user(); + let idle = global_pool.get_idle_timeout(); tokio::spawn( - async move { - let _conn_gauge = conn_gauge; - let mut idle_timeout = pin!(tokio::time::sleep(idle)); - poll_fn(move |cx| { - if matches!(rx.has_changed(), Ok(true)) { - session = *rx.borrow_and_update(); - info!(%session, "changed session"); - idle_timeout.as_mut().reset(Instant::now() + idle); - } + async move { + let _conn_gauge = conn_gauge; + let mut idle_timeout = pin!(tokio::time::sleep(idle)); + poll_fn(move |cx| { + if matches!(rx.has_changed(), Ok(true)) { + session_id = *rx.borrow_and_update(); + info!(%session_id, "changed session"); + idle_timeout.as_mut().reset(Instant::now() + idle); + } - // 5 minute idle connection timeout - if idle_timeout.as_mut().poll(cx).is_ready() { - idle_timeout.as_mut().reset(Instant::now() + idle); - info!("connection idle"); - if let Some(pool) = pool.clone().upgrade() { - // remove client from pool - should close the connection if it's idle. 
- // does nothing if the client is currently checked-out and in-use - if pool.write().remove_client(db_user.clone(), conn_id) { - info!("idle connection removed"); - } - } - } - - loop { - let message = ready!(connection.poll_message(cx)); - - match message { - Some(Ok(AsyncMessage::Notice(notice))) => { - info!(%session, "notice: {}", notice); - } - Some(Ok(AsyncMessage::Notification(notif))) => { - warn!(%session, pid = notif.process_id(), channel = notif.channel(), "notification received"); - } - Some(Ok(_)) => { - warn!(%session, "unknown message"); - } - Some(Err(e)) => { - error!(%session, "connection error: {}", e); - break - } - None => { - info!("connection closed"); - break - } - } - } - - // remove from connection pool + // 5 minute idle connection timeout + if idle_timeout.as_mut().poll(cx).is_ready() { + idle_timeout.as_mut().reset(Instant::now() + idle); + info!("connection idle"); if let Some(pool) = pool.clone().upgrade() { + // remove client from pool - should close the connection if it's idle. + // does nothing if the client is currently checked-out and in-use if pool.write().remove_client(db_user.clone(), conn_id) { - info!("closed connection removed"); + info!("idle connection removed"); } } + } - Poll::Ready(()) - }).await; + loop { + let message = ready!(connection.poll_message(cx)); - } - .instrument(span) - ); + match message { + Some(Ok(AsyncMessage::Notice(notice))) => { + info!(%session_id, "notice: {}", notice); + } + Some(Ok(AsyncMessage::Notification(notif))) => { + warn!(%session_id, pid = notif.process_id(), channel = notif.channel(), "notification received"); + } + Some(Ok(_)) => { + warn!(%session_id, "unknown message"); + } + Some(Err(e)) => { + error!(%session_id, "connection error: {}", e); + break + } + None => { + info!("connection closed"); + break + } + } + } - Ok(ClientInner { + // remove from connection pool + if let Some(pool) = pool.clone().upgrade() { + if pool.write().remove_client(db_user.clone(), conn_id) { + info!("closed connection removed"); + } + } + + Poll::Ready(()) + }).await; + + } + .instrument(span)); + let inner = ClientInner { inner: client, session: tx, - aux: node_info.aux.clone(), + aux, conn_id, - }) + }; + Client::new(inner, conn_info, pool_clone) } -struct ClientInner { - inner: tokio_postgres::Client, +struct ClientInner { + inner: C, session: tokio::sync::watch::Sender, aux: MetricsAuxInfo, conn_id: uuid::Uuid, } -impl Client { +pub trait ClientInnerExt: Sync + Send + 'static { + fn is_closed(&self) -> bool; + fn get_process_id(&self) -> i32; +} + +impl ClientInnerExt for tokio_postgres::Client { + fn is_closed(&self) -> bool { + self.is_closed() + } + fn get_process_id(&self) -> i32 { + self.get_process_id() + } +} + +impl ClientInner { + pub fn is_closed(&self) -> bool { + self.inner.is_closed() + } +} + +impl Client { pub fn metrics(&self) -> Arc { let aux = &self.inner.as_ref().unwrap().aux; USAGE_METRICS.register(Ids { @@ -687,51 +566,46 @@ impl Client { } } -pub struct Client { - conn_id: uuid::Uuid, +pub struct Client { span: Span, - inner: Option, + inner: Option>, conn_info: ConnInfo, - pool: Weak>, + pool: Weak>>, } -pub struct Discard<'a> { +pub struct Discard<'a, C: ClientInnerExt> { conn_id: uuid::Uuid, conn_info: &'a ConnInfo, - pool: &'a mut Weak>, + pool: &'a mut Weak>>, } -impl Client { - pub(self) async fn new( - inner: ClientInner, +impl Client { + pub(self) fn new( + inner: ClientInner, conn_info: ConnInfo, - pool: Weak>, + pool: Weak>>, ) -> Self { Self { - conn_id: inner.conn_id, inner: Some(inner), 
span: Span::current(), conn_info, pool, } } - pub fn inner(&mut self) -> (&mut tokio_postgres::Client, Discard<'_>) { + pub fn inner(&mut self) -> (&mut C, Discard<'_, C>) { let Self { inner, pool, - conn_id, conn_info, span: _, } = self; + let inner = inner.as_mut().expect("client inner should not be removed"); ( - &mut inner - .as_mut() - .expect("client inner should not be removed") - .inner, + &mut inner.inner, Discard { pool, conn_info, - conn_id: *conn_id, + conn_id: inner.conn_id, }, ) } @@ -744,7 +618,7 @@ impl Client { } } -impl Discard<'_> { +impl Discard<'_, C> { pub fn check_idle(&mut self, status: ReadyForQueryStatus) { let conn_info = &self.conn_info; if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 { @@ -759,8 +633,8 @@ impl Discard<'_> { } } -impl Deref for Client { - type Target = tokio_postgres::Client; +impl Deref for Client { + type Target = C; fn deref(&self) -> &Self::Target { &self @@ -771,8 +645,8 @@ impl Deref for Client { } } -impl Drop for Client { - fn drop(&mut self) { +impl Client { + fn do_drop(&mut self) -> Option { let conn_info = self.conn_info.clone(); let client = self .inner @@ -781,10 +655,161 @@ impl Drop for Client { if let Some(conn_pool) = std::mem::take(&mut self.pool).upgrade() { let current_span = self.span.clone(); // return connection to the pool - tokio::task::spawn_blocking(move || { + return Some(move || { let _span = current_span.enter(); let _ = EndpointConnPool::put(&conn_pool, &conn_info, client); }); } + None + } +} + +impl Drop for Client { + fn drop(&mut self) { + if let Some(drop) = self.do_drop() { + tokio::task::spawn_blocking(drop); + } + } +} + +#[cfg(test)] +mod tests { + use env_logger; + use std::{mem, sync::atomic::AtomicBool}; + + use super::*; + + struct MockClient(Arc); + impl MockClient { + fn new(is_closed: bool) -> Self { + MockClient(Arc::new(is_closed.into())) + } + } + impl ClientInnerExt for MockClient { + fn is_closed(&self) -> bool { + self.0.load(atomic::Ordering::Relaxed) + } + fn get_process_id(&self) -> i32 { + 0 + } + } + + fn create_inner() -> ClientInner { + create_inner_with(MockClient::new(false)) + } + + fn create_inner_with(client: MockClient) -> ClientInner { + ClientInner { + inner: client, + session: tokio::sync::watch::Sender::new(uuid::Uuid::new_v4()), + aux: Default::default(), + conn_id: uuid::Uuid::new_v4(), + } + } + + #[tokio::test] + async fn test_pool() { + let _ = env_logger::try_init(); + let config = Box::leak(Box::new(crate::config::HttpConfig { + pool_options: GlobalConnPoolOptions { + max_conns_per_endpoint: 2, + gc_epoch: Duration::from_secs(1), + pool_shards: 2, + idle_timeout: Duration::from_secs(1), + opt_in: false, + max_total_conns: 3, + }, + request_timeout: Duration::from_secs(1), + })); + let pool = GlobalConnPool::new(config); + let conn_info = ConnInfo { + user_info: ComputeUserInfo { + user: "user".into(), + endpoint: "endpoint".into(), + options: Default::default(), + }, + dbname: "dbname".into(), + password: "password".into(), + }; + let ep_pool = + Arc::downgrade(&pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key())); + { + let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); + assert_eq!(0, pool.get_global_connections_count()); + client.discard(); + // Discard should not add the connection from the pool. 
+ assert_eq!(0, pool.get_global_connections_count()); + } + { + let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); + client.do_drop().unwrap()(); + mem::forget(client); // drop the client + assert_eq!(1, pool.get_global_connections_count()); + } + { + let mut closed_client = Client::new( + create_inner_with(MockClient::new(true)), + conn_info.clone(), + ep_pool.clone(), + ); + closed_client.do_drop().unwrap()(); + mem::forget(closed_client); // drop the client + // The closed client shouldn't be added to the pool. + assert_eq!(1, pool.get_global_connections_count()); + } + let is_closed: Arc = Arc::new(false.into()); + { + let mut client = Client::new( + create_inner_with(MockClient(is_closed.clone())), + conn_info.clone(), + ep_pool.clone(), + ); + client.do_drop().unwrap()(); + mem::forget(client); // drop the client + + // The client should be added to the pool. + assert_eq!(2, pool.get_global_connections_count()); + } + { + let mut client = Client::new(create_inner(), conn_info, ep_pool); + client.do_drop().unwrap()(); + mem::forget(client); // drop the client + + // The client shouldn't be added to the pool. Because the ep-pool is full. + assert_eq!(2, pool.get_global_connections_count()); + } + + let conn_info = ConnInfo { + user_info: ComputeUserInfo { + user: "user".into(), + endpoint: "endpoint-2".into(), + options: Default::default(), + }, + dbname: "dbname".into(), + password: "password".into(), + }; + let ep_pool = + Arc::downgrade(&pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key())); + { + let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); + client.do_drop().unwrap()(); + mem::forget(client); // drop the client + assert_eq!(3, pool.get_global_connections_count()); + } + { + let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); + client.do_drop().unwrap()(); + mem::forget(client); // drop the client + + // The client shouldn't be added to the pool. Because the global pool is full. + assert_eq!(3, pool.get_global_connections_count()); + } + + is_closed.store(true, atomic::Ordering::Relaxed); + // Do gc for all shards. + pool.gc(0); + pool.gc(1); + // Closed client should be removed from the pool. + assert_eq!(2, pool.get_global_connections_count()); } } diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs index 05835b23ce..a089d34040 100644 --- a/proxy/src/serverless/json.rs +++ b/proxy/src/serverless/json.rs @@ -9,23 +9,23 @@ use tokio_postgres::Row; // as parameters. 
// pub fn json_to_pg_text(json: Vec) -> Vec> { - json.iter() - .map(|value| { - match value { - // special care for nulls - Value::Null => None, + json.iter().map(json_value_to_pg_text).collect() +} - // convert to text with escaping - v @ (Value::Bool(_) | Value::Number(_) | Value::Object(_)) => Some(v.to_string()), +fn json_value_to_pg_text(value: &Value) -> Option { + match value { + // special care for nulls + Value::Null => None, - // avoid escaping here, as we pass this as a parameter - Value::String(s) => Some(s.to_string()), + // convert to text with escaping + v @ (Value::Bool(_) | Value::Number(_) | Value::Object(_)) => Some(v.to_string()), - // special care for arrays - Value::Array(_) => json_array_to_pg_array(value), - } - }) - .collect() + // avoid escaping here, as we pass this as a parameter + Value::String(s) => Some(s.to_string()), + + // special care for arrays + Value::Array(_) => json_array_to_pg_array(value), + } } // diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 96bf39c915..7092b65f03 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -13,6 +13,7 @@ use hyper::StatusCode; use hyper::{Body, HeaderMap, Request}; use serde_json::json; use serde_json::Value; +use tokio::join; use tokio_postgres::error::DbError; use tokio_postgres::error::ErrorPosition; use tokio_postgres::GenericClient; @@ -20,6 +21,7 @@ use tokio_postgres::IsolationLevel; use tokio_postgres::ReadyForQueryStatus; use tokio_postgres::Transaction; use tracing::error; +use tracing::info; use tracing::instrument; use url::Url; use utils::http::error::ApiError; @@ -27,22 +29,25 @@ use utils::http::json::json_response; use crate::auth::backend::ComputeUserInfo; use crate::auth::endpoint_sni; -use crate::config::HttpConfig; +use crate::config::ProxyConfig; use crate::config::TlsConfig; use crate::context::RequestMonitoring; +use crate::metrics::HTTP_CONTENT_LENGTH; use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE; use crate::proxy::NeonOptions; use crate::RoleName; +use super::backend::PoolingBackend; use super::conn_pool::ConnInfo; -use super::conn_pool::GlobalConnPool; -use super::json::{json_to_pg_text, pg_text_row_to_json}; +use super::json::json_to_pg_text; +use super::json::pg_text_row_to_json; use super::SERVERLESS_DRIVER_SNI; #[derive(serde::Deserialize)] struct QueryData { query: String, - params: Vec, + #[serde(deserialize_with = "bytes_to_pg_text")] + params: Vec>, } #[derive(serde::Deserialize)] @@ -69,6 +74,15 @@ static TXN_DEFERRABLE: HeaderName = HeaderName::from_static("neon-batch-deferrab static HEADER_VALUE_TRUE: HeaderValue = HeaderValue::from_static("true"); +fn bytes_to_pg_text<'de, D>(deserializer: D) -> Result>, D::Error> +where + D: serde::de::Deserializer<'de>, +{ + // TODO: consider avoiding the allocation here. 
+ let json: Vec = serde::de::Deserialize::deserialize(deserializer)?; + Ok(json_to_pg_text(json)) +} + fn get_conn_info( ctx: &mut RequestMonitoring, headers: &HeaderMap, @@ -171,16 +185,15 @@ fn check_matches(sni_hostname: &str, hostname: &str) -> Result, sni_hostname: Option, - conn_pool: Arc, + backend: Arc, ) -> Result, ApiError> { let result = tokio::time::timeout( - config.request_timeout, - handle_inner(tls, config, ctx, request, sni_hostname, conn_pool), + config.http_config.request_timeout, + handle_inner(config, ctx, request, sni_hostname, backend), ) .await; let mut response = match result { @@ -265,7 +278,7 @@ pub async fn handle( Err(_) => { let message = format!( "HTTP-Connection timed out, execution time exeeded {} seconds", - config.request_timeout.as_secs() + config.http_config.request_timeout.as_secs() ); error!(message); json_response( @@ -283,22 +296,36 @@ pub async fn handle( #[instrument(name = "sql-over-http", fields(pid = tracing::field::Empty), skip_all)] async fn handle_inner( - tls: &'static TlsConfig, - config: &'static HttpConfig, + config: &'static ProxyConfig, ctx: &mut RequestMonitoring, request: Request, sni_hostname: Option, - conn_pool: Arc, + backend: Arc, ) -> anyhow::Result> { let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE - .with_label_values(&["http"]) + .with_label_values(&[ctx.protocol]) .guard(); + info!( + protocol = ctx.protocol, + "handling interactive connection from client" + ); // // Determine the destination and connection params // let headers = request.headers(); - let conn_info = get_conn_info(ctx, headers, sni_hostname, tls)?; + // TLS config should be there. + let conn_info = get_conn_info( + ctx, + headers, + sni_hostname, + config.tls_config.as_ref().unwrap(), + )?; + info!( + user = conn_info.user_info.user.as_str(), + project = conn_info.user_info.endpoint.as_str(), + "credentials" + ); // Determine the output options. Default behaviour is 'false'. Anything that is not // strictly 'true' assumed to be false. 
@@ -307,8 +334,8 @@ async fn handle_inner( // Allow connection pooling only if explicitly requested // or if we have decided that http pool is no longer opt-in - let allow_pool = - !config.pool_options.opt_in || headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE); + let allow_pool = !config.http_config.pool_options.opt_in + || headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE); // isolation level, read only and deferrable @@ -333,6 +360,8 @@ async fn handle_inner( None => MAX_REQUEST_SIZE + 1, }; drop(paused); + info!(request_content_length, "request size in bytes"); + HTTP_CONTENT_LENGTH.observe(request_content_length as f64); // we don't have a streaming request support yet so this is to prevent OOM // from a malicious user sending an extremely large request body @@ -342,13 +371,28 @@ async fn handle_inner( )); } - // - // Read the query and query params from the request body - // - let body = hyper::body::to_bytes(request.into_body()).await?; - let payload: Payload = serde_json::from_slice(&body)?; + let fetch_and_process_request = async { + let body = hyper::body::to_bytes(request.into_body()) + .await + .map_err(anyhow::Error::from)?; + let payload: Payload = serde_json::from_slice(&body)?; + Ok::(payload) // Adjust error type accordingly + }; - let mut client = conn_pool.get(ctx, conn_info, !allow_pool).await?; + let authenticate_and_connect = async { + let keys = backend.authenticate(ctx, &conn_info).await?; + backend + .connect_to_compute(ctx, conn_info, keys, !allow_pool) + .await + }; + + // Run both operations in parallel + let (payload_result, auth_and_connect_result) = + join!(fetch_and_process_request, authenticate_and_connect,); + + // Handle the results + let payload = payload_result?; // Handle errors appropriately + let mut client = auth_and_connect_result?; // Handle errors appropriately let mut response = Response::builder() .status(StatusCode::OK) @@ -482,7 +526,7 @@ async fn query_to_json( raw_output: bool, array_mode: bool, ) -> anyhow::Result<(ReadyForQueryStatus, Value)> { - let query_params = json_to_pg_text(data.params); + let query_params = data.params; let row_stream = client.query_raw_txt(&data.query, query_params).await?; // Manually drain the stream into a vector to leave row_stream hanging diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index 1d62f09840..b3b35e446d 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -393,11 +393,11 @@ def test_sql_over_http_batch(static_proxy: NeonProxy): def test_sql_over_http_pool(static_proxy: NeonProxy): static_proxy.safe_psql("create user http_auth with password 'http' superuser") - def get_pid(status: int, pw: str) -> Any: + def get_pid(status: int, pw: str, user="http_auth") -> Any: return static_proxy.http_query( GET_CONNECTION_PID_QUERY, [], - user="http_auth", + user=user, password=pw, expected_code=status, ) @@ -418,20 +418,14 @@ def test_sql_over_http_pool(static_proxy: NeonProxy): static_proxy.safe_psql("alter user http_auth with password 'http2'") - # after password change, should open a new connection to verify it - pid2 = get_pid(200, "http2")["rows"][0]["pid"] - assert pid1 != pid2 + # after password change, shouldn't open a new connection because it checks password in proxy. 
+ rows = get_pid(200, "http2")["rows"] + assert rows == [{"pid": pid1}] time.sleep(0.02) - # query should be on an existing connection - pid = get_pid(200, "http2")["rows"][0]["pid"] - assert pid in [pid1, pid2] - - time.sleep(0.02) - - # old password should not work - res = get_pid(400, "http") + # incorrect user shouldn't reveal that the user doesn't exists + res = get_pid(400, "http", user="http_auth2") assert "password authentication failed for user" in res["message"] From 6c34d4cd147eb3704d8e54b434afee35b7d08704 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Thu, 8 Feb 2024 14:52:04 +0100 Subject: [PATCH 0122/1571] Proxy: set timeout on establishing connection (#6679) ## Problem There is no timeout on the handshake. ## Summary of changes Set the timeout on the establishing connection. --- proxy/src/bin/proxy.rs | 4 ++++ proxy/src/config.rs | 1 + proxy/src/proxy.rs | 9 +++++---- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 6974f1a274..8fbcb56758 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -88,6 +88,9 @@ struct ProxyCliArgs { /// path to directory with TLS certificates for client postgres connections #[clap(long)] certs_dir: Option, + /// timeout for the TLS handshake + #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] + handshake_timeout: tokio::time::Duration, /// http endpoint to receive periodic metric updates #[clap(long)] metric_collection_endpoint: Option, @@ -411,6 +414,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { require_client_ip: args.require_client_ip, disable_ip_check_for_http: args.disable_ip_check_for_http, endpoint_rps_limit, + handshake_timeout: args.handshake_timeout, // TODO: add this argument region: args.region.clone(), })); diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 2c46458a49..31c9228b35 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -22,6 +22,7 @@ pub struct ProxyConfig { pub disable_ip_check_for_http: bool, pub endpoint_rps_limit: Vec, pub region: String, + pub handshake_timeout: Duration, } #[derive(Debug)] diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index b68fb26e42..b3b221d3e2 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -194,10 +194,11 @@ pub async fn handle_client( let pause = ctx.latency_timer.pause(); let do_handshake = handshake(stream, mode.handshake_tls(tls), &cancel_map); - let (mut stream, params) = match do_handshake.await? { - Some(x) => x, - None => return Ok(()), // it's a cancellation request - }; + let (mut stream, params) = + match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? { + Some(x) => x, + None => return Ok(()), // it's a cancellation request + }; drop(pause); let hostname = mode.hostname(stream.get_ref()); From 43eae17f0d2e84b0c88e34f3fff6bfe515008b89 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 8 Feb 2024 17:31:15 +0200 Subject: [PATCH 0123/1571] Drop unused replication slots (#6655) ## Problem See #6626 If there is inactive replication slot then Postgres will not bw able to shrink WAL and delete unused snapshots. If she other active subscription is present, then snapshots created each 15 seconds will overflow AUX_DIR. Setting `max_slot_wal_keep_size` doesn't solve the problem, because even small WAL segment will be enough to overflow AUX_DIR if there is no other activity on the system. 
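
For intuition, the rule the patch applies (see Summary of changes below) can be sketched as follows. This is an illustration only, written in Rust with hypothetical names; the actual implementation is the C background worker added to `pgxn/neon/neon.c` in this patch, driven by the `neon.logical_replication_max_time_lag` GUC.

```rust
// Illustration only (hypothetical names): the per-slot decision rule the monitor applies.
use std::time::{Duration, Instant};

struct SlotObservation {
    active: bool,           // a walsender currently holds this logical slot
    last_advanced: Instant, // last time its confirmed_flush_lsn was seen to move
}

fn should_drop(
    slot: &SlotObservation,
    other_active_slots: usize,
    now: Instant,
    max_time_lag: Duration,
) -> bool {
    // Drop only inactive slots that have not advanced for longer than
    // neon.logical_replication_max_time_lag, and only while some other
    // subscription is active (otherwise no new snapshots pile up in AUX_DIR).
    !slot.active && other_active_slots > 0 && now.duration_since(slot.last_advanced) > max_time_lag
}

fn main() {
    let observed_at = Instant::now();
    let stale = SlotObservation { active: false, last_advanced: observed_at };
    let later = observed_at + Duration::from_secs(7200);
    // With the default 3600s threshold and one active subscription, the stale slot is dropped.
    assert!(should_drop(&stale, 1, later, Duration::from_secs(3600)));
}
```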
## Summary of changes If there are active subscriptions and some logical replication slots are not used during `neon.logical_replication_max_time_lag` interval, then unused slot is dropped. ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist Co-authored-by: Konstantin Knizhnik --- pgxn/neon/neon.c | 133 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 133 insertions(+) diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index b930fdb3ca..799f88751c 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -11,16 +11,23 @@ #include "postgres.h" #include "fmgr.h" +#include "miscadmin.h" #include "access/xact.h" #include "access/xlog.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" #include "catalog/pg_type.h" +#include "postmaster/bgworker.h" +#include "postmaster/interrupt.h" +#include "replication/slot.h" #include "replication/walsender.h" +#include "storage/procsignal.h" +#include "tcop/tcopprot.h" #include "funcapi.h" #include "access/htup_details.h" #include "utils/pg_lsn.h" #include "utils/guc.h" +#include "utils/wait_event.h" #include "neon.h" #include "walproposer.h" @@ -30,6 +37,130 @@ PG_MODULE_MAGIC; void _PG_init(void); +static int logical_replication_max_time_lag = 3600; + +static void +InitLogicalReplicationMonitor(void) +{ + BackgroundWorker bgw; + + DefineCustomIntVariable( + "neon.logical_replication_max_time_lag", + "Threshold for dropping unused logical replication slots", + NULL, + &logical_replication_max_time_lag, + 3600, 0, INT_MAX, + PGC_SIGHUP, + GUC_UNIT_S, + NULL, NULL, NULL); + + memset(&bgw, 0, sizeof(bgw)); + bgw.bgw_flags = BGWORKER_SHMEM_ACCESS; + bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; + snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon"); + snprintf(bgw.bgw_function_name, BGW_MAXLEN, "LogicalSlotsMonitorMain"); + snprintf(bgw.bgw_name, BGW_MAXLEN, "Logical replication monitor"); + snprintf(bgw.bgw_type, BGW_MAXLEN, "Logical replication monitor"); + bgw.bgw_restart_time = 5; + bgw.bgw_notify_pid = 0; + bgw.bgw_main_arg = (Datum) 0; + + RegisterBackgroundWorker(&bgw); +} + +typedef struct +{ + NameData name; + bool dropped; + XLogRecPtr confirmed_flush_lsn; + TimestampTz last_updated; +} SlotStatus; + +/* + * Unused logical replication slots pins WAL and prevents deletion of snapshots. + */ +PGDLLEXPORT void +LogicalSlotsMonitorMain(Datum main_arg) +{ + SlotStatus* slots; + TimestampTz now, last_checked; + + /* Establish signal handlers. 
*/ + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGHUP, SignalHandlerForConfigReload); + pqsignal(SIGTERM, die); + + BackgroundWorkerUnblockSignals(); + + slots = (SlotStatus*)calloc(max_replication_slots, sizeof(SlotStatus)); + last_checked = GetCurrentTimestamp(); + + for (;;) + { + (void) WaitLatch(MyLatch, + WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT, + logical_replication_max_time_lag*1000/2, + PG_WAIT_EXTENSION); + ResetLatch(MyLatch); + CHECK_FOR_INTERRUPTS(); + + now = GetCurrentTimestamp(); + + if (now - last_checked > logical_replication_max_time_lag*USECS_PER_SEC) + { + int n_active_slots = 0; + last_checked = now; + + LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); + for (int i = 0; i < max_replication_slots; i++) + { + ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i]; + + /* Consider only logical repliction slots */ + if (!s->in_use || !SlotIsLogical(s)) + continue; + + if (s->active_pid != 0) + { + n_active_slots += 1; + continue; + } + + /* Check if there was some activity with the slot since last check */ + if (s->data.confirmed_flush != slots[i].confirmed_flush_lsn) + { + slots[i].confirmed_flush_lsn = s->data.confirmed_flush; + slots[i].last_updated = now; + } + else if (now - slots[i].last_updated > logical_replication_max_time_lag*USECS_PER_SEC) + { + slots[i].name = s->data.name; + slots[i].dropped = true; + } + } + LWLockRelease(ReplicationSlotControlLock); + + /* + * If there are no active subscriptions, then no new snapshots are generated + * and so no need to force slot deletion. + */ + if (n_active_slots != 0) + { + for (int i = 0; i < max_replication_slots; i++) + { + if (slots[i].dropped) + { + elog(LOG, "Drop logical replication slot because it was not update more than %ld seconds", + (now - slots[i].last_updated)/USECS_PER_SEC); + ReplicationSlotDrop(slots[i].name.data, true); + slots[i].dropped = false; + } + } + } + } + } +} + void _PG_init(void) { @@ -44,6 +175,8 @@ _PG_init(void) pg_init_libpagestore(); pg_init_walproposer(); + InitLogicalReplicationMonitor(); + InitControlPlaneConnector(); pg_init_extension_server(); From af91a28936eef0b1e5149dc71d92394a89410372 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 8 Feb 2024 15:35:13 +0000 Subject: [PATCH 0124/1571] pageserver: shard splitting (#6379) ## Problem One doesn't know at tenant creation time how large the tenant will grow. We need to be able to dynamically adjust the shard count at runtime. This is implemented as "splitting" of shards into smaller child shards, which cover a subset of the keyspace that the parent covered. Refer to RFC: https://github.com/neondatabase/neon/pull/6358 Part of epic: #6278 ## Summary of changes This PR implements the happy path (does not cleanly recover from a crash mid-split, although won't lose any data), without any optimizations (e.g. child shards re-download their own copies of layers that the parent shard already had on local disk) - Add `/v1/tenant/:tenant_shard_id/shard_split` API to pageserver: this copies the shard's index to the child shards' paths, instantiates child `Tenant` object, and tears down parent `Tenant` object. - Add `splitting` column to `tenant_shards` table. This is written into an existing migration because we haven't deployed yet, so don't need to cleanly upgrade. - Add `/control/v1/tenant/:tenant_id/shard_split` API to attachment_service, - Add `test_sharding_split_smoke` test. This covers the happy path: future PRs will add tests that exercise failure cases. 
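
For intuition, the parent-to-child mapping behind a split is round robin on shard number modulo the old shard count: keys hash to shards by that same rule, so each child covers a subset of its parent's keyspace. A minimal standalone sketch of the mapping, mirroring the `TenantShardId::split` helper added below in `libs/pageserver_api/src/shard.rs`:

```rust
// Standalone sketch of the parent-to-child shard mapping used by a split
// (the in-tree version is `TenantShardId::split` in libs/pageserver_api/src/shard.rs).
// Children are assigned round-robin: child number `c` under the new count descends
// from the parent whose shard number equals `c % old_count`, so any given key keeps
// hashing into the same lineage.
fn child_shard_numbers(parent_number: u8, old_count: u8, new_count: u8) -> Vec<u8> {
    let effective_old_count = std::cmp::max(old_count, 1); // an unsharded tenant behaves like count=1
    (0..new_count)
        .filter(|&child| child % effective_old_count == parent_number)
        .collect()
}

fn main() {
    // Splitting shard 1 of a 2-shard tenant into 8 shards yields children 1, 3, 5 and 7.
    assert_eq!(child_shard_numbers(1, 2, 8), vec![1, 3, 5, 7]);
    // An unsharded tenant (shard_count 0) splitting into 2 yields children 0 and 1.
    assert_eq!(child_shard_numbers(0, 0, 2), vec![0, 1]);
}
```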
--- Dockerfile | 5 + .../up.sql | 1 + control_plane/attachment_service/src/http.rs | 19 +- .../attachment_service/src/persistence.rs | 102 +++++- .../src/persistence/split_state.rs | 46 +++ .../attachment_service/src/schema.rs | 1 + .../attachment_service/src/service.rs | 333 +++++++++++++++++- .../attachment_service/src/tenant_state.rs | 10 + control_plane/src/attachment_service.rs | 21 +- control_plane/src/bin/neon_local.rs | 25 ++ libs/pageserver_api/src/models.rs | 10 + libs/pageserver_api/src/shard.rs | 128 +++++++ pageserver/client/src/mgmt_api.rs | 16 + pageserver/src/http/routes.rs | 27 +- pageserver/src/tenant.rs | 66 ++++ pageserver/src/tenant/mgr.rs | 169 ++++++++- .../tenant/remote_timeline_client/upload.rs | 2 +- test_runner/fixtures/neon_fixtures.py | 2 +- test_runner/regress/test_sharding.py | 129 ++++++- 19 files changed, 1088 insertions(+), 24 deletions(-) create mode 100644 control_plane/attachment_service/src/persistence/split_state.rs diff --git a/Dockerfile b/Dockerfile index bb926643dc..c37f94b981 100644 --- a/Dockerfile +++ b/Dockerfile @@ -100,6 +100,11 @@ RUN mkdir -p /data/.neon/ && chown -R neon:neon /data/.neon/ \ -c "listen_pg_addr='0.0.0.0:6400'" \ -c "listen_http_addr='0.0.0.0:9898'" +# When running a binary that links with libpq, default to using our most recent postgres version. Binaries +# that want a particular postgres version will select it explicitly: this is just a default. +ENV LD_LIBRARY_PATH /usr/local/v16/lib + + VOLUME ["/data"] USER neon EXPOSE 6400 diff --git a/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql b/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql index 585dbc79a0..2ffdae6287 100644 --- a/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql +++ b/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql @@ -7,6 +7,7 @@ CREATE TABLE tenant_shards ( generation INTEGER NOT NULL, generation_pageserver BIGINT NOT NULL, placement_policy VARCHAR NOT NULL, + splitting SMALLINT NOT NULL, -- config is JSON encoded, opaque to the database. config TEXT NOT NULL ); \ No newline at end of file diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs index 049e66fddf..38eecaf7ef 100644 --- a/control_plane/attachment_service/src/http.rs +++ b/control_plane/attachment_service/src/http.rs @@ -3,7 +3,8 @@ use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT}; use hyper::{Body, Request, Response}; use hyper::{StatusCode, Uri}; use pageserver_api::models::{ - TenantCreateRequest, TenantLocationConfigRequest, TimelineCreateRequest, + TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest, + TimelineCreateRequest, }; use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api; @@ -292,6 +293,19 @@ async fn handle_node_configure(mut req: Request) -> Result, json_response(StatusCode::OK, state.service.node_configure(config_req)?) 
} +async fn handle_tenant_shard_split( + service: Arc, + mut req: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let split_req = json_request::(&mut req).await?; + + json_response( + StatusCode::OK, + service.tenant_shard_split(tenant_id, split_req).await?, + ) +} + async fn handle_tenant_shard_migrate( service: Arc, mut req: Request, @@ -391,6 +405,9 @@ pub fn make_router( .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| { tenant_service_handler(r, handle_tenant_shard_migrate) }) + .put("/control/v1/tenant/:tenant_id/shard_split", |r| { + tenant_service_handler(r, handle_tenant_shard_split) + }) // Tenant operations // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity. diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs index db487bcec6..cead540058 100644 --- a/control_plane/attachment_service/src/persistence.rs +++ b/control_plane/attachment_service/src/persistence.rs @@ -1,7 +1,9 @@ +pub(crate) mod split_state; use std::collections::HashMap; use std::str::FromStr; use std::time::Duration; +use self::split_state::SplitState; use camino::Utf8Path; use camino::Utf8PathBuf; use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy}; @@ -363,19 +365,101 @@ impl Persistence { Ok(()) } - // TODO: when we start shard splitting, we must durably mark the tenant so that - // on restart, we know that we must go through recovery (list shards that exist - // and pick up where we left off and/or revert to parent shards). + // When we start shard splitting, we must durably mark the tenant so that + // on restart, we know that we must go through recovery. + // + // We create the child shards here, so that they will be available for increment_generation calls + // if some pageserver holding a child shard needs to restart before the overall tenant split is complete. #[allow(dead_code)] - pub(crate) async fn begin_shard_split(&self, _tenant_id: TenantId) -> anyhow::Result<()> { - todo!(); + pub(crate) async fn begin_shard_split( + &self, + old_shard_count: ShardCount, + split_tenant_id: TenantId, + parent_to_children: Vec<(TenantShardId, Vec)>, + ) -> DatabaseResult<()> { + use crate::schema::tenant_shards::dsl::*; + self.with_conn(move |conn| -> DatabaseResult<()> { + conn.transaction(|conn| -> DatabaseResult<()> { + // Mark parent shards as splitting + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .filter(shard_count.eq(old_shard_count.0 as i32)) + .set((splitting.eq(1),)) + .execute(conn)?; + if ShardCount(updated.try_into().map_err(|_| DatabaseError::Logical(format!("Overflow existing shard count {} while splitting", updated)))?) != old_shard_count { + // Perhaps a deletion or another split raced with this attempt to split, mutating + // the parent shards that we intend to split. In this case the split request should fail. 
+ return Err(DatabaseError::Logical( + format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {old_shard_count:?})") + )); + } + + // FIXME: spurious clone to sidestep closure move rules + let parent_to_children = parent_to_children.clone(); + + // Insert child shards + for (parent_shard_id, children) in parent_to_children { + let mut parent = crate::schema::tenant_shards::table + .filter(tenant_id.eq(parent_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(parent_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(parent_shard_id.shard_count.0 as i32)) + .load::(conn)?; + let parent = if parent.len() != 1 { + return Err(DatabaseError::Logical(format!( + "Parent shard {parent_shard_id} not found" + ))); + } else { + parent.pop().unwrap() + }; + for mut shard in children { + // Carry the parent's generation into the child + shard.generation = parent.generation; + + debug_assert!(shard.splitting == SplitState::Splitting); + diesel::insert_into(tenant_shards) + .values(shard) + .execute(conn)?; + } + } + + Ok(()) + })?; + + Ok(()) + }) + .await } - // TODO: when we finish shard splitting, we must atomically clean up the old shards + // When we finish shard splitting, we must atomically clean up the old shards // and insert the new shards, and clear the splitting marker. #[allow(dead_code)] - pub(crate) async fn complete_shard_split(&self, _tenant_id: TenantId) -> anyhow::Result<()> { - todo!(); + pub(crate) async fn complete_shard_split( + &self, + split_tenant_id: TenantId, + old_shard_count: ShardCount, + ) -> DatabaseResult<()> { + use crate::schema::tenant_shards::dsl::*; + self.with_conn(move |conn| -> DatabaseResult<()> { + conn.transaction(|conn| -> QueryResult<()> { + // Drop parent shards + diesel::delete(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .filter(shard_count.eq(old_shard_count.0 as i32)) + .execute(conn)?; + + // Clear sharding flag + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .set((splitting.eq(0),)) + .execute(conn)?; + debug_assert!(updated > 0); + + Ok(()) + })?; + + Ok(()) + }) + .await } } @@ -403,6 +487,8 @@ pub(crate) struct TenantShardPersistence { #[serde(default)] pub(crate) placement_policy: String, #[serde(default)] + pub(crate) splitting: SplitState, + #[serde(default)] pub(crate) config: String, } diff --git a/control_plane/attachment_service/src/persistence/split_state.rs b/control_plane/attachment_service/src/persistence/split_state.rs new file mode 100644 index 0000000000..bce1a75843 --- /dev/null +++ b/control_plane/attachment_service/src/persistence/split_state.rs @@ -0,0 +1,46 @@ +use diesel::pg::{Pg, PgValue}; +use diesel::{ + deserialize::FromSql, deserialize::FromSqlRow, expression::AsExpression, serialize::ToSql, + sql_types::Int2, +}; +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, FromSqlRow, AsExpression)] +#[diesel(sql_type = SplitStateSQLRepr)] +#[derive(Deserialize, Serialize)] +pub enum SplitState { + Idle = 0, + Splitting = 1, +} + +impl Default for SplitState { + fn default() -> Self { + Self::Idle + } +} + +type SplitStateSQLRepr = Int2; + +impl ToSql for SplitState { + fn to_sql<'a>( + &'a self, + out: &'a mut diesel::serialize::Output, + ) -> diesel::serialize::Result { + let raw_value: i16 = *self as i16; + let mut new_out = out.reborrow(); + ToSql::::to_sql(&raw_value, &mut new_out) + } +} + +impl FromSql for SplitState { + fn 
from_sql(pg_value: PgValue) -> diesel::deserialize::Result { + match FromSql::::from_sql(pg_value).map(|v| match v { + 0 => Some(Self::Idle), + 1 => Some(Self::Splitting), + _ => None, + })? { + Some(v) => Ok(v), + None => Err(format!("Invalid SplitState value, was: {:?}", pg_value.as_bytes()).into()), + } + } +} diff --git a/control_plane/attachment_service/src/schema.rs b/control_plane/attachment_service/src/schema.rs index de80fc8f64..db5a957443 100644 --- a/control_plane/attachment_service/src/schema.rs +++ b/control_plane/attachment_service/src/schema.rs @@ -20,6 +20,7 @@ diesel::table! { generation -> Int4, generation_pageserver -> Int8, placement_policy -> Varchar, + splitting -> Int2, config -> Text, } } diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index 1db1906df8..0ec2b9dc4c 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -1,4 +1,5 @@ use std::{ + cmp::Ordering, collections::{BTreeMap, HashMap}, str::FromStr, sync::Arc, @@ -23,7 +24,7 @@ use pageserver_api::{ models::{ LocationConfig, LocationConfigMode, ShardParameters, TenantConfig, TenantCreateRequest, TenantLocationConfigRequest, TenantLocationConfigResponse, TenantShardLocation, - TimelineCreateRequest, TimelineInfo, + TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo, }, shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId}, }; @@ -40,7 +41,11 @@ use utils::{ use crate::{ compute_hook::{self, ComputeHook}, node::Node, - persistence::{DatabaseError, NodePersistence, Persistence, TenantShardPersistence}, + persistence::{ + split_state::SplitState, DatabaseError, NodePersistence, Persistence, + TenantShardPersistence, + }, + reconciler::attached_location_conf, scheduler::Scheduler, tenant_state::{ IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError, @@ -476,6 +481,7 @@ impl Service { generation_pageserver: i64::MAX, placement_policy: serde_json::to_string(&PlacementPolicy::default()).unwrap(), config: serde_json::to_string(&TenantConfig::default()).unwrap(), + splitting: SplitState::default(), }; match self.persistence.insert_tenant_shards(vec![tsp]).await { @@ -718,6 +724,7 @@ impl Service { generation_pageserver: i64::MAX, placement_policy: serde_json::to_string(&placement_policy).unwrap(), config: serde_json::to_string(&create_req.config).unwrap(), + splitting: SplitState::default(), }) .collect(); self.persistence @@ -1100,6 +1107,7 @@ impl Service { self.ensure_attached_wait(tenant_id).await?; // TODO: refuse to do this if shard splitting is in progress + // (https://github.com/neondatabase/neon/issues/6676) let targets = { let locked = self.inner.read().unwrap(); let mut targets = Vec::new(); @@ -1180,6 +1188,7 @@ impl Service { self.ensure_attached_wait(tenant_id).await?; // TODO: refuse to do this if shard splitting is in progress + // (https://github.com/neondatabase/neon/issues/6676) let targets = { let locked = self.inner.read().unwrap(); let mut targets = Vec::new(); @@ -1352,6 +1361,326 @@ impl Service { }) } + pub(crate) async fn tenant_shard_split( + &self, + tenant_id: TenantId, + split_req: TenantShardSplitRequest, + ) -> Result { + let mut policy = None; + let mut shard_ident = None; + + // TODO: put a cancellation token on Service for clean shutdown + let cancel = CancellationToken::new(); + + // A parent shard which will be split + struct SplitTarget { + parent_id: 
TenantShardId, + node: Node, + child_ids: Vec, + } + + // Validate input, and calculate which shards we will create + let (old_shard_count, targets, compute_hook) = { + let locked = self.inner.read().unwrap(); + + let pageservers = locked.nodes.clone(); + + let mut targets = Vec::new(); + + // In case this is a retry, count how many already-split shards we found + let mut children_found = Vec::new(); + let mut old_shard_count = None; + + for (tenant_shard_id, shard) in + locked.tenants.range(TenantShardId::tenant_range(tenant_id)) + { + match shard.shard.count.0.cmp(&split_req.new_shard_count) { + Ordering::Equal => { + // Already split this + children_found.push(*tenant_shard_id); + continue; + } + Ordering::Greater => { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Requested count {} but already have shards at count {}", + split_req.new_shard_count, + shard.shard.count.0 + ))); + } + Ordering::Less => { + // Fall through: this shard has lower count than requested, + // is a candidate for splitting. + } + } + + match old_shard_count { + None => old_shard_count = Some(shard.shard.count), + Some(old_shard_count) => { + if old_shard_count != shard.shard.count { + // We may hit this case if a caller asked for two splits to + // different sizes, before the first one is complete. + // e.g. 1->2, 2->4, where the 4 call comes while we have a mixture + // of shard_count=1 and shard_count=2 shards in the map. + return Err(ApiError::Conflict( + "Cannot split, currently mid-split".to_string(), + )); + } + } + } + if policy.is_none() { + policy = Some(shard.policy.clone()); + } + if shard_ident.is_none() { + shard_ident = Some(shard.shard); + } + + if tenant_shard_id.shard_count == ShardCount(split_req.new_shard_count) { + tracing::info!( + "Tenant shard {} already has shard count {}", + tenant_shard_id, + split_req.new_shard_count + ); + continue; + } + + let node_id = + shard + .intent + .attached + .ok_or(ApiError::BadRequest(anyhow::anyhow!( + "Cannot split a tenant that is not attached" + )))?; + + let node = pageservers + .get(&node_id) + .expect("Pageservers may not be deleted while referenced"); + + // TODO: if any reconciliation is currently in progress for this shard, wait for it. + + targets.push(SplitTarget { + parent_id: *tenant_shard_id, + node: node.clone(), + child_ids: tenant_shard_id.split(ShardCount(split_req.new_shard_count)), + }); + } + + if targets.is_empty() { + if children_found.len() == split_req.new_shard_count as usize { + return Ok(TenantShardSplitResponse { + new_shards: children_found, + }); + } else { + // No shards found to split, and no existing children found: the + // tenant doesn't exist at all. + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant {} not found", tenant_id).into(), + )); + } + } + + (old_shard_count, targets, locked.compute_hook.clone()) + }; + + // unwrap safety: we would have returned above if we didn't find at least one shard to split + let old_shard_count = old_shard_count.unwrap(); + let shard_ident = shard_ident.unwrap(); + let policy = policy.unwrap(); + + // FIXME: we have dropped self.inner lock, and not yet written anything to the database: another + // request could occur here, deleting or mutating the tenant. begin_shard_split checks that the + // parent shards exist as expected, but it would be neater to do the above pre-checks within the + // same database transaction rather than pre-check in-memory and then maybe-fail the database write. 
+ // (https://github.com/neondatabase/neon/issues/6676) + + // Before creating any new child shards in memory or on the pageservers, persist them: this + // enables us to ensure that we will always be able to clean up if something goes wrong. This also + // acts as the protection against two concurrent attempts to split: one of them will get a database + // error trying to insert the child shards. + let mut child_tsps = Vec::new(); + for target in &targets { + let mut this_child_tsps = Vec::new(); + for child in &target.child_ids { + let mut child_shard = shard_ident; + child_shard.number = child.shard_number; + child_shard.count = child.shard_count; + + this_child_tsps.push(TenantShardPersistence { + tenant_id: child.tenant_id.to_string(), + shard_number: child.shard_number.0 as i32, + shard_count: child.shard_count.0 as i32, + shard_stripe_size: shard_ident.stripe_size.0 as i32, + // Note: this generation is a placeholder, [`Persistence::begin_shard_split`] will + // populate the correct generation as part of its transaction, to protect us + // against racing with changes in the state of the parent. + generation: 0, + generation_pageserver: target.node.id.0 as i64, + placement_policy: serde_json::to_string(&policy).unwrap(), + // TODO: get the config out of the map + config: serde_json::to_string(&TenantConfig::default()).unwrap(), + splitting: SplitState::Splitting, + }); + } + + child_tsps.push((target.parent_id, this_child_tsps)); + } + + if let Err(e) = self + .persistence + .begin_shard_split(old_shard_count, tenant_id, child_tsps) + .await + { + match e { + DatabaseError::Query(diesel::result::Error::DatabaseError( + DatabaseErrorKind::UniqueViolation, + _, + )) => { + // Inserting a child shard violated a unique constraint: we raced with another call to + // this function + tracing::warn!("Conflicting attempt to split {tenant_id}: {e}"); + return Err(ApiError::Conflict("Tenant is already splitting".into())); + } + _ => return Err(ApiError::InternalServerError(e.into())), + } + } + + // FIXME: we have now committed the shard split state to the database, so any subsequent + // failure needs to roll it back. We will later wrap this function in logic to roll back + // the split if it fails. + // (https://github.com/neondatabase/neon/issues/6676) + + // TODO: issue split calls concurrently (this only matters once we're splitting + // N>1 shards into M shards -- initially we're usually splitting 1 shard into N). + + for target in &targets { + let SplitTarget { + parent_id, + node, + child_ids, + } = target; + let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref()); + let response = client + .tenant_shard_split( + *parent_id, + TenantShardSplitRequest { + new_shard_count: split_req.new_shard_count, + }, + ) + .await + .map_err(|e| ApiError::Conflict(format!("Failed to split {}: {}", parent_id, e)))?; + + tracing::info!( + "Split {} into {}", + parent_id, + response + .new_shards + .iter() + .map(|s| format!("{:?}", s)) + .collect::>() + .join(",") + ); + + if &response.new_shards != child_ids { + // This should never happen: the pageserver should agree with us on how shard splits work. 
+ return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Splitting shard {} resulted in unexpected IDs: {:?} (expected {:?})", + parent_id, + response.new_shards, + child_ids + ))); + } + } + + // TODO: if the pageserver restarted concurrently with our split API call, + // the actual generation of the child shard might differ from the generation + // we expect it to have. In order for our in-database generation to end up + // correct, we should carry the child generation back in the response and apply it here + // in complete_shard_split (and apply the correct generation in memory) + // (or, we can carry generation in the request and reject the request if + // it doesn't match, but that requires more retry logic on this side) + + self.persistence + .complete_shard_split(tenant_id, old_shard_count) + .await?; + + // Replace all the shards we just split with their children + let mut response = TenantShardSplitResponse { + new_shards: Vec::new(), + }; + let mut child_locations = Vec::new(); + { + let mut locked = self.inner.write().unwrap(); + for target in targets { + let SplitTarget { + parent_id, + node: _node, + child_ids, + } = target; + let (pageserver, generation, config) = { + let old_state = locked + .tenants + .remove(&parent_id) + .expect("It was present, we just split it"); + ( + old_state.intent.attached.unwrap(), + old_state.generation, + old_state.config.clone(), + ) + }; + + locked.tenants.remove(&parent_id); + + for child in child_ids { + let mut child_shard = shard_ident; + child_shard.number = child.shard_number; + child_shard.count = child.shard_count; + + let mut child_observed: HashMap = HashMap::new(); + child_observed.insert( + pageserver, + ObservedStateLocation { + conf: Some(attached_location_conf(generation, &child_shard, &config)), + }, + ); + + let mut child_state = TenantState::new(child, child_shard, policy.clone()); + child_state.intent = IntentState::single(Some(pageserver)); + child_state.observed = ObservedState { + locations: child_observed, + }; + child_state.generation = generation; + child_state.config = config.clone(); + + child_locations.push((child, pageserver)); + + locked.tenants.insert(child, child_state); + response.new_shards.push(child); + } + } + } + + // Send compute notifications for all the new shards + let mut failed_notifications = Vec::new(); + for (child_id, child_ps) in child_locations { + if let Err(e) = compute_hook.notify(child_id, child_ps, &cancel).await { + tracing::warn!("Failed to update compute of {}->{} during split, proceeding anyway to complete split ({e})", + child_id, child_ps); + failed_notifications.push(child_id); + } + } + + // If we failed any compute notifications, make a note to retry later. 
+ if !failed_notifications.is_empty() { + let mut locked = self.inner.write().unwrap(); + for failed in failed_notifications { + if let Some(shard) = locked.tenants.get_mut(&failed) { + shard.pending_compute_notification = true; + } + } + } + + Ok(response) + } + pub(crate) async fn tenant_shard_migrate( &self, tenant_shard_id: TenantShardId, diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs index a358e1ff7b..c0ab076a55 100644 --- a/control_plane/attachment_service/src/tenant_state.rs +++ b/control_plane/attachment_service/src/tenant_state.rs @@ -193,6 +193,13 @@ impl IntentState { result } + pub(crate) fn single(node_id: Option) -> Self { + Self { + attached: node_id, + secondary: vec![], + } + } + /// When a node goes offline, we update intents to avoid using it /// as their attached pageserver. /// @@ -286,6 +293,9 @@ impl TenantState { // self.intent refers to pageservers that are offline, and pick other // pageservers if so. + // TODO: respect the splitting bit on tenants: if they are currently splitting then we may not + // change their attach location. + // Build the set of pageservers already in use by this tenant, to avoid scheduling // more work on the same pageservers we're already using. let mut used_pageservers = self.intent.all_pageservers(); diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/attachment_service.rs index a3f832036c..c3e071aa71 100644 --- a/control_plane/src/attachment_service.rs +++ b/control_plane/src/attachment_service.rs @@ -8,7 +8,10 @@ use diesel::{ use diesel_migrations::{HarnessWithOutput, MigrationHarness}; use hyper::Method; use pageserver_api::{ - models::{ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo}, + models::{ + ShardParameters, TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse, + TimelineCreateRequest, TimelineInfo, + }, shard::TenantShardId, }; use pageserver_client::mgmt_api::ResponseErrorMessageExt; @@ -648,7 +651,7 @@ impl AttachmentService { ) -> anyhow::Result { self.dispatch( Method::PUT, - format!("tenant/{tenant_shard_id}/migrate"), + format!("control/v1/tenant/{tenant_shard_id}/migrate"), Some(TenantShardMigrateRequest { tenant_shard_id, node_id, @@ -657,6 +660,20 @@ impl AttachmentService { .await } + #[instrument(skip(self), fields(%tenant_id, %new_shard_count))] + pub async fn tenant_split( + &self, + tenant_id: TenantId, + new_shard_count: u8, + ) -> anyhow::Result { + self.dispatch( + Method::PUT, + format!("control/v1/tenant/{tenant_id}/shard_split"), + Some(TenantShardSplitRequest { new_shard_count }), + ) + .await + } + #[instrument(skip_all, fields(node_id=%req.node_id))] pub async fn node_register(&self, req: NodeRegisterRequest) -> anyhow::Result<()> { self.dispatch::<_, ()>(Method::POST, "control/v1/node".to_string(), Some(req)) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index e56007dd20..b9af467fdf 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -575,6 +575,26 @@ async fn handle_tenant( println!("{tenant_table}"); println!("{shard_table}"); } + Some(("shard-split", matches)) => { + let tenant_id = get_tenant_id(matches, env)?; + let shard_count: u8 = matches.get_one::("shard-count").cloned().unwrap_or(0); + + let attachment_service = AttachmentService::from_env(env); + let result = attachment_service + .tenant_split(tenant_id, shard_count) + .await?; + println!( + "Split tenant {} into shards 
{}", + tenant_id, + result + .new_shards + .iter() + .map(|s| format!("{:?}", s)) + .collect::>() + .join(",") + ); + } + Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name), None => bail!("no tenant subcommand provided"), } @@ -1524,6 +1544,11 @@ fn cli() -> Command { .subcommand(Command::new("status") .about("Human readable summary of the tenant's shards and attachment locations") .arg(tenant_id_arg.clone())) + .subcommand(Command::new("shard-split") + .about("Increase the number of shards in the tenant") + .arg(tenant_id_arg.clone()) + .arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)")) + ) ) .subcommand( Command::new("pageserver") diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index c08cacb822..46324efd43 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -192,6 +192,16 @@ pub struct TimelineCreateRequest { pub pg_version: Option, } +#[derive(Serialize, Deserialize)] +pub struct TenantShardSplitRequest { + pub new_shard_count: u8, +} + +#[derive(Serialize, Deserialize)] +pub struct TenantShardSplitResponse { + pub new_shards: Vec, +} + /// Parameters that apply to all shards in a tenant. Used during tenant creation. #[derive(Serialize, Deserialize, Debug)] #[serde(deny_unknown_fields)] diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index e27aad8156..322b6c642e 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -88,12 +88,36 @@ impl TenantShardId { pub fn is_unsharded(&self) -> bool { self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0) } + + /// Convenience for dropping the tenant_id and just getting the ShardIndex: this + /// is useful when logging from code that is already in a span that includes tenant ID, to + /// keep messages reasonably terse. pub fn to_index(&self) -> ShardIndex { ShardIndex { shard_number: self.shard_number, shard_count: self.shard_count, } } + + /// Calculate the children of this TenantShardId when splitting the overall tenant into + /// the given number of shards. + pub fn split(&self, new_shard_count: ShardCount) -> Vec { + let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1); + let mut child_shards = Vec::new(); + for shard_number in 0..ShardNumber(new_shard_count.0).0 { + // Key mapping is based on a round robin mapping of key hash modulo shard count, + // so our child shards are the ones which the same keys would map to. 
+ if shard_number % effective_old_shard_count == self.shard_number.0 { + child_shards.push(TenantShardId { + tenant_id: self.tenant_id, + shard_number: ShardNumber(shard_number), + shard_count: new_shard_count, + }) + } + } + + child_shards + } } /// Formatting helper @@ -793,4 +817,108 @@ mod tests { let shard = key_to_shard_number(ShardCount(10), DEFAULT_STRIPE_SIZE, &key); assert_eq!(shard, ShardNumber(8)); } + + #[test] + fn shard_id_split() { + let tenant_id = TenantId::generate(); + let parent = TenantShardId::unsharded(tenant_id); + + // Unsharded into 2 + assert_eq!( + parent.split(ShardCount(2)), + vec![ + TenantShardId { + tenant_id, + shard_count: ShardCount(2), + shard_number: ShardNumber(0) + }, + TenantShardId { + tenant_id, + shard_count: ShardCount(2), + shard_number: ShardNumber(1) + } + ] + ); + + // Unsharded into 4 + assert_eq!( + parent.split(ShardCount(4)), + vec![ + TenantShardId { + tenant_id, + shard_count: ShardCount(4), + shard_number: ShardNumber(0) + }, + TenantShardId { + tenant_id, + shard_count: ShardCount(4), + shard_number: ShardNumber(1) + }, + TenantShardId { + tenant_id, + shard_count: ShardCount(4), + shard_number: ShardNumber(2) + }, + TenantShardId { + tenant_id, + shard_count: ShardCount(4), + shard_number: ShardNumber(3) + } + ] + ); + + // count=1 into 2 (check this works the same as unsharded.) + let parent = TenantShardId { + tenant_id, + shard_count: ShardCount(1), + shard_number: ShardNumber(0), + }; + assert_eq!( + parent.split(ShardCount(2)), + vec![ + TenantShardId { + tenant_id, + shard_count: ShardCount(2), + shard_number: ShardNumber(0) + }, + TenantShardId { + tenant_id, + shard_count: ShardCount(2), + shard_number: ShardNumber(1) + } + ] + ); + + // count=2 into count=8 + let parent = TenantShardId { + tenant_id, + shard_count: ShardCount(2), + shard_number: ShardNumber(1), + }; + assert_eq!( + parent.split(ShardCount(8)), + vec![ + TenantShardId { + tenant_id, + shard_count: ShardCount(8), + shard_number: ShardNumber(1) + }, + TenantShardId { + tenant_id, + shard_count: ShardCount(8), + shard_number: ShardNumber(3) + }, + TenantShardId { + tenant_id, + shard_count: ShardCount(8), + shard_number: ShardNumber(5) + }, + TenantShardId { + tenant_id, + shard_count: ShardCount(8), + shard_number: ShardNumber(7) + }, + ] + ); + } } diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 8abe58e1a2..200369df90 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -310,6 +310,22 @@ impl Client { .map_err(Error::ReceiveBody) } + pub async fn tenant_shard_split( + &self, + tenant_shard_id: TenantShardId, + req: TenantShardSplitRequest, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/shard_split", + self.mgmt_api_endpoint, tenant_shard_id + ); + self.request(Method::PUT, &uri, req) + .await? 
+ .json() + .await + .map_err(Error::ReceiveBody) + } + pub async fn timeline_list( &self, tenant_shard_id: &TenantShardId, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index ebcb27fa08..af9a3c7301 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -19,11 +19,14 @@ use pageserver_api::models::ShardParameters; use pageserver_api::models::TenantDetails; use pageserver_api::models::TenantLocationConfigResponse; use pageserver_api::models::TenantShardLocation; +use pageserver_api::models::TenantShardSplitRequest; +use pageserver_api::models::TenantShardSplitResponse; use pageserver_api::models::TenantState; use pageserver_api::models::{ DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest, TenantLoadRequest, TenantLocationConfigRequest, }; +use pageserver_api::shard::ShardCount; use pageserver_api::shard::TenantShardId; use remote_storage::GenericRemoteStorage; use remote_storage::TimeTravelError; @@ -875,7 +878,7 @@ async fn tenant_reset_handler( let state = get_state(&request); state .tenant_manager - .reset_tenant(tenant_shard_id, drop_cache.unwrap_or(false), ctx) + .reset_tenant(tenant_shard_id, drop_cache.unwrap_or(false), &ctx) .await .map_err(ApiError::InternalServerError)?; @@ -1104,6 +1107,25 @@ async fn tenant_size_handler( ) } +async fn tenant_shard_split_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let req: TenantShardSplitRequest = json_request(&mut request).await?; + + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let state = get_state(&request); + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); + + let new_shards = state + .tenant_manager + .shard_split(tenant_shard_id, ShardCount(req.new_shard_count), &ctx) + .await + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, TenantShardSplitResponse { new_shards }) +} + async fn layer_map_info_handler( request: Request, _cancel: CancellationToken, @@ -2063,6 +2085,9 @@ pub fn make_router( .put("/v1/tenant/config", |r| { api_handler(r, update_tenant_config_handler) }) + .put("/v1/tenant/:tenant_shard_id/shard_split", |r| { + api_handler(r, tenant_shard_split_handler) + }) .get("/v1/tenant/:tenant_shard_id/config", |r| { api_handler(r, get_tenant_config_handler) }) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index f704f8c0dd..f086f46213 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -53,6 +53,7 @@ use self::metadata::TimelineMetadata; use self::mgr::GetActiveTenantError; use self::mgr::GetTenantError; use self::mgr::TenantsMap; +use self::remote_timeline_client::upload::upload_index_part; use self::remote_timeline_client::RemoteTimelineClient; use self::timeline::uninit::TimelineExclusionError; use self::timeline::uninit::TimelineUninitMark; @@ -2397,6 +2398,67 @@ impl Tenant { pub(crate) fn get_generation(&self) -> Generation { self.generation } + + /// This function partially shuts down the tenant (it shuts down the Timelines) and is fallible, + /// and can leave the tenant in a bad state if it fails. The caller is responsible for + /// resetting this tenant to a valid state if we fail. 
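Putting the pieces together (the `TenantShardSplitRequest`/`TenantShardSplitResponse` models, the `mgmt_api` client method and the new `/v1/tenant/:tenant_shard_id/shard_split` route), a caller-side sketch might look like the following. The endpoint URL, the missing JWT, the assumed `tokio` and `utils::id::TenantId` imports, and the freshly generated tenant ID are all placeholders to keep the sketch self-contained; a real caller would use the ID of an existing tenant:

```rust
use pageserver_api::{models::TenantShardSplitRequest, shard::TenantShardId};
use pageserver_client::mgmt_api::Client;
use utils::id::TenantId;

#[tokio::main]
async fn main() {
    // Placeholder endpoint and no auth token; adjust for a real pageserver.
    let client = Client::new("http://127.0.0.1:9898".to_string(), None);

    // A real caller would look up an existing tenant instead of generating one.
    let tenant_id = TenantId::generate();
    let parent = TenantShardId::unsharded(tenant_id);

    // Ask the pageserver to split the (unsharded) parent into two shards.
    match client
        .tenant_shard_split(parent, TenantShardSplitRequest { new_shard_count: 2 })
        .await
    {
        Ok(response) => println!("new shards: {:?}", response.new_shards),
        Err(e) => eprintln!("split request failed: {e}"),
    }
}
```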
+ pub(crate) async fn split_prepare( + &self, + child_shards: &Vec, + ) -> anyhow::Result<()> { + let timelines = self.timelines.lock().unwrap().clone(); + for timeline in timelines.values() { + let Some(tl_client) = &timeline.remote_client else { + anyhow::bail!("Remote storage is mandatory"); + }; + + let Some(remote_storage) = &self.remote_storage else { + anyhow::bail!("Remote storage is mandatory"); + }; + + // We do not block timeline creation/deletion during splits inside the pageserver: it is up to higher levels + // to ensure that they do not start a split if currently in the process of doing these. + + // Upload an index from the parent: this is partly to provide freshness for the + // child tenants that will copy it, and partly for general ease-of-debugging: there will + // always be a parent shard index in the same generation as we wrote the child shard index. + tl_client.schedule_index_upload_for_file_changes()?; + tl_client.wait_completion().await?; + + // Shut down the timeline's remote client: this means that the indices we write + // for child shards will not be invalidated by the parent shard deleting layers. + tl_client.shutdown().await?; + + // Download methods can still be used after shutdown, as they don't flow through the remote client's + // queue. In principal the RemoteTimelineClient could provide this without downloading it, but this + // operation is rare, so it's simpler to just download it (and robustly guarantees that the index + // we use here really is the remotely persistent one). + let result = tl_client + .download_index_file(self.cancel.clone()) + .instrument(info_span!("download_index_file", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id)) + .await?; + let index_part = match result { + MaybeDeletedIndexPart::Deleted(_) => { + anyhow::bail!("Timeline deletion happened concurrently with split") + } + MaybeDeletedIndexPart::IndexPart(p) => p, + }; + + for child_shard in child_shards { + upload_index_part( + remote_storage, + child_shard, + &timeline.timeline_id, + self.generation, + &index_part, + &self.cancel, + ) + .await?; + } + } + + Ok(()) + } } /// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id), @@ -3732,6 +3794,10 @@ impl Tenant { Ok(()) } + + pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt { + self.tenant_conf.read().unwrap().tenant_conf + } } fn remove_timeline_and_uninit_mark( diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 5ec910ca3e..9aee39bd35 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -2,6 +2,7 @@ //! page server. use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf}; +use itertools::Itertools; use pageserver_api::key::Key; use pageserver_api::models::ShardParameters; use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, TenantShardId}; @@ -22,7 +23,7 @@ use tokio_util::sync::CancellationToken; use tracing::*; use remote_storage::GenericRemoteStorage; -use utils::crashsafe; +use utils::{completion, crashsafe}; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; @@ -644,8 +645,6 @@ pub(crate) async fn shutdown_all_tenants() { } async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { - use utils::completion; - let mut join_set = JoinSet::new(); // Atomically, 1. create the shutdown tasks and 2. prevent creation of new tenants. 
@@ -1200,7 +1199,7 @@ impl TenantManager { &self, tenant_shard_id: TenantShardId, drop_cache: bool, - ctx: RequestContext, + ctx: &RequestContext, ) -> anyhow::Result<()> { let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?; let Some(old_slot) = slot_guard.get_old_value() else { @@ -1253,7 +1252,7 @@ impl TenantManager { None, self.tenants, SpawnMode::Normal, - &ctx, + ctx, )?; slot_guard.upsert(TenantSlot::Attached(tenant))?; @@ -1375,6 +1374,164 @@ impl TenantManager { slot_guard.revert(); result } + + #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), new_shard_count=%new_shard_count.0))] + pub(crate) async fn shard_split( + &self, + tenant_shard_id: TenantShardId, + new_shard_count: ShardCount, + ctx: &RequestContext, + ) -> anyhow::Result> { + let tenant = get_tenant(tenant_shard_id, true)?; + + // Plan: identify what the new child shards will be + let effective_old_shard_count = std::cmp::max(tenant_shard_id.shard_count.0, 1); + if new_shard_count <= ShardCount(effective_old_shard_count) { + anyhow::bail!("Requested shard count is not an increase"); + } + let expansion_factor = new_shard_count.0 / effective_old_shard_count; + if !expansion_factor.is_power_of_two() { + anyhow::bail!("Requested split is not a power of two"); + } + + let parent_shard_identity = tenant.shard_identity; + let parent_tenant_conf = tenant.get_tenant_conf(); + let parent_generation = tenant.generation; + + let child_shards = tenant_shard_id.split(new_shard_count); + tracing::info!( + "Shard {} splits into: {}", + tenant_shard_id.to_index(), + child_shards + .iter() + .map(|id| format!("{}", id.to_index())) + .join(",") + ); + + // Phase 1: Write out child shards' remote index files, in the parent tenant's current generation + if let Err(e) = tenant.split_prepare(&child_shards).await { + // If [`Tenant::split_prepare`] fails, we must reload the tenant, because it might + // have been left in a partially-shut-down state. + tracing::warn!("Failed to prepare for split: {e}, reloading Tenant before returning"); + self.reset_tenant(tenant_shard_id, false, ctx).await?; + return Err(e); + } + + self.resources.deletion_queue_client.flush_advisory(); + + // Phase 2: Put the parent shard to InProgress and grab a reference to the parent Tenant + drop(tenant); + let mut parent_slot_guard = + tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?; + let parent = match parent_slot_guard.get_old_value() { + Some(TenantSlot::Attached(t)) => t, + Some(TenantSlot::Secondary(_)) => anyhow::bail!("Tenant location in secondary mode"), + Some(TenantSlot::InProgress(_)) => { + // tenant_map_acquire_slot never returns InProgress, if a slot was InProgress + // it would return an error. + unreachable!() + } + None => { + // We don't actually need the parent shard to still be attached to do our work, but it's + // a weird enough situation that the caller probably didn't want us to continue working + // if they had detached the tenant they requested the split on. + anyhow::bail!("Detached parent shard in the middle of split!") + } + }; + + // TODO: hardlink layers from the parent into the child shard directories so that they don't immediately re-download + // TODO: erase the dentries from the parent + + // Take a snapshot of where the parent's WAL ingest had got to: we will wait for + // child shards to reach this point. 
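The request validation earlier in `shard_split` is easy to exercise on its own. A stand-alone mirror of those checks, with plain integers instead of `ShardCount`, spells out which requested counts are accepted:

```rust
// Mirrors the checks at the top of shard_split; illustration only.
fn validate_split_request(old_shard_count: u8, new_shard_count: u8) -> Result<u8, &'static str> {
    // Unsharded tenants (count 0) behave like a single shard.
    let effective_old = old_shard_count.max(1);
    if new_shard_count <= effective_old {
        return Err("requested shard count is not an increase");
    }
    let expansion_factor = new_shard_count / effective_old;
    if !expansion_factor.is_power_of_two() {
        return Err("requested split is not a power of two");
    }
    Ok(expansion_factor)
}

fn main() {
    assert_eq!(validate_split_request(0, 4), Ok(4)); // unsharded -> 4 shards
    assert_eq!(validate_split_request(2, 8), Ok(4)); // 2 -> 8 shards
    assert!(validate_split_request(4, 4).is_err()); // not an increase
    assert!(validate_split_request(2, 6).is_err()); // 6 / 2 = 3: not a power of two
}
```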
+ let mut target_lsns = HashMap::new(); + for timeline in parent.timelines.lock().unwrap().clone().values() { + target_lsns.insert(timeline.timeline_id, timeline.get_last_record_lsn()); + } + + // TODO: we should have the parent shard stop its WAL ingest here, it's a waste of resources + // and could slow down the children trying to catch up. + + // Phase 3: Spawn the child shards + for child_shard in &child_shards { + let mut child_shard_identity = parent_shard_identity; + child_shard_identity.count = child_shard.shard_count; + child_shard_identity.number = child_shard.shard_number; + + let child_location_conf = LocationConf { + mode: LocationMode::Attached(AttachedLocationConfig { + generation: parent_generation, + attach_mode: AttachmentMode::Single, + }), + shard: child_shard_identity, + tenant_conf: parent_tenant_conf, + }; + + self.upsert_location( + *child_shard, + child_location_conf, + None, + SpawnMode::Normal, + ctx, + ) + .await?; + } + + // Phase 4: wait for child chards WAL ingest to catch up to target LSN + for child_shard_id in &child_shards { + let child_shard = { + let locked = TENANTS.read().unwrap(); + let peek_slot = + tenant_map_peek_slot(&locked, child_shard_id, TenantSlotPeekMode::Read)?; + peek_slot.and_then(|s| s.get_attached()).cloned() + }; + if let Some(t) = child_shard { + let timelines = t.timelines.lock().unwrap().clone(); + for timeline in timelines.values() { + let Some(target_lsn) = target_lsns.get(&timeline.timeline_id) else { + continue; + }; + + tracing::info!( + "Waiting for child shard {}/{} to reach target lsn {}...", + child_shard_id, + timeline.timeline_id, + target_lsn + ); + if let Err(e) = timeline.wait_lsn(*target_lsn, ctx).await { + // Failure here might mean shutdown, in any case this part is an optimization + // and we shouldn't hold up the split operation. + tracing::warn!( + "Failed to wait for timeline {} to reach lsn {target_lsn}: {e}", + timeline.timeline_id + ); + } else { + tracing::info!( + "Child shard {}/{} reached target lsn {}", + child_shard_id, + timeline.timeline_id, + target_lsn + ); + } + } + } + } + + // Phase 5: Shut down the parent shard. + let (_guard, progress) = completion::channel(); + match parent.shutdown(progress, false).await { + Ok(()) => {} + Err(other) => { + other.wait().await; + } + } + parent_slot_guard.drop_old_value()?; + + // Phase 6: Release the InProgress on the parent shard + drop(parent_slot_guard); + + Ok(child_shards) + } } #[derive(Debug, thiserror::Error)] @@ -2209,8 +2366,6 @@ async fn remove_tenant_from_memory( where F: std::future::Future>, { - use utils::completion; - let mut slot_guard = tenant_map_acquire_slot_impl(&tenant_shard_id, tenants, TenantSlotAcquireMode::MustExist)?; diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index e8ba1d3d6e..c17e27b446 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -27,7 +27,7 @@ use super::index::LayerFileMetadata; use tracing::info; /// Serializes and uploads the given index part data to the remote storage. 
-pub(super) async fn upload_index_part<'a>( +pub(crate) async fn upload_index_part<'a>( storage: &'a GenericRemoteStorage, tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 4491655aeb..3d2549a8c3 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4054,7 +4054,7 @@ def logical_replication_sync(subscriber: VanillaPostgres, publisher: Endpoint) - def tenant_get_shards( - env: NeonEnv, tenant_id: TenantId, pageserver_id: Optional[int] + env: NeonEnv, tenant_id: TenantId, pageserver_id: Optional[int] = None ) -> list[tuple[TenantShardId, NeonPageserver]]: """ Helper for when you want to talk to one or more pageservers, and the diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index c16bfc2ec6..805eaa34b0 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -1,6 +1,7 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, + tenant_get_shards, ) from fixtures.remote_storage import s3_storage from fixtures.types import TimelineId @@ -82,4 +83,130 @@ def test_sharding_smoke( ) assert timelines == {env.initial_timeline, timeline_b} - # TODO: test timeline deletion and tenant deletion (depends on change in attachment_service) + +def test_sharding_split_smoke( + neon_env_builder: NeonEnvBuilder, +): + """ + Test the basics of shard splitting: + - The API results in more shards than we started with + - The tenant's data remains readable + + """ + + # We will start with 4 shards and split into 8, then migrate all those + # 8 shards onto separate pageservers + shard_count = 4 + split_shard_count = 8 + neon_env_builder.num_pageservers = split_shard_count + + # 1MiB stripes: enable getting some meaningful data distribution without + # writing large quantities of data in this test. The stripe size is given + # in number of 8KiB pages. + stripe_size = 128 + + # Use S3-compatible remote storage so that we can scrub: this test validates + # that the scrubber doesn't barf when it sees a sharded tenant. + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.enable_scrub_on_exit() + + neon_env_builder.preserve_database_files = True + + env = neon_env_builder.init_start( + initial_tenant_shard_count=shard_count, initial_tenant_shard_stripe_size=stripe_size + ) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + workload = Workload(env, tenant_id, timeline_id, branch_name="main") + workload.init() + + # Initial data + workload.write_rows(256) + + # Note which pageservers initially hold a shard after tenant creation + pre_split_pageserver_ids = [loc["node_id"] for loc in env.attachment_service.locate(tenant_id)] + + # For pageservers holding a shard, validate their ingest statistics + # reflect a proper splitting of the WAL. 
+ for pageserver in env.pageservers: + if pageserver.id not in pre_split_pageserver_ids: + continue + + metrics = pageserver.http_client().get_metrics_values( + [ + "pageserver_wal_ingest_records_received_total", + "pageserver_wal_ingest_records_committed_total", + "pageserver_wal_ingest_records_filtered_total", + ] + ) + + log.info(f"Pageserver {pageserver.id} metrics: {metrics}") + + # Not everything received was committed + assert ( + metrics["pageserver_wal_ingest_records_received_total"] + > metrics["pageserver_wal_ingest_records_committed_total"] + ) + + # Something was committed + assert metrics["pageserver_wal_ingest_records_committed_total"] > 0 + + # Counts are self consistent + assert ( + metrics["pageserver_wal_ingest_records_received_total"] + == metrics["pageserver_wal_ingest_records_committed_total"] + + metrics["pageserver_wal_ingest_records_filtered_total"] + ) + + # TODO: validate that shards have different sizes + + workload.validate() + + assert len(pre_split_pageserver_ids) == 4 + + env.attachment_service.tenant_shard_split(tenant_id, shard_count=split_shard_count) + + post_split_pageserver_ids = [loc["node_id"] for loc in env.attachment_service.locate(tenant_id)] + # We should have split into 8 shards, on the same 4 pageservers we started on. + assert len(post_split_pageserver_ids) == split_shard_count + assert len(set(post_split_pageserver_ids)) == shard_count + assert set(post_split_pageserver_ids) == set(pre_split_pageserver_ids) + + workload.validate() + + workload.churn_rows(256) + + workload.validate() + + # Run GC on all new shards, to check they don't barf or delete anything that breaks reads + # (compaction was already run as part of churn_rows) + all_shards = tenant_get_shards(env, tenant_id) + for tenant_shard_id, pageserver in all_shards: + pageserver.http_client().timeline_gc(tenant_shard_id, timeline_id, None) + + # Restart all nodes, to check that the newly created shards are durable + for ps in env.pageservers: + ps.restart() + + workload.validate() + + migrate_to_pageserver_ids = list( + set(p.id for p in env.pageservers) - set(pre_split_pageserver_ids) + ) + assert len(migrate_to_pageserver_ids) == split_shard_count - shard_count + + # Migrate shards away from the node where the split happened + for ps_id in pre_split_pageserver_ids: + shards_here = [ + tenant_shard_id + for (tenant_shard_id, pageserver) in all_shards + if pageserver.id == ps_id + ] + assert len(shards_here) == 2 + migrate_shard = shards_here[0] + destination = migrate_to_pageserver_ids.pop() + + log.info(f"Migrating shard {migrate_shard} from {ps_id} to {destination}") + env.neon_cli.tenant_migrate(migrate_shard, destination, timeout_secs=10) + + workload.validate() From e8d2843df63ba05cd74baa8017736a903f9a322a Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 8 Feb 2024 18:00:53 +0000 Subject: [PATCH 0125/1571] storage controller: improved handling of node availability on restart (#6658) - Automatically set a node's availability to Active if it is responsive in startup_reconcile - Impose a 5s timeout of HTTP request to list location conf, so that an unresponsive node can't hang it for minutes - Do several retries if the request fails with a retryable error, to be tolerant of concurrent pageserver & storage controller restarts - Add a readiness hook for use with k8s so that we can tell when the startup reconciliaton is done and the service is fully ready to do work. 
- Add /metrics to the list of un-authenticated endpoints (this is unrelated but we're touching the line in this PR already, and it fixes auth error spam in deployed container.) - A test for the above. Closes: #6670 --- control_plane/attachment_service/src/http.rs | 14 ++- .../attachment_service/src/service.rs | 107 +++++++++++++----- libs/utils/src/completion.rs | 5 + pageserver/client/src/mgmt_api.rs | 10 +- test_runner/fixtures/neon_fixtures.py | 9 ++ test_runner/regress/test_sharding_service.py | 32 ++++++ 6 files changed, 149 insertions(+), 28 deletions(-) diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs index 38eecaf7ef..8501e4980f 100644 --- a/control_plane/attachment_service/src/http.rs +++ b/control_plane/attachment_service/src/http.rs @@ -42,7 +42,7 @@ pub struct HttpState { impl HttpState { pub fn new(service: Arc, auth: Option>) -> Self { - let allowlist_routes = ["/status"] + let allowlist_routes = ["/status", "/ready", "/metrics"] .iter() .map(|v| v.parse().unwrap()) .collect::>(); @@ -325,6 +325,17 @@ async fn handle_status(_req: Request) -> Result, ApiError> json_response(StatusCode::OK, ()) } +/// Readiness endpoint indicates when we're done doing startup I/O (e.g. reconciling +/// with remote pageserver nodes). This is intended for use as a kubernetes readiness probe. +async fn handle_ready(req: Request) -> Result, ApiError> { + let state = get_state(&req); + if state.service.startup_complete.is_ready() { + json_response(StatusCode::OK, ()) + } else { + json_response(StatusCode::SERVICE_UNAVAILABLE, ()) + } +} + impl From for ApiError { fn from(value: ReconcileError) -> Self { ApiError::Conflict(format!("Reconciliation error: {}", value)) @@ -380,6 +391,7 @@ pub fn make_router( .data(Arc::new(HttpState::new(service, auth))) // Non-prefixed generic endpoints (status, metrics) .get("/status", |r| request_span(r, handle_status)) + .get("/ready", |r| request_span(r, handle_ready)) // Upcalls for the pageserver: point the pageserver's `control_plane_api` config to this prefix .post("/upcall/v1/re-attach", |r| { request_span(r, handle_re_attach) diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index 0ec2b9dc4c..0331087e0d 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -1,6 +1,6 @@ use std::{ cmp::Ordering, - collections::{BTreeMap, HashMap}, + collections::{BTreeMap, HashMap, HashSet}, str::FromStr, sync::Arc, time::{Duration, Instant}, @@ -31,6 +31,7 @@ use pageserver_api::{ use pageserver_client::mgmt_api; use tokio_util::sync::CancellationToken; use utils::{ + backoff, completion::Barrier, generation::Generation, http::error::ApiError, @@ -150,31 +151,71 @@ impl Service { // indeterminate, same as in [`ObservedStateLocation`]) let mut observed = HashMap::new(); - let nodes = { - let locked = self.inner.read().unwrap(); - locked.nodes.clone() - }; + let mut nodes_online = HashSet::new(); + + // TODO: give Service a cancellation token for clean shutdown + let cancel = CancellationToken::new(); // TODO: issue these requests concurrently - for node in nodes.values() { - let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref()); + { + let nodes = { + let locked = self.inner.read().unwrap(); + locked.nodes.clone() + }; + for node in nodes.values() { + let http_client = reqwest::ClientBuilder::new() + .timeout(Duration::from_secs(5)) + .build() + .expect("Failed to 
construct HTTP client"); + let client = mgmt_api::Client::from_client( + http_client, + node.base_url(), + self.config.jwt_token.as_deref(), + ); - tracing::info!("Scanning shards on node {}...", node.id); - match client.list_location_config().await { - Err(e) => { - tracing::warn!("Could not contact pageserver {} ({e})", node.id); - // TODO: be more tolerant, apply a generous 5-10 second timeout with retries, in case - // pageserver is being restarted at the same time as we are + fn is_fatal(e: &mgmt_api::Error) -> bool { + use mgmt_api::Error::*; + match e { + ReceiveBody(_) | ReceiveErrorBody(_) => false, + ApiError(StatusCode::SERVICE_UNAVAILABLE, _) + | ApiError(StatusCode::GATEWAY_TIMEOUT, _) + | ApiError(StatusCode::REQUEST_TIMEOUT, _) => false, + ApiError(_, _) => true, + } } - Ok(listing) => { - tracing::info!( - "Received {} shard statuses from pageserver {}, setting it to Active", - listing.tenant_shards.len(), - node.id - ); - for (tenant_shard_id, conf_opt) in listing.tenant_shards { - observed.insert(tenant_shard_id, (node.id, conf_opt)); + let list_response = backoff::retry( + || client.list_location_config(), + is_fatal, + 1, + 5, + "Location config listing", + &cancel, + ) + .await; + let Some(list_response) = list_response else { + tracing::info!("Shutdown during startup_reconcile"); + return; + }; + + tracing::info!("Scanning shards on node {}...", node.id); + match list_response { + Err(e) => { + tracing::warn!("Could not contact pageserver {} ({e})", node.id); + // TODO: be more tolerant, do some retries, in case + // pageserver is being restarted at the same time as we are + } + Ok(listing) => { + tracing::info!( + "Received {} shard statuses from pageserver {}, setting it to Active", + listing.tenant_shards.len(), + node.id + ); + nodes_online.insert(node.id); + + for (tenant_shard_id, conf_opt) in listing.tenant_shards { + observed.insert(tenant_shard_id, (node.id, conf_opt)); + } } } } @@ -185,8 +226,19 @@ impl Service { let mut compute_notifications = Vec::new(); // Populate intent and observed states for all tenants, based on reported state on pageservers - let shard_count = { + let (shard_count, nodes) = { let mut locked = self.inner.write().unwrap(); + + // Mark nodes online if they responded to us: nodes are offline by default after a restart. 
+ let mut nodes = (*locked.nodes).clone(); + for (node_id, node) in nodes.iter_mut() { + if nodes_online.contains(node_id) { + node.availability = NodeAvailability::Active; + } + } + locked.nodes = Arc::new(nodes); + let nodes = locked.nodes.clone(); + for (tenant_shard_id, (node_id, observed_loc)) in observed { let Some(tenant_state) = locked.tenants.get_mut(&tenant_shard_id) else { cleanup.push((tenant_shard_id, node_id)); @@ -218,7 +270,7 @@ impl Service { } } - locked.tenants.len() + (locked.tenants.len(), nodes) }; // TODO: if any tenant's intent now differs from its loaded generation_pageserver, we should clear that @@ -279,9 +331,8 @@ impl Service { let stream = futures::stream::iter(compute_notifications.into_iter()) .map(|(tenant_shard_id, node_id)| { let compute_hook = compute_hook.clone(); + let cancel = cancel.clone(); async move { - // TODO: give Service a cancellation token for clean shutdown - let cancel = CancellationToken::new(); if let Err(e) = compute_hook.notify(tenant_shard_id, node_id, &cancel).await { tracing::error!( tenant_shard_id=%tenant_shard_id, @@ -387,7 +438,7 @@ impl Service { ))), config, persistence, - startup_complete, + startup_complete: startup_complete.clone(), }); let result_task_this = this.clone(); @@ -984,6 +1035,10 @@ impl Service { } }; + // TODO: if we timeout/fail on reconcile, we should still succeed this request, + // because otherwise a broken compute hook causes a feedback loop where + // location_config returns 500 and gets retried forever. + if let Some(create_req) = maybe_create { let create_resp = self.tenant_create(create_req).await?; result.shards = create_resp diff --git a/libs/utils/src/completion.rs b/libs/utils/src/completion.rs index ca6827c9b8..ea05cf54b1 100644 --- a/libs/utils/src/completion.rs +++ b/libs/utils/src/completion.rs @@ -27,6 +27,11 @@ impl Barrier { b.wait().await } } + + /// Return true if a call to wait() would complete immediately + pub fn is_ready(&self) -> bool { + futures::future::FutureExt::now_or_never(self.0.wait()).is_some() + } } impl PartialEq for Barrier { diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 200369df90..baea747d3c 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -56,10 +56,18 @@ pub enum ForceAwaitLogicalSize { impl Client { pub fn new(mgmt_api_endpoint: String, jwt: Option<&str>) -> Self { + Self::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt) + } + + pub fn from_client( + client: reqwest::Client, + mgmt_api_endpoint: String, + jwt: Option<&str>, + ) -> Self { Self { mgmt_api_endpoint, authorization_header: jwt.map(|jwt| format!("Bearer {jwt}")), - client: reqwest::Client::new(), + client, } } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 3d2549a8c3..0af8098cad 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1949,6 +1949,15 @@ class NeonAttachmentService: return headers + def ready(self) -> bool: + resp = self.request("GET", f"{self.env.attachment_service_api}/ready") + if resp.status_code == 503: + return False + elif resp.status_code == 200: + return True + else: + raise RuntimeError(f"Unexpected status {resp.status_code} from readiness endpoint") + def attach_hook_issue( self, tenant_shard_id: Union[TenantId, TenantShardId], pageserver_id: int ) -> int: diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py index fd811a9d02..babb0d261c 100644 
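`Barrier::is_ready` above leans on `now_or_never` from the `futures` crate, which polls a future exactly once and never blocks; that is what lets the `/ready` handler answer immediately whether startup reconciliation has finished. A minimal demonstration of that semantics, using plain `ready`/`pending` futures rather than the barrier itself:

```rust
use futures::FutureExt;

fn main() {
    // An already-completed future resolves on the single poll...
    assert_eq!(futures::future::ready(42).now_or_never(), Some(42));

    // ...while a still-pending one yields None without blocking the caller.
    assert_eq!(futures::future::pending::<u32>().now_or_never(), None);

    println!("now_or_never polls once and returns immediately");
}
```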
--- a/test_runner/regress/test_sharding_service.py +++ b/test_runner/regress/test_sharding_service.py @@ -128,6 +128,38 @@ def test_sharding_service_smoke( assert counts[env.pageservers[2].id] == tenant_shard_count // 2 +def test_node_status_after_restart( + neon_env_builder: NeonEnvBuilder, +): + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_start() + + # Initially we have two online pageservers + nodes = env.attachment_service.node_list() + assert len(nodes) == 2 + + env.pageservers[1].stop() + + env.attachment_service.stop() + env.attachment_service.start() + + # Initially readiness check should fail because we're trying to connect to the offline node + assert env.attachment_service.ready() is False + + def is_ready(): + assert env.attachment_service.ready() is True + + wait_until(30, 1, is_ready) + + # We loaded nodes from database on restart + nodes = env.attachment_service.node_list() + assert len(nodes) == 2 + + # We should still be able to create a tenant, because the pageserver which is still online + # should have had its availabilty state set to Active. + env.attachment_service.tenant_create(TenantId.generate()) + + def test_sharding_service_passthrough( neon_env_builder: NeonEnvBuilder, ): From c0e0fc8151f2c00d45ebb8e39ef3c271c65a38f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 8 Feb 2024 19:57:02 +0100 Subject: [PATCH 0126/1571] Update Rust to 1.76.0 (#6683) [Release notes](https://github.com/rust-lang/rust/releases/tag/1.75.0). --- Dockerfile.buildtools | 2 +- compute_tools/src/pg_helpers.rs | 5 +++-- control_plane/src/background_process.rs | 1 - rust-toolchain.toml | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Dockerfile.buildtools b/Dockerfile.buildtools index 220e995d64..3a452fec32 100644 --- a/Dockerfile.buildtools +++ b/Dockerfile.buildtools @@ -135,7 +135,7 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.75.0 +ENV RUSTC_VERSION=1.76.0 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \ diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index ce704385c6..5deb50d6b7 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -264,9 +264,10 @@ pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> { // case we miss some events for some reason. Not strictly necessary, but // better safe than sorry. 
let (tx, rx) = std::sync::mpsc::channel(); - let (mut watcher, rx): (Box, _) = match notify::recommended_watcher(move |res| { + let watcher_res = notify::recommended_watcher(move |res| { let _ = tx.send(res); - }) { + }); + let (mut watcher, rx): (Box, _) = match watcher_res { Ok(watcher) => (Box::new(watcher), rx), Err(e) => { match e.kind { diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index 364cc01c39..0e59b28230 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -72,7 +72,6 @@ where let log_path = datadir.join(format!("{process_name}.log")); let process_log_file = fs::OpenOptions::new() .create(true) - .write(true) .append(true) .open(&log_path) .with_context(|| { diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 9b5a965f7d..b0949c32b1 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.75.0" +channel = "1.76.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. # https://rust-lang.github.io/rustup/concepts/profiles.html From 9a31311990d19eb607e087e0e12d4369bfab8b6c Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 8 Feb 2024 22:40:14 +0200 Subject: [PATCH 0127/1571] fix(heavier_once_cell): assertion failure can be hit (#6652) @problame noticed that the `tokio::sync::AcquireError` branch assertion can be hit like in the first commit. We haven't seen this yet in production, but I'd prefer not to see it there. There `take_and_deinit` is being used, but this race must be quite timing sensitive. --- libs/utils/src/sync/heavier_once_cell.rs | 241 +++++++++++++++++------ 1 file changed, 176 insertions(+), 65 deletions(-) diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs index f733d107f1..81625b907e 100644 --- a/libs/utils/src/sync/heavier_once_cell.rs +++ b/libs/utils/src/sync/heavier_once_cell.rs @@ -69,37 +69,44 @@ impl OnceCell { F: FnOnce(InitPermit) -> Fut, Fut: std::future::Future>, { - let sem = { + loop { + let sem = { + let guard = self.inner.write().await; + if guard.value.is_some() { + return Ok(GuardMut(guard)); + } + guard.init_semaphore.clone() + }; + + { + let permit = { + // increment the count for the duration of queued + let _guard = CountWaitingInitializers::start(self); + sem.acquire().await + }; + + let Ok(permit) = permit else { + let guard = self.inner.write().await; + if !Arc::ptr_eq(&sem, &guard.init_semaphore) { + // there was a take_and_deinit in between + continue; + } + assert!( + guard.value.is_some(), + "semaphore got closed, must be initialized" + ); + return Ok(GuardMut(guard)); + }; + + permit.forget(); + } + + let permit = InitPermit(sem); + let (value, _permit) = factory(permit).await?; + let guard = self.inner.write().await; - if guard.value.is_some() { - return Ok(GuardMut(guard)); - } - guard.init_semaphore.clone() - }; - let permit = { - // increment the count for the duration of queued - let _guard = CountWaitingInitializers::start(self); - sem.acquire_owned().await - }; - - match permit { - Ok(permit) => { - let permit = InitPermit(permit); - let (value, _permit) = factory(permit).await?; - - let guard = self.inner.write().await; - - Ok(Self::set0(value, guard)) - } - Err(_closed) => { - let guard = self.inner.write().await; - assert!( - guard.value.is_some(), - "semaphore got closed, must be initialized" - ); - return Ok(GuardMut(guard)); - } + return Ok(Self::set0(value, guard)); } } 
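The heart of the fix above is that `take_and_deinit` installs a brand-new `init_semaphore`, so a waiter whose `acquire()` fails can distinguish "closed because the value is initialized" from "the cell was deinitialized underneath me" by comparing `Arc` identity and going around the loop again. A tiny sketch of that pointer-identity check in isolation (it assumes `tokio` as a dependency but needs no runtime, since nothing is awaited):

```rust
use std::sync::Arc;
use tokio::sync::Semaphore;

fn main() {
    // The semaphore a queued initializer grabbed a clone of...
    let original = Arc::new(Semaphore::new(1));
    let waiters_copy = Arc::clone(&original);

    // ...and the fresh one installed by take_and_deinit().
    let replacement = Arc::new(Semaphore::new(1));

    assert!(Arc::ptr_eq(&waiters_copy, &original));
    // Different allocation: the waiter must retry instead of assuming
    // "semaphore closed => value is initialized".
    assert!(!Arc::ptr_eq(&waiters_copy, &replacement));

    println!("pointer identity distinguishes the old and new init_semaphore");
}
```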
@@ -112,37 +119,44 @@ impl OnceCell { F: FnOnce(InitPermit) -> Fut, Fut: std::future::Future>, { - let sem = { - let guard = self.inner.read().await; - if guard.value.is_some() { - return Ok(GuardRef(guard)); - } - guard.init_semaphore.clone() - }; - - let permit = { - // increment the count for the duration of queued - let _guard = CountWaitingInitializers::start(self); - sem.acquire_owned().await - }; - - match permit { - Ok(permit) => { - let permit = InitPermit(permit); - let (value, _permit) = factory(permit).await?; - - let guard = self.inner.write().await; - - Ok(Self::set0(value, guard).downgrade()) - } - Err(_closed) => { + loop { + let sem = { let guard = self.inner.read().await; - assert!( - guard.value.is_some(), - "semaphore got closed, must be initialized" - ); - return Ok(GuardRef(guard)); + if guard.value.is_some() { + return Ok(GuardRef(guard)); + } + guard.init_semaphore.clone() + }; + + { + let permit = { + // increment the count for the duration of queued + let _guard = CountWaitingInitializers::start(self); + sem.acquire().await + }; + + let Ok(permit) = permit else { + let guard = self.inner.read().await; + if !Arc::ptr_eq(&sem, &guard.init_semaphore) { + // there was a take_and_deinit in between + continue; + } + assert!( + guard.value.is_some(), + "semaphore got closed, must be initialized" + ); + return Ok(GuardRef(guard)); + }; + + permit.forget(); } + + let permit = InitPermit(sem); + let (value, _permit) = factory(permit).await?; + + let guard = self.inner.write().await; + + return Ok(Self::set0(value, guard).downgrade()); } } @@ -250,15 +264,12 @@ impl<'a, T> GuardMut<'a, T> { /// [`OnceCell::get_or_init`] will wait on it to complete. pub fn take_and_deinit(&mut self) -> (T, InitPermit) { let mut swapped = Inner::default(); - let permit = swapped - .init_semaphore - .clone() - .try_acquire_owned() - .expect("we just created this"); + let sem = swapped.init_semaphore.clone(); + sem.try_acquire().expect("we just created this").forget(); std::mem::swap(&mut *self.0, &mut swapped); swapped .value - .map(|v| (v, InitPermit(permit))) + .map(|v| (v, InitPermit(sem))) .expect("guard is not created unless value has been initialized") } @@ -282,13 +293,23 @@ impl std::ops::Deref for GuardRef<'_, T> { } /// Type held by OnceCell (de)initializing task. -pub struct InitPermit(tokio::sync::OwnedSemaphorePermit); +pub struct InitPermit(Arc); + +impl Drop for InitPermit { + fn drop(&mut self) { + debug_assert_eq!(self.0.available_permits(), 0); + self.0.add_permits(1); + } +} #[cfg(test)] mod tests { + use futures::Future; + use super::*; use std::{ convert::Infallible, + pin::{pin, Pin}, sync::atomic::{AtomicUsize, Ordering}, time::Duration, }; @@ -455,4 +476,94 @@ mod tests { .unwrap(); assert_eq!(*g, "now initialized"); } + + #[tokio::test(start_paused = true)] + async fn reproduce_init_take_deinit_race() { + init_take_deinit_scenario(|cell, factory| { + Box::pin(async { + cell.get_or_init(factory).await.unwrap(); + }) + }) + .await; + } + + #[tokio::test(start_paused = true)] + async fn reproduce_init_take_deinit_race_mut() { + init_take_deinit_scenario(|cell, factory| { + Box::pin(async { + cell.get_mut_or_init(factory).await.unwrap(); + }) + }) + .await; + } + + type BoxedInitFuture = Pin>>>; + type BoxedInitFunction = Box BoxedInitFuture>; + + /// Reproduce an assertion failure with both initialization methods. + /// + /// This has interesting generics to be generic between `get_or_init` and `get_mut_or_init`. + /// Alternative would be a macro_rules! 
but that is the last resort. + async fn init_take_deinit_scenario(init_way: F) + where + F: for<'a> Fn( + &'a OnceCell<&'static str>, + BoxedInitFunction<&'static str, Infallible>, + ) -> Pin + 'a>>, + { + let cell = OnceCell::default(); + + // acquire the init_semaphore only permit to drive initializing tasks in order to waiting + // on the same semaphore. + let permit = cell + .inner + .read() + .await + .init_semaphore + .clone() + .try_acquire_owned() + .unwrap(); + + let mut t1 = pin!(init_way( + &cell, + Box::new(|permit| Box::pin(async move { Ok(("t1", permit)) })), + )); + + let mut t2 = pin!(init_way( + &cell, + Box::new(|permit| Box::pin(async move { Ok(("t2", permit)) })), + )); + + // drive t2 first to the init_semaphore + tokio::select! { + _ = &mut t2 => unreachable!("it cannot get permit"), + _ = tokio::time::sleep(Duration::from_secs(3600 * 24 * 7 * 365)) => {} + } + + // followed by t1 in the init_semaphore + tokio::select! { + _ = &mut t1 => unreachable!("it cannot get permit"), + _ = tokio::time::sleep(Duration::from_secs(3600 * 24 * 7 * 365)) => {} + } + + // now let t2 proceed and initialize + drop(permit); + t2.await; + + let (s, permit) = { cell.get_mut().await.unwrap().take_and_deinit() }; + assert_eq!("t2", s); + + // now originally t1 would see the semaphore it has as closed. it cannot yet get a permit from + // the new one. + tokio::select! { + _ = &mut t1 => unreachable!("it cannot get permit"), + _ = tokio::time::sleep(Duration::from_secs(3600 * 24 * 7 * 365)) => {} + } + + // only now we get to initialize it + drop(permit); + t1.await; + + assert_eq!("t1", *cell.get().await.unwrap()); + } } From c09993396ea026758bfda83c477361d656a5b647 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 9 Feb 2024 00:37:57 +0200 Subject: [PATCH 0128/1571] fix: secondary tenant relative order eviction (#6491) Calculate the `relative_last_activity` using the total evicted and resident layers similar to what we originally planned. Cc: #5331 --- pageserver/src/disk_usage_eviction_task.rs | 73 +++++++++++++------ pageserver/src/tenant/secondary.rs | 2 +- pageserver/src/tenant/secondary/downloader.rs | 27 ++++--- 3 files changed, 67 insertions(+), 35 deletions(-) diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 1f0525b045..d5f5a20683 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -623,6 +623,7 @@ impl std::fmt::Display for EvictionLayer { } } +#[derive(Default)] pub(crate) struct DiskUsageEvictionInfo { /// Timeline's largest layer (remote or resident) pub max_layer_size: Option, @@ -854,19 +855,27 @@ async fn collect_eviction_candidates( let total = tenant_candidates.len(); - for (i, mut candidate) in tenant_candidates.into_iter().enumerate() { - // as we iterate this reverse sorted list, the most recently accessed layer will always - // be 1.0; this is for us to evict it last. - candidate.relative_last_activity = eviction_order.relative_last_activity(total, i); + let tenant_candidates = + tenant_candidates + .into_iter() + .enumerate() + .map(|(i, mut candidate)| { + // as we iterate this reverse sorted list, the most recently accessed layer will always + // be 1.0; this is for us to evict it last. 
+ candidate.relative_last_activity = + eviction_order.relative_last_activity(total, i); - let partition = if cumsum > min_resident_size as i128 { - MinResidentSizePartition::Above - } else { - MinResidentSizePartition::Below - }; - cumsum += i128::from(candidate.layer.get_file_size()); - candidates.push((partition, candidate)); - } + let partition = if cumsum > min_resident_size as i128 { + MinResidentSizePartition::Above + } else { + MinResidentSizePartition::Below + }; + cumsum += i128::from(candidate.layer.get_file_size()); + + (partition, candidate) + }); + + candidates.extend(tenant_candidates); } // Note: the same tenant ID might be hit twice, if it transitions from attached to @@ -882,21 +891,41 @@ async fn collect_eviction_candidates( ); for secondary_tenant in secondary_tenants { - let mut layer_info = secondary_tenant.get_layers_for_eviction(); + // for secondary tenants we use a sum of on_disk layers and already evicted layers. this is + // to prevent repeated disk usage based evictions from completely draining less often + // updating secondaries. + let (mut layer_info, total_layers) = secondary_tenant.get_layers_for_eviction(); + + debug_assert!( + total_layers >= layer_info.resident_layers.len(), + "total_layers ({total_layers}) must be at least the resident_layers.len() ({})", + layer_info.resident_layers.len() + ); layer_info .resident_layers .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts)); - candidates.extend(layer_info.resident_layers.into_iter().map(|candidate| { - ( - // Secondary locations' layers are always considered above the min resident size, - // i.e. secondary locations are permitted to be trimmed to zero layers if all - // the layers have sufficiently old access times. - MinResidentSizePartition::Above, - candidate, - ) - })); + let tenant_candidates = + layer_info + .resident_layers + .into_iter() + .enumerate() + .map(|(i, mut candidate)| { + candidate.relative_last_activity = + eviction_order.relative_last_activity(total_layers, i); + ( + // Secondary locations' layers are always considered above the min resident size, + // i.e. secondary locations are permitted to be trimmed to zero layers if all + // the layers have sufficiently old access times. + MinResidentSizePartition::Above, + candidate, + ) + }); + + candidates.extend(tenant_candidates); + + tokio::task::yield_now().await; } debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below, diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index 4269e1dec1..926cd0302b 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -160,7 +160,7 @@ impl SecondaryTenant { &self.tenant_shard_id } - pub(crate) fn get_layers_for_eviction(self: &Arc) -> DiskUsageEvictionInfo { + pub(crate) fn get_layers_for_eviction(self: &Arc) -> (DiskUsageEvictionInfo, usize) { self.detail.lock().unwrap().get_layers_for_eviction(self) } diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 55af4f9f2b..9330edf946 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -146,14 +146,15 @@ impl SecondaryDetail { } } + /// Additionally returns the total number of layers, used for more stable relative access time + /// based eviction. 
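The effect of counting already-evicted layers in `total` is easiest to see with a toy ranking. The real `EvictionOrder::relative_last_activity` is not shown in this diff, so the linear formula below is purely an assumed stand-in for illustration; index 0, the most recently accessed layer, maps to 1.0 as described above:

```rust
// Assumed, simplified ranking: NOT the real EvictionOrder implementation.
fn relative_last_activity(total: usize, index: usize) -> f32 {
    let total = total.max(1);
    1.0 - (index as f32 / total as f32)
}

fn main() {
    // A secondary location with 2 resident layers and 98 already-evicted ones.
    let resident = 2;
    let evicted = 98;

    // Ranking over resident layers only: the 2nd layer already looks stale...
    let resident_only = relative_last_activity(resident, 1);
    // ...while ranking over resident + evicted keeps it near the "recently
    // used" end, so disk-usage eviction stops repeatedly draining sparse
    // secondaries.
    let with_evicted = relative_last_activity(resident + evicted, 1);

    println!("resident-only: {resident_only:.2}, resident+evicted: {with_evicted:.2}");
    assert!(with_evicted > resident_only);
}
```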
pub(super) fn get_layers_for_eviction( &self, parent: &Arc, - ) -> DiskUsageEvictionInfo { - let mut result = DiskUsageEvictionInfo { - max_layer_size: None, - resident_layers: Vec::new(), - }; + ) -> (DiskUsageEvictionInfo, usize) { + let mut result = DiskUsageEvictionInfo::default(); + let mut total_layers = 0; + for (timeline_id, timeline_detail) in &self.timelines { result .resident_layers @@ -169,6 +170,10 @@ impl SecondaryDetail { relative_last_activity: finite_f32::FiniteF32::ZERO, } })); + + // total might be missing currently downloading layers, but as a lower than actual + // value it is good enough approximation. + total_layers += timeline_detail.on_disk_layers.len() + timeline_detail.evicted_at.len(); } result.max_layer_size = result .resident_layers @@ -183,7 +188,7 @@ impl SecondaryDetail { result.resident_layers.len() ); - result + (result, total_layers) } } @@ -312,9 +317,7 @@ impl JobGenerator Date: Fri, 9 Feb 2024 08:14:41 +0200 Subject: [PATCH 0129/1571] Increment generation which LFC is disabled by assigning 0 to neon.file_cache_size_limit (#6692) ## Problem test_lfc_resize sometimes filed with assertion failure when require lock in write operation: ``` if (lfc_ctl->generation == generation) { Assert(LFC_ENABLED()); ``` ## Summary of changes Increment generation when 0 is assigned to neon.file_cache_size_limit ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist Co-authored-by: Konstantin Knizhnik --- pgxn/neon/file_cache.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 21db666caa..448b9263f3 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -314,6 +314,9 @@ lfc_change_limit_hook(int newval, void *extra) lfc_ctl->used -= 1; } lfc_ctl->limit = new_size; + if (new_size == 0) { + lfc_ctl->generation += 1; + } neon_log(DEBUG1, "set local file cache limit to %d", new_size); LWLockRelease(lfc_lock); From a18aa14754fc44f7b38970bc546e4340386c32c9 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 9 Feb 2024 11:01:07 +0200 Subject: [PATCH 0130/1571] test: shutdown endpoints before deletion (#6619) this avoids a page_service error in the log sometimes. keeping the endpoint running while deleting has no function for this test. 
--- test_runner/regress/test_timeline_delete.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 352b82d525..5fda5aa569 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -651,9 +651,7 @@ def test_timeline_delete_works_for_remote_smoke( timeline_ids = [env.initial_timeline] for i in range(2): branch_timeline_id = env.neon_cli.create_branch(f"new{i}", "main") - pg = env.endpoints.create_start(f"new{i}") - - with pg.cursor() as cur: + with env.endpoints.create_start(f"new{i}") as pg, pg.cursor() as cur: cur.execute("CREATE TABLE f (i integer);") cur.execute("INSERT INTO f VALUES (generate_series(1,1000));") current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) From 568f91420a9c677e77aeb736cb3f995a85f0b106 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 9 Feb 2024 11:34:15 +0200 Subject: [PATCH 0131/1571] tests: try to make restored-datadir comparison tests not flaky (#6666) This test occasionally fails with a difference in "pg_xact/0000" file between the local and restored datadirs. My hypothesis is that something changed in the database between the last explicit checkpoint and the shutdown. I suspect autovacuum, it could certainly create transactions. To fix, be more precise about the point in time that we compare. Shut down the endpoint first, then read the last LSN (i.e. the shutdown checkpoint's LSN), from the local disk with pg_controldata. And use exactly that LSN in the basebackup. Closes #559. I'm proposing this as an alternative to https://github.com/neondatabase/neon/pull/6662. --- test_runner/fixtures/neon_fixtures.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 0af8098cad..a6aff77ddf 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3964,24 +3964,27 @@ def list_files_to_compare(pgdata_dir: Path) -> List[str]: # pg is the existing and running compute node, that we want to compare with a basebackup def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint: Endpoint): + pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version) + # Get the timeline ID. 
We need it for the 'basebackup' command timeline_id = TimelineId(endpoint.safe_psql("SHOW neon.timeline_id")[0][0]) - # many tests already checkpoint, but do it just in case - with closing(endpoint.connect()) as conn: - with conn.cursor() as cur: - cur.execute("CHECKPOINT") - - # wait for pageserver to catch up - wait_for_last_flush_lsn(env, endpoint, endpoint.tenant_id, timeline_id) # stop postgres to ensure that files won't change endpoint.stop() + # Read the shutdown checkpoint's LSN + pg_controldata_path = os.path.join(pg_bin.pg_bin_path, "pg_controldata") + cmd = f"{pg_controldata_path} -D {endpoint.pgdata_dir}" + result = subprocess.run(cmd, capture_output=True, text=True, shell=True) + checkpoint_lsn = re.findall( + "Latest checkpoint location:\\s+([0-9A-F]+/[0-9A-F]+)", result.stdout + )[0] + log.debug(f"last checkpoint at {checkpoint_lsn}") + # Take a basebackup from pageserver restored_dir_path = env.repo_dir / f"{endpoint.endpoint_id}_restored_datadir" restored_dir_path.mkdir(exist_ok=True) - pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version) psql_path = os.path.join(pg_bin.pg_bin_path, "psql") pageserver_id = env.attachment_service.locate(endpoint.tenant_id)[0]["node_id"] @@ -3989,7 +3992,7 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint {psql_path} \ --no-psqlrc \ postgres://localhost:{env.get_pageserver(pageserver_id).service_port.pg} \ - -c 'basebackup {endpoint.tenant_id} {timeline_id}' \ + -c 'basebackup {endpoint.tenant_id} {timeline_id} {checkpoint_lsn}' \ | tar -x -C {restored_dir_path} """ From 951c9bf4cad6a651f9531f3c4e1e58d90c27910e Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 9 Feb 2024 10:12:40 +0000 Subject: [PATCH 0132/1571] control_plane: fix shard splitting on unsharded tenant (#6689) ## Problem Previous test started with a new-style TenantShardId with a non-zero ShardCount. We also need to handle the case of a ShardCount() (aka `unsharded`) parent shard. **A followup PR will refactor ShardCount to make its inner value private and thereby make this kind of mistake harder** ## Summary of changes - Fix a place we were incorrectly treating a ShardCount as a number of shards rather than as thing that can be zero or the number of shards. - Add a test for this case. --- .../attachment_service/src/persistence.rs | 10 ++++-- test_runner/regress/test_sharding.py | 31 ++++++++++++++++++- 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs index cead540058..623d625767 100644 --- a/control_plane/attachment_service/src/persistence.rs +++ b/control_plane/attachment_service/src/persistence.rs @@ -381,16 +381,22 @@ impl Persistence { self.with_conn(move |conn| -> DatabaseResult<()> { conn.transaction(|conn| -> DatabaseResult<()> { // Mark parent shards as splitting + + let expect_parent_records = std::cmp::max(1, old_shard_count.0); + let updated = diesel::update(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .filter(shard_count.eq(old_shard_count.0 as i32)) .set((splitting.eq(1),)) .execute(conn)?; - if ShardCount(updated.try_into().map_err(|_| DatabaseError::Logical(format!("Overflow existing shard count {} while splitting", updated)))?) != old_shard_count { + if u8::try_from(updated) + .map_err(|_| DatabaseError::Logical( + format!("Overflow existing shard count {} while splitting", updated)) + )? 
!= expect_parent_records { // Perhaps a deletion or another split raced with this attempt to split, mutating // the parent shards that we intend to split. In this case the split request should fail. return Err(DatabaseError::Logical( - format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {old_shard_count:?})") + format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {expect_parent_records})") )); } diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 805eaa34b0..27d1cf2f34 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -4,7 +4,7 @@ from fixtures.neon_fixtures import ( tenant_get_shards, ) from fixtures.remote_storage import s3_storage -from fixtures.types import TimelineId +from fixtures.types import TenantShardId, TimelineId from fixtures.workload import Workload @@ -84,6 +84,35 @@ def test_sharding_smoke( assert timelines == {env.initial_timeline, timeline_b} +def test_sharding_split_unsharded( + neon_env_builder: NeonEnvBuilder, +): + """ + Test that shard splitting works on a tenant created as unsharded (i.e. with + ShardCount(0)). + """ + env = neon_env_builder.init_start() + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + workload = Workload(env, tenant_id, timeline_id, branch_name="main") + workload.init() + workload.write_rows(256) + + # Check that we created with an unsharded TenantShardId: this is the default, + # but check it in case we change the default in future + assert env.attachment_service.inspect(TenantShardId(tenant_id, 0, 0)) is not None + + # Split one shard into two + env.attachment_service.tenant_shard_split(tenant_id, shard_count=2) + + # Check we got the shard IDs we expected + assert env.attachment_service.inspect(TenantShardId(tenant_id, 0, 2)) is not None + assert env.attachment_service.inspect(TenantShardId(tenant_id, 1, 2)) is not None + + workload.validate() + + def test_sharding_split_smoke( neon_env_builder: NeonEnvBuilder, ): From ea089dc97700732788f2d9f0ea44e10fb59c2f6f Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 9 Feb 2024 10:29:20 +0000 Subject: [PATCH 0133/1571] proxy: add per query array mode flag (#6678) ## Problem Drizzle needs to be able to configure the array_mode flag per query. ## Summary of changes Adds an array_mode flag to the query data json that will otherwise default to the header flag. --- proxy/src/serverless/sql_over_http.rs | 163 ++++++++++++++------------ test_runner/regress/test_proxy.py | 33 ++++++ 2 files changed, 119 insertions(+), 77 deletions(-) diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 7092b65f03..25e8813625 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -44,10 +44,13 @@ use super::json::pg_text_row_to_json; use super::SERVERLESS_DRIVER_SNI; #[derive(serde::Deserialize)] +#[serde(rename_all = "camelCase")] struct QueryData { query: String, #[serde(deserialize_with = "bytes_to_pg_text")] params: Vec>, + #[serde(default)] + array_mode: Option, } #[derive(serde::Deserialize)] @@ -330,7 +333,7 @@ async fn handle_inner( // Determine the output options. Default behaviour is 'false'. Anything that is not // strictly 'true' assumed to be false. 
let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE); - let array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE); + let default_array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE); // Allow connection pooling only if explicitly requested // or if we have decided that http pool is no longer opt-in @@ -402,83 +405,87 @@ async fn handle_inner( // Now execute the query and return the result // let mut size = 0; - let result = - match payload { - Payload::Single(stmt) => { - let (status, results) = - query_to_json(&*client, stmt, &mut 0, raw_output, array_mode) - .await - .map_err(|e| { - client.discard(); - e - })?; - client.check_idle(status); - results + let result = match payload { + Payload::Single(stmt) => { + let (status, results) = + query_to_json(&*client, stmt, &mut 0, raw_output, default_array_mode) + .await + .map_err(|e| { + client.discard(); + e + })?; + client.check_idle(status); + results + } + Payload::Batch(statements) => { + let (inner, mut discard) = client.inner(); + let mut builder = inner.build_transaction(); + if let Some(isolation_level) = txn_isolation_level { + builder = builder.isolation_level(isolation_level); } - Payload::Batch(statements) => { - let (inner, mut discard) = client.inner(); - let mut builder = inner.build_transaction(); - if let Some(isolation_level) = txn_isolation_level { - builder = builder.isolation_level(isolation_level); - } - if txn_read_only { - builder = builder.read_only(true); - } - if txn_deferrable { - builder = builder.deferrable(true); - } - - let transaction = builder.start().await.map_err(|e| { - // if we cannot start a transaction, we should return immediately - // and not return to the pool. connection is clearly broken - discard.discard(); - e - })?; - - let results = - match query_batch(&transaction, statements, &mut size, raw_output, array_mode) - .await - { - Ok(results) => { - let status = transaction.commit().await.map_err(|e| { - // if we cannot commit - for now don't return connection to pool - // TODO: get a query status from the error - discard.discard(); - e - })?; - discard.check_idle(status); - results - } - Err(err) => { - let status = transaction.rollback().await.map_err(|e| { - // if we cannot rollback - for now don't return connection to pool - // TODO: get a query status from the error - discard.discard(); - e - })?; - discard.check_idle(status); - return Err(err); - } - }; - - if txn_read_only { - response = response.header( - TXN_READ_ONLY.clone(), - HeaderValue::try_from(txn_read_only.to_string())?, - ); - } - if txn_deferrable { - response = response.header( - TXN_DEFERRABLE.clone(), - HeaderValue::try_from(txn_deferrable.to_string())?, - ); - } - if let Some(txn_isolation_level) = txn_isolation_level_raw { - response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level); - } - json!({ "results": results }) + if txn_read_only { + builder = builder.read_only(true); } - }; + if txn_deferrable { + builder = builder.deferrable(true); + } + + let transaction = builder.start().await.map_err(|e| { + // if we cannot start a transaction, we should return immediately + // and not return to the pool. 
connection is clearly broken + discard.discard(); + e + })?; + + let results = match query_batch( + &transaction, + statements, + &mut size, + raw_output, + default_array_mode, + ) + .await + { + Ok(results) => { + let status = transaction.commit().await.map_err(|e| { + // if we cannot commit - for now don't return connection to pool + // TODO: get a query status from the error + discard.discard(); + e + })?; + discard.check_idle(status); + results + } + Err(err) => { + let status = transaction.rollback().await.map_err(|e| { + // if we cannot rollback - for now don't return connection to pool + // TODO: get a query status from the error + discard.discard(); + e + })?; + discard.check_idle(status); + return Err(err); + } + }; + + if txn_read_only { + response = response.header( + TXN_READ_ONLY.clone(), + HeaderValue::try_from(txn_read_only.to_string())?, + ); + } + if txn_deferrable { + response = response.header( + TXN_DEFERRABLE.clone(), + HeaderValue::try_from(txn_deferrable.to_string())?, + ); + } + if let Some(txn_isolation_level) = txn_isolation_level_raw { + response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level); + } + json!({ "results": results }) + } + }; ctx.set_success(); ctx.log(); @@ -524,7 +531,7 @@ async fn query_to_json( data: QueryData, current_size: &mut usize, raw_output: bool, - array_mode: bool, + default_array_mode: bool, ) -> anyhow::Result<(ReadyForQueryStatus, Value)> { let query_params = data.params; let row_stream = client.query_raw_txt(&data.query, query_params).await?; @@ -578,6 +585,8 @@ async fn query_to_json( columns.push(client.get_type(c.type_oid()).await?); } + let array_mode = data.array_mode.unwrap_or(default_array_mode); + // convert rows to JSON let rows = rows .iter() diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index b3b35e446d..49a0450f0c 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -390,6 +390,39 @@ def test_sql_over_http_batch(static_proxy: NeonProxy): assert result[0]["rows"] == [{"answer": 42}] +def test_sql_over_http_batch_output_options(static_proxy: NeonProxy): + static_proxy.safe_psql("create role http with login password 'http' superuser") + + connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/postgres" + response = requests.post( + f"https://{static_proxy.domain}:{static_proxy.external_http_port}/sql", + data=json.dumps( + { + "queries": [ + {"query": "select $1 as answer", "params": [42], "arrayMode": True}, + {"query": "select $1 as answer", "params": [42], "arrayMode": False}, + ] + } + ), + headers={ + "Content-Type": "application/sql", + "Neon-Connection-String": connstr, + "Neon-Batch-Isolation-Level": "Serializable", + "Neon-Batch-Read-Only": "false", + "Neon-Batch-Deferrable": "false", + }, + verify=str(static_proxy.test_output_dir / "proxy.crt"), + ) + assert response.status_code == 200 + results = response.json()["results"] + + assert results[0]["rowAsArray"] + assert results[0]["rows"] == [["42"]] + + assert not results[1]["rowAsArray"] + assert results[1]["rows"] == [{"answer": "42"}] + + def test_sql_over_http_pool(static_proxy: NeonProxy): static_proxy.safe_psql("create user http_auth with password 'http' superuser") From eec1e1a19223750e16401962c978fdeee2a305c8 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 4 Jan 2024 12:34:15 +0000 Subject: [PATCH 0134/1571] Pre-install anon extension from compute_ctl if anon is in shared_preload_libraries. 
Users cannot install it themselves, because superuser is required. GRANT all priveleged needed to use it to db_owner We use the neon fork of the extension, because small change to sql file is needed to allow db_owner to use it. This feature is behind a feature flag AnonExtension, so it is not enabled by default. --- Dockerfile.compute-node | 5 +- compute_tools/src/compute.rs | 14 +++- compute_tools/src/spec.rs | 132 ++++++++++++++++++++++++++++++++++- libs/compute_api/src/spec.rs | 3 + 4 files changed, 149 insertions(+), 5 deletions(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index d91c7cfd72..cc7a110008 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -639,8 +639,8 @@ FROM build-deps AS pg-anon-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH "/usr/local/pgsql/bin/:$PATH" -RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgresql_anonymizer-1.1.0.tar.gz -O pg_anon.tar.gz && \ - echo "08b09d2ff9b962f96c60db7e6f8e79cf7253eb8772516998fc35ece08633d3ad pg_anon.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \ + echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \ mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . && \ find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -809,6 +809,7 @@ COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-semver-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql +COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 0ca1a47fbf..993b5725a4 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -765,7 +765,12 @@ impl ComputeNode { handle_roles(spec, &mut client)?; handle_databases(spec, &mut client)?; handle_role_deletions(spec, connstr.as_str(), &mut client)?; - handle_grants(spec, &mut client, connstr.as_str())?; + handle_grants( + spec, + &mut client, + connstr.as_str(), + self.has_feature(ComputeFeature::AnonExtension), + )?; handle_extensions(spec, &mut client)?; handle_extension_neon(&mut client)?; create_availability_check_data(&mut client)?; @@ -839,7 +844,12 @@ impl ComputeNode { handle_roles(&spec, &mut client)?; handle_databases(&spec, &mut client)?; handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?; - handle_grants(&spec, &mut client, self.connstr.as_str())?; + handle_grants( + &spec, + &mut client, + self.connstr.as_str(), + self.has_feature(ComputeFeature::AnonExtension), + )?; handle_extensions(&spec, &mut client)?; handle_extension_neon(&mut client)?; // We can skip handle_migrations here because a new migration can only appear diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 2b1bff75fe..3df5f10e23 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -581,7 +581,12 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { /// Grant CREATE ON DATABASE to the database owner and do some other alters and grants /// to 
allow users creating trusted extensions and re-creating `public` schema, for example. #[instrument(skip_all)] -pub fn handle_grants(spec: &ComputeSpec, client: &mut Client, connstr: &str) -> Result<()> { +pub fn handle_grants( + spec: &ComputeSpec, + client: &mut Client, + connstr: &str, + enable_anon_extension: bool, +) -> Result<()> { info!("modifying database permissions"); let existing_dbs = get_existing_dbs(client)?; @@ -678,6 +683,11 @@ pub fn handle_grants(spec: &ComputeSpec, client: &mut Client, connstr: &str) -> inlinify(&grant_query) ); db_client.simple_query(&grant_query)?; + + // it is important to run this after all grants + if enable_anon_extension { + handle_extension_anon(spec, &db.owner, &mut db_client, false)?; + } } Ok(()) @@ -809,5 +819,125 @@ $$;"#, "Ran {} migrations", (migrations.len() - starting_migration_id) ); + + Ok(()) +} + +/// Connect to the database as superuser and pre-create anon extension +/// if it is present in shared_preload_libraries +#[instrument(skip_all)] +pub fn handle_extension_anon( + spec: &ComputeSpec, + db_owner: &str, + db_client: &mut Client, + grants_only: bool, +) -> Result<()> { + info!("handle extension anon"); + + if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") { + if libs.contains("anon") { + if !grants_only { + // check if extension is already initialized using anon.is_initialized() + let query = "SELECT anon.is_initialized()"; + match db_client.query(query, &[]) { + Ok(rows) => { + if !rows.is_empty() { + let is_initialized: bool = rows[0].get(0); + if is_initialized { + info!("anon extension is already initialized"); + return Ok(()); + } + } + } + Err(e) => { + warn!( + "anon extension is_installed check failed with expected error: {}", + e + ); + } + }; + + // Create anon extension if this compute needs it + // Users cannot create it themselves, because superuser is required. + let mut query = "CREATE EXTENSION IF NOT EXISTS anon CASCADE"; + info!("creating anon extension with query: {}", query); + match db_client.query(query, &[]) { + Ok(_) => {} + Err(e) => { + error!("anon extension creation failed with error: {}", e); + return Ok(()); + } + } + + // check that extension is installed + query = "SELECT extname FROM pg_extension WHERE extname = 'anon'"; + let rows = db_client.query(query, &[])?; + if rows.is_empty() { + error!("anon extension is not installed"); + return Ok(()); + } + + // Initialize anon extension + // This also requires superuser privileges, so users cannot do it themselves. 
+ query = "SELECT anon.init()"; + match db_client.query(query, &[]) { + Ok(_) => {} + Err(e) => { + error!("anon.init() failed with error: {}", e); + return Ok(()); + } + } + } + + // check that extension is installed, if not bail early + let query = "SELECT extname FROM pg_extension WHERE extname = 'anon'"; + match db_client.query(query, &[]) { + Ok(rows) => { + if rows.is_empty() { + error!("anon extension is not installed"); + return Ok(()); + } + } + Err(e) => { + error!("anon extension check failed with error: {}", e); + return Ok(()); + } + }; + + let query = format!("GRANT ALL ON SCHEMA anon TO {}", db_owner); + info!("granting anon extension permissions with query: {}", query); + db_client.simple_query(&query)?; + + // Grant permissions to db_owner to use anon extension functions + let query = format!("GRANT ALL ON ALL FUNCTIONS IN SCHEMA anon TO {}", db_owner); + info!("granting anon extension permissions with query: {}", query); + db_client.simple_query(&query)?; + + // This is needed, because some functions are defined as SECURITY DEFINER. + // In Postgres SECURITY DEFINER functions are executed with the privileges + // of the owner. + // In anon extension this it is needed to access some GUCs, which are only accessible to + // superuser. But we've patched postgres to allow db_owner to access them as well. + // So we need to change owner of these functions to db_owner. + let query = format!(" + SELECT 'ALTER FUNCTION '||nsp.nspname||'.'||p.proname||'('||pg_get_function_identity_arguments(p.oid)||') OWNER TO {};' + from pg_proc p + join pg_namespace nsp ON p.pronamespace = nsp.oid + where nsp.nspname = 'anon';", db_owner); + + info!("change anon extension functions owner to db owner"); + db_client.simple_query(&query)?; + + // affects views as well + let query = format!("GRANT ALL ON ALL TABLES IN SCHEMA anon TO {}", db_owner); + info!("granting anon extension permissions with query: {}", query); + db_client.simple_query(&query)?; + + let query = format!("GRANT ALL ON ALL SEQUENCES IN SCHEMA anon TO {}", db_owner); + info!("granting anon extension permissions with query: {}", query); + db_client.simple_query(&query)?; + } + } + Ok(()) } diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 13ac18e0c5..2f412b61a3 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -90,6 +90,9 @@ pub enum ComputeFeature { /// track short-lived connections as user activity. ActivityMonitorExperimental, + /// Pre-install and initialize anon extension for every database in the cluster + AnonExtension, + /// This is a special feature flag that is used to represent unknown feature flags. /// Basically all unknown to enum flags are represented as this one. See unit test /// `parse_unknown_features()` for more details. From eb919cab88b8a28eb423b33eb07a858acbd61eab Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 9 Feb 2024 14:52:58 +0200 Subject: [PATCH 0135/1571] prepare to move timeouts and cancellation handling to remote_storage (#6696) This PR is preliminary cleanups and refactoring around `remote_storage` for next PR which will move the timeouts and cancellation into `remote_storage`. 
Summary: - smaller drive-by fixes - code simplification - refactor common parts like `DownloadError::is_permanent` - align error types with `RemoteStorage::list_*` to use more `download_retry` helper Cc: #6096 --- libs/remote_storage/src/lib.rs | 26 ++++++- libs/remote_storage/src/local_fs.rs | 50 ++++++++---- libs/remote_storage/src/s3_bucket.rs | 77 ++++++------------- libs/remote_storage/src/simulate_failures.rs | 28 ++++--- libs/remote_storage/src/support.rs | 33 ++++++++ pageserver/src/task_mgr.rs | 4 +- pageserver/src/tenant.rs | 4 +- .../src/tenant/remote_timeline_client.rs | 35 ++++----- .../tenant/remote_timeline_client/download.rs | 59 +++++--------- pageserver/src/tenant/secondary/downloader.rs | 2 +- 10 files changed, 175 insertions(+), 143 deletions(-) create mode 100644 libs/remote_storage/src/support.rs diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index e64b1de6f9..b6648931ac 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -13,6 +13,7 @@ mod azure_blob; mod local_fs; mod s3_bucket; mod simulate_failures; +mod support; use std::{ collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc, time::SystemTime, @@ -170,7 +171,10 @@ pub trait RemoteStorage: Send + Sync + 'static { /// whereas, /// list_prefixes("foo/bar/") = ["cat", "dog"] /// See `test_real_s3.rs` for more details. - async fn list_files(&self, prefix: Option<&RemotePath>) -> anyhow::Result> { + async fn list_files( + &self, + prefix: Option<&RemotePath>, + ) -> Result, DownloadError> { let result = self.list(prefix, ListingMode::NoDelimiter).await?.keys; Ok(result) } @@ -179,7 +183,7 @@ pub trait RemoteStorage: Send + Sync + 'static { &self, prefix: Option<&RemotePath>, _mode: ListingMode, - ) -> anyhow::Result; + ) -> Result; /// Streams the local file contents into remote into the remote storage entry. async fn upload( @@ -269,6 +273,19 @@ impl std::fmt::Display for DownloadError { impl std::error::Error for DownloadError {} +impl DownloadError { + /// Returns true if the error should not be retried with backoff + pub fn is_permanent(&self) -> bool { + use DownloadError::*; + match self { + BadInput(_) => true, + NotFound => true, + Cancelled => true, + Other(_) => false, + } + } +} + #[derive(Debug)] pub enum TimeTravelError { /// Validation or other error happened due to user input. 
@@ -336,7 +353,10 @@ impl GenericRemoteStorage> { // A function for listing all the files in a "directory" // Example: // list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"] - pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result> { + pub async fn list_files( + &self, + folder: Option<&RemotePath>, + ) -> Result, DownloadError> { match self { Self::LocalFs(s) => s.list_files(folder).await, Self::AwsS3(s) => s.list_files(folder).await, diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 36ec15e1b1..3ebea76181 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -18,9 +18,7 @@ use tokio_util::{io::ReaderStream, sync::CancellationToken}; use tracing::*; use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty}; -use crate::{ - Download, DownloadError, DownloadStream, Listing, ListingMode, RemotePath, TimeTravelError, -}; +use crate::{Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError}; use super::{RemoteStorage, StorageMetadata}; @@ -365,27 +363,33 @@ impl RemoteStorage for LocalFs { format!("Failed to open source file {target_path:?} to use in the download") }) .map_err(DownloadError::Other)?; + + let len = source + .metadata() + .await + .context("query file length") + .map_err(DownloadError::Other)? + .len(); + source .seek(io::SeekFrom::Start(start_inclusive)) .await .context("Failed to seek to the range start in a local storage file") .map_err(DownloadError::Other)?; + let metadata = self .read_storage_metadata(&target_path) .await .map_err(DownloadError::Other)?; - let download_stream: DownloadStream = match end_exclusive { - Some(end_exclusive) => Box::pin(ReaderStream::new( - source.take(end_exclusive - start_inclusive), - )), - None => Box::pin(ReaderStream::new(source)), - }; + let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive); + let source = ReaderStream::new(source); + Ok(Download { metadata, last_modified: None, etag: None, - download_stream, + download_stream: Box::pin(source), }) } else { Err(DownloadError::NotFound) @@ -514,10 +518,8 @@ mod fs_tests { use futures_util::Stream; use std::{collections::HashMap, io::Write}; - async fn read_and_assert_remote_file_contents( + async fn read_and_check_metadata( storage: &LocalFs, - #[allow(clippy::ptr_arg)] - // have to use &Utf8PathBuf due to `storage.local_path` parameter requirements remote_storage_path: &RemotePath, expected_metadata: Option<&StorageMetadata>, ) -> anyhow::Result { @@ -596,7 +598,7 @@ mod fs_tests { let upload_name = "upload_1"; let upload_target = upload_dummy_file(&storage, upload_name, None).await?; - let contents = read_and_assert_remote_file_contents(&storage, &upload_target, None).await?; + let contents = read_and_check_metadata(&storage, &upload_target, None).await?; assert_eq!( dummy_contents(upload_name), contents, @@ -618,7 +620,7 @@ mod fs_tests { let upload_target = upload_dummy_file(&storage, upload_name, None).await?; let full_range_download_contents = - read_and_assert_remote_file_contents(&storage, &upload_target, None).await?; + read_and_check_metadata(&storage, &upload_target, None).await?; assert_eq!( dummy_contents(upload_name), full_range_download_contents, @@ -660,6 +662,22 @@ mod fs_tests { "Second part bytes should be returned when requested" ); + let suffix_bytes = storage + .download_byte_range(&upload_target, 13, None) + .await? 
+ .download_stream; + let suffix_bytes = aggregate(suffix_bytes).await?; + let suffix = std::str::from_utf8(&suffix_bytes)?; + assert_eq!(upload_name, suffix); + + let all_bytes = storage + .download_byte_range(&upload_target, 0, None) + .await? + .download_stream; + let all_bytes = aggregate(all_bytes).await?; + let all_bytes = std::str::from_utf8(&all_bytes)?; + assert_eq!(dummy_contents("upload_1"), all_bytes); + Ok(()) } @@ -736,7 +754,7 @@ mod fs_tests { upload_dummy_file(&storage, upload_name, Some(metadata.clone())).await?; let full_range_download_contents = - read_and_assert_remote_file_contents(&storage, &upload_target, Some(&metadata)).await?; + read_and_check_metadata(&storage, &upload_target, Some(&metadata)).await?; assert_eq!( dummy_contents(upload_name), full_range_download_contents, diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index c9ad9ef225..2b33a6ffd1 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -45,8 +45,9 @@ use utils::backoff; use super::StorageMetadata; use crate::{ - ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, - S3Config, TimeTravelError, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR, + support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, + RemotePath, RemoteStorage, S3Config, TimeTravelError, MAX_KEYS_PER_DELETE, + REMOTE_STORAGE_PREFIX_SEPARATOR, }; pub(super) mod metrics; @@ -63,7 +64,6 @@ pub struct S3Bucket { concurrency_limiter: ConcurrencyLimiter, } -#[derive(Default)] struct GetObjectRequest { bucket: String, key: String, @@ -232,24 +232,8 @@ impl S3Bucket { let started_at = ScopeGuard::into_inner(started_at); - match get_object { - Ok(object_output) => { - let metadata = object_output.metadata().cloned().map(StorageMetadata); - let etag = object_output.e_tag.clone(); - let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok()); - - let body = object_output.body; - let body = ByteStreamAsStream::from(body); - let body = PermitCarrying::new(permit, body); - let body = TimedDownload::new(started_at, body); - - Ok(Download { - metadata, - etag, - last_modified, - download_stream: Box::pin(body), - }) - } + let object_output = match get_object { + Ok(object_output) => object_output, Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => { // Count this in the AttemptOutcome::Ok bucket, because 404 is not // an error: we expect to sometimes fetch an object and find it missing, @@ -259,7 +243,7 @@ impl S3Bucket { AttemptOutcome::Ok, started_at, ); - Err(DownloadError::NotFound) + return Err(DownloadError::NotFound); } Err(e) => { metrics::BUCKET_METRICS.req_seconds.observe_elapsed( @@ -268,11 +252,27 @@ impl S3Bucket { started_at, ); - Err(DownloadError::Other( + return Err(DownloadError::Other( anyhow::Error::new(e).context("download s3 object"), - )) + )); } - } + }; + + let metadata = object_output.metadata().cloned().map(StorageMetadata); + let etag = object_output.e_tag; + let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok()); + + let body = object_output.body; + let body = ByteStreamAsStream::from(body); + let body = PermitCarrying::new(permit, body); + let body = TimedDownload::new(started_at, body); + + Ok(Download { + metadata, + etag, + last_modified, + download_stream: Box::pin(body), + }) } async fn delete_oids( @@ -354,33 +354,6 @@ impl Stream for ByteStreamAsStream { // sense and 
Stream::size_hint does not really } -pin_project_lite::pin_project! { - /// An `AsyncRead` adapter which carries a permit for the lifetime of the value. - struct PermitCarrying { - permit: tokio::sync::OwnedSemaphorePermit, - #[pin] - inner: S, - } -} - -impl PermitCarrying { - fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self { - Self { permit, inner } - } -} - -impl>> Stream for PermitCarrying { - type Item = ::Item; - - fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - self.project().inner.poll_next(cx) - } - - fn size_hint(&self) -> (usize, Option) { - self.inner.size_hint() - } -} - pin_project_lite::pin_project! { /// Times and tracks the outcome of the request. struct TimedDownload { diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index 82d5a61fda..14bdb5ed4d 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -60,7 +60,7 @@ impl UnreliableWrapper { /// On the first attempts of this operation, return an error. After 'attempts_to_fail' /// attempts, let the operation go ahead, and clear the counter. /// - fn attempt(&self, op: RemoteOp) -> Result { + fn attempt(&self, op: RemoteOp) -> anyhow::Result { let mut attempts = self.attempts.lock().unwrap(); match attempts.entry(op) { @@ -78,13 +78,13 @@ impl UnreliableWrapper { } else { let error = anyhow::anyhow!("simulated failure of remote operation {:?}", e.key()); - Err(DownloadError::Other(error)) + Err(error) } } Entry::Vacant(e) => { let error = anyhow::anyhow!("simulated failure of remote operation {:?}", e.key()); e.insert(1); - Err(DownloadError::Other(error)) + Err(error) } } } @@ -105,12 +105,17 @@ impl RemoteStorage for UnreliableWrapper { &self, prefix: Option<&RemotePath>, ) -> Result, DownloadError> { - self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))?; + self.attempt(RemoteOp::ListPrefixes(prefix.cloned())) + .map_err(DownloadError::Other)?; self.inner.list_prefixes(prefix).await } - async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result> { - self.attempt(RemoteOp::ListPrefixes(folder.cloned()))?; + async fn list_files( + &self, + folder: Option<&RemotePath>, + ) -> Result, DownloadError> { + self.attempt(RemoteOp::ListPrefixes(folder.cloned())) + .map_err(DownloadError::Other)?; self.inner.list_files(folder).await } @@ -119,7 +124,8 @@ impl RemoteStorage for UnreliableWrapper { prefix: Option<&RemotePath>, mode: ListingMode, ) -> Result { - self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))?; + self.attempt(RemoteOp::ListPrefixes(prefix.cloned())) + .map_err(DownloadError::Other)?; self.inner.list(prefix, mode).await } @@ -137,7 +143,8 @@ impl RemoteStorage for UnreliableWrapper { } async fn download(&self, from: &RemotePath) -> Result { - self.attempt(RemoteOp::Download(from.clone()))?; + self.attempt(RemoteOp::Download(from.clone())) + .map_err(DownloadError::Other)?; self.inner.download(from).await } @@ -150,7 +157,8 @@ impl RemoteStorage for UnreliableWrapper { // Note: We treat any download_byte_range as an "attempt" of the same // operation. We don't pay attention to the ranges. That's good enough // for now. 
- self.attempt(RemoteOp::Download(from.clone()))?; + self.attempt(RemoteOp::Download(from.clone())) + .map_err(DownloadError::Other)?; self.inner .download_byte_range(from, start_inclusive, end_exclusive) .await @@ -193,7 +201,7 @@ impl RemoteStorage for UnreliableWrapper { cancel: &CancellationToken, ) -> Result<(), TimeTravelError> { self.attempt(RemoteOp::TimeTravelRecover(prefix.map(|p| p.to_owned()))) - .map_err(|e| TimeTravelError::Other(anyhow::Error::new(e)))?; + .map_err(TimeTravelError::Other)?; self.inner .time_travel_recover(prefix, timestamp, done_if_after, cancel) .await diff --git a/libs/remote_storage/src/support.rs b/libs/remote_storage/src/support.rs new file mode 100644 index 0000000000..4688a484a5 --- /dev/null +++ b/libs/remote_storage/src/support.rs @@ -0,0 +1,33 @@ +use std::{ + pin::Pin, + task::{Context, Poll}, +}; + +use futures_util::Stream; + +pin_project_lite::pin_project! { + /// An `AsyncRead` adapter which carries a permit for the lifetime of the value. + pub(crate) struct PermitCarrying { + permit: tokio::sync::OwnedSemaphorePermit, + #[pin] + inner: S, + } +} + +impl PermitCarrying { + pub(crate) fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self { + Self { permit, inner } + } +} + +impl Stream for PermitCarrying { + type Item = ::Item; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + self.project().inner.poll_next(cx) + } + + fn size_hint(&self) -> (usize, Option) { + self.inner.size_hint() + } +} diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 5a06a97525..3cec5fa850 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -576,8 +576,8 @@ pub fn shutdown_token() -> CancellationToken { /// Has the current task been requested to shut down? pub fn is_shutdown_requested() -> bool { - if let Ok(cancel) = SHUTDOWN_TOKEN.try_with(|t| t.clone()) { - cancel.is_cancelled() + if let Ok(true_or_false) = SHUTDOWN_TOKEN.try_with(|t| t.is_cancelled()) { + true_or_false } else { if !cfg!(test) { warn!("is_shutdown_requested() called in an unexpected task or thread"); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index f086f46213..4446c410b0 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1377,7 +1377,7 @@ impl Tenant { async move { debug!("starting index part download"); - let index_part = client.download_index_file(cancel_clone).await; + let index_part = client.download_index_file(&cancel_clone).await; debug!("finished index part download"); @@ -2434,7 +2434,7 @@ impl Tenant { // operation is rare, so it's simpler to just download it (and robustly guarantees that the index // we use here really is the remotely persistent one). 
let result = tl_client - .download_index_file(self.cancel.clone()) + .download_index_file(&self.cancel) .instrument(info_span!("download_index_file", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id)) .await?; let index_part = match result { diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 152c9a2b7d..0c7dd68c3f 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -217,6 +217,7 @@ use crate::metrics::{ }; use crate::task_mgr::shutdown_token; use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::tenant::remote_timeline_client::download::download_retry; use crate::tenant::storage_layer::AsLayerDesc; use crate::tenant::upload_queue::Delete; use crate::tenant::TIMELINES_SEGMENT_NAME; @@ -262,6 +263,11 @@ pub(crate) const INITDB_PRESERVED_PATH: &str = "initdb-preserved.tar.zst"; /// Default buffer size when interfacing with [`tokio::fs::File`]. pub(crate) const BUFFER_SIZE: usize = 32 * 1024; +/// This timeout is intended to deal with hangs in lower layers, e.g. stuck TCP flows. It is not +/// intended to be snappy enough for prompt shutdown, as we have a CancellationToken for that. +pub(crate) const UPLOAD_TIMEOUT: Duration = Duration::from_secs(120); +pub(crate) const DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(120); + pub enum MaybeDeletedIndexPart { IndexPart(IndexPart), Deleted(IndexPart), @@ -325,11 +331,6 @@ pub struct RemoteTimelineClient { cancel: CancellationToken, } -/// This timeout is intended to deal with hangs in lower layers, e.g. stuck TCP flows. It is not -/// intended to be snappy enough for prompt shutdown, as we have a CancellationToken for that. -const UPLOAD_TIMEOUT: Duration = Duration::from_secs(120); -const DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(120); - /// Wrapper for timeout_cancellable that flattens result and converts TimeoutCancellableError to anyhow. /// /// This is a convenience for the various upload functions. In future @@ -506,7 +507,7 @@ impl RemoteTimelineClient { /// Download index file pub async fn download_index_file( &self, - cancel: CancellationToken, + cancel: &CancellationToken, ) -> Result { let _unfinished_gauge_guard = self.metrics.call_begin( &RemoteOpFileKind::Index, @@ -1147,22 +1148,17 @@ impl RemoteTimelineClient { let cancel = shutdown_token(); - let remaining = backoff::retry( + let remaining = download_retry( || async { self.storage_impl .list_files(Some(&timeline_storage_path)) .await }, - |_e| false, - FAILED_DOWNLOAD_WARN_THRESHOLD, - FAILED_REMOTE_OP_RETRIES, - "list_prefixes", + "list remaining files", &cancel, ) .await - .ok_or_else(|| anyhow::anyhow!("Cancelled!")) - .and_then(|x| x) - .context("list prefixes")?; + .context("list files remaining files")?; // We will delete the current index_part object last, since it acts as a deletion // marker via its deleted_at attribute @@ -1351,6 +1347,7 @@ impl RemoteTimelineClient { /// queue. /// async fn perform_upload_task(self: &Arc, task: Arc) { + let cancel = shutdown_token(); // Loop to retry until it completes. loop { // If we're requested to shut down, close up shop and exit. @@ -1362,7 +1359,7 @@ impl RemoteTimelineClient { // the Future, but we're not 100% sure if the remote storage library // is cancellation safe, so we don't dare to do that. Hopefully, the // upload finishes or times out soon enough. 
- if task_mgr::is_shutdown_requested() { + if cancel.is_cancelled() { info!("upload task cancelled by shutdown request"); match self.stop() { Ok(()) => {} @@ -1473,7 +1470,7 @@ impl RemoteTimelineClient { retries, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, - &shutdown_token(), + &cancel, ) .await; } @@ -1990,7 +1987,7 @@ mod tests { // Download back the index.json, and check that the list of files is correct let initial_index_part = match client - .download_index_file(CancellationToken::new()) + .download_index_file(&CancellationToken::new()) .await .unwrap() { @@ -2084,7 +2081,7 @@ mod tests { // Download back the index.json, and check that the list of files is correct let index_part = match client - .download_index_file(CancellationToken::new()) + .download_index_file(&CancellationToken::new()) .await .unwrap() { @@ -2286,7 +2283,7 @@ mod tests { let client = test_state.build_client(get_generation); let download_r = client - .download_index_file(CancellationToken::new()) + .download_index_file(&CancellationToken::new()) .await .expect("download should always succeed"); assert!(matches!(download_r, MaybeDeletedIndexPart::IndexPart(_))); diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 6c1125746b..33287fc8f4 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -216,16 +216,15 @@ pub async fn list_remote_timelines( anyhow::bail!("storage-sync-list-remote-timelines"); }); - let cancel_inner = cancel.clone(); let listing = download_retry_forever( || { download_cancellable( - &cancel_inner, + &cancel, storage.list(Some(&remote_path), ListingMode::WithDelimiter), ) }, &format!("list timelines for {tenant_shard_id}"), - cancel, + &cancel, ) .await?; @@ -258,19 +257,18 @@ async fn do_download_index_part( tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, index_generation: Generation, - cancel: CancellationToken, + cancel: &CancellationToken, ) -> Result { use futures::stream::StreamExt; let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation); - let cancel_inner = cancel.clone(); let index_part_bytes = download_retry_forever( || async { // Cancellation: if is safe to cancel this future because we're just downloading into // a memory buffer, not touching local disk. let index_part_download = - download_cancellable(&cancel_inner, storage.download(&remote_path)).await?; + download_cancellable(cancel, storage.download(&remote_path)).await?; let mut index_part_bytes = Vec::new(); let mut stream = std::pin::pin!(index_part_download.download_stream); @@ -288,7 +286,7 @@ async fn do_download_index_part( .await?; let index_part: IndexPart = serde_json::from_slice(&index_part_bytes) - .with_context(|| format!("download index part file at {remote_path:?}")) + .with_context(|| format!("deserialize index part file at {remote_path:?}")) .map_err(DownloadError::Other)?; Ok(index_part) @@ -305,7 +303,7 @@ pub(super) async fn download_index_part( tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, my_generation: Generation, - cancel: CancellationToken, + cancel: &CancellationToken, ) -> Result { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -325,14 +323,8 @@ pub(super) async fn download_index_part( // index in our generation. // // This is an optimization to avoid doing the listing for the general case below. 
- let res = do_download_index_part( - storage, - tenant_shard_id, - timeline_id, - my_generation, - cancel.clone(), - ) - .await; + let res = + do_download_index_part(storage, tenant_shard_id, timeline_id, my_generation, cancel).await; match res { Ok(index_part) => { tracing::debug!( @@ -357,7 +349,7 @@ pub(super) async fn download_index_part( tenant_shard_id, timeline_id, my_generation.previous(), - cancel.clone(), + cancel, ) .await; match res { @@ -379,18 +371,13 @@ pub(super) async fn download_index_part( // objects, and select the highest one with a generation <= my_generation. Constructing the prefix is equivalent // to constructing a full index path with no generation, because the generation is a suffix. let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none()); - let indices = backoff::retry( + + let indices = download_retry( || async { storage.list_files(Some(&index_prefix)).await }, - |_| false, - FAILED_DOWNLOAD_WARN_THRESHOLD, - FAILED_REMOTE_OP_RETRIES, - "listing index_part files", - &cancel, + "list index_part files", + cancel, ) - .await - .ok_or_else(|| anyhow::anyhow!("Cancelled")) - .and_then(|x| x) - .map_err(DownloadError::Other)?; + .await?; // General case logic for which index to use: the latest index whose generation // is <= our own. See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md @@ -447,8 +434,6 @@ pub(crate) async fn download_initdb_tar_zst( "{INITDB_PATH}.download-{timeline_id}.{TEMP_FILE_SUFFIX}" )); - let cancel_inner = cancel.clone(); - let file = download_retry( || async { let file = OpenOptions::new() @@ -461,13 +446,11 @@ pub(crate) async fn download_initdb_tar_zst( .with_context(|| format!("tempfile creation {temp_path}")) .map_err(DownloadError::Other)?; - let download = match download_cancellable(&cancel_inner, storage.download(&remote_path)) - .await + let download = match download_cancellable(cancel, storage.download(&remote_path)).await { Ok(dl) => dl, Err(DownloadError::NotFound) => { - download_cancellable(&cancel_inner, storage.download(&remote_preserved_path)) - .await? + download_cancellable(cancel, storage.download(&remote_preserved_path)).await? } Err(other) => Err(other)?, }; @@ -516,7 +499,7 @@ pub(crate) async fn download_initdb_tar_zst( /// with backoff. 
/// /// (See similar logic for uploads in `perform_upload_task`) -async fn download_retry( +pub(super) async fn download_retry( op: O, description: &str, cancel: &CancellationToken, @@ -527,7 +510,7 @@ where { backoff::retry( op, - |e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound), + DownloadError::is_permanent, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, description, @@ -541,7 +524,7 @@ where async fn download_retry_forever( op: O, description: &str, - cancel: CancellationToken, + cancel: &CancellationToken, ) -> Result where O: FnMut() -> F, @@ -549,11 +532,11 @@ where { backoff::retry( op, - |e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound), + DownloadError::is_permanent, FAILED_DOWNLOAD_WARN_THRESHOLD, u32::MAX, description, - &cancel, + cancel, ) .await .ok_or_else(|| DownloadError::Cancelled) diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 9330edf946..0666e104f8 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -533,7 +533,7 @@ impl<'a> TenantDownloader<'a> { .map_err(UpdateError::from)?; let mut heatmap_bytes = Vec::new(); let mut body = tokio_util::io::StreamReader::new(download.download_stream); - let _size = tokio::io::copy(&mut body, &mut heatmap_bytes).await?; + let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?; Ok(heatmap_bytes) }, |e| matches!(e, UpdateError::NoData | UpdateError::Cancelled), From 8d98981fe580fcdfb7066a5698c2448af0cbc61d Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 9 Feb 2024 13:20:04 +0000 Subject: [PATCH 0136/1571] tests: deflake test_sharding_split_unsharded (#6699) ## Problem This test was a subset of the larger sharding test, and it missed the validate() call on workload that was implicitly waiting for a tenant to become active before trying to split it. It could therefore fail to split due to tenant not yet being active. ## Summary of changes - Insert .validate() call, and move the Workload setup to after the check of shard ID (as the shard ID check should pass immediately) --- test_runner/regress/test_sharding.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 27d1cf2f34..fa40219d0e 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -95,14 +95,15 @@ def test_sharding_split_unsharded( tenant_id = env.initial_tenant timeline_id = env.initial_timeline - workload = Workload(env, tenant_id, timeline_id, branch_name="main") - workload.init() - workload.write_rows(256) - # Check that we created with an unsharded TenantShardId: this is the default, # but check it in case we change the default in future assert env.attachment_service.inspect(TenantShardId(tenant_id, 0, 0)) is not None + workload = Workload(env, tenant_id, timeline_id, branch_name="main") + workload.init() + workload.write_rows(256) + workload.validate() + # Split one shard into two env.attachment_service.tenant_shard_split(tenant_id, shard_count=2) From 84a0e7b022e37b041004e7d9299060a3777c63eb Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 9 Feb 2024 11:07:42 +0200 Subject: [PATCH 0137/1571] tests: Allow setting shutdown mode separately from 'destroy' flag In neon_local, the default mode is now always 'fast', regardless of 'destroy'. You can override it with the "neon_local endpoint stop --mode=immediate" flag. 
In python tests, we still default to 'immediate' mode when using the stop_and_destroy() function, and 'fast' with plain stop(). I kept that to avoid changing behavior in existing tests. I don't think existing tests depend on it, but I wasn't 100% certain. --- control_plane/src/bin/neon_local.rs | 16 +++++++++++++--- control_plane/src/endpoint.rs | 18 ++---------------- test_runner/fixtures/neon_fixtures.py | 11 +++++++---- 3 files changed, 22 insertions(+), 23 deletions(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index b9af467fdf..d71cdf02c0 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -1014,12 +1014,13 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re .get_one::("endpoint_id") .ok_or_else(|| anyhow!("No endpoint ID was provided to stop"))?; let destroy = sub_args.get_flag("destroy"); + let mode = sub_args.get_one::("mode").expect("has a default"); let endpoint = cplane .endpoints .get(endpoint_id.as_str()) .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?; - endpoint.stop(destroy)?; + endpoint.stop(mode, destroy)?; } _ => bail!("Unexpected endpoint subcommand '{sub_name}'"), @@ -1303,7 +1304,7 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { match ComputeControlPlane::load(env.clone()) { Ok(cplane) => { for (_k, node) in cplane.endpoints { - if let Err(e) = node.stop(false) { + if let Err(e) = node.stop(if immediate { "immediate" } else { "fast " }, false) { eprintln!("postgres stop failed: {e:#}"); } } @@ -1652,7 +1653,16 @@ fn cli() -> Command { .long("destroy") .action(ArgAction::SetTrue) .required(false) - ) + ) + .arg( + Arg::new("mode") + .help("Postgres shutdown mode, passed to \"pg_ctl -m \"") + .long("mode") + .action(ArgAction::Set) + .required(false) + .value_parser(["smart", "fast", "immediate"]) + .default_value("fast") + ) ) ) diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index b19a6a1a18..f1fe12e05f 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -761,22 +761,8 @@ impl Endpoint { } } - pub fn stop(&self, destroy: bool) -> Result<()> { - // If we are going to destroy data directory, - // use immediate shutdown mode, otherwise, - // shutdown gracefully to leave the data directory sane. - // - // Postgres is always started from scratch, so stop - // without destroy only used for testing and debugging. - // - self.pg_ctl( - if destroy { - &["-m", "immediate", "stop"] - } else { - &["stop"] - }, - &None, - )?; + pub fn stop(&self, mode: &str, destroy: bool) -> Result<()> { + self.pg_ctl(&["-m", mode, "stop"], &None)?; // Also wait for the compute_ctl process to die. 
It might have some // cleanup work to do after postgres stops, like syncing safekeepers, diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index a6aff77ddf..9996853525 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1816,6 +1816,7 @@ class NeonCli(AbstractNeonCli): endpoint_id: str, destroy=False, check_return_code=True, + mode: Optional[str] = None, ) -> "subprocess.CompletedProcess[str]": args = [ "endpoint", @@ -1823,6 +1824,8 @@ class NeonCli(AbstractNeonCli): ] if destroy: args.append("--destroy") + if mode is not None: + args.append(f"--mode={mode}") if endpoint_id is not None: args.append(endpoint_id) @@ -3162,7 +3165,7 @@ class Endpoint(PgProtocol): with open(remote_extensions_spec_path, "w") as file: json.dump(spec, file, indent=4) - def stop(self) -> "Endpoint": + def stop(self, mode: str = "fast") -> "Endpoint": """ Stop the Postgres instance if it's running. Returns self. @@ -3171,13 +3174,13 @@ class Endpoint(PgProtocol): if self.running: assert self.endpoint_id is not None self.env.neon_cli.endpoint_stop( - self.endpoint_id, check_return_code=self.check_stop_result + self.endpoint_id, check_return_code=self.check_stop_result, mode=mode ) self.running = False return self - def stop_and_destroy(self) -> "Endpoint": + def stop_and_destroy(self, mode: str = "immediate") -> "Endpoint": """ Stop the Postgres instance, then destroy the endpoint. Returns self. @@ -3185,7 +3188,7 @@ class Endpoint(PgProtocol): assert self.endpoint_id is not None self.env.neon_cli.endpoint_stop( - self.endpoint_id, True, check_return_code=self.check_stop_result + self.endpoint_id, True, check_return_code=self.check_stop_result, mode=mode ) self.endpoint_id = None self.running = False From 5239cdc29fdfe8458798cefad51f8871108f9811 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 9 Feb 2024 11:07:47 +0200 Subject: [PATCH 0138/1571] Fix test_vm_bit_clear_on_heap_lock test The test was supposed to reproduce the bug fixed in commit 66fa176cc8, i.e. that the clearing of the VM bit was not replayed in the pageserver on HEAP_LOCK records. But it was broken in many ways and failed to reproduce the original problem if you reverted the fix: - The comparison of XIDs was broken. The test read the XID in to a variable in python, but it was treated as a string rather than an integer. As a result, e.g. "999" > "1000". - The test accessed the locked tuple too early, in the loop. Accessing it early, before the pg_xact page had been removed, set the hint bits. That masked the problem on subsequent accesses. - The on-demand SLRU download that was introduced in commit 9a9d9beaee hid the issue. Even though an SLRU segment was removed by Postgres, when it later tried to access it, it could still download it from the pageserver. To ensure that doesn't happen, shorten the GC period and compact and GC aggressively in the test. I also added a more direct check that the VM page is updated, using the get_page_at_lsn() debugging function. Right after locking the row, we now fetch the VM page from pageserver and directly compare it with the VM page in the page cache. They should match. That assertion is more robust to things like on-demand SLRU download that could mask the bug. 
--- test_runner/regress/test_vm_bits.py | 118 +++++++++++++++++----------- 1 file changed, 72 insertions(+), 46 deletions(-) diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py index 415f086bd3..06c30b8d81 100644 --- a/test_runner/regress/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -1,6 +1,7 @@ -import pytest +import time + from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, fork_at_current_lsn +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, fork_at_current_lsn # @@ -118,12 +119,20 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv): # Test that the ALL_FROZEN VM bit is cleared correctly at a HEAP_LOCK # record. # -# FIXME: This test is broken -@pytest.mark.skip("See https://github.com/neondatabase/neon/pull/6412#issuecomment-1902072541") -def test_vm_bit_clear_on_heap_lock(neon_simple_env: NeonEnv): - env = neon_simple_env +def test_vm_bit_clear_on_heap_lock(neon_env_builder: NeonEnvBuilder): + tenant_conf = { + "checkpoint_distance": f"{128 * 1024}", + "compaction_target_size": f"{128 * 1024}", + "compaction_threshold": "1", + # create image layers eagerly, so that GC can remove some layers + "image_creation_threshold": "1", + # set PITR interval to be small, so we can do GC + "pitr_interval": "0 s", + } + env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf) - env.neon_cli.create_branch("test_vm_bit_clear_on_heap_lock", "empty") + tenant_id = env.initial_tenant + timeline_id = env.neon_cli.create_branch("test_vm_bit_clear_on_heap_lock") endpoint = env.endpoints.create_start( "test_vm_bit_clear_on_heap_lock", config_lines=[ @@ -139,72 +148,88 @@ def test_vm_bit_clear_on_heap_lock(neon_simple_env: NeonEnv): # Install extension containing function needed for test cur.execute("CREATE EXTENSION neon_test_utils") - - cur.execute("SELECT pg_switch_wal()") + cur.execute("CREATE EXTENSION pageinspect") # Create a test table and freeze it to set the all-frozen VM bit on all pages. cur.execute("CREATE TABLE vmtest_lock (id integer PRIMARY KEY)") cur.execute("INSERT INTO vmtest_lock SELECT g FROM generate_series(1, 50000) g") - cur.execute("VACUUM FREEZE vmtest_lock") + + cur.execute("VACUUM (FREEZE, DISABLE_PAGE_SKIPPING true) vmtest_lock") # Lock a row. This clears the all-frozen VM bit for that page. + cur.execute("BEGIN") cur.execute("SELECT * FROM vmtest_lock WHERE id = 40000 FOR UPDATE") # Remember the XID. We will use it later to verify that we have consumed a lot of # XIDs after this. cur.execute("select pg_current_xact_id()") - locking_xid = cur.fetchall()[0][0] + locking_xid = int(cur.fetchall()[0][0]) - # Stop and restart postgres, to clear the buffer cache. + cur.execute("COMMIT") + + # The VM page in shared buffer cache, and the same page as reconstructed + # by the pageserver, should be equal. + cur.execute("select get_raw_page( 'vmtest_lock', 'vm', 0 )") + vm_page_in_cache = (cur.fetchall()[0][0])[:100].hex() + cur.execute("select get_raw_page_at_lsn( 'vmtest_lock', 'vm', 0, pg_current_wal_insert_lsn() )") + vm_page_at_pageserver = (cur.fetchall()[0][0])[:100].hex() + + assert vm_page_at_pageserver == vm_page_in_cache + + # The above assert is enough to verify the bug that was fixed in + # commit 66fa176cc8. But for good measure, we also reproduce the + # original problem that the missing VM page update caused. The + # rest of the test does that. + + # Kill and restart postgres, to clear the buffer cache. 
# # NOTE: clear_buffer_cache() will not do, because it evicts the dirty pages # in a "clean" way. Our neon extension will write a full-page image of the VM - # page, and we want to avoid that. - endpoint.stop() + # page, and we want to avoid that. A clean shutdown will also not do, for the + # same reason. + endpoint.stop(mode="immediate") + endpoint.start() pg_conn = endpoint.connect() cur = pg_conn.cursor() - cur.execute("select xmin, xmax, * from vmtest_lock where id = 40000 ") - tup = cur.fetchall() - xmax_before = tup[0][1] - # Consume a lot of XIDs, so that anti-wraparound autovacuum kicks # in and the clog gets truncated. We set autovacuum_freeze_max_age to a very # low value, so it doesn't take all that many XIDs for autovacuum to kick in. - for i in range(1000): + # + # We could use test_consume_xids() to consume XIDs much faster, + # but it wouldn't speed up the overall test, because we'd still + # need to wait for autovacuum to run. + for _ in range(1000): + cur.execute("select test_consume_xids(10000);") + for _ in range(1000): cur.execute( - """ - CREATE TEMP TABLE othertable (i int) ON COMMIT DROP; - do $$ - begin - for i in 1..100000 loop - -- Use a begin-exception block to generate a new subtransaction on each iteration - begin - insert into othertable values (i); - exception when others then - raise 'not expected %', sqlerrm; - end; - end loop; - end; - $$; - """ + "select get_raw_page_at_lsn( 'vmtest_lock', 'vm', 0, pg_current_wal_insert_lsn() )" ) - cur.execute("select xmin, xmax, * from vmtest_lock where id = 40000 ") - tup = cur.fetchall() - log.info(f"tuple = {tup}") - xmax = tup[0][1] - assert xmax == xmax_before + page = (cur.fetchall()[0][0])[:100].hex() + log.info(f"VM page contents: {page}") - if i % 50 == 0: - cur.execute("select datfrozenxid from pg_database where datname='postgres'") - datfrozenxid = cur.fetchall()[0][0] - if datfrozenxid > locking_xid: - break + cur.execute("select get_raw_page( 'vmtest_lock', 'vm', 0 )") + page = (cur.fetchall()[0][0])[:100].hex() + log.info(f"VM page contents in cache: {page}") + + cur.execute("select min(datfrozenxid::text::int) from pg_database") + datfrozenxid = int(cur.fetchall()[0][0]) + log.info(f"datfrozenxid {datfrozenxid} locking_xid: {locking_xid}") + if datfrozenxid > locking_xid + 3000000: + break + time.sleep(0.5) cur.execute("select pg_current_xact_id()") - curr_xid = cur.fetchall()[0][0] - assert int(curr_xid) - int(locking_xid) >= 100000 + curr_xid = int(cur.fetchall()[0][0]) + assert curr_xid - locking_xid >= 100000 + + # Perform GC in the pageserver. Otherwise the compute might still + # be able to download the already-deleted SLRU segment from the + # pageserver. That masks the original bug. 
+ env.pageserver.http_client().timeline_checkpoint(tenant_id, timeline_id) + env.pageserver.http_client().timeline_compact(tenant_id, timeline_id) + env.pageserver.http_client().timeline_gc(tenant_id, timeline_id, 0) # Now, if the VM all-frozen bit was not correctly cleared on # replay, we will try to fetch the status of the XID that was @@ -214,3 +239,4 @@ def test_vm_bit_clear_on_heap_lock(neon_simple_env: NeonEnv): cur.execute("select xmin, xmax, * from vmtest_lock where id = 40000 for update") tup = cur.fetchall() log.info(f"tuple = {tup}") + cur.execute("commit transaction") From 89a5c654bfc688babcdfa6c9dcda68876c0d6f98 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 9 Feb 2024 14:26:50 +0000 Subject: [PATCH 0139/1571] control_plane: follow up for embedded migrations (#6647) ## Problem In https://github.com/neondatabase/neon/pull/6637, we remove the need to run migrations externally, but for compat tests to work we can't remove those invocations from the neon_local binary. Once that previous PR merges, we can make the followup changes without upsetting compat tests. --- Cargo.lock | 4 - control_plane/Cargo.toml | 2 - control_plane/src/attachment_service.rs | 118 +++++------------------- workspace_hack/Cargo.toml | 2 - 4 files changed, 22 insertions(+), 104 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c0c319cd89..a2939e6c75 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1329,8 +1329,6 @@ dependencies = [ "clap", "comfy-table", "compute_api", - "diesel", - "diesel_migrations", "futures", "git-version", "hex", @@ -6832,8 +6830,6 @@ dependencies = [ "clap", "clap_builder", "crossbeam-utils", - "diesel", - "diesel_derives", "either", "fail", "futures-channel", diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 09c171f1d3..75e5dcb7f8 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -10,8 +10,6 @@ async-trait.workspace = true camino.workspace = true clap.workspace = true comfy-table.workspace = true -diesel = { version = "2.1.4", features = ["postgres"]} -diesel_migrations = { version = "2.1.0", features = ["postgres"]} futures.workspace = true git-version.workspace = true nix.workspace = true diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/attachment_service.rs index c3e071aa71..14bfda47c3 100644 --- a/control_plane/src/attachment_service.rs +++ b/control_plane/src/attachment_service.rs @@ -1,11 +1,5 @@ use crate::{background_process, local_env::LocalEnv}; use camino::{Utf8Path, Utf8PathBuf}; -use diesel::{ - backend::Backend, - query_builder::{AstPass, QueryFragment, QueryId}, - Connection, PgConnection, QueryResult, RunQueryDsl, -}; -use diesel_migrations::{HarnessWithOutput, MigrationHarness}; use hyper::Method; use pageserver_api::{ models::{ @@ -17,7 +11,7 @@ use pageserver_api::{ use pageserver_client::mgmt_api::ResponseErrorMessageExt; use postgres_backend::AuthType; use serde::{de::DeserializeOwned, Deserialize, Serialize}; -use std::{env, str::FromStr}; +use std::str::FromStr; use tokio::process::Command; use tracing::instrument; use url::Url; @@ -273,37 +267,6 @@ impl AttachmentService { .expect("non-Unicode path") } - /// In order to access database migrations, we need to find the Neon source tree - async fn find_source_root(&self) -> anyhow::Result { - // We assume that either prd or our binary is in the source tree. The former is usually - // true for automated test runners, the latter is usually true for developer workstations. Often - // both are true, which is fine. 
- let candidate_start_points = [ - // Current working directory - Utf8PathBuf::from_path_buf(std::env::current_dir()?).unwrap(), - // Directory containing the binary we're running inside - Utf8PathBuf::from_path_buf(env::current_exe()?.parent().unwrap().to_owned()).unwrap(), - ]; - - // For each candidate start point, search through ancestors looking for a neon.git source tree root - for start_point in &candidate_start_points { - // Start from the build dir: assumes we are running out of a built neon source tree - for path in start_point.ancestors() { - // A crude approximation: the root of the source tree is whatever contains a "control_plane" - // subdirectory. - let control_plane = path.join("control_plane"); - if tokio::fs::try_exists(&control_plane).await? { - return Ok(path.to_owned()); - } - } - } - - // Fall-through - Err(anyhow::anyhow!( - "Could not find control_plane src dir, after searching ancestors of {candidate_start_points:?}" - )) - } - /// Find the directory containing postgres binaries, such as `initdb` and `pg_ctl` /// /// This usually uses ATTACHMENT_SERVICE_POSTGRES_VERSION of postgres, but will fall back @@ -343,69 +306,32 @@ impl AttachmentService { /// /// Returns the database url pub async fn setup_database(&self) -> anyhow::Result { - let database_url = format!( - "postgresql://localhost:{}/attachment_service", - self.postgres_port - ); - println!("Running attachment service database setup..."); - fn change_database_of_url(database_url: &str, default_database: &str) -> (String, String) { - let base = ::url::Url::parse(database_url).unwrap(); - let database = base.path_segments().unwrap().last().unwrap().to_owned(); - let mut new_url = base.join(default_database).unwrap(); - new_url.set_query(base.query()); - (database, new_url.into()) - } + const DB_NAME: &str = "attachment_service"; + let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port); - #[derive(Debug, Clone)] - pub struct CreateDatabaseStatement { - db_name: String, - } + let pg_bin_dir = self.get_pg_bin_dir().await?; + let createdb_path = pg_bin_dir.join("createdb"); + let output = Command::new(&createdb_path) + .args([ + "-h", + "localhost", + "-p", + &format!("{}", self.postgres_port), + &DB_NAME, + ]) + .output() + .await + .expect("Failed to spawn createdb"); - impl CreateDatabaseStatement { - pub fn new(db_name: &str) -> Self { - CreateDatabaseStatement { - db_name: db_name.to_owned(), - } + if !output.status.success() { + let stderr = String::from_utf8(output.stderr).expect("Non-UTF8 output from createdb"); + if stderr.contains("already exists") { + tracing::info!("Database {DB_NAME} already exists"); + } else { + anyhow::bail!("createdb failed with status {}: {stderr}", output.status); } } - impl QueryFragment for CreateDatabaseStatement { - fn walk_ast<'b>(&'b self, mut out: AstPass<'_, 'b, DB>) -> QueryResult<()> { - out.push_sql("CREATE DATABASE "); - out.push_identifier(&self.db_name)?; - Ok(()) - } - } - - impl RunQueryDsl for CreateDatabaseStatement {} - - impl QueryId for CreateDatabaseStatement { - type QueryId = (); - - const HAS_STATIC_QUERY_ID: bool = false; - } - if PgConnection::establish(&database_url).is_err() { - let (database, postgres_url) = change_database_of_url(&database_url, "postgres"); - println!("Creating database: {database}"); - let mut conn = PgConnection::establish(&postgres_url)?; - CreateDatabaseStatement::new(&database).execute(&mut conn)?; - } - let mut conn = PgConnection::establish(&database_url)?; - - let migrations_dir = self - 
.find_source_root() - .await? - .join("control_plane/attachment_service/migrations"); - - let migrations = diesel_migrations::FileBasedMigrations::from_path(migrations_dir)?; - println!("Running migrations in {}", migrations.path().display()); - HarnessWithOutput::write_to_stdout(&mut conn) - .run_pending_migrations(migrations) - .map(|_| ()) - .map_err(|e| anyhow::anyhow!(e))?; - - println!("Migrations complete"); - Ok(database_url) } diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 70b238913d..8e9cc43152 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -29,7 +29,6 @@ chrono = { version = "0.4", default-features = false, features = ["clock", "serd clap = { version = "4", features = ["derive", "string"] } clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] } crossbeam-utils = { version = "0.8" } -diesel = { version = "2", features = ["postgres", "r2d2", "serde_json"] } either = { version = "1" } fail = { version = "0.5", default-features = false, features = ["failpoints"] } futures-channel = { version = "0.3", features = ["sink"] } @@ -90,7 +89,6 @@ anyhow = { version = "1", features = ["backtrace"] } bytes = { version = "1", features = ["serde"] } cc = { version = "1", default-features = false, features = ["parallel"] } chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] } -diesel_derives = { version = "2", features = ["32-column-tables", "postgres", "r2d2", "with-deprecated"] } either = { version = "1" } getrandom = { version = "0.2", default-features = false, features = ["std"] } hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", default-features = false, features = ["raw"] } From 96d89cde5108850d1f0f41c23ff175552297ab9d Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 9 Feb 2024 15:50:51 +0000 Subject: [PATCH 0140/1571] Proxy error reworking (#6453) ## Problem Taking my ideas from https://github.com/neondatabase/neon/pull/6283 and doing a bit less radical changes. smaller commits. We currently don't report error classifications in proxy as the current error handling made it hard to do so. ## Summary of changes 1. Add a `ReportableError` trait that all errors will implement. This provides the error classification functionality. 2. Handle Client requests a strongly typed error * this error is a `ReportableError` and is logged appropriately 3. The handle client error only has a few possible error types, to account for the fact that at this point errors should be returned to the user. 
--- proxy/src/auth.rs | 37 ++++++++- proxy/src/auth/backend/classic.rs | 4 +- proxy/src/auth/backend/link.rs | 18 ++-- proxy/src/auth/credentials.rs | 14 +++- proxy/src/bin/pg_sni_router.rs | 11 ++- proxy/src/cancellation.rs | 37 +++++++-- proxy/src/compute.rs | 22 ++++- proxy/src/console/provider.rs | 31 ++++++- proxy/src/context.rs | 18 +++- proxy/src/context/parquet.rs | 2 +- proxy/src/error.rs | 38 +++++++-- proxy/src/metrics.rs | 19 +++++ proxy/src/proxy.rs | 95 ++++++++++++++++++---- proxy/src/proxy/handshake.rs | 76 +++++++++++++---- proxy/src/proxy/passthrough.rs | 23 ++++-- proxy/src/proxy/tests.rs | 8 +- proxy/src/proxy/tests/mitm.rs | 10 +-- proxy/src/sasl.rs | 14 +++- proxy/src/serverless.rs | 14 ++-- proxy/src/serverless/backend.rs | 29 +++++-- proxy/src/serverless/conn_pool.rs | 4 +- proxy/src/serverless/json.rs | 32 ++++++-- proxy/src/serverless/sql_over_http.rs | 113 ++++++++++++-------------- proxy/src/serverless/websocket.rs | 30 +++++-- proxy/src/stream.rs | 75 ++++++++++++++--- 25 files changed, 588 insertions(+), 186 deletions(-) diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index 8d1b861a66..48de4e2353 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -5,7 +5,8 @@ pub use backend::BackendType; mod credentials; pub use credentials::{ - check_peer_addr_is_in_list, endpoint_sni, ComputeUserInfoMaybeEndpoint, IpPattern, + check_peer_addr_is_in_list, endpoint_sni, ComputeUserInfoMaybeEndpoint, + ComputeUserInfoParseError, IpPattern, }; mod password_hack; @@ -14,8 +15,12 @@ use password_hack::PasswordHackPayload; mod flow; pub use flow::*; +use tokio::time::error::Elapsed; -use crate::{console, error::UserFacingError}; +use crate::{ + console, + error::{ReportableError, UserFacingError}, +}; use std::io; use thiserror::Error; @@ -67,6 +72,9 @@ pub enum AuthErrorImpl { #[error("Too many connections to this endpoint. 
Please try again later.")] TooManyConnections, + + #[error("Authentication timed out")] + UserTimeout(Elapsed), } #[derive(Debug, Error)] @@ -93,6 +101,10 @@ impl AuthError { pub fn is_auth_failed(&self) -> bool { matches!(self.0.as_ref(), AuthErrorImpl::AuthFailed(_)) } + + pub fn user_timeout(elapsed: Elapsed) -> Self { + AuthErrorImpl::UserTimeout(elapsed).into() + } } impl> From for AuthError { @@ -116,6 +128,27 @@ impl UserFacingError for AuthError { Io(_) => "Internal error".to_string(), IpAddressNotAllowed => self.to_string(), TooManyConnections => self.to_string(), + UserTimeout(_) => self.to_string(), + } + } +} + +impl ReportableError for AuthError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + use AuthErrorImpl::*; + match self.0.as_ref() { + Link(e) => e.get_error_kind(), + GetAuthInfo(e) => e.get_error_kind(), + WakeCompute(e) => e.get_error_kind(), + Sasl(e) => e.get_error_kind(), + AuthFailed(_) => crate::error::ErrorKind::User, + BadAuthMethod(_) => crate::error::ErrorKind::User, + MalformedPassword(_) => crate::error::ErrorKind::User, + MissingEndpointName => crate::error::ErrorKind::User, + Io(_) => crate::error::ErrorKind::ClientDisconnect, + IpAddressNotAllowed => crate::error::ErrorKind::User, + TooManyConnections => crate::error::ErrorKind::RateLimit, + UserTimeout(_) => crate::error::ErrorKind::User, } } } diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index 384063ceae..745dd75107 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -45,9 +45,9 @@ pub(super) async fn authenticate( } ) .await - .map_err(|error| { + .map_err(|e| { warn!("error processing scram messages error = authentication timed out, execution time exeeded {} seconds", config.scram_protocol_timeout.as_secs()); - auth::io::Error::new(auth::io::ErrorKind::TimedOut, error) + auth::AuthError::user_timeout(e) })??; let client_key = match auth_outcome { diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index d8ae362c03..c71637dd1a 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -2,7 +2,7 @@ use crate::{ auth, compute, console::{self, provider::NodeInfo}, context::RequestMonitoring, - error::UserFacingError, + error::{ReportableError, UserFacingError}, stream::PqStream, waiters, }; @@ -14,10 +14,6 @@ use tracing::{info, info_span}; #[derive(Debug, Error)] pub enum LinkAuthError { - /// Authentication error reported by the console. - #[error("Authentication failed: {0}")] - AuthFailed(String), - #[error(transparent)] WaiterRegister(#[from] waiters::RegisterError), @@ -30,10 +26,16 @@ pub enum LinkAuthError { impl UserFacingError for LinkAuthError { fn to_string_client(&self) -> String { - use LinkAuthError::*; + "Internal error".to_string() + } +} + +impl ReportableError for LinkAuthError { + fn get_error_kind(&self) -> crate::error::ErrorKind { match self { - AuthFailed(_) => self.to_string(), - _ => "Internal error".to_string(), + LinkAuthError::WaiterRegister(_) => crate::error::ErrorKind::Service, + LinkAuthError::WaiterWait(_) => crate::error::ErrorKind::Service, + LinkAuthError::Io(_) => crate::error::ErrorKind::ClientDisconnect, } } } diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 875baaec47..d32609e44c 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -1,8 +1,12 @@ //! User credentials used in authentication. 
use crate::{ - auth::password_hack::parse_endpoint_param, context::RequestMonitoring, error::UserFacingError, - metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::NeonOptions, serverless::SERVERLESS_DRIVER_SNI, + auth::password_hack::parse_endpoint_param, + context::RequestMonitoring, + error::{ReportableError, UserFacingError}, + metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, + proxy::NeonOptions, + serverless::SERVERLESS_DRIVER_SNI, EndpointId, RoleName, }; use itertools::Itertools; @@ -39,6 +43,12 @@ pub enum ComputeUserInfoParseError { impl UserFacingError for ComputeUserInfoParseError {} +impl ReportableError for ComputeUserInfoParseError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + crate::error::ErrorKind::User + } +} + /// Various client credentials which we use for authentication. /// Note that we don't store any kind of client key or password here. #[derive(Debug, Clone, PartialEq, Eq)] diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 471be7af25..43b805e8a1 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -240,7 +240,9 @@ async fn ssl_handshake( ?unexpected, "unexpected startup packet, rejecting connection" ); - stream.throw_error_str(ERR_INSECURE_CONNECTION).await? + stream + .throw_error_str(ERR_INSECURE_CONNECTION, proxy::error::ErrorKind::User) + .await? } } } @@ -272,5 +274,10 @@ async fn handle_client( let client = tokio::net::TcpStream::connect(destination).await?; let metrics_aux: MetricsAuxInfo = Default::default(); - proxy::proxy::passthrough::proxy_pass(ctx, tls_stream, client, metrics_aux).await + + // doesn't yet matter as pg-sni-router doesn't report analytics logs + ctx.set_success(); + ctx.log(); + + proxy::proxy::passthrough::proxy_pass(tls_stream, client, metrics_aux).await } diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index d4ee657144..fe614628d8 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -1,24 +1,45 @@ -use anyhow::Context; use dashmap::DashMap; use pq_proto::CancelKeyData; use std::{net::SocketAddr, sync::Arc}; +use thiserror::Error; use tokio::net::TcpStream; use tokio_postgres::{CancelToken, NoTls}; use tracing::info; +use crate::error::ReportableError; + /// Enables serving `CancelRequest`s. #[derive(Default)] pub struct CancelMap(DashMap>); +#[derive(Debug, Error)] +pub enum CancelError { + #[error("{0}")] + IO(#[from] std::io::Error), + #[error("{0}")] + Postgres(#[from] tokio_postgres::Error), +} + +impl ReportableError for CancelError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + CancelError::IO(_) => crate::error::ErrorKind::Compute, + CancelError::Postgres(e) if e.as_db_error().is_some() => { + crate::error::ErrorKind::Postgres + } + CancelError::Postgres(_) => crate::error::ErrorKind::Compute, + } + } +} + impl CancelMap { /// Cancel a running query for the corresponding connection. - pub async fn cancel_session(&self, key: CancelKeyData) -> anyhow::Result<()> { + pub async fn cancel_session(&self, key: CancelKeyData) -> Result<(), CancelError> { // NB: we should immediately release the lock after cloning the token. 
- let cancel_closure = self - .0 - .get(&key) - .and_then(|x| x.clone()) - .with_context(|| format!("query cancellation key not found: {key}"))?; + let Some(cancel_closure) = self.0.get(&key).and_then(|x| x.clone()) else { + tracing::warn!("query cancellation key not found: {key}"); + return Ok(()); + }; info!("cancelling query per user's request using key {key}"); cancel_closure.try_cancel_query().await @@ -81,7 +102,7 @@ impl CancelClosure { } /// Cancels the query running on user's compute node. - pub async fn try_cancel_query(self) -> anyhow::Result<()> { + async fn try_cancel_query(self) -> Result<(), CancelError> { let socket = TcpStream::connect(self.socket_addr).await?; self.cancel_token.cancel_query_raw(socket, NoTls).await?; diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index aef1aab733..83940d80ec 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,6 +1,10 @@ use crate::{ - auth::parse_endpoint_param, cancellation::CancelClosure, console::errors::WakeComputeError, - context::RequestMonitoring, error::UserFacingError, metrics::NUM_DB_CONNECTIONS_GAUGE, + auth::parse_endpoint_param, + cancellation::CancelClosure, + console::errors::WakeComputeError, + context::RequestMonitoring, + error::{ReportableError, UserFacingError}, + metrics::NUM_DB_CONNECTIONS_GAUGE, proxy::neon_option, }; use futures::{FutureExt, TryFutureExt}; @@ -58,6 +62,20 @@ impl UserFacingError for ConnectionError { } } +impl ReportableError for ConnectionError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + ConnectionError::Postgres(e) if e.as_db_error().is_some() => { + crate::error::ErrorKind::Postgres + } + ConnectionError::Postgres(_) => crate::error::ErrorKind::Compute, + ConnectionError::CouldNotConnect(_) => crate::error::ErrorKind::Compute, + ConnectionError::TlsError(_) => crate::error::ErrorKind::Compute, + ConnectionError::WakeComputeError(e) => e.get_error_kind(), + } + } +} + /// A pair of `ClientKey` & `ServerKey` for `SCRAM-SHA-256`. pub type ScramKeys = tokio_postgres::config::ScramKeys<32>; diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index c53d929470..e5cad42753 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -20,7 +20,7 @@ use tracing::info; pub mod errors { use crate::{ - error::{io_error, UserFacingError}, + error::{io_error, ReportableError, UserFacingError}, http, proxy::retry::ShouldRetry, }; @@ -81,6 +81,15 @@ pub mod errors { } } + impl ReportableError for ApiError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + ApiError::Console { .. 
} => crate::error::ErrorKind::ControlPlane, + ApiError::Transport(_) => crate::error::ErrorKind::ControlPlane, + } + } + } + impl ShouldRetry for ApiError { fn could_retry(&self) -> bool { match self { @@ -150,6 +159,16 @@ pub mod errors { } } } + + impl ReportableError for GetAuthInfoError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + GetAuthInfoError::BadSecret => crate::error::ErrorKind::ControlPlane, + GetAuthInfoError::ApiError(_) => crate::error::ErrorKind::ControlPlane, + } + } + } + #[derive(Debug, Error)] pub enum WakeComputeError { #[error("Console responded with a malformed compute address: {0}")] @@ -194,6 +213,16 @@ pub mod errors { } } } + + impl ReportableError for WakeComputeError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + WakeComputeError::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane, + WakeComputeError::ApiError(e) => e.get_error_kind(), + WakeComputeError::TimeoutError => crate::error::ErrorKind::RateLimit, + } + } + } } /// Auth secret which is managed by the cloud. diff --git a/proxy/src/context.rs b/proxy/src/context.rs index fe204534b7..d2bf3f68d3 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -8,8 +8,10 @@ use tokio::sync::mpsc; use uuid::Uuid; use crate::{ - console::messages::MetricsAuxInfo, error::ErrorKind, metrics::LatencyTimer, BranchId, - EndpointId, ProjectId, RoleName, + console::messages::MetricsAuxInfo, + error::ErrorKind, + metrics::{LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND}, + BranchId, EndpointId, ProjectId, RoleName, }; pub mod parquet; @@ -108,6 +110,18 @@ impl RequestMonitoring { self.user = Some(user); } + pub fn set_error_kind(&mut self, kind: ErrorKind) { + ERROR_BY_KIND + .with_label_values(&[kind.to_metric_label()]) + .inc(); + if let Some(ep) = &self.endpoint_id { + ENDPOINT_ERRORS_BY_KIND + .with_label_values(&[kind.to_metric_label()]) + .measure(ep); + } + self.error_kind = Some(kind); + } + pub fn set_success(&mut self) { self.success = true; } diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 8510c5c586..0fe46915bc 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -108,7 +108,7 @@ impl From for RequestData { branch: value.branch.as_deref().map(String::from), protocol: value.protocol, region: value.region, - error: value.error_kind.as_ref().map(|e| e.to_str()), + error: value.error_kind.as_ref().map(|e| e.to_metric_label()), success: value.success, duration_us: SystemTime::from(value.first_packet) .elapsed() diff --git a/proxy/src/error.rs b/proxy/src/error.rs index 5b2dd7ecfd..eafe92bf48 100644 --- a/proxy/src/error.rs +++ b/proxy/src/error.rs @@ -17,7 +17,7 @@ pub fn log_error(e: E) -> E { /// NOTE: This trait should not be implemented for [`anyhow::Error`], since it /// is way too convenient and tends to proliferate all across the codebase, /// ultimately leading to accidental leaks of sensitive data. -pub trait UserFacingError: fmt::Display { +pub trait UserFacingError: ReportableError { /// Format the error for client, stripping all sensitive info. /// /// Although this might be a no-op for many types, it's highly @@ -29,13 +29,13 @@ pub trait UserFacingError: fmt::Display { } } -#[derive(Clone)] +#[derive(Copy, Clone, Debug)] pub enum ErrorKind { /// Wrong password, unknown endpoint, protocol violation, etc... User, /// Network error between user and proxy. 
Not necessarily user error - Disconnect, + ClientDisconnect, /// Proxy self-imposed rate limits RateLimit, @@ -46,6 +46,9 @@ pub enum ErrorKind { /// Error communicating with control plane ControlPlane, + /// Postgres error + Postgres, + /// Error communicating with compute Compute, } @@ -54,11 +57,36 @@ impl ErrorKind { pub fn to_str(&self) -> &'static str { match self { ErrorKind::User => "request failed due to user error", - ErrorKind::Disconnect => "client disconnected", + ErrorKind::ClientDisconnect => "client disconnected", ErrorKind::RateLimit => "request cancelled due to rate limit", ErrorKind::Service => "internal service error", ErrorKind::ControlPlane => "non-retryable control plane error", - ErrorKind::Compute => "non-retryable compute error (or exhausted retry capacity)", + ErrorKind::Postgres => "postgres error", + ErrorKind::Compute => { + "non-retryable compute connection error (or exhausted retry capacity)" + } + } + } + + pub fn to_metric_label(&self) -> &'static str { + match self { + ErrorKind::User => "user", + ErrorKind::ClientDisconnect => "clientdisconnect", + ErrorKind::RateLimit => "ratelimit", + ErrorKind::Service => "service", + ErrorKind::ControlPlane => "controlplane", + ErrorKind::Postgres => "postgres", + ErrorKind::Compute => "compute", } } } + +pub trait ReportableError: fmt::Display + Send + 'static { + fn get_error_kind(&self) -> ErrorKind; +} + +impl ReportableError for tokio::time::error::Elapsed { + fn get_error_kind(&self) -> ErrorKind { + ErrorKind::RateLimit + } +} diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index e2d96a9c27..ccf89f9b05 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -274,3 +274,22 @@ pub static CONNECTING_ENDPOINTS: Lazy> = Lazy::new(|| { ) .unwrap() }); + +pub static ERROR_BY_KIND: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "proxy_errors_total", + "Number of errors by a given classification", + &["type"], + ) + .unwrap() +}); + +pub static ENDPOINT_ERRORS_BY_KIND: Lazy> = Lazy::new(|| { + register_hll_vec!( + 32, + "proxy_endpoints_affected_by_errors", + "Number of endpoints affected by errors of a given classification", + &["type"], + ) + .unwrap() +}); diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index b3b221d3e2..50e22ec72a 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -13,9 +13,10 @@ use crate::{ compute, config::{ProxyConfig, TlsConfig}, context::RequestMonitoring, + error::ReportableError, metrics::{NUM_CLIENT_CONNECTION_GAUGE, NUM_CONNECTION_REQUESTS_GAUGE}, protocol2::WithClientIp, - proxy::{handshake::handshake, passthrough::proxy_pass}, + proxy::handshake::{handshake, HandshakeData}, rate_limiter::EndpointRateLimiter, stream::{PqStream, Stream}, EndpointCacheKey, @@ -28,14 +29,17 @@ use pq_proto::{BeMessage as Be, StartupMessageParams}; use regex::Regex; use smol_str::{format_smolstr, SmolStr}; use std::sync::Arc; +use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; use tokio_util::sync::CancellationToken; use tracing::{error, info, info_span, Instrument}; -use self::connect_compute::{connect_to_compute, TcpMechanism}; +use self::{ + connect_compute::{connect_to_compute, TcpMechanism}, + passthrough::ProxyPassthrough, +}; const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; -const ERR_PROTO_VIOLATION: &str = "protocol violation"; pub async fn run_until_cancelled( f: F, @@ -98,14 +102,14 @@ pub async fn task_main( bail!("missing required client IP"); } - let mut ctx = 
RequestMonitoring::new(session_id, peer_addr, "tcp", &config.region); - socket .inner .set_nodelay(true) .context("failed to set socket option")?; - handle_client( + let mut ctx = RequestMonitoring::new(session_id, peer_addr, "tcp", &config.region); + + let res = handle_client( config, &mut ctx, cancel_map, @@ -113,7 +117,26 @@ pub async fn task_main( ClientMode::Tcp, endpoint_rate_limiter, ) - .await + .await; + + match res { + Err(e) => { + // todo: log and push to ctx the error kind + ctx.set_error_kind(e.get_error_kind()); + ctx.log(); + Err(e.into()) + } + Ok(None) => { + ctx.set_success(); + ctx.log(); + Ok(()) + } + Ok(Some(p)) => { + ctx.set_success(); + ctx.log(); + p.proxy_pass().await + } + } } .unwrap_or_else(move |e| { // Acknowledge that the task has finished with an error. @@ -169,6 +192,37 @@ impl ClientMode { } } +#[derive(Debug, Error)] +// almost all errors should be reported to the user, but there's a few cases where we cannot +// 1. Cancellation: we are not allowed to tell the client any cancellation statuses for security reasons +// 2. Handshake: handshake reports errors if it can, otherwise if the handshake fails due to protocol violation, +// we cannot be sure the client even understands our error message +// 3. PrepareClient: The client disconnected, so we can't tell them anyway... +pub enum ClientRequestError { + #[error("{0}")] + Cancellation(#[from] cancellation::CancelError), + #[error("{0}")] + Handshake(#[from] handshake::HandshakeError), + #[error("{0}")] + HandshakeTimeout(#[from] tokio::time::error::Elapsed), + #[error("{0}")] + PrepareClient(#[from] std::io::Error), + #[error("{0}")] + ReportedError(#[from] crate::stream::ReportedError), +} + +impl ReportableError for ClientRequestError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + ClientRequestError::Cancellation(e) => e.get_error_kind(), + ClientRequestError::Handshake(e) => e.get_error_kind(), + ClientRequestError::HandshakeTimeout(_) => crate::error::ErrorKind::RateLimit, + ClientRequestError::ReportedError(e) => e.get_error_kind(), + ClientRequestError::PrepareClient(_) => crate::error::ErrorKind::ClientDisconnect, + } + } +} + pub async fn handle_client( config: &'static ProxyConfig, ctx: &mut RequestMonitoring, @@ -176,7 +230,7 @@ pub async fn handle_client( stream: S, mode: ClientMode, endpoint_rate_limiter: Arc, -) -> anyhow::Result<()> { +) -> Result>, ClientRequestError> { info!( protocol = ctx.protocol, "handling interactive connection from client" @@ -193,11 +247,16 @@ pub async fn handle_client( let tls = config.tls_config.as_ref(); let pause = ctx.latency_timer.pause(); - let do_handshake = handshake(stream, mode.handshake_tls(tls), &cancel_map); + let do_handshake = handshake(stream, mode.handshake_tls(tls)); let (mut stream, params) = match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? { - Some(x) => x, - None => return Ok(()), // it's a cancellation request + HandshakeData::Startup(stream, params) => (stream, params), + HandshakeData::Cancel(cancel_key_data) => { + return Ok(cancel_map + .cancel_session(cancel_key_data) + .await + .map(|()| None)?) 
+ } }; drop(pause); @@ -222,7 +281,7 @@ pub async fn handle_client( if !endpoint_rate_limiter.check(ep) { return stream .throw_error(auth::AuthError::too_many_connections()) - .await; + .await?; } } @@ -242,7 +301,7 @@ pub async fn handle_client( let app = params.get("application_name"); let params_span = tracing::info_span!("", ?user, ?db, ?app); - return stream.throw_error(e).instrument(params_span).await; + return stream.throw_error(e).instrument(params_span).await?; } }; @@ -268,7 +327,13 @@ pub async fn handle_client( let (stream, read_buf) = stream.into_inner(); node.stream.write_all(&read_buf).await?; - proxy_pass(ctx, stream, node.stream, aux).await + Ok(Some(ProxyPassthrough { + client: stream, + compute: node, + aux, + req: _request_gauge, + conn: _client_gauge, + })) } /// Finish client connection initialization: confirm auth success, send params, etc. @@ -277,7 +342,7 @@ async fn prepare_client_connection( node: &compute::PostgresConnection, session: &cancellation::Session, stream: &mut PqStream, -) -> anyhow::Result<()> { +) -> Result<(), std::io::Error> { // Register compute's query cancellation token and produce a new, unique one. // The new token (cancel_key_data) will be sent to the client. let cancel_key_data = session.enable_query_cancellation(node.cancel_closure.clone()); diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs index 1ad8da20d7..4665e07d23 100644 --- a/proxy/src/proxy/handshake.rs +++ b/proxy/src/proxy/handshake.rs @@ -1,15 +1,60 @@ -use anyhow::{bail, Context}; -use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams}; +use pq_proto::{BeMessage as Be, CancelKeyData, FeStartupPacket, StartupMessageParams}; +use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; use crate::{ - cancellation::CancelMap, config::TlsConfig, - proxy::{ERR_INSECURE_CONNECTION, ERR_PROTO_VIOLATION}, - stream::{PqStream, Stream}, + error::ReportableError, + proxy::ERR_INSECURE_CONNECTION, + stream::{PqStream, Stream, StreamUpgradeError}, }; +#[derive(Error, Debug)] +pub enum HandshakeError { + #[error("data is sent before server replied with EncryptionResponse")] + EarlyData, + + #[error("protocol violation")] + ProtocolViolation, + + #[error("missing certificate")] + MissingCertificate, + + #[error("{0}")] + StreamUpgradeError(#[from] StreamUpgradeError), + + #[error("{0}")] + Io(#[from] std::io::Error), + + #[error("{0}")] + ReportedError(#[from] crate::stream::ReportedError), +} + +impl ReportableError for HandshakeError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + HandshakeError::EarlyData => crate::error::ErrorKind::User, + HandshakeError::ProtocolViolation => crate::error::ErrorKind::User, + // This error should not happen, but will if we have no default certificate and + // the client sends no SNI extension. + // If they provide SNI then we can be sure there is a certificate that matches. + HandshakeError::MissingCertificate => crate::error::ErrorKind::Service, + HandshakeError::StreamUpgradeError(upgrade) => match upgrade { + StreamUpgradeError::AlreadyTls => crate::error::ErrorKind::Service, + StreamUpgradeError::Io(_) => crate::error::ErrorKind::ClientDisconnect, + }, + HandshakeError::Io(_) => crate::error::ErrorKind::ClientDisconnect, + HandshakeError::ReportedError(e) => e.get_error_kind(), + } + } +} + +pub enum HandshakeData { + Startup(PqStream>, StartupMessageParams), + Cancel(CancelKeyData), +} + /// Establish a (most probably, secure) connection with the client. 
/// For better testing experience, `stream` can be any object satisfying the traits. /// It's easier to work with owned `stream` here as we need to upgrade it to TLS; @@ -18,8 +63,7 @@ use crate::{ pub async fn handshake( stream: S, mut tls: Option<&TlsConfig>, - cancel_map: &CancelMap, -) -> anyhow::Result>, StartupMessageParams)>> { +) -> Result, HandshakeError> { // Client may try upgrading to each protocol only once let (mut tried_ssl, mut tried_gss) = (false, false); @@ -49,14 +93,14 @@ pub async fn handshake( // pipelining in our node js driver. We should probably // support that by chaining read_buf with the stream. if !read_buf.is_empty() { - bail!("data is sent before server replied with EncryptionResponse"); + return Err(HandshakeError::EarlyData); } let tls_stream = raw.upgrade(tls.to_server_config()).await?; let (_, tls_server_end_point) = tls .cert_resolver .resolve(tls_stream.get_ref().1.server_name()) - .context("missing certificate")?; + .ok_or(HandshakeError::MissingCertificate)?; stream = PqStream::new(Stream::Tls { tls: Box::new(tls_stream), @@ -64,7 +108,7 @@ pub async fn handshake( }); } } - _ => bail!(ERR_PROTO_VIOLATION), + _ => return Err(HandshakeError::ProtocolViolation), }, GssEncRequest => match stream.get_ref() { Stream::Raw { .. } if !tried_gss => { @@ -73,23 +117,23 @@ pub async fn handshake( // Currently, we don't support GSSAPI stream.write_message(&Be::EncryptionResponse(false)).await?; } - _ => bail!(ERR_PROTO_VIOLATION), + _ => return Err(HandshakeError::ProtocolViolation), }, StartupMessage { params, .. } => { // Check that the config has been consumed during upgrade // OR we didn't provide it at all (for dev purposes). if tls.is_some() { - stream.throw_error_str(ERR_INSECURE_CONNECTION).await?; + return stream + .throw_error_str(ERR_INSECURE_CONNECTION, crate::error::ErrorKind::User) + .await?; } info!(session_type = "normal", "successful handshake"); - break Ok(Some((stream, params))); + break Ok(HandshakeData::Startup(stream, params)); } CancelRequest(cancel_key_data) => { - cancel_map.cancel_session(cancel_key_data).await?; - info!(session_type = "cancellation", "successful handshake"); - break Ok(None); + break Ok(HandshakeData::Cancel(cancel_key_data)); } } } diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index 53e0c3c8f3..b7018c6fb5 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -1,9 +1,11 @@ use crate::{ + compute::PostgresConnection, console::messages::MetricsAuxInfo, - context::RequestMonitoring, metrics::NUM_BYTES_PROXIED_COUNTER, + stream::Stream, usage_metrics::{Ids, USAGE_METRICS}, }; +use metrics::IntCounterPairGuard; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; use utils::measured_stream::MeasuredStream; @@ -11,14 +13,10 @@ use utils::measured_stream::MeasuredStream; /// Forward bytes in both directions (client <-> compute). 
#[tracing::instrument(skip_all)] pub async fn proxy_pass( - ctx: &mut RequestMonitoring, client: impl AsyncRead + AsyncWrite + Unpin, compute: impl AsyncRead + AsyncWrite + Unpin, aux: MetricsAuxInfo, ) -> anyhow::Result<()> { - ctx.set_success(); - ctx.log(); - let usage = USAGE_METRICS.register(Ids { endpoint_id: aux.endpoint_id.clone(), branch_id: aux.branch_id.clone(), @@ -51,3 +49,18 @@ pub async fn proxy_pass( Ok(()) } + +pub struct ProxyPassthrough { + pub client: Stream, + pub compute: PostgresConnection, + pub aux: MetricsAuxInfo, + + pub req: IntCounterPairGuard, + pub conn: IntCounterPairGuard, +} + +impl ProxyPassthrough { + pub async fn proxy_pass(self) -> anyhow::Result<()> { + proxy_pass(self.client, self.compute.stream, self.aux).await + } +} diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 656cabac75..3e961afb41 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -163,11 +163,11 @@ async fn dummy_proxy( tls: Option, auth: impl TestAuth + Send, ) -> anyhow::Result<()> { - let cancel_map = CancelMap::default(); let client = WithClientIp::new(client); - let (mut stream, _params) = handshake(client, tls.as_ref(), &cancel_map) - .await? - .context("handshake failed")?; + let mut stream = match handshake(client, tls.as_ref()).await? { + HandshakeData::Startup(stream, _) => stream, + HandshakeData::Cancel(_) => bail!("cancellation not supported"), + }; auth.authenticate(&mut stream).await?; diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index a0a84a1dc0..ed89e51754 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -35,12 +35,10 @@ async fn proxy_mitm( tokio::spawn(async move { // begin handshake with end_server let end_server = connect_tls(server2, client_config2.make_tls_connect().unwrap()).await; - // process handshake with end_client - let (end_client, startup) = - handshake(client1, Some(&server_config1), &CancelMap::default()) - .await - .unwrap() - .unwrap(); + let (end_client, startup) = match handshake(client1, Some(&server_config1)).await.unwrap() { + HandshakeData::Startup(stream, params) => (stream, params), + HandshakeData::Cancel(_) => panic!("cancellation not supported"), + }; let mut end_server = tokio_util::codec::Framed::new(end_server, PgFrame); let (end_client, buf) = end_client.framed.into_inner(); diff --git a/proxy/src/sasl.rs b/proxy/src/sasl.rs index da1cf21c6a..1cf8b53e11 100644 --- a/proxy/src/sasl.rs +++ b/proxy/src/sasl.rs @@ -10,7 +10,7 @@ mod channel_binding; mod messages; mod stream; -use crate::error::UserFacingError; +use crate::error::{ReportableError, UserFacingError}; use std::io; use thiserror::Error; @@ -48,6 +48,18 @@ impl UserFacingError for Error { } } +impl ReportableError for Error { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + Error::ChannelBindingFailed(_) => crate::error::ErrorKind::User, + Error::ChannelBindingBadMethod(_) => crate::error::ErrorKind::User, + Error::BadClientMessage(_) => crate::error::ErrorKind::User, + Error::MissingBinding => crate::error::ErrorKind::Service, + Error::Io(_) => crate::error::ErrorKind::ClientDisconnect, + } + } +} + /// A convenient result type for SASL exchange. 
pub type Result = std::result::Result; diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index 58aa925a6a..a20600b94a 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -109,10 +109,9 @@ pub async fn task_main( let make_svc = hyper::service::make_service_fn( |stream: &tokio_rustls::server::TlsStream>| { - let (io, tls) = stream.get_ref(); + let (io, _) = stream.get_ref(); let client_addr = io.client_addr(); let remote_addr = io.inner.remote_addr(); - let sni_name = tls.server_name().map(|s| s.to_string()); let backend = backend.clone(); let ws_connections = ws_connections.clone(); let endpoint_rate_limiter = endpoint_rate_limiter.clone(); @@ -125,7 +124,6 @@ pub async fn task_main( }; Ok(MetricService::new(hyper::service::service_fn( move |req: Request| { - let sni_name = sni_name.clone(); let backend = backend.clone(); let ws_connections = ws_connections.clone(); let endpoint_rate_limiter = endpoint_rate_limiter.clone(); @@ -141,7 +139,6 @@ pub async fn task_main( ws_connections, cancel_map, session_id, - sni_name, peer_addr.ip(), endpoint_rate_limiter, ) @@ -210,7 +207,6 @@ async fn request_handler( ws_connections: TaskTracker, cancel_map: Arc, session_id: uuid::Uuid, - sni_hostname: Option, peer_addr: IpAddr, endpoint_rate_limiter: Arc, ) -> Result, ApiError> { @@ -230,11 +226,11 @@ async fn request_handler( ws_connections.spawn( async move { - let mut ctx = RequestMonitoring::new(session_id, peer_addr, "ws", &config.region); + let ctx = RequestMonitoring::new(session_id, peer_addr, "ws", &config.region); if let Err(e) = websocket::serve_websocket( config, - &mut ctx, + ctx, websocket, cancel_map, host, @@ -251,9 +247,9 @@ async fn request_handler( // Return the response so the spawned future can continue. Ok(response) } else if request.uri().path() == "/sql" && request.method() == Method::POST { - let mut ctx = RequestMonitoring::new(session_id, peer_addr, "http", &config.region); + let ctx = RequestMonitoring::new(session_id, peer_addr, "http", &config.region); - sql_over_http::handle(config, &mut ctx, request, sni_hostname, backend).await + sql_over_http::handle(config, ctx, request, backend).await } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS { Response::builder() .header("Allow", "OPTIONS, POST") diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 466a74f0ea..03257e9161 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -1,6 +1,5 @@ use std::{sync::Arc, time::Duration}; -use anyhow::Context; use async_trait::async_trait; use tracing::info; @@ -8,7 +7,10 @@ use crate::{ auth::{backend::ComputeCredentialKeys, check_peer_addr_is_in_list, AuthError}, compute, config::ProxyConfig, - console::CachedNodeInfo, + console::{ + errors::{GetAuthInfoError, WakeComputeError}, + CachedNodeInfo, + }, context::RequestMonitoring, proxy::connect_compute::ConnectMechanism, }; @@ -66,7 +68,7 @@ impl PoolingBackend { conn_info: ConnInfo, keys: ComputeCredentialKeys, force_new: bool, - ) -> anyhow::Result> { + ) -> Result, HttpConnError> { let maybe_client = if !force_new { info!("pool: looking for an existing connection"); self.pool.get(ctx, &conn_info).await? @@ -90,7 +92,7 @@ impl PoolingBackend { let mut node_info = backend .wake_compute(ctx) .await? 
- .context("missing cache entry from wake_compute")?; + .ok_or(HttpConnError::NoComputeInfo)?; match keys { #[cfg(any(test, feature = "testing"))] @@ -114,6 +116,23 @@ impl PoolingBackend { } } +#[derive(Debug, thiserror::Error)] +pub enum HttpConnError { + #[error("pooled connection closed at inconsistent state")] + ConnectionClosedAbruptly(#[from] tokio::sync::watch::error::SendError), + #[error("could not connection to compute")] + ConnectionError(#[from] tokio_postgres::Error), + + #[error("could not get auth info")] + GetAuthInfo(#[from] GetAuthInfoError), + #[error("user not authenticated")] + AuthError(#[from] AuthError), + #[error("wake_compute returned error")] + WakeCompute(#[from] WakeComputeError), + #[error("wake_compute returned nothing")] + NoComputeInfo, +} + struct TokioMechanism { pool: Arc>, conn_info: ConnInfo, @@ -124,7 +143,7 @@ struct TokioMechanism { impl ConnectMechanism for TokioMechanism { type Connection = Client; type ConnectError = tokio_postgres::Error; - type Error = anyhow::Error; + type Error = HttpConnError; async fn connect_once( &self, diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index a7b2c532d2..f92793096b 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -28,6 +28,8 @@ use crate::{ use tracing::{debug, error, warn, Span}; use tracing::{info, info_span, Instrument}; +use super::backend::HttpConnError; + pub const APP_NAME: SmolStr = SmolStr::new_inline("/sql_over_http"); #[derive(Debug, Clone)] @@ -358,7 +360,7 @@ impl GlobalConnPool { self: &Arc, ctx: &mut RequestMonitoring, conn_info: &ConnInfo, - ) -> anyhow::Result>> { + ) -> Result>, HttpConnError> { let mut client: Option> = None; let endpoint_pool = self.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key()); diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs index a089d34040..c22c63e85b 100644 --- a/proxy/src/serverless/json.rs +++ b/proxy/src/serverless/json.rs @@ -60,6 +60,20 @@ fn json_array_to_pg_array(value: &Value) -> Option { } } +#[derive(Debug, thiserror::Error)] +pub enum JsonConversionError { + #[error("internal error compute returned invalid data: {0}")] + AsTextError(tokio_postgres::Error), + #[error("parse int error: {0}")] + ParseIntError(#[from] std::num::ParseIntError), + #[error("parse float error: {0}")] + ParseFloatError(#[from] std::num::ParseFloatError), + #[error("parse json error: {0}")] + ParseJsonError(#[from] serde_json::Error), + #[error("unbalanced array")] + UnbalancedArray, +} + // // Convert postgres row with text-encoded values to JSON object // @@ -68,7 +82,7 @@ pub fn pg_text_row_to_json( columns: &[Type], raw_output: bool, array_mode: bool, -) -> Result { +) -> Result { let iter = row .columns() .iter() @@ -76,7 +90,7 @@ pub fn pg_text_row_to_json( .enumerate() .map(|(i, (column, typ))| { let name = column.name(); - let pg_value = row.as_text(i)?; + let pg_value = row.as_text(i).map_err(JsonConversionError::AsTextError)?; let json_value = if raw_output { match pg_value { Some(v) => Value::String(v.to_string()), @@ -92,10 +106,10 @@ pub fn pg_text_row_to_json( // drop keys and aggregate into array let arr = iter .map(|r| r.map(|(_key, val)| val)) - .collect::, anyhow::Error>>()?; + .collect::, JsonConversionError>>()?; Ok(Value::Array(arr)) } else { - let obj = iter.collect::, anyhow::Error>>()?; + let obj = iter.collect::, JsonConversionError>>()?; Ok(Value::Object(obj)) } } @@ -103,7 +117,7 @@ pub fn pg_text_row_to_json( // // Convert postgres 
text-encoded value to JSON value // -fn pg_text_to_json(pg_value: Option<&str>, pg_type: &Type) -> Result { +fn pg_text_to_json(pg_value: Option<&str>, pg_type: &Type) -> Result { if let Some(val) = pg_value { if let Kind::Array(elem_type) = pg_type.kind() { return pg_array_parse(val, elem_type); @@ -142,7 +156,7 @@ fn pg_text_to_json(pg_value: Option<&str>, pg_type: &Type) -> Result Result { +fn pg_array_parse(pg_array: &str, elem_type: &Type) -> Result { _pg_array_parse(pg_array, elem_type, false).map(|(v, _)| v) } @@ -150,7 +164,7 @@ fn _pg_array_parse( pg_array: &str, elem_type: &Type, nested: bool, -) -> Result<(Value, usize), anyhow::Error> { +) -> Result<(Value, usize), JsonConversionError> { let mut pg_array_chr = pg_array.char_indices(); let mut level = 0; let mut quote = false; @@ -170,7 +184,7 @@ fn _pg_array_parse( entry: &mut String, entries: &mut Vec, elem_type: &Type, - ) -> Result<(), anyhow::Error> { + ) -> Result<(), JsonConversionError> { if !entry.is_empty() { // While in usual postgres response we get nulls as None and everything else // as Some(&str), in arrays we get NULL as unquoted 'NULL' string (while @@ -234,7 +248,7 @@ fn _pg_array_parse( } if level != 0 { - return Err(anyhow::anyhow!("unbalanced array")); + return Err(JsonConversionError::UnbalancedArray); } Ok((Value::Array(entries), 0)) diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 25e8813625..401022347e 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -1,7 +1,6 @@ use std::sync::Arc; use anyhow::bail; -use anyhow::Context; use futures::pin_mut; use futures::StreamExt; use hyper::body::HttpBody; @@ -29,9 +28,11 @@ use utils::http::json::json_response; use crate::auth::backend::ComputeUserInfo; use crate::auth::endpoint_sni; +use crate::auth::ComputeUserInfoParseError; use crate::config::ProxyConfig; use crate::config::TlsConfig; use crate::context::RequestMonitoring; +use crate::error::ReportableError; use crate::metrics::HTTP_CONTENT_LENGTH; use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE; use crate::proxy::NeonOptions; @@ -41,7 +42,6 @@ use super::backend::PoolingBackend; use super::conn_pool::ConnInfo; use super::json::json_to_pg_text; use super::json::pg_text_row_to_json; -use super::SERVERLESS_DRIVER_SNI; #[derive(serde::Deserialize)] #[serde(rename_all = "camelCase")] @@ -86,67 +86,70 @@ where Ok(json_to_pg_text(json)) } +#[derive(Debug, thiserror::Error)] +pub enum ConnInfoError { + #[error("invalid header: {0}")] + InvalidHeader(&'static str), + #[error("invalid connection string: {0}")] + UrlParseError(#[from] url::ParseError), + #[error("incorrect scheme")] + IncorrectScheme, + #[error("missing database name")] + MissingDbName, + #[error("invalid database name")] + InvalidDbName, + #[error("missing username")] + MissingUsername, + #[error("missing password")] + MissingPassword, + #[error("missing hostname")] + MissingHostname, + #[error("invalid hostname: {0}")] + InvalidEndpoint(#[from] ComputeUserInfoParseError), + #[error("malformed endpoint")] + MalformedEndpoint, +} + fn get_conn_info( ctx: &mut RequestMonitoring, headers: &HeaderMap, - sni_hostname: Option, tls: &TlsConfig, -) -> Result { +) -> Result { let connection_string = headers .get("Neon-Connection-String") - .ok_or(anyhow::anyhow!("missing connection string"))? - .to_str()?; + .ok_or(ConnInfoError::InvalidHeader("Neon-Connection-String"))? 
+ .to_str() + .map_err(|_| ConnInfoError::InvalidHeader("Neon-Connection-String"))?; let connection_url = Url::parse(connection_string)?; let protocol = connection_url.scheme(); if protocol != "postgres" && protocol != "postgresql" { - return Err(anyhow::anyhow!( - "connection string must start with postgres: or postgresql:" - )); + return Err(ConnInfoError::IncorrectScheme); } let mut url_path = connection_url .path_segments() - .ok_or(anyhow::anyhow!("missing database name"))?; + .ok_or(ConnInfoError::MissingDbName)?; - let dbname = url_path - .next() - .ok_or(anyhow::anyhow!("invalid database name"))?; + let dbname = url_path.next().ok_or(ConnInfoError::InvalidDbName)?; let username = RoleName::from(connection_url.username()); if username.is_empty() { - return Err(anyhow::anyhow!("missing username")); + return Err(ConnInfoError::MissingUsername); } ctx.set_user(username.clone()); let password = connection_url .password() - .ok_or(anyhow::anyhow!("no password"))?; - - // TLS certificate selector now based on SNI hostname, so if we are running here - // we are sure that SNI hostname is set to one of the configured domain names. - let sni_hostname = sni_hostname.ok_or(anyhow::anyhow!("no SNI hostname set"))?; + .ok_or(ConnInfoError::MissingPassword)?; let hostname = connection_url .host_str() - .ok_or(anyhow::anyhow!("no host"))?; + .ok_or(ConnInfoError::MissingHostname)?; - let host_header = headers - .get("host") - .and_then(|h| h.to_str().ok()) - .and_then(|h| h.split(':').next()); - - // sni_hostname has to be either the same as hostname or the one used in serverless driver. - if !check_matches(&sni_hostname, hostname)? { - return Err(anyhow::anyhow!("mismatched SNI hostname and hostname")); - } else if let Some(h) = host_header { - if h != sni_hostname { - return Err(anyhow::anyhow!("mismatched host header and hostname")); - } - } - - let endpoint = endpoint_sni(hostname, &tls.common_names)?.context("malformed endpoint")?; + let endpoint = + endpoint_sni(hostname, &tls.common_names)?.ok_or(ConnInfoError::MalformedEndpoint)?; ctx.set_endpoint_id(endpoint.clone()); let pairs = connection_url.query_pairs(); @@ -173,36 +176,27 @@ fn get_conn_info( }) } -fn check_matches(sni_hostname: &str, hostname: &str) -> Result { - if sni_hostname == hostname { - return Ok(true); - } - let (sni_hostname_first, sni_hostname_rest) = sni_hostname - .split_once('.') - .ok_or_else(|| anyhow::anyhow!("Unexpected sni format."))?; - let (_, hostname_rest) = hostname - .split_once('.') - .ok_or_else(|| anyhow::anyhow!("Unexpected hostname format."))?; - Ok(sni_hostname_rest == hostname_rest && sni_hostname_first == SERVERLESS_DRIVER_SNI) -} - // TODO: return different http error codes pub async fn handle( config: &'static ProxyConfig, - ctx: &mut RequestMonitoring, + mut ctx: RequestMonitoring, request: Request, - sni_hostname: Option, backend: Arc, ) -> Result, ApiError> { let result = tokio::time::timeout( config.http_config.request_timeout, - handle_inner(config, ctx, request, sni_hostname, backend), + handle_inner(config, &mut ctx, request, backend), ) .await; let mut response = match result { Ok(r) => match r { - Ok(r) => r, + Ok(r) => { + ctx.set_success(); + r + } Err(e) => { + // TODO: ctx.set_error_kind(e.get_error_type()); + let mut message = format!("{:?}", e); let db_error = e .downcast_ref::() @@ -278,7 +272,9 @@ pub async fn handle( )? 
} }, - Err(_) => { + Err(e) => { + ctx.set_error_kind(e.get_error_kind()); + let message = format!( "HTTP-Connection timed out, execution time exeeded {} seconds", config.http_config.request_timeout.as_secs() @@ -290,6 +286,7 @@ pub async fn handle( )? } }; + response.headers_mut().insert( "Access-Control-Allow-Origin", hyper::http::HeaderValue::from_static("*"), @@ -302,7 +299,6 @@ async fn handle_inner( config: &'static ProxyConfig, ctx: &mut RequestMonitoring, request: Request, - sni_hostname: Option, backend: Arc, ) -> anyhow::Result> { let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE @@ -318,12 +314,7 @@ async fn handle_inner( // let headers = request.headers(); // TLS config should be there. - let conn_info = get_conn_info( - ctx, - headers, - sni_hostname, - config.tls_config.as_ref().unwrap(), - )?; + let conn_info = get_conn_info(ctx, headers, config.tls_config.as_ref().unwrap())?; info!( user = conn_info.user_info.user.as_str(), project = conn_info.user_info.endpoint.as_str(), @@ -487,8 +478,6 @@ async fn handle_inner( } }; - ctx.set_success(); - ctx.log(); let metrics = client.metrics(); // how could this possibly fail diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index f68b35010a..062dd440b2 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -2,7 +2,7 @@ use crate::{ cancellation::CancelMap, config::ProxyConfig, context::RequestMonitoring, - error::io_error, + error::{io_error, ReportableError}, proxy::{handle_client, ClientMode}, rate_limiter::EndpointRateLimiter, }; @@ -131,23 +131,41 @@ impl AsyncBufRead for WebSocketRw { pub async fn serve_websocket( config: &'static ProxyConfig, - ctx: &mut RequestMonitoring, + mut ctx: RequestMonitoring, websocket: HyperWebsocket, cancel_map: Arc, hostname: Option, endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { let websocket = websocket.await?; - handle_client( + let res = handle_client( config, - ctx, + &mut ctx, cancel_map, WebSocketRw::new(websocket), ClientMode::Websockets { hostname }, endpoint_rate_limiter, ) - .await?; - Ok(()) + .await; + + match res { + Err(e) => { + // todo: log and push to ctx the error kind + ctx.set_error_kind(e.get_error_kind()); + ctx.log(); + Err(e.into()) + } + Ok(None) => { + ctx.set_success(); + ctx.log(); + Ok(()) + } + Ok(Some(p)) => { + ctx.set_success(); + ctx.log(); + p.proxy_pass().await + } + } } #[cfg(test)] diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index f48b3fe39f..0d639d2c07 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -1,6 +1,5 @@ use crate::config::TlsServerEndPoint; -use crate::error::UserFacingError; -use anyhow::bail; +use crate::error::{ErrorKind, ReportableError, UserFacingError}; use bytes::BytesMut; use pq_proto::framed::{ConnectionError, Framed}; @@ -73,6 +72,30 @@ impl PqStream { } } +#[derive(Debug)] +pub struct ReportedError { + source: anyhow::Error, + error_kind: ErrorKind, +} + +impl std::fmt::Display for ReportedError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.source.fmt(f) + } +} + +impl std::error::Error for ReportedError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + self.source.source() + } +} + +impl ReportableError for ReportedError { + fn get_error_kind(&self) -> ErrorKind { + self.error_kind + } +} + impl PqStream { /// Write the message into an internal buffer, but don't flush the underlying stream. 
pub fn write_message_noflush(&mut self, message: &BeMessage<'_>) -> io::Result<&mut Self> { @@ -98,24 +121,52 @@ impl PqStream { /// Write the error message using [`Self::write_message`], then re-throw it. /// Allowing string literals is safe under the assumption they might not contain any runtime info. /// This method exists due to `&str` not implementing `Into`. - pub async fn throw_error_str(&mut self, error: &'static str) -> anyhow::Result { - tracing::info!("forwarding error to user: {error}"); - self.write_message(&BeMessage::ErrorResponse(error, None)) - .await?; - bail!(error) + pub async fn throw_error_str( + &mut self, + msg: &'static str, + error_kind: ErrorKind, + ) -> Result { + tracing::info!( + kind = error_kind.to_metric_label(), + msg, + "forwarding error to user" + ); + + // already error case, ignore client IO error + let _: Result<_, std::io::Error> = self + .write_message(&BeMessage::ErrorResponse(msg, None)) + .await; + + Err(ReportedError { + source: anyhow::anyhow!(msg), + error_kind, + }) } /// Write the error message using [`Self::write_message`], then re-throw it. /// Trait [`UserFacingError`] acts as an allowlist for error types. - pub async fn throw_error(&mut self, error: E) -> anyhow::Result + pub async fn throw_error(&mut self, error: E) -> Result where E: UserFacingError + Into, { + let error_kind = error.get_error_kind(); let msg = error.to_string_client(); - tracing::info!("forwarding error to user: {msg}"); - self.write_message(&BeMessage::ErrorResponse(&msg, None)) - .await?; - bail!(error) + tracing::info!( + kind=error_kind.to_metric_label(), + error=%error, + msg, + "forwarding error to user" + ); + + // already error case, ignore client IO error + let _: Result<_, std::io::Error> = self + .write_message(&BeMessage::ErrorResponse(&msg, None)) + .await; + + Err(ReportedError { + source: anyhow::anyhow!(error), + error_kind, + }) } } From 1bb9abebf2cc380fa5ef0b876280afd2d120c257 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 9 Feb 2024 16:41:43 +0300 Subject: [PATCH 0141/1571] Remove WAL segments from s3 in batches. Do list-delete operations in batches instead of doing full list first, to ensure deletion makes progress even if there are a lot of files to remove. To this end, add max_keys limit to remote storage list_files. 
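The batched deletion loop this introduces in `safekeeper/src/wal_backup.rs` (full diff below) boils down to the following shape; this is a condensed sketch with the backoff/retry wrapper, cancellation token and logging elided:

```rust
// `storage` is the remote storage handle and `remote_path` the timeline
// prefix, as in the wal_backup.rs hunk further down.
let batch_size = NonZeroU32::new(1000).unwrap();
loop {
    // List at most `batch_size` keys under the prefix.
    let files = storage
        .list_files(Some(&remote_path), Some(batch_size))
        .await?;
    if files.is_empty() {
        return Ok(()); // nothing left to delete
    }
    // Delete the batch before listing again, so the operation keeps making
    // progress even when there are very many segments to remove.
    storage.delete_objects(&files).await?;
}
```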
--- libs/remote_storage/src/azure_blob.rs | 16 +++++++- libs/remote_storage/src/lib.rs | 38 +++++++++++++------ libs/remote_storage/src/local_fs.rs | 13 +++++-- libs/remote_storage/src/s3_bucket.rs | 21 +++++++++- libs/remote_storage/src/simulate_failures.rs | 7 +++- libs/remote_storage/tests/common/tests.rs | 15 ++++++-- libs/remote_storage/tests/test_real_s3.rs | 2 +- .../src/tenant/remote_timeline_client.rs | 2 +- .../tenant/remote_timeline_client/download.rs | 4 +- safekeeper/src/wal_backup.rs | 29 ++++++++++++-- 10 files changed, 119 insertions(+), 28 deletions(-) diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index c6d5224706..df6d45dde1 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -191,6 +191,7 @@ impl RemoteStorage for AzureBlobStorage { &self, prefix: Option<&RemotePath>, mode: ListingMode, + max_keys: Option, ) -> anyhow::Result { // get the passed prefix or if it is not set use prefix_in_bucket value let list_prefix = prefix @@ -223,6 +224,8 @@ impl RemoteStorage for AzureBlobStorage { let mut response = builder.into_stream(); let mut res = Listing::default(); + // NonZeroU32 doesn't support subtraction apparently + let mut max_keys = max_keys.map(|mk| mk.get()); while let Some(l) = response.next().await { let entry = l.map_err(to_download_error)?; let prefix_iter = entry @@ -235,7 +238,18 @@ impl RemoteStorage for AzureBlobStorage { .blobs .blobs() .map(|k| self.name_to_relative_path(&k.name)); - res.keys.extend(blob_iter); + + for key in blob_iter { + res.keys.push(key); + if let Some(mut mk) = max_keys { + assert!(mk > 0); + mk -= 1; + if mk == 0 { + return Ok(res); // limit reached + } + max_keys = Some(mk); + } + } } Ok(res) } diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index b6648931ac..5a0b74e406 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -16,7 +16,12 @@ mod simulate_failures; mod support; use std::{ - collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc, time::SystemTime, + collections::HashMap, + fmt::Debug, + num::{NonZeroU32, NonZeroUsize}, + pin::Pin, + sync::Arc, + time::SystemTime, }; use anyhow::{bail, Context}; @@ -155,7 +160,7 @@ pub trait RemoteStorage: Send + Sync + 'static { prefix: Option<&RemotePath>, ) -> Result, DownloadError> { let result = self - .list(prefix, ListingMode::WithDelimiter) + .list(prefix, ListingMode::WithDelimiter, None) .await? .prefixes; Ok(result) @@ -171,11 +176,17 @@ pub trait RemoteStorage: Send + Sync + 'static { /// whereas, /// list_prefixes("foo/bar/") = ["cat", "dog"] /// See `test_real_s3.rs` for more details. + /// + /// max_keys limits max number of keys returned; None means unlimited. async fn list_files( &self, prefix: Option<&RemotePath>, + max_keys: Option, ) -> Result, DownloadError> { - let result = self.list(prefix, ListingMode::NoDelimiter).await?.keys; + let result = self + .list(prefix, ListingMode::NoDelimiter, max_keys) + .await? + .keys; Ok(result) } @@ -183,6 +194,7 @@ pub trait RemoteStorage: Send + Sync + 'static { &self, prefix: Option<&RemotePath>, _mode: ListingMode, + max_keys: Option, ) -> Result; /// Streams the local file contents into remote into the remote storage entry. 
@@ -341,27 +353,31 @@ impl GenericRemoteStorage> { &self, prefix: Option<&RemotePath>, mode: ListingMode, + max_keys: Option, ) -> anyhow::Result { match self { - Self::LocalFs(s) => s.list(prefix, mode).await, - Self::AwsS3(s) => s.list(prefix, mode).await, - Self::AzureBlob(s) => s.list(prefix, mode).await, - Self::Unreliable(s) => s.list(prefix, mode).await, + Self::LocalFs(s) => s.list(prefix, mode, max_keys).await, + Self::AwsS3(s) => s.list(prefix, mode, max_keys).await, + Self::AzureBlob(s) => s.list(prefix, mode, max_keys).await, + Self::Unreliable(s) => s.list(prefix, mode, max_keys).await, } } // A function for listing all the files in a "directory" // Example: // list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"] + // + // max_keys limits max number of keys returned; None means unlimited. pub async fn list_files( &self, folder: Option<&RemotePath>, + max_keys: Option, ) -> Result, DownloadError> { match self { - Self::LocalFs(s) => s.list_files(folder).await, - Self::AwsS3(s) => s.list_files(folder).await, - Self::AzureBlob(s) => s.list_files(folder).await, - Self::Unreliable(s) => s.list_files(folder).await, + Self::LocalFs(s) => s.list_files(folder, max_keys).await, + Self::AwsS3(s) => s.list_files(folder, max_keys).await, + Self::AzureBlob(s) => s.list_files(folder, max_keys).await, + Self::Unreliable(s) => s.list_files(folder, max_keys).await, } } diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 3ebea76181..f53ba9db07 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -4,7 +4,9 @@ //! This storage used in tests, but can also be used in cases when a certain persistent //! volume is mounted to the local FS. -use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin, time::SystemTime}; +use std::{ + borrow::Cow, future::Future, io::ErrorKind, num::NonZeroU32, pin::Pin, time::SystemTime, +}; use anyhow::{bail, ensure, Context}; use bytes::Bytes; @@ -162,6 +164,7 @@ impl RemoteStorage for LocalFs { &self, prefix: Option<&RemotePath>, mode: ListingMode, + max_keys: Option, ) -> Result { let mut result = Listing::default(); @@ -178,6 +181,9 @@ impl RemoteStorage for LocalFs { !path.is_dir() }) .collect(); + if let Some(max_keys) = max_keys { + result.keys.truncate(max_keys.get() as usize); + } return Ok(result); } @@ -790,12 +796,12 @@ mod fs_tests { let child = upload_dummy_file(&storage, "grandparent/parent/child", None).await?; let uncle = upload_dummy_file(&storage, "grandparent/uncle", None).await?; - let listing = storage.list(None, ListingMode::NoDelimiter).await?; + let listing = storage.list(None, ListingMode::NoDelimiter, None).await?; assert!(listing.prefixes.is_empty()); assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec()); // Delimiter: should only go one deep - let listing = storage.list(None, ListingMode::WithDelimiter).await?; + let listing = storage.list(None, ListingMode::WithDelimiter, None).await?; assert_eq!( listing.prefixes, @@ -808,6 +814,7 @@ mod fs_tests { .list( Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()), ListingMode::WithDelimiter, + None, ) .await?; assert_eq!( diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 2b33a6ffd1..dee5750cac 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -7,6 +7,7 @@ use std::{ borrow::Cow, collections::HashMap, + num::NonZeroU32, pin::Pin, sync::Arc, task::{Context, Poll}, @@ -408,8 
+409,11 @@ impl RemoteStorage for S3Bucket { &self, prefix: Option<&RemotePath>, mode: ListingMode, + max_keys: Option, ) -> Result { let kind = RequestKind::List; + // s3 sdk wants i32 + let mut max_keys = max_keys.map(|mk| mk.get() as i32); let mut result = Listing::default(); // get the passed prefix or if it is not set use prefix_in_bucket value @@ -433,13 +437,20 @@ impl RemoteStorage for S3Bucket { let _guard = self.permit(kind).await; let started_at = start_measuring_requests(kind); + // min of two Options, returning Some if one is value and another is + // None (None is smaller than anything, so plain min doesn't work). + let request_max_keys = self + .max_keys_per_list_response + .into_iter() + .chain(max_keys.into_iter()) + .min(); let mut request = self .client .list_objects_v2() .bucket(self.bucket_name.clone()) .set_prefix(list_prefix.clone()) .set_continuation_token(continuation_token) - .set_max_keys(self.max_keys_per_list_response); + .set_max_keys(request_max_keys); if let ListingMode::WithDelimiter = mode { request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()); @@ -469,6 +480,14 @@ impl RemoteStorage for S3Bucket { let object_path = object.key().expect("response does not contain a key"); let remote_path = self.s3_object_to_relative_path(object_path); result.keys.push(remote_path); + if let Some(mut mk) = max_keys { + assert!(mk > 0); + mk -= 1; + if mk == 0 { + return Ok(result); // limit reached + } + max_keys = Some(mk); + } } result.prefixes.extend( diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index 14bdb5ed4d..3dfa16b64e 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -4,6 +4,7 @@ use bytes::Bytes; use futures::stream::Stream; use std::collections::HashMap; +use std::num::NonZeroU32; use std::sync::Mutex; use std::time::SystemTime; use std::{collections::hash_map::Entry, sync::Arc}; @@ -113,20 +114,22 @@ impl RemoteStorage for UnreliableWrapper { async fn list_files( &self, folder: Option<&RemotePath>, + max_keys: Option, ) -> Result, DownloadError> { self.attempt(RemoteOp::ListPrefixes(folder.cloned())) .map_err(DownloadError::Other)?; - self.inner.list_files(folder).await + self.inner.list_files(folder, max_keys).await } async fn list( &self, prefix: Option<&RemotePath>, mode: ListingMode, + max_keys: Option, ) -> Result { self.attempt(RemoteOp::ListPrefixes(prefix.cloned())) .map_err(DownloadError::Other)?; - self.inner.list(prefix, mode).await + self.inner.list(prefix, mode, max_keys).await } async fn upload( diff --git a/libs/remote_storage/tests/common/tests.rs b/libs/remote_storage/tests/common/tests.rs index abccc24c97..6d062f3898 100644 --- a/libs/remote_storage/tests/common/tests.rs +++ b/libs/remote_storage/tests/common/tests.rs @@ -1,8 +1,8 @@ use anyhow::Context; use camino::Utf8Path; use remote_storage::RemotePath; -use std::collections::HashSet; use std::sync::Arc; +use std::{collections::HashSet, num::NonZeroU32}; use test_context::test_context; use tracing::debug; @@ -103,7 +103,7 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a let base_prefix = RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?; let root_files = test_client - .list_files(None) + .list_files(None, None) .await .context("client list root files failure")? 
.into_iter() @@ -113,8 +113,17 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a ctx.remote_blobs.clone(), "remote storage list_files on root mismatches with the uploads." ); + + // Test that max_keys limit works. In total there are about 21 files (see + // upload_simple_remote_data call in test_real_s3.rs). + let limited_root_files = test_client + .list_files(None, Some(NonZeroU32::new(2).unwrap())) + .await + .context("client list root files failure")?; + assert_eq!(limited_root_files.len(), 2); + let nested_remote_files = test_client - .list_files(Some(&base_prefix)) + .list_files(Some(&base_prefix), None) .await .context("client list nested files failure")? .into_iter() diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index fc52dabc36..3dc8347c83 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -70,7 +70,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: } async fn list_files(client: &Arc) -> anyhow::Result> { - Ok(retry(|| client.list_files(None)) + Ok(retry(|| client.list_files(None, None)) .await .context("list root files failure")? .into_iter() diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 0c7dd68c3f..e17dea01a8 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1151,7 +1151,7 @@ impl RemoteTimelineClient { let remaining = download_retry( || async { self.storage_impl - .list_files(Some(&timeline_storage_path)) + .list_files(Some(&timeline_storage_path), None) .await }, "list remaining files", diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 33287fc8f4..e755cd08f3 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -220,7 +220,7 @@ pub async fn list_remote_timelines( || { download_cancellable( &cancel, - storage.list(Some(&remote_path), ListingMode::WithDelimiter), + storage.list(Some(&remote_path), ListingMode::WithDelimiter, None), ) }, &format!("list timelines for {tenant_shard_id}"), @@ -373,7 +373,7 @@ pub(super) async fn download_index_part( let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none()); let indices = download_retry( - || async { storage.list_files(Some(&index_prefix)).await }, + || async { storage.list_files(Some(&index_prefix), None).await }, "list index_part files", cancel, ) diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index df99244770..dbdc742d26 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -10,6 +10,7 @@ use utils::id::NodeId; use std::cmp::min; use std::collections::{HashMap, HashSet}; +use std::num::NonZeroU32; use std::pin::Pin; use std::sync::Arc; use std::time::Duration; @@ -546,6 +547,10 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> { let ttid_path = Utf8Path::new(&ttid.tenant_id.to_string()).join(ttid.timeline_id.to_string()); let remote_path = RemotePath::new(&ttid_path)?; + // see DEFAULT_MAX_KEYS_PER_LIST_RESPONSE + // const Option unwrap is not stable, otherwise it would be const. 
+ let batch_size: NonZeroU32 = NonZeroU32::new(1000).unwrap(); + // A backoff::retry is used here for two reasons: // - To provide a backoff rather than busy-polling the API on errors // - To absorb transient 429/503 conditions without hitting our error @@ -557,8 +562,26 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> { let token = CancellationToken::new(); // not really used backoff::retry( || async { - let files = storage.list_files(Some(&remote_path)).await?; - storage.delete_objects(&files).await + // Do list-delete in batch_size batches to make progress even if there a lot of files. + // Alternatively we could make list_files return iterator, but it is more complicated and + // I'm not sure deleting while iterating is expected in s3. + loop { + let files = storage + .list_files(Some(&remote_path), Some(batch_size)) + .await?; + if files.is_empty() { + return Ok(()); // done + } + // (at least) s3 results are sorted, so can log min/max: + // "List results are always returned in UTF-8 binary order." + info!( + "deleting batch of {} WAL segments [{}-{}]", + files.len(), + files.first().unwrap().object_name().unwrap_or(""), + files.last().unwrap().object_name().unwrap_or("") + ); + storage.delete_objects(&files).await?; + } }, |_| false, 3, @@ -594,7 +617,7 @@ pub async fn copy_s3_segments( let remote_path = RemotePath::new(&relative_dst_path)?; - let files = storage.list_files(Some(&remote_path)).await?; + let files = storage.list_files(Some(&remote_path), None).await?; let uploaded_segments = &files .iter() .filter_map(|file| file.object_name().map(ToOwned::to_owned)) From ca818c8bd76d815f0d41eb61fdb8fb9b826ffe54 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 9 Feb 2024 20:09:37 +0100 Subject: [PATCH 0142/1571] fix(test_ondemand_download_timetravel): occasionally fails with slightly higher physical size (#6687) --- test_runner/regress/test_ondemand_download.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index af2d7aae88..3a197875dd 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -197,6 +197,14 @@ def test_ondemand_download_timetravel(neon_env_builder: NeonEnvBuilder): ##### Stop the first pageserver instance, erase all its data env.endpoints.stop_all() + # Stop safekeepers and take another checkpoint. The endpoints might + # have written a few more bytes during shutdown. + for sk in env.safekeepers: + sk.stop() + + client.timeline_checkpoint(tenant_id, timeline_id) + current_lsn = Lsn(client.timeline_detail(tenant_id, timeline_id)["last_record_lsn"]) + # wait until pageserver has successfully uploaded all the data to remote storage wait_for_upload(client, tenant_id, timeline_id, current_lsn) From cbd3a32d4d4275338c851dd158e0cb950d64ee91 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 9 Feb 2024 19:22:23 +0000 Subject: [PATCH 0143/1571] proxy: decode username and password (#6700) ## Problem usernames and passwords can be URL 'percent' encoded in the connection string URL provided by serverless driver. 
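For illustration, the credentials exercised by the new regression test arrive percent-encoded and are decoded with the `urlencoding` crate this change adds; a minimal sketch of just the decoding step (the real path goes through `get_conn_info`, see the diff below):

```rust
// The test creates user "http+auth$$" with password "%+$^&*@!"; the
// serverless driver sends both percent-encoded in the connection URL.
let user = urlencoding::decode("http%2Bauth%24%24")?;                   // -> "http+auth$$"
let password = urlencoding::decode_binary(b"%25%2B%24%5E%26%2A%40%21"); // -> b"%+$^&*@!"
```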
## Summary of changes Decode the parameters when getting conn info --- Cargo.lock | 2 ++ Cargo.toml | 1 + proxy/Cargo.toml | 4 +++- proxy/src/serverless/backend.rs | 2 +- proxy/src/serverless/conn_pool.rs | 7 ++++--- proxy/src/serverless/sql_over_http.rs | 10 ++++++++-- test_runner/fixtures/neon_fixtures.py | 6 +++--- test_runner/regress/test_proxy.py | 12 ++++++++++++ 8 files changed, 34 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a2939e6c75..83afdaf66f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4125,6 +4125,7 @@ dependencies = [ "serde", "serde_json", "sha2", + "smallvec", "smol_str", "socket2 0.5.5", "sync_wrapper", @@ -4143,6 +4144,7 @@ dependencies = [ "tracing-subscriber", "tracing-utils", "url", + "urlencoding", "utils", "uuid", "walkdir", diff --git a/Cargo.toml b/Cargo.toml index 6a2c3fa563..ebc3dfa7b1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -171,6 +171,7 @@ tracing-opentelemetry = "0.20.0" tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] } twox-hash = { version = "1.6.3", default-features = false } url = "2.2" +urlencoding = "2.1" uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] } walkdir = "2.3.2" webpki-roots = "0.25" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 83cab381b3..0777d361d2 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -60,6 +60,8 @@ scopeguard.workspace = true serde.workspace = true serde_json.workspace = true sha2.workspace = true +smol_str.workspace = true +smallvec.workspace = true socket2.workspace = true sync_wrapper.workspace = true task-local-extensions.workspace = true @@ -76,6 +78,7 @@ tracing-subscriber.workspace = true tracing-utils.workspace = true tracing.workspace = true url.workspace = true +urlencoding.workspace = true utils.workspace = true uuid.workspace = true webpki-roots.workspace = true @@ -84,7 +87,6 @@ native-tls.workspace = true postgres-native-tls.workspace = true postgres-protocol.workspace = true redis.workspace = true -smol_str.workspace = true workspace_hack.workspace = true diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 03257e9161..8285da68d7 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -48,7 +48,7 @@ impl PoolingBackend { } }; let auth_outcome = - crate::auth::validate_password_and_exchange(conn_info.password.as_bytes(), secret)?; + crate::auth::validate_password_and_exchange(&conn_info.password, secret)?; match auth_outcome { crate::sasl::Outcome::Success(key) => Ok(key), crate::sasl::Outcome::Failure(reason) => { diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index f92793096b..f4e5b145c5 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -3,6 +3,7 @@ use futures::{future::poll_fn, Future}; use metrics::IntCounterPairGuard; use parking_lot::RwLock; use rand::Rng; +use smallvec::SmallVec; use smol_str::SmolStr; use std::{collections::HashMap, pin::pin, sync::Arc, sync::Weak, time::Duration}; use std::{ @@ -36,7 +37,7 @@ pub const APP_NAME: SmolStr = SmolStr::new_inline("/sql_over_http"); pub struct ConnInfo { pub user_info: ComputeUserInfo, pub dbname: DbName, - pub password: SmolStr, + pub password: SmallVec<[u8; 16]>, } impl ConnInfo { @@ -731,7 +732,7 @@ mod tests { options: Default::default(), }, dbname: "dbname".into(), - password: "password".into(), + password: "password".as_bytes().into(), }; let ep_pool = 
Arc::downgrade(&pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key())); @@ -788,7 +789,7 @@ mod tests { options: Default::default(), }, dbname: "dbname".into(), - password: "password".into(), + password: "password".as_bytes().into(), }; let ep_pool = Arc::downgrade(&pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key())); diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 401022347e..54424360c4 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -100,6 +100,8 @@ pub enum ConnInfoError { InvalidDbName, #[error("missing username")] MissingUsername, + #[error("invalid username: {0}")] + InvalidUsername(#[from] std::string::FromUtf8Error), #[error("missing password")] MissingPassword, #[error("missing hostname")] @@ -134,7 +136,7 @@ fn get_conn_info( let dbname = url_path.next().ok_or(ConnInfoError::InvalidDbName)?; - let username = RoleName::from(connection_url.username()); + let username = RoleName::from(urlencoding::decode(connection_url.username())?); if username.is_empty() { return Err(ConnInfoError::MissingUsername); } @@ -143,6 +145,7 @@ fn get_conn_info( let password = connection_url .password() .ok_or(ConnInfoError::MissingPassword)?; + let password = urlencoding::decode_binary(password.as_bytes()); let hostname = connection_url .host_str() @@ -172,7 +175,10 @@ fn get_conn_info( Ok(ConnInfo { user_info, dbname: dbname.into(), - password: password.into(), + password: match password { + std::borrow::Cow::Borrowed(b) => b.into(), + std::borrow::Cow::Owned(b) => b.into(), + }, }) } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 9996853525..231eebff52 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -23,7 +23,7 @@ from itertools import chain, product from pathlib import Path from types import TracebackType from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Type, Union, cast -from urllib.parse import urlparse +from urllib.parse import quote, urlparse import asyncpg import backoff @@ -2822,8 +2822,8 @@ class NeonProxy(PgProtocol): def http_query(self, query, args, **kwargs): # TODO maybe use default values if not provided - user = kwargs["user"] - password = kwargs["password"] + user = quote(kwargs["user"]) + password = quote(kwargs["password"]) expected_code = kwargs.get("expected_code") connstr = f"postgresql://{user}:{password}@{self.domain}:{self.proxy_port}/postgres" diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index 49a0450f0c..884643cef0 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -462,6 +462,18 @@ def test_sql_over_http_pool(static_proxy: NeonProxy): assert "password authentication failed for user" in res["message"] +def test_sql_over_http_urlencoding(static_proxy: NeonProxy): + static_proxy.safe_psql("create user \"http+auth$$\" with password '%+$^&*@!' superuser") + + static_proxy.http_query( + "select 1", + [], + user="http+auth$$", + password="%+$^&*@!", + expected_code=200, + ) + + # Beginning a transaction should not impact the next query, # which might come from a completely different client. 
def test_http_pool_begin(static_proxy: NeonProxy): From 1a4dd58b70ad1bf82c4daae520f4550612f91120 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Fri, 9 Feb 2024 11:22:53 -0900 Subject: [PATCH 0144/1571] Grant pg_monitor to neon_superuser (#6691) ## Problem The people want pg_monitor https://github.com/neondatabase/neon/issues/6682 ## Summary of changes Gives the people pg_monitor --- compute_tools/src/spec.rs | 1 + test_runner/regress/test_migrations.py | 4 ++-- test_runner/regress/test_neon_superuser.py | 18 ++++++++++++++++++ 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 3df5f10e23..9c731f257c 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -776,6 +776,7 @@ BEGIN END IF; END $$;"#, + "GRANT pg_monitor TO neon_superuser WITH ADMIN OPTION", ]; let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration"; diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py index 8954810451..7cc3024ec6 100644 --- a/test_runner/regress/test_migrations.py +++ b/test_runner/regress/test_migrations.py @@ -15,7 +15,7 @@ def test_migrations(neon_simple_env: NeonEnv): endpoint.wait_for_migrations() - num_migrations = 3 + num_migrations = 4 with endpoint.cursor() as cur: cur.execute("SELECT id FROM neon_migration.migration_id") @@ -24,7 +24,7 @@ def test_migrations(neon_simple_env: NeonEnv): with open(log_path, "r") as log_file: logs = log_file.read() - assert "INFO handle_migrations: Ran 3 migrations" in logs + assert f"INFO handle_migrations: Ran {num_migrations} migrations" in logs endpoint.stop() endpoint.start() diff --git a/test_runner/regress/test_neon_superuser.py b/test_runner/regress/test_neon_superuser.py index 34f1e64b34..ca8ada4ddb 100644 --- a/test_runner/regress/test_neon_superuser.py +++ b/test_runner/regress/test_neon_superuser.py @@ -76,3 +76,21 @@ def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion): assert [r[0] for r in res] == [10, 20, 30, 40] wait_until(10, 0.5, check_that_changes_propagated) + + # Test that pg_monitor is working for neon_superuser role + cur.execute("SELECT query from pg_stat_activity LIMIT 1") + assert cur.fetchall()[0][0] != "" + # Test that pg_monitor is not working for non neon_superuser role without grant + cur.execute("CREATE ROLE not_a_superuser LOGIN PASSWORD 'Password42!'") + cur.execute("GRANT not_a_superuser TO neon_superuser WITH ADMIN OPTION") + cur.execute("SET ROLE not_a_superuser") + cur.execute("SELECT query from pg_stat_activity LIMIT 1") + assert cur.fetchall()[0][0] == "" + cur.execute("RESET ROLE") + # Test that pg_monitor is working for non neon_superuser role with grant + cur.execute("GRANT pg_monitor TO not_a_superuser") + cur.execute("SET ROLE not_a_superuser") + cur.execute("SELECT query from pg_stat_activity LIMIT 1") + assert cur.fetchall()[0][0] != "" + cur.execute("RESET ROLE") + cur.execute("DROP ROLE not_a_superuser") From 5779c7908abaadb0c96a5087423e2082101924b9 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 9 Feb 2024 23:22:40 +0100 Subject: [PATCH 0145/1571] revert two recent `heavier_once_cell` changes (#6704) This PR reverts - https://github.com/neondatabase/neon/pull/6589 - https://github.com/neondatabase/neon/pull/6652 because there's a performance regression that's particularly visible at high layer counts. 
Most likely it's because the switch to RwLock inflates the ``` inner: heavier_once_cell::OnceCell, ``` size from 48 to 88 bytes, which, by itself is almost a doubling of the cache footprint, and probably the fact that it's now larger than a cache line also doesn't help. See this chat on the Neon discord for more context: https://discord.com/channels/1176467419317940276/1204714372295958548/1205541184634617906 I'm reverting 6652 as well because it might also have perf implications, and we're getting close to the next release. We should re-do its changes after the next release, though. cc @koivunej cc @ivaxer --- libs/utils/src/sync/heavier_once_cell.rs | 322 ++++--------------- pageserver/src/tenant/storage_layer/layer.rs | 24 +- pageserver/src/tenant/timeline.rs | 2 +- 3 files changed, 81 insertions(+), 267 deletions(-) diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs index 81625b907e..0ccaf4e716 100644 --- a/libs/utils/src/sync/heavier_once_cell.rs +++ b/libs/utils/src/sync/heavier_once_cell.rs @@ -1,6 +1,6 @@ use std::sync::{ atomic::{AtomicUsize, Ordering}, - Arc, + Arc, Mutex, MutexGuard, }; use tokio::sync::Semaphore; @@ -12,7 +12,7 @@ use tokio::sync::Semaphore; /// /// [`OwnedSemaphorePermit`]: tokio::sync::OwnedSemaphorePermit pub struct OnceCell { - inner: tokio::sync::RwLock>, + inner: Mutex>, initializers: AtomicUsize, } @@ -50,7 +50,7 @@ impl OnceCell { let sem = Semaphore::new(1); sem.close(); Self { - inner: tokio::sync::RwLock::new(Inner { + inner: Mutex::new(Inner { init_semaphore: Arc::new(sem), value: Some(value), }), @@ -61,113 +61,56 @@ impl OnceCell { /// Returns a guard to an existing initialized value, or uniquely initializes the value before /// returning the guard. /// - /// Initializing might wait on any existing [`GuardMut::take_and_deinit`] deinitialization. + /// Initializing might wait on any existing [`Guard::take_and_deinit`] deinitialization. /// /// Initialization is panic-safe and cancellation-safe. 
- pub async fn get_mut_or_init(&self, factory: F) -> Result, E> + pub async fn get_or_init(&self, factory: F) -> Result, E> where F: FnOnce(InitPermit) -> Fut, Fut: std::future::Future>, { - loop { - let sem = { - let guard = self.inner.write().await; - if guard.value.is_some() { - return Ok(GuardMut(guard)); - } - guard.init_semaphore.clone() - }; - - { - let permit = { - // increment the count for the duration of queued - let _guard = CountWaitingInitializers::start(self); - sem.acquire().await - }; - - let Ok(permit) = permit else { - let guard = self.inner.write().await; - if !Arc::ptr_eq(&sem, &guard.init_semaphore) { - // there was a take_and_deinit in between - continue; - } - assert!( - guard.value.is_some(), - "semaphore got closed, must be initialized" - ); - return Ok(GuardMut(guard)); - }; - - permit.forget(); + let sem = { + let guard = self.inner.lock().unwrap(); + if guard.value.is_some() { + return Ok(Guard(guard)); } + guard.init_semaphore.clone() + }; - let permit = InitPermit(sem); - let (value, _permit) = factory(permit).await?; + let permit = { + // increment the count for the duration of queued + let _guard = CountWaitingInitializers::start(self); + sem.acquire_owned().await + }; - let guard = self.inner.write().await; + match permit { + Ok(permit) => { + let permit = InitPermit(permit); + let (value, _permit) = factory(permit).await?; - return Ok(Self::set0(value, guard)); + let guard = self.inner.lock().unwrap(); + + Ok(Self::set0(value, guard)) + } + Err(_closed) => { + let guard = self.inner.lock().unwrap(); + assert!( + guard.value.is_some(), + "semaphore got closed, must be initialized" + ); + return Ok(Guard(guard)); + } } } - /// Returns a guard to an existing initialized value, or uniquely initializes the value before - /// returning the guard. - /// - /// Initialization is panic-safe and cancellation-safe. - pub async fn get_or_init(&self, factory: F) -> Result, E> - where - F: FnOnce(InitPermit) -> Fut, - Fut: std::future::Future>, - { - loop { - let sem = { - let guard = self.inner.read().await; - if guard.value.is_some() { - return Ok(GuardRef(guard)); - } - guard.init_semaphore.clone() - }; - - { - let permit = { - // increment the count for the duration of queued - let _guard = CountWaitingInitializers::start(self); - sem.acquire().await - }; - - let Ok(permit) = permit else { - let guard = self.inner.read().await; - if !Arc::ptr_eq(&sem, &guard.init_semaphore) { - // there was a take_and_deinit in between - continue; - } - assert!( - guard.value.is_some(), - "semaphore got closed, must be initialized" - ); - return Ok(GuardRef(guard)); - }; - - permit.forget(); - } - - let permit = InitPermit(sem); - let (value, _permit) = factory(permit).await?; - - let guard = self.inner.write().await; - - return Ok(Self::set0(value, guard).downgrade()); - } - } - - /// Assuming a permit is held after previous call to [`GuardMut::take_and_deinit`], it can be used + /// Assuming a permit is held after previous call to [`Guard::take_and_deinit`], it can be used /// to complete initializing the inner value. /// /// # Panics /// /// If the inner has already been initialized. - pub async fn set(&self, value: T, _permit: InitPermit) -> GuardMut<'_, T> { - let guard = self.inner.write().await; + pub fn set(&self, value: T, _permit: InitPermit) -> Guard<'_, T> { + let guard = self.inner.lock().unwrap(); // cannot assert that this permit is for self.inner.semaphore, but we can assert it cannot // give more permits right now. 
@@ -179,31 +122,21 @@ impl OnceCell { Self::set0(value, guard) } - fn set0(value: T, mut guard: tokio::sync::RwLockWriteGuard<'_, Inner>) -> GuardMut<'_, T> { + fn set0(value: T, mut guard: std::sync::MutexGuard<'_, Inner>) -> Guard<'_, T> { if guard.value.is_some() { drop(guard); unreachable!("we won permit, must not be initialized"); } guard.value = Some(value); guard.init_semaphore.close(); - GuardMut(guard) + Guard(guard) } /// Returns a guard to an existing initialized value, if any. - pub async fn get_mut(&self) -> Option> { - let guard = self.inner.write().await; + pub fn get(&self) -> Option> { + let guard = self.inner.lock().unwrap(); if guard.value.is_some() { - Some(GuardMut(guard)) - } else { - None - } - } - - /// Returns a guard to an existing initialized value, if any. - pub async fn get(&self) -> Option> { - let guard = self.inner.read().await; - if guard.value.is_some() { - Some(GuardRef(guard)) + Some(Guard(guard)) } else { None } @@ -235,9 +168,9 @@ impl<'a, T> Drop for CountWaitingInitializers<'a, T> { /// Uninteresting guard object to allow short-lived access to inspect or clone the held, /// initialized value. #[derive(Debug)] -pub struct GuardMut<'a, T>(tokio::sync::RwLockWriteGuard<'a, Inner>); +pub struct Guard<'a, T>(MutexGuard<'a, Inner>); -impl std::ops::Deref for GuardMut<'_, T> { +impl std::ops::Deref for Guard<'_, T> { type Target = T; fn deref(&self) -> &Self::Target { @@ -248,7 +181,7 @@ impl std::ops::Deref for GuardMut<'_, T> { } } -impl std::ops::DerefMut for GuardMut<'_, T> { +impl std::ops::DerefMut for Guard<'_, T> { fn deref_mut(&mut self) -> &mut Self::Target { self.0 .value @@ -257,59 +190,34 @@ impl std::ops::DerefMut for GuardMut<'_, T> { } } -impl<'a, T> GuardMut<'a, T> { +impl<'a, T> Guard<'a, T> { /// Take the current value, and a new permit for it's deinitialization. /// /// The permit will be on a semaphore part of the new internal value, and any following /// [`OnceCell::get_or_init`] will wait on it to complete. pub fn take_and_deinit(&mut self) -> (T, InitPermit) { let mut swapped = Inner::default(); - let sem = swapped.init_semaphore.clone(); - sem.try_acquire().expect("we just created this").forget(); + let permit = swapped + .init_semaphore + .clone() + .try_acquire_owned() + .expect("we just created this"); std::mem::swap(&mut *self.0, &mut swapped); swapped .value - .map(|v| (v, InitPermit(sem))) - .expect("guard is not created unless value has been initialized") - } - - pub fn downgrade(self) -> GuardRef<'a, T> { - GuardRef(self.0.downgrade()) - } -} - -#[derive(Debug)] -pub struct GuardRef<'a, T>(tokio::sync::RwLockReadGuard<'a, Inner>); - -impl std::ops::Deref for GuardRef<'_, T> { - type Target = T; - - fn deref(&self) -> &Self::Target { - self.0 - .value - .as_ref() + .map(|v| (v, InitPermit(permit))) .expect("guard is not created unless value has been initialized") } } /// Type held by OnceCell (de)initializing task. 
-pub struct InitPermit(Arc); - -impl Drop for InitPermit { - fn drop(&mut self) { - debug_assert_eq!(self.0.available_permits(), 0); - self.0.add_permits(1); - } -} +pub struct InitPermit(tokio::sync::OwnedSemaphorePermit); #[cfg(test)] mod tests { - use futures::Future; - use super::*; use std::{ convert::Infallible, - pin::{pin, Pin}, sync::atomic::{AtomicUsize, Ordering}, time::Duration, }; @@ -340,7 +248,7 @@ mod tests { barrier.wait().await; let won = { let g = cell - .get_mut_or_init(|permit| { + .get_or_init(|permit| { counters.factory_got_to_run.fetch_add(1, Ordering::Relaxed); async { counters.future_polled.fetch_add(1, Ordering::Relaxed); @@ -387,11 +295,7 @@ mod tests { let cell = cell.clone(); let deinitialization_started = deinitialization_started.clone(); async move { - let (answer, _permit) = cell - .get_mut() - .await - .expect("initialized to value") - .take_and_deinit(); + let (answer, _permit) = cell.get().expect("initialized to value").take_and_deinit(); assert_eq!(answer, initial); deinitialization_started.wait().await; @@ -402,7 +306,7 @@ mod tests { deinitialization_started.wait().await; let started_at = tokio::time::Instant::now(); - cell.get_mut_or_init(|permit| async { Ok::<_, Infallible>((reinit, permit)) }) + cell.get_or_init(|permit| async { Ok::<_, Infallible>((reinit, permit)) }) .await .unwrap(); @@ -414,21 +318,21 @@ mod tests { jh.await.unwrap(); - assert_eq!(*cell.get_mut().await.unwrap(), reinit); + assert_eq!(*cell.get().unwrap(), reinit); } - #[tokio::test] - async fn reinit_with_deinit_permit() { + #[test] + fn reinit_with_deinit_permit() { let cell = Arc::new(OnceCell::new(42)); - let (mol, permit) = cell.get_mut().await.unwrap().take_and_deinit(); - cell.set(5, permit).await; - assert_eq!(*cell.get_mut().await.unwrap(), 5); + let (mol, permit) = cell.get().unwrap().take_and_deinit(); + cell.set(5, permit); + assert_eq!(*cell.get().unwrap(), 5); - let (five, permit) = cell.get_mut().await.unwrap().take_and_deinit(); + let (five, permit) = cell.get().unwrap().take_and_deinit(); assert_eq!(5, five); - cell.set(mol, permit).await; - assert_eq!(*cell.get_mut().await.unwrap(), 42); + cell.set(mol, permit); + assert_eq!(*cell.get().unwrap(), 42); } #[tokio::test] @@ -436,13 +340,13 @@ mod tests { let cell = OnceCell::default(); for _ in 0..10 { - cell.get_mut_or_init(|_permit| async { Err("whatever error") }) + cell.get_or_init(|_permit| async { Err("whatever error") }) .await .unwrap_err(); } let g = cell - .get_mut_or_init(|permit| async { Ok::<_, Infallible>(("finally success", permit)) }) + .get_or_init(|permit| async { Ok::<_, Infallible>(("finally success", permit)) }) .await .unwrap(); assert_eq!(*g, "finally success"); @@ -454,7 +358,7 @@ mod tests { let barrier = tokio::sync::Barrier::new(2); - let initializer = cell.get_mut_or_init(|permit| async { + let initializer = cell.get_or_init(|permit| async { barrier.wait().await; futures::future::pending::<()>().await; @@ -468,102 +372,12 @@ mod tests { // now initializer is dropped - assert!(cell.get_mut().await.is_none()); + assert!(cell.get().is_none()); let g = cell - .get_mut_or_init(|permit| async { Ok::<_, Infallible>(("now initialized", permit)) }) + .get_or_init(|permit| async { Ok::<_, Infallible>(("now initialized", permit)) }) .await .unwrap(); assert_eq!(*g, "now initialized"); } - - #[tokio::test(start_paused = true)] - async fn reproduce_init_take_deinit_race() { - init_take_deinit_scenario(|cell, factory| { - Box::pin(async { - cell.get_or_init(factory).await.unwrap(); - }) - }) - 
.await; - } - - #[tokio::test(start_paused = true)] - async fn reproduce_init_take_deinit_race_mut() { - init_take_deinit_scenario(|cell, factory| { - Box::pin(async { - cell.get_mut_or_init(factory).await.unwrap(); - }) - }) - .await; - } - - type BoxedInitFuture = Pin>>>; - type BoxedInitFunction = Box BoxedInitFuture>; - - /// Reproduce an assertion failure with both initialization methods. - /// - /// This has interesting generics to be generic between `get_or_init` and `get_mut_or_init`. - /// Alternative would be a macro_rules! but that is the last resort. - async fn init_take_deinit_scenario(init_way: F) - where - F: for<'a> Fn( - &'a OnceCell<&'static str>, - BoxedInitFunction<&'static str, Infallible>, - ) -> Pin + 'a>>, - { - let cell = OnceCell::default(); - - // acquire the init_semaphore only permit to drive initializing tasks in order to waiting - // on the same semaphore. - let permit = cell - .inner - .read() - .await - .init_semaphore - .clone() - .try_acquire_owned() - .unwrap(); - - let mut t1 = pin!(init_way( - &cell, - Box::new(|permit| Box::pin(async move { Ok(("t1", permit)) })), - )); - - let mut t2 = pin!(init_way( - &cell, - Box::new(|permit| Box::pin(async move { Ok(("t2", permit)) })), - )); - - // drive t2 first to the init_semaphore - tokio::select! { - _ = &mut t2 => unreachable!("it cannot get permit"), - _ = tokio::time::sleep(Duration::from_secs(3600 * 24 * 7 * 365)) => {} - } - - // followed by t1 in the init_semaphore - tokio::select! { - _ = &mut t1 => unreachable!("it cannot get permit"), - _ = tokio::time::sleep(Duration::from_secs(3600 * 24 * 7 * 365)) => {} - } - - // now let t2 proceed and initialize - drop(permit); - t2.await; - - let (s, permit) = { cell.get_mut().await.unwrap().take_and_deinit() }; - assert_eq!("t2", s); - - // now originally t1 would see the semaphore it has as closed. it cannot yet get a permit from - // the new one. - tokio::select! 
{ - _ = &mut t1 => unreachable!("it cannot get permit"), - _ = tokio::time::sleep(Duration::from_secs(3600 * 24 * 7 * 365)) => {} - } - - // only now we get to initialize it - drop(permit); - t1.await; - - assert_eq!("t1", *cell.get().await.unwrap()); - } } diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 52c0f8abdc..dd9de99477 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -300,8 +300,8 @@ impl Layer { }) } - pub(crate) async fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo { - self.0.info(reset).await + pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo { + self.0.info(reset) } pub(crate) fn access_stats(&self) -> &LayerAccessStats { @@ -612,10 +612,10 @@ impl LayerInner { let mut rx = self.status.subscribe(); let strong = { - match self.inner.get_mut().await { + match self.inner.get() { Some(mut either) => { self.wanted_evicted.store(true, Ordering::Relaxed); - ResidentOrWantedEvicted::downgrade(&mut either) + either.downgrade() } None => return Err(EvictionError::NotFound), } @@ -641,7 +641,7 @@ impl LayerInner { // use however late (compared to the initial expressing of wanted) as the // "outcome" now LAYER_IMPL_METRICS.inc_broadcast_lagged(); - match self.inner.get_mut().await { + match self.inner.get() { Some(_) => Err(EvictionError::Downloaded), None => Ok(()), } @@ -759,7 +759,7 @@ impl LayerInner { // use the already held initialization permit because it is impossible to hit the // below paths anymore essentially limiting the max loop iterations to 2. let (value, init_permit) = download(init_permit).await?; - let mut guard = self.inner.set(value, init_permit).await; + let mut guard = self.inner.set(value, init_permit); let (strong, _upgraded) = guard .get_and_upgrade() .expect("init creates strong reference, we held the init permit"); @@ -767,7 +767,7 @@ impl LayerInner { } let (weak, permit) = { - let mut locked = self.inner.get_mut_or_init(download).await?; + let mut locked = self.inner.get_or_init(download).await?; if let Some((strong, upgraded)) = locked.get_and_upgrade() { if upgraded { @@ -989,12 +989,12 @@ impl LayerInner { } } - async fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo { + fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo { let layer_file_name = self.desc.filename().file_name(); // this is not accurate: we could have the file locally but there was a cancellation // and now we are not in sync, or we are currently downloading it. - let remote = self.inner.get_mut().await.is_none(); + let remote = self.inner.get().is_none(); let access_stats = self.access_stats.as_api_model(reset); @@ -1053,7 +1053,7 @@ impl LayerInner { LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone); return; }; - match tokio::runtime::Handle::current().block_on(this.evict_blocking(version)) { + match this.evict_blocking(version) { Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(), Err(reason) => LAYER_IMPL_METRICS.inc_eviction_cancelled(reason), } @@ -1061,7 +1061,7 @@ impl LayerInner { } } - async fn evict_blocking(&self, only_version: usize) -> Result<(), EvictionCancelled> { + fn evict_blocking(&self, only_version: usize) -> Result<(), EvictionCancelled> { // deleted or detached timeline, don't do anything. 
let Some(timeline) = self.timeline.upgrade() else { return Err(EvictionCancelled::TimelineGone); @@ -1070,7 +1070,7 @@ impl LayerInner { // to avoid starting a new download while we evict, keep holding on to the // permit. let _permit = { - let maybe_downloaded = self.inner.get_mut().await; + let maybe_downloaded = self.inner.get(); let (_weak, permit) = match maybe_downloaded { Some(mut guard) => { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 735b8003b4..f96679ca69 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1268,7 +1268,7 @@ impl Timeline { let mut historic_layers = Vec::new(); for historic_layer in layer_map.iter_historic_layers() { let historic_layer = guard.get_from_desc(&historic_layer); - historic_layers.push(historic_layer.info(reset).await); + historic_layers.push(historic_layer.info(reset)); } LayerMapInfo { From 0fd3cd27cb7ac66df5938bf219e9f12ce7b78c8a Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 9 Feb 2024 17:37:30 +0200 Subject: [PATCH 0146/1571] Tighten up the check for garbage after end-of-tar. Turn the warning into an error, if there is garbage after the end of imported tar file. However, it's normal for 'tar' to append extra empty blocks to the end, so tolerate those without warnings or errors. --- pageserver/src/page_service.rs | 17 ++++++++++++----- test_runner/regress/test_import.py | 10 +++------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 6fc38a76d4..7b660b5eca 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -91,8 +91,8 @@ const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000); /// `tokio_tar` already read the first such block. Read the second all-zeros block, /// and check that there is no more data after the EOF marker. /// -/// XXX: Currently, any trailing data after the EOF marker prints a warning. -/// Perhaps it should be a hard error? +/// 'tar' command can also write extra blocks of zeros, up to a record +/// size, controlled by the --record-size argument. Ignore them too. 
async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()> { use tokio::io::AsyncReadExt; let mut buf = [0u8; 512]; @@ -113,17 +113,24 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<() anyhow::bail!("invalid tar EOF marker"); } - // Drain any data after the EOF marker + // Drain any extra zero-blocks after the EOF marker let mut trailing_bytes = 0; + let mut seen_nonzero_bytes = false; loop { let nbytes = reader.read(&mut buf).await?; trailing_bytes += nbytes; + if !buf.iter().all(|&x| x == 0) { + seen_nonzero_bytes = true; + } if nbytes == 0 { break; } } - if trailing_bytes > 0 { - warn!("ignored {trailing_bytes} unexpected bytes after the tar archive"); + if seen_nonzero_bytes { + anyhow::bail!("unexpected non-zero bytes after the tar archive"); + } + if trailing_bytes % 512 != 0 { + anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive"); } Ok(()) } diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index 3519cbbaab..7942f5cc9b 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -95,7 +95,6 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build ".*InternalServerError.*Tenant .* not found.*", ".*InternalServerError.*Timeline .* not found.*", ".*InternalServerError.*Cannot delete timeline which has child timelines.*", - ".*ignored .* unexpected bytes after the tar archive.*", ] ) @@ -142,12 +141,9 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build with pytest.raises(RuntimeError): import_tar(corrupt_base_tar, wal_tar) - # A tar with trailing garbage is currently accepted. It prints a warnings - # to the pageserver log, however. Check that. - import_tar(base_plus_garbage_tar, wal_tar) - assert env.pageserver.log_contains( - ".*WARN.*ignored .* unexpected bytes after the tar archive.*" - ) + # Importing a tar with trailing garbage fails + with pytest.raises(RuntimeError): + import_tar(base_plus_garbage_tar, wal_tar) client = env.pageserver.http_client() timeline_delete_wait_completed(client, tenant, timeline) From df5e2729a9ac3ddd80876e0d40e3ba55b95ebf0c Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 9 Feb 2024 17:37:34 +0200 Subject: [PATCH 0147/1571] Remove now unused allowlisted errors. I'm not sure when we stopped emitting these, but they don't seem to be needed anymore. --- test_runner/regress/test_import.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index 7942f5cc9b..db385b3e73 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -98,15 +98,6 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build ] ) - env.pageserver.allowed_errors.extend( - [ - # FIXME: we should clean up pageserver to not print this - ".*exited with error: unexpected message type: CopyData.*", - # FIXME: Is this expected? 
- ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*", - ] - ) - def import_tar(base, wal): env.neon_cli.raw_cli( [ From 12b39c9db95ec52353ab2bb3e21bc4a12306ce2b Mon Sep 17 00:00:00 2001 From: John Spray Date: Sat, 10 Feb 2024 11:56:52 +0000 Subject: [PATCH 0148/1571] control_plane: add debug APIs for force-dropping tenant/node (#6702) ## Problem When debugging/supporting this service, we sometimes need it to just forget about a tenant or node, e.g. because of an issue cleanly tearing them down. For example, if I create a tenant with a PlacementPolicy that can't be scheduled on the nodes we have, we would never be able to schedule it for a DELETE to work. ## Summary of changes - Add APIs for dropping nodes and tenants that do no teardown other than removing the entity from the DB and removing any references to it. --- control_plane/attachment_service/src/http.rs | 19 +++++++++ .../attachment_service/src/persistence.rs | 13 ++++++- .../attachment_service/src/service.rs | 39 +++++++++++++++++++ .../attachment_service/src/tenant_state.rs | 14 +++++++ test_runner/regress/test_sharding_service.py | 24 ++++++++++++ 5 files changed, 108 insertions(+), 1 deletion(-) diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs index 8501e4980f..38785d3a98 100644 --- a/control_plane/attachment_service/src/http.rs +++ b/control_plane/attachment_service/src/http.rs @@ -280,6 +280,12 @@ async fn handle_node_list(req: Request) -> Result, ApiError json_response(StatusCode::OK, state.service.node_list().await?) } +async fn handle_node_drop(req: Request) -> Result, ApiError> { + let state = get_state(&req); + let node_id: NodeId = parse_request_param(&req, "node_id")?; + json_response(StatusCode::OK, state.service.node_drop(node_id).await?) +} + async fn handle_node_configure(mut req: Request) -> Result, ApiError> { let node_id: NodeId = parse_request_param(&req, "node_id")?; let config_req = json_request::(&mut req).await?; @@ -320,6 +326,13 @@ async fn handle_tenant_shard_migrate( ) } +async fn handle_tenant_drop(req: Request) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let state = get_state(&req); + + json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?) +} + /// Status endpoint is just used for checking that our HTTP listener is up async fn handle_status(_req: Request) -> Result, ApiError> { json_response(StatusCode::OK, ()) @@ -402,6 +415,12 @@ pub fn make_router( request_span(r, handle_attach_hook) }) .post("/debug/v1/inspect", |r| request_span(r, handle_inspect)) + .post("/debug/v1/tenant/:tenant_id/drop", |r| { + request_span(r, handle_tenant_drop) + }) + .post("/debug/v1/node/:node_id/drop", |r| { + request_span(r, handle_node_drop) + }) .get("/control/v1/tenant/:tenant_id/locate", |r| { tenant_service_handler(r, handle_tenant_locate) }) diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs index 623d625767..457dc43232 100644 --- a/control_plane/attachment_service/src/persistence.rs +++ b/control_plane/attachment_service/src/persistence.rs @@ -260,7 +260,6 @@ impl Persistence { /// Ordering: call this _after_ deleting the tenant on pageservers, but _before_ dropping state for /// the tenant from memory on this server. 
- #[allow(unused)] pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; self.with_conn(move |conn| -> DatabaseResult<()> { @@ -273,6 +272,18 @@ impl Persistence { .await } + pub(crate) async fn delete_node(&self, del_node_id: NodeId) -> DatabaseResult<()> { + use crate::schema::nodes::dsl::*; + self.with_conn(move |conn| -> DatabaseResult<()> { + diesel::delete(nodes) + .filter(node_id.eq(del_node_id.0 as i64)) + .execute(conn)?; + + Ok(()) + }) + .await + } + /// When a tenant invokes the /re-attach API, this function is responsible for doing an efficient /// batched increment of the generations of all tenants whose generation_pageserver is equal to /// the node that called /re-attach. diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index 0331087e0d..95efa8ecd7 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -1804,6 +1804,45 @@ impl Service { Ok(TenantShardMigrateResponse {}) } + /// This is for debug/support only: we simply drop all state for a tenant, without + /// detaching or deleting it on pageservers. + pub(crate) async fn tenant_drop(&self, tenant_id: TenantId) -> Result<(), ApiError> { + self.persistence.delete_tenant(tenant_id).await?; + + let mut locked = self.inner.write().unwrap(); + let mut shards = Vec::new(); + for (tenant_shard_id, _) in locked.tenants.range(TenantShardId::tenant_range(tenant_id)) { + shards.push(*tenant_shard_id); + } + + for shard in shards { + locked.tenants.remove(&shard); + } + + Ok(()) + } + + /// This is for debug/support only: we simply drop all state for a tenant, without + /// detaching or deleting it on pageservers. We do not try and re-schedule any + /// tenants that were on this node. + /// + /// TODO: proper node deletion API that unhooks things more gracefully + pub(crate) async fn node_drop(&self, node_id: NodeId) -> Result<(), ApiError> { + self.persistence.delete_node(node_id).await?; + + let mut locked = self.inner.write().unwrap(); + + for shard in locked.tenants.values_mut() { + shard.deref_node(node_id); + } + + let mut nodes = (*locked.nodes).clone(); + nodes.remove(&node_id); + locked.nodes = Arc::new(nodes); + + Ok(()) + } + pub(crate) async fn node_list(&self) -> Result, ApiError> { // It is convenient to avoid taking the big lock and converting Node to a serializable // structure, by fetching from storage instead of reading in-memory state. diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs index c0ab076a55..1646ed9fcd 100644 --- a/control_plane/attachment_service/src/tenant_state.rs +++ b/control_plane/attachment_service/src/tenant_state.rs @@ -534,4 +534,18 @@ impl TenantState { seq: self.sequence, }) } + + // If we had any state at all referring to this node ID, drop it. Does not + // attempt to reschedule. 
+ pub(crate) fn deref_node(&mut self, node_id: NodeId) { + if self.intent.attached == Some(node_id) { + self.intent.attached = None; + } + + self.intent.secondary.retain(|n| n != &node_id); + + self.observed.locations.remove(&node_id); + + debug_assert!(!self.intent.all_pageservers().contains(&node_id)); + } } diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py index babb0d261c..248d992851 100644 --- a/test_runner/regress/test_sharding_service.py +++ b/test_runner/regress/test_sharding_service.py @@ -387,3 +387,27 @@ def test_sharding_service_compute_hook( assert notifications[1] == expect wait_until(10, 1, received_restart_notification) + + +def test_sharding_service_debug_apis(neon_env_builder: NeonEnvBuilder): + """ + Verify that occasional-use debug APIs work as expected. This is a lightweight test + that just hits the endpoints to check that they don't bitrot. + """ + + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_start() + + tenant_id = TenantId.generate() + env.attachment_service.tenant_create(tenant_id, shard_count=2, shard_stripe_size=8192) + + # These APIs are intentionally not implemented as methods on NeonAttachmentService, as + # they're just for use in unanticipated circumstances. + env.attachment_service.request( + "POST", f"{env.attachment_service_api}/debug/v1/node/{env.pageservers[1].id}/drop" + ) + assert len(env.attachment_service.node_list()) == 1 + + env.attachment_service.request( + "POST", f"{env.attachment_service_api}/debug/v1/tenant/{tenant_id}/drop" + ) From da626fb1facd77b1159e55c5aaa39cc28ed3ed41 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 10 Feb 2024 10:48:11 +0200 Subject: [PATCH 0149/1571] tests: Remove "postgres is running on ... branch" messages It seems like useless chatter. The endpoint.start() itself prints a "Running command ... neon_local endpoint start" message too. 
--- test_runner/regress/test_ancestor_branch.py | 2 -- test_runner/regress/test_backpressure.py | 1 - test_runner/regress/test_branch_behind.py | 1 - test_runner/regress/test_clog_truncate.py | 2 -- test_runner/regress/test_config.py | 2 -- test_runner/regress/test_createdropdb.py | 2 -- test_runner/regress/test_createuser.py | 2 -- test_runner/regress/test_ddl_forwarding.py | 1 - test_runner/regress/test_fullbackup.py | 1 - test_runner/regress/test_gc_aggressive.py | 1 - test_runner/regress/test_layer_bloating.py | 1 - test_runner/regress/test_lfc_resize.py | 1 - test_runner/regress/test_logical_replication.py | 2 -- test_runner/regress/test_lsn_mapping.py | 2 -- test_runner/regress/test_multixact.py | 3 --- test_runner/regress/test_neon_extension.py | 3 --- test_runner/regress/test_old_request_lsn.py | 1 - test_runner/regress/test_parallel_copy.py | 2 -- test_runner/regress/test_pitr_gc.py | 1 - test_runner/regress/test_read_validation.py | 2 -- test_runner/regress/test_readonly_node.py | 1 - test_runner/regress/test_recovery.py | 1 - test_runner/regress/test_subxacts.py | 8 +------- test_runner/regress/test_timeline_size.py | 6 ------ test_runner/regress/test_twophase.py | 1 - test_runner/regress/test_vm_bits.py | 2 -- 26 files changed, 1 insertion(+), 51 deletions(-) diff --git a/test_runner/regress/test_ancestor_branch.py b/test_runner/regress/test_ancestor_branch.py index 0e390ba9e5..d16d2d6a24 100644 --- a/test_runner/regress/test_ancestor_branch.py +++ b/test_runner/regress/test_ancestor_branch.py @@ -45,7 +45,6 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): # Create branch1. env.neon_cli.create_branch("branch1", "main", tenant_id=tenant, ancestor_start_lsn=lsn_100) endpoint_branch1 = env.endpoints.create_start("branch1", tenant_id=tenant) - log.info("postgres is running on 'branch1' branch") branch1_cur = endpoint_branch1.connect().cursor() branch1_timeline = TimelineId(query_scalar(branch1_cur, "SHOW neon.timeline_id")) @@ -68,7 +67,6 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): # Create branch2. 
env.neon_cli.create_branch("branch2", "branch1", tenant_id=tenant, ancestor_start_lsn=lsn_200) endpoint_branch2 = env.endpoints.create_start("branch2", tenant_id=tenant) - log.info("postgres is running on 'branch2' branch") branch2_cur = endpoint_branch2.connect().cursor() branch2_timeline = TimelineId(query_scalar(branch2_cur, "SHOW neon.timeline_id")) diff --git a/test_runner/regress/test_backpressure.py b/test_runner/regress/test_backpressure.py index bc3faf9271..819912dd05 100644 --- a/test_runner/regress/test_backpressure.py +++ b/test_runner/regress/test_backpressure.py @@ -107,7 +107,6 @@ def test_backpressure_received_lsn_lag(neon_env_builder: NeonEnvBuilder): # which is needed for backpressure_lsns() to work endpoint.respec(skip_pg_catalog_updates=False) endpoint.start() - log.info("postgres is running on 'test_backpressure' branch") # setup check thread check_stop_event = threading.Event() diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py index 9879254897..46c74a26b8 100644 --- a/test_runner/regress/test_branch_behind.py +++ b/test_runner/regress/test_branch_behind.py @@ -21,7 +21,6 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): # Branch at the point where only 100 rows were inserted branch_behind_timeline_id = env.neon_cli.create_branch("test_branch_behind") endpoint_main = env.endpoints.create_start("test_branch_behind") - log.info("postgres is running on 'test_branch_behind' branch") main_cur = endpoint_main.connect().cursor() diff --git a/test_runner/regress/test_clog_truncate.py b/test_runner/regress/test_clog_truncate.py index f22eca02cc..26e6e336b9 100644 --- a/test_runner/regress/test_clog_truncate.py +++ b/test_runner/regress/test_clog_truncate.py @@ -25,7 +25,6 @@ def test_clog_truncate(neon_simple_env: NeonEnv): ] endpoint = env.endpoints.create_start("test_clog_truncate", config_lines=config) - log.info("postgres is running on test_clog_truncate branch") # Install extension containing function needed for test endpoint.safe_psql("CREATE EXTENSION neon_test_utils") @@ -62,7 +61,6 @@ def test_clog_truncate(neon_simple_env: NeonEnv): "test_clog_truncate_new", "test_clog_truncate", ancestor_start_lsn=lsn_after_truncation ) endpoint2 = env.endpoints.create_start("test_clog_truncate_new") - log.info("postgres is running on test_clog_truncate_new branch") # check that new node doesn't contain truncated segment pg_xact_0000_path_new = os.path.join(endpoint2.pg_xact_dir_path(), "0000") diff --git a/test_runner/regress/test_config.py b/test_runner/regress/test_config.py index 0ea5784b67..4bb7df1e6a 100644 --- a/test_runner/regress/test_config.py +++ b/test_runner/regress/test_config.py @@ -1,6 +1,5 @@ from contextlib import closing -from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv @@ -13,7 +12,6 @@ def test_config(neon_simple_env: NeonEnv): # change config endpoint = env.endpoints.create_start("test_config", config_lines=["log_min_messages=debug1"]) - log.info("postgres is running on test_config branch") with closing(endpoint.connect()) as conn: with conn.cursor() as cur: diff --git a/test_runner/regress/test_createdropdb.py b/test_runner/regress/test_createdropdb.py index 500d19cf31..f741a9fc87 100644 --- a/test_runner/regress/test_createdropdb.py +++ b/test_runner/regress/test_createdropdb.py @@ -20,7 +20,6 @@ def test_createdb(neon_simple_env: NeonEnv, strategy: str): env.neon_cli.create_branch("test_createdb", "empty") endpoint = env.endpoints.create_start("test_createdb") - 
log.info("postgres is running on 'test_createdb' branch") with endpoint.cursor() as cur: # Cause a 'relmapper' change in the original branch @@ -65,7 +64,6 @@ def test_dropdb(neon_simple_env: NeonEnv, test_output_dir): env = neon_simple_env env.neon_cli.create_branch("test_dropdb", "empty") endpoint = env.endpoints.create_start("test_dropdb") - log.info("postgres is running on 'test_dropdb' branch") with endpoint.cursor() as cur: cur.execute("CREATE DATABASE foodb") diff --git a/test_runner/regress/test_createuser.py b/test_runner/regress/test_createuser.py index f1bc405287..17d9824f52 100644 --- a/test_runner/regress/test_createuser.py +++ b/test_runner/regress/test_createuser.py @@ -1,4 +1,3 @@ -from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv from fixtures.utils import query_scalar @@ -10,7 +9,6 @@ def test_createuser(neon_simple_env: NeonEnv): env = neon_simple_env env.neon_cli.create_branch("test_createuser", "empty") endpoint = env.endpoints.create_start("test_createuser") - log.info("postgres is running on 'test_createuser' branch") with endpoint.cursor() as cur: # Cause a 'relmapper' change in the original branch diff --git a/test_runner/regress/test_ddl_forwarding.py b/test_runner/regress/test_ddl_forwarding.py index 7174487e68..50da673d87 100644 --- a/test_runner/regress/test_ddl_forwarding.py +++ b/test_runner/regress/test_ddl_forwarding.py @@ -296,7 +296,6 @@ def test_ddl_forwarding_invalid_db(neon_simple_env: NeonEnv): # Some non-existent url config_lines=["neon.console_url=http://localhost:9999/unknown/api/v0/roles_and_databases"], ) - log.info("postgres is running on 'test_ddl_forwarding_invalid_db' branch") with endpoint.cursor() as cur: cur.execute("SET neon.forward_ddl = false") diff --git a/test_runner/regress/test_fullbackup.py b/test_runner/regress/test_fullbackup.py index a456c06862..9a22084671 100644 --- a/test_runner/regress/test_fullbackup.py +++ b/test_runner/regress/test_fullbackup.py @@ -26,7 +26,6 @@ def test_fullbackup( env.neon_cli.create_branch("test_fullbackup") endpoint_main = env.endpoints.create_start("test_fullbackup") - log.info("postgres is running on 'test_fullbackup' branch") with endpoint_main.cursor() as cur: timeline = TimelineId(query_scalar(cur, "SHOW neon.timeline_id")) diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py index ef68049ee7..c5070ee815 100644 --- a/test_runner/regress/test_gc_aggressive.py +++ b/test_runner/regress/test_gc_aggressive.py @@ -71,7 +71,6 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() timeline = env.neon_cli.create_branch("test_gc_aggressive", "main") endpoint = env.endpoints.create_start("test_gc_aggressive") - log.info("postgres is running on test_gc_aggressive branch") with endpoint.cursor() as cur: # Create table, and insert the first 100 rows diff --git a/test_runner/regress/test_layer_bloating.py b/test_runner/regress/test_layer_bloating.py index 70b115ad61..bf5834b665 100644 --- a/test_runner/regress/test_layer_bloating.py +++ b/test_runner/regress/test_layer_bloating.py @@ -21,7 +21,6 @@ def test_layer_bloating(neon_simple_env: NeonEnv, vanilla_pg): "test_logical_replication", config_lines=["log_statement=all"] ) - log.info("postgres is running on 'test_logical_replication' branch") pg_conn = endpoint.connect() cur = pg_conn.cursor() diff --git a/test_runner/regress/test_lfc_resize.py b/test_runner/regress/test_lfc_resize.py index 5c68a63d06..2a3442448a 100644 --- 
a/test_runner/regress/test_lfc_resize.py +++ b/test_runner/regress/test_lfc_resize.py @@ -23,7 +23,6 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin): ) n_resize = 10 scale = 10 - log.info("postgres is running on 'test_lfc_resize' branch") def run_pgbench(connstr: str): log.info(f"Start a pgbench workload on pg {connstr}") diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index 059ddf79ec..eff0b124d3 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -26,7 +26,6 @@ def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg): "test_logical_replication", config_lines=["log_statement=all"] ) - log.info("postgres is running on 'test_logical_replication' branch") pg_conn = endpoint.connect() cur = pg_conn.cursor() @@ -315,7 +314,6 @@ def test_slots_and_branching(neon_simple_env: NeonEnv): # Create branch ws. env.neon_cli.create_branch("ws", "main", tenant_id=tenant) ws_branch = env.endpoints.create_start("ws", tenant_id=tenant) - log.info("postgres is running on 'ws' branch") # Check that we can create slot with the same name ws_cur = ws_branch.connect().cursor() diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index 50d7c74af0..5813231aab 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -28,7 +28,6 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): timeline_id = env.neon_cli.create_branch("test_lsn_mapping", tenant_id=tenant_id) endpoint_main = env.endpoints.create_start("test_lsn_mapping", tenant_id=tenant_id) timeline_id = endpoint_main.safe_psql("show neon.timeline_id")[0][0] - log.info("postgres is running on 'main' branch") cur = endpoint_main.connect().cursor() @@ -114,7 +113,6 @@ def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder): new_timeline_id = env.neon_cli.create_branch("test_ts_of_lsn_api") endpoint_main = env.endpoints.create_start("test_ts_of_lsn_api") - log.info("postgres is running on 'test_ts_of_lsn_api' branch") cur = endpoint_main.connect().cursor() # Create table, and insert rows, each in a separate transaction diff --git a/test_runner/regress/test_multixact.py b/test_runner/regress/test_multixact.py index 9db463dc4a..88f7a5db59 100644 --- a/test_runner/regress/test_multixact.py +++ b/test_runner/regress/test_multixact.py @@ -1,4 +1,3 @@ -from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content from fixtures.utils import query_scalar @@ -18,7 +17,6 @@ def test_multixact(neon_simple_env: NeonEnv, test_output_dir): env.neon_cli.create_branch("test_multixact", "empty") endpoint = env.endpoints.create_start("test_multixact") - log.info("postgres is running on 'test_multixact' branch") cur = endpoint.connect().cursor() cur.execute( """ @@ -78,7 +76,6 @@ def test_multixact(neon_simple_env: NeonEnv, test_output_dir): env.neon_cli.create_branch("test_multixact_new", "test_multixact", ancestor_start_lsn=lsn) endpoint_new = env.endpoints.create_start("test_multixact_new") - log.info("postgres is running on 'test_multixact_new' branch") next_multixact_id_new = endpoint_new.safe_psql( "SELECT next_multixact_id FROM pg_control_checkpoint()" )[0][0] diff --git a/test_runner/regress/test_neon_extension.py b/test_runner/regress/test_neon_extension.py index 998f84f968..62225e7b92 100644 --- a/test_runner/regress/test_neon_extension.py +++ 
b/test_runner/regress/test_neon_extension.py @@ -1,6 +1,5 @@ from contextlib import closing -from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder @@ -14,8 +13,6 @@ def test_neon_extension(neon_env_builder: NeonEnvBuilder): endpoint_main.respec(skip_pg_catalog_updates=False) endpoint_main.start() - log.info("postgres is running on 'test_create_extension_neon' branch") - with closing(endpoint_main.connect()) as conn: with conn.cursor() as cur: cur.execute("SELECT extversion from pg_extension where extname='neon'") diff --git a/test_runner/regress/test_old_request_lsn.py b/test_runner/regress/test_old_request_lsn.py index 9b0bab5125..391305c58a 100644 --- a/test_runner/regress/test_old_request_lsn.py +++ b/test_runner/regress/test_old_request_lsn.py @@ -20,7 +20,6 @@ def test_old_request_lsn(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() env.neon_cli.create_branch("test_old_request_lsn", "main") endpoint = env.endpoints.create_start("test_old_request_lsn") - log.info("postgres is running on test_old_request_lsn branch") pg_conn = endpoint.connect() cur = pg_conn.cursor() diff --git a/test_runner/regress/test_parallel_copy.py b/test_runner/regress/test_parallel_copy.py index 6f74d50b92..b33e387a66 100644 --- a/test_runner/regress/test_parallel_copy.py +++ b/test_runner/regress/test_parallel_copy.py @@ -1,7 +1,6 @@ import asyncio from io import BytesIO -from fixtures.log_helper import log from fixtures.neon_fixtures import Endpoint, NeonEnv @@ -44,7 +43,6 @@ def test_parallel_copy(neon_simple_env: NeonEnv, n_parallel=5): env = neon_simple_env env.neon_cli.create_branch("test_parallel_copy", "empty") endpoint = env.endpoints.create_start("test_parallel_copy") - log.info("postgres is running on 'test_parallel_copy' branch") # Create test table conn = endpoint.connect() diff --git a/test_runner/regress/test_pitr_gc.py b/test_runner/regress/test_pitr_gc.py index c2ea5b332a..539ef3eda7 100644 --- a/test_runner/regress/test_pitr_gc.py +++ b/test_runner/regress/test_pitr_gc.py @@ -16,7 +16,6 @@ def test_pitr_gc(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() endpoint_main = env.endpoints.create_start("main") - log.info("postgres is running on 'main' branch") main_pg_conn = endpoint_main.connect() main_cur = main_pg_conn.cursor() diff --git a/test_runner/regress/test_read_validation.py b/test_runner/regress/test_read_validation.py index d695410efc..effb7e83f9 100644 --- a/test_runner/regress/test_read_validation.py +++ b/test_runner/regress/test_read_validation.py @@ -18,7 +18,6 @@ def test_read_validation(neon_simple_env: NeonEnv): env.neon_cli.create_branch("test_read_validation", "empty") endpoint = env.endpoints.create_start("test_read_validation") - log.info("postgres is running on 'test_read_validation' branch") with closing(endpoint.connect()) as con: with con.cursor() as c: @@ -145,7 +144,6 @@ def test_read_validation_neg(neon_simple_env: NeonEnv): env.pageserver.allowed_errors.append(".*invalid LSN\\(0\\) in request.*") endpoint = env.endpoints.create_start("test_read_validation_neg") - log.info("postgres is running on 'test_read_validation_neg' branch") with closing(endpoint.connect()) as con: with con.cursor() as c: diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index 2d641e36a7..b7c8f36107 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -16,7 +16,6 @@ def test_readonly_node(neon_simple_env: NeonEnv): env 
= neon_simple_env env.neon_cli.create_branch("test_readonly_node", "empty") endpoint_main = env.endpoints.create_start("test_readonly_node") - log.info("postgres is running on 'test_readonly_node' branch") env.pageserver.allowed_errors.append(".*basebackup .* failed: invalid basebackup lsn.*") diff --git a/test_runner/regress/test_recovery.py b/test_runner/regress/test_recovery.py index 9d7a4a8fd6..6aac1e1d84 100644 --- a/test_runner/regress/test_recovery.py +++ b/test_runner/regress/test_recovery.py @@ -19,7 +19,6 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder): env.neon_cli.create_branch("test_pageserver_recovery", "main") endpoint = env.endpoints.create_start("test_pageserver_recovery") - log.info("postgres is running on 'test_pageserver_recovery' branch") with closing(endpoint.connect()) as conn: with conn.cursor() as cur: diff --git a/test_runner/regress/test_subxacts.py b/test_runner/regress/test_subxacts.py index eb96a8faa4..10cb00c780 100644 --- a/test_runner/regress/test_subxacts.py +++ b/test_runner/regress/test_subxacts.py @@ -1,4 +1,3 @@ -from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content @@ -13,15 +12,10 @@ def test_subxacts(neon_simple_env: NeonEnv, test_output_dir): env.neon_cli.create_branch("test_subxacts", "empty") endpoint = env.endpoints.create_start("test_subxacts") - log.info("postgres is running on 'test_subxacts' branch") pg_conn = endpoint.connect() cur = pg_conn.cursor() - cur.execute( - """ - CREATE TABLE t1(i int, j int); - """ - ) + cur.execute("CREATE TABLE t1(i int, j int);") cur.execute("select pg_switch_wal();") diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index cd7203bba6..a3f99948d3 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -43,7 +43,6 @@ def test_timeline_size(neon_simple_env: NeonEnv): client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id) endpoint_main = env.endpoints.create_start("test_timeline_size") - log.info("postgres is running on 'test_timeline_size' branch") with closing(endpoint_main.connect()) as conn: with conn.cursor() as cur: @@ -79,7 +78,6 @@ def test_timeline_size_createdropdb(neon_simple_env: NeonEnv): ) endpoint_main = env.endpoints.create_start("test_timeline_size_createdropdb") - log.info("postgres is running on 'test_timeline_size_createdropdb' branch") with closing(endpoint_main.connect()) as conn: with conn.cursor() as cur: @@ -162,8 +160,6 @@ def test_timeline_size_quota_on_startup(neon_env_builder: NeonEnvBuilder): ) endpoint_main.start() - log.info("postgres is running on 'test_timeline_size_quota_on_startup' branch") - with closing(endpoint_main.connect()) as conn: with conn.cursor() as cur: cur.execute("CREATE TABLE foo (t text)") @@ -231,8 +227,6 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder): endpoint_main.respec(skip_pg_catalog_updates=False) endpoint_main.start() - log.info("postgres is running on 'test_timeline_size_quota' branch") - with closing(endpoint_main.connect()) as conn: with conn.cursor() as cur: cur.execute("CREATE TABLE foo (t text)") diff --git a/test_runner/regress/test_twophase.py b/test_runner/regress/test_twophase.py index 305271c715..dd76689008 100644 --- a/test_runner/regress/test_twophase.py +++ b/test_runner/regress/test_twophase.py @@ -13,7 +13,6 @@ def test_twophase(neon_simple_env: NeonEnv): endpoint = env.endpoints.create_start( "test_twophase", 
config_lines=["max_prepared_transactions=5"] ) - log.info("postgres is running on 'test_twophase' branch") conn = endpoint.connect() cur = conn.cursor() diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py index 06c30b8d81..1377bed6f6 100644 --- a/test_runner/regress/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -14,7 +14,6 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv): env.neon_cli.create_branch("test_vm_bit_clear", "empty") endpoint = env.endpoints.create_start("test_vm_bit_clear") - log.info("postgres is running on 'test_vm_bit_clear' branch") pg_conn = endpoint.connect() cur = pg_conn.cursor() @@ -93,7 +92,6 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv): # server at the right point-in-time avoids that full-page image. endpoint_new = env.endpoints.create_start("test_vm_bit_clear_new") - log.info("postgres is running on 'test_vm_bit_clear_new' branch") pg_new_conn = endpoint_new.connect() cur_new = pg_new_conn.cursor() From 241dcbf70ce117a8b956fb990f13fee67029a197 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 10 Feb 2024 10:50:52 +0200 Subject: [PATCH 0150/1571] tests: Remove "Running in ..." log message from every CLI call It's always the same directory, the test's "repo" directory. --- test_runner/fixtures/neon_fixtures.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 231eebff52..31acb045ae 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1400,7 +1400,6 @@ class AbstractNeonCli(abc.ABC): args = [bin_neon] + arguments log.info('Running command "{}"'.format(" ".join(args))) - log.info(f'Running in "{self.env.repo_dir}"') env_vars = os.environ.copy() env_vars["NEON_REPO_DIR"] = str(self.env.repo_dir) From d77583c86ab3cf4d5b555d86a7b665c1457f97c8 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 10 Feb 2024 11:10:48 +0200 Subject: [PATCH 0151/1571] tests: Remove obsolete allowlist entries Commit 9a6c0be823 removed the code that printed these warnings: marking {} as locally complete, while it doesnt exist in remote index No timelines to attach received Remove those warnings from all the allowlists in tests. --- test_runner/regress/test_compatibility.py | 5 ----- test_runner/regress/test_import.py | 5 ----- test_runner/regress/test_remote_storage.py | 3 --- test_runner/regress/test_tenant_relocation.py | 2 -- test_runner/regress/test_tenants.py | 1 - .../regress/test_tenants_with_remote_storage.py | 16 ---------------- test_runner/regress/test_wal_acceptor.py | 10 ---------- 7 files changed, 42 deletions(-) diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index d5d70951be..826821e52b 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -112,11 +112,6 @@ def test_create_snapshot( env = neon_env_builder.init_start() endpoint = env.endpoints.create_start("main") - # FIXME: Is this expected? 
- env.pageserver.allowed_errors.append( - ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" - ) - pg_bin.run_capture(["pgbench", "--initialize", "--scale=10", endpoint.connstr()]) pg_bin.run_capture(["pgbench", "--time=60", "--progress=2", endpoint.connstr()]) pg_bin.run_capture( diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index db385b3e73..ec57860033 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -159,11 +159,6 @@ def test_import_from_pageserver_small( neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() - # FIXME: Is this expected? - env.pageserver.allowed_errors.append( - ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" - ) - timeline = env.neon_cli.create_branch("test_import_from_pageserver_small") endpoint = env.endpoints.create_start("test_import_from_pageserver_small") diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 98b2e856ec..32b4f54fbd 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -73,9 +73,6 @@ def test_remote_storage_backup_and_restore( env.pageserver.allowed_errors.extend( [ - # FIXME: Is this expected? - ".*marking .* as locally complete, while it doesnt exist in remote index.*", - ".*No timelines to attach received.*", ".*Failed to get local tenant state.*", # FIXME retry downloads without throwing errors ".*failed to load remote timeline.*", diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index 80b4fab1d3..f4eb6b092d 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -213,8 +213,6 @@ def test_tenant_relocation( env.pageservers[0].allowed_errors.extend( [ - # FIXME: Is this expected? - ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*", # Needed for detach polling on the original pageserver f".*NotFound: tenant {tenant_id}.*", # We will dual-attach in this test, so stale generations are expected diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index ba391a69d8..bf317808ee 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -285,7 +285,6 @@ def test_pageserver_with_empty_tenants(neon_env_builder: NeonEnvBuilder): env.pageserver.allowed_errors.extend( [ - ".*marking .* as locally complete, while it doesnt exist in remote index.*", ".*load failed.*list timelines directory.*", ] ) diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 6f05d7f7cb..1c693a0df5 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -61,11 +61,6 @@ async def all_tenants_workload(env: NeonEnv, tenants_endpoints): def test_tenants_many(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() - # FIXME: Is this expected? 
- env.pageserver.allowed_errors.append( - ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" - ) - tenants_endpoints: List[Tuple[TenantId, Endpoint]] = [] for _ in range(1, 5): @@ -117,14 +112,6 @@ def test_tenants_attached_after_download(neon_env_builder: NeonEnvBuilder): ##### First start, insert secret data and upload it to the remote storage env = neon_env_builder.init_start() - env.pageserver.allowed_errors.extend( - [ - # FIXME: Are these expected? - ".*No timelines to attach received.*", - ".*marking .* as locally complete, while it doesnt exist in remote index.*", - ] - ) - pageserver_http = env.pageserver.http_client() endpoint = env.endpoints.create_start("main") @@ -223,9 +210,6 @@ def test_tenant_redownloads_truncated_file_on_startup( env.pageserver.allowed_errors.extend( [ ".*removing local file .* because .*", - # FIXME: Are these expected? - ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*", - ".*No timelines to attach received.*", ] ) diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index dab446fcfd..3d7bba6153 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -280,11 +280,6 @@ def test_broker(neon_env_builder: NeonEnvBuilder): tenant_id = env.initial_tenant timeline_id = env.neon_cli.create_branch("test_broker", "main") - # FIXME: Is this expected? - env.pageserver.allowed_errors.append( - ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" - ) - endpoint = env.endpoints.create_start("test_broker") endpoint.safe_psql("CREATE TABLE t(key int primary key, value text)") @@ -342,11 +337,6 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): neon_env_builder.auth_enabled = auth_enabled env = neon_env_builder.init_start() - # FIXME: Is this expected? - env.pageserver.allowed_errors.append( - ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" - ) - tenant_id = env.initial_tenant timeline_id = env.neon_cli.create_branch("test_safekeepers_wal_removal") endpoint = env.endpoints.create_start("test_safekeepers_wal_removal") From e5daf366ac92a5398c09ea956ba03ac03848d3f8 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 10 Feb 2024 11:25:47 +0200 Subject: [PATCH 0152/1571] tests: Remove unnecessary port config with VanillaPostgres class VanillaPostgres constructor prints the "port={port}" line to the config file, no need to do it in the callers. The TODO comment that it would be nice if VanillaPostgres could pick the port by itself is still valid though. 
--- test_runner/fixtures/neon_fixtures.py | 1 + test_runner/regress/test_fullbackup.py | 6 ------ test_runner/regress/test_timeline_size.py | 1 - 3 files changed, 1 insertion(+), 7 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 31acb045ae..faa8effe10 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2458,6 +2458,7 @@ def pg_bin(test_output_dir: Path, pg_distrib_dir: Path, pg_version: PgVersion) - return PgBin(test_output_dir, pg_distrib_dir, pg_version) +# TODO make port an optional argument class VanillaPostgres(PgProtocol): def __init__(self, pgdatadir: Path, pg_bin: PgBin, port: int, init: bool = True): super().__init__(host="localhost", port=port, dbname="postgres") diff --git a/test_runner/regress/test_fullbackup.py b/test_runner/regress/test_fullbackup.py index 9a22084671..d5f898492b 100644 --- a/test_runner/regress/test_fullbackup.py +++ b/test_runner/regress/test_fullbackup.py @@ -66,12 +66,6 @@ def test_fullbackup( # Restore from the backup and find the data we inserted port = port_distributor.get_port() with VanillaPostgres(restored_dir_path, pg_bin, port, init=False) as vanilla_pg: - # TODO make port an optional argument - vanilla_pg.configure( - [ - f"port={port}", - ] - ) vanilla_pg.start() num_rows_found = vanilla_pg.safe_psql("select count(*) from tbl;", user="cloud_admin")[0][0] assert num_rows == num_rows_found diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index a3f99948d3..0788c49c7b 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -579,7 +579,6 @@ def test_timeline_size_metrics( pg_bin = PgBin(test_output_dir, pg_distrib_dir, pg_version) port = port_distributor.get_port() with VanillaPostgres(pgdatadir, pg_bin, port) as vanilla_pg: - vanilla_pg.configure([f"port={port}"]) vanilla_pg.start() # Create database based on template0 because we can't connect to template0 From aeda82a0105f18393e8d56d7ff2f6202059edde6 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 12 Feb 2024 11:57:29 +0200 Subject: [PATCH 0153/1571] fix(heavier_once_cell): assertion failure can be hit (#6722) @problame noticed that the `tokio::sync::AcquireError` branch assertion can be hit like in the added test. We haven't seen this yet in production, but I'd prefer not to see it there. There `take_and_deinit` is being used, but this race must be quite timing sensitive. Rework of earlier: #6652. 
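As a hedged illustration only (not part of the patch): the rough shape of the API the new test exercises, assuming `OnceCell` and `InitPermit` from `utils::sync::heavier_once_cell` are in scope and `get_or_init` is callable as the tests below suggest. The effect of the fix is in the comment: a waiter queued on an init semaphore that a concurrent `take_and_deinit` swapped out now retries against the cell's new semaphore instead of asserting.

    async fn demo(cell: &OnceCell<&'static str>) {
        // The factory receives an InitPermit and must return it with the value.
        // If another task ran take_and_deinit() while this call was queued on
        // the old init_semaphore, get_or_init() now notices the semaphore was
        // replaced and loops, rather than hitting the
        // "semaphore got closed, must be initialized" assertion.
        let guard = cell
            .get_or_init(|permit| async move {
                Ok::<_, std::convert::Infallible>(("initialized", permit))
            })
            .await
            .unwrap();
        assert_eq!(*guard, "initialized");
    }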
--- libs/utils/src/sync/heavier_once_cell.rs | 174 ++++++++++++++++++----- 1 file changed, 138 insertions(+), 36 deletions(-) diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs index 0ccaf4e716..0773abba2d 100644 --- a/libs/utils/src/sync/heavier_once_cell.rs +++ b/libs/utils/src/sync/heavier_once_cell.rs @@ -69,37 +69,44 @@ impl OnceCell { F: FnOnce(InitPermit) -> Fut, Fut: std::future::Future>, { - let sem = { + loop { + let sem = { + let guard = self.inner.lock().unwrap(); + if guard.value.is_some() { + return Ok(Guard(guard)); + } + guard.init_semaphore.clone() + }; + + { + let permit = { + // increment the count for the duration of queued + let _guard = CountWaitingInitializers::start(self); + sem.acquire().await + }; + + let Ok(permit) = permit else { + let guard = self.inner.lock().unwrap(); + if !Arc::ptr_eq(&sem, &guard.init_semaphore) { + // there was a take_and_deinit in between + continue; + } + assert!( + guard.value.is_some(), + "semaphore got closed, must be initialized" + ); + return Ok(Guard(guard)); + }; + + permit.forget(); + } + + let permit = InitPermit(sem); + let (value, _permit) = factory(permit).await?; + let guard = self.inner.lock().unwrap(); - if guard.value.is_some() { - return Ok(Guard(guard)); - } - guard.init_semaphore.clone() - }; - let permit = { - // increment the count for the duration of queued - let _guard = CountWaitingInitializers::start(self); - sem.acquire_owned().await - }; - - match permit { - Ok(permit) => { - let permit = InitPermit(permit); - let (value, _permit) = factory(permit).await?; - - let guard = self.inner.lock().unwrap(); - - Ok(Self::set0(value, guard)) - } - Err(_closed) => { - let guard = self.inner.lock().unwrap(); - assert!( - guard.value.is_some(), - "semaphore got closed, must be initialized" - ); - return Ok(Guard(guard)); - } + return Ok(Self::set0(value, guard)); } } @@ -197,27 +204,41 @@ impl<'a, T> Guard<'a, T> { /// [`OnceCell::get_or_init`] will wait on it to complete. pub fn take_and_deinit(&mut self) -> (T, InitPermit) { let mut swapped = Inner::default(); - let permit = swapped - .init_semaphore - .clone() - .try_acquire_owned() - .expect("we just created this"); + let sem = swapped.init_semaphore.clone(); + // acquire and forget right away, moving the control over to InitPermit + sem.try_acquire().expect("we just created this").forget(); std::mem::swap(&mut *self.0, &mut swapped); swapped .value - .map(|v| (v, InitPermit(permit))) + .map(|v| (v, InitPermit(sem))) .expect("guard is not created unless value has been initialized") } } /// Type held by OnceCell (de)initializing task. -pub struct InitPermit(tokio::sync::OwnedSemaphorePermit); +/// +/// On drop, this type will return the permit. 
+pub struct InitPermit(Arc); + +impl Drop for InitPermit { + fn drop(&mut self) { + assert_eq!( + self.0.available_permits(), + 0, + "InitPermit should only exist as the unique permit" + ); + self.0.add_permits(1); + } +} #[cfg(test)] mod tests { + use futures::Future; + use super::*; use std::{ convert::Infallible, + pin::{pin, Pin}, sync::atomic::{AtomicUsize, Ordering}, time::Duration, }; @@ -380,4 +401,85 @@ mod tests { .unwrap(); assert_eq!(*g, "now initialized"); } + + #[tokio::test(start_paused = true)] + async fn reproduce_init_take_deinit_race() { + init_take_deinit_scenario(|cell, factory| { + Box::pin(async { + cell.get_or_init(factory).await.unwrap(); + }) + }) + .await; + } + + type BoxedInitFuture = Pin>>>; + type BoxedInitFunction = Box BoxedInitFuture>; + + /// Reproduce an assertion failure. + /// + /// This has interesting generics to be generic between `get_or_init` and `get_mut_or_init`. + /// We currently only have one, but the structure is kept. + async fn init_take_deinit_scenario(init_way: F) + where + F: for<'a> Fn( + &'a OnceCell<&'static str>, + BoxedInitFunction<&'static str, Infallible>, + ) -> Pin + 'a>>, + { + let cell = OnceCell::default(); + + // acquire the init_semaphore only permit to drive initializing tasks in order to waiting + // on the same semaphore. + let permit = cell + .inner + .lock() + .unwrap() + .init_semaphore + .clone() + .try_acquire_owned() + .unwrap(); + + let mut t1 = pin!(init_way( + &cell, + Box::new(|permit| Box::pin(async move { Ok(("t1", permit)) })), + )); + + let mut t2 = pin!(init_way( + &cell, + Box::new(|permit| Box::pin(async move { Ok(("t2", permit)) })), + )); + + // drive t2 first to the init_semaphore -- the timeout will be hit once t2 future can + // no longer make progress + tokio::select! { + _ = &mut t2 => unreachable!("it cannot get permit"), + _ = tokio::time::sleep(Duration::from_secs(3600 * 24 * 7 * 365)) => {} + } + + // followed by t1 in the init_semaphore + tokio::select! { + _ = &mut t1 => unreachable!("it cannot get permit"), + _ = tokio::time::sleep(Duration::from_secs(3600 * 24 * 7 * 365)) => {} + } + + // now let t2 proceed and initialize + drop(permit); + t2.await; + + let (s, permit) = { cell.get().unwrap().take_and_deinit() }; + assert_eq!("t2", s); + + // now originally t1 would see the semaphore it has as closed. it cannot yet get a permit from + // the new one. + tokio::select! { + _ = &mut t1 => unreachable!("it cannot get permit"), + _ = tokio::time::sleep(Duration::from_secs(3600 * 24 * 7 * 365)) => {} + } + + // only now we get to initialize it + drop(permit); + t1.await; + + assert_eq!("t1", *cell.get().unwrap()); + } } From c77411e9035ac38925652bf1f772b333acb0b9ac Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 12 Feb 2024 14:52:20 +0200 Subject: [PATCH 0154/1571] cleanup around `attach` (#6621) The smaller changes I found while looking around #6584. - rustfmt was not able to format handle_timeline_create - fix Generation::get_suffix always allocating - Generation was missing a `#[track_caller]` for panicky method - attach has a lot of issues, but even with this PR it cannot be formatted by rustfmt - moved the `preload` span to be on top of `attach` -- it is awaited inline - make disconnected panic! or unreachable! 
into expect, expect_err --- libs/utils/src/generation.rs | 41 ++++- pageserver/src/http/routes.rs | 76 +++++---- pageserver/src/tenant.rs | 199 +++++++++++------------ pageserver/src/tenant/delete.rs | 8 +- pageserver/src/tenant/timeline/delete.rs | 9 +- 5 files changed, 177 insertions(+), 156 deletions(-) diff --git a/libs/utils/src/generation.rs b/libs/utils/src/generation.rs index 46eadee1da..6f6c46cfeb 100644 --- a/libs/utils/src/generation.rs +++ b/libs/utils/src/generation.rs @@ -54,12 +54,10 @@ impl Generation { } #[track_caller] - pub fn get_suffix(&self) -> String { + pub fn get_suffix(&self) -> impl std::fmt::Display { match self { - Self::Valid(v) => { - format!("-{:08x}", v) - } - Self::None => "".into(), + Self::Valid(v) => GenerationFileSuffix(Some(*v)), + Self::None => GenerationFileSuffix(None), Self::Broken => { panic!("Tried to use a broken generation"); } @@ -90,6 +88,7 @@ impl Generation { } } + #[track_caller] pub fn next(&self) -> Generation { match self { Self::Valid(n) => Self::Valid(*n + 1), @@ -107,6 +106,18 @@ impl Generation { } } +struct GenerationFileSuffix(Option); + +impl std::fmt::Display for GenerationFileSuffix { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if let Some(g) = self.0 { + write!(f, "-{g:08x}") + } else { + Ok(()) + } + } +} + impl Serialize for Generation { fn serialize(&self, serializer: S) -> Result where @@ -164,4 +175,24 @@ mod test { assert!(Generation::none() < Generation::new(0)); assert!(Generation::none() < Generation::new(1)); } + + #[test] + fn suffix_is_stable() { + use std::fmt::Write as _; + + // the suffix must remain stable through-out the pageserver remote storage evolution and + // not be changed accidentially without thinking about migration + let examples = [ + (line!(), Generation::None, ""), + (line!(), Generation::Valid(0), "-00000000"), + (line!(), Generation::Valid(u32::MAX), "-ffffffff"), + ]; + + let mut s = String::new(); + for (line, gen, expected) in examples { + s.clear(); + write!(s, "{}", &gen.get_suffix()).expect("string grows"); + assert_eq!(s, expected, "example on {line}"); + } + } } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index af9a3c7301..4be8ee9892 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -488,7 +488,9 @@ async fn timeline_create_handler( let state = get_state(&request); async { - let tenant = state.tenant_manager.get_attached_tenant_shard(tenant_shard_id, false)?; + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id, false)?; tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; @@ -498,48 +500,62 @@ async fn timeline_create_handler( tracing::info!("bootstrapping"); } - match tenant.create_timeline( - new_timeline_id, - request_data.ancestor_timeline_id.map(TimelineId::from), - request_data.ancestor_start_lsn, - request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION), - request_data.existing_initdb_timeline_id, - state.broker_client.clone(), - &ctx, - ) - .await { + match tenant + .create_timeline( + new_timeline_id, + request_data.ancestor_timeline_id, + request_data.ancestor_start_lsn, + request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION), + request_data.existing_initdb_timeline_id, + state.broker_client.clone(), + &ctx, + ) + .await + { Ok(new_timeline) => { // Created. Construct a TimelineInfo for it. 
- let timeline_info = build_timeline_info_common(&new_timeline, &ctx, tenant::timeline::GetLogicalSizePriority::User) - .await - .map_err(ApiError::InternalServerError)?; + let timeline_info = build_timeline_info_common( + &new_timeline, + &ctx, + tenant::timeline::GetLogicalSizePriority::User, + ) + .await + .map_err(ApiError::InternalServerError)?; json_response(StatusCode::CREATED, timeline_info) } Err(_) if tenant.cancel.is_cancelled() => { // In case we get some ugly error type during shutdown, cast it into a clean 503. - json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg("Tenant shutting down".to_string())) - } - Err(tenant::CreateTimelineError::Conflict | tenant::CreateTimelineError::AlreadyCreating) => { - json_response(StatusCode::CONFLICT, ()) - } - Err(tenant::CreateTimelineError::AncestorLsn(err)) => { - json_response(StatusCode::NOT_ACCEPTABLE, HttpErrorBody::from_msg( - format!("{err:#}") - )) - } - Err(e @ tenant::CreateTimelineError::AncestorNotActive) => { - json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg(e.to_string())) - } - Err(tenant::CreateTimelineError::ShuttingDown) => { - json_response(StatusCode::SERVICE_UNAVAILABLE,HttpErrorBody::from_msg("tenant shutting down".to_string())) + json_response( + StatusCode::SERVICE_UNAVAILABLE, + HttpErrorBody::from_msg("Tenant shutting down".to_string()), + ) } + Err( + tenant::CreateTimelineError::Conflict + | tenant::CreateTimelineError::AlreadyCreating, + ) => json_response(StatusCode::CONFLICT, ()), + Err(tenant::CreateTimelineError::AncestorLsn(err)) => json_response( + StatusCode::NOT_ACCEPTABLE, + HttpErrorBody::from_msg(format!("{err:#}")), + ), + Err(e @ tenant::CreateTimelineError::AncestorNotActive) => json_response( + StatusCode::SERVICE_UNAVAILABLE, + HttpErrorBody::from_msg(e.to_string()), + ), + Err(tenant::CreateTimelineError::ShuttingDown) => json_response( + StatusCode::SERVICE_UNAVAILABLE, + HttpErrorBody::from_msg("tenant shutting down".to_string()), + ), Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)), } } .instrument(info_span!("timeline_create", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), - timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version)) + timeline_id = %new_timeline_id, + lsn=?request_data.ancestor_start_lsn, + pg_version=?request_data.pg_version + )) .await } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 4446c410b0..d946c57118 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -644,10 +644,10 @@ impl Tenant { // The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if // we shut down while attaching. 
- let Ok(attach_gate_guard) = tenant.gate.enter() else { - // We just created the Tenant: nothing else can have shut it down yet - unreachable!(); - }; + let attach_gate_guard = tenant + .gate + .enter() + .expect("We just created the Tenant: nothing else can have shut it down yet"); // Do all the hard work in the background let tenant_clone = Arc::clone(&tenant); @@ -755,36 +755,27 @@ impl Tenant { AttachType::Normal }; - let preload_timer = TENANT.preload.start_timer(); - let preload = match mode { - SpawnMode::Create => { - // Don't count the skipped preload into the histogram of preload durations - preload_timer.stop_and_discard(); + let preload = match (&mode, &remote_storage) { + (SpawnMode::Create, _) => { None }, - SpawnMode::Normal => { - match &remote_storage { - Some(remote_storage) => Some( - match tenant_clone - .preload(remote_storage, task_mgr::shutdown_token()) - .instrument( - tracing::info_span!(parent: None, "attach_preload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()), - ) - .await { - Ok(p) => { - preload_timer.observe_duration(); - p - } - , - Err(e) => { - make_broken(&tenant_clone, anyhow::anyhow!(e)); - return Ok(()); - } - }, - ), - None => None, + (SpawnMode::Normal, Some(remote_storage)) => { + let _preload_timer = TENANT.preload.start_timer(); + let res = tenant_clone + .preload(remote_storage, task_mgr::shutdown_token()) + .await; + match res { + Ok(p) => Some(p), + Err(e) => { + make_broken(&tenant_clone, anyhow::anyhow!(e)); + return Ok(()); + } } } + (SpawnMode::Normal, None) => { + let _preload_timer = TENANT.preload.start_timer(); + None + } }; // Remote preload is complete. @@ -820,36 +811,37 @@ impl Tenant { info!("ready for backgound jobs barrier"); } - match DeleteTenantFlow::resume_from_attach( + let deleted = DeleteTenantFlow::resume_from_attach( deletion, &tenant_clone, preload, tenants, &ctx, ) - .await - { - Err(err) => { - make_broken(&tenant_clone, anyhow::anyhow!(err)); - return Ok(()); - } - Ok(()) => return Ok(()), + .await; + + if let Err(e) = deleted { + make_broken(&tenant_clone, anyhow::anyhow!(e)); } + + return Ok(()); } // We will time the duration of the attach phase unless this is a creation (attach will do no work) - let attach_timer = match mode { - SpawnMode::Create => None, - SpawnMode::Normal => {Some(TENANT.attach.start_timer())} + let attached = { + let _attach_timer = match mode { + SpawnMode::Create => None, + SpawnMode::Normal => {Some(TENANT.attach.start_timer())} + }; + tenant_clone.attach(preload, mode, &ctx).await }; - match tenant_clone.attach(preload, mode, &ctx).await { + + match attached { Ok(()) => { info!("attach finished, activating"); - if let Some(t)= attach_timer {t.observe_duration();} tenant_clone.activate(broker_client, None, &ctx); } Err(e) => { - if let Some(t)= attach_timer {t.observe_duration();} make_broken(&tenant_clone, anyhow::anyhow!(e)); } } @@ -862,34 +854,26 @@ impl Tenant { // logical size calculations: if logical size calculation semaphore is saturated, // then warmup will wait for that before proceeding to the next tenant. 
if let AttachType::Warmup(_permit) = attach_type { - let mut futs = FuturesUnordered::new(); - let timelines: Vec<_> = tenant_clone.timelines.lock().unwrap().values().cloned().collect(); - for t in timelines { - futs.push(t.await_initial_logical_size()) - } + let mut futs: FuturesUnordered<_> = tenant_clone.timelines.lock().unwrap().values().cloned().map(|t| t.await_initial_logical_size()).collect(); tracing::info!("Waiting for initial logical sizes while warming up..."); - while futs.next().await.is_some() { - - } + while futs.next().await.is_some() {} tracing::info!("Warm-up complete"); } Ok(()) } - .instrument({ - let span = tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), gen=?generation); - span.follows_from(Span::current()); - span - }), + .instrument(tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), gen=?generation)), ); Ok(tenant) } + #[instrument(skip_all)] pub(crate) async fn preload( self: &Arc, remote_storage: &GenericRemoteStorage, cancel: CancellationToken, ) -> anyhow::Result { + span::debug_assert_current_span_has_tenant_id(); // Get list of remote timelines // download index files for every tenant timeline info!("listing remote timelines"); @@ -3982,6 +3966,8 @@ pub(crate) mod harness { } } + #[cfg(test)] + #[derive(Debug)] enum LoadMode { Local, Remote, @@ -4064,7 +4050,7 @@ pub(crate) mod harness { info_span!("TenantHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()) } - pub async fn load(&self) -> (Arc, RequestContext) { + pub(crate) async fn load(&self) -> (Arc, RequestContext) { let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); ( self.try_load(&ctx) @@ -4074,31 +4060,31 @@ pub(crate) mod harness { ) } - fn remote_empty(&self) -> bool { - let tenant_path = self.conf.tenant_path(&self.tenant_shard_id); - let remote_tenant_dir = self - .remote_fs_dir - .join(tenant_path.strip_prefix(&self.conf.workdir).unwrap()); - if std::fs::metadata(&remote_tenant_dir).is_err() { - return true; - } - - match std::fs::read_dir(remote_tenant_dir) - .unwrap() - .flatten() - .next() - { - Some(entry) => { - tracing::debug!( - "remote_empty: not empty, found file {}", - entry.file_name().to_string_lossy(), - ); - false - } - None => true, - } + /// For tests that specifically want to exercise the local load path, which does + /// not use remote storage. + pub(crate) async fn try_load_local( + &self, + ctx: &RequestContext, + ) -> anyhow::Result> { + self.do_try_load(ctx, LoadMode::Local).await } + /// The 'load' in this function is either a local load or a normal attachment, + pub(crate) async fn try_load(&self, ctx: &RequestContext) -> anyhow::Result> { + // If we have nothing in remote storage, must use load_local instead of attach: attach + // will error out if there are no timelines. + // + // See https://github.com/neondatabase/neon/issues/5456 for how we will eliminate + // this weird state of a Tenant which exists but doesn't have any timelines. 
+ let mode = match self.remote_empty() { + true => LoadMode::Local, + false => LoadMode::Remote, + }; + + self.do_try_load(ctx, mode).await + } + + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), ?mode))] async fn do_try_load( &self, ctx: &RequestContext, @@ -4125,20 +4111,13 @@ pub(crate) mod harness { match mode { LoadMode::Local => { - tenant - .load_local(ctx) - .instrument(info_span!("try_load", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())) - .await?; + tenant.load_local(ctx).await?; } LoadMode::Remote => { let preload = tenant .preload(&self.remote_storage, CancellationToken::new()) - .instrument(info_span!("try_load_preload", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())) - .await?; - tenant - .attach(Some(preload), SpawnMode::Normal, ctx) - .instrument(info_span!("try_load", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())) .await?; + tenant.attach(Some(preload), SpawnMode::Normal, ctx).await?; } } @@ -4149,25 +4128,29 @@ pub(crate) mod harness { Ok(tenant) } - /// For tests that specifically want to exercise the local load path, which does - /// not use remote storage. - pub async fn try_load_local(&self, ctx: &RequestContext) -> anyhow::Result> { - self.do_try_load(ctx, LoadMode::Local).await - } + fn remote_empty(&self) -> bool { + let tenant_path = self.conf.tenant_path(&self.tenant_shard_id); + let remote_tenant_dir = self + .remote_fs_dir + .join(tenant_path.strip_prefix(&self.conf.workdir).unwrap()); + if std::fs::metadata(&remote_tenant_dir).is_err() { + return true; + } - /// The 'load' in this function is either a local load or a normal attachment, - pub async fn try_load(&self, ctx: &RequestContext) -> anyhow::Result> { - // If we have nothing in remote storage, must use load_local instead of attach: attach - // will error out if there are no timelines. - // - // See https://github.com/neondatabase/neon/issues/5456 for how we will eliminate - // this weird state of a Tenant which exists but doesn't have any timelines. 
- let mode = match self.remote_empty() { - true => LoadMode::Local, - false => LoadMode::Remote, - }; - - self.do_try_load(ctx, mode).await + match std::fs::read_dir(remote_tenant_dir) + .unwrap() + .flatten() + .next() + { + Some(entry) => { + tracing::debug!( + "remote_empty: not empty, found file {}", + entry.file_name().to_string_lossy(), + ); + false + } + None => true, + } } pub fn timeline_path(&self, timeline_id: &TimelineId) -> Utf8PathBuf { diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index 7c35914b61..0e192b577c 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -6,7 +6,7 @@ use pageserver_api::{models::TenantState, shard::TenantShardId}; use remote_storage::{GenericRemoteStorage, RemotePath}; use tokio::sync::OwnedMutexGuard; use tokio_util::sync::CancellationToken; -use tracing::{error, instrument, Instrument, Span}; +use tracing::{error, instrument, Instrument}; use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId}; @@ -496,11 +496,7 @@ impl DeleteTenantFlow { }; Ok(()) } - .instrument({ - let span = tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()); - span.follows_from(Span::current()); - span - }), + .instrument(tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())), ); } diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 88d7ce61dd..dc499197b0 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -6,7 +6,7 @@ use std::{ use anyhow::Context; use pageserver_api::{models::TimelineState, shard::TenantShardId}; use tokio::sync::OwnedMutexGuard; -use tracing::{debug, error, info, instrument, warn, Instrument, Span}; +use tracing::{debug, error, info, instrument, warn, Instrument}; use utils::{crashsafe, fs_ext, id::TimelineId}; use crate::{ @@ -541,12 +541,7 @@ impl DeleteTimelineFlow { }; Ok(()) } - .instrument({ - let span = - tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),timeline_id=%timeline_id); - span.follows_from(Span::current()); - span - }), + .instrument(tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),timeline_id=%timeline_id)), ); } From 020e607637fe00ec869fd6eb71dfa732ae501b37 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Mon, 12 Feb 2024 14:04:46 +0100 Subject: [PATCH 0155/1571] Proxy: copy bidirectional fork (#6720) ## Problem `tokio::io::copy_bidirectional` doesn't close the connection once one of the sides closes it. It's not really suitable for the postgres protocol. ## Summary of changes Fork `copy_bidirectional` and initiate a shutdown for both connections. 
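As a hedged sketch of the call-site behavior (the actual wiring is the small change in proxy/src/proxy/passthrough.rs; this wrapper function and its stream names are made up for illustration, and `copy_bidirectional` refers to the function added below):

    async fn run_session<A, B>(mut client: A, mut compute: B) -> std::io::Result<(u64, u64)>
    where
        A: tokio::io::AsyncRead + tokio::io::AsyncWrite + Unpin,
        B: tokio::io::AsyncRead + tokio::io::AsyncWrite + Unpin,
    {
        // Unlike tokio::io::copy_bidirectional, hitting EOF on either direction
        // shuts down both writers, so a half-closed postgres session ends
        // promptly instead of lingering until the second peer also hangs up.
        copy_bidirectional(&mut client, &mut compute).await
    }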
--------- Co-authored-by: Conrad Ludgate --- proxy/src/proxy.rs | 1 + proxy/src/proxy/copy_bidirectional.rs | 256 ++++++++++++++++++++++++++ proxy/src/proxy/passthrough.rs | 2 +- 3 files changed, 258 insertions(+), 1 deletion(-) create mode 100644 proxy/src/proxy/copy_bidirectional.rs diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 50e22ec72a..77aadb6f28 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -2,6 +2,7 @@ mod tests; pub mod connect_compute; +mod copy_bidirectional; pub mod handshake; pub mod passthrough; pub mod retry; diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs new file mode 100644 index 0000000000..2ecc1151da --- /dev/null +++ b/proxy/src/proxy/copy_bidirectional.rs @@ -0,0 +1,256 @@ +use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; + +use std::future::poll_fn; +use std::io; +use std::pin::Pin; +use std::task::{ready, Context, Poll}; + +#[derive(Debug)] +enum TransferState { + Running(CopyBuffer), + ShuttingDown(u64), + Done(u64), +} + +fn transfer_one_direction( + cx: &mut Context<'_>, + state: &mut TransferState, + r: &mut A, + w: &mut B, +) -> Poll> +where + A: AsyncRead + AsyncWrite + Unpin + ?Sized, + B: AsyncRead + AsyncWrite + Unpin + ?Sized, +{ + let mut r = Pin::new(r); + let mut w = Pin::new(w); + loop { + match state { + TransferState::Running(buf) => { + let count = ready!(buf.poll_copy(cx, r.as_mut(), w.as_mut()))?; + *state = TransferState::ShuttingDown(count); + } + TransferState::ShuttingDown(count) => { + ready!(w.as_mut().poll_shutdown(cx))?; + *state = TransferState::Done(*count); + } + TransferState::Done(count) => return Poll::Ready(Ok(*count)), + } + } +} + +pub(super) async fn copy_bidirectional( + a: &mut A, + b: &mut B, +) -> Result<(u64, u64), std::io::Error> +where + A: AsyncRead + AsyncWrite + Unpin + ?Sized, + B: AsyncRead + AsyncWrite + Unpin + ?Sized, +{ + let mut a_to_b = TransferState::Running(CopyBuffer::new()); + let mut b_to_a = TransferState::Running(CopyBuffer::new()); + + poll_fn(|cx| { + let mut a_to_b_result = transfer_one_direction(cx, &mut a_to_b, a, b)?; + let mut b_to_a_result = transfer_one_direction(cx, &mut b_to_a, b, a)?; + + // Early termination checks + if let TransferState::Done(_) = a_to_b { + if let TransferState::Running(buf) = &b_to_a { + // Initiate shutdown + b_to_a = TransferState::ShuttingDown(buf.amt); + b_to_a_result = transfer_one_direction(cx, &mut b_to_a, b, a)?; + } + } + if let TransferState::Done(_) = b_to_a { + if let TransferState::Running(buf) = &a_to_b { + // Initiate shutdown + a_to_b = TransferState::ShuttingDown(buf.amt); + a_to_b_result = transfer_one_direction(cx, &mut a_to_b, a, b)?; + } + } + + // It is not a problem if ready! returns early ... 
(comment remains the same) + let a_to_b = ready!(a_to_b_result); + let b_to_a = ready!(b_to_a_result); + + Poll::Ready(Ok((a_to_b, b_to_a))) + }) + .await +} + +#[derive(Debug)] +pub(super) struct CopyBuffer { + read_done: bool, + need_flush: bool, + pos: usize, + cap: usize, + amt: u64, + buf: Box<[u8]>, +} +const DEFAULT_BUF_SIZE: usize = 8 * 1024; + +impl CopyBuffer { + pub(super) fn new() -> Self { + Self { + read_done: false, + need_flush: false, + pos: 0, + cap: 0, + amt: 0, + buf: vec![0; DEFAULT_BUF_SIZE].into_boxed_slice(), + } + } + + fn poll_fill_buf( + &mut self, + cx: &mut Context<'_>, + reader: Pin<&mut R>, + ) -> Poll> + where + R: AsyncRead + ?Sized, + { + let me = &mut *self; + let mut buf = ReadBuf::new(&mut me.buf); + buf.set_filled(me.cap); + + let res = reader.poll_read(cx, &mut buf); + if let Poll::Ready(Ok(())) = res { + let filled_len = buf.filled().len(); + me.read_done = me.cap == filled_len; + me.cap = filled_len; + } + res + } + + fn poll_write_buf( + &mut self, + cx: &mut Context<'_>, + mut reader: Pin<&mut R>, + mut writer: Pin<&mut W>, + ) -> Poll> + where + R: AsyncRead + ?Sized, + W: AsyncWrite + ?Sized, + { + let me = &mut *self; + match writer.as_mut().poll_write(cx, &me.buf[me.pos..me.cap]) { + Poll::Pending => { + // Top up the buffer towards full if we can read a bit more + // data - this should improve the chances of a large write + if !me.read_done && me.cap < me.buf.len() { + ready!(me.poll_fill_buf(cx, reader.as_mut()))?; + } + Poll::Pending + } + res => res, + } + } + + pub(super) fn poll_copy( + &mut self, + cx: &mut Context<'_>, + mut reader: Pin<&mut R>, + mut writer: Pin<&mut W>, + ) -> Poll> + where + R: AsyncRead + ?Sized, + W: AsyncWrite + ?Sized, + { + loop { + // If our buffer is empty, then we need to read some data to + // continue. + if self.pos == self.cap && !self.read_done { + self.pos = 0; + self.cap = 0; + + match self.poll_fill_buf(cx, reader.as_mut()) { + Poll::Ready(Ok(())) => (), + Poll::Ready(Err(err)) => return Poll::Ready(Err(err)), + Poll::Pending => { + // Try flushing when the reader has no progress to avoid deadlock + // when the reader depends on buffered writer. + if self.need_flush { + ready!(writer.as_mut().poll_flush(cx))?; + self.need_flush = false; + } + + return Poll::Pending; + } + } + } + + // If our buffer has some data, let's write it out! + while self.pos < self.cap { + let i = ready!(self.poll_write_buf(cx, reader.as_mut(), writer.as_mut()))?; + if i == 0 { + return Poll::Ready(Err(io::Error::new( + io::ErrorKind::WriteZero, + "write zero byte into writer", + ))); + } else { + self.pos += i; + self.amt += i as u64; + self.need_flush = true; + } + } + + // If pos larger than cap, this loop will never stop. + // In particular, user's wrong poll_write implementation returning + // incorrect written length may lead to thread blocking. + debug_assert!( + self.pos <= self.cap, + "writer returned length larger than input slice" + ); + + // If we've written all the data and we've seen EOF, flush out the + // data and finish the transfer. 
+ if self.pos == self.cap && self.read_done { + ready!(writer.as_mut().poll_flush(cx))?; + return Poll::Ready(Ok(self.amt)); + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use tokio::io::AsyncWriteExt; + + #[tokio::test] + async fn test_early_termination_a_to_d() { + let (mut a_mock, mut b_mock) = tokio::io::duplex(8); // Create a mock duplex stream + let (mut c_mock, mut d_mock) = tokio::io::duplex(32); // Create a mock duplex stream + + // Simulate 'a' finishing while there's still data for 'b' + a_mock.write_all(b"hello").await.unwrap(); + a_mock.shutdown().await.unwrap(); + d_mock.write_all(b"Neon Serverless Postgres").await.unwrap(); + + let result = copy_bidirectional(&mut b_mock, &mut c_mock).await.unwrap(); + + // Assert correct transferred amounts + let (a_to_d_count, d_to_a_count) = result; + assert_eq!(a_to_d_count, 5); // 'hello' was transferred + assert!(d_to_a_count <= 8); // response only partially transferred or not at all + } + + #[tokio::test] + async fn test_early_termination_d_to_a() { + let (mut a_mock, mut b_mock) = tokio::io::duplex(32); // Create a mock duplex stream + let (mut c_mock, mut d_mock) = tokio::io::duplex(8); // Create a mock duplex stream + + // Simulate 'a' finishing while there's still data for 'b' + d_mock.write_all(b"hello").await.unwrap(); + d_mock.shutdown().await.unwrap(); + a_mock.write_all(b"Neon Serverless Postgres").await.unwrap(); + + let result = copy_bidirectional(&mut b_mock, &mut c_mock).await.unwrap(); + + // Assert correct transferred amounts + let (a_to_d_count, d_to_a_count) = result; + assert_eq!(d_to_a_count, 5); // 'hello' was transferred + assert!(a_to_d_count <= 8); // response only partially transferred or not at all + } +} diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index b7018c6fb5..c98f68d8d1 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -45,7 +45,7 @@ pub async fn proxy_pass( // Starting from here we only proxy the client's traffic. info!("performing the proxy pass..."); - let _ = tokio::io::copy_bidirectional(&mut client, &mut compute).await?; + let _ = crate::proxy::copy_bidirectional::copy_bidirectional(&mut client, &mut compute).await?; Ok(()) } From 98ec5c5c466158fcb10394303077132efa680690 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 12 Feb 2024 13:14:06 +0000 Subject: [PATCH 0156/1571] proxy: some more parquet data (#6711) ## Summary of changes add auth_method and database to the parquet logs --- proxy/src/auth/backend.rs | 8 ++-- proxy/src/auth/backend/classic.rs | 8 ++-- proxy/src/auth/backend/hacks.rs | 12 +++-- proxy/src/auth/backend/link.rs | 2 + proxy/src/auth/credentials.rs | 3 ++ proxy/src/auth/flow.rs | 17 ++++++- proxy/src/context.rs | 23 ++++++++- proxy/src/context/parquet.rs | 69 ++++++++++++++++----------- proxy/src/proxy/tests.rs | 2 +- proxy/src/serverless/sql_over_http.rs | 9 +++- 10 files changed, 104 insertions(+), 49 deletions(-) diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index fa2782bee3..c9f21f1cf5 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -194,8 +194,7 @@ async fn auth_quirks( // We now expect to see a very specific payload in the place of password. 
let (info, unauthenticated_password) = match user_info.try_into() { Err(info) => { - let res = hacks::password_hack_no_authentication(info, client, &mut ctx.latency_timer) - .await?; + let res = hacks::password_hack_no_authentication(ctx, info, client).await?; ctx.set_endpoint_id(res.info.endpoint.clone()); tracing::Span::current().record("ep", &tracing::field::display(&res.info.endpoint)); @@ -276,11 +275,12 @@ async fn authenticate_with_secret( // Perform cleartext auth if we're allowed to do that. // Currently, we use it for websocket connections (latency). if allow_cleartext { - return hacks::authenticate_cleartext(info, client, &mut ctx.latency_timer, secret).await; + ctx.set_auth_method(crate::context::AuthMethod::Cleartext); + return hacks::authenticate_cleartext(ctx, info, client, secret).await; } // Finally, proceed with the main auth flow (SCRAM-based). - classic::authenticate(info, client, config, &mut ctx.latency_timer, secret).await + classic::authenticate(ctx, info, client, config, secret).await } impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> { diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index 745dd75107..e855843bc3 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -4,7 +4,7 @@ use crate::{ compute, config::AuthenticationConfig, console::AuthSecret, - metrics::LatencyTimer, + context::RequestMonitoring, sasl, stream::{PqStream, Stream}, }; @@ -12,10 +12,10 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, warn}; pub(super) async fn authenticate( + ctx: &mut RequestMonitoring, creds: ComputeUserInfo, client: &mut PqStream>, config: &'static AuthenticationConfig, - latency_timer: &mut LatencyTimer, secret: AuthSecret, ) -> auth::Result> { let flow = AuthFlow::new(client); @@ -27,13 +27,11 @@ pub(super) async fn authenticate( } AuthSecret::Scram(secret) => { info!("auth endpoint chooses SCRAM"); - let scram = auth::Scram(&secret); + let scram = auth::Scram(&secret, &mut *ctx); let auth_outcome = tokio::time::timeout( config.scram_protocol_timeout, async { - // pause the timer while we communicate with the client - let _paused = latency_timer.pause(); flow.begin(scram).await.map_err(|error| { warn!(?error, "error sending scram acknowledgement"); diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs index b6c1a92d3c..9f60b709d4 100644 --- a/proxy/src/auth/backend/hacks.rs +++ b/proxy/src/auth/backend/hacks.rs @@ -4,7 +4,7 @@ use super::{ use crate::{ auth::{self, AuthFlow}, console::AuthSecret, - metrics::LatencyTimer, + context::RequestMonitoring, sasl, stream::{self, Stream}, }; @@ -16,15 +16,16 @@ use tracing::{info, warn}; /// These properties are benefical for serverless JS workers, so we /// use this mechanism for websocket connections. 
pub async fn authenticate_cleartext( + ctx: &mut RequestMonitoring, info: ComputeUserInfo, client: &mut stream::PqStream>, - latency_timer: &mut LatencyTimer, secret: AuthSecret, ) -> auth::Result> { warn!("cleartext auth flow override is enabled, proceeding"); + ctx.set_auth_method(crate::context::AuthMethod::Cleartext); // pause the timer while we communicate with the client - let _paused = latency_timer.pause(); + let _paused = ctx.latency_timer.pause(); let auth_outcome = AuthFlow::new(client) .begin(auth::CleartextPassword(secret)) @@ -47,14 +48,15 @@ pub async fn authenticate_cleartext( /// Similar to [`authenticate_cleartext`], but there's a specific password format, /// and passwords are not yet validated (we don't know how to validate them!) pub async fn password_hack_no_authentication( + ctx: &mut RequestMonitoring, info: ComputeUserInfoNoEndpoint, client: &mut stream::PqStream>, - latency_timer: &mut LatencyTimer, ) -> auth::Result>> { warn!("project not specified, resorting to the password hack auth flow"); + ctx.set_auth_method(crate::context::AuthMethod::Cleartext); // pause the timer while we communicate with the client - let _paused = latency_timer.pause(); + let _paused = ctx.latency_timer.pause(); let payload = AuthFlow::new(client) .begin(auth::PasswordHack) diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index c71637dd1a..bf9ebf4c18 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -61,6 +61,8 @@ pub(super) async fn authenticate( link_uri: &reqwest::Url, client: &mut PqStream, ) -> auth::Result { + ctx.set_auth_method(crate::context::AuthMethod::Web); + // registering waiter can fail if we get unlucky with rng. // just try again. let (psql_session_id, waiter) = loop { diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index d32609e44c..d318b3be54 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -99,6 +99,9 @@ impl ComputeUserInfoMaybeEndpoint { // record the values if we have them ctx.set_application(params.get("application_name").map(SmolStr::from)); ctx.set_user(user.clone()); + if let Some(dbname) = params.get("database") { + ctx.set_dbname(dbname.into()); + } // Project name might be passed via PG's command-line options. let endpoint_option = params diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index c2783e236c..dce73138c6 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -4,9 +4,11 @@ use super::{backend::ComputeCredentialKeys, AuthErrorImpl, PasswordHackPayload}; use crate::{ config::TlsServerEndPoint, console::AuthSecret, + context::RequestMonitoring, sasl, scram, stream::{PqStream, Stream}, }; +use postgres_protocol::authentication::sasl::{SCRAM_SHA_256, SCRAM_SHA_256_PLUS}; use pq_proto::{BeAuthenticationSaslMessage, BeMessage, BeMessage as Be}; use std::io; use tokio::io::{AsyncRead, AsyncWrite}; @@ -23,7 +25,7 @@ pub trait AuthMethod { pub struct Begin; /// Use [SCRAM](crate::scram)-based auth in [`AuthFlow`]. -pub struct Scram<'a>(pub &'a scram::ServerSecret); +pub struct Scram<'a>(pub &'a scram::ServerSecret, pub &'a mut RequestMonitoring); impl AuthMethod for Scram<'_> { #[inline(always)] @@ -138,6 +140,11 @@ impl AuthFlow<'_, S, CleartextPassword> { impl AuthFlow<'_, S, Scram<'_>> { /// Perform user authentication. Raise an error in case authentication failed. 
pub async fn authenticate(self) -> super::Result> { + let Scram(secret, ctx) = self.state; + + // pause the timer while we communicate with the client + let _paused = ctx.latency_timer.pause(); + // Initial client message contains the chosen auth method's name. let msg = self.stream.read_password_message().await?; let sasl = sasl::FirstMessage::parse(&msg) @@ -148,9 +155,15 @@ impl AuthFlow<'_, S, Scram<'_>> { return Err(super::AuthError::bad_auth_method(sasl.method)); } + match sasl.method { + SCRAM_SHA_256 => ctx.auth_method = Some(crate::context::AuthMethod::ScramSha256), + SCRAM_SHA_256_PLUS => { + ctx.auth_method = Some(crate::context::AuthMethod::ScramSha256Plus) + } + _ => {} + } info!("client chooses {}", sasl.method); - let secret = self.state.0; let outcome = sasl::SaslStream::new(self.stream, sasl.message) .authenticate(scram::Exchange::new( secret, diff --git a/proxy/src/context.rs b/proxy/src/context.rs index d2bf3f68d3..0cea53ae63 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -11,7 +11,7 @@ use crate::{ console::messages::MetricsAuxInfo, error::ErrorKind, metrics::{LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND}, - BranchId, EndpointId, ProjectId, RoleName, + BranchId, DbName, EndpointId, ProjectId, RoleName, }; pub mod parquet; @@ -34,9 +34,11 @@ pub struct RequestMonitoring { project: Option, branch: Option, endpoint_id: Option, + dbname: Option, user: Option, application: Option, error_kind: Option, + pub(crate) auth_method: Option, success: bool, // extra @@ -45,6 +47,15 @@ pub struct RequestMonitoring { pub latency_timer: LatencyTimer, } +#[derive(Clone, Debug)] +pub enum AuthMethod { + // aka link aka passwordless + Web, + ScramSha256, + ScramSha256Plus, + Cleartext, +} + impl RequestMonitoring { pub fn new( session_id: Uuid, @@ -62,9 +73,11 @@ impl RequestMonitoring { project: None, branch: None, endpoint_id: None, + dbname: None, user: None, application: None, error_kind: None, + auth_method: None, success: false, sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()), @@ -106,10 +119,18 @@ impl RequestMonitoring { self.application = app.or_else(|| self.application.clone()); } + pub fn set_dbname(&mut self, dbname: DbName) { + self.dbname = Some(dbname); + } + pub fn set_user(&mut self, user: RoleName) { self.user = Some(user); } + pub fn set_auth_method(&mut self, auth_method: AuthMethod) { + self.auth_method = Some(auth_method); + } + pub fn set_error_kind(&mut self, kind: ErrorKind) { ERROR_BY_KIND .with_label_values(&[kind.to_metric_label()]) diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 0fe46915bc..ad22829183 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -84,8 +84,10 @@ struct RequestData { username: Option, application_name: Option, endpoint_id: Option, + database: Option, project: Option, branch: Option, + auth_method: Option<&'static str>, error: Option<&'static str>, /// Success is counted if we form a HTTP response with sql rows inside /// Or if we make it to proxy_pass @@ -104,8 +106,15 @@ impl From for RequestData { username: value.user.as_deref().map(String::from), application_name: value.application.as_deref().map(String::from), endpoint_id: value.endpoint_id.as_deref().map(String::from), + database: value.dbname.as_deref().map(String::from), project: value.project.as_deref().map(String::from), branch: value.branch.as_deref().map(String::from), + auth_method: value.auth_method.as_ref().map(|x| match x { + super::AuthMethod::Web => "web", + 
super::AuthMethod::ScramSha256 => "scram_sha_256", + super::AuthMethod::ScramSha256Plus => "scram_sha_256_plus", + super::AuthMethod::Cleartext => "cleartext", + }), protocol: value.protocol, region: value.region, error: value.error_kind.as_ref().map(|e| e.to_metric_label()), @@ -431,8 +440,10 @@ mod tests { application_name: Some("test".to_owned()), username: Some(hex::encode(rng.gen::<[u8; 4]>())), endpoint_id: Some(hex::encode(rng.gen::<[u8; 16]>())), + database: Some(hex::encode(rng.gen::<[u8; 16]>())), project: Some(hex::encode(rng.gen::<[u8; 16]>())), branch: Some(hex::encode(rng.gen::<[u8; 16]>())), + auth_method: None, protocol: ["tcp", "ws", "http"][rng.gen_range(0..3)], region: "us-east-1", error: None, @@ -505,15 +516,15 @@ mod tests { assert_eq!( file_stats, [ - (1087635, 3, 6000), - (1087288, 3, 6000), - (1087444, 3, 6000), - (1087572, 3, 6000), - (1087468, 3, 6000), - (1087500, 3, 6000), - (1087533, 3, 6000), - (1087566, 3, 6000), - (362671, 1, 2000) + (1313727, 3, 6000), + (1313720, 3, 6000), + (1313780, 3, 6000), + (1313737, 3, 6000), + (1313867, 3, 6000), + (1313709, 3, 6000), + (1313501, 3, 6000), + (1313737, 3, 6000), + (438118, 1, 2000) ], ); @@ -543,11 +554,11 @@ mod tests { assert_eq!( file_stats, [ - (1028637, 5, 10000), - (1031969, 5, 10000), - (1019900, 5, 10000), - (1020365, 5, 10000), - (1025010, 5, 10000) + (1219459, 5, 10000), + (1225609, 5, 10000), + (1227403, 5, 10000), + (1226765, 5, 10000), + (1218043, 5, 10000) ], ); @@ -579,11 +590,11 @@ mod tests { assert_eq!( file_stats, [ - (1210770, 6, 12000), - (1211036, 6, 12000), - (1210990, 6, 12000), - (1210861, 6, 12000), - (202073, 1, 2000) + (1205106, 5, 10000), + (1204837, 5, 10000), + (1205130, 5, 10000), + (1205118, 5, 10000), + (1205373, 5, 10000) ], ); @@ -608,15 +619,15 @@ mod tests { assert_eq!( file_stats, [ - (1087635, 3, 6000), - (1087288, 3, 6000), - (1087444, 3, 6000), - (1087572, 3, 6000), - (1087468, 3, 6000), - (1087500, 3, 6000), - (1087533, 3, 6000), - (1087566, 3, 6000), - (362671, 1, 2000) + (1313727, 3, 6000), + (1313720, 3, 6000), + (1313780, 3, 6000), + (1313737, 3, 6000), + (1313867, 3, 6000), + (1313709, 3, 6000), + (1313501, 3, 6000), + (1313737, 3, 6000), + (438118, 1, 2000) ], ); @@ -653,7 +664,7 @@ mod tests { // files are smaller than the size threshold, but they took too long to fill so were flushed early assert_eq!( file_stats, - [(545264, 2, 3001), (545025, 2, 3000), (544857, 2, 2999)], + [(658383, 2, 3001), (658097, 2, 3000), (657893, 2, 2999)], ); tmpdir.close().unwrap(); diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 3e961afb41..5bb43c0375 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -144,7 +144,7 @@ impl TestAuth for Scram { stream: &mut PqStream>, ) -> anyhow::Result<()> { let outcome = auth::AuthFlow::new(stream) - .begin(auth::Scram(&self.0)) + .begin(auth::Scram(&self.0, &mut RequestMonitoring::test())) .await? 
.authenticate() .await?; diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 54424360c4..e9f868d51e 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -36,6 +36,7 @@ use crate::error::ReportableError; use crate::metrics::HTTP_CONTENT_LENGTH; use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE; use crate::proxy::NeonOptions; +use crate::DbName; use crate::RoleName; use super::backend::PoolingBackend; @@ -117,6 +118,9 @@ fn get_conn_info( headers: &HeaderMap, tls: &TlsConfig, ) -> Result { + // HTTP only uses cleartext (for now and likely always) + ctx.set_auth_method(crate::context::AuthMethod::Cleartext); + let connection_string = headers .get("Neon-Connection-String") .ok_or(ConnInfoError::InvalidHeader("Neon-Connection-String"))? @@ -134,7 +138,8 @@ fn get_conn_info( .path_segments() .ok_or(ConnInfoError::MissingDbName)?; - let dbname = url_path.next().ok_or(ConnInfoError::InvalidDbName)?; + let dbname: DbName = url_path.next().ok_or(ConnInfoError::InvalidDbName)?.into(); + ctx.set_dbname(dbname.clone()); let username = RoleName::from(urlencoding::decode(connection_url.username())?); if username.is_empty() { @@ -174,7 +179,7 @@ fn get_conn_info( Ok(ConnInfo { user_info, - dbname: dbname.into(), + dbname, password: match password { std::borrow::Cow::Borrowed(b) => b.into(), std::borrow::Cow::Owned(b) => b.into(), From 242dd8398c8d6728270c8d8c2a0b45dae480cb97 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 12 Feb 2024 15:58:55 +0100 Subject: [PATCH 0157/1571] refactor(blob_io): use owned buffers (#6660) This PR refactors the `blob_io` code away from using slices towards taking owned buffers and return them after use. Using owned buffers will eventually allow us to use io_uring for writes. part of https://github.com/neondatabase/neon/issues/6663 Depends on https://github.com/neondatabase/tokio-epoll-uring/pull/43 The high level scheme is as follows: - call writing functions with the `BoundedBuf` - return the underlying `BoundedBuf::Buf` for potential reuse in the caller NB: Invoking `BoundedBuf::slice(..)` will return a slice that _includes the uninitialized portion of `BoundedBuf`_. I.e., the portion between `bytes_init()` and `bytes_total()`. It's a safe API that actually permits access to uninitialized memory. Not great. Another wrinkle is that it panics if the range has length 0. However, I don't want to switch away from the `BoundedBuf` API, since it's what tokio-uring uses. We can always weed this out later by replacing `BoundedBuf` with our own type. 
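To make that calling convention concrete, here is a deliberately simplified sketch (hypothetical `OwnedWriter` type, plain `Vec<u8>` instead of the generic `BoundedBuf` bound used in the actual code): the writer takes the buffer by value and always hands the allocation back alongside the `Result`, so the caller can thread one allocation through repeated writes.

```rust
// Simplified sketch of the owned-buffer round-trip; not the real pageserver code.
struct OwnedWriter {
    /* wraps a VirtualFile in the real code */
}

impl OwnedWriter {
    async fn write_all(&mut self, buf: Vec<u8>) -> (Vec<u8>, std::io::Result<()>) {
        // Submit `buf` to the underlying file here; even on error the buffer
        // is handed back so the caller decides whether to reuse or drop it.
        (buf, Ok(()))
    }
}

async fn write_two_blobs(w: &mut OwnedWriter) -> std::io::Result<()> {
    let mut buf = Vec::with_capacity(8192);
    for chunk in [b"foo".as_slice(), b"bar".as_slice()] {
        buf.clear();
        buf.extend_from_slice(chunk);
        let (returned, res) = w.write_all(buf).await;
        res?;           // errors still propagate normally...
        buf = returned; // ...but the allocation survives for the next write
    }
    Ok(())
}
```

In the real code the writer is generic over `BoundedBuf` rather than taking `Vec<u8>`, which is what should let the same call sites work with io_uring later, since the buffer has to stay owned and stable for the duration of a submitted write.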
Created an issue so we don't forget: https://github.com/neondatabase/tokio-epoll-uring/issues/46 --- Cargo.lock | 5 +- pageserver/src/tenant/blob_io.rs | 121 +++++++++++++----- .../src/tenant/storage_layer/delta_layer.rs | 26 ++-- .../src/tenant/storage_layer/image_layer.rs | 8 +- .../tenant/storage_layer/inmemory_layer.rs | 8 +- pageserver/src/tenant/timeline.rs | 2 +- 6 files changed, 115 insertions(+), 55 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 83afdaf66f..520163e41b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5740,7 +5740,7 @@ dependencies = [ [[package]] name = "tokio-epoll-uring" version = "0.1.0" -source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#d6a1c93442fb6b3a5bec490204961134e54925dc" +source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#868d2c42b5d54ca82fead6e8f2f233b69a540d3e" dependencies = [ "futures", "nix 0.26.4", @@ -6265,8 +6265,9 @@ dependencies = [ [[package]] name = "uring-common" version = "0.1.0" -source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#d6a1c93442fb6b3a5bec490204961134e54925dc" +source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#868d2c42b5d54ca82fead6e8f2f233b69a540d3e" dependencies = [ + "bytes", "io-uring", "libc", ] diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 6de2e95055..e2ff12665a 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -11,6 +11,9 @@ //! len < 128: 0XXXXXXX //! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX //! +use bytes::{BufMut, BytesMut}; +use tokio_epoll_uring::{BoundedBuf, Slice}; + use crate::context::RequestContext; use crate::page_cache::PAGE_SZ; use crate::tenant::block_io::BlockCursor; @@ -100,6 +103,8 @@ pub struct BlobWriter { offset: u64, /// A buffer to save on write calls, only used if BUFFERED=true buf: Vec, + /// We do tiny writes for the length headers; they need to be in an owned buffer; + io_buf: Option, } impl BlobWriter { @@ -108,6 +113,7 @@ impl BlobWriter { inner, offset: start_offset, buf: Vec::with_capacity(Self::CAPACITY), + io_buf: Some(BytesMut::new()), } } @@ -117,14 +123,28 @@ impl BlobWriter { const CAPACITY: usize = if BUFFERED { PAGE_SZ } else { 0 }; - #[inline(always)] /// Writes the given buffer directly to the underlying `VirtualFile`. /// You need to make sure that the internal buffer is empty, otherwise /// data will be written in wrong order. 
- async fn write_all_unbuffered(&mut self, src_buf: &[u8]) -> Result<(), Error> { - self.inner.write_all(src_buf).await?; - self.offset += src_buf.len() as u64; - Ok(()) + #[inline(always)] + async fn write_all_unbuffered( + &mut self, + src_buf: B, + ) -> (B::Buf, Result<(), Error>) { + let src_buf_len = src_buf.bytes_init(); + let (src_buf, res) = if src_buf_len > 0 { + let src_buf = src_buf.slice(0..src_buf_len); + let res = self.inner.write_all(&src_buf).await; + let src_buf = Slice::into_inner(src_buf); + (src_buf, res) + } else { + let res = self.inner.write_all(&[]).await; + (Slice::into_inner(src_buf.slice_full()), res) + }; + if let Ok(()) = &res { + self.offset += src_buf_len as u64; + } + (src_buf, res) } #[inline(always)] @@ -146,62 +166,91 @@ impl BlobWriter { } /// Internal, possibly buffered, write function - async fn write_all(&mut self, mut src_buf: &[u8]) -> Result<(), Error> { + async fn write_all(&mut self, src_buf: B) -> (B::Buf, Result<(), Error>) { if !BUFFERED { assert!(self.buf.is_empty()); - self.write_all_unbuffered(src_buf).await?; - return Ok(()); + return self.write_all_unbuffered(src_buf).await; } let remaining = Self::CAPACITY - self.buf.len(); + let src_buf_len = src_buf.bytes_init(); + if src_buf_len == 0 { + return (Slice::into_inner(src_buf.slice_full()), Ok(())); + } + let mut src_buf = src_buf.slice(0..src_buf_len); // First try to copy as much as we can into the buffer if remaining > 0 { - let copied = self.write_into_buffer(src_buf); - src_buf = &src_buf[copied..]; + let copied = self.write_into_buffer(&src_buf); + src_buf = src_buf.slice(copied..); } // Then, if the buffer is full, flush it out if self.buf.len() == Self::CAPACITY { - self.flush_buffer().await?; + if let Err(e) = self.flush_buffer().await { + return (Slice::into_inner(src_buf), Err(e)); + } } // Finally, write the tail of src_buf: // If it wholly fits into the buffer without // completely filling it, then put it there. // If not, write it out directly. - if !src_buf.is_empty() { + let src_buf = if !src_buf.is_empty() { assert_eq!(self.buf.len(), 0); if src_buf.len() < Self::CAPACITY { - let copied = self.write_into_buffer(src_buf); + let copied = self.write_into_buffer(&src_buf); // We just verified above that src_buf fits into our internal buffer. assert_eq!(copied, src_buf.len()); + Slice::into_inner(src_buf) } else { - self.write_all_unbuffered(src_buf).await?; + let (src_buf, res) = self.write_all_unbuffered(src_buf).await; + if let Err(e) = res { + return (src_buf, Err(e)); + } + src_buf } - } - Ok(()) + } else { + Slice::into_inner(src_buf) + }; + (src_buf, Ok(())) } /// Write a blob of data. Returns the offset that it was written to, /// which can be used to retrieve the data later. - pub async fn write_blob(&mut self, srcbuf: &[u8]) -> Result { + pub async fn write_blob(&mut self, srcbuf: B) -> (B::Buf, Result) { let offset = self.offset; - if srcbuf.len() < 128 { - // Short blob. Write a 1-byte length header - let len_buf = srcbuf.len() as u8; - self.write_all(&[len_buf]).await?; - } else { - // Write a 4-byte length header - if srcbuf.len() > 0x7fff_ffff { - return Err(Error::new( - ErrorKind::Other, - format!("blob too large ({} bytes)", srcbuf.len()), - )); + let len = srcbuf.bytes_init(); + + let mut io_buf = self.io_buf.take().expect("we always put it back below"); + io_buf.clear(); + let (io_buf, hdr_res) = async { + if len < 128 { + // Short blob. 
Write a 1-byte length header + io_buf.put_u8(len as u8); + self.write_all(io_buf).await + } else { + // Write a 4-byte length header + if len > 0x7fff_ffff { + return ( + io_buf, + Err(Error::new( + ErrorKind::Other, + format!("blob too large ({} bytes)", len), + )), + ); + } + let mut len_buf = (len as u32).to_be_bytes(); + len_buf[0] |= 0x80; + io_buf.extend_from_slice(&len_buf[..]); + self.write_all(io_buf).await } - let mut len_buf = ((srcbuf.len()) as u32).to_be_bytes(); - len_buf[0] |= 0x80; - self.write_all(&len_buf).await?; } - self.write_all(srcbuf).await?; - Ok(offset) + .await; + self.io_buf = Some(io_buf); + match hdr_res { + Ok(_) => (), + Err(e) => return (Slice::into_inner(srcbuf.slice(..)), Err(e)), + } + let (srcbuf, res) = self.write_all(srcbuf).await; + (srcbuf, res.map(|_| offset)) } } @@ -248,12 +297,14 @@ mod tests { let file = VirtualFile::create(pathbuf.as_path()).await?; let mut wtr = BlobWriter::::new(file, 0); for blob in blobs.iter() { - let offs = wtr.write_blob(blob).await?; + let (_, res) = wtr.write_blob(blob.clone()).await; + let offs = res?; offsets.push(offs); } // Write out one page worth of zeros so that we can // read again with read_blk - let offs = wtr.write_blob(&vec![0; PAGE_SZ]).await?; + let (_, res) = wtr.write_blob(vec![0; PAGE_SZ]).await; + let offs = res?; println!("Writing final blob at offs={offs}"); wtr.flush_buffer().await?; } diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 2a51884c0b..7a5dc7a59f 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -416,27 +416,31 @@ impl DeltaLayerWriterInner { /// The values must be appended in key, lsn order. /// async fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> { - self.put_value_bytes(key, lsn, &Value::ser(&val)?, val.will_init()) - .await + let (_, res) = self + .put_value_bytes(key, lsn, Value::ser(&val)?, val.will_init()) + .await; + res } async fn put_value_bytes( &mut self, key: Key, lsn: Lsn, - val: &[u8], + val: Vec, will_init: bool, - ) -> anyhow::Result<()> { + ) -> (Vec, anyhow::Result<()>) { assert!(self.lsn_range.start <= lsn); - - let off = self.blob_writer.write_blob(val).await?; + let (val, res) = self.blob_writer.write_blob(val).await; + let off = match res { + Ok(off) => off, + Err(e) => return (val, Err(anyhow::anyhow!(e))), + }; let blob_ref = BlobRef::new(off, will_init); let delta_key = DeltaKey::from_key_lsn(&key, lsn); - self.tree.append(&delta_key.0, blob_ref.0)?; - - Ok(()) + let res = self.tree.append(&delta_key.0, blob_ref.0); + (val, res.map_err(|e| anyhow::anyhow!(e))) } fn size(&self) -> u64 { @@ -587,9 +591,9 @@ impl DeltaLayerWriter { &mut self, key: Key, lsn: Lsn, - val: &[u8], + val: Vec, will_init: bool, - ) -> anyhow::Result<()> { + ) -> (Vec, anyhow::Result<()>) { self.inner .as_mut() .unwrap() diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index c62e6aed51..1ad195032d 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -528,9 +528,11 @@ impl ImageLayerWriterInner { /// /// The page versions must be appended in blknum order. 
/// - async fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> { + async fn put_image(&mut self, key: Key, img: Bytes) -> anyhow::Result<()> { ensure!(self.key_range.contains(&key)); - let off = self.blob_writer.write_blob(img).await?; + let (_img, res) = self.blob_writer.write_blob(img).await; + // TODO: re-use the buffer for `img` further upstack + let off = res?; let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; key.write_to_byte_slice(&mut keybuf); @@ -659,7 +661,7 @@ impl ImageLayerWriter { /// /// The page versions must be appended in blknum order. /// - pub async fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> { + pub async fn put_image(&mut self, key: Key, img: Bytes) -> anyhow::Result<()> { self.inner.as_mut().unwrap().put_image(key, img).await } diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 7c9103eea8..c597b15533 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -383,9 +383,11 @@ impl InMemoryLayer { for (lsn, pos) in vec_map.as_slice() { cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?; let will_init = Value::des(&buf)?.will_init(); - delta_layer_writer - .put_value_bytes(key, *lsn, &buf, will_init) - .await?; + let res; + (buf, res) = delta_layer_writer + .put_value_bytes(key, *lsn, buf, will_init) + .await; + res?; } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f96679ca69..74676277d5 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3328,7 +3328,7 @@ impl Timeline { } }; - image_layer_writer.put_image(img_key, &img).await?; + image_layer_writer.put_image(img_key, img).await?; } } From 789a71c4ee6722f26ae4929a10e1316568e2006f Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 12 Feb 2024 15:03:45 +0000 Subject: [PATCH 0158/1571] proxy: add more http logging (#6726) ## Problem hard to see where time is taken during HTTP flow. ## Summary of changes add a lot more for query state. 
add a conn_id field to the sql-over-http span --- proxy/src/metrics.rs | 5 ++-- proxy/src/serverless/backend.rs | 8 +++---- proxy/src/serverless/conn_pool.rs | 22 +++++------------- proxy/src/serverless/sql_over_http.rs | 33 +++++++++++++++++++++++---- 4 files changed, 41 insertions(+), 27 deletions(-) diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index ccf89f9b05..f7f162a075 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -200,8 +200,9 @@ impl LatencyTimer { pub fn success(&mut self) { // stop the stopwatch and record the time that we have accumulated - let start = self.start.take().expect("latency timer should be started"); - self.accumulated += start.elapsed(); + if let Some(start) = self.start.take() { + self.accumulated += start.elapsed(); + } // success self.outcome = "success"; diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 8285da68d7..156002006d 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -1,7 +1,7 @@ use std::{sync::Arc, time::Duration}; use async_trait::async_trait; -use tracing::info; +use tracing::{field::display, info}; use crate::{ auth::{backend::ComputeCredentialKeys, check_peer_addr_is_in_list, AuthError}, @@ -15,7 +15,7 @@ use crate::{ proxy::connect_compute::ConnectMechanism, }; -use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool, APP_NAME}; +use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool}; pub struct PoolingBackend { pub pool: Arc>, @@ -81,8 +81,8 @@ impl PoolingBackend { return Ok(client); } let conn_id = uuid::Uuid::new_v4(); - info!(%conn_id, "pool: opening a new connection '{conn_info}'"); - ctx.set_application(Some(APP_NAME)); + tracing::Span::current().record("conn_id", display(conn_id)); + info!("pool: opening a new connection '{conn_info}'"); let backend = self .config .auth_backend diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index f4e5b145c5..53e7c1c2ee 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -4,7 +4,6 @@ use metrics::IntCounterPairGuard; use parking_lot::RwLock; use rand::Rng; use smallvec::SmallVec; -use smol_str::SmolStr; use std::{collections::HashMap, pin::pin, sync::Arc, sync::Weak, time::Duration}; use std::{ fmt, @@ -31,8 +30,6 @@ use tracing::{info, info_span, Instrument}; use super::backend::HttpConnError; -pub const APP_NAME: SmolStr = SmolStr::new_inline("/sql_over_http"); - #[derive(Debug, Clone)] pub struct ConnInfo { pub user_info: ComputeUserInfo, @@ -379,12 +376,13 @@ impl GlobalConnPool { info!("pool: cached connection '{conn_info}' is closed, opening a new one"); return Ok(None); } else { - info!("pool: reusing connection '{conn_info}'"); - client.session.send(ctx.session_id)?; + tracing::Span::current().record("conn_id", tracing::field::display(client.conn_id)); tracing::Span::current().record( "pid", &tracing::field::display(client.inner.get_process_id()), ); + info!("pool: reusing connection '{conn_info}'"); + client.session.send(ctx.session_id)?; ctx.latency_timer.pool_hit(); ctx.latency_timer.success(); return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool))); @@ -577,7 +575,6 @@ pub struct Client { } pub struct Discard<'a, C: ClientInnerExt> { - conn_id: uuid::Uuid, conn_info: &'a ConnInfo, pool: &'a mut Weak>>, } @@ -603,14 +600,7 @@ impl Client { span: _, } = self; let inner = inner.as_mut().expect("client inner should not be removed"); - ( - &mut inner.inner, - Discard { - pool, - 
conn_info, - conn_id: inner.conn_id, - }, - ) + (&mut inner.inner, Discard { pool, conn_info }) } pub fn check_idle(&mut self, status: ReadyForQueryStatus) { @@ -625,13 +615,13 @@ impl Discard<'_, C> { pub fn check_idle(&mut self, status: ReadyForQueryStatus) { let conn_info = &self.conn_info; if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 { - info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is not idle") + info!("pool: throwing away connection '{conn_info}' because connection is not idle") } } pub fn discard(&mut self) { let conn_info = &self.conn_info; if std::mem::take(self.pool).strong_count() > 0 { - info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is potentially in a broken state") + info!("pool: throwing away connection '{conn_info}' because connection is potentially in a broken state") } } } diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index e9f868d51e..ecb72abe73 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -36,6 +36,7 @@ use crate::error::ReportableError; use crate::metrics::HTTP_CONTENT_LENGTH; use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE; use crate::proxy::NeonOptions; +use crate::serverless::backend::HttpConnError; use crate::DbName; use crate::RoleName; @@ -305,7 +306,14 @@ pub async fn handle( Ok(response) } -#[instrument(name = "sql-over-http", fields(pid = tracing::field::Empty), skip_all)] +#[instrument( + name = "sql-over-http", + skip_all, + fields( + pid = tracing::field::Empty, + conn_id = tracing::field::Empty + ) +)] async fn handle_inner( config: &'static ProxyConfig, ctx: &mut RequestMonitoring, @@ -359,12 +367,10 @@ async fn handle_inner( let txn_read_only = headers.get(&TXN_READ_ONLY) == Some(&HEADER_VALUE_TRUE); let txn_deferrable = headers.get(&TXN_DEFERRABLE) == Some(&HEADER_VALUE_TRUE); - let paused = ctx.latency_timer.pause(); let request_content_length = match request.body().size_hint().upper() { Some(v) => v, None => MAX_REQUEST_SIZE + 1, }; - drop(paused); info!(request_content_length, "request size in bytes"); HTTP_CONTENT_LENGTH.observe(request_content_length as f64); @@ -380,15 +386,20 @@ async fn handle_inner( let body = hyper::body::to_bytes(request.into_body()) .await .map_err(anyhow::Error::from)?; + info!(length = body.len(), "request payload read"); let payload: Payload = serde_json::from_slice(&body)?; Ok::(payload) // Adjust error type accordingly }; let authenticate_and_connect = async { let keys = backend.authenticate(ctx, &conn_info).await?; - backend + let client = backend .connect_to_compute(ctx, conn_info, keys, !allow_pool) - .await + .await?; + // not strictly necessary to mark success here, + // but it's just insurance for if we forget it somewhere else + ctx.latency_timer.success(); + Ok::<_, HttpConnError>(client) }; // Run both operations in parallel @@ -420,6 +431,7 @@ async fn handle_inner( results } Payload::Batch(statements) => { + info!("starting transaction"); let (inner, mut discard) = client.inner(); let mut builder = inner.build_transaction(); if let Some(isolation_level) = txn_isolation_level { @@ -449,6 +461,7 @@ async fn handle_inner( .await { Ok(results) => { + info!("commit"); let status = transaction.commit().await.map_err(|e| { // if we cannot commit - for now don't return connection to pool // TODO: get a query status from the error @@ -459,6 +472,7 @@ async fn handle_inner( 
results } Err(err) => { + info!("rollback"); let status = transaction.rollback().await.map_err(|e| { // if we cannot rollback - for now don't return connection to pool // TODO: get a query status from the error @@ -533,8 +547,10 @@ async fn query_to_json( raw_output: bool, default_array_mode: bool, ) -> anyhow::Result<(ReadyForQueryStatus, Value)> { + info!("executing query"); let query_params = data.params; let row_stream = client.query_raw_txt(&data.query, query_params).await?; + info!("finished executing query"); // Manually drain the stream into a vector to leave row_stream hanging // around to get a command tag. Also check that the response is not too @@ -569,6 +585,13 @@ async fn query_to_json( } .and_then(|s| s.parse::().ok()); + info!( + rows = rows.len(), + ?ready, + command_tag, + "finished reading rows" + ); + let mut fields = vec![]; let mut columns = vec![]; From 7ea593db2292324e136d3325cd96217c9d652395 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 12 Feb 2024 17:13:35 +0200 Subject: [PATCH 0159/1571] refactor(LayerManager): resident layers query (#6634) Refactor out layer accesses so that we can have easy access to resident layers, which are needed for number of cases instead of layers for eviction. Simplifies the heatmap building by only using Layers, not RemoteTimelineClient. Cc: #5331 --- .../src/tenant/remote_timeline_client.rs | 17 ---- pageserver/src/tenant/storage_layer.rs | 8 +- pageserver/src/tenant/storage_layer/layer.rs | 4 - pageserver/src/tenant/timeline.rs | 97 ++++++------------- .../src/tenant/timeline/eviction_task.rs | 7 +- .../src/tenant/timeline/layer_manager.rs | 45 ++++++--- 6 files changed, 74 insertions(+), 104 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index e17dea01a8..483f53d5c8 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1700,23 +1700,6 @@ impl RemoteTimelineClient { } } } - - pub(crate) fn get_layers_metadata( - &self, - layers: Vec, - ) -> anyhow::Result>> { - let q = self.upload_queue.lock().unwrap(); - let q = match &*q { - UploadQueue::Stopped(_) | UploadQueue::Uninitialized => { - anyhow::bail!("queue is in state {}", q.as_str()) - } - UploadQueue::Initialized(inner) => inner, - }; - - let decorated = layers.into_iter().map(|l| q.latest_files.get(&l).cloned()); - - Ok(decorated.collect()) - } } pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath { diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 6e9a4932d8..2d92baccbe 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -257,6 +257,12 @@ impl LayerAccessStats { ret } + /// Get the latest access timestamp, falling back to latest residence event, further falling + /// back to `SystemTime::now` for a usable timestamp for eviction. + pub(crate) fn latest_activity_or_now(&self) -> SystemTime { + self.latest_activity().unwrap_or_else(SystemTime::now) + } + /// Get the latest access timestamp, falling back to latest residence event. /// /// This function can only return `None` if there has not yet been a call to the @@ -271,7 +277,7 @@ impl LayerAccessStats { /// that that type can only be produced by inserting into the layer map. 
/// /// [`record_residence_event`]: Self::record_residence_event - pub(crate) fn latest_activity(&self) -> Option { + fn latest_activity(&self) -> Option { let locked = self.0.lock().unwrap(); let inner = &locked.for_eviction_policy; match inner.last_accesses.recent() { diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index dd9de99477..bfcc031863 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1413,10 +1413,6 @@ impl ResidentLayer { &self.owner.0.path } - pub(crate) fn access_stats(&self) -> &LayerAccessStats { - self.owner.access_stats() - } - pub(crate) fn metadata(&self) -> LayerFileMetadata { self.owner.metadata() } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 74676277d5..625be7a644 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -12,6 +12,7 @@ use bytes::Bytes; use camino::{Utf8Path, Utf8PathBuf}; use enumset::EnumSet; use fail::fail_point; +use futures::stream::StreamExt; use itertools::Itertools; use pageserver_api::{ keyspace::{key_range_size, KeySpaceAccum}, @@ -105,7 +106,7 @@ use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; use super::config::TenantConf; -use super::remote_timeline_client::index::{IndexLayerMetadata, IndexPart}; +use super::remote_timeline_client::index::IndexPart; use super::remote_timeline_client::RemoteTimelineClient; use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline}; use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; @@ -1458,7 +1459,7 @@ impl Timeline { generation, shard_identity, pg_version, - layers: Arc::new(tokio::sync::RwLock::new(LayerManager::create())), + layers: Default::default(), wanted_image_layers: Mutex::new(None), walredo_mgr, @@ -2283,45 +2284,28 @@ impl Timeline { /// should treat this as a cue to simply skip doing any heatmap uploading /// for this timeline. pub(crate) async fn generate_heatmap(&self) -> Option { - let eviction_info = self.get_local_layers_for_disk_usage_eviction().await; + // no point in heatmaps without remote client + let _remote_client = self.remote_client.as_ref()?; - let remote_client = match &self.remote_client { - Some(c) => c, - None => return None, - }; + if !self.is_active() { + return None; + } - let layer_file_names = eviction_info - .resident_layers - .iter() - .map(|l| l.layer.get_name()) - .collect::>(); + let guard = self.layers.read().await; - let decorated = match remote_client.get_layers_metadata(layer_file_names) { - Ok(d) => d, - Err(_) => { - // Getting metadata only fails on Timeline in bad state. 
- return None; - } - }; + let resident = guard.resident_layers().map(|layer| { + let last_activity_ts = layer.access_stats().latest_activity_or_now(); - let heatmap_layers = std::iter::zip( - eviction_info.resident_layers.into_iter(), - decorated.into_iter(), - ) - .filter_map(|(layer, remote_info)| { - remote_info.map(|remote_info| { - HeatMapLayer::new( - layer.layer.get_name(), - IndexLayerMetadata::from(remote_info), - layer.last_activity_ts, - ) - }) + HeatMapLayer::new( + layer.layer_desc().filename(), + layer.metadata().into(), + last_activity_ts, + ) }); - Some(HeatMapTimeline::new( - self.timeline_id, - heatmap_layers.collect(), - )) + let layers = resident.collect().await; + + Some(HeatMapTimeline::new(self.timeline_id, layers)) } } @@ -4662,41 +4646,24 @@ impl Timeline { /// Returns non-remote layers for eviction. pub(crate) async fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo { let guard = self.layers.read().await; - let layers = guard.layer_map(); - let mut max_layer_size: Option = None; - let mut resident_layers = Vec::new(); - for l in layers.iter_historic_layers() { - let file_size = l.file_size(); - max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size))); + let resident_layers = guard + .resident_layers() + .map(|layer| { + let file_size = layer.layer_desc().file_size; + max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size))); - let l = guard.get_from_desc(&l); + let last_activity_ts = layer.access_stats().latest_activity_or_now(); - let l = match l.keep_resident().await { - Ok(Some(l)) => l, - Ok(None) => continue, - Err(e) => { - // these should not happen, but we cannot make them statically impossible right - // now. - tracing::warn!(layer=%l, "failed to keep the layer resident: {e:#}"); - continue; + EvictionCandidate { + layer: layer.into(), + last_activity_ts, + relative_last_activity: finite_f32::FiniteF32::ZERO, } - }; - - let last_activity_ts = l.access_stats().latest_activity().unwrap_or_else(|| { - // We only use this fallback if there's an implementation error. - // `latest_activity` already does rate-limited warn!() log. - debug!(layer=%l, "last_activity returns None, using SystemTime::now"); - SystemTime::now() - }); - - resident_layers.push(EvictionCandidate { - layer: l.drop_eviction_guard().into(), - last_activity_ts, - relative_last_activity: finite_f32::FiniteF32::ZERO, - }); - } + }) + .collect() + .await; DiskUsageEvictionInfo { max_layer_size, diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 9bdd52e809..d87f78e35f 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -239,12 +239,7 @@ impl Timeline { } }; - let last_activity_ts = hist_layer.access_stats().latest_activity().unwrap_or_else(|| { - // We only use this fallback if there's an implementation error. - // `latest_activity` already does rate-limited warn!() log. 
- debug!(layer=%hist_layer, "last_activity returns None, using SystemTime::now"); - SystemTime::now() - }); + let last_activity_ts = hist_layer.access_stats().latest_activity_or_now(); let no_activity_for = match now.duration_since(last_activity_ts) { Ok(d) => d, diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index e38f5be209..ebcdcfdb4d 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -1,4 +1,5 @@ use anyhow::{bail, ensure, Context, Result}; +use futures::StreamExt; use pageserver_api::shard::TenantShardId; use std::{collections::HashMap, sync::Arc}; use tracing::trace; @@ -20,19 +21,13 @@ use crate::{ }; /// Provides semantic APIs to manipulate the layer map. +#[derive(Default)] pub(crate) struct LayerManager { layer_map: LayerMap, layer_fmgr: LayerFileManager, } impl LayerManager { - pub(crate) fn create() -> Self { - Self { - layer_map: LayerMap::default(), - layer_fmgr: LayerFileManager::new(), - } - } - pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer { self.layer_fmgr.get_from_desc(desc) } @@ -246,6 +241,32 @@ impl LayerManager { layer.delete_on_drop(); } + pub(crate) fn resident_layers(&self) -> impl futures::stream::Stream + '_ { + // for small layer maps, we most likely have all resident, but for larger more are likely + // to be evicted assuming lots of layers correlated with longer lifespan. + + let layers = self + .layer_map() + .iter_historic_layers() + .map(|desc| self.get_from_desc(&desc)); + + let layers = futures::stream::iter(layers); + + layers.filter_map(|layer| async move { + // TODO(#6028): this query does not really need to see the ResidentLayer + match layer.keep_resident().await { + Ok(Some(layer)) => Some(layer.drop_eviction_guard()), + Ok(None) => None, + Err(e) => { + // these should not happen, but we cannot make them statically impossible right + // now. + tracing::warn!(%layer, "failed to keep the layer resident: {e:#}"); + None + } + } + }) + } + pub(crate) fn contains(&self, layer: &Layer) -> bool { self.layer_fmgr.contains(layer) } @@ -253,6 +274,12 @@ impl LayerManager { pub(crate) struct LayerFileManager(HashMap); +impl Default for LayerFileManager { + fn default() -> Self { + Self(HashMap::default()) + } +} + impl LayerFileManager { fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T { // The assumption for the `expect()` is that all code maintains the following invariant: @@ -275,10 +302,6 @@ impl LayerFileManager { self.0.contains_key(&layer.layer_desc().key()) } - pub(crate) fn new() -> Self { - Self(HashMap::new()) - } - pub(crate) fn remove(&mut self, layer: &T) { let present = self.0.remove(&layer.layer_desc().key()); if present.is_none() && cfg!(debug_assertions) { From 8b8ff88e4b0e1a1b1c14f0edbe50e0c6236afa93 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 12 Feb 2024 16:25:33 +0100 Subject: [PATCH 0160/1571] GH actions: label to disable CI runs completely (#6677) I don't want my very-early-draft PRs to trigger any CI runs. So, add a label `run-no-ci`, and piggy-back on the `check-permissions` job. 
--- .github/workflows/actionlint.yml | 1 + .github/workflows/build_and_test.yml | 2 +- .github/workflows/neon_extra_builds.yml | 2 ++ 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml index 584828c1d0..c290ff88e2 100644 --- a/.github/workflows/actionlint.yml +++ b/.github/workflows/actionlint.yml @@ -17,6 +17,7 @@ concurrency: jobs: actionlint: + if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }} runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 078916e1ea..6e4020a1b8 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -26,8 +26,8 @@ env: jobs: check-permissions: + if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }} runs-on: ubuntu-latest - steps: - name: Disallow PRs from forks if: | diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index c90ef60074..ff2a3a040a 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -117,6 +117,7 @@ jobs: check-linux-arm-build: timeout-minutes: 90 + if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }} runs-on: [ self-hosted, dev, arm64 ] env: @@ -237,6 +238,7 @@ jobs: check-codestyle-rust-arm: timeout-minutes: 90 + if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }} runs-on: [ self-hosted, dev, arm64 ] container: From a1f37cba1c790e5b89958fb7df13cde39429add8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 12 Feb 2024 19:15:21 +0100 Subject: [PATCH 0161/1571] Add test that runs the S3 scrubber (#6641) In #6079 it was found that there is no test that executes the scrubber. We now add such a test, which does the following things: * create a tenant, write some data * run the scrubber * remove the tenant * run the scrubber again Each time, the scrubber runs the scan-metadata command. Before #6079 we would have errored, now we don't. 
Fixes #6080 --- test_runner/fixtures/neon_fixtures.py | 8 ++-- .../regress/test_pageserver_generations.py | 4 +- .../regress/test_pageserver_secondary.py | 2 +- test_runner/regress/test_tenant_delete.py | 40 ++++++++++++++++++- 4 files changed, 45 insertions(+), 9 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index faa8effe10..26f2b999a6 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -899,7 +899,7 @@ class NeonEnvBuilder: if self.scrub_on_exit: try: - S3Scrubber(self.test_output_dir, self).scan_metadata() + S3Scrubber(self).scan_metadata() except Exception as e: log.error(f"Error during remote storage scrub: {e}") cleanup_error = e @@ -3659,9 +3659,9 @@ class SafekeeperHttpClient(requests.Session): class S3Scrubber: - def __init__(self, log_dir: Path, env: NeonEnvBuilder): + def __init__(self, env: NeonEnvBuilder, log_dir: Optional[Path] = None): self.env = env - self.log_dir = log_dir + self.log_dir = log_dir or env.test_output_dir def scrubber_cli(self, args: list[str], timeout) -> str: assert isinstance(self.env.pageserver_remote_storage, S3Storage) @@ -3682,7 +3682,7 @@ class S3Scrubber: args = base_args + args (output_path, stdout, status_code) = subprocess_capture( - self.log_dir, + self.env.test_output_dir, args, echo_stderr=True, echo_stdout=True, diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 725ed63d1c..de9f3b6945 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -265,9 +265,7 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): # Having written a mixture of generation-aware and legacy index_part.json, # ensure the scrubber handles the situation as expected. 
- metadata_summary = S3Scrubber( - neon_env_builder.test_output_dir, neon_env_builder - ).scan_metadata() + metadata_summary = S3Scrubber(neon_env_builder).scan_metadata() assert metadata_summary["tenant_count"] == 1 # Scrubber should have seen our timeline assert metadata_summary["timeline_count"] == 1 assert metadata_summary["timeline_shard_count"] == 1 diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 293152dd62..aec989252c 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -498,7 +498,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): # Scrub the remote storage # ======================== # This confirms that the scrubber isn't upset by the presence of the heatmap - S3Scrubber(neon_env_builder.test_output_dir, neon_env_builder).scan_metadata() + S3Scrubber(neon_env_builder).scan_metadata() # Detach secondary and delete tenant # =================================== diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index b4e5a550f3..e928ea8bb1 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -9,6 +9,7 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, PgBin, + S3Scrubber, last_flush_lsn_upload, wait_for_last_flush_lsn, ) @@ -19,12 +20,13 @@ from fixtures.pageserver.utils import ( assert_prefix_not_empty, poll_for_remote_storage_iterations, tenant_delete_wait_completed, + wait_for_upload, wait_tenant_status_404, wait_until_tenant_active, wait_until_tenant_state, ) from fixtures.remote_storage import RemoteStorageKind, available_s3_storages, s3_storage -from fixtures.types import TenantId, TimelineId +from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import run_pg_bench_small, wait_until from requests.exceptions import ReadTimeout @@ -669,3 +671,39 @@ def test_tenant_delete_races_timeline_creation( # Zero tenants remain (we deleted the default tenant) assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 0 + + +def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder): + """ + Validate that creating and then deleting the tenant both survives the scrubber, + and that one can run the scrubber without problems. + """ + + remote_storage_kind = RemoteStorageKind.MOCK_S3 + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + scrubber = S3Scrubber(neon_env_builder) + env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) + + ps_http = env.pageserver.http_client() + # create a tenant separate from the main tenant so that we have one remaining + # after we deleted it, as the scrubber treats empty buckets as an error. 
+ (tenant_id, timeline_id) = env.neon_cli.create_tenant() + + with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: + run_pg_bench_small(pg_bin, endpoint.connstr()) + last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + ps_http.timeline_checkpoint(tenant_id, timeline_id) + wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn) + env.stop() + + result = scrubber.scan_metadata() + assert result["with_warnings"] == [] + + env.start() + ps_http = env.pageserver.http_client() + iterations = poll_for_remote_storage_iterations(remote_storage_kind) + tenant_delete_wait_completed(ps_http, tenant_id, iterations) + env.stop() + + scrubber.scan_metadata() + assert result["with_warnings"] == [] From fac50a6264fb8ee59778d0720ba799a24c46695a Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Mon, 12 Feb 2024 19:41:02 +0100 Subject: [PATCH 0162/1571] Proxy refactor auth+connect (#6708) ## Problem Not really a problem, just refactoring. ## Summary of changes Separate authenticate from wake compute. Do not call wake compute second time if we managed to connect to postgres or if we got it not from cache. --- proxy/src/auth.rs | 5 - proxy/src/auth/backend.rs | 146 ++++++++++++++++------------- proxy/src/auth/backend/classic.rs | 2 +- proxy/src/auth/backend/hacks.rs | 6 +- proxy/src/bin/proxy.rs | 2 +- proxy/src/compute.rs | 8 +- proxy/src/config.rs | 2 +- proxy/src/console/provider.rs | 33 ++++++- proxy/src/console/provider/mock.rs | 4 +- proxy/src/error.rs | 12 ++- proxy/src/proxy.rs | 13 +-- proxy/src/proxy/connect_compute.rs | 67 ++++++++----- proxy/src/proxy/tests.rs | 142 +++++++++++++++++++++------- proxy/src/proxy/wake_compute.rs | 16 +--- proxy/src/serverless/backend.rs | 40 +++----- 15 files changed, 307 insertions(+), 191 deletions(-) diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index 48de4e2353..c8028d1bf0 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -36,9 +36,6 @@ pub enum AuthErrorImpl { #[error(transparent)] GetAuthInfo(#[from] console::errors::GetAuthInfoError), - #[error(transparent)] - WakeCompute(#[from] console::errors::WakeComputeError), - /// SASL protocol errors (includes [SCRAM](crate::scram)). 
#[error(transparent)] Sasl(#[from] crate::sasl::Error), @@ -119,7 +116,6 @@ impl UserFacingError for AuthError { match self.0.as_ref() { Link(e) => e.to_string_client(), GetAuthInfo(e) => e.to_string_client(), - WakeCompute(e) => e.to_string_client(), Sasl(e) => e.to_string_client(), AuthFailed(_) => self.to_string(), BadAuthMethod(_) => self.to_string(), @@ -139,7 +135,6 @@ impl ReportableError for AuthError { match self.0.as_ref() { Link(e) => e.get_error_kind(), GetAuthInfo(e) => e.get_error_kind(), - WakeCompute(e) => e.get_error_kind(), Sasl(e) => e.get_error_kind(), AuthFailed(_) => crate::error::ErrorKind::User, BadAuthMethod(_) => crate::error::ErrorKind::User, diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index c9f21f1cf5..47c1dc4e92 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -10,9 +10,9 @@ use crate::auth::validate_password_and_exchange; use crate::cache::Cached; use crate::console::errors::GetAuthInfoError; use crate::console::provider::{CachedRoleSecret, ConsoleBackend}; -use crate::console::AuthSecret; +use crate::console::{AuthSecret, NodeInfo}; use crate::context::RequestMonitoring; -use crate::proxy::wake_compute::wake_compute; +use crate::proxy::connect_compute::ComputeConnectBackend; use crate::proxy::NeonOptions; use crate::stream::Stream; use crate::{ @@ -26,7 +26,6 @@ use crate::{ stream, url, }; use crate::{scram, EndpointCacheKey, EndpointId, RoleName}; -use futures::TryFutureExt; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; @@ -56,11 +55,11 @@ impl std::ops::Deref for MaybeOwned<'_, T> { /// * However, when we substitute `T` with [`ComputeUserInfoMaybeEndpoint`], /// this helps us provide the credentials only to those auth /// backends which require them for the authentication process. -pub enum BackendType<'a, T> { +pub enum BackendType<'a, T, D> { /// Cloud API (V2). Console(MaybeOwned<'a, ConsoleBackend>, T), /// Authentication via a web browser. - Link(MaybeOwned<'a, url::ApiUrl>), + Link(MaybeOwned<'a, url::ApiUrl>, D), } pub trait TestBackend: Send + Sync + 'static { @@ -71,7 +70,7 @@ pub trait TestBackend: Send + Sync + 'static { fn get_role_secret(&self) -> Result; } -impl std::fmt::Display for BackendType<'_, ()> { +impl std::fmt::Display for BackendType<'_, (), ()> { fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { use BackendType::*; match self { @@ -86,51 +85,50 @@ impl std::fmt::Display for BackendType<'_, ()> { #[cfg(test)] ConsoleBackend::Test(_) => fmt.debug_tuple("Test").finish(), }, - Link(url) => fmt.debug_tuple("Link").field(&url.as_str()).finish(), + Link(url, _) => fmt.debug_tuple("Link").field(&url.as_str()).finish(), } } } -impl BackendType<'_, T> { +impl BackendType<'_, T, D> { /// Very similar to [`std::option::Option::as_ref`]. /// This helps us pass structured config to async tasks. - pub fn as_ref(&self) -> BackendType<'_, &T> { + pub fn as_ref(&self) -> BackendType<'_, &T, &D> { use BackendType::*; match self { Console(c, x) => Console(MaybeOwned::Borrowed(c), x), - Link(c) => Link(MaybeOwned::Borrowed(c)), + Link(c, x) => Link(MaybeOwned::Borrowed(c), x), } } } -impl<'a, T> BackendType<'a, T> { +impl<'a, T, D> BackendType<'a, T, D> { /// Very similar to [`std::option::Option::map`]. /// Maps [`BackendType`] to [`BackendType`] by applying /// a function to a contained value. 
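    ///
    /// For example, elsewhere in this patch the serverless backend calls
    /// `config.auth_backend.as_ref().map(|_| keys)` to swap the per-request payload
    /// (the freshly authenticated `ComputeCredentials`) while keeping the backend kind.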
- pub fn map(self, f: impl FnOnce(T) -> R) -> BackendType<'a, R> { + pub fn map(self, f: impl FnOnce(T) -> R) -> BackendType<'a, R, D> { use BackendType::*; match self { Console(c, x) => Console(c, f(x)), - Link(c) => Link(c), + Link(c, x) => Link(c, x), } } } - -impl<'a, T, E> BackendType<'a, Result> { +impl<'a, T, D, E> BackendType<'a, Result, D> { /// Very similar to [`std::option::Option::transpose`]. /// This is most useful for error handling. - pub fn transpose(self) -> Result, E> { + pub fn transpose(self) -> Result, E> { use BackendType::*; match self { Console(c, x) => x.map(|x| Console(c, x)), - Link(c) => Ok(Link(c)), + Link(c, x) => Ok(Link(c, x)), } } } -pub struct ComputeCredentials { +pub struct ComputeCredentials { pub info: ComputeUserInfo, - pub keys: T, + pub keys: ComputeCredentialKeys, } #[derive(Debug, Clone)] @@ -153,7 +151,6 @@ impl ComputeUserInfo { } pub enum ComputeCredentialKeys { - #[cfg(any(test, feature = "testing"))] Password(Vec), AuthKeys(AuthKeys), } @@ -188,7 +185,7 @@ async fn auth_quirks( client: &mut stream::PqStream>, allow_cleartext: bool, config: &'static AuthenticationConfig, -) -> auth::Result> { +) -> auth::Result { // If there's no project so far, that entails that client doesn't // support SNI or other means of passing the endpoint (project) name. // We now expect to see a very specific payload in the place of password. @@ -198,8 +195,11 @@ async fn auth_quirks( ctx.set_endpoint_id(res.info.endpoint.clone()); tracing::Span::current().record("ep", &tracing::field::display(&res.info.endpoint)); - - (res.info, Some(res.keys)) + let password = match res.keys { + ComputeCredentialKeys::Password(p) => p, + _ => unreachable!("password hack should return a password"), + }; + (res.info, Some(password)) } Ok(info) => (info, None), }; @@ -253,7 +253,7 @@ async fn authenticate_with_secret( unauthenticated_password: Option>, allow_cleartext: bool, config: &'static AuthenticationConfig, -) -> auth::Result> { +) -> auth::Result { if let Some(password) = unauthenticated_password { let auth_outcome = validate_password_and_exchange(&password, secret)?; let keys = match auth_outcome { @@ -283,14 +283,14 @@ async fn authenticate_with_secret( classic::authenticate(ctx, info, client, config, secret).await } -impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> { +impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { /// Get compute endpoint name from the credentials. 
pub fn get_endpoint(&self) -> Option { use BackendType::*; match self { Console(_, user_info) => user_info.endpoint_id.clone(), - Link(_) => Some("link".into()), + Link(_, _) => Some("link".into()), } } @@ -300,7 +300,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> { match self { Console(_, user_info) => &user_info.user, - Link(_) => "link", + Link(_, _) => "link", } } @@ -312,7 +312,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> { client: &mut stream::PqStream>, allow_cleartext: bool, config: &'static AuthenticationConfig, - ) -> auth::Result<(CachedNodeInfo, BackendType<'a, ComputeUserInfo>)> { + ) -> auth::Result> { use BackendType::*; let res = match self { @@ -323,33 +323,17 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> { "performing authentication using the console" ); - let compute_credentials = + let credentials = auth_quirks(ctx, &*api, user_info, client, allow_cleartext, config).await?; - - let mut num_retries = 0; - let mut node = - wake_compute(&mut num_retries, ctx, &api, &compute_credentials.info).await?; - - ctx.set_project(node.aux.clone()); - - match compute_credentials.keys { - #[cfg(any(test, feature = "testing"))] - ComputeCredentialKeys::Password(password) => node.config.password(password), - ComputeCredentialKeys::AuthKeys(auth_keys) => node.config.auth_keys(auth_keys), - }; - - (node, BackendType::Console(api, compute_credentials.info)) + BackendType::Console(api, credentials) } // NOTE: this auth backend doesn't use client credentials. - Link(url) => { + Link(url, _) => { info!("performing link authentication"); - let node_info = link::authenticate(ctx, &url, client).await?; + let info = link::authenticate(ctx, &url, client).await?; - ( - CachedNodeInfo::new_uncached(node_info), - BackendType::Link(url), - ) + BackendType::Link(url, info) } }; @@ -358,7 +342,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> { } } -impl BackendType<'_, ComputeUserInfo> { +impl BackendType<'_, ComputeUserInfo, &()> { pub async fn get_role_secret( &self, ctx: &mut RequestMonitoring, @@ -366,7 +350,7 @@ impl BackendType<'_, ComputeUserInfo> { use BackendType::*; match self { Console(api, user_info) => api.get_role_secret(ctx, user_info).await, - Link(_) => Ok(Cached::new_uncached(None)), + Link(_, _) => Ok(Cached::new_uncached(None)), } } @@ -377,21 +361,51 @@ impl BackendType<'_, ComputeUserInfo> { use BackendType::*; match self { Console(api, user_info) => api.get_allowed_ips_and_secret(ctx, user_info).await, - Link(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), - } - } - - /// When applicable, wake the compute node, gaining its connection info in the process. - /// The link auth flow doesn't support this, so we return [`None`] in that case. 
- pub async fn wake_compute( - &self, - ctx: &mut RequestMonitoring, - ) -> Result, console::errors::WakeComputeError> { - use BackendType::*; - - match self { - Console(api, user_info) => api.wake_compute(ctx, user_info).map_ok(Some).await, - Link(_) => Ok(None), + Link(_, _) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), + } + } +} + +#[async_trait::async_trait] +impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> { + async fn wake_compute( + &self, + ctx: &mut RequestMonitoring, + ) -> Result { + use BackendType::*; + + match self { + Console(api, creds) => api.wake_compute(ctx, &creds.info).await, + Link(_, info) => Ok(Cached::new_uncached(info.clone())), + } + } + + fn get_keys(&self) -> Option<&ComputeCredentialKeys> { + match self { + BackendType::Console(_, creds) => Some(&creds.keys), + BackendType::Link(_, _) => None, + } + } +} + +#[async_trait::async_trait] +impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> { + async fn wake_compute( + &self, + ctx: &mut RequestMonitoring, + ) -> Result { + use BackendType::*; + + match self { + Console(api, creds) => api.wake_compute(ctx, &creds.info).await, + Link(_, _) => unreachable!("link auth flow doesn't support waking the compute"), + } + } + + fn get_keys(&self) -> Option<&ComputeCredentialKeys> { + match self { + BackendType::Console(_, creds) => Some(&creds.keys), + BackendType::Link(_, _) => None, } } } diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index e855843bc3..d075331846 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -17,7 +17,7 @@ pub(super) async fn authenticate( client: &mut PqStream>, config: &'static AuthenticationConfig, secret: AuthSecret, -) -> auth::Result> { +) -> auth::Result { let flow = AuthFlow::new(client); let scram_keys = match secret { #[cfg(any(test, feature = "testing"))] diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs index 9f60b709d4..26cf7a01f2 100644 --- a/proxy/src/auth/backend/hacks.rs +++ b/proxy/src/auth/backend/hacks.rs @@ -20,7 +20,7 @@ pub async fn authenticate_cleartext( info: ComputeUserInfo, client: &mut stream::PqStream>, secret: AuthSecret, -) -> auth::Result> { +) -> auth::Result { warn!("cleartext auth flow override is enabled, proceeding"); ctx.set_auth_method(crate::context::AuthMethod::Cleartext); @@ -51,7 +51,7 @@ pub async fn password_hack_no_authentication( ctx: &mut RequestMonitoring, info: ComputeUserInfoNoEndpoint, client: &mut stream::PqStream>, -) -> auth::Result>> { +) -> auth::Result { warn!("project not specified, resorting to the password hack auth flow"); ctx.set_auth_method(crate::context::AuthMethod::Cleartext); @@ -73,6 +73,6 @@ pub async fn password_hack_no_authentication( options: info.options, endpoint: payload.endpoint, }, - keys: payload.password, + keys: ComputeCredentialKeys::Password(payload.password), }) } diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 8fbcb56758..00a229c135 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -383,7 +383,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { } AuthBackend::Link => { let url = args.uri.parse()?; - auth::BackendType::Link(MaybeOwned::Owned(url)) + auth::BackendType::Link(MaybeOwned::Owned(url), ()) } }; let http_config = HttpConfig { diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 83940d80ec..b61c1fb9ef 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,7 
+1,7 @@ use crate::{ auth::parse_endpoint_param, cancellation::CancelClosure, - console::errors::WakeComputeError, + console::{errors::WakeComputeError, messages::MetricsAuxInfo}, context::RequestMonitoring, error::{ReportableError, UserFacingError}, metrics::NUM_DB_CONNECTIONS_GAUGE, @@ -93,7 +93,7 @@ impl ConnCfg { } /// Reuse password or auth keys from the other config. - pub fn reuse_password(&mut self, other: &Self) { + pub fn reuse_password(&mut self, other: Self) { if let Some(password) = other.get_password() { self.password(password); } @@ -253,6 +253,8 @@ pub struct PostgresConnection { pub params: std::collections::HashMap, /// Query cancellation token. pub cancel_closure: CancelClosure, + /// Labels for proxy's metrics. + pub aux: MetricsAuxInfo, _guage: IntCounterPairGuard, } @@ -263,6 +265,7 @@ impl ConnCfg { &self, ctx: &mut RequestMonitoring, allow_self_signed_compute: bool, + aux: MetricsAuxInfo, timeout: Duration, ) -> Result { let (socket_addr, stream, host) = self.connect_raw(timeout).await?; @@ -297,6 +300,7 @@ impl ConnCfg { stream, params, cancel_closure, + aux, _guage: NUM_DB_CONNECTIONS_GAUGE .with_label_values(&[ctx.protocol]) .guard(), diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 31c9228b35..5fcb537834 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -13,7 +13,7 @@ use x509_parser::oid_registry; pub struct ProxyConfig { pub tls_config: Option, - pub auth_backend: auth::BackendType<'static, ()>, + pub auth_backend: auth::BackendType<'static, (), ()>, pub metric_collection: Option, pub allow_self_signed_compute: bool, pub http_config: HttpConfig, diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index e5cad42753..640444d14e 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -4,7 +4,10 @@ pub mod neon; use super::messages::MetricsAuxInfo; use crate::{ - auth::{backend::ComputeUserInfo, IpPattern}, + auth::{ + backend::{ComputeCredentialKeys, ComputeUserInfo}, + IpPattern, + }, cache::{project_info::ProjectInfoCacheImpl, Cached, TimedLru}, compute, config::{CacheOptions, ProjectInfoCacheOptions}, @@ -261,6 +264,34 @@ pub struct NodeInfo { pub allow_self_signed_compute: bool, } +impl NodeInfo { + pub async fn connect( + &self, + ctx: &mut RequestMonitoring, + timeout: Duration, + ) -> Result { + self.config + .connect( + ctx, + self.allow_self_signed_compute, + self.aux.clone(), + timeout, + ) + .await + } + pub fn reuse_settings(&mut self, other: Self) { + self.allow_self_signed_compute = other.allow_self_signed_compute; + self.config.reuse_password(other.config); + } + + pub fn set_keys(&mut self, keys: &ComputeCredentialKeys) { + match keys { + ComputeCredentialKeys::Password(password) => self.config.password(password), + ComputeCredentialKeys::AuthKeys(auth_keys) => self.config.auth_keys(*auth_keys), + }; + } +} + pub type NodeInfoCache = TimedLru; pub type CachedNodeInfo = Cached<&'static NodeInfoCache>; pub type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option>; diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs index 79a04f255d..0579ef6fc4 100644 --- a/proxy/src/console/provider/mock.rs +++ b/proxy/src/console/provider/mock.rs @@ -176,9 +176,7 @@ impl super::Api for Api { _ctx: &mut RequestMonitoring, _user_info: &ComputeUserInfo, ) -> Result { - self.do_wake_compute() - .map_ok(CachedNodeInfo::new_uncached) - .await + self.do_wake_compute().map_ok(Cached::new_uncached).await } } diff --git a/proxy/src/error.rs 
b/proxy/src/error.rs index eafe92bf48..69fe1ebc12 100644 --- a/proxy/src/error.rs +++ b/proxy/src/error.rs @@ -29,7 +29,7 @@ pub trait UserFacingError: ReportableError { } } -#[derive(Copy, Clone, Debug)] +#[derive(Copy, Clone, Debug, Eq, PartialEq)] pub enum ErrorKind { /// Wrong password, unknown endpoint, protocol violation, etc... User, @@ -90,3 +90,13 @@ impl ReportableError for tokio::time::error::Elapsed { ErrorKind::RateLimit } } + +impl ReportableError for tokio_postgres::error::Error { + fn get_error_kind(&self) -> ErrorKind { + if self.as_db_error().is_some() { + ErrorKind::Postgres + } else { + ErrorKind::Compute + } + } +} diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 77aadb6f28..5f65de4c98 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -163,14 +163,14 @@ pub enum ClientMode { /// Abstracts the logic of handling TCP vs WS clients impl ClientMode { - fn allow_cleartext(&self) -> bool { + pub fn allow_cleartext(&self) -> bool { match self { ClientMode::Tcp => false, ClientMode::Websockets { .. } => true, } } - fn allow_self_signed_compute(&self, config: &ProxyConfig) -> bool { + pub fn allow_self_signed_compute(&self, config: &ProxyConfig) -> bool { match self { ClientMode::Tcp => config.allow_self_signed_compute, ClientMode::Websockets { .. } => false, @@ -287,7 +287,7 @@ pub async fn handle_client( } let user = user_info.get_user().to_owned(); - let (mut node_info, user_info) = match user_info + let user_info = match user_info .authenticate( ctx, &mut stream, @@ -306,14 +306,11 @@ pub async fn handle_client( } }; - node_info.allow_self_signed_compute = mode.allow_self_signed_compute(config); - - let aux = node_info.aux.clone(); let mut node = connect_to_compute( ctx, &TcpMechanism { params: ¶ms }, - node_info, &user_info, + mode.allow_self_signed_compute(config), ) .or_else(|e| stream.throw_error(e)) .await?; @@ -330,8 +327,8 @@ pub async fn handle_client( Ok(Some(ProxyPassthrough { client: stream, + aux: node.aux.clone(), compute: node, - aux, req: _request_gauge, conn: _client_gauge, })) diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index b9346aa743..6e57caf998 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -1,8 +1,9 @@ use crate::{ - auth, + auth::backend::ComputeCredentialKeys, compute::{self, PostgresConnection}, - console::{self, errors::WakeComputeError}, + console::{self, errors::WakeComputeError, CachedNodeInfo, NodeInfo}, context::RequestMonitoring, + error::ReportableError, metrics::NUM_CONNECTION_FAILURES, proxy::{ retry::{retry_after, ShouldRetry}, @@ -20,7 +21,7 @@ const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2); /// (e.g. the compute node's address might've changed at the wrong time). /// Invalidate the cache entry (if any) to prevent subsequent errors. 
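/// The stale `NodeInfo` is now returned to the caller, so `reuse_settings` can carry the
/// password and the self-signed-compute flag over to the freshly woken node.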
#[tracing::instrument(name = "invalidate_cache", skip_all)] -pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> compute::ConnCfg { +pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> NodeInfo { let is_cached = node_info.cached(); if is_cached { warn!("invalidating stalled compute node info cache entry"); @@ -31,13 +32,13 @@ pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> compute::ConnCfg }; NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc(); - node_info.invalidate().config + node_info.invalidate() } #[async_trait] pub trait ConnectMechanism { type Connection; - type ConnectError; + type ConnectError: ReportableError; type Error: From; async fn connect_once( &self, @@ -49,6 +50,16 @@ pub trait ConnectMechanism { fn update_connect_config(&self, conf: &mut compute::ConnCfg); } +#[async_trait] +pub trait ComputeConnectBackend { + async fn wake_compute( + &self, + ctx: &mut RequestMonitoring, + ) -> Result; + + fn get_keys(&self) -> Option<&ComputeCredentialKeys>; +} + pub struct TcpMechanism<'a> { /// KV-dictionary with PostgreSQL connection params. pub params: &'a StartupMessageParams, @@ -67,11 +78,7 @@ impl ConnectMechanism for TcpMechanism<'_> { node_info: &console::CachedNodeInfo, timeout: time::Duration, ) -> Result { - let allow_self_signed_compute = node_info.allow_self_signed_compute; - node_info - .config - .connect(ctx, allow_self_signed_compute, timeout) - .await + node_info.connect(ctx, timeout).await } fn update_connect_config(&self, config: &mut compute::ConnCfg) { @@ -82,16 +89,23 @@ impl ConnectMechanism for TcpMechanism<'_> { /// Try to connect to the compute node, retrying if necessary. /// This function might update `node_info`, so we take it by `&mut`. #[tracing::instrument(skip_all)] -pub async fn connect_to_compute( +pub async fn connect_to_compute( ctx: &mut RequestMonitoring, mechanism: &M, - mut node_info: console::CachedNodeInfo, - user_info: &auth::BackendType<'_, auth::backend::ComputeUserInfo>, + user_info: &B, + allow_self_signed_compute: bool, ) -> Result where M::ConnectError: ShouldRetry + std::fmt::Debug, M::Error: From, { + let mut num_retries = 0; + let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?; + if let Some(keys) = user_info.get_keys() { + node_info.set_keys(keys); + } + node_info.allow_self_signed_compute = allow_self_signed_compute; + // let mut node_info = credentials.get_node_info(ctx, user_info).await?; mechanism.update_connect_config(&mut node_info.config); // try once @@ -108,28 +122,31 @@ where error!(error = ?err, "could not connect to compute node"); - let mut num_retries = 1; - - match user_info { - auth::BackendType::Console(api, info) => { + let node_info = + if err.get_error_kind() == crate::error::ErrorKind::Postgres || !node_info.cached() { + // If the error is Postgres, that means that we managed to connect to the compute node, but there was an error. + // Do not need to retrieve a new node_info, just return the old one. 
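+        // In short, the decision after the failed first attempt is:
+        //   * Postgres error, or node info that did not come from the cache: the address
+        //     is known to be fresh, so waking the compute again would not help; give up
+        //     if the error is not retryable, otherwise retry the connection as-is.
+        //   * any other error on a cached entry: the cached address is likely stale, so
+        //     invalidate it and wake the compute again in the `else` branch below.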
+ if !err.should_retry(num_retries) { + return Err(err.into()); + } + node_info + } else { // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node info!("compute node's state has likely changed; requesting a wake-up"); - ctx.latency_timer.cache_miss(); - let config = invalidate_cache(node_info); - node_info = wake_compute(&mut num_retries, ctx, api, info).await?; + let old_node_info = invalidate_cache(node_info); + let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?; + node_info.reuse_settings(old_node_info); - node_info.config.reuse_password(&config); mechanism.update_connect_config(&mut node_info.config); - } - // nothing to do? - auth::BackendType::Link(_) => {} - }; + node_info + }; // now that we have a new node, try connect to it repeatedly. // this can error for a few reasons, for instance: // * DNS connection settings haven't quite propagated yet info!("wake_compute success. attempting to connect"); + num_retries = 1; loop { match mechanism .connect_once(ctx, &node_info, CONNECT_TIMEOUT) diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 5bb43c0375..efbd661bbf 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -2,13 +2,19 @@ mod mitm; +use std::time::Duration; + use super::connect_compute::ConnectMechanism; use super::retry::ShouldRetry; use super::*; -use crate::auth::backend::{ComputeUserInfo, MaybeOwned, TestBackend}; +use crate::auth::backend::{ + ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, MaybeOwned, TestBackend, +}; use crate::config::CertResolver; +use crate::console::caches::NodeInfoCache; use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend}; use crate::console::{self, CachedNodeInfo, NodeInfo}; +use crate::error::ErrorKind; use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT}; use crate::{auth, http, sasl, scram}; use async_trait::async_trait; @@ -369,12 +375,15 @@ enum ConnectAction { Connect, Retry, Fail, + RetryPg, + FailPg, } #[derive(Clone)] struct TestConnectMechanism { counter: Arc>, sequence: Vec, + cache: &'static NodeInfoCache, } impl TestConnectMechanism { @@ -393,6 +402,12 @@ impl TestConnectMechanism { Self { counter: Arc::new(std::sync::Mutex::new(0)), sequence, + cache: Box::leak(Box::new(NodeInfoCache::new( + "test", + 1, + Duration::from_secs(100), + false, + ))), } } } @@ -403,6 +418,13 @@ struct TestConnection; #[derive(Debug)] struct TestConnectError { retryable: bool, + kind: crate::error::ErrorKind, +} + +impl ReportableError for TestConnectError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + self.kind + } } impl std::fmt::Display for TestConnectError { @@ -436,8 +458,22 @@ impl ConnectMechanism for TestConnectMechanism { *counter += 1; match action { ConnectAction::Connect => Ok(TestConnection), - ConnectAction::Retry => Err(TestConnectError { retryable: true }), - ConnectAction::Fail => Err(TestConnectError { retryable: false }), + ConnectAction::Retry => Err(TestConnectError { + retryable: true, + kind: ErrorKind::Compute, + }), + ConnectAction::Fail => Err(TestConnectError { + retryable: false, + kind: ErrorKind::Compute, + }), + ConnectAction::FailPg => Err(TestConnectError { + retryable: false, + kind: ErrorKind::Postgres, + }), + ConnectAction::RetryPg => Err(TestConnectError { + retryable: true, + kind: ErrorKind::Postgres, + }), x => panic!("expecting action {:?}, connect is called instead", x), } } @@ -451,7 +487,7 @@ impl TestBackend for TestConnectMechanism { let 
action = self.sequence[*counter]; *counter += 1; match action { - ConnectAction::Wake => Ok(helper_create_cached_node_info()), + ConnectAction::Wake => Ok(helper_create_cached_node_info(self.cache)), ConnectAction::WakeFail => { let err = console::errors::ApiError::Console { status: http::StatusCode::FORBIDDEN, @@ -483,37 +519,41 @@ impl TestBackend for TestConnectMechanism { } } -fn helper_create_cached_node_info() -> CachedNodeInfo { +fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeInfo { let node = NodeInfo { config: compute::ConnCfg::new(), aux: Default::default(), allow_self_signed_compute: false, }; - CachedNodeInfo::new_uncached(node) + let (_, node) = cache.insert("key".into(), node); + node } fn helper_create_connect_info( mechanism: &TestConnectMechanism, -) -> (CachedNodeInfo, auth::BackendType<'static, ComputeUserInfo>) { - let cache = helper_create_cached_node_info(); +) -> auth::BackendType<'static, ComputeCredentials, &()> { let user_info = auth::BackendType::Console( MaybeOwned::Owned(ConsoleBackend::Test(Box::new(mechanism.clone()))), - ComputeUserInfo { - endpoint: "endpoint".into(), - user: "user".into(), - options: NeonOptions::parse_options_raw(""), + ComputeCredentials { + info: ComputeUserInfo { + endpoint: "endpoint".into(), + user: "user".into(), + options: NeonOptions::parse_options_raw(""), + }, + keys: ComputeCredentialKeys::Password("password".into()), }, ); - (cache, user_info) + user_info } #[tokio::test] async fn connect_to_compute_success() { + let _ = env_logger::try_init(); use ConnectAction::*; let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![Connect]); - let (cache, user_info) = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, cache, &user_info) + let mechanism = TestConnectMechanism::new(vec![Wake, Connect]); + let user_info = helper_create_connect_info(&mechanism); + connect_to_compute(&mut ctx, &mechanism, &user_info, false) .await .unwrap(); mechanism.verify(); @@ -521,24 +561,52 @@ async fn connect_to_compute_success() { #[tokio::test] async fn connect_to_compute_retry() { + let _ = env_logger::try_init(); use ConnectAction::*; let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![Retry, Wake, Retry, Connect]); - let (cache, user_info) = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, cache, &user_info) + let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Connect]); + let user_info = helper_create_connect_info(&mechanism); + connect_to_compute(&mut ctx, &mechanism, &user_info, false) .await .unwrap(); mechanism.verify(); } +#[tokio::test] +async fn connect_to_compute_retry_pg() { + let _ = env_logger::try_init(); + use ConnectAction::*; + let mut ctx = RequestMonitoring::test(); + let mechanism = TestConnectMechanism::new(vec![Wake, RetryPg, Connect]); + let user_info = helper_create_connect_info(&mechanism); + connect_to_compute(&mut ctx, &mechanism, &user_info, false) + .await + .unwrap(); + mechanism.verify(); +} + +#[tokio::test] +async fn connect_to_compute_fail_pg() { + let _ = env_logger::try_init(); + use ConnectAction::*; + let mut ctx = RequestMonitoring::test(); + let mechanism = TestConnectMechanism::new(vec![Wake, FailPg]); + let user_info = helper_create_connect_info(&mechanism); + connect_to_compute(&mut ctx, &mechanism, &user_info, false) + .await + .unwrap_err(); + mechanism.verify(); +} + /// Test that we don't retry if the error is not 
retryable. #[tokio::test] async fn connect_to_compute_non_retry_1() { + let _ = env_logger::try_init(); use ConnectAction::*; let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![Retry, Wake, Retry, Fail]); - let (cache, user_info) = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, cache, &user_info) + let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Fail]); + let user_info = helper_create_connect_info(&mechanism); + connect_to_compute(&mut ctx, &mechanism, &user_info, false) .await .unwrap_err(); mechanism.verify(); @@ -547,11 +615,12 @@ async fn connect_to_compute_non_retry_1() { /// Even for non-retryable errors, we should retry at least once. #[tokio::test] async fn connect_to_compute_non_retry_2() { + let _ = env_logger::try_init(); use ConnectAction::*; let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![Fail, Wake, Retry, Connect]); - let (cache, user_info) = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, cache, &user_info) + let mechanism = TestConnectMechanism::new(vec![Wake, Fail, Wake, Connect]); + let user_info = helper_create_connect_info(&mechanism); + connect_to_compute(&mut ctx, &mechanism, &user_info, false) .await .unwrap(); mechanism.verify(); @@ -560,15 +629,16 @@ async fn connect_to_compute_non_retry_2() { /// Retry for at most `NUM_RETRIES_CONNECT` times. #[tokio::test] async fn connect_to_compute_non_retry_3() { + let _ = env_logger::try_init(); assert_eq!(NUM_RETRIES_CONNECT, 16); use ConnectAction::*; let mut ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![ - Retry, Wake, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, - Retry, Retry, Retry, Retry, /* the 17th time */ Retry, + Wake, Retry, Wake, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, + Retry, Retry, Retry, Retry, Retry, /* the 17th time */ Retry, ]); - let (cache, user_info) = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, cache, &user_info) + let user_info = helper_create_connect_info(&mechanism); + connect_to_compute(&mut ctx, &mechanism, &user_info, false) .await .unwrap_err(); mechanism.verify(); @@ -577,11 +647,12 @@ async fn connect_to_compute_non_retry_3() { /// Should retry wake compute. #[tokio::test] async fn wake_retry() { + let _ = env_logger::try_init(); use ConnectAction::*; let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![Retry, WakeRetry, Wake, Connect]); - let (cache, user_info) = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, cache, &user_info) + let mechanism = TestConnectMechanism::new(vec![WakeRetry, Wake, Connect]); + let user_info = helper_create_connect_info(&mechanism); + connect_to_compute(&mut ctx, &mechanism, &user_info, false) .await .unwrap(); mechanism.verify(); @@ -590,11 +661,12 @@ async fn wake_retry() { /// Wake failed with a non-retryable error. 
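/// The sequence below starts with `WakeRetry` followed by `WakeFail`: the first wake attempt is
/// retried, the second fails permanently, so `connect_to_compute` must return an error without
/// ever reaching `connect_once`.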
#[tokio::test] async fn wake_non_retry() { + let _ = env_logger::try_init(); use ConnectAction::*; let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![Retry, WakeFail]); - let (cache, user_info) = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, cache, &user_info) + let mechanism = TestConnectMechanism::new(vec![WakeRetry, WakeFail]); + let user_info = helper_create_connect_info(&mechanism); + connect_to_compute(&mut ctx, &mechanism, &user_info, false) .await .unwrap_err(); mechanism.verify(); diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index 925727bdab..2c593451b4 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -1,9 +1,4 @@ -use crate::auth::backend::ComputeUserInfo; -use crate::console::{ - errors::WakeComputeError, - provider::{CachedNodeInfo, ConsoleBackend}, - Api, -}; +use crate::console::{errors::WakeComputeError, provider::CachedNodeInfo}; use crate::context::RequestMonitoring; use crate::metrics::{bool_to_str, NUM_WAKEUP_FAILURES}; use crate::proxy::retry::retry_after; @@ -11,17 +6,16 @@ use hyper::StatusCode; use std::ops::ControlFlow; use tracing::{error, warn}; +use super::connect_compute::ComputeConnectBackend; use super::retry::ShouldRetry; -/// wake a compute (or retrieve an existing compute session from cache) -pub async fn wake_compute( +pub async fn wake_compute( num_retries: &mut u32, ctx: &mut RequestMonitoring, - api: &ConsoleBackend, - info: &ComputeUserInfo, + api: &B, ) -> Result { loop { - let wake_res = api.wake_compute(ctx, info).await; + let wake_res = api.wake_compute(ctx).await; match handle_try_wake(wake_res, *num_retries) { Err(e) => { error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node"); diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 156002006d..6f93f86d5f 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -4,7 +4,7 @@ use async_trait::async_trait; use tracing::{field::display, info}; use crate::{ - auth::{backend::ComputeCredentialKeys, check_peer_addr_is_in_list, AuthError}, + auth::{backend::ComputeCredentials, check_peer_addr_is_in_list, AuthError}, compute, config::ProxyConfig, console::{ @@ -27,7 +27,7 @@ impl PoolingBackend { &self, ctx: &mut RequestMonitoring, conn_info: &ConnInfo, - ) -> Result { + ) -> Result { let user_info = conn_info.user_info.clone(); let backend = self.config.auth_backend.as_ref().map(|_| user_info.clone()); let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?; @@ -49,13 +49,17 @@ impl PoolingBackend { }; let auth_outcome = crate::auth::validate_password_and_exchange(&conn_info.password, secret)?; - match auth_outcome { + let res = match auth_outcome { crate::sasl::Outcome::Success(key) => Ok(key), crate::sasl::Outcome::Failure(reason) => { info!("auth backend failed with an error: {reason}"); Err(AuthError::auth_failed(&*conn_info.user_info.user)) } - } + }; + res.map(|key| ComputeCredentials { + info: user_info, + keys: key, + }) } // Wake up the destination if needed. 
Code here is a bit involved because @@ -66,7 +70,7 @@ impl PoolingBackend { &self, ctx: &mut RequestMonitoring, conn_info: ConnInfo, - keys: ComputeCredentialKeys, + keys: ComputeCredentials, force_new: bool, ) -> Result, HttpConnError> { let maybe_client = if !force_new { @@ -82,26 +86,8 @@ impl PoolingBackend { } let conn_id = uuid::Uuid::new_v4(); tracing::Span::current().record("conn_id", display(conn_id)); - info!("pool: opening a new connection '{conn_info}'"); - let backend = self - .config - .auth_backend - .as_ref() - .map(|_| conn_info.user_info.clone()); - - let mut node_info = backend - .wake_compute(ctx) - .await? - .ok_or(HttpConnError::NoComputeInfo)?; - - match keys { - #[cfg(any(test, feature = "testing"))] - ComputeCredentialKeys::Password(password) => node_info.config.password(password), - ComputeCredentialKeys::AuthKeys(auth_keys) => node_info.config.auth_keys(auth_keys), - }; - - ctx.set_project(node_info.aux.clone()); - + info!(%conn_id, "pool: opening a new connection '{conn_info}'"); + let backend = self.config.auth_backend.as_ref().map(|_| keys); crate::proxy::connect_compute::connect_to_compute( ctx, &TokioMechanism { @@ -109,8 +95,8 @@ impl PoolingBackend { conn_info, pool: self.pool.clone(), }, - node_info, &backend, + false, // do not allow self signed compute for http flow ) .await } @@ -129,8 +115,6 @@ pub enum HttpConnError { AuthError(#[from] AuthError), #[error("wake_compute returned error")] WakeCompute(#[from] WakeComputeError), - #[error("wake_compute returned nothing")] - NoComputeInfo, } struct TokioMechanism { From 4be2223a4cd80fdc40c37aab2206bb6f505dc008 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Mon, 12 Feb 2024 20:29:57 +0000 Subject: [PATCH 0163/1571] Discrete event simulation for safekeepers (#5804) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR contains the first version of [FoundationDB-like](https://www.youtube.com/watch?v=4fFDFbi3toc) simulation testing for safekeeper and walproposer. ### desim This is a core "framework" for running deterministic simulations. It operates on threads, which makes it possible to test synchronous code (like walproposer). `libs/desim/src/executor.rs` contains the implementation of deterministic thread execution. This is achieved by blocking all threads, and each time allowing only a single thread to make an execution step. All executor threads are blocked using the `yield_me(after_ms)` function. This function is called when a thread wants to sleep or wait for an external notification (like blocking on a channel until it has a ready message). `libs/desim/src/chan.rs` contains the implementation of a channel (a basic sync primitive). It has unlimited capacity and any thread can push or read messages to/from it. `libs/desim/src/network.rs` has a very naive implementation of a network (only reliable TCP-like connections are supported for now) that can have arbitrary delays for each packet and failure injection for breaking connections with some probability. `libs/desim/src/world.rs` ties everything together, providing the concept of virtual nodes that can have network connections between them. ### walproposer_sim Has everything needed to run walproposer and safekeepers in a simulation. `safekeeper.rs` reimplements all necessary stuff from `receive_wal.rs`, `send_wal.rs` and `timelines_global_map.rs`. `walproposer_api.rs` implements all walproposer callbacks using the simulation library.
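To make the "one thread steps at a time" idea above concrete, here is a minimal sketch (not the actual test code) of the loop a harness would use to drive such a simulation. It uses only the `Runtime` API added in `libs/desim/src/executor.rs`; the `desim::executor` module path is assumed from the file layout, and the clock construction is elided because its exact API lives in `libs/desim/src/time.rs`:

```rust
use desim::executor::Runtime;

/// Run the simulation until every thread has finished or is blocked forever,
/// and report how many thread steps were executed.
fn drive_to_completion(mut runtime: Runtime) -> u64 {
    // Each step() lets exactly one ready thread run until it parks itself again via
    // yield_me(); when no thread is runnable, the virtual clock is advanced instead.
    while runtime.step() {}
    runtime.step_counter
}
```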
`simulation.rs` defines a schedule – a set of events like `restart ` or `write_wal` that should happen at time ``. It also has code to spawn walproposer/safekeeper threads and provide config to them. ### tests `simple_test.rs` has tests that just start walproposer and 3 safekeepers together in a simulation, and tests that they are not crashing right away. `misc_test.rs` has tests checking more advanced simulation cases, like crashing or restarting threads, testing memory deallocation, etc. `random_test.rs` is the main test, it checks thousands of random seeds (schedules) for correctness. It roughly corresponds to running a real python integration test in an environment with very unstable network and cpu, but in a determenistic way (each seed results in the same execution log) and much much faster. Closes #547 --------- Co-authored-by: Arseny Sher --- Cargo.lock | 20 + Cargo.toml | 2 + libs/desim/Cargo.toml | 18 + libs/desim/README.md | 7 + libs/desim/src/chan.rs | 108 +++ libs/desim/src/executor.rs | 483 +++++++++++++ libs/desim/src/lib.rs | 8 + libs/desim/src/network.rs | 451 ++++++++++++ libs/desim/src/node_os.rs | 54 ++ libs/desim/src/options.rs | 50 ++ libs/desim/src/proto.rs | 63 ++ libs/desim/src/time.rs | 129 ++++ libs/desim/src/world.rs | 180 +++++ libs/desim/tests/reliable_copy_test.rs | 244 +++++++ libs/postgres_ffi/src/xlog_utils.rs | 10 +- libs/walproposer/build.rs | 4 + libs/walproposer/src/api_bindings.rs | 20 +- libs/walproposer/src/walproposer.rs | 45 +- pageserver/src/walingest.rs | 2 +- pgxn/neon/walproposer.c | 15 +- pgxn/neon/walproposer.h | 9 + safekeeper/Cargo.toml | 7 + safekeeper/tests/misc_test.rs | 155 ++++ safekeeper/tests/random_test.rs | 56 ++ safekeeper/tests/simple_test.rs | 45 ++ .../tests/walproposer_sim/block_storage.rs | 57 ++ safekeeper/tests/walproposer_sim/log.rs | 77 ++ safekeeper/tests/walproposer_sim/mod.rs | 8 + .../tests/walproposer_sim/safekeeper.rs | 410 +++++++++++ .../tests/walproposer_sim/safekeeper_disk.rs | 278 +++++++ .../tests/walproposer_sim/simulation.rs | 436 +++++++++++ .../tests/walproposer_sim/simulation_logs.rs | 187 +++++ .../tests/walproposer_sim/walproposer_api.rs | 676 ++++++++++++++++++ .../tests/walproposer_sim/walproposer_disk.rs | 314 ++++++++ 34 files changed, 4603 insertions(+), 25 deletions(-) create mode 100644 libs/desim/Cargo.toml create mode 100644 libs/desim/README.md create mode 100644 libs/desim/src/chan.rs create mode 100644 libs/desim/src/executor.rs create mode 100644 libs/desim/src/lib.rs create mode 100644 libs/desim/src/network.rs create mode 100644 libs/desim/src/node_os.rs create mode 100644 libs/desim/src/options.rs create mode 100644 libs/desim/src/proto.rs create mode 100644 libs/desim/src/time.rs create mode 100644 libs/desim/src/world.rs create mode 100644 libs/desim/tests/reliable_copy_test.rs create mode 100644 safekeeper/tests/misc_test.rs create mode 100644 safekeeper/tests/random_test.rs create mode 100644 safekeeper/tests/simple_test.rs create mode 100644 safekeeper/tests/walproposer_sim/block_storage.rs create mode 100644 safekeeper/tests/walproposer_sim/log.rs create mode 100644 safekeeper/tests/walproposer_sim/mod.rs create mode 100644 safekeeper/tests/walproposer_sim/safekeeper.rs create mode 100644 safekeeper/tests/walproposer_sim/safekeeper_disk.rs create mode 100644 safekeeper/tests/walproposer_sim/simulation.rs create mode 100644 safekeeper/tests/walproposer_sim/simulation_logs.rs create mode 100644 safekeeper/tests/walproposer_sim/walproposer_api.rs create mode 100644 
safekeeper/tests/walproposer_sim/walproposer_disk.rs diff --git a/Cargo.lock b/Cargo.lock index 520163e41b..f11c774016 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1639,6 +1639,22 @@ dependencies = [ "rusticata-macros", ] +[[package]] +name = "desim" +version = "0.1.0" +dependencies = [ + "anyhow", + "bytes", + "hex", + "parking_lot 0.12.1", + "rand 0.8.5", + "scopeguard", + "smallvec", + "tracing", + "utils", + "workspace_hack", +] + [[package]] name = "diesel" version = "2.1.4" @@ -4827,6 +4843,7 @@ dependencies = [ "clap", "const_format", "crc32c", + "desim", "fail", "fs2", "futures", @@ -4842,6 +4859,7 @@ dependencies = [ "postgres_backend", "postgres_ffi", "pq_proto", + "rand 0.8.5", "regex", "remote_storage", "reqwest", @@ -4862,8 +4880,10 @@ dependencies = [ "tokio-util", "toml_edit", "tracing", + "tracing-subscriber", "url", "utils", + "walproposer", "workspace_hack", ] diff --git a/Cargo.toml b/Cargo.toml index ebc3dfa7b1..8df9ca9988 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,6 +18,7 @@ members = [ "libs/pageserver_api", "libs/postgres_ffi", "libs/safekeeper_api", + "libs/desim", "libs/utils", "libs/consumption_metrics", "libs/postgres_backend", @@ -203,6 +204,7 @@ postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" } pq_proto = { version = "0.1", path = "./libs/pq_proto/" } remote_storage = { version = "0.1", path = "./libs/remote_storage/" } safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" } +desim = { version = "0.1", path = "./libs/desim" } storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy. tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" } tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" } diff --git a/libs/desim/Cargo.toml b/libs/desim/Cargo.toml new file mode 100644 index 0000000000..6f442d8243 --- /dev/null +++ b/libs/desim/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "desim" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +anyhow.workspace = true +rand.workspace = true +tracing.workspace = true +bytes.workspace = true +utils.workspace = true +parking_lot.workspace = true +hex.workspace = true +scopeguard.workspace = true +smallvec = { workspace = true, features = ["write"] } + +workspace_hack.workspace = true diff --git a/libs/desim/README.md b/libs/desim/README.md new file mode 100644 index 0000000000..80568ebb1b --- /dev/null +++ b/libs/desim/README.md @@ -0,0 +1,7 @@ +# Discrete Event SIMulator + +This is a library for running simulations of distributed systems. The main idea is borrowed from [FoundationDB](https://www.youtube.com/watch?v=4fFDFbi3toc). + +Each node runs as a separate thread. This library has not been optimized for speed yet, but it is already much faster than running the usual integration tests in real time, because it uses virtual simulation time and can fast-forward time to skip intervals where all nodes are doing nothing but sleeping or waiting for something. + +The original purpose of this library is to test the walproposer and safekeeper implementations working together, in scenarios close to the real-world environment. This simulator is deterministic and can inject network failures without waiting minutes of wall time for a timeout to trigger, which makes it easier to find bugs in our consensus implementation compared to using integration tests.
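As a minimal illustration of the programming model the README describes, the sketch below shows what the body of a simulated node might look like. It uses only the `Chan` and `executor::yield_me` APIs introduced in the next two files, assumes they are exposed as `desim::chan` and `desim::executor`, and such closures would be handed to `Runtime::spawn`:

```rust
use desim::{chan::Chan, executor};

// Producer node: emits one message every 10 virtual milliseconds. Simulation time only
// advances while every thread is parked in yield_me(), so this costs no wall-clock time.
fn producer(out: Chan<u64>) {
    for tick in 0..100u64 {
        out.send(tick);
        executor::yield_me(10); // wake me up 10 ms of simulation time later
    }
}

// Consumer node: recv() blocks deterministically until the producer has sent something.
fn consumer(inp: Chan<u64>) {
    loop {
        if inp.recv() == 99 {
            break;
        }
    }
}
```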
diff --git a/libs/desim/src/chan.rs b/libs/desim/src/chan.rs new file mode 100644 index 0000000000..6661d59871 --- /dev/null +++ b/libs/desim/src/chan.rs @@ -0,0 +1,108 @@ +use std::{collections::VecDeque, sync::Arc}; + +use parking_lot::{Mutex, MutexGuard}; + +use crate::executor::{self, PollSome, Waker}; + +/// FIFO channel with blocking send and receive. Can be cloned and shared between threads. +/// Blocking functions should be used only from threads that are managed by the executor. +pub struct Chan { + shared: Arc>, +} + +impl Clone for Chan { + fn clone(&self) -> Self { + Chan { + shared: self.shared.clone(), + } + } +} + +impl Default for Chan { + fn default() -> Self { + Self::new() + } +} + +impl Chan { + pub fn new() -> Chan { + Chan { + shared: Arc::new(State { + queue: Mutex::new(VecDeque::new()), + waker: Waker::new(), + }), + } + } + + /// Get a message from the front of the queue, block if the queue is empty. + /// If not called from the executor thread, it can block forever. + pub fn recv(&self) -> T { + self.shared.recv() + } + + /// Panic if the queue is empty. + pub fn must_recv(&self) -> T { + self.shared + .try_recv() + .expect("message should've been ready") + } + + /// Get a message from the front of the queue, return None if the queue is empty. + /// Never blocks. + pub fn try_recv(&self) -> Option { + self.shared.try_recv() + } + + /// Send a message to the back of the queue. + pub fn send(&self, t: T) { + self.shared.send(t); + } +} + +struct State { + queue: Mutex>, + waker: Waker, +} + +impl State { + fn send(&self, t: T) { + self.queue.lock().push_back(t); + self.waker.wake_all(); + } + + fn try_recv(&self) -> Option { + let mut q = self.queue.lock(); + q.pop_front() + } + + fn recv(&self) -> T { + // interrupt the receiver to prevent consuming everything at once + executor::yield_me(0); + + let mut queue = self.queue.lock(); + if let Some(t) = queue.pop_front() { + return t; + } + loop { + self.waker.wake_me_later(); + if let Some(t) = queue.pop_front() { + return t; + } + MutexGuard::unlocked(&mut queue, || { + executor::yield_me(-1); + }); + } + } +} + +impl PollSome for Chan { + /// Schedules a wakeup for the current thread. + fn wake_me(&self) { + self.shared.waker.wake_me_later(); + } + + /// Checks if chan has any pending messages. + fn has_some(&self) -> bool { + !self.shared.queue.lock().is_empty() + } +} diff --git a/libs/desim/src/executor.rs b/libs/desim/src/executor.rs new file mode 100644 index 0000000000..9d44bd7741 --- /dev/null +++ b/libs/desim/src/executor.rs @@ -0,0 +1,483 @@ +use std::{ + panic::AssertUnwindSafe, + sync::{ + atomic::{AtomicBool, AtomicU32, AtomicU8, Ordering}, + mpsc, Arc, OnceLock, + }, + thread::JoinHandle, +}; + +use tracing::{debug, error, trace}; + +use crate::time::Timing; + +/// Stores status of the running threads. Threads are registered in the runtime upon creation +/// and deregistered upon termination. +pub struct Runtime { + // stores handles to all threads that are currently running + threads: Vec, + // stores current time and pending wakeups + clock: Arc, + // thread counter + thread_counter: AtomicU32, + // Thread step counter -- how many times all threads has been actually + // stepped (note that all world/time/executor/thread have slightly different + // meaning of steps). For observability. + pub step_counter: u64, +} + +impl Runtime { + /// Init new runtime, no running threads. 
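+    ///
+    /// The `clock` is shared with every spawned thread: it stores the current virtual time
+    /// and the wakeups scheduled through `yield_me(after_ms)`.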
+ pub fn new(clock: Arc) -> Self { + Self { + threads: Vec::new(), + clock, + thread_counter: AtomicU32::new(0), + step_counter: 0, + } + } + + /// Spawn a new thread and register it in the runtime. + pub fn spawn(&mut self, f: F) -> ExternalHandle + where + F: FnOnce() + Send + 'static, + { + let (tx, rx) = mpsc::channel(); + + let clock = self.clock.clone(); + let tid = self.thread_counter.fetch_add(1, Ordering::SeqCst); + debug!("spawning thread-{}", tid); + + let join = std::thread::spawn(move || { + let _guard = tracing::info_span!("", tid).entered(); + + let res = std::panic::catch_unwind(AssertUnwindSafe(|| { + with_thread_context(|ctx| { + assert!(ctx.clock.set(clock).is_ok()); + ctx.id.store(tid, Ordering::SeqCst); + tx.send(ctx.clone()).expect("failed to send thread context"); + // suspend thread to put it to `threads` in sleeping state + ctx.yield_me(0); + }); + + // start user-provided function + f(); + })); + debug!("thread finished"); + + if let Err(e) = res { + with_thread_context(|ctx| { + if !ctx.allow_panic.load(std::sync::atomic::Ordering::SeqCst) { + error!("thread panicked, terminating the process: {:?}", e); + std::process::exit(1); + } + + debug!("thread panicked: {:?}", e); + let mut result = ctx.result.lock(); + if result.0 == -1 { + *result = (256, format!("thread panicked: {:?}", e)); + } + }); + } + + with_thread_context(|ctx| { + ctx.finish_me(); + }); + }); + + let ctx = rx.recv().expect("failed to receive thread context"); + let handle = ThreadHandle::new(ctx.clone(), join); + + self.threads.push(handle); + + ExternalHandle { ctx } + } + + /// Returns true if there are any unfinished activity, such as running thread or pending events. + /// Otherwise returns false, which means all threads are blocked forever. + pub fn step(&mut self) -> bool { + trace!("runtime step"); + + // have we run any thread? + let mut ran = false; + + self.threads.retain(|thread: &ThreadHandle| { + let res = thread.ctx.wakeup.compare_exchange( + PENDING_WAKEUP, + NO_WAKEUP, + Ordering::SeqCst, + Ordering::SeqCst, + ); + if res.is_err() { + // thread has no pending wakeups, leaving as is + return true; + } + ran = true; + + trace!("entering thread-{}", thread.ctx.tid()); + let status = thread.step(); + self.step_counter += 1; + trace!( + "out of thread-{} with status {:?}", + thread.ctx.tid(), + status + ); + + if status == Status::Sleep { + true + } else { + trace!("thread has finished"); + // removing the thread from the list + false + } + }); + + if !ran { + trace!("no threads were run, stepping clock"); + if let Some(ctx_to_wake) = self.clock.step() { + trace!("waking up thread-{}", ctx_to_wake.tid()); + ctx_to_wake.inc_wake(); + } else { + return false; + } + } + + true + } + + /// Kill all threads. This is done by setting a flag in each thread context and waking it up. + pub fn crash_all_threads(&mut self) { + for thread in self.threads.iter() { + thread.ctx.crash_stop(); + } + + // all threads should be finished after a few steps + while !self.threads.is_empty() { + self.step(); + } + } +} + +impl Drop for Runtime { + fn drop(&mut self) { + debug!("dropping the runtime"); + self.crash_all_threads(); + } +} + +#[derive(Clone)] +pub struct ExternalHandle { + ctx: Arc, +} + +impl ExternalHandle { + /// Returns true if thread has finished execution. + pub fn is_finished(&self) -> bool { + let status = self.ctx.mutex.lock(); + *status == Status::Finished + } + + /// Returns exitcode and message, which is available after thread has finished execution. 
+ pub fn result(&self) -> (i32, String) { + let result = self.ctx.result.lock(); + result.clone() + } + + /// Returns thread id. + pub fn id(&self) -> u32 { + self.ctx.id.load(Ordering::SeqCst) + } + + /// Sets a flag to crash thread on the next wakeup. + pub fn crash_stop(&self) { + self.ctx.crash_stop(); + } +} + +struct ThreadHandle { + ctx: Arc, + _join: JoinHandle<()>, +} + +impl ThreadHandle { + /// Create a new [`ThreadHandle`] and wait until thread will enter [`Status::Sleep`] state. + fn new(ctx: Arc, join: JoinHandle<()>) -> Self { + let mut status = ctx.mutex.lock(); + // wait until thread will go into the first yield + while *status != Status::Sleep { + ctx.condvar.wait(&mut status); + } + drop(status); + + Self { ctx, _join: join } + } + + /// Allows thread to execute one step of its execution. + /// Returns [`Status`] of the thread after the step. + fn step(&self) -> Status { + let mut status = self.ctx.mutex.lock(); + assert!(matches!(*status, Status::Sleep)); + + *status = Status::Running; + self.ctx.condvar.notify_all(); + + while *status == Status::Running { + self.ctx.condvar.wait(&mut status); + } + + *status + } +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +enum Status { + /// Thread is running. + Running, + /// Waiting for event to complete, will be resumed by the executor step, once wakeup flag is set. + Sleep, + /// Thread finished execution. + Finished, +} + +const NO_WAKEUP: u8 = 0; +const PENDING_WAKEUP: u8 = 1; + +pub struct ThreadContext { + id: AtomicU32, + // used to block thread until it is woken up + mutex: parking_lot::Mutex, + condvar: parking_lot::Condvar, + // used as a flag to indicate runtime that thread is ready to be woken up + wakeup: AtomicU8, + clock: OnceLock>, + // execution result, set by exit() call + result: parking_lot::Mutex<(i32, String)>, + // determines if process should be killed on receiving panic + allow_panic: AtomicBool, + // acts as a signal that thread should crash itself on the next wakeup + crash_request: AtomicBool, +} + +impl ThreadContext { + pub(crate) fn new() -> Self { + Self { + id: AtomicU32::new(0), + mutex: parking_lot::Mutex::new(Status::Running), + condvar: parking_lot::Condvar::new(), + wakeup: AtomicU8::new(NO_WAKEUP), + clock: OnceLock::new(), + result: parking_lot::Mutex::new((-1, String::new())), + allow_panic: AtomicBool::new(false), + crash_request: AtomicBool::new(false), + } + } +} + +// Functions for executor to control thread execution. +impl ThreadContext { + /// Set atomic flag to indicate that thread is ready to be woken up. + fn inc_wake(&self) { + self.wakeup.store(PENDING_WAKEUP, Ordering::SeqCst); + } + + /// Internal function used for event queues. + pub(crate) fn schedule_wakeup(self: &Arc, after_ms: u64) { + self.clock + .get() + .unwrap() + .schedule_wakeup(after_ms, self.clone()); + } + + fn tid(&self) -> u32 { + self.id.load(Ordering::SeqCst) + } + + fn crash_stop(&self) { + let status = self.mutex.lock(); + if *status == Status::Finished { + debug!( + "trying to crash thread-{}, which is already finished", + self.tid() + ); + return; + } + assert!(matches!(*status, Status::Sleep)); + drop(status); + + self.allow_panic.store(true, Ordering::SeqCst); + self.crash_request.store(true, Ordering::SeqCst); + // set a wakeup + self.inc_wake(); + // it will panic on the next wakeup + } +} + +// Internal functions. +impl ThreadContext { + /// Blocks thread until it's woken up by the executor. If `after_ms` is 0, is will be + /// woken on the next step. 
If `after_ms` > 0, wakeup is scheduled after that time. + /// Otherwise wakeup is not scheduled inside `yield_me`, and should be arranged before + /// calling this function. + fn yield_me(self: &Arc, after_ms: i64) { + let mut status = self.mutex.lock(); + assert!(matches!(*status, Status::Running)); + + match after_ms.cmp(&0) { + std::cmp::Ordering::Less => { + // block until something wakes us up + } + std::cmp::Ordering::Equal => { + // tell executor that we are ready to be woken up + self.inc_wake(); + } + std::cmp::Ordering::Greater => { + // schedule wakeup + self.clock + .get() + .unwrap() + .schedule_wakeup(after_ms as u64, self.clone()); + } + } + + *status = Status::Sleep; + self.condvar.notify_all(); + + // wait until executor wakes us up + while *status != Status::Running { + self.condvar.wait(&mut status); + } + + if self.crash_request.load(Ordering::SeqCst) { + panic!("crashed by request"); + } + } + + /// Called only once, exactly before thread finishes execution. + fn finish_me(&self) { + let mut status = self.mutex.lock(); + assert!(matches!(*status, Status::Running)); + + *status = Status::Finished; + { + let mut result = self.result.lock(); + if result.0 == -1 { + *result = (0, "finished normally".to_owned()); + } + } + self.condvar.notify_all(); + } +} + +/// Invokes the given closure with a reference to the current thread [`ThreadContext`]. +#[inline(always)] +fn with_thread_context(f: impl FnOnce(&Arc) -> T) -> T { + thread_local!(static THREAD_DATA: Arc = Arc::new(ThreadContext::new())); + THREAD_DATA.with(f) +} + +/// Waker is used to wake up threads that are blocked on condition. +/// It keeps track of contexts [`Arc`] and can increment the counter +/// of several contexts to send a notification. +pub struct Waker { + // contexts that are waiting for a notification + contexts: parking_lot::Mutex; 8]>>, +} + +impl Default for Waker { + fn default() -> Self { + Self::new() + } +} + +impl Waker { + pub fn new() -> Self { + Self { + contexts: parking_lot::Mutex::new(smallvec::SmallVec::new()), + } + } + + /// Subscribe current thread to receive a wake notification later. + pub fn wake_me_later(&self) { + with_thread_context(|ctx| { + self.contexts.lock().push(ctx.clone()); + }); + } + + /// Wake up all threads that are waiting for a notification and clear the list. + pub fn wake_all(&self) { + let mut v = self.contexts.lock(); + for ctx in v.iter() { + ctx.inc_wake(); + } + v.clear(); + } +} + +/// See [`ThreadContext::yield_me`]. +pub fn yield_me(after_ms: i64) { + with_thread_context(|ctx| ctx.yield_me(after_ms)) +} + +/// Get current time. +pub fn now() -> u64 { + with_thread_context(|ctx| ctx.clock.get().unwrap().now()) +} + +pub fn exit(code: i32, msg: String) { + with_thread_context(|ctx| { + ctx.allow_panic.store(true, Ordering::SeqCst); + let mut result = ctx.result.lock(); + *result = (code, msg); + panic!("exit"); + }); +} + +pub(crate) fn get_thread_ctx() -> Arc { + with_thread_context(|ctx| ctx.clone()) +} + +/// Trait for polling channels until they have something. +pub trait PollSome { + /// Schedule wakeup for message arrival. + fn wake_me(&self); + + /// Check if channel has a ready message. + fn has_some(&self) -> bool; +} + +/// Blocks current thread until one of the channels has a ready message. Returns +/// index of the channel that has a message. If timeout is reached, returns None. +/// +/// Negative timeout means block forever. Zero timeout means check channels and return +/// immediately. Positive timeout means block until timeout is reached. 
+pub fn epoll_chans(chans: &[Box], timeout: i64) -> Option { + let deadline = if timeout < 0 { + 0 + } else { + now() + timeout as u64 + }; + + loop { + for chan in chans { + chan.wake_me() + } + + for (i, chan) in chans.iter().enumerate() { + if chan.has_some() { + return Some(i); + } + } + + if timeout < 0 { + // block until wakeup + yield_me(-1); + } else { + let current_time = now(); + if current_time >= deadline { + return None; + } + + yield_me((deadline - current_time) as i64); + } + } +} diff --git a/libs/desim/src/lib.rs b/libs/desim/src/lib.rs new file mode 100644 index 0000000000..14f5a885c5 --- /dev/null +++ b/libs/desim/src/lib.rs @@ -0,0 +1,8 @@ +pub mod chan; +pub mod executor; +pub mod network; +pub mod node_os; +pub mod options; +pub mod proto; +pub mod time; +pub mod world; diff --git a/libs/desim/src/network.rs b/libs/desim/src/network.rs new file mode 100644 index 0000000000..e15a714daa --- /dev/null +++ b/libs/desim/src/network.rs @@ -0,0 +1,451 @@ +use std::{ + cmp::Ordering, + collections::{BinaryHeap, VecDeque}, + fmt::{self, Debug}, + ops::DerefMut, + sync::{mpsc, Arc}, +}; + +use parking_lot::{ + lock_api::{MappedMutexGuard, MutexGuard}, + Mutex, RawMutex, +}; +use rand::rngs::StdRng; +use tracing::debug; + +use crate::{ + executor::{self, ThreadContext}, + options::NetworkOptions, + proto::NetEvent, + proto::NodeEvent, +}; + +use super::{chan::Chan, proto::AnyMessage}; + +pub struct NetworkTask { + options: Arc, + connections: Mutex>, + /// min-heap of connections having something to deliver. + events: Mutex>, + task_context: Arc, +} + +impl NetworkTask { + pub fn start_new(options: Arc, tx: mpsc::Sender>) { + let ctx = executor::get_thread_ctx(); + let task = Arc::new(Self { + options, + connections: Mutex::new(Vec::new()), + events: Mutex::new(BinaryHeap::new()), + task_context: ctx, + }); + + // send the task upstream + tx.send(task.clone()).unwrap(); + + // start the task + task.start(); + } + + pub fn start_new_connection(self: &Arc, rng: StdRng, dst_accept: Chan) -> TCP { + let now = executor::now(); + let connection_id = self.connections.lock().len(); + + let vc = VirtualConnection { + connection_id, + dst_accept, + dst_sockets: [Chan::new(), Chan::new()], + state: Mutex::new(ConnectionState { + buffers: [NetworkBuffer::new(None), NetworkBuffer::new(Some(now))], + rng, + }), + }; + vc.schedule_timeout(self); + vc.send_connect(self); + + let recv_chan = vc.dst_sockets[0].clone(); + self.connections.lock().push(vc); + + TCP { + net: self.clone(), + conn_id: connection_id, + dir: 0, + recv_chan, + } + } +} + +// private functions +impl NetworkTask { + /// Schedule to wakeup network task (self) `after_ms` later to deliver + /// messages of connection `id`. + fn schedule(&self, id: usize, after_ms: u64) { + self.events.lock().push(Event { + time: executor::now() + after_ms, + conn_id: id, + }); + self.task_context.schedule_wakeup(after_ms); + } + + /// Get locked connection `id`. 
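Editor's note: the signature above presumably reads `&[Box<dyn PollSome>] -> Option<usize>` (the generics are stripped in this rendering), which matches how the tests later in this patch call it. A hedged usage sketch, to be run from an executor-managed thread:

```rust
// Sketch: multiplexing over two channels with a simulated-time timeout.
use desim::{chan::Chan, executor::{self, PollSome}};

fn wait_either(a: &Chan<u32>, b: &Chan<u32>) -> Option<u32> {
    let chans: Vec<Box<dyn PollSome>> = vec![Box::new(a.clone()), Box::new(b.clone())];
    match executor::epoll_chans(&chans, 100) {
        Some(0) => Some(a.must_recv()), // message ready on `a`
        Some(1) => Some(b.must_recv()), // message ready on `b`
        _ => None,                      // 100 simulated ms elapsed
    }
}
```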
+ fn get(&self, id: usize) -> MappedMutexGuard<'_, RawMutex, VirtualConnection> { + MutexGuard::map(self.connections.lock(), |connections| { + connections.get_mut(id).unwrap() + }) + } + + fn collect_pending_events(&self, now: u64, vec: &mut Vec) { + vec.clear(); + let mut events = self.events.lock(); + while let Some(event) = events.peek() { + if event.time > now { + break; + } + let event = events.pop().unwrap(); + vec.push(event); + } + } + + fn start(self: &Arc) { + debug!("started network task"); + + let mut events = Vec::new(); + loop { + let now = executor::now(); + self.collect_pending_events(now, &mut events); + + for event in events.drain(..) { + let conn = self.get(event.conn_id); + conn.process(self); + } + + // block until wakeup + executor::yield_me(-1); + } + } +} + +// 0 - from node(0) to node(1) +// 1 - from node(1) to node(0) +type MessageDirection = u8; + +fn sender_str(dir: MessageDirection) -> &'static str { + match dir { + 0 => "client", + 1 => "server", + _ => unreachable!(), + } +} + +fn receiver_str(dir: MessageDirection) -> &'static str { + match dir { + 0 => "server", + 1 => "client", + _ => unreachable!(), + } +} + +/// Virtual connection between two nodes. +/// Node 0 is the creator of the connection (client), +/// and node 1 is the acceptor (server). +struct VirtualConnection { + connection_id: usize, + /// one-off chan, used to deliver Accept message to dst + dst_accept: Chan, + /// message sinks + dst_sockets: [Chan; 2], + state: Mutex, +} + +struct ConnectionState { + buffers: [NetworkBuffer; 2], + rng: StdRng, +} + +impl VirtualConnection { + /// Notify the future about the possible timeout. + fn schedule_timeout(&self, net: &NetworkTask) { + if let Some(timeout) = net.options.keepalive_timeout { + net.schedule(self.connection_id, timeout); + } + } + + /// Send the handshake (Accept) to the server. + fn send_connect(&self, net: &NetworkTask) { + let now = executor::now(); + let mut state = self.state.lock(); + let delay = net.options.connect_delay.delay(&mut state.rng); + let buffer = &mut state.buffers[0]; + assert!(buffer.buf.is_empty()); + assert!(!buffer.recv_closed); + assert!(!buffer.send_closed); + assert!(buffer.last_recv.is_none()); + + let delay = if let Some(ms) = delay { + ms + } else { + debug!("NET: TCP #{} dropped connect", self.connection_id); + buffer.send_closed = true; + return; + }; + + // Send a message into the future. + buffer + .buf + .push_back((now + delay, AnyMessage::InternalConnect)); + net.schedule(self.connection_id, delay); + } + + /// Transmit some of the messages from the buffer to the nodes. + fn process(&self, net: &Arc) { + let now = executor::now(); + + let mut state = self.state.lock(); + + for direction in 0..2 { + self.process_direction( + net, + state.deref_mut(), + now, + direction as MessageDirection, + &self.dst_sockets[direction ^ 1], + ); + } + + // Close the one side of the connection by timeout if the node + // has not received any messages for a long time. 
+ if let Some(timeout) = net.options.keepalive_timeout { + let mut to_close = [false, false]; + for direction in 0..2 { + let buffer = &mut state.buffers[direction]; + if buffer.recv_closed { + continue; + } + if let Some(last_recv) = buffer.last_recv { + if now - last_recv >= timeout { + debug!( + "NET: connection {} timed out at {}", + self.connection_id, + receiver_str(direction as MessageDirection) + ); + let node_idx = direction ^ 1; + to_close[node_idx] = true; + } + } + } + drop(state); + + for (node_idx, should_close) in to_close.iter().enumerate() { + if *should_close { + self.close(node_idx); + } + } + } + } + + /// Process messages in the buffer in the given direction. + fn process_direction( + &self, + net: &Arc, + state: &mut ConnectionState, + now: u64, + direction: MessageDirection, + to_socket: &Chan, + ) { + let buffer = &mut state.buffers[direction as usize]; + if buffer.recv_closed { + assert!(buffer.buf.is_empty()); + } + + while !buffer.buf.is_empty() && buffer.buf.front().unwrap().0 <= now { + let msg = buffer.buf.pop_front().unwrap().1; + + buffer.last_recv = Some(now); + self.schedule_timeout(net); + + if let AnyMessage::InternalConnect = msg { + // TODO: assert to_socket is the server + let server_to_client = TCP { + net: net.clone(), + conn_id: self.connection_id, + dir: direction ^ 1, + recv_chan: to_socket.clone(), + }; + // special case, we need to deliver new connection to a separate channel + self.dst_accept.send(NodeEvent::Accept(server_to_client)); + } else { + to_socket.send(NetEvent::Message(msg)); + } + } + } + + /// Try to send a message to the buffer, optionally dropping it and + /// determining delivery timestamp. + fn send(&self, net: &NetworkTask, direction: MessageDirection, msg: AnyMessage) { + let now = executor::now(); + let mut state = self.state.lock(); + + let (delay, close) = if let Some(ms) = net.options.send_delay.delay(&mut state.rng) { + (ms, false) + } else { + (0, true) + }; + + let buffer = &mut state.buffers[direction as usize]; + if buffer.send_closed { + debug!( + "NET: TCP #{} dropped message {:?} (broken pipe)", + self.connection_id, msg + ); + return; + } + + if close { + debug!( + "NET: TCP #{} dropped message {:?} (pipe just broke)", + self.connection_id, msg + ); + buffer.send_closed = true; + return; + } + + if buffer.recv_closed { + debug!( + "NET: TCP #{} dropped message {:?} (recv closed)", + self.connection_id, msg + ); + return; + } + + // Send a message into the future. + buffer.buf.push_back((now + delay, msg)); + net.schedule(self.connection_id, delay); + } + + /// Close the connection. Only one side of the connection will be closed, + /// and no further messages will be delivered. The other side will not be notified. + fn close(&self, node_idx: usize) { + let mut state = self.state.lock(); + let recv_buffer = &mut state.buffers[1 ^ node_idx]; + if recv_buffer.recv_closed { + debug!( + "NET: TCP #{} closed twice at {}", + self.connection_id, + sender_str(node_idx as MessageDirection), + ); + return; + } + + debug!( + "NET: TCP #{} closed at {}", + self.connection_id, + sender_str(node_idx as MessageDirection), + ); + recv_buffer.recv_closed = true; + for msg in recv_buffer.buf.drain(..) { + debug!( + "NET: TCP #{} dropped message {:?} (closed)", + self.connection_id, msg + ); + } + + let send_buffer = &mut state.buffers[node_idx]; + send_buffer.send_closed = true; + drop(state); + + // TODO: notify the other side? 
+ + self.dst_sockets[node_idx].send(NetEvent::Closed); + } +} + +struct NetworkBuffer { + /// Messages paired with time of delivery + buf: VecDeque<(u64, AnyMessage)>, + /// True if the connection is closed on the receiving side, + /// i.e. no more messages from the buffer will be delivered. + recv_closed: bool, + /// True if the connection is closed on the sending side, + /// i.e. no more messages will be added to the buffer. + send_closed: bool, + /// Last time a message was delivered from the buffer. + /// If None, it means that the server is the receiver and + /// it has not yet aware of this connection (i.e. has not + /// received the Accept). + last_recv: Option, +} + +impl NetworkBuffer { + fn new(last_recv: Option) -> Self { + Self { + buf: VecDeque::new(), + recv_closed: false, + send_closed: false, + last_recv, + } + } +} + +/// Single end of a bidirectional network stream without reordering (TCP-like). +/// Reads are implemented using channels, writes go to the buffer inside VirtualConnection. +pub struct TCP { + net: Arc, + conn_id: usize, + dir: MessageDirection, + recv_chan: Chan, +} + +impl Debug for TCP { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "TCP #{} ({})", self.conn_id, sender_str(self.dir),) + } +} + +impl TCP { + /// Send a message to the other side. It's guaranteed that it will not arrive + /// before the arrival of all messages sent earlier. + pub fn send(&self, msg: AnyMessage) { + let conn = self.net.get(self.conn_id); + conn.send(&self.net, self.dir, msg); + } + + /// Get a channel to receive incoming messages. + pub fn recv_chan(&self) -> Chan { + self.recv_chan.clone() + } + + pub fn connection_id(&self) -> usize { + self.conn_id + } + + pub fn close(&self) { + let conn = self.net.get(self.conn_id); + conn.close(self.dir as usize); + } +} +struct Event { + time: u64, + conn_id: usize, +} + +// BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here +// to get that. +impl PartialOrd for Event { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for Event { + fn cmp(&self, other: &Self) -> Ordering { + (other.time, other.conn_id).cmp(&(self.time, self.conn_id)) + } +} + +impl PartialEq for Event { + fn eq(&self, other: &Self) -> bool { + (other.time, other.conn_id) == (self.time, self.conn_id) + } +} + +impl Eq for Event {} diff --git a/libs/desim/src/node_os.rs b/libs/desim/src/node_os.rs new file mode 100644 index 0000000000..7744a9f5e1 --- /dev/null +++ b/libs/desim/src/node_os.rs @@ -0,0 +1,54 @@ +use std::sync::Arc; + +use rand::Rng; + +use crate::proto::NodeEvent; + +use super::{ + chan::Chan, + network::TCP, + world::{Node, NodeId, World}, +}; + +/// Abstraction with all functions (aka syscalls) available to the node. +#[derive(Clone)] +pub struct NodeOs { + world: Arc, + internal: Arc, +} + +impl NodeOs { + pub fn new(world: Arc, internal: Arc) -> NodeOs { + NodeOs { world, internal } + } + + /// Get the node id. + pub fn id(&self) -> NodeId { + self.internal.id + } + + /// Opens a bidirectional connection with the other node. Always successful. + pub fn open_tcp(&self, dst: NodeId) -> TCP { + self.world.open_tcp(dst) + } + + /// Returns a channel to receive node events (socket Accept and internal messages). + pub fn node_events(&self) -> Chan { + self.internal.node_events() + } + + /// Get current time. + pub fn now(&self) -> u64 { + self.world.now() + } + + /// Generate a random number in range [0, max). 
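Editor's note: a minimal sketch of what a node thread can do with these primitives; everything used here (`open_tcp`, `recv_chan`, `log_event`) is introduced in this patch, though the stripped generic parameters are assumed.

```rust
// Sketch: a node-side "ping" that opens a connection, sends one message and
// logs whatever comes back. Intended to run inside `Node::launch`.
use desim::{
    node_os::NodeOs,
    proto::{AnyMessage, NetEvent},
    world::NodeId,
};

fn ping(os: NodeOs, dst: NodeId) {
    let tcp = os.open_tcp(dst);
    tcp.send(AnyMessage::Just32(1));
    match tcp.recv_chan().recv() {
        NetEvent::Message(reply) => os.log_event(format!("reply: {:?}", reply)),
        NetEvent::Closed => os.log_event("connection closed".to_owned()),
    }
}
```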
+ pub fn random(&self, max: u64) -> u64 { + self.internal.rng.lock().gen_range(0..max) + } + + /// Append a new event to the world event log. + pub fn log_event(&self, data: String) { + self.internal.log_event(data) + } +} diff --git a/libs/desim/src/options.rs b/libs/desim/src/options.rs new file mode 100644 index 0000000000..5da7c2c482 --- /dev/null +++ b/libs/desim/src/options.rs @@ -0,0 +1,50 @@ +use rand::{rngs::StdRng, Rng}; + +/// Describes random delays and failures. Delay will be uniformly distributed in [min, max]. +/// Connection failure will occur with the probablity fail_prob. +#[derive(Clone, Debug)] +pub struct Delay { + pub min: u64, + pub max: u64, + pub fail_prob: f64, // [0; 1] +} + +impl Delay { + /// Create a struct with no delay, no failures. + pub fn empty() -> Delay { + Delay { + min: 0, + max: 0, + fail_prob: 0.0, + } + } + + /// Create a struct with a fixed delay. + pub fn fixed(ms: u64) -> Delay { + Delay { + min: ms, + max: ms, + fail_prob: 0.0, + } + } + + /// Generate a random delay in range [min, max]. Return None if the + /// message should be dropped. + pub fn delay(&self, rng: &mut StdRng) -> Option { + if rng.gen_bool(self.fail_prob) { + return None; + } + Some(rng.gen_range(self.min..=self.max)) + } +} + +/// Describes network settings. All network packets will be subjected to the same delays and failures. +#[derive(Clone, Debug)] +pub struct NetworkOptions { + /// Connection will be automatically closed after this timeout if no data is received. + pub keepalive_timeout: Option, + /// New connections will be delayed by this amount of time. + pub connect_delay: Delay, + /// Each message will be delayed by this amount of time. + pub send_delay: Delay, +} diff --git a/libs/desim/src/proto.rs b/libs/desim/src/proto.rs new file mode 100644 index 0000000000..92a7e8a27d --- /dev/null +++ b/libs/desim/src/proto.rs @@ -0,0 +1,63 @@ +use std::fmt::Debug; + +use bytes::Bytes; +use utils::lsn::Lsn; + +use crate::{network::TCP, world::NodeId}; + +/// Internal node events. +#[derive(Debug)] +pub enum NodeEvent { + Accept(TCP), + Internal(AnyMessage), +} + +/// Events that are coming from a network socket. +#[derive(Clone, Debug)] +pub enum NetEvent { + Message(AnyMessage), + Closed, +} + +/// Custom events generated throughout the simulation. Can be used by the test to verify the correctness. +#[derive(Debug)] +pub struct SimEvent { + pub time: u64, + pub node: NodeId, + pub data: String, +} + +/// Umbrella type for all possible flavours of messages. These events can be sent over network +/// or to an internal node events channel. +#[derive(Clone)] +pub enum AnyMessage { + /// Not used, empty placeholder. + None, + /// Used internally for notifying node about new incoming connection. 
+ InternalConnect, + Just32(u32), + ReplCell(ReplCell), + Bytes(Bytes), + LSN(u64), +} + +impl Debug for AnyMessage { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + AnyMessage::None => write!(f, "None"), + AnyMessage::InternalConnect => write!(f, "InternalConnect"), + AnyMessage::Just32(v) => write!(f, "Just32({})", v), + AnyMessage::ReplCell(v) => write!(f, "ReplCell({:?})", v), + AnyMessage::Bytes(v) => write!(f, "Bytes({})", hex::encode(v)), + AnyMessage::LSN(v) => write!(f, "LSN({})", Lsn(*v)), + } + } +} + +/// Used in reliable_copy_test.rs +#[derive(Clone, Debug)] +pub struct ReplCell { + pub value: u32, + pub client_id: u32, + pub seqno: u32, +} diff --git a/libs/desim/src/time.rs b/libs/desim/src/time.rs new file mode 100644 index 0000000000..7bb71db95c --- /dev/null +++ b/libs/desim/src/time.rs @@ -0,0 +1,129 @@ +use std::{ + cmp::Ordering, + collections::BinaryHeap, + ops::DerefMut, + sync::{ + atomic::{AtomicU32, AtomicU64}, + Arc, + }, +}; + +use parking_lot::Mutex; +use tracing::trace; + +use crate::executor::ThreadContext; + +/// Holds current time and all pending wakeup events. +pub struct Timing { + /// Current world's time. + current_time: AtomicU64, + /// Pending timers. + queue: Mutex>, + /// Global nonce. Makes picking events from binary heap queue deterministic + /// by appending a number to events with the same timestamp. + nonce: AtomicU32, + /// Used to schedule fake events. + fake_context: Arc, +} + +impl Default for Timing { + fn default() -> Self { + Self::new() + } +} + +impl Timing { + /// Create a new empty clock with time set to 0. + pub fn new() -> Timing { + Timing { + current_time: AtomicU64::new(0), + queue: Mutex::new(BinaryHeap::new()), + nonce: AtomicU32::new(0), + fake_context: Arc::new(ThreadContext::new()), + } + } + + /// Return the current world's time. + pub fn now(&self) -> u64 { + self.current_time.load(std::sync::atomic::Ordering::SeqCst) + } + + /// Tick-tock the global clock. Return the event ready to be processed + /// or move the clock forward and then return the event. + pub(crate) fn step(&self) -> Option> { + let mut queue = self.queue.lock(); + + if queue.is_empty() { + // no future events + return None; + } + + if !self.is_event_ready(queue.deref_mut()) { + let next_time = queue.peek().unwrap().time; + self.current_time + .store(next_time, std::sync::atomic::Ordering::SeqCst); + trace!("rewind time to {}", next_time); + assert!(self.is_event_ready(queue.deref_mut())); + } + + Some(queue.pop().unwrap().wake_context) + } + + /// Append an event to the queue, to wakeup the thread in `ms` milliseconds. + pub(crate) fn schedule_wakeup(&self, ms: u64, wake_context: Arc) { + self.nonce.fetch_add(1, std::sync::atomic::Ordering::SeqCst); + let nonce = self.nonce.load(std::sync::atomic::Ordering::SeqCst); + self.queue.lock().push(Pending { + time: self.now() + ms, + nonce, + wake_context, + }) + } + + /// Append a fake event to the queue, to prevent clocks from skipping this time. + pub fn schedule_fake(&self, ms: u64) { + self.queue.lock().push(Pending { + time: self.now() + ms, + nonce: 0, + wake_context: self.fake_context.clone(), + }); + } + + /// Return true if there is a ready event. + fn is_event_ready(&self, queue: &mut BinaryHeap) -> bool { + queue.peek().map_or(false, |x| x.time <= self.now()) + } + + /// Clear all pending events. 
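Editor's note: for reference, a lossy-network configuration built from the `Delay`/`NetworkOptions` types in `options.rs` above, mirroring the settings the reliable-copy test later in this patch uses.

```rust
// Sketch: every hop takes 1..=60 simulated ms, 40% of sends/connects are
// dropped, and idle connections are closed after 50 ms without traffic.
use desim::options::{Delay, NetworkOptions};

fn lossy_network() -> NetworkOptions {
    let delay = Delay { min: 1, max: 60, fail_prob: 0.4 };
    NetworkOptions {
        keepalive_timeout: Some(50),
        connect_delay: delay.clone(),
        send_delay: delay,
    }
}
```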
+ pub(crate) fn clear(&self) { + self.queue.lock().clear(); + } +} + +struct Pending { + time: u64, + nonce: u32, + wake_context: Arc, +} + +// BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here +// to get that. +impl PartialOrd for Pending { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for Pending { + fn cmp(&self, other: &Self) -> Ordering { + (other.time, other.nonce).cmp(&(self.time, self.nonce)) + } +} + +impl PartialEq for Pending { + fn eq(&self, other: &Self) -> bool { + (other.time, other.nonce) == (self.time, self.nonce) + } +} + +impl Eq for Pending {} diff --git a/libs/desim/src/world.rs b/libs/desim/src/world.rs new file mode 100644 index 0000000000..7d60be04b5 --- /dev/null +++ b/libs/desim/src/world.rs @@ -0,0 +1,180 @@ +use parking_lot::Mutex; +use rand::{rngs::StdRng, SeedableRng}; +use std::{ + ops::DerefMut, + sync::{mpsc, Arc}, +}; + +use crate::{ + executor::{ExternalHandle, Runtime}, + network::NetworkTask, + options::NetworkOptions, + proto::{NodeEvent, SimEvent}, + time::Timing, +}; + +use super::{chan::Chan, network::TCP, node_os::NodeOs}; + +pub type NodeId = u32; + +/// World contains simulation state. +pub struct World { + nodes: Mutex>>, + /// Random number generator. + rng: Mutex, + /// Internal event log. + events: Mutex>, + /// Separate task that processes all network messages. + network_task: Arc, + /// Runtime for running threads and moving time. + runtime: Mutex, + /// To get current time. + timing: Arc, +} + +impl World { + pub fn new(seed: u64, options: Arc) -> World { + let timing = Arc::new(Timing::new()); + let mut runtime = Runtime::new(timing.clone()); + + let (tx, rx) = mpsc::channel(); + + runtime.spawn(move || { + // create and start network background thread, and send it back via the channel + NetworkTask::start_new(options, tx) + }); + + // wait for the network task to start + while runtime.step() {} + + let network_task = rx.recv().unwrap(); + + World { + nodes: Mutex::new(Vec::new()), + rng: Mutex::new(StdRng::seed_from_u64(seed)), + events: Mutex::new(Vec::new()), + network_task, + runtime: Mutex::new(runtime), + timing, + } + } + + pub fn step(&self) -> bool { + self.runtime.lock().step() + } + + pub fn get_thread_step_count(&self) -> u64 { + self.runtime.lock().step_counter + } + + /// Create a new random number generator. + pub fn new_rng(&self) -> StdRng { + let mut rng = self.rng.lock(); + StdRng::from_rng(rng.deref_mut()).unwrap() + } + + /// Create a new node. + pub fn new_node(self: &Arc) -> Arc { + let mut nodes = self.nodes.lock(); + let id = nodes.len() as NodeId; + let node = Arc::new(Node::new(id, self.clone(), self.new_rng())); + nodes.push(node.clone()); + node + } + + /// Get an internal node state by id. + fn get_node(&self, id: NodeId) -> Option> { + let nodes = self.nodes.lock(); + let num = id as usize; + if num < nodes.len() { + Some(nodes[num].clone()) + } else { + None + } + } + + pub fn stop_all(&self) { + self.runtime.lock().crash_all_threads(); + } + + /// Returns a writable end of a TCP connection, to send src->dst messages. + pub fn open_tcp(self: &Arc, dst: NodeId) -> TCP { + // TODO: replace unwrap() with /dev/null socket. + let dst = self.get_node(dst).unwrap(); + let dst_accept = dst.node_events.lock().clone(); + + let rng = self.new_rng(); + self.network_task.start_new_connection(rng, dst_accept) + } + + /// Get current time. + pub fn now(&self) -> u64 { + self.timing.now() + } + + /// Get a copy of the internal clock. 
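Editor's note: both `Event` in `network.rs` and `Pending` here reverse `Ord` so that `std::collections::BinaryHeap` (a max-heap) pops the smallest delivery time first. The same trick in isolation:

```rust
// Standalone sketch of the reversed-Ord min-heap trick used by `Event` and `Pending`.
use std::{cmp::Ordering, collections::BinaryHeap};

#[derive(PartialEq, Eq)]
struct ByTime(u64);

impl Ord for ByTime {
    fn cmp(&self, other: &Self) -> Ordering {
        other.0.cmp(&self.0) // deliberately reversed
    }
}

impl PartialOrd for ByTime {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

fn main() {
    let mut heap = BinaryHeap::from([ByTime(30), ByTime(10), ByTime(20)]);
    assert_eq!(heap.pop().map(|e| e.0), Some(10)); // earliest timestamp pops first
}
```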
+ pub fn clock(&self) -> Arc { + self.timing.clone() + } + + pub fn add_event(&self, node: NodeId, data: String) { + let time = self.now(); + self.events.lock().push(SimEvent { time, node, data }); + } + + pub fn take_events(&self) -> Vec { + let mut events = self.events.lock(); + let mut res = Vec::new(); + std::mem::swap(&mut res, &mut events); + res + } + + pub fn deallocate(&self) { + self.stop_all(); + self.timing.clear(); + self.nodes.lock().clear(); + } +} + +/// Internal node state. +pub struct Node { + pub id: NodeId, + node_events: Mutex>, + world: Arc, + pub(crate) rng: Mutex, +} + +impl Node { + pub fn new(id: NodeId, world: Arc, rng: StdRng) -> Node { + Node { + id, + node_events: Mutex::new(Chan::new()), + world, + rng: Mutex::new(rng), + } + } + + /// Spawn a new thread with this node context. + pub fn launch(self: &Arc, f: impl FnOnce(NodeOs) + Send + 'static) -> ExternalHandle { + let node = self.clone(); + let world = self.world.clone(); + self.world.runtime.lock().spawn(move || { + f(NodeOs::new(world, node.clone())); + }) + } + + /// Returns a channel to receive Accepts and internal messages. + pub fn node_events(&self) -> Chan { + self.node_events.lock().clone() + } + + /// This will drop all in-flight Accept messages. + pub fn replug_node_events(&self, chan: Chan) { + *self.node_events.lock() = chan; + } + + /// Append event to the world's log. + pub fn log_event(&self, data: String) { + self.world.add_event(self.id, data) + } +} diff --git a/libs/desim/tests/reliable_copy_test.rs b/libs/desim/tests/reliable_copy_test.rs new file mode 100644 index 0000000000..cf7bff8f5a --- /dev/null +++ b/libs/desim/tests/reliable_copy_test.rs @@ -0,0 +1,244 @@ +//! Simple test to verify that simulator is working. +#[cfg(test)] +mod reliable_copy_test { + use anyhow::Result; + use desim::executor::{self, PollSome}; + use desim::options::{Delay, NetworkOptions}; + use desim::proto::{NetEvent, NodeEvent, ReplCell}; + use desim::world::{NodeId, World}; + use desim::{node_os::NodeOs, proto::AnyMessage}; + use parking_lot::Mutex; + use std::sync::Arc; + use tracing::info; + + /// Disk storage trait and implementation. + pub trait Storage { + fn flush_pos(&self) -> u32; + fn flush(&mut self) -> Result<()>; + fn write(&mut self, t: T); + } + + #[derive(Clone)] + pub struct SharedStorage { + pub state: Arc>>, + } + + impl SharedStorage { + pub fn new() -> Self { + Self { + state: Arc::new(Mutex::new(InMemoryStorage::new())), + } + } + } + + impl Storage for SharedStorage { + fn flush_pos(&self) -> u32 { + self.state.lock().flush_pos + } + + fn flush(&mut self) -> Result<()> { + executor::yield_me(0); + self.state.lock().flush() + } + + fn write(&mut self, t: T) { + executor::yield_me(0); + self.state.lock().write(t); + } + } + + pub struct InMemoryStorage { + pub data: Vec, + pub flush_pos: u32, + } + + impl InMemoryStorage { + pub fn new() -> Self { + Self { + data: Vec::new(), + flush_pos: 0, + } + } + + pub fn flush(&mut self) -> Result<()> { + self.flush_pos = self.data.len() as u32; + Ok(()) + } + + pub fn write(&mut self, t: T) { + self.data.push(t); + } + } + + /// Server implementation. 
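Editor's note: putting the pieces together, a minimal two-node simulation using only APIs introduced in this patch (`World::new`, `Node::launch`, `NodeOs::open_tcp`); generic parameters and exact bounds are assumed where this rendering strips them.

```rust
// Sketch: one client, one server, one message, driven entirely by world.step().
use std::sync::Arc;
use desim::{
    options::{Delay, NetworkOptions},
    proto::{AnyMessage, NodeEvent},
    world::World,
};

fn tiny_world() {
    let net = NetworkOptions {
        keepalive_timeout: None,
        connect_delay: Delay::fixed(1),
        send_delay: Delay::fixed(1),
    };
    let world = Arc::new(World::new(1337, Arc::new(net)));
    let client = world.new_node();
    let server = world.new_node();
    let server_id = server.id;

    server.launch(|os| {
        // the Accept event carries the server end of the new connection
        if let NodeEvent::Accept(tcp) = os.node_events().recv() {
            tcp.send(AnyMessage::Just32(7));
        }
    });
    client.launch(move |os| {
        let tcp = os.open_tcp(server_id);
        os.log_event(format!("reply: {:?}", tcp.recv_chan().recv()));
    });

    while world.step() && world.now() < 1_000 {}
    world.deallocate();
}
```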
+ pub fn run_server(os: NodeOs, mut storage: Box>) { + info!("started server"); + + let node_events = os.node_events(); + let mut epoll_vec: Vec> = vec![Box::new(node_events.clone())]; + let mut sockets = vec![]; + + loop { + let index = executor::epoll_chans(&epoll_vec, -1).unwrap(); + + if index == 0 { + let node_event = node_events.must_recv(); + info!("got node event: {:?}", node_event); + if let NodeEvent::Accept(tcp) = node_event { + tcp.send(AnyMessage::Just32(storage.flush_pos())); + epoll_vec.push(Box::new(tcp.recv_chan())); + sockets.push(tcp); + } + continue; + } + + let recv_chan = sockets[index - 1].recv_chan(); + let socket = &sockets[index - 1]; + + let event = recv_chan.must_recv(); + info!("got event: {:?}", event); + if let NetEvent::Message(AnyMessage::ReplCell(cell)) = event { + if cell.seqno != storage.flush_pos() { + info!("got out of order data: {:?}", cell); + continue; + } + storage.write(cell.value); + storage.flush().unwrap(); + socket.send(AnyMessage::Just32(storage.flush_pos())); + } + } + } + + /// Client copies all data from array to the remote node. + pub fn run_client(os: NodeOs, data: &[ReplCell], dst: NodeId) { + info!("started client"); + + let mut delivered = 0; + + let mut sock = os.open_tcp(dst); + let mut recv_chan = sock.recv_chan(); + + while delivered < data.len() { + let num = &data[delivered]; + info!("sending data: {:?}", num.clone()); + sock.send(AnyMessage::ReplCell(num.clone())); + + // loop { + let event = recv_chan.recv(); + match event { + NetEvent::Message(AnyMessage::Just32(flush_pos)) => { + if flush_pos == 1 + delivered as u32 { + delivered += 1; + } + } + NetEvent::Closed => { + info!("connection closed, reestablishing"); + sock = os.open_tcp(dst); + recv_chan = sock.recv_chan(); + } + _ => {} + } + + // } + } + + let sock = os.open_tcp(dst); + for num in data { + info!("sending data: {:?}", num.clone()); + sock.send(AnyMessage::ReplCell(num.clone())); + } + + info!("sent all data and finished client"); + } + + /// Run test simulations. 
+ #[test] + fn sim_example_reliable_copy() { + utils::logging::init( + utils::logging::LogFormat::Test, + utils::logging::TracingErrorLayerEnablement::Disabled, + utils::logging::Output::Stdout, + ) + .expect("logging init failed"); + + let delay = Delay { + min: 1, + max: 60, + fail_prob: 0.4, + }; + + let network = NetworkOptions { + keepalive_timeout: Some(50), + connect_delay: delay.clone(), + send_delay: delay.clone(), + }; + + for seed in 0..20 { + let u32_data: [u32; 5] = [1, 2, 3, 4, 5]; + let data = u32_to_cells(&u32_data, 1); + let world = Arc::new(World::new(seed, Arc::new(network.clone()))); + + start_simulation(Options { + world, + time_limit: 1_000_000, + client_fn: Box::new(move |os, server_id| run_client(os, &data, server_id)), + u32_data, + }); + } + } + + pub struct Options { + pub world: Arc, + pub time_limit: u64, + pub u32_data: [u32; 5], + pub client_fn: Box, + } + + pub fn start_simulation(options: Options) { + let world = options.world; + + let client_node = world.new_node(); + let server_node = world.new_node(); + let server_id = server_node.id; + + // start the client thread + client_node.launch(move |os| { + let client_fn = options.client_fn; + client_fn(os, server_id); + }); + + // start the server thread + let shared_storage = SharedStorage::new(); + let server_storage = shared_storage.clone(); + server_node.launch(move |os| run_server(os, Box::new(server_storage))); + + while world.step() && world.now() < options.time_limit {} + + let disk_data = shared_storage.state.lock().data.clone(); + assert!(verify_data(&disk_data, &options.u32_data[..])); + } + + pub fn u32_to_cells(data: &[u32], client_id: u32) -> Vec { + let mut res = Vec::new(); + for (i, _) in data.iter().enumerate() { + res.push(ReplCell { + client_id, + seqno: i as u32, + value: data[i], + }); + } + res + } + + fn verify_data(disk_data: &[u32], data: &[u32]) -> bool { + if disk_data.len() != data.len() { + return false; + } + for i in 0..data.len() { + if disk_data[i] != data[i] { + return false; + } + } + true + } +} diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index a863fad269..977653848d 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -431,11 +431,11 @@ pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result anyhow::Result<()> { println!("cargo:rustc-link-lib=static=walproposer"); println!("cargo:rustc-link-search={walproposer_lib_search_str}"); + // Rebuild crate when libwalproposer.a changes + println!("cargo:rerun-if-changed={walproposer_lib_search_str}/libwalproposer.a"); + let pg_config_bin = pg_install_abs.join("v16").join("bin").join("pg_config"); let inc_server_path: String = if pg_config_bin.exists() { let output = Command::new(pg_config_bin) @@ -79,6 +82,7 @@ fn main() -> anyhow::Result<()> { .allowlist_function("WalProposerBroadcast") .allowlist_function("WalProposerPoll") .allowlist_function("WalProposerFree") + .allowlist_function("SafekeeperStateDesiredEvents") .allowlist_var("DEBUG5") .allowlist_var("DEBUG4") .allowlist_var("DEBUG3") diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs index 1f7bf952dc..8317e2fa03 100644 --- a/libs/walproposer/src/api_bindings.rs +++ b/libs/walproposer/src/api_bindings.rs @@ -22,6 +22,7 @@ use crate::bindings::WalProposerExecStatusType; use crate::bindings::WalproposerShmemState; use crate::bindings::XLogRecPtr; use crate::walproposer::ApiImpl; +use crate::walproposer::StreamingCallback; use 
crate::walproposer::WaitResult; extern "C" fn get_shmem_state(wp: *mut WalProposer) -> *mut WalproposerShmemState { @@ -36,7 +37,8 @@ extern "C" fn start_streaming(wp: *mut WalProposer, startpos: XLogRecPtr) { unsafe { let callback_data = (*(*wp).config).callback_data; let api = callback_data as *mut Box; - (*api).start_streaming(startpos) + let callback = StreamingCallback::new(wp); + (*api).start_streaming(startpos, &callback); } } @@ -134,19 +136,18 @@ extern "C" fn conn_async_read( unsafe { let callback_data = (*(*(*sk).wp).config).callback_data; let api = callback_data as *mut Box; - let (res, result) = (*api).conn_async_read(&mut (*sk)); // This function has guarantee that returned buf will be valid until // the next call. So we can store a Vec in each Safekeeper and reuse // it on the next call. let mut inbuf = take_vec_u8(&mut (*sk).inbuf).unwrap_or_default(); - inbuf.clear(); - inbuf.extend_from_slice(res); + + let result = (*api).conn_async_read(&mut (*sk), &mut inbuf); // Put a Vec back to sk->inbuf and return data ptr. + *amount = inbuf.len() as i32; *buf = store_vec_u8(&mut (*sk).inbuf, inbuf); - *amount = res.len() as i32; result } @@ -182,6 +183,10 @@ extern "C" fn recovery_download(wp: *mut WalProposer, sk: *mut Safekeeper) -> bo unsafe { let callback_data = (*(*(*sk).wp).config).callback_data; let api = callback_data as *mut Box; + + // currently `recovery_download` is always called right after election + (*api).after_election(&mut (*wp)); + (*api).recovery_download(&mut (*wp), &mut (*sk)) } } @@ -277,7 +282,8 @@ extern "C" fn wait_event_set( } WaitResult::Timeout => { *event_sk = std::ptr::null_mut(); - *events = crate::bindings::WL_TIMEOUT; + // WaitEventSetWait returns 0 for timeout. + *events = 0; 0 } WaitResult::Network(sk, event_mask) => { @@ -340,7 +346,7 @@ extern "C" fn log_internal( } } -#[derive(Debug)] +#[derive(Debug, PartialEq)] pub enum Level { Debug5, Debug4, diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs index 8ab8fb1a07..13fade220c 100644 --- a/libs/walproposer/src/walproposer.rs +++ b/libs/walproposer/src/walproposer.rs @@ -1,13 +1,13 @@ use std::ffi::CString; use postgres_ffi::WAL_SEGMENT_SIZE; -use utils::id::TenantTimelineId; +use utils::{id::TenantTimelineId, lsn::Lsn}; use crate::{ api_bindings::{create_api, take_vec_u8, Level}, bindings::{ - NeonWALReadResult, Safekeeper, WalProposer, WalProposerConfig, WalProposerCreate, - WalProposerFree, WalProposerStart, + NeonWALReadResult, Safekeeper, WalProposer, WalProposerBroadcast, WalProposerConfig, + WalProposerCreate, WalProposerFree, WalProposerPoll, WalProposerStart, }, }; @@ -16,11 +16,11 @@ use crate::{ /// /// Refer to `pgxn/neon/walproposer.h` for documentation. 
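Editor's note: the C-to-Rust glue above now passes the caller's reusable buffer into `conn_async_read` and hands `start_streaming` a `StreamingCallback`; the trait that backs these callbacks follows below. A hedged stub of the new contract, assuming the remaining callbacks keep the `todo!()` defaults shown in the trait and that the crate paths are `walproposer::walproposer` / `walproposer::bindings` as used elsewhere in this patch (`MockApi` is purely illustrative):

```rust
use utils::lsn::Lsn;
use walproposer::bindings::{
    PGAsyncReadResult, PGAsyncReadResult_PG_ASYNC_READ_SUCCESS, Safekeeper,
};
use walproposer::walproposer::{ApiImpl, StreamingCallback};

struct MockApi;

impl ApiImpl for MockApi {
    // New signature: streaming is driven through the StreamingCallback handle
    // instead of calling the bindings directly.
    fn start_streaming(&self, startpos: u64, callback: &StreamingCallback) {
        callback.broadcast(Lsn(startpos), Lsn(startpos)); // nothing new to send yet
        callback.poll();
    }

    // New signature: received bytes are appended into the caller-provided buffer,
    // which walproposer reuses across calls via sk->inbuf.
    fn conn_async_read(
        &self,
        _sk: &mut Safekeeper,
        buf: &mut Vec<u8>,
    ) -> PGAsyncReadResult {
        buf.extend_from_slice(b"mock reply"); // placeholder payload
        PGAsyncReadResult_PG_ASYNC_READ_SUCCESS
    }
}
```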
pub trait ApiImpl { - fn get_shmem_state(&self) -> &mut crate::bindings::WalproposerShmemState { + fn get_shmem_state(&self) -> *mut crate::bindings::WalproposerShmemState { todo!() } - fn start_streaming(&self, _startpos: u64) { + fn start_streaming(&self, _startpos: u64, _callback: &StreamingCallback) { todo!() } @@ -70,7 +70,11 @@ pub trait ApiImpl { todo!() } - fn conn_async_read(&self, _sk: &mut Safekeeper) -> (&[u8], crate::bindings::PGAsyncReadResult) { + fn conn_async_read( + &self, + _sk: &mut Safekeeper, + _vec: &mut Vec, + ) -> crate::bindings::PGAsyncReadResult { todo!() } @@ -151,12 +155,14 @@ pub trait ApiImpl { } } +#[derive(Debug)] pub enum WaitResult { Latch, Timeout, Network(*mut Safekeeper, u32), } +#[derive(Clone)] pub struct Config { /// Tenant and timeline id pub ttid: TenantTimelineId, @@ -242,6 +248,24 @@ impl Drop for Wrapper { } } +pub struct StreamingCallback { + wp: *mut WalProposer, +} + +impl StreamingCallback { + pub fn new(wp: *mut WalProposer) -> StreamingCallback { + StreamingCallback { wp } + } + + pub fn broadcast(&self, startpos: Lsn, endpos: Lsn) { + unsafe { WalProposerBroadcast(self.wp, startpos.0, endpos.0) } + } + + pub fn poll(&self) { + unsafe { WalProposerPoll(self.wp) } + } +} + #[cfg(test)] mod tests { use core::panic; @@ -344,14 +368,13 @@ mod tests { fn conn_async_read( &self, _: &mut crate::bindings::Safekeeper, - ) -> (&[u8], crate::bindings::PGAsyncReadResult) { + vec: &mut Vec, + ) -> crate::bindings::PGAsyncReadResult { println!("conn_async_read"); let reply = self.next_safekeeper_reply(); println!("conn_async_read result: {:?}", reply); - ( - reply, - crate::bindings::PGAsyncReadResult_PG_ASYNC_READ_SUCCESS, - ) + vec.extend_from_slice(reply); + crate::bindings::PGAsyncReadResult_PG_ASYNC_READ_SUCCESS } fn conn_blocking_write(&self, _: &mut crate::bindings::Safekeeper, buf: &[u8]) -> bool { diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 93d1dcab35..12ceac0191 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -346,7 +346,7 @@ impl WalIngest { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; if info == pg_constants::XLOG_LOGICAL_MESSAGE { - let xlrec = XlLogicalMessage::decode(&mut buf); + let xlrec = crate::walrecord::XlLogicalMessage::decode(&mut buf); let prefix = std::str::from_utf8(&buf[0..xlrec.prefix_size - 1])?; let message = &buf[xlrec.prefix_size..xlrec.prefix_size + xlrec.message_size]; if prefix == "neon-test" { diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 171af7d2aa..0d5007ef73 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -688,7 +688,7 @@ RecvAcceptorGreeting(Safekeeper *sk) if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse)) return; - wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s", sk->host, sk->port); + wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s, term=" INT64_FORMAT, sk->host, sk->port, sk->greetResponse.term); /* Protocol is all good, move to voting. 
*/ sk->state = SS_VOTING; @@ -922,6 +922,7 @@ static void DetermineEpochStartLsn(WalProposer *wp) { TermHistory *dth; + int n_ready = 0; wp->propEpochStartLsn = InvalidXLogRecPtr; wp->donorEpoch = 0; @@ -932,6 +933,8 @@ DetermineEpochStartLsn(WalProposer *wp) { if (wp->safekeeper[i].state == SS_IDLE) { + n_ready++; + if (GetEpoch(&wp->safekeeper[i]) > wp->donorEpoch || (GetEpoch(&wp->safekeeper[i]) == wp->donorEpoch && wp->safekeeper[i].voteResponse.flushLsn > wp->propEpochStartLsn)) @@ -958,6 +961,16 @@ DetermineEpochStartLsn(WalProposer *wp) } } + if (n_ready < wp->quorum) + { + /* + * This is a rare case that can be triggered if safekeeper has voted and disconnected. + * In this case, its state will not be SS_IDLE and its vote cannot be used, because + * we clean up `voteResponse` in `ShutdownConnection`. + */ + wp_log(FATAL, "missing majority of votes, collected %d, expected %d, got %d", wp->n_votes, wp->quorum, n_ready); + } + /* * If propEpochStartLsn is 0, it means flushLsn is 0 everywhere, we are bootstrapping * and nothing was committed yet. Start streaming then from the basebackup LSN. diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 688d8e6e52..53820f6e1b 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -486,6 +486,8 @@ typedef struct walproposer_api * * On success, the data is placed in *buf. It is valid until the next call * to this function. + * + * Returns PG_ASYNC_READ_FAIL on closed connection. */ PGAsyncReadResult (*conn_async_read) (Safekeeper *sk, char **buf, int *amount); @@ -532,6 +534,13 @@ typedef struct walproposer_api * Returns 0 if timeout is reached, 1 if some event happened. Updates * events mask to indicate events and sets sk to the safekeeper which has * an event. + * + * On timeout, events is set to WL_NO_EVENTS. On socket event, events is + * set to WL_SOCKET_READABLE and/or WL_SOCKET_WRITEABLE. When socket is + * closed, events is set to WL_SOCKET_READABLE. + * + * WL_SOCKET_WRITEABLE is usually set only when we need to flush the buffer. + * It can be returned only if caller asked for this event in the last *_event_set call. */ int (*wait_event_set) (WalProposer *wp, long timeout, Safekeeper **sk, uint32 *events); diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 364cad7892..cb4a1def1f 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -61,3 +61,10 @@ tokio-stream.workspace = true utils.workspace = true workspace_hack.workspace = true + +[dev-dependencies] +walproposer.workspace = true +rand.workspace = true +desim.workspace = true +tracing.workspace = true +tracing-subscriber = { workspace = true, features = ["json"] } diff --git a/safekeeper/tests/misc_test.rs b/safekeeper/tests/misc_test.rs new file mode 100644 index 0000000000..8e5b17a143 --- /dev/null +++ b/safekeeper/tests/misc_test.rs @@ -0,0 +1,155 @@ +use std::sync::Arc; + +use tracing::{info, warn}; +use utils::lsn::Lsn; + +use crate::walproposer_sim::{ + log::{init_logger, init_tracing_logger}, + simulation::{generate_network_opts, generate_schedule, Schedule, TestAction, TestConfig}, +}; + +pub mod walproposer_sim; + +// Test that simulation supports restarting (crashing) safekeepers. 
+#[test] +fn crash_safekeeper() { + let clock = init_logger(); + let config = TestConfig::new(Some(clock)); + let test = config.start(1337); + + let lsn = test.sync_safekeepers().unwrap(); + assert_eq!(lsn, Lsn(0)); + info!("Sucessfully synced empty safekeepers at 0/0"); + + let mut wp = test.launch_walproposer(lsn); + + // Write some WAL and crash safekeeper 0 without waiting for replication. + test.poll_for_duration(30); + wp.write_tx(3); + test.servers[0].restart(); + + // Wait some time, so that walproposer can reconnect. + test.poll_for_duration(2000); +} + +// Test that walproposer can be crashed (stopped). +#[test] +fn test_simple_restart() { + let clock = init_logger(); + let config = TestConfig::new(Some(clock)); + let test = config.start(1337); + + let lsn = test.sync_safekeepers().unwrap(); + assert_eq!(lsn, Lsn(0)); + info!("Sucessfully synced empty safekeepers at 0/0"); + + let mut wp = test.launch_walproposer(lsn); + + test.poll_for_duration(30); + wp.write_tx(3); + test.poll_for_duration(100); + + wp.stop(); + drop(wp); + + let lsn = test.sync_safekeepers().unwrap(); + info!("Sucessfully synced safekeepers at {}", lsn); +} + +// Test runnning a simple schedule, restarting everything a several times. +#[test] +fn test_simple_schedule() -> anyhow::Result<()> { + let clock = init_logger(); + let mut config = TestConfig::new(Some(clock)); + config.network.keepalive_timeout = Some(100); + let test = config.start(1337); + + let schedule: Schedule = vec![ + (0, TestAction::RestartWalProposer), + (50, TestAction::WriteTx(5)), + (100, TestAction::RestartSafekeeper(0)), + (100, TestAction::WriteTx(5)), + (110, TestAction::RestartSafekeeper(1)), + (110, TestAction::WriteTx(5)), + (120, TestAction::RestartSafekeeper(2)), + (120, TestAction::WriteTx(5)), + (201, TestAction::RestartWalProposer), + (251, TestAction::RestartSafekeeper(0)), + (251, TestAction::RestartSafekeeper(1)), + (251, TestAction::RestartSafekeeper(2)), + (251, TestAction::WriteTx(5)), + (255, TestAction::WriteTx(5)), + (1000, TestAction::WriteTx(5)), + ]; + + test.run_schedule(&schedule)?; + info!("Test finished, stopping all threads"); + test.world.deallocate(); + + Ok(()) +} + +// Test that simulation can process 10^4 transactions. +#[test] +fn test_many_tx() -> anyhow::Result<()> { + let clock = init_logger(); + let config = TestConfig::new(Some(clock)); + let test = config.start(1337); + + let mut schedule: Schedule = vec![]; + for i in 0..100 { + schedule.push((i * 10, TestAction::WriteTx(100))); + } + + test.run_schedule(&schedule)?; + info!("Test finished, stopping all threads"); + test.world.stop_all(); + + let events = test.world.take_events(); + info!("Events: {:?}", events); + let last_commit_lsn = events + .iter() + .filter_map(|event| { + if event.data.starts_with("commit_lsn;") { + let lsn: u64 = event.data.split(';').nth(1).unwrap().parse().unwrap(); + return Some(lsn); + } + None + }) + .last() + .unwrap(); + + let initdb_lsn = 21623024; + let diff = last_commit_lsn - initdb_lsn; + info!("Last commit lsn: {}, diff: {}", last_commit_lsn, diff); + // each tx is at least 8 bytes, it's written a 100 times for in a loop for 100 times + assert!(diff > 100 * 100 * 8); + Ok(()) +} + +// Checks that we don't have nasty circular dependencies, preventing Arc from deallocating. +// This test doesn't really assert anything, you need to run it manually to check if there +// is any issue. 
+#[test] +fn test_res_dealloc() -> anyhow::Result<()> { + let clock = init_tracing_logger(true); + let mut config = TestConfig::new(Some(clock)); + + let seed = 123456; + config.network = generate_network_opts(seed); + let test = config.start(seed); + warn!("Running test with seed {}", seed); + + let schedule = generate_schedule(seed); + info!("schedule: {:?}", schedule); + test.run_schedule(&schedule).unwrap(); + test.world.stop_all(); + + let world = test.world.clone(); + drop(test); + info!("world strong count: {}", Arc::strong_count(&world)); + world.deallocate(); + info!("world strong count: {}", Arc::strong_count(&world)); + + Ok(()) +} diff --git a/safekeeper/tests/random_test.rs b/safekeeper/tests/random_test.rs new file mode 100644 index 0000000000..6c6f6a8c96 --- /dev/null +++ b/safekeeper/tests/random_test.rs @@ -0,0 +1,56 @@ +use rand::Rng; +use tracing::{info, warn}; + +use crate::walproposer_sim::{ + log::{init_logger, init_tracing_logger}, + simulation::{generate_network_opts, generate_schedule, TestConfig}, + simulation_logs::validate_events, +}; + +pub mod walproposer_sim; + +// Generates 2000 random seeds and runs a schedule for each of them. +// If you seed this test fail, please report the last seed to the +// @safekeeper team. +#[test] +fn test_random_schedules() -> anyhow::Result<()> { + let clock = init_logger(); + let mut config = TestConfig::new(Some(clock)); + + for _ in 0..2000 { + let seed: u64 = rand::thread_rng().gen(); + config.network = generate_network_opts(seed); + + let test = config.start(seed); + warn!("Running test with seed {}", seed); + + let schedule = generate_schedule(seed); + test.run_schedule(&schedule).unwrap(); + validate_events(test.world.take_events()); + test.world.deallocate(); + } + + Ok(()) +} + +// After you found a seed that fails, you can insert this seed here +// and run the test to see the full debug output. +#[test] +fn test_one_schedule() -> anyhow::Result<()> { + let clock = init_tracing_logger(true); + let mut config = TestConfig::new(Some(clock)); + + let seed = 11047466935058776390; + config.network = generate_network_opts(seed); + info!("network: {:?}", config.network); + let test = config.start(seed); + warn!("Running test with seed {}", seed); + + let schedule = generate_schedule(seed); + info!("schedule: {:?}", schedule); + test.run_schedule(&schedule).unwrap(); + validate_events(test.world.take_events()); + test.world.deallocate(); + + Ok(()) +} diff --git a/safekeeper/tests/simple_test.rs b/safekeeper/tests/simple_test.rs new file mode 100644 index 0000000000..0be9d0deef --- /dev/null +++ b/safekeeper/tests/simple_test.rs @@ -0,0 +1,45 @@ +use tracing::info; +use utils::lsn::Lsn; + +use crate::walproposer_sim::{log::init_logger, simulation::TestConfig}; + +pub mod walproposer_sim; + +// Check that first start of sync_safekeepers() returns 0/0 on empty safekeepers. +#[test] +fn sync_empty_safekeepers() { + let clock = init_logger(); + let config = TestConfig::new(Some(clock)); + let test = config.start(1337); + + let lsn = test.sync_safekeepers().unwrap(); + assert_eq!(lsn, Lsn(0)); + info!("Sucessfully synced empty safekeepers at 0/0"); + + let lsn = test.sync_safekeepers().unwrap(); + assert_eq!(lsn, Lsn(0)); + info!("Sucessfully synced (again) empty safekeepers at 0/0"); +} + +// Check that there are no panics when we are writing and streaming WAL to safekeepers. 
+#[test] +fn run_walproposer_generate_wal() { + let clock = init_logger(); + let config = TestConfig::new(Some(clock)); + let test = config.start(1337); + + let lsn = test.sync_safekeepers().unwrap(); + assert_eq!(lsn, Lsn(0)); + info!("Sucessfully synced empty safekeepers at 0/0"); + + let mut wp = test.launch_walproposer(lsn); + + // wait for walproposer to start + test.poll_for_duration(30); + + // just write some WAL + for _ in 0..100 { + wp.write_tx(1); + test.poll_for_duration(5); + } +} diff --git a/safekeeper/tests/walproposer_sim/block_storage.rs b/safekeeper/tests/walproposer_sim/block_storage.rs new file mode 100644 index 0000000000..468c02ad2f --- /dev/null +++ b/safekeeper/tests/walproposer_sim/block_storage.rs @@ -0,0 +1,57 @@ +use std::collections::HashMap; + +const BLOCK_SIZE: usize = 8192; + +/// A simple in-memory implementation of a block storage. Can be used to implement external +/// storage in tests. +pub struct BlockStorage { + blocks: HashMap, +} + +impl Default for BlockStorage { + fn default() -> Self { + Self::new() + } +} + +impl BlockStorage { + pub fn new() -> Self { + BlockStorage { + blocks: HashMap::new(), + } + } + + pub fn read(&self, pos: u64, buf: &mut [u8]) { + let mut buf_offset = 0; + let mut storage_pos = pos; + while buf_offset < buf.len() { + let block_id = storage_pos / BLOCK_SIZE as u64; + let block = self.blocks.get(&block_id).unwrap_or(&[0; BLOCK_SIZE]); + let block_offset = storage_pos % BLOCK_SIZE as u64; + let block_len = BLOCK_SIZE as u64 - block_offset; + let buf_len = buf.len() - buf_offset; + let copy_len = std::cmp::min(block_len as usize, buf_len); + buf[buf_offset..buf_offset + copy_len] + .copy_from_slice(&block[block_offset as usize..block_offset as usize + copy_len]); + buf_offset += copy_len; + storage_pos += copy_len as u64; + } + } + + pub fn write(&mut self, pos: u64, buf: &[u8]) { + let mut buf_offset = 0; + let mut storage_pos = pos; + while buf_offset < buf.len() { + let block_id = storage_pos / BLOCK_SIZE as u64; + let block = self.blocks.entry(block_id).or_insert([0; BLOCK_SIZE]); + let block_offset = storage_pos % BLOCK_SIZE as u64; + let block_len = BLOCK_SIZE as u64 - block_offset; + let buf_len = buf.len() - buf_offset; + let copy_len = std::cmp::min(block_len as usize, buf_len); + block[block_offset as usize..block_offset as usize + copy_len] + .copy_from_slice(&buf[buf_offset..buf_offset + copy_len]); + buf_offset += copy_len; + storage_pos += copy_len as u64 + } + } +} diff --git a/safekeeper/tests/walproposer_sim/log.rs b/safekeeper/tests/walproposer_sim/log.rs new file mode 100644 index 0000000000..870f30de4f --- /dev/null +++ b/safekeeper/tests/walproposer_sim/log.rs @@ -0,0 +1,77 @@ +use std::{fmt, sync::Arc}; + +use desim::time::Timing; +use once_cell::sync::OnceCell; +use parking_lot::Mutex; +use tracing_subscriber::fmt::{format::Writer, time::FormatTime}; + +/// SimClock can be plugged into tracing logger to print simulation time. 
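Editor's note: since `BlockStorage` splits reads and writes across fixed 8192-byte blocks, a round-trip across a block boundary is the quickest way to see the offset arithmetic. A small sketch (the type lives in the test module above, so no imports are needed there):

```rust
// Sketch: write 100 bytes straddling the first block boundary, then read them back.
fn block_storage_roundtrip() {
    let mut storage = BlockStorage::new();
    let payload = vec![0xAB_u8; 100];
    storage.write(8150, &payload);     // spans blocks 0 and 1
    let mut out = vec![0u8; 100];
    storage.read(8150, &mut out);      // unwritten blocks read back as zeros
    assert_eq!(out, payload);
}
```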
+#[derive(Clone)] +pub struct SimClock { + clock_ptr: Arc>>>, +} + +impl Default for SimClock { + fn default() -> Self { + SimClock { + clock_ptr: Arc::new(Mutex::new(None)), + } + } +} + +impl SimClock { + pub fn set_clock(&self, clock: Arc) { + *self.clock_ptr.lock() = Some(clock); + } +} + +impl FormatTime for SimClock { + fn format_time(&self, w: &mut Writer<'_>) -> fmt::Result { + let clock = self.clock_ptr.lock(); + + if let Some(clock) = clock.as_ref() { + let now = clock.now(); + write!(w, "[{}]", now) + } else { + write!(w, "[?]") + } + } +} + +static LOGGING_DONE: OnceCell = OnceCell::new(); + +/// Returns ptr to clocks attached to tracing logger to update them when the +/// world is (re)created. +pub fn init_tracing_logger(debug_enabled: bool) -> SimClock { + LOGGING_DONE + .get_or_init(|| { + let clock = SimClock::default(); + let base_logger = tracing_subscriber::fmt() + .with_target(false) + // prefix log lines with simulated time timestamp + .with_timer(clock.clone()) + // .with_ansi(true) TODO + .with_max_level(match debug_enabled { + true => tracing::Level::DEBUG, + false => tracing::Level::WARN, + }) + .with_writer(std::io::stdout); + base_logger.init(); + + // logging::replace_panic_hook_with_tracing_panic_hook().forget(); + + if !debug_enabled { + std::panic::set_hook(Box::new(|_| {})); + } + + clock + }) + .clone() +} + +pub fn init_logger() -> SimClock { + // RUST_TRACEBACK envvar controls whether we print all logs or only warnings. + let debug_enabled = std::env::var("RUST_TRACEBACK").is_ok(); + + init_tracing_logger(debug_enabled) +} diff --git a/safekeeper/tests/walproposer_sim/mod.rs b/safekeeper/tests/walproposer_sim/mod.rs new file mode 100644 index 0000000000..ec560dcb3b --- /dev/null +++ b/safekeeper/tests/walproposer_sim/mod.rs @@ -0,0 +1,8 @@ +pub mod block_storage; +pub mod log; +pub mod safekeeper; +pub mod safekeeper_disk; +pub mod simulation; +pub mod simulation_logs; +pub mod walproposer_api; +pub mod walproposer_disk; diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs new file mode 100644 index 0000000000..1945b9d0cb --- /dev/null +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -0,0 +1,410 @@ +//! Safekeeper communication endpoint to WAL proposer (compute node). +//! Gets messages from the network, passes them down to consensus module and +//! sends replies back. + +use std::{collections::HashMap, sync::Arc, time::Duration}; + +use anyhow::{bail, Result}; +use bytes::{Bytes, BytesMut}; +use camino::Utf8PathBuf; +use desim::{ + executor::{self, PollSome}, + network::TCP, + node_os::NodeOs, + proto::{AnyMessage, NetEvent, NodeEvent}, +}; +use hyper::Uri; +use safekeeper::{ + safekeeper::{ProposerAcceptorMessage, SafeKeeper, ServerInfo, UNKNOWN_SERVER_VERSION}, + state::TimelinePersistentState, + timeline::TimelineError, + wal_storage::Storage, + SafeKeeperConf, +}; +use tracing::{debug, info_span}; +use utils::{ + id::{NodeId, TenantId, TenantTimelineId, TimelineId}, + lsn::Lsn, +}; + +use super::safekeeper_disk::{DiskStateStorage, DiskWALStorage, SafekeeperDisk, TimelineDisk}; + +struct SharedState { + sk: SafeKeeper, + disk: Arc, +} + +struct GlobalMap { + timelines: HashMap, + conf: SafeKeeperConf, + disk: Arc, +} + +impl GlobalMap { + /// Restores global state from disk. 
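Because a tracing subscriber can be installed only once per process, the OnceCell above always hands back the same SimClock, and each test simply re-points it at the clock of the world it is about to run. A sketch of that setup, mirroring what TestConfig::start does later in this patch; new_logged_world is a hypothetical helper:

```rust
use std::sync::Arc;

use desim::{options::NetworkOptions, world::World};

use crate::walproposer_sim::log::init_tracing_logger;

// Hypothetical helper: install the subscriber (a no-op after the first call) and
// attach the shared SimClock to the new world's simulated clock.
fn new_logged_world(seed: u64, network: NetworkOptions) -> Arc<World> {
    let clock = init_tracing_logger(true);
    let world = Arc::new(World::new(seed, Arc::new(network)));
    clock.set_clock(world.clock());
    world
}
```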
+ fn new(disk: Arc, conf: SafeKeeperConf) -> Result { + let mut timelines = HashMap::new(); + + for (&ttid, disk) in disk.timelines.lock().iter() { + debug!("loading timeline {}", ttid); + let state = disk.state.lock().clone(); + + if state.server.wal_seg_size == 0 { + bail!(TimelineError::UninitializedWalSegSize(ttid)); + } + + if state.server.pg_version == UNKNOWN_SERVER_VERSION { + bail!(TimelineError::UninitialinzedPgVersion(ttid)); + } + + if state.commit_lsn < state.local_start_lsn { + bail!( + "commit_lsn {} is higher than local_start_lsn {}", + state.commit_lsn, + state.local_start_lsn + ); + } + + let control_store = DiskStateStorage::new(disk.clone()); + let wal_store = DiskWALStorage::new(disk.clone(), &control_store)?; + + let sk = SafeKeeper::new(control_store, wal_store, conf.my_id)?; + timelines.insert( + ttid, + SharedState { + sk, + disk: disk.clone(), + }, + ); + } + + Ok(Self { + timelines, + conf, + disk, + }) + } + + fn create(&mut self, ttid: TenantTimelineId, server_info: ServerInfo) -> Result<()> { + if self.timelines.contains_key(&ttid) { + bail!("timeline {} already exists", ttid); + } + + debug!("creating new timeline {}", ttid); + + let commit_lsn = Lsn::INVALID; + let local_start_lsn = Lsn::INVALID; + + let state = + TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn); + + if state.server.wal_seg_size == 0 { + bail!(TimelineError::UninitializedWalSegSize(ttid)); + } + + if state.server.pg_version == UNKNOWN_SERVER_VERSION { + bail!(TimelineError::UninitialinzedPgVersion(ttid)); + } + + if state.commit_lsn < state.local_start_lsn { + bail!( + "commit_lsn {} is higher than local_start_lsn {}", + state.commit_lsn, + state.local_start_lsn + ); + } + + let disk_timeline = self.disk.put_state(&ttid, state); + let control_store = DiskStateStorage::new(disk_timeline.clone()); + let wal_store = DiskWALStorage::new(disk_timeline.clone(), &control_store)?; + + let sk = SafeKeeper::new(control_store, wal_store, self.conf.my_id)?; + + self.timelines.insert( + ttid, + SharedState { + sk, + disk: disk_timeline, + }, + ); + Ok(()) + } + + fn get(&mut self, ttid: &TenantTimelineId) -> &mut SharedState { + self.timelines.get_mut(ttid).expect("timeline must exist") + } + + fn has_tli(&self, ttid: &TenantTimelineId) -> bool { + self.timelines.contains_key(ttid) + } +} + +/// State of a single connection to walproposer. 
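Both GlobalMap::new and GlobalMap::create above refuse a timeline whose server info is incomplete, so any test-created timeline needs a ServerInfo with a non-zero wal_seg_size and a known pg_version. A hypothetical fixture with illustrative values (the numbers are not taken from this patch):

```rust
use safekeeper::safekeeper::ServerInfo;

// Hypothetical fixture: a ServerInfo that passes the checks in GlobalMap::new()/create().
fn test_server_info() -> ServerInfo {
    ServerInfo {
        pg_version: 160000,             // assumed PostgreSQL 16 style version number
        system_id: 0,
        wal_seg_size: 16 * 1024 * 1024, // the usual 16 MiB WAL segment size
    }
}
```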
+struct ConnState { + tcp: TCP, + + greeting: bool, + ttid: TenantTimelineId, + flush_pending: bool, + + runtime: tokio::runtime::Runtime, +} + +pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { + let _enter = info_span!("safekeeper", id = os.id()).entered(); + debug!("started server"); + os.log_event("started;safekeeper".to_owned()); + let conf = SafeKeeperConf { + workdir: Utf8PathBuf::from("."), + my_id: NodeId(os.id() as u64), + listen_pg_addr: String::new(), + listen_http_addr: String::new(), + no_sync: false, + broker_endpoint: "/".parse::().unwrap(), + broker_keepalive_interval: Duration::from_secs(0), + heartbeat_timeout: Duration::from_secs(0), + remote_storage: None, + max_offloader_lag_bytes: 0, + wal_backup_enabled: false, + listen_pg_addr_tenant_only: None, + advertise_pg_addr: None, + availability_zone: None, + peer_recovery_enabled: false, + backup_parallel_jobs: 0, + pg_auth: None, + pg_tenant_only_auth: None, + http_auth: None, + current_thread_runtime: false, + }; + + let mut global = GlobalMap::new(disk, conf.clone())?; + let mut conns: HashMap = HashMap::new(); + + for (&_ttid, shared_state) in global.timelines.iter_mut() { + let flush_lsn = shared_state.sk.wal_store.flush_lsn(); + let commit_lsn = shared_state.sk.state.commit_lsn; + os.log_event(format!("tli_loaded;{};{}", flush_lsn.0, commit_lsn.0)); + } + + let node_events = os.node_events(); + let mut epoll_vec: Vec> = vec![]; + let mut epoll_idx: Vec = vec![]; + + // TODO: batch events processing (multiple events per tick) + loop { + epoll_vec.clear(); + epoll_idx.clear(); + + // node events channel + epoll_vec.push(Box::new(node_events.clone())); + epoll_idx.push(0); + + // tcp connections + for conn in conns.values() { + epoll_vec.push(Box::new(conn.tcp.recv_chan())); + epoll_idx.push(conn.tcp.connection_id()); + } + + // waiting for the next message + let index = executor::epoll_chans(&epoll_vec, -1).unwrap(); + + if index == 0 { + // got a new connection + match node_events.must_recv() { + NodeEvent::Accept(tcp) => { + conns.insert( + tcp.connection_id(), + ConnState { + tcp, + greeting: false, + ttid: TenantTimelineId::empty(), + flush_pending: false, + runtime: tokio::runtime::Builder::new_current_thread().build()?, + }, + ); + } + NodeEvent::Internal(_) => unreachable!(), + } + continue; + } + + let connection_id = epoll_idx[index]; + let conn = conns.get_mut(&connection_id).unwrap(); + let mut next_event = Some(conn.tcp.recv_chan().must_recv()); + + loop { + let event = match next_event { + Some(event) => event, + None => break, + }; + + match event { + NetEvent::Message(msg) => { + let res = conn.process_any(msg, &mut global); + if res.is_err() { + debug!("conn {:?} error: {:#}", connection_id, res.unwrap_err()); + conns.remove(&connection_id); + break; + } + } + NetEvent::Closed => { + // TODO: remove from conns? + } + } + + next_event = conn.tcp.recv_chan().try_recv(); + } + + conns.retain(|_, conn| { + let res = conn.flush(&mut global); + if res.is_err() { + debug!("conn {:?} error: {:?}", conn.tcp, res); + } + res.is_ok() + }); + } +} + +impl ConnState { + /// Process a message from the network. It can be START_REPLICATION request or a valid ProposerAcceptorMessage message. 
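Each accepted connection above carries its own current-thread tokio runtime: the simulation is synchronous and single-threaded, but SafeKeeper::process_msg is async, so replies are produced by driving that future to completion with block_on (see process_sk_msg further down). The pattern in isolation, as a minimal sketch:

```rust
// Minimal sketch of the per-connection runtime pattern used by ConnState.
fn block_on_sketch() -> anyhow::Result<u64> {
    let runtime = tokio::runtime::Builder::new_current_thread().build()?;
    // Stand-in for `runtime.block_on(shared_state.sk.process_msg(msg))`.
    let reply = runtime.block_on(async { 42u64 });
    Ok(reply)
}
```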
+ fn process_any(&mut self, any: AnyMessage, global: &mut GlobalMap) -> Result<()> { + if let AnyMessage::Bytes(copy_data) = any { + let repl_prefix = b"START_REPLICATION "; + if !self.greeting && copy_data.starts_with(repl_prefix) { + self.process_start_replication(copy_data.slice(repl_prefix.len()..), global)?; + bail!("finished processing START_REPLICATION") + } + + let msg = ProposerAcceptorMessage::parse(copy_data)?; + debug!("got msg: {:?}", msg); + self.process(msg, global) + } else { + bail!("unexpected message, expected AnyMessage::Bytes"); + } + } + + /// Process START_REPLICATION request. + fn process_start_replication( + &mut self, + copy_data: Bytes, + global: &mut GlobalMap, + ) -> Result<()> { + // format is " " + let str = String::from_utf8(copy_data.to_vec())?; + + let mut parts = str.split(' '); + let tenant_id = parts.next().unwrap().parse::()?; + let timeline_id = parts.next().unwrap().parse::()?; + let start_lsn = parts.next().unwrap().parse::()?; + let end_lsn = parts.next().unwrap().parse::()?; + + let ttid = TenantTimelineId::new(tenant_id, timeline_id); + let shared_state = global.get(&ttid); + + // read bytes from start_lsn to end_lsn + let mut buf = vec![0; (end_lsn - start_lsn) as usize]; + shared_state.disk.wal.lock().read(start_lsn, &mut buf); + + // send bytes to the client + self.tcp.send(AnyMessage::Bytes(Bytes::from(buf))); + Ok(()) + } + + /// Get or create a timeline. + fn init_timeline( + &mut self, + ttid: TenantTimelineId, + server_info: ServerInfo, + global: &mut GlobalMap, + ) -> Result<()> { + self.ttid = ttid; + if global.has_tli(&ttid) { + return Ok(()); + } + + global.create(ttid, server_info) + } + + /// Process a ProposerAcceptorMessage. + fn process(&mut self, msg: ProposerAcceptorMessage, global: &mut GlobalMap) -> Result<()> { + if !self.greeting { + self.greeting = true; + + match msg { + ProposerAcceptorMessage::Greeting(ref greeting) => { + tracing::info!( + "start handshake with walproposer {:?} {:?}", + self.tcp, + greeting + ); + let server_info = ServerInfo { + pg_version: greeting.pg_version, + system_id: greeting.system_id, + wal_seg_size: greeting.wal_seg_size, + }; + let ttid = TenantTimelineId::new(greeting.tenant_id, greeting.timeline_id); + self.init_timeline(ttid, server_info, global)? + } + _ => { + bail!("unexpected message {msg:?} instead of greeting"); + } + } + } + + let tli = global.get(&self.ttid); + + match msg { + ProposerAcceptorMessage::AppendRequest(append_request) => { + self.flush_pending = true; + self.process_sk_msg( + tli, + &ProposerAcceptorMessage::NoFlushAppendRequest(append_request), + )?; + } + other => { + self.process_sk_msg(tli, &other)?; + } + } + + Ok(()) + } + + /// Process FlushWAL if needed. 
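The START_REPLICATION payload parsed above is a plain space-separated string; in this simulator the two positions are the raw numbers that recovery_download (later in this patch) writes with {}. A stand-alone sketch of that format; parse_start_replication is hypothetical and not part of the patch:

```rust
use anyhow::Context;

// Hypothetical parser for the simulator's START_REPLICATION payload:
// "<tenant_id> <timeline_id> <start> <end>", space separated.
fn parse_start_replication(payload: &str) -> anyhow::Result<(String, String, u64, u64)> {
    let mut parts = payload.split(' ');
    let tenant = parts.next().context("missing tenant_id")?.to_owned();
    let timeline = parts.next().context("missing timeline_id")?.to_owned();
    let start: u64 = parts.next().context("missing start")?.parse()?;
    let end: u64 = parts.next().context("missing end")?.parse()?;
    Ok((tenant, timeline, start, end))
}
```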
+ fn flush(&mut self, global: &mut GlobalMap) -> Result<()> { + // TODO: try to add extra flushes in simulation, to verify that extra flushes don't break anything + if !self.flush_pending { + return Ok(()); + } + self.flush_pending = false; + let shared_state = global.get(&self.ttid); + self.process_sk_msg(shared_state, &ProposerAcceptorMessage::FlushWAL) + } + + /// Make safekeeper process a message and send a reply to the TCP + fn process_sk_msg( + &mut self, + shared_state: &mut SharedState, + msg: &ProposerAcceptorMessage, + ) -> Result<()> { + let mut reply = self.runtime.block_on(shared_state.sk.process_msg(msg))?; + if let Some(reply) = &mut reply { + // TODO: if this is AppendResponse, fill in proper hot standby feedback and disk consistent lsn + + let mut buf = BytesMut::with_capacity(128); + reply.serialize(&mut buf)?; + + self.tcp.send(AnyMessage::Bytes(buf.into())); + } + Ok(()) + } +} + +impl Drop for ConnState { + fn drop(&mut self) { + debug!("dropping conn: {:?}", self.tcp); + if !std::thread::panicking() { + self.tcp.close(); + } + // TODO: clean up non-fsynced WAL + } +} diff --git a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs new file mode 100644 index 0000000000..35bca325aa --- /dev/null +++ b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs @@ -0,0 +1,278 @@ +use std::collections::HashMap; +use std::sync::Arc; + +use parking_lot::Mutex; +use safekeeper::state::TimelinePersistentState; +use utils::id::TenantTimelineId; + +use super::block_storage::BlockStorage; + +use std::{ops::Deref, time::Instant}; + +use anyhow::Result; +use bytes::{Buf, BytesMut}; +use futures::future::BoxFuture; +use postgres_ffi::{waldecoder::WalStreamDecoder, XLogSegNo}; +use safekeeper::{control_file, metrics::WalStorageMetrics, wal_storage}; +use tracing::{debug, info}; +use utils::lsn::Lsn; + +/// All safekeeper state that is usually saved to disk. +pub struct SafekeeperDisk { + pub timelines: Mutex>>, +} + +impl Default for SafekeeperDisk { + fn default() -> Self { + Self::new() + } +} + +impl SafekeeperDisk { + pub fn new() -> Self { + SafekeeperDisk { + timelines: Mutex::new(HashMap::new()), + } + } + + pub fn put_state( + &self, + ttid: &TenantTimelineId, + state: TimelinePersistentState, + ) -> Arc { + self.timelines + .lock() + .entry(*ttid) + .and_modify(|e| { + let mut mu = e.state.lock(); + *mu = state.clone(); + }) + .or_insert_with(|| { + Arc::new(TimelineDisk { + state: Mutex::new(state), + wal: Mutex::new(BlockStorage::new()), + }) + }) + .clone() + } +} + +/// Control file state and WAL storage. +pub struct TimelineDisk { + pub state: Mutex, + pub wal: Mutex, +} + +/// Implementation of `control_file::Storage` trait. +pub struct DiskStateStorage { + persisted_state: TimelinePersistentState, + disk: Arc, + last_persist_at: Instant, +} + +impl DiskStateStorage { + pub fn new(disk: Arc) -> Self { + let guard = disk.state.lock(); + let state = guard.clone(); + drop(guard); + DiskStateStorage { + persisted_state: state, + disk, + last_persist_at: Instant::now(), + } + } +} + +#[async_trait::async_trait] +impl control_file::Storage for DiskStateStorage { + /// Persist safekeeper state on disk and update internal state. + async fn persist(&mut self, s: &TimelinePersistentState) -> Result<()> { + self.persisted_state = s.clone(); + *self.disk.state.lock() = s.clone(); + Ok(()) + } + + /// Timestamp of last persist. 
+ fn last_persist_at(&self) -> Instant { + // TODO: don't rely on it in tests + self.last_persist_at + } +} + +impl Deref for DiskStateStorage { + type Target = TimelinePersistentState; + + fn deref(&self) -> &Self::Target { + &self.persisted_state + } +} + +/// Implementation of `wal_storage::Storage` trait. +pub struct DiskWALStorage { + /// Written to disk, but possibly still in the cache and not fully persisted. + /// Also can be ahead of record_lsn, if happen to be in the middle of a WAL record. + write_lsn: Lsn, + + /// The LSN of the last WAL record written to disk. Still can be not fully flushed. + write_record_lsn: Lsn, + + /// The LSN of the last WAL record flushed to disk. + flush_record_lsn: Lsn, + + /// Decoder is required for detecting boundaries of WAL records. + decoder: WalStreamDecoder, + + /// Bytes of WAL records that are not yet written to disk. + unflushed_bytes: BytesMut, + + /// Contains BlockStorage for WAL. + disk: Arc, +} + +impl DiskWALStorage { + pub fn new(disk: Arc, state: &TimelinePersistentState) -> Result { + let write_lsn = if state.commit_lsn == Lsn(0) { + Lsn(0) + } else { + Self::find_end_of_wal(disk.clone(), state.commit_lsn)? + }; + + let flush_lsn = write_lsn; + Ok(DiskWALStorage { + write_lsn, + write_record_lsn: flush_lsn, + flush_record_lsn: flush_lsn, + decoder: WalStreamDecoder::new(flush_lsn, 16), + unflushed_bytes: BytesMut::new(), + disk, + }) + } + + fn find_end_of_wal(disk: Arc, start_lsn: Lsn) -> Result { + let mut buf = [0; 8192]; + let mut pos = start_lsn.0; + let mut decoder = WalStreamDecoder::new(start_lsn, 16); + let mut result = start_lsn; + loop { + disk.wal.lock().read(pos, &mut buf); + pos += buf.len() as u64; + decoder.feed_bytes(&buf); + + loop { + match decoder.poll_decode() { + Ok(Some(record)) => result = record.0, + Err(e) => { + debug!( + "find_end_of_wal reached end at {:?}, decode error: {:?}", + result, e + ); + return Ok(result); + } + Ok(None) => break, // need more data + } + } + } + } +} + +#[async_trait::async_trait] +impl wal_storage::Storage for DiskWALStorage { + /// LSN of last durably stored WAL record. + fn flush_lsn(&self) -> Lsn { + self.flush_record_lsn + } + + /// Write piece of WAL from buf to disk, but not necessarily sync it. + async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { + if self.write_lsn != startpos { + panic!("write_wal called with wrong startpos"); + } + + self.unflushed_bytes.extend_from_slice(buf); + self.write_lsn += buf.len() as u64; + + if self.decoder.available() != startpos { + info!( + "restart decoder from {} to {}", + self.decoder.available(), + startpos, + ); + self.decoder = WalStreamDecoder::new(startpos, 16); + } + self.decoder.feed_bytes(buf); + loop { + match self.decoder.poll_decode()? { + None => break, // no full record yet + Some((lsn, _rec)) => { + self.write_record_lsn = lsn; + } + } + } + + Ok(()) + } + + /// Truncate WAL at specified LSN, which must be the end of WAL record. 
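The storage above tracks three positions: write_lsn (raw bytes accepted by write_wal, possibly ending mid-record), write_record_lsn (end of the last record the decoder has fully seen), and flush_record_lsn (what flush_wal has pushed into the BlockStorage), and it always holds that flush_record_lsn <= write_record_lsn <= write_lsn. A toy model of one write-then-flush cycle, with made-up numbers:

```rust
// Toy model of DiskWALStorage's position tracking (the numbers are illustrative only).
struct Positions {
    write_lsn: u64,        // bytes handed to write_wal(), possibly mid-record
    write_record_lsn: u64, // end of the last fully decoded record
    flush_record_lsn: u64, // durably stored prefix
}

fn write_then_flush_sketch() {
    let mut p = Positions {
        write_lsn: 1_000,
        write_record_lsn: 1_000,
        flush_record_lsn: 1_000,
    };

    // write_wal(): 120 bytes arrive, but only one record (ending at 1_096) decodes fully.
    p.write_lsn += 120;
    p.write_record_lsn = 1_096;

    // flush_wal(): persist [flush_record_lsn, write_record_lsn) and catch up.
    p.flush_record_lsn = p.write_record_lsn;

    assert!(p.flush_record_lsn <= p.write_record_lsn && p.write_record_lsn <= p.write_lsn);
}
```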
+ async fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> { + if self.write_lsn != Lsn(0) && end_pos > self.write_lsn { + panic!( + "truncate_wal called on non-written WAL, write_lsn={}, end_pos={}", + self.write_lsn, end_pos + ); + } + + self.flush_wal().await?; + + // write zeroes to disk from end_pos until self.write_lsn + let buf = [0; 8192]; + let mut pos = end_pos.0; + while pos < self.write_lsn.0 { + self.disk.wal.lock().write(pos, &buf); + pos += buf.len() as u64; + } + + self.write_lsn = end_pos; + self.write_record_lsn = end_pos; + self.flush_record_lsn = end_pos; + self.unflushed_bytes.clear(); + self.decoder = WalStreamDecoder::new(end_pos, 16); + + Ok(()) + } + + /// Durably store WAL on disk, up to the last written WAL record. + async fn flush_wal(&mut self) -> Result<()> { + if self.flush_record_lsn == self.write_record_lsn { + // no need to do extra flush + return Ok(()); + } + + let num_bytes = self.write_record_lsn.0 - self.flush_record_lsn.0; + + self.disk.wal.lock().write( + self.flush_record_lsn.0, + &self.unflushed_bytes[..num_bytes as usize], + ); + self.unflushed_bytes.advance(num_bytes as usize); + self.flush_record_lsn = self.write_record_lsn; + + Ok(()) + } + + /// Remove all segments <= given segno. Returns function doing that as we + /// want to perform it without timeline lock. + fn remove_up_to(&self, _segno_up_to: XLogSegNo) -> BoxFuture<'static, anyhow::Result<()>> { + Box::pin(async move { Ok(()) }) + } + + /// Release resources associated with the storage -- technically, close FDs. + /// Currently we don't remove timelines until restart (#3146), so need to + /// spare descriptors. This would be useful for temporary tli detach as + /// well. + fn close(&mut self) {} + + /// Get metrics for this timeline. + fn get_metrics(&self) -> WalStorageMetrics { + WalStorageMetrics::default() + } +} diff --git a/safekeeper/tests/walproposer_sim/simulation.rs b/safekeeper/tests/walproposer_sim/simulation.rs new file mode 100644 index 0000000000..0d7aaf517b --- /dev/null +++ b/safekeeper/tests/walproposer_sim/simulation.rs @@ -0,0 +1,436 @@ +use std::{cell::Cell, str::FromStr, sync::Arc}; + +use crate::walproposer_sim::{safekeeper::run_server, walproposer_api::SimulationApi}; +use desim::{ + executor::{self, ExternalHandle}, + node_os::NodeOs, + options::{Delay, NetworkOptions}, + proto::{AnyMessage, NodeEvent}, + world::Node, + world::World, +}; +use rand::{Rng, SeedableRng}; +use tracing::{debug, info_span, warn}; +use utils::{id::TenantTimelineId, lsn::Lsn}; +use walproposer::walproposer::{Config, Wrapper}; + +use super::{ + log::SimClock, safekeeper_disk::SafekeeperDisk, walproposer_api, + walproposer_disk::DiskWalProposer, +}; + +/// Simulated safekeeper node. +pub struct SafekeeperNode { + pub node: Arc, + pub id: u32, + pub disk: Arc, + pub thread: Cell, +} + +impl SafekeeperNode { + /// Create and start a safekeeper at the specified Node. + pub fn new(node: Arc) -> Self { + let disk = Arc::new(SafekeeperDisk::new()); + let thread = Cell::new(SafekeeperNode::launch(disk.clone(), node.clone())); + + Self { + id: node.id, + node, + disk, + thread, + } + } + + fn launch(disk: Arc, node: Arc) -> ExternalHandle { + // start the server thread + node.launch(move |os| { + run_server(os, disk).expect("server should finish without errors"); + }) + } + + /// Restart the safekeeper. 
+ pub fn restart(&self) { + let new_thread = SafekeeperNode::launch(self.disk.clone(), self.node.clone()); + let old_thread = self.thread.replace(new_thread); + old_thread.crash_stop(); + } +} + +/// Simulated walproposer node. +pub struct WalProposer { + thread: ExternalHandle, + node: Arc, + disk: Arc, + sync_safekeepers: bool, +} + +impl WalProposer { + /// Generic start function for both modes. + fn start( + os: NodeOs, + disk: Arc, + ttid: TenantTimelineId, + addrs: Vec, + lsn: Option, + ) { + let sync_safekeepers = lsn.is_none(); + + let _enter = if sync_safekeepers { + info_span!("sync", started = executor::now()).entered() + } else { + info_span!("walproposer", started = executor::now()).entered() + }; + + os.log_event(format!("started;walproposer;{}", sync_safekeepers as i32)); + + let config = Config { + ttid, + safekeepers_list: addrs, + safekeeper_reconnect_timeout: 1000, + safekeeper_connection_timeout: 5000, + sync_safekeepers, + }; + let args = walproposer_api::Args { + os, + config: config.clone(), + disk, + redo_start_lsn: lsn, + }; + let api = SimulationApi::new(args); + let wp = Wrapper::new(Box::new(api), config); + wp.start(); + } + + /// Start walproposer in a sync_safekeepers mode. + pub fn launch_sync(ttid: TenantTimelineId, addrs: Vec, node: Arc) -> Self { + debug!("sync_safekeepers started at node {}", node.id); + let disk = DiskWalProposer::new(); + let disk_wp = disk.clone(); + + // start the client thread + let handle = node.launch(move |os| { + WalProposer::start(os, disk_wp, ttid, addrs, None); + }); + + Self { + thread: handle, + node, + disk, + sync_safekeepers: true, + } + } + + /// Start walproposer in a normal mode. + pub fn launch_walproposer( + ttid: TenantTimelineId, + addrs: Vec, + node: Arc, + lsn: Lsn, + ) -> Self { + debug!("walproposer started at node {}", node.id); + let disk = DiskWalProposer::new(); + disk.lock().reset_to(lsn); + let disk_wp = disk.clone(); + + // start the client thread + let handle = node.launch(move |os| { + WalProposer::start(os, disk_wp, ttid, addrs, Some(lsn)); + }); + + Self { + thread: handle, + node, + disk, + sync_safekeepers: false, + } + } + + pub fn write_tx(&mut self, cnt: usize) { + let start_lsn = self.disk.lock().flush_rec_ptr(); + + for _ in 0..cnt { + self.disk + .lock() + .insert_logical_message("prefix", b"message") + .expect("failed to generate logical message"); + } + + let end_lsn = self.disk.lock().flush_rec_ptr(); + + // log event + self.node + .log_event(format!("write_wal;{};{};{}", start_lsn.0, end_lsn.0, cnt)); + + // now we need to set "Latch" in walproposer + self.node + .node_events() + .send(NodeEvent::Internal(AnyMessage::Just32(0))); + } + + pub fn stop(&self) { + self.thread.crash_stop(); + } +} + +/// Holds basic simulation settings, such as network options. +pub struct TestConfig { + pub network: NetworkOptions, + pub timeout: u64, + pub clock: Option, +} + +impl TestConfig { + /// Create a new TestConfig with default settings. + pub fn new(clock: Option) -> Self { + Self { + network: NetworkOptions { + keepalive_timeout: Some(2000), + connect_delay: Delay { + min: 1, + max: 5, + fail_prob: 0.0, + }, + send_delay: Delay { + min: 1, + max: 5, + fail_prob: 0.0, + }, + }, + timeout: 1_000 * 10, + clock, + } + } + + /// Start a new simulation with the specified seed. 
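TestConfig::new above picks a mild, loss-free network, and generate_network_opts (later in this file) randomizes it per seed; a test can also set the knobs by hand before calling start. A hypothetical helper with arbitrary values:

```rust
use desim::options::{Delay, NetworkOptions};

use crate::walproposer_sim::simulation::TestConfig;

// Hypothetical tweak: a slower, lossier network than the TestConfig::new() defaults,
// to be applied before config.start(seed).
fn make_network_lossy(config: &mut TestConfig) {
    config.network = NetworkOptions {
        keepalive_timeout: Some(1_000),
        connect_delay: Delay { min: 1, max: 100, fail_prob: 0.2 },
        send_delay: Delay { min: 1, max: 100, fail_prob: 0.1 },
    };
}
```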
+ pub fn start(&self, seed: u64) -> Test { + let world = Arc::new(World::new(seed, Arc::new(self.network.clone()))); + + if let Some(clock) = &self.clock { + clock.set_clock(world.clock()); + } + + let servers = [ + SafekeeperNode::new(world.new_node()), + SafekeeperNode::new(world.new_node()), + SafekeeperNode::new(world.new_node()), + ]; + + let server_ids = [servers[0].id, servers[1].id, servers[2].id]; + let safekeepers_addrs = server_ids.map(|id| format!("node:{}", id)).to_vec(); + + let ttid = TenantTimelineId::generate(); + + Test { + world, + servers, + sk_list: safekeepers_addrs, + ttid, + timeout: self.timeout, + } + } +} + +/// Holds simulation state. +pub struct Test { + pub world: Arc, + pub servers: [SafekeeperNode; 3], + pub sk_list: Vec, + pub ttid: TenantTimelineId, + pub timeout: u64, +} + +impl Test { + /// Start a sync_safekeepers thread and wait for it to finish. + pub fn sync_safekeepers(&self) -> anyhow::Result { + let wp = self.launch_sync_safekeepers(); + + // poll until exit or timeout + let time_limit = self.timeout; + while self.world.step() && self.world.now() < time_limit && !wp.thread.is_finished() {} + + if !wp.thread.is_finished() { + anyhow::bail!("timeout or idle stuck"); + } + + let res = wp.thread.result(); + if res.0 != 0 { + anyhow::bail!("non-zero exitcode: {:?}", res); + } + let lsn = Lsn::from_str(&res.1)?; + Ok(lsn) + } + + /// Spawn a new sync_safekeepers thread. + pub fn launch_sync_safekeepers(&self) -> WalProposer { + WalProposer::launch_sync(self.ttid, self.sk_list.clone(), self.world.new_node()) + } + + /// Spawn a new walproposer thread. + pub fn launch_walproposer(&self, lsn: Lsn) -> WalProposer { + let lsn = if lsn.0 == 0 { + // usual LSN after basebackup + Lsn(21623024) + } else { + lsn + }; + + WalProposer::launch_walproposer(self.ttid, self.sk_list.clone(), self.world.new_node(), lsn) + } + + /// Execute the simulation for the specified duration. + pub fn poll_for_duration(&self, duration: u64) { + let time_limit = std::cmp::min(self.world.now() + duration, self.timeout); + while self.world.step() && self.world.now() < time_limit {} + } + + /// Execute the simulation together with events defined in some schedule. 
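run_schedule below consumes a Schedule, which is just a list of (time, action) pairs; generate_schedule builds one from a seed, but a regression for a specific scenario can spell it out by hand. A hypothetical example:

```rust
use crate::walproposer_sim::simulation::{Schedule, Test, TestAction};

// Hypothetical: drive a fixed scenario instead of a seed-generated one.
fn run_handwritten_schedule(test: &Test) -> anyhow::Result<()> {
    let schedule: Schedule = vec![
        (100, TestAction::WriteTx(5)),           // t=100: write 5 transactions
        (200, TestAction::RestartSafekeeper(1)), // t=200: crash-restart safekeeper 1
        (300, TestAction::RestartWalProposer),   // t=300: restart the walproposer
        (400, TestAction::WriteTx(3)),
    ];
    test.run_schedule(&schedule)
}
```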
+ pub fn run_schedule(&self, schedule: &Schedule) -> anyhow::Result<()> { + // scheduling empty events so that world will stop in those points + { + let clock = self.world.clock(); + + let now = self.world.now(); + for (time, _) in schedule { + if *time < now { + continue; + } + clock.schedule_fake(*time - now); + } + } + + let mut wp = self.launch_sync_safekeepers(); + + let mut skipped_tx = 0; + let mut started_tx = 0; + + let mut schedule_ptr = 0; + + loop { + if wp.sync_safekeepers && wp.thread.is_finished() { + let res = wp.thread.result(); + if res.0 != 0 { + warn!("sync non-zero exitcode: {:?}", res); + debug!("restarting sync_safekeepers"); + // restart the sync_safekeepers + wp = self.launch_sync_safekeepers(); + continue; + } + let lsn = Lsn::from_str(&res.1)?; + debug!("sync_safekeepers finished at LSN {}", lsn); + wp = self.launch_walproposer(lsn); + debug!("walproposer started at thread {}", wp.thread.id()); + } + + let now = self.world.now(); + while schedule_ptr < schedule.len() && schedule[schedule_ptr].0 <= now { + if now != schedule[schedule_ptr].0 { + warn!("skipped event {:?} at {}", schedule[schedule_ptr], now); + } + + let action = &schedule[schedule_ptr].1; + match action { + TestAction::WriteTx(size) => { + if !wp.sync_safekeepers && !wp.thread.is_finished() { + started_tx += *size; + wp.write_tx(*size); + debug!("written {} transactions", size); + } else { + skipped_tx += size; + debug!("skipped {} transactions", size); + } + } + TestAction::RestartSafekeeper(id) => { + debug!("restarting safekeeper {}", id); + self.servers[*id].restart(); + } + TestAction::RestartWalProposer => { + debug!("restarting sync_safekeepers"); + wp.stop(); + wp = self.launch_sync_safekeepers(); + } + } + schedule_ptr += 1; + } + + if schedule_ptr == schedule.len() { + break; + } + let next_event_time = schedule[schedule_ptr].0; + + // poll until the next event + if wp.thread.is_finished() { + while self.world.step() && self.world.now() < next_event_time {} + } else { + while self.world.step() + && self.world.now() < next_event_time + && !wp.thread.is_finished() + {} + } + } + + debug!( + "finished schedule, total steps: {}", + self.world.get_thread_step_count() + ); + debug!("skipped_tx: {}", skipped_tx); + debug!("started_tx: {}", started_tx); + + Ok(()) + } +} + +#[derive(Debug, Clone)] +pub enum TestAction { + WriteTx(usize), + RestartSafekeeper(usize), + RestartWalProposer, +} + +pub type Schedule = Vec<(u64, TestAction)>; + +pub fn generate_schedule(seed: u64) -> Schedule { + let mut rng = rand::rngs::StdRng::seed_from_u64(seed); + let mut schedule = Vec::new(); + let mut time = 0; + + let cnt = rng.gen_range(1..100); + + for _ in 0..cnt { + time += rng.gen_range(0..500); + let action = match rng.gen_range(0..3) { + 0 => TestAction::WriteTx(rng.gen_range(1..10)), + 1 => TestAction::RestartSafekeeper(rng.gen_range(0..3)), + 2 => TestAction::RestartWalProposer, + _ => unreachable!(), + }; + schedule.push((time, action)); + } + + schedule +} + +pub fn generate_network_opts(seed: u64) -> NetworkOptions { + let mut rng = rand::rngs::StdRng::seed_from_u64(seed); + + let timeout = rng.gen_range(100..2000); + let max_delay = rng.gen_range(1..2 * timeout); + let min_delay = rng.gen_range(1..=max_delay); + + let max_fail_prob = rng.gen_range(0.0..0.9); + let connect_fail_prob = rng.gen_range(0.0..max_fail_prob); + let send_fail_prob = rng.gen_range(0.0..connect_fail_prob); + + NetworkOptions { + keepalive_timeout: Some(timeout), + connect_delay: Delay { + min: min_delay, + max: max_delay, + 
fail_prob: connect_fail_prob, + }, + send_delay: Delay { + min: min_delay, + max: max_delay, + fail_prob: send_fail_prob, + }, + } +} diff --git a/safekeeper/tests/walproposer_sim/simulation_logs.rs b/safekeeper/tests/walproposer_sim/simulation_logs.rs new file mode 100644 index 0000000000..38885e5dd0 --- /dev/null +++ b/safekeeper/tests/walproposer_sim/simulation_logs.rs @@ -0,0 +1,187 @@ +use desim::proto::SimEvent; +use tracing::debug; + +#[derive(Debug, Clone, PartialEq, Eq)] +enum NodeKind { + Unknown, + Safekeeper, + WalProposer, +} + +impl Default for NodeKind { + fn default() -> Self { + Self::Unknown + } +} + +/// Simulation state of walproposer/safekeeper, derived from the simulation logs. +#[derive(Clone, Debug, Default)] +struct NodeInfo { + kind: NodeKind, + + // walproposer + is_sync: bool, + term: u64, + epoch_lsn: u64, + + // safekeeper + commit_lsn: u64, + flush_lsn: u64, +} + +impl NodeInfo { + fn init_kind(&mut self, kind: NodeKind) { + if self.kind == NodeKind::Unknown { + self.kind = kind; + } else { + assert!(self.kind == kind); + } + } + + fn started(&mut self, data: &str) { + let mut parts = data.split(';'); + assert!(parts.next().unwrap() == "started"); + match parts.next().unwrap() { + "safekeeper" => { + self.init_kind(NodeKind::Safekeeper); + } + "walproposer" => { + self.init_kind(NodeKind::WalProposer); + let is_sync: u8 = parts.next().unwrap().parse().unwrap(); + self.is_sync = is_sync != 0; + } + _ => unreachable!(), + } + } +} + +/// Global state of the simulation, derived from the simulation logs. +#[derive(Debug, Default)] +struct GlobalState { + nodes: Vec, + commit_lsn: u64, + write_lsn: u64, + max_write_lsn: u64, + + written_wal: u64, + written_records: u64, +} + +impl GlobalState { + fn new() -> Self { + Default::default() + } + + fn get(&mut self, id: u32) -> &mut NodeInfo { + let id = id as usize; + if id >= self.nodes.len() { + self.nodes.resize(id + 1, NodeInfo::default()); + } + &mut self.nodes[id] + } +} + +/// Try to find inconsistencies in the simulation log. +pub fn validate_events(events: Vec) { + const INITDB_LSN: u64 = 21623024; + + let hook = std::panic::take_hook(); + scopeguard::defer_on_success! 
{ + std::panic::set_hook(hook); + }; + + let mut state = GlobalState::new(); + state.max_write_lsn = INITDB_LSN; + + for event in events { + debug!("{:?}", event); + + let node = state.get(event.node); + if event.data.starts_with("started;") { + node.started(&event.data); + continue; + } + assert!(node.kind != NodeKind::Unknown); + + // drop reference to unlock state + let mut node = node.clone(); + + let mut parts = event.data.split(';'); + match node.kind { + NodeKind::Safekeeper => match parts.next().unwrap() { + "tli_loaded" => { + let flush_lsn: u64 = parts.next().unwrap().parse().unwrap(); + let commit_lsn: u64 = parts.next().unwrap().parse().unwrap(); + node.flush_lsn = flush_lsn; + node.commit_lsn = commit_lsn; + } + _ => unreachable!(), + }, + NodeKind::WalProposer => { + match parts.next().unwrap() { + "prop_elected" => { + let prop_lsn: u64 = parts.next().unwrap().parse().unwrap(); + let prop_term: u64 = parts.next().unwrap().parse().unwrap(); + let prev_lsn: u64 = parts.next().unwrap().parse().unwrap(); + let prev_term: u64 = parts.next().unwrap().parse().unwrap(); + + assert!(prop_lsn >= prev_lsn); + assert!(prop_term >= prev_term); + + assert!(prop_lsn >= state.commit_lsn); + + if prop_lsn > state.write_lsn { + assert!(prop_lsn <= state.max_write_lsn); + debug!( + "moving write_lsn up from {} to {}", + state.write_lsn, prop_lsn + ); + state.write_lsn = prop_lsn; + } + if prop_lsn < state.write_lsn { + debug!( + "moving write_lsn down from {} to {}", + state.write_lsn, prop_lsn + ); + state.write_lsn = prop_lsn; + } + + node.epoch_lsn = prop_lsn; + node.term = prop_term; + } + "write_wal" => { + assert!(!node.is_sync); + let start_lsn: u64 = parts.next().unwrap().parse().unwrap(); + let end_lsn: u64 = parts.next().unwrap().parse().unwrap(); + let cnt: u64 = parts.next().unwrap().parse().unwrap(); + + let size = end_lsn - start_lsn; + state.written_wal += size; + state.written_records += cnt; + + // TODO: If we allow writing WAL before winning the election + + assert!(start_lsn >= state.commit_lsn); + assert!(end_lsn >= start_lsn); + // assert!(start_lsn == state.write_lsn); + state.write_lsn = end_lsn; + + if end_lsn > state.max_write_lsn { + state.max_write_lsn = end_lsn; + } + } + "commit_lsn" => { + let lsn: u64 = parts.next().unwrap().parse().unwrap(); + assert!(lsn >= state.commit_lsn); + state.commit_lsn = lsn; + } + _ => unreachable!(), + } + } + _ => unreachable!(), + } + + // update the node in the state struct + *state.get(event.node) = node; + } +} diff --git a/safekeeper/tests/walproposer_sim/walproposer_api.rs b/safekeeper/tests/walproposer_sim/walproposer_api.rs new file mode 100644 index 0000000000..746cac019e --- /dev/null +++ b/safekeeper/tests/walproposer_sim/walproposer_api.rs @@ -0,0 +1,676 @@ +use std::{ + cell::{RefCell, RefMut, UnsafeCell}, + ffi::CStr, + sync::Arc, +}; + +use bytes::Bytes; +use desim::{ + executor::{self, PollSome}, + network::TCP, + node_os::NodeOs, + proto::{AnyMessage, NetEvent, NodeEvent}, + world::NodeId, +}; +use tracing::debug; +use utils::lsn::Lsn; +use walproposer::{ + api_bindings::Level, + bindings::{ + pg_atomic_uint64, NeonWALReadResult, PageserverFeedback, SafekeeperStateDesiredEvents, + WL_SOCKET_READABLE, WL_SOCKET_WRITEABLE, + }, + walproposer::{ApiImpl, Config}, +}; + +use super::walproposer_disk::DiskWalProposer; + +/// Special state for each wp->sk connection. +struct SafekeeperConn { + host: String, + port: String, + node_id: NodeId, + // socket is Some(..) 
equals to connection is established + socket: Option, + // connection is in progress + is_connecting: bool, + // START_WAL_PUSH is in progress + is_start_wal_push: bool, + // pointer to Safekeeper in walproposer for callbacks + raw_ptr: *mut walproposer::bindings::Safekeeper, +} + +impl SafekeeperConn { + pub fn new(host: String, port: String) -> Self { + // port number is the same as NodeId + let port_num = port.parse::().unwrap(); + Self { + host, + port, + node_id: port_num, + socket: None, + is_connecting: false, + is_start_wal_push: false, + raw_ptr: std::ptr::null_mut(), + } + } +} + +/// Simulation version of a postgres WaitEventSet. At pos 0 there is always +/// a special NodeEvents channel, which is used as a latch. +struct EventSet { + os: NodeOs, + // all pollable channels, 0 is always NodeEvent channel + chans: Vec>, + // 0 is always nullptr + sk_ptrs: Vec<*mut walproposer::bindings::Safekeeper>, + // event mask for each channel + masks: Vec, +} + +impl EventSet { + pub fn new(os: NodeOs) -> Self { + let node_events = os.node_events(); + Self { + os, + chans: vec![Box::new(node_events)], + sk_ptrs: vec![std::ptr::null_mut()], + masks: vec![WL_SOCKET_READABLE], + } + } + + /// Leaves all readable channels at the beginning of the array. + fn sort_readable(&mut self) -> usize { + let mut cnt = 1; + for i in 1..self.chans.len() { + if self.masks[i] & WL_SOCKET_READABLE != 0 { + self.chans.swap(i, cnt); + self.sk_ptrs.swap(i, cnt); + self.masks.swap(i, cnt); + cnt += 1; + } + } + cnt + } + + fn update_event_set(&mut self, conn: &SafekeeperConn, event_mask: u32) { + let index = self + .sk_ptrs + .iter() + .position(|&ptr| ptr == conn.raw_ptr) + .expect("safekeeper should exist in event set"); + self.masks[index] = event_mask; + } + + fn add_safekeeper(&mut self, sk: &SafekeeperConn, event_mask: u32) { + for ptr in self.sk_ptrs.iter() { + assert!(*ptr != sk.raw_ptr); + } + + self.chans.push(Box::new( + sk.socket + .as_ref() + .expect("socket should not be closed") + .recv_chan(), + )); + self.sk_ptrs.push(sk.raw_ptr); + self.masks.push(event_mask); + } + + fn remove_safekeeper(&mut self, sk: &SafekeeperConn) { + let index = self.sk_ptrs.iter().position(|&ptr| ptr == sk.raw_ptr); + if index.is_none() { + debug!("remove_safekeeper: sk={:?} not found", sk.raw_ptr); + return; + } + let index = index.unwrap(); + + self.chans.remove(index); + self.sk_ptrs.remove(index); + self.masks.remove(index); + + // to simulate the actual behaviour + self.refresh_event_set(); + } + + /// Updates all masks to match the result of a SafekeeperStateDesiredEvents. + fn refresh_event_set(&mut self) { + for (i, mask) in self.masks.iter_mut().enumerate() { + if i == 0 { + continue; + } + + let mut mask_sk: u32 = 0; + let mut mask_nwr: u32 = 0; + unsafe { SafekeeperStateDesiredEvents(self.sk_ptrs[i], &mut mask_sk, &mut mask_nwr) }; + + if mask_sk != *mask { + debug!( + "refresh_event_set: sk={:?}, old_mask={:#b}, new_mask={:#b}", + self.sk_ptrs[i], *mask, mask_sk + ); + *mask = mask_sk; + } + } + } + + /// Wait for events on all channels. 
+ fn wait(&mut self, timeout_millis: i64) -> walproposer::walproposer::WaitResult { + // all channels are always writeable + for (i, mask) in self.masks.iter().enumerate() { + if *mask & WL_SOCKET_WRITEABLE != 0 { + return walproposer::walproposer::WaitResult::Network( + self.sk_ptrs[i], + WL_SOCKET_WRITEABLE, + ); + } + } + + let cnt = self.sort_readable(); + + let slice = &self.chans[0..cnt]; + match executor::epoll_chans(slice, timeout_millis) { + None => walproposer::walproposer::WaitResult::Timeout, + Some(0) => { + let msg = self.os.node_events().must_recv(); + match msg { + NodeEvent::Internal(AnyMessage::Just32(0)) => { + // got a notification about new WAL available + } + NodeEvent::Internal(_) => unreachable!(), + NodeEvent::Accept(_) => unreachable!(), + } + walproposer::walproposer::WaitResult::Latch + } + Some(index) => walproposer::walproposer::WaitResult::Network( + self.sk_ptrs[index], + WL_SOCKET_READABLE, + ), + } + } +} + +/// This struct handles all calls from walproposer into walproposer_api. +pub struct SimulationApi { + os: NodeOs, + safekeepers: RefCell>, + disk: Arc, + redo_start_lsn: Option, + shmem: UnsafeCell, + config: Config, + event_set: RefCell>, +} + +pub struct Args { + pub os: NodeOs, + pub config: Config, + pub disk: Arc, + pub redo_start_lsn: Option, +} + +impl SimulationApi { + pub fn new(args: Args) -> Self { + // initialize connection state for each safekeeper + let sk_conns = args + .config + .safekeepers_list + .iter() + .map(|s| { + SafekeeperConn::new( + s.split(':').next().unwrap().to_string(), + s.split(':').nth(1).unwrap().to_string(), + ) + }) + .collect::>(); + + Self { + os: args.os, + safekeepers: RefCell::new(sk_conns), + disk: args.disk, + redo_start_lsn: args.redo_start_lsn, + shmem: UnsafeCell::new(walproposer::bindings::WalproposerShmemState { + mutex: 0, + feedback: PageserverFeedback { + currentClusterSize: 0, + last_received_lsn: 0, + disk_consistent_lsn: 0, + remote_consistent_lsn: 0, + replytime: 0, + }, + mineLastElectedTerm: 0, + backpressureThrottlingTime: pg_atomic_uint64 { value: 0 }, + }), + config: args.config, + event_set: RefCell::new(None), + } + } + + /// Get SafekeeperConn for the given Safekeeper. 
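The safekeeper list handed to SimulationApi above is the same "node:<id>" form that TestConfig::start builds, so the port component doubles as the simulated node id used when opening connections. A tiny stand-alone check of that convention; split_safekeeper_addr is hypothetical:

```rust
// Hypothetical helper mirroring SafekeeperConn::new(): host and port are split on ':',
// and the port number is reused as the simulated node id.
fn split_safekeeper_addr(addr: &str) -> (String, u32) {
    let host = addr.split(':').next().unwrap().to_string();
    let node_id: u32 = addr.split(':').nth(1).unwrap().parse().unwrap();
    (host, node_id)
}

#[test]
fn addr_convention() {
    assert_eq!(split_safekeeper_addr("node:2"), ("node".to_string(), 2));
}
```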
+ fn get_conn(&self, sk: &mut walproposer::bindings::Safekeeper) -> RefMut<'_, SafekeeperConn> { + let sk_port = unsafe { CStr::from_ptr(sk.port).to_str().unwrap() }; + let state = self.safekeepers.borrow_mut(); + RefMut::map(state, |v| { + v.iter_mut() + .find(|conn| conn.port == sk_port) + .expect("safekeeper conn not found by port") + }) + } +} + +impl ApiImpl for SimulationApi { + fn get_current_timestamp(&self) -> i64 { + debug!("get_current_timestamp"); + // PG TimestampTZ is microseconds, but simulation unit is assumed to be + // milliseconds, so add 10^3 + self.os.now() as i64 * 1000 + } + + fn conn_status( + &self, + _: &mut walproposer::bindings::Safekeeper, + ) -> walproposer::bindings::WalProposerConnStatusType { + debug!("conn_status"); + // break the connection with a 10% chance + if self.os.random(100) < 10 { + walproposer::bindings::WalProposerConnStatusType_WP_CONNECTION_BAD + } else { + walproposer::bindings::WalProposerConnStatusType_WP_CONNECTION_OK + } + } + + fn conn_connect_start(&self, sk: &mut walproposer::bindings::Safekeeper) { + debug!("conn_connect_start"); + let mut conn = self.get_conn(sk); + + assert!(conn.socket.is_none()); + let socket = self.os.open_tcp(conn.node_id); + conn.socket = Some(socket); + conn.raw_ptr = sk; + conn.is_connecting = true; + } + + fn conn_connect_poll( + &self, + _: &mut walproposer::bindings::Safekeeper, + ) -> walproposer::bindings::WalProposerConnectPollStatusType { + debug!("conn_connect_poll"); + // TODO: break the connection here + walproposer::bindings::WalProposerConnectPollStatusType_WP_CONN_POLLING_OK + } + + fn conn_send_query(&self, sk: &mut walproposer::bindings::Safekeeper, query: &str) -> bool { + debug!("conn_send_query: {}", query); + self.get_conn(sk).is_start_wal_push = true; + true + } + + fn conn_get_query_result( + &self, + _: &mut walproposer::bindings::Safekeeper, + ) -> walproposer::bindings::WalProposerExecStatusType { + debug!("conn_get_query_result"); + // TODO: break the connection here + walproposer::bindings::WalProposerExecStatusType_WP_EXEC_SUCCESS_COPYBOTH + } + + fn conn_async_read( + &self, + sk: &mut walproposer::bindings::Safekeeper, + vec: &mut Vec, + ) -> walproposer::bindings::PGAsyncReadResult { + debug!("conn_async_read"); + let mut conn = self.get_conn(sk); + + let socket = if let Some(socket) = conn.socket.as_mut() { + socket + } else { + // socket is already closed + return walproposer::bindings::PGAsyncReadResult_PG_ASYNC_READ_FAIL; + }; + + let msg = socket.recv_chan().try_recv(); + + match msg { + None => { + // no message is ready + walproposer::bindings::PGAsyncReadResult_PG_ASYNC_READ_TRY_AGAIN + } + Some(NetEvent::Closed) => { + // connection is closed + debug!("conn_async_read: connection is closed"); + conn.socket = None; + walproposer::bindings::PGAsyncReadResult_PG_ASYNC_READ_FAIL + } + Some(NetEvent::Message(msg)) => { + // got a message + let b = match msg { + desim::proto::AnyMessage::Bytes(b) => b, + _ => unreachable!(), + }; + vec.extend_from_slice(&b); + walproposer::bindings::PGAsyncReadResult_PG_ASYNC_READ_SUCCESS + } + } + } + + fn conn_blocking_write(&self, sk: &mut walproposer::bindings::Safekeeper, buf: &[u8]) -> bool { + let mut conn = self.get_conn(sk); + debug!("conn_blocking_write to {}: {:?}", conn.node_id, buf); + let socket = conn.socket.as_mut().unwrap(); + socket.send(desim::proto::AnyMessage::Bytes(Bytes::copy_from_slice(buf))); + true + } + + fn conn_async_write( + &self, + sk: &mut walproposer::bindings::Safekeeper, + buf: &[u8], + ) -> 
walproposer::bindings::PGAsyncWriteResult { + let mut conn = self.get_conn(sk); + debug!("conn_async_write to {}: {:?}", conn.node_id, buf); + if let Some(socket) = conn.socket.as_mut() { + socket.send(desim::proto::AnyMessage::Bytes(Bytes::copy_from_slice(buf))); + } else { + // connection is already closed + debug!("conn_async_write: writing to a closed socket!"); + // TODO: maybe we should return error here? + } + walproposer::bindings::PGAsyncWriteResult_PG_ASYNC_WRITE_SUCCESS + } + + fn wal_reader_allocate(&self, _: &mut walproposer::bindings::Safekeeper) -> NeonWALReadResult { + debug!("wal_reader_allocate"); + walproposer::bindings::NeonWALReadResult_NEON_WALREAD_SUCCESS + } + + fn wal_read( + &self, + _sk: &mut walproposer::bindings::Safekeeper, + buf: &mut [u8], + startpos: u64, + ) -> NeonWALReadResult { + self.disk.lock().read(startpos, buf); + walproposer::bindings::NeonWALReadResult_NEON_WALREAD_SUCCESS + } + + fn init_event_set(&self, _: &mut walproposer::bindings::WalProposer) { + debug!("init_event_set"); + let new_event_set = EventSet::new(self.os.clone()); + let old_event_set = self.event_set.replace(Some(new_event_set)); + assert!(old_event_set.is_none()); + } + + fn update_event_set(&self, sk: &mut walproposer::bindings::Safekeeper, event_mask: u32) { + debug!( + "update_event_set, sk={:?}, events_mask={:#b}", + sk as *mut walproposer::bindings::Safekeeper, event_mask + ); + let conn = self.get_conn(sk); + + self.event_set + .borrow_mut() + .as_mut() + .unwrap() + .update_event_set(&conn, event_mask); + } + + fn add_safekeeper_event_set( + &self, + sk: &mut walproposer::bindings::Safekeeper, + event_mask: u32, + ) { + debug!( + "add_safekeeper_event_set, sk={:?}, events_mask={:#b}", + sk as *mut walproposer::bindings::Safekeeper, event_mask + ); + + self.event_set + .borrow_mut() + .as_mut() + .unwrap() + .add_safekeeper(&self.get_conn(sk), event_mask); + } + + fn rm_safekeeper_event_set(&self, sk: &mut walproposer::bindings::Safekeeper) { + debug!( + "rm_safekeeper_event_set, sk={:?}", + sk as *mut walproposer::bindings::Safekeeper, + ); + + self.event_set + .borrow_mut() + .as_mut() + .unwrap() + .remove_safekeeper(&self.get_conn(sk)); + } + + fn active_state_update_event_set(&self, sk: &mut walproposer::bindings::Safekeeper) { + debug!("active_state_update_event_set"); + + assert!(sk.state == walproposer::bindings::SafekeeperState_SS_ACTIVE); + self.event_set + .borrow_mut() + .as_mut() + .unwrap() + .refresh_event_set(); + } + + fn wal_reader_events(&self, _sk: &mut walproposer::bindings::Safekeeper) -> u32 { + 0 + } + + fn wait_event_set( + &self, + _: &mut walproposer::bindings::WalProposer, + timeout_millis: i64, + ) -> walproposer::walproposer::WaitResult { + // TODO: handle multiple stages as part of the simulation (e.g. 
connect, start_wal_push, etc) + let mut conns = self.safekeepers.borrow_mut(); + for conn in conns.iter_mut() { + if conn.socket.is_some() && conn.is_connecting { + conn.is_connecting = false; + debug!("wait_event_set, connecting to {}:{}", conn.host, conn.port); + return walproposer::walproposer::WaitResult::Network( + conn.raw_ptr, + WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE, + ); + } + if conn.socket.is_some() && conn.is_start_wal_push { + conn.is_start_wal_push = false; + debug!( + "wait_event_set, start wal push to {}:{}", + conn.host, conn.port + ); + return walproposer::walproposer::WaitResult::Network( + conn.raw_ptr, + WL_SOCKET_READABLE, + ); + } + } + drop(conns); + + let res = self + .event_set + .borrow_mut() + .as_mut() + .unwrap() + .wait(timeout_millis); + + debug!( + "wait_event_set, timeout_millis={}, res={:?}", + timeout_millis, res, + ); + res + } + + fn strong_random(&self, buf: &mut [u8]) -> bool { + debug!("strong_random"); + buf.fill(0); + true + } + + fn finish_sync_safekeepers(&self, lsn: u64) { + debug!("finish_sync_safekeepers, lsn={}", lsn); + executor::exit(0, Lsn(lsn).to_string()); + } + + fn log_internal(&self, _wp: &mut walproposer::bindings::WalProposer, level: Level, msg: &str) { + debug!("wp_log[{}] {}", level, msg); + if level == Level::Fatal || level == Level::Panic { + if msg.contains("rejects our connection request with term") { + // collected quorum with lower term, then got rejected by next connected safekeeper + executor::exit(1, msg.to_owned()); + } + if msg.contains("collected propEpochStartLsn") && msg.contains(", but basebackup LSN ") + { + // sync-safekeepers collected wrong quorum, walproposer collected another quorum + executor::exit(1, msg.to_owned()); + } + if msg.contains("failed to download WAL for logical replicaiton") { + // Recovery connection broken and recovery was failed + executor::exit(1, msg.to_owned()); + } + if msg.contains("missing majority of votes, collected") { + // Voting bug when safekeeper disconnects after voting + executor::exit(1, msg.to_owned()); + } + panic!("unknown FATAL error from walproposer: {}", msg); + } + } + + fn after_election(&self, wp: &mut walproposer::bindings::WalProposer) { + let prop_lsn = wp.propEpochStartLsn; + let prop_term = wp.propTerm; + + let mut prev_lsn: u64 = 0; + let mut prev_term: u64 = 0; + + unsafe { + let history = wp.propTermHistory.entries; + let len = wp.propTermHistory.n_entries as usize; + if len > 1 { + let entry = *history.wrapping_add(len - 2); + prev_lsn = entry.lsn; + prev_term = entry.term; + } + } + + let msg = format!( + "prop_elected;{};{};{};{}", + prop_lsn, prop_term, prev_lsn, prev_term + ); + + debug!(msg); + self.os.log_event(msg); + } + + fn get_redo_start_lsn(&self) -> u64 { + debug!("get_redo_start_lsn -> {:?}", self.redo_start_lsn); + self.redo_start_lsn.expect("redo_start_lsn is not set").0 + } + + fn get_shmem_state(&self) -> *mut walproposer::bindings::WalproposerShmemState { + self.shmem.get() + } + + fn start_streaming( + &self, + startpos: u64, + callback: &walproposer::walproposer::StreamingCallback, + ) { + let disk = &self.disk; + let disk_lsn = disk.lock().flush_rec_ptr().0; + debug!("start_streaming at {} (disk_lsn={})", startpos, disk_lsn); + if startpos < disk_lsn { + debug!("startpos < disk_lsn, it means we wrote some transaction even before streaming started"); + } + assert!(startpos <= disk_lsn); + let mut broadcasted = Lsn(startpos); + + loop { + let available = disk.lock().flush_rec_ptr(); + assert!(available >= broadcasted); + 
callback.broadcast(broadcasted, available); + broadcasted = available; + callback.poll(); + } + } + + fn process_safekeeper_feedback( + &self, + wp: &mut walproposer::bindings::WalProposer, + commit_lsn: u64, + ) { + debug!("process_safekeeper_feedback, commit_lsn={}", commit_lsn); + if commit_lsn > wp.lastSentCommitLsn { + self.os.log_event(format!("commit_lsn;{}", commit_lsn)); + } + } + + fn get_flush_rec_ptr(&self) -> u64 { + let lsn = self.disk.lock().flush_rec_ptr(); + debug!("get_flush_rec_ptr: {}", lsn); + lsn.0 + } + + fn recovery_download( + &self, + wp: &mut walproposer::bindings::WalProposer, + sk: &mut walproposer::bindings::Safekeeper, + ) -> bool { + let mut startpos = wp.truncateLsn; + let endpos = wp.propEpochStartLsn; + + if startpos == endpos { + debug!("recovery_download: nothing to download"); + return true; + } + + debug!("recovery_download from {} to {}", startpos, endpos,); + + let replication_prompt = format!( + "START_REPLICATION {} {} {} {}", + self.config.ttid.tenant_id, self.config.ttid.timeline_id, startpos, endpos, + ); + let async_conn = self.get_conn(sk); + + let conn = self.os.open_tcp(async_conn.node_id); + conn.send(desim::proto::AnyMessage::Bytes(replication_prompt.into())); + + let chan = conn.recv_chan(); + while startpos < endpos { + let event = chan.recv(); + match event { + NetEvent::Closed => { + debug!("connection closed in recovery"); + break; + } + NetEvent::Message(AnyMessage::Bytes(b)) => { + debug!("got recovery bytes from safekeeper"); + self.disk.lock().write(startpos, &b); + startpos += b.len() as u64; + } + NetEvent::Message(_) => unreachable!(), + } + } + + debug!("recovery finished at {}", startpos); + + startpos == endpos + } + + fn conn_finish(&self, sk: &mut walproposer::bindings::Safekeeper) { + let mut conn = self.get_conn(sk); + debug!("conn_finish to {}", conn.node_id); + if let Some(socket) = conn.socket.as_mut() { + socket.close(); + } else { + // connection is already closed + } + conn.socket = None; + } + + fn conn_error_message(&self, _sk: &mut walproposer::bindings::Safekeeper) -> String { + "connection is closed, probably".into() + } +} diff --git a/safekeeper/tests/walproposer_sim/walproposer_disk.rs b/safekeeper/tests/walproposer_sim/walproposer_disk.rs new file mode 100644 index 0000000000..aa329bd2f0 --- /dev/null +++ b/safekeeper/tests/walproposer_sim/walproposer_disk.rs @@ -0,0 +1,314 @@ +use std::{ffi::CString, sync::Arc}; + +use byteorder::{LittleEndian, WriteBytesExt}; +use crc32c::crc32c_append; +use parking_lot::{Mutex, MutexGuard}; +use postgres_ffi::{ + pg_constants::{ + RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE, XLP_LONG_HEADER, XLR_BLOCK_ID_DATA_LONG, + XLR_BLOCK_ID_DATA_SHORT, + }, + v16::{ + wal_craft_test_export::{XLogLongPageHeaderData, XLogPageHeaderData, XLOG_PAGE_MAGIC}, + xlog_utils::{ + XLogSegNoOffsetToRecPtr, XlLogicalMessage, XLOG_RECORD_CRC_OFFS, + XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD, + XLP_FIRST_IS_CONTRECORD, + }, + XLogRecord, + }, + WAL_SEGMENT_SIZE, XLOG_BLCKSZ, +}; +use utils::lsn::Lsn; + +use super::block_storage::BlockStorage; + +/// Simulation implementation of walproposer WAL storage. 
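All of the log_event calls above write plain ';'-separated strings (started;…, prop_elected;…, write_wal;…, commit_lsn;…), and validate_events in simulation_logs.rs splits them back apart to check invariants. A stand-alone sketch for one of those formats; parse_write_wal is hypothetical:

```rust
// Hypothetical parser for a "write_wal;<start_lsn>;<end_lsn>;<count>" event string,
// the format emitted by WalProposer::write_tx earlier in this patch.
fn parse_write_wal(data: &str) -> Option<(u64, u64, u64)> {
    let mut parts = data.split(';');
    if parts.next()? != "write_wal" {
        return None;
    }
    let start: u64 = parts.next()?.parse().ok()?;
    let end: u64 = parts.next()?.parse().ok()?;
    let count: u64 = parts.next()?.parse().ok()?;
    Some((start, end, count))
}
```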
+pub struct DiskWalProposer { + state: Mutex, +} + +impl DiskWalProposer { + pub fn new() -> Arc { + Arc::new(DiskWalProposer { + state: Mutex::new(State { + internal_available_lsn: Lsn(0), + prev_lsn: Lsn(0), + disk: BlockStorage::new(), + }), + }) + } + + pub fn lock(&self) -> MutexGuard { + self.state.lock() + } +} + +pub struct State { + // flush_lsn + internal_available_lsn: Lsn, + // needed for WAL generation + prev_lsn: Lsn, + // actual WAL storage + disk: BlockStorage, +} + +impl State { + pub fn read(&self, pos: u64, buf: &mut [u8]) { + self.disk.read(pos, buf); + // TODO: fail on reading uninitialized data + } + + pub fn write(&mut self, pos: u64, buf: &[u8]) { + self.disk.write(pos, buf); + } + + /// Update the internal available LSN to the given value. + pub fn reset_to(&mut self, lsn: Lsn) { + self.internal_available_lsn = lsn; + } + + /// Get current LSN. + pub fn flush_rec_ptr(&self) -> Lsn { + self.internal_available_lsn + } + + /// Generate a new WAL record at the current LSN. + pub fn insert_logical_message(&mut self, prefix: &str, msg: &[u8]) -> anyhow::Result<()> { + let prefix_cstr = CString::new(prefix)?; + let prefix_bytes = prefix_cstr.as_bytes_with_nul(); + + let lm = XlLogicalMessage { + db_id: 0, + transactional: 0, + prefix_size: prefix_bytes.len() as ::std::os::raw::c_ulong, + message_size: msg.len() as ::std::os::raw::c_ulong, + }; + + let record_bytes = lm.encode(); + let rdatas: Vec<&[u8]> = vec![&record_bytes, prefix_bytes, msg]; + insert_wal_record(self, rdatas, RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE) + } +} + +fn insert_wal_record( + state: &mut State, + rdatas: Vec<&[u8]>, + rmid: u8, + info: u8, +) -> anyhow::Result<()> { + // bytes right after the header, in the same rdata block + let mut scratch = Vec::new(); + let mainrdata_len: usize = rdatas.iter().map(|rdata| rdata.len()).sum(); + + if mainrdata_len > 0 { + if mainrdata_len > 255 { + scratch.push(XLR_BLOCK_ID_DATA_LONG); + // TODO: verify endiness + let _ = scratch.write_u32::(mainrdata_len as u32); + } else { + scratch.push(XLR_BLOCK_ID_DATA_SHORT); + scratch.push(mainrdata_len as u8); + } + } + + let total_len: u32 = (XLOG_SIZE_OF_XLOG_RECORD + scratch.len() + mainrdata_len) as u32; + let size = maxalign(total_len); + assert!(size as usize > XLOG_SIZE_OF_XLOG_RECORD); + + let start_bytepos = recptr_to_bytepos(state.internal_available_lsn); + let end_bytepos = start_bytepos + size as u64; + + let start_recptr = bytepos_to_recptr(start_bytepos); + let end_recptr = bytepos_to_recptr(end_bytepos); + + assert!(recptr_to_bytepos(start_recptr) == start_bytepos); + assert!(recptr_to_bytepos(end_recptr) == end_bytepos); + + let mut crc = crc32c_append(0, &scratch); + for rdata in &rdatas { + crc = crc32c_append(crc, rdata); + } + + let mut header = XLogRecord { + xl_tot_len: total_len, + xl_xid: 0, + xl_prev: state.prev_lsn.0, + xl_info: info, + xl_rmid: rmid, + __bindgen_padding_0: [0u8; 2usize], + xl_crc: crc, + }; + + // now we have the header and can finish the crc + let header_bytes = header.encode()?; + let crc = crc32c_append(crc, &header_bytes[0..XLOG_RECORD_CRC_OFFS]); + header.xl_crc = crc; + + let mut header_bytes = header.encode()?.to_vec(); + assert!(header_bytes.len() == XLOG_SIZE_OF_XLOG_RECORD); + + header_bytes.extend_from_slice(&scratch); + + // finish rdatas + let mut rdatas = rdatas; + rdatas.insert(0, &header_bytes); + + write_walrecord_to_disk(state, total_len as u64, rdatas, start_recptr, end_recptr)?; + + state.internal_available_lsn = end_recptr; + state.prev_lsn = start_recptr; 
+    Ok(())
+}
+
+fn write_walrecord_to_disk(
+    state: &mut State,
+    total_len: u64,
+    rdatas: Vec<&[u8]>,
+    start: Lsn,
+    end: Lsn,
+) -> anyhow::Result<()> {
+    let mut curr_ptr = start;
+    let mut freespace = insert_freespace(curr_ptr);
+    let mut written: usize = 0;
+
+    assert!(freespace >= std::mem::size_of::<u32>());
+
+    for mut rdata in rdatas {
+        while rdata.len() >= freespace {
+            assert!(
+                curr_ptr.segment_offset(WAL_SEGMENT_SIZE) >= XLOG_SIZE_OF_XLOG_SHORT_PHD
+                    || freespace == 0
+            );
+
+            state.write(curr_ptr.0, &rdata[..freespace]);
+            rdata = &rdata[freespace..];
+            written += freespace;
+            curr_ptr = Lsn(curr_ptr.0 + freespace as u64);
+
+            let mut new_page = XLogPageHeaderData {
+                xlp_magic: XLOG_PAGE_MAGIC as u16,
+                xlp_info: XLP_BKP_REMOVABLE,
+                xlp_tli: 1,
+                xlp_pageaddr: curr_ptr.0,
+                xlp_rem_len: (total_len - written as u64) as u32,
+                ..Default::default() // Put 0 in padding fields.
+            };
+            if new_page.xlp_rem_len > 0 {
+                new_page.xlp_info |= XLP_FIRST_IS_CONTRECORD;
+            }
+
+            if curr_ptr.segment_offset(WAL_SEGMENT_SIZE) == 0 {
+                new_page.xlp_info |= XLP_LONG_HEADER;
+                let long_page = XLogLongPageHeaderData {
+                    std: new_page,
+                    xlp_sysid: 0,
+                    xlp_seg_size: WAL_SEGMENT_SIZE as u32,
+                    xlp_xlog_blcksz: XLOG_BLCKSZ as u32,
+                };
+                let header_bytes = long_page.encode()?;
+                assert!(header_bytes.len() == XLOG_SIZE_OF_XLOG_LONG_PHD);
+                state.write(curr_ptr.0, &header_bytes);
+                curr_ptr = Lsn(curr_ptr.0 + header_bytes.len() as u64);
+            } else {
+                let header_bytes = new_page.encode()?;
+                assert!(header_bytes.len() == XLOG_SIZE_OF_XLOG_SHORT_PHD);
+                state.write(curr_ptr.0, &header_bytes);
+                curr_ptr = Lsn(curr_ptr.0 + header_bytes.len() as u64);
+            }
+            freespace = insert_freespace(curr_ptr);
+        }
+
+        assert!(
+            curr_ptr.segment_offset(WAL_SEGMENT_SIZE) >= XLOG_SIZE_OF_XLOG_SHORT_PHD
+                || rdata.is_empty()
+        );
+        state.write(curr_ptr.0, rdata);
+        curr_ptr = Lsn(curr_ptr.0 + rdata.len() as u64);
+        written += rdata.len();
+        freespace -= rdata.len();
+    }
+
+    assert!(written == total_len as usize);
+    curr_ptr.0 = maxalign(curr_ptr.0);
+    assert!(curr_ptr == end);
+    Ok(())
+}
+
+fn maxalign<T>(size: T) -> T
+where
+    T: std::ops::BitAnd<Output = T>
+        + std::ops::Add<Output = T>
+        + std::ops::Not<Output = T>
+        + From<u8>,
+{
+    (size + T::from(7)) & !T::from(7)
+}
+
+fn insert_freespace(ptr: Lsn) -> usize {
+    if ptr.block_offset() == 0 {
+        0
+    } else {
+        (XLOG_BLCKSZ as u64 - ptr.block_offset()) as usize
+    }
+}
+
+const XLP_BKP_REMOVABLE: u16 = 0x0004;
+const USABLE_BYTES_IN_PAGE: u64 = (XLOG_BLCKSZ - XLOG_SIZE_OF_XLOG_SHORT_PHD) as u64;
+const USABLE_BYTES_IN_SEGMENT: u64 = ((WAL_SEGMENT_SIZE / XLOG_BLCKSZ) as u64
+    * USABLE_BYTES_IN_PAGE)
+    - (XLOG_SIZE_OF_XLOG_RECORD - XLOG_SIZE_OF_XLOG_SHORT_PHD) as u64;
+
+fn bytepos_to_recptr(bytepos: u64) -> Lsn {
+    let fullsegs = bytepos / USABLE_BYTES_IN_SEGMENT;
+    let mut bytesleft = bytepos % USABLE_BYTES_IN_SEGMENT;
+
+    let seg_offset = if bytesleft < (XLOG_BLCKSZ - XLOG_SIZE_OF_XLOG_SHORT_PHD) as u64 {
+        // fits on first page of segment
+        bytesleft + XLOG_SIZE_OF_XLOG_SHORT_PHD as u64
+    } else {
+        // account for the first page on segment with long header
+        bytesleft -= (XLOG_BLCKSZ - XLOG_SIZE_OF_XLOG_SHORT_PHD) as u64;
+        let fullpages = bytesleft / USABLE_BYTES_IN_PAGE;
+        bytesleft %= USABLE_BYTES_IN_PAGE;
+
+        XLOG_BLCKSZ as u64
+            + fullpages * XLOG_BLCKSZ as u64
+            + bytesleft
+            + XLOG_SIZE_OF_XLOG_SHORT_PHD as u64
+    };
+
+    Lsn(XLogSegNoOffsetToRecPtr(
+        fullsegs,
+        seg_offset as u32,
+        WAL_SEGMENT_SIZE,
+    ))
+}
+
+fn recptr_to_bytepos(ptr: Lsn) -> u64 {
+    let fullsegs =
ptr.segment_number(WAL_SEGMENT_SIZE); + let offset = ptr.segment_offset(WAL_SEGMENT_SIZE) as u64; + + let fullpages = offset / XLOG_BLCKSZ as u64; + let offset = offset % XLOG_BLCKSZ as u64; + + if fullpages == 0 { + fullsegs * USABLE_BYTES_IN_SEGMENT + + if offset > 0 { + assert!(offset >= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64); + offset - XLOG_SIZE_OF_XLOG_SHORT_PHD as u64 + } else { + 0 + } + } else { + fullsegs * USABLE_BYTES_IN_SEGMENT + + (XLOG_BLCKSZ - XLOG_SIZE_OF_XLOG_SHORT_PHD) as u64 + + (fullpages - 1) * USABLE_BYTES_IN_PAGE + + if offset > 0 { + assert!(offset >= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64); + offset - XLOG_SIZE_OF_XLOG_SHORT_PHD as u64 + } else { + 0 + } + } +} From a8eb4042baa6ca1ae4268a1f1b22a89941b0d942 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 13 Feb 2024 07:00:50 +0000 Subject: [PATCH 0164/1571] tests: test_secondary_mode_eviction: avoid use of mocked statvfs (#6698) ## Problem Test sometimes fails with `used_blocks > total_blocks`, because when using mocked statvfs with the total blocks set to the size of data on disk before starting, we are implicitly asserting that nothing at all can be written to disk between startup and calling statvfs. Related: https://github.com/neondatabase/neon/issues/6511 ## Summary of changes - Use HTTP API to invoke disk usage eviction instead of mocked statvfs --- .../regress/test_disk_usage_eviction.py | 33 +++---------------- 1 file changed, 5 insertions(+), 28 deletions(-) diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 061c57c88b..eb4e370ea7 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -893,37 +893,14 @@ def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv): # in its heatmap ps_secondary.http_client().tenant_secondary_download(tenant_id) - # Configure the secondary pageserver to have a phony small disk size - ps_secondary.stop() total_size, _, _ = env.timelines_du(ps_secondary) - blocksize = 512 - total_blocks = (total_size + (blocksize - 1)) // blocksize + evict_bytes = total_size // 3 - min_avail_bytes = total_size // 3 - - env.pageserver_start_with_disk_usage_eviction( - ps_secondary, - period="1s", - max_usage_pct=100, - min_avail_bytes=min_avail_bytes, - mock_behavior={ - "type": "Success", - "blocksize": blocksize, - "total_blocks": total_blocks, - # Only count layer files towards used bytes in the mock_statvfs. - # This avoids accounting for metadata files & tenant conf in the tests. 
- "name_filter": ".*__.*", - }, - eviction_order=EvictionOrder.ABSOLUTE_ORDER, - ) - - def relieved_log_message(): - assert ps_secondary.log_contains(".*disk usage pressure relieved") - - wait_until(10, 1, relieved_log_message) + response = ps_secondary.http_client().disk_usage_eviction_run({"evict_bytes": evict_bytes}) + log.info(f"{response}") post_eviction_total_size, _, _ = env.timelines_du(ps_secondary) assert ( - total_size - post_eviction_total_size >= min_avail_bytes - ), "we requested at least min_avail_bytes worth of free space" + total_size - post_eviction_total_size >= evict_bytes + ), "we requested at least evict_bytes worth of free space" From 331935df91abe03a1e8a081bc96b6ef871f71bb1 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Tue, 13 Feb 2024 17:58:58 +0100 Subject: [PATCH 0165/1571] Proxy: send cancel notifications to all instances (#6719) ## Problem If cancel request ends up on the wrong proxy instance, it doesn't take an effect. ## Summary of changes Send redis notifications to all proxy pods about the cancel request. Related issue: https://github.com/neondatabase/neon/issues/5839, https://github.com/neondatabase/cloud/issues/10262 --- Cargo.lock | 7 +- Cargo.toml | 2 +- libs/pq_proto/Cargo.toml | 1 + libs/pq_proto/src/lib.rs | 3 +- proxy/src/bin/proxy.rs | 32 ++++- proxy/src/cancellation.rs | 109 ++++++++++++++--- proxy/src/config.rs | 1 + proxy/src/metrics.rs | 9 ++ proxy/src/proxy.rs | 16 +-- proxy/src/rate_limiter.rs | 2 +- proxy/src/rate_limiter/limiter.rs | 38 ++++++ proxy/src/redis.rs | 1 + proxy/src/redis/notifications.rs | 197 +++++++++++++++++++++++------- proxy/src/redis/publisher.rs | 80 ++++++++++++ proxy/src/serverless.rs | 13 +- proxy/src/serverless/websocket.rs | 6 +- workspace_hack/Cargo.toml | 4 +- 17 files changed, 432 insertions(+), 89 deletions(-) create mode 100644 proxy/src/redis/publisher.rs diff --git a/Cargo.lock b/Cargo.lock index f11c774016..45a313a72b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2263,11 +2263,11 @@ dependencies = [ [[package]] name = "hashlink" -version = "0.8.2" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0761a1b9491c4f2e3d66aa0f62d0fba0af9a0e2852e4d48ea506632a4b56e6aa" +checksum = "e8094feaf31ff591f651a2664fb9cfd92bba7a60ce3197265e9482ebe753c8f7" dependencies = [ - "hashbrown 0.13.2", + "hashbrown 0.14.0", ] [[package]] @@ -3952,6 +3952,7 @@ dependencies = [ "pin-project-lite", "postgres-protocol", "rand 0.8.5", + "serde", "thiserror", "tokio", "tracing", diff --git a/Cargo.toml b/Cargo.toml index 8df9ca9988..8952f7627f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -81,7 +81,7 @@ futures-core = "0.3" futures-util = "0.3" git-version = "0.3" hashbrown = "0.13" -hashlink = "0.8.1" +hashlink = "0.8.4" hdrhistogram = "7.5.2" hex = "0.4" hex-literal = "0.4" diff --git a/libs/pq_proto/Cargo.toml b/libs/pq_proto/Cargo.toml index b286eb0358..6eeb3bafef 100644 --- a/libs/pq_proto/Cargo.toml +++ b/libs/pq_proto/Cargo.toml @@ -13,5 +13,6 @@ rand.workspace = true tokio.workspace = true tracing.workspace = true thiserror.workspace = true +serde.workspace = true workspace_hack.workspace = true diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index c52a21bcd3..522b65f5d1 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -7,6 +7,7 @@ pub mod framed; use byteorder::{BigEndian, ReadBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; +use serde::{Deserialize, Serialize}; use std::{borrow::Cow, 
collections::HashMap, fmt, io, str}; // re-export for use in utils pageserver_feedback.rs @@ -123,7 +124,7 @@ impl StartupMessageParams { } } -#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] +#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)] pub struct CancelKeyData { pub backend_pid: i32, pub cancel_key: i32, diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 00a229c135..b3d4fc0411 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -1,6 +1,8 @@ use futures::future::Either; use proxy::auth; use proxy::auth::backend::MaybeOwned; +use proxy::cancellation::CancelMap; +use proxy::cancellation::CancellationHandler; use proxy::config::AuthenticationConfig; use proxy::config::CacheOptions; use proxy::config::HttpConfig; @@ -12,6 +14,7 @@ use proxy::rate_limiter::EndpointRateLimiter; use proxy::rate_limiter::RateBucketInfo; use proxy::rate_limiter::RateLimiterConfig; use proxy::redis::notifications; +use proxy::redis::publisher::RedisPublisherClient; use proxy::serverless::GlobalConnPoolOptions; use proxy::usage_metrics; @@ -22,6 +25,7 @@ use std::net::SocketAddr; use std::pin::pin; use std::sync::Arc; use tokio::net::TcpListener; +use tokio::sync::Mutex; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::info; @@ -129,6 +133,9 @@ struct ProxyCliArgs { /// Can be given multiple times for different bucket sizes. #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] endpoint_rps_limit: Vec, + /// Redis rate limiter max number of requests per second. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] + redis_rps_limit: Vec, /// Initial limit for dynamic rate limiter. Makes sense only if `rate_limit_algorithm` is *not* `None`. #[clap(long, default_value_t = 100)] initial_limit: usize, @@ -225,6 +232,19 @@ async fn main() -> anyhow::Result<()> { let cancellation_token = CancellationToken::new(); let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(&config.endpoint_rps_limit)); + let cancel_map = CancelMap::default(); + let redis_publisher = match &args.redis_notifications { + Some(url) => Some(Arc::new(Mutex::new(RedisPublisherClient::new( + url, + args.region.clone(), + &config.redis_rps_limit, + )?))), + None => None, + }; + let cancellation_handler = Arc::new(CancellationHandler::new( + cancel_map.clone(), + redis_publisher, + )); // client facing tasks. these will exit on error or on cancellation // cancellation returns Ok(()) @@ -234,6 +254,7 @@ async fn main() -> anyhow::Result<()> { proxy_listener, cancellation_token.clone(), endpoint_rate_limiter.clone(), + cancellation_handler.clone(), )); // TODO: rename the argument to something like serverless. 
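+        // If the key is not present in this instance's map, the session may be
+        // owned by another proxy pod; in that case the request is forwarded to
+        // all instances via a Redis notification below.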
@@ -248,6 +269,7 @@ async fn main() -> anyhow::Result<()> { serverless_listener, cancellation_token.clone(), endpoint_rate_limiter.clone(), + cancellation_handler.clone(), )); } @@ -271,7 +293,12 @@ async fn main() -> anyhow::Result<()> { let cache = api.caches.project_info.clone(); if let Some(url) = args.redis_notifications { info!("Starting redis notifications listener ({url})"); - maintenance_tasks.spawn(notifications::task_main(url.to_owned(), cache.clone())); + maintenance_tasks.spawn(notifications::task_main( + url.to_owned(), + cache.clone(), + cancel_map.clone(), + args.region.clone(), + )); } maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); } @@ -403,6 +430,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { let mut endpoint_rps_limit = args.endpoint_rps_limit.clone(); RateBucketInfo::validate(&mut endpoint_rps_limit)?; + let mut redis_rps_limit = args.redis_rps_limit.clone(); + RateBucketInfo::validate(&mut redis_rps_limit)?; let config = Box::leak(Box::new(ProxyConfig { tls_config, @@ -414,6 +443,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { require_client_ip: args.require_client_ip, disable_ip_check_for_http: args.disable_ip_check_for_http, endpoint_rps_limit, + redis_rps_limit, handshake_timeout: args.handshake_timeout, // TODO: add this argument region: args.region.clone(), diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index fe614628d8..93a77bc4ae 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -1,16 +1,28 @@ +use async_trait::async_trait; use dashmap::DashMap; use pq_proto::CancelKeyData; use std::{net::SocketAddr, sync::Arc}; use thiserror::Error; use tokio::net::TcpStream; +use tokio::sync::Mutex; use tokio_postgres::{CancelToken, NoTls}; use tracing::info; +use uuid::Uuid; -use crate::error::ReportableError; +use crate::{ + error::ReportableError, metrics::NUM_CANCELLATION_REQUESTS, + redis::publisher::RedisPublisherClient, +}; + +pub type CancelMap = Arc>>; /// Enables serving `CancelRequest`s. -#[derive(Default)] -pub struct CancelMap(DashMap>); +/// +/// If there is a `RedisPublisherClient` available, it will be used to publish the cancellation key to other proxy instances. +pub struct CancellationHandler { + map: CancelMap, + redis_client: Option>>, +} #[derive(Debug, Error)] pub enum CancelError { @@ -32,15 +44,43 @@ impl ReportableError for CancelError { } } -impl CancelMap { +impl CancellationHandler { + pub fn new(map: CancelMap, redis_client: Option>>) -> Self { + Self { map, redis_client } + } /// Cancel a running query for the corresponding connection. - pub async fn cancel_session(&self, key: CancelKeyData) -> Result<(), CancelError> { + pub async fn cancel_session( + &self, + key: CancelKeyData, + session_id: Uuid, + ) -> Result<(), CancelError> { + let from = "from_client"; // NB: we should immediately release the lock after cloning the token. 
- let Some(cancel_closure) = self.0.get(&key).and_then(|x| x.clone()) else { + let Some(cancel_closure) = self.map.get(&key).and_then(|x| x.clone()) else { tracing::warn!("query cancellation key not found: {key}"); + if let Some(redis_client) = &self.redis_client { + NUM_CANCELLATION_REQUESTS + .with_label_values(&[from, "not_found"]) + .inc(); + info!("publishing cancellation key to Redis"); + match redis_client.lock().await.try_publish(key, session_id).await { + Ok(()) => { + info!("cancellation key successfuly published to Redis"); + } + Err(e) => { + tracing::error!("failed to publish a message: {e}"); + return Err(CancelError::IO(std::io::Error::new( + std::io::ErrorKind::Other, + e.to_string(), + ))); + } + } + } return Ok(()); }; - + NUM_CANCELLATION_REQUESTS + .with_label_values(&[from, "found"]) + .inc(); info!("cancelling query per user's request using key {key}"); cancel_closure.try_cancel_query().await } @@ -57,7 +97,7 @@ impl CancelMap { // Random key collisions are unlikely to happen here, but they're still possible, // which is why we have to take care not to rewrite an existing key. - match self.0.entry(key) { + match self.map.entry(key) { dashmap::mapref::entry::Entry::Occupied(_) => continue, dashmap::mapref::entry::Entry::Vacant(e) => { e.insert(None); @@ -69,18 +109,46 @@ impl CancelMap { info!("registered new query cancellation key {key}"); Session { key, - cancel_map: self, + cancellation_handler: self, } } #[cfg(test)] fn contains(&self, session: &Session) -> bool { - self.0.contains_key(&session.key) + self.map.contains_key(&session.key) } #[cfg(test)] fn is_empty(&self) -> bool { - self.0.is_empty() + self.map.is_empty() + } +} + +#[async_trait] +pub trait NotificationsCancellationHandler { + async fn cancel_session_no_publish(&self, key: CancelKeyData) -> Result<(), CancelError>; +} + +#[async_trait] +impl NotificationsCancellationHandler for CancellationHandler { + async fn cancel_session_no_publish(&self, key: CancelKeyData) -> Result<(), CancelError> { + let from = "from_redis"; + let cancel_closure = self.map.get(&key).and_then(|x| x.clone()); + match cancel_closure { + Some(cancel_closure) => { + NUM_CANCELLATION_REQUESTS + .with_label_values(&[from, "found"]) + .inc(); + cancel_closure.try_cancel_query().await + } + None => { + NUM_CANCELLATION_REQUESTS + .with_label_values(&[from, "not_found"]) + .inc(); + tracing::warn!("query cancellation key not found: {key}"); + Ok(()) + } + } } } @@ -115,7 +183,7 @@ pub struct Session { /// The user-facing key identifying this session. key: CancelKeyData, /// The [`CancelMap`] this session belongs to. - cancel_map: Arc, + cancellation_handler: Arc, } impl Session { @@ -123,7 +191,9 @@ impl Session { /// This enables query cancellation in `crate::proxy::prepare_client_connection`. 
pub fn enable_query_cancellation(&self, cancel_closure: CancelClosure) -> CancelKeyData { info!("enabling query cancellation for this session"); - self.cancel_map.0.insert(self.key, Some(cancel_closure)); + self.cancellation_handler + .map + .insert(self.key, Some(cancel_closure)); self.key } @@ -131,7 +201,7 @@ impl Session { impl Drop for Session { fn drop(&mut self) { - self.cancel_map.0.remove(&self.key); + self.cancellation_handler.map.remove(&self.key); info!("dropped query cancellation key {}", &self.key); } } @@ -142,13 +212,16 @@ mod tests { #[tokio::test] async fn check_session_drop() -> anyhow::Result<()> { - let cancel_map: Arc = Default::default(); + let cancellation_handler = Arc::new(CancellationHandler { + map: CancelMap::default(), + redis_client: None, + }); - let session = cancel_map.clone().get_session(); - assert!(cancel_map.contains(&session)); + let session = cancellation_handler.clone().get_session(); + assert!(cancellation_handler.contains(&session)); drop(session); // Check that the session has been dropped. - assert!(cancel_map.is_empty()); + assert!(cancellation_handler.is_empty()); Ok(()) } diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 5fcb537834..9f276c3c24 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -21,6 +21,7 @@ pub struct ProxyConfig { pub require_client_ip: bool, pub disable_ip_check_for_http: bool, pub endpoint_rps_limit: Vec, + pub redis_rps_limit: Vec, pub region: String, pub handshake_timeout: Duration, } diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index f7f162a075..66031f5eb2 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -152,6 +152,15 @@ pub static NUM_OPEN_CLIENTS_IN_HTTP_POOL: Lazy = Lazy::new(|| { .unwrap() }); +pub static NUM_CANCELLATION_REQUESTS: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "proxy_cancellation_requests_total", + "Number of cancellation requests (per found/not_found).", + &["source", "kind"], + ) + .unwrap() +}); + #[derive(Clone)] pub struct LatencyTimer { // time since the stopwatch was started diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 5f65de4c98..ce77098a5f 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -10,7 +10,7 @@ pub mod wake_compute; use crate::{ auth, - cancellation::{self, CancelMap}, + cancellation::{self, CancellationHandler}, compute, config::{ProxyConfig, TlsConfig}, context::RequestMonitoring, @@ -62,6 +62,7 @@ pub async fn task_main( listener: tokio::net::TcpListener, cancellation_token: CancellationToken, endpoint_rate_limiter: Arc, + cancellation_handler: Arc, ) -> anyhow::Result<()> { scopeguard::defer! 
{ info!("proxy has shut down"); @@ -72,7 +73,6 @@ pub async fn task_main( socket2::SockRef::from(&listener).set_keepalive(true)?; let connections = tokio_util::task::task_tracker::TaskTracker::new(); - let cancel_map = Arc::new(CancelMap::default()); while let Some(accept_result) = run_until_cancelled(listener.accept(), &cancellation_token).await @@ -80,7 +80,7 @@ pub async fn task_main( let (socket, peer_addr) = accept_result?; let session_id = uuid::Uuid::new_v4(); - let cancel_map = Arc::clone(&cancel_map); + let cancellation_handler = Arc::clone(&cancellation_handler); let endpoint_rate_limiter = endpoint_rate_limiter.clone(); let session_span = info_span!( @@ -113,7 +113,7 @@ pub async fn task_main( let res = handle_client( config, &mut ctx, - cancel_map, + cancellation_handler, socket, ClientMode::Tcp, endpoint_rate_limiter, @@ -227,7 +227,7 @@ impl ReportableError for ClientRequestError { pub async fn handle_client( config: &'static ProxyConfig, ctx: &mut RequestMonitoring, - cancel_map: Arc, + cancellation_handler: Arc, stream: S, mode: ClientMode, endpoint_rate_limiter: Arc, @@ -253,8 +253,8 @@ pub async fn handle_client( match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? { HandshakeData::Startup(stream, params) => (stream, params), HandshakeData::Cancel(cancel_key_data) => { - return Ok(cancel_map - .cancel_session(cancel_key_data) + return Ok(cancellation_handler + .cancel_session(cancel_key_data, ctx.session_id) .await .map(|()| None)?) } @@ -315,7 +315,7 @@ pub async fn handle_client( .or_else(|e| stream.throw_error(e)) .await?; - let session = cancel_map.get_session(); + let session = cancellation_handler.get_session(); prepare_client_connection(&node, &session, &mut stream).await?; // Before proxy passing, forward to compute whatever data is left in the diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs index b26386d159..f0da4ead23 100644 --- a/proxy/src/rate_limiter.rs +++ b/proxy/src/rate_limiter.rs @@ -4,4 +4,4 @@ mod limiter; pub use aimd::Aimd; pub use limit_algorithm::{AimdConfig, Fixed, RateLimitAlgorithm, RateLimiterConfig}; pub use limiter::Limiter; -pub use limiter::{EndpointRateLimiter, RateBucketInfo}; +pub use limiter::{EndpointRateLimiter, RateBucketInfo, RedisRateLimiter}; diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index cbae72711c..3181060e2f 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -22,6 +22,44 @@ use super::{ RateLimiterConfig, }; +pub struct RedisRateLimiter { + data: Vec, + info: &'static [RateBucketInfo], +} + +impl RedisRateLimiter { + pub fn new(info: &'static [RateBucketInfo]) -> Self { + Self { + data: vec![ + RateBucket { + start: Instant::now(), + count: 0, + }; + info.len() + ], + info, + } + } + + /// Check that number of connections is below `max_rps` rps. + pub fn check(&mut self) -> bool { + let now = Instant::now(); + + let should_allow_request = self + .data + .iter_mut() + .zip(self.info) + .all(|(bucket, info)| bucket.should_allow_request(info, now)); + + if should_allow_request { + // only increment the bucket counts if the request will actually be accepted + self.data.iter_mut().for_each(RateBucket::inc); + } + + should_allow_request + } +} + // Simple per-endpoint rate limiter. // // Check that number of connections to the endpoint is below `max_rps` rps. 
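As an aside on the rate-limit plumbing added above: a request is admitted only when every configured bucket still has room, and the per-bucket counters are incremented only for admitted requests. A self-contained sketch of that pattern (illustrative only; `Bucket` and `BucketInfo` are hypothetical stand-ins for the proxy's `RateBucket`/`RateBucketInfo`, whose window-reset details are not shown in this diff):

```rust
use std::time::{Duration, Instant};

// Stand-ins for the proxy's RateBucketInfo / RateBucket types.
struct BucketInfo {
    window: Duration,
    max: u64,
}

struct Bucket {
    start: Instant,
    count: u64,
}

impl Bucket {
    // Reset the window once it has elapsed, then report whether another
    // request still fits under `info.max`.
    fn allows(&mut self, info: &BucketInfo, now: Instant) -> bool {
        if now.duration_since(self.start) >= info.window {
            self.start = now;
            self.count = 0;
        }
        self.count < info.max
    }
}

fn check(buckets: &mut [Bucket], infos: &[BucketInfo]) -> bool {
    let now = Instant::now();
    // Admit the request only if every configured window has room...
    let allowed = buckets
        .iter_mut()
        .zip(infos)
        .all(|(bucket, info)| bucket.allows(info, now));
    // ...and bump the counters only for requests that are actually admitted.
    if allowed {
        buckets.iter_mut().for_each(|bucket| bucket.count += 1);
    }
    allowed
}

fn main() {
    let infos = [BucketInfo { window: Duration::from_secs(1), max: 2 }];
    let mut buckets = [Bucket { start: Instant::now(), count: 0 }];
    assert!(check(&mut buckets, &infos));
    assert!(check(&mut buckets, &infos));
    // The third request inside the same one-second window is rejected.
    assert!(!check(&mut buckets, &infos));
}
```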
diff --git a/proxy/src/redis.rs b/proxy/src/redis.rs index c2a91bed97..35d6db074e 100644 --- a/proxy/src/redis.rs +++ b/proxy/src/redis.rs @@ -1 +1,2 @@ pub mod notifications; +pub mod publisher; diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 158884aa17..b8297a206c 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -1,38 +1,44 @@ use std::{convert::Infallible, sync::Arc}; use futures::StreamExt; +use pq_proto::CancelKeyData; use redis::aio::PubSub; -use serde::Deserialize; +use serde::{Deserialize, Serialize}; +use uuid::Uuid; use crate::{ cache::project_info::ProjectInfoCache, + cancellation::{CancelMap, CancellationHandler, NotificationsCancellationHandler}, intern::{ProjectIdInt, RoleNameInt}, }; -const CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; +const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; +pub(crate) const PROXY_CHANNEL_NAME: &str = "neondb-proxy-to-proxy-updates"; const RECONNECT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(20); const INVALIDATION_LAG: std::time::Duration = std::time::Duration::from_secs(20); -struct ConsoleRedisClient { +struct RedisConsumerClient { client: redis::Client, } -impl ConsoleRedisClient { +impl RedisConsumerClient { pub fn new(url: &str) -> anyhow::Result { let client = redis::Client::open(url)?; Ok(Self { client }) } async fn try_connect(&self) -> anyhow::Result { let mut conn = self.client.get_async_connection().await?.into_pubsub(); - tracing::info!("subscribing to a channel `{CHANNEL_NAME}`"); - conn.subscribe(CHANNEL_NAME).await?; + tracing::info!("subscribing to a channel `{CPLANE_CHANNEL_NAME}`"); + conn.subscribe(CPLANE_CHANNEL_NAME).await?; + tracing::info!("subscribing to a channel `{PROXY_CHANNEL_NAME}`"); + conn.subscribe(PROXY_CHANNEL_NAME).await?; Ok(conn) } } -#[derive(Clone, Debug, Deserialize, Eq, PartialEq)] +#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] #[serde(tag = "topic", content = "data")] -enum Notification { +pub(crate) enum Notification { #[serde( rename = "/allowed_ips_updated", deserialize_with = "deserialize_json_string" @@ -45,16 +51,25 @@ enum Notification { deserialize_with = "deserialize_json_string" )] PasswordUpdate { password_update: PasswordUpdate }, + #[serde(rename = "/cancel_session")] + Cancel(CancelSession), } -#[derive(Clone, Debug, Deserialize, Eq, PartialEq)] -struct AllowedIpsUpdate { +#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] +pub(crate) struct AllowedIpsUpdate { project_id: ProjectIdInt, } -#[derive(Clone, Debug, Deserialize, Eq, PartialEq)] -struct PasswordUpdate { +#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] +pub(crate) struct PasswordUpdate { project_id: ProjectIdInt, role_name: RoleNameInt, } +#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] +pub(crate) struct CancelSession { + pub region_id: Option, + pub cancel_key_data: CancelKeyData, + pub session_id: Uuid, +} + fn deserialize_json_string<'de, D, T>(deserializer: D) -> Result where T: for<'de2> serde::Deserialize<'de2>, @@ -64,6 +79,88 @@ where serde_json::from_str(&s).map_err(::custom) } +struct MessageHandler< + C: ProjectInfoCache + Send + Sync + 'static, + H: NotificationsCancellationHandler + Send + Sync + 'static, +> { + cache: Arc, + cancellation_handler: Arc, + region_id: String, +} + +impl< + C: ProjectInfoCache + Send + Sync + 'static, + H: NotificationsCancellationHandler + Send + Sync + 'static, + > MessageHandler +{ + pub fn new(cache: Arc, 
cancellation_handler: Arc, region_id: String) -> Self { + Self { + cache, + cancellation_handler, + region_id, + } + } + pub fn disable_ttl(&self) { + self.cache.disable_ttl(); + } + pub fn enable_ttl(&self) { + self.cache.enable_ttl(); + } + #[tracing::instrument(skip(self, msg), fields(session_id = tracing::field::Empty))] + async fn handle_message(&self, msg: redis::Msg) -> anyhow::Result<()> { + use Notification::*; + let payload: String = msg.get_payload()?; + tracing::debug!(?payload, "received a message payload"); + + let msg: Notification = match serde_json::from_str(&payload) { + Ok(msg) => msg, + Err(e) => { + tracing::error!("broken message: {e}"); + return Ok(()); + } + }; + tracing::debug!(?msg, "received a message"); + match msg { + Cancel(cancel_session) => { + tracing::Span::current().record( + "session_id", + &tracing::field::display(cancel_session.session_id), + ); + if let Some(cancel_region) = cancel_session.region_id { + // If the message is not for this region, ignore it. + if cancel_region != self.region_id { + return Ok(()); + } + } + // This instance of cancellation_handler doesn't have a RedisPublisherClient so it can't publish the message. + match self + .cancellation_handler + .cancel_session_no_publish(cancel_session.cancel_key_data) + .await + { + Ok(()) => {} + Err(e) => { + tracing::error!("failed to cancel session: {e}"); + } + } + } + _ => { + invalidate_cache(self.cache.clone(), msg.clone()); + // It might happen that the invalid entry is on the way to be cached. + // To make sure that the entry is invalidated, let's repeat the invalidation in INVALIDATION_LAG seconds. + // TODO: include the version (or the timestamp) in the message and invalidate only if the entry is cached before the message. + let cache = self.cache.clone(); + tokio::spawn(async move { + tokio::time::sleep(INVALIDATION_LAG).await; + invalidate_cache(cache, msg); + }); + } + } + + Ok(()) + } +} + fn invalidate_cache(cache: Arc, msg: Notification) { use Notification::*; match msg { @@ -74,50 +171,33 @@ fn invalidate_cache(cache: Arc, msg: Notification) { password_update.project_id, password_update.role_name, ), + Cancel(_) => unreachable!("cancel message should be handled separately"), } } -#[tracing::instrument(skip(cache))] -fn handle_message(msg: redis::Msg, cache: Arc) -> anyhow::Result<()> -where - C: ProjectInfoCache + Send + Sync + 'static, -{ - let payload: String = msg.get_payload()?; - tracing::debug!(?payload, "received a message payload"); - - let msg: Notification = match serde_json::from_str(&payload) { - Ok(msg) => msg, - Err(e) => { - tracing::error!("broken message: {e}"); - return Ok(()); - } - }; - tracing::debug!(?msg, "received a message"); - invalidate_cache(cache.clone(), msg.clone()); - // It might happen that the invalid entry is on the way to be cached. - // To make sure that the entry is invalidated, let's repeat the invalidation in INVALIDATION_LAG seconds. - // TODO: include the version (or the timestamp) in the message and invalidate only if the entry is cached before the message. - tokio::spawn(async move { - tokio::time::sleep(INVALIDATION_LAG).await; - invalidate_cache(cache, msg.clone()); - }); - - Ok(()) -} - /// Handle console's invalidation messages. 
#[tracing::instrument(name = "console_notifications", skip_all)] -pub async fn task_main(url: String, cache: Arc) -> anyhow::Result +pub async fn task_main( + url: String, + cache: Arc, + cancel_map: CancelMap, + region_id: String, +) -> anyhow::Result where C: ProjectInfoCache + Send + Sync + 'static, { cache.enable_ttl(); + let handler = MessageHandler::new( + cache, + Arc::new(CancellationHandler::new(cancel_map, None)), + region_id, + ); loop { - let redis = ConsoleRedisClient::new(&url)?; + let redis = RedisConsumerClient::new(&url)?; let conn = match redis.try_connect().await { Ok(conn) => { - cache.disable_ttl(); + handler.disable_ttl(); conn } Err(e) => { @@ -130,7 +210,7 @@ where }; let mut stream = conn.into_on_message(); while let Some(msg) = stream.next().await { - match handle_message(msg, cache.clone()) { + match handler.handle_message(msg).await { Ok(()) => {} Err(e) => { tracing::error!("failed to handle message: {e}, will try to reconnect"); @@ -138,7 +218,7 @@ where } } } - cache.enable_ttl(); + handler.enable_ttl(); } } @@ -198,6 +278,33 @@ mod tests { } ); + Ok(()) + } + #[test] + fn parse_cancel_session() -> anyhow::Result<()> { + let cancel_key_data = CancelKeyData { + backend_pid: 42, + cancel_key: 41, + }; + let uuid = uuid::Uuid::new_v4(); + let msg = Notification::Cancel(CancelSession { + cancel_key_data, + region_id: None, + session_id: uuid, + }); + let text = serde_json::to_string(&msg)?; + let result: Notification = serde_json::from_str(&text)?; + assert_eq!(msg, result); + + let msg = Notification::Cancel(CancelSession { + cancel_key_data, + region_id: Some("region".to_string()), + session_id: uuid, + }); + let text = serde_json::to_string(&msg)?; + let result: Notification = serde_json::from_str(&text)?; + assert_eq!(msg, result,); + Ok(()) } } diff --git a/proxy/src/redis/publisher.rs b/proxy/src/redis/publisher.rs new file mode 100644 index 0000000000..f85593afdd --- /dev/null +++ b/proxy/src/redis/publisher.rs @@ -0,0 +1,80 @@ +use pq_proto::CancelKeyData; +use redis::AsyncCommands; +use uuid::Uuid; + +use crate::rate_limiter::{RateBucketInfo, RedisRateLimiter}; + +use super::notifications::{CancelSession, Notification, PROXY_CHANNEL_NAME}; + +pub struct RedisPublisherClient { + client: redis::Client, + publisher: Option, + region_id: String, + limiter: RedisRateLimiter, +} + +impl RedisPublisherClient { + pub fn new( + url: &str, + region_id: String, + info: &'static [RateBucketInfo], + ) -> anyhow::Result { + let client = redis::Client::open(url)?; + Ok(Self { + client, + publisher: None, + region_id, + limiter: RedisRateLimiter::new(info), + }) + } + pub async fn try_publish( + &mut self, + cancel_key_data: CancelKeyData, + session_id: Uuid, + ) -> anyhow::Result<()> { + if !self.limiter.check() { + tracing::info!("Rate limit exceeded. Skipping cancellation message"); + return Err(anyhow::anyhow!("Rate limit exceeded")); + } + match self.publish(cancel_key_data, session_id).await { + Ok(()) => return Ok(()), + Err(e) => { + tracing::error!("failed to publish a message: {e}"); + self.publisher = None; + } + } + tracing::info!("Publisher is disconnected. 
Reconnectiong..."); + self.try_connect().await?; + self.publish(cancel_key_data, session_id).await + } + + async fn publish( + &mut self, + cancel_key_data: CancelKeyData, + session_id: Uuid, + ) -> anyhow::Result<()> { + let conn = self + .publisher + .as_mut() + .ok_or_else(|| anyhow::anyhow!("not connected"))?; + let payload = serde_json::to_string(&Notification::Cancel(CancelSession { + region_id: Some(self.region_id.clone()), + cancel_key_data, + session_id, + }))?; + conn.publish(PROXY_CHANNEL_NAME, payload).await?; + Ok(()) + } + pub async fn try_connect(&mut self) -> anyhow::Result<()> { + match self.client.get_async_connection().await { + Ok(conn) => { + self.publisher = Some(conn); + } + Err(e) => { + tracing::error!("failed to connect to redis: {e}"); + return Err(e.into()); + } + } + Ok(()) + } +} diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index a20600b94a..ee3e91495b 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -24,7 +24,7 @@ use crate::metrics::NUM_CLIENT_CONNECTION_GAUGE; use crate::protocol2::{ProxyProtocolAccept, WithClientIp}; use crate::rate_limiter::EndpointRateLimiter; use crate::serverless::backend::PoolingBackend; -use crate::{cancellation::CancelMap, config::ProxyConfig}; +use crate::{cancellation::CancellationHandler, config::ProxyConfig}; use futures::StreamExt; use hyper::{ server::{ @@ -50,6 +50,7 @@ pub async fn task_main( ws_listener: TcpListener, cancellation_token: CancellationToken, endpoint_rate_limiter: Arc, + cancellation_handler: Arc, ) -> anyhow::Result<()> { scopeguard::defer! { info!("websocket server has shut down"); @@ -115,7 +116,7 @@ pub async fn task_main( let backend = backend.clone(); let ws_connections = ws_connections.clone(); let endpoint_rate_limiter = endpoint_rate_limiter.clone(); - + let cancellation_handler = cancellation_handler.clone(); async move { let peer_addr = match client_addr { Some(addr) => addr, @@ -127,9 +128,9 @@ pub async fn task_main( let backend = backend.clone(); let ws_connections = ws_connections.clone(); let endpoint_rate_limiter = endpoint_rate_limiter.clone(); + let cancellation_handler = cancellation_handler.clone(); async move { - let cancel_map = Arc::new(CancelMap::default()); let session_id = uuid::Uuid::new_v4(); request_handler( @@ -137,7 +138,7 @@ pub async fn task_main( config, backend, ws_connections, - cancel_map, + cancellation_handler, session_id, peer_addr.ip(), endpoint_rate_limiter, @@ -205,7 +206,7 @@ async fn request_handler( config: &'static ProxyConfig, backend: Arc, ws_connections: TaskTracker, - cancel_map: Arc, + cancellation_handler: Arc, session_id: uuid::Uuid, peer_addr: IpAddr, endpoint_rate_limiter: Arc, @@ -232,7 +233,7 @@ async fn request_handler( config, ctx, websocket, - cancel_map, + cancellation_handler, host, endpoint_rate_limiter, ) diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index 062dd440b2..24f2bb7e8c 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -1,5 +1,5 @@ use crate::{ - cancellation::CancelMap, + cancellation::CancellationHandler, config::ProxyConfig, context::RequestMonitoring, error::{io_error, ReportableError}, @@ -133,7 +133,7 @@ pub async fn serve_websocket( config: &'static ProxyConfig, mut ctx: RequestMonitoring, websocket: HyperWebsocket, - cancel_map: Arc, + cancellation_handler: Arc, hostname: Option, endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { @@ -141,7 +141,7 @@ pub async fn serve_websocket( let res = handle_client( config, 
&mut ctx, - cancel_map, + cancellation_handler, WebSocketRw::new(websocket), ClientMode::Websockets { hostname }, endpoint_rate_limiter, diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 8e9cc43152..e808fabbe7 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -38,7 +38,7 @@ futures-io = { version = "0.3" } futures-sink = { version = "0.3" } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } getrandom = { version = "0.2", default-features = false, features = ["std"] } -hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", default-features = false, features = ["raw"] } +hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", features = ["raw"] } hashbrown-594e8ee84c453af0 = { package = "hashbrown", version = "0.13", features = ["raw"] } hex = { version = "0.4", features = ["serde"] } hmac = { version = "0.12", default-features = false, features = ["reset"] } @@ -91,7 +91,7 @@ cc = { version = "1", default-features = false, features = ["parallel"] } chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] } either = { version = "1" } getrandom = { version = "0.2", default-features = false, features = ["std"] } -hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", default-features = false, features = ["raw"] } +hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", features = ["raw"] } indexmap = { version = "1", default-features = false, features = ["std"] } itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits", "use_std"] } From 7fa732c96c6382fd0468991b40f922348e653d3c Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 13 Feb 2024 18:46:25 +0100 Subject: [PATCH 0166/1571] refactor(virtual_file): take owned buffer in VirtualFile::write_all (#6664) Building atop #6660 , this PR converts VirtualFile::write_all to owned buffers. 
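For callers, the change means the buffer is moved into the call and handed back next to the result instead of being borrowed for the duration of the write. A minimal sketch of that calling convention (illustrative only — `FakeFile` and `demo` are stand-ins, not the pageserver's `VirtualFile` API; the `futures` crate is assumed just to drive the example):

```rust
struct FakeFile {
    contents: Vec<u8>,
}

impl FakeFile {
    // Owned-buffer style: the buffer is moved in and returned with the result,
    // mirroring the `(B::Buf, Result<usize>)` shape described above.
    async fn write_all(&mut self, buf: Vec<u8>) -> (Vec<u8>, std::io::Result<usize>) {
        self.contents.extend_from_slice(&buf);
        let n = buf.len();
        (buf, Ok(n))
    }
}

async fn demo() -> std::io::Result<()> {
    let mut file = FakeFile { contents: Vec::new() };
    let buf = b"some bytes".to_vec();
    // Before: `file.write_all(&buf).await?;`
    // After: unpack the returned buffer and the result separately.
    let (buf, res) = file.write_all(buf).await;
    let _nwritten = res?;
    // The allocation is still owned by the caller and can be reused.
    let (_buf, res) = file.write_all(buf).await;
    res?;
    Ok(())
}

fn main() -> std::io::Result<()> {
    futures::executor::block_on(demo())
}
```

Handing ownership back this way lets the same allocation be reused across writes and, with an io_uring-style engine, lets the I/O layer own the buffer while the operation is in flight.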
Part of https://github.com/neondatabase/neon/issues/6663 --- pageserver/src/deletion_queue.rs | 4 +- pageserver/src/tenant.rs | 4 +- pageserver/src/tenant/blob_io.rs | 26 ++++---- pageserver/src/tenant/metadata.rs | 2 +- pageserver/src/tenant/secondary/downloader.rs | 2 +- .../src/tenant/storage_layer/delta_layer.rs | 30 +++------ .../src/tenant/storage_layer/image_layer.rs | 30 +++------ pageserver/src/virtual_file.rs | 66 ++++++++++++------- 8 files changed, 81 insertions(+), 83 deletions(-) diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index da1da9331a..9046fe881b 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -234,7 +234,7 @@ impl DeletionHeader { let header_bytes = serde_json::to_vec(self).context("serialize deletion header")?; let header_path = conf.deletion_header_path(); let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX); - VirtualFile::crashsafe_overwrite(&header_path, &temp_path, &header_bytes) + VirtualFile::crashsafe_overwrite(&header_path, &temp_path, header_bytes) .await .maybe_fatal_err("save deletion header")?; @@ -325,7 +325,7 @@ impl DeletionList { let temp_path = path_with_suffix_extension(&path, TEMP_SUFFIX); let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list"); - VirtualFile::crashsafe_overwrite(&path, &temp_path, &bytes) + VirtualFile::crashsafe_overwrite(&path, &temp_path, bytes) .await .maybe_fatal_err("save deletion list") .map_err(Into::into) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index d946c57118..9f1f188bf2 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2880,7 +2880,7 @@ impl Tenant { let config_path = config_path.to_owned(); tokio::task::spawn_blocking(move || { Handle::current().block_on(async move { - let conf_content = conf_content.as_bytes(); + let conf_content = conf_content.into_bytes(); VirtualFile::crashsafe_overwrite(&config_path, &temp_path, conf_content) .await .with_context(|| { @@ -2917,7 +2917,7 @@ impl Tenant { let target_config_path = target_config_path.to_owned(); tokio::task::spawn_blocking(move || { Handle::current().block_on(async move { - let conf_content = conf_content.as_bytes(); + let conf_content = conf_content.into_bytes(); VirtualFile::crashsafe_overwrite(&target_config_path, &temp_path, conf_content) .await .with_context(|| { diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index e2ff12665a..ec70bdc679 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -131,27 +131,23 @@ impl BlobWriter { &mut self, src_buf: B, ) -> (B::Buf, Result<(), Error>) { - let src_buf_len = src_buf.bytes_init(); - let (src_buf, res) = if src_buf_len > 0 { - let src_buf = src_buf.slice(0..src_buf_len); - let res = self.inner.write_all(&src_buf).await; - let src_buf = Slice::into_inner(src_buf); - (src_buf, res) - } else { - let res = self.inner.write_all(&[]).await; - (Slice::into_inner(src_buf.slice_full()), res) + let (src_buf, res) = self.inner.write_all(src_buf).await; + let nbytes = match res { + Ok(nbytes) => nbytes, + Err(e) => return (src_buf, Err(e)), }; - if let Ok(()) = &res { - self.offset += src_buf_len as u64; - } - (src_buf, res) + self.offset += nbytes as u64; + (src_buf, Ok(())) } #[inline(always)] /// Flushes the internal buffer to the underlying `VirtualFile`. 
pub async fn flush_buffer(&mut self) -> Result<(), Error> { - self.inner.write_all(&self.buf).await?; - self.buf.clear(); + let buf = std::mem::take(&mut self.buf); + let (mut buf, res) = self.inner.write_all(buf).await; + res?; + buf.clear(); + self.buf = buf; Ok(()) } diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 6fb86c65e2..dcbe781f90 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -279,7 +279,7 @@ pub async fn save_metadata( let path = conf.metadata_path(tenant_shard_id, timeline_id); let temp_path = path_with_suffix_extension(&path, TEMP_FILE_SUFFIX); let metadata_bytes = data.to_bytes().context("serialize metadata")?; - VirtualFile::crashsafe_overwrite(&path, &temp_path, &metadata_bytes) + VirtualFile::crashsafe_overwrite(&path, &temp_path, metadata_bytes) .await .context("write metadata")?; Ok(()) diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 0666e104f8..c23416a7f0 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -486,7 +486,7 @@ impl<'a> TenantDownloader<'a> { let heatmap_path_bg = heatmap_path.clone(); tokio::task::spawn_blocking(move || { tokio::runtime::Handle::current().block_on(async move { - VirtualFile::crashsafe_overwrite(&heatmap_path_bg, &temp_path, &heatmap_bytes).await + VirtualFile::crashsafe_overwrite(&heatmap_path_bg, &temp_path, heatmap_bytes).await }) }) .await diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 7a5dc7a59f..9a7bcbcebe 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -461,7 +461,8 @@ impl DeltaLayerWriterInner { file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64)) .await?; for buf in block_buf.blocks { - file.write_all(buf.as_ref()).await?; + let (_buf, res) = file.write_all(buf).await; + res?; } assert!(self.lsn_range.start < self.lsn_range.end); // Fill in the summary on blk 0 @@ -476,17 +477,12 @@ impl DeltaLayerWriterInner { index_root_blk, }; - let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new(); + let mut buf = Vec::with_capacity(PAGE_SZ); + // TODO: could use smallvec here but it's a pain with Slice Summary::ser_into(&summary, &mut buf)?; - if buf.spilled() { - // This is bad as we only have one free block for the summary - warn!( - "Used more than one page size for summary buffer: {}", - buf.len() - ); - } file.seek(SeekFrom::Start(0)).await?; - file.write_all(&buf).await?; + let (_buf, res) = file.write_all(buf).await; + res?; let metadata = file .metadata() @@ -679,18 +675,12 @@ impl DeltaLayer { let new_summary = rewrite(actual_summary); - let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new(); + let mut buf = Vec::with_capacity(PAGE_SZ); + // TODO: could use smallvec here, but it's a pain with Slice Summary::ser_into(&new_summary, &mut buf).context("serialize")?; - if buf.spilled() { - // The code in DeltaLayerWriterInner just warn!()s for this. - // It should probably error out as well. 
- return Err(RewriteSummaryError::Other(anyhow::anyhow!( - "Used more than one page size for summary buffer: {}", - buf.len() - ))); - } file.seek(SeekFrom::Start(0)).await?; - file.write_all(&buf).await?; + let (_buf, res) = file.write_all(buf).await; + res?; Ok(()) } } diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 1ad195032d..458131b572 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -341,18 +341,12 @@ impl ImageLayer { let new_summary = rewrite(actual_summary); - let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new(); + let mut buf = Vec::with_capacity(PAGE_SZ); + // TODO: could use smallvec here but it's a pain with Slice Summary::ser_into(&new_summary, &mut buf).context("serialize")?; - if buf.spilled() { - // The code in ImageLayerWriterInner just warn!()s for this. - // It should probably error out as well. - return Err(RewriteSummaryError::Other(anyhow::anyhow!( - "Used more than one page size for summary buffer: {}", - buf.len() - ))); - } file.seek(SeekFrom::Start(0)).await?; - file.write_all(&buf).await?; + let (_buf, res) = file.write_all(buf).await; + res?; Ok(()) } } @@ -555,7 +549,8 @@ impl ImageLayerWriterInner { .await?; let (index_root_blk, block_buf) = self.tree.finish()?; for buf in block_buf.blocks { - file.write_all(buf.as_ref()).await?; + let (_buf, res) = file.write_all(buf).await; + res?; } // Fill in the summary on blk 0 @@ -570,17 +565,12 @@ impl ImageLayerWriterInner { index_root_blk, }; - let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new(); + let mut buf = Vec::with_capacity(PAGE_SZ); + // TODO: could use smallvec here but it's a pain with Slice Summary::ser_into(&summary, &mut buf)?; - if buf.spilled() { - // This is bad as we only have one free block for the summary - warn!( - "Used more than one page size for summary buffer: {}", - buf.len() - ); - } file.seek(SeekFrom::Start(0)).await?; - file.write_all(&buf).await?; + let (_buf, res) = file.write_all(buf).await; + res?; let metadata = file .metadata() diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 059a6596d3..6cff748d42 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -19,7 +19,7 @@ use once_cell::sync::OnceCell; use pageserver_api::shard::TenantShardId; use std::fs::{self, File}; use std::io::{Error, ErrorKind, Seek, SeekFrom}; -use tokio_epoll_uring::IoBufMut; +use tokio_epoll_uring::{BoundedBuf, IoBufMut, Slice}; use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd}; use std::os::unix::fs::FileExt; @@ -410,10 +410,10 @@ impl VirtualFile { /// step, the tmp path is renamed to the final path. As renames are /// atomic, a crash during the write operation will never leave behind a /// partially written file. - pub async fn crashsafe_overwrite( + pub async fn crashsafe_overwrite( final_path: &Utf8Path, tmp_path: &Utf8Path, - content: &[u8], + content: B, ) -> std::io::Result<()> { let Some(final_path_parent) = final_path.parent() else { return Err(std::io::Error::from_raw_os_error( @@ -430,7 +430,8 @@ impl VirtualFile { .create_new(true), ) .await?; - file.write_all(content).await?; + let (_content, res) = file.write_all(content).await; + res?; file.sync_all().await?; drop(file); // before the rename, that's important! 
// renames are atomic @@ -601,23 +602,36 @@ impl VirtualFile { Ok(()) } - pub async fn write_all(&mut self, mut buf: &[u8]) -> Result<(), Error> { + /// Writes `buf.slice(0..buf.bytes_init())`. + /// Returns the IoBuf that is underlying the BoundedBuf `buf`. + /// I.e., the returned value's `bytes_init()` method returns something different than the `bytes_init()` that was passed in. + /// It's quite brittle and easy to mis-use, so, we return the size in the Ok() variant. + pub async fn write_all(&mut self, buf: B) -> (B::Buf, Result) { + let nbytes = buf.bytes_init(); + if nbytes == 0 { + return (Slice::into_inner(buf.slice_full()), Ok(0)); + } + let mut buf = buf.slice(0..nbytes); while !buf.is_empty() { - match self.write(buf).await { + // TODO: push `Slice` further down + match self.write(&buf).await { Ok(0) => { - return Err(Error::new( - std::io::ErrorKind::WriteZero, - "failed to write whole buffer", - )); + return ( + Slice::into_inner(buf), + Err(Error::new( + std::io::ErrorKind::WriteZero, + "failed to write whole buffer", + )), + ); } Ok(n) => { - buf = &buf[n..]; + buf = buf.slice(n..); } Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} - Err(e) => return Err(e), + Err(e) => return (Slice::into_inner(buf), Err(e)), } } - Ok(()) + (Slice::into_inner(buf), Ok(nbytes)) } async fn write(&mut self, buf: &[u8]) -> Result { @@ -676,7 +690,6 @@ where F: FnMut(tokio_epoll_uring::Slice, u64) -> Fut, Fut: std::future::Future, std::io::Result)>, { - use tokio_epoll_uring::BoundedBuf; let mut buf: tokio_epoll_uring::Slice = buf.slice_full(); // includes all the uninitialized memory while buf.bytes_total() != 0 { let res; @@ -1063,10 +1076,19 @@ mod tests { MaybeVirtualFile::File(file) => file.seek(pos), } } - async fn write_all(&mut self, buf: &[u8]) -> Result<(), Error> { + async fn write_all(&mut self, buf: B) -> Result<(), Error> { match self { - MaybeVirtualFile::VirtualFile(file) => file.write_all(buf).await, - MaybeVirtualFile::File(file) => file.write_all(buf), + MaybeVirtualFile::VirtualFile(file) => { + let (_buf, res) = file.write_all(buf).await; + res.map(|_| ()) + } + MaybeVirtualFile::File(file) => { + let buf_len = buf.bytes_init(); + if buf_len == 0 { + return Ok(()); + } + file.write_all(&buf.slice(0..buf_len)) + } } } @@ -1141,7 +1163,7 @@ mod tests { .to_owned(), ) .await?; - file_a.write_all(b"foobar").await?; + file_a.write_all(b"foobar".to_vec()).await?; // cannot read from a file opened in write-only mode let _ = file_a.read_string().await.unwrap_err(); @@ -1150,7 +1172,7 @@ mod tests { let mut file_a = openfunc(path_a, OpenOptions::new().read(true).to_owned()).await?; // cannot write to a file opened in read-only mode - let _ = file_a.write_all(b"bar").await.unwrap_err(); + let _ = file_a.write_all(b"bar".to_vec()).await.unwrap_err(); // Try simple read assert_eq!("foobar", file_a.read_string().await?); @@ -1293,7 +1315,7 @@ mod tests { let path = testdir.join("myfile"); let tmp_path = testdir.join("myfile.tmp"); - VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo") + VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo".to_vec()) .await .unwrap(); let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap()); @@ -1302,7 +1324,7 @@ mod tests { assert!(!tmp_path.exists()); drop(file); - VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"bar") + VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"bar".to_vec()) .await .unwrap(); let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap()); @@ -1324,7 +1346,7 
@@ mod tests { std::fs::write(&tmp_path, "some preexisting junk that should be removed").unwrap(); assert!(tmp_path.exists()); - VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo") + VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo".to_vec()) .await .unwrap(); From b6e070bf85c6f4fa204d36ae2d761db30b47d277 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 13 Feb 2024 20:41:17 +0200 Subject: [PATCH 0167/1571] Do not perform fast exit for catalog pages in redo filter (#6730) ## Problem See https://github.com/neondatabase/neon/issues/6674 Current implementation of `neon_redo_read_buffer_filter` performs fast exist for catalog pages: ``` /* * Out of an abundance of caution, we always run redo on shared catalogs, * regardless of whether the block is stored in shared buffers. See also * this function's top comment. */ if (!OidIsValid(NInfoGetDbOid(rinfo))) return false; */ as a result last written lsn and relation size for FSM fork are not correctly updated for catalog relations. ## Summary of changes Do not perform fast path return for catalog relations. ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist Co-authored-by: Konstantin Knizhnik --- pgxn/neon/pagestore_smgr.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 63e8b8dc1f..213e396328 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -3079,14 +3079,6 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) XLogRecGetBlockTag(record, block_id, &rinfo, &forknum, &blkno); #endif - /* - * Out of an abundance of caution, we always run redo on shared catalogs, - * regardless of whether the block is stored in shared buffers. See also - * this function's top comment. - */ - if (!OidIsValid(NInfoGetDbOid(rinfo))) - return false; - CopyNRelFileInfoToBufTag(tag, rinfo); tag.forkNum = forknum; tag.blockNum = blkno; @@ -3100,17 +3092,28 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) */ LWLockAcquire(partitionLock, LW_SHARED); - /* Try to find the relevant buffer */ - buffer = BufTableLookup(&tag, hash); - - no_redo_needed = buffer < 0; + /* + * Out of an abundance of caution, we always run redo on shared catalogs, + * regardless of whether the block is stored in shared buffers. See also + * this function's top comment. 
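+ * Falling through here instead of returning early ensures the
+ * last-written LSN (and related bookkeeping) below is still updated
+ * for catalog relations.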
+ */ + if (!OidIsValid(NInfoGetDbOid(rinfo))) + { + no_redo_needed = false; + } + else + { + /* Try to find the relevant buffer */ + buffer = BufTableLookup(&tag, hash); + no_redo_needed = buffer < 0; + } /* In both cases st lwlsn past this WAL record */ SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno); /* * we don't have the buffer in memory, update lwLsn past this record, also - * evict page fro file cache + * evict page from file cache */ if (no_redo_needed) lfc_evict(rinfo, forknum, blkno); From ee7bbdda0e58af4350a6886544cd75f3cc1b2de9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 14 Feb 2024 02:12:00 +0100 Subject: [PATCH 0168/1571] Create new metric for directory counts (#6736) There is O(n^2) issues due to how we store these directories (#6626), so it's good to keep an eye on them and ensure the numbers stay low. The new per-timeline metric `pageserver_directory_entries_count` isn't perfect, namely we don't calculate it every time we attach the timeline, but only if there is an actual change. Also, it is a collective metric over multiple scalars. Lastly, we only emit the metric if it is above a certain threshold. However, the metric still give a feel for the general size of the timeline. We care less for small values as the metric is mainly there to detect and track tenants with large directory counts. We also expose the directory counts in `TimelineInfo` so that one can get the detailed size distribution directly via the pageserver's API. Related: #6642 , https://github.com/neondatabase/cloud/issues/10273 --- libs/pageserver_api/src/models.rs | 2 + libs/pageserver_api/src/reltag.rs | 1 + pageserver/src/http/routes.rs | 1 + pageserver/src/metrics.rs | 34 +++++++++++++++- pageserver/src/pgdatadir_mapping.rs | 62 +++++++++++++++++++++++++++++ pageserver/src/tenant/timeline.rs | 39 +++++++++++++++++- test_runner/fixtures/metrics.py | 1 + 7 files changed, 137 insertions(+), 3 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 46324efd43..1226eaa312 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -494,6 +494,8 @@ pub struct TimelineInfo { pub current_logical_size: u64, pub current_logical_size_is_accurate: bool, + pub directory_entries_counts: Vec, + /// Sum of the size of all layer files. /// If a layer is present in both local FS and S3, it counts only once. 
pub current_physical_size: Option, // is None when timeline is Unloaded diff --git a/libs/pageserver_api/src/reltag.rs b/libs/pageserver_api/src/reltag.rs index 8eb848a514..38693ab847 100644 --- a/libs/pageserver_api/src/reltag.rs +++ b/libs/pageserver_api/src/reltag.rs @@ -124,6 +124,7 @@ impl RelTag { Ord, strum_macros::EnumIter, strum_macros::FromRepr, + enum_map::Enum, )] #[repr(u8)] pub enum SlruKind { diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 4be8ee9892..c354cc9ab6 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -422,6 +422,7 @@ async fn build_timeline_info_common( tenant::timeline::logical_size::Accuracy::Approximate => false, tenant::timeline::logical_size::Accuracy::Exact => true, }, + directory_entries_counts: timeline.get_directory_metrics().to_vec(), current_physical_size, current_logical_size_non_incremental: None, timeline_dir_layer_file_size_sum: None, diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 98c98ef6e7..c2b1eafc3a 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -602,6 +602,15 @@ pub(crate) mod initial_logical_size { }); } +static DIRECTORY_ENTRIES_COUNT: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_directory_entries_count", + "Sum of the entries in pageserver-stored directory listings", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + pub(crate) static TENANT_STATE_METRIC: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_tenant_states_count", @@ -1809,6 +1818,7 @@ pub(crate) struct TimelineMetrics { resident_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size pub current_logical_size_gauge: UIntGauge, + pub directory_entries_count_gauge: Lazy UIntGauge>>, pub num_persistent_files_created: IntCounter, pub persistent_bytes_written: IntCounter, pub evictions: IntCounter, @@ -1818,12 +1828,12 @@ pub(crate) struct TimelineMetrics { impl TimelineMetrics { pub fn new( tenant_shard_id: &TenantShardId, - timeline_id: &TimelineId, + timeline_id_raw: &TimelineId, evictions_with_low_residence_duration_builder: EvictionsWithLowResidenceDurationBuilder, ) -> Self { let tenant_id = tenant_shard_id.tenant_id.to_string(); let shard_id = format!("{}", tenant_shard_id.shard_slug()); - let timeline_id = timeline_id.to_string(); + let timeline_id = timeline_id_raw.to_string(); let flush_time_histo = StorageTimeMetrics::new( StorageTimeOperation::LayerFlush, &tenant_id, @@ -1876,6 +1886,22 @@ impl TimelineMetrics { let current_logical_size_gauge = CURRENT_LOGICAL_SIZE .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); + // TODO use impl Trait syntax here once we have ability to use it: https://github.com/rust-lang/rust/issues/63065 + let directory_entries_count_gauge_closure = { + let tenant_shard_id = *tenant_shard_id; + let timeline_id_raw = *timeline_id_raw; + move || { + let tenant_id = tenant_shard_id.tenant_id.to_string(); + let shard_id = format!("{}", tenant_shard_id.shard_slug()); + let timeline_id = timeline_id_raw.to_string(); + let gauge: UIntGauge = DIRECTORY_ENTRIES_COUNT + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); + gauge + } + }; + let directory_entries_count_gauge: Lazy UIntGauge>> = + Lazy::new(Box::new(directory_entries_count_gauge_closure)); let num_persistent_files_created = NUM_PERSISTENT_FILES_CREATED .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) 
.unwrap(); @@ -1902,6 +1928,7 @@ impl TimelineMetrics { last_record_gauge, resident_physical_size_gauge, current_logical_size_gauge, + directory_entries_count_gauge, num_persistent_files_created, persistent_bytes_written, evictions, @@ -1944,6 +1971,9 @@ impl Drop for TimelineMetrics { RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]); } let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]); + if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) { + let _ = metric.remove_label_values(&[tenant_id, &shard_id, timeline_id]); + } let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, &shard_id, timeline_id]); let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, &shard_id, timeline_id]); diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index f1d18c0146..5f80ea9b5e 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -14,6 +14,7 @@ use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_i use crate::walrecord::NeonWalRecord; use anyhow::{ensure, Context}; use bytes::{Buf, Bytes, BytesMut}; +use enum_map::Enum; use pageserver_api::key::{ dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key, @@ -155,6 +156,7 @@ impl Timeline { pending_updates: HashMap::new(), pending_deletions: Vec::new(), pending_nblocks: 0, + pending_directory_entries: Vec::new(), lsn, } } @@ -868,6 +870,7 @@ pub struct DatadirModification<'a> { pending_updates: HashMap>, pending_deletions: Vec<(Range, Lsn)>, pending_nblocks: i64, + pending_directory_entries: Vec<(DirectoryKind, usize)>, } impl<'a> DatadirModification<'a> { @@ -899,6 +902,7 @@ impl<'a> DatadirModification<'a> { let buf = DbDirectory::ser(&DbDirectory { dbdirs: HashMap::new(), })?; + self.pending_directory_entries.push((DirectoryKind::Db, 0)); self.put(DBDIR_KEY, Value::Image(buf.into())); // Create AuxFilesDirectory @@ -907,16 +911,24 @@ impl<'a> DatadirModification<'a> { let buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory { xids: HashSet::new(), })?; + self.pending_directory_entries + .push((DirectoryKind::TwoPhase, 0)); self.put(TWOPHASEDIR_KEY, Value::Image(buf.into())); let buf: Bytes = SlruSegmentDirectory::ser(&SlruSegmentDirectory::default())?.into(); let empty_dir = Value::Image(buf); self.put(slru_dir_to_key(SlruKind::Clog), empty_dir.clone()); + self.pending_directory_entries + .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0)); self.put( slru_dir_to_key(SlruKind::MultiXactMembers), empty_dir.clone(), ); + self.pending_directory_entries + .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0)); self.put(slru_dir_to_key(SlruKind::MultiXactOffsets), empty_dir); + self.pending_directory_entries + .push((DirectoryKind::SlruSegment(SlruKind::MultiXactOffsets), 0)); Ok(()) } @@ -1017,6 +1029,7 @@ impl<'a> DatadirModification<'a> { let buf = RelDirectory::ser(&RelDirectory { rels: HashSet::new(), })?; + self.pending_directory_entries.push((DirectoryKind::Rel, 0)); self.put( rel_dir_to_key(spcnode, dbnode), Value::Image(Bytes::from(buf)), @@ -1039,6 +1052,8 @@ impl<'a> DatadirModification<'a> { if !dir.xids.insert(xid) { anyhow::bail!("twophase file for xid {} already exists", xid); } + self.pending_directory_entries + .push((DirectoryKind::TwoPhase, dir.xids.len())); self.put( TWOPHASEDIR_KEY, 
Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)), @@ -1074,6 +1089,8 @@ impl<'a> DatadirModification<'a> { let mut dir = DbDirectory::des(&buf)?; if dir.dbdirs.remove(&(spcnode, dbnode)).is_some() { let buf = DbDirectory::ser(&dir)?; + self.pending_directory_entries + .push((DirectoryKind::Db, dir.dbdirs.len())); self.put(DBDIR_KEY, Value::Image(buf.into())); } else { warn!( @@ -1111,6 +1128,8 @@ impl<'a> DatadirModification<'a> { // Didn't exist. Update dbdir dbdir.dbdirs.insert((rel.spcnode, rel.dbnode), false); let buf = DbDirectory::ser(&dbdir).context("serialize db")?; + self.pending_directory_entries + .push((DirectoryKind::Db, dbdir.dbdirs.len())); self.put(DBDIR_KEY, Value::Image(buf.into())); // and create the RelDirectory @@ -1125,6 +1144,10 @@ impl<'a> DatadirModification<'a> { if !rel_dir.rels.insert((rel.relnode, rel.forknum)) { return Err(RelationError::AlreadyExists); } + + self.pending_directory_entries + .push((DirectoryKind::Rel, rel_dir.rels.len())); + self.put( rel_dir_key, Value::Image(Bytes::from( @@ -1216,6 +1239,9 @@ impl<'a> DatadirModification<'a> { let buf = self.get(dir_key, ctx).await?; let mut dir = RelDirectory::des(&buf)?; + self.pending_directory_entries + .push((DirectoryKind::Rel, dir.rels.len())); + if dir.rels.remove(&(rel.relnode, rel.forknum)) { self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?))); } else { @@ -1251,6 +1277,8 @@ impl<'a> DatadirModification<'a> { if !dir.segments.insert(segno) { anyhow::bail!("slru segment {kind:?}/{segno} already exists"); } + self.pending_directory_entries + .push((DirectoryKind::SlruSegment(kind), dir.segments.len())); self.put( dir_key, Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)), @@ -1295,6 +1323,8 @@ impl<'a> DatadirModification<'a> { if !dir.segments.remove(&segno) { warn!("slru segment {:?}/{} does not exist", kind, segno); } + self.pending_directory_entries + .push((DirectoryKind::SlruSegment(kind), dir.segments.len())); self.put( dir_key, Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)), @@ -1325,6 +1355,8 @@ impl<'a> DatadirModification<'a> { if !dir.xids.remove(&xid) { warn!("twophase file for xid {} does not exist", xid); } + self.pending_directory_entries + .push((DirectoryKind::TwoPhase, dir.xids.len())); self.put( TWOPHASEDIR_KEY, Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)), @@ -1340,6 +1372,8 @@ impl<'a> DatadirModification<'a> { let buf = AuxFilesDirectory::ser(&AuxFilesDirectory { files: HashMap::new(), })?; + self.pending_directory_entries + .push((DirectoryKind::AuxFiles, 0)); self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf))); Ok(()) } @@ -1366,6 +1400,9 @@ impl<'a> DatadirModification<'a> { } else { dir.files.insert(path, Bytes::copy_from_slice(content)); } + self.pending_directory_entries + .push((DirectoryKind::AuxFiles, dir.files.len())); + self.put( AUX_FILES_KEY, Value::Image(Bytes::from( @@ -1427,6 +1464,10 @@ impl<'a> DatadirModification<'a> { self.pending_nblocks = 0; } + for (kind, count) in std::mem::take(&mut self.pending_directory_entries) { + writer.update_directory_entries_count(kind, count as u64); + } + Ok(()) } @@ -1464,6 +1505,10 @@ impl<'a> DatadirModification<'a> { writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ)); } + for (kind, count) in std::mem::take(&mut self.pending_directory_entries) { + writer.update_directory_entries_count(kind, count as u64); + } + Ok(()) } @@ -1588,6 +1633,23 @@ struct SlruSegmentDirectory { segments: HashSet, } +#[derive(Copy, Clone, PartialEq, Eq, Debug, 
enum_map::Enum)] +#[repr(u8)] +pub(crate) enum DirectoryKind { + Db, + TwoPhase, + Rel, + AuxFiles, + SlruSegment(SlruKind), +} + +impl DirectoryKind { + pub(crate) const KINDS_NUM: usize = ::LENGTH; + pub(crate) fn offset(&self) -> usize { + self.into_usize() + } +} + static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]); #[allow(clippy::bool_assert_comparison)] diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 625be7a644..87cf0ac6ea 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -14,6 +14,7 @@ use enumset::EnumSet; use fail::fail_point; use futures::stream::StreamExt; use itertools::Itertools; +use once_cell::sync::Lazy; use pageserver_api::{ keyspace::{key_range_size, KeySpaceAccum}, models::{ @@ -34,17 +35,22 @@ use tokio_util::sync::CancellationToken; use tracing::*; use utils::sync::gate::Gate; -use std::collections::{BTreeMap, BinaryHeap, HashMap, HashSet}; use std::ops::{Deref, Range}; use std::pin::pin; use std::sync::atomic::Ordering as AtomicOrdering; use std::sync::{Arc, Mutex, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; +use std::{ + array, + collections::{BTreeMap, BinaryHeap, HashMap, HashSet}, + sync::atomic::AtomicU64, +}; use std::{ cmp::{max, min, Ordering}, ops::ControlFlow, }; +use crate::pgdatadir_mapping::DirectoryKind; use crate::tenant::timeline::logical_size::CurrentLogicalSize; use crate::tenant::{ layer_map::{LayerMap, SearchResult}, @@ -258,6 +264,8 @@ pub struct Timeline { // in `crate::page_service` writes these metrics. pub(crate) query_metrics: crate::metrics::SmgrQueryTimePerTimeline, + directory_metrics: [AtomicU64; DirectoryKind::KINDS_NUM], + /// Ensures layers aren't frozen by checkpointer between /// [`Timeline::get_layer_for_write`] and layer reads. /// Locked automatically by [`TimelineWriter`] and checkpointer. @@ -790,6 +798,10 @@ impl Timeline { self.metrics.resident_physical_size_get() } + pub(crate) fn get_directory_metrics(&self) -> [u64; DirectoryKind::KINDS_NUM] { + array::from_fn(|idx| self.directory_metrics[idx].load(AtomicOrdering::Relaxed)) + } + /// /// Wait until WAL has been received and processed up to this LSN. /// @@ -1496,6 +1508,8 @@ impl Timeline { &timeline_id, ), + directory_metrics: array::from_fn(|_| AtomicU64::new(0)), + flush_loop_state: Mutex::new(FlushLoopState::NotStarted), layer_flush_start_tx, @@ -2264,6 +2278,29 @@ impl Timeline { } } + pub(crate) fn update_directory_entries_count(&self, kind: DirectoryKind, count: u64) { + self.directory_metrics[kind.offset()].store(count, AtomicOrdering::Relaxed); + let aux_metric = + self.directory_metrics[DirectoryKind::AuxFiles.offset()].load(AtomicOrdering::Relaxed); + + let sum_of_entries = self + .directory_metrics + .iter() + .map(|v| v.load(AtomicOrdering::Relaxed)) + .sum(); + // Set a high general threshold and a lower threshold for the auxiliary files, + // as we can have large numbers of relations in the db directory. 
+ const SUM_THRESHOLD: u64 = 5000; + const AUX_THRESHOLD: u64 = 1000; + if sum_of_entries >= SUM_THRESHOLD || aux_metric >= AUX_THRESHOLD { + self.metrics + .directory_entries_count_gauge + .set(sum_of_entries); + } else if let Some(metric) = Lazy::get(&self.metrics.directory_entries_count_gauge) { + metric.set(sum_of_entries); + } + } + async fn find_layer(&self, layer_file_name: &str) -> Option { let guard = self.layers.read().await; for historic_layer in guard.layer_map().iter_historic_layers() { diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index ef41774289..418370c3ab 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -96,5 +96,6 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( "pageserver_evictions_total", "pageserver_evictions_with_low_residence_duration_total", *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, + # "pageserver_directory_entries_count", -- only used if above a certain threshold # "pageserver_broken_tenants_count" -- used only for broken ) From a5114a99b275b52fc7a512e62a7f80a5a103433d Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 14 Feb 2024 10:34:58 +0200 Subject: [PATCH 0169/1571] Create a symlink from pg_dynshmem to /dev/shm See included comment and issue https://github.com/neondatabase/autoscaling/issues/800 for details. This has no effect, unless you set "dynamic_shared_memory_type = mmap" in postgresql.conf. --- compute_tools/src/compute.rs | 44 +++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 993b5725a4..83db8e09ec 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -2,7 +2,7 @@ use std::collections::HashMap; use std::env; use std::fs; use std::io::BufRead; -use std::os::unix::fs::PermissionsExt; +use std::os::unix::fs::{symlink, PermissionsExt}; use std::path::Path; use std::process::{Command, Stdio}; use std::str::FromStr; @@ -634,6 +634,48 @@ impl ComputeNode { // Update pg_hba.conf received with basebackup. update_pg_hba(pgdata_path)?; + // Place pg_dynshmem under /dev/shm. This allows us to use + // 'dynamic_shared_memory_type = mmap' so that the files are placed in + // /dev/shm, similar to how 'dynamic_shared_memory_type = posix' works. + // + // Why on earth don't we just stick to the 'posix' default, you might + // ask. It turns out that making large allocations with 'posix' doesn't + // work very well with autoscaling. The behavior we want is that: + // + // 1. You can make large DSM allocations, larger than the current RAM + // size of the VM, without errors + // + // 2. If the allocated memory is really used, the VM is scaled up + // automatically to accommodate that + // + // We try to make that possible by having swap in the VM. But with the + // default 'posix' DSM implementation, we fail step 1, even when there's + // plenty of swap available. PostgreSQL uses posix_fallocate() to create + // the shmem segment, which is really just a file in /dev/shm in Linux, + // but posix_fallocate() on tmpfs returns ENOMEM if the size is larger + // than available RAM. + // + // Using 'dynamic_shared_memory_type = mmap' works around that, because + // the Postgres 'mmap' DSM implementation doesn't use + // posix_fallocate(). Instead, it uses repeated calls to write(2) to + // fill the file with zeros. It's weird that that differs between + // 'posix' and 'mmap', but we take advantage of it. 
When the file is + // filled slowly with write(2), the kernel allows it to grow larger, as + // long as there's swap available. + // + // In short, using 'dynamic_shared_memory_type = mmap' allows us one DSM + // segment to be larger than currently available RAM. But because we + // don't want to store it on a real file, which the kernel would try to + // flush to disk, so symlink pg_dynshm to /dev/shm. + // + // We don't set 'dynamic_shared_memory_type = mmap' here, we let the + // control plane control that option. If 'mmap' is not used, this + // symlink doesn't affect anything. + // + // See https://github.com/neondatabase/autoscaling/issues/800 + std::fs::remove_dir(pgdata_path.join("pg_dynshmem"))?; + symlink("/dev/shm/", pgdata_path.join("pg_dynshmem"))?; + match spec.mode { ComputeMode::Primary => {} ComputeMode::Replica | ComputeMode::Static(..) => { From a97b54e3b9e692532962d65b89b7e5f67a9c28a4 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 14 Feb 2024 10:35:59 +0200 Subject: [PATCH 0170/1571] Cherry-pick Postgres bugfix to 'mmap' DSM implementation Cherry-pick Upstream commit fbf9a7ac4d to neon stable branches. We'll get it in the next PostgreSQL minor release anyway, but we need it now, if we want to start using the 'mmap' implementation. See https://github.com/neondatabase/autoscaling/issues/800 for the plans on doing that. --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 018fb05201..9dd9956c55 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 018fb052011081dc2733d3118d12e5c36df6eba1 +Subproject commit 9dd9956c55ffbbd9abe77d10382453757fedfcf5 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 6ee78a3c29..ca2def9993 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 6ee78a3c29e33cafd85ba09568b6b5eb031d29b9 +Subproject commit ca2def999368d9df098a637234ad5a9003189463 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 550cdd26d4..9c37a49884 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 550cdd26d445afdd26b15aa93c8c2f3dc52f8361 +Subproject commit 9c37a4988463a97d9cacb321acf3828b09823269 diff --git a/vendor/revisions.json b/vendor/revisions.json index 91ebb8cb34..72bc0d7e0d 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "postgres-v16": "550cdd26d445afdd26b15aa93c8c2f3dc52f8361", - "postgres-v15": "6ee78a3c29e33cafd85ba09568b6b5eb031d29b9", - "postgres-v14": "018fb052011081dc2733d3118d12e5c36df6eba1" + "postgres-v16": "9c37a4988463a97d9cacb321acf3828b09823269", + "postgres-v15": "ca2def999368d9df098a637234ad5a9003189463", + "postgres-v14": "9dd9956c55ffbbd9abe77d10382453757fedfcf5" } From a9ec4eb4fc7777a529ff8c5ede814dd657390e58 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 14 Feb 2024 10:26:32 +0000 Subject: [PATCH 0171/1571] hold cancel session (#6750) ## Problem In a recent refactor, we accidentally dropped the cancel session early ## Summary of changes Hold the cancel session during proxy passthrough --- proxy/src/proxy.rs | 1 + proxy/src/proxy/passthrough.rs | 2 ++ 2 files changed, 3 insertions(+) diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index ce77098a5f..8a9445303a 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -331,6 +331,7 @@ pub async fn handle_client( compute: node, req: 
_request_gauge, conn: _client_gauge, + cancel: session, })) } diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index c98f68d8d1..73c170fc0b 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -1,4 +1,5 @@ use crate::{ + cancellation, compute::PostgresConnection, console::messages::MetricsAuxInfo, metrics::NUM_BYTES_PROXIED_COUNTER, @@ -57,6 +58,7 @@ pub struct ProxyPassthrough { pub req: IntCounterPairGuard, pub conn: IntCounterPairGuard, + pub cancel: cancellation::Session, } impl ProxyPassthrough { From f39b0fce9b24a049208e74cc7d2a6b006b487839 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 14 Feb 2024 10:57:01 +0000 Subject: [PATCH 0172/1571] Revert #6666 "tests: try to make restored-datadir comparison tests not flaky" (#6751) The #6666 change appears to have made the test fail more often. PR https://github.com/neondatabase/neon/pull/6712 should re-instate this change, along with its change to make the overall flow more reliable. This reverts commit 568f91420a9c677e77aeb736cb3f995a85f0b106. --- test_runner/fixtures/neon_fixtures.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 26f2b999a6..04af73c327 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3967,27 +3967,24 @@ def list_files_to_compare(pgdata_dir: Path) -> List[str]: # pg is the existing and running compute node, that we want to compare with a basebackup def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint: Endpoint): - pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version) - # Get the timeline ID. We need it for the 'basebackup' command timeline_id = TimelineId(endpoint.safe_psql("SHOW neon.timeline_id")[0][0]) + # many tests already checkpoint, but do it just in case + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + cur.execute("CHECKPOINT") + + # wait for pageserver to catch up + wait_for_last_flush_lsn(env, endpoint, endpoint.tenant_id, timeline_id) # stop postgres to ensure that files won't change endpoint.stop() - # Read the shutdown checkpoint's LSN - pg_controldata_path = os.path.join(pg_bin.pg_bin_path, "pg_controldata") - cmd = f"{pg_controldata_path} -D {endpoint.pgdata_dir}" - result = subprocess.run(cmd, capture_output=True, text=True, shell=True) - checkpoint_lsn = re.findall( - "Latest checkpoint location:\\s+([0-9A-F]+/[0-9A-F]+)", result.stdout - )[0] - log.debug(f"last checkpoint at {checkpoint_lsn}") - # Take a basebackup from pageserver restored_dir_path = env.repo_dir / f"{endpoint.endpoint_id}_restored_datadir" restored_dir_path.mkdir(exist_ok=True) + pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version) psql_path = os.path.join(pg_bin.pg_bin_path, "psql") pageserver_id = env.attachment_service.locate(endpoint.tenant_id)[0]["node_id"] @@ -3995,7 +3992,7 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint {psql_path} \ --no-psqlrc \ postgres://localhost:{env.get_pageserver(pageserver_id).service_port.pg} \ - -c 'basebackup {endpoint.tenant_id} {timeline_id} {checkpoint_lsn}' \ + -c 'basebackup {endpoint.tenant_id} {timeline_id}' \ | tar -x -C {restored_dir_path} """ From df5d588f63fd329c701c37e61f77d9524ebcb19b Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 14 Feb 2024 15:22:41 +0100 Subject: [PATCH 0173/1571] 
refactor(VirtualFile::crashsafe_overwrite): avoid Handle::block_on in callers (#6731) Some callers of `VirtualFile::crashsafe_overwrite` call it on the executor thread, thereby potentially stalling it. Others are more diligent and wrap it in `spawn_blocking(..., Handle::block_on, ... )` to avoid stalling the executor thread. However, because `crashsafe_overwrite` uses VirtualFile::open_with_options internally, we spawn a new thread-local `tokio-epoll-uring::System` in the blocking pool thread that's used for the `spawn_blocking` call. This PR refactors the situation such that we do the `spawn_blocking` inside `VirtualFile::crashsafe_overwrite`. This unifies the situation for the better: 1. Callers who didn't wrap in `spawn_blocking(..., Handle::block_on, ...)` before no longer stall the executor. 2. Callers who did it before now can avoid the `block_on`, resolving the problem with the short-lived `tokio-epoll-uring::System`s in the blocking pool threads. A future PR will build on top of this and divert to tokio-epoll-uring if it's configures as the IO engine. Changes ------- - Convert implementation to std::fs and move it into `crashsafe.rs` - Yes, I know, Safekeepers (cc @arssher ) added `durable_rename` and `fsync_async_opt` recently. However, `crashsafe_overwrite` is different in the sense that it's higher level, i.e., it's more like `std::fs::write` and the Safekeeper team's code is more building block style. - The consequence is that we don't use the VirtualFile file descriptor cache anymore. - I don't think it's a big deal because we have plenty of slack wrt production file descriptor limit rlimit (see [this dashboard](https://neonprod.grafana.net/d/e4a40325-9acf-4aa0-8fd9-f6322b3f30bd/pageserver-open-file-descriptors?orgId=1)) - Use `tokio::task::spawn_blocking` in `VirtualFile::crashsafe_overwrite` to call the new `crashsafe::overwrite` API. - Inspect all callers to remove any double-`spawn_blocking` - spawn_blocking requires the captures data to be 'static + Send. So, refactor the callers. We'll need this for future tokio-epoll-uring support anyway, because tokio-epoll-uring requires owned buffers. Related Issues -------------- - overall epic to enable write path to tokio-epoll-uring: #6663 - this is also kind of relevant to the tokio-epoll-uring System creation failures that we encountered in staging, investigation being tracked in #6667 - why is it relevant? Because this PR removes two uses of `spawn_blocking+Handle::block_on` --- libs/utils/src/crashsafe.rs | 44 +++++++++++- pageserver/src/deletion_queue.rs | 5 +- pageserver/src/tenant.rs | 33 +++------ pageserver/src/tenant/metadata.rs | 2 +- pageserver/src/tenant/secondary/downloader.rs | 11 +-- pageserver/src/virtual_file.rs | 72 ++++++++----------- 6 files changed, 89 insertions(+), 78 deletions(-) diff --git a/libs/utils/src/crashsafe.rs b/libs/utils/src/crashsafe.rs index 1c72e9cae9..756b19138c 100644 --- a/libs/utils/src/crashsafe.rs +++ b/libs/utils/src/crashsafe.rs @@ -1,7 +1,7 @@ use std::{ borrow::Cow, fs::{self, File}, - io, + io::{self, Write}, }; use camino::{Utf8Path, Utf8PathBuf}; @@ -161,6 +161,48 @@ pub async fn durable_rename( Ok(()) } +/// Writes a file to the specified `final_path` in a crash safe fasion, using [`std::fs`]. +/// +/// The file is first written to the specified `tmp_path`, and in a second +/// step, the `tmp_path` is renamed to the `final_path`. 
Intermediary fsync +/// and atomic rename guarantee that, if we crash at any point, there will never +/// be a partially written file at `final_path` (but maybe at `tmp_path`). +/// +/// Callers are responsible for serializing calls of this function for a given `final_path`. +/// If they don't, there may be an error due to conflicting `tmp_path`, or there will +/// be no error and the content of `final_path` will be the "winner" caller's `content`. +/// I.e., the atomticity guarantees still hold. +pub fn overwrite( + final_path: &Utf8Path, + tmp_path: &Utf8Path, + content: &[u8], +) -> std::io::Result<()> { + let Some(final_path_parent) = final_path.parent() else { + return Err(std::io::Error::from_raw_os_error( + nix::errno::Errno::EINVAL as i32, + )); + }; + std::fs::remove_file(tmp_path).or_else(crate::fs_ext::ignore_not_found)?; + let mut file = std::fs::OpenOptions::new() + .write(true) + // Use `create_new` so that, if we race with ourselves or something else, + // we bail out instead of causing damage. + .create_new(true) + .open(tmp_path)?; + file.write_all(content)?; + file.sync_all()?; + drop(file); // don't keep the fd open for longer than we have to + + std::fs::rename(tmp_path, final_path)?; + + let final_parent_dirfd = std::fs::OpenOptions::new() + .read(true) + .open(final_path_parent)?; + + final_parent_dirfd.sync_all()?; + Ok(()) +} + #[cfg(test)] mod tests { diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 9046fe881b..e0c40ea1b0 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -234,7 +234,7 @@ impl DeletionHeader { let header_bytes = serde_json::to_vec(self).context("serialize deletion header")?; let header_path = conf.deletion_header_path(); let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX); - VirtualFile::crashsafe_overwrite(&header_path, &temp_path, header_bytes) + VirtualFile::crashsafe_overwrite(header_path, temp_path, header_bytes) .await .maybe_fatal_err("save deletion header")?; @@ -325,7 +325,8 @@ impl DeletionList { let temp_path = path_with_suffix_extension(&path, TEMP_SUFFIX); let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list"); - VirtualFile::crashsafe_overwrite(&path, &temp_path, bytes) + + VirtualFile::crashsafe_overwrite(path, temp_path, bytes) .await .maybe_fatal_err("save deletion list") .map_err(Into::into) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 9f1f188bf2..1f3bc13472 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -28,7 +28,6 @@ use remote_storage::GenericRemoteStorage; use std::fmt; use storage_broker::BrokerClientChannel; use tokio::io::BufReader; -use tokio::runtime::Handle; use tokio::sync::watch; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; @@ -2878,17 +2877,10 @@ impl Tenant { let tenant_shard_id = *tenant_shard_id; let config_path = config_path.to_owned(); - tokio::task::spawn_blocking(move || { - Handle::current().block_on(async move { - let conf_content = conf_content.into_bytes(); - VirtualFile::crashsafe_overwrite(&config_path, &temp_path, conf_content) - .await - .with_context(|| { - format!("write tenant {tenant_shard_id} config to {config_path}") - }) - }) - }) - .await??; + let conf_content = conf_content.into_bytes(); + VirtualFile::crashsafe_overwrite(config_path.clone(), temp_path, conf_content) + .await + .with_context(|| format!("write tenant {tenant_shard_id} config to {config_path}"))?; Ok(()) } @@ -2915,17 +2907,12 @@ impl Tenant 
{ let tenant_shard_id = *tenant_shard_id; let target_config_path = target_config_path.to_owned(); - tokio::task::spawn_blocking(move || { - Handle::current().block_on(async move { - let conf_content = conf_content.into_bytes(); - VirtualFile::crashsafe_overwrite(&target_config_path, &temp_path, conf_content) - .await - .with_context(|| { - format!("write tenant {tenant_shard_id} config to {target_config_path}") - }) - }) - }) - .await??; + let conf_content = conf_content.into_bytes(); + VirtualFile::crashsafe_overwrite(target_config_path.clone(), temp_path, conf_content) + .await + .with_context(|| { + format!("write tenant {tenant_shard_id} config to {target_config_path}") + })?; Ok(()) } diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index dcbe781f90..233acfd431 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -279,7 +279,7 @@ pub async fn save_metadata( let path = conf.metadata_path(tenant_shard_id, timeline_id); let temp_path = path_with_suffix_extension(&path, TEMP_FILE_SUFFIX); let metadata_bytes = data.to_bytes().context("serialize metadata")?; - VirtualFile::crashsafe_overwrite(&path, &temp_path, metadata_bytes) + VirtualFile::crashsafe_overwrite(path, temp_path, metadata_bytes) .await .context("write metadata")?; Ok(()) diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index c23416a7f0..c8288acc20 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -484,14 +484,9 @@ impl<'a> TenantDownloader<'a> { let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX); let context_msg = format!("write tenant {tenant_shard_id} heatmap to {heatmap_path}"); let heatmap_path_bg = heatmap_path.clone(); - tokio::task::spawn_blocking(move || { - tokio::runtime::Handle::current().block_on(async move { - VirtualFile::crashsafe_overwrite(&heatmap_path_bg, &temp_path, heatmap_bytes).await - }) - }) - .await - .expect("Blocking task is never aborted") - .maybe_fatal_err(&context_msg)?; + VirtualFile::crashsafe_overwrite(heatmap_path_bg, temp_path, heatmap_bytes) + .await + .maybe_fatal_err(&context_msg)?; tracing::debug!("Wrote local heatmap to {}", heatmap_path); diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 6cff748d42..2a8c22430b 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -19,14 +19,13 @@ use once_cell::sync::OnceCell; use pageserver_api::shard::TenantShardId; use std::fs::{self, File}; use std::io::{Error, ErrorKind, Seek, SeekFrom}; -use tokio_epoll_uring::{BoundedBuf, IoBufMut, Slice}; +use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice}; use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd}; use std::os::unix::fs::FileExt; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; use tokio::time::Instant; -use utils::fs_ext; pub use pageserver_api::models::virtual_file as api; pub(crate) mod io_engine; @@ -404,47 +403,34 @@ impl VirtualFile { Ok(vfile) } - /// Writes a file to the specified `final_path` in a crash safe fasion + /// Async version of [`::utils::crashsafe::overwrite`]. /// - /// The file is first written to the specified tmp_path, and in a second - /// step, the tmp path is renamed to the final path. As renames are - /// atomic, a crash during the write operation will never leave behind a - /// partially written file. 
- pub async fn crashsafe_overwrite( - final_path: &Utf8Path, - tmp_path: &Utf8Path, + /// # NB: + /// + /// Doesn't actually use the [`VirtualFile`] file descriptor cache, but, + /// it did at an earlier time. + /// And it will use this module's [`io_engine`] in the near future, so, leaving it here. + pub async fn crashsafe_overwrite + Send, Buf: IoBuf + Send>( + final_path: Utf8PathBuf, + tmp_path: Utf8PathBuf, content: B, ) -> std::io::Result<()> { - let Some(final_path_parent) = final_path.parent() else { - return Err(std::io::Error::from_raw_os_error( - nix::errno::Errno::EINVAL as i32, - )); - }; - std::fs::remove_file(tmp_path).or_else(fs_ext::ignore_not_found)?; - let mut file = Self::open_with_options( - tmp_path, - OpenOptions::new() - .write(true) - // Use `create_new` so that, if we race with ourselves or something else, - // we bail out instead of causing damage. - .create_new(true), - ) - .await?; - let (_content, res) = file.write_all(content).await; - res?; - file.sync_all().await?; - drop(file); // before the rename, that's important! - // renames are atomic - std::fs::rename(tmp_path, final_path)?; - // Only open final path parent dirfd now, so that this operation only - // ever holds one VirtualFile fd at a time. That's important because - // the current `find_victim_slot` impl might pick the same slot for both - // VirtualFile., and it eventually does a blocking write lock instead of - // try_lock. - let final_parent_dirfd = - Self::open_with_options(final_path_parent, OpenOptions::new().read(true)).await?; - final_parent_dirfd.sync_all().await?; - Ok(()) + // TODO: use tokio_epoll_uring if configured as `io_engine`. + // See https://github.com/neondatabase/neon/issues/6663 + + tokio::task::spawn_blocking(move || { + let slice_storage; + let content_len = content.bytes_init(); + let content = if content.bytes_init() > 0 { + slice_storage = Some(content.slice(0..content_len)); + slice_storage.as_deref().expect("just set it to Some()") + } else { + &[] + }; + utils::crashsafe::overwrite(&final_path, &tmp_path, content) + }) + .await + .expect("blocking task is never aborted") } /// Call File::sync_all() on the underlying File. @@ -1315,7 +1301,7 @@ mod tests { let path = testdir.join("myfile"); let tmp_path = testdir.join("myfile.tmp"); - VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo".to_vec()) + VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec()) .await .unwrap(); let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap()); @@ -1324,7 +1310,7 @@ mod tests { assert!(!tmp_path.exists()); drop(file); - VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"bar".to_vec()) + VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"bar".to_vec()) .await .unwrap(); let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap()); @@ -1346,7 +1332,7 @@ mod tests { std::fs::write(&tmp_path, "some preexisting junk that should be removed").unwrap(); assert!(tmp_path.exists()); - VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo".to_vec()) + VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec()) .await .unwrap(); From 774a6e74757d1b1d1e3c75ab103bdd38587a38f1 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 14 Feb 2024 15:59:06 +0100 Subject: [PATCH 0174/1571] refactor(virtual_file) make write_all_at take owned buffers (#6673) context: https://github.com/neondatabase/neon/issues/6663 Building atop #6664, this PR switches `write_all_at` to take owned buffers. 
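Owned buffers are what tokio-epoll-uring ultimately needs: the buffer is moved into the call and handed back together with the result, so nothing in the caller borrows it while the IO is in flight. A minimal self-contained sketch of that calling convention (hypothetical `FakeFile`, not the real `VirtualFile` API):

```
use std::io;

/// Hypothetical stand-in for a file; only here to show the calling convention.
struct FakeFile {
    data: Vec<u8>,
}

impl FakeFile {
    /// Takes the buffer by value and returns it with the result, mirroring the
    /// new `write_all_at` signature that returns `(B::Buf, Result<(), Error>)`.
    fn write_all_at_owned(&mut self, buf: Vec<u8>, offset: u64) -> (Vec<u8>, io::Result<()>) {
        let offset = offset as usize;
        if self.data.len() < offset + buf.len() {
            self.data.resize(offset + buf.len(), 0);
        }
        self.data[offset..offset + buf.len()].copy_from_slice(&buf);
        (buf, Ok(())) // ownership of the buffer goes back to the caller
    }
}

fn main() {
    let mut f = FakeFile { data: Vec::new() };
    let (buf, res) = f.write_all_at_owned(b"FOO".to_vec(), 0);
    res.unwrap();
    // The returned buffer can be reused for the next write without reallocating.
    let (_buf, res) = f.write_all_at_owned(buf, 3);
    res.unwrap();
    assert_eq!(f.data, b"FOOFOO".to_vec());
}
```

The real implementation additionally has to handle short writes and the `Slice` bookkeeping, which is what the `write_all_at` hunk in this patch does.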
The main challenge here is the `EphemeralFile::mutable_tail`, for which I'm picking the ugly solution of an `Option` that is `None` while the IO is in flight. After this, we will be able to switch `write_at` to take owned buffers and call tokio-epoll-uring's `write` function with that owned buffer. That'll be done in #6378. --- pageserver/src/tenant/ephemeral_file.rs | 51 ++++++++++++++++++------- pageserver/src/virtual_file.rs | 50 +++++++++++++++++------- 2 files changed, 74 insertions(+), 27 deletions(-) diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 6b8cd77d78..2bedbf7f61 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -6,6 +6,7 @@ use crate::context::RequestContext; use crate::page_cache::{self, PAGE_SZ}; use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader}; use crate::virtual_file::{self, VirtualFile}; +use bytes::BytesMut; use camino::Utf8PathBuf; use pageserver_api::shard::TenantShardId; use std::cmp::min; @@ -26,7 +27,10 @@ pub struct EphemeralFile { /// An ephemeral file is append-only. /// We keep the last page, which can still be modified, in [`Self::mutable_tail`]. /// The other pages, which can no longer be modified, are accessed through the page cache. - mutable_tail: [u8; PAGE_SZ], + /// + /// None <=> IO is ongoing. + /// Size is fixed to PAGE_SZ at creation time and must not be changed. + mutable_tail: Option, } impl EphemeralFile { @@ -60,7 +64,7 @@ impl EphemeralFile { _timeline_id: timeline_id, file, len: 0, - mutable_tail: [0u8; PAGE_SZ], + mutable_tail: Some(BytesMut::zeroed(PAGE_SZ)), }) } @@ -103,7 +107,13 @@ impl EphemeralFile { }; } else { debug_assert_eq!(blknum as u64, self.len / PAGE_SZ as u64); - Ok(BlockLease::EphemeralFileMutableTail(&self.mutable_tail)) + Ok(BlockLease::EphemeralFileMutableTail( + self.mutable_tail + .as_deref() + .expect("we're not doing IO, it must be Some()") + .try_into() + .expect("we ensure that it's always PAGE_SZ"), + )) } } @@ -135,21 +145,27 @@ impl EphemeralFile { ) -> Result<(), io::Error> { let mut src_remaining = src; while !src_remaining.is_empty() { - let dst_remaining = &mut self.ephemeral_file.mutable_tail[self.off..]; + let dst_remaining = &mut self + .ephemeral_file + .mutable_tail + .as_deref_mut() + .expect("IO is not yet ongoing")[self.off..]; let n = min(dst_remaining.len(), src_remaining.len()); dst_remaining[..n].copy_from_slice(&src_remaining[..n]); self.off += n; src_remaining = &src_remaining[n..]; if self.off == PAGE_SZ { - match self + let mutable_tail = std::mem::take(&mut self.ephemeral_file.mutable_tail) + .expect("IO is not yet ongoing"); + let (mutable_tail, res) = self .ephemeral_file .file - .write_all_at( - &self.ephemeral_file.mutable_tail, - self.blknum as u64 * PAGE_SZ as u64, - ) - .await - { + .write_all_at(mutable_tail, self.blknum as u64 * PAGE_SZ as u64) + .await; + // TODO: If we panic before we can put the mutable_tail back, subsequent calls will fail. + // I.e., the IO isn't retryable if we panic. + self.ephemeral_file.mutable_tail = Some(mutable_tail); + match res { Ok(_) => { // Pre-warm the page cache with what we just wrote. // This isn't necessary for coherency/correctness, but it's how we've always done it. 
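Stripped down, the take-then-restore dance looks like the following sketch (hypothetical `TailWriter` with plain std types, not the real `EphemeralFile`): the buffer is moved out of `self` so it can be handed to the owned-buffer write, and stored back once the write returns it, so `None` marks "IO in flight", which is what the `expect()` calls in this diff assert.

```
/// Hypothetical sketch of the Option-is-None-while-IO-is-in-flight pattern.
struct TailWriter {
    /// None <=> a write using this buffer is currently in flight.
    tail: Option<Vec<u8>>,
    /// Stand-in for the real file.
    flushed: Vec<Vec<u8>>,
}

impl TailWriter {
    fn flush_tail(&mut self) -> std::io::Result<()> {
        // Move the buffer out; `self.tail` stays None for the duration of the write.
        let tail = std::mem::take(&mut self.tail).expect("no flush is in flight");
        // Stand-in for `file.write_all_at(tail, offset)`, which returns the buffer.
        let (tail, res) = self.fake_write_all_at(tail);
        // Put the buffer back before looking at the result, so later calls
        // (and the read path) see Some(..) again.
        self.tail = Some(tail);
        res
    }

    fn fake_write_all_at(&mut self, buf: Vec<u8>) -> (Vec<u8>, std::io::Result<()>) {
        self.flushed.push(buf.clone());
        (buf, Ok(()))
    }
}

fn main() {
    let mut w = TailWriter { tail: Some(vec![0u8; 8]), flushed: Vec::new() };
    w.tail.as_mut().unwrap()[..3].copy_from_slice(b"abc");
    w.flush_tail().unwrap();
    assert_eq!(w.flushed.len(), 1);
    assert!(w.tail.is_some()); // the tail is usable again after the flush
}
```

As the TODO in the hunk above notes, if the write path panics before the buffer is put back, later calls hit that `expect()`, i.e. the IO is not retryable across a panic.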
@@ -169,7 +185,12 @@ impl EphemeralFile { Ok(page_cache::ReadBufResult::NotFound(mut write_guard)) => { let buf: &mut [u8] = write_guard.deref_mut(); debug_assert_eq!(buf.len(), PAGE_SZ); - buf.copy_from_slice(&self.ephemeral_file.mutable_tail); + buf.copy_from_slice( + self.ephemeral_file + .mutable_tail + .as_deref() + .expect("IO is not ongoing"), + ); let _ = write_guard.mark_valid(); // pre-warm successful } @@ -181,7 +202,11 @@ impl EphemeralFile { // Zero the buffer for re-use. // Zeroing is critical for correcntess because the write_blob code below // and similarly read_blk expect zeroed pages. - self.ephemeral_file.mutable_tail.fill(0); + self.ephemeral_file + .mutable_tail + .as_deref_mut() + .expect("IO is not ongoing") + .fill(0); // This block is done, move to next one. self.blknum += 1; self.off = 0; diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 2a8c22430b..858fc0ef64 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -568,24 +568,37 @@ impl VirtualFile { } // Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#219-235 - pub async fn write_all_at(&self, mut buf: &[u8], mut offset: u64) -> Result<(), Error> { + pub async fn write_all_at( + &self, + buf: B, + mut offset: u64, + ) -> (B::Buf, Result<(), Error>) { + let buf_len = buf.bytes_init(); + if buf_len == 0 { + return (Slice::into_inner(buf.slice_full()), Ok(())); + } + let mut buf = buf.slice(0..buf_len); while !buf.is_empty() { - match self.write_at(buf, offset).await { + // TODO: push `buf` further down + match self.write_at(&buf, offset).await { Ok(0) => { - return Err(Error::new( - std::io::ErrorKind::WriteZero, - "failed to write whole buffer", - )); + return ( + Slice::into_inner(buf), + Err(Error::new( + std::io::ErrorKind::WriteZero, + "failed to write whole buffer", + )), + ); } Ok(n) => { - buf = &buf[n..]; + buf = buf.slice(n..); offset += n as u64; } Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} - Err(e) => return Err(e), + Err(e) => return (Slice::into_inner(buf), Err(e)), } } - Ok(()) + (Slice::into_inner(buf), Ok(())) } /// Writes `buf.slice(0..buf.bytes_init())`. 
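The loop keeps the semantics of the std `write_all_at` it was copied from: a short write advances the offset, `Interrupted` is retried, and a write that makes no progress becomes `WriteZero`. For reference, the same logic over std's `FileExt::write_at`, as a standalone sketch rather than pageserver code:

```
use std::fs::File;
use std::io::{Error, ErrorKind};
use std::os::unix::fs::FileExt;

fn write_all_at_std(file: &File, mut buf: &[u8], mut offset: u64) -> std::io::Result<()> {
    while !buf.is_empty() {
        match file.write_at(buf, offset) {
            // A write that makes no progress would loop forever; report it instead.
            Ok(0) => {
                return Err(Error::new(ErrorKind::WriteZero, "failed to write whole buffer"))
            }
            // Short write: drop the bytes that went out, advance the file offset.
            Ok(n) => {
                buf = &buf[n..];
                offset += n as u64;
            }
            // Interrupted by a signal: retry with the same arguments.
            Err(ref e) if e.kind() == ErrorKind::Interrupted => {}
            Err(e) => return Err(e),
        }
    }
    Ok(())
}

fn main() -> std::io::Result<()> {
    let dir = std::env::temp_dir().join("write_all_at_sketch");
    std::fs::create_dir_all(&dir)?;
    let path = dir.join("example");
    let file = File::create(&path)?;
    // Positional writes don't touch a shared cursor, so offsets are explicit.
    write_all_at_std(&file, b"BARBAR", 0)?;
    write_all_at_std(&file, b"FOO", 0)?;
    assert_eq!(std::fs::read(&path)?, b"FOOBAR".to_vec());
    Ok(())
}
```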
@@ -1050,10 +1063,19 @@ mod tests { MaybeVirtualFile::File(file) => file.read_exact_at(&mut buf, offset).map(|()| buf), } } - async fn write_all_at(&self, buf: &[u8], offset: u64) -> Result<(), Error> { + async fn write_all_at(&self, buf: B, offset: u64) -> Result<(), Error> { match self { - MaybeVirtualFile::VirtualFile(file) => file.write_all_at(buf, offset).await, - MaybeVirtualFile::File(file) => file.write_all_at(buf, offset), + MaybeVirtualFile::VirtualFile(file) => { + let (_buf, res) = file.write_all_at(buf, offset).await; + res + } + MaybeVirtualFile::File(file) => { + let buf_len = buf.bytes_init(); + if buf_len == 0 { + return Ok(()); + } + file.write_all_at(&buf.slice(0..buf_len), offset) + } } } async fn seek(&mut self, pos: SeekFrom) -> Result { @@ -1200,8 +1222,8 @@ mod tests { .to_owned(), ) .await?; - file_b.write_all_at(b"BAR", 3).await?; - file_b.write_all_at(b"FOO", 0).await?; + file_b.write_all_at(b"BAR".to_vec(), 3).await?; + file_b.write_all_at(b"FOO".to_vec(), 0).await?; assert_eq!(file_b.read_string_at(2, 3).await?, "OBA"); From 840abe395413508db40d0428e30f09343c051fed Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 14 Feb 2024 15:01:16 +0000 Subject: [PATCH 0175/1571] pageserver: store aux files as deltas (#6742) ## Problem Aux files were stored with an O(N^2) cost, since on each modification the entire map is re-written as a page image. This addresses one axis of the inefficiency in logical replication's use of storage (https://github.com/neondatabase/neon/issues/6626). It will still be writing a large amount of duplicative data if writing the same slot's state every 15 seconds, but the impact will be O(N) instead of O(N^2). ## Summary of changes - Introduce `NeonWalRecord::AuxFile` - In `DatadirModification`, if the AUX_FILES_KEY has already been set, then write a delta instead of an image --- pageserver/src/pgdatadir_mapping.rs | 162 +++++++++++++++++++++++---- pageserver/src/tenant.rs | 41 ++++--- pageserver/src/walrecord.rs | 5 + pageserver/src/walredo.rs | 2 +- pageserver/src/walredo/apply_neon.rs | 70 +++++++++++- 5 files changed, 242 insertions(+), 38 deletions(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 5f80ea9b5e..0ff03303d4 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -156,6 +156,7 @@ impl Timeline { pending_updates: HashMap::new(), pending_deletions: Vec::new(), pending_nblocks: 0, + pending_aux_files: None, pending_directory_entries: Vec::new(), lsn, } @@ -870,6 +871,14 @@ pub struct DatadirModification<'a> { pending_updates: HashMap>, pending_deletions: Vec<(Range, Lsn)>, pending_nblocks: i64, + + // If we already wrote any aux file changes in this modification, stash the latest dir. If set, + // [`Self::put_file`] may assume that it is safe to emit a delta rather than checking + // if AUX_FILES_KEY is already set. + pending_aux_files: Option, + + /// For special "directory" keys that store key-value maps, track the size of the map + /// if it was updated in this modification. pending_directory_entries: Vec<(DirectoryKind, usize)>, } @@ -1384,31 +1393,76 @@ impl<'a> DatadirModification<'a> { content: &[u8], ctx: &RequestContext, ) -> anyhow::Result<()> { - let mut dir = match self.get(AUX_FILES_KEY, ctx).await { - Ok(buf) => AuxFilesDirectory::des(&buf)?, - Err(e) => { - // This is expected: historical databases do not have the key. 
- debug!("Failed to get info about AUX files: {}", e); - AuxFilesDirectory { - files: HashMap::new(), + let file_path = path.to_string(); + let content = if content.is_empty() { + None + } else { + Some(Bytes::copy_from_slice(content)) + }; + + let dir = if let Some(mut dir) = self.pending_aux_files.take() { + // We already updated aux files in `self`: emit a delta and update our latest value + + self.put( + AUX_FILES_KEY, + Value::WalRecord(NeonWalRecord::AuxFile { + file_path: file_path.clone(), + content: content.clone(), + }), + ); + + dir.upsert(file_path, content); + dir + } else { + // Check if the AUX_FILES_KEY is initialized + match self.get(AUX_FILES_KEY, ctx).await { + Ok(dir_bytes) => { + let mut dir = AuxFilesDirectory::des(&dir_bytes)?; + // Key is already set, we may append a delta + self.put( + AUX_FILES_KEY, + Value::WalRecord(NeonWalRecord::AuxFile { + file_path: file_path.clone(), + content: content.clone(), + }), + ); + dir.upsert(file_path, content); + dir + } + Err( + e @ (PageReconstructError::AncestorStopping(_) + | PageReconstructError::Cancelled + | PageReconstructError::AncestorLsnTimeout(_)), + ) => { + // Important that we do not interpret a shutdown error as "not found" and thereby + // reset the map. + return Err(e.into()); + } + // FIXME: PageReconstructError doesn't have an explicit variant for key-not-found, so + // we are assuming that all _other_ possible errors represents a missing key. If some + // other error occurs, we may incorrectly reset the map of aux files. + Err(PageReconstructError::Other(_) | PageReconstructError::WalRedo(_)) => { + // Key is missing, we must insert an image as the basis for subsequent deltas. + + let mut dir = AuxFilesDirectory { + files: HashMap::new(), + }; + dir.upsert(file_path, content); + self.put( + AUX_FILES_KEY, + Value::Image(Bytes::from( + AuxFilesDirectory::ser(&dir).context("serialize")?, + )), + ); + dir } } }; - let path = path.to_string(); - if content.is_empty() { - dir.files.remove(&path); - } else { - dir.files.insert(path, Bytes::copy_from_slice(content)); - } + self.pending_directory_entries .push((DirectoryKind::AuxFiles, dir.files.len())); + self.pending_aux_files = Some(dir); - self.put( - AUX_FILES_KEY, - Value::Image(Bytes::from( - AuxFilesDirectory::ser(&dir).context("serialize")?, - )), - ); Ok(()) } @@ -1618,8 +1672,18 @@ struct RelDirectory { } #[derive(Debug, Serialize, Deserialize, Default)] -struct AuxFilesDirectory { - files: HashMap, +pub(crate) struct AuxFilesDirectory { + pub(crate) files: HashMap, +} + +impl AuxFilesDirectory { + pub(crate) fn upsert(&mut self, key: String, value: Option) { + if let Some(value) = value { + self.files.insert(key, value); + } else { + self.files.remove(&key); + } + } } #[derive(Debug, Serialize, Deserialize)] @@ -1655,8 +1719,60 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]); #[allow(clippy::bool_assert_comparison)] #[cfg(test)] mod tests { - //use super::repo_harness::*; - //use super::*; + use hex_literal::hex; + use utils::id::TimelineId; + + use super::*; + + use crate::{tenant::harness::TenantHarness, DEFAULT_PG_VERSION}; + + /// Test a round trip of aux file updates, from DatadirModification to reading back from the Timeline + #[tokio::test] + async fn aux_files_round_trip() -> anyhow::Result<()> { + let name = "aux_files_round_trip"; + let harness = TenantHarness::create(name)?; + + pub const TIMELINE_ID: TimelineId = + TimelineId::from_array(hex!("11223344556677881122334455667788")); + + let (tenant, ctx) = 
harness.load().await; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx) + .await?; + let tline = tline.raw_timeline().unwrap(); + + // First modification: insert two keys + let mut modification = tline.begin_modification(Lsn(0x1000)); + modification.put_file("foo/bar1", b"content1", &ctx).await?; + modification.set_lsn(Lsn(0x1008))?; + modification.put_file("foo/bar2", b"content2", &ctx).await?; + modification.commit(&ctx).await?; + let expect_1008 = HashMap::from([ + ("foo/bar1".to_string(), Bytes::from_static(b"content1")), + ("foo/bar2".to_string(), Bytes::from_static(b"content2")), + ]); + + let readback = tline.list_aux_files(Lsn(0x1008), &ctx).await?; + assert_eq!(readback, expect_1008); + + // Second modification: update one key, remove the other + let mut modification = tline.begin_modification(Lsn(0x2000)); + modification.put_file("foo/bar1", b"content3", &ctx).await?; + modification.set_lsn(Lsn(0x2008))?; + modification.put_file("foo/bar2", b"", &ctx).await?; + modification.commit(&ctx).await?; + let expect_2008 = + HashMap::from([("foo/bar1".to_string(), Bytes::from_static(b"content3"))]); + + let readback = tline.list_aux_files(Lsn(0x2008), &ctx).await?; + assert_eq!(readback, expect_2008); + + // Reading back in time works + let readback = tline.list_aux_files(Lsn(0x1008), &ctx).await?; + assert_eq!(readback, expect_1008); + + Ok(()) + } /* fn assert_current_logical_size(timeline: &DatadirTimeline, lsn: Lsn) { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 1f3bc13472..44a446d697 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3901,6 +3901,7 @@ pub(crate) mod harness { use utils::lsn::Lsn; use crate::deletion_queue::mock::MockDeletionQueue; + use crate::walredo::apply_neon; use crate::{ config::PageServerConf, repository::Key, tenant::Tenant, walrecord::NeonWalRecord, }; @@ -4160,20 +4161,34 @@ pub(crate) mod harness { records: Vec<(Lsn, NeonWalRecord)>, _pg_version: u32, ) -> anyhow::Result { - let s = format!( - "redo for {} to get to {}, with {} and {} records", - key, - lsn, - if base_img.is_some() { - "base image" - } else { - "no base image" - }, - records.len() - ); - println!("{s}"); + let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1)); - Ok(TEST_IMG(&s)) + if records_neon { + // For Neon wal records, we can decode without spawning postgres, so do so. + let base_img = base_img.expect("Neon WAL redo requires base image").1; + let mut page = BytesMut::new(); + page.extend_from_slice(&base_img); + for (_record_lsn, record) in records { + apply_neon::apply_in_neon(&record, key, &mut page)?; + } + Ok(page.freeze()) + } else { + // We never spawn a postgres walredo process in unit tests: just log what we might have done. 
+ let s = format!( + "redo for {} to get to {}, with {} and {} records", + key, + lsn, + if base_img.is_some() { + "base image" + } else { + "no base image" + }, + records.len() + ); + println!("{s}"); + + Ok(TEST_IMG(&s)) + } } } } diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index ff6bc9194b..1b7777a544 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -44,6 +44,11 @@ pub enum NeonWalRecord { moff: MultiXactOffset, members: Vec, }, + /// Update the map of AUX files, either writing or dropping an entry + AuxFile { + file_path: String, + content: Option, + }, } impl NeonWalRecord { diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 98a6a0bb6c..35cbefb92c 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -22,7 +22,7 @@ mod process; /// Code to apply [`NeonWalRecord`]s. -mod apply_neon; +pub(crate) mod apply_neon; use crate::config::PageServerConf; use crate::metrics::{ diff --git a/pageserver/src/walredo/apply_neon.rs b/pageserver/src/walredo/apply_neon.rs index 52899349c4..6ce90e0c47 100644 --- a/pageserver/src/walredo/apply_neon.rs +++ b/pageserver/src/walredo/apply_neon.rs @@ -1,7 +1,8 @@ +use crate::pgdatadir_mapping::AuxFilesDirectory; use crate::walrecord::NeonWalRecord; use anyhow::Context; use byteorder::{ByteOrder, LittleEndian}; -use bytes::BytesMut; +use bytes::{BufMut, BytesMut}; use pageserver_api::key::{key_to_rel_block, key_to_slru_block, Key}; use pageserver_api::reltag::SlruKind; use postgres_ffi::pg_constants; @@ -12,6 +13,7 @@ use postgres_ffi::v14::nonrelfile_utils::{ }; use postgres_ffi::BLCKSZ; use tracing::*; +use utils::bin_ser::BeSer; /// Can this request be served by neon redo functions /// or we need to pass it to wal-redo postgres process? 
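The hunk below applies a `NeonWalRecord::AuxFile` by deserializing the directory image, upserting or removing the entry, and re-serializing the page. Ignoring the (de)serialization, the reconstruction model is simply a base image plus ordered deltas; a sketch with plain std types (not the real `AuxFilesDirectory` or `NeonWalRecord`):

```
use std::collections::HashMap;

/// Hypothetical delta record; Upsert vs. Delete mirrors the
/// `content: Option<Bytes>` field on the new `NeonWalRecord::AuxFile` variant.
enum AuxDelta {
    Upsert { path: String, content: Vec<u8> },
    Delete { path: String },
}

/// Reconstruct the current state from a base image plus ordered deltas.
fn apply(mut image: HashMap<String, Vec<u8>>, deltas: &[AuxDelta]) -> HashMap<String, Vec<u8>> {
    for delta in deltas {
        match delta {
            AuxDelta::Upsert { path, content } => {
                image.insert(path.clone(), content.clone());
            }
            AuxDelta::Delete { path } => {
                image.remove(path);
            }
        }
    }
    image
}

fn main() {
    let base = HashMap::from([("two".to_string(), b"content0".to_vec())]);
    let deltas = vec![
        AuxDelta::Upsert { path: "one".to_string(), content: b"content1".to_vec() },
        AuxDelta::Delete { path: "two".to_string() },
    ];
    let current = apply(base, &deltas);
    assert_eq!(current.len(), 1);
    assert_eq!(current["one"], b"content1".to_vec());
}
```

The new `apply_aux_file_deltas` test added below exercises the same idea end-to-end, with the image stored serialized inside a page.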
@@ -230,6 +232,72 @@ pub(crate) fn apply_in_neon( LittleEndian::write_u32(&mut page[memberoff..memberoff + 4], member.xid); } } + NeonWalRecord::AuxFile { file_path, content } => { + let mut dir = AuxFilesDirectory::des(page)?; + dir.upsert(file_path.clone(), content.clone()); + + page.clear(); + let mut writer = page.writer(); + dir.ser_into(&mut writer)?; + } } Ok(()) } + +#[cfg(test)] +mod test { + use bytes::Bytes; + use pageserver_api::key::AUX_FILES_KEY; + + use super::*; + use std::collections::HashMap; + + use crate::{pgdatadir_mapping::AuxFilesDirectory, walrecord::NeonWalRecord}; + + /// Test [`apply_in_neon`]'s handling of NeonWalRecord::AuxFile + #[test] + fn apply_aux_file_deltas() -> anyhow::Result<()> { + let base_dir = AuxFilesDirectory { + files: HashMap::from([ + ("two".to_string(), Bytes::from_static(b"content0")), + ("three".to_string(), Bytes::from_static(b"contentX")), + ]), + }; + let base_image = AuxFilesDirectory::ser(&base_dir)?; + + let deltas = vec![ + // Insert + NeonWalRecord::AuxFile { + file_path: "one".to_string(), + content: Some(Bytes::from_static(b"content1")), + }, + // Update + NeonWalRecord::AuxFile { + file_path: "two".to_string(), + content: Some(Bytes::from_static(b"content99")), + }, + // Delete + NeonWalRecord::AuxFile { + file_path: "three".to_string(), + content: None, + }, + ]; + + let file_path = AUX_FILES_KEY; + let mut page = BytesMut::from_iter(base_image); + + for record in deltas { + apply_in_neon(&record, file_path, &mut page)?; + } + + let reconstructed = AuxFilesDirectory::des(&page)?; + let expect = HashMap::from([ + ("one".to_string(), Bytes::from_static(b"content1")), + ("two".to_string(), Bytes::from_static(b"content99")), + ]); + + assert_eq!(reconstructed.files, expect); + + Ok(()) + } +} From 7d3cdc05d486ee1a1ef5ec8d7137949bcf7d036e Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 14 Feb 2024 18:01:15 +0100 Subject: [PATCH 0176/1571] fix(pageserver): pagebench doesn't work with released artifacts (#6757) The canonical release artifact of neon.git is the Docker image with all the binaries in them: ``` docker pull neondatabase/neon:release-4854 docker create --name extract neondatabase/neon:release-4854 docker cp extract:/usr/local/bin/pageserver ./pageserver.release-4854 chmod +x pageserver.release-4854 cp -a pageserver.release-4854 ./target/release/pageserver ``` Before this PR, these artifacts didn't expose the `keyspace` API, thereby preventing `pagebench get-page-latest-lsn` from working. Having working pagebench is useful, e.g., for experiments in staging. So, expose the API, but don't document it, as it's not part of the interface with control plane. 
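In other words, a route registered through the testing-only wrapper is refused unless the pageserver is running with testing enabled (the exact mechanism is not shown in this patch), which is why the release artifacts above never served `/keyspace`; registering it through the plain `api_handler` makes it unconditional. A rough illustration of that gating idea with hypothetical types, not the real handlers from `pageserver/src/http/routes.rs`:

```
/// Hypothetical sketch of a testing-gated route vs. an unconditional one.
struct Server {
    testing_enabled: bool,
}

enum Outcome {
    Served(String),
    Refused,
}

impl Server {
    /// Rough analogue of wrapping a route in a testing-only handler.
    fn testing_route(&self, handler: impl Fn() -> String) -> Outcome {
        if self.testing_enabled {
            Outcome::Served(handler())
        } else {
            // What a release pageserver effectively did for /keyspace before this patch.
            Outcome::Refused
        }
    }

    /// Rough analogue of registering the route with the plain handler.
    fn route(&self, handler: impl Fn() -> String) -> Outcome {
        Outcome::Served(handler())
    }
}

fn main() {
    let release = Server { testing_enabled: false };
    let testing = Server { testing_enabled: true };
    let keyspace = "keyspace listing".to_string();
    assert!(matches!(release.testing_route(|| keyspace.clone()), Outcome::Refused));
    if let Outcome::Served(body) = testing.testing_route(|| keyspace.clone()) {
        println!("testing build serves: {body}");
    }
    if let Outcome::Served(body) = release.route(|| keyspace.clone()) {
        println!("after the change, release serves: {body}");
    }
}
```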
--- pageserver/src/http/routes.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index c354cc9ab6..ab546c873a 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2214,7 +2214,7 @@ pub fn make_router( ) .get( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/keyspace", - |r| testing_api_handler("read out the keyspace", r, timeline_collect_keyspace), + |r| api_handler(r, timeline_collect_keyspace), ) .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler)) .any(handler_404)) From a2d0d44b4248769c30fff79ef70f42e3174f4023 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 14 Feb 2024 19:16:05 +0100 Subject: [PATCH 0177/1571] Remove unused allow's (#6760) These allow's became redundant some time ago so remove them, or address them if addressing is very simple. --- control_plane/attachment_service/src/persistence.rs | 2 -- libs/metrics/src/lib.rs | 1 - libs/postgres_ffi/src/lib.rs | 2 +- libs/remote_storage/src/local_fs.rs | 1 - libs/utils/benches/benchmarks.rs | 2 -- pageserver/src/deletion_queue.rs | 1 - pageserver/src/disk_usage_eviction_task.rs | 6 ------ pageserver/src/task_mgr.rs | 5 ----- pageserver/src/tenant.rs | 1 - pageserver/src/tenant/disk_btree.rs | 1 - pageserver/src/tenant/timeline/eviction_task.rs | 2 +- s3_scrubber/src/cloud_admin_api.rs | 6 +----- 12 files changed, 3 insertions(+), 27 deletions(-) diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs index 457dc43232..5b3b032bc9 100644 --- a/control_plane/attachment_service/src/persistence.rs +++ b/control_plane/attachment_service/src/persistence.rs @@ -381,7 +381,6 @@ impl Persistence { // // We create the child shards here, so that they will be available for increment_generation calls // if some pageserver holding a child shard needs to restart before the overall tenant split is complete. - #[allow(dead_code)] pub(crate) async fn begin_shard_split( &self, old_shard_count: ShardCount, @@ -449,7 +448,6 @@ impl Persistence { // When we finish shard splitting, we must atomically clean up the old shards // and insert the new shards, and clear the splitting marker. - #[allow(dead_code)] pub(crate) async fn complete_shard_split( &self, split_tenant_id: TenantId, diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index b57fd9f33b..18786106d1 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -115,7 +115,6 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) { // performed by the process. // We know the size of the block, so we can determine the I/O bytes out of it. // The value might be not 100% exact, but should be fine for Prometheus metrics in this case. -#[allow(clippy::unnecessary_cast)] fn update_rusage_metrics() { let rusage_stats = get_rusage_stats(); diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index d10ebfe277..aa6845b9b1 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -3,7 +3,7 @@ #![allow(non_snake_case)] // bindgen creates some unsafe code with no doc comments. #![allow(clippy::missing_safety_doc)] -// noted at 1.63 that in many cases there's a u32 -> u32 transmutes in bindgen code. +// noted at 1.63 that in many cases there's u32 -> u32 transmutes in bindgen code. 
#![allow(clippy::useless_transmute)] // modules included with the postgres_ffi macro depend on the types of the specific version's // types, and trigger a too eager lint. diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index f53ba9db07..e88111e8e2 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -435,7 +435,6 @@ impl RemoteStorage for LocalFs { Ok(()) } - #[allow(clippy::diverging_sub_expression)] async fn time_travel_recover( &self, _prefix: Option<&RemotePath>, diff --git a/libs/utils/benches/benchmarks.rs b/libs/utils/benches/benchmarks.rs index 98d839ca55..44eb36387c 100644 --- a/libs/utils/benches/benchmarks.rs +++ b/libs/utils/benches/benchmarks.rs @@ -1,5 +1,3 @@ -#![allow(unused)] - use criterion::{criterion_group, criterion_main, Criterion}; use utils::id; diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index e0c40ea1b0..f8f2866a3b 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -835,7 +835,6 @@ mod test { } impl ControlPlaneGenerationsApi for MockControlPlane { - #[allow(clippy::diverging_sub_expression)] // False positive via async_trait async fn re_attach(&self) -> Result, RetryForeverError> { unimplemented!() } diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index d5f5a20683..b1c6f35704 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -351,7 +351,6 @@ pub enum IterationOutcome { Finished(IterationOutcomeFinished), } -#[allow(dead_code)] #[derive(Debug, Serialize)] pub struct IterationOutcomeFinished { /// The actual usage observed before we started the iteration. @@ -366,7 +365,6 @@ pub struct IterationOutcomeFinished { } #[derive(Debug, Serialize)] -#[allow(dead_code)] struct AssumedUsage { /// The expected value for `after`, after phase 2. projected_after: U, @@ -374,14 +372,12 @@ struct AssumedUsage { failed: LayerCount, } -#[allow(dead_code)] #[derive(Debug, Serialize)] struct PlannedUsage { respecting_tenant_min_resident_size: U, fallback_to_global_lru: Option, } -#[allow(dead_code)] #[derive(Debug, Default, Serialize)] struct LayerCount { file_sizes: u64, @@ -565,7 +561,6 @@ pub(crate) struct EvictionSecondaryLayer { #[derive(Clone)] pub(crate) enum EvictionLayer { Attached(Layer), - #[allow(dead_code)] Secondary(EvictionSecondaryLayer), } @@ -1105,7 +1100,6 @@ mod filesystem_level_usage { use super::DiskUsageEvictionTaskConfig; #[derive(Debug, Clone, Copy)] - #[allow(dead_code)] pub struct Usage<'a> { config: &'a DiskUsageEvictionTaskConfig, diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 3cec5fa850..6317b0a7ae 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -30,10 +30,6 @@ //! only a single tenant or timeline. //! -// Clippy 1.60 incorrectly complains about the tokio::task_local!() macro. -// Silence it. See https://github.com/rust-lang/rust-clippy/issues/9224. 
-#![allow(clippy::declare_interior_mutable_const)] - use std::collections::HashMap; use std::fmt; use std::future::Future; @@ -312,7 +308,6 @@ struct MutableTaskState { } struct PageServerTask { - #[allow(dead_code)] // unused currently task_id: PageserverTaskId, kind: TaskKind, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 44a446d697..dc9b8247a5 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -4360,7 +4360,6 @@ mod tests { ctx: &RequestContext, ) -> anyhow::Result<()> { let mut lsn = start_lsn; - #[allow(non_snake_case)] { let writer = tline.writer().await; // Create a relation on the timeline diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index 06a04bf536..9f104aff86 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -36,7 +36,6 @@ use crate::{ pub const VALUE_SZ: usize = 5; pub const MAX_VALUE: u64 = 0x007f_ffff_ffff; -#[allow(dead_code)] pub const PAGE_SZ: usize = 8192; #[derive(Clone, Copy, Debug)] diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index d87f78e35f..33ba234a63 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -196,13 +196,13 @@ impl Timeline { ControlFlow::Continue(()) => (), } - #[allow(dead_code)] #[derive(Debug, Default)] struct EvictionStats { candidates: usize, evicted: usize, errors: usize, not_evictable: usize, + #[allow(dead_code)] skipped_for_shutdown: usize, } diff --git a/s3_scrubber/src/cloud_admin_api.rs b/s3_scrubber/src/cloud_admin_api.rs index 151421c84f..45cac23690 100644 --- a/s3_scrubber/src/cloud_admin_api.rs +++ b/s3_scrubber/src/cloud_admin_api.rs @@ -1,11 +1,7 @@ -#![allow(unused)] - -use std::str::FromStr; use std::time::Duration; use chrono::{DateTime, Utc}; use hex::FromHex; -use pageserver::tenant::Tenant; use reqwest::{header, Client, StatusCode, Url}; use serde::Deserialize; use tokio::sync::Semaphore; @@ -290,7 +286,7 @@ impl CloudAdminApiClient { tokio::time::sleep(Duration::from_millis(500)).await; continue; } - status => { + _status => { return Err(Error::new( "List active projects".to_string(), ErrorKind::ResponseStatus(response.status()), From c7538a2c20178ecd32662de3200cfe9fff19e8a3 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Wed, 14 Feb 2024 19:43:52 +0100 Subject: [PATCH 0178/1571] Proxy: remove fail fast logic to connect to compute (#6759) ## Problem Flaky tests ## Summary of changes Remove failfast logic --- proxy/src/proxy/connect_compute.rs | 35 ++++++++++++++--------------- proxy/src/proxy/tests.rs | 36 ------------------------------ 2 files changed, 17 insertions(+), 54 deletions(-) diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 6e57caf998..c76e2ff6d9 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -122,25 +122,24 @@ where error!(error = ?err, "could not connect to compute node"); - let node_info = - if err.get_error_kind() == crate::error::ErrorKind::Postgres || !node_info.cached() { - // If the error is Postgres, that means that we managed to connect to the compute node, but there was an error. - // Do not need to retrieve a new node_info, just return the old one. 
- if !err.should_retry(num_retries) { - return Err(err.into()); - } - node_info - } else { - // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node - info!("compute node's state has likely changed; requesting a wake-up"); - ctx.latency_timer.cache_miss(); - let old_node_info = invalidate_cache(node_info); - let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?; - node_info.reuse_settings(old_node_info); + let node_info = if !node_info.cached() { + // If we just recieved this from cplane and dodn't get it from cache, we shouldn't retry. + // Do not need to retrieve a new node_info, just return the old one. + if !err.should_retry(num_retries) { + return Err(err.into()); + } + node_info + } else { + // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node + info!("compute node's state has likely changed; requesting a wake-up"); + ctx.latency_timer.cache_miss(); + let old_node_info = invalidate_cache(node_info); + let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?; + node_info.reuse_settings(old_node_info); - mechanism.update_connect_config(&mut node_info.config); - node_info - }; + mechanism.update_connect_config(&mut node_info.config); + node_info + }; // now that we have a new node, try connect to it repeatedly. // this can error for a few reasons, for instance: diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index efbd661bbf..1a01f32339 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -375,8 +375,6 @@ enum ConnectAction { Connect, Retry, Fail, - RetryPg, - FailPg, } #[derive(Clone)] @@ -466,14 +464,6 @@ impl ConnectMechanism for TestConnectMechanism { retryable: false, kind: ErrorKind::Compute, }), - ConnectAction::FailPg => Err(TestConnectError { - retryable: false, - kind: ErrorKind::Postgres, - }), - ConnectAction::RetryPg => Err(TestConnectError { - retryable: true, - kind: ErrorKind::Postgres, - }), x => panic!("expecting action {:?}, connect is called instead", x), } } @@ -572,32 +562,6 @@ async fn connect_to_compute_retry() { mechanism.verify(); } -#[tokio::test] -async fn connect_to_compute_retry_pg() { - let _ = env_logger::try_init(); - use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![Wake, RetryPg, Connect]); - let user_info = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, &user_info, false) - .await - .unwrap(); - mechanism.verify(); -} - -#[tokio::test] -async fn connect_to_compute_fail_pg() { - let _ = env_logger::try_init(); - use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![Wake, FailPg]); - let user_info = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, &user_info, false) - .await - .unwrap_err(); - mechanism.verify(); -} - /// Test that we don't retry if the error is not retryable. #[tokio::test] async fn connect_to_compute_non_retry_1() { From fff2468aa2780edb3941f9851e19ee0bfb1fafd1 Mon Sep 17 00:00:00 2001 From: Shayan Hosseini Date: Wed, 14 Feb 2024 10:45:05 -0800 Subject: [PATCH 0179/1571] Add resource consume test funcs (#6747) ## Problem Building on #5875 to add handy test functions for autoscaling. 
Resolves #5609 ## Summary of changes This PR makes the following changes to #5875: - Enable `neon_test_utils` extension in the compute node docker image, so we could use it in the e2e tests (as discussed with @kelvich). - Removed test functions related to disk as we don't use them for autoscaling. - Fix the warning with printf-ing unsigned long variables. --------- Co-authored-by: Heikki Linnakangas --- Dockerfile.compute-node | 4 + pgxn/neon_test_utils/neon_test_utils--1.0.sql | 18 +++ pgxn/neon_test_utils/neon_test_utils.control | 1 + pgxn/neon_test_utils/neontest.c | 118 ++++++++++++++++++ .../sql_regress/expected/neon-test-utils.out | 28 +++++ test_runner/sql_regress/parallel_schedule | 1 + .../sql_regress/sql/neon-test-utils.sql | 11 ++ 7 files changed, 181 insertions(+) create mode 100644 test_runner/sql_regress/expected/neon-test-utils.out create mode 100644 test_runner/sql_regress/sql/neon-test-utils.sql diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index cc7a110008..4eb6dc91c0 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -820,6 +820,10 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \ PG_CONFIG=/usr/local/pgsql/bin/pg_config \ -C pgxn/neon_utils \ -s install && \ + make -j $(getconf _NPROCESSORS_ONLN) \ + PG_CONFIG=/usr/local/pgsql/bin/pg_config \ + -C pgxn/neon_test_utils \ + -s install && \ make -j $(getconf _NPROCESSORS_ONLN) \ PG_CONFIG=/usr/local/pgsql/bin/pg_config \ -C pgxn/neon_rmgr \ diff --git a/pgxn/neon_test_utils/neon_test_utils--1.0.sql b/pgxn/neon_test_utils/neon_test_utils--1.0.sql index 402981a9a6..23340e352e 100644 --- a/pgxn/neon_test_utils/neon_test_utils--1.0.sql +++ b/pgxn/neon_test_utils/neon_test_utils--1.0.sql @@ -7,6 +7,24 @@ AS 'MODULE_PATHNAME', 'test_consume_xids' LANGUAGE C STRICT PARALLEL UNSAFE; +CREATE FUNCTION test_consume_cpu(seconds int) +RETURNS VOID +AS 'MODULE_PATHNAME', 'test_consume_cpu' +LANGUAGE C STRICT +PARALLEL UNSAFE; + +CREATE FUNCTION test_consume_memory(megabytes int) +RETURNS VOID +AS 'MODULE_PATHNAME', 'test_consume_memory' +LANGUAGE C STRICT +PARALLEL UNSAFE; + +CREATE FUNCTION test_release_memory(megabytes int DEFAULT NULL) +RETURNS VOID +AS 'MODULE_PATHNAME', 'test_release_memory' +LANGUAGE C +PARALLEL UNSAFE; + CREATE FUNCTION clear_buffer_cache() RETURNS VOID AS 'MODULE_PATHNAME', 'clear_buffer_cache' diff --git a/pgxn/neon_test_utils/neon_test_utils.control b/pgxn/neon_test_utils/neon_test_utils.control index 94e6720503..5219571f11 100644 --- a/pgxn/neon_test_utils/neon_test_utils.control +++ b/pgxn/neon_test_utils/neon_test_utils.control @@ -3,3 +3,4 @@ comment = 'helpers for neon testing and debugging' default_version = '1.0' module_pathname = '$libdir/neon_test_utils' relocatable = true +trusted = true diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c index aa644efd40..7c618848e2 100644 --- a/pgxn/neon_test_utils/neontest.c +++ b/pgxn/neon_test_utils/neontest.c @@ -21,10 +21,12 @@ #include "miscadmin.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" +#include "storage/fd.h" #include "utils/builtins.h" #include "utils/pg_lsn.h" #include "utils/rel.h" #include "utils/varlena.h" +#include "utils/wait_event.h" #include "../neon/pagestore_client.h" PG_MODULE_MAGIC; @@ -32,6 +34,9 @@ PG_MODULE_MAGIC; extern void _PG_init(void); PG_FUNCTION_INFO_V1(test_consume_xids); +PG_FUNCTION_INFO_V1(test_consume_cpu); +PG_FUNCTION_INFO_V1(test_consume_memory); +PG_FUNCTION_INFO_V1(test_release_memory); PG_FUNCTION_INFO_V1(clear_buffer_cache); 
PG_FUNCTION_INFO_V1(get_raw_page_at_lsn); PG_FUNCTION_INFO_V1(get_raw_page_at_lsn_ex); @@ -97,6 +102,119 @@ test_consume_xids(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } + +/* + * test_consume_cpu(seconds int). Keeps one CPU busy for the given number of seconds. + */ +Datum +test_consume_cpu(PG_FUNCTION_ARGS) +{ + int32 seconds = PG_GETARG_INT32(0); + TimestampTz start; + uint64 total_iterations = 0; + + start = GetCurrentTimestamp(); + + for (;;) + { + TimestampTz elapsed; + + elapsed = GetCurrentTimestamp() - start; + if (elapsed > (TimestampTz) seconds * USECS_PER_SEC) + break; + + /* keep spinning */ + for (int i = 0; i < 1000000; i++) + total_iterations++; + elog(DEBUG2, "test_consume_cpu(): %lu iterations in total", total_iterations); + + CHECK_FOR_INTERRUPTS(); + } + + PG_RETURN_VOID(); +} + +static MemoryContext consume_cxt = NULL; +static slist_head consumed_memory_chunks; +static int64 num_memory_chunks; + +/* + * test_consume_memory(megabytes int). + * + * Consume given amount of memory. The allocation is made in TopMemoryContext, + * so it outlives the function, until you call test_release_memory to + * explicitly release it, or close the session. + */ +Datum +test_consume_memory(PG_FUNCTION_ARGS) +{ + int32 megabytes = PG_GETARG_INT32(0); + + /* + * Consume the memory in a new memory context, so that it's convenient to + * release and to display it separately in a possible memory context dump. + */ + if (consume_cxt == NULL) + consume_cxt = AllocSetContextCreate(TopMemoryContext, + "test_consume_memory", + ALLOCSET_DEFAULT_SIZES); + + for (int32 i = 0; i < megabytes; i++) + { + char *p; + + p = MemoryContextAllocZero(consume_cxt, 1024 * 1024); + + /* touch the memory, so that it's really allocated by the kernel */ + for (int j = 0; j < 1024 * 1024; j += 1024) + p[j] = j % 0xFF; + + slist_push_head(&consumed_memory_chunks, (slist_node *) p); + num_memory_chunks++; + } + + PG_RETURN_VOID(); +} + +/* + * test_release_memory(megabytes int). NULL releases all + */ +Datum +test_release_memory(PG_FUNCTION_ARGS) +{ + TimestampTz start; + + if (PG_ARGISNULL(0)) + { + if (consume_cxt) + { + MemoryContextDelete(consume_cxt); + consume_cxt = NULL; + num_memory_chunks = 0; + } + } + else + { + int32 chunks_to_release = PG_GETARG_INT32(0); + + if (chunks_to_release > num_memory_chunks) + { + elog(WARNING, "only %lu MB is consumed, releasing it all", num_memory_chunks); + chunks_to_release = num_memory_chunks; + } + + for (int32 i = 0; i < chunks_to_release; i++) + { + slist_node *chunk = slist_pop_head_node(&consumed_memory_chunks); + + pfree(chunk); + num_memory_chunks--; + } + } + + PG_RETURN_VOID(); +} + /* * Flush the buffer cache, evicting all pages that are not currently pinned. */ diff --git a/test_runner/sql_regress/expected/neon-test-utils.out b/test_runner/sql_regress/expected/neon-test-utils.out new file mode 100644 index 0000000000..7d1634a6b8 --- /dev/null +++ b/test_runner/sql_regress/expected/neon-test-utils.out @@ -0,0 +1,28 @@ +-- Test the test utils in pgxn/neon_test_utils. We don't test that +-- these actually consume resources like they should - that would be +-- tricky - but at least we check that they don't crash. 
+CREATE EXTENSION neon_test_utils; +select test_consume_cpu(1); + test_consume_cpu +------------------ + +(1 row) + +select test_consume_memory(20); -- Allocate 20 MB + test_consume_memory +--------------------- + +(1 row) + +select test_release_memory(5); -- Release 5 MB + test_release_memory +--------------------- + +(1 row) + +select test_release_memory(); -- Release the remaining 15 MB + test_release_memory +--------------------- + +(1 row) + diff --git a/test_runner/sql_regress/parallel_schedule b/test_runner/sql_regress/parallel_schedule index 569c7b5066..d9508d1c90 100644 --- a/test_runner/sql_regress/parallel_schedule +++ b/test_runner/sql_regress/parallel_schedule @@ -7,4 +7,5 @@ test: neon-cid test: neon-rel-truncate test: neon-clog +test: neon-test-utils test: neon-vacuum-full diff --git a/test_runner/sql_regress/sql/neon-test-utils.sql b/test_runner/sql_regress/sql/neon-test-utils.sql new file mode 100644 index 0000000000..c5ca6c624b --- /dev/null +++ b/test_runner/sql_regress/sql/neon-test-utils.sql @@ -0,0 +1,11 @@ +-- Test the test utils in pgxn/neon_test_utils. We don't test that +-- these actually consume resources like they should - that would be +-- tricky - but at least we check that they don't crash. + +CREATE EXTENSION neon_test_utils; + +select test_consume_cpu(1); + +select test_consume_memory(20); -- Allocate 20 MB +select test_release_memory(5); -- Release 5 MB +select test_release_memory(); -- Release the remaining 15 MB From 024372a3db071c945cbdd7f4cc1b759e56386534 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 14 Feb 2024 20:17:12 +0100 Subject: [PATCH 0180/1571] Revert "refactor(VirtualFile::crashsafe_overwrite): avoid Handle::block_on in callers" (#6765) Reverts neondatabase/neon#6731 On high tenant count Pageservers in staging, memory and CPU usage shoots to 100% with this change. (NB: staging currently has tokio-epoll-uring enabled) Will analyze tomorrow. https://neondb.slack.com/archives/C03H1K0PGKH/p1707933875639379?thread_ts=1707929541.125329&cid=C03H1K0PGKH --- libs/utils/src/crashsafe.rs | 44 +----------- pageserver/src/deletion_queue.rs | 5 +- pageserver/src/tenant.rs | 33 ++++++--- pageserver/src/tenant/metadata.rs | 2 +- pageserver/src/tenant/secondary/downloader.rs | 11 ++- pageserver/src/virtual_file.rs | 72 +++++++++++-------- 6 files changed, 78 insertions(+), 89 deletions(-) diff --git a/libs/utils/src/crashsafe.rs b/libs/utils/src/crashsafe.rs index 756b19138c..1c72e9cae9 100644 --- a/libs/utils/src/crashsafe.rs +++ b/libs/utils/src/crashsafe.rs @@ -1,7 +1,7 @@ use std::{ borrow::Cow, fs::{self, File}, - io::{self, Write}, + io, }; use camino::{Utf8Path, Utf8PathBuf}; @@ -161,48 +161,6 @@ pub async fn durable_rename( Ok(()) } -/// Writes a file to the specified `final_path` in a crash safe fasion, using [`std::fs`]. -/// -/// The file is first written to the specified `tmp_path`, and in a second -/// step, the `tmp_path` is renamed to the `final_path`. Intermediary fsync -/// and atomic rename guarantee that, if we crash at any point, there will never -/// be a partially written file at `final_path` (but maybe at `tmp_path`). -/// -/// Callers are responsible for serializing calls of this function for a given `final_path`. -/// If they don't, there may be an error due to conflicting `tmp_path`, or there will -/// be no error and the content of `final_path` will be the "winner" caller's `content`. -/// I.e., the atomticity guarantees still hold. 
-pub fn overwrite( - final_path: &Utf8Path, - tmp_path: &Utf8Path, - content: &[u8], -) -> std::io::Result<()> { - let Some(final_path_parent) = final_path.parent() else { - return Err(std::io::Error::from_raw_os_error( - nix::errno::Errno::EINVAL as i32, - )); - }; - std::fs::remove_file(tmp_path).or_else(crate::fs_ext::ignore_not_found)?; - let mut file = std::fs::OpenOptions::new() - .write(true) - // Use `create_new` so that, if we race with ourselves or something else, - // we bail out instead of causing damage. - .create_new(true) - .open(tmp_path)?; - file.write_all(content)?; - file.sync_all()?; - drop(file); // don't keep the fd open for longer than we have to - - std::fs::rename(tmp_path, final_path)?; - - let final_parent_dirfd = std::fs::OpenOptions::new() - .read(true) - .open(final_path_parent)?; - - final_parent_dirfd.sync_all()?; - Ok(()) -} - #[cfg(test)] mod tests { diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index f8f2866a3b..81938b14b3 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -234,7 +234,7 @@ impl DeletionHeader { let header_bytes = serde_json::to_vec(self).context("serialize deletion header")?; let header_path = conf.deletion_header_path(); let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX); - VirtualFile::crashsafe_overwrite(header_path, temp_path, header_bytes) + VirtualFile::crashsafe_overwrite(&header_path, &temp_path, header_bytes) .await .maybe_fatal_err("save deletion header")?; @@ -325,8 +325,7 @@ impl DeletionList { let temp_path = path_with_suffix_extension(&path, TEMP_SUFFIX); let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list"); - - VirtualFile::crashsafe_overwrite(path, temp_path, bytes) + VirtualFile::crashsafe_overwrite(&path, &temp_path, bytes) .await .maybe_fatal_err("save deletion list") .map_err(Into::into) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index dc9b8247a5..88f4ae7086 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -28,6 +28,7 @@ use remote_storage::GenericRemoteStorage; use std::fmt; use storage_broker::BrokerClientChannel; use tokio::io::BufReader; +use tokio::runtime::Handle; use tokio::sync::watch; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; @@ -2877,10 +2878,17 @@ impl Tenant { let tenant_shard_id = *tenant_shard_id; let config_path = config_path.to_owned(); - let conf_content = conf_content.into_bytes(); - VirtualFile::crashsafe_overwrite(config_path.clone(), temp_path, conf_content) - .await - .with_context(|| format!("write tenant {tenant_shard_id} config to {config_path}"))?; + tokio::task::spawn_blocking(move || { + Handle::current().block_on(async move { + let conf_content = conf_content.into_bytes(); + VirtualFile::crashsafe_overwrite(&config_path, &temp_path, conf_content) + .await + .with_context(|| { + format!("write tenant {tenant_shard_id} config to {config_path}") + }) + }) + }) + .await??; Ok(()) } @@ -2907,12 +2915,17 @@ impl Tenant { let tenant_shard_id = *tenant_shard_id; let target_config_path = target_config_path.to_owned(); - let conf_content = conf_content.into_bytes(); - VirtualFile::crashsafe_overwrite(target_config_path.clone(), temp_path, conf_content) - .await - .with_context(|| { - format!("write tenant {tenant_shard_id} config to {target_config_path}") - })?; + tokio::task::spawn_blocking(move || { + Handle::current().block_on(async move { + let conf_content = conf_content.into_bytes(); + 
VirtualFile::crashsafe_overwrite(&target_config_path, &temp_path, conf_content) + .await + .with_context(|| { + format!("write tenant {tenant_shard_id} config to {target_config_path}") + }) + }) + }) + .await??; Ok(()) } diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 233acfd431..dcbe781f90 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -279,7 +279,7 @@ pub async fn save_metadata( let path = conf.metadata_path(tenant_shard_id, timeline_id); let temp_path = path_with_suffix_extension(&path, TEMP_FILE_SUFFIX); let metadata_bytes = data.to_bytes().context("serialize metadata")?; - VirtualFile::crashsafe_overwrite(path, temp_path, metadata_bytes) + VirtualFile::crashsafe_overwrite(&path, &temp_path, metadata_bytes) .await .context("write metadata")?; Ok(()) diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index c8288acc20..c23416a7f0 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -484,9 +484,14 @@ impl<'a> TenantDownloader<'a> { let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX); let context_msg = format!("write tenant {tenant_shard_id} heatmap to {heatmap_path}"); let heatmap_path_bg = heatmap_path.clone(); - VirtualFile::crashsafe_overwrite(heatmap_path_bg, temp_path, heatmap_bytes) - .await - .maybe_fatal_err(&context_msg)?; + tokio::task::spawn_blocking(move || { + tokio::runtime::Handle::current().block_on(async move { + VirtualFile::crashsafe_overwrite(&heatmap_path_bg, &temp_path, heatmap_bytes).await + }) + }) + .await + .expect("Blocking task is never aborted") + .maybe_fatal_err(&context_msg)?; tracing::debug!("Wrote local heatmap to {}", heatmap_path); diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 858fc0ef64..45c3e19cfc 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -19,13 +19,14 @@ use once_cell::sync::OnceCell; use pageserver_api::shard::TenantShardId; use std::fs::{self, File}; use std::io::{Error, ErrorKind, Seek, SeekFrom}; -use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice}; +use tokio_epoll_uring::{BoundedBuf, IoBufMut, Slice}; use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd}; use std::os::unix::fs::FileExt; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; use tokio::time::Instant; +use utils::fs_ext; pub use pageserver_api::models::virtual_file as api; pub(crate) mod io_engine; @@ -403,34 +404,47 @@ impl VirtualFile { Ok(vfile) } - /// Async version of [`::utils::crashsafe::overwrite`]. + /// Writes a file to the specified `final_path` in a crash safe fasion /// - /// # NB: - /// - /// Doesn't actually use the [`VirtualFile`] file descriptor cache, but, - /// it did at an earlier time. - /// And it will use this module's [`io_engine`] in the near future, so, leaving it here. - pub async fn crashsafe_overwrite + Send, Buf: IoBuf + Send>( - final_path: Utf8PathBuf, - tmp_path: Utf8PathBuf, + /// The file is first written to the specified tmp_path, and in a second + /// step, the tmp path is renamed to the final path. As renames are + /// atomic, a crash during the write operation will never leave behind a + /// partially written file. 
+ pub async fn crashsafe_overwrite( + final_path: &Utf8Path, + tmp_path: &Utf8Path, content: B, ) -> std::io::Result<()> { - // TODO: use tokio_epoll_uring if configured as `io_engine`. - // See https://github.com/neondatabase/neon/issues/6663 - - tokio::task::spawn_blocking(move || { - let slice_storage; - let content_len = content.bytes_init(); - let content = if content.bytes_init() > 0 { - slice_storage = Some(content.slice(0..content_len)); - slice_storage.as_deref().expect("just set it to Some()") - } else { - &[] - }; - utils::crashsafe::overwrite(&final_path, &tmp_path, content) - }) - .await - .expect("blocking task is never aborted") + let Some(final_path_parent) = final_path.parent() else { + return Err(std::io::Error::from_raw_os_error( + nix::errno::Errno::EINVAL as i32, + )); + }; + std::fs::remove_file(tmp_path).or_else(fs_ext::ignore_not_found)?; + let mut file = Self::open_with_options( + tmp_path, + OpenOptions::new() + .write(true) + // Use `create_new` so that, if we race with ourselves or something else, + // we bail out instead of causing damage. + .create_new(true), + ) + .await?; + let (_content, res) = file.write_all(content).await; + res?; + file.sync_all().await?; + drop(file); // before the rename, that's important! + // renames are atomic + std::fs::rename(tmp_path, final_path)?; + // Only open final path parent dirfd now, so that this operation only + // ever holds one VirtualFile fd at a time. That's important because + // the current `find_victim_slot` impl might pick the same slot for both + // VirtualFile., and it eventually does a blocking write lock instead of + // try_lock. + let final_parent_dirfd = + Self::open_with_options(final_path_parent, OpenOptions::new().read(true)).await?; + final_parent_dirfd.sync_all().await?; + Ok(()) } /// Call File::sync_all() on the underlying File. @@ -1323,7 +1337,7 @@ mod tests { let path = testdir.join("myfile"); let tmp_path = testdir.join("myfile.tmp"); - VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec()) + VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo".to_vec()) .await .unwrap(); let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap()); @@ -1332,7 +1346,7 @@ mod tests { assert!(!tmp_path.exists()); drop(file); - VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"bar".to_vec()) + VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"bar".to_vec()) .await .unwrap(); let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap()); @@ -1354,7 +1368,7 @@ mod tests { std::fs::write(&tmp_path, "some preexisting junk that should be removed").unwrap(); assert!(tmp_path.exists()); - VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec()) + VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo".to_vec()) .await .unwrap(); From 80854b98ff0dad7b385c972523ac03352d10a938 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 15 Feb 2024 01:24:07 +0200 Subject: [PATCH 0181/1571] move timeouts and cancellation handling to remote_storage (#6697) Cancellation and timeouts are handled at remote_storage callsites, if they are. However they should always be handled, because we've had transient problems with remote storage connections. 
- Add cancellation token to the `trait RemoteStorage` methods - For `download*`, `list*` methods there is `DownloadError::{Cancelled,Timeout}` - For the rest now using `anyhow::Error`, it will have root cause `remote_storage::TimeoutOrCancel::{Cancel,Timeout}` - Both types have `::is_permanent` equivalent which should be passed to `backoff::retry` - New generic RemoteStorageConfig option `timeout`, defaults to 120s - Start counting timeouts only after acquiring concurrency limiter permit - Cancellable permit acquiring - Download stream timeout or cancellation is communicated via an `std::io::Error` - Exit backoff::retry by marking cancellation errors permanent Fixes: #6096 Closes: #4781 Co-authored-by: arpad-m --- Cargo.lock | 2 + libs/remote_storage/Cargo.toml | 2 + libs/remote_storage/src/azure_blob.rs | 425 +++++++++++------- libs/remote_storage/src/error.rs | 181 ++++++++ libs/remote_storage/src/lib.rs | 329 ++++++++------ libs/remote_storage/src/local_fs.rs | 420 ++++++++++++----- libs/remote_storage/src/s3_bucket.rs | 273 +++++++---- libs/remote_storage/src/simulate_failures.rs | 55 ++- libs/remote_storage/src/support.rs | 136 ++++++ libs/remote_storage/tests/common/mod.rs | 21 +- libs/remote_storage/tests/common/tests.rs | 72 ++- libs/remote_storage/tests/test_real_azure.rs | 14 +- libs/remote_storage/tests/test_real_s3.rs | 215 ++++++++- pageserver/src/config.rs | 2 + pageserver/src/deletion_queue.rs | 12 +- pageserver/src/deletion_queue/deleter.rs | 7 +- pageserver/src/tenant.rs | 8 +- pageserver/src/tenant/delete.rs | 14 +- .../src/tenant/remote_timeline_client.rs | 55 +-- .../tenant/remote_timeline_client/download.rs | 98 ++-- .../tenant/remote_timeline_client/upload.rs | 35 +- pageserver/src/tenant/secondary/downloader.rs | 5 +- .../src/tenant/secondary/heatmap_uploader.rs | 11 +- proxy/src/context/parquet.rs | 17 +- safekeeper/src/wal_backup.rs | 29 +- 25 files changed, 1712 insertions(+), 726 deletions(-) create mode 100644 libs/remote_storage/src/error.rs diff --git a/Cargo.lock b/Cargo.lock index 45a313a72b..74cd2c8d2c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4436,6 +4436,7 @@ dependencies = [ "futures", "futures-util", "http-types", + "humantime", "hyper", "itertools", "metrics", @@ -4447,6 +4448,7 @@ dependencies = [ "serde_json", "test-context", "tokio", + "tokio-stream", "tokio-util", "toml_edit", "tracing", diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 2cc59a947b..15f3cd3b80 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -15,11 +15,13 @@ aws-sdk-s3.workspace = true aws-credential-types.workspace = true bytes.workspace = true camino.workspace = true +humantime.workspace = true hyper = { workspace = true, features = ["stream"] } futures.workspace = true serde.workspace = true serde_json.workspace = true tokio = { workspace = true, features = ["sync", "fs", "io-util"] } +tokio-stream.workspace = true tokio-util = { workspace = true, features = ["compat"] } toml_edit.workspace = true tracing.workspace = true diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index df6d45dde1..12ec680cb6 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -22,16 +22,15 @@ use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerCl use bytes::Bytes; use futures::stream::Stream; use futures_util::StreamExt; +use futures_util::TryStreamExt; use http_types::{StatusCode, Url}; -use tokio::time::Instant; use 
tokio_util::sync::CancellationToken; use tracing::debug; -use crate::s3_bucket::RequestKind; -use crate::TimeTravelError; use crate::{ - AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, - RemoteStorage, StorageMetadata, + error::Cancelled, s3_bucket::RequestKind, AzureConfig, ConcurrencyLimiter, Download, + DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata, + TimeTravelError, TimeoutOrCancel, }; pub struct AzureBlobStorage { @@ -39,10 +38,12 @@ pub struct AzureBlobStorage { prefix_in_container: Option, max_keys_per_list_response: Option, concurrency_limiter: ConcurrencyLimiter, + // Per-request timeout. Accessible for tests. + pub timeout: Duration, } impl AzureBlobStorage { - pub fn new(azure_config: &AzureConfig) -> Result { + pub fn new(azure_config: &AzureConfig, timeout: Duration) -> Result { debug!( "Creating azure remote storage for azure container {}", azure_config.container_name @@ -79,6 +80,7 @@ impl AzureBlobStorage { prefix_in_container: azure_config.prefix_in_container.to_owned(), max_keys_per_list_response, concurrency_limiter: ConcurrencyLimiter::new(azure_config.concurrency_limit.get()), + timeout, }) } @@ -121,8 +123,11 @@ impl AzureBlobStorage { async fn download_for_builder( &self, builder: GetBlobBuilder, + cancel: &CancellationToken, ) -> Result { - let mut response = builder.into_stream(); + let kind = RequestKind::Get; + + let _permit = self.permit(kind, cancel).await?; let mut etag = None; let mut last_modified = None; @@ -130,39 +135,70 @@ impl AzureBlobStorage { // TODO give proper streaming response instead of buffering into RAM // https://github.com/neondatabase/neon/issues/5563 - let mut bufs = Vec::new(); - while let Some(part) = response.next().await { - let part = part.map_err(to_download_error)?; - let etag_str: &str = part.blob.properties.etag.as_ref(); - if etag.is_none() { - etag = Some(etag.unwrap_or_else(|| etag_str.to_owned())); + let download = async { + let response = builder + // convert to concrete Pageable + .into_stream() + // convert to TryStream + .into_stream() + .map_err(to_download_error); + + // apply per request timeout + let response = tokio_stream::StreamExt::timeout(response, self.timeout); + + // flatten + let response = response.map(|res| match res { + Ok(res) => res, + Err(_elapsed) => Err(DownloadError::Timeout), + }); + + let mut response = std::pin::pin!(response); + + let mut bufs = Vec::new(); + while let Some(part) = response.next().await { + let part = part?; + let etag_str: &str = part.blob.properties.etag.as_ref(); + if etag.is_none() { + etag = Some(etag.unwrap_or_else(|| etag_str.to_owned())); + } + if last_modified.is_none() { + last_modified = Some(part.blob.properties.last_modified.into()); + } + if let Some(blob_meta) = part.blob.metadata { + metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned()))); + } + let data = part + .data + .collect() + .await + .map_err(|e| DownloadError::Other(e.into()))?; + bufs.push(data); } - if last_modified.is_none() { - last_modified = Some(part.blob.properties.last_modified.into()); - } - if let Some(blob_meta) = part.blob.metadata { - metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned()))); - } - let data = part - .data - .collect() - .await - .map_err(|e| DownloadError::Other(e.into()))?; - bufs.push(data); + Ok(Download { + download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))), + etag, + last_modified, + metadata: Some(StorageMetadata(metadata)), + 
}) + }; + + tokio::select! { + bufs = download => bufs, + _ = cancel.cancelled() => Err(DownloadError::Cancelled), } - Ok(Download { - download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))), - etag, - last_modified, - metadata: Some(StorageMetadata(metadata)), - }) } - async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> { - self.concurrency_limiter - .acquire(kind) - .await - .expect("semaphore is never closed") + async fn permit( + &self, + kind: RequestKind, + cancel: &CancellationToken, + ) -> Result, Cancelled> { + let acquire = self.concurrency_limiter.acquire(kind); + + tokio::select! { + permit = acquire => Ok(permit.expect("never closed")), + _ = cancel.cancelled() => Err(Cancelled), + } } } @@ -192,66 +228,87 @@ impl RemoteStorage for AzureBlobStorage { prefix: Option<&RemotePath>, mode: ListingMode, max_keys: Option, + cancel: &CancellationToken, ) -> anyhow::Result { - // get the passed prefix or if it is not set use prefix_in_bucket value - let list_prefix = prefix - .map(|p| self.relative_path_to_name(p)) - .or_else(|| self.prefix_in_container.clone()) - .map(|mut p| { - // required to end with a separator - // otherwise request will return only the entry of a prefix - if matches!(mode, ListingMode::WithDelimiter) - && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) - { - p.push(REMOTE_STORAGE_PREFIX_SEPARATOR); - } - p + let _permit = self.permit(RequestKind::List, cancel).await?; + + let op = async { + // get the passed prefix or if it is not set use prefix_in_bucket value + let list_prefix = prefix + .map(|p| self.relative_path_to_name(p)) + .or_else(|| self.prefix_in_container.clone()) + .map(|mut p| { + // required to end with a separator + // otherwise request will return only the entry of a prefix + if matches!(mode, ListingMode::WithDelimiter) + && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) + { + p.push(REMOTE_STORAGE_PREFIX_SEPARATOR); + } + p + }); + + let mut builder = self.client.list_blobs(); + + if let ListingMode::WithDelimiter = mode { + builder = builder.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()); + } + + if let Some(prefix) = list_prefix { + builder = builder.prefix(Cow::from(prefix.to_owned())); + } + + if let Some(limit) = self.max_keys_per_list_response { + builder = builder.max_results(MaxResults::new(limit)); + } + + let response = builder.into_stream(); + let response = response.into_stream().map_err(to_download_error); + let response = tokio_stream::StreamExt::timeout(response, self.timeout); + let response = response.map(|res| match res { + Ok(res) => res, + Err(_elapsed) => Err(DownloadError::Timeout), }); - let mut builder = self.client.list_blobs(); + let mut response = std::pin::pin!(response); - if let ListingMode::WithDelimiter = mode { - builder = builder.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()); - } + let mut res = Listing::default(); - if let Some(prefix) = list_prefix { - builder = builder.prefix(Cow::from(prefix.to_owned())); - } + let mut max_keys = max_keys.map(|mk| mk.get()); + while let Some(entry) = response.next().await { + let entry = entry?; + let prefix_iter = entry + .blobs + .prefixes() + .map(|prefix| self.name_to_relative_path(&prefix.name)); + res.prefixes.extend(prefix_iter); - if let Some(limit) = self.max_keys_per_list_response { - builder = builder.max_results(MaxResults::new(limit)); - } + let blob_iter = entry + .blobs + .blobs() + .map(|k| self.name_to_relative_path(&k.name)); - let mut response = builder.into_stream(); - let mut res = 
Listing::default(); - // NonZeroU32 doesn't support subtraction apparently - let mut max_keys = max_keys.map(|mk| mk.get()); - while let Some(l) = response.next().await { - let entry = l.map_err(to_download_error)?; - let prefix_iter = entry - .blobs - .prefixes() - .map(|prefix| self.name_to_relative_path(&prefix.name)); - res.prefixes.extend(prefix_iter); + for key in blob_iter { + res.keys.push(key); - let blob_iter = entry - .blobs - .blobs() - .map(|k| self.name_to_relative_path(&k.name)); - - for key in blob_iter { - res.keys.push(key); - if let Some(mut mk) = max_keys { - assert!(mk > 0); - mk -= 1; - if mk == 0 { - return Ok(res); // limit reached + if let Some(mut mk) = max_keys { + assert!(mk > 0); + mk -= 1; + if mk == 0 { + return Ok(res); // limit reached + } + max_keys = Some(mk); } - max_keys = Some(mk); } } + + Ok(res) + }; + + tokio::select! { + res = op => res, + _ = cancel.cancelled() => Err(DownloadError::Cancelled), } - Ok(res) } async fn upload( @@ -260,35 +317,52 @@ impl RemoteStorage for AzureBlobStorage { data_size_bytes: usize, to: &RemotePath, metadata: Option, + cancel: &CancellationToken, ) -> anyhow::Result<()> { - let _permit = self.permit(RequestKind::Put).await; - let blob_client = self.client.blob_client(self.relative_path_to_name(to)); + let _permit = self.permit(RequestKind::Put, cancel).await?; - let from: Pin> + Send + Sync + 'static>> = - Box::pin(from); + let op = async { + let blob_client = self.client.blob_client(self.relative_path_to_name(to)); - let from = NonSeekableStream::new(from, data_size_bytes); + let from: Pin> + Send + Sync + 'static>> = + Box::pin(from); - let body = azure_core::Body::SeekableStream(Box::new(from)); + let from = NonSeekableStream::new(from, data_size_bytes); - let mut builder = blob_client.put_block_blob(body); + let body = azure_core::Body::SeekableStream(Box::new(from)); - if let Some(metadata) = metadata { - builder = builder.metadata(to_azure_metadata(metadata)); + let mut builder = blob_client.put_block_blob(body); + + if let Some(metadata) = metadata { + builder = builder.metadata(to_azure_metadata(metadata)); + } + + let fut = builder.into_future(); + let fut = tokio::time::timeout(self.timeout, fut); + + match fut.await { + Ok(Ok(_response)) => Ok(()), + Ok(Err(azure)) => Err(azure.into()), + Err(_timeout) => Err(TimeoutOrCancel::Cancel.into()), + } + }; + + tokio::select! 
{ + res = op => res, + _ = cancel.cancelled() => Err(TimeoutOrCancel::Cancel.into()), } - - let _response = builder.into_future().await?; - - Ok(()) } - async fn download(&self, from: &RemotePath) -> Result { - let _permit = self.permit(RequestKind::Get).await; + async fn download( + &self, + from: &RemotePath, + cancel: &CancellationToken, + ) -> Result { let blob_client = self.client.blob_client(self.relative_path_to_name(from)); let builder = blob_client.get(); - self.download_for_builder(builder).await + self.download_for_builder(builder, cancel).await } async fn download_byte_range( @@ -296,8 +370,8 @@ impl RemoteStorage for AzureBlobStorage { from: &RemotePath, start_inclusive: u64, end_exclusive: Option, + cancel: &CancellationToken, ) -> Result { - let _permit = self.permit(RequestKind::Get).await; let blob_client = self.client.blob_client(self.relative_path_to_name(from)); let mut builder = blob_client.get(); @@ -309,82 +383,113 @@ impl RemoteStorage for AzureBlobStorage { }; builder = builder.range(range); - self.download_for_builder(builder).await + self.download_for_builder(builder, cancel).await } - async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> { - let _permit = self.permit(RequestKind::Delete).await; - let blob_client = self.client.blob_client(self.relative_path_to_name(path)); + async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> { + self.delete_objects(std::array::from_ref(path), cancel) + .await + } - let builder = blob_client.delete(); + async fn delete_objects<'a>( + &self, + paths: &'a [RemotePath], + cancel: &CancellationToken, + ) -> anyhow::Result<()> { + let _permit = self.permit(RequestKind::Delete, cancel).await?; - match builder.into_future().await { - Ok(_response) => Ok(()), - Err(e) => { - if let Some(http_err) = e.as_http_error() { - if http_err.status() == StatusCode::NotFound { - return Ok(()); + let op = async { + // TODO batch requests are also not supported by the SDK + // https://github.com/Azure/azure-sdk-for-rust/issues/1068 + // https://github.com/Azure/azure-sdk-for-rust/issues/1249 + for path in paths { + let blob_client = self.client.blob_client(self.relative_path_to_name(path)); + + let request = blob_client.delete().into_future(); + + let res = tokio::time::timeout(self.timeout, request).await; + + match res { + Ok(Ok(_response)) => continue, + Ok(Err(e)) => { + if let Some(http_err) = e.as_http_error() { + if http_err.status() == StatusCode::NotFound { + continue; + } + } + return Err(e.into()); } + Err(_elapsed) => return Err(TimeoutOrCancel::Timeout.into()), } - Err(anyhow::Error::new(e)) } + + Ok(()) + }; + + tokio::select! 
{ + res = op => res, + _ = cancel.cancelled() => Err(TimeoutOrCancel::Cancel.into()), } } - async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> { - // Permit is already obtained by inner delete function + async fn copy( + &self, + from: &RemotePath, + to: &RemotePath, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { + let _permit = self.permit(RequestKind::Copy, cancel).await?; - // TODO batch requests are also not supported by the SDK - // https://github.com/Azure/azure-sdk-for-rust/issues/1068 - // https://github.com/Azure/azure-sdk-for-rust/issues/1249 - for path in paths { - self.delete(path).await?; - } - Ok(()) - } + let timeout = tokio::time::sleep(self.timeout); - async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> { - let _permit = self.permit(RequestKind::Copy).await; - let blob_client = self.client.blob_client(self.relative_path_to_name(to)); + let mut copy_status = None; - let source_url = format!( - "{}/{}", - self.client.url()?, - self.relative_path_to_name(from) - ); - let builder = blob_client.copy(Url::from_str(&source_url)?); + let op = async { + let blob_client = self.client.blob_client(self.relative_path_to_name(to)); - let result = builder.into_future().await?; + let source_url = format!( + "{}/{}", + self.client.url()?, + self.relative_path_to_name(from) + ); - let mut copy_status = result.copy_status; - let start_time = Instant::now(); - const MAX_WAIT_TIME: Duration = Duration::from_secs(60); - loop { - match copy_status { - CopyStatus::Aborted => { - anyhow::bail!("Received abort for copy from {from} to {to}."); + let builder = blob_client.copy(Url::from_str(&source_url)?); + let copy = builder.into_future(); + + let result = copy.await?; + + copy_status = Some(result.copy_status); + loop { + match copy_status.as_ref().expect("we always set it to Some") { + CopyStatus::Aborted => { + anyhow::bail!("Received abort for copy from {from} to {to}."); + } + CopyStatus::Failed => { + anyhow::bail!("Received failure response for copy from {from} to {to}."); + } + CopyStatus::Success => return Ok(()), + CopyStatus::Pending => (), } - CopyStatus::Failed => { - anyhow::bail!("Received failure response for copy from {from} to {to}."); - } - CopyStatus::Success => return Ok(()), - CopyStatus::Pending => (), + // The copy is taking longer. Waiting a second and then re-trying. + // TODO estimate time based on copy_progress and adjust time based on that + tokio::time::sleep(Duration::from_millis(1000)).await; + let properties = blob_client.get_properties().into_future().await?; + let Some(status) = properties.blob.properties.copy_status else { + tracing::warn!("copy_status for copy is None!, from={from}, to={to}"); + return Ok(()); + }; + copy_status = Some(status); } - // The copy is taking longer. Waiting a second and then re-trying. - // TODO estimate time based on copy_progress and adjust time based on that - tokio::time::sleep(Duration::from_millis(1000)).await; - let properties = blob_client.get_properties().into_future().await?; - let Some(status) = properties.blob.properties.copy_status else { - tracing::warn!("copy_status for copy is None!, from={from}, to={to}"); - return Ok(()); - }; - if start_time.elapsed() > MAX_WAIT_TIME { - anyhow::bail!("Copy from from {from} to {to} took longer than limit MAX_WAIT_TIME={}s. copy_pogress={:?}.", - MAX_WAIT_TIME.as_secs_f32(), - properties.blob.properties.copy_progress, - ); - } - copy_status = status; + }; + + tokio::select! 
{ + res = op => res, + _ = cancel.cancelled() => Err(anyhow::Error::new(TimeoutOrCancel::Cancel)), + _ = timeout => { + let e = anyhow::Error::new(TimeoutOrCancel::Timeout); + let e = e.context(format!("Timeout, last status: {copy_status:?}")); + Err(e) + }, } } diff --git a/libs/remote_storage/src/error.rs b/libs/remote_storage/src/error.rs new file mode 100644 index 0000000000..96f044e087 --- /dev/null +++ b/libs/remote_storage/src/error.rs @@ -0,0 +1,181 @@ +/// Reasons for downloads or listings to fail. +#[derive(Debug)] +pub enum DownloadError { + /// Validation or other error happened due to user input. + BadInput(anyhow::Error), + /// The file was not found in the remote storage. + NotFound, + /// A cancellation token aborted the download, typically during + /// tenant detach or process shutdown. + Cancelled, + /// A timeout happened while executing the request. Possible reasons: + /// - stuck tcp connection + /// + /// Concurrency control is not timed within timeout. + Timeout, + /// The file was found in the remote storage, but the download failed. + Other(anyhow::Error), +} + +impl std::fmt::Display for DownloadError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + DownloadError::BadInput(e) => { + write!(f, "Failed to download a remote file due to user input: {e}") + } + DownloadError::NotFound => write!(f, "No file found for the remote object id given"), + DownloadError::Cancelled => write!(f, "Cancelled, shutting down"), + DownloadError::Timeout => write!(f, "timeout"), + DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"), + } + } +} + +impl std::error::Error for DownloadError {} + +impl DownloadError { + /// Returns true if the error should not be retried with backoff + pub fn is_permanent(&self) -> bool { + use DownloadError::*; + match self { + BadInput(_) | NotFound | Cancelled => true, + Timeout | Other(_) => false, + } + } +} + +#[derive(Debug)] +pub enum TimeTravelError { + /// Validation or other error happened due to user input. + BadInput(anyhow::Error), + /// The used remote storage does not have time travel recovery implemented + Unimplemented, + /// The number of versions/deletion markers is above our limit. + TooManyVersions, + /// A cancellation token aborted the process, typically during + /// request closure or process shutdown. + Cancelled, + /// Other errors + Other(anyhow::Error), +} + +impl std::fmt::Display for TimeTravelError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + TimeTravelError::BadInput(e) => { + write!( + f, + "Failed to time travel recover a prefix due to user input: {e}" + ) + } + TimeTravelError::Unimplemented => write!( + f, + "time travel recovery is not implemented for the current storage backend" + ), + TimeTravelError::Cancelled => write!(f, "Cancelled, shutting down"), + TimeTravelError::TooManyVersions => { + write!(f, "Number of versions/delete markers above limit") + } + TimeTravelError::Other(e) => write!(f, "Failed to time travel recover a prefix: {e:?}"), + } + } +} + +impl std::error::Error for TimeTravelError {} + +/// Plain cancelled error. +/// +/// By design this type does not not implement `std::error::Error` so it cannot be put as the root +/// cause of `std::io::Error` or `anyhow::Error`. It should never need to be exposed out of this +/// crate. 
+/// +/// It exists to implement permit acquiring in `{Download,TimeTravel}Error` and `anyhow::Error` returning +/// operations and ensuring that those get converted to proper versions with just `?`. +#[derive(Debug)] +pub(crate) struct Cancelled; + +impl From for anyhow::Error { + fn from(_: Cancelled) -> Self { + anyhow::Error::new(TimeoutOrCancel::Cancel) + } +} + +impl From for TimeTravelError { + fn from(_: Cancelled) -> Self { + TimeTravelError::Cancelled + } +} + +impl From for TimeoutOrCancel { + fn from(_: Cancelled) -> Self { + TimeoutOrCancel::Cancel + } +} + +impl From for DownloadError { + fn from(_: Cancelled) -> Self { + DownloadError::Cancelled + } +} + +/// This type is used at as the root cause for timeouts and cancellations with `anyhow::Error` returning +/// RemoteStorage methods. +/// +/// For use with `utils::backoff::retry` and `anyhow::Error` returning operations there is +/// `TimeoutOrCancel::caused_by_cancel` method to query "proper form" errors. +#[derive(Debug)] +pub enum TimeoutOrCancel { + Timeout, + Cancel, +} + +impl std::fmt::Display for TimeoutOrCancel { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + use TimeoutOrCancel::*; + match self { + Timeout => write!(f, "timeout"), + Cancel => write!(f, "cancel"), + } + } +} + +impl std::error::Error for TimeoutOrCancel {} + +impl TimeoutOrCancel { + pub fn caused(error: &anyhow::Error) -> Option<&Self> { + error.root_cause().downcast_ref() + } + + /// Returns true if the error was caused by [`TimeoutOrCancel::Cancel`]. + pub fn caused_by_cancel(error: &anyhow::Error) -> bool { + Self::caused(error).is_some_and(Self::is_cancel) + } + + pub fn is_cancel(&self) -> bool { + matches!(self, TimeoutOrCancel::Cancel) + } + + pub fn is_timeout(&self) -> bool { + matches!(self, TimeoutOrCancel::Timeout) + } +} + +/// This conversion is used when [`crate::support::DownloadStream`] notices a cancellation or +/// timeout to wrap it in an `std::io::Error`. +impl From for std::io::Error { + fn from(value: TimeoutOrCancel) -> Self { + let e = DownloadError::from(value); + std::io::Error::other(e) + } +} + +impl From for DownloadError { + fn from(value: TimeoutOrCancel) -> Self { + use TimeoutOrCancel::*; + + match value { + Timeout => DownloadError::Timeout, + Cancel => DownloadError::Cancelled, + } + } +} diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 5a0b74e406..b0b69f9155 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -10,6 +10,7 @@ #![deny(clippy::undocumented_unsafe_blocks)] mod azure_blob; +mod error; mod local_fs; mod s3_bucket; mod simulate_failures; @@ -21,7 +22,7 @@ use std::{ num::{NonZeroU32, NonZeroUsize}, pin::Pin, sync::Arc, - time::SystemTime, + time::{Duration, SystemTime}, }; use anyhow::{bail, Context}; @@ -41,6 +42,8 @@ pub use self::{ }; use s3_bucket::RequestKind; +pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel}; + /// Currently, sync happens with AWS S3, that has two limits on requests per second: /// ~200 RPS for IAM services /// @@ -158,9 +161,10 @@ pub trait RemoteStorage: Send + Sync + 'static { async fn list_prefixes( &self, prefix: Option<&RemotePath>, + cancel: &CancellationToken, ) -> Result, DownloadError> { let result = self - .list(prefix, ListingMode::WithDelimiter, None) + .list(prefix, ListingMode::WithDelimiter, None, cancel) .await? 
            .prefixes;
         Ok(result)
     }
@@ -182,9 +186,10 @@ pub trait RemoteStorage: Send + Sync + 'static {
         &self,
         prefix: Option<&RemotePath>,
         max_keys: Option<NonZeroU32>,
+        cancel: &CancellationToken,
     ) -> Result<Vec<RemotePath>, DownloadError> {
         let result = self
-            .list(prefix, ListingMode::NoDelimiter, max_keys)
+            .list(prefix, ListingMode::NoDelimiter, max_keys, cancel)
             .await?
             .keys;
         Ok(result)
@@ -195,9 +200,13 @@ pub trait RemoteStorage: Send + Sync + 'static {
         prefix: Option<&RemotePath>,
         _mode: ListingMode,
         max_keys: Option<NonZeroU32>,
+        cancel: &CancellationToken,
     ) -> Result<Listing, DownloadError>;
 
     /// Streams the local file contents into the remote storage entry.
+    ///
+    /// If the operation fails because of timeout or cancellation, the root cause of the error will be
+    /// set to `TimeoutOrCancel`.
     async fn upload(
         &self,
         from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
@@ -206,27 +215,61 @@ pub trait RemoteStorage: Send + Sync + 'static {
         data_size_bytes: usize,
         to: &RemotePath,
         metadata: Option<StorageMetadata>,
+        cancel: &CancellationToken,
     ) -> anyhow::Result<()>;
 
-    /// Streams the remote storage entry contents into the buffered writer given, returns the filled writer.
+    /// Streams the remote storage entry contents.
+    ///
+    /// The returned download stream will obey initial timeout and cancellation signal by erroring
+    /// on whichever happens first. Only one of the reasons will fail the stream, which is usually
+    /// enough for `tokio::io::copy_buf` usage. If needed the error can be filtered out.
+    ///
     /// Returns the metadata, if any was stored with the file previously.
-    async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError>;
+    async fn download(
+        &self,
+        from: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> Result<Download, DownloadError>;
 
-    /// Streams a given byte range of the remote storage entry contents into the buffered writer given, returns the filled writer.
+    /// Streams a given byte range of the remote storage entry contents.
+    ///
+    /// The returned download stream will obey initial timeout and cancellation signal by erroring
+    /// on whichever happens first. Only one of the reasons will fail the stream, which is usually
+    /// enough for `tokio::io::copy_buf` usage. If needed the error can be filtered out.
+    ///
     /// Returns the metadata, if any was stored with the file previously.
     async fn download_byte_range(
         &self,
         from: &RemotePath,
         start_inclusive: u64,
        end_exclusive: Option<u64>,
+        cancel: &CancellationToken,
     ) -> Result<Download, DownloadError>;
 
-    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>;
+    /// Delete a single path from remote storage.
+    ///
+    /// If the operation fails because of timeout or cancellation, the root cause of the error will be
+    /// set to `TimeoutOrCancel`. In such situation it is unknown if the deletion went through.
+    async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()>;
 
-    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>;
+    /// Delete multiple paths from remote storage.
+    ///
+    /// If the operation fails because of timeout or cancellation, the root cause of the error will be
+    /// set to `TimeoutOrCancel`. In such situation it is unknown which deletions, if any, went
+    /// through.
+    async fn delete_objects<'a>(
+        &self,
+        paths: &'a [RemotePath],
+        cancel: &CancellationToken,
+    ) -> anyhow::Result<()>;
 
     /// Copy a remote object inside a bucket from one path to another.
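Taken together, the signatures above establish one calling convention: every operation takes a `CancellationToken`, and a timeout or cancellation surfaces as a `TimeoutOrCancel` root cause on the returned error. A minimal caller-side sketch of that contract, not part of the patch (the storage handle, path, payload, and external-crate imports are assumptions):

use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel};
use tokio_util::sync::CancellationToken;

// Upload a small payload and classify the failure the way callers of the new API
// are expected to: a cancellation is permanent, a timeout may be retried with backoff.
async fn upload_once(
    storage: &GenericRemoteStorage,
    path: &RemotePath,
    cancel: &CancellationToken,
) -> anyhow::Result<bool> {
    let body = bytes::Bytes::from_static(b"payload");
    let len = body.len();
    let data = futures::stream::once(futures::future::ready(std::io::Result::Ok(body)));

    match storage.upload(data, len, path, None, cancel).await {
        Ok(()) => Ok(true),
        // Shutting down: propagate, nobody should retry this.
        Err(e) if TimeoutOrCancel::caused_by_cancel(&e) => Err(e),
        // Timeout or other transient error: report "not done" so the caller can retry.
        Err(_) => Ok(false),
    }
}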
- async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()>; + async fn copy( + &self, + from: &RemotePath, + to: &RemotePath, + cancel: &CancellationToken, + ) -> anyhow::Result<()>; /// Resets the content of everything with the given prefix to the given state async fn time_travel_recover( @@ -238,7 +281,13 @@ pub trait RemoteStorage: Send + Sync + 'static { ) -> Result<(), TimeTravelError>; } -pub type DownloadStream = Pin> + Unpin + Send + Sync>>; +/// DownloadStream is sensitive to the timeout and cancellation used with the original +/// [`RemoteStorage::download`] request. The type yields `std::io::Result` to be compatible +/// with `tokio::io::copy_buf`. +// This has 'static because safekeepers do not use cancellation tokens (yet) +pub type DownloadStream = + Pin> + Send + Sync + 'static>>; + pub struct Download { pub download_stream: DownloadStream, /// The last time the file was modified (`last-modified` HTTP header) @@ -257,86 +306,6 @@ impl Debug for Download { } } -#[derive(Debug)] -pub enum DownloadError { - /// Validation or other error happened due to user input. - BadInput(anyhow::Error), - /// The file was not found in the remote storage. - NotFound, - /// A cancellation token aborted the download, typically during - /// tenant detach or process shutdown. - Cancelled, - /// The file was found in the remote storage, but the download failed. - Other(anyhow::Error), -} - -impl std::fmt::Display for DownloadError { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - DownloadError::BadInput(e) => { - write!(f, "Failed to download a remote file due to user input: {e}") - } - DownloadError::Cancelled => write!(f, "Cancelled, shutting down"), - DownloadError::NotFound => write!(f, "No file found for the remote object id given"), - DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"), - } - } -} - -impl std::error::Error for DownloadError {} - -impl DownloadError { - /// Returns true if the error should not be retried with backoff - pub fn is_permanent(&self) -> bool { - use DownloadError::*; - match self { - BadInput(_) => true, - NotFound => true, - Cancelled => true, - Other(_) => false, - } - } -} - -#[derive(Debug)] -pub enum TimeTravelError { - /// Validation or other error happened due to user input. - BadInput(anyhow::Error), - /// The used remote storage does not have time travel recovery implemented - Unimplemented, - /// The number of versions/deletion markers is above our limit. - TooManyVersions, - /// A cancellation token aborted the process, typically during - /// request closure or process shutdown. - Cancelled, - /// Other errors - Other(anyhow::Error), -} - -impl std::fmt::Display for TimeTravelError { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - TimeTravelError::BadInput(e) => { - write!( - f, - "Failed to time travel recover a prefix due to user input: {e}" - ) - } - TimeTravelError::Unimplemented => write!( - f, - "time travel recovery is not implemented for the current storage backend" - ), - TimeTravelError::Cancelled => write!(f, "Cancelled, shutting down"), - TimeTravelError::TooManyVersions => { - write!(f, "Number of versions/delete markers above limit") - } - TimeTravelError::Other(e) => write!(f, "Failed to time travel recover a prefix: {e:?}"), - } - } -} - -impl std::error::Error for TimeTravelError {} - /// Every storage, currently supported. 
/// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics. #[derive(Clone)] @@ -354,12 +323,13 @@ impl GenericRemoteStorage> { prefix: Option<&RemotePath>, mode: ListingMode, max_keys: Option, + cancel: &CancellationToken, ) -> anyhow::Result { match self { - Self::LocalFs(s) => s.list(prefix, mode, max_keys).await, - Self::AwsS3(s) => s.list(prefix, mode, max_keys).await, - Self::AzureBlob(s) => s.list(prefix, mode, max_keys).await, - Self::Unreliable(s) => s.list(prefix, mode, max_keys).await, + Self::LocalFs(s) => s.list(prefix, mode, max_keys, cancel).await, + Self::AwsS3(s) => s.list(prefix, mode, max_keys, cancel).await, + Self::AzureBlob(s) => s.list(prefix, mode, max_keys, cancel).await, + Self::Unreliable(s) => s.list(prefix, mode, max_keys, cancel).await, } } @@ -372,12 +342,13 @@ impl GenericRemoteStorage> { &self, folder: Option<&RemotePath>, max_keys: Option, + cancel: &CancellationToken, ) -> Result, DownloadError> { match self { - Self::LocalFs(s) => s.list_files(folder, max_keys).await, - Self::AwsS3(s) => s.list_files(folder, max_keys).await, - Self::AzureBlob(s) => s.list_files(folder, max_keys).await, - Self::Unreliable(s) => s.list_files(folder, max_keys).await, + Self::LocalFs(s) => s.list_files(folder, max_keys, cancel).await, + Self::AwsS3(s) => s.list_files(folder, max_keys, cancel).await, + Self::AzureBlob(s) => s.list_files(folder, max_keys, cancel).await, + Self::Unreliable(s) => s.list_files(folder, max_keys, cancel).await, } } @@ -387,36 +358,43 @@ impl GenericRemoteStorage> { pub async fn list_prefixes( &self, prefix: Option<&RemotePath>, + cancel: &CancellationToken, ) -> Result, DownloadError> { match self { - Self::LocalFs(s) => s.list_prefixes(prefix).await, - Self::AwsS3(s) => s.list_prefixes(prefix).await, - Self::AzureBlob(s) => s.list_prefixes(prefix).await, - Self::Unreliable(s) => s.list_prefixes(prefix).await, + Self::LocalFs(s) => s.list_prefixes(prefix, cancel).await, + Self::AwsS3(s) => s.list_prefixes(prefix, cancel).await, + Self::AzureBlob(s) => s.list_prefixes(prefix, cancel).await, + Self::Unreliable(s) => s.list_prefixes(prefix, cancel).await, } } + /// See [`RemoteStorage::upload`] pub async fn upload( &self, from: impl Stream> + Send + Sync + 'static, data_size_bytes: usize, to: &RemotePath, metadata: Option, + cancel: &CancellationToken, ) -> anyhow::Result<()> { match self { - Self::LocalFs(s) => s.upload(from, data_size_bytes, to, metadata).await, - Self::AwsS3(s) => s.upload(from, data_size_bytes, to, metadata).await, - Self::AzureBlob(s) => s.upload(from, data_size_bytes, to, metadata).await, - Self::Unreliable(s) => s.upload(from, data_size_bytes, to, metadata).await, + Self::LocalFs(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await, + Self::AwsS3(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await, + Self::AzureBlob(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await, + Self::Unreliable(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await, } } - pub async fn download(&self, from: &RemotePath) -> Result { + pub async fn download( + &self, + from: &RemotePath, + cancel: &CancellationToken, + ) -> Result { match self { - Self::LocalFs(s) => s.download(from).await, - Self::AwsS3(s) => s.download(from).await, - Self::AzureBlob(s) => s.download(from).await, - Self::Unreliable(s) => s.download(from).await, + Self::LocalFs(s) => s.download(from, cancel).await, + Self::AwsS3(s) => s.download(from, cancel).await, + Self::AzureBlob(s) => 
s.download(from, cancel).await, + Self::Unreliable(s) => s.download(from, cancel).await, } } @@ -425,54 +403,72 @@ impl GenericRemoteStorage> { from: &RemotePath, start_inclusive: u64, end_exclusive: Option, + cancel: &CancellationToken, ) -> Result { match self { Self::LocalFs(s) => { - s.download_byte_range(from, start_inclusive, end_exclusive) + s.download_byte_range(from, start_inclusive, end_exclusive, cancel) .await } Self::AwsS3(s) => { - s.download_byte_range(from, start_inclusive, end_exclusive) + s.download_byte_range(from, start_inclusive, end_exclusive, cancel) .await } Self::AzureBlob(s) => { - s.download_byte_range(from, start_inclusive, end_exclusive) + s.download_byte_range(from, start_inclusive, end_exclusive, cancel) .await } Self::Unreliable(s) => { - s.download_byte_range(from, start_inclusive, end_exclusive) + s.download_byte_range(from, start_inclusive, end_exclusive, cancel) .await } } } - pub async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> { + /// See [`RemoteStorage::delete`] + pub async fn delete( + &self, + path: &RemotePath, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { match self { - Self::LocalFs(s) => s.delete(path).await, - Self::AwsS3(s) => s.delete(path).await, - Self::AzureBlob(s) => s.delete(path).await, - Self::Unreliable(s) => s.delete(path).await, + Self::LocalFs(s) => s.delete(path, cancel).await, + Self::AwsS3(s) => s.delete(path, cancel).await, + Self::AzureBlob(s) => s.delete(path, cancel).await, + Self::Unreliable(s) => s.delete(path, cancel).await, } } - pub async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> { + /// See [`RemoteStorage::delete_objects`] + pub async fn delete_objects( + &self, + paths: &[RemotePath], + cancel: &CancellationToken, + ) -> anyhow::Result<()> { match self { - Self::LocalFs(s) => s.delete_objects(paths).await, - Self::AwsS3(s) => s.delete_objects(paths).await, - Self::AzureBlob(s) => s.delete_objects(paths).await, - Self::Unreliable(s) => s.delete_objects(paths).await, + Self::LocalFs(s) => s.delete_objects(paths, cancel).await, + Self::AwsS3(s) => s.delete_objects(paths, cancel).await, + Self::AzureBlob(s) => s.delete_objects(paths, cancel).await, + Self::Unreliable(s) => s.delete_objects(paths, cancel).await, } } - pub async fn copy_object(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> { + /// See [`RemoteStorage::copy`] + pub async fn copy_object( + &self, + from: &RemotePath, + to: &RemotePath, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { match self { - Self::LocalFs(s) => s.copy(from, to).await, - Self::AwsS3(s) => s.copy(from, to).await, - Self::AzureBlob(s) => s.copy(from, to).await, - Self::Unreliable(s) => s.copy(from, to).await, + Self::LocalFs(s) => s.copy(from, to, cancel).await, + Self::AwsS3(s) => s.copy(from, to, cancel).await, + Self::AzureBlob(s) => s.copy(from, to, cancel).await, + Self::Unreliable(s) => s.copy(from, to, cancel).await, } } + /// See [`RemoteStorage::time_travel_recover`]. pub async fn time_travel_recover( &self, prefix: Option<&RemotePath>, @@ -503,10 +499,11 @@ impl GenericRemoteStorage> { impl GenericRemoteStorage { pub fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result { + let timeout = storage_config.timeout; Ok(match &storage_config.storage { - RemoteStorageKind::LocalFs(root) => { - info!("Using fs root '{root}' as a remote storage"); - Self::LocalFs(LocalFs::new(root.clone())?) 
+ RemoteStorageKind::LocalFs(path) => { + info!("Using fs root '{path}' as a remote storage"); + Self::LocalFs(LocalFs::new(path.clone(), timeout)?) } RemoteStorageKind::AwsS3(s3_config) => { // The profile and access key id are only printed here for debugging purposes, @@ -516,12 +513,12 @@ impl GenericRemoteStorage { std::env::var("AWS_ACCESS_KEY_ID").unwrap_or_else(|_| "".into()); info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}', profile: {profile}, access_key_id: {access_key_id}", s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint); - Self::AwsS3(Arc::new(S3Bucket::new(s3_config)?)) + Self::AwsS3(Arc::new(S3Bucket::new(s3_config, timeout)?)) } RemoteStorageKind::AzureContainer(azure_config) => { info!("Using azure container '{}' in region '{}' as a remote storage, prefix in container: '{:?}'", azure_config.container_name, azure_config.container_region, azure_config.prefix_in_container); - Self::AzureBlob(Arc::new(AzureBlobStorage::new(azure_config)?)) + Self::AzureBlob(Arc::new(AzureBlobStorage::new(azure_config, timeout)?)) } }) } @@ -530,18 +527,15 @@ impl GenericRemoteStorage { Self::Unreliable(Arc::new(UnreliableWrapper::new(s, fail_first))) } - /// Takes storage object contents and its size and uploads to remote storage, - /// mapping `from_path` to the corresponding remote object id in the storage. - /// - /// The storage object does not have to be present on the `from_path`, - /// this path is used for the remote object id conversion only. + /// See [`RemoteStorage::upload`], which this method calls with `None` as metadata. pub async fn upload_storage_object( &self, from: impl Stream> + Send + Sync + 'static, from_size_bytes: usize, to: &RemotePath, + cancel: &CancellationToken, ) -> anyhow::Result<()> { - self.upload(from, from_size_bytes, to, None) + self.upload(from, from_size_bytes, to, None, cancel) .await .with_context(|| { format!("Failed to upload data of length {from_size_bytes} to storage path {to:?}") @@ -554,10 +548,11 @@ impl GenericRemoteStorage { &self, byte_range: Option<(u64, Option)>, from: &RemotePath, + cancel: &CancellationToken, ) -> Result { match byte_range { - Some((start, end)) => self.download_byte_range(from, start, end).await, - None => self.download(from).await, + Some((start, end)) => self.download_byte_range(from, start, end, cancel).await, + None => self.download(from, cancel).await, } } } @@ -572,6 +567,9 @@ pub struct StorageMetadata(HashMap); pub struct RemoteStorageConfig { /// The storage connection configuration. pub storage: RemoteStorageKind, + /// A common timeout enforced for all requests after concurrency limiter permit has been + /// acquired. + pub timeout: Duration, } /// A kind of a remote storage to connect to, with its connection configuration. 
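For illustration, the new `timeout` field can also be set when the config is built in code rather than parsed from TOML. A small sketch (the 30-second value and the helper name are made up; it assumes the types are exported the same way the crate's own tests use them):

use std::time::Duration;

use camino::Utf8PathBuf;
use remote_storage::{GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind};

// Build a LocalFs-backed storage with a custom 30s per-request timeout instead of
// relying on the TOML `timeout` key or the 120s default.
fn local_storage_with_timeout(root: Utf8PathBuf) -> anyhow::Result<GenericRemoteStorage> {
    let config = RemoteStorageConfig {
        storage: RemoteStorageKind::LocalFs(root),
        timeout: Duration::from_secs(30),
    };
    GenericRemoteStorage::from_config(&config)
}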
@@ -656,6 +654,8 @@ impl Debug for AzureConfig { } impl RemoteStorageConfig { + pub const DEFAULT_TIMEOUT: Duration = std::time::Duration::from_secs(120); + pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result> { let local_path = toml.get("local_path"); let bucket_name = toml.get("bucket_name"); @@ -685,6 +685,27 @@ impl RemoteStorageConfig { .map(|endpoint| parse_toml_string("endpoint", endpoint)) .transpose()?; + let timeout = toml + .get("timeout") + .map(|timeout| { + timeout + .as_str() + .ok_or_else(|| anyhow::Error::msg("timeout was not a string")) + }) + .transpose() + .and_then(|timeout| { + timeout + .map(humantime::parse_duration) + .transpose() + .map_err(anyhow::Error::new) + }) + .context("parse timeout")? + .unwrap_or(Self::DEFAULT_TIMEOUT); + + if timeout < Duration::from_secs(1) { + bail!("timeout was specified as {timeout:?} which is too low"); + } + let storage = match ( local_path, bucket_name, @@ -746,7 +767,7 @@ impl RemoteStorageConfig { } }; - Ok(Some(RemoteStorageConfig { storage })) + Ok(Some(RemoteStorageConfig { storage, timeout })) } } @@ -842,4 +863,24 @@ mod tests { let err = RemotePath::new(Utf8Path::new("/")).expect_err("Should fail on absolute paths"); assert_eq!(err.to_string(), "Path \"/\" is not relative"); } + + #[test] + fn parse_localfs_config_with_timeout() { + let input = "local_path = '.' +timeout = '5s'"; + + let toml = input.parse::().unwrap(); + + let config = RemoteStorageConfig::from_toml(toml.as_item()) + .unwrap() + .expect("it exists"); + + assert_eq!( + config, + RemoteStorageConfig { + storage: RemoteStorageKind::LocalFs(Utf8PathBuf::from(".")), + timeout: Duration::from_secs(5) + } + ); + } } diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index e88111e8e2..6f847cf9d7 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -5,7 +5,12 @@ //! volume is mounted to the local FS. use std::{ - borrow::Cow, future::Future, io::ErrorKind, num::NonZeroU32, pin::Pin, time::SystemTime, + borrow::Cow, + future::Future, + io::ErrorKind, + num::NonZeroU32, + pin::Pin, + time::{Duration, SystemTime}, }; use anyhow::{bail, ensure, Context}; @@ -20,7 +25,9 @@ use tokio_util::{io::ReaderStream, sync::CancellationToken}; use tracing::*; use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty}; -use crate::{Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError}; +use crate::{ + Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel, +}; use super::{RemoteStorage, StorageMetadata}; @@ -29,12 +36,13 @@ const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp"; #[derive(Debug, Clone)] pub struct LocalFs { storage_root: Utf8PathBuf, + timeout: Duration, } impl LocalFs { /// Attempts to create local FS storage, along with its root directory. /// Storage root will be created (if does not exist) and transformed into an absolute path (if passed as relative). 
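The `timeout` key is parsed with `humantime` and applies to every backend, not only LocalFs. A hedged sketch of an analogous test for an S3 config (bucket name and region are placeholders, and it assumes no other S3 keys are mandatory for `from_toml`):

#[test]
fn parse_s3_config_with_timeout() {
    // `timeout` accepts humantime strings; values under 1s are rejected by from_toml.
    let input = "bucket_name = 'some-bucket'
bucket_region = 'eu-central-1'
timeout = '7s'";

    let toml = input.parse::<toml_edit::Document>().unwrap();

    let config = RemoteStorageConfig::from_toml(toml.as_item())
        .unwrap()
        .expect("config is present");

    assert_eq!(config.timeout, std::time::Duration::from_secs(7));
}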
- pub fn new(mut storage_root: Utf8PathBuf) -> anyhow::Result { + pub fn new(mut storage_root: Utf8PathBuf, timeout: Duration) -> anyhow::Result { if !storage_root.exists() { std::fs::create_dir_all(&storage_root).with_context(|| { format!("Failed to create all directories in the given root path {storage_root:?}") @@ -46,7 +54,10 @@ impl LocalFs { })?; } - Ok(Self { storage_root }) + Ok(Self { + storage_root, + timeout, + }) } // mirrors S3Bucket::s3_object_to_relative_path @@ -157,80 +168,14 @@ impl LocalFs { Ok(files) } -} -impl RemoteStorage for LocalFs { - async fn list( - &self, - prefix: Option<&RemotePath>, - mode: ListingMode, - max_keys: Option, - ) -> Result { - let mut result = Listing::default(); - - if let ListingMode::NoDelimiter = mode { - let keys = self - .list_recursive(prefix) - .await - .map_err(DownloadError::Other)?; - - result.keys = keys - .into_iter() - .filter(|k| { - let path = k.with_base(&self.storage_root); - !path.is_dir() - }) - .collect(); - if let Some(max_keys) = max_keys { - result.keys.truncate(max_keys.get() as usize); - } - - return Ok(result); - } - - let path = match prefix { - Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)), - None => Cow::Borrowed(&self.storage_root), - }; - - let prefixes_to_filter = get_all_files(path.as_ref(), false) - .await - .map_err(DownloadError::Other)?; - - // filter out empty directories to mirror s3 behavior. - for prefix in prefixes_to_filter { - if prefix.is_dir() - && is_directory_empty(&prefix) - .await - .map_err(DownloadError::Other)? - { - continue; - } - - let stripped = prefix - .strip_prefix(&self.storage_root) - .context("Failed to strip prefix") - .and_then(RemotePath::new) - .expect( - "We list files for storage root, hence should be able to remote the prefix", - ); - - if prefix.is_dir() { - result.prefixes.push(stripped); - } else { - result.keys.push(stripped); - } - } - - Ok(result) - } - - async fn upload( + async fn upload0( &self, data: impl Stream> + Send + Sync, data_size_bytes: usize, to: &RemotePath, metadata: Option, + cancel: &CancellationToken, ) -> anyhow::Result<()> { let target_file_path = to.with_base(&self.storage_root); create_target_directory(&target_file_path).await?; @@ -265,9 +210,26 @@ impl RemoteStorage for LocalFs { let mut buffer_to_read = data.take(from_size_bytes); // alternatively we could just write the bytes to a file, but local_fs is a testing utility - let bytes_read = io::copy_buf(&mut buffer_to_read, &mut destination) - .await - .with_context(|| { + let copy = io::copy_buf(&mut buffer_to_read, &mut destination); + + let bytes_read = tokio::select! { + biased; + _ = cancel.cancelled() => { + let file = destination.into_inner(); + // wait for the inflight operation(s) to complete so that there could be a next + // attempt right away and our writes are not directed to their file. + file.into_std().await; + + // TODO: leave the temp or not? leaving is probably less racy. enabled truncate at + // least. + fs::remove_file(temp_file_path).await.context("remove temp_file_path after cancellation or timeout")?; + return Err(TimeoutOrCancel::Cancel.into()); + } + read = copy => read, + }; + + let bytes_read = + bytes_read.with_context(|| { format!( "Failed to upload file (write temp) to the local storage at '{temp_file_path}'", ) @@ -299,6 +261,9 @@ impl RemoteStorage for LocalFs { })?; if let Some(storage_metadata) = metadata { + // FIXME: we must not be using metadata much, since this would forget the old metadata + // for new writes? 
or perhaps metadata is sticky; could consider removing if it's never + // used. let storage_metadata_path = storage_metadata_path(&target_file_path); fs::write( &storage_metadata_path, @@ -315,8 +280,131 @@ impl RemoteStorage for LocalFs { Ok(()) } +} - async fn download(&self, from: &RemotePath) -> Result { +impl RemoteStorage for LocalFs { + async fn list( + &self, + prefix: Option<&RemotePath>, + mode: ListingMode, + max_keys: Option, + cancel: &CancellationToken, + ) -> Result { + let op = async { + let mut result = Listing::default(); + + if let ListingMode::NoDelimiter = mode { + let keys = self + .list_recursive(prefix) + .await + .map_err(DownloadError::Other)?; + + result.keys = keys + .into_iter() + .filter(|k| { + let path = k.with_base(&self.storage_root); + !path.is_dir() + }) + .collect(); + + if let Some(max_keys) = max_keys { + result.keys.truncate(max_keys.get() as usize); + } + + return Ok(result); + } + + let path = match prefix { + Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)), + None => Cow::Borrowed(&self.storage_root), + }; + + let prefixes_to_filter = get_all_files(path.as_ref(), false) + .await + .map_err(DownloadError::Other)?; + + // filter out empty directories to mirror s3 behavior. + for prefix in prefixes_to_filter { + if prefix.is_dir() + && is_directory_empty(&prefix) + .await + .map_err(DownloadError::Other)? + { + continue; + } + + let stripped = prefix + .strip_prefix(&self.storage_root) + .context("Failed to strip prefix") + .and_then(RemotePath::new) + .expect( + "We list files for storage root, hence should be able to remote the prefix", + ); + + if prefix.is_dir() { + result.prefixes.push(stripped); + } else { + result.keys.push(stripped); + } + } + + Ok(result) + }; + + let timeout = async { + tokio::time::sleep(self.timeout).await; + Err(DownloadError::Timeout) + }; + + let cancelled = async { + cancel.cancelled().await; + Err(DownloadError::Cancelled) + }; + + tokio::select! { + res = op => res, + res = timeout => res, + res = cancelled => res, + } + } + + async fn upload( + &self, + data: impl Stream> + Send + Sync, + data_size_bytes: usize, + to: &RemotePath, + metadata: Option, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { + let cancel = cancel.child_token(); + + let op = self.upload0(data, data_size_bytes, to, metadata, &cancel); + let mut op = std::pin::pin!(op); + + // race the upload0 to the timeout; if it goes over, do a graceful shutdown + let (res, timeout) = tokio::select! { + res = &mut op => (res, false), + _ = tokio::time::sleep(self.timeout) => { + cancel.cancel(); + (op.await, true) + } + }; + + match res { + Err(e) if timeout && TimeoutOrCancel::caused_by_cancel(&e) => { + // we caused this cancel (or they happened simultaneously) -- swap it out to + // Timeout + Err(TimeoutOrCancel::Timeout.into()) + } + res => res, + } + } + + async fn download( + &self, + from: &RemotePath, + cancel: &CancellationToken, + ) -> Result { let target_path = from.with_base(&self.storage_root); if file_exists(&target_path).map_err(DownloadError::BadInput)? 
{ let source = ReaderStream::new( @@ -334,6 +422,10 @@ impl RemoteStorage for LocalFs { .read_storage_metadata(&target_path) .await .map_err(DownloadError::Other)?; + + let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone()); + let source = crate::support::DownloadStream::new(cancel_or_timeout, source); + Ok(Download { metadata, last_modified: None, @@ -350,6 +442,7 @@ impl RemoteStorage for LocalFs { from: &RemotePath, start_inclusive: u64, end_exclusive: Option, + cancel: &CancellationToken, ) -> Result { if let Some(end_exclusive) = end_exclusive { if end_exclusive <= start_inclusive { @@ -391,6 +484,9 @@ impl RemoteStorage for LocalFs { let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive); let source = ReaderStream::new(source); + let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone()); + let source = crate::support::DownloadStream::new(cancel_or_timeout, source); + Ok(Download { metadata, last_modified: None, @@ -402,7 +498,7 @@ impl RemoteStorage for LocalFs { } } - async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> { + async fn delete(&self, path: &RemotePath, _cancel: &CancellationToken) -> anyhow::Result<()> { let file_path = path.with_base(&self.storage_root); match fs::remove_file(&file_path).await { Ok(()) => Ok(()), @@ -414,14 +510,23 @@ impl RemoteStorage for LocalFs { } } - async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> { + async fn delete_objects<'a>( + &self, + paths: &'a [RemotePath], + cancel: &CancellationToken, + ) -> anyhow::Result<()> { for path in paths { - self.delete(path).await? + self.delete(path, cancel).await? } Ok(()) } - async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> { + async fn copy( + &self, + from: &RemotePath, + to: &RemotePath, + _cancel: &CancellationToken, + ) -> anyhow::Result<()> { let from_path = from.with_base(&self.storage_root); let to_path = to.with_base(&self.storage_root); create_target_directory(&to_path).await?; @@ -528,8 +633,9 @@ mod fs_tests { remote_storage_path: &RemotePath, expected_metadata: Option<&StorageMetadata>, ) -> anyhow::Result { + let cancel = CancellationToken::new(); let download = storage - .download(remote_storage_path) + .download(remote_storage_path, &cancel) .await .map_err(|e| anyhow::anyhow!("Download failed: {e}"))?; ensure!( @@ -544,16 +650,16 @@ mod fs_tests { #[tokio::test] async fn upload_file() -> anyhow::Result<()> { - let storage = create_storage()?; + let (storage, cancel) = create_storage()?; - let target_path_1 = upload_dummy_file(&storage, "upload_1", None).await?; + let target_path_1 = upload_dummy_file(&storage, "upload_1", None, &cancel).await?; assert_eq!( storage.list_all().await?, vec![target_path_1.clone()], "Should list a single file after first upload" ); - let target_path_2 = upload_dummy_file(&storage, "upload_2", None).await?; + let target_path_2 = upload_dummy_file(&storage, "upload_2", None, &cancel).await?; assert_eq!( list_files_sorted(&storage).await?, vec![target_path_1.clone(), target_path_2.clone()], @@ -565,7 +671,7 @@ mod fs_tests { #[tokio::test] async fn upload_file_negatives() -> anyhow::Result<()> { - let storage = create_storage()?; + let (storage, cancel) = create_storage()?; let id = RemotePath::new(Utf8Path::new("dummy"))?; let content = Bytes::from_static(b"12345"); @@ -574,34 +680,34 @@ mod fs_tests { // Check that you get an error if the size parameter doesn't match the actual // size of the stream. 
storage - .upload(content(), 0, &id, None) + .upload(content(), 0, &id, None, &cancel) .await .expect_err("upload with zero size succeeded"); storage - .upload(content(), 4, &id, None) + .upload(content(), 4, &id, None, &cancel) .await .expect_err("upload with too short size succeeded"); storage - .upload(content(), 6, &id, None) + .upload(content(), 6, &id, None, &cancel) .await .expect_err("upload with too large size succeeded"); // Correct size is 5, this should succeed. - storage.upload(content(), 5, &id, None).await?; + storage.upload(content(), 5, &id, None, &cancel).await?; Ok(()) } - fn create_storage() -> anyhow::Result { + fn create_storage() -> anyhow::Result<(LocalFs, CancellationToken)> { let storage_root = tempdir()?.path().to_path_buf(); - LocalFs::new(storage_root) + LocalFs::new(storage_root, Duration::from_secs(120)).map(|s| (s, CancellationToken::new())) } #[tokio::test] async fn download_file() -> anyhow::Result<()> { - let storage = create_storage()?; + let (storage, cancel) = create_storage()?; let upload_name = "upload_1"; - let upload_target = upload_dummy_file(&storage, upload_name, None).await?; + let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel).await?; let contents = read_and_check_metadata(&storage, &upload_target, None).await?; assert_eq!( @@ -611,7 +717,7 @@ mod fs_tests { ); let non_existing_path = "somewhere/else"; - match storage.download(&RemotePath::new(Utf8Path::new(non_existing_path))?).await { + match storage.download(&RemotePath::new(Utf8Path::new(non_existing_path))?, &cancel).await { Err(DownloadError::NotFound) => {} // Should get NotFound for non existing keys other => panic!("Should get a NotFound error when downloading non-existing storage files, but got: {other:?}"), } @@ -620,9 +726,9 @@ mod fs_tests { #[tokio::test] async fn download_file_range_positive() -> anyhow::Result<()> { - let storage = create_storage()?; + let (storage, cancel) = create_storage()?; let upload_name = "upload_1"; - let upload_target = upload_dummy_file(&storage, upload_name, None).await?; + let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel).await?; let full_range_download_contents = read_and_check_metadata(&storage, &upload_target, None).await?; @@ -636,7 +742,12 @@ mod fs_tests { let (first_part_local, second_part_local) = uploaded_bytes.split_at(3); let first_part_download = storage - .download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64)) + .download_byte_range( + &upload_target, + 0, + Some(first_part_local.len() as u64), + &cancel, + ) .await?; assert!( first_part_download.metadata.is_none(), @@ -654,6 +765,7 @@ mod fs_tests { &upload_target, first_part_local.len() as u64, Some((first_part_local.len() + second_part_local.len()) as u64), + &cancel, ) .await?; assert!( @@ -668,7 +780,7 @@ mod fs_tests { ); let suffix_bytes = storage - .download_byte_range(&upload_target, 13, None) + .download_byte_range(&upload_target, 13, None, &cancel) .await? .download_stream; let suffix_bytes = aggregate(suffix_bytes).await?; @@ -676,7 +788,7 @@ mod fs_tests { assert_eq!(upload_name, suffix); let all_bytes = storage - .download_byte_range(&upload_target, 0, None) + .download_byte_range(&upload_target, 0, None, &cancel) .await? 
.download_stream; let all_bytes = aggregate(all_bytes).await?; @@ -688,9 +800,9 @@ mod fs_tests { #[tokio::test] async fn download_file_range_negative() -> anyhow::Result<()> { - let storage = create_storage()?; + let (storage, cancel) = create_storage()?; let upload_name = "upload_1"; - let upload_target = upload_dummy_file(&storage, upload_name, None).await?; + let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel).await?; let start = 1_000_000_000; let end = start + 1; @@ -699,6 +811,7 @@ mod fs_tests { &upload_target, start, Some(end), // exclusive end + &cancel, ) .await { @@ -715,7 +828,7 @@ mod fs_tests { let end = 234; assert!(start > end, "Should test an incorrect range"); match storage - .download_byte_range(&upload_target, start, Some(end)) + .download_byte_range(&upload_target, start, Some(end), &cancel) .await { Ok(_) => panic!("Should not allow downloading wrong ranges"), @@ -732,15 +845,15 @@ mod fs_tests { #[tokio::test] async fn delete_file() -> anyhow::Result<()> { - let storage = create_storage()?; + let (storage, cancel) = create_storage()?; let upload_name = "upload_1"; - let upload_target = upload_dummy_file(&storage, upload_name, None).await?; + let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel).await?; - storage.delete(&upload_target).await?; + storage.delete(&upload_target, &cancel).await?; assert!(storage.list_all().await?.is_empty()); storage - .delete(&upload_target) + .delete(&upload_target, &cancel) .await .expect("Should allow deleting non-existing storage files"); @@ -749,14 +862,14 @@ mod fs_tests { #[tokio::test] async fn file_with_metadata() -> anyhow::Result<()> { - let storage = create_storage()?; + let (storage, cancel) = create_storage()?; let upload_name = "upload_1"; let metadata = StorageMetadata(HashMap::from([ ("one".to_string(), "1".to_string()), ("two".to_string(), "2".to_string()), ])); let upload_target = - upload_dummy_file(&storage, upload_name, Some(metadata.clone())).await?; + upload_dummy_file(&storage, upload_name, Some(metadata.clone()), &cancel).await?; let full_range_download_contents = read_and_check_metadata(&storage, &upload_target, Some(&metadata)).await?; @@ -770,7 +883,12 @@ mod fs_tests { let (first_part_local, _) = uploaded_bytes.split_at(3); let partial_download_with_metadata = storage - .download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64)) + .download_byte_range( + &upload_target, + 0, + Some(first_part_local.len() as u64), + &cancel, + ) .await?; let first_part_remote = aggregate(partial_download_with_metadata.download_stream).await?; assert_eq!( @@ -791,16 +909,20 @@ mod fs_tests { #[tokio::test] async fn list() -> anyhow::Result<()> { // No delimiter: should recursively list everything - let storage = create_storage()?; - let child = upload_dummy_file(&storage, "grandparent/parent/child", None).await?; - let uncle = upload_dummy_file(&storage, "grandparent/uncle", None).await?; + let (storage, cancel) = create_storage()?; + let child = upload_dummy_file(&storage, "grandparent/parent/child", None, &cancel).await?; + let uncle = upload_dummy_file(&storage, "grandparent/uncle", None, &cancel).await?; - let listing = storage.list(None, ListingMode::NoDelimiter, None).await?; + let listing = storage + .list(None, ListingMode::NoDelimiter, None, &cancel) + .await?; assert!(listing.prefixes.is_empty()); assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec()); // Delimiter: should only go one deep - let listing = storage.list(None, 
ListingMode::WithDelimiter, None).await?; + let listing = storage + .list(None, ListingMode::WithDelimiter, None, &cancel) + .await?; assert_eq!( listing.prefixes, @@ -814,6 +936,7 @@ mod fs_tests { Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()), ListingMode::WithDelimiter, None, + &cancel, ) .await?; assert_eq!( @@ -826,10 +949,75 @@ mod fs_tests { Ok(()) } + #[tokio::test] + async fn overwrite_shorter_file() -> anyhow::Result<()> { + let (storage, cancel) = create_storage()?; + + let path = RemotePath::new("does/not/matter/file".into())?; + + let body = Bytes::from_static(b"long file contents is long"); + { + let len = body.len(); + let body = + futures::stream::once(futures::future::ready(std::io::Result::Ok(body.clone()))); + storage.upload(body, len, &path, None, &cancel).await?; + } + + let read = aggregate(storage.download(&path, &cancel).await?.download_stream).await?; + assert_eq!(body, read); + + let shorter = Bytes::from_static(b"shorter body"); + { + let len = shorter.len(); + let body = + futures::stream::once(futures::future::ready(std::io::Result::Ok(shorter.clone()))); + storage.upload(body, len, &path, None, &cancel).await?; + } + + let read = aggregate(storage.download(&path, &cancel).await?.download_stream).await?; + assert_eq!(shorter, read); + Ok(()) + } + + #[tokio::test] + async fn cancelled_upload_can_later_be_retried() -> anyhow::Result<()> { + let (storage, cancel) = create_storage()?; + + let path = RemotePath::new("does/not/matter/file".into())?; + + let body = Bytes::from_static(b"long file contents is long"); + { + let len = body.len(); + let body = + futures::stream::once(futures::future::ready(std::io::Result::Ok(body.clone()))); + let cancel = cancel.child_token(); + cancel.cancel(); + let e = storage + .upload(body, len, &path, None, &cancel) + .await + .unwrap_err(); + + assert!(TimeoutOrCancel::caused_by_cancel(&e)); + } + + { + let len = body.len(); + let body = + futures::stream::once(futures::future::ready(std::io::Result::Ok(body.clone()))); + storage.upload(body, len, &path, None, &cancel).await?; + } + + let read = aggregate(storage.download(&path, &cancel).await?.download_stream).await?; + assert_eq!(body, read); + + Ok(()) + } + async fn upload_dummy_file( storage: &LocalFs, name: &str, metadata: Option, + cancel: &CancellationToken, ) -> anyhow::Result { let from_path = storage .storage_root @@ -851,7 +1039,9 @@ mod fs_tests { let file = tokio_util::io::ReaderStream::new(file); - storage.upload(file, size, &relative_path, metadata).await?; + storage + .upload(file, size, &relative_path, metadata, cancel) + .await?; Ok(relative_path) } diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index dee5750cac..af70dc7ca2 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -11,7 +11,7 @@ use std::{ pin::Pin, sync::Arc, task::{Context, Poll}, - time::SystemTime, + time::{Duration, SystemTime}, }; use anyhow::{anyhow, Context as _}; @@ -46,9 +46,9 @@ use utils::backoff; use super::StorageMetadata; use crate::{ - support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, - RemotePath, RemoteStorage, S3Config, TimeTravelError, MAX_KEYS_PER_DELETE, - REMOTE_STORAGE_PREFIX_SEPARATOR, + error::Cancelled, support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError, + Listing, ListingMode, RemotePath, RemoteStorage, S3Config, TimeTravelError, TimeoutOrCancel, + MAX_KEYS_PER_DELETE, 
REMOTE_STORAGE_PREFIX_SEPARATOR, }; pub(super) mod metrics; @@ -63,6 +63,8 @@ pub struct S3Bucket { prefix_in_bucket: Option, max_keys_per_list_response: Option, concurrency_limiter: ConcurrencyLimiter, + // Per-request timeout. Accessible for tests. + pub timeout: Duration, } struct GetObjectRequest { @@ -72,7 +74,7 @@ struct GetObjectRequest { } impl S3Bucket { /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided. - pub fn new(aws_config: &S3Config) -> anyhow::Result { + pub fn new(aws_config: &S3Config, timeout: Duration) -> anyhow::Result { tracing::debug!( "Creating s3 remote storage for S3 bucket {}", aws_config.bucket_name @@ -152,6 +154,7 @@ impl S3Bucket { max_keys_per_list_response: aws_config.max_keys_per_list_response, prefix_in_bucket, concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()), + timeout, }) } @@ -185,40 +188,55 @@ impl S3Bucket { } } - async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> { + async fn permit( + &self, + kind: RequestKind, + cancel: &CancellationToken, + ) -> Result, Cancelled> { let started_at = start_counting_cancelled_wait(kind); - let permit = self - .concurrency_limiter - .acquire(kind) - .await - .expect("semaphore is never closed"); + let acquire = self.concurrency_limiter.acquire(kind); + + let permit = tokio::select! { + permit = acquire => permit.expect("semaphore is never closed"), + _ = cancel.cancelled() => return Err(Cancelled), + }; let started_at = ScopeGuard::into_inner(started_at); metrics::BUCKET_METRICS .wait_seconds .observe_elapsed(kind, started_at); - permit + Ok(permit) } - async fn owned_permit(&self, kind: RequestKind) -> tokio::sync::OwnedSemaphorePermit { + async fn owned_permit( + &self, + kind: RequestKind, + cancel: &CancellationToken, + ) -> Result { let started_at = start_counting_cancelled_wait(kind); - let permit = self - .concurrency_limiter - .acquire_owned(kind) - .await - .expect("semaphore is never closed"); + let acquire = self.concurrency_limiter.acquire_owned(kind); + + let permit = tokio::select! { + permit = acquire => permit.expect("semaphore is never closed"), + _ = cancel.cancelled() => return Err(Cancelled), + }; let started_at = ScopeGuard::into_inner(started_at); metrics::BUCKET_METRICS .wait_seconds .observe_elapsed(kind, started_at); - permit + Ok(permit) } - async fn download_object(&self, request: GetObjectRequest) -> Result { + async fn download_object( + &self, + request: GetObjectRequest, + cancel: &CancellationToken, + ) -> Result { let kind = RequestKind::Get; - let permit = self.owned_permit(kind).await; + + let permit = self.owned_permit(kind, cancel).await?; let started_at = start_measuring_requests(kind); @@ -228,8 +246,13 @@ impl S3Bucket { .bucket(request.bucket) .key(request.key) .set_range(request.range) - .send() - .await; + .send(); + + let get_object = tokio::select! { + res = get_object => res, + _ = tokio::time::sleep(self.timeout) => return Err(DownloadError::Timeout), + _ = cancel.cancelled() => return Err(DownloadError::Cancelled), + }; let started_at = ScopeGuard::into_inner(started_at); @@ -259,6 +282,10 @@ impl S3Bucket { } }; + // even if we would have no timeout left, continue anyways. the caller can decide to ignore + // the errors considering timeouts and cancellation. 
+ let remaining = self.timeout.saturating_sub(started_at.elapsed()); + let metadata = object_output.metadata().cloned().map(StorageMetadata); let etag = object_output.e_tag; let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok()); @@ -268,6 +295,9 @@ impl S3Bucket { let body = PermitCarrying::new(permit, body); let body = TimedDownload::new(started_at, body); + let cancel_or_timeout = crate::support::cancel_or_timeout(remaining, cancel.clone()); + let body = crate::support::DownloadStream::new(cancel_or_timeout, body); + Ok(Download { metadata, etag, @@ -278,33 +308,44 @@ impl S3Bucket { async fn delete_oids( &self, - kind: RequestKind, + _permit: &tokio::sync::SemaphorePermit<'_>, delete_objects: &[ObjectIdentifier], + cancel: &CancellationToken, ) -> anyhow::Result<()> { + let kind = RequestKind::Delete; + let mut cancel = std::pin::pin!(cancel.cancelled()); + for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) { let started_at = start_measuring_requests(kind); - let resp = self + let req = self .client .delete_objects() .bucket(self.bucket_name.clone()) .delete( Delete::builder() .set_objects(Some(chunk.to_vec())) - .build()?, + .build() + .context("build request")?, ) - .send() - .await; + .send(); + + let resp = tokio::select! { + resp = req => resp, + _ = tokio::time::sleep(self.timeout) => return Err(TimeoutOrCancel::Timeout.into()), + _ = &mut cancel => return Err(TimeoutOrCancel::Cancel.into()), + }; let started_at = ScopeGuard::into_inner(started_at); metrics::BUCKET_METRICS .req_seconds .observe_elapsed(kind, &resp, started_at); - let resp = resp?; + let resp = resp.context("request deletion")?; metrics::BUCKET_METRICS .deleted_objects_total .inc_by(chunk.len() as u64); + if let Some(errors) = resp.errors { // Log a bounded number of the errors within the response: // these requests can carry 1000 keys so logging each one @@ -320,9 +361,10 @@ impl S3Bucket { ); } - return Err(anyhow::format_err!( - "Failed to delete {} objects", - errors.len() + return Err(anyhow::anyhow!( + "Failed to delete {}/{} objects", + errors.len(), + chunk.len(), )); } } @@ -410,6 +452,7 @@ impl RemoteStorage for S3Bucket { prefix: Option<&RemotePath>, mode: ListingMode, max_keys: Option, + cancel: &CancellationToken, ) -> Result { let kind = RequestKind::List; // s3 sdk wants i32 @@ -431,10 +474,11 @@ impl RemoteStorage for S3Bucket { p }); + let _permit = self.permit(kind, cancel).await?; + let mut continuation_token = None; loop { - let _guard = self.permit(kind).await; let started_at = start_measuring_requests(kind); // min of two Options, returning Some if one is value and another is @@ -456,9 +500,15 @@ impl RemoteStorage for S3Bucket { request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()); } - let response = request - .send() - .await + let request = request.send(); + + let response = tokio::select! 
{ + res = request => res, + _ = tokio::time::sleep(self.timeout) => return Err(DownloadError::Timeout), + _ = cancel.cancelled() => return Err(DownloadError::Cancelled), + }; + + let response = response .context("Failed to list S3 prefixes") .map_err(DownloadError::Other); @@ -511,16 +561,17 @@ impl RemoteStorage for S3Bucket { from_size_bytes: usize, to: &RemotePath, metadata: Option, + cancel: &CancellationToken, ) -> anyhow::Result<()> { let kind = RequestKind::Put; - let _guard = self.permit(kind).await; + let _permit = self.permit(kind, cancel).await?; let started_at = start_measuring_requests(kind); let body = Body::wrap_stream(from); let bytes_stream = ByteStream::new(SdkBody::from_body_0_4(body)); - let res = self + let upload = self .client .put_object() .bucket(self.bucket_name.clone()) @@ -528,22 +579,40 @@ impl RemoteStorage for S3Bucket { .set_metadata(metadata.map(|m| m.0)) .content_length(from_size_bytes.try_into()?) .body(bytes_stream) - .send() - .await; + .send(); - let started_at = ScopeGuard::into_inner(started_at); - metrics::BUCKET_METRICS - .req_seconds - .observe_elapsed(kind, &res, started_at); + let upload = tokio::time::timeout(self.timeout, upload); - res?; + let res = tokio::select! { + res = upload => res, + _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()), + }; - Ok(()) + if let Ok(inner) = &res { + // do not incl. timeouts as errors in metrics but cancellations + let started_at = ScopeGuard::into_inner(started_at); + metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, inner, started_at); + } + + match res { + Ok(Ok(_put)) => Ok(()), + Ok(Err(sdk)) => Err(sdk.into()), + Err(_timeout) => Err(TimeoutOrCancel::Timeout.into()), + } } - async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> { + async fn copy( + &self, + from: &RemotePath, + to: &RemotePath, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { let kind = RequestKind::Copy; - let _guard = self.permit(kind).await; + let _permit = self.permit(kind, cancel).await?; + + let timeout = tokio::time::sleep(self.timeout); let started_at = start_measuring_requests(kind); @@ -554,14 +623,19 @@ impl RemoteStorage for S3Bucket { self.relative_path_to_s3_object(from) ); - let res = self + let op = self .client .copy_object() .bucket(self.bucket_name.clone()) .key(self.relative_path_to_s3_object(to)) .copy_source(copy_source) - .send() - .await; + .send(); + + let res = tokio::select! 
{ + res = op => res, + _ = timeout => return Err(TimeoutOrCancel::Timeout.into()), + _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()), + }; let started_at = ScopeGuard::into_inner(started_at); metrics::BUCKET_METRICS @@ -573,14 +647,21 @@ impl RemoteStorage for S3Bucket { Ok(()) } - async fn download(&self, from: &RemotePath) -> Result { + async fn download( + &self, + from: &RemotePath, + cancel: &CancellationToken, + ) -> Result { // if prefix is not none then download file `prefix/from` // if prefix is none then download file `from` - self.download_object(GetObjectRequest { - bucket: self.bucket_name.clone(), - key: self.relative_path_to_s3_object(from), - range: None, - }) + self.download_object( + GetObjectRequest { + bucket: self.bucket_name.clone(), + key: self.relative_path_to_s3_object(from), + range: None, + }, + cancel, + ) .await } @@ -589,6 +670,7 @@ impl RemoteStorage for S3Bucket { from: &RemotePath, start_inclusive: u64, end_exclusive: Option, + cancel: &CancellationToken, ) -> Result { // S3 accepts ranges as https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35 // and needs both ends to be exclusive @@ -598,31 +680,39 @@ impl RemoteStorage for S3Bucket { None => format!("bytes={start_inclusive}-"), }); - self.download_object(GetObjectRequest { - bucket: self.bucket_name.clone(), - key: self.relative_path_to_s3_object(from), - range, - }) + self.download_object( + GetObjectRequest { + bucket: self.bucket_name.clone(), + key: self.relative_path_to_s3_object(from), + range, + }, + cancel, + ) .await } - async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> { - let kind = RequestKind::Delete; - let _guard = self.permit(kind).await; + async fn delete_objects<'a>( + &self, + paths: &'a [RemotePath], + cancel: &CancellationToken, + ) -> anyhow::Result<()> { + let kind = RequestKind::Delete; + let permit = self.permit(kind, cancel).await?; let mut delete_objects = Vec::with_capacity(paths.len()); for path in paths { let obj_id = ObjectIdentifier::builder() .set_key(Some(self.relative_path_to_s3_object(path))) - .build()?; + .build() + .context("convert path to oid")?; delete_objects.push(obj_id); } - self.delete_oids(kind, &delete_objects).await + self.delete_oids(&permit, &delete_objects, cancel).await } - async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> { + async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> { let paths = std::array::from_ref(path); - self.delete_objects(paths).await + self.delete_objects(paths, cancel).await } async fn time_travel_recover( @@ -633,7 +723,7 @@ impl RemoteStorage for S3Bucket { cancel: &CancellationToken, ) -> Result<(), TimeTravelError> { let kind = RequestKind::TimeTravel; - let _guard = self.permit(kind).await; + let permit = self.permit(kind, cancel).await?; let timestamp = DateTime::from(timestamp); let done_if_after = DateTime::from(done_if_after); @@ -647,7 +737,7 @@ impl RemoteStorage for S3Bucket { let warn_threshold = 3; let max_retries = 10; - let is_permanent = |_e: &_| false; + let is_permanent = |e: &_| matches!(e, TimeTravelError::Cancelled); let mut key_marker = None; let mut version_id_marker = None; @@ -656,15 +746,19 @@ impl RemoteStorage for S3Bucket { loop { let response = backoff::retry( || async { - self.client + let op = self + .client .list_object_versions() .bucket(self.bucket_name.clone()) .set_prefix(prefix.clone()) .set_key_marker(key_marker.clone()) .set_version_id_marker(version_id_marker.clone()) - 
.send() - .await - .map_err(|e| TimeTravelError::Other(e.into())) + .send(); + + tokio::select! { + res = op => res.map_err(|e| TimeTravelError::Other(e.into())), + _ = cancel.cancelled() => Err(TimeTravelError::Cancelled), + } }, is_permanent, warn_threshold, @@ -786,14 +880,18 @@ impl RemoteStorage for S3Bucket { backoff::retry( || async { - self.client + let op = self + .client .copy_object() .bucket(self.bucket_name.clone()) .key(key) .copy_source(&source_id) - .send() - .await - .map_err(|e| TimeTravelError::Other(e.into())) + .send(); + + tokio::select! { + res = op => res.map_err(|e| TimeTravelError::Other(e.into())), + _ = cancel.cancelled() => Err(TimeTravelError::Cancelled), + } }, is_permanent, warn_threshold, @@ -824,10 +922,18 @@ impl RemoteStorage for S3Bucket { let oid = ObjectIdentifier::builder() .key(key.to_owned()) .build() - .map_err(|e| TimeTravelError::Other(anyhow::Error::new(e)))?; - self.delete_oids(kind, &[oid]) + .map_err(|e| TimeTravelError::Other(e.into()))?; + + self.delete_oids(&permit, &[oid], cancel) .await - .map_err(TimeTravelError::Other)?; + .map_err(|e| { + // delete_oid0 will use TimeoutOrCancel + if TimeoutOrCancel::caused_by_cancel(&e) { + TimeTravelError::Cancelled + } else { + TimeTravelError::Other(e) + } + })?; } } } @@ -963,7 +1069,8 @@ mod tests { concurrency_limit: NonZeroUsize::new(100).unwrap(), max_keys_per_list_response: Some(5), }; - let storage = S3Bucket::new(&config).expect("remote storage init"); + let storage = + S3Bucket::new(&config, std::time::Duration::ZERO).expect("remote storage init"); for (test_path_idx, test_path) in all_paths.iter().enumerate() { let result = storage.relative_path_to_s3_object(test_path); let expected = expected_outputs[prefix_idx][test_path_idx]; diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index 3dfa16b64e..f5344d3ae2 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -90,11 +90,16 @@ impl UnreliableWrapper { } } - async fn delete_inner(&self, path: &RemotePath, attempt: bool) -> anyhow::Result<()> { + async fn delete_inner( + &self, + path: &RemotePath, + attempt: bool, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { if attempt { self.attempt(RemoteOp::Delete(path.clone()))?; } - self.inner.delete(path).await + self.inner.delete(path, cancel).await } } @@ -105,20 +110,22 @@ impl RemoteStorage for UnreliableWrapper { async fn list_prefixes( &self, prefix: Option<&RemotePath>, + cancel: &CancellationToken, ) -> Result, DownloadError> { self.attempt(RemoteOp::ListPrefixes(prefix.cloned())) .map_err(DownloadError::Other)?; - self.inner.list_prefixes(prefix).await + self.inner.list_prefixes(prefix, cancel).await } async fn list_files( &self, folder: Option<&RemotePath>, max_keys: Option, + cancel: &CancellationToken, ) -> Result, DownloadError> { self.attempt(RemoteOp::ListPrefixes(folder.cloned())) .map_err(DownloadError::Other)?; - self.inner.list_files(folder, max_keys).await + self.inner.list_files(folder, max_keys, cancel).await } async fn list( @@ -126,10 +133,11 @@ impl RemoteStorage for UnreliableWrapper { prefix: Option<&RemotePath>, mode: ListingMode, max_keys: Option, + cancel: &CancellationToken, ) -> Result { self.attempt(RemoteOp::ListPrefixes(prefix.cloned())) .map_err(DownloadError::Other)?; - self.inner.list(prefix, mode, max_keys).await + self.inner.list(prefix, mode, max_keys, cancel).await } async fn upload( @@ -140,15 +148,22 @@ impl RemoteStorage for 
UnreliableWrapper { data_size_bytes: usize, to: &RemotePath, metadata: Option, + cancel: &CancellationToken, ) -> anyhow::Result<()> { self.attempt(RemoteOp::Upload(to.clone()))?; - self.inner.upload(data, data_size_bytes, to, metadata).await + self.inner + .upload(data, data_size_bytes, to, metadata, cancel) + .await } - async fn download(&self, from: &RemotePath) -> Result { + async fn download( + &self, + from: &RemotePath, + cancel: &CancellationToken, + ) -> Result { self.attempt(RemoteOp::Download(from.clone())) .map_err(DownloadError::Other)?; - self.inner.download(from).await + self.inner.download(from, cancel).await } async fn download_byte_range( @@ -156,6 +171,7 @@ impl RemoteStorage for UnreliableWrapper { from: &RemotePath, start_inclusive: u64, end_exclusive: Option, + cancel: &CancellationToken, ) -> Result { // Note: We treat any download_byte_range as an "attempt" of the same // operation. We don't pay attention to the ranges. That's good enough @@ -163,20 +179,24 @@ impl RemoteStorage for UnreliableWrapper { self.attempt(RemoteOp::Download(from.clone())) .map_err(DownloadError::Other)?; self.inner - .download_byte_range(from, start_inclusive, end_exclusive) + .download_byte_range(from, start_inclusive, end_exclusive, cancel) .await } - async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> { - self.delete_inner(path, true).await + async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> { + self.delete_inner(path, true, cancel).await } - async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> { + async fn delete_objects<'a>( + &self, + paths: &'a [RemotePath], + cancel: &CancellationToken, + ) -> anyhow::Result<()> { self.attempt(RemoteOp::DeleteObjects(paths.to_vec()))?; let mut error_counter = 0; for path in paths { // Dont record attempt because it was already recorded above - if (self.delete_inner(path, false).await).is_err() { + if (self.delete_inner(path, false, cancel).await).is_err() { error_counter += 1; } } @@ -189,11 +209,16 @@ impl RemoteStorage for UnreliableWrapper { Ok(()) } - async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> { + async fn copy( + &self, + from: &RemotePath, + to: &RemotePath, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { // copy is equivalent to download + upload self.attempt(RemoteOp::Download(from.clone()))?; self.attempt(RemoteOp::Upload(to.clone()))?; - self.inner.copy_object(from, to).await + self.inner.copy_object(from, to, cancel).await } async fn time_travel_recover( diff --git a/libs/remote_storage/src/support.rs b/libs/remote_storage/src/support.rs index 4688a484a5..20f193c6c8 100644 --- a/libs/remote_storage/src/support.rs +++ b/libs/remote_storage/src/support.rs @@ -1,9 +1,15 @@ use std::{ + future::Future, pin::Pin, task::{Context, Poll}, + time::Duration, }; +use bytes::Bytes; use futures_util::Stream; +use tokio_util::sync::CancellationToken; + +use crate::TimeoutOrCancel; pin_project_lite::pin_project! { /// An `AsyncRead` adapter which carries a permit for the lifetime of the value. @@ -31,3 +37,133 @@ impl Stream for PermitCarrying { self.inner.size_hint() } } + +pin_project_lite::pin_project! 
{
+    pub(crate) struct DownloadStream<F, S> {
+        hit: bool,
+        #[pin]
+        cancellation: F,
+        #[pin]
+        inner: S,
+    }
+}
+
+impl<F, S> DownloadStream<F, S> {
+    pub(crate) fn new(cancellation: F, inner: S) -> Self {
+        Self {
+            cancellation,
+            hit: false,
+            inner,
+        }
+    }
+}
+
+/// See documentation on [`crate::DownloadStream`] on rationale why `std::io::Error` is used.
+impl<F, S> Stream for DownloadStream<F, S>
+where
+    std::io::Error: From<F::Output>,
+    F: Future,
+    S: Stream<Item = std::io::Result<Bytes>>,
+{
+    type Item = <S as Stream>::Item;
+
+    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        let this = self.project();
+
+        if !*this.hit {
+            if let Poll::Ready(e) = this.cancellation.poll(cx) {
+                *this.hit = true;
+                let e = Err(std::io::Error::from(e));
+                return Poll::Ready(Some(e));
+            }
+        }
+
+        this.inner.poll_next(cx)
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        self.inner.size_hint()
+    }
+}
+
+/// Fires only on the first cancel or timeout, not on both.
+pub(crate) async fn cancel_or_timeout(
+    timeout: Duration,
+    cancel: CancellationToken,
+) -> TimeoutOrCancel {
+    tokio::select! {
+        _ = tokio::time::sleep(timeout) => TimeoutOrCancel::Timeout,
+        _ = cancel.cancelled() => TimeoutOrCancel::Cancel,
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::DownloadError;
+    use futures::stream::StreamExt;
+
+    #[tokio::test(start_paused = true)]
+    async fn cancelled_download_stream() {
+        let inner = futures::stream::pending();
+        let timeout = Duration::from_secs(120);
+        let cancel = CancellationToken::new();
+
+        let stream = DownloadStream::new(cancel_or_timeout(timeout, cancel.clone()), inner);
+        let mut stream = std::pin::pin!(stream);
+
+        let mut first = stream.next();
+
+        tokio::select! {
+            _ = &mut first => unreachable!("we haven't yet cancelled nor is timeout passed"),
+            _ = tokio::time::sleep(Duration::from_secs(1)) => {},
+        }
+
+        cancel.cancel();
+
+        let e = first.await.expect("there must be some").unwrap_err();
+        assert!(matches!(e.kind(), std::io::ErrorKind::Other), "{e:?}");
+        let inner = e.get_ref().expect("inner should be set");
+        assert!(
+            inner
+                .downcast_ref::<DownloadError>()
+                .is_some_and(|e| matches!(e, DownloadError::Cancelled)),
+            "{inner:?}"
+        );
+
+        tokio::select! {
+            _ = stream.next() => unreachable!("no timeout ever happens as we were already cancelled"),
+            _ = tokio::time::sleep(Duration::from_secs(121)) => {},
+        }
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn timeouted_download_stream() {
+        let inner = futures::stream::pending();
+        let timeout = Duration::from_secs(120);
+        let cancel = CancellationToken::new();
+
+        let stream = DownloadStream::new(cancel_or_timeout(timeout, cancel.clone()), inner);
+        let mut stream = std::pin::pin!(stream);
+
+        // because the stream uses 120s timeout we are paused, we advance to 120s right away.
+        let first = stream.next();
+
+        let e = first.await.expect("there must be some").unwrap_err();
+        assert!(matches!(e.kind(), std::io::ErrorKind::Other), "{e:?}");
+        let inner = e.get_ref().expect("inner should be set");
+        assert!(
+            inner
+                .downcast_ref::<DownloadError>()
+                .is_some_and(|e| matches!(e, DownloadError::Timeout)),
+            "{inner:?}"
+        );
+
+        cancel.cancel();
+
+        tokio::select!
{ + _ = stream.next() => unreachable!("no cancellation ever happens because we already timed out"), + _ = tokio::time::sleep(Duration::from_secs(121)) => {}, + } + } +} diff --git a/libs/remote_storage/tests/common/mod.rs b/libs/remote_storage/tests/common/mod.rs index bca117ed1a..da9dc08d8d 100644 --- a/libs/remote_storage/tests/common/mod.rs +++ b/libs/remote_storage/tests/common/mod.rs @@ -10,6 +10,7 @@ use futures::stream::Stream; use once_cell::sync::OnceCell; use remote_storage::{Download, GenericRemoteStorage, RemotePath}; use tokio::task::JoinSet; +use tokio_util::sync::CancellationToken; use tracing::{debug, error, info}; static LOGGING_DONE: OnceCell<()> = OnceCell::new(); @@ -58,8 +59,12 @@ pub(crate) async fn upload_simple_remote_data( ) -> ControlFlow, HashSet> { info!("Creating {upload_tasks_count} remote files"); let mut upload_tasks = JoinSet::new(); + let cancel = CancellationToken::new(); + for i in 1..upload_tasks_count + 1 { let task_client = Arc::clone(client); + let cancel = cancel.clone(); + upload_tasks.spawn(async move { let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i)); let blob_path = RemotePath::new( @@ -69,7 +74,9 @@ pub(crate) async fn upload_simple_remote_data( debug!("Creating remote item {i} at path {blob_path:?}"); let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into()); - task_client.upload(data, len, &blob_path, None).await?; + task_client + .upload(data, len, &blob_path, None, &cancel) + .await?; Ok::<_, anyhow::Error>(blob_path) }); @@ -107,13 +114,15 @@ pub(crate) async fn cleanup( "Removing {} objects from the remote storage during cleanup", objects_to_delete.len() ); + let cancel = CancellationToken::new(); let mut delete_tasks = JoinSet::new(); for object_to_delete in objects_to_delete { let task_client = Arc::clone(client); + let cancel = cancel.clone(); delete_tasks.spawn(async move { debug!("Deleting remote item at path {object_to_delete:?}"); task_client - .delete(&object_to_delete) + .delete(&object_to_delete, &cancel) .await .with_context(|| format!("{object_to_delete:?} removal")) }); @@ -141,8 +150,12 @@ pub(crate) async fn upload_remote_data( ) -> ControlFlow { info!("Creating {upload_tasks_count} remote files"); let mut upload_tasks = JoinSet::new(); + let cancel = CancellationToken::new(); + for i in 1..upload_tasks_count + 1 { let task_client = Arc::clone(client); + let cancel = cancel.clone(); + upload_tasks.spawn(async move { let prefix = format!("{base_prefix_str}/sub_prefix_{i}/"); let blob_prefix = RemotePath::new(Utf8Path::new(&prefix)) @@ -152,7 +165,9 @@ pub(crate) async fn upload_remote_data( let (data, data_len) = upload_stream(format!("remote blob data {i}").into_bytes().into()); - task_client.upload(data, data_len, &blob_path, None).await?; + task_client + .upload(data, data_len, &blob_path, None, &cancel) + .await?; Ok::<_, anyhow::Error>((blob_prefix, blob_path)) }); diff --git a/libs/remote_storage/tests/common/tests.rs b/libs/remote_storage/tests/common/tests.rs index 6d062f3898..72f6f956e0 100644 --- a/libs/remote_storage/tests/common/tests.rs +++ b/libs/remote_storage/tests/common/tests.rs @@ -4,6 +4,7 @@ use remote_storage::RemotePath; use std::sync::Arc; use std::{collections::HashSet, num::NonZeroU32}; use test_context::test_context; +use tokio_util::sync::CancellationToken; use tracing::debug; use crate::common::{download_to_vec, upload_stream, wrap_stream}; @@ -45,13 +46,15 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a } }; + 
let cancel = CancellationToken::new(); + let test_client = Arc::clone(&ctx.enabled.client); let expected_remote_prefixes = ctx.remote_prefixes.clone(); let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix)) .context("common_prefix construction")?; let root_remote_prefixes = test_client - .list_prefixes(None) + .list_prefixes(None, &cancel) .await .context("client list root prefixes failure")? .into_iter() @@ -62,7 +65,7 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a ); let nested_remote_prefixes = test_client - .list_prefixes(Some(&base_prefix)) + .list_prefixes(Some(&base_prefix), &cancel) .await .context("client list nested prefixes failure")? .into_iter() @@ -99,11 +102,12 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a anyhow::bail!("S3 init failed: {e:?}") } }; + let cancel = CancellationToken::new(); let test_client = Arc::clone(&ctx.enabled.client); let base_prefix = RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?; let root_files = test_client - .list_files(None, None) + .list_files(None, None, &cancel) .await .context("client list root files failure")? .into_iter() @@ -117,13 +121,13 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a // Test that max_keys limit works. In total there are about 21 files (see // upload_simple_remote_data call in test_real_s3.rs). let limited_root_files = test_client - .list_files(None, Some(NonZeroU32::new(2).unwrap())) + .list_files(None, Some(NonZeroU32::new(2).unwrap()), &cancel) .await .context("client list root files failure")?; assert_eq!(limited_root_files.len(), 2); let nested_remote_files = test_client - .list_files(Some(&base_prefix), None) + .list_files(Some(&base_prefix), None, &cancel) .await .context("client list nested files failure")? 
.into_iter() @@ -150,12 +154,17 @@ async fn delete_non_exising_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Resu MaybeEnabledStorage::Disabled => return Ok(()), }; + let cancel = CancellationToken::new(); + let path = RemotePath::new(Utf8Path::new( format!("{}/for_sure_there_is_nothing_there_really", ctx.base_prefix).as_str(), )) .with_context(|| "RemotePath conversion")?; - ctx.client.delete(&path).await.expect("should succeed"); + ctx.client + .delete(&path, &cancel) + .await + .expect("should succeed"); Ok(()) } @@ -168,6 +177,8 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<( MaybeEnabledStorage::Disabled => return Ok(()), }; + let cancel = CancellationToken::new(); + let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str())) .with_context(|| "RemotePath conversion")?; @@ -178,21 +189,21 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<( .with_context(|| "RemotePath conversion")?; let (data, len) = upload_stream("remote blob data1".as_bytes().into()); - ctx.client.upload(data, len, &path1, None).await?; + ctx.client.upload(data, len, &path1, None, &cancel).await?; let (data, len) = upload_stream("remote blob data2".as_bytes().into()); - ctx.client.upload(data, len, &path2, None).await?; + ctx.client.upload(data, len, &path2, None, &cancel).await?; let (data, len) = upload_stream("remote blob data3".as_bytes().into()); - ctx.client.upload(data, len, &path3, None).await?; + ctx.client.upload(data, len, &path3, None, &cancel).await?; - ctx.client.delete_objects(&[path1, path2]).await?; + ctx.client.delete_objects(&[path1, path2], &cancel).await?; - let prefixes = ctx.client.list_prefixes(None).await?; + let prefixes = ctx.client.list_prefixes(None, &cancel).await?; assert_eq!(prefixes.len(), 1); - ctx.client.delete_objects(&[path3]).await?; + ctx.client.delete_objects(&[path3], &cancel).await?; Ok(()) } @@ -204,6 +215,8 @@ async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result< return Ok(()); }; + let cancel = CancellationToken::new(); + let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str())) .with_context(|| "RemotePath conversion")?; @@ -211,47 +224,56 @@ async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result< let (data, len) = wrap_stream(orig.clone()); - ctx.client.upload(data, len, &path, None).await?; + ctx.client.upload(data, len, &path, None, &cancel).await?; // Normal download request - let dl = ctx.client.download(&path).await?; + let dl = ctx.client.download(&path, &cancel).await?; let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig); // Full range (end specified) let dl = ctx .client - .download_byte_range(&path, 0, Some(len as u64)) + .download_byte_range(&path, 0, Some(len as u64), &cancel) .await?; let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig); // partial range (end specified) - let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?; + let dl = ctx + .client + .download_byte_range(&path, 4, Some(10), &cancel) + .await?; let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig[4..10]); // partial range (end beyond real end) let dl = ctx .client - .download_byte_range(&path, 8, Some(len as u64 * 100)) + .download_byte_range(&path, 8, Some(len as u64 * 100), &cancel) .await?; let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig[8..]); // Partial range (end unspecified) - let dl = ctx.client.download_byte_range(&path, 4, None).await?; + let dl = ctx 
+ .client + .download_byte_range(&path, 4, None, &cancel) + .await?; let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig[4..]); // Full range (end unspecified) - let dl = ctx.client.download_byte_range(&path, 0, None).await?; + let dl = ctx + .client + .download_byte_range(&path, 0, None, &cancel) + .await?; let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig); debug!("Cleanup: deleting file at path {path:?}"); ctx.client - .delete(&path) + .delete(&path, &cancel) .await .with_context(|| format!("{path:?} removal"))?; @@ -265,6 +287,8 @@ async fn copy_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> { return Ok(()); }; + let cancel = CancellationToken::new(); + let path = RemotePath::new(Utf8Path::new( format!("{}/file_to_copy", ctx.base_prefix).as_str(), )) @@ -278,18 +302,18 @@ async fn copy_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> { let (data, len) = wrap_stream(orig.clone()); - ctx.client.upload(data, len, &path, None).await?; + ctx.client.upload(data, len, &path, None, &cancel).await?; // Normal download request - ctx.client.copy_object(&path, &path_dest).await?; + ctx.client.copy_object(&path, &path_dest, &cancel).await?; - let dl = ctx.client.download(&path_dest).await?; + let dl = ctx.client.download(&path_dest, &cancel).await?; let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig); debug!("Cleanup: deleting file at path {path:?}"); ctx.client - .delete_objects(&[path.clone(), path_dest.clone()]) + .delete_objects(&[path.clone(), path_dest.clone()], &cancel) .await .with_context(|| format!("{path:?} removal"))?; diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs index 6f9a1ec6f7..6adddf52a9 100644 --- a/libs/remote_storage/tests/test_real_azure.rs +++ b/libs/remote_storage/tests/test_real_azure.rs @@ -1,9 +1,9 @@ -use std::collections::HashSet; use std::env; use std::num::NonZeroUsize; use std::ops::ControlFlow; use std::sync::Arc; use std::time::UNIX_EPOCH; +use std::{collections::HashSet, time::Duration}; use anyhow::Context; use remote_storage::{ @@ -39,6 +39,17 @@ impl EnabledAzure { base_prefix: BASE_PREFIX, } } + + #[allow(unused)] // this will be needed when moving the timeout integration tests back + fn configure_request_timeout(&mut self, timeout: Duration) { + match Arc::get_mut(&mut self.client).expect("outer Arc::get_mut") { + GenericRemoteStorage::AzureBlob(azure) => { + let azure = Arc::get_mut(azure).expect("inner Arc::get_mut"); + azure.timeout = timeout; + } + _ => unreachable!(), + } + } } enum MaybeEnabledStorage { @@ -213,6 +224,7 @@ fn create_azure_client( concurrency_limit: NonZeroUsize::new(100).unwrap(), max_keys_per_list_response, }), + timeout: Duration::from_secs(120), }; Ok(Arc::new( GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?, diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index 3dc8347c83..e927b40e80 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -1,5 +1,6 @@ use std::env; use std::fmt::{Debug, Display}; +use std::future::Future; use std::num::NonZeroUsize; use std::ops::ControlFlow; use std::sync::Arc; @@ -9,9 +10,10 @@ use std::{collections::HashSet, time::SystemTime}; use crate::common::{download_to_vec, upload_stream}; use anyhow::Context; use camino::Utf8Path; -use futures_util::Future; +use futures_util::StreamExt; use remote_storage::{ - GenericRemoteStorage, RemotePath, 
RemoteStorageConfig, RemoteStorageKind, S3Config, + DownloadError, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, + S3Config, }; use test_context::test_context; use test_context::AsyncTestContext; @@ -27,7 +29,6 @@ use common::{cleanup, ensure_logging_ready, upload_remote_data, upload_simple_re use utils::backoff; const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE"; - const BASE_PREFIX: &str = "test"; #[test_context(MaybeEnabledStorage)] @@ -69,8 +70,11 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: ret } - async fn list_files(client: &Arc) -> anyhow::Result> { - Ok(retry(|| client.list_files(None, None)) + async fn list_files( + client: &Arc, + cancel: &CancellationToken, + ) -> anyhow::Result> { + Ok(retry(|| client.list_files(None, None, cancel)) .await .context("list root files failure")? .into_iter() @@ -90,11 +94,11 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: retry(|| { let (data, len) = upload_stream("remote blob data1".as_bytes().into()); - ctx.client.upload(data, len, &path1, None) + ctx.client.upload(data, len, &path1, None, &cancel) }) .await?; - let t0_files = list_files(&ctx.client).await?; + let t0_files = list_files(&ctx.client, &cancel).await?; let t0 = time_point().await; println!("at t0: {t0_files:?}"); @@ -102,17 +106,17 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: retry(|| { let (data, len) = upload_stream(old_data.as_bytes().into()); - ctx.client.upload(data, len, &path2, None) + ctx.client.upload(data, len, &path2, None, &cancel) }) .await?; - let t1_files = list_files(&ctx.client).await?; + let t1_files = list_files(&ctx.client, &cancel).await?; let t1 = time_point().await; println!("at t1: {t1_files:?}"); // A little check to ensure that our clock is not too far off from the S3 clock { - let dl = retry(|| ctx.client.download(&path2)).await?; + let dl = retry(|| ctx.client.download(&path2, &cancel)).await?; let last_modified = dl.last_modified.unwrap(); let half_wt = WAIT_TIME.mul_f32(0.5); let t0_hwt = t0 + half_wt; @@ -125,7 +129,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: retry(|| { let (data, len) = upload_stream("remote blob data3".as_bytes().into()); - ctx.client.upload(data, len, &path3, None) + ctx.client.upload(data, len, &path3, None, &cancel) }) .await?; @@ -133,12 +137,12 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: retry(|| { let (data, len) = upload_stream(new_data.as_bytes().into()); - ctx.client.upload(data, len, &path2, None) + ctx.client.upload(data, len, &path2, None, &cancel) }) .await?; - retry(|| ctx.client.delete(&path1)).await?; - let t2_files = list_files(&ctx.client).await?; + retry(|| ctx.client.delete(&path1, &cancel)).await?; + let t2_files = list_files(&ctx.client, &cancel).await?; let t2 = time_point().await; println!("at t2: {t2_files:?}"); @@ -147,10 +151,10 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: ctx.client .time_travel_recover(None, t2, t_final, &cancel) .await?; - let t2_files_recovered = list_files(&ctx.client).await?; + let t2_files_recovered = list_files(&ctx.client, &cancel).await?; println!("after recovery to t2: {t2_files_recovered:?}"); assert_eq!(t2_files, t2_files_recovered); - let path2_recovered_t2 = download_to_vec(ctx.client.download(&path2).await?).await?; + let path2_recovered_t2 = 
download_to_vec(ctx.client.download(&path2, &cancel).await?).await?; assert_eq!(path2_recovered_t2, new_data.as_bytes()); // after recovery to t1: path1 is back, path2 has the old content @@ -158,10 +162,10 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: ctx.client .time_travel_recover(None, t1, t_final, &cancel) .await?; - let t1_files_recovered = list_files(&ctx.client).await?; + let t1_files_recovered = list_files(&ctx.client, &cancel).await?; println!("after recovery to t1: {t1_files_recovered:?}"); assert_eq!(t1_files, t1_files_recovered); - let path2_recovered_t1 = download_to_vec(ctx.client.download(&path2).await?).await?; + let path2_recovered_t1 = download_to_vec(ctx.client.download(&path2, &cancel).await?).await?; assert_eq!(path2_recovered_t1, old_data.as_bytes()); // after recovery to t0: everything is gone except for path1 @@ -169,14 +173,14 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: ctx.client .time_travel_recover(None, t0, t_final, &cancel) .await?; - let t0_files_recovered = list_files(&ctx.client).await?; + let t0_files_recovered = list_files(&ctx.client, &cancel).await?; println!("after recovery to t0: {t0_files_recovered:?}"); assert_eq!(t0_files, t0_files_recovered); // cleanup let paths = &[path1, path2, path3]; - retry(|| ctx.client.delete_objects(paths)).await?; + retry(|| ctx.client.delete_objects(paths, &cancel)).await?; Ok(()) } @@ -197,6 +201,16 @@ impl EnabledS3 { base_prefix: BASE_PREFIX, } } + + fn configure_request_timeout(&mut self, timeout: Duration) { + match Arc::get_mut(&mut self.client).expect("outer Arc::get_mut") { + GenericRemoteStorage::AwsS3(s3) => { + let s3 = Arc::get_mut(s3).expect("inner Arc::get_mut"); + s3.timeout = timeout; + } + _ => unreachable!(), + } + } } enum MaybeEnabledStorage { @@ -370,8 +384,169 @@ fn create_s3_client( concurrency_limit: NonZeroUsize::new(100).unwrap(), max_keys_per_list_response, }), + timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }; Ok(Arc::new( GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?, )) } + +#[test_context(MaybeEnabledStorage)] +#[tokio::test] +async fn download_is_timeouted(ctx: &mut MaybeEnabledStorage) { + let MaybeEnabledStorage::Enabled(ctx) = ctx else { + return; + }; + + let cancel = CancellationToken::new(); + + let path = RemotePath::new(Utf8Path::new( + format!("{}/file_to_copy", ctx.base_prefix).as_str(), + )) + .unwrap(); + + let len = upload_large_enough_file(&ctx.client, &path, &cancel).await; + + let timeout = std::time::Duration::from_secs(5); + + ctx.configure_request_timeout(timeout); + + let started_at = std::time::Instant::now(); + let mut stream = ctx + .client + .download(&path, &cancel) + .await + .expect("download succeeds") + .download_stream; + + if started_at.elapsed().mul_f32(0.9) >= timeout { + tracing::warn!( + elapsed_ms = started_at.elapsed().as_millis(), + "timeout might be too low, consumed most of it during headers" + ); + } + + let first = stream + .next() + .await + .expect("should have the first blob") + .expect("should have succeeded"); + + tracing::info!(len = first.len(), "downloaded first chunk"); + + assert!( + first.len() < len, + "uploaded file is too small, we downloaded all on first chunk" + ); + + tokio::time::sleep(timeout).await; + + { + let started_at = std::time::Instant::now(); + let next = stream + .next() + .await + .expect("stream should not have ended yet"); + + tracing::info!( + next.is_err = next.is_err(), + elapsed_ms = 
started_at.elapsed().as_millis(), + "received item after timeout" + ); + + let e = next.expect_err("expected an error, but got a chunk?"); + + let inner = e.get_ref().expect("std::io::Error::inner should be set"); + assert!( + inner + .downcast_ref::() + .is_some_and(|e| matches!(e, DownloadError::Timeout)), + "{inner:?}" + ); + } + + ctx.configure_request_timeout(RemoteStorageConfig::DEFAULT_TIMEOUT); + + ctx.client.delete_objects(&[path], &cancel).await.unwrap() +} + +#[test_context(MaybeEnabledStorage)] +#[tokio::test] +async fn download_is_cancelled(ctx: &mut MaybeEnabledStorage) { + let MaybeEnabledStorage::Enabled(ctx) = ctx else { + return; + }; + + let cancel = CancellationToken::new(); + + let path = RemotePath::new(Utf8Path::new( + format!("{}/file_to_copy", ctx.base_prefix).as_str(), + )) + .unwrap(); + + let len = upload_large_enough_file(&ctx.client, &path, &cancel).await; + + { + let mut stream = ctx + .client + .download(&path, &cancel) + .await + .expect("download succeeds") + .download_stream; + + let first = stream + .next() + .await + .expect("should have the first blob") + .expect("should have succeeded"); + + tracing::info!(len = first.len(), "downloaded first chunk"); + + assert!( + first.len() < len, + "uploaded file is too small, we downloaded all on first chunk" + ); + + cancel.cancel(); + + let next = stream.next().await.expect("stream should have more"); + + let e = next.expect_err("expected an error, but got a chunk?"); + + let inner = e.get_ref().expect("std::io::Error::inner should be set"); + assert!( + inner + .downcast_ref::() + .is_some_and(|e| matches!(e, DownloadError::Cancelled)), + "{inner:?}" + ); + } + + let cancel = CancellationToken::new(); + + ctx.client.delete_objects(&[path], &cancel).await.unwrap(); +} + +/// Upload a long enough file so that we cannot download it in single chunk +/// +/// For s3 the first chunk seems to be less than 10kB, so this has a bit of a safety margin +async fn upload_large_enough_file( + client: &GenericRemoteStorage, + path: &RemotePath, + cancel: &CancellationToken, +) -> usize { + let header = bytes::Bytes::from_static("remote blob data content".as_bytes()); + let body = bytes::Bytes::from(vec![0u8; 1024]); + let contents = std::iter::once(header).chain(std::iter::repeat(body).take(128)); + + let len = contents.clone().fold(0, |acc, next| acc + next.len()); + + let contents = futures::stream::iter(contents.map(std::io::Result::Ok)); + + client + .upload(contents, len, path, None, cancel) + .await + .expect("upload succeeds"); + + len +} diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 1989bef817..6d71ff1dd4 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -1359,6 +1359,7 @@ broker_endpoint = '{broker_endpoint}' parsed_remote_storage_config, RemoteStorageConfig { storage: RemoteStorageKind::LocalFs(local_storage_path.clone()), + timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }, "Remote storage config should correctly parse the local FS config and fill other storage defaults" ); @@ -1426,6 +1427,7 @@ broker_endpoint = '{broker_endpoint}' concurrency_limit: s3_concurrency_limit, max_keys_per_list_response: None, }), + timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }, "Remote storage config should correctly parse the S3 config" ); diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 81938b14b3..62ba702db7 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -867,6 +867,7 @@ mod test { let remote_fs_dir = 
harness.conf.workdir.join("remote_fs").canonicalize_utf8()?; let storage_config = RemoteStorageConfig { storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()), + timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }; let storage = GenericRemoteStorage::from_config(&storage_config).unwrap(); @@ -1170,6 +1171,7 @@ pub(crate) mod mock { pub struct ConsumerState { rx: tokio::sync::mpsc::UnboundedReceiver, executor_rx: tokio::sync::mpsc::Receiver, + cancel: CancellationToken, } impl ConsumerState { @@ -1183,7 +1185,7 @@ pub(crate) mod mock { match msg { DeleterMessage::Delete(objects) => { for path in objects { - match remote_storage.delete(&path).await { + match remote_storage.delete(&path, &self.cancel).await { Ok(_) => { debug!("Deleted {path}"); } @@ -1216,7 +1218,7 @@ pub(crate) mod mock { for path in objects { info!("Executing deletion {path}"); - match remote_storage.delete(&path).await { + match remote_storage.delete(&path, &self.cancel).await { Ok(_) => { debug!("Deleted {path}"); } @@ -1266,7 +1268,11 @@ pub(crate) mod mock { executor_tx, executed, remote_storage, - consumer: std::sync::Mutex::new(ConsumerState { rx, executor_rx }), + consumer: std::sync::Mutex::new(ConsumerState { + rx, + executor_rx, + cancel: CancellationToken::new(), + }), lsn_table: Arc::new(std::sync::RwLock::new(VisibleLsnUpdates::new())), } } diff --git a/pageserver/src/deletion_queue/deleter.rs b/pageserver/src/deletion_queue/deleter.rs index a75c73f2b1..1f04bc0410 100644 --- a/pageserver/src/deletion_queue/deleter.rs +++ b/pageserver/src/deletion_queue/deleter.rs @@ -8,6 +8,7 @@ use remote_storage::GenericRemoteStorage; use remote_storage::RemotePath; +use remote_storage::TimeoutOrCancel; use remote_storage::MAX_KEYS_PER_DELETE; use std::time::Duration; use tokio_util::sync::CancellationToken; @@ -71,9 +72,11 @@ impl Deleter { Err(anyhow::anyhow!("failpoint: deletion-queue-before-execute")) }); - self.remote_storage.delete_objects(&self.accumulator).await + self.remote_storage + .delete_objects(&self.accumulator, &self.cancel) + .await }, - |_| false, + TimeoutOrCancel::caused_by_cancel, 3, 10, "executing deletion batch", diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 88f4ae7086..e500a6123c 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -25,6 +25,7 @@ use pageserver_api::shard::ShardIdentity; use pageserver_api::shard::TenantShardId; use remote_storage::DownloadError; use remote_storage::GenericRemoteStorage; +use remote_storage::TimeoutOrCancel; use std::fmt; use storage_broker::BrokerClientChannel; use tokio::io::BufReader; @@ -3339,7 +3340,7 @@ impl Tenant { &self.cancel, ) .await - .ok_or_else(|| anyhow::anyhow!("Cancelled")) + .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) .and_then(|x| x) } @@ -3389,8 +3390,10 @@ impl Tenant { ); let dest_path = &remote_initdb_archive_path(&self.tenant_shard_id.tenant_id, &timeline_id); + + // if this fails, it will get retried by retried control plane requests storage - .copy_object(source_path, dest_path) + .copy_object(source_path, dest_path, &self.cancel) .await .context("copy initdb tar")?; } @@ -4031,6 +4034,7 @@ pub(crate) mod harness { std::fs::create_dir_all(&remote_fs_dir).unwrap(); let config = RemoteStorageConfig { storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()), + timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }; let remote_storage = GenericRemoteStorage::from_config(&config).unwrap(); let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone())); diff --git 
a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index 0e192b577c..b64be8dcc5 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -3,7 +3,7 @@ use std::sync::Arc; use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::{models::TenantState, shard::TenantShardId}; -use remote_storage::{GenericRemoteStorage, RemotePath}; +use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use tokio::sync::OwnedMutexGuard; use tokio_util::sync::CancellationToken; use tracing::{error, instrument, Instrument}; @@ -84,17 +84,17 @@ async fn create_remote_delete_mark( let data = bytes::Bytes::from_static(data); let stream = futures::stream::once(futures::future::ready(Ok(data))); remote_storage - .upload(stream, 0, &remote_mark_path, None) + .upload(stream, 0, &remote_mark_path, None, cancel) .await }, - |_e| false, + TimeoutOrCancel::caused_by_cancel, FAILED_UPLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, "mark_upload", cancel, ) .await - .ok_or_else(|| anyhow::anyhow!("Cancelled")) + .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) .and_then(|x| x) .context("mark_upload")?; @@ -184,15 +184,15 @@ async fn remove_tenant_remote_delete_mark( if let Some(remote_storage) = remote_storage { let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?; backoff::retry( - || async { remote_storage.delete(&path).await }, - |_e| false, + || async { remote_storage.delete(&path, cancel).await }, + TimeoutOrCancel::caused_by_cancel, FAILED_UPLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, "remove_tenant_remote_delete_mark", cancel, ) .await - .ok_or_else(|| anyhow::anyhow!("Cancelled")) + .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) .and_then(|x| x) .context("remove_tenant_remote_delete_mark")?; } diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 483f53d5c8..91e1179e53 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -196,14 +196,12 @@ pub(crate) use upload::upload_initdb_dir; use utils::backoff::{ self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, }; -use utils::timeout::{timeout_cancellable, TimeoutCancellableError}; use std::collections::{HashMap, VecDeque}; use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Mutex}; -use std::time::Duration; -use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath}; +use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use std::ops::DerefMut; use tracing::{debug, error, info, instrument, warn}; use tracing::{info_span, Instrument}; @@ -263,11 +261,6 @@ pub(crate) const INITDB_PRESERVED_PATH: &str = "initdb-preserved.tar.zst"; /// Default buffer size when interfacing with [`tokio::fs::File`]. pub(crate) const BUFFER_SIZE: usize = 32 * 1024; -/// This timeout is intended to deal with hangs in lower layers, e.g. stuck TCP flows. It is not -/// intended to be snappy enough for prompt shutdown, as we have a CancellationToken for that. -pub(crate) const UPLOAD_TIMEOUT: Duration = Duration::from_secs(120); -pub(crate) const DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(120); - pub enum MaybeDeletedIndexPart { IndexPart(IndexPart), Deleted(IndexPart), @@ -331,40 +324,6 @@ pub struct RemoteTimelineClient { cancel: CancellationToken, } -/// Wrapper for timeout_cancellable that flattens result and converts TimeoutCancellableError to anyhow. 
-/// -/// This is a convenience for the various upload functions. In future -/// the anyhow::Error result should be replaced with a more structured type that -/// enables callers to avoid handling shutdown as an error. -async fn upload_cancellable(cancel: &CancellationToken, future: F) -> anyhow::Result<()> -where - F: std::future::Future>, -{ - match timeout_cancellable(UPLOAD_TIMEOUT, cancel, future).await { - Ok(Ok(())) => Ok(()), - Ok(Err(e)) => Err(e), - Err(TimeoutCancellableError::Timeout) => Err(anyhow::anyhow!("Timeout")), - Err(TimeoutCancellableError::Cancelled) => Err(anyhow::anyhow!("Shutting down")), - } -} -/// Wrapper for timeout_cancellable that flattens result and converts TimeoutCancellableError to DownloaDError. -async fn download_cancellable( - cancel: &CancellationToken, - future: F, -) -> Result -where - F: std::future::Future>, -{ - match timeout_cancellable(DOWNLOAD_TIMEOUT, cancel, future).await { - Ok(Ok(r)) => Ok(r), - Ok(Err(e)) => Err(e), - Err(TimeoutCancellableError::Timeout) => { - Err(DownloadError::Other(anyhow::anyhow!("Timed out"))) - } - Err(TimeoutCancellableError::Cancelled) => Err(DownloadError::Cancelled), - } -} - impl RemoteTimelineClient { /// /// Create a remote storage client for given timeline @@ -1050,7 +1009,7 @@ impl RemoteTimelineClient { &self.cancel, ) .await - .ok_or_else(|| anyhow::anyhow!("Cancelled")) + .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) .and_then(|x| x)?; // all good, disarm the guard and mark as success @@ -1082,14 +1041,14 @@ impl RemoteTimelineClient { upload::preserve_initdb_archive(&self.storage_impl, tenant_id, timeline_id, cancel) .await }, - |_e| false, + TimeoutOrCancel::caused_by_cancel, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, "preserve_initdb_tar_zst", &cancel.clone(), ) .await - .ok_or_else(|| anyhow::anyhow!("Cancellled")) + .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) .and_then(|x| x) .context("backing up initdb archive")?; Ok(()) @@ -1151,7 +1110,7 @@ impl RemoteTimelineClient { let remaining = download_retry( || async { self.storage_impl - .list_files(Some(&timeline_storage_path), None) + .list_files(Some(&timeline_storage_path), None, &cancel) .await }, "list remaining files", @@ -1445,6 +1404,10 @@ impl RemoteTimelineClient { Ok(()) => { break; } + Err(e) if TimeoutOrCancel::caused_by_cancel(&e) => { + // loop around to do the proper stopping + continue; + } Err(e) => { let retries = task.retries.fetch_add(1, Ordering::SeqCst); diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index e755cd08f3..43f5e6c182 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -11,16 +11,14 @@ use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::shard::TenantShardId; use tokio::fs::{self, File, OpenOptions}; use tokio::io::{AsyncSeekExt, AsyncWriteExt}; +use tokio_util::io::StreamReader; use tokio_util::sync::CancellationToken; use tracing::warn; -use utils::timeout::timeout_cancellable; use utils::{backoff, crashsafe}; use crate::config::PageServerConf; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; -use crate::tenant::remote_timeline_client::{ - download_cancellable, remote_layer_path, remote_timelines_path, DOWNLOAD_TIMEOUT, -}; +use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; use crate::tenant::storage_layer::LayerFileName; use 
crate::tenant::Generation; use crate::virtual_file::on_fatal_io_error; @@ -83,15 +81,13 @@ pub async fn download_layer_file<'a>( .with_context(|| format!("create a destination file for layer '{temp_file_path}'")) .map_err(DownloadError::Other)?; - // Cancellation safety: it is safe to cancel this future, because it isn't writing to a local - // file: the write to local file doesn't start until after the request header is returned - // and we start draining the body stream below - let download = download_cancellable(cancel, storage.download(&remote_path)) + let download = storage + .download(&remote_path, cancel) .await .with_context(|| { format!( - "open a download stream for layer with remote storage path '{remote_path:?}'" - ) + "open a download stream for layer with remote storage path '{remote_path:?}'" + ) }) .map_err(DownloadError::Other)?; @@ -100,43 +96,26 @@ pub async fn download_layer_file<'a>( let mut reader = tokio_util::io::StreamReader::new(download.download_stream); - // Cancellation safety: it is safe to cancel this future because it is writing into a temporary file, - // and we will unlink the temporary file if there is an error. This unlink is important because we - // are in a retry loop, and we wouldn't want to leave behind a rogue write I/O to a file that - // we will imminiently try and write to again. - let bytes_amount: u64 = match timeout_cancellable( - DOWNLOAD_TIMEOUT, - cancel, - tokio::io::copy_buf(&mut reader, &mut destination_file), - ) - .await - .with_context(|| { - format!( + let bytes_amount = tokio::io::copy_buf(&mut reader, &mut destination_file) + .await + .with_context(|| format!( "download layer at remote path '{remote_path:?}' into file {temp_file_path:?}" - ) - }) - .map_err(DownloadError::Other)? - { - Ok(b) => Ok(b), + )) + .map_err(DownloadError::Other); + + match bytes_amount { + Ok(bytes_amount) => { + let destination_file = destination_file.into_inner(); + Ok((destination_file, bytes_amount)) + } Err(e) => { - // Remove incomplete files: on restart Timeline would do this anyway, but we must - // do it here for the retry case. if let Err(e) = tokio::fs::remove_file(&temp_file_path).await { on_fatal_io_error(&e, &format!("Removing temporary file {temp_file_path}")); } + Err(e) } } - .with_context(|| { - format!( - "download layer at remote path '{remote_path:?}' into file {temp_file_path:?}" - ) - }) - .map_err(DownloadError::Other)?; - - let destination_file = destination_file.into_inner(); - - Ok((destination_file, bytes_amount)) }, &format!("download {remote_path:?}"), cancel, @@ -218,9 +197,11 @@ pub async fn list_remote_timelines( let listing = download_retry_forever( || { - download_cancellable( + storage.list( + Some(&remote_path), + ListingMode::WithDelimiter, + None, &cancel, - storage.list(Some(&remote_path), ListingMode::WithDelimiter, None), ) }, &format!("list timelines for {tenant_shard_id}"), @@ -259,26 +240,23 @@ async fn do_download_index_part( index_generation: Generation, cancel: &CancellationToken, ) -> Result { - use futures::stream::StreamExt; - let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation); let index_part_bytes = download_retry_forever( || async { - // Cancellation: if is safe to cancel this future because we're just downloading into - // a memory buffer, not touching local disk. 
- let index_part_download = - download_cancellable(cancel, storage.download(&remote_path)).await?; + let download = storage.download(&remote_path, cancel).await?; - let mut index_part_bytes = Vec::new(); - let mut stream = std::pin::pin!(index_part_download.download_stream); - while let Some(chunk) = stream.next().await { - let chunk = chunk - .with_context(|| format!("download index part at {remote_path:?}")) - .map_err(DownloadError::Other)?; - index_part_bytes.extend_from_slice(&chunk[..]); - } - Ok(index_part_bytes) + let mut bytes = Vec::new(); + + let stream = download.download_stream; + let mut stream = StreamReader::new(stream); + + tokio::io::copy_buf(&mut stream, &mut bytes) + .await + .with_context(|| format!("download index part at {remote_path:?}")) + .map_err(DownloadError::Other)?; + + Ok(bytes) }, &format!("download {remote_path:?}"), cancel, @@ -373,7 +351,7 @@ pub(super) async fn download_index_part( let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none()); let indices = download_retry( - || async { storage.list_files(Some(&index_prefix), None).await }, + || async { storage.list_files(Some(&index_prefix), None, cancel).await }, "list index_part files", cancel, ) @@ -446,11 +424,10 @@ pub(crate) async fn download_initdb_tar_zst( .with_context(|| format!("tempfile creation {temp_path}")) .map_err(DownloadError::Other)?; - let download = match download_cancellable(cancel, storage.download(&remote_path)).await - { + let download = match storage.download(&remote_path, cancel).await { Ok(dl) => dl, Err(DownloadError::NotFound) => { - download_cancellable(cancel, storage.download(&remote_preserved_path)).await? + storage.download(&remote_preserved_path, cancel).await? } Err(other) => Err(other)?, }; @@ -460,6 +437,7 @@ pub(crate) async fn download_initdb_tar_zst( // TODO: this consumption of the response body should be subject to timeout + cancellation, but // not without thinking carefully about how to recover safely from cancelling a write to // local storage (e.g. 
by writing into a temp file as we do in download_layer) + // FIXME: flip the weird error wrapping tokio::io::copy_buf(&mut download, &mut writer) .await .with_context(|| format!("download initdb.tar.zst at {remote_path:?}")) diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index c17e27b446..137fe48b73 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -16,7 +16,7 @@ use crate::{ config::PageServerConf, tenant::remote_timeline_client::{ index::IndexPart, remote_index_path, remote_initdb_archive_path, - remote_initdb_preserved_archive_path, remote_path, upload_cancellable, + remote_initdb_preserved_archive_path, remote_path, }, }; use remote_storage::{GenericRemoteStorage, TimeTravelError}; @@ -49,16 +49,15 @@ pub(crate) async fn upload_index_part<'a>( let index_part_bytes = bytes::Bytes::from(index_part_bytes); let remote_path = remote_index_path(tenant_shard_id, timeline_id, generation); - upload_cancellable( - cancel, - storage.upload_storage_object( + storage + .upload_storage_object( futures::stream::once(futures::future::ready(Ok(index_part_bytes))), index_part_size, &remote_path, - ), - ) - .await - .with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'")) + cancel, + ) + .await + .with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'")) } /// Attempts to upload given layer files. @@ -115,11 +114,10 @@ pub(super) async fn upload_timeline_layer<'a>( let reader = tokio_util::io::ReaderStream::with_capacity(source_file, super::BUFFER_SIZE); - upload_cancellable(cancel, storage.upload(reader, fs_size, &storage_path, None)) + storage + .upload(reader, fs_size, &storage_path, None, cancel) .await - .with_context(|| format!("upload layer from local path '{source_path}'"))?; - - Ok(()) + .with_context(|| format!("upload layer from local path '{source_path}'")) } /// Uploads the given `initdb` data to the remote storage. 
@@ -139,12 +137,10 @@ pub(crate) async fn upload_initdb_dir( let file = tokio_util::io::ReaderStream::with_capacity(initdb_tar_zst, super::BUFFER_SIZE); let remote_path = remote_initdb_archive_path(tenant_id, timeline_id); - upload_cancellable( - cancel, - storage.upload_storage_object(file, size as usize, &remote_path), - ) - .await - .with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'")) + storage + .upload_storage_object(file, size as usize, &remote_path, cancel) + .await + .with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'")) } pub(crate) async fn preserve_initdb_archive( @@ -155,7 +151,8 @@ pub(crate) async fn preserve_initdb_archive( ) -> anyhow::Result<()> { let source_path = remote_initdb_archive_path(tenant_id, timeline_id); let dest_path = remote_initdb_preserved_archive_path(tenant_id, timeline_id); - upload_cancellable(cancel, storage.copy_object(&source_path, &dest_path)) + storage + .copy_object(&source_path, &dest_path, cancel) .await .with_context(|| format!("backing up initdb archive for '{tenant_id} / {timeline_id}'")) } diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index c23416a7f0..6966cf7709 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -523,12 +523,13 @@ impl<'a> TenantDownloader<'a> { tracing::debug!("Downloading heatmap for secondary tenant",); let heatmap_path = remote_heatmap_path(tenant_shard_id); + let cancel = &self.secondary_state.cancel; let heatmap_bytes = backoff::retry( || async { let download = self .remote_storage - .download(&heatmap_path) + .download(&heatmap_path, cancel) .await .map_err(UpdateError::from)?; let mut heatmap_bytes = Vec::new(); @@ -540,7 +541,7 @@ impl<'a> TenantDownloader<'a> { FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, "download heatmap", - &self.secondary_state.cancel, + cancel, ) .await .ok_or_else(|| UpdateError::Cancelled) diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs index 806e3fb0e8..660459a733 100644 --- a/pageserver/src/tenant/secondary/heatmap_uploader.rs +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -21,18 +21,17 @@ use futures::Future; use md5; use pageserver_api::shard::TenantShardId; use rand::Rng; -use remote_storage::GenericRemoteStorage; +use remote_storage::{GenericRemoteStorage, TimeoutOrCancel}; use super::{ + heatmap::HeatMapTenant, scheduler::{self, JobGenerator, RunningJob, SchedulingResult, TenantBackgroundJobs}, - CommandRequest, + CommandRequest, UploadCommand, }; use tokio_util::sync::CancellationToken; use tracing::{info_span, instrument, Instrument}; use utils::{backoff, completion::Barrier, yielding_loop::yielding_loop}; -use super::{heatmap::HeatMapTenant, UploadCommand}; - pub(super) async fn heatmap_uploader_task( tenant_manager: Arc, remote_storage: GenericRemoteStorage, @@ -417,10 +416,10 @@ async fn upload_tenant_heatmap( || async { let bytes = futures::stream::once(futures::future::ready(Ok(bytes.clone()))); remote_storage - .upload_storage_object(bytes, size, &path) + .upload_storage_object(bytes, size, &path, cancel) .await }, - |_| false, + TimeoutOrCancel::caused_by_cancel, 3, u32::MAX, "Uploading heatmap", diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index ad22829183..d941445c2d 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -13,7 +13,7 @@ use parquet::{ 
}, record::RecordWriter, }; -use remote_storage::{GenericRemoteStorage, RemotePath, RemoteStorageConfig}; +use remote_storage::{GenericRemoteStorage, RemotePath, RemoteStorageConfig, TimeoutOrCancel}; use tokio::{sync::mpsc, time}; use tokio_util::sync::CancellationToken; use tracing::{debug, info, Span}; @@ -314,20 +314,23 @@ async fn upload_parquet( let path = RemotePath::from_string(&format!( "{year:04}/{month:02}/{day:02}/{hour:02}/requests_{id}.parquet" ))?; + let cancel = CancellationToken::new(); backoff::retry( || async { let stream = futures::stream::once(futures::future::ready(Ok(data.clone()))); - storage.upload(stream, data.len(), &path, None).await + storage + .upload(stream, data.len(), &path, None, &cancel) + .await }, - |_e| false, + TimeoutOrCancel::caused_by_cancel, FAILED_UPLOAD_WARN_THRESHOLD, FAILED_UPLOAD_MAX_RETRIES, "request_data_upload", // we don't want cancellation to interrupt here, so we make a dummy cancel token - &CancellationToken::new(), + &cancel, ) .await - .ok_or_else(|| anyhow::anyhow!("Cancelled")) + .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) .and_then(|x| x) .context("request_data_upload")?; @@ -413,7 +416,8 @@ mod tests { ) .unwrap(), max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, - }) + }), + timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }) ); assert_eq!(parquet_upload.parquet_upload_row_group_size, 100); @@ -466,6 +470,7 @@ mod tests { ) -> Vec<(u64, usize, i64)> { let remote_storage_config = RemoteStorageConfig { storage: RemoteStorageKind::LocalFs(tmpdir.to_path_buf()), + timeout: std::time::Duration::from_secs(120), }; let storage = GenericRemoteStorage::from_config(&remote_storage_config).unwrap(); diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index dbdc742d26..944d80f777 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -511,7 +511,11 @@ async fn backup_object( let file = tokio_util::io::ReaderStream::with_capacity(file, BUFFER_SIZE); - storage.upload_storage_object(file, size, target_file).await + let cancel = CancellationToken::new(); + + storage + .upload_storage_object(file, size, target_file, &cancel) + .await } pub async fn read_object( @@ -526,8 +530,10 @@ pub async fn read_object( info!("segment download about to start from remote path {file_path:?} at offset {offset}"); + let cancel = CancellationToken::new(); + let download = storage - .download_storage_object(Some((offset, None)), file_path) + .download_storage_object(Some((offset, None)), file_path, &cancel) .await .with_context(|| { format!("Failed to open WAL segment download stream for remote path {file_path:?}") @@ -559,7 +565,8 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> { // Note: listing segments might take a long time if there are many of them. // We don't currently have http requests timeout cancellation, but if/once // we have listing should get streaming interface to make progress. - let token = CancellationToken::new(); // not really used + + let cancel = CancellationToken::new(); // not really used backoff::retry( || async { // Do list-delete in batch_size batches to make progress even if there a lot of files. @@ -567,7 +574,7 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> { // I'm not sure deleting while iterating is expected in s3. 
loop { let files = storage - .list_files(Some(&remote_path), Some(batch_size)) + .list_files(Some(&remote_path), Some(batch_size), &cancel) .await?; if files.is_empty() { return Ok(()); // done @@ -580,14 +587,15 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> { files.first().unwrap().object_name().unwrap_or(""), files.last().unwrap().object_name().unwrap_or("") ); - storage.delete_objects(&files).await?; + storage.delete_objects(&files, &cancel).await?; } }, + // consider TimeoutOrCancel::caused_by_cancel when using cancellation |_| false, 3, 10, "executing WAL segments deletion batch", - &token, + &cancel, ) .await .ok_or_else(|| anyhow::anyhow!("canceled")) @@ -617,7 +625,12 @@ pub async fn copy_s3_segments( let remote_path = RemotePath::new(&relative_dst_path)?; - let files = storage.list_files(Some(&remote_path), None).await?; + let cancel = CancellationToken::new(); + + let files = storage + .list_files(Some(&remote_path), None, &cancel) + .await?; + let uploaded_segments = &files .iter() .filter_map(|file| file.object_name().map(ToOwned::to_owned)) @@ -645,7 +658,7 @@ pub async fn copy_s3_segments( let from = RemotePath::new(&relative_src_path.join(&segment_name))?; let to = RemotePath::new(&relative_dst_path.join(&segment_name))?; - storage.copy_object(&from, &to).await?; + storage.copy_object(&from, &to, &cancel).await?; } info!( From 5fa747e493bbbcc6878c03742c5a63622ec31165 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 15 Feb 2024 08:21:53 +0000 Subject: [PATCH 0182/1571] pageserver: shard splitting refinements (parent deletion, hard linking) (#6725) ## Problem - We weren't deleting parent shard contents once the split was done - Re-downloading layers into child shards is wasteful ## Summary of changes - Hard-link layers into child chart local storage during split - Delete parent shards content at the end --------- Co-authored-by: Joonas Koivunen --- pageserver/src/tenant/mgr.rs | 154 ++++++++++++++++++++++++++- test_runner/regress/test_sharding.py | 15 +++ 2 files changed, 165 insertions(+), 4 deletions(-) diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 9aee39bd35..7260080720 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -2,6 +2,7 @@ //! page server. use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf}; +use futures::stream::StreamExt; use itertools::Itertools; use pageserver_api::key::Key; use pageserver_api::models::ShardParameters; @@ -1439,8 +1440,10 @@ impl TenantManager { } }; - // TODO: hardlink layers from the parent into the child shard directories so that they don't immediately re-download - // TODO: erase the dentries from the parent + // Optimization: hardlink layers from the parent into the children, so that they don't have to + // re-download & duplicate the data referenced in their initial IndexPart + self.shard_split_hardlink(parent, child_shards.clone()) + .await?; // Take a snapshot of where the parent's WAL ingest had got to: we will wait for // child shards to reach this point. 
@@ -1479,10 +1482,11 @@ impl TenantManager { // Phase 4: wait for child chards WAL ingest to catch up to target LSN for child_shard_id in &child_shards { + let child_shard_id = *child_shard_id; let child_shard = { let locked = TENANTS.read().unwrap(); let peek_slot = - tenant_map_peek_slot(&locked, child_shard_id, TenantSlotPeekMode::Read)?; + tenant_map_peek_slot(&locked, &child_shard_id, TenantSlotPeekMode::Read)?; peek_slot.and_then(|s| s.get_attached()).cloned() }; if let Some(t) = child_shard { @@ -1517,7 +1521,7 @@ impl TenantManager { } } - // Phase 5: Shut down the parent shard. + // Phase 5: Shut down the parent shard, and erase it from disk let (_guard, progress) = completion::channel(); match parent.shutdown(progress, false).await { Ok(()) => {} @@ -1525,6 +1529,24 @@ impl TenantManager { other.wait().await; } } + let local_tenant_directory = self.conf.tenant_path(&tenant_shard_id); + let tmp_path = safe_rename_tenant_dir(&local_tenant_directory) + .await + .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?; + task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), + TaskKind::MgmtRequest, + None, + None, + "tenant_files_delete", + false, + async move { + fs::remove_dir_all(tmp_path.as_path()) + .await + .with_context(|| format!("tenant directory {:?} deletion", tmp_path)) + }, + ); + parent_slot_guard.drop_old_value()?; // Phase 6: Release the InProgress on the parent shard @@ -1532,6 +1554,130 @@ impl TenantManager { Ok(child_shards) } + + /// Part of [`Self::shard_split`]: hard link parent shard layers into child shards, as an optimization + /// to avoid the children downloading them again. + /// + /// For each resident layer in the parent shard, we will hard link it into all of the child shards. + async fn shard_split_hardlink( + &self, + parent_shard: &Tenant, + child_shards: Vec, + ) -> anyhow::Result<()> { + debug_assert_current_span_has_tenant_id(); + + let parent_path = self.conf.tenant_path(parent_shard.get_tenant_shard_id()); + let (parent_timelines, parent_layers) = { + let mut parent_layers = Vec::new(); + let timelines = parent_shard.timelines.lock().unwrap().clone(); + let parent_timelines = timelines.keys().cloned().collect::>(); + for timeline in timelines.values() { + let timeline_layers = timeline + .layers + .read() + .await + .resident_layers() + .collect::>() + .await; + for layer in timeline_layers { + let relative_path = layer + .local_path() + .strip_prefix(&parent_path) + .context("Removing prefix from parent layer path")?; + parent_layers.push(relative_path.to_owned()); + } + } + debug_assert!( + !parent_layers.is_empty(), + "shutdown cannot empty the layermap" + ); + (parent_timelines, parent_layers) + }; + + let mut child_prefixes = Vec::new(); + let mut create_dirs = Vec::new(); + + for child in child_shards { + let child_prefix = self.conf.tenant_path(&child); + create_dirs.push(child_prefix.clone()); + create_dirs.extend( + parent_timelines + .iter() + .map(|t| self.conf.timeline_path(&child, t)), + ); + + child_prefixes.push(child_prefix); + } + + // Since we will do a large number of small filesystem metadata operations, batch them into + // spawn_blocking calls rather than doing each one as a tokio::fs round-trip. 
+ let jh = tokio::task::spawn_blocking(move || -> anyhow::Result { + for dir in &create_dirs { + if let Err(e) = std::fs::create_dir_all(dir) { + // Ignore AlreadyExists errors, drop out on all other errors + match e.kind() { + std::io::ErrorKind::AlreadyExists => {} + _ => { + return Err(anyhow::anyhow!(e).context(format!("Creating {dir}"))); + } + } + } + } + + for child_prefix in child_prefixes { + for relative_layer in &parent_layers { + let parent_path = parent_path.join(relative_layer); + let child_path = child_prefix.join(relative_layer); + if let Err(e) = std::fs::hard_link(&parent_path, &child_path) { + match e.kind() { + std::io::ErrorKind::AlreadyExists => {} + std::io::ErrorKind::NotFound => { + tracing::info!( + "Layer {} not found during hard-linking, evicted during split?", + relative_layer + ); + } + _ => { + return Err(anyhow::anyhow!(e).context(format!( + "Hard linking {relative_layer} into {child_prefix}" + ))) + } + } + } + } + } + + // Durability is not required for correctness, but if we crashed during split and + // then came restarted with empty timeline dirs, it would be very inefficient to + // re-populate from remote storage. + for dir in create_dirs { + if let Err(e) = crashsafe::fsync(&dir) { + // Something removed a newly created timeline dir out from underneath us? Extremely + // unexpected, but not worth panic'ing over as this whole function is just an + // optimization. + tracing::warn!("Failed to fsync directory {dir}: {e}") + } + } + + Ok(parent_layers.len()) + }); + + match jh.await { + Ok(Ok(layer_count)) => { + tracing::info!(count = layer_count, "Hard linked layers into child shards"); + } + Ok(Err(e)) => { + // This is an optimization, so we tolerate failure. + tracing::warn!("Error hard-linking layers, proceeding anyway: {e}") + } + Err(e) => { + // This is something totally unexpected like a panic, so bail out. 
+ anyhow::bail!("Error joining hard linking task: {e}"); + } + } + + Ok(()) + } } #[derive(Debug, thiserror::Error)] diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index fa40219d0e..fcf4b9f72a 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -194,6 +194,18 @@ def test_sharding_split_smoke( assert len(pre_split_pageserver_ids) == 4 + def shards_on_disk(shard_ids): + for pageserver in env.pageservers: + for shard_id in shard_ids: + if pageserver.tenant_dir(shard_id).exists(): + return True + + return False + + old_shard_ids = [TenantShardId(tenant_id, i, shard_count) for i in range(0, shard_count)] + # Before split, old shards exist + assert shards_on_disk(old_shard_ids) + env.attachment_service.tenant_shard_split(tenant_id, shard_count=split_shard_count) post_split_pageserver_ids = [loc["node_id"] for loc in env.attachment_service.locate(tenant_id)] @@ -202,6 +214,9 @@ def test_sharding_split_smoke( assert len(set(post_split_pageserver_ids)) == shard_count assert set(post_split_pageserver_ids) == set(pre_split_pageserver_ids) + # The old parent shards should no longer exist on disk + assert not shards_on_disk(old_shard_ids) + workload.validate() workload.churn_rows(256) From 1af047dd3ee9eed0de955b61c295142a95a3fde4 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 15 Feb 2024 14:34:19 +0200 Subject: [PATCH 0183/1571] Fix typo in CI message (#6749) --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 6e4020a1b8..c53cbada7d 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -253,7 +253,7 @@ jobs: done if [ "${FAILED}" = "true" ]; then - echo >&2 "Please update vendors/revisions.json if these changes are intentional" + echo >&2 "Please update vendor/revisions.json if these changes are intentional" exit 1 fi From 936f2ee2a59af86a76df29f0fd6693d1a61da0f7 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 15 Feb 2024 15:48:44 +0200 Subject: [PATCH 0184/1571] fix: accidential wide span in tests (#6772) introduced in a PR without other #[tracing::instrument] changes. --- pageserver/src/tenant.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index e500a6123c..fdf04244c3 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3276,7 +3276,7 @@ impl Tenant { /// For unit tests, make this visible so that other modules can directly create timelines #[cfg(test)] - #[tracing::instrument(fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id))] + #[tracing::instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id))] pub(crate) async fn bootstrap_timeline_test( &self, timeline_id: TimelineId, From 9ad940086cebd02041142117a76914bc5120c060 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Thu, 15 Feb 2024 09:59:13 -0500 Subject: [PATCH 0185/1571] fix superuser permission check for extensions (#6733) close https://github.com/neondatabase/neon/issues/6236 This pull request bumps neon postgres dependencies. The corresponding postgres commits fix the checks for superuser permission when creating an extension. Also, for creating native functinos, it now allows neon_superuser only in the extension creation process. 
--------- Signed-off-by: Alex Chi Z Co-authored-by: Heikki Linnakangas --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 9dd9956c55..b4bae26a0f 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 9dd9956c55ffbbd9abe77d10382453757fedfcf5 +Subproject commit b4bae26a0f09c69e979e6cb55780398e3102e022 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index ca2def9993..9eef016e18 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit ca2def999368d9df098a637234ad5a9003189463 +Subproject commit 9eef016e18bf61753e3cbaa755f705db6a4f7b1d diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 9c37a49884..f7b63d8cf9 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 9c37a4988463a97d9cacb321acf3828b09823269 +Subproject commit f7b63d8cf9ae040f6907c3c13ef25fcf15a36161 diff --git a/vendor/revisions.json b/vendor/revisions.json index 72bc0d7e0d..37ca812c4a 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "postgres-v16": "9c37a4988463a97d9cacb321acf3828b09823269", - "postgres-v15": "ca2def999368d9df098a637234ad5a9003189463", - "postgres-v14": "9dd9956c55ffbbd9abe77d10382453757fedfcf5" + "postgres-v16": "f7b63d8cf9ae040f6907c3c13ef25fcf15a36161", + "postgres-v15": "9eef016e18bf61753e3cbaa755f705db6a4f7b1d", + "postgres-v14": "b4bae26a0f09c69e979e6cb55780398e3102e022" } From cd3e4ac18d1f6998325855d0f9b7b194a10676cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 15 Feb 2024 16:14:51 +0100 Subject: [PATCH 0186/1571] Rename TEST_IMG function to test_img (#6762) Latter follows the canonical way to naming functions in Rust. --- pageserver/src/tenant.rs | 64 ++++++++++++++++++------------------- pageserver/src/walingest.rs | 54 +++++++++++++++---------------- 2 files changed, 58 insertions(+), 60 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index fdf04244c3..ced4bb5af4 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3933,8 +3933,7 @@ pub(crate) mod harness { TimelineId::from_array(hex!("AA223344556677881122334455667788")); /// Convenience function to create a page image with given string as the only content - #[allow(non_snake_case)] - pub fn TEST_IMG(s: &str) -> Bytes { + pub fn test_img(s: &str) -> Bytes { let mut buf = BytesMut::new(); buf.extend_from_slice(s.as_bytes()); buf.resize(64, 0); @@ -4179,7 +4178,6 @@ pub(crate) mod harness { _pg_version: u32, ) -> anyhow::Result { let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1)); - if records_neon { // For Neon wal records, we can decode without spawning postgres, so do so. 
let base_img = base_img.expect("Neon WAL redo requires base image").1; @@ -4204,7 +4202,7 @@ pub(crate) mod harness { ); println!("{s}"); - Ok(TEST_IMG(&s)) + Ok(test_img(&s)) } } } @@ -4239,7 +4237,7 @@ mod tests { .put( *TEST_KEY, Lsn(0x10), - &Value::Image(TEST_IMG("foo at 0x10")), + &Value::Image(test_img("foo at 0x10")), &ctx, ) .await?; @@ -4251,7 +4249,7 @@ mod tests { .put( *TEST_KEY, Lsn(0x20), - &Value::Image(TEST_IMG("foo at 0x20")), + &Value::Image(test_img("foo at 0x20")), &ctx, ) .await?; @@ -4260,15 +4258,15 @@ mod tests { assert_eq!( tline.get(*TEST_KEY, Lsn(0x10), &ctx).await?, - TEST_IMG("foo at 0x10") + test_img("foo at 0x10") ); assert_eq!( tline.get(*TEST_KEY, Lsn(0x1f), &ctx).await?, - TEST_IMG("foo at 0x10") + test_img("foo at 0x10") ); assert_eq!( tline.get(*TEST_KEY, Lsn(0x20), &ctx).await?, - TEST_IMG("foo at 0x20") + test_img("foo at 0x20") ); Ok(()) @@ -4384,7 +4382,7 @@ mod tests { .put( *TEST_KEY, lsn, - &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + &Value::Image(test_img(&format!("foo at {}", lsn))), ctx, ) .await?; @@ -4394,7 +4392,7 @@ mod tests { .put( *TEST_KEY, lsn, - &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + &Value::Image(test_img(&format!("foo at {}", lsn))), ctx, ) .await?; @@ -4408,7 +4406,7 @@ mod tests { .put( *TEST_KEY, lsn, - &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + &Value::Image(test_img(&format!("foo at {}", lsn))), ctx, ) .await?; @@ -4418,7 +4416,7 @@ mod tests { .put( *TEST_KEY, lsn, - &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + &Value::Image(test_img(&format!("foo at {}", lsn))), ctx, ) .await?; @@ -4573,7 +4571,7 @@ mod tests { // Broken, as long as you don't need to access data from the parent. assert_eq!( newtline.get(*TEST_KEY, Lsn(0x70), &ctx).await?, - TEST_IMG(&format!("foo at {}", Lsn(0x70))) + test_img(&format!("foo at {}", Lsn(0x70))) ); // This needs to traverse to the parent, and fails. @@ -4650,7 +4648,7 @@ mod tests { // Check that the data is still accessible on the branch. 
assert_eq!( newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await?, - TEST_IMG(&format!("foo at {}", Lsn(0x40))) + test_img(&format!("foo at {}", Lsn(0x40))) ); Ok(()) @@ -4825,7 +4823,7 @@ mod tests { .put( *TEST_KEY, Lsn(0x10), - &Value::Image(TEST_IMG("foo at 0x10")), + &Value::Image(test_img("foo at 0x10")), &ctx, ) .await?; @@ -4842,7 +4840,7 @@ mod tests { .put( *TEST_KEY, Lsn(0x20), - &Value::Image(TEST_IMG("foo at 0x20")), + &Value::Image(test_img("foo at 0x20")), &ctx, ) .await?; @@ -4859,7 +4857,7 @@ mod tests { .put( *TEST_KEY, Lsn(0x30), - &Value::Image(TEST_IMG("foo at 0x30")), + &Value::Image(test_img("foo at 0x30")), &ctx, ) .await?; @@ -4876,7 +4874,7 @@ mod tests { .put( *TEST_KEY, Lsn(0x40), - &Value::Image(TEST_IMG("foo at 0x40")), + &Value::Image(test_img("foo at 0x40")), &ctx, ) .await?; @@ -4890,23 +4888,23 @@ mod tests { assert_eq!( tline.get(*TEST_KEY, Lsn(0x10), &ctx).await?, - TEST_IMG("foo at 0x10") + test_img("foo at 0x10") ); assert_eq!( tline.get(*TEST_KEY, Lsn(0x1f), &ctx).await?, - TEST_IMG("foo at 0x10") + test_img("foo at 0x10") ); assert_eq!( tline.get(*TEST_KEY, Lsn(0x20), &ctx).await?, - TEST_IMG("foo at 0x20") + test_img("foo at 0x20") ); assert_eq!( tline.get(*TEST_KEY, Lsn(0x30), &ctx).await?, - TEST_IMG("foo at 0x30") + test_img("foo at 0x30") ); assert_eq!( tline.get(*TEST_KEY, Lsn(0x40), &ctx).await?, - TEST_IMG("foo at 0x40") + test_img("foo at 0x40") ); Ok(()) @@ -4938,7 +4936,7 @@ mod tests { .put( test_key, lsn, - &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), &ctx, ) .await?; @@ -5000,7 +4998,7 @@ mod tests { .put( test_key, lsn, - &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), &ctx, ) .await?; @@ -5021,7 +5019,7 @@ mod tests { .put( test_key, lsn, - &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), &ctx, ) .await?; @@ -5035,7 +5033,7 @@ mod tests { test_key.field6 = blknum as u32; assert_eq!( tline.get(test_key, lsn, &ctx).await?, - TEST_IMG(&format!("{} at {}", blknum, last_lsn)) + test_img(&format!("{} at {}", blknum, last_lsn)) ); } @@ -5089,7 +5087,7 @@ mod tests { .put( test_key, lsn, - &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), &ctx, ) .await?; @@ -5118,7 +5116,7 @@ mod tests { .put( test_key, lsn, - &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), &ctx, ) .await?; @@ -5133,7 +5131,7 @@ mod tests { test_key.field6 = blknum as u32; assert_eq!( tline.get(test_key, lsn, &ctx).await?, - TEST_IMG(&format!("{} at {}", blknum, last_lsn)) + test_img(&format!("{} at {}", blknum, last_lsn)) ); } @@ -5195,7 +5193,7 @@ mod tests { .put( test_key, lsn, - &Value::Image(TEST_IMG(&format!("{} {} at {}", idx, blknum, lsn))), + &Value::Image(test_img(&format!("{} {} at {}", idx, blknum, lsn))), &ctx, ) .await?; @@ -5217,7 +5215,7 @@ mod tests { test_key.field6 = blknum as u32; assert_eq!( tline.get(test_key, *lsn, &ctx).await?, - TEST_IMG(&format!("{idx} {blknum} at {lsn}")) + test_img(&format!("{idx} {blknum} at {lsn}")) ); } } diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 12ceac0191..8df2f1713a 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1695,22 +1695,22 @@ mod tests { let mut m = tline.begin_modification(Lsn(0x20)); 
walingest.put_rel_creation(&mut m, TESTREL_A, &ctx).await?; walingest - .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx) + .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 2"), &ctx) .await?; m.commit(&ctx).await?; let mut m = tline.begin_modification(Lsn(0x30)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"), &ctx) + .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 3"), &ctx) .await?; m.commit(&ctx).await?; let mut m = tline.begin_modification(Lsn(0x40)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"), &ctx) + .put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1 at 4"), &ctx) .await?; m.commit(&ctx).await?; let mut m = tline.begin_modification(Lsn(0x50)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"), &ctx) + .put_rel_page_image(&mut m, TESTREL_A, 2, test_img("foo blk 2 at 5"), &ctx) .await?; m.commit(&ctx).await?; @@ -1751,46 +1751,46 @@ mod tests { tline .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), false, &ctx) .await?, - TEST_IMG("foo blk 0 at 2") + test_img("foo blk 0 at 2") ); assert_eq!( tline .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), false, &ctx) .await?, - TEST_IMG("foo blk 0 at 3") + test_img("foo blk 0 at 3") ); assert_eq!( tline .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), false, &ctx) .await?, - TEST_IMG("foo blk 0 at 3") + test_img("foo blk 0 at 3") ); assert_eq!( tline .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), false, &ctx) .await?, - TEST_IMG("foo blk 1 at 4") + test_img("foo blk 1 at 4") ); assert_eq!( tline .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, - TEST_IMG("foo blk 0 at 3") + test_img("foo blk 0 at 3") ); assert_eq!( tline .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, - TEST_IMG("foo blk 1 at 4") + test_img("foo blk 1 at 4") ); assert_eq!( tline .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, - TEST_IMG("foo blk 2 at 5") + test_img("foo blk 2 at 5") ); // Truncate last block @@ -1812,13 +1812,13 @@ mod tests { tline .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), false, &ctx) .await?, - TEST_IMG("foo blk 0 at 3") + test_img("foo blk 0 at 3") ); assert_eq!( tline .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), false, &ctx) .await?, - TEST_IMG("foo blk 1 at 4") + test_img("foo blk 1 at 4") ); // should still see the truncated block with older LSN @@ -1832,7 +1832,7 @@ mod tests { tline .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, - TEST_IMG("foo blk 2 at 5") + test_img("foo blk 2 at 5") ); // Truncate to zero length @@ -1851,7 +1851,7 @@ mod tests { // Extend from 0 to 2 blocks, leaving a gap let mut m = tline.begin_modification(Lsn(0x70)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"), &ctx) + .put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1"), &ctx) .await?; m.commit(&ctx).await?; assert_eq!( @@ -1870,13 +1870,13 @@ mod tests { tline .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), false, &ctx) .await?, - TEST_IMG("foo blk 1") + test_img("foo blk 1") ); // Extend a lot more, leaving a big gap that spans across segments let mut m = tline.begin_modification(Lsn(0x80)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"), &ctx) + .put_rel_page_image(&mut m, TESTREL_A, 1500, test_img("foo 
blk 1500"), &ctx) .await?; m.commit(&ctx).await?; assert_eq!( @@ -1897,7 +1897,7 @@ mod tests { tline .get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), false, &ctx) .await?, - TEST_IMG("foo blk 1500") + test_img("foo blk 1500") ); Ok(()) @@ -1915,7 +1915,7 @@ mod tests { let mut m = tline.begin_modification(Lsn(0x20)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx) + .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 2"), &ctx) .await?; m.commit(&ctx).await?; @@ -1952,7 +1952,7 @@ mod tests { // Re-create it let mut m = tline.begin_modification(Lsn(0x40)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"), &ctx) + .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 4"), &ctx) .await?; m.commit(&ctx).await?; @@ -1990,7 +1990,7 @@ mod tests { for blkno in 0..relsize { let data = format!("foo blk {} at {}", blkno, Lsn(0x20)); walingest - .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx) + .put_rel_page_image(&mut m, TESTREL_A, blkno, test_img(&data), &ctx) .await?; } m.commit(&ctx).await?; @@ -2028,7 +2028,7 @@ mod tests { tline .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), false, &ctx) .await?, - TEST_IMG(&data) + test_img(&data) ); } @@ -2055,7 +2055,7 @@ mod tests { tline .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), false, &ctx) .await?, - TEST_IMG(&data) + test_img(&data) ); } @@ -2073,7 +2073,7 @@ mod tests { tline .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, - TEST_IMG(&data) + test_img(&data) ); } @@ -2084,7 +2084,7 @@ mod tests { for blkno in 0..relsize { let data = format!("foo blk {} at {}", blkno, lsn); walingest - .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx) + .put_rel_page_image(&mut m, TESTREL_A, blkno, test_img(&data), &ctx) .await?; } m.commit(&ctx).await?; @@ -2109,7 +2109,7 @@ mod tests { tline .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), false, &ctx) .await?, - TEST_IMG(&data) + test_img(&data) ); } @@ -2130,7 +2130,7 @@ mod tests { for blknum in 0..RELSEG_SIZE + 1 { lsn += 0x10; let mut m = tline.begin_modification(Lsn(lsn)); - let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn))); + let img = test_img(&format!("foo blk {} at {}", blknum, Lsn(lsn))); walingest .put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img, &ctx) .await?; From c72cb44213e1ffeccaa321d2d43a90c7fa9c8881 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 15 Feb 2024 15:53:58 +0000 Subject: [PATCH 0187/1571] test_runner/performance: parametrize benchmarks (#6744) ## Problem Currently, we don't store `PLATFORM` for Nightly Benchmarks. It causes them to be merged as reruns in Allure report (because they have the same test name). ## Summary of changes - Parametrize benchmarks by - Postgres Version (14/15/16) - Build Type (debug/release/remote) - PLATFORM (neon-staging/github-actions-selfhosted/...) 
--------- Co-authored-by: Bodobolero --- test_runner/fixtures/parametrize.py | 51 +++++++++++++++-------------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py index d8ac92abb6..57ca1932b0 100644 --- a/test_runner/fixtures/parametrize.py +++ b/test_runner/fixtures/parametrize.py @@ -2,57 +2,58 @@ import os from typing import Optional import pytest -from _pytest.fixtures import FixtureRequest from _pytest.python import Metafunc from fixtures.pg_version import PgVersion """ -Dynamically parametrize tests by Postgres version, build type (debug/release/remote), and possibly by other parameters +Dynamically parametrize tests by different parameters """ @pytest.fixture(scope="function", autouse=True) -def pg_version(request: FixtureRequest) -> Optional[PgVersion]: - # Do not parametrize performance tests yet, we need to prepare grafana charts first - if "test_runner/performance" in str(request.node.path): - v = os.environ.get("DEFAULT_PG_VERSION") - return PgVersion(v) - +def pg_version() -> Optional[PgVersion]: return None @pytest.fixture(scope="function", autouse=True) -def build_type(request: FixtureRequest) -> Optional[str]: - # Do not parametrize performance tests yet, we need to prepare grafana charts first - if "test_runner/performance" in str(request.node.path): - return os.environ.get("BUILD_TYPE", "").lower() - +def build_type() -> Optional[str]: return None @pytest.fixture(scope="function", autouse=True) -def pageserver_virtual_file_io_engine(request: FixtureRequest) -> Optional[str]: +def platform() -> Optional[str]: + return None + + +@pytest.fixture(scope="function", autouse=True) +def pageserver_virtual_file_io_engine() -> Optional[str]: return None def pytest_generate_tests(metafunc: Metafunc): - if (v := os.environ.get("DEFAULT_PG_VERSION")) is None: - pg_versions = [version for version in PgVersion if version != PgVersion.NOT_SET] - else: - pg_versions = [PgVersion(v)] - - if (bt := os.environ.get("BUILD_TYPE")) is None: + if (bt := os.getenv("BUILD_TYPE")) is None: build_types = ["debug", "release"] else: build_types = [bt.lower()] - # Do not parametrize performance tests yet by Postgres version or build type, we need to prepare grafana charts first - if "test_runner/performance" not in metafunc.definition._nodeid: - metafunc.parametrize("build_type", build_types) - metafunc.parametrize("pg_version", pg_versions, ids=map(lambda v: f"pg{v}", pg_versions)) + metafunc.parametrize("build_type", build_types) + + if (v := os.getenv("DEFAULT_PG_VERSION")) is None: + pg_versions = [version for version in PgVersion if version != PgVersion.NOT_SET] + else: + pg_versions = [PgVersion(v)] + + metafunc.parametrize("pg_version", pg_versions, ids=map(lambda v: f"pg{v}", pg_versions)) # A hacky way to parametrize tests only for `pageserver_virtual_file_io_engine=tokio-epoll-uring` # And do not change test name for default `pageserver_virtual_file_io_engine=std-fs` to keep tests statistics - if (io_engine := os.environ.get("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in ("", "std-fs"): + if (io_engine := os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in ("", "std-fs"): metafunc.parametrize("pageserver_virtual_file_io_engine", [io_engine]) + + # For performance tests, parametrize also by platform + if ( + "test_runner/performance" in metafunc.definition._nodeid + and (platform := os.getenv("PLATFORM")) is not None + ): + metafunc.parametrize("platform", [platform.lower()]) From 
046d9c69e6734c8e60b6da91d3fb5dd4983001f2 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 15 Feb 2024 18:58:26 +0200 Subject: [PATCH 0188/1571] fix: require wider jwt for changing the io engine (#6770) io-engine should not be changeable with any JWT token, for example the tenant_id scoped token which computes have. --- pageserver/src/http/routes.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index ab546c873a..df3794f222 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1951,6 +1951,7 @@ async fn put_io_engine_handler( mut r: Request, _cancel: CancellationToken, ) -> Result, ApiError> { + check_permission(&r, None)?; let kind: crate::virtual_file::IoEngineKind = json_request(&mut r).await?; crate::virtual_file::io_engine::set(kind); json_response(StatusCode::OK, ()) From f0d8bd7855812100bb9ec8f43f1535981f40f5da Mon Sep 17 00:00:00 2001 From: MMeent Date: Thu, 15 Feb 2024 20:48:50 +0100 Subject: [PATCH 0189/1571] Update Makefile (#6779) This fixes issues where `neon-pg-ext-clean-vYY` is used as target and resolves using the `neon-pg-ext-%` template with `$*` resolving as `clean-vYY`, for older versions of GNU Make, rather than `neon-pg-ext-clean-%` using `$*` = `vYY` ## Problem ``` $ make clean ... rm -f pg_config_paths.h Compiling neon clean-v14 mkdir -p /Users//neon-build//pg_install//build/neon-clean-v14 /Applications/Xcode.app/Contents/Developer/usr/bin/make PG_CONFIG=/Users//neon-build//pg_install//clean-v14/bin/pg_config CFLAGS='-O0 -g3 ' \ -C /Users//neon-build//pg_install//build/neon-clean-v14 \ -f /Users//neon-build//pgxn/neon/Makefile install make[1]: /Users//neon-build//pg_install//clean-v14/bin/pg_config: Command not found make[1]: *** No rule to make target `install'. Stop. make: *** [neon-pg-ext-clean-v14] Error 2 ``` --- Makefile | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/Makefile b/Makefile index 5bed4cb9fc..ea782cb369 100644 --- a/Makefile +++ b/Makefile @@ -159,8 +159,8 @@ neon-pg-ext-%: postgres-% -C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \ -f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install -.PHONY: neon-pg-ext-clean-% -neon-pg-ext-clean-%: +.PHONY: neon-pg-clean-ext-% +neon-pg-clean-ext-%: $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \ -C $(POSTGRES_INSTALL_DIR)/build/neon-$* \ -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean @@ -216,11 +216,11 @@ neon-pg-ext: \ neon-pg-ext-v15 \ neon-pg-ext-v16 -.PHONY: neon-pg-ext-clean -neon-pg-ext-clean: \ - neon-pg-ext-clean-v14 \ - neon-pg-ext-clean-v15 \ - neon-pg-ext-clean-v16 +.PHONY: neon-pg-clean-ext +neon-pg-clean-ext: \ + neon-pg-clean-ext-v14 \ + neon-pg-clean-ext-v15 \ + neon-pg-clean-ext-v16 # shorthand to build all Postgres versions .PHONY: postgres @@ -249,7 +249,7 @@ postgres-check: \ # This doesn't remove the effects of 'configure'. .PHONY: clean -clean: postgres-clean neon-pg-ext-clean +clean: postgres-clean neon-pg-clean-ext $(CARGO_CMD_PREFIX) cargo clean # This removes everything From 6b980f38da82a19ef4ad1cafd11cdfde521e0bfb Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 15 Feb 2024 21:59:39 +0000 Subject: [PATCH 0190/1571] libs: refactor ShardCount.0 to private (#6690) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem The ShardCount type has a magic '0' value that represents a legacy single-sharded tenant, whose TenantShardId is formatted without a `-0001` suffix (i.e. 
formatted as a traditional TenantId). This was error-prone in code locations that wanted the actual number of shards: they had to handle the 0 case specially. ## Summary of changes - Make the internal value of ShardCount private, and expose `count()` and `literal()` getters so that callers have to explicitly say whether they want the literal value (e.g. for storing in a TenantShardId), or the actual number of shards in the tenant. --------- Co-authored-by: Arpad Müller --- .../attachment_service/src/compute_hook.rs | 6 ++-- .../attachment_service/src/persistence.rs | 20 +++++------ .../attachment_service/src/reconciler.rs | 8 ++--- .../attachment_service/src/service.rs | 33 +++++++---------- control_plane/src/bin/neon_local.rs | 2 +- libs/pageserver_api/src/models.rs | 4 +-- libs/pageserver_api/src/shard.rs | 35 +++++++++++++++++-- pageserver/src/http/routes.rs | 2 +- pageserver/src/page_service.rs | 4 +-- pageserver/src/tenant.rs | 2 +- pageserver/src/tenant/config.rs | 2 +- pageserver/src/tenant/mgr.rs | 9 +++-- pageserver/src/tenant/secondary.rs | 2 +- 13 files changed, 75 insertions(+), 54 deletions(-) diff --git a/control_plane/attachment_service/src/compute_hook.rs b/control_plane/attachment_service/src/compute_hook.rs index 5bd1b6bf09..bac378d218 100644 --- a/control_plane/attachment_service/src/compute_hook.rs +++ b/control_plane/attachment_service/src/compute_hook.rs @@ -3,7 +3,7 @@ use std::{collections::HashMap, time::Duration}; use control_plane::endpoint::{ComputeControlPlane, EndpointStatus}; use control_plane::local_env::LocalEnv; use hyper::{Method, StatusCode}; -use pageserver_api::shard::{ShardCount, ShardIndex, ShardNumber, TenantShardId}; +use pageserver_api::shard::{ShardIndex, ShardNumber, TenantShardId}; use postgres_connection::parse_host_port; use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; @@ -77,7 +77,7 @@ impl ComputeHookTenant { self.shards .sort_by_key(|(shard, _node_id)| shard.shard_number); - if self.shards.len() == shard_count.0 as usize || shard_count == ShardCount(0) { + if self.shards.len() == shard_count.count() as usize || shard_count.is_unsharded() { // We have pageservers for all the shards: emit a configuration update return Some(ComputeHookNotifyRequest { tenant_id, @@ -94,7 +94,7 @@ impl ComputeHookTenant { tracing::info!( "ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})", self.shards.len(), - shard_count.0 + shard_count.count() ); } diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs index 5b3b032bc9..c5829cae88 100644 --- a/control_plane/attachment_service/src/persistence.rs +++ b/control_plane/attachment_service/src/persistence.rs @@ -222,7 +222,7 @@ impl Persistence { let tenant_shard_id = TenantShardId { tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?, shard_number: ShardNumber(tsp.shard_number as u8), - shard_count: ShardCount(tsp.shard_count as u8), + shard_count: ShardCount::new(tsp.shard_count as u8), }; tenants_map.insert(tenant_shard_id, tsp); @@ -318,7 +318,7 @@ impl Persistence { tenant_id: TenantId::from_str(tsp.tenant_id.as_str()) .map_err(|e| DatabaseError::Logical(format!("Malformed tenant id: {e}")))?, shard_number: ShardNumber(tsp.shard_number as u8), - shard_count: ShardCount(tsp.shard_count as u8), + shard_count: ShardCount::new(tsp.shard_count as u8), }; result.insert(tenant_shard_id, Generation::new(tsp.generation as u32)); } @@ -340,7 +340,7 @@ impl Persistence { let updated = diesel::update(tenant_shards) 
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) .set(( generation.eq(generation + 1), generation_pageserver.eq(node_id.0 as i64), @@ -362,7 +362,7 @@ impl Persistence { let updated = diesel::update(tenant_shards) .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) .set(( generation_pageserver.eq(i64::MAX), placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()), @@ -392,21 +392,19 @@ impl Persistence { conn.transaction(|conn| -> DatabaseResult<()> { // Mark parent shards as splitting - let expect_parent_records = std::cmp::max(1, old_shard_count.0); - let updated = diesel::update(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) - .filter(shard_count.eq(old_shard_count.0 as i32)) + .filter(shard_count.eq(old_shard_count.literal() as i32)) .set((splitting.eq(1),)) .execute(conn)?; if u8::try_from(updated) .map_err(|_| DatabaseError::Logical( format!("Overflow existing shard count {} while splitting", updated)) - )? != expect_parent_records { + )? != old_shard_count.count() { // Perhaps a deletion or another split raced with this attempt to split, mutating // the parent shards that we intend to split. In this case the split request should fail. return Err(DatabaseError::Logical( - format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {expect_parent_records})") + format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {})", old_shard_count.count()) )); } @@ -418,7 +416,7 @@ impl Persistence { let mut parent = crate::schema::tenant_shards::table .filter(tenant_id.eq(parent_shard_id.tenant_id.to_string())) .filter(shard_number.eq(parent_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(parent_shard_id.shard_count.0 as i32)) + .filter(shard_count.eq(parent_shard_id.shard_count.literal() as i32)) .load::(conn)?; let parent = if parent.len() != 1 { return Err(DatabaseError::Logical(format!( @@ -459,7 +457,7 @@ impl Persistence { // Drop parent shards diesel::delete(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) - .filter(shard_count.eq(old_shard_count.0 as i32)) + .filter(shard_count.eq(old_shard_count.literal() as i32)) .execute(conn)?; // Clear sharding flag diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs index 776e1f9d1e..65bbfa7181 100644 --- a/control_plane/attachment_service/src/reconciler.rs +++ b/control_plane/attachment_service/src/reconciler.rs @@ -263,7 +263,7 @@ impl Reconciler { secondary_conf, tenant_conf: config.clone(), shard_number: shard.number.0, - shard_count: shard.count.0, + shard_count: shard.count.literal(), shard_stripe_size: shard.stripe_size.0, } } @@ -458,7 +458,7 @@ impl Reconciler { generation: None, secondary_conf: None, shard_number: self.shard.number.0, - shard_count: self.shard.count.0, + shard_count: self.shard.count.literal(), shard_stripe_size: self.shard.stripe_size.0, tenant_conf: self.config.clone(), }, @@ -506,7 +506,7 @@ pub(crate) fn attached_location_conf( generation: generation.into(), secondary_conf: None, shard_number: 
shard.number.0, - shard_count: shard.count.0, + shard_count: shard.count.literal(), shard_stripe_size: shard.stripe_size.0, tenant_conf: config.clone(), } @@ -521,7 +521,7 @@ pub(crate) fn secondary_location_conf( generation: None, secondary_conf: Some(LocationConfigSecondary { warm: true }), shard_number: shard.number.0, - shard_count: shard.count.0, + shard_count: shard.count.literal(), shard_stripe_size: shard.stripe_size.0, tenant_conf: config.clone(), } diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index 95efa8ecd7..616b74e55d 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -292,7 +292,7 @@ impl Service { generation: None, secondary_conf: None, shard_number: tenant_shard_id.shard_number.0, - shard_count: tenant_shard_id.shard_count.0, + shard_count: tenant_shard_id.shard_count.literal(), shard_stripe_size: 0, tenant_conf: models::TenantConfig::default(), }, @@ -389,14 +389,14 @@ impl Service { let tenant_shard_id = TenantShardId { tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?, shard_number: ShardNumber(tsp.shard_number as u8), - shard_count: ShardCount(tsp.shard_count as u8), + shard_count: ShardCount::new(tsp.shard_count as u8), }; let shard_identity = if tsp.shard_count == 0 { ShardIdentity::unsharded() } else { ShardIdentity::new( ShardNumber(tsp.shard_number as u8), - ShardCount(tsp.shard_count as u8), + ShardCount::new(tsp.shard_count as u8), ShardStripeSize(tsp.shard_stripe_size as u32), )? }; @@ -526,7 +526,7 @@ impl Service { let tsp = TenantShardPersistence { tenant_id: attach_req.tenant_shard_id.tenant_id.to_string(), shard_number: attach_req.tenant_shard_id.shard_number.0 as i32, - shard_count: attach_req.tenant_shard_id.shard_count.0 as i32, + shard_count: attach_req.tenant_shard_id.shard_count.literal() as i32, shard_stripe_size: 0, generation: 0, generation_pageserver: i64::MAX, @@ -726,16 +726,9 @@ impl Service { &self, create_req: TenantCreateRequest, ) -> Result { - // Shard count 0 is valid: it means create a single shard (ShardCount(0) means "unsharded") - let literal_shard_count = if create_req.shard_parameters.is_unsharded() { - 1 - } else { - create_req.shard_parameters.count.0 - }; - // This service expects to handle sharding itself: it is an error to try and directly create // a particular shard here. 
- let tenant_id = if create_req.new_tenant_id.shard_count > ShardCount(1) { + let tenant_id = if !create_req.new_tenant_id.is_unsharded() { return Err(ApiError::BadRequest(anyhow::anyhow!( "Attempted to create a specific shard, this API is for creating the whole tenant" ))); @@ -749,7 +742,7 @@ impl Service { create_req.shard_parameters.count, ); - let create_ids = (0..literal_shard_count) + let create_ids = (0..create_req.shard_parameters.count.count()) .map(|i| TenantShardId { tenant_id, shard_number: ShardNumber(i), @@ -769,7 +762,7 @@ impl Service { .map(|tenant_shard_id| TenantShardPersistence { tenant_id: tenant_shard_id.tenant_id.to_string(), shard_number: tenant_shard_id.shard_number.0 as i32, - shard_count: tenant_shard_id.shard_count.0 as i32, + shard_count: tenant_shard_id.shard_count.literal() as i32, shard_stripe_size: create_req.shard_parameters.stripe_size.0 as i32, generation: create_req.generation.map(|g| g as i32).unwrap_or(0), generation_pageserver: i64::MAX, @@ -914,7 +907,7 @@ impl Service { tenant_id: TenantId, req: TenantLocationConfigRequest, ) -> Result { - if req.tenant_id.shard_count.0 > 1 { + if !req.tenant_id.is_unsharded() { return Err(ApiError::BadRequest(anyhow::anyhow!( "This API is for importing single-sharded or unsharded tenants" ))); @@ -1449,7 +1442,7 @@ impl Service { for (tenant_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id)) { - match shard.shard.count.0.cmp(&split_req.new_shard_count) { + match shard.shard.count.count().cmp(&split_req.new_shard_count) { Ordering::Equal => { // Already split this children_found.push(*tenant_shard_id); @@ -1459,7 +1452,7 @@ impl Service { return Err(ApiError::BadRequest(anyhow::anyhow!( "Requested count {} but already have shards at count {}", split_req.new_shard_count, - shard.shard.count.0 + shard.shard.count.count() ))); } Ordering::Less => { @@ -1489,7 +1482,7 @@ impl Service { shard_ident = Some(shard.shard); } - if tenant_shard_id.shard_count == ShardCount(split_req.new_shard_count) { + if tenant_shard_id.shard_count.count() == split_req.new_shard_count { tracing::info!( "Tenant shard {} already has shard count {}", tenant_shard_id, @@ -1515,7 +1508,7 @@ impl Service { targets.push(SplitTarget { parent_id: *tenant_shard_id, node: node.clone(), - child_ids: tenant_shard_id.split(ShardCount(split_req.new_shard_count)), + child_ids: tenant_shard_id.split(ShardCount::new(split_req.new_shard_count)), }); } @@ -1562,7 +1555,7 @@ impl Service { this_child_tsps.push(TenantShardPersistence { tenant_id: child.tenant_id.to_string(), shard_number: child.shard_number.0 as i32, - shard_count: child.shard_count.0 as i32, + shard_count: child.shard_count.literal() as i32, shard_stripe_size: shard_ident.stripe_size.0 as i32, // Note: this generation is a placeholder, [`Persistence::begin_shard_split`] will // populate the correct generation as part of its transaction, to protect us diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index d71cdf02c0..a155e9ebb2 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -450,7 +450,7 @@ async fn handle_tenant( new_tenant_id: TenantShardId::unsharded(tenant_id), generation: None, shard_parameters: ShardParameters { - count: ShardCount(shard_count), + count: ShardCount::new(shard_count), stripe_size: shard_stripe_size .map(ShardStripeSize) .unwrap_or(ShardParameters::DEFAULT_STRIPE_SIZE), diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 
1226eaa312..db2292072c 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -214,14 +214,14 @@ impl ShardParameters { pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8); pub fn is_unsharded(&self) -> bool { - self.count == ShardCount(0) + self.count.is_unsharded() } } impl Default for ShardParameters { fn default() -> Self { Self { - count: ShardCount(0), + count: ShardCount::new(0), stripe_size: Self::DEFAULT_STRIPE_SIZE, } } diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 322b6c642e..a50ac74af1 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -13,10 +13,41 @@ use utils::id::TenantId; pub struct ShardNumber(pub u8); #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] -pub struct ShardCount(pub u8); +pub struct ShardCount(u8); impl ShardCount { pub const MAX: Self = Self(u8::MAX); + + /// The internal value of a ShardCount may be zero, which means "1 shard, but use + /// legacy format for TenantShardId that excludes the shard suffix", also known + /// as `TenantShardId::unsharded`. + /// + /// This method returns the actual number of shards, i.e. if our internal value is + /// zero, we return 1 (unsharded tenants have 1 shard). + pub fn count(&self) -> u8 { + if self.0 > 0 { + self.0 + } else { + 1 + } + } + + /// The literal internal value: this is **not** the number of shards in the + /// tenant, as we have a special zero value for legacy unsharded tenants. Use + /// [`Self::count`] if you want to know the cardinality of shards. + pub fn literal(&self) -> u8 { + self.0 + } + + pub fn is_unsharded(&self) -> bool { + self.0 == 0 + } + + /// `v` may be zero, or the number of shards in the tenant. `v` is what + /// [`Self::literal`] would return. 
+ pub fn new(val: u8) -> Self { + Self(val) + } } impl ShardNumber { @@ -86,7 +117,7 @@ impl TenantShardId { } pub fn is_unsharded(&self) -> bool { - self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0) + self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded() } /// Convenience for dropping the tenant_id and just getting the ShardIndex: this diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index df3794f222..10ca96a2c1 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1136,7 +1136,7 @@ async fn tenant_shard_split_handler( let new_shards = state .tenant_manager - .shard_split(tenant_shard_id, ShardCount(req.new_shard_count), &ctx) + .shard_split(tenant_shard_id, ShardCount::new(req.new_shard_count), &ctx) .await .map_err(ApiError::InternalServerError)?; diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 7b660b5eca..11eb512750 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -26,7 +26,7 @@ use pageserver_api::models::{ PagestreamNblocksResponse, }; use pageserver_api::shard::ShardIndex; -use pageserver_api::shard::{ShardCount, ShardNumber}; +use pageserver_api::shard::ShardNumber; use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, QueryError}; use pq_proto::framed::ConnectionError; use pq_proto::FeStartupPacket; @@ -998,7 +998,7 @@ impl PageServerHandler { ) -> Result<&Arc, Key> { let key = if let Some((first_idx, first_timeline)) = self.shard_timelines.iter().next() { // Fastest path: single sharded case - if first_idx.shard_count < ShardCount(2) { + if first_idx.shard_count.count() == 1 { return Ok(&first_timeline.timeline); } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index ced4bb5af4..25d13a01ac 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2370,7 +2370,7 @@ impl Tenant { generation: self.generation.into(), secondary_conf: None, shard_number: self.shard_identity.number.0, - shard_count: self.shard_identity.count.0, + shard_count: self.shard_identity.count.literal(), shard_stripe_size: self.shard_identity.stripe_size.0, tenant_conf: tenant_config, } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 563887088d..961decd247 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -251,7 +251,7 @@ impl LocationConf { } else { ShardIdentity::new( ShardNumber(conf.shard_number), - ShardCount(conf.shard_count), + ShardCount::new(conf.shard_count), ShardStripeSize(conf.shard_stripe_size), )? }; diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 7260080720..90c442464f 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -794,7 +794,7 @@ pub(crate) async fn set_new_tenant_config( info!("configuring tenant {tenant_id}"); let tenant = get_tenant(tenant_shard_id, true)?; - if tenant.tenant_shard_id().shard_count > ShardCount(0) { + if !tenant.tenant_shard_id().shard_count.is_unsharded() { // Note that we use ShardParameters::default below. 
return Err(SetNewTenantConfigError::Other(anyhow::anyhow!( "This API may only be used on single-sharded tenants, use the /location_config API for sharded tenants" @@ -1376,7 +1376,7 @@ impl TenantManager { result } - #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), new_shard_count=%new_shard_count.0))] + #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), new_shard_count=%new_shard_count.literal()))] pub(crate) async fn shard_split( &self, tenant_shard_id: TenantShardId, @@ -1386,11 +1386,10 @@ impl TenantManager { let tenant = get_tenant(tenant_shard_id, true)?; // Plan: identify what the new child shards will be - let effective_old_shard_count = std::cmp::max(tenant_shard_id.shard_count.0, 1); - if new_shard_count <= ShardCount(effective_old_shard_count) { + if new_shard_count.count() <= tenant_shard_id.shard_count.count() { anyhow::bail!("Requested shard count is not an increase"); } - let expansion_factor = new_shard_count.0 / effective_old_shard_count; + let expansion_factor = new_shard_count.count() / tenant_shard_id.shard_count.count(); if !expansion_factor.is_power_of_two() { anyhow::bail!("Requested split is not a power of two"); } diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index 926cd0302b..2c8ced4eb7 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -150,7 +150,7 @@ impl SecondaryTenant { generation: None, secondary_conf: Some(conf), shard_number: self.tenant_shard_id.shard_number.0, - shard_count: self.tenant_shard_id.shard_count.0, + shard_count: self.tenant_shard_id.shard_count.literal(), shard_stripe_size: self.shard_identity.stripe_size.0, tenant_conf: tenant_conf.into(), } From 45e929c069c83043e770b7c6e430e9f5311cc26d Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 16 Feb 2024 10:35:11 +0100 Subject: [PATCH 0191/1571] stop reading local `metadata` file (#6777) --- pageserver/src/tenant.rs | 446 +----------------------------- pageserver/src/tenant/metadata.rs | 11 - pageserver/src/tenant/timeline.rs | 2 +- 3 files changed, 9 insertions(+), 450 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 25d13a01ac..e2d66711c8 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -49,7 +49,6 @@ use self::config::AttachmentMode; use self::config::LocationConf; use self::config::TenantConf; use self::delete::DeleteTenantFlow; -use self::metadata::LoadMetadataError; use self::metadata::TimelineMetadata; use self::mgr::GetActiveTenantError; use self::mgr::GetTenantError; @@ -77,7 +76,6 @@ use crate::task_mgr; use crate::task_mgr::TaskKind; use crate::tenant::config::LocationMode; use crate::tenant::config::TenantConfOpt; -use crate::tenant::metadata::load_metadata; pub use crate::tenant::remote_timeline_client::index::IndexPart; use crate::tenant::remote_timeline_client::remote_initdb_archive_path; use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart; @@ -94,7 +92,6 @@ use std::fmt::Debug; use std::fmt::Display; use std::fs; use std::fs::File; -use std::io; use std::ops::Bound::Included; use std::sync::atomic::AtomicU64; use std::sync::atomic::Ordering; @@ -488,11 +485,6 @@ impl From for InitdbError { } } -struct TenantDirectoryScan { - sorted_timelines_to_load: Vec<(TimelineId, TimelineMetadata)>, - timelines_to_resume_deletion: Vec<(TimelineId, Option)>, -} - enum CreateTimelineCause { Load, Delete, @@ -928,9 +920,7 
@@ impl Tenant { timelines: HashMap::new(), }, (None, SpawnMode::Normal) => { - // Deprecated dev mode: load from local disk state instead of remote storage - // https://github.com/neondatabase/neon/issues/5624 - return self.load_local(ctx).await; + anyhow::bail!("local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624"); } }; @@ -1198,149 +1188,6 @@ impl Tenant { )) } - fn scan_and_sort_timelines_dir(self: Arc) -> anyhow::Result { - let mut timelines_to_load: HashMap = HashMap::new(); - // Note timelines_to_resume_deletion needs to be separate because it can be not sortable - // from the point of `tree_sort_timelines`. I e some parents can be missing because deletion - // completed in non topological order (for example because parent has smaller number of layer files in it) - let mut timelines_to_resume_deletion: Vec<(TimelineId, Option)> = vec![]; - - let timelines_dir = self.conf.timelines_path(&self.tenant_shard_id); - - for entry in timelines_dir - .read_dir_utf8() - .context("list timelines directory for tenant")? - { - let entry = entry.context("read timeline dir entry")?; - let timeline_dir = entry.path(); - - if crate::is_temporary(timeline_dir) { - info!("Found temporary timeline directory, removing: {timeline_dir}"); - if let Err(e) = std::fs::remove_dir_all(timeline_dir) { - error!("Failed to remove temporary directory '{timeline_dir}': {e:?}"); - } - } else if is_uninit_mark(timeline_dir) { - if !timeline_dir.exists() { - warn!("Timeline dir entry become invalid: {timeline_dir}"); - continue; - } - - let timeline_uninit_mark_file = &timeline_dir; - info!( - "Found an uninit mark file {timeline_uninit_mark_file}, removing the timeline and its uninit mark", - ); - let timeline_id = - TimelineId::try_from(timeline_uninit_mark_file.file_stem()) - .with_context(|| { - format!( - "Could not parse timeline id out of the timeline uninit mark name {timeline_uninit_mark_file}", - ) - })?; - let timeline_dir = self.conf.timeline_path(&self.tenant_shard_id, &timeline_id); - if let Err(e) = - remove_timeline_and_uninit_mark(&timeline_dir, timeline_uninit_mark_file) - { - error!("Failed to clean up uninit marked timeline: {e:?}"); - } - } else if crate::is_delete_mark(timeline_dir) { - // If metadata exists, load as usual, continue deletion - let timeline_id = TimelineId::try_from(timeline_dir.file_stem()) - .with_context(|| { - format!( - "Could not parse timeline id out of the timeline uninit mark name {timeline_dir}", - ) - })?; - - info!("Found deletion mark for timeline {}", timeline_id); - - match load_metadata(self.conf, &self.tenant_shard_id, &timeline_id) { - Ok(metadata) => { - timelines_to_resume_deletion.push((timeline_id, Some(metadata))) - } - Err(e) => match &e { - LoadMetadataError::Read(r) => { - if r.kind() != io::ErrorKind::NotFound { - return Err(anyhow::anyhow!(e)).with_context(|| { - format!("Failed to load metadata for timeline_id {timeline_id}") - }); - } - - // If metadata doesnt exist it means that we've crashed without - // completing cleanup_remaining_timeline_fs_traces in DeleteTimelineFlow. - // So save timeline_id for later call to `DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`. - // We cant do it here because the method is async so we'd need block_on - // and here we're in spawn_blocking. cleanup_remaining_timeline_fs_traces uses fs operations - // so that basically results in a cycle: - // spawn_blocking - // - block_on - // - spawn_blocking - // which can lead to running out of threads in blocing pool. 
- timelines_to_resume_deletion.push((timeline_id, None)); - } - _ => { - return Err(anyhow::anyhow!(e)).with_context(|| { - format!("Failed to load metadata for timeline_id {timeline_id}") - }) - } - }, - } - } else { - if !timeline_dir.exists() { - warn!("Timeline dir entry become invalid: {timeline_dir}"); - continue; - } - let timeline_id = TimelineId::try_from(timeline_dir.file_name()) - .with_context(|| { - format!( - "Could not parse timeline id out of the timeline dir name {timeline_dir}", - ) - })?; - let timeline_uninit_mark_file = self - .conf - .timeline_uninit_mark_file_path(self.tenant_shard_id, timeline_id); - if timeline_uninit_mark_file.exists() { - info!( - %timeline_id, - "Found an uninit mark file, removing the timeline and its uninit mark", - ); - if let Err(e) = - remove_timeline_and_uninit_mark(timeline_dir, &timeline_uninit_mark_file) - { - error!("Failed to clean up uninit marked timeline: {e:?}"); - } - continue; - } - - let timeline_delete_mark_file = self - .conf - .timeline_delete_mark_file_path(self.tenant_shard_id, timeline_id); - if timeline_delete_mark_file.exists() { - // Cleanup should be done in `is_delete_mark` branch above - continue; - } - - let file_name = entry.file_name(); - if let Ok(timeline_id) = file_name.parse::() { - let metadata = load_metadata(self.conf, &self.tenant_shard_id, &timeline_id) - .context("failed to load metadata")?; - timelines_to_load.insert(timeline_id, metadata); - } else { - // A file or directory that doesn't look like a timeline ID - warn!("unexpected file or directory in timelines directory: {file_name}"); - } - } - } - - // Sort the array of timeline IDs into tree-order, so that parent comes before - // all its children. - tree_sort_timelines(timelines_to_load, |m| m.ancestor_timeline()).map(|sorted_timelines| { - TenantDirectoryScan { - sorted_timelines_to_load: sorted_timelines, - timelines_to_resume_deletion, - } - }) - } - async fn load_timeline_metadata( self: &Arc, timeline_ids: HashSet, @@ -1404,141 +1251,6 @@ impl Tenant { Ok(timeline_preloads) } - /// - /// Background task to load in-memory data structures for this tenant, from - /// files on disk. Used at pageserver startup. - /// - /// No background tasks are started as part of this routine. - async fn load_local(self: &Arc, ctx: &RequestContext) -> anyhow::Result<()> { - span::debug_assert_current_span_has_tenant_id(); - - debug!("loading tenant task"); - - // Load in-memory state to reflect the local files on disk - // - // Scan the directory, peek into the metadata file of each timeline, and - // collect a list of timelines and their ancestors. - let span = info_span!("blocking"); - let cloned = Arc::clone(self); - - let scan = tokio::task::spawn_blocking(move || { - let _g = span.entered(); - cloned.scan_and_sort_timelines_dir() - }) - .await - .context("load spawn_blocking") - .and_then(|res| res)?; - - // FIXME original collect_timeline_files contained one more check: - // 1. "Timeline has no ancestor and no layer files" - - // Process loadable timelines first - for (timeline_id, local_metadata) in scan.sorted_timelines_to_load { - if let Err(e) = self - .load_local_timeline(timeline_id, local_metadata, ctx, false) - .await - { - match e { - LoadLocalTimelineError::Load(source) => { - return Err(anyhow::anyhow!(source)).with_context(|| { - format!("Failed to load local timeline: {timeline_id}") - }) - } - LoadLocalTimelineError::ResumeDeletion(source) => { - // Make sure resumed deletion wont fail loading for entire tenant. 
- error!("Failed to resume timeline deletion: {source:#}") - } - } - } - } - - // Resume deletion ones with deleted_mark - for (timeline_id, maybe_local_metadata) in scan.timelines_to_resume_deletion { - match maybe_local_metadata { - None => { - // See comment in `scan_and_sort_timelines_dir`. - if let Err(e) = - DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces(self, timeline_id) - .await - { - warn!( - "cannot clean up deleted timeline dir timeline_id: {} error: {:#}", - timeline_id, e - ); - } - } - Some(local_metadata) => { - if let Err(e) = self - .load_local_timeline(timeline_id, local_metadata, ctx, true) - .await - { - match e { - LoadLocalTimelineError::Load(source) => { - // We tried to load deleted timeline, this is a bug. - return Err(anyhow::anyhow!(source).context( - format!("This is a bug. We tried to load deleted timeline which is wrong and loading failed. Timeline: {timeline_id}") - )); - } - LoadLocalTimelineError::ResumeDeletion(source) => { - // Make sure resumed deletion wont fail loading for entire tenant. - error!("Failed to resume timeline deletion: {source:#}") - } - } - } - } - } - } - - trace!("Done"); - - Ok(()) - } - - /// Subroutine of `load_tenant`, to load an individual timeline - /// - /// NB: The parent is assumed to be already loaded! - #[instrument(skip(self, local_metadata, ctx))] - async fn load_local_timeline( - self: &Arc, - timeline_id: TimelineId, - local_metadata: TimelineMetadata, - ctx: &RequestContext, - found_delete_mark: bool, - ) -> Result<(), LoadLocalTimelineError> { - span::debug_assert_current_span_has_tenant_id(); - - let resources = self.build_timeline_resources(timeline_id); - - if found_delete_mark { - // There is no remote client, we found local metadata. - // Continue cleaning up local disk. - DeleteTimelineFlow::resume_deletion( - Arc::clone(self), - timeline_id, - &local_metadata, - None, - self.deletion_queue_client.clone(), - ) - .await - .context("resume deletion") - .map_err(LoadLocalTimelineError::ResumeDeletion)?; - return Ok(()); - } - - let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() { - let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false) - .with_context(|| anyhow::anyhow!("cannot find ancestor timeline {ancestor_timeline_id} for timeline {timeline_id}")) - .map_err(LoadLocalTimelineError::Load)?; - Some(ancestor_timeline) - } else { - None - }; - - self.timeline_init_and_sync(timeline_id, resources, None, local_metadata, ancestor, ctx) - .await - .map_err(LoadLocalTimelineError::Load) - } - pub(crate) fn tenant_shard_id(&self) -> TenantShardId { self.tenant_shard_id } @@ -3787,29 +3499,6 @@ impl Tenant { } } -fn remove_timeline_and_uninit_mark( - timeline_dir: &Utf8Path, - uninit_mark: &Utf8Path, -) -> anyhow::Result<()> { - fs::remove_dir_all(timeline_dir) - .or_else(|e| { - if e.kind() == std::io::ErrorKind::NotFound { - // we can leave the uninit mark without a timeline dir, - // just remove the mark then - Ok(()) - } else { - Err(e) - } - }) - .with_context(|| { - format!("Failed to remove unit marked timeline directory {timeline_dir}") - })?; - fs::remove_file(uninit_mark) - .with_context(|| format!("Failed to remove timeline uninit mark file {uninit_mark}"))?; - - Ok(()) -} - /// Create the cluster temporarily in 'initdbpath' directory inside the repository /// to get bootstrap data for timeline initialization. 
async fn run_initdb( @@ -3969,13 +3658,6 @@ pub(crate) mod harness { } } - #[cfg(test)] - #[derive(Debug)] - enum LoadMode { - Local, - Remote, - } - pub struct TenantHarness { pub conf: &'static PageServerConf, pub tenant_conf: TenantConf, @@ -4057,42 +3739,17 @@ pub(crate) mod harness { pub(crate) async fn load(&self) -> (Arc, RequestContext) { let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); ( - self.try_load(&ctx) + self.do_try_load(&ctx) .await .expect("failed to load test tenant"), ctx, ) } - /// For tests that specifically want to exercise the local load path, which does - /// not use remote storage. - pub(crate) async fn try_load_local( + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] + pub(crate) async fn do_try_load( &self, ctx: &RequestContext, - ) -> anyhow::Result> { - self.do_try_load(ctx, LoadMode::Local).await - } - - /// The 'load' in this function is either a local load or a normal attachment, - pub(crate) async fn try_load(&self, ctx: &RequestContext) -> anyhow::Result> { - // If we have nothing in remote storage, must use load_local instead of attach: attach - // will error out if there are no timelines. - // - // See https://github.com/neondatabase/neon/issues/5456 for how we will eliminate - // this weird state of a Tenant which exists but doesn't have any timelines. - let mode = match self.remote_empty() { - true => LoadMode::Local, - false => LoadMode::Remote, - }; - - self.do_try_load(ctx, mode).await - } - - #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), ?mode))] - async fn do_try_load( - &self, - ctx: &RequestContext, - mode: LoadMode, ) -> anyhow::Result> { let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager)); @@ -4113,17 +3770,10 @@ pub(crate) mod harness { self.deletion_queue.new_client(), )); - match mode { - LoadMode::Local => { - tenant.load_local(ctx).await?; - } - LoadMode::Remote => { - let preload = tenant - .preload(&self.remote_storage, CancellationToken::new()) - .await?; - tenant.attach(Some(preload), SpawnMode::Normal, ctx).await?; - } - } + let preload = tenant + .preload(&self.remote_storage, CancellationToken::new()) + .await?; + tenant.attach(Some(preload), SpawnMode::Normal, ctx).await?; tenant.state.send_replace(TenantState::Active); for timeline in tenant.timelines.lock().unwrap().values() { @@ -4132,31 +3782,6 @@ pub(crate) mod harness { Ok(tenant) } - fn remote_empty(&self) -> bool { - let tenant_path = self.conf.tenant_path(&self.tenant_shard_id); - let remote_tenant_dir = self - .remote_fs_dir - .join(tenant_path.strip_prefix(&self.conf.workdir).unwrap()); - if std::fs::metadata(&remote_tenant_dir).is_err() { - return true; - } - - match std::fs::read_dir(remote_tenant_dir) - .unwrap() - .flatten() - .next() - { - Some(entry) => { - tracing::debug!( - "remote_empty: not empty, found file {}", - entry.file_name().to_string_lossy(), - ); - false - } - None => true, - } - } - pub fn timeline_path(&self, timeline_id: &TimelineId) -> Utf8PathBuf { self.conf.timeline_path(&self.tenant_shard_id, timeline_id) } @@ -4215,7 +3840,6 @@ mod tests { use crate::repository::{Key, Value}; use crate::tenant::harness::*; use crate::DEFAULT_PG_VERSION; - use crate::METADATA_FILE_NAME; use bytes::BytesMut; use hex_literal::hex; use once_cell::sync::Lazy; @@ -4757,60 +4381,6 @@ mod tests { Ok(()) } - #[tokio::test] - async fn corrupt_local_metadata() -> anyhow::Result<()> { - 
const TEST_NAME: &str = "corrupt_metadata"; - let harness = TenantHarness::create(TEST_NAME)?; - let (tenant, ctx) = harness.load().await; - - let tline = tenant - .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) - .await?; - drop(tline); - // so that all uploads finish & we can call harness.try_load() below again - tenant - .shutdown(Default::default(), true) - .instrument(harness.span()) - .await - .ok() - .unwrap(); - drop(tenant); - - // Corrupt local metadata - let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME); - assert!(metadata_path.is_file()); - let mut metadata_bytes = std::fs::read(&metadata_path)?; - assert_eq!(metadata_bytes.len(), 512); - metadata_bytes[8] ^= 1; - std::fs::write(metadata_path, metadata_bytes)?; - - let err = harness.try_load_local(&ctx).await.expect_err("should fail"); - // get all the stack with all .context, not only the last one - let message = format!("{err:#}"); - let expected = "failed to load metadata"; - assert!( - message.contains(expected), - "message '{message}' expected to contain {expected}" - ); - - let mut found_error_message = false; - let mut err_source = err.source(); - while let Some(source) = err_source { - if source.to_string().contains("metadata checksum mismatch") { - found_error_message = true; - break; - } - err_source = source.source(); - } - assert!( - found_error_message, - "didn't find the corrupted metadata error in {}", - message - ); - - Ok(()) - } - #[tokio::test] async fn test_images() -> anyhow::Result<()> { let (tenant, ctx) = TenantHarness::create("test_images")?.load().await; diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index dcbe781f90..1a20a237a7 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -294,17 +294,6 @@ pub enum LoadMetadataError { Decode(#[from] anyhow::Error), } -pub fn load_metadata( - conf: &'static PageServerConf, - tenant_shard_id: &TenantShardId, - timeline_id: &TimelineId, -) -> Result { - let metadata_path = conf.metadata_path(tenant_shard_id, timeline_id); - let metadata_bytes = std::fs::read(metadata_path)?; - - Ok(TimelineMetadata::from_bytes(&metadata_bytes)?) -} - #[cfg(test)] mod tests { use super::*; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 87cf0ac6ea..7f7713a6c6 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4849,7 +4849,7 @@ mod tests { TenantHarness::create("two_layer_eviction_attempts_at_the_same_time").unwrap(); let ctx = any_context(); - let tenant = harness.try_load(&ctx).await.unwrap(); + let tenant = harness.do_try_load(&ctx).await.unwrap(); let timeline = tenant .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) .await From 568bc1fde3f770aa8e1fd0dc8128a7add779a29f Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 16 Feb 2024 11:12:34 +0100 Subject: [PATCH 0192/1571] fix(build): production flamegraphs are useless (#6764) --- Dockerfile | 2 +- README.md | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index c37f94b981..47954a671b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -47,7 +47,7 @@ COPY --chown=nonroot . . # Show build caching stats to check if it was used in the end. # Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats. 
RUN set -e \ - && mold -run cargo build \ + && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \ --bin pg_sni_router \ --bin pageserver \ --bin pagectl \ diff --git a/README.md b/README.md index a0b368fb94..fedb787ac2 100644 --- a/README.md +++ b/README.md @@ -249,6 +249,16 @@ testing locally, it is convenient to run just one set of permutations, like this DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest ``` +## Flamegraphs + +You may find yourself in need of flamegraphs for software in this repository. +You can use [`flamegraph-rs`](https://github.com/flamegraph-rs/flamegraph) or the original [`flamegraph.pl`](https://github.com/brendangregg/FlameGraph). Your choice! + +>[!IMPORTANT] +> If you're using `lld` or `mold`, you need the `--no-rosegment` linker argument. +> It's a [general thing with Rust / lld / mold](https://crbug.com/919499#c16), not specific to this repository. +> See [this PR for further instructions](https://github.com/neondatabase/neon/pull/6764). + ## Documentation [docs](/docs) Contains a top-level overview of all available markdown documentation. From f2e5212fed2d806c7a02e5c7456f24557fba06ac Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 16 Feb 2024 13:00:53 +0000 Subject: [PATCH 0193/1571] storage controller: background reconcile, graceful shutdown, better logging (#6709) ## Problem Now that the storage controller is working end to end, we start burning down the robustness aspects. ## Summary of changes - Add a background task that periodically calls `reconcile_all`. This ensures that if earlier operations couldn't succeed (e.g. because a node was unavailable), we will eventually retry. This is a naive initial implementation can start an unlimited number of reconcile tasks: limiting reconcile concurrency is a later item in #6342 - Add a number of tracing spans in key locations: each background task, each reconciler task. - Add a top level CancellationToken and Gate, and use these to implement a graceful shutdown that waits for tasks to shut down. This is not bulletproof yet, because within these tasks we have remote HTTP calls that aren't wrapped in cancellation/timeouts, but it creates the structure, and if we don't shutdown promptly then k8s will kill us. - To protect shard splits from background reconciliation, expose the `SplitState` in memory and use it to guard any APIs that require an attached tenant. 
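Taken together, the token and the gate give the controller a two-phase shutdown: cancel so that loops stop picking up new work, then close the gate to wait for work already in flight. A condensed sketch of that shape, distilled from the diff below (the `cancel`/`gate` fields, `Gate::enter`/`close` and the `select!` loop all appear in the patch; the periodic work body is elided):

```rust
use std::time::Duration;
use tokio_util::sync::CancellationToken;
use utils::sync::gate::Gate;

struct Service {
    cancel: CancellationToken,
    gate: Gate,
}

impl Service {
    /// Background loop: holds a gate guard for its lifetime, exits once cancelled.
    async fn background_loop(&self) {
        // If the gate is already closed we are shutting down: don't start.
        let Ok(_guard) = self.gate.enter() else { return };
        let mut ticker = tokio::time::interval(Duration::from_secs(20));
        loop {
            tokio::select! {
                _ = ticker.tick() => { /* e.g. reconcile_all() */ }
                _ = self.cancel.cancelled() => return,
            }
        }
    }

    /// Graceful shutdown: stop new work, then wait for in-flight work.
    async fn shutdown(&self) {
        self.cancel.cancel();
        // Returns only once every guard handed out by `enter()` has been dropped.
        self.gate.close().await;
    }
}
```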
--- control_plane/attachment_service/Cargo.toml | 5 + .../attachment_service/src/compute_hook.rs | 8 +- control_plane/attachment_service/src/lib.rs | 6 + control_plane/attachment_service/src/main.rs | 28 +- .../attachment_service/src/reconciler.rs | 5 + .../attachment_service/src/scheduler.rs | 9 +- .../attachment_service/src/service.rs | 282 +++++++++++++----- .../attachment_service/src/tenant_state.rs | 116 ++++--- .../regress/test_pageserver_generations.py | 67 +++-- 9 files changed, 370 insertions(+), 156 deletions(-) diff --git a/control_plane/attachment_service/Cargo.toml b/control_plane/attachment_service/Cargo.toml index 0b93211dbc..ada35295f9 100644 --- a/control_plane/attachment_service/Cargo.toml +++ b/control_plane/attachment_service/Cargo.toml @@ -4,6 +4,11 @@ version = "0.1.0" edition.workspace = true license.workspace = true +[features] +default = [] +# Enables test-only APIs and behaviors +testing = [] + [dependencies] anyhow.workspace = true aws-config.workspace = true diff --git a/control_plane/attachment_service/src/compute_hook.rs b/control_plane/attachment_service/src/compute_hook.rs index bac378d218..b5e90491c6 100644 --- a/control_plane/attachment_service/src/compute_hook.rs +++ b/control_plane/attachment_service/src/compute_hook.rs @@ -155,7 +155,7 @@ impl ComputeHook { for (endpoint_name, endpoint) in &cplane.endpoints { if endpoint.tenant_id == tenant_id && endpoint.status() == EndpointStatus::Running { - tracing::info!("🔁 Reconfiguring endpoint {}", endpoint_name,); + tracing::info!("Reconfiguring endpoint {}", endpoint_name,); endpoint.reconfigure(compute_pageservers.clone()).await?; } } @@ -177,7 +177,7 @@ impl ComputeHook { req }; - tracing::debug!( + tracing::info!( "Sending notify request to {} ({:?})", url, reconfigure_request @@ -266,7 +266,7 @@ impl ComputeHook { /// periods, but we don't retry forever. The **caller** is responsible for handling failures and /// ensuring that they eventually call again to ensure that the compute is eventually notified of /// the proper pageserver nodes for a tenant. - #[tracing::instrument(skip_all, fields(tenant_shard_id, node_id))] + #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), node_id))] pub(super) async fn notify( &self, tenant_shard_id: TenantShardId, @@ -298,7 +298,7 @@ impl ComputeHook { let Some(reconfigure_request) = reconfigure_request else { // The tenant doesn't yet have pageservers for all its shards: we won't notify anything // until it does. 
- tracing::debug!("Tenant isn't yet ready to emit a notification",); + tracing::info!("Tenant isn't yet ready to emit a notification"); return Ok(()); }; diff --git a/control_plane/attachment_service/src/lib.rs b/control_plane/attachment_service/src/lib.rs index 082afb4157..238efdf5a8 100644 --- a/control_plane/attachment_service/src/lib.rs +++ b/control_plane/attachment_service/src/lib.rs @@ -37,6 +37,12 @@ impl std::fmt::Display for Sequence { } } +impl std::fmt::Debug for Sequence { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + impl MonotonicCounter for Sequence { fn cnt_advance(&mut self, v: Sequence) { assert!(*self <= v); diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs index 7229a2517b..b323ae8820 100644 --- a/control_plane/attachment_service/src/main.rs +++ b/control_plane/attachment_service/src/main.rs @@ -15,6 +15,7 @@ use diesel::Connection; use metrics::launch_timestamp::LaunchTimestamp; use std::sync::Arc; use tokio::signal::unix::SignalKind; +use tokio_util::sync::CancellationToken; use utils::auth::{JwtAuth, SwappableJwtAuth}; use utils::logging::{self, LogFormat}; @@ -237,15 +238,23 @@ async fn async_main() -> anyhow::Result<()> { let auth = secrets .public_key .map(|jwt_auth| Arc::new(SwappableJwtAuth::new(jwt_auth))); - let router = make_router(service, auth) + let router = make_router(service.clone(), auth) .build() .map_err(|err| anyhow!(err))?; let router_service = utils::http::RouterService::new(router).unwrap(); - let server = hyper::Server::from_tcp(http_listener)?.serve(router_service); + // Start HTTP server + let server_shutdown = CancellationToken::new(); + let server = hyper::Server::from_tcp(http_listener)? + .serve(router_service) + .with_graceful_shutdown({ + let server_shutdown = server_shutdown.clone(); + async move { + server_shutdown.cancelled().await; + } + }); tracing::info!("Serving on {0}", args.listen); - - tokio::task::spawn(server); + let server_task = tokio::task::spawn(server); // Wait until we receive a signal let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt())?; @@ -266,5 +275,16 @@ async fn async_main() -> anyhow::Result<()> { } } + // Stop HTTP server first, so that we don't have to service requests + // while shutting down Service + server_shutdown.cancel(); + if let Err(e) = server_task.await { + tracing::error!("Error joining HTTP server task: {e}") + } + tracing::info!("Joined HTTP server task"); + + service.shutdown().await; + tracing::info!("Service shutdown complete"); + std::process::exit(0); } diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs index 65bbfa7181..a4fbd80dc3 100644 --- a/control_plane/attachment_service/src/reconciler.rs +++ b/control_plane/attachment_service/src/reconciler.rs @@ -13,6 +13,7 @@ use tokio_util::sync::CancellationToken; use utils::generation::Generation; use utils::id::{NodeId, TimelineId}; use utils::lsn::Lsn; +use utils::sync::gate::GateGuard; use crate::compute_hook::{ComputeHook, NotifyError}; use crate::node::Node; @@ -53,6 +54,10 @@ pub(super) struct Reconciler { /// the tenant is changed. pub(crate) cancel: CancellationToken, + /// Reconcilers are registered with a Gate so that during a graceful shutdown we + /// can wait for all the reconcilers to respond to their cancellation tokens. 
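+    /// The guard is never read; holding it until this `Reconciler` is dropped is what keeps
+    /// `Gate::close()` in `Service::shutdown` from returning while we are still running.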
+ pub(crate) _gate_guard: GateGuard, + /// Access to persistent storage for updating generation numbers pub(crate) persistence: Arc, } diff --git a/control_plane/attachment_service/src/scheduler.rs b/control_plane/attachment_service/src/scheduler.rs index 1966a7ea2a..3b4c9e3464 100644 --- a/control_plane/attachment_service/src/scheduler.rs +++ b/control_plane/attachment_service/src/scheduler.rs @@ -77,12 +77,11 @@ impl Scheduler { return Err(ScheduleError::ImpossibleConstraint); } - for (node_id, count) in &tenant_counts { - tracing::info!("tenant_counts[{node_id}]={count}"); - } - let node_id = tenant_counts.first().unwrap().0; - tracing::info!("scheduler selected node {node_id}"); + tracing::info!( + "scheduler selected node {node_id} (elegible nodes {:?}, exclude: {hard_exclude:?})", + tenant_counts.iter().map(|i| i.0 .0).collect::>() + ); *self.tenant_counts.get_mut(&node_id).unwrap() += 1; Ok(node_id) } diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index 616b74e55d..149cb7f2ba 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -30,6 +30,7 @@ use pageserver_api::{ }; use pageserver_client::mgmt_api; use tokio_util::sync::CancellationToken; +use tracing::instrument; use utils::{ backoff, completion::Barrier, @@ -37,6 +38,7 @@ use utils::{ http::error::ApiError, id::{NodeId, TenantId, TimelineId}, seqwait::SeqWait, + sync::gate::Gate, }; use crate::{ @@ -124,6 +126,12 @@ pub struct Service { config: Config, persistence: Arc, + // Process shutdown will fire this token + cancel: CancellationToken, + + // Background tasks will hold this gate + gate: Gate, + /// This waits for initial reconciliation with pageservers to complete. Until this barrier /// passes, it isn't safe to do any actions that mutate tenants. pub(crate) startup_complete: Barrier, @@ -144,8 +152,9 @@ impl Service { &self.config } - /// TODO: don't allow other API calls until this is done, don't start doing any background housekeeping - /// until this is done. + /// Called once on startup, this function attempts to contact all pageservers to build an up-to-date + /// view of the world, and determine which pageservers are responsive. + #[instrument(skip_all)] async fn startup_reconcile(&self) { // For all tenant shards, a vector of observed states on nodes (where None means // indeterminate, same as in [`ObservedStateLocation`]) @@ -153,9 +162,6 @@ impl Service { let mut nodes_online = HashSet::new(); - // TODO: give Service a cancellation token for clean shutdown - let cancel = CancellationToken::new(); - // TODO: issue these requests concurrently { let nodes = { @@ -190,7 +196,7 @@ impl Service { 1, 5, "Location config listing", - &cancel, + &self.cancel, ) .await; let Some(list_response) = list_response else { @@ -331,7 +337,7 @@ impl Service { let stream = futures::stream::iter(compute_notifications.into_iter()) .map(|(tenant_shard_id, node_id)| { let compute_hook = compute_hook.clone(); - let cancel = cancel.clone(); + let cancel = self.cancel.clone(); async move { if let Err(e) = compute_hook.notify(tenant_shard_id, node_id, &cancel).await { tracing::error!( @@ -368,8 +374,98 @@ impl Service { tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)"); } + /// Long running background task that periodically wakes up and looks for shards that need + /// reconciliation. 
Reconciliation is fallible, so any reconciliation tasks that fail during + /// e.g. a tenant create/attach/migrate must eventually be retried: this task is responsible + /// for those retries. + #[instrument(skip_all)] + async fn background_reconcile(&self) { + self.startup_complete.clone().wait().await; + + const BACKGROUND_RECONCILE_PERIOD: Duration = Duration::from_secs(20); + + let mut interval = tokio::time::interval(BACKGROUND_RECONCILE_PERIOD); + while !self.cancel.is_cancelled() { + tokio::select! { + _ = interval.tick() => { self.reconcile_all(); } + _ = self.cancel.cancelled() => return + } + } + } + + #[instrument(skip_all)] + async fn process_results( + &self, + mut result_rx: tokio::sync::mpsc::UnboundedReceiver, + ) { + loop { + // Wait for the next result, or for cancellation + let result = tokio::select! { + r = result_rx.recv() => { + match r { + Some(result) => {result}, + None => {break;} + } + } + _ = self.cancel.cancelled() => { + break; + } + }; + + tracing::info!( + "Reconcile result for sequence {}, ok={}", + result.sequence, + result.result.is_ok() + ); + let mut locked = self.inner.write().unwrap(); + let Some(tenant) = locked.tenants.get_mut(&result.tenant_shard_id) else { + // A reconciliation result might race with removing a tenant: drop results for + // tenants that aren't in our map. + continue; + }; + + // Usually generation should only be updated via this path, so the max() isn't + // needed, but it is used to handle out-of-band updates via. e.g. test hook. + tenant.generation = std::cmp::max(tenant.generation, result.generation); + + // If the reconciler signals that it failed to notify compute, set this state on + // the shard so that a future [`TenantState::maybe_reconcile`] will try again. + tenant.pending_compute_notification = result.pending_compute_notification; + + match result.result { + Ok(()) => { + for (node_id, loc) in &result.observed.locations { + if let Some(conf) = &loc.conf { + tracing::info!("Updating observed location {}: {:?}", node_id, conf); + } else { + tracing::info!("Setting observed location {} to None", node_id,) + } + } + tenant.observed = result.observed; + tenant.waiter.advance(result.sequence); + } + Err(e) => { + tracing::warn!( + "Reconcile error on tenant {}: {}", + tenant.tenant_shard_id, + e + ); + + // Ordering: populate last_error before advancing error_seq, + // so that waiters will see the correct error after waiting. 
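+                    // (Waiters block until error_waiter reaches their sequence and only then
+                    // read last_error, hence the write-then-advance order below.)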
+ *(tenant.last_error.lock().unwrap()) = format!("{e}"); + tenant.error_waiter.advance(result.sequence); + + for (node_id, o) in result.observed.locations { + tenant.observed.locations.insert(node_id, o); + } + } + } + } + } + pub async fn spawn(config: Config, persistence: Arc) -> anyhow::Result> { - let (result_tx, mut result_rx) = tokio::sync::mpsc::unbounded_channel(); + let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel(); tracing::info!("Loading nodes from database..."); let nodes = persistence.list_nodes().await?; @@ -418,6 +514,7 @@ impl Service { observed: ObservedState::new(), config: serde_json::from_str(&tsp.config).unwrap(), reconciler: None, + splitting: tsp.splitting, waiter: Arc::new(SeqWait::new(Sequence::initial())), error_waiter: Arc::new(SeqWait::new(Sequence::initial())), last_error: Arc::default(), @@ -439,73 +536,35 @@ impl Service { config, persistence, startup_complete: startup_complete.clone(), + cancel: CancellationToken::new(), + gate: Gate::default(), }); let result_task_this = this.clone(); tokio::task::spawn(async move { - while let Some(result) = result_rx.recv().await { - tracing::info!( - "Reconcile result for sequence {}, ok={}", - result.sequence, - result.result.is_ok() - ); - let mut locked = result_task_this.inner.write().unwrap(); - let Some(tenant) = locked.tenants.get_mut(&result.tenant_shard_id) else { - // A reconciliation result might race with removing a tenant: drop results for - // tenants that aren't in our map. - continue; - }; - - // Usually generation should only be updated via this path, so the max() isn't - // needed, but it is used to handle out-of-band updates via. e.g. test hook. - tenant.generation = std::cmp::max(tenant.generation, result.generation); - - // If the reconciler signals that it failed to notify compute, set this state on - // the shard so that a future [`TenantState::maybe_reconcile`] will try again. - tenant.pending_compute_notification = result.pending_compute_notification; - - match result.result { - Ok(()) => { - for (node_id, loc) in &result.observed.locations { - if let Some(conf) = &loc.conf { - tracing::info!( - "Updating observed location {}: {:?}", - node_id, - conf - ); - } else { - tracing::info!("Setting observed location {} to None", node_id,) - } - } - tenant.observed = result.observed; - tenant.waiter.advance(result.sequence); - } - Err(e) => { - tracing::warn!( - "Reconcile error on tenant {}: {}", - tenant.tenant_shard_id, - e - ); - - // Ordering: populate last_error before advancing error_seq, - // so that waiters will see the correct error after waiting. - *(tenant.last_error.lock().unwrap()) = format!("{e}"); - tenant.error_waiter.advance(result.sequence); - - for (node_id, o) in result.observed.locations { - tenant.observed.locations.insert(node_id, o); - } - } - } + // Block shutdown until we're done (we must respect self.cancel) + if let Ok(_gate) = result_task_this.gate.enter() { + result_task_this.process_results(result_rx).await } }); - let startup_reconcile_this = this.clone(); - tokio::task::spawn(async move { - // Block the [`Service::startup_complete`] barrier until we're done - let _completion = startup_completion; + tokio::task::spawn({ + let this = this.clone(); + // We will block the [`Service::startup_complete`] barrier until [`Self::startup_reconcile`] + // is done. 
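+            // Dropping `startup_completion` once startup_reconcile() has finished is what
+            // releases tasks blocked on `startup_complete.wait()`, such as background_reconcile.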
+ let startup_completion = startup_completion.clone(); + async move { + // Block shutdown until we're done (we must respect self.cancel) + let Ok(_gate) = this.gate.enter() else { + return; + }; - startup_reconcile_this.startup_reconcile().await + this.startup_reconcile().await; + + drop(startup_completion); + + this.background_reconcile().await; + } }); Ok(this) @@ -620,6 +679,28 @@ impl Service { attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff)) ); + // Trick the reconciler into not doing anything for this tenant: this helps + // tests that manually configure a tenant on the pagesrever, and then call this + // attach hook: they don't want background reconciliation to modify what they + // did to the pageserver. + #[cfg(feature = "testing")] + { + if let Some(node_id) = attach_req.node_id { + tenant_state.observed.locations = HashMap::from([( + node_id, + ObservedStateLocation { + conf: Some(attached_location_conf( + tenant_state.generation, + &tenant_state.shard, + &tenant_state.config, + )), + }, + )]); + } else { + tenant_state.observed.locations.clear(); + } + } + Ok(AttachHookResponse { gen: attach_req .node_id @@ -868,6 +949,8 @@ impl Service { &compute_hook, &self.config, &self.persistence, + &self.gate, + &self.cancel, ) }) .collect::>(); @@ -970,6 +1053,8 @@ impl Service { &compute_hook, &self.config, &self.persistence, + &self.gate, + &self.cancel, ); if let Some(waiter) = maybe_waiter { waiters.push(waiter); @@ -1059,6 +1144,8 @@ impl Service { } pub(crate) async fn tenant_delete(&self, tenant_id: TenantId) -> Result { + self.ensure_attached_wait(tenant_id).await?; + // TODO: refactor into helper let targets = { let locked = self.inner.read().unwrap(); @@ -1080,8 +1167,6 @@ impl Service { targets }; - // TODO: error out if the tenant is not attached anywhere. - // Phase 1: delete on the pageservers let mut any_pending = false; for (tenant_shard_id, node) in targets { @@ -1417,9 +1502,6 @@ impl Service { let mut policy = None; let mut shard_ident = None; - // TODO: put a cancellation token on Service for clean shutdown - let cancel = CancellationToken::new(); - // A parent shard which will be split struct SplitTarget { parent_id: TenantShardId, @@ -1591,6 +1673,18 @@ impl Service { } } + // Now that I have persisted the splitting state, apply it in-memory. This is infallible, so + // callers may assume that if splitting is set in memory, then it was persisted, and if splitting + // is not set in memory, then it was not persisted. + { + let mut locked = self.inner.write().unwrap(); + for target in &targets { + if let Some(parent_shard) = locked.tenants.get_mut(&target.parent_id) { + parent_shard.splitting = SplitState::Splitting; + } + } + } + // FIXME: we have now committed the shard split state to the database, so any subsequent // failure needs to roll it back. We will later wrap this function in logic to roll back // the split if it fails. @@ -1650,7 +1744,7 @@ impl Service { .complete_shard_split(tenant_id, old_shard_count) .await?; - // Replace all the shards we just split with their children + // Replace all the shards we just split with their children: this phase is infallible. 
let mut response = TenantShardSplitResponse { new_shards: Vec::new(), }; @@ -1698,6 +1792,10 @@ impl Service { child_state.generation = generation; child_state.config = config.clone(); + // The child's TenantState::splitting is intentionally left at the default value of Idle, + // as at this point in the split process we have succeeded and this part is infallible: + // we will never need to do any special recovery from this state. + child_locations.push((child, pageserver)); locked.tenants.insert(child, child_state); @@ -1709,7 +1807,7 @@ impl Service { // Send compute notifications for all the new shards let mut failed_notifications = Vec::new(); for (child_id, child_ps) in child_locations { - if let Err(e) = compute_hook.notify(child_id, child_ps, &cancel).await { + if let Err(e) = compute_hook.notify(child_id, child_ps, &self.cancel).await { tracing::warn!("Failed to update compute of {}->{} during split, proceeding anyway to complete split ({e})", child_id, child_ps); failed_notifications.push(child_id); @@ -1785,6 +1883,8 @@ impl Service { &compute_hook, &self.config, &self.persistence, + &self.gate, + &self.cancel, ) }; @@ -1986,6 +2086,8 @@ impl Service { &compute_hook, &self.config, &self.persistence, + &self.gate, + &self.cancel, ); } } @@ -2007,6 +2109,8 @@ impl Service { &compute_hook, &self.config, &self.persistence, + &self.gate, + &self.cancel, ); } } @@ -2046,6 +2150,8 @@ impl Service { &compute_hook, &self.config, &self.persistence, + &self.gate, + &self.cancel, ) { waiters.push(waiter); } @@ -2057,6 +2163,17 @@ impl Service { let ensure_waiters = { let locked = self.inner.write().unwrap(); + // Check if the tenant is splitting: in this case, even if it is attached, + // we must act as if it is not: this blocks e.g. timeline creation/deletion + // operations during the split. + for (_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id)) { + if !matches!(shard.splitting, SplitState::Idle) { + return Err(ApiError::ResourceUnavailable( + "Tenant shards are currently splitting".into(), + )); + } + } + self.ensure_attached_schedule(locked, tenant_id) .map_err(ApiError::InternalServerError)? }; @@ -2088,8 +2205,25 @@ impl Service { &compute_hook, &self.config, &self.persistence, + &self.gate, + &self.cancel, ) }) .count() } + + pub async fn shutdown(&self) { + // Note that this already stops processing any results from reconciles: so + // we do not expect that our [`TenantState`] objects will reach a neat + // final state. + self.cancel.cancel(); + + // The cancellation tokens in [`crate::reconciler::Reconciler`] are children + // of our cancellation token, so we do not need to explicitly cancel each of + // them. + + // Background tasks and reconcilers hold gate guards: this waits for them all + // to complete. 
+ self.gate.close().await; + } } diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs index 1646ed9fcd..dd753ece3d 100644 --- a/control_plane/attachment_service/src/tenant_state.rs +++ b/control_plane/attachment_service/src/tenant_state.rs @@ -7,16 +7,18 @@ use pageserver_api::{ }; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; +use tracing::{instrument, Instrument}; use utils::{ generation::Generation, id::NodeId, seqwait::{SeqWait, SeqWaitError}, + sync::gate::Gate, }; use crate::{ compute_hook::ComputeHook, node::Node, - persistence::Persistence, + persistence::{split_state::SplitState, Persistence}, reconciler::{attached_location_conf, secondary_location_conf, ReconcileError, Reconciler}, scheduler::{ScheduleError, Scheduler}, service, PlacementPolicy, Sequence, @@ -58,6 +60,11 @@ pub(crate) struct TenantState { /// cancellation token has been fired) pub(crate) reconciler: Option, + /// If a tenant is being split, then all shards with that TenantId will have a + /// SplitState set, this acts as a guard against other operations such as background + /// reconciliation, and timeline creation. + pub(crate) splitting: SplitState, + /// Optionally wait for reconciliation to complete up to a particular /// sequence number. pub(crate) waiter: std::sync::Arc>, @@ -238,6 +245,7 @@ impl TenantState { observed: ObservedState::default(), config: TenantConfig::default(), reconciler: None, + splitting: SplitState::Idle, sequence: Sequence(1), waiter: Arc::new(SeqWait::new(Sequence(0))), error_waiter: Arc::new(SeqWait::new(Sequence(0))), @@ -415,6 +423,8 @@ impl TenantState { false } + #[allow(clippy::too_many_arguments)] + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] pub(crate) fn maybe_reconcile( &mut self, result_tx: tokio::sync::mpsc::UnboundedSender, @@ -422,6 +432,8 @@ impl TenantState { compute_hook: &Arc, service_config: &service::Config, persistence: &Arc, + gate: &Gate, + cancel: &CancellationToken, ) -> Option { // If there are any ambiguous observed states, and the nodes they refer to are available, // we should reconcile to clean them up. @@ -443,6 +455,14 @@ impl TenantState { return None; } + // If we are currently splitting, then never start a reconciler task: the splitting logic + // requires that shards are not interfered with while it runs. Do this check here rather than + // up top, so that we only log this message if we would otherwise have done a reconciliation. + if !matches!(self.splitting, SplitState::Idle) { + tracing::info!("Refusing to reconcile, splitting in progress"); + return None; + } + // Reconcile already in flight for the current sequence? if let Some(handle) = &self.reconciler { if handle.sequence == self.sequence { @@ -460,7 +480,12 @@ impl TenantState { // doing our sequence's work. 
let old_handle = self.reconciler.take(); - let cancel = CancellationToken::new(); + let Ok(gate_guard) = gate.enter() else { + // Shutting down, don't start a reconciler + return None; + }; + + let reconciler_cancel = cancel.child_token(); let mut reconciler = Reconciler { tenant_shard_id: self.tenant_shard_id, shard: self.shard, @@ -471,59 +496,66 @@ impl TenantState { pageservers: pageservers.clone(), compute_hook: compute_hook.clone(), service_config: service_config.clone(), - cancel: cancel.clone(), + _gate_guard: gate_guard, + cancel: reconciler_cancel.clone(), persistence: persistence.clone(), compute_notify_failure: false, }; let reconcile_seq = self.sequence; - tracing::info!("Spawning Reconciler for sequence {}", self.sequence); + tracing::info!(seq=%reconcile_seq, "Spawning Reconciler for sequence {}", self.sequence); let must_notify = self.pending_compute_notification; - let join_handle = tokio::task::spawn(async move { - // Wait for any previous reconcile task to complete before we start - if let Some(old_handle) = old_handle { - old_handle.cancel.cancel(); - if let Err(e) = old_handle.handle.await { - // We can't do much with this other than log it: the task is done, so - // we may proceed with our work. - tracing::error!("Unexpected join error waiting for reconcile task: {e}"); + let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq, + tenant_id=%reconciler.tenant_shard_id.tenant_id, + shard_id=%reconciler.tenant_shard_id.shard_slug()); + let join_handle = tokio::task::spawn( + async move { + // Wait for any previous reconcile task to complete before we start + if let Some(old_handle) = old_handle { + old_handle.cancel.cancel(); + if let Err(e) = old_handle.handle.await { + // We can't do much with this other than log it: the task is done, so + // we may proceed with our work. + tracing::error!("Unexpected join error waiting for reconcile task: {e}"); + } } + + // Early check for cancellation before doing any work + // TODO: wrap all remote API operations in cancellation check + // as well. + if reconciler.cancel.is_cancelled() { + return; + } + + // Attempt to make observed state match intent state + let result = reconciler.reconcile().await; + + // If we know we had a pending compute notification from some previous action, send a notification irrespective + // of whether the above reconcile() did any work + if result.is_ok() && must_notify { + // If this fails we will send the need to retry in [`ReconcileResult::pending_compute_notification`] + reconciler.compute_notify().await.ok(); + } + + result_tx + .send(ReconcileResult { + sequence: reconcile_seq, + result, + tenant_shard_id: reconciler.tenant_shard_id, + generation: reconciler.generation, + observed: reconciler.observed, + pending_compute_notification: reconciler.compute_notify_failure, + }) + .ok(); } - - // Early check for cancellation before doing any work - // TODO: wrap all remote API operations in cancellation check - // as well. 
- if reconciler.cancel.is_cancelled() { - return; - } - - // Attempt to make observed state match intent state - let result = reconciler.reconcile().await; - - // If we know we had a pending compute notification from some previous action, send a notification irrespective - // of whether the above reconcile() did any work - if result.is_ok() && must_notify { - // If this fails we will send the need to retry in [`ReconcileResult::pending_compute_notification`] - reconciler.compute_notify().await.ok(); - } - - result_tx - .send(ReconcileResult { - sequence: reconcile_seq, - result, - tenant_shard_id: reconciler.tenant_shard_id, - generation: reconciler.generation, - observed: reconciler.observed, - pending_compute_notification: reconciler.compute_notify_failure, - }) - .ok(); - }); + .instrument(reconciler_span), + ); self.reconciler = Some(ReconcilerHandle { sequence: self.sequence, handle: join_handle, - cancel, + cancel: reconciler_cancel, }); Some(ReconcilerWaiter { diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index de9f3b6945..1070d06ed0 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -20,6 +20,7 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, + NeonPageserver, PgBin, S3Scrubber, last_flush_lsn_upload, @@ -62,7 +63,7 @@ def generate_uploads_and_deletions( tenant_id: Optional[TenantId] = None, timeline_id: Optional[TimelineId] = None, data: Optional[str] = None, - pageserver_id: Optional[int] = None, + pageserver: NeonPageserver, ): """ Using the environment's default tenant + timeline, generate a load pattern @@ -77,14 +78,16 @@ def generate_uploads_and_deletions( timeline_id = env.initial_timeline assert timeline_id is not None - ps_http = env.pageserver.http_client() + ps_http = pageserver.http_client() with env.endpoints.create_start( - "main", tenant_id=tenant_id, pageserver_id=pageserver_id + "main", tenant_id=tenant_id, pageserver_id=pageserver.id ) as endpoint: if init: endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)") - last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) + last_flush_lsn_upload( + env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id + ) def churn(data): endpoint.safe_psql_many( @@ -105,7 +108,9 @@ def generate_uploads_and_deletions( # We are waiting for uploads as well as local flush, in order to avoid leaving the system # in a state where there are "future layers" in remote storage that will generate deletions # after a restart. 
- last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) + last_flush_lsn_upload( + env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id + ) ps_http.timeline_checkpoint(tenant_id, timeline_id) # Compaction should generate some GC-elegible layers @@ -205,7 +210,7 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): env.neon_cli.create_tenant( tenant_id=env.initial_tenant, conf=TENANT_CONF, timeline_id=env.initial_timeline ) - generate_uploads_and_deletions(env, pageserver_id=env.pageserver.id) + generate_uploads_and_deletions(env, pageserver=env.pageserver) def parse_generation_suffix(key): m = re.match(".+-([0-9a-zA-Z]{8})$", key) @@ -233,7 +238,7 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): # Starting without the override that disabled control_plane_api env.pageserver.start() - generate_uploads_and_deletions(env, pageserver_id=env.pageserver.id, init=False) + generate_uploads_and_deletions(env, pageserver=env.pageserver, init=False) legacy_objects: list[str] = [] suffixed_objects = [] @@ -277,13 +282,16 @@ def test_deferred_deletion(neon_env_builder: NeonEnvBuilder): neon_env_builder.enable_pageserver_remote_storage( RemoteStorageKind.MOCK_S3, ) + neon_env_builder.num_pageservers = 2 env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) - some_other_pageserver = 1234 + attached_to_id = env.attachment_service.locate(env.initial_tenant)[0]["node_id"] + main_pageserver = env.get_pageserver(attached_to_id) + other_pageserver = [p for p in env.pageservers if p.id != attached_to_id][0] - ps_http = env.pageserver.http_client() + ps_http = main_pageserver.http_client() - generate_uploads_and_deletions(env) + generate_uploads_and_deletions(env, pageserver=main_pageserver) # Flush: pending deletions should all complete assert_deletion_queue(ps_http, lambda n: n > 0) @@ -296,14 +304,14 @@ def test_deferred_deletion(neon_env_builder: NeonEnvBuilder): assert timeline["remote_consistent_lsn"] == timeline["remote_consistent_lsn_visible"] assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0 - env.pageserver.allowed_errors.extend( + main_pageserver.allowed_errors.extend( [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] ) # Now advance the generation in the control plane: subsequent validations # from the running pageserver will fail. No more deletions should happen. 
- env.attachment_service.attach_hook_issue(env.initial_tenant, some_other_pageserver) - generate_uploads_and_deletions(env, init=False, pageserver_id=env.pageserver.id) + env.attachment_service.attach_hook_issue(env.initial_tenant, other_pageserver.id) + generate_uploads_and_deletions(env, init=False, pageserver=main_pageserver) assert_deletion_queue(ps_http, lambda n: n > 0) queue_depth_before = get_deletion_queue_depth(ps_http) @@ -355,9 +363,14 @@ def test_deletion_queue_recovery( neon_env_builder.enable_pageserver_remote_storage( RemoteStorageKind.MOCK_S3, ) + neon_env_builder.num_pageservers = 2 env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) - ps_http = env.pageserver.http_client() + attached_to_id = env.attachment_service.locate(env.initial_tenant)[0]["node_id"] + main_pageserver = env.get_pageserver(attached_to_id) + other_pageserver = [p for p in env.pageservers if p.id != attached_to_id][0] + + ps_http = main_pageserver.http_client() failpoints = [ # Prevent deletion lists from being executed, to build up some backlog of deletions @@ -374,7 +387,7 @@ def test_deletion_queue_recovery( ps_http.configure_failpoints(failpoints) - generate_uploads_and_deletions(env) + generate_uploads_and_deletions(env, pageserver=main_pageserver) # There should be entries in the deletion queue assert_deletion_queue(ps_http, lambda n: n > 0) @@ -401,7 +414,7 @@ def test_deletion_queue_recovery( # also wait to see the header hit the disk: this seems paranoid but the race # can really happen on a heavily overloaded test machine. def assert_header_written(): - assert (env.pageserver.workdir / "deletion" / "header-01").exists() + assert (main_pageserver.workdir / "deletion" / "header-01").exists() wait_until(20, 1, assert_header_written) @@ -411,13 +424,13 @@ def test_deletion_queue_recovery( before_restart_depth = get_deletion_queue_validated(ps_http) log.info(f"Restarting pageserver with {before_restart_depth} deletions enqueued") - env.pageserver.stop(immediate=True) + main_pageserver.stop(immediate=True) if keep_attachment == KeepAttachment.LOSE: - some_other_pageserver = 101010 + some_other_pageserver = other_pageserver.id env.attachment_service.attach_hook_issue(env.initial_tenant, some_other_pageserver) - env.pageserver.start() + main_pageserver.start() def assert_deletions_submitted(n: int): assert ps_http.get_metric_value("pageserver_deletion_queue_submitted_total") == n @@ -440,7 +453,7 @@ def test_deletion_queue_recovery( # validated before restart. assert get_deletion_queue_executed(ps_http) == before_restart_depth else: - env.pageserver.allowed_errors.extend([".*Dropping stale deletions.*"]) + main_pageserver.allowed_errors.extend([".*Dropping stale deletions.*"]) # If we lost the attachment, we should have dropped our pre-restart deletions. assert get_deletion_queue_dropped(ps_http) == before_restart_depth @@ -449,8 +462,8 @@ def test_deletion_queue_recovery( assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0 # Restart again - env.pageserver.stop(immediate=True) - env.pageserver.start() + main_pageserver.stop(immediate=True) + main_pageserver.start() # No deletion lists should be recovered: this demonstrates that deletion lists # were cleaned up after being executed or dropped in the previous process lifetime. 
@@ -469,7 +482,7 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): ps_http = env.pageserver.http_client() - generate_uploads_and_deletions(env, pageserver_id=env.pageserver.id) + generate_uploads_and_deletions(env, pageserver=env.pageserver) env.pageserver.allowed_errors.extend( [ @@ -486,7 +499,7 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): # Remember how many validations had happened before the control plane went offline validated = get_deletion_queue_validated(ps_http) - generate_uploads_and_deletions(env, init=False, pageserver_id=env.pageserver.id) + generate_uploads_and_deletions(env, init=False, pageserver=env.pageserver) # The running pageserver should stop progressing deletions time.sleep(10) @@ -502,7 +515,7 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): ) # The pageserver should provide service to clients - generate_uploads_and_deletions(env, init=False, pageserver_id=env.pageserver.id) + generate_uploads_and_deletions(env, init=False, pageserver=env.pageserver) # The pageserver should neither validate nor execute any deletions, it should have # loaded the DeletionLists from before though @@ -523,7 +536,7 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): env.pageserver.stop() # Non-immediate: implicitly checking that shutdown doesn't hang waiting for CP env.pageserver.start() - generate_uploads_and_deletions(env, init=False, pageserver_id=env.pageserver.id) + generate_uploads_and_deletions(env, init=False, pageserver=env.pageserver) ps_http.deletion_queue_flush(execute=True) assert get_deletion_queue_depth(ps_http) == 0 assert get_deletion_queue_validated(ps_http) > 0 @@ -561,7 +574,7 @@ def test_eviction_across_generations(neon_env_builder: NeonEnvBuilder): tenant_id = env.initial_tenant timeline_id = env.initial_timeline - generate_uploads_and_deletions(env) + generate_uploads_and_deletions(env, pageserver=env.pageserver) read_all(env, tenant_id, timeline_id) evict_all_layers(env, tenant_id, timeline_id) From c19625a29ccd3b1433c0351b2146eafe410be129 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 16 Feb 2024 16:50:09 +0200 Subject: [PATCH 0194/1571] Support sharding for compute_ctl (#6787) ## Problem See https://github.com/neondatabase/neon/issues/6786 ## Summary of changes Split connection string in compute.rs when requesting basebackup --- compute_tools/src/compute.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 83db8e09ec..1c5363d048 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -324,7 +324,8 @@ impl ComputeNode { let spec = compute_state.pspec.as_ref().expect("spec must be set"); let start_time = Instant::now(); - let mut config = postgres::Config::from_str(&spec.pageserver_connstr)?; + let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap(); + let mut config = postgres::Config::from_str(shard0_connstr)?; // Use the storage auth token from the config file, if given. // Note: this overrides any password set in the connection string. From 0f3b87d02310e552a57f89a9766288913e4fb90a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 16 Feb 2024 15:53:36 +0100 Subject: [PATCH 0195/1571] Add test for pageserver_directory_entries_count metric (#6767) Adds a simple test to ensure the metric works. The test creates a bunch of relations to activate the metric. 
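For context, the metric under test is a per-tenant gauge exported by the pageserver. The registration itself is not part of this patch, but it has roughly the following shape, sketched here with the plain `prometheus` crate rather than the pageserver's internal `metrics` wrapper (the metric name and the `tenant_id` label come from the test; the help text and function name are illustrative):

```rust
use once_cell::sync::Lazy;
use prometheus::{register_int_gauge_vec, IntGaugeVec};

static DIRECTORY_ENTRIES_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
    register_int_gauge_vec!(
        "pageserver_directory_entries_count",
        "Number of directory entries tracked for the tenant",
        &["tenant_id"]
    )
    .expect("failed to register gauge")
});

// Set whenever the pageserver recomputes a tenant's directory sizes.
fn set_directory_entries_count(tenant_id: &str, count: i64) {
    DIRECTORY_ENTRIES_COUNT
        .with_label_values(&[tenant_id])
        .set(count);
}
```

The Python test below then reads this gauge back through the pageserver's metrics endpoint and asserts that it exceeds a threshold once enough relations exist.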
Follow-up of #6736 --- test_runner/regress/test_tenants.py | 48 +++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index bf317808ee..1e13a2f20f 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -18,6 +18,7 @@ from fixtures.metrics import ( from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, + wait_for_last_flush_lsn, ) from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import timeline_delete_wait_completed, wait_until_tenant_active @@ -414,3 +415,50 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder): # The tenant should end up active wait_until_tenant_active(env.pageserver.http_client(), tenant_id, iterations=10, period=1) + + +def test_pageserver_metrics_many_relations(neon_env_builder: NeonEnvBuilder): + """Test for the directory_entries_count metric""" + + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3) + + env = neon_env_builder.init_start() + ps_http = env.pageserver.http_client() + + endpoint_tenant = env.endpoints.create_start("main", tenant_id=env.initial_tenant) + + # Not sure why but this many tables creates more relations than our limit + TABLE_COUNT = 1600 + COUNT_AT_LEAST_EXPECTED = 5500 + + with endpoint_tenant.connect() as conn: + with conn.cursor() as cur: + # Wrapping begin; commit; around this and the loop below keeps the reproduction + # but it also doesn't have a performance benefit + cur.execute("CREATE TABLE template_tbl(key int primary key, value text);") + for i in range(TABLE_COUNT): + cur.execute(f"CREATE TABLE tbl_{i}(like template_tbl INCLUDING ALL);") + wait_for_last_flush_lsn(env, endpoint_tenant, env.initial_tenant, env.initial_timeline) + endpoint_tenant.stop() + + m = ps_http.get_metrics() + directory_entries_count_metric = m.query_all( + "pageserver_directory_entries_count", {"tenant_id": str(env.initial_tenant)} + ) + + def only_int(samples: List[Sample]) -> int: + assert len(samples) == 1 + return int(samples[0].value) + + directory_entries_count = only_int(directory_entries_count_metric) + + log.info(f"pageserver_directory_entries_count metric value: {directory_entries_count}") + + assert directory_entries_count > COUNT_AT_LEAST_EXPECTED + + timeline_detail = ps_http.timeline_detail(env.initial_tenant, env.initial_timeline) + + counts = timeline_detail["directory_entries_counts"] + assert counts + log.info(f"directory counts: {counts}") + assert counts[2] > COUNT_AT_LEAST_EXPECTED From 59c5b374de8934e76ce7739720fc31547ac9de00 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 16 Feb 2024 15:30:04 +0000 Subject: [PATCH 0196/1571] test_pageserver_max_throughput_getpage_at_latest_lsn: disable on CI (#6785) ## Problem `test_pageserver_max_throughput_getpage_at_latest_lsn` is flaky which makes CI status red pretty frequently. 
`benchmarks` is not a blocking job (doesn't block `deploy`), so having it red might hide failures in other jobs Ref: https://github.com/neondatabase/neon/issues/6724 ## Summary of changes - Disable `test_pageserver_max_throughput_getpage_at_latest_lsn` on CI until it fixed --- .../test_pageserver_max_throughput_getpage_at_latest_lsn.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py index 1ed7e577b9..307b3848db 100644 --- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py +++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py @@ -1,4 +1,5 @@ import json +import os from pathlib import Path from typing import Any, Dict, Tuple @@ -33,6 +34,10 @@ from performance.pageserver.util import ensure_pageserver_ready_for_benchmarking @pytest.mark.timeout( 10000 ) # TODO: this value is just "a really high number"; have this per instance type +@pytest.mark.skipif( + os.getenv("CI", "false") == "true", + reason="The test if flaky on CI: https://github.com/neondatabase/neon/issues/6724", +) def test_pageserver_max_throughput_getpage_at_latest_lsn( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, From 36e11009494609e6c48846013957a0ad2248561d Mon Sep 17 00:00:00 2001 From: Calin Anca <49310764+calinanca99@users.noreply.github.com> Date: Fri, 16 Feb 2024 16:31:54 +0100 Subject: [PATCH 0197/1571] bench_walredo: use tokio multi-threaded runtime (#6743) fixes https://github.com/neondatabase/neon/issues/6648 Co-authored-by: Christian Schwarz --- pageserver/benches/bench_walredo.rs | 177 +++++++++++----------------- 1 file changed, 72 insertions(+), 105 deletions(-) diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index 4837626086..47c8bd75c6 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -6,14 +6,28 @@ //! There are two sets of inputs; `short` and `medium`. They were collected on postgres v14 by //! logging what happens when a sequential scan is requested on a small table, then picking out two //! suitable from logs. +//! +//! +//! Reference data (git blame to see commit) on an i3en.3xlarge +// ```text +//! short/short/1 time: [39.175 µs 39.348 µs 39.536 µs] +//! short/short/2 time: [51.227 µs 51.487 µs 51.755 µs] +//! short/short/4 time: [76.048 µs 76.362 µs 76.674 µs] +//! short/short/8 time: [128.94 µs 129.82 µs 130.74 µs] +//! short/short/16 time: [227.84 µs 229.00 µs 230.28 µs] +//! short/short/32 time: [455.97 µs 457.81 µs 459.90 µs] +//! short/short/64 time: [902.46 µs 904.84 µs 907.32 µs] +//! short/short/128 time: [1.7416 ms 1.7487 ms 1.7561 ms] +//! 
`` -use std::sync::{Arc, Barrier}; +use std::sync::Arc; use bytes::{Buf, Bytes}; use pageserver::{ config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager, }; use pageserver_api::shard::TenantShardId; +use tokio::task::JoinSet; use utils::{id::TenantId, lsn::Lsn}; use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; @@ -39,11 +53,11 @@ fn redo_scenarios(c: &mut Criterion) { .build() .unwrap(); tracing::info!("executing first"); - short().execute(rt.handle(), &manager).unwrap(); + rt.block_on(short().execute(&manager)).unwrap(); tracing::info!("first executed"); } - let thread_counts = [1, 2, 4, 8, 16]; + let thread_counts = [1, 2, 4, 8, 16, 32, 64, 128]; let mut group = c.benchmark_group("short"); group.sampling_mode(criterion::SamplingMode::Flat); @@ -74,114 +88,69 @@ fn redo_scenarios(c: &mut Criterion) { drop(group); } -/// Sets up `threads` number of requesters to `request_redo`, with the given input. +/// Sets up a multi-threaded tokio runtime with default worker thread count, +/// then, spawn `requesters` tasks that repeatedly: +/// - get input from `input_factor()` +/// - call `manager.request_redo()` with their input +/// +/// This stress-tests the scalability of a single walredo manager at high tokio-level concurrency. +/// +/// Using tokio's default worker thread count means the results will differ on machines +/// with different core countrs. We don't care about that, the performance will always +/// be different on different hardware. To compare performance of different software versions, +/// use the same hardware. fn add_multithreaded_walredo_requesters( b: &mut criterion::Bencher, - threads: u32, + nrequesters: usize, manager: &Arc, input_factory: fn() -> Request, ) { - assert_ne!(threads, 0); + assert_ne!(nrequesters, 0); - if threads == 1 { - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .unwrap(); - let handle = rt.handle(); - b.iter_batched_ref( - || Some(input_factory()), - |input| execute_all(input.take(), handle, manager), - criterion::BatchSize::PerIteration, - ); - } else { - let (work_tx, work_rx) = std::sync::mpsc::sync_channel(threads as usize); + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .unwrap(); - let work_rx = std::sync::Arc::new(std::sync::Mutex::new(work_rx)); + let barrier = Arc::new(tokio::sync::Barrier::new(nrequesters + 1)); - let barrier = Arc::new(Barrier::new(threads as usize + 1)); - - let jhs = (0..threads) - .map(|_| { - std::thread::spawn({ - let manager = manager.clone(); - let barrier = barrier.clone(); - let work_rx = work_rx.clone(); - move || { - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .unwrap(); - let handle = rt.handle(); - loop { - // queue up and wait if we want to go another round - if work_rx.lock().unwrap().recv().is_err() { - break; - } - - let input = Some(input_factory()); - - barrier.wait(); - - execute_all(input, handle, &manager).unwrap(); - - barrier.wait(); - } - } - }) - }) - .collect::>(); - - let _jhs = JoinOnDrop(jhs); - - b.iter_batched( - || { - for _ in 0..threads { - work_tx.send(()).unwrap() - } - }, - |()| { - // start the work - barrier.wait(); - - // wait for work to complete - barrier.wait(); - }, - criterion::BatchSize::PerIteration, - ); - - drop(work_tx); + let mut requesters = JoinSet::new(); + for _ in 0..nrequesters { + let _entered = rt.enter(); + let manager = manager.clone(); + let barrier = barrier.clone(); + 
requesters.spawn(async move { + loop { + let input = input_factory(); + barrier.wait().await; + let page = input.execute(&manager).await.unwrap(); + assert_eq!(page.remaining(), 8192); + barrier.wait().await; + } + }); } -} -struct JoinOnDrop(Vec>); + let do_one_iteration = || { + rt.block_on(async { + barrier.wait().await; + // wait for work to complete + barrier.wait().await; + }) + }; -impl Drop for JoinOnDrop { - // it's not really needless because we want join all then check for panicks - #[allow(clippy::needless_collect)] - fn drop(&mut self) { - // first join all - let results = self.0.drain(..).map(|jh| jh.join()).collect::>(); - // then check the results; panicking here is not great, but it does get the message across - // to the user, and sets an exit value. - results.into_iter().try_for_each(|res| res).unwrap(); - } -} + b.iter_batched( + || { + // warmup + do_one_iteration(); + }, + |()| { + // work loop + do_one_iteration(); + }, + criterion::BatchSize::PerIteration, + ); -fn execute_all( - input: I, - handle: &tokio::runtime::Handle, - manager: &PostgresRedoManager, -) -> anyhow::Result<()> -where - I: IntoIterator, -{ - // just fire all requests as fast as possible - input.into_iter().try_for_each(|req| { - let page = req.execute(handle, manager)?; - assert_eq!(page.remaining(), 8192); - anyhow::Ok(()) - }) + rt.block_on(requesters.shutdown()); } criterion_group!(benches, redo_scenarios); @@ -493,11 +462,7 @@ struct Request { } impl Request { - fn execute( - self, - rt: &tokio::runtime::Handle, - manager: &PostgresRedoManager, - ) -> anyhow::Result { + async fn execute(self, manager: &PostgresRedoManager) -> anyhow::Result { let Request { key, lsn, @@ -506,6 +471,8 @@ impl Request { pg_version, } = self; - rt.block_on(manager.request_redo(key, lsn, base_img, records, pg_version)) + manager + .request_redo(key, lsn, base_img, records, pg_version) + .await } } From 5d039c6e9b0662bb81407819540162a06334791c Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 16 Feb 2024 15:53:09 +0000 Subject: [PATCH 0198/1571] libs: add 'generations_api' auth scope (#6783) ## Problem Even if you're not enforcing auth, the JwtAuth middleware barfs on scopes it doesn't know about. Add `generations_api` scope, which was invented in the cloud control plane for the pageserver's /re-attach and /validate upcalls: this will be enforced in storage controller's implementation of these in a later PR. Unfortunately the scope's naming doesn't match the other scope's naming styles, so needs a manual serde decorator to give it an underscore. ## Summary of changes - Add `Scope::GenerationsApi` variant - Update pageserver + safekeeper auth code to print appropriate message if they see it. --- libs/utils/src/auth.rs | 3 +++ pageserver/src/auth.rs | 8 ++++++-- safekeeper/src/auth.rs | 8 ++++++-- test_runner/regress/test_auth.py | 4 +--- 4 files changed, 16 insertions(+), 7 deletions(-) diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index 15c3f2af1b..e031699cfb 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -29,6 +29,9 @@ pub enum Scope { // Should only be used e.g. for status check. // Currently also used for connection from any pageserver to any safekeeper. SafekeeperData, + // The scope used by pageservers in upcalls to storage controller and cloud control plane + #[serde(rename = "generations_api")] + GenerationsApi, } /// JWT payload. 
See docs/authentication.md for the format diff --git a/pageserver/src/auth.rs b/pageserver/src/auth.rs index 2cb661863d..4dee61d3ea 100644 --- a/pageserver/src/auth.rs +++ b/pageserver/src/auth.rs @@ -14,8 +14,12 @@ pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result< } (Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope (Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope - (Scope::SafekeeperData, _) => Err(AuthError( - "SafekeeperData scope makes no sense for Pageserver".into(), + (Scope::SafekeeperData | Scope::GenerationsApi, _) => Err(AuthError( + format!( + "JWT scope '{:?}' is ineligible for Pageserver auth", + claims.scope + ) + .into(), )), } } diff --git a/safekeeper/src/auth.rs b/safekeeper/src/auth.rs index bf4905aaa7..96676be04d 100644 --- a/safekeeper/src/auth.rs +++ b/safekeeper/src/auth.rs @@ -12,8 +12,12 @@ pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result< } Ok(()) } - (Scope::PageServerApi, _) => Err(AuthError( - "PageServerApi scope makes no sense for Safekeeper".into(), + (Scope::PageServerApi | Scope::GenerationsApi, _) => Err(AuthError( + format!( + "JWT scope '{:?}' is ineligible for Safekeeper auth", + claims.scope + ) + .into(), )), (Scope::SafekeeperData, _) => Ok(()), } diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index bd87ff3efd..ea88b5d8e9 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -225,9 +225,7 @@ def test_auth_failures(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): check_pageserver(True, password=pageserver_token) - env.pageserver.allowed_errors.append( - ".*SafekeeperData scope makes no sense for Pageserver.*" - ) + env.pageserver.allowed_errors.append(".*JWT scope '.+' is ineligible for Pageserver auth.*") check_pageserver(False, password=safekeeper_token) def check_safekeeper(expect_success: bool, **conn_kwargs): From ca07fa5f8b37a09d802814d2aebc0bc7f59da529 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 16 Feb 2024 21:26:59 +0100 Subject: [PATCH 0199/1571] per-TenantShard read throttling (#6706) --- Cargo.lock | 15 + Cargo.toml | 1 + control_plane/src/pageserver.rs | 10 + libs/pageserver_api/src/models.rs | 30 ++ libs/utils/Cargo.toml | 1 + pageserver/Cargo.toml | 4 +- .../pagebench/src/cmd/getpage_latest_lsn.rs | 284 +++++++----------- pageserver/src/metrics.rs | 51 ++++ pageserver/src/task_mgr.rs | 1 + pageserver/src/tenant.rs | 70 +++-- pageserver/src/tenant/config.rs | 17 +- pageserver/src/tenant/mgr.rs | 6 +- pageserver/src/tenant/secondary.rs | 4 +- pageserver/src/tenant/tasks.rs | 24 ++ pageserver/src/tenant/throttle.rs | 162 ++++++++++ pageserver/src/tenant/timeline.rs | 32 +- pageserver/src/tenant/timeline/delete.rs | 1 + .../regress/test_attach_tenant_config.py | 8 + 18 files changed, 510 insertions(+), 211 deletions(-) create mode 100644 pageserver/src/tenant/throttle.rs diff --git a/Cargo.lock b/Cargo.lock index 74cd2c8d2c..e7a0d8b965 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1813,6 +1813,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e875f1719c16de097dee81ed675e2d9bb63096823ed3f0ca827b7dea3028bbbb" dependencies = [ "enumset_derive", + "serde", ] [[package]] @@ -2757,6 +2758,17 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" +[[package]] +name = "leaky-bucket" +version = 
"1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8eb491abd89e9794d50f93c8db610a29509123e3fbbc9c8c67a528e9391cd853" +dependencies = [ + "parking_lot 0.12.1", + "tokio", + "tracing", +] + [[package]] name = "libc" version = "0.2.150" @@ -3448,6 +3460,7 @@ name = "pageserver" version = "0.1.0" dependencies = [ "anyhow", + "arc-swap", "async-compression", "async-stream", "async-trait", @@ -3475,6 +3488,7 @@ dependencies = [ "humantime-serde", "hyper", "itertools", + "leaky-bucket", "md5", "metrics", "nix 0.27.1", @@ -6347,6 +6361,7 @@ dependencies = [ "hex-literal", "hyper", "jsonwebtoken", + "leaky-bucket", "metrics", "nix 0.27.1", "once_cell", diff --git a/Cargo.toml b/Cargo.toml index 8952f7627f..98fbc9c4f4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -97,6 +97,7 @@ ipnet = "2.9.0" itertools = "0.10" jsonwebtoken = "9" lasso = "0.7" +leaky-bucket = "1.0.1" libc = "0.2" md5 = "0.7.0" memoffset = "0.8" diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index a1b0ba4252..8dd86bad96 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -400,6 +400,11 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'lazy_slru_download' as bool")?, + timeline_get_throttle: settings + .remove("timeline_get_throttle") + .map(serde_json::from_str) + .transpose() + .context("parse `timeline_get_throttle` from json")?, }; if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") @@ -505,6 +510,11 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'lazy_slru_download' as bool")?, + timeline_get_throttle: settings + .remove("timeline_get_throttle") + .map(serde_json::from_str) + .transpose() + .context("parse `timeline_get_throttle` from json")?, } }; diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index db2292072c..d546cb5c54 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -283,6 +283,7 @@ pub struct TenantConfig { pub gc_feedback: Option, pub heatmap_period: Option, pub lazy_slru_download: Option, + pub timeline_get_throttle: Option, } #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] @@ -309,6 +310,35 @@ pub struct EvictionPolicyLayerAccessThreshold { pub threshold: Duration, } +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] +pub struct ThrottleConfig { + pub task_kinds: Vec, // TaskKind + pub initial: usize, + #[serde(with = "humantime_serde")] + pub refill_interval: Duration, + pub refill_amount: NonZeroUsize, + pub max: usize, + pub fair: bool, +} + +impl ThrottleConfig { + pub fn disabled() -> Self { + Self { + task_kinds: vec![], // effectively disables the throttle + // other values don't matter with emtpy `task_kinds`. + initial: 0, + refill_interval: Duration::from_millis(1), + refill_amount: NonZeroUsize::new(1).unwrap(), + max: 1, + fair: true, + } + } + /// The requests per second allowed by the given config. + pub fn steady_rps(&self) -> f64 { + (self.refill_amount.get() as f64) / (self.refill_interval.as_secs_f64()) / 1e3 + } +} + /// A flattened analog of a `pagesever::tenant::LocationMode`, which /// lists out all possible states (and the virtual "Detached" state) /// in a flat form rather than using rust-style enums. 
diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 706b7a3187..983e94d963 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -25,6 +25,7 @@ hyper = { workspace = true, features = ["full"] } fail.workspace = true futures = { workspace = true} jsonwebtoken.workspace = true +leaky-bucket.workspace = true nix.workspace = true once_cell.workspace = true pin-project-lite.workspace = true diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 95d558bb7b..eeee2055c2 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -12,6 +12,7 @@ testing = ["fail/failpoints"] [dependencies] anyhow.workspace = true +arc-swap.workspace = true async-compression.workspace = true async-stream.workspace = true async-trait.workspace = true @@ -35,6 +36,7 @@ humantime.workspace = true humantime-serde.workspace = true hyper.workspace = true itertools.workspace = true +leaky-bucket.workspace = true md5.workspace = true nix.workspace = true # hack to get the number of worker threads tokio uses @@ -82,7 +84,7 @@ workspace_hack.workspace = true reqwest.workspace = true rpds.workspace = true enum-map.workspace = true -enumset.workspace = true +enumset = { workspace = true, features = ["serde"]} strum.workspace = true strum_macros.workspace = true diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index 647f571e59..2838511a77 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -1,6 +1,5 @@ use anyhow::Context; use camino::Utf8PathBuf; -use futures::future::join_all; use pageserver_api::key::{is_rel_block_key, key_to_rel_block, Key}; use pageserver_api::keyspace::KeySpaceAccum; use pageserver_api::models::PagestreamGetPageRequest; @@ -10,11 +9,10 @@ use utils::id::TenantTimelineId; use utils::lsn::Lsn; use rand::prelude::*; -use tokio::sync::Barrier; use tokio::task::JoinSet; -use tracing::{info, instrument}; +use tracing::info; -use std::collections::{HashMap, HashSet}; +use std::collections::HashSet; use std::future::Future; use std::num::NonZeroUsize; use std::pin::Pin; @@ -38,8 +36,12 @@ pub(crate) struct Args { num_clients: NonZeroUsize, #[clap(long)] runtime: Option, + /// Each client sends requests at the given rate. + /// + /// If a request takes too long and we should be issuing a new request already, + /// we skip that request and account it as `MISSED`. #[clap(long)] - per_target_rate_limit: Option, + per_client_rate: Option, /// Probability for sending `latest=true` in the request (uniform distribution). 
#[clap(long, default_value = "1")] req_latest_probability: f64, @@ -61,12 +63,16 @@ pub(crate) struct Args { #[derive(Debug, Default)] struct LiveStats { completed_requests: AtomicU64, + missed: AtomicU64, } impl LiveStats { - fn inc(&self) { + fn request_done(&self) { self.completed_requests.fetch_add(1, Ordering::Relaxed); } + fn missed(&self, n: u64) { + self.missed.fetch_add(n, Ordering::Relaxed); + } } #[derive(Clone, serde::Serialize, serde::Deserialize)] @@ -220,13 +226,12 @@ async fn main_impl( let live_stats = Arc::new(LiveStats::default()); - let num_client_tasks = args.num_clients.get() * timelines.len(); let num_live_stats_dump = 1; - let num_work_sender_tasks = 1; + let num_work_sender_tasks = args.num_clients.get() * timelines.len(); let num_main_impl = 1; let start_work_barrier = Arc::new(tokio::sync::Barrier::new( - num_client_tasks + num_live_stats_dump + num_work_sender_tasks + num_main_impl, + num_live_stats_dump + num_work_sender_tasks + num_main_impl, )); tokio::spawn({ @@ -238,10 +243,12 @@ async fn main_impl( let start = std::time::Instant::now(); tokio::time::sleep(std::time::Duration::from_secs(1)).await; let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed); + let missed = stats.missed.swap(0, Ordering::Relaxed); let elapsed = start.elapsed(); info!( - "RPS: {:.0}", - completed_requests as f64 / elapsed.as_secs_f64() + "RPS: {:.0} MISSED: {:.0}", + completed_requests as f64 / elapsed.as_secs_f64(), + missed as f64 / elapsed.as_secs_f64() ); } } @@ -249,127 +256,105 @@ async fn main_impl( let cancel = CancellationToken::new(); - let mut work_senders: HashMap = HashMap::new(); - let mut tasks = Vec::new(); + let rps_period = args + .per_client_rate + .map(|rps_limit| Duration::from_secs_f64(1.0 / (rps_limit as f64))); + let make_worker: &dyn Fn(WorkerId) -> Pin>> = &|worker_id| { + let live_stats = live_stats.clone(); + let start_work_barrier = start_work_barrier.clone(); + let ranges: Vec = all_ranges + .iter() + .filter(|r| r.timeline == worker_id.timeline) + .cloned() + .collect(); + let weights = + rand::distributions::weighted::WeightedIndex::new(ranges.iter().map(|v| v.len())) + .unwrap(); + + let cancel = cancel.clone(); + Box::pin(async move { + let client = + pageserver_client::page_service::Client::new(args.page_service_connstring.clone()) + .await + .unwrap(); + let mut client = client + .pagestream(worker_id.timeline.tenant_id, worker_id.timeline.timeline_id) + .await + .unwrap(); + + start_work_barrier.wait().await; + let client_start = Instant::now(); + let mut ticks_processed = 0; + while !cancel.is_cancelled() { + // Detect if a request took longer than the RPS rate + if let Some(period) = &rps_period { + let periods_passed_until_now = + usize::try_from(client_start.elapsed().as_micros() / period.as_micros()) + .unwrap(); + + if periods_passed_until_now > ticks_processed { + live_stats.missed((periods_passed_until_now - ticks_processed) as u64); + } + ticks_processed = periods_passed_until_now; + } + + let start = Instant::now(); + let req = { + let mut rng = rand::thread_rng(); + let r = &ranges[weights.sample(&mut rng)]; + let key: i128 = rng.gen_range(r.start..r.end); + let key = Key::from_i128(key); + assert!(is_rel_block_key(&key)); + let (rel_tag, block_no) = + key_to_rel_block(key).expect("we filter non-rel-block keys out above"); + PagestreamGetPageRequest { + latest: rng.gen_bool(args.req_latest_probability), + lsn: r.timeline_lsn, + rel: rel_tag, + blkno: block_no, + } + }; + client.getpage(req).await.unwrap(); + let 
end = Instant::now(); + live_stats.request_done(); + ticks_processed += 1; + STATS.with(|stats| { + stats + .borrow() + .lock() + .unwrap() + .observe(end.duration_since(start)) + .unwrap(); + }); + + if let Some(period) = &rps_period { + let next_at = client_start + + Duration::from_micros( + (ticks_processed) as u64 * u64::try_from(period.as_micros()).unwrap(), + ); + tokio::time::sleep_until(next_at.into()).await; + } + } + }) + }; + + info!("spawning workers"); + let mut workers = JoinSet::new(); for timeline in timelines.iter().cloned() { for num_client in 0..args.num_clients.get() { - let (sender, receiver) = tokio::sync::mpsc::channel(10); // TODO: not sure what the implications of this are let worker_id = WorkerId { timeline, num_client, }; - work_senders.insert(worker_id, sender); - tasks.push(tokio::spawn(client( - args, - worker_id, - Arc::clone(&start_work_barrier), - receiver, - Arc::clone(&live_stats), - cancel.clone(), - ))); + workers.spawn(make_worker(worker_id)); } } - - let work_sender: Pin>> = { - let start_work_barrier = start_work_barrier.clone(); - let cancel = cancel.clone(); - match args.per_target_rate_limit { - None => Box::pin(async move { - let weights = rand::distributions::weighted::WeightedIndex::new( - all_ranges.iter().map(|v| v.len()), - ) - .unwrap(); - - start_work_barrier.wait().await; - - while !cancel.is_cancelled() { - let (timeline, req) = { - let mut rng = rand::thread_rng(); - let r = &all_ranges[weights.sample(&mut rng)]; - let key: i128 = rng.gen_range(r.start..r.end); - let key = Key::from_i128(key); - let (rel_tag, block_no) = - key_to_rel_block(key).expect("we filter non-rel-block keys out above"); - ( - WorkerId { - timeline: r.timeline, - num_client: rng.gen_range(0..args.num_clients.get()), - }, - PagestreamGetPageRequest { - latest: rng.gen_bool(args.req_latest_probability), - lsn: r.timeline_lsn, - rel: rel_tag, - blkno: block_no, - }, - ) - }; - let sender = work_senders.get(&timeline).unwrap(); - // TODO: what if this blocks? 
- if sender.send(req).await.is_err() { - assert!(cancel.is_cancelled(), "client has gone away unexpectedly"); - } - } - }), - Some(rps_limit) => Box::pin(async move { - let period = Duration::from_secs_f64(1.0 / (rps_limit as f64)); - let make_task: &dyn Fn(WorkerId) -> Pin>> = - &|worker_id| { - let sender = work_senders.get(&worker_id).unwrap(); - let ranges: Vec = all_ranges - .iter() - .filter(|r| r.timeline == worker_id.timeline) - .cloned() - .collect(); - let weights = rand::distributions::weighted::WeightedIndex::new( - ranges.iter().map(|v| v.len()), - ) - .unwrap(); - - let cancel = cancel.clone(); - Box::pin(async move { - let mut ticker = tokio::time::interval(period); - ticker.set_missed_tick_behavior( - /* TODO review this choice */ - tokio::time::MissedTickBehavior::Burst, - ); - while !cancel.is_cancelled() { - ticker.tick().await; - let req = { - let mut rng = rand::thread_rng(); - let r = &ranges[weights.sample(&mut rng)]; - let key: i128 = rng.gen_range(r.start..r.end); - let key = Key::from_i128(key); - assert!(is_rel_block_key(&key)); - let (rel_tag, block_no) = key_to_rel_block(key) - .expect("we filter non-rel-block keys out above"); - PagestreamGetPageRequest { - latest: rng.gen_bool(args.req_latest_probability), - lsn: r.timeline_lsn, - rel: rel_tag, - blkno: block_no, - } - }; - if sender.send(req).await.is_err() { - assert!( - cancel.is_cancelled(), - "client has gone away unexpectedly" - ); - } - } - }) - }; - - let tasks: Vec<_> = work_senders.keys().map(|tl| make_task(*tl)).collect(); - - start_work_barrier.wait().await; - - join_all(tasks).await; - }), + let workers = async move { + while let Some(res) = workers.join_next().await { + res.unwrap(); } }; - let work_sender_task = tokio::spawn(work_sender); - info!("waiting for everything to become ready"); start_work_barrier.wait().await; info!("work started"); @@ -377,20 +362,13 @@ async fn main_impl( tokio::time::sleep(runtime.into()).await; info!("runtime over, signalling cancellation"); cancel.cancel(); - work_sender_task.await.unwrap(); + workers.await; info!("work sender exited"); } else { - work_sender_task.await.unwrap(); + workers.await; unreachable!("work sender never terminates"); } - info!("joining clients"); - for t in tasks { - t.await.unwrap(); - } - - info!("all clients stopped"); - let output = Output { total: { let mut agg_stats = request_stats::Stats::new(); @@ -407,49 +385,3 @@ async fn main_impl( anyhow::Ok(()) } - -#[instrument(skip_all)] -async fn client( - args: &'static Args, - id: WorkerId, - start_work_barrier: Arc, - mut work: tokio::sync::mpsc::Receiver, - live_stats: Arc, - cancel: CancellationToken, -) { - let WorkerId { - timeline, - num_client: _, - } = id; - let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone()) - .await - .unwrap(); - let mut client = client - .pagestream(timeline.tenant_id, timeline.timeline_id) - .await - .unwrap(); - - let do_requests = async { - start_work_barrier.wait().await; - while let Some(req) = work.recv().await { - let start = Instant::now(); - client - .getpage(req) - .await - .with_context(|| format!("getpage for {timeline}")) - .unwrap(); - let elapsed = start.elapsed(); - live_stats.inc(); - STATS.with(|stats| { - stats.borrow().lock().unwrap().observe(elapsed).unwrap(); - }); - } - }; - tokio::select! 
{ - res = do_requests => { res }, - _ = cancel.cancelled() => { - // fallthrough to shutdown - } - } - client.shutdown().await; -} diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index c2b1eafc3a..a0fda39605 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -2496,6 +2496,56 @@ pub mod tokio_epoll_uring { } } +pub(crate) mod tenant_throttling { + use metrics::{register_int_counter_vec, IntCounter}; + use once_cell::sync::Lazy; + + use crate::tenant::{self, throttle::Metric}; + + pub(crate) struct TimelineGet { + wait_time: IntCounter, + count: IntCounter, + } + + pub(crate) static TIMELINE_GET: Lazy = Lazy::new(|| { + static WAIT_USECS: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_tenant_throttling_wait_usecs_sum_global", + "Sum of microseconds that tenants spent waiting for a tenant throttle of a given kind.", + &["kind"] + ) + .unwrap() + }); + + static WAIT_COUNT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_tenant_throttling_count_global", + "Count of tenant throttlings, by kind of throttle.", + &["kind"] + ) + .unwrap() + }); + + let kind = "timeline_get"; + TimelineGet { + wait_time: WAIT_USECS.with_label_values(&[kind]), + count: WAIT_COUNT.with_label_values(&[kind]), + } + }); + + impl Metric for &'static TimelineGet { + #[inline(always)] + fn observe_throttling( + &self, + tenant::throttle::Observation { wait_time }: &tenant::throttle::Observation, + ) { + let val = u64::try_from(wait_time.as_micros()).unwrap(); + self.wait_time.inc_by(val); + self.count.inc(); + } + } +} + pub fn preinitialize_metrics() { // Python tests need these and on some we do alerting. // @@ -2557,4 +2607,5 @@ pub fn preinitialize_metrics() { // Custom Lazy::force(&RECONSTRUCT_TIME); + Lazy::force(&tenant_throttling::TIMELINE_GET); } diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 6317b0a7ae..adaa55c179 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -188,6 +188,7 @@ task_local! { serde::Serialize, serde::Deserialize, strum_macros::IntoStaticStr, + strum_macros::EnumString, )] pub enum TaskKind { // Pageserver startup, i.e., `main` diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index e2d66711c8..a4d3a4142a 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -167,6 +167,8 @@ pub(crate) mod timeline; pub mod size; +pub(crate) mod throttle; + pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline}; @@ -305,6 +307,11 @@ pub struct Tenant { // Users of the Tenant such as the page service must take this Gate to avoid // trying to use a Tenant which is shutting down. pub(crate) gate: Gate, + + /// Throttle applied at the top of [`Timeline::get`]. + /// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance. + pub(crate) timeline_get_throttle: + Arc>, } impl std::fmt::Debug for Tenant { @@ -990,6 +997,7 @@ impl Tenant { TimelineResources { remote_client: Some(remote_client), deletion_queue_client: self.deletion_queue_client.clone(), + timeline_get_throttle: self.timeline_get_throttle.clone(), }, ctx, ) @@ -2075,7 +2083,7 @@ impl Tenant { }; // We have a pageserver TenantConf, we need the API-facing TenantConfig. 
- let tenant_config: models::TenantConfig = conf.tenant_conf.into(); + let tenant_config: models::TenantConfig = conf.tenant_conf.clone().into(); models::LocationConfig { mode: location_config_mode, @@ -2209,93 +2217,93 @@ where impl Tenant { pub fn tenant_specific_overrides(&self) -> TenantConfOpt { - self.tenant_conf.read().unwrap().tenant_conf + self.tenant_conf.read().unwrap().tenant_conf.clone() } pub fn effective_config(&self) -> TenantConf { self.tenant_specific_overrides() - .merge(self.conf.default_tenant_conf) + .merge(self.conf.default_tenant_conf.clone()) } pub fn get_checkpoint_distance(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); tenant_conf .checkpoint_distance .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance) } pub fn get_checkpoint_timeout(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); tenant_conf .checkpoint_timeout .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout) } pub fn get_compaction_target_size(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); tenant_conf .compaction_target_size .unwrap_or(self.conf.default_tenant_conf.compaction_target_size) } pub fn get_compaction_period(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); tenant_conf .compaction_period .unwrap_or(self.conf.default_tenant_conf.compaction_period) } pub fn get_compaction_threshold(&self) -> usize { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); tenant_conf .compaction_threshold .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } pub fn get_gc_horizon(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); tenant_conf .gc_horizon .unwrap_or(self.conf.default_tenant_conf.gc_horizon) } pub fn get_gc_period(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); tenant_conf .gc_period .unwrap_or(self.conf.default_tenant_conf.gc_period) } pub fn get_image_creation_threshold(&self) -> usize { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); tenant_conf .image_creation_threshold .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold) } pub fn get_pitr_interval(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); tenant_conf .pitr_interval .unwrap_or(self.conf.default_tenant_conf.pitr_interval) } pub fn get_trace_read_requests(&self) -> bool { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); tenant_conf .trace_read_requests .unwrap_or(self.conf.default_tenant_conf.trace_read_requests) } pub fn get_min_resident_size_override(&self) -> Option { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = 
self.tenant_conf.read().unwrap().tenant_conf.clone(); tenant_conf .min_resident_size_override .or(self.conf.default_tenant_conf.min_resident_size_override) } pub fn get_heatmap_period(&self) -> Option { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); let heatmap_period = tenant_conf .heatmap_period .unwrap_or(self.conf.default_tenant_conf.heatmap_period); @@ -2308,6 +2316,7 @@ impl Tenant { pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) { self.tenant_conf.write().unwrap().tenant_conf = new_tenant_conf; + self.tenant_conf_updated(); // Don't hold self.timelines.lock() during the notifies. // There's no risk of deadlock right now, but there could be if we consolidate // mutexes in struct Timeline in the future. @@ -2319,6 +2328,7 @@ impl Tenant { pub(crate) fn set_new_location_config(&self, new_conf: AttachedTenantConf) { *self.tenant_conf.write().unwrap() = new_conf; + self.tenant_conf_updated(); // Don't hold self.timelines.lock() during the notifies. // There's no risk of deadlock right now, but there could be if we consolidate // mutexes in struct Timeline in the future. @@ -2328,6 +2338,24 @@ impl Tenant { } } + fn get_timeline_get_throttle_config( + psconf: &'static PageServerConf, + overrides: &TenantConfOpt, + ) -> throttle::Config { + overrides + .timeline_get_throttle + .clone() + .unwrap_or(psconf.default_tenant_conf.timeline_get_throttle.clone()) + } + + pub(crate) fn tenant_conf_updated(&self) { + let conf = { + let guard = self.tenant_conf.read().unwrap(); + Self::get_timeline_get_throttle_config(self.conf, &guard.tenant_conf) + }; + self.timeline_get_throttle.reconfigure(conf) + } + /// Helper function to create a new Timeline struct. /// /// The returned Timeline is in Loading state. The caller is responsible for @@ -2454,7 +2482,6 @@ impl Tenant { // using now here is good enough approximation to catch tenants with really long // activation times. 
constructed_at: Instant::now(), - tenant_conf: Arc::new(RwLock::new(attached_conf)), timelines: Mutex::new(HashMap::new()), timelines_creating: Mutex::new(HashSet::new()), gc_cs: tokio::sync::Mutex::new(()), @@ -2469,6 +2496,11 @@ impl Tenant { delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())), cancel: CancellationToken::default(), gate: Gate::default(), + timeline_get_throttle: Arc::new(throttle::Throttle::new( + Tenant::get_timeline_get_throttle_config(conf, &attached_conf.tenant_conf), + &crate::metrics::tenant_throttling::TIMELINE_GET, + )), + tenant_conf: Arc::new(RwLock::new(attached_conf)), } } @@ -3224,6 +3256,7 @@ impl Tenant { TimelineResources { remote_client, deletion_queue_client: self.deletion_queue_client.clone(), + timeline_get_throttle: self.timeline_get_throttle.clone(), } } @@ -3495,7 +3528,7 @@ impl Tenant { } pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt { - self.tenant_conf.read().unwrap().tenant_conf + self.tenant_conf.read().unwrap().tenant_conf.clone() } } @@ -3654,6 +3687,7 @@ pub(crate) mod harness { gc_feedback: Some(tenant_conf.gc_feedback), heatmap_period: Some(tenant_conf.heatmap_period), lazy_slru_download: Some(tenant_conf.lazy_slru_download), + timeline_get_throttle: Some(tenant_conf.timeline_get_throttle), } } } @@ -3757,7 +3791,7 @@ pub(crate) mod harness { TenantState::Loading, self.conf, AttachedTenantConf::try_from(LocationConf::attached_single( - TenantConfOpt::from(self.tenant_conf), + TenantConfOpt::from(self.tenant_conf.clone()), self.generation, &ShardParameters::default(), )) diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 961decd247..5c88d30caf 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -9,8 +9,8 @@ //! may lead to a data loss. //! use anyhow::bail; -use pageserver_api::models; use pageserver_api::models::EvictionPolicy; +use pageserver_api::models::{self, ThrottleConfig}; use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}; use serde::de::IntoDeserializer; use serde::{Deserialize, Serialize}; @@ -285,7 +285,7 @@ impl Default for LocationConf { /// /// For storing and transmitting individual tenant's configuration, see /// TenantConfOpt. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct TenantConf { // Flush out an inmemory layer, if it's holding WAL older than this // This puts a backstop on how much WAL needs to be re-digested if the @@ -348,11 +348,13 @@ pub struct TenantConf { /// If true then SLRU segments are dowloaded on demand, if false SLRU segments are included in basebackup pub lazy_slru_download: bool, + + pub timeline_get_throttle: pageserver_api::models::ThrottleConfig, } /// Same as TenantConf, but this struct preserves the information about /// which parameters are set and which are not. 
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)] +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)] pub struct TenantConfOpt { #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] @@ -437,6 +439,9 @@ pub struct TenantConfOpt { #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] pub lazy_slru_download: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + pub timeline_get_throttle: Option, } impl TenantConfOpt { @@ -485,6 +490,10 @@ impl TenantConfOpt { lazy_slru_download: self .lazy_slru_download .unwrap_or(global_conf.lazy_slru_download), + timeline_get_throttle: self + .timeline_get_throttle + .clone() + .unwrap_or(global_conf.timeline_get_throttle), } } } @@ -524,6 +533,7 @@ impl Default for TenantConf { gc_feedback: false, heatmap_period: Duration::ZERO, lazy_slru_download: false, + timeline_get_throttle: crate::tenant::throttle::Config::disabled(), } } } @@ -596,6 +606,7 @@ impl From for models::TenantConfig { gc_feedback: value.gc_feedback, heatmap_period: value.heatmap_period.map(humantime), lazy_slru_download: value.lazy_slru_download, + timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from), } } } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 90c442464f..b7f4723702 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -484,7 +484,7 @@ pub async fn init_tenant_mgr( TenantSlot::Secondary(SecondaryTenant::new( tenant_shard_id, location_conf.shard, - location_conf.tenant_conf, + location_conf.tenant_conf.clone(), &SecondaryLocationConfig { warm: false }, )), ); @@ -805,7 +805,7 @@ pub(crate) async fn set_new_tenant_config( // API to use is the location_config/ endpoint, which lets the caller provide // the full LocationConf. 
let location_conf = LocationConf::attached_single( - new_tenant_conf, + new_tenant_conf.clone(), tenant.generation, &ShardParameters::default(), ); @@ -1466,7 +1466,7 @@ impl TenantManager { attach_mode: AttachmentMode::Single, }), shard: child_shard_identity, - tenant_conf: parent_tenant_conf, + tenant_conf: parent_tenant_conf.clone(), }; self.upsert_location( diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index 2c8ced4eb7..c466ac0c24 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -133,7 +133,7 @@ impl SecondaryTenant { } pub(crate) fn set_tenant_conf(&self, config: &TenantConfOpt) { - *(self.tenant_conf.lock().unwrap()) = *config; + *(self.tenant_conf.lock().unwrap()) = config.clone(); } /// For API access: generate a LocationConfig equivalent to the one that would be used to @@ -144,7 +144,7 @@ impl SecondaryTenant { let conf = models::LocationConfigSecondary { warm: conf.warm }; - let tenant_conf = *self.tenant_conf.lock().unwrap(); + let tenant_conf = self.tenant_conf.lock().unwrap().clone(); models::LocationConfig { mode: models::LocationConfigMode::Secondary, generation: None, diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 950cc46e71..45ce6c9381 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -9,6 +9,7 @@ use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics::TENANT_TASK_EVENTS; use crate::task_mgr; use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; +use crate::tenant::throttle::Stats; use crate::tenant::timeline::CompactionError; use crate::tenant::{Tenant, TenantState}; use tokio_util::sync::CancellationToken; @@ -139,6 +140,8 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { // How many errors we have seen consequtively let mut error_run_count = 0; + let mut last_throttle_flag_reset_at = Instant::now(); + TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download); @@ -203,6 +206,27 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { walredo_mgr.maybe_quiesce(period * 10); } + // TODO: move this (and walredo quiesce) to a separate task that isn't affected by the back-off, + // so we get some upper bound guarantee on when walredo quiesce / this throttling reporting here happens. 
+ info_span!(parent: None, "timeline_get_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| { + let now = Instant::now(); + let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now); + let Stats { count_accounted, count_throttled, sum_throttled_usecs } = tenant.timeline_get_throttle.reset_stats(); + if count_throttled == 0 { + return; + } + let allowed_rps = tenant.timeline_get_throttle.steady_rps(); + let delta = now - prev; + warn!( + n_seconds=%format_args!("{:.3}", + delta.as_secs_f64()), + count_accounted, + count_throttled, + sum_throttled_usecs, + allowed_rps=%format_args!("{allowed_rps:.0}"), + "shard was throttled in the last n_seconds") + }); + // Sleep if tokio::time::timeout(sleep_duration, cancel.cancelled()) .await diff --git a/pageserver/src/tenant/throttle.rs b/pageserver/src/tenant/throttle.rs new file mode 100644 index 0000000000..6894a88b93 --- /dev/null +++ b/pageserver/src/tenant/throttle.rs @@ -0,0 +1,162 @@ +use std::{ + str::FromStr, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, + }, + time::{Duration, Instant}, +}; + +use arc_swap::ArcSwap; +use enumset::EnumSet; +use tracing::error; + +use crate::{context::RequestContext, task_mgr::TaskKind}; + +/// Throttle for `async` functions. +/// +/// Runtime reconfigurable. +/// +/// To share a throttle among multiple entities, wrap it in an [`Arc`]. +/// +/// The intial use case for this is tenant-wide throttling of getpage@lsn requests. +pub struct Throttle { + inner: ArcSwap, + metric: M, + /// will be turned into [`Stats::count_accounted`] + count_accounted: AtomicU64, + /// will be turned into [`Stats::count_throttled`] + count_throttled: AtomicU64, + /// will be turned into [`Stats::sum_throttled_usecs`] + sum_throttled_usecs: AtomicU64, +} + +pub struct Inner { + task_kinds: EnumSet, + rate_limiter: Arc, + config: Config, +} + +pub type Config = pageserver_api::models::ThrottleConfig; + +pub struct Observation { + pub wait_time: Duration, +} +pub trait Metric { + fn observe_throttling(&self, observation: &Observation); +} + +/// See [`Throttle::reset_stats`]. +pub struct Stats { + // Number of requests that were subject to throttling, i.e., requests of the configured [`Config::task_kinds`]. + pub count_accounted: u64, + // Subset of the `accounted` requests that were actually throttled. + // Note that the numbers are stored as two independent atomics, so, there might be a slight drift. + pub count_throttled: u64, + // Sum of microseconds that throttled requests spent waiting for throttling. 
+ pub sum_throttled_usecs: u64, +} + +impl Throttle +where + M: Metric, +{ + pub fn new(config: Config, metric: M) -> Self { + Self { + inner: ArcSwap::new(Arc::new(Self::new_inner(config))), + metric, + count_accounted: AtomicU64::new(0), + count_throttled: AtomicU64::new(0), + sum_throttled_usecs: AtomicU64::new(0), + } + } + fn new_inner(config: Config) -> Inner { + let Config { + task_kinds, + initial, + refill_interval, + refill_amount, + max, + fair, + } = &config; + let task_kinds: EnumSet = task_kinds + .iter() + .filter_map(|s| match TaskKind::from_str(s) { + Ok(v) => Some(v), + Err(e) => { + // TODO: avoid this failure mode + error!( + "cannot parse task kind, ignoring for rate limiting {}", + utils::error::report_compact_sources(&e) + ); + None + } + }) + .collect(); + Inner { + task_kinds, + rate_limiter: Arc::new( + leaky_bucket::RateLimiter::builder() + .initial(*initial) + .interval(*refill_interval) + .refill(refill_amount.get()) + .max(*max) + .fair(*fair) + .build(), + ), + config, + } + } + pub fn reconfigure(&self, config: Config) { + self.inner.store(Arc::new(Self::new_inner(config))); + } + + /// The [`Throttle`] keeps an internal flag that is true if there was ever any actual throttling. + /// This method allows retrieving & resetting that flag. + /// Useful for periodic reporting. + pub fn reset_stats(&self) -> Stats { + let count_accounted = self.count_accounted.swap(0, Ordering::Relaxed); + let count_throttled = self.count_throttled.swap(0, Ordering::Relaxed); + let sum_throttled_usecs = self.sum_throttled_usecs.swap(0, Ordering::Relaxed); + Stats { + count_accounted, + count_throttled, + sum_throttled_usecs, + } + } + + /// See [`Config::steady_rps`]. + pub fn steady_rps(&self) -> f64 { + self.inner.load().config.steady_rps() + } + + pub async fn throttle(&self, ctx: &RequestContext, key_count: usize) { + let inner = self.inner.load_full(); // clones the `Inner` Arc + if !inner.task_kinds.contains(ctx.task_kind()) { + return; + }; + let start = std::time::Instant::now(); + let mut did_throttle = false; + let acquire = inner.rate_limiter.acquire(key_count); + // turn off runtime-induced preemption (aka coop) so our `did_throttle` is accurate + let acquire = tokio::task::unconstrained(acquire); + let mut acquire = std::pin::pin!(acquire); + std::future::poll_fn(|cx| { + use std::future::Future; + let poll = acquire.as_mut().poll(cx); + did_throttle = did_throttle || poll.is_pending(); + poll + }) + .await; + self.count_accounted.fetch_add(1, Ordering::Relaxed); + if did_throttle { + self.count_throttled.fetch_add(1, Ordering::Relaxed); + let now = Instant::now(); + let wait_time = now - start; + self.sum_throttled_usecs + .fetch_add(wait_time.as_micros() as u64, Ordering::Relaxed); + let observation = Observation { wait_time }; + self.metric.observe_throttling(&observation); + } + } +} diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 7f7713a6c6..cd88327f34 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -164,6 +164,9 @@ fn drop_wlock(rlock: tokio::sync::RwLockWriteGuard<'_, T>) { pub struct TimelineResources { pub remote_client: Option, pub deletion_queue_client: DeletionQueueClient, + pub timeline_get_throttle: Arc< + crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>, + >, } pub struct Timeline { @@ -355,6 +358,11 @@ pub struct Timeline { /// /// Timeline deletion will acquire both compaction and gc locks in whatever order. 
gc_lock: tokio::sync::Mutex<()>, + + /// Cloned from [`super::Tenant::timeline_get_throttle`] on construction. + timeline_get_throttle: Arc< + crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>, + >, } pub struct WalReceiverInfo { @@ -615,6 +623,8 @@ impl Timeline { return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN"))); } + self.timeline_get_throttle.throttle(ctx, 1).await; + // This check is debug-only because of the cost of hashing, and because it's a double-check: we // already checked the key against the shard_identity when looking up the Timeline from // page_service. @@ -714,6 +724,10 @@ impl Timeline { return Err(GetVectoredError::Oversized(key_count)); } + self.timeline_get_throttle + .throttle(ctx, key_count as usize) + .await; + let _timer = crate::metrics::GET_VECTORED_LATENCY .for_task_kind(ctx.task_kind()) .map(|t| t.start_timer()); @@ -1335,49 +1349,49 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10; // Private functions impl Timeline { pub(crate) fn get_lazy_slru_download(&self) -> bool { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); tenant_conf .lazy_slru_download .unwrap_or(self.conf.default_tenant_conf.lazy_slru_download) } fn get_checkpoint_distance(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); tenant_conf .checkpoint_distance .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance) } fn get_checkpoint_timeout(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); tenant_conf .checkpoint_timeout .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout) } fn get_compaction_target_size(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); tenant_conf .compaction_target_size .unwrap_or(self.conf.default_tenant_conf.compaction_target_size) } fn get_compaction_threshold(&self) -> usize { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); tenant_conf .compaction_threshold .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } fn get_image_creation_threshold(&self) -> usize { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); tenant_conf .image_creation_threshold .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold) } fn get_eviction_policy(&self) -> EvictionPolicy { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); tenant_conf .eviction_policy .unwrap_or(self.conf.default_tenant_conf.eviction_policy) @@ -1393,7 +1407,7 @@ impl Timeline { } fn get_gc_feedback(&self) -> bool { - let tenant_conf = &self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = &self.tenant_conf.read().unwrap().tenant_conf.clone(); tenant_conf .gc_feedback .unwrap_or(self.conf.default_tenant_conf.gc_feedback) @@ -1555,6 +1569,8 @@ impl Timeline { compaction_lock: tokio::sync::Mutex::default(), gc_lock: tokio::sync::Mutex::default(), + + timeline_get_throttle: resources.timeline_get_throttle, }; result.repartition_threshold = 
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE; diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index dc499197b0..d2e9eda906 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -419,6 +419,7 @@ impl DeleteTimelineFlow { TimelineResources { remote_client, deletion_queue_client, + timeline_get_throttle: tenant.timeline_get_throttle.clone(), }, // Important. We dont pass ancestor above because it can be missing. // Thus we need to skip the validation here. diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 7cdc314658..1aaded222c 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -176,6 +176,14 @@ def test_fully_custom_config(positive_env: NeonEnv): "lazy_slru_download": True, "max_lsn_wal_lag": 230000, "min_resident_size_override": 23, + "timeline_get_throttle": { + "task_kinds": ["PageRequestHandler"], + "fair": True, + "initial": 0, + "refill_interval": "1s", + "refill_amount": 1000, + "max": 1000, + }, "trace_read_requests": True, "walreceiver_connect_timeout": "13m", } From 29fb6754320b985e478426a34eff49d7412e73e0 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Fri, 16 Feb 2024 15:50:09 -0500 Subject: [PATCH 0200/1571] Revert "fix superuser permission check for extensions (#6733)" (#6791) This reverts commit 9ad940086cebd02041142117a76914bc5120c060. This pull request reverts #6733 to avoid incompatibility with pgvector and I will push further fixes later. Note that after reverting this pull request, the postgres submodule will point to some detached branches. --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index b4bae26a0f..9dd9956c55 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit b4bae26a0f09c69e979e6cb55780398e3102e022 +Subproject commit 9dd9956c55ffbbd9abe77d10382453757fedfcf5 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 9eef016e18..ca2def9993 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 9eef016e18bf61753e3cbaa755f705db6a4f7b1d +Subproject commit ca2def999368d9df098a637234ad5a9003189463 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index f7b63d8cf9..9c37a49884 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit f7b63d8cf9ae040f6907c3c13ef25fcf15a36161 +Subproject commit 9c37a4988463a97d9cacb321acf3828b09823269 diff --git a/vendor/revisions.json b/vendor/revisions.json index 37ca812c4a..72bc0d7e0d 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "postgres-v16": "f7b63d8cf9ae040f6907c3c13ef25fcf15a36161", - "postgres-v15": "9eef016e18bf61753e3cbaa755f705db6a4f7b1d", - "postgres-v14": "b4bae26a0f09c69e979e6cb55780398e3102e022" + "postgres-v16": "9c37a4988463a97d9cacb321acf3828b09823269", + "postgres-v15": "ca2def999368d9df098a637234ad5a9003189463", + "postgres-v14": "9dd9956c55ffbbd9abe77d10382453757fedfcf5" } From 9b714c85728922f8ad71e6a5871cf17a86fd75b7 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 17 Feb 2024 19:15:21 +0000 Subject: [PATCH 0201/1571] build(deps): bump cryptography from 42.0.0 to 42.0.2 
(#6792) --- poetry.lock | 66 ++++++++++++++++++++++++++--------------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/poetry.lock b/poetry.lock index e18cd4a74d..ad0a0afd81 100644 --- a/poetry.lock +++ b/poetry.lock @@ -836,43 +836,43 @@ files = [ [[package]] name = "cryptography" -version = "42.0.0" +version = "42.0.2" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." optional = false python-versions = ">=3.7" files = [ - {file = "cryptography-42.0.0-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:c640b0ef54138fde761ec99a6c7dc4ce05e80420262c20fa239e694ca371d434"}, - {file = "cryptography-42.0.0-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:678cfa0d1e72ef41d48993a7be75a76b0725d29b820ff3cfd606a5b2b33fda01"}, - {file = "cryptography-42.0.0-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:146e971e92a6dd042214b537a726c9750496128453146ab0ee8971a0299dc9bd"}, - {file = "cryptography-42.0.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87086eae86a700307b544625e3ba11cc600c3c0ef8ab97b0fda0705d6db3d4e3"}, - {file = "cryptography-42.0.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:0a68bfcf57a6887818307600c3c0ebc3f62fbb6ccad2240aa21887cda1f8df1b"}, - {file = "cryptography-42.0.0-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:5a217bca51f3b91971400890905a9323ad805838ca3fa1e202a01844f485ee87"}, - {file = "cryptography-42.0.0-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:ca20550bb590db16223eb9ccc5852335b48b8f597e2f6f0878bbfd9e7314eb17"}, - {file = "cryptography-42.0.0-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:33588310b5c886dfb87dba5f013b8d27df7ffd31dc753775342a1e5ab139e59d"}, - {file = "cryptography-42.0.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:9515ea7f596c8092fdc9902627e51b23a75daa2c7815ed5aa8cf4f07469212ec"}, - {file = "cryptography-42.0.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:35cf6ed4c38f054478a9df14f03c1169bb14bd98f0b1705751079b25e1cb58bc"}, - {file = "cryptography-42.0.0-cp37-abi3-win32.whl", hash = "sha256:8814722cffcfd1fbd91edd9f3451b88a8f26a5fd41b28c1c9193949d1c689dc4"}, - {file = "cryptography-42.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:a2a8d873667e4fd2f34aedab02ba500b824692c6542e017075a2efc38f60a4c0"}, - {file = "cryptography-42.0.0-cp39-abi3-macosx_10_12_universal2.whl", hash = "sha256:8fedec73d590fd30c4e3f0d0f4bc961aeca8390c72f3eaa1a0874d180e868ddf"}, - {file = "cryptography-42.0.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:be41b0c7366e5549265adf2145135dca107718fa44b6e418dc7499cfff6b4689"}, - {file = "cryptography-42.0.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ca482ea80626048975360c8e62be3ceb0f11803180b73163acd24bf014133a0"}, - {file = "cryptography-42.0.0-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:c58115384bdcfe9c7f644c72f10f6f42bed7cf59f7b52fe1bf7ae0a622b3a139"}, - {file = "cryptography-42.0.0-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:56ce0c106d5c3fec1038c3cca3d55ac320a5be1b44bf15116732d0bc716979a2"}, - {file = "cryptography-42.0.0-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:324721d93b998cb7367f1e6897370644751e5580ff9b370c0a50dc60a2003513"}, - {file = "cryptography-42.0.0-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:d97aae66b7de41cdf5b12087b5509e4e9805ed6f562406dfcf60e8481a9a28f8"}, - {file = "cryptography-42.0.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = 
"sha256:85f759ed59ffd1d0baad296e72780aa62ff8a71f94dc1ab340386a1207d0ea81"}, - {file = "cryptography-42.0.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:206aaf42e031b93f86ad60f9f5d9da1b09164f25488238ac1dc488334eb5e221"}, - {file = "cryptography-42.0.0-cp39-abi3-win32.whl", hash = "sha256:74f18a4c8ca04134d2052a140322002fef535c99cdbc2a6afc18a8024d5c9d5b"}, - {file = "cryptography-42.0.0-cp39-abi3-win_amd64.whl", hash = "sha256:14e4b909373bc5bf1095311fa0f7fcabf2d1a160ca13f1e9e467be1ac4cbdf94"}, - {file = "cryptography-42.0.0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:3005166a39b70c8b94455fdbe78d87a444da31ff70de3331cdec2c568cf25b7e"}, - {file = "cryptography-42.0.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:be14b31eb3a293fc6e6aa2807c8a3224c71426f7c4e3639ccf1a2f3ffd6df8c3"}, - {file = "cryptography-42.0.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:bd7cf7a8d9f34cc67220f1195884151426ce616fdc8285df9054bfa10135925f"}, - {file = "cryptography-42.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:c310767268d88803b653fffe6d6f2f17bb9d49ffceb8d70aed50ad45ea49ab08"}, - {file = "cryptography-42.0.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:bdce70e562c69bb089523e75ef1d9625b7417c6297a76ac27b1b8b1eb51b7d0f"}, - {file = "cryptography-42.0.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:e9326ca78111e4c645f7e49cbce4ed2f3f85e17b61a563328c85a5208cf34440"}, - {file = "cryptography-42.0.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:69fd009a325cad6fbfd5b04c711a4da563c6c4854fc4c9544bff3088387c77c0"}, - {file = "cryptography-42.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:988b738f56c665366b1e4bfd9045c3efae89ee366ca3839cd5af53eaa1401bce"}, - {file = "cryptography-42.0.0.tar.gz", hash = "sha256:6cf9b76d6e93c62114bd19485e5cb003115c134cf9ce91f8ac924c44f8c8c3f4"}, + {file = "cryptography-42.0.2-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:701171f825dcab90969596ce2af253143b93b08f1a716d4b2a9d2db5084ef7be"}, + {file = "cryptography-42.0.2-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:61321672b3ac7aade25c40449ccedbc6db72c7f5f0fdf34def5e2f8b51ca530d"}, + {file = "cryptography-42.0.2-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea2c3ffb662fec8bbbfce5602e2c159ff097a4631d96235fcf0fb00e59e3ece4"}, + {file = "cryptography-42.0.2-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b15c678f27d66d247132cbf13df2f75255627bcc9b6a570f7d2fd08e8c081d2"}, + {file = "cryptography-42.0.2-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:8e88bb9eafbf6a4014d55fb222e7360eef53e613215085e65a13290577394529"}, + {file = "cryptography-42.0.2-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:a047682d324ba56e61b7ea7c7299d51e61fd3bca7dad2ccc39b72bd0118d60a1"}, + {file = "cryptography-42.0.2-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:36d4b7c4be6411f58f60d9ce555a73df8406d484ba12a63549c88bd64f7967f1"}, + {file = "cryptography-42.0.2-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:a00aee5d1b6c20620161984f8ab2ab69134466c51f58c052c11b076715e72929"}, + {file = "cryptography-42.0.2-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:b97fe7d7991c25e6a31e5d5e795986b18fbbb3107b873d5f3ae6dc9a103278e9"}, + {file = "cryptography-42.0.2-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:5fa82a26f92871eca593b53359c12ad7949772462f887c35edaf36f87953c0e2"}, + {file = "cryptography-42.0.2-cp37-abi3-win32.whl", hash = 
"sha256:4b063d3413f853e056161eb0c7724822a9740ad3caa24b8424d776cebf98e7ee"}, + {file = "cryptography-42.0.2-cp37-abi3-win_amd64.whl", hash = "sha256:841ec8af7a8491ac76ec5a9522226e287187a3107e12b7d686ad354bb78facee"}, + {file = "cryptography-42.0.2-cp39-abi3-macosx_10_12_universal2.whl", hash = "sha256:55d1580e2d7e17f45d19d3b12098e352f3a37fe86d380bf45846ef257054b242"}, + {file = "cryptography-42.0.2-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:28cb2c41f131a5758d6ba6a0504150d644054fd9f3203a1e8e8d7ac3aea7f73a"}, + {file = "cryptography-42.0.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b9097a208875fc7bbeb1286d0125d90bdfed961f61f214d3f5be62cd4ed8a446"}, + {file = "cryptography-42.0.2-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:44c95c0e96b3cb628e8452ec060413a49002a247b2b9938989e23a2c8291fc90"}, + {file = "cryptography-42.0.2-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:2f9f14185962e6a04ab32d1abe34eae8a9001569ee4edb64d2304bf0d65c53f3"}, + {file = "cryptography-42.0.2-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:09a77e5b2e8ca732a19a90c5bca2d124621a1edb5438c5daa2d2738bfeb02589"}, + {file = "cryptography-42.0.2-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:ad28cff53f60d99a928dfcf1e861e0b2ceb2bc1f08a074fdd601b314e1cc9e0a"}, + {file = "cryptography-42.0.2-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:130c0f77022b2b9c99d8cebcdd834d81705f61c68e91ddd614ce74c657f8b3ea"}, + {file = "cryptography-42.0.2-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:fa3dec4ba8fb6e662770b74f62f1a0c7d4e37e25b58b2bf2c1be4c95372b4a33"}, + {file = "cryptography-42.0.2-cp39-abi3-win32.whl", hash = "sha256:3dbd37e14ce795b4af61b89b037d4bc157f2cb23e676fa16932185a04dfbf635"}, + {file = "cryptography-42.0.2-cp39-abi3-win_amd64.whl", hash = "sha256:8a06641fb07d4e8f6c7dda4fc3f8871d327803ab6542e33831c7ccfdcb4d0ad6"}, + {file = "cryptography-42.0.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:087887e55e0b9c8724cf05361357875adb5c20dec27e5816b653492980d20380"}, + {file = "cryptography-42.0.2-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:a7ef8dd0bf2e1d0a27042b231a3baac6883cdd5557036f5e8df7139255feaac6"}, + {file = "cryptography-42.0.2-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4383b47f45b14459cab66048d384614019965ba6c1a1a141f11b5a551cace1b2"}, + {file = "cryptography-42.0.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:fbeb725c9dc799a574518109336acccaf1303c30d45c075c665c0793c2f79a7f"}, + {file = "cryptography-42.0.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:320948ab49883557a256eab46149df79435a22d2fefd6a66fe6946f1b9d9d008"}, + {file = "cryptography-42.0.2-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:5ef9bc3d046ce83c4bbf4c25e1e0547b9c441c01d30922d812e887dc5f125c12"}, + {file = "cryptography-42.0.2-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:52ed9ebf8ac602385126c9a2fe951db36f2cb0c2538d22971487f89d0de4065a"}, + {file = "cryptography-42.0.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:141e2aa5ba100d3788c0ad7919b288f89d1fe015878b9659b307c9ef867d3a65"}, + {file = "cryptography-42.0.2.tar.gz", hash = "sha256:e0ec52ba3c7f1b7d813cd52649a5b3ef1fc0d433219dc8c93827c57eab6cf888"}, ] [package.dependencies] From e3ded64d1bea6a44477fdbc1dd2b9fca0970de31 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sat, 17 Feb 2024 22:13:25 +0200 Subject: [PATCH 0202/1571] Support pg-ivm extension (#6793) ## Problem See 
https://github.com/neondatabase/cloud/issues/10268 ## Summary of changes Add pg_ivm extension ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik Co-authored-by: Alexander Bayandin --- Dockerfile.compute-node | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 4eb6dc91c0..c34f3684e9 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -769,6 +769,24 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar. make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install +######################################################################################### +# +# Layer "pg_ivm" +# compile pg_ivm extension +# +######################################################################################### +FROM build-deps AS pg-ivm-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +ENV PATH "/usr/local/pgsql/bin/:$PATH" +RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \ + echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \ + mkdir pg_ivm-src && cd pg_ivm-src && tar xvzf ../pg_ivm.tar.gz --strip-components=1 -C . && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_ivm.control + + ######################################################################################### # # Layer "neon-pg-ext-build" @@ -810,6 +828,7 @@ COPY --from=pg-semver-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ From 24014d838334132388039058e7d9208d8c75edd3 Mon Sep 17 00:00:00 2001 From: John Spray Date: Sun, 18 Feb 2024 08:51:12 +0000 Subject: [PATCH 0203/1571] pageserver: fix sharding emitting empty image layers during compaction (#6776) ## Problem Sharded tenants would sometimes try to write empty image layers during compaction: this was more noticeable on larger databases. - https://github.com/neondatabase/neon/issues/6755 **Note to reviewers: the last commit is a refactor that de-intents a whole block, I recommend reviewing the earlier commits one by one to see the real changes** ## Summary of changes - Fix a case where when we drop a key during compaction, we might fail to write out keys (this was broken when vectored get was added) - If an image layer is empty, then do not try and write it out, but leave `start` where it is so that if the subsequent key range meets criteria for writing an image layer, we will extend its key range to cover the empty area. 
- Add a compaction test that configures small layers and compaction thresholds, and asserts that we really successfully did image layer generation. This fails before the fix. --- compute_tools/src/config.rs | 3 + libs/pageserver_api/src/shard.rs | 10 +- pageserver/src/tenant/timeline.rs | 161 +++++++++++++++++------------- 3 files changed, 98 insertions(+), 76 deletions(-) diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index a7ef8cea92..03fd56aa97 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -51,6 +51,9 @@ pub fn write_postgres_conf( if let Some(s) = &spec.pageserver_connstring { writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?; } + if let Some(stripe_size) = spec.shard_stripe_size { + writeln!(file, "neon.stripe_size={stripe_size}")?; + } if !spec.safekeeper_connstrings.is_empty() { writeln!( file, diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index a50ac74af1..467a4cf0c1 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -502,10 +502,12 @@ impl ShardIdentity { pub fn is_key_disposable(&self, key: &Key) -> bool { if key_is_shard0(key) { // Q: Why can't we dispose of shard0 content if we're not shard 0? - // A: because the WAL ingestion logic currently ingests some shard 0 - // content on all shards, even though it's only read on shard 0. If we - // dropped it, then subsequent WAL ingest to these keys would encounter - // an error. + // A1: because the WAL ingestion logic currently ingests some shard 0 + // content on all shards, even though it's only read on shard 0. If we + // dropped it, then subsequent WAL ingest to these keys would encounter + // an error. + // A2: because key_is_shard0 also covers relation size keys, which are written + // on all shards even though they're only maintained accurately on shard 0. 
false } else { !self.is_key_local(key) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index cd88327f34..ec1dbddfc6 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3290,90 +3290,107 @@ impl Timeline { for partition in partitioning.parts.iter() { let img_range = start..partition.ranges.last().unwrap().end; - start = img_range.end; - if force || self.time_for_new_image_layer(partition, lsn).await { - let mut image_layer_writer = ImageLayerWriter::new( - self.conf, - self.timeline_id, - self.tenant_shard_id, - &img_range, - lsn, - ) - .await?; + if !force && !self.time_for_new_image_layer(partition, lsn).await { + start = img_range.end; + continue; + } - fail_point!("image-layer-writer-fail-before-finish", |_| { - Err(CreateImageLayersError::Other(anyhow::anyhow!( - "failpoint image-layer-writer-fail-before-finish" - ))) - }); + let mut image_layer_writer = ImageLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + &img_range, + lsn, + ) + .await?; - let mut key_request_accum = KeySpaceAccum::new(); - for range in &partition.ranges { - let mut key = range.start; - while key < range.end { - if self.shard_identity.is_key_disposable(&key) { - debug!( - "Dropping key {} during compaction (it belongs on shard {:?})", - key, - self.shard_identity.get_shard_number(&key) - ); - key = key.next(); - continue; - } + fail_point!("image-layer-writer-fail-before-finish", |_| { + Err(CreateImageLayersError::Other(anyhow::anyhow!( + "failpoint image-layer-writer-fail-before-finish" + ))) + }); + let mut wrote_keys = false; + + let mut key_request_accum = KeySpaceAccum::new(); + for range in &partition.ranges { + let mut key = range.start; + while key < range.end { + // Decide whether to retain this key: usually we do, but sharded tenants may + // need to drop keys that don't belong to them. If we retain the key, add it + // to `key_request_accum` for later issuing a vectored get + if self.shard_identity.is_key_disposable(&key) { + debug!( + "Dropping key {} during compaction (it belongs on shard {:?})", + key, + self.shard_identity.get_shard_number(&key) + ); + } else { key_request_accum.add_key(key); - if key_request_accum.size() >= Timeline::MAX_GET_VECTORED_KEYS - || key.next() == range.end - { - let results = self - .get_vectored( - &key_request_accum.consume_keyspace().ranges, - lsn, - ctx, - ) - .await?; + } - for (img_key, img) in results { - let img = match img { - Ok(img) => img, - Err(err) => { - // If we fail to reconstruct a VM or FSM page, we can zero the - // page without losing any actual user data. That seems better - // than failing repeatedly and getting stuck. - // - // We had a bug at one point, where we truncated the FSM and VM - // in the pageserver, but the Postgres didn't know about that - // and continued to generate incremental WAL records for pages - // that didn't exist in the pageserver. Trying to replay those - // WAL records failed to find the previous image of the page. - // This special case allows us to recover from that situation. - // See https://github.com/neondatabase/neon/issues/2601. - // - // Unfortunately we cannot do this for the main fork, or for - // any metadata keys, keys, as that would lead to actual data - // loss. 
- if is_rel_fsm_block_key(img_key) - || is_rel_vm_block_key(img_key) - { - warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}"); - ZERO_PAGE.clone() - } else { - return Err( - CreateImageLayersError::PageReconstructError(err), - ); - } + let last_key_in_range = key.next() == range.end; + key = key.next(); + + // Maybe flush `key_rest_accum` + if key_request_accum.size() >= Timeline::MAX_GET_VECTORED_KEYS + || last_key_in_range + { + let results = self + .get_vectored(&key_request_accum.consume_keyspace().ranges, lsn, ctx) + .await?; + + for (img_key, img) in results { + let img = match img { + Ok(img) => img, + Err(err) => { + // If we fail to reconstruct a VM or FSM page, we can zero the + // page without losing any actual user data. That seems better + // than failing repeatedly and getting stuck. + // + // We had a bug at one point, where we truncated the FSM and VM + // in the pageserver, but the Postgres didn't know about that + // and continued to generate incremental WAL records for pages + // that didn't exist in the pageserver. Trying to replay those + // WAL records failed to find the previous image of the page. + // This special case allows us to recover from that situation. + // See https://github.com/neondatabase/neon/issues/2601. + // + // Unfortunately we cannot do this for the main fork, or for + // any metadata keys, keys, as that would lead to actual data + // loss. + if is_rel_fsm_block_key(img_key) || is_rel_vm_block_key(img_key) + { + warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}"); + ZERO_PAGE.clone() + } else { + return Err(CreateImageLayersError::PageReconstructError( + err, + )); } - }; + } + }; - image_layer_writer.put_image(img_key, img).await?; - } + // Write all the keys we just read into our new image layer. + image_layer_writer.put_image(img_key, img).await?; + wrote_keys = true; } - - key = key.next(); } } + } + + if wrote_keys { + // Normal path: we have written some data into the new image layer for this + // partition, so flush it to disk. + start = img_range.end; let image_layer = image_layer_writer.finish(self).await?; image_layers.push(image_layer); + } else { + // Special case: the image layer may be empty if this is a sharded tenant and the + // partition does not cover any keys owned by this shard. In this case, to ensure + // we don't leave gaps between image layers, leave `start` where it is, so that the next + // layer we write will cover the key range that we just scanned. + tracing::debug!("no data in range {}-{}", img_range.start, img_range.end); } } // All layers that the GC wanted us to create have now been created. From 61f99d703df5f7b5612e54acee4baea4d78ca2af Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Sun, 18 Feb 2024 12:16:07 +0000 Subject: [PATCH 0204/1571] test_create_snapshot: do not try to copy pg_dynshmem dir (#6796) ## Problem `test_create_snapshot` is flaky[0] on CI and fails constantly on macOS, but with a slightly different error: ``` shutil.Error: [('/Users/bayandin/work/neon/test_output/test_create_snapshot[release-pg15-1-100]/repo/endpoints/ep-1/pgdata/pg_dynshmem', '/Users/bayandin/work/neon/test_output/compatibility_snapshot_pgv15/repo/endpoints/ep-1/pgdata/pg_dynshmem', "[Errno 2] No such file or directory: '/Users/bayandin/work/neon/test_output/test_create_snapshot[release-pg15-1-100]/repo/endpoints/ep-1/pgdata/pg_dynshmem'")] ``` Also (on macOS) `repo/endpoints/ep-1/pgdata/pg_dynshmem` is a symlink to `/dev/shm/`. 
- [0] https://github.com/neondatabase/neon/issues/6784 ## Summary of changes Ignore `pg_dynshmem` directory while copying a snapshot --- test_runner/regress/test_compatibility.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 826821e52b..465101f64f 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -141,7 +141,12 @@ def test_create_snapshot( ) if compatibility_snapshot_dir.exists(): shutil.rmtree(compatibility_snapshot_dir) - shutil.copytree(test_output_dir, compatibility_snapshot_dir) + + shutil.copytree( + test_output_dir, + compatibility_snapshot_dir, + ignore=shutil.ignore_patterns("pg_dynshmem"), + ) @check_ondisk_data_compatibility_if_enabled From 5667372c61dada38405afe73a6d52c886e63c267 Mon Sep 17 00:00:00 2001 From: John Spray Date: Sun, 18 Feb 2024 15:55:19 +0000 Subject: [PATCH 0205/1571] pageserver: during shard split, wait for child to activate (#6789) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem test_sharding_split_unsharded was flaky with log errors from tenants not being active. This was happening when the split function enters wait_lsn() while the child shard might still be activating. It's flaky rather than an outright failure because activation is usually very fast. This is also a real bug fix, because in realistic scenarios we could proceed to detach the parent shard before the children are ready, leading to an availability gap for clients. ## Summary of changes - Do a short wait_to_become_active on the child shards before proceeding to wait for their LSNs to advance --------- Co-authored-by: Arpad Müller --- pageserver/src/http/routes.rs | 12 +++++++++--- pageserver/src/tenant/mgr.rs | 11 +++++++++++ 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 10ca96a2c1..107eed6801 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -83,12 +83,12 @@ use utils::{ // This is not functionally necessary (clients will retry), but avoids generating a lot of // failed API calls while tenants are activating. #[cfg(not(feature = "testing"))] -const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000); +pub(crate) const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000); // Tests run on slow/oversubscribed nodes, and may need to wait much longer for tenants to // finish attaching, if calls to remote storage are slow. 
#[cfg(feature = "testing")] -const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000); +pub(crate) const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000); pub struct State { conf: &'static PageServerConf, @@ -571,10 +571,16 @@ async fn timeline_list_handler( parse_query_param(&request, "force-await-initial-logical-size")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let response_data = async { - let tenant = mgr::get_tenant(tenant_shard_id, true)?; + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id, false)?; + + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + let timelines = tenant.list_timelines(); let mut response_data = Vec::with_capacity(timelines.len()); diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index b7f4723702..c765c6bacf 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -32,6 +32,7 @@ use crate::control_plane_client::{ ControlPlaneClient, ControlPlaneGenerationsApi, RetryForeverError, }; use crate::deletion_queue::DeletionQueueClient; +use crate::http::routes::ACTIVE_TENANT_TIMEOUT; use crate::metrics::{TENANT, TENANT_MANAGER as METRICS}; use crate::task_mgr::{self, TaskKind}; use crate::tenant::config::{ @@ -1489,6 +1490,16 @@ impl TenantManager { peek_slot.and_then(|s| s.get_attached()).cloned() }; if let Some(t) = child_shard { + // Wait for the child shard to become active: this should be very quick because it only + // has to download the index_part that we just uploaded when creating it. + if let Err(e) = t.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await { + // This is not fatal: we have durably created the child shard. It just makes the + // split operation less seamless for clients, as we will may detach the parent + // shard before the child shards are fully ready to serve requests. + tracing::warn!("Failed to wait for shard {child_shard_id} to activate: {e}"); + continue; + } + let timelines = t.timelines.lock().unwrap().clone(); for timeline in timelines.values() { let Some(target_lsn) = target_lsns.get(&timeline.timeline_id) else { From 4d2bf55e6c5b9b40a82daa26d258870556daa370 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 19 Feb 2024 11:07:27 +0000 Subject: [PATCH 0206/1571] CI: temporary disable coverage report for regression tests (#6798) ## Problem The merging coverage data step recently started to be too flaky. This failure blocks staging deployment and along with the flakiness of regression tests might require 4-5-6 manual restarts of a CI job. 
Refs: - https://github.com/neondatabase/neon/issues/4540 - https://github.com/neondatabase/neon/issues/6485 - https://neondb.slack.com/archives/C059ZC138NR/p1704131143740669 ## Summary of changes - Disable code coverage report for functional tests --- .github/workflows/build_and_test.yml | 6 +++++- scripts/comment-test-report.js | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index c53cbada7d..3ce5d9c2b3 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -473,8 +473,12 @@ jobs: BUILD_TAG: ${{ needs.tag.outputs.build-tag }} PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs + # Temporary disable this step until we figure out why it's so flaky + # Ref https://github.com/neondatabase/neon/issues/4540 - name: Merge and upload coverage data - if: matrix.build_type == 'debug' && matrix.pg_version == 'v14' + if: | + false && + matrix.build_type == 'debug' && matrix.pg_version == 'v14' uses: ./.github/actions/save-coverage-data get-benchmarks-durations: diff --git a/scripts/comment-test-report.js b/scripts/comment-test-report.js index 89befda71f..f42262cf48 100755 --- a/scripts/comment-test-report.js +++ b/scripts/comment-test-report.js @@ -188,7 +188,7 @@ const reportSummary = async (params) => { } const parseCoverageSummary = async ({ summaryJsonUrl, coverageUrl, fetch }) => { - let summary = `\n### Code coverage ([full report](${coverageUrl}))\n` + let summary = `\n### Code coverage* ([full report](${coverageUrl}))\n` const coverage = await (await fetch(summaryJsonUrl)).json() for (const covType of Object.keys(coverage).sort()) { @@ -198,7 +198,7 @@ const parseCoverageSummary = async ({ summaryJsonUrl, coverageUrl, fetch }) => { summary += `- \`${covType}s\`: \`${coverage[covType]["_summary"]}\`\n` } - + summary += "\n\\* collected from Rust tests only\n" summary += `\n___\n` return summary From 587cb705b898565d459d044df84d1ac2633f00bf Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Mon, 19 Feb 2024 12:34:27 +0000 Subject: [PATCH 0207/1571] pageserver: roll open layer in timeline writer (#6661) ## Problem One WAL record can actually produce an arbitrary amount of key value pairs. This is problematic since it might cause our frozen layers to bloat past the max allowed size of S3 single shot uploads. [#6639](https://github.com/neondatabase/neon/pull/6639) introduced a "should roll" check after every batch of `ingest_batch_size` (100 WAL records by default). This helps, but the original problem still exists. ## Summary of changes This patch moves the responsibility of rolling the currently open layer to the `TimelineWriter`. Previously, this was done ad-hoc via calls to `check_checkpoint_distance`. The advantages of this approach are: * ability to split one batch over multiple open layers * less layer map locking * remove ad-hoc check_checkpoint_distance calls More specifically, we track the current size of the open layer in the writer. On each `put` check whether the current layer should be closed and a new one opened. Keeping track of the currently open layer results in less contention on the layer map lock. It only needs to be acquired on the first write and on writes that require a roll afterwards. Rolling the open layer can be triggered by: 1. The distance from the last LSN we rolled at. This bounds the amount of WAL that the safekeepers need to store. 2. The size of the currently open layer. 3. The time since the last roll. 
It helps safekeepers to regard pageserver as caught up and suspend activity. Closes #6624 --- pageserver/src/pgdatadir_mapping.rs | 17 +- pageserver/src/tenant.rs | 32 +- .../tenant/storage_layer/inmemory_layer.rs | 38 +-- pageserver/src/tenant/timeline.rs | 300 ++++++++++++------ .../walreceiver/walreceiver_connection.rs | 27 -- .../fixtures/pageserver/allowed_errors.py | 5 + 6 files changed, 253 insertions(+), 166 deletions(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 0ff03303d4..65f8ddaab4 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -15,6 +15,7 @@ use crate::walrecord::NeonWalRecord; use anyhow::{ensure, Context}; use bytes::{Buf, Bytes, BytesMut}; use enum_map::Enum; +use itertools::Itertools; use pageserver_api::key::{ dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key, @@ -1492,7 +1493,7 @@ impl<'a> DatadirModification<'a> { return Ok(()); } - let writer = self.tline.writer().await; + let mut writer = self.tline.writer().await; // Flush relation and SLRU data blocks, keep metadata. let mut retained_pending_updates = HashMap::<_, Vec<_>>::new(); @@ -1531,13 +1532,23 @@ impl<'a> DatadirModification<'a> { /// All the modifications in this atomic update are stamped by the specified LSN. /// pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> { - let writer = self.tline.writer().await; + let mut writer = self.tline.writer().await; let pending_nblocks = self.pending_nblocks; self.pending_nblocks = 0; if !self.pending_updates.is_empty() { - writer.put_batch(&self.pending_updates, ctx).await?; + let prev_pending_updates = std::mem::take(&mut self.pending_updates); + + // The put_batch call below expects expects the inputs to be sorted by Lsn, + // so we do that first. 
+ let lsn_ordered_batch: Vec<(Key, Lsn, Value)> = prev_pending_updates + .into_iter() + .map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (key, lsn, val))) + .kmerge_by(|lhs, rhs| lhs.1 .0 < rhs.1 .0) + .collect(); + + writer.put_batch(lsn_ordered_batch, ctx).await?; self.pending_updates.clear(); } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index a4d3a4142a..c646e5cf90 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3890,7 +3890,7 @@ mod tests { .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( *TEST_KEY, @@ -3902,7 +3902,7 @@ mod tests { writer.finish_write(Lsn(0x10)); drop(writer); - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( *TEST_KEY, @@ -3968,7 +3968,7 @@ mod tests { let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; - let writer = tline.writer().await; + let mut writer = tline.writer().await; #[allow(non_snake_case)] let TEST_KEY_A: Key = Key::from_hex("110000000033333333444444445500000001").unwrap(); @@ -4002,7 +4002,7 @@ mod tests { let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) .expect("Should have a local timeline"); - let new_writer = newtline.writer().await; + let mut new_writer = newtline.writer().await; new_writer .put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"), &ctx) .await?; @@ -4034,7 +4034,7 @@ mod tests { ) -> anyhow::Result<()> { let mut lsn = start_lsn; { - let writer = tline.writer().await; + let mut writer = tline.writer().await; // Create a relation on the timeline writer .put( @@ -4059,7 +4059,7 @@ mod tests { } tline.freeze_and_flush().await?; { - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( *TEST_KEY, @@ -4422,7 +4422,7 @@ mod tests { .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( *TEST_KEY, @@ -4439,7 +4439,7 @@ mod tests { .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) .await?; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( *TEST_KEY, @@ -4456,7 +4456,7 @@ mod tests { .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) .await?; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( *TEST_KEY, @@ -4473,7 +4473,7 @@ mod tests { .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) .await?; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( *TEST_KEY, @@ -4535,7 +4535,7 @@ mod tests { for _ in 0..50 { for _ in 0..10000 { test_key.field6 = blknum; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( test_key, @@ -4597,7 +4597,7 @@ mod tests { for blknum in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); test_key.field6 = blknum as u32; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( test_key, @@ -4618,7 +4618,7 @@ mod tests { lsn = Lsn(lsn.0 + 0x10); let blknum = thread_rng().gen_range(0..NUM_KEYS); test_key.field6 = blknum as u32; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( test_key, @@ -4686,7 +4686,7 @@ mod tests { for blknum in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); test_key.field6 = blknum as u32; - let writer = tline.writer().await; + let mut 
writer = tline.writer().await; writer .put( test_key, @@ -4715,7 +4715,7 @@ mod tests { lsn = Lsn(lsn.0 + 0x10); let blknum = thread_rng().gen_range(0..NUM_KEYS); test_key.field6 = blknum as u32; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( test_key, @@ -4792,7 +4792,7 @@ mod tests { lsn = Lsn(lsn.0 + 0x10); let blknum = thread_rng().gen_range(0..NUM_KEYS); test_key.field6 = blknum as u32; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( test_key, diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index c597b15533..4b06a787ce 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -246,32 +246,17 @@ impl InMemoryLayer { /// Common subroutine of the public put_wal_record() and put_page_image() functions. /// Adds the page version to the in-memory tree + pub(crate) async fn put_value( &self, key: Key, lsn: Lsn, - val: &Value, + buf: &[u8], ctx: &RequestContext, ) -> Result<()> { let mut inner = self.inner.write().await; self.assert_writable(); - self.put_value_locked(&mut inner, key, lsn, val, ctx).await - } - - pub(crate) async fn put_values( - &self, - values: &HashMap>, - ctx: &RequestContext, - ) -> Result<()> { - let mut inner = self.inner.write().await; - self.assert_writable(); - for (key, vals) in values { - for (lsn, val) in vals { - self.put_value_locked(&mut inner, *key, *lsn, val, ctx) - .await?; - } - } - Ok(()) + self.put_value_locked(&mut inner, key, lsn, buf, ctx).await } async fn put_value_locked( @@ -279,22 +264,16 @@ impl InMemoryLayer { locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>, key: Key, lsn: Lsn, - val: &Value, + buf: &[u8], ctx: &RequestContext, ) -> Result<()> { trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn); let off = { - // Avoid doing allocations for "small" values. - // In the regression test suite, the limit of 256 avoided allocations in 95% of cases: - // https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061 - let mut buf = smallvec::SmallVec::<[u8; 256]>::new(); - buf.clear(); - val.ser_into(&mut buf)?; locked_inner .file .write_blob( - &buf, + buf, &RequestContextBuilder::extend(ctx) .page_content_kind(PageContentKind::InMemoryLayer) .build(), @@ -322,7 +301,12 @@ impl InMemoryLayer { pub async fn freeze(&self, end_lsn: Lsn) { let inner = self.inner.write().await; - assert!(self.start_lsn < end_lsn); + assert!( + self.start_lsn < end_lsn, + "{} >= {}", + self.start_lsn, + end_lsn + ); self.end_lsn.set(end_lsn).expect("end_lsn set only once"); for vec_map in inner.index.values() { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index ec1dbddfc6..dcb00a1683 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -33,7 +33,7 @@ use tokio::{ }; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::sync::gate::Gate; +use utils::{bin_ser::BeSer, sync::gate::Gate}; use std::ops::{Deref, Range}; use std::pin::pin; @@ -274,7 +274,7 @@ pub struct Timeline { /// Locked automatically by [`TimelineWriter`] and checkpointer. /// Must always be acquired before the layer map/individual layer lock /// to avoid deadlock. 
- write_lock: tokio::sync::Mutex<()>, + write_lock: tokio::sync::Mutex>, /// Used to avoid multiple `flush_loop` tasks running pub(super) flush_loop_state: Mutex, @@ -1051,53 +1051,10 @@ impl Timeline { pub(crate) async fn writer(&self) -> TimelineWriter<'_> { TimelineWriter { tl: self, - _write_guard: self.write_lock.lock().await, + write_guard: self.write_lock.lock().await, } } - /// Check if more than 'checkpoint_distance' of WAL has been accumulated in - /// the in-memory layer, and initiate flushing it if so. - /// - /// Also flush after a period of time without new data -- it helps - /// safekeepers to regard pageserver as caught up and suspend activity. - pub(crate) async fn check_checkpoint_distance(self: &Arc) -> anyhow::Result<()> { - let last_lsn = self.get_last_record_lsn(); - let open_layer_size = { - let guard = self.layers.read().await; - let layers = guard.layer_map(); - let Some(open_layer) = layers.open_layer.as_ref() else { - return Ok(()); - }; - open_layer.size().await? - }; - let last_freeze_at = self.last_freeze_at.load(); - let last_freeze_ts = *(self.last_freeze_ts.read().unwrap()); - let distance = last_lsn.widening_sub(last_freeze_at); - // Checkpointing the open layer can be triggered by layer size or LSN range. - // S3 has a 5 GB limit on the size of one upload (without multi-part upload), and - // we want to stay below that with a big margin. The LSN distance determines how - // much WAL the safekeepers need to store. - if distance >= self.get_checkpoint_distance().into() - || open_layer_size > self.get_checkpoint_distance() - || (distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout()) - { - info!( - "check_checkpoint_distance {}, layer size {}, elapsed since last flush {:?}", - distance, - open_layer_size, - last_freeze_ts.elapsed() - ); - - self.freeze_inmem_layer(true).await; - self.last_freeze_at.store(last_lsn); - *(self.last_freeze_ts.write().unwrap()) = Instant::now(); - - // Wake up the layer flusher - self.flush_frozen_layers(); - } - Ok(()) - } - pub(crate) fn activate( self: &Arc, broker_client: BrokerClientChannel, @@ -1529,7 +1486,7 @@ impl Timeline { layer_flush_start_tx, layer_flush_done_tx, - write_lock: tokio::sync::Mutex::new(()), + write_lock: tokio::sync::Mutex::new(None), gc_info: std::sync::RwLock::new(GcInfo { retain_lsns: Vec::new(), @@ -2702,43 +2659,6 @@ impl Timeline { Ok(layer) } - async fn put_value( - &self, - key: Key, - lsn: Lsn, - val: &Value, - ctx: &RequestContext, - ) -> anyhow::Result<()> { - //info!("PUT: key {} at {}", key, lsn); - let layer = self.get_layer_for_write(lsn).await?; - layer.put_value(key, lsn, val, ctx).await?; - Ok(()) - } - - async fn put_values( - &self, - values: &HashMap>, - ctx: &RequestContext, - ) -> anyhow::Result<()> { - // Pick the first LSN in the batch to get the layer to write to. - for lsns in values.values() { - if let Some((lsn, _)) = lsns.first() { - let layer = self.get_layer_for_write(*lsn).await?; - layer.put_values(values, ctx).await?; - break; - } - } - Ok(()) - } - - async fn put_tombstones(&self, tombstones: &[(Range, Lsn)]) -> anyhow::Result<()> { - if let Some((_, lsn)) = tombstones.first() { - let layer = self.get_layer_for_write(*lsn).await?; - layer.put_tombstones(tombstones).await?; - } - Ok(()) - } - pub(crate) fn finish_write(&self, new_lsn: Lsn) { assert!(new_lsn.is_aligned()); @@ -2749,14 +2669,20 @@ impl Timeline { async fn freeze_inmem_layer(&self, write_lock_held: bool) { // Freeze the current open in-memory layer. 
It will be written to disk on next // iteration. + let _write_guard = if write_lock_held { None } else { Some(self.write_lock.lock().await) }; + + self.freeze_inmem_layer_at(self.get_last_record_lsn()).await; + } + + async fn freeze_inmem_layer_at(&self, at: Lsn) { let mut guard = self.layers.write().await; guard - .try_freeze_in_memory_layer(self.get_last_record_lsn(), &self.last_freeze_at) + .try_freeze_in_memory_layer(at, &self.last_freeze_at) .await; } @@ -4779,13 +4705,43 @@ fn layer_traversal_error(msg: String, path: Vec) -> PageRecon PageReconstructError::from(msg) } +struct TimelineWriterState { + open_layer: Arc, + current_size: u64, + // Previous Lsn which passed through + prev_lsn: Option, + // Largest Lsn which passed through the current writer + max_lsn: Option, + // Cached details of the last freeze. Avoids going trough the atomic/lock on every put. + cached_last_freeze_at: Lsn, + cached_last_freeze_ts: Instant, +} + +impl TimelineWriterState { + fn new( + open_layer: Arc, + current_size: u64, + last_freeze_at: Lsn, + last_freeze_ts: Instant, + ) -> Self { + Self { + open_layer, + current_size, + prev_lsn: None, + max_lsn: None, + cached_last_freeze_at: last_freeze_at, + cached_last_freeze_ts: last_freeze_ts, + } + } +} + /// Various functions to mutate the timeline. // TODO Currently, Deref is used to allow easy access to read methods from this trait. // This is probably considered a bad practice in Rust and should be fixed eventually, // but will cause large code changes. pub(crate) struct TimelineWriter<'a> { tl: &'a Timeline, - _write_guard: tokio::sync::MutexGuard<'a, ()>, + write_guard: tokio::sync::MutexGuard<'a, Option>, } impl Deref for TimelineWriter<'_> { @@ -4796,31 +4752,189 @@ impl Deref for TimelineWriter<'_> { } } +impl Drop for TimelineWriter<'_> { + fn drop(&mut self) { + self.write_guard.take(); + } +} + +enum OpenLayerAction { + Roll, + Open, + None, +} + impl<'a> TimelineWriter<'a> { /// Put a new page version that can be constructed from a WAL record /// /// This will implicitly extend the relation, if the page is beyond the /// current end-of-file. pub(crate) async fn put( - &self, + &mut self, key: Key, lsn: Lsn, value: &Value, ctx: &RequestContext, ) -> anyhow::Result<()> { - self.tl.put_value(key, lsn, value, ctx).await + // Avoid doing allocations for "small" values. + // In the regression test suite, the limit of 256 avoided allocations in 95% of cases: + // https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061 + let mut buf = smallvec::SmallVec::<[u8; 256]>::new(); + buf.clear(); + value.ser_into(&mut buf)?; + let buf_size: u64 = buf.len().try_into().expect("oversized value buf"); + + let action = self.get_open_layer_action(lsn, buf_size); + let layer = self.handle_open_layer_action(lsn, action).await?; + let res = layer.put_value(key, lsn, &buf, ctx).await; + + if res.is_ok() { + // Update the current size only when the entire write was ok. + // In case of failures, we may have had partial writes which + // render the size tracking out of sync. That's ok because + // the checkpoint distance should be significantly smaller + // than the S3 single shot upload limit of 5GiB. 
+ let state = self.write_guard.as_mut().unwrap(); + + state.current_size += buf_size; + state.prev_lsn = Some(lsn); + state.max_lsn = std::cmp::max(state.max_lsn, Some(lsn)); + } + + res } + async fn handle_open_layer_action( + &mut self, + at: Lsn, + action: OpenLayerAction, + ) -> anyhow::Result<&Arc> { + match action { + OpenLayerAction::Roll => { + let max_lsn = self.write_guard.as_ref().unwrap().max_lsn.unwrap(); + self.tl.freeze_inmem_layer_at(max_lsn).await; + + let now = Instant::now(); + *(self.last_freeze_ts.write().unwrap()) = now; + + self.tl.flush_frozen_layers(); + + let current_size = self.write_guard.as_ref().unwrap().current_size; + if current_size > self.get_checkpoint_distance() { + warn!("Flushed oversized open layer with size {}", current_size) + } + + assert!(self.write_guard.is_some()); + + let layer = self.tl.get_layer_for_write(at).await?; + let initial_size = layer.size().await?; + self.write_guard.replace(TimelineWriterState::new( + layer, + initial_size, + Lsn(max_lsn.0 + 1), + now, + )); + } + OpenLayerAction::Open => { + assert!(self.write_guard.is_none()); + + let layer = self.tl.get_layer_for_write(at).await?; + let initial_size = layer.size().await?; + + let last_freeze_at = self.last_freeze_at.load(); + let last_freeze_ts = *self.last_freeze_ts.read().unwrap(); + self.write_guard.replace(TimelineWriterState::new( + layer, + initial_size, + last_freeze_at, + last_freeze_ts, + )); + } + OpenLayerAction::None => { + assert!(self.write_guard.is_some()); + } + } + + Ok(&self.write_guard.as_ref().unwrap().open_layer) + } + + fn get_open_layer_action(&self, lsn: Lsn, new_value_size: u64) -> OpenLayerAction { + let state = &*self.write_guard; + let Some(state) = &state else { + return OpenLayerAction::Open; + }; + + if state.prev_lsn == Some(lsn) { + // Rolling mid LSN is not supported by downstream code. + // Hence, only roll at LSN boundaries. + return OpenLayerAction::None; + } + + let distance = lsn.widening_sub(state.cached_last_freeze_at); + let proposed_open_layer_size = state.current_size + new_value_size; + + // Rolling the open layer can be triggered by: + // 1. The distance from the last LSN we rolled at. This bounds the amount of WAL that + // the safekeepers need to store. + // 2. The size of the currently open layer. + // 3. The time since the last roll. It helps safekeepers to regard pageserver as caught + // up and suspend activity. + if distance >= self.get_checkpoint_distance().into() { + info!( + "Will roll layer at {} with layer size {} due to LSN distance ({})", + lsn, state.current_size, distance + ); + + OpenLayerAction::Roll + } else if state.current_size > 0 + && proposed_open_layer_size >= self.get_checkpoint_distance() + { + info!( + "Will roll layer at {} with layer size {} due to layer size ({})", + lsn, state.current_size, proposed_open_layer_size + ); + + OpenLayerAction::Roll + } else if distance > 0 + && state.cached_last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() + { + info!( + "Will roll layer at {} with layer size {} due to time since last flush ({:?})", + lsn, + state.current_size, + state.cached_last_freeze_ts.elapsed() + ); + + OpenLayerAction::Roll + } else { + OpenLayerAction::None + } + } + + /// Put a batch keys at the specified Lsns. + /// + /// The batch should be sorted by Lsn such that it's safe + /// to roll the open layer mid batch. 
pub(crate) async fn put_batch( - &self, - batch: &HashMap>, + &mut self, + batch: Vec<(Key, Lsn, Value)>, ctx: &RequestContext, ) -> anyhow::Result<()> { - self.tl.put_values(batch, ctx).await + for (key, lsn, val) in batch { + self.put(key, lsn, &val, ctx).await? + } + + Ok(()) } - pub(crate) async fn delete_batch(&self, batch: &[(Range, Lsn)]) -> anyhow::Result<()> { - self.tl.put_tombstones(batch).await + pub(crate) async fn delete_batch(&mut self, batch: &[(Range, Lsn)]) -> anyhow::Result<()> { + if let Some((_, lsn)) = batch.first() { + let action = self.get_open_layer_action(*lsn, 0); + let layer = self.handle_open_layer_action(*lsn, action).await?; + layer.put_tombstones(batch).await?; + } + + Ok(()) } /// Track the end of the latest digested WAL record. diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 9cb53f46d1..0333fcac67 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -343,23 +343,6 @@ pub(super) async fn handle_walreceiver_connection( modification.commit(&ctx).await?; uncommitted_records = 0; filtered_records = 0; - - // - // We should check checkpoint distance after appending each ingest_batch_size bytes because otherwise - // layer size can become much larger than `checkpoint_distance`. - // It can append because wal-sender is sending WAL using 125kb chucks and some WAL records can cause writing large - // amount of data to key-value storage. So performing this check only after processing - // all WAL records in the chunk, can cause huge L0 layer files. - // - timeline - .check_checkpoint_distance() - .await - .with_context(|| { - format!( - "Failed to check checkpoint distance for timeline {}", - timeline.timeline_id - ) - })?; } } @@ -406,16 +389,6 @@ pub(super) async fn handle_walreceiver_connection( } } - timeline - .check_checkpoint_distance() - .await - .with_context(|| { - format!( - "Failed to check checkpoint distance for timeline {}", - timeline.timeline_id - ) - })?; - if let Some(last_lsn) = status_update { let timeline_remote_consistent_lsn = timeline .get_remote_consistent_lsn_visible() diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index 74c6bddf23..8ff4341cc0 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -82,6 +82,11 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( # During shutdown, DownloadError::Cancelled may be logged as an error. Cleaning this # up is tracked in https://github.com/neondatabase/neon/issues/6096 ".*Cancelled, shutting down.*", + # Open layers are only rolled at Lsn boundaries to avoid name clashses. + # Hence, we can overshoot the soft limit set by checkpoint distance. + # This is especially pronounced in tests that set small checkpoint + # distances. + ".*Flushed oversized open layer with size.*", ) From d0d48716828e430c99af1d9cd91705e9508e872e Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 19 Feb 2024 12:54:17 +0000 Subject: [PATCH 0208/1571] proxy: use postgres_protocol scram/sasl code (#4748) 1) `scram::password` was used in tests only. can be replaced with `postgres_protocol::password`. 
2) `postgres_protocol::authentication::sasl` provides a client impl of SASL which improves our ability to test --- proxy/src/proxy/tests.rs | 5 +-- proxy/src/scram.rs | 56 +++++++++++++++++++--------- proxy/src/scram/key.rs | 2 +- proxy/src/scram/password.rs | 74 ------------------------------------- proxy/src/scram/secret.rs | 37 +++---------------- 5 files changed, 46 insertions(+), 128 deletions(-) delete mode 100644 proxy/src/scram/password.rs diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 1a01f32339..c407a5572a 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -132,9 +132,8 @@ struct Scram(scram::ServerSecret); impl Scram { fn new(password: &str) -> anyhow::Result { - let salt = rand::random::<[u8; 16]>(); - let secret = scram::ServerSecret::build(password, &salt, 256) - .context("failed to generate scram secret")?; + let secret = + scram::ServerSecret::build(password).context("failed to generate scram secret")?; Ok(Scram(secret)) } diff --git a/proxy/src/scram.rs b/proxy/src/scram.rs index 49a7a13043..a95e734d06 100644 --- a/proxy/src/scram.rs +++ b/proxy/src/scram.rs @@ -12,9 +12,6 @@ mod messages; mod secret; mod signature; -#[cfg(any(test, doc))] -mod password; - pub use exchange::{exchange, Exchange}; pub use key::ScramKey; pub use secret::ServerSecret; @@ -59,27 +56,21 @@ fn sha256<'a>(parts: impl IntoIterator) -> [u8; 32] { #[cfg(test)] mod tests { + use postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256}; + use crate::sasl::{Mechanism, Step}; - use super::{password::SaltedPassword, Exchange, ServerSecret}; + use super::{Exchange, ServerSecret}; #[test] - fn happy_path() { + fn snapshot() { let iterations = 4096; - let salt_base64 = "QSXCR+Q6sek8bf92"; - let pw = SaltedPassword::new( - b"pencil", - base64::decode(salt_base64).unwrap().as_slice(), - iterations, - ); + let salt = "QSXCR+Q6sek8bf92"; + let stored_key = "FO+9jBb3MUukt6jJnzjPZOWc5ow/Pu6JtPyju0aqaE8="; + let server_key = "qxJ1SbmSAi5EcS0J5Ck/cKAm/+Ixa+Kwp63f4OHDgzo="; + let secret = format!("SCRAM-SHA-256${iterations}:{salt}${stored_key}:{server_key}",); + let secret = ServerSecret::parse(&secret).unwrap(); - let secret = ServerSecret { - iterations, - salt_base64: salt_base64.to_owned(), - stored_key: pw.client_key().sha256(), - server_key: pw.server_key(), - doomed: false, - }; const NONCE: [u8; 18] = [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, ]; @@ -121,4 +112,33 @@ mod tests { ] ); } + + fn run_round_trip_test(server_password: &str, client_password: &str) { + let scram_secret = ServerSecret::build(server_password).unwrap(); + let sasl_client = + ScramSha256::new(client_password.as_bytes(), ChannelBinding::unsupported()); + + let outcome = super::exchange( + &scram_secret, + sasl_client, + crate::config::TlsServerEndPoint::Undefined, + ) + .unwrap(); + + match outcome { + crate::sasl::Outcome::Success(_) => {} + crate::sasl::Outcome::Failure(r) => panic!("{r}"), + } + } + + #[test] + fn round_trip() { + run_round_trip_test("pencil", "pencil") + } + + #[test] + #[should_panic(expected = "password doesn't match")] + fn failure() { + run_round_trip_test("pencil", "eraser") + } } diff --git a/proxy/src/scram/key.rs b/proxy/src/scram/key.rs index 66c2c6b207..973126e729 100644 --- a/proxy/src/scram/key.rs +++ b/proxy/src/scram/key.rs @@ -3,7 +3,7 @@ /// Faithfully taken from PostgreSQL. pub const SCRAM_KEY_LEN: usize = 32; -/// One of the keys derived from the [password](super::password::SaltedPassword). 
+/// One of the keys derived from the user's password. /// We use the same structure for all keys, i.e. /// `ClientKey`, `StoredKey`, and `ServerKey`. #[derive(Clone, Default, PartialEq, Eq, Debug)] diff --git a/proxy/src/scram/password.rs b/proxy/src/scram/password.rs deleted file mode 100644 index 022f2842dd..0000000000 --- a/proxy/src/scram/password.rs +++ /dev/null @@ -1,74 +0,0 @@ -//! Password hashing routines. - -use super::key::ScramKey; - -pub const SALTED_PASSWORD_LEN: usize = 32; - -/// Salted hashed password is essential for [key](super::key) derivation. -#[repr(transparent)] -pub struct SaltedPassword { - bytes: [u8; SALTED_PASSWORD_LEN], -} - -impl SaltedPassword { - /// See `scram-common.c : scram_SaltedPassword` for details. - /// Further reading: (see `PBKDF2`). - pub fn new(password: &[u8], salt: &[u8], iterations: u32) -> SaltedPassword { - pbkdf2::pbkdf2_hmac_array::(password, salt, iterations).into() - } - - /// Derive `ClientKey` from a salted hashed password. - pub fn client_key(&self) -> ScramKey { - super::hmac_sha256(&self.bytes, [b"Client Key".as_ref()]).into() - } - - /// Derive `ServerKey` from a salted hashed password. - pub fn server_key(&self) -> ScramKey { - super::hmac_sha256(&self.bytes, [b"Server Key".as_ref()]).into() - } -} - -impl From<[u8; SALTED_PASSWORD_LEN]> for SaltedPassword { - #[inline(always)] - fn from(bytes: [u8; SALTED_PASSWORD_LEN]) -> Self { - Self { bytes } - } -} - -#[cfg(test)] -mod tests { - use super::SaltedPassword; - - fn legacy_pbkdf2_impl(password: &[u8], salt: &[u8], iterations: u32) -> SaltedPassword { - let one = 1_u32.to_be_bytes(); // magic - - let mut current = super::super::hmac_sha256(password, [salt, &one]); - let mut result = current; - for _ in 1..iterations { - current = super::super::hmac_sha256(password, [current.as_ref()]); - // TODO: result = current.zip(result).map(|(x, y)| x ^ y), issue #80094 - for (i, x) in current.iter().enumerate() { - result[i] ^= x; - } - } - - result.into() - } - - #[test] - fn pbkdf2() { - let password = "a-very-secure-password"; - let salt = "such-a-random-salt"; - let iterations = 4096; - let output = [ - 203, 18, 206, 81, 4, 154, 193, 100, 147, 41, 211, 217, 177, 203, 69, 210, 194, 211, - 101, 1, 248, 156, 96, 0, 8, 223, 30, 87, 158, 41, 20, 42, - ]; - - let actual = SaltedPassword::new(password.as_bytes(), salt.as_bytes(), iterations); - let expected = legacy_pbkdf2_impl(password.as_bytes(), salt.as_bytes(), iterations); - - assert_eq!(actual.bytes, output); - assert_eq!(actual.bytes, expected.bytes); - } -} diff --git a/proxy/src/scram/secret.rs b/proxy/src/scram/secret.rs index 041548014a..fb3c45816e 100644 --- a/proxy/src/scram/secret.rs +++ b/proxy/src/scram/secret.rs @@ -3,7 +3,7 @@ use super::base64_decode_array; use super::key::ScramKey; -/// Server secret is produced from [password](super::password::SaltedPassword) +/// Server secret is produced from user's password, /// and is used throughout the authentication process. #[derive(Clone, Eq, PartialEq, Debug)] pub struct ServerSecret { @@ -59,21 +59,10 @@ impl ServerSecret { /// Build a new server secret from the prerequisites. /// XXX: We only use this function in tests. 
#[cfg(test)] - pub fn build(password: &str, salt: &[u8], iterations: u32) -> Option { - // TODO: implement proper password normalization required by the RFC - if !password.is_ascii() { - return None; - } - - let password = super::password::SaltedPassword::new(password.as_bytes(), salt, iterations); - - Some(Self { - iterations, - salt_base64: base64::encode(salt), - stored_key: password.client_key().sha256(), - server_key: password.server_key(), - doomed: false, - }) + pub fn build(password: &str) -> Option { + Self::parse(&postgres_protocol::password::scram_sha_256( + password.as_bytes(), + )) } } @@ -103,20 +92,4 @@ mod tests { assert_eq!(base64::encode(parsed.stored_key), stored_key); assert_eq!(base64::encode(parsed.server_key), server_key); } - - #[test] - fn build_scram_secret() { - let salt = b"salt"; - let secret = ServerSecret::build("password", salt, 4096).unwrap(); - assert_eq!(secret.iterations, 4096); - assert_eq!(secret.salt_base64, base64::encode(salt)); - assert_eq!( - base64::encode(secret.stored_key.as_ref()), - "lF4cRm/Jky763CN4HtxdHnjV4Q8AWTNlKvGmEFFU8IQ=" - ); - assert_eq!( - base64::encode(secret.server_key.as_ref()), - "ub8OgRsftnk2ccDMOt7ffHXNcikRkQkq1lh4xaAqrSw=" - ); - } } From 349b37501050052432c284210a4eff687e5b8335 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 19 Feb 2024 14:01:36 +0000 Subject: [PATCH 0209/1571] pageserver: remove heatmap file during tenant delete (#6806) ## Problem Secondary mode locations keep a local copy of the heatmap, which needs cleaning up during deletion. Closes: https://github.com/neondatabase/neon/issues/6802 ## Summary of changes - Extend test_live_migration to reproduce the issue - Remove heatmap-v1.json during tenant deletion --- pageserver/src/tenant/delete.rs | 2 ++ test_runner/regress/test_pageserver_secondary.py | 12 +++++++++--- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index b64be8dcc5..3d138da7af 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -246,6 +246,8 @@ async fn cleanup_remaining_fs_traces( rm(conf.tenant_deleted_mark_file_path(tenant_shard_id), false).await?; + rm(conf.tenant_heatmap_path(tenant_shard_id), false).await?; + fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| { Err(anyhow::anyhow!( "failpoint: tenant-delete-before-remove-tenant-dir" diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index aec989252c..cbff01dc2a 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -7,6 +7,7 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, S3Scrubber from fixtures.pageserver.utils import ( assert_prefix_empty, + poll_for_remote_storage_iterations, tenant_delete_wait_completed, ) from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind @@ -224,9 +225,8 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder): Test the sequence of location states that are used in a live migration. 
""" neon_env_builder.num_pageservers = 2 - neon_env_builder.enable_pageserver_remote_storage( - remote_storage_kind=RemoteStorageKind.MOCK_S3, - ) + remote_storage_kind = RemoteStorageKind.MOCK_S3 + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind=remote_storage_kind) env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) tenant_id = env.initial_tenant @@ -342,6 +342,12 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder): workload.churn_rows(64, pageserver_b.id) workload.validate(pageserver_b.id) + del workload + + # Check that deletion works properly on a tenant that was live-migrated + # (reproduce https://github.com/neondatabase/neon/issues/6802) + iterations = poll_for_remote_storage_iterations(remote_storage_kind) + tenant_delete_wait_completed(pageserver_b.http_client(), tenant_id, iterations) def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder): From 7e4280955e6a93536adf9abd3a6123b1783554ab Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 19 Feb 2024 14:12:20 +0000 Subject: [PATCH 0210/1571] control_plane/attachment_service: improve Scheduler (#6633) ## Problem One of the major shortcuts in the initial version of this code was to construct a fresh `Scheduler` each time we need it, which is an O(N^2) cost as the tenant count increases. ## Summary of changes - Keep `Scheduler` alive through the lifetime of ServiceState - Use `IntentState` as a reference tracking helper, updating Scheduler refcounts as nodes are added/removed from the intent. There is an automated test that checks things don't get pathologically slow with thousands of shards, but it's not included in this PR because tests that implicitly test the runner node performance take some thought to stabilize/land in CI. --- .../attachment_service/src/reconciler.rs | 28 +- .../attachment_service/src/scheduler.rs | 193 ++++++- .../attachment_service/src/service.rs | 519 ++++++++++-------- .../attachment_service/src/tenant_state.rs | 161 ++++-- 4 files changed, 590 insertions(+), 311 deletions(-) diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs index a4fbd80dc3..e765dfc2ae 100644 --- a/control_plane/attachment_service/src/reconciler.rs +++ b/control_plane/attachment_service/src/reconciler.rs @@ -27,7 +27,7 @@ pub(super) struct Reconciler { pub(super) tenant_shard_id: TenantShardId, pub(crate) shard: ShardIdentity, pub(crate) generation: Generation, - pub(crate) intent: IntentState, + pub(crate) intent: TargetState, pub(crate) config: TenantConfig, pub(crate) observed: ObservedState, @@ -62,6 +62,32 @@ pub(super) struct Reconciler { pub(crate) persistence: Arc, } +/// This is a snapshot of [`crate::tenant_state::IntentState`], but it does not do any +/// reference counting for Scheduler. The IntentState is what the scheduler works with, +/// and the TargetState is just the instruction for a particular Reconciler run. 
+#[derive(Debug)] +pub(crate) struct TargetState { + pub(crate) attached: Option, + pub(crate) secondary: Vec, +} + +impl TargetState { + pub(crate) fn from_intent(intent: &IntentState) -> Self { + Self { + attached: *intent.get_attached(), + secondary: intent.get_secondary().clone(), + } + } + + fn all_pageservers(&self) -> Vec { + let mut result = self.secondary.clone(); + if let Some(node_id) = &self.attached { + result.push(*node_id); + } + result + } +} + #[derive(thiserror::Error, Debug)] pub(crate) enum ReconcileError { #[error(transparent)] diff --git a/control_plane/attachment_service/src/scheduler.rs b/control_plane/attachment_service/src/scheduler.rs index 3b4c9e3464..7a99118312 100644 --- a/control_plane/attachment_service/src/scheduler.rs +++ b/control_plane/attachment_service/src/scheduler.rs @@ -1,9 +1,7 @@ -use pageserver_api::shard::TenantShardId; -use std::collections::{BTreeMap, HashMap}; +use crate::node::Node; +use std::collections::HashMap; use utils::{http::error::ApiError, id::NodeId}; -use crate::{node::Node, tenant_state::TenantState}; - /// Scenarios in which we cannot find a suitable location for a tenant shard #[derive(thiserror::Error, Debug)] pub enum ScheduleError { @@ -19,52 +17,95 @@ impl From for ApiError { } } +struct SchedulerNode { + /// How many shards are currently scheduled on this node, via their [`crate::tenant_state::IntentState`]. + shard_count: usize, + + /// Whether this node is currently elegible to have new shards scheduled (this is derived + /// from a node's availability state and scheduling policy). + may_schedule: bool, +} + pub(crate) struct Scheduler { - tenant_counts: HashMap, + nodes: HashMap, } impl Scheduler { - pub(crate) fn new( - tenants: &BTreeMap, - nodes: &HashMap, - ) -> Self { - let mut tenant_counts = HashMap::new(); - for node_id in nodes.keys() { - tenant_counts.insert(*node_id, 0); + pub(crate) fn new<'a>(nodes: impl Iterator) -> Self { + let mut scheduler_nodes = HashMap::new(); + for node in nodes { + scheduler_nodes.insert( + node.id, + SchedulerNode { + shard_count: 0, + may_schedule: node.may_schedule(), + }, + ); } - for tenant in tenants.values() { - if let Some(ps) = tenant.intent.attached { - let entry = tenant_counts.entry(ps).or_insert(0); - *entry += 1; + Self { + nodes: scheduler_nodes, + } + } + + /// Increment the reference count of a node. This reference count is used to guide scheduling + /// decisions, not for memory management: it represents one tenant shard whose IntentState targets + /// this node. + /// + /// It is an error to call this for a node that is not known to the scheduler (i.e. passed into + /// [`Self::new`] or [`Self::node_upsert`]) + pub(crate) fn node_inc_ref(&mut self, node_id: NodeId) { + let Some(node) = self.nodes.get_mut(&node_id) else { + tracing::error!("Scheduler missing node {node_id}"); + debug_assert!(false); + return; + }; + + node.shard_count += 1; + } + + /// Decrement a node's reference count. Inverse of [`Self::node_inc_ref`]. 
+ pub(crate) fn node_dec_ref(&mut self, node_id: NodeId) { + let Some(node) = self.nodes.get_mut(&node_id) else { + debug_assert!(false); + tracing::error!("Scheduler missing node {node_id}"); + return; + }; + + node.shard_count -= 1; + } + + pub(crate) fn node_upsert(&mut self, node: &Node) { + use std::collections::hash_map::Entry::*; + match self.nodes.entry(node.id) { + Occupied(mut entry) => { + entry.get_mut().may_schedule = node.may_schedule(); + } + Vacant(entry) => { + entry.insert(SchedulerNode { + shard_count: 0, + may_schedule: node.may_schedule(), + }); } } - - for (node_id, node) in nodes { - if !node.may_schedule() { - tenant_counts.remove(node_id); - } - } - - Self { tenant_counts } } pub(crate) fn schedule_shard( &mut self, hard_exclude: &[NodeId], ) -> Result { - if self.tenant_counts.is_empty() { + if self.nodes.is_empty() { return Err(ScheduleError::NoPageservers); } let mut tenant_counts: Vec<(NodeId, usize)> = self - .tenant_counts + .nodes .iter() .filter_map(|(k, v)| { - if hard_exclude.contains(k) { + if hard_exclude.contains(k) || !v.may_schedule { None } else { - Some((*k, *v)) + Some((*k, v.shard_count)) } }) .collect(); @@ -73,7 +114,18 @@ impl Scheduler { tenant_counts.sort_by_key(|i| (i.1, i.0)); if tenant_counts.is_empty() { - // After applying constraints, no pageservers were left + // After applying constraints, no pageservers were left. We log some detail about + // the state of nodes to help understand why this happened. This is not logged as an error because + // it is legitimately possible for enough nodes to be Offline to prevent scheduling a shard. + tracing::info!("Scheduling failure, while excluding {hard_exclude:?}, node states:"); + for (node_id, node) in &self.nodes { + tracing::info!( + "Node {node_id}: may_schedule={} shards={}", + node.may_schedule, + node.shard_count + ); + } + return Err(ScheduleError::ImpossibleConstraint); } @@ -82,7 +134,88 @@ impl Scheduler { "scheduler selected node {node_id} (elegible nodes {:?}, exclude: {hard_exclude:?})", tenant_counts.iter().map(|i| i.0 .0).collect::>() ); - *self.tenant_counts.get_mut(&node_id).unwrap() += 1; + + // Note that we do not update shard count here to reflect the scheduling: that + // is IntentState's job when the scheduled location is used. 
+ Ok(node_id) } } + +#[cfg(test)] +mod tests { + use super::*; + use std::collections::HashMap; + + use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy}; + use utils::id::NodeId; + + use crate::{node::Node, tenant_state::IntentState}; + + #[test] + fn scheduler_basic() -> anyhow::Result<()> { + let mut nodes = HashMap::new(); + nodes.insert( + NodeId(1), + Node { + id: NodeId(1), + availability: NodeAvailability::Active, + scheduling: NodeSchedulingPolicy::Active, + listen_http_addr: String::new(), + listen_http_port: 0, + listen_pg_addr: String::new(), + listen_pg_port: 0, + }, + ); + + nodes.insert( + NodeId(2), + Node { + id: NodeId(2), + availability: NodeAvailability::Active, + scheduling: NodeSchedulingPolicy::Active, + listen_http_addr: String::new(), + listen_http_port: 0, + listen_pg_addr: String::new(), + listen_pg_port: 0, + }, + ); + + let mut scheduler = Scheduler::new(nodes.values()); + let mut t1_intent = IntentState::new(); + let mut t2_intent = IntentState::new(); + + let scheduled = scheduler.schedule_shard(&[])?; + t1_intent.set_attached(&mut scheduler, Some(scheduled)); + let scheduled = scheduler.schedule_shard(&[])?; + t2_intent.set_attached(&mut scheduler, Some(scheduled)); + + assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1); + assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1); + + let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers())?; + t1_intent.push_secondary(&mut scheduler, scheduled); + + assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1); + assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 2); + + t1_intent.clear(&mut scheduler); + assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 0); + assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1); + + if cfg!(debug_assertions) { + // Dropping an IntentState without clearing it causes a panic in debug mode, + // because we have failed to properly update scheduler shard counts. + let result = std::panic::catch_unwind(move || { + drop(t2_intent); + }); + assert!(result.is_err()); + } else { + t2_intent.clear(&mut scheduler); + assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 0); + assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 0); + } + + Ok(()) + } +} diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index 149cb7f2ba..097b4a1a47 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -69,6 +69,8 @@ struct ServiceState { nodes: Arc>, + scheduler: Scheduler, + compute_hook: Arc, result_tx: tokio::sync::mpsc::UnboundedSender, @@ -80,14 +82,26 @@ impl ServiceState { result_tx: tokio::sync::mpsc::UnboundedSender, nodes: HashMap, tenants: BTreeMap, + scheduler: Scheduler, ) -> Self { Self { tenants, nodes: Arc::new(nodes), + scheduler, compute_hook: Arc::new(ComputeHook::new(config)), result_tx, } } + + fn parts_mut( + &mut self, + ) -> ( + &mut Arc>, + &mut BTreeMap, + &mut Scheduler, + ) { + (&mut self.nodes, &mut self.tenants, &mut self.scheduler) + } } #[derive(Clone)] @@ -234,19 +248,20 @@ impl Service { // Populate intent and observed states for all tenants, based on reported state on pageservers let (shard_count, nodes) = { let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); // Mark nodes online if they responded to us: nodes are offline by default after a restart. 
- let mut nodes = (*locked.nodes).clone(); - for (node_id, node) in nodes.iter_mut() { + let mut new_nodes = (**nodes).clone(); + for (node_id, node) in new_nodes.iter_mut() { if nodes_online.contains(node_id) { node.availability = NodeAvailability::Active; + scheduler.node_upsert(node); } } - locked.nodes = Arc::new(nodes); - let nodes = locked.nodes.clone(); + *nodes = Arc::new(new_nodes); for (tenant_shard_id, (node_id, observed_loc)) in observed { - let Some(tenant_state) = locked.tenants.get_mut(&tenant_shard_id) else { + let Some(tenant_state) = tenants.get_mut(&tenant_shard_id) else { cleanup.push((tenant_shard_id, node_id)); continue; }; @@ -258,10 +273,9 @@ impl Service { } // Populate each tenant's intent state - let mut scheduler = Scheduler::new(&locked.tenants, &nodes); - for (tenant_shard_id, tenant_state) in locked.tenants.iter_mut() { + for (tenant_shard_id, tenant_state) in tenants.iter_mut() { tenant_state.intent_from_observed(); - if let Err(e) = tenant_state.schedule(&mut scheduler) { + if let Err(e) = tenant_state.schedule(scheduler) { // Non-fatal error: we are unable to properly schedule the tenant, perhaps because // not enough pageservers are available. The tenant may well still be available // to clients. @@ -276,7 +290,7 @@ impl Service { } } - (locked.tenants.len(), nodes) + (tenants.len(), nodes.clone()) }; // TODO: if any tenant's intent now differs from its loaded generation_pageserver, we should clear that @@ -393,7 +407,56 @@ impl Service { } } - #[instrument(skip_all)] + /// Apply the contents of a [`ReconcileResult`] to our in-memory state: if the reconciliation + /// was successful, this will update the observed state of the tenant such that subsequent + /// calls to [`TenantState::maybe_reconcile`] will do nothing. + #[instrument(skip_all, fields( + tenant_id=%result.tenant_shard_id.tenant_id, shard_id=%result.tenant_shard_id.shard_slug(), + sequence=%result.sequence + ))] + fn process_result(&self, result: ReconcileResult) { + let mut locked = self.inner.write().unwrap(); + let Some(tenant) = locked.tenants.get_mut(&result.tenant_shard_id) else { + // A reconciliation result might race with removing a tenant: drop results for + // tenants that aren't in our map. + return; + }; + + // Usually generation should only be updated via this path, so the max() isn't + // needed, but it is used to handle out-of-band updates via. e.g. test hook. + tenant.generation = std::cmp::max(tenant.generation, result.generation); + + // If the reconciler signals that it failed to notify compute, set this state on + // the shard so that a future [`TenantState::maybe_reconcile`] will try again. + tenant.pending_compute_notification = result.pending_compute_notification; + + match result.result { + Ok(()) => { + for (node_id, loc) in &result.observed.locations { + if let Some(conf) = &loc.conf { + tracing::info!("Updating observed location {}: {:?}", node_id, conf); + } else { + tracing::info!("Setting observed location {} to None", node_id,) + } + } + tenant.observed = result.observed; + tenant.waiter.advance(result.sequence); + } + Err(e) => { + tracing::warn!("Reconcile error: {}", e); + + // Ordering: populate last_error before advancing error_seq, + // so that waiters will see the correct error after waiting. 
+ *(tenant.last_error.lock().unwrap()) = format!("{e}"); + tenant.error_waiter.advance(result.sequence); + + for (node_id, o) in result.observed.locations { + tenant.observed.locations.insert(node_id, o); + } + } + } + } + async fn process_results( &self, mut result_rx: tokio::sync::mpsc::UnboundedReceiver, @@ -412,55 +475,7 @@ impl Service { } }; - tracing::info!( - "Reconcile result for sequence {}, ok={}", - result.sequence, - result.result.is_ok() - ); - let mut locked = self.inner.write().unwrap(); - let Some(tenant) = locked.tenants.get_mut(&result.tenant_shard_id) else { - // A reconciliation result might race with removing a tenant: drop results for - // tenants that aren't in our map. - continue; - }; - - // Usually generation should only be updated via this path, so the max() isn't - // needed, but it is used to handle out-of-band updates via. e.g. test hook. - tenant.generation = std::cmp::max(tenant.generation, result.generation); - - // If the reconciler signals that it failed to notify compute, set this state on - // the shard so that a future [`TenantState::maybe_reconcile`] will try again. - tenant.pending_compute_notification = result.pending_compute_notification; - - match result.result { - Ok(()) => { - for (node_id, loc) in &result.observed.locations { - if let Some(conf) = &loc.conf { - tracing::info!("Updating observed location {}: {:?}", node_id, conf); - } else { - tracing::info!("Setting observed location {} to None", node_id,) - } - } - tenant.observed = result.observed; - tenant.waiter.advance(result.sequence); - } - Err(e) => { - tracing::warn!( - "Reconcile error on tenant {}: {}", - tenant.tenant_shard_id, - e - ); - - // Ordering: populate last_error before advancing error_seq, - // so that waiters will see the correct error after waiting. - *(tenant.last_error.lock().unwrap()) = format!("{e}"); - tenant.error_waiter.advance(result.sequence); - - for (node_id, o) in result.observed.locations { - tenant.observed.locations.insert(node_id, o); - } - } - } + self.process_result(result); } } @@ -481,6 +496,32 @@ impl Service { let mut tenants = BTreeMap::new(); + let mut scheduler = Scheduler::new(nodes.values()); + + #[cfg(feature = "testing")] + { + // Hack: insert scheduler state for all nodes referenced by shards, as compatibility + // tests only store the shards, not the nodes. The nodes will be loaded shortly + // after when pageservers start up and register. + let mut node_ids = HashSet::new(); + for tsp in &tenant_shard_persistence { + node_ids.insert(tsp.generation_pageserver); + } + for node_id in node_ids { + tracing::info!("Creating node {} in scheduler for tests", node_id); + let node = Node { + id: NodeId(node_id as u64), + availability: NodeAvailability::Active, + scheduling: NodeSchedulingPolicy::Active, + listen_http_addr: "".to_string(), + listen_http_port: 123, + listen_pg_addr: "".to_string(), + listen_pg_port: 123, + }; + + scheduler.node_upsert(&node); + } + } for tsp in tenant_shard_persistence { let tenant_shard_id = TenantShardId { tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?, @@ -501,7 +542,10 @@ impl Service { // it with what we can infer: the node for which a generation was most recently issued. 
let mut intent = IntentState::new(); if tsp.generation_pageserver != i64::MAX { - intent.attached = Some(NodeId(tsp.generation_pageserver as u64)) + intent.set_attached( + &mut scheduler, + Some(NodeId(tsp.generation_pageserver as u64)), + ); } let new_tenant = TenantState { @@ -532,6 +576,7 @@ impl Service { result_tx, nodes, tenants, + scheduler, ))), config, persistence, @@ -636,8 +681,9 @@ impl Service { }; let mut locked = self.inner.write().unwrap(); - let tenant_state = locked - .tenants + let (_nodes, tenants, scheduler) = locked.parts_mut(); + + let tenant_state = tenants .get_mut(&attach_req.tenant_shard_id) .expect("Checked for existence above"); @@ -657,7 +703,7 @@ impl Service { generation = ?tenant_state.generation, "issuing", ); - } else if let Some(ps_id) = tenant_state.intent.attached { + } else if let Some(ps_id) = tenant_state.intent.get_attached() { tracing::info!( tenant_id = %attach_req.tenant_shard_id, %ps_id, @@ -669,7 +715,9 @@ impl Service { tenant_id = %attach_req.tenant_shard_id, "no-op: tenant already has no pageserver"); } - tenant_state.intent.attached = attach_req.node_id; + tenant_state + .intent + .set_attached(scheduler, attach_req.node_id); tracing::info!( "attach_hook: tenant {} set generation {:?}, pageserver {}", @@ -716,7 +764,7 @@ impl Service { InspectResponse { attachment: tenant_state.and_then(|s| { s.intent - .attached + .get_attached() .map(|ps| (s.generation.into().unwrap(), ps)) }), } @@ -862,16 +910,15 @@ impl Service { let (waiters, response_shards) = { let mut locked = self.inner.write().unwrap(); + let (_nodes, tenants, scheduler) = locked.parts_mut(); let mut response_shards = Vec::new(); - let mut scheduler = Scheduler::new(&locked.tenants, &locked.nodes); - for tenant_shard_id in create_ids { tracing::info!("Creating shard {tenant_shard_id}..."); use std::collections::btree_map::Entry; - match locked.tenants.entry(tenant_shard_id) { + match tenants.entry(tenant_shard_id) { Entry::Occupied(mut entry) => { tracing::info!( "Tenant shard {tenant_shard_id} already exists while creating" @@ -881,7 +928,7 @@ impl Service { // attached and secondary locations (independently) away frorm those // pageservers also holding a shard for this tenant. 
- entry.get_mut().schedule(&mut scheduler).map_err(|e| { + entry.get_mut().schedule(scheduler).map_err(|e| { ApiError::Conflict(format!( "Failed to schedule shard {tenant_shard_id}: {e}" )) @@ -892,7 +939,7 @@ impl Service { node_id: entry .get() .intent - .attached + .get_attached() .expect("We just set pageserver if it was None"), generation: entry.get().generation.into().unwrap(), }); @@ -914,7 +961,7 @@ impl Service { } state.config = create_req.config.clone(); - state.schedule(&mut scheduler).map_err(|e| { + state.schedule(scheduler).map_err(|e| { ApiError::Conflict(format!( "Failed to schedule shard {tenant_shard_id}: {e}" )) @@ -924,7 +971,7 @@ impl Service { shard_id: tenant_shard_id, node_id: state .intent - .attached + .get_attached() .expect("We just set pageserver if it was None"), generation: state.generation.into().unwrap(), }); @@ -1002,16 +1049,11 @@ impl Service { let mut locked = self.inner.write().unwrap(); let result_tx = locked.result_tx.clone(); let compute_hook = locked.compute_hook.clone(); - let pageservers = locked.nodes.clone(); - - let mut scheduler = Scheduler::new(&locked.tenants, &locked.nodes); + let (nodes, tenants, scheduler) = locked.parts_mut(); // Maybe we have existing shards let mut create = true; - for (shard_id, shard) in locked - .tenants - .range_mut(TenantShardId::tenant_range(tenant_id)) - { + for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { // Saw an existing shard: this is not a creation create = false; @@ -1035,7 +1077,7 @@ impl Service { | LocationConfigMode::AttachedSingle | LocationConfigMode::AttachedStale => { // TODO: persistence for changes in policy - if pageservers.len() > 1 { + if nodes.len() > 1 { shard.policy = PlacementPolicy::Double(1) } else { // Convenience for dev/test: if we just have one pageserver, import @@ -1045,11 +1087,11 @@ impl Service { } } - shard.schedule(&mut scheduler)?; + shard.schedule(scheduler)?; let maybe_waiter = shard.maybe_reconcile( result_tx.clone(), - &pageservers, + nodes, &compute_hook, &self.config, &self.persistence, @@ -1060,10 +1102,10 @@ impl Service { waiters.push(waiter); } - if let Some(node_id) = shard.intent.attached { + if let Some(node_id) = shard.intent.get_attached() { result.shards.push(TenantShardLocation { shard_id: *shard_id, - node_id, + node_id: *node_id, }) } } @@ -1154,7 +1196,7 @@ impl Service { for (tenant_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id)) { - let node_id = shard.intent.attached.ok_or_else(|| { + let node_id = shard.intent.get_attached().ok_or_else(|| { ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled")) })?; let node = locked @@ -1211,9 +1253,16 @@ impl Service { // Drop in-memory state { let mut locked = self.inner.write().unwrap(); - locked - .tenants - .retain(|tenant_shard_id, _shard| tenant_shard_id.tenant_id != tenant_id); + let (_nodes, tenants, scheduler) = locked.parts_mut(); + + // Dereference Scheduler from shards before dropping them + for (_tenant_shard_id, shard) in + tenants.range_mut(TenantShardId::tenant_range(tenant_id)) + { + shard.intent.clear(scheduler); + } + + tenants.retain(|tenant_shard_id, _shard| tenant_shard_id.tenant_id != tenant_id); tracing::info!( "Deleted tenant {tenant_id}, now have {} tenants", locked.tenants.len() @@ -1248,7 +1297,7 @@ impl Service { for (tenant_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id)) { - let node_id = shard.intent.attached.ok_or_else(|| { + let node_id = 
shard.intent.get_attached().ok_or_else(|| { ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled")) })?; let node = locked @@ -1329,7 +1378,7 @@ impl Service { for (tenant_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id)) { - let node_id = shard.intent.attached.ok_or_else(|| { + let node_id = shard.intent.get_attached().ok_or_else(|| { ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled")) })?; let node = locked @@ -1401,13 +1450,13 @@ impl Service { // TODO: should use the ID last published to compute_hook, rather than the intent: the intent might // point to somewhere we haven't attached yet. - let Some(node_id) = shard.intent.attached else { + let Some(node_id) = shard.intent.get_attached() else { return Err(ApiError::Conflict( "Cannot call timeline API on non-attached tenant".to_string(), )); }; - let Some(node) = locked.nodes.get(&node_id) else { + let Some(node) = locked.nodes.get(node_id) else { // This should never happen return Err(ApiError::InternalServerError(anyhow::anyhow!( "Shard refers to nonexistent node" @@ -1432,12 +1481,13 @@ impl Service { for (tenant_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id)) { - let node_id = shard - .intent - .attached - .ok_or(ApiError::BadRequest(anyhow::anyhow!( - "Cannot locate a tenant that is not attached" - )))?; + let node_id = + shard + .intent + .get_attached() + .ok_or(ApiError::BadRequest(anyhow::anyhow!( + "Cannot locate a tenant that is not attached" + )))?; let node = pageservers .get(&node_id) @@ -1510,106 +1560,104 @@ impl Service { } // Validate input, and calculate which shards we will create - let (old_shard_count, targets, compute_hook) = { - let locked = self.inner.read().unwrap(); - - let pageservers = locked.nodes.clone(); - - let mut targets = Vec::new(); - - // In case this is a retry, count how many already-split shards we found - let mut children_found = Vec::new(); - let mut old_shard_count = None; - - for (tenant_shard_id, shard) in - locked.tenants.range(TenantShardId::tenant_range(tenant_id)) + let (old_shard_count, targets, compute_hook) = { - match shard.shard.count.count().cmp(&split_req.new_shard_count) { - Ordering::Equal => { - // Already split this - children_found.push(*tenant_shard_id); - continue; - } - Ordering::Greater => { - return Err(ApiError::BadRequest(anyhow::anyhow!( - "Requested count {} but already have shards at count {}", - split_req.new_shard_count, - shard.shard.count.count() - ))); - } - Ordering::Less => { - // Fall through: this shard has lower count than requested, - // is a candidate for splitting. - } - } + let locked = self.inner.read().unwrap(); - match old_shard_count { - None => old_shard_count = Some(shard.shard.count), - Some(old_shard_count) => { - if old_shard_count != shard.shard.count { - // We may hit this case if a caller asked for two splits to - // different sizes, before the first one is complete. - // e.g. 1->2, 2->4, where the 4 call comes while we have a mixture - // of shard_count=1 and shard_count=2 shards in the map. 
- return Err(ApiError::Conflict( - "Cannot split, currently mid-split".to_string(), - )); + let pageservers = locked.nodes.clone(); + + let mut targets = Vec::new(); + + // In case this is a retry, count how many already-split shards we found + let mut children_found = Vec::new(); + let mut old_shard_count = None; + + for (tenant_shard_id, shard) in + locked.tenants.range(TenantShardId::tenant_range(tenant_id)) + { + match shard.shard.count.count().cmp(&split_req.new_shard_count) { + Ordering::Equal => { + // Already split this + children_found.push(*tenant_shard_id); + continue; + } + Ordering::Greater => { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Requested count {} but already have shards at count {}", + split_req.new_shard_count, + shard.shard.count.count() + ))); + } + Ordering::Less => { + // Fall through: this shard has lower count than requested, + // is a candidate for splitting. } } - } - if policy.is_none() { - policy = Some(shard.policy.clone()); - } - if shard_ident.is_none() { - shard_ident = Some(shard.shard); - } - if tenant_shard_id.shard_count.count() == split_req.new_shard_count { - tracing::info!( - "Tenant shard {} already has shard count {}", - tenant_shard_id, - split_req.new_shard_count - ); - continue; - } + match old_shard_count { + None => old_shard_count = Some(shard.shard.count), + Some(old_shard_count) => { + if old_shard_count != shard.shard.count { + // We may hit this case if a caller asked for two splits to + // different sizes, before the first one is complete. + // e.g. 1->2, 2->4, where the 4 call comes while we have a mixture + // of shard_count=1 and shard_count=2 shards in the map. + return Err(ApiError::Conflict( + "Cannot split, currently mid-split".to_string(), + )); + } + } + } + if policy.is_none() { + policy = Some(shard.policy.clone()); + } + if shard_ident.is_none() { + shard_ident = Some(shard.shard); + } - let node_id = - shard - .intent - .attached - .ok_or(ApiError::BadRequest(anyhow::anyhow!( - "Cannot split a tenant that is not attached" - )))?; + if tenant_shard_id.shard_count.count() == split_req.new_shard_count { + tracing::info!( + "Tenant shard {} already has shard count {}", + tenant_shard_id, + split_req.new_shard_count + ); + continue; + } - let node = pageservers - .get(&node_id) - .expect("Pageservers may not be deleted while referenced"); + let node_id = shard.intent.get_attached().ok_or(ApiError::BadRequest( + anyhow::anyhow!("Cannot split a tenant that is not attached"), + ))?; - // TODO: if any reconciliation is currently in progress for this shard, wait for it. + let node = pageservers + .get(&node_id) + .expect("Pageservers may not be deleted while referenced"); - targets.push(SplitTarget { - parent_id: *tenant_shard_id, - node: node.clone(), - child_ids: tenant_shard_id.split(ShardCount::new(split_req.new_shard_count)), - }); - } + // TODO: if any reconciliation is currently in progress for this shard, wait for it. - if targets.is_empty() { - if children_found.len() == split_req.new_shard_count as usize { - return Ok(TenantShardSplitResponse { - new_shards: children_found, + targets.push(SplitTarget { + parent_id: *tenant_shard_id, + node: node.clone(), + child_ids: tenant_shard_id + .split(ShardCount::new(split_req.new_shard_count)), }); - } else { - // No shards found to split, and no existing children found: the - // tenant doesn't exist at all. 
- return Err(ApiError::NotFound( - anyhow::anyhow!("Tenant {} not found", tenant_id).into(), - )); } - } - (old_shard_count, targets, locked.compute_hook.clone()) - }; + if targets.is_empty() { + if children_found.len() == split_req.new_shard_count as usize { + return Ok(TenantShardSplitResponse { + new_shards: children_found, + }); + } else { + // No shards found to split, and no existing children found: the + // tenant doesn't exist at all. + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant {} not found", tenant_id).into(), + )); + } + } + + (old_shard_count, targets, locked.compute_hook.clone()) + }; // unwrap safety: we would have returned above if we didn't find at least one shard to split let old_shard_count = old_shard_count.unwrap(); @@ -1751,6 +1799,7 @@ impl Service { let mut child_locations = Vec::new(); { let mut locked = self.inner.write().unwrap(); + let (_nodes, tenants, scheduler) = locked.parts_mut(); for target in targets { let SplitTarget { parent_id, @@ -1758,19 +1807,14 @@ impl Service { child_ids, } = target; let (pageserver, generation, config) = { - let old_state = locked - .tenants + let mut old_state = tenants .remove(&parent_id) .expect("It was present, we just split it"); - ( - old_state.intent.attached.unwrap(), - old_state.generation, - old_state.config.clone(), - ) + let old_attached = old_state.intent.get_attached().unwrap(); + old_state.intent.clear(scheduler); + (old_attached, old_state.generation, old_state.config.clone()) }; - locked.tenants.remove(&parent_id); - for child in child_ids { let mut child_shard = shard_ident; child_shard.number = child.shard_number; @@ -1785,7 +1829,7 @@ impl Service { ); let mut child_state = TenantState::new(child, child_shard, policy.clone()); - child_state.intent = IntentState::single(Some(pageserver)); + child_state.intent = IntentState::single(scheduler, Some(pageserver)); child_state.observed = ObservedState { locations: child_observed, }; @@ -1798,7 +1842,7 @@ impl Service { child_locations.push((child, pageserver)); - locked.tenants.insert(child, child_state); + tenants.insert(child, child_state); response.new_shards.push(child); } } @@ -1834,35 +1878,34 @@ impl Service { ) -> Result { let waiter = { let mut locked = self.inner.write().unwrap(); - let result_tx = locked.result_tx.clone(); - let pageservers = locked.nodes.clone(); let compute_hook = locked.compute_hook.clone(); + let (nodes, tenants, scheduler) = locked.parts_mut(); - let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) else { + let Some(shard) = tenants.get_mut(&tenant_shard_id) else { return Err(ApiError::NotFound( anyhow::anyhow!("Tenant shard not found").into(), )); }; - if shard.intent.attached == Some(migrate_req.node_id) { + if shard.intent.get_attached() == &Some(migrate_req.node_id) { // No-op case: we will still proceed to wait for reconciliation in case it is // incomplete from an earlier update to the intent. tracing::info!("Migrating: intent is unchanged {:?}", shard.intent); } else { - let old_attached = shard.intent.attached; + let old_attached = *shard.intent.get_attached(); match shard.policy { PlacementPolicy::Single => { - shard.intent.secondary.clear(); + shard.intent.clear_secondary(scheduler); } PlacementPolicy::Double(_n) => { // If our new attached node was a secondary, it no longer should be. 
- shard.intent.secondary.retain(|s| s != &migrate_req.node_id); + shard.intent.remove_secondary(scheduler, migrate_req.node_id); // If we were already attached to something, demote that to a secondary if let Some(old_attached) = old_attached { - shard.intent.secondary.push(old_attached); + shard.intent.push_secondary(scheduler, old_attached); } } PlacementPolicy::Detached => { @@ -1871,7 +1914,9 @@ impl Service { ))) } } - shard.intent.attached = Some(migrate_req.node_id); + shard + .intent + .set_attached(scheduler, Some(migrate_req.node_id)); tracing::info!("Migrating: new intent {:?}", shard.intent); shard.sequence = shard.sequence.next(); @@ -1879,7 +1924,7 @@ impl Service { shard.maybe_reconcile( result_tx, - &pageservers, + nodes, &compute_hook, &self.config, &self.persistence, @@ -1903,13 +1948,16 @@ impl Service { self.persistence.delete_tenant(tenant_id).await?; let mut locked = self.inner.write().unwrap(); + let (_nodes, tenants, scheduler) = locked.parts_mut(); let mut shards = Vec::new(); - for (tenant_shard_id, _) in locked.tenants.range(TenantShardId::tenant_range(tenant_id)) { + for (tenant_shard_id, _) in tenants.range(TenantShardId::tenant_range(tenant_id)) { shards.push(*tenant_shard_id); } - for shard in shards { - locked.tenants.remove(&shard); + for shard_id in shards { + if let Some(mut shard) = tenants.remove(&shard_id) { + shard.intent.clear(scheduler); + } } Ok(()) @@ -2004,6 +2052,7 @@ impl Service { let mut locked = self.inner.write().unwrap(); let mut new_nodes = (*locked.nodes).clone(); + locked.scheduler.node_upsert(&new_node); new_nodes.insert(register_req.node_id, new_node); locked.nodes = Arc::new(new_nodes); @@ -2020,8 +2069,9 @@ impl Service { let mut locked = self.inner.write().unwrap(); let result_tx = locked.result_tx.clone(); let compute_hook = locked.compute_hook.clone(); + let (nodes, tenants, scheduler) = locked.parts_mut(); - let mut new_nodes = (*locked.nodes).clone(); + let mut new_nodes = (**nodes).clone(); let Some(node) = new_nodes.get_mut(&config_req.node_id) else { return Err(ApiError::NotFound( @@ -2057,11 +2107,14 @@ impl Service { // to wake up and start working. } + // Update the scheduler, in case the elegibility of the node for new shards has changed + scheduler.node_upsert(node); + let new_nodes = Arc::new(new_nodes); - let mut scheduler = Scheduler::new(&locked.tenants, &new_nodes); if offline_transition { - for (tenant_shard_id, tenant_state) in &mut locked.tenants { + let mut tenants_affected: usize = 0; + for (tenant_shard_id, tenant_state) in tenants { if let Some(observed_loc) = tenant_state.observed.locations.get_mut(&config_req.node_id) { @@ -2072,7 +2125,7 @@ impl Service { if tenant_state.intent.notify_offline(config_req.node_id) { tenant_state.sequence = tenant_state.sequence.next(); - match tenant_state.schedule(&mut scheduler) { + match tenant_state.schedule(scheduler) { Err(e) => { // It is possible that some tenants will become unschedulable when too many pageservers // go offline: in this case there isn't much we can do other than make the issue observable. 
@@ -2080,19 +2133,29 @@ impl Service { tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", config_req.node_id); } Ok(()) => { - tenant_state.maybe_reconcile( - result_tx.clone(), - &new_nodes, - &compute_hook, - &self.config, - &self.persistence, - &self.gate, - &self.cancel, - ); + if tenant_state + .maybe_reconcile( + result_tx.clone(), + &new_nodes, + &compute_hook, + &self.config, + &self.persistence, + &self.gate, + &self.cancel, + ) + .is_some() + { + tenants_affected += 1; + }; } } } } + tracing::info!( + "Launched {} reconciler tasks for tenants affected by node {} going offline", + tenants_affected, + config_req.node_id + ) } if active_transition { @@ -2135,18 +2198,14 @@ impl Service { let mut waiters = Vec::new(); let result_tx = locked.result_tx.clone(); let compute_hook = locked.compute_hook.clone(); - let mut scheduler = Scheduler::new(&locked.tenants, &locked.nodes); - let pageservers = locked.nodes.clone(); + let (nodes, tenants, scheduler) = locked.parts_mut(); - for (_tenant_shard_id, shard) in locked - .tenants - .range_mut(TenantShardId::tenant_range(tenant_id)) - { - shard.schedule(&mut scheduler)?; + for (_tenant_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { + shard.schedule(scheduler)?; if let Some(waiter) = shard.maybe_reconcile( result_tx.clone(), - &pageservers, + nodes, &compute_hook, &self.config, &self.persistence, diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs index dd753ece3d..1a68864091 100644 --- a/control_plane/attachment_service/src/tenant_state.rs +++ b/control_plane/attachment_service/src/tenant_state.rs @@ -19,7 +19,9 @@ use crate::{ compute_hook::ComputeHook, node::Node, persistence::{split_state::SplitState, Persistence}, - reconciler::{attached_location_conf, secondary_location_conf, ReconcileError, Reconciler}, + reconciler::{ + attached_location_conf, secondary_location_conf, ReconcileError, Reconciler, TargetState, + }, scheduler::{ScheduleError, Scheduler}, service, PlacementPolicy, Sequence, }; @@ -88,8 +90,107 @@ pub(crate) struct TenantState { #[derive(Default, Clone, Debug)] pub(crate) struct IntentState { - pub(crate) attached: Option, - pub(crate) secondary: Vec, + attached: Option, + secondary: Vec, +} + +impl IntentState { + pub(crate) fn new() -> Self { + Self { + attached: None, + secondary: vec![], + } + } + pub(crate) fn single(scheduler: &mut Scheduler, node_id: Option) -> Self { + if let Some(node_id) = node_id { + scheduler.node_inc_ref(node_id); + } + Self { + attached: node_id, + secondary: vec![], + } + } + + pub(crate) fn set_attached(&mut self, scheduler: &mut Scheduler, new_attached: Option) { + if self.attached != new_attached { + if let Some(old_attached) = self.attached.take() { + scheduler.node_dec_ref(old_attached); + } + if let Some(new_attached) = &new_attached { + scheduler.node_inc_ref(*new_attached); + } + self.attached = new_attached; + } + } + + pub(crate) fn push_secondary(&mut self, scheduler: &mut Scheduler, new_secondary: NodeId) { + debug_assert!(!self.secondary.contains(&new_secondary)); + scheduler.node_inc_ref(new_secondary); + self.secondary.push(new_secondary); + } + + /// It is legal to call this with a node that is not currently a secondary: that is a no-op + pub(crate) fn remove_secondary(&mut self, scheduler: &mut Scheduler, node_id: NodeId) { + let index = self.secondary.iter().position(|n| *n == node_id); + if let Some(index) = index { + 
scheduler.node_dec_ref(node_id); + self.secondary.remove(index); + } + } + + pub(crate) fn clear_secondary(&mut self, scheduler: &mut Scheduler) { + for secondary in self.secondary.drain(..) { + scheduler.node_dec_ref(secondary); + } + } + + pub(crate) fn clear(&mut self, scheduler: &mut Scheduler) { + if let Some(old_attached) = self.attached.take() { + scheduler.node_dec_ref(old_attached); + } + + self.clear_secondary(scheduler); + } + + pub(crate) fn all_pageservers(&self) -> Vec { + let mut result = Vec::new(); + if let Some(p) = self.attached { + result.push(p) + } + + result.extend(self.secondary.iter().copied()); + + result + } + + pub(crate) fn get_attached(&self) -> &Option { + &self.attached + } + + pub(crate) fn get_secondary(&self) -> &Vec { + &self.secondary + } + + /// When a node goes offline, we update intents to avoid using it + /// as their attached pageserver. + /// + /// Returns true if a change was made + pub(crate) fn notify_offline(&mut self, node_id: NodeId) -> bool { + if self.attached == Some(node_id) { + self.attached = None; + self.secondary.push(node_id); + true + } else { + false + } + } +} + +impl Drop for IntentState { + fn drop(&mut self) { + // Must clear before dropping, to avoid leaving stale refcounts in the Scheduler + debug_assert!(self.attached.is_none() && self.secondary.is_empty()); + } } #[derive(Default, Clone)] @@ -182,46 +283,6 @@ pub(crate) struct ReconcileResult { pub(crate) pending_compute_notification: bool, } -impl IntentState { - pub(crate) fn new() -> Self { - Self { - attached: None, - secondary: vec![], - } - } - pub(crate) fn all_pageservers(&self) -> Vec { - let mut result = Vec::new(); - if let Some(p) = self.attached { - result.push(p) - } - - result.extend(self.secondary.iter().copied()); - - result - } - - pub(crate) fn single(node_id: Option) -> Self { - Self { - attached: node_id, - secondary: vec![], - } - } - - /// When a node goes offline, we update intents to avoid using it - /// as their attached pageserver. 
- /// - /// Returns true if a change was made - pub(crate) fn notify_offline(&mut self, node_id: NodeId) -> bool { - if self.attached == Some(node_id) { - self.attached = None; - self.secondary.push(node_id); - true - } else { - false - } - } -} - impl ObservedState { pub(crate) fn new() -> Self { Self { @@ -315,12 +376,12 @@ impl TenantState { // Should have exactly one attached, and zero secondaries if self.intent.attached.is_none() { let node_id = scheduler.schedule_shard(&used_pageservers)?; - self.intent.attached = Some(node_id); + self.intent.set_attached(scheduler, Some(node_id)); used_pageservers.push(node_id); modified = true; } if !self.intent.secondary.is_empty() { - self.intent.secondary.clear(); + self.intent.clear_secondary(scheduler); modified = true; } } @@ -328,14 +389,14 @@ impl TenantState { // Should have exactly one attached, and N secondaries if self.intent.attached.is_none() { let node_id = scheduler.schedule_shard(&used_pageservers)?; - self.intent.attached = Some(node_id); + self.intent.set_attached(scheduler, Some(node_id)); used_pageservers.push(node_id); modified = true; } while self.intent.secondary.len() < secondary_count { let node_id = scheduler.schedule_shard(&used_pageservers)?; - self.intent.secondary.push(node_id); + self.intent.push_secondary(scheduler, node_id); used_pageservers.push(node_id); modified = true; } @@ -343,12 +404,12 @@ impl TenantState { Detached => { // Should have no attached or secondary pageservers if self.intent.attached.is_some() { - self.intent.attached = None; + self.intent.set_attached(scheduler, None); modified = true; } if !self.intent.secondary.is_empty() { - self.intent.secondary.clear(); + self.intent.clear_secondary(scheduler); modified = true; } } @@ -490,7 +551,7 @@ impl TenantState { tenant_shard_id: self.tenant_shard_id, shard: self.shard, generation: self.generation, - intent: self.intent.clone(), + intent: TargetState::from_intent(&self.intent), config: self.config.clone(), observed: self.observed.clone(), pageservers: pageservers.clone(), From 2f8a2681b87cb6104aa347b662738102eecaca59 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 19 Feb 2024 15:07:07 +0000 Subject: [PATCH 0211/1571] pageserver: ensure we never try to save empty delta layer (#6805) ## Problem Sharded tenants could panic during compaction when they try to generate an L1 delta layer for a region that contains no keys on a particular shard. This is a variant of https://github.com/neondatabase/neon/issues/6755, where we attempt to save a delta layer with no keys. It is harder to reproduce than the case of image layers fixed in https://github.com/neondatabase/neon/pull/6776. It will become even less likely once https://github.com/neondatabase/neon/pull/6778 tweaks keyspace generation, but even then, we should not rely on keyspace partitioning to guarantee at least one stored key in each partition. ## Summary of changes - Move construction of `writer` in `compact_level0_phase1`, so that we never leave a writer constructed but without any keys. 
--- pageserver/src/tenant/timeline.rs | 42 +++++++++++++++---------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index dcb00a1683..92e5b52c75 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3856,27 +3856,6 @@ impl Timeline { // Remember size of key value because at next iteration we will access next item key_values_total_size = next_key_size; } - if writer.is_none() { - // Create writer if not initiaized yet - writer = Some( - DeltaLayerWriter::new( - self.conf, - self.timeline_id, - self.tenant_shard_id, - key, - if dup_end_lsn.is_valid() { - // this is a layer containing slice of values of the same key - debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn); - dup_start_lsn..dup_end_lsn - } else { - debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end); - lsn_range.clone() - }, - ) - .await?, - ); - } - fail_point!("delta-layer-writer-fail-before-finish", |_| { Err(CompactionError::Other(anyhow::anyhow!( "failpoint delta-layer-writer-fail-before-finish" @@ -3884,6 +3863,27 @@ impl Timeline { }); if !self.shard_identity.is_key_disposable(&key) { + if writer.is_none() { + // Create writer if not initiaized yet + writer = Some( + DeltaLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + key, + if dup_end_lsn.is_valid() { + // this is a layer containing slice of values of the same key + debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn); + dup_start_lsn..dup_end_lsn + } else { + debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end); + lsn_range.clone() + }, + ) + .await?, + ); + } + writer.as_mut().unwrap().put_value(key, lsn, value).await?; } else { debug!( From e0c12faabda2877171ef80a661a4ba894cf665dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 19 Feb 2024 17:27:02 +0100 Subject: [PATCH 0212/1571] Allow initdb preservation for broken tenants (#6790) Often times the tenants we want to (WAL) DR are the ones which the pageserver marks as broken. Therefore, we should allow initdb preservation also for broken tenants. Fixes #6781. --- pageserver/src/http/routes.rs | 2 +- test_runner/fixtures/pageserver/utils.py | 26 ++++++++++++-- test_runner/regress/test_wal_restore.py | 43 +++++++++++++++++++++--- 3 files changed, 63 insertions(+), 8 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 107eed6801..175353762c 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -622,7 +622,7 @@ async fn timeline_preserve_initdb_handler( // location where timeline recreation cand find it. async { - let tenant = mgr::get_tenant(tenant_shard_id, true)?; + let tenant = mgr::get_tenant(tenant_shard_id, false)?; let timeline = tenant .get_timeline(timeline_id, false) diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index c2281ae25a..201a34f964 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -2,6 +2,7 @@ import time from typing import Any, Dict, List, Optional, Union from mypy_boto3_s3.type_defs import ( + DeleteObjectOutputTypeDef, EmptyResponseMetadataTypeDef, ListObjectsV2OutputTypeDef, ObjectTypeDef, @@ -331,7 +332,6 @@ def list_prefix( """ # For local_fs we need to properly handle empty directories, which we currently dont, so for simplicity stick to s3 api. 
assert isinstance(remote, S3Storage), "localfs is currently not supported" - assert remote.client is not None prefix_in_bucket = remote.prefix_in_bucket or "" if not prefix: @@ -350,6 +350,29 @@ def list_prefix( return response +def remote_storage_delete_key( + remote: RemoteStorage, + key: str, +) -> DeleteObjectOutputTypeDef: + """ + Note that this function takes into account prefix_in_bucket. + """ + # For local_fs we need to use a different implementation. As we don't need local_fs, just don't support it for now. + assert isinstance(remote, S3Storage), "localfs is currently not supported" + + prefix_in_bucket = remote.prefix_in_bucket or "" + + # real s3 tests have uniqie per test prefix + # mock_s3 tests use special pageserver prefix for pageserver stuff + key = "/".join((prefix_in_bucket, key)) + + response = remote.client.delete_object( + Bucket=remote.bucket_name, + Key=key, + ) + return response + + def enable_remote_storage_versioning( remote: RemoteStorage, ) -> EmptyResponseMetadataTypeDef: @@ -358,7 +381,6 @@ def enable_remote_storage_versioning( """ # local_fs has no support for versioning assert isinstance(remote, S3Storage), "localfs is currently not supported" - assert remote.client is not None # The SDK supports enabling versioning on normal S3 as well but we don't want to change # these settings from a test in a live bucket (also, our access isn't enough nor should it be) diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py index 97db857c74..083a259d85 100644 --- a/test_runner/regress/test_wal_restore.py +++ b/test_runner/regress/test_wal_restore.py @@ -2,6 +2,7 @@ import sys import tarfile import tempfile from pathlib import Path +from typing import List import pytest import zstandard @@ -11,10 +12,17 @@ from fixtures.neon_fixtures import ( PgBin, VanillaPostgres, ) -from fixtures.pageserver.utils import timeline_delete_wait_completed +from fixtures.pageserver.utils import ( + list_prefix, + remote_storage_delete_key, + timeline_delete_wait_completed, +) from fixtures.port_distributor import PortDistributor -from fixtures.remote_storage import LocalFsStorage +from fixtures.remote_storage import LocalFsStorage, S3Storage, s3_storage from fixtures.types import Lsn, TenantId, TimelineId +from mypy_boto3_s3.type_defs import ( + ObjectTypeDef, +) @pytest.mark.skipif( @@ -128,7 +136,11 @@ def test_wal_restore_initdb( assert restored.safe_psql("select count(*) from t", user="cloud_admin") == [(300000,)] -def test_wal_restore_http(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize("broken_tenant", [True, False]) +def test_wal_restore_http(neon_env_builder: NeonEnvBuilder, broken_tenant: bool): + remote_storage_kind = s3_storage() + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + env = neon_env_builder.init_start() endpoint = env.endpoints.create_start("main") endpoint.safe_psql("create table t as select generate_series(1,300000)") @@ -137,15 +149,36 @@ def test_wal_restore_http(neon_env_builder: NeonEnvBuilder): ps_client = env.pageserver.http_client() + if broken_tenant: + env.pageserver.allowed_errors.append( + r".* Changing Active tenant to Broken state, reason: broken from test" + ) + ps_client.tenant_break(tenant_id) + # Mark the initdb archive for preservation ps_client.timeline_preserve_initdb_archive(tenant_id, timeline_id) # shut down the endpoint and delete the timeline from the pageserver endpoint.stop() - assert isinstance(env.pageserver_remote_storage, LocalFsStorage) + assert 
isinstance(env.pageserver_remote_storage, S3Storage) - timeline_delete_wait_completed(ps_client, tenant_id, timeline_id) + if broken_tenant: + ps_client.tenant_detach(tenant_id) + objects: List[ObjectTypeDef] = list_prefix( + env.pageserver_remote_storage, f"tenants/{tenant_id}/timelines/{timeline_id}/" + ).get("Contents", []) + for obj in objects: + obj_key = obj["Key"] + if "initdb-preserved.tar.zst" in obj_key: + continue + log.info(f"Deleting key from remote storage: {obj_key}") + remote_storage_delete_key(env.pageserver_remote_storage, obj_key) + pass + + ps_client.tenant_attach(tenant_id, generation=10) + else: + timeline_delete_wait_completed(ps_client, tenant_id, timeline_id) # issue the restoration command ps_client.timeline_create( From 4f7704af245b80d2c2883b993d5c4920e53dbf70 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 19 Feb 2024 17:44:20 +0000 Subject: [PATCH 0213/1571] storage controller: fix spurious reconciles after pageserver restarts (#6814) ## Problem When investigating test failures (https://github.com/neondatabase/neon/issues/6813) I noticed we were doing a bunch of Reconciler runs right after splitting a tenant. It's because the splitting test does a pageserver restart, and there was a bug in /re-attach handling, where we would update the generation correctly in the database and intent state, but not observed state, thereby triggering a reconciliation on the next call to maybe_reconcile. This didn't break anything profound (underlying rules about generations were respected), but caused the storage controller to do an un-needed extra round of bumping the generation and reconciling. ## Summary of changes - Start adding metrics to the storage controller - Assert on the number of reconciles done in test_sharding_split_smoke - Fix /re-attach to update `observed` such that we don't spuriously re-reconcile tenants. 
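As an illustration of the `observed`-state part of the fix, here is a deliberately tiny, self-contained model (the struct and method names are invented for this sketch and are not the controller's real types): if re-attach bumps the generation in intent state but leaves the observed location's generation stale, the shard looks dirty to the next reconcile check even though the pageserver is already running with the new generation.

```rust
/// Hypothetical, heavily simplified model of the state involved here:
/// only the generation field of intent vs. observed is represented.
struct ShardState {
    intent_generation: u32,
    observed_generation: Option<u32>,
}

impl ShardState {
    /// In this toy model, a reconcile is needed whenever what we intend
    /// no longer matches what we last observed on the pageserver.
    fn needs_reconcile(&self) -> bool {
        self.observed_generation != Some(self.intent_generation)
    }

    /// Buggy variant: bump the generation we hand out, leave observed stale.
    fn reattach_without_observed_update(&mut self) -> u32 {
        self.intent_generation += 1;
        self.intent_generation
    }

    /// Fixed variant: the re-attach response tells us what the node will run
    /// with, so observed state can be brought up to date in the same step.
    fn reattach_with_observed_update(&mut self) -> u32 {
        self.intent_generation += 1;
        self.observed_generation = Some(self.intent_generation);
        self.intent_generation
    }
}

fn main() {
    let mut buggy = ShardState { intent_generation: 1, observed_generation: Some(1) };
    buggy.reattach_without_observed_update();
    // Stale observed state makes the next maybe_reconcile-style check fire spuriously.
    assert!(buggy.needs_reconcile());

    let mut fixed = ShardState { intent_generation: 1, observed_generation: Some(1) };
    fixed.reattach_with_observed_update();
    // Intent and observed agree, so no extra generation bump / reconcile round happens.
    assert!(!fixed.needs_reconcile());
}
```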
--- Cargo.lock | 1 + control_plane/attachment_service/Cargo.toml | 1 + control_plane/attachment_service/src/lib.rs | 1 + control_plane/attachment_service/src/main.rs | 3 ++ .../attachment_service/src/metrics.rs | 32 ++++++++++++ .../attachment_service/src/reconciler.rs | 5 ++ .../attachment_service/src/service.rs | 9 ++++ .../attachment_service/src/tenant_state.rs | 20 ++++++++ test_runner/fixtures/metrics.py | 51 +++++++++++++++++++ test_runner/fixtures/neon_fixtures.py | 8 ++- test_runner/fixtures/pageserver/http.py | 43 +--------------- test_runner/regress/test_sharding.py | 23 +++++++++ 12 files changed, 155 insertions(+), 42 deletions(-) create mode 100644 control_plane/attachment_service/src/metrics.rs diff --git a/Cargo.lock b/Cargo.lock index e7a0d8b965..f25e3d1574 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -286,6 +286,7 @@ dependencies = [ "git-version", "hyper", "metrics", + "once_cell", "pageserver_api", "pageserver_client", "postgres_connection", diff --git a/control_plane/attachment_service/Cargo.toml b/control_plane/attachment_service/Cargo.toml index ada35295f9..9e1c6377ee 100644 --- a/control_plane/attachment_service/Cargo.toml +++ b/control_plane/attachment_service/Cargo.toml @@ -18,6 +18,7 @@ clap.workspace = true futures.workspace = true git-version.workspace = true hyper.workspace = true +once_cell.workspace = true pageserver_api.workspace = true pageserver_client.workspace = true postgres_connection.workspace = true diff --git a/control_plane/attachment_service/src/lib.rs b/control_plane/attachment_service/src/lib.rs index 238efdf5a8..1a2b001392 100644 --- a/control_plane/attachment_service/src/lib.rs +++ b/control_plane/attachment_service/src/lib.rs @@ -3,6 +3,7 @@ use utils::seqwait::MonotonicCounter; mod compute_hook; pub mod http; +pub mod metrics; mod node; pub mod persistence; mod reconciler; diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs index b323ae8820..db4f00644f 100644 --- a/control_plane/attachment_service/src/main.rs +++ b/control_plane/attachment_service/src/main.rs @@ -6,6 +6,7 @@ /// use anyhow::{anyhow, Context}; use attachment_service::http::make_router; +use attachment_service::metrics::preinitialize_metrics; use attachment_service::persistence::Persistence; use attachment_service::service::{Config, Service}; use aws_config::{self, BehaviorVersion, Region}; @@ -205,6 +206,8 @@ async fn async_main() -> anyhow::Result<()> { logging::Output::Stdout, )?; + preinitialize_metrics(); + let args = Cli::parse(); tracing::info!( "version: {}, launch_timestamp: {}, build_tag {}, state at {}, listening on {}", diff --git a/control_plane/attachment_service/src/metrics.rs b/control_plane/attachment_service/src/metrics.rs new file mode 100644 index 0000000000..ffe093b9c8 --- /dev/null +++ b/control_plane/attachment_service/src/metrics.rs @@ -0,0 +1,32 @@ +use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec}; +use once_cell::sync::Lazy; + +pub(crate) struct ReconcilerMetrics { + pub(crate) spawned: IntCounter, + pub(crate) complete: IntCounterVec, +} + +impl ReconcilerMetrics { + // Labels used on [`Self::complete`] + pub(crate) const SUCCESS: &'static str = "ok"; + pub(crate) const ERROR: &'static str = "success"; + pub(crate) const CANCEL: &'static str = "cancel"; +} + +pub(crate) static RECONCILER: Lazy = Lazy::new(|| ReconcilerMetrics { + spawned: register_int_counter!( + "storage_controller_reconcile_spawn", + "Count of how many times we spawn a reconcile task", + ) + 
.expect("failed to define a metric"), + complete: register_int_counter_vec!( + "storage_controller_reconcile_complete", + "Reconciler tasks completed, broken down by success/failure/cancelled", + &["status"], + ) + .expect("failed to define a metric"), +}); + +pub fn preinitialize_metrics() { + Lazy::force(&RECONCILER); +} diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs index e765dfc2ae..cdd6f76b14 100644 --- a/control_plane/attachment_service/src/reconciler.rs +++ b/control_plane/attachment_service/src/reconciler.rs @@ -92,6 +92,8 @@ impl TargetState { pub(crate) enum ReconcileError { #[error(transparent)] Notify(#[from] NotifyError), + #[error("Cancelled")] + Cancel, #[error(transparent)] Other(#[from] anyhow::Error), } @@ -497,6 +499,9 @@ impl Reconciler { } for (node_id, conf) in changes { + if self.cancel.is_cancelled() { + return Err(ReconcileError::Cancel); + } self.location_config(node_id, conf, None).await?; } diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index 097b4a1a47..b1e66ebdad 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -807,6 +807,15 @@ impl Service { }; shard_state.generation = std::cmp::max(shard_state.generation, new_gen); + if let Some(observed) = shard_state + .observed + .locations + .get_mut(&reattach_req.node_id) + { + if let Some(conf) = observed.conf.as_mut() { + conf.generation = new_gen.into(); + } + } // TODO: cancel/restart any running reconciliation for this tenant, it might be trying // to call location_conf API with an old generation. Wait for cancellation to complete diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs index 1a68864091..b0ddb83f06 100644 --- a/control_plane/attachment_service/src/tenant_state.rs +++ b/control_plane/attachment_service/src/tenant_state.rs @@ -1,5 +1,6 @@ use std::{collections::HashMap, sync::Arc, time::Duration}; +use crate::metrics; use control_plane::attachment_service::NodeAvailability; use pageserver_api::{ models::{LocationConfig, LocationConfigMode, TenantConfig}, @@ -570,6 +571,7 @@ impl TenantState { let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq, tenant_id=%reconciler.tenant_shard_id.tenant_id, shard_id=%reconciler.tenant_shard_id.shard_slug()); + metrics::RECONCILER.spawned.inc(); let join_handle = tokio::task::spawn( async move { // Wait for any previous reconcile task to complete before we start @@ -586,6 +588,10 @@ impl TenantState { // TODO: wrap all remote API operations in cancellation check // as well. 
if reconciler.cancel.is_cancelled() { + metrics::RECONCILER + .complete + .with_label_values(&[metrics::ReconcilerMetrics::CANCEL]) + .inc(); return; } @@ -599,6 +605,20 @@ impl TenantState { reconciler.compute_notify().await.ok(); } + // Update result counter + match &result { + Ok(_) => metrics::RECONCILER + .complete + .with_label_values(&[metrics::ReconcilerMetrics::SUCCESS]), + Err(ReconcileError::Cancel) => metrics::RECONCILER + .complete + .with_label_values(&[metrics::ReconcilerMetrics::CANCEL]), + Err(_) => metrics::RECONCILER + .complete + .with_label_values(&[metrics::ReconcilerMetrics::ERROR]), + } + .inc(); + result_tx .send(ReconcileResult { sequence: reconcile_seq, diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 418370c3ab..f433db2167 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -4,6 +4,8 @@ from typing import Dict, List, Optional, Tuple from prometheus_client.parser import text_string_to_metric_families from prometheus_client.samples import Sample +from fixtures.log_helper import log + class Metrics: metrics: Dict[str, List[Sample]] @@ -31,6 +33,55 @@ class Metrics: return res[0] +class MetricsGetter: + """ + Mixin for types that implement a `get_metrics` function and would like associated + helpers for querying the metrics + """ + + def get_metrics(self) -> Metrics: + raise NotImplementedError() + + def get_metric_value( + self, name: str, filter: Optional[Dict[str, str]] = None + ) -> Optional[float]: + metrics = self.get_metrics() + results = metrics.query_all(name, filter=filter) + if not results: + log.info(f'could not find metric "{name}"') + return None + assert len(results) == 1, f"metric {name} with given filters is not unique, got: {results}" + return results[0].value + + def get_metrics_values( + self, names: list[str], filter: Optional[Dict[str, str]] = None + ) -> Dict[str, float]: + """ + When fetching multiple named metrics, it is more efficient to use this + than to call `get_metric_value` repeatedly. + + Throws RuntimeError if no metrics matching `names` are found, or if + not all of `names` are found: this method is intended for loading sets + of metrics whose existence is coupled. 
+ """ + metrics = self.get_metrics() + samples = [] + for name in names: + samples.extend(metrics.query_all(name, filter=filter)) + + result = {} + for sample in samples: + if sample.name in result: + raise RuntimeError(f"Multiple values found for {sample.name}") + result[sample.name] = sample.value + + if len(result) != len(names): + log.info(f"Metrics found: {metrics.metrics}") + raise RuntimeError(f"could not find all metrics {' '.join(names)}") + + return result + + def parse_metrics(text: str, name: str = "") -> Metrics: metrics = Metrics(name) gen = text_string_to_metric_families(text) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 04af73c327..b347ff44e9 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -46,6 +46,7 @@ from urllib3.util.retry import Retry from fixtures import overlayfs from fixtures.broker import NeonBroker from fixtures.log_helper import log +from fixtures.metrics import Metrics, MetricsGetter, parse_metrics from fixtures.pageserver.allowed_errors import ( DEFAULT_PAGESERVER_ALLOWED_ERRORS, scan_pageserver_log_for_errors, @@ -1913,7 +1914,7 @@ class Pagectl(AbstractNeonCli): return IndexPartDump.from_json(parsed) -class NeonAttachmentService: +class NeonAttachmentService(MetricsGetter): def __init__(self, env: NeonEnv, auth_enabled: bool): self.env = env self.running = False @@ -1951,6 +1952,11 @@ class NeonAttachmentService: return headers + def get_metrics(self) -> Metrics: + res = self.request("GET", f"{self.env.attachment_service_api}/metrics") + res.raise_for_status() + return parse_metrics(res.text) + def ready(self) -> bool: resp = self.request("GET", f"{self.env.attachment_service_api}/ready") if resp.status_code == 503: diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index adea9ca764..6af3b6a912 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -12,7 +12,7 @@ from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry from fixtures.log_helper import log -from fixtures.metrics import Metrics, parse_metrics +from fixtures.metrics import Metrics, MetricsGetter, parse_metrics from fixtures.pg_version import PgVersion from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.utils import Fn @@ -125,7 +125,7 @@ class TenantConfig: ) -class PageserverHttpClient(requests.Session): +class PageserverHttpClient(requests.Session, MetricsGetter): def __init__( self, port: int, @@ -721,45 +721,6 @@ class PageserverHttpClient(requests.Session): assert len(matches) < 2, "above filter should uniquely identify metric" return value - def get_metric_value( - self, name: str, filter: Optional[Dict[str, str]] = None - ) -> Optional[float]: - metrics = self.get_metrics() - results = metrics.query_all(name, filter=filter) - if not results: - log.info(f'could not find metric "{name}"') - return None - assert len(results) == 1, f"metric {name} with given filters is not unique, got: {results}" - return results[0].value - - def get_metrics_values( - self, names: list[str], filter: Optional[Dict[str, str]] = None - ) -> Dict[str, float]: - """ - When fetching multiple named metrics, it is more efficient to use this - than to call `get_metric_value` repeatedly. - - Throws RuntimeError if no metrics matching `names` are found, or if - not all of `names` are found: this method is intended for loading sets - of metrics whose existence is coupled. 
- """ - metrics = self.get_metrics() - samples = [] - for name in names: - samples.extend(metrics.query_all(name, filter=filter)) - - result = {} - for sample in samples: - if sample.name in result: - raise RuntimeError(f"Multiple values found for {sample.name}") - result[sample.name] = sample.value - - if len(result) != len(names): - log.info(f"Metrics found: {metrics.metrics}") - raise RuntimeError(f"could not find all metrics {' '.join(names)}") - - return result - def layer_map_info( self, tenant_id: Union[TenantId, TenantShardId], diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index fcf4b9f72a..5676727a2e 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -255,3 +255,26 @@ def test_sharding_split_smoke( env.neon_cli.tenant_migrate(migrate_shard, destination, timeout_secs=10) workload.validate() + + # Check that we didn't do any spurious reconciliations. + # Total number of reconciles should have been one per original shard, plus + # one for each shard that was migrated. + reconcile_ok = env.attachment_service.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "ok"} + ) + assert reconcile_ok == shard_count + split_shard_count // 2 + + # Check that no cancelled or errored reconciliations occurred: this test does no + # failure injection and should run clean. + assert ( + env.attachment_service.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "cancel"} + ) + is None + ) + assert ( + env.attachment_service.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "error"} + ) + is None + ) From 0c105ef3529562214aaba9a7ca9006977ea3e9c0 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 19 Feb 2024 20:29:23 +0000 Subject: [PATCH 0214/1571] storage controller: debug observability endpoints and self-test (#6820) This PR stacks on https://github.com/neondatabase/neon/pull/6814 Observability: - Because we only persist a subset of our state, and our external API is pretty high level, it can be hard to get at the detail of what's going on internally (e.g. the IntentState of a shard). - Add debug endpoints for getting a full dump of all TenantState and SchedulerNode objects - Enrich the /control/v1/node listing endpoint to include full in-memory detail of `Node` rather than just the `NodePersistence` subset Consistency checks: - The storage controller maintains separate in-memory and on-disk states, by design. To catch subtle bugs, it is useful to occasionally cross-check these. - The Scheduler maintains reference counts for shard->node relationships, which could drift if there was a bug in IntentState: exhausively cross check them in tests. 
--- control_plane/attachment_service/src/http.rs | 23 +++ control_plane/attachment_service/src/lib.rs | 4 +- control_plane/attachment_service/src/node.rs | 9 +- .../attachment_service/src/persistence.rs | 2 +- .../attachment_service/src/scheduler.rs | 87 ++++++++++- .../attachment_service/src/service.rs | 135 ++++++++++++++++-- .../attachment_service/src/tenant_state.rs | 41 +++++- control_plane/src/attachment_service.rs | 4 +- test_runner/fixtures/neon_fixtures.py | 11 ++ test_runner/regress/test_sharding.py | 6 + test_runner/regress/test_sharding_service.py | 54 ++++++- 11 files changed, 346 insertions(+), 30 deletions(-) diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs index 38785d3a98..d6c8fa084b 100644 --- a/control_plane/attachment_service/src/http.rs +++ b/control_plane/attachment_service/src/http.rs @@ -333,6 +333,22 @@ async fn handle_tenant_drop(req: Request) -> Result, ApiErr json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?) } +async fn handle_tenants_dump(req: Request) -> Result, ApiError> { + let state = get_state(&req); + state.service.tenants_dump() +} + +async fn handle_scheduler_dump(req: Request) -> Result, ApiError> { + let state = get_state(&req); + state.service.scheduler_dump() +} + +async fn handle_consistency_check(req: Request) -> Result, ApiError> { + let state = get_state(&req); + + json_response(StatusCode::OK, state.service.consistency_check().await?) +} + /// Status endpoint is just used for checking that our HTTP listener is up async fn handle_status(_req: Request) -> Result, ApiError> { json_response(StatusCode::OK, ()) @@ -421,6 +437,13 @@ pub fn make_router( .post("/debug/v1/node/:node_id/drop", |r| { request_span(r, handle_node_drop) }) + .get("/debug/v1/tenant", |r| request_span(r, handle_tenants_dump)) + .get("/debug/v1/scheduler", |r| { + request_span(r, handle_scheduler_dump) + }) + .post("/debug/v1/consistency_check", |r| { + request_span(r, handle_consistency_check) + }) .get("/control/v1/tenant/:tenant_id/locate", |r| { tenant_service_handler(r, handle_tenant_locate) }) diff --git a/control_plane/attachment_service/src/lib.rs b/control_plane/attachment_service/src/lib.rs index 1a2b001392..e950a57e57 100644 --- a/control_plane/attachment_service/src/lib.rs +++ b/control_plane/attachment_service/src/lib.rs @@ -12,7 +12,7 @@ mod schema; pub mod service; mod tenant_state; -#[derive(Clone, Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize, Debug)] enum PlacementPolicy { /// Cheapest way to attach a tenant: just one pageserver, no secondary Single, @@ -23,7 +23,7 @@ enum PlacementPolicy { Detached, } -#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone)] +#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone, Serialize)] struct Sequence(u64); impl Sequence { diff --git a/control_plane/attachment_service/src/node.rs b/control_plane/attachment_service/src/node.rs index 47f61702d8..59784249d7 100644 --- a/control_plane/attachment_service/src/node.rs +++ b/control_plane/attachment_service/src/node.rs @@ -1,9 +1,16 @@ use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy}; +use serde::Serialize; use utils::id::NodeId; use crate::persistence::NodePersistence; -#[derive(Clone)] +/// Represents the in-memory description of a Node. +/// +/// Scheduling statistics are maintened separately in [`crate::scheduler`]. 
+/// +/// The persistent subset of the Node is defined in [`crate::persistence::NodePersistence`]: the +/// implementation of serialization on this type is only for debug dumps. +#[derive(Clone, Serialize, Eq, PartialEq)] pub(crate) struct Node { pub(crate) id: NodeId, diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs index c5829cae88..2d0c8a9d15 100644 --- a/control_plane/attachment_service/src/persistence.rs +++ b/control_plane/attachment_service/src/persistence.rs @@ -477,7 +477,7 @@ impl Persistence { } /// Parts of [`crate::tenant_state::TenantState`] that are stored durably -#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone)] +#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)] #[diesel(table_name = crate::schema::tenant_shards)] pub(crate) struct TenantShardPersistence { #[serde(default)] diff --git a/control_plane/attachment_service/src/scheduler.rs b/control_plane/attachment_service/src/scheduler.rs index 7a99118312..39d8d0a260 100644 --- a/control_plane/attachment_service/src/scheduler.rs +++ b/control_plane/attachment_service/src/scheduler.rs @@ -1,4 +1,5 @@ -use crate::node::Node; +use crate::{node::Node, tenant_state::TenantState}; +use serde::Serialize; use std::collections::HashMap; use utils::{http::error::ApiError, id::NodeId}; @@ -17,6 +18,7 @@ impl From for ApiError { } } +#[derive(Serialize, Eq, PartialEq)] struct SchedulerNode { /// How many shards are currently scheduled on this node, via their [`crate::tenant_state::IntentState`]. shard_count: usize, @@ -26,6 +28,12 @@ struct SchedulerNode { may_schedule: bool, } +/// This type is responsible for selecting which node is used when a tenant shard needs to choose a pageserver +/// on which to run. +/// +/// The type has no persistent state of its own: this is all populated at startup. The Serialize +/// impl is only for debug dumps. +#[derive(Serialize)] pub(crate) struct Scheduler { nodes: HashMap, } @@ -48,6 +56,77 @@ impl Scheduler { } } + /// For debug/support: check that our internal statistics are in sync with the state of + /// the nodes & tenant shards. + /// + /// If anything is inconsistent, log details and return an error. 
+ pub(crate) fn consistency_check<'a>( + &self, + nodes: impl Iterator, + shards: impl Iterator, + ) -> anyhow::Result<()> { + let mut expect_nodes: HashMap = HashMap::new(); + for node in nodes { + expect_nodes.insert( + node.id, + SchedulerNode { + shard_count: 0, + may_schedule: node.may_schedule(), + }, + ); + } + + for shard in shards { + if let Some(node_id) = shard.intent.get_attached() { + match expect_nodes.get_mut(node_id) { + Some(node) => node.shard_count += 1, + None => anyhow::bail!( + "Tenant {} references nonexistent node {}", + shard.tenant_shard_id, + node_id + ), + } + } + + for node_id in shard.intent.get_secondary() { + match expect_nodes.get_mut(node_id) { + Some(node) => node.shard_count += 1, + None => anyhow::bail!( + "Tenant {} references nonexistent node {}", + shard.tenant_shard_id, + node_id + ), + } + } + } + + for (node_id, expect_node) in &expect_nodes { + let Some(self_node) = self.nodes.get(node_id) else { + anyhow::bail!("Node {node_id} not found in Self") + }; + + if self_node != expect_node { + tracing::error!("Inconsistency detected in scheduling state for node {node_id}"); + tracing::error!("Expected state: {}", serde_json::to_string(expect_node)?); + tracing::error!("Self state: {}", serde_json::to_string(self_node)?); + + anyhow::bail!("Inconsistent state on {node_id}"); + } + } + + if expect_nodes.len() != self.nodes.len() { + // We just checked that all the expected nodes are present. If the lengths don't match, + // it means that we have nodes in Self that are unexpected. + for node_id in self.nodes.keys() { + if !expect_nodes.contains_key(node_id) { + anyhow::bail!("Node {node_id} found in Self but not in expected nodes"); + } + } + } + + Ok(()) + } + /// Increment the reference count of a node. This reference count is used to guide scheduling /// decisions, not for memory management: it represents one tenant shard whose IntentState targets /// this node. @@ -90,6 +169,12 @@ impl Scheduler { } } + pub(crate) fn node_remove(&mut self, node_id: NodeId) { + if self.nodes.remove(&node_id).is_none() { + tracing::warn!(node_id=%node_id, "Removed non-existent node from scheduler"); + } + } + pub(crate) fn schedule_shard( &mut self, hard_exclude: &[NodeId], diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index b1e66ebdad..0fe758e731 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -6,6 +6,7 @@ use std::{ time::{Duration, Instant}, }; +use anyhow::Context; use control_plane::attachment_service::{ AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse, NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, TenantCreateResponse, @@ -44,10 +45,7 @@ use utils::{ use crate::{ compute_hook::{self, ComputeHook}, node::Node, - persistence::{ - split_state::SplitState, DatabaseError, NodePersistence, Persistence, - TenantShardPersistence, - }, + persistence::{split_state::SplitState, DatabaseError, Persistence, TenantShardPersistence}, reconciler::attached_location_conf, scheduler::Scheduler, tenant_state::{ @@ -505,7 +503,9 @@ impl Service { // after when pageservers start up and register. 
let mut node_ids = HashSet::new(); for tsp in &tenant_shard_persistence { - node_ids.insert(tsp.generation_pageserver); + if tsp.generation_pageserver != i64::MAX { + node_ids.insert(tsp.generation_pageserver); + } } for node_id in node_ids { tracing::info!("Creating node {} in scheduler for tests", node_id); @@ -1460,6 +1460,11 @@ impl Service { // TODO: should use the ID last published to compute_hook, rather than the intent: the intent might // point to somewhere we haven't attached yet. let Some(node_id) = shard.intent.get_attached() else { + tracing::warn!( + tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), + "Shard not scheduled (policy {:?}), cannot generate pass-through URL", + shard.policy + ); return Err(ApiError::Conflict( "Cannot call timeline API on non-attached tenant".to_string(), )); @@ -1972,6 +1977,104 @@ impl Service { Ok(()) } + /// For debug/support: a full JSON dump of TenantStates. Returns a response so that + /// we don't have to make TenantState clonable in the return path. + pub(crate) fn tenants_dump(&self) -> Result, ApiError> { + let serialized = { + let locked = self.inner.read().unwrap(); + let result = locked.tenants.values().collect::>(); + serde_json::to_string(&result).map_err(|e| ApiError::InternalServerError(e.into()))? + }; + + hyper::Response::builder() + .status(hyper::StatusCode::OK) + .header(hyper::header::CONTENT_TYPE, "application/json") + .body(hyper::Body::from(serialized)) + .map_err(|e| ApiError::InternalServerError(e.into())) + } + + /// Check the consistency of in-memory state vs. persistent state, and check that the + /// scheduler's statistics are up to date. + /// + /// These consistency checks expect an **idle** system. If changes are going on while + /// we run, then we can falsely indicate a consistency issue. This is sufficient for end-of-test + /// checks, but not suitable for running continuously in the background in the field. + pub(crate) async fn consistency_check(&self) -> Result<(), ApiError> { + let (mut expect_nodes, mut expect_shards) = { + let locked = self.inner.read().unwrap(); + + locked + .scheduler + .consistency_check(locked.nodes.values(), locked.tenants.values()) + .context("Scheduler checks") + .map_err(ApiError::InternalServerError)?; + + let expect_nodes = locked.nodes.values().cloned().collect::>(); + + let expect_shards = locked + .tenants + .values() + .map(|t| t.to_persistent()) + .collect::>(); + + (expect_nodes, expect_shards) + }; + + let mut nodes = self.persistence.list_nodes().await?; + expect_nodes.sort_by_key(|n| n.id); + nodes.sort_by_key(|n| n.id); + + if nodes != expect_nodes { + tracing::error!("Consistency check failed on nodes."); + tracing::error!( + "Nodes in memory: {}", + serde_json::to_string(&expect_nodes) + .map_err(|e| ApiError::InternalServerError(e.into()))? + ); + tracing::error!( + "Nodes in database: {}", + serde_json::to_string(&nodes) + .map_err(|e| ApiError::InternalServerError(e.into()))? + ); + } + + let mut shards = self.persistence.list_tenant_shards().await?; + shards.sort_by_key(|tsp| (tsp.tenant_id.clone(), tsp.shard_number, tsp.shard_count)); + expect_shards.sort_by_key(|tsp| (tsp.tenant_id.clone(), tsp.shard_number, tsp.shard_count)); + + if shards != expect_shards { + tracing::error!("Consistency check failed on shards."); + tracing::error!( + "Shards in memory: {}", + serde_json::to_string(&expect_nodes) + .map_err(|e| ApiError::InternalServerError(e.into()))? 
+ ); + tracing::error!( + "Shards in database: {}", + serde_json::to_string(&nodes) + .map_err(|e| ApiError::InternalServerError(e.into()))? + ); + } + + Ok(()) + } + + /// For debug/support: a JSON dump of the [`Scheduler`]. Returns a response so that + /// we don't have to make TenantState clonable in the return path. + pub(crate) fn scheduler_dump(&self) -> Result, ApiError> { + let serialized = { + let locked = self.inner.read().unwrap(); + serde_json::to_string(&locked.scheduler) + .map_err(|e| ApiError::InternalServerError(e.into()))? + }; + + hyper::Response::builder() + .status(hyper::StatusCode::OK) + .header(hyper::header::CONTENT_TYPE, "application/json") + .body(hyper::Body::from(serialized)) + .map_err(|e| ApiError::InternalServerError(e.into())) + } + /// This is for debug/support only: we simply drop all state for a tenant, without /// detaching or deleting it on pageservers. We do not try and re-schedule any /// tenants that were on this node. @@ -1990,19 +2093,21 @@ impl Service { nodes.remove(&node_id); locked.nodes = Arc::new(nodes); + locked.scheduler.node_remove(node_id); + Ok(()) } - pub(crate) async fn node_list(&self) -> Result, ApiError> { - // It is convenient to avoid taking the big lock and converting Node to a serializable - // structure, by fetching from storage instead of reading in-memory state. - let nodes = self - .persistence - .list_nodes() - .await? - .into_iter() - .map(|n| n.to_persistent()) - .collect(); + pub(crate) async fn node_list(&self) -> Result, ApiError> { + let nodes = { + self.inner + .read() + .unwrap() + .nodes + .values() + .cloned() + .collect::>() + }; Ok(nodes) } diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs index b0ddb83f06..4ec6fdca67 100644 --- a/control_plane/attachment_service/src/tenant_state.rs +++ b/control_plane/attachment_service/src/tenant_state.rs @@ -1,11 +1,12 @@ use std::{collections::HashMap, sync::Arc, time::Duration}; -use crate::metrics; +use crate::{metrics, persistence::TenantShardPersistence}; use control_plane::attachment_service::NodeAvailability; use pageserver_api::{ models::{LocationConfig, LocationConfigMode, TenantConfig}, shard::{ShardIdentity, TenantShardId}, }; +use serde::Serialize; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::{instrument, Instrument}; @@ -27,6 +28,20 @@ use crate::{ service, PlacementPolicy, Sequence, }; +/// Serialization helper +fn read_mutex_content(v: &std::sync::Mutex, serializer: S) -> Result +where + S: serde::ser::Serializer, + T: Clone + std::fmt::Display, +{ + serializer.collect_str(&v.lock().unwrap()) +} + +/// In-memory state for a particular tenant shard. +/// +/// This struct implement Serialize for debugging purposes, but is _not_ persisted +/// itself: see [`crate::persistence`] for the subset of tenant shard state that is persisted. +#[derive(Serialize)] pub(crate) struct TenantState { pub(crate) tenant_shard_id: TenantShardId, @@ -61,6 +76,7 @@ pub(crate) struct TenantState { /// If a reconcile task is currently in flight, it may be joined here (it is /// only safe to join if either the result has been received or the reconciler's /// cancellation token has been fired) + #[serde(skip)] pub(crate) reconciler: Option, /// If a tenant is being split, then all shards with that TenantId will have a @@ -70,16 +86,19 @@ pub(crate) struct TenantState { /// Optionally wait for reconciliation to complete up to a particular /// sequence number. 
+ #[serde(skip)] pub(crate) waiter: std::sync::Arc>, /// Indicates sequence number for which we have encountered an error reconciling. If /// this advances ahead of [`Self::waiter`] then a reconciliation error has occurred, /// and callers should stop waiting for `waiter` and propagate the error. + #[serde(skip)] pub(crate) error_waiter: std::sync::Arc>, /// The most recent error from a reconcile on this tenant /// TODO: generalize to an array of recent events /// TOOD: use a ArcSwap instead of mutex for faster reads? + #[serde(serialize_with = "read_mutex_content")] pub(crate) last_error: std::sync::Arc>, /// If we have a pending compute notification that for some reason we weren't able to send, @@ -89,7 +108,7 @@ pub(crate) struct TenantState { pub(crate) pending_compute_notification: bool, } -#[derive(Default, Clone, Debug)] +#[derive(Default, Clone, Debug, Serialize)] pub(crate) struct IntentState { attached: Option, secondary: Vec, @@ -194,7 +213,7 @@ impl Drop for IntentState { } } -#[derive(Default, Clone)] +#[derive(Default, Clone, Serialize)] pub(crate) struct ObservedState { pub(crate) locations: HashMap, } @@ -208,7 +227,7 @@ pub(crate) struct ObservedState { /// what it is (e.g. we failed partway through configuring it) /// * Instance exists with conf==Some: this tells us what we last successfully configured on this node, /// and that configuration will still be present unless something external interfered. -#[derive(Clone)] +#[derive(Clone, Serialize)] pub(crate) struct ObservedStateLocation { /// If None, it means we do not know the status of this shard's location on this node, but /// we know that we might have some state on this node. @@ -661,4 +680,18 @@ impl TenantState { debug_assert!(!self.intent.all_pageservers().contains(&node_id)); } + + pub(crate) fn to_persistent(&self) -> TenantShardPersistence { + TenantShardPersistence { + tenant_id: self.tenant_shard_id.tenant_id.to_string(), + shard_number: self.tenant_shard_id.shard_number.0 as i32, + shard_count: self.tenant_shard_id.shard_count.literal() as i32, + shard_stripe_size: self.shard.stripe_size.0 as i32, + generation: self.generation.into().unwrap_or(0) as i32, + generation_pageserver: i64::MAX, + placement_policy: serde_json::to_string(&self.policy).unwrap(), + config: serde_json::to_string(&self.config).unwrap(), + splitting: SplitState::default(), + } + } } diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/attachment_service.rs index 14bfda47c3..4a1d316fe7 100644 --- a/control_plane/src/attachment_service.rs +++ b/control_plane/src/attachment_service.rs @@ -113,7 +113,7 @@ pub struct TenantShardMigrateRequest { pub node_id: NodeId, } -#[derive(Serialize, Deserialize, Clone, Copy)] +#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)] pub enum NodeAvailability { // Normal, happy state Active, @@ -137,7 +137,7 @@ impl FromStr for NodeAvailability { /// FIXME: this is a duplicate of the type in the attachment_service crate, because the /// type needs to be defined with diesel traits in there. 
-#[derive(Serialize, Deserialize, Clone, Copy)] +#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)] pub enum NodeSchedulingPolicy { Active, Filling, diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b347ff44e9..cbf6e0e4de 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2100,6 +2100,17 @@ class NeonAttachmentService(MetricsGetter): log.info(f"Migrated tenant {tenant_shard_id} to pageserver {dest_ps_id}") assert self.env.get_tenant_pageserver(tenant_shard_id).id == dest_ps_id + def consistency_check(self): + """ + Throw an exception if the service finds any inconsistencies in its state + """ + response = self.request( + "POST", + f"{self.env.attachment_service_api}/debug/v1/consistency_check", + ) + response.raise_for_status() + log.info("Attachment service passed consistency check") + def __enter__(self) -> "NeonAttachmentService": return self diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 5676727a2e..99b2ceb8bc 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -83,6 +83,8 @@ def test_sharding_smoke( ) assert timelines == {env.initial_timeline, timeline_b} + env.attachment_service.consistency_check() + def test_sharding_split_unsharded( neon_env_builder: NeonEnvBuilder, @@ -113,6 +115,8 @@ def test_sharding_split_unsharded( workload.validate() + env.attachment_service.consistency_check() + def test_sharding_split_smoke( neon_env_builder: NeonEnvBuilder, @@ -278,3 +282,5 @@ def test_sharding_split_smoke( ) is None ) + + env.attachment_service.consistency_check() diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py index 248d992851..d2334c7776 100644 --- a/test_runner/regress/test_sharding_service.py +++ b/test_runner/regress/test_sharding_service.py @@ -51,13 +51,13 @@ def test_sharding_service_smoke( # The pageservers we started should have registered with the sharding service on startup nodes = env.attachment_service.node_list() assert len(nodes) == 2 - assert set(n["node_id"] for n in nodes) == {env.pageservers[0].id, env.pageservers[1].id} + assert set(n["id"] for n in nodes) == {env.pageservers[0].id, env.pageservers[1].id} # Starting an additional pageserver should register successfully env.pageservers[2].start() nodes = env.attachment_service.node_list() assert len(nodes) == 3 - assert set(n["node_id"] for n in nodes) == {ps.id for ps in env.pageservers} + assert set(n["id"] for n in nodes) == {ps.id for ps in env.pageservers} # Use a multiple of pageservers to get nice even number of shards on each one tenant_shard_count = len(env.pageservers) * 4 @@ -127,6 +127,8 @@ def test_sharding_service_smoke( assert counts[env.pageservers[0].id] == tenant_shard_count // 2 assert counts[env.pageservers[2].id] == tenant_shard_count // 2 + env.attachment_service.consistency_check() + def test_node_status_after_restart( neon_env_builder: NeonEnvBuilder, @@ -159,6 +161,8 @@ def test_node_status_after_restart( # should have had its availabilty state set to Active. 
env.attachment_service.tenant_create(TenantId.generate()) + env.attachment_service.consistency_check() + def test_sharding_service_passthrough( neon_env_builder: NeonEnvBuilder, @@ -184,6 +188,8 @@ def test_sharding_service_passthrough( } assert status["state"]["slug"] == "Active" + env.attachment_service.consistency_check() + def test_sharding_service_restart(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() @@ -216,6 +222,8 @@ def test_sharding_service_restart(neon_env_builder: NeonEnvBuilder): assert tenant_a not in observed assert tenant_b in observed + env.attachment_service.consistency_check() + def test_sharding_service_onboarding( neon_env_builder: NeonEnvBuilder, @@ -318,6 +326,8 @@ def test_sharding_service_onboarding( dest_ps.stop() dest_ps.start() + env.attachment_service.consistency_check() + def test_sharding_service_compute_hook( httpserver: HTTPServer, @@ -388,6 +398,8 @@ def test_sharding_service_compute_hook( wait_until(10, 1, received_restart_notification) + env.attachment_service.consistency_check() + def test_sharding_service_debug_apis(neon_env_builder: NeonEnvBuilder): """ @@ -401,13 +413,47 @@ def test_sharding_service_debug_apis(neon_env_builder: NeonEnvBuilder): tenant_id = TenantId.generate() env.attachment_service.tenant_create(tenant_id, shard_count=2, shard_stripe_size=8192) + # Check that the consistency check passes on a freshly setup system + env.attachment_service.consistency_check() + # These APIs are intentionally not implemented as methods on NeonAttachmentService, as # they're just for use in unanticipated circumstances. - env.attachment_service.request( + + # Initial tenant (1 shard) and the one we just created (2 shards) should be visible + response = env.attachment_service.request( + "GET", f"{env.attachment_service_api}/debug/v1/tenant" + ) + response.raise_for_status() + assert len(response.json()) == 3 + + # Scheduler should report the expected nodes and shard counts + response = env.attachment_service.request( + "GET", f"{env.attachment_service_api}/debug/v1/scheduler" + ) + response.raise_for_status() + # Two nodes, in a dict of node_id->node + assert len(response.json()["nodes"]) == 2 + assert sum(v["shard_count"] for v in response.json()["nodes"].values()) == 3 + assert all(v["may_schedule"] for v in response.json()["nodes"].values()) + + response = env.attachment_service.request( "POST", f"{env.attachment_service_api}/debug/v1/node/{env.pageservers[1].id}/drop" ) + response.raise_for_status() assert len(env.attachment_service.node_list()) == 1 - env.attachment_service.request( + response = env.attachment_service.request( "POST", f"{env.attachment_service_api}/debug/v1/tenant/{tenant_id}/drop" ) + response.raise_for_status() + + # Tenant drop should be reflected in dump output + response = env.attachment_service.request( + "GET", f"{env.attachment_service_api}/debug/v1/tenant" + ) + response.raise_for_status() + assert len(response.json()) == 1 + + # Check that the 'drop' APIs didn't leave things in a state that would fail a consistency check: they're + # meant to be unclean wrt the pageserver state, but not leave a broken storage controller behind. + env.attachment_service.consistency_check() From feb359b45924252da4eb4863a6c92d970ab46958 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 19 Feb 2024 21:46:22 +0000 Subject: [PATCH 0215/1571] CI: Update deprecated GitHub Actions (#6822) ## Problem We use a bunch of deprecated actions. 
See https://github.com/neondatabase/neon/actions/runs/7958569728 (Annotations section) ``` Node.js 16 actions are deprecated. Please update the following actions to use Node.js 20: actions/checkout@v3, actions/setup-java@v3, actions/cache@v3, actions/github-script@v6. For more information see: https://github.blog/changelog/2023-09-22-github-actions-transitioning-from-node-16-to-node-20/. ``` ## Summary of changes - `actions/cache@v3` -> `actions/cache@v4` - `actions/checkout@v3` -> `actions/checkout@v4` - `actions/github-script@v6` -> `actions/github-script@v7` - `actions/setup-java@v3` -> `actions/setup-java@v4` - `actions/upload-artifact@v3` -> `actions/upload-artifact@v4` --- .../actions/allure-report-generate/action.yml | 6 +-- .../actions/run-python-test-set/action.yml | 4 +- .github/workflows/approved-for-ci-run.yml | 2 +- .github/workflows/benchmarking.yml | 10 ++--- .github/workflows/build_and_test.yml | 44 +++++++++---------- .github/workflows/neon_extra_builds.yml | 16 +++---- .github/workflows/pg_clients.yml | 6 +-- .github/workflows/trigger-e2e-tests.yml | 5 +-- 8 files changed, 46 insertions(+), 47 deletions(-) diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index f474dd3444..79f054cb06 100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -59,7 +59,7 @@ runs: BUCKET: neon-github-public-dev # TODO: We can replace with a special docker image with Java and Allure pre-installed - - uses: actions/setup-java@v3 + - uses: actions/setup-java@v4 with: distribution: 'temurin' java-version: '17' @@ -180,7 +180,7 @@ runs: fi - name: Cache poetry deps - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} @@ -215,7 +215,7 @@ runs: rm -rf ${WORKDIR} fi - - uses: actions/github-script@v6 + - uses: actions/github-script@v7 if: always() env: REPORT_URL: ${{ steps.generate-report.outputs.report-url }} diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 8852a28da9..d9e543d4bb 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -80,13 +80,13 @@ runs: - name: Checkout if: inputs.needs_postgres_source == 'true' - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: true fetch-depth: 1 - name: Cache poetry deps - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} diff --git a/.github/workflows/approved-for-ci-run.yml b/.github/workflows/approved-for-ci-run.yml index ae2f173b47..69c48d86b9 100644 --- a/.github/workflows/approved-for-ci-run.yml +++ b/.github/workflows/approved-for-ci-run.yml @@ -64,7 +64,7 @@ jobs: steps: - run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run" - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: ref: main token: ${{ secrets.CI_ACCESS_TOKEN }} diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 8bf12c31b1..fc245f42a8 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -66,7 +66,7 @@ jobs: options: --init steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Download Neon artifact uses: ./.github/actions/download @@ -221,7 +221,7 
@@ jobs: timeout-minutes: 480 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Download Neon artifact uses: ./.github/actions/download @@ -366,7 +366,7 @@ jobs: options: --init steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Download Neon artifact uses: ./.github/actions/download @@ -465,7 +465,7 @@ jobs: options: --init steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Download Neon artifact uses: ./.github/actions/download @@ -562,7 +562,7 @@ jobs: options: --init steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Download Neon artifact uses: ./.github/actions/download diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 3ce5d9c2b3..2a1c79e437 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -69,7 +69,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -106,13 +106,13 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: false fetch-depth: 1 - name: Cache poetry deps - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} @@ -138,7 +138,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: true fetch-depth: 1 @@ -146,7 +146,7 @@ jobs: # Disabled for now # - name: Restore cargo deps cache # id: cache_cargo -# uses: actions/cache@v3 +# uses: actions/cache@v4 # with: # path: | # !~/.cargo/registry/src @@ -231,7 +231,7 @@ jobs: done - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: true fetch-depth: 1 @@ -303,7 +303,7 @@ jobs: # compressed crates. 
# - name: Cache cargo deps # id: cache_cargo -# uses: actions/cache@v3 +# uses: actions/cache@v4 # with: # path: | # ~/.cargo/registry/ @@ -317,21 +317,21 @@ jobs: - name: Cache postgres v14 build id: cache_pg_14 - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: pg_install/v14 key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Cache postgres v15 build id: cache_pg_15 - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: pg_install/v15 key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Cache postgres v16 build id: cache_pg_16 - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: pg_install/v16 key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} @@ -451,7 +451,7 @@ jobs: pg_version: [ v14, v15, v16 ] steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: true fetch-depth: 1 @@ -492,10 +492,10 @@ jobs: if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Cache poetry deps - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} @@ -529,7 +529,7 @@ jobs: build_type: [ release ] steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Pytest benchmarks uses: ./.github/actions/run-python-test-set @@ -558,7 +558,7 @@ jobs: options: --init steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Create Allure report if: ${{ !cancelled() }} @@ -569,7 +569,7 @@ jobs: env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} - - uses: actions/github-script@v6 + - uses: actions/github-script@v7 if: ${{ !cancelled() }} with: # Retry script for 5XX server errors: https://github.com/actions/github-script#retries @@ -609,7 +609,7 @@ jobs: coverage-json: ${{ steps.upload-coverage-report-new.outputs.summary-json }} steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: true fetch-depth: 0 @@ -678,7 +678,7 @@ jobs: REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/lcov/summary.json echo "summary-json=${REPORT_URL}" >> $GITHUB_OUTPUT - - uses: actions/github-script@v6 + - uses: actions/github-script@v7 env: REPORT_URL_NEW: ${{ steps.upload-coverage-report-new.outputs.report-url }} COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} @@ -904,7 +904,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -1118,7 +1118,7 @@ jobs: done - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: false fetch-depth: 0 @@ -1141,7 +1141,7 @@ jobs: - name: Create git tag if: github.ref_name == 'release' - uses: actions/github-script@v6 + uses: actions/github-script@v7 with: # Retry script for 5XX server errors: https://github.com/actions/github-script#retries retries: 5 @@ -1155,7 +1155,7 @@ jobs: - name: Create GitHub release if: github.ref_name == 'release' - uses: actions/github-script@v6 + uses: actions/github-script@v7 with: # Retry script for 5XX server errors: https://github.com/actions/github-script#retries retries: 5 diff --git 
a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index ff2a3a040a..5c2f202b6b 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -57,21 +57,21 @@ jobs: - name: Cache postgres v14 build id: cache_pg_14 - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: pg_install/v14 key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Cache postgres v15 build id: cache_pg_15 - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: pg_install/v15 key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Cache postgres v16 build id: cache_pg_16 - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: pg_install/v16 key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} @@ -82,7 +82,7 @@ jobs: echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV - name: Cache cargo deps - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: | ~/.cargo/registry @@ -172,21 +172,21 @@ jobs: - name: Cache postgres v14 build id: cache_pg_14 - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: pg_install/v14 key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Cache postgres v15 build id: cache_pg_15 - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: pg_install/v15 key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Cache postgres v16 build id: cache_pg_16 - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: pg_install/v16 key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} @@ -356,7 +356,7 @@ jobs: echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT - name: Publish build stats report - uses: actions/github-script@v6 + uses: actions/github-script@v7 env: REPORT_URL: ${{ steps.upload-stats.outputs.report-url }} SHA: ${{ github.event.pull_request.head.sha || github.sha }} diff --git a/.github/workflows/pg_clients.yml b/.github/workflows/pg_clients.yml index 28016cadb1..50e3227a74 100644 --- a/.github/workflows/pg_clients.yml +++ b/.github/workflows/pg_clients.yml @@ -28,7 +28,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - uses: actions/setup-python@v4 with: @@ -38,7 +38,7 @@ jobs: uses: snok/install-poetry@v1 - name: Cache poetry deps - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs key: v2-${{ runner.os }}-python-deps-ubunutu-latest-${{ hashFiles('poetry.lock') }} @@ -82,7 +82,7 @@ jobs: # It will be fixed after switching to gen2 runner - name: Upload python test logs if: always() - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: retention-days: 7 name: python-test-pg_clients-${{ runner.os }}-stage-logs diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml index 2776033805..7d04a8ec8a 100644 --- a/.github/workflows/trigger-e2e-tests.yml +++ b/.github/workflows/trigger-e2e-tests.yml @@ -9,7 +9,7 @@ on: defaults: run: shell: bash -euxo pipefail {0} - + env: # A concurrency group that we use for e2e-tests runs, 
matches `concurrency.group` above with `github.repository` as a prefix E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} @@ -37,7 +37,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -115,4 +115,3 @@ jobs: \"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\" } }" - \ No newline at end of file From 02a8b7fbe0bfee9d78b1d234f8c0c1946211326f Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 20 Feb 2024 10:13:21 +0000 Subject: [PATCH 0216/1571] storage controller: issue timeline create/delete calls concurrently (#6827) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem Timeline creation is meant to be very fast: it should only take approximately on S3 PUT latency. When we have many shards in a tenant, we should preserve that responsiveness. ## Summary of changes - Issue create/delete pageserver API calls concurrently across all >0 shards - During tenant deletion, delete shard zero last, separately, to avoid confusing anything using GETs on the timeline. - Return 201 instead of 200 on creations to make cloud control plane happy --------- Co-authored-by: Arpad Müller --- control_plane/attachment_service/src/http.rs | 7 +- .../attachment_service/src/service.rs | 151 +++++++++++++----- libs/pageserver_api/src/models.rs | 2 +- 3 files changed, 114 insertions(+), 46 deletions(-) diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs index d6c8fa084b..67ab37dfc1 100644 --- a/control_plane/attachment_service/src/http.rs +++ b/control_plane/attachment_service/src/http.rs @@ -114,7 +114,10 @@ async fn handle_tenant_create( mut req: Request, ) -> Result, ApiError> { let create_req = json_request::(&mut req).await?; - json_response(StatusCode::OK, service.tenant_create(create_req).await?) 
+ json_response( + StatusCode::CREATED, + service.tenant_create(create_req).await?, + ) } // For tenant and timeline deletions, which both implement an "initially return 202, then 404 once @@ -196,7 +199,7 @@ async fn handle_tenant_timeline_create( let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; let create_req = json_request::(&mut req).await?; json_response( - StatusCode::OK, + StatusCode::CREATED, service .tenant_timeline_create(tenant_id, create_req) .await?, diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index 0fe758e731..4082af3fe6 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -14,7 +14,7 @@ use control_plane::attachment_service::{ TenantShardMigrateRequest, TenantShardMigrateResponse, }; use diesel::result::DatabaseErrorKind; -use futures::StreamExt; +use futures::{stream::FuturesUnordered, StreamExt}; use hyper::StatusCode; use pageserver_api::{ control_api::{ @@ -1287,8 +1287,6 @@ impl Service { tenant_id: TenantId, mut create_req: TimelineCreateRequest, ) -> Result { - let mut timeline_info = None; - tracing::info!( "Creating timeline {}/{}", tenant_id, @@ -1299,7 +1297,7 @@ impl Service { // TODO: refuse to do this if shard splitting is in progress // (https://github.com/neondatabase/neon/issues/6676) - let targets = { + let mut targets = { let locked = self.inner.read().unwrap(); let mut targets = Vec::new(); @@ -1323,21 +1321,24 @@ impl Service { return Err(ApiError::NotFound( anyhow::anyhow!("Tenant not found").into(), )); - } - - for (tenant_shard_id, node) in targets { - // TODO: issue shard timeline creates in parallel, once the 0th is done. - - let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref()); + }; + let shard_zero = targets.remove(0); + async fn create_one( + tenant_shard_id: TenantShardId, + node: Node, + jwt: Option, + create_req: TimelineCreateRequest, + ) -> Result { tracing::info!( "Creating timeline on shard {}/{}, attached to node {}", tenant_shard_id, create_req.new_timeline_id, node.id ); + let client = mgmt_api::Client::new(node.base_url(), jwt.as_deref()); - let shard_timeline_info = client + client .timeline_create(tenant_shard_id, &create_req) .await .map_err(|e| match e { @@ -1350,23 +1351,66 @@ impl Service { ApiError::InternalServerError(anyhow::anyhow!(msg)) } _ => ApiError::Conflict(format!("Failed to create timeline: {e}")), - })?; - - if timeline_info.is_none() { - // If the caller specified an ancestor but no ancestor LSN, we are responsible for - // propagating the LSN chosen by the first shard to the other shards: it is important - // that all shards end up with the same ancestor_start_lsn. - if create_req.ancestor_timeline_id.is_some() - && create_req.ancestor_start_lsn.is_none() - { - create_req.ancestor_start_lsn = shard_timeline_info.ancestor_lsn; - } - - // We will return the TimelineInfo from the first shard - timeline_info = Some(shard_timeline_info); - } + }) } - Ok(timeline_info.expect("targets cannot be empty")) + + // Because the caller might not provide an explicit LSN, we must do the creation first on a single shard, and then + // use whatever LSN that shard picked when creating on subsequent shards. We arbitrarily use shard zero as the shard + // that will get the first creation request, and propagate the LSN to all the >0 shards. 
+ let timeline_info = create_one( + shard_zero.0, + shard_zero.1, + self.config.jwt_token.clone(), + create_req.clone(), + ) + .await?; + + // Propagate the LSN that shard zero picked, if caller didn't provide one + if create_req.ancestor_timeline_id.is_some() && create_req.ancestor_start_lsn.is_none() { + create_req.ancestor_start_lsn = timeline_info.ancestor_lsn; + } + + // Create timeline on remaining shards with number >0 + if !targets.is_empty() { + // If we had multiple shards, issue requests for the remainder now. + let jwt = self.config.jwt_token.clone(); + self.tenant_for_shards(targets, |tenant_shard_id: TenantShardId, node: Node| { + let create_req = create_req.clone(); + Box::pin(create_one(tenant_shard_id, node, jwt.clone(), create_req)) + }) + .await?; + } + + Ok(timeline_info) + } + + /// Helper for concurrently calling a pageserver API on a number of shards, such as timeline creation. + /// + /// On success, the returned vector contains exactly the same number of elements as the input `locations`. + async fn tenant_for_shards( + &self, + locations: Vec<(TenantShardId, Node)>, + mut req_fn: F, + ) -> Result, ApiError> + where + F: FnMut( + TenantShardId, + Node, + ) + -> std::pin::Pin> + Send>>, + { + let mut futs = FuturesUnordered::new(); + let mut results = Vec::with_capacity(locations.len()); + + for (tenant_shard_id, node) in locations { + futs.push(req_fn(tenant_shard_id, node)); + } + + while let Some(r) = futs.next().await { + results.push(r?); + } + + Ok(results) } pub(crate) async fn tenant_timeline_delete( @@ -1380,7 +1424,7 @@ impl Service { // TODO: refuse to do this if shard splitting is in progress // (https://github.com/neondatabase/neon/issues/6676) - let targets = { + let mut targets = { let locked = self.inner.read().unwrap(); let mut targets = Vec::new(); @@ -1405,12 +1449,14 @@ impl Service { anyhow::anyhow!("Tenant not found").into(), )); } + let shard_zero = targets.remove(0); - // TODO: call into shards concurrently - let mut any_pending = false; - for (tenant_shard_id, node) in targets { - let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref()); - + async fn delete_one( + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + node: Node, + jwt: Option, + ) -> Result { tracing::info!( "Deleting timeline on shard {}/{}, attached to node {}", tenant_shard_id, @@ -1418,7 +1464,8 @@ impl Service { node.id ); - let status = client + let client = mgmt_api::Client::new(node.base_url(), jwt.as_deref()); + client .timeline_delete(tenant_shard_id, timeline_id) .await .map_err(|e| { @@ -1426,18 +1473,36 @@ impl Service { "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {}: {e}", node.id )) - })?; - - if status == StatusCode::ACCEPTED { - any_pending = true; - } + }) } - if any_pending { - Ok(StatusCode::ACCEPTED) - } else { - Ok(StatusCode::NOT_FOUND) + let statuses = self + .tenant_for_shards(targets, |tenant_shard_id: TenantShardId, node: Node| { + Box::pin(delete_one( + tenant_shard_id, + timeline_id, + node, + self.config.jwt_token.clone(), + )) + }) + .await?; + + // If any shards >0 haven't finished deletion yet, don't start deletion on shard zero + if statuses.iter().any(|s| s != &StatusCode::NOT_FOUND) { + return Ok(StatusCode::ACCEPTED); } + + // Delete shard zero last: this is not strictly necessary, but since a caller's GET on a timeline will be routed + // to shard zero, it gives a more obvious behavior that a GET returns 404 once the deletion is done. 
+ let shard_zero_status = delete_one( + shard_zero.0, + timeline_id, + shard_zero.1, + self.config.jwt_token.clone(), + ) + .await?; + + Ok(shard_zero_status) } /// When you need to send an HTTP request to the pageserver that holds shard0 of a tenant, this diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index d546cb5c54..557a4d7de9 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -180,7 +180,7 @@ pub enum TimelineState { Broken { reason: String, backtrace: String }, } -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Clone)] pub struct TimelineCreateRequest { pub new_timeline_id: TimelineId, #[serde(default)] From 686b3c79c8548d189ecd5db266da40e86719ab7c Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 20 Feb 2024 10:44:46 +0000 Subject: [PATCH 0217/1571] http2 alpn (#6815) ## Problem Proxy already supported HTTP2, but I expect no one is using it because we don't advertise it in the TLS handshake. ## Summary of changes #6335 without the websocket changes. --- poetry.lock | 140 +++++++++++++++++++++++--- proxy/src/serverless.rs | 5 +- pyproject.toml | 1 + test_runner/fixtures/neon_fixtures.py | 28 +++++- test_runner/regress/test_proxy.py | 10 ++ 5 files changed, 170 insertions(+), 14 deletions(-) diff --git a/poetry.lock b/poetry.lock index ad0a0afd81..8e1d713d29 100644 --- a/poetry.lock +++ b/poetry.lock @@ -158,6 +158,28 @@ files = [ attrs = ">=16.0.0" pluggy = ">=0.4.0" +[[package]] +name = "anyio" +version = "4.3.0" +description = "High level compatibility layer for multiple asynchronous event loop implementations" +optional = false +python-versions = ">=3.8" +files = [ + {file = "anyio-4.3.0-py3-none-any.whl", hash = "sha256:048e05d0f6caeed70d731f3db756d35dcc1f35747c8c403364a8332c630441b8"}, + {file = "anyio-4.3.0.tar.gz", hash = "sha256:f75253795a87df48568485fd18cdd2a3fa5c4f7c5be8e5e36637733fce06fed6"}, +] + +[package.dependencies] +exceptiongroup = {version = ">=1.0.2", markers = "python_version < \"3.11\""} +idna = ">=2.8" +sniffio = ">=1.1" +typing-extensions = {version = ">=4.1", markers = "python_version < \"3.11\""} + +[package.extras] +doc = ["Sphinx (>=7)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"] +test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17)"] +trio = ["trio (>=0.23)"] + [[package]] name = "async-timeout" version = "4.0.3" @@ -1073,6 +1095,100 @@ files = [ {file = "graphql_core-3.2.1-py3-none-any.whl", hash = "sha256:f83c658e4968998eed1923a2e3e3eddd347e005ac0315fbb7ca4d70ea9156323"}, ] +[[package]] +name = "h11" +version = "0.14.0" +description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" +optional = false +python-versions = ">=3.7" +files = [ + {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"}, + {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, +] + +[[package]] +name = "h2" +version = "4.1.0" +description = "HTTP/2 State-Machine based protocol implementation" +optional = false +python-versions = ">=3.6.1" +files = [ + {file = "h2-4.1.0-py3-none-any.whl", hash = "sha256:03a46bcf682256c95b5fd9e9a99c1323584c3eec6440d379b9903d709476bc6d"}, + {file = "h2-4.1.0.tar.gz", hash = "sha256:a83aca08fbe7aacb79fec788c9c0bac936343560ed9ec18b82a13a12c28d2abb"}, +] + 
+[package.dependencies] +hpack = ">=4.0,<5" +hyperframe = ">=6.0,<7" + +[[package]] +name = "hpack" +version = "4.0.0" +description = "Pure-Python HPACK header compression" +optional = false +python-versions = ">=3.6.1" +files = [ + {file = "hpack-4.0.0-py3-none-any.whl", hash = "sha256:84a076fad3dc9a9f8063ccb8041ef100867b1878b25ef0ee63847a5d53818a6c"}, + {file = "hpack-4.0.0.tar.gz", hash = "sha256:fc41de0c63e687ebffde81187a948221294896f6bdc0ae2312708df339430095"}, +] + +[[package]] +name = "httpcore" +version = "1.0.3" +description = "A minimal low-level HTTP client." +optional = false +python-versions = ">=3.8" +files = [ + {file = "httpcore-1.0.3-py3-none-any.whl", hash = "sha256:9a6a501c3099307d9fd76ac244e08503427679b1e81ceb1d922485e2f2462ad2"}, + {file = "httpcore-1.0.3.tar.gz", hash = "sha256:5c0f9546ad17dac4d0772b0808856eb616eb8b48ce94f49ed819fd6982a8a544"}, +] + +[package.dependencies] +certifi = "*" +h11 = ">=0.13,<0.15" + +[package.extras] +asyncio = ["anyio (>=4.0,<5.0)"] +http2 = ["h2 (>=3,<5)"] +socks = ["socksio (==1.*)"] +trio = ["trio (>=0.22.0,<0.24.0)"] + +[[package]] +name = "httpx" +version = "0.26.0" +description = "The next generation HTTP client." +optional = false +python-versions = ">=3.8" +files = [ + {file = "httpx-0.26.0-py3-none-any.whl", hash = "sha256:8915f5a3627c4d47b73e8202457cb28f1266982d1159bd5779d86a80c0eab1cd"}, + {file = "httpx-0.26.0.tar.gz", hash = "sha256:451b55c30d5185ea6b23c2c793abf9bb237d2a7dfb901ced6ff69ad37ec1dfaf"}, +] + +[package.dependencies] +anyio = "*" +certifi = "*" +h2 = {version = ">=3,<5", optional = true, markers = "extra == \"http2\""} +httpcore = "==1.*" +idna = "*" +sniffio = "*" + +[package.extras] +brotli = ["brotli", "brotlicffi"] +cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"] +http2 = ["h2 (>=3,<5)"] +socks = ["socksio (==1.*)"] + +[[package]] +name = "hyperframe" +version = "6.0.1" +description = "HTTP/2 framing layer for Python" +optional = false +python-versions = ">=3.6.1" +files = [ + {file = "hyperframe-6.0.1-py3-none-any.whl", hash = "sha256:0ec6bafd80d8ad2195c4f03aacba3a8265e57bc4cff261e802bf39970ed02a15"}, + {file = "hyperframe-6.0.1.tar.gz", hash = "sha256:ae510046231dc8e9ecb1a6586f63d2347bf4c8905914aa84ba585ae85f28a914"}, +] + [[package]] name = "idna" version = "3.3" @@ -2052,7 +2168,6 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -2225,6 +2340,17 @@ files = [ {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, ] +[[package]] +name = "sniffio" +version = "1.3.0" 
+description = "Sniff out which async library your code is running under" +optional = false +python-versions = ">=3.7" +files = [ + {file = "sniffio-1.3.0-py3-none-any.whl", hash = "sha256:eecefdce1e5bbfb7ad2eeaabf7c1eeb404d7757c379bd1f7e5cce9d8bf425384"}, + {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"}, +] + [[package]] name = "sshpubkeys" version = "3.3.1" @@ -2431,16 +2557,6 @@ files = [ {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"}, {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"}, {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"}, - {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"}, - {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"}, - {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"}, - {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"}, @@ -2678,4 +2794,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "e99954cbbfef8dcc5e13cea7103c87657639a192f2372983bdb8c5d624c2e447" +content-hash = "cab9cf8cbf8dcd52022acfdabfae4778be3ed5a4afda832bd9c074a50c746763" diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index ee3e91495b..dbf4f9cc74 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -88,7 +88,10 @@ pub async fn task_main( return Ok(()); } }; - let tls_acceptor: tokio_rustls::TlsAcceptor = tls_config.to_server_config().into(); + let mut tls_server_config = rustls::ServerConfig::clone(&tls_config.to_server_config()); + // prefer http2, but support 
http/1.1 + tls_server_config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()]; + let tls_acceptor: tokio_rustls::TlsAcceptor = Arc::new(tls_server_config).into(); let mut addr_incoming = AddrIncoming::from_listener(ws_listener)?; let _ = addr_incoming.set_nodelay(true); diff --git a/pyproject.toml b/pyproject.toml index 8ddaf0cdfb..b498f8acce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,7 @@ pytest-rerunfailures = "^13.0" types-pytest-lazy-fixture = "^0.6.3.3" pytest-split = "^0.8.1" zstandard = "^0.21.0" +httpx = {extras = ["http2"], version = "^0.26.0"} [tool.poetry.group.dev.dependencies] mypy = "==1.3.0" diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index cbf6e0e4de..51b126b84b 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -27,6 +27,7 @@ from urllib.parse import quote, urlparse import asyncpg import backoff +import httpx import jwt import psycopg2 import pytest @@ -2856,9 +2857,34 @@ class NeonProxy(PgProtocol): ) if expected_code is not None: - assert response.status_code == kwargs["expected_code"], f"response: {response.json()}" + assert response.status_code == expected_code, f"response: {response.json()}" return response.json() + async def http2_query(self, query, args, **kwargs): + # TODO maybe use default values if not provided + user = kwargs["user"] + password = kwargs["password"] + expected_code = kwargs.get("expected_code") + + connstr = f"postgresql://{user}:{password}@{self.domain}:{self.proxy_port}/postgres" + async with httpx.AsyncClient( + http2=True, verify=str(self.test_output_dir / "proxy.crt") + ) as client: + response = await client.post( + f"https://{self.domain}:{self.external_http_port}/sql", + json={"query": query, "params": args}, + headers={ + "Content-Type": "application/sql", + "Neon-Connection-String": connstr, + "Neon-Pool-Opt-In": "true", + }, + ) + assert response.http_version == "HTTP/2" + + if expected_code is not None: + assert response.status_code == expected_code, f"response: {response.json()}" + return response.json() + def get_metrics(self) -> str: request_result = requests.get(f"http://{self.host}:{self.http_port}/metrics") request_result.raise_for_status() diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index 884643cef0..9905f120e1 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -554,3 +554,13 @@ def test_sql_over_http_pool_custom_types(static_proxy: NeonProxy): "select array['foo'::foo, 'bar'::foo, 'baz'::foo] as data", ) assert response["rows"][0]["data"] == ["foo", "bar", "baz"] + + +@pytest.mark.asyncio +async def test_sql_over_http2(static_proxy: NeonProxy): + static_proxy.safe_psql("create role http with login password 'http' superuser") + + resp = await static_proxy.http2_query( + "select 42 as answer", [], user="http", password="http", expected_code=200 + ) + assert resp["rows"] == [{"answer": 42}] From 21a86487a2d1795b58cc7fac10097f299baf3542 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 20 Feb 2024 10:58:01 +0000 Subject: [PATCH 0218/1571] proxy: fix #6529 (#6807) ## Problem `application_name` for HTTP is not being recorded ## Summary of changes get `application_name` query param --- proxy/src/serverless/sql_over_http.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index ecb72abe73..e49c1c4db9 100644 --- 
a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -166,9 +166,12 @@ fn get_conn_info( let mut options = Option::None; for (key, value) in pairs { - if key == "options" { - options = Some(NeonOptions::parse_options_raw(&value)); - break; + match &*key { + "options" => { + options = Some(NeonOptions::parse_options_raw(&value)); + } + "application_name" => ctx.set_application(Some(value.into())), + _ => {} } } From a48b23d777b2bf3bb19a759d50f87ea13149826c Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 20 Feb 2024 14:06:25 +0100 Subject: [PATCH 0219/1571] fix(startup + remote_timeline_client): no-op deletion ops scheduled during startup (#6825) Before this PR, if remote storage is configured, `load_layer_map`'s call to `RemoteTimelineClient::schedule_layer_file_deletion` would schedule an empty UploadOp::Delete for each timeline. It's jsut CPU overhead, no actual interaction with deletion queue on-disk state or S3, as far as I can tell. However, it shows up in the "RemoteTimelineClient calls started metrics", which I'm refining in an orthogonal PR. --- pageserver/src/tenant/remote_timeline_client.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 91e1179e53..547679c435 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -823,6 +823,10 @@ impl RemoteTimelineClient { } // schedule the actual deletions + if with_metadata.is_empty() { + // avoid scheduling the op & bumping the metric + return; + } let op = UploadOp::Delete(Delete { layers: with_metadata, }); From b467d8067bd03a973a1bc630e428e89949ac0d4b Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 20 Feb 2024 14:09:15 +0100 Subject: [PATCH 0220/1571] fix(test_ondemand_download_timetravel): occasionally fails with WAL timeout during layer creation (#6818) refs https://github.com/neondatabase/neon/issues/4112 amends https://github.com/neondatabase/neon/pull/6687 Since my last PR #6687 regarding this test, the type of flakiness that has been observed has shifted to the beginning of the test, where we create the layers: ``` timed out while waiting for remote_consistent_lsn to reach 0/411A5D8, was 0/411A5A0 ``` [Example Allure Report](https://neon-github-public-dev.s3.amazonaws.com/reports/pr-6789/7932503173/index.html#/testresult/ddb877cfa4062f7d) Analysis -------- I suspect there was the following race condition: - endpoints push out some tiny piece of WAL during their endpoints.stop_all() - that WAL reaches the SK (it's just one SK according to logs) - the SKs send it into the walreceiver connection - the SK gets shut down - the checkpoint is taken, with last_record_lsn = 0/411A5A0 - the PS's walreceiver_connection_handler processes the WAL that was sent into the connection by the SKs; this advances last_record_lsn to 0/411A5D8 - we get current_lsn = 0/411A5D8 - nothing flushes a layer Changes ------- There's no testing / debug interface to shut down / server all walreceiver connections. So, this PR restarts pageserver to achieve it. Also, it lifts the "wait for image layer uploads" further up, so that after this first restart, the pageserver really does _nothing_ by itself, and so, the origianl physical size mismatch issue quoted in #6687 should be fixed. (My initial suspicion hasn't changed that it was due to the tiny chunk of endpoint.stop_all() WAL being ingested after the second PS restart.) 
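To make the quiesce-then-restart sequence easier to follow, here is a simplified Python sketch of the pattern the patch below implements. It reuses names from the existing test fixtures (`env`, `client`, `pageserver_http`, `wait_for_upload`, `wait_for_upload_queue_empty`, `wait_until_tenant_active`), but it is schematic rather than the literal test code:

```
# Sketch: make the remote state deterministic before exercising on-demand downloads.
# Assumes the usual neon test fixture objects are passed in; not the literal test body.
def quiesce_and_restart_pageserver(env, client, pageserver_http, tenant_id, timeline_id):
    # 1. Stop everything that can still produce or deliver WAL.
    env.endpoints.stop_all()
    for sk in env.safekeepers:
        sk.stop()

    # 2. Flush what the pageserver already ingested and wait for remote storage to catch up.
    client.timeline_checkpoint(tenant_id, timeline_id)
    current_lsn = Lsn(client.timeline_detail(tenant_id, timeline_id)["last_record_lsn"])
    wait_for_upload(client, tenant_id, timeline_id, current_lsn)
    wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id)

    # 3. Restart the pageserver to sever any lingering walreceiver connections,
    #    freezing remote_consistent_lsn for the rest of the test.
    env.pageserver.stop()
    env.pageserver.start()
    wait_until_tenant_active(client, tenant_id)
```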
--- test_runner/regress/test_ondemand_download.py | 37 +++++++++++++------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 3a197875dd..caa52cbbfe 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -17,6 +17,7 @@ from fixtures.pageserver.utils import ( wait_for_last_record_lsn, wait_for_upload, wait_for_upload_queue_empty, + wait_until_tenant_active, ) from fixtures.remote_storage import RemoteStorageKind from fixtures.types import Lsn @@ -165,6 +166,10 @@ def test_ondemand_download_timetravel(neon_env_builder: NeonEnvBuilder): tenant_id = env.initial_tenant timeline_id = env.initial_timeline + #### + # Produce layers + #### + lsns = [] table_len = 10000 @@ -194,19 +199,29 @@ def test_ondemand_download_timetravel(neon_env_builder: NeonEnvBuilder): # run checkpoint manually to be sure that data landed in remote storage client.timeline_checkpoint(tenant_id, timeline_id) - ##### Stop the first pageserver instance, erase all its data + # prevent new WAL from being produced, wait for layers to reach remote storage env.endpoints.stop_all() - - # Stop safekeepers and take another checkpoint. The endpoints might - # have written a few more bytes during shutdown. for sk in env.safekeepers: sk.stop() - - client.timeline_checkpoint(tenant_id, timeline_id) - current_lsn = Lsn(client.timeline_detail(tenant_id, timeline_id)["last_record_lsn"]) - - # wait until pageserver has successfully uploaded all the data to remote storage + # NB: the wait_for_upload returns as soon as remote_consistent_lsn == current_lsn. + # But the checkpoint also triggers a compaction + # => image layer generation => + # => doesn't advance LSN + # => but we want the remote state to deterministic, so additionally, wait for upload queue to drain wait_for_upload(client, tenant_id, timeline_id, current_lsn) + wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, timeline_id) + client.deletion_queue_flush(execute=True) + del current_lsn + env.pageserver.stop() + env.pageserver.start() + # We've shut down the SKs, then restarted the PSes to sever all walreceiver connections; + # This means pageserver's remote_consistent_lsn is now frozen to whatever it was after the pageserver.stop() call. + wait_until_tenant_active(client, tenant_id) + + ### + # Produce layers complete; + # Start the actual testing. + ### def get_api_current_physical_size(): d = client.timeline_detail(tenant_id, timeline_id) @@ -223,9 +238,7 @@ def test_ondemand_download_timetravel(neon_env_builder: NeonEnvBuilder): log.info(filled_size) assert filled_current_physical == filled_size, "we don't yet do layer eviction" - # Wait until generated image layers are uploaded to S3 - wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, timeline_id) - + # Stop the first pageserver instance, erase all its data env.pageserver.stop() # remove all the layer files From d152d4f16f9a82fe0ea6eb815e1178d6e8540386 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 20 Feb 2024 13:40:46 +0000 Subject: [PATCH 0221/1571] pageserver: fix treating all download errors as 'Other' (#6836) ## Problem `download_retry` correctly uses a fatal check to avoid retrying forever on cancellations and NotFound cases. However, `download_layer_file` was casting all download errors to "Other" in order to attach an anyhow::Context. 
Noticed this issue in the context of secondary downloads, where requests to download layers that might not exist are issued intentionally, and this resulted in lots of error spam from retries that shouldn't have happened. ## Summary of changes - Remove the `.context()` so that the original DownloadError is visible to backoff::retry --- .../tenant/remote_timeline_client/download.rs | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 43f5e6c182..c70267474e 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -81,15 +81,7 @@ pub async fn download_layer_file<'a>( .with_context(|| format!("create a destination file for layer '{temp_file_path}'")) .map_err(DownloadError::Other)?; - let download = storage - .download(&remote_path, cancel) - .await - .with_context(|| { - format!( - "open a download stream for layer with remote storage path '{remote_path:?}'" - ) - }) - .map_err(DownloadError::Other)?; + let download = storage.download(&remote_path, cancel).await?; let mut destination_file = tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file); @@ -98,9 +90,11 @@ pub async fn download_layer_file<'a>( let bytes_amount = tokio::io::copy_buf(&mut reader, &mut destination_file) .await - .with_context(|| format!( + .with_context(|| { + format!( "download layer at remote path '{remote_path:?}' into file {temp_file_path:?}" - )) + ) + }) .map_err(DownloadError::Other); match bytes_amount { From 9b8df2634f3a41a0da641aa2ab1e9cab86d1f430 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Tue, 20 Feb 2024 15:55:51 +0000 Subject: [PATCH 0222/1571] Fix active_timelines_count metric (#6839) --- safekeeper/src/metrics.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index fbba2e00fc..f12e079632 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -695,9 +695,11 @@ impl Collector for TimelineCollector { // report total number of timelines self.timelines_count.set(timelines_count as i64); + mfs.extend(self.timelines_count.collect()); + self.active_timelines_count .set(active_timelines_count as i64); - mfs.extend(self.timelines_count.collect()); + mfs.extend(self.active_timelines_count.collect()); mfs } From eb02f4619e7cccdab7c4553b6ad257994b9460a0 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 20 Feb 2024 16:34:12 +0000 Subject: [PATCH 0223/1571] tests: add a shutdown log noise case to test_location_conf_churn (#6828) This test does lots of shutdowns, and we may emit this layer warning during shutdown. 
Saw a spurious failure here: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-6820/7964134049/index.html#/testresult/784218040583d963 --- test_runner/regress/test_pageserver_secondary.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index cbff01dc2a..8f694de2e1 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -74,16 +74,19 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): tenant_id = env.initial_tenant timeline_id = env.initial_timeline - # We will make no effort to avoid stale attachments for ps in env.pageservers: ps.allowed_errors.extend( [ + # We will make no effort to avoid stale attachments ".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*", # page_service_conn_main{peer_addr=[::1]:41176}: query handler for 'pagestream 3b19aec5038c796f64b430b30a555121 d07776761d44050b8aab511df1657d83' failed: Tenant 3b19aec5038c796f64b430b30a555121 not found ".*query handler.*Tenant.*not found.*", # page_service_conn_main{peer_addr=[::1]:45552}: query handler for 'pagestream 414ede7ad50f775a8e7d9ba0e43b9efc a43884be16f44b3626482b6981b2c745' failed: Tenant 414ede7ad50f775a8e7d9ba0e43b9efc is not active ".*query handler.*Tenant.*not active.*", + # this shutdown case is logged at WARN severity by the time it bubbles up to logical size calculation code + # WARN ...: initial size calculation failed: downloading failed, possibly for shutdown + ".*downloading failed, possibly for shutdown", ] ) From e49602ecf59ea0bc5be43990241c408be4de8d65 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 20 Feb 2024 17:52:23 +0100 Subject: [PATCH 0224/1571] feat(metrics): per-timeline metric for on-demand downloads, remove calls_started histogram (#6834) refs #6737 # Problem Before this PR, on-demand downloads weren't measured per tenant_id. This makes root-cause analysis of latency spikes harder, requiring us to resort to log scraping for ``` {neon_service="pageserver"} |= `downloading on-demand` |= `$tenant_id` ``` which can be expensive when zooming out in Grafana. Context: https://neondb.slack.com/archives/C033RQ5SPDH/p1707809037868189 # Solution / Changes - Remove the calls_started histogram - I did the dilegence, there are only 2 dashboards using this histogram, and in fact only one uses it as a histogram, the other just as a a counter. - [Link 1](https://github.com/neondatabase/grafana-dashboard-export/blob/8115b54d9fa14c76da831ae21bbfbb56cc59ffb5/neonprod/dashboards/hkXNF7oVz/dashboard-Z31XmM24k.yaml#L1454): `Pageserver Thrashing` dashboard, linked from playbook, will fix. - [Link 2](https://github.com/neondatabase/grafana-dashboard-export/blob/8115b54d9fa14c76da831ae21bbfbb56cc59ffb5/neonprod/dashboards/CEllzAO4z/dashboard-sJqfNFL4k.yaml#L599): one of my personal dashboards, unused for a long time, already broken in other ways, no need to fix. - replace `pageserver_remote_timeline_client_calls_unfinished` gauge with a counter pair - Required `Clone`-able `IntCounterPair`, made the necessary changes in the `libs/metrics` crate - fix tests to deal with the fallout A subsequent PR will remove a timeline-scoped metric to compensate. 
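For illustration, the queue depth that the removed gauge used to report directly can now be derived by subtracting the two counters of the pair. A rough Python sketch, where `get_value(metric_name, labels)` is a hypothetical helper returning the current value of a Prometheus counter (the metric and label names are the ones introduced by this patch):

```
# Sketch: derive the remote timeline client queue depth from the counter pair.
def queue_depth(get_value, tenant_id, timeline_id, file_kind, op_kind) -> int:
    labels = {
        "tenant_id": str(tenant_id),
        "timeline_id": str(timeline_id),
        "file_kind": file_kind,
        "op_kind": op_kind,
    }
    started = get_value("pageserver_remote_timeline_client_calls_started_total", labels)
    finished = get_value("pageserver_remote_timeline_client_calls_finished_total", labels)
    depth = int(started) - int(finished)
    assert depth >= 0, "finished must never overtake started"
    return depth
```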
Note that we don't need additional global counters for the per-timeline counters affected by this PR; we can use the `remote_storage` histogram for those, which, conveniently, also include the secondary-mode downloads, which aren't covered by the remote timeline client metrics (should they?). --- libs/metrics/src/lib.rs | 14 +++ pageserver/src/metrics.rs | 99 ++++++------------- .../src/tenant/remote_timeline_client.rs | 22 ++--- test_runner/fixtures/metrics.py | 17 ++-- test_runner/fixtures/pageserver/http.py | 29 +++--- test_runner/fixtures/pageserver/utils.py | 36 +++++-- test_runner/regress/test_remote_storage.py | 44 +++------ 7 files changed, 125 insertions(+), 136 deletions(-) diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 18786106d1..744fc18e61 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -201,6 +201,11 @@ impl GenericCounterPairVec

{
     pub fn with_label_values(&self, vals: &[&str]) -> GenericCounterPair<P> {
         self.get_metric_with_label_values(vals).unwrap()
     }
+
+    pub fn remove_label_values(&self, res: &mut [Result<()>; 2], vals: &[&str]) {
+        res[0] = self.inc.remove_label_values(vals);
+        res[1] = self.dec.remove_label_values(vals);
+    }
 }
 
 impl<P: Atomic> GenericCounterPair<P> {
@@ -247,6 +252,15 @@ impl<P: Atomic> GenericCounterPair<P> {
     }
 }
 
+impl<P: Atomic> Clone for GenericCounterPair<P> {
+    fn clone(&self) -> Self {
+        Self {
+            inc: self.inc.clone(),
+            dec: self.dec.clone(),
+        }
+    }
+}
+
 /// Guard returned by [`GenericCounterPair::guard`]
 pub struct GenericCounterPairGuard<P: Atomic>(GenericCounter<P>
); diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index a0fda39605..ee0bd268cc 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -4,8 +4,8 @@ use metrics::{ register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec, - Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPairVec, - IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, + Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair, + IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, }; use once_cell::sync::Lazy; use pageserver_api::shard::TenantShardId; @@ -1266,13 +1266,12 @@ pub(crate) static LIVE_CONNECTIONS_COUNT: Lazy = Lazy::new(|| { // remote storage metrics -/// NB: increment _after_ recording the current value into [`REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST`]. -static REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE: Lazy = Lazy::new(|| { - register_int_gauge_vec!( - "pageserver_remote_timeline_client_calls_unfinished", - "Number of ongoing calls to remote timeline client. \ - Used to populate pageserver_remote_timeline_client_calls_started. \ - This metric is not useful for sampling from Prometheus, but useful in tests.", +static REMOTE_TIMELINE_CLIENT_CALLS: Lazy = Lazy::new(|| { + register_int_counter_pair_vec!( + "pageserver_remote_timeline_client_calls_started", + "Number of started calls to remote timeline client.", + "pageserver_remote_timeline_client_calls_finished", + "Number of finshed calls to remote timeline client.", &[ "tenant_id", "shard_id", @@ -1281,23 +1280,7 @@ static REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE: Lazy = Lazy:: "op_kind" ], ) - .expect("failed to define a metric") -}); - -static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy = Lazy::new(|| { - register_histogram_vec!( - "pageserver_remote_timeline_client_calls_started", - "When calling a remote timeline client method, we record the current value \ - of the calls_unfinished gauge in this histogram. Plot the histogram \ - over time in a heatmap to visualize how many operations were ongoing \ - at a given instant. It gives you a better idea of the queue depth \ - than plotting the gauge directly, since operations may complete faster \ - than the sampling interval.", - &["file_kind", "op_kind"], - // The calls_unfinished gauge is an integer gauge, hence we have integer buckets. 
- vec![0.0, 1.0, 2.0, 4.0, 6.0, 8.0, 10.0, 15.0, 20.0, 40.0, 60.0, 80.0, 100.0, 500.0], - ) - .expect("failed to define a metric") + .unwrap() }); static REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER: Lazy = @@ -2078,7 +2061,7 @@ pub(crate) struct RemoteTimelineClientMetrics { shard_id: String, timeline_id: String, remote_physical_size_gauge: Mutex>, - calls_unfinished_gauge: Mutex>, + calls: Mutex>, bytes_started_counter: Mutex>, bytes_finished_counter: Mutex>, } @@ -2089,7 +2072,7 @@ impl RemoteTimelineClientMetrics { tenant_id: tenant_shard_id.tenant_id.to_string(), shard_id: format!("{}", tenant_shard_id.shard_slug()), timeline_id: timeline_id.to_string(), - calls_unfinished_gauge: Mutex::new(HashMap::default()), + calls: Mutex::new(HashMap::default()), bytes_started_counter: Mutex::new(HashMap::default()), bytes_finished_counter: Mutex::new(HashMap::default()), remote_physical_size_gauge: Mutex::new(None), @@ -2129,15 +2112,15 @@ impl RemoteTimelineClientMetrics { .unwrap() } - fn calls_unfinished_gauge( + fn calls_counter_pair( &self, file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind, - ) -> IntGauge { - let mut guard = self.calls_unfinished_gauge.lock().unwrap(); + ) -> IntCounterPair { + let mut guard = self.calls.lock().unwrap(); let key = (file_kind.as_str(), op_kind.as_str()); let metric = guard.entry(key).or_insert_with(move || { - REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE + REMOTE_TIMELINE_CLIENT_CALLS .get_metric_with_label_values(&[ &self.tenant_id, &self.shard_id, @@ -2150,17 +2133,6 @@ impl RemoteTimelineClientMetrics { metric.clone() } - fn calls_started_hist( - &self, - file_kind: &RemoteOpFileKind, - op_kind: &RemoteOpKind, - ) -> Histogram { - let key = (file_kind.as_str(), op_kind.as_str()); - REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST - .get_metric_with_label_values(&[key.0, key.1]) - .unwrap() - } - fn bytes_started_counter( &self, file_kind: &RemoteOpFileKind, @@ -2231,7 +2203,7 @@ impl RemoteTimelineClientMetrics { #[must_use] pub(crate) struct RemoteTimelineClientCallMetricGuard { /// Decremented on drop. - calls_unfinished_metric: Option, + calls_counter_pair: Option, /// If Some(), this references the bytes_finished metric, and we increment it by the given `u64` on drop. bytes_finished: Option<(IntCounter, u64)>, } @@ -2241,10 +2213,10 @@ impl RemoteTimelineClientCallMetricGuard { /// The caller vouches to do the metric updates manually. 
pub fn will_decrement_manually(mut self) { let RemoteTimelineClientCallMetricGuard { - calls_unfinished_metric, + calls_counter_pair, bytes_finished, } = &mut self; - calls_unfinished_metric.take(); + calls_counter_pair.take(); bytes_finished.take(); } } @@ -2252,10 +2224,10 @@ impl RemoteTimelineClientCallMetricGuard { impl Drop for RemoteTimelineClientCallMetricGuard { fn drop(&mut self) { let RemoteTimelineClientCallMetricGuard { - calls_unfinished_metric, + calls_counter_pair, bytes_finished, } = self; - if let Some(guard) = calls_unfinished_metric.take() { + if let Some(guard) = calls_counter_pair.take() { guard.dec(); } if let Some((bytes_finished_metric, value)) = bytes_finished { @@ -2288,10 +2260,8 @@ impl RemoteTimelineClientMetrics { op_kind: &RemoteOpKind, size: RemoteTimelineClientMetricsCallTrackSize, ) -> RemoteTimelineClientCallMetricGuard { - let calls_unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind); - self.calls_started_hist(file_kind, op_kind) - .observe(calls_unfinished_metric.get() as f64); - calls_unfinished_metric.inc(); // NB: inc after the histogram, see comment on underlying metric + let calls_counter_pair = self.calls_counter_pair(file_kind, op_kind); + calls_counter_pair.inc(); let bytes_finished = match size { RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => { @@ -2305,7 +2275,7 @@ impl RemoteTimelineClientMetrics { } }; RemoteTimelineClientCallMetricGuard { - calls_unfinished_metric: Some(calls_unfinished_metric), + calls_counter_pair: Some(calls_counter_pair), bytes_finished, } } @@ -2319,12 +2289,8 @@ impl RemoteTimelineClientMetrics { op_kind: &RemoteOpKind, size: RemoteTimelineClientMetricsCallTrackSize, ) { - let calls_unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind); - debug_assert!( - calls_unfinished_metric.get() > 0, - "begin and end should cancel out" - ); - calls_unfinished_metric.dec(); + let calls_counter_pair = self.calls_counter_pair(file_kind, op_kind); + calls_counter_pair.dec(); match size { RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => {} RemoteTimelineClientMetricsCallTrackSize::Bytes(size) => { @@ -2341,18 +2307,15 @@ impl Drop for RemoteTimelineClientMetrics { shard_id, timeline_id, remote_physical_size_gauge, - calls_unfinished_gauge, + calls, bytes_started_counter, bytes_finished_counter, } = self; - for ((a, b), _) in calls_unfinished_gauge.get_mut().unwrap().drain() { - let _ = REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE.remove_label_values(&[ - tenant_id, - shard_id, - timeline_id, - a, - b, - ]); + for ((a, b), _) in calls.get_mut().unwrap().drain() { + let mut res = [Ok(()), Ok(())]; + REMOTE_TIMELINE_CLIENT_CALLS + .remove_label_values(&mut res, &[tenant_id, shard_id, timeline_id, a, b]); + // don't care about results } for ((a, b), _) in bytes_started_counter.get_mut().unwrap().drain() { let _ = REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER.remove_label_values(&[ diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 547679c435..7d30745a0d 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -614,7 +614,7 @@ impl RemoteTimelineClient { metadata, ); let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn); - self.calls_unfinished_metric_begin(&op); + self.metric_begin(&op); upload_queue.queued_operations.push_back(op); upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0; @@ 
-654,7 +654,7 @@ impl RemoteTimelineClient { metadata.generation, metadata.shard ); let op = UploadOp::UploadLayer(layer, metadata); - self.calls_unfinished_metric_begin(&op); + self.metric_begin(&op); upload_queue.queued_operations.push_back(op); } @@ -830,7 +830,7 @@ impl RemoteTimelineClient { let op = UploadOp::Delete(Delete { layers: with_metadata, }); - self.calls_unfinished_metric_begin(&op); + self.metric_begin(&op); upload_queue.queued_operations.push_back(op); } @@ -1520,10 +1520,10 @@ impl RemoteTimelineClient { .await; } - self.calls_unfinished_metric_end(&task.op); + self.metric_end(&task.op); } - fn calls_unfinished_metric_impl( + fn metric_impl( &self, op: &UploadOp, ) -> Option<( @@ -1560,17 +1560,17 @@ impl RemoteTimelineClient { Some(res) } - fn calls_unfinished_metric_begin(&self, op: &UploadOp) { - let (file_kind, op_kind, track_bytes) = match self.calls_unfinished_metric_impl(op) { + fn metric_begin(&self, op: &UploadOp) { + let (file_kind, op_kind, track_bytes) = match self.metric_impl(op) { Some(x) => x, None => return, }; let guard = self.metrics.call_begin(&file_kind, &op_kind, track_bytes); - guard.will_decrement_manually(); // in unfinished_ops_metric_end() + guard.will_decrement_manually(); // in metric_end(), see right below } - fn calls_unfinished_metric_end(&self, op: &UploadOp) { - let (file_kind, op_kind, track_bytes) = match self.calls_unfinished_metric_impl(op) { + fn metric_end(&self, op: &UploadOp) { + let (file_kind, op_kind, track_bytes) = match self.metric_impl(op) { Some(x) => x, None => return, }; @@ -1655,7 +1655,7 @@ impl RemoteTimelineClient { // Tear down queued ops for op in qi.queued_operations.into_iter() { - self.calls_unfinished_metric_end(&op); + self.metric_end(&op); // Dropping UploadOp::Barrier() here will make wait_completion() return with an Err() // which is exactly what we want to happen. drop(op); diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index f433db2167..fd4618ca6a 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -54,7 +54,7 @@ class MetricsGetter: return results[0].value def get_metrics_values( - self, names: list[str], filter: Optional[Dict[str, str]] = None + self, names: list[str], filter: Optional[Dict[str, str]] = None, absence_ok=False ) -> Dict[str, float]: """ When fetching multiple named metrics, it is more efficient to use this @@ -63,6 +63,10 @@ class MetricsGetter: Throws RuntimeError if no metrics matching `names` are found, or if not all of `names` are found: this method is intended for loading sets of metrics whose existence is coupled. + + If it's expected that there may be no results for some of the metrics, + specify `absence_ok=True`. The returned dict will then not contain values + for these metrics. """ metrics = self.get_metrics() samples = [] @@ -75,9 +79,10 @@ class MetricsGetter: raise RuntimeError(f"Multiple values found for {sample.name}") result[sample.name] = sample.value - if len(result) != len(names): - log.info(f"Metrics found: {metrics.metrics}") - raise RuntimeError(f"could not find all metrics {' '.join(names)}") + if not absence_ok: + if len(result) != len(names): + log.info(f"Metrics found: {metrics.metrics}") + raise RuntimeError(f"could not find all metrics {' '.join(names)}") return result @@ -98,7 +103,8 @@ def histogram(prefix_without_trailing_underscore: str) -> List[str]: PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] 
= ( - "pageserver_remote_timeline_client_calls_unfinished", + "pageserver_remote_timeline_client_calls_started_total", + "pageserver_remote_timeline_client_calls_finished_total", "pageserver_remote_physical_size", "pageserver_remote_timeline_client_bytes_started_total", "pageserver_remote_timeline_client_bytes_finished_total", @@ -127,7 +133,6 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = ( *histogram("pageserver_getpage_get_reconstruct_data_seconds"), *histogram("pageserver_wait_lsn_seconds"), *histogram("pageserver_remote_operation_seconds"), - *histogram("pageserver_remote_timeline_client_calls_started"), *histogram("pageserver_io_operations_seconds"), "pageserver_tenant_states_count", ) diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 6af3b6a912..d4583308ff 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -694,32 +694,33 @@ class PageserverHttpClient(requests.Session, MetricsGetter): }, ).value - def get_remote_timeline_client_metric( + def get_remote_timeline_client_queue_count( self, - metric_name: str, tenant_id: TenantId, timeline_id: TimelineId, file_kind: str, op_kind: str, - ) -> Optional[float]: - metrics = self.get_metrics() - matches = metrics.query_all( - name=metric_name, + ) -> Optional[int]: + metrics = [ + "pageserver_remote_timeline_client_calls_started_total", + "pageserver_remote_timeline_client_calls_finished_total", + ] + res = self.get_metrics_values( + metrics, filter={ "tenant_id": str(tenant_id), "timeline_id": str(timeline_id), "file_kind": str(file_kind), "op_kind": str(op_kind), }, + absence_ok=True, ) - if len(matches) == 0: - value = None - elif len(matches) == 1: - value = matches[0].value - assert value is not None - else: - assert len(matches) < 2, "above filter should uniquely identify metric" - return value + if len(res) != 2: + return None + inc, dec = [res[metric] for metric in metrics] + queue_count = int(inc) - int(dec) + assert queue_count >= 0 + return queue_count def layer_map_info( self, diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 201a34f964..1812eb438d 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -1,5 +1,5 @@ import time -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Tuple, Union from mypy_boto3_s3.type_defs import ( DeleteObjectOutputTypeDef, @@ -221,16 +221,40 @@ def wait_for_upload_queue_empty( ): while True: all_metrics = pageserver_http.get_metrics() - tl = all_metrics.query_all( - "pageserver_remote_timeline_client_calls_unfinished", + started = all_metrics.query_all( + "pageserver_remote_timeline_client_calls_started_total", { "tenant_id": str(tenant_id), "timeline_id": str(timeline_id), }, ) - assert len(tl) > 0 - log.info(f"upload queue for {tenant_id}/{timeline_id}: {tl}") - if all(m.value == 0 for m in tl): + finished = all_metrics.query_all( + "pageserver_remote_timeline_client_calls_finished_total", + { + "tenant_id": str(tenant_id), + "timeline_id": str(timeline_id), + }, + ) + assert len(started) == len(finished) + # this is `started left join finished`; if match, subtracting start from finished, resulting in queue depth + remaining_labels = ["shard_id", "file_kind", "op_kind"] + tl: List[Tuple[Any, float]] = [] + for s in started: + found = False + for f in finished: + if all([s.labels[label] == f.labels[label] for label in remaining_labels]): + assert ( 
+ not found + ), "duplicate match, remaining_labels don't uniquely identify sample" + tl.append((s.labels, int(s.value) - int(f.value))) + found = True + if not found: + tl.append((s.labels, int(s.value))) + assert len(tl) == len(started), "something broken with join logic" + log.info(f"upload queue for {tenant_id}/{timeline_id}:") + for labels, queue_count in tl: + log.info(f" {labels}: {queue_count}") + if all(queue_count == 0 for (_, queue_count) in tl): return time.sleep(0.2) diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 32b4f54fbd..18eba6e1c3 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -274,15 +274,9 @@ def test_remote_storage_upload_queue_retries( wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) def get_queued_count(file_kind, op_kind): - val = client.get_remote_timeline_client_metric( - "pageserver_remote_timeline_client_calls_unfinished", - tenant_id, - timeline_id, - file_kind, - op_kind, + return client.get_remote_timeline_client_queue_count( + tenant_id, timeline_id, file_kind, op_kind ) - assert val is not None, "expecting metric to be present" - return int(val) # create some layers & wait for uploads to finish overwrite_data_and_wait_for_it_to_arrive_at_pageserver("a") @@ -434,7 +428,7 @@ def test_remote_timeline_client_calls_started_metric( assert timeline_id is not None for (file_kind, op_kind), observations in calls_started.items(): val = client.get_metric_value( - name="pageserver_remote_timeline_client_calls_started_count", + name="pageserver_remote_timeline_client_calls_started_total", filter={ "file_kind": str(file_kind), "op_kind": str(op_kind), @@ -537,16 +531,6 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( client = env.pageserver.http_client() - def get_queued_count(file_kind, op_kind): - val = client.get_remote_timeline_client_metric( - "pageserver_remote_timeline_client_calls_unfinished", - tenant_id, - timeline_id, - file_kind, - op_kind, - ) - return int(val) if val is not None else val - endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) client.configure_failpoints(("before-upload-layer", "return")) @@ -580,7 +564,10 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( def assert_compacted_and_uploads_queued(): assert timeline_path.exists() assert len(list(timeline_path.glob("*"))) >= 8 - assert get_queued_count(file_kind="index", op_kind="upload") > 0 + assert ( + get_queued_count(client, tenant_id, timeline_id, file_kind="index", op_kind="upload") + > 0 + ) wait_until(20, 0.1, assert_compacted_and_uploads_queued) @@ -618,7 +605,10 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( assert len(filtered) == 0 # timeline deletion should kill ongoing uploads, so, the metric will be gone - assert get_queued_count(file_kind="index", op_kind="upload") is None + assert ( + get_queued_count(client, tenant_id, timeline_id, file_kind="index", op_kind="upload") + is None + ) # timeline deletion should be unblocking checkpoint ops checkpoint_thread.join(2.0) @@ -919,16 +909,8 @@ def get_queued_count( file_kind: str, op_kind: str, ): - val = client.get_remote_timeline_client_metric( - "pageserver_remote_timeline_client_calls_unfinished", - tenant_id, - timeline_id, - file_kind, - op_kind, - ) - if val is None: - return val - return int(val) + """The most important aspect of this function is shorter name & no return type so asserts are more concise.""" + return 
client.get_remote_timeline_client_queue_count(tenant_id, timeline_id, file_kind, op_kind) def assert_nothing_to_upload( From cbb599f353a7489e18201dbbcf8e7d596f9bfb66 Mon Sep 17 00:00:00 2001 From: Nikita Kalyanov <44959448+nikitakalyanov@users.noreply.github.com> Date: Tue, 20 Feb 2024 19:42:36 +0200 Subject: [PATCH 0225/1571] Add /terminate API (#6745) this is to speed up suspends, see https://github.com/neondatabase/cloud/issues/10284 ## Problem ## Summary of changes ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --- compute_tools/src/bin/compute_ctl.rs | 25 +++++------ compute_tools/src/compute.rs | 16 +++++++ compute_tools/src/http/api.rs | 55 ++++++++++++++++++++++++ compute_tools/src/http/openapi_spec.yaml | 23 ++++++++++ control_plane/src/endpoint.rs | 4 +- libs/compute_api/src/responses.rs | 4 ++ 6 files changed, 114 insertions(+), 13 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index a7e10d0aee..117919786e 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -45,7 +45,6 @@ use std::{thread, time::Duration}; use anyhow::{Context, Result}; use chrono::Utc; use clap::Arg; -use nix::sys::signal::{kill, Signal}; use signal_hook::consts::{SIGQUIT, SIGTERM}; use signal_hook::{consts::SIGINT, iterator::Signals}; use tracing::{error, info}; @@ -53,7 +52,9 @@ use url::Url; use compute_api::responses::ComputeStatus; -use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec, PG_PID, SYNC_SAFEKEEPERS_PID}; +use compute_tools::compute::{ + forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID, +}; use compute_tools::configurator::launch_configurator; use compute_tools::extension_server::get_pg_version; use compute_tools::http::api::launch_http_server; @@ -394,6 +395,15 @@ fn main() -> Result<()> { info!("synced safekeepers at lsn {lsn}"); } + let mut state = compute.state.lock().unwrap(); + if state.status == ComputeStatus::TerminationPending { + state.status = ComputeStatus::Terminated; + compute.state_changed.notify_all(); + // we were asked to terminate gracefully, don't exit to avoid restart + delay_exit = true + } + drop(state); + if let Err(err) = compute.check_for_core_dumps() { error!("error while checking for core dumps: {err:?}"); } @@ -523,16 +533,7 @@ fn cli() -> clap::Command { /// wait for termination which would be easy then. 
fn handle_exit_signal(sig: i32) { info!("received {sig} termination signal"); - let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst); - if ss_pid != 0 { - let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32); - kill(ss_pid, Signal::SIGTERM).ok(); - } - let pg_pid = PG_PID.load(Ordering::SeqCst); - if pg_pid != 0 { - let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32); - kill(pg_pid, Signal::SIGTERM).ok(); - } + forward_termination_signal(); exit(1); } diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 1c5363d048..142bb14fe5 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -28,6 +28,8 @@ use compute_api::responses::{ComputeMetrics, ComputeStatus}; use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec}; use utils::measured_stream::MeasuredReader; +use nix::sys::signal::{kill, Signal}; + use remote_storage::{DownloadError, RemotePath}; use crate::checker::create_availability_check_data; @@ -1322,3 +1324,17 @@ LIMIT 100", Ok(remote_ext_metrics) } } + +pub fn forward_termination_signal() { + let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst); + if ss_pid != 0 { + let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32); + kill(ss_pid, Signal::SIGTERM).ok(); + } + let pg_pid = PG_PID.load(Ordering::SeqCst); + if pg_pid != 0 { + let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32); + // use 'immediate' shutdown (SIGQUIT): https://www.postgresql.org/docs/current/server-shutdown.html + kill(pg_pid, Signal::SIGQUIT).ok(); + } +} diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index fa2c4cff28..f076951239 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -5,6 +5,7 @@ use std::net::SocketAddr; use std::sync::Arc; use std::thread; +use crate::compute::forward_termination_signal; use crate::compute::{ComputeNode, ComputeState, ParsedSpec}; use compute_api::requests::ConfigurationRequest; use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError}; @@ -123,6 +124,17 @@ async fn routes(req: Request, compute: &Arc) -> Response { + info!("serving /terminate POST request"); + match handle_terminate_request(compute).await { + Ok(()) => Response::new(Body::empty()), + Err((msg, code)) => { + error!("error handling /terminate request: {msg}"); + render_json_error(&msg, code) + } + } + } + // download extension files from remote extension storage on demand (&Method::POST, route) if route.starts_with("/extension_server/") => { info!("serving {:?} POST request", route); @@ -297,6 +309,49 @@ fn render_json_error(e: &str, status: StatusCode) -> Response { .unwrap() } +async fn handle_terminate_request(compute: &Arc) -> Result<(), (String, StatusCode)> { + { + let mut state = compute.state.lock().unwrap(); + if state.status == ComputeStatus::Terminated { + return Ok(()); + } + if state.status != ComputeStatus::Empty && state.status != ComputeStatus::Running { + let msg = format!( + "invalid compute status for termination request: {:?}", + state.status.clone() + ); + return Err((msg, StatusCode::PRECONDITION_FAILED)); + } + state.status = ComputeStatus::TerminationPending; + compute.state_changed.notify_all(); + drop(state); + } + forward_termination_signal(); + info!("sent signal and notified waiters"); + + // Spawn a blocking thread to wait for compute to become Terminated. 
+ // This is needed to do not block the main pool of workers and + // be able to serve other requests while some particular request + // is waiting for compute to finish configuration. + let c = compute.clone(); + task::spawn_blocking(move || { + let mut state = c.state.lock().unwrap(); + while state.status != ComputeStatus::Terminated { + state = c.state_changed.wait(state).unwrap(); + info!( + "waiting for compute to become Terminated, current status: {:?}", + state.status + ); + } + + Ok(()) + }) + .await + .unwrap()?; + info!("terminated Postgres"); + Ok(()) +} + // Main Hyper HTTP server function that runs it and blocks waiting on it forever. #[tokio::main] async fn serve(port: u16, state: Arc) { diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index cedc6ece8f..d2ec54299f 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -168,6 +168,29 @@ paths: schema: $ref: "#/components/schemas/GenericError" + /terminate: + post: + tags: + - Terminate + summary: Terminate Postgres and wait for it to exit + description: "" + operationId: terminate + responses: + 200: + description: Result + 412: + description: "wrong state" + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" + 500: + description: "Unexpected error" + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" + components: securitySchemes: JWT: diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index f1fe12e05f..ce8f035dfc 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -652,7 +652,9 @@ impl Endpoint { } ComputeStatus::Empty | ComputeStatus::ConfigurationPending - | ComputeStatus::Configuration => { + | ComputeStatus::Configuration + | ComputeStatus::TerminationPending + | ComputeStatus::Terminated => { bail!("unexpected compute status: {:?}", state.status) } } diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index 92bbf79cd4..fd0c90d447 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -52,6 +52,10 @@ pub enum ComputeStatus { // compute will exit soon or is waiting for // control-plane to terminate it. Failed, + // Termination requested + TerminationPending, + // Terminated Postgres + Terminated, } fn rfc3339_serialize(x: &Option>, s: S) -> Result From fcbe9fb1840b7628fd242eec3bfd0df83535d0f7 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 20 Feb 2024 19:42:54 +0000 Subject: [PATCH 0226/1571] test: adjust checkpoint distance in `test_layer_map` (#6842) https://github.com/neondatabase/neon/commit/587cb705b898565d459d044df84d1ac2633f00bf changed the layer rolling logic to more closely obey the `checkpoint_distance` config. Previously, this test was getting layers significantly larger than the 8K it was asking for. Now the payload in the layers is closer to 8K (which means more layers in total). Tweak the `checkpoint_distance` to get a number of layers more reasonable for this test. Note that we still get more layers than before (~8K vs ~5K). 
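As a rough back-of-the-envelope illustration of why the knob matters (the WAL volume below is an assumed placeholder, not a figure measured from the benchmark):

```
# Sketch: once layers roll close to checkpoint_distance, the layer count is
# roughly total_wal_bytes / checkpoint_distance.
total_wal_bytes = 128 * 1024 * 1024   # hypothetical amount of WAL written by the benchmark
for checkpoint_distance in (8192, 16384):
    approx_layers = total_wal_bytes // checkpoint_distance
    print(f"checkpoint_distance={checkpoint_distance}: ~{approx_layers} layers")
```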
--- test_runner/performance/test_layer_map.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_runner/performance/test_layer_map.py b/test_runner/performance/test_layer_map.py index 6bd0d85fa2..9b20954d45 100644 --- a/test_runner/performance/test_layer_map.py +++ b/test_runner/performance/test_layer_map.py @@ -17,10 +17,10 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark): tenant, _ = env.neon_cli.create_tenant( conf={ "gc_period": "0s", - "checkpoint_distance": "8192", + "checkpoint_distance": "16384", "compaction_period": "1 s", "compaction_threshold": "1", - "compaction_target_size": "8192", + "compaction_target_size": "16384", } ) From 04190a1fea389138f1851630d340030cf73758ef Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 20 Feb 2024 20:45:00 +0000 Subject: [PATCH 0227/1571] CI(test_runner): misc small changes (#6801) ## Problem A set of small changes that are too small to open a separate for each. A notable change is adding `pytest-repeat` plugin, which can help to ensure that a flaky test is fixed by running such a test several times. ## Summary of changes - Update Allure from 2.24.0 to 2.27.0 - Update Ruff from 0.1.11 to 0.2.2 (update `[tool.ruff]` section of `pyproject.toml` for it) - Install pytest-repeat plugin --- .../actions/allure-report-generate/action.yml | 4 +- poetry.lock | 52 ++++++++++++------- pyproject.toml | 17 ++++-- 3 files changed, 48 insertions(+), 25 deletions(-) diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index 79f054cb06..9a0c79a221 100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -76,8 +76,8 @@ runs: rm -f ${ALLURE_ZIP} fi env: - ALLURE_VERSION: 2.24.0 - ALLURE_ZIP_SHA256: 60b1d6ce65d9ef24b23cf9c2c19fd736a123487c38e54759f1ed1a7a77353c90 + ALLURE_VERSION: 2.27.0 + ALLURE_ZIP_SHA256: b071858fb2fa542c65d8f152c5c40d26267b2dfb74df1f1608a589ecca38e777 # Potentially we could have several running build for the same key (for example, for the main branch), so we use improvised lock for this - name: Acquire lock diff --git a/poetry.lock b/poetry.lock index 8e1d713d29..347f0a16a7 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2025,6 +2025,20 @@ pytest = [ {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, ] +[[package]] +name = "pytest-repeat" +version = "0.9.3" +description = "pytest plugin for repeating tests" +optional = false +python-versions = ">=3.7" +files = [ + {file = "pytest_repeat-0.9.3-py3-none-any.whl", hash = "sha256:26ab2df18226af9d5ce441c858f273121e92ff55f5bb311d25755b8d7abdd8ed"}, + {file = "pytest_repeat-0.9.3.tar.gz", hash = "sha256:ffd3836dfcd67bb270bec648b330e20be37d2966448c4148c4092d1e8aba8185"}, +] + +[package.dependencies] +pytest = "*" + [[package]] name = "pytest-rerunfailures" version = "13.0" @@ -2257,28 +2271,28 @@ pyasn1 = ">=0.1.3" [[package]] name = "ruff" -version = "0.1.11" +version = "0.2.2" description = "An extremely fast Python linter and code formatter, written in Rust." 
optional = false python-versions = ">=3.7" files = [ - {file = "ruff-0.1.11-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:a7f772696b4cdc0a3b2e527fc3c7ccc41cdcb98f5c80fdd4f2b8c50eb1458196"}, - {file = "ruff-0.1.11-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:934832f6ed9b34a7d5feea58972635c2039c7a3b434fe5ba2ce015064cb6e955"}, - {file = "ruff-0.1.11-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea0d3e950e394c4b332bcdd112aa566010a9f9c95814844a7468325290aabfd9"}, - {file = "ruff-0.1.11-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9bd4025b9c5b429a48280785a2b71d479798a69f5c2919e7d274c5f4b32c3607"}, - {file = "ruff-0.1.11-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e1ad00662305dcb1e987f5ec214d31f7d6a062cae3e74c1cbccef15afd96611d"}, - {file = "ruff-0.1.11-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:4b077ce83f47dd6bea1991af08b140e8b8339f0ba8cb9b7a484c30ebab18a23f"}, - {file = "ruff-0.1.11-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4a88efecec23c37b11076fe676e15c6cdb1271a38f2b415e381e87fe4517f18"}, - {file = "ruff-0.1.11-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5b25093dad3b055667730a9b491129c42d45e11cdb7043b702e97125bcec48a1"}, - {file = "ruff-0.1.11-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:231d8fb11b2cc7c0366a326a66dafc6ad449d7fcdbc268497ee47e1334f66f77"}, - {file = "ruff-0.1.11-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:09c415716884950080921dd6237767e52e227e397e2008e2bed410117679975b"}, - {file = "ruff-0.1.11-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:0f58948c6d212a6b8d41cd59e349751018797ce1727f961c2fa755ad6208ba45"}, - {file = "ruff-0.1.11-py3-none-musllinux_1_2_i686.whl", hash = "sha256:190a566c8f766c37074d99640cd9ca3da11d8deae2deae7c9505e68a4a30f740"}, - {file = "ruff-0.1.11-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:6464289bd67b2344d2a5d9158d5eb81025258f169e69a46b741b396ffb0cda95"}, - {file = "ruff-0.1.11-py3-none-win32.whl", hash = "sha256:9b8f397902f92bc2e70fb6bebfa2139008dc72ae5177e66c383fa5426cb0bf2c"}, - {file = "ruff-0.1.11-py3-none-win_amd64.whl", hash = "sha256:eb85ee287b11f901037a6683b2374bb0ec82928c5cbc984f575d0437979c521a"}, - {file = "ruff-0.1.11-py3-none-win_arm64.whl", hash = "sha256:97ce4d752f964ba559c7023a86e5f8e97f026d511e48013987623915431c7ea9"}, - {file = "ruff-0.1.11.tar.gz", hash = "sha256:f9d4d88cb6eeb4dfe20f9f0519bd2eaba8119bde87c3d5065c541dbae2b5a2cb"}, + {file = "ruff-0.2.2-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:0a9efb032855ffb3c21f6405751d5e147b0c6b631e3ca3f6b20f917572b97eb6"}, + {file = "ruff-0.2.2-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:d450b7fbff85913f866a5384d8912710936e2b96da74541c82c1b458472ddb39"}, + {file = "ruff-0.2.2-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ecd46e3106850a5c26aee114e562c329f9a1fbe9e4821b008c4404f64ff9ce73"}, + {file = "ruff-0.2.2-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5e22676a5b875bd72acd3d11d5fa9075d3a5f53b877fe7b4793e4673499318ba"}, + {file = "ruff-0.2.2-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1695700d1e25a99d28f7a1636d85bafcc5030bba9d0578c0781ba1790dbcf51c"}, + {file = "ruff-0.2.2-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = 
"sha256:b0c232af3d0bd8f521806223723456ffebf8e323bd1e4e82b0befb20ba18388e"}, + {file = "ruff-0.2.2-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f63d96494eeec2fc70d909393bcd76c69f35334cdbd9e20d089fb3f0640216ca"}, + {file = "ruff-0.2.2-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6a61ea0ff048e06de273b2e45bd72629f470f5da8f71daf09fe481278b175001"}, + {file = "ruff-0.2.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5e1439c8f407e4f356470e54cdecdca1bd5439a0673792dbe34a2b0a551a2fe3"}, + {file = "ruff-0.2.2-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:940de32dc8853eba0f67f7198b3e79bc6ba95c2edbfdfac2144c8235114d6726"}, + {file = "ruff-0.2.2-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:0c126da55c38dd917621552ab430213bdb3273bb10ddb67bc4b761989210eb6e"}, + {file = "ruff-0.2.2-py3-none-musllinux_1_2_i686.whl", hash = "sha256:3b65494f7e4bed2e74110dac1f0d17dc8e1f42faaa784e7c58a98e335ec83d7e"}, + {file = "ruff-0.2.2-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:1ec49be4fe6ddac0503833f3ed8930528e26d1e60ad35c2446da372d16651ce9"}, + {file = "ruff-0.2.2-py3-none-win32.whl", hash = "sha256:d920499b576f6c68295bc04e7b17b6544d9d05f196bb3aac4358792ef6f34325"}, + {file = "ruff-0.2.2-py3-none-win_amd64.whl", hash = "sha256:cc9a91ae137d687f43a44c900e5d95e9617cb37d4c989e462980ba27039d239d"}, + {file = "ruff-0.2.2-py3-none-win_arm64.whl", hash = "sha256:c9d15fc41e6054bfc7200478720570078f0b41c9ae4f010bcc16bd6f4d1aacdd"}, + {file = "ruff-0.2.2.tar.gz", hash = "sha256:e62ed7f36b3068a30ba39193a14274cd706bc486fad521276458022f7bccb31d"}, ] [[package]] @@ -2794,4 +2808,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "cab9cf8cbf8dcd52022acfdabfae4778be3ed5a4afda832bd9c074a50c746763" +content-hash = "af9d5b45310c12411bfe67cb9677d2236808d0780ca1bd81525d2763a928f7f9" diff --git a/pyproject.toml b/pyproject.toml index b498f8acce..6dff112a5e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,17 +39,21 @@ types-pytest-lazy-fixture = "^0.6.3.3" pytest-split = "^0.8.1" zstandard = "^0.21.0" httpx = {extras = ["http2"], version = "^0.26.0"} +pytest-repeat = "^0.9.3" [tool.poetry.group.dev.dependencies] mypy = "==1.3.0" -ruff = "^0.1.11" +ruff = "^0.2.2" [build-system] requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" [tool.mypy] -exclude = "^vendor/" +exclude = [ + "^vendor/", + "^target/", +] check_untyped_defs = true # Help mypy find imports when running against list of individual files. # Without this line it would behave differently when executed on the entire project. 
@@ -73,7 +77,13 @@ ignore_missing_imports = true [tool.ruff] target-version = "py39" -extend-exclude = ["vendor/"] +extend-exclude = [ + "vendor/", + "target/", +] +line-length = 100 # this setting is rather guidance, it won't fail if it can't make the shorter + +[tool.ruff.lint] ignore = [ "E501", # Line too long, we don't want to be too strict about it ] @@ -84,4 +94,3 @@ select = [ "W", # pycodestyle "B", # bugbear ] -line-length = 100 # this setting is rather guidance, it won't fail if it can't make the shorter From 3882f570016b21dc264418a32e51cc05536c3238 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Tue, 20 Feb 2024 19:20:42 -0500 Subject: [PATCH 0228/1571] neon_local: add flag to create test user and database (#6848) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This pull request adds two flags: `--update-catalog true` for `endpoint create`, and `--create-test-user true` for `endpoint start`. The former enables catalog updates for neon_superuser permission and many other things, while the latter adds the user `test` and the database `neondb` when setting up the database. A combination of these two flags will create a Postgres similar to the production environment so that it would be easier for us to test if extensions behave correctly when added to Neon Postgres. Example output: ``` ❯ cargo neon endpoint start main --create-test-user true Finished dev [unoptimized + debuginfo] target(s) in 0.22s Running `target/debug/neon_local endpoint start main --create-test-user true` Starting existing endpoint main... Starting postgres node at 'postgresql://cloud_admin@127.0.0.1:55432/postgres' Also at 'postgresql://user@127.0.0.1:55432/neondb' ``` --------- Signed-off-by: Alex Chi Z --- control_plane/src/bin/neon_local.rs | 31 +++++++++++++++++++ control_plane/src/endpoint.rs | 47 +++++++++++++++++++++++------ 2 files changed, 69 insertions(+), 9 deletions(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index a155e9ebb2..5c0d008943 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -652,6 +652,10 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local let name = import_match .get_one::("node-name") .ok_or_else(|| anyhow!("No node name provided"))?; + let update_catalog = import_match + .get_one::("update-catalog") + .cloned() + .unwrap_or_default(); // Parse base inputs let base_tarfile = import_match @@ -694,6 +698,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local None, pg_version, ComputeMode::Primary, + !update_catalog, )?; println!("Done"); } @@ -831,6 +836,10 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re .get_one::("endpoint_id") .map(String::to_string) .unwrap_or_else(|| format!("ep-{branch_name}")); + let update_catalog = sub_args + .get_one::("update-catalog") + .cloned() + .unwrap_or_default(); let lsn = sub_args .get_one::("lsn") @@ -880,6 +889,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re http_port, pg_version, mode, + !update_catalog, )?; } "start" => { @@ -918,6 +928,11 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re .get(endpoint_id.as_str()) .ok_or_else(|| anyhow::anyhow!("endpoint {endpoint_id} not found"))?; + let create_test_user = sub_args + .get_one::("create-test-user") + .cloned() + .unwrap_or_default(); + cplane.check_conflicting_endpoints( endpoint.mode, 
endpoint.tenant_id, @@ -972,6 +987,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re pageservers, remote_ext_config, stripe_size.0 as usize, + create_test_user, ) .await?; } @@ -1457,6 +1473,18 @@ fn cli() -> Command { .required(false) .default_value("1"); + let update_catalog = Arg::new("update-catalog") + .value_parser(value_parser!(bool)) + .long("update-catalog") + .help("If set, will set up the catalog for neon_superuser") + .required(false); + + let create_test_user = Arg::new("create-test-user") + .value_parser(value_parser!(bool)) + .long("create-test-user") + .help("If set, will create test user `user` and `neondb` database. Requires `update-catalog = true`") + .required(false); + Command::new("Neon CLI") .arg_required_else_help(true) .version(GIT_VERSION) @@ -1517,6 +1545,7 @@ fn cli() -> Command { .arg(Arg::new("end-lsn").long("end-lsn") .help("Lsn the basebackup ends at")) .arg(pg_version_arg.clone()) + .arg(update_catalog.clone()) ) ).subcommand( Command::new("tenant") @@ -1630,6 +1659,7 @@ fn cli() -> Command { .required(false)) .arg(pg_version_arg.clone()) .arg(hot_standby_arg.clone()) + .arg(update_catalog) ) .subcommand(Command::new("start") .about("Start postgres.\n If the endpoint doesn't exist yet, it is created.") @@ -1637,6 +1667,7 @@ fn cli() -> Command { .arg(endpoint_pageserver_id_arg.clone()) .arg(safekeepers_arg) .arg(remote_ext_config_args) + .arg(create_test_user) ) .subcommand(Command::new("reconfigure") .about("Reconfigure the endpoint") diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index ce8f035dfc..bab7a70ce7 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -41,11 +41,15 @@ use std::net::SocketAddr; use std::net::TcpStream; use std::path::PathBuf; use std::process::Command; +use std::str::FromStr; use std::sync::Arc; use std::time::Duration; use anyhow::{anyhow, bail, Context, Result}; +use compute_api::spec::Database; +use compute_api::spec::PgIdent; use compute_api::spec::RemoteExtSpec; +use compute_api::spec::Role; use nix::sys::signal::kill; use nix::sys::signal::Signal; use serde::{Deserialize, Serialize}; @@ -122,6 +126,7 @@ impl ComputeControlPlane { http_port: Option, pg_version: u32, mode: ComputeMode, + skip_pg_catalog_updates: bool, ) -> Result> { let pg_port = pg_port.unwrap_or_else(|| self.get_port()); let http_port = http_port.unwrap_or_else(|| self.get_port() + 1); @@ -140,7 +145,7 @@ impl ComputeControlPlane { // before and after start are the same. So, skip catalog updates, // with this we basically test a case of waking up an idle compute, where // we also skip catalog updates in the cloud. 
- skip_pg_catalog_updates: true, + skip_pg_catalog_updates, features: vec![], }); @@ -155,7 +160,7 @@ impl ComputeControlPlane { http_port, pg_port, pg_version, - skip_pg_catalog_updates: true, + skip_pg_catalog_updates, features: vec![], })?, )?; @@ -500,6 +505,7 @@ impl Endpoint { pageservers: Vec<(Host, u16)>, remote_ext_config: Option<&String>, shard_stripe_size: usize, + create_test_user: bool, ) -> Result<()> { if self.status() == EndpointStatus::Running { anyhow::bail!("The endpoint is already running"); @@ -551,8 +557,26 @@ impl Endpoint { cluster_id: None, // project ID: not used name: None, // project name: not used state: None, - roles: vec![], - databases: vec![], + roles: if create_test_user { + vec![Role { + name: PgIdent::from_str("test").unwrap(), + encrypted_password: None, + options: None, + }] + } else { + Vec::new() + }, + databases: if create_test_user { + vec![Database { + name: PgIdent::from_str("neondb").unwrap(), + owner: PgIdent::from_str("test").unwrap(), + options: None, + restrict_conn: false, + invalid: false, + }] + } else { + Vec::new() + }, settings: None, postgresql_conf: Some(postgresql_conf), }, @@ -577,11 +601,16 @@ impl Endpoint { .open(self.endpoint_path().join("compute.log"))?; // Launch compute_ctl - println!("Starting postgres node at '{}'", self.connstr()); + let conn_str = self.connstr("cloud_admin", "postgres"); + println!("Starting postgres node at '{}'", conn_str); + if create_test_user { + let conn_str = self.connstr("user", "neondb"); + println!("Also at '{}'", conn_str); + } let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl")); cmd.args(["--http-port", &self.http_address.port().to_string()]) .args(["--pgdata", self.pgdata().to_str().unwrap()]) - .args(["--connstr", &self.connstr()]) + .args(["--connstr", &conn_str]) .args([ "--spec-path", self.endpoint_path().join("spec.json").to_str().unwrap(), @@ -785,13 +814,13 @@ impl Endpoint { Ok(()) } - pub fn connstr(&self) -> String { + pub fn connstr(&self, user: &str, db_name: &str) -> String { format!( "postgresql://{}@{}:{}/{}", - "cloud_admin", + user, self.pg_address.ip(), self.pg_address.port(), - "postgres" + db_name ) } } From 5d6083bfc61701877be2ae8b9d9d726a4d0e773b Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 21 Feb 2024 09:49:46 +0000 Subject: [PATCH 0229/1571] pageserver: add vectored get implementation (#6576) This PR introduces a new vectored implementation of the read path. The search is basically a DFS if you squint at it long enough. LayerFringe tracks the next layers to visit and acts as our stack. Vertices are tuples of (layer, keyspace, lsn range). Continuously pop the top of the stack (most recent layer) and do all the reads for one layer at once. The search maintains a fringe (`LayerFringe`) which tracks all the layers that intersect the current keyspace being searched. Continuously pop the top of the fringe (layer with highest LSN) and get all the data required from the layer in one go. Said search is done on one timeline at a time. If data is still required for some keys, then search the ancestor timeline. Apart from the high level layer traversal, vectored variants have been introduced for grabbing data from each layer type. They still suffer from read amplification issues and that will be addressed in a different PR. You might notice that in some places we duplicate the code for the existing read path. All of that code will be removed when we switch the non-vectored read path to proxy into the vectored read path. 
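For illustration only, the traversal described above can be modelled with a small self-contained sketch. Everything here is a toy stand-in (integer `Key`/`Lsn`, image-only values, no LSN floors, no WAL redo) and none of it is the actual pageserver API; see the real `LayerFringe` and `Timeline` code in the diff below:

```rust
use std::collections::{BTreeSet, HashMap};

type Key = u64;
type Lsn = u64;

struct Layer {
    end_lsn: Lsn,               // layers are visited in descending end_lsn order
    keys: HashMap<Key, String>, // payload this layer can serve
}

struct Timeline {
    layers: Vec<Layer>,
    ancestor: Option<Box<Timeline>>,
}

/// Resolve `keyspace` on a timeline, descending to the ancestor for whatever
/// is still unresolved: pop the layer with the highest LSN that intersects the
/// remaining keyspace, do all its reads at once, repeat.
fn get_vectored(timeline: &Timeline, mut keyspace: BTreeSet<Key>) -> HashMap<Key, String> {
    let mut resolved = HashMap::new();
    let mut current = Some(timeline);

    while let Some(tl) = current {
        // Fringe: every layer of this timeline that still intersects the
        // remaining keyspace, most recent (highest end_lsn) first.
        let mut fringe: Vec<&Layer> = tl
            .layers
            .iter()
            .filter(|l| l.keys.keys().any(|k| keyspace.contains(k)))
            .collect();
        fringe.sort_by_key(|l| std::cmp::Reverse(l.end_lsn));

        for layer in fringe {
            if keyspace.is_empty() {
                break;
            }
            // Do all the reads this layer can serve in one go.
            for key in keyspace.clone() {
                if let Some(val) = layer.keys.get(&key) {
                    resolved.insert(key, val.clone());
                    keyspace.remove(&key);
                }
            }
        }

        if keyspace.is_empty() {
            break;
        }
        // Keys still unresolved on this timeline: continue on the ancestor.
        current = tl.ancestor.as_deref();
    }

    resolved
}

fn main() {
    let ancestor = Timeline {
        layers: vec![Layer {
            end_lsn: 10,
            keys: HashMap::from([(1, "a@10".into()), (2, "b@10".into())]),
        }],
        ancestor: None,
    };
    let child = Timeline {
        layers: vec![Layer {
            end_lsn: 20,
            keys: HashMap::from([(2, "b@20".into())]),
        }],
        ancestor: Some(Box::new(ancestor)),
    };
    // Key 2 resolves on the child (newest layer); key 1 falls through to the ancestor.
    let out = get_vectored(&child, BTreeSet::from([1, 2]));
    assert_eq!(out[&1], "a@10");
    assert_eq!(out[&2], "b@20");
}
```

The real implementation additionally accumulates WAL records per key until an image (or the cached LSN) is reached, and rebuilds the fringe as the search descends through the layer map.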
In the meantime, we'll have to contend with the extra cruft for the sake of testing and gentle releasing. --- .github/workflows/build_and_test.yml | 1 + Cargo.lock | 1 + libs/pageserver_api/Cargo.toml | 1 + libs/pageserver_api/src/keyspace.rs | 52 ++- pageserver/src/basebackup.rs | 5 +- pageserver/src/config.rs | 24 ++ pageserver/src/tenant.rs | 175 +++++++-- pageserver/src/tenant/layer_map.rs | 118 ++++-- pageserver/src/tenant/storage_layer.rs | 282 +++++++++++++ .../src/tenant/storage_layer/delta_layer.rs | 139 ++++++- .../src/tenant/storage_layer/image_layer.rs | 78 +++- .../tenant/storage_layer/inmemory_layer.rs | 100 ++++- pageserver/src/tenant/storage_layer/layer.rs | 51 ++- .../src/tenant/storage_layer/layer_desc.rs | 2 +- pageserver/src/tenant/timeline.rs | 371 ++++++++++++++++-- test_runner/fixtures/neon_fixtures.py | 7 + test_runner/regress/test_compatibility.py | 4 + 17 files changed, 1284 insertions(+), 127 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 2a1c79e437..1744616888 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -472,6 +472,7 @@ jobs: CHECK_ONDISK_DATA_COMPATIBILITY: nonempty BUILD_TAG: ${{ needs.tag.outputs.build-tag }} PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs + PAGESERVER_GET_VECTORED_IMPL: vectored # Temporary disable this step until we figure out why it's so flaky # Ref https://github.com/neondatabase/neon/issues/4540 diff --git a/Cargo.lock b/Cargo.lock index f25e3d1574..ac8cceb5f6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3552,6 +3552,7 @@ dependencies = [ "enum-map", "hex", "humantime-serde", + "itertools", "postgres_ffi", "rand 0.8.5", "serde", diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index 902af21965..938910caea 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -21,6 +21,7 @@ hex.workspace = true thiserror.workspace = true humantime-serde.workspace = true chrono.workspace = true +itertools.workspace = true workspace_hack.workspace = true diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index 396c801606..443ffdcf03 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -2,6 +2,7 @@ use postgres_ffi::BLCKSZ; use std::ops::Range; use crate::key::Key; +use itertools::Itertools; /// /// Represents a set of Keys, in a compact form. @@ -63,9 +64,36 @@ impl KeySpace { KeyPartitioning { parts } } - /// Update the keyspace such that it doesn't contain any range - /// that is overlapping with `other`. This can involve splitting or - /// removing of existing ranges. + /// Merge another keyspace into the current one. + /// Note: the keyspaces must not ovelap (enforced via assertions) + pub fn merge(&mut self, other: &KeySpace) { + let all_ranges = self + .ranges + .iter() + .merge_by(other.ranges.iter(), |lhs, rhs| lhs.start < rhs.start); + + let mut accum = KeySpaceAccum::new(); + let mut prev: Option<&Range> = None; + for range in all_ranges { + if let Some(prev) = prev { + let overlap = + std::cmp::max(range.start, prev.start) < std::cmp::min(range.end, prev.end); + assert!( + !overlap, + "Attempt to merge ovelapping keyspaces: {:?} overlaps {:?}", + prev, range + ); + } + + accum.add_range(range.clone()); + prev = Some(range); + } + + self.ranges = accum.to_keyspace().ranges; + } + + /// Remove all keys in `other` from `self`. + /// This can involve splitting or removing of existing ranges. 
pub fn remove_overlapping_with(&mut self, other: &KeySpace) { let (self_start, self_end) = match (self.start(), self.end()) { (Some(start), Some(end)) => (start, end), @@ -220,16 +248,7 @@ impl KeySpaceAccum { } pub fn consume_keyspace(&mut self) -> KeySpace { - if let Some(accum) = self.accum.take() { - self.ranges.push(accum); - } - - let mut prev_accum = KeySpaceAccum::new(); - std::mem::swap(self, &mut prev_accum); - - KeySpace { - ranges: prev_accum.ranges, - } + std::mem::take(self).to_keyspace() } pub fn size(&self) -> u64 { @@ -279,6 +298,13 @@ impl KeySpaceRandomAccum { } KeySpace { ranges } } + + pub fn consume_keyspace(&mut self) -> KeySpace { + let mut prev_accum = KeySpaceRandomAccum::new(); + std::mem::swap(self, &mut prev_accum); + + prev_accum.to_keyspace() + } } pub fn key_range_size(key_range: &Range) -> u32 { diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 7edfab75d4..c862816b80 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -261,10 +261,7 @@ where let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar); for part in slru_partitions.parts { - let blocks = self - .timeline - .get_vectored(&part.ranges, self.lsn, self.ctx) - .await?; + let blocks = self.timeline.get_vectored(part, self.lsn, self.ctx).await?; for (key, block) in blocks { slru_builder.add_block(&key, block?).await?; diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 6d71ff1dd4..6c00c55f39 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -33,6 +33,7 @@ use utils::{ use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig; use crate::tenant::config::TenantConf; use crate::tenant::config::TenantConfOpt; +use crate::tenant::timeline::GetVectoredImpl; use crate::tenant::{ TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME, }; @@ -84,6 +85,8 @@ pub mod defaults { pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs"; + pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential"; + /// /// Default built-in configuration file. 
/// @@ -121,6 +124,8 @@ pub mod defaults { #virtual_file_io_engine = '{DEFAULT_VIRTUAL_FILE_IO_ENGINE}' +#get_vectored_impl = '{DEFAULT_GET_VECTORED_IMPL}' + [tenant_config] #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} @@ -256,6 +261,8 @@ pub struct PageServerConf { pub ingest_batch_size: u64, pub virtual_file_io_engine: virtual_file::IoEngineKind, + + pub get_vectored_impl: GetVectoredImpl, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -342,6 +349,8 @@ struct PageServerConfigBuilder { ingest_batch_size: BuilderValue, virtual_file_io_engine: BuilderValue, + + get_vectored_impl: BuilderValue, } impl Default for PageServerConfigBuilder { @@ -419,6 +428,8 @@ impl Default for PageServerConfigBuilder { ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE), virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()), + + get_vectored_impl: Set(DEFAULT_GET_VECTORED_IMPL.parse().unwrap()), } } } @@ -579,6 +590,10 @@ impl PageServerConfigBuilder { self.virtual_file_io_engine = BuilderValue::Set(value); } + pub fn get_vectored_impl(&mut self, value: GetVectoredImpl) { + self.get_vectored_impl = BuilderValue::Set(value); + } + pub fn build(self) -> anyhow::Result { let concurrent_tenant_warmup = self .concurrent_tenant_warmup @@ -689,6 +704,9 @@ impl PageServerConfigBuilder { virtual_file_io_engine: self .virtual_file_io_engine .ok_or(anyhow!("missing virtual_file_io_engine"))?, + get_vectored_impl: self + .get_vectored_impl + .ok_or(anyhow!("missing get_vectored_impl"))?, }) } } @@ -943,6 +961,9 @@ impl PageServerConf { "virtual_file_io_engine" => { builder.virtual_file_io_engine(parse_toml_from_str("virtual_file_io_engine", item)?) } + "get_vectored_impl" => { + builder.get_vectored_impl(parse_toml_from_str("get_vectored_impl", item)?) 
+ } _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -1017,6 +1038,7 @@ impl PageServerConf { secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), + get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(), } } } @@ -1250,6 +1272,7 @@ background_task_maximum_delay = '334 s' secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), + get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(), }, "Correct defaults should be used when no config values are provided" ); @@ -1314,6 +1337,7 @@ background_task_maximum_delay = '334 s' secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, ingest_batch_size: 100, virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), + get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(), }, "Should be able to parse all basic config values correctly" ); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index c646e5cf90..7021921b12 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3877,6 +3877,7 @@ mod tests { use bytes::BytesMut; use hex_literal::hex; use once_cell::sync::Lazy; + use pageserver_api::keyspace::KeySpace; use rand::{thread_rng, Rng}; use tokio_util::sync::CancellationToken; @@ -4514,6 +4515,61 @@ mod tests { Ok(()) } + async fn bulk_insert_compact_gc( + timeline: Arc, + ctx: &RequestContext, + mut lsn: Lsn, + repeat: usize, + key_count: usize, + ) -> anyhow::Result<()> { + let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); + let mut blknum = 0; + + // Enforce that key range is monotonously increasing + let mut keyspace = KeySpaceAccum::new(); + + for _ in 0..repeat { + for _ in 0..key_count { + test_key.field6 = blknum; + let mut writer = timeline.writer().await; + writer + .put( + test_key, + lsn, + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + + keyspace.add_key(test_key); + + lsn = Lsn(lsn.0 + 0x10); + blknum += 1; + } + + let cutoff = timeline.get_last_record_lsn(); + + timeline + .update_gc_info( + Vec::new(), + cutoff, + Duration::ZERO, + &CancellationToken::new(), + ctx, + ) + .await?; + timeline.freeze_and_flush().await?; + timeline + .compact(&CancellationToken::new(), EnumSet::empty(), ctx) + .await?; + timeline.gc().await?; + } + + Ok(()) + } + // // Insert 1000 key-value pairs with increasing keys, flush, compact, GC. // Repeat 50 times. 
@@ -4526,49 +4582,98 @@ mod tests { .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; - let mut lsn = Lsn(0x10); + let lsn = Lsn(0x10); + bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?; - let mut keyspace = KeySpaceAccum::new(); + Ok(()) + } - let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); - let mut blknum = 0; - for _ in 0..50 { - for _ in 0..10000 { - test_key.field6 = blknum; - let mut writer = tline.writer().await; - writer - .put( - test_key, - lsn, - &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), - &ctx, - ) - .await?; - writer.finish_write(lsn); - drop(writer); + // Test the vectored get real implementation against a simple sequential implementation. + // + // The test generates a keyspace by repeatedly flushing the in-memory layer and compacting. + // Projected to 2D the key space looks like below. Lsn grows upwards on the Y axis and keys + // grow to the right on the X axis. + // [Delta] + // [Delta] + // [Delta] + // [Delta] + // ------------ Image --------------- + // + // After layer generation we pick the ranges to query as follows: + // 1. The beginning of each delta layer + // 2. At the seam between two adjacent delta layers + // + // There's one major downside to this test: delta layers only contains images, + // so the search can stop at the first delta layer and doesn't traverse any deeper. + #[tokio::test] + async fn test_get_vectored() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_get_vectored")?; + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) + .await?; - keyspace.add_key(test_key); + let lsn = Lsn(0x10); + bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?; - lsn = Lsn(lsn.0 + 0x10); - blknum += 1; + let guard = tline.layers.read().await; + guard.layer_map().dump(true, &ctx).await?; + + let mut reads = Vec::new(); + let mut prev = None; + guard.layer_map().iter_historic_layers().for_each(|desc| { + if !desc.is_delta() { + prev = Some(desc.clone()); + return; } - let cutoff = tline.get_last_record_lsn(); + let start = desc.key_range.start; + let end = desc + .key_range + .start + .add(Timeline::MAX_GET_VECTORED_KEYS.try_into().unwrap()); + reads.push(KeySpace { + ranges: vec![start..end], + }); + if let Some(prev) = &prev { + if !prev.is_delta() { + return; + } + + let first_range = Key { + field6: prev.key_range.end.field6 - 4, + ..prev.key_range.end + }..prev.key_range.end; + + let second_range = desc.key_range.start..Key { + field6: desc.key_range.start.field6 + 4, + ..desc.key_range.start + }; + + reads.push(KeySpace { + ranges: vec![first_range, second_range], + }); + }; + + prev = Some(desc.clone()); + }); + + drop(guard); + + // Pick a big LSN such that we query over all the changes. + // Technically, u64::MAX - 1 is the largest LSN supported by the read path, + // but there seems to be a bug on the non-vectored search path which surfaces + // in that case. 
+ let reads_lsn = Lsn(u64::MAX - 1000); + + for read in reads { + info!("Doing vectored read on {:?}", read); + + let vectored_res = tline.get_vectored_impl(read.clone(), reads_lsn, &ctx).await; tline - .update_gc_info( - Vec::new(), - cutoff, - Duration::ZERO, - &CancellationToken::new(), - &ctx, - ) - .await?; - tline.freeze_and_flush().await?; - tline - .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) - .await?; - tline.gc().await?; + .validate_get_vectored_impl(&vectored_res, read, reads_lsn, &ctx) + .await; } Ok(()) diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index bb52e586d1..5f4814cc6b 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -52,8 +52,7 @@ use crate::repository::Key; use crate::tenant::storage_layer::InMemoryLayer; use anyhow::Result; use pageserver_api::keyspace::KeySpaceAccum; -use std::cmp::Ordering; -use std::collections::{BTreeMap, VecDeque}; +use std::collections::{HashMap, VecDeque}; use std::iter::Peekable; use std::ops::Range; use std::sync::Arc; @@ -147,43 +146,28 @@ impl Drop for BatchedUpdates<'_> { } /// Return value of LayerMap::search -#[derive(Eq, PartialEq, Debug)] +#[derive(Eq, PartialEq, Debug, Hash)] pub struct SearchResult { pub layer: Arc, pub lsn_floor: Lsn, } -pub struct OrderedSearchResult(SearchResult); - -impl Ord for OrderedSearchResult { - fn cmp(&self, other: &Self) -> Ordering { - self.0.lsn_floor.cmp(&other.0.lsn_floor) - } -} - -impl PartialOrd for OrderedSearchResult { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl PartialEq for OrderedSearchResult { - fn eq(&self, other: &Self) -> bool { - self.0.lsn_floor == other.0.lsn_floor - } -} - -impl Eq for OrderedSearchResult {} - +/// Return value of [`LayerMap::range_search`] +/// +/// Contains a mapping from a layer description to a keyspace +/// accumulator that contains all the keys which intersect the layer +/// from the original search space. Keys that were not found are accumulated +/// in a separate key space accumulator. +#[derive(Debug)] pub struct RangeSearchResult { - pub found: BTreeMap, + pub found: HashMap, pub not_found: KeySpaceAccum, } impl RangeSearchResult { fn new() -> Self { Self { - found: BTreeMap::new(), + found: HashMap::new(), not_found: KeySpaceAccum::new(), } } @@ -314,7 +298,7 @@ where Some(search_result) => self .result .found - .entry(OrderedSearchResult(search_result)) + .entry(search_result) .or_default() .add_range(covered_range), None => self.pad_range(covered_range), @@ -362,6 +346,35 @@ where } } +#[derive(PartialEq, Eq, Hash, Debug, Clone)] +pub enum InMemoryLayerHandle { + Open { + lsn_floor: Lsn, + end_lsn: Lsn, + }, + Frozen { + idx: usize, + lsn_floor: Lsn, + end_lsn: Lsn, + }, +} + +impl InMemoryLayerHandle { + pub fn get_lsn_floor(&self) -> Lsn { + match self { + InMemoryLayerHandle::Open { lsn_floor, .. } => *lsn_floor, + InMemoryLayerHandle::Frozen { lsn_floor, .. } => *lsn_floor, + } + } + + pub fn get_end_lsn(&self) -> Lsn { + match self { + InMemoryLayerHandle::Open { end_lsn, .. } => *end_lsn, + InMemoryLayerHandle::Frozen { end_lsn, .. } => *end_lsn, + } + } +} + impl LayerMap { /// /// Find the latest layer (by lsn.end) that covers the given @@ -556,6 +569,43 @@ impl LayerMap { self.historic.iter() } + /// Get a handle for the first in memory layer that matches the provided predicate. + /// The handle should be used with [`Self::get_in_memory_layer`] to retrieve the actual layer. 
+ /// + /// Note: [`Self::find_in_memory_layer`] and [`Self::get_in_memory_layer`] should be called during + /// the same exclusive region established by holding the layer manager lock. + pub fn find_in_memory_layer(&self, mut pred: Pred) -> Option + where + Pred: FnMut(&Arc) -> bool, + { + if let Some(open) = &self.open_layer { + if pred(open) { + return Some(InMemoryLayerHandle::Open { + lsn_floor: open.get_lsn_range().start, + end_lsn: open.get_lsn_range().end, + }); + } + } + + let pos = self.frozen_layers.iter().rev().position(pred); + pos.map(|rev_idx| { + let idx = self.frozen_layers.len() - 1 - rev_idx; + InMemoryLayerHandle::Frozen { + idx, + lsn_floor: self.frozen_layers[idx].get_lsn_range().start, + end_lsn: self.frozen_layers[idx].get_lsn_range().end, + } + }) + } + + /// Get the layer pointed to by the provided handle. + pub fn get_in_memory_layer(&self, handle: &InMemoryLayerHandle) -> Option> { + match handle { + InMemoryLayerHandle::Open { .. } => self.open_layer.clone(), + InMemoryLayerHandle::Frozen { idx, .. } => self.frozen_layers.get(*idx).cloned(), + } + } + /// /// Divide the whole given range of keys into sub-ranges based on the latest /// image layer that covers each range at the specified lsn (inclusive). @@ -869,6 +919,8 @@ impl LayerMap { #[cfg(test)] mod tests { + use pageserver_api::keyspace::KeySpace; + use super::*; #[derive(Clone)] @@ -895,15 +947,15 @@ mod tests { fn assert_range_search_result_eq(lhs: RangeSearchResult, rhs: RangeSearchResult) { assert_eq!(lhs.not_found.to_keyspace(), rhs.not_found.to_keyspace()); - let lhs: Vec<_> = lhs + let lhs: HashMap = lhs .found .into_iter() - .map(|(search_result, accum)| (search_result.0, accum.to_keyspace())) + .map(|(search_result, accum)| (search_result, accum.to_keyspace())) .collect(); - let rhs: Vec<_> = rhs + let rhs: HashMap = rhs .found .into_iter() - .map(|(search_result, accum)| (search_result.0, accum.to_keyspace())) + .map(|(search_result, accum)| (search_result, accum.to_keyspace())) .collect(); assert_eq!(lhs, rhs); @@ -923,7 +975,7 @@ mod tests { Some(res) => { range_search_result .found - .entry(OrderedSearchResult(res)) + .entry(res) .or_default() .add_key(key); } diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 2d92baccbe..73c018db31 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -8,15 +8,21 @@ pub(crate) mod layer; mod layer_desc; use crate::context::{AccessStatsBehavior, RequestContext}; +use crate::repository::Value; use crate::task_mgr::TaskKind; use crate::walrecord::NeonWalRecord; use bytes::Bytes; use enum_map::EnumMap; use enumset::EnumSet; use once_cell::sync::Lazy; +use pageserver_api::key::Key; +use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; use pageserver_api::models::{ LayerAccessKind, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus, }; +use std::cmp::{Ordering, Reverse}; +use std::collections::hash_map::Entry; +use std::collections::{BinaryHeap, HashMap}; use std::ops::Range; use std::sync::Mutex; use std::time::{Duration, SystemTime, UNIX_EPOCH}; @@ -34,6 +40,11 @@ pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey}; pub(crate) use layer::{EvictionError, Layer, ResidentLayer}; +use super::layer_map::InMemoryLayerHandle; +use super::timeline::layer_manager::LayerManager; +use super::timeline::GetVectoredError; +use super::PageReconstructError; + pub fn range_overlaps(a: &Range, b: &Range) -> bool where T: PartialOrd, @@ -67,6 +78,277 
@@ pub struct ValueReconstructState { pub img: Option<(Lsn, Bytes)>, } +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +pub(crate) enum ValueReconstructSituation { + Complete, + #[default] + Continue, +} + +/// Reconstruct data accumulated for a single key during a vectored get +#[derive(Debug, Default, Clone)] +pub(crate) struct VectoredValueReconstructState { + pub(crate) records: Vec<(Lsn, NeonWalRecord)>, + pub(crate) img: Option<(Lsn, Bytes)>, + + situation: ValueReconstructSituation, +} + +impl VectoredValueReconstructState { + fn get_cached_lsn(&self) -> Option { + self.img.as_ref().map(|img| img.0) + } +} + +impl From for ValueReconstructState { + fn from(mut state: VectoredValueReconstructState) -> Self { + // walredo expects the records to be descending in terms of Lsn + state.records.sort_by_key(|(lsn, _)| Reverse(*lsn)); + + ValueReconstructState { + records: state.records, + img: state.img, + } + } +} + +/// Bag of data accumulated during a vectored get +pub(crate) struct ValuesReconstructState { + pub(crate) keys: HashMap>, + + keys_done: KeySpaceRandomAccum, +} + +impl ValuesReconstructState { + pub(crate) fn new() -> Self { + Self { + keys: HashMap::new(), + keys_done: KeySpaceRandomAccum::new(), + } + } + + /// Associate a key with the error which it encountered and mark it as done + pub(crate) fn on_key_error(&mut self, key: Key, err: PageReconstructError) { + let previous = self.keys.insert(key, Err(err)); + if let Some(Ok(state)) = previous { + if state.situation == ValueReconstructSituation::Continue { + self.keys_done.add_key(key); + } + } + } + + /// Update the state collected for a given key. + /// Returns true if this was the last value needed for the key and false otherwise. + /// + /// If the key is done after the update, mark it as such. + pub(crate) fn update_key( + &mut self, + key: &Key, + lsn: Lsn, + value: Value, + ) -> ValueReconstructSituation { + let state = self + .keys + .entry(*key) + .or_insert(Ok(VectoredValueReconstructState::default())); + + if let Ok(state) = state { + let key_done = match state.situation { + ValueReconstructSituation::Complete => unreachable!(), + ValueReconstructSituation::Continue => match value { + Value::Image(img) => { + state.img = Some((lsn, img)); + true + } + Value::WalRecord(rec) => { + let reached_cache = + state.get_cached_lsn().map(|clsn| clsn + 1) == Some(lsn); + let will_init = rec.will_init(); + state.records.push((lsn, rec)); + will_init || reached_cache + } + }, + }; + + if key_done && state.situation == ValueReconstructSituation::Continue { + state.situation = ValueReconstructSituation::Complete; + self.keys_done.add_key(*key); + } + + state.situation + } else { + ValueReconstructSituation::Complete + } + } + + /// Returns the Lsn at which this key is cached if one exists. + /// The read path should go no further than this Lsn for the given key. + pub(crate) fn get_cached_lsn(&self, key: &Key) -> Option { + self.keys + .get(key) + .and_then(|k| k.as_ref().ok()) + .and_then(|state| state.get_cached_lsn()) + } + + /// Returns the key space describing the keys that have + /// been marked as completed since the last call to this function. + pub(crate) fn consume_done_keys(&mut self) -> KeySpace { + self.keys_done.consume_keyspace() + } +} + +impl Default for ValuesReconstructState { + fn default() -> Self { + Self::new() + } +} + +/// Description of layer to be read - the layer map can turn +/// this description into the actual layer. 
+#[derive(PartialEq, Eq, Hash, Debug, Clone)] +pub(crate) enum ReadableLayerDesc { + Persistent { + desc: PersistentLayerDesc, + lsn_floor: Lsn, + lsn_ceil: Lsn, + }, + InMemory { + handle: InMemoryLayerHandle, + lsn_ceil: Lsn, + }, +} + +/// Wraper for 'ReadableLayerDesc' sorted by Lsn +#[derive(Debug)] +struct ReadableLayerDescOrdered(ReadableLayerDesc); + +/// Data structure which maintains a fringe of layers for the +/// read path. The fringe is the set of layers which intersects +/// the current keyspace that the search is descending on. +/// Each layer tracks the keyspace that intersects it. +/// +/// The fringe must appear sorted by Lsn. Hence, it uses +/// a two layer indexing scheme. +#[derive(Debug)] +pub(crate) struct LayerFringe { + layers_by_lsn: BinaryHeap, + layers: HashMap, +} + +impl LayerFringe { + pub(crate) fn new() -> Self { + LayerFringe { + layers_by_lsn: BinaryHeap::new(), + layers: HashMap::new(), + } + } + + pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayerDesc, KeySpace)> { + let handle = match self.layers_by_lsn.pop() { + Some(h) => h, + None => return None, + }; + + let removed = self.layers.remove_entry(&handle.0); + match removed { + Some((layer, keyspace)) => Some((layer, keyspace)), + None => unreachable!("fringe internals are always consistent"), + } + } + + pub(crate) fn update(&mut self, layer: ReadableLayerDesc, keyspace: KeySpace) { + let entry = self.layers.entry(layer.clone()); + match entry { + Entry::Occupied(mut entry) => { + entry.get_mut().merge(&keyspace); + } + Entry::Vacant(entry) => { + self.layers_by_lsn + .push(ReadableLayerDescOrdered(entry.key().clone())); + entry.insert(keyspace); + } + } + } +} + +impl Default for LayerFringe { + fn default() -> Self { + Self::new() + } +} + +impl Ord for ReadableLayerDescOrdered { + fn cmp(&self, other: &Self) -> Ordering { + let ord = self.0.get_lsn_ceil().cmp(&other.0.get_lsn_ceil()); + if ord == std::cmp::Ordering::Equal { + self.0 + .get_lsn_floor() + .cmp(&other.0.get_lsn_floor()) + .reverse() + } else { + ord + } + } +} + +impl PartialOrd for ReadableLayerDescOrdered { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl PartialEq for ReadableLayerDescOrdered { + fn eq(&self, other: &Self) -> bool { + self.0.get_lsn_floor() == other.0.get_lsn_floor() + && self.0.get_lsn_ceil() == other.0.get_lsn_ceil() + } +} + +impl Eq for ReadableLayerDescOrdered {} + +impl ReadableLayerDesc { + pub(crate) fn get_lsn_floor(&self) -> Lsn { + match self { + ReadableLayerDesc::Persistent { lsn_floor, .. } => *lsn_floor, + ReadableLayerDesc::InMemory { handle, .. } => handle.get_lsn_floor(), + } + } + + pub(crate) fn get_lsn_ceil(&self) -> Lsn { + match self { + ReadableLayerDesc::Persistent { lsn_ceil, .. } => *lsn_ceil, + ReadableLayerDesc::InMemory { lsn_ceil, .. } => *lsn_ceil, + } + } + + pub(crate) async fn get_values_reconstruct_data( + &self, + layer_manager: &LayerManager, + keyspace: KeySpace, + reconstruct_state: &mut ValuesReconstructState, + ctx: &RequestContext, + ) -> Result<(), GetVectoredError> { + match self { + ReadableLayerDesc::Persistent { desc, lsn_ceil, .. 
} => { + let layer = layer_manager.get_from_desc(desc); + layer + .get_values_reconstruct_data(keyspace, *lsn_ceil, reconstruct_state, ctx) + .await + } + ReadableLayerDesc::InMemory { handle, lsn_ceil } => { + let layer = layer_manager + .layer_map() + .get_in_memory_layer(handle) + .unwrap(); + + layer + .get_values_reconstruct_data(keyspace, *lsn_ceil, reconstruct_state, ctx) + .await + } + } + } +} + /// Return value from [`Layer::get_value_reconstruct_data`] #[derive(Clone, Copy, Debug)] pub enum ValueReconstructResult { diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 9a7bcbcebe..19eebf5531 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -35,16 +35,19 @@ use crate::tenant::blob_io::BlobWriter; use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader}; use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; -use crate::tenant::Timeline; +use crate::tenant::timeline::GetVectoredError; +use crate::tenant::{PageReconstructError, Timeline}; use crate::virtual_file::{self, VirtualFile}; use crate::{walrecord, TEMP_FILE_SUFFIX}; use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; -use anyhow::{bail, ensure, Context, Result}; +use anyhow::{anyhow, bail, ensure, Context, Result}; use camino::{Utf8Path, Utf8PathBuf}; +use pageserver_api::keyspace::KeySpace; use pageserver_api::models::LayerAccessKind; use pageserver_api::shard::TenantShardId; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; +use std::collections::BTreeMap; use std::fs::File; use std::io::SeekFrom; use std::ops::Range; @@ -59,7 +62,10 @@ use utils::{ lsn::Lsn, }; -use super::{AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer}; +use super::{ + AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer, ValueReconstructSituation, + ValuesReconstructState, +}; /// /// Header stored in the beginning of the file @@ -818,6 +824,133 @@ impl DeltaLayerInner { } } + // Look up the keys in the provided keyspace and update + // the reconstruct state with whatever is found. + // + // If the key is cached, go no further than the cached Lsn. + // + // Currently, the index is visited for each range, but this + // can be further optimised to visit the index only once. + pub(super) async fn get_values_reconstruct_data( + &self, + keyspace: KeySpace, + end_lsn: Lsn, + reconstruct_state: &mut ValuesReconstructState, + ctx: &RequestContext, + ) -> Result<(), GetVectoredError> { + let file = &self.file; + let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + self.index_start_blk, + self.index_root_blk, + file, + ); + + let mut offsets: BTreeMap> = BTreeMap::new(); + + for range in keyspace.ranges.iter() { + let mut ignore_key = None; + + // Scan the page versions backwards, starting from the last key in the range. + // to collect all the offsets at which need to be read. 
+ let end_key = DeltaKey::from_key_lsn(&range.end, Lsn(end_lsn.0 - 1)); + tree_reader + .visit( + &end_key.0, + VisitDirection::Backwards, + |raw_key, value| { + let key = Key::from_slice(&raw_key[..KEY_SIZE]); + let entry_lsn = DeltaKey::extract_lsn_from_buf(raw_key); + + if entry_lsn >= end_lsn { + return true; + } + + if key < range.start { + return false; + } + + if key >= range.end { + return true; + } + + if Some(key) == ignore_key { + return true; + } + + if let Some(cached_lsn) = reconstruct_state.get_cached_lsn(&key) { + if entry_lsn <= cached_lsn { + return key != range.start; + } + } + + let blob_ref = BlobRef(value); + let lsns_at = offsets.entry(key).or_default(); + lsns_at.push((entry_lsn, blob_ref.pos())); + + if blob_ref.will_init() { + if key == range.start { + return false; + } else { + ignore_key = Some(key); + return true; + } + } + + true + }, + &RequestContextBuilder::extend(ctx) + .page_content_kind(PageContentKind::DeltaLayerBtreeNode) + .build(), + ) + .await + .map_err(|err| GetVectoredError::Other(anyhow!(err)))?; + } + + let ctx = &RequestContextBuilder::extend(ctx) + .page_content_kind(PageContentKind::DeltaLayerValue) + .build(); + + let cursor = file.block_cursor(); + let mut buf = Vec::new(); + for (key, lsns_at) in offsets { + for (lsn, block_offset) in lsns_at { + let res = cursor.read_blob_into_buf(block_offset, &mut buf, ctx).await; + + if let Err(e) = res { + reconstruct_state.on_key_error( + key, + PageReconstructError::from(anyhow!(e).context(format!( + "Failed to read blob from virtual file {}", + file.file.path + ))), + ); + + break; + } + + let value = Value::des(&buf); + if let Err(e) = value { + reconstruct_state.on_key_error( + key, + PageReconstructError::from(anyhow!(e).context(format!( + "Failed to deserialize file blob from virtual file {}", + file.file.path + ))), + ); + + break; + } + + let key_situation = reconstruct_state.update_key(&key, lsn, value.unwrap()); + if key_situation == ValueReconstructSituation::Complete { + break; + } + } + } + + Ok(()) + } + pub(super) async fn load_keys<'a>( &'a self, ctx: &RequestContext, diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 458131b572..b867cb0333 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -26,20 +26,22 @@ use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; use crate::page_cache::PAGE_SZ; -use crate::repository::{Key, KEY_SIZE}; +use crate::repository::{Key, Value, KEY_SIZE}; use crate::tenant::blob_io::BlobWriter; use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader}; use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; use crate::tenant::storage_layer::{ LayerAccessStats, ValueReconstructResult, ValueReconstructState, }; -use crate::tenant::Timeline; +use crate::tenant::timeline::GetVectoredError; +use crate::tenant::{PageReconstructError, Timeline}; use crate::virtual_file::{self, VirtualFile}; use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; -use anyhow::{bail, ensure, Context, Result}; +use anyhow::{anyhow, bail, ensure, Context, Result}; use bytes::Bytes; use camino::{Utf8Path, Utf8PathBuf}; use hex; +use pageserver_api::keyspace::KeySpace; use pageserver_api::models::LayerAccessKind; use pageserver_api::shard::TenantShardId; use rand::{distributions::Alphanumeric, Rng}; @@ -59,7 +61,7 @@ use utils::{ }; 
use super::filename::ImageFileName; -use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer}; +use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer, ValuesReconstructState}; /// /// Header stored in the beginning of the file @@ -438,6 +440,74 @@ impl ImageLayerInner { Ok(ValueReconstructResult::Missing) } } + + // Look up the keys in the provided keyspace and update + // the reconstruct state with whatever is found. + pub(super) async fn get_values_reconstruct_data( + &self, + keyspace: KeySpace, + reconstruct_state: &mut ValuesReconstructState, + ctx: &RequestContext, + ) -> Result<(), GetVectoredError> { + let file = &self.file; + let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, file); + + let mut offsets = Vec::new(); + + for range in keyspace.ranges.iter() { + let mut search_key: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; + range.start.write_to_byte_slice(&mut search_key); + + tree_reader + .visit( + &search_key, + VisitDirection::Forwards, + |raw_key, value| { + let key = Key::from_slice(&raw_key[..KEY_SIZE]); + assert!(key >= range.start); + + if !range.contains(&key) { + return false; + } + + offsets.push((key, value)); + + true + }, + &RequestContextBuilder::extend(ctx) + .page_content_kind(PageContentKind::ImageLayerBtreeNode) + .build(), + ) + .await + .map_err(|err| GetVectoredError::Other(anyhow!(err)))?; + } + + let ctx = &RequestContextBuilder::extend(ctx) + .page_content_kind(PageContentKind::ImageLayerValue) + .build(); + + let cursor = file.block_cursor(); + let mut buf = Vec::new(); + for (key, offset) in offsets { + let res = cursor.read_blob_into_buf(offset, &mut buf, ctx).await; + if let Err(e) = res { + reconstruct_state.on_key_error( + key, + PageReconstructError::from(anyhow!(e).context(format!( + "Failed to read blob from virtual file {}", + file.file.path + ))), + ); + + continue; + } + + let blob = Bytes::copy_from_slice(buf.as_slice()); + reconstruct_state.update_key(&key, self.lsn, Value::Image(blob)); + } + + Ok(()) + } } /// A builder object for constructing a new image layer. 
diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 4b06a787ce..5f1db21d49 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -9,13 +9,15 @@ use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; use crate::repository::{Key, Value}; use crate::tenant::block_io::BlockReader; use crate::tenant::ephemeral_file::EphemeralFile; -use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState}; -use crate::tenant::Timeline; +use crate::tenant::storage_layer::ValueReconstructResult; +use crate::tenant::timeline::GetVectoredError; +use crate::tenant::{PageReconstructError, Timeline}; use crate::walrecord; -use anyhow::{ensure, Result}; +use anyhow::{anyhow, ensure, Result}; +use pageserver_api::keyspace::KeySpace; use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; -use std::collections::HashMap; +use std::collections::{BinaryHeap, HashMap, HashSet}; use std::sync::{Arc, OnceLock}; use tracing::*; use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap}; @@ -25,7 +27,10 @@ use std::fmt::Write as _; use std::ops::Range; use tokio::sync::{RwLock, RwLockWriteGuard}; -use super::{DeltaLayerWriter, ResidentLayer}; +use super::{ + DeltaLayerWriter, ResidentLayer, ValueReconstructSituation, ValueReconstructState, + ValuesReconstructState, +}; pub struct InMemoryLayer { conf: &'static PageServerConf, @@ -202,6 +207,91 @@ impl InMemoryLayer { Ok(ValueReconstructResult::Complete) } } + + // Look up the keys in the provided keyspace and update + // the reconstruct state with whatever is found. + // + // If the key is cached, go no further than the cached Lsn. 
+ pub(crate) async fn get_values_reconstruct_data( + &self, + keyspace: KeySpace, + end_lsn: Lsn, + reconstruct_state: &mut ValuesReconstructState, + ctx: &RequestContext, + ) -> Result<(), GetVectoredError> { + let ctx = RequestContextBuilder::extend(ctx) + .page_content_kind(PageContentKind::InMemoryLayer) + .build(); + + let inner = self.inner.read().await; + let reader = inner.file.block_cursor(); + + #[derive(Eq, PartialEq, Ord, PartialOrd)] + struct BlockRead { + key: Key, + lsn: Lsn, + block_offset: u64, + } + + let mut planned_block_reads = BinaryHeap::new(); + + for range in keyspace.ranges.iter() { + let mut key = range.start; + while key < range.end { + if let Some(vec_map) = inner.index.get(&key) { + let lsn_range = match reconstruct_state.get_cached_lsn(&key) { + Some(cached_lsn) => (cached_lsn + 1)..end_lsn, + None => self.start_lsn..end_lsn, + }; + + let slice = vec_map.slice_range(lsn_range); + for (entry_lsn, pos) in slice.iter().rev() { + planned_block_reads.push(BlockRead { + key, + lsn: *entry_lsn, + block_offset: *pos, + }); + } + } + + key = key.next(); + } + } + + let keyspace_size = keyspace.total_size(); + + let mut completed_keys = HashSet::new(); + while completed_keys.len() < keyspace_size && !planned_block_reads.is_empty() { + let block_read = planned_block_reads.pop().unwrap(); + if completed_keys.contains(&block_read.key) { + continue; + } + + let buf = reader.read_blob(block_read.block_offset, &ctx).await; + if let Err(e) = buf { + reconstruct_state + .on_key_error(block_read.key, PageReconstructError::from(anyhow!(e))); + completed_keys.insert(block_read.key); + continue; + } + + let value = Value::des(&buf.unwrap()); + if let Err(e) = value { + reconstruct_state + .on_key_error(block_read.key, PageReconstructError::from(anyhow!(e))); + completed_keys.insert(block_read.key); + continue; + } + + let key_situation = + reconstruct_state.update_key(&block_read.key, block_read.lsn, value.unwrap()); + if key_situation == ValueReconstructSituation::Complete { + completed_keys.insert(block_read.key); + } + } + + Ok(()) + } } impl std::fmt::Display for InMemoryLayer { diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index bfcc031863..cc5b7ade6a 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1,5 +1,6 @@ use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; +use pageserver_api::keyspace::KeySpace; use pageserver_api::models::{ HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus, }; @@ -16,13 +17,14 @@ use crate::config::PageServerConf; use crate::context::RequestContext; use crate::repository::Key; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::tenant::timeline::GetVectoredError; use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline}; use super::delta_layer::{self, DeltaEntry}; use super::image_layer; use super::{ AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerFileName, PersistentLayerDesc, - ValueReconstructResult, ValueReconstructState, + ValueReconstructResult, ValueReconstructState, ValuesReconstructState, }; use utils::generation::Generation; @@ -262,6 +264,29 @@ impl Layer { .with_context(|| format!("get_value_reconstruct_data for layer {self}")) } + pub(crate) async fn get_values_reconstruct_data( + &self, + keyspace: KeySpace, + end_lsn: Lsn, + reconstruct_data: &mut ValuesReconstructState, + ctx: &RequestContext, + ) -> Result<(), 
GetVectoredError> { + let layer = self + .0 + .get_or_maybe_download(true, Some(ctx)) + .await + .map_err(|err| GetVectoredError::Other(anyhow::anyhow!(err)))?; + + self.0 + .access_stats + .record_access(LayerAccessKind::GetValueReconstructData, ctx); + + layer + .get_values_reconstruct_data(keyspace, end_lsn, reconstruct_data, &self.0, ctx) + .instrument(tracing::debug_span!("get_values_reconstruct_data", layer=%self)) + .await + } + /// Download the layer if evicted. /// /// Will not error when the layer is already downloaded. @@ -1177,7 +1202,7 @@ pub(crate) enum EvictionError { /// Error internal to the [`LayerInner::get_or_maybe_download`] #[derive(Debug, thiserror::Error)] -enum DownloadError { +pub(crate) enum DownloadError { #[error("timeline has already shutdown")] TimelineShutdown, #[error("no remote storage configured")] @@ -1337,6 +1362,28 @@ impl DownloadedLayer { } } + async fn get_values_reconstruct_data( + &self, + keyspace: KeySpace, + end_lsn: Lsn, + reconstruct_data: &mut ValuesReconstructState, + owner: &Arc, + ctx: &RequestContext, + ) -> Result<(), GetVectoredError> { + use LayerKind::*; + + match self.get(owner, ctx).await.map_err(GetVectoredError::from)? { + Delta(d) => { + d.get_values_reconstruct_data(keyspace, end_lsn, reconstruct_data, ctx) + .await + } + Image(i) => { + i.get_values_reconstruct_data(keyspace, reconstruct_data, ctx) + .await + } + } + } + async fn dump(&self, owner: &Arc, ctx: &RequestContext) -> anyhow::Result<()> { use LayerKind::*; match self.get(owner, ctx).await? { diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs index fa78e9fdb2..c375923e81 100644 --- a/pageserver/src/tenant/storage_layer/layer_desc.rs +++ b/pageserver/src/tenant/storage_layer/layer_desc.rs @@ -15,7 +15,7 @@ use utils::id::TenantId; /// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the /// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides /// a unified way to generate layer information like file name. 
-#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Hash)] pub struct PersistentLayerDesc { pub tenant_shard_id: TenantShardId, pub timeline_id: TimelineId, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 92e5b52c75..0f22284c55 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -16,7 +16,7 @@ use futures::stream::StreamExt; use itertools::Itertools; use once_cell::sync::Lazy; use pageserver_api::{ - keyspace::{key_range_size, KeySpaceAccum}, + keyspace::KeySpaceAccum, models::{ DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, LayerMapInfo, TimelineState, @@ -67,7 +67,7 @@ use crate::{ tenant::storage_layer::{ AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer, LayerAccessStatsReset, LayerFileName, ResidentLayer, ValueReconstructResult, - ValueReconstructState, + ValueReconstructState, ValuesReconstructState, }, }; use crate::{ @@ -111,11 +111,11 @@ use self::layer_manager::LayerManager; use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; -use super::config::TenantConf; -use super::remote_timeline_client::index::IndexPart; use super::remote_timeline_client::RemoteTimelineClient; use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline}; +use super::{config::TenantConf, storage_layer::ReadableLayerDesc}; use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; +use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; #[derive(Debug, PartialEq, Eq, Clone, Copy)] pub(super) enum FlushLoopState { @@ -472,6 +472,15 @@ pub(crate) enum GetVectoredError { #[error("Requested at invalid LSN: {0}")] InvalidLsn(Lsn), + + #[error("Requested key {0} not found")] + MissingKey(Key), + + #[error(transparent)] + GetReadyAncestorError(GetReadyAncestorError), + + #[error(transparent)] + Other(#[from] anyhow::Error), } #[derive(thiserror::Error, Debug)] @@ -579,6 +588,23 @@ impl From for PageReconstructError { } } +#[derive( + Eq, + PartialEq, + Debug, + Copy, + Clone, + strum_macros::EnumString, + strum_macros::Display, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, +)] +#[strum(serialize_all = "kebab-case")] +pub enum GetVectoredImpl { + Sequential, + Vectored, +} + /// Public interface functions impl Timeline { /// Get the LSN where this branch was created @@ -708,7 +734,7 @@ impl Timeline { /// which actually vectorizes the read path. 
pub(crate) async fn get_vectored( &self, - key_ranges: &[Range], + keyspace: KeySpace, lsn: Lsn, ctx: &RequestContext, ) -> Result>, GetVectoredError> { @@ -716,10 +742,7 @@ impl Timeline { return Err(GetVectoredError::InvalidLsn(lsn)); } - let key_count = key_ranges - .iter() - .map(|range| key_range_size(range) as u64) - .sum(); + let key_count = keyspace.total_size().try_into().unwrap(); if key_count > Timeline::MAX_GET_VECTORED_KEYS { return Err(GetVectoredError::Oversized(key_count)); } @@ -728,33 +751,163 @@ impl Timeline { .throttle(ctx, key_count as usize) .await; - let _timer = crate::metrics::GET_VECTORED_LATENCY - .for_task_kind(ctx.task_kind()) - .map(|t| t.start_timer()); - - let mut values = BTreeMap::new(); - for range in key_ranges { + for range in &keyspace.ranges { let mut key = range.start; while key != range.end { assert!(!self.shard_identity.is_key_disposable(&key)); - - let block = self.get(key, lsn, ctx).await; - - if matches!( - block, - Err(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_)) - ) { - return Err(GetVectoredError::Cancelled); - } - - values.insert(key, block); key = key.next(); } } + trace!( + "get vectored request for {:?}@{} from task kind {:?} will use {} implementation", + keyspace, + lsn, + ctx.task_kind(), + self.conf.get_vectored_impl + ); + + let _timer = crate::metrics::GET_VECTORED_LATENCY + .for_task_kind(ctx.task_kind()) + .map(|t| t.start_timer()); + + match self.conf.get_vectored_impl { + GetVectoredImpl::Sequential => { + self.get_vectored_sequential_impl(keyspace, lsn, ctx).await + } + GetVectoredImpl::Vectored => { + let vectored_res = self.get_vectored_impl(keyspace.clone(), lsn, ctx).await; + + self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx) + .await; + + vectored_res + } + } + } + + pub(super) async fn get_vectored_sequential_impl( + &self, + keyspace: KeySpace, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result>, GetVectoredError> { + let mut values = BTreeMap::new(); + for range in keyspace.ranges { + let mut key = range.start; + while key != range.end { + let block = self.get(key, lsn, ctx).await; + + use PageReconstructError::*; + match block { + Err(Cancelled | AncestorStopping(_)) => { + return Err(GetVectoredError::Cancelled) + } + Err(Other(err)) if err.to_string().contains("could not find data for key") => { + return Err(GetVectoredError::MissingKey(key)) + } + _ => { + values.insert(key, block); + key = key.next(); + } + } + } + } + Ok(values) } + pub(super) async fn get_vectored_impl( + &self, + keyspace: KeySpace, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result>, GetVectoredError> { + let mut reconstruct_state = ValuesReconstructState::new(); + + self.get_vectored_reconstruct_data(keyspace, lsn, &mut reconstruct_state, ctx) + .await?; + + let mut results: BTreeMap> = BTreeMap::new(); + for (key, res) in reconstruct_state.keys { + match res { + Err(err) => { + results.insert(key, Err(err)); + } + Ok(state) => { + let state = ValueReconstructState::from(state); + + let reconstruct_res = self.reconstruct_value(key, lsn, state).await; + results.insert(key, reconstruct_res); + } + } + } + + Ok(results) + } + + pub(super) async fn validate_get_vectored_impl( + &self, + vectored_res: &Result>, GetVectoredError>, + keyspace: KeySpace, + lsn: Lsn, + ctx: &RequestContext, + ) { + let sequential_res = self + .get_vectored_sequential_impl(keyspace.clone(), lsn, ctx) + .await; + + fn errors_match(lhs: &GetVectoredError, rhs: &GetVectoredError) -> bool { + use GetVectoredError::*; + match 
(lhs, rhs) { + (Cancelled, Cancelled) => true, + (_, Cancelled) => true, + (Oversized(l), Oversized(r)) => l == r, + (InvalidLsn(l), InvalidLsn(r)) => l == r, + (MissingKey(l), MissingKey(r)) => l == r, + (GetReadyAncestorError(_), GetReadyAncestorError(_)) => true, + (Other(_), Other(_)) => true, + _ => false, + } + } + + match (&sequential_res, vectored_res) { + (Err(seq_err), Ok(_)) => { + panic!(concat!("Sequential get failed with {}, but vectored get did not", + " - keyspace={:?} lsn={}"), + seq_err, keyspace, lsn) }, + (Ok(_), Err(vec_err)) => { + panic!(concat!("Vectored get failed with {}, but sequential get did not", + " - keyspace={:?} lsn={}"), + vec_err, keyspace, lsn) }, + (Err(seq_err), Err(vec_err)) => { + assert!(errors_match(seq_err, vec_err), + "Mismatched errors: {seq_err} != {vec_err} - keyspace={keyspace:?} lsn={lsn}")}, + (Ok(seq_values), Ok(vec_values)) => { + seq_values.iter().zip(vec_values.iter()).for_each(|((seq_key, seq_res), (vec_key, vec_res))| { + assert_eq!(seq_key, vec_key); + match (seq_res, vec_res) { + (Ok(seq_blob), Ok(vec_blob)) => { + assert_eq!(seq_blob, vec_blob, + "Image mismatch for key {seq_key} - keyspace={keyspace:?} lsn={lsn}"); + }, + (Err(err), Ok(_)) => { + panic!( + concat!("Sequential get failed with {} for key {}, but vectored get did not", + " - keyspace={:?} lsn={}"), + err, seq_key, keyspace, lsn) }, + (Ok(_), Err(err)) => { + panic!( + concat!("Vectored get failed with {} for key {}, but sequential get did not", + " - keyspace={:?} lsn={}"), + err, seq_key, keyspace, lsn) }, + (Err(_), Err(_)) => {} + } + }) + } + } + } + /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev. pub(crate) fn get_last_record_lsn(&self) -> Lsn { self.last_record_lsn.load().last @@ -2547,6 +2700,170 @@ impl Timeline { } } + /// Get the data needed to reconstruct all keys in the provided keyspace + /// + /// The algorithm is as follows: + /// 1. While some keys are still not done and there's a timeline to visit: + /// 2. Visit the timeline (see [`Timeline::get_vectored_reconstruct_data_timeline`]: + /// 2.1: Build the fringe for the current keyspace + /// 2.2 Visit the newest layer from the fringe to collect all values for the range it + /// intersects + /// 2.3. Pop the timeline from the fringe + /// 2.4. If the fringe is empty, go back to 1 + async fn get_vectored_reconstruct_data( + &self, + mut keyspace: KeySpace, + request_lsn: Lsn, + reconstruct_state: &mut ValuesReconstructState, + ctx: &RequestContext, + ) -> Result<(), GetVectoredError> { + let mut timeline_owned: Arc; + let mut timeline = self; + + let mut cont_lsn = Lsn(request_lsn.0 + 1); + + loop { + if self.cancel.is_cancelled() { + return Err(GetVectoredError::Cancelled); + } + + let completed = Self::get_vectored_reconstruct_data_timeline( + timeline, + keyspace.clone(), + cont_lsn, + reconstruct_state, + &self.cancel, + ctx, + ) + .await?; + + keyspace.remove_overlapping_with(&completed); + if keyspace.total_size() == 0 || timeline.ancestor_timeline.is_none() { + break; + } + + cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1); + timeline_owned = timeline + .get_ready_ancestor_timeline(ctx) + .await + .map_err(GetVectoredError::GetReadyAncestorError)?; + timeline = &*timeline_owned; + } + + if keyspace.total_size() != 0 { + return Err(GetVectoredError::MissingKey(keyspace.start().unwrap())); + } + + Ok(()) + } + + /// Collect the reconstruct data for a ketspace from the specified timeline. 
+ /// + /// Maintain a fringe [`LayerFringe`] which tracks all the layers that intersect + /// the current keyspace. The current keyspace of the search at any given timeline + /// is the original keyspace minus all the keys that have been completed minus + /// any keys for which we couldn't find an intersecting layer. It's not tracked explicitly, + /// but if you merge all the keyspaces in the fringe, you get the "current keyspace". + /// + /// This is basically a depth-first search visitor implementation where a vertex + /// is the (layer, lsn range, key space) tuple. The fringe acts as the stack. + /// + /// At each iteration pop the top of the fringe (the layer with the highest Lsn) + /// and get all the required reconstruct data from the layer in one go. + async fn get_vectored_reconstruct_data_timeline( + timeline: &Timeline, + keyspace: KeySpace, + mut cont_lsn: Lsn, + reconstruct_state: &mut ValuesReconstructState, + cancel: &CancellationToken, + ctx: &RequestContext, + ) -> Result { + let mut unmapped_keyspace = keyspace.clone(); + let mut fringe = LayerFringe::new(); + + let mut completed_keyspace = KeySpace::default(); + + // Hold the layer map whilst visiting the timeline to prevent + // compaction, eviction and flushes from rendering the layers unreadable. + // + // TODO: Do we actually need to do this? In theory holding on + // to [`tenant::storage_layer::Layer`] should be enough. However, + // [`Timeline::get`] also holds the lock during IO, so more investigation + // is needed. + let guard = timeline.layers.read().await; + let layers = guard.layer_map(); + + 'outer: loop { + if cancel.is_cancelled() { + return Err(GetVectoredError::Cancelled); + } + + let keys_done_last_step = reconstruct_state.consume_done_keys(); + unmapped_keyspace.remove_overlapping_with(&keys_done_last_step); + completed_keyspace.merge(&keys_done_last_step); + + let in_memory_layer = layers.find_in_memory_layer(|l| { + let start_lsn = l.get_lsn_range().start; + cont_lsn > start_lsn + }); + + match in_memory_layer { + Some(l) => { + fringe.update( + ReadableLayerDesc::InMemory { + handle: l, + lsn_ceil: cont_lsn, + }, + unmapped_keyspace.clone(), + ); + } + None => { + for range in unmapped_keyspace.ranges.iter() { + let results = match layers.range_search(range.clone(), cont_lsn) { + Some(res) => res, + None => { + break 'outer; + } + }; + + results + .found + .into_iter() + .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| { + ( + ReadableLayerDesc::Persistent { + desc: (*layer).clone(), + lsn_floor, + lsn_ceil: cont_lsn, + }, + keyspace_accum.to_keyspace(), + ) + }) + .for_each(|(layer, keyspace)| fringe.update(layer, keyspace)); + } + } + } + + if let Some((layer_to_read, keyspace_to_read)) = fringe.next_layer() { + layer_to_read + .get_values_reconstruct_data( + &guard, + keyspace_to_read.clone(), + reconstruct_state, + ctx, + ) + .await?; + + unmapped_keyspace = keyspace_to_read; + cont_lsn = layer_to_read.get_lsn_floor(); + } else { + break; + } + } + + Ok(completed_keyspace) + } + /// # Cancel-safety /// /// This method is cancellation-safe. 
@@ -3263,7 +3580,7 @@ impl Timeline { || last_key_in_range { let results = self - .get_vectored(&key_request_accum.consume_keyspace().ranges, lsn, ctx) + .get_vectored(key_request_accum.consume_keyspace(), lsn, ctx) .await?; for (img_key, img) in results { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 51b126b84b..ce5ef66d22 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -488,6 +488,11 @@ class NeonEnvBuilder: self.pageserver_virtual_file_io_engine: Optional[str] = pageserver_virtual_file_io_engine + self.pageserver_get_vectored_impl: Optional[str] = None + if os.getenv("PAGESERVER_GET_VECTORED_IMPL", "") == "vectored": + self.pageserver_get_vectored_impl = "vectored" + log.debug('Overriding pageserver get_vectored_impl config to "vectored"') + assert test_name.startswith( "test_" ), "Unexpectedly instantiated from outside a test function" @@ -1055,6 +1060,8 @@ class NeonEnv: } if self.pageserver_virtual_file_io_engine is not None: ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine + if config.pageserver_get_vectored_impl is not None: + ps_cfg["get_vectored_impl"] = config.pageserver_get_vectored_impl # Create a corresponding NeonPageserver object self.pageservers.append( diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 465101f64f..0ea76d447e 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -226,6 +226,10 @@ def test_forward_compatibility( ) try: + # TODO: remove this once the previous pageserrver version understands + # the 'get_vectored_impl' config + neon_env_builder.pageserver_get_vectored_impl = None + neon_env_builder.num_safekeepers = 3 neon_local_binpath = neon_env_builder.neon_binpath env = neon_env_builder.from_repo_dir( From e7452d3756c3bd00a56cb3cd49dc991f5e533baf Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 21 Feb 2024 09:54:25 +0000 Subject: [PATCH 0230/1571] storage controller: concurrency + deadlines during startup reconcile (#6823) ## Problem During startup_reconcile we do a couple of potentially-slow things: - Calling out to all nodes to read their locations - Calling out to the cloud control plane to notify it of all tenants' attached nodes The read of node locations was not being done concurrently across nodes, and neither operation was bounded by a well defined deadline. ## Summary of changes - Refactor the async parts of startup_reconcile into separate functions - Add concurrency and deadline to `scan_node_locations` - Add deadline to `compute_notify_many` - Run `cleanup_locations` in the background: there's no need for startup_reconcile to wait for this to complete. 
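The scan-with-deadline part boils down to a standard tokio idiom: push the per-node futures
into a `FuturesUnordered` and `select!` the stream against `sleep_until(deadline)`, keeping
whatever arrived in time. A minimal, self-contained sketch of that idiom (not the storage
controller code itself; `fetch_node_listing`, the node ids and the timings are invented for
illustration, and it assumes the `tokio` and `futures` crates):

```rust
use std::collections::HashMap;
use std::time::Duration;

use futures::stream::{FuturesUnordered, StreamExt};
use tokio::time::Instant;

// Stand-in for the per-node "list locations" request.
async fn fetch_node_listing(node_id: u64) -> (u64, Result<String, String>) {
    tokio::time::sleep(Duration::from_millis(10 * node_id)).await;
    (node_id, Ok(format!("listing from node {node_id}")))
}

// Issue all requests concurrently and return whatever arrived before the deadline.
async fn scan_with_deadline(node_ids: Vec<u64>, deadline: Instant) -> HashMap<u64, String> {
    let mut results = HashMap::new();
    let mut futs: FuturesUnordered<_> = node_ids.into_iter().map(fetch_node_listing).collect();

    loop {
        tokio::select! {
            next = futs.next() => match next {
                Some((node_id, Ok(listing))) => { results.insert(node_id, listing); }
                Some((_, Err(_))) => {} // unreachable node: skip it
                None => break,          // every node answered
            },
            _ = tokio::time::sleep_until(deadline) => break, // give up on stragglers
        }
    }
    results
}

#[tokio::main]
async fn main() {
    let deadline = Instant::now() + Duration::from_millis(100);
    let ok = scan_with_deadline(vec![1, 2, 3, 50], deadline).await;
    println!("{} node(s) responded before the deadline", ok.len());
}
```

The real `scan_node_locations` below additionally wraps each request in `backoff::retry` and
honours the service cancellation token.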
--- .../attachment_service/src/service.rs | 333 ++++++++++++------ 1 file changed, 234 insertions(+), 99 deletions(-) diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index 4082af3fe6..0236496c61 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -21,11 +21,11 @@ use pageserver_api::{ ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, ValidateResponse, ValidateResponseTenant, }, - models, models::{ - LocationConfig, LocationConfigMode, ShardParameters, TenantConfig, TenantCreateRequest, - TenantLocationConfigRequest, TenantLocationConfigResponse, TenantShardLocation, - TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo, + self, LocationConfig, LocationConfigListResponse, LocationConfigMode, ShardParameters, + TenantConfig, TenantCreateRequest, TenantLocationConfigRequest, + TenantLocationConfigResponse, TenantShardLocation, TenantShardSplitRequest, + TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo, }, shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId}, }; @@ -167,84 +167,53 @@ impl Service { /// Called once on startup, this function attempts to contact all pageservers to build an up-to-date /// view of the world, and determine which pageservers are responsive. #[instrument(skip_all)] - async fn startup_reconcile(&self) { + async fn startup_reconcile(self: &Arc) { // For all tenant shards, a vector of observed states on nodes (where None means // indeterminate, same as in [`ObservedStateLocation`]) let mut observed = HashMap::new(); let mut nodes_online = HashSet::new(); - // TODO: issue these requests concurrently - { - let nodes = { - let locked = self.inner.read().unwrap(); - locked.nodes.clone() - }; - for node in nodes.values() { - let http_client = reqwest::ClientBuilder::new() - .timeout(Duration::from_secs(5)) - .build() - .expect("Failed to construct HTTP client"); - let client = mgmt_api::Client::from_client( - http_client, - node.base_url(), - self.config.jwt_token.as_deref(), - ); + // Startup reconciliation does I/O to other services: whether they + // are responsive or not, we should aim to finish within our deadline, because: + // - If we don't, a k8s readiness hook watching /ready will kill us. + // - While we're waiting for startup reconciliation, we are not fully + // available for end user operations like creating/deleting tenants and timelines. + // + // We set multiple deadlines to break up the time available between the phases of work: this is + // arbitrary, but avoids a situation where the first phase could burn our entire timeout period. 
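+        // (Worked example, assuming a hypothetical STARTUP_RECONCILE_TIMEOUT of 30s: the node
+        // scan below must finish by start + 15s and compute notifications by start + 22.5s,
+        // leaving the last quarter for applying results and spawning reconcilers.)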
+ let start_at = Instant::now(); + let node_scan_deadline = start_at + .checked_add(STARTUP_RECONCILE_TIMEOUT / 2) + .expect("Reconcile timeout is a modest constant"); - fn is_fatal(e: &mgmt_api::Error) -> bool { - use mgmt_api::Error::*; - match e { - ReceiveBody(_) | ReceiveErrorBody(_) => false, - ApiError(StatusCode::SERVICE_UNAVAILABLE, _) - | ApiError(StatusCode::GATEWAY_TIMEOUT, _) - | ApiError(StatusCode::REQUEST_TIMEOUT, _) => false, - ApiError(_, _) => true, - } - } + let compute_notify_deadline = start_at + .checked_add((STARTUP_RECONCILE_TIMEOUT / 4) * 3) + .expect("Reconcile timeout is a modest constant"); - let list_response = backoff::retry( - || client.list_location_config(), - is_fatal, - 1, - 5, - "Location config listing", - &self.cancel, - ) - .await; - let Some(list_response) = list_response else { - tracing::info!("Shutdown during startup_reconcile"); - return; - }; + // Accumulate a list of any tenant locations that ought to be detached + let mut cleanup = Vec::new(); - tracing::info!("Scanning shards on node {}...", node.id); - match list_response { - Err(e) => { - tracing::warn!("Could not contact pageserver {} ({e})", node.id); - // TODO: be more tolerant, do some retries, in case - // pageserver is being restarted at the same time as we are - } - Ok(listing) => { - tracing::info!( - "Received {} shard statuses from pageserver {}, setting it to Active", - listing.tenant_shards.len(), - node.id - ); - nodes_online.insert(node.id); + let node_listings = self.scan_node_locations(node_scan_deadline).await; + for (node_id, list_response) in node_listings { + let tenant_shards = list_response.tenant_shards; + tracing::info!( + "Received {} shard statuses from pageserver {}, setting it to Active", + tenant_shards.len(), + node_id + ); + nodes_online.insert(node_id); - for (tenant_shard_id, conf_opt) in listing.tenant_shards { - observed.insert(tenant_shard_id, (node.id, conf_opt)); - } - } - } + for (tenant_shard_id, conf_opt) in tenant_shards { + observed.insert(tenant_shard_id, (node_id, conf_opt)); } } - let mut cleanup = Vec::new(); - + // List of tenants for which we will attempt to notify compute of their location at startup let mut compute_notifications = Vec::new(); // Populate intent and observed states for all tenants, based on reported state on pageservers - let (shard_count, nodes) = { + let shard_count = { let mut locked = self.inner.write().unwrap(); let (nodes, tenants, scheduler) = locked.parts_mut(); @@ -288,18 +257,171 @@ impl Service { } } - (tenants.len(), nodes.clone()) + tenants.len() }; // TODO: if any tenant's intent now differs from its loaded generation_pageserver, we should clear that // generation_pageserver in the database. - // Clean up any tenants that were found on pageservers but are not known to us. + // Emit compute hook notifications for all tenants which are already stably attached. Other tenants + // will emit compute hook notifications when they reconcile. + // + // Ordering: we must complete these notification attempts before doing any other reconciliation for the + // tenants named here, because otherwise our calls to notify() might race with more recent values + // generated by reconciliation. + let notify_failures = self + .compute_notify_many(compute_notifications, compute_notify_deadline) + .await; + + // Compute notify is fallible. If it fails here, do not delay overall startup: set the + // flag on these shards that they have a pending notification. 
+ // Update tenant state for any that failed to do their initial compute notify, so that they'll retry later. + { + let mut locked = self.inner.write().unwrap(); + for tenant_shard_id in notify_failures.into_iter() { + if let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) { + shard.pending_compute_notification = true; + } + } + } + + // Finally, now that the service is up and running, launch reconcile operations for any tenants + // which require it: under normal circumstances this should only include tenants that were in some + // transient state before we restarted, or any tenants whose compute hooks failed above. + let reconcile_tasks = self.reconcile_all(); + // We will not wait for these reconciliation tasks to run here: we're now done with startup and + // normal operations may proceed. + + // Clean up any tenants that were found on pageservers but are not known to us. Do this in the + // background because it does not need to complete in order to proceed with other work. + if !cleanup.is_empty() { + tracing::info!("Cleaning up {} locations in the background", cleanup.len()); + tokio::task::spawn({ + let cleanup_self = self.clone(); + async move { cleanup_self.cleanup_locations(cleanup).await } + }); + } + + tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)"); + } + + /// Used during [`Self::startup_reconcile`]: issue GETs to all nodes concurrently, with a deadline. + /// + /// The result includes only nodes which responded within the deadline + async fn scan_node_locations( + &self, + deadline: Instant, + ) -> HashMap { + let nodes = { + let locked = self.inner.read().unwrap(); + locked.nodes.clone() + }; + + let mut node_results = HashMap::new(); + + let mut node_list_futs = FuturesUnordered::new(); + + for node in nodes.values() { + node_list_futs.push({ + async move { + let http_client = reqwest::ClientBuilder::new() + .timeout(Duration::from_secs(5)) + .build() + .expect("Failed to construct HTTP client"); + let client = mgmt_api::Client::from_client( + http_client, + node.base_url(), + self.config.jwt_token.as_deref(), + ); + + fn is_fatal(e: &mgmt_api::Error) -> bool { + use mgmt_api::Error::*; + match e { + ReceiveBody(_) | ReceiveErrorBody(_) => false, + ApiError(StatusCode::SERVICE_UNAVAILABLE, _) + | ApiError(StatusCode::GATEWAY_TIMEOUT, _) + | ApiError(StatusCode::REQUEST_TIMEOUT, _) => false, + ApiError(_, _) => true, + } + } + + tracing::info!("Scanning shards on node {}...", node.id); + let description = format!("List locations on {}", node.id); + let response = backoff::retry( + || client.list_location_config(), + is_fatal, + 1, + 5, + &description, + &self.cancel, + ) + .await; + + (node.id, response) + } + }); + } + + loop { + let (node_id, result) = tokio::select! 
{ + next = node_list_futs.next() => { + match next { + Some(result) => result, + None =>{ + // We got results for all our nodes + break; + } + + } + }, + _ = tokio::time::sleep(deadline.duration_since(Instant::now())) => { + // Give up waiting for anyone who hasn't responded: we will yield the results that we have + tracing::info!("Reached deadline while waiting for nodes to respond to location listing requests"); + break; + } + }; + + let Some(list_response) = result else { + tracing::info!("Shutdown during startup_reconcile"); + break; + }; + + match list_response { + Err(e) => { + tracing::warn!("Could not scan node {} ({e})", node_id); + } + Ok(listing) => { + node_results.insert(node_id, listing); + } + } + } + + node_results + } + + /// Used during [`Self::startup_reconcile`]: detach a list of unknown-to-us tenants from pageservers. + /// + /// This is safe to run in the background, because if we don't have this TenantShardId in our map of + /// tenants, then it is probably something incompletely deleted before: we will not fight with any + /// other task trying to attach it. + #[instrument(skip_all)] + async fn cleanup_locations(&self, cleanup: Vec<(TenantShardId, NodeId)>) { + let nodes = self.inner.read().unwrap().nodes.clone(); + for (tenant_shard_id, node_id) in cleanup { // A node reported a tenant_shard_id which is unknown to us: detach it. - let node = nodes - .get(&node_id) - .expect("Always exists: only known nodes are scanned"); + let Some(node) = nodes.get(&node_id) else { + // This is legitimate; we run in the background and [`Self::startup_reconcile`] might have identified + // a location to clean up on a node that has since been removed. + tracing::info!( + "Not cleaning up location {node_id}/{tenant_shard_id}: node not found" + ); + continue; + }; + + if self.cancel.is_cancelled() { + break; + } let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref()); match client @@ -332,21 +454,24 @@ impl Service { } } } + } - // Emit compute hook notifications for all tenants which are already stably attached. Other tenants - // will emit compute hook notifications when they reconcile. - // - // Ordering: we must complete these notification attempts before doing any other reconciliation for the - // tenants named here, because otherwise our calls to notify() might race with more recent values - // generated by reconciliation. - - // Compute notify is fallible. If it fails here, do not delay overall startup: set the - // flag on these shards that they have a pending notification. + /// Used during [`Self::startup_reconcile`]: issue many concurrent compute notifications. + /// + /// Returns a set of any shards for which notifications where not acked within the deadline. + async fn compute_notify_many( + &self, + notifications: Vec<(TenantShardId, NodeId)>, + deadline: Instant, + ) -> HashSet { let compute_hook = self.inner.read().unwrap().compute_hook.clone(); + let attempt_shards = notifications.iter().map(|i| i.0).collect::>(); + let mut success_shards = HashSet::new(); + // Construct an async stream of futures to invoke the compute notify function: we do this // in order to subsequently use .buffered() on the stream to execute with bounded parallelism. 
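+        // (`buffered(N)` drives at most N of these futures at a time and yields their results
+        // in input order, so API_CONCURRENCY bounds the number of in-flight notifications.)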
- let stream = futures::stream::iter(compute_notifications.into_iter()) + let mut stream = futures::stream::iter(notifications.into_iter()) .map(|(tenant_shard_id, node_id)| { let compute_hook = compute_hook.clone(); let cancel = self.cancel.clone(); @@ -357,33 +482,43 @@ impl Service { node_id=%node_id, "Failed to notify compute on startup for shard: {e}" ); - Some(tenant_shard_id) - } else { None + } else { + Some(tenant_shard_id) } } }) .buffered(compute_hook::API_CONCURRENCY); - let notify_results = stream.collect::>().await; - // Update tenant state for any that failed to do their initial compute notify, so that they'll retry later. - { - let mut locked = self.inner.write().unwrap(); - for tenant_shard_id in notify_results.into_iter().flatten() { - if let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) { - shard.pending_compute_notification = true; + loop { + tokio::select! { + next = stream.next() => { + match next { + Some(Some(success_shard)) => { + // A notification succeeded + success_shards.insert(success_shard); + }, + Some(None) => { + // A notification that failed + }, + None => { + tracing::info!("Successfully sent all compute notifications"); + break; + } + } + }, + _ = tokio::time::sleep(deadline.duration_since(Instant::now())) => { + // Give up sending any that didn't succeed yet + tracing::info!("Reached deadline while sending compute notifications"); + break; } - } + }; } - // Finally, now that the service is up and running, launch reconcile operations for any tenants - // which require it: under normal circumstances this should only include tenants that were in some - // transient state before we restarted, or any tenants whose compute hooks failed above. - let reconcile_tasks = self.reconcile_all(); - // We will not wait for these reconciliation tasks to run here: we're now done with startup and - // normal operations may proceed. - - tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)"); + attempt_shards + .difference(&success_shards) + .cloned() + .collect() } /// Long running background task that periodically wakes up and looks for shards that need From e0af945f8f552c546bc114edce10aff35d990b5b Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 21 Feb 2024 10:04:09 +0000 Subject: [PATCH 0231/1571] proxy: improve error classification (#6841) ## Problem ## Summary of changes 1. Classify further cplane API errors 2. add 'serviceratelimit' and make a few of the timeout errors return that. 3. 
a few additional minor changes --- proxy/src/bin/pg_sni_router.rs | 14 +++----------- proxy/src/console/provider.rs | 18 +++++++++++++++++- proxy/src/context.rs | 12 +++++------- proxy/src/error.rs | 26 +++++--------------------- proxy/src/serverless/sql_over_http.rs | 19 +++++++------------ 5 files changed, 37 insertions(+), 52 deletions(-) diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 43b805e8a1..5024ba3744 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -171,16 +171,8 @@ async fn task_main( .context("failed to set socket option")?; info!(%peer_addr, "serving"); - let mut ctx = - RequestMonitoring::new(session_id, peer_addr.ip(), "sni_router", "sni"); - handle_client( - &mut ctx, - dest_suffix, - tls_config, - tls_server_end_point, - socket, - ) - .await + let ctx = RequestMonitoring::new(session_id, peer_addr.ip(), "sni_router", "sni"); + handle_client(ctx, dest_suffix, tls_config, tls_server_end_point, socket).await } .unwrap_or_else(|e| { // Acknowledge that the task has finished with an error. @@ -248,7 +240,7 @@ async fn ssl_handshake( } async fn handle_client( - ctx: &mut RequestMonitoring, + mut ctx: RequestMonitoring, dest_suffix: Arc, tls_config: Arc, tls_server_end_point: TlsServerEndPoint, diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 640444d14e..0b74cd90cc 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -87,6 +87,22 @@ pub mod errors { impl ReportableError for ApiError { fn get_error_kind(&self) -> crate::error::ErrorKind { match self { + ApiError::Console { + status: http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE, + .. + } => crate::error::ErrorKind::User, + ApiError::Console { + status: http::StatusCode::LOCKED, + text, + } if text.contains("quota exceeded") + || text.contains("the limit for current plan reached") => + { + crate::error::ErrorKind::User + } + ApiError::Console { + status: http::StatusCode::TOO_MANY_REQUESTS, + .. + } => crate::error::ErrorKind::ServiceRateLimit, ApiError::Console { .. } => crate::error::ErrorKind::ControlPlane, ApiError::Transport(_) => crate::error::ErrorKind::ControlPlane, } @@ -222,7 +238,7 @@ pub mod errors { match self { WakeComputeError::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane, WakeComputeError::ApiError(e) => e.get_error_kind(), - WakeComputeError::TimeoutError => crate::error::ErrorKind::RateLimit, + WakeComputeError::TimeoutError => crate::error::ErrorKind::ServiceRateLimit, } } } diff --git a/proxy/src/context.rs b/proxy/src/context.rs index 0cea53ae63..e5caa5bd59 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -147,15 +147,13 @@ impl RequestMonitoring { self.success = true; } - pub fn log(&mut self) { + pub fn log(self) {} +} + +impl Drop for RequestMonitoring { + fn drop(&mut self) { if let Some(tx) = self.sender.take() { let _: Result<(), _> = tx.send(self.clone()); } } } - -impl Drop for RequestMonitoring { - fn drop(&mut self) { - self.log() - } -} diff --git a/proxy/src/error.rs b/proxy/src/error.rs index 69fe1ebc12..4614f3913d 100644 --- a/proxy/src/error.rs +++ b/proxy/src/error.rs @@ -37,9 +37,12 @@ pub enum ErrorKind { /// Network error between user and proxy. 
Not necessarily user error ClientDisconnect, - /// Proxy self-imposed rate limits + /// Proxy self-imposed user rate limits RateLimit, + /// Proxy self-imposed service-wise rate limits + ServiceRateLimit, + /// internal errors Service, @@ -54,25 +57,12 @@ pub enum ErrorKind { } impl ErrorKind { - pub fn to_str(&self) -> &'static str { - match self { - ErrorKind::User => "request failed due to user error", - ErrorKind::ClientDisconnect => "client disconnected", - ErrorKind::RateLimit => "request cancelled due to rate limit", - ErrorKind::Service => "internal service error", - ErrorKind::ControlPlane => "non-retryable control plane error", - ErrorKind::Postgres => "postgres error", - ErrorKind::Compute => { - "non-retryable compute connection error (or exhausted retry capacity)" - } - } - } - pub fn to_metric_label(&self) -> &'static str { match self { ErrorKind::User => "user", ErrorKind::ClientDisconnect => "clientdisconnect", ErrorKind::RateLimit => "ratelimit", + ErrorKind::ServiceRateLimit => "serviceratelimit", ErrorKind::Service => "service", ErrorKind::ControlPlane => "controlplane", ErrorKind::Postgres => "postgres", @@ -85,12 +75,6 @@ pub trait ReportableError: fmt::Display + Send + 'static { fn get_error_kind(&self) -> ErrorKind; } -impl ReportableError for tokio::time::error::Elapsed { - fn get_error_kind(&self) -> ErrorKind { - ErrorKind::RateLimit - } -} - impl ReportableError for tokio_postgres::error::Error { fn get_error_kind(&self) -> ErrorKind { if self.as_db_error().is_some() { diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index e49c1c4db9..63fe87eade 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -12,7 +12,7 @@ use hyper::StatusCode; use hyper::{Body, HeaderMap, Request}; use serde_json::json; use serde_json::Value; -use tokio::join; +use tokio::try_join; use tokio_postgres::error::DbError; use tokio_postgres::error::ErrorPosition; use tokio_postgres::GenericClient; @@ -32,11 +32,9 @@ use crate::auth::ComputeUserInfoParseError; use crate::config::ProxyConfig; use crate::config::TlsConfig; use crate::context::RequestMonitoring; -use crate::error::ReportableError; use crate::metrics::HTTP_CONTENT_LENGTH; use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE; use crate::proxy::NeonOptions; -use crate::serverless::backend::HttpConnError; use crate::DbName; use crate::RoleName; @@ -287,8 +285,10 @@ pub async fn handle( )? 
} }, - Err(e) => { - ctx.set_error_kind(e.get_error_kind()); + Err(_) => { + // TODO: when http error classification is done, distinguish between + // timeout on sql vs timeout in proxy/cplane + // ctx.set_error_kind(crate::error::ErrorKind::RateLimit); let message = format!( "HTTP-Connection timed out, execution time exeeded {} seconds", @@ -402,16 +402,11 @@ async fn handle_inner( // not strictly necessary to mark success here, // but it's just insurance for if we forget it somewhere else ctx.latency_timer.success(); - Ok::<_, HttpConnError>(client) + Ok::<_, anyhow::Error>(client) }; // Run both operations in parallel - let (payload_result, auth_and_connect_result) = - join!(fetch_and_process_request, authenticate_and_connect,); - - // Handle the results - let payload = payload_result?; // Handle errors appropriately - let mut client = auth_and_connect_result?; // Handle errors appropriately + let (payload, mut client) = try_join!(fetch_and_process_request, authenticate_and_connect)?; let mut response = Response::builder() .status(StatusCode::OK) From 428d9fe69e5e70fb2f633117c2137146dfb3d42b Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 21 Feb 2024 14:36:57 +0200 Subject: [PATCH 0232/1571] tests: Make test_vm_bit_clear_on_heap_lock more robust again. (#6714) When checking that the contents of the VM page in cache and in pageserver match, ignore the LSN on the page. It could be different, if the page was flushed from cache by a checkpoint, for example. Here's one such failure from the CI that this hopefully fixes: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-6687/7847132649/index.html#suites/8545ca7650e609b2963d4035816a356b/5f9018db15ef4408/ In the passing, also remove some log.infos from the loop. I added them while developing the tests, but now they're just noise. --- test_runner/regress/test_vm_bits.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py index 1377bed6f6..eff103ca09 100644 --- a/test_runner/regress/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -167,10 +167,14 @@ def test_vm_bit_clear_on_heap_lock(neon_env_builder: NeonEnvBuilder): # The VM page in shared buffer cache, and the same page as reconstructed # by the pageserver, should be equal. + # + # Ignore the LSN on the page though (first 8 bytes). If the dirty + # VM page is flushed from the cache for some reason, it gets WAL-logged, + # which changes the LSN on the page. 
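+    # (The first 8 bytes of a Postgres page are the page header's pd_lsn field,
+    # which is why the comparisons below slice [8:100].)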
cur.execute("select get_raw_page( 'vmtest_lock', 'vm', 0 )") - vm_page_in_cache = (cur.fetchall()[0][0])[:100].hex() + vm_page_in_cache = (cur.fetchall()[0][0])[8:100].hex() cur.execute("select get_raw_page_at_lsn( 'vmtest_lock', 'vm', 0, pg_current_wal_insert_lsn() )") - vm_page_at_pageserver = (cur.fetchall()[0][0])[:100].hex() + vm_page_at_pageserver = (cur.fetchall()[0][0])[8:100].hex() assert vm_page_at_pageserver == vm_page_in_cache @@ -201,16 +205,6 @@ def test_vm_bit_clear_on_heap_lock(neon_env_builder: NeonEnvBuilder): for _ in range(1000): cur.execute("select test_consume_xids(10000);") for _ in range(1000): - cur.execute( - "select get_raw_page_at_lsn( 'vmtest_lock', 'vm', 0, pg_current_wal_insert_lsn() )" - ) - page = (cur.fetchall()[0][0])[:100].hex() - log.info(f"VM page contents: {page}") - - cur.execute("select get_raw_page( 'vmtest_lock', 'vm', 0 )") - page = (cur.fetchall()[0][0])[:100].hex() - log.info(f"VM page contents in cache: {page}") - cur.execute("select min(datfrozenxid::text::int) from pg_database") datfrozenxid = int(cur.fetchall()[0][0]) log.info(f"datfrozenxid {datfrozenxid} locking_xid: {locking_xid}") From 84f027357d425110a50657f13b34a0c602111050 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 21 Feb 2024 14:12:35 +0000 Subject: [PATCH 0233/1571] pageserver: adjust checkpoint distance for sharded tenants (#6852) ## Problem Where the stripe size is the same order of magnitude as the checkpoint distance (such as with default settings), tenant shards can easily pass through `checkpoint_distance` bytes of LSN without actually ingesting anything. This results in emitting many tiny L0 delta layers. ## Summary of changes - Multiply checkpoint distance by shard count before comparing with LSN distance. This is a heuristic and does not guarantee that we won't emit small layers, but it fixes the issue for typical cases where the writes in a (checkpoint_distance * shard_count) range of LSN bytes are somewhat distributed across shards. - Add a test that checks the size of layers after ingesting to a sharded tenant; this fails before the fix. --------- Co-authored-by: Joonas Koivunen --- pageserver/src/tenant/timeline.rs | 8 ++- test_runner/fixtures/workload.py | 11 ++-- test_runner/regress/test_sharding.py | 83 +++++++++++++++++++++++++++- 3 files changed, 95 insertions(+), 7 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 0f22284c55..6ee05116f8 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -5192,11 +5192,15 @@ impl<'a> TimelineWriter<'a> { // Rolling the open layer can be triggered by: // 1. The distance from the last LSN we rolled at. This bounds the amount of WAL that - // the safekeepers need to store. + // the safekeepers need to store. For sharded tenants, we multiply by shard count to + // account for how writes are distributed across shards: we expect each node to consume + // 1/count of the LSN on average. // 2. The size of the currently open layer. // 3. The time since the last roll. It helps safekeepers to regard pageserver as caught // up and suspend activity. 
- if distance >= self.get_checkpoint_distance().into() { + if distance + >= self.get_checkpoint_distance() as i128 * self.shard_identity.count.count() as i128 + { info!( "Will roll layer at {} with layer size {} due to LSN distance ({})", lsn, state.current_size, distance diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py index f29a6cbf3c..1d5394dc1d 100644 --- a/test_runner/fixtures/workload.py +++ b/test_runner/fixtures/workload.py @@ -73,7 +73,7 @@ class Workload: self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id ) - def write_rows(self, n, pageserver_id: Optional[int] = None): + def write_rows(self, n, pageserver_id: Optional[int] = None, upload: bool = True): endpoint = self.endpoint(pageserver_id) start = self.expect_rows end = start + n - 1 @@ -87,9 +87,12 @@ class Workload: """ ) - return last_flush_lsn_upload( - self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id - ) + if upload: + return last_flush_lsn_upload( + self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id + ) + else: + return False def churn_rows(self, n, pageserver_id: Optional[int] = None, upload=True): assert self.expect_rows >= n diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 99b2ceb8bc..9e491d450c 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -4,7 +4,7 @@ from fixtures.neon_fixtures import ( tenant_get_shards, ) from fixtures.remote_storage import s3_storage -from fixtures.types import TenantShardId, TimelineId +from fixtures.types import Lsn, TenantShardId, TimelineId from fixtures.workload import Workload @@ -284,3 +284,84 @@ def test_sharding_split_smoke( ) env.attachment_service.consistency_check() + + +def test_sharding_ingest( + neon_env_builder: NeonEnvBuilder, +): + """ + Check behaviors related to ingest: + - That we generate properly sized layers + - TODO: that updates to remote_consistent_lsn are made correctly via safekeepers + """ + + # Set a small stripe size and checkpoint distance, so that we can exercise rolling logic + # without writing a lot of data. + expect_layer_size = 131072 + TENANT_CONF = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": f"{expect_layer_size}", + "compaction_target_size": f"{expect_layer_size}", + } + shard_count = 4 + neon_env_builder.num_pageservers = shard_count + env = neon_env_builder.init_start( + initial_tenant_conf=TENANT_CONF, + initial_tenant_shard_count=shard_count, + # A stripe size the same order of magnitude as layer size: this ensures that + # within checkpoint_distance some shards will have no data to ingest, if LSN + # contains sequential page writes. This test checks that this kind of + # scenario doesn't result in some shards emitting empty/tiny layers. + initial_tenant_shard_stripe_size=expect_layer_size // 8192, + ) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(512, upload=False) + workload.write_rows(512, upload=False) + workload.write_rows(512, upload=False) + workload.write_rows(512, upload=False) + workload.validate() + + small_layer_count = 0 + ok_layer_count = 0 + huge_layer_count = 0 + + # Inspect the resulting layer map, count how many layers are undersized. 
+ for shard in env.attachment_service.locate(tenant_id): + pageserver = env.get_pageserver(shard["node_id"]) + shard_id = shard["shard_id"] + layer_map = pageserver.http_client().layer_map_info(shard_id, timeline_id) + + for layer in layer_map.historic_layers: + assert layer.layer_file_size is not None + if layer.layer_file_size < expect_layer_size // 2: + classification = "Small" + small_layer_count += 1 + elif layer.layer_file_size > expect_layer_size * 2: + classification = "Huge " + huge_layer_count += 1 + else: + classification = "OK " + ok_layer_count += 1 + + if layer.kind == "Delta": + assert layer.lsn_end is not None + lsn_size = Lsn(layer.lsn_end) - Lsn(layer.lsn_start) + else: + lsn_size = 0 + + log.info( + f"{classification} layer[{pageserver.id}]: {layer.layer_file_name} (size {layer.layer_file_size}, LSN distance {lsn_size})" + ) + + # Why an inexact check? + # - Because we roll layers on checkpoint_distance * shard_count, we expect to obey the target + # layer size on average, but it is still possible to write some tiny layers. + log.info(f"Totals: {small_layer_count} small layers, {ok_layer_count} ok layers") + assert float(small_layer_count) / float(ok_layer_count) < 0.25 + + # Each shard may emit up to one huge layer, because initdb ingest doesn't respect checkpoint_distance. + assert huge_layer_count <= shard_count From 7257ffbf75d8dd75f5e1bd5cf2b3f4a06555cde9 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 21 Feb 2024 16:57:30 +0200 Subject: [PATCH 0234/1571] feat: imitiation_only eviction_task policy (#6598) mostly reusing the existing and perhaps controversially sharing the histogram. in practice we don't configure this per-tenant. Cc: #5331 --- libs/pageserver_api/src/models.rs | 2 + pageserver/src/config.rs | 41 +++++- .../src/tenant/timeline/eviction_task.rs | 119 +++++++++++------- 3 files changed, 116 insertions(+), 46 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 557a4d7de9..af3c8018c4 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -291,6 +291,7 @@ pub struct TenantConfig { pub enum EvictionPolicy { NoEviction, LayerAccessThreshold(EvictionPolicyLayerAccessThreshold), + OnlyImitiate(EvictionPolicyLayerAccessThreshold), } impl EvictionPolicy { @@ -298,6 +299,7 @@ impl EvictionPolicy { match self { EvictionPolicy::NoEviction => "NoEviction", EvictionPolicy::LayerAccessThreshold(_) => "LayerAccessThreshold", + EvictionPolicy::OnlyImitiate(_) => "OnlyImitiate", } } } diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 6c00c55f39..34d9636673 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -1572,17 +1572,50 @@ threshold = "20m" eviction_order: crate::disk_usage_eviction_task::EvictionOrder::AbsoluteAccessed, }) ); + match &conf.default_tenant_conf.eviction_policy { - EvictionPolicy::NoEviction => panic!("Unexpected eviction opolicy tenant settings"), - EvictionPolicy::LayerAccessThreshold(eviction_thresold) => { - assert_eq!(eviction_thresold.period, Duration::from_secs(20 * 60)); - assert_eq!(eviction_thresold.threshold, Duration::from_secs(20 * 60)); + EvictionPolicy::LayerAccessThreshold(eviction_threshold) => { + assert_eq!(eviction_threshold.period, Duration::from_secs(20 * 60)); + assert_eq!(eviction_threshold.threshold, Duration::from_secs(20 * 60)); } + other => unreachable!("Unexpected eviction policy tenant settings: {other:?}"), } Ok(()) } + #[test] + fn parse_imitation_only_pageserver_config() { + let 
tempdir = tempdir().unwrap(); + let (workdir, pg_distrib_dir) = prepare_fs(&tempdir).unwrap(); + + let pageserver_conf_toml = format!( + r#"pg_distrib_dir = "{pg_distrib_dir}" +metric_collection_endpoint = "http://sample.url" +metric_collection_interval = "10min" +id = 222 + +[tenant_config] +evictions_low_residence_duration_metric_threshold = "20m" + +[tenant_config.eviction_policy] +kind = "OnlyImitiate" +period = "20m" +threshold = "20m" +"#, + ); + let toml: Document = pageserver_conf_toml.parse().unwrap(); + let conf = PageServerConf::parse_and_validate(&toml, &workdir).unwrap(); + + match &conf.default_tenant_conf.eviction_policy { + EvictionPolicy::OnlyImitiate(t) => { + assert_eq!(t.period, Duration::from_secs(20 * 60)); + assert_eq!(t.threshold, Duration::from_secs(20 * 60)); + } + other => unreachable!("Unexpected eviction policy tenant settings: {other:?}"), + } + } + fn prepare_fs(tempdir: &Utf8TempDir) -> anyhow::Result<(Utf8PathBuf, Utf8PathBuf)> { let tempdir_path = tempdir.path(); diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 33ba234a63..127e351c14 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -85,6 +85,7 @@ impl Timeline { let policy = self.get_eviction_policy(); let period = match policy { EvictionPolicy::LayerAccessThreshold(lat) => lat.period, + EvictionPolicy::OnlyImitiate(lat) => lat.period, EvictionPolicy::NoEviction => Duration::from_secs(10), }; if random_init_delay(period, &cancel).await.is_err() { @@ -119,33 +120,45 @@ impl Timeline { ctx: &RequestContext, ) -> ControlFlow<(), Instant> { debug!("eviction iteration: {policy:?}"); - match policy { + let start = Instant::now(); + let (period, threshold) = match policy { EvictionPolicy::NoEviction => { // check again in 10 seconds; XXX config watch mechanism - ControlFlow::Continue(Instant::now() + Duration::from_secs(10)) + return ControlFlow::Continue(Instant::now() + Duration::from_secs(10)); } EvictionPolicy::LayerAccessThreshold(p) => { - let start = Instant::now(); match self.eviction_iteration_threshold(p, cancel, ctx).await { ControlFlow::Break(()) => return ControlFlow::Break(()), ControlFlow::Continue(()) => (), } - let elapsed = start.elapsed(); - crate::tenant::tasks::warn_when_period_overrun( - elapsed, - p.period, - BackgroundLoopKind::Eviction, - ); - crate::metrics::EVICTION_ITERATION_DURATION - .get_metric_with_label_values(&[ - &format!("{}", p.period.as_secs()), - &format!("{}", p.threshold.as_secs()), - ]) - .unwrap() - .observe(elapsed.as_secs_f64()); - ControlFlow::Continue(start + p.period) + (p.period, p.threshold) } - } + EvictionPolicy::OnlyImitiate(p) => { + if self.imitiate_only(p, cancel, ctx).await.is_break() { + return ControlFlow::Break(()); + } + (p.period, p.threshold) + } + }; + + let elapsed = start.elapsed(); + crate::tenant::tasks::warn_when_period_overrun( + elapsed, + period, + BackgroundLoopKind::Eviction, + ); + // FIXME: if we were to mix policies on a pageserver, we would have no way to sense this. I + // don't think that is a relevant fear however, and regardless the imitation should be the + // most costly part. 
+ crate::metrics::EVICTION_ITERATION_DURATION + .get_metric_with_label_values(&[ + &format!("{}", period.as_secs()), + &format!("{}", threshold.as_secs()), + ]) + .unwrap() + .observe(elapsed.as_secs_f64()); + + ControlFlow::Continue(start + period) } async fn eviction_iteration_threshold( @@ -167,30 +180,6 @@ impl Timeline { _ = self.cancel.cancelled() => return ControlFlow::Break(()), }; - // If we evict layers but keep cached values derived from those layers, then - // we face a storm of on-demand downloads after pageserver restart. - // The reason is that the restart empties the caches, and so, the values - // need to be re-computed by accessing layers, which we evicted while the - // caches were filled. - // - // Solutions here would be one of the following: - // 1. Have a persistent cache. - // 2. Count every access to a cached value to the access stats of all layers - // that were accessed to compute the value in the first place. - // 3. Invalidate the caches at a period of < p.threshold/2, so that the values - // get re-computed from layers, thereby counting towards layer access stats. - // 4. Make the eviction task imitate the layer accesses that typically hit caches. - // - // We follow approach (4) here because in Neon prod deployment: - // - page cache is quite small => high churn => low hit rate - // => eviction gets correct access stats - // - value-level caches such as logical size & repatition have a high hit rate, - // especially for inactive tenants - // => eviction sees zero accesses for these - // => they cause the on-demand download storm on pageserver restart - // - // We should probably move to persistent caches in the future, or avoid - // having inactive tenants attached to pageserver in the first place. match self.imitate_layer_accesses(p, cancel, ctx).await { ControlFlow::Break(()) => return ControlFlow::Break(()), ControlFlow::Continue(()) => (), @@ -307,6 +296,52 @@ impl Timeline { ControlFlow::Continue(()) } + /// Like `eviction_iteration_threshold`, but without any eviction. Eviction will be done by + /// disk usage based eviction task. + async fn imitiate_only( + self: &Arc, + p: &EvictionPolicyLayerAccessThreshold, + cancel: &CancellationToken, + ctx: &RequestContext, + ) -> ControlFlow<()> { + let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit( + BackgroundLoopKind::Eviction, + ctx, + ); + + let _permit = tokio::select! { + permit = acquire_permit => permit, + _ = cancel.cancelled() => return ControlFlow::Break(()), + _ = self.cancel.cancelled() => return ControlFlow::Break(()), + }; + + self.imitate_layer_accesses(p, cancel, ctx).await + } + + /// If we evict layers but keep cached values derived from those layers, then + /// we face a storm of on-demand downloads after pageserver restart. + /// The reason is that the restart empties the caches, and so, the values + /// need to be re-computed by accessing layers, which we evicted while the + /// caches were filled. + /// + /// Solutions here would be one of the following: + /// 1. Have a persistent cache. + /// 2. Count every access to a cached value to the access stats of all layers + /// that were accessed to compute the value in the first place. + /// 3. Invalidate the caches at a period of < p.threshold/2, so that the values + /// get re-computed from layers, thereby counting towards layer access stats. + /// 4. Make the eviction task imitate the layer accesses that typically hit caches. 
+ /// + /// We follow approach (4) here because in Neon prod deployment: + /// - page cache is quite small => high churn => low hit rate + /// => eviction gets correct access stats + /// - value-level caches such as logical size & repatition have a high hit rate, + /// especially for inactive tenants + /// => eviction sees zero accesses for these + /// => they cause the on-demand download storm on pageserver restart + /// + /// We should probably move to persistent caches in the future, or avoid + /// having inactive tenants attached to pageserver in the first place. #[instrument(skip_all)] async fn imitate_layer_accesses( &self, From 41464325c7b84d90884dcff94d25551fbf03ecde Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 21 Feb 2024 17:20:59 +0200 Subject: [PATCH 0235/1571] fix: remaining missed cancellations and timeouts (#6843) As noticed in #6836 some occurances of error conversions were missed in #6697: - `std::io::Error` popped up by `tokio::io::copy_buf` containing `DownloadError` was turned into `DownloadError::Other` - similarly for secondary downloader errors These changes come at the loss of pathname context. Cc: #6096 --- libs/remote_storage/src/error.rs | 29 ++++++++++++++--- libs/remote_storage/src/support.rs | 8 ++++- .../tenant/remote_timeline_client/download.rs | 25 +++------------ pageserver/src/tenant/secondary/downloader.rs | 31 ++++++++++--------- 4 files changed, 52 insertions(+), 41 deletions(-) diff --git a/libs/remote_storage/src/error.rs b/libs/remote_storage/src/error.rs index 96f044e087..66422853e1 100644 --- a/libs/remote_storage/src/error.rs +++ b/libs/remote_storage/src/error.rs @@ -44,6 +44,26 @@ impl DownloadError { } } +impl From for DownloadError { + fn from(value: std::io::Error) -> Self { + let needs_unwrap = value.kind() == std::io::ErrorKind::Other + && value + .get_ref() + .and_then(|x| x.downcast_ref::()) + .is_some(); + + if needs_unwrap { + *value + .into_inner() + .expect("just checked") + .downcast::() + .expect("just checked") + } else { + DownloadError::Other(value.into()) + } + } +} + #[derive(Debug)] pub enum TimeTravelError { /// Validation or other error happened due to user input. @@ -142,13 +162,12 @@ impl std::fmt::Display for TimeoutOrCancel { impl std::error::Error for TimeoutOrCancel {} impl TimeoutOrCancel { - pub fn caused(error: &anyhow::Error) -> Option<&Self> { - error.root_cause().downcast_ref() - } - /// Returns true if the error was caused by [`TimeoutOrCancel::Cancel`]. pub fn caused_by_cancel(error: &anyhow::Error) -> bool { - Self::caused(error).is_some_and(Self::is_cancel) + error + .root_cause() + .downcast_ref::() + .is_some_and(Self::is_cancel) } pub fn is_cancel(&self) -> bool { diff --git a/libs/remote_storage/src/support.rs b/libs/remote_storage/src/support.rs index 20f193c6c8..d146b5445b 100644 --- a/libs/remote_storage/src/support.rs +++ b/libs/remote_storage/src/support.rs @@ -73,6 +73,8 @@ where if !*this.hit { if let Poll::Ready(e) = this.cancellation.poll(cx) { *this.hit = true; + + // most likely this will be a std::io::Error wrapping a DownloadError let e = Err(std::io::Error::from(e)); return Poll::Ready(Some(e)); } @@ -130,6 +132,8 @@ mod tests { .is_some_and(|e| matches!(e, DownloadError::Cancelled)), "{inner:?}" ); + let e = DownloadError::from(e); + assert!(matches!(e, DownloadError::Cancelled), "{e:?}"); tokio::select! 
{ _ = stream.next() => unreachable!("no timeout ever happens as we were already cancelled"), @@ -146,7 +150,7 @@ mod tests { let stream = DownloadStream::new(cancel_or_timeout(timeout, cancel.clone()), inner); let mut stream = std::pin::pin!(stream); - // because the stream uses 120s timeout we are paused, we advance to 120s right away. + // because the stream uses 120s timeout and we are paused, we advance to 120s right away. let first = stream.next(); let e = first.await.expect("there must be some").unwrap_err(); @@ -158,6 +162,8 @@ mod tests { .is_some_and(|e| matches!(e, DownloadError::Timeout)), "{inner:?}" ); + let e = DownloadError::from(e); + assert!(matches!(e, DownloadError::Timeout), "{e:?}"); cancel.cancel(); diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index c70267474e..962cf5d12e 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -88,14 +88,7 @@ pub async fn download_layer_file<'a>( let mut reader = tokio_util::io::StreamReader::new(download.download_stream); - let bytes_amount = tokio::io::copy_buf(&mut reader, &mut destination_file) - .await - .with_context(|| { - format!( - "download layer at remote path '{remote_path:?}' into file {temp_file_path:?}" - ) - }) - .map_err(DownloadError::Other); + let bytes_amount = tokio::io::copy_buf(&mut reader, &mut destination_file).await; match bytes_amount { Ok(bytes_amount) => { @@ -107,7 +100,7 @@ pub async fn download_layer_file<'a>( on_fatal_io_error(&e, &format!("Removing temporary file {temp_file_path}")); } - Err(e) + Err(e.into()) } } }, @@ -245,10 +238,7 @@ async fn do_download_index_part( let stream = download.download_stream; let mut stream = StreamReader::new(stream); - tokio::io::copy_buf(&mut stream, &mut bytes) - .await - .with_context(|| format!("download index part at {remote_path:?}")) - .map_err(DownloadError::Other)?; + tokio::io::copy_buf(&mut stream, &mut bytes).await?; Ok(bytes) }, @@ -428,14 +418,7 @@ pub(crate) async fn download_initdb_tar_zst( let mut download = tokio_util::io::StreamReader::new(download.download_stream); let mut writer = tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, file); - // TODO: this consumption of the response body should be subject to timeout + cancellation, but - // not without thinking carefully about how to recover safely from cancelling a write to - // local storage (e.g. by writing into a temp file as we do in download_layer) - // FIXME: flip the weird error wrapping - tokio::io::copy_buf(&mut download, &mut writer) - .await - .with_context(|| format!("download initdb.tar.zst at {remote_path:?}")) - .map_err(DownloadError::Other)?; + tokio::io::copy_buf(&mut download, &mut writer).await?; let mut file = writer.into_inner(); diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 6966cf7709..51ab421b58 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -438,8 +438,14 @@ impl From for UpdateError { fn from(value: std::io::Error) -> Self { if let Some(nix::errno::Errno::ENOSPC) = value.raw_os_error().map(nix::errno::from_i32) { UpdateError::NoSpace + } else if value + .get_ref() + .and_then(|x| x.downcast_ref::()) + .is_some() + { + UpdateError::from(DownloadError::from(value)) } else { - // An I/O error from e.g. 
tokio::io::copy is most likely a remote storage issue + // An I/O error from e.g. tokio::io::copy_buf is most likely a remote storage issue UpdateError::Other(anyhow::anyhow!(value)) } } @@ -672,20 +678,17 @@ impl<'a> TenantDownloader<'a> { .await { Ok(bytes) => bytes, - Err(e) => { - if let DownloadError::NotFound = e { - // A heatmap might be out of date and refer to a layer that doesn't exist any more. - // This is harmless: continue to download the next layer. It is expected during compaction - // GC. - tracing::debug!( - "Skipped downloading missing layer {}, raced with compaction/gc?", - layer.name - ); - continue; - } else { - return Err(e.into()); - } + Err(DownloadError::NotFound) => { + // A heatmap might be out of date and refer to a layer that doesn't exist any more. + // This is harmless: continue to download the next layer. It is expected during compaction + // GC. + tracing::debug!( + "Skipped downloading missing layer {}, raced with compaction/gc?", + layer.name + ); + continue; } + Err(e) => return Err(e.into()), }; if downloaded_bytes != layer.metadata.file_size { From 4de2f0f3e021cd7e84c5f8ef5251da13f0127c3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 21 Feb 2024 16:35:37 +0100 Subject: [PATCH 0236/1571] Implement a sharded time travel recovery endpoint (#6821) The sharding service didn't have support for S3 disaster recovery. This PR adds a new endpoint to the attachment service, which is slightly different from the endpoint on the pageserver, in that it takes the shard count history of the tenant as json parameters: we need to do time travel recovery for both the shard count at the target time and the shard count at the current moment in time, as well as the past shard counts that either still reference. 
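For illustration only, a call to the new attachment service endpoint could look like the sketch below. The HTTP method, the path, the `travel_to`/`done_if_after` query parameters (RFC 3339 timestamps) and the `shard_counts` body field are taken from this patch; the base URL, the port and the absence of an auth header are placeholder assumptions, and the tenant is expected to be fully detached before the call:

```python
# Hedged sketch, not part of the patch: base_url and the missing
# Authorization header are assumptions; only the endpoint shape
# (method, path, query params, JSON body) comes from this change.
import requests

def tenant_time_travel(base_url, tenant_id, travel_to, done_if_after, shard_counts):
    resp = requests.put(
        f"{base_url}/v1/tenant/{tenant_id}/time_travel_remote_storage",
        params={"travel_to": travel_to, "done_if_after": done_if_after},  # RFC 3339 timestamps
        json={"shard_counts": shard_counts},  # every shard count the tenant has ever used
    )
    resp.raise_for_status()

# e.g. a tenant that has only ever run with 2 shards:
# tenant_time_travel("http://127.0.0.1:1234", tenant_id,
#                    "2024-02-21T12:00:00Z", "2024-02-21T12:05:00Z", [2])
```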
Fixes #6604, part of https://github.com/neondatabase/cloud/issues/8233 --------- Co-authored-by: John Spray --- Cargo.lock | 1 + control_plane/attachment_service/Cargo.toml | 1 + control_plane/attachment_service/src/http.rs | 40 +++++- .../attachment_service/src/scheduler.rs | 5 +- .../attachment_service/src/service.rs | 92 +++++++++++- .../attachment_service/src/tenant_state.rs | 7 + control_plane/src/bin/neon_local.rs | 2 +- libs/pageserver_api/src/models.rs | 8 +- pageserver/client/src/mgmt_api.rs | 14 ++ test_runner/fixtures/neon_fixtures.py | 4 +- test_runner/fixtures/pageserver/http.py | 10 +- test_runner/fixtures/pageserver/utils.py | 4 +- test_runner/regress/test_sharding_service.py | 133 +++++++++++++++++- 13 files changed, 304 insertions(+), 17 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ac8cceb5f6..51c433cd07 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -284,6 +284,7 @@ dependencies = [ "diesel_migrations", "futures", "git-version", + "humantime", "hyper", "metrics", "once_cell", diff --git a/control_plane/attachment_service/Cargo.toml b/control_plane/attachment_service/Cargo.toml index 9e1c6377ee..bfdfd4c77d 100644 --- a/control_plane/attachment_service/Cargo.toml +++ b/control_plane/attachment_service/Cargo.toml @@ -18,6 +18,7 @@ clap.workspace = true futures.workspace = true git-version.workspace = true hyper.workspace = true +humantime.workspace = true once_cell.workspace = true pageserver_api.workspace = true pageserver_client.workspace = true diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs index 67ab37dfc1..d85753bedc 100644 --- a/control_plane/attachment_service/src/http.rs +++ b/control_plane/attachment_service/src/http.rs @@ -4,7 +4,7 @@ use hyper::{Body, Request, Response}; use hyper::{StatusCode, Uri}; use pageserver_api::models::{ TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest, - TimelineCreateRequest, + TenantTimeTravelRequest, TimelineCreateRequest, }; use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api; @@ -12,7 +12,7 @@ use std::sync::Arc; use std::time::{Duration, Instant}; use utils::auth::SwappableJwtAuth; use utils::http::endpoint::{auth_middleware, request_span}; -use utils::http::request::parse_request_param; +use utils::http::request::{must_get_query_param, parse_request_param}; use utils::id::{TenantId, TimelineId}; use utils::{ @@ -180,6 +180,39 @@ async fn handle_tenant_location_config( ) } +async fn handle_tenant_time_travel_remote_storage( + service: Arc, + mut req: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let time_travel_req = json_request::(&mut req).await?; + + let timestamp_raw = must_get_query_param(&req, "travel_to")?; + let _timestamp = humantime::parse_rfc3339(×tamp_raw).map_err(|_e| { + ApiError::BadRequest(anyhow::anyhow!( + "Invalid time for travel_to: {timestamp_raw:?}" + )) + })?; + + let done_if_after_raw = must_get_query_param(&req, "done_if_after")?; + let _done_if_after = humantime::parse_rfc3339(&done_if_after_raw).map_err(|_e| { + ApiError::BadRequest(anyhow::anyhow!( + "Invalid time for done_if_after: {done_if_after_raw:?}" + )) + })?; + + service + .tenant_time_travel_remote_storage( + &time_travel_req, + tenant_id, + timestamp_raw, + done_if_after_raw, + ) + .await?; + + json_response(StatusCode::OK, ()) +} + async fn handle_tenant_delete( service: Arc, req: Request, @@ -477,6 +510,9 @@ pub fn make_router( .put("/v1/tenant/:tenant_id/location_config", 
|r| { tenant_service_handler(r, handle_tenant_location_config) }) + .put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| { + tenant_service_handler(r, handle_tenant_time_travel_remote_storage) + }) // Timeline operations .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| { tenant_service_handler(r, handle_tenant_timeline_delete) diff --git a/control_plane/attachment_service/src/scheduler.rs b/control_plane/attachment_service/src/scheduler.rs index 39d8d0a260..fb3c7f634c 100644 --- a/control_plane/attachment_service/src/scheduler.rs +++ b/control_plane/attachment_service/src/scheduler.rs @@ -175,10 +175,7 @@ impl Scheduler { } } - pub(crate) fn schedule_shard( - &mut self, - hard_exclude: &[NodeId], - ) -> Result { + pub(crate) fn schedule_shard(&self, hard_exclude: &[NodeId]) -> Result { if self.nodes.is_empty() { return Err(ScheduleError::NoPageservers); } diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index 0236496c61..74e1296709 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -1,4 +1,5 @@ use std::{ + borrow::Cow, cmp::Ordering, collections::{BTreeMap, HashMap, HashSet}, str::FromStr, @@ -25,7 +26,7 @@ use pageserver_api::{ self, LocationConfig, LocationConfigListResponse, LocationConfigMode, ShardParameters, TenantConfig, TenantCreateRequest, TenantLocationConfigRequest, TenantLocationConfigResponse, TenantShardLocation, TenantShardSplitRequest, - TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo, + TenantShardSplitResponse, TenantTimeTravelRequest, TimelineCreateRequest, TimelineInfo, }, shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId}, }; @@ -1329,6 +1330,95 @@ impl Service { Ok(result) } + pub(crate) async fn tenant_time_travel_remote_storage( + &self, + time_travel_req: &TenantTimeTravelRequest, + tenant_id: TenantId, + timestamp: Cow<'_, str>, + done_if_after: Cow<'_, str>, + ) -> Result<(), ApiError> { + let node = { + let locked = self.inner.read().unwrap(); + // Just a sanity check to prevent misuse: the API expects that the tenant is fully + // detached everywhere, and nothing writes to S3 storage. Here, we verify that, + // but only at the start of the process, so it's really just to prevent operator + // mistakes. 
+ for (shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id)) { + if shard.intent.get_attached().is_some() || !shard.intent.get_secondary().is_empty() + { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "We want tenant to be attached in shard with tenant_shard_id={shard_id}" + ))); + } + let maybe_attached = shard + .observed + .locations + .iter() + .filter_map(|(node_id, observed_location)| { + observed_location + .conf + .as_ref() + .map(|loc| (node_id, observed_location, loc.mode)) + }) + .find(|(_, _, mode)| *mode != LocationConfigMode::Detached); + if let Some((node_id, _observed_location, mode)) = maybe_attached { + return Err(ApiError::InternalServerError(anyhow::anyhow!("We observed attached={mode:?} tenant in node_id={node_id} shard with tenant_shard_id={shard_id}"))); + } + } + let scheduler = &locked.scheduler; + // Right now we only perform the operation on a single node without parallelization + // TODO fan out the operation to multiple nodes for better performance + let node_id = scheduler.schedule_shard(&[])?; + let node = locked + .nodes + .get(&node_id) + .expect("Pageservers may not be deleted while lock is active"); + node.clone() + }; + + // The shard count is encoded in the remote storage's URL, so we need to handle all historically used shard counts + let mut counts = time_travel_req + .shard_counts + .iter() + .copied() + .collect::>() + .into_iter() + .collect::>(); + counts.sort_unstable(); + + for count in counts { + let shard_ids = (0..count.count()) + .map(|i| TenantShardId { + tenant_id, + shard_number: ShardNumber(i), + shard_count: count, + }) + .collect::>(); + for tenant_shard_id in shard_ids { + let client = + mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref()); + + tracing::info!("Doing time travel recovery for shard {tenant_shard_id}",); + + client + .tenant_time_travel_remote_storage( + tenant_shard_id, + ×tamp, + &done_if_after, + ) + .await + .map_err(|e| { + ApiError::InternalServerError(anyhow::anyhow!( + "Error doing time travel recovery for shard {tenant_shard_id} on node {}: {e}", + node.id + )) + })?; + } + } + + Ok(()) + } + pub(crate) async fn tenant_delete(&self, tenant_id: TenantId) -> Result { self.ensure_attached_wait(tenant_id).await?; diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs index 4ec6fdca67..7970207e27 100644 --- a/control_plane/attachment_service/src/tenant_state.rs +++ b/control_plane/attachment_service/src/tenant_state.rs @@ -495,6 +495,13 @@ impl TenantState { } } + for node_id in self.observed.locations.keys() { + if self.intent.attached != Some(*node_id) && !self.intent.secondary.contains(node_id) { + // We have observed state that isn't part of our intent: need to clean it up. + return true; + } + } + // Even if there is no pageserver work to be done, if we have a pending notification to computes, // wake up a reconciler to send it. 
if self.pending_compute_notification { diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 5c0d008943..f824003d01 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -616,7 +616,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local let tenant_id = get_tenant_id(create_match, env)?; let new_branch_name = create_match .get_one::("branch-name") - .ok_or_else(|| anyhow!("No branch name provided"))?; + .ok_or_else(|| anyhow!("No branch name provided"))?; // TODO let pg_version = create_match .get_one::("pg-version") diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index af3c8018c4..b68ab9fd59 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -344,7 +344,7 @@ impl ThrottleConfig { /// A flattened analog of a `pagesever::tenant::LocationMode`, which /// lists out all possible states (and the virtual "Detached" state) /// in a flat form rather than using rust-style enums. -#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)] +#[derive(Serialize, Deserialize, Debug, Clone, Copy, Eq, PartialEq)] pub enum LocationConfigMode { AttachedSingle, AttachedMulti, @@ -408,6 +408,12 @@ pub struct TenantLocationConfigRequest { pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it } +#[derive(Serialize, Deserialize, Debug)] +#[serde(deny_unknown_fields)] +pub struct TenantTimeTravelRequest { + pub shard_counts: Vec, +} + #[derive(Serialize, Deserialize, Debug)] #[serde(deny_unknown_fields)] pub struct TenantShardLocation { diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index baea747d3c..969d0d99c0 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -217,6 +217,20 @@ impl Client { } } + pub async fn tenant_time_travel_remote_storage( + &self, + tenant_shard_id: TenantShardId, + timestamp: &str, + done_if_after: &str, + ) -> Result<()> { + let uri = format!( + "{}/v1/tenant/{tenant_shard_id}/time_travel_remote_storage?travel_to={timestamp}&done_if_after={done_if_after}", + self.mgmt_api_endpoint + ); + self.request(Method::PUT, &uri, ()).await?; + Ok(()) + } + pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> { let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint); self.request(Method::PUT, &uri, req).await?; diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index ce5ef66d22..79a4c7cde8 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -512,7 +512,7 @@ class NeonEnvBuilder: def init_start( self, - initial_tenant_conf: Optional[Dict[str, str]] = None, + initial_tenant_conf: Optional[Dict[str, Any]] = None, default_remote_storage_if_missing: bool = True, initial_tenant_shard_count: Optional[int] = None, initial_tenant_shard_stripe_size: Optional[int] = None, @@ -1497,7 +1497,7 @@ class NeonCli(AbstractNeonCli): self, tenant_id: Optional[TenantId] = None, timeline_id: Optional[TimelineId] = None, - conf: Optional[Dict[str, str]] = None, + conf: Optional[Dict[str, Any]] = None, shard_count: Optional[int] = None, shard_stripe_size: Optional[int] = None, set_default: bool = False, diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index d4583308ff..98eb89d30c 100644 --- a/test_runner/fixtures/pageserver/http.py +++ 
b/test_runner/fixtures/pageserver/http.py @@ -395,12 +395,20 @@ class PageserverHttpClient(requests.Session, MetricsGetter): tenant_id: Union[TenantId, TenantShardId], timestamp: datetime, done_if_after: datetime, + shard_counts: Optional[List[int]] = None, ): """ Issues a request to perform time travel operations on the remote storage """ + + if shard_counts is None: + shard_counts = [] + body: Dict[str, Any] = { + "shard_counts": shard_counts, + } res = self.put( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/time_travel_remote_storage?travel_to={timestamp.isoformat()}Z&done_if_after={done_if_after.isoformat()}Z" + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/time_travel_remote_storage?travel_to={timestamp.isoformat()}Z&done_if_after={done_if_after.isoformat()}Z", + json=body, ) self.verbose_error(res) diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 1812eb438d..225cfcd143 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -482,8 +482,8 @@ def tenant_delete_wait_completed( MANY_SMALL_LAYERS_TENANT_CONFIG = { "gc_period": "0s", "compaction_period": "0s", - "checkpoint_distance": f"{1024**2}", - "image_creation_threshold": "100", + "checkpoint_distance": 1024**2, + "image_creation_threshold": 100, } diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py index d2334c7776..6525f9733f 100644 --- a/test_runner/regress/test_sharding_service.py +++ b/test_runner/regress/test_sharding_service.py @@ -1,13 +1,30 @@ import time from collections import defaultdict +from datetime import datetime, timezone +from typing import List from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + PgBin, +) from fixtures.pageserver.http import PageserverHttpClient -from fixtures.pageserver.utils import tenant_delete_wait_completed, timeline_delete_wait_completed +from fixtures.pageserver.utils import ( + MANY_SMALL_LAYERS_TENANT_CONFIG, + enable_remote_storage_versioning, + list_prefix, + remote_storage_delete_key, + tenant_delete_wait_completed, + timeline_delete_wait_completed, +) from fixtures.pg_version import PgVersion +from fixtures.remote_storage import RemoteStorageKind, s3_storage from fixtures.types import TenantId, TimelineId -from fixtures.utils import wait_until +from fixtures.utils import run_pg_bench_small, wait_until +from mypy_boto3_s3.type_defs import ( + ObjectTypeDef, +) from pytest_httpserver import HTTPServer from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response @@ -457,3 +474,113 @@ def test_sharding_service_debug_apis(neon_env_builder: NeonEnvBuilder): # Check that the 'drop' APIs didn't leave things in a state that would fail a consistency check: they're # meant to be unclean wrt the pageserver state, but not leave a broken storage controller behind. 
env.attachment_service.consistency_check() + + +def test_sharding_service_s3_time_travel_recovery( + neon_env_builder: NeonEnvBuilder, + pg_bin: PgBin, +): + """ + Test for S3 time travel + """ + + remote_storage_kind = s3_storage() + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + + # Mock S3 doesn't have versioning enabled by default, enable it + # (also do it before there is any writes to the bucket) + if remote_storage_kind == RemoteStorageKind.MOCK_S3: + remote_storage = neon_env_builder.pageserver_remote_storage + assert remote_storage, "remote storage not configured" + enable_remote_storage_versioning(remote_storage) + + neon_env_builder.num_pageservers = 1 + + env = neon_env_builder.init_start() + virtual_ps_http = PageserverHttpClient(env.attachment_service_port, lambda: True) + + tenant_id = TenantId.generate() + env.attachment_service.tenant_create( + tenant_id, + shard_count=2, + shard_stripe_size=8192, + tenant_config=MANY_SMALL_LAYERS_TENANT_CONFIG, + ) + + # Check that the consistency check passes + env.attachment_service.consistency_check() + + branch_name = "main" + timeline_id = env.neon_cli.create_timeline( + branch_name, + tenant_id=tenant_id, + ) + # Write some nontrivial amount of data into the endpoint and wait until it is uploaded + with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: + run_pg_bench_small(pg_bin, endpoint.connstr()) + endpoint.safe_psql("CREATE TABLE created_foo(id integer);") + # last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) + + # Give the data time to be uploaded + time.sleep(4) + + # Detach the tenant + virtual_ps_http.tenant_location_conf( + tenant_id, + { + "mode": "Detached", + "secondary_conf": None, + "tenant_conf": {}, + "generation": None, + }, + ) + + time.sleep(4) + ts_before_disaster = datetime.now(tz=timezone.utc).replace(tzinfo=None) + time.sleep(4) + + # Simulate a "disaster": delete some random files from remote storage for one of the shards + assert env.pageserver_remote_storage + shard_id_for_list = "0002" + objects: List[ObjectTypeDef] = list_prefix( + env.pageserver_remote_storage, + f"tenants/{tenant_id}-{shard_id_for_list}/timelines/{timeline_id}/", + ).get("Contents", []) + assert len(objects) > 1 + log.info(f"Found {len(objects)} objects in remote storage") + should_delete = False + for obj in objects: + obj_key = obj["Key"] + should_delete = not should_delete + if not should_delete: + log.info(f"Keeping key on remote storage: {obj_key}") + continue + log.info(f"Deleting key from remote storage: {obj_key}") + remote_storage_delete_key(env.pageserver_remote_storage, obj_key) + pass + + time.sleep(4) + ts_after_disaster = datetime.now(tz=timezone.utc).replace(tzinfo=None) + time.sleep(4) + + # Do time travel recovery + virtual_ps_http.tenant_time_travel_remote_storage( + tenant_id, ts_before_disaster, ts_after_disaster, shard_counts=[2] + ) + time.sleep(4) + + # Attach the tenant again + virtual_ps_http.tenant_location_conf( + tenant_id, + { + "mode": "AttachedSingle", + "secondary_conf": None, + "tenant_conf": {}, + "generation": 100, + }, + ) + + with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: + endpoint.safe_psql("SELECT * FROM created_foo;") + + env.attachment_service.consistency_check() From 532b0fa52b950730d9cc9f7a0089b31f4fc1fa42 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 21 Feb 2024 15:45:22 +0000 Subject: [PATCH 0237/1571] Revise CODEOWNERS (#6840) ## Problem - Current file has ambiguous ownership for some paths - The 
/control_plane/attachment_service is storage specific & updates there don't need to request reviews from other teams. ## Summary of changes - Define a single owning team per path, so that we can make reviews by that team mandatory in future. - Remove the top-level /control_plane as no one specific team owns neon_local, and we would rarely see a PR that exclusively touches that path. - Add an entry for /control_plane/attachment_service, which is newer storage-specific code. --- CODEOWNERS | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CODEOWNERS b/CODEOWNERS index e384dc39f1..5b601f0566 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,10 +1,10 @@ /compute_tools/ @neondatabase/control-plane @neondatabase/compute -/control_plane/ @neondatabase/compute @neondatabase/storage -/libs/pageserver_api/ @neondatabase/compute @neondatabase/storage +/control_plane/attachment_service @neondatabase/storage +/libs/pageserver_api/ @neondatabase/storage /libs/postgres_ffi/ @neondatabase/compute /libs/remote_storage/ @neondatabase/storage /libs/safekeeper_api/ @neondatabase/safekeepers -/libs/vm_monitor/ @neondatabase/autoscaling @neondatabase/compute +/libs/vm_monitor/ @neondatabase/autoscaling /pageserver/ @neondatabase/storage /pgxn/ @neondatabase/compute /proxy/ @neondatabase/proxy From ce1673a8c46c2e61a7d5e8509ccc563c7fbd2a30 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 21 Feb 2024 16:00:17 +0000 Subject: [PATCH 0238/1571] tests: improve stability of tests using `wait_for_upload_queue_empty` (#6856) ## Problem PR #6834 introduced an assertion that the sets of metric labels on finished operations should equal those on started operations, which is not true if no operations have finished yet for a particular set of labels. ## Summary of changes - Instead of asserting out, wait and re-check in the case that finished metrics don't match started --- test_runner/fixtures/pageserver/utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 225cfcd143..1415038f69 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -219,6 +219,7 @@ def wait_for_last_record_lsn( def wait_for_upload_queue_empty( pageserver_http: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId ): + wait_period_secs = 0.2 while True: all_metrics = pageserver_http.get_metrics() started = all_metrics.query_all( @@ -235,7 +236,7 @@ def wait_for_upload_queue_empty( "timeline_id": str(timeline_id), }, ) - assert len(started) == len(finished) + # this is `started left join finished`; if match, subtracting start from finished, resulting in queue depth remaining_labels = ["shard_id", "file_kind", "op_kind"] tl: List[Tuple[Any, float]] = [] @@ -256,7 +257,7 @@ def wait_for_upload_queue_empty( log.info(f" {labels}: {queue_count}") if all(queue_count == 0 for (_, queue_count) in tl): return - time.sleep(0.2) + time.sleep(wait_period_secs) def wait_timeline_detail_404( From afda4420bd660eec1d53d4d9f6e1f1ecba86bfa9 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 21 Feb 2024 17:03:55 +0000 Subject: [PATCH 0239/1571] test_sharding_ingress: bigger data, skip in debug mode (#6859) ## Problem Accidentally merged #6852 without this test stability change. The test as-written could sometimes fail on debug-pg14. 
## Summary of changes - Write more data so that the test can more reliably assert on the ratio of total layers to small layers - Skip the test in debug mode, since writing any more than a tiny bit of data tends to result in a flaky test in the much slower debug environment. --- test_runner/regress/test_sharding.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 9e491d450c..5413b178a5 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -1,3 +1,6 @@ +import os + +import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, @@ -286,6 +289,12 @@ def test_sharding_split_smoke( env.attachment_service.consistency_check() +@pytest.mark.skipif( + # The quantity of data isn't huge, but debug can be _very_ slow, and the things we're + # validating in this test don't benefit much from debug assertions. + os.getenv("BUILD_TYPE") == "debug", + reason="Avoid running bulkier ingest tests in debug mode", +) def test_sharding_ingest( neon_env_builder: NeonEnvBuilder, ): @@ -319,10 +328,10 @@ def test_sharding_ingest( workload = Workload(env, tenant_id, timeline_id) workload.init() - workload.write_rows(512, upload=False) - workload.write_rows(512, upload=False) - workload.write_rows(512, upload=False) - workload.write_rows(512, upload=False) + workload.write_rows(4096, upload=False) + workload.write_rows(4096, upload=False) + workload.write_rows(4096, upload=False) + workload.write_rows(4096, upload=False) workload.validate() small_layer_count = 0 @@ -361,7 +370,12 @@ def test_sharding_ingest( # - Because we roll layers on checkpoint_distance * shard_count, we expect to obey the target # layer size on average, but it is still possible to write some tiny layers. log.info(f"Totals: {small_layer_count} small layers, {ok_layer_count} ok layers") - assert float(small_layer_count) / float(ok_layer_count) < 0.25 + if small_layer_count <= shard_count: + # If each shard has <= 1 small layer + pass + else: + # General case: + assert float(small_layer_count) / float(ok_layer_count) < 0.25 # Each shard may emit up to one huge layer, because initdb ingest doesn't respect checkpoint_distance. assert huge_layer_count <= shard_count From 60e5a56a5a08b72ffb11d4918b03f5a99ce6326f Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 21 Feb 2024 17:24:59 +0000 Subject: [PATCH 0240/1571] proxy: include client IP in ip deny message (#6854) ## Problem Debugging IP deny errors is difficult for our users ## Summary of changes Include the client IP in the deny message --- proxy/src/auth.rs | 17 +++++++++-------- proxy/src/auth/backend.rs | 2 +- proxy/src/serverless/backend.rs | 2 +- test_runner/regress/test_proxy_allowed_ips.py | 2 +- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index c8028d1bf0..8c44823c98 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -21,7 +21,7 @@ use crate::{ console, error::{ReportableError, UserFacingError}, }; -use std::io; +use std::{io, net::IpAddr}; use thiserror::Error; /// Convenience wrapper for the authentication error. @@ -62,10 +62,11 @@ pub enum AuthErrorImpl { Io(#[from] io::Error), #[error( - "This IP address is not allowed to connect to this endpoint. \ - Please add it to the allowed list in the Neon console." + "This IP address {0} is not allowed to connect to this endpoint. 
\ + Please add it to the allowed list in the Neon console. \ + Make sure to check for IPv4 or IPv6 addresses." )] - IpAddressNotAllowed, + IpAddressNotAllowed(IpAddr), #[error("Too many connections to this endpoint. Please try again later.")] TooManyConnections, @@ -87,8 +88,8 @@ impl AuthError { AuthErrorImpl::AuthFailed(user.into()).into() } - pub fn ip_address_not_allowed() -> Self { - AuthErrorImpl::IpAddressNotAllowed.into() + pub fn ip_address_not_allowed(ip: IpAddr) -> Self { + AuthErrorImpl::IpAddressNotAllowed(ip).into() } pub fn too_many_connections() -> Self { @@ -122,7 +123,7 @@ impl UserFacingError for AuthError { MalformedPassword(_) => self.to_string(), MissingEndpointName => self.to_string(), Io(_) => "Internal error".to_string(), - IpAddressNotAllowed => self.to_string(), + IpAddressNotAllowed(_) => self.to_string(), TooManyConnections => self.to_string(), UserTimeout(_) => self.to_string(), } @@ -141,7 +142,7 @@ impl ReportableError for AuthError { MalformedPassword(_) => crate::error::ErrorKind::User, MissingEndpointName => crate::error::ErrorKind::User, Io(_) => crate::error::ErrorKind::ClientDisconnect, - IpAddressNotAllowed => crate::error::ErrorKind::User, + IpAddressNotAllowed(_) => crate::error::ErrorKind::User, TooManyConnections => crate::error::ErrorKind::RateLimit, UserTimeout(_) => crate::error::ErrorKind::User, } diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 47c1dc4e92..5cb8074cd5 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -209,7 +209,7 @@ async fn auth_quirks( // check allowed list if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) { - return Err(auth::AuthError::ip_address_not_allowed()); + return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr)); } let cached_secret = match maybe_secret { Some(secret) => secret, diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 6f93f86d5f..2e63ad6c99 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -32,7 +32,7 @@ impl PoolingBackend { let backend = self.config.auth_backend.as_ref().map(|_| user_info.clone()); let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?; if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) { - return Err(AuthError::ip_address_not_allowed()); + return Err(AuthError::ip_address_not_allowed(ctx.peer_addr)); } let cached_secret = match maybe_secret { Some(secret) => secret, diff --git a/test_runner/regress/test_proxy_allowed_ips.py b/test_runner/regress/test_proxy_allowed_ips.py index f533579811..7a804114ba 100644 --- a/test_runner/regress/test_proxy_allowed_ips.py +++ b/test_runner/regress/test_proxy_allowed_ips.py @@ -24,7 +24,7 @@ async def test_proxy_psql_allowed_ips(static_proxy: NeonProxy, vanilla_pg: Vanil with pytest.raises(psycopg2.Error) as exprinfo: static_proxy.safe_psql(**kwargs) text = str(exprinfo.value).strip() - assert "This IP address is not allowed to connect" in text + assert "not allowed to connect" in text # no SNI, deprecated `options=project` syntax (before we had several endpoint in project) check_cannot_connect(query="select 1", sslsni=0, options="project=private-project") From 03f8a42ed9d5eba142c162000f69bef8bf239b70 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Wed, 21 Feb 2024 19:09:40 +0000 Subject: [PATCH 0241/1571] Add walsenders_keep_horizon option (#6860) Add `--walsenders-keep-horizon` argument to safekeeper cmdline. 
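In rough terms, the flag folds the most lagging active walsender's last-received LSN into the WAL removal horizon, which is otherwise the minimum of the peer-horizon LSN, the remote-consistent LSN and, when WAL backup is enabled, the backup LSN. A minimal sketch of that calculation (illustrative names, not the actual safekeeper code, which also converts the resulting LSN into a segment number):

```python
# Sketch only: mirrors the min() chain added to get_horizon_segno in this patch.
def wal_removal_horizon(remote_consistent_lsn, peer_horizon_lsn, backup_lsn,
                        wal_backup_enabled, laggard_walsender_lsn=None):
    horizon = min(remote_consistent_lsn, peer_horizon_lsn)
    if wal_backup_enabled:
        horizon = min(horizon, backup_lsn)
    if laggard_walsender_lsn is not None:  # only when --walsenders-keep-horizon is set
        horizon = min(horizon, laggard_walsender_lsn)
    return horizon  # WAL segments below this point may be removed from disk
```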
It will prevent deleting WAL segments from disk if they are needed by the active START_REPLICATION connection. This is useful for sharding. Without this option, if one of the shard falls behind, it starts to read WAL from S3, which is much slower than disk. This can result in huge shard lagging. --- safekeeper/src/bin/safekeeper.rs | 5 +++ safekeeper/src/lib.rs | 2 + safekeeper/src/safekeeper.rs | 20 +-------- safekeeper/src/send_wal.rs | 15 +++++++ safekeeper/src/timeline.rs | 43 ++++++++++++++++++- .../tests/walproposer_sim/safekeeper.rs | 1 + 6 files changed, 67 insertions(+), 19 deletions(-) diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 33047051df..3c4c81e499 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -166,6 +166,10 @@ struct Args { /// useful for debugging. #[arg(long)] current_thread_runtime: bool, + /// Keep horizon for walsenders, i.e. don't remove WAL segments that are + /// still needed for existing replication connection. + #[arg(long)] + walsenders_keep_horizon: bool, } // Like PathBufValueParser, but allows empty string. @@ -295,6 +299,7 @@ async fn main() -> anyhow::Result<()> { pg_tenant_only_auth, http_auth, current_thread_runtime: args.current_thread_runtime, + walsenders_keep_horizon: args.walsenders_keep_horizon, }; // initialize sentry if SENTRY_DSN is provided diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 27b80fcbe8..ce4b4d7bd0 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -78,6 +78,7 @@ pub struct SafeKeeperConf { pub pg_tenant_only_auth: Option>, pub http_auth: Option>, pub current_thread_runtime: bool, + pub walsenders_keep_horizon: bool, } impl SafeKeeperConf { @@ -121,6 +122,7 @@ impl SafeKeeperConf { heartbeat_timeout: Duration::new(5, 0), max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES, current_thread_runtime: false, + walsenders_keep_horizon: false, } } } diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index d66db9b652..84393d8dab 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -4,7 +4,7 @@ use anyhow::{bail, Context, Result}; use byteorder::{LittleEndian, ReadBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; -use postgres_ffi::{TimeLineID, XLogSegNo, MAX_SEND_SIZE}; +use postgres_ffi::{TimeLineID, MAX_SEND_SIZE}; use serde::{Deserialize, Serialize}; use std::cmp::max; use std::cmp::min; @@ -946,28 +946,12 @@ where } Ok(()) } - - /// Get oldest segno we still need to keep. We hold WAL till it is consumed - /// by all of 1) pageserver (remote_consistent_lsn) 2) peers 3) s3 - /// offloading. - /// While it is safe to use inmem values for determining horizon, - /// we use persistent to make possible normal states less surprising. 
- pub fn get_horizon_segno(&self, wal_backup_enabled: bool) -> XLogSegNo { - let mut horizon_lsn = min( - self.state.remote_consistent_lsn, - self.state.peer_horizon_lsn, - ); - if wal_backup_enabled { - horizon_lsn = min(horizon_lsn, self.state.backup_lsn); - } - horizon_lsn.segment_number(self.state.server.wal_seg_size as usize) - } } #[cfg(test)] mod tests { use futures::future::BoxFuture; - use postgres_ffi::WAL_SEGMENT_SIZE; + use postgres_ffi::{XLogSegNo, WAL_SEGMENT_SIZE}; use super::*; use crate::{ diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index ee3e4c8ead..4b887f36b7 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -136,6 +136,21 @@ impl WalSenders { self.mutex.lock().slots.iter().flatten().cloned().collect() } + /// Get LSN of the most lagging pageserver receiver. Return None if there are no + /// active walsenders. + pub fn laggard_lsn(self: &Arc) -> Option { + self.mutex + .lock() + .slots + .iter() + .flatten() + .filter_map(|s| match s.feedback { + ReplicationFeedback::Pageserver(feedback) => Some(feedback.last_received_lsn), + ReplicationFeedback::Standby(_) => None, + }) + .min() + } + /// Get aggregated pageserver feedback. pub fn get_ps_feedback(self: &Arc) -> PageserverFeedback { self.mutex.lock().agg_ps_feedback diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 730a80a583..9b7ab14218 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -286,6 +286,29 @@ impl SharedState { .cloned() .collect() } + + /// Get oldest segno we still need to keep. We hold WAL till it is consumed + /// by all of 1) pageserver (remote_consistent_lsn) 2) peers 3) s3 + /// offloading. + /// While it is safe to use inmem values for determining horizon, + /// we use persistent to make possible normal states less surprising. + fn get_horizon_segno( + &self, + wal_backup_enabled: bool, + extra_horizon_lsn: Option, + ) -> XLogSegNo { + let state = &self.sk.state; + + use std::cmp::min; + let mut horizon_lsn = min(state.remote_consistent_lsn, state.peer_horizon_lsn); + if wal_backup_enabled { + horizon_lsn = min(horizon_lsn, state.backup_lsn); + } + if let Some(extra_horizon_lsn) = extra_horizon_lsn { + horizon_lsn = min(horizon_lsn, extra_horizon_lsn); + } + horizon_lsn.segment_number(state.server.wal_seg_size as usize) + } } #[derive(Debug, thiserror::Error)] @@ -353,6 +376,12 @@ pub struct Timeline { /// Directory where timeline state is stored. pub timeline_dir: Utf8PathBuf, + + /// Should we keep WAL on disk for active replication connections. + /// Especially useful for sharding, when different shards process WAL + /// with different speed. + // TODO: add `Arc` here instead of adding each field separately. + walsenders_keep_horizon: bool, } impl Timeline { @@ -386,6 +415,7 @@ impl Timeline { cancellation_rx, cancellation_tx, timeline_dir: conf.timeline_dir(&ttid), + walsenders_keep_horizon: conf.walsenders_keep_horizon, }) } @@ -418,6 +448,7 @@ impl Timeline { cancellation_rx, cancellation_tx, timeline_dir: conf.timeline_dir(&ttid), + walsenders_keep_horizon: conf.walsenders_keep_horizon, }) } @@ -817,10 +848,20 @@ impl Timeline { bail!(TimelineError::Cancelled(self.ttid)); } + // If enabled, we use LSN of the most lagging walsender as a WAL removal horizon. + // This allows to get better read speed for pageservers that are lagging behind, + // at the cost of keeping more WAL on disk. 
+ let replication_horizon_lsn = if self.walsenders_keep_horizon { + self.walsenders.laggard_lsn() + } else { + None + }; + let horizon_segno: XLogSegNo; let remover = { let shared_state = self.write_shared_state().await; - horizon_segno = shared_state.sk.get_horizon_segno(wal_backup_enabled); + horizon_segno = + shared_state.get_horizon_segno(wal_backup_enabled, replication_horizon_lsn); if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno { return Ok(()); // nothing to do } diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index 1945b9d0cb..e3aaf5d391 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -175,6 +175,7 @@ pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { pg_tenant_only_auth: None, http_auth: None, current_thread_runtime: false, + walsenders_keep_horizon: false, }; let mut global = GlobalMap::new(disk, conf.clone())?; From 76b92e33893d565409d671ce34313ae08d1ced1d Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 12 Feb 2024 08:33:37 -0600 Subject: [PATCH 0242/1571] Fix multithreaded postmaster on macOS curl_global_init() with an IPv6 enabled curl build on macOS will cause the calling program to become multithreaded. Unfortunately for shared_preload_libraries, that means the postmaster becomes multithreaded, which CANNOT happen. There are checks in Postgres to make sure that this is not the case. --- pgxn/neon/control_plane_connector.c | 96 +++++++++++++++-------------- pgxn/neon/extension_server.c | 46 +++++++------- pgxn/neon/neon_utils.c | 50 ++++++++++++++- pgxn/neon/neon_utils.h | 12 ++++ 4 files changed, 134 insertions(+), 70 deletions(-) diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c index f6f006cba4..00a582d718 100644 --- a/pgxn/neon/control_plane_connector.c +++ b/pgxn/neon/control_plane_connector.c @@ -35,16 +35,16 @@ #include "utils/memutils.h" #include "utils/jsonb.h" +#include "neon_utils.h" + static ProcessUtility_hook_type PreviousProcessUtilityHook = NULL; +static const char *jwt_token = NULL; + /* GUCs */ static char *ConsoleURL = NULL; static bool ForwardDDL = true; -/* Curl structures for sending the HTTP requests */ -static CURL *CurlHandle; -static struct curl_slist *ContentHeader = NULL; - /* * CURL docs say that this buffer must exist until we call curl_easy_cleanup * (which we never do), so we make this a static @@ -226,6 +226,8 @@ ErrorWriteCallback(char *ptr, size_t size, size_t nmemb, void *userdata) static void SendDeltasToControlPlane() { + static CURL *handle = NULL; + if (!RootTable.db_table && !RootTable.role_table) return; if (!ConsoleURL) @@ -236,29 +238,57 @@ SendDeltasToControlPlane() if (!ForwardDDL) return; - char *message = ConstructDeltaMessage(); - ErrorString str = {}; + if (handle == NULL) + { + struct curl_slist *headers = NULL; - curl_easy_setopt(CurlHandle, CURLOPT_CUSTOMREQUEST, "PATCH"); - curl_easy_setopt(CurlHandle, CURLOPT_HTTPHEADER, ContentHeader); - curl_easy_setopt(CurlHandle, CURLOPT_POSTFIELDS, message); - curl_easy_setopt(CurlHandle, CURLOPT_URL, ConsoleURL); - curl_easy_setopt(CurlHandle, CURLOPT_ERRORBUFFER, CurlErrorBuf); - curl_easy_setopt(CurlHandle, CURLOPT_TIMEOUT, 3L /* seconds */ ); - curl_easy_setopt(CurlHandle, CURLOPT_WRITEDATA, &str); - curl_easy_setopt(CurlHandle, CURLOPT_WRITEFUNCTION, ErrorWriteCallback); + headers = curl_slist_append(headers, "Content-Type: application/json"); + if (headers == NULL) + { + 
elog(ERROR, "Failed to set Content-Type header"); + } + + if (jwt_token) + { + char auth_header[8192]; + + snprintf(auth_header, sizeof(auth_header), "Authorization: Bearer %s", jwt_token); + headers = curl_slist_append(headers, auth_header); + if (headers == NULL) + { + elog(ERROR, "Failed to set Authorization header"); + } + } + + handle = alloc_curl_handle(); + + curl_easy_setopt(handle, CURLOPT_CUSTOMREQUEST, "PATCH"); + curl_easy_setopt(handle, CURLOPT_HTTPHEADER, headers); + curl_easy_setopt(handle, CURLOPT_URL, ConsoleURL); + curl_easy_setopt(handle, CURLOPT_ERRORBUFFER, CurlErrorBuf); + curl_easy_setopt(handle, CURLOPT_TIMEOUT, 3L /* seconds */ ); + curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, ErrorWriteCallback); + } + + char *message = ConstructDeltaMessage(); + ErrorString str; + + str.size = 0; + + curl_easy_setopt(handle, CURLOPT_POSTFIELDS, message); + curl_easy_setopt(handle, CURLOPT_WRITEDATA, &str); const int num_retries = 5; - int curl_status; + CURLcode curl_status; for (int i = 0; i < num_retries; i++) { - if ((curl_status = curl_easy_perform(CurlHandle)) == 0) + if ((curl_status = curl_easy_perform(handle)) == 0) break; elog(LOG, "Curl request failed on attempt %d: %s", i, CurlErrorBuf); pg_usleep(1000 * 1000); } - if (curl_status != 0) + if (curl_status != CURLE_OK) { elog(ERROR, "Failed to perform curl request: %s", CurlErrorBuf); } @@ -266,13 +296,11 @@ SendDeltasToControlPlane() { long response_code; - if (curl_easy_getinfo(CurlHandle, CURLINFO_RESPONSE_CODE, &response_code) != CURLE_UNKNOWN_OPTION) + if (curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &response_code) != CURLE_UNKNOWN_OPTION) { - bool error_exists = str.size != 0; - if (response_code != 200) { - if (error_exists) + if (str.size != 0) { elog(ERROR, "Received HTTP code %ld from control plane: %s", @@ -835,34 +863,10 @@ InitControlPlaneConnector() NULL, NULL); - const char *jwt_token = getenv("NEON_CONTROL_PLANE_TOKEN"); - + jwt_token = getenv("NEON_CONTROL_PLANE_TOKEN"); if (!jwt_token) { elog(LOG, "Missing NEON_CONTROL_PLANE_TOKEN environment variable, forwarding will not be authenticated"); } - if (curl_global_init(CURL_GLOBAL_DEFAULT)) - { - elog(ERROR, "Failed to initialize curl"); - } - if ((CurlHandle = curl_easy_init()) == NULL) - { - elog(ERROR, "Failed to initialize curl handle"); - } - if ((ContentHeader = curl_slist_append(ContentHeader, "Content-Type: application/json")) == NULL) - { - elog(ERROR, "Failed to initialize content header"); - } - - if (jwt_token) - { - char auth_header[8192]; - - snprintf(auth_header, sizeof(auth_header), "Authorization: Bearer %s", jwt_token); - if ((ContentHeader = curl_slist_append(ContentHeader, auth_header)) == NULL) - { - elog(ERROR, "Failed to initialize authorization header"); - } - } } diff --git a/pgxn/neon/extension_server.c b/pgxn/neon/extension_server.c index d9a75142f1..039405e2cd 100644 --- a/pgxn/neon/extension_server.c +++ b/pgxn/neon/extension_server.c @@ -14,6 +14,8 @@ #include "utils/guc.h" +#include "neon_utils.h" + static int extension_server_port = 0; static download_extension_file_hook_type prev_download_extension_file_hook = NULL; @@ -31,15 +33,19 @@ static download_extension_file_hook_type prev_download_extension_file_hook = NUL static bool neon_download_extension_file_http(const char *filename, bool is_library) { - CURL *curl; + static CURL *handle = NULL; + CURLcode res; char *compute_ctl_url; char *postdata; bool ret = false; - if ((curl = curl_easy_init()) == NULL) + if (handle == NULL) { - elog(ERROR, "Failed to initialize 
curl handle"); + handle = alloc_curl_handle(); + + curl_easy_setopt(handle, CURLOPT_CUSTOMREQUEST, "POST"); + curl_easy_setopt(handle, CURLOPT_TIMEOUT, 3L /* seconds */ ); } compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s", @@ -47,28 +53,22 @@ neon_download_extension_file_http(const char *filename, bool is_library) elog(LOG, "Sending request to compute_ctl: %s", compute_ctl_url); - curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, "POST"); - curl_easy_setopt(curl, CURLOPT_URL, compute_ctl_url); - curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3L /* seconds */ ); + curl_easy_setopt(handle, CURLOPT_URL, compute_ctl_url); - if (curl) + /* Perform the request, res will get the return code */ + res = curl_easy_perform(handle); + /* Check for errors */ + if (res == CURLE_OK) { - /* Perform the request, res will get the return code */ - res = curl_easy_perform(curl); - /* Check for errors */ - if (res == CURLE_OK) - { - ret = true; - } - else - { - /* Don't error here because postgres will try to find the file */ - /* and will fail with some proper error message if it's not found. */ - elog(WARNING, "neon_download_extension_file_http failed: %s\n", curl_easy_strerror(res)); - } - - /* always cleanup */ - curl_easy_cleanup(curl); + ret = true; + } + else + { + /* + * Don't error here because postgres will try to find the file and will + * fail with some proper error message if it's not found. + */ + elog(WARNING, "neon_download_extension_file_http failed: %s\n", curl_easy_strerror(res)); } return ret; diff --git a/pgxn/neon/neon_utils.c b/pgxn/neon/neon_utils.c index 9135847aaf..ce554c89df 100644 --- a/pgxn/neon/neon_utils.c +++ b/pgxn/neon/neon_utils.c @@ -1,6 +1,9 @@ - #include +#ifndef WALPROPOSER_LIB +#include +#endif + #include "postgres.h" #include "lib/stringinfo.h" @@ -114,3 +117,48 @@ disable_core_dump() fprintf(stderr, "WARNING: disable cores setrlimit failed: %s", strerror(save_errno)); } } + +#ifndef WALPROPOSER_LIB + +/* + * On macOS with a libcurl that has IPv6 support, curl_global_init() calls + * SCDynamicStoreCopyProxies(), which makes the program multithreaded. An ideal + * place to call curl_global_init() would be _PG_init(), but Neon has to be + * added to shared_preload_libraries, which are loaded in the Postmaster + * process. The Postmaster is not supposed to become multithreaded at any point + * in its lifecycle. Postgres doesn't have any good hook that I know of to + * initialize per-backend structures, so we have to check this on any + * allocation of a CURL handle. + * + * Free the allocated CURL handle with curl_easy_cleanup(3). 
+ * + * https://developer.apple.com/documentation/systemconfiguration/1517088-scdynamicstorecopyproxies + */ +CURL * +alloc_curl_handle(void) +{ + static bool curl_initialized = false; + + CURL *handle; + + if (unlikely(!curl_initialized)) + { + /* Protected by mutex internally */ + if (curl_global_init(CURL_GLOBAL_DEFAULT)) + { + elog(ERROR, "Failed to initialize curl"); + } + + curl_initialized = true; + } + + handle = curl_easy_init(); + if (handle == NULL) + { + elog(ERROR, "Failed to initialize curl handle"); + } + + return handle; +} + +#endif diff --git a/pgxn/neon/neon_utils.h b/pgxn/neon/neon_utils.h index a86f1e061c..10d41db102 100644 --- a/pgxn/neon/neon_utils.h +++ b/pgxn/neon/neon_utils.h @@ -1,6 +1,12 @@ #ifndef __NEON_UTILS_H__ #define __NEON_UTILS_H__ +#include "lib/stringinfo.h" + +#ifndef WALPROPOSER_LIB +#include +#endif + bool HexDecodeString(uint8 *result, char *input, int nbytes); uint32 pq_getmsgint32_le(StringInfo msg); uint64 pq_getmsgint64_le(StringInfo msg); @@ -8,4 +14,10 @@ void pq_sendint32_le(StringInfo buf, uint32 i); void pq_sendint64_le(StringInfo buf, uint64 i); extern void disable_core_dump(); +#ifndef WALPROPOSER_LIB + +CURL * alloc_curl_handle(void); + +#endif + #endif /* __NEON_UTILS_H__ */ From f2767d20564d09e7afa933e5538143f7b5d78d64 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 21 Feb 2024 19:32:12 +0000 Subject: [PATCH 0243/1571] CI: run check-permissions before all jobs (#6794) ## Problem For PRs from external contributors, we're still running `actionlint` and `neon_extra_builds` workflows (which could fail due to lack of permissions to secrets). ## Summary of changes - Extract `check-permissions` job to a separate reusable workflow - Depend all jobs from `actionlint` and `neon_extra_builds` workflows on `check-permissions` --- .github/workflows/actionlint.yml | 8 +++++- .github/workflows/build_and_test.yml | 21 +++------------ .github/workflows/check-permissions.yml | 36 +++++++++++++++++++++++++ .github/workflows/neon_extra_builds.yml | 12 +++++++-- 4 files changed, 56 insertions(+), 21 deletions(-) create mode 100644 .github/workflows/check-permissions.yml diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml index c290ff88e2..f2736614bf 100644 --- a/.github/workflows/actionlint.yml +++ b/.github/workflows/actionlint.yml @@ -16,8 +16,14 @@ concurrency: cancel-in-progress: ${{ github.event_name == 'pull_request' }} jobs: - actionlint: + check-permissions: if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }} + uses: ./.github/workflows/check-permissions.yml + with: + github-event-name: ${{ github.event_name}} + + actionlint: + needs: [ check-permissions ] runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 1744616888..5a807aa9fd 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -27,24 +27,9 @@ env: jobs: check-permissions: if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }} - runs-on: ubuntu-latest - steps: - - name: Disallow PRs from forks - if: | - github.event_name == 'pull_request' && - github.event.pull_request.head.repo.full_name != github.repository - - run: | - if [ "${{ contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association) }}" = "true" ]; then - MESSAGE="Please create a PR from a branch of ${GITHUB_REPOSITORY} instead of a fork" - else - MESSAGE="The PR 
should be reviewed and labelled with 'approved-for-ci-run' to trigger a CI run" - fi - - echo >&2 "We don't run CI for PRs from forks" - echo >&2 "${MESSAGE}" - - exit 1 + uses: ./.github/workflows/check-permissions.yml + with: + github-event-name: ${{ github.event_name}} cancel-previous-e2e-tests: needs: [ check-permissions ] diff --git a/.github/workflows/check-permissions.yml b/.github/workflows/check-permissions.yml new file mode 100644 index 0000000000..c3357c6cf8 --- /dev/null +++ b/.github/workflows/check-permissions.yml @@ -0,0 +1,36 @@ +name: Check Permissions + +on: + workflow_call: + inputs: + github-event-name: + required: true + type: string + +defaults: + run: + shell: bash -euo pipefail {0} + +# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. +permissions: {} + +jobs: + check-permissions: + runs-on: ubuntu-latest + steps: + - name: Disallow CI runs on PRs from forks + if: | + inputs.github-event-name == 'pull_request' && + github.event.pull_request.head.repo.full_name != github.repository + run: | + if [ "${{ contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association) }}" = "true" ]; then + MESSAGE="Please create a PR from a branch of ${GITHUB_REPOSITORY} instead of a fork" + else + MESSAGE="The PR should be reviewed and labelled with 'approved-for-ci-run' to trigger a CI run" + fi + + # TODO: use actions/github-script to post this message as a PR comment + echo >&2 "We don't run CI for PRs from forks" + echo >&2 "${MESSAGE}" + + exit 1 diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 5c2f202b6b..1c9763cc00 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -20,7 +20,14 @@ env: COPT: '-Werror' jobs: + check-permissions: + if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }} + uses: ./.github/workflows/check-permissions.yml + with: + github-event-name: ${{ github.event_name}} + check-macos-build: + needs: [ check-permissions ] if: | contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') || contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || @@ -116,8 +123,8 @@ jobs: run: ./run_clippy.sh check-linux-arm-build: + needs: [ check-permissions ] timeout-minutes: 90 - if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }} runs-on: [ self-hosted, dev, arm64 ] env: @@ -237,8 +244,8 @@ jobs: cargo nextest run --package remote_storage --test test_real_azure check-codestyle-rust-arm: + needs: [ check-permissions ] timeout-minutes: 90 - if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }} runs-on: [ self-hosted, dev, arm64 ] container: @@ -309,6 +316,7 @@ jobs: run: cargo deny check gather-rust-build-stats: + needs: [ check-permissions ] if: | contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') || contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || From 20fff0569987229939421d68ba0003c4824948a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 21 Feb 2024 20:39:14 +0100 Subject: [PATCH 0244/1571] Remove stray del and TODO (#6867) The TODO has made it into #6821. I originally just put it there for bookmarking purposes. The `del` has been added by #6818 but is also redundant. 
--- control_plane/src/bin/neon_local.rs | 2 +- test_runner/regress/test_ondemand_download.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index f824003d01..5c0d008943 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -616,7 +616,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local let tenant_id = get_tenant_id(create_match, env)?; let new_branch_name = create_match .get_one::("branch-name") - .ok_or_else(|| anyhow!("No branch name provided"))?; // TODO + .ok_or_else(|| anyhow!("No branch name provided"))?; let pg_version = create_match .get_one::("pg-version") diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index caa52cbbfe..8bbf50373e 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -211,7 +211,6 @@ def test_ondemand_download_timetravel(neon_env_builder: NeonEnvBuilder): wait_for_upload(client, tenant_id, timeline_id, current_lsn) wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, timeline_id) client.deletion_queue_flush(execute=True) - del current_lsn env.pageserver.stop() env.pageserver.start() # We've shut down the SKs, then restarted the PSes to sever all walreceiver connections; From 6921577cec639250a165993b0596d12335595922 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 21 Feb 2024 16:09:34 -0500 Subject: [PATCH 0245/1571] compute_ctl: grant default privileges on table to `neon_superuser` (#6845) ## Problem fix https://github.com/neondatabase/neon/issues/6236 again ## Summary of changes This pull request adds a setup command in compute spec to modify default privileges of public schema to have full permission on table/sequence for neon_superuser. If an extension upgrades to superuser during creation, the tables/sequences they create in the public schema will be automatically granted to neon_superuser. Questions: * does it impose any security flaws? public schema should be fine... * for all extensions that create tables in schemas other than public, we will need to manually handle them (e.g., pg_anon). * we can modify some extensions to remove their superuser requirement in the future. * we may contribute to Postgres to allow for the creation of extensions with a specific user in the future. --------- Signed-off-by: Alex Chi Z --- compute_tools/src/spec.rs | 3 +++ test_runner/regress/test_migrations.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 9c731f257c..27d95c30e7 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -777,6 +777,9 @@ BEGIN END $$;"#, "GRANT pg_monitor TO neon_superuser WITH ADMIN OPTION", + // ensure tables created by superusers (i.e., when creating extensions) can be used by neon_superuser. 
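+            // (Per the PR description above: tables/sequences that an extension creates in
+            // `public` while running as this superuser become usable by `neon_superuser`
+            // without an explicit GRANT.)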
+ "ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser", + "ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser", ]; let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration"; diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py index 7cc3024ec6..997297a5cd 100644 --- a/test_runner/regress/test_migrations.py +++ b/test_runner/regress/test_migrations.py @@ -15,7 +15,7 @@ def test_migrations(neon_simple_env: NeonEnv): endpoint.wait_for_migrations() - num_migrations = 4 + num_migrations = 6 with endpoint.cursor() as cur: cur.execute("SELECT id FROM neon_migration.migration_id") From 555ee9fdd0b11216cfbca9bdb92b8df96b55728c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 21 Feb 2024 21:41:51 +0000 Subject: [PATCH 0246/1571] build(deps): bump cryptography from 42.0.2 to 42.0.4 (#6870) --- poetry.lock | 77 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 44 insertions(+), 33 deletions(-) diff --git a/poetry.lock b/poetry.lock index 347f0a16a7..832d7c4334 100644 --- a/poetry.lock +++ b/poetry.lock @@ -858,43 +858,43 @@ files = [ [[package]] name = "cryptography" -version = "42.0.2" +version = "42.0.4" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." optional = false python-versions = ">=3.7" files = [ - {file = "cryptography-42.0.2-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:701171f825dcab90969596ce2af253143b93b08f1a716d4b2a9d2db5084ef7be"}, - {file = "cryptography-42.0.2-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:61321672b3ac7aade25c40449ccedbc6db72c7f5f0fdf34def5e2f8b51ca530d"}, - {file = "cryptography-42.0.2-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea2c3ffb662fec8bbbfce5602e2c159ff097a4631d96235fcf0fb00e59e3ece4"}, - {file = "cryptography-42.0.2-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b15c678f27d66d247132cbf13df2f75255627bcc9b6a570f7d2fd08e8c081d2"}, - {file = "cryptography-42.0.2-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:8e88bb9eafbf6a4014d55fb222e7360eef53e613215085e65a13290577394529"}, - {file = "cryptography-42.0.2-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:a047682d324ba56e61b7ea7c7299d51e61fd3bca7dad2ccc39b72bd0118d60a1"}, - {file = "cryptography-42.0.2-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:36d4b7c4be6411f58f60d9ce555a73df8406d484ba12a63549c88bd64f7967f1"}, - {file = "cryptography-42.0.2-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:a00aee5d1b6c20620161984f8ab2ab69134466c51f58c052c11b076715e72929"}, - {file = "cryptography-42.0.2-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:b97fe7d7991c25e6a31e5d5e795986b18fbbb3107b873d5f3ae6dc9a103278e9"}, - {file = "cryptography-42.0.2-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:5fa82a26f92871eca593b53359c12ad7949772462f887c35edaf36f87953c0e2"}, - {file = "cryptography-42.0.2-cp37-abi3-win32.whl", hash = "sha256:4b063d3413f853e056161eb0c7724822a9740ad3caa24b8424d776cebf98e7ee"}, - {file = "cryptography-42.0.2-cp37-abi3-win_amd64.whl", hash = "sha256:841ec8af7a8491ac76ec5a9522226e287187a3107e12b7d686ad354bb78facee"}, - {file = "cryptography-42.0.2-cp39-abi3-macosx_10_12_universal2.whl", hash = "sha256:55d1580e2d7e17f45d19d3b12098e352f3a37fe86d380bf45846ef257054b242"}, - {file = "cryptography-42.0.2-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", 
hash = "sha256:28cb2c41f131a5758d6ba6a0504150d644054fd9f3203a1e8e8d7ac3aea7f73a"}, - {file = "cryptography-42.0.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b9097a208875fc7bbeb1286d0125d90bdfed961f61f214d3f5be62cd4ed8a446"}, - {file = "cryptography-42.0.2-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:44c95c0e96b3cb628e8452ec060413a49002a247b2b9938989e23a2c8291fc90"}, - {file = "cryptography-42.0.2-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:2f9f14185962e6a04ab32d1abe34eae8a9001569ee4edb64d2304bf0d65c53f3"}, - {file = "cryptography-42.0.2-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:09a77e5b2e8ca732a19a90c5bca2d124621a1edb5438c5daa2d2738bfeb02589"}, - {file = "cryptography-42.0.2-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:ad28cff53f60d99a928dfcf1e861e0b2ceb2bc1f08a074fdd601b314e1cc9e0a"}, - {file = "cryptography-42.0.2-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:130c0f77022b2b9c99d8cebcdd834d81705f61c68e91ddd614ce74c657f8b3ea"}, - {file = "cryptography-42.0.2-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:fa3dec4ba8fb6e662770b74f62f1a0c7d4e37e25b58b2bf2c1be4c95372b4a33"}, - {file = "cryptography-42.0.2-cp39-abi3-win32.whl", hash = "sha256:3dbd37e14ce795b4af61b89b037d4bc157f2cb23e676fa16932185a04dfbf635"}, - {file = "cryptography-42.0.2-cp39-abi3-win_amd64.whl", hash = "sha256:8a06641fb07d4e8f6c7dda4fc3f8871d327803ab6542e33831c7ccfdcb4d0ad6"}, - {file = "cryptography-42.0.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:087887e55e0b9c8724cf05361357875adb5c20dec27e5816b653492980d20380"}, - {file = "cryptography-42.0.2-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:a7ef8dd0bf2e1d0a27042b231a3baac6883cdd5557036f5e8df7139255feaac6"}, - {file = "cryptography-42.0.2-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4383b47f45b14459cab66048d384614019965ba6c1a1a141f11b5a551cace1b2"}, - {file = "cryptography-42.0.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:fbeb725c9dc799a574518109336acccaf1303c30d45c075c665c0793c2f79a7f"}, - {file = "cryptography-42.0.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:320948ab49883557a256eab46149df79435a22d2fefd6a66fe6946f1b9d9d008"}, - {file = "cryptography-42.0.2-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:5ef9bc3d046ce83c4bbf4c25e1e0547b9c441c01d30922d812e887dc5f125c12"}, - {file = "cryptography-42.0.2-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:52ed9ebf8ac602385126c9a2fe951db36f2cb0c2538d22971487f89d0de4065a"}, - {file = "cryptography-42.0.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:141e2aa5ba100d3788c0ad7919b288f89d1fe015878b9659b307c9ef867d3a65"}, - {file = "cryptography-42.0.2.tar.gz", hash = "sha256:e0ec52ba3c7f1b7d813cd52649a5b3ef1fc0d433219dc8c93827c57eab6cf888"}, + {file = "cryptography-42.0.4-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:ffc73996c4fca3d2b6c1c8c12bfd3ad00def8621da24f547626bf06441400449"}, + {file = "cryptography-42.0.4-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:db4b65b02f59035037fde0998974d84244a64c3265bdef32a827ab9b63d61b18"}, + {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dad9c385ba8ee025bb0d856714f71d7840020fe176ae0229de618f14dae7a6e2"}, + {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:69b22ab6506a3fe483d67d1ed878e1602bdd5912a134e6202c1ec672233241c1"}, + {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_28_aarch64.whl", hash = 
"sha256:e09469a2cec88fb7b078e16d4adec594414397e8879a4341c6ace96013463d5b"}, + {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:3e970a2119507d0b104f0a8e281521ad28fc26f2820687b3436b8c9a5fcf20d1"}, + {file = "cryptography-42.0.4-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:e53dc41cda40b248ebc40b83b31516487f7db95ab8ceac1f042626bc43a2f992"}, + {file = "cryptography-42.0.4-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:c3a5cbc620e1e17009f30dd34cb0d85c987afd21c41a74352d1719be33380885"}, + {file = "cryptography-42.0.4-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:6bfadd884e7280df24d26f2186e4e07556a05d37393b0f220a840b083dc6a824"}, + {file = "cryptography-42.0.4-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:01911714117642a3f1792c7f376db572aadadbafcd8d75bb527166009c9f1d1b"}, + {file = "cryptography-42.0.4-cp37-abi3-win32.whl", hash = "sha256:fb0cef872d8193e487fc6bdb08559c3aa41b659a7d9be48b2e10747f47863925"}, + {file = "cryptography-42.0.4-cp37-abi3-win_amd64.whl", hash = "sha256:c1f25b252d2c87088abc8bbc4f1ecbf7c919e05508a7e8628e6875c40bc70923"}, + {file = "cryptography-42.0.4-cp39-abi3-macosx_10_12_universal2.whl", hash = "sha256:15a1fb843c48b4a604663fa30af60818cd28f895572386e5f9b8a665874c26e7"}, + {file = "cryptography-42.0.4-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1327f280c824ff7885bdeef8578f74690e9079267c1c8bd7dc5cc5aa065ae52"}, + {file = "cryptography-42.0.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ffb03d419edcab93b4b19c22ee80c007fb2d708429cecebf1dd3258956a563a"}, + {file = "cryptography-42.0.4-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:1df6fcbf60560d2113b5ed90f072dc0b108d64750d4cbd46a21ec882c7aefce9"}, + {file = "cryptography-42.0.4-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:44a64043f743485925d3bcac548d05df0f9bb445c5fcca6681889c7c3ab12764"}, + {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:3c6048f217533d89f2f8f4f0fe3044bf0b2090453b7b73d0b77db47b80af8dff"}, + {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:6d0fbe73728c44ca3a241eff9aefe6496ab2656d6e7a4ea2459865f2e8613257"}, + {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:887623fe0d70f48ab3f5e4dbf234986b1329a64c066d719432d0698522749929"}, + {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:ce8613beaffc7c14f091497346ef117c1798c202b01153a8cc7b8e2ebaaf41c0"}, + {file = "cryptography-42.0.4-cp39-abi3-win32.whl", hash = "sha256:810bcf151caefc03e51a3d61e53335cd5c7316c0a105cc695f0959f2c638b129"}, + {file = "cryptography-42.0.4-cp39-abi3-win_amd64.whl", hash = "sha256:a0298bdc6e98ca21382afe914c642620370ce0470a01e1bef6dd9b5354c36854"}, + {file = "cryptography-42.0.4-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5f8907fcf57392cd917892ae83708761c6ff3c37a8e835d7246ff0ad251d9298"}, + {file = "cryptography-42.0.4-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:12d341bd42cdb7d4937b0cabbdf2a94f949413ac4504904d0cdbdce4a22cbf88"}, + {file = "cryptography-42.0.4-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:1cdcdbd117681c88d717437ada72bdd5be9de117f96e3f4d50dab3f59fd9ab20"}, + {file = "cryptography-42.0.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:0e89f7b84f421c56e7ff69f11c441ebda73b8a8e6488d322ef71746224c20fce"}, + {file = "cryptography-42.0.4-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = 
"sha256:f1e85a178384bf19e36779d91ff35c7617c885da487d689b05c1366f9933ad74"}, + {file = "cryptography-42.0.4-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:d2a27aca5597c8a71abbe10209184e1a8e91c1fd470b5070a2ea60cafec35bcd"}, + {file = "cryptography-42.0.4-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4e36685cb634af55e0677d435d425043967ac2f3790ec652b2b88ad03b85c27b"}, + {file = "cryptography-42.0.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:f47be41843200f7faec0683ad751e5ef11b9a56a220d57f300376cd8aba81660"}, + {file = "cryptography-42.0.4.tar.gz", hash = "sha256:831a4b37accef30cccd34fcb916a5d7b5be3cbbe27268a02832c3e450aea39cb"}, ] [package.dependencies] @@ -2182,6 +2182,7 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -2571,6 +2572,16 @@ files = [ {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"}, {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"}, {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"}, + {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"}, + {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"}, + {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = 
"sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"}, + {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"}, From 8107ae837797f1781d37bfc552a1f2069faf6c20 Mon Sep 17 00:00:00 2001 From: Joe Drumgoole Date: Wed, 21 Feb 2024 22:42:24 +0000 Subject: [PATCH 0247/1571] README: Fix the link to the free tier request (#6858) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index fedb787ac2..1c4f32d286 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ Neon is a serverless open-source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes the PostgreSQL storage layer by redistributing data across a cluster of nodes. ## Quick start -Try the [Neon Free Tier](https://neon.tech/docs/introduction/technical-preview-free-tier/) to create a serverless Postgres instance. Then connect to it with your preferred Postgres client (psql, dbeaver, etc) or use the online [SQL Editor](https://neon.tech/docs/get-started-with-neon/query-with-neon-sql-editor/). See [Connect from any application](https://neon.tech/docs/connect/connect-from-any-app/) for connection instructions. +Try the [Neon Free Tier](https://neon.tech) to create a serverless Postgres instance. Then connect to it with your preferred Postgres client (psql, dbeaver, etc) or use the online [SQL Editor](https://neon.tech/docs/get-started-with-neon/query-with-neon-sql-editor/). See [Connect from any application](https://neon.tech/docs/connect/connect-from-any-app/) for connection instructions. Alternatively, compile and run the project [locally](#running-local-installation). From 1718c0b59befddb84ebb9565d1ce7cc7cede804a Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Wed, 21 Feb 2024 23:43:55 +0100 Subject: [PATCH 0248/1571] Proxy: cancel query on connection drop (#6832) ## Problem https://github.com/neondatabase/cloud/issues/10259 ## Summary of changes Make sure that the request is dropped once the connection was dropped. --- proxy/src/cancellation.rs | 5 +- proxy/src/proxy/copy_bidirectional.rs | 100 +++++++++++++++----------- proxy/src/proxy/passthrough.rs | 10 ++- 3 files changed, 69 insertions(+), 46 deletions(-) diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 93a77bc4ae..c9607909b3 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -168,12 +168,11 @@ impl CancelClosure { cancel_token, } } - /// Cancels the query running on user's compute node. 
- async fn try_cancel_query(self) -> Result<(), CancelError> { + pub async fn try_cancel_query(self) -> Result<(), CancelError> { let socket = TcpStream::connect(self.socket_addr).await?; self.cancel_token.cancel_query_raw(socket, NoTls).await?; - + info!("query was cancelled"); Ok(()) } } diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs index 2ecc1151da..684be74f9a 100644 --- a/proxy/src/proxy/copy_bidirectional.rs +++ b/proxy/src/proxy/copy_bidirectional.rs @@ -1,4 +1,5 @@ use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; +use tracing::info; use std::future::poll_fn; use std::io; @@ -39,42 +40,51 @@ where } } -pub(super) async fn copy_bidirectional( - a: &mut A, - b: &mut B, +#[tracing::instrument(skip_all)] +pub(super) async fn copy_bidirectional_client_compute( + client: &mut Client, + compute: &mut Compute, ) -> Result<(u64, u64), std::io::Error> where - A: AsyncRead + AsyncWrite + Unpin + ?Sized, - B: AsyncRead + AsyncWrite + Unpin + ?Sized, + Client: AsyncRead + AsyncWrite + Unpin + ?Sized, + Compute: AsyncRead + AsyncWrite + Unpin + ?Sized, { - let mut a_to_b = TransferState::Running(CopyBuffer::new()); - let mut b_to_a = TransferState::Running(CopyBuffer::new()); + let mut client_to_compute = TransferState::Running(CopyBuffer::new()); + let mut compute_to_client = TransferState::Running(CopyBuffer::new()); poll_fn(|cx| { - let mut a_to_b_result = transfer_one_direction(cx, &mut a_to_b, a, b)?; - let mut b_to_a_result = transfer_one_direction(cx, &mut b_to_a, b, a)?; + let mut client_to_compute_result = + transfer_one_direction(cx, &mut client_to_compute, client, compute)?; + let mut compute_to_client_result = + transfer_one_direction(cx, &mut compute_to_client, compute, client)?; - // Early termination checks - if let TransferState::Done(_) = a_to_b { - if let TransferState::Running(buf) = &b_to_a { + // Early termination checks from compute to client. + if let TransferState::Done(_) = compute_to_client { + if let TransferState::Running(buf) = &client_to_compute { + info!("Compute is done, terminate client"); // Initiate shutdown - b_to_a = TransferState::ShuttingDown(buf.amt); - b_to_a_result = transfer_one_direction(cx, &mut b_to_a, b, a)?; + client_to_compute = TransferState::ShuttingDown(buf.amt); + client_to_compute_result = + transfer_one_direction(cx, &mut client_to_compute, client, compute)?; } } - if let TransferState::Done(_) = b_to_a { - if let TransferState::Running(buf) = &a_to_b { + + // Early termination checks from compute to client. + if let TransferState::Done(_) = client_to_compute { + if let TransferState::Running(buf) = &compute_to_client { + info!("Client is done, terminate compute"); // Initiate shutdown - a_to_b = TransferState::ShuttingDown(buf.amt); - a_to_b_result = transfer_one_direction(cx, &mut a_to_b, a, b)?; + compute_to_client = TransferState::ShuttingDown(buf.amt); + compute_to_client_result = + transfer_one_direction(cx, &mut compute_to_client, client, compute)?; } } // It is not a problem if ready! returns early ... 
(comment remains the same) - let a_to_b = ready!(a_to_b_result); - let b_to_a = ready!(b_to_a_result); + let client_to_compute = ready!(client_to_compute_result); + let compute_to_client = ready!(compute_to_client_result); - Poll::Ready(Ok((a_to_b, b_to_a))) + Poll::Ready(Ok((client_to_compute, compute_to_client))) }) .await } @@ -219,38 +229,46 @@ mod tests { use tokio::io::AsyncWriteExt; #[tokio::test] - async fn test_early_termination_a_to_d() { - let (mut a_mock, mut b_mock) = tokio::io::duplex(8); // Create a mock duplex stream - let (mut c_mock, mut d_mock) = tokio::io::duplex(32); // Create a mock duplex stream + async fn test_client_to_compute() { + let (mut client_client, mut client_proxy) = tokio::io::duplex(8); // Create a mock duplex stream + let (mut compute_proxy, mut compute_client) = tokio::io::duplex(32); // Create a mock duplex stream // Simulate 'a' finishing while there's still data for 'b' - a_mock.write_all(b"hello").await.unwrap(); - a_mock.shutdown().await.unwrap(); - d_mock.write_all(b"Neon Serverless Postgres").await.unwrap(); + client_client.write_all(b"hello").await.unwrap(); + client_client.shutdown().await.unwrap(); + compute_client.write_all(b"Neon").await.unwrap(); + compute_client.shutdown().await.unwrap(); - let result = copy_bidirectional(&mut b_mock, &mut c_mock).await.unwrap(); + let result = copy_bidirectional_client_compute(&mut client_proxy, &mut compute_proxy) + .await + .unwrap(); // Assert correct transferred amounts - let (a_to_d_count, d_to_a_count) = result; - assert_eq!(a_to_d_count, 5); // 'hello' was transferred - assert!(d_to_a_count <= 8); // response only partially transferred or not at all + let (client_to_compute_count, compute_to_client_count) = result; + assert_eq!(client_to_compute_count, 5); // 'hello' was transferred + assert_eq!(compute_to_client_count, 4); // response only partially transferred or not at all } #[tokio::test] - async fn test_early_termination_d_to_a() { - let (mut a_mock, mut b_mock) = tokio::io::duplex(32); // Create a mock duplex stream - let (mut c_mock, mut d_mock) = tokio::io::duplex(8); // Create a mock duplex stream + async fn test_compute_to_client() { + let (mut client_client, mut client_proxy) = tokio::io::duplex(32); // Create a mock duplex stream + let (mut compute_proxy, mut compute_client) = tokio::io::duplex(8); // Create a mock duplex stream // Simulate 'a' finishing while there's still data for 'b' - d_mock.write_all(b"hello").await.unwrap(); - d_mock.shutdown().await.unwrap(); - a_mock.write_all(b"Neon Serverless Postgres").await.unwrap(); + compute_client.write_all(b"hello").await.unwrap(); + compute_client.shutdown().await.unwrap(); + client_client + .write_all(b"Neon Serverless Postgres") + .await + .unwrap(); - let result = copy_bidirectional(&mut b_mock, &mut c_mock).await.unwrap(); + let result = copy_bidirectional_client_compute(&mut client_proxy, &mut compute_proxy) + .await + .unwrap(); // Assert correct transferred amounts - let (a_to_d_count, d_to_a_count) = result; - assert_eq!(d_to_a_count, 5); // 'hello' was transferred - assert!(a_to_d_count <= 8); // response only partially transferred or not at all + let (client_to_compute_count, compute_to_client_count) = result; + assert_eq!(compute_to_client_count, 5); // 'hello' was transferred + assert!(client_to_compute_count <= 8); // response only partially transferred or not at all } } diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index 73c170fc0b..b2f682fd2f 100644 --- a/proxy/src/proxy/passthrough.rs +++ 
b/proxy/src/proxy/passthrough.rs @@ -46,7 +46,11 @@ pub async fn proxy_pass( // Starting from here we only proxy the client's traffic. info!("performing the proxy pass..."); - let _ = crate::proxy::copy_bidirectional::copy_bidirectional(&mut client, &mut compute).await?; + let _ = crate::proxy::copy_bidirectional::copy_bidirectional_client_compute( + &mut client, + &mut compute, + ) + .await?; Ok(()) } @@ -63,6 +67,8 @@ pub struct ProxyPassthrough { impl ProxyPassthrough { pub async fn proxy_pass(self) -> anyhow::Result<()> { - proxy_pass(self.client, self.compute.stream, self.aux).await + let res = proxy_pass(self.client, self.compute.stream, self.aux).await; + self.compute.cancel_closure.try_cancel_query().await?; + res } } From c1095f4c52667f3818aa631c34f8d8c20b24c8ac Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 22 Feb 2024 09:32:27 +0000 Subject: [PATCH 0249/1571] pageserver: don't warn on tempfiles in secondary location (#6837) ## Problem When a secondary mode location starts up, it scans local layer files. Currently it warns on any layers whose names don't parse as a LayerFileName, generating warning spam from perfectly normal tempfiles. ## Summary of changes - Refactor local vars to build a Utf8PathBuf for the layer file candidate - Use the crate::is_temporary check to identify + clean up temp files. --------- Co-authored-by: Christian Schwarz --- pageserver/src/tenant/secondary/downloader.rs | 30 ++++++++++++++----- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 51ab421b58..88a0cb8025 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -37,6 +37,7 @@ use crate::tenant::{ remote_timeline_client::{download::download_layer_file, remote_heatmap_path}, }; +use camino::Utf8PathBuf; use chrono::format::{DelayedFormat, StrftimeItems}; use futures::Future; use pageserver_api::shard::TenantShardId; @@ -778,19 +779,32 @@ async fn init_timeline_state( .await .fatal_err(&format!("Listing {timeline_path}")) { - let dentry_file_name = dentry.file_name(); - let file_name = dentry_file_name.to_string_lossy(); - let local_meta = dentry.metadata().await.fatal_err(&format!( - "Read metadata on {}", - dentry.path().to_string_lossy() - )); + let Ok(file_path) = Utf8PathBuf::from_path_buf(dentry.path()) else { + tracing::warn!("Malformed filename at {}", dentry.path().to_string_lossy()); + continue; + }; + let local_meta = dentry + .metadata() + .await + .fatal_err(&format!("Read metadata on {}", file_path)); - // Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant. + let file_name = file_path.file_name().expect("created it from the dentry"); if file_name == METADATA_FILE_NAME { + // Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant. 
+ continue; + } else if crate::is_temporary(&file_path) { + // Temporary files are frequently left behind from restarting during downloads + tracing::info!("Cleaning up temporary file {file_path}"); + if let Err(e) = tokio::fs::remove_file(&file_path) + .await + .or_else(fs_ext::ignore_not_found) + { + tracing::error!("Failed to remove temporary file {file_path}: {e}"); + } continue; } - match LayerFileName::from_str(&file_name) { + match LayerFileName::from_str(file_name) { Ok(name) => { let remote_meta = heatmap_metadata.get(&name); match remote_meta { From b5246753bfe89221492823f74e7cdc284dcb8541 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 22 Feb 2024 09:33:40 +0000 Subject: [PATCH 0250/1571] storage controller: miscellaneous improvements (#6800) - Add some context to logs - Add tests for pageserver restarts when managed by storage controller - Make /location_config tolerate compute hook failures on shard creations, not just modifications. --- .../attachment_service/src/reconciler.rs | 4 +- .../attachment_service/src/service.rs | 67 ++++++++++++------- test_runner/fixtures/pageserver/http.py | 9 +++ test_runner/regress/test_sharding.py | 31 +++++++-- test_runner/regress/test_sharding_service.py | 14 ++++ 5 files changed, 94 insertions(+), 31 deletions(-) diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs index cdd6f76b14..751b06f93a 100644 --- a/control_plane/attachment_service/src/reconciler.rs +++ b/control_plane/attachment_service/src/reconciler.rs @@ -438,7 +438,7 @@ impl Reconciler { match self.observed.locations.get(&node_id) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => { // Nothing to do - tracing::info!("Observed configuration already correct.") + tracing::info!(%node_id, "Observed configuration already correct.") } _ => { // In all cases other than a matching observed configuration, we will @@ -449,7 +449,7 @@ impl Reconciler { .increment_generation(self.tenant_shard_id, node_id) .await?; wanted_conf.generation = self.generation.into(); - tracing::info!("Observed configuration requires update."); + tracing::info!(%node_id, "Observed configuration requires update."); self.location_config(node_id, wanted_conf, None).await?; self.compute_notify().await?; } diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index 74e1296709..6366348017 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -56,6 +56,11 @@ use crate::{ PlacementPolicy, Sequence, }; +// For operations that should be quick, like attaching a new tenant +const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5); + +// For operations that might be slow, like migrating a tenant with +// some data in it. 
const RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); /// How long [`Service::startup_reconcile`] is allowed to take before it should give @@ -479,8 +484,8 @@ impl Service { async move { if let Err(e) = compute_hook.notify(tenant_shard_id, node_id, &cancel).await { tracing::error!( - tenant_shard_id=%tenant_shard_id, - node_id=%node_id, + %tenant_shard_id, + %node_id, "Failed to notify compute on startup for shard: {e}" ); None @@ -1000,6 +1005,16 @@ impl Service { &self, create_req: TenantCreateRequest, ) -> Result { + let (response, waiters) = self.do_tenant_create(create_req).await?; + + self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await?; + Ok(response) + } + + pub(crate) async fn do_tenant_create( + &self, + create_req: TenantCreateRequest, + ) -> Result<(TenantCreateResponse, Vec), ApiError> { // This service expects to handle sharding itself: it is an error to try and directly create // a particular shard here. let tenant_id = if !create_req.new_tenant_id.is_unsharded() { @@ -1149,11 +1164,12 @@ impl Service { (waiters, response_shards) }; - self.await_waiters(waiters).await?; - - Ok(TenantCreateResponse { - shards: response_shards, - }) + Ok(( + TenantCreateResponse { + shards: response_shards, + }, + waiters, + )) } /// Helper for functions that reconcile a number of shards, and would like to do a timeout-bounded @@ -1161,8 +1177,9 @@ impl Service { async fn await_waiters( &self, waiters: Vec, + timeout: Duration, ) -> Result<(), ReconcileWaitError> { - let deadline = Instant::now().checked_add(Duration::from_secs(30)).unwrap(); + let deadline = Instant::now().checked_add(timeout).unwrap(); for waiter in waiters { let timeout = deadline.duration_since(Instant::now()); waiter.wait_timeout(timeout).await?; @@ -1300,12 +1317,8 @@ impl Service { } }; - // TODO: if we timeout/fail on reconcile, we should still succeed this request, - // because otherwise a broken compute hook causes a feedback loop where - // location_config returns 500 and gets retried forever. - - if let Some(create_req) = maybe_create { - let create_resp = self.tenant_create(create_req).await?; + let waiters = if let Some(create_req) = maybe_create { + let (create_resp, waiters) = self.do_tenant_create(create_req).await?; result.shards = create_resp .shards .into_iter() @@ -1314,19 +1327,25 @@ impl Service { shard_id: s.shard_id, }) .collect(); + waiters } else { - // This was an update, wait for reconciliation - if let Err(e) = self.await_waiters(waiters).await { - // Do not treat a reconcile error as fatal: we have already applied any requested - // Intent changes, and the reconcile can fail for external reasons like unavailable - // compute notification API. In these cases, it is important that we do not - // cause the cloud control plane to retry forever on this API. - tracing::warn!( - "Failed to reconcile after /location_config: {e}, returning success anyway" - ); - } + waiters + }; + + if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await { + // Do not treat a reconcile error as fatal: we have already applied any requested + // Intent changes, and the reconcile can fail for external reasons like unavailable + // compute notification API. In these cases, it is important that we do not + // cause the cloud control plane to retry forever on this API. 
+ tracing::warn!( + "Failed to reconcile after /location_config: {e}, returning success anyway" + ); } + // Logging the full result is useful because it lets us cross-check what the cloud control + // plane's tenant_shards table should contain. + tracing::info!("Complete, returning {result:?}"); + Ok(result) } diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 98eb89d30c..427ef00c78 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -302,6 +302,15 @@ class PageserverHttpClient(requests.Session, MetricsGetter): ) self.verbose_error(res) + def tenant_list_locations(self): + res = self.get( + f"http://localhost:{self.port}/v1/location_config", + ) + self.verbose_error(res) + res_json = res.json() + assert isinstance(res_json["tenant_shards"], list) + return res_json + def tenant_delete(self, tenant_id: Union[TenantId, TenantShardId]): res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") self.verbose_error(res) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 5413b178a5..57c8d1f849 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -235,11 +235,6 @@ def test_sharding_split_smoke( all_shards = tenant_get_shards(env, tenant_id) for tenant_shard_id, pageserver in all_shards: pageserver.http_client().timeline_gc(tenant_shard_id, timeline_id, None) - - # Restart all nodes, to check that the newly created shards are durable - for ps in env.pageservers: - ps.restart() - workload.validate() migrate_to_pageserver_ids = list( @@ -288,6 +283,32 @@ def test_sharding_split_smoke( env.attachment_service.consistency_check() + # Validate pageserver state + shards_exist: list[TenantShardId] = [] + for pageserver in env.pageservers: + locations = pageserver.http_client().tenant_list_locations() + shards_exist.extend(TenantShardId.parse(s[0]) for s in locations["tenant_shards"]) + + log.info("Shards after split: {shards_exist}") + assert len(shards_exist) == split_shard_count + + # Ensure post-split pageserver locations survive a restart (i.e. the child shards + # correctly wrote config to disk, and the storage controller responds correctly + # to /re-attach) + for pageserver in env.pageservers: + pageserver.stop() + pageserver.start() + + shards_exist = [] + for pageserver in env.pageservers: + locations = pageserver.http_client().tenant_list_locations() + shards_exist.extend(TenantShardId.parse(s[0]) for s in locations["tenant_shards"]) + + log.info("Shards after restart: {shards_exist}") + assert len(shards_exist) == split_shard_count + + workload.validate() + @pytest.mark.skipif( # The quantity of data isn't huge, but debug can be _very_ slow, and the things we're diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py index 6525f9733f..e62d239d77 100644 --- a/test_runner/regress/test_sharding_service.py +++ b/test_runner/regress/test_sharding_service.py @@ -125,6 +125,20 @@ def test_sharding_service_smoke( time.sleep(1) assert get_node_shard_counts(env, tenant_ids)[env.pageservers[0].id] == 0 + # Restarting a pageserver should not detach any tenants (i.e. 
/re-attach works) + before_restart = env.pageservers[1].http_client().tenant_list_locations() + env.pageservers[1].stop() + env.pageservers[1].start() + after_restart = env.pageservers[1].http_client().tenant_list_locations() + assert len(after_restart) == len(before_restart) + + # Locations should be the same before & after restart, apart from generations + for _shard_id, tenant in after_restart["tenant_shards"]: + del tenant["generation"] + for _shard_id, tenant in before_restart["tenant_shards"]: + del tenant["generation"] + assert before_restart == after_restart + # Delete all the tenants for tid in tenant_ids: tenant_delete_wait_completed(env.attachment_service.pageserver_api(), tid, 10) From bc7a82caf2d56b6ee6ce80ece76aeb100d276e31 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 22 Feb 2024 13:58:59 +0200 Subject: [PATCH 0251/1571] feat: bare-bones /v1/utilization (#6831) PR adds a simple at most 1Hz refreshed informational API for querying pageserver utilization. In this first phase, no actual background calculation is performed. Instead, the worst possible score is always returned. The returned bytes information is however correct. Cc: #6835 Cc: #5331 --- Cargo.lock | 1 + libs/pageserver_api/Cargo.toml | 1 + libs/pageserver_api/src/models.rs | 3 + libs/pageserver_api/src/models/utilization.rs | 70 +++++++++++++++++++ pageserver/src/http/openapi_spec.yml | 46 ++++++++++++ pageserver/src/http/routes.rs | 51 ++++++++++++++ pageserver/src/lib.rs | 1 + pageserver/src/utilization.rs | 38 ++++++++++ 8 files changed, 211 insertions(+) create mode 100644 libs/pageserver_api/src/models/utilization.rs create mode 100644 pageserver/src/utilization.rs diff --git a/Cargo.lock b/Cargo.lock index 51c433cd07..abb335e97c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3552,6 +3552,7 @@ dependencies = [ "const_format", "enum-map", "hex", + "humantime", "humantime-serde", "itertools", "postgres_ffi", diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index 938910caea..3bba89c76d 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -18,6 +18,7 @@ enum-map.workspace = true strum.workspace = true strum_macros.workspace = true hex.workspace = true +humantime.workspace = true thiserror.workspace = true humantime-serde.workspace = true chrono.workspace = true diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index b68ab9fd59..36aafe7341 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -1,4 +1,7 @@ pub mod partitioning; +pub mod utilization; + +pub use utilization::PageserverUtilization; use std::{ collections::HashMap, diff --git a/libs/pageserver_api/src/models/utilization.rs b/libs/pageserver_api/src/models/utilization.rs new file mode 100644 index 0000000000..7195a12395 --- /dev/null +++ b/libs/pageserver_api/src/models/utilization.rs @@ -0,0 +1,70 @@ +use std::time::SystemTime; + +/// Pageserver current utilization and scoring for how good candidate the pageserver would be for +/// the next tenant. +/// +/// See and maintain pageserver openapi spec for `/v1/utilization_score` as the truth. +/// +/// `format: int64` fields must use `ser_saturating_u63` because openapi generated clients might +/// not handle full u64 values properly. 
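+/// For example, a `disk_usage_bytes` of `u64::MAX` is emitted as `9223372036854775807`
+/// (`i64::MAX`), as exercised by the `u64_max_is_serialized_as_u63_max` test at the bottom
+/// of this file.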
+#[derive(serde::Serialize, Debug)] +pub struct PageserverUtilization { + /// Used disk space + #[serde(serialize_with = "ser_saturating_u63")] + pub disk_usage_bytes: u64, + /// Free disk space + #[serde(serialize_with = "ser_saturating_u63")] + pub free_space_bytes: u64, + /// Lower is better score for how good candidate for a next tenant would this pageserver be. + #[serde(serialize_with = "ser_saturating_u63")] + pub utilization_score: u64, + /// When was this snapshot captured, pageserver local time. + /// + /// Use millis to give confidence that the value is regenerated often enough. + #[serde(serialize_with = "ser_rfc3339_millis")] + pub captured_at: SystemTime, +} + +fn ser_rfc3339_millis( + ts: &SystemTime, + serializer: S, +) -> Result { + serializer.collect_str(&humantime::format_rfc3339_millis(*ts)) +} + +/// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients. +/// +/// Instead of newtype, use this because a newtype would get require handling deserializing values +/// with the highest bit set which is properly parsed by serde formats, but would create a +/// conundrum on how to handle and again serialize such values at type level. It will be a few +/// years until we can use more than `i64::MAX` bytes on a disk. +fn ser_saturating_u63(value: &u64, serializer: S) -> Result { + const MAX_FORMAT_INT64: u64 = i64::MAX as u64; + + let value = (*value).min(MAX_FORMAT_INT64); + + serializer.serialize_u64(value) +} + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use super::*; + + #[test] + fn u64_max_is_serialized_as_u63_max() { + let doc = PageserverUtilization { + disk_usage_bytes: u64::MAX, + free_space_bytes: 0, + utilization_score: u64::MAX, + captured_at: SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779), + }; + + let s = serde_json::to_string(&doc).unwrap(); + + let expected = r#"{"disk_usage_bytes":9223372036854775807,"free_space_bytes":0,"utilization_score":9223372036854775807,"captured_at":"2024-02-21T10:02:59.000Z"}"#; + + assert_eq!(s, expected); + } +} diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index a6fe7c67e1..479c7ca0f5 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -1379,6 +1379,25 @@ paths: schema: $ref: "#/components/schemas/ServiceUnavailableError" + /v1/utilization: + get: + description: | + Returns the pageservers current utilization and fitness score for new tenants. + + responses: + "200": + description: Pageserver utilization and fitness score + content: + application/json: + schema: + $ref: "#/components/schemas/PageserverUtilization" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + components: securitySchemes: JWT: @@ -1691,6 +1710,33 @@ components: type: string enum: [past, present, future, nodata] + PageserverUtilization: + type: object + required: + - disk_usage_bytes + - free_space_bytes + - utilization_score + properties: + disk_usage_bytes: + type: integer + format: int64 + minimum: 0 + description: The amount of disk space currently utilized by layer files. + free_space_bytes: + type: integer + format: int64 + minimum: 0 + description: The amount of usable disk space left. + utilization_score: + type: integer + format: int64 + minimum: 0 + maximum: 9223372036854775807 + default: 9223372036854775807 + description: | + Lower is better score for how good this pageserver would be for the next tenant. 
+ The default or maximum value can be returned in situations when a proper score cannot (yet) be calculated. + Error: type: object required: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 175353762c..1339229a70 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -100,6 +100,7 @@ pub struct State { disk_usage_eviction_state: Arc, deletion_queue_client: DeletionQueueClient, secondary_controller: SecondaryController, + latest_utilization: tokio::sync::Mutex>, } impl State { @@ -128,6 +129,7 @@ impl State { disk_usage_eviction_state, deletion_queue_client, secondary_controller, + latest_utilization: Default::default(), }) } } @@ -1963,6 +1965,54 @@ async fn put_io_engine_handler( json_response(StatusCode::OK, ()) } +/// Polled by control plane. +/// +/// See [`crate::utilization`]. +async fn get_utilization( + r: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + // this probably could be completely public, but lets make that change later. + check_permission(&r, None)?; + + let state = get_state(&r); + let mut g = state.latest_utilization.lock().await; + + let regenerate_every = Duration::from_secs(1); + let still_valid = g + .as_ref() + .is_some_and(|(captured_at, _)| captured_at.elapsed() < regenerate_every); + + // avoid needless statvfs calls even though those should be non-blocking fast. + // regenerate at most 1Hz to allow polling at any rate. + if !still_valid { + let path = state.conf.tenants_path(); + let doc = crate::utilization::regenerate(path.as_std_path()) + .map_err(ApiError::InternalServerError)?; + + let mut buf = Vec::new(); + serde_json::to_writer(&mut buf, &doc) + .context("serialize") + .map_err(ApiError::InternalServerError)?; + + let body = bytes::Bytes::from(buf); + + *g = Some((std::time::Instant::now(), body)); + } + + // hyper 0.14 doesn't yet have Response::clone so this is a bit of extra legwork + let cached = g.as_ref().expect("just set").1.clone(); + + Response::builder() + .header(hyper::http::header::CONTENT_TYPE, "application/json") + // thought of using http date header, but that is second precision which does not give any + // debugging aid + .status(StatusCode::OK) + .body(hyper::Body::from(cached)) + .context("build response") + .map_err(ApiError::InternalServerError) +} + /// Common functionality of all the HTTP API handlers. /// /// - Adds a tracing span to each request (by `request_span`) @@ -2224,5 +2274,6 @@ pub fn make_router( |r| api_handler(r, timeline_collect_keyspace), ) .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler)) + .get("/v1/utilization", |r| api_handler(r, get_utilization)) .any(handler_404)) } diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index c3f35142ec..cf6856458a 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -22,6 +22,7 @@ pub(crate) mod statvfs; pub mod task_mgr; pub mod tenant; pub mod trace; +pub mod utilization; pub mod virtual_file; pub mod walingest; pub mod walrecord; diff --git a/pageserver/src/utilization.rs b/pageserver/src/utilization.rs new file mode 100644 index 0000000000..830c9897ca --- /dev/null +++ b/pageserver/src/utilization.rs @@ -0,0 +1,38 @@ +//! An utilization metric which is used to decide on which pageserver to put next tenant. +//! +//! The metric is exposed via `GET /v1/utilization`. Refer and maintain it's openapi spec as the +//! truth. 
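+//!
+//! The HTTP handler in `http/routes.rs` caches the most recent value and regenerates it at
+//! most once per second, so the endpoint can be polled at any rate without repeated
+//! `statvfs` calls.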
+ +use anyhow::Context; +use std::path::Path; + +use pageserver_api::models::PageserverUtilization; + +pub(crate) fn regenerate(tenants_path: &Path) -> anyhow::Result { + // TODO: currently the http api ratelimits this to 1Hz at most, which is probably good enough + + let statvfs = nix::sys::statvfs::statvfs(tenants_path) + .map_err(std::io::Error::from) + .context("statvfs tenants directory")?; + + let blocksz = statvfs.block_size(); + + #[cfg_attr(not(target_os = "macos"), allow(clippy::unnecessary_cast))] + let free = statvfs.blocks_available() as u64 * blocksz; + let used = crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.get(); + let captured_at = std::time::SystemTime::now(); + + let doc = PageserverUtilization { + disk_usage_bytes: used, + free_space_bytes: free, + // lower is better; start with a constant + // + // note that u64::MAX will be output as i64::MAX as u64, but that should not matter + utilization_score: u64::MAX, + captured_at, + }; + + // TODO: make utilization_score into a metric + + Ok(doc) +} From c671aeacd425ce96ace8849c06fc1f9d2342e8c8 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 22 Feb 2024 14:19:11 +0100 Subject: [PATCH 0252/1571] fix(per-tenant throttling): incorrect `allowed_rps` field in log message (#6869) The `refill_interval` switched from a milliseconds usize to a Duration during a review follow-up, hence this slipped through manual testing. Part of https://github.com/neondatabase/neon/issues/5899 --- libs/pageserver_api/src/models.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 36aafe7341..aa1a8ae487 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -340,7 +340,7 @@ impl ThrottleConfig { } /// The requests per second allowed by the given config. pub fn steady_rps(&self) -> f64 { - (self.refill_amount.get() as f64) / (self.refill_interval.as_secs_f64()) / 1e3 + (self.refill_amount.get() as f64) / (self.refill_interval.as_secs_f64()) } } From 9c48b5c4ab5321ba45048c42b21c6eba70d519ce Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 22 Feb 2024 14:01:06 +0000 Subject: [PATCH 0253/1571] controller: improved handling of offline nodes (#6846) Stacks on https://github.com/neondatabase/neon/pull/6823 - Pending a heartbeating mechanism (#6844 ), use /re-attach calls as a cue to mark an offline node as active, so that a node which is unavailable during controller startup doesn't require manual intervention if it later starts/restarts. - Tweak scheduling logic so that when we schedule the attached location for a tenant, we prefer to select from secondary locations rather than picking a fresh one. This is an interim state until we implement #6844 and full chaos testing for handling failures. 
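To make the new scheduling preference concrete, here is a minimal, self-contained sketch of the rule described above: when a shard needs an attached location, first try to promote one of its existing secondaries if that node is currently schedulable, and only otherwise pick a fresh node. `NodeId`, `Node`, and `choose_attached` below are simplified stand-ins for illustration only; the real logic lives in `Scheduler::node_preferred` and `TenantState::schedule_attached` in the diff that follows.

```rust
use std::collections::HashMap;

#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
struct NodeId(u64);

struct Node {
    may_schedule: bool,
}

/// Prefer promoting one of the shard's existing secondaries; otherwise fall back
/// to any other node that is currently allowed to take work.
fn choose_attached(nodes: &HashMap<NodeId, Node>, secondaries: &[NodeId]) -> Option<NodeId> {
    // A secondary already holds (warm) layer data for the shard, so promoting it is cheap.
    for id in secondaries {
        if nodes.get(id).map(|n| n.may_schedule).unwrap_or(false) {
            return Some(*id);
        }
    }
    // No schedulable secondary: pick a fresh node that is not already used by this shard.
    nodes
        .iter()
        .filter(|&(id, node)| node.may_schedule && !secondaries.contains(id))
        .map(|(id, _)| *id)
        .next()
}

fn main() {
    let mut nodes = HashMap::new();
    nodes.insert(NodeId(1), Node { may_schedule: false }); // e.g. marked offline
    nodes.insert(NodeId(2), Node { may_schedule: true });

    // The shard's existing secondary on node 2 wins over picking a brand-new node.
    assert_eq!(choose_attached(&nodes, &[NodeId(2)]), Some(NodeId(2)));
    println!("{:?}", choose_attached(&nodes, &[NodeId(2)]));
}
```

In the real controller the promotion path also has to keep the scheduler's reference counts correct, which is why the diff adds a dedicated `IntentState::promote_attached` that assumes the node is already referenced as a secondary.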
--- control_plane/attachment_service/src/http.rs | 9 +- .../attachment_service/src/scheduler.rs | 90 +++++++---- .../attachment_service/src/service.rs | 10 +- .../attachment_service/src/tenant_state.rs | 148 ++++++++++++++++-- control_plane/src/pageserver.rs | 36 +++-- test_runner/regress/test_sharding_service.py | 9 +- 6 files changed, 230 insertions(+), 72 deletions(-) diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs index d85753bedc..15ae2a26b4 100644 --- a/control_plane/attachment_service/src/http.rs +++ b/control_plane/attachment_service/src/http.rs @@ -66,14 +66,7 @@ fn get_state(request: &Request) -> &HttpState { async fn handle_re_attach(mut req: Request) -> Result, ApiError> { let reattach_req = json_request::(&mut req).await?; let state = get_state(&req); - json_response( - StatusCode::OK, - state - .service - .re_attach(reattach_req) - .await - .map_err(ApiError::InternalServerError)?, - ) + json_response(StatusCode::OK, state.service.re_attach(reattach_req).await?) } /// Pageserver calls into this before doing deletions, to confirm that it still diff --git a/control_plane/attachment_service/src/scheduler.rs b/control_plane/attachment_service/src/scheduler.rs index fb3c7f634c..7059071bee 100644 --- a/control_plane/attachment_service/src/scheduler.rs +++ b/control_plane/attachment_service/src/scheduler.rs @@ -175,6 +175,33 @@ impl Scheduler { } } + /// Where we have several nodes to choose from, for example when picking a secondary location + /// to promote to an attached location, this method may be used to pick the best choice based + /// on the scheduler's knowledge of utilization and availability. + /// + /// If the input is empty, or all the nodes are not elegible for scheduling, return None: the + /// caller can pick a node some other way. + pub(crate) fn node_preferred(&self, nodes: &[NodeId]) -> Option { + if nodes.is_empty() { + return None; + } + + let node = nodes + .iter() + .map(|node_id| { + let may_schedule = self + .nodes + .get(node_id) + .map(|n| n.may_schedule) + .unwrap_or(false); + (*node_id, may_schedule) + }) + .max_by_key(|(_n, may_schedule)| *may_schedule); + + // If even the preferred node has may_schedule==false, return None + node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None }) + } + pub(crate) fn schedule_shard(&self, hard_exclude: &[NodeId]) -> Result { if self.nodes.is_empty() { return Err(ScheduleError::NoPageservers); @@ -224,44 +251,45 @@ impl Scheduler { } } +#[cfg(test)] +pub(crate) mod test_utils { + + use crate::node::Node; + use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy}; + use std::collections::HashMap; + use utils::id::NodeId; + /// Test helper: synthesize the requested number of nodes, all in active state. + /// + /// Node IDs start at one. 
+ pub(crate) fn make_test_nodes(n: u64) -> HashMap { + (1..n + 1) + .map(|i| { + ( + NodeId(i), + Node { + id: NodeId(i), + availability: NodeAvailability::Active, + scheduling: NodeSchedulingPolicy::Active, + listen_http_addr: format!("httphost-{i}"), + listen_http_port: 80 + i as u16, + listen_pg_addr: format!("pghost-{i}"), + listen_pg_port: 5432 + i as u16, + }, + ) + }) + .collect() + } +} + #[cfg(test)] mod tests { use super::*; - use std::collections::HashMap; - - use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy}; use utils::id::NodeId; - use crate::{node::Node, tenant_state::IntentState}; - + use crate::tenant_state::IntentState; #[test] fn scheduler_basic() -> anyhow::Result<()> { - let mut nodes = HashMap::new(); - nodes.insert( - NodeId(1), - Node { - id: NodeId(1), - availability: NodeAvailability::Active, - scheduling: NodeSchedulingPolicy::Active, - listen_http_addr: String::new(), - listen_http_port: 0, - listen_pg_addr: String::new(), - listen_pg_port: 0, - }, - ); - - nodes.insert( - NodeId(2), - Node { - id: NodeId(2), - availability: NodeAvailability::Active, - scheduling: NodeSchedulingPolicy::Active, - listen_http_addr: String::new(), - listen_http_port: 0, - listen_pg_addr: String::new(), - listen_pg_port: 0, - }, - ); + let nodes = test_utils::make_test_nodes(2); let mut scheduler = Scheduler::new(nodes.values()); let mut t1_intent = IntentState::new(); diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index 6366348017..0b9a7d8a69 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -914,7 +914,15 @@ impl Service { pub(crate) async fn re_attach( &self, reattach_req: ReAttachRequest, - ) -> anyhow::Result { + ) -> Result { + // Take a re-attach as indication that the node is available: this is a precursor to proper + // heartbeating in https://github.com/neondatabase/neon/issues/6844 + self.node_configure(NodeConfigureRequest { + node_id: reattach_req.node_id, + availability: Some(NodeAvailability::Active), + scheduling: None, + })?; + // Ordering: we must persist generation number updates before making them visible in the in-memory state let incremented_generations = self.persistence.re_attach(reattach_req.node_id).await?; diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs index 7970207e27..3cfffc6c45 100644 --- a/control_plane/attachment_service/src/tenant_state.rs +++ b/control_plane/attachment_service/src/tenant_state.rs @@ -143,6 +143,23 @@ impl IntentState { } } + /// Like set_attached, but the node is from [`Self::secondary`]. This swaps the node from + /// secondary to attached while maintaining the scheduler's reference counts. + pub(crate) fn promote_attached( + &mut self, + _scheduler: &mut Scheduler, + promote_secondary: NodeId, + ) { + // If we call this with a node that isn't in secondary, it would cause incorrect + // scheduler reference counting, since we assume the node is already referenced as a secondary. + debug_assert!(self.secondary.contains(&promote_secondary)); + + // TODO: when scheduler starts tracking attached + secondary counts separately, we will + // need to call into it here. 
+ self.secondary.retain(|n| n != &promote_secondary); + self.attached = Some(promote_secondary); + } + pub(crate) fn push_secondary(&mut self, scheduler: &mut Scheduler, new_secondary: NodeId) { debug_assert!(!self.secondary.contains(&new_secondary)); scheduler.node_inc_ref(new_secondary); @@ -197,6 +214,8 @@ impl IntentState { /// Returns true if a change was made pub(crate) fn notify_offline(&mut self, node_id: NodeId) -> bool { if self.attached == Some(node_id) { + // TODO: when scheduler starts tracking attached + secondary counts separately, we will + // need to call into it here. self.attached = None; self.secondary.push(node_id); true @@ -370,6 +389,9 @@ impl TenantState { // All remaining observed locations generate secondary intents. This includes None // observations, as these may well have some local content on disk that is usable (this // is an edge case that might occur if we restarted during a migration or other change) + // + // We may leave intent.attached empty if we didn't find any attached locations: [`Self::schedule`] + // will take care of promoting one of these secondaries to be attached. self.observed.locations.keys().for_each(|node_id| { if Some(*node_id) != self.intent.attached { self.intent.secondary.push(*node_id); @@ -377,6 +399,33 @@ impl TenantState { }); } + /// Part of [`Self::schedule`] that is used to choose exactly one node to act as the + /// attached pageserver for a shard. + /// + /// Returns whether we modified it, and the NodeId selected. + fn schedule_attached( + &mut self, + scheduler: &mut Scheduler, + ) -> Result<(bool, NodeId), ScheduleError> { + // No work to do if we already have an attached tenant + if let Some(node_id) = self.intent.attached { + return Ok((false, node_id)); + } + + if let Some(promote_secondary) = scheduler.node_preferred(&self.intent.secondary) { + // Promote a secondary + tracing::debug!("Promoted secondary {} to attached", promote_secondary); + self.intent.promote_attached(scheduler, promote_secondary); + Ok((true, promote_secondary)) + } else { + // Pick a fresh node: either we had no secondaries or none were schedulable + let node_id = scheduler.schedule_shard(&self.intent.secondary)?; + tracing::debug!("Selected {} as attached", node_id); + self.intent.set_attached(scheduler, Some(node_id)); + Ok((true, node_id)) + } + } + pub(crate) fn schedule(&mut self, scheduler: &mut Scheduler) -> Result<(), ScheduleError> { // TODO: before scheduling new nodes, check if any existing content in // self.intent refers to pageservers that are offline, and pick other @@ -387,19 +436,15 @@ impl TenantState { // Build the set of pageservers already in use by this tenant, to avoid scheduling // more work on the same pageservers we're already using. 
- let mut used_pageservers = self.intent.all_pageservers(); let mut modified = false; use PlacementPolicy::*; match self.policy { Single => { // Should have exactly one attached, and zero secondaries - if self.intent.attached.is_none() { - let node_id = scheduler.schedule_shard(&used_pageservers)?; - self.intent.set_attached(scheduler, Some(node_id)); - used_pageservers.push(node_id); - modified = true; - } + let (modified_attached, _attached_node_id) = self.schedule_attached(scheduler)?; + modified |= modified_attached; + if !self.intent.secondary.is_empty() { self.intent.clear_secondary(scheduler); modified = true; @@ -407,13 +452,10 @@ impl TenantState { } Double(secondary_count) => { // Should have exactly one attached, and N secondaries - if self.intent.attached.is_none() { - let node_id = scheduler.schedule_shard(&used_pageservers)?; - self.intent.set_attached(scheduler, Some(node_id)); - used_pageservers.push(node_id); - modified = true; - } + let (modified_attached, attached_node_id) = self.schedule_attached(scheduler)?; + modified |= modified_attached; + let mut used_pageservers = vec![attached_node_id]; while self.intent.secondary.len() < secondary_count { let node_id = scheduler.schedule_shard(&used_pageservers)?; self.intent.push_secondary(scheduler, node_id); @@ -702,3 +744,83 @@ impl TenantState { } } } + +#[cfg(test)] +pub(crate) mod tests { + use pageserver_api::shard::{ShardCount, ShardNumber}; + use utils::id::TenantId; + + use crate::scheduler::test_utils::make_test_nodes; + + use super::*; + + fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantState { + let tenant_id = TenantId::generate(); + let shard_number = ShardNumber(0); + let shard_count = ShardCount::new(1); + + let tenant_shard_id = TenantShardId { + tenant_id, + shard_number, + shard_count, + }; + TenantState::new( + tenant_shard_id, + ShardIdentity::new( + shard_number, + shard_count, + pageserver_api::shard::ShardStripeSize(32768), + ) + .unwrap(), + policy, + ) + } + + /// Test the scheduling behaviors used when a tenant configured for HA is subject + /// to nodes being marked offline. + #[test] + fn tenant_ha_scheduling() -> anyhow::Result<()> { + // Start with three nodes. Our tenant will only use two. The third one is + // expected to remain unused. 
+ let mut nodes = make_test_nodes(3); + + let mut scheduler = Scheduler::new(nodes.values()); + + let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1)); + tenant_state + .schedule(&mut scheduler) + .expect("we have enough nodes, scheduling should work"); + + // Expect to initially be schedule on to different nodes + assert_eq!(tenant_state.intent.secondary.len(), 1); + assert!(tenant_state.intent.attached.is_some()); + + let attached_node_id = tenant_state.intent.attached.unwrap(); + let secondary_node_id = *tenant_state.intent.secondary.iter().last().unwrap(); + assert_ne!(attached_node_id, secondary_node_id); + + // Notifying the attached node is offline should demote it to a secondary + let changed = tenant_state.intent.notify_offline(attached_node_id); + assert!(changed); + + // Update the scheduler state to indicate the node is offline + nodes.get_mut(&attached_node_id).unwrap().availability = NodeAvailability::Offline; + scheduler.node_upsert(nodes.get(&attached_node_id).unwrap()); + + // Scheduling the node should promote the still-available secondary node to attached + tenant_state + .schedule(&mut scheduler) + .expect("active nodes are available"); + assert_eq!(tenant_state.intent.attached.unwrap(), secondary_node_id); + + // The original attached node should have been retained as a secondary + assert_eq!( + *tenant_state.intent.secondary.iter().last().unwrap(), + attached_node_id + ); + + tenant_state.intent.clear(&mut scheduler); + + Ok(()) + } +} diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 8dd86bad96..5909477586 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -210,6 +210,25 @@ impl PageServerNode { update_config: bool, register: bool, ) -> anyhow::Result<()> { + // Register the node with the storage controller before starting pageserver: pageserver must be registered to + // successfully call /re-attach and finish starting up. 
+ if register { + let attachment_service = AttachmentService::from_env(&self.env); + let (pg_host, pg_port) = + parse_host_port(&self.conf.listen_pg_addr).expect("Unable to parse listen_pg_addr"); + let (http_host, http_port) = parse_host_port(&self.conf.listen_http_addr) + .expect("Unable to parse listen_http_addr"); + attachment_service + .node_register(NodeRegisterRequest { + node_id: self.conf.id, + listen_pg_addr: pg_host.to_string(), + listen_pg_port: pg_port.unwrap_or(5432), + listen_http_addr: http_host.to_string(), + listen_http_port: http_port.unwrap_or(80), + }) + .await?; + } + // TODO: using a thread here because start_process() is not async but we need to call check_status() let datadir = self.repo_path(); print!( @@ -248,23 +267,6 @@ impl PageServerNode { ) .await?; - if register { - let attachment_service = AttachmentService::from_env(&self.env); - let (pg_host, pg_port) = - parse_host_port(&self.conf.listen_pg_addr).expect("Unable to parse listen_pg_addr"); - let (http_host, http_port) = parse_host_port(&self.conf.listen_http_addr) - .expect("Unable to parse listen_http_addr"); - attachment_service - .node_register(NodeRegisterRequest { - node_id: self.conf.id, - listen_pg_addr: pg_host.to_string(), - listen_pg_port: pg_port.unwrap_or(5432), - listen_http_addr: http_host.to_string(), - listen_http_port: http_port.unwrap_or(80), - }) - .await?; - } - Ok(()) } diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py index e62d239d77..00c3a1628e 100644 --- a/test_runner/regress/test_sharding_service.py +++ b/test_runner/regress/test_sharding_service.py @@ -272,8 +272,13 @@ def test_sharding_service_onboarding( env.broker.try_start() env.attachment_service.start() - # This is the pageserver where we'll initially create the tenant - env.pageservers[0].start(register=False) + # This is the pageserver where we'll initially create the tenant. Run it in emergency + # mode so that it doesn't talk to storage controller, and do not register it. + env.pageservers[0].allowed_errors.append(".*Emergency mode!.*") + env.pageservers[0].start( + overrides=("--pageserver-config-override=control_plane_emergency_mode=true",), + register=False, + ) origin_ps = env.pageservers[0] # This is the pageserver managed by the sharding service, where the tenant From cf3baf60395b500f7632c7afc10a3c81f2a98e40 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 22 Feb 2024 14:10:49 +0000 Subject: [PATCH 0254/1571] storage controller: fix consistency check (#6855) - Some checks weren't properly returning an error when they failed - TenantState::to_persistent wasn't setting generation_pageserver properly - Changes to node scheduling policy weren't being persisted. --- control_plane/attachment_service/src/http.rs | 5 +- control_plane/attachment_service/src/node.rs | 2 +- .../attachment_service/src/persistence.rs | 49 +++++++++++------- .../attachment_service/src/service.rs | 50 ++++++++++++++++--- .../attachment_service/src/tenant_state.rs | 7 ++- 5 files changed, 84 insertions(+), 29 deletions(-) diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs index 15ae2a26b4..f9c4535bd5 100644 --- a/control_plane/attachment_service/src/http.rs +++ b/control_plane/attachment_service/src/http.rs @@ -325,7 +325,10 @@ async fn handle_node_configure(mut req: Request) -> Result, } let state = get_state(&req); - json_response(StatusCode::OK, state.service.node_configure(config_req)?) 
+ json_response( + StatusCode::OK, + state.service.node_configure(config_req).await?, + ) } async fn handle_tenant_shard_split( diff --git a/control_plane/attachment_service/src/node.rs b/control_plane/attachment_service/src/node.rs index 59784249d7..09162701ac 100644 --- a/control_plane/attachment_service/src/node.rs +++ b/control_plane/attachment_service/src/node.rs @@ -10,7 +10,7 @@ use crate::persistence::NodePersistence; /// /// The persistent subset of the Node is defined in [`crate::persistence::NodePersistence`]: the /// implementation of serialization on this type is only for debug dumps. -#[derive(Clone, Serialize, Eq, PartialEq)] +#[derive(Clone, Serialize)] pub(crate) struct Node { pub(crate) id: NodeId, diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs index 2d0c8a9d15..4f336093cf 100644 --- a/control_plane/attachment_service/src/persistence.rs +++ b/control_plane/attachment_service/src/persistence.rs @@ -6,7 +6,7 @@ use std::time::Duration; use self::split_state::SplitState; use camino::Utf8Path; use camino::Utf8PathBuf; -use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy}; +use control_plane::attachment_service::NodeSchedulingPolicy; use diesel::pg::PgConnection; use diesel::prelude::*; use diesel::Connection; @@ -130,24 +130,10 @@ impl Persistence { } /// At startup, populate the list of nodes which our shards may be placed on - pub(crate) async fn list_nodes(&self) -> DatabaseResult> { - let nodes: Vec = self + pub(crate) async fn list_nodes(&self) -> DatabaseResult> { + let nodes: Vec = self .with_conn(move |conn| -> DatabaseResult<_> { - Ok(crate::schema::nodes::table - .load::(conn)? - .into_iter() - .map(|n| Node { - id: NodeId(n.node_id as u64), - // At startup we consider a node offline until proven otherwise. - availability: NodeAvailability::Offline, - scheduling: NodeSchedulingPolicy::from_str(&n.scheduling_policy) - .expect("Bad scheduling policy in DB"), - listen_http_addr: n.listen_http_addr, - listen_http_port: n.listen_http_port as u16, - listen_pg_addr: n.listen_pg_addr, - listen_pg_port: n.listen_pg_port as u16, - }) - .collect::>()) + Ok(crate::schema::nodes::table.load::(conn)?) }) .await?; @@ -156,6 +142,31 @@ impl Persistence { Ok(nodes) } + pub(crate) async fn update_node( + &self, + input_node_id: NodeId, + input_scheduling: NodeSchedulingPolicy, + ) -> DatabaseResult<()> { + use crate::schema::nodes::dsl::*; + let updated = self + .with_conn(move |conn| { + let updated = diesel::update(nodes) + .filter(node_id.eq(input_node_id.0 as i64)) + .set((scheduling_policy.eq(String::from(input_scheduling)),)) + .execute(conn)?; + Ok(updated) + }) + .await?; + + if updated != 1 { + Err(DatabaseError::Logical(format!( + "Node {node_id:?} not found for update", + ))) + } else { + Ok(()) + } + } + /// At startup, load the high level state for shards, such as their config + policy. This will /// be enriched at runtime with state discovered on pageservers. 
pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult> { @@ -506,7 +517,7 @@ pub(crate) struct TenantShardPersistence { } /// Parts of [`crate::node::Node`] that are stored durably -#[derive(Serialize, Deserialize, Queryable, Selectable, Insertable)] +#[derive(Serialize, Deserialize, Queryable, Selectable, Insertable, Eq, PartialEq)] #[diesel(table_name = crate::schema::nodes)] pub(crate) struct NodePersistence { pub(crate) node_id: i64, diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index 0b9a7d8a69..38249b9223 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -622,7 +622,22 @@ impl Service { let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel(); tracing::info!("Loading nodes from database..."); - let nodes = persistence.list_nodes().await?; + let nodes = persistence + .list_nodes() + .await? + .into_iter() + .map(|n| Node { + id: NodeId(n.node_id as u64), + // At startup we consider a node offline until proven otherwise. + availability: NodeAvailability::Offline, + scheduling: NodeSchedulingPolicy::from_str(&n.scheduling_policy) + .expect("Bad scheduling policy in DB"), + listen_http_addr: n.listen_http_addr, + listen_http_port: n.listen_http_port as u16, + listen_pg_addr: n.listen_pg_addr, + listen_pg_port: n.listen_pg_port as u16, + }) + .collect::>(); let nodes: HashMap = nodes.into_iter().map(|n| (n.id, n)).collect(); tracing::info!("Loaded {} nodes from database.", nodes.len()); @@ -2326,7 +2341,11 @@ impl Service { .context("Scheduler checks") .map_err(ApiError::InternalServerError)?; - let expect_nodes = locked.nodes.values().cloned().collect::>(); + let expect_nodes = locked + .nodes + .values() + .map(|n| n.to_persistent()) + .collect::>(); let expect_shards = locked .tenants @@ -2338,8 +2357,8 @@ impl Service { }; let mut nodes = self.persistence.list_nodes().await?; - expect_nodes.sort_by_key(|n| n.id); - nodes.sort_by_key(|n| n.id); + expect_nodes.sort_by_key(|n| n.node_id); + nodes.sort_by_key(|n| n.node_id); if nodes != expect_nodes { tracing::error!("Consistency check failed on nodes."); @@ -2353,6 +2372,9 @@ impl Service { serde_json::to_string(&nodes) .map_err(|e| ApiError::InternalServerError(e.into()))? ); + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Node consistency failure" + ))); } let mut shards = self.persistence.list_tenant_shards().await?; @@ -2363,14 +2385,17 @@ impl Service { tracing::error!("Consistency check failed on shards."); tracing::error!( "Shards in memory: {}", - serde_json::to_string(&expect_nodes) + serde_json::to_string(&expect_shards) .map_err(|e| ApiError::InternalServerError(e.into()))? ); tracing::error!( "Shards in database: {}", - serde_json::to_string(&nodes) + serde_json::to_string(&shards) .map_err(|e| ApiError::InternalServerError(e.into()))? 
); + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Shard consistency failure" + ))); } Ok(()) @@ -2496,7 +2521,18 @@ impl Service { Ok(()) } - pub(crate) fn node_configure(&self, config_req: NodeConfigureRequest) -> Result<(), ApiError> { + pub(crate) async fn node_configure( + &self, + config_req: NodeConfigureRequest, + ) -> Result<(), ApiError> { + if let Some(scheduling) = config_req.scheduling { + // Scheduling is a persistent part of Node: we must write updates to the database before + // applying them in memory + self.persistence + .update_node(config_req.node_id, scheduling) + .await?; + } + let mut locked = self.inner.write().unwrap(); let result_tx = locked.result_tx.clone(); let compute_hook = locked.compute_hook.clone(); diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs index 3cfffc6c45..02f0171c29 100644 --- a/control_plane/attachment_service/src/tenant_state.rs +++ b/control_plane/attachment_service/src/tenant_state.rs @@ -737,7 +737,12 @@ impl TenantState { shard_count: self.tenant_shard_id.shard_count.literal() as i32, shard_stripe_size: self.shard.stripe_size.0 as i32, generation: self.generation.into().unwrap_or(0) as i32, - generation_pageserver: i64::MAX, + generation_pageserver: self + .intent + .get_attached() + .map(|n| n.0 as i64) + .unwrap_or(i64::MAX), + placement_policy: serde_json::to_string(&self.policy).unwrap(), config: serde_json::to_string(&self.config).unwrap(), splitting: SplitState::default(), From 2424d908831360eb143af8da06e56df5478b6e86 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 22 Feb 2024 17:15:18 +0100 Subject: [PATCH 0255/1571] CI: Split Proxy and Storage releases (#6797) ## Problem We want to release Proxy at a different cadence. 
## Summary of changes - build-and-test workflow: - Handle the `release-proxy` branch - Tag images built on this branch with `release-proxy-XXX` tag - Trigger deploy workflow with `deployStorage=true` & `deployStorageBroker=true` on `release` branch - Trigger deploy workflow with `deployPgSniRouter=true` & `deployProxy=true` on `release-proxy` branch - release workflow (scheduled creation of release branch): - Schedule Proxy releases for Thursdays (a random day to make it different from Storage releases) --- .github/workflows/build_and_test.yml | 38 ++++++++----- .github/workflows/release.yml | 83 ++++++++++++++++++++++++---- 2 files changed, 96 insertions(+), 25 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 5a807aa9fd..bc2f7dfe24 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -5,6 +5,7 @@ on: branches: - main - release + - release-proxy pull_request: defaults: @@ -67,6 +68,8 @@ jobs: echo "tag=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT elif [[ "$GITHUB_REF_NAME" == "release" ]]; then echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT + elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then + echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT else echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT @@ -682,7 +685,7 @@ jobs: }) trigger-e2e-tests: - if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' }} + if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' }} needs: [ check-permissions, promote-images, tag ] uses: ./.github/workflows/trigger-e2e-tests.yml secrets: inherit @@ -952,9 +955,7 @@ jobs: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} vm-compute-node-v16 - name: Add latest tag to images - if: | - (github.ref_name == 'main' || github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' + if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' run: | crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest @@ -966,9 +967,7 @@ jobs: crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest - name: Push images to production ECR - if: | - (github.ref_name == 'main' || github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' + if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy' run: | crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest @@ -992,9 +991,7 @@ jobs: crane push vm-compute-node-v16 neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} - name: Push latest tags to Docker 
Hub - if: | - (github.ref_name == 'main' || github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' + if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy' run: | crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest @@ -1084,7 +1081,7 @@ jobs: deploy: needs: [ check-permissions, promote-images, tag, regress-tests, trigger-custom-extensions-build-and-wait ] - if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch' + if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy' runs-on: [ self-hosted, gen3, small ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest @@ -1119,14 +1116,28 @@ jobs: # TODO: move deployPreprodRegion to release (`"$GITHUB_REF_NAME" == "release"` block), once Staging support different compute tag prefixes for different regions gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} + gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \ + -f deployPgSniRouter=false \ + -f deployProxy=false \ + -f deployStorage=true \ + -f deployStorageBroker=true \ + -f branch=main \ + -f dockerTag=${{needs.tag.outputs.build-tag}} + elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then + gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \ + -f deployPgSniRouter=true \ + -f deployProxy=true \ + -f deployStorage=false \ + -f deployStorageBroker=false \ + -f branch=main \ + -f dockerTag=${{needs.tag.outputs.build-tag}} else echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" exit 1 fi - name: Create git tag - if: github.ref_name == 'release' + if: github.ref_name == 'release' || github.ref_name == 'release-proxy' uses: actions/github-script@v7 with: # Retry script for 5XX server errors: https://github.com/actions/github-script#retries @@ -1139,6 +1150,7 @@ jobs: sha: context.sha, }) + # TODO: check how GitHub releases looks for proxy releases and enable it if it's ok - name: Create GitHub release if: github.ref_name == 'release' uses: actions/github-script@v7 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index ba37c5827a..80a718d61a 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -2,12 +2,31 @@ name: Create Release Branch on: schedule: - - cron: '0 6 * * 1' + # It should be kept in sync with if-condition in jobs + - cron: '0 6 * * MON' # Storage release + - cron: '0 6 * * THU' # Proxy release workflow_dispatch: + inputs: + create-storage-release-branch: + type: boolean + description: 'Create Storage release PR' + required: false + create-proxy-release-branch: + type: boolean + description: 'Create Proxy release PR' + required: false + +# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. 
+permissions: {} + +defaults: + run: + shell: bash -euo pipefail {0} jobs: - create_release_branch: - runs-on: [ ubuntu-latest ] + create-storage-release-branch: + if: ${{ github.event.schedule == '0 6 * * MON' || format('{0}', inputs.create-storage-release-branch) == 'true' }} + runs-on: ubuntu-latest permissions: contents: write # for `git push` @@ -18,27 +37,67 @@ jobs: with: ref: main - - name: Get current date - id: date - run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT + - name: Set environment variables + run: | + echo "RELEASE_DATE=$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV + echo "RELEASE_BRANCH=rc/$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV - name: Create release branch - run: git checkout -b releases/${{ steps.date.outputs.date }} + run: git checkout -b $RELEASE_BRANCH - name: Push new branch - run: git push origin releases/${{ steps.date.outputs.date }} + run: git push origin $RELEASE_BRANCH - name: Create pull request into release env: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} run: | cat << EOF > body.md - ## Release ${{ steps.date.outputs.date }} + ## Release ${RELEASE_DATE} - **Please merge this PR using 'Create a merge commit'!** + **Please merge this Pull Request using 'Create a merge commit' button** EOF - gh pr create --title "Release ${{ steps.date.outputs.date }}" \ + gh pr create --title "Release ${RELEASE_DATE}" \ --body-file "body.md" \ - --head "releases/${{ steps.date.outputs.date }}" \ + --head "${RELEASE_BRANCH}" \ --base "release" + + create-proxy-release-branch: + if: ${{ github.event.schedule == '0 6 * * THU' || format('{0}', inputs.create-proxy-release-branch) == 'true' }} + runs-on: ubuntu-latest + + permissions: + contents: write # for `git push` + + steps: + - name: Check out code + uses: actions/checkout@v4 + with: + ref: main + + - name: Set environment variables + run: | + echo "RELEASE_DATE=$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV + echo "RELEASE_BRANCH=rc/proxy/$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV + + - name: Create release branch + run: git checkout -b $RELEASE_BRANCH + + - name: Push new branch + run: git push origin $RELEASE_BRANCH + + - name: Create pull request into release + env: + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} + run: | + cat << EOF > body.md + ## Proxy release ${RELEASE_DATE} + + **Please merge this Pull Request using 'Create a merge commit' button** + EOF + + gh pr create --title "Proxy release ${RELEASE_DATE}}" \ + --body-file "body.md" \ + --head "${RELEASE_BRANCH}" \ + --base "release-proxy" From 9c6145f0a990cad18af412dc0262920969e3b469 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 22 Feb 2024 16:51:46 +0000 Subject: [PATCH 0256/1571] control_plane: fix a compilation error from racing PRs (#6882) Merge of two green PRs raced, and ended up with a non-compiling result. 
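For context, a minimal sketch of the mismatch (hypothetical names, not the controller's real signatures): one PR made the callee `async`, so the caller introduced by the other PR needed an extra `.await` before `?`; without it, `?` is applied to a future rather than a `Result` and the build fails.

```rust
// Hypothetical stand-ins for the two racing changes.
async fn node_configure() -> Result<(), String> {
    Ok(())
}

async fn re_attach() -> Result<(), String> {
    // `node_configure()?;` no longer compiles once the callee is async:
    // the call returns a future, and `?` needs a `Result`. Awaiting fixes it.
    node_configure().await?;
    Ok(())
}

fn main() {
    // Constructing the future is enough to check that the signatures line up;
    // a real caller would poll it on an async runtime.
    let _fut = re_attach();
    println!("ok");
}
```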
--- control_plane/attachment_service/src/service.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index 38249b9223..8a80d0c746 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -936,7 +936,8 @@ impl Service { node_id: reattach_req.node_id, availability: Some(NodeAvailability::Active), scheduling: None, - })?; + }) + .await?; // Ordering: we must persist generation number updates before making them visible in the in-memory state let incremented_generations = self.persistence.re_attach(reattach_req.node_id).await?; From 837988b6c9958138ba2471b210db48214fea9d2d Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Thu, 22 Feb 2024 12:49:02 -0500 Subject: [PATCH 0257/1571] compute_ctl: run migrations to grant default grantable privileges (#6884) ## Problem Following up on https://github.com/neondatabase/neon/pull/6845, we did not make the default privileges grantable before, and therefore, even if the users have full privileges, they are not able to grant them to others. Should be a final fix for https://github.com/neondatabase/neon/issues/6236. ## Summary of changes Add `WITH GRANT` to migrations so that neon_superuser can grant the permissions. --------- Signed-off-by: Alex Chi Z --- compute_tools/src/spec.rs | 6 ++++-- test_runner/regress/test_migrations.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 27d95c30e7..8667a76b1f 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -778,8 +778,10 @@ END $$;"#, "GRANT pg_monitor TO neon_superuser WITH ADMIN OPTION", // ensure tables created by superusers (i.e., when creating extensions) can be used by neon_superuser. - "ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser", - "ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser", + "ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser", // to-be removed in the future + "ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser", // to-be removed in the future + "ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION", + "ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION", ]; let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration"; diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py index 997297a5cd..3f626c5c7c 100644 --- a/test_runner/regress/test_migrations.py +++ b/test_runner/regress/test_migrations.py @@ -15,7 +15,7 @@ def test_migrations(neon_simple_env: NeonEnv): endpoint.wait_for_migrations() - num_migrations = 6 + num_migrations = 8 with endpoint.cursor() as cur: cur.execute("SELECT id FROM neon_migration.migration_id") From d669dacd71465054a14d172fb4b521933fa0ea6d Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Thu, 22 Feb 2024 09:05:37 -0900 Subject: [PATCH 0258/1571] Add pgpartman (#6849) ## Problem ## Summary of changes ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? 
- [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --- Dockerfile.compute-node | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index c34f3684e9..149ca5109b 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -786,6 +786,22 @@ RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_iv make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_ivm.control +######################################################################################### +# +# Layer "pg_partman" +# compile pg_partman extension +# +######################################################################################### +FROM build-deps AS pg-partman-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +ENV PATH "/usr/local/pgsql/bin/:$PATH" +RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \ + echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \ + mkdir pg_partman-src && cd pg_partman-src && tar xvzf ../pg_partman.tar.gz --strip-components=1 -C . && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_partman.control ######################################################################################### # @@ -829,6 +845,7 @@ COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-partman-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ From 47657f2df4defda9630fc3728ce50d35cdf9a0dd Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 22 Feb 2024 21:33:38 +0200 Subject: [PATCH 0259/1571] Flush logical messages with snapshots and replication origin (#6826) ## Problem See https://neondb.slack.com/archives/C04DGM6SMTM/p1708363190710839 ## Summary of changes Flush logical message with snapshot and origin state ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 9dd9956c55..17101190de 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 9dd9956c55ffbbd9abe77d10382453757fedfcf5 +Subproject commit 17101190de8a54b95e0831c66c3da426ed33db34 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index ca2def9993..0baccce15a 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit ca2def999368d9df098a637234ad5a9003189463 +Subproject commit 0baccce15a3b0446af5c403d2e869a04541b63c4 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 9c37a49884..dc40299045 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 9c37a4988463a97d9cacb321acf3828b09823269 +Subproject commit dc40299045a377ec3b302c900134468a1b0f58ee diff --git a/vendor/revisions.json b/vendor/revisions.json index 72bc0d7e0d..d18f1588f5 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "postgres-v16": "9c37a4988463a97d9cacb321acf3828b09823269", - "postgres-v15": "ca2def999368d9df098a637234ad5a9003189463", - "postgres-v14": "9dd9956c55ffbbd9abe77d10382453757fedfcf5" + "postgres-v16": "dc40299045a377ec3b302c900134468a1b0f58ee", + "postgres-v15": "0baccce15a3b0446af5c403d2e869a04541b63c4", + "postgres-v14": "17101190de8a54b95e0831c66c3da426ed33db34" } From 5bcae3a86e52b806f48e1c747353ad9cb7fb06d1 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 13 Feb 2024 12:23:38 +0300 Subject: [PATCH 0260/1571] Drop LR slots if too many .snap files are found. PR #6655 turned out to be not enough to prevent .snap files bloat; some subscribers just don't ack flushed position, thus never advancing the slot. Probably other bloating scenarios are also possible, so add a more direct restriction -- drop all slots if too many .snap files has been discovered. 
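To illustrate the cutoff rule (the actual implementation in the diff below is C, in `pgxn/neon/neon.c`): sort the LSNs of the discovered .snap files from newest to oldest and use the LSN at index `logical_replication_max_snap_files - 1` as the threshold; any slot whose `restart_lsn` is below that threshold gets dropped. A self-contained Rust sketch of the selection, under the simplifying assumption that LSNs are plain `u64` values:

```rust
/// Sketch of the threshold computation: given the LSNs of existing .snap
/// files, return the cutoff below which slots are dropped, or 0 if the file
/// count is already within the limit (so nothing is dropped).
fn snap_cutoff(mut snap_lsns: Vec<u64>, max_snap_files: usize) -> u64 {
    // Newest snapshots first.
    snap_lsns.sort_unstable_by(|a, b| b.cmp(a));
    if max_snap_files == 0 || snap_lsns.len() < max_snap_files {
        return 0;
    }
    // Keep the newest `max_snap_files`; slots pinning anything older are dropped.
    snap_lsns[max_snap_files - 1]
}

fn main() {
    // Four snapshot files, limit of two: the cutoff is the 2nd-newest LSN,
    // so slots holding only the two oldest snapshots fall below it.
    assert_eq!(snap_cutoff(vec![0x10, 0x40, 0x20, 0x30], 2), 0x30);
    // Already within the limit: no cutoff, nothing to drop.
    assert_eq!(snap_cutoff(vec![0x10, 0x20], 3), 0);
    println!("ok");
}
```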
--- pgxn/neon/neon.c | 226 +++++++++++++----- .../regress/test_logical_replication.py | 48 +++- 2 files changed, 213 insertions(+), 61 deletions(-) diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 799f88751c..24ec909c79 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -37,7 +37,7 @@ PG_MODULE_MAGIC; void _PG_init(void); -static int logical_replication_max_time_lag = 3600; +static int logical_replication_max_snap_files = 300; static void InitLogicalReplicationMonitor(void) @@ -45,14 +45,14 @@ InitLogicalReplicationMonitor(void) BackgroundWorker bgw; DefineCustomIntVariable( - "neon.logical_replication_max_time_lag", - "Threshold for dropping unused logical replication slots", - NULL, - &logical_replication_max_time_lag, - 3600, 0, INT_MAX, - PGC_SIGHUP, - GUC_UNIT_S, - NULL, NULL, NULL); + "neon.logical_replication_max_snap_files", + "Maximum allowed logical replication .snap files", + NULL, + &logical_replication_max_snap_files, + 300, 0, INT_MAX, + PGC_SIGHUP, + 0, + NULL, NULL, NULL); memset(&bgw, 0, sizeof(bgw)); bgw.bgw_flags = BGWORKER_SHMEM_ACCESS; @@ -68,22 +68,99 @@ InitLogicalReplicationMonitor(void) RegisterBackgroundWorker(&bgw); } -typedef struct +static int +LsnDescComparator(const void *a, const void *b) { - NameData name; - bool dropped; - XLogRecPtr confirmed_flush_lsn; - TimestampTz last_updated; -} SlotStatus; + XLogRecPtr lsn1 = *((const XLogRecPtr *) a); + XLogRecPtr lsn2 = *((const XLogRecPtr *) b); + + if (lsn1 < lsn2) + return 1; + else if (lsn1 == lsn2) + return 0; + else + return -1; +} + +/* + * Look at .snap files and calculate minimum allowed restart_lsn of slot so that + * next gc would leave not more than logical_replication_max_snap_files; all + * slots having lower restart_lsn should be dropped. + */ +static XLogRecPtr +get_num_snap_files_lsn_threshold(void) +{ + DIR *dirdesc; + struct dirent *de; + char *snap_path = "pg_logical/snapshots/"; + int cnt = 0; + int lsns_allocated = 1024; + int lsns_num = 0; + XLogRecPtr *lsns; + XLogRecPtr cutoff; + + if (logical_replication_max_snap_files < 0) + return 0; + + lsns = palloc(sizeof(XLogRecPtr) * lsns_allocated); + + /* find all .snap files and get their lsns */ + dirdesc = AllocateDir(snap_path); + while ((de = ReadDir(dirdesc, snap_path)) != NULL) + { + XLogRecPtr lsn; + uint32 hi; + uint32 lo; + + if (strcmp(de->d_name, ".") == 0 || + strcmp(de->d_name, "..") == 0) + continue; + + if (sscanf(de->d_name, "%X-%X.snap", &hi, &lo) != 2) + { + ereport(LOG, + (errmsg("could not parse file name as .snap file \"%s\"", de->d_name))); + continue; + } + + lsn = ((uint64) hi) << 32 | lo; + elog(DEBUG5, "found snap file %X/%X", LSN_FORMAT_ARGS(lsn)); + if (lsns_allocated == lsns_num) + { + lsns_allocated *= 2; + lsns = repalloc(lsns, sizeof(XLogRecPtr) * lsns_allocated); + } + lsns[lsns_num++] = lsn; + } + /* sort by lsn desc */ + qsort(lsns, lsns_num, sizeof(XLogRecPtr), LsnDescComparator); + /* and take cutoff at logical_replication_max_snap_files */ + if (logical_replication_max_snap_files > lsns_num) + cutoff = 0; + /* have less files than cutoff */ + else + { + cutoff = lsns[logical_replication_max_snap_files - 1]; + elog(LOG, "ls_monitor: dropping logical slots with restart_lsn lower %X/%X, found %d .snap files, limit is %d", + LSN_FORMAT_ARGS(cutoff), lsns_num, logical_replication_max_snap_files); + } + pfree(lsns); + FreeDir(dirdesc); + return cutoff; +} + +#define LS_MONITOR_CHECK_INTERVAL 10000 /* ms */ /* * Unused logical replication slots pins WAL and prevents deletion of snapshots. 
+ * WAL bloat is guarded by max_slot_wal_keep_size; this bgw removes slots which + * need too many .snap files. */ PGDLLEXPORT void LogicalSlotsMonitorMain(Datum main_arg) { - SlotStatus* slots; - TimestampTz now, last_checked; + TimestampTz now, + last_checked; /* Establish signal handlers. */ pqsignal(SIGUSR1, procsignal_sigusr1_handler); @@ -92,72 +169,101 @@ LogicalSlotsMonitorMain(Datum main_arg) BackgroundWorkerUnblockSignals(); - slots = (SlotStatus*)calloc(max_replication_slots, sizeof(SlotStatus)); - last_checked = GetCurrentTimestamp(); - for (;;) { - (void) WaitLatch(MyLatch, - WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT, - logical_replication_max_time_lag*1000/2, - PG_WAIT_EXTENSION); - ResetLatch(MyLatch); - CHECK_FOR_INTERRUPTS(); + XLogRecPtr cutoff_lsn; - now = GetCurrentTimestamp(); - - if (now - last_checked > logical_replication_max_time_lag*USECS_PER_SEC) + /* + * If there are too many .snap files, just drop all logical slots to + * prevent aux files bloat. + */ + cutoff_lsn = get_num_snap_files_lsn_threshold(); + if (cutoff_lsn > 0) { - int n_active_slots = 0; - last_checked = now; - - LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); for (int i = 0; i < max_replication_slots; i++) { + char slot_name[NAMEDATALEN]; ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i]; + XLogRecPtr restart_lsn; + /* find the name */ + LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); /* Consider only logical repliction slots */ if (!s->in_use || !SlotIsLogical(s)) - continue; - - if (s->active_pid != 0) { - n_active_slots += 1; + LWLockRelease(ReplicationSlotControlLock); continue; } - /* Check if there was some activity with the slot since last check */ - if (s->data.confirmed_flush != slots[i].confirmed_flush_lsn) + /* do we need to drop it? */ + SpinLockAcquire(&s->mutex); + restart_lsn = s->data.restart_lsn; + SpinLockRelease(&s->mutex); + if (restart_lsn >= cutoff_lsn) { - slots[i].confirmed_flush_lsn = s->data.confirmed_flush; - slots[i].last_updated = now; + LWLockRelease(ReplicationSlotControlLock); + continue; } - else if (now - slots[i].last_updated > logical_replication_max_time_lag*USECS_PER_SEC) - { - slots[i].name = s->data.name; - slots[i].dropped = true; - } - } - LWLockRelease(ReplicationSlotControlLock); - /* - * If there are no active subscriptions, then no new snapshots are generated - * and so no need to force slot deletion. - */ - if (n_active_slots != 0) - { - for (int i = 0; i < max_replication_slots; i++) + strlcpy(slot_name, s->data.name.data, NAMEDATALEN); + elog(LOG, "ls_monitor: dropping slot %s with restart_lsn %X/%X below horizon %X/%X", + slot_name, LSN_FORMAT_ARGS(restart_lsn), LSN_FORMAT_ARGS(cutoff_lsn)); + LWLockRelease(ReplicationSlotControlLock); + + /* now try to drop it, killing owner before if any */ + for (;;) { - if (slots[i].dropped) + pid_t active_pid; + + SpinLockAcquire(&s->mutex); + active_pid = s->active_pid; + SpinLockRelease(&s->mutex); + + if (active_pid == 0) { - elog(LOG, "Drop logical replication slot because it was not update more than %ld seconds", - (now - slots[i].last_updated)/USECS_PER_SEC); - ReplicationSlotDrop(slots[i].name.data, true); - slots[i].dropped = false; + /* + * Slot is releasted, try to drop it. Though of course + * it could have been reacquired, so drop can ERROR + * out. Similarly it could have been dropped in the + * meanwhile. + * + * In principle we could remove pg_try/pg_catch, that + * would restart the whole bgworker. 
+ */ + ConditionVariableCancelSleep(); + PG_TRY(); + { + ReplicationSlotDrop(slot_name, true); + elog(LOG, "ls_monitor: slot %s dropped", slot_name); + } + PG_CATCH(); + { + /* log ERROR and reset elog stack */ + EmitErrorReport(); + FlushErrorState(); + elog(LOG, "ls_monitor: failed to drop slot %s", slot_name); + } + PG_END_TRY(); + break; + } + else + { + /* kill the owner and wait for release */ + elog(LOG, "ls_monitor: killing slot %s owner %d", slot_name, active_pid); + (void) kill(active_pid, SIGTERM); + /* We shouldn't get stuck, but to be safe add timeout. */ + ConditionVariableTimedSleep(&s->active_cv, 1000, WAIT_EVENT_REPLICATION_SLOT_DROP); } } } } + + (void) WaitLatch(MyLatch, + WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT, + LS_MONITOR_CHECK_INTERVAL, + PG_WAIT_EXTENSION); + ResetLatch(MyLatch); + CHECK_FOR_INTERRUPTS(); } } diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index eff0b124d3..3f4ca8070d 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -1,4 +1,5 @@ import time +from functools import partial from random import choice from string import ascii_lowercase @@ -10,7 +11,7 @@ from fixtures.neon_fixtures import ( wait_for_last_flush_lsn, ) from fixtures.types import Lsn -from fixtures.utils import query_scalar +from fixtures.utils import query_scalar, wait_until def random_string(n: int): @@ -157,6 +158,51 @@ COMMIT; assert endpoint.safe_psql("select count(*) from pg_replication_slots")[0][0] == 1 +# Test that neon.logical_replication_max_snap_files works +def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg): + def slot_removed(ep): + assert ( + endpoint.safe_psql( + "select count(*) from pg_replication_slots where slot_name = 'stale_slot'" + )[0][0] + == 0 + ) + + env = neon_simple_env + + env.neon_cli.create_branch("test_logical_replication", "empty") + # set low neon.logical_replication_max_snap_files + endpoint = env.endpoints.create_start( + "test_logical_replication", + config_lines=["log_statement=all", "neon.logical_replication_max_snap_files=1"], + ) + + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + # create obsolete slot + cur.execute("select pg_create_logical_replication_slot('stale_slot', 'pgoutput');") + assert ( + endpoint.safe_psql( + "select count(*) from pg_replication_slots where slot_name = 'stale_slot'" + )[0][0] + == 1 + ) + + # now insert some data and create and start live subscriber to create more .snap files + # (in most cases this is not needed as stale_slot snap will have higher LSN than restart_lsn anyway) + cur.execute("create table t(pk integer primary key, payload integer)") + cur.execute("create publication pub1 for table t") + + vanilla_pg.start() + vanilla_pg.safe_psql("create table t(pk integer primary key, payload integer)") + connstr = endpoint.connstr().replace("'", "''") + log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}") + vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") + + wait_until(number_of_iterations=10, interval=2, func=partial(slot_removed, endpoint)) + + # Test compute start at LSN page of which starts with contrecord # https://github.com/neondatabase/neon/issues/5749 def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg): From 12487e662de751e5125797389b5052141aa7e41b Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Thu, 22 Feb 2024 17:00:03 -0500 Subject: [PATCH 
0261/1571] compute_ctl: move default privileges grants to handle_grants (#6885) ## Problem Following up https://github.com/neondatabase/neon/pull/6884, hopefully, a real final fix for https://github.com/neondatabase/neon/issues/6236. ## Summary of changes `handle_migrations` is done over the main `postgres` db connection. Therefore, the privileges assigned here do not work with databases created later (i.e., `neondb`). This pull request moves the grants to `handle_grants`, so that it runs for each DB created. The SQL is added into the `BEGIN/END` block, so that it takes only one RTT to apply all of them. Signed-off-by: Alex Chi Z --- compute_tools/src/spec.rs | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 8667a76b1f..b515f9f408 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -655,6 +655,9 @@ pub fn handle_grants( // remove this code if possible. The worst thing that could happen is that // user won't be able to use public schema in NEW databases created in the // very OLD project. + // + // Also, alter default permissions so that relations created by extensions can be + // used by neon_superuser without permission issues. let grant_query = "DO $$\n\ BEGIN\n\ IF EXISTS(\n\ @@ -673,6 +676,8 @@ pub fn handle_grants( GRANT CREATE ON SCHEMA public TO web_access;\n\ END IF;\n\ END IF;\n\ + ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;\n\ + ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION;\n\ END\n\ $$;" .to_string(); @@ -777,11 +782,12 @@ BEGIN END $$;"#, "GRANT pg_monitor TO neon_superuser WITH ADMIN OPTION", - // ensure tables created by superusers (i.e., when creating extensions) can be used by neon_superuser. - "ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser", // to-be removed in the future - "ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser", // to-be removed in the future - "ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION", - "ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION", + // Don't remove: these are some SQLs that we originally applied in migrations but turned out to execute somewhere else. + "", + "", + "", + "", + // Add new migrations below. ]; let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration"; @@ -808,8 +814,13 @@ $$;"#, client.simple_query(query)?; while current_migration < migrations.len() { - info!("Running migration:\n{}\n", migrations[current_migration]); - client.simple_query(migrations[current_migration])?; + let migration = &migrations[current_migration]; + if migration.is_empty() { + info!("Skip migration id={}", current_migration); + } else { + info!("Running migration:\n{}\n", migration); + client.simple_query(migration)?; + } current_migration += 1; } let setval = format!( From 6f8f7c7de9cb925d99797e379f6aa936b98ed05a Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 23 Feb 2024 12:36:18 +0100 Subject: [PATCH 0262/1571] CI: Build images using docker buildx instead of kaniko (#6871) ## Problem To "build" a compute image that doesn't have anything new, kaniko takes 13m[0], docker buildx does it in 5m[1]. Also, kaniko doesn't fully support bash expressions in the Dockerfile `RUN`, so we have to use different workarounds for this (like `bash -c ...`). 
- [0] https://github.com/neondatabase/neon/actions/runs/8011512414/job/21884933687 - [1] https://github.com/neondatabase/neon/actions/runs/8008245697/job/21874278162 ## Summary of changes - Use docker buildx to build `compute-node` images - Use docker buildx to build `neon-image` image - Use docker buildx to build `compute-tools` image - Use docker hub for image cache (instead of ECR) --- .github/workflows/build_and_test.yml | 267 ++++++++++++++------------- 1 file changed, 141 insertions(+), 126 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index bc2f7dfe24..5def619c07 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -693,158 +693,173 @@ jobs: neon-image: needs: [ check-permissions, build-buildtools-image, tag ] runs-on: [ self-hosted, gen3, large ] - container: gcr.io/kaniko-project/executor:v1.9.2-debug - defaults: - run: - shell: sh -eu {0} steps: - name: Checkout - uses: actions/checkout@v1 # v3 won't work with kaniko + uses: actions/checkout@v4 with: submodules: true fetch-depth: 0 - - name: Configure ECR and Docker Hub login + # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings + # The default value is ~/.docker + - name: Set custom docker config directory run: | - DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64) - echo "::add-mask::${DOCKERHUB_AUTH}" + mkdir -p .docker-custom + echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV + - uses: docker/setup-buildx-action@v3 - cat <<-EOF > /kaniko/.docker/config.json - { - "auths": { - "https://index.docker.io/v1/": { - "auth": "${DOCKERHUB_AUTH}" - } - }, - "credHelpers": { - "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login" - } - } - EOF + - uses: docker/login-action@v3 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - name: Kaniko build neon - run: - /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true - --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache - --context . - --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} - --build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }} - --build-arg TAG=${{ needs.build-buildtools-image.outputs.build-tools-tag }} - --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com - --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} - --destination neondatabase/neon:${{needs.tag.outputs.build-tag}} + - uses: docker/login-action@v3 + with: + registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com + username: ${{ secrets.AWS_ACCESS_KEY_DEV }} + password: ${{ secrets.AWS_SECRET_KEY_DEV }} - # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied - - name: Cleanup ECR folder - run: rm -rf ~/.ecr + - uses: docker/build-push-action@v5 + with: + context: . 
+ build-args: | + GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} + BUILD_TAG=${{ needs.tag.outputs.build-tag }} + TAG=${{ needs.build-buildtools-image.outputs.build-tools-tag }} + REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com + provenance: false + push: true + pull: true + file: Dockerfile + cache-from: type=registry,ref=neondatabase/neon:cache + cache-to: type=registry,ref=neondatabase/neon:cache,mode=max + tags: | + 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} + neondatabase/neon:${{needs.tag.outputs.build-tag}} + + - name: Remove custom docker config directory + if: always() + run: | + rm -rf .docker-custom compute-tools-image: runs-on: [ self-hosted, gen3, large ] needs: [ check-permissions, build-buildtools-image, tag ] - container: gcr.io/kaniko-project/executor:v1.9.2-debug - defaults: - run: - shell: sh -eu {0} steps: - name: Checkout - uses: actions/checkout@v1 # v3 won't work with kaniko - - - name: Configure ECR and Docker Hub login - run: | - DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64) - echo "::add-mask::${DOCKERHUB_AUTH}" - - cat <<-EOF > /kaniko/.docker/config.json - { - "auths": { - "https://index.docker.io/v1/": { - "auth": "${DOCKERHUB_AUTH}" - } - }, - "credHelpers": { - "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login" - } - } - EOF - - - name: Kaniko build compute tools - run: - /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true - --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache - --context . - --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} - --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} - --build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}} - --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com - --dockerfile Dockerfile.compute-tools - --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} - --destination neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} - - # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied - - name: Cleanup ECR folder - run: rm -rf ~/.ecr - - compute-node-image: - needs: [ check-permissions, build-buildtools-image, tag ] - runs-on: [ self-hosted, gen3, large ] - container: - image: gcr.io/kaniko-project/executor:v1.9.2-debug - # Workaround for "Resolving download.osgeo.org (download.osgeo.org)... 
failed: Temporary failure in name resolution."" - # Should be prevented by https://github.com/neondatabase/neon/issues/4281 - options: --add-host=download.osgeo.org:140.211.15.30 - strategy: - fail-fast: false - matrix: - version: [ v14, v15, v16 ] - defaults: - run: - shell: sh -eu {0} - - steps: - - name: Checkout - uses: actions/checkout@v1 # v3 won't work with kaniko + uses: actions/checkout@v4 with: submodules: true fetch-depth: 0 - - name: Configure ECR and Docker Hub login + # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings + # The default value is ~/.docker + - name: Set custom docker config directory run: | - DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64) - echo "::add-mask::${DOCKERHUB_AUTH}" + mkdir -p .docker-custom + echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV + - uses: docker/setup-buildx-action@v3 - cat <<-EOF > /kaniko/.docker/config.json - { - "auths": { - "https://index.docker.io/v1/": { - "auth": "${DOCKERHUB_AUTH}" - } - }, - "credHelpers": { - "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login" - } - } - EOF + - uses: docker/login-action@v3 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - name: Kaniko build compute node with extensions - run: - /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true - --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache - --context . - --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} - --build-arg PG_VERSION=${{ matrix.version }} - --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} - --build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}} - --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com - --dockerfile Dockerfile.compute-node - --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} - --destination neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} - --cleanup + - uses: docker/login-action@v3 + with: + registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com + username: ${{ secrets.AWS_ACCESS_KEY_DEV }} + password: ${{ secrets.AWS_SECRET_KEY_DEV }} - # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied - - name: Cleanup ECR folder - run: rm -rf ~/.ecr + - uses: docker/build-push-action@v5 + with: + context: . 
+ build-args: | + GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} + BUILD_TAG=${{needs.tag.outputs.build-tag}} + TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}} + REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com + provenance: false + push: true + pull: true + file: Dockerfile.compute-tools + cache-from: type=registry,ref=neondatabase/compute-tools:cache + cache-to: type=registry,ref=neondatabase/compute-tools:cache,mode=max + tags: | + 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} + neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} + + - name: Remove custom docker config directory + if: always() + run: | + rm -rf .docker-custom + + compute-node-image: + needs: [ check-permissions, build-buildtools-image, tag ] + runs-on: [ self-hosted, gen3, large ] + + strategy: + fail-fast: false + matrix: + version: [ v14, v15, v16 ] + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: true + fetch-depth: 0 + + # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings + # The default value is ~/.docker + - name: Set custom docker config directory + run: | + mkdir -p .docker-custom + echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV + - uses: docker/setup-buildx-action@v3 + with: + # Disable parallelism for docker buildkit. + # As we already build everything with `make -j$(nproc)`, running it in additional level of parallelisam blows up the Runner. + config-inline: | + [worker.oci] + max-parallelism = 1 + + - uses: docker/login-action@v3 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + + - uses: docker/login-action@v3 + with: + registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com + username: ${{ secrets.AWS_ACCESS_KEY_DEV }} + password: ${{ secrets.AWS_SECRET_KEY_DEV }} + + - uses: docker/build-push-action@v5 + with: + context: . + build-args: | + GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} + PG_VERSION=${{ matrix.version }} + BUILD_TAG=${{needs.tag.outputs.build-tag}} + TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}} + REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com + provenance: false + push: true + pull: true + file: Dockerfile.compute-node + cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache + cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache,mode=max + tags: | + 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + + - name: Remove custom docker config directory + if: always() + run: | + rm -rf .docker-custom vm-compute-node-image: needs: [ check-permissions, tag, compute-node-image ] From cd449d66ea29ad2d7269458e90623c3ae40e1816 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 23 Feb 2024 14:33:47 +0100 Subject: [PATCH 0263/1571] stop writing `metadata` file (#6769) Building atop #6777, this PR removes the code that writes the `metadata` file and adds a piece of migration code that removes any remaining `metadata` files. We'll remove the migration code after this PR has been deployed. 
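The cleanup follows the usual crash-safe deletion pattern: unlink the legacy file, then fsync the directory that contained it so the unlink itself survives a crash, and treat an already-missing file as success so the pass can safely run again after a restart. A minimal sketch of that pattern (names and structure are illustrative, not the exact code added in this PR):

```rust
use std::io::ErrorKind;
use std::path::Path;

// Remove a legacy file and make the removal durable by syncing the
// directory entry change. A missing file is fine: an earlier (possibly
// crashed) run may already have removed it.
fn remove_legacy_file_durably(dir: &Path, file_name: &str) -> std::io::Result<()> {
    match std::fs::remove_file(dir.join(file_name)) {
        Ok(()) => {
            // The unlink only becomes durable once the parent directory is fsynced.
            std::fs::File::open(dir)?.sync_all()?;
        }
        Err(e) if e.kind() == ErrorKind::NotFound => {}
        Err(e) => return Err(e),
    }
    Ok(())
}
```

In this PR the real version runs once per timeline directory while tenant configs are loaded, so directories that never had a `metadata` file are simply skipped.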
part of https://github.com/neondatabase/neon/issues/6663 More cleanups punted into follow-up issue, as they touch a lot of code: https://github.com/neondatabase/neon/issues/6890 --- pageserver/src/config.rs | 13 +- pageserver/src/lib.rs | 9 -- pageserver/src/tenant.rs | 34 +---- pageserver/src/tenant/metadata.rs | 37 +----- pageserver/src/tenant/mgr.rs | 61 ++++++++- pageserver/src/tenant/secondary/downloader.rs | 3 +- pageserver/src/tenant/timeline.rs | 68 ++-------- pageserver/src/tenant/timeline/delete.rs | 123 +++--------------- test_runner/regress/test_remote_storage.py | 6 +- test_runner/regress/test_tenant_delete.py | 1 - test_runner/regress/test_tenant_relocation.py | 5 +- test_runner/regress/test_timeline_delete.py | 5 +- 12 files changed, 95 insertions(+), 270 deletions(-) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 34d9636673..3b7672fa26 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -39,7 +39,7 @@ use crate::tenant::{ }; use crate::virtual_file; use crate::{ - IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME, + IGNORED_TENANT_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX, }; @@ -826,17 +826,6 @@ impl PageServerConf { .join(connection_id.to_string()) } - /// Points to a place in pageserver's local directory, - /// where certain timeline's metadata file should be located. - pub fn metadata_path( - &self, - tenant_shard_id: &TenantShardId, - timeline_id: &TimelineId, - ) -> Utf8PathBuf { - self.timeline_path(tenant_shard_id, timeline_id) - .join(METADATA_FILE_NAME) - } - /// Turns storage remote path of a file into its local path. pub fn local_path(&self, remote_path: &RemotePath) -> Utf8PathBuf { remote_path.with_base(&self.workdir) diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index cf6856458a..02a690d4e1 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -169,15 +169,6 @@ pub fn is_delete_mark(path: &Utf8Path) -> bool { ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX) } -fn is_walkdir_io_not_found(e: &walkdir::Error) -> bool { - if let Some(e) = e.io_error() { - if e.kind() == std::io::ErrorKind::NotFound { - return true; - } - } - false -} - /// During pageserver startup, we need to order operations not to exhaust tokio worker threads by /// blocking. 
/// diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 7021921b12..9fa087f0d9 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -172,9 +172,6 @@ pub(crate) mod throttle; pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline}; -// re-export for use in remote_timeline_client.rs -pub use crate::tenant::metadata::save_metadata; - // re-export for use in walreceiver pub use crate::tenant::timeline::WalReceiverInfo; @@ -1151,17 +1148,6 @@ impl Tenant { None }; - // timeline loading after attach expects to find metadata file for each metadata - save_metadata( - self.conf, - &self.tenant_shard_id, - &timeline_id, - &remote_metadata, - ) - .await - .context("save_metadata") - .map_err(LoadLocalTimelineError::Load)?; - self.timeline_init_and_sync( timeline_id, resources, @@ -3293,10 +3279,7 @@ impl Tenant { timeline_struct.init_empty_layer_map(start_lsn); - if let Err(e) = self - .create_timeline_files(&uninit_mark.timeline_path, &new_timeline_id, new_metadata) - .await - { + if let Err(e) = self.create_timeline_files(&uninit_mark.timeline_path).await { error!("Failed to create initial files for timeline {tenant_shard_id}/{new_timeline_id}, cleaning up: {e:?}"); cleanup_timeline_directory(uninit_mark); return Err(e); @@ -3313,26 +3296,13 @@ impl Tenant { )) } - async fn create_timeline_files( - &self, - timeline_path: &Utf8Path, - new_timeline_id: &TimelineId, - new_metadata: &TimelineMetadata, - ) -> anyhow::Result<()> { + async fn create_timeline_files(&self, timeline_path: &Utf8Path) -> anyhow::Result<()> { crashsafe::create_dir(timeline_path).context("Failed to create timeline directory")?; fail::fail_point!("after-timeline-uninit-mark-creation", |_| { anyhow::bail!("failpoint after-timeline-uninit-mark-creation"); }); - save_metadata( - self.conf, - &self.tenant_shard_id, - new_timeline_id, - new_metadata, - ) - .await - .context("Failed to create timeline metadata")?; Ok(()) } diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 1a20a237a7..1736950d1f 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -8,20 +8,11 @@ //! //! [`remote_timeline_client`]: super::remote_timeline_client -use std::io::{self}; - -use anyhow::{ensure, Context}; -use pageserver_api::shard::TenantShardId; +use anyhow::ensure; use serde::{de::Error, Deserialize, Serialize, Serializer}; -use thiserror::Error; use utils::bin_ser::SerializeError; -use utils::crashsafe::path_with_suffix_extension; use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn}; -use crate::config::PageServerConf; -use crate::virtual_file::VirtualFile; -use crate::TEMP_FILE_SUFFIX; - /// Use special format number to enable backward compatibility. 
const METADATA_FORMAT_VERSION: u16 = 4; @@ -268,32 +259,6 @@ impl Serialize for TimelineMetadata { } } -/// Save timeline metadata to file -#[tracing::instrument(skip_all, fields(%tenant_id=tenant_shard_id.tenant_id, %shard_id=tenant_shard_id.shard_slug(), %timeline_id))] -pub async fn save_metadata( - conf: &'static PageServerConf, - tenant_shard_id: &TenantShardId, - timeline_id: &TimelineId, - data: &TimelineMetadata, -) -> anyhow::Result<()> { - let path = conf.metadata_path(tenant_shard_id, timeline_id); - let temp_path = path_with_suffix_extension(&path, TEMP_FILE_SUFFIX); - let metadata_bytes = data.to_bytes().context("serialize metadata")?; - VirtualFile::crashsafe_overwrite(&path, &temp_path, metadata_bytes) - .await - .context("write metadata")?; - Ok(()) -} - -#[derive(Error, Debug)] -pub enum LoadMetadataError { - #[error(transparent)] - Read(#[from] io::Error), - - #[error(transparent)] - Decode(#[from] anyhow::Error), -} - #[cfg(test)] mod tests { use super::*; diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index c765c6bacf..8f0f73d4b5 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -42,7 +42,7 @@ use crate::tenant::config::{ use crate::tenant::delete::DeleteTenantFlow; use crate::tenant::span::debug_assert_current_span_has_tenant_id; use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState}; -use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX}; +use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TEMP_FILE_SUFFIX}; use utils::crashsafe::path_with_suffix_extension; use utils::fs_ext::PathExt; @@ -359,12 +359,6 @@ fn load_tenant_config( return Ok(None); } - let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME); - if tenant_ignore_mark_file.exists() { - info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant"); - return Ok(None); - } - let tenant_shard_id = match tenant_dir_path .file_name() .unwrap_or_default() @@ -377,6 +371,59 @@ fn load_tenant_config( } }; + // Clean up legacy `metadata` files. + // Doing it here because every single tenant directory is visited here. + // In any later code, there's different treatment of tenant dirs + // ... depending on whether the tenant is in re-attach response or not + // ... 
epending on whether the tenant is ignored or not + assert_eq!( + &conf.tenant_path(&tenant_shard_id), + &tenant_dir_path, + "later use of conf....path() methods would be dubious" + ); + let timelines: Vec = match conf.timelines_path(&tenant_shard_id).read_dir_utf8() { + Ok(iter) => { + let mut timelines = Vec::new(); + for res in iter { + let p = res?; + let Some(timeline_id) = p.file_name().parse::().ok() else { + // skip any entries that aren't TimelineId, such as + // - *.___temp dirs + // - unfinished initdb uploads (test_non_uploaded_root_timeline_is_deleted_after_restart) + continue; + }; + timelines.push(timeline_id); + } + timelines + } + Err(e) if e.kind() == std::io::ErrorKind::NotFound => vec![], + Err(e) => return Err(anyhow::anyhow!(e)), + }; + for timeline_id in timelines { + let timeline_path = &conf.timeline_path(&tenant_shard_id, &timeline_id); + let metadata_path = timeline_path.join(METADATA_FILE_NAME); + match std::fs::remove_file(&metadata_path) { + Ok(()) => { + crashsafe::fsync(timeline_path) + .context("fsync timeline dir after removing legacy metadata file")?; + info!("removed legacy metadata file at {metadata_path}"); + } + Err(e) if e.kind() == std::io::ErrorKind::NotFound => { + // something removed the file earlier, or it was never there + // We don't care, this software version doesn't write it again, so, we're good. + } + Err(e) => { + anyhow::bail!("remove legacy metadata file: {e}: {metadata_path}"); + } + } + } + + let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME); + if tenant_ignore_mark_file.exists() { + info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant"); + return Ok(None); + } + Ok(Some(( tenant_shard_id, Tenant::load_tenant_config(conf, &tenant_shard_id), diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 88a0cb8025..c8dc89cc6c 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -45,7 +45,7 @@ use rand::Rng; use remote_storage::{DownloadError, GenericRemoteStorage}; use tokio_util::sync::CancellationToken; -use tracing::{info_span, instrument, Instrument}; +use tracing::{info_span, instrument, warn, Instrument}; use utils::{ backoff, completion::Barrier, crashsafe::path_with_suffix_extension, fs_ext, id::TimelineId, }; @@ -791,6 +791,7 @@ async fn init_timeline_state( let file_name = file_path.file_name().expect("created it from the dentry"); if file_name == METADATA_FILE_NAME { // Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant. 
+ warn!(path=?dentry.path(), "found legacy metadata file, these should have been removed in load_tenant_config"); continue; } else if crate::is_temporary(&file_path) { // Temporary files are frequently left behind from restarting during downloads diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 6ee05116f8..2c2351d531 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -54,7 +54,7 @@ use crate::pgdatadir_mapping::DirectoryKind; use crate::tenant::timeline::logical_size::CurrentLogicalSize; use crate::tenant::{ layer_map::{LayerMap, SearchResult}, - metadata::{save_metadata, TimelineMetadata}, + metadata::TimelineMetadata, par_fsync, }; use crate::{ @@ -345,7 +345,7 @@ pub struct Timeline { /// /// Must only be taken in two places: /// - [`Timeline::compact`] (this file) - /// - [`delete::delete_local_layer_files`] + /// - [`delete::delete_local_timeline_directory`] /// /// Timeline deletion will acquire both compaction and gc locks in whatever order. compaction_lock: tokio::sync::Mutex<()>, @@ -354,7 +354,7 @@ pub struct Timeline { /// /// Must only be taken in two places: /// - [`Timeline::gc`] (this file) - /// - [`delete::delete_local_layer_files`] + /// - [`delete::delete_local_timeline_directory`] /// /// Timeline deletion will acquire both compaction and gc locks in whatever order. gc_lock: tokio::sync::Mutex<()>, @@ -1845,7 +1845,11 @@ impl Timeline { discovered_layers.push((file_name, file_size)); continue; } - Discovered::Metadata | Discovered::IgnoredBackup => { + Discovered::Metadata => { + warn!("found legacy metadata file, these should have been removed in load_tenant_config"); + continue; + } + Discovered::IgnoredBackup => { continue; } Discovered::Unknown(file_name) => { @@ -2352,7 +2356,7 @@ impl Timeline { fail::fail_point!("timeline-calculate-logical-size-check-dir-exists", |_| { if !self .conf - .metadata_path(&self.tenant_shard_id, &self.timeline_id) + .timeline_path(&self.tenant_shard_id, &self.timeline_id) .exists() { error!("timeline-calculate-logical-size-pre metadata file does not exist") @@ -3207,7 +3211,7 @@ impl Timeline { // The new on-disk layers are now in the layer map. We can remove the // in-memory layer from the map now. The flushed layer is stored in // the mapping in `create_delta_layer`. - let metadata = { + { let mut guard = self.layers.write().await; if self.cancel.is_cancelled() { @@ -3221,9 +3225,7 @@ impl Timeline { self.disk_consistent_lsn.store(disk_consistent_lsn); // Schedule remote uploads that will reflect our new disk_consistent_lsn - Some(self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?) - } else { - None + self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?; } // release lock on 'layers' }; @@ -3238,22 +3240,6 @@ impl Timeline { // This failpoint is used by another test case `test_pageserver_recovery`. fail_point!("flush-frozen-exit"); - // Update the metadata file, with new 'disk_consistent_lsn' - // - // TODO: This perhaps should be done in 'flush_frozen_layers', after flushing - // *all* the layers, to avoid fsyncing the file multiple times. - - // If we updated our disk_consistent_lsn, persist the updated metadata to local disk. 
- if let Some(metadata) = metadata { - save_metadata( - self.conf, - &self.tenant_shard_id, - &self.timeline_id, - &metadata, - ) - .await - .context("save_metadata")?; - } Ok(()) } @@ -3309,25 +3295,6 @@ impl Timeline { Ok(metadata) } - async fn update_metadata_file( - &self, - disk_consistent_lsn: Lsn, - layers_to_upload: impl IntoIterator, - ) -> anyhow::Result<()> { - let metadata = self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?; - - save_metadata( - self.conf, - &self.tenant_shard_id, - &self.timeline_id, - &metadata, - ) - .await - .context("save_metadata")?; - - Ok(()) - } - pub(crate) async fn preserve_initdb_archive(&self) -> anyhow::Result<()> { if let Some(remote_client) = &self.remote_client { remote_client @@ -4660,18 +4627,11 @@ impl Timeline { .replace((new_gc_cutoff, wanted_image_layers.to_keyspace())); if !layers_to_remove.is_empty() { - // Persist the new GC cutoff value in the metadata file, before - // we actually remove anything. - // - // This does not in fact have any effect as we no longer consider local metadata unless - // running without remote storage. - // + // Persist the new GC cutoff value before we actually remove anything. // This unconditionally schedules also an index_part.json update, even though, we will // be doing one a bit later with the unlinked gc'd layers. - // - // TODO: remove when implementing . - self.update_metadata_file(self.disk_consistent_lsn.load(), None) - .await?; + let disk_consistent_lsn = self.disk_consistent_lsn.load(); + self.schedule_uploads(disk_consistent_lsn, None)?; let gc_layers = layers_to_remove .iter() diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index d2e9eda906..a0c9d99196 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -6,7 +6,7 @@ use std::{ use anyhow::Context; use pageserver_api::{models::TimelineState, shard::TenantShardId}; use tokio::sync::OwnedMutexGuard; -use tracing::{debug, error, info, instrument, warn, Instrument}; +use tracing::{debug, error, info, instrument, Instrument}; use utils::{crashsafe, fs_ext, id::TimelineId}; use crate::{ @@ -124,7 +124,7 @@ async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTi /// No timeout here, GC & Compaction should be responsive to the /// `TimelineState::Stopping` change. // pub(super): documentation link -pub(super) async fn delete_local_layer_files( +pub(super) async fn delete_local_timeline_directory( conf: &PageServerConf, tenant_shard_id: TenantShardId, timeline: &Timeline, @@ -149,8 +149,6 @@ pub(super) async fn delete_local_layer_files( // NB: This need not be atomic because the deleted flag in the IndexPart // will be observed during tenant/timeline load. The deletion will be resumed there. // - // For configurations without remote storage, we guarantee crash-safety by persising delete mark file. - // // Note that here we do not bail out on std::io::ErrorKind::NotFound. // This can happen if we're called a second time, e.g., // because of a previous failure/cancellation at/after @@ -158,72 +156,21 @@ pub(super) async fn delete_local_layer_files( // // ErrorKind::NotFound can also happen if we race with tenant detach, because, // no locks are shared. - // - // For now, log and continue. - // warn! level is technically not appropriate for the - // first case because we should expect retries to happen. - // But the error is so rare, it seems better to get attention if it happens. 
- // - // Note that metadata removal is skipped, this is not technically needed, - // but allows to reuse timeline loading code during resumed deletion. - // (we always expect that metadata is in place when timeline is being loaded) + tokio::fs::remove_dir_all(local_timeline_directory) + .await + .or_else(fs_ext::ignore_not_found) + .context("remove local timeline directory")?; - #[cfg(feature = "testing")] - let mut counter = 0; - - // Timeline directory may not exist if we failed to delete mark file and request was retried. - if !local_timeline_directory.exists() { - return Ok(()); - } - - let metadata_path = conf.metadata_path(&tenant_shard_id, &timeline.timeline_id); - - for entry in walkdir::WalkDir::new(&local_timeline_directory).contents_first(true) { - #[cfg(feature = "testing")] - { - counter += 1; - if counter == 2 { - fail::fail_point!("timeline-delete-during-rm", |_| { - Err(anyhow::anyhow!("failpoint: timeline-delete-during-rm"))? - }); - } - } - - let entry = entry?; - if entry.path() == metadata_path { - debug!("found metadata, skipping"); - continue; - } - - if entry.path() == local_timeline_directory { - // Keeping directory because metedata file is still there - debug!("found timeline dir itself, skipping"); - continue; - } - - let metadata = match entry.metadata() { - Ok(metadata) => metadata, - Err(e) => { - if crate::is_walkdir_io_not_found(&e) { - warn!( - timeline_dir=?local_timeline_directory, - path=?entry.path().display(), - "got not found err while removing timeline dir, proceeding anyway" - ); - continue; - } - anyhow::bail!(e); - } - }; - - if metadata.is_dir() { - warn!(path=%entry.path().display(), "unexpected directory under timeline dir"); - tokio::fs::remove_dir(entry.path()).await - } else { - tokio::fs::remove_file(entry.path()).await - } - .with_context(|| format!("Failed to remove: {}", entry.path().display()))?; - } + // Make sure previous deletions are ordered before mark removal. + // Otherwise there is no guarantee that they reach the disk before mark deletion. + // So its possible for mark to reach disk first and for other deletions + // to be reordered later and thus missed if a crash occurs. + // Note that we dont need to sync after mark file is removed + // because we can tolerate the case when mark file reappears on startup. + let timeline_path = conf.timelines_path(&tenant_shard_id); + crashsafe::fsync_async(timeline_path) + .await + .context("fsync_pre_mark_remove")?; info!("finished deleting layer files, releasing locks"); drop(guards); @@ -254,39 +201,6 @@ async fn cleanup_remaining_timeline_fs_traces( tenant_shard_id: TenantShardId, timeline_id: TimelineId, ) -> anyhow::Result<()> { - // Remove local metadata - tokio::fs::remove_file(conf.metadata_path(&tenant_shard_id, &timeline_id)) - .await - .or_else(fs_ext::ignore_not_found) - .context("remove metadata")?; - - fail::fail_point!("timeline-delete-after-rm-metadata", |_| { - Err(anyhow::anyhow!( - "failpoint: timeline-delete-after-rm-metadata" - ))? - }); - - // Remove timeline dir - tokio::fs::remove_dir(conf.timeline_path(&tenant_shard_id, &timeline_id)) - .await - .or_else(fs_ext::ignore_not_found) - .context("timeline dir")?; - - fail::fail_point!("timeline-delete-after-rm-dir", |_| { - Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm-dir"))? - }); - - // Make sure previous deletions are ordered before mark removal. - // Otherwise there is no guarantee that they reach the disk before mark deletion. 
- // So its possible for mark to reach disk first and for other deletions - // to be reordered later and thus missed if a crash occurs. - // Note that we dont need to sync after mark file is removed - // because we can tolerate the case when mark file reappears on startup. - let timeline_path = conf.timelines_path(&tenant_shard_id); - crashsafe::fsync_async(timeline_path) - .await - .context("fsync_pre_mark_remove")?; - // Remove delete mark // TODO: once we are confident that no more exist in the field, remove this // line. It cleans up a legacy marker file that might in rare cases be present. @@ -552,15 +466,12 @@ impl DeleteTimelineFlow { tenant: &Tenant, timeline: &Timeline, ) -> Result<(), DeleteTimelineError> { - delete_local_layer_files(conf, tenant.tenant_shard_id, timeline).await?; + delete_local_timeline_directory(conf, tenant.tenant_shard_id, timeline).await?; delete_remote_layers_and_index(timeline).await?; pausable_failpoint!("in_progress_delete"); - cleanup_remaining_timeline_fs_traces(conf, tenant.tenant_shard_id, timeline.timeline_id) - .await?; - remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?; *guard = Self::Finished; diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 18eba6e1c3..95f912ccc5 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -694,10 +694,8 @@ def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnv # index upload is now hitting the failpoint, it should block the shutdown env.pageserver.stop(immediate=True) - local_metadata = ( - env.pageserver.timeline_dir(env.initial_tenant, new_branch_timeline_id) / "metadata" - ) - assert local_metadata.is_file() + timeline_dir = env.pageserver.timeline_dir(env.initial_tenant, new_branch_timeline_id) + assert timeline_dir.is_dir() assert isinstance(env.pageserver_remote_storage, LocalFsStorage) diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index e928ea8bb1..8c7d332e1d 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -130,7 +130,6 @@ FAILPOINTS = [ "timeline-delete-before-index-deleted-at", "timeline-delete-before-rm", "timeline-delete-before-index-delete", - "timeline-delete-after-rm-dir", ] FAILPOINTS_BEFORE_BACKGROUND = [ diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index f4eb6b092d..b70131472a 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -157,10 +157,7 @@ def switch_pg_to_new_pageserver( timeline_to_detach_local_path = origin_ps.timeline_dir(tenant_id, timeline_id) files_before_detach = os.listdir(timeline_to_detach_local_path) assert ( - "metadata" in files_before_detach - ), f"Regular timeline {timeline_to_detach_local_path} should have the metadata file, but got: {files_before_detach}" - assert ( - len(files_before_detach) >= 2 + len(files_before_detach) >= 1 ), f"Regular timeline {timeline_to_detach_local_path} should have at least one layer file, but got {files_before_detach}" return timeline_to_detach_local_path diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 5fda5aa569..a6a6fb47cc 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -136,12 +136,9 @@ DELETE_FAILPOINTS = [ 
"timeline-delete-before-index-deleted-at", "timeline-delete-before-schedule", "timeline-delete-before-rm", - "timeline-delete-during-rm", "timeline-delete-after-rm", "timeline-delete-before-index-delete", "timeline-delete-after-index-delete", - "timeline-delete-after-rm-metadata", - "timeline-delete-after-rm-dir", ] @@ -801,7 +798,7 @@ def test_timeline_delete_resumed_on_attach( ) # failpoint before we remove index_part from s3 - failpoint = "timeline-delete-during-rm" + failpoint = "timeline-delete-after-rm" ps_http.configure_failpoints((failpoint, "return")) env.pageserver.allowed_errors.extend( From a12e4261a32522f3e95602870ca44a18c95766fb Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Fri, 23 Feb 2024 13:56:41 +0000 Subject: [PATCH 0264/1571] Add neon.primary_is_running GUC. (#6705) We set it for neon replica, if primary is running. Postgres uses this GUC at the start, to determine if replica should wait for RUNNING_XACTS from primary or not. Corresponding cloud PR is https://github.com/neondatabase/cloud/pull/10183 * Add test hot-standby replica startup. * Extract oldest_running_xid from XlRunningXits WAL records. --------- Co-authored-by: Konstantin Knizhnik Co-authored-by: Konstantin Knizhnik Co-authored-by: Heikki Linnakangas --- compute_tools/src/config.rs | 6 ++++ control_plane/src/endpoint.rs | 1 + libs/compute_api/src/spec.rs | 6 ++++ libs/postgres_ffi/src/pg_constants.rs | 3 ++ libs/postgres_ffi/src/xlog_utils.rs | 5 --- pageserver/src/walingest.rs | 13 +++++++ pageserver/src/walrecord.rs | 36 +++++++++++++++++++ pgxn/neon/neon.c | 11 ++++++ test_runner/fixtures/neon_fixtures.py | 17 +++++++++ test_runner/regress/test_hot_standby.py | 19 ++-------- test_runner/regress/test_replication_start.py | 30 ++++++++++++++++ vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 7 ++-- 15 files changed, 132 insertions(+), 28 deletions(-) create mode 100644 test_runner/regress/test_replication_start.py diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 03fd56aa97..42b8480211 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -82,6 +82,12 @@ pub fn write_postgres_conf( ComputeMode::Replica => { // hot_standby is 'on' by default, but let's be explicit writeln!(file, "hot_standby=on")?; + + // Inform the replica about the primary state + // Default is 'false' + if let Some(primary_is_running) = spec.primary_is_running { + writeln!(file, "neon.primary_is_running={}", primary_is_running)?; + } } } diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index bab7a70ce7..de7eb797d6 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -590,6 +590,7 @@ impl Endpoint { remote_extensions, pgbouncer_settings: None, shard_stripe_size: Some(shard_stripe_size), + primary_is_running: None, }; let spec_path = self.endpoint_path().join("spec.json"); std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?; diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 2f412b61a3..71ae66c45c 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -79,6 +79,12 @@ pub struct ComputeSpec { // Stripe size for pageserver sharding, in pages #[serde(default)] pub shard_stripe_size: Option, + + // When we are starting a new replica in hot standby mode, + // we need to know if the primary is running. + // This is used to determine if replica should wait for + // RUNNING_XACTS from primary or not. 
+ pub primary_is_running: Option, } /// Feature flag to signal `compute_ctl` to enable certain experimental functionality. diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs index d59e0e4a15..2701ddf5e0 100644 --- a/libs/postgres_ffi/src/pg_constants.rs +++ b/libs/postgres_ffi/src/pg_constants.rs @@ -80,6 +80,9 @@ pub const XLOG_XACT_ABORT: u8 = 0x20; pub const XLOG_XACT_COMMIT_PREPARED: u8 = 0x30; pub const XLOG_XACT_ABORT_PREPARED: u8 = 0x40; +// From standbydefs.h +pub const XLOG_RUNNING_XACTS: u8 = 0x10; + // From srlu.h pub const SLRU_PAGES_PER_SEGMENT: u32 = 32; pub const SLRU_SEG_SIZE: usize = BLCKSZ as usize * SLRU_PAGES_PER_SEGMENT as usize; diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 977653848d..4a66a0ab1d 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -119,11 +119,6 @@ pub fn generate_pg_control( // Generate new pg_control needed for bootstrap checkpoint.redo = normalize_lsn(lsn, WAL_SEGMENT_SIZE).0; - //reset some fields we don't want to preserve - //TODO Check this. - //We may need to determine the value from twophase data. - checkpoint.oldestActiveXid = 0; - //save new values in pg_control pg_control.checkPoint = 0; pg_control.checkPointCopy = checkpoint; diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 8df2f1713a..3a2705bb50 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -334,6 +334,12 @@ impl WalIngest { { self.checkpoint.oldestXid = xlog_checkpoint.oldestXid; } + trace!( + "xlog_checkpoint.oldestActiveXid={}, checkpoint.oldestActiveXid={}", + xlog_checkpoint.oldestActiveXid, + self.checkpoint.oldestActiveXid + ); + self.checkpoint.oldestActiveXid = xlog_checkpoint.oldestActiveXid; // Write a new checkpoint key-value pair on every checkpoint record, even // if nothing really changed. 
Not strictly required, but it seems nice to @@ -360,6 +366,13 @@ impl WalIngest { } } } + pg_constants::RM_STANDBY_ID => { + let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + if info == pg_constants::XLOG_RUNNING_XACTS { + let xlrec = crate::walrecord::XlRunningXacts::decode(&mut buf); + self.checkpoint.oldestActiveXid = xlrec.oldest_running_xid; + } + } _x => { // TODO: should probably log & fail here instead of blindly // doing something without understanding the protocol diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index 1b7777a544..ae2d996879 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -773,6 +773,42 @@ impl XlLogicalMessage { } } +#[repr(C)] +#[derive(Debug)] +pub struct XlRunningXacts { + pub xcnt: u32, + pub subxcnt: u32, + pub subxid_overflow: bool, + pub next_xid: TransactionId, + pub oldest_running_xid: TransactionId, + pub latest_completed_xid: TransactionId, + pub xids: Vec, +} + +impl XlRunningXacts { + pub fn decode(buf: &mut Bytes) -> XlRunningXacts { + let xcnt = buf.get_u32_le(); + let subxcnt = buf.get_u32_le(); + let subxid_overflow = buf.get_u32_le() != 0; + let next_xid = buf.get_u32_le(); + let oldest_running_xid = buf.get_u32_le(); + let latest_completed_xid = buf.get_u32_le(); + let mut xids = Vec::new(); + for _ in 0..(xcnt + subxcnt) { + xids.push(buf.get_u32_le()); + } + XlRunningXacts { + xcnt, + subxcnt, + subxid_overflow, + next_xid, + oldest_running_xid, + latest_completed_xid, + xids, + } + } +} + /// Main routine to decode a WAL record and figure out which blocks are modified // // See xlogrecord.h for details diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 24ec909c79..a14288b33a 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -38,6 +38,7 @@ PG_MODULE_MAGIC; void _PG_init(void); static int logical_replication_max_snap_files = 300; +bool primary_is_running = false; static void InitLogicalReplicationMonitor(void) @@ -267,6 +268,7 @@ LogicalSlotsMonitorMain(Datum main_arg) } } + void _PG_init(void) { @@ -287,6 +289,15 @@ _PG_init(void) pg_init_extension_server(); + DefineCustomBoolVariable( + "neon.primary_is_running", + "true if the primary was running at replica startup. false otherwise", + NULL, + &primary_is_running, + false, + PGC_POSTMASTER, + 0, + NULL, NULL, NULL); /* * Important: This must happen after other parts of the extension are * loaded, otherwise any settings to GUCs that were set before the diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 79a4c7cde8..441b64ebfc 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3104,6 +3104,8 @@ class Endpoint(PgProtocol): # set small 'max_replication_write_lag' to enable backpressure # and make tests more stable. 
config_lines = ["max_replication_write_lag=15MB"] + config_lines + + config_lines = ["neon.primary_is_running=on"] + config_lines self.config(config_lines) return self @@ -4147,6 +4149,21 @@ def tenant_get_shards( return [(TenantShardId(tenant_id, 0, 0), override_pageserver or env.pageserver)] +def wait_replica_caughtup(primary: Endpoint, secondary: Endpoint): + primary_lsn = Lsn( + primary.safe_psql_scalar("SELECT pg_current_wal_flush_lsn()", log_query=False) + ) + while True: + secondary_lsn = Lsn( + secondary.safe_psql_scalar("SELECT pg_last_wal_replay_lsn()", log_query=False) + ) + caught_up = secondary_lsn >= primary_lsn + log.info(f"caughtup={caught_up}, primary_lsn={primary_lsn}, secondary_lsn={secondary_lsn}") + if caught_up: + return + time.sleep(1) + + def wait_for_last_flush_lsn( env: NeonEnv, endpoint: Endpoint, diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index 7822e29ed9..0497e1965c 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -3,22 +3,7 @@ import re import time from fixtures.log_helper import log -from fixtures.neon_fixtures import Endpoint, NeonEnv - - -def wait_caughtup(primary: Endpoint, secondary: Endpoint): - primary_lsn = primary.safe_psql_scalar( - "SELECT pg_current_wal_insert_lsn()::text", log_query=False - ) - while True: - secondary_lsn = secondary.safe_psql_scalar( - "SELECT pg_last_wal_replay_lsn()", log_query=False - ) - caught_up = secondary_lsn >= primary_lsn - log.info(f"caughtup={caught_up}, primary_lsn={primary_lsn}, secondary_lsn={secondary_lsn}") - if caught_up: - return - time.sleep(1) +from fixtures.neon_fixtures import NeonEnv, wait_replica_caughtup # Check for corrupted WAL messages which might otherwise go unnoticed if @@ -79,7 +64,7 @@ def test_hot_standby(neon_simple_env: NeonEnv): primary.safe_psql("create table t(key int, value text)") primary.safe_psql("insert into t select generate_series(1, 100000), 'payload'") - wait_caughtup(primary, secondary) + wait_replica_caughtup(primary, secondary) with secondary.connect() as s_con: with s_con.cursor() as s_cur: diff --git a/test_runner/regress/test_replication_start.py b/test_runner/regress/test_replication_start.py new file mode 100644 index 0000000000..b4699c7be8 --- /dev/null +++ b/test_runner/regress/test_replication_start.py @@ -0,0 +1,30 @@ +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, wait_replica_caughtup + + +def test_replication_start(neon_simple_env: NeonEnv): + env = neon_simple_env + + with env.endpoints.create_start(branch_name="main", endpoint_id="primary") as primary: + with primary.connect() as p_con: + with p_con.cursor() as p_cur: + p_cur.execute("begin") + p_cur.execute("create table t(pk integer primary key, payload integer)") + p_cur.execute("insert into t values (generate_series(1,100000), 0)") + p_cur.execute("select txid_current()") + xid = p_cur.fetchall()[0][0] + log.info(f"Master transaction {xid}") + with env.endpoints.new_replica_start( + origin=primary, endpoint_id="secondary" + ) as secondary: + wait_replica_caughtup(primary, secondary) + with secondary.connect() as s_con: + with s_con.cursor() as s_cur: + # Enforce setting hint bits for pg_class tuples. + # If master's transaction is not marked as in-progress in MVCC snapshot, + # then XMIN_INVALID hint bit will be set for table's 't' tuple makeing it invisible. 
+ s_cur.execute("select * from pg_class") + p_cur.execute("commit") + wait_replica_caughtup(primary, secondary) + s_cur.execute("select * from t where pk = 1") + assert s_cur.fetchone() == (1, 0) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 17101190de..4cdba8ec5a 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 17101190de8a54b95e0831c66c3da426ed33db34 +Subproject commit 4cdba8ec5a3868cec4826bbb3f16c1d3d2ac2283 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 0baccce15a..0ec04712d5 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 0baccce15a3b0446af5c403d2e869a04541b63c4 +Subproject commit 0ec04712d55539550278595e853c172f7aa5fe3e diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index dc40299045..cc98378b0f 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit dc40299045a377ec3b302c900134468a1b0f58ee +Subproject commit cc98378b0fa7413b78a197e3292a806865e4056a diff --git a/vendor/revisions.json b/vendor/revisions.json index d18f1588f5..540b7ec898 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,6 @@ { - "postgres-v16": "dc40299045a377ec3b302c900134468a1b0f58ee", - "postgres-v15": "0baccce15a3b0446af5c403d2e869a04541b63c4", - "postgres-v14": "17101190de8a54b95e0831c66c3da426ed33db34" + "postgres-v16": "cc98378b0fa7413b78a197e3292a806865e4056a", + "postgres-v15": "0ec04712d55539550278595e853c172f7aa5fe3e", + "postgres-v14": "4cdba8ec5a3868cec4826bbb3f16c1d3d2ac2283" } + From 94f6b488edd9d6042a5dd130347e765ab0fa1fb0 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 23 Feb 2024 15:12:09 +0100 Subject: [PATCH 0265/1571] CI(release-proxy): fix a couple missed release-proxy branch handling (#6892) ## Problem In the original PR[0], I've missed a couple of `release` occurrences that should also be handled for `release-proxy` branch - [0] https://github.com/neondatabase/neon/pull/6797 ## Summary of changes - Add handling for `release-proxy` branch to allure report - Add handling for `release-proxy` branch to e2e tests malts.com --- .github/actions/allure-report-generate/action.yml | 2 +- .github/actions/allure-report-store/action.yml | 2 +- .github/workflows/trigger-e2e-tests.yml | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index 9a0c79a221..1ecb5ecc7e 100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -39,7 +39,7 @@ runs: PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true) if [ "${PR_NUMBER}" != "null" ]; then BRANCH_OR_PR=pr-${PR_NUMBER} - elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ]; then + elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || [ "${GITHUB_REF_NAME}" = "release-proxy" ]; then # Shortcut for special branches BRANCH_OR_PR=${GITHUB_REF_NAME} else diff --git a/.github/actions/allure-report-store/action.yml b/.github/actions/allure-report-store/action.yml index 7ae9937d42..df4a6712ac 100644 --- a/.github/actions/allure-report-store/action.yml +++ b/.github/actions/allure-report-store/action.yml @@ -19,7 +19,7 @@ runs: PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true) if [ "${PR_NUMBER}" != "null" ]; then BRANCH_OR_PR=pr-${PR_NUMBER} - elif [ "${GITHUB_REF_NAME}" = "main" ] || [ 
"${GITHUB_REF_NAME}" = "release" ]; then + elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || [ "${GITHUB_REF_NAME}" = "release-proxy" ]; then # Shortcut for special branches BRANCH_OR_PR=${GITHUB_REF_NAME} else diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml index 7d04a8ec8a..ae34cbffe0 100644 --- a/.github/workflows/trigger-e2e-tests.yml +++ b/.github/workflows/trigger-e2e-tests.yml @@ -51,6 +51,8 @@ jobs: echo "tag=$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT elif [[ "$GITHUB_REF_NAME" == "release" ]]; then echo "tag=release-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT + elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then + echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT else echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" BUILD_AND_TEST_RUN_ID=$(gh run list -b $CURRENT_BRANCH -c $CURRENT_SHA -w 'Build and Test' -L 1 --json databaseId --jq '.[].databaseId') From ec3efc56a8a03a772bb59f5084179d65b8432b0b Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 23 Feb 2024 17:16:43 +0100 Subject: [PATCH 0266/1571] Revert "Revert "refactor(VirtualFile::crashsafe_overwrite): avoid Handle::block_on in callers"" (#6775) Reverts neondatabase/neon#6765 , bringing back #6731 We concluded that #6731 never was the root cause for the instability in staging. More details: https://neondb.slack.com/archives/C033RQ5SPDH/p1708011674755319 However, the massive amount of concurrent `spawn_blocking` calls from the `save_metadata` calls during startups might cause a performance regression. So, we'll merge this PR here after we've stopped writing the metadata #6769). --- libs/utils/src/crashsafe.rs | 44 +++++++++++- pageserver/src/deletion_queue.rs | 5 +- pageserver/src/tenant.rs | 33 +++------ pageserver/src/tenant/secondary/downloader.rs | 11 +-- pageserver/src/virtual_file.rs | 72 ++++++++----------- 5 files changed, 88 insertions(+), 77 deletions(-) diff --git a/libs/utils/src/crashsafe.rs b/libs/utils/src/crashsafe.rs index 1c72e9cae9..756b19138c 100644 --- a/libs/utils/src/crashsafe.rs +++ b/libs/utils/src/crashsafe.rs @@ -1,7 +1,7 @@ use std::{ borrow::Cow, fs::{self, File}, - io, + io::{self, Write}, }; use camino::{Utf8Path, Utf8PathBuf}; @@ -161,6 +161,48 @@ pub async fn durable_rename( Ok(()) } +/// Writes a file to the specified `final_path` in a crash safe fasion, using [`std::fs`]. +/// +/// The file is first written to the specified `tmp_path`, and in a second +/// step, the `tmp_path` is renamed to the `final_path`. Intermediary fsync +/// and atomic rename guarantee that, if we crash at any point, there will never +/// be a partially written file at `final_path` (but maybe at `tmp_path`). +/// +/// Callers are responsible for serializing calls of this function for a given `final_path`. +/// If they don't, there may be an error due to conflicting `tmp_path`, or there will +/// be no error and the content of `final_path` will be the "winner" caller's `content`. +/// I.e., the atomticity guarantees still hold. 
+pub fn overwrite( + final_path: &Utf8Path, + tmp_path: &Utf8Path, + content: &[u8], +) -> std::io::Result<()> { + let Some(final_path_parent) = final_path.parent() else { + return Err(std::io::Error::from_raw_os_error( + nix::errno::Errno::EINVAL as i32, + )); + }; + std::fs::remove_file(tmp_path).or_else(crate::fs_ext::ignore_not_found)?; + let mut file = std::fs::OpenOptions::new() + .write(true) + // Use `create_new` so that, if we race with ourselves or something else, + // we bail out instead of causing damage. + .create_new(true) + .open(tmp_path)?; + file.write_all(content)?; + file.sync_all()?; + drop(file); // don't keep the fd open for longer than we have to + + std::fs::rename(tmp_path, final_path)?; + + let final_parent_dirfd = std::fs::OpenOptions::new() + .read(true) + .open(final_path_parent)?; + + final_parent_dirfd.sync_all()?; + Ok(()) +} + #[cfg(test)] mod tests { diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 62ba702db7..ca9ae8f983 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -234,7 +234,7 @@ impl DeletionHeader { let header_bytes = serde_json::to_vec(self).context("serialize deletion header")?; let header_path = conf.deletion_header_path(); let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX); - VirtualFile::crashsafe_overwrite(&header_path, &temp_path, header_bytes) + VirtualFile::crashsafe_overwrite(header_path, temp_path, header_bytes) .await .maybe_fatal_err("save deletion header")?; @@ -325,7 +325,8 @@ impl DeletionList { let temp_path = path_with_suffix_extension(&path, TEMP_SUFFIX); let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list"); - VirtualFile::crashsafe_overwrite(&path, &temp_path, bytes) + + VirtualFile::crashsafe_overwrite(path, temp_path, bytes) .await .maybe_fatal_err("save deletion list") .map_err(Into::into) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 9fa087f0d9..6389d52014 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -29,7 +29,6 @@ use remote_storage::TimeoutOrCancel; use std::fmt; use storage_broker::BrokerClientChannel; use tokio::io::BufReader; -use tokio::runtime::Handle; use tokio::sync::watch; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; @@ -2609,17 +2608,10 @@ impl Tenant { let tenant_shard_id = *tenant_shard_id; let config_path = config_path.to_owned(); - tokio::task::spawn_blocking(move || { - Handle::current().block_on(async move { - let conf_content = conf_content.into_bytes(); - VirtualFile::crashsafe_overwrite(&config_path, &temp_path, conf_content) - .await - .with_context(|| { - format!("write tenant {tenant_shard_id} config to {config_path}") - }) - }) - }) - .await??; + let conf_content = conf_content.into_bytes(); + VirtualFile::crashsafe_overwrite(config_path.clone(), temp_path, conf_content) + .await + .with_context(|| format!("write tenant {tenant_shard_id} config to {config_path}"))?; Ok(()) } @@ -2646,17 +2638,12 @@ impl Tenant { let tenant_shard_id = *tenant_shard_id; let target_config_path = target_config_path.to_owned(); - tokio::task::spawn_blocking(move || { - Handle::current().block_on(async move { - let conf_content = conf_content.into_bytes(); - VirtualFile::crashsafe_overwrite(&target_config_path, &temp_path, conf_content) - .await - .with_context(|| { - format!("write tenant {tenant_shard_id} config to {target_config_path}") - }) - }) - }) - .await??; + let conf_content = conf_content.into_bytes(); + 
VirtualFile::crashsafe_overwrite(target_config_path.clone(), temp_path, conf_content) + .await + .with_context(|| { + format!("write tenant {tenant_shard_id} config to {target_config_path}") + })?; Ok(()) } diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index c8dc89cc6c..5c4e4fd160 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -491,14 +491,9 @@ impl<'a> TenantDownloader<'a> { let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX); let context_msg = format!("write tenant {tenant_shard_id} heatmap to {heatmap_path}"); let heatmap_path_bg = heatmap_path.clone(); - tokio::task::spawn_blocking(move || { - tokio::runtime::Handle::current().block_on(async move { - VirtualFile::crashsafe_overwrite(&heatmap_path_bg, &temp_path, heatmap_bytes).await - }) - }) - .await - .expect("Blocking task is never aborted") - .maybe_fatal_err(&context_msg)?; + VirtualFile::crashsafe_overwrite(heatmap_path_bg, temp_path, heatmap_bytes) + .await + .maybe_fatal_err(&context_msg)?; tracing::debug!("Wrote local heatmap to {}", heatmap_path); diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 45c3e19cfc..858fc0ef64 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -19,14 +19,13 @@ use once_cell::sync::OnceCell; use pageserver_api::shard::TenantShardId; use std::fs::{self, File}; use std::io::{Error, ErrorKind, Seek, SeekFrom}; -use tokio_epoll_uring::{BoundedBuf, IoBufMut, Slice}; +use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice}; use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd}; use std::os::unix::fs::FileExt; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; use tokio::time::Instant; -use utils::fs_ext; pub use pageserver_api::models::virtual_file as api; pub(crate) mod io_engine; @@ -404,47 +403,34 @@ impl VirtualFile { Ok(vfile) } - /// Writes a file to the specified `final_path` in a crash safe fasion + /// Async version of [`::utils::crashsafe::overwrite`]. /// - /// The file is first written to the specified tmp_path, and in a second - /// step, the tmp path is renamed to the final path. As renames are - /// atomic, a crash during the write operation will never leave behind a - /// partially written file. - pub async fn crashsafe_overwrite( - final_path: &Utf8Path, - tmp_path: &Utf8Path, + /// # NB: + /// + /// Doesn't actually use the [`VirtualFile`] file descriptor cache, but, + /// it did at an earlier time. + /// And it will use this module's [`io_engine`] in the near future, so, leaving it here. + pub async fn crashsafe_overwrite + Send, Buf: IoBuf + Send>( + final_path: Utf8PathBuf, + tmp_path: Utf8PathBuf, content: B, ) -> std::io::Result<()> { - let Some(final_path_parent) = final_path.parent() else { - return Err(std::io::Error::from_raw_os_error( - nix::errno::Errno::EINVAL as i32, - )); - }; - std::fs::remove_file(tmp_path).or_else(fs_ext::ignore_not_found)?; - let mut file = Self::open_with_options( - tmp_path, - OpenOptions::new() - .write(true) - // Use `create_new` so that, if we race with ourselves or something else, - // we bail out instead of causing damage. - .create_new(true), - ) - .await?; - let (_content, res) = file.write_all(content).await; - res?; - file.sync_all().await?; - drop(file); // before the rename, that's important! 
- // renames are atomic - std::fs::rename(tmp_path, final_path)?; - // Only open final path parent dirfd now, so that this operation only - // ever holds one VirtualFile fd at a time. That's important because - // the current `find_victim_slot` impl might pick the same slot for both - // VirtualFile., and it eventually does a blocking write lock instead of - // try_lock. - let final_parent_dirfd = - Self::open_with_options(final_path_parent, OpenOptions::new().read(true)).await?; - final_parent_dirfd.sync_all().await?; - Ok(()) + // TODO: use tokio_epoll_uring if configured as `io_engine`. + // See https://github.com/neondatabase/neon/issues/6663 + + tokio::task::spawn_blocking(move || { + let slice_storage; + let content_len = content.bytes_init(); + let content = if content.bytes_init() > 0 { + slice_storage = Some(content.slice(0..content_len)); + slice_storage.as_deref().expect("just set it to Some()") + } else { + &[] + }; + utils::crashsafe::overwrite(&final_path, &tmp_path, content) + }) + .await + .expect("blocking task is never aborted") } /// Call File::sync_all() on the underlying File. @@ -1337,7 +1323,7 @@ mod tests { let path = testdir.join("myfile"); let tmp_path = testdir.join("myfile.tmp"); - VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo".to_vec()) + VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec()) .await .unwrap(); let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap()); @@ -1346,7 +1332,7 @@ mod tests { assert!(!tmp_path.exists()); drop(file); - VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"bar".to_vec()) + VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"bar".to_vec()) .await .unwrap(); let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap()); @@ -1368,7 +1354,7 @@ mod tests { std::fs::write(&tmp_path, "some preexisting junk that should be removed").unwrap(); assert!(tmp_path.exists()); - VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo".to_vec()) + VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec()) .await .unwrap(); From b8f9e3a9ebb1f6008569e51a84669091851973e6 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Sat, 24 Feb 2024 23:32:41 +0200 Subject: [PATCH 0267/1571] fix(flaky): typo Stopping/Stopped (#6894) introduced in 8dee9908f83fdebea1dfd36304272bdbe684ad5c, should help with the #6681 common problem which is just a mismatched allowed error. --- pageserver/src/tenant/upload_queue.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index 0b61bc0a10..a5516bb9a9 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -130,7 +130,7 @@ pub(super) struct UploadQueueStopped { pub(crate) enum NotInitialized { #[error("queue is in state Uninitialized")] Uninitialized, - #[error("queue is in state Stopping")] + #[error("queue is in state Stopped")] Stopped, #[error("queue is shutting down")] ShuttingDown, From 8283779ee84d351f520c4f95327e646e2db0f7d7 Mon Sep 17 00:00:00 2001 From: John Spray Date: Sun, 25 Feb 2024 14:53:17 +0000 Subject: [PATCH 0268/1571] pageserver: remove legacy attach/detach APIs from swagger (#6883) ## Problem Since the location config API was added, the attach and detach endpoints are deprecated. 
Hiding them from consumers of the swagger definition is a precursor to removing them Neon's cloud no longer uses this api since https://github.com/neondatabase/cloud/pull/10538 Fully removing the APIs will implicitly make use of generation numbers mandatory, and should happen alongside https://github.com/neondatabase/neon/issues/5388, which will happen once we're happy that the storage controller is ready for prime time. ## Summary of changes - Remove /attach and /detach from pageserver's swagger file --- pageserver/src/http/openapi_spec.yml | 178 --------------------------- 1 file changed, 178 deletions(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 479c7ca0f5..5afb3ba63d 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -567,114 +567,6 @@ paths: application/json: schema: $ref: "#/components/schemas/ServiceUnavailableError" - - /v1/tenant/{tenant_id}/attach: - parameters: - - name: tenant_id - in: path - required: true - schema: - type: string - post: - description: | - Schedules attach operation to happen in the background for the given tenant. - As soon as the caller sends this request, it must assume the pageserver - starts writing to the tenant's S3 state unless it receives one of the - distinguished errors below that state otherwise. - - If a client receives a not-distinguished response, e.g., a network timeout, - it MUST retry the /attach request and poll again for the tenant's - attachment status. - - After the client has received a 202, it MUST poll the tenant's - attachment status (field `attachment_status`) to reach state `attached`. - If the `attachment_status` is missing, the client MUST retry the `/attach` - request (goto previous paragraph). This is a robustness measure in case the tenant - status endpoint is buggy, but the attach operation is ongoing. - - There is no way to cancel an in-flight request. - - In any case, the client - * MUST NOT ASSUME that the /attach request has been lost in the network, - * MUST NOT ASSUME that the request has been lost, based on the observation - that a subsequent tenant status request returns 404. The request may - still be in flight. It must be retried. - - The client SHOULD supply a `TenantConfig` for the tenant in the request body. - Settings specified in the config override the pageserver's defaults. - It is guaranteed that the config settings are applied before the pageserver - starts operating on the tenant. E.g., if the config specifies a specific - PITR interval for a tenant, then that setting will be in effect before the - pageserver starts the garbage collection loop. This enables a client to - guarantee a specific PITR setting across detach/attach cycles. - The pageserver will reject the request if it cannot parse the config, or - if there are any unknown fields in it. - - If the client does not supply a config, the pageserver will use its defaults. 
- This behavior is deprecated: https://github.com/neondatabase/neon/issues/4282 - requestBody: - required: false - content: - application/json: - schema: - $ref: "#/components/schemas/TenantAttachRequest" - responses: - "202": - description: Tenant attaching scheduled - "400": - description: Bad Request - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "404": - description: Timeline not found - content: - application/json: - schema: - $ref: "#/components/schemas/NotFoundError" - "409": - description: | - The tenant is already known to Pageserver in some way, - and hence this `/attach` call has been rejected. - - Some examples of how this can happen: - - tenant was created on this pageserver - - tenant attachment was started by an earlier call to `/attach`. - - Callers should poll the tenant status's `attachment_status` field, - like for status 202. See the longer description for `POST /attach` - for details. - content: - application/json: - schema: - $ref: "#/components/schemas/ConflictError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" - - /v1/tenant/{tenant_id}/location_config: parameters: - name: tenant_id @@ -770,66 +662,6 @@ paths: application/json: schema: $ref: "#/components/schemas/Error" - - /v1/tenant/{tenant_id}/detach: - parameters: - - name: tenant_id - in: path - required: true - schema: - type: string - - name: detach_ignored - in: query - required: false - schema: - type: boolean - description: | - When true, allow to detach a tenant which state is ignored. - post: - description: | - Remove tenant data (including all corresponding timelines) from pageserver's memory and file system. - Files on the remote storage are not affected. - responses: - "200": - description: Tenant detached - "400": - description: Error when no tenant id found in path parameters - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "404": - description: Tenant not found - content: - application/json: - schema: - $ref: "#/components/schemas/NotFoundError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" - - /v1/tenant/{tenant_id}/ignore: parameters: - name: tenant_id @@ -1464,16 +1296,6 @@ components: generation: type: integer description: Attachment generation number. - TenantAttachRequest: - type: object - required: - - config - properties: - config: - $ref: '#/components/schemas/TenantConfig' - generation: - type: integer - description: Attachment generation number. 
TenantConfigRequest: allOf: - $ref: '#/components/schemas/TenantConfig' From dedf66ba5b348951fdf6cdb5c93b0934415f07db Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 26 Feb 2024 10:05:24 +0100 Subject: [PATCH 0269/1571] remove `gc_feedback` mechanism (#6863) It's been dead-code-at-runtime for 9 months, let's remove it. We can always re-introduce it at a later point. Came across this while working on #6861, which will touch `time_for_new_image_layer`. This is an opporunity to make that function simpler. --- control_plane/src/pageserver.rs | 10 --- libs/pageserver_api/src/models.rs | 1 - pageserver/src/config.rs | 1 - pageserver/src/tenant.rs | 1 - pageserver/src/tenant/config.rs | 8 --- pageserver/src/tenant/timeline.rs | 66 +------------------ test_runner/performance/test_gc_feedback.py | 5 ++ .../regress/test_attach_tenant_config.py | 1 - 8 files changed, 6 insertions(+), 87 deletions(-) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 5909477586..a52fcb4a3f 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -391,11 +391,6 @@ impl PageServerNode { evictions_low_residence_duration_metric_threshold: settings .remove("evictions_low_residence_duration_metric_threshold") .map(|x| x.to_string()), - gc_feedback: settings - .remove("gc_feedback") - .map(|x| x.parse::()) - .transpose() - .context("Failed to parse 'gc_feedback' as bool")?, heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()), lazy_slru_download: settings .remove("lazy_slru_download") @@ -501,11 +496,6 @@ impl PageServerNode { evictions_low_residence_duration_metric_threshold: settings .remove("evictions_low_residence_duration_metric_threshold") .map(|x| x.to_string()), - gc_feedback: settings - .remove("gc_feedback") - .map(|x| x.parse::()) - .transpose() - .context("Failed to parse 'gc_feedback' as bool")?, heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()), lazy_slru_download: settings .remove("lazy_slru_download") diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index aa1a8ae487..ce9afd65ac 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -283,7 +283,6 @@ pub struct TenantConfig { pub eviction_policy: Option, pub min_resident_size_override: Option, pub evictions_low_residence_duration_metric_threshold: Option, - pub gc_feedback: Option, pub heatmap_period: Option, pub lazy_slru_download: Option, pub timeline_get_throttle: Option, diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 3b7672fa26..b0d828d066 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -140,7 +140,6 @@ pub mod defaults { #min_resident_size_override = .. 
# in bytes #evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}' -#gc_feedback = false #heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY} #secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY} diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 6389d52014..c97f24c0fc 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3641,7 +3641,6 @@ pub(crate) mod harness { evictions_low_residence_duration_metric_threshold: Some( tenant_conf.evictions_low_residence_duration_metric_threshold, ), - gc_feedback: Some(tenant_conf.gc_feedback), heatmap_period: Some(tenant_conf.heatmap_period), lazy_slru_download: Some(tenant_conf.lazy_slru_download), timeline_get_throttle: Some(tenant_conf.timeline_get_throttle), diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 5c88d30caf..cce30e900e 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -339,7 +339,6 @@ pub struct TenantConf { // See the corresponding metric's help string. #[serde(with = "humantime_serde")] pub evictions_low_residence_duration_metric_threshold: Duration, - pub gc_feedback: bool, /// If non-zero, the period between uploads of a heatmap from attached tenants. This /// may be disabled if a Tenant will not have secondary locations: only secondary @@ -427,10 +426,6 @@ pub struct TenantConfOpt { #[serde(default)] pub evictions_low_residence_duration_metric_threshold: Option, - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(default)] - pub gc_feedback: Option, - #[serde(skip_serializing_if = "Option::is_none")] #[serde(with = "humantime_serde")] #[serde(default)] @@ -485,7 +480,6 @@ impl TenantConfOpt { evictions_low_residence_duration_metric_threshold: self .evictions_low_residence_duration_metric_threshold .unwrap_or(global_conf.evictions_low_residence_duration_metric_threshold), - gc_feedback: self.gc_feedback.unwrap_or(global_conf.gc_feedback), heatmap_period: self.heatmap_period.unwrap_or(global_conf.heatmap_period), lazy_slru_download: self .lazy_slru_download @@ -530,7 +524,6 @@ impl Default for TenantConf { DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD, ) .expect("cannot parse default evictions_low_residence_duration_metric_threshold"), - gc_feedback: false, heatmap_period: Duration::ZERO, lazy_slru_download: false, timeline_get_throttle: crate::tenant::throttle::Config::disabled(), @@ -603,7 +596,6 @@ impl From for models::TenantConfig { evictions_low_residence_duration_metric_threshold: value .evictions_low_residence_duration_metric_threshold .map(humantime), - gc_feedback: value.gc_feedback, heatmap_period: value.heatmap_period.map(humantime), lazy_slru_download: value.lazy_slru_download, timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from), diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 2c2351d531..0586ec38c8 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -76,7 +76,7 @@ use crate::{ use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind}; use crate::config::PageServerConf; -use crate::keyspace::{KeyPartitioning, KeySpace, KeySpaceRandomAccum}; +use crate::keyspace::{KeyPartitioning, KeySpace}; use crate::metrics::{ TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT, }; @@ -210,17 +210,6 @@ pub struct Timeline { /// so that e.g. 
on-demand-download/eviction, and layer spreading, can operate just on `LayerFileManager`. pub(crate) layers: Arc>, - /// Set of key ranges which should be covered by image layers to - /// allow GC to remove old layers. This set is created by GC and its cutoff LSN is also stored. - /// It is used by compaction task when it checks if new image layer should be created. - /// Newly created image layer doesn't help to remove the delta layer, until the - /// newly created image layer falls off the PITR horizon. So on next GC cycle, - /// gc_timeline may still want the new image layer to be created. To avoid redundant - /// image layers creation we should check if image layer exists but beyond PITR horizon. - /// This is why we need remember GC cutoff LSN. - /// - wanted_image_layers: Mutex>, - last_freeze_at: AtomicLsn, // Atomic would be more appropriate here. last_freeze_ts: RwLock, @@ -1516,13 +1505,6 @@ impl Timeline { .unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold) } - fn get_gc_feedback(&self) -> bool { - let tenant_conf = &self.tenant_conf.read().unwrap().tenant_conf.clone(); - tenant_conf - .gc_feedback - .unwrap_or(self.conf.default_tenant_conf.gc_feedback) - } - pub(super) fn tenant_conf_updated(&self) { // NB: Most tenant conf options are read by background loops, so, // changes will automatically be picked up. @@ -1596,7 +1578,6 @@ impl Timeline { shard_identity, pg_version, layers: Default::default(), - wanted_image_layers: Mutex::new(None), walredo_mgr, walreceiver: Mutex::new(None), @@ -3408,31 +3389,6 @@ impl Timeline { let layers = guard.layer_map(); let mut max_deltas = 0; - { - let wanted_image_layers = self.wanted_image_layers.lock().unwrap(); - if let Some((cutoff_lsn, wanted)) = &*wanted_image_layers { - let img_range = - partition.ranges.first().unwrap().start..partition.ranges.last().unwrap().end; - if wanted.overlaps(&img_range) { - // - // gc_timeline only pays attention to image layers that are older than the GC cutoff, - // but create_image_layers creates image layers at last-record-lsn. - // So it's possible that gc_timeline wants a new image layer to be created for a key range, - // but the range is already covered by image layers at more recent LSNs. Before we - // create a new image layer, check if the range is already covered at more recent LSNs. - if !layers - .image_layer_exists(&img_range, &(Lsn::min(lsn, *cutoff_lsn)..lsn + 1)) - { - debug!( - "Force generation of layer {}-{} wanted by GC, cutoff={}, lsn={})", - img_range.start, img_range.end, cutoff_lsn, lsn - ); - return true; - } - } - } - } - for part_range in &partition.ranges { let image_coverage = layers.image_coverage(part_range, lsn); for (img_range, last_img) in image_coverage { @@ -3603,12 +3559,6 @@ impl Timeline { tracing::debug!("no data in range {}-{}", img_range.start, img_range.end); } } - // All layers that the GC wanted us to create have now been created. - // - // It's possible that another GC cycle happened while we were compacting, and added - // something new to wanted_image_layers, and we now clear that before processing it. - // That's OK, because the next GC iteration will put it back in. 
- *self.wanted_image_layers.lock().unwrap() = None; // Sync the new layer to disk before adding it to the layer map, to make sure // we don't garbage collect something based on the new layer, before it has @@ -4518,7 +4468,6 @@ impl Timeline { debug!("retain_lsns: {:?}", retain_lsns); let mut layers_to_remove = Vec::new(); - let mut wanted_image_layers = KeySpaceRandomAccum::default(); // Scan all layers in the timeline (remote or on-disk). // @@ -4600,15 +4549,6 @@ impl Timeline { .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff)) { debug!("keeping {} because it is the latest layer", l.filename()); - // Collect delta key ranges that need image layers to allow garbage - // collecting the layers. - // It is not so obvious whether we need to propagate information only about - // delta layers. Image layers can form "stairs" preventing old image from been deleted. - // But image layers are in any case less sparse than delta layers. Also we need some - // protection from replacing recent image layers with new one after each GC iteration. - if self.get_gc_feedback() && l.is_incremental() && !LayerMap::is_l0(&l) { - wanted_image_layers.add_range(l.get_key_range()); - } result.layers_not_updated += 1; continue 'outer; } @@ -4621,10 +4561,6 @@ impl Timeline { ); layers_to_remove.push(l); } - self.wanted_image_layers - .lock() - .unwrap() - .replace((new_gc_cutoff, wanted_image_layers.to_keyspace())); if !layers_to_remove.is_empty() { // Persist the new GC cutoff value before we actually remove anything. diff --git a/test_runner/performance/test_gc_feedback.py b/test_runner/performance/test_gc_feedback.py index cf9e4808fc..48dd84fb06 100644 --- a/test_runner/performance/test_gc_feedback.py +++ b/test_runner/performance/test_gc_feedback.py @@ -13,6 +13,11 @@ def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma Information about image layers needed to collect old layers should be propagated by GC to compaction task which should take in in account when make a decision which new image layers needs to be created. + + NB: this test demonstrates the problem. The source tree contained the + `gc_feedback` mechanism for about 9 months, but, there were problems + with it and it wasn't enabled at runtime. 
+ This PR removed the code: https://github.com/neondatabase/neon/pull/6863 """ env = neon_env_builder.init_start() client = env.pageserver.http_client() diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 1aaded222c..43e035d303 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -166,7 +166,6 @@ def test_fully_custom_config(positive_env: NeonEnv): "threshold": "23h", }, "evictions_low_residence_duration_metric_threshold": "2days", - "gc_feedback": True, "gc_horizon": 23 * (1024 * 1024), "gc_period": "2h 13m", "heatmap_period": "10m", From 5273c94c59c751cec058a10934a6da94379ba805 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 26 Feb 2024 10:19:24 +0100 Subject: [PATCH 0270/1571] pageserver: remove two obsolete/unused per-timeline metrics (#6893) over-compensating the addition of a new per-timeline metric in https://github.com/neondatabase/neon/pull/6834 part of https://github.com/neondatabase/neon/issues/6737 --- pageserver/src/metrics.rs | 35 --------------------------------- test_runner/fixtures/metrics.py | 2 -- 2 files changed, 37 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index ee0bd268cc..1749e02c7f 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -642,26 +642,6 @@ pub(crate) static TENANT_SYNTHETIC_SIZE_METRIC: Lazy = Lazy::new(| .expect("Failed to register pageserver_tenant_synthetic_cached_size_bytes metric") }); -// Metrics for cloud upload. These metrics reflect data uploaded to cloud storage, -// or in testing they estimate how much we would upload if we did. -static NUM_PERSISTENT_FILES_CREATED: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "pageserver_created_persistent_files_total", - "Number of files created that are meant to be uploaded to cloud storage", - &["tenant_id", "shard_id", "timeline_id"] - ) - .expect("failed to define a metric") -}); - -static PERSISTENT_BYTES_WRITTEN: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "pageserver_written_persistent_bytes_total", - "Total bytes written that are meant to be uploaded to cloud storage", - &["tenant_id", "shard_id", "timeline_id"] - ) - .expect("failed to define a metric") -}); - pub(crate) static EVICTION_ITERATION_DURATION: Lazy = Lazy::new(|| { register_histogram_vec!( "pageserver_eviction_iteration_duration_seconds_global", @@ -1802,8 +1782,6 @@ pub(crate) struct TimelineMetrics { /// copy of LayeredTimeline.current_logical_size pub current_logical_size_gauge: UIntGauge, pub directory_entries_count_gauge: Lazy UIntGauge>>, - pub num_persistent_files_created: IntCounter, - pub persistent_bytes_written: IntCounter, pub evictions: IntCounter, pub evictions_with_low_residence_duration: std::sync::RwLock, } @@ -1885,12 +1863,6 @@ impl TimelineMetrics { }; let directory_entries_count_gauge: Lazy UIntGauge>> = Lazy::new(Box::new(directory_entries_count_gauge_closure)); - let num_persistent_files_created = NUM_PERSISTENT_FILES_CREATED - .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) - .unwrap(); - let persistent_bytes_written = PERSISTENT_BYTES_WRITTEN - .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) - .unwrap(); let evictions = EVICTIONS .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); @@ -1912,8 +1884,6 @@ impl TimelineMetrics { resident_physical_size_gauge, current_logical_size_gauge, directory_entries_count_gauge, - 
num_persistent_files_created, - persistent_bytes_written, evictions, evictions_with_low_residence_duration: std::sync::RwLock::new( evictions_with_low_residence_duration, @@ -1923,8 +1893,6 @@ impl TimelineMetrics { pub(crate) fn record_new_file_metrics(&self, sz: u64) { self.resident_physical_size_add(sz); - self.num_persistent_files_created.inc_by(1); - self.persistent_bytes_written.inc_by(sz); } pub(crate) fn resident_physical_size_sub(&self, sz: u64) { @@ -1957,9 +1925,6 @@ impl Drop for TimelineMetrics { if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) { let _ = metric.remove_label_values(&[tenant_id, &shard_id, timeline_id]); } - let _ = - NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, &shard_id, timeline_id]); - let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, &shard_id, timeline_id]); let _ = EVICTIONS.remove_label_values(&[tenant_id, &shard_id, timeline_id]); self.evictions_with_low_residence_duration diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index fd4618ca6a..c615dd154f 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -147,8 +147,6 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( "pageserver_smgr_query_seconds_sum", "pageserver_storage_operations_seconds_count_total", "pageserver_storage_operations_seconds_sum_total", - "pageserver_created_persistent_files_total", - "pageserver_written_persistent_bytes_total", "pageserver_evictions_total", "pageserver_evictions_with_low_residence_duration_total", *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, From ceedc3ef736dfb6ee77f0bc7e3b4a82bf7dcb19a Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 26 Feb 2024 11:22:15 +0100 Subject: [PATCH 0271/1571] Timeline::repartition: enforce no concurrent callers & lsn to not move backwards (#6862) This PR enforces aspects of `Timeline::repartition` that were already true at runtime: - it's not called concurrently, so, bail out if it is anyway (see comment why it's not called concurrently) - the `lsn` should never be moving backwards over the lifetime of a Timeline object, because last_record_lsn() can only move forwards over the lifetime of a Timeline object The switch to tokio::sync::Mutex blows up the size of the `partitioning` field from 40 bytes to 72 bytes on Linux x86_64. That would be concerning if it was a hot field, but, `partitioning` is only accessed every 20s by one task, so, there won't be excessive cache pain on it. (It still sucks that it's now >1 cache line, but I need the Send-able MutexGuard in the next PR) part of https://github.com/neondatabase/neon/issues/6861 --- pageserver/src/tenant/timeline.rs | 48 +++++++++++++++++-------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 0586ec38c8..f09617849c 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -292,7 +292,7 @@ pub struct Timeline { pub initdb_lsn: Lsn, /// When did we last calculate the partitioning? - partitioning: Mutex<(KeyPartitioning, Lsn)>, + partitioning: tokio::sync::Mutex<(KeyPartitioning, Lsn)>, /// Configuration: how often should the partitioning be recalculated. repartition_threshold: u64, @@ -1640,7 +1640,7 @@ impl Timeline { // initial logical size is 0. 
LogicalSize::empty_initial() }, - partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))), + partitioning: tokio::sync::Mutex::new((KeyPartitioning::new(), Lsn(0))), repartition_threshold: 0, last_received_wal: Mutex::new(None), @@ -3354,30 +3354,34 @@ impl Timeline { flags: EnumSet, ctx: &RequestContext, ) -> anyhow::Result<(KeyPartitioning, Lsn)> { - { - let partitioning_guard = self.partitioning.lock().unwrap(); - let distance = lsn.0 - partitioning_guard.1 .0; - if partitioning_guard.1 != Lsn(0) - && distance <= self.repartition_threshold - && !flags.contains(CompactFlags::ForceRepartition) - { - debug!( - distance, - threshold = self.repartition_threshold, - "no repartitioning needed" - ); - return Ok((partitioning_guard.0.clone(), partitioning_guard.1)); - } + let Ok(mut partitioning_guard) = self.partitioning.try_lock() else { + // NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline. + // The other is the initdb optimization in flush_frozen_layer, used by `boostrap_timeline`, which runs before `.activate()` + // and hence before the compaction task starts. + anyhow::bail!("repartition() called concurrently, this should not happen"); + }; + if lsn < partitioning_guard.1 { + anyhow::bail!("repartition() called with LSN going backwards, this should not happen"); } + + let distance = lsn.0 - partitioning_guard.1 .0; + if partitioning_guard.1 != Lsn(0) + && distance <= self.repartition_threshold + && !flags.contains(CompactFlags::ForceRepartition) + { + debug!( + distance, + threshold = self.repartition_threshold, + "no repartitioning needed" + ); + return Ok((partitioning_guard.0.clone(), partitioning_guard.1)); + } + let keyspace = self.collect_keyspace(lsn, ctx).await?; let partitioning = keyspace.partition(partition_size); - let mut partitioning_guard = self.partitioning.lock().unwrap(); - if lsn > partitioning_guard.1 { - *partitioning_guard = (partitioning, lsn); - } else { - warn!("Concurrent repartitioning of keyspace. This unexpected, but probably harmless"); - } + *partitioning_guard = (partitioning, lsn); + Ok((partitioning_guard.0.clone(), partitioning_guard.1)) } From 256058f2abb044e4deacd71c8743cd14203fdd43 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 26 Feb 2024 10:24:58 +0000 Subject: [PATCH 0272/1571] pageserver: only write out legacy tenant config if no generation (#6891) ## Problem Previously we always wrote out both legacy and modern tenant config files. The legacy write enabled rollbacks, but we are long past the point where that is needed. We still need the legacy format for situations where someone is running tenants without generations (that will be yanked as well eventually), but we can avoid writing it out at all if we do have a generation number set. We implicitly also avoid writing the legacy config if our mode is Secondary (secondary mode is newer than generations). ## Summary of changes - Make writing legacy tenant config conditional on there being no generation number set. 
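For readers skimming the diff, the resulting decision can be summarized as follows. This is a deliberately simplified, illustrative sketch — the enum and function names below are made up, and the real logic lives in `Tenant::persist_tenant_config` in the diff that follows:

```rust
enum LocationMode {
    Attached { generation: Option<u32> },
    Secondary,
}

/// Which config file(s) get written for a tenant location after this change.
/// Returns (write_legacy_tenant_conf, write_modern_location_conf).
fn config_files_to_write(mode: &LocationMode) -> (bool, bool) {
    match mode {
        // Attached without a generation number: legacy-style config only,
        // so pageservers running without generation infrastructure keep working.
        LocationMode::Attached { generation: None } => (true, false),
        // Generation set, or Secondary mode: modern LocationConf only.
        _ => (false, true),
    }
}

fn main() {
    assert_eq!(config_files_to_write(&LocationMode::Attached { generation: None }), (true, false));
    assert_eq!(config_files_to_write(&LocationMode::Attached { generation: Some(1) }), (false, true));
    assert_eq!(config_files_to_write(&LocationMode::Secondary), (false, true));
}
```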
--- pageserver/src/tenant.rs | 27 +++++++++++++++---------- test_runner/fixtures/neon_fixtures.py | 2 +- test_runner/regress/test_tenant_conf.py | 3 +-- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index c97f24c0fc..2362f19068 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2573,19 +2573,24 @@ impl Tenant { legacy_config_path: &Utf8Path, location_conf: &LocationConf, ) -> anyhow::Result<()> { - // Forward compat: write out an old-style configuration that old versions can read, in case we roll back - Self::persist_tenant_config_legacy( - tenant_shard_id, - legacy_config_path, - &location_conf.tenant_conf, - ) - .await?; - if let LocationMode::Attached(attach_conf) = &location_conf.mode { - // Once we use LocationMode, generations are mandatory. If we aren't using generations, - // then drop out after writing legacy-style config. + // The modern-style LocationConf config file requires a generation to be set. In case someone + // is running a pageserver without the infrastructure to set generations, write out the legacy-style + // config file that only contains TenantConf. + // + // This will eventually be removed in https://github.com/neondatabase/neon/issues/5388 + if attach_conf.generation.is_none() { - tracing::debug!("Running without generations, not writing new-style LocationConf"); + tracing::info!( + "Running without generations, writing legacy-style tenant config file" + ); + Self::persist_tenant_config_legacy( + tenant_shard_id, + legacy_config_path, + &location_conf.tenant_conf, + ) + .await?; + return Ok(()); } } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 441b64ebfc..6cb7656660 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3812,7 +3812,7 @@ def pytest_addoption(parser: Parser): SMALL_DB_FILE_NAME_REGEX: re.Pattern = re.compile( # type: ignore[type-arg] - r"config|config-v1|heatmap-v1|metadata|.+\.(?:toml|pid|json|sql|conf)" + r"config-v1|heatmap-v1|metadata|.+\.(?:toml|pid|json|sql|conf)" ) diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index 2ed22cabc4..a2ffd200a6 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -299,8 +299,7 @@ def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder): # tenant is created with defaults, as in without config file (tenant_id, timeline_id) = env.neon_cli.create_tenant() - config_path = env.pageserver.tenant_dir(tenant_id) / "config" - assert config_path.exists(), "config file is always initially created" + config_path = env.pageserver.tenant_dir(tenant_id) / "config-v1" http_client = env.pageserver.http_client() From 51a43b121c0409ab49f443c1a0f93645199a50bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 26 Feb 2024 13:21:40 +0100 Subject: [PATCH 0273/1571] Fix test_remote_storage_upload_queue_retries flakiness (#6898) * decreases checkpointing and compaction targets for even more layer files * write 10 thousand rows 2 times instead of writing 20 thousand rows 1 time so that there is more to GC. Before it was noisily jumping between 1 and 0 layer files, now it's jumping between 19 and 20 layer files. The 0 caused an assertion error that gave the test most of its flakiness. 
* larger timeout for the churn while failpoints are active thread: this is mostly so that the test is more robust on systems with more load Fixes #3051 --- test_runner/regress/test_remote_storage.py | 37 ++++++++++++---------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 95f912ccc5..176a5e57dc 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -228,9 +228,9 @@ def test_remote_storage_upload_queue_retries( tenant_id, timeline_id = env.neon_cli.create_tenant( conf={ # small checkpointing and compaction targets to ensure we generate many upload operations - "checkpoint_distance": f"{128 * 1024}", + "checkpoint_distance": f"{64 * 1024}", "compaction_threshold": "1", - "compaction_target_size": f"{128 * 1024}", + "compaction_target_size": f"{64 * 1024}", # no PITR horizon, we specify the horizon when we request on-demand GC "pitr_interval": "0s", # disable background compaction and GC. We invoke it manually when we want it to happen. @@ -256,21 +256,24 @@ def test_remote_storage_upload_queue_retries( ] ) + FOO_ROWS_COUNT = 4000 + def overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data): # create initial set of layers & upload them with failpoints configured - endpoint.safe_psql_many( - [ - f""" - INSERT INTO foo (id, val) - SELECT g, '{data}' - FROM generate_series(1, 20000) g - ON CONFLICT (id) DO UPDATE - SET val = EXCLUDED.val - """, - # to ensure that GC can actually remove some layers - "VACUUM foo", - ] - ) + for _v in range(2): + endpoint.safe_psql_many( + [ + f""" + INSERT INTO foo (id, val) + SELECT g, '{data}' + FROM generate_series(1, {FOO_ROWS_COUNT}) g + ON CONFLICT (id) DO UPDATE + SET val = EXCLUDED.val + """, + # to ensure that GC can actually remove some layers + "VACUUM foo", + ] + ) wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) def get_queued_count(file_kind, op_kind): @@ -333,7 +336,7 @@ def test_remote_storage_upload_queue_retries( # The churn thread doesn't make progress once it blocks on the first wait_completion() call, # so, give it some time to wrap up. - churn_while_failpoints_active_thread.join(30) + churn_while_failpoints_active_thread.join(60) assert not churn_while_failpoints_active_thread.is_alive() assert churn_thread_result[0] @@ -365,7 +368,7 @@ def test_remote_storage_upload_queue_retries( log.info("restarting postgres to validate") endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) with endpoint.cursor() as cur: - assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == 20000 + assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == FOO_ROWS_COUNT def test_remote_timeline_client_calls_started_metric( From 459c2af8c1884f58e58f3e7eece7bd01b5b07779 Mon Sep 17 00:00:00 2001 From: Roman Zaynetdinov Date: Mon, 26 Feb 2024 17:36:11 +0200 Subject: [PATCH 0274/1571] Expose LFC cache size limit from sql_exporter (#6912) ## Problem We want to report how much cache was used and what the limit was. ## Summary of changes Added one more query to sql_exporter to expose `neon.file_cache_size_limit`. 
--- vm-image-spec.yaml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 16ceb06617..5723b634d6 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -102,7 +102,7 @@ files: - metric_name: lfc_used type: gauge - help: 'lfc_used' + help: 'LFC chunks used (chunk = 1MB)' key_labels: values: [lfc_used] query: | @@ -124,6 +124,14 @@ files: query: | select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes'; + - metric_name: lfc_cache_size_limit + type: gauge + help: 'LFC cache size limit in bytes' + key_labels: + values: [lfc_cache_size_limit] + query: | + select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit; + build: | # Build cgroup-tools # From 75baf83fce79ae7415b2525d285391970cd9b3cf Mon Sep 17 00:00:00 2001 From: Bodobolero Date: Mon, 26 Feb 2024 17:06:00 +0100 Subject: [PATCH 0275/1571] externalize statistics on LFC cache usage (#6906) ## Problem Customers should be able to determine the size of their workload's working set to right size their compute. Since Neon uses Local file cache (LFC) instead of shared buffers on bigger compute nodes to cache pages we need to externalize a means to determine LFC hit ratio in addition to shared buffer hit ratio. Currently the following end user documentation https://github.com/neondatabase/website/blob/fb7cd3af0e90b74bad8c2ef1166e7798bfdefe20/content/docs/manage/endpoints.md?plain=1#L137 is wrong because it describes how to right size a compute node based on shared buffer hit ratio. Note that the existing functionality in extension "neon" is NOT available to end users but only to superuser / cloud_admin. ## Summary of changes - externalize functions and views in neon extension to end users - introduce a new view `NEON_STAT_FILE_CACHE` with the following DDL ```sql CREATE OR REPLACE VIEW NEON_STAT_FILE_CACHE AS WITH lfc_stats AS ( SELECT stat_name, count FROM neon_get_lfc_stats() AS t(stat_name text, count bigint) ), lfc_values AS ( SELECT MAX(CASE WHEN stat_name = 'file_cache_misses' THEN count ELSE NULL END) AS file_cache_misses, MAX(CASE WHEN stat_name = 'file_cache_hits' THEN count ELSE NULL END) AS file_cache_hits, MAX(CASE WHEN stat_name = 'file_cache_used' THEN count ELSE NULL END) AS file_cache_used, MAX(CASE WHEN stat_name = 'file_cache_writes' THEN count ELSE NULL END) AS file_cache_writes, -- Calculate the file_cache_hit_ratio within the same CTE for simplicity CASE WHEN MAX(CASE WHEN stat_name = 'file_cache_misses' THEN count ELSE 0 END) + MAX(CASE WHEN stat_name = 'file_cache_hits' THEN count ELSE 0 END) = 0 THEN NULL ELSE ROUND((MAX(CASE WHEN stat_name = 'file_cache_hits' THEN count ELSE 0 END)::DECIMAL / (MAX(CASE WHEN stat_name = 'file_cache_hits' THEN count ELSE 0 END) + MAX(CASE WHEN stat_name = 'file_cache_misses' THEN count ELSE 0 END))) * 100, 2) END AS file_cache_hit_ratio FROM lfc_stats ) SELECT file_cache_misses, file_cache_hits, file_cache_used, file_cache_writes, file_cache_hit_ratio from lfc_values; ``` This view can be used by an end user as follows: ```sql CREATE EXTENSION NEON; SELECT * from neon. 
NEON_STAT_FILE_CACHE" ``` The output looks like the following: ``` select * from NEON_STAT_FILE_CACHE; file_cache_misses | file_cache_hits | file_cache_used | file_cache_writes | file_cache_hit_ratio -------------------+-----------------+-----------------+-------------------+---------------------- 2133643 | 108999742 | 607 | 10767410 | 98.08 (1 row) ``` ## Checklist before requesting a review - [x ] I have performed a self-review of my code. - [x ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [x ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --- pgxn/neon/Makefile | 2 +- pgxn/neon/neon--1.1--1.2.sql | 29 ++++++++++++++++++++++ pgxn/neon/neon.control | 3 ++- test_runner/regress/test_neon_extension.py | 8 +++++- 4 files changed, 39 insertions(+), 3 deletions(-) create mode 100644 pgxn/neon/neon--1.1--1.2.sql diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index c6b224a14d..ef0a79a50c 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -21,7 +21,7 @@ SHLIB_LINK_INTERNAL = $(libpq) SHLIB_LINK = -lcurl EXTENSION = neon -DATA = neon--1.0.sql neon--1.0--1.1.sql +DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql PGFILEDESC = "neon - cloud storage for PostgreSQL" EXTRA_CLEAN = \ diff --git a/pgxn/neon/neon--1.1--1.2.sql b/pgxn/neon/neon--1.1--1.2.sql new file mode 100644 index 0000000000..5818b4ffe5 --- /dev/null +++ b/pgxn/neon/neon--1.1--1.2.sql @@ -0,0 +1,29 @@ +\echo Use "ALTER EXTENSION neon UPDATE TO '1.2'" to load this file. \quit + +-- Create a convenient view similar to pg_stat_database +-- that exposes all lfc stat values in one row. 
+CREATE OR REPLACE VIEW NEON_STAT_FILE_CACHE AS + WITH lfc_stats AS ( + SELECT + stat_name, + count + FROM neon_get_lfc_stats() AS t(stat_name text, count bigint) + ), + lfc_values AS ( + SELECT + MAX(CASE WHEN stat_name = 'file_cache_misses' THEN count ELSE NULL END) AS file_cache_misses, + MAX(CASE WHEN stat_name = 'file_cache_hits' THEN count ELSE NULL END) AS file_cache_hits, + MAX(CASE WHEN stat_name = 'file_cache_used' THEN count ELSE NULL END) AS file_cache_used, + MAX(CASE WHEN stat_name = 'file_cache_writes' THEN count ELSE NULL END) AS file_cache_writes, + -- Calculate the file_cache_hit_ratio within the same CTE for simplicity + CASE + WHEN MAX(CASE WHEN stat_name = 'file_cache_misses' THEN count ELSE 0 END) + MAX(CASE WHEN stat_name = 'file_cache_hits' THEN count ELSE 0 END) = 0 THEN NULL + ELSE ROUND((MAX(CASE WHEN stat_name = 'file_cache_hits' THEN count ELSE 0 END)::DECIMAL / + (MAX(CASE WHEN stat_name = 'file_cache_hits' THEN count ELSE 0 END) + MAX(CASE WHEN stat_name = 'file_cache_misses' THEN count ELSE 0 END))) * 100, 2) + END AS file_cache_hit_ratio + FROM lfc_stats + ) +SELECT file_cache_misses, file_cache_hits, file_cache_used, file_cache_writes, file_cache_hit_ratio from lfc_values; + +-- externalize the view to all users in role pg_monitor +GRANT SELECT ON NEON_STAT_FILE_CACHE TO PG_MONITOR; \ No newline at end of file diff --git a/pgxn/neon/neon.control b/pgxn/neon/neon.control index 4e4cb9f372..599b54b2ff 100644 --- a/pgxn/neon/neon.control +++ b/pgxn/neon/neon.control @@ -1,5 +1,6 @@ # neon extension comment = 'cloud storage for PostgreSQL' -default_version = '1.1' +default_version = '1.2' module_pathname = '$libdir/neon' relocatable = true +trusted = true diff --git a/test_runner/regress/test_neon_extension.py b/test_runner/regress/test_neon_extension.py index 62225e7b92..672f2b495d 100644 --- a/test_runner/regress/test_neon_extension.py +++ b/test_runner/regress/test_neon_extension.py @@ -1,5 +1,6 @@ from contextlib import closing +from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder @@ -22,4 +23,9 @@ def test_neon_extension(neon_env_builder: NeonEnvBuilder): # IMPORTANT: # If the version has changed, the test should be updated. # Ensure that the default version is also updated in the neon.control file - assert cur.fetchone() == ("1.1",) + assert cur.fetchone() == ("1.2",) + cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE") + res = cur.fetchall() + log.info(res) + assert len(res) == 1 + assert len(res[0]) == 5 From c4059939e67d45f8fa0e9b7a9bac02f3f77d991d Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 26 Feb 2024 17:28:00 +0100 Subject: [PATCH 0276/1571] fixup(#6893): report_size() still used pageserver_created_persistent_* metrics (#6909) Use the remote_timeline_client metrics instead, they work for layer file uploads and are reasonable close to what the `pageserver_created_persistent_*` metrics were. Should we wait for empty upload queue before calling `report_size()`? 
part of https://github.com/neondatabase/neon/issues/6737 --- test_runner/fixtures/compare_fixtures.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 6fbaa08512..429b6af548 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -155,12 +155,23 @@ class NeonCompare(PgCompare): "size", timeline_size / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER ) - metric_filters = {"tenant_id": str(self.tenant), "timeline_id": str(self.timeline)} + metric_filters = { + "tenant_id": str(self.tenant), + "timeline_id": str(self.timeline), + "file_kind": "layer", + "op_kind": "upload", + } + # use `started` (not `finished`) counters here, because some callers + # don't wait for upload queue to drain total_files = self.zenbenchmark.get_int_counter_value( - self.env.pageserver, "pageserver_created_persistent_files_total", metric_filters + self.env.pageserver, + "pageserver_remote_timeline_client_calls_started_total", + metric_filters, ) total_bytes = self.zenbenchmark.get_int_counter_value( - self.env.pageserver, "pageserver_written_persistent_bytes_total", metric_filters + self.env.pageserver, + "pageserver_remote_timeline_client_bytes_started_total", + metric_filters, ) self.zenbenchmark.record( "data_uploaded", total_bytes / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER From 975786265c7ba4c6c73ae174d1998ce7bcbe724e Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 26 Feb 2024 18:17:22 +0100 Subject: [PATCH 0277/1571] CI: Delete GitHub Actions caches once PR is closed (#6900) ## Problem > Approaching total cache storage limit (9.25 GB of 10 GB Used) > Least recently used caches will be automatically evicted to limit the total cache storage to 10 GB. [Learn more about cache usage.](https://docs.github.com/actions/using-workflows/caching-dependencies-to-speed-up-workflows#usage-limits-and-eviction-policy) From https://github.com/neondatabase/neon/actions/caches Some of these caches are from closed/merged PRs. ## Summary of changes - Add a workflow that deletes caches for closed branches --- .../workflows/cleanup-caches-by-a-branch.yml | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 .github/workflows/cleanup-caches-by-a-branch.yml diff --git a/.github/workflows/cleanup-caches-by-a-branch.yml b/.github/workflows/cleanup-caches-by-a-branch.yml new file mode 100644 index 0000000000..d8c225dedb --- /dev/null +++ b/.github/workflows/cleanup-caches-by-a-branch.yml @@ -0,0 +1,32 @@ +# A workflow from +# https://docs.github.com/en/actions/using-workflows/caching-dependencies-to-speed-up-workflows#force-deleting-cache-entries + +name: cleanup caches by a branch +on: + pull_request: + types: + - closed + +jobs: + cleanup: + runs-on: ubuntu-latest + steps: + - name: Cleanup + run: | + gh extension install actions/gh-actions-cache + + echo "Fetching list of cache key" + cacheKeysForPR=$(gh actions-cache list -R $REPO -B $BRANCH -L 100 | cut -f 1 ) + + ## Setting this to not fail the workflow while deleting cache keys. + set +e + echo "Deleting caches..." 
+ for cacheKey in $cacheKeysForPR + do + gh actions-cache delete $cacheKey -R $REPO -B $BRANCH --confirm + done + echo "Done" + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + REPO: ${{ github.repository }} + BRANCH: refs/pull/${{ github.event.pull_request.number }}/merge From 0881d4f9e3506feb312fb2aa69747c023d78ae96 Mon Sep 17 00:00:00 2001 From: Andreas Scherbaum Date: Mon, 26 Feb 2024 18:53:48 +0100 Subject: [PATCH 0278/1571] Update README, include cleanup details (#6816) ## Problem README.md is missing cleanup instructions ## Summary of changes Add cleanup instructions Add instructions how to handle errors during initialization --------- Co-authored-by: Andreas Scherbaum --- README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/README.md b/README.md index 1c4f32d286..72a924fe9e 100644 --- a/README.md +++ b/README.md @@ -230,6 +230,10 @@ postgres=# select * from t; > cargo neon stop ``` +#### Handling build failures + +If you encounter errors during setting up the initial tenant, it's best to stop everything (`cargo neon stop`) and remove the `.neon` directory. Then fix the problems, and start the setup again. + ## Running tests Ensure your dependencies are installed as described [here](https://github.com/neondatabase/neon#dependency-installation-notes). @@ -259,6 +263,12 @@ You can use [`flamegraph-rs`](https://github.com/flamegraph-rs/flamegraph) or th > It's a [general thing with Rust / lld / mold](https://crbug.com/919499#c16), not specific to this repository. > See [this PR for further instructions](https://github.com/neondatabase/neon/pull/6764). +## Cleanup + +For cleaning up the source tree from build artifacts, run `make clean` in the source directory. + +For removing every artifact from build and configure steps, run `make distclean`, and also consider removing the cargo binaries in the `target` directory, as well as the database in the `.neon` directory. Note that removing the `.neon` directorz will remove your database, with all data in it. You have been warned! + ## Documentation [docs](/docs) Contains a top-level overview of all available markdown documentation. From 5accf6e24aa4c604c8ffc81c3becc85cc09e6d65 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Mon, 26 Feb 2024 18:17:06 +0000 Subject: [PATCH 0279/1571] attachment_service: JWT auth enforcement (#6897) ## Problem Attachment service does not do auth based on JWT scopes. ## Summary of changes Do JWT based permission checking for requests coming into the attachment service. Requests into the attachment service must use different tokens based on the endpoint: * `/control` and `/debug` require `admin` scope * `/upcall` requires `generations_api` scope * `/v1/...` requires `pageserverapi` scope Requests into the pageserver from the attachment service must use `pageserverapi` scope. 
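To make the scope mapping above concrete, here is a minimal, self-contained sketch of the prefix-to-scope rule. It is illustrative only: the real enforcement happens per handler via `check_permissions(&req, Scope::...)` (see `http.rs` below), the local enum merely mirrors the relevant variants of `utils::auth::Scope`, and the example paths in `main` are placeholders rather than actual routes.

```rust
#[derive(Debug, PartialEq)]
enum RequiredScope {
    Admin,          // /control/* and /debug/* endpoints
    GenerationsApi, // /upcall/* endpoints (pageserver re-attach / validate)
    PageServerApi,  // /v1/* tenant and timeline endpoints
}

/// Which JWT scope a request into the attachment service needs, by path prefix.
fn required_scope(path: &str) -> Option<RequiredScope> {
    if path.starts_with("/control/") || path.starts_with("/debug/") {
        Some(RequiredScope::Admin)
    } else if path.starts_with("/upcall/") {
        Some(RequiredScope::GenerationsApi)
    } else if path.starts_with("/v1/") {
        Some(RequiredScope::PageServerApi)
    } else {
        None
    }
}

fn main() {
    // Placeholder paths, for illustration only.
    assert_eq!(required_scope("/upcall/some-endpoint"), Some(RequiredScope::GenerationsApi));
    assert_eq!(required_scope("/v1/tenant"), Some(RequiredScope::PageServerApi));
    assert_eq!(required_scope("/debug/some-endpoint"), Some(RequiredScope::Admin));
}
```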
--- control_plane/attachment_service/src/auth.rs | 9 ++ control_plane/attachment_service/src/http.rs | 52 ++++++++- control_plane/attachment_service/src/lib.rs | 1 + control_plane/src/attachment_service.rs | 51 +++++--- control_plane/src/local_env.rs | 13 ++- control_plane/src/pageserver.rs | 2 +- docs/authentication.md | 3 + libs/utils/src/auth.rs | 2 + pageserver/src/auth.rs | 2 +- safekeeper/src/auth.rs | 2 +- test_runner/fixtures/neon_fixtures.py | 117 +++++++++++++------ test_runner/regress/test_sharding_service.py | 87 ++++++++++++-- 12 files changed, 268 insertions(+), 73 deletions(-) create mode 100644 control_plane/attachment_service/src/auth.rs diff --git a/control_plane/attachment_service/src/auth.rs b/control_plane/attachment_service/src/auth.rs new file mode 100644 index 0000000000..ef47abf8c7 --- /dev/null +++ b/control_plane/attachment_service/src/auth.rs @@ -0,0 +1,9 @@ +use utils::auth::{AuthError, Claims, Scope}; + +pub fn check_permission(claims: &Claims, required_scope: Scope) -> Result<(), AuthError> { + if claims.scope != required_scope { + return Err(AuthError("Scope mismatch. Permission denied".into())); + } + + Ok(()) +} diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs index f9c4535bd5..d341187ef7 100644 --- a/control_plane/attachment_service/src/http.rs +++ b/control_plane/attachment_service/src/http.rs @@ -10,8 +10,8 @@ use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api; use std::sync::Arc; use std::time::{Duration, Instant}; -use utils::auth::SwappableJwtAuth; -use utils::http::endpoint::{auth_middleware, request_span}; +use utils::auth::{Scope, SwappableJwtAuth}; +use utils::http::endpoint::{auth_middleware, check_permission_with, request_span}; use utils::http::request::{must_get_query_param, parse_request_param}; use utils::id::{TenantId, TimelineId}; @@ -64,6 +64,8 @@ fn get_state(request: &Request) -> &HttpState { /// Pageserver calls into this on startup, to learn which tenants it should attach async fn handle_re_attach(mut req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::GenerationsApi)?; + let reattach_req = json_request::(&mut req).await?; let state = get_state(&req); json_response(StatusCode::OK, state.service.re_attach(reattach_req).await?) @@ -72,6 +74,8 @@ async fn handle_re_attach(mut req: Request) -> Result, ApiE /// Pageserver calls into this before doing deletions, to confirm that it still /// holds the latest generation for the tenants with deletions enqueued async fn handle_validate(mut req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::GenerationsApi)?; + let validate_req = json_request::(&mut req).await?; let state = get_state(&req); json_response(StatusCode::OK, state.service.validate(validate_req)) @@ -81,6 +85,8 @@ async fn handle_validate(mut req: Request) -> Result, ApiEr /// (in the real control plane this is unnecessary, because the same program is managing /// generation numbers and doing attachments). 
async fn handle_attach_hook(mut req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + let attach_req = json_request::(&mut req).await?; let state = get_state(&req); @@ -95,6 +101,8 @@ async fn handle_attach_hook(mut req: Request) -> Result, Ap } async fn handle_inspect(mut req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + let inspect_req = json_request::(&mut req).await?; let state = get_state(&req); @@ -106,6 +114,8 @@ async fn handle_tenant_create( service: Arc, mut req: Request, ) -> Result, ApiError> { + check_permissions(&req, Scope::PageServerApi)?; + let create_req = json_request::(&mut req).await?; json_response( StatusCode::CREATED, @@ -164,6 +174,8 @@ async fn handle_tenant_location_config( mut req: Request, ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + let config_req = json_request::(&mut req).await?; json_response( StatusCode::OK, @@ -178,6 +190,8 @@ async fn handle_tenant_time_travel_remote_storage( mut req: Request, ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + let time_travel_req = json_request::(&mut req).await?; let timestamp_raw = must_get_query_param(&req, "travel_to")?; @@ -211,6 +225,7 @@ async fn handle_tenant_delete( req: Request, ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; deletion_wrapper(service, move |service| async move { service.tenant_delete(tenant_id).await @@ -223,6 +238,8 @@ async fn handle_tenant_timeline_create( mut req: Request, ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + let create_req = json_request::(&mut req).await?; json_response( StatusCode::CREATED, @@ -237,6 +254,8 @@ async fn handle_tenant_timeline_delete( req: Request, ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; deletion_wrapper(service, move |service| async move { @@ -250,6 +269,7 @@ async fn handle_tenant_timeline_passthrough( req: Request, ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; let Some(path) = req.uri().path_and_query() else { // This should never happen, our request router only calls us if there is a path @@ -293,11 +313,15 @@ async fn handle_tenant_locate( service: Arc, req: Request, ) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; json_response(StatusCode::OK, service.tenant_locate(tenant_id)?) } async fn handle_node_register(mut req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + let register_req = json_request::(&mut req).await?; let state = get_state(&req); state.service.node_register(register_req).await?; @@ -305,17 +329,23 @@ async fn handle_node_register(mut req: Request) -> Result, } async fn handle_node_list(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + let state = get_state(&req); json_response(StatusCode::OK, state.service.node_list().await?) 
} async fn handle_node_drop(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + let state = get_state(&req); let node_id: NodeId = parse_request_param(&req, "node_id")?; json_response(StatusCode::OK, state.service.node_drop(node_id).await?) } async fn handle_node_configure(mut req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + let node_id: NodeId = parse_request_param(&req, "node_id")?; let config_req = json_request::(&mut req).await?; if node_id != config_req.node_id { @@ -335,6 +365,8 @@ async fn handle_tenant_shard_split( service: Arc, mut req: Request, ) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; let split_req = json_request::(&mut req).await?; @@ -348,6 +380,8 @@ async fn handle_tenant_shard_migrate( service: Arc, mut req: Request, ) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?; let migrate_req = json_request::(&mut req).await?; json_response( @@ -360,22 +394,30 @@ async fn handle_tenant_shard_migrate( async fn handle_tenant_drop(req: Request) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + let state = get_state(&req); json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?) } async fn handle_tenants_dump(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + let state = get_state(&req); state.service.tenants_dump() } async fn handle_scheduler_dump(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + let state = get_state(&req); state.service.scheduler_dump() } async fn handle_consistency_check(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + let state = get_state(&req); json_response(StatusCode::OK, state.service.consistency_check().await?) 
@@ -432,6 +474,12 @@ where .await } +fn check_permissions(request: &Request, required_scope: Scope) -> Result<(), ApiError> { + check_permission_with(request, |claims| { + crate::auth::check_permission(claims, required_scope) + }) +} + pub fn make_router( service: Arc, auth: Option>, diff --git a/control_plane/attachment_service/src/lib.rs b/control_plane/attachment_service/src/lib.rs index e950a57e57..ce613e858f 100644 --- a/control_plane/attachment_service/src/lib.rs +++ b/control_plane/attachment_service/src/lib.rs @@ -1,6 +1,7 @@ use serde::{Deserialize, Serialize}; use utils::seqwait::MonotonicCounter; +mod auth; mod compute_hook; pub mod http; pub mod metrics; diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/attachment_service.rs index 4a1d316fe7..f0bee1ce08 100644 --- a/control_plane/src/attachment_service.rs +++ b/control_plane/src/attachment_service.rs @@ -11,12 +11,12 @@ use pageserver_api::{ use pageserver_client::mgmt_api::ResponseErrorMessageExt; use postgres_backend::AuthType; use serde::{de::DeserializeOwned, Deserialize, Serialize}; -use std::str::FromStr; +use std::{fs, str::FromStr}; use tokio::process::Command; use tracing::instrument; use url::Url; use utils::{ - auth::{Claims, Scope}, + auth::{encode_from_key_file, Claims, Scope}, id::{NodeId, TenantId}, }; @@ -24,7 +24,7 @@ pub struct AttachmentService { env: LocalEnv, listen: String, path: Utf8PathBuf, - jwt_token: Option, + private_key: Option>, public_key: Option, postgres_port: u16, client: reqwest::Client, @@ -204,12 +204,11 @@ impl AttachmentService { .pageservers .first() .expect("Config is validated to contain at least one pageserver"); - let (jwt_token, public_key) = match ps_conf.http_auth_type { + let (private_key, public_key) = match ps_conf.http_auth_type { AuthType::Trust => (None, None), AuthType::NeonJWT => { - let jwt_token = env - .generate_auth_token(&Claims::new(None, Scope::PageServerApi)) - .unwrap(); + let private_key_path = env.get_private_key_path(); + let private_key = fs::read(private_key_path).expect("failed to read private key"); // If pageserver auth is enabled, this implicitly enables auth for this service, // using the same credentials. 
@@ -235,7 +234,7 @@ impl AttachmentService { } else { std::fs::read_to_string(&public_key_path).expect("Can't read public key") }; - (Some(jwt_token), Some(public_key)) + (Some(private_key), Some(public_key)) } }; @@ -243,7 +242,7 @@ impl AttachmentService { env: env.clone(), path, listen, - jwt_token, + private_key, public_key, postgres_port, client: reqwest::ClientBuilder::new() @@ -397,7 +396,10 @@ impl AttachmentService { .into_iter() .map(|s| s.to_string()) .collect::>(); - if let Some(jwt_token) = &self.jwt_token { + if let Some(private_key) = &self.private_key { + let claims = Claims::new(None, Scope::PageServerApi); + let jwt_token = + encode_from_key_file(&claims, private_key).expect("failed to generate jwt token"); args.push(format!("--jwt-token={jwt_token}")); } @@ -468,6 +470,20 @@ impl AttachmentService { Ok(()) } + fn get_claims_for_path(path: &str) -> anyhow::Result> { + let category = match path.find('/') { + Some(idx) => &path[..idx], + None => path, + }; + + match category { + "status" | "ready" => Ok(None), + "control" | "debug" => Ok(Some(Claims::new(None, Scope::Admin))), + "v1" => Ok(Some(Claims::new(None, Scope::PageServerApi))), + _ => Err(anyhow::anyhow!("Failed to determine claims for {}", path)), + } + } + /// Simple HTTP request wrapper for calling into attachment service async fn dispatch( &self, @@ -493,11 +509,16 @@ impl AttachmentService { if let Some(body) = body { builder = builder.json(&body) } - if let Some(jwt_token) = &self.jwt_token { - builder = builder.header( - reqwest::header::AUTHORIZATION, - format!("Bearer {jwt_token}"), - ); + if let Some(private_key) = &self.private_key { + println!("Getting claims for path {}", path); + if let Some(required_claims) = Self::get_claims_for_path(&path)? { + println!("Got claims {:?} for path {}", required_claims, path); + let jwt_token = encode_from_key_file(&required_claims, private_key)?; + builder = builder.header( + reqwest::header::AUTHORIZATION, + format!("Bearer {jwt_token}"), + ); + } } let response = builder.send().await?; diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 786ea6d098..a5e1325cfe 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -412,14 +412,17 @@ impl LocalEnv { // this function is used only for testing purposes in CLI e g generate tokens during init pub fn generate_auth_token(&self, claims: &Claims) -> anyhow::Result { - let private_key_path = if self.private_key_path.is_absolute() { + let private_key_path = self.get_private_key_path(); + let key_data = fs::read(private_key_path)?; + encode_from_key_file(claims, &key_data) + } + + pub fn get_private_key_path(&self) -> PathBuf { + if self.private_key_path.is_absolute() { self.private_key_path.to_path_buf() } else { self.base_data_dir.join(&self.private_key_path) - }; - - let key_data = fs::read(private_key_path)?; - encode_from_key_file(claims, &key_data) + } } // diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index a52fcb4a3f..2c5cac327a 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -115,7 +115,7 @@ impl PageServerNode { if matches!(self.conf.http_auth_type, AuthType::NeonJWT) { let jwt_token = self .env - .generate_auth_token(&Claims::new(None, Scope::PageServerApi)) + .generate_auth_token(&Claims::new(None, Scope::GenerationsApi)) .unwrap(); overrides.push(format!("control_plane_api_token='{}'", jwt_token)); } diff --git a/docs/authentication.md b/docs/authentication.md index 
f768b04c5b..faac7aa28e 100644 --- a/docs/authentication.md +++ b/docs/authentication.md @@ -70,6 +70,9 @@ Should only be used e.g. for status check/tenant creation/list. Should only be used e.g. for status check. Currently also used for connection from any pageserver to any safekeeper. +"generations_api": Provides access to the upcall APIs served by the attachment service or the control plane. + +"admin": Provides access to the control plane and admin APIs of the attachment service. ### CLI CLI generates a key pair during call to `neon_local init` with the following commands: diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index e031699cfb..51ab238d77 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -32,6 +32,8 @@ pub enum Scope { // The scope used by pageservers in upcalls to storage controller and cloud control plane #[serde(rename = "generations_api")] GenerationsApi, + // Allows access to control plane managment API and some storage controller endpoints. + Admin, } /// JWT payload. See docs/authentication.md for the format diff --git a/pageserver/src/auth.rs b/pageserver/src/auth.rs index 4dee61d3ea..4785c8c4c5 100644 --- a/pageserver/src/auth.rs +++ b/pageserver/src/auth.rs @@ -14,7 +14,7 @@ pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result< } (Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope (Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope - (Scope::SafekeeperData | Scope::GenerationsApi, _) => Err(AuthError( + (Scope::Admin | Scope::SafekeeperData | Scope::GenerationsApi, _) => Err(AuthError( format!( "JWT scope '{:?}' is ineligible for Pageserver auth", claims.scope diff --git a/safekeeper/src/auth.rs b/safekeeper/src/auth.rs index 96676be04d..dd9058c468 100644 --- a/safekeeper/src/auth.rs +++ b/safekeeper/src/auth.rs @@ -12,7 +12,7 @@ pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result< } Ok(()) } - (Scope::PageServerApi | Scope::GenerationsApi, _) => Err(AuthError( + (Scope::Admin | Scope::PageServerApi | Scope::GenerationsApi, _) => Err(AuthError( format!( "JWT scope '{:?}' is ineligible for Safekeeper auth", claims.scope diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 6cb7656660..55c16f73b0 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -17,6 +17,7 @@ import uuid from contextlib import closing, contextmanager from dataclasses import dataclass, field from datetime import datetime +from enum import Enum from fcntl import LOCK_EX, LOCK_UN, flock from functools import cached_property from itertools import chain, product @@ -388,7 +389,8 @@ class PgProtocol: class AuthKeys: priv: str - def generate_token(self, *, scope: str, **token_data: str) -> str: + def generate_token(self, *, scope: TokenScope, **token_data: Any) -> str: + token_data = {key: str(val) for key, val in token_data.items()} token = jwt.encode({"scope": scope, **token_data}, self.priv, algorithm="EdDSA") # cast(Any, self.priv) @@ -401,14 +403,23 @@ class AuthKeys: return token def generate_pageserver_token(self) -> str: - return self.generate_token(scope="pageserverapi") + return self.generate_token(scope=TokenScope.PAGE_SERVER_API) def generate_safekeeper_token(self) -> str: - return self.generate_token(scope="safekeeperdata") + return self.generate_token(scope=TokenScope.SAFEKEEPER_DATA) # generate token giving access to only one tenant def 
generate_tenant_token(self, tenant_id: TenantId) -> str: - return self.generate_token(scope="tenant", tenant_id=str(tenant_id)) + return self.generate_token(scope=TokenScope.TENANT, tenant_id=str(tenant_id)) + + +# TODO: Replace with `StrEnum` when we upgrade to python 3.11 +class TokenScope(str, Enum): + ADMIN = "admin" + PAGE_SERVER_API = "pageserverapi" + GENERATIONS_API = "generations_api" + SAFEKEEPER_DATA = "safekeeperdata" + TENANT = "tenant" class NeonEnvBuilder: @@ -1922,6 +1933,13 @@ class Pagectl(AbstractNeonCli): return IndexPartDump.from_json(parsed) +class AttachmentServiceApiException(Exception): + def __init__(self, message, status_code: int): + super().__init__(message) + self.message = message + self.status_code = status_code + + class NeonAttachmentService(MetricsGetter): def __init__(self, env: NeonEnv, auth_enabled: bool): self.env = env @@ -1940,39 +1958,60 @@ class NeonAttachmentService(MetricsGetter): self.running = False return self + @staticmethod + def raise_api_exception(res: requests.Response): + try: + res.raise_for_status() + except requests.RequestException as e: + try: + msg = res.json()["msg"] + except: # noqa: E722 + msg = "" + raise AttachmentServiceApiException(msg, res.status_code) from e + def pageserver_api(self) -> PageserverHttpClient: """ The attachment service implements a subset of the pageserver REST API, for mapping per-tenant actions into per-shard actions (e.g. timeline creation). Tests should invoke those functions via the HttpClient, as an implicit check that these APIs remain compatible. """ - return PageserverHttpClient(self.env.attachment_service_port, lambda: True) + auth_token = None + if self.auth_enabled: + auth_token = self.env.auth_keys.generate_token(scope=TokenScope.PAGE_SERVER_API) + return PageserverHttpClient(self.env.attachment_service_port, lambda: True, auth_token) def request(self, method, *args, **kwargs) -> requests.Response: - kwargs["headers"] = self.headers() - return requests.request(method, *args, **kwargs) + resp = requests.request(method, *args, **kwargs) + NeonAttachmentService.raise_api_exception(resp) - def headers(self) -> Dict[str, str]: + return resp + + def headers(self, scope: Optional[TokenScope]) -> Dict[str, str]: headers = {} - if self.auth_enabled: - jwt_token = self.env.auth_keys.generate_pageserver_token() + if self.auth_enabled and scope is not None: + jwt_token = self.env.auth_keys.generate_token(scope=scope) headers["Authorization"] = f"Bearer {jwt_token}" return headers def get_metrics(self) -> Metrics: res = self.request("GET", f"{self.env.attachment_service_api}/metrics") - res.raise_for_status() return parse_metrics(res.text) def ready(self) -> bool: - resp = self.request("GET", f"{self.env.attachment_service_api}/ready") - if resp.status_code == 503: + status = None + try: + resp = self.request("GET", f"{self.env.attachment_service_api}/ready") + status = resp.status_code + except AttachmentServiceApiException as e: + status = e.status_code + + if status == 503: return False - elif resp.status_code == 200: + elif status == 200: return True else: - raise RuntimeError(f"Unexpected status {resp.status_code} from readiness endpoint") + raise RuntimeError(f"Unexpected status {status} from readiness endpoint") def attach_hook_issue( self, tenant_shard_id: Union[TenantId, TenantShardId], pageserver_id: int @@ -1981,21 +2020,19 @@ class NeonAttachmentService(MetricsGetter): "POST", f"{self.env.attachment_service_api}/debug/v1/attach-hook", json={"tenant_shard_id": str(tenant_shard_id), "node_id": 
pageserver_id}, - headers=self.headers(), + headers=self.headers(TokenScope.ADMIN), ) - response.raise_for_status() gen = response.json()["gen"] assert isinstance(gen, int) return gen def attach_hook_drop(self, tenant_shard_id: Union[TenantId, TenantShardId]): - response = self.request( + self.request( "POST", f"{self.env.attachment_service_api}/debug/v1/attach-hook", json={"tenant_shard_id": str(tenant_shard_id), "node_id": None}, - headers=self.headers(), + headers=self.headers(TokenScope.ADMIN), ) - response.raise_for_status() def inspect(self, tenant_shard_id: Union[TenantId, TenantShardId]) -> Optional[tuple[int, int]]: """ @@ -2005,9 +2042,8 @@ class NeonAttachmentService(MetricsGetter): "POST", f"{self.env.attachment_service_api}/debug/v1/inspect", json={"tenant_shard_id": str(tenant_shard_id)}, - headers=self.headers(), + headers=self.headers(TokenScope.ADMIN), ) - response.raise_for_status() json = response.json() log.info(f"Response: {json}") if json["attachment"]: @@ -2027,14 +2063,15 @@ class NeonAttachmentService(MetricsGetter): "POST", f"{self.env.attachment_service_api}/control/v1/node", json=body, - headers=self.headers(), - ).raise_for_status() + headers=self.headers(TokenScope.ADMIN), + ) def node_list(self): response = self.request( - "GET", f"{self.env.attachment_service_api}/control/v1/node", headers=self.headers() + "GET", + f"{self.env.attachment_service_api}/control/v1/node", + headers=self.headers(TokenScope.ADMIN), ) - response.raise_for_status() return response.json() def node_configure(self, node_id, body: dict[str, Any]): @@ -2044,8 +2081,8 @@ class NeonAttachmentService(MetricsGetter): "PUT", f"{self.env.attachment_service_api}/control/v1/node/{node_id}/config", json=body, - headers=self.headers(), - ).raise_for_status() + headers=self.headers(TokenScope.ADMIN), + ) def tenant_create( self, @@ -2070,8 +2107,12 @@ class NeonAttachmentService(MetricsGetter): for k, v in tenant_config.items(): body[k] = v - response = self.request("POST", f"{self.env.attachment_service_api}/v1/tenant", json=body) - response.raise_for_status() + response = self.request( + "POST", + f"{self.env.attachment_service_api}/v1/tenant", + json=body, + headers=self.headers(TokenScope.PAGE_SERVER_API), + ) log.info(f"tenant_create success: {response.json()}") def locate(self, tenant_id: TenantId) -> list[dict[str, Any]]: @@ -2079,9 +2120,10 @@ class NeonAttachmentService(MetricsGetter): :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr: str, "listen_http_port: int} """ response = self.request( - "GET", f"{self.env.attachment_service_api}/control/v1/tenant/{tenant_id}/locate" + "GET", + f"{self.env.attachment_service_api}/control/v1/tenant/{tenant_id}/locate", + headers=self.headers(TokenScope.ADMIN), ) - response.raise_for_status() body = response.json() shards: list[dict[str, Any]] = body["shards"] return shards @@ -2091,20 +2133,20 @@ class NeonAttachmentService(MetricsGetter): "PUT", f"{self.env.attachment_service_api}/control/v1/tenant/{tenant_id}/shard_split", json={"new_shard_count": shard_count}, + headers=self.headers(TokenScope.ADMIN), ) - response.raise_for_status() body = response.json() log.info(f"tenant_shard_split success: {body}") shards: list[TenantShardId] = body["new_shards"] return shards def tenant_shard_migrate(self, tenant_shard_id: TenantShardId, dest_ps_id: int): - response = self.request( + self.request( "PUT", f"{self.env.attachment_service_api}/control/v1/tenant/{tenant_shard_id}/migrate", 
json={"tenant_shard_id": str(tenant_shard_id), "node_id": dest_ps_id}, + headers=self.headers(TokenScope.ADMIN), ) - response.raise_for_status() log.info(f"Migrated tenant {tenant_shard_id} to pageserver {dest_ps_id}") assert self.env.get_tenant_pageserver(tenant_shard_id).id == dest_ps_id @@ -2112,11 +2154,11 @@ class NeonAttachmentService(MetricsGetter): """ Throw an exception if the service finds any inconsistencies in its state """ - response = self.request( + self.request( "POST", f"{self.env.attachment_service_api}/debug/v1/consistency_check", + headers=self.headers(TokenScope.ADMIN), ) - response.raise_for_status() log.info("Attachment service passed consistency check") def __enter__(self) -> "NeonAttachmentService": @@ -2894,7 +2936,6 @@ class NeonProxy(PgProtocol): def get_metrics(self) -> str: request_result = requests.get(f"http://{self.host}:{self.http_port}/metrics") - request_result.raise_for_status() return request_result.text @staticmethod diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py index 00c3a1628e..b4f1f49543 100644 --- a/test_runner/regress/test_sharding_service.py +++ b/test_runner/regress/test_sharding_service.py @@ -1,13 +1,16 @@ import time from collections import defaultdict from datetime import datetime, timezone -from typing import List +from typing import Any, Dict, List +import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( + AttachmentServiceApiException, NeonEnv, NeonEnvBuilder, PgBin, + TokenScope, ) from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import ( @@ -457,37 +460,40 @@ def test_sharding_service_debug_apis(neon_env_builder: NeonEnvBuilder): # Initial tenant (1 shard) and the one we just created (2 shards) should be visible response = env.attachment_service.request( - "GET", f"{env.attachment_service_api}/debug/v1/tenant" + "GET", + f"{env.attachment_service_api}/debug/v1/tenant", + headers=env.attachment_service.headers(TokenScope.ADMIN), ) - response.raise_for_status() assert len(response.json()) == 3 # Scheduler should report the expected nodes and shard counts response = env.attachment_service.request( "GET", f"{env.attachment_service_api}/debug/v1/scheduler" ) - response.raise_for_status() # Two nodes, in a dict of node_id->node assert len(response.json()["nodes"]) == 2 assert sum(v["shard_count"] for v in response.json()["nodes"].values()) == 3 assert all(v["may_schedule"] for v in response.json()["nodes"].values()) response = env.attachment_service.request( - "POST", f"{env.attachment_service_api}/debug/v1/node/{env.pageservers[1].id}/drop" + "POST", + f"{env.attachment_service_api}/debug/v1/node/{env.pageservers[1].id}/drop", + headers=env.attachment_service.headers(TokenScope.ADMIN), ) - response.raise_for_status() assert len(env.attachment_service.node_list()) == 1 response = env.attachment_service.request( - "POST", f"{env.attachment_service_api}/debug/v1/tenant/{tenant_id}/drop" + "POST", + f"{env.attachment_service_api}/debug/v1/tenant/{tenant_id}/drop", + headers=env.attachment_service.headers(TokenScope.ADMIN), ) - response.raise_for_status() # Tenant drop should be reflected in dump output response = env.attachment_service.request( - "GET", f"{env.attachment_service_api}/debug/v1/tenant" + "GET", + f"{env.attachment_service_api}/debug/v1/tenant", + headers=env.attachment_service.headers(TokenScope.ADMIN), ) - response.raise_for_status() assert len(response.json()) == 1 # Check that the 'drop' APIs didn't 
leave things in a state that would fail a consistency check: they're @@ -603,3 +609,64 @@ def test_sharding_service_s3_time_travel_recovery( endpoint.safe_psql("SELECT * FROM created_foo;") env.attachment_service.consistency_check() + + +def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder): + neon_env_builder.auth_enabled = True + env = neon_env_builder.init_start() + svc = env.attachment_service + api = env.attachment_service_api + + tenant_id = TenantId.generate() + body: Dict[str, Any] = {"new_tenant_id": str(tenant_id)} + + # No token + with pytest.raises( + AttachmentServiceApiException, + match="Unauthorized: missing authorization header", + ): + svc.request("POST", f"{env.attachment_service_api}/v1/tenant", json=body) + + # Token with incorrect scope + with pytest.raises( + AttachmentServiceApiException, + match="Forbidden: JWT authentication error", + ): + svc.request("POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.ADMIN)) + + # Token with correct scope + svc.request( + "POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.PAGE_SERVER_API) + ) + + # No token + with pytest.raises( + AttachmentServiceApiException, + match="Unauthorized: missing authorization header", + ): + svc.request("GET", f"{api}/debug/v1/tenant") + + # Token with incorrect scope + with pytest.raises( + AttachmentServiceApiException, + match="Forbidden: JWT authentication error", + ): + svc.request( + "GET", f"{api}/debug/v1/tenant", headers=svc.headers(TokenScope.GENERATIONS_API) + ) + + # No token + with pytest.raises( + AttachmentServiceApiException, + match="Unauthorized: missing authorization header", + ): + svc.request("POST", f"{api}/upcall/v1/re-attach") + + # Token with incorrect scope + with pytest.raises( + AttachmentServiceApiException, + match="Forbidden: JWT authentication error", + ): + svc.request( + "POST", f"{api}/upcall/v1/re-attach", headers=svc.headers(TokenScope.PAGE_SERVER_API) + ) From b2bbc20311ad95baafb8430250f43b07233ce1ff Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Mon, 26 Feb 2024 15:48:56 -0500 Subject: [PATCH 0280/1571] fix: only alter default privileges when public schema exists (#6914) ## Problem Following up https://github.com/neondatabase/neon/pull/6885, only alter default privileges when the public schema exists. 
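In SQL terms, the guard amounts to wrapping the `ALTER DEFAULT PRIVILEGES` statements in an existence check on the `public` schema. The following standalone sketch is illustrative only; the actual change, visible in the diff below, embeds this check inside the larger DO block that `handle_grants` builds:

```sql
DO $$
BEGIN
    -- Only touch default privileges if the public schema actually exists.
    IF EXISTS (
        SELECT nspname
        FROM pg_catalog.pg_namespace
        WHERE nspname = 'public'
    )
    THEN
        ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;
        ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION;
    END IF;
END
$$;
```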
Signed-off-by: Alex Chi Z --- compute_tools/src/spec.rs | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index b515f9f408..d5fd2c9462 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -676,8 +676,15 @@ pub fn handle_grants( GRANT CREATE ON SCHEMA public TO web_access;\n\ END IF;\n\ END IF;\n\ - ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;\n\ - ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION;\n\ + IF EXISTS(\n\ + SELECT nspname\n\ + FROM pg_catalog.pg_namespace\n\ + WHERE nspname = 'public'\n\ + )\n\ + THEN\n\ + ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;\n\ + ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION;\n\ + END IF;\n\ END\n\ $$;" .to_string(); From 62d77e263f2b3f4b6847b6a9a14c319da6cfbfa4 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 27 Feb 2024 10:55:10 +0100 Subject: [PATCH 0281/1571] test_remote_timeline_client_calls_started_metric: fix flakiness (#6911) fixes https://github.com/neondatabase/neon/issues/6889 # Problem The failure in the last 3 flaky runs on `main` is ``` test_runner/regress/test_remote_storage.py:460: in test_remote_timeline_client_calls_started_metric churn("a", "b") test_runner/regress/test_remote_storage.py:457: in churn assert gc_result["layers_removed"] > 0 E assert 0 > 0 ``` That's this code https://github.com/neondatabase/neon/blob/cd449d66ea29ad2d7269458e90623c3ae40e1816/test_runner/regress/test_remote_storage.py#L448-L460 So, the test expects GC to remove some layers but the GC doesn't. # Fix My impression is that the VACUUM isn't re-using pages aggressively enough, but I can't really prove that. Tried to analyze the layer map dump but it's too complex. So, this PR: - Creates more churn by doing the overwrite twice. - Forces image layer creation. It also drive-by removes the redundant call to timeline_compact, because, timeline_checkpoint already does that internally. --- pageserver/src/http/routes.rs | 8 ++++++++ pageserver/src/tenant/timeline.rs | 8 +++++++- test_runner/fixtures/pageserver/http.py | 6 ++++++ test_runner/regress/test_remote_storage.py | 16 ++++++++++------ 4 files changed, 31 insertions(+), 7 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 1339229a70..04211fbb7f 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1616,6 +1616,10 @@ async fn timeline_compact_handler( if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? { flags |= CompactFlags::ForceRepartition; } + if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? { + flags |= CompactFlags::ForceImageLayerCreation; + } + async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; @@ -1642,6 +1646,10 @@ async fn timeline_checkpoint_handler( if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? { flags |= CompactFlags::ForceRepartition; } + if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? 
{ + flags |= CompactFlags::ForceImageLayerCreation; + } + async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f09617849c..b14eafa194 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -503,6 +503,7 @@ pub enum GetLogicalSizePriority { #[derive(enumset::EnumSetType)] pub(crate) enum CompactFlags { ForceRepartition, + ForceImageLayerCreation, } impl std::fmt::Debug for Timeline { @@ -1157,7 +1158,12 @@ impl Timeline { // 3. Create new image layers for partitions that have been modified // "enough". let layers = self - .create_image_layers(&partitioning, lsn, false, &image_ctx) + .create_image_layers( + &partitioning, + lsn, + flags.contains(CompactFlags::ForceImageLayerCreation), + &image_ctx, + ) .await .map_err(anyhow::Error::from)?; if let Some(remote_client) = &self.remote_client { diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 427ef00c78..ad3efb5837 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -549,11 +549,14 @@ class PageserverHttpClient(requests.Session, MetricsGetter): tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, force_repartition=False, + force_image_layer_creation=False, ): self.is_testing_enabled_or_skip() query = {} if force_repartition: query["force_repartition"] = "true" + if force_image_layer_creation: + query["force_image_layer_creation"] = "true" log.info(f"Requesting compact: tenant {tenant_id}, timeline {timeline_id}") res = self.put( @@ -608,11 +611,14 @@ class PageserverHttpClient(requests.Session, MetricsGetter): tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, force_repartition=False, + force_image_layer_creation=False, ): self.is_testing_enabled_or_skip() query = {} if force_repartition: query["force_repartition"] = "true" + if force_image_layer_creation: + query["force_image_layer_creation"] = "true" log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}") res = self.put( diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 176a5e57dc..73ebe0a76f 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -382,6 +382,7 @@ def test_remote_timeline_client_calls_started_metric( initial_tenant_conf={ # small checkpointing and compaction targets to ensure we generate many upload operations "checkpoint_distance": f"{128 * 1024}", + # ensure each timeline_checkpoint() calls creates L1s "compaction_threshold": "1", "compaction_target_size": f"{128 * 1024}", # no PITR horizon, we specify the horizon when we request on-demand GC @@ -389,8 +390,6 @@ def test_remote_timeline_client_calls_started_metric( # disable background compaction and GC. We invoke it manually when we want it to happen. "gc_period": "0s", "compaction_period": "0s", - # create image layers eagerly, so that GC can remove some layers - "image_creation_threshold": "1", } ) @@ -449,12 +448,17 @@ def test_remote_timeline_client_calls_started_metric( ), f"observations for {file_kind} {op_kind} did not grow monotonically: {observations}" def churn(data_pass1, data_pass2): + # overwrite the same data in place, vacuum inbetween, and + # and create image layers; then run a gc(). 
+ # this should + # - create new layers + # - delete some layers overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass1) - client.timeline_checkpoint(tenant_id, timeline_id) - client.timeline_compact(tenant_id, timeline_id) overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass2) - client.timeline_checkpoint(tenant_id, timeline_id) - client.timeline_compact(tenant_id, timeline_id) + client.timeline_checkpoint(tenant_id, timeline_id, force_image_layer_creation=True) + overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass1) + overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass2) + client.timeline_checkpoint(tenant_id, timeline_id, force_image_layer_creation=True) gc_result = client.timeline_gc(tenant_id, timeline_id, 0) print_gc_result(gc_result) assert gc_result["layers_removed"] > 0 From e8956445550be3ac9564874ad04b624313cadb14 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 27 Feb 2024 14:45:54 +0200 Subject: [PATCH 0282/1571] Show LFC statistic in EXPLAIN (#6851) ## Problem LFC has a high impact on Neon application performance, but there is no way for users to check the efficiency of its usage. ## Summary of changes Show LFC statistics in EXPLAIN ANALYZE. ## Description **Local file cache (LFC)** A layer of caching that stores frequently accessed data from the storage layer in the local memory of the Neon compute instance. This cache helps to reduce latency and improve query performance by minimizing the need to fetch data from the storage layer repeatedly. **Externalization of LFC in explain output** The EXPLAIN ANALYZE output is extended to display important counts for local file cache (LFC) hits and misses. This works for both EXPLAIN text and JSON output. **File cache: hits** This counter is incremented whenever the Postgres backend requests a page/block via the SMGR, the page is not found in shared buffers, but it is already present in the LFC. **File cache: misses** This counter is incremented whenever the Postgres backend requests a page/block via the SMGR, the page is found neither in shared buffers nor in the LFC, and it has to be retrieved from Neon storage (the pageserver).
Example (for explain text output) ```sql explain (analyze,buffers,prefetch,filecache) select count(*) from pgbench_accounts; QUERY PLAN -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Finalize Aggregate (cost=214486.94..214486.95 rows=1 width=8) (actual time=5195.378..5196.034 rows=1 loops=1) Buffers: shared hit=178875 read=143691 dirtied=128597 written=127346 Prefetch: hits=0 misses=1865 expired=0 duplicates=0 File cache: hits=141826 misses=1865 -> Gather (cost=214486.73..214486.94 rows=2 width=8) (actual time=5195.366..5196.025 rows=3 loops=1) Workers Planned: 2 Workers Launched: 2 Buffers: shared hit=178875 read=143691 dirtied=128597 written=127346 Prefetch: hits=0 misses=1865 expired=0 duplicates=0 File cache: hits=141826 misses=1865 -> Partial Aggregate (cost=213486.73..213486.74 rows=1 width=8) (actual time=5187.670..5187.670 rows=1 loops=3) Buffers: shared hit=178875 read=143691 dirtied=128597 written=127346 Prefetch: hits=0 misses=1865 expired=0 duplicates=0 File cache: hits=141826 misses=1865 -> Parallel Index Only Scan using pgbench_accounts_pkey on pgbench_accounts (cost=0.43..203003.02 rows=4193481 width=0) (actual time=0.574..4928.995 rows=3333333 loops=3) Heap Fetches: 3675286 Buffers: shared hit=178875 read=143691 dirtied=128597 written=127346 Prefetch: hits=0 misses=1865 expired=0 duplicates=0 File cache: hits=141826 misses=1865 ``` The json output uses the following keys and provides integer values for those keys: ``` ... "File Cache Hits": 141826, "File Cache Misses": 1865 ... ``` ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik --- pgxn/neon/file_cache.c | 2 ++ vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 6 +++--- 5 files changed, 8 insertions(+), 6 deletions(-) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 448b9263f3..11d6f6aec5 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -533,6 +533,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, { /* Page is not cached */ lfc_ctl->misses += 1; + pgBufferUsage.file_cache.misses += 1; LWLockRelease(lfc_lock); return false; } @@ -558,6 +559,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, { Assert(LFC_ENABLED()); lfc_ctl->hits += 1; + pgBufferUsage.file_cache.hits += 1; Assert(entry->access_count > 0); if (--entry->access_count == 0) dlist_push_tail(&lfc_ctl->lru, &entry->lru_node); diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 4cdba8ec5a..f49a962b9b 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 4cdba8ec5a3868cec4826bbb3f16c1d3d2ac2283 +Subproject commit f49a962b9b3715d6f47017d1dcf905c36f93ae5e diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 0ec04712d5..e8b9a28006 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 0ec04712d55539550278595e853c172f7aa5fe3e +Subproject commit e8b9a28006a550d7ca7cbb9bd0238eb9cd57bbd8 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index cc98378b0f..072697b225 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit cc98378b0fa7413b78a197e3292a806865e4056a +Subproject commit 072697b2250da3251af75887b577104554b9cd44 diff --git a/vendor/revisions.json b/vendor/revisions.json index 540b7ec898..1529d87bcb 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,6 +1,6 @@ { - "postgres-v16": "cc98378b0fa7413b78a197e3292a806865e4056a", - "postgres-v15": "0ec04712d55539550278595e853c172f7aa5fe3e", - "postgres-v14": "4cdba8ec5a3868cec4826bbb3f16c1d3d2ac2283" + "postgres-v16": "072697b2250da3251af75887b577104554b9cd44", + "postgres-v15": "e8b9a28006a550d7ca7cbb9bd0238eb9cd57bbd8", + "postgres-v14": "f49a962b9b3715d6f47017d1dcf905c36f93ae5e" } From 2991d01b61851273fcaea66936fccc926dd082ba Mon Sep 17 00:00:00 2001 From: Roman Zaynetdinov Date: Tue, 27 Feb 2024 15:47:05 +0200 Subject: [PATCH 0283/1571] Export connection counts from sql_exporter (#6926) ## Problem We want to show connection counts to console users. ## Summary of changes Start exporting connection counts grouped by database name and connection state. 
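Concretely, the new `connection_counts` gauge is backed by a query over `pg_stat_activity`, grouped by database and connection state. A minimal sketch of that query (matching the one added to `vm-image-spec.yaml` in the diff below):

```sql
-- Count connections per (database, state); rows with an empty state are skipped.
SELECT datname, state, count(*) AS count
FROM pg_stat_activity
WHERE state <> ''
GROUP BY datname, state;
```

sql_exporter then exposes one `connection_counts` sample per returned row, labelled with `datname` and `state`.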
--- vm-image-spec.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 5723b634d6..4520a5fc9c 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -132,6 +132,16 @@ files: query: | select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit; + - metric_name: connection_counts + type: gauge + help: 'Connection counts' + key_labels: + - datname + - state + values: [count] + query: | + select datname, state, count(*) as count from pg_stat_activity where state <> '' group by datname, state; + build: | # Build cgroup-tools # From a691786ce26c3f365c44afff5e93f7f19c439bf5 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 27 Feb 2024 16:27:13 +0200 Subject: [PATCH 0284/1571] fix: logical size calculation gating (#6915) Noticed that we are failing to handle `Result::Err` when entering a gate for logical size calculation. Audited rest of the gate enters, which seem fine, unified two instances. Noticed that the gate guard allows to remove a failpoint, then noticed that adjacent failpoint was blocking the executor thread instead of using `pausable_failpoint!`, fix both. eviction_task.rs now maintains a gate guard as well. Cc: #4733 --- pageserver/src/tenant.rs | 5 +-- .../src/tenant/secondary/heatmap_uploader.rs | 9 ++-- pageserver/src/tenant/timeline.rs | 37 ++++++---------- .../src/tenant/timeline/eviction_task.rs | 42 ++++++++++++++---- test_runner/regress/test_timeline_size.py | 44 ++++--------------- 5 files changed, 60 insertions(+), 77 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 2362f19068..c3103917ee 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3461,9 +3461,8 @@ impl Tenant { // Run each timeline's flush in a task holding the timeline's gate: this // means that if this function's future is cancelled, the Timeline shutdown // will still wait for any I/O in here to complete. - let gate = match timeline.gate.enter() { - Ok(g) => g, - Err(_) => continue, + let Ok(gate) = timeline.gate.enter() else { + continue; }; let jh = tokio::task::spawn(async move { flush_timeline(gate, timeline).await }); results.push(jh); diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs index 660459a733..147cf683ba 100644 --- a/pageserver/src/tenant/secondary/heatmap_uploader.rs +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -373,12 +373,9 @@ async fn upload_tenant_heatmap( // Ensure that Tenant::shutdown waits for any upload in flight: this is needed because otherwise // when we delete a tenant, we might race with an upload in flight and end up leaving a heatmap behind // in remote storage. 
- let _guard = match tenant.gate.enter() { - Ok(g) => g, - Err(_) => { - tracing::info!("Skipping heatmap upload for tenant which is shutting down"); - return Err(UploadHeatmapError::Cancelled); - } + let Ok(_guard) = tenant.gate.enter() else { + tracing::info!("Skipping heatmap upload for tenant which is shutting down"); + return Err(UploadHeatmapError::Cancelled); }; for (timeline_id, timeline) in timelines { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b14eafa194..d13d4dc7d4 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -33,7 +33,10 @@ use tokio::{ }; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::{bin_ser::BeSer, sync::gate::Gate}; +use utils::{ + bin_ser::BeSer, + sync::gate::{Gate, GateGuard}, +}; use std::ops::{Deref, Range}; use std::pin::pin; @@ -2288,14 +2291,17 @@ impl Timeline { // accurate relation sizes, and they do not emit consumption metrics. debug_assert!(self.tenant_shard_id.is_zero()); - let _guard = self.gate.enter(); + let guard = self + .gate + .enter() + .map_err(|_| CalculateLogicalSizeError::Cancelled)?; let self_calculation = Arc::clone(self); let mut calculation = pin!(async { let ctx = ctx.attached_child(); self_calculation - .calculate_logical_size(lsn, cause, &ctx) + .calculate_logical_size(lsn, cause, &guard, &ctx) .await }); @@ -2324,33 +2330,16 @@ impl Timeline { &self, up_to_lsn: Lsn, cause: LogicalSizeCalculationCause, + _guard: &GateGuard, ctx: &RequestContext, ) -> Result { info!( "Calculating logical size for timeline {} at {}", self.timeline_id, up_to_lsn ); - // These failpoints are used by python tests to ensure that we don't delete - // the timeline while the logical size computation is ongoing. - // The first failpoint is used to make this function pause. - // Then the python test initiates timeline delete operation in a thread. - // It waits for a few seconds, then arms the second failpoint and disables - // the first failpoint. The second failpoint prints an error if the timeline - // delete code has deleted the on-disk state while we're still running here. - // It shouldn't do that. If it does it anyway, the error will be caught - // by the test suite, highlighting the problem. - fail::fail_point!("timeline-calculate-logical-size-pause"); - fail::fail_point!("timeline-calculate-logical-size-check-dir-exists", |_| { - if !self - .conf - .timeline_path(&self.tenant_shard_id, &self.timeline_id) - .exists() - { - error!("timeline-calculate-logical-size-pre metadata file does not exist") - } - // need to return something - Ok(0) - }); + + pausable_failpoint!("timeline-calculate-logical-size-pause"); + // See if we've already done the work for initial size calculation. // This is a short-cut for timelines that are mostly unused. 
if let Some(size) = self.current_logical_size.initialized_size(up_to_lsn) { diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 127e351c14..008f9482c4 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -34,7 +34,7 @@ use crate::{ }, }; -use utils::completion; +use utils::{completion, sync::gate::GateGuard}; use super::Timeline; @@ -81,6 +81,12 @@ impl Timeline { #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))] async fn eviction_task(self: Arc, cancel: CancellationToken) { use crate::tenant::tasks::random_init_delay; + + // acquire the gate guard only once within a useful span + let Ok(guard) = self.gate.enter() else { + return; + }; + { let policy = self.get_eviction_policy(); let period = match policy { @@ -96,7 +102,9 @@ impl Timeline { let ctx = RequestContext::new(TaskKind::Eviction, DownloadBehavior::Warn); loop { let policy = self.get_eviction_policy(); - let cf = self.eviction_iteration(&policy, &cancel, &ctx).await; + let cf = self + .eviction_iteration(&policy, &cancel, &guard, &ctx) + .await; match cf { ControlFlow::Break(()) => break, @@ -117,6 +125,7 @@ impl Timeline { self: &Arc, policy: &EvictionPolicy, cancel: &CancellationToken, + gate: &GateGuard, ctx: &RequestContext, ) -> ControlFlow<(), Instant> { debug!("eviction iteration: {policy:?}"); @@ -127,14 +136,17 @@ impl Timeline { return ControlFlow::Continue(Instant::now() + Duration::from_secs(10)); } EvictionPolicy::LayerAccessThreshold(p) => { - match self.eviction_iteration_threshold(p, cancel, ctx).await { + match self + .eviction_iteration_threshold(p, cancel, gate, ctx) + .await + { ControlFlow::Break(()) => return ControlFlow::Break(()), ControlFlow::Continue(()) => (), } (p.period, p.threshold) } EvictionPolicy::OnlyImitiate(p) => { - if self.imitiate_only(p, cancel, ctx).await.is_break() { + if self.imitiate_only(p, cancel, gate, ctx).await.is_break() { return ControlFlow::Break(()); } (p.period, p.threshold) @@ -165,6 +177,7 @@ impl Timeline { self: &Arc, p: &EvictionPolicyLayerAccessThreshold, cancel: &CancellationToken, + gate: &GateGuard, ctx: &RequestContext, ) -> ControlFlow<()> { let now = SystemTime::now(); @@ -180,7 +193,7 @@ impl Timeline { _ = self.cancel.cancelled() => return ControlFlow::Break(()), }; - match self.imitate_layer_accesses(p, cancel, ctx).await { + match self.imitate_layer_accesses(p, cancel, gate, ctx).await { ControlFlow::Break(()) => return ControlFlow::Break(()), ControlFlow::Continue(()) => (), } @@ -302,6 +315,7 @@ impl Timeline { self: &Arc, p: &EvictionPolicyLayerAccessThreshold, cancel: &CancellationToken, + gate: &GateGuard, ctx: &RequestContext, ) -> ControlFlow<()> { let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit( @@ -315,7 +329,7 @@ impl Timeline { _ = self.cancel.cancelled() => return ControlFlow::Break(()), }; - self.imitate_layer_accesses(p, cancel, ctx).await + self.imitate_layer_accesses(p, cancel, gate, ctx).await } /// If we evict layers but keep cached values derived from those layers, then @@ -347,6 +361,7 @@ impl Timeline { &self, p: &EvictionPolicyLayerAccessThreshold, cancel: &CancellationToken, + gate: &GateGuard, ctx: &RequestContext, ) -> ControlFlow<()> { if !self.tenant_shard_id.is_zero() { @@ -365,7 +380,7 @@ impl Timeline { match state.last_layer_access_imitation { Some(ts) if 
ts.elapsed() < inter_imitate_period => { /* no need to run */ } _ => { - self.imitate_timeline_cached_layer_accesses(ctx).await; + self.imitate_timeline_cached_layer_accesses(gate, ctx).await; state.last_layer_access_imitation = Some(tokio::time::Instant::now()) } } @@ -405,12 +420,21 @@ impl Timeline { /// Recompute the values which would cause on-demand downloads during restart. #[instrument(skip_all)] - async fn imitate_timeline_cached_layer_accesses(&self, ctx: &RequestContext) { + async fn imitate_timeline_cached_layer_accesses( + &self, + guard: &GateGuard, + ctx: &RequestContext, + ) { let lsn = self.get_last_record_lsn(); // imitiate on-restart initial logical size let size = self - .calculate_logical_size(lsn, LogicalSizeCalculationCause::EvictionTaskImitation, ctx) + .calculate_logical_size( + lsn, + LogicalSizeCalculationCause::EvictionTaskImitation, + guard, + ctx, + ) .instrument(info_span!("calculate_logical_size")) .await; diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 0788c49c7b..327e5abe26 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -1,8 +1,6 @@ import concurrent.futures import math -import queue import random -import threading import time from contextlib import closing from pathlib import Path @@ -20,7 +18,6 @@ from fixtures.neon_fixtures import ( VanillaPostgres, wait_for_last_flush_lsn, ) -from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import ( assert_tenant_state, timeline_delete_wait_completed, @@ -331,41 +328,18 @@ def test_timeline_initial_logical_size_calculation_cancellation( assert_size_calculation_not_done() log.info( - f"try to delete the timeline using {deletion_method}, this should cancel size computation tasks and wait for them to finish" + f"delete the timeline using {deletion_method}, this should cancel size computation tasks and wait for them to finish" ) - delete_timeline_success: queue.Queue[bool] = queue.Queue(maxsize=1) - def delete_timeline_thread_fn(): - try: - if deletion_method == "tenant_detach": - client.tenant_detach(tenant_id) - elif deletion_method == "timeline_delete": - timeline_delete_wait_completed(client, tenant_id, timeline_id) - delete_timeline_success.put(True) - except PageserverApiException: - delete_timeline_success.put(False) - raise + if deletion_method == "tenant_detach": + client.tenant_detach(tenant_id) + elif deletion_method == "timeline_delete": + timeline_delete_wait_completed(client, tenant_id, timeline_id) + else: + raise RuntimeError(deletion_method) - delete_timeline_thread = threading.Thread(target=delete_timeline_thread_fn) - delete_timeline_thread.start() - # give it some time to settle in the state where it waits for size computation task - time.sleep(5) - if not delete_timeline_success.empty(): - raise AssertionError( - f"test is broken, the {deletion_method} should be stuck waiting for size computation task, got result {delete_timeline_success.get()}" - ) - - log.info( - "resume the size calculation. The failpoint checks that the timeline directory still exists." 
- ) - client.configure_failpoints(("timeline-calculate-logical-size-check-dir-exists", "return")) - client.configure_failpoints(("timeline-calculate-logical-size-pause", "off")) - - log.info("wait for delete timeline thread to finish and assert that it succeeded") - assert delete_timeline_success.get() - - # if the implementation is incorrect, the teardown would complain about an error log - # message emitted by the code behind failpoint "timeline-calculate-logical-size-check-dir-exists" + # timeline-calculate-logical-size-pause is still paused, but it doesn't + # matter because it's a pausable_failpoint, which can be cancelled by drop. def test_timeline_physical_size_init(neon_env_builder: NeonEnvBuilder): From 896d51367ecb17773677b5f845803dc3c6aa2a70 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Tue, 27 Feb 2024 19:53:02 +0400 Subject: [PATCH 0285/1571] proxy: introdice is cold start for analytics (#6902) ## Problem Data team cannot distinguish between cold start and not cold start. ## Summary of changes Report `is_cold_start` to analytics. --------- Co-authored-by: Conrad Ludgate --- proxy/src/console/messages.rs | 1 + proxy/src/context.rs | 3 ++ proxy/src/context/parquet.rs | 62 +++++++++++++++++++---------------- 3 files changed, 37 insertions(+), 29 deletions(-) diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs index 4e5920436f..1f94059f1e 100644 --- a/proxy/src/console/messages.rs +++ b/proxy/src/console/messages.rs @@ -98,6 +98,7 @@ pub struct MetricsAuxInfo { pub endpoint_id: EndpointId, pub project_id: ProjectId, pub branch_id: BranchId, + pub is_cold_start: Option, } #[cfg(test)] diff --git a/proxy/src/context.rs b/proxy/src/context.rs index e5caa5bd59..4d8ced6f8f 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -40,6 +40,7 @@ pub struct RequestMonitoring { error_kind: Option, pub(crate) auth_method: Option, success: bool, + is_cold_start: Option, // extra // This sender is here to keep the request monitoring channel open while requests are taking place. @@ -79,6 +80,7 @@ impl RequestMonitoring { error_kind: None, auth_method: None, success: false, + is_cold_start: None, sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()), latency_timer: LatencyTimer::new(protocol), @@ -102,6 +104,7 @@ impl RequestMonitoring { self.branch = Some(x.branch_id); self.endpoint_id = Some(x.endpoint_id); self.project = Some(x.project_id); + self.is_cold_start = x.is_cold_start; } pub fn set_project_id(&mut self, project_id: ProjectId) { diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index d941445c2d..54f51604bf 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -92,6 +92,8 @@ struct RequestData { /// Success is counted if we form a HTTP response with sql rows inside /// Or if we make it to proxy_pass success: bool, + /// Indicates if the cplane started the new compute node for this request. 
+ is_cold_start: Option, /// Tracks time from session start (HTTP request/libpq TCP handshake) /// Through to success/failure duration_us: u64, @@ -119,6 +121,7 @@ impl From for RequestData { region: value.region, error: value.error_kind.as_ref().map(|e| e.to_metric_label()), success: value.success, + is_cold_start: value.is_cold_start, duration_us: SystemTime::from(value.first_packet) .elapsed() .unwrap_or_default() @@ -452,6 +455,7 @@ mod tests { region: "us-east-1", error: None, success: rng.gen(), + is_cold_start: Some(true), duration_us: rng.gen_range(0..30_000_000), } } @@ -521,15 +525,15 @@ mod tests { assert_eq!( file_stats, [ - (1313727, 3, 6000), - (1313720, 3, 6000), - (1313780, 3, 6000), - (1313737, 3, 6000), - (1313867, 3, 6000), - (1313709, 3, 6000), - (1313501, 3, 6000), - (1313737, 3, 6000), - (438118, 1, 2000) + (1315032, 3, 6000), + (1315025, 3, 6000), + (1315085, 3, 6000), + (1315042, 3, 6000), + (1315172, 3, 6000), + (1315014, 3, 6000), + (1314806, 3, 6000), + (1315042, 3, 6000), + (438563, 1, 2000) ], ); @@ -559,11 +563,11 @@ mod tests { assert_eq!( file_stats, [ - (1219459, 5, 10000), - (1225609, 5, 10000), - (1227403, 5, 10000), - (1226765, 5, 10000), - (1218043, 5, 10000) + (1220433, 5, 10000), + (1226583, 5, 10000), + (1228377, 5, 10000), + (1227739, 5, 10000), + (1219017, 5, 10000) ], ); @@ -595,11 +599,11 @@ mod tests { assert_eq!( file_stats, [ - (1205106, 5, 10000), - (1204837, 5, 10000), - (1205130, 5, 10000), - (1205118, 5, 10000), - (1205373, 5, 10000) + (1206080, 5, 10000), + (1205811, 5, 10000), + (1206104, 5, 10000), + (1206092, 5, 10000), + (1206347, 5, 10000) ], ); @@ -624,15 +628,15 @@ mod tests { assert_eq!( file_stats, [ - (1313727, 3, 6000), - (1313720, 3, 6000), - (1313780, 3, 6000), - (1313737, 3, 6000), - (1313867, 3, 6000), - (1313709, 3, 6000), - (1313501, 3, 6000), - (1313737, 3, 6000), - (438118, 1, 2000) + (1315032, 3, 6000), + (1315025, 3, 6000), + (1315085, 3, 6000), + (1315042, 3, 6000), + (1315172, 3, 6000), + (1315014, 3, 6000), + (1314806, 3, 6000), + (1315042, 3, 6000), + (438563, 1, 2000) ], ); @@ -669,7 +673,7 @@ mod tests { // files are smaller than the size threshold, but they took too long to fill so were flushed early assert_eq!( file_stats, - [(658383, 2, 3001), (658097, 2, 3000), (657893, 2, 2999)], + [(659129, 2, 3001), (658842, 2, 3000), (658638, 2, 2999)], ); tmpdir.close().unwrap(); From c8ac4c054e3705514415bef26860e33273878d1b Mon Sep 17 00:00:00 2001 From: siegerts Date: Tue, 27 Feb 2024 11:08:43 -0500 Subject: [PATCH 0286/1571] readme: Update Neon link URL (#6918) ## Problem ## Summary of changes Updates the neon.tech link to point to a /github page in order to correctly attribute visits originating from the repo. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 72a924fe9e..ce14a32a2a 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ Neon is a serverless open-source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes the PostgreSQL storage layer by redistributing data across a cluster of nodes. ## Quick start -Try the [Neon Free Tier](https://neon.tech) to create a serverless Postgres instance. Then connect to it with your preferred Postgres client (psql, dbeaver, etc) or use the online [SQL Editor](https://neon.tech/docs/get-started-with-neon/query-with-neon-sql-editor/). See [Connect from any application](https://neon.tech/docs/connect/connect-from-any-app/) for connection instructions. 
+Try the [Neon Free Tier](https://neon.tech/github) to create a serverless Postgres instance. Then connect to it with your preferred Postgres client (psql, dbeaver, etc) or use the online [SQL Editor](https://neon.tech/docs/get-started-with-neon/query-with-neon-sql-editor/). See [Connect from any application](https://neon.tech/docs/connect/connect-from-any-app/) for connection instructions.

 Alternatively, compile and run the project [locally](#running-local-installation).

From 045bc6af8bae53305cf30771faa2d8478299868b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?=
Date: Tue, 27 Feb 2024 17:15:46 +0100
Subject: [PATCH 0287/1571] Add new compaction abstraction, simulator, and
 implementation. (#6830)

Rebased version of #5234, part of #6768

This consists of three parts:

1. A refactoring and new contract for implementing and testing
compaction. The logic is now in a separate crate, with no dependency on
the 'pageserver' crate. It defines an interface that the real
pageserver must implement in order to call the compaction algorithm.
The interface models things like delta and image layers, but just the
parts that the compaction algorithm needs to make decisions. That makes
it easier to unit test the algorithm and experiment with different
implementations.

I did not convert the current code to the new abstraction, however.
When the compaction algorithm is set to "Legacy", we just use the old
code. It might be worthwhile to convert the old code to the new
abstraction, so that we can compare the behavior of the new algorithm
against the old one, using the same simulated cases. If we do that, we
have to be careful that the converted code really is equivalent to the
old code.

This includes only trivial changes to the main pageserver code. All the
new code is behind a tenant config option. So this should be pretty
safe to merge, even if the new implementation is buggy, as long as we
don't enable it.

2. A new compaction algorithm, implemented using the new abstraction.
The new algorithm is tiered compaction. It is inspired by the PoC at PR
#4539, although I did not use that code directly, as I needed the new
implementation to fit the new abstraction. The algorithm here is less
advanced; I did not implement partial image layers, for example. I
wanted to keep it simple on purpose, so that as we add bells and
whistles, we can see the effects using the included simulator.

One difference from #4539 and your typical LSM tree implementations is
how we keep track of the LSM tree levels. This PR doesn't have a
permanent concept of a level, tier or sorted run at all. There are just
delta and image layers. However, when compaction starts, we look at the
layers that exist, and arrange them into levels, depending on their
shapes. That is ephemeral: when the compaction finishes, we forget that
information. This allows the new algorithm to work without any extra
bookkeeping. That makes it easier to transition from the old algorithm
to the new one, and back again.

There is just a new tenant config option to choose the compaction
algorithm. The default is "Legacy", meaning the current algorithm in
'main'. If you set it to "Tiered", the new algorithm is used.

3. A simulator, which implements the new abstraction. The simulator can
be used to analyze write and storage amplification, without running a
test with the full pageserver. It can also draw an SVG animation of the
simulation, to visualize how layers are created and deleted.
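For illustration only (not part of the patch), the simulator can also be
driven directly from Rust. The sketch below mirrors what the `simulate`
subcommand does for a uniform key distribution; it assumes
`pageserver_compaction::simulator::MockTimeline` and the methods used by the
`compaction-simulator` binary in this commit are reachable from outside the
crate, and the workload sizes are made-up numbers:

```
// Sketch, not part of the patch: drive the compaction simulator from Rust.
// Assumes MockTimeline is public, as used by the compaction-simulator binary.
use pageserver_compaction::simulator::MockTimeline;

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let mut timeline = MockTimeline::new();

    // Illustrative sizing: ~1 GiB of logical keyspace, 8 KiB pages.
    let key_range = 0u64..(1024 * 1024 * 1024 / 8192);

    // Ingest 100k records of ~1 KB each, compacting whenever needed,
    // like the uniform-distribution workload in the simulate command.
    for _ in 0..100_000 {
        timeline.ingest_uniform(1, 1_000, &key_range)?;
        timeline.compact_if_needed().await?;
    }
    timeline.flush_l0();
    timeline.compact_if_needed().await?;

    // Print write/storage amplification stats and render the layer history.
    print!("{}", timeline.stats()?);
    timeline.draw_history(std::fs::File::create("compaction-animation.html")?)?;
    Ok(())
}
```

Separately, since `CompactionAlgorithm` is declared with
`#[serde(tag = "kind")]` and parsed with `serde_json` in `control_plane`, the
new `compaction_algorithm` tenant config option is presumably set with a JSON
value such as `{"kind": "Tiered"}` (or `{"kind": "Legacy"}`, the default).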
To run the simulator: cargo run --bin compaction-simulator run-suite --------- Co-authored-by: Heikki Linnakangas --- .gitignore | 1 + Cargo.lock | 48 + Cargo.toml | 2 + control_plane/src/pageserver.rs | 10 + libs/pageserver_api/src/keyspace.rs | 1 + libs/pageserver_api/src/models.rs | 9 + pageserver/Cargo.toml | 1 + pageserver/compaction/Cargo.toml | 54 ++ pageserver/compaction/TODO.md | 51 ++ .../src/bin/compaction-simulator.rs | 214 +++++ pageserver/compaction/src/compact_tiered.rs | 866 ++++++++++++++++++ pageserver/compaction/src/helpers.rs | 243 +++++ pageserver/compaction/src/identify_levels.rs | 376 ++++++++ pageserver/compaction/src/interface.rs | 167 ++++ pageserver/compaction/src/lib.rs | 12 + pageserver/compaction/src/simulator.rs | 613 +++++++++++++ pageserver/compaction/src/simulator/draw.rs | 411 +++++++++ pageserver/compaction/tests/tests.rs | 35 + pageserver/src/consumption_metrics.rs | 2 +- pageserver/src/tenant.rs | 1 + pageserver/src/tenant/config.rs | 17 + .../src/tenant/storage_layer/delta_layer.rs | 12 + pageserver/src/tenant/timeline.rs | 75 +- pageserver/src/tenant/timeline/compaction.rs | 477 ++++++++++ .../regress/test_attach_tenant_config.py | 3 + 25 files changed, 3687 insertions(+), 14 deletions(-) create mode 100644 pageserver/compaction/Cargo.toml create mode 100644 pageserver/compaction/TODO.md create mode 100644 pageserver/compaction/src/bin/compaction-simulator.rs create mode 100644 pageserver/compaction/src/compact_tiered.rs create mode 100644 pageserver/compaction/src/helpers.rs create mode 100644 pageserver/compaction/src/identify_levels.rs create mode 100644 pageserver/compaction/src/interface.rs create mode 100644 pageserver/compaction/src/lib.rs create mode 100644 pageserver/compaction/src/simulator.rs create mode 100644 pageserver/compaction/src/simulator/draw.rs create mode 100644 pageserver/compaction/tests/tests.rs create mode 100644 pageserver/src/tenant/timeline/compaction.rs diff --git a/.gitignore b/.gitignore index 3f4495c9e7..2c38cdcc59 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ test_output/ neon.iml /.neon /integration_tests/.neon +compaction-suite-results.* # Coverage *.profraw diff --git a/Cargo.lock b/Cargo.lock index abb335e97c..dead212156 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3498,6 +3498,7 @@ dependencies = [ "num_cpus", "once_cell", "pageserver_api", + "pageserver_compaction", "pin-project-lite", "postgres", "postgres-protocol", @@ -3588,6 +3589,53 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "pageserver_compaction" +version = "0.1.0" +dependencies = [ + "anyhow", + "async-compression", + "async-stream", + "async-trait", + "byteorder", + "bytes", + "chrono", + "clap", + "const_format", + "consumption_metrics", + "criterion", + "crossbeam-utils", + "either", + "fail", + "flate2", + "futures", + "git-version", + "hex", + "hex-literal", + "humantime", + "humantime-serde", + "itertools", + "metrics", + "once_cell", + "pageserver_api", + "pin-project-lite", + "rand 0.8.5", + "smallvec", + "svg_fmt", + "sync_wrapper", + "thiserror", + "tokio", + "tokio-io-timeout", + "tokio-util", + "tracing", + "tracing-error", + "tracing-subscriber", + "url", + "utils", + "walkdir", + "workspace_hack", +] + [[package]] name = "parking" version = "2.1.1" diff --git a/Cargo.toml b/Cargo.toml index 98fbc9c4f4..90b02b30ec 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,6 +5,7 @@ members = [ "control_plane", "control_plane/attachment_service", "pageserver", + "pageserver/compaction", "pageserver/ctl", "pageserver/client", 
"pageserver/pagebench", @@ -199,6 +200,7 @@ consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" } metrics = { version = "0.1", path = "./libs/metrics/" } pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" } pageserver_client = { path = "./pageserver/client" } +pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" } postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" } postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" } postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" } diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 2c5cac327a..59cd4789a8 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -352,6 +352,11 @@ impl PageServerNode { .remove("compaction_threshold") .map(|x| x.parse::()) .transpose()?, + compaction_algorithm: settings + .remove("compaction_algorithm") + .map(serde_json::from_str) + .transpose() + .context("Failed to parse 'compaction_algorithm' json")?, gc_horizon: settings .remove("gc_horizon") .map(|x| x.parse::()) @@ -455,6 +460,11 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'compaction_threshold' as an integer")?, + compaction_algorithm: settings + .remove("compactin_algorithm") + .map(serde_json::from_str) + .transpose() + .context("Failed to parse 'compaction_algorithm' json")?, gc_horizon: settings .remove("gc_horizon") .map(|x| x.parse::()) diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index 443ffdcf03..05fa4562e1 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -307,6 +307,7 @@ impl KeySpaceRandomAccum { } } +#[inline(always)] pub fn key_range_size(key_range: &Range) -> u32 { let start = key_range.start; let end = key_range.end; diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index ce9afd65ac..61aa8a5ae8 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -272,6 +272,8 @@ pub struct TenantConfig { pub compaction_target_size: Option, pub compaction_period: Option, pub compaction_threshold: Option, + // defer parsing compaction_algorithm, like eviction_policy + pub compaction_algorithm: Option, pub gc_horizon: Option, pub gc_period: Option, pub image_creation_threshold: Option, @@ -306,6 +308,13 @@ impl EvictionPolicy { } } +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(tag = "kind")] +pub enum CompactionAlgorithm { + Legacy, + Tiered, +} + #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] pub struct EvictionPolicyLayerAccessThreshold { #[serde(with = "humantime_serde")] diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index eeee2055c2..5adeaffe1a 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -73,6 +73,7 @@ url.workspace = true walkdir.workspace = true metrics.workspace = true pageserver_api.workspace = true +pageserver_compaction.workspace = true postgres_connection.workspace = true postgres_ffi.workspace = true pq_proto.workspace = true diff --git a/pageserver/compaction/Cargo.toml b/pageserver/compaction/Cargo.toml new file mode 100644 index 0000000000..47f318db63 --- /dev/null +++ b/pageserver/compaction/Cargo.toml @@ -0,0 +1,54 @@ +[package] +name = "pageserver_compaction" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[features] +default = [] + +[dependencies] 
+anyhow.workspace = true +async-compression.workspace = true +async-stream.workspace = true +async-trait.workspace = true +byteorder.workspace = true +bytes.workspace = true +chrono = { workspace = true, features = ["serde"] } +clap = { workspace = true, features = ["string"] } +const_format.workspace = true +consumption_metrics.workspace = true +crossbeam-utils.workspace = true +either.workspace = true +flate2.workspace = true +fail.workspace = true +futures.workspace = true +git-version.workspace = true +hex.workspace = true +humantime.workspace = true +humantime-serde.workspace = true +itertools.workspace = true +once_cell.workspace = true +pageserver_api.workspace = true +pin-project-lite.workspace = true +rand.workspace = true +smallvec = { workspace = true, features = ["write"] } +svg_fmt.workspace = true +sync_wrapper.workspace = true +thiserror.workspace = true +tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] } +tokio-io-timeout.workspace = true +tokio-util.workspace = true +tracing.workspace = true +tracing-error.workspace = true +tracing-subscriber.workspace = true +url.workspace = true +walkdir.workspace = true +metrics.workspace = true +utils.workspace = true +workspace_hack.workspace = true + +[dev-dependencies] +criterion.workspace = true +hex-literal.workspace = true +tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] } diff --git a/pageserver/compaction/TODO.md b/pageserver/compaction/TODO.md new file mode 100644 index 0000000000..85523ad5b3 --- /dev/null +++ b/pageserver/compaction/TODO.md @@ -0,0 +1,51 @@ +# TODO + +- If the key space can be perfectly partitioned at some key, perform planning on each + partition separately. For example, if we are compacting a level with layers like this: + + ``` + : + +--+ +----+ : +------+ + | | | | : | | + +--+ +----+ : +------+ + : + +-----+ +-+ : +--------+ + | | | | : | | + +-----+ +-+ : +--------+ + : + ``` + + At the dotted line, there is a natural split in the key space, such that all + layers are either on the left or the right of it. We can compact the + partitions separately. We could choose to create image layers for one + partition but not the other one, for example. + +- All the layers don't have to be exactly the same size, we can choose to cut a + layer short or stretch it a little larger than the target size, if it helps + the overall system. We can help perfect partitions (see previous bullet point) + to happen more frequently, by choosing the cut points wisely. For example, try + to cut layers at boundaries of underlying image layers. And "snap to grid", + i.e. don't cut layers at any key, but e.g. only when key % 10000 = 0. + +- Avoid rewriting layers when we'd just create an identical layer to an input + layer. + +- Parallelism. The code is already split up into planning and execution, so that + we first split up the compaction work into "Jobs", and then execute them. + It would be straightforward to execute multiple jobs in parallel. + +- Materialize extra pages in delta layers during compaction. This would reduce + read amplification. There has been the idea of partial image layers. Materializing + extra pages in the delta layers achieve the same goal, without introducing a new + concept. + +## Simulator + +- Expand the simulator for more workloads +- Automate a test suite that runs the simluator with different workloads and + spits out a table of results +- Model read amplification +- More sanity checking. 
One idea is to keep a reference count of each + MockRecord, i.e. use Arc instead of plain MockRecord, and panic if + a MockRecord that is newer than PITR horizon is completely dropped. That would + indicate that the record was lost. diff --git a/pageserver/compaction/src/bin/compaction-simulator.rs b/pageserver/compaction/src/bin/compaction-simulator.rs new file mode 100644 index 0000000000..1fd69407d3 --- /dev/null +++ b/pageserver/compaction/src/bin/compaction-simulator.rs @@ -0,0 +1,214 @@ +use clap::{Parser, Subcommand}; +use pageserver_compaction::simulator::MockTimeline; +use rand::Rng; +use std::io::Write; +use std::path::{Path, PathBuf}; +use std::sync::OnceLock; + +use utils::project_git_version; + +project_git_version!(GIT_VERSION); + +#[derive(Parser)] +#[command( + version = GIT_VERSION, + about = "Neon Pageserver compaction simulator", + long_about = "A developer tool to visualize and test compaction" +)] +#[command(propagate_version = true)] +struct CliOpts { + #[command(subcommand)] + command: Commands, +} + +#[derive(Subcommand)] +enum Commands { + RunSuite, + Simulate(SimulateCmd), +} + +#[derive(Clone, clap::ValueEnum)] +enum Distribution { + Uniform, + HotCold, +} + +/// Read and update pageserver metadata file +#[derive(Parser)] +struct SimulateCmd { + distribution: Distribution, + + /// Number of records to digest + num_records: u64, + /// Record length + record_len: u64, + + // Logical database size in MB + logical_size: u64, +} + +async fn simulate(cmd: &SimulateCmd, results_path: &Path) -> anyhow::Result<()> { + let mut executor = MockTimeline::new(); + + // Convert the logical size in MB into a key range. + let key_range = 0..((cmd.logical_size * 1024 * 1024) / 8192); + //let key_range = u64::MIN..u64::MAX; + println!( + "starting simulation with key range {:016X}-{:016X}", + key_range.start, key_range.end + ); + + // helper function to print progress indicator + let print_progress = |i| -> anyhow::Result<()> { + if i == 0 || (i + 1) % 10000 == 0 || i == cmd.num_records - 1 { + print!( + "\ringested {} / {} records, {} MiB / {} MiB...", + i + 1, + cmd.num_records, + (i + 1) * cmd.record_len / (1_000_000), + cmd.num_records * cmd.record_len / (1_000_000), + ); + std::io::stdout().flush()?; + } + Ok(()) + }; + + match cmd.distribution { + Distribution::Uniform => { + for i in 0..cmd.num_records { + executor.ingest_uniform(1, cmd.record_len, &key_range)?; + executor.compact_if_needed().await?; + + print_progress(i)?; + } + } + Distribution::HotCold => { + let splitpoint = key_range.start + (key_range.end - key_range.start) / 10; + let hot_key_range = 0..splitpoint; + let cold_key_range = splitpoint..key_range.end; + + for i in 0..cmd.num_records { + let chosen_range = if rand::thread_rng().gen_bool(0.9) { + &hot_key_range + } else { + &cold_key_range + }; + executor.ingest_uniform(1, cmd.record_len, chosen_range)?; + executor.compact_if_needed().await?; + + print_progress(i)?; + } + } + } + println!("done!"); + executor.flush_l0(); + executor.compact_if_needed().await?; + let stats = executor.stats()?; + + // Print the stats to stdout, and also to a file + print!("{stats}"); + std::fs::write(results_path.join("stats.txt"), stats)?; + + let animation_path = results_path.join("compaction-animation.html"); + executor.draw_history(std::fs::File::create(&animation_path)?)?; + println!( + "animation: file://{}", + animation_path.canonicalize()?.display() + ); + + Ok(()) +} + +async fn run_suite_cmd(results_path: &Path, workload: &SimulateCmd) -> anyhow::Result<()> { + 
std::fs::create_dir(results_path)?; + + set_log_file(File::create(results_path.join("log"))?); + let result = simulate(workload, results_path).await; + set_log_stdout(); + result +} + +async fn run_suite() -> anyhow::Result<()> { + let top_results_path = PathBuf::from(format!( + "compaction-suite-results.{}", + std::time::SystemTime::UNIX_EPOCH.elapsed()?.as_secs() + )); + std::fs::create_dir(&top_results_path)?; + + let workload = SimulateCmd { + distribution: Distribution::Uniform, + // Generate 20 GB of WAL + record_len: 1_000, + num_records: 20_000_000, + // Logical size 5 GB + logical_size: 5_000, + }; + + run_suite_cmd(&top_results_path.join("uniform-20GB-5GB"), &workload).await?; + + println!( + "All tests finished. Results in {}", + top_results_path.display() + ); + Ok(()) +} + +use std::fs::File; +use std::io::Stdout; +use std::sync::Mutex; +use tracing_subscriber::fmt::writer::EitherWriter; +use tracing_subscriber::fmt::MakeWriter; + +static LOG_FILE: OnceLock>> = OnceLock::new(); +fn get_log_output() -> &'static Mutex> { + LOG_FILE.get_or_init(|| std::sync::Mutex::new(EitherWriter::B(std::io::stdout()))) +} + +fn set_log_file(f: File) { + *get_log_output().lock().unwrap() = EitherWriter::A(f); +} + +fn set_log_stdout() { + *get_log_output().lock().unwrap() = EitherWriter::B(std::io::stdout()); +} + +fn init_logging() -> anyhow::Result<()> { + // We fall back to printing all spans at info-level or above if + // the RUST_LOG environment variable is not set. + let rust_log_env_filter = || { + tracing_subscriber::EnvFilter::try_from_default_env() + .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info")) + }; + + // NB: the order of the with() calls does not matter. + // See https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering + use tracing_subscriber::prelude::*; + tracing_subscriber::registry() + .with({ + let log_layer = tracing_subscriber::fmt::layer() + .with_target(false) + .with_ansi(false) + .with_writer(|| get_log_output().make_writer()); + log_layer.with_filter(rust_log_env_filter()) + }) + .init(); + + Ok(()) +} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + let cli = CliOpts::parse(); + + init_logging()?; + + match cli.command { + Commands::Simulate(cmd) => { + simulate(&cmd, &PathBuf::from("/tmp/compactions.html")).await?; + } + Commands::RunSuite => { + run_suite().await?; + } + }; + Ok(()) +} diff --git a/pageserver/compaction/src/compact_tiered.rs b/pageserver/compaction/src/compact_tiered.rs new file mode 100644 index 0000000000..52219a014c --- /dev/null +++ b/pageserver/compaction/src/compact_tiered.rs @@ -0,0 +1,866 @@ +//! # Tiered compaction algorithm. +//! +//! Read all the input delta files, and write a new set of delta files that +//! include all the input WAL records. See retile_deltas(). +//! +//! In a "normal" LSM tree, you get to remove any values that are overwritten by +//! later values, but in our system, we keep all the history. So the reshuffling +//! doesn't remove any garbage, it just reshuffles the records to reduce read +//! amplification, i.e. the number of files that you need to access to find the +//! WAL records for a given key. +//! +//! If the new delta files would be very "narrow", i.e. each file would cover +//! only a narrow key range, then we create a new set of image files +//! instead. The current threshold is that if the estimated total size of the +//! image layers is smaller than the size of the deltas, then we create image +//! layers. 
That amounts to 2x storage amplification, and it means that the +//! distance of image layers in LSN dimension is roughly equal to the logical +//! database size. For example, if the logical database size is 10 GB, we would +//! generate new image layers every 10 GB of WAL. +use futures::StreamExt; +use tracing::{debug, info}; + +use std::collections::{HashSet, VecDeque}; +use std::ops::Range; + +use crate::helpers::{accum_key_values, keyspace_total_size, merge_delta_keys, overlaps_with}; +use crate::interface::*; +use utils::lsn::Lsn; + +use crate::identify_levels::identify_level; + +/// Main entry point to compaction. +/// +/// The starting point is a cutoff LSN (`end_lsn`). The compaction is run on +/// everything below that point, that needs compaction. The cutoff LSN must +/// partition the layers so that there are no layers that span across that +/// LSN. To start compaction at the top of the tree, pass the end LSN of the +/// written last L0 layer. +pub async fn compact_tiered( + executor: &mut E, + end_lsn: Lsn, + target_file_size: u64, + fanout: u64, + ctx: &E::RequestContext, +) -> anyhow::Result<()> { + assert!(fanout >= 2); + // Start at L0 + let mut current_level_no = 0; + let mut current_level_target_height = target_file_size; + loop { + // end LSN +1 to include possible image layers exactly at 'end_lsn'. + let all_layers = executor + .get_layers( + &(E::Key::MIN..E::Key::MAX), + &(Lsn(u64::MIN)..end_lsn + 1), + ctx, + ) + .await?; + info!( + "Compacting L{}, total # of layers: {}", + current_level_no, + all_layers.len() + ); + + // Identify the range of LSNs that belong to this level. We assume that + // each file in this level span an LSN range up to 1.75x target file + // size. That should give us enough slop that if we created a slightly + // oversized L0 layer, e.g. because flushing the in-memory layer was + // delayed for some reason, we don't consider the oversized layer to + // belong to L1. But not too much slop, that we don't accidentally + // "skip" levels. + let max_height = (current_level_target_height as f64 * 1.75) as u64; + let Some(level) = identify_level(all_layers, end_lsn, max_height).await? else { + break; + }; + + // Calculate the height of this level. If the # of tiers exceeds the + // fanout parameter, it's time to compact it. 
+ let depth = level.depth(); + info!( + "Level {} identified as LSN range {}-{}: depth {}", + current_level_no, level.lsn_range.start, level.lsn_range.end, depth + ); + for l in &level.layers { + debug!("LEVEL {} layer: {}", current_level_no, l.short_id()); + } + if depth < fanout { + debug!( + level = current_level_no, + depth = depth, + fanout, + "too few deltas to compact" + ); + break; + } + + compact_level( + &level.lsn_range, + &level.layers, + executor, + target_file_size, + ctx, + ) + .await?; + if target_file_size == u64::MAX { + break; + } + current_level_no += 1; + current_level_target_height = current_level_target_height.saturating_mul(fanout); + } + Ok(()) +} + +async fn compact_level( + lsn_range: &Range, + layers: &[E::Layer], + executor: &mut E, + target_file_size: u64, + ctx: &E::RequestContext, +) -> anyhow::Result { + let mut layer_fragments = Vec::new(); + for l in layers { + layer_fragments.push(LayerFragment::new(l.clone())); + } + + let mut state = LevelCompactionState { + target_file_size, + _lsn_range: lsn_range.clone(), + layers: layer_fragments, + jobs: Vec::new(), + job_queue: Vec::new(), + next_level: false, + executor, + }; + + let first_job = CompactionJob { + key_range: E::Key::MIN..E::Key::MAX, + lsn_range: lsn_range.clone(), + strategy: CompactionStrategy::Divide, + input_layers: state + .layers + .iter() + .enumerate() + .map(|i| LayerId(i.0)) + .collect(), + completed: false, + }; + + state.jobs.push(first_job); + state.job_queue.push(JobId(0)); + state.execute(ctx).await?; + + info!( + "compaction completed! Need to process next level: {}", + state.next_level + ); + + Ok(state.next_level) +} + +/// Blackboard that keeps track of the state of all the jobs and work remaining +struct LevelCompactionState<'a, E> +where + E: CompactionJobExecutor, +{ + // parameters + target_file_size: u64, + + _lsn_range: Range, + layers: Vec>, + + // job queue + jobs: Vec>, + job_queue: Vec, + + /// If false, no need to compact levels below this + next_level: bool, + + /// Interface to the outside world + executor: &'a mut E, +} + +#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] +struct LayerId(usize); +#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] +struct JobId(usize); + +struct PendingJobSet { + pending: HashSet, + completed: HashSet, +} + +impl PendingJobSet { + fn new() -> Self { + PendingJobSet { + pending: HashSet::new(), + completed: HashSet::new(), + } + } + + fn complete_job(&mut self, job_id: JobId) { + self.pending.remove(&job_id); + self.completed.insert(job_id); + } + + fn all_completed(&self) -> bool { + self.pending.is_empty() + } +} + +// When we decide to rewrite a set of layers, LayerFragment is used to keep +// track which new layers supersede an old layer. When all the stakeholder jobs +// have completed, this layer can be deleted. +struct LayerFragment +where + E: CompactionJobExecutor, +{ + layer: E::Layer, + + // If we will write new layers to replace this one, this keeps track of the + // jobs that need to complete before this layer can be deleted. As the jobs + // complete, they are moved from 'pending' to 'completed' set. Once the + // 'pending' set becomes empty, the layer can be deleted. + // + // If None, this layer is not rewritten and must not be deleted. 
+ deletable_after: Option, + + deleted: bool, +} + +impl LayerFragment +where + E: CompactionJobExecutor, +{ + fn new(layer: E::Layer) -> Self { + LayerFragment { + layer, + deletable_after: None, + deleted: false, + } + } +} + +#[derive(PartialEq)] +enum CompactionStrategy { + Divide, + CreateDelta, + CreateImage, +} + +#[allow(dead_code)] // Todo +struct CompactionJob { + key_range: Range, + lsn_range: Range, + + strategy: CompactionStrategy, + + input_layers: Vec, + + completed: bool, +} + +impl<'a, E> LevelCompactionState<'a, E> +where + E: CompactionJobExecutor, +{ + /// Main loop of the executor. + /// + /// In each iteration, we take the next job from the queue, and execute it. + /// The execution might add new jobs to the queue. Keep going until the + /// queue is empty. + /// + /// Initially, the job queue consists of one Divide job over the whole + /// level. On first call, it is divided into smaller jobs. + async fn execute(&mut self, ctx: &E::RequestContext) -> anyhow::Result<()> { + // TODO: this would be pretty straightforward to parallelize with FuturesUnordered + while let Some(next_job_id) = self.job_queue.pop() { + info!("executing job {}", next_job_id.0); + self.execute_job(next_job_id, ctx).await?; + } + + // all done! + Ok(()) + } + + async fn execute_job(&mut self, job_id: JobId, ctx: &E::RequestContext) -> anyhow::Result<()> { + let job = &self.jobs[job_id.0]; + match job.strategy { + CompactionStrategy::Divide => { + self.divide_job(job_id, ctx).await?; + Ok(()) + } + CompactionStrategy::CreateDelta => { + let mut deltas: Vec = Vec::new(); + let mut layer_ids: Vec = Vec::new(); + for layer_id in &job.input_layers { + let layer = &self.layers[layer_id.0].layer; + if let Some(dl) = self.executor.downcast_delta_layer(layer).await? { + deltas.push(dl.clone()); + layer_ids.push(*layer_id); + } + } + + self.executor + .create_delta(&job.lsn_range, &job.key_range, &deltas, ctx) + .await?; + self.jobs[job_id.0].completed = true; + + // did we complete any fragments? + for layer_id in layer_ids { + let l = &mut self.layers[layer_id.0]; + if let Some(deletable_after) = l.deletable_after.as_mut() { + deletable_after.complete_job(job_id); + if deletable_after.all_completed() { + self.executor.delete_layer(&l.layer, ctx).await?; + l.deleted = true; + } + } + } + + self.next_level = true; + + Ok(()) + } + CompactionStrategy::CreateImage => { + self.executor + .create_image(job.lsn_range.end, &job.key_range, ctx) + .await?; + self.jobs[job_id.0].completed = true; + + // TODO: we could check if any layers < PITR horizon became deletable + Ok(()) + } + } + } + + fn push_job(&mut self, job: CompactionJob) -> JobId { + let job_id = JobId(self.jobs.len()); + self.jobs.push(job); + self.job_queue.push(job_id); + job_id + } + + /// Take a partition of the key space, and decide how to compact it. + /// + /// TODO: Currently, this is called exactly once for the level, and we + /// decide whether to create new image layers to cover the whole level, or + /// write a new set of delta. In the future, this should try to partition + /// the key space, and make the decision separately for each partition. 
+ async fn divide_job(&mut self, job_id: JobId, ctx: &E::RequestContext) -> anyhow::Result<()> { + let job = &self.jobs[job_id.0]; + assert!(job.strategy == CompactionStrategy::Divide); + + // Check for dummy cases + if job.input_layers.is_empty() { + return Ok(()); + } + + let job = &self.jobs[job_id.0]; + assert!(job.strategy == CompactionStrategy::Divide); + + // Would it be better to create images for this partition? + // Decide based on the average density of the level + let keyspace_size = keyspace_total_size( + &self + .executor + .get_keyspace(&job.key_range, job.lsn_range.end, ctx) + .await?, + ) * 8192; + + let wal_size = job + .input_layers + .iter() + .filter(|layer_id| self.layers[layer_id.0].layer.is_delta()) + .map(|layer_id| self.layers[layer_id.0].layer.file_size()) + .sum::(); + if keyspace_size < wal_size { + // seems worth it + info!( + "covering with images, because keyspace_size is {}, size of deltas between {}-{} is {}", + keyspace_size, job.lsn_range.start, job.lsn_range.end, wal_size + ); + self.cover_with_images(job_id, ctx).await + } else { + // do deltas + info!( + "coverage not worth it, keyspace_size {}, wal_size {}", + keyspace_size, wal_size + ); + self.retile_deltas(job_id, ctx).await + } + } + + // LSN + // ^ + // | + // | ###|###|##### + // | +--+-----+--+ +--+-----+--+ + // | | | | | | | | | + // | +--+--+--+--+ +--+--+--+--+ + // | | | | | | | + // | +---+-+-+---+ ==> +---+-+-+---+ + // | | | | | | | | | + // | +---+-+-++--+ +---+-+-++--+ + // | | | | | | | | | + // | +-----+--+--+ +-----+--+--+ + // | + // +--------------> key + // + async fn cover_with_images( + &mut self, + job_id: JobId, + ctx: &E::RequestContext, + ) -> anyhow::Result<()> { + let job = &self.jobs[job_id.0]; + assert!(job.strategy == CompactionStrategy::Divide); + + // XXX: do we still need the "holes" stuff? + + let mut new_jobs = Vec::new(); + + // Slide a window through the keyspace + let keyspace = self + .executor + .get_keyspace(&job.key_range, job.lsn_range.end, ctx) + .await?; + + let mut window = KeyspaceWindow::new( + E::Key::MIN..E::Key::MAX, + keyspace, + self.target_file_size / 8192, + ); + while let Some(key_range) = window.choose_next_image() { + new_jobs.push(CompactionJob:: { + key_range, + lsn_range: job.lsn_range.clone(), + strategy: CompactionStrategy::CreateImage, + input_layers: Vec::new(), // XXX: Is it OK for this to be empty for image layer? + completed: false, + }); + } + + for j in new_jobs.into_iter().rev() { + let _job_id = self.push_job(j); + + // TODO: image layers don't let us delete anything. unless < PITR horizon + //let j = &self.jobs[job_id.0]; + // for layer_id in j.input_layers.iter() { + // self.layers[layer_id.0].pending_stakeholders.insert(job_id); + //} + } + + Ok(()) + } + + // Merge the contents of all the input delta layers into a new set + // of delta layers, based on the current partitioning. + // + // We split the new delta layers on the key dimension. We iterate through + // the key space, and for each key, check if including the next key to the + // current output layer we're building would cause the layer to become too + // large. If so, dump the current output layer and start new one. It's + // possible that there is a single key with so many page versions that + // storing all of them in a single layer file would be too large. In that + // case, we also split on the LSN dimension. 
+ // + // LSN + // ^ + // | + // | +-----------+ +--+--+--+--+ + // | | | | | | | | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ ==> | | | | | + // | | | | | | | | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ +--+--+--+--+ + // | + // +--------------> key + // + // + // If one key (X) has a lot of page versions: + // + // LSN + // ^ + // | (X) + // | +-----------+ +--+--+--+--+ + // | | | | | | | | + // | +-----------+ | | +--+ | + // | | | | | | | | + // | +-----------+ ==> | | | | | + // | | | | | +--+ | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ +--+--+--+--+ + // | + // +--------------> key + // + // TODO: this actually divides the layers into fixed-size chunks, not + // based on the partitioning. + // + // TODO: we should also opportunistically materialize and + // garbage collect what we can. + async fn retile_deltas( + &mut self, + job_id: JobId, + ctx: &E::RequestContext, + ) -> anyhow::Result<()> { + let job = &self.jobs[job_id.0]; + assert!(job.strategy == CompactionStrategy::Divide); + + // Sweep the key space left to right, running an estimate of how much + // disk size and keyspace we have accumulated + // + // Once the disk size reaches the target threshold, stop and think. + // If we have accumulated only a narrow band of keyspace, create an + // image layer. Otherwise write a delta layer. + + // FIXME: deal with the case of lots of values for same key + + // FIXME: we are ignoring images here. Did we already divide the work + // so that we won't encounter them here? + + let mut deltas: Vec = Vec::new(); + for layer_id in &job.input_layers { + let l = &self.layers[layer_id.0]; + if let Some(dl) = self.executor.downcast_delta_layer(&l.layer).await? { + deltas.push(dl.clone()); + } + } + // Open stream + let key_value_stream = std::pin::pin!(merge_delta_keys::(deltas.as_slice(), ctx)); + let mut new_jobs = Vec::new(); + + // Slide a window through the keyspace + let mut key_accum = std::pin::pin!(accum_key_values(key_value_stream)); + let mut all_in_window: bool = false; + let mut window = Window::new(); + loop { + if all_in_window && window.elems.is_empty() { + // All done! + break; + } + if let Some(key_range) = window.choose_next_delta(self.target_file_size, !all_in_window) + { + let batch_layers: Vec = job + .input_layers + .iter() + .filter(|layer_id| { + overlaps_with(self.layers[layer_id.0].layer.key_range(), &key_range) + }) + .cloned() + .collect(); + assert!(!batch_layers.is_empty()); + new_jobs.push(CompactionJob { + key_range, + lsn_range: job.lsn_range.clone(), + strategy: CompactionStrategy::CreateDelta, + input_layers: batch_layers, + completed: false, + }); + } else { + assert!(!all_in_window); + if let Some(next_key) = key_accum.next().await.transpose()? { + window.feed(next_key.key, next_key.size); + } else { + all_in_window = true; + } + } + } + + // All the input files are rewritten. Set up the tracking for when they can + // be deleted. 
+ for layer_id in job.input_layers.iter() { + let l = &mut self.layers[layer_id.0]; + assert!(l.deletable_after.is_none()); + l.deletable_after = Some(PendingJobSet::new()); + } + for j in new_jobs.into_iter().rev() { + let job_id = self.push_job(j); + let j = &self.jobs[job_id.0]; + for layer_id in j.input_layers.iter() { + self.layers[layer_id.0] + .deletable_after + .as_mut() + .unwrap() + .pending + .insert(job_id); + } + } + + Ok(()) + } +} + +// Sliding window through keyspace and values +// This is used by over_with_images to decide on good split points +struct KeyspaceWindow { + head: KeyspaceWindowHead, + + start_pos: KeyspaceWindowPos, +} +struct KeyspaceWindowHead { + // overall key range to cover + key_range: Range, + + keyspace: Vec>, + target_keysize: u64, +} + +#[derive(Clone)] +struct KeyspaceWindowPos { + end_key: K, + + keyspace_idx: usize, + + accum_keysize: u64, +} +impl KeyspaceWindowPos { + fn reached_end(&self, w: &KeyspaceWindowHead) -> bool { + self.keyspace_idx == w.keyspace.len() + } + + // Advance the cursor until it reaches 'target_keysize'. + fn advance_until_size(&mut self, w: &KeyspaceWindowHead, max_size: u64) { + while self.accum_keysize < max_size && !self.reached_end(w) { + let curr_range = &w.keyspace[self.keyspace_idx]; + if self.end_key < curr_range.start { + // skip over any unused space + self.end_key = curr_range.start; + } + + // We're now within 'curr_range'. Can we advance past it completely? + let distance = K::key_range_size(&(self.end_key..curr_range.end)); + if (self.accum_keysize + distance as u64) < max_size { + // oh yeah, it fits + self.end_key = curr_range.end; + self.keyspace_idx += 1; + self.accum_keysize += distance as u64; + } else { + // advance within the range + let skip_key = self.end_key.skip_some(); + let distance = K::key_range_size(&(self.end_key..skip_key)); + if (self.accum_keysize + distance as u64) < max_size { + self.end_key = skip_key; + self.accum_keysize += distance as u64; + } else { + self.end_key = self.end_key.next(); + self.accum_keysize += 1; + } + } + } + } +} + +impl KeyspaceWindow +where + K: CompactionKey, +{ + fn new(key_range: Range, keyspace: CompactionKeySpace, target_keysize: u64) -> Self { + assert!(keyspace.first().unwrap().start >= key_range.start); + + let start_key = key_range.start; + let start_pos = KeyspaceWindowPos:: { + end_key: start_key, + keyspace_idx: 0, + accum_keysize: 0, + }; + Self { + head: KeyspaceWindowHead:: { + key_range, + keyspace, + target_keysize, + }, + start_pos, + } + } + + fn choose_next_image(&mut self) -> Option> { + if self.start_pos.keyspace_idx == self.head.keyspace.len() { + // we've reached the end + return None; + } + + let mut next_pos = self.start_pos.clone(); + next_pos.advance_until_size( + &self.head, + self.start_pos.accum_keysize + self.head.target_keysize, + ); + + // See if we can gobble up the rest of the keyspace if we stretch out the layer, up to + // 1.25x target size + let mut end_pos = next_pos.clone(); + end_pos.advance_until_size( + &self.head, + self.start_pos.accum_keysize + (self.head.target_keysize * 5 / 4), + ); + if end_pos.reached_end(&self.head) { + // gobble up any unused keyspace between the last used key and end of the range + assert!(end_pos.end_key <= self.head.key_range.end); + end_pos.end_key = self.head.key_range.end; + next_pos = end_pos; + } + + let start_key = self.start_pos.end_key; + self.start_pos = next_pos; + Some(start_key..self.start_pos.end_key) + } +} + +// Sliding window through keyspace and values +// +// This is used 
to decide what layer to write next, from the beginning of the window. +// +// Candidates: +// +// 1. Create an image layer, snapping to previous images +// 2. Create a delta layer, snapping to previous images +// 3. Create an image layer, snapping to +// +// + +// Take previous partitioning, based on the image layers below. +// +// Candidate is at the front: +// +// Consider stretching an image layer to next divider? If it's close enough, +// that's the image candidate +// +// If it's too far, consider splitting at a reasonable point +// +// Is the image candidate smaller than the equivalent delta? If so, +// split off the image. Otherwise, split off one delta. +// Try to snap off the delta at a reasonable point + +struct WindowElement { + start_key: K, // inclusive + last_key: K, // inclusive + accum_size: u64, +} +struct Window { + elems: VecDeque>, + + // last key that was split off, inclusive + splitoff_key: Option, + splitoff_size: u64, +} + +impl Window +where + K: CompactionKey, +{ + fn new() -> Self { + Self { + elems: VecDeque::new(), + splitoff_key: None, + splitoff_size: 0, + } + } + + fn feed(&mut self, key: K, size: u64) { + let last_size; + if let Some(last) = self.elems.back_mut() { + assert!(last.last_key <= key); + if key == last.last_key { + last.accum_size += size; + return; + } + last_size = last.accum_size; + } else { + last_size = 0; + } + // This is a new key. + let elem = WindowElement { + start_key: key, + last_key: key, + accum_size: last_size + size, + }; + self.elems.push_back(elem); + } + + fn remain_size(&self) -> u64 { + self.elems.back().unwrap().accum_size - self.splitoff_size + } + + fn peek_size(&self) -> u64 { + self.elems.front().unwrap().accum_size - self.splitoff_size + } + + fn commit_upto(&mut self, mut upto: usize) { + while upto > 1 { + let popped = self.elems.pop_front().unwrap(); + self.elems.front_mut().unwrap().start_key = popped.start_key; + upto -= 1; + } + } + + fn find_size_split(&self, target_size: u64) -> usize { + self.elems + .partition_point(|elem| elem.accum_size - self.splitoff_size < target_size) + } + + fn pop(&mut self) { + let first = self.elems.pop_front().unwrap(); + self.splitoff_size = first.accum_size; + + self.splitoff_key = Some(first.last_key); + } + + // the difference between delta and image is that an image covers + // any unused keyspace before and after, while a delta tries to + // minimize that. TODO: difference not implemented + fn pop_delta(&mut self) -> Range { + let first = self.elems.front().unwrap(); + let key_range = first.start_key..first.last_key.next(); + + self.pop(); + key_range + } + + // Prerequisite: we have enough input in the window + // + // On return None, the caller should feed more data and call again + fn choose_next_delta(&mut self, target_size: u64, has_more: bool) -> Option> { + if has_more && self.elems.is_empty() { + // Starting up + return None; + } + + // If we still have an undersized candidate, just keep going + while self.peek_size() < target_size { + if self.elems.len() > 1 { + self.commit_upto(2); + } else if has_more { + return None; + } else { + break; + } + } + + // Ensure we have enough input in the window to make a good decision + if has_more && self.remain_size() < target_size * 5 / 4 { + return None; + } + + // The candidate on the front is now large enough, for a delta. + // And we have enough data in the window to decide. + + // If we're willing to stretch it up to 1.25 target size, could we + // gobble up the rest of the work? 
This avoids creating very small + // "tail" layers at the end of the keyspace + if !has_more && self.remain_size() < target_size * 5 / 3 { + self.commit_upto(self.elems.len()); + } else { + let delta_split_at = self.find_size_split(target_size); + self.commit_upto(delta_split_at); + + // If it's still not large enough, request the caller to fill the window + if self.elems.len() == 1 && has_more { + return None; + } + } + Some(self.pop_delta()) + } +} diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs new file mode 100644 index 0000000000..a12f691504 --- /dev/null +++ b/pageserver/compaction/src/helpers.rs @@ -0,0 +1,243 @@ +//! This file contains generic utility functions over the interface types, +//! which could be handy for any compaction implementation. +use crate::interface::*; + +use futures::future::BoxFuture; +use futures::{Stream, StreamExt}; +use itertools::Itertools; +use pin_project_lite::pin_project; +use std::cmp::Ord; +use std::collections::BinaryHeap; +use std::collections::VecDeque; +use std::future::Future; +use std::ops::{DerefMut, Range}; +use std::pin::Pin; +use std::task::{ready, Poll}; + +pub fn keyspace_total_size(keyspace: &CompactionKeySpace) -> u64 +where + K: CompactionKey, +{ + keyspace.iter().map(|r| K::key_range_size(r) as u64).sum() +} + +pub fn overlaps_with(a: &Range, b: &Range) -> bool { + !(a.end <= b.start || b.end <= a.start) +} + +pub fn union_to_keyspace(a: &mut CompactionKeySpace, b: CompactionKeySpace) { + let x = std::mem::take(a); + let mut all_ranges_iter = [x.into_iter(), b.into_iter()] + .into_iter() + .kmerge_by(|a, b| a.start < b.start); + let mut ranges = Vec::new(); + if let Some(first) = all_ranges_iter.next() { + let (mut start, mut end) = (first.start, first.end); + + for r in all_ranges_iter { + assert!(r.start >= start); + if r.start > end { + ranges.push(start..end); + start = r.start; + end = r.end; + } else if r.end > end { + end = r.end; + } + } + ranges.push(start..end); + } + *a = ranges +} + +pub fn intersect_keyspace( + a: &CompactionKeySpace, + r: &Range, +) -> CompactionKeySpace { + let mut ranges: Vec> = Vec::new(); + + for x in a.iter() { + if x.end <= r.start { + continue; + } + if x.start >= r.end { + break; + } + ranges.push(x.clone()) + } + + // trim the ends + if let Some(first) = ranges.first_mut() { + first.start = std::cmp::max(first.start, r.start); + } + if let Some(last) = ranges.last_mut() { + last.end = std::cmp::min(last.end, r.end); + } + ranges +} + +/// Create a stream that iterates through all DeltaEntrys among all input +/// layers, in key-lsn order. +/// +/// This is public because the create_delta() implementation likely wants to use this too +/// TODO: move to a more shared place +pub fn merge_delta_keys<'a, E: CompactionJobExecutor>( + layers: &'a [E::DeltaLayer], + ctx: &'a E::RequestContext, +) -> MergeDeltaKeys<'a, E> { + // Use a binary heap to merge the layers. Each input layer is initially + // represented by a LazyLoadLayer::Unloaded element, which uses the start of + // the layer's key range as the key. The first time a layer reaches the top + // of the heap, all the keys of the layer are loaded into a sorted vector. + // + // This helps to keep the memory usage reasonable: we only need to hold in + // memory the DeltaEntrys of the layers that overlap with the "current" key. 
+ let mut heap: BinaryHeap> = BinaryHeap::new(); + for l in layers { + heap.push(LazyLoadLayer::Unloaded(l)); + } + MergeDeltaKeys { + heap, + ctx, + load_future: None, + } +} + +enum LazyLoadLayer<'a, E: CompactionJobExecutor> { + Loaded(VecDeque<>::DeltaEntry<'a>>), + Unloaded(&'a E::DeltaLayer), +} +impl<'a, E: CompactionJobExecutor> LazyLoadLayer<'a, E> { + fn key(&self) -> E::Key { + match self { + Self::Loaded(entries) => entries.front().unwrap().key(), + Self::Unloaded(dl) => dl.key_range().start, + } + } +} +impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} +impl<'a, E: CompactionJobExecutor> Ord for LazyLoadLayer<'a, E> { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + // reverse order so that we get a min-heap + other.key().cmp(&self.key()) + } +} +impl<'a, E: CompactionJobExecutor> PartialEq for LazyLoadLayer<'a, E> { + fn eq(&self, other: &Self) -> bool { + self.key().eq(&other.key()) + } +} +impl<'a, E: CompactionJobExecutor> Eq for LazyLoadLayer<'a, E> {} + +type LoadFuture<'a, E> = BoxFuture<'a, anyhow::Result>>; + +// Stream returned by `merge_delta_keys` +pin_project! { +#[allow(clippy::type_complexity)] +pub struct MergeDeltaKeys<'a, E: CompactionJobExecutor> { + heap: BinaryHeap>, + + #[pin] + load_future: Option>::DeltaEntry<'a>>>, + + ctx: &'a E::RequestContext, +} +} + +impl<'a, E> Stream for MergeDeltaKeys<'a, E> +where + E: CompactionJobExecutor + 'a, +{ + type Item = anyhow::Result<>::DeltaEntry<'a>>; + + fn poll_next( + self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> Poll::Item>> { + let mut this = self.project(); + loop { + if let Some(mut load_future) = this.load_future.as_mut().as_pin_mut() { + // We are waiting for loading the keys to finish + match ready!(load_future.as_mut().poll(cx)) { + Ok(entries) => { + this.load_future.set(None); + *this.heap.peek_mut().unwrap() = + LazyLoadLayer::Loaded(VecDeque::from(entries)); + } + Err(e) => { + return Poll::Ready(Some(Err(e))); + } + } + } + + // If the topmost layer in the heap hasn't been loaded yet, start + // loading it. Otherwise return the next entry from it and update + // the layer's position in the heap (this decreaseKey operation is + // performed implicitly when `top` is dropped). + if let Some(mut top) = this.heap.peek_mut() { + match top.deref_mut() { + LazyLoadLayer::Unloaded(ref mut l) => { + let fut = l.load_keys(this.ctx); + this.load_future.set(Some(fut)); + continue; + } + LazyLoadLayer::Loaded(ref mut entries) => { + let result = entries.pop_front().unwrap(); + if entries.is_empty() { + std::collections::binary_heap::PeekMut::pop(top); + } + return Poll::Ready(Some(Ok(result))); + } + } + } else { + return Poll::Ready(None); + } + } + } +} + +// Accumulate values at key boundaries +pub struct KeySize { + pub key: K, + pub num_values: u64, + pub size: u64, +} + +pub fn accum_key_values<'a, I, K, D, E>(input: I) -> impl Stream, E>> +where + K: Eq, + I: Stream>, + D: CompactionDeltaEntry<'a, K>, +{ + async_stream::try_stream! 
{ + // Initialize the state from the first value + let mut input = std::pin::pin!(input); + + if let Some(first) = input.next().await { + let first = first?; + let mut accum: KeySize = KeySize { + key: first.key(), + num_values: 1, + size: first.size(), + }; + while let Some(this) = input.next().await { + let this = this?; + if this.key() == accum.key { + accum.size += this.size(); + accum.num_values += 1; + } else { + yield accum; + accum = KeySize { + key: this.key(), + num_values: 1, + size: this.size(), + }; + } + } + yield accum; + } + } +} diff --git a/pageserver/compaction/src/identify_levels.rs b/pageserver/compaction/src/identify_levels.rs new file mode 100644 index 0000000000..ef388fd92b --- /dev/null +++ b/pageserver/compaction/src/identify_levels.rs @@ -0,0 +1,376 @@ +//! An LSM tree consists of multiple levels, each exponential larger than the +//! previous level. And each level consists of be multiple "tiers". With tiered +//! compaction, a level is compacted when it has accumulated more than N tiers, +//! forming one tier on the next level. +//! +//! In the pageserver, we don't explicitly track the levels and tiers. Instead, +//! we identify them by looking at the shapes of the layers. It's an easy task +//! for a human, but it's not straightforward to come up with the exact +//! rules. Especially if there are cases like interrupted, half-finished +//! compactions, or highly skewed data distributions that have let us "skip" +//! some levels. It's not critical to classify all cases correctly; at worst we +//! delay some compaction work, and suffer from more read amplification, or we +//! perform some unnecessary compaction work. +//! +//! `identify_level` performs that shape-matching. +//! +//! It returns a Level struct, which has `depth()` function to count the number +//! of "tiers" in the level. The tier count is the max depth of stacked layers +//! within the level. That's a good measure, because the point of compacting is +//! to reduce read amplification, and the depth is what determines that. +//! +//! One interesting effect of this is that if we generate very small delta +//! layers at L0, e.g. because the L0 layers are flushed by timeout rather than +//! because they reach the target size, the L0 compaction will combine them to +//! one larger file. But if the combined file is still smaller than the target +//! file size, the file will still be considered to be part of L0 at the next +//! iteration. + +use anyhow::bail; +use std::collections::BTreeSet; +use std::ops::Range; +use utils::lsn::Lsn; + +use crate::interface::*; + +use tracing::{info, trace}; + +pub struct Level { + pub lsn_range: Range, + pub layers: Vec, +} + +/// Identify an LSN > `end_lsn` that partitions the LSN space, so that there are +/// no layers that cross the boundary LSN. +/// +/// A further restriction is that all layers in the returned partition cover at +/// most 'lsn_max_size' LSN bytes. +pub async fn identify_level( + all_layers: Vec, + end_lsn: Lsn, + lsn_max_size: u64, +) -> anyhow::Result>> +where + K: CompactionKey, + L: CompactionLayer + Clone, +{ + // filter out layers that are above the `end_lsn`, they are completely irrelevant. + let mut layers = Vec::new(); + for l in all_layers { + if l.lsn_range().start < end_lsn && l.lsn_range().end > end_lsn { + // shouldn't happen. Indicates that the caller passed a bogus + // end_lsn. 
+ bail!("identify_level() called with end_lsn that does not partition the LSN space: end_lsn {} intersects with layer {}", end_lsn, l.short_id()); + } + // include image layers sitting exacty at `end_lsn`. + let is_image = !l.is_delta(); + if (is_image && l.lsn_range().start > end_lsn) + || (!is_image && l.lsn_range().start >= end_lsn) + { + continue; + } + layers.push(l); + } + // All the remaining layers either belong to this level, or are below it. + info!( + "identify level at {}, size {}, num layers below: {}", + end_lsn, + lsn_max_size, + layers.len() + ); + if layers.is_empty() { + return Ok(None); + } + + // Walk the ranges in LSN order. + // + // ----- end_lsn + // | + // | + // v + // + layers.sort_by_key(|l| l.lsn_range().end); + let mut candidate_start_lsn = end_lsn; + let mut candidate_layers: Vec = Vec::new(); + let mut current_best_start_lsn = end_lsn; + let mut current_best_layers: Vec = Vec::new(); + let mut iter = layers.into_iter(); + loop { + let Some(l) = iter.next_back() else { + // Reached end. Accept the last candidate + current_best_start_lsn = candidate_start_lsn; + current_best_layers.extend_from_slice(&std::mem::take(&mut candidate_layers)); + break; + }; + trace!( + "inspecting {} for candidate {}, current best {}", + l.short_id(), + candidate_start_lsn, + current_best_start_lsn + ); + + let r = l.lsn_range(); + + // Image layers don't restrict our choice of cutoff LSN + if l.is_delta() { + // Is this candidate workable? In other words, are there any + // delta layers that span across this LSN + // + // Valid: Not valid: + // + + + // | | + + // + <- candidate + | <- candidate + // + + + // | + // + + if r.end <= candidate_start_lsn { + // Hooray, there are no crossing LSNs. And we have visited + // through all the layers within candidate..end_lsn. The + // current candidate can be accepted. + current_best_start_lsn = r.end; + current_best_layers.extend_from_slice(&std::mem::take(&mut candidate_layers)); + candidate_start_lsn = r.start; + } + + // Is it small enough to be considered part of this level? + if r.end.0 - r.start.0 > lsn_max_size { + // Too large, this layer belongs to next level. Stop. + trace!( + "too large {}, size {} vs {}", + l.short_id(), + r.end.0 - r.start.0, + lsn_max_size + ); + break; + } + + // If this crosses the candidate lsn, push it down. + if r.start < candidate_start_lsn { + trace!( + "layer {} prevents from stopping at {}", + l.short_id(), + candidate_start_lsn + ); + candidate_start_lsn = r.start; + } + } + + // Include this layer in our candidate + candidate_layers.push(l); + } + + Ok(if current_best_start_lsn == end_lsn { + // empty level + None + } else { + Some(Level { + lsn_range: current_best_start_lsn..end_lsn, + layers: current_best_layers, + }) + }) +} + +// helper struct used in depth() +struct Event { + key: K, + layer_idx: usize, + start: bool, +} + +impl Level { + /// Count the number of deltas stacked on each other. + pub fn depth(&self) -> u64 + where + K: CompactionKey, + L: CompactionLayer, + { + let mut events: Vec> = Vec::new(); + for (idx, l) in self.layers.iter().enumerate() { + events.push(Event { + key: l.key_range().start, + layer_idx: idx, + start: true, + }); + events.push(Event { + key: l.key_range().end, + layer_idx: idx, + start: false, + }); + } + events.sort_by_key(|e| (e.key, e.start)); + + // Sweep the key space left to right. Stop at each distinct key, and + // count the number of deltas on top of the highest image at that key. 
+ // + // This is a little enefficient, as we walk through the active_set on + // every key. We could increment/decrement a counter on each step + // instead, but that'd require a bit more complex bookkeeping. + let mut active_set: BTreeSet<(Lsn, bool, usize)> = BTreeSet::new(); + let mut max_depth = 0; + let mut events_iter = events.iter().peekable(); + while let Some(e) = events_iter.next() { + let l = &self.layers[e.layer_idx]; + let is_image = !l.is_delta(); + + // update the active set + if e.start { + active_set.insert((l.lsn_range().end, is_image, e.layer_idx)); + } else { + active_set.remove(&(l.lsn_range().end, is_image, e.layer_idx)); + } + + // recalculate depth if this was the last event at this point + let more_events_at_this_key = events_iter + .peek() + .map_or(false, |next_e| next_e.key == e.key); + if !more_events_at_this_key { + let mut active_depth = 0; + for (_end_lsn, is_image, _idx) in active_set.iter().rev() { + if *is_image { + break; + } + active_depth += 1; + } + if active_depth > max_depth { + max_depth = active_depth; + } + } + } + max_depth + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::simulator::{Key, MockDeltaLayer, MockImageLayer, MockLayer}; + use std::sync::{Arc, Mutex}; + + fn delta(key_range: Range, lsn_range: Range) -> MockLayer { + MockLayer::Delta(Arc::new(MockDeltaLayer { + key_range, + lsn_range, + // identify_level() doesn't pay attention to the rest of the fields + file_size: 0, + deleted: Mutex::new(false), + records: vec![], + })) + } + + fn image(key_range: Range, lsn: Lsn) -> MockLayer { + MockLayer::Image(Arc::new(MockImageLayer { + key_range, + lsn_range: lsn..(lsn + 1), + // identify_level() doesn't pay attention to the rest of the fields + file_size: 0, + deleted: Mutex::new(false), + })) + } + + #[tokio::test] + async fn test_identify_level() -> anyhow::Result<()> { + let layers = vec![ + delta(Key::MIN..Key::MAX, Lsn(0x8000)..Lsn(0x9000)), + delta(Key::MIN..Key::MAX, Lsn(0x5000)..Lsn(0x7000)), + delta(Key::MIN..Key::MAX, Lsn(0x4000)..Lsn(0x5000)), + delta(Key::MIN..Key::MAX, Lsn(0x3000)..Lsn(0x4000)), + delta(Key::MIN..Key::MAX, Lsn(0x2000)..Lsn(0x3000)), + delta(Key::MIN..Key::MAX, Lsn(0x1000)..Lsn(0x2000)), + ]; + + // All layers fit in the max file size + let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000) + .await? + .unwrap(); + assert_eq!(level.depth(), 6); + + // Same LSN with smaller max file size. The second layer from the top is larger + // and belongs to next level. + let level = identify_level(layers.clone(), Lsn(0x10000), 0x1000) + .await? + .unwrap(); + assert_eq!(level.depth(), 1); + + // Call with a smaller LSN + let level = identify_level(layers.clone(), Lsn(0x3000), 0x1000) + .await? 
+ .unwrap(); + assert_eq!(level.depth(), 2); + + // Call with an LSN that doesn't partition the space + let result = identify_level(layers, Lsn(0x6000), 0x1000).await; + assert!(result.is_err()); + Ok(()) + } + + #[tokio::test] + async fn test_overlapping_lsn_ranges() -> anyhow::Result<()> { + // The files LSN ranges overlap, so even though there are more files that + // fit under the file size, they are not included in the level because they + // overlap so that we'd need to include the oldest file, too, which is + // larger + let layers = vec![ + delta(Key::MIN..Key::MAX, Lsn(0x4000)..Lsn(0x5000)), + delta(Key::MIN..Key::MAX, Lsn(0x3000)..Lsn(0x4000)), // overlap + delta(Key::MIN..Key::MAX, Lsn(0x2500)..Lsn(0x3500)), // overlap + delta(Key::MIN..Key::MAX, Lsn(0x2000)..Lsn(0x3000)), // overlap + delta(Key::MIN..Key::MAX, Lsn(0x1000)..Lsn(0x2500)), // larger + ]; + + let level = identify_level(layers.clone(), Lsn(0x10000), 0x1000) + .await? + .unwrap(); + assert_eq!(level.depth(), 1); + + Ok(()) + } + + #[tokio::test] + async fn test_depth_nonoverlapping() -> anyhow::Result<()> { + // The key ranges don't overlap, so depth is only 1. + let layers = vec![ + delta(4000..5000, Lsn(0x6000)..Lsn(0x7000)), + delta(3000..4000, Lsn(0x7000)..Lsn(0x8000)), + delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)), + ]; + + let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000) + .await? + .unwrap(); + assert_eq!(level.layers.len(), 3); + assert_eq!(level.depth(), 1); + + // Staggered. The 1st and 3rd layer don't overlap with each other. + let layers = vec![ + delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)), + delta(1500..2500, Lsn(0x7000)..Lsn(0x8000)), + delta(2000..3000, Lsn(0x6000)..Lsn(0x7000)), + ]; + + let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000) + .await? + .unwrap(); + assert_eq!(level.layers.len(), 3); + assert_eq!(level.depth(), 2); + Ok(()) + } + + #[tokio::test] + async fn test_depth_images() -> anyhow::Result<()> { + let layers: Vec = vec![ + delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)), + delta(1500..2500, Lsn(0x7000)..Lsn(0x8000)), + delta(2000..3000, Lsn(0x6000)..Lsn(0x7000)), + // This covers the same key range as the 2nd delta layer. The depth + // in that key range is therefore 0. + image(1500..2500, Lsn(0x9000)), + ]; + + let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000) + .await? + .unwrap(); + assert_eq!(level.layers.len(), 4); + assert_eq!(level.depth(), 1); + Ok(()) + } +} diff --git a/pageserver/compaction/src/interface.rs b/pageserver/compaction/src/interface.rs new file mode 100644 index 0000000000..979ceebf0e --- /dev/null +++ b/pageserver/compaction/src/interface.rs @@ -0,0 +1,167 @@ +//! This is what the compaction implementation needs to know about +//! layers, keyspace etc. +//! +//! All the heavy lifting is done by the create_image and create_delta +//! functions that the implementor provides. +use async_trait::async_trait; +use pageserver_api::{key::Key, keyspace::key_range_size}; +use std::ops::Range; +use utils::lsn::Lsn; + +/// Public interface. This is the main thing that the implementor needs to provide +#[async_trait] +pub trait CompactionJobExecutor { + // Type system. + // + // We assume that there are two kinds of layers, deltas and images. The + // compaction doesn't distinguish whether they are stored locally or + // remotely. + // + // The keyspace is defined by CompactionKey trait. 
+ // + type Key: CompactionKey; + + type Layer: CompactionLayer + Clone; + type DeltaLayer: CompactionDeltaLayer + Clone; + type ImageLayer: CompactionImageLayer + Clone; + + // This is passed through to all the interface functions. The compaction + // implementation doesn't do anything with it, but it might be useful for + // the interface implementation. + type RequestContext: CompactionRequestContext; + + // ---- + // Functions that the planner uses to support its decisions + // ---- + + /// Return all layers that overlap the given bounding box. + async fn get_layers( + &mut self, + key_range: &Range, + lsn_range: &Range, + ctx: &Self::RequestContext, + ) -> anyhow::Result>; + + async fn get_keyspace( + &mut self, + key_range: &Range, + lsn: Lsn, + ctx: &Self::RequestContext, + ) -> anyhow::Result>; + + /// NB: This is a pretty expensive operation. In the real pageserver + /// implementation, it downloads the layer, and keeps it resident + /// until the DeltaLayer is dropped. + async fn downcast_delta_layer( + &self, + layer: &Self::Layer, + ) -> anyhow::Result>; + + // ---- + // Functions to execute the plan + // ---- + + /// Create a new image layer, materializing all the values in the key range, + /// at given 'lsn'. + async fn create_image( + &mut self, + lsn: Lsn, + key_range: &Range, + ctx: &Self::RequestContext, + ) -> anyhow::Result<()>; + + /// Create a new delta layer, containing all the values from 'input_layers' + /// in the given key and LSN range. + async fn create_delta( + &mut self, + lsn_range: &Range, + key_range: &Range, + input_layers: &[Self::DeltaLayer], + ctx: &Self::RequestContext, + ) -> anyhow::Result<()>; + + /// Delete a layer. The compaction implementation will call this only after + /// all the create_image() or create_delta() calls that deletion of this + /// layer depends on have finished. But if the implementor has extra lazy + /// background tasks, like uploading the index json file to remote storage, + /// it is the implementation's responsibility to track those. + async fn delete_layer( + &mut self, + layer: &Self::Layer, + ctx: &Self::RequestContext, + ) -> anyhow::Result<()>; +} + +pub trait CompactionKey: std::cmp::Ord + Clone + Copy + std::fmt::Display { + const MIN: Self; + const MAX: Self; + + /// Calculate distance between key_range.start and key_range.end. + /// + /// This returns u32, for compatibility with Repository::key. If the + /// distance is larger, return u32::MAX. + fn key_range_size(key_range: &Range) -> u32; + + // return "self + 1" + fn next(&self) -> Self; + + // return "self + ". The amount to skip + // is left to the implementation. + // FIXME: why not just "add(u32)" ? This is hard to use + fn skip_some(&self) -> Self; +} + +impl CompactionKey for Key { + const MIN: Self = Self::MIN; + const MAX: Self = Self::MAX; + + fn key_range_size(r: &std::ops::Range) -> u32 { + key_range_size(r) + } + fn next(&self) -> Key { + (self as &Key).next() + } + fn skip_some(&self) -> Key { + self.add(128) + } +} + +/// Contiguous ranges of keys that belong to the key space. In key order, and +/// with no overlap. +pub type CompactionKeySpace = Vec>; + +/// Functions needed from all layers. +pub trait CompactionLayer { + fn key_range(&self) -> &Range; + fn lsn_range(&self) -> &Range; + + fn file_size(&self) -> u64; + + /// For debugging, short human-readable representation of the layer. E.g. filename. 
+ fn short_id(&self) -> String; + + fn is_delta(&self) -> bool; +} + +#[async_trait] +pub trait CompactionDeltaLayer: CompactionLayer { + type DeltaEntry<'a>: CompactionDeltaEntry<'a, E::Key> + where + Self: 'a; + + /// Return all keys in this delta layer. + async fn load_keys<'a>( + &self, + ctx: &E::RequestContext, + ) -> anyhow::Result>>; +} + +pub trait CompactionImageLayer: CompactionLayer {} + +pub trait CompactionDeltaEntry<'a, K> { + fn key(&self) -> K; + fn lsn(&self) -> Lsn; + fn size(&self) -> u64; +} + +pub trait CompactionRequestContext {} diff --git a/pageserver/compaction/src/lib.rs b/pageserver/compaction/src/lib.rs new file mode 100644 index 0000000000..2d6d673de5 --- /dev/null +++ b/pageserver/compaction/src/lib.rs @@ -0,0 +1,12 @@ +// The main module implementing the compaction algorithm +pub mod compact_tiered; +pub(crate) mod identify_levels; + +// Traits that the caller of the compaction needs to implement +pub mod interface; + +// Utility functions, useful for the implementation +pub mod helpers; + +// A simulator with mock implementations of 'interface' +pub mod simulator; diff --git a/pageserver/compaction/src/simulator.rs b/pageserver/compaction/src/simulator.rs new file mode 100644 index 0000000000..6d07038dcd --- /dev/null +++ b/pageserver/compaction/src/simulator.rs @@ -0,0 +1,613 @@ +mod draw; + +use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp}; + +use async_trait::async_trait; +use futures::StreamExt; +use rand::Rng; +use tracing::info; + +use utils::lsn::Lsn; + +use std::fmt::Write; +use std::ops::Range; +use std::sync::Arc; +use std::sync::Mutex; + +use crate::helpers::{merge_delta_keys, overlaps_with}; + +use crate::interface; +use crate::interface::CompactionLayer; + +// +// Implementation for the CompactionExecutor interface +// +pub struct MockTimeline { + // Parameters for the compaction algorithm + pub target_file_size: u64, + tiers_per_level: u64, + + num_l0_flushes: u64, + last_compact_at_flush: u64, + last_flush_lsn: Lsn, + + // In-memory layer + records: Vec, + total_len: u64, + start_lsn: Lsn, + end_lsn: Lsn, + + // Current keyspace at `end_lsn`. This is updated on every ingested record. + keyspace: KeySpace, + + // historic keyspaces + old_keyspaces: Vec<(Lsn, KeySpace)>, + + // "on-disk" layers + pub live_layers: Vec, + + num_deleted_layers: u64, + + // Statistics + wal_ingested: u64, + bytes_written: u64, + bytes_deleted: u64, + layers_created: u64, + layers_deleted: u64, + + // All the events - creation and deletion of files - are collected + // in 'history'. It is used to draw the SVG animation at the end. 
+ time: u64, + history: Vec, +} + +type KeySpace = interface::CompactionKeySpace; + +pub struct MockRequestContext {} +impl interface::CompactionRequestContext for MockRequestContext {} + +pub type Key = u64; + +impl interface::CompactionKey for Key { + const MIN: Self = u64::MIN; + const MAX: Self = u64::MAX; + + fn key_range_size(key_range: &Range) -> u32 { + std::cmp::min(key_range.end - key_range.start, u32::MAX as u64) as u32 + } + + fn next(&self) -> Self { + self + 1 + } + fn skip_some(&self) -> Self { + // round up to next xx + self + 100 + } +} + +#[derive(Clone)] +pub struct MockRecord { + lsn: Lsn, + key: Key, + len: u64, +} + +impl interface::CompactionDeltaEntry<'_, Key> for MockRecord { + fn key(&self) -> Key { + self.key + } + fn lsn(&self) -> Lsn { + self.lsn + } + fn size(&self) -> u64 { + self.len + } +} + +pub struct MockDeltaLayer { + pub key_range: Range, + pub lsn_range: Range, + + pub file_size: u64, + + pub deleted: Mutex, + + pub records: Vec, +} + +impl interface::CompactionLayer for Arc { + fn key_range(&self) -> &Range { + &self.key_range + } + fn lsn_range(&self) -> &Range { + &self.lsn_range + } + + fn file_size(&self) -> u64 { + self.file_size + } + + fn short_id(&self) -> String { + format!( + "{:016X}-{:016X}__{:08X}-{:08X}", + self.key_range.start, self.key_range.end, self.lsn_range.start.0, self.lsn_range.end.0 + ) + } + + fn is_delta(&self) -> bool { + true + } +} + +#[async_trait] +impl interface::CompactionDeltaLayer for Arc { + type DeltaEntry<'a> = MockRecord; + + async fn load_keys<'a>(&self, _ctx: &MockRequestContext) -> anyhow::Result> { + Ok(self.records.clone()) + } +} + +pub struct MockImageLayer { + pub key_range: Range, + pub lsn_range: Range, + + pub file_size: u64, + + pub deleted: Mutex, +} + +impl interface::CompactionImageLayer for Arc {} + +impl interface::CompactionLayer for Arc { + fn key_range(&self) -> &Range { + &self.key_range + } + fn lsn_range(&self) -> &Range { + &self.lsn_range + } + + fn file_size(&self) -> u64 { + self.file_size + } + + fn short_id(&self) -> String { + format!( + "{:016X}-{:016X}__{:08X}", + self.key_range.start, self.key_range.end, self.lsn_range.start.0, + ) + } + + fn is_delta(&self) -> bool { + false + } +} + +impl MockTimeline { + pub fn new() -> Self { + MockTimeline { + target_file_size: 256 * 1024 * 1024, + tiers_per_level: 4, + + num_l0_flushes: 0, + last_compact_at_flush: 0, + last_flush_lsn: Lsn(0), + + records: Vec::new(), + total_len: 0, + start_lsn: Lsn(1000), + end_lsn: Lsn(1000), + keyspace: KeySpace::new(), + + old_keyspaces: vec![], + + live_layers: vec![], + + num_deleted_layers: 0, + + wal_ingested: 0, + bytes_written: 0, + bytes_deleted: 0, + layers_created: 0, + layers_deleted: 0, + + time: 0, + history: Vec::new(), + } + } + + pub async fn compact(&mut self) -> anyhow::Result<()> { + let ctx = MockRequestContext {}; + + crate::compact_tiered::compact_tiered( + self, + self.last_flush_lsn, + self.target_file_size, + self.tiers_per_level, + &ctx, + ) + .await?; + + Ok(()) + } + + // Ingest one record to the timeline + pub fn ingest_record(&mut self, key: Key, len: u64) { + self.records.push(MockRecord { + lsn: self.end_lsn, + key, + len, + }); + self.total_len += len; + self.end_lsn += len; + + if self.total_len > self.target_file_size { + self.flush_l0(); + } + } + + pub async fn compact_if_needed(&mut self) -> anyhow::Result<()> { + if self.num_l0_flushes - self.last_compact_at_flush >= self.tiers_per_level { + self.compact().await?; + self.last_compact_at_flush = self.num_l0_flushes; 
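As an aside, a minimal sketch of how this simulator might be driven end to end (illustrative only, not part of the patch; it assumes `stats()` returns a `String`, `draw_history()` accepts any `std::io::Write`, and that `tokio` and `anyhow` are available, since those bounds are elided in the hunks above):

use pageserver_compaction::simulator::MockTimeline;

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let mut tl = MockTimeline::new();
    tl.target_file_size = 10_000_000; // 10 MB L0 flush threshold

    // Ingest uniformly distributed updates. ingest_record() flushes an L0
    // layer automatically once the in-memory layer exceeds target_file_size,
    // and compact_if_needed() kicks off tiered compaction after every
    // `tiers_per_level` flushes.
    for _ in 0..100 {
        tl.ingest_uniform(10_000, 100, &(0..1_000_000))?;
        tl.compact_if_needed().await?;
    }

    print!("{}", tl.stats()?);
    tl.draw_history(std::fs::File::create("compaction-history.html")?)?;
    Ok(())
}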
+ } + Ok(()) + } + + pub fn flush_l0(&mut self) { + if self.records.is_empty() { + return; + } + + let mut records = std::mem::take(&mut self.records); + records.sort_by_key(|rec| rec.key); + + let lsn_range = self.start_lsn..self.end_lsn; + let new_layer = Arc::new(MockDeltaLayer { + key_range: Key::MIN..Key::MAX, + lsn_range: lsn_range.clone(), + file_size: self.total_len, + records, + deleted: Mutex::new(false), + }); + info!("flushed L0 layer {}", new_layer.short_id()); + self.live_layers.push(MockLayer::from(&new_layer)); + + // reset L0 + self.start_lsn = self.end_lsn; + self.total_len = 0; + self.records = Vec::new(); + + self.layers_created += 1; + self.bytes_written += new_layer.file_size; + + self.time += 1; + self.history.push(LayerTraceEvent { + time_rel: self.time, + op: LayerTraceOp::Flush, + file: LayerTraceFile { + filename: new_layer.short_id(), + key_range: new_layer.key_range.clone(), + lsn_range: new_layer.lsn_range.clone(), + }, + }); + + self.num_l0_flushes += 1; + self.last_flush_lsn = self.end_lsn; + } + + // Ingest `num_records' records to the timeline, with random keys + // uniformly distributed in `key_range` + pub fn ingest_uniform( + &mut self, + num_records: u64, + len: u64, + key_range: &Range, + ) -> anyhow::Result<()> { + crate::helpers::union_to_keyspace(&mut self.keyspace, vec![key_range.clone()]); + let mut rng = rand::thread_rng(); + for _ in 0..num_records { + self.ingest_record(rng.gen_range(key_range.clone()), len); + self.wal_ingested += len; + } + Ok(()) + } + + pub fn stats(&self) -> anyhow::Result { + let mut s = String::new(); + + writeln!(s, "STATISTICS:")?; + writeln!( + s, + "WAL ingested: {:>10} MB", + self.wal_ingested / (1024 * 1024) + )?; + writeln!( + s, + "size created: {:>10} MB", + self.bytes_written / (1024 * 1024) + )?; + writeln!( + s, + "size deleted: {:>10} MB", + self.bytes_deleted / (1024 * 1024) + )?; + writeln!(s, "files created: {:>10}", self.layers_created)?; + writeln!(s, "files deleted: {:>10}", self.layers_deleted)?; + writeln!( + s, + "write amp: {:>10.2}", + self.bytes_written as f64 / self.wal_ingested as f64 + )?; + writeln!( + s, + "storage amp: {:>10.2}", + (self.bytes_written - self.bytes_deleted) as f64 / self.wal_ingested as f64 + )?; + + Ok(s) + } + + pub fn draw_history(&self, output: W) -> anyhow::Result<()> { + draw::draw_history(&self.history, output) + } +} + +impl Default for MockTimeline { + fn default() -> Self { + Self::new() + } +} + +#[derive(Clone)] +pub enum MockLayer { + Delta(Arc), + Image(Arc), +} + +impl interface::CompactionLayer for MockLayer { + fn key_range(&self) -> &Range { + match self { + MockLayer::Delta(this) => this.key_range(), + MockLayer::Image(this) => this.key_range(), + } + } + fn lsn_range(&self) -> &Range { + match self { + MockLayer::Delta(this) => this.lsn_range(), + MockLayer::Image(this) => this.lsn_range(), + } + } + fn file_size(&self) -> u64 { + match self { + MockLayer::Delta(this) => this.file_size(), + MockLayer::Image(this) => this.file_size(), + } + } + fn short_id(&self) -> String { + match self { + MockLayer::Delta(this) => this.short_id(), + MockLayer::Image(this) => this.short_id(), + } + } + + fn is_delta(&self) -> bool { + match self { + MockLayer::Delta(_) => true, + MockLayer::Image(_) => false, + } + } +} + +impl MockLayer { + fn is_deleted(&self) -> bool { + let guard = match self { + MockLayer::Delta(this) => this.deleted.lock().unwrap(), + MockLayer::Image(this) => this.deleted.lock().unwrap(), + }; + *guard + } + fn mark_deleted(&self) { + let mut 
deleted_guard = match self { + MockLayer::Delta(this) => this.deleted.lock().unwrap(), + MockLayer::Image(this) => this.deleted.lock().unwrap(), + }; + assert!(!*deleted_guard, "layer already deleted"); + *deleted_guard = true; + } +} + +impl From<&Arc> for MockLayer { + fn from(l: &Arc) -> Self { + MockLayer::Delta(l.clone()) + } +} + +impl From<&Arc> for MockLayer { + fn from(l: &Arc) -> Self { + MockLayer::Image(l.clone()) + } +} + +#[async_trait] +impl interface::CompactionJobExecutor for MockTimeline { + type Key = Key; + type Layer = MockLayer; + type DeltaLayer = Arc; + type ImageLayer = Arc; + type RequestContext = MockRequestContext; + + async fn get_layers( + &mut self, + key_range: &Range, + lsn_range: &Range, + _ctx: &Self::RequestContext, + ) -> anyhow::Result> { + // Clear any deleted layers from our vec + self.live_layers.retain(|l| !l.is_deleted()); + + let layers: Vec = self + .live_layers + .iter() + .filter(|l| { + overlaps_with(l.lsn_range(), lsn_range) && overlaps_with(l.key_range(), key_range) + }) + .cloned() + .collect(); + + Ok(layers) + } + + async fn get_keyspace( + &mut self, + key_range: &Range, + _lsn: Lsn, + _ctx: &Self::RequestContext, + ) -> anyhow::Result> { + // find it in the levels + if self.old_keyspaces.is_empty() { + Ok(crate::helpers::intersect_keyspace( + &self.keyspace, + key_range, + )) + } else { + // not implemented + + // The mock implementation only allows requesting the + // keyspace at the level's end LSN. That's all that the + // current implementation needs. + panic!("keyspace not available for requested lsn"); + } + } + + async fn downcast_delta_layer( + &self, + layer: &MockLayer, + ) -> anyhow::Result>> { + Ok(match layer { + MockLayer::Delta(l) => Some(l.clone()), + MockLayer::Image(_) => None, + }) + } + + async fn create_image( + &mut self, + lsn: Lsn, + key_range: &Range, + ctx: &MockRequestContext, + ) -> anyhow::Result<()> { + let keyspace = self.get_keyspace(key_range, lsn, ctx).await?; + + let mut accum_size: u64 = 0; + for r in keyspace { + accum_size += r.end - r.start; + } + + let new_layer = Arc::new(MockImageLayer { + key_range: key_range.clone(), + lsn_range: lsn..lsn, + file_size: accum_size * 8192, + deleted: Mutex::new(false), + }); + info!( + "created image layer, size {}: {}", + new_layer.file_size, + new_layer.short_id() + ); + self.live_layers.push(MockLayer::Image(new_layer.clone())); + + // update stats + self.bytes_written += new_layer.file_size; + self.layers_created += 1; + + self.time += 1; + self.history.push(LayerTraceEvent { + time_rel: self.time, + op: LayerTraceOp::CreateImage, + file: LayerTraceFile { + filename: new_layer.short_id(), + key_range: new_layer.key_range.clone(), + lsn_range: new_layer.lsn_range.clone(), + }, + }); + + Ok(()) + } + + async fn create_delta( + &mut self, + lsn_range: &Range, + key_range: &Range, + input_layers: &[Arc], + ctx: &MockRequestContext, + ) -> anyhow::Result<()> { + let mut key_value_stream = + std::pin::pin!(merge_delta_keys::(input_layers, ctx)); + let mut records: Vec = Vec::new(); + let mut total_len = 2; + while let Some(delta_entry) = key_value_stream.next().await { + let delta_entry: MockRecord = delta_entry?; + if key_range.contains(&delta_entry.key) && lsn_range.contains(&delta_entry.lsn) { + total_len += delta_entry.len; + records.push(delta_entry); + } + } + let total_records = records.len(); + let new_layer = Arc::new(MockDeltaLayer { + key_range: key_range.clone(), + lsn_range: lsn_range.clone(), + file_size: total_len, + records, + deleted: 
Mutex::new(false), + }); + info!( + "created delta layer, recs {}, size {}: {}", + total_records, + total_len, + new_layer.short_id() + ); + self.live_layers.push(MockLayer::Delta(new_layer.clone())); + + // update stats + self.bytes_written += total_len; + self.layers_created += 1; + + self.time += 1; + self.history.push(LayerTraceEvent { + time_rel: self.time, + op: LayerTraceOp::CreateDelta, + file: LayerTraceFile { + filename: new_layer.short_id(), + key_range: new_layer.key_range.clone(), + lsn_range: new_layer.lsn_range.clone(), + }, + }); + + Ok(()) + } + + async fn delete_layer( + &mut self, + layer: &Self::Layer, + _ctx: &MockRequestContext, + ) -> anyhow::Result<()> { + let layer = std::pin::pin!(layer); + info!("deleting layer: {}", layer.short_id()); + self.num_deleted_layers += 1; + self.bytes_deleted += layer.file_size(); + layer.mark_deleted(); + + self.time += 1; + self.history.push(LayerTraceEvent { + time_rel: self.time, + op: LayerTraceOp::Delete, + file: LayerTraceFile { + filename: layer.short_id(), + key_range: layer.key_range().clone(), + lsn_range: layer.lsn_range().clone(), + }, + }); + + Ok(()) + } +} diff --git a/pageserver/compaction/src/simulator/draw.rs b/pageserver/compaction/src/simulator/draw.rs new file mode 100644 index 0000000000..997925067f --- /dev/null +++ b/pageserver/compaction/src/simulator/draw.rs @@ -0,0 +1,411 @@ +use super::Key; +use anyhow::Result; +use std::cmp::Ordering; +use std::{ + collections::{BTreeMap, BTreeSet, HashSet}, + fmt::Write, + ops::Range, +}; +use svg_fmt::{rgb, BeginSvg, EndSvg, Fill, Stroke, Style}; +use utils::lsn::Lsn; + +// Map values to their compressed coordinate - the index the value +// would have in a sorted and deduplicated list of all values. +struct CoordinateMap { + map: BTreeMap, + stretch: f32, +} + +impl CoordinateMap { + fn new(coords: Vec, stretch: f32) -> Self { + let set: BTreeSet = coords.into_iter().collect(); + + let mut map: BTreeMap = BTreeMap::new(); + for (i, e) in set.iter().enumerate() { + map.insert(*e, i); + } + + Self { map, stretch } + } + + // This assumes that the map contains an exact point for this. 
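A quick worked example of the coordinate compression described here (values are made up; `CoordinateMap` is private to this module, so a check like this would have to live inside `draw.rs`, e.g. as a unit test):

#[test]
fn coordinate_map_example() {
    // {100, 200, 200, 500} deduplicates and sorts to [100, 200, 500],
    // so the values map to indices 0, 1, 2, scaled by the stretch factor.
    let m = CoordinateMap::new(vec![100u64, 200, 200, 500], 2.0);
    assert_eq!(m.map(100), 0.0);
    assert_eq!(m.map(200), 2.0);
    assert_eq!(m.map(500), 4.0);
    assert_eq!(m.max(), 6.0); // 3 distinct values * stretch 2.0
}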
+ // Use map_inexact for values inbetween + fn map(&self, val: T) -> f32 { + *self.map.get(&val).unwrap() as f32 * self.stretch + } + + // the value is still assumed to be within the min/max bounds + // (this is currently unused) + fn _map_inexact(&self, val: T) -> f32 { + let prev = *self.map.range(..=val).next().unwrap().1; + let next = *self.map.range(val..).next().unwrap().1; + + // interpolate + (prev as f32 + (next - prev) as f32) * self.stretch + } + + fn max(&self) -> f32 { + self.map.len() as f32 * self.stretch + } +} + +#[derive(PartialEq, Hash, Eq)] +pub enum LayerTraceOp { + Flush, + CreateDelta, + CreateImage, + Delete, +} + +impl std::fmt::Display for LayerTraceOp { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { + let op_str = match self { + LayerTraceOp::Flush => "flush", + LayerTraceOp::CreateDelta => "create_delta", + LayerTraceOp::CreateImage => "create_image", + LayerTraceOp::Delete => "delete", + }; + f.write_str(op_str) + } +} + +#[derive(PartialEq, Hash, Eq, Clone)] +pub struct LayerTraceFile { + pub filename: String, + pub key_range: Range, + pub lsn_range: Range, +} + +impl LayerTraceFile { + fn is_image(&self) -> bool { + self.lsn_range.end == self.lsn_range.start + } +} + +pub struct LayerTraceEvent { + pub time_rel: u64, + pub op: LayerTraceOp, + pub file: LayerTraceFile, +} + +pub fn draw_history(history: &[LayerTraceEvent], mut output: W) -> Result<()> { + let mut files: Vec = Vec::new(); + + for event in history { + files.push(event.file.clone()); + } + let last_time_rel = history.last().unwrap().time_rel; + + // Collect all coordinates + let mut keys: Vec = vec![]; + let mut lsns: Vec = vec![]; + for f in files.iter() { + keys.push(f.key_range.start); + keys.push(f.key_range.end); + lsns.push(f.lsn_range.start); + lsns.push(f.lsn_range.end); + } + + // Analyze + let key_map = CoordinateMap::new(keys, 2.0); + // Stretch out vertically for better visibility + let lsn_map = CoordinateMap::new(lsns, 3.0); + + let mut svg = String::new(); + + // Draw + writeln!( + svg, + "{}", + BeginSvg { + w: key_map.max(), + h: lsn_map.max(), + } + )?; + let lsn_max = lsn_map.max(); + + // Sort the files by LSN, but so that image layers go after all delta layers + // The SVG is painted in the order the elements appear, and we want to draw + // image layers on top of the delta layers if they overlap + // + // (This could also be implemented via z coordinates: image layers get one z + // coord, delta layers get another z coord.) + let mut files_sorted: Vec = files.into_iter().collect(); + files_sorted.sort_by(|a, b| { + if a.is_image() && !b.is_image() { + Ordering::Greater + } else if !a.is_image() && b.is_image() { + Ordering::Less + } else { + a.lsn_range.end.cmp(&b.lsn_range.end) + } + }); + + writeln!(svg, "")?; + let mut files_seen = HashSet::new(); + for f in files_sorted { + if files_seen.contains(&f) { + continue; + } + let key_start = key_map.map(f.key_range.start); + let key_end = key_map.map(f.key_range.end); + let key_diff = key_end - key_start; + + if key_start >= key_end { + panic!("Invalid key range {}-{}", key_start, key_end); + } + + let lsn_start = lsn_map.map(f.lsn_range.start); + let lsn_end = lsn_map.map(f.lsn_range.end); + + // Fill in and thicken rectangle if it's an + // image layer so that we can see it. 
+ let mut style = Style::default(); + style.fill = Fill::Color(rgb(0x80, 0x80, 0x80)); + style.stroke = Stroke::Color(rgb(0, 0, 0), 0.5); + + let y_start = lsn_max - lsn_start; + let y_end = lsn_max - lsn_end; + + let x_margin = 0.25; + let y_margin = 0.5; + + match f.lsn_range.start.cmp(&f.lsn_range.end) { + Ordering::Less => { + write!( + svg, + r#" "#, + f.filename, + key_start + x_margin, + y_end + y_margin, + key_diff - x_margin * 2.0, + y_start - y_end - y_margin * 2.0, + 1.0, // border_radius, + style, + )?; + write!(svg, "{}", f.filename)?; + writeln!(svg, "")?; + } + Ordering::Equal => { + //lsn_diff = 0.3; + //lsn_offset = -lsn_diff / 2.0; + //margin = 0.05; + style.fill = Fill::Color(rgb(0x80, 0, 0x80)); + style.stroke = Stroke::Color(rgb(0x80, 0, 0x80), 3.0); + write!( + svg, + r#" "#, + f.filename, + key_start + x_margin, + y_end, + key_end - x_margin, + y_end, + style, + )?; + write!( + svg, + "{}<br>{} - {}", + f.filename, lsn_end, y_end + )?; + writeln!(svg, "")?; + } + Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end), + } + files_seen.insert(f); + } + + let mut record_style = Style::default(); + record_style.fill = Fill::Color(rgb(0x80, 0x80, 0x80)); + record_style.stroke = Stroke::None; + + writeln!(svg, "{}", EndSvg)?; + + let mut layer_events_str = String::new(); + let mut first = true; + for e in history { + if !first { + writeln!(layer_events_str, ",")?; + } + write!( + layer_events_str, + r#" {{"time_rel": {}, "filename": "{}", "op": "{}"}}"#, + e.time_rel, e.file.filename, e.op + )?; + first = false; + } + writeln!(layer_events_str)?; + + writeln!( + output, + r#" + + + + + + + + +

+ [interactive HTML/JS animation template elided in extraction: playback controls, a time slider, "pos:"/"event:"/"gc:" readouts, and the embedded {svg} drawing]
+ + +"# + )?; + + Ok(()) +} diff --git a/pageserver/compaction/tests/tests.rs b/pageserver/compaction/tests/tests.rs new file mode 100644 index 0000000000..1cea2a20e1 --- /dev/null +++ b/pageserver/compaction/tests/tests.rs @@ -0,0 +1,35 @@ +use pageserver_compaction::interface::CompactionLayer; +use pageserver_compaction::simulator::MockTimeline; + +/// Test the extreme case that there are so many updates for a single key that +/// even if we produce an extremely narrow delta layer, spanning just that one +/// key, we still too many records to fit in the target file size. We need to +/// split in the LSN dimension too in that case. +/// +/// TODO: The code to avoid this problem has not been implemented yet! So the +/// assertion currently fails, but we need to make it not fail. +#[ignore] +#[tokio::test] +async fn test_many_updates_for_single_key() { + let mut executor = MockTimeline::new(); + executor.target_file_size = 10_000_000; // 10 MB + + // Ingest 100 MB of updates to a single key. + for _ in 1..1000 { + executor.ingest_uniform(100, 10, &(0..100_000)).unwrap(); + executor.ingest_uniform(10_000, 10, &(0..1)).unwrap(); + executor.compact().await.unwrap(); + } + + // Check that all the layers are smaller than the target size (with some slop) + for l in executor.live_layers.iter() { + println!("layer {}: {}", l.short_id(), l.file_size()); + } + for l in executor.live_layers.iter() { + assert!(l.file_size() < executor.target_file_size * 2); + // sanity check that none of the delta layers are stupidly small either + if l.is_delta() { + assert!(l.file_size() > executor.target_file_size / 2); + } + } +} diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index 012a950b60..c7f9d596c6 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -17,7 +17,7 @@ use tracing::*; use utils::id::NodeId; mod metrics; -use metrics::MetricsKey; +use crate::consumption_metrics::metrics::MetricsKey; mod disk_cache; mod upload; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index c3103917ee..15dd125de2 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3632,6 +3632,7 @@ pub(crate) mod harness { compaction_target_size: Some(tenant_conf.compaction_target_size), compaction_period: Some(tenant_conf.compaction_period), compaction_threshold: Some(tenant_conf.compaction_threshold), + compaction_algorithm: Some(tenant_conf.compaction_algorithm), gc_horizon: Some(tenant_conf.gc_horizon), gc_period: Some(tenant_conf.gc_period), image_creation_threshold: Some(tenant_conf.image_creation_threshold), diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index cce30e900e..18c4ea664e 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -9,6 +9,7 @@ //! may lead to a data loss. //! use anyhow::bail; +use pageserver_api::models::CompactionAlgorithm; use pageserver_api::models::EvictionPolicy; use pageserver_api::models::{self, ThrottleConfig}; use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}; @@ -20,6 +21,7 @@ use std::time::Duration; use utils::generation::Generation; pub mod defaults { + // FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB // would be more appropriate. But a low value forces the code to be exercised more, // which is good for now to trigger bugs. 
@@ -27,12 +29,17 @@ pub mod defaults { pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024; pub const DEFAULT_CHECKPOINT_TIMEOUT: &str = "10 m"; + // FIXME the below configs are only used by legacy algorithm. The new algorithm + // has different parameters. + // Target file size, when creating image and delta layers. // This parameter determines L1 layer file size. pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024; pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s"; pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10; + pub const DEFAULT_COMPACTION_ALGORITHM: super::CompactionAlgorithm = + super::CompactionAlgorithm::Legacy; pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024; @@ -305,6 +312,7 @@ pub struct TenantConf { pub compaction_period: Duration, // Level0 delta layer threshold for compaction. pub compaction_threshold: usize, + pub compaction_algorithm: CompactionAlgorithm, // Determines how much history is retained, to allow // branching and read replicas at an older point in time. // The unit is #of bytes of WAL. @@ -377,6 +385,10 @@ pub struct TenantConfOpt { #[serde(default)] pub compaction_threshold: Option, + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub compaction_algorithm: Option, + #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] pub gc_horizon: Option, @@ -457,6 +469,9 @@ impl TenantConfOpt { compaction_threshold: self .compaction_threshold .unwrap_or(global_conf.compaction_threshold), + compaction_algorithm: self + .compaction_algorithm + .unwrap_or(global_conf.compaction_algorithm), gc_horizon: self.gc_horizon.unwrap_or(global_conf.gc_horizon), gc_period: self.gc_period.unwrap_or(global_conf.gc_period), image_creation_threshold: self @@ -503,6 +518,7 @@ impl Default for TenantConf { compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD) .expect("cannot parse default compaction period"), compaction_threshold: DEFAULT_COMPACTION_THRESHOLD, + compaction_algorithm: DEFAULT_COMPACTION_ALGORITHM, gc_horizon: DEFAULT_GC_HORIZON, gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD) .expect("cannot parse default gc period"), @@ -580,6 +596,7 @@ impl From for models::TenantConfig { Self { checkpoint_distance: value.checkpoint_distance, checkpoint_timeout: value.checkpoint_timeout.map(humantime), + compaction_algorithm: value.compaction_algorithm, compaction_target_size: value.compaction_target_size, compaction_period: value.compaction_period.map(humantime), compaction_threshold: value.compaction_threshold, diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 19eebf5531..e636073113 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -1120,3 +1120,15 @@ impl AsRef for DeltaLayerInner { self } } + +impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for DeltaEntry<'a> { + fn key(&self) -> Key { + self.key + } + fn lsn(&self) -> Lsn { + self.lsn + } + fn size(&self) -> u64 { + self.size + } +} diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d13d4dc7d4..59a7dcd4bd 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1,3 +1,4 @@ +mod compaction; pub mod delete; mod eviction_task; mod init; @@ -18,8 +19,8 @@ use once_cell::sync::Lazy; use pageserver_api::{ keyspace::KeySpaceAccum, models::{ - DownloadRemoteLayersTaskInfo, 
DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, - LayerMapInfo, TimelineState, + CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, + EvictionPolicy, LayerMapInfo, TimelineState, }, reltag::BlockNumber, shard::{ShardIdentity, TenantShardId}, @@ -63,6 +64,7 @@ use crate::tenant::{ use crate::{ context::{AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder}, disk_usage_eviction_task::DiskUsageEvictionInfo, + pgdatadir_mapping::CollectKeySpaceError, }; use crate::{deletion_queue::DeletionQueueClient, tenant::remote_timeline_client::StopError}; use crate::{ @@ -1093,6 +1095,19 @@ impl Timeline { return Ok(()); } + match self.get_compaction_algorithm() { + CompactionAlgorithm::Tiered => self.compact_tiered(cancel, ctx).await, + CompactionAlgorithm::Legacy => self.compact_legacy(cancel, flags, ctx).await, + } + } + + /// TODO: cancellation + async fn compact_legacy( + self: &Arc, + _cancel: &CancellationToken, + flags: EnumSet, + ctx: &RequestContext, + ) -> Result<(), CompactionError> { // High level strategy for compaction / image creation: // // 1. First, calculate the desired "partitioning" of the @@ -1498,6 +1513,13 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold) } + fn get_compaction_algorithm(&self) -> CompactionAlgorithm { + let tenant_conf = &self.tenant_conf.read().unwrap().tenant_conf; + tenant_conf + .compaction_algorithm + .unwrap_or(self.conf.default_tenant_conf.compaction_algorithm) + } + fn get_eviction_policy(&self) -> EvictionPolicy { let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); tenant_conf @@ -3639,6 +3661,18 @@ pub(crate) enum CompactionError { Other(#[from] anyhow::Error), } +impl From for CompactionError { + fn from(err: CollectKeySpaceError) -> Self { + match err { + CollectKeySpaceError::Cancelled + | CollectKeySpaceError::PageRead(PageReconstructError::Cancelled) => { + CompactionError::ShuttingDown + } + e => CompactionError::Other(e.into()), + } + } +} + #[serde_as] #[derive(serde::Serialize)] struct RecordedDuration(#[serde_as(as = "serde_with::DurationMicroSeconds")] Duration); @@ -3758,7 +3792,7 @@ impl TryFrom for CompactLevel0Phase1Stats { } impl Timeline { - /// Level0 files first phase of compaction, explained in the [`Self::compact`] comment. + /// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment. async fn compact_level0_phase1( self: &Arc, guard: tokio::sync::OwnedRwLockReadGuard, @@ -4237,13 +4271,24 @@ impl Timeline { return Ok(()); } + self.finish_compact_batch(&new_layers, &Vec::new(), &deltas_to_compact) + .await?; + Ok(()) + } + + async fn finish_compact_batch( + self: &Arc, + new_deltas: &[ResidentLayer], + new_images: &[ResidentLayer], + layers_to_remove: &[Layer], + ) -> anyhow::Result<()> { let mut guard = self.layers.write().await; let mut duplicated_layers = HashSet::new(); - let mut insert_layers = Vec::with_capacity(new_layers.len()); + let mut insert_layers = Vec::with_capacity(new_deltas.len()); - for l in &new_layers { + for l in new_deltas { if guard.contains(l.as_ref()) { // expected in tests tracing::error!(layer=%l, "duplicated L1 layer"); @@ -4254,24 +4299,28 @@ impl Timeline { // because we have not implemented L0 => L0 compaction. 
duplicated_layers.insert(l.layer_desc().key()); } else if LayerMap::is_l0(l.layer_desc()) { - return Err(CompactionError::Other(anyhow!("compaction generates a L0 layer file as output, which will cause infinite compaction."))); + bail!("compaction generates a L0 layer file as output, which will cause infinite compaction."); } else { insert_layers.push(l.clone()); } } - let remove_layers = { - let mut deltas_to_compact = deltas_to_compact; - // only remove those inputs which were not outputs - deltas_to_compact.retain(|l| !duplicated_layers.contains(&l.layer_desc().key())); - deltas_to_compact - }; + // only remove those inputs which were not outputs + let remove_layers: Vec = layers_to_remove + .iter() + .filter(|l| !duplicated_layers.contains(&l.layer_desc().key())) + .cloned() + .collect(); + + if !new_images.is_empty() { + guard.track_new_image_layers(new_images, &self.metrics); + } // deletion will happen later, the layer file manager calls garbage_collect_on_drop guard.finish_compact_l0(&remove_layers, &insert_layers, &self.metrics); if let Some(remote_client) = self.remote_client.as_ref() { - remote_client.schedule_compaction_update(&remove_layers, &new_layers)?; + remote_client.schedule_compaction_update(&remove_layers, new_deltas)?; } drop_wlock(guard); diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs new file mode 100644 index 0000000000..950459cbf9 --- /dev/null +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -0,0 +1,477 @@ +//! New compaction implementation. The algorithm itself is implemented in the +//! compaction crate. This file implements the callbacks and structs that allow +//! the algorithm to drive the process. +//! +//! The old legacy algorithm is implemented directly in `timeline.rs`. + +use std::ops::{Deref, Range}; +use std::sync::Arc; + +use super::Timeline; + +use async_trait::async_trait; +use fail::fail_point; +use tokio_util::sync::CancellationToken; +use tracing::{debug, trace, warn}; + +use crate::context::RequestContext; +use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc}; +use crate::tenant::timeline::{is_rel_fsm_block_key, is_rel_vm_block_key}; +use crate::tenant::timeline::{DeltaLayerWriter, ImageLayerWriter}; +use crate::tenant::timeline::{Layer, ResidentLayer}; +use crate::tenant::DeltaLayer; +use crate::tenant::PageReconstructError; +use crate::ZERO_PAGE; + +use crate::keyspace::KeySpace; +use crate::repository::Key; + +use utils::lsn::Lsn; + +use pageserver_compaction::helpers::overlaps_with; +use pageserver_compaction::interface::*; + +use super::CompactionError; + +impl Timeline { + /// Entry point for new tiered compaction algorithm. + /// + /// All the real work is in the implementation in the pageserver_compaction + /// crate. The code here would apply to any algorithm implemented by the + /// same interface, but tiered is the only one at the moment. + /// + /// TODO: cancellation + pub(crate) async fn compact_tiered( + self: &Arc, + _cancel: &CancellationToken, + ctx: &RequestContext, + ) -> Result<(), CompactionError> { + let fanout = self.get_compaction_threshold() as u64; + let target_file_size = self.get_checkpoint_distance(); + + // Find the top of the historical layers + let end_lsn = { + let guard = self.layers.read().await; + let layers = guard.layer_map(); + + let l0_deltas = layers.get_level0_deltas()?; + drop(guard); + + // As an optimization, if we find that there are too few L0 layers, + // bail out early. 
We know that the compaction algorithm would do + // nothing in that case. + if l0_deltas.len() < fanout as usize { + // doesn't need compacting + return Ok(()); + } + l0_deltas.iter().map(|l| l.lsn_range.end).max().unwrap() + }; + + // Is the timeline being deleted? + if self.is_stopping() { + trace!("Dropping out of compaction on timeline shutdown"); + return Err(CompactionError::ShuttingDown); + } + + let keyspace = self.collect_keyspace(end_lsn, ctx).await?; + let mut adaptor = TimelineAdaptor::new(self, (end_lsn, keyspace)); + let ctx_adaptor = RequestContextAdaptor(ctx.clone()); + + pageserver_compaction::compact_tiered::compact_tiered( + &mut adaptor, + end_lsn, + target_file_size, + fanout, + &ctx_adaptor, + ) + .await?; + + adaptor.flush_updates().await?; + Ok(()) + } +} + +struct TimelineAdaptor { + timeline: Arc, + + keyspace: (Lsn, KeySpace), + + new_deltas: Vec, + new_images: Vec, + layers_to_delete: Vec>, +} + +impl TimelineAdaptor { + pub fn new(timeline: &Arc, keyspace: (Lsn, KeySpace)) -> Self { + Self { + timeline: timeline.clone(), + keyspace, + new_images: Vec::new(), + new_deltas: Vec::new(), + layers_to_delete: Vec::new(), + } + } + + pub async fn flush_updates(&mut self) -> anyhow::Result<()> { + let layers_to_delete = { + let guard = self.timeline.layers.read().await; + self.layers_to_delete + .iter() + .map(|x| guard.get_from_desc(x)) + .collect::>() + }; + self.timeline + .finish_compact_batch(&self.new_deltas, &self.new_images, &layers_to_delete) + .await?; + self.new_images.clear(); + self.new_deltas.clear(); + self.layers_to_delete.clear(); + Ok(()) + } +} + +#[derive(Clone)] +struct ResidentDeltaLayer(ResidentLayer); +#[derive(Clone)] +struct ResidentImageLayer(ResidentLayer); + +#[async_trait] +impl CompactionJobExecutor for TimelineAdaptor { + type Key = crate::repository::Key; + + type Layer = OwnArc; + type DeltaLayer = ResidentDeltaLayer; + type ImageLayer = ResidentImageLayer; + + type RequestContext = RequestContextAdaptor; + + async fn get_layers( + &mut self, + key_range: &Range, + lsn_range: &Range, + _ctx: &RequestContextAdaptor, + ) -> anyhow::Result>> { + self.flush_updates().await?; + + let guard = self.timeline.layers.read().await; + let layer_map = guard.layer_map(); + + let result = layer_map + .iter_historic_layers() + .filter(|l| { + overlaps_with(&l.lsn_range, lsn_range) && overlaps_with(&l.key_range, key_range) + }) + .map(OwnArc) + .collect(); + Ok(result) + } + + async fn get_keyspace( + &mut self, + key_range: &Range, + lsn: Lsn, + _ctx: &RequestContextAdaptor, + ) -> anyhow::Result>> { + if lsn == self.keyspace.0 { + Ok(pageserver_compaction::helpers::intersect_keyspace( + &self.keyspace.1.ranges, + key_range, + )) + } else { + // The current compaction implementatin only ever requests the key space + // at the compaction end LSN. + anyhow::bail!("keyspace not available for requested lsn"); + } + } + + async fn downcast_delta_layer( + &self, + layer: &OwnArc, + ) -> anyhow::Result> { + // this is a lot more complex than a simple downcast... + if layer.is_delta() { + let l = { + let guard = self.timeline.layers.read().await; + guard.get_from_desc(layer) + }; + let result = l.download_and_keep_resident().await?; + + Ok(Some(ResidentDeltaLayer(result))) + } else { + Ok(None) + } + } + + async fn create_image( + &mut self, + lsn: Lsn, + key_range: &Range, + ctx: &RequestContextAdaptor, + ) -> anyhow::Result<()> { + Ok(self.create_image_impl(lsn, key_range, ctx).await?) 
+ } + + async fn create_delta( + &mut self, + lsn_range: &Range, + key_range: &Range, + input_layers: &[ResidentDeltaLayer], + ctx: &RequestContextAdaptor, + ) -> anyhow::Result<()> { + debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end); + + let mut all_entries = Vec::new(); + for dl in input_layers.iter() { + all_entries.extend(dl.load_keys(ctx).await?); + } + + // The current stdlib sorting implementation is designed in a way where it is + // particularly fast where the slice is made up of sorted sub-ranges. + all_entries.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn)); + + let mut writer = DeltaLayerWriter::new( + self.timeline.conf, + self.timeline.timeline_id, + self.timeline.tenant_shard_id, + key_range.start, + lsn_range.clone(), + ) + .await?; + + let mut dup_values = 0; + + // This iterator walks through all key-value pairs from all the layers + // we're compacting, in key, LSN order. + let mut prev: Option<(Key, Lsn)> = None; + for &DeltaEntry { + key, lsn, ref val, .. + } in all_entries.iter() + { + if prev == Some((key, lsn)) { + // This is a duplicate. Skip it. + // + // It can happen if compaction is interrupted after writing some + // layers but not all, and we are compacting the range again. + // The calculations in the algorithm assume that there are no + // duplicates, so the math on targeted file size is likely off, + // and we will create smaller files than expected. + dup_values += 1; + continue; + } + + let value = val.load(ctx).await?; + + writer.put_value(key, lsn, value).await?; + + prev = Some((key, lsn)); + } + + if dup_values > 0 { + warn!("delta layer created with {} duplicate values", dup_values); + } + + fail_point!("delta-layer-writer-fail-before-finish", |_| { + Err(anyhow::anyhow!( + "failpoint delta-layer-writer-fail-before-finish" + )) + }); + + let new_delta_layer = writer + .finish(prev.unwrap().0.next(), &self.timeline) + .await?; + + self.new_deltas.push(new_delta_layer); + Ok(()) + } + + async fn delete_layer( + &mut self, + layer: &OwnArc, + _ctx: &RequestContextAdaptor, + ) -> anyhow::Result<()> { + self.layers_to_delete.push(layer.clone().0); + Ok(()) + } +} + +impl TimelineAdaptor { + async fn create_image_impl( + &mut self, + lsn: Lsn, + key_range: &Range, + ctx: &RequestContextAdaptor, + ) -> Result<(), PageReconstructError> { + let timer = self.timeline.metrics.create_images_time_histo.start_timer(); + + let mut image_layer_writer = ImageLayerWriter::new( + self.timeline.conf, + self.timeline.timeline_id, + self.timeline.tenant_shard_id, + key_range, + lsn, + ) + .await?; + + fail_point!("image-layer-writer-fail-before-finish", |_| { + Err(PageReconstructError::Other(anyhow::anyhow!( + "failpoint image-layer-writer-fail-before-finish" + ))) + }); + let keyspace_ranges = self.get_keyspace(key_range, lsn, ctx).await?; + for range in &keyspace_ranges { + let mut key = range.start; + while key < range.end { + let img = match self.timeline.get(key, lsn, ctx).await { + Ok(img) => img, + Err(err) => { + // If we fail to reconstruct a VM or FSM page, we can zero the + // page without losing any actual user data. That seems better + // than failing repeatedly and getting stuck. + // + // We had a bug at one point, where we truncated the FSM and VM + // in the pageserver, but the Postgres didn't know about that + // and continued to generate incremental WAL records for pages + // that didn't exist in the pageserver. Trying to replay those + // WAL records failed to find the previous image of the page. 
+ // This special case allows us to recover from that situation. + // See https://github.com/neondatabase/neon/issues/2601. + // + // Unfortunately we cannot do this for the main fork, or for + // any metadata keys, keys, as that would lead to actual data + // loss. + if is_rel_fsm_block_key(key) || is_rel_vm_block_key(key) { + warn!("could not reconstruct FSM or VM key {key}, filling with zeros: {err:?}"); + ZERO_PAGE.clone() + } else { + return Err(err); + } + } + }; + image_layer_writer.put_image(key, img).await?; + key = key.next(); + } + } + let image_layer = image_layer_writer.finish(&self.timeline).await?; + + self.new_images.push(image_layer); + + timer.stop_and_record(); + + Ok(()) + } +} + +pub struct RequestContextAdaptor(pub RequestContext); + +impl std::ops::Deref for RequestContextAdaptor { + type Target = RequestContext; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl CompactionRequestContext for RequestContextAdaptor {} + +#[derive(Debug, Clone)] +pub struct OwnArc(pub Arc); + +impl Deref for OwnArc { + type Target = as Deref>::Target; + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl AsRef for OwnArc { + fn as_ref(&self) -> &T { + self.0.as_ref() + } +} + +impl CompactionLayer for OwnArc { + fn key_range(&self) -> &Range { + &self.key_range + } + fn lsn_range(&self) -> &Range { + &self.lsn_range + } + fn file_size(&self) -> u64 { + self.file_size + } + fn short_id(&self) -> std::string::String { + self.as_ref().short_id().to_string() + } + fn is_delta(&self) -> bool { + self.as_ref().is_delta() + } +} + +impl CompactionLayer for OwnArc { + fn key_range(&self) -> &Range { + &self.layer_desc().key_range + } + fn lsn_range(&self) -> &Range { + &self.layer_desc().lsn_range + } + fn file_size(&self) -> u64 { + self.layer_desc().file_size + } + fn short_id(&self) -> std::string::String { + self.layer_desc().short_id().to_string() + } + fn is_delta(&self) -> bool { + true + } +} + +use crate::tenant::timeline::DeltaEntry; + +impl CompactionLayer for ResidentDeltaLayer { + fn key_range(&self) -> &Range { + &self.0.layer_desc().key_range + } + fn lsn_range(&self) -> &Range { + &self.0.layer_desc().lsn_range + } + fn file_size(&self) -> u64 { + self.0.layer_desc().file_size + } + fn short_id(&self) -> std::string::String { + self.0.layer_desc().short_id().to_string() + } + fn is_delta(&self) -> bool { + true + } +} + +#[async_trait] +impl CompactionDeltaLayer for ResidentDeltaLayer { + type DeltaEntry<'a> = DeltaEntry<'a>; + + async fn load_keys<'a>( + &self, + ctx: &RequestContextAdaptor, + ) -> anyhow::Result>> { + self.0.load_keys(ctx).await + } +} + +impl CompactionLayer for ResidentImageLayer { + fn key_range(&self) -> &Range { + &self.0.layer_desc().key_range + } + fn lsn_range(&self) -> &Range { + &self.0.layer_desc().lsn_range + } + fn file_size(&self) -> u64 { + self.0.layer_desc().file_size + } + fn short_id(&self) -> std::string::String { + self.0.layer_desc().short_id().to_string() + } + fn is_delta(&self) -> bool { + false + } +} +impl CompactionImageLayer for ResidentImageLayer {} diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 43e035d303..6cae663842 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -160,6 +160,9 @@ def test_fully_custom_config(positive_env: NeonEnv): "compaction_target_size": 1048576, "checkpoint_distance": 10000, "checkpoint_timeout": "13m", + "compaction_algorithm": { + "kind": 
"Tiered", + }, "eviction_policy": { "kind": "LayerAccessThreshold", "period": "20s", From a8ec18c0f4ca9d5b31333d00cd30cf8b0053ee9e Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 27 Feb 2024 17:24:01 +0000 Subject: [PATCH 0288/1571] refactor: move storage controller API structs into pageserver_api (#6927) ## Problem This is a precursor to adding a convenience CLI for the storage controller. ## Summary of changes - move controller api structs into pageserver_api::controller_api to make them visible to other crates - rename pageserver_api::control_api to pageserver_api::upcall_api to match the /upcall/v1/ naming in the storage controller. Why here rather than a totally separate crate? It's convenient to have all the pageserver-related stuff in one place, and if we ever wanted to move it to a different crate it's super easy to do that later. --- control_plane/attachment_service/src/http.rs | 10 +- control_plane/attachment_service/src/node.rs | 2 +- .../attachment_service/src/persistence.rs | 2 +- .../attachment_service/src/reconciler.rs | 2 +- .../attachment_service/src/scheduler.rs | 2 +- .../attachment_service/src/service.rs | 18 +-- .../attachment_service/src/tenant_state.rs | 2 +- control_plane/src/attachment_service.rs | 126 +---------------- control_plane/src/bin/neon_local.rs | 7 +- control_plane/src/pageserver.rs | 3 +- libs/pageserver_api/src/controller_api.rs | 129 ++++++++++++++++++ libs/pageserver_api/src/lib.rs | 5 +- .../src/{control_api.rs => upcall_api.rs} | 0 pageserver/src/control_plane_client.rs | 4 +- 14 files changed, 165 insertions(+), 147 deletions(-) create mode 100644 libs/pageserver_api/src/controller_api.rs rename libs/pageserver_api/src/{control_api.rs => upcall_api.rs} (100%) diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs index d341187ef7..f1153c2c18 100644 --- a/control_plane/attachment_service/src/http.rs +++ b/control_plane/attachment_service/src/http.rs @@ -25,12 +25,12 @@ use utils::{ id::NodeId, }; -use pageserver_api::control_api::{ReAttachRequest, ValidateRequest}; - -use control_plane::attachment_service::{ - AttachHookRequest, InspectRequest, NodeConfigureRequest, NodeRegisterRequest, - TenantShardMigrateRequest, +use pageserver_api::controller_api::{ + NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest, }; +use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest}; + +use control_plane::attachment_service::{AttachHookRequest, InspectRequest}; /// State available to HTTP request handlers #[derive(Clone)] diff --git a/control_plane/attachment_service/src/node.rs b/control_plane/attachment_service/src/node.rs index 09162701ac..1f9dcef033 100644 --- a/control_plane/attachment_service/src/node.rs +++ b/control_plane/attachment_service/src/node.rs @@ -1,4 +1,4 @@ -use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy}; +use pageserver_api::controller_api::{NodeAvailability, NodeSchedulingPolicy}; use serde::Serialize; use utils::id::NodeId; diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs index 4f336093cf..1b98cc7655 100644 --- a/control_plane/attachment_service/src/persistence.rs +++ b/control_plane/attachment_service/src/persistence.rs @@ -6,10 +6,10 @@ use std::time::Duration; use self::split_state::SplitState; use camino::Utf8Path; use camino::Utf8PathBuf; -use control_plane::attachment_service::NodeSchedulingPolicy; use diesel::pg::PgConnection; use diesel::prelude::*; use 
diesel::Connection; +use pageserver_api::controller_api::NodeSchedulingPolicy; use pageserver_api::models::TenantConfig; use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId}; use serde::{Deserialize, Serialize}; diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs index 751b06f93a..ce91c1f5e9 100644 --- a/control_plane/attachment_service/src/reconciler.rs +++ b/control_plane/attachment_service/src/reconciler.rs @@ -1,6 +1,6 @@ use crate::persistence::Persistence; use crate::service; -use control_plane::attachment_service::NodeAvailability; +use pageserver_api::controller_api::NodeAvailability; use pageserver_api::models::{ LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, }; diff --git a/control_plane/attachment_service/src/scheduler.rs b/control_plane/attachment_service/src/scheduler.rs index 7059071bee..3224751e47 100644 --- a/control_plane/attachment_service/src/scheduler.rs +++ b/control_plane/attachment_service/src/scheduler.rs @@ -255,7 +255,7 @@ impl Scheduler { pub(crate) mod test_utils { use crate::node::Node; - use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy}; + use pageserver_api::controller_api::{NodeAvailability, NodeSchedulingPolicy}; use std::collections::HashMap; use utils::id::NodeId; /// Test helper: synthesize the requested number of nodes, all in active state. diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index 8a80d0c746..02c1a65545 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -9,19 +9,17 @@ use std::{ use anyhow::Context; use control_plane::attachment_service::{ - AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse, NodeAvailability, - NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, TenantCreateResponse, - TenantCreateResponseShard, TenantLocateResponse, TenantLocateResponseShard, - TenantShardMigrateRequest, TenantShardMigrateResponse, + AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse, }; use diesel::result::DatabaseErrorKind; use futures::{stream::FuturesUnordered, StreamExt}; use hyper::StatusCode; +use pageserver_api::controller_api::{ + NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, + TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse, + TenantLocateResponseShard, TenantShardMigrateRequest, TenantShardMigrateResponse, +}; use pageserver_api::{ - control_api::{ - ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, - ValidateResponse, ValidateResponseTenant, - }, models::{ self, LocationConfig, LocationConfigListResponse, LocationConfigMode, ShardParameters, TenantConfig, TenantCreateRequest, TenantLocationConfigRequest, @@ -29,6 +27,10 @@ use pageserver_api::{ TenantShardSplitResponse, TenantTimeTravelRequest, TimelineCreateRequest, TimelineInfo, }, shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId}, + upcall_api::{ + ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, + ValidateResponse, ValidateResponseTenant, + }, }; use pageserver_client::mgmt_api; use tokio_util::sync::CancellationToken; diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs index 02f0171c29..c14fe6699e 100644 --- a/control_plane/attachment_service/src/tenant_state.rs +++ 
b/control_plane/attachment_service/src/tenant_state.rs @@ -1,7 +1,7 @@ use std::{collections::HashMap, sync::Arc, time::Duration}; use crate::{metrics, persistence::TenantShardPersistence}; -use control_plane::attachment_service::NodeAvailability; +use pageserver_api::controller_api::NodeAvailability; use pageserver_api::{ models::{LocationConfig, LocationConfigMode, TenantConfig}, shard::{ShardIdentity, TenantShardId}, diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/attachment_service.rs index f0bee1ce08..0c416267fb 100644 --- a/control_plane/src/attachment_service.rs +++ b/control_plane/src/attachment_service.rs @@ -2,8 +2,12 @@ use crate::{background_process, local_env::LocalEnv}; use camino::{Utf8Path, Utf8PathBuf}; use hyper::Method; use pageserver_api::{ + controller_api::{ + NodeConfigureRequest, NodeRegisterRequest, TenantCreateResponse, TenantLocateResponse, + TenantShardMigrateRequest, TenantShardMigrateResponse, + }, models::{ - ShardParameters, TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse, + TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo, }, shard::TenantShardId, @@ -55,126 +59,6 @@ pub struct InspectResponse { pub attachment: Option<(u32, NodeId)>, } -#[derive(Serialize, Deserialize)] -pub struct TenantCreateResponseShard { - pub shard_id: TenantShardId, - pub node_id: NodeId, - pub generation: u32, -} - -#[derive(Serialize, Deserialize)] -pub struct TenantCreateResponse { - pub shards: Vec, -} - -#[derive(Serialize, Deserialize)] -pub struct NodeRegisterRequest { - pub node_id: NodeId, - - pub listen_pg_addr: String, - pub listen_pg_port: u16, - - pub listen_http_addr: String, - pub listen_http_port: u16, -} - -#[derive(Serialize, Deserialize)] -pub struct NodeConfigureRequest { - pub node_id: NodeId, - - pub availability: Option, - pub scheduling: Option, -} - -#[derive(Serialize, Deserialize, Debug)] -pub struct TenantLocateResponseShard { - pub shard_id: TenantShardId, - pub node_id: NodeId, - - pub listen_pg_addr: String, - pub listen_pg_port: u16, - - pub listen_http_addr: String, - pub listen_http_port: u16, -} - -#[derive(Serialize, Deserialize)] -pub struct TenantLocateResponse { - pub shards: Vec, - pub shard_params: ShardParameters, -} - -/// Explicitly migrating a particular shard is a low level operation -/// TODO: higher level "Reschedule tenant" operation where the request -/// specifies some constraints, e.g. asking it to get off particular node(s) -#[derive(Serialize, Deserialize, Debug)] -pub struct TenantShardMigrateRequest { - pub tenant_shard_id: TenantShardId, - pub node_id: NodeId, -} - -#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)] -pub enum NodeAvailability { - // Normal, happy state - Active, - // Offline: Tenants shouldn't try to attach here, but they may assume that their - // secondary locations on this node still exist. Newly added nodes are in this - // state until we successfully contact them. - Offline, -} - -impl FromStr for NodeAvailability { - type Err = anyhow::Error; - - fn from_str(s: &str) -> Result { - match s { - "active" => Ok(Self::Active), - "offline" => Ok(Self::Offline), - _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")), - } - } -} - -/// FIXME: this is a duplicate of the type in the attachment_service crate, because the -/// type needs to be defined with diesel traits in there. 
-#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)] -pub enum NodeSchedulingPolicy { - Active, - Filling, - Pause, - Draining, -} - -impl FromStr for NodeSchedulingPolicy { - type Err = anyhow::Error; - - fn from_str(s: &str) -> Result { - match s { - "active" => Ok(Self::Active), - "filling" => Ok(Self::Filling), - "pause" => Ok(Self::Pause), - "draining" => Ok(Self::Draining), - _ => Err(anyhow::anyhow!("Unknown scheduling state '{s}'")), - } - } -} - -impl From for String { - fn from(value: NodeSchedulingPolicy) -> String { - use NodeSchedulingPolicy::*; - match value { - Active => "active", - Filling => "filling", - Pause => "pause", - Draining => "draining", - } - .to_string() - } -} - -#[derive(Serialize, Deserialize, Debug)] -pub struct TenantShardMigrateResponse {} - impl AttachmentService { pub fn from_env(env: &LocalEnv) -> Self { let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone()) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 5c0d008943..cf647a5f9b 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -8,14 +8,15 @@ use anyhow::{anyhow, bail, Context, Result}; use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum}; use compute_api::spec::ComputeMode; -use control_plane::attachment_service::{ - AttachmentService, NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, -}; +use control_plane::attachment_service::AttachmentService; use control_plane::endpoint::ComputeControlPlane; use control_plane::local_env::{InitForceMode, LocalEnv}; use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR}; use control_plane::safekeeper::SafekeeperNode; use control_plane::{broker, local_env}; +use pageserver_api::controller_api::{ + NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, +}; use pageserver_api::models::{ ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo, }; diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 59cd4789a8..642f153f2d 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -17,6 +17,7 @@ use std::time::Duration; use anyhow::{bail, Context}; use camino::Utf8PathBuf; use futures::SinkExt; +use pageserver_api::controller_api::NodeRegisterRequest; use pageserver_api::models::{ self, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, TimelineInfo, }; @@ -30,7 +31,7 @@ use utils::{ lsn::Lsn, }; -use crate::attachment_service::{AttachmentService, NodeRegisterRequest}; +use crate::attachment_service::AttachmentService; use crate::local_env::PageServerConf; use crate::{background_process, local_env::LocalEnv}; diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs new file mode 100644 index 0000000000..64b70a1a51 --- /dev/null +++ b/libs/pageserver_api/src/controller_api.rs @@ -0,0 +1,129 @@ +use std::str::FromStr; + +/// Request/response types for the storage controller +/// API (`/control/v1` prefix). 
Implemented by the server +/// in [`attachment_service::http`] +use serde::{Deserialize, Serialize}; +use utils::id::NodeId; + +use crate::{models::ShardParameters, shard::TenantShardId}; + +#[derive(Serialize, Deserialize)] +pub struct TenantCreateResponseShard { + pub shard_id: TenantShardId, + pub node_id: NodeId, + pub generation: u32, +} + +#[derive(Serialize, Deserialize)] +pub struct TenantCreateResponse { + pub shards: Vec, +} + +#[derive(Serialize, Deserialize)] +pub struct NodeRegisterRequest { + pub node_id: NodeId, + + pub listen_pg_addr: String, + pub listen_pg_port: u16, + + pub listen_http_addr: String, + pub listen_http_port: u16, +} + +#[derive(Serialize, Deserialize)] +pub struct NodeConfigureRequest { + pub node_id: NodeId, + + pub availability: Option, + pub scheduling: Option, +} + +#[derive(Serialize, Deserialize, Debug)] +pub struct TenantLocateResponseShard { + pub shard_id: TenantShardId, + pub node_id: NodeId, + + pub listen_pg_addr: String, + pub listen_pg_port: u16, + + pub listen_http_addr: String, + pub listen_http_port: u16, +} + +#[derive(Serialize, Deserialize)] +pub struct TenantLocateResponse { + pub shards: Vec, + pub shard_params: ShardParameters, +} + +/// Explicitly migrating a particular shard is a low level operation +/// TODO: higher level "Reschedule tenant" operation where the request +/// specifies some constraints, e.g. asking it to get off particular node(s) +#[derive(Serialize, Deserialize, Debug)] +pub struct TenantShardMigrateRequest { + pub tenant_shard_id: TenantShardId, + pub node_id: NodeId, +} + +#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)] +pub enum NodeAvailability { + // Normal, happy state + Active, + // Offline: Tenants shouldn't try to attach here, but they may assume that their + // secondary locations on this node still exist. Newly added nodes are in this + // state until we successfully contact them. + Offline, +} + +impl FromStr for NodeAvailability { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + match s { + "active" => Ok(Self::Active), + "offline" => Ok(Self::Offline), + _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")), + } + } +} + +/// FIXME: this is a duplicate of the type in the attachment_service crate, because the +/// type needs to be defined with diesel traits in there. 
+#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)] +pub enum NodeSchedulingPolicy { + Active, + Filling, + Pause, + Draining, +} + +impl FromStr for NodeSchedulingPolicy { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + match s { + "active" => Ok(Self::Active), + "filling" => Ok(Self::Filling), + "pause" => Ok(Self::Pause), + "draining" => Ok(Self::Draining), + _ => Err(anyhow::anyhow!("Unknown scheduling state '{s}'")), + } + } +} + +impl From for String { + fn from(value: NodeSchedulingPolicy) -> String { + use NodeSchedulingPolicy::*; + match value { + Active => "active", + Filling => "filling", + Pause => "pause", + Draining => "draining", + } + .to_string() + } +} + +#[derive(Serialize, Deserialize, Debug)] +pub struct TenantShardMigrateResponse {} diff --git a/libs/pageserver_api/src/lib.rs b/libs/pageserver_api/src/lib.rs index b236b93428..1b948d60c3 100644 --- a/libs/pageserver_api/src/lib.rs +++ b/libs/pageserver_api/src/lib.rs @@ -2,13 +2,14 @@ #![deny(clippy::undocumented_unsafe_blocks)] use const_format::formatcp; -/// Public API types -pub mod control_api; +pub mod controller_api; pub mod key; pub mod keyspace; pub mod models; pub mod reltag; pub mod shard; +/// Public API types +pub mod upcall_api; pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000; pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); diff --git a/libs/pageserver_api/src/control_api.rs b/libs/pageserver_api/src/upcall_api.rs similarity index 100% rename from libs/pageserver_api/src/control_api.rs rename to libs/pageserver_api/src/upcall_api.rs diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs index 61c7d03408..3fcf3a983b 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/control_plane_client.rs @@ -2,10 +2,10 @@ use std::collections::HashMap; use futures::Future; use pageserver_api::{ - control_api::{ + shard::TenantShardId, + upcall_api::{ ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse, }, - shard::TenantShardId, }; use serde::{de::DeserializeOwned, Serialize}; use tokio_util::sync::CancellationToken; From e1b4d96b5b70f0a2a0830e8b46b3928b59ee3625 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 27 Feb 2024 21:18:46 +0200 Subject: [PATCH 0289/1571] Limit number of AUX files deltas to reduce reconstruct time (#6874) ## Problem After commit [840abe395413508db40d0428e30f09343c051fed] (store AUX files as deltas) we avoid quadratic growth of storage size when storing LR snapshots but get quadratic slowdown of reconstruct time. As a result storing 70k snapshots at my local Neon instance took more than 3 hours and starting node (creation of basecbackup): ~10 minutes. In prod 70k AUX files cause increase of startup time to 40 minutes: https://neondb.slack.com/archives/C03F5SM1N02/p1708513010480179 ## Summary of changes Enforce storing full AUX directory (some analog of FPI) each 1024 files. Time of creation 70k snapshots is reduced to 6 minutes and startup time - to 1.5 minutes (100 seconds). ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
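As a sketch of the mechanism in "Summary of changes" above (not the actual pageserver code, which lives in `DatadirModification::put_file` in the diff below): a counter of deltas written since the last full directory image decides whether the next AUX-file write emits a delta record or a full serialized directory, so reconstruction never replays more than `MAX_AUX_FILE_DELTAS` deltas. The struct and enum names here are illustrative only.

```rust
// Sketch only; the real logic is in DatadirModification::put_file below.
const MAX_AUX_FILE_DELTAS: usize = 1024;

struct AuxFilesState {
    n_deltas: usize, // deltas written since the last full directory image
}

enum AuxWrite {
    Delta,     // record just the changed file: cheap to write, replayed at read time
    FullImage, // serialize the whole directory (the "FPI" analog mentioned above)
}

fn next_aux_write(state: &mut AuxFilesState) -> AuxWrite {
    if state.n_deltas == MAX_AUX_FILE_DELTAS {
        state.n_deltas = 0;
        AuxWrite::FullImage
    } else {
        state.n_deltas += 1;
        AuxWrite::Delta
    }
}
```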
## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik --- pageserver/src/pgdatadir_mapping.rs | 52 ++++++++++++---------- pageserver/src/tenant/timeline.rs | 15 ++++++- test_runner/regress/test_layer_bloating.py | 2 + 3 files changed, 45 insertions(+), 24 deletions(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 65f8ddaab4..024e66d112 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -36,6 +36,8 @@ use tracing::{debug, trace, warn}; use utils::bin_ser::DeserializeError; use utils::{bin_ser::BeSer, lsn::Lsn}; +const MAX_AUX_FILE_DELTAS: usize = 1024; + #[derive(Debug)] pub enum LsnForTimestamp { /// Found commits both before and after the given timestamp @@ -157,7 +159,6 @@ impl Timeline { pending_updates: HashMap::new(), pending_deletions: Vec::new(), pending_nblocks: 0, - pending_aux_files: None, pending_directory_entries: Vec::new(), lsn, } @@ -873,11 +874,6 @@ pub struct DatadirModification<'a> { pending_deletions: Vec<(Range, Lsn)>, pending_nblocks: i64, - // If we already wrote any aux file changes in this modification, stash the latest dir. If set, - // [`Self::put_file`] may assume that it is safe to emit a delta rather than checking - // if AUX_FILES_KEY is already set. - pending_aux_files: Option, - /// For special "directory" keys that store key-value maps, track the size of the map /// if it was updated in this modification. pending_directory_entries: Vec<(DirectoryKind, usize)>, @@ -1401,19 +1397,28 @@ impl<'a> DatadirModification<'a> { Some(Bytes::copy_from_slice(content)) }; - let dir = if let Some(mut dir) = self.pending_aux_files.take() { + let n_files; + let mut aux_files = self.tline.aux_files.lock().await; + if let Some(mut dir) = aux_files.dir.take() { // We already updated aux files in `self`: emit a delta and update our latest value - - self.put( - AUX_FILES_KEY, - Value::WalRecord(NeonWalRecord::AuxFile { - file_path: file_path.clone(), - content: content.clone(), - }), - ); - - dir.upsert(file_path, content); - dir + dir.upsert(file_path.clone(), content.clone()); + n_files = dir.files.len(); + if aux_files.n_deltas == MAX_AUX_FILE_DELTAS { + self.put( + AUX_FILES_KEY, + Value::Image(Bytes::from( + AuxFilesDirectory::ser(&dir).context("serialize")?, + )), + ); + aux_files.n_deltas = 0; + } else { + self.put( + AUX_FILES_KEY, + Value::WalRecord(NeonWalRecord::AuxFile { file_path, content }), + ); + aux_files.n_deltas += 1; + } + aux_files.dir = Some(dir); } else { // Check if the AUX_FILES_KEY is initialized match self.get(AUX_FILES_KEY, ctx).await { @@ -1428,7 +1433,8 @@ impl<'a> DatadirModification<'a> { }), ); dir.upsert(file_path, content); - dir + n_files = dir.files.len(); + aux_files.dir = Some(dir); } Err( e @ (PageReconstructError::AncestorStopping(_) @@ -1455,14 +1461,14 @@ impl<'a> DatadirModification<'a> { AuxFilesDirectory::ser(&dir).context("serialize")?, )), ); - dir + n_files = 1; + aux_files.dir = Some(dir); } } - }; + } self.pending_directory_entries - .push((DirectoryKind::AuxFiles, dir.files.len())); - self.pending_aux_files = Some(dir); + .push((DirectoryKind::AuxFiles, n_files)); Ok(()) } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 59a7dcd4bd..b94ad5760a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -54,7 +54,7 @@ use std::{ ops::ControlFlow, }; -use 
crate::pgdatadir_mapping::DirectoryKind; +use crate::pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind}; use crate::tenant::timeline::logical_size::CurrentLogicalSize; use crate::tenant::{ layer_map::{LayerMap, SearchResult}, @@ -174,6 +174,11 @@ pub struct TimelineResources { >, } +pub(crate) struct AuxFilesState { + pub(crate) dir: Option, + pub(crate) n_deltas: usize, +} + pub struct Timeline { conf: &'static PageServerConf, tenant_conf: Arc>, @@ -357,6 +362,9 @@ pub struct Timeline { timeline_get_throttle: Arc< crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>, >, + + /// Keep aux directory cache to avoid it's reconstruction on each update + pub(crate) aux_files: tokio::sync::Mutex, } pub struct WalReceiverInfo { @@ -1693,6 +1701,11 @@ impl Timeline { gc_lock: tokio::sync::Mutex::default(), timeline_get_throttle: resources.timeline_get_throttle, + + aux_files: tokio::sync::Mutex::new(AuxFilesState { + dir: None, + n_deltas: 0, + }), }; result.repartition_threshold = result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE; diff --git a/test_runner/regress/test_layer_bloating.py b/test_runner/regress/test_layer_bloating.py index bf5834b665..2fdee89389 100644 --- a/test_runner/regress/test_layer_bloating.py +++ b/test_runner/regress/test_layer_bloating.py @@ -6,6 +6,7 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, logical_replication_sync, + wait_for_last_flush_lsn, ) from fixtures.pg_version import PgVersion @@ -52,6 +53,7 @@ def test_layer_bloating(neon_simple_env: NeonEnv, vanilla_pg): cur.execute("select create_snapshots(10000)") # Wait logical replication to sync logical_replication_sync(vanilla_pg, endpoint) + wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, timeline) time.sleep(10) # Check layer file sizes From 1b1320a2632aa117131f475d8d9cad08ae9466a6 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 28 Feb 2024 00:02:44 +0200 Subject: [PATCH 0290/1571] fix: allow evicting wanted deleted layers (#6931) Not allowing evicting wanted deleted layers is something I've forgotten to implement on #5645. This PR makes it possible to evict such layers, which should reduce the amount of hanging evictions. Fixes: #6928 Co-authored-by: Christian Schwarz --- pageserver/src/tenant/storage_layer/layer.rs | 11 +- .../src/tenant/storage_layer/layer/tests.rs | 263 ++++++++++++++++++ 2 files changed, 267 insertions(+), 7 deletions(-) create mode 100644 pageserver/src/tenant/storage_layer/layer/tests.rs diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index cc5b7ade6a..61eba07be6 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -29,6 +29,9 @@ use super::{ use utils::generation::Generation; +#[cfg(test)] +mod tests; + /// A Layer contains all data in a "rectangle" consisting of a range of keys and /// range of LSNs. /// @@ -1049,16 +1052,10 @@ impl LayerInner { /// `DownloadedLayer` is being dropped, so it calls this method. fn on_downloaded_layer_drop(self: Arc, version: usize) { - let delete = self.wanted_deleted.load(Ordering::Acquire); let evict = self.wanted_evicted.load(Ordering::Acquire); let can_evict = self.have_remote_client; - if delete { - // do nothing now, only in LayerInner::drop -- this was originally implemented because - // we could had already scheduled the deletion at the time. 
- // - // FIXME: this is not true anymore, we can safely evict wanted deleted files. - } else if can_evict && evict { + if can_evict && evict { let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, %version); // downgrade for queueing, in case there's a tear down already ongoing we should not diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs new file mode 100644 index 0000000000..01c62b6f83 --- /dev/null +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -0,0 +1,263 @@ +use futures::StreamExt; +use tokio::task::JoinSet; +use utils::{ + completion::{self, Completion}, + id::TimelineId, +}; + +use super::*; +use crate::task_mgr::BACKGROUND_RUNTIME; +use crate::tenant::harness::TenantHarness; + +/// This test demonstrates a previous hang when a eviction and deletion were requested at the same +/// time. Now both of them complete per Arc drop semantics. +#[tokio::test(start_paused = true)] +async fn evict_and_wait_on_wanted_deleted() { + // this is the runtime on which Layer spawns the blocking tasks on + let handle = BACKGROUND_RUNTIME.handle(); + + let h = TenantHarness::create("evict_and_wait_on_wanted_deleted").unwrap(); + utils::logging::replace_panic_hook_with_tracing_panic_hook().forget(); + let (tenant, ctx) = h.load().await; + + let timeline = tenant + .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .await + .unwrap(); + + let layer = { + let mut layers = { + let layers = timeline.layers.read().await; + layers.resident_layers().collect::>().await + }; + + assert_eq!(layers.len(), 1); + + layers.swap_remove(0) + }; + + // setup done + + let resident = layer.keep_resident().await.unwrap(); + + { + let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait()); + + // drive the future to await on the status channel + tokio::time::timeout(std::time::Duration::from_secs(3600), &mut evict_and_wait) + .await + .expect_err("should had been a timeout since we are holding the layer resident"); + + layer.delete_on_drop(); + + drop(resident); + + // make sure the eviction task gets to run + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await; + + let resident = layer.keep_resident().await; + assert!( + matches!(resident, Ok(None)), + "keep_resident should not have re-initialized: {resident:?}" + ); + + evict_and_wait + .await + .expect("evict_and_wait should had succeeded"); + + // works as intended + } + + // assert that once we remove the `layer` from the layer map and drop our reference, + // the deletion of the layer in remote_storage happens. + { + let mut layers = timeline.layers.write().await; + layers.finish_gc_timeline(&[layer]); + } + + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await; + + assert_eq!(1, LAYER_IMPL_METRICS.started_deletes.get()); + assert_eq!(1, LAYER_IMPL_METRICS.completed_deletes.get()); + assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get()); + assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get()); +} + +/// This test shows that ensures we are able to read the layer while the layer eviction has been +/// started but not completed due to spawn_blocking pool being blocked. +/// +/// Here `Layer::keep_resident` is used to "simulate" reads, because it cannot download. 
+#[tokio::test(start_paused = true)] +async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() { + // this is the runtime on which Layer spawns the blocking tasks on + let handle = BACKGROUND_RUNTIME.handle(); + let h = TenantHarness::create("residency_check_while_evict_and_wait_on_clogged_spawn_blocking") + .unwrap(); + let (tenant, ctx) = h.load().await; + + let timeline = tenant + .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .await + .unwrap(); + + let layer = { + let mut layers = { + let layers = timeline.layers.read().await; + layers.resident_layers().collect::>().await + }; + + assert_eq!(layers.len(), 1); + + layers.swap_remove(0) + }; + + // setup done + + let resident = layer.keep_resident().await.unwrap(); + + let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait()); + + // drive the future to await on the status channel + tokio::time::timeout(std::time::Duration::from_secs(3600), &mut evict_and_wait) + .await + .expect_err("should had been a timeout since we are holding the layer resident"); + assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get()); + + // clog up BACKGROUND_RUNTIME spawn_blocking + let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await; + + // now the eviction cannot proceed because the threads are consumed while completion exists + drop(resident); + + // because no actual eviction happened, we get to just reinitialize the DownloadedLayer + layer + .keep_resident() + .await + .expect("keep_resident should had reinitialized without downloading") + .expect("ResidentLayer"); + + // because the keep_resident check alters wanted evicted without sending a message, we will + // never get completed + let e = tokio::time::timeout(std::time::Duration::from_secs(3600), &mut evict_and_wait) + .await + .expect("no timeout, because keep_resident re-initialized") + .expect_err("eviction should not have succeeded because re-initialized"); + + // works as intended: evictions lose to "downloads" + assert!(matches!(e, EvictionError::Downloaded), "{e:?}"); + assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get()); + + // this is not wrong: the eviction is technically still "on the way" as it's still queued + // because spawn_blocking is clogged up + assert_eq!( + 0, + LAYER_IMPL_METRICS + .cancelled_evictions + .values() + .map(|ctr| ctr.get()) + .sum::() + ); + + let mut second_eviction = std::pin::pin!(layer.evict_and_wait()); + + tokio::time::timeout(std::time::Duration::from_secs(3600), &mut second_eviction) + .await + .expect_err("timeout because spawn_blocking is clogged"); + + // in this case we don't leak started evictions, but I think there is still a chance of that + // happening, because we could have upgrades race multiple evictions while only one of them + // happens? + assert_eq!(2, LAYER_IMPL_METRICS.started_evictions.get()); + + helper.release().await; + + tokio::time::timeout(std::time::Duration::from_secs(3600), &mut second_eviction) + .await + .expect("eviction goes through now that spawn_blocking is unclogged") + .expect("eviction should succeed, because version matches"); + + assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get()); + + // now we finally can observe the original spawn_blocking failing + // it would had been possible to observe it earlier, but here it is guaranteed to have + // happened. 
+ assert_eq!( + 1, + LAYER_IMPL_METRICS + .cancelled_evictions + .values() + .map(|ctr| ctr.get()) + .sum::() + ); +} + +struct SpawnBlockingPoolHelper { + awaited_by_spawn_blocking_tasks: Completion, + blocking_tasks: JoinSet<()>, +} + +impl SpawnBlockingPoolHelper { + /// All `crate::task_mgr::BACKGROUND_RUNTIME` spawn_blocking threads will be consumed until + /// release is called. + /// + /// In the tests this can be used to ensure something cannot be started on the target runtimes + /// spawn_blocking pool. + /// + /// This should be no issue nowdays, because nextest runs each test in it's own process. + async fn consume_all_spawn_blocking_threads(handle: &tokio::runtime::Handle) -> Self { + let (completion, barrier) = completion::channel(); + let (tx, mut rx) = tokio::sync::mpsc::channel(8); + + let assumed_max_blocking_threads = 512; + + let mut blocking_tasks = JoinSet::new(); + + for _ in 0..assumed_max_blocking_threads { + let barrier = barrier.clone(); + let tx = tx.clone(); + blocking_tasks.spawn_blocking_on( + move || { + tx.blocking_send(()).unwrap(); + drop(tx); + tokio::runtime::Handle::current().block_on(barrier.wait()); + }, + handle, + ); + } + + drop(barrier); + + for _ in 0..assumed_max_blocking_threads { + rx.recv().await.unwrap(); + } + + SpawnBlockingPoolHelper { + awaited_by_spawn_blocking_tasks: completion, + blocking_tasks, + } + } + + /// Release all previously blocked spawn_blocking threads + async fn release(self) { + let SpawnBlockingPoolHelper { + awaited_by_spawn_blocking_tasks, + mut blocking_tasks, + } = self; + + drop(awaited_by_spawn_blocking_tasks); + + while let Some(res) = blocking_tasks.join_next().await { + res.expect("none of the tasks should had panicked"); + } + } + + /// In the tests it is used as an easy way of making sure something scheduled on the target + /// runtimes `spawn_blocking` has completed, because it must've been scheduled and completed + /// before our tasks have a chance to schedule and complete. + async fn consume_and_release_all_of_spawn_blocking_threads(handle: &tokio::runtime::Handle) { + Self::consume_all_spawn_blocking_threads(handle) + .await + .release() + .await + } +} From c3a40a06f3b35058acfa63490052c21309b9f745 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 28 Feb 2024 09:52:22 +0000 Subject: [PATCH 0291/1571] test: wait for storage controller readiness (#6930) ## Problem Starting up the pageserver before the storage controller is ready can lead to a round of reconciliation, which leads to the previous tenant being shut down. This disturbs some tests. ## Summary of changes Wait for the storage controller to become ready on neon env start-up. 
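For illustration, a minimal standalone sketch of the start-up wait this adds (hypothetical helper; the actual change below simply switches the retry closure passed to `background_process` from polling `status()` to polling `ready()`, i.e. GET `/ready`):

```rust
use std::time::Duration;

// Hypothetical helper, shown only to illustrate the start-up ordering:
// keep polling the controller's readiness endpoint before any pageserver
// is started, so no pageserver registers while reconciliation is pending.
async fn wait_until_ready<F, Fut>(ready: F, attempts: u32, pause: Duration) -> anyhow::Result<()>
where
    F: Fn() -> Fut,
    Fut: std::future::Future<Output = anyhow::Result<()>>,
{
    for _ in 0..attempts {
        if ready().await.is_ok() {
            return Ok(());
        }
        tokio::time::sleep(pause).await;
    }
    anyhow::bail!("storage controller did not become ready in time")
}
```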
Closes https://github.com/neondatabase/neon/issues/6724 --- control_plane/src/attachment_service.rs | 6 +++--- test_runner/regress/test_sharding_service.py | 3 --- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/attachment_service.rs index 0c416267fb..92342b478b 100644 --- a/control_plane/src/attachment_service.rs +++ b/control_plane/src/attachment_service.rs @@ -308,7 +308,7 @@ impl AttachmentService { )], background_process::InitialPidFile::Create(self.pid_file()), || async { - match self.status().await { + match self.ready().await { Ok(_) => Ok(true), Err(_) => Ok(false), } @@ -522,8 +522,8 @@ impl AttachmentService { } #[instrument(skip(self))] - pub async fn status(&self) -> anyhow::Result<()> { - self.dispatch::<(), ()>(Method::GET, "status".to_string(), None) + pub async fn ready(&self) -> anyhow::Result<()> { + self.dispatch::<(), ()>(Method::GET, "ready".to_string(), None) .await } diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py index b4f1f49543..6ed49d7fd6 100644 --- a/test_runner/regress/test_sharding_service.py +++ b/test_runner/regress/test_sharding_service.py @@ -179,9 +179,6 @@ def test_node_status_after_restart( env.attachment_service.stop() env.attachment_service.start() - # Initially readiness check should fail because we're trying to connect to the offline node - assert env.attachment_service.ready() is False - def is_ready(): assert env.attachment_service.ready() is True From fcb77f3d8f71faf28a34f524d3be344527b169f1 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 28 Feb 2024 12:58:13 +0200 Subject: [PATCH 0292/1571] build: add a timeout for test-images (#6942) normal runtime seems to be 3min, add 20min timeout. --- .github/workflows/build_and_test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 5def619c07..0e67259b3f 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -937,6 +937,7 @@ jobs: fi - name: Verify docker-compose example + timeout-minutes: 20 run: env REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh - name: Print logs and clean up From b6bd75964f25aadbeaca5a055cd67dadd9c4ed62 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 28 Feb 2024 12:38:23 +0100 Subject: [PATCH 0293/1571] Revert "pageserver: roll open layer in timeline writer (#6661)" + PR #6842 (#6938) This reverts commits 587cb705b898565d459d044df84d1ac2633f00bf (PR #6661) and fcbe9fb1840b7628fd242eec3bfd0df83535d0f7 (PR #6842). Conflicts: pageserver/src/tenant.rs pageserver/src/tenant/timeline.rs The conflicts were with * pageserver: adjust checkpoint distance for sharded tenants (#6852) * pageserver: add vectored get implementation (#6576) Also we had to keep the `allowed_errors` to make `test_forward_compatibility` happy, see the PR thread on GitHub for details. 
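For reference, the roll conditions that this revert moves back into `Timeline::check_checkpoint_distance` (invoked from the WAL receiver, per the diff below) are, in condensed form and with arithmetic simplified to unsigned integers:

```rust
use std::time::{Duration, Instant};

// Condensed sketch of the restored roll decision; the real code operates on
// Lsn, the shard identity and the tenant config.
fn should_roll(
    last_lsn: u64,           // last record LSN
    last_freeze_at: u64,     // LSN at which we last rolled
    open_layer_size: u64,    // bytes in the currently open in-memory layer
    last_freeze_ts: Instant, // time of the last roll
    checkpoint_distance: u64,
    checkpoint_timeout: Duration,
    shard_count: u64,        // writes are spread across shards
) -> bool {
    let distance = last_lsn.saturating_sub(last_freeze_at);
    // 1. LSN distance bounds how much WAL the safekeepers must retain.
    distance >= checkpoint_distance * shard_count
        // 2. Open layer size bounds the size of L0 layer files.
        || open_layer_size > checkpoint_distance
        // 3. Time since the last roll lets safekeepers regard us as caught up.
        || (distance > 0 && last_freeze_ts.elapsed() >= checkpoint_timeout)
}
```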
--- pageserver/src/pgdatadir_mapping.rs | 17 +- pageserver/src/tenant.rs | 32 +- .../tenant/storage_layer/inmemory_layer.rs | 38 +- pageserver/src/tenant/timeline.rs | 327 ++++++------------ .../walreceiver/walreceiver_connection.rs | 27 ++ test_runner/performance/test_layer_map.py | 4 +- 6 files changed, 180 insertions(+), 265 deletions(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 024e66d112..7be08f86b1 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -15,7 +15,6 @@ use crate::walrecord::NeonWalRecord; use anyhow::{ensure, Context}; use bytes::{Buf, Bytes, BytesMut}; use enum_map::Enum; -use itertools::Itertools; use pageserver_api::key::{ dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key, @@ -1499,7 +1498,7 @@ impl<'a> DatadirModification<'a> { return Ok(()); } - let mut writer = self.tline.writer().await; + let writer = self.tline.writer().await; // Flush relation and SLRU data blocks, keep metadata. let mut retained_pending_updates = HashMap::<_, Vec<_>>::new(); @@ -1538,23 +1537,13 @@ impl<'a> DatadirModification<'a> { /// All the modifications in this atomic update are stamped by the specified LSN. /// pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> { - let mut writer = self.tline.writer().await; + let writer = self.tline.writer().await; let pending_nblocks = self.pending_nblocks; self.pending_nblocks = 0; if !self.pending_updates.is_empty() { - let prev_pending_updates = std::mem::take(&mut self.pending_updates); - - // The put_batch call below expects expects the inputs to be sorted by Lsn, - // so we do that first. 
- let lsn_ordered_batch: Vec<(Key, Lsn, Value)> = prev_pending_updates - .into_iter() - .map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (key, lsn, val))) - .kmerge_by(|lhs, rhs| lhs.1 .0 < rhs.1 .0) - .collect(); - - writer.put_batch(lsn_ordered_batch, ctx).await?; + writer.put_batch(&self.pending_updates, ctx).await?; self.pending_updates.clear(); } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 15dd125de2..96b78de50c 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3852,7 +3852,7 @@ mod tests { .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; - let mut writer = tline.writer().await; + let writer = tline.writer().await; writer .put( *TEST_KEY, @@ -3864,7 +3864,7 @@ mod tests { writer.finish_write(Lsn(0x10)); drop(writer); - let mut writer = tline.writer().await; + let writer = tline.writer().await; writer .put( *TEST_KEY, @@ -3930,7 +3930,7 @@ mod tests { let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; - let mut writer = tline.writer().await; + let writer = tline.writer().await; #[allow(non_snake_case)] let TEST_KEY_A: Key = Key::from_hex("110000000033333333444444445500000001").unwrap(); @@ -3964,7 +3964,7 @@ mod tests { let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) .expect("Should have a local timeline"); - let mut new_writer = newtline.writer().await; + let new_writer = newtline.writer().await; new_writer .put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"), &ctx) .await?; @@ -3996,7 +3996,7 @@ mod tests { ) -> anyhow::Result<()> { let mut lsn = start_lsn; { - let mut writer = tline.writer().await; + let writer = tline.writer().await; // Create a relation on the timeline writer .put( @@ -4021,7 +4021,7 @@ mod tests { } tline.freeze_and_flush().await?; { - let mut writer = tline.writer().await; + let writer = tline.writer().await; writer .put( *TEST_KEY, @@ -4384,7 +4384,7 @@ mod tests { .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; - let mut writer = tline.writer().await; + let writer = tline.writer().await; writer .put( *TEST_KEY, @@ -4401,7 +4401,7 @@ mod tests { .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) .await?; - let mut writer = tline.writer().await; + let writer = tline.writer().await; writer .put( *TEST_KEY, @@ -4418,7 +4418,7 @@ mod tests { .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) .await?; - let mut writer = tline.writer().await; + let writer = tline.writer().await; writer .put( *TEST_KEY, @@ -4435,7 +4435,7 @@ mod tests { .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) .await?; - let mut writer = tline.writer().await; + let writer = tline.writer().await; writer .put( *TEST_KEY, @@ -4492,7 +4492,7 @@ mod tests { for _ in 0..repeat { for _ in 0..key_count { test_key.field6 = blknum; - let mut writer = timeline.writer().await; + let writer = timeline.writer().await; writer .put( test_key, @@ -4663,7 +4663,7 @@ mod tests { for blknum in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); test_key.field6 = blknum as u32; - let mut writer = tline.writer().await; + let writer = tline.writer().await; writer .put( test_key, @@ -4684,7 +4684,7 @@ mod tests { lsn = Lsn(lsn.0 + 0x10); let blknum = thread_rng().gen_range(0..NUM_KEYS); test_key.field6 = blknum as u32; - let mut writer = tline.writer().await; + let writer = tline.writer().await; writer .put( test_key, @@ -4752,7 +4752,7 @@ mod tests { for blknum in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); test_key.field6 
= blknum as u32; - let mut writer = tline.writer().await; + let writer = tline.writer().await; writer .put( test_key, @@ -4781,7 +4781,7 @@ mod tests { lsn = Lsn(lsn.0 + 0x10); let blknum = thread_rng().gen_range(0..NUM_KEYS); test_key.field6 = blknum as u32; - let mut writer = tline.writer().await; + let writer = tline.writer().await; writer .put( test_key, @@ -4858,7 +4858,7 @@ mod tests { lsn = Lsn(lsn.0 + 0x10); let blknum = thread_rng().gen_range(0..NUM_KEYS); test_key.field6 = blknum as u32; - let mut writer = tline.writer().await; + let writer = tline.writer().await; writer .put( test_key, diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 5f1db21d49..e7da28b8d6 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -336,17 +336,32 @@ impl InMemoryLayer { /// Common subroutine of the public put_wal_record() and put_page_image() functions. /// Adds the page version to the in-memory tree - pub(crate) async fn put_value( &self, key: Key, lsn: Lsn, - buf: &[u8], + val: &Value, ctx: &RequestContext, ) -> Result<()> { let mut inner = self.inner.write().await; self.assert_writable(); - self.put_value_locked(&mut inner, key, lsn, buf, ctx).await + self.put_value_locked(&mut inner, key, lsn, val, ctx).await + } + + pub(crate) async fn put_values( + &self, + values: &HashMap>, + ctx: &RequestContext, + ) -> Result<()> { + let mut inner = self.inner.write().await; + self.assert_writable(); + for (key, vals) in values { + for (lsn, val) in vals { + self.put_value_locked(&mut inner, *key, *lsn, val, ctx) + .await?; + } + } + Ok(()) } async fn put_value_locked( @@ -354,16 +369,22 @@ impl InMemoryLayer { locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>, key: Key, lsn: Lsn, - buf: &[u8], + val: &Value, ctx: &RequestContext, ) -> Result<()> { trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn); let off = { + // Avoid doing allocations for "small" values. 
+ // In the regression test suite, the limit of 256 avoided allocations in 95% of cases: + // https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061 + let mut buf = smallvec::SmallVec::<[u8; 256]>::new(); + buf.clear(); + val.ser_into(&mut buf)?; locked_inner .file .write_blob( - buf, + &buf, &RequestContextBuilder::extend(ctx) .page_content_kind(PageContentKind::InMemoryLayer) .build(), @@ -391,12 +412,7 @@ impl InMemoryLayer { pub async fn freeze(&self, end_lsn: Lsn) { let inner = self.inner.write().await; - assert!( - self.start_lsn < end_lsn, - "{} >= {}", - self.start_lsn, - end_lsn - ); + assert!(self.start_lsn < end_lsn); self.end_lsn.set(end_lsn).expect("end_lsn set only once"); for vec_map in inner.index.values() { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b94ad5760a..4d820f7b13 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -27,18 +27,6 @@ use pageserver_api::{ }; use rand::Rng; use serde_with::serde_as; -use storage_broker::BrokerClientChannel; -use tokio::{ - runtime::Handle, - sync::{oneshot, watch}, -}; -use tokio_util::sync::CancellationToken; -use tracing::*; -use utils::{ - bin_ser::BeSer, - sync::gate::{Gate, GateGuard}, -}; - use std::ops::{Deref, Range}; use std::pin::pin; use std::sync::atomic::Ordering as AtomicOrdering; @@ -53,6 +41,14 @@ use std::{ cmp::{max, min, Ordering}, ops::ControlFlow, }; +use storage_broker::BrokerClientChannel; +use tokio::{ + runtime::Handle, + sync::{oneshot, watch}, +}; +use tokio_util::sync::CancellationToken; +use tracing::*; +use utils::sync::gate::{Gate, GateGuard}; use crate::pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind}; use crate::tenant::timeline::logical_size::CurrentLogicalSize; @@ -273,7 +269,7 @@ pub struct Timeline { /// Locked automatically by [`TimelineWriter`] and checkpointer. /// Must always be acquired before the layer map/individual layer lock /// to avoid deadlock. - write_lock: tokio::sync::Mutex>, + write_lock: tokio::sync::Mutex<()>, /// Used to avoid multiple `flush_loop` tasks running pub(super) flush_loop_state: Mutex, @@ -1225,10 +1221,58 @@ impl Timeline { pub(crate) async fn writer(&self) -> TimelineWriter<'_> { TimelineWriter { tl: self, - write_guard: self.write_lock.lock().await, + _write_guard: self.write_lock.lock().await, } } + /// Check if more than 'checkpoint_distance' of WAL has been accumulated in + /// the in-memory layer, and initiate flushing it if so. + /// + /// Also flush after a period of time without new data -- it helps + /// safekeepers to regard pageserver as caught up and suspend activity. + pub(crate) async fn check_checkpoint_distance(self: &Arc) -> anyhow::Result<()> { + let last_lsn = self.get_last_record_lsn(); + let open_layer_size = { + let guard = self.layers.read().await; + let layers = guard.layer_map(); + let Some(open_layer) = layers.open_layer.as_ref() else { + return Ok(()); + }; + open_layer.size().await? + }; + let last_freeze_at = self.last_freeze_at.load(); + let last_freeze_ts = *(self.last_freeze_ts.read().unwrap()); + let distance = last_lsn.widening_sub(last_freeze_at); + // Rolling the open layer can be triggered by: + // 1. The distance from the last LSN we rolled at. This bounds the amount of WAL that + // the safekeepers need to store. For sharded tenants, we multiply by shard count to + // account for how writes are distributed across shards: we expect each node to consume + // 1/count of the LSN on average. + // 2. 
The size of the currently open layer. + // 3. The time since the last roll. It helps safekeepers to regard pageserver as caught + // up and suspend activity. + if (distance + >= self.get_checkpoint_distance() as i128 * self.shard_identity.count.count() as i128) + || open_layer_size > self.get_checkpoint_distance() + || (distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout()) + { + info!( + "check_checkpoint_distance {}, layer size {}, elapsed since last flush {:?}", + distance, + open_layer_size, + last_freeze_ts.elapsed() + ); + + self.freeze_inmem_layer(true).await; + self.last_freeze_at.store(last_lsn); + *(self.last_freeze_ts.write().unwrap()) = Instant::now(); + + // Wake up the layer flusher + self.flush_frozen_layers(); + } + Ok(()) + } + pub(crate) fn activate( self: &Arc, broker_client: BrokerClientChannel, @@ -1659,7 +1703,7 @@ impl Timeline { layer_flush_start_tx, layer_flush_done_tx, - write_lock: tokio::sync::Mutex::new(None), + write_lock: tokio::sync::Mutex::new(()), gc_info: std::sync::RwLock::new(GcInfo { retain_lsns: Vec::new(), @@ -2991,6 +3035,43 @@ impl Timeline { Ok(layer) } + async fn put_value( + &self, + key: Key, + lsn: Lsn, + val: &Value, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + //info!("PUT: key {} at {}", key, lsn); + let layer = self.get_layer_for_write(lsn).await?; + layer.put_value(key, lsn, val, ctx).await?; + Ok(()) + } + + async fn put_values( + &self, + values: &HashMap>, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + // Pick the first LSN in the batch to get the layer to write to. + for lsns in values.values() { + if let Some((lsn, _)) = lsns.first() { + let layer = self.get_layer_for_write(*lsn).await?; + layer.put_values(values, ctx).await?; + break; + } + } + Ok(()) + } + + async fn put_tombstones(&self, tombstones: &[(Range, Lsn)]) -> anyhow::Result<()> { + if let Some((_, lsn)) = tombstones.first() { + let layer = self.get_layer_for_write(*lsn).await?; + layer.put_tombstones(tombstones).await?; + } + Ok(()) + } + pub(crate) fn finish_write(&self, new_lsn: Lsn) { assert!(new_lsn.is_aligned()); @@ -3001,20 +3082,14 @@ impl Timeline { async fn freeze_inmem_layer(&self, write_lock_held: bool) { // Freeze the current open in-memory layer. It will be written to disk on next // iteration. - let _write_guard = if write_lock_held { None } else { Some(self.write_lock.lock().await) }; - - self.freeze_inmem_layer_at(self.get_last_record_lsn()).await; - } - - async fn freeze_inmem_layer_at(&self, at: Lsn) { let mut guard = self.layers.write().await; guard - .try_freeze_in_memory_layer(at, &self.last_freeze_at) + .try_freeze_in_memory_layer(self.get_last_record_lsn(), &self.last_freeze_at) .await; } @@ -4979,43 +5054,13 @@ fn layer_traversal_error(msg: String, path: Vec) -> PageRecon PageReconstructError::from(msg) } -struct TimelineWriterState { - open_layer: Arc, - current_size: u64, - // Previous Lsn which passed through - prev_lsn: Option, - // Largest Lsn which passed through the current writer - max_lsn: Option, - // Cached details of the last freeze. Avoids going trough the atomic/lock on every put. - cached_last_freeze_at: Lsn, - cached_last_freeze_ts: Instant, -} - -impl TimelineWriterState { - fn new( - open_layer: Arc, - current_size: u64, - last_freeze_at: Lsn, - last_freeze_ts: Instant, - ) -> Self { - Self { - open_layer, - current_size, - prev_lsn: None, - max_lsn: None, - cached_last_freeze_at: last_freeze_at, - cached_last_freeze_ts: last_freeze_ts, - } - } -} - /// Various functions to mutate the timeline. 
// TODO Currently, Deref is used to allow easy access to read methods from this trait. // This is probably considered a bad practice in Rust and should be fixed eventually, // but will cause large code changes. pub(crate) struct TimelineWriter<'a> { tl: &'a Timeline, - write_guard: tokio::sync::MutexGuard<'a, Option>, + _write_guard: tokio::sync::MutexGuard<'a, ()>, } impl Deref for TimelineWriter<'_> { @@ -5026,193 +5071,31 @@ impl Deref for TimelineWriter<'_> { } } -impl Drop for TimelineWriter<'_> { - fn drop(&mut self) { - self.write_guard.take(); - } -} - -enum OpenLayerAction { - Roll, - Open, - None, -} - impl<'a> TimelineWriter<'a> { /// Put a new page version that can be constructed from a WAL record /// /// This will implicitly extend the relation, if the page is beyond the /// current end-of-file. pub(crate) async fn put( - &mut self, + &self, key: Key, lsn: Lsn, value: &Value, ctx: &RequestContext, ) -> anyhow::Result<()> { - // Avoid doing allocations for "small" values. - // In the regression test suite, the limit of 256 avoided allocations in 95% of cases: - // https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061 - let mut buf = smallvec::SmallVec::<[u8; 256]>::new(); - buf.clear(); - value.ser_into(&mut buf)?; - let buf_size: u64 = buf.len().try_into().expect("oversized value buf"); - - let action = self.get_open_layer_action(lsn, buf_size); - let layer = self.handle_open_layer_action(lsn, action).await?; - let res = layer.put_value(key, lsn, &buf, ctx).await; - - if res.is_ok() { - // Update the current size only when the entire write was ok. - // In case of failures, we may have had partial writes which - // render the size tracking out of sync. That's ok because - // the checkpoint distance should be significantly smaller - // than the S3 single shot upload limit of 5GiB. 
- let state = self.write_guard.as_mut().unwrap(); - - state.current_size += buf_size; - state.prev_lsn = Some(lsn); - state.max_lsn = std::cmp::max(state.max_lsn, Some(lsn)); - } - - res + self.tl.put_value(key, lsn, value, ctx).await } - async fn handle_open_layer_action( - &mut self, - at: Lsn, - action: OpenLayerAction, - ) -> anyhow::Result<&Arc> { - match action { - OpenLayerAction::Roll => { - let max_lsn = self.write_guard.as_ref().unwrap().max_lsn.unwrap(); - self.tl.freeze_inmem_layer_at(max_lsn).await; - - let now = Instant::now(); - *(self.last_freeze_ts.write().unwrap()) = now; - - self.tl.flush_frozen_layers(); - - let current_size = self.write_guard.as_ref().unwrap().current_size; - if current_size > self.get_checkpoint_distance() { - warn!("Flushed oversized open layer with size {}", current_size) - } - - assert!(self.write_guard.is_some()); - - let layer = self.tl.get_layer_for_write(at).await?; - let initial_size = layer.size().await?; - self.write_guard.replace(TimelineWriterState::new( - layer, - initial_size, - Lsn(max_lsn.0 + 1), - now, - )); - } - OpenLayerAction::Open => { - assert!(self.write_guard.is_none()); - - let layer = self.tl.get_layer_for_write(at).await?; - let initial_size = layer.size().await?; - - let last_freeze_at = self.last_freeze_at.load(); - let last_freeze_ts = *self.last_freeze_ts.read().unwrap(); - self.write_guard.replace(TimelineWriterState::new( - layer, - initial_size, - last_freeze_at, - last_freeze_ts, - )); - } - OpenLayerAction::None => { - assert!(self.write_guard.is_some()); - } - } - - Ok(&self.write_guard.as_ref().unwrap().open_layer) - } - - fn get_open_layer_action(&self, lsn: Lsn, new_value_size: u64) -> OpenLayerAction { - let state = &*self.write_guard; - let Some(state) = &state else { - return OpenLayerAction::Open; - }; - - if state.prev_lsn == Some(lsn) { - // Rolling mid LSN is not supported by downstream code. - // Hence, only roll at LSN boundaries. - return OpenLayerAction::None; - } - - let distance = lsn.widening_sub(state.cached_last_freeze_at); - let proposed_open_layer_size = state.current_size + new_value_size; - - // Rolling the open layer can be triggered by: - // 1. The distance from the last LSN we rolled at. This bounds the amount of WAL that - // the safekeepers need to store. For sharded tenants, we multiply by shard count to - // account for how writes are distributed across shards: we expect each node to consume - // 1/count of the LSN on average. - // 2. The size of the currently open layer. - // 3. The time since the last roll. It helps safekeepers to regard pageserver as caught - // up and suspend activity. - if distance - >= self.get_checkpoint_distance() as i128 * self.shard_identity.count.count() as i128 - { - info!( - "Will roll layer at {} with layer size {} due to LSN distance ({})", - lsn, state.current_size, distance - ); - - OpenLayerAction::Roll - } else if state.current_size > 0 - && proposed_open_layer_size >= self.get_checkpoint_distance() - { - info!( - "Will roll layer at {} with layer size {} due to layer size ({})", - lsn, state.current_size, proposed_open_layer_size - ); - - OpenLayerAction::Roll - } else if distance > 0 - && state.cached_last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() - { - info!( - "Will roll layer at {} with layer size {} due to time since last flush ({:?})", - lsn, - state.current_size, - state.cached_last_freeze_ts.elapsed() - ); - - OpenLayerAction::Roll - } else { - OpenLayerAction::None - } - } - - /// Put a batch keys at the specified Lsns. 
- /// - /// The batch should be sorted by Lsn such that it's safe - /// to roll the open layer mid batch. pub(crate) async fn put_batch( - &mut self, - batch: Vec<(Key, Lsn, Value)>, + &self, + batch: &HashMap>, ctx: &RequestContext, ) -> anyhow::Result<()> { - for (key, lsn, val) in batch { - self.put(key, lsn, &val, ctx).await? - } - - Ok(()) + self.tl.put_values(batch, ctx).await } - pub(crate) async fn delete_batch(&mut self, batch: &[(Range, Lsn)]) -> anyhow::Result<()> { - if let Some((_, lsn)) = batch.first() { - let action = self.get_open_layer_action(*lsn, 0); - let layer = self.handle_open_layer_action(*lsn, action).await?; - layer.put_tombstones(batch).await?; - } - - Ok(()) + pub(crate) async fn delete_batch(&self, batch: &[(Range, Lsn)]) -> anyhow::Result<()> { + self.tl.put_tombstones(batch).await } /// Track the end of the latest digested WAL record. diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 0333fcac67..9cb53f46d1 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -343,6 +343,23 @@ pub(super) async fn handle_walreceiver_connection( modification.commit(&ctx).await?; uncommitted_records = 0; filtered_records = 0; + + // + // We should check checkpoint distance after appending each ingest_batch_size bytes because otherwise + // layer size can become much larger than `checkpoint_distance`. + // It can append because wal-sender is sending WAL using 125kb chucks and some WAL records can cause writing large + // amount of data to key-value storage. So performing this check only after processing + // all WAL records in the chunk, can cause huge L0 layer files. + // + timeline + .check_checkpoint_distance() + .await + .with_context(|| { + format!( + "Failed to check checkpoint distance for timeline {}", + timeline.timeline_id + ) + })?; } } @@ -389,6 +406,16 @@ pub(super) async fn handle_walreceiver_connection( } } + timeline + .check_checkpoint_distance() + .await + .with_context(|| { + format!( + "Failed to check checkpoint distance for timeline {}", + timeline.timeline_id + ) + })?; + if let Some(last_lsn) = status_update { let timeline_remote_consistent_lsn = timeline .get_remote_consistent_lsn_visible() diff --git a/test_runner/performance/test_layer_map.py b/test_runner/performance/test_layer_map.py index 9b20954d45..6bd0d85fa2 100644 --- a/test_runner/performance/test_layer_map.py +++ b/test_runner/performance/test_layer_map.py @@ -17,10 +17,10 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark): tenant, _ = env.neon_cli.create_tenant( conf={ "gc_period": "0s", - "checkpoint_distance": "16384", + "checkpoint_distance": "8192", "compaction_period": "1 s", "compaction_threshold": "1", - "compaction_target_size": "16384", + "compaction_target_size": "8192", } ) From 2b11466b590b90dcd5fd73924d82f1e00cbf1991 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 28 Feb 2024 12:06:00 +0000 Subject: [PATCH 0294/1571] pageserver: optimise disk io for vectored get (#6780) ## Problem The vectored read path proposed in https://github.com/neondatabase/neon/pull/6576 seems to be functionally correct, but in my testing (see below) it is about 10-20% slower than the naive sequential vectored implementation. ## Summary of changes There's three parts to this PR: 1. Supporting vectored blob reads. 
This is actually trickier than it sounds because on disk blobs are prefixed with a variable length size header. Since the blobs are not necessarily fixed size, we need to juggle the offsets such that the callers can retrieve the blobs from the resulting buffer. 2. Merge disk read requests issued by the vectored read path up to a maximum size. Again, the merging is complicated by the fact that blobs are not fixed size. We keep track of the begin and end offset of each blob and pass them into the vectored blob reader. In turn, the reader will return a buffer and the offsets at which the blobs begin and end. 3. A benchmark for basebackup requests against tenant with large SLRU block counts is added. This required a small change to pagebench and a new config variable for the pageserver which toggles the vectored get validation. We can probably optimise things further by adding a little bit of concurrency for our IO. In principle, it's as simple as spawning a task which deals with issuing IO and doing the serialisation and handling on the parent task which receives input via a channel. --- pageserver/ctl/src/layer_map_analyzer.rs | 10 +- pageserver/ctl/src/layers.rs | 10 +- pageserver/pagebench/src/cmd/basebackup.rs | 17 +- pageserver/pagebench/src/main.rs | 1 - pageserver/pagebench/src/util/connstring.rs | 8 - pageserver/src/basebackup.rs | 17 +- pageserver/src/config.rs | 59 +++ pageserver/src/tenant.rs | 1 + pageserver/src/tenant/block_io.rs | 20 +- pageserver/src/tenant/storage_layer.rs | 16 +- .../src/tenant/storage_layer/delta_layer.rs | 278 ++++++----- .../src/tenant/storage_layer/image_layer.rs | 165 +++++-- pageserver/src/tenant/storage_layer/layer.rs | 31 +- pageserver/src/tenant/timeline.rs | 9 +- pageserver/src/tenant/vectored_blob_io.rs | 436 ++++++++++++++++++ pageserver/src/virtual_file.rs | 52 ++- test_runner/fixtures/neon_fixtures.py | 7 + .../pagebench/test_large_slru_basebackup.py | 195 ++++++++ ...er_max_throughput_getpage_at_latest_lsn.py | 149 +++--- test_runner/performance/pageserver/util.py | 28 +- 20 files changed, 1201 insertions(+), 308 deletions(-) delete mode 100644 pageserver/pagebench/src/util/connstring.rs create mode 100644 pageserver/src/tenant/vectored_blob_io.rs create mode 100644 test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs index 42c4e9ff48..c4c282f33d 100644 --- a/pageserver/ctl/src/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -12,7 +12,7 @@ use std::collections::BinaryHeap; use std::ops::Range; use std::{fs, str}; -use pageserver::page_cache::PAGE_SZ; +use pageserver::page_cache::{self, PAGE_SZ}; use pageserver::repository::{Key, KEY_SIZE}; use pageserver::tenant::block_io::FileBlockReader; use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection}; @@ -100,13 +100,15 @@ pub(crate) fn parse_filename(name: &str) -> Option { // Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH" async fn get_holes(path: &Utf8Path, max_holes: usize, ctx: &RequestContext) -> Result> { - let file = FileBlockReader::new(VirtualFile::open(path).await?); - let summary_blk = file.read_blk(0, ctx).await?; + let file = VirtualFile::open(path).await?; + let file_id = page_cache::next_file_id(); + let block_reader = FileBlockReader::new(&file, file_id); + let summary_blk = block_reader.read_blk(0, ctx).await?; let actual_summary = Summary::des_prefix(summary_blk.as_ref())?; let tree_reader 
= DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( actual_summary.index_start_blk, actual_summary.index_root_blk, - file, + block_reader, ); // min-heap (reserve space for one more element added before eviction) let mut heap: BinaryHeap = BinaryHeap::with_capacity(max_holes + 1); diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index 27efa6d028..be8f91675d 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -61,13 +61,15 @@ async fn read_delta_file(path: impl AsRef, ctx: &RequestContext) -> Result let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path"); virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); page_cache::init(100); - let file = FileBlockReader::new(VirtualFile::open(path).await?); - let summary_blk = file.read_blk(0, ctx).await?; + let file = VirtualFile::open(path).await?; + let file_id = page_cache::next_file_id(); + let block_reader = FileBlockReader::new(&file, file_id); + let summary_blk = block_reader.read_blk(0, ctx).await?; let actual_summary = Summary::des_prefix(summary_blk.as_ref())?; let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( actual_summary.index_start_blk, actual_summary.index_root_blk, - &file, + &block_reader, ); // TODO(chi): dedup w/ `delta_layer.rs` by exposing the API. let mut all = vec![]; @@ -83,7 +85,7 @@ async fn read_delta_file(path: impl AsRef, ctx: &RequestContext) -> Result ctx, ) .await?; - let cursor = BlockCursor::new_fileblockreader(&file); + let cursor = BlockCursor::new_fileblockreader(&block_reader); for (k, v) in all { let value = cursor.read_blob(v.pos(), ctx).await?; println!("key:{} value_len:{}", k, value.len()); diff --git a/pageserver/pagebench/src/cmd/basebackup.rs b/pageserver/pagebench/src/cmd/basebackup.rs index 2d61b0e252..55844be041 100644 --- a/pageserver/pagebench/src/cmd/basebackup.rs +++ b/pageserver/pagebench/src/cmd/basebackup.rs @@ -8,7 +8,7 @@ use utils::lsn::Lsn; use rand::prelude::*; use tokio::sync::Barrier; use tokio::task::JoinSet; -use tracing::{debug, info, instrument}; +use tracing::{info, instrument}; use std::collections::HashMap; use std::num::NonZeroUsize; @@ -25,8 +25,8 @@ use crate::util::{request_stats, tokio_thread_local_stats}; pub(crate) struct Args { #[clap(long, default_value = "http://localhost:9898")] mgmt_api_endpoint: String, - #[clap(long, default_value = "localhost:64000")] - page_service_host_port: String, + #[clap(long, default_value = "postgres://postgres@localhost:64000")] + page_service_connstring: String, #[clap(long)] pageserver_jwt: Option, #[clap(long, default_value = "1")] @@ -230,12 +230,9 @@ async fn client( ) { start_work_barrier.wait().await; - let client = pageserver_client::page_service::Client::new(crate::util::connstring::connstring( - &args.page_service_host_port, - args.pageserver_jwt.as_deref(), - )) - .await - .unwrap(); + let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone()) + .await + .unwrap(); while let Some(Work { lsn, gzip }) = work.recv().await { let start = Instant::now(); @@ -263,7 +260,7 @@ async fn client( } }) .await; - debug!("basebackup size is {} bytes", size.load(Ordering::Relaxed)); + info!("basebackup size is {} bytes", size.load(Ordering::Relaxed)); let elapsed = start.elapsed(); live_stats.inc(); STATS.with(|stats| { diff --git a/pageserver/pagebench/src/main.rs b/pageserver/pagebench/src/main.rs index 9fa77f0671..5d688ed2d1 100644 --- a/pageserver/pagebench/src/main.rs +++ b/pageserver/pagebench/src/main.rs @@ -3,7 +3,6 @@ 
use utils::logging; /// Re-usable pieces of code that aren't CLI-specific. mod util { - pub(crate) mod connstring; pub(crate) mod request_stats; #[macro_use] pub(crate) mod tokio_thread_local_stats; diff --git a/pageserver/pagebench/src/util/connstring.rs b/pageserver/pagebench/src/util/connstring.rs deleted file mode 100644 index 07a0ff042d..0000000000 --- a/pageserver/pagebench/src/util/connstring.rs +++ /dev/null @@ -1,8 +0,0 @@ -pub(crate) fn connstring(host_port: &str, jwt: Option<&str>) -> String { - let colon_and_jwt = if let Some(jwt) = jwt { - format!(":{jwt}") // TODO: urlescape - } else { - String::new() - }; - format!("postgres://postgres{colon_and_jwt}@{host_port}") -} diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index c862816b80..0479d05f8f 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -143,6 +143,7 @@ where ar: &'a mut Builder<&'b mut W>, buf: Vec, current_segment: Option<(SlruKind, u32)>, + total_blocks: usize, } impl<'a, 'b, W> SlruSegmentsBuilder<'a, 'b, W> @@ -154,6 +155,7 @@ where ar, buf: Vec::new(), current_segment: None, + total_blocks: 0, } } @@ -199,7 +201,8 @@ where let header = new_tar_header(&segname, self.buf.len() as u64)?; self.ar.append(&header, self.buf.as_slice()).await?; - trace!("Added to basebackup slru {} relsize {}", segname, nblocks); + self.total_blocks += nblocks; + debug!("Added to basebackup slru {} relsize {}", segname, nblocks); self.buf.clear(); @@ -207,11 +210,15 @@ where } async fn finish(mut self) -> anyhow::Result<()> { - if self.current_segment.is_none() || self.buf.is_empty() { - return Ok(()); - } + let res = if self.current_segment.is_none() || self.buf.is_empty() { + Ok(()) + } else { + self.flush().await + }; - self.flush().await + info!("Collected {} SLRU blocks", self.total_blocks); + + res } } diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index b0d828d066..d18b8d6885 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -34,6 +34,7 @@ use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig; use crate::tenant::config::TenantConf; use crate::tenant::config::TenantConfOpt; use crate::tenant::timeline::GetVectoredImpl; +use crate::tenant::vectored_blob_io::MaxVectoredReadBytes; use crate::tenant::{ TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME, }; @@ -87,6 +88,10 @@ pub mod defaults { pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential"; + pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB + + pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true; + /// /// Default built-in configuration file. 
/// @@ -126,6 +131,10 @@ pub mod defaults { #get_vectored_impl = '{DEFAULT_GET_VECTORED_IMPL}' +#max_vectored_read_bytes = '{DEFAULT_MAX_VECTORED_READ_BYTES}' + +#validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}' + [tenant_config] #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} @@ -262,6 +271,10 @@ pub struct PageServerConf { pub virtual_file_io_engine: virtual_file::IoEngineKind, pub get_vectored_impl: GetVectoredImpl, + + pub max_vectored_read_bytes: MaxVectoredReadBytes, + + pub validate_vectored_get: bool, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -350,6 +363,10 @@ struct PageServerConfigBuilder { virtual_file_io_engine: BuilderValue, get_vectored_impl: BuilderValue, + + max_vectored_read_bytes: BuilderValue, + + validate_vectored_get: BuilderValue, } impl Default for PageServerConfigBuilder { @@ -429,6 +446,10 @@ impl Default for PageServerConfigBuilder { virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()), get_vectored_impl: Set(DEFAULT_GET_VECTORED_IMPL.parse().unwrap()), + max_vectored_read_bytes: Set(MaxVectoredReadBytes( + NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(), + )), + validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET), } } } @@ -593,6 +614,14 @@ impl PageServerConfigBuilder { self.get_vectored_impl = BuilderValue::Set(value); } + pub fn get_max_vectored_read_bytes(&mut self, value: MaxVectoredReadBytes) { + self.max_vectored_read_bytes = BuilderValue::Set(value); + } + + pub fn get_validate_vectored_get(&mut self, value: bool) { + self.validate_vectored_get = BuilderValue::Set(value); + } + pub fn build(self) -> anyhow::Result { let concurrent_tenant_warmup = self .concurrent_tenant_warmup @@ -706,6 +735,12 @@ impl PageServerConfigBuilder { get_vectored_impl: self .get_vectored_impl .ok_or(anyhow!("missing get_vectored_impl"))?, + max_vectored_read_bytes: self + .max_vectored_read_bytes + .ok_or(anyhow!("missing max_vectored_read_bytes"))?, + validate_vectored_get: self + .validate_vectored_get + .ok_or(anyhow!("missing validate_vectored_get"))?, }) } } @@ -952,6 +987,15 @@ impl PageServerConf { "get_vectored_impl" => { builder.get_vectored_impl(parse_toml_from_str("get_vectored_impl", item)?) } + "max_vectored_read_bytes" => { + let bytes = parse_toml_u64("max_vectored_read_bytes", item)? as usize; + builder.get_max_vectored_read_bytes( + MaxVectoredReadBytes( + NonZeroUsize::new(bytes).expect("Max byte size of vectored read must be greater than 0"))) + } + "validate_vectored_get" => { + builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?) 
+ } _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -1027,6 +1071,11 @@ impl PageServerConf { ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(), + max_vectored_read_bytes: MaxVectoredReadBytes( + NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) + .expect("Invalid default constant"), + ), + validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, } } } @@ -1261,6 +1310,11 @@ background_task_maximum_delay = '334 s' ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(), + max_vectored_read_bytes: MaxVectoredReadBytes( + NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) + .expect("Invalid default constant") + ), + validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, }, "Correct defaults should be used when no config values are provided" ); @@ -1326,6 +1380,11 @@ background_task_maximum_delay = '334 s' ingest_batch_size: 100, virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(), + max_vectored_read_bytes: MaxVectoredReadBytes( + NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) + .expect("Invalid default constant") + ), + validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, }, "Should be able to parse all basic config values correctly" ); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 96b78de50c..6a63a2adeb 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -145,6 +145,7 @@ macro_rules! pausable_failpoint { pub mod blob_io; pub mod block_io; +pub mod vectored_blob_io; pub mod disk_btree; pub(crate) mod ephemeral_file; diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index 1b6bccc120..37c84be342 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -5,7 +5,7 @@ use super::ephemeral_file::EphemeralFile; use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner}; use crate::context::RequestContext; -use crate::page_cache::{self, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ}; +use crate::page_cache::{self, FileId, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ}; use crate::virtual_file::VirtualFile; use bytes::Bytes; use std::ops::Deref; @@ -78,7 +78,7 @@ impl<'a> Deref for BlockLease<'a> { /// /// Unlike traits, we also support the read function to be async though. pub(crate) enum BlockReaderRef<'a> { - FileBlockReader(&'a FileBlockReader), + FileBlockReader(&'a FileBlockReader<'a>), EphemeralFile(&'a EphemeralFile), Adapter(Adapter<&'a DeltaLayerInner>), #[cfg(test)] @@ -160,17 +160,15 @@ impl<'a> BlockCursor<'a> { /// /// The file is assumed to be immutable. This doesn't provide any functions /// for modifying the file, nor for invalidating the cache if it is modified. -pub struct FileBlockReader { - pub file: VirtualFile, +pub struct FileBlockReader<'a> { + pub file: &'a VirtualFile, /// Unique ID of this file, used as key in the page cache. 
file_id: page_cache::FileId, } -impl FileBlockReader { - pub fn new(file: VirtualFile) -> Self { - let file_id = page_cache::next_file_id(); - +impl<'a> FileBlockReader<'a> { + pub fn new(file: &'a VirtualFile, file_id: FileId) -> Self { FileBlockReader { file_id, file } } @@ -190,11 +188,11 @@ impl FileBlockReader { /// Returns a "lease" object that can be used to /// access to the contents of the page. (For the page cache, the /// lease object represents a lock on the buffer.) - pub async fn read_blk( + pub async fn read_blk<'b>( &self, blknum: u32, ctx: &RequestContext, - ) -> Result { + ) -> Result, std::io::Error> { let cache = page_cache::get(); match cache .read_immutable_buf(self.file_id, blknum, ctx) @@ -215,7 +213,7 @@ impl FileBlockReader { } } -impl BlockReader for FileBlockReader { +impl BlockReader for FileBlockReader<'_> { fn block_cursor(&self) -> BlockCursor<'_> { BlockCursor::new(BlockReaderRef::FileBlockReader(self)) } diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 73c018db31..9de820912e 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -209,8 +209,7 @@ impl Default for ValuesReconstructState { pub(crate) enum ReadableLayerDesc { Persistent { desc: PersistentLayerDesc, - lsn_floor: Lsn, - lsn_ceil: Lsn, + lsn_range: Range, }, InMemory { handle: InMemoryLayerHandle, @@ -309,14 +308,14 @@ impl Eq for ReadableLayerDescOrdered {} impl ReadableLayerDesc { pub(crate) fn get_lsn_floor(&self) -> Lsn { match self { - ReadableLayerDesc::Persistent { lsn_floor, .. } => *lsn_floor, + ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.start, ReadableLayerDesc::InMemory { handle, .. } => handle.get_lsn_floor(), } } pub(crate) fn get_lsn_ceil(&self) -> Lsn { match self { - ReadableLayerDesc::Persistent { lsn_ceil, .. } => *lsn_ceil, + ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.end, ReadableLayerDesc::InMemory { lsn_ceil, .. } => *lsn_ceil, } } @@ -329,10 +328,15 @@ impl ReadableLayerDesc { ctx: &RequestContext, ) -> Result<(), GetVectoredError> { match self { - ReadableLayerDesc::Persistent { desc, lsn_ceil, .. } => { + ReadableLayerDesc::Persistent { desc, lsn_range } => { let layer = layer_manager.get_from_desc(desc); layer - .get_values_reconstruct_data(keyspace, *lsn_ceil, reconstruct_state, ctx) + .get_values_reconstruct_data( + keyspace, + lsn_range.clone(), + reconstruct_state, + ctx, + ) .await } ReadableLayerDesc::InMemory { handle, lsn_ceil } => { diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index e636073113..5eaf1cc1ce 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -29,25 +29,28 @@ //! 
use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; -use crate::page_cache::PAGE_SZ; +use crate::page_cache::{self, FileId, PAGE_SZ}; use crate::repository::{Key, Value, KEY_SIZE}; use crate::tenant::blob_io::BlobWriter; use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader}; use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; use crate::tenant::timeline::GetVectoredError; +use crate::tenant::vectored_blob_io::{ + BlobFlag, MaxVectoredReadBytes, VectoredBlobReader, VectoredRead, VectoredReadPlanner, +}; use crate::tenant::{PageReconstructError, Timeline}; use crate::virtual_file::{self, VirtualFile}; use crate::{walrecord, TEMP_FILE_SUFFIX}; use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; use anyhow::{anyhow, bail, ensure, Context, Result}; +use bytes::BytesMut; use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::LayerAccessKind; use pageserver_api::shard::TenantShardId; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; -use std::collections::BTreeMap; use std::fs::File; use std::io::SeekFrom; use std::ops::Range; @@ -63,8 +66,7 @@ use utils::{ }; use super::{ - AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer, ValueReconstructSituation, - ValuesReconstructState, + AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer, ValuesReconstructState, }; /// @@ -214,8 +216,10 @@ pub struct DeltaLayerInner { index_start_blk: u32, index_root_blk: u32, - /// Reader object for reading blocks from the file. - file: FileBlockReader, + file: VirtualFile, + file_id: FileId, + + max_vectored_read_bytes: Option, } impl std::fmt::Debug for DeltaLayerInner { @@ -297,7 +301,7 @@ impl DeltaLayer { async fn load_inner(&self, ctx: &RequestContext) -> Result> { let path = self.path(); - let loaded = DeltaLayerInner::load(&path, None, ctx) + let loaded = DeltaLayerInner::load(&path, None, None, ctx) .await .and_then(|res| res)?; @@ -665,16 +669,16 @@ impl DeltaLayer { where F: Fn(Summary) -> Summary, { - let file = VirtualFile::open_with_options( + let mut file = VirtualFile::open_with_options( path, virtual_file::OpenOptions::new().read(true).write(true), ) .await .with_context(|| format!("Failed to open file '{}'", path))?; - let file = FileBlockReader::new(file); - let summary_blk = file.read_blk(0, ctx).await?; + let file_id = page_cache::next_file_id(); + let block_reader = FileBlockReader::new(&file, file_id); + let summary_blk = block_reader.read_blk(0, ctx).await?; let actual_summary = Summary::des_prefix(summary_blk.as_ref()).context("deserialize")?; - let mut file = file.file; if actual_summary.magic != DELTA_FILE_MAGIC { return Err(RewriteSummaryError::MagicMismatch); } @@ -698,15 +702,18 @@ impl DeltaLayerInner { pub(super) async fn load( path: &Utf8Path, summary: Option, + max_vectored_read_bytes: Option, ctx: &RequestContext, ) -> Result, anyhow::Error> { let file = match VirtualFile::open(path).await { Ok(file) => file, Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))), }; - let file = FileBlockReader::new(file); + let file_id = page_cache::next_file_id(); - let summary_blk = match file.read_blk(0, ctx).await { + let block_reader = FileBlockReader::new(&file, file_id); + + let summary_blk = match block_reader.read_blk(0, ctx).await { 
Ok(blk) => blk, Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))), }; @@ -730,8 +737,10 @@ impl DeltaLayerInner { Ok(Ok(DeltaLayerInner { file, + file_id, index_start_blk: actual_summary.index_start_blk, index_root_blk: actual_summary.index_root_blk, + max_vectored_read_bytes, })) } @@ -744,11 +753,11 @@ impl DeltaLayerInner { ) -> anyhow::Result { let mut need_image = true; // Scan the page versions backwards, starting from `lsn`. - let file = &self.file; + let block_reader = FileBlockReader::new(&self.file, self.file_id); let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( self.index_start_blk, self.index_root_blk, - file, + &block_reader, ); let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1)); @@ -782,19 +791,19 @@ impl DeltaLayerInner { .build(); // Ok, 'offsets' now contains the offsets of all the entries we need to read - let cursor = file.block_cursor(); + let cursor = block_reader.block_cursor(); let mut buf = Vec::new(); for (entry_lsn, pos) in offsets { cursor .read_blob_into_buf(pos, &mut buf, ctx) .await .with_context(|| { - format!("Failed to read blob from virtual file {}", file.file.path) + format!("Failed to read blob from virtual file {}", self.file.path) })?; let val = Value::des(&buf).with_context(|| { format!( "Failed to deserialize file blob from virtual file {}", - file.file.path + self.file.path ) })?; match val { @@ -834,133 +843,181 @@ impl DeltaLayerInner { pub(super) async fn get_values_reconstruct_data( &self, keyspace: KeySpace, - end_lsn: Lsn, + lsn_range: Range, reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result<(), GetVectoredError> { - let file = &self.file; + let reads = self + .plan_reads(keyspace, lsn_range, reconstruct_state, ctx) + .await + .map_err(GetVectoredError::Other)?; + + self.do_reads_and_update_state(reads, reconstruct_state) + .await; + + Ok(()) + } + + async fn plan_reads( + &self, + keyspace: KeySpace, + lsn_range: Range, + reconstruct_state: &mut ValuesReconstructState, + ctx: &RequestContext, + ) -> anyhow::Result> { + let mut planner = VectoredReadPlanner::new( + self.max_vectored_read_bytes + .expect("Layer is loaded with max vectored bytes config") + .0 + .into(), + ); + + let block_reader = FileBlockReader::new(&self.file, self.file_id); let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( self.index_start_blk, self.index_root_blk, - file, + block_reader, ); - let mut offsets: BTreeMap> = BTreeMap::new(); - for range in keyspace.ranges.iter() { - let mut ignore_key = None; + let mut range_end_handled = false; - // Scan the page versions backwards, starting from the last key in the range. - // to collect all the offsets at which need to be read. 
- let end_key = DeltaKey::from_key_lsn(&range.end, Lsn(end_lsn.0 - 1)); + let start_key = DeltaKey::from_key_lsn(&range.start, lsn_range.start); tree_reader .visit( - &end_key.0, - VisitDirection::Backwards, + &start_key.0, + VisitDirection::Forwards, |raw_key, value| { let key = Key::from_slice(&raw_key[..KEY_SIZE]); - let entry_lsn = DeltaKey::extract_lsn_from_buf(raw_key); - - if entry_lsn >= end_lsn { - return true; - } - - if key < range.start { - return false; - } - - if key >= range.end { - return true; - } - - if Some(key) == ignore_key { - return true; - } - - if let Some(cached_lsn) = reconstruct_state.get_cached_lsn(&key) { - if entry_lsn <= cached_lsn { - return key != range.start; - } - } - + let lsn = DeltaKey::extract_lsn_from_buf(raw_key); let blob_ref = BlobRef(value); - let lsns_at = offsets.entry(key).or_default(); - lsns_at.push((entry_lsn, blob_ref.pos())); - if blob_ref.will_init() { - if key == range.start { - return false; + assert!(key >= range.start && lsn >= lsn_range.start); + + let cached_lsn = reconstruct_state.get_cached_lsn(&key); + let flag = { + if cached_lsn >= Some(lsn) { + BlobFlag::Ignore + } else if blob_ref.will_init() { + BlobFlag::Replaces } else { - ignore_key = Some(key); - return true; + BlobFlag::None } - } + }; - true + if key >= range.end || (key.next() == range.end && lsn >= lsn_range.end) { + planner.handle_range_end(blob_ref.pos()); + range_end_handled = true; + false + } else { + planner.handle(key, lsn, blob_ref.pos(), flag); + true + } }, &RequestContextBuilder::extend(ctx) .page_content_kind(PageContentKind::DeltaLayerBtreeNode) .build(), ) .await - .map_err(|err| GetVectoredError::Other(anyhow!(err)))?; - } + .map_err(|err| anyhow!(err))?; - let ctx = &RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::DeltaLayerValue) - .build(); - - let cursor = file.block_cursor(); - let mut buf = Vec::new(); - for (key, lsns_at) in offsets { - for (lsn, block_offset) in lsns_at { - let res = cursor.read_blob_into_buf(block_offset, &mut buf, ctx).await; - - if let Err(e) = res { - reconstruct_state.on_key_error( - key, - PageReconstructError::from(anyhow!(e).context(format!( - "Failed to read blob from virtual file {}", - file.file.path - ))), - ); - - break; - } - - let value = Value::des(&buf); - if let Err(e) = value { - reconstruct_state.on_key_error( - key, - PageReconstructError::from(anyhow!(e).context(format!( - "Failed to deserialize file blob from virtual file {}", - file.file.path - ))), - ); - - break; - } - - let key_situation = reconstruct_state.update_key(&key, lsn, value.unwrap()); - if key_situation == ValueReconstructSituation::Complete { - break; - } + if !range_end_handled { + let payload_end = self.index_start_blk as u64 * PAGE_SZ as u64; + tracing::info!("Handling range end fallback at {}", payload_end); + planner.handle_range_end(payload_end); } } - Ok(()) + Ok(planner.finish()) + } + + async fn do_reads_and_update_state( + &self, + reads: Vec, + reconstruct_state: &mut ValuesReconstructState, + ) { + let vectored_blob_reader = VectoredBlobReader::new(&self.file); + let mut ignore_key_with_err = None; + + let max_vectored_read_bytes = self + .max_vectored_read_bytes + .expect("Layer is loaded with max vectored bytes config") + .0 + .into(); + let mut buf = Some(BytesMut::with_capacity(max_vectored_read_bytes)); + + // Note that reads are processed in reverse order (from highest key+lsn). + // This is the order that `ReconstructState` requires such that it can + // track when a key is done. 
+ for read in reads.into_iter().rev() { + let res = vectored_blob_reader + .read_blobs(&read, buf.take().expect("Should have a buffer")) + .await; + + let blobs_buf = match res { + Ok(blobs_buf) => blobs_buf, + Err(err) => { + let kind = err.kind(); + for (_, blob_meta) in read.blobs_at.as_slice() { + reconstruct_state.on_key_error( + blob_meta.key, + PageReconstructError::from(anyhow!( + "Failed to read blobs from virtual file {}: {}", + self.file.path, + kind + )), + ); + } + + // We have "lost" the buffer since the lower level IO api + // doesn't return the buffer on error. Allocate a new one. + buf = Some(BytesMut::with_capacity(max_vectored_read_bytes)); + + continue; + } + }; + + for meta in blobs_buf.blobs.iter().rev() { + if Some(meta.meta.key) == ignore_key_with_err { + continue; + } + + let value = Value::des(&blobs_buf.buf[meta.start..meta.end]); + let value = match value { + Ok(v) => v, + Err(e) => { + reconstruct_state.on_key_error( + meta.meta.key, + PageReconstructError::from(anyhow!(e).context(format!( + "Failed to deserialize blob from virtual file {}", + self.file.path, + ))), + ); + + ignore_key_with_err = Some(meta.meta.key); + continue; + } + }; + + // Invariant: once a key reaches [`ValueReconstructSituation::Complete`] + // state, no further updates shall be made to it. The call below will + // panic if the invariant is violated. + reconstruct_state.update_key(&meta.meta.key, meta.meta.lsn, value); + } + + buf = Some(blobs_buf.buf); + } } pub(super) async fn load_keys<'a>( &'a self, ctx: &RequestContext, ) -> Result>> { - let file = &self.file; - + let block_reader = FileBlockReader::new(&self.file, self.file_id); let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( self.index_start_blk, self.index_root_blk, - file, + block_reader, ); let mut all_keys: Vec> = Vec::new(); @@ -1012,11 +1069,11 @@ impl DeltaLayerInner { self.index_start_blk, self.index_root_blk ); - let file = &self.file; + let block_reader = FileBlockReader::new(&self.file, self.file_id); let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( self.index_start_blk, self.index_root_blk, - file, + block_reader, ); tree_reader.dump().await?; @@ -1111,7 +1168,8 @@ impl> Adapter { blknum: u32, ctx: &RequestContext, ) -> Result { - self.0.as_ref().file.read_blk(blknum, ctx).await + let block_reader = FileBlockReader::new(&self.0.as_ref().file, self.0.as_ref().file_id); + block_reader.read_blk(blknum, ctx).await } } diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index b867cb0333..0a707295cc 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -25,7 +25,7 @@ //! actual page images are stored in the "values" part. 
use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; -use crate::page_cache::PAGE_SZ; +use crate::page_cache::{self, FileId, PAGE_SZ}; use crate::repository::{Key, Value, KEY_SIZE}; use crate::tenant::blob_io::BlobWriter; use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader}; @@ -34,11 +34,14 @@ use crate::tenant::storage_layer::{ LayerAccessStats, ValueReconstructResult, ValueReconstructState, }; use crate::tenant::timeline::GetVectoredError; +use crate::tenant::vectored_blob_io::{ + BlobFlag, MaxVectoredReadBytes, VectoredBlobReader, VectoredRead, VectoredReadPlanner, +}; use crate::tenant::{PageReconstructError, Timeline}; use crate::virtual_file::{self, VirtualFile}; use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; use anyhow::{anyhow, bail, ensure, Context, Result}; -use bytes::Bytes; +use bytes::{Bytes, BytesMut}; use camino::{Utf8Path, Utf8PathBuf}; use hex; use pageserver_api::keyspace::KeySpace; @@ -152,8 +155,10 @@ pub struct ImageLayerInner { lsn: Lsn, - /// Reader object for reading blocks from the file. - file: FileBlockReader, + file: VirtualFile, + file_id: FileId, + + max_vectored_read_bytes: Option, } impl std::fmt::Debug for ImageLayerInner { @@ -167,9 +172,12 @@ impl std::fmt::Debug for ImageLayerInner { impl ImageLayerInner { pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> { - let file = &self.file; - let tree_reader = - DiskBtreeReader::<_, KEY_SIZE>::new(self.index_start_blk, self.index_root_blk, file); + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let tree_reader = DiskBtreeReader::<_, KEY_SIZE>::new( + self.index_start_blk, + self.index_root_blk, + block_reader, + ); tree_reader.dump().await?; @@ -252,7 +260,7 @@ impl ImageLayer { async fn load_inner(&self, ctx: &RequestContext) -> Result { let path = self.path(); - let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, ctx) + let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, ctx) .await .and_then(|res| res)?; @@ -327,16 +335,16 @@ impl ImageLayer { where F: Fn(Summary) -> Summary, { - let file = VirtualFile::open_with_options( + let mut file = VirtualFile::open_with_options( path, virtual_file::OpenOptions::new().read(true).write(true), ) .await .with_context(|| format!("Failed to open file '{}'", path))?; - let file = FileBlockReader::new(file); - let summary_blk = file.read_blk(0, ctx).await?; + let file_id = page_cache::next_file_id(); + let block_reader = FileBlockReader::new(&file, file_id); + let summary_blk = block_reader.read_blk(0, ctx).await?; let actual_summary = Summary::des_prefix(summary_blk.as_ref()).context("deserialize")?; - let mut file = file.file; if actual_summary.magic != IMAGE_FILE_MAGIC { return Err(RewriteSummaryError::MagicMismatch); } @@ -361,14 +369,16 @@ impl ImageLayerInner { path: &Utf8Path, lsn: Lsn, summary: Option, + max_vectored_read_bytes: Option, ctx: &RequestContext, ) -> Result, anyhow::Error> { let file = match VirtualFile::open(path).await { Ok(file) => file, Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))), }; - let file = FileBlockReader::new(file); - let summary_blk = match file.read_blk(0, ctx).await { + let file_id = page_cache::next_file_id(); + let block_reader = FileBlockReader::new(&file, file_id); + let summary_blk = match block_reader.read_blk(0, ctx).await { Ok(blk) => blk, Err(e) => return Ok(Err(anyhow::Error::new(e).context("read 
first block"))), }; @@ -399,6 +409,8 @@ impl ImageLayerInner { index_root_blk: actual_summary.index_root_blk, lsn, file, + file_id, + max_vectored_read_bytes, })) } @@ -408,8 +420,9 @@ impl ImageLayerInner { reconstruct_state: &mut ValueReconstructState, ctx: &RequestContext, ) -> anyhow::Result { - let file = &self.file; - let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, file); + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let tree_reader = + DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader); let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; key.write_to_byte_slice(&mut keybuf); @@ -422,7 +435,7 @@ impl ImageLayerInner { ) .await? { - let blob = file + let blob = block_reader .block_cursor() .read_blob( offset, @@ -449,12 +462,36 @@ impl ImageLayerInner { reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result<(), GetVectoredError> { - let file = &self.file; - let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, file); + let reads = self + .plan_reads(keyspace, ctx) + .await + .map_err(GetVectoredError::Other)?; - let mut offsets = Vec::new(); + self.do_reads_and_update_state(reads, reconstruct_state) + .await; + + Ok(()) + } + + async fn plan_reads( + &self, + keyspace: KeySpace, + ctx: &RequestContext, + ) -> anyhow::Result> { + let mut planner = VectoredReadPlanner::new( + self.max_vectored_read_bytes + .expect("Layer is loaded with max vectored bytes config") + .0 + .into(), + ); + + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let tree_reader = + DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader); for range in keyspace.ranges.iter() { + let mut range_end_handled = false; + let mut search_key: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; range.start.write_to_byte_slice(&mut search_key); @@ -462,17 +499,18 @@ impl ImageLayerInner { .visit( &search_key, VisitDirection::Forwards, - |raw_key, value| { + |raw_key, offset| { let key = Key::from_slice(&raw_key[..KEY_SIZE]); assert!(key >= range.start); - if !range.contains(&key) { - return false; + if key >= range.end { + planner.handle_range_end(offset); + range_end_handled = true; + false + } else { + planner.handle(key, self.lsn, offset, BlobFlag::None); + true } - - offsets.push((key, value)); - - true }, &RequestContextBuilder::extend(ctx) .page_content_kind(PageContentKind::ImageLayerBtreeNode) @@ -480,33 +518,60 @@ impl ImageLayerInner { ) .await .map_err(|err| GetVectoredError::Other(anyhow!(err)))?; - } - let ctx = &RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::ImageLayerValue) - .build(); - - let cursor = file.block_cursor(); - let mut buf = Vec::new(); - for (key, offset) in offsets { - let res = cursor.read_blob_into_buf(offset, &mut buf, ctx).await; - if let Err(e) = res { - reconstruct_state.on_key_error( - key, - PageReconstructError::from(anyhow!(e).context(format!( - "Failed to read blob from virtual file {}", - file.file.path - ))), - ); - - continue; + if !range_end_handled { + let payload_end = self.index_start_blk as u64 * PAGE_SZ as u64; + planner.handle_range_end(payload_end); } - - let blob = Bytes::copy_from_slice(buf.as_slice()); - reconstruct_state.update_key(&key, self.lsn, Value::Image(blob)); } - Ok(()) + Ok(planner.finish()) + } + + async fn do_reads_and_update_state( + &self, + reads: Vec, + reconstruct_state: &mut ValuesReconstructState, + ) { + let max_vectored_read_bytes = self + 
.max_vectored_read_bytes + .expect("Layer is loaded with max vectored bytes config") + .0 + .into(); + + let vectored_blob_reader = VectoredBlobReader::new(&self.file); + for read in reads.into_iter() { + let buf = BytesMut::with_capacity(max_vectored_read_bytes); + let res = vectored_blob_reader.read_blobs(&read, buf).await; + + match res { + Ok(blobs_buf) => { + let frozen_buf = blobs_buf.buf.freeze(); + + for meta in blobs_buf.blobs.iter() { + let img_buf = frozen_buf.slice(meta.start..meta.end); + reconstruct_state.update_key( + &meta.meta.key, + self.lsn, + Value::Image(img_buf), + ); + } + } + Err(err) => { + let kind = err.kind(); + for (_, blob_meta) in read.blobs_at.as_slice() { + reconstruct_state.on_key_error( + blob_meta.key, + PageReconstructError::from(anyhow!( + "Failed to read blobs from virtual file {}: {}", + self.file.path, + kind + )), + ); + } + } + }; + } } } diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 61eba07be6..13c9e5c989 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -270,7 +270,7 @@ impl Layer { pub(crate) async fn get_values_reconstruct_data( &self, keyspace: KeySpace, - end_lsn: Lsn, + lsn_range: Range, reconstruct_data: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result<(), GetVectoredError> { @@ -285,7 +285,7 @@ impl Layer { .record_access(LayerAccessKind::GetValueReconstructData, ctx); layer - .get_values_reconstruct_data(keyspace, end_lsn, reconstruct_data, &self.0, ctx) + .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx) .instrument(tracing::debug_span!("get_values_reconstruct_data", layer=%self)) .await } @@ -1296,9 +1296,14 @@ impl DownloadedLayer { owner.desc.key_range.clone(), owner.desc.lsn_range.clone(), )); - delta_layer::DeltaLayerInner::load(&owner.path, summary, ctx) - .await - .map(|res| res.map(LayerKind::Delta)) + delta_layer::DeltaLayerInner::load( + &owner.path, + summary, + Some(owner.conf.max_vectored_read_bytes), + ctx, + ) + .await + .map(|res| res.map(LayerKind::Delta)) } else { let lsn = owner.desc.image_layer_lsn(); let summary = Some(image_layer::Summary::expected( @@ -1307,9 +1312,15 @@ impl DownloadedLayer { owner.desc.key_range.clone(), lsn, )); - image_layer::ImageLayerInner::load(&owner.path, lsn, summary, ctx) - .await - .map(|res| res.map(LayerKind::Image)) + image_layer::ImageLayerInner::load( + &owner.path, + lsn, + summary, + Some(owner.conf.max_vectored_read_bytes), + ctx, + ) + .await + .map(|res| res.map(LayerKind::Image)) }; match res { @@ -1362,7 +1373,7 @@ impl DownloadedLayer { async fn get_values_reconstruct_data( &self, keyspace: KeySpace, - end_lsn: Lsn, + lsn_range: Range, reconstruct_data: &mut ValuesReconstructState, owner: &Arc, ctx: &RequestContext, @@ -1371,7 +1382,7 @@ impl DownloadedLayer { match self.get(owner, ctx).await.map_err(GetVectoredError::from)? 
{ Delta(d) => { - d.get_values_reconstruct_data(keyspace, end_lsn, reconstruct_data, ctx) + d.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, ctx) .await } Image(i) => { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 4d820f7b13..fa5e7b3685 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -777,8 +777,10 @@ impl Timeline { GetVectoredImpl::Vectored => { let vectored_res = self.get_vectored_impl(keyspace.clone(), lsn, ctx).await; - self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx) - .await; + if self.conf.validate_vectored_get { + self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx) + .await; + } vectored_res } @@ -2892,8 +2894,7 @@ impl Timeline { ( ReadableLayerDesc::Persistent { desc: (*layer).clone(), - lsn_floor, - lsn_ceil: cont_lsn, + lsn_range: lsn_floor..cont_lsn, }, keyspace_accum.to_keyspace(), ) diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs new file mode 100644 index 0000000000..a8d9649d36 --- /dev/null +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -0,0 +1,436 @@ +//! +//! Utilities for vectored reading of variable-sized "blobs". +//! +//! The "blob" api is an abstraction on top of the "block" api, +//! with the main difference being that blobs do not have a fixed +//! size (each blob is prefixed with 1 or 4 byte length field) +//! +//! The vectored apis provided in this module allow for planning +//! and executing disk IO which covers multiple blobs. +//! +//! Reads are planned with [`VectoredReadPlanner`] which will coalesce +//! adjacent blocks into a single disk IO request and exectuted by +//! [`VectoredBlobReader`] which does all the required offset juggling +//! and returns a buffer housing all the blobs and a list of offsets. +//! +//! Note that the vectored blob api does *not* go through the page cache. + +use std::collections::BTreeMap; +use std::num::NonZeroUsize; + +use bytes::BytesMut; +use pageserver_api::key::Key; +use utils::lsn::Lsn; +use utils::vec_map::VecMap; + +use crate::virtual_file::VirtualFile; + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub struct MaxVectoredReadBytes(pub NonZeroUsize); + +/// Metadata bundled with the start and end offset of a blob. +#[derive(Copy, Clone, Debug)] +pub struct BlobMeta { + pub key: Key, + pub lsn: Lsn, +} + +/// Blob offsets into [`VectoredBlobsBuf::buf`] +pub struct VectoredBlob { + pub start: usize, + pub end: usize, + pub meta: BlobMeta, +} + +/// Return type of [`VectoredBlobReader::read_blobs`] +pub struct VectoredBlobsBuf { + /// Buffer for all blobs in this read + pub buf: BytesMut, + /// Offsets into the buffer and metadata for all blobs in this read + pub blobs: Vec, +} + +/// Description of one disk read for multiple blobs. 
+/// Used as the argument form [`VectoredBlobReader::read_blobs`] +#[derive(Debug)] +pub struct VectoredRead { + pub start: u64, + pub end: u64, + /// Starting offsets and metadata for each blob in this read + pub blobs_at: VecMap, +} + +impl VectoredRead { + fn size(&self) -> usize { + (self.end - self.start) as usize + } +} + +#[derive(Eq, PartialEq)] +enum VectoredReadExtended { + Yes, + No, +} + +struct VectoredReadBuilder { + start: u64, + end: u64, + blobs_at: VecMap, + max_read_size: usize, +} + +impl VectoredReadBuilder { + fn new(start_offset: u64, end_offset: u64, meta: BlobMeta, max_read_size: usize) -> Self { + let mut blobs_at = VecMap::default(); + blobs_at + .append(start_offset, meta) + .expect("First insertion always succeeds"); + + Self { + start: start_offset, + end: end_offset, + blobs_at, + max_read_size, + } + } + + /// Attempt to extend the current read with a new blob if the start + /// offset matches with the current end of the vectored read + /// and the resuting size is below the max read size + fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended { + let size = (end - start) as usize; + if self.end == start && self.size() + size <= self.max_read_size { + self.end = end; + self.blobs_at + .append(start, meta) + .expect("LSNs are ordered within vectored reads"); + + return VectoredReadExtended::Yes; + } + + VectoredReadExtended::No + } + + fn size(&self) -> usize { + (self.end - self.start) as usize + } + + fn build(self) -> VectoredRead { + VectoredRead { + start: self.start, + end: self.end, + blobs_at: self.blobs_at, + } + } +} + +#[derive(Copy, Clone, Debug)] +pub enum BlobFlag { + None, + Ignore, + Replaces, +} + +/// Planner for vectored blob reads. +/// +/// Blob offsets are received via [`VectoredReadPlanner::handle`] +/// and coalesced into disk reads. +/// +/// The implementation is very simple: +/// * Collect all blob offsets in an ordered structure +/// * Iterate over the collected blobs and coalesce them into reads at the end +pub struct VectoredReadPlanner { + // Track all the blob offsets. Start offsets must be ordered. + blobs: BTreeMap>, + // Arguments for previous blob passed into [`VectoredReadPlanner::handle`] + prev: Option<(Key, Lsn, u64, BlobFlag)>, + + max_read_size: usize, +} + +impl VectoredReadPlanner { + pub fn new(max_read_size: usize) -> Self { + Self { + blobs: BTreeMap::new(), + prev: None, + max_read_size, + } + } + + /// Include a new blob in the read plan. + /// + /// This function is called from a B-Tree index visitor (see `DeltaLayerInner::plan_reads` + /// and `ImageLayerInner::plan_reads`). Said visitor wants to collect blob offsets for all + /// keys in a given keyspace. This function must be called for each key in the desired + /// keyspace (monotonically continuous). [`Self::handle_range_end`] must + /// be called after every range in the offset. + /// + /// In the event that keys are skipped, the behaviour is undefined and can lead to an + /// incorrect read plan. We can end up asserting, erroring in wal redo or returning + /// incorrect data to the user. + /// + /// The `flag` argument has two interesting values: + /// * [`BlobFlag::Replaces`]: The blob for this key should replace all existing blobs. + /// This is used for WAL records that `will_init`. + /// * [`BlobFlag::Ignore`]: This blob should not be included in the read. This happens + /// if the blob is cached. 
+ pub fn handle(&mut self, key: Key, lsn: Lsn, offset: u64, flag: BlobFlag) { + // Implementation note: internally lag behind by one blob such that + // we have a start and end offset when initialising [`VectoredRead`] + let (prev_key, prev_lsn, prev_offset, prev_flag) = match self.prev { + None => { + self.prev = Some((key, lsn, offset, flag)); + return; + } + Some(prev) => prev, + }; + + self.add_blob(prev_key, prev_lsn, prev_offset, offset, prev_flag); + + self.prev = Some((key, lsn, offset, flag)); + } + + pub fn handle_range_end(&mut self, offset: u64) { + if let Some((prev_key, prev_lsn, prev_offset, prev_flag)) = self.prev { + self.add_blob(prev_key, prev_lsn, prev_offset, offset, prev_flag); + } + + self.prev = None; + } + + fn add_blob(&mut self, key: Key, lsn: Lsn, start_offset: u64, end_offset: u64, flag: BlobFlag) { + match flag { + BlobFlag::None => { + let blobs_for_key = self.blobs.entry(key).or_default(); + blobs_for_key.push((lsn, start_offset, end_offset)); + } + BlobFlag::Replaces => { + let blobs_for_key = self.blobs.entry(key).or_default(); + blobs_for_key.clear(); + blobs_for_key.push((lsn, start_offset, end_offset)); + } + BlobFlag::Ignore => {} + } + } + + pub fn finish(self) -> Vec { + let mut current_read_builder: Option = None; + let mut reads = Vec::new(); + + for (key, blobs_for_key) in self.blobs { + for (lsn, start_offset, end_offset) in blobs_for_key { + let extended = match &mut current_read_builder { + Some(read_builder) => { + read_builder.extend(start_offset, end_offset, BlobMeta { key, lsn }) + } + None => VectoredReadExtended::No, + }; + + if extended == VectoredReadExtended::No { + let next_read_builder = VectoredReadBuilder::new( + start_offset, + end_offset, + BlobMeta { key, lsn }, + self.max_read_size, + ); + + let prev_read_builder = current_read_builder.replace(next_read_builder); + + // `current_read_builder` is None in the first iteration of the outer loop + if let Some(read_builder) = prev_read_builder { + reads.push(read_builder.build()); + } + } + } + } + + if let Some(read_builder) = current_read_builder { + reads.push(read_builder.build()); + } + + reads + } +} + +/// Disk reader for vectored blob spans (does not go through the page cache) +pub struct VectoredBlobReader<'a> { + file: &'a VirtualFile, +} + +impl<'a> VectoredBlobReader<'a> { + pub fn new(file: &'a VirtualFile) -> Self { + Self { file } + } + + /// Read the requested blobs into the buffer. + /// + /// We have to deal with the fact that blobs are not fixed size. + /// Each blob is prefixed by a size header. + /// + /// The success return value is a struct which contains the buffer + /// filled from disk and a list of offsets at which each blob lies + /// in the buffer. + pub async fn read_blobs( + &self, + read: &VectoredRead, + buf: BytesMut, + ) -> Result { + assert!(read.size() > 0); + assert!( + read.size() <= buf.capacity(), + "{} > {}", + read.size(), + buf.capacity() + ); + let buf = self + .file + .read_exact_at_n(buf, read.start, read.size()) + .await?; + + let blobs_at = read.blobs_at.as_slice(); + let start_offset = blobs_at.first().expect("VectoredRead is never empty").0; + + let mut metas = Vec::with_capacity(blobs_at.len()); + + // Blobs in `read` only provide their starting offset. The end offset + // of a blob is implicit: the start of the next blob if one exists + // or the end of the read. 
+ let pairs = blobs_at.iter().zip( + blobs_at + .iter() + .map(Some) + .skip(1) + .chain(std::iter::once(None)), + ); + + for ((offset, meta), next) in pairs { + let offset_in_buf = offset - start_offset; + let first_len_byte = buf[offset_in_buf as usize]; + + // Each blob is prefixed by a header containing it's size. + // Extract the size and skip that header to find the start of the data. + // The size can be 1 or 4 bytes. The most significant bit is 0 in the + // 1 byte case and 1 in the 4 byte case. + let (size_length, blob_size) = if first_len_byte < 0x80 { + (1, first_len_byte as u64) + } else { + let mut blob_size_buf = [0u8; 4]; + let offset_in_buf = offset_in_buf as usize; + + blob_size_buf.copy_from_slice(&buf[offset_in_buf..offset_in_buf + 4]); + blob_size_buf[0] &= 0x7f; + (4, u32::from_be_bytes(blob_size_buf) as u64) + }; + + let start = offset_in_buf + size_length; + let end = match next { + Some((next_blob_start_offset, _)) => next_blob_start_offset - start_offset, + None => start + blob_size, + }; + + assert_eq!(end - start, blob_size); + + metas.push(VectoredBlob { + start: start as usize, + end: end as usize, + meta: *meta, + }) + } + + Ok(VectoredBlobsBuf { buf, blobs: metas }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn validate_read(read: &VectoredRead, offset_range: &[(Key, Lsn, u64, BlobFlag)]) { + assert_eq!(read.start, offset_range.first().unwrap().2); + + let expected_offsets_in_read: Vec<_> = offset_range.iter().map(|o| o.2).collect(); + + let offsets_in_read: Vec<_> = read + .blobs_at + .as_slice() + .iter() + .map(|(offset, _)| *offset) + .collect(); + + assert_eq!(expected_offsets_in_read, offsets_in_read); + } + + #[test] + fn planner_max_read_size_test() { + let max_read_size = 128 * 1024; + let key = Key::MIN; + let lsn = Lsn(0); + + let blob_descriptions = vec![ + (key, lsn, 0, BlobFlag::None), + (key, lsn, 32 * 1024, BlobFlag::None), + (key, lsn, 96 * 1024, BlobFlag::None), // Last in read 1 + (key, lsn, 128 * 1024, BlobFlag::None), // Last in read 2 + (key, lsn, 198 * 1024, BlobFlag::None), // Last in read 3 + (key, lsn, 268 * 1024, BlobFlag::None), // Last in read 4 + (key, lsn, 396 * 1024, BlobFlag::None), // Last in read 5 + (key, lsn, 652 * 1024, BlobFlag::None), // Last in read 6 + ]; + + let ranges = [ + &blob_descriptions[0..3], + &blob_descriptions[3..4], + &blob_descriptions[4..5], + &blob_descriptions[5..6], + &blob_descriptions[6..7], + &blob_descriptions[7..], + ]; + + let mut planner = VectoredReadPlanner::new(max_read_size); + for (key, lsn, offset, flag) in blob_descriptions.clone() { + planner.handle(key, lsn, offset, flag); + } + + planner.handle_range_end(652 * 1024); + + let reads = planner.finish(); + assert_eq!(reads.len(), 6); + + for (idx, read) in reads.iter().enumerate() { + validate_read(read, ranges[idx]); + } + } + + #[test] + fn planner_replacement_test() { + let max_read_size = 128 * 1024; + let first_key = Key::MIN; + let second_key = first_key.next(); + let lsn = Lsn(0); + + let blob_descriptions = vec![ + (first_key, lsn, 0, BlobFlag::None), // First in read 1 + (first_key, lsn, 1024, BlobFlag::None), // Last in read 1 + (second_key, lsn, 2 * 1024, BlobFlag::Replaces), + (second_key, lsn, 3 * 1024, BlobFlag::None), + (second_key, lsn, 4 * 1024, BlobFlag::Replaces), // First in read 2 + (second_key, lsn, 5 * 1024, BlobFlag::None), // Last in read 2 + ]; + + let ranges = [&blob_descriptions[0..2], &blob_descriptions[4..]]; + + let mut planner = VectoredReadPlanner::new(max_read_size); + for (key, lsn, offset, 
flag) in blob_descriptions.clone() { + planner.handle(key, lsn, offset, flag); + } + + planner.handle_range_end(6 * 1024); + + let reads = planner.finish(); + assert_eq!(reads.len(), 2); + + for (idx, read) in reads.iter().enumerate() { + validate_read(read, ranges[idx]); + } + } +} diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 858fc0ef64..b7112108f2 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -548,7 +548,18 @@ impl VirtualFile { B: IoBufMut + Send, { let (buf, res) = - read_exact_at_impl(buf, offset, |buf, offset| self.read_at(buf, offset)).await; + read_exact_at_impl(buf, offset, None, |buf, offset| self.read_at(buf, offset)).await; + res.map(|()| buf) + } + + pub async fn read_exact_at_n(&self, buf: B, offset: u64, count: usize) -> Result + where + B: IoBufMut + Send, + { + let (buf, res) = read_exact_at_impl(buf, offset, Some(count), |buf, offset| { + self.read_at(buf, offset) + }) + .await; res.map(|()| buf) } @@ -682,6 +693,7 @@ impl VirtualFile { pub async fn read_exact_at_impl( buf: B, mut offset: u64, + count: Option, mut read_at: F, ) -> (B, std::io::Result<()>) where @@ -689,7 +701,15 @@ where F: FnMut(tokio_epoll_uring::Slice, u64) -> Fut, Fut: std::future::Future, std::io::Result)>, { - let mut buf: tokio_epoll_uring::Slice = buf.slice_full(); // includes all the uninitialized memory + let mut buf: tokio_epoll_uring::Slice = match count { + Some(count) => { + assert!(count <= buf.bytes_total()); + assert!(count > 0); + buf.slice(..count) // may include uninitialized memory + } + None => buf.slice_full(), // includes all the uninitialized memory + }; + while buf.bytes_total() != 0 { let res; (buf, res) = read_at(buf, offset).await; @@ -779,7 +799,7 @@ mod test_read_exact_at_impl { result: Ok(vec![b'a', b'b', b'c', b'd', b'e']), }]), })); - let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { + let (buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| { let mock_read_at = Arc::clone(&mock_read_at); async move { mock_read_at.lock().await.read_at(buf, offset).await } }) @@ -788,13 +808,33 @@ mod test_read_exact_at_impl { assert_eq!(buf, vec![b'a', b'b', b'c', b'd', b'e']); } + #[tokio::test] + async fn test_with_count() { + let buf = Vec::with_capacity(5); + let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { + expectations: VecDeque::from(vec![Expectation { + offset: 0, + bytes_total: 3, + result: Ok(vec![b'a', b'b', b'c']), + }]), + })); + + let (buf, res) = read_exact_at_impl(buf, 0, Some(3), |buf, offset| { + let mock_read_at = Arc::clone(&mock_read_at); + async move { mock_read_at.lock().await.read_at(buf, offset).await } + }) + .await; + assert!(res.is_ok()); + assert_eq!(buf, vec![b'a', b'b', b'c']); + } + #[tokio::test] async fn test_empty_buf_issues_no_syscall() { let buf = Vec::new(); let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { expectations: VecDeque::new(), })); - let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { + let (_buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| { let mock_read_at = Arc::clone(&mock_read_at); async move { mock_read_at.lock().await.read_at(buf, offset).await } }) @@ -819,7 +859,7 @@ mod test_read_exact_at_impl { }, ]), })); - let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { + let (buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| { let mock_read_at = Arc::clone(&mock_read_at); async move { mock_read_at.lock().await.read_at(buf, offset).await } }) @@ -850,7 +890,7 @@ mod 
test_read_exact_at_impl { }, ]), })); - let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { + let (_buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| { let mock_read_at = Arc::clone(&mock_read_at); async move { mock_read_at.lock().await.read_at(buf, offset).await } }) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 55c16f73b0..71e77334a1 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1115,6 +1115,13 @@ class NeonEnv: # bounce through retries on startup self.attachment_service.start() + def attachment_service_ready(): + assert self.attachment_service.ready() is True + + # Wait for attachment service readiness to prevent unnecessary post start-up + # reconcile. + wait_until(30, 1, attachment_service_ready) + # Start up broker, pageserver and all safekeepers futs = [] with concurrent.futures.ThreadPoolExecutor( diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py new file mode 100644 index 0000000000..e2e7fffdbe --- /dev/null +++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py @@ -0,0 +1,195 @@ +import asyncio +import json +from pathlib import Path +from typing import Any, Dict, Tuple + +import pytest +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.log_helper import log +from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder, PgBin, wait_for_last_flush_lsn +from fixtures.utils import get_scale_for_db, humantime_to_ms + +from performance.pageserver.util import ( + setup_pageserver_with_tenants, +) + + +@pytest.mark.parametrize("duration", [30]) +@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(200)]) +@pytest.mark.parametrize("n_tenants", [10]) +@pytest.mark.parametrize("get_vectored_impl", ["sequential", "vectored"]) +@pytest.mark.timeout(1000) +def test_basebackup_with_high_slru_count( + neon_env_builder: NeonEnvBuilder, + zenbenchmark: NeonBenchmarker, + pg_bin: PgBin, + get_vectored_impl: str, + n_tenants: int, + pgbench_scale: int, + duration: int, +): + def record(metric, **kwargs): + zenbenchmark.record(metric_name=f"pageserver_basebackup.{metric}", **kwargs) + + params: Dict[str, Tuple[Any, Dict[str, Any]]] = {} + + # params from fixtures + params.update( + { + "n_tenants": (n_tenants, {"unit": ""}), + "pgbench_scale": (pgbench_scale, {"unit": ""}), + "duration": (duration, {"unit": "s"}), + } + ) + + # configure cache sizes like in prod + page_cache_size = 16384 + max_file_descriptors = 500000 + neon_env_builder.pageserver_config_override = ( + f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}; " + f"get_vectored_impl='{get_vectored_impl}'; validate_vectored_get=false" + ) + params.update( + { + "pageserver_config_override.page_cache_size": ( + page_cache_size * 8192, + {"unit": "byte"}, + ), + "pageserver_config_override.max_file_descriptors": (max_file_descriptors, {"unit": ""}), + } + ) + + for param, (value, kwargs) in params.items(): + record(param, metric_value=value, report=MetricReport.TEST_PARAM, **kwargs) + + n_txns = 500000 + + def setup_wrapper(env: NeonEnv): + return setup_tenant_template(env, n_txns) + + env = setup_pageserver_with_tenants( + neon_env_builder, f"large_slru_count-{n_tenants}-{n_txns}", n_tenants, setup_wrapper + ) + run_benchmark(env, pg_bin, record, duration) + + +def setup_tenant_template(env: NeonEnv, 
n_txns: int): + config = { + "gc_period": "0s", # disable periodic gc + "checkpoint_timeout": "10 years", + "compaction_period": "0s", # disable periodic compaction + "compaction_threshold": 10, + "compaction_target_size": 134217728, + "checkpoint_distance": 268435456, + "image_creation_threshold": 3, + } + + template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True) + env.pageserver.tenant_detach(template_tenant) + env.pageserver.allowed_errors.append( + # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely + ".*Dropped remote consistent LSN updates.*", + ) + env.pageserver.tenant_attach(template_tenant, config) + + ps_http = env.pageserver.http_client() + + with env.endpoints.create_start( + "main", tenant_id=template_tenant, config_lines=["shared_buffers=1MB"] + ) as ep: + rels = 10 + + asyncio.run(run_updates(ep, n_txns, rels)) + + wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline) + ps_http.timeline_checkpoint(template_tenant, template_timeline) + ps_http.timeline_compact(template_tenant, template_timeline) + + return (template_tenant, template_timeline, config) + + +# Takes about 5 minutes and produces tenants with around 300 SLRU blocks +# of 8 KiB each. +async def run_updates(ep: Endpoint, n_txns: int, workers_count: int): + workers = [] + for i in range(workers_count): + workers.append(asyncio.create_task(run_update_loop_worker(ep, n_txns, i))) + + await asyncio.gather(*workers) + + +async def run_update_loop_worker(ep: Endpoint, n_txns: int, idx: int): + table = f"t_{idx}" + conn = await ep.connect_async() + await conn.execute(f"CREATE TABLE {table} (pk integer PRIMARY KEY, x integer)") + await conn.execute(f"ALTER TABLE {table} SET (autovacuum_enabled = false)") + await conn.execute(f"INSERT INTO {table} VALUES (1, 0)") + await conn.execute( + """ + CREATE PROCEDURE updating{0}() as + $$ + DECLARE + i integer; + BEGIN + FOR i IN 1..{1} LOOP + UPDATE {0} SET x = x + 1 WHERE pk=1; + COMMIT; + END LOOP; + END + $$ LANGUAGE plpgsql + """.format(table, n_txns) + ) + await conn.execute("SET statement_timeout=0") + await conn.execute(f"call updating{table}()") + + +def run_benchmark(env: NeonEnv, pg_bin: PgBin, record, duration_secs: int): + ps_http = env.pageserver.http_client() + cmd = [ + str(env.neon_binpath / "pagebench"), + "basebackup", + "--mgmt-api-endpoint", + ps_http.base_url, + "--page-service-connstring", + env.pageserver.connstr(password=None), + "--gzip-probability", + "1", + "--runtime", + f"{duration_secs}s", + # don't specify the targets explicitly, let pagebench auto-discover them + ] + + log.info(f"command: {' '.join(cmd)}") + basepath = pg_bin.run_capture(cmd, with_command_header=False) + results_path = Path(basepath + ".stdout") + log.info(f"Benchmark results at: {results_path}") + + with open(results_path, "r") as f: + results = json.load(f) + log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}") + + total = results["total"] + metric = "request_count" + record( + metric, + metric_value=total[metric], + unit="", + report=MetricReport.HIGHER_IS_BETTER, + ) + + metric = "latency_mean" + record( + metric, + metric_value=humantime_to_ms(total[metric]), + unit="ms", + report=MetricReport.LOWER_IS_BETTER, + ) + + metric = "latency_percentiles" + for k, v in total[metric].items(): + record( + f"{metric}.{k}", + metric_value=humantime_to_ms(v), + unit="ms", + report=MetricReport.LOWER_IS_BETTER, + ) diff --git 
a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py index 307b3848db..8cd3569ea5 100644 --- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py +++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py @@ -3,7 +3,6 @@ import os from pathlib import Path from typing import Any, Dict, Tuple -import fixtures.pageserver.many_tenants as many_tenants import pytest from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker from fixtures.log_helper import log @@ -15,7 +14,9 @@ from fixtures.neon_fixtures import ( ) from fixtures.utils import get_scale_for_db, humantime_to_ms -from performance.pageserver.util import ensure_pageserver_ready_for_benchmarking +from performance.pageserver.util import ( + setup_pageserver_with_tenants, +) # For reference, the space usage of the snapshots: @@ -80,10 +81,77 @@ def test_pageserver_max_throughput_getpage_at_latest_lsn( for param, (value, kwargs) in params.items(): record(param, metric_value=value, report=MetricReport.TEST_PARAM, **kwargs) - env = setup_pageserver_with_pgbench_tenants(neon_env_builder, pg_bin, n_tenants, pgbench_scale) + + def setup_wrapper(env: NeonEnv): + return setup_tenant_template(env, pg_bin, pgbench_scale) + + env = setup_pageserver_with_tenants( + neon_env_builder, + f"max_throughput_latest_lsn-{n_tenants}-{pgbench_scale}", + n_tenants, + setup_wrapper, + ) run_benchmark_max_throughput_latest_lsn(env, pg_bin, record, duration) +def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int): + """ + Set up a template tenant which will be replicated by the test infra. + It's a pgbench tenant, initialized to a certain scale, and treated afterwards + with a repeat application of (pgbench simple-update workload, checkpoint, compact). + """ + # use a config that makes production of on-disk state timing-insensitive + # as we ingest data into the tenant. 
+ config = { + "gc_period": "0s", # disable periodic gc + "checkpoint_timeout": "10 years", + "compaction_period": "0s", # disable periodic compaction + "compaction_threshold": 10, + "compaction_target_size": 134217728, + "checkpoint_distance": 268435456, + "image_creation_threshold": 3, + } + template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True) + env.pageserver.tenant_detach(template_tenant) + env.pageserver.allowed_errors.append( + # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely + ".*Dropped remote consistent LSN updates.*", + ) + env.pageserver.tenant_attach(template_tenant, config) + ps_http = env.pageserver.http_client() + with env.endpoints.create_start("main", tenant_id=template_tenant) as ep: + pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", "-I", "dtGvp", ep.connstr()]) + wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline) + ps_http.timeline_checkpoint(template_tenant, template_timeline) + ps_http.timeline_compact(template_tenant, template_timeline) + for _ in range( + 0, 17 + ): # some prime number to avoid potential resonances with the "_threshold" variables from the config + # the L0s produced by this appear to have size ~5MiB + num_txns = 10_000 + pg_bin.run_capture( + ["pgbench", "-N", "-c1", "--transactions", f"{num_txns}", ep.connstr()] + ) + wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline) + ps_http.timeline_checkpoint(template_tenant, template_timeline) + ps_http.timeline_compact(template_tenant, template_timeline) + # for reference, the output at scale=6 looked like so (306M total) + # ls -sh test_output/shared-snapshots/max_throughput_latest_lsn-2-6/snapshot/pageserver_1/tenants/35c30b88ea16a7a09f82d9c6a115551b/timelines/da902b378eebe83dc8a4e81cd3dc1c59 + # total 306M + # 188M 000000000000000000000000000000000000-030000000000000000000000000000000003__000000000149F060-0000000009E75829 + # 4.5M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000009E75829-000000000A21E919 + # 33M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000A21E919-000000000C20CB71 + # 36M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000C20CB71-000000000E470791 + # 16M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000E470791-000000000F34AEF1 + # 8.2M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000F34AEF1-000000000FABA8A9 + # 6.0M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FABA8A9-000000000FFE0639 + # 6.1M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FFE0639-000000001051D799 + # 4.7M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000001051D799-0000000010908F19 + # 4.6M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000010908F19-0000000010CD3021 + + return (template_tenant, template_timeline, config) + + def run_benchmark_max_throughput_latest_lsn( env: NeonEnv, pg_bin: PgBin, record, duration_secs: int ): @@ -138,78 +206,3 @@ def run_benchmark_max_throughput_latest_lsn( unit="ms", report=MetricReport.LOWER_IS_BETTER, ) - - -def setup_pageserver_with_pgbench_tenants( - neon_env_builder: NeonEnvBuilder, - pg_bin: PgBin, - n_tenants: int, - scale: int, -) -> NeonEnv: - """ - Utility function to set up a pageserver with a given number of identical tenants. 
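The template tenant config shared by both benchmark setups above uses raw byte-count literals; they are plain powers-of-two sizes. A small illustrative sketch, not part of the patch, that spells out where the numbers come from:

```python
# Illustrative only: the size knobs in the template tenant config, derived from MiB.
MiB = 1024 * 1024

template_tenant_conf = {
    "gc_period": "0s",  # disable periodic gc
    "checkpoint_timeout": "10 years",
    "compaction_period": "0s",  # disable periodic compaction
    "compaction_threshold": 10,
    "compaction_target_size": 128 * MiB,  # 134217728
    "checkpoint_distance": 256 * MiB,  # 268435456
    "image_creation_threshold": 3,
}

assert template_tenant_conf["compaction_target_size"] == 134217728
assert template_tenant_conf["checkpoint_distance"] == 268435456
```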
- Each tenant is a pgbench tenant, initialize to a certain scale, and treated afterwards - with a repeat application of (pgbench simple-update workload, checkpoint, compact). - """ - - def setup_template(env: NeonEnv): - # use a config that makes production of on-disk state timing-insensitive - # as we ingest data into the tenant. - config = { - "gc_period": "0s", # disable periodic gc - "checkpoint_timeout": "10 years", - "compaction_period": "0s", # disable periodic compaction - "compaction_threshold": 10, - "compaction_target_size": 134217728, - "checkpoint_distance": 268435456, - "image_creation_threshold": 3, - } - template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True) - env.pageserver.tenant_detach(template_tenant) - env.pageserver.allowed_errors.append( - # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely - ".*Dropped remote consistent LSN updates.*", - ) - env.pageserver.tenant_attach(template_tenant, config) - ps_http = env.pageserver.http_client() - with env.endpoints.create_start("main", tenant_id=template_tenant) as ep: - pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", "-I", "dtGvp", ep.connstr()]) - wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline) - ps_http.timeline_checkpoint(template_tenant, template_timeline) - ps_http.timeline_compact(template_tenant, template_timeline) - for _ in range( - 0, 17 - ): # some prime number to avoid potential resonances with the "_threshold" variables from the config - # the L0s produced by this appear to have size ~5MiB - num_txns = 10_000 - pg_bin.run_capture( - ["pgbench", "-N", "-c1", "--transactions", f"{num_txns}", ep.connstr()] - ) - wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline) - ps_http.timeline_checkpoint(template_tenant, template_timeline) - ps_http.timeline_compact(template_tenant, template_timeline) - # for reference, the output at scale=6 looked like so (306M total) - # ls -sh test_output/shared-snapshots/max_throughput_latest_lsn-2-6/snapshot/pageserver_1/tenants/35c30b88ea16a7a09f82d9c6a115551b/timelines/da902b378eebe83dc8a4e81cd3dc1c59 - # total 306M - # 188M 000000000000000000000000000000000000-030000000000000000000000000000000003__000000000149F060-0000000009E75829 - # 4.5M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000009E75829-000000000A21E919 - # 33M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000A21E919-000000000C20CB71 - # 36M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000C20CB71-000000000E470791 - # 16M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000E470791-000000000F34AEF1 - # 8.2M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000F34AEF1-000000000FABA8A9 - # 6.0M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FABA8A9-000000000FFE0639 - # 6.1M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FFE0639-000000001051D799 - # 4.7M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000001051D799-0000000010908F19 - # 4.6M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000010908F19-0000000010CD3021 - - return (template_tenant, template_timeline, config) - - def doit(neon_env_builder: NeonEnvBuilder) -> NeonEnv: - return many_tenants.single_timeline(neon_env_builder, setup_template, 
n_tenants) - - env = neon_env_builder.build_and_use_snapshot( - f"max_throughput_latest_lsn-{n_tenants}-{scale}", doit - ) - env.start() - ensure_pageserver_ready_for_benchmarking(env, n_tenants) - return env diff --git a/test_runner/performance/pageserver/util.py b/test_runner/performance/pageserver/util.py index 45eb652362..009d62c9ba 100644 --- a/test_runner/performance/pageserver/util.py +++ b/test_runner/performance/pageserver/util.py @@ -2,9 +2,16 @@ Utilities used by all code in this sub-directory """ +from typing import Any, Callable, Dict, Tuple + +import fixtures.pageserver.many_tenants as many_tenants from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, +) from fixtures.pageserver.utils import wait_until_all_tenants_state +from fixtures.types import TenantId, TimelineId def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int): @@ -27,3 +34,22 @@ def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int): assert not layer.remote log.info("ready") + + +def setup_pageserver_with_tenants( + neon_env_builder: NeonEnvBuilder, + name: str, + n_tenants: int, + setup: Callable[[NeonEnv], Tuple[TenantId, TimelineId, Dict[str, Any]]], +) -> NeonEnv: + """ + Utility function to set up a pageserver with a given number of identical tenants. + """ + + def doit(neon_env_builder: NeonEnvBuilder) -> NeonEnv: + return many_tenants.single_timeline(neon_env_builder, setup, n_tenants) + + env = neon_env_builder.build_and_use_snapshot(name, doit) + env.start() + ensure_pageserver_ready_for_benchmarking(env, n_tenants) + return env From 1d5e476c961cb53089f9eebbd8d67c9902611232 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 28 Feb 2024 13:38:11 +0100 Subject: [PATCH 0295/1571] CI: use build-tools image from dockerhub (#6795) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem Currently, after updating `Dockerfile.build-tools` in a PR, it requires a manual action to make it `pinned`, i.e., the default for everyone. It also makes all opened PRs use such images (even created in the PR and without such changes). This PR overhauls the way we build and use `build-tools` image (and uses the image from Docker Hub). ## Summary of changes - The `neondatabase/build-tools` image gets tagged with the latest commit sha for the `Dockerfile.build-tools` file - Each PR calculates the tag for `neondatabase/build-tools`, tries to pull it, and rebuilds the image with such tag if it doesn't exist. 
- Use `neondatabase/build-tools` as a default image - When running on `main` branch — create a `pinned` tag and push it to ECR - Use `concurrency` to ensure we don't build `build-tools` image for the same commit in parallel from different PRs --- .github/workflows/benchmarking.yml | 10 +- .github/workflows/build-build-tools-image.yml | 105 +++++++++++++++ .../workflows/build_and_push_docker_image.yml | 124 ------------------ .github/workflows/build_and_test.yml | 91 ++++++++----- .github/workflows/check-build-tools-image.yml | 58 ++++++++ .github/workflows/neon_extra_builds.yml | 32 ++++- .github/workflows/pin-build-tools-image.yml | 72 ++++++++++ .../workflows/update_build_tools_image.yml | 70 ---------- CONTRIBUTING.md | 15 +-- ...rfile.buildtools => Dockerfile.build-tools | 0 10 files changed, 332 insertions(+), 245 deletions(-) create mode 100644 .github/workflows/build-build-tools-image.yml delete mode 100644 .github/workflows/build_and_push_docker_image.yml create mode 100644 .github/workflows/check-build-tools-image.yml create mode 100644 .github/workflows/pin-build-tools-image.yml delete mode 100644 .github/workflows/update_build_tools_image.yml rename Dockerfile.buildtools => Dockerfile.build-tools (100%) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index fc245f42a8..2e56bf909f 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -62,7 +62,7 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned options: --init steps: @@ -214,7 +214,7 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned options: --init # Increase timeout to 8h, default timeout is 6h @@ -362,7 +362,7 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned options: --init steps: @@ -461,7 +461,7 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned options: --init steps: @@ -558,7 +558,7 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned options: --init steps: diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml new file mode 100644 index 0000000000..251423e701 --- /dev/null +++ b/.github/workflows/build-build-tools-image.yml @@ -0,0 +1,105 @@ +name: Build build-tools image + +on: + workflow_call: + inputs: + image-tag: + description: "build-tools image tag" + required: true + type: string + outputs: + image-tag: + description: "build-tools tag" + value: ${{ inputs.image-tag }} + image: + description: "build-tools image" + value: neondatabase/build-tools:${{ inputs.image-tag }} + +defaults: + run: + shell: bash -euo pipefail {0} + +concurrency: + group: build-build-tools-image-${{ inputs.image-tag }} + +# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. 
+permissions: {} + +jobs: + check-image: + uses: ./.github/workflows/check-build-tools-image.yml + + # This job uses older version of GitHub Actions because it's run on gen2 runners, which don't support node 20 (for newer versions) + build-image: + needs: [ check-image ] + if: needs.check-image.outputs.found == 'false' + + strategy: + matrix: + arch: [ x64, arm64 ] + + runs-on: ${{ fromJson(format('["self-hosted", "dev", "{0}"]', matrix.arch)) }} + + env: + IMAGE_TAG: ${{ inputs.image-tag }} + + steps: + - name: Check `input.tag` is correct + env: + INPUTS_IMAGE_TAG: ${{ inputs.image-tag }} + CHECK_IMAGE_TAG : ${{ needs.check-image.outputs.image-tag }} + run: | + if [ "${INPUTS_IMAGE_TAG}" != "${CHECK_IMAGE_TAG}" ]; then + echo "'inputs.image-tag' (${INPUTS_IMAGE_TAG}) does not match the tag of the latest build-tools image 'inputs.image-tag' (${CHECK_IMAGE_TAG})" + exit 1 + fi + + - uses: actions/checkout@v3 + + # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings + # The default value is ~/.docker + - name: Set custom docker config directory + run: | + mkdir -p /tmp/.docker-custom + echo DOCKER_CONFIG=/tmp/.docker-custom >> $GITHUB_ENV + + - uses: docker/setup-buildx-action@v2 + + - uses: docker/login-action@v2 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + + - uses: docker/build-push-action@v4 + with: + context: . + provenance: false + push: true + pull: true + file: Dockerfile.build-tools + cache-from: type=registry,ref=neondatabase/build-tools:cache-${{ matrix.arch }} + cache-to: type=registry,ref=neondatabase/build-tools:cache-${{ matrix.arch }},mode=max + tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }} + + - name: Remove custom docker config directory + run: | + rm -rf /tmp/.docker-custom + + merge-images: + needs: [ build-image ] + runs-on: ubuntu-latest + + env: + IMAGE_TAG: ${{ inputs.image-tag }} + + steps: + - uses: docker/login-action@v3 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + + - name: Create multi-arch image + run: | + docker buildx imagetools create -t neondatabase/build-tools:${IMAGE_TAG} \ + neondatabase/build-tools:${IMAGE_TAG}-x64 \ + neondatabase/build-tools:${IMAGE_TAG}-arm64 diff --git a/.github/workflows/build_and_push_docker_image.yml b/.github/workflows/build_and_push_docker_image.yml deleted file mode 100644 index 892e21114b..0000000000 --- a/.github/workflows/build_and_push_docker_image.yml +++ /dev/null @@ -1,124 +0,0 @@ -name: Build and Push Docker Image - -on: - workflow_call: - inputs: - dockerfile-path: - required: true - type: string - image-name: - required: true - type: string - outputs: - build-tools-tag: - description: "tag generated for build tools" - value: ${{ jobs.tag.outputs.build-tools-tag }} - -jobs: - check-if-build-tools-dockerfile-changed: - runs-on: ubuntu-latest - outputs: - docker_file_changed: ${{ steps.dockerfile.outputs.docker_file_changed }} - steps: - - name: Check if Dockerfile.buildtools has changed - id: dockerfile - run: | - if [[ "$GITHUB_EVENT_NAME" != "pull_request" ]]; then - echo "docker_file_changed=false" >> $GITHUB_OUTPUT - exit - fi - updated_files=$(gh pr --repo neondatabase/neon diff ${{ github.event.pull_request.number }} --name-only) - if [[ $updated_files == *"Dockerfile.buildtools"* ]]; then - echo "docker_file_changed=true" >> $GITHUB_OUTPUT - fi - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - tag: - runs-on: ubuntu-latest - 
needs: [ check-if-build-tools-dockerfile-changed ] - outputs: - build-tools-tag: ${{steps.buildtools-tag.outputs.image_tag}} - - steps: - - name: Get buildtools tag - env: - DOCKERFILE_CHANGED: ${{ needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed }} - run: | - if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]] && [[ "${DOCKERFILE_CHANGED}" == "true" ]]; then - IMAGE_TAG=$GITHUB_RUN_ID - else - IMAGE_TAG=pinned - fi - - echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT - shell: bash - id: buildtools-tag - - kaniko: - if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true' - needs: [ tag, check-if-build-tools-dockerfile-changed ] - runs-on: [ self-hosted, dev, x64 ] - container: gcr.io/kaniko-project/executor:v1.7.0-debug - - steps: - - name: Checkout - uses: actions/checkout@v1 - - - name: Configure ECR login - run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - - - name: Kaniko build - run: | - /kaniko/executor \ - --reproducible \ - --snapshotMode=redo \ - --skip-unused-stages \ - --dockerfile ${{ inputs.dockerfile-path }} \ - --cache=true \ - --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \ - --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 - - kaniko-arm: - if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true' - needs: [ tag, check-if-build-tools-dockerfile-changed ] - runs-on: [ self-hosted, dev, arm64 ] - container: gcr.io/kaniko-project/executor:v1.7.0-debug - - steps: - - name: Checkout - uses: actions/checkout@v1 - - - name: Configure ECR login - run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - - - name: Kaniko build - run: | - /kaniko/executor \ - --reproducible \ - --snapshotMode=redo \ - --skip-unused-stages \ - --dockerfile ${{ inputs.dockerfile-path }} \ - --cache=true \ - --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \ - --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64 - - manifest: - if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true' - name: 'manifest' - runs-on: [ self-hosted, dev, x64 ] - needs: - - tag - - kaniko - - kaniko-arm - - check-if-build-tools-dockerfile-changed - - steps: - - name: Create manifest - run: | - docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} \ - --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 \ - --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64 - - - name: Push manifest - run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 0e67259b3f..e29a58bbe2 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -77,19 +77,25 @@ jobs: shell: bash id: build-tag - build-buildtools-image: + check-build-tools-image: needs: [ check-permissions ] - uses: ./.github/workflows/build_and_push_docker_image.yml + uses: ./.github/workflows/check-build-tools-image.yml + + build-build-tools-image: + needs: [ check-build-tools-image ] + 
uses: ./.github/workflows/build-build-tools-image.yml with: - dockerfile-path: Dockerfile.buildtools - image-name: build-tools + image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }} secrets: inherit check-codestyle-python: - needs: [ check-permissions, build-buildtools-image ] + needs: [ check-permissions, build-build-tools-image ] runs-on: [ self-hosted, gen3, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init steps: @@ -118,10 +124,13 @@ jobs: run: poetry run mypy . check-codestyle-rust: - needs: [ check-permissions, build-buildtools-image ] + needs: [ check-permissions, build-build-tools-image ] runs-on: [ self-hosted, gen3, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init steps: @@ -185,10 +194,13 @@ jobs: run: cargo deny check --hide-inclusion-graph build-neon: - needs: [ check-permissions, tag, build-buildtools-image ] + needs: [ check-permissions, tag, build-build-tools-image ] runs-on: [ self-hosted, gen3, large ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} # Raise locked memory limit for tokio-epoll-uring. # On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12), # io_uring will account the memory of the CQ and SQ as locked. @@ -426,10 +438,13 @@ jobs: uses: ./.github/actions/save-coverage-data regress-tests: - needs: [ check-permissions, build-neon, build-buildtools-image, tag ] + needs: [ check-permissions, build-neon, build-build-tools-image, tag ] runs-on: [ self-hosted, gen3, large ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} # for changed limits, see comments on `options:` earlier in this file options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 strategy: @@ -473,10 +488,13 @@ jobs: get-benchmarks-durations: outputs: json: ${{ steps.get-benchmark-durations.outputs.json }} - needs: [ check-permissions, build-buildtools-image ] + needs: [ check-permissions, build-build-tools-image ] runs-on: [ self-hosted, gen3, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') steps: @@ -503,10 +521,13 @@ jobs: echo "json=$(jq --compact-output '.' 
/tmp/benchmark_durations.json)" >> $GITHUB_OUTPUT benchmarks: - needs: [ check-permissions, build-neon, build-buildtools-image, get-benchmarks-durations ] + needs: [ check-permissions, build-neon, build-build-tools-image, get-benchmarks-durations ] runs-on: [ self-hosted, gen3, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} # for changed limits, see comments on `options:` earlier in this file options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') @@ -538,12 +559,15 @@ jobs: # while coverage is currently collected for the debug ones create-test-report: - needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-buildtools-image ] + needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-build-tools-image ] if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }} runs-on: [ self-hosted, gen3, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init steps: @@ -584,10 +608,13 @@ jobs: }) coverage-report: - needs: [ check-permissions, regress-tests, build-buildtools-image ] + needs: [ check-permissions, regress-tests, build-build-tools-image ] runs-on: [ self-hosted, gen3, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init strategy: fail-fast: false @@ -691,7 +718,7 @@ jobs: secrets: inherit neon-image: - needs: [ check-permissions, build-buildtools-image, tag ] + needs: [ check-permissions, build-build-tools-image, tag ] runs-on: [ self-hosted, gen3, large ] steps: @@ -726,8 +753,7 @@ jobs: build-args: | GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} BUILD_TAG=${{ needs.tag.outputs.build-tag }} - TAG=${{ needs.build-buildtools-image.outputs.build-tools-tag }} - REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com + TAG=${{ needs.build-build-tools-image.outputs.image-tag }} provenance: false push: true pull: true @@ -745,7 +771,7 @@ jobs: compute-tools-image: runs-on: [ self-hosted, gen3, large ] - needs: [ check-permissions, build-buildtools-image, tag ] + needs: [ check-permissions, build-build-tools-image, tag ] steps: - name: Checkout @@ -779,8 +805,7 @@ jobs: build-args: | GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} BUILD_TAG=${{needs.tag.outputs.build-tag}} - TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}} - REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com + TAG=${{ needs.build-build-tools-image.outputs.image-tag }} provenance: false push: true pull: true @@ -797,7 +822,7 @@ jobs: rm -rf .docker-custom compute-node-image: - needs: [ check-permissions, build-buildtools-image, tag ] 
+ needs: [ check-permissions, build-build-tools-image, tag ] runs-on: [ self-hosted, gen3, large ] strategy: @@ -844,8 +869,7 @@ jobs: GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} PG_VERSION=${{ matrix.version }} BUILD_TAG=${{needs.tag.outputs.build-tag}} - TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}} - REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com + TAG=${{ needs.build-build-tools-image.outputs.image-tag }} provenance: false push: true pull: true @@ -938,7 +962,7 @@ jobs: - name: Verify docker-compose example timeout-minutes: 20 - run: env REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh + run: env TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh - name: Print logs and clean up if: always() @@ -1218,3 +1242,10 @@ jobs: time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/${PREFIX}/${FILENAME} done + + pin-build-tools-image: + needs: [ build-build-tools-image, promote-images, regress-tests ] + if: github.ref_name == 'main' + uses: ./.github/workflows/pin-build-tools-image.yml + with: + from-tag: ${{ needs.build-build-tools-image.outputs.image-tag }} diff --git a/.github/workflows/check-build-tools-image.yml b/.github/workflows/check-build-tools-image.yml new file mode 100644 index 0000000000..28646dfc19 --- /dev/null +++ b/.github/workflows/check-build-tools-image.yml @@ -0,0 +1,58 @@ +name: Check build-tools image + +on: + workflow_call: + outputs: + image-tag: + description: "build-tools image tag" + value: ${{ jobs.check-image.outputs.tag }} + found: + description: "Whether the image is found in the registry" + value: ${{ jobs.check-image.outputs.found }} + +defaults: + run: + shell: bash -euo pipefail {0} + +# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. 
+permissions: {} + +jobs: + check-image: + runs-on: ubuntu-latest + outputs: + tag: ${{ steps.get-build-tools-tag.outputs.image-tag }} + found: ${{ steps.check-image.outputs.found }} + + steps: + - name: Get build-tools image tag for the current commit + id: get-build-tools-tag + env: + COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + LAST_BUILD_TOOLS_SHA=$( + gh api \ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + --method GET \ + --field path=Dockerfile.build-tools \ + --field sha=${COMMIT_SHA} \ + --field per_page=1 \ + --jq ".[0].sha" \ + "/repos/${GITHUB_REPOSITORY}/commits" + ) + echo "image-tag=${LAST_BUILD_TOOLS_SHA}" | tee -a $GITHUB_OUTPUT + + - name: Check if such tag found in the registry + id: check-image + env: + IMAGE_TAG: ${{ steps.get-build-tools-tag.outputs.image-tag }} + run: | + if docker manifest inspect neondatabase/build-tools:${IMAGE_TAG}; then + found=true + else + found=false + fi + + echo "found=${found}" | tee -a $GITHUB_OUTPUT diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 1c9763cc00..5a2f9d6645 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -26,6 +26,17 @@ jobs: with: github-event-name: ${{ github.event_name}} + check-build-tools-image: + needs: [ check-permissions ] + uses: ./.github/workflows/check-build-tools-image.yml + + build-build-tools-image: + needs: [ check-build-tools-image ] + uses: ./.github/workflows/build-build-tools-image.yml + with: + image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }} + secrets: inherit + check-macos-build: needs: [ check-permissions ] if: | @@ -123,7 +134,7 @@ jobs: run: ./run_clippy.sh check-linux-arm-build: - needs: [ check-permissions ] + needs: [ check-permissions, build-build-tools-image ] timeout-minutes: 90 runs-on: [ self-hosted, dev, arm64 ] @@ -137,7 +148,10 @@ jobs: AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init steps: @@ -244,12 +258,15 @@ jobs: cargo nextest run --package remote_storage --test test_real_azure check-codestyle-rust-arm: - needs: [ check-permissions ] + needs: [ check-permissions, build-build-tools-image ] timeout-minutes: 90 runs-on: [ self-hosted, dev, arm64 ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init steps: @@ -316,14 +333,17 @@ jobs: run: cargo deny check gather-rust-build-stats: - needs: [ check-permissions ] + needs: [ check-permissions, build-build-tools-image ] if: | contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') || contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || github.ref_name == 'main' runs-on: [ self-hosted, gen3, large ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init 
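The job above derives the image tag from the last commit that touched `Dockerfile.build-tools` and only treats the image as present if a manifest with that tag already exists in the registry. A rough local equivalent, sketched in Python on top of the `git` and `docker` CLIs (illustrative, not part of the workflow):

```python
# Sketch: compute the build-tools tag locally and check whether it is already published.
# Assumes a git checkout of the repo and a docker CLI on PATH.
import subprocess


def build_tools_tag() -> str:
    # sha of the last commit that touched Dockerfile.build-tools
    out = subprocess.run(
        ["git", "log", "-n", "1", "--format=%H", "--", "Dockerfile.build-tools"],
        check=True, capture_output=True, text=True,
    )
    return out.stdout.strip()


def image_exists(tag: str) -> bool:
    # mirrors the workflow's `docker manifest inspect` check
    res = subprocess.run(
        ["docker", "manifest", "inspect", f"neondatabase/build-tools:{tag}"],
        capture_output=True,
    )
    return res.returncode == 0


if __name__ == "__main__":
    tag = build_tools_tag()
    print(f"build-tools tag: {tag}, found in registry: {image_exists(tag)}")
```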
env: diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml new file mode 100644 index 0000000000..c941692066 --- /dev/null +++ b/.github/workflows/pin-build-tools-image.yml @@ -0,0 +1,72 @@ +name: 'Pin build-tools image' + +on: + workflow_dispatch: + inputs: + from-tag: + description: 'Source tag' + required: true + type: string + workflow_call: + inputs: + from-tag: + description: 'Source tag' + required: true + type: string + +defaults: + run: + shell: bash -euo pipefail {0} + +concurrency: + group: pin-build-tools-image-${{ inputs.from-tag }} + +permissions: {} + +jobs: + tag-image: + runs-on: ubuntu-latest + + env: + FROM_TAG: ${{ inputs.from-tag }} + TO_TAG: pinned + + steps: + - name: Check if we really need to pin the image + id: check-manifests + run: | + docker manifest inspect neondatabase/build-tools:${FROM_TAG} > ${FROM_TAG}.json + docker manifest inspect neondatabase/build-tools:${TO_TAG} > ${TO_TAG}.json + + if diff ${FROM_TAG}.json ${TO_TAG}.json; then + skip=true + else + skip=false + fi + + echo "skip=${skip}" | tee -a $GITHUB_OUTPUT + + - uses: docker/login-action@v3 + if: steps.check-manifests.outputs.skip == 'false' + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + + - name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub + if: steps.check-manifests.outputs.skip == 'false' + run: | + docker buildx imagetools create -t neondatabase/build-tools:${TO_TAG} \ + neondatabase/build-tools:${FROM_TAG} + + - uses: docker/login-action@v3 + if: steps.check-manifests.outputs.skip == 'false' + with: + registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com + username: ${{ secrets.AWS_ACCESS_KEY_DEV }} + password: ${{ secrets.AWS_SECRET_KEY_DEV }} + + - name: Tag build-tools with `${{ env.TO_TAG }}` in ECR + if: steps.check-manifests.outputs.skip == 'false' + run: | + docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG} \ + neondatabase/build-tools:${FROM_TAG} diff --git a/.github/workflows/update_build_tools_image.yml b/.github/workflows/update_build_tools_image.yml deleted file mode 100644 index 900724fc60..0000000000 --- a/.github/workflows/update_build_tools_image.yml +++ /dev/null @@ -1,70 +0,0 @@ -name: 'Update build tools image tag' - -# This workflow it used to update tag of build tools in ECR. -# The most common use case is adding/moving `pinned` tag to `${GITHUB_RUN_IT}` image. 
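The pin workflow above first diffs the manifests of the source tag and `pinned` and re-tags only when they differ. The same skip-if-identical logic, sketched in Python against the `docker` CLI (illustrative; image names as in the workflow):

```python
# Sketch: promote neondatabase/build-tools:<from_tag> to :pinned unless it already matches.
import subprocess
from typing import Optional


def manifest(image: str) -> Optional[str]:
    res = subprocess.run(["docker", "manifest", "inspect", image], capture_output=True, text=True)
    return res.stdout if res.returncode == 0 else None


def pin(from_tag: str, to_tag: str = "pinned") -> None:
    src = f"neondatabase/build-tools:{from_tag}"
    dst = f"neondatabase/build-tools:{to_tag}"
    src_manifest = manifest(src)
    if src_manifest is not None and src_manifest == manifest(dst):
        print("manifests are identical, nothing to do")
        return
    # multi-arch aware re-tag, same command the workflow runs
    subprocess.run(["docker", "buildx", "imagetools", "create", "-t", dst, src], check=True)


# pin("cc98d9b00d670f182c507ae3783342bd7e64c31e")
```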
- -on: - workflow_dispatch: - inputs: - from-tag: - description: 'Source tag' - required: true - type: string - to-tag: - description: 'Destination tag' - required: true - type: string - default: 'pinned' - -defaults: - run: - shell: bash -euo pipefail {0} - -permissions: {} - -jobs: - tag-image: - runs-on: [ self-hosted, gen3, small ] - - env: - ECR_IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools - DOCKER_HUB_IMAGE: docker.io/neondatabase/build-tools - FROM_TAG: ${{ inputs.from-tag }} - TO_TAG: ${{ inputs.to-tag }} - - steps: - # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings - # The default value is ~/.docker - - name: Set custom docker config directory - run: | - mkdir -p .docker-custom - echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV - - - uses: docker/login-action@v2 - with: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - - uses: docker/login-action@v2 - with: - registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com - username: ${{ secrets.AWS_ACCESS_KEY_DEV }} - password: ${{ secrets.AWS_SECRET_KEY_DEV }} - - - uses: actions/setup-go@v5 - with: - go-version: '1.21' - - - name: Install crane - run: | - go install github.com/google/go-containerregistry/cmd/crane@a0658aa1d0cc7a7f1bcc4a3af9155335b6943f40 # v0.18.0 - - - name: Copy images - run: | - crane copy "${ECR_IMAGE}:${FROM_TAG}" "${ECR_IMAGE}:${TO_TAG}" - crane copy "${ECR_IMAGE}:${FROM_TAG}" "${DOCKER_HUB_IMAGE}:${TO_TAG}" - - - name: Remove custom docker config directory - if: always() - run: | - rm -rf .docker-custom diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2e447fba47..164eb77f58 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -74,16 +74,11 @@ We're using the following approach to make it work: For details see [`approved-for-ci-run.yml`](.github/workflows/approved-for-ci-run.yml) -## How do I add the "pinned" tag to an buildtools image? -We use the `pinned` tag for `Dockerfile.buildtools` build images in our CI/CD setup, currently adding the `pinned` tag is a manual operation. +## How do I make build-tools image "pinned" -You can call it from GitHub UI: https://github.com/neondatabase/neon/actions/workflows/update_build_tools_image.yml, -or using GitHub CLI: +It's possible to update the `pinned` tag of the `build-tools` image using the `pin-build-tools-image.yml` workflow. ```bash -gh workflow -R neondatabase/neon run update_build_tools_image.yml \ - -f from-tag=6254913013 \ - -f to-tag=pinned \ - -# Default `-f to-tag` is `pinned`, so the parameter can be omitted. -``` \ No newline at end of file +gh workflow -R neondatabase/neon run pin-build-tools-image.yml \ + -f from-tag=cc98d9b00d670f182c507ae3783342bd7e64c31e +``` diff --git a/Dockerfile.buildtools b/Dockerfile.build-tools similarity index 100% rename from Dockerfile.buildtools rename to Dockerfile.build-tools From 48957e23b719250b81414b8183628b997212b516 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 28 Feb 2024 17:10:07 +0400 Subject: [PATCH 0296/1571] proxy: refactor span usage (#6946) ## Problem Hard to find error reasons by endpoint for HTTP flow. ## Summary of changes I want all root spans to have session id and endpoint id. I want all root spans to be consistent. 
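The pattern itself is not Rust-specific: create one root context per connection up front (session id, peer address), record the endpoint into it the moment it is known, and let every subsequent log line inherit those fields. A minimal Python logging sketch of that idea, purely illustrative and unrelated to the proxy code itself:

```python
# Illustrative only: a per-connection context whose fields show up on every log line.
import contextvars
import logging
import uuid

conn_ctx: contextvars.ContextVar[dict] = contextvars.ContextVar("conn_ctx", default={})


class ConnContextFilter(logging.Filter):
    def filter(self, record: logging.LogRecord) -> bool:
        ctx = conn_ctx.get()
        record.session_id = ctx.get("session_id", "-")
        record.ep = ctx.get("endpoint", "-")
        return True


logging.basicConfig(
    format="%(levelname)s session=%(session_id)s ep=%(ep)s %(message)s", level=logging.INFO
)
log = logging.getLogger("proxy-sketch")
log.addFilter(ConnContextFilter())


def handle_connection(peer_addr: str) -> None:
    conn_ctx.set({"session_id": str(uuid.uuid4()), "peer_addr": peer_addr})
    log.info("accepted connection from %s", peer_addr)
    conn_ctx.get()["endpoint"] = "ep-example-123456"  # filled in once credentials are parsed
    log.info("credentials resolved")


handle_connection("10.0.0.1")
```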
--- proxy/src/auth/backend.rs | 1 - proxy/src/auth/backend/classic.rs | 2 +- proxy/src/auth/backend/link.rs | 1 - proxy/src/auth/credentials.rs | 3 +- proxy/src/context.rs | 14 +++- proxy/src/proxy.rs | 115 ++++++++++++-------------- proxy/src/proxy/tests.rs | 1 + proxy/src/serverless.rs | 50 +++++------ proxy/src/serverless/sql_over_http.rs | 22 +---- 9 files changed, 99 insertions(+), 110 deletions(-) diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 5cb8074cd5..11af85caa4 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -194,7 +194,6 @@ async fn auth_quirks( let res = hacks::password_hack_no_authentication(ctx, info, client).await?; ctx.set_endpoint_id(res.info.endpoint.clone()); - tracing::Span::current().record("ep", &tracing::field::display(&res.info.endpoint)); let password = match res.keys { ComputeCredentialKeys::Password(p) => p, _ => unreachable!("password hack should return a password"), diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index d075331846..b98fa63120 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -44,7 +44,7 @@ pub(super) async fn authenticate( ) .await .map_err(|e| { - warn!("error processing scram messages error = authentication timed out, execution time exeeded {} seconds", config.scram_protocol_timeout.as_secs()); + warn!("error processing scram messages error = authentication timed out, execution time exceeded {} seconds", config.scram_protocol_timeout.as_secs()); auth::AuthError::user_timeout(e) })??; diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index bf9ebf4c18..ec7d891247 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -102,7 +102,6 @@ pub(super) async fn authenticate( ctx.set_user(db_info.user.into()); ctx.set_project(db_info.aux.clone()); - tracing::Span::current().record("ep", &tracing::field::display(&db_info.aux.endpoint_id)); // Backwards compatibility. pg_sni_proxy uses "--" in domain names // while direct connections do not. 
Once we migrate to pg_sni_proxy diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index d318b3be54..89773aa1ff 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -142,10 +142,9 @@ impl ComputeUserInfoMaybeEndpoint { if let Some(ep) = &endpoint { ctx.set_endpoint_id(ep.clone()); - tracing::Span::current().record("ep", &tracing::field::display(ep)); } - info!(%user, project = endpoint.as_deref(), "credentials"); + info!(%user, "credentials"); if sni.is_some() { info!("Connection with sni"); NUM_CONNECTION_ACCEPTED_BY_SNI diff --git a/proxy/src/context.rs b/proxy/src/context.rs index 4d8ced6f8f..abad8a6412 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -5,6 +5,7 @@ use once_cell::sync::OnceCell; use smol_str::SmolStr; use std::net::IpAddr; use tokio::sync::mpsc; +use tracing::{field::display, info_span, Span}; use uuid::Uuid; use crate::{ @@ -29,6 +30,7 @@ pub struct RequestMonitoring { pub protocol: &'static str, first_packet: chrono::DateTime, region: &'static str, + pub span: Span, // filled in as they are discovered project: Option, @@ -64,12 +66,21 @@ impl RequestMonitoring { protocol: &'static str, region: &'static str, ) -> Self { + let span = info_span!( + "connect_request", + %protocol, + ?session_id, + %peer_addr, + ep = tracing::field::Empty, + ); + Self { peer_addr, session_id, protocol, first_packet: Utc::now(), region, + span, project: None, branch: None, @@ -101,8 +112,8 @@ impl RequestMonitoring { } pub fn set_project(&mut self, x: MetricsAuxInfo) { + self.set_endpoint_id(x.endpoint_id); self.branch = Some(x.branch_id); - self.endpoint_id = Some(x.endpoint_id); self.project = Some(x.project_id); self.is_cold_start = x.is_cold_start; } @@ -112,6 +123,7 @@ impl RequestMonitoring { } pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) { + self.span.record("ep", display(&endpoint_id)); crate::metrics::CONNECTING_ENDPOINTS .with_label_values(&[self.protocol]) .measure(&endpoint_id); diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 8a9445303a..d94fc67491 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -22,7 +22,6 @@ use crate::{ stream::{PqStream, Stream}, EndpointCacheKey, }; -use anyhow::{bail, Context}; use futures::TryFutureExt; use itertools::Itertools; use once_cell::sync::OnceCell; @@ -33,7 +32,7 @@ use std::sync::Arc; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; use tokio_util::sync::CancellationToken; -use tracing::{error, info, info_span, Instrument}; +use tracing::{error, info, Instrument}; use self::{ connect_compute::{connect_to_compute, TcpMechanism}, @@ -83,68 +82,67 @@ pub async fn task_main( let cancellation_handler = Arc::clone(&cancellation_handler); let endpoint_rate_limiter = endpoint_rate_limiter.clone(); - let session_span = info_span!( - "handle_client", - ?session_id, - peer_addr = tracing::field::Empty, - ep = tracing::field::Empty, - ); - - connections.spawn( - async move { - info!("accepted postgres client connection"); - - let mut socket = WithClientIp::new(socket); - let mut peer_addr = peer_addr.ip(); - if let Some(addr) = socket.wait_for_addr().await? 
{ - peer_addr = addr.ip(); - tracing::Span::current().record("peer_addr", &tracing::field::display(addr)); - } else if config.require_client_ip { - bail!("missing required client IP"); + connections.spawn(async move { + let mut socket = WithClientIp::new(socket); + let mut peer_addr = peer_addr.ip(); + match socket.wait_for_addr().await { + Ok(Some(addr)) => peer_addr = addr.ip(), + Err(e) => { + error!("per-client task finished with an error: {e:#}"); + return; } + Ok(None) if config.require_client_ip => { + error!("missing required client IP"); + return; + } + Ok(None) => {} + } - socket - .inner - .set_nodelay(true) - .context("failed to set socket option")?; + match socket.inner.set_nodelay(true) { + Ok(()) => {}, + Err(e) => { + error!("per-client task finished with an error: failed to set socket option: {e:#}"); + return; + }, + }; - let mut ctx = RequestMonitoring::new(session_id, peer_addr, "tcp", &config.region); + let mut ctx = RequestMonitoring::new(session_id, peer_addr, "tcp", &config.region); + let span = ctx.span.clone(); - let res = handle_client( - config, - &mut ctx, - cancellation_handler, - socket, - ClientMode::Tcp, - endpoint_rate_limiter, - ) - .await; + let res = handle_client( + config, + &mut ctx, + cancellation_handler, + socket, + ClientMode::Tcp, + endpoint_rate_limiter, + ) + .instrument(span.clone()) + .await; - match res { - Err(e) => { - // todo: log and push to ctx the error kind - ctx.set_error_kind(e.get_error_kind()); - ctx.log(); - Err(e.into()) - } - Ok(None) => { - ctx.set_success(); - ctx.log(); - Ok(()) - } - Ok(Some(p)) => { - ctx.set_success(); - ctx.log(); - p.proxy_pass().await + match res { + Err(e) => { + // todo: log and push to ctx the error kind + ctx.set_error_kind(e.get_error_kind()); + ctx.log(); + error!(parent: &span, "per-client task finished with an error: {e:#}"); + } + Ok(None) => { + ctx.set_success(); + ctx.log(); + } + Ok(Some(p)) => { + ctx.set_success(); + ctx.log(); + match p.proxy_pass().instrument(span.clone()).await { + Ok(()) => {} + Err(e) => { + error!(parent: &span, "per-client task finished with an error: {e:#}"); + } } } } - .unwrap_or_else(move |e| { - // Acknowledge that the task has finished with an error. 
- error!("per-client task finished with an error: {e:#}"); - }) - .instrument(session_span), - ); + }); } connections.close(); @@ -232,10 +230,7 @@ pub async fn handle_client( mode: ClientMode, endpoint_rate_limiter: Arc, ) -> Result>, ClientRequestError> { - info!( - protocol = ctx.protocol, - "handling interactive connection from client" - ); + info!("handling interactive connection from client"); let proto = ctx.protocol; let _client_gauge = NUM_CLIENT_CONNECTION_GAUGE diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index c407a5572a..595d9c4979 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -17,6 +17,7 @@ use crate::console::{self, CachedNodeInfo, NodeInfo}; use crate::error::ErrorKind; use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT}; use crate::{auth, http, sasl, scram}; +use anyhow::{bail, Context}; use async_trait::async_trait; use rstest::rstest; use tokio_postgres::config::SslMode; diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index dbf4f9cc74..b5806aec53 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -34,13 +34,14 @@ use hyper::{ Body, Method, Request, Response, }; +use std::convert::Infallible; use std::net::IpAddr; use std::task::Poll; use std::{future::ready, sync::Arc}; use tls_listener::TlsListener; use tokio::net::TcpListener; use tokio_util::sync::CancellationToken; -use tracing::{error, info, info_span, warn, Instrument}; +use tracing::{error, info, warn, Instrument}; use utils::http::{error::ApiError, json::json_response}; pub const SERVERLESS_DRIVER_SNI: &str = "api"; @@ -134,24 +135,19 @@ pub async fn task_main( let cancellation_handler = cancellation_handler.clone(); async move { - let session_id = uuid::Uuid::new_v4(); - - request_handler( - req, - config, - backend, - ws_connections, - cancellation_handler, - session_id, - peer_addr.ip(), - endpoint_rate_limiter, + Ok::<_, Infallible>( + request_handler( + req, + config, + backend, + ws_connections, + cancellation_handler, + peer_addr.ip(), + endpoint_rate_limiter, + ) + .await + .map_or_else(|e| e.into_response(), |r| r), ) - .instrument(info_span!( - "serverless", - session = %session_id, - %peer_addr, - )) - .await } }, ))) @@ -210,10 +206,11 @@ async fn request_handler( backend: Arc, ws_connections: TaskTracker, cancellation_handler: Arc, - session_id: uuid::Uuid, peer_addr: IpAddr, endpoint_rate_limiter: Arc, ) -> Result, ApiError> { + let session_id = uuid::Uuid::new_v4(); + let host = request .headers() .get("host") @@ -223,15 +220,15 @@ async fn request_handler( // Check if the request is a websocket upgrade request. if hyper_tungstenite::is_upgrade_request(&request) { - info!(session_id = ?session_id, "performing websocket upgrade"); + let ctx = RequestMonitoring::new(session_id, peer_addr, "ws", &config.region); + let span = ctx.span.clone(); + info!(parent: &span, "performing websocket upgrade"); let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None) .map_err(|e| ApiError::BadRequest(e.into()))?; ws_connections.spawn( async move { - let ctx = RequestMonitoring::new(session_id, peer_addr, "ws", &config.region); - if let Err(e) = websocket::serve_websocket( config, ctx, @@ -242,18 +239,21 @@ async fn request_handler( ) .await { - error!(session_id = ?session_id, "error in websocket connection: {e:#}"); + error!("error in websocket connection: {e:#}"); } } - .in_current_span(), + .instrument(span), ); // Return the response so the spawned future can continue. 
Ok(response) } else if request.uri().path() == "/sql" && request.method() == Method::POST { let ctx = RequestMonitoring::new(session_id, peer_addr, "http", &config.region); + let span = ctx.span.clone(); - sql_over_http::handle(config, ctx, request, backend).await + sql_over_http::handle(config, ctx, request, backend) + .instrument(span) + .await } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS { Response::builder() .header("Allow", "OPTIONS, POST") diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 63fe87eade..7f51ba82cc 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -21,7 +21,6 @@ use tokio_postgres::ReadyForQueryStatus; use tokio_postgres::Transaction; use tracing::error; use tracing::info; -use tracing::instrument; use url::Url; use utils::http::error::ApiError; use utils::http::json::json_response; @@ -291,7 +290,7 @@ pub async fn handle( // ctx.set_error_kind(crate::error::ErrorKind::RateLimit); let message = format!( - "HTTP-Connection timed out, execution time exeeded {} seconds", + "HTTP-Connection timed out, execution time exceeded {} seconds", config.http_config.request_timeout.as_secs() ); error!(message); @@ -309,14 +308,6 @@ pub async fn handle( Ok(response) } -#[instrument( - name = "sql-over-http", - skip_all, - fields( - pid = tracing::field::Empty, - conn_id = tracing::field::Empty - ) -)] async fn handle_inner( config: &'static ProxyConfig, ctx: &mut RequestMonitoring, @@ -326,10 +317,7 @@ async fn handle_inner( let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE .with_label_values(&[ctx.protocol]) .guard(); - info!( - protocol = ctx.protocol, - "handling interactive connection from client" - ); + info!("handling interactive connection from client"); // // Determine the destination and connection params @@ -337,11 +325,7 @@ async fn handle_inner( let headers = request.headers(); // TLS config should be there. let conn_info = get_conn_info(ctx, headers, config.tls_config.as_ref().unwrap())?; - info!( - user = conn_info.user_info.user.as_str(), - project = conn_info.user_info.endpoint.as_str(), - "credentials" - ); + info!(user = conn_info.user_info.user.as_str(), "credentials"); // Determine the output options. Default behaviour is 'false'. Anything that is not // strictly 'true' assumed to be false. From edd809747bc8558dd297ac50f41b213c629700c7 Mon Sep 17 00:00:00 2001 From: Andreas Scherbaum Date: Wed, 28 Feb 2024 14:10:58 +0100 Subject: [PATCH 0297/1571] English keyboard has "z" and "y" switched (#6947) ## Problem The "z" and "y" letters are switched on the English keyboard, and I'm used to a German keyboard. Very embarrassing. ## Summary of changes Fix syntax error in README Co-authored-by: Andreas Scherbaum --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ce14a32a2a..95926b4628 100644 --- a/README.md +++ b/README.md @@ -267,7 +267,7 @@ You can use [`flamegraph-rs`](https://github.com/flamegraph-rs/flamegraph) or th For cleaning up the source tree from build artifacts, run `make clean` in the source directory. -For removing every artifact from build and configure steps, run `make distclean`, and also consider removing the cargo binaries in the `target` directory, as well as the database in the `.neon` directory. Note that removing the `.neon` directorz will remove your database, with all data in it. You have been warned! 
+For removing every artifact from build and configure steps, run `make distclean`, and also consider removing the cargo binaries in the `target` directory, as well as the database in the `.neon` directory. Note that removing the `.neon` directory will remove your database, with all data in it. You have been warned! ## Documentation From 60a232400b23859914777039196fddad38ba2d6d Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 28 Feb 2024 15:36:17 +0100 Subject: [PATCH 0298/1571] CI(pin-build-tools-image): pass secrets to the job (#6949) ## Problem `pin-build-tools-image` job doesn't have access to secrets and thus fails. Missed in the original PR[0] - [0] https://github.com/neondatabase/neon/pull/6795 ## Summary of changes - pass secrets to `pin-build-tools-image` job --- .github/workflows/build_and_test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index e29a58bbe2..2517c97355 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1249,3 +1249,4 @@ jobs: uses: ./.github/workflows/pin-build-tools-image.yml with: from-tag: ${{ needs.build-build-tools-image.outputs.image-tag }} + secrets: inherit From e5384ebefc1c983c2e5eb73a5763b4c514b2c599 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 28 Feb 2024 14:53:35 +0000 Subject: [PATCH 0299/1571] pageserver: accelerate tenant activation on HTTP API timeline read requests (#6944) ## Problem Callers of the timeline creation API may issue timeline GETs ahead of creation to e.g. check if their intended timeline already exists, or to learn the LSN of a parent timeline. Although the timeline creation API already triggers activation of a timeline if it's currently waiting to activate, the GET endpoint doesn't, so such callers will encounter 503 responses for several minutes after a pageserver restarts, while tenants are lazily warming up. The original scope of which APIs will activate a timeline was quite small, but really it makes sense to do it for any API that needs a particular timeline to be active. ## Summary of changes - In the timeline detail GET handler, use wait_to_become_active, which triggers immediate activation of a tenant if it was currently waiting for the warmup semaphore, then waits up to 5 seconds for the activation to complete. If it doesn't complete promptly, we return a 503 as before. - Modify active_timeline_for_active_tenant to also use wait_to_become_active, which indirectly makes several other timeline-scope request handlers fast-activate a tenant when called. This is important because a timeline creation flow could also use e.g. get_lsn_for_timestamp as a precursor to creating a timeline. - There is some risk to this change: an excessive number of timeline GET requests could cause too many tenant activations to happen at the same time, leading to excessive queue depth to the S3 client. However, this was already the case for e.g. many concurrent timeline creations. --- pageserver/src/http/routes.rs | 64 ++++++++++++++++++++++++++++------- 1 file changed, 51 insertions(+), 13 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 04211fbb7f..12bd21fd7b 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -661,9 +661,14 @@ async fn timeline_detail_handler( // Logical size calculation needs downloading. 
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + let state = get_state(&request); let timeline_info = async { - let tenant = mgr::get_tenant(tenant_shard_id, true)?; + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id, false)?; + + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; let timeline = tenant .get_timeline(timeline_id, false) @@ -696,6 +701,7 @@ async fn get_lsn_by_timestamp_handler( ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); if !tenant_shard_id.is_zero() { // Requires SLRU contents, which are only stored on shard zero @@ -712,7 +718,10 @@ async fn get_lsn_by_timestamp_handler( let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp); let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; + + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; let result = timeline .find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx) .await?; @@ -743,6 +752,7 @@ async fn get_timestamp_of_lsn_handler( ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); if !tenant_shard_id.is_zero() { // Requires SLRU contents, which are only stored on shard zero @@ -759,7 +769,9 @@ async fn get_timestamp_of_lsn_handler( .map_err(ApiError::BadRequest)?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; let result = timeline.get_timestamp_for_lsn(lsn, &ctx).await?; match result { @@ -1159,10 +1171,13 @@ async fn layer_map_info_handler( let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let reset: LayerAccessStatsReset = parse_query_param(&request, "reset")?.unwrap_or(LayerAccessStatsReset::NoReset); + let state = get_state(&request); check_permission(&request, Some(tenant_shard_id.tenant_id))?; - let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; let layer_map_info = timeline.layer_map_info(reset).await; json_response(StatusCode::OK, layer_map_info) @@ -1176,8 +1191,11 @@ async fn layer_download_handler( let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let layer_file_name = get_request_param(&request, "layer_file_name")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); - let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; let downloaded = timeline .download_layer(layer_file_name) .await @@ -1201,8 +1219,11 @@ async fn evict_timeline_layer_handler( check_permission(&request, Some(tenant_shard_id.tenant_id))?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let layer_file_name = 
get_request_param(&request, "layer_file_name")?; + let state = get_state(&request); - let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; let evicted = timeline .evict_layer(layer_file_name) .await @@ -1612,6 +1633,8 @@ async fn timeline_compact_handler( let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); + let mut flags = EnumSet::empty(); if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? { flags |= CompactFlags::ForceRepartition; @@ -1622,7 +1645,7 @@ async fn timeline_compact_handler( async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; timeline .compact(&cancel, flags, &ctx) .await @@ -1642,6 +1665,8 @@ async fn timeline_checkpoint_handler( let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); + let mut flags = EnumSet::empty(); if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? { flags |= CompactFlags::ForceRepartition; @@ -1652,7 +1677,7 @@ async fn timeline_checkpoint_handler( async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; timeline .freeze_and_flush() .await @@ -1677,7 +1702,11 @@ async fn timeline_download_remote_layers_handler_post( let body: DownloadRemoteLayersTaskSpawnRequest = json_request(&mut request).await?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; - let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; + let state = get_state(&request); + + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; match timeline.spawn_download_all_remote_layers(body).await { Ok(st) => json_response(StatusCode::ACCEPTED, st), Err(st) => json_response(StatusCode::CONFLICT, st), @@ -1691,8 +1720,11 @@ async fn timeline_download_remote_layers_handler_get( let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let state = get_state(&request); - let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; let info = timeline .get_download_all_remote_layers_task_info() .context("task never started since last pageserver process start") @@ -1741,6 +1773,7 @@ async fn getpage_at_lsn_handler( let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); struct 
Key(crate::repository::Key); @@ -1759,7 +1792,7 @@ async fn getpage_at_lsn_handler( async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; let page = timeline.get(key.0, lsn, &ctx).await?; @@ -1782,12 +1815,13 @@ async fn timeline_collect_keyspace( let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); let at_lsn: Option = parse_query_param(&request, "at_lsn")?; async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn()); let keys = timeline .collect_keyspace(at_lsn, &ctx) @@ -1803,10 +1837,14 @@ async fn timeline_collect_keyspace( } async fn active_timeline_of_active_tenant( + tenant_manager: &TenantManager, tenant_shard_id: TenantShardId, timeline_id: TimelineId, ) -> Result, ApiError> { - let tenant = mgr::get_tenant(tenant_shard_id, true)?; + let tenant = tenant_manager.get_attached_tenant_shard(tenant_shard_id, false)?; + + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + tenant .get_timeline(timeline_id, true) .map_err(|e| ApiError::NotFound(e.into())) From 54586d6b575a0a49e905db45b11147f294d5ba69 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 28 Feb 2024 16:24:35 +0100 Subject: [PATCH 0300/1571] CI: create compute-tools image from compute-node image (#6899) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem We build compute-tools binary twice — in `compute-node` and in `compute-tools` jobs, and we build them slightly differently: - `cargo build --locked --profile release-line-debug-size-lto` (previously in `compute-node`) - `mold -run cargo build -p compute_tools --locked --release` (previously in `compute-tools`) Before: - compute-node: **6m 34s** - compute-tools (as a separate job): **7m 47s** After: - compute-node: **7m 34s** - compute-tools (as a separate step, within compute-node job): **5s** ## Summary of changes - Move compute-tools image creation to `Dockerfile.compute-node` - Delete `Dockerfile.compute-tools` --- .github/workflows/build_and_test.yml | 78 ++++++++-------------------- Dockerfile.compute-node | 12 ++++- Dockerfile.compute-tools | 32 ------------ 3 files changed, 34 insertions(+), 88 deletions(-) delete mode 100644 Dockerfile.compute-tools diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 2517c97355..2e52e7c28f 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -769,58 +769,6 @@ jobs: run: | rm -rf .docker-custom - compute-tools-image: - runs-on: [ self-hosted, gen3, large ] - needs: [ check-permissions, build-build-tools-image, tag ] - - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - submodules: true - fetch-depth: 0 - - # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings - # The default value is ~/.docker - - name: Set custom 
docker config directory - run: | - mkdir -p .docker-custom - echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV - - uses: docker/setup-buildx-action@v3 - - - uses: docker/login-action@v3 - with: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - - uses: docker/login-action@v3 - with: - registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com - username: ${{ secrets.AWS_ACCESS_KEY_DEV }} - password: ${{ secrets.AWS_SECRET_KEY_DEV }} - - - uses: docker/build-push-action@v5 - with: - context: . - build-args: | - GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} - BUILD_TAG=${{needs.tag.outputs.build-tag}} - TAG=${{ needs.build-build-tools-image.outputs.image-tag }} - provenance: false - push: true - pull: true - file: Dockerfile.compute-tools - cache-from: type=registry,ref=neondatabase/compute-tools:cache - cache-to: type=registry,ref=neondatabase/compute-tools:cache,mode=max - tags: | - 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} - neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} - - - name: Remove custom docker config directory - if: always() - run: | - rm -rf .docker-custom - compute-node-image: needs: [ check-permissions, build-build-tools-image, tag ] runs-on: [ self-hosted, gen3, large ] @@ -862,13 +810,14 @@ jobs: username: ${{ secrets.AWS_ACCESS_KEY_DEV }} password: ${{ secrets.AWS_SECRET_KEY_DEV }} - - uses: docker/build-push-action@v5 + - name: Build compute-node image + uses: docker/build-push-action@v5 with: context: . build-args: | GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} PG_VERSION=${{ matrix.version }} - BUILD_TAG=${{needs.tag.outputs.build-tag}} + BUILD_TAG=${{ needs.tag.outputs.build-tag }} TAG=${{ needs.build-build-tools-image.outputs.image-tag }} provenance: false push: true @@ -880,6 +829,25 @@ jobs: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + - name: Build compute-tools image + # compute-tools are Postgres independent, so build it only once + if: ${{ matrix.version == 'v16' }} + uses: docker/build-push-action@v5 + with: + target: compute-tools-image + context: . + build-args: | + GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} + BUILD_TAG=${{ needs.tag.outputs.build-tag }} + TAG=${{ needs.build-build-tools-image.outputs.image-tag }} + provenance: false + push: true + pull: true + file: Dockerfile.compute-node + tags: | + 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} + neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} + - name: Remove custom docker config directory if: always() run: | @@ -927,7 +895,7 @@ jobs: docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} test-images: - needs: [ check-permissions, tag, neon-image, compute-node-image, compute-tools-image ] + needs: [ check-permissions, tag, neon-image, compute-node-image ] runs-on: [ self-hosted, gen3, small ] steps: diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 149ca5109b..c73b9ce5c9 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -891,7 +891,17 @@ ENV BUILD_TAG=$BUILD_TAG USER nonroot # Copy entire project to get Cargo.* files with proper dependencies for the whole project COPY --chown=nonroot . 
. -RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto +RUN cd compute_tools && mold -run cargo build --locked --profile release-line-debug-size-lto + +######################################################################################### +# +# Final compute-tools image +# +######################################################################################### + +FROM debian:bullseye-slim AS compute-tools-image + +COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl ######################################################################################### # diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools deleted file mode 100644 index cc305cc556..0000000000 --- a/Dockerfile.compute-tools +++ /dev/null @@ -1,32 +0,0 @@ -# First transient image to build compute_tools binaries -# NB: keep in sync with rust image version in .github/workflows/build_and_test.yml -ARG REPOSITORY=neondatabase -ARG IMAGE=build-tools -ARG TAG=pinned -ARG BUILD_TAG - -FROM $REPOSITORY/$IMAGE:$TAG AS rust-build -WORKDIR /home/nonroot - -# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds. -# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations. -# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build. -ARG RUSTC_WRAPPER=cachepot -ENV AWS_REGION=eu-central-1 -ENV CACHEPOT_S3_KEY_PREFIX=cachepot -ARG CACHEPOT_BUCKET=neon-github-dev -#ARG AWS_ACCESS_KEY_ID -#ARG AWS_SECRET_ACCESS_KEY -ARG BUILD_TAG -ENV BUILD_TAG=$BUILD_TAG - -COPY . . - -RUN set -e \ - && mold -run cargo build -p compute_tools --locked --release \ - && cachepot -s - -# Final image that only has one binary -FROM debian:bullseye-slim - -COPY --from=rust-build /home/nonroot/target/release/compute_ctl /usr/local/bin/compute_ctl From d04af08567cc3ff94ff19a2f6b3f7a2a1e3c55d1 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 29 Feb 2024 10:00:01 +0000 Subject: [PATCH 0301/1571] control_plane: storage controller secrets by env (#6952) ## Problem Sometimes folks prefer not to expose secrets as CLI args. ## Summary of changes - Add ability to load secrets from environment variables. We can eventually remove the AWS SM code path here if nobody is using it -- we don't need to maintain three ways to load secrets. --- control_plane/attachment_service/src/main.rs | 27 +++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs index db4f00644f..5b952ae4fc 100644 --- a/control_plane/attachment_service/src/main.rs +++ b/control_plane/attachment_service/src/main.rs @@ -79,13 +79,38 @@ impl Secrets { "neon-storage-controller-control-plane-jwt-token"; const PUBLIC_KEY_SECRET: &'static str = "neon-storage-controller-public-key"; + const DATABASE_URL_ENV: &'static str = "DATABASE_URL"; + const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN"; + const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN"; + const PUBLIC_KEY_ENV: &'static str = "PUBLIC_KEY"; + + /// Load secrets from, in order of preference: + /// - CLI args if database URL is provided on the CLI + /// - Environment variables if DATABASE_URL is set. 
+ /// - AWS Secrets Manager secrets async fn load(args: &Cli) -> anyhow::Result { match &args.database_url { Some(url) => Self::load_cli(url, args), - None => Self::load_aws_sm().await, + None => match std::env::var(Self::DATABASE_URL_ENV) { + Ok(database_url) => Self::load_env(database_url), + Err(_) => Self::load_aws_sm().await, + }, } } + fn load_env(database_url: String) -> anyhow::Result { + let public_key = match std::env::var(Self::PUBLIC_KEY_ENV) { + Ok(public_key) => Some(JwtAuth::from_key(public_key).context("Loading public key")?), + Err(_) => None, + }; + Ok(Self { + database_url, + public_key, + jwt_token: std::env::var(Self::PAGESERVER_JWT_TOKEN_ENV).ok(), + control_plane_jwt_token: std::env::var(Self::CONTROL_PLANE_JWT_TOKEN_ENV).ok(), + }) + } + async fn load_aws_sm() -> anyhow::Result { let Ok(region) = std::env::var("AWS_REGION") else { anyhow::bail!("AWS_REGION is not set, cannot load secrets automatically: either set this, or use CLI args to supply secrets"); From 4d426f6fbe596a12c19b86bbf43313e3452ac73b Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 29 Feb 2024 13:26:29 +0200 Subject: [PATCH 0302/1571] feat: support lazy, queued tenant attaches (#6907) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add off-by-default support for lazy queued tenant activation on attach. This should be useful on bulk migrations as some tenants will be activated faster due to operations or endpoint startup. Eventually all tenants will get activated by reusing the same mechanism we have at startup (`PageserverConf::concurrent_tenant_warmup`). The difference to lazy attached tenants to startup ones is that we leave their initial logical size calculation be triggered by WalReceiver or consumption metrics. Fixes: #6315 Co-authored-by: Arpad Müller --- pageserver/src/config.rs | 6 +- pageserver/src/http/openapi_spec.yml | 6 + pageserver/src/http/routes.rs | 25 ++- pageserver/src/tenant.rs | 68 ++++---- pageserver/src/tenant/delete.rs | 2 +- pageserver/src/tenant/mgr.rs | 12 +- test_runner/fixtures/pageserver/http.py | 9 +- test_runner/regress/test_timeline_size.py | 200 ++++++++++++++++++++-- 8 files changed, 255 insertions(+), 73 deletions(-) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index d18b8d6885..0a7172bde2 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -212,9 +212,9 @@ pub struct PageServerConf { pub log_format: LogFormat, - /// Number of tenants which will be concurrently loaded from remote storage proactively on startup, - /// does not limit tenants loaded in response to client I/O. A lower value implicitly deprioritizes - /// loading such tenants, vs. other work in the system. + /// Number of tenants which will be concurrently loaded from remote storage proactively on startup or attach. + /// + /// A lower value implicitly deprioritizes loading such tenants, vs. other work in the system. pub concurrent_tenant_warmup: ConfigurableSemaphore, /// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed. diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 5afb3ba63d..19b5fb7e79 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -579,6 +579,12 @@ paths: required: false schema: type: integer + - name: lazy + in: query + required: false + schema: + type: boolean + description: Set to true for attaches to queue up until activated by compute. 
Eager (false) is the default. put: description: | Configures a _tenant location_, that is how a particular pageserver handles diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 12bd21fd7b..9d92fbaee0 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -816,13 +816,7 @@ async fn tenant_attach_handler( let tenant = state .tenant_manager - .upsert_location( - tenant_shard_id, - location_conf, - None, - SpawnMode::Normal, - &ctx, - ) + .upsert_location(tenant_shard_id, location_conf, None, SpawnMode::Eager, &ctx) .await?; let Some(tenant) = tenant else { @@ -1418,6 +1412,7 @@ async fn put_tenant_location_config_handler( let request_data: TenantLocationConfigRequest = json_request(&mut request).await?; let flush = parse_query_param(&request, "flush_ms")?.map(Duration::from_millis); + let lazy = parse_query_param(&request, "lazy")?.unwrap_or(false); check_permission(&request, Some(tenant_shard_id.tenant_id))?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); @@ -1448,15 +1443,17 @@ async fn put_tenant_location_config_handler( let location_conf = LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?; + // lazy==true queues up for activation or jumps the queue like normal when a compute connects, + // similar to at startup ordering. + let spawn_mode = if lazy { + tenant::SpawnMode::Lazy + } else { + tenant::SpawnMode::Eager + }; + let attached = state .tenant_manager - .upsert_location( - tenant_shard_id, - location_conf, - flush, - tenant::SpawnMode::Normal, - &ctx, - ) + .upsert_location(tenant_shard_id, location_conf, flush, spawn_mode, &ctx) .await? .is_some(); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 6a63a2adeb..f027e9d4b1 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -227,7 +227,11 @@ pub(crate) struct TenantPreload { /// When we spawn a tenant, there is a special mode for tenant creation that /// avoids trying to read anything from remote storage. pub(crate) enum SpawnMode { - Normal, + /// Activate as soon as possible + Eager, + /// Lazy activation in the background, with the option to skip the queue if the need comes up + Lazy, + /// Tenant has been created during the lifetime of this process Create, } @@ -700,41 +704,37 @@ impl Tenant { .and_then(|x| x.initial_tenant_load_remote.take()); enum AttachType<'a> { - // During pageserver startup, we are attaching this tenant lazily in the background - Warmup(tokio::sync::SemaphorePermit<'a>), - // During pageserver startup, we are attaching this tenant as soon as we can, - // because a client tried to access it. + /// We are attaching this tenant lazily in the background. + Warmup { + _permit: tokio::sync::SemaphorePermit<'a>, + during_startup: bool + }, + /// We are attaching this tenant as soon as we can, because for example an + /// endpoint tried to access it. OnDemand, - // During normal operations after startup, we are attaching a tenant. + /// During normal operations after startup, we are attaching a tenant, and + /// eager attach was requested. Normal, } - // Before doing any I/O, wait for either or: - // - A client to attempt to access to this tenant (on-demand loading) - // - A permit to become available in the warmup semaphore (background warmup) - // - // Some-ness of init_order is how we know if we're attaching during startup or later - // in process lifetime. 
- let attach_type = if init_order.is_some() { + let attach_type = if matches!(mode, SpawnMode::Lazy) { + // Before doing any I/O, wait for at least one of: + // - A client attempting to access to this tenant (on-demand loading) + // - A permit becoming available in the warmup semaphore (background warmup) + tokio::select!( - _ = tenant_clone.activate_now_sem.acquire() => { + permit = tenant_clone.activate_now_sem.acquire() => { + let _ = permit.expect("activate_now_sem is never closed"); tracing::info!("Activating tenant (on-demand)"); AttachType::OnDemand }, - permit_result = conf.concurrent_tenant_warmup.inner().acquire() => { - match permit_result { - Ok(p) => { - tracing::info!("Activating tenant (warmup)"); - AttachType::Warmup(p) - } - Err(_) => { - // This is unexpected: the warmup semaphore should stay alive - // for the lifetime of init_order. Log a warning and proceed. - tracing::warn!("warmup_limit semaphore unexpectedly closed"); - AttachType::Normal - } + permit = conf.concurrent_tenant_warmup.inner().acquire() => { + let _permit = permit.expect("concurrent_tenant_warmup semaphore is never closed"); + tracing::info!("Activating tenant (warmup)"); + AttachType::Warmup { + _permit, + during_startup: init_order.is_some() } - } _ = tenant_clone.cancel.cancelled() => { // This is safe, but should be pretty rare: it is interesting if a tenant @@ -749,6 +749,8 @@ impl Tenant { }, ) } else { + // SpawnMode::{Create,Eager} always cause jumping ahead of the + // concurrent_tenant_warmup queue AttachType::Normal }; @@ -756,7 +758,7 @@ impl Tenant { (SpawnMode::Create, _) => { None }, - (SpawnMode::Normal, Some(remote_storage)) => { + (SpawnMode::Eager | SpawnMode::Lazy, Some(remote_storage)) => { let _preload_timer = TENANT.preload.start_timer(); let res = tenant_clone .preload(remote_storage, task_mgr::shutdown_token()) @@ -769,7 +771,7 @@ impl Tenant { } } } - (SpawnMode::Normal, None) => { + (_, None) => { let _preload_timer = TENANT.preload.start_timer(); None } @@ -828,7 +830,7 @@ impl Tenant { let attached = { let _attach_timer = match mode { SpawnMode::Create => None, - SpawnMode::Normal => {Some(TENANT.attach.start_timer())} + SpawnMode::Eager | SpawnMode::Lazy => Some(TENANT.attach.start_timer()), }; tenant_clone.attach(preload, mode, &ctx).await }; @@ -850,7 +852,7 @@ impl Tenant { // It also prevents the warmup proccess competing with the concurrency limit on // logical size calculations: if logical size calculation semaphore is saturated, // then warmup will wait for that before proceeding to the next tenant. - if let AttachType::Warmup(_permit) = attach_type { + if matches!(attach_type, AttachType::Warmup { during_startup: true, .. 
}) { let mut futs: FuturesUnordered<_> = tenant_clone.timelines.lock().unwrap().values().cloned().map(|t| t.await_initial_logical_size()).collect(); tracing::info!("Waiting for initial logical sizes while warming up..."); while futs.next().await.is_some() {} @@ -923,7 +925,7 @@ impl Tenant { deleting: false, timelines: HashMap::new(), }, - (None, SpawnMode::Normal) => { + (None, _) => { anyhow::bail!("local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624"); } }; @@ -3769,7 +3771,7 @@ pub(crate) mod harness { let preload = tenant .preload(&self.remote_storage, CancellationToken::new()) .await?; - tenant.attach(Some(preload), SpawnMode::Normal, ctx).await?; + tenant.attach(Some(preload), SpawnMode::Eager, ctx).await?; tenant.state.send_replace(TenantState::Active); for timeline in tenant.timelines.lock().unwrap().values() { diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index 3d138da7af..ffb7206b1e 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -420,7 +420,7 @@ impl DeleteTenantFlow { .expect("cant be stopping or broken"); tenant - .attach(preload, super::SpawnMode::Normal, ctx) + .attach(preload, super::SpawnMode::Eager, ctx) .await .context("attach")?; diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 8f0f73d4b5..805d44f93d 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -595,7 +595,7 @@ pub async fn init_tenant_mgr( shard_identity, Some(init_order.clone()), &TENANTS, - SpawnMode::Normal, + SpawnMode::Lazy, &ctx, ) { Ok(tenant) => { @@ -1106,9 +1106,9 @@ impl TenantManager { // Edge case: if we were called with SpawnMode::Create, but a Tenant already existed, then // the caller thinks they're creating but the tenant already existed. We must switch to - // Normal mode so that when starting this Tenant we properly probe remote storage for timelines, + // Eager mode so that when starting this Tenant we properly probe remote storage for timelines, // rather than assuming it to be empty. 
- spawn_mode = SpawnMode::Normal; + spawn_mode = SpawnMode::Eager; } Some(TenantSlot::Secondary(state)) => { info!("Shutting down secondary tenant"); @@ -1300,7 +1300,7 @@ impl TenantManager { shard_identity, None, self.tenants, - SpawnMode::Normal, + SpawnMode::Eager, ctx, )?; @@ -1521,7 +1521,7 @@ impl TenantManager { *child_shard, child_location_conf, None, - SpawnMode::Normal, + SpawnMode::Eager, ctx, ) .await?; @@ -2064,7 +2064,7 @@ pub(crate) async fn load_tenant( shard_identity, None, &TENANTS, - SpawnMode::Normal, + SpawnMode::Eager, ctx, ) .with_context(|| format!("Failed to schedule tenant processing in path {tenant_path:?}"))?; diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index ad3efb5837..b8e20c451f 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -286,7 +286,11 @@ class PageserverHttpClient(requests.Session, MetricsGetter): self.verbose_error(res) def tenant_location_conf( - self, tenant_id: Union[TenantId, TenantShardId], location_conf=dict[str, Any], flush_ms=None + self, + tenant_id: Union[TenantId, TenantShardId], + location_conf=dict[str, Any], + flush_ms=None, + lazy: Optional[bool] = None, ): body = location_conf.copy() body["tenant_id"] = str(tenant_id) @@ -295,6 +299,9 @@ class PageserverHttpClient(requests.Session, MetricsGetter): if flush_ms is not None: params["flush_ms"] = str(flush_ms) + if lazy is not None: + params["lazy"] = "true" if lazy else "false" + res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/location_config", json=body, diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 327e5abe26..cbf7059c92 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -2,6 +2,7 @@ import concurrent.futures import math import random import time +from collections import defaultdict from contextlib import closing from pathlib import Path from typing import Optional @@ -14,6 +15,7 @@ from fixtures.neon_fixtures import ( Endpoint, NeonEnv, NeonEnvBuilder, + NeonPageserver, PgBin, VanillaPostgres, wait_for_last_flush_lsn, @@ -839,22 +841,40 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): ) # Deleting a stuck tenant should prompt it to go active + # in some cases, it has already been activated because it's behind the detach + delete_lazy_activating(delete_tenant_id, env.pageserver, expect_attaching=False) + tenant_ids.remove(delete_tenant_id) + + # Check that all the stuck tenants proceed to active (apart from the one that deletes, and the one + # we detached) + wait_until(10, 1, all_active) + assert len(get_tenant_states()) == n_tenants - 2 + + +def delete_lazy_activating( + delete_tenant_id: TenantId, pageserver: NeonPageserver, expect_attaching: bool +): + pageserver_http = pageserver.http_client() + + # Deletion itself won't complete due to our failpoint: Tenant::shutdown can't complete while calculating + # logical size is paused in a failpoint. 
So instead we will use a log observation to check that + # on-demand activation was triggered by the tenant deletion + log_match = f".*attach{{tenant_id={delete_tenant_id} shard_id=0000 gen=[0-9a-f]+}}: Activating tenant \\(on-demand\\).*" + + if expect_attaching: + assert pageserver_http.tenant_status(delete_tenant_id)["state"]["slug"] == "Attaching" + with concurrent.futures.ThreadPoolExecutor() as executor: log.info("Starting background delete") + def activated_on_demand(): + assert pageserver.log_contains(log_match) is not None + def delete_tenant(): - env.pageserver.http_client().tenant_delete(delete_tenant_id) + pageserver_http.tenant_delete(delete_tenant_id) background_delete = executor.submit(delete_tenant) - # Deletion itself won't complete due to our failpoint: Tenant::shutdown can't complete while calculating - # logical size is paused in a failpoint. So instead we will use a log observation to check that - # on-demand activation was triggered by the tenant deletion - log_match = f".*attach{{tenant_id={delete_tenant_id} shard_id=0000 gen=[0-9a-f]+}}: Activating tenant \\(on-demand\\).*" - - def activated_on_demand(): - assert env.pageserver.log_contains(log_match) is not None - log.info(f"Waiting for activation message '{log_match}'") try: wait_until(10, 1, activated_on_demand) @@ -868,12 +888,6 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): # Poll for deletion to complete wait_tenant_status_404(pageserver_http, tenant_id=delete_tenant_id, iterations=40) - tenant_ids.remove(delete_tenant_id) - - # Check that all the stuck tenants proceed to active (apart from the one that deletes, and the one - # we detached) - wait_until(10, 1, all_active) - assert len(get_tenant_states()) == n_tenants - 2 def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder): @@ -939,3 +953,159 @@ def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder): client.configure_failpoints( [("initial-size-calculation-permit-pause", "off"), ("walreceiver-after-ingest", "off")] ) + + +def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder): + neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'" + + env = neon_env_builder.init_start() + + # the supporting_second does nothing except queue behind env.initial_tenant + # for purposes of showing that eager_tenant breezes past the queue + supporting_second, _ = env.neon_cli.create_tenant() + eager_tenant, _ = env.neon_cli.create_tenant() + + client = env.pageserver.http_client() + client.tenant_location_conf( + eager_tenant, + { + "mode": "Detached", + "secondary_conf": None, + "tenant_conf": {}, + "generation": None, + }, + ) + + env.pageserver.stop() + + # pause at logical size calculation, also pause before walreceiver can give feedback so it will give priority to logical size calculation + env.pageserver.start( + extra_env_vars={ + "FAILPOINTS": "timeline-calculate-logical-size-pause=pause;walreceiver-after-ingest=pause" + } + ) + + tenant_ids = [env.initial_tenant, supporting_second] + + def get_tenant_states() -> dict[str, list[TenantId]]: + states = defaultdict(list) + for id in tenant_ids: + state = client.tenant_status(id)["state"]["slug"] + states[state].append(id) + return dict(states) + + def one_is_active(): + states = get_tenant_states() + log.info(f"{states}") + assert len(states["Active"]) == 1 + + wait_until(10, 1, one_is_active) + + def other_is_attaching(): + states = get_tenant_states() + assert len(states["Attaching"]) == 1 + + wait_until(10, 1, 
other_is_attaching) + + def eager_tenant_is_active(): + resp = client.tenant_status(eager_tenant) + assert resp["state"]["slug"] == "Active" + + gen = env.attachment_service.attach_hook_issue(eager_tenant, env.pageserver.id) + client.tenant_location_conf( + eager_tenant, + { + "mode": "AttachedSingle", + "secondary_conf": None, + "tenant_conf": {}, + "generation": gen, + }, + lazy=False, + ) + wait_until(10, 1, eager_tenant_is_active) + + other_is_attaching() + + client.configure_failpoints( + [("timeline-calculate-logical-size-pause", "off"), ("walreceiver-after-ingest", "off")] + ) + + +@pytest.mark.parametrize("activation_method", ["endpoint", "branch", "delete"]) +def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_method: str): + # env.initial_tenant will take up this permit when attaching with lazy because of a failpoint activated after restart + neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'" + + env = neon_env_builder.init_start() + + # because this returns (also elsewhere in this file), we know that SpawnMode::Create skips the queue + lazy_tenant, _ = env.neon_cli.create_tenant() + + client = env.pageserver.http_client() + client.tenant_location_conf( + lazy_tenant, + { + "mode": "Detached", + "secondary_conf": None, + "tenant_conf": {}, + "generation": None, + }, + ) + + env.pageserver.stop() + + # pause at logical size calculation, also pause before walreceiver can give feedback so it will give priority to logical size calculation + env.pageserver.start( + extra_env_vars={ + "FAILPOINTS": "timeline-calculate-logical-size-pause=pause;walreceiver-after-ingest=pause" + } + ) + + def initial_tenant_is_active(): + resp = client.tenant_status(env.initial_tenant) + assert resp["state"]["slug"] == "Active" + + wait_until(10, 1, initial_tenant_is_active) + + # even though the initial tenant is now active, because it was startup time + # attach, it will consume the only permit because logical size calculation + # is paused. + + gen = env.attachment_service.attach_hook_issue(lazy_tenant, env.pageserver.id) + client.tenant_location_conf( + lazy_tenant, + { + "mode": "AttachedSingle", + "secondary_conf": None, + "tenant_conf": {}, + "generation": gen, + }, + lazy=True, + ) + + def lazy_tenant_is_attaching(): + resp = client.tenant_status(lazy_tenant) + assert resp["state"]["slug"] == "Attaching" + + # paused logical size calculation of env.initial_tenant is keeping it attaching + wait_until(10, 1, lazy_tenant_is_attaching) + + for _ in range(5): + lazy_tenant_is_attaching() + time.sleep(0.5) + + def lazy_tenant_is_active(): + resp = client.tenant_status(lazy_tenant) + assert resp["state"]["slug"] == "Active" + + if activation_method == "endpoint": + with env.endpoints.create_start("main", tenant_id=lazy_tenant): + # starting up the endpoint should make it jump the queue + wait_until(10, 1, lazy_tenant_is_active) + elif activation_method == "branch": + env.neon_cli.create_timeline("second_branch", lazy_tenant) + wait_until(10, 1, lazy_tenant_is_active) + elif activation_method == "delete": + delete_lazy_activating(lazy_tenant, env.pageserver, expect_attaching=True) + else: + raise RuntimeError(activation_method) From 3eb83a0ebbae56acad54190bc71085c7b424fb13 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 29 Feb 2024 15:54:58 +0200 Subject: [PATCH 0303/1571] Provide appoximation of working set using hyper-log-log algorithm in LFC (#6935) ## Summary of changes Calculate number of unique page accesses at compute. 
It can be used to estimate working set size and adjust cache size (shared_buffers or local file cache). Approximation is made using HyperLogLog algorithm. It is performed by local file cache and so is available only when local file cache is enabled. This calculation doesn't take in account access to the pages present in shared buffers, but includes pages available in local file cache. This information can be retrieved using approximate_working_set_size(reset bool) function from neon extension. reset parameter can be used to reset statistic and so collect unique accesses for the particular interval. Below is an example of estimating working set size after pgbench -c 10 -S -T 100 -s 10: ``` postgres=# select approximate_working_set_size(false); approximate_working_set_size ------------------------------ 19052 (1 row) postgres=# select pg_table_size('pgbench_accounts')/8192; ?column? ---------- 16402 (1 row) ``` ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik --- pgxn/neon/Makefile | 2 +- pgxn/neon/file_cache.c | 36 ++++++++++++++++++++++ pgxn/neon/neon--1.2--1.3.sql | 9 ++++++ pgxn/neon/neon.control | 2 +- test_runner/regress/test_neon_extension.py | 2 +- 5 files changed, 48 insertions(+), 3 deletions(-) create mode 100644 pgxn/neon/neon--1.2--1.3.sql diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index ef0a79a50c..7ea767ec74 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -21,7 +21,7 @@ SHLIB_LINK_INTERNAL = $(libpq) SHLIB_LINK = -lcurl EXTENSION = neon -DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql +DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql PGFILEDESC = "neon - cloud storage for PostgreSQL" EXTRA_CLEAN = \ diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 11d6f6aec5..25275ef31f 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -25,6 +25,8 @@ #include "funcapi.h" #include "miscadmin.h" #include "pagestore_client.h" +#include "common/hashfn.h" +#include "lib/hyperloglog.h" #include "pgstat.h" #include "postmaster/bgworker.h" #include RELFILEINFO_HDR @@ -60,6 +62,7 @@ #define BLOCKS_PER_CHUNK 128 /* 1Mb chunk */ #define MB ((uint64)1024*1024) +#define HYPER_LOG_LOG_BIT_WIDTH 10 #define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK)) typedef struct FileCacheEntry @@ -84,6 +87,8 @@ typedef struct FileCacheControl uint64 writes; dlist_head lru; /* double linked list for LRU replacement * algorithm */ + hyperLogLogState wss_estimation; /* estimation of wroking set size */ + uint8_t hyperloglog_hashes[(1 << HYPER_LOG_LOG_BIT_WIDTH) + 1]; } FileCacheControl; static HTAB *lfc_hash; @@ -232,6 +237,14 @@ lfc_shmem_startup(void) lfc_ctl->writes = 0; dlist_init(&lfc_ctl->lru); + /* Initialize hyper-log-log structure for estimating working set size */ + initHyperLogLog(&lfc_ctl->wss_estimation, HYPER_LOG_LOG_BIT_WIDTH); + + /* We need hashes in shared memory */ + pfree(lfc_ctl->wss_estimation.hashesArr); + memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes); + 
lfc_ctl->wss_estimation.hashesArr = lfc_ctl->hyperloglog_hashes; + /* Recreate file cache on restart */ fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC); if (fd < 0) @@ -529,6 +542,11 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, } entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); + + /* Approximate working set */ + tag.blockNum = blkno; + addHyperLogLog(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); + if (entry == NULL || (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) == 0) { /* Page is not cached */ @@ -967,3 +985,21 @@ local_cache_pages(PG_FUNCTION_ARGS) else SRF_RETURN_DONE(funcctx); } + +PG_FUNCTION_INFO_V1(approximate_working_set_size); + +Datum +approximate_working_set_size(PG_FUNCTION_ARGS) +{ + int32 dc = -1; + if (lfc_size_limit != 0) + { + bool reset = PG_GETARG_BOOL(0); + LWLockAcquire(lfc_lock, reset ? LW_EXCLUSIVE : LW_SHARED); + dc = (int32) estimateHyperLogLog(&lfc_ctl->wss_estimation); + if (reset) + memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes); + LWLockRelease(lfc_lock); + } + PG_RETURN_INT32(dc); +} diff --git a/pgxn/neon/neon--1.2--1.3.sql b/pgxn/neon/neon--1.2--1.3.sql new file mode 100644 index 0000000000..9583008777 --- /dev/null +++ b/pgxn/neon/neon--1.2--1.3.sql @@ -0,0 +1,9 @@ +\echo Use "ALTER EXTENSION neon UPDATE TO '1.3'" to load this file. \quit + +CREATE FUNCTION approximate_working_set_size(reset bool) +RETURNS integer +AS 'MODULE_PATHNAME', 'approximate_working_set_size' +LANGUAGE C PARALLEL SAFE; + +GRANT EXECUTE ON FUNCTION approximate_working_set_size(bool) TO pg_monitor; + diff --git a/pgxn/neon/neon.control b/pgxn/neon/neon.control index 599b54b2ff..cee2f336f2 100644 --- a/pgxn/neon/neon.control +++ b/pgxn/neon/neon.control @@ -1,6 +1,6 @@ # neon extension comment = 'cloud storage for PostgreSQL' -default_version = '1.2' +default_version = '1.3' module_pathname = '$libdir/neon' relocatable = true trusted = true diff --git a/test_runner/regress/test_neon_extension.py b/test_runner/regress/test_neon_extension.py index 672f2b495d..1179a3afe9 100644 --- a/test_runner/regress/test_neon_extension.py +++ b/test_runner/regress/test_neon_extension.py @@ -23,7 +23,7 @@ def test_neon_extension(neon_env_builder: NeonEnvBuilder): # IMPORTANT: # If the version has changed, the test should be updated. # Ensure that the default version is also updated in the neon.control file - assert cur.fetchone() == ("1.2",) + assert cur.fetchone() == ("1.3",) cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE") res = cur.fetchall() log.info(res) From 5984edaecd9c1914fb88f17fcffaeeb7e1d3b1ca Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 29 Feb 2024 13:55:38 +0000 Subject: [PATCH 0304/1571] libs: fix expired token in auth decode test (#6963) The test token expired earlier today (1709200879). I regenerated the token, but without an expiration date this time. 
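For reference, a non-expiring test token like this can be regenerated offline with the `jsonwebtoken` crate (whose `DecodingKey` already appears in this test). The sketch below is illustrative only: the private-key file name, the `Claims` struct, and the `main` wrapper are assumptions, not code from this repository; the claim values are the ones shown in the test comment.

```
// Sketch: sign a test JWT with EdDSA and no `exp` claim, so it never expires.
// Assumes an Ed25519 private key in PKCS#8 PEM form that pairs with the
// public key checked in the test.
use jsonwebtoken::{encode, Algorithm, EncodingKey, Header};
use serde::Serialize;

#[derive(Serialize)]
struct Claims<'a> {
    scope: &'a str,
    tenant_id: &'a str,
    iss: &'a str,
    iat: u64,
    // deliberately no `exp` field
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let priv_pem = std::fs::read("test_ed25519_private.pem")?; // placeholder path

    let claims = Claims {
        scope: "tenant",
        tenant_id: "3d1f7595b468230304e0b73cecbcb081",
        iss: "neon.controlplane",
        iat: 1678442479,
    };

    let token = encode(
        &Header::new(Algorithm::EdDSA),
        &claims,
        &EncodingKey::from_ed_pem(&priv_pem)?,
    )?;
    println!("{token}");
    Ok(())
}
```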
--- libs/utils/src/auth.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index 51ab238d77..fbf0dff665 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -206,12 +206,11 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH // "scope": "tenant", // "tenant_id": "3d1f7595b468230304e0b73cecbcb081", // "iss": "neon.controlplane", - // "exp": 1709200879, // "iat": 1678442479 // } // ``` // - let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.U3eA8j-uU-JnhzeO3EDHRuXLwkAUFCPxtGHEgw6p7Ccc3YRbFs2tmCdbD9PZEXP-XsxSeBQi1FY0YPcT3NXADw"; + let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJpYXQiOjE2Nzg0NDI0Nzl9.rNheBnluMJNgXzSTTJoTNIGy4P_qe0JUHl_nVEGuDCTgHOThPVr552EnmKccrCKquPeW3c2YUk0Y9Oh4KyASAw"; // Check it can be validated with the public key let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]); From 76ab57f33f88ea44de76a4da97cd877ae8acfcc7 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Thu, 29 Feb 2024 13:51:15 -0500 Subject: [PATCH 0305/1571] test: disable test_superuser on pg15 (#6972) ref https://github.com/neondatabase/neon/issues/6969 Signed-off-by: Alex Chi Z --- test_runner/regress/test_neon_superuser.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test_runner/regress/test_neon_superuser.py b/test_runner/regress/test_neon_superuser.py index ca8ada4ddb..e0364dd13f 100644 --- a/test_runner/regress/test_neon_superuser.py +++ b/test_runner/regress/test_neon_superuser.py @@ -1,9 +1,12 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv -from fixtures.pg_version import PgVersion +from fixtures.pg_version import PgVersion, skip_on_postgres from fixtures.utils import wait_until +@skip_on_postgres( + PgVersion.V15, reason="skip on pg15 due to https://github.com/neondatabase/neon/issues/6969" +) def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion): env = neon_simple_env env.neon_cli.create_branch("test_neon_superuser_publisher", "empty") From 502b69b33bbd4ad1b0647e921a9c665249a2cd62 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 29 Feb 2024 20:50:23 +0100 Subject: [PATCH 0306/1571] refactor(compaction): `RequestContext` shouldn't be `Clone`, only `RequestContextAdaptor` uses it (#6961) Extracted from https://github.com/neondatabase/neon/pull/6953 Part of https://github.com/neondatabase/neon/issues/5899 --- pageserver/src/tenant/timeline/compaction.rs | 34 ++++++-------------- 1 file changed, 10 insertions(+), 24 deletions(-) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 950459cbf9..914e3948ef 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -75,14 +75,13 @@ impl Timeline { let keyspace = self.collect_keyspace(end_lsn, ctx).await?; let mut adaptor = TimelineAdaptor::new(self, (end_lsn, keyspace)); - let ctx_adaptor = RequestContextAdaptor(ctx.clone()); pageserver_compaction::compact_tiered::compact_tiered( &mut adaptor, end_lsn, target_file_size, fanout, - &ctx_adaptor, + ctx, ) .await?; @@ -143,13 +142,13 @@ impl CompactionJobExecutor for TimelineAdaptor { 
type DeltaLayer = ResidentDeltaLayer; type ImageLayer = ResidentImageLayer; - type RequestContext = RequestContextAdaptor; + type RequestContext = crate::context::RequestContext; async fn get_layers( &mut self, key_range: &Range, lsn_range: &Range, - _ctx: &RequestContextAdaptor, + _ctx: &RequestContext, ) -> anyhow::Result>> { self.flush_updates().await?; @@ -170,7 +169,7 @@ impl CompactionJobExecutor for TimelineAdaptor { &mut self, key_range: &Range, lsn: Lsn, - _ctx: &RequestContextAdaptor, + _ctx: &RequestContext, ) -> anyhow::Result>> { if lsn == self.keyspace.0 { Ok(pageserver_compaction::helpers::intersect_keyspace( @@ -206,7 +205,7 @@ impl CompactionJobExecutor for TimelineAdaptor { &mut self, lsn: Lsn, key_range: &Range, - ctx: &RequestContextAdaptor, + ctx: &RequestContext, ) -> anyhow::Result<()> { Ok(self.create_image_impl(lsn, key_range, ctx).await?) } @@ -216,7 +215,7 @@ impl CompactionJobExecutor for TimelineAdaptor { lsn_range: &Range, key_range: &Range, input_layers: &[ResidentDeltaLayer], - ctx: &RequestContextAdaptor, + ctx: &RequestContext, ) -> anyhow::Result<()> { debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end); @@ -287,7 +286,7 @@ impl CompactionJobExecutor for TimelineAdaptor { async fn delete_layer( &mut self, layer: &OwnArc, - _ctx: &RequestContextAdaptor, + _ctx: &RequestContext, ) -> anyhow::Result<()> { self.layers_to_delete.push(layer.clone().0); Ok(()) @@ -299,7 +298,7 @@ impl TimelineAdaptor { &mut self, lsn: Lsn, key_range: &Range, - ctx: &RequestContextAdaptor, + ctx: &RequestContext, ) -> Result<(), PageReconstructError> { let timer = self.timeline.metrics.create_images_time_histo.start_timer(); @@ -361,17 +360,7 @@ impl TimelineAdaptor { } } -pub struct RequestContextAdaptor(pub RequestContext); - -impl std::ops::Deref for RequestContextAdaptor { - type Target = RequestContext; - - fn deref(&self) -> &Self::Target { - &self.0 - } -} - -impl CompactionRequestContext for RequestContextAdaptor {} +impl CompactionRequestContext for crate::context::RequestContext {} #[derive(Debug, Clone)] pub struct OwnArc(pub Arc); @@ -449,10 +438,7 @@ impl CompactionLayer for ResidentDeltaLayer { impl CompactionDeltaLayer for ResidentDeltaLayer { type DeltaEntry<'a> = DeltaEntry<'a>; - async fn load_keys<'a>( - &self, - ctx: &RequestContextAdaptor, - ) -> anyhow::Result>> { + async fn load_keys<'a>(&self, ctx: &RequestContext) -> anyhow::Result>> { self.0.load_keys(ctx).await } } From ee93700a0fe5548c391ba8da5f10d5841c8911db Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 29 Feb 2024 22:54:16 +0200 Subject: [PATCH 0307/1571] dube: timeout individual layer evictions, log progress and record metrics (#6131) Because of bugs evictions could hang and pause disk usage eviction task. One such bug is known and fixed #6928. Guard each layer eviction with a modest timeout deeming timeouted evictions as failures, to be conservative. In addition, add logging and metrics recording on each eviction iteration: - log collection completed with duration and amount of layers - per tenant collection time is observed in a new histogram - per tenant layer count is observed in a new histogram - record metric for collected, selected and evicted layer counts - log if eviction takes more than 10s - log eviction completion with eviction duration Additionally remove dead code for which no dead code warnings appeared in earlier PR. Follow-up to: #6060. 
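The guarding boils down to two nested deadlines: a short hard timeout per layer eviction (a timed-out eviction is counted as a failure), and a soft threshold for the whole pass that only logs progress and keeps waiting. Below is a standalone sketch of that pattern with placeholder durations and a stand-in `evict_one`, not the pageserver's real types:

```
// Illustration of the timeout pattern: hard per-item deadline, soft deadline
// for the whole pass that logs and then keeps waiting for completion.
use std::time::{Duration, Instant};

// stand-in for a single layer eviction returning the freed bytes
async fn evict_one(layer_id: u64) -> Result<u64, &'static str> {
    tokio::time::sleep(Duration::from_millis(10)).await;
    Ok((layer_id + 1) * 1024)
}

async fn evict_with_deadline(layer_id: u64) -> Result<u64, &'static str> {
    // short per-layer timeout: LRU ordering goes stale quickly, and one hung
    // eviction must not pause the whole disk-usage-based eviction iteration
    match tokio::time::timeout(Duration::from_secs(5), evict_one(layer_id)).await {
        Ok(res) => res,
        Err(_elapsed) => Err("eviction timed out; counted as failed"),
    }
}

#[tokio::main]
async fn main() {
    let started_at = Instant::now();
    let pass = async {
        let mut freed = 0u64;
        for id in 0..4 {
            if let Ok(bytes) = evict_with_deadline(id).await {
                freed += bytes;
            }
        }
        freed
    };
    tokio::pin!(pass);

    // warn once if the whole pass runs longer than expected, but keep waiting
    let res = tokio::time::timeout(Duration::from_secs(10), &mut pass).await;
    let freed = match res {
        Ok(freed) => freed,
        Err(_) => {
            eprintln!("evictions still ongoing after {:?}", started_at.elapsed());
            pass.await
        }
    };
    eprintln!("freed {freed} bytes in {:?}", started_at.elapsed());
}
```

Counting a timeout as a failed eviction, rather than assuming the bytes were freed, is the conservative choice that keeps the usage accounting from drifting optimistic.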
--- pageserver/src/disk_usage_eviction_task.rs | 145 ++++++++--- pageserver/src/metrics.rs | 59 +++++ pageserver/src/tenant/secondary.rs | 89 ++++--- pageserver/src/tenant/storage_layer.rs | 2 +- pageserver/src/tenant/storage_layer/layer.rs | 35 ++- .../src/tenant/storage_layer/layer/tests.rs | 232 +++++++++++++++++- pageserver/src/tenant/timeline.rs | 21 +- .../src/tenant/timeline/eviction_task.rs | 13 +- 8 files changed, 492 insertions(+), 104 deletions(-) diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index b1c6f35704..92c1475aef 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -58,6 +58,7 @@ use utils::{completion, id::TimelineId}; use crate::{ config::PageServerConf, + metrics::disk_usage_based_eviction::METRICS, task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, tenant::{ self, @@ -65,7 +66,6 @@ use crate::{ remote_timeline_client::LayerFileMetadata, secondary::SecondaryTenant, storage_layer::{AsLayerDesc, EvictionError, Layer, LayerFileName}, - Timeline, }, }; @@ -409,13 +409,23 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( "running disk usage based eviction due to pressure" ); - let candidates = + let (candidates, collection_time) = { + let started_at = std::time::Instant::now(); match collect_eviction_candidates(tenant_manager, eviction_order, cancel).await? { EvictionCandidates::Cancelled => { return Ok(IterationOutcome::Cancelled); } - EvictionCandidates::Finished(partitioned) => partitioned, - }; + EvictionCandidates::Finished(partitioned) => (partitioned, started_at.elapsed()), + } + }; + + METRICS.layers_collected.inc_by(candidates.len() as u64); + + tracing::info!( + elapsed_ms = collection_time.as_millis(), + total_layers = candidates.len(), + "collection completed" + ); // Debug-log the list of candidates let now = SystemTime::now(); @@ -446,9 +456,10 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( // the tenant's min-resident-size threshold, print a warning, and memorize the disk // usage at that point, in 'usage_planned_min_resident_size_respecting'. - let selection = select_victims(&candidates, usage_pre); + let (evicted_amount, usage_planned) = + select_victims(&candidates, usage_pre).into_amount_and_planned(); - let (evicted_amount, usage_planned) = selection.into_amount_and_planned(); + METRICS.layers_selected.inc_by(evicted_amount as u64); // phase2: evict layers @@ -477,9 +488,15 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( if let Some(next) = next { match next { Ok(Ok(file_size)) => { + METRICS.layers_evicted.inc(); usage_assumed.add_available_bytes(file_size); } - Ok(Err((file_size, EvictionError::NotFound | EvictionError::Downloaded))) => { + Ok(Err(( + file_size, + EvictionError::NotFound + | EvictionError::Downloaded + | EvictionError::Timeout, + ))) => { evictions_failed.file_sizes += file_size; evictions_failed.count += 1; } @@ -495,7 +512,10 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( // calling again when consumed_all is fine as evicted is fused. 
let Some((_partition, candidate)) = evicted.next() else { - consumed_all = true; + if !consumed_all { + tracing::info!("all evictions started, waiting"); + consumed_all = true; + } continue; }; @@ -503,11 +523,15 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( EvictionLayer::Attached(layer) => { let file_size = layer.layer_desc().file_size; js.spawn(async move { - layer - .evict_and_wait() - .await - .map(|()| file_size) - .map_err(|e| (file_size, e)) + // have a low eviction waiting timeout because our LRU calculations go stale fast; + // also individual layer evictions could hang because of bugs and we do not want to + // pause disk_usage_based_eviction for such. + let timeout = std::time::Duration::from_secs(5); + + match layer.evict_and_wait(timeout).await { + Ok(()) => Ok(file_size), + Err(e) => Err((file_size, e)), + } }); } EvictionLayer::Secondary(layer) => { @@ -529,6 +553,30 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( (usage_assumed, evictions_failed) }; + let started_at = std::time::Instant::now(); + + let evict_layers = async move { + let mut evict_layers = std::pin::pin!(evict_layers); + + let maximum_expected = std::time::Duration::from_secs(10); + + let res = tokio::time::timeout(maximum_expected, &mut evict_layers).await; + let tuple = if let Ok(tuple) = res { + tuple + } else { + let elapsed = started_at.elapsed(); + tracing::info!(elapsed_ms = elapsed.as_millis(), "still ongoing"); + evict_layers.await + }; + + let elapsed = started_at.elapsed(); + tracing::info!(elapsed_ms = elapsed.as_millis(), "completed"); + tuple + }; + + let evict_layers = + evict_layers.instrument(tracing::info_span!("evict_layers", layers=%evicted_amount)); + let (usage_assumed, evictions_failed) = tokio::select! { tuple = evict_layers => { tuple }, _ = cancel.cancelled() => { @@ -763,6 +811,8 @@ async fn collect_eviction_candidates( eviction_order: EvictionOrder, cancel: &CancellationToken, ) -> anyhow::Result { + const LOG_DURATION_THRESHOLD: std::time::Duration = std::time::Duration::from_secs(10); + // get a snapshot of the list of tenants let tenants = tenant::mgr::list_tenants() .await @@ -791,6 +841,8 @@ async fn collect_eviction_candidates( continue; } + let started_at = std::time::Instant::now(); + // collect layers from all timelines in this tenant // // If one of the timelines becomes `!is_active()` during the iteration, @@ -805,6 +857,7 @@ async fn collect_eviction_candidates( } let info = tl.get_local_layers_for_disk_usage_eviction().await; debug!(tenant_id=%tl.tenant_shard_id.tenant_id, shard_id=%tl.tenant_shard_id.shard_slug(), timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len()); + tenant_candidates.extend(info.resident_layers.into_iter()); max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0)); @@ -870,7 +923,25 @@ async fn collect_eviction_candidates( (partition, candidate) }); + METRICS + .tenant_layer_count + .observe(tenant_candidates.len() as f64); + candidates.extend(tenant_candidates); + + let elapsed = started_at.elapsed(); + METRICS + .tenant_collection_time + .observe(elapsed.as_secs_f64()); + + if elapsed > LOG_DURATION_THRESHOLD { + tracing::info!( + tenant_id=%tenant.tenant_shard_id().tenant_id, + shard_id=%tenant.tenant_shard_id().shard_slug(), + elapsed_ms = elapsed.as_millis(), + "collection took longer than threshold" + ); + } } // Note: the same tenant ID might be hit twice, if it transitions from attached to @@ -885,11 +956,11 @@ async fn collect_eviction_candidates( }, 
); - for secondary_tenant in secondary_tenants { + for tenant in secondary_tenants { // for secondary tenants we use a sum of on_disk layers and already evicted layers. this is // to prevent repeated disk usage based evictions from completely draining less often // updating secondaries. - let (mut layer_info, total_layers) = secondary_tenant.get_layers_for_eviction(); + let (mut layer_info, total_layers) = tenant.get_layers_for_eviction(); debug_assert!( total_layers >= layer_info.resident_layers.len(), @@ -897,6 +968,8 @@ async fn collect_eviction_candidates( layer_info.resident_layers.len() ); + let started_at = std::time::Instant::now(); + layer_info .resident_layers .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts)); @@ -918,9 +991,27 @@ async fn collect_eviction_candidates( ) }); + METRICS + .tenant_layer_count + .observe(tenant_candidates.len() as f64); candidates.extend(tenant_candidates); tokio::task::yield_now().await; + + let elapsed = started_at.elapsed(); + + METRICS + .tenant_collection_time + .observe(elapsed.as_secs_f64()); + + if elapsed > LOG_DURATION_THRESHOLD { + tracing::info!( + tenant_id=%tenant.tenant_shard_id().tenant_id, + shard_id=%tenant.tenant_shard_id().shard_slug(), + elapsed_ms = elapsed.as_millis(), + "collection took longer than threshold" + ); + } } debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below, @@ -997,30 +1088,6 @@ impl VictimSelection { } } -struct TimelineKey(Arc); - -impl PartialEq for TimelineKey { - fn eq(&self, other: &Self) -> bool { - Arc::ptr_eq(&self.0, &other.0) - } -} - -impl Eq for TimelineKey {} - -impl std::hash::Hash for TimelineKey { - fn hash(&self, state: &mut H) { - Arc::as_ptr(&self.0).hash(state); - } -} - -impl std::ops::Deref for TimelineKey { - type Target = Timeline; - - fn deref(&self) -> &Self::Target { - self.0.as_ref() - } -} - /// A totally ordered f32 subset we can use with sorting functions. 
pub(crate) mod finite_f32 { diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 1749e02c7f..1d894ed8a5 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -2474,6 +2474,64 @@ pub(crate) mod tenant_throttling { } } +pub(crate) mod disk_usage_based_eviction { + use super::*; + + pub(crate) struct Metrics { + pub(crate) tenant_collection_time: Histogram, + pub(crate) tenant_layer_count: Histogram, + pub(crate) layers_collected: IntCounter, + pub(crate) layers_selected: IntCounter, + pub(crate) layers_evicted: IntCounter, + } + + impl Default for Metrics { + fn default() -> Self { + let tenant_collection_time = register_histogram!( + "pageserver_disk_usage_based_eviction_tenant_collection_seconds", + "Time spent collecting layers from a tenant -- not normalized by collected layer amount", + vec![0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0] + ) + .unwrap(); + + let tenant_layer_count = register_histogram!( + "pageserver_disk_usage_based_eviction_tenant_collected_layers", + "Amount of layers gathered from a tenant", + vec![5.0, 50.0, 500.0, 5000.0, 50000.0] + ) + .unwrap(); + + let layers_collected = register_int_counter!( + "pageserver_disk_usage_based_eviction_collected_layers_total", + "Amount of layers collected" + ) + .unwrap(); + + let layers_selected = register_int_counter!( + "pageserver_disk_usage_based_eviction_select_layers_total", + "Amount of layers selected" + ) + .unwrap(); + + let layers_evicted = register_int_counter!( + "pageserver_disk_usage_based_eviction_evicted_layers_total", + "Amount of layers successfully evicted" + ) + .unwrap(); + + Self { + tenant_collection_time, + tenant_layer_count, + layers_collected, + layers_selected, + layers_evicted, + } + } + } + + pub(crate) static METRICS: Lazy = Lazy::new(Metrics::default); +} + pub fn preinitialize_metrics() { // Python tests need these and on some we do alerting. 
// @@ -2508,6 +2566,7 @@ pub fn preinitialize_metrics() { Lazy::force(&TENANT_MANAGER); Lazy::force(&crate::tenant::storage_layer::layer::LAYER_IMPL_METRICS); + Lazy::force(&disk_usage_based_eviction::METRICS); // countervecs [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT] diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index c466ac0c24..14e88b836e 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -32,7 +32,7 @@ use remote_storage::GenericRemoteStorage; use tokio_util::sync::CancellationToken; use tracing::instrument; -use utils::{completion::Barrier, fs_ext, id::TimelineId, sync::gate::Gate}; +use utils::{completion::Barrier, id::TimelineId, sync::gate::Gate}; enum DownloadCommand { Download(TenantShardId), @@ -121,6 +121,10 @@ impl SecondaryTenant { }) } + pub(crate) fn tenant_shard_id(&self) -> TenantShardId { + self.tenant_shard_id + } + pub(crate) async fn shutdown(&self) { self.cancel.cancel(); @@ -164,16 +168,17 @@ impl SecondaryTenant { self.detail.lock().unwrap().get_layers_for_eviction(self) } + /// Cancellation safe, but on cancellation the eviction will go through #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline_id, name=%name))] pub(crate) async fn evict_layer( - &self, + self: &Arc, conf: &PageServerConf, timeline_id: TimelineId, name: LayerFileName, ) { debug_assert_current_span_has_tenant_id(); - let _guard = match self.gate.enter() { + let guard = match self.gate.enter() { Ok(g) => g, Err(_) => { tracing::debug!("Dropping layer evictions, secondary tenant shutting down",); @@ -187,35 +192,57 @@ impl SecondaryTenant { .timeline_path(&self.tenant_shard_id, &timeline_id) .join(name.file_name()); - // We tolerate ENOENT, because between planning eviction and executing - // it, the secondary downloader could have seen an updated heatmap that - // resulted in a layer being deleted. - // Other local I/O errors are process-fatal: these should never happen. - tokio::fs::remove_file(path) - .await - .or_else(fs_ext::ignore_not_found) - .fatal_err("Deleting layer during eviction"); + let this = self.clone(); - // Update the timeline's state. This does not have to be synchronized with - // the download process, because: - // - If downloader is racing with us to remove a file (e.g. because it is - // removed from heatmap), then our mutual .remove() operations will both - // succeed. - // - If downloader is racing with us to download the object (this would require - // multiple eviction iterations to race with multiple download iterations), then - // if we remove it from the state, the worst that happens is the downloader - // downloads it again before re-inserting, or we delete the file but it remains - // in the state map (in which case it will be downloaded if this secondary - // tenant transitions to attached and tries to access it) - // - // The important assumption here is that the secondary timeline state does not - // have to 100% match what is on disk, because it's a best-effort warming - // of the cache. 
- let mut detail = self.detail.lock().unwrap(); - if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) { - timeline_detail.on_disk_layers.remove(&name); - timeline_detail.evicted_at.insert(name, now); - } + // spawn it to be cancellation safe + tokio::task::spawn_blocking(move || { + let _guard = guard; + // We tolerate ENOENT, because between planning eviction and executing + // it, the secondary downloader could have seen an updated heatmap that + // resulted in a layer being deleted. + // Other local I/O errors are process-fatal: these should never happen. + let deleted = std::fs::remove_file(path); + + let not_found = deleted + .as_ref() + .is_err_and(|x| x.kind() == std::io::ErrorKind::NotFound); + + let deleted = if not_found { + false + } else { + deleted + .map(|()| true) + .fatal_err("Deleting layer during eviction") + }; + + if !deleted { + // skip updating accounting and putting perhaps later timestamp + return; + } + + // Update the timeline's state. This does not have to be synchronized with + // the download process, because: + // - If downloader is racing with us to remove a file (e.g. because it is + // removed from heatmap), then our mutual .remove() operations will both + // succeed. + // - If downloader is racing with us to download the object (this would require + // multiple eviction iterations to race with multiple download iterations), then + // if we remove it from the state, the worst that happens is the downloader + // downloads it again before re-inserting, or we delete the file but it remains + // in the state map (in which case it will be downloaded if this secondary + // tenant transitions to attached and tries to access it) + // + // The important assumption here is that the secondary timeline state does not + // have to 100% match what is on disk, because it's a best-effort warming + // of the cache. + let mut detail = this.detail.lock().unwrap(); + if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) { + timeline_detail.on_disk_layers.remove(&name); + timeline_detail.evicted_at.insert(name, now); + } + }) + .await + .expect("secondary eviction should not have panicked"); } } diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 9de820912e..299950cc21 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -72,7 +72,7 @@ where /// the same ValueReconstructState struct in the next 'get_value_reconstruct_data' /// call, to collect more records. /// -#[derive(Debug)] +#[derive(Debug, Default)] pub struct ValueReconstructState { pub records: Vec<(Lsn, NeonWalRecord)>, pub img: Option<(Lsn, Bytes)>, diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 13c9e5c989..247dd1a8e4 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -8,7 +8,7 @@ use pageserver_api::shard::ShardIndex; use std::ops::Range; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use std::sync::{Arc, Weak}; -use std::time::SystemTime; +use std::time::{Duration, SystemTime}; use tracing::Instrument; use utils::lsn::Lsn; use utils::sync::heavier_once_cell; @@ -208,10 +208,15 @@ impl Layer { /// If for a bad luck or blocking of the executor, we miss the actual eviction and the layer is /// re-downloaded, [`EvictionError::Downloaded`] is returned. 
/// + /// Timeout is mandatory, because waiting for eviction is only needed for our tests; eviction + /// will happen regardless the future returned by this method completing unless there is a + /// read access (currently including [`Layer::keep_resident`]) before eviction gets to + /// complete. + /// /// Technically cancellation safe, but cancelling might shift the viewpoint of what generation /// of download-evict cycle on retry. - pub(crate) async fn evict_and_wait(&self) -> Result<(), EvictionError> { - self.0.evict_and_wait().await + pub(crate) async fn evict_and_wait(&self, timeout: Duration) -> Result<(), EvictionError> { + self.0.evict_and_wait(timeout).await } /// Delete the layer file when the `self` gets dropped, also try to schedule a remote index upload @@ -363,7 +368,7 @@ impl Layer { /// /// Does not start local deletion, use [`Self::delete_on_drop`] for that /// separatedly. - #[cfg(feature = "testing")] + #[cfg(any(feature = "testing", test))] pub(crate) fn wait_drop(&self) -> impl std::future::Future + 'static { let mut rx = self.0.status.subscribe(); @@ -632,7 +637,7 @@ impl LayerInner { /// Cancellation safe, however dropping the future and calling this method again might result /// in a new attempt to evict OR join the previously started attempt. - pub(crate) async fn evict_and_wait(&self) -> Result<(), EvictionError> { + pub(crate) async fn evict_and_wait(&self, timeout: Duration) -> Result<(), EvictionError> { use tokio::sync::broadcast::error::RecvError; assert!(self.have_remote_client); @@ -652,16 +657,22 @@ impl LayerInner { if strong.is_some() { // drop the DownloadedLayer outside of the holding the guard drop(strong); + + // idea here is that only one evicter should ever get to witness a strong reference, + // which means whenever get_or_maybe_download upgrades a weak, it must mark up a + // cancelled eviction and signal us, like it currently does. + // + // a second concurrent evict_and_wait will not see a strong reference. LAYER_IMPL_METRICS.inc_started_evictions(); } - match rx.recv().await { - Ok(Status::Evicted) => Ok(()), - Ok(Status::Downloaded) => Err(EvictionError::Downloaded), - Err(RecvError::Closed) => { + match tokio::time::timeout(timeout, rx.recv()).await { + Ok(Ok(Status::Evicted)) => Ok(()), + Ok(Ok(Status::Downloaded)) => Err(EvictionError::Downloaded), + Ok(Err(RecvError::Closed)) => { unreachable!("sender cannot be dropped while we are in &self method") } - Err(RecvError::Lagged(_)) => { + Ok(Err(RecvError::Lagged(_))) => { // this is quite unlikely, but we are blocking a lot in the async context, so // we might be missing this because we are stuck on a LIFO slot on a thread // which is busy blocking for a 1TB database create_image_layers. @@ -674,6 +685,7 @@ impl LayerInner { None => Ok(()), } } + Err(_timeout) => Err(EvictionError::Timeout), } } @@ -1195,6 +1207,9 @@ pub(crate) enum EvictionError { /// Evictions must always lose to downloads in races, and this time it happened. 
#[error("layer was downloaded instead")] Downloaded, + + #[error("eviction did not happen within timeout")] + Timeout, } /// Error internal to the [`LayerInner::get_or_maybe_download`] diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index 01c62b6f83..b43534efd4 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -1,13 +1,173 @@ use futures::StreamExt; +use pageserver_api::key::CONTROLFILE_KEY; use tokio::task::JoinSet; +use tracing::Instrument; use utils::{ completion::{self, Completion}, id::TimelineId, }; use super::*; -use crate::task_mgr::BACKGROUND_RUNTIME; -use crate::tenant::harness::TenantHarness; +use crate::{context::DownloadBehavior, task_mgr::BACKGROUND_RUNTIME}; +use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness}; + +/// Used in tests to advance a future to wanted await point, and not futher. +const ADVANCE: std::time::Duration = std::time::Duration::from_secs(3600); + +/// Used in tests to indicate forever long timeout; has to be longer than the amount of ADVANCE +/// timeout uses to advance futures. +const FOREVER: std::time::Duration = std::time::Duration::from_secs(ADVANCE.as_secs() * 24 * 7); + +/// Demonstrate the API and resident -> evicted -> resident -> deleted transitions. +#[tokio::test] +async fn smoke_test() { + let handle = BACKGROUND_RUNTIME.handle(); + + let h = TenantHarness::create("smoke_test").unwrap(); + let span = h.span(); + let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); + let (tenant, _) = h.load().await; + + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download); + + let timeline = tenant + .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .await + .unwrap(); + + let layer = { + let mut layers = { + let layers = timeline.layers.read().await; + layers.resident_layers().collect::>().await + }; + + assert_eq!(layers.len(), 1); + + layers.swap_remove(0) + }; + + // all layers created at pageserver are like `layer`, initialized with strong + // Arc. + + let img_before = { + let mut data = ValueReconstructState::default(); + layer + .get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx) + .await + .unwrap(); + data.img + .take() + .expect("tenant harness writes the control file") + }; + + // important part is evicting the layer, which can be done when there are no more ResidentLayer + // instances -- there currently are none, only two `Layer` values, one in the layermap and on + // in scope. + layer.evict_and_wait(FOREVER).await.unwrap(); + + // double-evict returns an error, which is valid if both eviction_task and disk usage based + // eviction would both evict the same layer at the same time. + + let e = layer.evict_and_wait(FOREVER).await.unwrap_err(); + assert!(matches!(e, EvictionError::NotFound)); + + // on accesses when the layer is evicted, it will automatically be downloaded. + let img_after = { + let mut data = ValueReconstructState::default(); + layer + .get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx) + .instrument(download_span.clone()) + .await + .unwrap(); + data.img.take().unwrap() + }; + + assert_eq!(img_before, img_after); + + // evict_and_wait can timeout, but it doesn't cancel the evicting itself + // + // ZERO for timeout does not work reliably, so first take up all spawn_blocking slots to + // artificially slow it down. 
+ let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await; + + match layer + .evict_and_wait(std::time::Duration::ZERO) + .await + .unwrap_err() + { + EvictionError::Timeout => { + // expected, but note that the eviction is "still ongoing" + helper.release().await; + // exhaust spawn_blocking pool to ensure it is now complete + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle) + .await; + } + other => unreachable!("{other:?}"), + } + + // only way to query if a layer is resident is to acquire a ResidentLayer instance. + // Layer::keep_resident never downloads, but it might initialize if the layer file is found + // downloaded locally. + let none = layer.keep_resident().await.unwrap(); + assert!( + none.is_none(), + "Expected none, because eviction removed the local file, found: {none:?}" + ); + + // plain downloading is rarely needed + layer + .download_and_keep_resident() + .instrument(download_span) + .await + .unwrap(); + + // last important part is deletion on drop: gc and compaction use it for compacted L0 layers + // or fully garbage collected layers. deletion means deleting the local file, and scheduling a + // deletion of the already unlinked from index_part.json remote file. + // + // marking a layer to be deleted on drop is irreversible; there is no technical reason against + // reversiblity, but currently it is not needed so it is not provided. + layer.delete_on_drop(); + + let path = layer.local_path().to_owned(); + + // wait_drop produces an unconnected to Layer future which will resolve when the + // LayerInner::drop has completed. + let mut wait_drop = std::pin::pin!(layer.wait_drop()); + + // paused time doesn't really work well with timeouts and evict_and_wait, so delay pausing + // until here + tokio::time::pause(); + tokio::time::timeout(ADVANCE, &mut wait_drop) + .await + .expect_err("should had timed out because two strong references exist"); + + tokio::fs::metadata(&path) + .await + .expect("the local layer file still exists"); + + let rtc = timeline.remote_client.as_ref().unwrap(); + + { + let layers = &[layer]; + let mut g = timeline.layers.write().await; + g.finish_gc_timeline(layers); + // this just updates the remote_physical_size for demonstration purposes + rtc.schedule_gc_update(layers).unwrap(); + } + + // when strong references are dropped, the file is deleted and remote deletion is scheduled + wait_drop.await; + + let e = tokio::fs::metadata(&path) + .await + .expect_err("the local file is deleted"); + assert_eq!(e.kind(), std::io::ErrorKind::NotFound); + + rtc.wait_completion().await.unwrap(); + + assert_eq!(rtc.get_remote_physical_size(), 0); +} /// This test demonstrates a previous hang when a eviction and deletion were requested at the same /// time. Now both of them complete per Arc drop semantics. 
@@ -41,10 +201,10 @@ async fn evict_and_wait_on_wanted_deleted() { let resident = layer.keep_resident().await.unwrap(); { - let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait()); + let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER)); // drive the future to await on the status channel - tokio::time::timeout(std::time::Duration::from_secs(3600), &mut evict_and_wait) + tokio::time::timeout(ADVANCE, &mut evict_and_wait) .await .expect_err("should had been a timeout since we are holding the layer resident"); @@ -115,10 +275,10 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() { let resident = layer.keep_resident().await.unwrap(); - let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait()); + let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER)); // drive the future to await on the status channel - tokio::time::timeout(std::time::Duration::from_secs(3600), &mut evict_and_wait) + tokio::time::timeout(ADVANCE, &mut evict_and_wait) .await .expect_err("should had been a timeout since we are holding the layer resident"); assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get()); @@ -138,7 +298,7 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() { // because the keep_resident check alters wanted evicted without sending a message, we will // never get completed - let e = tokio::time::timeout(std::time::Duration::from_secs(3600), &mut evict_and_wait) + let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait) .await .expect("no timeout, because keep_resident re-initialized") .expect_err("eviction should not have succeeded because re-initialized"); @@ -158,9 +318,10 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() { .sum::() ); - let mut second_eviction = std::pin::pin!(layer.evict_and_wait()); + let mut second_eviction = std::pin::pin!(layer.evict_and_wait(FOREVER)); - tokio::time::timeout(std::time::Duration::from_secs(3600), &mut second_eviction) + // advance to the wait on the queue + tokio::time::timeout(ADVANCE, &mut second_eviction) .await .expect_err("timeout because spawn_blocking is clogged"); @@ -171,7 +332,12 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() { helper.release().await; - tokio::time::timeout(std::time::Duration::from_secs(3600), &mut second_eviction) + // the second_eviction gets to run here + // + // synchronize to be *strictly* after the second_eviction spawn_blocking run + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await; + + tokio::time::timeout(ADVANCE, &mut second_eviction) .await .expect("eviction goes through now that spawn_blocking is unclogged") .expect("eviction should succeed, because version matches"); @@ -261,3 +427,49 @@ impl SpawnBlockingPoolHelper { .await } } + +#[test] +fn spawn_blocking_pool_helper_actually_works() { + // create a custom runtime for which we know and control how many blocking threads it has + // + // because the amount is not configurable for our helper, expect the same amount as + // BACKGROUND_RUNTIME using the tokio defaults would have. + let rt = tokio::runtime::Builder::new_current_thread() + .max_blocking_threads(512) + .enable_all() + .build() + .unwrap(); + + let handle = rt.handle(); + + rt.block_on(async move { + // this will not return until all threads are spun up and actually executing the code + // waiting on `consumed` to be `SpawnBlockingPoolHelper::release`'d. 
+ let consumed = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await; + + println!("consumed"); + + let mut jh = std::pin::pin!(tokio::task::spawn_blocking(move || { + // this will not get to run before we release + })); + + println!("spawned"); + + tokio::time::timeout(std::time::Duration::from_secs(1), &mut jh) + .await + .expect_err("the task should not have gotten to run yet"); + + println!("tried to join"); + + consumed.release().await; + + println!("released"); + + tokio::time::timeout(std::time::Duration::from_secs(1), jh) + .await + .expect("no timeout") + .expect("no join error"); + + println!("joined"); + }); +} diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index fa5e7b3685..206f20306e 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1512,10 +1512,14 @@ impl Timeline { return Ok(None); }; - match local_layer.evict_and_wait().await { + // curl has this by default + let timeout = std::time::Duration::from_secs(120); + + match local_layer.evict_and_wait(timeout).await { Ok(()) => Ok(Some(true)), Err(EvictionError::NotFound) => Ok(Some(false)), Err(EvictionError::Downloaded) => Ok(Some(false)), + Err(EvictionError::Timeout) => Ok(Some(false)), } } } @@ -5157,8 +5161,7 @@ mod tests { let harness = TenantHarness::create("two_layer_eviction_attempts_at_the_same_time").unwrap(); - let ctx = any_context(); - let tenant = harness.do_try_load(&ctx).await.unwrap(); + let (tenant, ctx) = harness.load().await; let timeline = tenant .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) .await @@ -5172,8 +5175,10 @@ mod tests { .expect("should had been resident") .drop_eviction_guard(); - let first = async { layer.evict_and_wait().await }; - let second = async { layer.evict_and_wait().await }; + let forever = std::time::Duration::from_secs(120); + + let first = layer.evict_and_wait(forever); + let second = layer.evict_and_wait(forever); let (first, second) = tokio::join!(first, second); @@ -5192,12 +5197,6 @@ mod tests { } } - fn any_context() -> crate::context::RequestContext { - use crate::context::*; - use crate::task_mgr::*; - RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error) - } - async fn find_some_layer(timeline: &Timeline) -> Layer { let layers = timeline.layers.read().await; let desc = layers diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 008f9482c4..dd603135d2 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -204,6 +204,7 @@ impl Timeline { evicted: usize, errors: usize, not_evictable: usize, + timeouts: usize, #[allow(dead_code)] skipped_for_shutdown: usize, } @@ -267,7 +268,11 @@ impl Timeline { let layer = guard.drop_eviction_guard(); if no_activity_for > p.threshold { // this could cause a lot of allocations in some cases - js.spawn(async move { layer.evict_and_wait().await }); + js.spawn(async move { + layer + .evict_and_wait(std::time::Duration::from_secs(5)) + .await + }); stats.candidates += 1; } } @@ -280,6 +285,9 @@ impl Timeline { Ok(Err(EvictionError::NotFound | EvictionError::Downloaded)) => { stats.not_evictable += 1; } + Ok(Err(EvictionError::Timeout)) => { + stats.timeouts += 1; + } Err(je) if je.is_cancelled() => unreachable!("not used"), Err(je) if je.is_panic() => { /* already logged */ @@ -295,7 +303,8 @@ impl Timeline { stats = join_all => { if stats.candidates == stats.not_evictable { 
debug!(stats=?stats, "eviction iteration complete"); - } else if stats.errors > 0 || stats.not_evictable > 0 { + } else if stats.errors > 0 || stats.not_evictable > 0 || stats.timeouts > 0 { + // reminder: timeouts are not eviction cancellations warn!(stats=?stats, "eviction iteration complete"); } else { info!(stats=?stats, "eviction iteration complete"); From e9e77ee744298f4a79ec24734ffd5d76ddb83d02 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 1 Mar 2024 10:45:39 +0100 Subject: [PATCH 0308/1571] tests: add optional cursor to `log_contains` + fix truthiness issues in callers (#6960) Extracted from https://github.com/neondatabase/neon/pull/6953 Part of https://github.com/neondatabase/neon/issues/5899 Core Change ----------- In #6953, we need the ability to scan the log _after_ a specific line and ignore anything before that line. This PR changes `log_contains` to returns a tuple of `(matching line, cursor)`. Hand that cursor to a subsequent `log_contains` call to search the log for the next occurrence of the pattern. Other Changes ------------- - Inspect all the callsites of `log_contains` to handle the new tuple return type. - Above inspection unveiled many callers aren't using `assert log_contains(...) is not None` but some weaker version of the code that breaks if `log_contains` ever returns a not-None but falsy value. Fix that. - Above changes unveiled that `test_remote_storage_upload_queue_retries` was using `wait_until` incorrectly; after fixing the usage, I had to raise the `wait_until` timeout. So, maybe this will fix its flakiness. --- test_runner/fixtures/neon_fixtures.py | 27 ++++++++-- test_runner/fixtures/pageserver/utils.py | 6 +-- test_runner/fixtures/utils.py | 19 ++++++- .../regress/test_attach_tenant_config.py | 9 ++-- .../regress/test_disk_usage_eviction.py | 20 ++++--- test_runner/regress/test_duplicate_layers.py | 2 +- .../regress/test_layers_from_future.py | 11 ++-- test_runner/regress/test_logging.py | 2 +- .../regress/test_pageserver_generations.py | 2 +- test_runner/regress/test_remote_storage.py | 52 ++++++++++--------- test_runner/regress/test_sharding_service.py | 4 +- test_runner/regress/test_tenant_delete.py | 12 ++--- test_runner/regress/test_tenant_detach.py | 4 +- test_runner/regress/test_tenant_relocation.py | 4 +- .../test_tenants_with_remote_storage.py | 4 +- .../regress/test_threshold_based_eviction.py | 4 +- test_runner/regress/test_timeline_delete.py | 11 ++-- 17 files changed, 119 insertions(+), 74 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 71e77334a1..b933d391ab 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2180,6 +2180,11 @@ class NeonAttachmentService(MetricsGetter): self.stop(immediate=True) +@dataclass +class LogCursor: + _line_no: int + + class NeonPageserver(PgProtocol): """ An object representing a running pageserver. 
@@ -2343,7 +2348,18 @@ class NeonPageserver(PgProtocol): value = self.http_client().get_metric_value(metric) assert value == 0, f"Nonzero {metric} == {value}" - def log_contains(self, pattern: str) -> Optional[str]: + def assert_log_contains( + self, pattern: str, offset: None | LogCursor = None + ) -> Tuple[str, LogCursor]: + """Convenient for use inside wait_until()""" + + res = self.log_contains(pattern, offset=offset) + assert res is not None + return res + + def log_contains( + self, pattern: str, offset: None | LogCursor = None + ) -> Optional[Tuple[str, LogCursor]]: """Check that the pageserver log contains a line that matches the given regex""" logfile = self.workdir / "pageserver.log" if not logfile.exists(): @@ -2357,12 +2373,17 @@ class NeonPageserver(PgProtocol): # no guarantee it is already present in the log file. This hasn't # been a problem in practice, our python tests are not fast enough # to hit that race condition. + skip_until_line_no = 0 if offset is None else offset._line_no + cur_line_no = 0 with logfile.open("r") as f: for line in f: + if cur_line_no < skip_until_line_no: + cur_line_no += 1 + continue if contains_re.search(line): # found it! - return line - + cur_line_no += 1 + return (line, LogCursor(cur_line_no)) return None def tenant_attach( diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 1415038f69..c600733e41 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -20,7 +20,7 @@ def assert_tenant_state( tenant: TenantId, expected_state: str, message: Optional[str] = None, -): +) -> None: tenant_status = pageserver_http.tenant_status(tenant) log.info(f"tenant_status: {tenant_status}") assert tenant_status["state"]["slug"] == expected_state, message or tenant_status @@ -292,7 +292,7 @@ def timeline_delete_wait_completed( iterations: int = 20, interval: Optional[float] = None, **delete_args, -): +) -> None: pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args) wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, iterations, interval) @@ -302,7 +302,7 @@ def assert_prefix_empty( remote_storage: Optional[RemoteStorage], prefix: Optional[str] = None, allowed_postfix: Optional[str] = None, -): +) -> None: assert remote_storage is not None response = list_prefix(remote_storage, prefix) keys = response["KeyCount"] diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 91f33e1196..7fc3bae3af 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -369,7 +369,12 @@ def start_in_background( return spawned_process -def wait_until(number_of_iterations: int, interval: float, func: Fn): +WaitUntilRet = TypeVar("WaitUntilRet") + + +def wait_until( + number_of_iterations: int, interval: float, func: Callable[[], WaitUntilRet] +) -> WaitUntilRet: """ Wait until 'func' returns successfully, without exception. Returns the last return value from the function. @@ -387,6 +392,18 @@ def wait_until(number_of_iterations: int, interval: float, func: Fn): raise Exception("timed out while waiting for %s" % func) from last_exception +def assert_eq(a, b) -> None: + assert a == b + + +def assert_gt(a, b) -> None: + assert a > b + + +def assert_ge(a, b) -> None: + assert a >= b + + def run_pg_bench_small(pg_bin: "PgBin", connstr: str): """ Fast way to populate data. 
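# A hedged usage sketch of the new cursor-based log search (the log patterns are
# illustrative; the method names and signatures come from the diff above). Each call
# resumes scanning pageserver.log strictly after the previous match, so a test can
# tell repeated occurrences of the same message apart.
from fixtures.neon_fixtures import NeonEnv


def _example_cursor_threading(env: NeonEnv) -> None:
    # first occurrence, anywhere in the log
    _, cursor = env.pageserver.assert_log_contains(".*eviction iteration complete.*")
    # second occurrence: only lines after the first match are considered
    _, cursor = env.pageserver.assert_log_contains(
        ".*eviction iteration complete.*", offset=cursor
    )
    # plain log_contains still returns None when nothing matches past the cursor
    assert env.pageserver.log_contains(".*request was dropped.*", offset=cursor) is None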
diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 6cae663842..7fbce6a10c 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -63,10 +63,11 @@ def negative_env(neon_env_builder: NeonEnvBuilder) -> Generator[NegativeTests, N ] ) - def log_contains_bad_request(): - env.pageserver.log_contains(".*Error processing HTTP request: Bad request") - - wait_until(50, 0.1, log_contains_bad_request) + wait_until( + 50, + 0.1, + lambda: env.pageserver.assert_log_contains(".*Error processing HTTP request: Bad request"), + ) def test_null_body(negative_env: NegativeTests): diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index eb4e370ea7..b83545216d 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -200,7 +200,7 @@ class EvictionEnv: tenant_ps.http_client().timeline_wait_logical_size(tenant_id, timeline_id) def statvfs_called(): - assert pageserver.log_contains(".*running mocked statvfs.*") + pageserver.assert_log_contains(".*running mocked statvfs.*") # we most likely have already completed multiple runs wait_until(10, 1, statvfs_called) @@ -533,7 +533,7 @@ def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv, order: E assert actual_change >= target, "eviction must always evict more than target" time.sleep(1) # give log time to flush - assert env.neon_env.pageserver.log_contains(GLOBAL_LRU_LOG_LINE) + env.neon_env.pageserver.assert_log_contains(GLOBAL_LRU_LOG_LINE) env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE) @@ -767,7 +767,7 @@ def test_statvfs_error_handling(eviction_env: EvictionEnv): eviction_order=EvictionOrder.ABSOLUTE_ORDER, ) - assert env.neon_env.pageserver.log_contains(".*statvfs failed.*EIO") + env.neon_env.pageserver.assert_log_contains(".*statvfs failed.*EIO") env.neon_env.pageserver.allowed_errors.append(".*statvfs failed.*EIO") @@ -801,10 +801,9 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv): eviction_order=EvictionOrder.ABSOLUTE_ORDER, ) - def relieved_log_message(): - assert env.neon_env.pageserver.log_contains(".*disk usage pressure relieved") - - wait_until(10, 1, relieved_log_message) + wait_until( + 10, 1, lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved") + ) def less_than_max_usage_pct(): post_eviction_total_size, _, _ = env.timelines_du(env.pageserver) @@ -845,10 +844,9 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv): eviction_order=EvictionOrder.ABSOLUTE_ORDER, ) - def relieved_log_message(): - assert env.neon_env.pageserver.log_contains(".*disk usage pressure relieved") - - wait_until(10, 1, relieved_log_message) + wait_until( + 10, 1, lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved") + ) def more_than_min_avail_bytes_freed(): post_eviction_total_size, _, _ = env.timelines_du(env.pageserver) diff --git a/test_runner/regress/test_duplicate_layers.py b/test_runner/regress/test_duplicate_layers.py index 224e6f50c7..cb4fa43be7 100644 --- a/test_runner/regress/test_duplicate_layers.py +++ b/test_runner/regress/test_duplicate_layers.py @@ -36,7 +36,7 @@ def test_duplicate_layers(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): pg_bin.run_capture(["pgbench", "-i", "-s1", connstr]) time.sleep(10) # let compaction to be performed - assert 
env.pageserver.log_contains("compact-level0-phase1-return-same") + env.pageserver.assert_log_contains("compact-level0-phase1-return-same") def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index 999e077e45..9da47b9fd3 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -184,10 +184,13 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): # NB: the layer file is unlinked index part now, but, because we made the delete # operation stuck, the layer file itself is still in the remote_storage - def delete_at_pause_point(): - assert env.pageserver.log_contains(f".*{tenant_id}.*at failpoint.*{failpoint_name}") - - wait_until(10, 0.5, delete_at_pause_point) + wait_until( + 10, + 0.5, + lambda: env.pageserver.assert_log_contains( + f".*{tenant_id}.*at failpoint.*{failpoint_name}" + ), + ) future_layer_path = env.pageserver_remote_storage.remote_layer_path( tenant_id, timeline_id, future_layer.to_str(), generation=generation_before_detach ) diff --git a/test_runner/regress/test_logging.py b/test_runner/regress/test_logging.py index d62b5e531c..bfffad7572 100644 --- a/test_runner/regress/test_logging.py +++ b/test_runner/regress/test_logging.py @@ -34,7 +34,7 @@ def test_logging_event_count(neon_env_builder: NeonEnvBuilder, level: str): def assert_logged(): if not log_expected: return - assert env.pageserver.log_contains(f".*{msg_id}.*") + env.pageserver.assert_log_contains(f".*{msg_id}.*") wait_until(10, 0.5, assert_logged) diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 1070d06ed0..89fc48a49f 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -432,7 +432,7 @@ def test_deletion_queue_recovery( main_pageserver.start() - def assert_deletions_submitted(n: int): + def assert_deletions_submitted(n: int) -> None: assert ps_http.get_metric_value("pageserver_deletion_queue_submitted_total") == n # After restart, issue a flush to kick the deletion frontend to do recovery. 
diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 73ebe0a76f..f8a0bef954 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -28,7 +28,14 @@ from fixtures.remote_storage import ( available_remote_storages, ) from fixtures.types import Lsn, TenantId, TimelineId -from fixtures.utils import print_gc_result, query_scalar, wait_until +from fixtures.utils import ( + assert_eq, + assert_ge, + assert_gt, + print_gc_result, + query_scalar, + wait_until, +) from requests import ReadTimeout @@ -120,10 +127,10 @@ def test_remote_storage_backup_and_restore( log.info(f"upload of checkpoint {checkpoint_number} is done") # Check that we had to retry the uploads - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( ".*failed to perform remote task UploadLayer.*, will retry.*" ) - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( ".*failed to perform remote task UploadMetadata.*, will retry.*" ) @@ -292,9 +299,9 @@ def test_remote_storage_upload_queue_retries( print_gc_result(gc_result) assert gc_result["layers_removed"] > 0 - wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="upload") == 0) - wait_until(2, 1, lambda: get_queued_count(file_kind="index", op_kind="upload") == 0) - wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="delete") == 0) + wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0)) + wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0)) + wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0)) # let all future operations queue up configure_storage_sync_failpoints("return") @@ -322,17 +329,17 @@ def test_remote_storage_upload_queue_retries( churn_while_failpoints_active_thread.start() # wait for churn thread's data to get stuck in the upload queue - wait_until(10, 0.1, lambda: get_queued_count(file_kind="layer", op_kind="upload") > 0) - wait_until(10, 0.1, lambda: get_queued_count(file_kind="index", op_kind="upload") >= 2) - wait_until(10, 0.1, lambda: get_queued_count(file_kind="layer", op_kind="delete") > 0) + wait_until(10, 0.5, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0)) + wait_until(10, 0.5, lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 2)) + wait_until(10, 0.5, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="delete"), 0)) # unblock churn operations configure_storage_sync_failpoints("off") # ... and wait for them to finish. Exponential back-off in upload queue, so, gracious timeouts. - wait_until(30, 1, lambda: get_queued_count(file_kind="layer", op_kind="upload") == 0) - wait_until(30, 1, lambda: get_queued_count(file_kind="index", op_kind="upload") == 0) - wait_until(30, 1, lambda: get_queued_count(file_kind="layer", op_kind="delete") == 0) + wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0)) + wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0)) + wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0)) # The churn thread doesn't make progress once it blocks on the first wait_completion() call, # so, give it some time to wrap up. 
@@ -884,26 +891,23 @@ def wait_upload_queue_empty( wait_until( 2, 1, - lambda: get_queued_count( - client, tenant_id, timeline_id, file_kind="layer", op_kind="upload" - ) - == 0, + lambda: assert_eq( + get_queued_count(client, tenant_id, timeline_id, file_kind="layer", op_kind="upload"), 0 + ), ) wait_until( 2, 1, - lambda: get_queued_count( - client, tenant_id, timeline_id, file_kind="index", op_kind="upload" - ) - == 0, + lambda: assert_eq( + get_queued_count(client, tenant_id, timeline_id, file_kind="index", op_kind="upload"), 0 + ), ) wait_until( 2, 1, - lambda: get_queued_count( - client, tenant_id, timeline_id, file_kind="layer", op_kind="delete" - ) - == 0, + lambda: assert_eq( + get_queued_count(client, tenant_id, timeline_id, file_kind="layer", op_kind="delete"), 0 + ), ) diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py index 6ed49d7fd6..c8224c1c67 100644 --- a/test_runner/regress/test_sharding_service.py +++ b/test_runner/regress/test_sharding_service.py @@ -116,7 +116,7 @@ def test_sharding_service_smoke( # Marking a pageserver offline should migrate tenants away from it. env.attachment_service.node_configure(env.pageservers[0].id, {"availability": "Offline"}) - def node_evacuated(node_id: int): + def node_evacuated(node_id: int) -> None: counts = get_node_shard_counts(env, tenant_ids) assert counts[node_id] == 0 @@ -405,7 +405,7 @@ def test_sharding_service_compute_hook( env.attachment_service.node_configure(env.pageservers[0].id, {"availability": "Offline"}) - def node_evacuated(node_id: int): + def node_evacuated(node_id: int) -> None: counts = get_node_shard_counts(env, [env.initial_tenant]) assert counts[node_id] == 0 diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 8c7d332e1d..c4b4e5fb77 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -505,10 +505,10 @@ def test_tenant_delete_concurrent( return ps_http.tenant_delete(tenant_id) def hit_remove_failpoint(): - assert env.pageserver.log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}") + env.pageserver.assert_log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}") def hit_run_failpoint(): - assert env.pageserver.log_contains(f"at failpoint {BEFORE_RUN_FAILPOINT}") + env.pageserver.assert_log_contains(f"at failpoint {BEFORE_RUN_FAILPOINT}") with concurrent.futures.ThreadPoolExecutor() as executor: background_200_req = executor.submit(delete_tenant) @@ -612,12 +612,12 @@ def test_tenant_delete_races_timeline_creation( Thread(target=timeline_create).start() def hit_initdb_upload_failpoint(): - assert env.pageserver.log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}") + env.pageserver.assert_log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}") wait_until(100, 0.1, hit_initdb_upload_failpoint) def creation_connection_timed_out(): - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( "POST.*/timeline.* request was dropped before completing" ) @@ -636,7 +636,7 @@ def test_tenant_delete_races_timeline_creation( Thread(target=tenant_delete).start() def deletion_arrived(): - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( f"cfg failpoint: {DELETE_BEFORE_CLEANUP_FAILPOINT} pause" ) @@ -663,7 +663,7 @@ def test_tenant_delete_races_timeline_creation( ) # Ensure that creation cancelled and deletion didn't end up in broken state or encountered the leftover temp file - assert 
env.pageserver.log_contains(CANCELLED_ERROR) + env.pageserver.assert_log_contains(CANCELLED_ERROR) assert not env.pageserver.log_contains( ".*ERROR.*delete_tenant.*Timelines directory is not empty after all timelines deletion" ) diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 4752699abb..d3f24cb06e 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -92,10 +92,10 @@ def test_tenant_reattach(neon_env_builder: NeonEnvBuilder, mode: str): wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) # Check that we had to retry the uploads - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( ".*failed to perform remote task UploadLayer.*, will retry.*" ) - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( ".*failed to perform remote task UploadMetadata.*, will retry.*" ) diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index b70131472a..9def3ad1c2 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -495,7 +495,7 @@ def test_emergency_relocate_with_branches_slow_replay( assert cur.fetchall() == [("before pause",), ("after pause",)] # Sanity check that the failpoint was reached - assert env.pageserver.log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done') + env.pageserver.assert_log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done') assert time.time() - before_attach_time > 5 # Clean up @@ -632,7 +632,7 @@ def test_emergency_relocate_with_branches_createdb( assert query_scalar(cur, "SELECT count(*) FROM test_migrate_one") == 200 # Sanity check that the failpoint was reached - assert env.pageserver.log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done') + env.pageserver.assert_log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done') assert time.time() - before_attach_time > 5 # Clean up diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 1c693a0df5..d16978d02a 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -147,10 +147,10 @@ def test_tenants_attached_after_download(neon_env_builder: NeonEnvBuilder): log.info(f"upload of checkpoint {checkpoint_number} is done") # Check that we had to retry the uploads - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( ".*failed to perform remote task UploadLayer.*, will retry.*" ) - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( ".*failed to perform remote task UploadMetadata.*, will retry.*" ) diff --git a/test_runner/regress/test_threshold_based_eviction.py b/test_runner/regress/test_threshold_based_eviction.py index 5f72cfd747..7bf49a0874 100644 --- a/test_runner/regress/test_threshold_based_eviction.py +++ b/test_runner/regress/test_threshold_based_eviction.py @@ -179,6 +179,6 @@ def test_threshold_based_eviction( assert len(post.remote_layers) > 0, "some layers should be evicted once it's stabilized" assert len(post.local_layers) > 0, "the imitate accesses should keep some layers resident" - assert env.pageserver.log_contains( - metrics_refused_log_line + assert ( + env.pageserver.log_contains(metrics_refused_log_line) is not None ), "ensure the metrics collection worker ran" diff --git 
a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index a6a6fb47cc..795110d90b 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -89,6 +89,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv): assert timeline_path.exists() # retry deletes when compaction or gc is running in pageserver + # TODO: review whether this wait_until is actually necessary, we do an await() internally wait_until( number_of_iterations=3, interval=0.2, @@ -531,7 +532,7 @@ def test_concurrent_timeline_delete_stuck_on( try: def first_call_hit_failpoint(): - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( f".*{child_timeline_id}.*at failpoint {stuck_failpoint}" ) @@ -602,7 +603,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder): at_failpoint_log_message = f".*{child_timeline_id}.*at failpoint {failpoint_name}.*" def hit_failpoint(): - assert env.pageserver.log_contains(at_failpoint_log_message) + env.pageserver.assert_log_contains(at_failpoint_log_message) wait_until(50, 0.1, hit_failpoint) @@ -612,7 +613,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder): env.pageserver.allowed_errors.append(hangup_log_message) def got_hangup_log_message(): - assert env.pageserver.log_contains(hangup_log_message) + env.pageserver.assert_log_contains(hangup_log_message) wait_until(50, 0.1, got_hangup_log_message) @@ -624,7 +625,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder): def first_request_finished(): message = f".*DELETE.*{child_timeline_id}.*Cancelled request finished" - assert env.pageserver.log_contains(message) + env.pageserver.assert_log_contains(message) wait_until(50, 0.1, first_request_finished) @@ -759,7 +760,7 @@ def test_delete_orphaned_objects( for orphan in orphans: assert not orphan.exists() - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( f"deleting a file not referenced from index_part.json name={orphan.stem}" ) From 7ba50708e3450b501806568d5f37cd5e20d609fd Mon Sep 17 00:00:00 2001 From: Bodobolero Date: Fri, 1 Mar 2024 13:29:08 +0100 Subject: [PATCH 0309/1571] Testcase for neon extension function approximate_working_set_size() (#6980) ## Problem PR https://github.com/neondatabase/neon/pull/6935 introduced a new function in neon extension: approximate_working_set_size This test case verifies its working correctly. 
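At its core the new test resets the estimator, makes a sequential scan touch every page of a table of known size, and checks that the estimate lands near that page count. A condensed sketch of that central assertion (lifted from the shape of the test added below, where `cur`, `pages` and `query_scalar` are already set up; per the test's comments, the `true` argument resets the estimate after reading it):

```python
cur.execute("select approximate_working_set_size(true)")  # reset the estimate
cur.execute("SELECT count(*) FROM pgbench_accounts WHERE abalance > 0")  # full seq scan
blocks = query_scalar(cur, "select approximate_working_set_size(true)")
assert pages * 0.8 < blocks < pages * 1.2  # estimate ~ table size in pages
```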
--------- Co-authored-by: Alexander Bayandin --- .../test_lfc_working_set_approximation.py | 74 +++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 test_runner/regress/test_lfc_working_set_approximation.py diff --git a/test_runner/regress/test_lfc_working_set_approximation.py b/test_runner/regress/test_lfc_working_set_approximation.py new file mode 100644 index 0000000000..a6f05fe0f7 --- /dev/null +++ b/test_runner/regress/test_lfc_working_set_approximation.py @@ -0,0 +1,74 @@ +from pathlib import Path + +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv +from fixtures.utils import query_scalar + + +def test_lfc_working_set_approximation(neon_simple_env: NeonEnv): + env = neon_simple_env + + cache_dir = Path(env.repo_dir) / "file_cache" + cache_dir.mkdir(exist_ok=True) + + branchname = "test_approximate_working_set_size" + env.neon_cli.create_branch(branchname, "empty") + log.info(f"Creating endopint with 1MB shared_buffers and 64 MB LFC for branch {branchname}") + endpoint = env.endpoints.create_start( + branchname, + config_lines=[ + "shared_buffers='1MB'", + f"neon.file_cache_path='{cache_dir}/file.cache'", + "neon.max_file_cache_size='128MB'", + "neon.file_cache_size_limit='64MB'", + ], + ) + + cur = endpoint.connect().cursor() + cur.execute("create extension neon") + + log.info(f"preparing some data in {endpoint.connstr()}") + + ddl = """ +CREATE TABLE pgbench_accounts ( + aid bigint NOT NULL, + bid integer, + abalance integer, + filler character(84), + -- more web-app like columns + text_column_plain TEXT DEFAULT repeat('NeonIsCool', 5), + jsonb_column_extended JSONB DEFAULT ('{ "tell everyone": [' || repeat('{"Neon": "IsCool"},',9) || ' {"Neon": "IsCool"}]}')::jsonb +) +WITH (fillfactor='100'); +""" + + cur.execute(ddl) + # prepare index access below + cur.execute( + "ALTER TABLE ONLY pgbench_accounts ADD CONSTRAINT pgbench_accounts_pkey PRIMARY KEY (aid)" + ) + cur.execute( + "insert into pgbench_accounts(aid,bid,abalance,filler) select aid, (aid - 1) / 100000 + 1, 0, '' from generate_series(1, 100000) as aid;" + ) + # ensure correct query plans and stats + cur.execute("vacuum ANALYZE pgbench_accounts") + # determine table size - working set should approximate table size after sequential scan + pages = query_scalar(cur, "SELECT relpages FROM pg_class WHERE relname = 'pgbench_accounts'") + log.info(f"pgbench_accounts has {pages} pages, resetting working set to zero") + cur.execute("select approximate_working_set_size(true)") + cur.execute( + 'SELECT count(*) FROM pgbench_accounts WHERE abalance > 0 or jsonb_column_extended @> \'{"tell everyone": [{"Neon": "IsCool"}]}\'::jsonb' + ) + # verify working set size after sequential scan matches table size and reset working set for next test + blocks = query_scalar(cur, "select approximate_working_set_size(true)") + log.info(f"working set size after sequential scan on pgbench_accounts {blocks}") + assert pages * 0.8 < blocks < pages * 1.2 + # run a few point queries with index lookup + cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 4242") + cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 54242") + cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 104242") + cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 204242") + # verify working set size after some index access of a few select pages only + blocks = query_scalar(cur, "select approximate_working_set_size(true)") + log.info(f"working set size after some index access of a few select 
pages only {blocks}") + assert blocks < 10 From f8bdce101542ace882cf891f001f53c702a9685b Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 1 Mar 2024 13:26:45 +0000 Subject: [PATCH 0310/1571] pageserver: fix duplicate shard_id in span (#6981) ## Problem shard_id in span is repeated: - https://github.com/neondatabase/neon/issues/6723 Closes: #6723 ## Summary of changes - Only add shard_id to the span when fetching a cached timeline, as it is already added when loading an uncached timeline. --- pageserver/src/page_service.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 11eb512750..cd9c48f9af 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1115,7 +1115,10 @@ impl PageServerHandler { ctx: &RequestContext, ) -> Result { let timeline = match self.get_cached_timeline_for_page(req) { - Ok(tl) => tl, + Ok(tl) => { + set_tracing_field_shard_id(tl); + tl + } Err(key) => { match self .load_timeline_for_page(tenant_id, timeline_id, key) @@ -1140,9 +1143,6 @@ impl PageServerHandler { } }; - // load_timeline_for_page sets shard_id, but get_cached_timeline_for_page doesn't - set_tracing_field_shard_id(timeline); - let _timer = timeline .query_metrics .start_timer(metrics::SmgrQueryType::GetPageAtLsn); From 5ab10d051d28b930b81ef3b712a5f13de695285a Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 1 Mar 2024 16:04:39 +0200 Subject: [PATCH 0311/1571] metrics: record more details of the responding (#6979) On eu-west-1 during benchmarks we sometimes lose samples. Add more time measurements. --- libs/utils/src/http/endpoint.rs | 48 +++++++++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index 550ab10700..3c71628870 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -156,6 +156,10 @@ pub struct ChannelWriter { buffer: BytesMut, pub tx: mpsc::Sender>, written: usize, + /// Time spent waiting for the channel to make progress. It is not the same as time to upload a + /// buffer because we cannot know anything about that, but this should allow us to understand + /// the actual time taken without the time spent `std::thread::park`ed. + wait_time: std::time::Duration, } impl ChannelWriter { @@ -168,6 +172,7 @@ impl ChannelWriter { buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2), tx, written: 0, + wait_time: std::time::Duration::ZERO, } } @@ -180,6 +185,8 @@ impl ChannelWriter { tracing::trace!(n, "flushing"); let ready = self.buffer.split().freeze(); + let wait_started_at = std::time::Instant::now(); + // not ideal to call from blocking code to block_on, but we are sure that this // operation does not spawn_blocking other tasks let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async { @@ -192,6 +199,9 @@ impl ChannelWriter { // sending it to the client. 
Ok(()) }); + + self.wait_time += wait_started_at.elapsed(); + if res.is_err() { return Err(std::io::ErrorKind::BrokenPipe.into()); } @@ -202,6 +212,10 @@ impl ChannelWriter { pub fn flushed_bytes(&self) -> usize { self.written } + + pub fn wait_time(&self) -> std::time::Duration { + self.wait_time + } } impl std::io::Write for ChannelWriter { @@ -252,22 +266,52 @@ async fn prometheus_metrics_handler(_req: Request) -> Result { tracing::info!( bytes = writer.flushed_bytes(), - elapsed_ms = started_at.elapsed().as_millis(), + total_ms = total.as_millis(), + spawning_ms = spawned_in.as_millis(), + collection_ms = collected_in.as_millis(), + encoding_ms = encoded_in.as_millis(), "responded /metrics" ); } Err(e) => { - tracing::warn!("failed to write out /metrics response: {e:#}"); + // there is a chance that this error is not the BrokenPipe we generate in the writer + // for "closed connection", but it is highly unlikely. + tracing::warn!( + after_bytes = writer.flushed_bytes(), + total_ms = total.as_millis(), + spawning_ms = spawned_in.as_millis(), + collection_ms = collected_in.as_millis(), + encoding_ms = encoded_in.as_millis(), + "failed to write out /metrics response: {e:?}" + ); // semantics of this error are quite... unclear. we want to error the stream out to // abort the response to somehow notify the client that we failed. // From 4dbb74b559d09361df09b96a1225d889cb2f577d Mon Sep 17 00:00:00 2001 From: Bodobolero Date: Fri, 1 Mar 2024 15:33:08 +0100 Subject: [PATCH 0312/1571] new test for LFC stats in explain (#6968) ## Problem PR https://github.com/neondatabase/neon/pull/6851 implemented new output in PostgreSQL explain. this is a test case for the new function. ## Summary of changes ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [x] If it is a core feature, I have added thorough tests. - [no ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [no] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
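Since the JSON form of the plan is what the new test inspects, here is a small helper sketch showing how the FILECACHE counters can be aggregated from an `EXPLAIN (ANALYZE, BUFFERS, FILECACHE, FORMAT JSON)` result. It assumes the `"File Cache Hits"` / `"File Cache Misses"` keys asserted in the test plus PostgreSQL's standard `"Plans"` nesting; whether child nodes also report these counters is not verified here, so the recursion is defensive:

```python
from typing import Any, Tuple


def sum_file_cache_counters(explain_json: Any) -> Tuple[int, int]:
    """Walk an EXPLAIN (FORMAT JSON) plan tree and total the LFC counters.

    `explain_json` is the already-decoded value fetched from the QUERY PLAN
    column, shaped like [{"Plan": {...}}]; nodes that do not report the
    counters contribute 0.
    """
    hits = misses = 0

    def walk(node: dict) -> None:
        nonlocal hits, misses
        hits += node.get("File Cache Hits", 0)
        misses += node.get("File Cache Misses", 0)
        for child in node.get("Plans", []):
            walk(child)

    walk(explain_json[0]["Plan"])
    return hits, misses
```

For the single sequential scan exercised in the test, reading only the top-level `"Plan"` node (as the test does) is sufficient.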
## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --- .../regress/test_explain_with_lfc_stats.py | 84 +++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 test_runner/regress/test_explain_with_lfc_stats.py diff --git a/test_runner/regress/test_explain_with_lfc_stats.py b/test_runner/regress/test_explain_with_lfc_stats.py new file mode 100644 index 0000000000..5231dedcda --- /dev/null +++ b/test_runner/regress/test_explain_with_lfc_stats.py @@ -0,0 +1,84 @@ +from pathlib import Path + +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv + + +def test_explain_with_lfc_stats(neon_simple_env: NeonEnv): + env = neon_simple_env + + cache_dir = Path(env.repo_dir) / "file_cache" + cache_dir.mkdir(exist_ok=True) + + branchname = "test_explain_with_lfc_stats" + env.neon_cli.create_branch(branchname, "empty") + log.info(f"Creating endopint with 1MB shared_buffers and 64 MB LFC for branch {branchname}") + endpoint = env.endpoints.create_start( + branchname, + config_lines=[ + "shared_buffers='1MB'", + f"neon.file_cache_path='{cache_dir}/file.cache'", + "neon.max_file_cache_size='128MB'", + "neon.file_cache_size_limit='64MB'", + ], + ) + + cur = endpoint.connect().cursor() + + log.info(f"preparing some data in {endpoint.connstr()}") + + ddl = """ +CREATE TABLE pgbench_accounts ( + aid bigint NOT NULL, + bid integer, + abalance integer, + filler character(84), + -- more web-app like columns + text_column_plain TEXT DEFAULT repeat('NeonIsCool', 5), + jsonb_column_extended JSONB DEFAULT ('{ "tell everyone": [' || repeat('{"Neon": "IsCool"},',9) || ' {"Neon": "IsCool"}]}')::jsonb +) +WITH (fillfactor='100'); +""" + + cur.execute(ddl) + cur.execute( + "insert into pgbench_accounts(aid,bid,abalance,filler) select aid, (aid - 1) / 100000 + 1, 0, '' from generate_series(1, 100000) as aid;" + ) + + log.info(f"warming up caches with sequential scan in {endpoint.connstr()}") + cur.execute("SELECT * FROM pgbench_accounts WHERE abalance > 0") + + log.info("running explain analyze without LFC values to verify they do not show up in the plan") + cur.execute("EXPLAIN (ANALYZE, BUFFERS) SELECT * FROM pgbench_accounts WHERE abalance > 0") + rows = cur.fetchall() + plan = "\n".join(r[0] for r in rows) + log.debug(plan) + assert "Seq Scan on pgbench_accounts" in plan + assert "Buffers: shared hit" in plan + assert "File cache: hits=" not in plan + log.info("running explain analyze WITH LFC values to verify they do now show up") + cur.execute( + "EXPLAIN (ANALYZE, BUFFERS,FILECACHE) SELECT * FROM pgbench_accounts WHERE abalance > 0" + ) + rows = cur.fetchall() + plan = "\n".join(r[0] for r in rows) + log.debug(plan) + assert "Seq Scan on pgbench_accounts" in plan + assert "Buffers: shared hit" in plan + assert "File cache: hits=" in plan + log.info("running explain analyze WITH LFC values to verify json output") + cur.execute( + "EXPLAIN (ANALYZE, BUFFERS,FILECACHE, FORMAT JSON) SELECT * FROM pgbench_accounts WHERE abalance > 0" + ) + jsonplan = cur.fetchall()[0][0] + log.debug(jsonplan) + # Directly access the 'Plan' part of the first element of the JSON array + plan_details = jsonplan[0]["Plan"] + + # Extract "File Cache Hits" and "File Cache Misses" + file_cache_hits = plan_details.get("File Cache Hits") + file_cache_misses = plan_details.get("File Cache Misses") + + # Now you can assert the values + assert file_cache_hits >= 5000, f"Expected File Cache Hits to be > 5000, got {file_cache_hits}" + assert 
file_cache_misses == 0, f"Expected File Cache Misses to be 0, got {file_cache_misses}" From 1efaa16260d081345febe46be26ff01b68053056 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 1 Mar 2024 14:43:33 +0000 Subject: [PATCH 0313/1571] test: add test for checkpoint timeout flushing (#6950) ## Problem https://github.com/neondatabase/neon/pull/6661 changed the layer flushing logic and led to OOMs in staging. The issue turned out to be holding on to in-memory layers for too long. After OOMing we'd need to replay potentially a lot of WAL. ## Summary of changes Test that open layers get flushed after the `checkpoint_timeout` config and do not require WAL reingest upon restart. The workload creates a number of timelines and writes some data to each, but not enough to trigger flushes via the `checkpoint_distance` config. I ran this test against https://github.com/neondatabase/neon/pull/6661 and it was indeed failing. --- test_runner/fixtures/pageserver/utils.py | 4 +- .../test_pageserver_small_inmemory_layers.py | 110 ++++++++++++++++++ 2 files changed, 112 insertions(+), 2 deletions(-) create mode 100644 test_runner/regress/test_pageserver_small_inmemory_layers.py diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index c600733e41..cf64c86821 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -206,8 +206,8 @@ def wait_for_last_record_lsn( return current_lsn if i % 10 == 0: log.info( - "waiting for last_record_lsn to reach {}, now {}, iteration {}".format( - lsn, current_lsn, i + 1 + "{}/{} waiting for last_record_lsn to reach {}, now {}, iteration {}".format( + tenant, timeline, lsn, current_lsn, i + 1 ) ) time.sleep(0.1) diff --git a/test_runner/regress/test_pageserver_small_inmemory_layers.py b/test_runner/regress/test_pageserver_small_inmemory_layers.py new file mode 100644 index 0000000000..5d55020e3c --- /dev/null +++ b/test_runner/regress/test_pageserver_small_inmemory_layers.py @@ -0,0 +1,110 @@ +import asyncio +import time +from typing import Tuple + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + tenant_get_shards, +) +from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pageserver.utils import wait_for_last_record_lsn +from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.utils import wait_until + +TIMELINE_COUNT = 10 +ENTRIES_PER_TIMELINE = 10_000 +CHECKPOINT_TIMEOUT_SECONDS = 60 + +TENANT_CONF = { + # Large `checkpoint_distance` effectively disables size + # based checkpointing. 
+ "checkpoint_distance": f"{2 * 1024 ** 3}", + "checkpoint_timeout": f"{CHECKPOINT_TIMEOUT_SECONDS}s", +} + + +async def run_worker(env: NeonEnv, entries: int) -> Tuple[TenantId, TimelineId, Lsn]: + tenant, timeline = env.neon_cli.create_tenant(conf=TENANT_CONF) + with env.endpoints.create_start("main", tenant_id=tenant) as ep: + conn = await ep.connect_async() + try: + await conn.execute("CREATE TABLE IF NOT EXISTS t(key serial primary key, value text)") + await conn.execute( + f"INSERT INTO t SELECT i, CONCAT('payload_', i) FROM generate_series(0,{entries}) as i" + ) + finally: + await conn.close(timeout=10) + + last_flush_lsn = Lsn(ep.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + return tenant, timeline, last_flush_lsn + + +async def workload( + env: NeonEnv, timelines: int, entries: int +) -> list[Tuple[TenantId, TimelineId, Lsn]]: + workers = [asyncio.create_task(run_worker(env, entries)) for _ in range(timelines)] + return await asyncio.gather(*workers) + + +def wait_until_pageserver_is_caught_up( + env: NeonEnv, last_flush_lsns: list[Tuple[TenantId, TimelineId, Lsn]] +): + for tenant, timeline, last_flush_lsn in last_flush_lsns: + shards = tenant_get_shards(env, tenant) + for tenant_shard_id, pageserver in shards: + waited = wait_for_last_record_lsn( + pageserver.http_client(), tenant_shard_id, timeline, last_flush_lsn + ) + assert waited >= last_flush_lsn + + +def wait_for_wal_ingest_metric(pageserver_http: PageserverHttpClient) -> float: + def query(): + value = pageserver_http.get_metric_value("pageserver_wal_ingest_records_received_total") + assert value is not None + return value + + # The metric gets initialised on the first update. + # Retry a few times, but return 0 if it's stable. + try: + return float(wait_until(3, 0.5, query)) + except Exception: + return 0 + + +@pytest.mark.parametrize("immediate_shutdown", [True, False]) +def test_pageserver_small_inmemory_layers( + neon_env_builder: NeonEnvBuilder, immediate_shutdown: bool +): + """ + Test that open layers get flushed after the `checkpoint_timeout` config + and do not require WAL reingest upon restart. + + The workload creates a number of timelines and writes some data to each, + but not enough to trigger flushes via the `checkpoint_distance` config. + """ + env = neon_env_builder.init_configs() + env.start() + + last_flush_lsns = asyncio.run(workload(env, TIMELINE_COUNT, ENTRIES_PER_TIMELINE)) + wait_until_pageserver_is_caught_up(env, last_flush_lsns) + + ps_http_client = env.pageserver.http_client() + total_wal_ingested_before_restart = wait_for_wal_ingest_metric(ps_http_client) + + log.info("Sleeping for checkpoint timeout ...") + time.sleep(CHECKPOINT_TIMEOUT_SECONDS + 5) + + env.pageserver.restart(immediate=immediate_shutdown) + wait_until_pageserver_is_caught_up(env, last_flush_lsns) + + total_wal_ingested_after_restart = wait_for_wal_ingest_metric(ps_http_client) + + log.info(f"WAL ingested before restart: {total_wal_ingested_before_restart}") + log.info(f"WAL ingested after restart: {total_wal_ingested_after_restart}") + + leeway = total_wal_ingested_before_restart * 5 / 100 + assert total_wal_ingested_after_restart <= leeway From 82853cc1d1047a2efefa40355293ee9f348357ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 1 Mar 2024 17:14:19 +0100 Subject: [PATCH 0314/1571] Fix warnings and compile errors on nightly (#6886) Nightly has added a bunch of compiler and linter warnings. 
There is also two dependencies that fail compilation on latest nightly due to using the old `stdsimd` feature name. This PR fixes them. --- Cargo.lock | 8 ++++---- compute_tools/src/compute.rs | 2 -- compute_tools/src/extension_server.rs | 2 +- compute_tools/src/http/api.rs | 2 -- control_plane/attachment_service/src/main.rs | 2 +- .../attachment_service/src/persistence.rs | 6 ++++-- .../attachment_service/src/scheduler.rs | 1 - control_plane/src/attachment_service.rs | 2 +- libs/pageserver_api/src/models.rs | 2 -- libs/pageserver_api/src/shard.rs | 6 +----- libs/remote_storage/src/local_fs.rs | 2 -- libs/remote_storage/src/s3_bucket.rs | 2 +- libs/utils/src/auth.rs | 1 - libs/utils/src/completion.rs | 6 ++++-- libs/utils/src/http/endpoint.rs | 2 +- libs/utils/src/lsn.rs | 1 - libs/utils/src/seqwait.rs | 3 +-- libs/utils/src/simple_rcu.rs | 2 +- libs/utils/src/sync/heavier_once_cell.rs | 1 - pageserver/compaction/src/helpers.rs | 1 - pageserver/src/config.rs | 6 +----- .../src/consumption_metrics/metrics/tests.rs | 2 -- pageserver/src/deletion_queue.rs | 17 ++++------------- pageserver/src/metrics.rs | 11 +++++------ pageserver/src/page_cache.rs | 11 ++++++----- pageserver/src/page_service.rs | 3 +-- pageserver/src/repository.rs | 1 - pageserver/src/tenant.rs | 16 +++------------- pageserver/src/tenant/disk_btree.rs | 3 --- pageserver/src/tenant/ephemeral_file.rs | 2 +- pageserver/src/tenant/mgr.rs | 2 +- pageserver/src/tenant/remote_timeline_client.rs | 4 +--- .../src/tenant/secondary/heatmap_uploader.rs | 1 - .../src/tenant/storage_layer/image_layer.rs | 1 - pageserver/src/walingest.rs | 2 -- pageserver/src/walredo/apply_neon.rs | 2 -- pageserver/src/walredo/process/no_leak_child.rs | 4 +--- proxy/src/bin/pg_sni_router.rs | 2 +- proxy/src/cache/project_info.rs | 3 +-- proxy/src/console/mgmt.rs | 2 +- proxy/src/proxy/tests.rs | 2 +- proxy/src/proxy/tests/mitm.rs | 1 - proxy/src/serverless/conn_pool.rs | 1 - safekeeper/src/control_file.rs | 7 +------ safekeeper/src/handler.rs | 5 ++--- 45 files changed, 51 insertions(+), 114 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index dead212156..c23162971e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -25,9 +25,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "ahash" -version = "0.8.5" +version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd7d5a2cecb58716e47d67d5703a249964b14c7be1ec3cad3affc295b2d1c35d" +checksum = "d713b3834d76b85304d4d525563c1276e2e30dc97cc67bfb4585a4a29fc2c89f" dependencies = [ "cfg-if", "const-random", @@ -1389,9 +1389,9 @@ dependencies = [ [[package]] name = "crc32c" -version = "0.6.3" +version = "0.6.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3dfea2db42e9927a3845fb268a10a72faed6d416065f77873f05e411457c363e" +checksum = "89254598aa9b9fa608de44b3ae54c810f0f06d755e24c50177f1f8f31ff50ce2" dependencies = [ "rustc_version", ] diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 142bb14fe5..a82b999cfb 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -18,8 +18,6 @@ use futures::future::join_all; use futures::stream::FuturesUnordered; use futures::StreamExt; use postgres::{Client, NoTls}; -use tokio; -use tokio_postgres; use tracing::{debug, error, info, instrument, warn}; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index 
2cec12119f..ef1db73982 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -71,7 +71,7 @@ More specifically, here is an example ext_index.json } } */ -use anyhow::{self, Result}; +use anyhow::Result; use anyhow::{bail, Context}; use bytes::Bytes; use compute_api::spec::RemoteExtSpec; diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index f076951239..128783b477 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -13,8 +13,6 @@ use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIErr use anyhow::Result; use hyper::service::{make_service_fn, service_fn}; use hyper::{Body, Method, Request, Response, Server, StatusCode}; -use num_cpus; -use serde_json; use tokio::task; use tracing::{error, info, warn}; use tracing_utils::http::OtelName; diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs index 5b952ae4fc..d9acbc0abd 100644 --- a/control_plane/attachment_service/src/main.rs +++ b/control_plane/attachment_service/src/main.rs @@ -9,7 +9,7 @@ use attachment_service::http::make_router; use attachment_service::metrics::preinitialize_metrics; use attachment_service::persistence::Persistence; use attachment_service::service::{Config, Service}; -use aws_config::{self, BehaviorVersion, Region}; +use aws_config::{BehaviorVersion, Region}; use camino::Utf8PathBuf; use clap::Parser; use diesel::Connection; diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs index 1b98cc7655..4c6eb2291c 100644 --- a/control_plane/attachment_service/src/persistence.rs +++ b/control_plane/attachment_service/src/persistence.rs @@ -7,8 +7,10 @@ use self::split_state::SplitState; use camino::Utf8Path; use camino::Utf8PathBuf; use diesel::pg::PgConnection; -use diesel::prelude::*; -use diesel::Connection; +use diesel::{ + Connection, ExpressionMethods, Insertable, QueryDsl, QueryResult, Queryable, RunQueryDsl, + Selectable, SelectableHelper, +}; use pageserver_api::controller_api::NodeSchedulingPolicy; use pageserver_api::models::TenantConfig; use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId}; diff --git a/control_plane/attachment_service/src/scheduler.rs b/control_plane/attachment_service/src/scheduler.rs index 3224751e47..87fce3df25 100644 --- a/control_plane/attachment_service/src/scheduler.rs +++ b/control_plane/attachment_service/src/scheduler.rs @@ -284,7 +284,6 @@ pub(crate) mod test_utils { #[cfg(test)] mod tests { use super::*; - use utils::id::NodeId; use crate::tenant_state::IntentState; #[test] diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/attachment_service.rs index 92342b478b..610d7386d9 100644 --- a/control_plane/src/attachment_service.rs +++ b/control_plane/src/attachment_service.rs @@ -200,7 +200,7 @@ impl AttachmentService { "localhost", "-p", &format!("{}", self.postgres_port), - &DB_NAME, + DB_NAME, ]) .output() .await diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 61aa8a5ae8..d583866290 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -14,7 +14,6 @@ use byteorder::{BigEndian, ReadBytesExt}; use postgres_ffi::BLCKSZ; use serde::{Deserialize, Serialize}; use serde_with::serde_as; -use strum_macros; use utils::{ completion, history_buffer::HistoryBufferWithDropCounter, @@ -1077,7 +1076,6 @@ impl PagestreamBeMessage { #[cfg(test)] mod tests { - 
use bytes::Buf; use serde_json::json; use super::*; diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 467a4cf0c1..a2a9165184 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -6,7 +6,6 @@ use crate::{ }; use hex::FromHex; use serde::{Deserialize, Serialize}; -use thiserror; use utils::id::TenantId; #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] @@ -656,10 +655,7 @@ fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Ke #[cfg(test)] mod tests { - use std::str::FromStr; - - use bincode; - use utils::{id::TenantId, Hex}; + use utils::Hex; use super::*; diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 6f847cf9d7..478ad81dc1 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -623,9 +623,7 @@ fn file_exists(file_path: &Utf8Path) -> anyhow::Result { mod fs_tests { use super::*; - use bytes::Bytes; use camino_tempfile::tempdir; - use futures_util::Stream; use std::{collections::HashMap, io::Write}; async fn read_and_check_metadata( diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index af70dc7ca2..438f45fbde 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -1040,7 +1040,7 @@ mod tests { Some("test/prefix/"), Some("/test/prefix/"), ]; - let expected_outputs = vec![ + let expected_outputs = [ vec!["", "some/path", "some/path"], vec!["/", "/some/path", "/some/path"], vec![ diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index fbf0dff665..03e65f74fe 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -1,7 +1,6 @@ // For details about authentication see docs/authentication.md use arc_swap::ArcSwap; -use serde; use std::{borrow::Cow, fmt::Display, fs, sync::Arc}; use anyhow::Result; diff --git a/libs/utils/src/completion.rs b/libs/utils/src/completion.rs index ea05cf54b1..2fef8d35df 100644 --- a/libs/utils/src/completion.rs +++ b/libs/utils/src/completion.rs @@ -4,7 +4,9 @@ use tokio_util::task::{task_tracker::TaskTrackerToken, TaskTracker}; /// /// Can be cloned, moved and kept around in futures as "guard objects". #[derive(Clone)] -pub struct Completion(TaskTrackerToken); +pub struct Completion { + _token: TaskTrackerToken, +} /// Barrier will wait until all clones of [`Completion`] have been dropped. 
#[derive(Clone)] @@ -49,5 +51,5 @@ pub fn channel() -> (Completion, Barrier) { tracker.close(); let token = tracker.token(); - (Completion(token), Barrier(tracker)) + (Completion { _token: token }, Barrier(tracker)) } diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index 3c71628870..a60971abf0 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -9,7 +9,7 @@ use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder}; use once_cell::sync::Lazy; use routerify::ext::RequestExt; use routerify::{Middleware, RequestInfo, Router, RouterBuilder}; -use tracing::{self, debug, info, info_span, warn, Instrument}; +use tracing::{debug, info, info_span, warn, Instrument}; use std::future::Future; use std::str::FromStr; diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs index b3269ae049..1aebe91428 100644 --- a/libs/utils/src/lsn.rs +++ b/libs/utils/src/lsn.rs @@ -415,7 +415,6 @@ mod tests { use super::*; - use serde::ser::Serialize; use serde_assert::{Deserializer, Serializer, Token, Tokens}; #[test] diff --git a/libs/utils/src/seqwait.rs b/libs/utils/src/seqwait.rs index effc9c67b5..b7301776eb 100644 --- a/libs/utils/src/seqwait.rs +++ b/libs/utils/src/seqwait.rs @@ -1,6 +1,6 @@ #![warn(missing_docs)] -use std::cmp::{Eq, Ordering, PartialOrd}; +use std::cmp::{Eq, Ordering}; use std::collections::BinaryHeap; use std::fmt::Debug; use std::mem; @@ -249,7 +249,6 @@ where mod tests { use super::*; use std::sync::Arc; - use std::time::Duration; impl MonotonicCounter for i32 { fn cnt_advance(&mut self, val: i32) { diff --git a/libs/utils/src/simple_rcu.rs b/libs/utils/src/simple_rcu.rs index dc4a599111..ecc5353be3 100644 --- a/libs/utils/src/simple_rcu.rs +++ b/libs/utils/src/simple_rcu.rs @@ -221,7 +221,7 @@ impl RcuWaitList { #[cfg(test)] mod tests { use super::*; - use std::sync::{Arc, Mutex}; + use std::sync::Mutex; use std::time::Duration; #[tokio::test] diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs index 0773abba2d..703a6dfd52 100644 --- a/libs/utils/src/sync/heavier_once_cell.rs +++ b/libs/utils/src/sync/heavier_once_cell.rs @@ -239,7 +239,6 @@ mod tests { use std::{ convert::Infallible, pin::{pin, Pin}, - sync::atomic::{AtomicUsize, Ordering}, time::Duration, }; diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs index a12f691504..22a410b4af 100644 --- a/pageserver/compaction/src/helpers.rs +++ b/pageserver/compaction/src/helpers.rs @@ -6,7 +6,6 @@ use futures::future::BoxFuture; use futures::{Stream, StreamExt}; use itertools::Itertools; use pin_project_lite::pin_project; -use std::cmp::Ord; use std::collections::BinaryHeap; use std::collections::VecDeque; use std::future::Future; diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 0a7172bde2..437387164d 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -20,7 +20,6 @@ use std::num::NonZeroUsize; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; -use toml_edit; use toml_edit::{Document, Item}; use camino::{Utf8Path, Utf8PathBuf}; @@ -1203,10 +1202,7 @@ impl ConfigurableSemaphore { #[cfg(test)] mod tests { - use std::{ - fs, - num::{NonZeroU32, NonZeroUsize}, - }; + use std::{fs, num::NonZeroU32}; use camino_tempfile::{tempdir, Utf8TempDir}; use pageserver_api::models::EvictionPolicy; diff --git a/pageserver/src/consumption_metrics/metrics/tests.rs b/pageserver/src/consumption_metrics/metrics/tests.rs index 
38a4c9eb5d..f9cbcea565 100644 --- a/pageserver/src/consumption_metrics/metrics/tests.rs +++ b/pageserver/src/consumption_metrics/metrics/tests.rs @@ -1,7 +1,5 @@ use super::*; use std::collections::HashMap; -use std::time::SystemTime; -use utils::lsn::Lsn; #[test] fn startup_collected_timeline_metrics_before_advancing() { diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index ca9ae8f983..313eb2663d 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -20,10 +20,9 @@ use remote_storage::{GenericRemoteStorage, RemotePath}; use serde::Deserialize; use serde::Serialize; use thiserror::Error; -use tokio; use tokio_util::sync::CancellationToken; use tracing::Instrument; -use tracing::{self, debug, error}; +use tracing::{debug, error}; use utils::crashsafe::path_with_suffix_extension; use utils::generation::Generation; use utils::id::TimelineId; @@ -726,7 +725,7 @@ mod test { use camino::Utf8Path; use hex_literal::hex; use pageserver_api::shard::ShardIndex; - use std::{io::ErrorKind, time::Duration}; + use std::io::ErrorKind; use tracing::info; use remote_storage::{RemoteStorageConfig, RemoteStorageKind}; @@ -735,10 +734,7 @@ mod test { use crate::{ control_plane_client::RetryForeverError, repository::Key, - tenant::{ - harness::TenantHarness, remote_timeline_client::remote_timeline_path, - storage_layer::DeltaFileName, - }, + tenant::{harness::TenantHarness, storage_layer::DeltaFileName}, }; use super::*; @@ -1161,13 +1157,8 @@ mod test { pub(crate) mod mock { use tracing::info; - use crate::tenant::remote_timeline_client::remote_layer_path; - use super::*; - use std::sync::{ - atomic::{AtomicUsize, Ordering}, - Arc, - }; + use std::sync::atomic::{AtomicUsize, Ordering}; pub struct ConsumerState { rx: tokio::sync::mpsc::UnboundedReceiver, diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 1d894ed8a5..ce5561b431 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1915,17 +1915,16 @@ impl Drop for TimelineMetrics { let tenant_id = &self.tenant_id; let timeline_id = &self.timeline_id; let shard_id = &self.shard_id; - let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, &shard_id, timeline_id]); + let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]); { RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get()); - let _ = - RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]); + let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); } - let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]); + let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) { - let _ = metric.remove_label_values(&[tenant_id, &shard_id, timeline_id]); + let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]); } - let _ = EVICTIONS.remove_label_values(&[tenant_id, &shard_id, timeline_id]); + let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]); self.evictions_with_low_residence_duration .write() diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 28d2584bf4..529fb9bb07 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -73,7 +73,6 @@ use std::{ collections::{hash_map::Entry, HashMap}, - convert::TryInto, sync::{ atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering}, Arc, Weak, @@ -262,7 +261,9 @@ pub 
struct PageCache { size_metrics: &'static PageCacheSizeMetrics, } -struct PinnedSlotsPermit(tokio::sync::OwnedSemaphorePermit); +struct PinnedSlotsPermit { + _permit: tokio::sync::OwnedSemaphorePermit, +} /// /// PageReadGuard is a "lease" on a buffer, for reading. The page is kept locked @@ -558,9 +559,9 @@ impl PageCache { ) .await { - Ok(res) => Ok(PinnedSlotsPermit( - res.expect("this semaphore is never closed"), - )), + Ok(res) => Ok(PinnedSlotsPermit { + _permit: res.expect("this semaphore is never closed"), + }), Err(_timeout) => { crate::metrics::page_cache_errors_inc( crate::metrics::PageCacheErrorKind::AcquirePinnedSlotTimeout, diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index cd9c48f9af..689bc5cb3c 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -27,7 +27,7 @@ use pageserver_api::models::{ }; use pageserver_api::shard::ShardIndex; use pageserver_api::shard::ShardNumber; -use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, QueryError}; +use postgres_backend::{is_expected_io_error, AuthType, PostgresBackend, QueryError}; use pq_proto::framed::ConnectionError; use pq_proto::FeStartupPacket; use pq_proto::{BeMessage, FeMessage, RowDescriptor}; @@ -44,7 +44,6 @@ use tokio::io::AsyncWriteExt; use tokio::io::{AsyncRead, AsyncWrite}; use tokio_util::io::StreamReader; use tokio_util::sync::CancellationToken; -use tracing::field; use tracing::*; use utils::id::ConnectionId; use utils::sync::gate::GateGuard; diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index c726139524..9959d105eb 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -37,7 +37,6 @@ impl Value { mod test { use super::*; - use bytes::Bytes; use utils::bin_ser::BeSer; macro_rules! 
roundtrip { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index f027e9d4b1..4158133111 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -109,7 +109,6 @@ pub use pageserver_api::models::TenantState; use tokio::sync::Semaphore; static INIT_DB_SEMAPHORE: Lazy = Lazy::new(|| Semaphore::new(8)); -use toml_edit; use utils::{ crashsafe, generation::Generation, @@ -2384,7 +2383,7 @@ impl Tenant { self.tenant_shard_id, self.generation, self.shard_identity, - self.walredo_mgr.as_ref().map(Arc::clone), + self.walredo_mgr.clone(), resources, pg_version, state, @@ -3593,25 +3592,18 @@ pub async fn dump_layerfile_from_path( #[cfg(test)] pub(crate) mod harness { use bytes::{Bytes, BytesMut}; - use camino::Utf8PathBuf; use once_cell::sync::OnceCell; use pageserver_api::models::ShardParameters; use pageserver_api::shard::ShardIndex; - use std::fs; - use std::sync::Arc; use utils::logging; - use utils::lsn::Lsn; use crate::deletion_queue::mock::MockDeletionQueue; use crate::walredo::apply_neon; - use crate::{ - config::PageServerConf, repository::Key, tenant::Tenant, walrecord::NeonWalRecord, - }; + use crate::{repository::Key, walrecord::NeonWalRecord}; use super::*; - use crate::tenant::config::{TenantConf, TenantConfOpt}; use hex_literal::hex; - use utils::id::{TenantId, TimelineId}; + use utils::id::TenantId; pub const TIMELINE_ID: TimelineId = TimelineId::from_array(hex!("11223344556677881122334455667788")); @@ -3840,10 +3832,8 @@ mod tests { use crate::DEFAULT_PG_VERSION; use bytes::BytesMut; use hex_literal::hex; - use once_cell::sync::Lazy; use pageserver_api::keyspace::KeySpace; use rand::{thread_rng, Rng}; - use tokio_util::sync::CancellationToken; static TEST_KEY: Lazy = Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001"))); diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index 9f104aff86..ca30b0ac4f 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -21,7 +21,6 @@ use byteorder::{ReadBytesExt, BE}; use bytes::{BufMut, Bytes, BytesMut}; use either::Either; -use hex; use std::{cmp::Ordering, io, result}; use thiserror::Error; use tracing::error; @@ -700,8 +699,6 @@ impl BuildNode { #[cfg(test)] pub(crate) mod tests { use super::*; - use crate::context::DownloadBehavior; - use crate::task_mgr::TaskKind; use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReaderRef}; use rand::Rng; use std::collections::BTreeMap; diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 2bedbf7f61..e48b9e83bd 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -300,7 +300,7 @@ mod tests { use super::*; use crate::context::DownloadBehavior; use crate::task_mgr::TaskKind; - use crate::tenant::block_io::{BlockCursor, BlockReaderRef}; + use crate::tenant::block_io::BlockReaderRef; use rand::{thread_rng, RngCore}; use std::fs; use std::str::FromStr; diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 805d44f93d..06b61d4631 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -2648,7 +2648,7 @@ pub(crate) async fn immediate_gc( let tenant = guard .get(&tenant_shard_id) - .map(Arc::clone) + .cloned() .with_context(|| format!("tenant {tenant_shard_id}")) .map_err(|e| ApiError::NotFound(e.into()))?; diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 
7d30745a0d..40be2ca8f3 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1791,14 +1791,12 @@ mod tests { context::RequestContext, tenant::{ harness::{TenantHarness, TIMELINE_ID}, - storage_layer::Layer, - Generation, Tenant, Timeline, + Tenant, Timeline, }, DEFAULT_PG_VERSION, }; use std::collections::HashSet; - use utils::lsn::Lsn; pub(super) fn dummy_contents(name: &str) -> Vec { format!("contents for {name}").into() diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs index 147cf683ba..a8b05f4c0e 100644 --- a/pageserver/src/tenant/secondary/heatmap_uploader.rs +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -18,7 +18,6 @@ use crate::{ }; use futures::Future; -use md5; use pageserver_api::shard::TenantShardId; use rand::Rng; use remote_storage::{GenericRemoteStorage, TimeoutOrCancel}; diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 0a707295cc..56cfaeda15 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -43,7 +43,6 @@ use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; use anyhow::{anyhow, bail, ensure, Context, Result}; use bytes::{Bytes, BytesMut}; use camino::{Utf8Path, Utf8PathBuf}; -use hex; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::LayerAccessKind; use pageserver_api::shard::TenantShardId; diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 3a2705bb50..63a2b30d09 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1667,8 +1667,6 @@ mod tests { use super::*; use crate::tenant::harness::*; use crate::tenant::remote_timeline_client::{remote_initdb_archive_path, INITDB_PATH}; - use crate::tenant::Timeline; - use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT; use postgres_ffi::RELSEG_SIZE; use crate::DEFAULT_PG_VERSION; diff --git a/pageserver/src/walredo/apply_neon.rs b/pageserver/src/walredo/apply_neon.rs index 6ce90e0c47..247704e2a5 100644 --- a/pageserver/src/walredo/apply_neon.rs +++ b/pageserver/src/walredo/apply_neon.rs @@ -252,8 +252,6 @@ mod test { use super::*; use std::collections::HashMap; - use crate::{pgdatadir_mapping::AuxFilesDirectory, walrecord::NeonWalRecord}; - /// Test [`apply_in_neon`]'s handling of NeonWalRecord::AuxFile #[test] fn apply_aux_file_deltas() -> anyhow::Result<()> { diff --git a/pageserver/src/walredo/process/no_leak_child.rs b/pageserver/src/walredo/process/no_leak_child.rs index ca016408e6..1a0d7039df 100644 --- a/pageserver/src/walredo/process/no_leak_child.rs +++ b/pageserver/src/walredo/process/no_leak_child.rs @@ -1,7 +1,5 @@ -use tracing; -use tracing::error; -use tracing::info; use tracing::instrument; +use tracing::{error, info}; use crate::metrics::WalRedoKillCause; use crate::metrics::WAL_REDO_PROCESS_COUNTERS; diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 5024ba3744..d5ab66d6aa 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -13,7 +13,7 @@ use proxy::proxy::run_until_cancelled; use tokio::net::TcpListener; use anyhow::{anyhow, bail, ensure, Context}; -use clap::{self, Arg}; +use clap::Arg; use futures::TryFutureExt; use proxy::console::messages::MetricsAuxInfo; use proxy::stream::{PqStream, Stream}; diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs 
index 62015312a9..6e3eb8c1b0 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -358,8 +358,7 @@ impl Cache for ProjectInfoCacheImpl { #[cfg(test)] mod tests { use super::*; - use crate::{console::AuthSecret, scram::ServerSecret}; - use std::{sync::Arc, time::Duration}; + use crate::scram::ServerSecret; #[tokio::test] async fn test_project_info_cache_settings() { diff --git a/proxy/src/console/mgmt.rs b/proxy/src/console/mgmt.rs index 373138b09e..c7a2d467c0 100644 --- a/proxy/src/console/mgmt.rs +++ b/proxy/src/console/mgmt.rs @@ -4,7 +4,7 @@ use crate::{ }; use anyhow::Context; use once_cell::sync::Lazy; -use postgres_backend::{self, AuthType, PostgresBackend, PostgresBackendTCP, QueryError}; +use postgres_backend::{AuthType, PostgresBackend, PostgresBackendTCP, QueryError}; use pq_proto::{BeMessage, SINGLE_COL_ROWDESC}; use std::{convert::Infallible, future}; use tokio::net::{TcpListener, TcpStream}; diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 595d9c4979..d866b1820f 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -16,7 +16,7 @@ use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBacken use crate::console::{self, CachedNodeInfo, NodeInfo}; use crate::error::ErrorKind; use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT}; -use crate::{auth, http, sasl, scram}; +use crate::{http, sasl, scram}; use anyhow::{bail, Context}; use async_trait::async_trait; use rstest::rstest; diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index ed89e51754..e0c2d836f4 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -11,7 +11,6 @@ use bytes::{Bytes, BytesMut}; use futures::{SinkExt, StreamExt}; use postgres_protocol::message::frontend; use tokio::io::{AsyncReadExt, DuplexStream}; -use tokio_postgres::config::SslMode; use tokio_postgres::tls::TlsConnect; use tokio_util::codec::{Decoder, Encoder}; diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 53e7c1c2ee..7d705ba049 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -667,7 +667,6 @@ impl Drop for Client { #[cfg(test)] mod tests { - use env_logger; use std::{mem, sync::atomic::AtomicBool}; use super::*; diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index c39c1dbf28..d822c87c0e 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -19,8 +19,6 @@ use utils::{bin_ser::LeSer, id::TenantTimelineId}; use crate::SafeKeeperConf; -use std::convert::TryInto; - pub const SK_MAGIC: u32 = 0xcafeceefu32; pub const SK_FORMAT_VERSION: u32 = 7; @@ -219,12 +217,9 @@ impl Storage for FileStorage { #[cfg(test)] mod test { - use super::FileStorage; use super::*; - use crate::SafeKeeperConf; - use anyhow::Result; use tokio::fs; - use utils::{id::TenantTimelineId, lsn::Lsn}; + use utils::lsn::Lsn; fn stub_conf() -> SafeKeeperConf { let workdir = camino_tempfile::tempdir().unwrap().into_path(); diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 761541168c..f45bfb95fa 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -2,8 +2,7 @@ //! protocol commands. 
use anyhow::Context; -use std::str::FromStr; -use std::str::{self}; +use std::str::{self, FromStr}; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{debug, info, info_span, Instrument}; @@ -16,8 +15,8 @@ use crate::safekeeper::Term; use crate::timeline::TimelineError; use crate::wal_service::ConnectionId; use crate::{GlobalTimelines, SafeKeeperConf}; +use postgres_backend::PostgresBackend; use postgres_backend::QueryError; -use postgres_backend::{self, PostgresBackend}; use postgres_ffi::PG_TLI; use pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID}; use regex::Regex; From d999c4669240a6dec67311ae8a10c8e0bd026977 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 1 Mar 2024 16:19:40 +0000 Subject: [PATCH 0315/1571] pageserver: handle temp_download files in secondary locations (#6990) ## Problem PR #6837 fixed secondary locations to avoid spamming log warnings on temp files, but we also have ".temp_download" files to consider. ## Summary of changes - Give temp_download files the same behavior as temp files. - Refactor the relevant helper to pub(crate) from pub --- pageserver/src/tenant/remote_timeline_client/download.rs | 2 +- pageserver/src/tenant/secondary/downloader.rs | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 962cf5d12e..167e18a829 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -161,7 +161,7 @@ pub async fn download_layer_file<'a>( const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download"; -pub fn is_temp_download_file(path: &Utf8Path) -> bool { +pub(crate) fn is_temp_download_file(path: &Utf8Path) -> bool { let extension = path.extension(); match extension { Some(TEMP_DOWNLOAD_EXTENSION) => true, diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 5c4e4fd160..b679077358 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -16,7 +16,8 @@ use crate::{ config::SecondaryLocationConfig, debug_assert_current_span_has_tenant_and_timeline_id, remote_timeline_client::{ - index::LayerFileMetadata, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, + index::LayerFileMetadata, is_temp_download_file, FAILED_DOWNLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, }, span::debug_assert_current_span_has_tenant_id, storage_layer::LayerFileName, @@ -788,7 +789,7 @@ async fn init_timeline_state( // Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant. 
warn!(path=?dentry.path(), "found legacy metadata file, these should have been removed in load_tenant_config"); continue; - } else if crate::is_temporary(&file_path) { + } else if crate::is_temporary(&file_path) || is_temp_download_file(&file_path) { // Temporary files are frequently left behind from restarting during downloads tracing::info!("Cleaning up temporary file {file_path}"); if let Err(e) = tokio::fs::remove_file(&file_path) From e34059cd185998d8ae60ba3e2086a7258ec6fdb7 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 1 Mar 2024 16:49:37 +0000 Subject: [PATCH 0316/1571] pageserver: increase DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG (#6970) ## Problem At high ingest rates, pageservers spuriously disconnect from safekeepers because stats updates don't come in frequently enough to keep the broker/safekeeper LSN delta under the wal lag limit. ## Summary of changes - Increase DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG from 10MiB to 1GiB. This should be enough for realistic per-timeline throughputs. --- pageserver/src/tenant/config.rs | 5 ++++- test_runner/regress/test_tenant_conf.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 18c4ea664e..9464324413 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -52,7 +52,10 @@ pub mod defaults { pub const DEFAULT_PITR_INTERVAL: &str = "7 days"; pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds"; pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds"; - pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024; + // The default limit on WAL lag should be set to avoid causing disconnects under high throughput + // scenarios: since the broker stats are updated ~1/s, a value of 1GiB should be sufficient for + // throughputs up to 1GiB/s per timeline. + pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024; pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour"; pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index a2ffd200a6..fc099297e1 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -270,7 +270,7 @@ eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold = "period": "20s", "threshold": "23h", } - assert final_effective_config["max_lsn_wal_lag"] == 10 * 1024 * 1024 + assert final_effective_config["max_lsn_wal_lag"] == 1024 * 1024 * 1024 # restart the pageserver and ensure that the config is still correct env.pageserver.stop() From ea0d35f3ca7b58ba4be820d4a161fd2380806b2b Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Fri, 1 Mar 2024 14:54:07 -0500 Subject: [PATCH 0317/1571] neon_local: improved docs and fix wrong connstr (#6954) The user created with the `--create-test-user` flag is `test` instead of `user`. ref https://github.com/neondatabase/neon/pull/6848 Signed-off-by: Alex Chi Z --- README.md | 2 ++ control_plane/README.md | 26 ++++++++++++++++++++++++++ control_plane/src/endpoint.rs | 2 +- 3 files changed, 29 insertions(+), 1 deletion(-) create mode 100644 control_plane/README.md diff --git a/README.md b/README.md index 95926b4628..c44ae695d6 100644 --- a/README.md +++ b/README.md @@ -230,6 +230,8 @@ postgres=# select * from t; > cargo neon stop ``` +More advanced usages can be found at [Control Plane and Neon Local](./control_plane/README.md). 
+ #### Handling build failures If you encounter errors during setting up the initial tenant, it's best to stop everything (`cargo neon stop`) and remove the `.neon` directory. Then fix the problems, and start the setup again. diff --git a/control_plane/README.md b/control_plane/README.md new file mode 100644 index 0000000000..827aba5c1f --- /dev/null +++ b/control_plane/README.md @@ -0,0 +1,26 @@ +# Control Plane and Neon Local + +This crate contains tools to start a Neon development environment locally. This utility can be used with the `cargo neon` command. + +## Example: Start with Postgres 16 + +To create and start a local development environment with Postgres 16, you will need to provide `--pg-version` flag to 3 of the start-up commands. + +```shell +cargo neon init --pg-version 16 +cargo neon start +cargo neon tenant create --set-default --pg-version 16 +cargo neon endpoint create main --pg-version 16 +cargo neon endpoint start main +``` + +## Example: Create Test User and Database + +By default, `cargo neon` starts an endpoint with `cloud_admin` and `postgres` database. If you want to have a role and a database similar to what we have on the cloud service, you can do it with the following commands when starting an endpoint. + +```shell +cargo neon endpoint create main --pg-version 16 --update-catalog true +cargo neon endpoint start main --create-test-user true +``` + +The first command creates `neon_superuser` and necessary roles. The second command creates `test` user and `neondb` database. You will see a connection string that connects you to the test user after running the second command. diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index de7eb797d6..5a75bc2a1d 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -605,7 +605,7 @@ impl Endpoint { let conn_str = self.connstr("cloud_admin", "postgres"); println!("Starting postgres node at '{}'", conn_str); if create_test_user { - let conn_str = self.connstr("user", "neondb"); + let conn_str = self.connstr("test", "neondb"); println!("Also at '{}'", conn_str); } let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl")); From 20d0939b0032a4ed99359af33f2bbc253de4807a Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 1 Mar 2024 20:25:53 +0000 Subject: [PATCH 0318/1571] control_plane/attachment_service: implement PlacementPolicy::Secondary, configuration updates (#6521) During onboarding, the control plane may attempt ad-hoc creation of a secondary location to facilitate live migration. This gives us two problems to solve: - Accept 'Secondary' mode in /location_config and use it to put the tenant into secondary mode on some physical pageserver, then pass through /tenant/xyz/secondary/download requests - Create tenants with no generation initially, since the initial `Secondary` mode call will not provide us a generation. This PR also fixes modification of a tenant's TenantConf during /location_conf, which was previously ignored, and refines the flow for config modification: - avoid bumping generations when the only reason we're reconciling an attached location is a config change - increment TenantState.sequence when spawning a reconciler: usually schedule() does this, but when we do config changes that doesn't happen, so without this change waiters would think reconciliation was done immediately. 
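To make the waiter interaction concrete, here is a deliberately simplified model of why spawning a reconciler has to bump the sequence before anyone waits on it. This is an assumption-level sketch, not the real `TenantState`, which also uses the sequence to detect whether an in-flight reconciler is already working on the current state:

```python
import threading


class SequenceWaiter:
    """Toy model of the sequence/waiter interplay described above."""

    def __init__(self) -> None:
        self.sequence = 0    # bumped when a reconcile is spawned
        self.observed = 0    # advanced when that reconcile completes
        self._cond = threading.Condition()

    def spawn_reconcile(self) -> int:
        with self._cond:
            # Without this increment, a fresh waiter would see
            # observed >= sequence and return before any work happened.
            self.sequence += 1
            return self.sequence

    def reconcile_done(self, seq: int) -> None:
        with self._cond:
            self.observed = max(self.observed, seq)
            self._cond.notify_all()

    def wait_for(self, seq: int, timeout: float = 10.0) -> bool:
        with self._cond:
            return self._cond.wait_for(lambda: self.observed >= seq, timeout)
```

A config-only change spawns a reconcile and hands the caller `seq = spawn_reconcile()`; the caller's `wait_for(seq)` then blocks until `reconcile_done(seq)` runs, which is exactly the behaviour the missing increment used to break.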
`sequence` is a bit of a murky thing right now, as it's dual-purposed for tracking waiters, and for checking if an existing reconciliation is already making updates to our current sequence. I'll follow up at some point to clarify it's purpose. - test config modification at the end of onboarding test --- .../down.sql | 2 + .../2024-02-29-094122_generations_null/up.sql | 4 + control_plane/attachment_service/src/http.rs | 48 +- control_plane/attachment_service/src/lib.rs | 10 +- .../attachment_service/src/persistence.rs | 101 ++- .../attachment_service/src/reconciler.rs | 73 +- .../attachment_service/src/schema.rs | 4 +- .../attachment_service/src/service.rs | 623 +++++++++++++----- .../attachment_service/src/tenant_state.rs | 115 +++- libs/utils/src/generation.rs | 2 +- test_runner/regress/test_sharding_service.py | 91 ++- 11 files changed, 842 insertions(+), 231 deletions(-) create mode 100644 control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/down.sql create mode 100644 control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/up.sql diff --git a/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/down.sql b/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/down.sql new file mode 100644 index 0000000000..503231f69d --- /dev/null +++ b/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/down.sql @@ -0,0 +1,2 @@ +ALTER TABLE tenant_shards ALTER generation SET NOT NULL; +ALTER TABLE tenant_shards ALTER generation_pageserver SET NOT NULL; diff --git a/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/up.sql b/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/up.sql new file mode 100644 index 0000000000..7e1e3cfe90 --- /dev/null +++ b/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/up.sql @@ -0,0 +1,4 @@ + + +ALTER TABLE tenant_shards ALTER generation DROP NOT NULL; +ALTER TABLE tenant_shards ALTER generation_pageserver DROP NOT NULL; \ No newline at end of file diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs index f1153c2c18..384bdcef0c 100644 --- a/control_plane/attachment_service/src/http.rs +++ b/control_plane/attachment_service/src/http.rs @@ -1,9 +1,10 @@ use crate::reconciler::ReconcileError; use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT}; +use crate::PlacementPolicy; use hyper::{Body, Request, Response}; use hyper::{StatusCode, Uri}; use pageserver_api::models::{ - TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest, + TenantConfigRequest, TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest, TenantTimeTravelRequest, TimelineCreateRequest, }; use pageserver_api::shard::TenantShardId; @@ -117,9 +118,14 @@ async fn handle_tenant_create( check_permissions(&req, Scope::PageServerApi)?; let create_req = json_request::(&mut req).await?; + + // TODO: enable specifying this. Using Single as a default helps legacy tests to work (they + // have no expectation of HA). 
+ let placement_policy = PlacementPolicy::Single; + json_response( StatusCode::CREATED, - service.tenant_create(create_req).await?, + service.tenant_create(create_req, placement_policy).await?, ) } @@ -185,6 +191,27 @@ async fn handle_tenant_location_config( ) } +async fn handle_tenant_config_set( + service: Arc, + mut req: Request, +) -> Result, ApiError> { + check_permissions(&req, Scope::PageServerApi)?; + + let config_req = json_request::(&mut req).await?; + + json_response(StatusCode::OK, service.tenant_config_set(config_req).await?) +} + +async fn handle_tenant_config_get( + service: Arc, + req: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + + json_response(StatusCode::OK, service.tenant_config_get(tenant_id)?) +} + async fn handle_tenant_time_travel_remote_storage( service: Arc, mut req: Request, @@ -216,7 +243,15 @@ async fn handle_tenant_time_travel_remote_storage( done_if_after_raw, ) .await?; + json_response(StatusCode::OK, ()) +} +async fn handle_tenant_secondary_download( + service: Arc, + req: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + service.tenant_secondary_download(tenant_id).await?; json_response(StatusCode::OK, ()) } @@ -551,12 +586,21 @@ pub fn make_router( .delete("/v1/tenant/:tenant_id", |r| { tenant_service_handler(r, handle_tenant_delete) }) + .put("/v1/tenant/config", |r| { + tenant_service_handler(r, handle_tenant_config_set) + }) + .get("/v1/tenant/:tenant_id/config", |r| { + tenant_service_handler(r, handle_tenant_config_get) + }) .put("/v1/tenant/:tenant_id/location_config", |r| { tenant_service_handler(r, handle_tenant_location_config) }) .put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| { tenant_service_handler(r, handle_tenant_time_travel_remote_storage) }) + .post("/v1/tenant/:tenant_id/secondary/download", |r| { + tenant_service_handler(r, handle_tenant_secondary_download) + }) // Timeline operations .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| { tenant_service_handler(r, handle_tenant_timeline_delete) diff --git a/control_plane/attachment_service/src/lib.rs b/control_plane/attachment_service/src/lib.rs index ce613e858f..7ae7e264c7 100644 --- a/control_plane/attachment_service/src/lib.rs +++ b/control_plane/attachment_service/src/lib.rs @@ -13,14 +13,20 @@ mod schema; pub mod service; mod tenant_state; -#[derive(Clone, Serialize, Deserialize, Debug)] +#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)] enum PlacementPolicy { /// Cheapest way to attach a tenant: just one pageserver, no secondary Single, /// Production-ready way to attach a tenant: one attached pageserver and /// some number of secondaries. Double(usize), - /// Do not attach to any pageservers + /// Create one secondary mode locations. This is useful when onboarding + /// a tenant, or for an idle tenant that we might want to bring online quickly. + Secondary, + + /// Do not attach to any pageservers. This is appropriate for tenants that + /// have been idle for a long time, where we do not mind some delay in making + /// them available in future. 
Detached, } diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs index 4c6eb2291c..d5c304385c 100644 --- a/control_plane/attachment_service/src/persistence.rs +++ b/control_plane/attachment_service/src/persistence.rs @@ -333,7 +333,15 @@ impl Persistence { shard_number: ShardNumber(tsp.shard_number as u8), shard_count: ShardCount::new(tsp.shard_count as u8), }; - result.insert(tenant_shard_id, Generation::new(tsp.generation as u32)); + + let Some(g) = tsp.generation else { + // If the generation_pageserver column was non-NULL, then the generation column should also be non-NULL: + // we only set generation_pageserver when setting generation. + return Err(DatabaseError::Logical( + "Generation should always be set after incrementing".to_string(), + )); + }; + result.insert(tenant_shard_id, Generation::new(g as u32)); } Ok(result) @@ -366,7 +374,85 @@ impl Persistence { }) .await?; - Ok(Generation::new(updated.generation as u32)) + // Generation is always non-null in the rseult: if the generation column had been NULL, then we + // should have experienced an SQL Confilict error while executing a query that tries to increment it. + debug_assert!(updated.generation.is_some()); + let Some(g) = updated.generation else { + return Err(DatabaseError::Logical( + "Generation should always be set after incrementing".to_string(), + ) + .into()); + }; + + Ok(Generation::new(g as u32)) + } + + /// For use when updating a persistent property of a tenant, such as its config or placement_policy. + /// + /// Do not use this for settting generation, unless in the special onboarding code path (/location_config) + /// API: use [`Self::increment_generation`] instead. Setting the generation via this route is a one-time thing + /// that we only do the first time a tenant is set to an attached policy via /location_config. 
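As an aside on the persistence format: the placement policy above is written to the `placement_policy` column as JSON text via `serde_json::to_string`. A minimal sketch (not part of the patch) of what each variant looks like on disk, assuming serde's default externally tagged enum encoding and `PlacementPolicy` in scope:

```rust
fn placement_policy_column_values() {
    // Unit variants become bare JSON strings; the newtype variant is tagged.
    assert_eq!(serde_json::to_string(&PlacementPolicy::Single).unwrap(), r#""Single""#);
    assert_eq!(serde_json::to_string(&PlacementPolicy::Double(1)).unwrap(), r#"{"Double":1}"#);
    assert_eq!(serde_json::to_string(&PlacementPolicy::Secondary).unwrap(), r#""Secondary""#);
    assert_eq!(serde_json::to_string(&PlacementPolicy::Detached).unwrap(), r#""Detached""#);
}
```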
+ pub(crate) async fn update_tenant_shard( + &self, + tenant_shard_id: TenantShardId, + input_placement_policy: PlacementPolicy, + input_config: TenantConfig, + input_generation: Option, + ) -> DatabaseResult<()> { + use crate::schema::tenant_shards::dsl::*; + + self.with_conn(move |conn| { + let query = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)); + + if let Some(input_generation) = input_generation { + // Update includes generation column + query + .set(( + generation.eq(Some(input_generation.into().unwrap() as i32)), + placement_policy + .eq(serde_json::to_string(&input_placement_policy).unwrap()), + config.eq(serde_json::to_string(&input_config).unwrap()), + )) + .execute(conn)?; + } else { + // Update does not include generation column + query + .set(( + placement_policy + .eq(serde_json::to_string(&input_placement_policy).unwrap()), + config.eq(serde_json::to_string(&input_config).unwrap()), + )) + .execute(conn)?; + } + + Ok(()) + }) + .await?; + + Ok(()) + } + + pub(crate) async fn update_tenant_config( + &self, + input_tenant_id: TenantId, + input_config: TenantConfig, + ) -> DatabaseResult<()> { + use crate::schema::tenant_shards::dsl::*; + + self.with_conn(move |conn| { + diesel::update(tenant_shards) + .filter(tenant_id.eq(input_tenant_id.to_string())) + .set((config.eq(serde_json::to_string(&input_config).unwrap()),)) + .execute(conn)?; + + Ok(()) + }) + .await?; + + Ok(()) } pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> { @@ -377,7 +463,7 @@ impl Persistence { .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) .set(( - generation_pageserver.eq(i64::MAX), + generation_pageserver.eq(Option::::None), placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()), )) .execute(conn)?; @@ -503,12 +589,15 @@ pub(crate) struct TenantShardPersistence { pub(crate) shard_stripe_size: i32, // Latest generation number: next time we attach, increment this - // and use the incremented number when attaching - pub(crate) generation: i32, + // and use the incremented number when attaching. + // + // Generation is only None when first onboarding a tenant, where it may + // be in PlacementPolicy::Secondary and therefore have no valid generation state. + pub(crate) generation: Option, // Currently attached pageserver #[serde(rename = "pageserver")] - pub(crate) generation_pageserver: i64, + pub(crate) generation_pageserver: Option, #[serde(default)] pub(crate) placement_policy: String, diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs index ce91c1f5e9..b633b217c7 100644 --- a/control_plane/attachment_service/src/reconciler.rs +++ b/control_plane/attachment_service/src/reconciler.rs @@ -26,7 +26,7 @@ pub(super) struct Reconciler { /// of a tenant's state from when we spawned a reconcile task. 
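Elsewhere in this patch the now-nullable generation is converted between its database and in-memory forms in two places (loading in `service.rs`, storing back in `tenant_state.rs`). A condensed sketch of that round trip, assuming `Generation` is in scope; these helpers are illustrative only and do not exist in the code:

```rust
// Load path: a NULL column becomes `None`, anything else wraps into a Generation.
fn generation_from_row(generation: Option<i32>) -> Option<Generation> {
    generation.map(|g| Generation::new(g as u32))
}

// Store path: `None` stays NULL; otherwise the numeric value is written back.
fn generation_to_row(generation: Option<Generation>) -> Option<i32> {
    generation.map(|g| g.into().unwrap_or(0) as i32)
}
```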
pub(super) tenant_shard_id: TenantShardId, pub(crate) shard: ShardIdentity, - pub(crate) generation: Generation, + pub(crate) generation: Option, pub(crate) intent: TargetState, pub(crate) config: TenantConfig, pub(crate) observed: ObservedState, @@ -312,7 +312,7 @@ impl Reconciler { &self.shard, &self.config, LocationConfigMode::AttachedStale, - Some(self.generation), + self.generation, None, ); self.location_config(origin_ps_id, stale_conf, Some(Duration::from_secs(10))) @@ -335,16 +335,17 @@ impl Reconciler { } // Increment generation before attaching to new pageserver - self.generation = self - .persistence - .increment_generation(self.tenant_shard_id, dest_ps_id) - .await?; + self.generation = Some( + self.persistence + .increment_generation(self.tenant_shard_id, dest_ps_id) + .await?, + ); let dest_conf = build_location_config( &self.shard, &self.config, LocationConfigMode::AttachedMulti, - Some(self.generation), + self.generation, None, ); @@ -401,7 +402,7 @@ impl Reconciler { &self.shard, &self.config, LocationConfigMode::AttachedSingle, - Some(self.generation), + self.generation, None, ); self.location_config(dest_ps_id, dest_final_conf.clone(), None) @@ -433,22 +434,62 @@ impl Reconciler { // If the attached pageserver is not attached, do so now. if let Some(node_id) = self.intent.attached { - let mut wanted_conf = - attached_location_conf(self.generation, &self.shard, &self.config); + // If we are in an attached policy, then generation must have been set (null generations + // are only present when a tenant is initially loaded with a secondary policy) + debug_assert!(self.generation.is_some()); + let Some(generation) = self.generation else { + return Err(ReconcileError::Other(anyhow::anyhow!( + "Attempted to attach with NULL generation" + ))); + }; + + let mut wanted_conf = attached_location_conf(generation, &self.shard, &self.config); match self.observed.locations.get(&node_id) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => { // Nothing to do tracing::info!(%node_id, "Observed configuration already correct.") } - _ => { + observed => { // In all cases other than a matching observed configuration, we will // reconcile this location. This includes locations with different configurations, as well // as locations with unknown (None) observed state. - self.generation = self - .persistence - .increment_generation(self.tenant_shard_id, node_id) - .await?; - wanted_conf.generation = self.generation.into(); + + // The general case is to increment the generation. However, there are cases + // where this is not necessary: + // - if we are only updating the TenantConf part of the location + // - if we are only changing the attachment mode (e.g. going to attachedmulti or attachedstale) + // and the location was already in the correct generation + let increment_generation = match observed { + None => true, + Some(ObservedStateLocation { conf: None }) => true, + Some(ObservedStateLocation { + conf: Some(observed), + }) => { + let generations_match = observed.generation == wanted_conf.generation; + + use LocationConfigMode::*; + let mode_transition_requires_gen_inc = + match (observed.mode, wanted_conf.mode) { + // Usually the short-lived attachment modes (multi and stale) are only used + // in the case of [`Self::live_migrate`], but it is simple to handle them correctly + // here too. Locations are allowed to go Single->Stale and Multi->Single within the same generation. 
+ (AttachedSingle, AttachedStale) => false, + (AttachedMulti, AttachedSingle) => false, + (lhs, rhs) => lhs != rhs, + }; + + !generations_match || mode_transition_requires_gen_inc + } + }; + + if increment_generation { + let generation = self + .persistence + .increment_generation(self.tenant_shard_id, node_id) + .await?; + self.generation = Some(generation); + wanted_conf.generation = generation.into(); + } tracing::info!(%node_id, "Observed configuration requires update."); self.location_config(node_id, wanted_conf, None).await?; self.compute_notify().await?; diff --git a/control_plane/attachment_service/src/schema.rs b/control_plane/attachment_service/src/schema.rs index db5a957443..76e4e56a66 100644 --- a/control_plane/attachment_service/src/schema.rs +++ b/control_plane/attachment_service/src/schema.rs @@ -17,8 +17,8 @@ diesel::table! { shard_number -> Int4, shard_count -> Int4, shard_stripe_size -> Int4, - generation -> Int4, - generation_pageserver -> Int8, + generation -> Nullable, + generation_pageserver -> Nullable, placement_policy -> Varchar, splitting -> Int2, config -> Text, diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index 02c1a65545..4209b62db3 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -14,10 +14,13 @@ use control_plane::attachment_service::{ use diesel::result::DatabaseErrorKind; use futures::{stream::FuturesUnordered, StreamExt}; use hyper::StatusCode; -use pageserver_api::controller_api::{ - NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, - TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse, - TenantLocateResponseShard, TenantShardMigrateRequest, TenantShardMigrateResponse, +use pageserver_api::{ + controller_api::{ + NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, + TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse, + TenantLocateResponseShard, TenantShardMigrateRequest, TenantShardMigrateResponse, + }, + models::TenantConfigRequest, }; use pageserver_api::{ models::{ @@ -65,6 +68,11 @@ const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5); // some data in it. const RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); +// If we receive a call using Secondary mode initially, it will omit generation. We will initialize +// tenant shards into this generation, and as long as it remains in this generation, we will accept +// input generation from future requests as authoritative. +const INITIAL_GENERATION: Generation = Generation::new(0); + /// How long [`Service::startup_reconcile`] is allowed to take before it should give /// up on unresponsive pageservers and proceed. pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); @@ -167,6 +175,21 @@ impl From for ApiError { } } +#[allow(clippy::large_enum_variant)] +enum TenantCreateOrUpdate { + Create((TenantCreateRequest, PlacementPolicy)), + Update(Vec), +} + +struct ShardUpdate { + tenant_shard_id: TenantShardId, + placement_policy: PlacementPolicy, + tenant_config: TenantConfig, + + /// If this is None, generation is not updated. + generation: Option, +} + impl Service { pub fn get_config(&self) -> &Config { &self.config @@ -571,6 +594,9 @@ impl Service { // the shard so that a future [`TenantState::maybe_reconcile`] will try again. 
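To make the increment rule in the reconciler change above easier to follow, here is a self-contained sketch with simplified stand-in types; the real code compares whole `LocationConfig` values, and `must_increment_generation` is a hypothetical helper, not something this patch adds:

```rust
#[derive(Clone, Copy, PartialEq)]
enum Mode {
    AttachedSingle,
    AttachedMulti,
    AttachedStale,
}

/// `observed` is the (mode, generation) last seen on the node, if known at all.
fn must_increment_generation(observed: Option<(Mode, u32)>, wanted: (Mode, u32)) -> bool {
    let Some((observed_mode, observed_gen)) = observed else {
        // Unknown or incomplete observed state: increment to be safe.
        return true;
    };
    let (wanted_mode, wanted_gen) = wanted;
    let generations_match = observed_gen == wanted_gen;
    // Single->Stale and Multi->Single may happen within one generation; any other
    // change of mode forces a new generation.
    let mode_change_needs_increment = match (observed_mode, wanted_mode) {
        (Mode::AttachedSingle, Mode::AttachedStale) => false,
        (Mode::AttachedMulti, Mode::AttachedSingle) => false,
        (lhs, rhs) => lhs != rhs,
    };
    !generations_match || mode_change_needs_increment
}
```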
tenant.pending_compute_notification = result.pending_compute_notification; + // Let the TenantState know it is idle. + tenant.reconcile_complete(result.sequence); + match result.result { Ok(()) => { for (node_id, loc) in &result.observed.locations { @@ -661,8 +687,8 @@ impl Service { // after when pageservers start up and register. let mut node_ids = HashSet::new(); for tsp in &tenant_shard_persistence { - if tsp.generation_pageserver != i64::MAX { - node_ids.insert(tsp.generation_pageserver); + if let Some(node_id) = tsp.generation_pageserver { + node_ids.insert(node_id); } } for node_id in node_ids { @@ -699,18 +725,15 @@ impl Service { // We will populate intent properly later in [`Self::startup_reconcile`], initially populate // it with what we can infer: the node for which a generation was most recently issued. let mut intent = IntentState::new(); - if tsp.generation_pageserver != i64::MAX { - intent.set_attached( - &mut scheduler, - Some(NodeId(tsp.generation_pageserver as u64)), - ); + if let Some(generation_pageserver) = tsp.generation_pageserver { + intent.set_attached(&mut scheduler, Some(NodeId(generation_pageserver as u64))); } let new_tenant = TenantState { tenant_shard_id, shard: shard_identity, sequence: Sequence::initial(), - generation: Generation::new(tsp.generation as u32), + generation: tsp.generation.map(|g| Generation::new(g as u32)), policy: serde_json::from_str(&tsp.placement_policy).unwrap(), intent, observed: ObservedState::new(), @@ -790,8 +813,8 @@ impl Service { shard_number: attach_req.tenant_shard_id.shard_number.0 as i32, shard_count: attach_req.tenant_shard_id.shard_count.literal() as i32, shard_stripe_size: 0, - generation: 0, - generation_pageserver: i64::MAX, + generation: Some(0), + generation_pageserver: None, placement_policy: serde_json::to_string(&PlacementPolicy::default()).unwrap(), config: serde_json::to_string(&TenantConfig::default()).unwrap(), splitting: SplitState::default(), @@ -846,7 +869,7 @@ impl Service { .expect("Checked for existence above"); if let Some(new_generation) = new_generation { - tenant_state.generation = new_generation; + tenant_state.generation = Some(new_generation); } else { // This is a detach notification. We must update placement policy to avoid re-attaching // during background scheduling/reconciliation, or during attachment service restart. @@ -896,7 +919,7 @@ impl Service { node_id, ObservedStateLocation { conf: Some(attached_location_conf( - tenant_state.generation, + tenant_state.generation.unwrap(), &tenant_state.shard, &tenant_state.config, )), @@ -910,7 +933,7 @@ impl Service { Ok(AttachHookResponse { gen: attach_req .node_id - .map(|_| tenant_state.generation.into().unwrap()), + .map(|_| tenant_state.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap()), }) } @@ -923,7 +946,7 @@ impl Service { attachment: tenant_state.and_then(|s| { s.intent .get_attached() - .map(|ps| (s.generation.into().unwrap(), ps)) + .map(|ps| (s.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap(), ps)) }), } } @@ -973,7 +996,17 @@ impl Service { continue; }; - shard_state.generation = std::cmp::max(shard_state.generation, new_gen); + // If [`Persistence::re_attach`] selected this shard, it must have alread + // had a generation set. 
+ debug_assert!(shard_state.generation.is_some()); + let Some(old_gen) = shard_state.generation else { + // Should never happen: would only return incremented generation + // for a tenant that already had a non-null generation. + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Generation must be set while re-attaching" + ))); + }; + shard_state.generation = Some(std::cmp::max(old_gen, new_gen)); if let Some(observed) = shard_state .observed .locations @@ -1003,7 +1036,7 @@ impl Service { for req_tenant in validate_req.tenants { if let Some(tenant_state) = locked.tenants.get(&req_tenant.id) { - let valid = tenant_state.generation == Generation::new(req_tenant.gen); + let valid = tenant_state.generation == Some(Generation::new(req_tenant.gen)); tracing::info!( "handle_validate: {}(gen {}): valid={valid} (latest {:?})", req_tenant.id, @@ -1030,8 +1063,9 @@ impl Service { pub(crate) async fn tenant_create( &self, create_req: TenantCreateRequest, + placement_policy: PlacementPolicy, ) -> Result { - let (response, waiters) = self.do_tenant_create(create_req).await?; + let (response, waiters) = self.do_tenant_create(create_req, placement_policy).await?; self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await?; Ok(response) @@ -1040,6 +1074,7 @@ impl Service { pub(crate) async fn do_tenant_create( &self, create_req: TenantCreateRequest, + placement_policy: PlacementPolicy, ) -> Result<(TenantCreateResponse, Vec), ApiError> { // This service expects to handle sharding itself: it is an error to try and directly create // a particular shard here. @@ -1065,9 +1100,27 @@ impl Service { }) .collect::>(); - // TODO: enable specifying this. Using Single as a default helps legacy tests to work (they - // have no expectation of HA). - let placement_policy: PlacementPolicy = PlacementPolicy::Single; + // If the caller specifies a None generation, it means "start from default". This is different + // to [`Self::tenant_location_config`], where a None generation is used to represent + // an incompletely-onboarded tenant. + let initial_generation = if matches!(placement_policy, PlacementPolicy::Secondary) { + tracing::info!( + "tenant_create: secondary mode, generation is_some={}", + create_req.generation.is_some() + ); + create_req.generation.map(Generation::new) + } else { + tracing::info!( + "tenant_create: not secondary mode, generation is_some={}", + create_req.generation.is_some() + ); + Some( + create_req + .generation + .map(Generation::new) + .unwrap_or(INITIAL_GENERATION), + ) + }; // Ordering: we persist tenant shards before creating them on the pageserver. This enables a caller // to clean up after themselves by issuing a tenant deletion if something goes wrong and we restart @@ -1079,8 +1132,10 @@ impl Service { shard_number: tenant_shard_id.shard_number.0 as i32, shard_count: tenant_shard_id.shard_count.literal() as i32, shard_stripe_size: create_req.shard_parameters.stripe_size.0 as i32, - generation: create_req.generation.map(|g| g as i32).unwrap_or(0), - generation_pageserver: i64::MAX, + generation: initial_generation.map(|g| g.into().unwrap() as i32), + // The pageserver is not known until scheduling happens: we will set this column when + // incrementing the generation the first time we attach to a pageserver. 
+ generation_pageserver: None, placement_policy: serde_json::to_string(&placement_policy).unwrap(), config: serde_json::to_string(&create_req.config).unwrap(), splitting: SplitState::default(), @@ -1120,15 +1175,17 @@ impl Service { )) })?; - response_shards.push(TenantCreateResponseShard { - shard_id: tenant_shard_id, - node_id: entry + if let Some(node_id) = entry.get().intent.get_attached() { + let generation = entry .get() - .intent - .get_attached() - .expect("We just set pageserver if it was None"), - generation: entry.get().generation.into().unwrap(), - }); + .generation + .expect("Generation is set when in attached mode"); + response_shards.push(TenantCreateResponseShard { + shard_id: tenant_shard_id, + node_id: *node_id, + generation: generation.into().unwrap(), + }); + } continue; } @@ -1142,9 +1199,7 @@ impl Service { placement_policy.clone(), ); - if let Some(create_gen) = create_req.generation { - state.generation = Generation::new(create_gen); - } + state.generation = initial_generation; state.config = create_req.config.clone(); state.schedule(scheduler).map_err(|e| { @@ -1153,14 +1208,18 @@ impl Service { )) })?; - response_shards.push(TenantCreateResponseShard { - shard_id: tenant_shard_id, - node_id: state - .intent - .get_attached() - .expect("We just set pageserver if it was None"), - generation: state.generation.into().unwrap(), - }); + // Only include shards in result if we are attaching: the purpose + // of the response is to tell the caller where the shards are attached. + if let Some(node_id) = state.intent.get_attached() { + let generation = state + .generation + .expect("Generation is set when in attached mode"); + response_shards.push(TenantCreateResponseShard { + shard_id: tenant_shard_id, + node_id: *node_id, + generation: generation.into().unwrap(), + }); + } entry.insert(state) } }; @@ -1214,12 +1273,114 @@ impl Service { Ok(()) } - /// This API is used by the cloud control plane to do coarse-grained control of tenants: - /// - Call with mode Attached* to upsert the tenant. - /// - Call with mode Detached to switch to PolicyMode::Detached + /// Part of [`Self::tenant_location_config`]: dissect an incoming location config request, + /// and transform it into either a tenant creation of a series of shard updates. + fn tenant_location_config_prepare( + &self, + tenant_id: TenantId, + req: TenantLocationConfigRequest, + ) -> TenantCreateOrUpdate { + let mut updates = Vec::new(); + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, _scheduler) = locked.parts_mut(); + + // Use location config mode as an indicator of policy. + let placement_policy = match req.config.mode { + LocationConfigMode::Detached => PlacementPolicy::Detached, + LocationConfigMode::Secondary => PlacementPolicy::Secondary, + LocationConfigMode::AttachedMulti + | LocationConfigMode::AttachedSingle + | LocationConfigMode::AttachedStale => { + if nodes.len() > 1 { + PlacementPolicy::Double(1) + } else { + // Convenience for dev/test: if we just have one pageserver, import + // tenants into Single mode so that scheduling will succeed. + PlacementPolicy::Single + } + } + }; + + let mut create = true; + for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { + // Saw an existing shard: this is not a creation + create = false; + + // Shards may have initially been created by a Secondary request, where we + // would have left generation as None. 
+ // + // We only update generation the first time we see an attached-mode request, + // and if there is no existing generation set. The caller is responsible for + // ensuring that no non-storage-controller pageserver ever uses a higher + // generation than they passed in here. + use LocationConfigMode::*; + let set_generation = match req.config.mode { + AttachedMulti | AttachedSingle | AttachedStale if shard.generation.is_none() => { + req.config.generation.map(Generation::new) + } + _ => None, + }; + + if shard.policy != placement_policy + || shard.config != req.config.tenant_conf + || set_generation.is_some() + { + updates.push(ShardUpdate { + tenant_shard_id: *shard_id, + placement_policy: placement_policy.clone(), + tenant_config: req.config.tenant_conf.clone(), + generation: set_generation, + }); + } + } + + if create { + use LocationConfigMode::*; + let generation = match req.config.mode { + AttachedMulti | AttachedSingle | AttachedStale => req.config.generation, + // If a caller provided a generation in a non-attached request, ignore it + // and leave our generation as None: this enables a subsequent update to set + // the generation when setting an attached mode for the first time. + _ => None, + }; + + TenantCreateOrUpdate::Create( + // Synthesize a creation request + ( + TenantCreateRequest { + new_tenant_id: TenantShardId::unsharded(tenant_id), + generation, + shard_parameters: ShardParameters { + // Must preserve the incoming shard_count do distinguish unsharded (0) + // from single-sharded (1): this distinction appears in the S3 keys of the tenant. + count: req.tenant_id.shard_count, + // We only import un-sharded or single-sharded tenants, so stripe + // size can be made up arbitrarily here. + stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE, + }, + config: req.config.tenant_conf, + }, + placement_policy, + ), + ) + } else { + TenantCreateOrUpdate::Update(updates) + } + } + + /// This API is used by the cloud control plane to migrate unsharded tenants that it created + /// directly with pageservers into this service. /// - /// In future, calling with mode Secondary may switch to a detach-lite mode in which a tenant only has - /// secondary locations. + /// Cloud control plane MUST NOT continue issuing GENERATION NUMBERS for this tenant once it + /// has attempted to call this API. Failure to oblige to this rule may lead to S3 corruption. + /// Think of the first attempt to call this API as a transfer of absolute authority over the + /// tenant's source of generation numbers. + /// + /// The mode in this request coarse-grained control of tenants: + /// - Call with mode Attached* to upsert the tenant. 
+ /// - Call with mode Secondary to either onboard a tenant without attaching it, or + /// to set an existing tenant to PolicyMode::Secondary + /// - Call with mode Detached to switch to PolicyMode::Detached pub(crate) async fn tenant_location_config( &self, tenant_id: TenantId, @@ -1231,131 +1392,96 @@ impl Service { ))); } - let mut waiters = Vec::new(); + // First check if this is a creation or an update + let create_or_update = self.tenant_location_config_prepare(tenant_id, req); + let mut result = TenantLocationConfigResponse { shards: Vec::new() }; - let maybe_create = { - let mut locked = self.inner.write().unwrap(); - let result_tx = locked.result_tx.clone(); - let compute_hook = locked.compute_hook.clone(); - let (nodes, tenants, scheduler) = locked.parts_mut(); + let waiters = match create_or_update { + TenantCreateOrUpdate::Create((create_req, placement_policy)) => { + let (create_resp, waiters) = + self.do_tenant_create(create_req, placement_policy).await?; + result.shards = create_resp + .shards + .into_iter() + .map(|s| TenantShardLocation { + node_id: s.node_id, + shard_id: s.shard_id, + }) + .collect(); + waiters + } + TenantCreateOrUpdate::Update(updates) => { + // Persist updates + // Ordering: write to the database before applying changes in-memory, so that + // we will not appear time-travel backwards on a restart. + for ShardUpdate { + tenant_shard_id, + placement_policy, + tenant_config, + generation, + } in &updates + { + self.persistence + .update_tenant_shard( + *tenant_shard_id, + placement_policy.clone(), + tenant_config.clone(), + *generation, + ) + .await?; + } - // Maybe we have existing shards - let mut create = true; - for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { - // Saw an existing shard: this is not a creation - create = false; + // Apply updates in-memory + let mut waiters = Vec::new(); + { + let mut locked = self.inner.write().unwrap(); + let result_tx = locked.result_tx.clone(); + let compute_hook = locked.compute_hook.clone(); + let (nodes, tenants, scheduler) = locked.parts_mut(); - // Note that for existing tenants we do _not_ respect the generation in the request: this is likely - // to be stale. Once a tenant is created in this service, our view of generation is authoritative, and - // callers' generations may be ignored. This represents a one-way migration of tenants from the outer - // cloud control plane into this service. + for ShardUpdate { + tenant_shard_id, + placement_policy, + tenant_config, + generation: update_generation, + } in updates + { + let Some(shard) = tenants.get_mut(&tenant_shard_id) else { + tracing::warn!("Shard {tenant_shard_id} removed while updating"); + continue; + }; - // Use location config mode as an indicator of policy: if they ask for - // attached we go to default HA attached mode. If they ask for secondary - // we go to secondary-only mode. If they ask for detached we detach. - match req.config.mode { - LocationConfigMode::Detached => { - shard.policy = PlacementPolicy::Detached; - } - LocationConfigMode::Secondary => { - // TODO: implement secondary-only mode. - todo!(); - } - LocationConfigMode::AttachedMulti - | LocationConfigMode::AttachedSingle - | LocationConfigMode::AttachedStale => { - // TODO: persistence for changes in policy - if nodes.len() > 1 { - shard.policy = PlacementPolicy::Double(1) - } else { - // Convenience for dev/test: if we just have one pageserver, import - // tenants into Single mode so that scheduling will succeed. 
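The generation handling in `tenant_location_config_prepare` is the subtle part of this onboarding flow, so here it is reduced to a standalone sketch (simplified signature, hypothetical helper); returning `None` means "leave the stored generation untouched":

```rust
// Assumes `LocationConfigMode` and `Generation` are in scope as in the surrounding module.
fn generation_to_store(
    requested_mode: LocationConfigMode,
    existing: Option<Generation>,
    requested_generation: Option<u32>,
) -> Option<Generation> {
    use LocationConfigMode::*;
    match requested_mode {
        // Only the first attached-mode request for a shard that still has a NULL
        // generation may seed it; after that the controller's view is authoritative
        // and the caller's generation is ignored.
        AttachedMulti | AttachedSingle | AttachedStale if existing.is_none() => {
            requested_generation.map(Generation::new)
        }
        _ => None,
    }
}
```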
- shard.policy = PlacementPolicy::Single + shard.policy = placement_policy; + shard.config = tenant_config; + if let Some(generation) = update_generation { + shard.generation = Some(generation); + } + + shard.schedule(scheduler)?; + + let maybe_waiter = shard.maybe_reconcile( + result_tx.clone(), + nodes, + &compute_hook, + &self.config, + &self.persistence, + &self.gate, + &self.cancel, + ); + if let Some(waiter) = maybe_waiter { + waiters.push(waiter); + } + + if let Some(node_id) = shard.intent.get_attached() { + result.shards.push(TenantShardLocation { + shard_id: tenant_shard_id, + node_id: *node_id, + }) } } } - - shard.schedule(scheduler)?; - - let maybe_waiter = shard.maybe_reconcile( - result_tx.clone(), - nodes, - &compute_hook, - &self.config, - &self.persistence, - &self.gate, - &self.cancel, - ); - if let Some(waiter) = maybe_waiter { - waiters.push(waiter); - } - - if let Some(node_id) = shard.intent.get_attached() { - result.shards.push(TenantShardLocation { - shard_id: *shard_id, - node_id: *node_id, - }) - } + waiters } - - if create { - // Validate request mode - match req.config.mode { - LocationConfigMode::Detached | LocationConfigMode::Secondary => { - // When using this API to onboard an existing tenant to this service, it must start in - // an attached state, because we need the request to come with a generation - return Err(ApiError::BadRequest(anyhow::anyhow!( - "Imported tenant must be in attached mode" - ))); - } - - LocationConfigMode::AttachedMulti - | LocationConfigMode::AttachedSingle - | LocationConfigMode::AttachedStale => { - // Pass - } - } - - // Validate request generation - let Some(generation) = req.config.generation else { - // We can only import attached tenants, because we need the request to come with a generation - return Err(ApiError::BadRequest(anyhow::anyhow!( - "Generation is mandatory when importing tenant" - ))); - }; - - // Synthesize a creation request - Some(TenantCreateRequest { - new_tenant_id: TenantShardId::unsharded(tenant_id), - generation: Some(generation), - shard_parameters: ShardParameters { - // Must preserve the incoming shard_count do distinguish unsharded (0) - // from single-sharded (1): this distinction appears in the S3 keys of the tenant. - count: req.tenant_id.shard_count, - // We only import un-sharded or single-sharded tenants, so stripe - // size can be made up arbitrarily here. 
- stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE, - }, - config: req.config.tenant_conf, - }) - } else { - None - } - }; - - let waiters = if let Some(create_req) = maybe_create { - let (create_resp, waiters) = self.do_tenant_create(create_req).await?; - result.shards = create_resp - .shards - .into_iter() - .map(|s| TenantShardLocation { - node_id: s.node_id, - shard_id: s.shard_id, - }) - .collect(); - waiters - } else { - waiters }; if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await { @@ -1375,6 +1501,91 @@ impl Service { Ok(result) } + pub(crate) async fn tenant_config_set(&self, req: TenantConfigRequest) -> Result<(), ApiError> { + let tenant_id = req.tenant_id; + let config = req.config; + + self.persistence + .update_tenant_config(req.tenant_id, config.clone()) + .await?; + + let waiters = { + let mut waiters = Vec::new(); + let mut locked = self.inner.write().unwrap(); + let result_tx = locked.result_tx.clone(); + let compute_hook = locked.compute_hook.clone(); + let (nodes, tenants, _scheduler) = locked.parts_mut(); + for (_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { + shard.config = config.clone(); + if let Some(waiter) = shard.maybe_reconcile( + result_tx.clone(), + nodes, + &compute_hook, + &self.config, + &self.persistence, + &self.gate, + &self.cancel, + ) { + waiters.push(waiter); + } + } + waiters + }; + + if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await { + // Treat this as success because we have stored the configuration. If e.g. + // a node was unavailable at this time, it should not stop us accepting a + // configuration change. + tracing::warn!(%tenant_id, "Accepted configuration update but reconciliation failed: {e}"); + } + + Ok(()) + } + + pub(crate) fn tenant_config_get( + &self, + tenant_id: TenantId, + ) -> Result, ApiError> { + let config = { + let locked = self.inner.read().unwrap(); + + match locked + .tenants + .range(TenantShardId::tenant_range(tenant_id)) + .next() + { + Some((_tenant_shard_id, shard)) => shard.config.clone(), + None => { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant not found").into(), + )) + } + } + }; + + // Unlike the pageserver, we do not have a set of global defaults: the config is + // entirely per-tenant. Therefore the distinction between `tenant_specific_overrides` + // and `effective_config` in the response is meaningless, but we retain that syntax + // in order to remain compatible with the pageserver API. 
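Concretely, the new `GET /v1/tenant/:tenant_id/config` handler below returns the same per-tenant config under both keys; a sketch of the shape (field values invented for illustration):

```rust
// Illustration only: build the same two-key shape the handler returns.
fn example_config_response() -> serde_json::Value {
    let config = serde_json::json!({ "pitr_interval": "1h" });
    serde_json::json!({
        "tenant_specific_overrides": config.clone(),
        "effective_config": config,
    })
}
```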
+ + let response = HashMap::from([ + ( + "tenant_specific_overrides", + serde_json::to_value(&config) + .context("serializing tenant specific overrides") + .map_err(ApiError::InternalServerError)?, + ), + ( + "effective_config", + serde_json::to_value(&config) + .context("serializing effective config") + .map_err(ApiError::InternalServerError)?, + ), + ]); + + Ok(response) + } + pub(crate) async fn tenant_time_travel_remote_storage( &self, time_travel_req: &TenantTimeTravelRequest, @@ -1460,6 +1671,60 @@ impl Service { })?; } } + Ok(()) + } + + pub(crate) async fn tenant_secondary_download( + &self, + tenant_id: TenantId, + ) -> Result<(), ApiError> { + // Acquire lock and yield the collection of shard-node tuples which we will send requests onward to + let targets = { + let locked = self.inner.read().unwrap(); + let mut targets = Vec::new(); + + for (tenant_shard_id, shard) in + locked.tenants.range(TenantShardId::tenant_range(tenant_id)) + { + for node_id in shard.intent.get_secondary() { + let node = locked + .nodes + .get(node_id) + .expect("Pageservers may not be deleted while referenced"); + + targets.push((*tenant_shard_id, node.clone())); + } + } + targets + }; + + // TODO: this API, and the underlying pageserver API, should take a timeout argument so that for long running + // downloads, they can return a clean 202 response instead of the HTTP client timing out. + + // Issue concurrent requests to all shards' locations + let mut futs = FuturesUnordered::new(); + for (tenant_shard_id, node) in targets { + let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref()); + futs.push(async move { + let result = client.tenant_secondary_download(tenant_shard_id).await; + (result, node) + }) + } + + // Handle any errors returned by pageservers. This includes cases like this request racing with + // a scheduling operation, such that the tenant shard we're calling doesn't exist on that pageserver any more, as + // well as more general cases like 503s, 500s, or timeouts. + while let Some((result, node)) = futs.next().await { + let Err(e) = result else { continue }; + + // Secondary downloads are always advisory: if something fails, we nevertheless report success, so that whoever + // is calling us will proceed with whatever migration they're doing, albeit with a slightly less warm cache + // than they had hoped for. + tracing::warn!( + "Ignoring tenant secondary download error from pageserver {}: {e}", + node.id, + ); + } Ok(()) } @@ -2039,8 +2304,8 @@ impl Service { // Note: this generation is a placeholder, [`Persistence::begin_shard_split`] will // populate the correct generation as part of its transaction, to protect us // against racing with changes in the state of the parent. 
- generation: 0, - generation_pageserver: target.node.id.0 as i64, + generation: None, + generation_pageserver: Some(target.node.id.0 as i64), placement_policy: serde_json::to_string(&policy).unwrap(), // TODO: get the config out of the map config: serde_json::to_string(&TenantConfig::default()).unwrap(), @@ -2161,7 +2426,8 @@ impl Service { .expect("It was present, we just split it"); let old_attached = old_state.intent.get_attached().unwrap(); old_state.intent.clear(scheduler); - (old_attached, old_state.generation, old_state.config.clone()) + let generation = old_state.generation.expect("Shard must have been attached"); + (old_attached, generation, old_state.config.clone()) }; for child in child_ids { @@ -2182,7 +2448,7 @@ impl Service { child_state.observed = ObservedState { locations: child_observed, }; - child_state.generation = generation; + child_state.generation = Some(generation); child_state.config = config.clone(); // The child's TenantState::splitting is intentionally left at the default value of Idle, @@ -2247,6 +2513,7 @@ impl Service { match shard.policy { PlacementPolicy::Single => { shard.intent.clear_secondary(scheduler); + shard.intent.set_attached(scheduler, Some(migrate_req.node_id)); } PlacementPolicy::Double(_n) => { // If our new attached node was a secondary, it no longer should be. @@ -2256,6 +2523,12 @@ impl Service { if let Some(old_attached) = old_attached { shard.intent.push_secondary(scheduler, old_attached); } + + shard.intent.set_attached(scheduler, Some(migrate_req.node_id)); + } + PlacementPolicy::Secondary => { + shard.intent.clear(scheduler); + shard.intent.push_secondary(scheduler, migrate_req.node_id); } PlacementPolicy::Detached => { return Err(ApiError::BadRequest(anyhow::anyhow!( @@ -2263,9 +2536,6 @@ impl Service { ))) } } - shard - .intent - .set_attached(scheduler, Some(migrate_req.node_id)); tracing::info!("Migrating: new intent {:?}", shard.intent); shard.sequence = shard.sequence.next(); @@ -2593,7 +2863,7 @@ impl Service { observed_loc.conf = None; } - if tenant_state.intent.notify_offline(config_req.node_id) { + if tenant_state.intent.demote_attached(config_req.node_id) { tenant_state.sequence = tenant_state.sequence.next(); match tenant_state.schedule(scheduler) { Err(e) => { @@ -2660,6 +2930,9 @@ impl Service { /// Helper for methods that will try and call pageserver APIs for /// a tenant, such as timeline CRUD: they cannot proceed unless the tenant /// is attached somewhere. + /// + /// TODO: this doesn't actually ensure attached unless the PlacementPolicy is + /// an attached policy. We should error out if it isn't. fn ensure_attached_schedule( &self, mut locked: std::sync::RwLockWriteGuard<'_, ServiceState>, diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs index c14fe6699e..33b7d578c7 100644 --- a/control_plane/attachment_service/src/tenant_state.rs +++ b/control_plane/attachment_service/src/tenant_state.rs @@ -53,8 +53,11 @@ pub(crate) struct TenantState { pub(crate) sequence: Sequence, // Latest generation number: next time we attach, increment this - // and use the incremented number when attaching - pub(crate) generation: Generation, + // and use the incremented number when attaching. + // + // None represents an incompletely onboarded tenant via the [`Service::location_config`] + // API, where this tenant may only run in PlacementPolicy::Secondary. + pub(crate) generation: Option, // High level description of how the tenant should be set up. 
Provided // externally. @@ -181,6 +184,13 @@ impl IntentState { } } + /// Remove the last secondary node from the list of secondaries + pub(crate) fn pop_secondary(&mut self, scheduler: &mut Scheduler) { + if let Some(node_id) = self.secondary.pop() { + scheduler.node_dec_ref(node_id); + } + } + pub(crate) fn clear(&mut self, scheduler: &mut Scheduler) { if let Some(old_attached) = self.attached.take() { scheduler.node_dec_ref(old_attached); @@ -208,11 +218,13 @@ impl IntentState { &self.secondary } - /// When a node goes offline, we update intents to avoid using it - /// as their attached pageserver. + /// If the node is in use as the attached location, demote it into + /// the list of secondary locations. This is used when a node goes offline, + /// and we want to use a different node for attachment, but not permanently + /// forget the location on the offline node. /// /// Returns true if a change was made - pub(crate) fn notify_offline(&mut self, node_id: NodeId) -> bool { + pub(crate) fn demote_attached(&mut self, node_id: NodeId) -> bool { if self.attached == Some(node_id) { // TODO: when scheduler starts tracking attached + secondary counts separately, we will // need to call into it here. @@ -315,7 +327,7 @@ pub(crate) struct ReconcileResult { pub(crate) result: Result<(), ReconcileError>, pub(crate) tenant_shard_id: TenantShardId, - pub(crate) generation: Generation, + pub(crate) generation: Option, pub(crate) observed: ObservedState, /// Set [`TenantState::pending_compute_notification`] from this flag @@ -340,7 +352,7 @@ impl TenantState { tenant_shard_id, policy, intent: IntentState::default(), - generation: Generation::new(0), + generation: Some(Generation::new(0)), shard, observed: ObservedState::default(), config: TenantConfig::default(), @@ -438,10 +450,16 @@ impl TenantState { // more work on the same pageservers we're already using. let mut modified = false; + // Add/remove nodes to fulfil policy use PlacementPolicy::*; match self.policy { Single => { // Should have exactly one attached, and zero secondaries + if !self.intent.secondary.is_empty() { + self.intent.clear_secondary(scheduler); + modified = true; + } + let (modified_attached, _attached_node_id) = self.schedule_attached(scheduler)?; modified |= modified_attached; @@ -451,6 +469,23 @@ impl TenantState { } } Double(secondary_count) => { + let retain_secondaries = if self.intent.attached.is_none() + && scheduler.node_preferred(&self.intent.secondary).is_some() + { + // If we have no attached, and one of the secondaries is elegible to be promoted, retain + // one more secondary than we usually would, as one of them will become attached futher down this function. 
+ secondary_count + 1 + } else { + secondary_count + }; + + while self.intent.secondary.len() > retain_secondaries { + // We have no particular preference for one secondary location over another: just + // arbitrarily drop from the end + self.intent.pop_secondary(scheduler); + modified = true; + } + // Should have exactly one attached, and N secondaries let (modified_attached, attached_node_id) = self.schedule_attached(scheduler)?; modified |= modified_attached; @@ -463,15 +498,28 @@ impl TenantState { modified = true; } } - Detached => { - // Should have no attached or secondary pageservers - if self.intent.attached.is_some() { - self.intent.set_attached(scheduler, None); + Secondary => { + if let Some(node_id) = self.intent.get_attached() { + // Populate secondary by demoting the attached node + self.intent.demote_attached(*node_id); + modified = true; + } else if self.intent.secondary.is_empty() { + // Populate secondary by scheduling a fresh node + let node_id = scheduler.schedule_shard(&[])?; + self.intent.push_secondary(scheduler, node_id); modified = true; } - - if !self.intent.secondary.is_empty() { - self.intent.clear_secondary(scheduler); + while self.intent.secondary.len() > 1 { + // We have no particular preference for one secondary location over another: just + // arbitrarily drop from the end + self.intent.pop_secondary(scheduler); + modified = true; + } + } + Detached => { + // Never add locations in this mode + if self.intent.get_attached().is_some() || !self.intent.get_secondary().is_empty() { + self.intent.clear(scheduler); modified = true; } } @@ -518,7 +566,12 @@ impl TenantState { fn dirty(&self) -> bool { if let Some(node_id) = self.intent.attached { - let wanted_conf = attached_location_conf(self.generation, &self.shard, &self.config); + // Maybe panic: it is a severe bug if we try to attach while generation is null. + let generation = self + .generation + .expect("Attempted to enter attached state without a generation"); + + let wanted_conf = attached_location_conf(generation, &self.shard, &self.config); match self.observed.locations.get(&node_id) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {} Some(_) | None => { @@ -596,6 +649,10 @@ impl TenantState { // Reconcile already in flight for the current sequence? if let Some(handle) = &self.reconciler { if handle.sequence == self.sequence { + tracing::info!( + "Reconciliation already in progress for sequence {:?}", + self.sequence, + ); return Some(ReconcilerWaiter { tenant_shard_id: self.tenant_shard_id, seq_wait: self.waiter.clone(), @@ -615,6 +672,10 @@ impl TenantState { return None; }; + // Advance the sequence before spawning a reconciler, so that sequence waiters + // can distinguish between before+after the reconcile completes. + self.sequence = self.sequence.next(); + let reconciler_cancel = cancel.child_token(); let mut reconciler = Reconciler { tenant_shard_id: self.tenant_shard_id, @@ -716,6 +777,17 @@ impl TenantState { }) } + /// Called when a ReconcileResult has been emitted and the service is updating + /// our state: if the result is from a sequence >= my ReconcileHandle, then drop + /// the handle to indicate there is no longer a reconciliation in progress. + pub(crate) fn reconcile_complete(&mut self, sequence: Sequence) { + if let Some(reconcile_handle) = &self.reconciler { + if reconcile_handle.sequence <= sequence { + self.reconciler = None; + } + } + } + // If we had any state at all referring to this node ID, drop it. Does not // attempt to reschedule. 
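The commit message calls `sequence` murky, so as a reading aid here is a minimal stand-in model (plain `u64` instead of the real `Sequence` and reconciler handle types) of the handshake between `maybe_reconcile` and the new `reconcile_complete`:

```rust
struct ShardModel {
    sequence: u64,
    // Sequence number the in-flight reconciler was spawned with, if any.
    reconciler: Option<u64>,
}

impl ShardModel {
    fn maybe_reconcile(&mut self) {
        // Advance before spawning, so waiters can distinguish "before" from "after".
        self.sequence += 1;
        self.reconciler = Some(self.sequence);
    }

    fn reconcile_complete(&mut self, result_sequence: u64) {
        // Only a result at least as new as the in-flight handle marks the shard idle;
        // a stale result from an older reconcile leaves the handle in place.
        if matches!(self.reconciler, Some(s) if s <= result_sequence) {
            self.reconciler = None;
        }
    }
}
```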
pub(crate) fn deref_node(&mut self, node_id: NodeId) { @@ -736,13 +808,8 @@ impl TenantState { shard_number: self.tenant_shard_id.shard_number.0 as i32, shard_count: self.tenant_shard_id.shard_count.literal() as i32, shard_stripe_size: self.shard.stripe_size.0 as i32, - generation: self.generation.into().unwrap_or(0) as i32, - generation_pageserver: self - .intent - .get_attached() - .map(|n| n.0 as i64) - .unwrap_or(i64::MAX), - + generation: self.generation.map(|g| g.into().unwrap_or(0) as i32), + generation_pageserver: self.intent.get_attached().map(|n| n.0 as i64), placement_policy: serde_json::to_string(&self.policy).unwrap(), config: serde_json::to_string(&self.config).unwrap(), splitting: SplitState::default(), @@ -805,8 +872,10 @@ pub(crate) mod tests { assert_ne!(attached_node_id, secondary_node_id); // Notifying the attached node is offline should demote it to a secondary - let changed = tenant_state.intent.notify_offline(attached_node_id); + let changed = tenant_state.intent.demote_attached(attached_node_id); assert!(changed); + assert!(tenant_state.intent.attached.is_none()); + assert_eq!(tenant_state.intent.secondary.len(), 2); // Update the scheduler state to indicate the node is offline nodes.get_mut(&attached_node_id).unwrap().availability = NodeAvailability::Offline; diff --git a/libs/utils/src/generation.rs b/libs/utils/src/generation.rs index 6f6c46cfeb..af15cee924 100644 --- a/libs/utils/src/generation.rs +++ b/libs/utils/src/generation.rs @@ -45,7 +45,7 @@ impl Generation { Self::Broken } - pub fn new(v: u32) -> Self { + pub const fn new(v: u32) -> Self { Self::Valid(v) } diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py index c8224c1c67..bc77dfd084 100644 --- a/test_runner/regress/test_sharding_service.py +++ b/test_runner/regress/test_sharding_service.py @@ -146,6 +146,8 @@ def test_sharding_service_smoke( for tid in tenant_ids: tenant_delete_wait_completed(env.attachment_service.pageserver_api(), tid, 10) + env.attachment_service.consistency_check() + # Set a scheduling policy on one node, create all the tenants, observe # that the scheduling policy is respected. env.attachment_service.node_configure(env.pageservers[1].id, {"scheduling": "Draining"}) @@ -256,9 +258,8 @@ def test_sharding_service_restart(neon_env_builder: NeonEnvBuilder): env.attachment_service.consistency_check() -def test_sharding_service_onboarding( - neon_env_builder: NeonEnvBuilder, -): +@pytest.mark.parametrize("warm_up", [True, False]) +def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: bool): """ We onboard tenants to the sharding service by treating it as a 'virtual pageserver' which provides the /location_config API. This is similar to creating a tenant, @@ -306,6 +307,23 @@ def test_sharding_service_onboarding( }, ) + if warm_up: + origin_ps.http_client().tenant_heatmap_upload(tenant_id) + + # We expect to be called via live migration code, which may try to configure the tenant into secondary + # mode before attaching it. 
+ virtual_ps_http.tenant_location_conf( + tenant_id, + { + "mode": "Secondary", + "secondary_conf": {"warm": True}, + "tenant_conf": {}, + "generation": None, + }, + ) + + virtual_ps_http.tenant_secondary_download(tenant_id) + # Call into attachment service to onboard the tenant generation += 1 virtual_ps_http.tenant_location_conf( @@ -351,7 +369,9 @@ def test_sharding_service_onboarding( assert len(dest_tenants) == 1 assert TenantId(dest_tenants[0]["id"]) == tenant_id - # sharding service advances generation by 1 when it first attaches + # sharding service advances generation by 1 when it first attaches. We started + # with a nonzero generation so this equality also proves that the generation + # was properly carried over during onboarding. assert dest_tenants[0]["generation"] == generation + 1 # The onboarded tenant should survive a restart of sharding service @@ -362,6 +382,31 @@ def test_sharding_service_onboarding( dest_ps.stop() dest_ps.start() + # Having onboarded via /location_config, we should also be able to update the + # TenantConf part of LocationConf, without inadvertently resetting the generation + modified_tenant_conf = {"max_lsn_wal_lag": 1024 * 1024 * 1024 * 100} + dest_tenant_before_conf_change = dest_ps.http_client().tenant_status(tenant_id) + + # The generation has moved on since we onboarded + assert generation != dest_tenant_before_conf_change["generation"] + + virtual_ps_http.tenant_location_conf( + tenant_id, + { + "mode": "AttachedSingle", + "secondary_conf": None, + "tenant_conf": modified_tenant_conf, + # This is intentionally a stale generation + "generation": generation, + }, + ) + dest_tenant_after_conf_change = dest_ps.http_client().tenant_status(tenant_id) + assert ( + dest_tenant_after_conf_change["generation"] == dest_tenant_before_conf_change["generation"] + ) + dest_tenant_conf_after = dest_ps.http_client().tenant_config(tenant_id) + assert dest_tenant_conf_after.tenant_specific_overrides == modified_tenant_conf + env.attachment_service.consistency_check() @@ -667,3 +712,41 @@ def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder): svc.request( "POST", f"{api}/upcall/v1/re-attach", headers=svc.headers(TokenScope.PAGE_SERVER_API) ) + + +def test_sharding_service_tenant_conf(neon_env_builder: NeonEnvBuilder): + """ + Validate the pageserver-compatible API endpoints for setting and getting tenant conf, without + supplying the whole LocationConf. + """ + + env = neon_env_builder.init_start() + tenant_id = env.initial_tenant + + http = env.attachment_service.pageserver_api() + + default_value = "7days" + new_value = "1h" + http.set_tenant_config(tenant_id, {"pitr_interval": new_value}) + + # Ensure the change landed on the storage controller + readback_controller = http.tenant_config(tenant_id) + assert readback_controller.effective_config["pitr_interval"] == new_value + assert readback_controller.tenant_specific_overrides["pitr_interval"] == new_value + + # Ensure the change made it down to the pageserver + readback_ps = env.pageservers[0].http_client().tenant_config(tenant_id) + assert readback_ps.effective_config["pitr_interval"] == new_value + assert readback_ps.tenant_specific_overrides["pitr_interval"] == new_value + + # Omitting a value clears it. This looks different in storage controller + # vs. pageserver API calls, because pageserver has defaults. 
+ http.set_tenant_config(tenant_id, {}) + readback_controller = http.tenant_config(tenant_id) + assert readback_controller.effective_config["pitr_interval"] is None + assert readback_controller.tenant_specific_overrides["pitr_interval"] is None + readback_ps = env.pageservers[0].http_client().tenant_config(tenant_id) + assert readback_ps.effective_config["pitr_interval"] == default_value + assert "pitr_interval" not in readback_ps.tenant_specific_overrides + + env.attachment_service.consistency_check() From fad9be459883467310bdd08d2f336ad3ce9deb80 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 4 Mar 2024 08:56:55 +0000 Subject: [PATCH 0319/1571] pageserver: mention key in walredo errors (#6988) ## Problem - Walredo errors, e.g. during image creation, mention the LSN affected but not the key. ## Summary of changes - Add key to "error applying ... WAL records" log message --- pageserver/src/walredo.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 35cbefb92c..0004f4f3c9 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -262,7 +262,7 @@ impl PostgresRedoManager { // next request will launch a new one. if let Err(e) = result.as_ref() { error!( - "error applying {} WAL records {}..{} ({} bytes) to base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}", + "error applying {} WAL records {}..{} ({} bytes) to key {key}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}", records.len(), records.first().map(|p| p.0).unwrap_or(Lsn(0)), records.last().map(|p| p.0).unwrap_or(Lsn(0)), From 8dc7dc79dd493f81e78f2afd37c1fe8a1d79afaa Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 4 Mar 2024 09:10:04 +0000 Subject: [PATCH 0320/1571] tests: debugging for `test_secondary_downloads` failures (#6984) ## Problem - #6966 - Existing logs aren't pointing to a cause: it looks like heatmap upload and download are happening, but for some reason the evicted layer isn't removed on the secondary location. ## Summary of changes - Assert evicted layer is gone from heatmap before checking its gone from local disk: this will give clarity on whether the issue is with the uploads or downloads. - On assertion failures, log the contents of heatmap. 
--- test_runner/fixtures/remote_storage.py | 10 +++++ .../regress/test_pageserver_secondary.py | 41 ++++++++++++++----- 2 files changed, 40 insertions(+), 11 deletions(-) diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index 4a692688e0..60591d8d46 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -252,6 +252,16 @@ class S3Storage: log.info(f"deleted {cnt} objects from remote storage") + def tenant_path(self, tenant_id: TenantId) -> str: + return f"{self.prefix_in_bucket}/tenants/{tenant_id}" + + def heatmap_key(self, tenant_id: TenantId) -> str: + return f"{self.tenant_path(tenant_id)}/{TENANT_HEATMAP_FILE_NAME}" + + def heatmap_content(self, tenant_id: TenantId): + r = self.client.get_object(Bucket=self.bucket_name, Key=self.heatmap_key(tenant_id)) + return json.loads(r["Body"].read().decode("utf-8")) + RemoteStorage = Union[LocalFsStorage, S3Storage] diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 8f694de2e1..8ba9d767dd 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -1,3 +1,4 @@ +import json import random from pathlib import Path from typing import Any, Dict, Optional @@ -10,7 +11,7 @@ from fixtures.pageserver.utils import ( poll_for_remote_storage_iterations, tenant_delete_wait_completed, ) -from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind +from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage from fixtures.types import TenantId, TimelineId from fixtures.utils import wait_until from fixtures.workload import Workload @@ -436,6 +437,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ) env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) assert env.attachment_service is not None + assert isinstance(env.pageserver_remote_storage, S3Storage) # Satisfy linter tenant_id = env.initial_tenant timeline_id = env.initial_timeline @@ -491,18 +493,35 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): # Do evictions on attached pageserver, check secondary follows along # ================================================================== - log.info("Evicting a layer...") - layer_to_evict = list_layers(ps_attached, tenant_id, timeline_id)[0] - ps_attached.http_client().evict_layer(tenant_id, timeline_id, layer_name=layer_to_evict.name) + try: + log.info("Evicting a layer...") + layer_to_evict = list_layers(ps_attached, tenant_id, timeline_id)[0] + some_other_layer = list_layers(ps_attached, tenant_id, timeline_id)[1] + log.info(f"Victim layer: {layer_to_evict.name}") + ps_attached.http_client().evict_layer( + tenant_id, timeline_id, layer_name=layer_to_evict.name + ) - log.info("Synchronizing after eviction...") - ps_attached.http_client().tenant_heatmap_upload(tenant_id) - ps_secondary.http_client().tenant_secondary_download(tenant_id) + log.info("Synchronizing after eviction...") + ps_attached.http_client().tenant_heatmap_upload(tenant_id) + heatmap_after_eviction = env.pageserver_remote_storage.heatmap_content(tenant_id) + heatmap_layers = set( + layer["name"] for layer in heatmap_after_eviction["timelines"][0]["layers"] + ) + assert layer_to_evict.name not in heatmap_layers + assert some_other_layer.name in heatmap_layers - assert layer_to_evict not in list_layers(ps_attached, tenant_id, timeline_id) - assert list_layers(ps_attached, tenant_id, timeline_id) == 
list_layers( - ps_secondary, tenant_id, timeline_id - ) + ps_secondary.http_client().tenant_secondary_download(tenant_id) + + assert layer_to_evict not in list_layers(ps_attached, tenant_id, timeline_id) + assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( + ps_secondary, tenant_id, timeline_id + ) + except: + # On assertion failures, log some details to help with debugging + heatmap = env.pageserver_remote_storage.heatmap_content(tenant_id) + log.warn(f"heatmap contents: {json.dumps(heatmap,indent=2)}") + raise # Scrub the remote storage # ======================== From 3114be034a5845fa95ffe1e05f420eae9e84d031 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Mon, 4 Mar 2024 13:31:28 +0400 Subject: [PATCH 0321/1571] proxy: change is cold start to enum (#6948) ## Problem Actually it's good idea to distinguish between cases when it's a cold start, but we took the compute from the pool ## Summary of changes Updated to enum. --- proxy/src/console/messages.rs | 14 ++++++- proxy/src/context.rs | 8 ++-- proxy/src/context/parquet.rs | 75 ++++++++++++++++++----------------- 3 files changed, 55 insertions(+), 42 deletions(-) diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs index 1f94059f1e..85adb31654 100644 --- a/proxy/src/console/messages.rs +++ b/proxy/src/console/messages.rs @@ -1,4 +1,4 @@ -use serde::Deserialize; +use serde::{Deserialize, Serialize}; use std::fmt; use crate::auth::IpPattern; @@ -98,7 +98,16 @@ pub struct MetricsAuxInfo { pub endpoint_id: EndpointId, pub project_id: ProjectId, pub branch_id: BranchId, - pub is_cold_start: Option, + pub cold_start_info: Option, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(rename_all = "snake_case")] +pub enum ColdStartInfo { + Unknown = 0, + Warm = 1, + PoolHit = 2, + PoolMiss = 3, } #[cfg(test)] @@ -111,6 +120,7 @@ mod tests { "endpoint_id": "endpoint", "project_id": "project", "branch_id": "branch", + "cold_start_info": "unknown", }) } diff --git a/proxy/src/context.rs b/proxy/src/context.rs index abad8a6412..1b48e01358 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -9,7 +9,7 @@ use tracing::{field::display, info_span, Span}; use uuid::Uuid; use crate::{ - console::messages::MetricsAuxInfo, + console::messages::{ColdStartInfo, MetricsAuxInfo}, error::ErrorKind, metrics::{LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND}, BranchId, DbName, EndpointId, ProjectId, RoleName, @@ -42,7 +42,7 @@ pub struct RequestMonitoring { error_kind: Option, pub(crate) auth_method: Option, success: bool, - is_cold_start: Option, + cold_start_info: Option, // extra // This sender is here to keep the request monitoring channel open while requests are taking place. 
@@ -91,7 +91,7 @@ impl RequestMonitoring { error_kind: None, auth_method: None, success: false, - is_cold_start: None, + cold_start_info: None, sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()), latency_timer: LatencyTimer::new(protocol), @@ -115,7 +115,7 @@ impl RequestMonitoring { self.set_endpoint_id(x.endpoint_id); self.branch = Some(x.branch_id); self.project = Some(x.project_id); - self.is_cold_start = x.is_cold_start; + self.cold_start_info = x.cold_start_info; } pub fn set_project_id(&mut self, project_id: ProjectId) { diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 54f51604bf..1b1274b196 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -93,7 +93,7 @@ struct RequestData { /// Or if we make it to proxy_pass success: bool, /// Indicates if the cplane started the new compute node for this request. - is_cold_start: Option, + cold_start_info: Option, /// Tracks time from session start (HTTP request/libpq TCP handshake) /// Through to success/failure duration_us: u64, @@ -121,7 +121,10 @@ impl From for RequestData { region: value.region, error: value.error_kind.as_ref().map(|e| e.to_metric_label()), success: value.success, - is_cold_start: value.is_cold_start, + cold_start_info: value + .cold_start_info + .as_ref() + .map(|x| serde_json::to_string(x).unwrap_or_default()), duration_us: SystemTime::from(value.first_packet) .elapsed() .unwrap_or_default() @@ -455,7 +458,7 @@ mod tests { region: "us-east-1", error: None, success: rng.gen(), - is_cold_start: Some(true), + cold_start_info: Some("no".into()), duration_us: rng.gen_range(0..30_000_000), } } @@ -525,16 +528,16 @@ mod tests { assert_eq!( file_stats, [ - (1315032, 3, 6000), - (1315025, 3, 6000), - (1315085, 3, 6000), - (1315042, 3, 6000), - (1315172, 3, 6000), - (1315014, 3, 6000), - (1314806, 3, 6000), - (1315042, 3, 6000), - (438563, 1, 2000) - ], + (1314406, 3, 6000), + (1314399, 3, 6000), + (1314459, 3, 6000), + (1314416, 3, 6000), + (1314546, 3, 6000), + (1314388, 3, 6000), + (1314180, 3, 6000), + (1314416, 3, 6000), + (438359, 1, 2000) + ] ); tmpdir.close().unwrap(); @@ -563,12 +566,12 @@ mod tests { assert_eq!( file_stats, [ - (1220433, 5, 10000), - (1226583, 5, 10000), - (1228377, 5, 10000), - (1227739, 5, 10000), - (1219017, 5, 10000) - ], + (1220668, 5, 10000), + (1226818, 5, 10000), + (1228612, 5, 10000), + (1227974, 5, 10000), + (1219252, 5, 10000) + ] ); tmpdir.close().unwrap(); @@ -599,12 +602,12 @@ mod tests { assert_eq!( file_stats, [ - (1206080, 5, 10000), - (1205811, 5, 10000), - (1206104, 5, 10000), - (1206092, 5, 10000), - (1206347, 5, 10000) - ], + (1206315, 5, 10000), + (1206046, 5, 10000), + (1206339, 5, 10000), + (1206327, 5, 10000), + (1206582, 5, 10000) + ] ); tmpdir.close().unwrap(); @@ -628,16 +631,16 @@ mod tests { assert_eq!( file_stats, [ - (1315032, 3, 6000), - (1315025, 3, 6000), - (1315085, 3, 6000), - (1315042, 3, 6000), - (1315172, 3, 6000), - (1315014, 3, 6000), - (1314806, 3, 6000), - (1315042, 3, 6000), - (438563, 1, 2000) - ], + (1314406, 3, 6000), + (1314399, 3, 6000), + (1314459, 3, 6000), + (1314416, 3, 6000), + (1314546, 3, 6000), + (1314388, 3, 6000), + (1314180, 3, 6000), + (1314416, 3, 6000), + (438359, 1, 2000) + ] ); tmpdir.close().unwrap(); @@ -673,7 +676,7 @@ mod tests { // files are smaller than the size threshold, but they took too long to fill so were flushed early assert_eq!( file_stats, - [(659129, 2, 3001), (658842, 2, 3000), (658638, 2, 2999)], + [(658837, 2, 3001), (658551, 2, 3000), (658347, 2, 2999)] ); 
tmpdir.close().unwrap(); From 3fd77eb0d46dba7de3bd51ada2a7c46f56fd6f72 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 4 Mar 2024 12:33:42 +0100 Subject: [PATCH 0322/1571] layer file creation: remove redundant fsync()s (#6983) The `writer.finish()` methods already fsync the inode, using `VirtualFile::sync_all()`. All that the callers need to do is fsync their directory, i.e., the timeline directory. Note that there's a call in the new compaction code that is apparently dead-at-runtime, so, I couldn't fix up any fsyncs there [Link](https://github.com/neondatabase/neon/blob/502b69b33bbd4ad1b0647e921a9c665249a2cd62/pageserver/src/tenant/timeline/compaction.rs#L204-L211). Note that layer durability still matters somewhat, even after #5198 which made remote storage authoritative. We do have the layer file length as an indicator, but no checksums on the layer file contents. So, a series of overwrites without fsyncs in the middle, plus a subsequent crash, could cause us to end up in a state where the file length matches but the contents are garbage. part of https://github.com/neondatabase/neon/issues/6663 --- pageserver/src/tenant/timeline.rs | 63 ++++++------------------------- 1 file changed, 11 insertions(+), 52 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 206f20306e..0c03ef33c3 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -10,7 +10,7 @@ mod walreceiver; use anyhow::{anyhow, bail, ensure, Context, Result}; use bytes::Bytes; -use camino::{Utf8Path, Utf8PathBuf}; +use camino::Utf8Path; use enumset::EnumSet; use fail::fail_point; use futures::stream::StreamExt; @@ -3422,26 +3422,10 @@ impl Timeline { let _g = span.entered(); let new_delta = Handle::current().block_on(frozen_layer.write_to_disk(&self_clone, &ctx))?; - let new_delta_path = new_delta.local_path().to_owned(); - // Sync it to disk. - // - // We must also fsync the timeline dir to ensure the directory entries for - // new layer files are durable. - // - // NB: timeline dir must be synced _after_ the file contents are durable. - // So, two separate fsyncs are required, they mustn't be batched. - // - // TODO: If we're running inside 'flush_frozen_layers' and there are multiple - // files to flush, the fsync overhead can be reduces as follows: - // 1. write them all to temporary file names - // 2. fsync them - // 3. rename to the final name - // 4. fsync the parent directory. - // Note that (1),(2),(3) today happen inside write_to_disk(). - // - // FIXME: the writer already fsyncs all data, only rename needs to be fsynced here - par_fsync::par_fsync(&[new_delta_path]).context("fsync of delta layer")?; + // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes. + // We just need to fsync the directory in which these inodes are linked, + // which we know to be the timeline directory. par_fsync::par_fsync(&[self_clone .conf .timeline_path(&self_clone.tenant_shard_id, &self_clone.timeline_id)]) @@ -3674,25 +3658,10 @@ impl Timeline { } } - // Sync the new layer to disk before adding it to the layer map, to make sure - // we don't garbage collect something based on the new layer, before it has - // reached the disk. - // - // We must also fsync the timeline dir to ensure the directory entries for - // new layer files are durable - // - // Compaction creates multiple image layers. It would be better to create them all - // and fsync them all in parallel. 
- let all_paths = image_layers - .iter() - .map(|layer| layer.local_path().to_owned()) - .collect::>(); - - par_fsync::par_fsync_async(&all_paths) - .await - .context("fsync of newly created layer files")?; - - if !all_paths.is_empty() { + // The writer.finish() above already did the fsync of the inodes. + // We just need to fsync the directory in which these inodes are linked, + // which we know to be the timeline directory. + if !image_layers.is_empty() { par_fsync::par_fsync_async(&[self .conf .timeline_path(&self.tenant_shard_id, &self.timeline_id)]) @@ -4279,22 +4248,12 @@ impl Timeline { } } - // FIXME: the writer already fsyncs all data, only rename needs to be fsynced here - let layer_paths: Vec = new_layers - .iter() - .map(|l| l.local_path().to_owned()) - .collect(); - - // Fsync all the layer files and directory using multiple threads to - // minimize latency. - par_fsync::par_fsync_async(&layer_paths) - .await - .context("fsync all new layers")?; - + // The writer.finish() above already did the fsync of the inodes. + // We just need to fsync the directory in which these inodes are linked, + // which we know to be the timeline directory. let timeline_dir = self .conf .timeline_path(&self.tenant_shard_id, &self.timeline_id); - par_fsync::par_fsync_async(&[timeline_dir]) .await .context("fsync of timeline dir")?; From 5c6d78d4692dcf1096cf95f759d89203f824bf07 Mon Sep 17 00:00:00 2001 From: Andreas Scherbaum Date: Mon, 4 Mar 2024 13:02:18 +0100 Subject: [PATCH 0323/1571] Rename "zenith" to "neon" (#6957) Usually RFC documents are not modified, but the vast mentions of "zenith" in early RFC documents make it desirable to update the product name to today's name, to avoid confusion. ## Problem Early RFC documents use the old "zenith" product name a lot, which is not something everyone is aware of after the product was renamed. ## Summary of changes Replace occurrences of "zenith" with "neon". Images are excluded. --------- Co-authored-by: Andreas Scherbaum --- docs/rfcs/002-storage.md | 2 +- docs/rfcs/003-laptop-cli.md | 122 +++++++++--------- docs/rfcs/004-durability.md | 2 +- docs/rfcs/005-zenith_local.md | 46 +++---- docs/rfcs/006-laptop-cli-v2-CLI.md | 48 +++---- .../006-laptop-cli-v2-repository-structure.md | 44 +++---- docs/rfcs/007-serverless-on-laptop.md | 26 ++-- docs/rfcs/008-push-pull.md | 12 +- docs/rfcs/009-snapshot-first-storage-cli.md | 20 +-- docs/rfcs/013-term-history.md | 2 +- docs/rfcs/014-safekeepers-gossip.md | 2 +- docs/rfcs/015-storage-messaging.md | 4 +- 12 files changed, 165 insertions(+), 165 deletions(-) diff --git a/docs/rfcs/002-storage.md b/docs/rfcs/002-storage.md index f99683cf09..d11b750e73 100644 --- a/docs/rfcs/002-storage.md +++ b/docs/rfcs/002-storage.md @@ -1,4 +1,4 @@ -# Zenith storage node — alternative +# Neon storage node — alternative ## **Design considerations** diff --git a/docs/rfcs/003-laptop-cli.md b/docs/rfcs/003-laptop-cli.md index 1a549c2df5..003a05bd16 100644 --- a/docs/rfcs/003-laptop-cli.md +++ b/docs/rfcs/003-laptop-cli.md @@ -1,6 +1,6 @@ # Command line interface (end-user) -Zenith CLI as it is described here mostly resides on the same conceptual level as pg_ctl/initdb/pg_recvxlog/etc and replaces some of them in an opinionated way. I would also suggest bundling our patched postgres inside zenith distribution at least at the start. +Neon CLI as it is described here mostly resides on the same conceptual level as pg_ctl/initdb/pg_recvxlog/etc and replaces some of them in an opinionated way. 
I would also suggest bundling our patched postgres inside neon distribution at least at the start. This proposal is focused on managing local installations. For cluster operations, different tooling would be needed. The point of integration between the two is storage URL: no matter how complex cluster setup is it may provide an endpoint where the user may push snapshots. @@ -8,40 +8,40 @@ The most important concept here is a snapshot, which can be created/pushed/pulle # Possible usage scenarios -## Install zenith, run a postgres +## Install neon, run a postgres ``` -> brew install pg-zenith -> zenith pg create # creates pgdata with default pattern pgdata$i -> zenith pg list +> brew install pg-neon +> neon pg create # creates pgdata with default pattern pgdata$i +> neon pg list ID PGDATA USED STORAGE ENDPOINT -primary1 pgdata1 0G zenith-local localhost:5432 +primary1 pgdata1 0G neon-local localhost:5432 ``` -## Import standalone postgres to zenith +## Import standalone postgres to neon ``` -> zenith snapshot import --from=basebackup://replication@localhost:5432/ oldpg +> neon snapshot import --from=basebackup://replication@localhost:5432/ oldpg [====================------------] 60% | 20MB/s -> zenith snapshot list +> neon snapshot list ID SIZE PARENT oldpg 5G - -> zenith pg create --snapshot oldpg +> neon pg create --snapshot oldpg Started postgres on localhost:5432 -> zenith pg list +> neon pg list ID PGDATA USED STORAGE ENDPOINT -primary1 pgdata1 5G zenith-local localhost:5432 +primary1 pgdata1 5G neon-local localhost:5432 -> zenith snapshot destroy oldpg +> neon snapshot destroy oldpg Ok ``` Also, we may start snapshot import implicitly by looking at snapshot schema ``` -> zenith pg create --snapshot basebackup://replication@localhost:5432/ +> neon pg create --snapshot basebackup://replication@localhost:5432/ Downloading snapshot... Done. Started postgres on localhost:5432 Destroying snapshot... Done. @@ -52,39 +52,39 @@ Destroying snapshot... Done. Since we may export the whole snapshot as one big file (tar of basebackup, maybe with some manifest) it may be shared over conventional means: http, ssh, [git+lfs](https://docs.github.com/en/github/managing-large-files/about-git-large-file-storage). ``` -> zenith pg create --snapshot http://learn-postgres.com/movies_db.zenith movies +> neon pg create --snapshot http://learn-postgres.com/movies_db.neon movies ``` ## Create snapshot and push it to the cloud ``` -> zenith snapshot create pgdata1@snap1 -> zenith snapshot push --to ssh://stas@zenith.tech pgdata1@snap1 +> neon snapshot create pgdata1@snap1 +> neon snapshot push --to ssh://stas@neon.tech pgdata1@snap1 ``` ## Rollback database to the snapshot -One way to rollback the database is just to init a new database from the snapshot and destroy the old one. But creating a new database from a snapshot would require a copy of that snapshot which is time consuming operation. Another option that would be cool to support is the ability to create the copy-on-write database from the snapshot without copying data, and store updated pages in a separate location, however that way would have performance implications. So to properly rollback the database to the older state we have `zenith pg checkout`. +One way to rollback the database is just to init a new database from the snapshot and destroy the old one. But creating a new database from a snapshot would require a copy of that snapshot which is time consuming operation. 
Another option that would be cool to support is the ability to create the copy-on-write database from the snapshot without copying data, and store updated pages in a separate location, however that way would have performance implications. So to properly rollback the database to the older state we have `neon pg checkout`. ``` -> zenith pg list +> neon pg list ID PGDATA USED STORAGE ENDPOINT -primary1 pgdata1 5G zenith-local localhost:5432 +primary1 pgdata1 5G neon-local localhost:5432 -> zenith snapshot create pgdata1@snap1 +> neon snapshot create pgdata1@snap1 -> zenith snapshot list +> neon snapshot list ID SIZE PARENT oldpg 5G - pgdata1@snap1 6G - pgdata1@CURRENT 6G - -> zenith pg checkout pgdata1@snap1 +> neon pg checkout pgdata1@snap1 Stopping postgres on pgdata1. Rolling back pgdata1@CURRENT to pgdata1@snap1. Starting postgres on pgdata1. -> zenith snapshot list +> neon snapshot list ID SIZE PARENT oldpg 5G - pgdata1@snap1 6G - @@ -99,7 +99,7 @@ Some notes: pgdata1@CURRENT -- implicit snapshot representing the current state PITR area acts like a continuous snapshot where you can reset the database to any point in time within this area (by area I mean some TTL period or some size limit, both possibly infinite). ``` -> zenith pitr create --storage s3tank --ttl 30d --name pitr_last_month +> neon pitr create --storage s3tank --ttl 30d --name pitr_last_month ``` Resetting the database to some state in past would require creating a snapshot on some lsn / time in this pirt area. @@ -108,29 +108,29 @@ Resetting the database to some state in past would require creating a snapshot o ## storage -Storage is either zenith pagestore or s3. Users may create a database in a pagestore and create/move *snapshots* and *pitr regions* in both pagestore and s3. Storage is a concept similar to `git remote`. After installation, I imagine one local storage is available by default. +Storage is either neon pagestore or s3. Users may create a database in a pagestore and create/move *snapshots* and *pitr regions* in both pagestore and s3. Storage is a concept similar to `git remote`. After installation, I imagine one local storage is available by default. -**zenith storage attach** -t [native|s3] -c key=value -n name +**neon storage attach** -t [native|s3] -c key=value -n name -Attaches/initializes storage. For --type=s3, user credentials and path should be provided. For --type=native we may support --path=/local/path and --url=zenith.tech/stas/mystore. Other possible term for native is 'zstore'. +Attaches/initializes storage. For --type=s3, user credentials and path should be provided. For --type=native we may support --path=/local/path and --url=neon.tech/stas/mystore. Other possible term for native is 'zstore'. -**zenith storage list** +**neon storage list** Show currently attached storages. For example: ``` -> zenith storage list +> neon storage list NAME USED TYPE OPTIONS PATH -local 5.1G zenith-local /opt/zenith/store/local -local.compr 20.4G zenith-local compression=on /opt/zenith/store/local.compr -zcloud 60G zenith-remote zenith.tech/stas/mystore +local 5.1G neon-local /opt/neon/store/local +local.compr 20.4G neon-local compression=on /opt/neon/store/local.compr +zcloud 60G neon-remote neon.tech/stas/mystore s3tank 80G S3 ``` -**zenith storage detach** +**neon storage detach** -**zenith storage show** +**neon storage show** @@ -140,29 +140,29 @@ Manages postgres data directories and can start postgres instances with proper c Pg is a term for a single postgres running on some data. 
I'm trying to avoid separation of datadir management and postgres instance management -- both that concepts bundled here together. -**zenith pg create** [--no-start --snapshot --cow] -s storage-name -n pgdata +**neon pg create** [--no-start --snapshot --cow] -s storage-name -n pgdata Creates (initializes) new data directory in given storage and starts postgres. I imagine that storage for this operation may be only local and data movement to remote location happens through snapshots/pitr. --no-start: just init datadir without creating ---snapshot snap: init from the snapshot. Snap is a name or URL (zenith.tech/stas/mystore/snap1) +--snapshot snap: init from the snapshot. Snap is a name or URL (neon.tech/stas/mystore/snap1) --cow: initialize Copy-on-Write data directory on top of some snapshot (makes sense if it is a snapshot of currently running a database) -**zenith pg destroy** +**neon pg destroy** -**zenith pg start** [--replica] pgdata +**neon pg start** [--replica] pgdata Start postgres with proper extensions preloaded/installed. -**zenith pg checkout** +**neon pg checkout** Rollback data directory to some previous snapshot. -**zenith pg stop** pg_id +**neon pg stop** pg_id -**zenith pg list** +**neon pg list** ``` ROLE PGDATA USED STORAGE ENDPOINT @@ -173,7 +173,7 @@ primary my_pg2 3.2G local.compr localhost:5435 - my_pg3 9.2G local.compr - ``` -**zenith pg show** +**neon pg show** ``` my_pg: @@ -194,7 +194,7 @@ my_pg: ``` -**zenith pg start-rest/graphql** pgdata +**neon pg start-rest/graphql** pgdata Starts REST/GraphQL proxy on top of postgres master. Not sure we should do that, just an idea. @@ -203,35 +203,35 @@ Starts REST/GraphQL proxy on top of postgres master. Not sure we should do that, Snapshot creation is cheap -- no actual data is copied, we just start retaining old pages. Snapshot size means the amount of retained data, not all data. Snapshot name looks like pgdata_name@tag_name. tag_name is set by the user during snapshot creation. There are some reserved tag names: CURRENT represents the current state of the data directory; HEAD{i} represents the data directory state that resided in the database before i-th checkout. -**zenith snapshot create** pgdata_name@snap_name +**neon snapshot create** pgdata_name@snap_name Creates a new snapshot in the same storage where pgdata_name exists. -**zenith snapshot push** --to url pgdata_name@snap_name +**neon snapshot push** --to url pgdata_name@snap_name -Produces binary stream of a given snapshot. Under the hood starts temp read-only postgres over this snapshot and sends basebackup stream. Receiving side should start `zenith snapshot recv` before push happens. If url has some special schema like zenith:// receiving side may require auth start `zenith snapshot recv` on the go. +Produces binary stream of a given snapshot. Under the hood starts temp read-only postgres over this snapshot and sends basebackup stream. Receiving side should start `neon snapshot recv` before push happens. If url has some special schema like neon:// receiving side may require auth start `neon snapshot recv` on the go. -**zenith snapshot recv** +**neon snapshot recv** Starts a port listening for a basebackup stream, prints connection info to stdout (so that user may use that in push command), and expects data on that socket. -**zenith snapshot pull** --from url or path +**neon snapshot pull** --from url or path -Connects to a remote zenith/s3/file and pulls snapshot. The remote site should be zenith service or files in our format. 
+Connects to a remote neon/s3/file and pulls snapshot. The remote site should be neon service or files in our format. -**zenith snapshot import** --from basebackup://<...> or path +**neon snapshot import** --from basebackup://<...> or path Creates a new snapshot out of running postgres via basebackup protocol or basebackup files. -**zenith snapshot export** +**neon snapshot export** -Starts read-only postgres over this snapshot and exports data in some format (pg_dump, or COPY TO on some/all tables). One of the options may be zenith own format which is handy for us (but I think just tar of basebackup would be okay). +Starts read-only postgres over this snapshot and exports data in some format (pg_dump, or COPY TO on some/all tables). One of the options may be neon own format which is handy for us (but I think just tar of basebackup would be okay). -**zenith snapshot diff** snap1 snap2 +**neon snapshot diff** snap1 snap2 Shows size of data changed between two snapshots. We also may provide options to diff schema/data in tables. To do that start temp read-only postgreses. -**zenith snapshot destroy** +**neon snapshot destroy** ## pitr @@ -239,7 +239,7 @@ Pitr represents wal stream and ttl policy for that stream XXX: any suggestions on a better name? -**zenith pitr create** name +**neon pitr create** name --ttl = inf | period @@ -247,21 +247,21 @@ XXX: any suggestions on a better name? --storage = storage_name -**zenith pitr extract-snapshot** pitr_name --lsn xxx +**neon pitr extract-snapshot** pitr_name --lsn xxx Creates a snapshot out of some lsn in PITR area. The obtained snapshot may be managed with snapshot routines (move/send/export) -**zenith pitr gc** pitr_name +**neon pitr gc** pitr_name Force garbage collection on some PITR area. -**zenith pitr list** +**neon pitr list** -**zenith pitr destroy** +**neon pitr destroy** ## console -**zenith console** +**neon console** Opens browser targeted at web console with the more or less same functionality as described here. diff --git a/docs/rfcs/004-durability.md b/docs/rfcs/004-durability.md index d4716156d1..6b83c77403 100644 --- a/docs/rfcs/004-durability.md +++ b/docs/rfcs/004-durability.md @@ -6,7 +6,7 @@ When do we consider the WAL record as durable, so that we can acknowledge the commit to the client and be reasonably certain that we will not lose the transaction? -Zenith uses a group of WAL safekeeper nodes to hold the generated WAL. +Neon uses a group of WAL safekeeper nodes to hold the generated WAL. A WAL record is considered durable, when it has been written to a majority of WAL safekeeper nodes. In this document, I use 5 safekeepers, because I have five fingers. A WAL record is durable, diff --git a/docs/rfcs/005-zenith_local.md b/docs/rfcs/005-zenith_local.md index e36d0a9ae3..6c283d7a37 100644 --- a/docs/rfcs/005-zenith_local.md +++ b/docs/rfcs/005-zenith_local.md @@ -1,23 +1,23 @@ -# Zenith local +# Neon local -Here I list some objectives to keep in mind when discussing zenith-local design and a proposal that brings all components together. Your comments on both parts are very welcome. +Here I list some objectives to keep in mind when discussing neon-local design and a proposal that brings all components together. Your comments on both parts are very welcome. #### Why do we need it? - For distribution - this easy to use binary will help us to build adoption among developers. - For internal use - to test all components together. -In my understanding, we consider it to be just a mock-up version of zenith-cloud. 
+In my understanding, we consider it to be just a mock-up version of neon-cloud. > Question: How much should we care about durability and security issues for a local setup? #### Why is it better than a simple local postgres? -- Easy one-line setup. As simple as `cargo install zenith && zenith start` +- Easy one-line setup. As simple as `cargo install neon && neon start` - Quick and cheap creation of compute nodes over the same storage. > Question: How can we describe a use-case for this feature? -- Zenith-local can work with S3 directly. +- Neon-local can work with S3 directly. - Push and pull images (snapshots) to remote S3 to exchange data with other users. @@ -31,50 +31,50 @@ Ideally, just one binary that incorporates all elements we need. #### Components: -- **zenith-CLI** - interface for end-users. Turns commands to REST requests and handles responses to show them in a user-friendly way. -CLI proposal is here https://github.com/libzenith/rfcs/blob/003-laptop-cli.md/003-laptop-cli.md -WIP code is here: https://github.com/libzenith/postgres/tree/main/pageserver/src/bin/cli +- **neon-CLI** - interface for end-users. Turns commands to REST requests and handles responses to show them in a user-friendly way. +CLI proposal is here https://github.com/neondatabase/rfcs/blob/003-laptop-cli.md/003-laptop-cli.md +WIP code is here: https://github.com/neondatabase/postgres/tree/main/pageserver/src/bin/cli -- **zenith-console** - WEB UI with same functionality as CLI. +- **neon-console** - WEB UI with same functionality as CLI. >Note: not for the first release. -- **zenith-local** - entrypoint. Service that starts all other components and handles REST API requests. See REST API proposal below. - > Idea: spawn all other components as child processes, so that we could shutdown everything by stopping zenith-local. +- **neon-local** - entrypoint. Service that starts all other components and handles REST API requests. See REST API proposal below. + > Idea: spawn all other components as child processes, so that we could shutdown everything by stopping neon-local. -- **zenith-pageserver** - consists of a storage and WAL-replaying service (modified PG in current implementation). +- **neon-pageserver** - consists of a storage and WAL-replaying service (modified PG in current implementation). > Question: Probably, for local setup we should be able to bypass page-storage and interact directly with S3 to avoid double caching in shared buffers and page-server? -WIP code is here: https://github.com/libzenith/postgres/tree/main/pageserver/src +WIP code is here: https://github.com/neondatabase/postgres/tree/main/pageserver/src -- **zenith-S3** - stores base images of the database and WAL in S3 object storage. Import and export images from/to zenith. +- **neon-S3** - stores base images of the database and WAL in S3 object storage. Import and export images from/to neon. > Question: How should it operate in a local setup? Will we manage it ourselves or ask user to provide credentials for existing S3 object storage (i.e. minio)? > Question: Do we use it together with local page store or they are interchangeable? WIP code is ??? -- **zenith-safekeeper** - receives WAL from postgres, stores it durably, answers to Postgres that "sync" is succeed. +- **neon-safekeeper** - receives WAL from postgres, stores it durably, answers to Postgres that "sync" is succeed. > Question: How should it operate in a local setup? 
In my understanding it should push WAL directly to S3 (if we use it) or store all data locally (if we use local page storage). The latter option seems meaningless (extra overhead and no gain), but it is still good to test the system. -WIP code is here: https://github.com/libzenith/postgres/tree/main/src/bin/safekeeper +WIP code is here: https://github.com/neondatabase/postgres/tree/main/src/bin/safekeeper -- **zenith-computenode** - bottomless PostgreSQL, ideally upstream, but for a start - our modified version. User can quickly create and destroy them and work with it as a regular postgres database. +- **neon-computenode** - bottomless PostgreSQL, ideally upstream, but for a start - our modified version. User can quickly create and destroy them and work with it as a regular postgres database. - WIP code is in main branch and here: https://github.com/libzenith/postgres/commits/compute_node + WIP code is in main branch and here: https://github.com/neondatabase/postgres/commits/compute_node #### REST API: Service endpoint: `http://localhost:3000` Resources: -- /storages - Where data lives: zenith-pageserver or zenith-s3 -- /pgs - Postgres - zenith-computenode +- /storages - Where data lives: neon-pageserver or neon-s3 +- /pgs - Postgres - neon-computenode - /snapshots - snapshots **TODO** ->Question: Do we want to extend this API to manage zenith components? I.e. start page-server, manage safekeepers and so on? Or they will be hardcoded to just start once and for all? +>Question: Do we want to extend this API to manage neon components? I.e. start page-server, manage safekeepers and so on? Or they will be hardcoded to just start once and for all? Methods and their mapping to CLI: -- /storages - zenith-pageserver or zenith-s3 +- /storages - neon-pageserver or neon-s3 CLI | REST API ------------- | ------------- @@ -84,7 +84,7 @@ storage list | GET /storages storage show -n name | GET /storages/:storage_name -- /pgs - zenith-computenode +- /pgs - neon-computenode CLI | REST API ------------- | ------------- diff --git a/docs/rfcs/006-laptop-cli-v2-CLI.md b/docs/rfcs/006-laptop-cli-v2-CLI.md index 84dc932211..5030ecc7e7 100644 --- a/docs/rfcs/006-laptop-cli-v2-CLI.md +++ b/docs/rfcs/006-laptop-cli-v2-CLI.md @@ -1,45 +1,45 @@ -Zenith CLI allows you to operate database clusters (catalog clusters) and their commit history locally and in the cloud. Since ANSI calls them catalog clusters and cluster is a loaded term in the modern infrastructure we will call it "catalog". +Neon CLI allows you to operate database clusters (catalog clusters) and their commit history locally and in the cloud. Since ANSI calls them catalog clusters and cluster is a loaded term in the modern infrastructure we will call it "catalog". # CLI v2 (after chatting with Carl) -Zenith introduces the notion of a repository. +Neon introduces the notion of a repository. 
```bash -zenith init -zenith clone zenith://zenith.tech/piedpiper/northwind -- clones a repo to the northwind directory +neon init +neon clone neon://neon.tech/piedpiper/northwind -- clones a repo to the northwind directory ``` Once you have a cluster catalog you can explore it ```bash -zenith log -- returns a list of commits -zenith status -- returns if there are changes in the catalog that can be committed -zenith commit -- commits the changes and generates a new commit hash -zenith branch experimental -- creates a branch called testdb based on a given commit hash +neon log -- returns a list of commits +neon status -- returns if there are changes in the catalog that can be committed +neon commit -- commits the changes and generates a new commit hash +neon branch experimental -- creates a branch called testdb based on a given commit hash ``` To make changes in the catalog you need to run compute nodes ```bash -- here is how you a compute node -zenith start /home/pipedpiper/northwind:main -- starts a compute instance -zenith start zenith://zenith.tech/northwind:main -- starts a compute instance in the cloud +neon start /home/pipedpiper/northwind:main -- starts a compute instance +neon start neon://neon.tech/northwind:main -- starts a compute instance in the cloud -- you can start a compute node against any hash or branch -zenith start /home/pipedpiper/northwind:experimental --port 8008 -- start another compute instance (on different port) +neon start /home/pipedpiper/northwind:experimental --port 8008 -- start another compute instance (on different port) -- you can start a compute node against any hash or branch -zenith start /home/pipedpiper/northwind: --port 8009 -- start another compute instance (on different port) +neon start /home/pipedpiper/northwind: --port 8009 -- start another compute instance (on different port) -- After running some DML you can run --- zenith status and see how there are two WAL streams one on top of +-- neon status and see how there are two WAL streams one on top of -- the main branch -zenith status +neon status -- and another on top of the experimental branch -zenith status -b experimental +neon status -b experimental -- you can commit each branch separately -zenith commit main +neon commit main -- or -zenith commit -c /home/pipedpiper/northwind:experimental +neon commit -c /home/pipedpiper/northwind:experimental ``` Starting compute instances against cloud environments @@ -47,18 +47,18 @@ Starting compute instances against cloud environments ```bash -- you can start a compute instance against the cloud environment -- in this case all of the changes will be streamed into the cloud -zenith start https://zenith:tech/pipedpiper/northwind:main -zenith start https://zenith:tech/pipedpiper/northwind:main -zenith status -c https://zenith:tech/pipedpiper/northwind:main -zenith commit -c https://zenith:tech/pipedpiper/northwind:main -zenith branch -c https://zenith:tech/pipedpiper/northwind: experimental +neon start https://neon:tecj/pipedpiper/northwind:main +neon start https://neon:tecj/pipedpiper/northwind:main +neon status -c https://neon:tecj/pipedpiper/northwind:main +neon commit -c https://neon:tecj/pipedpiper/northwind:main +neon branch -c https://neon:tecj/pipedpiper/northwind: experimental ``` Pushing data into the cloud ```bash -- pull all the commits from the cloud -zenith pull +neon pull -- push all the commits to the cloud -zenith push +neon push ``` diff --git a/docs/rfcs/006-laptop-cli-v2-repository-structure.md 
b/docs/rfcs/006-laptop-cli-v2-repository-structure.md index e6e6e172ad..749a940313 100644 --- a/docs/rfcs/006-laptop-cli-v2-repository-structure.md +++ b/docs/rfcs/006-laptop-cli-v2-repository-structure.md @@ -1,14 +1,14 @@ # Repository format -A Zenith repository is similar to a traditional PostgreSQL backup +A Neon repository is similar to a traditional PostgreSQL backup archive, like a WAL-G bucket or pgbarman backup catalogue. It holds multiple versions of a PostgreSQL database cluster. -The distinguishing feature is that you can launch a Zenith Postgres +The distinguishing feature is that you can launch a Neon Postgres server directly against a branch in the repository, without having to -"restore" it first. Also, Zenith manages the storage automatically, +"restore" it first. Also, Neon manages the storage automatically, there is no separation between full and incremental backups nor WAL -archive. Zenith relies heavily on the WAL, and uses concepts similar +archive. Neon relies heavily on the WAL, and uses concepts similar to incremental backups and WAL archiving internally, but it is hidden from the user. @@ -19,15 +19,15 @@ efficient. Just something to get us started. The repository directory looks like this: - .zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/wal/ - .zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/snapshots// - .zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/history + .neon/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/wal/ + .neon/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/snapshots// + .neon/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/history - .zenith/refs/branches/mybranch - .zenith/refs/tags/foo - .zenith/refs/tags/bar + .neon/refs/branches/mybranch + .neon/refs/tags/foo + .neon/refs/tags/bar - .zenith/datadirs/ + .neon/datadirs/ ### Timelines @@ -39,7 +39,7 @@ All WAL is generated on a timeline. You can launch a read-only node against a tag or arbitrary LSN on a timeline, but in order to write, you need to create a timeline. -Each timeline is stored in a directory under .zenith/timelines. It +Each timeline is stored in a directory under .neon/timelines. It consists of a WAL archive, containing all the WAL in the standard PostgreSQL format, under the wal/ subdirectory. @@ -66,18 +66,18 @@ contains the UUID of the timeline (and LSN, for tags). ### Datadirs -.zenith/datadirs contains PostgreSQL data directories. You can launch +.neon/datadirs contains PostgreSQL data directories. You can launch a Postgres instance on one of them with: ``` - postgres -D .zenith/datadirs/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c + postgres -D .neon/datadirs/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c ``` All the actual data is kept in the timeline directories, under -.zenith/timelines. The data directories are only needed for active +.neon/timelines. The data directories are only needed for active PostgreQSL instances. After an instance is stopped, the data directory -can be safely removed. "zenith start" will recreate it quickly from -the data in .zenith/timelines, if it's missing. +can be safely removed. "neon start" will recreate it quickly from +the data in .neon/timelines, if it's missing. ## Version 2 @@ -103,14 +103,14 @@ more advanced. The exact format is TODO. But it should support: ### Garbage collection -When you run "zenith gc", old timelines that are no longer needed are +When you run "neon gc", old timelines that are no longer needed are removed. 
That involves collecting the list of "unreachable" objects, starting from the named branches and tags. Also, if enough WAL has been generated on a timeline since last snapshot, a new snapshot or delta is created. -### zenith push/pull +### neon push/pull Compare the tags and branches on both servers, and copy missing ones. For each branch, compare the timeline it points to in both servers. If @@ -123,7 +123,7 @@ every time you start up an instance? Then you would detect that the timelines have diverged. That would match with the "epoch" concept that we have in the WAL safekeeper -### zenith checkout/commit +### neon checkout/commit In this format, there is no concept of a "working tree", and hence no concept of checking out or committing. All modifications are done on @@ -134,7 +134,7 @@ You can easily fork off a temporary timeline to emulate a "working tree". You can later remove it and have it garbage collected, or to "commit", re-point the branch to the new timeline. -If we want to have a worktree and "zenith checkout/commit" concept, we can +If we want to have a worktree and "neon checkout/commit" concept, we can emulate that with a temporary timeline. Create the temporary timeline at -"zenith checkout", and have "zenith commit" modify the branch to point to +"neon checkout", and have "neon commit" modify the branch to point to the new timeline. diff --git a/docs/rfcs/007-serverless-on-laptop.md b/docs/rfcs/007-serverless-on-laptop.md index e6355f4a03..96f117bfe9 100644 --- a/docs/rfcs/007-serverless-on-laptop.md +++ b/docs/rfcs/007-serverless-on-laptop.md @@ -4,27 +4,27 @@ How it works now 1. Create repository, start page server on it ``` -$ zenith init +$ neon init ... created main branch -new zenith repository was created in .zenith +new neon repository was created in .neon -$ zenith pageserver start -Starting pageserver at '127.0.0.1:64000' in .zenith +$ neon pageserver start +Starting pageserver at '127.0.0.1:64000' in .neon Page server started ``` 2. Create a branch, and start a Postgres instance on it ``` -$ zenith branch heikki main +$ neon branch heikki main branching at end of WAL: 0/15ECF68 -$ zenith pg create heikki +$ neon pg create heikki Initializing Postgres on timeline 76cf9279915be7797095241638e64644... -Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/pg1 port=55432 +Extracting base backup to create postgres instance: path=.neon/pgdatadirs/pg1 port=55432 -$ zenith pg start pg1 +$ neon pg start pg1 Starting postgres node at 'host=127.0.0.1 port=55432 user=heikki' waiting for server to start.... done server started @@ -52,20 +52,20 @@ serverless on your laptop, so that the workflow becomes just: 1. Create repository, start page server on it (same as before) ``` -$ zenith init +$ neon init ... created main branch -new zenith repository was created in .zenith +new neon repository was created in .neon -$ zenith pageserver start -Starting pageserver at '127.0.0.1:64000' in .zenith +$ neon pageserver start +Starting pageserver at '127.0.0.1:64000' in .neon Page server started ``` 2. Create branch ``` -$ zenith branch heikki main +$ neon branch heikki main branching at end of WAL: 0/15ECF68 ``` diff --git a/docs/rfcs/008-push-pull.md b/docs/rfcs/008-push-pull.md index 272628e1ce..a36932222a 100644 --- a/docs/rfcs/008-push-pull.md +++ b/docs/rfcs/008-push-pull.md @@ -7,22 +7,22 @@ Here is a proposal about implementing push/pull mechanics between pageservers. W The origin represents connection info for some remote pageserver. 
Let's use here same commands as git uses except using explicit list subcommand (git uses `origin -v` for that). ``` -zenith origin add -zenith origin list -zenith origin remove +neon origin add +neon origin list +neon origin remove ``` Connection URI a string of form `postgresql://user:pass@hostname:port` (https://www.postgresql.org/docs/13/libpq-connect.html#id-1.7.3.8.3.6). We can start with libpq password auth and later add support for client certs or require ssh as transport or invent some other kind of transport. -Behind the scenes, this commands may update toml file inside .zenith directory. +Behind the scenes, this commands may update toml file inside .neon directory. ## Push ### Pushing branch ``` -zenith push mybranch cloudserver # push to eponymous branch in cloudserver -zenith push mybranch cloudserver:otherbranch # push to a different branch in cloudserver +neon push mybranch cloudserver # push to eponymous branch in cloudserver +neon push mybranch cloudserver:otherbranch # push to a different branch in cloudserver ``` Exact mechanics would be slightly different in the following situations: diff --git a/docs/rfcs/009-snapshot-first-storage-cli.md b/docs/rfcs/009-snapshot-first-storage-cli.md index 0acbd68f86..bbd0f75fe2 100644 --- a/docs/rfcs/009-snapshot-first-storage-cli.md +++ b/docs/rfcs/009-snapshot-first-storage-cli.md @@ -2,7 +2,7 @@ While working on export/import commands, I understood that they fit really well We may think about backups as snapshots in a different format (i.e plain pgdata format, basebackup tar format, WAL-G format (if they want to support it) and so on). They use same storage API, the only difference is the code that packs/unpacks files. -Even if zenith aims to maintains durability using it's own snapshots, backups will be useful for uploading data from postgres to zenith. +Even if neon aims to maintains durability using it's own snapshots, backups will be useful for uploading data from postgres to neon. So here is an attempt to design consistent CLI for different usage scenarios: @@ -16,8 +16,8 @@ Save`storage_dest` and other parameters in config. Push snapshots to `storage_dest` in background. ``` -zenith init --storage_dest=S3_PREFIX -zenith start +neon init --storage_dest=S3_PREFIX +neon start ``` #### 2. Restart pageserver (manually or crash-recovery). @@ -25,7 +25,7 @@ Take `storage_dest` from pageserver config, start pageserver from latest snapsho Push snapshots to `storage_dest` in background. ``` -zenith start +neon start ``` #### 3. Import. @@ -35,22 +35,22 @@ Do not save `snapshot_path` and `snapshot_format` in config, as it is a one-time Save`storage_dest` parameters in config. Push snapshots to `storage_dest` in background. ``` -//I.e. we want to start zenith on top of existing $PGDATA and use s3 as a persistent storage. -zenith init --snapshot_path=FILE_PREFIX --snapshot_format=pgdata --storage_dest=S3_PREFIX -zenith start +//I.e. we want to start neon on top of existing $PGDATA and use s3 as a persistent storage. +neon init --snapshot_path=FILE_PREFIX --snapshot_format=pgdata --storage_dest=S3_PREFIX +neon start ``` How to pass credentials needed for `snapshot_path`? #### 4. Export. Manually push snapshot to `snapshot_path` which differs from `storage_dest` -Optionally set `snapshot_format`, which can be plain pgdata format or zenith format. +Optionally set `snapshot_format`, which can be plain pgdata format or neon format. 
``` -zenith export --snapshot_path=FILE_PREFIX --snapshot_format=pgdata +neon export --snapshot_path=FILE_PREFIX --snapshot_format=pgdata ``` #### Notes and questions - safekeeper s3_offload should use same (similar) syntax for storage. How to set it in UI? -- Why do we need `zenith init` as a separate command? Can't we init everything at first start? +- Why do we need `neon init` as a separate command? Can't we init everything at first start? - We can think of better names for all options. - Export to plain postgres format will be useless, if we are not 100% compatible on page level. I can recall at least one such difference - PD_WAL_LOGGED flag in pages. diff --git a/docs/rfcs/013-term-history.md b/docs/rfcs/013-term-history.md index 7e815abf73..2f3ccbc09b 100644 --- a/docs/rfcs/013-term-history.md +++ b/docs/rfcs/013-term-history.md @@ -9,7 +9,7 @@ receival and this might lag behind `term`; safekeeper switches to epoch `n` when it has received all committed log records from all `< n` terms. This roughly corresponds to proposed in -https://github.com/zenithdb/rfcs/pull/3/files +https://github.com/neondatabase/rfcs/pull/3/files This makes our biggest our difference from Raft. In Raft, every log record is diff --git a/docs/rfcs/014-safekeepers-gossip.md b/docs/rfcs/014-safekeepers-gossip.md index 3d6cc04b94..ff38a0a0ef 100644 --- a/docs/rfcs/014-safekeepers-gossip.md +++ b/docs/rfcs/014-safekeepers-gossip.md @@ -1,6 +1,6 @@ # Safekeeper gossip -Extracted from this [PR](https://github.com/zenithdb/rfcs/pull/13) +Extracted from this [PR](https://github.com/neondatabase/rfcs/pull/13) ## Motivation diff --git a/docs/rfcs/015-storage-messaging.md b/docs/rfcs/015-storage-messaging.md index a415b90459..7702311d65 100644 --- a/docs/rfcs/015-storage-messaging.md +++ b/docs/rfcs/015-storage-messaging.md @@ -2,7 +2,7 @@ Created on 19.01.22 -Initially created [here](https://github.com/zenithdb/rfcs/pull/16) by @kelvich. +Initially created [here](https://github.com/neondatabase/rfcs/pull/16) by @kelvich. That it is an alternative to (014-safekeeper-gossip)[] @@ -292,4 +292,4 @@ But with an etcd we are in a bit different situation: 1. We don't need persistency and strong consistency guarantees for the data we store in the etcd 2. etcd uses Grpc as a protocol, and messages are pretty simple -So it looks like implementing in-mem store with etcd interface is straightforward thing _if we will want that in future_. At the same time, we can avoid implementing it right now, and we will be able to run local zenith installation with etcd running somewhere in the background (as opposed to building and running console, which in turn requires Postgres). +So it looks like implementing in-mem store with etcd interface is straightforward thing _if we will want that in future_. At the same time, we can avoid implementing it right now, and we will be able to run local neon installation with etcd running somewhere in the background (as opposed to building and running console, which in turn requires Postgres). From 6e46204712a68e34b40caaa9cf01c7f4141ab0a1 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 4 Mar 2024 12:08:44 +0000 Subject: [PATCH 0324/1571] CI(deploy): use separate workflow for proxy deploys (#6995) ## Problem The current implementation of `deploy-prod` workflow doesn't allow to run parallel deploys on Storage and Proxy. ## Summary of changes - Call `deploy-proxy-prod` workflow that deploys only Proxy components, and that can be run in parallel with `deploy-prod` for Storage. 
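Before the workflow change itself, a hedged sketch of what the new dispatch amounts to: the snippet below assumes an authenticated `gh` CLI and simply wraps the same `gh workflow run` invocation that the diff adds, so a proxy-only deploy could also be kicked off by hand while a storage deploy is in flight. The function name and the `build_tag` parameter are illustrative, not part of the change.

```python
import subprocess


def trigger_proxy_deploy(build_tag: str) -> None:
    # Dispatch the proxy-only workflow in the deploy repository with the same
    # inputs the CI job passes; being a separate workflow file, it does not
    # have to wait for the storage deploy-prod workflow to finish.
    subprocess.run(
        [
            "gh", "workflow", "run", "deploy-proxy-prod.yml",
            "--repo", "neondatabase/aws",
            "--ref", "main",
            "-f", "deployPgSniRouter=true",
            "-f", "deployProxy=true",
            "-f", "branch=main",
            "-f", f"dockerTag={build_tag}",
        ],
        check=True,
    )
```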
--- .github/workflows/build_and_test.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 2e52e7c28f..276c71c6e0 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1132,11 +1132,9 @@ jobs: -f branch=main \ -f dockerTag=${{needs.tag.outputs.build-tag}} elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then - gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \ + gh workflow --repo neondatabase/aws run deploy-proxy-prod.yml --ref main \ -f deployPgSniRouter=true \ -f deployProxy=true \ - -f deployStorage=false \ - -f deployStorageBroker=false \ -f branch=main \ -f dockerTag=${{needs.tag.outputs.build-tag}} else From c861d71eeb6d3acfc4c99ced41dd0df778cda802 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 4 Mar 2024 13:18:22 +0100 Subject: [PATCH 0325/1571] layer file creation: fatal_err on timeline dir fsync (#6985) As pointed out in the comments added in this PR: the in-memory state of the filesystem already has the layer file in its final place. If the fsync fails, but pageserver continues to execute, it's quite easy for subsequent pageserver code to observe the file being there and assume it's durable, when it really isn't. It can happen that we get ENOSPC during the fsync. However, 1. the timeline dir is small (remember, the big layer _file_ has already been synced). Small data means ENOSPC due to delayed allocation races etc are less likely. 2. what else are we going to do in that case? If we decide to bubble up the error, the file remains on disk. We could try to unlink it and fsync after the unlink. If that fails, we would _definitely_ need to error out. Is it worth the trouble though? Side note: all this logic about not carrying on after fsync failure implies that we `sync` the filesystem successfully before we restart the pageserver. We don't do that right now, but should (=> https://github.com/neondatabase/neon/issues/6989) part of https://github.com/neondatabase/neon/issues/6663 --- pageserver/src/tenant/timeline.rs | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 0c03ef33c3..0a2ae5d8bd 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -50,7 +50,6 @@ use tokio_util::sync::CancellationToken; use tracing::*; use utils::sync::gate::{Gate, GateGuard}; -use crate::pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind}; use crate::tenant::timeline::logical_size::CurrentLogicalSize; use crate::tenant::{ layer_map::{LayerMap, SearchResult}, @@ -75,6 +74,10 @@ use crate::{ disk_usage_eviction_task::EvictionCandidate, tenant::storage_layer::delta_layer::DeltaEntry, }; use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind}; +use crate::{ + pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind}, + virtual_file::MaybeFatalIo, +}; use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace}; @@ -3426,10 +3429,14 @@ impl Timeline { // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes. // We just need to fsync the directory in which these inodes are linked, // which we know to be the timeline directory. 
+ // + // We use fatal_err() below because the after write_to_disk returns with success, + // the in-memory state of the filesystem already has the layer file in its final place, + // and subsequent pageserver code could think it's durable while it really isn't. par_fsync::par_fsync(&[self_clone .conf .timeline_path(&self_clone.tenant_shard_id, &self_clone.timeline_id)]) - .context("fsync of timeline dir")?; + .fatal_err("fsync of timeline dir"); anyhow::Ok(new_delta) } @@ -3662,11 +3669,14 @@ impl Timeline { // We just need to fsync the directory in which these inodes are linked, // which we know to be the timeline directory. if !image_layers.is_empty() { + // We use fatal_err() below because the after writer.finish() returns with success, + // the in-memory state of the filesystem already has the layer file in its final place, + // and subsequent pageserver code could think it's durable while it really isn't. par_fsync::par_fsync_async(&[self .conf .timeline_path(&self.tenant_shard_id, &self.timeline_id)]) .await - .context("fsync of timeline dir")?; + .fatal_err("fsync of timeline dir"); } let mut guard = self.layers.write().await; @@ -4251,12 +4261,16 @@ impl Timeline { // The writer.finish() above already did the fsync of the inodes. // We just need to fsync the directory in which these inodes are linked, // which we know to be the timeline directory. + // + // We use fatal_err() below because the after writer.finish() returns with success, + // the in-memory state of the filesystem already has the layer file in its final place, + // and subsequent pageserver code could think it's durable while it really isn't. let timeline_dir = self .conf .timeline_path(&self.tenant_shard_id, &self.timeline_id); par_fsync::par_fsync_async(&[timeline_dir]) .await - .context("fsync of timeline dir")?; + .fatal_err("fsync of timeline dir"); } stats.write_layer_files_micros = stats.read_lock_drop_micros.till_now(); From e1c032fb3ccabf61f5d41301cedbbb11a3d303a6 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Mon, 4 Mar 2024 17:26:16 +0400 Subject: [PATCH 0326/1571] Fix type (#6998) ## Problem Typo ## Summary of changes Fix --- .github/workflows/release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 80a718d61a..b2c9a19588 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -97,7 +97,7 @@ jobs: **Please merge this Pull Request using 'Create a merge commit' button** EOF - gh pr create --title "Proxy release ${RELEASE_DATE}}" \ + gh pr create --title "Proxy release ${RELEASE_DATE}" \ --body-file "body.md" \ --head "${RELEASE_BRANCH}" \ --base "release-proxy" From 944cac950d9a151d7408f544952c4fdabb9cc9dd Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 4 Mar 2024 14:31:09 +0100 Subject: [PATCH 0327/1571] layer file creation: fsync timeline directories using `VirtualFile::sync_all()` (#6986) Except for the involvement of the VirtualFile fd cache, this is equivalent to what happened before at runtime. Future PR https://github.com/neondatabase/neon/pull/6378 will implement `VirtualFile::sync_all()` using tokio-epoll-uring if that's configured as the io engine. This PR is preliminary work for that. 
part of https://github.com/neondatabase/neon/issues/6663 --- pageserver/src/tenant.rs | 1 - pageserver/src/tenant/par_fsync.rs | 84 ------------------------------ pageserver/src/tenant/timeline.rs | 79 ++++++++++++++++------------ 3 files changed, 46 insertions(+), 118 deletions(-) delete mode 100644 pageserver/src/tenant/par_fsync.rs diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 4158133111..3423b50eaa 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -151,7 +151,6 @@ pub(crate) mod ephemeral_file; pub mod layer_map; pub mod metadata; -mod par_fsync; pub mod remote_timeline_client; pub mod storage_layer; diff --git a/pageserver/src/tenant/par_fsync.rs b/pageserver/src/tenant/par_fsync.rs deleted file mode 100644 index 3acb0fb431..0000000000 --- a/pageserver/src/tenant/par_fsync.rs +++ /dev/null @@ -1,84 +0,0 @@ -use std::{ - io, - sync::atomic::{AtomicUsize, Ordering}, -}; - -use camino::{Utf8Path, Utf8PathBuf}; - -fn fsync_path(path: &Utf8Path) -> io::Result<()> { - // TODO use VirtualFile::fsync_all once we fully go async. - let file = std::fs::File::open(path)?; - file.sync_all() -} - -fn parallel_worker(paths: &[Utf8PathBuf], next_path_idx: &AtomicUsize) -> io::Result<()> { - while let Some(path) = paths.get(next_path_idx.fetch_add(1, Ordering::Relaxed)) { - fsync_path(path)?; - } - - Ok(()) -} - -fn fsync_in_thread_pool(paths: &[Utf8PathBuf]) -> io::Result<()> { - // TODO: remove this function in favor of `par_fsync_async` once we asyncify everything. - - /// Use at most this number of threads. - /// Increasing this limit will - /// - use more memory - /// - increase the cost of spawn/join latency - const MAX_NUM_THREADS: usize = 64; - let num_threads = paths.len().min(MAX_NUM_THREADS); - let next_path_idx = AtomicUsize::new(0); - - std::thread::scope(|s| -> io::Result<()> { - let mut handles = vec![]; - // Spawn `num_threads - 1`, as the current thread is also a worker. - for _ in 1..num_threads { - handles.push(s.spawn(|| parallel_worker(paths, &next_path_idx))); - } - - parallel_worker(paths, &next_path_idx)?; - - for handle in handles { - handle.join().unwrap()?; - } - - Ok(()) - }) -} - -/// Parallel fsync all files. Can be used in non-async context as it is using rayon thread pool. -pub fn par_fsync(paths: &[Utf8PathBuf]) -> io::Result<()> { - if paths.len() == 1 { - fsync_path(&paths[0])?; - return Ok(()); - } - - fsync_in_thread_pool(paths) -} - -/// Parallel fsync asynchronously. 
-pub async fn par_fsync_async(paths: &[Utf8PathBuf]) -> io::Result<()> { - const MAX_CONCURRENT_FSYNC: usize = 64; - let mut next = paths.iter().peekable(); - let mut js = tokio::task::JoinSet::new(); - loop { - while js.len() < MAX_CONCURRENT_FSYNC && next.peek().is_some() { - let next = next.next().expect("just peeked"); - let next = next.to_owned(); - js.spawn_blocking(move || fsync_path(&next)); - } - - // now the joinset has been filled up, wait for next to complete - if let Some(res) = js.join_next().await { - res??; - } else { - // last item had already completed - assert!( - next.peek().is_none(), - "joinset emptied, we shouldn't have more work" - ); - return Ok(()); - } - } -} diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 0a2ae5d8bd..64c324a5c8 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -54,7 +54,6 @@ use crate::tenant::timeline::logical_size::CurrentLogicalSize; use crate::tenant::{ layer_map::{LayerMap, SearchResult}, metadata::TimelineMetadata, - par_fsync, }; use crate::{ context::{AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder}, @@ -76,7 +75,7 @@ use crate::{ use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind}; use crate::{ pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind}, - virtual_file::MaybeFatalIo, + virtual_file::{MaybeFatalIo, VirtualFile}, }; use crate::config::PageServerConf; @@ -3417,28 +3416,31 @@ impl Timeline { let frozen_layer = Arc::clone(frozen_layer); let ctx = ctx.attached_child(); move || { - // Write it out - // Keep this inside `spawn_blocking` and `Handle::current` - // as long as the write path is still sync and the read impl - // is still not fully async. Otherwise executor threads would - // be blocked. - let _g = span.entered(); - let new_delta = - Handle::current().block_on(frozen_layer.write_to_disk(&self_clone, &ctx))?; - - // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes. - // We just need to fsync the directory in which these inodes are linked, - // which we know to be the timeline directory. - // - // We use fatal_err() below because the after write_to_disk returns with success, - // the in-memory state of the filesystem already has the layer file in its final place, - // and subsequent pageserver code could think it's durable while it really isn't. - par_fsync::par_fsync(&[self_clone - .conf - .timeline_path(&self_clone.tenant_shard_id, &self_clone.timeline_id)]) - .fatal_err("fsync of timeline dir"); - - anyhow::Ok(new_delta) + Handle::current().block_on( + async move { + let new_delta = frozen_layer.write_to_disk(&self_clone, &ctx).await?; + // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes. + // We just need to fsync the directory in which these inodes are linked, + // which we know to be the timeline directory. + // + // We use fatal_err() below because the after write_to_disk returns with success, + // the in-memory state of the filesystem already has the layer file in its final place, + // and subsequent pageserver code could think it's durable while it really isn't. 
+ let timeline_dir = + VirtualFile::open(&self_clone.conf.timeline_path( + &self_clone.tenant_shard_id, + &self_clone.timeline_id, + )) + .await + .fatal_err("VirtualFile::open for timeline dir fsync"); + timeline_dir + .sync_all() + .await + .fatal_err("VirtualFile::sync_all timeline dir"); + anyhow::Ok(new_delta) + } + .instrument(span), + ) } }) .await @@ -3672,11 +3674,17 @@ impl Timeline { // We use fatal_err() below because the after writer.finish() returns with success, // the in-memory state of the filesystem already has the layer file in its final place, // and subsequent pageserver code could think it's durable while it really isn't. - par_fsync::par_fsync_async(&[self - .conf - .timeline_path(&self.tenant_shard_id, &self.timeline_id)]) + let timeline_dir = VirtualFile::open( + &self + .conf + .timeline_path(&self.tenant_shard_id, &self.timeline_id), + ) .await - .fatal_err("fsync of timeline dir"); + .fatal_err("VirtualFile::open for timeline dir fsync"); + timeline_dir + .sync_all() + .await + .fatal_err("VirtualFile::sync_all timeline dir"); } let mut guard = self.layers.write().await; @@ -4265,12 +4273,17 @@ impl Timeline { // We use fatal_err() below because the after writer.finish() returns with success, // the in-memory state of the filesystem already has the layer file in its final place, // and subsequent pageserver code could think it's durable while it really isn't. - let timeline_dir = self - .conf - .timeline_path(&self.tenant_shard_id, &self.timeline_id); - par_fsync::par_fsync_async(&[timeline_dir]) + let timeline_dir = VirtualFile::open( + &self + .conf + .timeline_path(&self.tenant_shard_id, &self.timeline_id), + ) + .await + .fatal_err("VirtualFile::open for timeline dir fsync"); + timeline_dir + .sync_all() .await - .fatal_err("fsync of timeline dir"); + .fatal_err("VirtualFile::sync_all timeline dir"); } stats.write_layer_files_micros = stats.read_lock_drop_micros.till_now(); From e938bb815763d1980540c8fa84781e160688d44a Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Mon, 4 Mar 2024 09:17:14 -0500 Subject: [PATCH 0328/1571] fix epic issue template (#6920) The template does not parse on GitHub --- .github/ISSUE_TEMPLATE/epic-template.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/epic-template.md b/.github/ISSUE_TEMPLATE/epic-template.md index 019e6e7345..c442f50fde 100644 --- a/.github/ISSUE_TEMPLATE/epic-template.md +++ b/.github/ISSUE_TEMPLATE/epic-template.md @@ -16,9 +16,9 @@ assignees: '' ## Implementation ideas - +## Tasks ```[tasklist] -### Tasks +- [ ] Example Task ``` From f0be9400f25cfbad356f5417e199325d2c12f7df Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 4 Mar 2024 15:47:13 +0100 Subject: [PATCH 0329/1571] fix(test_remote_storage_upload_queue_retries): became flakier since #6960 (#6999) This PR increases the `wait_until` timeout. These are where things became more flaky as of https://github.com/neondatabase/neon/pull/6960. Most likely because it doubles the work in the `churn_while_failpoints_active_thread`. 
Slack context: https://neondb.slack.com/archives/C033RQ5SPDH/p1709554455962959?thread_ts=1709286362.850549&cid=C033RQ5SPDH --- test_runner/regress/test_remote_storage.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index f8a0bef954..06c13cc07d 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -329,14 +329,15 @@ def test_remote_storage_upload_queue_retries( churn_while_failpoints_active_thread.start() # wait for churn thread's data to get stuck in the upload queue - wait_until(10, 0.5, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0)) - wait_until(10, 0.5, lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 2)) - wait_until(10, 0.5, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="delete"), 0)) + # Exponential back-off in upload queue, so, gracious timeouts. + + wait_until(30, 1, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0)) + wait_until(30, 1, lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 2)) + wait_until(30, 1, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="delete"), 0)) # unblock churn operations configure_storage_sync_failpoints("off") - # ... and wait for them to finish. Exponential back-off in upload queue, so, gracious timeouts. wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0)) wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0)) wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0)) From 0d2395fe96dfadaea3b026990b5a77aa4a72c0e4 Mon Sep 17 00:00:00 2001 From: Roman Zaynetdinov Date: Mon, 4 Mar 2024 18:02:10 +0200 Subject: [PATCH 0330/1571] Update postgres-exporter to v0.12.1 (#7004) Fixes https://github.com/neondatabase/neon/issues/6996 Thanks to @bayandin --- vm-image-spec.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 4520a5fc9c..a04dac6336 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -176,7 +176,7 @@ build: | # actually build the thing... && make install - FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.0 AS postgres-exporter + FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter FROM burningalchemist/sql_exporter:0.13 AS sql-exporter From 191d8ac7e044e867b07f5007b783d00d0a87be45 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 4 Mar 2024 16:04:12 +0000 Subject: [PATCH 0331/1571] vm-image: update pgbouncer from 1.22.0 to 1.22.1 (#7005) pgbouncer 1.22.1 has been released > This release fixes issues caused by some clients using COPY FROM STDIN queries. Such queries could introduce memory leaks, performance regressions and prepared statement misbehavior. 
- NEWS: https://www.pgbouncer.org/2024/03/pgbouncer-1-22-1 - CHANGES: https://github.com/pgbouncer/pgbouncer/compare/pgbouncer_1_22_0...pgbouncer_1_22_1 ## Summary of changes - vm-image: update pgbouncer from 1.22.0 to 1.22.1 --- vm-image-spec.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index a04dac6336..c1b7ad533a 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -193,7 +193,7 @@ build: | pkg-config # Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc) - ENV PGBOUNCER_TAG pgbouncer_1_22_0 + ENV PGBOUNCER_TAG pgbouncer_1_22_1 RUN set -e \ && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \ && cd pgbouncer \ From e62baa97041e10ce45772b3724e24e679a650d69 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 4 Mar 2024 18:36:29 +0100 Subject: [PATCH 0332/1571] upgrade tokio 1.34 => 1.36 (#7008) tokio 1.36 has been out for a month. Release notes don't indicate major changes. Skimming through their issue tracker, I can't find open `C-bug` issues that would affect us. (My personal motivation for this is `JoinSet::try_join_next`.) --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c23162971e..f937f3a372 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5810,9 +5810,9 @@ dependencies = [ [[package]] name = "tokio" -version = "1.34.0" +version = "1.36.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0c014766411e834f7af5b8f4cf46257aab4036ca95e9d2c144a10f59ad6f5b9" +checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931" dependencies = [ "backtrace", "bytes", From 3dfae4be8d5aba629e42ba4ae69017e4b4979350 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 4 Mar 2024 19:16:07 +0000 Subject: [PATCH 0333/1571] upgrade mio 0.8.10 => 0.8.11 (#7009) ## Problem `cargo deny` fails - https://rustsec.org/advisories/RUSTSEC-2024-0019 - https://github.com/tokio-rs/mio/security/advisories/GHSA-r8w9-5wcg-vfj7 > The vulnerability is Windows-specific, and can only happen if you are using named pipes. Other IO resources are not affected. ## Summary of changes - Upgrade `mio` from 0.8.10 to 0.8.11 (`cargo update -p mio`) --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f937f3a372..864e5c9046 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2959,9 +2959,9 @@ dependencies = [ [[package]] name = "mio" -version = "0.8.10" +version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f3d0b296e374a4e6f3c7b0a1f5a51d748a0d34c85e7dc48fc3fa9a87657fe09" +checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" dependencies = [ "libc", "log", From b7db912be6296bb2569a1162892b6d047702afbf Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Mon, 4 Mar 2024 14:28:45 -0500 Subject: [PATCH 0334/1571] compute_ctl: only try zenith_admin if could not authenticate (#6955) ## Problem Fix https://github.com/neondatabase/neon/issues/6498 ## Summary of changes Only re-authenticate with zenith_admin if authentication fails. Otherwise, directly return the error message. 
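For illustration, the check boils down to the following helper, condensed from the diff below (the helper name is made up; the real code inlines this match on the error returned by `Client::connect`):

```
use postgres::error::{Error, SqlState};

// Only authentication failures should trigger the zenith_admin fallback;
// any other connect error is returned to the caller unchanged.
fn should_retry_with_zenith_admin(e: &Error) -> bool {
    matches!(
        e.code(),
        Some(&SqlState::INVALID_PASSWORD) | Some(&SqlState::INVALID_AUTHORIZATION_SPECIFICATION)
    )
}
```
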
--------- Signed-off-by: Alex Chi Z --- compute_tools/src/compute.rs | 44 +++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index a82b999cfb..da271e49cd 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -17,6 +17,7 @@ use chrono::{DateTime, Utc}; use futures::future::join_all; use futures::stream::FuturesUnordered; use futures::StreamExt; +use postgres::error::SqlState; use postgres::{Client, NoTls}; use tracing::{debug, error, info, instrument, warn}; use utils::id::{TenantId, TimelineId}; @@ -774,27 +775,34 @@ impl ComputeNode { // but we can create a new one and grant it all privileges. let connstr = self.connstr.clone(); let mut client = match Client::connect(connstr.as_str(), NoTls) { - Err(e) => { - info!( - "cannot connect to postgres: {}, retrying with `zenith_admin` username", - e - ); - let mut zenith_admin_connstr = connstr.clone(); + Err(e) => match e.code() { + Some(&SqlState::INVALID_PASSWORD) + | Some(&SqlState::INVALID_AUTHORIZATION_SPECIFICATION) => { + // connect with zenith_admin if cloud_admin could not authenticate + info!( + "cannot connect to postgres: {}, retrying with `zenith_admin` username", + e + ); + let mut zenith_admin_connstr = connstr.clone(); - zenith_admin_connstr - .set_username("zenith_admin") - .map_err(|_| anyhow::anyhow!("invalid connstr"))?; + zenith_admin_connstr + .set_username("zenith_admin") + .map_err(|_| anyhow::anyhow!("invalid connstr"))?; - let mut client = Client::connect(zenith_admin_connstr.as_str(), NoTls)?; - // Disable forwarding so that users don't get a cloud_admin role - client.simple_query("SET neon.forward_ddl = false")?; - client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?; - client.simple_query("GRANT zenith_admin TO cloud_admin")?; - drop(client); + let mut client = + Client::connect(zenith_admin_connstr.as_str(), NoTls) + .context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?; + // Disable forwarding so that users don't get a cloud_admin role + client.simple_query("SET neon.forward_ddl = false")?; + client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?; + client.simple_query("GRANT zenith_admin TO cloud_admin")?; + drop(client); - // reconnect with connstring with expected name - Client::connect(connstr.as_str(), NoTls)? - } + // reconnect with connstring with expected name + Client::connect(connstr.as_str(), NoTls)? + } + _ => return Err(e.into()), + }, Ok(client) => client, }; From 3da410c8fee05b0cd65a5c0b83fffa3d5680cd77 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 5 Mar 2024 10:03:54 +0100 Subject: [PATCH 0335/1571] tokio-epoll-uring: use it on the layer-creating code paths (#6378) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit part of #6663 See that epic for more context & related commits. Problem ------- Before this PR, the layer-file-creating code paths were using VirtualFile, but under the hood these were still blocking system calls. Generally this meant we'd stall the executor thread, unless the caller "knew" and used the following pattern instead: ``` spawn_blocking(|| { Handle::block_on(async { VirtualFile::....().await; }) }).await ``` Solution -------- This PR adopts `tokio-epoll-uring` on the layer-file-creating code paths in pageserver. 
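As background for the API changes below: like tokio-uring, tokio-epoll-uring takes ownership of buffers for the duration of an operation and hands them back together with the result, which is why the write methods in this PR return `(buffer, Result<usize>)` pairs instead of borrowing `&[u8]`. A toy stand-in (no real IO, illustrative only) showing the shape of that calling convention:

```
// Toy illustration of the owned-buffer convention used by tokio-epoll-uring:
// the buffer is moved into the call (it must stay stable until completion)
// and is handed back to the caller together with the result.
async fn write_at_owned(buf: Vec<u8>, _offset: u64) -> (Vec<u8>, std::io::Result<usize>) {
    let n = buf.len(); // pretend the whole buffer was written
    (buf, Ok(n))
}
```
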
Note that on-demand downloads still use `tokio::fs`, these will be converted in a future PR. Design: Avoiding Regressions With `std-fs` ------------------------------------------ If we make the VirtualFile write path truly async using `tokio-epoll-uring`, should we then remove the `spawn_blocking` + `Handle::block_on` usage upstack in the same commit? No, because if we’re still using the `std-fs` io engine, we’d then block the executor in those places where previously we were protecting us from that through the `spawn_blocking` . So, if we want to see benefits from `tokio-epoll-uring` on the write path while also preserving the ability to switch between `tokio-epoll-uring` and `std-fs` , where `std-fs` will behave identical to what we have now, we need to ***conditionally* use `spawn_blocking + Handle::block_on`** . I.e., in the places where we use that know, we’ll need to make that conditional based on the currently configured io engine. It boils down to investigating all the places where we do `spawn_blocking(... block_on(... VirtualFile::...))`. Detailed [write-up of that investigation in Notion](https://neondatabase.notion.site/Surveying-VirtualFile-write-path-usage-wrt-tokio-epoll-uring-integration-spawn_blocking-Handle-bl-5dc2270dbb764db7b2e60803f375e015?pvs=4 ), made publicly accessible. tl;dr: Preceding PRs addressed the relevant call sites: - `metadata` file: turns out we could simply remove it (#6777, #6769, #6775) - `create_delta_layer()`: made sensitive to `virtual_file_io_engine` in #6986 NB: once we are switched over to `tokio-epoll-uring` everywhere in production, we can deprecate `std-fs`; to keep macOS support, we can use `tokio::fs` instead. That will remove this whole headache. Code Changes In This PR ----------------------- - VirtualFile API changes - `VirtualFile::write_at` - implement an `ioengine` operation and switch `VirtualFile::write_at` to it - `VirtualFile::metadata()` - curiously, we only use it from the layer writers' `finish()` methods - introduce a wrapper `Metadata` enum because `std::fs::Metadata` cannot be constructed by code outside rust std - `VirtualFile::sync_all()` and for completeness sake, add `VirtualFile::sync_data()` Testing & Rollout ----------------- Before merging this PR, we ran the CI with both io engines. Additionally, the changes will soak in staging. We could have a feature gate / add a new io engine `tokio-epoll-uring-write-path` to do a gradual rollout. However, that's not part of this PR. Future Work ----------- There's still some use of `std::fs` and/or `tokio::fs` for directory namespace operations, e.g. `std::fs::rename`. We're not addressing those in this PR, as we'll need to add the support in tokio-epoll-uring first. Note that rename itself is usually fast if the directory is in the kernel dentry cache, and only the fsync after rename is slow. These fsyncs are using tokio-epoll-uring, so, the impact should be small. 
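Condensed, the conditional `spawn_blocking` + `Handle::block_on` pattern described above looks roughly like this self-contained sketch (the in-tree version matches on `io_engine::IoEngine`, also covers the `NotSet` case, and keeps the tracing span):

```
use std::future::Future;
use tokio::runtime::Handle;

// Simplified stand-in for pageserver's io_engine::IoEngine (NotSet omitted).
enum IoEngine {
    StdFs,
    TokioEpollUring,
}

// With std-fs the future still issues blocking syscalls, so keep the old
// spawn_blocking + block_on protection; with tokio-epoll-uring the future
// is truly async and can run directly on the executor.
async fn run_on_io_engine<F, R>(engine: IoEngine, work: F) -> R
where
    F: Future<Output = R> + Send + 'static,
    R: Send + 'static,
{
    match engine {
        IoEngine::TokioEpollUring => work.await,
        IoEngine::StdFs => tokio::task::spawn_blocking(move || Handle::current().block_on(work))
            .await
            .expect("spawn_blocking task panicked"),
    }
}
```
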
--- pageserver/src/tenant/blob_io.rs | 14 ++- pageserver/src/tenant/storage_layer/layer.rs | 1 + pageserver/src/tenant/timeline.rs | 78 +++++++------- pageserver/src/virtual_file.rs | 105 ++++++++++++++----- pageserver/src/virtual_file/io_engine.rs | 96 +++++++++++++++-- pageserver/src/virtual_file/metadata.rs | 30 ++++++ 6 files changed, 246 insertions(+), 78 deletions(-) create mode 100644 pageserver/src/virtual_file/metadata.rs diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index ec70bdc679..0d33100ead 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -12,7 +12,7 @@ //! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX //! use bytes::{BufMut, BytesMut}; -use tokio_epoll_uring::{BoundedBuf, Slice}; +use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; use crate::context::RequestContext; use crate::page_cache::PAGE_SZ; @@ -127,7 +127,7 @@ impl BlobWriter { /// You need to make sure that the internal buffer is empty, otherwise /// data will be written in wrong order. #[inline(always)] - async fn write_all_unbuffered( + async fn write_all_unbuffered, Buf: IoBuf + Send>( &mut self, src_buf: B, ) -> (B::Buf, Result<(), Error>) { @@ -162,7 +162,10 @@ impl BlobWriter { } /// Internal, possibly buffered, write function - async fn write_all(&mut self, src_buf: B) -> (B::Buf, Result<(), Error>) { + async fn write_all, Buf: IoBuf + Send>( + &mut self, + src_buf: B, + ) -> (B::Buf, Result<(), Error>) { if !BUFFERED { assert!(self.buf.is_empty()); return self.write_all_unbuffered(src_buf).await; @@ -210,7 +213,10 @@ impl BlobWriter { /// Write a blob of data. Returns the offset that it was written to, /// which can be used to retrieve the data later. - pub async fn write_blob(&mut self, srcbuf: B) -> (B::Buf, Result) { + pub async fn write_blob, Buf: IoBuf + Send>( + &mut self, + srcbuf: B, + ) -> (B::Buf, Result) { let offset = self.offset; let len = srcbuf.bytes_init(); diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 247dd1a8e4..e14a2f22cf 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -195,6 +195,7 @@ impl Layer { let downloaded = resident.expect("just initialized"); // if the rename works, the path is as expected + // TODO: sync system call std::fs::rename(temp_path, owner.local_path()) .with_context(|| format!("rename temporary file as correct path for {owner}"))?; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 64c324a5c8..1f811155f6 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3410,44 +3410,48 @@ impl Timeline { frozen_layer: &Arc, ctx: &RequestContext, ) -> anyhow::Result { - let span = tracing::info_span!("blocking"); - let new_delta: ResidentLayer = tokio::task::spawn_blocking({ - let self_clone = Arc::clone(self); - let frozen_layer = Arc::clone(frozen_layer); - let ctx = ctx.attached_child(); - move || { - Handle::current().block_on( - async move { - let new_delta = frozen_layer.write_to_disk(&self_clone, &ctx).await?; - // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes. - // We just need to fsync the directory in which these inodes are linked, - // which we know to be the timeline directory. 
- // - // We use fatal_err() below because the after write_to_disk returns with success, - // the in-memory state of the filesystem already has the layer file in its final place, - // and subsequent pageserver code could think it's durable while it really isn't. - let timeline_dir = - VirtualFile::open(&self_clone.conf.timeline_path( - &self_clone.tenant_shard_id, - &self_clone.timeline_id, - )) - .await - .fatal_err("VirtualFile::open for timeline dir fsync"); - timeline_dir - .sync_all() - .await - .fatal_err("VirtualFile::sync_all timeline dir"); - anyhow::Ok(new_delta) - } - .instrument(span), - ) + let self_clone = Arc::clone(self); + let frozen_layer = Arc::clone(frozen_layer); + let ctx = ctx.attached_child(); + let work = async move { + let new_delta = frozen_layer.write_to_disk(&self_clone, &ctx).await?; + // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes. + // We just need to fsync the directory in which these inodes are linked, + // which we know to be the timeline directory. + // + // We use fatal_err() below because the after write_to_disk returns with success, + // the in-memory state of the filesystem already has the layer file in its final place, + // and subsequent pageserver code could think it's durable while it really isn't. + let timeline_dir = VirtualFile::open( + &self_clone + .conf + .timeline_path(&self_clone.tenant_shard_id, &self_clone.timeline_id), + ) + .await + .fatal_err("VirtualFile::open for timeline dir fsync"); + timeline_dir + .sync_all() + .await + .fatal_err("VirtualFile::sync_all timeline dir"); + anyhow::Ok(new_delta) + }; + // Before tokio-epoll-uring, we ran write_to_disk & the sync_all inside spawn_blocking. + // Preserve that behavior to maintain the same behavior for `virtual_file_io_engine=std-fs`. + use crate::virtual_file::io_engine::IoEngine; + match crate::virtual_file::io_engine::get() { + IoEngine::NotSet => panic!("io engine not set"), + IoEngine::StdFs => { + let span = tracing::info_span!("blocking"); + tokio::task::spawn_blocking({ + move || Handle::current().block_on(work.instrument(span)) + }) + .await + .context("spawn_blocking") + .and_then(|x| x) } - }) - .await - .context("spawn_blocking") - .and_then(|x| x)?; - - Ok(new_delta) + #[cfg(target_os = "linux")] + IoEngine::TokioEpollUring => work.await, + } } async fn repartition( diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index b7112108f2..6d4774cf75 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -17,20 +17,21 @@ use crate::tenant::TENANTS_SEGMENT_NAME; use camino::{Utf8Path, Utf8PathBuf}; use once_cell::sync::OnceCell; use pageserver_api::shard::TenantShardId; -use std::fs::{self, File}; +use std::fs::File; use std::io::{Error, ErrorKind, Seek, SeekFrom}; use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice}; use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd}; -use std::os::unix::fs::FileExt; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; use tokio::time::Instant; pub use pageserver_api::models::virtual_file as api; pub(crate) mod io_engine; +mod metadata; mod open_options; pub(crate) use io_engine::IoEngineKind; +pub(crate) use metadata::Metadata; pub(crate) use open_options::*; /// @@ -435,13 +436,25 @@ impl VirtualFile { /// Call File::sync_all() on the underlying File. 
pub async fn sync_all(&self) -> Result<(), Error> { - with_file!(self, StorageIoOperation::Fsync, |file_guard| file_guard - .with_std_file(|std_file| std_file.sync_all())) + with_file!(self, StorageIoOperation::Fsync, |file_guard| { + let (_file_guard, res) = io_engine::get().sync_all(file_guard).await; + res + }) } - pub async fn metadata(&self) -> Result { - with_file!(self, StorageIoOperation::Metadata, |file_guard| file_guard - .with_std_file(|std_file| std_file.metadata())) + /// Call File::sync_data() on the underlying File. + pub async fn sync_data(&self) -> Result<(), Error> { + with_file!(self, StorageIoOperation::Fsync, |file_guard| { + let (_file_guard, res) = io_engine::get().sync_data(file_guard).await; + res + }) + } + + pub async fn metadata(&self) -> Result { + with_file!(self, StorageIoOperation::Metadata, |file_guard| { + let (_file_guard, res) = io_engine::get().metadata(file_guard).await; + res + }) } /// Helper function internal to `VirtualFile` that looks up the underlying File, @@ -579,7 +592,7 @@ impl VirtualFile { } // Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#219-235 - pub async fn write_all_at( + pub async fn write_all_at, Buf: IoBuf + Send>( &self, buf: B, mut offset: u64, @@ -590,8 +603,9 @@ impl VirtualFile { } let mut buf = buf.slice(0..buf_len); while !buf.is_empty() { - // TODO: push `buf` further down - match self.write_at(&buf, offset).await { + let res; + (buf, res) = self.write_at(buf, offset).await; + match res { Ok(0) => { return ( Slice::into_inner(buf), @@ -605,7 +619,7 @@ impl VirtualFile { buf = buf.slice(n..); offset += n as u64; } - Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} + Err(e) if e.kind() == std::io::ErrorKind::Interrupted => {} Err(e) => return (Slice::into_inner(buf), Err(e)), } } @@ -616,15 +630,19 @@ impl VirtualFile { /// Returns the IoBuf that is underlying the BoundedBuf `buf`. /// I.e., the returned value's `bytes_init()` method returns something different than the `bytes_init()` that was passed in. /// It's quite brittle and easy to mis-use, so, we return the size in the Ok() variant. 
- pub async fn write_all(&mut self, buf: B) -> (B::Buf, Result) { + pub async fn write_all, Buf: IoBuf + Send>( + &mut self, + buf: B, + ) -> (B::Buf, Result) { let nbytes = buf.bytes_init(); if nbytes == 0 { return (Slice::into_inner(buf.slice_full()), Ok(0)); } let mut buf = buf.slice(0..nbytes); while !buf.is_empty() { - // TODO: push `Slice` further down - match self.write(&buf).await { + let res; + (buf, res) = self.write(buf).await; + match res { Ok(0) => { return ( Slice::into_inner(buf), @@ -644,11 +662,18 @@ impl VirtualFile { (Slice::into_inner(buf), Ok(nbytes)) } - async fn write(&mut self, buf: &[u8]) -> Result { + async fn write( + &mut self, + buf: Slice, + ) -> (Slice, Result) { let pos = self.pos; - let n = self.write_at(buf, pos).await?; + let (buf, res) = self.write_at(buf, pos).await; + let n = match res { + Ok(n) => n, + Err(e) => return (buf, Err(e)), + }; self.pos += n as u64; - Ok(n) + (buf, Ok(n)) } pub(crate) async fn read_at(&self, buf: B, offset: u64) -> (B, Result) @@ -676,16 +701,30 @@ impl VirtualFile { }) } - async fn write_at(&self, buf: &[u8], offset: u64) -> Result { - let result = with_file!(self, StorageIoOperation::Write, |file_guard| { - file_guard.with_std_file(|std_file| std_file.write_at(buf, offset)) - }); - if let Ok(size) = result { - STORAGE_IO_SIZE - .with_label_values(&["write", &self.tenant_id, &self.shard_id, &self.timeline_id]) - .add(size as i64); - } - result + async fn write_at( + &self, + buf: Slice, + offset: u64, + ) -> (Slice, Result) { + let file_guard = match self.lock_file().await { + Ok(file_guard) => file_guard, + Err(e) => return (buf, Err(e)), + }; + observe_duration!(StorageIoOperation::Write, { + let ((_file_guard, buf), result) = + io_engine::get().write_at(file_guard, offset, buf).await; + if let Ok(size) = result { + STORAGE_IO_SIZE + .with_label_values(&[ + "write", + &self.tenant_id, + &self.shard_id, + &self.timeline_id, + ]) + .add(size as i64); + } + (buf, result) + }) } } @@ -1083,6 +1122,7 @@ mod tests { use rand::Rng; use std::future::Future; use std::io::Write; + use std::os::unix::fs::FileExt; use std::sync::Arc; enum MaybeVirtualFile { @@ -1103,7 +1143,11 @@ mod tests { MaybeVirtualFile::File(file) => file.read_exact_at(&mut buf, offset).map(|()| buf), } } - async fn write_all_at(&self, buf: B, offset: u64) -> Result<(), Error> { + async fn write_all_at, Buf: IoBuf + Send>( + &self, + buf: B, + offset: u64, + ) -> Result<(), Error> { match self { MaybeVirtualFile::VirtualFile(file) => { let (_buf, res) = file.write_all_at(buf, offset).await; @@ -1124,7 +1168,10 @@ mod tests { MaybeVirtualFile::File(file) => file.seek(pos), } } - async fn write_all(&mut self, buf: B) -> Result<(), Error> { + async fn write_all, Buf: IoBuf + Send>( + &mut self, + buf: B, + ) -> Result<(), Error> { match self { MaybeVirtualFile::VirtualFile(file) => { let (_buf, res) = file.write_all(buf).await; diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index 892affa326..1a8cd9f562 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -7,6 +7,8 @@ //! //! Then use [`get`] and [`super::OpenOptions`]. +use tokio_epoll_uring::{IoBuf, Slice}; + pub(crate) use super::api::IoEngineKind; #[derive(Clone, Copy)] #[repr(u8)] @@ -61,7 +63,8 @@ pub(super) fn init(engine_kind: IoEngineKind) { set(engine_kind); } -pub(super) fn get() -> IoEngine { +/// Longer-term, this API should only be used by [`super::VirtualFile`]. 
+pub(crate) fn get() -> IoEngine { let cur = IoEngine::try_from(IO_ENGINE.load(Ordering::Relaxed)).unwrap(); if cfg!(test) { let env_var_name = "NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE"; @@ -98,7 +101,17 @@ use std::{ sync::atomic::{AtomicU8, Ordering}, }; -use super::FileGuard; +use super::{FileGuard, Metadata}; + +#[cfg(target_os = "linux")] +fn epoll_uring_error_to_std(e: tokio_epoll_uring::Error) -> std::io::Error { + match e { + tokio_epoll_uring::Error::Op(e) => e, + tokio_epoll_uring::Error::System(system) => { + std::io::Error::new(std::io::ErrorKind::Other, system) + } + } +} impl IoEngine { pub(super) async fn read_at( @@ -133,16 +146,83 @@ impl IoEngine { IoEngine::TokioEpollUring => { let system = tokio_epoll_uring::thread_local_system().await; let (resources, res) = system.read(file_guard, offset, buf).await; + (resources, res.map_err(epoll_uring_error_to_std)) + } + } + } + pub(super) async fn sync_all(&self, file_guard: FileGuard) -> (FileGuard, std::io::Result<()>) { + match self { + IoEngine::NotSet => panic!("not initialized"), + IoEngine::StdFs => { + let res = file_guard.with_std_file(|std_file| std_file.sync_all()); + (file_guard, res) + } + #[cfg(target_os = "linux")] + IoEngine::TokioEpollUring => { + let system = tokio_epoll_uring::thread_local_system().await; + let (resources, res) = system.fsync(file_guard).await; + (resources, res.map_err(epoll_uring_error_to_std)) + } + } + } + pub(super) async fn sync_data( + &self, + file_guard: FileGuard, + ) -> (FileGuard, std::io::Result<()>) { + match self { + IoEngine::NotSet => panic!("not initialized"), + IoEngine::StdFs => { + let res = file_guard.with_std_file(|std_file| std_file.sync_data()); + (file_guard, res) + } + #[cfg(target_os = "linux")] + IoEngine::TokioEpollUring => { + let system = tokio_epoll_uring::thread_local_system().await; + let (resources, res) = system.fdatasync(file_guard).await; + (resources, res.map_err(epoll_uring_error_to_std)) + } + } + } + pub(super) async fn metadata( + &self, + file_guard: FileGuard, + ) -> (FileGuard, std::io::Result) { + match self { + IoEngine::NotSet => panic!("not initialized"), + IoEngine::StdFs => { + let res = + file_guard.with_std_file(|std_file| std_file.metadata().map(Metadata::from)); + (file_guard, res) + } + #[cfg(target_os = "linux")] + IoEngine::TokioEpollUring => { + let system = tokio_epoll_uring::thread_local_system().await; + let (resources, res) = system.statx(file_guard).await; ( resources, - res.map_err(|e| match e { - tokio_epoll_uring::Error::Op(e) => e, - tokio_epoll_uring::Error::System(system) => { - std::io::Error::new(std::io::ErrorKind::Other, system) - } - }), + res.map_err(epoll_uring_error_to_std).map(Metadata::from), ) } } } + pub(super) async fn write_at( + &self, + file_guard: FileGuard, + offset: u64, + buf: Slice, + ) -> ((FileGuard, Slice), std::io::Result) { + match self { + IoEngine::NotSet => panic!("not initialized"), + IoEngine::StdFs => { + let result = file_guard.with_std_file(|std_file| std_file.write_at(&buf, offset)); + ((file_guard, buf), result) + } + #[cfg(target_os = "linux")] + IoEngine::TokioEpollUring => { + let system = tokio_epoll_uring::thread_local_system().await; + let (resources, res) = system.write(file_guard, offset, buf).await; + (resources, res.map_err(epoll_uring_error_to_std)) + } + } + } } diff --git a/pageserver/src/virtual_file/metadata.rs b/pageserver/src/virtual_file/metadata.rs new file mode 100644 index 0000000000..f530c50988 --- /dev/null +++ b/pageserver/src/virtual_file/metadata.rs @@ 
-0,0 +1,30 @@ +use std::fs; + +pub enum Metadata { + StdFs(fs::Metadata), + #[cfg(target_os = "linux")] + TokioEpollUring(Box), +} + +#[cfg(target_os = "linux")] +impl From> for Metadata { + fn from(value: Box) -> Self { + Metadata::TokioEpollUring(value) + } +} + +impl From for Metadata { + fn from(value: std::fs::Metadata) -> Self { + Metadata::StdFs(value) + } +} + +impl Metadata { + pub fn len(&self) -> u64 { + match self { + Metadata::StdFs(metadata) => metadata.len(), + #[cfg(target_os = "linux")] + Metadata::TokioEpollUring(statx) => statx.stx_size, + } + } +} From 752bf5a22f8b53a163102820d845c87bf848cb55 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 5 Mar 2024 12:14:37 +0200 Subject: [PATCH 0336/1571] build: clippy disallow futures::pin_mut macro (#7016) `std` has had `pin!` macro for some time, there is no need for us to use the older alternatives. Cannot disallow `tokio::pin` because tokio macros use that. --- clippy.toml | 7 +++++++ control_plane/src/pageserver.rs | 2 +- libs/postgres_backend/src/lib.rs | 4 +--- proxy/src/serverless/sql_over_http.rs | 4 +--- s3_scrubber/src/checks.rs | 5 ++--- s3_scrubber/src/garbage.rs | 14 +++++++------- s3_scrubber/src/scan_metadata.rs | 5 ++--- safekeeper/src/wal_service.rs | 2 +- 8 files changed, 22 insertions(+), 21 deletions(-) diff --git a/clippy.toml b/clippy.toml index d788afc84d..5f7dc66152 100644 --- a/clippy.toml +++ b/clippy.toml @@ -3,3 +3,10 @@ disallowed-methods = [ # Allow this for now, to deny it later once we stop using Handle::block_on completely # "tokio::runtime::Handle::block_on", ] + +disallowed-macros = [ + # use std::pin::pin + "futures::pin_mut", + # cannot disallow this, because clippy finds used from tokio macros + #"tokio::pin", +] diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 642f153f2d..7d0c07a938 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -605,7 +605,7 @@ impl PageServerNode { eprintln!("connection error: {}", e); } }); - tokio::pin!(client); + let client = std::pin::pin!(client); // Init base reader let (start_lsn, base_tarfile_path) = base; diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 73d25619c3..260018ad89 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -6,7 +6,6 @@ #![deny(clippy::undocumented_unsafe_blocks)] use anyhow::Context; use bytes::Bytes; -use futures::pin_mut; use serde::{Deserialize, Serialize}; use std::io::ErrorKind; use std::net::SocketAddr; @@ -378,8 +377,7 @@ impl PostgresBackend { &mut self, cx: &mut std::task::Context<'_>, ) -> Poll> { - let flush_fut = self.flush(); - pin_mut!(flush_fut); + let flush_fut = std::pin::pin!(self.flush()); flush_fut.poll(cx) } diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 7f51ba82cc..74af985211 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -1,7 +1,6 @@ use std::sync::Arc; use anyhow::bail; -use futures::pin_mut; use futures::StreamExt; use hyper::body::HttpBody; use hyper::header; @@ -531,13 +530,12 @@ async fn query_to_json( ) -> anyhow::Result<(ReadyForQueryStatus, Value)> { info!("executing query"); let query_params = data.params; - let row_stream = client.query_raw_txt(&data.query, query_params).await?; + let mut row_stream = std::pin::pin!(client.query_raw_txt(&data.query, query_params).await?); info!("finished executing query"); // Manually drain the stream into a vector to leave 
row_stream hanging // around to get a command tag. Also check that the response is not too // big. - pin_mut!(row_stream); let mut rows: Vec = Vec::new(); while let Some(row) = row_stream.next().await { let row = row?; diff --git a/s3_scrubber/src/checks.rs b/s3_scrubber/src/checks.rs index 7b9f96dce3..7c0f699958 100644 --- a/s3_scrubber/src/checks.rs +++ b/s3_scrubber/src/checks.rs @@ -11,7 +11,7 @@ use utils::id::TimelineId; use crate::cloud_admin_api::BranchData; use crate::metadata_stream::stream_listing; use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId}; -use futures_util::{pin_mut, StreamExt}; +use futures_util::StreamExt; use pageserver::tenant::remote_timeline_client::parse_remote_index_path; use pageserver::tenant::storage_layer::LayerFileName; use pageserver::tenant::IndexPart; @@ -285,8 +285,7 @@ pub(crate) async fn list_timeline_blobs( let mut index_parts: Vec = Vec::new(); let mut initdb_archive: bool = false; - let stream = stream_listing(s3_client, &timeline_dir_target); - pin_mut!(stream); + let mut stream = std::pin::pin!(stream_listing(s3_client, &timeline_dir_target)); while let Some(obj) = stream.next().await { let obj = obj?; let key = obj.key(); diff --git a/s3_scrubber/src/garbage.rs b/s3_scrubber/src/garbage.rs index 93bb115883..7a08dffc66 100644 --- a/s3_scrubber/src/garbage.rs +++ b/s3_scrubber/src/garbage.rs @@ -12,7 +12,7 @@ use aws_sdk_s3::{ types::{Delete, ObjectIdentifier}, Client, }; -use futures_util::{pin_mut, TryStreamExt}; +use futures_util::TryStreamExt; use pageserver_api::shard::TenantShardId; use serde::{Deserialize, Serialize}; use tokio_stream::StreamExt; @@ -199,12 +199,12 @@ async fn find_garbage_inner( } } }); - let tenants_checked = tenants_checked.try_buffer_unordered(CONSOLE_CONCURRENCY); + let mut tenants_checked = + std::pin::pin!(tenants_checked.try_buffer_unordered(CONSOLE_CONCURRENCY)); // Process the results of Tenant checks. If a Tenant is garbage, it goes into // the `GarbageList`. Else it goes into `active_tenants` for more detailed timeline // checks if they are enabled by the `depth` parameter. - pin_mut!(tenants_checked); let mut garbage = GarbageList::new(node_kind, bucket_config); let mut active_tenants: Vec = vec![]; let mut counter = 0; @@ -267,10 +267,10 @@ async fn find_garbage_inner( .map(|r| (ttid, r)) } }); - let timelines_checked = timelines_checked.try_buffer_unordered(CONSOLE_CONCURRENCY); + let mut timelines_checked = + std::pin::pin!(timelines_checked.try_buffer_unordered(CONSOLE_CONCURRENCY)); // Update the GarbageList with any timelines which appear not to exist. 
- pin_mut!(timelines_checked); while let Some(result) = timelines_checked.next().await { let (ttid, console_result) = result?; if garbage.maybe_append(GarbageEntity::Timeline(ttid), console_result) { @@ -425,9 +425,9 @@ pub async fn purge_garbage( } } }); - let get_objects_results = get_objects_results.try_buffer_unordered(S3_CONCURRENCY); + let mut get_objects_results = + std::pin::pin!(get_objects_results.try_buffer_unordered(S3_CONCURRENCY)); - pin_mut!(get_objects_results); let mut objects_to_delete = Vec::new(); while let Some(result) = get_objects_results.next().await { let mut object_list = result?; diff --git a/s3_scrubber/src/scan_metadata.rs b/s3_scrubber/src/scan_metadata.rs index 4b63bb3884..6ff9783875 100644 --- a/s3_scrubber/src/scan_metadata.rs +++ b/s3_scrubber/src/scan_metadata.rs @@ -7,7 +7,7 @@ use crate::checks::{ use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId}; use aws_sdk_s3::Client; -use futures_util::{pin_mut, StreamExt, TryStreamExt}; +use futures_util::{StreamExt, TryStreamExt}; use histogram::Histogram; use pageserver::tenant::remote_timeline_client::remote_layer_path; use pageserver::tenant::IndexPart; @@ -226,7 +226,7 @@ pub async fn scan_metadata( Ok((ttid, data)) } let timelines = timelines.map_ok(|ttid| report_on_timeline(&s3_client, &target, ttid)); - let timelines = timelines.try_buffered(CONCURRENCY); + let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY)); // We must gather all the TenantShardTimelineId->S3TimelineBlobData for each tenant, because different // shards in the same tenant might refer to one anothers' keys if a shard split has happened. @@ -309,7 +309,6 @@ pub async fn scan_metadata( // all results for the same tenant will be adjacent. We accumulate these, // and then call `analyze_tenant` to flush, when we see the next tenant ID. let mut summary = MetadataSummary::new(); - pin_mut!(timelines); while let Some(i) = timelines.next().await { let (ttid, data) = i?; summary.update_data(&data); diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index bceaad1e16..4a97eb3993 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -68,7 +68,7 @@ async fn handle_socket( // is not Unpin, and all pgbackend/framed/tokio dependencies require stream // to be Unpin. Which is reasonable, as indeed something like TimeoutReader // shouldn't be moved. - tokio::pin!(socket); + let socket = std::pin::pin!(socket); let traffic_metrics = TrafficMetrics::new(); if let Some(current_az) = conf.availability_zone.as_deref() { From f3e4f85e65a9b6fa23a28893676d341a909bae51 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 5 Mar 2024 12:09:13 +0100 Subject: [PATCH 0337/1571] layer file download: final rename: fix durability (#6991) Before this PR, the layer file download code would fsync the inode after rename instead of the timeline directory. That is not in line with what a comment further up says we're doing, and it's obviously not achieving the goal of making the rename durable. 
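Put differently: a rename only becomes durable once the directory that now holds the entry is fsynced; fsyncing the renamed file's own inode persists its contents but not the new name. A minimal blocking sketch of the intended ordering (illustrative names; the patch performs the directory fsync asynchronously through `VirtualFile`):

```
use std::{fs, io, path::Path};

// Sketch of the download finalization order (names are illustrative only).
fn finalize_download(tmp_path: &Path, final_path: &Path, timeline_dir: &Path) -> io::Result<()> {
    // 1. The temp file's contents were already fsynced while it was open.
    // 2. Move it to its final name.
    fs::rename(tmp_path, final_path)?;
    // 3. The new directory entry lives in the timeline directory, so that is
    //    the inode that must be fsynced to make the rename durable.
    let dir = fs::File::open(timeline_dir)?;
    dir.sync_all()
}
```
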
part of https://github.com/neondatabase/neon/issues/6663 --- .../tenant/remote_timeline_client/download.rs | 28 +++++++++++++------ pageserver/src/virtual_file/io_engine.rs | 26 +++++++++++++++++ 2 files changed, 45 insertions(+), 9 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 167e18a829..6fff6e78e2 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -14,14 +14,14 @@ use tokio::io::{AsyncSeekExt, AsyncWriteExt}; use tokio_util::io::StreamReader; use tokio_util::sync::CancellationToken; use tracing::warn; -use utils::{backoff, crashsafe}; +use utils::backoff; use crate::config::PageServerConf; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; use crate::tenant::storage_layer::LayerFileName; use crate::tenant::Generation; -use crate::virtual_file::on_fatal_io_error; +use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}; use crate::TEMP_FILE_SUFFIX; use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode}; use utils::crashsafe::path_with_suffix_extension; @@ -50,9 +50,8 @@ pub async fn download_layer_file<'a>( ) -> Result { debug_assert_current_span_has_tenant_and_timeline_id(); - let local_path = conf - .timeline_path(&tenant_shard_id, &timeline_id) - .join(layer_file_name.file_name()); + let timeline_path = conf.timeline_path(&tenant_shard_id, &timeline_id); + let local_path = timeline_path.join(layer_file_name.file_name()); let remote_path = remote_layer_path( &tenant_shard_id.tenant_id, @@ -149,10 +148,21 @@ pub async fn download_layer_file<'a>( .with_context(|| format!("rename download layer file to {local_path}")) .map_err(DownloadError::Other)?; - crashsafe::fsync_async(&local_path) - .await - .with_context(|| format!("fsync layer file {local_path}")) - .map_err(DownloadError::Other)?; + // We use fatal_err() below because the after the rename above, + // the in-memory state of the filesystem already has the layer file in its final place, + // and subsequent pageserver code could think it's durable while it really isn't. + let work = async move { + let timeline_dir = VirtualFile::open(&timeline_path) + .await + .fatal_err("VirtualFile::open for timeline dir fsync"); + timeline_dir + .sync_all() + .await + .fatal_err("VirtualFile::sync_all timeline dir"); + }; + crate::virtual_file::io_engine::get() + .spawn_blocking_and_block_on_if_std(work) + .await; tracing::debug!("download complete: {local_path}"); diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index 1a8cd9f562..5fef826477 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -8,6 +8,7 @@ //! Then use [`get`] and [`super::OpenOptions`]. use tokio_epoll_uring::{IoBuf, Slice}; +use tracing::Instrument; pub(crate) use super::api::IoEngineKind; #[derive(Clone, Copy)] @@ -225,4 +226,29 @@ impl IoEngine { } } } + + /// If we switch a user of [`tokio::fs`] to use [`super::io_engine`], + /// they'd start blocking the executor thread if [`IoEngine::StdFs`] is configured + /// whereas before the switch to [`super::io_engine`], that wasn't the case. + /// This method helps avoid such a regression. + /// + /// Panics if the `spawn_blocking` fails, see [`tokio::task::JoinError`] for reasons why that can happen. 
+ pub(crate) async fn spawn_blocking_and_block_on_if_std(&self, work: Fut) -> R + where + Fut: 'static + Send + std::future::Future, + R: 'static + Send, + { + match self { + IoEngine::NotSet => panic!("not initialized"), + IoEngine::StdFs => { + let span = tracing::info_span!("spawn_blocking_block_on_if_std"); + tokio::task::spawn_blocking({ + move || tokio::runtime::Handle::current().block_on(work.instrument(span)) + }) + .await + .expect("failed to join blocking code most likely it panicked, panicking as well") + } + IoEngine::TokioEpollUring => work.await, + } + } } From ae8468f97e4783474940a568379bbac6c70a29c9 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 5 Mar 2024 13:30:43 +0000 Subject: [PATCH 0338/1571] pageserver: fix AUX key vectored get validation (#7018) ## Problem The value reconstruct of AUX_FILES_KEY from records is not deterministic since it uses a hash map under the hood. This caused vectored get validation failures when enabled in staging. ## Summary of changes Deserialise AUX_FILES_KEY blobs comparing. All other keys should reconstruct deterministically, so we simply compare the blobs. --- pageserver/src/pgdatadir_mapping.rs | 2 +- pageserver/src/tenant/timeline.rs | 41 +++++++++++++++++++++++++++-- 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 7be08f86b1..628aeb5a28 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1677,7 +1677,7 @@ struct RelDirectory { rels: HashSet<(Oid, u8)>, } -#[derive(Debug, Serialize, Deserialize, Default)] +#[derive(Debug, Serialize, Deserialize, Default, PartialEq)] pub(crate) struct AuxFilesDirectory { pub(crate) files: HashMap, } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 1f811155f6..309ec2e829 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -17,6 +17,7 @@ use futures::stream::StreamExt; use itertools::Itertools; use once_cell::sync::Lazy; use pageserver_api::{ + key::AUX_FILES_KEY, keyspace::KeySpaceAccum, models::{ CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, @@ -891,8 +892,7 @@ impl Timeline { assert_eq!(seq_key, vec_key); match (seq_res, vec_res) { (Ok(seq_blob), Ok(vec_blob)) => { - assert_eq!(seq_blob, vec_blob, - "Image mismatch for key {seq_key} - keyspace={keyspace:?} lsn={lsn}"); + Self::validate_key_equivalence(seq_key, &keyspace, lsn, seq_blob, vec_blob); }, (Err(err), Ok(_)) => { panic!( @@ -911,6 +911,43 @@ impl Timeline { } } + fn validate_key_equivalence( + key: &Key, + keyspace: &KeySpace, + lsn: Lsn, + seq: &Bytes, + vec: &Bytes, + ) { + use utils::bin_ser::BeSer; + + if *key == AUX_FILES_KEY { + // The value reconstruct of AUX_FILES_KEY from records is not deterministic + // since it uses a hash map under the hood. Hence, deserialise both results + // before comparing. + let seq_aux_dir_res = AuxFilesDirectory::des(seq); + let vec_aux_dir_res = AuxFilesDirectory::des(vec); + match (&seq_aux_dir_res, &vec_aux_dir_res) { + (Ok(seq_aux_dir), Ok(vec_aux_dir)) => { + assert_eq!( + seq_aux_dir, vec_aux_dir, + "Mismatch for key {} - keyspace={:?} lsn={}", + key, keyspace, lsn + ); + } + (Err(_), Err(_)) => {} + _ => { + panic!("Mismatch for {key}: {seq_aux_dir_res:?} != {vec_aux_dir_res:?}"); + } + } + } else { + // All other keys should reconstruct deterministically, so we simply compare the blobs. 
+ assert_eq!( + seq, vec, + "Image mismatch for key {key} - keyspace={keyspace:?} lsn={lsn}" + ); + } + } + /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev. pub(crate) fn get_last_record_lsn(&self) -> Lsn { self.last_record_lsn.load().last From 9dec65b75b5262c63d89ecaaf85a2dfb4d5e84f1 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 5 Mar 2024 13:35:45 +0000 Subject: [PATCH 0339/1571] pageserver: fix vectored read path delta layer index traversal (#7001) ## Problem Last weeks enablement of vectored get generated a number of panics. From them, I diagnosed two issues in the delta layer index traversal logic 1. The `key >= range.start && lsn >= lsn_range.start` was too aggressive. Lsns are not monotonically increasing in the delta layer index (keys are though), so we cannot assert on them. 2. Lsns greater or equal to `lsn_range.end` were not skipped. This caused the query to consider records newer than the request Lsn. ## Summary of changes * Fix the issues mentioned above inline * Refactor the layer traversal logic to make it unit testable * Add unit test which reproduces the failure modes listed above. --- pageserver/src/tenant/disk_btree.rs | 95 ++++++- .../src/tenant/storage_layer/delta_layer.rs | 257 ++++++++++++++---- .../src/tenant/storage_layer/image_layer.rs | 44 +-- pageserver/src/tenant/vectored_blob_io.rs | 12 +- 4 files changed, 322 insertions(+), 86 deletions(-) diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index ca30b0ac4f..6d85d1e60e 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -18,10 +18,19 @@ //! - An Iterator interface would be more convenient for the callers than the //! 'visit' function //! +use async_stream::try_stream; use byteorder::{ReadBytesExt, BE}; use bytes::{BufMut, Bytes, BytesMut}; use either::Either; -use std::{cmp::Ordering, io, result}; +use futures::Stream; +use hex; +use std::{ + cmp::Ordering, + io, + iter::Rev, + ops::{Range, RangeInclusive}, + result, +}; use thiserror::Error; use tracing::error; @@ -250,6 +259,90 @@ where Ok(result) } + /// Return a stream which yields all key, value pairs from the index + /// starting from the first key greater or equal to `start_key`. + /// + /// Note that this is a copy of [`Self::visit`]. + /// TODO: Once the sequential read path is removed this will become + /// the only index traversal method. + pub fn get_stream_from<'a>( + &'a self, + start_key: &'a [u8; L], + ctx: &'a RequestContext, + ) -> impl Stream, u64), DiskBtreeError>> + 'a { + try_stream! { + let mut stack = Vec::new(); + stack.push((self.root_blk, None)); + let block_cursor = self.reader.block_cursor(); + while let Some((node_blknum, opt_iter)) = stack.pop() { + // Locate the node. 
+ let node_buf = block_cursor + .read_blk(self.start_blk + node_blknum, ctx) + .await?; + + let node = OnDiskNode::deparse(node_buf.as_ref())?; + let prefix_len = node.prefix_len as usize; + let suffix_len = node.suffix_len as usize; + + assert!(node.num_children > 0); + + let mut keybuf = Vec::new(); + keybuf.extend(node.prefix); + keybuf.resize(prefix_len + suffix_len, 0); + + let mut iter: Either, Rev>> = if let Some(iter) = opt_iter { + iter + } else { + // Locate the first match + let idx = match node.binary_search(start_key, keybuf.as_mut_slice()) { + Ok(idx) => idx, + Err(idx) => { + if node.level == 0 { + // Imagine that the node contains the following keys: + // + // 1 + // 3 <-- idx + // 5 + // + // If the search key is '2' and there is exact match, + // the binary search would return the index of key + // '3'. That's cool, '3' is the first key to return. + idx + } else { + // This is an internal page, so each key represents a lower + // bound for what's in the child page. If there is no exact + // match, we have to return the *previous* entry. + // + // 1 <-- return this + // 3 <-- idx + // 5 + idx.saturating_sub(1) + } + } + }; + Either::Left(idx..node.num_children.into()) + }; + + // idx points to the first match now. Keep going from there + while let Some(idx) = iter.next() { + let key_off = idx * suffix_len; + let suffix = &node.keys[key_off..key_off + suffix_len]; + keybuf[prefix_len..].copy_from_slice(suffix); + let value = node.value(idx); + #[allow(clippy::collapsible_if)] + if node.level == 0 { + // leaf + yield (keybuf.clone(), value.to_u64()); + } else { + stack.push((node_blknum, Some(iter))); + stack.push((value.to_blknum(), None)); + break; + } + } + } + } + } + /// /// Scan the tree, starting from 'search_key', in the given direction. 
'visitor' /// will be called for every key >= 'search_key' (or <= 'search_key', if scanning diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 5eaf1cc1ce..b7132ee3bf 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -46,6 +46,7 @@ use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; use anyhow::{anyhow, bail, ensure, Context, Result}; use bytes::BytesMut; use camino::{Utf8Path, Utf8PathBuf}; +use futures::StreamExt; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::LayerAccessKind; use pageserver_api::shard::TenantShardId; @@ -847,10 +848,33 @@ impl DeltaLayerInner { reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result<(), GetVectoredError> { - let reads = self - .plan_reads(keyspace, lsn_range, reconstruct_state, ctx) - .await - .map_err(GetVectoredError::Other)?; + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + self.index_start_blk, + self.index_root_blk, + block_reader, + ); + + let planner = VectoredReadPlanner::new( + self.max_vectored_read_bytes + .expect("Layer is loaded with max vectored bytes config") + .0 + .into(), + ); + + let data_end_offset = self.index_start_blk as u64 * PAGE_SZ as u64; + + let reads = Self::plan_reads( + keyspace, + lsn_range, + data_end_offset, + index_reader, + planner, + reconstruct_state, + ctx, + ) + .await + .map_err(GetVectoredError::Other)?; self.do_reads_and_update_state(reads, reconstruct_state) .await; @@ -858,73 +882,64 @@ impl DeltaLayerInner { Ok(()) } - async fn plan_reads( - &self, + async fn plan_reads( keyspace: KeySpace, lsn_range: Range, + data_end_offset: u64, + index_reader: DiskBtreeReader, + mut planner: VectoredReadPlanner, reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, - ) -> anyhow::Result> { - let mut planner = VectoredReadPlanner::new( - self.max_vectored_read_bytes - .expect("Layer is loaded with max vectored bytes config") - .0 - .into(), - ); - - let block_reader = FileBlockReader::new(&self.file, self.file_id); - let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( - self.index_start_blk, - self.index_root_blk, - block_reader, - ); + ) -> anyhow::Result> + where + Reader: BlockReader, + { + let ctx = RequestContextBuilder::extend(ctx) + .page_content_kind(PageContentKind::DeltaLayerBtreeNode) + .build(); for range in keyspace.ranges.iter() { let mut range_end_handled = false; let start_key = DeltaKey::from_key_lsn(&range.start, lsn_range.start); - tree_reader - .visit( - &start_key.0, - VisitDirection::Forwards, - |raw_key, value| { - let key = Key::from_slice(&raw_key[..KEY_SIZE]); - let lsn = DeltaKey::extract_lsn_from_buf(raw_key); - let blob_ref = BlobRef(value); + let index_stream = index_reader.get_stream_from(&start_key.0, &ctx); + let mut index_stream = std::pin::pin!(index_stream); - assert!(key >= range.start && lsn >= lsn_range.start); + while let Some(index_entry) = index_stream.next().await { + let (raw_key, value) = index_entry?; + let key = Key::from_slice(&raw_key[..KEY_SIZE]); + let lsn = DeltaKey::extract_lsn_from_buf(&raw_key); + let blob_ref = BlobRef(value); - let cached_lsn = reconstruct_state.get_cached_lsn(&key); - let flag = { - if cached_lsn >= Some(lsn) { - BlobFlag::Ignore - } else if blob_ref.will_init() { - BlobFlag::Replaces - } else { - BlobFlag::None - } - }; + // Lsns are not 
monotonically increasing across keys, so we don't assert on them. + assert!(key >= range.start); - if key >= range.end || (key.next() == range.end && lsn >= lsn_range.end) { - planner.handle_range_end(blob_ref.pos()); - range_end_handled = true; - false - } else { - planner.handle(key, lsn, blob_ref.pos(), flag); - true - } - }, - &RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::DeltaLayerBtreeNode) - .build(), - ) - .await - .map_err(|err| anyhow!(err))?; + let outside_lsn_range = !lsn_range.contains(&lsn); + let below_cached_lsn = reconstruct_state.get_cached_lsn(&key) >= Some(lsn); + + let flag = { + if outside_lsn_range || below_cached_lsn { + BlobFlag::Ignore + } else if blob_ref.will_init() { + BlobFlag::ReplaceAll + } else { + // Usual path: add blob to the read + BlobFlag::None + } + }; + + if key >= range.end || (key.next() == range.end && lsn >= lsn_range.end) { + planner.handle_range_end(blob_ref.pos()); + range_end_handled = true; + break; + } else { + planner.handle(key, lsn, blob_ref.pos(), flag); + } + } if !range_end_handled { - let payload_end = self.index_start_blk as u64 * PAGE_SZ as u64; - tracing::info!("Handling range end fallback at {}", payload_end); - planner.handle_range_end(payload_end); + tracing::info!("Handling range end fallback at {}", data_end_offset); + planner.handle_range_end(data_end_offset); } } @@ -1190,3 +1205,131 @@ impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for Del self.size } } + +#[cfg(test)] +mod test { + use std::collections::BTreeMap; + + use super::*; + use crate::{ + context::DownloadBehavior, task_mgr::TaskKind, tenant::disk_btree::tests::TestDisk, + }; + + /// Construct an index for a fictional delta layer and and then + /// traverse in order to plan vectored reads for a query. Finally, + /// verify that the traversal fed the right index key and value + /// pairs into the planner. 
+ #[tokio::test] + async fn test_delta_layer_index_traversal() { + let base_key = Key { + field1: 0, + field2: 1663, + field3: 12972, + field4: 16396, + field5: 0, + field6: 246080, + }; + + // Populate the index with some entries + let entries: BTreeMap> = BTreeMap::from([ + (base_key, vec![Lsn(1), Lsn(5), Lsn(25), Lsn(26), Lsn(28)]), + (base_key.add(1), vec![Lsn(2), Lsn(5), Lsn(10), Lsn(50)]), + (base_key.add(2), vec![Lsn(2), Lsn(5), Lsn(10), Lsn(50)]), + (base_key.add(5), vec![Lsn(10), Lsn(15), Lsn(16), Lsn(20)]), + ]); + + let mut disk = TestDisk::default(); + let mut writer = DiskBtreeBuilder::<_, DELTA_KEY_SIZE>::new(&mut disk); + + let mut disk_offset = 0; + for (key, lsns) in &entries { + for lsn in lsns { + let index_key = DeltaKey::from_key_lsn(key, *lsn); + let blob_ref = BlobRef::new(disk_offset, false); + writer + .append(&index_key.0, blob_ref.0) + .expect("In memory disk append should never fail"); + + disk_offset += 1; + } + } + + // Prepare all the arguments for the call into `plan_reads` below + let (root_offset, _writer) = writer + .finish() + .expect("In memory disk finish should never fail"); + let reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(0, root_offset, disk); + let planner = VectoredReadPlanner::new(100); + let mut reconstruct_state = ValuesReconstructState::new(); + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + + let keyspace = KeySpace { + ranges: vec![ + base_key..base_key.add(3), + base_key.add(3)..base_key.add(100), + ], + }; + let lsn_range = Lsn(2)..Lsn(40); + + // Plan and validate + let vectored_reads = DeltaLayerInner::plan_reads( + keyspace.clone(), + lsn_range.clone(), + disk_offset, + reader, + planner, + &mut reconstruct_state, + &ctx, + ) + .await + .expect("Read planning should not fail"); + + validate(keyspace, lsn_range, vectored_reads, entries); + } + + fn validate( + keyspace: KeySpace, + lsn_range: Range, + vectored_reads: Vec, + index_entries: BTreeMap>, + ) { + #[derive(Debug, PartialEq, Eq)] + struct BlobSpec { + key: Key, + lsn: Lsn, + at: u64, + } + + let mut planned_blobs = Vec::new(); + for read in vectored_reads { + for (at, meta) in read.blobs_at.as_slice() { + planned_blobs.push(BlobSpec { + key: meta.key, + lsn: meta.lsn, + at: *at, + }); + } + } + + let mut expected_blobs = Vec::new(); + let mut disk_offset = 0; + for (key, lsns) in index_entries { + for lsn in lsns { + let key_included = keyspace.ranges.iter().any(|range| range.contains(&key)); + let lsn_included = lsn_range.contains(&lsn); + + if key_included && lsn_included { + expected_blobs.push(BlobSpec { + key, + lsn, + at: disk_offset, + }); + } + + disk_offset += 1; + } + } + + assert_eq!(planned_blobs, expected_blobs); + } +} diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 56cfaeda15..14c79e413c 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -43,6 +43,7 @@ use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; use anyhow::{anyhow, bail, ensure, Context, Result}; use bytes::{Bytes, BytesMut}; use camino::{Utf8Path, Utf8PathBuf}; +use hex; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::LayerAccessKind; use pageserver_api::shard::TenantShardId; @@ -54,6 +55,7 @@ use std::ops::Range; use std::os::unix::prelude::FileExt; use std::sync::Arc; use tokio::sync::OnceCell; +use tokio_stream::StreamExt; use tracing::*; use utils::{ @@ -488,35 +490,33 @@ impl 
ImageLayerInner { let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader); + let ctx = RequestContextBuilder::extend(ctx) + .page_content_kind(PageContentKind::ImageLayerBtreeNode) + .build(); + for range in keyspace.ranges.iter() { let mut range_end_handled = false; let mut search_key: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; range.start.write_to_byte_slice(&mut search_key); - tree_reader - .visit( - &search_key, - VisitDirection::Forwards, - |raw_key, offset| { - let key = Key::from_slice(&raw_key[..KEY_SIZE]); - assert!(key >= range.start); + let index_stream = tree_reader.get_stream_from(&search_key, &ctx); + let mut index_stream = std::pin::pin!(index_stream); - if key >= range.end { - planner.handle_range_end(offset); - range_end_handled = true; - false - } else { - planner.handle(key, self.lsn, offset, BlobFlag::None); - true - } - }, - &RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::ImageLayerBtreeNode) - .build(), - ) - .await - .map_err(|err| GetVectoredError::Other(anyhow!(err)))?; + while let Some(index_entry) = index_stream.next().await { + let (raw_key, offset) = index_entry?; + + let key = Key::from_slice(&raw_key[..KEY_SIZE]); + assert!(key >= range.start); + + if key >= range.end { + planner.handle_range_end(offset); + range_end_handled = true; + break; + } else { + planner.handle(key, self.lsn, offset, BlobFlag::None); + } + } if !range_end_handled { let payload_end = self.index_start_blk as u64 * PAGE_SZ as u64; diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index a8d9649d36..805f70b23b 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -128,7 +128,7 @@ impl VectoredReadBuilder { pub enum BlobFlag { None, Ignore, - Replaces, + ReplaceAll, } /// Planner for vectored blob reads. @@ -170,7 +170,7 @@ impl VectoredReadPlanner { /// incorrect data to the user. /// /// The `flag` argument has two interesting values: - /// * [`BlobFlag::Replaces`]: The blob for this key should replace all existing blobs. + /// * [`BlobFlag::ReplaceAll`]: The blob for this key should replace all existing blobs. /// This is used for WAL records that `will_init`. /// * [`BlobFlag::Ignore`]: This blob should not be included in the read. This happens /// if the blob is cached. 
@@ -204,7 +204,7 @@ impl VectoredReadPlanner { let blobs_for_key = self.blobs.entry(key).or_default(); blobs_for_key.push((lsn, start_offset, end_offset)); } - BlobFlag::Replaces => { + BlobFlag::ReplaceAll => { let blobs_for_key = self.blobs.entry(key).or_default(); blobs_for_key.clear(); blobs_for_key.push((lsn, start_offset, end_offset)); @@ -411,10 +411,10 @@ mod tests { let blob_descriptions = vec![ (first_key, lsn, 0, BlobFlag::None), // First in read 1 (first_key, lsn, 1024, BlobFlag::None), // Last in read 1 - (second_key, lsn, 2 * 1024, BlobFlag::Replaces), + (second_key, lsn, 2 * 1024, BlobFlag::ReplaceAll), (second_key, lsn, 3 * 1024, BlobFlag::None), - (second_key, lsn, 4 * 1024, BlobFlag::Replaces), // First in read 2 - (second_key, lsn, 5 * 1024, BlobFlag::None), // Last in read 2 + (second_key, lsn, 4 * 1024, BlobFlag::ReplaceAll), // First in read 2 + (second_key, lsn, 5 * 1024, BlobFlag::None), // Last in read 2 ]; let ranges = [&blob_descriptions[0..2], &blob_descriptions[4..]]; From 270d3be507643f068120b52838c497f6c1b45b61 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 5 Mar 2024 14:44:00 +0100 Subject: [PATCH 0340/1571] feat(per-tenant throttling): exclude throttled time from page_service metrics + regression test (#6953) part of https://github.com/neondatabase/neon/issues/5899 Problem ------- Before this PR, the time spent waiting on the throttle was charged towards the higher-level page_service metrics, i.e., `pageserver_smgr_query_seconds`. The metrics are the foundation of internal SLIs / SLOs. A throttled tenant would cause the SLI to degrade / SLO alerts to fire. Changes ------- - don't charge time spent in throttle towards the page_service metrics - record time spent in throttle in RequestContext and subtract it from the elapsed time - this works because the page_service path doesn't create child context, so, all the throttle time is recorded in the parent - it's quite brittle and will break if we ever decide to spawn child tasks that need child RequestContexts, which would have separate instances of the `micros_spent_throttled` counter. - however, let's punt that to a more general refactoring of RequestContext - add a test case that ensures that - throttling happens for getpage requests; this aspect of the test passed before this PR - throttling delays aren't charged towards the page_service metrics; this aspect of the test only passes with this PR - drive-by: make the throttle log message `info!`, it's an expected condition Performance ----------- I took the same measurements as in #6706 , no meaningful change in CPU overhead. Future Work ----------- This PR enables us to experiment with the throttle for select tenants without affecting the SLI metrics / triggering SLO alerts. Before declaring this feature done, we need more work to happen, specifically: - decide on whether we want to retain the flexibility of throttling any `Timeline::get` call, filtered by TaskKind - versus: separate throttles for each page_service endpoint, potentially with separate config options - the trouble here is that this decision implies changes to the TenantConfig, so, if we start using the current config style now, then decide to switch to a different config, it'll be a breaking change Nice-to-haves but probably not worth the time right now: - Equivalent tests to ensure the throttle applies to all other page_service handlers. 
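Below is a minimal, self-contained sketch of the pattern this change describes, in isolation from the pageserver code: the throttle charges its wait time to a counter carried on the request context, and the per-query metrics timer subtracts that counter from wall-clock elapsed time when it drops. Type and function names (`Ctx`, `QueryTimer`, `add_throttled`) are illustrative only and do not match the actual pageserver types.

```rust
use std::sync::atomic::{AtomicU64, Ordering};
use std::time::{Duration, Instant};

/// Stand-in for the per-request context carrying the throttled-time counter.
#[derive(Default)]
struct Ctx {
    micros_spent_throttled: AtomicU64,
}

impl Ctx {
    /// Called by the throttle each time it makes this request wait.
    fn add_throttled(&self, wait: Duration) {
        self.micros_spent_throttled
            .fetch_add(wait.as_micros() as u64, Ordering::Relaxed);
    }
}

/// Stand-in for the smgr query timer: on drop it observes wall-clock time
/// minus whatever the throttle charged to the same context.
struct QueryTimer<'a> {
    ctx: &'a Ctx,
    start: Instant,
}

impl<'a> Drop for QueryTimer<'a> {
    fn drop(&mut self) {
        let elapsed = self.start.elapsed();
        let throttled =
            Duration::from_micros(self.ctx.micros_spent_throttled.load(Ordering::Relaxed));
        // Saturate instead of erroring; the real code falls back to the raw
        // elapsed time and logs a rate-limited warning in that case.
        let ex_throttled = elapsed.checked_sub(throttled).unwrap_or(Duration::ZERO);
        println!("observed {ex_throttled:?} (wall clock {elapsed:?})");
    }
}

fn main() {
    let ctx = Ctx::default();
    let _timer = QueryTimer { ctx: &ctx, start: Instant::now() };
    // Simulate a throttle that delayed this request by 5ms.
    ctx.add_throttled(Duration::from_millis(5));
    // ... handle the getpage request ...
} // timer drops here and records elapsed-minus-throttled
```

Note this only works as long as the page_service path keeps using a single `RequestContext` for the whole request; a child context with its own counter would silently stop being deducted, which is the brittleness called out above.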
--- pageserver/src/context.rs | 7 +- pageserver/src/context/optional_counter.rs | 101 +++++++++++++++ pageserver/src/metrics.rs | 68 +++++++++- pageserver/src/page_service.rs | 10 +- pageserver/src/tenant/tasks.rs | 2 +- pageserver/src/tenant/throttle.rs | 17 ++- .../test_pageserver_getpage_throttle.py | 118 ++++++++++++++++++ 7 files changed, 308 insertions(+), 15 deletions(-) create mode 100644 pageserver/src/context/optional_counter.rs create mode 100644 test_runner/regress/test_pageserver_getpage_throttle.py diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs index ee331ea154..86d0390c30 100644 --- a/pageserver/src/context.rs +++ b/pageserver/src/context.rs @@ -88,13 +88,16 @@ use crate::task_mgr::TaskKind; +pub(crate) mod optional_counter; + // The main structure of this module, see module-level comment. -#[derive(Clone, Debug)] +#[derive(Debug)] pub struct RequestContext { task_kind: TaskKind, download_behavior: DownloadBehavior, access_stats_behavior: AccessStatsBehavior, page_content_kind: PageContentKind, + pub micros_spent_throttled: optional_counter::MicroSecondsCounterU32, } /// The kind of access to the page cache. @@ -150,6 +153,7 @@ impl RequestContextBuilder { download_behavior: DownloadBehavior::Download, access_stats_behavior: AccessStatsBehavior::Update, page_content_kind: PageContentKind::Unknown, + micros_spent_throttled: Default::default(), }, } } @@ -163,6 +167,7 @@ impl RequestContextBuilder { download_behavior: original.download_behavior, access_stats_behavior: original.access_stats_behavior, page_content_kind: original.page_content_kind, + micros_spent_throttled: Default::default(), }, } } diff --git a/pageserver/src/context/optional_counter.rs b/pageserver/src/context/optional_counter.rs new file mode 100644 index 0000000000..100c649f18 --- /dev/null +++ b/pageserver/src/context/optional_counter.rs @@ -0,0 +1,101 @@ +use std::{ + sync::atomic::{AtomicU32, Ordering}, + time::Duration, +}; + +#[derive(Debug)] +pub struct CounterU32 { + inner: AtomicU32, +} +impl Default for CounterU32 { + fn default() -> Self { + Self { + inner: AtomicU32::new(u32::MAX), + } + } +} +impl CounterU32 { + pub fn open(&self) -> Result<(), &'static str> { + match self + .inner + .compare_exchange(u32::MAX, 0, Ordering::Relaxed, Ordering::Relaxed) + { + Ok(_) => Ok(()), + Err(_) => Err("open() called on clsoed state"), + } + } + pub fn close(&self) -> Result { + match self.inner.swap(u32::MAX, Ordering::Relaxed) { + u32::MAX => Err("close() called on closed state"), + x => Ok(x), + } + } + + pub fn add(&self, count: u32) -> Result<(), &'static str> { + if count == 0 { + return Ok(()); + } + let mut had_err = None; + self.inner + .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |cur| match cur { + u32::MAX => { + had_err = Some("add() called on closed state"); + None + } + x => { + let (new, overflowed) = x.overflowing_add(count); + if new == u32::MAX || overflowed { + had_err = Some("add() overflowed the counter"); + None + } else { + Some(new) + } + } + }) + .map_err(|_| had_err.expect("we set it whenever the function returns None")) + .map(|_| ()) + } +} + +#[derive(Default, Debug)] +pub struct MicroSecondsCounterU32 { + inner: CounterU32, +} + +impl MicroSecondsCounterU32 { + pub fn open(&self) -> Result<(), &'static str> { + self.inner.open() + } + pub fn add(&self, duration: Duration) -> Result<(), &'static str> { + match duration.as_micros().try_into() { + Ok(x) => self.inner.add(x), + Err(_) => Err("add(): duration conversion error"), + } + } + pub fn 
close_and_checked_sub_from(&self, from: Duration) -> Result { + let val = self.inner.close()?; + let val = Duration::from_micros(val as u64); + let subbed = match from.checked_sub(val) { + Some(v) => v, + None => return Err("Duration::checked_sub"), + }; + Ok(subbed) + } +} + +#[cfg(test)] +mod tests { + + use super::*; + + #[test] + fn test_basic() { + let counter = MicroSecondsCounterU32::default(); + counter.open().unwrap(); + counter.add(Duration::from_micros(23)).unwrap(); + let res = counter + .close_and_checked_sub_from(Duration::from_micros(42)) + .unwrap(); + assert_eq!(res, Duration::from_micros(42 - 23)); + } +} diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index ce5561b431..ee62ee0367 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -11,6 +11,7 @@ use once_cell::sync::Lazy; use pageserver_api::shard::TenantShardId; use strum::{EnumCount, IntoEnumIterator, VariantNames}; use strum_macros::{EnumVariantNames, IntoStaticStr}; +use tracing::warn; use utils::id::TimelineId; /// Prometheus histogram buckets (in seconds) for operations in the critical @@ -1005,15 +1006,39 @@ impl GlobalAndPerTimelineHistogram { } } -struct GlobalAndPerTimelineHistogramTimer<'a> { +struct GlobalAndPerTimelineHistogramTimer<'a, 'c> { h: &'a GlobalAndPerTimelineHistogram, + ctx: &'c RequestContext, start: std::time::Instant, + op: SmgrQueryType, } -impl<'a> Drop for GlobalAndPerTimelineHistogramTimer<'a> { +impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> { fn drop(&mut self) { let elapsed = self.start.elapsed(); - self.h.observe(elapsed.as_secs_f64()); + let ex_throttled = self + .ctx + .micros_spent_throttled + .close_and_checked_sub_from(elapsed); + let ex_throttled = match ex_throttled { + Ok(res) => res, + Err(error) => { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy>> = + Lazy::new(|| { + Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| { + RateLimit::new(Duration::from_secs(10)) + }))) + }); + let mut guard = LOGGED.lock().unwrap(); + let rate_limit = &mut guard[self.op]; + rate_limit.call(|| { + warn!(op=?self.op, error, "error deducting time spent throttled; this message is logged at a global rate limit"); + }); + elapsed + } + }; + self.h.observe(ex_throttled.as_secs_f64()); } } @@ -1025,6 +1050,7 @@ impl<'a> Drop for GlobalAndPerTimelineHistogramTimer<'a> { strum_macros::EnumCount, strum_macros::EnumIter, strum_macros::FromRepr, + enum_map::Enum, )] #[strum(serialize_all = "snake_case")] pub enum SmgrQueryType { @@ -1130,11 +1156,35 @@ impl SmgrQueryTimePerTimeline { }); Self { metrics } } - pub(crate) fn start_timer(&self, op: SmgrQueryType) -> impl Drop + '_ { + pub(crate) fn start_timer<'c: 'a, 'a>( + &'a self, + op: SmgrQueryType, + ctx: &'c RequestContext, + ) -> impl Drop + '_ { let metric = &self.metrics[op as usize]; + let start = Instant::now(); + match ctx.micros_spent_throttled.open() { + Ok(()) => (), + Err(error) => { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy>> = + Lazy::new(|| { + Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| { + RateLimit::new(Duration::from_secs(10)) + }))) + }); + let mut guard = LOGGED.lock().unwrap(); + let rate_limit = &mut guard[op]; + rate_limit.call(|| { + warn!(?op, error, "error opening micros_spent_throttled; this message is logged at a global rate limit"); + }); + } + } GlobalAndPerTimelineHistogramTimer { h: metric, - start: std::time::Instant::now(), + ctx, + start, + op, } } } @@ -1145,6 +1195,11 @@ mod smgr_query_time_tests 
{ use strum::IntoEnumIterator; use utils::id::{TenantId, TimelineId}; + use crate::{ + context::{DownloadBehavior, RequestContext}, + task_mgr::TaskKind, + }; + // Regression test, we used hard-coded string constants before using an enum. #[test] fn op_label_name() { @@ -1193,7 +1248,8 @@ mod smgr_query_time_tests { let (pre_global, pre_per_tenant_timeline) = get_counts(); assert_eq!(pre_per_tenant_timeline, 0); - let timer = metrics.start_timer(*op); + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download); + let timer = metrics.start_timer(*op, &ctx); drop(timer); let (post_global, post_per_tenant_timeline) = get_counts(); diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 689bc5cb3c..dacee41e6e 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -910,7 +910,7 @@ impl PageServerHandler { let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?; let _timer = timeline .query_metrics - .start_timer(metrics::SmgrQueryType::GetRelExists); + .start_timer(metrics::SmgrQueryType::GetRelExists, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = @@ -938,7 +938,7 @@ impl PageServerHandler { let _timer = timeline .query_metrics - .start_timer(metrics::SmgrQueryType::GetRelSize); + .start_timer(metrics::SmgrQueryType::GetRelSize, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = @@ -966,7 +966,7 @@ impl PageServerHandler { let _timer = timeline .query_metrics - .start_timer(metrics::SmgrQueryType::GetDbSize); + .start_timer(metrics::SmgrQueryType::GetDbSize, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = @@ -1144,7 +1144,7 @@ impl PageServerHandler { let _timer = timeline .query_metrics - .start_timer(metrics::SmgrQueryType::GetPageAtLsn); + .start_timer(metrics::SmgrQueryType::GetPageAtLsn, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = @@ -1172,7 +1172,7 @@ impl PageServerHandler { let _timer = timeline .query_metrics - .start_timer(metrics::SmgrQueryType::GetSlruSegment); + .start_timer(metrics::SmgrQueryType::GetSlruSegment, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 45ce6c9381..57c3edcddd 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -217,7 +217,7 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { } let allowed_rps = tenant.timeline_get_throttle.steady_rps(); let delta = now - prev; - warn!( + info!( n_seconds=%format_args!("{:.3}", delta.as_secs_f64()), count_accounted, diff --git a/pageserver/src/tenant/throttle.rs b/pageserver/src/tenant/throttle.rs index 6894a88b93..280773e9c3 100644 --- a/pageserver/src/tenant/throttle.rs +++ b/pageserver/src/tenant/throttle.rs @@ -2,14 +2,14 @@ use std::{ str::FromStr, sync::{ atomic::{AtomicU64, Ordering}, - Arc, + Arc, Mutex, }, time::{Duration, Instant}, }; use arc_swap::ArcSwap; use enumset::EnumSet; -use tracing::error; +use tracing::{error, warn}; use crate::{context::RequestContext, task_mgr::TaskKind}; @@ -157,6 +157,19 @@ where .fetch_add(wait_time.as_micros() as u64, Ordering::Relaxed); let observation = Observation { wait_time }; self.metric.observe_throttling(&observation); + match ctx.micros_spent_throttled.add(wait_time) { + Ok(res) => res, + Err(error) => { + use once_cell::sync::Lazy; + use utils::rate_limit::RateLimit; + static 
WARN_RATE_LIMIT: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); + let mut guard = WARN_RATE_LIMIT.lock().unwrap(); + guard.call(move || { + warn!(error, "error adding time spent throttled; this message is logged at a global rate limit"); + }); + } + } } } } diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py new file mode 100644 index 0000000000..42cc28efee --- /dev/null +++ b/test_runner/regress/test_pageserver_getpage_throttle.py @@ -0,0 +1,118 @@ +import json +import uuid + +from anyio import Path +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder, PgBin +from fixtures.pg_version import PgVersion +from fixtures.types import TenantId, TimelineId +from fixtures.utils import wait_until + + +def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): + env = neon_env_builder.init_start() + + env.pageserver.tenant_detach(env.initial_tenant) + + env.pageserver.allowed_errors.append( + # https://github.com/neondatabase/neon/issues/6925 + r".*query handler for.*pagestream.*failed: unexpected message: CopyFail during COPY.*" + ) + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + + rate_limit_rps = 100 + compaction_period = 5 + env.pageserver.tenant_create( + tenant_id, + conf={ + "compaction_period": f"{compaction_period}s", + "timeline_get_throttle": { + "task_kinds": ["PageRequestHandler"], + "initial": 0, + "refill_interval": "100ms", + "refill_amount": int(rate_limit_rps / 10), + "max": int(rate_limit_rps / 10), + "fair": True, + }, + }, + ) + + ps_http = env.pageserver.http_client() + + ps_http.timeline_create(PgVersion.V16, tenant_id, timeline_id) + + def run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs: int): + cmd = [ + str(env.neon_binpath / "pagebench"), + "get-page-latest-lsn", + "--mgmt-api-endpoint", + ps_http.base_url, + "--page-service-connstring", + env.pageserver.connstr(password=None), + "--runtime", + f"{duration_secs}s", + f"{tenant_id}/{timeline_id}", + ] + + basepath = pg_bin.run_capture(cmd, with_command_header=False) + results_path = Path(basepath + ".stdout") + log.info(f"Benchmark results at: {results_path}") + + with open(results_path, "r") as f: + results = json.load(f) + log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}") + return int(results["total"]["request_count"]) + + log.info("warmup / make sure metrics are present") + run_pagebench_at_max_speed_and_get_total_requests_completed(2) + metrics_query = { + "tenant_id": str(tenant_id), + "timeline_id": str(timeline_id), + "smgr_query_type": "get_page_at_lsn", + } + metric_name = "pageserver_smgr_query_seconds_sum" + smgr_query_seconds_pre = ps_http.get_metric_value(metric_name, metrics_query) + assert smgr_query_seconds_pre is not None + + marker = uuid.uuid4().hex + ps_http.post_tracing_event("info", marker) + _, marker_offset = wait_until( + 10, 0.5, lambda: env.pageserver.assert_log_contains(marker, offset=None) + ) + + log.info("run pagebench") + duration_secs = 10 + actual_ncompleted = run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs) + + log.info("validate the client is capped at the configured rps limit") + expect_ncompleted = duration_secs * rate_limit_rps + delta_abs = abs(expect_ncompleted - actual_ncompleted) + threshold = 0.05 * expect_ncompleted + assert ( + threshold / rate_limit_rps < 0.1 * duration_secs + ), "test self-test: unrealistic 
expecations regarding precision in this test" + assert ( + delta_abs < 0.05 * expect_ncompleted + ), "the throttling deviates more than 5percent from the expectation" + + log.info("validate that we logged the throttling") + + wait_until( + 10, + compaction_period / 10, + lambda: env.pageserver.assert_log_contains( + f".*{tenant_id}.*shard was throttled in the last n_seconds.*", + offset=marker_offset, + ), + ) + + log.info("validate that the metric doesn't include throttle wait time") + smgr_query_seconds_post = ps_http.get_metric_value(metric_name, metrics_query) + assert smgr_query_seconds_post is not None + actual_smgr_query_seconds = smgr_query_seconds_post - smgr_query_seconds_pre + + assert ( + duration_secs >= 10 * actual_smgr_query_seconds + ), "smgr metrics should not include throttle wait time" From bdbb2f4afc8c02620b45d52fecd71fdeb848a3c9 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Tue, 5 Mar 2024 19:02:51 +0400 Subject: [PATCH 0341/1571] proxy: report redis broken message metric (#7021) ## Problem Not really a problem. Improving visibility around redis communication. ## Summary of changes Added metric on the number of broken messages. --- proxy/src/metrics.rs | 9 +++++++++ proxy/src/redis/notifications.rs | 4 ++++ 2 files changed, 13 insertions(+) diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 66031f5eb2..2464b1e611 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -303,3 +303,12 @@ pub static ENDPOINT_ERRORS_BY_KIND: Lazy> = Lazy::new(|| { ) .unwrap() }); + +pub static REDIS_BROKEN_MESSAGES: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "proxy_redis_errors_total", + "Number of errors by a given classification", + &["channel"], + ) + .unwrap() +}); diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index b8297a206c..6ae848c0d2 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -10,6 +10,7 @@ use crate::{ cache::project_info::ProjectInfoCache, cancellation::{CancelMap, CancellationHandler, NotificationsCancellationHandler}, intern::{ProjectIdInt, RoleNameInt}, + metrics::REDIS_BROKEN_MESSAGES, }; const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; @@ -115,6 +116,9 @@ impl< let msg: Notification = match serde_json::from_str(&payload) { Ok(msg) => msg, Err(e) => { + REDIS_BROKEN_MESSAGES + .with_label_values(&[msg.get_channel_name()]) + .inc(); tracing::error!("broken message: {e}"); return Ok(()); } From b036c32262871a0942211c4fba6a7099cfacacd7 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Tue, 5 Mar 2024 10:03:44 -0500 Subject: [PATCH 0342/1571] fix -Wmissing-prototypes for neon extension (#7010) ## Problem ref https://github.com/neondatabase/neon/issues/6188 ## Summary of changes This pull request fixes `-Wmissing-prototypes` for the neon extension. Note that (1) the gcc version in CI and macOS is different, therefore some of the warning does not get reported when developing the neon extension locally. (2) the CI env variable `COPT = -Werror` does not get passed into the docker build process, therefore warnings are not treated as errors on CI. https://github.com/neondatabase/neon/blob/e62baa97041e10ce45772b3724e24e679a650d69/.github/workflows/build_and_test.yml#L22 There will be follow-up pull requests on solving other warnings. By the way, I did not figure out the default compile parameters in the CI env, and therefore this pull request is tested by manually adding `-Wmissing-prototypes` into the `COPT`. 
Signed-off-by: Alex Chi Z --- pgxn/neon/control_plane_connector.c | 11 ++++++----- pgxn/neon/control_plane_connector.h | 2 +- pgxn/neon/extension_server.c | 1 + pgxn/neon/extension_server.h | 17 +++++++++++++++++ pgxn/neon/neon.c | 1 + pgxn/neon/neon.h | 3 +-- pgxn/neon/neon_utils.c | 3 ++- pgxn/neon/neon_utils.h | 2 +- pgxn/neon/walproposer.c | 4 ++-- pgxn/neon/walproposer_pg.c | 2 +- 10 files changed, 33 insertions(+), 13 deletions(-) create mode 100644 pgxn/neon/extension_server.h diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c index 00a582d718..93252e6b29 100644 --- a/pgxn/neon/control_plane_connector.c +++ b/pgxn/neon/control_plane_connector.c @@ -35,6 +35,7 @@ #include "utils/memutils.h" #include "utils/jsonb.h" +#include "control_plane_connector.h" #include "neon_utils.h" static ProcessUtility_hook_type PreviousProcessUtilityHook = NULL; @@ -113,6 +114,8 @@ ConstructDeltaMessage() if (RootTable.db_table) { JsonbValue dbs; + HASH_SEQ_STATUS status; + DbEntry *entry; dbs.type = jbvString; dbs.val.string.val = "dbs"; @@ -120,9 +123,6 @@ ConstructDeltaMessage() pushJsonbValue(&state, WJB_KEY, &dbs); pushJsonbValue(&state, WJB_BEGIN_ARRAY, NULL); - HASH_SEQ_STATUS status; - DbEntry *entry; - hash_seq_init(&status, RootTable.db_table); while ((entry = hash_seq_search(&status)) != NULL) { @@ -168,8 +168,9 @@ ConstructDeltaMessage() #else const char *logdetail; #endif + char *encrypted_password; PushKeyValue(&state, "password", (char *) entry->password); - char *encrypted_password = get_role_password(entry->name, &logdetail); + encrypted_password = get_role_password(entry->name, &logdetail); if (encrypted_password) { @@ -831,7 +832,7 @@ NeonProcessUtility( } } -extern void +void InitControlPlaneConnector() { PreviousProcessUtilityHook = ProcessUtility_hook; diff --git a/pgxn/neon/control_plane_connector.h b/pgxn/neon/control_plane_connector.h index 12d6a97562..7eed449200 100644 --- a/pgxn/neon/control_plane_connector.h +++ b/pgxn/neon/control_plane_connector.h @@ -1,6 +1,6 @@ #ifndef CONTROL_PLANE_CONNECTOR_H #define CONTROL_PLANE_CONNECTOR_H -void InitControlPlaneConnector(); +void InitControlPlaneConnector(void); #endif diff --git a/pgxn/neon/extension_server.c b/pgxn/neon/extension_server.c index 039405e2cd..1329e2d17b 100644 --- a/pgxn/neon/extension_server.c +++ b/pgxn/neon/extension_server.c @@ -14,6 +14,7 @@ #include "utils/guc.h" +#include "extension_server.h" #include "neon_utils.h" static int extension_server_port = 0; diff --git a/pgxn/neon/extension_server.h b/pgxn/neon/extension_server.h new file mode 100644 index 0000000000..3e67708b85 --- /dev/null +++ b/pgxn/neon/extension_server.h @@ -0,0 +1,17 @@ +/*------------------------------------------------------------------------- + * + * extension_server.h + * Request compute_ctl to download extension files. 
+ * + * IDENTIFICATION + * contrib/neon/extension_server.h + * + *------------------------------------------------------------------------- + */ + +#ifndef EXTENSION_SERVER_H +#define EXTENSION_SERVER_H + +void pg_init_extension_server(void); + +#endif /* EXTENSION_SERVER_H */ diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index a14288b33a..1f456d9a3f 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -29,6 +29,7 @@ #include "utils/guc.h" #include "utils/wait_event.h" +#include "extension_server.h" #include "neon.h" #include "walproposer.h" #include "pagestore_client.h" diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h index c3afecc679..a0f8c97497 100644 --- a/pgxn/neon/neon.h +++ b/pgxn/neon/neon.h @@ -25,12 +25,11 @@ extern int wal_acceptor_connection_timeout; extern void pg_init_libpagestore(void); extern void pg_init_walproposer(void); -extern void pg_init_extension_server(void); - extern uint64 BackpressureThrottlingTime(void); extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn); extern void PGDLLEXPORT WalProposerSync(int argc, char *argv[]); extern void PGDLLEXPORT WalProposerMain(Datum main_arg); +PGDLLEXPORT void LogicalSlotsMonitorMain(Datum main_arg); #endif /* NEON_H */ diff --git a/pgxn/neon/neon_utils.c b/pgxn/neon/neon_utils.c index ce554c89df..1fb4ed9522 100644 --- a/pgxn/neon/neon_utils.c +++ b/pgxn/neon/neon_utils.c @@ -6,6 +6,7 @@ #include "postgres.h" +#include "neon_utils.h" #include "lib/stringinfo.h" #include "libpq/pqformat.h" @@ -14,7 +15,7 @@ * * Returns -1 if the character is not a hexadecimal digit. */ -int +static int HexDecodeChar(char c) { if (c >= '0' && c <= '9') diff --git a/pgxn/neon/neon_utils.h b/pgxn/neon/neon_utils.h index 10d41db102..89683714f1 100644 --- a/pgxn/neon/neon_utils.h +++ b/pgxn/neon/neon_utils.h @@ -12,7 +12,7 @@ uint32 pq_getmsgint32_le(StringInfo msg); uint64 pq_getmsgint64_le(StringInfo msg); void pq_sendint32_le(StringInfo buf, uint32 i); void pq_sendint64_le(StringInfo buf, uint64 i); -extern void disable_core_dump(); +void disable_core_dump(void); #ifndef WALPROPOSER_LIB diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 0d5007ef73..10487636ae 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -1460,7 +1460,7 @@ RecvAppendResponses(Safekeeper *sk) } /* Parse a PageserverFeedback message, or the PageserverFeedback part of an AppendResponse */ -void +static void ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, PageserverFeedback *rf) { uint8 nkeys; @@ -1590,9 +1590,9 @@ GetAcknowledgedByQuorumWALPosition(WalProposer *wp) Safekeeper * GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn) { - *donor_lsn = InvalidXLogRecPtr; Safekeeper *donor = NULL; int i; + *donor_lsn = InvalidXLogRecPtr; if (wp->n_votes < wp->quorum) { diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 61a2a54809..7f07913fa6 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -398,7 +398,7 @@ walprop_pg_get_shmem_state(WalProposer *wp) return walprop_shared; } -void +static void replication_feedback_set(PageserverFeedback *rf) { SpinLockAcquire(&walprop_shared->mutex); From e69a25542b4b696bcec6cd47aec62c06217a0958 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 5 Mar 2024 16:26:51 +0100 Subject: [PATCH 0343/1571] Minor improvements to tiered compaction (#7020) Minor non-functional improvements to tiered compaction, mostly consisting of comment fixes. 
Followup of #6830, part of #6768 --- pageserver/compaction/src/compact_tiered.rs | 21 ++++--------- pageserver/compaction/src/identify_levels.rs | 19 ++++++------ pageserver/compaction/src/interface.rs | 31 ++++++++++---------- pageserver/compaction/src/simulator.rs | 1 - pageserver/src/tenant/timeline/compaction.rs | 1 - 5 files changed, 30 insertions(+), 43 deletions(-) diff --git a/pageserver/compaction/src/compact_tiered.rs b/pageserver/compaction/src/compact_tiered.rs index 52219a014c..60fc7ac925 100644 --- a/pageserver/compaction/src/compact_tiered.rs +++ b/pageserver/compaction/src/compact_tiered.rs @@ -63,7 +63,7 @@ pub async fn compact_tiered( ); // Identify the range of LSNs that belong to this level. We assume that - // each file in this level span an LSN range up to 1.75x target file + // each file in this level spans an LSN range up to 1.75x target file // size. That should give us enough slop that if we created a slightly // oversized L0 layer, e.g. because flushing the in-memory layer was // delayed for some reason, we don't consider the oversized layer to @@ -248,7 +248,6 @@ enum CompactionStrategy { CreateImage, } -#[allow(dead_code)] // Todo struct CompactionJob { key_range: Range, lsn_range: Range, @@ -345,7 +344,7 @@ where /// /// TODO: Currently, this is called exactly once for the level, and we /// decide whether to create new image layers to cover the whole level, or - /// write a new set of delta. In the future, this should try to partition + /// write a new set of deltas. In the future, this should try to partition /// the key space, and make the decision separately for each partition. async fn divide_job(&mut self, job_id: JobId, ctx: &E::RequestContext) -> anyhow::Result<()> { let job = &self.jobs[job_id.0]; @@ -709,18 +708,6 @@ where } } -// Sliding window through keyspace and values -// -// This is used to decide what layer to write next, from the beginning of the window. -// -// Candidates: -// -// 1. Create an image layer, snapping to previous images -// 2. Create a delta layer, snapping to previous images -// 3. Create an image layer, snapping to -// -// - // Take previous partitioning, based on the image layers below. // // Candidate is at the front: @@ -739,6 +726,10 @@ struct WindowElement { last_key: K, // inclusive accum_size: u64, } + +// Sliding window through keyspace and values +// +// This is used to decide what layer to write next, from the beginning of the window. struct Window { elems: VecDeque>, diff --git a/pageserver/compaction/src/identify_levels.rs b/pageserver/compaction/src/identify_levels.rs index ef388fd92b..98dd46925c 100644 --- a/pageserver/compaction/src/identify_levels.rs +++ b/pageserver/compaction/src/identify_levels.rs @@ -1,5 +1,5 @@ -//! An LSM tree consists of multiple levels, each exponential larger than the -//! previous level. And each level consists of be multiple "tiers". With tiered +//! An LSM tree consists of multiple levels, each exponentially larger than the +//! previous level. And each level consists of multiple "tiers". With tiered //! compaction, a level is compacted when it has accumulated more than N tiers, //! forming one tier on the next level. //! @@ -170,13 +170,6 @@ where }) } -// helper struct used in depth() -struct Event { - key: K, - layer_idx: usize, - start: bool, -} - impl Level { /// Count the number of deltas stacked on each other. 
pub fn depth(&self) -> u64 @@ -184,6 +177,11 @@ impl Level { K: CompactionKey, L: CompactionLayer, { + struct Event { + key: K, + layer_idx: usize, + start: bool, + } let mut events: Vec> = Vec::new(); for (idx, l) in self.layers.iter().enumerate() { events.push(Event { @@ -202,7 +200,7 @@ impl Level { // Sweep the key space left to right. Stop at each distinct key, and // count the number of deltas on top of the highest image at that key. // - // This is a little enefficient, as we walk through the active_set on + // This is a little inefficient, as we walk through the active_set on // every key. We could increment/decrement a counter on each step // instead, but that'd require a bit more complex bookkeeping. let mut active_set: BTreeSet<(Lsn, bool, usize)> = BTreeSet::new(); @@ -236,6 +234,7 @@ impl Level { } } } + debug_assert_eq!(active_set, BTreeSet::new()); max_depth } } diff --git a/pageserver/compaction/src/interface.rs b/pageserver/compaction/src/interface.rs index 979ceebf0e..2bb2e749c0 100644 --- a/pageserver/compaction/src/interface.rs +++ b/pageserver/compaction/src/interface.rs @@ -4,12 +4,12 @@ //! All the heavy lifting is done by the create_image and create_delta //! functions that the implementor provides. use async_trait::async_trait; +use futures::Future; use pageserver_api::{key::Key, keyspace::key_range_size}; use std::ops::Range; use utils::lsn::Lsn; /// Public interface. This is the main thing that the implementor needs to provide -#[async_trait] pub trait CompactionJobExecutor { // Type system. // @@ -17,8 +17,7 @@ pub trait CompactionJobExecutor { // compaction doesn't distinguish whether they are stored locally or // remotely. // - // The keyspace is defined by CompactionKey trait. - // + // The keyspace is defined by the CompactionKey trait. type Key: CompactionKey; type Layer: CompactionLayer + Clone; @@ -35,27 +34,27 @@ pub trait CompactionJobExecutor { // ---- /// Return all layers that overlap the given bounding box. - async fn get_layers( + fn get_layers( &mut self, key_range: &Range, lsn_range: &Range, ctx: &Self::RequestContext, - ) -> anyhow::Result>; + ) -> impl Future>> + Send; - async fn get_keyspace( + fn get_keyspace( &mut self, key_range: &Range, lsn: Lsn, ctx: &Self::RequestContext, - ) -> anyhow::Result>; + ) -> impl Future>> + Send; /// NB: This is a pretty expensive operation. In the real pageserver /// implementation, it downloads the layer, and keeps it resident /// until the DeltaLayer is dropped. - async fn downcast_delta_layer( + fn downcast_delta_layer( &self, layer: &Self::Layer, - ) -> anyhow::Result>; + ) -> impl Future>> + Send; // ---- // Functions to execute the plan @@ -63,33 +62,33 @@ pub trait CompactionJobExecutor { /// Create a new image layer, materializing all the values in the key range, /// at given 'lsn'. - async fn create_image( + fn create_image( &mut self, lsn: Lsn, key_range: &Range, ctx: &Self::RequestContext, - ) -> anyhow::Result<()>; + ) -> impl Future> + Send; /// Create a new delta layer, containing all the values from 'input_layers' /// in the given key and LSN range. - async fn create_delta( + fn create_delta( &mut self, lsn_range: &Range, key_range: &Range, input_layers: &[Self::DeltaLayer], ctx: &Self::RequestContext, - ) -> anyhow::Result<()>; + ) -> impl Future> + Send; /// Delete a layer. The compaction implementation will call this only after /// all the create_image() or create_delta() calls that deletion of this /// layer depends on have finished. 
But if the implementor has extra lazy - /// background tasks, like uploading the index json file to remote storage, + /// background tasks, like uploading the index json file to remote storage. /// it is the implementation's responsibility to track those. - async fn delete_layer( + fn delete_layer( &mut self, layer: &Self::Layer, ctx: &Self::RequestContext, - ) -> anyhow::Result<()>; + ) -> impl Future> + Send; } pub trait CompactionKey: std::cmp::Ord + Clone + Copy + std::fmt::Display { diff --git a/pageserver/compaction/src/simulator.rs b/pageserver/compaction/src/simulator.rs index 6d07038dcd..def7983e75 100644 --- a/pageserver/compaction/src/simulator.rs +++ b/pageserver/compaction/src/simulator.rs @@ -429,7 +429,6 @@ impl From<&Arc> for MockLayer { } } -#[async_trait] impl interface::CompactionJobExecutor for MockTimeline { type Key = Key; type Layer = MockLayer; diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 914e3948ef..8b544b1c3a 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -134,7 +134,6 @@ struct ResidentDeltaLayer(ResidentLayer); #[derive(Clone)] struct ResidentImageLayer(ResidentLayer); -#[async_trait] impl CompactionJobExecutor for TimelineAdaptor { type Key = crate::repository::Key; From 15b3665dc4810c4539dc3c40e94520506a56154d Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Tue, 5 Mar 2024 19:32:58 +0400 Subject: [PATCH 0344/1571] proxy: fix bug with populating the data (#7023) ## Problem Branch/project and coldStart were not populated to data events. ## Summary of changes Populate it. Also added logging for the coldstart info. --- proxy/src/auth/backend/link.rs | 2 ++ proxy/src/console/messages.rs | 3 ++- proxy/src/console/provider/neon.rs | 3 +++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index ec7d891247..7db76f3d9e 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -102,6 +102,8 @@ pub(super) async fn authenticate( ctx.set_user(db_info.user.into()); ctx.set_project(db_info.aux.clone()); + let cold_start_info = db_info.aux.cold_start_info.clone().unwrap_or_default(); + info!(?cold_start_info, "woken up a compute node"); // Backwards compatibility. pg_sni_proxy uses "--" in domain names // while direct connections do not. 
Once we migrate to pg_sni_proxy diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs index 85adb31654..102076f2c6 100644 --- a/proxy/src/console/messages.rs +++ b/proxy/src/console/messages.rs @@ -101,9 +101,10 @@ pub struct MetricsAuxInfo { pub cold_start_info: Option, } -#[derive(Debug, Serialize, Deserialize, Clone)] +#[derive(Debug, Default, Serialize, Deserialize, Clone)] #[serde(rename_all = "snake_case")] pub enum ColdStartInfo { + #[default] Unknown = 0, Warm = 1, PoolHit = 2, diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 71b34cb676..f3befa33e0 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -259,6 +259,9 @@ impl super::Api for Api { } let node = self.do_wake_compute(ctx, user_info).await?; + ctx.set_project(node.aux.clone()); + let cold_start_info = node.aux.cold_start_info.clone().unwrap_or_default(); + info!(?cold_start_info, "woken up a compute node"); let (_, cached) = self.caches.node_info.insert(key.clone(), node); info!(key = &*key, "created a cache entry for compute node info"); From 2daa2f1d1059c033ac25718c6e67d7b3953c20a6 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 5 Mar 2024 15:41:05 +0000 Subject: [PATCH 0345/1571] test: disable large slru basebackup bench in ci (#7025) The test is flaky due to https://github.com/neondatabase/neon/issues/7006. --- .../pageserver/pagebench/test_large_slru_basebackup.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py index e2e7fffdbe..921b7c5b76 100644 --- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py +++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py @@ -1,5 +1,6 @@ import asyncio import json +import os from pathlib import Path from typing import Any, Dict, Tuple @@ -19,6 +20,10 @@ from performance.pageserver.util import ( @pytest.mark.parametrize("n_tenants", [10]) @pytest.mark.parametrize("get_vectored_impl", ["sequential", "vectored"]) @pytest.mark.timeout(1000) +@pytest.mark.skipif( + os.getenv("CI", "false") == "true", + reason="The test if flaky on CI: https://github.com/neondatabase/neon/issues/7006", +) def test_basebackup_with_high_slru_count( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, From eacdc179dc0e396ef12a098478cb807be4f847cf Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 5 Mar 2024 18:03:51 +0100 Subject: [PATCH 0346/1571] fixup(#6991): it broke the macOS build (#7024) --- pageserver/src/virtual_file/io_engine.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index 5fef826477..e369d28711 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -248,6 +248,7 @@ impl IoEngine { .await .expect("failed to join blocking code most likely it panicked, panicking as well") } + #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => work.await, } } From 2f88e7a921b4b37f3aa992bc1b419d24b24b965b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 6 Mar 2024 02:40:23 +0100 Subject: [PATCH 0347/1571] Move compaction code to compaction.rs (#7026) Moves some of the (legacy) compaction code to compaction.rs. No functional changes, just moves of code. 
Before, compaction.rs was only for the new tiered compaction mechanism, now it's for both the old and new mechanisms. Part of #6768 --- pageserver/src/tenant/timeline.rs | 693 +----------------- pageserver/src/tenant/timeline/compaction.rs | 706 ++++++++++++++++++- 2 files changed, 703 insertions(+), 696 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 309ec2e829..37acebb10a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -14,7 +14,6 @@ use camino::Utf8Path; use enumset::EnumSet; use fail::fail_point; use futures::stream::StreamExt; -use itertools::Itertools; use once_cell::sync::Lazy; use pageserver_api::{ key::AUX_FILES_KEY, @@ -35,7 +34,7 @@ use std::sync::{Arc, Mutex, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; use std::{ array, - collections::{BTreeMap, BinaryHeap, HashMap, HashSet}, + collections::{BTreeMap, HashMap, HashSet}, sync::atomic::AtomicU64, }; use std::{ @@ -57,7 +56,7 @@ use crate::tenant::{ metadata::TimelineMetadata, }; use crate::{ - context::{AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder}, + context::{DownloadBehavior, RequestContext}, disk_usage_eviction_task::DiskUsageEvictionInfo, pgdatadir_mapping::CollectKeySpaceError, }; @@ -1146,118 +1145,6 @@ impl Timeline { } } - /// TODO: cancellation - async fn compact_legacy( - self: &Arc, - _cancel: &CancellationToken, - flags: EnumSet, - ctx: &RequestContext, - ) -> Result<(), CompactionError> { - // High level strategy for compaction / image creation: - // - // 1. First, calculate the desired "partitioning" of the - // currently in-use key space. The goal is to partition the - // key space into roughly fixed-size chunks, but also take into - // account any existing image layers, and try to align the - // chunk boundaries with the existing image layers to avoid - // too much churn. Also try to align chunk boundaries with - // relation boundaries. In principle, we don't know about - // relation boundaries here, we just deal with key-value - // pairs, and the code in pgdatadir_mapping.rs knows how to - // map relations into key-value pairs. But in practice we know - // that 'field6' is the block number, and the fields 1-5 - // identify a relation. This is just an optimization, - // though. - // - // 2. Once we know the partitioning, for each partition, - // decide if it's time to create a new image layer. The - // criteria is: there has been too much "churn" since the last - // image layer? The "churn" is fuzzy concept, it's a - // combination of too many delta files, or too much WAL in - // total in the delta file. Or perhaps: if creating an image - // file would allow to delete some older files. - // - // 3. After that, we compact all level0 delta files if there - // are too many of them. While compacting, we also garbage - // collect any page versions that are no longer needed because - // of the new image layers we created in step 2. - // - // TODO: This high level strategy hasn't been implemented yet. - // Below are functions compact_level0() and create_image_layers() - // but they are a bit ad hoc and don't quite work like it's explained - // above. Rewrite it. - - // Is the timeline being deleted? 
- if self.is_stopping() { - trace!("Dropping out of compaction on timeline shutdown"); - return Err(CompactionError::ShuttingDown); - } - - let target_file_size = self.get_checkpoint_distance(); - - // Define partitioning schema if needed - - // FIXME: the match should only cover repartitioning, not the next steps - match self - .repartition( - self.get_last_record_lsn(), - self.get_compaction_target_size(), - flags, - ctx, - ) - .await - { - Ok((partitioning, lsn)) => { - // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them - let image_ctx = RequestContextBuilder::extend(ctx) - .access_stats_behavior(AccessStatsBehavior::Skip) - .build(); - - // 2. Compact - let timer = self.metrics.compact_time_histo.start_timer(); - self.compact_level0(target_file_size, ctx).await?; - timer.stop_and_record(); - - // 3. Create new image layers for partitions that have been modified - // "enough". - let layers = self - .create_image_layers( - &partitioning, - lsn, - flags.contains(CompactFlags::ForceImageLayerCreation), - &image_ctx, - ) - .await - .map_err(anyhow::Error::from)?; - if let Some(remote_client) = &self.remote_client { - for layer in layers { - remote_client.schedule_layer_file_upload(layer)?; - } - } - - if let Some(remote_client) = &self.remote_client { - // should any new image layer been created, not uploading index_part will - // result in a mismatch between remote_physical_size and layermap calculated - // size, which will fail some tests, but should not be an issue otherwise. - remote_client.schedule_index_upload_for_file_changes()?; - } - } - Err(err) => { - // no partitioning? This is normal, if the timeline was just created - // as an empty timeline. Also in unit tests, when we use the timeline - // as a simple key-value store, ignoring the datadir layout. Log the - // error but continue. - // - // Suppress error when it's due to cancellation - if !self.cancel.is_cancelled() { - error!("could not compact, repartitioning keyspace failed: {err:?}"); - } - } - }; - - Ok(()) - } - /// Mutate the timeline with a [`TimelineWriter`]. pub(crate) async fn writer(&self) -> TimelineWriter<'_> { TimelineWriter { @@ -3766,12 +3653,6 @@ impl Timeline { } } -#[derive(Default)] -struct CompactLevel0Phase1Result { - new_layers: Vec, - deltas_to_compact: Vec, -} - /// Top-level failure to compact. 
#[derive(Debug, thiserror::Error)] pub(crate) enum CompactionError { @@ -3825,577 +3706,7 @@ impl DurationRecorder { } } -#[derive(Default)] -struct CompactLevel0Phase1StatsBuilder { - version: Option, - tenant_id: Option, - timeline_id: Option, - read_lock_acquisition_micros: DurationRecorder, - read_lock_held_spawn_blocking_startup_micros: DurationRecorder, - read_lock_held_key_sort_micros: DurationRecorder, - read_lock_held_prerequisites_micros: DurationRecorder, - read_lock_held_compute_holes_micros: DurationRecorder, - read_lock_drop_micros: DurationRecorder, - write_layer_files_micros: DurationRecorder, - level0_deltas_count: Option, - new_deltas_count: Option, - new_deltas_size: Option, -} - -#[derive(serde::Serialize)] -struct CompactLevel0Phase1Stats { - version: u64, - tenant_id: TenantShardId, - timeline_id: TimelineId, - read_lock_acquisition_micros: RecordedDuration, - read_lock_held_spawn_blocking_startup_micros: RecordedDuration, - read_lock_held_key_sort_micros: RecordedDuration, - read_lock_held_prerequisites_micros: RecordedDuration, - read_lock_held_compute_holes_micros: RecordedDuration, - read_lock_drop_micros: RecordedDuration, - write_layer_files_micros: RecordedDuration, - level0_deltas_count: usize, - new_deltas_count: usize, - new_deltas_size: u64, -} - -impl TryFrom for CompactLevel0Phase1Stats { - type Error = anyhow::Error; - - fn try_from(value: CompactLevel0Phase1StatsBuilder) -> Result { - Ok(Self { - version: value.version.ok_or_else(|| anyhow!("version not set"))?, - tenant_id: value - .tenant_id - .ok_or_else(|| anyhow!("tenant_id not set"))?, - timeline_id: value - .timeline_id - .ok_or_else(|| anyhow!("timeline_id not set"))?, - read_lock_acquisition_micros: value - .read_lock_acquisition_micros - .into_recorded() - .ok_or_else(|| anyhow!("read_lock_acquisition_micros not set"))?, - read_lock_held_spawn_blocking_startup_micros: value - .read_lock_held_spawn_blocking_startup_micros - .into_recorded() - .ok_or_else(|| anyhow!("read_lock_held_spawn_blocking_startup_micros not set"))?, - read_lock_held_key_sort_micros: value - .read_lock_held_key_sort_micros - .into_recorded() - .ok_or_else(|| anyhow!("read_lock_held_key_sort_micros not set"))?, - read_lock_held_prerequisites_micros: value - .read_lock_held_prerequisites_micros - .into_recorded() - .ok_or_else(|| anyhow!("read_lock_held_prerequisites_micros not set"))?, - read_lock_held_compute_holes_micros: value - .read_lock_held_compute_holes_micros - .into_recorded() - .ok_or_else(|| anyhow!("read_lock_held_compute_holes_micros not set"))?, - read_lock_drop_micros: value - .read_lock_drop_micros - .into_recorded() - .ok_or_else(|| anyhow!("read_lock_drop_micros not set"))?, - write_layer_files_micros: value - .write_layer_files_micros - .into_recorded() - .ok_or_else(|| anyhow!("write_layer_files_micros not set"))?, - level0_deltas_count: value - .level0_deltas_count - .ok_or_else(|| anyhow!("level0_deltas_count not set"))?, - new_deltas_count: value - .new_deltas_count - .ok_or_else(|| anyhow!("new_deltas_count not set"))?, - new_deltas_size: value - .new_deltas_size - .ok_or_else(|| anyhow!("new_deltas_size not set"))?, - }) - } -} - impl Timeline { - /// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment. 
- async fn compact_level0_phase1( - self: &Arc, - guard: tokio::sync::OwnedRwLockReadGuard, - mut stats: CompactLevel0Phase1StatsBuilder, - target_file_size: u64, - ctx: &RequestContext, - ) -> Result { - stats.read_lock_held_spawn_blocking_startup_micros = - stats.read_lock_acquisition_micros.till_now(); // set by caller - let layers = guard.layer_map(); - let level0_deltas = layers.get_level0_deltas()?; - let mut level0_deltas = level0_deltas - .into_iter() - .map(|x| guard.get_from_desc(&x)) - .collect_vec(); - stats.level0_deltas_count = Some(level0_deltas.len()); - // Only compact if enough layers have accumulated. - let threshold = self.get_compaction_threshold(); - if level0_deltas.is_empty() || level0_deltas.len() < threshold { - debug!( - level0_deltas = level0_deltas.len(), - threshold, "too few deltas to compact" - ); - return Ok(CompactLevel0Phase1Result::default()); - } - - // This failpoint is used together with `test_duplicate_layers` integration test. - // It returns the compaction result exactly the same layers as input to compaction. - // We want to ensure that this will not cause any problem when updating the layer map - // after the compaction is finished. - // - // Currently, there are two rare edge cases that will cause duplicated layers being - // inserted. - // 1. The compaction job is inturrupted / did not finish successfully. Assume we have file 1, 2, 3, 4, which - // is compacted to 5, but the page server is shut down, next time we start page server we will get a layer - // map containing 1, 2, 3, 4, and 5, whereas 5 has the same content as 4. If we trigger L0 compation at this - // point again, it is likely that we will get a file 6 which has the same content and the key range as 5, - // and this causes an overwrite. This is acceptable because the content is the same, and we should do a - // layer replace instead of the normal remove / upload process. - // 2. The input workload pattern creates exactly n files that are sorted, non-overlapping and is of target file - // size length. Compaction will likely create the same set of n files afterwards. - // - // This failpoint is a superset of both of the cases. - if cfg!(feature = "testing") { - let active = (|| { - ::fail::fail_point!("compact-level0-phase1-return-same", |_| true); - false - })(); - - if active { - let mut new_layers = Vec::with_capacity(level0_deltas.len()); - for delta in &level0_deltas { - // we are just faking these layers as being produced again for this failpoint - new_layers.push( - delta - .download_and_keep_resident() - .await - .context("download layer for failpoint")?, - ); - } - tracing::info!("compact-level0-phase1-return-same"); // so that we can check if we hit the failpoint - return Ok(CompactLevel0Phase1Result { - new_layers, - deltas_to_compact: level0_deltas, - }); - } - } - - // Gather the files to compact in this iteration. - // - // Start with the oldest Level 0 delta file, and collect any other - // level 0 files that form a contiguous sequence, such that the end - // LSN of previous file matches the start LSN of the next file. - // - // Note that if the files don't form such a sequence, we might - // "compact" just a single file. That's a bit pointless, but it allows - // us to get rid of the level 0 file, and compact the other files on - // the next iteration. 
This could probably made smarter, but such - // "gaps" in the sequence of level 0 files should only happen in case - // of a crash, partial download from cloud storage, or something like - // that, so it's not a big deal in practice. - level0_deltas.sort_by_key(|l| l.layer_desc().lsn_range.start); - let mut level0_deltas_iter = level0_deltas.iter(); - - let first_level0_delta = level0_deltas_iter.next().unwrap(); - let mut prev_lsn_end = first_level0_delta.layer_desc().lsn_range.end; - let mut deltas_to_compact = Vec::with_capacity(level0_deltas.len()); - - deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?); - for l in level0_deltas_iter { - let lsn_range = &l.layer_desc().lsn_range; - - if lsn_range.start != prev_lsn_end { - break; - } - deltas_to_compact.push(l.download_and_keep_resident().await?); - prev_lsn_end = lsn_range.end; - } - let lsn_range = Range { - start: deltas_to_compact - .first() - .unwrap() - .layer_desc() - .lsn_range - .start, - end: deltas_to_compact.last().unwrap().layer_desc().lsn_range.end, - }; - - info!( - "Starting Level0 compaction in LSN range {}-{} for {} layers ({} deltas in total)", - lsn_range.start, - lsn_range.end, - deltas_to_compact.len(), - level0_deltas.len() - ); - - for l in deltas_to_compact.iter() { - info!("compact includes {l}"); - } - - // We don't need the original list of layers anymore. Drop it so that - // we don't accidentally use it later in the function. - drop(level0_deltas); - - stats.read_lock_held_prerequisites_micros = stats - .read_lock_held_spawn_blocking_startup_micros - .till_now(); - - // Determine N largest holes where N is number of compacted layers. - let max_holes = deltas_to_compact.len(); - let last_record_lsn = self.get_last_record_lsn(); - let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128; - let min_hole_coverage_size = 3; // TODO: something more flexible? - - // min-heap (reserve space for one more element added before eviction) - let mut heap: BinaryHeap = BinaryHeap::with_capacity(max_holes + 1); - let mut prev: Option = None; - - let mut all_keys = Vec::new(); - - for l in deltas_to_compact.iter() { - all_keys.extend(l.load_keys(ctx).await?); - } - - // FIXME: should spawn_blocking the rest of this function - - // The current stdlib sorting implementation is designed in a way where it is - // particularly fast where the slice is made up of sorted sub-ranges. - all_keys.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn)); - - stats.read_lock_held_key_sort_micros = stats.read_lock_held_prerequisites_micros.till_now(); - - for &DeltaEntry { key: next_key, .. } in all_keys.iter() { - if let Some(prev_key) = prev { - // just first fast filter - if next_key.to_i128() - prev_key.to_i128() >= min_hole_range { - let key_range = prev_key..next_key; - // Measuring hole by just subtraction of i128 representation of key range boundaries - // has not so much sense, because largest holes will corresponds field1/field2 changes. - // But we are mostly interested to eliminate holes which cause generation of excessive image layers. - // That is why it is better to measure size of hole as number of covering image layers. 
- let coverage_size = layers.image_coverage(&key_range, last_record_lsn).len(); - if coverage_size >= min_hole_coverage_size { - heap.push(Hole { - key_range, - coverage_size, - }); - if heap.len() > max_holes { - heap.pop(); // remove smallest hole - } - } - } - } - prev = Some(next_key.next()); - } - stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now(); - drop_rlock(guard); - stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now(); - let mut holes = heap.into_vec(); - holes.sort_unstable_by_key(|hole| hole.key_range.start); - let mut next_hole = 0; // index of next hole in holes vector - - // This iterator walks through all key-value pairs from all the layers - // we're compacting, in key, LSN order. - let all_values_iter = all_keys.iter(); - - // This iterator walks through all keys and is needed to calculate size used by each key - let mut all_keys_iter = all_keys - .iter() - .map(|DeltaEntry { key, lsn, size, .. }| (*key, *lsn, *size)) - .coalesce(|mut prev, cur| { - // Coalesce keys that belong to the same key pair. - // This ensures that compaction doesn't put them - // into different layer files. - // Still limit this by the target file size, - // so that we keep the size of the files in - // check. - if prev.0 == cur.0 && prev.2 < target_file_size { - prev.2 += cur.2; - Ok(prev) - } else { - Err((prev, cur)) - } - }); - - // Merge the contents of all the input delta layers into a new set - // of delta layers, based on the current partitioning. - // - // We split the new delta layers on the key dimension. We iterate through the key space, and for each key, check if including the next key to the current output layer we're building would cause the layer to become too large. If so, dump the current output layer and start new one. - // It's possible that there is a single key with so many page versions that storing all of them in a single layer file - // would be too large. In that case, we also split on the LSN dimension. - // - // LSN - // ^ - // | - // | +-----------+ +--+--+--+--+ - // | | | | | | | | - // | +-----------+ | | | | | - // | | | | | | | | - // | +-----------+ ==> | | | | | - // | | | | | | | | - // | +-----------+ | | | | | - // | | | | | | | | - // | +-----------+ +--+--+--+--+ - // | - // +--------------> key - // - // - // If one key (X) has a lot of page versions: - // - // LSN - // ^ - // | (X) - // | +-----------+ +--+--+--+--+ - // | | | | | | | | - // | +-----------+ | | +--+ | - // | | | | | | | | - // | +-----------+ ==> | | | | | - // | | | | | +--+ | - // | +-----------+ | | | | | - // | | | | | | | | - // | +-----------+ +--+--+--+--+ - // | - // +--------------> key - // TODO: this actually divides the layers into fixed-size chunks, not - // based on the partitioning. - // - // TODO: we should also opportunistically materialize and - // garbage collect what we can. - let mut new_layers = Vec::new(); - let mut prev_key: Option = None; - let mut writer: Option = None; - let mut key_values_total_size = 0u64; - let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key - let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key - - for &DeltaEntry { - key, lsn, ref val, .. 
- } in all_values_iter - { - let value = val.load(ctx).await?; - let same_key = prev_key.map_or(false, |prev_key| prev_key == key); - // We need to check key boundaries once we reach next key or end of layer with the same key - if !same_key || lsn == dup_end_lsn { - let mut next_key_size = 0u64; - let is_dup_layer = dup_end_lsn.is_valid(); - dup_start_lsn = Lsn::INVALID; - if !same_key { - dup_end_lsn = Lsn::INVALID; - } - // Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size - for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() { - next_key_size = next_size; - if key != next_key { - if dup_end_lsn.is_valid() { - // We are writting segment with duplicates: - // place all remaining values of this key in separate segment - dup_start_lsn = dup_end_lsn; // new segments starts where old stops - dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range - } - break; - } - key_values_total_size += next_size; - // Check if it is time to split segment: if total keys size is larger than target file size. - // We need to avoid generation of empty segments if next_size > target_file_size. - if key_values_total_size > target_file_size && lsn != next_lsn { - // Split key between multiple layers: such layer can contain only single key - dup_start_lsn = if dup_end_lsn.is_valid() { - dup_end_lsn // new segment with duplicates starts where old one stops - } else { - lsn // start with the first LSN for this key - }; - dup_end_lsn = next_lsn; // upper LSN boundary is exclusive - break; - } - } - // handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set. - if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() { - dup_start_lsn = dup_end_lsn; - dup_end_lsn = lsn_range.end; - } - if writer.is_some() { - let written_size = writer.as_mut().unwrap().size(); - let contains_hole = - next_hole < holes.len() && key >= holes[next_hole].key_range.end; - // check if key cause layer overflow or contains hole... - if is_dup_layer - || dup_end_lsn.is_valid() - || written_size + key_values_total_size > target_file_size - || contains_hole - { - // ... 
if so, flush previous layer and prepare to write new one - new_layers.push( - writer - .take() - .unwrap() - .finish(prev_key.unwrap().next(), self) - .await?, - ); - writer = None; - - if contains_hole { - // skip hole - next_hole += 1; - } - } - } - // Remember size of key value because at next iteration we will access next item - key_values_total_size = next_key_size; - } - fail_point!("delta-layer-writer-fail-before-finish", |_| { - Err(CompactionError::Other(anyhow::anyhow!( - "failpoint delta-layer-writer-fail-before-finish" - ))) - }); - - if !self.shard_identity.is_key_disposable(&key) { - if writer.is_none() { - // Create writer if not initiaized yet - writer = Some( - DeltaLayerWriter::new( - self.conf, - self.timeline_id, - self.tenant_shard_id, - key, - if dup_end_lsn.is_valid() { - // this is a layer containing slice of values of the same key - debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn); - dup_start_lsn..dup_end_lsn - } else { - debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end); - lsn_range.clone() - }, - ) - .await?, - ); - } - - writer.as_mut().unwrap().put_value(key, lsn, value).await?; - } else { - debug!( - "Dropping key {} during compaction (it belongs on shard {:?})", - key, - self.shard_identity.get_shard_number(&key) - ); - } - - if !new_layers.is_empty() { - fail_point!("after-timeline-compacted-first-L1"); - } - - prev_key = Some(key); - } - if let Some(writer) = writer { - new_layers.push(writer.finish(prev_key.unwrap().next(), self).await?); - } - - // Sync layers - if !new_layers.is_empty() { - // Print a warning if the created layer is larger than double the target size - // Add two pages for potential overhead. This should in theory be already - // accounted for in the target calculation, but for very small targets, - // we still might easily hit the limit otherwise. - let warn_limit = target_file_size * 2 + page_cache::PAGE_SZ as u64 * 2; - for layer in new_layers.iter() { - if layer.layer_desc().file_size > warn_limit { - warn!( - %layer, - "created delta file of size {} larger than double of target of {target_file_size}", layer.layer_desc().file_size - ); - } - } - - // The writer.finish() above already did the fsync of the inodes. - // We just need to fsync the directory in which these inodes are linked, - // which we know to be the timeline directory. - // - // We use fatal_err() below because the after writer.finish() returns with success, - // the in-memory state of the filesystem already has the layer file in its final place, - // and subsequent pageserver code could think it's durable while it really isn't. 
- let timeline_dir = VirtualFile::open( - &self - .conf - .timeline_path(&self.tenant_shard_id, &self.timeline_id), - ) - .await - .fatal_err("VirtualFile::open for timeline dir fsync"); - timeline_dir - .sync_all() - .await - .fatal_err("VirtualFile::sync_all timeline dir"); - } - - stats.write_layer_files_micros = stats.read_lock_drop_micros.till_now(); - stats.new_deltas_count = Some(new_layers.len()); - stats.new_deltas_size = Some(new_layers.iter().map(|l| l.layer_desc().file_size).sum()); - - match TryInto::::try_into(stats) - .and_then(|stats| serde_json::to_string(&stats).context("serde_json::to_string")) - { - Ok(stats_json) => { - info!( - stats_json = stats_json.as_str(), - "compact_level0_phase1 stats available" - ) - } - Err(e) => { - warn!("compact_level0_phase1 stats failed to serialize: {:#}", e); - } - } - - Ok(CompactLevel0Phase1Result { - new_layers, - deltas_to_compact: deltas_to_compact - .into_iter() - .map(|x| x.drop_eviction_guard()) - .collect::>(), - }) - } - - /// - /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as - /// as Level 1 files. - /// - async fn compact_level0( - self: &Arc, - target_file_size: u64, - ctx: &RequestContext, - ) -> Result<(), CompactionError> { - let CompactLevel0Phase1Result { - new_layers, - deltas_to_compact, - } = { - let phase1_span = info_span!("compact_level0_phase1"); - let ctx = ctx.attached_child(); - let mut stats = CompactLevel0Phase1StatsBuilder { - version: Some(2), - tenant_id: Some(self.tenant_shard_id), - timeline_id: Some(self.timeline_id), - ..Default::default() - }; - - let begin = tokio::time::Instant::now(); - let phase1_layers_locked = Arc::clone(&self.layers).read_owned().await; - let now = tokio::time::Instant::now(); - stats.read_lock_acquisition_micros = - DurationRecorder::Recorded(RecordedDuration(now - begin), now); - self.compact_level0_phase1(phase1_layers_locked, stats, target_file_size, &ctx) - .instrument(phase1_span) - .await? - }; - - if new_layers.is_empty() && deltas_to_compact.is_empty() { - // nothing to do - return Ok(()); - } - - self.finish_compact_batch(&new_layers, &Vec::new(), &deltas_to_compact) - .await?; - Ok(()) - } - async fn finish_compact_batch( self: &Arc, new_deltas: &[ResidentLayer], diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 8b544b1c3a..74b75dabf0 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -4,24 +4,32 @@ //! //! The old legacy algorithm is implemented directly in `timeline.rs`. 
+use std::collections::BinaryHeap; use std::ops::{Deref, Range}; use std::sync::Arc; -use super::Timeline; +use super::layer_manager::LayerManager; +use super::{CompactFlags, DurationRecorder, RecordedDuration, Timeline}; +use anyhow::{anyhow, Context}; use async_trait::async_trait; +use enumset::EnumSet; use fail::fail_point; +use itertools::Itertools; +use pageserver_api::shard::TenantShardId; use tokio_util::sync::CancellationToken; -use tracing::{debug, trace, warn}; +use tracing::{debug, info, info_span, trace, warn, Instrument}; +use utils::id::TimelineId; -use crate::context::RequestContext; +use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc}; -use crate::tenant::timeline::{is_rel_fsm_block_key, is_rel_vm_block_key}; +use crate::tenant::timeline::{drop_rlock, is_rel_fsm_block_key, is_rel_vm_block_key, Hole}; use crate::tenant::timeline::{DeltaLayerWriter, ImageLayerWriter}; use crate::tenant::timeline::{Layer, ResidentLayer}; use crate::tenant::DeltaLayer; use crate::tenant::PageReconstructError; -use crate::ZERO_PAGE; +use crate::virtual_file::{MaybeFatalIo, VirtualFile}; +use crate::{page_cache, ZERO_PAGE}; use crate::keyspace::KeySpace; use crate::repository::Key; @@ -33,6 +41,694 @@ use pageserver_compaction::interface::*; use super::CompactionError; +impl Timeline { + /// TODO: cancellation + pub(crate) async fn compact_legacy( + self: &Arc, + _cancel: &CancellationToken, + flags: EnumSet, + ctx: &RequestContext, + ) -> Result<(), CompactionError> { + // High level strategy for compaction / image creation: + // + // 1. First, calculate the desired "partitioning" of the + // currently in-use key space. The goal is to partition the + // key space into roughly fixed-size chunks, but also take into + // account any existing image layers, and try to align the + // chunk boundaries with the existing image layers to avoid + // too much churn. Also try to align chunk boundaries with + // relation boundaries. In principle, we don't know about + // relation boundaries here, we just deal with key-value + // pairs, and the code in pgdatadir_mapping.rs knows how to + // map relations into key-value pairs. But in practice we know + // that 'field6' is the block number, and the fields 1-5 + // identify a relation. This is just an optimization, + // though. + // + // 2. Once we know the partitioning, for each partition, + // decide if it's time to create a new image layer. The + // criteria is: there has been too much "churn" since the last + // image layer? The "churn" is fuzzy concept, it's a + // combination of too many delta files, or too much WAL in + // total in the delta file. Or perhaps: if creating an image + // file would allow to delete some older files. + // + // 3. After that, we compact all level0 delta files if there + // are too many of them. While compacting, we also garbage + // collect any page versions that are no longer needed because + // of the new image layers we created in step 2. + // + // TODO: This high level strategy hasn't been implemented yet. + // Below are functions compact_level0() and create_image_layers() + // but they are a bit ad hoc and don't quite work like it's explained + // above. Rewrite it. + + // Is the timeline being deleted? 
+ if self.is_stopping() { + trace!("Dropping out of compaction on timeline shutdown"); + return Err(CompactionError::ShuttingDown); + } + + let target_file_size = self.get_checkpoint_distance(); + + // Define partitioning schema if needed + + // FIXME: the match should only cover repartitioning, not the next steps + match self + .repartition( + self.get_last_record_lsn(), + self.get_compaction_target_size(), + flags, + ctx, + ) + .await + { + Ok((partitioning, lsn)) => { + // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them + let image_ctx = RequestContextBuilder::extend(ctx) + .access_stats_behavior(AccessStatsBehavior::Skip) + .build(); + + // 2. Compact + let timer = self.metrics.compact_time_histo.start_timer(); + self.compact_level0(target_file_size, ctx).await?; + timer.stop_and_record(); + + // 3. Create new image layers for partitions that have been modified + // "enough". + let layers = self + .create_image_layers( + &partitioning, + lsn, + flags.contains(CompactFlags::ForceImageLayerCreation), + &image_ctx, + ) + .await + .map_err(anyhow::Error::from)?; + if let Some(remote_client) = &self.remote_client { + for layer in layers { + remote_client.schedule_layer_file_upload(layer)?; + } + } + + if let Some(remote_client) = &self.remote_client { + // should any new image layer been created, not uploading index_part will + // result in a mismatch between remote_physical_size and layermap calculated + // size, which will fail some tests, but should not be an issue otherwise. + remote_client.schedule_index_upload_for_file_changes()?; + } + } + Err(err) => { + // no partitioning? This is normal, if the timeline was just created + // as an empty timeline. Also in unit tests, when we use the timeline + // as a simple key-value store, ignoring the datadir layout. Log the + // error but continue. + // + // Suppress error when it's due to cancellation + if !self.cancel.is_cancelled() { + tracing::error!("could not compact, repartitioning keyspace failed: {err:?}"); + } + } + }; + + Ok(()) + } + + /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as + /// as Level 1 files. + async fn compact_level0( + self: &Arc, + target_file_size: u64, + ctx: &RequestContext, + ) -> Result<(), CompactionError> { + let CompactLevel0Phase1Result { + new_layers, + deltas_to_compact, + } = { + let phase1_span = info_span!("compact_level0_phase1"); + let ctx = ctx.attached_child(); + let mut stats = CompactLevel0Phase1StatsBuilder { + version: Some(2), + tenant_id: Some(self.tenant_shard_id), + timeline_id: Some(self.timeline_id), + ..Default::default() + }; + + let begin = tokio::time::Instant::now(); + let phase1_layers_locked = Arc::clone(&self.layers).read_owned().await; + let now = tokio::time::Instant::now(); + stats.read_lock_acquisition_micros = + DurationRecorder::Recorded(RecordedDuration(now - begin), now); + self.compact_level0_phase1(phase1_layers_locked, stats, target_file_size, &ctx) + .instrument(phase1_span) + .await? + }; + + if new_layers.is_empty() && deltas_to_compact.is_empty() { + // nothing to do + return Ok(()); + } + + self.finish_compact_batch(&new_layers, &Vec::new(), &deltas_to_compact) + .await?; + Ok(()) + } + + /// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment. 
+ async fn compact_level0_phase1( + self: &Arc, + guard: tokio::sync::OwnedRwLockReadGuard, + mut stats: CompactLevel0Phase1StatsBuilder, + target_file_size: u64, + ctx: &RequestContext, + ) -> Result { + stats.read_lock_held_spawn_blocking_startup_micros = + stats.read_lock_acquisition_micros.till_now(); // set by caller + let layers = guard.layer_map(); + let level0_deltas = layers.get_level0_deltas()?; + let mut level0_deltas = level0_deltas + .into_iter() + .map(|x| guard.get_from_desc(&x)) + .collect_vec(); + stats.level0_deltas_count = Some(level0_deltas.len()); + // Only compact if enough layers have accumulated. + let threshold = self.get_compaction_threshold(); + if level0_deltas.is_empty() || level0_deltas.len() < threshold { + debug!( + level0_deltas = level0_deltas.len(), + threshold, "too few deltas to compact" + ); + return Ok(CompactLevel0Phase1Result::default()); + } + + // This failpoint is used together with `test_duplicate_layers` integration test. + // It returns the compaction result exactly the same layers as input to compaction. + // We want to ensure that this will not cause any problem when updating the layer map + // after the compaction is finished. + // + // Currently, there are two rare edge cases that will cause duplicated layers being + // inserted. + // 1. The compaction job is inturrupted / did not finish successfully. Assume we have file 1, 2, 3, 4, which + // is compacted to 5, but the page server is shut down, next time we start page server we will get a layer + // map containing 1, 2, 3, 4, and 5, whereas 5 has the same content as 4. If we trigger L0 compation at this + // point again, it is likely that we will get a file 6 which has the same content and the key range as 5, + // and this causes an overwrite. This is acceptable because the content is the same, and we should do a + // layer replace instead of the normal remove / upload process. + // 2. The input workload pattern creates exactly n files that are sorted, non-overlapping and is of target file + // size length. Compaction will likely create the same set of n files afterwards. + // + // This failpoint is a superset of both of the cases. + if cfg!(feature = "testing") { + let active = (|| { + ::fail::fail_point!("compact-level0-phase1-return-same", |_| true); + false + })(); + + if active { + let mut new_layers = Vec::with_capacity(level0_deltas.len()); + for delta in &level0_deltas { + // we are just faking these layers as being produced again for this failpoint + new_layers.push( + delta + .download_and_keep_resident() + .await + .context("download layer for failpoint")?, + ); + } + tracing::info!("compact-level0-phase1-return-same"); // so that we can check if we hit the failpoint + return Ok(CompactLevel0Phase1Result { + new_layers, + deltas_to_compact: level0_deltas, + }); + } + } + + // Gather the files to compact in this iteration. + // + // Start with the oldest Level 0 delta file, and collect any other + // level 0 files that form a contiguous sequence, such that the end + // LSN of previous file matches the start LSN of the next file. + // + // Note that if the files don't form such a sequence, we might + // "compact" just a single file. That's a bit pointless, but it allows + // us to get rid of the level 0 file, and compact the other files on + // the next iteration. 
This could probably made smarter, but such + // "gaps" in the sequence of level 0 files should only happen in case + // of a crash, partial download from cloud storage, or something like + // that, so it's not a big deal in practice. + level0_deltas.sort_by_key(|l| l.layer_desc().lsn_range.start); + let mut level0_deltas_iter = level0_deltas.iter(); + + let first_level0_delta = level0_deltas_iter.next().unwrap(); + let mut prev_lsn_end = first_level0_delta.layer_desc().lsn_range.end; + let mut deltas_to_compact = Vec::with_capacity(level0_deltas.len()); + + deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?); + for l in level0_deltas_iter { + let lsn_range = &l.layer_desc().lsn_range; + + if lsn_range.start != prev_lsn_end { + break; + } + deltas_to_compact.push(l.download_and_keep_resident().await?); + prev_lsn_end = lsn_range.end; + } + let lsn_range = Range { + start: deltas_to_compact + .first() + .unwrap() + .layer_desc() + .lsn_range + .start, + end: deltas_to_compact.last().unwrap().layer_desc().lsn_range.end, + }; + + info!( + "Starting Level0 compaction in LSN range {}-{} for {} layers ({} deltas in total)", + lsn_range.start, + lsn_range.end, + deltas_to_compact.len(), + level0_deltas.len() + ); + + for l in deltas_to_compact.iter() { + info!("compact includes {l}"); + } + + // We don't need the original list of layers anymore. Drop it so that + // we don't accidentally use it later in the function. + drop(level0_deltas); + + stats.read_lock_held_prerequisites_micros = stats + .read_lock_held_spawn_blocking_startup_micros + .till_now(); + + // Determine N largest holes where N is number of compacted layers. + let max_holes = deltas_to_compact.len(); + let last_record_lsn = self.get_last_record_lsn(); + let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128; + let min_hole_coverage_size = 3; // TODO: something more flexible? + + // min-heap (reserve space for one more element added before eviction) + let mut heap: BinaryHeap = BinaryHeap::with_capacity(max_holes + 1); + let mut prev: Option = None; + + let mut all_keys = Vec::new(); + + for l in deltas_to_compact.iter() { + all_keys.extend(l.load_keys(ctx).await?); + } + + // FIXME: should spawn_blocking the rest of this function + + // The current stdlib sorting implementation is designed in a way where it is + // particularly fast where the slice is made up of sorted sub-ranges. + all_keys.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn)); + + stats.read_lock_held_key_sort_micros = stats.read_lock_held_prerequisites_micros.till_now(); + + for &DeltaEntry { key: next_key, .. } in all_keys.iter() { + if let Some(prev_key) = prev { + // just first fast filter + if next_key.to_i128() - prev_key.to_i128() >= min_hole_range { + let key_range = prev_key..next_key; + // Measuring hole by just subtraction of i128 representation of key range boundaries + // has not so much sense, because largest holes will corresponds field1/field2 changes. + // But we are mostly interested to eliminate holes which cause generation of excessive image layers. + // That is why it is better to measure size of hole as number of covering image layers. 
+ let coverage_size = layers.image_coverage(&key_range, last_record_lsn).len(); + if coverage_size >= min_hole_coverage_size { + heap.push(Hole { + key_range, + coverage_size, + }); + if heap.len() > max_holes { + heap.pop(); // remove smallest hole + } + } + } + } + prev = Some(next_key.next()); + } + stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now(); + drop_rlock(guard); + stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now(); + let mut holes = heap.into_vec(); + holes.sort_unstable_by_key(|hole| hole.key_range.start); + let mut next_hole = 0; // index of next hole in holes vector + + // This iterator walks through all key-value pairs from all the layers + // we're compacting, in key, LSN order. + let all_values_iter = all_keys.iter(); + + // This iterator walks through all keys and is needed to calculate size used by each key + let mut all_keys_iter = all_keys + .iter() + .map(|DeltaEntry { key, lsn, size, .. }| (*key, *lsn, *size)) + .coalesce(|mut prev, cur| { + // Coalesce keys that belong to the same key pair. + // This ensures that compaction doesn't put them + // into different layer files. + // Still limit this by the target file size, + // so that we keep the size of the files in + // check. + if prev.0 == cur.0 && prev.2 < target_file_size { + prev.2 += cur.2; + Ok(prev) + } else { + Err((prev, cur)) + } + }); + + // Merge the contents of all the input delta layers into a new set + // of delta layers, based on the current partitioning. + // + // We split the new delta layers on the key dimension. We iterate through the key space, and for each key, check if including the next key to the current output layer we're building would cause the layer to become too large. If so, dump the current output layer and start new one. + // It's possible that there is a single key with so many page versions that storing all of them in a single layer file + // would be too large. In that case, we also split on the LSN dimension. + // + // LSN + // ^ + // | + // | +-----------+ +--+--+--+--+ + // | | | | | | | | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ ==> | | | | | + // | | | | | | | | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ +--+--+--+--+ + // | + // +--------------> key + // + // + // If one key (X) has a lot of page versions: + // + // LSN + // ^ + // | (X) + // | +-----------+ +--+--+--+--+ + // | | | | | | | | + // | +-----------+ | | +--+ | + // | | | | | | | | + // | +-----------+ ==> | | | | | + // | | | | | +--+ | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ +--+--+--+--+ + // | + // +--------------> key + // TODO: this actually divides the layers into fixed-size chunks, not + // based on the partitioning. + // + // TODO: we should also opportunistically materialize and + // garbage collect what we can. + let mut new_layers = Vec::new(); + let mut prev_key: Option = None; + let mut writer: Option = None; + let mut key_values_total_size = 0u64; + let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key + let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key + + for &DeltaEntry { + key, lsn, ref val, .. 
+ } in all_values_iter + { + let value = val.load(ctx).await?; + let same_key = prev_key.map_or(false, |prev_key| prev_key == key); + // We need to check key boundaries once we reach next key or end of layer with the same key + if !same_key || lsn == dup_end_lsn { + let mut next_key_size = 0u64; + let is_dup_layer = dup_end_lsn.is_valid(); + dup_start_lsn = Lsn::INVALID; + if !same_key { + dup_end_lsn = Lsn::INVALID; + } + // Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size + for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() { + next_key_size = next_size; + if key != next_key { + if dup_end_lsn.is_valid() { + // We are writting segment with duplicates: + // place all remaining values of this key in separate segment + dup_start_lsn = dup_end_lsn; // new segments starts where old stops + dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range + } + break; + } + key_values_total_size += next_size; + // Check if it is time to split segment: if total keys size is larger than target file size. + // We need to avoid generation of empty segments if next_size > target_file_size. + if key_values_total_size > target_file_size && lsn != next_lsn { + // Split key between multiple layers: such layer can contain only single key + dup_start_lsn = if dup_end_lsn.is_valid() { + dup_end_lsn // new segment with duplicates starts where old one stops + } else { + lsn // start with the first LSN for this key + }; + dup_end_lsn = next_lsn; // upper LSN boundary is exclusive + break; + } + } + // handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set. + if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() { + dup_start_lsn = dup_end_lsn; + dup_end_lsn = lsn_range.end; + } + if writer.is_some() { + let written_size = writer.as_mut().unwrap().size(); + let contains_hole = + next_hole < holes.len() && key >= holes[next_hole].key_range.end; + // check if key cause layer overflow or contains hole... + if is_dup_layer + || dup_end_lsn.is_valid() + || written_size + key_values_total_size > target_file_size + || contains_hole + { + // ... 
if so, flush previous layer and prepare to write new one + new_layers.push( + writer + .take() + .unwrap() + .finish(prev_key.unwrap().next(), self) + .await?, + ); + writer = None; + + if contains_hole { + // skip hole + next_hole += 1; + } + } + } + // Remember size of key value because at next iteration we will access next item + key_values_total_size = next_key_size; + } + fail_point!("delta-layer-writer-fail-before-finish", |_| { + Err(CompactionError::Other(anyhow::anyhow!( + "failpoint delta-layer-writer-fail-before-finish" + ))) + }); + + if !self.shard_identity.is_key_disposable(&key) { + if writer.is_none() { + // Create writer if not initiaized yet + writer = Some( + DeltaLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + key, + if dup_end_lsn.is_valid() { + // this is a layer containing slice of values of the same key + debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn); + dup_start_lsn..dup_end_lsn + } else { + debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end); + lsn_range.clone() + }, + ) + .await?, + ); + } + + writer.as_mut().unwrap().put_value(key, lsn, value).await?; + } else { + debug!( + "Dropping key {} during compaction (it belongs on shard {:?})", + key, + self.shard_identity.get_shard_number(&key) + ); + } + + if !new_layers.is_empty() { + fail_point!("after-timeline-compacted-first-L1"); + } + + prev_key = Some(key); + } + if let Some(writer) = writer { + new_layers.push(writer.finish(prev_key.unwrap().next(), self).await?); + } + + // Sync layers + if !new_layers.is_empty() { + // Print a warning if the created layer is larger than double the target size + // Add two pages for potential overhead. This should in theory be already + // accounted for in the target calculation, but for very small targets, + // we still might easily hit the limit otherwise. + let warn_limit = target_file_size * 2 + page_cache::PAGE_SZ as u64 * 2; + for layer in new_layers.iter() { + if layer.layer_desc().file_size > warn_limit { + warn!( + %layer, + "created delta file of size {} larger than double of target of {target_file_size}", layer.layer_desc().file_size + ); + } + } + + // The writer.finish() above already did the fsync of the inodes. + // We just need to fsync the directory in which these inodes are linked, + // which we know to be the timeline directory. + // + // We use fatal_err() below because the after writer.finish() returns with success, + // the in-memory state of the filesystem already has the layer file in its final place, + // and subsequent pageserver code could think it's durable while it really isn't. 
+ let timeline_dir = VirtualFile::open( + &self + .conf + .timeline_path(&self.tenant_shard_id, &self.timeline_id), + ) + .await + .fatal_err("VirtualFile::open for timeline dir fsync"); + timeline_dir + .sync_all() + .await + .fatal_err("VirtualFile::sync_all timeline dir"); + } + + stats.write_layer_files_micros = stats.read_lock_drop_micros.till_now(); + stats.new_deltas_count = Some(new_layers.len()); + stats.new_deltas_size = Some(new_layers.iter().map(|l| l.layer_desc().file_size).sum()); + + match TryInto::::try_into(stats) + .and_then(|stats| serde_json::to_string(&stats).context("serde_json::to_string")) + { + Ok(stats_json) => { + info!( + stats_json = stats_json.as_str(), + "compact_level0_phase1 stats available" + ) + } + Err(e) => { + warn!("compact_level0_phase1 stats failed to serialize: {:#}", e); + } + } + + Ok(CompactLevel0Phase1Result { + new_layers, + deltas_to_compact: deltas_to_compact + .into_iter() + .map(|x| x.drop_eviction_guard()) + .collect::>(), + }) + } +} + +#[derive(Default)] +struct CompactLevel0Phase1Result { + new_layers: Vec, + deltas_to_compact: Vec, +} + +#[derive(Default)] +struct CompactLevel0Phase1StatsBuilder { + version: Option, + tenant_id: Option, + timeline_id: Option, + read_lock_acquisition_micros: DurationRecorder, + read_lock_held_spawn_blocking_startup_micros: DurationRecorder, + read_lock_held_key_sort_micros: DurationRecorder, + read_lock_held_prerequisites_micros: DurationRecorder, + read_lock_held_compute_holes_micros: DurationRecorder, + read_lock_drop_micros: DurationRecorder, + write_layer_files_micros: DurationRecorder, + level0_deltas_count: Option, + new_deltas_count: Option, + new_deltas_size: Option, +} + +#[derive(serde::Serialize)] +struct CompactLevel0Phase1Stats { + version: u64, + tenant_id: TenantShardId, + timeline_id: TimelineId, + read_lock_acquisition_micros: RecordedDuration, + read_lock_held_spawn_blocking_startup_micros: RecordedDuration, + read_lock_held_key_sort_micros: RecordedDuration, + read_lock_held_prerequisites_micros: RecordedDuration, + read_lock_held_compute_holes_micros: RecordedDuration, + read_lock_drop_micros: RecordedDuration, + write_layer_files_micros: RecordedDuration, + level0_deltas_count: usize, + new_deltas_count: usize, + new_deltas_size: u64, +} + +impl TryFrom for CompactLevel0Phase1Stats { + type Error = anyhow::Error; + + fn try_from(value: CompactLevel0Phase1StatsBuilder) -> Result { + Ok(Self { + version: value.version.ok_or_else(|| anyhow!("version not set"))?, + tenant_id: value + .tenant_id + .ok_or_else(|| anyhow!("tenant_id not set"))?, + timeline_id: value + .timeline_id + .ok_or_else(|| anyhow!("timeline_id not set"))?, + read_lock_acquisition_micros: value + .read_lock_acquisition_micros + .into_recorded() + .ok_or_else(|| anyhow!("read_lock_acquisition_micros not set"))?, + read_lock_held_spawn_blocking_startup_micros: value + .read_lock_held_spawn_blocking_startup_micros + .into_recorded() + .ok_or_else(|| anyhow!("read_lock_held_spawn_blocking_startup_micros not set"))?, + read_lock_held_key_sort_micros: value + .read_lock_held_key_sort_micros + .into_recorded() + .ok_or_else(|| anyhow!("read_lock_held_key_sort_micros not set"))?, + read_lock_held_prerequisites_micros: value + .read_lock_held_prerequisites_micros + .into_recorded() + .ok_or_else(|| anyhow!("read_lock_held_prerequisites_micros not set"))?, + read_lock_held_compute_holes_micros: value + .read_lock_held_compute_holes_micros + .into_recorded() + .ok_or_else(|| anyhow!("read_lock_held_compute_holes_micros not 
set"))?, + read_lock_drop_micros: value + .read_lock_drop_micros + .into_recorded() + .ok_or_else(|| anyhow!("read_lock_drop_micros not set"))?, + write_layer_files_micros: value + .write_layer_files_micros + .into_recorded() + .ok_or_else(|| anyhow!("write_layer_files_micros not set"))?, + level0_deltas_count: value + .level0_deltas_count + .ok_or_else(|| anyhow!("level0_deltas_count not set"))?, + new_deltas_count: value + .new_deltas_count + .ok_or_else(|| anyhow!("new_deltas_count not set"))?, + new_deltas_size: value + .new_deltas_size + .ok_or_else(|| anyhow!("new_deltas_size not set"))?, + }) + } +} + impl Timeline { /// Entry point for new tiered compaction algorithm. /// From a3ef50c9b60b2652eb6cc863acf0f4c92ed157a0 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 6 Mar 2024 11:26:29 +0000 Subject: [PATCH 0348/1571] storage controller: use 'lazy' mode for location_config (#6987) ## Problem If large numbers of shards are attached to a pageserver concurrently, for example after another node fails, it can cause excessive I/O queue depths due to all the newly attached shards trying to calculate logical sizes concurrently. #6907 added the `lazy` flag to handle this. ## Summary of changes - Use `lazy=true` from all /location_config calls in the storage controller Reconciler. --- .../attachment_service/src/reconciler.rs | 26 +++++++++++++------ .../attachment_service/src/service.rs | 1 + control_plane/src/pageserver.rs | 3 ++- pageserver/client/src/mgmt_api.rs | 25 ++++++++++++------ 4 files changed, 38 insertions(+), 17 deletions(-) diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs index b633b217c7..d4f940373f 100644 --- a/control_plane/attachment_service/src/reconciler.rs +++ b/control_plane/attachment_service/src/reconciler.rs @@ -104,6 +104,7 @@ impl Reconciler { node_id: NodeId, config: LocationConfig, flush_ms: Option, + lazy: bool, ) -> anyhow::Result<()> { let node = self .pageservers @@ -118,7 +119,7 @@ impl Reconciler { let client = mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref()); client - .location_config(self.tenant_shard_id, config.clone(), flush_ms) + .location_config(self.tenant_shard_id, config.clone(), flush_ms, lazy) .await?; tracing::info!("location_config({}) complete: {:?}", node_id, config); @@ -315,8 +316,13 @@ impl Reconciler { self.generation, None, ); - self.location_config(origin_ps_id, stale_conf, Some(Duration::from_secs(10))) - .await?; + self.location_config( + origin_ps_id, + stale_conf, + Some(Duration::from_secs(10)), + false, + ) + .await?; let baseline_lsns = Some(self.get_lsns(self.tenant_shard_id, &origin_ps_id).await?); @@ -350,7 +356,8 @@ impl Reconciler { ); tracing::info!("🔁 Attaching to pageserver {}", dest_ps_id); - self.location_config(dest_ps_id, dest_conf, None).await?; + self.location_config(dest_ps_id, dest_conf, None, false) + .await?; if let Some(baseline) = baseline_lsns { tracing::info!("🕑 Waiting for LSN to catch up..."); @@ -382,7 +389,7 @@ impl Reconciler { None, Some(LocationConfigSecondary { warm: true }), ); - self.location_config(origin_ps_id, origin_secondary_conf.clone(), None) + self.location_config(origin_ps_id, origin_secondary_conf.clone(), None, false) .await?; // TODO: we should also be setting the ObservedState on earlier API calls, in case we fail // partway through. 
In fact, all location conf API calls should be in a wrapper that sets @@ -405,7 +412,7 @@ impl Reconciler { self.generation, None, ); - self.location_config(dest_ps_id, dest_final_conf.clone(), None) + self.location_config(dest_ps_id, dest_final_conf.clone(), None, false) .await?; self.observed.locations.insert( dest_ps_id, @@ -491,7 +498,10 @@ impl Reconciler { wanted_conf.generation = generation.into(); } tracing::info!(%node_id, "Observed configuration requires update."); - self.location_config(node_id, wanted_conf, None).await?; + // Use lazy=true, because we may run many of Self concurrently, and do not want to + // overload the pageserver with logical size calculations. + self.location_config(node_id, wanted_conf, None, true) + .await?; self.compute_notify().await?; } } @@ -543,7 +553,7 @@ impl Reconciler { if self.cancel.is_cancelled() { return Err(ReconcileError::Cancel); } - self.location_config(node_id, conf, None).await?; + self.location_config(node_id, conf, None, false).await?; } Ok(()) diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index 4209b62db3..bc34c9dcf6 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -468,6 +468,7 @@ impl Service { tenant_conf: models::TenantConfig::default(), }, None, + false, ) .await { diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 7d0c07a938..b2904c1191 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -537,10 +537,11 @@ impl PageServerNode { tenant_shard_id: TenantShardId, config: LocationConfig, flush_ms: Option, + lazy: bool, ) -> anyhow::Result<()> { Ok(self .http_client - .location_config(tenant_shard_id, config, flush_ms) + .location_config(tenant_shard_id, config, flush_ms, lazy) .await?) } diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 969d0d99c0..4dde7bdf0b 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -251,21 +251,30 @@ impl Client { tenant_shard_id: TenantShardId, config: LocationConfig, flush_ms: Option, + lazy: bool, ) -> Result<()> { let req_body = TenantLocationConfigRequest { tenant_id: tenant_shard_id, config, }; - let path = format!( + + let mut path = reqwest::Url::parse(&format!( "{}/v1/tenant/{}/location_config", self.mgmt_api_endpoint, tenant_shard_id - ); - let path = if let Some(flush_ms) = flush_ms { - format!("{}?flush_ms={}", path, flush_ms.as_millis()) - } else { - path - }; - self.request(Method::PUT, &path, &req_body).await?; + )) + // Should always work: mgmt_api_endpoint is configuration, not user input. + .expect("Cannot build URL"); + + if lazy { + path.query_pairs_mut().append_pair("lazy", "true"); + } + + if let Some(flush_ms) = flush_ms { + path.query_pairs_mut() + .append_pair("flush_ms", &format!("{}", flush_ms.as_millis())); + } + + self.request(Method::PUT, path, &req_body).await?; Ok(()) } From 4a31e18c81edbfdf78fddcc8cba6391d64dc169c Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 6 Mar 2024 13:56:30 +0000 Subject: [PATCH 0349/1571] storage controller: include stripe size in compute notifications (#6974) ## Problem - The storage controller is the source of truth for a tenant's stripe size, but doesn't currently have a way to propagate that to compute: we're just using the default stripe size everywhere. 
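To make the intended payload concrete, the sketch below mirrors the shape of the compute notification body that this change introduces. It is illustrative only: the mirror structs, primitive field types, and literal values are assumptions for readability, while the field names and the "stripe size only for multi-sharded tenants" behaviour come from the `compute_hook.rs` diff further down.

```rust
// Illustrative sketch, not part of the patch: a stand-in for the notification
// request sent for a 2-shard tenant after this change.
struct NotifyShard {
    shard_number: u8, // `ShardNumber` in the real code
    node_id: u64,     // `NodeId` in the real code
}

struct NotifyRequest {
    tenant_id: String,        // `TenantId` in the real code
    stripe_size: Option<u32>, // `ShardStripeSize` in the real code
    shards: Vec<NotifyShard>,
}

fn example(tenant_id: String) -> NotifyRequest {
    NotifyRequest {
        tenant_id,
        // Advertised only when the tenant has more than one shard;
        // an unsharded tenant keeps `stripe_size: None`.
        stripe_size: Some(32768),
        shards: vec![
            NotifyShard { shard_number: 0, node_id: 1 },
            NotifyShard { shard_number: 1, node_id: 2 },
        ],
    }
}
```
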
Closes: https://github.com/neondatabase/neon/issues/6903 ## Summary of changes - Include stripe size in `ComputeHookNotifyRequest` - Include stripe size in `LocationConfigResponse` The stripe size is optional: it will only be advertised for multi-sharded tenants. This enables the controller to defer the choice of stripe size until we split a tenant for the first time. --- .../attachment_service/src/compute_hook.rs | 258 ++++++++++++++---- .../attachment_service/src/reconciler.rs | 7 +- .../attachment_service/src/service.rs | 34 ++- control_plane/src/bin/neon_local.rs | 2 +- control_plane/src/endpoint.rs | 10 +- libs/pageserver_api/src/models.rs | 2 + pageserver/src/http/openapi_spec.yml | 4 + pageserver/src/http/routes.rs | 19 +- pageserver/src/tenant.rs | 5 + test_runner/regress/test_sharding_service.py | 26 +- 10 files changed, 291 insertions(+), 76 deletions(-) diff --git a/control_plane/attachment_service/src/compute_hook.rs b/control_plane/attachment_service/src/compute_hook.rs index b5e90491c6..bebc62ac2f 100644 --- a/control_plane/attachment_service/src/compute_hook.rs +++ b/control_plane/attachment_service/src/compute_hook.rs @@ -3,7 +3,7 @@ use std::{collections::HashMap, time::Duration}; use control_plane::endpoint::{ComputeControlPlane, EndpointStatus}; use control_plane::local_env::LocalEnv; use hyper::{Method, StatusCode}; -use pageserver_api::shard::{ShardIndex, ShardNumber, TenantShardId}; +use pageserver_api::shard::{ShardCount, ShardNumber, ShardStripeSize, TenantShardId}; use postgres_connection::parse_host_port; use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; @@ -19,8 +19,66 @@ const SLOWDOWN_DELAY: Duration = Duration::from_secs(5); pub(crate) const API_CONCURRENCY: usize = 32; -pub(super) struct ComputeHookTenant { - shards: Vec<(ShardIndex, NodeId)>, +struct ShardedComputeHookTenant { + stripe_size: ShardStripeSize, + shard_count: ShardCount, + shards: Vec<(ShardNumber, NodeId)>, +} + +enum ComputeHookTenant { + Unsharded(NodeId), + Sharded(ShardedComputeHookTenant), +} + +impl ComputeHookTenant { + /// Construct with at least one shard's information + fn new(tenant_shard_id: TenantShardId, stripe_size: ShardStripeSize, node_id: NodeId) -> Self { + if tenant_shard_id.shard_count.count() > 1 { + Self::Sharded(ShardedComputeHookTenant { + shards: vec![(tenant_shard_id.shard_number, node_id)], + stripe_size, + shard_count: tenant_shard_id.shard_count, + }) + } else { + Self::Unsharded(node_id) + } + } + + /// Set one shard's location. If stripe size or shard count have changed, Self is reset + /// and drops existing content. + fn update( + &mut self, + tenant_shard_id: TenantShardId, + stripe_size: ShardStripeSize, + node_id: NodeId, + ) { + match self { + Self::Unsharded(existing_node_id) if tenant_shard_id.shard_count.count() == 1 => { + *existing_node_id = node_id + } + Self::Sharded(sharded_tenant) + if sharded_tenant.stripe_size == stripe_size + && sharded_tenant.shard_count == tenant_shard_id.shard_count => + { + if let Some(existing) = sharded_tenant + .shards + .iter() + .position(|s| s.0 == tenant_shard_id.shard_number) + { + sharded_tenant.shards.get_mut(existing).unwrap().1 = node_id; + } else { + sharded_tenant + .shards + .push((tenant_shard_id.shard_number, node_id)); + sharded_tenant.shards.sort_by_key(|s| s.0) + } + } + _ => { + // Shard count changed: reset struct. 
+ *self = Self::new(tenant_shard_id, stripe_size, node_id); + } + } + } } #[derive(Serialize, Deserialize, Debug)] @@ -33,6 +91,7 @@ struct ComputeHookNotifyRequestShard { #[derive(Serialize, Deserialize, Debug)] struct ComputeHookNotifyRequest { tenant_id: TenantId, + stripe_size: Option, shards: Vec, } @@ -63,42 +122,43 @@ pub(crate) enum NotifyError { } impl ComputeHookTenant { - async fn maybe_reconfigure(&mut self, tenant_id: TenantId) -> Option { - // Find the highest shard count and drop any shards that aren't - // for that shard count. - let shard_count = self.shards.iter().map(|(k, _v)| k.shard_count).max(); - let Some(shard_count) = shard_count else { - // No shards, nothing to do. - tracing::info!("ComputeHookTenant::maybe_reconfigure: no shards"); - return None; - }; - - self.shards.retain(|(k, _v)| k.shard_count == shard_count); - self.shards - .sort_by_key(|(shard, _node_id)| shard.shard_number); - - if self.shards.len() == shard_count.count() as usize || shard_count.is_unsharded() { - // We have pageservers for all the shards: emit a configuration update - return Some(ComputeHookNotifyRequest { + fn maybe_reconfigure(&self, tenant_id: TenantId) -> Option { + match self { + Self::Unsharded(node_id) => Some(ComputeHookNotifyRequest { tenant_id, - shards: self - .shards - .iter() - .map(|(shard, node_id)| ComputeHookNotifyRequestShard { - shard_number: shard.shard_number, - node_id: *node_id, - }) - .collect(), - }); - } else { - tracing::info!( - "ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})", - self.shards.len(), - shard_count.count() - ); - } + shards: vec![ComputeHookNotifyRequestShard { + shard_number: ShardNumber(0), + node_id: *node_id, + }], + stripe_size: None, + }), + Self::Sharded(sharded_tenant) + if sharded_tenant.shards.len() == sharded_tenant.shard_count.count() as usize => + { + Some(ComputeHookNotifyRequest { + tenant_id, + shards: sharded_tenant + .shards + .iter() + .map(|(shard_number, node_id)| ComputeHookNotifyRequestShard { + shard_number: *shard_number, + node_id: *node_id, + }) + .collect(), + stripe_size: Some(sharded_tenant.stripe_size), + }) + } + Self::Sharded(sharded_tenant) => { + // Sharded tenant doesn't yet have information for all its shards - None + tracing::info!( + "ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})", + sharded_tenant.shards.len(), + sharded_tenant.shard_count.count() + ); + None + } + } } } @@ -139,7 +199,11 @@ impl ComputeHook { }; let cplane = ComputeControlPlane::load(env.clone()).expect("Error loading compute control plane"); - let ComputeHookNotifyRequest { tenant_id, shards } = reconfigure_request; + let ComputeHookNotifyRequest { + tenant_id, + shards, + stripe_size, + } = reconfigure_request; let compute_pageservers = shards .into_iter() @@ -156,7 +220,9 @@ impl ComputeHook { for (endpoint_name, endpoint) in &cplane.endpoints { if endpoint.tenant_id == tenant_id && endpoint.status() == EndpointStatus::Running { tracing::info!("Reconfiguring endpoint {}", endpoint_name,); - endpoint.reconfigure(compute_pageservers.clone()).await?; + endpoint + .reconfigure(compute_pageservers.clone(), stripe_size) + .await?; } } @@ -271,30 +337,26 @@ impl ComputeHook { &self, tenant_shard_id: TenantShardId, node_id: NodeId, + stripe_size: ShardStripeSize, cancel: &CancellationToken, ) -> Result<(), NotifyError> { let mut locked = self.state.lock().await; - let entry = locked - .entry(tenant_shard_id.tenant_id) - .or_insert_with(|| ComputeHookTenant { shards: Vec::new() }); - let shard_index = 
ShardIndex { - shard_count: tenant_shard_id.shard_count, - shard_number: tenant_shard_id.shard_number, + use std::collections::hash_map::Entry; + let tenant = match locked.entry(tenant_shard_id.tenant_id) { + Entry::Vacant(e) => e.insert(ComputeHookTenant::new( + tenant_shard_id, + stripe_size, + node_id, + )), + Entry::Occupied(e) => { + let tenant = e.into_mut(); + tenant.update(tenant_shard_id, stripe_size, node_id); + tenant + } }; - let mut set = false; - for (existing_shard, existing_node) in &mut entry.shards { - if *existing_shard == shard_index { - *existing_node = node_id; - set = true; - } - } - if !set { - entry.shards.push((shard_index, node_id)); - } - - let reconfigure_request = entry.maybe_reconfigure(tenant_shard_id.tenant_id).await; + let reconfigure_request = tenant.maybe_reconfigure(tenant_shard_id.tenant_id); let Some(reconfigure_request) = reconfigure_request else { // The tenant doesn't yet have pageservers for all its shards: we won't notify anything // until it does. @@ -316,3 +378,85 @@ impl ComputeHook { } } } + +#[cfg(test)] +pub(crate) mod tests { + use pageserver_api::shard::{ShardCount, ShardNumber}; + use utils::id::TenantId; + + use super::*; + + #[test] + fn tenant_updates() -> anyhow::Result<()> { + let tenant_id = TenantId::generate(); + let mut tenant_state = ComputeHookTenant::new( + TenantShardId { + tenant_id, + shard_count: ShardCount::new(0), + shard_number: ShardNumber(0), + }, + ShardStripeSize(12345), + NodeId(1), + ); + + // An unsharded tenant is always ready to emit a notification + assert!(tenant_state.maybe_reconfigure(tenant_id).is_some()); + assert_eq!( + tenant_state + .maybe_reconfigure(tenant_id) + .unwrap() + .shards + .len(), + 1 + ); + assert!(tenant_state + .maybe_reconfigure(tenant_id) + .unwrap() + .stripe_size + .is_none()); + + // Writing the first shard of a multi-sharded situation (i.e. 
in a split) + // resets the tenant state and puts it in an non-notifying state (need to + // see all shards) + tenant_state.update( + TenantShardId { + tenant_id, + shard_count: ShardCount::new(2), + shard_number: ShardNumber(1), + }, + ShardStripeSize(32768), + NodeId(1), + ); + assert!(tenant_state.maybe_reconfigure(tenant_id).is_none()); + + // Writing the second shard makes it ready to notify + tenant_state.update( + TenantShardId { + tenant_id, + shard_count: ShardCount::new(2), + shard_number: ShardNumber(0), + }, + ShardStripeSize(32768), + NodeId(1), + ); + + assert!(tenant_state.maybe_reconfigure(tenant_id).is_some()); + assert_eq!( + tenant_state + .maybe_reconfigure(tenant_id) + .unwrap() + .shards + .len(), + 2 + ); + assert_eq!( + tenant_state + .maybe_reconfigure(tenant_id) + .unwrap() + .stripe_size, + Some(ShardStripeSize(32768)) + ); + + Ok(()) + } +} diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs index d4f940373f..0fa6e8e2f8 100644 --- a/control_plane/attachment_service/src/reconciler.rs +++ b/control_plane/attachment_service/src/reconciler.rs @@ -565,7 +565,12 @@ impl Reconciler { if let Some(node_id) = self.intent.attached { let result = self .compute_hook - .notify(self.tenant_shard_id, node_id, &self.cancel) + .notify( + self.tenant_shard_id, + node_id, + self.shard.stripe_size, + &self.cancel, + ) .await; if let Err(e) = &result { // It is up to the caller whether they want to drop out on this error, but they don't have to: diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index bc34c9dcf6..ff35567ff3 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -283,7 +283,11 @@ impl Service { // emit a compute notification for this. In the case where our observed state does not // yet match our intent, we will eventually reconcile, and that will emit a compute notification. if let Some(attached_at) = tenant_state.stably_attached() { - compute_notifications.push((*tenant_shard_id, attached_at)); + compute_notifications.push(( + *tenant_shard_id, + attached_at, + tenant_state.shard.stripe_size, + )); } } } @@ -493,7 +497,7 @@ impl Service { /// Returns a set of any shards for which notifications where not acked within the deadline. async fn compute_notify_many( &self, - notifications: Vec<(TenantShardId, NodeId)>, + notifications: Vec<(TenantShardId, NodeId, ShardStripeSize)>, deadline: Instant, ) -> HashSet { let compute_hook = self.inner.read().unwrap().compute_hook.clone(); @@ -504,11 +508,14 @@ impl Service { // Construct an async stream of futures to invoke the compute notify function: we do this // in order to subsequently use .buffered() on the stream to execute with bounded parallelism. 
let mut stream = futures::stream::iter(notifications.into_iter()) - .map(|(tenant_shard_id, node_id)| { + .map(|(tenant_shard_id, node_id, stripe_size)| { let compute_hook = compute_hook.clone(); let cancel = self.cancel.clone(); async move { - if let Err(e) = compute_hook.notify(tenant_shard_id, node_id, &cancel).await { + if let Err(e) = compute_hook + .notify(tenant_shard_id, node_id, stripe_size, &cancel) + .await + { tracing::error!( %tenant_shard_id, %node_id, @@ -1396,7 +1403,10 @@ impl Service { // First check if this is a creation or an update let create_or_update = self.tenant_location_config_prepare(tenant_id, req); - let mut result = TenantLocationConfigResponse { shards: Vec::new() }; + let mut result = TenantLocationConfigResponse { + shards: Vec::new(), + stripe_size: None, + }; let waiters = match create_or_update { TenantCreateOrUpdate::Create((create_req, placement_policy)) => { let (create_resp, waiters) = @@ -1452,6 +1462,11 @@ impl Service { continue; }; + // Update stripe size + if result.stripe_size.is_none() && shard.shard.count.count() > 1 { + result.stripe_size = Some(shard.shard.stripe_size); + } + shard.policy = placement_policy; shard.config = tenant_config; if let Some(generation) = update_generation { @@ -2456,7 +2471,7 @@ impl Service { // as at this point in the split process we have succeeded and this part is infallible: // we will never need to do any special recovery from this state. - child_locations.push((child, pageserver)); + child_locations.push((child, pageserver, child_shard.stripe_size)); tenants.insert(child, child_state); response.new_shards.push(child); @@ -2466,8 +2481,11 @@ impl Service { // Send compute notifications for all the new shards let mut failed_notifications = Vec::new(); - for (child_id, child_ps) in child_locations { - if let Err(e) = compute_hook.notify(child_id, child_ps, &self.cancel).await { + for (child_id, child_ps, stripe_size) in child_locations { + if let Err(e) = compute_hook + .notify(child_id, child_ps, stripe_size, &self.cancel) + .await + { tracing::warn!("Failed to update compute of {}->{} during split, proceeding anyway to complete split ({e})", child_id, child_ps); failed_notifications.push(child_id); diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index cf647a5f9b..1feec5cd9b 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -1024,7 +1024,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re }) .collect::>() }; - endpoint.reconfigure(pageservers).await?; + endpoint.reconfigure(pageservers, None).await?; } "stop" => { let endpoint_id = sub_args diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 5a75bc2a1d..10e4c5d69f 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -52,6 +52,7 @@ use compute_api::spec::RemoteExtSpec; use compute_api::spec::Role; use nix::sys::signal::kill; use nix::sys::signal::Signal; +use pageserver_api::shard::ShardStripeSize; use serde::{Deserialize, Serialize}; use url::Host; use utils::id::{NodeId, TenantId, TimelineId}; @@ -735,7 +736,11 @@ impl Endpoint { } } - pub async fn reconfigure(&self, mut pageservers: Vec<(Host, u16)>) -> Result<()> { + pub async fn reconfigure( + &self, + mut pageservers: Vec<(Host, u16)>, + stripe_size: Option, + ) -> Result<()> { let mut spec: ComputeSpec = { let spec_path = self.endpoint_path().join("spec.json"); let file = std::fs::File::open(spec_path)?; @@ -765,6 +770,9 @@ impl 
Endpoint { let pageserver_connstr = Self::build_pageserver_connstr(&pageservers); assert!(!pageserver_connstr.is_empty()); spec.pageserver_connstring = Some(pageserver_connstr); + if stripe_size.is_some() { + spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize); + } let client = reqwest::Client::new(); let response = client diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index d583866290..57497e3831 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -435,6 +435,8 @@ pub struct TenantShardLocation { #[serde(deny_unknown_fields)] pub struct TenantLocationConfigResponse { pub shards: Vec, + // If the shards' ShardCount count is >1, stripe_size will be set. + pub stripe_size: Option, } #[derive(Serialize, Deserialize, Debug)] diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 19b5fb7e79..d924224a32 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -1339,6 +1339,10 @@ components: type: array items: $ref: "#/components/schemas/TenantShardLocation" + stripe_size: + description: If multiple shards are present, this field contains the sharding stripe size, else it is null. + type: integer + nullable: true TenantShardLocation: type: object required: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 9d92fbaee0..6aaf1ab27e 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1451,11 +1451,12 @@ async fn put_tenant_location_config_handler( tenant::SpawnMode::Eager }; - let attached = state + let tenant = state .tenant_manager .upsert_location(tenant_shard_id, location_conf, flush, spawn_mode, &ctx) - .await? - .is_some(); + .await?; + let stripe_size = tenant.as_ref().map(|t| t.get_shard_stripe_size()); + let attached = tenant.is_some(); if let Some(_flush_ms) = flush { match state @@ -1477,12 +1478,20 @@ async fn put_tenant_location_config_handler( // This API returns a vector of pageservers where the tenant is attached: this is // primarily for use in the sharding service. For compatibilty, we also return this // when called directly on a pageserver, but the payload is always zero or one shards. 
- let mut response = TenantLocationConfigResponse { shards: Vec::new() }; + let mut response = TenantLocationConfigResponse { + shards: Vec::new(), + stripe_size: None, + }; if attached { response.shards.push(TenantShardLocation { shard_id: tenant_shard_id, node_id: state.conf.id, - }) + }); + if tenant_shard_id.shard_count.count() > 1 { + // Stripe size should be set if we are attached + debug_assert!(stripe_size.is_some()); + response.stripe_size = stripe_size; + } } json_response(StatusCode::OK, response) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 3423b50eaa..b24c06c4da 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -22,6 +22,7 @@ use pageserver_api::models; use pageserver_api::models::TimelineState; use pageserver_api::models::WalRedoManagerStatus; use pageserver_api::shard::ShardIdentity; +use pageserver_api::shard::ShardStripeSize; use pageserver_api::shard::TenantShardId; use remote_storage::DownloadError; use remote_storage::GenericRemoteStorage; @@ -2086,6 +2087,10 @@ impl Tenant { &self.tenant_shard_id } + pub(crate) fn get_shard_stripe_size(&self) -> ShardStripeSize { + self.shard_identity.stripe_size + } + pub(crate) fn get_generation(&self) -> Generation { self.generation } diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py index bc77dfd084..aecc244a47 100644 --- a/test_runner/regress/test_sharding_service.py +++ b/test_runner/regress/test_sharding_service.py @@ -1,7 +1,7 @@ import time from collections import defaultdict from datetime import datetime, timezone -from typing import Any, Dict, List +from typing import Any, Dict, List, Union import pytest from fixtures.log_helper import log @@ -443,10 +443,12 @@ def test_sharding_service_compute_hook( # Initial notification from tenant creation assert len(notifications) == 1 - expect = { + expect: Dict[str, Union[List[Dict[str, int]], str, None, int]] = { "tenant_id": str(env.initial_tenant), + "stripe_size": None, "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}], } + assert notifications[0] == expect env.attachment_service.node_configure(env.pageservers[0].id, {"availability": "Offline"}) @@ -460,6 +462,7 @@ def test_sharding_service_compute_hook( log.info(f"notifications: {notifications}") expect = { "tenant_id": str(env.initial_tenant), + "stripe_size": None, "shards": [{"node_id": int(env.pageservers[1].id), "shard_number": 0}], } @@ -475,10 +478,27 @@ def test_sharding_service_compute_hook( def received_restart_notification(): assert len(notifications) == 3 - assert notifications[1] == expect + assert notifications[2] == expect wait_until(10, 1, received_restart_notification) + # Splitting a tenant should cause its stripe size to become visible in the compute notification + env.attachment_service.tenant_shard_split(env.initial_tenant, shard_count=2) + expect = { + "tenant_id": str(env.initial_tenant), + "stripe_size": 32768, + "shards": [ + {"node_id": int(env.pageservers[1].id), "shard_number": 0}, + {"node_id": int(env.pageservers[1].id), "shard_number": 1}, + ], + } + + def received_split_notification(): + assert len(notifications) == 4 + assert notifications[3] == expect + + wait_until(10, 1, received_split_notification) + env.attachment_service.consistency_check() From 5dc2088cf3dd2ff7ed984a337e7331f5a7eabf6c Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 6 Mar 2024 10:52:24 -0500 Subject: [PATCH 0350/1571] fix(test): drop subscription when test completes (#6975) This pull request 
mitigates https://github.com/neondatabase/neon/issues/6969, but the longer-term problem is that we cannot properly stop Postgres if there is a subscription. --------- Signed-off-by: Alex Chi Z --- test_runner/regress/test_neon_superuser.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test_runner/regress/test_neon_superuser.py b/test_runner/regress/test_neon_superuser.py index e0364dd13f..fd31df84da 100644 --- a/test_runner/regress/test_neon_superuser.py +++ b/test_runner/regress/test_neon_superuser.py @@ -1,12 +1,9 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv -from fixtures.pg_version import PgVersion, skip_on_postgres +from fixtures.pg_version import PgVersion from fixtures.utils import wait_until -@skip_on_postgres( - PgVersion.V15, reason="skip on pg15 due to https://github.com/neondatabase/neon/issues/6969" -) def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion): env = neon_simple_env env.neon_cli.create_branch("test_neon_superuser_publisher", "empty") @@ -97,3 +94,6 @@ def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion): assert cur.fetchall()[0][0] != "" cur.execute("RESET ROLE") cur.execute("DROP ROLE not_a_superuser") + query = "DROP SUBSCRIPTION sub CASCADE" + log.info(f"Dropping subscription: {query}") + cur.execute(query) From a9a4a76d1394e330d8ff91188c0987a19bbbdf3a Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 6 Mar 2024 16:47:32 +0000 Subject: [PATCH 0351/1571] storage controller: misc fixes (#7036) ## Problem Collection of small changes, batched together to reduce CI overhead. ## Summary of changes - Layer download messages include size -- this is useful when watching a pageserver hydrate its on disk cache in the log. - Controller migrate API could put an invalid NodeId into TenantState - Scheduling errors during tenant create could result in creating some shards and not others. - Consistency check could give hard-to-understand failures in tests if a reconcile was in process: explicitly fail the check if reconciles are in progress instead. 
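The last point amounts to a precondition on the in-memory shard map before it is compared with the database. A minimal sketch of that guard, reusing the controller's own `TenantState`, `ApiError`, and `anyhow` types (the free-standing function name is only for illustration, not the actual method in the service):

```rust
use std::collections::BTreeMap;

// Sketch: bail out of the consistency check while any shard still has an
// in-flight reconciler, instead of reporting a confusing state mismatch.
fn ensure_no_reconciles_in_progress(
    tenants: &BTreeMap<TenantShardId, TenantState>,
) -> Result<(), ApiError> {
    for t in tenants.values() {
        if t.reconciler.is_some() {
            return Err(ApiError::InternalServerError(anyhow::anyhow!(
                "Shard {} reconciliation in progress",
                t.tenant_shard_id
            )));
        }
    }
    Ok(())
}
```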
--- .../attachment_service/src/service.rs | 64 +++++++++++++------ pageserver/src/tenant/storage_layer/layer.rs | 2 +- 2 files changed, 47 insertions(+), 19 deletions(-) diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index ff35567ff3..d162ab5c65 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -1159,9 +1159,12 @@ impl Service { let (waiters, response_shards) = { let mut locked = self.inner.write().unwrap(); - let (_nodes, tenants, scheduler) = locked.parts_mut(); + let result_tx = locked.result_tx.clone(); + let compute_hook = locked.compute_hook.clone(); + let (nodes, tenants, scheduler) = locked.parts_mut(); let mut response_shards = Vec::new(); + let mut schcedule_error = None; for tenant_shard_id in create_ids { tracing::info!("Creating shard {tenant_shard_id}..."); @@ -1198,23 +1201,20 @@ impl Service { continue; } Entry::Vacant(entry) => { - let mut state = TenantState::new( + let state = entry.insert(TenantState::new( tenant_shard_id, ShardIdentity::from_params( tenant_shard_id.shard_number, &create_req.shard_parameters, ), placement_policy.clone(), - ); + )); state.generation = initial_generation; state.config = create_req.config.clone(); - - state.schedule(scheduler).map_err(|e| { - ApiError::Conflict(format!( - "Failed to schedule shard {tenant_shard_id}: {e}" - )) - })?; + if let Err(e) = state.schedule(scheduler) { + schcedule_error = Some(e); + } // Only include shards in result if we are attaching: the purpose // of the response is to tell the caller where the shards are attached. @@ -1228,24 +1228,27 @@ impl Service { generation: generation.into().unwrap(), }); } - entry.insert(state) } }; } - // Take a snapshot of pageservers - let pageservers = locked.nodes.clone(); + // If we failed to schedule shards, then they are still created in the controller, + // but we return an error to the requester to avoid a silent failure when someone + // tries to e.g. create a tenant whose placement policy requires more nodes than + // are present in the system. We do this here rather than in the above loop, to + // avoid situations where we only create a subset of shards in the tenant. + if let Some(e) = schcedule_error { + return Err(ApiError::Conflict(format!( + "Failed to schedule shard(s): {e}" + ))); + } - let result_tx = locked.result_tx.clone(); - let compute_hook = locked.compute_hook.clone(); - - let waiters = locked - .tenants + let waiters = tenants .range_mut(TenantShardId::tenant_range(tenant_id)) .filter_map(|(_shard_id, shard)| { shard.maybe_reconcile( result_tx.clone(), - &pageservers, + nodes, &compute_hook, &self.config, &self.persistence, @@ -2516,6 +2519,19 @@ impl Service { let compute_hook = locked.compute_hook.clone(); let (nodes, tenants, scheduler) = locked.parts_mut(); + let Some(node) = nodes.get(&migrate_req.node_id) else { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Node {} not found", + migrate_req.node_id + ))); + }; + + if node.availability != NodeAvailability::Active { + // Warn but proceed: the caller may intend to manually adjust the placement of + // a shard even if the node is down, e.g. if intervening during an incident. 
+ tracing::warn!("Migrating to an unavailable node ({})", node.id); + } + let Some(shard) = tenants.get_mut(&tenant_shard_id) else { return Err(ApiError::NotFound( anyhow::anyhow!("Tenant shard not found").into(), @@ -2645,6 +2661,18 @@ impl Service { .map(|t| t.to_persistent()) .collect::>(); + // This method can only validate the state of an idle system: if a reconcile is in + // progress, fail out early to avoid giving false errors on state that won't match + // between database and memory under a ReconcileResult is processed. + for t in locked.tenants.values() { + if t.reconciler.is_some() { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Shard {} reconciliation in progress", + t.tenant_shard_id + ))); + } + } + (expect_nodes, expect_shards) }; diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index e14a2f22cf..6c46b83622 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -976,7 +976,7 @@ impl LayerInner { } self.consecutive_failures.store(0, Ordering::Relaxed); - tracing::info!("on-demand download successful"); + tracing::info!(size=%self.desc.file_size, "on-demand download successful"); Ok(permit) } From f40b13d801782535737530118fbd6b85ef542658 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 6 Mar 2024 17:09:54 +0000 Subject: [PATCH 0352/1571] Update client libs for test_runner/pg_clients to their latest versions (#7022) ## Problem Closes https://github.com/neondatabase/neon/security/dependabot/56 Supersedes https://github.com/neondatabase/neon/pull/7013 Workflow run: https://github.com/neondatabase/neon/actions/runs/8157302480 ## Summary of changes - Update client libs for `test_runner/pg_clients` to their latest versions --- .../pg_clients/csharp/npgsql/Dockerfile | 4 +- .../csharp/npgsql/csharp-npgsql.csproj | 4 +- test_runner/pg_clients/java/jdbc/Dockerfile | 4 +- .../pg_clients/python/asyncpg/Dockerfile | 2 +- .../python/asyncpg/requirements.txt | 2 +- .../pg_clients/python/pg8000/Dockerfile | 2 +- .../pg_clients/python/pg8000/requirements.txt | 2 +- .../pg_clients/rust/tokio-postgres/Cargo.lock | 340 ++++++++++-------- .../pg_clients/rust/tokio-postgres/Cargo.toml | 2 +- .../pg_clients/rust/tokio-postgres/Dockerfile | 2 +- .../swift/PostgresClientKitExample/Dockerfile | 4 +- .../swift/PostgresNIOExample/Dockerfile | 4 +- .../swift/PostgresNIOExample/Package.resolved | 37 +- .../swift/PostgresNIOExample/Package.swift | 4 +- .../typescript/postgresql-client/Dockerfile | 2 +- .../postgresql-client/package-lock.json | 75 ++-- .../typescript/postgresql-client/package.json | 2 +- .../typescript/serverless-driver/Dockerfile | 2 +- .../serverless-driver/package-lock.json | 16 +- .../typescript/serverless-driver/package.json | 4 +- 20 files changed, 291 insertions(+), 223 deletions(-) diff --git a/test_runner/pg_clients/csharp/npgsql/Dockerfile b/test_runner/pg_clients/csharp/npgsql/Dockerfile index b23eb2e5eb..71717a6006 100644 --- a/test_runner/pg_clients/csharp/npgsql/Dockerfile +++ b/test_runner/pg_clients/csharp/npgsql/Dockerfile @@ -1,4 +1,4 @@ -FROM mcr.microsoft.com/dotnet/sdk:7.0 AS build +FROM mcr.microsoft.com/dotnet/sdk:8.0 AS build WORKDIR /source COPY *.csproj . @@ -7,7 +7,7 @@ RUN dotnet restore COPY . . RUN dotnet publish -c release -o /app --no-restore -FROM mcr.microsoft.com/dotnet/runtime:7.0 +FROM mcr.microsoft.com/dotnet/runtime:8.0 WORKDIR /app COPY --from=build /app . 
diff --git a/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj b/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj index bb4427f2c4..50243e3ea7 100644 --- a/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj +++ b/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj @@ -2,13 +2,13 @@ Exe - net7.0 + net8.0 enable enable - + diff --git a/test_runner/pg_clients/java/jdbc/Dockerfile b/test_runner/pg_clients/java/jdbc/Dockerfile index 74eb9bdc32..7e074e07b8 100644 --- a/test_runner/pg_clients/java/jdbc/Dockerfile +++ b/test_runner/pg_clients/java/jdbc/Dockerfile @@ -1,10 +1,10 @@ -FROM openjdk:20 +FROM openjdk:21 WORKDIR /source COPY . . WORKDIR /app -RUN curl --output postgresql.jar https://jdbc.postgresql.org/download/postgresql-42.6.0.jar && \ +RUN curl --output postgresql.jar https://jdbc.postgresql.org/download/postgresql-42.7.2.jar && \ javac -d /app /source/Example.java CMD ["java", "-cp", "/app/postgresql.jar:.", "Example"] diff --git a/test_runner/pg_clients/python/asyncpg/Dockerfile b/test_runner/pg_clients/python/asyncpg/Dockerfile index 8b6d56b8fb..f2cc37a7bb 100644 --- a/test_runner/pg_clients/python/asyncpg/Dockerfile +++ b/test_runner/pg_clients/python/asyncpg/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.11 +FROM python:3.12 WORKDIR /source COPY . . diff --git a/test_runner/pg_clients/python/asyncpg/requirements.txt b/test_runner/pg_clients/python/asyncpg/requirements.txt index b33c21474c..61972959a9 100644 --- a/test_runner/pg_clients/python/asyncpg/requirements.txt +++ b/test_runner/pg_clients/python/asyncpg/requirements.txt @@ -1 +1 @@ -asyncpg==0.27.0 +asyncpg==0.29.0 diff --git a/test_runner/pg_clients/python/pg8000/Dockerfile b/test_runner/pg_clients/python/pg8000/Dockerfile index ebef1f9059..ee1de20da5 100644 --- a/test_runner/pg_clients/python/pg8000/Dockerfile +++ b/test_runner/pg_clients/python/pg8000/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.11 +FROM python:3.12 WORKDIR /source COPY . . 
diff --git a/test_runner/pg_clients/python/pg8000/requirements.txt b/test_runner/pg_clients/python/pg8000/requirements.txt index a8407c3cb0..e086a937e6 100644 --- a/test_runner/pg_clients/python/pg8000/requirements.txt +++ b/test_runner/pg_clients/python/pg8000/requirements.txt @@ -1,2 +1,2 @@ -pg8000==1.29.8 +pg8000==1.30.5 scramp>=1.4.3 diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock index 3ac0f16e4b..a4a2426b97 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock @@ -19,9 +19,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "async-trait" -version = "0.1.74" +version = "0.1.77" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a66537f1bb974b254c98ed142ff995236e81b9d0fe4db0575f46612cb15eb0f9" +checksum = "c980ee35e870bd1a4d2c8294d4c04d0499e67bca1e4b5cefcc693c2fa00caea9" dependencies = [ "proc-macro2", "quote", @@ -51,9 +51,9 @@ dependencies = [ [[package]] name = "base64" -version = "0.21.4" +version = "0.21.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ba43ea6f343b788c8764558649e08df62f86c6ef251fdaeb1ffd010a9ae50a2" +checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" [[package]] name = "bitflags" @@ -63,9 +63,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.4.1" +version = "2.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" +checksum = "ed570934406eb16438a4e976b1b4500774099c13b8cb96eec99f620f05090ddf" [[package]] name = "block-buffer" @@ -78,9 +78,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.14.0" +version = "3.15.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec" +checksum = "8ea184aa71bb362a1157c896979544cc23974e08fd265f29ea96b59f0b4a555b" [[package]] name = "byteorder" @@ -96,12 +96,9 @@ checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" [[package]] name = "cc" -version = "1.0.83" +version = "1.0.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" -dependencies = [ - "libc", -] +checksum = "a0ba8f7aaa012f30d5b2861462f6708eccd49c3c39863fe083a308035f63d723" [[package]] name = "cfg-if" @@ -111,9 +108,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "core-foundation" -version = "0.9.3" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "194a7a9e6de53fa55116934067c844d9d749312f75c6f6d0980e8c252f8c2146" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" dependencies = [ "core-foundation-sys", "libc", @@ -121,15 +118,15 @@ dependencies = [ [[package]] name = "core-foundation-sys" -version = "0.8.4" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" +checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" [[package]] name = "cpufeatures" -version = "0.2.9" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "a17b76ff3a4162b0b27f354a0c87015ddad39d35f9c0c36607a3bdd175dde1f1" +checksum = "53fe5e26ff1b7aef8bca9c6080520cfb8d9333c7568e1829cef191a9723e5504" dependencies = [ "libc", ] @@ -157,12 +154,12 @@ dependencies = [ [[package]] name = "errno" -version = "0.3.5" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3e13f66a2f95e32a39eaa81f6b95d42878ca0e1db0c7543723dfe12557e860" +checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] @@ -200,9 +197,9 @@ checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" [[package]] name = "futures" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23342abe12aba583913b2e62f22225ff9c950774065e4bfb61a19cd9770fec40" +checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0" dependencies = [ "futures-channel", "futures-core", @@ -215,9 +212,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "955518d47e09b25bbebc7a18df10b81f0c766eaf4c4f1cccef2fca5f2a4fb5f2" +checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" dependencies = [ "futures-core", "futures-sink", @@ -225,15 +222,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c" +checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" [[package]] name = "futures-executor" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ccecee823288125bd88b4d7f565c9e58e41858e47ab72e8ea2d64e93624386e0" +checksum = "a576fc72ae164fca6b9db127eaa9a9dda0d61316034f33a0a0d4eda41f02b01d" dependencies = [ "futures-core", "futures-task", @@ -242,15 +239,15 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964" +checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" [[package]] name = "futures-macro" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" +checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", @@ -259,21 +256,21 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f43be4fe21a13b9781a69afa4985b0f6ee0e1afab2c6f454a8cf30e2b2237b6e" +checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" [[package]] name = "futures-task" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76d3d132be6c0e6aa1534069c705a74a5997a356c0dc2f86a47765e5617c5b65" +checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" [[package]] name = "futures-util" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533" +checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" dependencies = [ "futures-channel", "futures-core", @@ -299,9 +296,9 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.10" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" +checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5" dependencies = [ "cfg-if", "libc", @@ -310,9 +307,9 @@ dependencies = [ [[package]] name = "gimli" -version = "0.28.0" +version = "0.28.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fb8d784f27acf97159b40fc4db5ecd8aa23b9ad5ef69cdd136d3bc80665f0c0" +checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" [[package]] name = "hmac" @@ -325,9 +322,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.64" +version = "0.3.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5f195fe497f702db0f318b07fdd68edb16955aed830df8363d837542f8f935a" +checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" dependencies = [ "wasm-bindgen", ] @@ -340,15 +337,15 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" -version = "0.2.149" +version = "0.2.153" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b" +checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" [[package]] name = "linux-raw-sys" -version = "0.4.10" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da2479e8c062e40bf0066ffa0bc823de0a9368974af99c9f6df941d2c231e03f" +checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" [[package]] name = "lock_api" @@ -362,9 +359,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.20" +version = "0.4.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" +checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" [[package]] name = "md-5" @@ -378,28 +375,28 @@ dependencies = [ [[package]] name = "memchr" -version = "2.6.4" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167" +checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" [[package]] name = "miniz_oxide" -version = "0.7.1" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7" +checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7" dependencies = [ "adler", ] [[package]] name = "mio" -version = "0.8.8" +version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "927a765cd3fc26206e66b296465fa9d3e5ab003e651c1b3c060e7956d96b19d2" +checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" dependencies = [ "libc", "wasi", - "windows-sys", + "windows-sys 0.48.0", ] [[package]] @@ -422,26 +419,26 @@ dependencies = [ [[package]] name = "object" -version = "0.32.1" +version = "0.32.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cf5f9dd3933bd50a9e1f149ec995f39ae2c496d31fd772c1fd45ebc27e902b0" +checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" dependencies = [ "memchr", ] [[package]] name = "once_cell" -version = "1.18.0" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "openssl" -version = "0.10.60" +version = "0.10.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79a4c6c3a2b158f7f8f2a2fc5a969fa3a068df6fc9dbb4a43845436e3af7c800" +checksum = "95a0481286a310808298130d22dd1fef0fa571e05a8f44ec801801e84b216b1f" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.4.2", "cfg-if", "foreign-types", "libc", @@ -469,9 +466,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.96" +version = "0.9.101" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3812c071ba60da8b5677cc12bcb1d42989a65553772897a7e0355545a819838f" +checksum = "dda2b0f344e78efc2facf7d195d098df0dd72151b26ab98da807afc26c198dff" dependencies = [ "cc", "libc", @@ -497,16 +494,16 @@ checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" dependencies = [ "cfg-if", "libc", - "redox_syscall 0.4.1", + "redox_syscall", "smallvec", - "windows-targets", + "windows-targets 0.48.5", ] [[package]] name = "percent-encoding" -version = "2.3.0" +version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "phf" @@ -540,9 +537,9 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "pkg-config" -version = "0.3.27" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964" +checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" [[package]] name = "postgres-native-tls" @@ -594,18 +591,18 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "proc-macro2" -version = "1.0.69" +version = "1.0.78" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "134c189feb4956b20f6f547d2cf727d4c0fe06722b20a0eec87ed445a97f92da" +checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.33" +version = "1.0.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae" +checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" dependencies = [ "proc-macro2", ] @@ -640,15 +637,6 @@ dependencies = [ "getrandom", ] -[[package]] -name = "redox_syscall" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" -dependencies = [ - "bitflags 1.3.2", -] - [[package]] name = "redox_syscall" version = "0.4.1" @@ -676,24 +664,24 @@ checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" 
[[package]] name = "rustix" -version = "0.38.19" +version = "0.38.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "745ecfa778e66b2b63c88a61cb36e0eea109e803b0b86bf9879fbc77c70e86ed" +checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.4.2", "errno", "libc", "linux-raw-sys", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] name = "schannel" -version = "0.1.22" +version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c3733bf4cf7ea0880754e19cb5a462007c4a8c1914bff372ccc95b464f1df88" +checksum = "fbc91545643bcf3a0bbb6569265615222618bdf33ce4ffbbd13c4bbd4c093534" dependencies = [ - "windows-sys", + "windows-sys 0.52.0", ] [[package]] @@ -753,18 +741,18 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.11.1" +version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "942b4a808e05215192e39f4ab80813e599068285906cc91aa64f923db842bd5a" +checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" [[package]] name = "socket2" -version = "0.5.4" +version = "0.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4031e820eb552adee9295814c0ced9e5cf38ddf1e8b7d566d6de8e2538ea989e" +checksum = "05ffd9c0a93b7543e062e759284fcf5f5e3b098501104bfbdde4d404db792871" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] @@ -786,9 +774,9 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" [[package]] name = "syn" -version = "2.0.38" +version = "2.0.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e96b79aaa137db8f61e26363a0c9b47d8b4ec75da28b7d1d614c2303e232408b" +checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07" dependencies = [ "proc-macro2", "quote", @@ -797,15 +785,14 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.8.0" +version = "3.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb94d2f3cc536af71caac6b6fcebf65860b347e7ce0cc9ebe8f70d3e521054ef" +checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" dependencies = [ "cfg-if", "fastrand", - "redox_syscall 0.3.5", "rustix", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] @@ -825,9 +812,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.33.0" +version = "1.36.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f38200e3ef7995e5ef13baec2f432a6da0aa9ac495b2c0e8f3b7eec2c92d653" +checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931" dependencies = [ "backtrace", "bytes", @@ -836,14 +823,14 @@ dependencies = [ "pin-project-lite", "socket2", "tokio-macros", - "windows-sys", + "windows-sys 0.48.0", ] [[package]] name = "tokio-macros" -version = "2.1.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" +checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" dependencies = [ "proc-macro2", "quote", @@ -888,9 +875,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.9" +version = "0.7.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d68074620f57a0b21594d9735eb2e98ab38b17f80d3fcb189fca266771ca60d" +checksum = 
"5419f34732d9eb6ee4c3578b7989078579b7f039cbbb9ca2c4da015749371e15" dependencies = [ "bytes", "futures-core", @@ -927,9 +914,9 @@ checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" [[package]] name = "unicode-bidi" -version = "0.3.13" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" +checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" [[package]] name = "unicode-ident" @@ -939,9 +926,9 @@ checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" [[package]] name = "unicode-normalization" -version = "0.1.22" +version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921" +checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5" dependencies = [ "tinyvec", ] @@ -965,10 +952,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] -name = "wasm-bindgen" -version = "0.2.87" +name = "wasite" +version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7706a72ab36d8cb1f80ffbf0e071533974a60d0a308d01a5d0375bf60499a342" +checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b" + +[[package]] +name = "wasm-bindgen" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8" dependencies = [ "cfg-if", "wasm-bindgen-macro", @@ -976,9 +969,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.87" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ef2b6d3c510e9625e5fe6f509ab07d66a760f0885d858736483c32ed7809abd" +checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da" dependencies = [ "bumpalo", "log", @@ -991,9 +984,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.87" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dee495e55982a3bd48105a7b947fd2a9b4a8ae3010041b9e0faab3f9cd028f1d" +checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -1001,9 +994,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.87" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" +checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", @@ -1014,15 +1007,15 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.87" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1" +checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" [[package]] name = "web-sys" -version = "0.3.64" +version = "0.3.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b85cbef8c220a6abc02aefd892dfc0fc23afb1c6a426316ec33253a3877249b" +checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef" dependencies = [ "js-sys", "wasm-bindgen", @@ 
-1030,11 +1023,12 @@ dependencies = [ [[package]] name = "whoami" -version = "1.4.1" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22fc3756b8a9133049b26c7f61ab35416c130e8c09b660f5b3958b446f52cc50" +checksum = "0fec781d48b41f8163426ed18e8fc2864c12937df9ce54c88ede7bd47270893e" dependencies = [ - "wasm-bindgen", + "redox_syscall", + "wasite", "web-sys", ] @@ -1044,7 +1038,16 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" dependencies = [ - "windows-targets", + "windows-targets 0.48.5", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.4", ] [[package]] @@ -1053,13 +1056,28 @@ version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", +] + +[[package]] +name = "windows-targets" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" +dependencies = [ + "windows_aarch64_gnullvm 0.52.4", + "windows_aarch64_msvc 0.52.4", + "windows_i686_gnu 0.52.4", + "windows_i686_msvc 0.52.4", + "windows_x86_64_gnu 0.52.4", + "windows_x86_64_gnullvm 0.52.4", + "windows_x86_64_msvc 0.52.4", ] [[package]] @@ -1068,38 +1086,80 @@ version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" + [[package]] name = "windows_aarch64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" + [[package]] name = "windows_i686_gnu" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" +[[package]] +name = "windows_i686_gnu" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" + [[package]] name = "windows_i686_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" +[[package]] +name = "windows_i686_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum 
= "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" + [[package]] name = "windows_x86_64_gnu" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" + [[package]] name = "windows_x86_64_gnullvm" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" + [[package]] name = "windows_x86_64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml b/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml index 6f100aafd5..0f420e5b06 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml @@ -9,7 +9,7 @@ publish = false [dependencies] native-tls = "0.2.11" postgres-native-tls = "0.5.0" -tokio = { version = "1.33", features=["rt", "macros"] } +tokio = { version = "1.36", features=["rt", "macros"] } tokio-postgres = "0.7.10" diff --git a/test_runner/pg_clients/rust/tokio-postgres/Dockerfile b/test_runner/pg_clients/rust/tokio-postgres/Dockerfile index 1d3709803e..8611e66cbb 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Dockerfile +++ b/test_runner/pg_clients/rust/tokio-postgres/Dockerfile @@ -1,4 +1,4 @@ -FROM rust:1.73 +FROM rust:1.76 WORKDIR /source COPY . . diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile b/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile index 9538cf4ed4..0402838820 100644 --- a/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile @@ -1,11 +1,11 @@ -FROM swift:5.8 AS build +FROM swift:5.9 AS build RUN apt-get -q update && apt-get -q install -y libssl-dev WORKDIR /source COPY . . RUN swift build --configuration release -FROM swift:5.8 +FROM swift:5.9 WORKDIR /app COPY --from=build /source/.build/release . CMD ["/app/PostgresClientKitExample"] diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile b/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile index 61e1d1bba6..9130e0973f 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile @@ -1,10 +1,10 @@ -FROM swift:5.8 AS build +FROM swift:5.9 AS build WORKDIR /source COPY . . RUN swift build --configuration release -FROM swift:5.8 +FROM swift:5.9 WORKDIR /app COPY --from=build /source/.build/release . 
CMD ["/app/PostgresNIOExample"] diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved b/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved index 9f13106011..023e03a7b1 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved @@ -5,8 +5,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/vapor/postgres-nio.git", "state" : { - "revision" : "061a0836d7c1887e04a975d1d2eaa2ef5fd7dfab", - "version" : "1.16.0" + "revision" : "69ccfdf4c80144d845e3b439961b7ec6cd7ae33f", + "version" : "1.20.2" } }, { @@ -14,8 +14,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-atomics.git", "state" : { - "revision" : "6c89474e62719ddcc1e9614989fff2f68208fe10", - "version" : "1.1.0" + "revision" : "cd142fd2f64be2100422d658e7411e39489da985", + "version" : "1.2.0" } }, { @@ -41,8 +41,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-log.git", "state" : { - "revision" : "32e8d724467f8fe623624570367e3d50c5638e46", - "version" : "1.5.2" + "revision" : "e97a6fcb1ab07462881ac165fdbb37f067e205d5", + "version" : "1.5.4" } }, { @@ -50,8 +50,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-metrics.git", "state" : { - "revision" : "9b39d811a83cf18b79d7d5513b06f8b290198b10", - "version" : "2.3.3" + "revision" : "971ba26378ab69c43737ee7ba967a896cb74c0d1", + "version" : "2.4.1" } }, { @@ -59,8 +59,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-nio.git", "state" : { - "revision" : "6213ba7a06febe8fef60563a4a7d26a4085783cf", - "version" : "2.54.0" + "revision" : "635b2589494c97e48c62514bc8b37ced762e0a62", + "version" : "2.63.0" } }, { @@ -68,8 +68,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-nio-ssl.git", "state" : { - "revision" : "e866a626e105042a6a72a870c88b4c531ba05f83", - "version" : "2.24.0" + "revision" : "7c381eb6083542b124a6c18fae742f55001dc2b5", + "version" : "2.26.0" } }, { @@ -77,8 +77,17 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-nio-transport-services.git", "state" : { - "revision" : "41f4098903878418537020075a4d8a6e20a0b182", - "version" : "1.17.0" + "revision" : "6cbe0ed2b394f21ab0d46b9f0c50c6be964968ce", + "version" : "1.20.1" + } + }, + { + "identity" : "swift-system", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-system.git", + "state" : { + "revision" : "025bcb1165deab2e20d4eaba79967ce73013f496", + "version" : "1.2.1" } } ], diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift b/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift index a80590daa2..637eb4bc9d 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift @@ -1,10 +1,10 @@ -// swift-tools-version:5.8 +// swift-tools-version:5.9 import PackageDescription let package = Package( name: "PostgresNIOExample", dependencies: [ - .package(url: "https://github.com/vapor/postgres-nio.git", from: "1.16.0") + .package(url: "https://github.com/vapor/postgres-nio.git", from: "1.20.2") ], targets: [ .executableTarget( diff --git a/test_runner/pg_clients/typescript/postgresql-client/Dockerfile b/test_runner/pg_clients/typescript/postgresql-client/Dockerfile index 07e98c586b..004b383749 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/Dockerfile +++ 
b/test_runner/pg_clients/typescript/postgresql-client/Dockerfile @@ -1,4 +1,4 @@ -FROM node:20 +FROM node:21 WORKDIR /source COPY . . diff --git a/test_runner/pg_clients/typescript/postgresql-client/package-lock.json b/test_runner/pg_clients/typescript/postgresql-client/package-lock.json index 4cedf56acd..b4f8587eac 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/package-lock.json +++ b/test_runner/pg_clients/typescript/postgresql-client/package-lock.json @@ -5,24 +5,24 @@ "packages": { "": { "dependencies": { - "postgresql-client": "2.5.9" + "postgresql-client": "2.10.5" } }, "node_modules/doublylinked": { - "version": "2.5.2", - "resolved": "https://registry.npmjs.org/doublylinked/-/doublylinked-2.5.2.tgz", - "integrity": "sha512-TDh0XfQWWDrfvGdAN0hLNIdkTXlw04nVCO5B/37ie4dV0yw1iT9ZrZ6tD+q/0SwXxeI/u6TF9Mxgd7s5/XYV6A==", + "version": "2.5.4", + "resolved": "https://registry.npmjs.org/doublylinked/-/doublylinked-2.5.4.tgz", + "integrity": "sha512-jBCKDnFkEHJRjQvYEl5N9VngRV8ypHgw6a52OK4VN57eV2r2rYvgOx9uABdY78INNoW7S6auULp+KBVm/jfYqw==", "engines": { "node": ">= 10.0" } }, "node_modules/lightning-pool": { - "version": "4.2.1", - "resolved": "https://registry.npmjs.org/lightning-pool/-/lightning-pool-4.2.1.tgz", - "integrity": "sha512-/pUIoGD3nzTH/wI4TYiJM3cLPeUOzGMTfFeBRuxaOAnwL0LZfwvqn5YFqsfyF98M0C3UXxWgfTz+Lu6okkno+g==", + "version": "4.2.2", + "resolved": "https://registry.npmjs.org/lightning-pool/-/lightning-pool-4.2.2.tgz", + "integrity": "sha512-KW0Df0IbjNLxy5wAsdErTKYtHGwefLRQseHNksEctyaL7gtRwJT0nqLa2uiRdNYDwKSnZtqOjSjUNtfxmfH1qw==", "dependencies": { - "doublylinked": "^2.5.2", - "putil-promisify": "^1.8.6" + "doublylinked": "^2.5.3", + "putil-promisify": "^1.10.1" } }, "node_modules/obuf": { @@ -42,16 +42,16 @@ } }, "node_modules/postgresql-client": { - "version": "2.5.9", - "resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.5.9.tgz", - "integrity": "sha512-s+kgTN6TfWLzehEyxw4Im4odnxVRCbZ0DEJzWS6SLowPAmB2m1/DOiOvZC0+ZVoi5AfbGE6SBqFxKguSyVAXZg==", + "version": "2.10.5", + "resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.10.5.tgz", + "integrity": "sha512-R3EC16pUdbgrzk1J2MQLj7jY2TepWurJHoK90nOeLZj1XTpL/+wL1VCneTmclRVKDuKVjFHr+FASV47KrLpAbw==", "dependencies": { - "doublylinked": "^2.5.2", - "lightning-pool": "^4.2.1", + "doublylinked": "^2.5.4", + "lightning-pool": "^4.2.2", "postgres-bytea": "^3.0.0", - "power-tasks": "^1.7.0", - "putil-merge": "^3.10.3", - "putil-promisify": "^1.10.0", + "power-tasks": "^1.7.3", + "putil-merge": "^3.12.1", + "putil-promisify": "^1.10.1", "putil-varhelpers": "^1.6.5" }, "engines": { @@ -60,30 +60,29 @@ } }, "node_modules/power-tasks": { - "version": "1.7.0", - "resolved": "https://registry.npmjs.org/power-tasks/-/power-tasks-1.7.0.tgz", - "integrity": "sha512-rndZXCDxhuIDjPUJJvQwBDHaYagCkjvbPF/NA+omh/Ef4rAI9KtnvdA0k98dyiGpn1zXOpc6c2c0JWzg/xAhJg==", + "version": "1.7.3", + "resolved": "https://registry.npmjs.org/power-tasks/-/power-tasks-1.7.3.tgz", + "integrity": "sha512-EnkjLfaX4PxFYHbUWyWzlE4I8SgctaW9jx4qQXrVRoELlqBXrxIMtuhHzRwsHv2qs1tO7efOcZa6/wDCdCjRfA==", "dependencies": { - "doublylinked": "^2.5.2", - "strict-typed-events": "^2.3.1" + "doublylinked": "^2.5.4", + "strict-typed-events": "^2.3.3" }, "engines": { - "node": ">=14.0", - "npm": ">=7.0.0" + "node": ">=16.0" } }, "node_modules/putil-merge": { - "version": "3.10.3", - "resolved": "https://registry.npmjs.org/putil-merge/-/putil-merge-3.10.3.tgz", - "integrity": 
"sha512-B18CYi0/SmBYl9+fgowYWkgzJM/8XcLSeafHrFrGzwySQuOzLW0sOGx0CdFVp9zqaxgLctexUdGoSPpm6CPM6A==", + "version": "3.12.1", + "resolved": "https://registry.npmjs.org/putil-merge/-/putil-merge-3.12.1.tgz", + "integrity": "sha512-4clPyRkJPrd5zl98AP7I3JamyXbx0ixe2CnfvGwoTyWSr7Kslcv8weoKjfU4BMBifkWIRL54l4OrNe97pYcDwQ==", "engines": { "node": ">= 10.0" } }, "node_modules/putil-promisify": { - "version": "1.10.0", - "resolved": "https://registry.npmjs.org/putil-promisify/-/putil-promisify-1.10.0.tgz", - "integrity": "sha512-zYPoAoMxmf8pC+I75kRkYkVMwU4ZbZl82aTGema175bmhQ06BEJuuOlzOy1buQK9G+hCyQ+BFpzMTKAJhD8rZw==", + "version": "1.10.1", + "resolved": "https://registry.npmjs.org/putil-promisify/-/putil-promisify-1.10.1.tgz", + "integrity": "sha512-1jm0egJNrj5eBDRj15Cg08RNHDV91OVEHeeYjAFRcs663PXxFokndxcJAGbaO6CSErCTp8eTgC8vuOF+fvXIAA==", "engines": { "node": ">= 14.0" } @@ -97,21 +96,21 @@ } }, "node_modules/strict-typed-events": { - "version": "2.3.1", - "resolved": "https://registry.npmjs.org/strict-typed-events/-/strict-typed-events-2.3.1.tgz", - "integrity": "sha512-Z1h8KpVbrVg34Vwy/VwTD/tS9tFebH2h1Kvw4xnPkKpkISMwUpnqwU44rMfkKMpXbFCybIgDt7ARoCGTzURZhQ==", + "version": "2.3.3", + "resolved": "https://registry.npmjs.org/strict-typed-events/-/strict-typed-events-2.3.3.tgz", + "integrity": "sha512-Vc8/N5giCVpO2n5BCskqDD9ns7RkdEq0pFd4yQk1ROULusJDbjORNvbtyEPxxK7Xqn9/NdW8XHLxv/PvUTgFsA==", "dependencies": { - "putil-promisify": "^1.8.5", - "ts-gems": "^2.2.0" + "putil-promisify": "^1.10.1", + "ts-gems": "^3.1.0" }, "engines": { "node": ">=16.0" } }, "node_modules/ts-gems": { - "version": "2.4.0", - "resolved": "https://registry.npmjs.org/ts-gems/-/ts-gems-2.4.0.tgz", - "integrity": "sha512-SdugYAXoWvbqrxLodIObzxhEKacDxh5LfAJIiIkiH7q5thvuuCzdmkdTVQYf7uEDrEpPhfx4tokDMamdO3be9A==" + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/ts-gems/-/ts-gems-3.1.1.tgz", + "integrity": "sha512-Li1Z44FnxN06c1lBwFepb932jPYT+4eOvOmoiC30lOTkvOJOERr9xZFg3UA9y19OYO9CrW3ZSqNL66DUSuwFTw==" } } } diff --git a/test_runner/pg_clients/typescript/postgresql-client/package.json b/test_runner/pg_clients/typescript/postgresql-client/package.json index 12703ce89f..07ec100d0d 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/package.json +++ b/test_runner/pg_clients/typescript/postgresql-client/package.json @@ -1,6 +1,6 @@ { "type": "module", "dependencies": { - "postgresql-client": "2.5.9" + "postgresql-client": "2.10.5" } } diff --git a/test_runner/pg_clients/typescript/serverless-driver/Dockerfile b/test_runner/pg_clients/typescript/serverless-driver/Dockerfile index 07e98c586b..004b383749 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/Dockerfile +++ b/test_runner/pg_clients/typescript/serverless-driver/Dockerfile @@ -1,4 +1,4 @@ -FROM node:20 +FROM node:21 WORKDIR /source COPY . . 
diff --git a/test_runner/pg_clients/typescript/serverless-driver/package-lock.json b/test_runner/pg_clients/typescript/serverless-driver/package-lock.json index 72cc452817..5a3ad3c238 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/package-lock.json +++ b/test_runner/pg_clients/typescript/serverless-driver/package-lock.json @@ -5,14 +5,14 @@ "packages": { "": { "dependencies": { - "@neondatabase/serverless": "0.4.18", - "ws": "8.13.0" + "@neondatabase/serverless": "0.9.0", + "ws": "8.16.0" } }, "node_modules/@neondatabase/serverless": { - "version": "0.4.18", - "resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.4.18.tgz", - "integrity": "sha512-2TZnIyRGC/+0fjZ8TKCzaSTPUD94PM7NBGuantGZbUrbWyqBwGnUoRtdZAQ95qBKVHqORLVfymlv2NE+HQMFeA==", + "version": "0.9.0", + "resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.9.0.tgz", + "integrity": "sha512-mmJnUAzlzvxNSZuuhI6kgJjH+JgFdBMYUWxihtq/nj0Tjt+Y5UU3W+SvRFoucnd5NObYkuLYQzk+zV5DGFKGJg==", "dependencies": { "@types/pg": "8.6.6" } @@ -96,9 +96,9 @@ } }, "node_modules/ws": { - "version": "8.13.0", - "resolved": "https://registry.npmjs.org/ws/-/ws-8.13.0.tgz", - "integrity": "sha512-x9vcZYTrFPC7aSIbj7sRCYo7L/Xb8Iy+pW0ng0wt2vCJv7M9HOMy0UoN3rr+IFC7hb7vXoqS+P9ktyLLLhO+LA==", + "version": "8.16.0", + "resolved": "https://registry.npmjs.org/ws/-/ws-8.16.0.tgz", + "integrity": "sha512-HS0c//TP7Ina87TfiPUz1rQzMhHrl/SG2guqRcTOIUYD2q8uhUdNHZYJUaQ8aTGPzCh+c6oawMKW35nFl1dxyQ==", "engines": { "node": ">=10.0.0" }, diff --git a/test_runner/pg_clients/typescript/serverless-driver/package.json b/test_runner/pg_clients/typescript/serverless-driver/package.json index 840c7a5c4c..9d9da0f42c 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/package.json +++ b/test_runner/pg_clients/typescript/serverless-driver/package.json @@ -1,7 +1,7 @@ { "type": "module", "dependencies": { - "@neondatabase/serverless": "0.4.18", - "ws": "8.13.0" + "@neondatabase/serverless": "0.9.0", + "ws": "8.16.0" } } From 0b330e1310916221b4f43c1e8c53414a68633189 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 6 Mar 2024 12:20:44 -0500 Subject: [PATCH 0353/1571] upgrade neon extension on startup (#7029) ## Problem Fix https://github.com/neondatabase/neon/issues/7003. Fix https://github.com/neondatabase/neon/issues/6982. Currently, neon extension is only upgraded when new compute spec gets applied, for example, when creating a new role or creating a new database. This also resolves `neon.lfc_stat` not found warnings in prod. ## Summary of changes This pull request adds the logic to spawn a background thread to upgrade the neon extension version if the compute is a primary. If for whatever reason the upgrade fails, it reports an error to the console and does not impact compute node state. This change can be further applied to 3rd-party extension upgrades. We can silently upgrade the version of 3rd party extensions in the background in the future. Questions: * Does alter extension takes some kind of lock that will block user requests? * Does `ALTER EXTENSION` writes to the database if nothing needs to be upgraded? (may impact storage size). Otherwise it's safe to land this pull request. 
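As a reference point for the questions above (and not part of this patch): the upgrade only has work to do when the installed `neon` extension version differs from the default version shipped with the compute image, which can be checked against the standard catalogs. The sketch below assumes the same blocking `postgres` client and `anyhow` error handling already used in `compute_tools`; the helper name is hypothetical.

```rust
// Illustrative sketch only, not part of this patch: the helper name is
// hypothetical. It reports whether the installed `neon` extension lags the
// default version shipped with the compute image, i.e. whether
// `ALTER EXTENSION neon UPDATE` would actually change anything.
use anyhow::Result;
use postgres::{Client, NoTls};

fn neon_extension_needs_upgrade(connstr: &str) -> Result<bool> {
    let mut client = Client::connect(connstr, NoTls)?;
    let row = client.query_one(
        "SELECT e.extversion <> a.default_version \
         FROM pg_extension e \
         JOIN pg_available_extensions a ON a.name = e.extname \
         WHERE e.extname = 'neon'",
        &[],
    )?;
    // `false` means the background ALTER EXTENSION is a no-op.
    Ok(row.get(0))
}
```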
Signed-off-by: Alex Chi Z --- compute_tools/src/compute.rs | 43 +++++++++++++++++++++++++++--------- compute_tools/src/spec.rs | 12 +++++++++- 2 files changed, 44 insertions(+), 11 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index da271e49cd..5613e6c868 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -764,6 +764,26 @@ impl ComputeNode { Ok((pg, logs_handle)) } + /// Do post configuration of the already started Postgres. This function spawns a background thread to + /// configure the database after applying the compute spec. Currently, it upgrades the neon extension + /// version. In the future, it may upgrade all 3rd-party extensions. + #[instrument(skip_all)] + pub fn post_apply_config(&self) -> Result<()> { + let connstr = self.connstr.clone(); + thread::spawn(move || { + let func = || { + let mut client = Client::connect(connstr.as_str(), NoTls)?; + handle_neon_extension_upgrade(&mut client) + .context("handle_neon_extension_upgrade")?; + Ok::<_, anyhow::Error>(()) + }; + if let Err(err) = func() { + error!("error while post_apply_config: {err:#}"); + } + }); + Ok(()) + } + /// Do initial configuration of the already started Postgres. #[instrument(skip_all)] pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> { @@ -998,18 +1018,21 @@ impl ComputeNode { let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?; let config_time = Utc::now(); - if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates { - let pgdata_path = Path::new(&self.pgdata); - // temporarily reset max_cluster_size in config - // to avoid the possibility of hitting the limit, while we are applying config: - // creating new extensions, roles, etc... - config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?; - self.pg_reload_conf()?; + if pspec.spec.mode == ComputeMode::Primary { + if !pspec.spec.skip_pg_catalog_updates { + let pgdata_path = Path::new(&self.pgdata); + // temporarily reset max_cluster_size in config + // to avoid the possibility of hitting the limit, while we are applying config: + // creating new extensions, roles, etc... 
+ config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?; + self.pg_reload_conf()?; - self.apply_config(&compute_state)?; + self.apply_config(&compute_state)?; - config::compute_ctl_temp_override_remove(pgdata_path)?; - self.pg_reload_conf()?; + config::compute_ctl_temp_override_remove(pgdata_path)?; + self.pg_reload_conf()?; + } + self.post_apply_config()?; } let startup_end_time = Utc::now(); diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index d5fd2c9462..84a5a263af 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -744,7 +744,17 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> { // - extension was just installed // - extension was already installed and is up to date let query = "ALTER EXTENSION neon UPDATE"; - info!("update neon extension schema with query: {}", query); + info!("update neon extension version with query: {}", query); + client.simple_query(query)?; + + Ok(()) +} + +#[instrument(skip_all)] +pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> { + info!("handle neon extension upgrade"); + let query = "ALTER EXTENSION neon UPDATE"; + info!("update neon extension version with query: {}", query); client.simple_query(query)?; Ok(()) From c2876ec55d985d2820467bd0e248500a29be649c Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 7 Mar 2024 12:36:47 +0000 Subject: [PATCH 0354/1571] proxy http tls investigations (#7045) ## Problem Some HTTP-specific TLS errors ## Summary of changes Add more logging, vendor `tls-listener` with minor modifications. --- Cargo.lock | 15 -- Cargo.toml | 1 - proxy/Cargo.toml | 1 - proxy/src/metrics.rs | 10 +- proxy/src/protocol2.rs | 78 +++++++- proxy/src/proxy.rs | 14 +- proxy/src/serverless.rs | 50 +++-- proxy/src/serverless/tls_listener.rs | 283 +++++++++++++++++++++++++++ proxy/src/serverless/websocket.rs | 6 + proxy/src/stream.rs | 6 +- 10 files changed, 418 insertions(+), 46 deletions(-) create mode 100644 proxy/src/serverless/tls_listener.rs diff --git a/Cargo.lock b/Cargo.lock index 864e5c9046..167a2b2179 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4216,7 +4216,6 @@ dependencies = [ "thiserror", "tikv-jemalloc-ctl", "tikv-jemallocator", - "tls-listener", "tokio", "tokio-postgres", "tokio-postgres-rustls", @@ -5794,20 +5793,6 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" -[[package]] -name = "tls-listener" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81294c017957a1a69794f506723519255879e15a870507faf45dfed288b763dd" -dependencies = [ - "futures-util", - "hyper", - "pin-project-lite", - "thiserror", - "tokio", - "tokio-rustls", -] - [[package]] name = "tokio" version = "1.36.0" diff --git a/Cargo.toml b/Cargo.toml index 90b02b30ec..42deaac19b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -156,7 +156,6 @@ test-context = "0.1" thiserror = "1.0" tikv-jemallocator = "0.5" tikv-jemalloc-ctl = "0.5" -tls-listener = { version = "0.7", features = ["rustls", "hyper-h1"] } tokio = { version = "1.17", features = ["macros"] } tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" } tokio-io-timeout = "1.2.0" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 0777d361d2..d8112c8bf0 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -68,7 +68,6 @@ task-local-extensions.workspace = true thiserror.workspace = true 
tikv-jemallocator.workspace = true tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] } -tls-listener.workspace = true tokio-postgres.workspace = true tokio-rustls.workspace = true tokio-util.workspace = true diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 2464b1e611..0477176c45 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -4,7 +4,7 @@ use ::metrics::{ register_int_gauge_vec, Histogram, HistogramVec, HyperLogLogVec, IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, }; -use metrics::{register_int_counter_pair, IntCounterPair}; +use metrics::{register_int_counter, register_int_counter_pair, IntCounter, IntCounterPair}; use once_cell::sync::Lazy; use tokio::time; @@ -312,3 +312,11 @@ pub static REDIS_BROKEN_MESSAGES: Lazy = Lazy::new(|| { ) .unwrap() }); + +pub static TLS_HANDSHAKE_FAILURES: Lazy = Lazy::new(|| { + register_int_counter!( + "proxy_tls_handshake_failures", + "Number of TLS handshake failures", + ) + .unwrap() +}); diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index 1d8931be85..3a7aabca32 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -1,22 +1,27 @@ //! Proxy Protocol V2 implementation use std::{ - future::poll_fn, - future::Future, + future::{poll_fn, Future}, io, net::SocketAddr, pin::{pin, Pin}, + sync::Mutex, task::{ready, Context, Poll}, }; use bytes::{Buf, BytesMut}; +use hyper::server::accept::Accept; use hyper::server::conn::{AddrIncoming, AddrStream}; +use metrics::IntCounterPairGuard; use pin_project_lite::pin_project; -use tls_listener::AsyncAccept; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf}; +use uuid::Uuid; + +use crate::{metrics::NUM_CLIENT_CONNECTION_GAUGE, serverless::tls_listener::AsyncAccept}; pub struct ProxyProtocolAccept { pub incoming: AddrIncoming, + pub protocol: &'static str, } pin_project! { @@ -327,7 +332,7 @@ impl AsyncRead for WithClientIp { } impl AsyncAccept for ProxyProtocolAccept { - type Connection = WithClientIp; + type Connection = WithConnectionGuard>; type Error = io::Error; @@ -336,11 +341,74 @@ impl AsyncAccept for ProxyProtocolAccept { cx: &mut Context<'_>, ) -> Poll>> { let conn = ready!(Pin::new(&mut self.incoming).poll_accept(cx)?); + tracing::info!(protocol = self.protocol, "accepted new TCP connection"); let Some(conn) = conn else { return Poll::Ready(None); }; - Poll::Ready(Some(Ok(WithClientIp::new(conn)))) + Poll::Ready(Some(Ok(WithConnectionGuard { + inner: WithClientIp::new(conn), + connection_id: Uuid::new_v4(), + gauge: Mutex::new(Some( + NUM_CLIENT_CONNECTION_GAUGE + .with_label_values(&[self.protocol]) + .guard(), + )), + }))) + } +} + +pin_project! 
{ + pub struct WithConnectionGuard { + #[pin] + pub inner: T, + pub connection_id: Uuid, + pub gauge: Mutex>, + } +} + +impl AsyncWrite for WithConnectionGuard { + #[inline] + fn poll_write( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &[u8], + ) -> Poll> { + self.project().inner.poll_write(cx, buf) + } + + #[inline] + fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + self.project().inner.poll_flush(cx) + } + + #[inline] + fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + self.project().inner.poll_shutdown(cx) + } + + #[inline] + fn poll_write_vectored( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + bufs: &[io::IoSlice<'_>], + ) -> Poll> { + self.project().inner.poll_write_vectored(cx, bufs) + } + + #[inline] + fn is_write_vectored(&self) -> bool { + self.inner.is_write_vectored() + } +} + +impl AsyncRead for WithConnectionGuard { + fn poll_read( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &mut ReadBuf<'_>, + ) -> Poll> { + self.project().inner.poll_read(cx, buf) } } diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index d94fc67491..aeba08bc4f 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -24,6 +24,7 @@ use crate::{ }; use futures::TryFutureExt; use itertools::Itertools; +use metrics::IntCounterPairGuard; use once_cell::sync::OnceCell; use pq_proto::{BeMessage as Be, StartupMessageParams}; use regex::Regex; @@ -78,10 +79,16 @@ pub async fn task_main( { let (socket, peer_addr) = accept_result?; + let conn_gauge = NUM_CLIENT_CONNECTION_GAUGE + .with_label_values(&["tcp"]) + .guard(); + let session_id = uuid::Uuid::new_v4(); let cancellation_handler = Arc::clone(&cancellation_handler); let endpoint_rate_limiter = endpoint_rate_limiter.clone(); + tracing::info!(protocol = "tcp", %session_id, "accepted new TCP connection"); + connections.spawn(async move { let mut socket = WithClientIp::new(socket); let mut peer_addr = peer_addr.ip(); @@ -116,6 +123,7 @@ pub async fn task_main( socket, ClientMode::Tcp, endpoint_rate_limiter, + conn_gauge, ) .instrument(span.clone()) .await; @@ -229,13 +237,11 @@ pub async fn handle_client( stream: S, mode: ClientMode, endpoint_rate_limiter: Arc, + conn_gauge: IntCounterPairGuard, ) -> Result>, ClientRequestError> { info!("handling interactive connection from client"); let proto = ctx.protocol; - let _client_gauge = NUM_CLIENT_CONNECTION_GAUGE - .with_label_values(&[proto]) - .guard(); let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE .with_label_values(&[proto]) .guard(); @@ -325,7 +331,7 @@ pub async fn handle_client( aux: node.aux.clone(), compute: node, req: _request_gauge, - conn: _client_gauge, + conn: conn_gauge, cancel: session, })) } diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index b5806aec53..c81ae03b23 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -6,6 +6,7 @@ mod backend; mod conn_pool; mod json; mod sql_over_http; +pub mod tls_listener; mod websocket; pub use conn_pool::GlobalConnPoolOptions; @@ -20,8 +21,8 @@ pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; use tokio_util::task::TaskTracker; use crate::context::RequestMonitoring; -use crate::metrics::NUM_CLIENT_CONNECTION_GAUGE; -use crate::protocol2::{ProxyProtocolAccept, WithClientIp}; +use crate::metrics::TLS_HANDSHAKE_FAILURES; +use crate::protocol2::{ProxyProtocolAccept, WithClientIp, WithConnectionGuard}; use crate::rate_limiter::EndpointRateLimiter; use crate::serverless::backend::PoolingBackend; use 
crate::{cancellation::CancellationHandler, config::ProxyConfig}; @@ -98,6 +99,7 @@ pub async fn task_main( let _ = addr_incoming.set_nodelay(true); let addr_incoming = ProxyProtocolAccept { incoming: addr_incoming, + protocol: "http", }; let ws_connections = tokio_util::task::task_tracker::TaskTracker::new(); @@ -105,18 +107,34 @@ pub async fn task_main( let tls_listener = TlsListener::new(tls_acceptor, addr_incoming).filter(|conn| { if let Err(err) = conn { - error!("failed to accept TLS connection for websockets: {err:?}"); + error!( + protocol = "http", + "failed to accept TLS connection: {err:?}" + ); + TLS_HANDSHAKE_FAILURES.inc(); ready(false) } else { + info!(protocol = "http", "accepted new TLS connection"); ready(true) } }); let make_svc = hyper::service::make_service_fn( - |stream: &tokio_rustls::server::TlsStream>| { - let (io, _) = stream.get_ref(); - let client_addr = io.client_addr(); - let remote_addr = io.inner.remote_addr(); + |stream: &tokio_rustls::server::TlsStream< + WithConnectionGuard>, + >| { + let (conn, _) = stream.get_ref(); + + // this is jank. should dissapear with hyper 1.0 migration. + let gauge = conn + .gauge + .lock() + .expect("lock should not be poisoned") + .take() + .expect("gauge should be set on connection start"); + + let client_addr = conn.inner.client_addr(); + let remote_addr = conn.inner.inner.remote_addr(); let backend = backend.clone(); let ws_connections = ws_connections.clone(); let endpoint_rate_limiter = endpoint_rate_limiter.clone(); @@ -127,8 +145,8 @@ pub async fn task_main( None if config.require_client_ip => bail!("missing required client ip"), None => remote_addr, }; - Ok(MetricService::new(hyper::service::service_fn( - move |req: Request| { + Ok(MetricService::new( + hyper::service::service_fn(move |req: Request| { let backend = backend.clone(); let ws_connections = ws_connections.clone(); let endpoint_rate_limiter = endpoint_rate_limiter.clone(); @@ -149,8 +167,9 @@ pub async fn task_main( .map_or_else(|e| e.into_response(), |r| r), ) } - }, - ))) + }), + gauge, + )) } }, ); @@ -172,13 +191,8 @@ struct MetricService { } impl MetricService { - fn new(inner: S) -> MetricService { - MetricService { - inner, - _gauge: NUM_CLIENT_CONNECTION_GAUGE - .with_label_values(&["http"]) - .guard(), - } + fn new(inner: S, _gauge: IntCounterPairGuard) -> MetricService { + MetricService { inner, _gauge } } } diff --git a/proxy/src/serverless/tls_listener.rs b/proxy/src/serverless/tls_listener.rs new file mode 100644 index 0000000000..6196ff393c --- /dev/null +++ b/proxy/src/serverless/tls_listener.rs @@ -0,0 +1,283 @@ +use std::{ + pin::Pin, + task::{Context, Poll}, + time::Duration, +}; + +use futures::{Future, Stream, StreamExt}; +use pin_project_lite::pin_project; +use thiserror::Error; +use tokio::{ + io::{AsyncRead, AsyncWrite}, + task::JoinSet, + time::timeout, +}; + +/// Default timeout for the TLS handshake. +pub const DEFAULT_HANDSHAKE_TIMEOUT: Duration = Duration::from_secs(10); + +/// Trait for TLS implementation. +/// +/// Implementations are provided by the rustls and native-tls features. +pub trait AsyncTls: Clone { + /// The type of the TLS stream created from the underlying stream. + type Stream: Send + 'static; + /// Error type for completing the TLS handshake + type Error: std::error::Error + Send + 'static; + /// Type of the Future for the TLS stream that is accepted. 
+ type AcceptFuture: Future> + Send + 'static; + + /// Accept a TLS connection on an underlying stream + fn accept(&self, stream: C) -> Self::AcceptFuture; +} + +/// Asynchronously accept connections. +pub trait AsyncAccept { + /// The type of the connection that is accepted. + type Connection: AsyncRead + AsyncWrite; + /// The type of error that may be returned. + type Error; + + /// Poll to accept the next connection. + fn poll_accept( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll>>; + + /// Return a new `AsyncAccept` that stops accepting connections after + /// `ender` completes. + /// + /// Useful for graceful shutdown. + /// + /// See [examples/echo.rs](https://github.com/tmccombs/tls-listener/blob/main/examples/echo.rs) + /// for example of how to use. + fn until(self, ender: F) -> Until + where + Self: Sized, + { + Until { + acceptor: self, + ender, + } + } +} + +pin_project! { + /// + /// Wraps a `Stream` of connections (such as a TCP listener) so that each connection is itself + /// encrypted using TLS. + /// + /// It is similar to: + /// + /// ```ignore + /// tcpListener.and_then(|s| tlsAcceptor.accept(s)) + /// ``` + /// + /// except that it has the ability to accept multiple transport-level connections + /// simultaneously while the TLS handshake is pending for other connections. + /// + /// By default, if a client fails the TLS handshake, that is treated as an error, and the + /// `TlsListener` will return an `Err`. If the `TlsListener` is passed directly to a hyper + /// [`Server`][1], then an invalid handshake can cause the server to stop accepting connections. + /// See [`http-stream.rs`][2] or [`http-low-level`][3] examples, for examples of how to avoid this. + /// + /// Note that if the maximum number of pending connections is greater than 1, the resulting + /// [`T::Stream`][4] connections may come in a different order than the connections produced by the + /// underlying listener. + /// + /// [1]: https://docs.rs/hyper/latest/hyper/server/struct.Server.html + /// [2]: https://github.com/tmccombs/tls-listener/blob/main/examples/http-stream.rs + /// [3]: https://github.com/tmccombs/tls-listener/blob/main/examples/http-low-level.rs + /// [4]: AsyncTls::Stream + /// + #[allow(clippy::type_complexity)] + pub struct TlsListener> { + #[pin] + listener: A, + tls: T, + waiting: JoinSet, tokio::time::error::Elapsed>>, + timeout: Duration, + } +} + +/// Builder for `TlsListener`. +#[derive(Clone)] +pub struct Builder { + tls: T, + handshake_timeout: Duration, +} + +/// Wraps errors from either the listener or the TLS Acceptor +#[derive(Debug, Error)] +pub enum Error { + /// An error that arose from the listener ([AsyncAccept::Error]) + #[error("{0}")] + ListenerError(#[source] LE), + /// An error that occurred during the TLS accept handshake + #[error("{0}")] + TlsAcceptError(#[source] TE), +} + +impl TlsListener +where + T: AsyncTls, +{ + /// Create a `TlsListener` with default options. + pub fn new(tls: T, listener: A) -> Self { + builder(tls).listen(listener) + } +} + +impl TlsListener +where + A: AsyncAccept, + A::Error: std::error::Error, + T: AsyncTls, +{ + /// Accept the next connection + /// + /// This is essentially an alias to `self.next()` with a more domain-appropriate name. + pub async fn accept(&mut self) -> Option<::Item> + where + Self: Unpin, + { + self.next().await + } + + /// Replaces the Tls Acceptor configuration, which will be used for new connections. + /// + /// This can be used to change the certificate used at runtime. 
+ pub fn replace_acceptor(&mut self, acceptor: T) { + self.tls = acceptor; + } + + /// Replaces the Tls Acceptor configuration from a pinned reference to `Self`. + /// + /// This is useful if your listener is `!Unpin`. + /// + /// This can be used to change the certificate used at runtime. + pub fn replace_acceptor_pin(self: Pin<&mut Self>, acceptor: T) { + *self.project().tls = acceptor; + } +} + +impl Stream for TlsListener +where + A: AsyncAccept, + A::Error: std::error::Error, + T: AsyncTls, +{ + type Item = Result>; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let mut this = self.project(); + + loop { + match this.listener.as_mut().poll_accept(cx) { + Poll::Pending => break, + Poll::Ready(Some(Ok(conn))) => { + this.waiting + .spawn(timeout(*this.timeout, this.tls.accept(conn))); + } + Poll::Ready(Some(Err(e))) => { + return Poll::Ready(Some(Err(Error::ListenerError(e)))); + } + Poll::Ready(None) => return Poll::Ready(None), + } + } + + loop { + return match this.waiting.poll_join_next(cx) { + Poll::Ready(Some(Ok(Ok(conn)))) => { + Poll::Ready(Some(conn.map_err(Error::TlsAcceptError))) + } + // The handshake timed out, try getting another connection from the queue + Poll::Ready(Some(Ok(Err(_)))) => continue, + // The handshake panicked + Poll::Ready(Some(Err(e))) if e.is_panic() => { + std::panic::resume_unwind(e.into_panic()) + } + // The handshake was externally aborted + Poll::Ready(Some(Err(_))) => unreachable!("handshake tasks are never aborted"), + _ => Poll::Pending, + }; + } + } +} + +impl AsyncTls for tokio_rustls::TlsAcceptor { + type Stream = tokio_rustls::server::TlsStream; + type Error = std::io::Error; + type AcceptFuture = tokio_rustls::Accept; + + fn accept(&self, conn: C) -> Self::AcceptFuture { + tokio_rustls::TlsAcceptor::accept(self, conn) + } +} + +impl Builder { + /// Set the timeout for handshakes. + /// + /// If a timeout takes longer than `timeout`, then the handshake will be + /// aborted and the underlying connection will be dropped. + /// + /// Defaults to `DEFAULT_HANDSHAKE_TIMEOUT`. + pub fn handshake_timeout(&mut self, timeout: Duration) -> &mut Self { + self.handshake_timeout = timeout; + self + } + + /// Create a `TlsListener` from the builder + /// + /// Actually build the `TlsListener`. The `listener` argument should be + /// an implementation of the `AsyncAccept` trait that accepts new connections + /// that the `TlsListener` will encrypt using TLS. + pub fn listen(&self, listener: A) -> TlsListener + where + T: AsyncTls, + { + TlsListener { + listener, + tls: self.tls.clone(), + waiting: JoinSet::new(), + timeout: self.handshake_timeout, + } + } +} + +/// Create a new Builder for a TlsListener +/// +/// `server_config` will be used to configure the TLS sessions. +pub fn builder(tls: T) -> Builder { + Builder { + tls, + handshake_timeout: DEFAULT_HANDSHAKE_TIMEOUT, + } +} + +pin_project! 
{ + /// See [`AsyncAccept::until`] + pub struct Until { + #[pin] + acceptor: A, + #[pin] + ender: E, + } +} + +impl AsyncAccept for Until { + type Connection = A::Connection; + type Error = A::Error; + + fn poll_accept( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll>> { + let this = self.project(); + + match this.ender.poll(cx) { + Poll::Pending => this.acceptor.poll_accept(cx), + Poll::Ready(_) => Poll::Ready(None), + } + } +} diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index 24f2bb7e8c..a72ede6d0a 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -3,6 +3,7 @@ use crate::{ config::ProxyConfig, context::RequestMonitoring, error::{io_error, ReportableError}, + metrics::NUM_CLIENT_CONNECTION_GAUGE, proxy::{handle_client, ClientMode}, rate_limiter::EndpointRateLimiter, }; @@ -138,6 +139,10 @@ pub async fn serve_websocket( endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { let websocket = websocket.await?; + let conn_gauge = NUM_CLIENT_CONNECTION_GAUGE + .with_label_values(&["ws"]) + .guard(); + let res = handle_client( config, &mut ctx, @@ -145,6 +150,7 @@ pub async fn serve_websocket( WebSocketRw::new(websocket), ClientMode::Websockets { hostname }, endpoint_rate_limiter, + conn_gauge, ) .await; diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index 0d639d2c07..b6b7a85659 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -1,5 +1,6 @@ use crate::config::TlsServerEndPoint; use crate::error::{ErrorKind, ReportableError, UserFacingError}; +use crate::metrics::TLS_HANDSHAKE_FAILURES; use bytes::BytesMut; use pq_proto::framed::{ConnectionError, Framed}; @@ -224,7 +225,10 @@ impl Stream { /// If possible, upgrade raw stream into a secure TLS-based stream. pub async fn upgrade(self, cfg: Arc) -> Result, StreamUpgradeError> { match self { - Stream::Raw { raw } => Ok(tokio_rustls::TlsAcceptor::from(cfg).accept(raw).await?), + Stream::Raw { raw } => Ok(tokio_rustls::TlsAcceptor::from(cfg) + .accept(raw) + .await + .inspect_err(|_| TLS_HANDSHAKE_FAILURES.inc())?), Stream::Tls { .. } => Err(StreamUpgradeError::AlreadyTls), } } From d03ec9d9983554ebf5d0a2ee182536b6c267ff98 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 7 Mar 2024 12:37:52 +0000 Subject: [PATCH 0355/1571] pageserver: don't validate vectored get on shut-down (#7039) ## Problem We attempted validation for cancelled errors under the assumption that if vectored get fails, sequential get will too. That's not right 100% of times though because sequential get may have the values cached and slip them through even when shutting down. ## Summary of changes Don't validate if either search impl failed due to tenant shutdown. 
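In simplified form, the validation now treats a cancelled result on either side as "no signal" rather than a mismatch, because shutdown can interrupt one implementation while the other still answers from cache. Below is a minimal standalone sketch of that pattern, using stand-in types rather than the real pageserver ones; the actual change is in the diff that follows.

```rust
// Stand-in error type for illustration; the real code uses GetVectoredError
// and the pageserver's reconstruct-data results.
#[derive(Debug)]
enum GetError {
    Cancelled,
    Other,
}

fn validate<T: PartialEq + std::fmt::Debug>(
    sequential: &Result<T, GetError>,
    vectored: &Result<T, GetError>,
) {
    match (sequential, vectored) {
        // Shutdown may interrupt either implementation independently while the
        // other still succeeds (e.g. from cached values), so a cancelled
        // result is not comparable and is skipped.
        (Err(GetError::Cancelled), _) | (_, Err(GetError::Cancelled)) => {}
        (Ok(lhs), Ok(rhs)) => assert_eq!(lhs, rhs),
        (err @ Err(_), Ok(_)) | (Ok(_), err @ Err(_)) => {
            panic!("one implementation failed while the other succeeded: {err:?}")
        }
        (Err(_), Err(_)) => {}
    }
}

fn main() {
    // A cancelled vectored result must not fail validation...
    validate::<u32>(&Ok(42), &Err(GetError::Cancelled));
    // ...while two successful results must still agree.
    validate(&Ok(7), &Ok(7));
}
```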
--- pageserver/src/tenant/timeline.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 37acebb10a..7ac7c15876 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -863,8 +863,6 @@ impl Timeline { fn errors_match(lhs: &GetVectoredError, rhs: &GetVectoredError) -> bool { use GetVectoredError::*; match (lhs, rhs) { - (Cancelled, Cancelled) => true, - (_, Cancelled) => true, (Oversized(l), Oversized(r)) => l == r, (InvalidLsn(l), InvalidLsn(r)) => l == r, (MissingKey(l), MissingKey(r)) => l == r, @@ -875,6 +873,8 @@ impl Timeline { } match (&sequential_res, vectored_res) { + (Err(GetVectoredError::Cancelled), _) => {}, + (_, Err(GetVectoredError::Cancelled)) => {}, (Err(seq_err), Ok(_)) => { panic!(concat!("Sequential get failed with {}, but vectored get did not", " - keyspace={:?} lsn={}"), From d3c583efbe2a5f736ae43da4de84479ec4ee81b4 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 7 Mar 2024 14:06:48 +0000 Subject: [PATCH 0356/1571] Rename binary attachment_service -> storage_controller (#7042) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem The storage controller binary still has its historic `attachment_service` name -- it will be painful to change this later because we can't atomically update this repo and the helm charts used to deploy. Companion helm chart change: https://github.com/neondatabase/helm-charts/pull/70 ## Summary of changes - Change the name of the binary to `storage_controller` - Skipping renaming things in the source right now: this is just to get rid of the legacy name in external interfaces. --------- Co-authored-by: Arpad Müller --- Dockerfile | 4 ++-- control_plane/attachment_service/Cargo.toml | 4 ++++ control_plane/src/attachment_service.rs | 2 +- control_plane/src/local_env.rs | 2 +- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 47954a671b..5f82df3e18 100644 --- a/Dockerfile +++ b/Dockerfile @@ -53,7 +53,7 @@ RUN set -e \ --bin pagectl \ --bin safekeeper \ --bin storage_broker \ - --bin attachment_service \ + --bin storage_controller \ --bin proxy \ --bin neon_local \ --locked --release \ @@ -81,7 +81,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin -COPY --from=build --chown=neon:neon /home/nonroot/target/release/attachment_service /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_controller /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin diff --git a/control_plane/attachment_service/Cargo.toml b/control_plane/attachment_service/Cargo.toml index bfdfd4c77d..a5fad7216c 100644 --- a/control_plane/attachment_service/Cargo.toml +++ b/control_plane/attachment_service/Cargo.toml @@ -4,6 +4,10 @@ version = "0.1.0" edition.workspace = true license.workspace = true +[[bin]] +name = "storage_controller" +path = "src/main.rs" + [features] default = [] # Enables test-only APIs and behaviors diff --git a/control_plane/src/attachment_service.rs 
b/control_plane/src/attachment_service.rs index 610d7386d9..5c97561985 100644 --- a/control_plane/src/attachment_service.rs +++ b/control_plane/src/attachment_service.rs @@ -34,7 +34,7 @@ pub struct AttachmentService { client: reqwest::Client, } -const COMMAND: &str = "attachment_service"; +const COMMAND: &str = "storage_controller"; const ATTACHMENT_SERVICE_POSTGRES_VERSION: u32 = 16; diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index a5e1325cfe..03270723a6 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -232,7 +232,7 @@ impl LocalEnv { // run from the same location as neon_local. This means that for compatibility // tests that run old pageserver/safekeeper, they still run latest attachment service. let neon_local_bin_dir = env::current_exe().unwrap().parent().unwrap().to_owned(); - neon_local_bin_dir.join("attachment_service") + neon_local_bin_dir.join("storage_controller") } pub fn safekeeper_bin(&self) -> PathBuf { From 602a4da9a5cdfac7f04509950704da811f08b968 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 7 Mar 2024 16:23:42 +0200 Subject: [PATCH 0357/1571] bench: run branch_creation_many at 500, seeded (#6959) We have a benchmark for creating a lot of branches, but it does random things, and the branch count is not what we is the largest maximum we aim to support. If this PR would stabilize the benchmark total duration it means that there are some structures which are very much slower than others. Then we should add a seed-outputting variant to help find and reproduce such cases. Additionally, record for the benchmark: - shutdown duration - startup metrics once done (on restart) - duration of first compaction completion via debug logging --- pageserver/src/tenant/tasks.rs | 7 +- .../performance/test_branch_creation.py | 110 ++++++++++++++++-- 2 files changed, 109 insertions(+), 8 deletions(-) diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 57c3edcddd..e4f5f75132 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -101,6 +101,7 @@ pub fn start_background_loops( _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {} }; compaction_loop(tenant, cancel) + // If you rename this span, change the RUST_LOG env variable in test_runner/performance/test_branch_creation.py .instrument(info_span!("compaction_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug())) .await; Ok(()) @@ -198,7 +199,11 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { } }; - warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Compaction); + let elapsed = started_at.elapsed(); + warn_when_period_overrun(elapsed, period, BackgroundLoopKind::Compaction); + + // the duration is recorded by performance tests by enabling debug in this function + tracing::debug!(elapsed_ms=elapsed.as_millis(), "compaction iteration complete"); // Perhaps we did no work and the walredo process has been idle for some time: // give it a chance to shut down to avoid leaving walredo process running indefinitely. 
diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index 6edcb8f1f2..9777bf6748 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -1,4 +1,5 @@ import random +import re import statistics import threading import time @@ -7,11 +8,14 @@ from contextlib import closing from typing import List import pytest -from fixtures.benchmark_fixture import MetricReport +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker from fixtures.compare_fixtures import NeonCompare from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonPageserver from fixtures.pageserver.utils import wait_for_last_record_lsn from fixtures.types import Lsn +from fixtures.utils import wait_until +from prometheus_client.samples import Sample def _record_branch_creation_durations(neon_compare: NeonCompare, durs: List[float]): @@ -89,11 +93,17 @@ def test_branch_creation_heavy_write(neon_compare: NeonCompare, n_branches: int) _record_branch_creation_durations(neon_compare, branch_creation_durations) -@pytest.mark.parametrize("n_branches", [1024]) -# Test measures the latency of branch creation when creating a lot of branches. -def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int): +@pytest.mark.parametrize("n_branches", [500, 1024]) +@pytest.mark.parametrize("shape", ["one_ancestor", "random"]) +def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: str): + """ + Test measures the latency of branch creation when creating a lot of branches. + """ env = neon_compare.env + # seed the prng so we will measure the same structure every time + rng = random.Random("2024-02-29") + env.neon_cli.create_branch("b0") endpoint = env.endpoints.create_start("b0") @@ -102,15 +112,101 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int): branch_creation_durations = [] for i in range(n_branches): - # random a source branch - p = random.randint(0, i) + if shape == "random": + parent = f"b{rng.randint(0, i)}" + elif shape == "one_ancestor": + parent = "b0" + else: + raise RuntimeError(f"unimplemented shape: {shape}") + timer = timeit.default_timer() - env.neon_cli.create_branch("b{}".format(i + 1), "b{}".format(p)) + # each of these uploads to remote storage before completion + env.neon_cli.create_branch(f"b{i + 1}", parent) dur = timeit.default_timer() - timer branch_creation_durations.append(dur) _record_branch_creation_durations(neon_compare, branch_creation_durations) + endpoint.stop_and_destroy() + + with neon_compare.record_duration("shutdown"): + # this sleeps 100ms between polls + env.pageserver.stop() + + startup_line = "INFO version: git(-env)?:" + + # find the first line of the log file so we can find the next start later + _, first_start = wait_until(5, 1, lambda: env.pageserver.assert_log_contains(startup_line)) + + # start without gc so we can time compaction with less noise; use shorter + # period for compaction so it starts earlier + env.pageserver.start( + overrides=( + "--pageserver-config-override=tenant_config={ compaction_period = '3s', gc_period = '0s' }", + ), + # this does print more than we want, but the number should be comparable between runs + extra_env_vars={ + "RUST_LOG": f"[compaction_loop{{tenant_id={env.initial_tenant}}}]=debug,info" + }, + ) + + _, second_start = wait_until( + 5, 1, lambda: env.pageserver.assert_log_contains(startup_line, first_start) + ) + env.pageserver.quiesce_tenants() + + 
wait_and_record_startup_metrics(env.pageserver, neon_compare.zenbenchmark, "restart_after") + + # wait for compaction to complete, which most likely has already done so multiple times + msg, _ = wait_until( + 30, + 1, + lambda: env.pageserver.assert_log_contains( + f".*tenant_id={env.initial_tenant}.*: compaction iteration complete.*", second_start + ), + ) + needle = re.search(" elapsed_ms=([0-9]+)", msg) + assert needle is not None, "failed to find the elapsed time" + duration = int(needle.group(1)) / 1000.0 + neon_compare.zenbenchmark.record("compaction", duration, "s", MetricReport.LOWER_IS_BETTER) + + +def wait_and_record_startup_metrics( + pageserver: NeonPageserver, target: NeonBenchmarker, prefix: str +): + """ + Waits until all startup metrics have non-zero values on the pageserver, then records them on the target + """ + + client = pageserver.http_client() + + expected_labels = set( + [ + "background_jobs_can_start", + "complete", + "initial", + "initial_tenant_load", + "initial_tenant_load_remote", + ] + ) + + def metrics_are_filled() -> List[Sample]: + m = client.get_metrics() + samples = m.query_all("pageserver_startup_duration_seconds") + # we should not have duplicate labels + matching = [ + x for x in samples if x.labels.get("phase") in expected_labels and x.value > 0.0 + ] + assert len(matching) == len(expected_labels) + return matching + + samples = wait_until(10, 1, metrics_are_filled) + + for sample in samples: + phase = sample.labels["phase"] + name = f"{prefix}.{phase}" + target.record(name, sample.value, "s", MetricReport.LOWER_IS_BETTER) + # Test measures the branch creation time when branching from a timeline with a lot of relations. # From 871977f14c2ca93f736a82c07da93a3c142d0ab0 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 7 Mar 2024 16:02:20 +0000 Subject: [PATCH 0358/1571] pageserver: fix early bail out in vectored get (#7038) ## Problem When vectored get encountered a portion of the key range that could not be mapped to any layer in the current timeline it would incorrectly bail out of the current timeline. This is incorrect since we may have had layers queued for a visit in the fringe. ## Summary of changes * Add a repro unit test * Remove the early bail out path * Simplify range search return value --- pageserver/src/tenant.rs | 165 +++++++++++++++++++++++++++-- pageserver/src/tenant/layer_map.rs | 24 +++-- pageserver/src/tenant/timeline.rs | 9 +- 3 files changed, 176 insertions(+), 22 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index b24c06c4da..2f23e535fa 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3679,7 +3679,10 @@ pub(crate) mod harness { } impl TenantHarness { - pub fn create(test_name: &'static str) -> anyhow::Result { + pub fn create_custom( + test_name: &'static str, + tenant_conf: TenantConf, + ) -> anyhow::Result { setup_logging(); let repo_dir = PageServerConf::test_repo_dir(test_name); @@ -3691,14 +3694,6 @@ pub(crate) mod harness { // OK in a test. let conf: &'static PageServerConf = Box::leak(Box::new(conf)); - // Disable automatic GC and compaction to make the unit tests more deterministic. - // The tests perform them manually if needed. 
- let tenant_conf = TenantConf { - gc_period: Duration::ZERO, - compaction_period: Duration::ZERO, - ..TenantConf::default() - }; - let tenant_id = TenantId::generate(); let tenant_shard_id = TenantShardId::unsharded(tenant_id); fs::create_dir_all(conf.tenant_path(&tenant_shard_id))?; @@ -3726,6 +3721,18 @@ pub(crate) mod harness { }) } + pub fn create(test_name: &'static str) -> anyhow::Result { + // Disable automatic GC and compaction to make the unit tests more deterministic. + // The tests perform them manually if needed. + let tenant_conf = TenantConf { + gc_period: Duration::ZERO, + compaction_period: Duration::ZERO, + ..TenantConf::default() + }; + + Self::create_custom(test_name, tenant_conf) + } + pub fn span(&self) -> tracing::Span { info_span!("TenantHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()) } @@ -3833,6 +3840,7 @@ mod tests { use crate::keyspace::KeySpaceAccum; use crate::repository::{Key, Value}; use crate::tenant::harness::*; + use crate::tenant::timeline::CompactFlags; use crate::DEFAULT_PG_VERSION; use bytes::BytesMut; use hex_literal::hex; @@ -4637,6 +4645,145 @@ mod tests { Ok(()) } + // Test that vectored get handles layer gaps correctly + // by advancing into the next ancestor timeline if required. + // + // The test generates timelines that look like the diagram below. + // We leave a gap in one of the L1 layers at `gap_at_key` (`/` in the diagram). + // The reconstruct data for that key lies in the ancestor timeline (`X` in the diagram). + // + // ``` + //-------------------------------+ + // ... | + // [ L1 ] | + // [ / L1 ] | Child Timeline + // ... | + // ------------------------------+ + // [ X L1 ] | Parent Timeline + // ------------------------------+ + // ``` + #[tokio::test] + async fn test_get_vectored_key_gap() -> anyhow::Result<()> { + let tenant_conf = TenantConf { + // Make compaction deterministic + gc_period: Duration::ZERO, + compaction_period: Duration::ZERO, + // Encourage creation of L1 layers + checkpoint_distance: 16 * 1024, + compaction_target_size: 8 * 1024, + ..TenantConf::default() + }; + + let harness = TenantHarness::create_custom("test_get_vectored_key_gap", tenant_conf)?; + let (tenant, ctx) = harness.load().await; + + let mut current_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); + let gap_at_key = current_key.add(100); + let mut current_lsn = Lsn(0x10); + + const KEY_COUNT: usize = 10_000; + + let timeline_id = TimelineId::generate(); + let current_timeline = tenant + .create_test_timeline(timeline_id, current_lsn, DEFAULT_PG_VERSION, &ctx) + .await?; + + current_lsn += 0x100; + + let writer = current_timeline.writer().await; + writer + .put( + gap_at_key, + current_lsn, + &Value::Image(test_img(&format!("{} at {}", gap_at_key, current_lsn))), + &ctx, + ) + .await?; + writer.finish_write(current_lsn); + drop(writer); + + let mut latest_lsns = HashMap::new(); + latest_lsns.insert(gap_at_key, current_lsn); + + current_timeline.freeze_and_flush().await?; + + let child_timeline_id = TimelineId::generate(); + + tenant + .branch_timeline_test( + ¤t_timeline, + child_timeline_id, + Some(current_lsn), + &ctx, + ) + .await?; + let child_timeline = tenant + .get_timeline(child_timeline_id, true) + .expect("Should have the branched timeline"); + + for i in 0..KEY_COUNT { + if current_key == gap_at_key { + current_key = current_key.next(); + continue; + } + + current_lsn += 0x10; + + let writer = child_timeline.writer().await; + writer + .put( + current_key, + 
current_lsn, + &Value::Image(test_img(&format!("{} at {}", current_key, current_lsn))), + &ctx, + ) + .await?; + writer.finish_write(current_lsn); + drop(writer); + + latest_lsns.insert(current_key, current_lsn); + current_key = current_key.next(); + + // Flush every now and then to encourage layer file creation. + if i % 500 == 0 { + child_timeline.freeze_and_flush().await?; + } + } + + child_timeline.freeze_and_flush().await?; + let mut flags = EnumSet::new(); + flags.insert(CompactFlags::ForceRepartition); + child_timeline + .compact(&CancellationToken::new(), flags, &ctx) + .await?; + + let key_near_end = { + let mut tmp = current_key; + tmp.field6 -= 10; + tmp + }; + + let key_near_gap = { + let mut tmp = gap_at_key; + tmp.field6 -= 10; + tmp + }; + + let read = KeySpace { + ranges: vec![key_near_gap..gap_at_key.next(), key_near_end..current_key], + }; + let results = child_timeline + .get_vectored_impl(read.clone(), current_lsn, &ctx) + .await?; + + for (key, img_res) in results { + let expected = test_img(&format!("{} at {}", key, latest_lsns[&key])); + assert_eq!(img_res?, expected); + } + + Ok(()) + } + #[tokio::test] async fn test_random_updates() -> anyhow::Result<()> { let harness = TenantHarness::create("test_random_updates")?; diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 5f4814cc6b..b8ed69052f 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -460,15 +460,22 @@ impl LayerMap { } } - pub fn range_search(&self, key_range: Range, end_lsn: Lsn) -> Option { - let version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?; + pub fn range_search(&self, key_range: Range, end_lsn: Lsn) -> RangeSearchResult { + let version = match self.historic.get().unwrap().get_version(end_lsn.0 - 1) { + Some(version) => version, + None => { + let mut result = RangeSearchResult::new(); + result.not_found.add_range(key_range); + return result; + } + }; let raw_range = key_range.start.to_i128()..key_range.end.to_i128(); let delta_changes = version.delta_coverage.range_overlaps(&raw_range); let image_changes = version.image_coverage.range_overlaps(&raw_range); let collector = RangeSearchCollector::new(key_range, end_lsn, delta_changes, image_changes); - Some(collector.collect()) + collector.collect() } /// Start a batch of updates, applied on drop @@ -995,8 +1002,13 @@ mod tests { let layer_map = LayerMap::default(); let range = Key::from_i128(100)..Key::from_i128(200); - let res = layer_map.range_search(range, Lsn(100)); - assert!(res.is_none()); + let res = layer_map.range_search(range.clone(), Lsn(100)); + assert_eq!( + res.not_found.to_keyspace(), + KeySpace { + ranges: vec![range] + } + ); } #[test] @@ -1033,7 +1045,7 @@ mod tests { for start in 0..60 { for end in (start + 1)..60 { let range = Key::from_i128(start)..Key::from_i128(end); - let result = layer_map.range_search(range.clone(), Lsn(100)).unwrap(); + let result = layer_map.range_search(range.clone(), Lsn(100)); let expected = brute_force_range_search(&layer_map, range, Lsn(100)); assert_range_search_result_eq(result, expected); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 7ac7c15876..71a958206c 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2784,7 +2784,7 @@ impl Timeline { let guard = timeline.layers.read().await; let layers = guard.layer_map(); - 'outer: loop { + loop { if cancel.is_cancelled() { return Err(GetVectoredError::Cancelled); } @@ -2810,12 
+2810,7 @@ impl Timeline { } None => { for range in unmapped_keyspace.ranges.iter() { - let results = match layers.range_search(range.clone(), cont_lsn) { - Some(res) => res, - None => { - break 'outer; - } - }; + let results = layers.range_search(range.clone(), cont_lsn); results .found From d5a6a2a16d7e63d21ef00b3d582da57485f42d06 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 7 Mar 2024 17:10:03 +0000 Subject: [PATCH 0359/1571] storage controller: robustness improvements (#7027) ## Problem Closes: https://github.com/neondatabase/neon/issues/6847 Closes: https://github.com/neondatabase/neon/issues/7006 ## Summary of changes - Pageserver API calls are wrapped in timeout/retry logic: this prevents a reconciler getting hung on a pageserver API hang, and prevents reconcilers having to totally retry if one API call returns a retryable error (e.g. 503). - Add a cancellation token to `Node`, so that when we mark a node offline we will cancel any API calls in progress to that node, and avoid issuing any more API calls to that offline node. - If the dirty locations of a shard are all on offline nodes, then don't spawn a reconciler - In re-attach, if we have no observed state object for a tenant then construct one with conf: None (which means "unknown"). Then in Reconciler, implement a TODO for scanning such locations before running, so that we will avoid spuriously incrementing a generation in the case of a node that was offline while we started (this is the case that tripped up #7006) - Refactoring: make Node contents private (and thereby guarantee that updates to availability mode reliably update the cancellation token.) - Refactoring: don't pass the whole map of nodes into Reconciler (and thereby remove a bunch of .expect() calls) Some of this was discovered/tested with a new failure injection test that will come in a separate PR, once it is stable enough for CI. --- control_plane/attachment_service/src/node.rs | 218 ++++++++++- .../attachment_service/src/reconciler.rs | 356 +++++++++++------- .../attachment_service/src/scheduler.rs | 30 +- .../attachment_service/src/service.rs | 348 ++++++++--------- .../attachment_service/src/tenant_state.rs | 129 +++++-- pageserver/client/src/mgmt_api.rs | 20 +- pageserver/src/http/routes.rs | 27 ++ pageserver/src/tenant/mgr.rs | 10 + 8 files changed, 749 insertions(+), 389 deletions(-) diff --git a/control_plane/attachment_service/src/node.rs b/control_plane/attachment_service/src/node.rs index 1f9dcef033..27b03608fa 100644 --- a/control_plane/attachment_service/src/node.rs +++ b/control_plane/attachment_service/src/node.rs @@ -1,6 +1,16 @@ -use pageserver_api::controller_api::{NodeAvailability, NodeSchedulingPolicy}; +use std::{str::FromStr, time::Duration}; + +use hyper::StatusCode; +use pageserver_api::{ + controller_api::{ + NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, TenantLocateResponseShard, + }, + shard::TenantShardId, +}; +use pageserver_client::mgmt_api; use serde::Serialize; -use utils::id::NodeId; +use tokio_util::sync::CancellationToken; +use utils::{backoff, id::NodeId}; use crate::persistence::NodePersistence; @@ -12,16 +22,29 @@ use crate::persistence::NodePersistence; /// implementation of serialization on this type is only for debug dumps. 
#[derive(Clone, Serialize)] pub(crate) struct Node { - pub(crate) id: NodeId, + id: NodeId, - pub(crate) availability: NodeAvailability, - pub(crate) scheduling: NodeSchedulingPolicy, + availability: NodeAvailability, + scheduling: NodeSchedulingPolicy, - pub(crate) listen_http_addr: String, - pub(crate) listen_http_port: u16, + listen_http_addr: String, + listen_http_port: u16, - pub(crate) listen_pg_addr: String, - pub(crate) listen_pg_port: u16, + listen_pg_addr: String, + listen_pg_port: u16, + + // This cancellation token means "stop any RPCs in flight to this node, and don't start + // any more". It is not related to process shutdown. + #[serde(skip)] + cancel: CancellationToken, +} + +/// When updating [`Node::availability`] we use this type to indicate to the caller +/// whether/how they changed it. +pub(crate) enum AvailabilityTransition { + ToActive, + ToOffline, + Unchanged, } impl Node { @@ -29,6 +52,71 @@ impl Node { format!("http://{}:{}", self.listen_http_addr, self.listen_http_port) } + pub(crate) fn get_id(&self) -> NodeId { + self.id + } + + pub(crate) fn set_scheduling(&mut self, scheduling: NodeSchedulingPolicy) { + self.scheduling = scheduling + } + + /// Does this registration request match `self`? This is used when deciding whether a registration + /// request should be allowed to update an existing record with the same node ID. + pub(crate) fn registration_match(&self, register_req: &NodeRegisterRequest) -> bool { + self.id == register_req.node_id + && self.listen_http_addr == register_req.listen_http_addr + && self.listen_http_port == register_req.listen_http_port + && self.listen_pg_addr == register_req.listen_pg_addr + && self.listen_pg_port == register_req.listen_pg_port + } + + /// For a shard located on this node, populate a response object + /// with this node's address information. + pub(crate) fn shard_location(&self, shard_id: TenantShardId) -> TenantLocateResponseShard { + TenantLocateResponseShard { + shard_id, + node_id: self.id, + listen_http_addr: self.listen_http_addr.clone(), + listen_http_port: self.listen_http_port, + listen_pg_addr: self.listen_pg_addr.clone(), + listen_pg_port: self.listen_pg_port, + } + } + + pub(crate) fn set_availability( + &mut self, + availability: NodeAvailability, + ) -> AvailabilityTransition { + use NodeAvailability::*; + let transition = match (self.availability, availability) { + (Offline, Active) => { + // Give the node a new cancellation token, effectively resetting it to un-cancelled. Any + // users of previously-cloned copies of the node will still see the old cancellation + // state. For example, Reconcilers in flight will have to complete and be spawned + // again to realize that the node has become available. + self.cancel = CancellationToken::new(); + AvailabilityTransition::ToActive + } + (Active, Offline) => { + // Fire the node's cancellation token to cancel any in-flight API requests to it + self.cancel.cancel(); + AvailabilityTransition::ToOffline + } + _ => AvailabilityTransition::Unchanged, + }; + self.availability = availability; + transition + } + + /// Whether we may send API requests to this node. + pub(crate) fn is_available(&self) -> bool { + // When we clone a node, [`Self::availability`] is a snapshot, but [`Self::cancel`] holds + // a reference to the original Node's cancellation status. 
Checking both of these results + // in a "pessimistic" check where we will consider a Node instance unavailable if it was unavailable + // when we cloned it, or if the original Node instance's cancellation token was fired. + matches!(self.availability, NodeAvailability::Active) && !self.cancel.is_cancelled() + } + /// Is this node elegible to have work scheduled onto it? pub(crate) fn may_schedule(&self) -> bool { match self.availability { @@ -44,6 +132,26 @@ impl Node { } } + pub(crate) fn new( + id: NodeId, + listen_http_addr: String, + listen_http_port: u16, + listen_pg_addr: String, + listen_pg_port: u16, + ) -> Self { + Self { + id, + listen_http_addr, + listen_http_port, + listen_pg_addr, + listen_pg_port, + scheduling: NodeSchedulingPolicy::Filling, + // TODO: we shouldn't really call this Active until we've heartbeated it. + availability: NodeAvailability::Active, + cancel: CancellationToken::new(), + } + } + pub(crate) fn to_persistent(&self) -> NodePersistence { NodePersistence { node_id: self.id.0 as i64, @@ -54,4 +162,96 @@ impl Node { listen_pg_port: self.listen_pg_port as i32, } } + + pub(crate) fn from_persistent(np: NodePersistence) -> Self { + Self { + id: NodeId(np.node_id as u64), + // At startup we consider a node offline until proven otherwise. + availability: NodeAvailability::Offline, + scheduling: NodeSchedulingPolicy::from_str(&np.scheduling_policy) + .expect("Bad scheduling policy in DB"), + listen_http_addr: np.listen_http_addr, + listen_http_port: np.listen_http_port as u16, + listen_pg_addr: np.listen_pg_addr, + listen_pg_port: np.listen_pg_port as u16, + cancel: CancellationToken::new(), + } + } + + /// Wrapper for issuing requests to pageserver management API: takes care of generic + /// retry/backoff for retryable HTTP status codes. + /// + /// This will return None to indicate cancellation. Cancellation may happen from + /// the cancellation token passed in, or from Self's cancellation token (i.e. node + /// going offline). + pub(crate) async fn with_client_retries( + &self, + mut op: O, + jwt: &Option, + warn_threshold: u32, + max_retries: u32, + timeout: Duration, + cancel: &CancellationToken, + ) -> Option> + where + O: FnMut(mgmt_api::Client) -> F, + F: std::future::Future>, + { + fn is_fatal(e: &mgmt_api::Error) -> bool { + use mgmt_api::Error::*; + match e { + ReceiveBody(_) | ReceiveErrorBody(_) => false, + ApiError(StatusCode::SERVICE_UNAVAILABLE, _) + | ApiError(StatusCode::GATEWAY_TIMEOUT, _) + | ApiError(StatusCode::REQUEST_TIMEOUT, _) => false, + ApiError(_, _) => true, + Cancelled => true, + } + } + + backoff::retry( + || { + let http_client = reqwest::ClientBuilder::new() + .timeout(timeout) + .build() + .expect("Failed to construct HTTP client"); + + let client = + mgmt_api::Client::from_client(http_client, self.base_url(), jwt.as_deref()); + + let node_cancel_fut = self.cancel.cancelled(); + + let op_fut = op(client); + + async { + tokio::select! 
{ + r = op_fut=> {r}, + _ = node_cancel_fut => { + Err(mgmt_api::Error::Cancelled) + }} + } + }, + is_fatal, + warn_threshold, + max_retries, + &format!( + "Call to node {} ({}:{}) management API", + self.id, self.listen_http_addr, self.listen_http_port + ), + cancel, + ) + .await + } +} + +impl std::fmt::Display for Node { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{} ({})", self.id, self.listen_http_addr) + } +} + +impl std::fmt::Debug for Node { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{} ({})", self.id, self.listen_http_addr) + } } diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs index 0fa6e8e2f8..603da9bf02 100644 --- a/control_plane/attachment_service/src/reconciler.rs +++ b/control_plane/attachment_service/src/reconciler.rs @@ -1,6 +1,5 @@ use crate::persistence::Persistence; use crate::service; -use pageserver_api::controller_api::NodeAvailability; use pageserver_api::models::{ LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, }; @@ -28,15 +27,16 @@ pub(super) struct Reconciler { pub(crate) shard: ShardIdentity, pub(crate) generation: Option, pub(crate) intent: TargetState, + + /// Nodes not referenced by [`Self::intent`], from which we should try + /// to detach this tenant shard. + pub(crate) detach: Vec, + pub(crate) config: TenantConfig, pub(crate) observed: ObservedState, pub(crate) service_config: service::Config, - /// A snapshot of the pageservers as they were when we were asked - /// to reconcile. - pub(crate) pageservers: Arc>, - /// A hook to notify the running postgres instances when we change the location /// of a tenant. Use this via [`Self::compute_notify`] to update our failure flag /// and guarantee eventual retries. @@ -67,29 +67,37 @@ pub(super) struct Reconciler { /// and the TargetState is just the instruction for a particular Reconciler run. 
#[derive(Debug)] pub(crate) struct TargetState { - pub(crate) attached: Option, - pub(crate) secondary: Vec, + pub(crate) attached: Option, + pub(crate) secondary: Vec, } impl TargetState { - pub(crate) fn from_intent(intent: &IntentState) -> Self { + pub(crate) fn from_intent(nodes: &HashMap, intent: &IntentState) -> Self { Self { - attached: *intent.get_attached(), - secondary: intent.get_secondary().clone(), + attached: intent.get_attached().map(|n| { + nodes + .get(&n) + .expect("Intent attached referenced non-existent node") + .clone() + }), + secondary: intent + .get_secondary() + .iter() + .map(|n| { + nodes + .get(n) + .expect("Intent secondary referenced non-existent node") + .clone() + }) + .collect(), } } - - fn all_pageservers(&self) -> Vec { - let mut result = self.secondary.clone(); - if let Some(node_id) = &self.attached { - result.push(*node_id); - } - result - } } #[derive(thiserror::Error, Debug)] pub(crate) enum ReconcileError { + #[error(transparent)] + Remote(#[from] mgmt_api::Error), #[error(transparent)] Notify(#[from] NotifyError), #[error("Cancelled")] @@ -101,45 +109,83 @@ pub(crate) enum ReconcileError { impl Reconciler { async fn location_config( &mut self, - node_id: NodeId, + node: &Node, config: LocationConfig, flush_ms: Option, lazy: bool, - ) -> anyhow::Result<()> { - let node = self - .pageservers - .get(&node_id) - .expect("Pageserver may not be removed while referenced"); + ) -> Result<(), ReconcileError> { + self.observed + .locations + .insert(node.get_id(), ObservedStateLocation { conf: None }); + + // TODO: amend locations that use long-polling: they will hit this timeout. + let timeout = Duration::from_secs(25); + + tracing::info!("location_config({node}) calling: {:?}", config); + let tenant_shard_id = self.tenant_shard_id; + let config_ref = &config; + match node + .with_client_retries( + |client| async move { + let config = config_ref.clone(); + client + .location_config(tenant_shard_id, config.clone(), flush_ms, lazy) + .await + }, + &self.service_config.jwt_token, + 1, + 3, + timeout, + &self.cancel, + ) + .await + { + Some(Ok(_)) => {} + Some(Err(e)) => return Err(e.into()), + None => return Err(ReconcileError::Cancel), + }; + tracing::info!("location_config({node}) complete: {:?}", config); self.observed .locations - .insert(node.id, ObservedStateLocation { conf: None }); - - tracing::info!("location_config({}) calling: {:?}", node_id, config); - let client = - mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref()); - client - .location_config(self.tenant_shard_id, config.clone(), flush_ms, lazy) - .await?; - tracing::info!("location_config({}) complete: {:?}", node_id, config); - - self.observed - .locations - .insert(node.id, ObservedStateLocation { conf: Some(config) }); + .insert(node.get_id(), ObservedStateLocation { conf: Some(config) }); Ok(()) } + fn get_node(&self, node_id: &NodeId) -> Option<&Node> { + if let Some(node) = self.intent.attached.as_ref() { + if node.get_id() == *node_id { + return Some(node); + } + } + + if let Some(node) = self + .intent + .secondary + .iter() + .find(|n| n.get_id() == *node_id) + { + return Some(node); + } + + if let Some(node) = self.detach.iter().find(|n| n.get_id() == *node_id) { + return Some(node); + } + + None + } + async fn maybe_live_migrate(&mut self) -> Result<(), ReconcileError> { - let destination = if let Some(node_id) = self.intent.attached { - match self.observed.locations.get(&node_id) { + let destination = if let Some(node) = &self.intent.attached { + match 
self.observed.locations.get(&node.get_id()) { Some(conf) => { // We will do a live migration only if the intended destination is not // currently in an attached state. match &conf.conf { Some(conf) if conf.mode == LocationConfigMode::Secondary => { // Fall through to do a live migration - node_id + node } None | Some(_) => { // Attached or uncertain: don't do a live migration, proceed @@ -152,7 +198,7 @@ impl Reconciler { None => { // Our destination is not attached: maybe live migrate if some other // node is currently attached. Fall through. - node_id + node } } } else { @@ -165,15 +211,13 @@ impl Reconciler { for (node_id, state) in &self.observed.locations { if let Some(observed_conf) = &state.conf { if observed_conf.mode == LocationConfigMode::AttachedSingle { - let node = self - .pageservers - .get(node_id) - .expect("Nodes may not be removed while referenced"); // We will only attempt live migration if the origin is not offline: this // avoids trying to do it while reconciling after responding to an HA failover. - if !matches!(node.availability, NodeAvailability::Offline) { - origin = Some(*node_id); - break; + if let Some(node) = self.get_node(node_id) { + if node.is_available() { + origin = Some(node.clone()); + break; + } } } } @@ -186,7 +230,7 @@ impl Reconciler { // We have an origin and a destination: proceed to do the live migration tracing::info!("Live migrating {}->{}", origin, destination); - self.live_migrate(origin, destination).await?; + self.live_migrate(origin, destination.clone()).await?; Ok(()) } @@ -194,13 +238,8 @@ impl Reconciler { async fn get_lsns( &self, tenant_shard_id: TenantShardId, - node_id: &NodeId, + node: &Node, ) -> anyhow::Result> { - let node = self - .pageservers - .get(node_id) - .expect("Pageserver may not be removed while referenced"); - let client = mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref()); @@ -211,19 +250,27 @@ impl Reconciler { .collect()) } - async fn secondary_download(&self, tenant_shard_id: TenantShardId, node_id: &NodeId) { - let node = self - .pageservers - .get(node_id) - .expect("Pageserver may not be removed while referenced"); - - let client = - mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref()); - - match client.tenant_secondary_download(tenant_shard_id).await { - Ok(()) => {} - Err(_) => { - tracing::info!(" (skipping, destination wasn't in secondary mode)") + async fn secondary_download( + &self, + tenant_shard_id: TenantShardId, + node: &Node, + ) -> Result<(), ReconcileError> { + match node + .with_client_retries( + |client| async move { client.tenant_secondary_download(tenant_shard_id).await }, + &self.service_config.jwt_token, + 1, + 1, + Duration::from_secs(60), + &self.cancel, + ) + .await + { + None => Err(ReconcileError::Cancel), + Some(Ok(_)) => Ok(()), + Some(Err(e)) => { + tracing::info!(" (skipping destination download: {})", e); + Ok(()) } } } @@ -231,17 +278,14 @@ impl Reconciler { async fn await_lsn( &self, tenant_shard_id: TenantShardId, - pageserver_id: &NodeId, + node: &Node, baseline: HashMap, ) -> anyhow::Result<()> { loop { - let latest = match self.get_lsns(tenant_shard_id, pageserver_id).await { + let latest = match self.get_lsns(tenant_shard_id, node).await { Ok(l) => l, Err(e) => { - println!( - "🕑 Can't get LSNs on pageserver {} yet, waiting ({e})", - pageserver_id - ); + tracing::info!("🕑 Can't get LSNs on node {node} yet, waiting ({e})",); std::thread::sleep(Duration::from_millis(500)); continue; } @@ -251,7 +295,7 @@ impl Reconciler 
{ for (timeline_id, baseline_lsn) in &baseline { match latest.get(timeline_id) { Some(latest_lsn) => { - println!("🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}"); + tracing::info!("🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}"); if latest_lsn < baseline_lsn { any_behind = true; } @@ -266,7 +310,7 @@ impl Reconciler { } if !any_behind { - println!("✅ LSN caught up. Proceeding..."); + tracing::info!("✅ LSN caught up. Proceeding..."); break; } else { std::thread::sleep(Duration::from_millis(500)); @@ -278,11 +322,11 @@ impl Reconciler { pub async fn live_migrate( &mut self, - origin_ps_id: NodeId, - dest_ps_id: NodeId, - ) -> anyhow::Result<()> { + origin_ps: Node, + dest_ps: Node, + ) -> Result<(), ReconcileError> { // `maybe_live_migrate` is responsibble for sanity of inputs - assert!(origin_ps_id != dest_ps_id); + assert!(origin_ps.get_id() != dest_ps.get_id()); fn build_location_config( shard: &ShardIdentity, @@ -302,10 +346,7 @@ impl Reconciler { } } - tracing::info!( - "🔁 Switching origin pageserver {} to stale mode", - origin_ps_id - ); + tracing::info!("🔁 Switching origin node {origin_ps} to stale mode",); // FIXME: it is incorrect to use self.generation here, we should use the generation // from the ObservedState of the origin pageserver (it might be older than self.generation) @@ -316,26 +357,18 @@ impl Reconciler { self.generation, None, ); - self.location_config( - origin_ps_id, - stale_conf, - Some(Duration::from_secs(10)), - false, - ) - .await?; + self.location_config(&origin_ps, stale_conf, Some(Duration::from_secs(10)), false) + .await?; - let baseline_lsns = Some(self.get_lsns(self.tenant_shard_id, &origin_ps_id).await?); + let baseline_lsns = Some(self.get_lsns(self.tenant_shard_id, &origin_ps).await?); // If we are migrating to a destination that has a secondary location, warm it up first - if let Some(destination_conf) = self.observed.locations.get(&dest_ps_id) { + if let Some(destination_conf) = self.observed.locations.get(&dest_ps.get_id()) { if let Some(destination_conf) = &destination_conf.conf { if destination_conf.mode == LocationConfigMode::Secondary { - tracing::info!( - "🔁 Downloading latest layers to destination pageserver {}", - dest_ps_id, - ); - self.secondary_download(self.tenant_shard_id, &dest_ps_id) - .await; + tracing::info!("🔁 Downloading latest layers to destination node {dest_ps}",); + self.secondary_download(self.tenant_shard_id, &dest_ps) + .await?; } } } @@ -343,7 +376,7 @@ impl Reconciler { // Increment generation before attaching to new pageserver self.generation = Some( self.persistence - .increment_generation(self.tenant_shard_id, dest_ps_id) + .increment_generation(self.tenant_shard_id, dest_ps.get_id()) .await?, ); @@ -355,23 +388,23 @@ impl Reconciler { None, ); - tracing::info!("🔁 Attaching to pageserver {}", dest_ps_id); - self.location_config(dest_ps_id, dest_conf, None, false) + tracing::info!("🔁 Attaching to pageserver {dest_ps}"); + self.location_config(&dest_ps, dest_conf, None, false) .await?; if let Some(baseline) = baseline_lsns { tracing::info!("🕑 Waiting for LSN to catch up..."); - self.await_lsn(self.tenant_shard_id, &dest_ps_id, baseline) + self.await_lsn(self.tenant_shard_id, &dest_ps, baseline) .await?; } - tracing::info!("🔁 Notifying compute to use pageserver {}", dest_ps_id); + tracing::info!("🔁 Notifying compute to use pageserver {dest_ps}"); // During a live migration it is unhelpful to proceed if we couldn't notify compute: if we detach // the origin without notifying compute, we will render the 
tenant unavailable. while let Err(e) = self.compute_notify().await { match e { - NotifyError::Fatal(_) => return Err(anyhow::anyhow!(e)), + NotifyError::Fatal(_) => return Err(ReconcileError::Notify(e)), _ => { tracing::warn!( "Live migration blocked by compute notification error, retrying: {e}" @@ -389,22 +422,19 @@ impl Reconciler { None, Some(LocationConfigSecondary { warm: true }), ); - self.location_config(origin_ps_id, origin_secondary_conf.clone(), None, false) + self.location_config(&origin_ps, origin_secondary_conf.clone(), None, false) .await?; // TODO: we should also be setting the ObservedState on earlier API calls, in case we fail // partway through. In fact, all location conf API calls should be in a wrapper that sets // the observed state to None, then runs, then sets it to what we wrote. self.observed.locations.insert( - origin_ps_id, + origin_ps.get_id(), ObservedStateLocation { conf: Some(origin_secondary_conf), }, ); - println!( - "🔁 Switching to AttachedSingle mode on pageserver {}", - dest_ps_id - ); + tracing::info!("🔁 Switching to AttachedSingle mode on node {dest_ps}",); let dest_final_conf = build_location_config( &self.shard, &self.config, @@ -412,16 +442,61 @@ impl Reconciler { self.generation, None, ); - self.location_config(dest_ps_id, dest_final_conf.clone(), None, false) + self.location_config(&dest_ps, dest_final_conf.clone(), None, false) .await?; self.observed.locations.insert( - dest_ps_id, + dest_ps.get_id(), ObservedStateLocation { conf: Some(dest_final_conf), }, ); - println!("✅ Migration complete"); + tracing::info!("✅ Migration complete"); + + Ok(()) + } + + async fn maybe_refresh_observed(&mut self) -> Result<(), ReconcileError> { + // If the attached node has uncertain state, read it from the pageserver before proceeding: this + // is important to avoid spurious generation increments. + // + // We don't need to do this for secondary/detach locations because it's harmless to just PUT their + // location conf, whereas for attached locations it can interrupt clients if we spuriously destroy/recreate + // the `Timeline` object in the pageserver. + + let Some(attached_node) = self.intent.attached.as_ref() else { + // Nothing to do + return Ok(()); + }; + + if matches!( + self.observed.locations.get(&attached_node.get_id()), + Some(ObservedStateLocation { conf: None }) + ) { + let tenant_shard_id = self.tenant_shard_id; + let observed_conf = match attached_node + .with_client_retries( + |client| async move { client.get_location_config(tenant_shard_id).await }, + &self.service_config.jwt_token, + 1, + 1, + Duration::from_secs(5), + &self.cancel, + ) + .await + { + Some(Ok(observed)) => observed, + Some(Err(e)) => return Err(e.into()), + None => return Err(ReconcileError::Cancel), + }; + tracing::info!("Scanned location configuration on {attached_node}: {observed_conf:?}"); + self.observed.locations.insert( + attached_node.get_id(), + ObservedStateLocation { + conf: observed_conf, + }, + ); + } Ok(()) } @@ -433,14 +508,14 @@ impl Reconciler { /// general case reconciliation where we walk through the intent by pageserver /// and call out to the pageserver to apply the desired state. pub(crate) async fn reconcile(&mut self) -> Result<(), ReconcileError> { - // TODO: if any of self.observed is None, call to remote pageservers - // to learn correct state. 
+ // Prepare: if we have uncertain `observed` state for our would-be attachement location, then refresh it + self.maybe_refresh_observed().await?; // Special case: live migration self.maybe_live_migrate().await?; // If the attached pageserver is not attached, do so now. - if let Some(node_id) = self.intent.attached { + if let Some(node) = self.intent.attached.as_ref() { // If we are in an attached policy, then generation must have been set (null generations // are only present when a tenant is initially loaded with a secondary policy) debug_assert!(self.generation.is_some()); @@ -451,10 +526,10 @@ impl Reconciler { }; let mut wanted_conf = attached_location_conf(generation, &self.shard, &self.config); - match self.observed.locations.get(&node_id) { + match self.observed.locations.get(&node.get_id()) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => { // Nothing to do - tracing::info!(%node_id, "Observed configuration already correct.") + tracing::info!(node_id=%node.get_id(), "Observed configuration already correct.") } observed => { // In all cases other than a matching observed configuration, we will @@ -492,16 +567,21 @@ impl Reconciler { if increment_generation { let generation = self .persistence - .increment_generation(self.tenant_shard_id, node_id) + .increment_generation(self.tenant_shard_id, node.get_id()) .await?; self.generation = Some(generation); wanted_conf.generation = generation.into(); } - tracing::info!(%node_id, "Observed configuration requires update."); + tracing::info!(node_id=%node.get_id(), "Observed configuration requires update."); + + // Because `node` comes from a ref to &self, clone it before calling into a &mut self + // function: this could be avoided by refactoring the state mutated by location_config into + // a separate type to Self. + let node = node.clone(); + // Use lazy=true, because we may run many of Self concurrently, and do not want to // overload the pageserver with logical size calculations. - self.location_config(node_id, wanted_conf, None, true) - .await?; + self.location_config(&node, wanted_conf, None, true).await?; self.compute_notify().await?; } } @@ -510,33 +590,27 @@ impl Reconciler { // Configure secondary locations: if these were previously attached this // implicitly downgrades them from attached to secondary. let mut changes = Vec::new(); - for node_id in &self.intent.secondary { + for node in &self.intent.secondary { let wanted_conf = secondary_location_conf(&self.shard, &self.config); - match self.observed.locations.get(node_id) { + match self.observed.locations.get(&node.get_id()) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => { // Nothing to do - tracing::info!(%node_id, "Observed configuration already correct.") + tracing::info!(node_id=%node.get_id(), "Observed configuration already correct.") } _ => { // In all cases other than a matching observed configuration, we will // reconcile this location. - tracing::info!(%node_id, "Observed configuration requires update."); - changes.push((*node_id, wanted_conf)) + tracing::info!(node_id=%node.get_id(), "Observed configuration requires update."); + changes.push((node.clone(), wanted_conf)) } } } // Detach any extraneous pageservers that are no longer referenced // by our intent. - let all_pageservers = self.intent.all_pageservers(); - for node_id in self.observed.locations.keys() { - if all_pageservers.contains(node_id) { - // We are only detaching pageservers that aren't used at all. 
- continue; - } - + for node in &self.detach { changes.push(( - *node_id, + node.clone(), LocationConfig { mode: LocationConfigMode::Detached, generation: None, @@ -549,11 +623,11 @@ impl Reconciler { )); } - for (node_id, conf) in changes { + for (node, conf) in changes { if self.cancel.is_cancelled() { return Err(ReconcileError::Cancel); } - self.location_config(node_id, conf, None, false).await?; + self.location_config(&node, conf, None, false).await?; } Ok(()) @@ -562,12 +636,12 @@ impl Reconciler { pub(crate) async fn compute_notify(&mut self) -> Result<(), NotifyError> { // Whenever a particular Reconciler emits a notification, it is always notifying for the intended // destination. - if let Some(node_id) = self.intent.attached { + if let Some(node) = &self.intent.attached { let result = self .compute_hook .notify( self.tenant_shard_id, - node_id, + node.get_id(), self.shard.stripe_size, &self.cancel, ) @@ -576,7 +650,7 @@ impl Reconciler { // It is up to the caller whether they want to drop out on this error, but they don't have to: // in general we should avoid letting unavailability of the cloud control plane stop us from // making progress. - tracing::warn!("Failed to notify compute of attached pageserver {node_id}: {e}"); + tracing::warn!("Failed to notify compute of attached pageserver {node}: {e}"); // Set this flag so that in our ReconcileResult we will set the flag on the shard that it // needs to retry at some point. self.compute_notify_failure = true; diff --git a/control_plane/attachment_service/src/scheduler.rs b/control_plane/attachment_service/src/scheduler.rs index 87fce3df25..26a2707e8d 100644 --- a/control_plane/attachment_service/src/scheduler.rs +++ b/control_plane/attachment_service/src/scheduler.rs @@ -43,7 +43,7 @@ impl Scheduler { let mut scheduler_nodes = HashMap::new(); for node in nodes { scheduler_nodes.insert( - node.id, + node.get_id(), SchedulerNode { shard_count: 0, may_schedule: node.may_schedule(), @@ -68,7 +68,7 @@ impl Scheduler { let mut expect_nodes: HashMap = HashMap::new(); for node in nodes { expect_nodes.insert( - node.id, + node.get_id(), SchedulerNode { shard_count: 0, may_schedule: node.may_schedule(), @@ -156,7 +156,7 @@ impl Scheduler { pub(crate) fn node_upsert(&mut self, node: &Node) { use std::collections::hash_map::Entry::*; - match self.nodes.entry(node.id) { + match self.nodes.entry(node.get_id()) { Occupied(mut entry) => { entry.get_mut().may_schedule = node.may_schedule(); } @@ -255,7 +255,6 @@ impl Scheduler { pub(crate) mod test_utils { use crate::node::Node; - use pageserver_api::controller_api::{NodeAvailability, NodeSchedulingPolicy}; use std::collections::HashMap; use utils::id::NodeId; /// Test helper: synthesize the requested number of nodes, all in active state. 
@@ -264,18 +263,17 @@ pub(crate) mod test_utils { pub(crate) fn make_test_nodes(n: u64) -> HashMap { (1..n + 1) .map(|i| { - ( - NodeId(i), - Node { - id: NodeId(i), - availability: NodeAvailability::Active, - scheduling: NodeSchedulingPolicy::Active, - listen_http_addr: format!("httphost-{i}"), - listen_http_port: 80 + i as u16, - listen_pg_addr: format!("pghost-{i}"), - listen_pg_port: 5432 + i as u16, - }, - ) + (NodeId(i), { + let node = Node::new( + NodeId(i), + format!("httphost-{i}"), + 80 + i as u16, + format!("pghost-{i}"), + 5432 + i as u16, + ); + assert!(node.is_available()); + node + }) }) .collect() } diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index d162ab5c65..f41c4f89b9 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -16,9 +16,9 @@ use futures::{stream::FuturesUnordered, StreamExt}; use hyper::StatusCode; use pageserver_api::{ controller_api::{ - NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, - TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse, - TenantLocateResponseShard, TenantShardMigrateRequest, TenantShardMigrateResponse, + NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantCreateResponse, + TenantCreateResponseShard, TenantLocateResponse, TenantShardMigrateRequest, + TenantShardMigrateResponse, }, models::TenantConfigRequest, }; @@ -39,7 +39,6 @@ use pageserver_client::mgmt_api; use tokio_util::sync::CancellationToken; use tracing::instrument; use utils::{ - backoff, completion::Barrier, generation::Generation, http::error::ApiError, @@ -50,7 +49,7 @@ use utils::{ use crate::{ compute_hook::{self, ComputeHook}, - node::Node, + node::{AvailabilityTransition, Node}, persistence::{split_state::SplitState, DatabaseError, Persistence, TenantShardPersistence}, reconciler::attached_location_conf, scheduler::Scheduler, @@ -201,7 +200,8 @@ impl Service { async fn startup_reconcile(self: &Arc) { // For all tenant shards, a vector of observed states on nodes (where None means // indeterminate, same as in [`ObservedStateLocation`]) - let mut observed = HashMap::new(); + let mut observed: HashMap)>> = + HashMap::new(); let mut nodes_online = HashSet::new(); @@ -236,7 +236,8 @@ impl Service { nodes_online.insert(node_id); for (tenant_shard_id, conf_opt) in tenant_shards { - observed.insert(tenant_shard_id, (node_id, conf_opt)); + let shard_observations = observed.entry(tenant_shard_id).or_default(); + shard_observations.push((node_id, conf_opt)); } } @@ -252,27 +253,28 @@ impl Service { let mut new_nodes = (**nodes).clone(); for (node_id, node) in new_nodes.iter_mut() { if nodes_online.contains(node_id) { - node.availability = NodeAvailability::Active; + node.set_availability(NodeAvailability::Active); scheduler.node_upsert(node); } } *nodes = Arc::new(new_nodes); - for (tenant_shard_id, (node_id, observed_loc)) in observed { - let Some(tenant_state) = tenants.get_mut(&tenant_shard_id) else { - cleanup.push((tenant_shard_id, node_id)); - continue; - }; - - tenant_state - .observed - .locations - .insert(node_id, ObservedStateLocation { conf: observed_loc }); + for (tenant_shard_id, shard_observations) in observed { + for (node_id, observed_loc) in shard_observations { + let Some(tenant_state) = tenants.get_mut(&tenant_shard_id) else { + cleanup.push((tenant_shard_id, node_id)); + continue; + }; + tenant_state + .observed + .locations + .insert(node_id, ObservedStateLocation { 
conf: observed_loc }); + } } // Populate each tenant's intent state for (tenant_shard_id, tenant_state) in tenants.iter_mut() { - tenant_state.intent_from_observed(); + tenant_state.intent_from_observed(scheduler); if let Err(e) = tenant_state.schedule(scheduler) { // Non-fatal error: we are unable to properly schedule the tenant, perhaps because // not enough pageservers are available. The tenant may well still be available @@ -359,40 +361,19 @@ impl Service { for node in nodes.values() { node_list_futs.push({ async move { - let http_client = reqwest::ClientBuilder::new() - .timeout(Duration::from_secs(5)) - .build() - .expect("Failed to construct HTTP client"); - let client = mgmt_api::Client::from_client( - http_client, - node.base_url(), - self.config.jwt_token.as_deref(), - ); - - fn is_fatal(e: &mgmt_api::Error) -> bool { - use mgmt_api::Error::*; - match e { - ReceiveBody(_) | ReceiveErrorBody(_) => false, - ApiError(StatusCode::SERVICE_UNAVAILABLE, _) - | ApiError(StatusCode::GATEWAY_TIMEOUT, _) - | ApiError(StatusCode::REQUEST_TIMEOUT, _) => false, - ApiError(_, _) => true, - } - } - - tracing::info!("Scanning shards on node {}...", node.id); - let description = format!("List locations on {}", node.id); - let response = backoff::retry( - || client.list_location_config(), - is_fatal, - 1, - 5, - &description, - &self.cancel, - ) - .await; - - (node.id, response) + tracing::info!("Scanning shards on node {node}..."); + let timeout = Duration::from_secs(5); + let response = node + .with_client_retries( + |client| async move { client.list_location_config().await }, + &self.config.jwt_token, + 1, + 5, + timeout, + &self.cancel, + ) + .await; + (node.get_id(), response) } }); } @@ -662,19 +643,9 @@ impl Service { .list_nodes() .await? .into_iter() - .map(|n| Node { - id: NodeId(n.node_id as u64), - // At startup we consider a node offline until proven otherwise. 
- availability: NodeAvailability::Offline, - scheduling: NodeSchedulingPolicy::from_str(&n.scheduling_policy) - .expect("Bad scheduling policy in DB"), - listen_http_addr: n.listen_http_addr, - listen_http_port: n.listen_http_port as u16, - listen_pg_addr: n.listen_pg_addr, - listen_pg_port: n.listen_pg_port as u16, - }) + .map(Node::from_persistent) .collect::>(); - let nodes: HashMap = nodes.into_iter().map(|n| (n.id, n)).collect(); + let nodes: HashMap = nodes.into_iter().map(|n| (n.get_id(), n)).collect(); tracing::info!("Loaded {} nodes from database.", nodes.len()); tracing::info!("Loading shards from database..."); @@ -701,15 +672,13 @@ impl Service { } for node_id in node_ids { tracing::info!("Creating node {} in scheduler for tests", node_id); - let node = Node { - id: NodeId(node_id as u64), - availability: NodeAvailability::Active, - scheduling: NodeSchedulingPolicy::Active, - listen_http_addr: "".to_string(), - listen_http_port: 123, - listen_pg_addr: "".to_string(), - listen_pg_port: 123, - }; + let node = Node::new( + NodeId(node_id as u64), + "".to_string(), + 123, + "".to_string(), + 123, + ); scheduler.node_upsert(&node); } @@ -975,6 +944,12 @@ impl Service { // Ordering: we must persist generation number updates before making them visible in the in-memory state let incremented_generations = self.persistence.re_attach(reattach_req.node_id).await?; + tracing::info!( + node_id=%reattach_req.node_id, + "Incremented {} tenant shards' generations", + incremented_generations.len() + ); + // Apply the updated generation to our in-memory state let mut locked = self.inner.write().unwrap(); @@ -987,7 +962,6 @@ impl Service { id: tenant_shard_id, gen: new_gen.into().unwrap(), }); - // Apply the new generation number to our in-memory state let shard_state = locked.tenants.get_mut(&tenant_shard_id); let Some(shard_state) = shard_state else { @@ -1023,6 +997,14 @@ impl Service { if let Some(conf) = observed.conf.as_mut() { conf.generation = new_gen.into(); } + } else { + // This node has no observed state for the shard: perhaps it was offline + // when the pageserver restarted. Insert a None, so that the Reconciler + // will be prompted to learn the location's state before it makes changes. + shard_state + .observed + .locations + .insert(reattach_req.node_id, ObservedStateLocation { conf: None }); } // TODO: cancel/restart any running reconciliation for this tenant, it might be trying @@ -1685,7 +1667,7 @@ impl Service { .map_err(|e| { ApiError::InternalServerError(anyhow::anyhow!( "Error doing time travel recovery for shard {tenant_shard_id} on node {}: {e}", - node.id + node )) })?; } @@ -1739,10 +1721,7 @@ impl Service { // Secondary downloads are always advisory: if something fails, we nevertheless report success, so that whoever // is calling us will proceed with whatever migration they're doing, albeit with a slightly less warm cache // than they had hoped for. - tracing::warn!( - "Ignoring tenant secondary download error from pageserver {}: {e}", - node.id, - ); + tracing::warn!("Ignoring tenant secondary download error from pageserver {node}: {e}",); } Ok(()) @@ -1780,13 +1759,11 @@ impl Service { // surface immediately as an error to our caller. 
let status = client.tenant_delete(tenant_shard_id).await.map_err(|e| { ApiError::InternalServerError(anyhow::anyhow!( - "Error deleting shard {tenant_shard_id} on node {}: {e}", - node.id + "Error deleting shard {tenant_shard_id} on node {node}: {e}", )) })?; tracing::info!( - "Shard {tenant_shard_id} on node {}, delete returned {}", - node.id, + "Shard {tenant_shard_id} on node {node}, delete returned {}", status ); if status == StatusCode::ACCEPTED { @@ -1885,10 +1862,9 @@ impl Service { create_req: TimelineCreateRequest, ) -> Result { tracing::info!( - "Creating timeline on shard {}/{}, attached to node {}", + "Creating timeline on shard {}/{}, attached to node {node}", tenant_shard_id, create_req.new_timeline_id, - node.id ); let client = mgmt_api::Client::new(node.base_url(), jwt.as_deref()); @@ -2012,10 +1988,7 @@ impl Service { jwt: Option, ) -> Result { tracing::info!( - "Deleting timeline on shard {}/{}, attached to node {}", - tenant_shard_id, - timeline_id, - node.id + "Deleting timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}", ); let client = mgmt_api::Client::new(node.base_url(), jwt.as_deref()); @@ -2024,8 +1997,7 @@ impl Service { .await .map_err(|e| { ApiError::InternalServerError(anyhow::anyhow!( - "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {}: {e}", - node.id + "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}", )) }) } @@ -2126,14 +2098,7 @@ impl Service { .get(&node_id) .expect("Pageservers may not be deleted while referenced"); - result.push(TenantLocateResponseShard { - shard_id: *tenant_shard_id, - node_id, - listen_http_addr: node.listen_http_addr.clone(), - listen_http_port: node.listen_http_port, - listen_pg_addr: node.listen_pg_addr.clone(), - listen_pg_port: node.listen_pg_port, - }); + result.push(node.shard_location(*tenant_shard_id)); match &shard_params { None => { @@ -2324,7 +2289,7 @@ impl Service { // populate the correct generation as part of its transaction, to protect us // against racing with changes in the state of the parent. generation: None, - generation_pageserver: Some(target.node.id.0 as i64), + generation_pageserver: Some(target.node.get_id().0 as i64), placement_policy: serde_json::to_string(&policy).unwrap(), // TODO: get the config out of the map config: serde_json::to_string(&TenantConfig::default()).unwrap(), @@ -2526,10 +2491,10 @@ impl Service { ))); }; - if node.availability != NodeAvailability::Active { + if !node.is_available() { // Warn but proceed: the caller may intend to manually adjust the placement of // a shard even if the node is down, e.g. if intervening during an incident. - tracing::warn!("Migrating to an unavailable node ({})", node.id); + tracing::warn!("Migrating to unavailable node {node}"); } let Some(shard) = tenants.get_mut(&tenant_shard_id) else { @@ -2784,11 +2749,7 @@ impl Service { if let Some(node) = locked.nodes.get(®ister_req.node_id) { // Note that we do not do a total equality of the struct, because we don't require // the availability/scheduling states to agree for a POST to be idempotent. 
- if node.listen_http_addr == register_req.listen_http_addr - && node.listen_http_port == register_req.listen_http_port - && node.listen_pg_addr == register_req.listen_pg_addr - && node.listen_pg_port == register_req.listen_pg_port - { + if node.registration_match(®ister_req) { tracing::info!( "Node {} re-registered with matching address", register_req.node_id @@ -2812,16 +2773,14 @@ impl Service { // Ordering: we must persist the new node _before_ adding it to in-memory state. // This ensures that before we use it for anything or expose it via any external // API, it is guaranteed to be available after a restart. - let new_node = Node { - id: register_req.node_id, - listen_http_addr: register_req.listen_http_addr, - listen_http_port: register_req.listen_http_port, - listen_pg_addr: register_req.listen_pg_addr, - listen_pg_port: register_req.listen_pg_port, - scheduling: NodeSchedulingPolicy::Filling, - // TODO: we shouldn't really call this Active until we've heartbeated it. - availability: NodeAvailability::Active, - }; + let new_node = Node::new( + register_req.node_id, + register_req.listen_http_addr, + register_req.listen_http_port, + register_req.listen_pg_addr, + register_req.listen_pg_port, + ); + // TODO: idempotency if the node already exists in the database self.persistence.insert_node(&new_node).await?; @@ -2866,29 +2825,14 @@ impl Service { )); }; - let mut offline_transition = false; - let mut active_transition = false; - - if let Some(availability) = &config_req.availability { - match (availability, &node.availability) { - (NodeAvailability::Offline, NodeAvailability::Active) => { - tracing::info!("Node {} transition to offline", config_req.node_id); - offline_transition = true; - } - (NodeAvailability::Active, NodeAvailability::Offline) => { - tracing::info!("Node {} transition to active", config_req.node_id); - active_transition = true; - } - _ => { - tracing::info!("Node {} no change during config", config_req.node_id); - // No change - } - }; - node.availability = *availability; - } + let availability_transition = if let Some(availability) = &config_req.availability { + node.set_availability(*availability) + } else { + AvailabilityTransition::Unchanged + }; if let Some(scheduling) = config_req.scheduling { - node.scheduling = scheduling; + node.set_scheduling(scheduling); // TODO: once we have a background scheduling ticker for fill/drain, kick it // to wake up and start working. 
@@ -2899,74 +2843,80 @@ impl Service { let new_nodes = Arc::new(new_nodes); - if offline_transition { - let mut tenants_affected: usize = 0; - for (tenant_shard_id, tenant_state) in tenants { - if let Some(observed_loc) = - tenant_state.observed.locations.get_mut(&config_req.node_id) - { - // When a node goes offline, we set its observed configuration to None, indicating unknown: we will - // not assume our knowledge of the node's configuration is accurate until it comes back online - observed_loc.conf = None; - } + match availability_transition { + AvailabilityTransition::ToOffline => { + tracing::info!("Node {} transition to offline", config_req.node_id); + let mut tenants_affected: usize = 0; + for (tenant_shard_id, tenant_state) in tenants { + if let Some(observed_loc) = + tenant_state.observed.locations.get_mut(&config_req.node_id) + { + // When a node goes offline, we set its observed configuration to None, indicating unknown: we will + // not assume our knowledge of the node's configuration is accurate until it comes back online + observed_loc.conf = None; + } - if tenant_state.intent.demote_attached(config_req.node_id) { - tenant_state.sequence = tenant_state.sequence.next(); - match tenant_state.schedule(scheduler) { - Err(e) => { - // It is possible that some tenants will become unschedulable when too many pageservers - // go offline: in this case there isn't much we can do other than make the issue observable. - // TODO: give TenantState a scheduling error attribute to be queried later. - tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", config_req.node_id); - } - Ok(()) => { - if tenant_state - .maybe_reconcile( - result_tx.clone(), - &new_nodes, - &compute_hook, - &self.config, - &self.persistence, - &self.gate, - &self.cancel, - ) - .is_some() - { - tenants_affected += 1; - }; + if tenant_state.intent.demote_attached(config_req.node_id) { + tenant_state.sequence = tenant_state.sequence.next(); + match tenant_state.schedule(scheduler) { + Err(e) => { + // It is possible that some tenants will become unschedulable when too many pageservers + // go offline: in this case there isn't much we can do other than make the issue observable. + // TODO: give TenantState a scheduling error attribute to be queried later. + tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", config_req.node_id); + } + Ok(()) => { + if tenant_state + .maybe_reconcile( + result_tx.clone(), + &new_nodes, + &compute_hook, + &self.config, + &self.persistence, + &self.gate, + &self.cancel, + ) + .is_some() + { + tenants_affected += 1; + }; + } } } } + tracing::info!( + "Launched {} reconciler tasks for tenants affected by node {} going offline", + tenants_affected, + config_req.node_id + ) } - tracing::info!( - "Launched {} reconciler tasks for tenants affected by node {} going offline", - tenants_affected, - config_req.node_id - ) - } - - if active_transition { - // When a node comes back online, we must reconcile any tenant that has a None observed - // location on the node. 
- for tenant_state in locked.tenants.values_mut() { - if let Some(observed_loc) = - tenant_state.observed.locations.get_mut(&config_req.node_id) - { - if observed_loc.conf.is_none() { - tenant_state.maybe_reconcile( - result_tx.clone(), - &new_nodes, - &compute_hook, - &self.config, - &self.persistence, - &self.gate, - &self.cancel, - ); + AvailabilityTransition::ToActive => { + tracing::info!("Node {} transition to active", config_req.node_id); + // When a node comes back online, we must reconcile any tenant that has a None observed + // location on the node. + for tenant_state in locked.tenants.values_mut() { + if let Some(observed_loc) = + tenant_state.observed.locations.get_mut(&config_req.node_id) + { + if observed_loc.conf.is_none() { + tenant_state.maybe_reconcile( + result_tx.clone(), + &new_nodes, + &compute_hook, + &self.config, + &self.persistence, + &self.gate, + &self.cancel, + ); + } } } - } - // TODO: in the background, we should balance work back onto this pageserver + // TODO: in the background, we should balance work back onto this pageserver + } + AvailabilityTransition::Unchanged => { + tracing::info!("Node {} no change during config", config_req.node_id); + } } locked.nodes = new_nodes; diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs index 33b7d578c7..ddb9866527 100644 --- a/control_plane/attachment_service/src/tenant_state.rs +++ b/control_plane/attachment_service/src/tenant_state.rs @@ -1,7 +1,10 @@ -use std::{collections::HashMap, sync::Arc, time::Duration}; +use std::{ + collections::{HashMap, HashSet}, + sync::Arc, + time::Duration, +}; use crate::{metrics, persistence::TenantShardPersistence}; -use pageserver_api::controller_api::NodeAvailability; use pageserver_api::{ models::{LocationConfig, LocationConfigMode, TenantConfig}, shard::{ShardIdentity, TenantShardId}, @@ -370,7 +373,7 @@ impl TenantState { /// [`ObservedState`], even if it violates my [`PlacementPolicy`]. Call [`Self::schedule`] next, /// to get an intent state that complies with placement policy. The overall goal is to do scheduling /// in a way that makes use of any configured locations that already exist in the outside world. - pub(crate) fn intent_from_observed(&mut self) { + pub(crate) fn intent_from_observed(&mut self, scheduler: &mut Scheduler) { // Choose an attached location by filtering observed locations, and then sorting to get the highest // generation let mut attached_locs = self @@ -395,7 +398,7 @@ impl TenantState { attached_locs.sort_by_key(|i| i.1); if let Some((node_id, _gen)) = attached_locs.into_iter().last() { - self.intent.attached = Some(*node_id); + self.intent.set_attached(scheduler, Some(*node_id)); } // All remaining observed locations generate secondary intents. This includes None @@ -406,7 +409,7 @@ impl TenantState { // will take care of promoting one of these secondaries to be attached. self.observed.locations.keys().for_each(|node_id| { if Some(*node_id) != self.intent.attached { - self.intent.secondary.push(*node_id); + self.intent.push_secondary(scheduler, *node_id); } }); } @@ -564,7 +567,9 @@ impl TenantState { } } - fn dirty(&self) -> bool { + fn dirty(&self, nodes: &Arc>) -> bool { + let mut dirty_nodes = HashSet::new(); + if let Some(node_id) = self.intent.attached { // Maybe panic: it is a severe bug if we try to attach while generation is null. 
let generation = self @@ -575,7 +580,7 @@ impl TenantState { match self.observed.locations.get(&node_id) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {} Some(_) | None => { - return true; + dirty_nodes.insert(node_id); } } } @@ -585,7 +590,7 @@ impl TenantState { match self.observed.locations.get(node_id) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {} Some(_) | None => { - return true; + dirty_nodes.insert(*node_id); } } } @@ -593,17 +598,18 @@ impl TenantState { for node_id in self.observed.locations.keys() { if self.intent.attached != Some(*node_id) && !self.intent.secondary.contains(node_id) { // We have observed state that isn't part of our intent: need to clean it up. - return true; + dirty_nodes.insert(*node_id); } } - // Even if there is no pageserver work to be done, if we have a pending notification to computes, - // wake up a reconciler to send it. - if self.pending_compute_notification { - return true; - } + dirty_nodes.retain(|node_id| { + nodes + .get(node_id) + .map(|n| n.is_available()) + .unwrap_or(false) + }); - false + !dirty_nodes.is_empty() } #[allow(clippy::too_many_arguments)] @@ -625,15 +631,20 @@ impl TenantState { let node = pageservers .get(node_id) .expect("Nodes may not be removed while referenced"); - if observed_loc.conf.is_none() - && !matches!(node.availability, NodeAvailability::Offline) - { + if observed_loc.conf.is_none() && node.is_available() { dirty_observed = true; break; } } - if !self.dirty() && !dirty_observed { + let active_nodes_dirty = self.dirty(pageservers); + + // Even if there is no pageserver work to be done, if we have a pending notification to computes, + // wake up a reconciler to send it. + let do_reconcile = + active_nodes_dirty || dirty_observed || self.pending_compute_notification; + + if !do_reconcile { tracing::info!("Not dirty, no reconciliation needed."); return None; } @@ -663,6 +674,21 @@ impl TenantState { } } + // Build list of nodes from which the reconciler should detach + let mut detach = Vec::new(); + for node_id in self.observed.locations.keys() { + if self.intent.get_attached() != &Some(*node_id) + && !self.intent.secondary.contains(node_id) + { + detach.push( + pageservers + .get(node_id) + .expect("Intent references non-existent pageserver") + .clone(), + ) + } + } + // Reconcile in flight for a stale sequence? Our sequence's task will wait for it before // doing our sequence's work. 
let old_handle = self.reconciler.take(); @@ -677,14 +703,15 @@ impl TenantState { self.sequence = self.sequence.next(); let reconciler_cancel = cancel.child_token(); + let reconciler_intent = TargetState::from_intent(pageservers, &self.intent); let mut reconciler = Reconciler { tenant_shard_id: self.tenant_shard_id, shard: self.shard, generation: self.generation, - intent: TargetState::from_intent(&self.intent), + intent: reconciler_intent, + detach, config: self.config.clone(), observed: self.observed.clone(), - pageservers: pageservers.clone(), compute_hook: compute_hook.clone(), service_config: service_config.clone(), _gate_guard: gate_guard, @@ -819,7 +846,10 @@ impl TenantState { #[cfg(test)] pub(crate) mod tests { - use pageserver_api::shard::{ShardCount, ShardNumber}; + use pageserver_api::{ + controller_api::NodeAvailability, + shard::{ShardCount, ShardNumber}, + }; use utils::id::TenantId; use crate::scheduler::test_utils::make_test_nodes; @@ -878,7 +908,10 @@ pub(crate) mod tests { assert_eq!(tenant_state.intent.secondary.len(), 2); // Update the scheduler state to indicate the node is offline - nodes.get_mut(&attached_node_id).unwrap().availability = NodeAvailability::Offline; + nodes + .get_mut(&attached_node_id) + .unwrap() + .set_availability(NodeAvailability::Offline); scheduler.node_upsert(nodes.get(&attached_node_id).unwrap()); // Scheduling the node should promote the still-available secondary node to attached @@ -897,4 +930,54 @@ pub(crate) mod tests { Ok(()) } + + #[test] + fn intent_from_observed() -> anyhow::Result<()> { + let nodes = make_test_nodes(3); + let mut scheduler = Scheduler::new(nodes.values()); + + let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1)); + + tenant_state.observed.locations.insert( + NodeId(3), + ObservedStateLocation { + conf: Some(LocationConfig { + mode: LocationConfigMode::AttachedMulti, + generation: Some(2), + secondary_conf: None, + shard_number: tenant_state.shard.number.0, + shard_count: tenant_state.shard.count.literal(), + shard_stripe_size: tenant_state.shard.stripe_size.0, + tenant_conf: TenantConfig::default(), + }), + }, + ); + + tenant_state.observed.locations.insert( + NodeId(2), + ObservedStateLocation { + conf: Some(LocationConfig { + mode: LocationConfigMode::AttachedStale, + generation: Some(1), + secondary_conf: None, + shard_number: tenant_state.shard.number.0, + shard_count: tenant_state.shard.count.literal(), + shard_stripe_size: tenant_state.shard.stripe_size.0, + tenant_conf: TenantConfig::default(), + }), + }, + ); + + tenant_state.intent_from_observed(&mut scheduler); + + // The highest generationed attached location gets used as attached + assert_eq!(tenant_state.intent.attached, Some(NodeId(3))); + // Other locations get used as secondary + assert_eq!(tenant_state.intent.secondary, vec![NodeId(2)]); + + scheduler.consistency_check(nodes.values(), [&tenant_state].into_iter())?; + + tenant_state.intent.clear(&mut scheduler); + Ok(()) + } } diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 4dde7bdf0b..732eb951c9 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -7,7 +7,7 @@ use utils::{ pub mod util; -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct Client { mgmt_api_endpoint: String, authorization_header: Option, @@ -24,6 +24,9 @@ pub enum Error { #[error("pageserver API: {1}")] ApiError(StatusCode, String), + + #[error("Cancelled")] + Cancelled, } pub type Result = std::result::Result; @@ -287,6 +290,21 @@ impl 
Client { .map_err(Error::ReceiveBody) } + pub async fn get_location_config( + &self, + tenant_shard_id: TenantShardId, + ) -> Result> { + let path = format!( + "{}/v1/location_config/{tenant_shard_id}", + self.mgmt_api_endpoint + ); + self.request(Method::GET, &path, ()) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + pub async fn timeline_create( &self, tenant_shard_id: TenantShardId, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 6aaf1ab27e..eafad9ab73 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -14,6 +14,7 @@ use hyper::header; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use metrics::launch_timestamp::LaunchTimestamp; +use pageserver_api::models::LocationConfig; use pageserver_api::models::LocationConfigListResponse; use pageserver_api::models::ShardParameters; use pageserver_api::models::TenantDetails; @@ -1519,6 +1520,29 @@ async fn list_location_config_handler( json_response(StatusCode::OK, result) } +async fn get_location_config_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let state = get_state(&request); + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let slot = state.tenant_manager.get(tenant_shard_id); + + let Some(slot) = slot else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant shard not found").into(), + )); + }; + + let result: Option = match slot { + TenantSlot::Attached(t) => Some(t.get_location_conf()), + TenantSlot::Secondary(s) => Some(s.get_location_conf()), + TenantSlot::InProgress(_) => None, + }; + + json_response(StatusCode::OK, result) +} + // Do a time travel recovery on the given tenant/tenant shard. Tenant needs to be detached // (from all pageservers) as it invalidates consistency assumptions. async fn tenant_time_travel_remote_storage_handler( @@ -2223,6 +2247,9 @@ pub fn make_router( .get("/v1/location_config", |r| { api_handler(r, list_location_config_handler) }) + .get("/v1/location_config/:tenant_id", |r| { + api_handler(r, get_location_config_handler) + }) .put( "/v1/tenant/:tenant_shard_id/time_travel_remote_storage", |r| api_handler(r, tenant_time_travel_remote_storage_handler), diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 06b61d4631..fc08b3c82e 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1358,6 +1358,16 @@ impl TenantManager { } } + pub(crate) fn get(&self, tenant_shard_id: TenantShardId) -> Option { + let locked = self.tenants.read().unwrap(); + match &*locked { + TenantsMap::Initializing => None, + TenantsMap::Open(map) | TenantsMap::ShuttingDown(map) => { + map.get(&tenant_shard_id).cloned() + } + } + } + pub(crate) async fn delete_tenant( &self, tenant_shard_id: TenantShardId, From ce7a82db058cecdba996a210b5afea8451bfbc4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 7 Mar 2024 18:32:09 +0100 Subject: [PATCH 0360/1571] Update svg_fmt (#7049) Gets upstream PR https://github.com/nical/rust_debug/pull/3 , removes trailing "s from output. 
--- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 167a2b2179..5c48942d41 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5525,9 +5525,9 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" [[package]] name = "svg_fmt" -version = "0.4.1" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fb1df15f412ee2e9dfc1c504260fa695c1c3f10fe9f4a6ee2d2184d7d6450e2" +checksum = "f83ba502a3265efb76efb89b0a2f7782ad6f2675015d4ce37e4b547dda42b499" [[package]] name = "syn" From 2fc89428c33508bee9fa5772c0c5c35ba3e38548 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Thu, 7 Mar 2024 09:12:06 -0900 Subject: [PATCH 0361/1571] Hopefully stabilize test_bad_connection.py (#6976) ## Problem It seems that even though we have a retry on basebackup, it still sometimes fails to fetch it with the failpoint enabled, resulting in a test error. ## Summary of changes If we fail to get the basebackup, disable the failpoint and try again. --- compute_tools/src/compute.rs | 8 ++++---- control_plane/src/endpoint.rs | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 5613e6c868..96ab4a06a5 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -396,9 +396,9 @@ impl ComputeNode { // Gets the basebackup in a retry loop #[instrument(skip_all, fields(%lsn))] pub fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> { - let mut retry_period_ms = 500; + let mut retry_period_ms = 500.0; let mut attempts = 0; - let max_attempts = 5; + let max_attempts = 10; loop { let result = self.try_get_basebackup(compute_state, lsn); match result { @@ -410,8 +410,8 @@ impl ComputeNode { "Failed to get basebackup: {} (attempt {}/{})", e, attempts, max_attempts ); - std::thread::sleep(std::time::Duration::from_millis(retry_period_ms)); - retry_period_ms *= 2; + std::thread::sleep(std::time::Duration::from_millis(retry_period_ms as u64)); + retry_period_ms *= 1.5; } Err(_) => { return result; diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 10e4c5d69f..ac0a8417ae 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -656,7 +656,7 @@ impl Endpoint { // Wait for it to start let mut attempt = 0; const ATTEMPT_INTERVAL: Duration = Duration::from_millis(100); - const MAX_ATTEMPTS: u32 = 10 * 30; // Wait up to 30 s + const MAX_ATTEMPTS: u32 = 10 * 90; // Wait up to 1.5 min loop { attempt += 1; match self.get_status().await { From 02358b21a41311be2ee610bd461093a68b14222e Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 7 Mar 2024 18:23:19 +0000 Subject: [PATCH 0362/1571] update rustls (#7048) ## Summary of changes Update rustls from 0.21 to 0.22. reqwest/tonic/aws-smithy still use rustls 0.21. no upgrade route available yet. 
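For illustration, the shape of the API change for callers building a TLS config (a minimal sketch, not code taken from this patch; the function name, use of `anyhow`, and the error handling are assumptions):

    use std::sync::Arc;
    use rustls::pki_types::{CertificateDer, PrivateKeyDer};

    fn make_server_config(
        cert_chain: Vec<CertificateDer<'static>>,
        key: PrivateKeyDer<'static>,
    ) -> anyhow::Result<Arc<rustls::ServerConfig>> {
        // rustls 0.21: ServerConfig::builder().with_safe_defaults()...
        // rustls 0.22: safe defaults are implied by builder(), and the cert/key
        // arguments use the DER types re-exported from rustls::pki_types.
        let config = rustls::ServerConfig::builder()
            .with_no_client_auth()
            .with_single_cert(cert_chain, key)?;
        Ok(Arc::new(config))
    }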
--- Cargo.lock | 293 +++++++++++++------ Cargo.toml | 10 +- libs/postgres_backend/tests/simple_select.rs | 19 +- proxy/src/bin/pg_sni_router.rs | 38 +-- proxy/src/config.rs | 54 ++-- proxy/src/proxy/tests.rs | 19 +- workspace_hack/Cargo.toml | 2 +- 7 files changed, 281 insertions(+), 154 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5c48942d41..7fd9053f62 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -241,7 +241,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -252,7 +252,7 @@ checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -626,7 +626,7 @@ dependencies = [ "once_cell", "pin-project-lite", "pin-utils", - "rustls", + "rustls 0.21.9", "tokio", "tracing", ] @@ -907,6 +907,16 @@ version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" +[[package]] +name = "bcder" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c627747a6774aab38beb35990d88309481378558875a41da1a4b2e373c906ef0" +dependencies = [ + "bytes", + "smallvec", +] + [[package]] name = "bincode" version = "1.3.3" @@ -935,7 +945,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn 2.0.32", + "syn 2.0.52", "which", ] @@ -986,9 +996,9 @@ checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" [[package]] name = "bytes" -version = "1.4.0" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89b2fd2a0dcf38d7971e2194b6b6eebab45ae01067456a7fd93d5547a61b70be" +checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" dependencies = [ "serde", ] @@ -1149,7 +1159,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -1574,7 +1584,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -1585,7 +1595,7 @@ checksum = "29a358ff9f12ec09c3e61fef9b5a9902623a695a46a917b07f269bff1445611a" dependencies = [ "darling_core", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -1627,6 +1637,16 @@ dependencies = [ "zeroize", ] +[[package]] +name = "der" +version = "0.7.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fffa369a668c8af7dbf8b5e56c9f744fbd399949ed171606040001947de40b1c" +dependencies = [ + "const-oid", + "zeroize", +] + [[package]] name = "der-parser" version = "8.2.0" @@ -1681,7 +1701,7 @@ dependencies = [ "diesel_table_macro_syntax", "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -1701,7 +1721,7 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc5557efc453706fed5e4fa85006fe9817c224c3f480a34c7e5959fd700921c5" dependencies = [ - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -1723,7 +1743,7 @@ checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -1747,10 +1767,10 @@ version = "0.14.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "413301934810f597c1d19ca71c8710e99a3f1ba28a0d2ebc01551a2daeea3c5c" dependencies = [ - "der", + "der 0.6.1", "elliptic-curve", "rfc6979", - "signature", + "signature 1.6.4", ] 
[[package]] @@ -1767,7 +1787,7 @@ checksum = "e7bb888ab5300a19b8e5bceef25ac745ad065f3c9f7efc6de1b91958110891d3" dependencies = [ "base16ct", "crypto-bigint 0.4.9", - "der", + "der 0.6.1", "digest", "ff", "generic-array", @@ -1827,7 +1847,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -2087,7 +2107,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -2470,10 +2490,10 @@ dependencies = [ "http 0.2.9", "hyper", "log", - "rustls", + "rustls 0.21.9", "rustls-native-certs", "tokio", - "tokio-rustls", + "tokio-rustls 0.24.0", ] [[package]] @@ -2711,7 +2731,7 @@ checksum = "5c7ea04a7c5c055c175f189b6dc6ba036fd62306b58c66c9f6389036c503a3f4" dependencies = [ "base64 0.21.1", "js-sys", - "pem 3.0.3", + "pem", "ring 0.17.6", "serde", "serde_json", @@ -3234,7 +3254,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -3716,7 +3736,7 @@ dependencies = [ "parquet", "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -3754,16 +3774,6 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" -[[package]] -name = "pem" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b13fe415cdf3c8e44518e18a7c95a13431d9bdf6d15367d82b23c377fdd441a" -dependencies = [ - "base64 0.21.1", - "serde", -] - [[package]] name = "pem" version = "3.0.3" @@ -3825,7 +3835,7 @@ checksum = "39407670928234ebc5e6e580247dd567ad73a3578460c5990f9503df207e8f07" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -3846,8 +3856,8 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9eca2c590a5f85da82668fa685c09ce2888b9430e83299debf1f34b65fd4a4ba" dependencies = [ - "der", - "spki", + "der 0.6.1", + "spki 0.6.0", ] [[package]] @@ -3946,14 +3956,14 @@ dependencies = [ "futures", "once_cell", "pq_proto", - "rustls", - "rustls-pemfile", + "rustls 0.22.2", + "rustls-pemfile 2.1.1", "serde", "thiserror", "tokio", "tokio-postgres", "tokio-postgres-rustls", - "tokio-rustls", + "tokio-rustls 0.25.0", "tracing", "workspace_hack", ] @@ -4042,7 +4052,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3b69d39aab54d069e7f2fe8cb970493e7834601ca2d8c65fd7bbd183578080d1" dependencies = [ "proc-macro2", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -4053,9 +4063,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" [[package]] name = "proc-macro2" -version = "1.0.66" +version = "1.0.78" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9" +checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" dependencies = [ "unicode-ident", ] @@ -4202,8 +4212,8 @@ dependencies = [ "routerify", "rstest", "rustc-hash", - "rustls", - "rustls-pemfile", + "rustls 0.22.2", + "rustls-pemfile 2.1.1", "scopeguard", "serde", "serde_json", @@ -4219,7 +4229,7 @@ dependencies = [ "tokio", "tokio-postgres", "tokio-postgres-rustls", - "tokio-rustls", + "tokio-rustls 0.25.0", "tokio-util", "tracing", "tracing-opentelemetry", @@ -4247,9 +4257,9 @@ dependencies = [ 
[[package]] name = "quote" -version = "1.0.32" +version = "1.0.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50f3b39ccfb720540debaa0164757101c08ecb8d326b15358ce76a62c7e85965" +checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" dependencies = [ "proc-macro2", ] @@ -4370,12 +4380,12 @@ dependencies = [ [[package]] name = "rcgen" -version = "0.11.1" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4954fbc00dcd4d8282c987710e50ba513d351400dbdd00e803a05172a90d8976" +checksum = "48406db8ac1f3cbc7dcdb56ec355343817958a356ff430259bb07baf7607e1e1" dependencies = [ - "pem 2.0.1", - "ring 0.16.20", + "pem", + "ring 0.17.6", "time", "yasna", ] @@ -4393,15 +4403,15 @@ dependencies = [ "itoa", "percent-encoding", "pin-project-lite", - "rustls", + "rustls 0.21.9", "rustls-native-certs", - "rustls-pemfile", + "rustls-pemfile 1.0.2", "rustls-webpki 0.101.7", "ryu", "sha1_smol", "socket2 0.4.9", "tokio", - "tokio-rustls", + "tokio-rustls 0.24.0", "tokio-util", "url", ] @@ -4547,14 +4557,14 @@ dependencies = [ "once_cell", "percent-encoding", "pin-project-lite", - "rustls", - "rustls-pemfile", + "rustls 0.21.9", + "rustls-pemfile 1.0.2", "serde", "serde_json", "serde_urlencoded", "tokio", "tokio-native-tls", - "tokio-rustls", + "tokio-rustls 0.24.0", "tokio-util", "tower-service", "url", @@ -4720,7 +4730,7 @@ dependencies = [ "regex", "relative-path", "rustc_version", - "syn 2.0.32", + "syn 2.0.52", "unicode-ident", ] @@ -4804,6 +4814,20 @@ dependencies = [ "sct", ] +[[package]] +name = "rustls" +version = "0.22.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e87c9956bd9807afa1f77e0f7594af32566e830e088a5576d27c5b6f30f49d41" +dependencies = [ + "log", + "ring 0.17.6", + "rustls-pki-types", + "rustls-webpki 0.102.2", + "subtle", + "zeroize", +] + [[package]] name = "rustls-native-certs" version = "0.6.2" @@ -4811,7 +4835,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0167bac7a9f490495f3c33013e7722b53cb087ecbe082fb0c6387c96f634ea50" dependencies = [ "openssl-probe", - "rustls-pemfile", + "rustls-pemfile 1.0.2", "schannel", "security-framework", ] @@ -4825,6 +4849,22 @@ dependencies = [ "base64 0.21.1", ] +[[package]] +name = "rustls-pemfile" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f48172685e6ff52a556baa527774f61fcaa884f59daf3375c62a3f1cd2549dab" +dependencies = [ + "base64 0.21.1", + "rustls-pki-types", +] + +[[package]] +name = "rustls-pki-types" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ede67b28608b4c60685c7d54122d4400d90f62b40caee7700e700380a390fa8" + [[package]] name = "rustls-webpki" version = "0.100.2" @@ -4845,6 +4885,17 @@ dependencies = [ "untrusted 0.9.0", ] +[[package]] +name = "rustls-webpki" +version = "0.102.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "faaa0a62740bedb9b2ef5afa303da42764c012f743917351dc9a237ea1663610" +dependencies = [ + "ring 0.17.6", + "rustls-pki-types", + "untrusted 0.9.0", +] + [[package]] name = "rustversion" version = "1.0.12" @@ -4887,7 +4938,7 @@ dependencies = [ "serde_with", "thiserror", "tokio", - "tokio-rustls", + "tokio-rustls 0.25.0", "tokio-stream", "tracing", "tracing-appender", @@ -5022,7 +5073,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3be24c1842290c45df0a7bf069e0c268a747ad05a192f2fd7dcfdbc1cba40928" 
dependencies = [ "base16ct", - "der", + "der 0.6.1", "generic-array", "pkcs8", "subtle", @@ -5066,7 +5117,7 @@ checksum = "2e95efd0cefa32028cdb9766c96de71d96671072f9fb494dc9fb84c0ef93e52b" dependencies = [ "httpdate", "reqwest", - "rustls", + "rustls 0.21.9", "sentry-backtrace", "sentry-contexts", "sentry-core", @@ -5188,7 +5239,7 @@ checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -5269,7 +5320,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -5355,6 +5406,15 @@ dependencies = [ "rand_core 0.6.4", ] +[[package]] +name = "signature" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de" +dependencies = [ + "rand_core 0.6.4", +] + [[package]] name = "simple_asn1" version = "0.6.2" @@ -5439,7 +5499,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "67cf02bbac7a337dc36e4f5a693db6c21e7863f45070f7064577eb4367a3212b" dependencies = [ "base64ct", - "der", + "der 0.6.1", +] + +[[package]] +name = "spki" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d91ed6c858b01f942cd56b37a94b3e0a1798290327d1236e4d9cf4eaca44d29d" +dependencies = [ + "base64ct", + "der 0.7.8", ] [[package]] @@ -5542,9 +5612,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.32" +version = "2.0.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "239814284fd6f1a4ffe4ca893952cdd93c224b6a1571c9a9eadd670295c0c9e2" +checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07" dependencies = [ "proc-macro2", "quote", @@ -5659,22 +5729,22 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.47" +version = "1.0.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97a802ec30afc17eee47b2855fc72e0c4cd62be9b4efe6591edde0ec5bd68d8f" +checksum = "1e45bcbe8ed29775f228095caf2cd67af7a4ccf756ebff23a306bf3e8b47b24b" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.47" +version = "1.0.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bb623b56e39ab7dcd4b1b98bb6c8f8d907ed255b18de254088016b27a8ee19b" +checksum = "a953cb265bef375dae3de6663da4d3804eee9682ea80d8e2542529b73c531c81" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -5845,7 +5915,7 @@ checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -5883,16 +5953,17 @@ dependencies = [ [[package]] name = "tokio-postgres-rustls" -version = "0.10.0" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd5831152cb0d3f79ef5523b357319ba154795d64c7078b2daa95a803b54057f" +checksum = "0ea13f22eda7127c827983bdaf0d7fff9df21c8817bab02815ac277a21143677" dependencies = [ "futures", - "ring 0.16.20", - "rustls", + "ring 0.17.6", + "rustls 0.22.2", "tokio", "tokio-postgres", - "tokio-rustls", + "tokio-rustls 0.25.0", + "x509-certificate", ] [[package]] @@ -5901,7 +5972,18 @@ version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0d409377ff5b1e3ca6437aa86c1eb7d40c134bfec254e44c830defa92669db5" dependencies = [ - "rustls", + "rustls 0.21.9", + "tokio", 
+] + +[[package]] +name = "tokio-rustls" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "775e0c0f0adb3a2f22a00c4745d728b479985fc15ee7ca6a2608388c5569860f" +dependencies = [ + "rustls 0.22.2", + "rustls-pki-types", "tokio", ] @@ -6016,9 +6098,9 @@ dependencies = [ "pin-project", "prost", "rustls-native-certs", - "rustls-pemfile", + "rustls-pemfile 1.0.2", "tokio", - "tokio-rustls", + "tokio-rustls 0.24.0", "tokio-stream", "tower", "tower-layer", @@ -6114,7 +6196,7 @@ checksum = "0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -6330,7 +6412,7 @@ dependencies = [ "base64 0.21.1", "log", "once_cell", - "rustls", + "rustls 0.21.9", "rustls-webpki 0.100.2", "url", "webpki-roots 0.23.1", @@ -6572,7 +6654,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", "wasm-bindgen-shared", ] @@ -6606,7 +6688,7 @@ checksum = "e128beba882dd1eb6200e1dc92ae6c5dbaa4311aa7bb211ca035779e5efc39f8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -6939,19 +7021,18 @@ dependencies = [ "regex-automata 0.4.3", "regex-syntax 0.8.2", "reqwest", - "ring 0.16.20", - "rustls", + "rustls 0.21.9", "scopeguard", "serde", "serde_json", "smallvec", "subtle", "syn 1.0.109", - "syn 2.0.32", + "syn 2.0.52", "time", "time-macros", "tokio", - "tokio-rustls", + "tokio-rustls 0.24.0", "tokio-util", "toml_datetime", "toml_edit", @@ -6962,11 +7043,31 @@ dependencies = [ "tungstenite", "url", "uuid", + "zeroize", "zstd", "zstd-safe", "zstd-sys", ] +[[package]] +name = "x509-certificate" +version = "0.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66534846dec7a11d7c50a74b7cdb208b9a581cad890b7866430d438455847c85" +dependencies = [ + "bcder", + "bytes", + "chrono", + "der 0.7.8", + "hex", + "pem", + "ring 0.17.6", + "signature 2.2.0", + "spki 0.7.3", + "thiserror", + "zeroize", +] + [[package]] name = "x509-parser" version = "0.15.0" @@ -7025,7 +7126,7 @@ checksum = "b3c129550b3e6de3fd0ba67ba5c81818f9805e58b8d7fee80a3a59d2c9fc601a" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -7033,6 +7134,20 @@ name = "zeroize" version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9" +dependencies = [ + "zeroize_derive", +] + +[[package]] +name = "zeroize_derive" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.52", +] [[package]] name = "zstd" diff --git a/Cargo.toml b/Cargo.toml index 42deaac19b..76f4ff041c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -129,8 +129,8 @@ reqwest-retry = "0.2.2" routerify = "3" rpds = "0.13" rustc-hash = "1.1.0" -rustls = "0.21" -rustls-pemfile = "1" +rustls = "0.22" +rustls-pemfile = "2" rustls-split = "0.3" scopeguard = "1.1" sysinfo = "0.29.2" @@ -159,8 +159,8 @@ tikv-jemalloc-ctl = "0.5" tokio = { version = "1.17", features = ["macros"] } tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" } tokio-io-timeout = "1.2.0" -tokio-postgres-rustls = "0.10.0" -tokio-rustls = "0.24" +tokio-postgres-rustls = "0.11.0" +tokio-rustls = "0.25" tokio-stream = "0.1" 
tokio-tar = "0.3" tokio-util = { version = "0.7.10", features = ["io", "rt"] } @@ -219,7 +219,7 @@ workspace_hack = { version = "0.1", path = "./workspace_hack/" } ## Build dependencies criterion = "0.5.1" -rcgen = "0.11" +rcgen = "0.12" rstest = "0.18" camino-tempfile = "1.0.2" tonic-build = "0.9" diff --git a/libs/postgres_backend/tests/simple_select.rs b/libs/postgres_backend/tests/simple_select.rs index e046fa5260..80df9db858 100644 --- a/libs/postgres_backend/tests/simple_select.rs +++ b/libs/postgres_backend/tests/simple_select.rs @@ -72,14 +72,19 @@ async fn simple_select() { } } -static KEY: Lazy = Lazy::new(|| { +static KEY: Lazy> = Lazy::new(|| { let mut cursor = Cursor::new(include_bytes!("key.pem")); - rustls::PrivateKey(rustls_pemfile::rsa_private_keys(&mut cursor).unwrap()[0].clone()) + let key = rustls_pemfile::rsa_private_keys(&mut cursor) + .next() + .unwrap() + .unwrap(); + rustls::pki_types::PrivateKeyDer::Pkcs1(key) }); -static CERT: Lazy = Lazy::new(|| { +static CERT: Lazy> = Lazy::new(|| { let mut cursor = Cursor::new(include_bytes!("cert.pem")); - rustls::Certificate(rustls_pemfile::certs(&mut cursor).unwrap()[0].clone()) + let cert = rustls_pemfile::certs(&mut cursor).next().unwrap().unwrap(); + cert }); // test that basic select with ssl works @@ -88,9 +93,8 @@ async fn simple_select_ssl() { let (client_sock, server_sock) = make_tcp_pair().await; let server_cfg = rustls::ServerConfig::builder() - .with_safe_defaults() .with_no_client_auth() - .with_single_cert(vec![CERT.clone()], KEY.clone()) + .with_single_cert(vec![CERT.clone()], KEY.clone_key()) .unwrap(); let tls_config = Some(Arc::new(server_cfg)); let pgbackend = @@ -102,10 +106,9 @@ async fn simple_select_ssl() { }); let client_cfg = rustls::ClientConfig::builder() - .with_safe_defaults() .with_root_certificates({ let mut store = rustls::RootCertStore::empty(); - store.add(&CERT).unwrap(); + store.add(CERT.clone()).unwrap(); store }) .with_no_client_auth(); diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index d5ab66d6aa..385f7820cb 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -10,6 +10,7 @@ use itertools::Itertools; use proxy::config::TlsServerEndPoint; use proxy::context::RequestMonitoring; use proxy::proxy::run_until_cancelled; +use rustls::pki_types::PrivateKeyDer; use tokio::net::TcpListener; use anyhow::{anyhow, bail, ensure, Context}; @@ -76,37 +77,40 @@ async fn main() -> anyhow::Result<()> { (Some(key_path), Some(cert_path)) => { let key = { let key_bytes = std::fs::read(key_path).context("TLS key file")?; - let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]) - .context(format!("Failed to read TLS keys at '{key_path}'"))?; + + let mut keys = + rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]).collect_vec(); ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); - keys.pop().map(rustls::PrivateKey).unwrap() + PrivateKeyDer::Pkcs8( + keys.pop() + .unwrap() + .context(format!("Failed to read TLS keys at '{key_path}'"))?, + ) }; let cert_chain_bytes = std::fs::read(cert_path) .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; - let cert_chain = { + let cert_chain: Vec<_> = { rustls_pemfile::certs(&mut &cert_chain_bytes[..]) - .context(format!( - "Failed to read TLS certificate chain from bytes from file at '{cert_path}'." - ))? 
- .into_iter() - .map(rustls::Certificate) - .collect_vec() + .try_collect() + .with_context(|| { + format!("Failed to read TLS certificate chain from bytes from file at '{cert_path}'.") + })? }; // needed for channel bindings let first_cert = cert_chain.first().context("missing certificate")?; let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; - let tls_config = rustls::ServerConfig::builder() - .with_safe_default_cipher_suites() - .with_safe_default_kx_groups() - .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])? - .with_no_client_auth() - .with_single_cert(cert_chain, key)? - .into(); + let tls_config = rustls::ServerConfig::builder_with_protocol_versions(&[ + &rustls::version::TLS13, + &rustls::version::TLS12, + ]) + .with_no_client_auth() + .with_single_cert(cert_chain, key)? + .into(); (tls_config, tls_server_end_point) } diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 9f276c3c24..437ec9f401 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,6 +1,10 @@ use crate::{auth, rate_limiter::RateBucketInfo, serverless::GlobalConnPoolOptions}; use anyhow::{bail, ensure, Context, Ok}; -use rustls::{sign, Certificate, PrivateKey}; +use itertools::Itertools; +use rustls::{ + crypto::ring::sign, + pki_types::{CertificateDer, PrivateKeyDer}, +}; use sha2::{Digest, Sha256}; use std::{ collections::{HashMap, HashSet}, @@ -88,14 +92,14 @@ pub fn configure_tls( let cert_resolver = Arc::new(cert_resolver); - let config = rustls::ServerConfig::builder() - .with_safe_default_cipher_suites() - .with_safe_default_kx_groups() - // allow TLS 1.2 to be compatible with older client libraries - .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])? - .with_no_client_auth() - .with_cert_resolver(cert_resolver.clone()) - .into(); + // allow TLS 1.2 to be compatible with older client libraries + let config = rustls::ServerConfig::builder_with_protocol_versions(&[ + &rustls::version::TLS13, + &rustls::version::TLS12, + ]) + .with_no_client_auth() + .with_cert_resolver(cert_resolver.clone()) + .into(); Ok(TlsConfig { config, @@ -133,14 +137,14 @@ pub enum TlsServerEndPoint { } impl TlsServerEndPoint { - pub fn new(cert: &Certificate) -> anyhow::Result { + pub fn new(cert: &CertificateDer) -> anyhow::Result { let sha256_oids = [ // I'm explicitly not adding MD5 or SHA1 here... They're bad. oid_registry::OID_SIG_ECDSA_WITH_SHA256, oid_registry::OID_PKCS1_SHA256WITHRSA, ]; - let pem = x509_parser::parse_x509_certificate(&cert.0) + let pem = x509_parser::parse_x509_certificate(cert) .context("Failed to parse PEM object from cerficiate")? 
.1; @@ -150,8 +154,7 @@ impl TlsServerEndPoint { let oid = pem.signature_algorithm.oid(); let alg = reg.get(oid); if sha256_oids.contains(oid) { - let tls_server_end_point: [u8; 32] = - Sha256::new().chain_update(&cert.0).finalize().into(); + let tls_server_end_point: [u8; 32] = Sha256::new().chain_update(cert).finalize().into(); info!(subject = %pem.subject, signature_algorithm = alg.map(|a| a.description()), tls_server_end_point = %base64::encode(tls_server_end_point), "determined channel binding"); Ok(Self::Sha256(tls_server_end_point)) } else { @@ -165,7 +168,7 @@ impl TlsServerEndPoint { } } -#[derive(Default)] +#[derive(Default, Debug)] pub struct CertResolver { certs: HashMap, TlsServerEndPoint)>, default: Option<(Arc, TlsServerEndPoint)>, @@ -185,11 +188,14 @@ impl CertResolver { let priv_key = { let key_bytes = std::fs::read(key_path) .context(format!("Failed to read TLS keys at '{key_path}'"))?; - let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]) - .context(format!("Failed to parse TLS keys at '{key_path}'"))?; + let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]).collect_vec(); ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); - keys.pop().map(rustls::PrivateKey).unwrap() + PrivateKeyDer::Pkcs8( + keys.pop() + .unwrap() + .context(format!("Failed to parse TLS keys at '{key_path}'"))?, + ) }; let cert_chain_bytes = std::fs::read(cert_path) @@ -197,14 +203,10 @@ impl CertResolver { let cert_chain = { rustls_pemfile::certs(&mut &cert_chain_bytes[..]) + .try_collect() .with_context(|| { - format!( - "Failed to read TLS certificate chain from bytes from file at '{cert_path}'." - ) + format!("Failed to read TLS certificate chain from bytes from file at '{cert_path}'.") })? - .into_iter() - .map(rustls::Certificate) - .collect() }; self.add_cert(priv_key, cert_chain, is_default) @@ -212,15 +214,15 @@ impl CertResolver { pub fn add_cert( &mut self, - priv_key: PrivateKey, - cert_chain: Vec, + priv_key: PrivateKeyDer<'static>, + cert_chain: Vec>, is_default: bool, ) -> anyhow::Result<()> { let key = sign::any_supported_type(&priv_key).context("invalid private key")?; let first_cert = &cert_chain[0]; let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; - let pem = x509_parser::parse_x509_certificate(&first_cert.0) + let pem = x509_parser::parse_x509_certificate(first_cert) .context("Failed to parse PEM object from cerficiate")? 
.1; diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index d866b1820f..5d0340e852 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -20,6 +20,7 @@ use crate::{http, sasl, scram}; use anyhow::{bail, Context}; use async_trait::async_trait; use rstest::rstest; +use rustls::pki_types; use tokio_postgres::config::SslMode; use tokio_postgres::tls::{MakeTlsConnect, NoTls}; use tokio_postgres_rustls::{MakeRustlsConnect, RustlsStream}; @@ -28,7 +29,11 @@ use tokio_postgres_rustls::{MakeRustlsConnect, RustlsStream}; fn generate_certs( hostname: &str, common_name: &str, -) -> anyhow::Result<(rustls::Certificate, rustls::Certificate, rustls::PrivateKey)> { +) -> anyhow::Result<( + pki_types::CertificateDer<'static>, + pki_types::CertificateDer<'static>, + pki_types::PrivateKeyDer<'static>, +)> { let ca = rcgen::Certificate::from_params({ let mut params = rcgen::CertificateParams::default(); params.is_ca = rcgen::IsCa::Ca(rcgen::BasicConstraints::Unconstrained); @@ -45,9 +50,9 @@ fn generate_certs( })?; Ok(( - rustls::Certificate(ca.serialize_der()?), - rustls::Certificate(cert.serialize_der_with_signer(&ca)?), - rustls::PrivateKey(cert.serialize_private_key_der()), + pki_types::CertificateDer::from(ca.serialize_der()?), + pki_types::CertificateDer::from(cert.serialize_der_with_signer(&ca)?), + pki_types::PrivateKeyDer::Pkcs8(cert.serialize_private_key_der().into()), )) } @@ -82,9 +87,8 @@ fn generate_tls_config<'a>( let tls_config = { let config = rustls::ServerConfig::builder() - .with_safe_defaults() .with_no_client_auth() - .with_single_cert(vec![cert.clone()], key.clone())? + .with_single_cert(vec![cert.clone()], key.clone_key())? .into(); let mut cert_resolver = CertResolver::new(); @@ -101,10 +105,9 @@ fn generate_tls_config<'a>( let client_config = { let config = rustls::ClientConfig::builder() - .with_safe_defaults() .with_root_certificates({ let mut store = rustls::RootCertStore::empty(); - store.add(&ca)?; + store.add(ca)?; store }) .with_no_client_auth(); diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index e808fabbe7..8593b752c2 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -60,7 +60,6 @@ regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } regex-syntax = { version = "0.8" } reqwest = { version = "0.11", default-features = false, features = ["blocking", "default-tls", "json", "multipart", "rustls-tls", "stream"] } -ring = { version = "0.16" } rustls = { version = "0.21", features = ["dangerous_configuration"] } scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } @@ -80,6 +79,7 @@ tracing-core = { version = "0.1" } tungstenite = { version = "0.20" } url = { version = "2", features = ["serde"] } uuid = { version = "1", features = ["serde", "v4", "v7"] } +zeroize = { version = "1", features = ["derive"] } zstd = { version = "0.13" } zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] } zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] } From 0f05ef67e28fc0c26e0b1300edad82d4e054e24f Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 7 Mar 2024 19:53:10 +0000 Subject: [PATCH 0363/1571] pageserver: revert open layer rolling revert (#6962) ## Problem We reverted https://github.com/neondatabase/neon/pull/6661 a few days ago. 
The change led to OOMs in benchmarks followed by large WAL reingests. The issue was that we removed [this code](https://github.com/neondatabase/neon/blob/d04af08567cc3ff94ff19a2f6b3f7a2a1e3c55d1/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs#L409-L417). That call may trigger a roll of the open layer due to the keepalive messages received from the safekeeper. Removing it meant that enforcing of checkpoint timeout became even more lax and led to using up large amounts of memory for the in memory layer indices. ## Summary of changes Piggyback on keep alive messages to enforce checkpoint timeout. This is a hack, but it's exactly what the current code is doing. ## Alternatives Christhian, Joonas and myself sketched out a timer based approach [here](https://github.com/neondatabase/neon/pull/6940). While discussing it further, it became obvious that's also a bit of a hack and not the desired end state. I chose not to take that further since it's not what we ultimately want and it'll be harder to rip out. Right now it's unclear what the ideal system behaviour is: * early flushing on memory pressure, or ... * detaching tenants on memory pressure --- pageserver/src/pgdatadir_mapping.rs | 17 +- pageserver/src/tenant.rs | 36 +- .../tenant/storage_layer/inmemory_layer.rs | 38 +- pageserver/src/tenant/timeline.rs | 375 +++++++++++++----- .../walreceiver/walreceiver_connection.rs | 36 +- test_runner/performance/test_layer_map.py | 4 +- 6 files changed, 322 insertions(+), 184 deletions(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 628aeb5a28..727650a5a5 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -15,6 +15,7 @@ use crate::walrecord::NeonWalRecord; use anyhow::{ensure, Context}; use bytes::{Buf, Bytes, BytesMut}; use enum_map::Enum; +use itertools::Itertools; use pageserver_api::key::{ dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key, @@ -1498,7 +1499,7 @@ impl<'a> DatadirModification<'a> { return Ok(()); } - let writer = self.tline.writer().await; + let mut writer = self.tline.writer().await; // Flush relation and SLRU data blocks, keep metadata. let mut retained_pending_updates = HashMap::<_, Vec<_>>::new(); @@ -1537,14 +1538,22 @@ impl<'a> DatadirModification<'a> { /// All the modifications in this atomic update are stamped by the specified LSN. /// pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> { - let writer = self.tline.writer().await; + let mut writer = self.tline.writer().await; let pending_nblocks = self.pending_nblocks; self.pending_nblocks = 0; if !self.pending_updates.is_empty() { - writer.put_batch(&self.pending_updates, ctx).await?; - self.pending_updates.clear(); + // The put_batch call below expects expects the inputs to be sorted by Lsn, + // so we do that first. 
+ let lsn_ordered_batch: Vec<(Key, Lsn, Value)> = self + .pending_updates + .drain() + .map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (key, lsn, val))) + .kmerge_by(|lhs, rhs| lhs.1 .0 < rhs.1 .0) + .collect(); + + writer.put_batch(lsn_ordered_batch, ctx).await?; } if !self.pending_deletions.is_empty() { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 2f23e535fa..4f4654422b 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3857,7 +3857,7 @@ mod tests { .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( *TEST_KEY, @@ -3869,7 +3869,7 @@ mod tests { writer.finish_write(Lsn(0x10)); drop(writer); - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( *TEST_KEY, @@ -3935,7 +3935,7 @@ mod tests { let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; - let writer = tline.writer().await; + let mut writer = tline.writer().await; #[allow(non_snake_case)] let TEST_KEY_A: Key = Key::from_hex("110000000033333333444444445500000001").unwrap(); @@ -3969,7 +3969,7 @@ mod tests { let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) .expect("Should have a local timeline"); - let new_writer = newtline.writer().await; + let mut new_writer = newtline.writer().await; new_writer .put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"), &ctx) .await?; @@ -4001,7 +4001,7 @@ mod tests { ) -> anyhow::Result<()> { let mut lsn = start_lsn; { - let writer = tline.writer().await; + let mut writer = tline.writer().await; // Create a relation on the timeline writer .put( @@ -4026,7 +4026,7 @@ mod tests { } tline.freeze_and_flush().await?; { - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( *TEST_KEY, @@ -4389,7 +4389,7 @@ mod tests { .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( *TEST_KEY, @@ -4406,7 +4406,7 @@ mod tests { .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) .await?; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( *TEST_KEY, @@ -4423,7 +4423,7 @@ mod tests { .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) .await?; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( *TEST_KEY, @@ -4440,7 +4440,7 @@ mod tests { .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) .await?; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( *TEST_KEY, @@ -4497,7 +4497,7 @@ mod tests { for _ in 0..repeat { for _ in 0..key_count { test_key.field6 = blknum; - let writer = timeline.writer().await; + let mut writer = timeline.writer().await; writer .put( test_key, @@ -4690,7 +4690,7 @@ mod tests { current_lsn += 0x100; - let writer = current_timeline.writer().await; + let mut writer = current_timeline.writer().await; writer .put( gap_at_key, @@ -4729,7 +4729,7 @@ mod tests { current_lsn += 0x10; - let writer = child_timeline.writer().await; + let mut writer = child_timeline.writer().await; writer .put( current_key, @@ -4807,7 +4807,7 @@ mod tests { for blknum in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); test_key.field6 = blknum as u32; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( test_key, @@ -4828,7 +4828,7 @@ mod tests { 
lsn = Lsn(lsn.0 + 0x10); let blknum = thread_rng().gen_range(0..NUM_KEYS); test_key.field6 = blknum as u32; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( test_key, @@ -4896,7 +4896,7 @@ mod tests { for blknum in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); test_key.field6 = blknum as u32; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( test_key, @@ -4925,7 +4925,7 @@ mod tests { lsn = Lsn(lsn.0 + 0x10); let blknum = thread_rng().gen_range(0..NUM_KEYS); test_key.field6 = blknum as u32; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( test_key, @@ -5002,7 +5002,7 @@ mod tests { lsn = Lsn(lsn.0 + 0x10); let blknum = thread_rng().gen_range(0..NUM_KEYS); test_key.field6 = blknum as u32; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( test_key, diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index e7da28b8d6..5f1db21d49 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -336,32 +336,17 @@ impl InMemoryLayer { /// Common subroutine of the public put_wal_record() and put_page_image() functions. /// Adds the page version to the in-memory tree + pub(crate) async fn put_value( &self, key: Key, lsn: Lsn, - val: &Value, + buf: &[u8], ctx: &RequestContext, ) -> Result<()> { let mut inner = self.inner.write().await; self.assert_writable(); - self.put_value_locked(&mut inner, key, lsn, val, ctx).await - } - - pub(crate) async fn put_values( - &self, - values: &HashMap>, - ctx: &RequestContext, - ) -> Result<()> { - let mut inner = self.inner.write().await; - self.assert_writable(); - for (key, vals) in values { - for (lsn, val) in vals { - self.put_value_locked(&mut inner, *key, *lsn, val, ctx) - .await?; - } - } - Ok(()) + self.put_value_locked(&mut inner, key, lsn, buf, ctx).await } async fn put_value_locked( @@ -369,22 +354,16 @@ impl InMemoryLayer { locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>, key: Key, lsn: Lsn, - val: &Value, + buf: &[u8], ctx: &RequestContext, ) -> Result<()> { trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn); let off = { - // Avoid doing allocations for "small" values. 
- // In the regression test suite, the limit of 256 avoided allocations in 95% of cases: - // https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061 - let mut buf = smallvec::SmallVec::<[u8; 256]>::new(); - buf.clear(); - val.ser_into(&mut buf)?; locked_inner .file .write_blob( - &buf, + buf, &RequestContextBuilder::extend(ctx) .page_content_kind(PageContentKind::InMemoryLayer) .build(), @@ -412,7 +391,12 @@ impl InMemoryLayer { pub async fn freeze(&self, end_lsn: Lsn) { let inner = self.inner.write().await; - assert!(self.start_lsn < end_lsn); + assert!( + self.start_lsn < end_lsn, + "{} >= {}", + self.start_lsn, + end_lsn + ); self.end_lsn.set(end_lsn).expect("end_lsn set only once"); for vec_map in inner.index.values() { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 71a958206c..7004db1cb5 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -27,6 +27,18 @@ use pageserver_api::{ }; use rand::Rng; use serde_with::serde_as; +use storage_broker::BrokerClientChannel; +use tokio::{ + runtime::Handle, + sync::{oneshot, watch}, +}; +use tokio_util::sync::CancellationToken; +use tracing::*; +use utils::{ + bin_ser::BeSer, + sync::gate::{Gate, GateGuard}, +}; + use std::ops::{Deref, Range}; use std::pin::pin; use std::sync::atomic::Ordering as AtomicOrdering; @@ -41,14 +53,6 @@ use std::{ cmp::{max, min, Ordering}, ops::ControlFlow, }; -use storage_broker::BrokerClientChannel; -use tokio::{ - runtime::Handle, - sync::{oneshot, watch}, -}; -use tokio_util::sync::CancellationToken; -use tracing::*; -use utils::sync::gate::{Gate, GateGuard}; use crate::tenant::timeline::logical_size::CurrentLogicalSize; use crate::tenant::{ @@ -271,7 +275,7 @@ pub struct Timeline { /// Locked automatically by [`TimelineWriter`] and checkpointer. /// Must always be acquired before the layer map/individual layer lock /// to avoid deadlock. - write_lock: tokio::sync::Mutex<()>, + write_lock: tokio::sync::Mutex>, /// Used to avoid multiple `flush_loop` tasks running pub(super) flush_loop_state: Mutex, @@ -917,8 +921,6 @@ impl Timeline { seq: &Bytes, vec: &Bytes, ) { - use utils::bin_ser::BeSer; - if *key == AUX_FILES_KEY { // The value reconstruct of AUX_FILES_KEY from records is not deterministic // since it uses a hash map under the hood. Hence, deserialise both results @@ -1149,58 +1151,10 @@ impl Timeline { pub(crate) async fn writer(&self) -> TimelineWriter<'_> { TimelineWriter { tl: self, - _write_guard: self.write_lock.lock().await, + write_guard: self.write_lock.lock().await, } } - /// Check if more than 'checkpoint_distance' of WAL has been accumulated in - /// the in-memory layer, and initiate flushing it if so. - /// - /// Also flush after a period of time without new data -- it helps - /// safekeepers to regard pageserver as caught up and suspend activity. - pub(crate) async fn check_checkpoint_distance(self: &Arc) -> anyhow::Result<()> { - let last_lsn = self.get_last_record_lsn(); - let open_layer_size = { - let guard = self.layers.read().await; - let layers = guard.layer_map(); - let Some(open_layer) = layers.open_layer.as_ref() else { - return Ok(()); - }; - open_layer.size().await? - }; - let last_freeze_at = self.last_freeze_at.load(); - let last_freeze_ts = *(self.last_freeze_ts.read().unwrap()); - let distance = last_lsn.widening_sub(last_freeze_at); - // Rolling the open layer can be triggered by: - // 1. The distance from the last LSN we rolled at. 
This bounds the amount of WAL that - // the safekeepers need to store. For sharded tenants, we multiply by shard count to - // account for how writes are distributed across shards: we expect each node to consume - // 1/count of the LSN on average. - // 2. The size of the currently open layer. - // 3. The time since the last roll. It helps safekeepers to regard pageserver as caught - // up and suspend activity. - if (distance - >= self.get_checkpoint_distance() as i128 * self.shard_identity.count.count() as i128) - || open_layer_size > self.get_checkpoint_distance() - || (distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout()) - { - info!( - "check_checkpoint_distance {}, layer size {}, elapsed since last flush {:?}", - distance, - open_layer_size, - last_freeze_ts.elapsed() - ); - - self.freeze_inmem_layer(true).await; - self.last_freeze_at.store(last_lsn); - *(self.last_freeze_ts.write().unwrap()) = Instant::now(); - - // Wake up the layer flusher - self.flush_frozen_layers(); - } - Ok(()) - } - pub(crate) fn activate( self: &Arc, broker_client: BrokerClientChannel, @@ -1635,7 +1589,7 @@ impl Timeline { layer_flush_start_tx, layer_flush_done_tx, - write_lock: tokio::sync::Mutex::new(()), + write_lock: tokio::sync::Mutex::new(None), gc_info: std::sync::RwLock::new(GcInfo { retain_lsns: Vec::new(), @@ -2961,43 +2915,6 @@ impl Timeline { Ok(layer) } - async fn put_value( - &self, - key: Key, - lsn: Lsn, - val: &Value, - ctx: &RequestContext, - ) -> anyhow::Result<()> { - //info!("PUT: key {} at {}", key, lsn); - let layer = self.get_layer_for_write(lsn).await?; - layer.put_value(key, lsn, val, ctx).await?; - Ok(()) - } - - async fn put_values( - &self, - values: &HashMap>, - ctx: &RequestContext, - ) -> anyhow::Result<()> { - // Pick the first LSN in the batch to get the layer to write to. - for lsns in values.values() { - if let Some((lsn, _)) = lsns.first() { - let layer = self.get_layer_for_write(*lsn).await?; - layer.put_values(values, ctx).await?; - break; - } - } - Ok(()) - } - - async fn put_tombstones(&self, tombstones: &[(Range, Lsn)]) -> anyhow::Result<()> { - if let Some((_, lsn)) = tombstones.first() { - let layer = self.get_layer_for_write(*lsn).await?; - layer.put_tombstones(tombstones).await?; - } - Ok(()) - } - pub(crate) fn finish_write(&self, new_lsn: Lsn) { assert!(new_lsn.is_aligned()); @@ -3008,14 +2925,20 @@ impl Timeline { async fn freeze_inmem_layer(&self, write_lock_held: bool) { // Freeze the current open in-memory layer. It will be written to disk on next // iteration. + let _write_guard = if write_lock_held { None } else { Some(self.write_lock.lock().await) }; + + self.freeze_inmem_layer_at(self.get_last_record_lsn()).await; + } + + async fn freeze_inmem_layer_at(&self, at: Lsn) { let mut guard = self.layers.write().await; guard - .try_freeze_in_memory_layer(self.get_last_record_lsn(), &self.last_freeze_at) + .try_freeze_in_memory_layer(at, &self.last_freeze_at) .await; } @@ -4392,13 +4315,43 @@ fn layer_traversal_error(msg: String, path: Vec) -> PageRecon PageReconstructError::from(msg) } +struct TimelineWriterState { + open_layer: Arc, + current_size: u64, + // Previous Lsn which passed through + prev_lsn: Option, + // Largest Lsn which passed through the current writer + max_lsn: Option, + // Cached details of the last freeze. Avoids going trough the atomic/lock on every put. 
+ cached_last_freeze_at: Lsn, + cached_last_freeze_ts: Instant, +} + +impl TimelineWriterState { + fn new( + open_layer: Arc, + current_size: u64, + last_freeze_at: Lsn, + last_freeze_ts: Instant, + ) -> Self { + Self { + open_layer, + current_size, + prev_lsn: None, + max_lsn: None, + cached_last_freeze_at: last_freeze_at, + cached_last_freeze_ts: last_freeze_ts, + } + } +} + /// Various functions to mutate the timeline. // TODO Currently, Deref is used to allow easy access to read methods from this trait. // This is probably considered a bad practice in Rust and should be fixed eventually, // but will cause large code changes. pub(crate) struct TimelineWriter<'a> { tl: &'a Timeline, - _write_guard: tokio::sync::MutexGuard<'a, ()>, + write_guard: tokio::sync::MutexGuard<'a, Option>, } impl Deref for TimelineWriter<'_> { @@ -4409,31 +4362,239 @@ impl Deref for TimelineWriter<'_> { } } +impl Drop for TimelineWriter<'_> { + fn drop(&mut self) { + self.write_guard.take(); + } +} + +#[derive(PartialEq)] +enum OpenLayerAction { + Roll, + Open, + None, +} + impl<'a> TimelineWriter<'a> { /// Put a new page version that can be constructed from a WAL record /// /// This will implicitly extend the relation, if the page is beyond the /// current end-of-file. pub(crate) async fn put( - &self, + &mut self, key: Key, lsn: Lsn, value: &Value, ctx: &RequestContext, ) -> anyhow::Result<()> { - self.tl.put_value(key, lsn, value, ctx).await + // Avoid doing allocations for "small" values. + // In the regression test suite, the limit of 256 avoided allocations in 95% of cases: + // https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061 + let mut buf = smallvec::SmallVec::<[u8; 256]>::new(); + value.ser_into(&mut buf)?; + let buf_size: u64 = buf.len().try_into().expect("oversized value buf"); + + let action = self.get_open_layer_action(lsn, buf_size); + let layer = self.handle_open_layer_action(lsn, action).await?; + let res = layer.put_value(key, lsn, &buf, ctx).await; + + if res.is_ok() { + // Update the current size only when the entire write was ok. + // In case of failures, we may have had partial writes which + // render the size tracking out of sync. That's ok because + // the checkpoint distance should be significantly smaller + // than the S3 single shot upload limit of 5GiB. + let state = self.write_guard.as_mut().unwrap(); + + state.current_size += buf_size; + state.prev_lsn = Some(lsn); + state.max_lsn = std::cmp::max(state.max_lsn, Some(lsn)); + } + + res } + /// "Tick" the timeline writer: it will roll the open layer if required + /// and do nothing else. + pub(crate) async fn tick(&mut self) -> anyhow::Result<()> { + self.open_layer_if_present().await?; + + let last_record_lsn = self.get_last_record_lsn(); + let action = self.get_open_layer_action(last_record_lsn, 0); + if action == OpenLayerAction::Roll { + self.roll_layer(last_record_lsn).await?; + } + + Ok(()) + } + + /// Populate the timeline writer state only if an in-memory layer + /// is already open. 
+ async fn open_layer_if_present(&mut self) -> anyhow::Result<()> { + assert!(self.write_guard.is_none()); + + let open_layer = { + let guard = self.layers.read().await; + let layers = guard.layer_map(); + match layers.open_layer { + Some(ref open_layer) => open_layer.clone(), + None => { + return Ok(()); + } + } + }; + + let initial_size = open_layer.size().await?; + let last_freeze_at = self.last_freeze_at.load(); + let last_freeze_ts = *self.last_freeze_ts.read().unwrap(); + self.write_guard.replace(TimelineWriterState::new( + open_layer, + initial_size, + last_freeze_at, + last_freeze_ts, + )); + + Ok(()) + } + + async fn handle_open_layer_action( + &mut self, + at: Lsn, + action: OpenLayerAction, + ) -> anyhow::Result<&Arc> { + match action { + OpenLayerAction::Roll => { + let freeze_at = self.write_guard.as_ref().unwrap().max_lsn.unwrap(); + self.roll_layer(freeze_at).await?; + self.open_layer(at).await?; + } + OpenLayerAction::Open => self.open_layer(at).await?, + OpenLayerAction::None => { + assert!(self.write_guard.is_some()); + } + } + + Ok(&self.write_guard.as_ref().unwrap().open_layer) + } + + async fn open_layer(&mut self, at: Lsn) -> anyhow::Result<()> { + let layer = self.tl.get_layer_for_write(at).await?; + let initial_size = layer.size().await?; + + let last_freeze_at = self.last_freeze_at.load(); + let last_freeze_ts = *self.last_freeze_ts.read().unwrap(); + self.write_guard.replace(TimelineWriterState::new( + layer, + initial_size, + last_freeze_at, + last_freeze_ts, + )); + + Ok(()) + } + + async fn roll_layer(&mut self, freeze_at: Lsn) -> anyhow::Result<()> { + assert!(self.write_guard.is_some()); + + self.tl.freeze_inmem_layer_at(freeze_at).await; + + let now = Instant::now(); + *(self.last_freeze_ts.write().unwrap()) = now; + + self.tl.flush_frozen_layers(); + + let current_size = self.write_guard.as_ref().unwrap().current_size; + if current_size > self.get_checkpoint_distance() { + warn!("Flushed oversized open layer with size {}", current_size) + } + + Ok(()) + } + + fn get_open_layer_action(&self, lsn: Lsn, new_value_size: u64) -> OpenLayerAction { + let state = &*self.write_guard; + let Some(state) = &state else { + return OpenLayerAction::Open; + }; + + if state.prev_lsn == Some(lsn) { + // Rolling mid LSN is not supported by downstream code. + // Hence, only roll at LSN boundaries. + return OpenLayerAction::None; + } + + if state.current_size == 0 { + // Don't roll empty layers + return OpenLayerAction::None; + } + + let distance = lsn.widening_sub(state.cached_last_freeze_at); + let proposed_open_layer_size = state.current_size + new_value_size; + + // Rolling the open layer can be triggered by: + // 1. The distance from the last LSN we rolled at. This bounds the amount of WAL that + // the safekeepers need to store. For sharded tenants, we multiply by shard count to + // account for how writes are distributed across shards: we expect each node to consume + // 1/count of the LSN on average. + // 2. The size of the currently open layer. + // 3. The time since the last roll. It helps safekeepers to regard pageserver as caught + // up and suspend activity. 
+ if distance + >= self.get_checkpoint_distance() as i128 * self.shard_identity.count.count() as i128 + { + info!( + "Will roll layer at {} with layer size {} due to LSN distance ({})", + lsn, state.current_size, distance + ); + + OpenLayerAction::Roll + } else if proposed_open_layer_size >= self.get_checkpoint_distance() { + info!( + "Will roll layer at {} with layer size {} due to layer size ({})", + lsn, state.current_size, proposed_open_layer_size + ); + + OpenLayerAction::Roll + } else if distance > 0 + && state.cached_last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() + { + info!( + "Will roll layer at {} with layer size {} due to time since last flush ({:?})", + lsn, + state.current_size, + state.cached_last_freeze_ts.elapsed() + ); + + OpenLayerAction::Roll + } else { + OpenLayerAction::None + } + } + + /// Put a batch keys at the specified Lsns. + /// + /// The batch should be sorted by Lsn such that it's safe + /// to roll the open layer mid batch. pub(crate) async fn put_batch( - &self, - batch: &HashMap>, + &mut self, + batch: Vec<(Key, Lsn, Value)>, ctx: &RequestContext, ) -> anyhow::Result<()> { - self.tl.put_values(batch, ctx).await + for (key, lsn, val) in batch { + self.put(key, lsn, &val, ctx).await? + } + + Ok(()) } - pub(crate) async fn delete_batch(&self, batch: &[(Range, Lsn)]) -> anyhow::Result<()> { - self.tl.put_tombstones(batch).await + pub(crate) async fn delete_batch(&mut self, batch: &[(Range, Lsn)]) -> anyhow::Result<()> { + if let Some((_, lsn)) = batch.first() { + let action = self.get_open_layer_action(*lsn, 0); + let layer = self.handle_open_layer_action(*lsn, action).await?; + layer.put_tombstones(batch).await?; + } + + Ok(()) } /// Track the end of the latest digested WAL record. diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 9cb53f46d1..8297ca6563 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -343,23 +343,6 @@ pub(super) async fn handle_walreceiver_connection( modification.commit(&ctx).await?; uncommitted_records = 0; filtered_records = 0; - - // - // We should check checkpoint distance after appending each ingest_batch_size bytes because otherwise - // layer size can become much larger than `checkpoint_distance`. - // It can append because wal-sender is sending WAL using 125kb chucks and some WAL records can cause writing large - // amount of data to key-value storage. So performing this check only after processing - // all WAL records in the chunk, can cause huge L0 layer files. - // - timeline - .check_checkpoint_distance() - .await - .with_context(|| { - format!( - "Failed to check checkpoint distance for timeline {}", - timeline.timeline_id - ) - })?; } } @@ -406,15 +389,16 @@ pub(super) async fn handle_walreceiver_connection( } } - timeline - .check_checkpoint_distance() - .await - .with_context(|| { - format!( - "Failed to check checkpoint distance for timeline {}", - timeline.timeline_id - ) - })?; + { + // This is a hack. It piggybacks on the keepalive messages sent by the + // safekeeper in order to enforce `checkpoint_timeout` on the currently + // open layer. This hack doesn't provide a bound on the total size of + // in-memory layers on a pageserver. See https://github.com/neondatabase/neon/issues/6916. 
+ let mut writer = timeline.writer().await; + if let Err(err) = writer.tick().await { + warn!("Timeline writer tick failed: {err}"); + } + } if let Some(last_lsn) = status_update { let timeline_remote_consistent_lsn = timeline diff --git a/test_runner/performance/test_layer_map.py b/test_runner/performance/test_layer_map.py index 6bd0d85fa2..9b20954d45 100644 --- a/test_runner/performance/test_layer_map.py +++ b/test_runner/performance/test_layer_map.py @@ -17,10 +17,10 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark): tenant, _ = env.neon_cli.create_tenant( conf={ "gc_period": "0s", - "checkpoint_distance": "8192", + "checkpoint_distance": "16384", "compaction_period": "1 s", "compaction_threshold": "1", - "compaction_target_size": "8192", + "compaction_target_size": "16384", } ) From 2c132e45cb624a39ac7f23ea78f082078277a450 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 8 Mar 2024 07:56:23 +0000 Subject: [PATCH 0364/1571] proxy: do not store ephemeral endpoints in http pool (#6819) ## Problem For the ephemeral endpoint feature, it's not really too helpful to keep them around in the connection pool. This isn't really pressing but I think it's still a bit better this way. ## Summary of changes Add `is_ephemeral` function to `NeonOptions`. Allow `serverless::ConnInfo::endpoint_cache_key()` to return an `Option`. Handle that option appropriately --- proxy/src/proxy.rs | 5 +++++ proxy/src/serverless/conn_pool.rs | 30 +++++++++++++++++++++--------- 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index aeba08bc4f..7848fc2ac2 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -380,6 +380,11 @@ impl NeonOptions { Self::parse_from_iter(StartupMessageParams::parse_options_raw(options)) } + pub fn is_ephemeral(&self) -> bool { + // Currently, neon endpoint options are all reserved for ephemeral endpoints. + !self.0.is_empty() + } + fn parse_from_iter<'a>(options: impl Iterator) -> Self { let mut options = options .filter_map(neon_option) diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 7d705ba049..73f213d074 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -43,8 +43,13 @@ impl ConnInfo { (self.dbname.clone(), self.user_info.user.clone()) } - pub fn endpoint_cache_key(&self) -> EndpointCacheKey { - self.user_info.endpoint_cache_key() + pub fn endpoint_cache_key(&self) -> Option { + // We don't want to cache http connections for ephemeral endpoints. 
+ if self.user_info.options.is_ephemeral() { + None + } else { + Some(self.user_info.endpoint_cache_key()) + } } } @@ -360,8 +365,11 @@ impl GlobalConnPool { conn_info: &ConnInfo, ) -> Result>, HttpConnError> { let mut client: Option> = None; + let Some(endpoint) = conn_info.endpoint_cache_key() else { + return Ok(None); + }; - let endpoint_pool = self.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key()); + let endpoint_pool = self.get_or_create_endpoint_pool(&endpoint); if let Some(entry) = endpoint_pool .write() .get_conn_entry(conn_info.db_and_user()) @@ -455,8 +463,10 @@ pub fn poll_client( span.in_scope(|| { info!(%conn_info, %session_id, "new connection"); }); - let pool = - Arc::downgrade(&global_pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key())); + let pool = match conn_info.endpoint_cache_key() { + Some(endpoint) => Arc::downgrade(&global_pool.get_or_create_endpoint_pool(&endpoint)), + None => Weak::new(), + }; let pool_clone = pool.clone(); let db_user = conn_info.db_and_user(); @@ -723,8 +733,9 @@ mod tests { dbname: "dbname".into(), password: "password".as_bytes().into(), }; - let ep_pool = - Arc::downgrade(&pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key())); + let ep_pool = Arc::downgrade( + &pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key().unwrap()), + ); { let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); assert_eq!(0, pool.get_global_connections_count()); @@ -780,8 +791,9 @@ mod tests { dbname: "dbname".into(), password: "password".as_bytes().into(), }; - let ep_pool = - Arc::downgrade(&pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key())); + let ep_pool = Arc::downgrade( + &pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key().unwrap()), + ); { let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); client.do_drop().unwrap()(); From 7329413705be0939b550553be2f40d4bb11a1a9b Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 8 Mar 2024 15:34:53 +0000 Subject: [PATCH 0365/1571] storage controller: enable setting PlacementPolicy in tenant creation (#7037) ## Problem Tenants created via the storage controller have a `PlacementPolicy` that defines their HA/secondary/detach intent. For backward compat we can just set it to Single, for onboarding tenants using /location_conf it is automatically set to Double(1) if there are at least two pageservers, but for freshly created tenants we didn't have a way to specify it. This unblocks writing tests that create HA tenants on the storage controller and do failure injection testing. ## Summary of changes - Add optional fields to TenantCreateRequest for specifying PlacementPolicy. This request structure is used both on pageserver API and storage controller API, but this method is only meaningful for the storage controller (same as existing `shard_parameters` attribute). - Use the value from the creation request in tenant creation, if provided. 
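For example, a creation request aimed at the storage controller can now carry the policy explicitly. A minimal sketch, not part of this patch — the `generation`/`config` values and their exact types are assumptions; the other names come from the diffs below:

```rust
use pageserver_api::controller_api::PlacementPolicy;
use pageserver_api::models::{ShardParameters, TenantConfig, TenantCreateRequest};
use pageserver_api::shard::TenantShardId;
use utils::id::TenantId;

// Ask for one attached location plus one secondary; leaving the field as None
// keeps the backward-compatible default (PlacementPolicy::Single).
fn example_create_request(tenant_id: TenantId) -> TenantCreateRequest {
    TenantCreateRequest {
        new_tenant_id: TenantShardId::unsharded(tenant_id),
        generation: None,
        shard_parameters: ShardParameters::default(),
        placement_policy: Some(PlacementPolicy::Double(1)),
        config: TenantConfig::default(),
    }
}
```

On the `neon_local` side the same value is passed as its JSON encoding, e.g. `--placement-policy '{"Double":1}'`, matching the serialization test added in `controller_api.rs`.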
--- control_plane/attachment_service/src/http.rs | 7 +-- control_plane/attachment_service/src/lib.rs | 25 +-------- .../attachment_service/src/persistence.rs | 11 ++-- .../attachment_service/src/service.rs | 55 ++++++++++--------- .../attachment_service/src/tenant_state.rs | 3 +- control_plane/src/bin/neon_local.rs | 9 ++- control_plane/src/pageserver.rs | 2 + libs/pageserver_api/src/controller_api.rs | 40 ++++++++++++++ libs/pageserver_api/src/models.rs | 6 ++ 9 files changed, 92 insertions(+), 66 deletions(-) diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs index 384bdcef0c..7e4030b221 100644 --- a/control_plane/attachment_service/src/http.rs +++ b/control_plane/attachment_service/src/http.rs @@ -1,6 +1,5 @@ use crate::reconciler::ReconcileError; use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT}; -use crate::PlacementPolicy; use hyper::{Body, Request, Response}; use hyper::{StatusCode, Uri}; use pageserver_api::models::{ @@ -119,13 +118,9 @@ async fn handle_tenant_create( let create_req = json_request::(&mut req).await?; - // TODO: enable specifying this. Using Single as a default helps legacy tests to work (they - // have no expectation of HA). - let placement_policy = PlacementPolicy::Single; - json_response( StatusCode::CREATED, - service.tenant_create(create_req, placement_policy).await?, + service.tenant_create(create_req).await?, ) } diff --git a/control_plane/attachment_service/src/lib.rs b/control_plane/attachment_service/src/lib.rs index 7ae7e264c7..796b465c10 100644 --- a/control_plane/attachment_service/src/lib.rs +++ b/control_plane/attachment_service/src/lib.rs @@ -1,4 +1,4 @@ -use serde::{Deserialize, Serialize}; +use serde::Serialize; use utils::seqwait::MonotonicCounter; mod auth; @@ -13,23 +13,6 @@ mod schema; pub mod service; mod tenant_state; -#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)] -enum PlacementPolicy { - /// Cheapest way to attach a tenant: just one pageserver, no secondary - Single, - /// Production-ready way to attach a tenant: one attached pageserver and - /// some number of secondaries. - Double(usize), - /// Create one secondary mode locations. This is useful when onboarding - /// a tenant, or for an idle tenant that we might want to bring online quickly. - Secondary, - - /// Do not attach to any pageservers. This is appropriate for tenants that - /// have been idle for a long time, where we do not mind some delay in making - /// them available in future. 
- Detached, -} - #[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone, Serialize)] struct Sequence(u64); @@ -66,9 +49,3 @@ impl Sequence { Sequence(self.0 + 1) } } - -impl Default for PlacementPolicy { - fn default() -> Self { - PlacementPolicy::Double(1) - } -} diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs index d5c304385c..d5c6d74ebe 100644 --- a/control_plane/attachment_service/src/persistence.rs +++ b/control_plane/attachment_service/src/persistence.rs @@ -7,11 +7,9 @@ use self::split_state::SplitState; use camino::Utf8Path; use camino::Utf8PathBuf; use diesel::pg::PgConnection; -use diesel::{ - Connection, ExpressionMethods, Insertable, QueryDsl, QueryResult, Queryable, RunQueryDsl, - Selectable, SelectableHelper, -}; -use pageserver_api::controller_api::NodeSchedulingPolicy; +use diesel::prelude::*; +use diesel::Connection; +use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy}; use pageserver_api::models::TenantConfig; use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId}; use serde::{Deserialize, Serialize}; @@ -19,7 +17,6 @@ use utils::generation::Generation; use utils::id::{NodeId, TenantId}; use crate::node::Node; -use crate::PlacementPolicy; /// ## What do we store? /// @@ -210,7 +207,7 @@ impl Persistence { tenant.tenant_id = tenant_id.to_string(); tenant.config = serde_json::to_string(&TenantConfig::default()) .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?; - tenant.placement_policy = serde_json::to_string(&PlacementPolicy::default()) + tenant.placement_policy = serde_json::to_string(&PlacementPolicy::Single) .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?; } } diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index f41c4f89b9..556d6a6828 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -16,9 +16,9 @@ use futures::{stream::FuturesUnordered, StreamExt}; use hyper::StatusCode; use pageserver_api::{ controller_api::{ - NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantCreateResponse, - TenantCreateResponseShard, TenantLocateResponse, TenantShardMigrateRequest, - TenantShardMigrateResponse, + NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, PlacementPolicy, + TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse, + TenantShardMigrateRequest, TenantShardMigrateResponse, }, models::TenantConfigRequest, }; @@ -57,7 +57,7 @@ use crate::{ IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError, ReconcilerWaiter, TenantState, }, - PlacementPolicy, Sequence, + Sequence, }; // For operations that should be quick, like attaching a new tenant @@ -176,7 +176,7 @@ impl From for ApiError { #[allow(clippy::large_enum_variant)] enum TenantCreateOrUpdate { - Create((TenantCreateRequest, PlacementPolicy)), + Create(TenantCreateRequest), Update(Vec), } @@ -792,7 +792,7 @@ impl Service { shard_stripe_size: 0, generation: Some(0), generation_pageserver: None, - placement_policy: serde_json::to_string(&PlacementPolicy::default()).unwrap(), + placement_policy: serde_json::to_string(&PlacementPolicy::Single).unwrap(), config: serde_json::to_string(&TenantConfig::default()).unwrap(), splitting: SplitState::default(), }; @@ -1053,9 +1053,8 @@ impl Service { pub(crate) async fn tenant_create( &self, create_req: TenantCreateRequest, - 
placement_policy: PlacementPolicy, ) -> Result { - let (response, waiters) = self.do_tenant_create(create_req, placement_policy).await?; + let (response, waiters) = self.do_tenant_create(create_req).await?; self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await?; Ok(response) @@ -1064,8 +1063,13 @@ impl Service { pub(crate) async fn do_tenant_create( &self, create_req: TenantCreateRequest, - placement_policy: PlacementPolicy, ) -> Result<(TenantCreateResponse, Vec), ApiError> { + // As a default, single is convenient for tests that don't choose a policy. + let placement_policy = create_req + .placement_policy + .clone() + .unwrap_or(PlacementPolicy::Single); + // This service expects to handle sharding itself: it is an error to try and directly create // a particular shard here. let tenant_id = if !create_req.new_tenant_id.is_unsharded() { @@ -1339,22 +1343,20 @@ impl Service { TenantCreateOrUpdate::Create( // Synthesize a creation request - ( - TenantCreateRequest { - new_tenant_id: TenantShardId::unsharded(tenant_id), - generation, - shard_parameters: ShardParameters { - // Must preserve the incoming shard_count do distinguish unsharded (0) - // from single-sharded (1): this distinction appears in the S3 keys of the tenant. - count: req.tenant_id.shard_count, - // We only import un-sharded or single-sharded tenants, so stripe - // size can be made up arbitrarily here. - stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE, - }, - config: req.config.tenant_conf, + TenantCreateRequest { + new_tenant_id: TenantShardId::unsharded(tenant_id), + generation, + shard_parameters: ShardParameters { + // Must preserve the incoming shard_count do distinguish unsharded (0) + // from single-sharded (1): this distinction appears in the S3 keys of the tenant. + count: req.tenant_id.shard_count, + // We only import un-sharded or single-sharded tenants, so stripe + // size can be made up arbitrarily here. 
+ stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE, }, - placement_policy, - ), + placement_policy: Some(placement_policy), + config: req.config.tenant_conf, + }, ) } else { TenantCreateOrUpdate::Update(updates) @@ -1393,9 +1395,8 @@ impl Service { stripe_size: None, }; let waiters = match create_or_update { - TenantCreateOrUpdate::Create((create_req, placement_policy)) => { - let (create_resp, waiters) = - self.do_tenant_create(create_req, placement_policy).await?; + TenantCreateOrUpdate::Create(create_req) => { + let (create_resp, waiters) = self.do_tenant_create(create_req).await?; result.shards = create_resp .shards .into_iter() diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs index ddb9866527..c775736b31 100644 --- a/control_plane/attachment_service/src/tenant_state.rs +++ b/control_plane/attachment_service/src/tenant_state.rs @@ -5,6 +5,7 @@ use std::{ }; use crate::{metrics, persistence::TenantShardPersistence}; +use pageserver_api::controller_api::PlacementPolicy; use pageserver_api::{ models::{LocationConfig, LocationConfigMode, TenantConfig}, shard::{ShardIdentity, TenantShardId}, @@ -28,7 +29,7 @@ use crate::{ attached_location_conf, secondary_location_conf, ReconcileError, Reconciler, TargetState, }, scheduler::{ScheduleError, Scheduler}, - service, PlacementPolicy, Sequence, + service, Sequence, }; /// Serialization helper diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 1feec5cd9b..27abcb182a 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -15,7 +15,7 @@ use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR}; use control_plane::safekeeper::SafekeeperNode; use control_plane::{broker, local_env}; use pageserver_api::controller_api::{ - NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, + NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, PlacementPolicy, }; use pageserver_api::models::{ ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo, @@ -435,6 +435,11 @@ async fn handle_tenant( let shard_stripe_size: Option = create_match.get_one::("shard-stripe-size").cloned(); + let placement_policy = match create_match.get_one::("placement-policy") { + Some(s) if !s.is_empty() => serde_json::from_str::(s)?, + _ => PlacementPolicy::Single, + }; + let tenant_conf = PageServerNode::parse_config(tenant_conf)?; // If tenant ID was not specified, generate one @@ -456,6 +461,7 @@ async fn handle_tenant( .map(ShardStripeSize) .unwrap_or(ShardParameters::DEFAULT_STRIPE_SIZE), }, + placement_policy: Some(placement_policy), config: tenant_conf, }) .await?; @@ -1562,6 +1568,7 @@ fn cli() -> Command { .help("Use this tenant in future CLI commands where tenant_id is needed, but not specified")) .arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)")) .arg(Arg::new("shard-stripe-size").value_parser(value_parser!(u32)).long("shard-stripe-size").action(ArgAction::Set).help("Sharding stripe size in pages")) + .arg(Arg::new("placement-policy").value_parser(value_parser!(String)).long("placement-policy").action(ArgAction::Set).help("Placement policy shards in this tenant")) ) .subcommand(Command::new("set-default").arg(tenant_id_arg.clone().required(true)) .about("Set a particular tenant as default in future CLI commands where tenant_id is needed, but not specified")) diff --git 
a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index b2904c1191..ae1bd60c52 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -429,6 +429,8 @@ impl PageServerNode { generation, config, shard_parameters: ShardParameters::default(), + // Placement policy is not meaningful for creations not done via storage controller + placement_policy: None, }; if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 64b70a1a51..38e61239c5 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -125,5 +125,45 @@ impl From for String { } } +/// Controls how tenant shards are mapped to locations on pageservers, e.g. whether +/// to create secondary locations. +#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)] +pub enum PlacementPolicy { + /// Cheapest way to attach a tenant: just one pageserver, no secondary + Single, + /// Production-ready way to attach a tenant: one attached pageserver and + /// some number of secondaries. + Double(usize), + /// Create one secondary mode locations. This is useful when onboarding + /// a tenant, or for an idle tenant that we might want to bring online quickly. + Secondary, + + /// Do not attach to any pageservers. This is appropriate for tenants that + /// have been idle for a long time, where we do not mind some delay in making + /// them available in future. + Detached, +} + #[derive(Serialize, Deserialize, Debug)] pub struct TenantShardMigrateResponse {} + +#[cfg(test)] +mod test { + use super::*; + use serde_json; + + /// Check stability of PlacementPolicy's serialization + #[test] + fn placement_policy_encoding() -> anyhow::Result<()> { + let v = PlacementPolicy::Double(1); + let encoded = serde_json::to_string(&v)?; + assert_eq!(encoded, "{\"Double\":1}"); + assert_eq!(serde_json::from_str::(&encoded)?, v); + + let v = PlacementPolicy::Single; + let encoded = serde_json::to_string(&v)?; + assert_eq!(encoded, "\"Single\""); + assert_eq!(serde_json::from_str::(&encoded)?, v); + Ok(()) + } +} diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 57497e3831..fe5bbd1c06 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -21,6 +21,7 @@ use utils::{ lsn::Lsn, }; +use crate::controller_api::PlacementPolicy; use crate::{ reltag::RelTag, shard::{ShardCount, ShardStripeSize, TenantShardId}, @@ -242,6 +243,11 @@ pub struct TenantCreateRequest { #[serde(skip_serializing_if = "ShardParameters::is_unsharded")] pub shard_parameters: ShardParameters, + // This parameter is only meaningful in requests sent to the storage controller + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub placement_policy: Option, + #[serde(flatten)] pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it } From 86e8c43ddf817c7e3ee112e5c399cc5d60b34f29 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Fri, 8 Mar 2024 20:42:35 +0000 Subject: [PATCH 0366/1571] Add downgrade scripts for neon extension. (#7065) ## Problem When we start compute with newer version of extension (i.e. 1.2) and then rollback the release, downgrading the compute version, next compute start will try to update extension to the latest version available in neon.control (i.e. 1.1). 
Thus we need to provide downgrade scripts like neon--1.2--1.1.sql These scripts must revert the changes made by the upgrade scripts in the reverse order. This is necessary to ensure that the next upgrade will work correctly. In general, we need to write upgrade and downgrade scripts to be more robust and add IF EXISTS / CREATE OR REPLACE clauses to all statements (where applicable). ## Summary of changes Adds downgrade scripts. Adds test cases for extension downgrade/upgrade. fixes #7066 This is a follow-up for https://app.incident.io/neondb/incidents/167?tab=follow-ups Signed-off-by: Alex Chi Z Co-authored-by: Alex Chi Z Co-authored-by: Anastasia Lubennikova --- pgxn/neon/Makefile | 2 +- pgxn/neon/neon--1.1--1.0.sql | 6 +++++ pgxn/neon/neon--1.2--1.1.sql | 1 + pgxn/neon/neon--1.3--1.2.sql | 1 + test_runner/regress/test_neon_extension.py | 31 ++++++++++++++++++++++ 5 files changed, 40 insertions(+), 1 deletion(-) create mode 100644 pgxn/neon/neon--1.1--1.0.sql create mode 100644 pgxn/neon/neon--1.2--1.1.sql create mode 100644 pgxn/neon/neon--1.3--1.2.sql diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index 7ea767ec74..0bcb9545a6 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -21,7 +21,7 @@ SHLIB_LINK_INTERNAL = $(libpq) SHLIB_LINK = -lcurl EXTENSION = neon -DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql +DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql PGFILEDESC = "neon - cloud storage for PostgreSQL" EXTRA_CLEAN = \ diff --git a/pgxn/neon/neon--1.1--1.0.sql b/pgxn/neon/neon--1.1--1.0.sql new file mode 100644 index 0000000000..e83e3104e8 --- /dev/null +++ b/pgxn/neon/neon--1.1--1.0.sql @@ -0,0 +1,6 @@ +-- the order of operations is important here +-- because the view depends on the function + +DROP VIEW IF EXISTS neon_lfc_stats CASCADE; + +DROP FUNCTION IF EXISTS neon_get_lfc_stats CASCADE; diff --git a/pgxn/neon/neon--1.2--1.1.sql b/pgxn/neon/neon--1.2--1.1.sql new file mode 100644 index 0000000000..c9f6a40f73 --- /dev/null +++ b/pgxn/neon/neon--1.2--1.1.sql @@ -0,0 +1 @@ +DROP VIEW IF EXISTS NEON_STAT_FILE_CACHE CASCADE; diff --git a/pgxn/neon/neon--1.3--1.2.sql b/pgxn/neon/neon--1.3--1.2.sql new file mode 100644 index 0000000000..2733a15c75 --- /dev/null +++ b/pgxn/neon/neon--1.3--1.2.sql @@ -0,0 +1 @@ +DROP FUNCTION IF EXISTS approximate_working_set_size(bool) CASCADE; diff --git a/test_runner/regress/test_neon_extension.py b/test_runner/regress/test_neon_extension.py index 1179a3afe9..e31e1cab51 100644 --- a/test_runner/regress/test_neon_extension.py +++ b/test_runner/regress/test_neon_extension.py @@ -29,3 +29,34 @@ def test_neon_extension(neon_env_builder: NeonEnvBuilder): log.info(res) assert len(res) == 1 assert len(res[0]) == 5 + + +# Verify that the neon extension can be upgraded/downgraded. 
+def test_neon_extension_compatibility(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + env.neon_cli.create_branch("test_neon_extension_compatibility") + + endpoint_main = env.endpoints.create("test_neon_extension_compatibility") + # don't skip pg_catalog updates - it runs CREATE EXTENSION neon + endpoint_main.respec(skip_pg_catalog_updates=False) + endpoint_main.start() + + with closing(endpoint_main.connect()) as conn: + with conn.cursor() as cur: + all_versions = ["1.3", "1.2", "1.1", "1.0"] + current_version = "1.3" + for idx, begin_version in enumerate(all_versions): + for target_version in all_versions[idx + 1 :]: + if current_version != begin_version: + cur.execute( + f"ALTER EXTENSION neon UPDATE TO '{begin_version}'; -- {current_version}->{begin_version}" + ) + current_version = begin_version + # downgrade + cur.execute( + f"ALTER EXTENSION neon UPDATE TO '{target_version}'; -- {begin_version}->{target_version}" + ) + # upgrade + cur.execute( + f"ALTER EXTENSION neon UPDATE TO '{begin_version}'; -- {target_version}->{begin_version}" + ) From 4834d22d2d99bb7f9726c1cac3176550cc404e38 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Fri, 8 Mar 2024 13:24:30 -0900 Subject: [PATCH 0367/1571] Revoke REPLICATION (#7052) ## Problem Currently users can cause problems with replication ## Summary of changes Don't let them replicate --- compute_tools/src/spec.rs | 16 ++++++++++++++-- test_runner/regress/test_migrations.py | 2 +- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 84a5a263af..ba3a84cda8 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -302,9 +302,9 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { RoleAction::Create => { // This branch only runs when roles are created through the console, so it is // safe to add more permissions here. BYPASSRLS and REPLICATION are inherited - // from neon_superuser. + // from neon_superuser. (NOTE: REPLICATION has been removed from here for now). let mut query: String = format!( - "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser", + "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser", name.pg_quote() ); info!("running role create query: '{}'", &query); @@ -805,6 +805,18 @@ $$;"#, "", "", // Add new migrations below. 
+ r#" +DO $$ +DECLARE + role_name TEXT; +BEGIN + FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE + LOOP + RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name); + EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION'; + END LOOP; +END +$$;"#, ]; let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration"; diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py index 3f626c5c7c..526ae14b87 100644 --- a/test_runner/regress/test_migrations.py +++ b/test_runner/regress/test_migrations.py @@ -15,7 +15,7 @@ def test_migrations(neon_simple_env: NeonEnv): endpoint.wait_for_migrations() - num_migrations = 8 + num_migrations = 9 with endpoint.cursor() as cur: cur.execute("SELECT id FROM neon_migration.migration_id") From 74d24582cfe67f4115b54d26e5fb787a221dcae4 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Sat, 9 Mar 2024 13:37:02 +0100 Subject: [PATCH 0368/1571] throttling: exclude throttled time from basebackup (fixup of #6953) (#7072) PR #6953 only excluded throttled time from the handle_pagerequests (aka smgr metrics). This PR implements the deduction for `basebackup ` queries. The other page_service methods either don't use Timeline::get or they aren't used in production. Found by manually inspecting in [staging logs](https://neonprod.grafana.net/explore?schemaVersion=1&panes=%7B%22wx8%22:%7B%22datasource%22:%22xHHYY0dVz%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22expr%22:%22%7Bhostname%3D%5C%22pageserver-0.eu-west-1.aws.neon.build%5C%22%7D%20%7C~%20%60git-env%7CERR%7CWARN%60%22,%22queryType%22:%22range%22,%22datasource%22:%7B%22type%22:%22loki%22,%22uid%22:%22xHHYY0dVz%22%7D,%22editorMode%22:%22code%22%7D%5D,%22range%22:%7B%22to%22:%221709919114642%22,%22from%22:%221709904430898%22%7D%7D%7D). --- libs/metrics/src/lib.rs | 1 - libs/metrics/src/metric_vec_duration.rs | 23 --------- pageserver/src/metrics.rs | 63 +++++++++++++++++++++++-- pageserver/src/page_service.rs | 50 ++++++++++---------- 4 files changed, 83 insertions(+), 54 deletions(-) delete mode 100644 libs/metrics/src/metric_vec_duration.rs diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 744fc18e61..22b0a18933 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -29,7 +29,6 @@ pub mod launch_timestamp; mod wrappers; pub use wrappers::{CountedReader, CountedWriter}; mod hll; -pub mod metric_vec_duration; pub use hll::{HyperLogLog, HyperLogLogVec}; #[cfg(target_os = "linux")] pub mod more_process_metrics; diff --git a/libs/metrics/src/metric_vec_duration.rs b/libs/metrics/src/metric_vec_duration.rs deleted file mode 100644 index e9a0a65570..0000000000 --- a/libs/metrics/src/metric_vec_duration.rs +++ /dev/null @@ -1,23 +0,0 @@ -//! Helpers for observing duration on `HistogramVec` / `CounterVec` / `GaugeVec` / `MetricVec`. 
- -use std::{future::Future, time::Instant}; - -pub trait DurationResultObserver { - fn observe_result(&self, res: &Result, duration: std::time::Duration); -} - -pub async fn observe_async_block_duration_by_result< - T, - E, - F: Future>, - O: DurationResultObserver, ->( - observer: &O, - block: F, -) -> Result { - let start = Instant::now(); - let result = block.await; - let duration = start.elapsed(); - observer.observe_result(&result, duration); - result -} diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index ee62ee0367..27e754e999 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1,5 +1,4 @@ use enum_map::EnumMap; -use metrics::metric_vec_duration::DurationResultObserver; use metrics::{ register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, @@ -1283,11 +1282,65 @@ pub(crate) static BASEBACKUP_QUERY_TIME: Lazy = Lazy::new(| }) }); -impl DurationResultObserver for BasebackupQueryTime { - fn observe_result(&self, res: &Result, duration: std::time::Duration) { +pub(crate) struct BasebackupQueryTimeOngoingRecording<'a, 'c> { + parent: &'a BasebackupQueryTime, + ctx: &'c RequestContext, + start: std::time::Instant, +} + +impl BasebackupQueryTime { + pub(crate) fn start_recording<'c: 'a, 'a>( + &'a self, + ctx: &'c RequestContext, + ) -> BasebackupQueryTimeOngoingRecording<'_, '_> { + let start = Instant::now(); + match ctx.micros_spent_throttled.open() { + Ok(()) => (), + Err(error) => { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); + let mut rate_limit = LOGGED.lock().unwrap(); + rate_limit.call(|| { + warn!(error, "error opening micros_spent_throttled; this message is logged at a global rate limit"); + }); + } + } + BasebackupQueryTimeOngoingRecording { + parent: self, + ctx, + start, + } + } +} + +impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> { + pub(crate) fn observe(self, res: &Result) { + let elapsed = self.start.elapsed(); + let ex_throttled = self + .ctx + .micros_spent_throttled + .close_and_checked_sub_from(elapsed); + let ex_throttled = match ex_throttled { + Ok(ex_throttled) => ex_throttled, + Err(error) => { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); + let mut rate_limit = LOGGED.lock().unwrap(); + rate_limit.call(|| { + warn!(error, "error deducting time spent throttled; this message is logged at a global rate limit"); + }); + elapsed + } + }; let label_value = if res.is_ok() { "ok" } else { "error" }; - let metric = self.0.get_metric_with_label_values(&[label_value]).unwrap(); - metric.observe(duration.as_secs_f64()); + let metric = self + .parent + .0 + .get_metric_with_label_values(&[label_value]) + .unwrap(); + metric.observe(ex_throttled.as_secs_f64()); } } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index dacee41e6e..f3ceb7d3e6 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1199,7 +1199,7 @@ impl PageServerHandler { prev_lsn: Option, full_backup: bool, gzip: bool, - ctx: RequestContext, + ctx: &RequestContext, ) -> Result<(), QueryError> where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, @@ -1214,7 +1214,7 @@ impl PageServerHandler { if let Some(lsn) = lsn { // Backup was requested at a particular LSN. Wait for it to arrive. 
info!("waiting for {}", lsn); - timeline.wait_lsn(lsn, &ctx).await?; + timeline.wait_lsn(lsn, ctx).await?; timeline .check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn) .context("invalid basebackup lsn")?; @@ -1236,7 +1236,7 @@ impl PageServerHandler { lsn, prev_lsn, full_backup, - &ctx, + ctx, ) .await?; } else { @@ -1257,7 +1257,7 @@ impl PageServerHandler { lsn, prev_lsn, full_backup, - &ctx, + ctx, ) .await?; // shutdown the encoder to ensure the gzip footer is written @@ -1269,7 +1269,7 @@ impl PageServerHandler { lsn, prev_lsn, full_backup, - &ctx, + ctx, ) .await?; } @@ -1449,25 +1449,25 @@ where false }; - ::metrics::metric_vec_duration::observe_async_block_duration_by_result( - &*metrics::BASEBACKUP_QUERY_TIME, - async move { - self.handle_basebackup_request( - pgb, - tenant_id, - timeline_id, - lsn, - None, - false, - gzip, - ctx, - ) - .await?; - pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; - Result::<(), QueryError>::Ok(()) - }, - ) - .await?; + let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx); + let res = async { + self.handle_basebackup_request( + pgb, + tenant_id, + timeline_id, + lsn, + None, + false, + gzip, + &ctx, + ) + .await?; + pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + Result::<(), QueryError>::Ok(()) + } + .await; + metric_recording.observe(&res); + res?; } // return pair of prev_lsn and last_lsn else if query_string.starts_with("get_last_record_rlsn ") { @@ -1563,7 +1563,7 @@ where prev_lsn, true, false, - ctx, + &ctx, ) .await?; pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; From b09d68633510bdb12b017fb01ac055ffe7298833 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Sat, 9 Mar 2024 15:09:08 +0200 Subject: [PATCH 0369/1571] fix: on-demand downloads can outlive timeline shutdown (#7051) ## Problem Before this PR, it was possible that on-demand downloads were started after `Timeline::shutdown()`. For example, we have observed a walreceiver-connection-handler-initiated on-demand download that was started after `Timeline::shutdown()`s final `task_mgr::shutdown_tasks()` call. The underlying issue is that `task_mgr::shutdown_tasks()` isn't sticky, i.e., new tasks can be spawned during or after `task_mgr::shutdown_tasks()`. Cc: https://github.com/neondatabase/neon/issues/4175 in lieu of a more specific issue for task_mgr. We already decided we want to get rid of it anyways. Original investigation: https://neondb.slack.com/archives/C033RQ5SPDH/p1709824952465949 ## Changes - enter gate while downloading - use timeline cancellation token for cancelling download thereby, fixes #7054 Entering the gate might also remove recent "kept the gate from closing" in staging. 
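In pattern form, the fix makes the download task's lifetime subordinate to the timeline: a gate guard is acquired before spawning, and the timeline's cancellation token (rather than the global task_mgr one) is what interrupts the download. A standalone sketch follows; the `utils::sync::gate::Gate` import path is an assumption, while the `enter()`/`close()` behaviour matches how the gate is used in the diffs below (`enter()` fails once shutdown has begun closing the gate, and `close().await` waits for outstanding guards):

```rust
use tokio_util::sync::CancellationToken;
use utils::sync::gate::Gate;

async fn spawn_background_download(
    gate: &Gate,
    cancel: CancellationToken,
) -> Result<(), &'static str> {
    // Refuse to start new work once Timeline::shutdown() has started closing the gate.
    let guard = gate.enter().map_err(|_| "timeline is shutting down")?;
    tokio::task::spawn(async move {
        // Holding the guard means gate.close() in shutdown() waits for this task.
        let _guard = guard;
        tokio::select! {
            _ = cancel.cancelled() => { /* shutdown requested: give up early */ }
            _ = download() => { /* finished normally */ }
        }
    });
    Ok(())
}

async fn download() { /* placeholder for the remote-storage call */ }
```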
--- libs/remote_storage/tests/test_real_s3.rs | 26 +++++++++++-------- pageserver/src/task_mgr.rs | 3 --- pageserver/src/tenant/storage_layer/layer.rs | 27 ++++++++------------ test_runner/regress/test_tenant_delete.py | 2 ++ test_runner/regress/test_timeline_delete.py | 4 ++- 5 files changed, 31 insertions(+), 31 deletions(-) diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index e927b40e80..d8b9824d99 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -17,6 +17,7 @@ use remote_storage::{ }; use test_context::test_context; use test_context::AsyncTestContext; +use tokio::io::AsyncBufReadExt; use tokio_util::sync::CancellationToken; use tracing::info; @@ -484,32 +485,33 @@ async fn download_is_cancelled(ctx: &mut MaybeEnabledStorage) { )) .unwrap(); - let len = upload_large_enough_file(&ctx.client, &path, &cancel).await; + let file_len = upload_large_enough_file(&ctx.client, &path, &cancel).await; { - let mut stream = ctx + let stream = ctx .client .download(&path, &cancel) .await .expect("download succeeds") .download_stream; - let first = stream - .next() - .await - .expect("should have the first blob") - .expect("should have succeeded"); + let mut reader = std::pin::pin!(tokio_util::io::StreamReader::new(stream)); - tracing::info!(len = first.len(), "downloaded first chunk"); + let first = reader.fill_buf().await.expect("should have the first blob"); + + let len = first.len(); + tracing::info!(len, "downloaded first chunk"); assert!( - first.len() < len, + first.len() < file_len, "uploaded file is too small, we downloaded all on first chunk" ); + reader.consume(len); + cancel.cancel(); - let next = stream.next().await.expect("stream should have more"); + let next = reader.fill_buf().await; let e = next.expect_err("expected an error, but got a chunk?"); @@ -520,6 +522,10 @@ async fn download_is_cancelled(ctx: &mut MaybeEnabledStorage) { .is_some_and(|e| matches!(e, DownloadError::Cancelled)), "{inner:?}" ); + + let e = DownloadError::from(e); + + assert!(matches!(e, DownloadError::Cancelled), "{e:?}"); } let cancel = CancellationToken::new(); diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index adaa55c179..275a72c0b0 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -272,9 +272,6 @@ pub enum TaskKind { // Task that uploads a file to remote storage RemoteUploadTask, - // Task that downloads a file from remote storage - RemoteDownloadTask, - // task that handles the initial downloading of all tenants InitialLoad, diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 6c46b83622..aabb13b15c 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -880,23 +880,18 @@ impl LayerInner { ) -> Result { debug_assert_current_span_has_tenant_and_timeline_id(); - let task_name = format!("download layer {}", self); - let (tx, rx) = tokio::sync::oneshot::channel(); - // this is sadly needed because of task_mgr::shutdown_tasks, otherwise we cannot - // block tenant::mgr::remove_tenant_from_memory. 
- let this: Arc = self.clone(); - crate::task_mgr::spawn( - &tokio::runtime::Handle::current(), - crate::task_mgr::TaskKind::RemoteDownloadTask, - Some(self.desc.tenant_shard_id), - Some(self.desc.timeline_id), - &task_name, - false, - async move { + let guard = timeline + .gate + .enter() + .map_err(|_| DownloadError::DownloadCancelled)?; + + tokio::task::spawn(async move { + + let _guard = guard; let client = timeline .remote_client @@ -906,7 +901,7 @@ impl LayerInner { let result = client.download_layer_file( &this.desc.filename(), &this.metadata(), - &crate::task_mgr::shutdown_token() + &timeline.cancel ) .await; @@ -929,7 +924,6 @@ impl LayerInner { tokio::select! { _ = tokio::time::sleep(backoff) => {}, - _ = crate::task_mgr::shutdown_token().cancelled_owned() => {}, _ = timeline.cancel.cancelled() => {}, }; @@ -959,11 +953,10 @@ impl LayerInner { } } } - - Ok(()) } .in_current_span(), ); + match rx.await { Ok((Ok(()), permit)) => { if let Some(reason) = self diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index c4b4e5fb77..52de889084 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -190,6 +190,8 @@ def test_delete_tenant_exercise_crash_safety_failpoints( # So by ignoring these instead of waiting for empty upload queue # we execute more distinct code paths. '.*stopping left-over name="remote upload".*', + # an on-demand is cancelled by shutdown + ".*initial size calculation failed: downloading failed, possibly for shutdown", ] ) diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 795110d90b..96a5cc491a 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -213,7 +213,9 @@ def test_delete_timeline_exercise_crash_safety_failpoints( # This happens when timeline remains are cleaned up during loading ".*Timeline dir entry become invalid.*", # In one of the branches we poll for tenant to become active. Polls can generate this log message: - f".*Tenant {env.initial_tenant} is not active*", + f".*Tenant {env.initial_tenant} is not active.*", + # an on-demand is cancelled by shutdown + ".*initial size calculation failed: downloading failed, possibly for shutdown", ] ) From d894d2b4501d40a15589093a85ab7b9f98491701 Mon Sep 17 00:00:00 2001 From: Roman Zaynetdinov Date: Mon, 11 Mar 2024 10:10:04 +0200 Subject: [PATCH 0370/1571] Export db size, deadlocks and changed row metrics (#7050) ## Problem We want to report metrics for the oldest user database. --- vm-image-spec.yaml | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index c1b7ad533a..5b93088303 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -142,6 +142,51 @@ files: query: | select datname, state, count(*) as count from pg_stat_activity where state <> '' group by datname, state; + - metric_name: pg_stats_userdb + type: gauge + help: 'Stats for the oldest non-system db' + key_labels: + - datname + value_label: kind + values: + - db_size + - deadlocks + # Rows + - inserted + - updated + - deleted + # We export stats for only one non-system database. Without this limit + # it is too easy to abuse the system by creating lots of databases. + # We can try lifting this limit in the future after we understand the needs better. 
+ query: | + select pg_database_size(datname) as db_size, deadlocks, + tup_inserted as inserted, tup_updated as updated, tup_deleted as deleted, + datname + from pg_stat_database + where datname IN ( + select datname + from pg_database + where datname <> 'postgres' and not datistemplate + order by oid + limit 1 + ); + + - metric_name: max_cluster_size + type: gauge + help: 'neon.max_cluster_size setting' + key_labels: + values: [max_cluster_size] + query: | + select setting::int as max_cluster_size from pg_settings where name = 'neon.max_cluster_size'; + + - metric_name: db_total_size + type: gauge + help: 'Size of all databases' + key_labels: + values: [total] + query: | + select sum(pg_database_size(datname)) as total from pg_database; + build: | # Build cgroup-tools # From cc5d6c66b35ba91020d859e8bf39e92f040d0254 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 11 Mar 2024 08:20:09 +0000 Subject: [PATCH 0371/1571] proxy: categorise new cplane error message (#7057) ## Problem `422 Unprocessable Entity: compute time quota of non-primary branches is exceeded` being marked as a control plane error. ## Summary of changes Add the manual checks to make this a user error that should not be retried. --- proxy/src/console/provider.rs | 13 ++++++++++++- proxy/src/proxy/wake_compute.rs | 6 ++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 0b74cd90cc..8609606273 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -73,7 +73,7 @@ pub mod errors { // Status 406: endpoint is disabled (we don't allow connections). format!("{REQUEST_FAILED}: endpoint is disabled") } - http::StatusCode::LOCKED => { + http::StatusCode::LOCKED | http::StatusCode::UNPROCESSABLE_ENTITY => { // Status 423: project might be in maintenance mode (or bad state), or quotas exceeded. format!("{REQUEST_FAILED}: endpoint is temporary unavailable. check your quotas and/or contact our support") } @@ -91,6 +91,12 @@ pub mod errors { status: http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE, .. } => crate::error::ErrorKind::User, + ApiError::Console { + status: http::StatusCode::UNPROCESSABLE_ENTITY, + text, + } if text.contains("compute time quota of non-primary branches is exceeded") => { + crate::error::ErrorKind::User + } ApiError::Console { status: http::StatusCode::LOCKED, text, @@ -120,6 +126,11 @@ pub mod errors { status: http::StatusCode::BAD_REQUEST, .. } => true, + // don't retry when quotas are exceeded + Self::Console { + status: http::StatusCode::UNPROCESSABLE_ENTITY, + ref text, + } => !text.contains("compute time quota of non-primary branches is exceeded"), // locked can be returned when the endpoint was in transition // or when quotas are exceeded. don't retry when quotas are exceeded Self::Console { diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index 2c593451b4..bfe4b7ec3a 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -69,6 +69,12 @@ fn report_error(e: &WakeComputeError, retry: bool) { { "quota_exceeded" } + WakeComputeError::ApiError(ApiError::Console { + status: StatusCode::UNPROCESSABLE_ENTITY, + ref text, + }) if text.contains("compute time quota of non-primary branches is exceeded") => { + "quota_exceeded" + } WakeComputeError::ApiError(ApiError::Console { status: StatusCode::LOCKED, .. 
From f8483cc4a38a06da2481dee557237298d8dc147b Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 11 Mar 2024 09:32:17 +0000 Subject: [PATCH 0372/1571] pageserver: update swagger for HA APIs (#7070) - The type of heatmap_period in tenant config was wrrong - Secondary download and heatmap upload endpoints weren't in swagger. --- pageserver/src/http/openapi_spec.yml | 55 +++++++++++++++++++++++++++- pageserver/src/tenant/config.rs | 1 + 2 files changed, 55 insertions(+), 1 deletion(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index d924224a32..6a070e2135 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -932,6 +932,59 @@ paths: schema: $ref: "#/components/schemas/ServiceUnavailableError" + /v1/tenant/{tenant_shard_id}/heatmap_upload: + parameters: + - name: tenant_shard_id + in: path + required: true + schema: + type: string + post: + description: | + If the location is in an attached mode, upload the current state to the remote heatmap + responses: + "200": + description: Success + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + "503": + description: Temporarily unavailable, please retry. + content: + application/json: + schema: + $ref: "#/components/schemas/ServiceUnavailableError" + + /v1/tenant/{tenant_shard_id}/secondary/download: + parameters: + - name: tenant_shard_id + in: path + required: true + schema: + type: string + post: + description: | + If the location is in secondary mode, download latest heatmap and layers + responses: + "200": + description: Success + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + "503": + description: Temporarily unavailable, please retry. + content: + application/json: + schema: + $ref: "#/components/schemas/ServiceUnavailableError" + + /v1/tenant/{tenant_id}/timeline/: parameters: @@ -1391,7 +1444,7 @@ components: trace_read_requests: type: boolean heatmap_period: - type: integer + type: string TenantConfigResponse: type: object properties: diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 9464324413..57fc444cdd 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -354,6 +354,7 @@ pub struct TenantConf { /// If non-zero, the period between uploads of a heatmap from attached tenants. This /// may be disabled if a Tenant will not have secondary locations: only secondary /// locations will use the heatmap uploaded by attached locations. + #[serde(with = "humantime_serde")] pub heatmap_period: Duration, /// If true then SLRU segments are dowloaded on demand, if false SLRU segments are included in basebackup From 26ae7b0b3e2e4371d644d9bdfe9baca4dc98418e Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 11 Mar 2024 15:25:53 +0200 Subject: [PATCH 0373/1571] fix(metrics): reset TENANT_STATE metric on startup (#7084) Otherwise, it might happen that we never get to witness the same state on subsequent restarts, thus the time series will show the value from a few restarts ago. The actual case here was that "Activating" was showing `3` while I was doing tenant migration testing on staging. The number 3 was however from a startup that happened some time ago which had been interrupted by another deployment. 
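The underlying rule: a labelled gauge is only exported for label values that have been touched by the current process, so a variant the new process never enters keeps showing its last pre-restart value on dashboards instead of dropping to zero. A minimal sketch of the "pre-initialize every known label value at startup" pattern — assuming the `metrics` facade re-exports the usual prometheus `register_int_gauge_vec!` macro; the metric and state names here are illustrative only:

```rust
use metrics::{register_int_gauge_vec, IntGaugeVec};
use once_cell::sync::Lazy;

static EXAMPLE_TENANT_STATE: Lazy<IntGaugeVec> = Lazy::new(|| {
    register_int_gauge_vec!("example_tenant_states_count", "tenants per state", &["state"])
        .expect("failed to register metric")
});

pub fn preinitialize_metrics() {
    // Touch every variant once so each series starts at 0 in this process,
    // instead of silently carrying whatever the previous instance last exported.
    for state in ["Activating", "Active", "Stopping", "Broken"] {
        EXAMPLE_TENANT_STATE.with_label_values(&[state]).set(0);
    }
}
```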
--- pageserver/src/metrics.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 27e754e999..74e91210fc 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -2676,6 +2676,12 @@ pub fn preinitialize_metrics() { Lazy::force(&crate::tenant::storage_layer::layer::LAYER_IMPL_METRICS); Lazy::force(&disk_usage_based_eviction::METRICS); + for state_name in pageserver_api::models::TenantState::VARIANTS { + // initialize the metric for all gauges, otherwise the time series might seemingly show + // values from last restart. + TENANT_STATE_METRIC.with_label_values(&[state_name]).set(0); + } + // countervecs [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT] .into_iter() From b4972d07d41fce43550dc5ceb63806c3cf7d8f8d Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 11 Mar 2024 14:29:32 +0000 Subject: [PATCH 0374/1571] storage controller: refactor non-mutable members up into Service (#7086) result_tx and compute_hook were in ServiceState (i.e. behind a sync mutex), but didn't need to be. Moving them up into Service removes a bunch of boilerplate clones. While we're here, create a helper `Service::maybe_reconcile_shard` which avoids writing out all the `&self.` arguments to `TenantState::maybe_reconcile` everywhere we call it. --- .../attachment_service/src/service.rs | 149 +++++------------- .../attachment_service/src/tenant_state.rs | 3 +- 2 files changed, 40 insertions(+), 112 deletions(-) diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index 556d6a6828..f3d97c0dfb 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -83,16 +83,10 @@ struct ServiceState { nodes: Arc>, scheduler: Scheduler, - - compute_hook: Arc, - - result_tx: tokio::sync::mpsc::UnboundedSender, } impl ServiceState { fn new( - config: Config, - result_tx: tokio::sync::mpsc::UnboundedSender, nodes: HashMap, tenants: BTreeMap, scheduler: Scheduler, @@ -101,8 +95,6 @@ impl ServiceState { tenants, nodes: Arc::new(nodes), scheduler, - compute_hook: Arc::new(ComputeHook::new(config)), - result_tx, } } @@ -152,6 +144,8 @@ pub struct Service { inner: Arc>, config: Config, persistence: Arc, + compute_hook: Arc, + result_tx: tokio::sync::mpsc::UnboundedSender, // Process shutdown will fire this token cancel: CancellationToken, @@ -481,8 +475,6 @@ impl Service { notifications: Vec<(TenantShardId, NodeId, ShardStripeSize)>, deadline: Instant, ) -> HashSet { - let compute_hook = self.inner.read().unwrap().compute_hook.clone(); - let attempt_shards = notifications.iter().map(|i| i.0).collect::>(); let mut success_shards = HashSet::new(); @@ -490,7 +482,7 @@ impl Service { // in order to subsequently use .buffered() on the stream to execute with bounded parallelism. 
let mut stream = futures::stream::iter(notifications.into_iter()) .map(|(tenant_shard_id, node_id, stripe_size)| { - let compute_hook = compute_hook.clone(); + let compute_hook = self.compute_hook.clone(); let cancel = self.cancel.clone(); async move { if let Err(e) = compute_hook @@ -730,14 +722,12 @@ impl Service { let this = Arc::new(Self { inner: Arc::new(std::sync::RwLock::new(ServiceState::new( - config.clone(), - result_tx, - nodes, - tenants, - scheduler, + nodes, tenants, scheduler, ))), - config, + config: config.clone(), persistence, + compute_hook: Arc::new(ComputeHook::new(config)), + result_tx, startup_complete: startup_complete.clone(), cancel: CancellationToken::new(), gate: Gate::default(), @@ -1145,8 +1135,6 @@ impl Service { let (waiters, response_shards) = { let mut locked = self.inner.write().unwrap(); - let result_tx = locked.result_tx.clone(); - let compute_hook = locked.compute_hook.clone(); let (nodes, tenants, scheduler) = locked.parts_mut(); let mut response_shards = Vec::new(); @@ -1231,17 +1219,7 @@ impl Service { let waiters = tenants .range_mut(TenantShardId::tenant_range(tenant_id)) - .filter_map(|(_shard_id, shard)| { - shard.maybe_reconcile( - result_tx.clone(), - nodes, - &compute_hook, - &self.config, - &self.persistence, - &self.gate, - &self.cancel, - ) - }) + .filter_map(|(_shard_id, shard)| self.maybe_reconcile_shard(shard, nodes)) .collect::>(); (waiters, response_shards) }; @@ -1432,8 +1410,6 @@ impl Service { let mut waiters = Vec::new(); { let mut locked = self.inner.write().unwrap(); - let result_tx = locked.result_tx.clone(); - let compute_hook = locked.compute_hook.clone(); let (nodes, tenants, scheduler) = locked.parts_mut(); for ShardUpdate { @@ -1461,15 +1437,7 @@ impl Service { shard.schedule(scheduler)?; - let maybe_waiter = shard.maybe_reconcile( - result_tx.clone(), - nodes, - &compute_hook, - &self.config, - &self.persistence, - &self.gate, - &self.cancel, - ); + let maybe_waiter = self.maybe_reconcile_shard(shard, nodes); if let Some(waiter) = maybe_waiter { waiters.push(waiter); } @@ -1514,20 +1482,10 @@ impl Service { let waiters = { let mut waiters = Vec::new(); let mut locked = self.inner.write().unwrap(); - let result_tx = locked.result_tx.clone(); - let compute_hook = locked.compute_hook.clone(); let (nodes, tenants, _scheduler) = locked.parts_mut(); for (_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { shard.config = config.clone(); - if let Some(waiter) = shard.maybe_reconcile( - result_tx.clone(), - nodes, - &compute_hook, - &self.config, - &self.persistence, - &self.gate, - &self.cancel, - ) { + if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) { waiters.push(waiter); } } @@ -2159,7 +2117,7 @@ impl Service { } // Validate input, and calculate which shards we will create - let (old_shard_count, targets, compute_hook) = + let (old_shard_count, targets) = { let locked = self.inner.read().unwrap(); @@ -2255,7 +2213,7 @@ impl Service { } } - (old_shard_count, targets, locked.compute_hook.clone()) + (old_shard_count, targets) }; // unwrap safety: we would have returned above if we didn't find at least one shard to split @@ -2451,7 +2409,8 @@ impl Service { // Send compute notifications for all the new shards let mut failed_notifications = Vec::new(); for (child_id, child_ps, stripe_size) in child_locations { - if let Err(e) = compute_hook + if let Err(e) = self + .compute_hook .notify(child_id, child_ps, stripe_size, &self.cancel) .await { @@ -2481,8 +2440,6 @@ impl Service { ) -> 
Result { let waiter = { let mut locked = self.inner.write().unwrap(); - let result_tx = locked.result_tx.clone(); - let compute_hook = locked.compute_hook.clone(); let (nodes, tenants, scheduler) = locked.parts_mut(); let Some(node) = nodes.get(&migrate_req.node_id) else { @@ -2542,15 +2499,7 @@ impl Service { shard.sequence = shard.sequence.next(); } - shard.maybe_reconcile( - result_tx, - nodes, - &compute_hook, - &self.config, - &self.persistence, - &self.gate, - &self.cancel, - ) + self.maybe_reconcile_shard(shard, nodes) }; if let Some(waiter) = waiter { @@ -2814,8 +2763,6 @@ impl Service { } let mut locked = self.inner.write().unwrap(); - let result_tx = locked.result_tx.clone(); - let compute_hook = locked.compute_hook.clone(); let (nodes, tenants, scheduler) = locked.parts_mut(); let mut new_nodes = (**nodes).clone(); @@ -2867,16 +2814,8 @@ impl Service { tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", config_req.node_id); } Ok(()) => { - if tenant_state - .maybe_reconcile( - result_tx.clone(), - &new_nodes, - &compute_hook, - &self.config, - &self.persistence, - &self.gate, - &self.cancel, - ) + if self + .maybe_reconcile_shard(tenant_state, &new_nodes) .is_some() { tenants_affected += 1; @@ -2900,15 +2839,7 @@ impl Service { tenant_state.observed.locations.get_mut(&config_req.node_id) { if observed_loc.conf.is_none() { - tenant_state.maybe_reconcile( - result_tx.clone(), - &new_nodes, - &compute_hook, - &self.config, - &self.persistence, - &self.gate, - &self.cancel, - ); + self.maybe_reconcile_shard(tenant_state, &new_nodes); } } } @@ -2937,22 +2868,12 @@ impl Service { tenant_id: TenantId, ) -> Result, anyhow::Error> { let mut waiters = Vec::new(); - let result_tx = locked.result_tx.clone(); - let compute_hook = locked.compute_hook.clone(); let (nodes, tenants, scheduler) = locked.parts_mut(); for (_tenant_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { shard.schedule(scheduler)?; - if let Some(waiter) = shard.maybe_reconcile( - result_tx.clone(), - nodes, - &compute_hook, - &self.config, - &self.persistence, - &self.gate, - &self.cancel, - ) { + if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) { waiters.push(waiter); } } @@ -2987,28 +2908,34 @@ impl Service { Ok(()) } + /// Convenience wrapper around [`TenantState::maybe_reconcile`] that provides + /// all the references to parts of Self that are needed + fn maybe_reconcile_shard( + &self, + shard: &mut TenantState, + nodes: &Arc>, + ) -> Option { + shard.maybe_reconcile( + &self.result_tx, + nodes, + &self.compute_hook, + &self.config, + &self.persistence, + &self.gate, + &self.cancel, + ) + } + /// Check all tenants for pending reconciliation work, and reconcile those in need /// /// Returns how many reconciliation tasks were started fn reconcile_all(&self) -> usize { let mut locked = self.inner.write().unwrap(); - let result_tx = locked.result_tx.clone(); - let compute_hook = locked.compute_hook.clone(); let pageservers = locked.nodes.clone(); locked .tenants .iter_mut() - .filter_map(|(_tenant_shard_id, shard)| { - shard.maybe_reconcile( - result_tx.clone(), - &pageservers, - &compute_hook, - &self.config, - &self.persistence, - &self.gate, - &self.cancel, - ) - }) + .filter_map(|(_tenant_shard_id, shard)| self.maybe_reconcile_shard(shard, &pageservers)) .count() } diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs index c775736b31..3c91e09ac3 100644 --- 
a/control_plane/attachment_service/src/tenant_state.rs +++ b/control_plane/attachment_service/src/tenant_state.rs @@ -617,7 +617,7 @@ impl TenantState { #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] pub(crate) fn maybe_reconcile( &mut self, - result_tx: tokio::sync::mpsc::UnboundedSender, + result_tx: &tokio::sync::mpsc::UnboundedSender, pageservers: &Arc>, compute_hook: &Arc, service_config: &service::Config, @@ -729,6 +729,7 @@ impl TenantState { tenant_id=%reconciler.tenant_shard_id.tenant_id, shard_id=%reconciler.tenant_shard_id.shard_slug()); metrics::RECONCILER.spawned.inc(); + let result_tx = result_tx.clone(); let join_handle = tokio::task::spawn( async move { // Wait for any previous reconcile task to complete before we start From 2b0f3549f7dad4ed7c62f89fada39f4e2ae33d34 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 11 Mar 2024 15:35:59 +0100 Subject: [PATCH 0375/1571] default to tokio-epoll-uring in CI tests & on Linux (#7077) All of production is using it now as of https://github.com/neondatabase/aws/pull/1121 The change in `flaky_tests.py` resets the flakiness detection logic. The alternative would have been to repeat the choice of io engine in each test name, which would junk up the various test reports too much. --------- Co-authored-by: Alexander Bayandin --- .github/workflows/build_and_test.yml | 4 ++-- pageserver/src/config.rs | 4 ++++ scripts/flaky_tests.py | 10 +++++++--- test_runner/fixtures/parametrize.py | 9 ++++++--- 4 files changed, 19 insertions(+), 8 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 276c71c6e0..810c61de2d 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -474,7 +474,7 @@ jobs: TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} CHECK_ONDISK_DATA_COMPATIBILITY: nonempty BUILD_TAG: ${{ needs.tag.outputs.build-tag }} - PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs + PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring PAGESERVER_GET_VECTORED_IMPL: vectored # Temporary disable this step until we figure out why it's so flaky @@ -554,7 +554,7 @@ jobs: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}" - PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs + PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring # XXX: no coverage data handling here, since benchmarks are run on release builds, # while coverage is currently collected for the debug ones diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 437387164d..4adcedafd1 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -83,6 +83,10 @@ pub mod defaults { pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; + #[cfg(target_os = "linux")] + pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "tokio-epoll-uring"; + + #[cfg(not(target_os = "linux"))] pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs"; pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential"; diff --git a/scripts/flaky_tests.py b/scripts/flaky_tests.py index 61a97f520d..4464f09c29 100755 --- a/scripts/flaky_tests.py +++ b/scripts/flaky_tests.py @@ -15,7 +15,8 @@ FLAKY_TESTS_QUERY = """ DISTINCT parent_suite, suite, name FROM results WHERE - started_at > CURRENT_DATE - INTERVAL '%s' day + started_at > CURRENT_DATE - INTERVAL '10' day + AND started_at > '2024-03-11 
11:32:12.874+00' -- TODO(update the date in a separate PR): we switched the default PAGESERVER_VIRTUAL_FILE_IO_ENGINE to `tokio-epoll-uring` from `std-fs` on this date, we want to ignore the flaky tests for `std-fs` AND ( (status IN ('failed', 'broken') AND reference = 'refs/heads/main') OR flaky @@ -46,11 +47,14 @@ def main(args: argparse.Namespace): logging.error("cannot fetch flaky tests from the DB due to an error", exc) rows = [] - # If a test run has non-default PAGESERVER_VIRTUAL_FILE_IO_ENGINE (i.e. not empty, not std-fs), + # If a test run has non-default PAGESERVER_VIRTUAL_FILE_IO_ENGINE (i.e. not empty, not tokio-epoll-uring), # use it to parametrize test name along with build_type and pg_version # # See test_runner/fixtures/parametrize.py for details - if (io_engine := os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in ("", "std-fs"): + if (io_engine := os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in ( + "", + "tokio-epoll-uring", + ): pageserver_virtual_file_io_engine_parameter = f"-{io_engine}" else: pageserver_virtual_file_io_engine_parameter = "" diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py index 57ca1932b0..b28da83508 100644 --- a/test_runner/fixtures/parametrize.py +++ b/test_runner/fixtures/parametrize.py @@ -46,9 +46,12 @@ def pytest_generate_tests(metafunc: Metafunc): metafunc.parametrize("pg_version", pg_versions, ids=map(lambda v: f"pg{v}", pg_versions)) - # A hacky way to parametrize tests only for `pageserver_virtual_file_io_engine=tokio-epoll-uring` - # And do not change test name for default `pageserver_virtual_file_io_engine=std-fs` to keep tests statistics - if (io_engine := os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in ("", "std-fs"): + # A hacky way to parametrize tests only for `pageserver_virtual_file_io_engine=std-fs` + # And do not change test name for default `pageserver_virtual_file_io_engine=tokio-epoll-uring` to keep tests statistics + if (io_engine := os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in ( + "", + "tokio-epoll-uring", + ): metafunc.parametrize("pageserver_virtual_file_io_engine", [io_engine]) # For performance tests, parametrize also by platform From 8224580f3e0517a9d5792d2ddae275c0e26377d6 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 11 Mar 2024 15:41:41 +0100 Subject: [PATCH 0376/1571] fix(tenant/timeline metrics): race condition during shutdown + recreation (#7064) Tenant::shutdown or Timeline::shutdown completes and becomes externally observable before the corresponding Tenant/Timeline object is dropped. For example, after observing a Tenant::shutdown to complete, we could attach the same tenant_id again. The shut down Tenant object might still be around at the time of the attach. The race is then the following: - old object's metrics are still around - new object uses with_label_values - old object calls remove_label_values The outcome is that the new object will have the metric objects (they're an Arc internall) but the metrics won't be part of the internal registry and hence they'll be missing in `/metrics`. Later, when the new object gets shut down and tries to remove_label_value, it will observe an error because the metric was already removed by the old object. Changes ------- This PR moves metric removal to `shutdown()`. 
An alternative design would be to multi-version the metrics using a distinguishing label, or, to use a better metrics crate that allows removing metrics from the registry through the locally held metric handle instead of interacting with the (globally shared) registry. refs https://github.com/neondatabase/neon/pull/7051 --- pageserver/src/metrics.rs | 4 +--- pageserver/src/tenant.rs | 7 ++----- pageserver/src/tenant/timeline.rs | 2 ++ 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 74e91210fc..814b3e1f96 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -2017,10 +2017,8 @@ impl TimelineMetrics { pub(crate) fn resident_physical_size_get(&self) -> u64 { self.resident_physical_size_gauge.get() } -} -impl Drop for TimelineMetrics { - fn drop(&mut self) { + pub(crate) fn shutdown(&self) { let tenant_id = &self.tenant_id; let timeline_id = &self.timeline_id; let shard_id = &self.shard_id; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 4f4654422b..961995b2d6 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1846,6 +1846,8 @@ impl Tenant { // Wait for any in-flight operations to complete self.gate.close().await; + remove_tenant_metrics(&self.tenant_shard_id); + Ok(()) } @@ -3557,11 +3559,6 @@ async fn run_initdb( Ok(()) } -impl Drop for Tenant { - fn drop(&mut self) { - remove_tenant_metrics(&self.tenant_shard_id); - } -} /// Dump contents of a layer file to stdout. pub async fn dump_layerfile_from_path( path: &Utf8Path, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 7004db1cb5..c017d30f45 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1257,6 +1257,8 @@ impl Timeline { // Finally wait until any gate-holders are complete self.gate.close().await; + + self.metrics.shutdown(); } pub(crate) fn set_state(&self, new_state: TimelineState) { From 8c5b3100904ac24a102fd086c076790d2c688e39 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 11 Mar 2024 17:54:06 +0200 Subject: [PATCH 0377/1571] fix: Layer delete on drop and eviction can outlive timeline shutdown (#7082) This is a follow-up to #7051 where `LayerInner::drop` and `LayerInner::evict_blocking` were not noticed to require a gate before the file deletion. The lack of entering a gate opens up a similar possibility of deleting a layer file which a newer Timeline instance has already checked out to be resident in a similar case as #7051. --- pageserver/src/tenant/storage_layer/layer.rs | 54 ++++++++++++-------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index aabb13b15c..959065bc4c 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -536,6 +536,18 @@ impl Drop for LayerInner { // carry this until we are finished for [`Layer::wait_drop`] support let _status = status; + let Some(timeline) = timeline.upgrade() else { + // no need to nag that timeline is gone: under normal situation on + // task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped. 
+ LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone); + return; + }; + + let Ok(_guard) = timeline.gate.enter() else { + LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone); + return; + }; + let removed = match std::fs::remove_file(path) { Ok(()) => true, Err(e) if e.kind() == std::io::ErrorKind::NotFound => { @@ -554,32 +566,26 @@ impl Drop for LayerInner { } }; - if let Some(timeline) = timeline.upgrade() { - if removed { - timeline.metrics.resident_physical_size_sub(file_size); - } - if let Some(remote_client) = timeline.remote_client.as_ref() { - let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, meta)]); + if removed { + timeline.metrics.resident_physical_size_sub(file_size); + } + if let Some(remote_client) = timeline.remote_client.as_ref() { + let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, meta)]); - if let Err(e) = res { - // test_timeline_deletion_with_files_stuck_in_upload_queue is good at - // demonstrating this deadlock (without spawn_blocking): stop will drop - // queued items, which will have ResidentLayer's, and those drops would try - // to re-entrantly lock the RemoteTimelineClient inner state. - if !timeline.is_active() { - tracing::info!("scheduling deletion on drop failed: {e:#}"); - } else { - tracing::warn!("scheduling deletion on drop failed: {e:#}"); - } - LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed); + if let Err(e) = res { + // test_timeline_deletion_with_files_stuck_in_upload_queue is good at + // demonstrating this deadlock (without spawn_blocking): stop will drop + // queued items, which will have ResidentLayer's, and those drops would try + // to re-entrantly lock the RemoteTimelineClient inner state. + if !timeline.is_active() { + tracing::info!("scheduling deletion on drop failed: {e:#}"); } else { - LAYER_IMPL_METRICS.inc_completed_deletes(); + tracing::warn!("scheduling deletion on drop failed: {e:#}"); } + LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed); + } else { + LAYER_IMPL_METRICS.inc_completed_deletes(); } - } else { - // no need to nag that timeline is gone: under normal situation on - // task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped. - LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone); } }); } @@ -1095,6 +1101,10 @@ impl LayerInner { return Err(EvictionCancelled::TimelineGone); }; + let Ok(_gate) = timeline.gate.enter() else { + return Err(EvictionCancelled::TimelineGone); + }; + // to avoid starting a new download while we evict, keep holding on to the // permit. 
let _permit = { From 17a3c9036e4da341d9f1ca05316eefb3e7575232 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 11 Mar 2024 17:36:49 +0100 Subject: [PATCH 0378/1571] follow-up(#7077): adjust flaky-test-detection cutoff date for tokio-epoll-uring (#7090) Co-authored-by: Alexander Bayandin --- scripts/flaky_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/flaky_tests.py b/scripts/flaky_tests.py index 4464f09c29..853c67d218 100755 --- a/scripts/flaky_tests.py +++ b/scripts/flaky_tests.py @@ -16,7 +16,7 @@ FLAKY_TESTS_QUERY = """ FROM results WHERE started_at > CURRENT_DATE - INTERVAL '10' day - AND started_at > '2024-03-11 11:32:12.874+00' -- TODO(update the date in a separate PR): we switched the default PAGESERVER_VIRTUAL_FILE_IO_ENGINE to `tokio-epoll-uring` from `std-fs` on this date, we want to ignore the flaky tests for `std-fs` + AND started_at > '2024-03-11 14:50:11.845+00' -- we switched the default PAGESERVER_VIRTUAL_FILE_IO_ENGINE to `tokio-epoll-uring` from `std-fs` on this date, we want to ignore the flaky tests for `std-fs` AND ( (status IN ('failed', 'broken') AND reference = 'refs/heads/main') OR flaky From 73a8c97ac8280cefd103871b7e20bce3aae35635 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Mon, 11 Mar 2024 13:49:58 -0400 Subject: [PATCH 0379/1571] fix: warnings when compiling neon extensions (#7053) proceeding https://github.com/neondatabase/neon/pull/7010, close https://github.com/neondatabase/neon/issues/6188 ## Summary of changes This pull request (should) fix all warnings except `-Wdeclaration-after-statement` in the neon extension compilation. --------- Signed-off-by: Alex Chi Z --- pgxn/hnsw/hnsw.c | 2 +- pgxn/neon/extension_server.c | 1 - pgxn/neon/neon.c | 4 ---- pgxn/neon/pagestore_smgr.c | 1 - pgxn/neon/walproposer_pg.c | 3 +-- pgxn/neon_test_utils/neontest.c | 2 -- pgxn/neon_walredo/walredoproc.c | 3 +++ 7 files changed, 5 insertions(+), 11 deletions(-) diff --git a/pgxn/hnsw/hnsw.c b/pgxn/hnsw/hnsw.c index 45bf78ed3b..e624cb831f 100644 --- a/pgxn/hnsw/hnsw.c +++ b/pgxn/hnsw/hnsw.c @@ -149,7 +149,7 @@ hnsw_check_available_memory(Size requested) struct sysinfo si; Size total; if (sysinfo(&si) < 0) - elog(ERROR, "Failed to get amount of RAM: %n"); + elog(ERROR, "Failed to get amount of RAM: %m"); total = si.totalram*si.mem_unit; if ((Size)NBuffers*BLCKSZ + requested >= total) diff --git a/pgxn/neon/extension_server.c b/pgxn/neon/extension_server.c index 1329e2d17b..e38af08f89 100644 --- a/pgxn/neon/extension_server.c +++ b/pgxn/neon/extension_server.c @@ -38,7 +38,6 @@ neon_download_extension_file_http(const char *filename, bool is_library) CURLcode res; char *compute_ctl_url; - char *postdata; bool ret = false; if (handle == NULL) diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 1f456d9a3f..6ede78a576 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -95,7 +95,6 @@ get_num_snap_files_lsn_threshold(void) DIR *dirdesc; struct dirent *de; char *snap_path = "pg_logical/snapshots/"; - int cnt = 0; int lsns_allocated = 1024; int lsns_num = 0; XLogRecPtr *lsns; @@ -161,9 +160,6 @@ get_num_snap_files_lsn_threshold(void) PGDLLEXPORT void LogicalSlotsMonitorMain(Datum main_arg) { - TimestampTz now, - last_checked; - /* Establish signal handlers. 
*/ pqsignal(SIGUSR1, procsignal_sigusr1_handler); pqsignal(SIGHUP, SignalHandlerForConfigReload); diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 213e396328..0256de2b9a 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -1888,7 +1888,6 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum, int nblocks, bool skipFsync) { const PGAlignedBlock buffer = {0}; - BlockNumber curblocknum = blocknum; int remblocks = nblocks; XLogRecPtr lsn = 0; diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 7f07913fa6..cf76a495b5 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -1026,7 +1026,7 @@ static void StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd) { XLogRecPtr FlushPtr; - TimeLineID currTLI; + __attribute__((unused)) TimeLineID currTLI; #if PG_VERSION_NUM < 150000 if (ThisTimeLineID == 0) @@ -1230,7 +1230,6 @@ WalProposerRecovery(WalProposer *wp, Safekeeper *sk) TimeLineID timeline; XLogRecPtr startpos; XLogRecPtr endpos; - uint64 download_range_mb; startpos = GetLogRepRestartLSN(wp); if (startpos == InvalidXLogRecPtr) diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c index 7c618848e2..82ce5be9f6 100644 --- a/pgxn/neon_test_utils/neontest.c +++ b/pgxn/neon_test_utils/neontest.c @@ -182,8 +182,6 @@ test_consume_memory(PG_FUNCTION_ARGS) Datum test_release_memory(PG_FUNCTION_ARGS) { - TimestampTz start; - if (PG_ARGISNULL(0)) { if (consume_cxt) diff --git a/pgxn/neon_walredo/walredoproc.c b/pgxn/neon_walredo/walredoproc.c index 1fdd3801c6..c4ab22636b 100644 --- a/pgxn/neon_walredo/walredoproc.c +++ b/pgxn/neon_walredo/walredoproc.c @@ -220,6 +220,9 @@ enter_seccomp_mode(void) } #endif /* HAVE_LIBSECCOMP */ +PGDLLEXPORT void +WalRedoMain(int argc, char *argv[]); + /* * Entry point for the WAL redo process. * From 98723844ee86fb3392fd59d7a9f60545257cee03 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Mon, 11 Mar 2024 10:36:39 -0800 Subject: [PATCH 0380/1571] Don't return from inside PG_TRY (#7095) ## Problem Returning from PG_TRY is a bug, and we currently do that ## Summary of changes Make it break and then return false. 
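For background (an illustrative sketch, not part of the diff below): PG_END_TRY() is what restores PG_exception_stack and error_context_stack, so a return from inside PG_TRY() skips that restore and leaves them pointing at a jump buffer in a stack frame that no longer exists; a later elog(ERROR) would then longjmp through stale state. A simplified sketch of the pattern the fix uses (set a flag, break out, return only after PG_END_TRY()), with hypothetical helpers poll_step() and handshake_done():

#include "postgres.h"

static bool
try_connect_once(void)
{
	MemoryContext oldcontext = CurrentMemoryContext;
	bool		failed = false;

	PG_TRY();
	{
		for (;;)
		{
			if (!poll_step())			/* hypothetical helper */
			{
				failed = true;
				break;					/* leave the loop; do NOT "return false" here */
			}
			if (handshake_done())		/* hypothetical helper */
				break;
		}
	}
	PG_CATCH();
	{
		/* swallow the error and report failure to the caller */
		MemoryContextSwitchTo(oldcontext);
		FlushErrorState();
		failed = true;
	}
	PG_END_TRY();						/* restores PG_exception_stack / error_context_stack */

	/* returning here, after PG_END_TRY(), is safe */
	return !failed;
}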
This should also help stabilize test_bad_connection.py --- pgxn/neon/libpagestore.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index a3543bca78..e31de3c6b5 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -316,6 +316,7 @@ pageserver_connect(shardno_t shard_no, int elevel) static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC; TimestampTz now; uint64_t us_since_last_connect; + bool broke_from_loop = false; Assert(page_servers[shard_no].conn == NULL); @@ -418,7 +419,9 @@ pageserver_connect(shardno_t shard_no, int elevel) neon_shard_log(shard_no, elevel, "could not complete handshake with pageserver: %s", msg); - return false; + /* Returning from inside PG_TRY is bad, so we break/return later */ + broke_from_loop = true; + break; } } } @@ -431,6 +434,11 @@ pageserver_connect(shardno_t shard_no, int elevel) } PG_END_TRY(); + if (broke_from_loop) + { + return false; + } + neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s'", connstr); page_servers[shard_no].conn = conn; page_servers[shard_no].wes = wes; From 0cf0731d8bd2dc55187697a4f3b4b523c7e927e1 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 11 Mar 2024 12:19:15 +0300 Subject: [PATCH 0381/1571] SIGQUIT instead of SIGKILL prewarmed postgres. To avoid orphaned processes using wiped datadir with confusing logging. --- compute_tools/src/compute.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 96ab4a06a5..0fa315682d 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -17,6 +17,7 @@ use chrono::{DateTime, Utc}; use futures::future::join_all; use futures::stream::FuturesUnordered; use futures::StreamExt; +use nix::unistd::Pid; use postgres::error::SqlState; use postgres::{Client, NoTls}; use tracing::{debug, error, info, instrument, warn}; @@ -722,8 +723,12 @@ impl ComputeNode { // Stop it when it's ready info!("waiting for postgres"); wait_for_postgres(&mut pg, Path::new(pgdata))?; - pg.kill()?; - info!("sent kill signal"); + // SIGQUIT orders postgres to exit immediately. We don't want to SIGKILL + // it to avoid orphaned processes prowling around while datadir is + // wiped. + let pm_pid = Pid::from_raw(pg.id() as i32); + kill(pm_pid, Signal::SIGQUIT)?; + info!("sent SIGQUIT signal"); pg.wait()?; info!("done prewarming"); From 74d09b78c740039bb0c86752bf6858b3a37c6c9c Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 9 Feb 2024 22:01:20 +0200 Subject: [PATCH 0382/1571] Keep walproposer alive until shutdown checkpoint is safe on safekepeers The walproposer pretends to be a walsender in many ways. It has a WalSnd slot, it claims to be a walsender by calling MarkPostmasterChildWalSender() etc. But one different to real walsenders was that the postmaster still treated it as a bgworker rather than a walsender. The difference is that at shutdown, walsenders are not killed until the very end, after the checkpointer process has written the shutdown checkpoint and exited. As a result, the walproposer always got killed before the shutdown checkpoint was written, so the shutdown checkpoint never made it to safekeepers. That's fine in principle, we don't require a clean shutdown after all. But it also feels a bit silly not to stream the shutdown checkpoint. It could be useful for initializing hot standby mode in a read replica, for example. 
Change postmaster to treat background workers that have called MarkPostmasterChildWalSender() as walsenders. That unfortunately requires another small change in postgres core. After doing that, walproposers stay alive longer. However, it also means that the checkpointer will wait for the walproposer to switch to WALSNDSTATE_STOPPING state, when the checkpointer sends the PROCSIG_WALSND_INIT_STOPPING signal. We don't have the machinery in walproposer to receive and handle that signal reliably. Instead, we mark walproposer as being in WALSNDSTATE_STOPPING always. In commit 568f91420a, I assumed that shutdown will wait for all the remaining WAL to be streamed to safekeepers, but before this commit that was not true, and the test became flaky. This should make it stable again. Some tests wrongly assumed that no WAL could have been written between pg_current_wal_flush_lsn and quick pg stop after it. Fix them by introducing flush_ep_to_pageserver which first stops the endpoint and then waits till all committed WAL reaches the pageserver. In passing extract safekeeper http client to its own module. --- libs/walproposer/src/api_bindings.rs | 4 +- libs/walproposer/src/walproposer.rs | 2 +- pgxn/neon/walproposer.c | 23 +- pgxn/neon/walproposer.h | 6 +- pgxn/neon/walproposer_pg.c | 102 ++++++- .../tests/walproposer_sim/walproposer_api.rs | 15 +- test_runner/fixtures/neon_fixtures.py | 277 ++++-------------- test_runner/fixtures/safekeeper/__init__.py | 0 test_runner/fixtures/safekeeper/http.py | 227 ++++++++++++++ test_runner/fixtures/safekeeper/utils.py | 11 + test_runner/regress/test_layer_eviction.py | 14 +- .../regress/test_layers_from_future.py | 5 +- test_runner/regress/test_ondemand_download.py | 3 +- test_runner/regress/test_wal_acceptor.py | 39 ++- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 7 +- 18 files changed, 460 insertions(+), 281 deletions(-) create mode 100644 test_runner/fixtures/safekeeper/__init__.py create mode 100644 test_runner/fixtures/safekeeper/http.py create mode 100644 test_runner/fixtures/safekeeper/utils.py diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs index 8317e2fa03..f5ed6ebb97 100644 --- a/libs/walproposer/src/api_bindings.rs +++ b/libs/walproposer/src/api_bindings.rs @@ -324,11 +324,11 @@ extern "C" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) { } } -extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, commit_lsn: XLogRecPtr) { +extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer) { unsafe { let callback_data = (*(*wp).config).callback_data; let api = callback_data as *mut Box; - (*api).process_safekeeper_feedback(&mut (*wp), commit_lsn) + (*api).process_safekeeper_feedback(&mut (*wp)) } } diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs index 13fade220c..734967da3f 100644 --- a/libs/walproposer/src/walproposer.rs +++ b/libs/walproposer/src/walproposer.rs @@ -142,7 +142,7 @@ pub trait ApiImpl { todo!() } - fn process_safekeeper_feedback(&self, _wp: &mut WalProposer, _commit_lsn: u64) { + fn process_safekeeper_feedback(&mut self, _wp: &mut WalProposer) { todo!() } diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 10487636ae..9ff0493352 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -1220,7 +1220,7 @@ PrepareAppendRequest(WalProposer *wp, AppendRequestHeader *req, XLogRecPtr begin req->epochStartLsn = wp->propEpochStartLsn; req->beginLsn = 
beginLsn; req->endLsn = endLsn; - req->commitLsn = GetAcknowledgedByQuorumWALPosition(wp); + req->commitLsn = wp->commitLsn; req->truncateLsn = wp->truncateLsn; req->proposerId = wp->greetRequest.proposerId; } @@ -1405,7 +1405,7 @@ static bool RecvAppendResponses(Safekeeper *sk) { WalProposer *wp = sk->wp; - XLogRecPtr minQuorumLsn; + XLogRecPtr newCommitLsn; bool readAnything = false; while (true) @@ -1444,18 +1444,19 @@ RecvAppendResponses(Safekeeper *sk) if (!readAnything) return sk->state == SS_ACTIVE; - HandleSafekeeperResponse(wp); - + /* update commit_lsn */ + newCommitLsn = GetAcknowledgedByQuorumWALPosition(wp); /* - * Also send the new commit lsn to all the safekeepers. + * Send the new value to all safekeepers. */ - minQuorumLsn = GetAcknowledgedByQuorumWALPosition(wp); - if (minQuorumLsn > wp->lastSentCommitLsn) + if (newCommitLsn > wp->commitLsn) { + wp->commitLsn = newCommitLsn; BroadcastAppendRequest(wp); - wp->lastSentCommitLsn = minQuorumLsn; } + HandleSafekeeperResponse(wp); + return sk->state == SS_ACTIVE; } @@ -1632,11 +1633,9 @@ GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn) static void HandleSafekeeperResponse(WalProposer *wp) { - XLogRecPtr minQuorumLsn; XLogRecPtr candidateTruncateLsn; - minQuorumLsn = GetAcknowledgedByQuorumWALPosition(wp); - wp->api.process_safekeeper_feedback(wp, minQuorumLsn); + wp->api.process_safekeeper_feedback(wp); /* * Try to advance truncateLsn -- the last record flushed to all @@ -1649,7 +1648,7 @@ HandleSafekeeperResponse(WalProposer *wp) * can't commit entries from previous term' in Raft); 2) */ candidateTruncateLsn = CalculateMinFlushLsn(wp); - candidateTruncateLsn = Min(candidateTruncateLsn, minQuorumLsn); + candidateTruncateLsn = Min(candidateTruncateLsn, wp->commitLsn); if (candidateTruncateLsn > wp->truncateLsn) { wp->truncateLsn = candidateTruncateLsn; diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 53820f6e1b..bc674fd979 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -564,7 +564,7 @@ typedef struct walproposer_api * backpressure feedback and to confirm WAL persistence (has been commited * on the quorum of safekeepers). */ - void (*process_safekeeper_feedback) (WalProposer *wp, XLogRecPtr commitLsn); + void (*process_safekeeper_feedback) (WalProposer *wp); /* * Write a log message to the internal log processor. 
This is used only @@ -646,8 +646,8 @@ typedef struct WalProposer /* WAL has been generated up to this point */ XLogRecPtr availableLsn; - /* last commitLsn broadcasted to safekeepers */ - XLogRecPtr lastSentCommitLsn; + /* cached GetAcknowledgedByQuorumWALPosition result */ + XLogRecPtr commitLsn; ProposerGreeting greetRequest; diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index cf76a495b5..8eec2f02c1 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -68,6 +68,8 @@ static WalproposerShmemState *walprop_shared; static WalProposerConfig walprop_config; static XLogRecPtr sentPtr = InvalidXLogRecPtr; static const walproposer_api walprop_pg; +static volatile sig_atomic_t got_SIGUSR2 = false; +static bool reported_sigusr2 = false; static void nwp_shmem_startup_hook(void); static void nwp_register_gucs(void); @@ -101,6 +103,8 @@ static void add_nwr_event_set(Safekeeper *sk, uint32 events); static void update_nwr_event_set(Safekeeper *sk, uint32 events); static void rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk); +static void CheckGracefulShutdown(WalProposer *wp); + static XLogRecPtr GetLogRepRestartLSN(WalProposer *wp); static void @@ -492,6 +496,24 @@ walprop_pg_init_standalone_sync_safekeepers(void) BackgroundWorkerUnblockSignals(); } +/* + * We pretend to be a walsender process, and the lifecycle of a walsender is + * slightly different than other procesess. At shutdown, walsender processes + * stay alive until the very end, after the checkpointer has written the + * shutdown checkpoint. When the checkpointer exits, the postmaster sends all + * remaining walsender processes SIGUSR2. On receiving SIGUSR2, we try to send + * the remaining WAL, and then exit. This ensures that the checkpoint record + * reaches durable storage (in safekeepers), before the server shuts down + * completely. + */ +static void +walprop_sigusr2(SIGNAL_ARGS) +{ + got_SIGUSR2 = true; + + SetLatch(MyLatch); +} + static void walprop_pg_init_bgworker(void) { @@ -503,6 +525,7 @@ walprop_pg_init_bgworker(void) pqsignal(SIGUSR1, procsignal_sigusr1_handler); pqsignal(SIGHUP, SignalHandlerForConfigReload); pqsignal(SIGTERM, die); + pqsignal(SIGUSR2, walprop_sigusr2); BackgroundWorkerUnblockSignals(); @@ -1075,14 +1098,26 @@ StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd) #endif /* - * When we first start replication the standby will be behind the primary. - * For some applications, for example synchronous replication, it is - * important to have a clear state for this initial catchup mode, so we - * can trigger actions when we change streaming state later. We may stay - * in this state for a long time, which is exactly why we want to be able - * to monitor whether or not we are still here. + * XXX: Move straight to STOPPING state, skipping the STREAMING state. + * + * This is a bit weird. Normal walsenders stay in STREAMING state, until + * the checkpointer signals them that it is about to start writing the + * shutdown checkpoint. The walsenders acknowledge that they have received + * that signal by switching to STOPPING state. That tells the walsenders + * that they must not write any new WAL. + * + * However, we cannot easily intercept that signal from the checkpointer. + * It's sent by WalSndInitStopping(), using + * SendProcSignal(PROCSIGNAL_WALSND_INIT_STOPPING). It's received by + * HandleWalSndInitStopping, which sets a process-local got_STOPPING flag. + * However, that's all private to walsender.c. 
+ * + * We don't need to do anything special upon receiving the signal, the + * walproposer doesn't write any WAL anyway, so we skip the STREAMING + * state and go directly to STOPPING mode. That way, the checkpointer + * won't wait for us. */ - WalSndSetState(WALSNDSTATE_CATCHUP); + WalSndSetState(WALSNDSTATE_STOPPING); /* * Don't allow a request to stream from a future point in WAL that hasn't @@ -1122,6 +1157,8 @@ StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd) static void WalSndLoop(WalProposer *wp) { + XLogRecPtr flushPtr; + /* Clear any already-pending wakeups */ ResetLatch(MyLatch); @@ -1130,9 +1167,6 @@ WalSndLoop(WalProposer *wp) CHECK_FOR_INTERRUPTS(); XLogBroadcastWalProposer(wp); - - if (MyWalSnd->state == WALSNDSTATE_CATCHUP) - WalSndSetState(WALSNDSTATE_STREAMING); WalProposerPoll(wp); } } @@ -1744,6 +1778,9 @@ walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32 { ConditionVariableCancelSleep(); ResetLatch(MyLatch); + + CheckGracefulShutdown(wp); + *events = WL_LATCH_SET; return 1; } @@ -1797,6 +1834,41 @@ walprop_pg_finish_sync_safekeepers(WalProposer *wp, XLogRecPtr lsn) exit(0); } +/* + * Like vanilla walsender, on sigusr2 send all remaining WAL and exit. + * + * Note that unlike sync-safekeepers waiting here is not reliable: we + * don't check that majority of safekeepers received and persisted + * commit_lsn -- only that walproposer reached it (which immediately + * broadcasts new value). Doing that without incurring redundant control + * file syncing would need wp -> sk protocol change. OTOH unlike + * sync-safekeepers which must bump commit_lsn or basebackup will fail, + * this catchup is important only for tests where safekeepers/network + * don't crash on their own. + */ +static void +CheckGracefulShutdown(WalProposer *wp) +{ + if (got_SIGUSR2) + { + if (!reported_sigusr2) + { + XLogRecPtr flushPtr = walprop_pg_get_flush_rec_ptr(wp); + + wpg_log(LOG, "walproposer will send and wait for remaining WAL between %X/%X and %X/%X", + LSN_FORMAT_ARGS(wp->commitLsn), LSN_FORMAT_ARGS(flushPtr)); + reported_sigusr2 = true; + } + + if (wp->commitLsn >= walprop_pg_get_flush_rec_ptr(wp)) + { + wpg_log(LOG, "walproposer sent all WAL up to %X/%X, exiting", + LSN_FORMAT_ARGS(wp->commitLsn)); + proc_exit(0); + } + } +} + /* * Choose most advanced PageserverFeedback and set it to *rf. */ @@ -1877,7 +1949,7 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp) * None of that is functional in sync-safekeepers. */ static void -walprop_pg_process_safekeeper_feedback(WalProposer *wp, XLogRecPtr commitLsn) +walprop_pg_process_safekeeper_feedback(WalProposer *wp) { HotStandbyFeedback hsFeedback; XLogRecPtr oldDiskConsistentLsn; @@ -1892,10 +1964,10 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, XLogRecPtr commitLsn) replication_feedback_set(&quorumFeedback.rf); SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize); - if (commitLsn > quorumFeedback.flushLsn || oldDiskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn) + if (wp->commitLsn > quorumFeedback.flushLsn || oldDiskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn) { - if (commitLsn > quorumFeedback.flushLsn) - quorumFeedback.flushLsn = commitLsn; + if (wp->commitLsn > quorumFeedback.flushLsn) + quorumFeedback.flushLsn = wp->commitLsn; /* * Advance the replication slot to commitLsn. 
WAL before it is @@ -1928,6 +2000,8 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, XLogRecPtr commitLsn) XidFromFullTransactionId(hsFeedback.catalog_xmin), EpochFromFullTransactionId(hsFeedback.catalog_xmin)); } + + CheckGracefulShutdown(wp); } static XLogRecPtr diff --git a/safekeeper/tests/walproposer_sim/walproposer_api.rs b/safekeeper/tests/walproposer_sim/walproposer_api.rs index 746cac019e..5c79e9082b 100644 --- a/safekeeper/tests/walproposer_sim/walproposer_api.rs +++ b/safekeeper/tests/walproposer_sim/walproposer_api.rs @@ -196,6 +196,7 @@ pub struct SimulationApi { safekeepers: RefCell>, disk: Arc, redo_start_lsn: Option, + last_logged_commit_lsn: u64, shmem: UnsafeCell, config: Config, event_set: RefCell>, @@ -228,6 +229,7 @@ impl SimulationApi { safekeepers: RefCell::new(sk_conns), disk: args.disk, redo_start_lsn: args.redo_start_lsn, + last_logged_commit_lsn: 0, shmem: UnsafeCell::new(walproposer::bindings::WalproposerShmemState { mutex: 0, feedback: PageserverFeedback { @@ -596,14 +598,11 @@ impl ApiImpl for SimulationApi { } } - fn process_safekeeper_feedback( - &self, - wp: &mut walproposer::bindings::WalProposer, - commit_lsn: u64, - ) { - debug!("process_safekeeper_feedback, commit_lsn={}", commit_lsn); - if commit_lsn > wp.lastSentCommitLsn { - self.os.log_event(format!("commit_lsn;{}", commit_lsn)); + fn process_safekeeper_feedback(&mut self, wp: &mut walproposer::bindings::WalProposer) { + debug!("process_safekeeper_feedback, commit_lsn={}", wp.commitLsn); + if wp.commitLsn > self.last_logged_commit_lsn { + self.os.log_event(format!("commit_lsn;{}", wp.commitLsn)); + self.last_logged_commit_lsn = wp.commitLsn; } } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b933d391ab..018de975dc 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -15,11 +15,11 @@ import threading import time import uuid from contextlib import closing, contextmanager -from dataclasses import dataclass, field +from dataclasses import dataclass from datetime import datetime from enum import Enum from fcntl import LOCK_EX, LOCK_UN, flock -from functools import cached_property +from functools import cached_property, partial from itertools import chain, product from pathlib import Path from types import TracebackType @@ -70,6 +70,8 @@ from fixtures.remote_storage import ( default_remote_storage, remote_storage_to_toml_inline_table, ) +from fixtures.safekeeper.http import SafekeeperHttpClient +from fixtures.safekeeper.utils import are_walreceivers_absent from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.utils import ( ATTACHMENT_NAME_REGEX, @@ -2547,6 +2549,20 @@ class PgBin: ) return base_path + def get_pg_controldata_checkpoint_lsn(self, pgdata: str) -> Lsn: + """ + Run pg_controldata on given datadir and extract checkpoint lsn. + """ + + pg_controldata_path = os.path.join(self.pg_bin_path, "pg_controldata") + cmd = f"{pg_controldata_path} -D {pgdata}" + result = subprocess.run(cmd, capture_output=True, text=True, shell=True) + checkpoint_lsn = re.findall( + "Latest checkpoint location:\\s+([0-9A-F]+/[0-9A-F]+)", result.stdout + )[0] + log.info(f"last checkpoint at {checkpoint_lsn}") + return Lsn(checkpoint_lsn) + @pytest.fixture(scope="function") def pg_bin(test_output_dir: Path, pg_distrib_dir: Path, pg_version: PgVersion) -> PgBin: @@ -3565,220 +3581,6 @@ class Safekeeper: return segments -# Walreceiver as returned by sk's timeline status endpoint. 
-@dataclass -class Walreceiver: - conn_id: int - state: str - - -@dataclass -class SafekeeperTimelineStatus: - acceptor_epoch: int - pg_version: int # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2 - flush_lsn: Lsn - commit_lsn: Lsn - timeline_start_lsn: Lsn - backup_lsn: Lsn - peer_horizon_lsn: Lsn - remote_consistent_lsn: Lsn - walreceivers: List[Walreceiver] - - -@dataclass -class SafekeeperMetrics: - # These are metrics from Prometheus which uses float64 internally. - # As a consequence, values may differ from real original int64s. - flush_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict) - commit_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict) - - -class SafekeeperHttpClient(requests.Session): - HTTPError = requests.HTTPError - - def __init__(self, port: int, auth_token: Optional[str] = None, is_testing_enabled=False): - super().__init__() - self.port = port - self.auth_token = auth_token - self.is_testing_enabled = is_testing_enabled - - if auth_token is not None: - self.headers["Authorization"] = f"Bearer {auth_token}" - - def check_status(self): - self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() - - def is_testing_enabled_or_skip(self): - if not self.is_testing_enabled: - pytest.skip("safekeeper was built without 'testing' feature") - - def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]): - self.is_testing_enabled_or_skip() - - if isinstance(config_strings, tuple): - pairs = [config_strings] - else: - pairs = config_strings - - log.info(f"Requesting config failpoints: {repr(pairs)}") - - res = self.put( - f"http://localhost:{self.port}/v1/failpoints", - json=[{"name": name, "actions": actions} for name, actions in pairs], - ) - log.info(f"Got failpoints request response code {res.status_code}") - res.raise_for_status() - res_json = res.json() - assert res_json is None - return res_json - - def debug_dump(self, params: Optional[Dict[str, str]] = None) -> Dict[str, Any]: - params = params or {} - res = self.get(f"http://localhost:{self.port}/v1/debug_dump", params=params) - res.raise_for_status() - res_json = json.loads(res.text) - assert isinstance(res_json, dict) - return res_json - - def patch_control_file( - self, - tenant_id: TenantId, - timeline_id: TimelineId, - patch: Dict[str, Any], - ) -> Dict[str, Any]: - res = self.patch( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/control_file", - json={ - "updates": patch, - "apply_fields": list(patch.keys()), - }, - ) - res.raise_for_status() - res_json = res.json() - assert isinstance(res_json, dict) - return res_json - - def pull_timeline(self, body: Dict[str, Any]) -> Dict[str, Any]: - res = self.post(f"http://localhost:{self.port}/v1/pull_timeline", json=body) - res.raise_for_status() - res_json = res.json() - assert isinstance(res_json, dict) - return res_json - - def copy_timeline(self, tenant_id: TenantId, timeline_id: TimelineId, body: Dict[str, Any]): - res = self.post( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/copy", - json=body, - ) - res.raise_for_status() - - def timeline_digest( - self, tenant_id: TenantId, timeline_id: TimelineId, from_lsn: Lsn, until_lsn: Lsn - ) -> Dict[str, Any]: - res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/digest", - params={ - "from_lsn": str(from_lsn), - "until_lsn": str(until_lsn), - }, - ) - res.raise_for_status() - 
res_json = res.json() - assert isinstance(res_json, dict) - return res_json - - def timeline_create( - self, - tenant_id: TenantId, - timeline_id: TimelineId, - pg_version: int, # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2 - commit_lsn: Lsn, - ): - body = { - "tenant_id": str(tenant_id), - "timeline_id": str(timeline_id), - "pg_version": pg_version, - "commit_lsn": str(commit_lsn), - } - res = self.post(f"http://localhost:{self.port}/v1/tenant/timeline", json=body) - res.raise_for_status() - - def timeline_status( - self, tenant_id: TenantId, timeline_id: TimelineId - ) -> SafekeeperTimelineStatus: - res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}") - res.raise_for_status() - resj = res.json() - walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]] - return SafekeeperTimelineStatus( - acceptor_epoch=resj["acceptor_state"]["epoch"], - pg_version=resj["pg_info"]["pg_version"], - flush_lsn=Lsn(resj["flush_lsn"]), - commit_lsn=Lsn(resj["commit_lsn"]), - timeline_start_lsn=Lsn(resj["timeline_start_lsn"]), - backup_lsn=Lsn(resj["backup_lsn"]), - peer_horizon_lsn=Lsn(resj["peer_horizon_lsn"]), - remote_consistent_lsn=Lsn(resj["remote_consistent_lsn"]), - walreceivers=walreceivers, - ) - - def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body): - res = self.post( - f"http://localhost:{self.port}/v1/record_safekeeper_info/{tenant_id}/{timeline_id}", - json=body, - ) - res.raise_for_status() - - # only_local doesn't remove segments in the remote storage. - def timeline_delete( - self, tenant_id: TenantId, timeline_id: TimelineId, only_local: bool = False - ) -> Dict[Any, Any]: - res = self.delete( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}", - params={ - "only_local": str(only_local).lower(), - }, - ) - res.raise_for_status() - res_json = res.json() - assert isinstance(res_json, dict) - return res_json - - def tenant_delete_force(self, tenant_id: TenantId) -> Dict[Any, Any]: - res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") - res.raise_for_status() - res_json = res.json() - assert isinstance(res_json, dict) - return res_json - - def get_metrics_str(self) -> str: - request_result = self.get(f"http://localhost:{self.port}/metrics") - request_result.raise_for_status() - return request_result.text - - def get_metrics(self) -> SafekeeperMetrics: - all_metrics_text = self.get_metrics_str() - - metrics = SafekeeperMetrics() - for match in re.finditer( - r'^safekeeper_flush_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$', - all_metrics_text, - re.MULTILINE, - ): - metrics.flush_lsn_inexact[(TenantId(match.group(1)), TimelineId(match.group(2)))] = int( - match.group(3) - ) - for match in re.finditer( - r'^safekeeper_commit_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$', - all_metrics_text, - re.MULTILINE, - ): - metrics.commit_lsn_inexact[ - (TenantId(match.group(1)), TimelineId(match.group(2))) - ] = int(match.group(3)) - return metrics - - class S3Scrubber: def __init__(self, env: NeonEnvBuilder, log_dir: Optional[Path] = None): self.env = env @@ -4262,6 +4064,49 @@ def wait_for_last_flush_lsn( return min(results) +def flush_ep_to_pageserver( + env: NeonEnv, + ep: Endpoint, + tenant: TenantId, + timeline: TimelineId, + pageserver_id: Optional[int] = None, +) -> Lsn: + """ + Stop endpoint and wait until all committed WAL reaches the pageserver + (last_record_lsn). 
This is for use by tests which want everything written so + far to reach pageserver *and* expecting that no more data will arrive until + endpoint starts again, so unlike wait_for_last_flush_lsn it polls + safekeepers instead of compute to learn LSN. + + Returns the catch up LSN. + """ + ep.stop() + + commit_lsn: Lsn = Lsn(0) + # In principle in the absense of failures polling single sk would be enough. + for sk in env.safekeepers: + cli = sk.http_client() + # wait until compute connections are gone + wait_until(30, 0.5, partial(are_walreceivers_absent, cli, tenant, timeline)) + commit_lsn = max(cli.get_commit_lsn(tenant, timeline), commit_lsn) + + # Note: depending on WAL filtering implementation, probably most shards + # won't be able to reach commit_lsn (unless gaps are also ack'ed), so this + # is broken in sharded case. + shards = tenant_get_shards(env, tenant, pageserver_id) + for tenant_shard_id, pageserver in shards: + log.info( + f"flush_ep_to_pageserver: waiting for {commit_lsn} on shard {tenant_shard_id} on pageserver {pageserver.id})" + ) + waited = wait_for_last_record_lsn( + pageserver.http_client(), tenant_shard_id, timeline, commit_lsn + ) + + assert waited >= commit_lsn + + return commit_lsn + + def wait_for_wal_insert_lsn( env: NeonEnv, endpoint: Endpoint, diff --git a/test_runner/fixtures/safekeeper/__init__.py b/test_runner/fixtures/safekeeper/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py new file mode 100644 index 0000000000..b9c1986818 --- /dev/null +++ b/test_runner/fixtures/safekeeper/http.py @@ -0,0 +1,227 @@ +import json +import re +from dataclasses import dataclass, field +from typing import Any, Dict, List, Optional, Tuple, Union + +import pytest +import requests + +from fixtures.log_helper import log +from fixtures.types import Lsn, TenantId, TimelineId + + +# Walreceiver as returned by sk's timeline status endpoint. +@dataclass +class Walreceiver: + conn_id: int + state: str + + +@dataclass +class SafekeeperTimelineStatus: + acceptor_epoch: int + pg_version: int # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2 + flush_lsn: Lsn + commit_lsn: Lsn + timeline_start_lsn: Lsn + backup_lsn: Lsn + peer_horizon_lsn: Lsn + remote_consistent_lsn: Lsn + walreceivers: List[Walreceiver] + + +@dataclass +class SafekeeperMetrics: + # These are metrics from Prometheus which uses float64 internally. + # As a consequence, values may differ from real original int64s. 
+ flush_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict) + commit_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict) + + +class SafekeeperHttpClient(requests.Session): + HTTPError = requests.HTTPError + + def __init__(self, port: int, auth_token: Optional[str] = None, is_testing_enabled=False): + super().__init__() + self.port = port + self.auth_token = auth_token + self.is_testing_enabled = is_testing_enabled + + if auth_token is not None: + self.headers["Authorization"] = f"Bearer {auth_token}" + + def check_status(self): + self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() + + def is_testing_enabled_or_skip(self): + if not self.is_testing_enabled: + pytest.skip("safekeeper was built without 'testing' feature") + + def configure_failpoints(self, config_strings: Union[Tuple[str, str], List[Tuple[str, str]]]): + self.is_testing_enabled_or_skip() + + if isinstance(config_strings, tuple): + pairs = [config_strings] + else: + pairs = config_strings + + log.info(f"Requesting config failpoints: {repr(pairs)}") + + res = self.put( + f"http://localhost:{self.port}/v1/failpoints", + json=[{"name": name, "actions": actions} for name, actions in pairs], + ) + log.info(f"Got failpoints request response code {res.status_code}") + res.raise_for_status() + res_json = res.json() + assert res_json is None + return res_json + + def debug_dump(self, params: Optional[Dict[str, str]] = None) -> Dict[str, Any]: + params = params or {} + res = self.get(f"http://localhost:{self.port}/v1/debug_dump", params=params) + res.raise_for_status() + res_json = json.loads(res.text) + assert isinstance(res_json, dict) + return res_json + + def patch_control_file( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + patch: Dict[str, Any], + ) -> Dict[str, Any]: + res = self.patch( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/control_file", + json={ + "updates": patch, + "apply_fields": list(patch.keys()), + }, + ) + res.raise_for_status() + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + + def pull_timeline(self, body: Dict[str, Any]) -> Dict[str, Any]: + res = self.post(f"http://localhost:{self.port}/v1/pull_timeline", json=body) + res.raise_for_status() + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + + def copy_timeline(self, tenant_id: TenantId, timeline_id: TimelineId, body: Dict[str, Any]): + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/copy", + json=body, + ) + res.raise_for_status() + + def timeline_digest( + self, tenant_id: TenantId, timeline_id: TimelineId, from_lsn: Lsn, until_lsn: Lsn + ) -> Dict[str, Any]: + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/digest", + params={ + "from_lsn": str(from_lsn), + "until_lsn": str(until_lsn), + }, + ) + res.raise_for_status() + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + + def timeline_create( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + pg_version: int, # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2 + commit_lsn: Lsn, + ): + body = { + "tenant_id": str(tenant_id), + "timeline_id": str(timeline_id), + "pg_version": pg_version, + "commit_lsn": str(commit_lsn), + } + res = self.post(f"http://localhost:{self.port}/v1/tenant/timeline", json=body) + res.raise_for_status() + + def timeline_status( + self, 
tenant_id: TenantId, timeline_id: TimelineId + ) -> SafekeeperTimelineStatus: + res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}") + res.raise_for_status() + resj = res.json() + walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]] + return SafekeeperTimelineStatus( + acceptor_epoch=resj["acceptor_state"]["epoch"], + pg_version=resj["pg_info"]["pg_version"], + flush_lsn=Lsn(resj["flush_lsn"]), + commit_lsn=Lsn(resj["commit_lsn"]), + timeline_start_lsn=Lsn(resj["timeline_start_lsn"]), + backup_lsn=Lsn(resj["backup_lsn"]), + peer_horizon_lsn=Lsn(resj["peer_horizon_lsn"]), + remote_consistent_lsn=Lsn(resj["remote_consistent_lsn"]), + walreceivers=walreceivers, + ) + + def get_commit_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn: + return self.timeline_status(tenant_id, timeline_id).commit_lsn + + def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body): + res = self.post( + f"http://localhost:{self.port}/v1/record_safekeeper_info/{tenant_id}/{timeline_id}", + json=body, + ) + res.raise_for_status() + + # only_local doesn't remove segments in the remote storage. + def timeline_delete( + self, tenant_id: TenantId, timeline_id: TimelineId, only_local: bool = False + ) -> Dict[Any, Any]: + res = self.delete( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}", + params={ + "only_local": str(only_local).lower(), + }, + ) + res.raise_for_status() + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + + def tenant_delete_force(self, tenant_id: TenantId) -> Dict[Any, Any]: + res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") + res.raise_for_status() + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + + def get_metrics_str(self) -> str: + request_result = self.get(f"http://localhost:{self.port}/metrics") + request_result.raise_for_status() + return request_result.text + + def get_metrics(self) -> SafekeeperMetrics: + all_metrics_text = self.get_metrics_str() + + metrics = SafekeeperMetrics() + for match in re.finditer( + r'^safekeeper_flush_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$', + all_metrics_text, + re.MULTILINE, + ): + metrics.flush_lsn_inexact[(TenantId(match.group(1)), TimelineId(match.group(2)))] = int( + match.group(3) + ) + for match in re.finditer( + r'^safekeeper_commit_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$', + all_metrics_text, + re.MULTILINE, + ): + metrics.commit_lsn_inexact[ + (TenantId(match.group(1)), TimelineId(match.group(2))) + ] = int(match.group(3)) + return metrics diff --git a/test_runner/fixtures/safekeeper/utils.py b/test_runner/fixtures/safekeeper/utils.py new file mode 100644 index 0000000000..2818a493d6 --- /dev/null +++ b/test_runner/fixtures/safekeeper/utils.py @@ -0,0 +1,11 @@ +from fixtures.log_helper import log +from fixtures.safekeeper.http import SafekeeperHttpClient +from fixtures.types import TenantId, TimelineId + + +def are_walreceivers_absent( + sk_http_cli: SafekeeperHttpClient, tenant_id: TenantId, timeline_id: TimelineId +): + status = sk_http_cli.timeline_status(tenant_id, timeline_id) + log.info(f"waiting for walreceivers to be gone, currently {status.walreceivers}") + return len(status.walreceivers) == 0 diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py index efba2033fb..7bbc0cc160 100644 --- a/test_runner/regress/test_layer_eviction.py 
+++ b/test_runner/regress/test_layer_eviction.py @@ -4,12 +4,11 @@ import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, + flush_ep_to_pageserver, wait_for_last_flush_lsn, ) -from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload +from fixtures.pageserver.utils import wait_for_upload from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import Lsn -from fixtures.utils import query_scalar # Crates a few layers, ensures that we can evict them (removing locally but keeping track of them anyway) @@ -46,14 +45,15 @@ def test_basic_eviction( FROM generate_series(1, 5000000) g """ ) - current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) - wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn) + # stops the endpoint + current_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id) + client.timeline_checkpoint(tenant_id, timeline_id) wait_for_upload(client, tenant_id, timeline_id, current_lsn) - # disable compute & sks to avoid on-demand downloads by walreceiver / getpage - endpoint.stop() + # stop sks to avoid on-demand downloads by walreceiver / getpage; endpoint + # has already been stopped by flush_ep_to_pageserver for sk in env.safekeepers: sk.stop() diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index 9da47b9fd3..abdebb6d79 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -1,7 +1,7 @@ import time from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver from fixtures.pageserver.types import ( DeltaLayerFileName, ImageLayerFileName, @@ -115,8 +115,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): ) == 0 ) - - endpoint.stop() + last_record_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id) wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id) diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 8bbf50373e..914f068afb 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -8,6 +8,7 @@ from typing import Any, DefaultDict, Dict, Tuple from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, + flush_ep_to_pageserver, last_flush_lsn_upload, wait_for_last_flush_lsn, ) @@ -517,7 +518,7 @@ def test_compaction_downloads_on_demand_without_image_creation(neon_env_builder: with endpoint.cursor() as cur: cur.execute("update a set id = -id") - wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) + flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id) pageserver_http.timeline_checkpoint(tenant_id, timeline_id) layers = pageserver_http.layer_map_info(tenant_id, timeline_id) diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 3d7bba6153..2cac58dc1a 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -28,7 +28,6 @@ from fixtures.neon_fixtures import ( PgBin, PgProtocol, Safekeeper, - SafekeeperHttpClient, SafekeeperPort, last_flush_lsn_upload, ) @@ -46,6 +45,8 @@ from fixtures.remote_storage import ( default_remote_storage, s3_storage, ) +from fixtures.safekeeper.http import SafekeeperHttpClient +from fixtures.safekeeper.utils import 
are_walreceivers_absent from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import get_dir_size, query_scalar, start_in_background @@ -1097,12 +1098,6 @@ def is_flush_lsn_aligned(sk_http_clis, tenant_id, timeline_id): return all([flush_lsns[0] == flsn for flsn in flush_lsns]) -def are_walreceivers_absent(sk_http_cli, tenant_id: TenantId, timeline_id: TimelineId): - status = sk_http_cli.timeline_status(tenant_id, timeline_id) - log.info(f"waiting for walreceivers to be gone, currently {status.walreceivers}") - return len(status.walreceivers) == 0 - - # Assert by xxd that WAL on given safekeepers is identical. No compute must be # running for this to be reliable. def cmp_sk_wal(sks: List[Safekeeper], tenant_id: TenantId, timeline_id: TimelineId): @@ -1347,6 +1342,36 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder): endpoint.safe_psql("insert into t select generate_series(1,100), 'payload'") +# Test that when compute is terminated in fast (or smart) mode, walproposer is +# allowed to run and self terminate after shutdown checkpoint is written, so it +# commits it to safekeepers before exiting. This not required for correctness, +# but needed for tests using check_restored_datadir_content. +def test_wp_graceful_shutdown(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): + neon_env_builder.num_safekeepers = 1 + env = neon_env_builder.init_start() + + tenant_id = env.initial_tenant + timeline_id = env.neon_cli.create_branch("test_wp_graceful_shutdown") + ep = env.endpoints.create_start("test_wp_graceful_shutdown") + ep.safe_psql("create table t(key int, value text)") + ep.stop() + + # figure out checkpoint lsn + ckpt_lsn = pg_bin.get_pg_controldata_checkpoint_lsn(ep.pg_data_dir_path()) + + sk_http_cli = env.safekeepers[0].http_client() + commit_lsn = sk_http_cli.timeline_status(tenant_id, timeline_id).commit_lsn + # Note: this is in memory value. Graceful shutdown of walproposer currently + # doesn't guarantee persisted value, which is ok as we need it only for + # tests. Persisting it without risking too many cf flushes needs a wp -> sk + # protocol change. 
(though in reality shutdown sync-safekeepers does flush + # of cf, so most of the time persisted value wouldn't lag) + log.info(f"sk commit_lsn {commit_lsn}") + # note that ckpt_lsn is the *beginning* of checkpoint record, so commit_lsn + # must be actually higher + assert commit_lsn > ckpt_lsn, "safekeeper must have checkpoint record" + + class SafekeeperEnv: def __init__( self, diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index f49a962b9b..b980d6f090 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit f49a962b9b3715d6f47017d1dcf905c36f93ae5e +Subproject commit b980d6f090c676e55fb2c830fb2434f532f635c0 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index e8b9a28006..56f32c0e73 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit e8b9a28006a550d7ca7cbb9bd0238eb9cd57bbd8 +Subproject commit 56f32c0e7330d17aaeee8bf211a73995180bd133 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 072697b225..9007894722 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 072697b2250da3251af75887b577104554b9cd44 +Subproject commit 90078947229aa7f9ac5f7ed4527b2c7386d5332b diff --git a/vendor/revisions.json b/vendor/revisions.json index 1529d87bcb..1941c235ee 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,6 +1,5 @@ { - "postgres-v16": "072697b2250da3251af75887b577104554b9cd44", - "postgres-v15": "e8b9a28006a550d7ca7cbb9bd0238eb9cd57bbd8", - "postgres-v14": "f49a962b9b3715d6f47017d1dcf905c36f93ae5e" + "postgres-v16": "90078947229aa7f9ac5f7ed4527b2c7386d5332b", + "postgres-v15": "56f32c0e7330d17aaeee8bf211a73995180bd133", + "postgres-v14": "b980d6f090c676e55fb2c830fb2434f532f635c0" } - From 621ea2ec4465a76a60c1c77e947b31e5a0812dfb Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 7 Feb 2024 19:58:08 +0200 Subject: [PATCH 0383/1571] tests: try to make restored-datadir comparison tests not flaky v2 This test occasionally fails with a difference in "pg_xact/0000" file between the local and restored datadirs. My hypothesis is that something changed in the database between the last explicit checkpoint and the shutdown. I suspect autovacuum, it could certainly create transactions. To fix, be more precise about the point in time that we compare. Shut down the endpoint first, then read the last LSN (i.e. the shutdown checkpoint's LSN), from the local disk with pg_controldata. And use exactly that LSN in the basebackup. Closes #559 --- test_runner/fixtures/neon_fixtures.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 018de975dc..584d5fea48 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3890,24 +3890,21 @@ def list_files_to_compare(pgdata_dir: Path) -> List[str]: # pg is the existing and running compute node, that we want to compare with a basebackup def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint: Endpoint): + pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version) + # Get the timeline ID. 
We need it for the 'basebackup' command timeline_id = TimelineId(endpoint.safe_psql("SHOW neon.timeline_id")[0][0]) - # many tests already checkpoint, but do it just in case - with closing(endpoint.connect()) as conn: - with conn.cursor() as cur: - cur.execute("CHECKPOINT") - - # wait for pageserver to catch up - wait_for_last_flush_lsn(env, endpoint, endpoint.tenant_id, timeline_id) # stop postgres to ensure that files won't change endpoint.stop() + # Read the shutdown checkpoint's LSN + checkpoint_lsn = pg_bin.get_pg_controldata_checkpoint_lsn(endpoint.pg_data_dir_path()) + # Take a basebackup from pageserver restored_dir_path = env.repo_dir / f"{endpoint.endpoint_id}_restored_datadir" restored_dir_path.mkdir(exist_ok=True) - pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version) psql_path = os.path.join(pg_bin.pg_bin_path, "psql") pageserver_id = env.attachment_service.locate(endpoint.tenant_id)[0]["node_id"] @@ -3915,7 +3912,7 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint {psql_path} \ --no-psqlrc \ postgres://localhost:{env.get_pageserver(pageserver_id).service_port.pg} \ - -c 'basebackup {endpoint.tenant_id} {timeline_id}' \ + -c 'basebackup {endpoint.tenant_id} {timeline_id} {checkpoint_lsn}' \ | tar -x -C {restored_dir_path} """ From 89cf714890237862eb3fd52f473e4dbe15cd6e4a Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 12 Mar 2024 11:36:27 +0000 Subject: [PATCH 0384/1571] tests/neon_local: rename "attachment service" -> "storage controller" (#7087) Not a user-facing change, but can break any existing `.neon` directories created by neon_local, as the name of the database used by the storage controller changes. This PR changes all the locations apart from the path of `control_plane/attachment_service` (waiting for an opportune moment to do that one, because it's the most conflict-ish wrt ongoing PRs like #6676 ) --- Makefile | 2 +- control_plane/attachment_service/src/http.rs | 2 +- control_plane/attachment_service/src/main.rs | 6 - .../attachment_service/src/persistence.rs | 4 +- .../attachment_service/src/service.rs | 4 +- control_plane/src/bin/neon_local.rs | 86 +++++----- control_plane/src/endpoint.rs | 10 +- control_plane/src/lib.rs | 2 +- control_plane/src/local_env.rs | 12 +- control_plane/src/pageserver.rs | 8 +- ...hment_service.rs => storage_controller.rs} | 38 ++--- docs/authentication.md | 4 +- libs/pageserver_api/src/controller_api.rs | 2 - test_runner/fixtures/neon_fixtures.py | 108 ++++++------ .../fixtures/pageserver/many_tenants.py | 2 +- .../interactive/test_many_small_tenants.py | 2 +- .../pagebench/test_large_slru_basebackup.py | 2 +- ...er_max_throughput_getpage_at_latest_lsn.py | 2 +- test_runner/performance/test_bulk_insert.py | 4 +- .../regress/test_attach_tenant_config.py | 2 +- test_runner/regress/test_change_pageserver.py | 8 +- test_runner/regress/test_compatibility.py | 2 +- .../regress/test_layers_from_future.py | 2 +- test_runner/regress/test_neon_cli.py | 4 +- test_runner/regress/test_pageserver_api.py | 2 +- .../regress/test_pageserver_generations.py | 14 +- .../regress/test_pageserver_secondary.py | 10 +- test_runner/regress/test_remote_storage.py | 4 +- test_runner/regress/test_s3_restore.py | 4 +- test_runner/regress/test_sharding.py | 30 ++-- test_runner/regress/test_sharding_service.py | 156 +++++++++--------- test_runner/regress/test_timeline_size.py | 4 +- 32 files changed, 267 insertions(+), 275 deletions(-) rename control_plane/src/{attachment_service.rs => storage_controller.rs} (94%) 
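The restored-datadir fix in the previous patch hinges on `PgBin.get_pg_controldata_checkpoint_lsn()`, which is only called, not defined, in the hunks shown above. As a rough sketch of what such a helper can look like (the signature and parsing below are illustrative assumptions, not the fixture's actual code), it boils down to running `pg_controldata` against the stopped data directory and pulling out the "Latest checkpoint location" field:

import re
import subprocess
from pathlib import Path


def get_pg_controldata_checkpoint_lsn(pg_bin_path: Path, pgdata: Path) -> str:
    # Sketch only: run pg_controldata on a cleanly shut-down data directory and
    # return the "Latest checkpoint location" LSN, e.g. "0/16B3748".
    out = subprocess.run(
        [str(pg_bin_path / "pg_controldata"), str(pgdata)],
        check=True,
        capture_output=True,
        text=True,
    ).stdout
    m = re.search(r"^Latest checkpoint location:\s+([0-9A-F]+/[0-9A-F]+)", out, re.MULTILINE)
    if m is None:
        raise RuntimeError("pg_controldata output had no 'Latest checkpoint location' line")
    return m.group(1)

The returned string (presumably wrapped in the fixtures' `Lsn` type) is what gets interpolated into the `basebackup {tenant_id} {timeline_id} {checkpoint_lsn}` command in `check_restored_datadir_content`, so the restored datadir is compared at exactly the shutdown checkpoint rather than at whatever LSN autovacuum may have advanced to afterwards.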
diff --git a/Makefile b/Makefile index ea782cb369..f13f080f1a 100644 --- a/Makefile +++ b/Makefile @@ -51,7 +51,7 @@ CARGO_BUILD_FLAGS += $(filter -j1,$(MAKEFLAGS)) CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+) # Force cargo not to print progress bar CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1 -# Set PQ_LIB_DIR to make sure `attachment_service` get linked with bundled libpq (through diesel) +# Set PQ_LIB_DIR to make sure `storage_controller` get linked with bundled libpq (through diesel) CARGO_CMD_PREFIX += PQ_LIB_DIR=$(POSTGRES_INSTALL_DIR)/v16/lib # diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs index 7e4030b221..27ba5bdb65 100644 --- a/control_plane/attachment_service/src/http.rs +++ b/control_plane/attachment_service/src/http.rs @@ -30,7 +30,7 @@ use pageserver_api::controller_api::{ }; use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest}; -use control_plane::attachment_service::{AttachHookRequest, InspectRequest}; +use control_plane::storage_controller::{AttachHookRequest, InspectRequest}; /// State available to HTTP request handlers #[derive(Clone)] diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs index d9acbc0abd..333c3911e3 100644 --- a/control_plane/attachment_service/src/main.rs +++ b/control_plane/attachment_service/src/main.rs @@ -1,9 +1,3 @@ -/// The attachment service mimics the aspects of the control plane API -/// that are required for a pageserver to operate. -/// -/// This enables running & testing pageservers without a full-blown -/// deployment of the Neon cloud platform. -/// use anyhow::{anyhow, Context}; use attachment_service::http::make_router; use attachment_service::metrics::preinitialize_metrics; diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs index d5c6d74ebe..aa08945834 100644 --- a/control_plane/attachment_service/src/persistence.rs +++ b/control_plane/attachment_service/src/persistence.rs @@ -20,7 +20,7 @@ use crate::node::Node; /// ## What do we store? /// -/// The attachment service does not store most of its state durably. +/// The storage controller service does not store most of its state durably. /// /// The essential things to store durably are: /// - generation numbers, as these must always advance monotonically to ensure data safety. @@ -34,7 +34,7 @@ use crate::node::Node; /// /// ## Performance/efficiency /// -/// The attachment service does not go via the database for most things: there are +/// The storage controller service does not go via the database for most things: there are /// a couple of places where we must, and where efficiency matters: /// - Incrementing generation numbers: the Reconciler has to wait for this to complete /// before it can attach a tenant, so this acts as a bound on how fast things like diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index f3d97c0dfb..3f245b5255 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -8,7 +8,7 @@ use std::{ }; use anyhow::Context; -use control_plane::attachment_service::{ +use control_plane::storage_controller::{ AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse, }; use diesel::result::DatabaseErrorKind; @@ -839,7 +839,7 @@ impl Service { tenant_state.generation = Some(new_generation); } else { // This is a detach notification. 
We must update placement policy to avoid re-attaching - // during background scheduling/reconciliation, or during attachment service restart. + // during background scheduling/reconciliation, or during storage controller restart. assert!(attach_req.node_id.is_none()); tenant_state.policy = PlacementPolicy::Detached; } diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 27abcb182a..86b9c0085d 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -8,11 +8,11 @@ use anyhow::{anyhow, bail, Context, Result}; use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum}; use compute_api::spec::ComputeMode; -use control_plane::attachment_service::AttachmentService; use control_plane::endpoint::ComputeControlPlane; use control_plane::local_env::{InitForceMode, LocalEnv}; use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR}; use control_plane::safekeeper::SafekeeperNode; +use control_plane::storage_controller::StorageController; use control_plane::{broker, local_env}; use pageserver_api::controller_api::{ NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, PlacementPolicy, @@ -138,7 +138,7 @@ fn main() -> Result<()> { "start" => rt.block_on(handle_start_all(sub_args, &env)), "stop" => rt.block_on(handle_stop_all(sub_args, &env)), "pageserver" => rt.block_on(handle_pageserver(sub_args, &env)), - "attachment_service" => rt.block_on(handle_attachment_service(sub_args, &env)), + "storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)), "safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)), "endpoint" => rt.block_on(handle_endpoint(sub_args, &env)), "mappings" => handle_mappings(sub_args, &mut env), @@ -445,14 +445,14 @@ async fn handle_tenant( // If tenant ID was not specified, generate one let tenant_id = parse_tenant_id(create_match)?.unwrap_or_else(TenantId::generate); - // We must register the tenant with the attachment service, so + // We must register the tenant with the storage controller, so // that when the pageserver restarts, it will be re-attached. - let attachment_service = AttachmentService::from_env(env); - attachment_service + let storage_controller = StorageController::from_env(env); + storage_controller .tenant_create(TenantCreateRequest { // Note that ::unsharded here isn't actually because the tenant is unsharded, its because the - // attachment service expecfs a shard-naive tenant_id in this attribute, and the TenantCreateRequest - // type is used both in attachment service (for creating tenants) and in pageserver (for creating shards) + // storage controller expecfs a shard-naive tenant_id in this attribute, and the TenantCreateRequest + // type is used both in storage controller (for creating tenants) and in pageserver (for creating shards) new_tenant_id: TenantShardId::unsharded(tenant_id), generation: None, shard_parameters: ShardParameters { @@ -476,9 +476,9 @@ async fn handle_tenant( .context("Failed to parse postgres version from the argument string")?; // FIXME: passing None for ancestor_start_lsn is not kosher in a sharded world: we can't have - // different shards picking different start lsns. Maybe we have to teach attachment service + // different shards picking different start lsns. Maybe we have to teach storage controller // to let shard 0 branch first and then propagate the chosen LSN to other shards. 
- attachment_service + storage_controller .tenant_timeline_create( tenant_id, TimelineCreateRequest { @@ -528,8 +528,8 @@ async fn handle_tenant( let new_pageserver = get_pageserver(env, matches)?; let new_pageserver_id = new_pageserver.conf.id; - let attachment_service = AttachmentService::from_env(env); - attachment_service + let storage_controller = StorageController::from_env(env); + storage_controller .tenant_migrate(tenant_shard_id, new_pageserver_id) .await?; @@ -543,8 +543,8 @@ async fn handle_tenant( let mut tenant_synthetic_size = None; - let attachment_service = AttachmentService::from_env(env); - for shard in attachment_service.tenant_locate(tenant_id).await?.shards { + let storage_controller = StorageController::from_env(env); + for shard in storage_controller.tenant_locate(tenant_id).await?.shards { let pageserver = PageServerNode::from_env(env, env.get_pageserver_conf(shard.node_id)?); @@ -586,8 +586,8 @@ async fn handle_tenant( let tenant_id = get_tenant_id(matches, env)?; let shard_count: u8 = matches.get_one::("shard-count").cloned().unwrap_or(0); - let attachment_service = AttachmentService::from_env(env); - let result = attachment_service + let storage_controller = StorageController::from_env(env); + let result = storage_controller .tenant_split(tenant_id, shard_count) .await?; println!( @@ -613,7 +613,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local match timeline_match.subcommand() { Some(("list", list_match)) => { - // TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the attachment service + // TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the storage controller // where shard 0 is attached, and query there. let tenant_shard_id = get_tenant_shard_id(list_match, env)?; let timelines = pageserver.timeline_list(&tenant_shard_id).await?; @@ -633,7 +633,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local let new_timeline_id_opt = parse_timeline_id(create_match)?; let new_timeline_id = new_timeline_id_opt.unwrap_or(TimelineId::generate()); - let attachment_service = AttachmentService::from_env(env); + let storage_controller = StorageController::from_env(env); let create_req = TimelineCreateRequest { new_timeline_id, ancestor_timeline_id: None, @@ -641,7 +641,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local ancestor_start_lsn: None, pg_version: Some(pg_version), }; - let timeline_info = attachment_service + let timeline_info = storage_controller .tenant_timeline_create(tenant_id, create_req) .await?; @@ -730,7 +730,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local .transpose() .context("Failed to parse ancestor start Lsn from the request")?; let new_timeline_id = TimelineId::generate(); - let attachment_service = AttachmentService::from_env(env); + let storage_controller = StorageController::from_env(env); let create_req = TimelineCreateRequest { new_timeline_id, ancestor_timeline_id: Some(ancestor_timeline_id), @@ -738,7 +738,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local ancestor_start_lsn: start_lsn, pg_version: None, }; - let timeline_info = attachment_service + let timeline_info = storage_controller .tenant_timeline_create(tenant_id, create_req) .await?; @@ -767,7 +767,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re match sub_name { "list" => { - // TODO(sharding): this command 
shouldn't have to specify a shard ID: we should ask the attachment service + // TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the storage controller // where shard 0 is attached, and query there. let tenant_shard_id = get_tenant_shard_id(sub_args, env)?; let timeline_infos = get_timeline_infos(env, &tenant_shard_id) @@ -952,21 +952,21 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re ( vec![(parsed.0, parsed.1.unwrap_or(5432))], // If caller is telling us what pageserver to use, this is not a tenant which is - // full managed by attachment service, therefore not sharded. + // full managed by storage controller, therefore not sharded. ShardParameters::DEFAULT_STRIPE_SIZE, ) } else { // Look up the currently attached location of the tenant, and its striping metadata, // to pass these on to postgres. - let attachment_service = AttachmentService::from_env(env); - let locate_result = attachment_service.tenant_locate(endpoint.tenant_id).await?; + let storage_controller = StorageController::from_env(env); + let locate_result = storage_controller.tenant_locate(endpoint.tenant_id).await?; let pageservers = locate_result .shards .into_iter() .map(|shard| { ( Host::parse(&shard.listen_pg_addr) - .expect("Attachment service reported bad hostname"), + .expect("Storage controller reported bad hostname"), shard.listen_pg_port, ) }) @@ -1015,8 +1015,8 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re pageserver.pg_connection_config.port(), )] } else { - let attachment_service = AttachmentService::from_env(env); - attachment_service + let storage_controller = StorageController::from_env(env); + storage_controller .tenant_locate(endpoint.tenant_id) .await? .shards @@ -1024,7 +1024,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re .map(|shard| { ( Host::parse(&shard.listen_pg_addr) - .expect("Attachment service reported malformed host"), + .expect("Storage controller reported malformed host"), shard.listen_pg_port, ) }) @@ -1144,8 +1144,8 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> let scheduling = subcommand_args.get_one("scheduling"); let availability = subcommand_args.get_one("availability"); - let attachment_service = AttachmentService::from_env(env); - attachment_service + let storage_controller = StorageController::from_env(env); + storage_controller .node_configure(NodeConfigureRequest { node_id: pageserver.conf.id, scheduling: scheduling.cloned(), @@ -1170,11 +1170,11 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Ok(()) } -async fn handle_attachment_service( +async fn handle_storage_controller( sub_match: &ArgMatches, env: &local_env::LocalEnv, ) -> Result<()> { - let svc = AttachmentService::from_env(env); + let svc = StorageController::from_env(env); match sub_match.subcommand() { Some(("start", _start_match)) => { if let Err(e) = svc.start().await { @@ -1194,8 +1194,8 @@ async fn handle_attachment_service( exit(1); } } - Some((sub_name, _)) => bail!("Unexpected attachment_service subcommand '{}'", sub_name), - None => bail!("no attachment_service subcommand provided"), + Some((sub_name, _)) => bail!("Unexpected storage_controller subcommand '{}'", sub_name), + None => bail!("no storage_controller subcommand provided"), } Ok(()) } @@ -1280,11 +1280,11 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> broker::start_broker_process(env).await?; - // Only 
start the attachment service if the pageserver is configured to need it + // Only start the storage controller if the pageserver is configured to need it if env.control_plane_api.is_some() { - let attachment_service = AttachmentService::from_env(env); - if let Err(e) = attachment_service.start().await { - eprintln!("attachment_service start failed: {:#}", e); + let storage_controller = StorageController::from_env(env); + if let Err(e) = storage_controller.start().await { + eprintln!("storage_controller start failed: {:#}", e); try_stop_all(env, true).await; exit(1); } @@ -1356,9 +1356,9 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { } if env.control_plane_api.is_some() { - let attachment_service = AttachmentService::from_env(env); - if let Err(e) = attachment_service.stop(immediate).await { - eprintln!("attachment service stop failed: {e:#}"); + let storage_controller = StorageController::from_env(env); + if let Err(e) = storage_controller.stop(immediate).await { + eprintln!("storage controller stop failed: {e:#}"); } } } @@ -1618,9 +1618,9 @@ fn cli() -> Command { ) ) .subcommand( - Command::new("attachment_service") + Command::new("storage_controller") .arg_required_else_help(true) - .about("Manage attachment_service") + .about("Manage storage_controller") .subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone())) .subcommand(Command::new("stop").about("Stop local pageserver") .arg(stop_mode_arg.clone())) diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index ac0a8417ae..646bc2e8bc 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -57,9 +57,9 @@ use serde::{Deserialize, Serialize}; use url::Host; use utils::id::{NodeId, TenantId, TimelineId}; -use crate::attachment_service::AttachmentService; use crate::local_env::LocalEnv; use crate::postgresql_conf::PostgresConf; +use crate::storage_controller::StorageController; use compute_api::responses::{ComputeState, ComputeStatus}; use compute_api::spec::{Cluster, ComputeFeature, ComputeMode, ComputeSpec}; @@ -750,17 +750,17 @@ impl Endpoint { let postgresql_conf = self.read_postgresql_conf()?; spec.cluster.postgresql_conf = Some(postgresql_conf); - // If we weren't given explicit pageservers, query the attachment service + // If we weren't given explicit pageservers, query the storage controller if pageservers.is_empty() { - let attachment_service = AttachmentService::from_env(&self.env); - let locate_result = attachment_service.tenant_locate(self.tenant_id).await?; + let storage_controller = StorageController::from_env(&self.env); + let locate_result = storage_controller.tenant_locate(self.tenant_id).await?; pageservers = locate_result .shards .into_iter() .map(|shard| { ( Host::parse(&shard.listen_pg_addr) - .expect("Attachment service reported bad hostname"), + .expect("Storage controller reported bad hostname"), shard.listen_pg_port, ) }) diff --git a/control_plane/src/lib.rs b/control_plane/src/lib.rs index bb79d36bfc..2af272f388 100644 --- a/control_plane/src/lib.rs +++ b/control_plane/src/lib.rs @@ -6,7 +6,6 @@ //! local installations. 
#![deny(clippy::undocumented_unsafe_blocks)] -pub mod attachment_service; mod background_process; pub mod broker; pub mod endpoint; @@ -14,3 +13,4 @@ pub mod local_env; pub mod pageserver; pub mod postgresql_conf; pub mod safekeeper; +pub mod storage_controller; diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 03270723a6..2e64489432 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -72,13 +72,13 @@ pub struct LocalEnv { #[serde(default)] pub safekeepers: Vec, - // Control plane upcall API for pageserver: if None, we will not run attachment_service. If set, this will + // Control plane upcall API for pageserver: if None, we will not run storage_controller If set, this will // be propagated into each pageserver's configuration. #[serde(default)] pub control_plane_api: Option, - // Control plane upcall API for attachment service. If set, this will be propagated into the - // attachment service's configuration. + // Control plane upcall API for storage controller. If set, this will be propagated into the + // storage controller's configuration. #[serde(default)] pub control_plane_compute_hook_api: Option, @@ -227,10 +227,10 @@ impl LocalEnv { self.neon_distrib_dir.join("pageserver") } - pub fn attachment_service_bin(&self) -> PathBuf { - // Irrespective of configuration, attachment service binary is always + pub fn storage_controller_bin(&self) -> PathBuf { + // Irrespective of configuration, storage controller binary is always // run from the same location as neon_local. This means that for compatibility - // tests that run old pageserver/safekeeper, they still run latest attachment service. + // tests that run old pageserver/safekeeper, they still run latest storage controller. let neon_local_bin_dir = env::current_exe().unwrap().parent().unwrap().to_owned(); neon_local_bin_dir.join("storage_controller") } diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index ae1bd60c52..021b9aca34 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -31,8 +31,8 @@ use utils::{ lsn::Lsn, }; -use crate::attachment_service::AttachmentService; use crate::local_env::PageServerConf; +use crate::storage_controller::StorageController; use crate::{background_process, local_env::LocalEnv}; /// Directory within .neon which will be used by default for LocalFs remote storage. @@ -111,7 +111,7 @@ impl PageServerNode { control_plane_api.as_str() )); - // Attachment service uses the same auth as pageserver: if JWT is enabled + // Storage controller uses the same auth as pageserver: if JWT is enabled // for us, we will also need it to talk to them. if matches!(self.conf.http_auth_type, AuthType::NeonJWT) { let jwt_token = self @@ -214,12 +214,12 @@ impl PageServerNode { // Register the node with the storage controller before starting pageserver: pageserver must be registered to // successfully call /re-attach and finish starting up. 
if register { - let attachment_service = AttachmentService::from_env(&self.env); + let storage_controller = StorageController::from_env(&self.env); let (pg_host, pg_port) = parse_host_port(&self.conf.listen_pg_addr).expect("Unable to parse listen_pg_addr"); let (http_host, http_port) = parse_host_port(&self.conf.listen_http_addr) .expect("Unable to parse listen_http_addr"); - attachment_service + storage_controller .node_register(NodeRegisterRequest { node_id: self.conf.id, listen_pg_addr: pg_host.to_string(), diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/storage_controller.rs similarity index 94% rename from control_plane/src/attachment_service.rs rename to control_plane/src/storage_controller.rs index 5c97561985..c505e67770 100644 --- a/control_plane/src/attachment_service.rs +++ b/control_plane/src/storage_controller.rs @@ -24,7 +24,7 @@ use utils::{ id::{NodeId, TenantId}, }; -pub struct AttachmentService { +pub struct StorageController { env: LocalEnv, listen: String, path: Utf8PathBuf, @@ -36,7 +36,7 @@ pub struct AttachmentService { const COMMAND: &str = "storage_controller"; -const ATTACHMENT_SERVICE_POSTGRES_VERSION: u32 = 16; +const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16; #[derive(Serialize, Deserialize)] pub struct AttachHookRequest { @@ -59,7 +59,7 @@ pub struct InspectResponse { pub attachment: Option<(u32, NodeId)>, } -impl AttachmentService { +impl StorageController { pub fn from_env(env: &LocalEnv) -> Self { let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone()) .unwrap() @@ -136,27 +136,27 @@ impl AttachmentService { } fn pid_file(&self) -> Utf8PathBuf { - Utf8PathBuf::from_path_buf(self.env.base_data_dir.join("attachment_service.pid")) + Utf8PathBuf::from_path_buf(self.env.base_data_dir.join("storage_controller.pid")) .expect("non-Unicode path") } - /// PIDFile for the postgres instance used to store attachment service state + /// PIDFile for the postgres instance used to store storage controller state fn postgres_pid_file(&self) -> Utf8PathBuf { Utf8PathBuf::from_path_buf( self.env .base_data_dir - .join("attachment_service_postgres.pid"), + .join("storage_controller_postgres.pid"), ) .expect("non-Unicode path") } /// Find the directory containing postgres binaries, such as `initdb` and `pg_ctl` /// - /// This usually uses ATTACHMENT_SERVICE_POSTGRES_VERSION of postgres, but will fall back + /// This usually uses STORAGE_CONTROLLER_POSTGRES_VERSION of postgres, but will fall back /// to other versions if that one isn't found. Some automated tests create circumstances /// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`. pub async fn get_pg_bin_dir(&self) -> anyhow::Result { - let prefer_versions = [ATTACHMENT_SERVICE_POSTGRES_VERSION, 15, 14]; + let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 15, 14]; for v in prefer_versions { let path = Utf8PathBuf::from_path_buf(self.env.pg_bin_dir(v)?).unwrap(); @@ -189,7 +189,7 @@ impl AttachmentService { /// /// Returns the database url pub async fn setup_database(&self) -> anyhow::Result { - const DB_NAME: &str = "attachment_service"; + const DB_NAME: &str = "storage_controller"; let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port); let pg_bin_dir = self.get_pg_bin_dir().await?; @@ -219,10 +219,10 @@ impl AttachmentService { } pub async fn start(&self) -> anyhow::Result<()> { - // Start a vanilla Postgres process used by the attachment service for persistence. 
+ // Start a vanilla Postgres process used by the storage controller for persistence. let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone()) .unwrap() - .join("attachment_service_db"); + .join("storage_controller_db"); let pg_bin_dir = self.get_pg_bin_dir().await?; let pg_log_path = pg_data_path.join("postgres.log"); @@ -245,7 +245,7 @@ impl AttachmentService { .await?; }; - println!("Starting attachment service database..."); + println!("Starting storage controller database..."); let db_start_args = [ "-w", "-D", @@ -256,7 +256,7 @@ impl AttachmentService { ]; background_process::start_process( - "attachment_service_db", + "storage_controller_db", &self.env.base_data_dir, pg_bin_dir.join("pg_ctl").as_std_path(), db_start_args, @@ -300,7 +300,7 @@ impl AttachmentService { background_process::start_process( COMMAND, &self.env.base_data_dir, - &self.env.attachment_service_bin(), + &self.env.storage_controller_bin(), args, [( "NEON_REPO_DIR".to_string(), @@ -322,10 +322,10 @@ impl AttachmentService { pub async fn stop(&self, immediate: bool) -> anyhow::Result<()> { background_process::stop_process(immediate, COMMAND, &self.pid_file())?; - let pg_data_path = self.env.base_data_dir.join("attachment_service_db"); + let pg_data_path = self.env.base_data_dir.join("storage_controller_db"); let pg_bin_dir = self.get_pg_bin_dir().await?; - println!("Stopping attachment service database..."); + println!("Stopping storage controller database..."); let pg_stop_args = ["-D", &pg_data_path.to_string_lossy(), "stop"]; let stop_status = Command::new(pg_bin_dir.join("pg_ctl")) .args(pg_stop_args) @@ -344,10 +344,10 @@ impl AttachmentService { // fine that stop failed. Otherwise it is an error that stop failed. const PG_STATUS_NOT_RUNNING: i32 = 3; if Some(PG_STATUS_NOT_RUNNING) == status_exitcode.code() { - println!("Attachment service data base is already stopped"); + println!("Storage controller database is already stopped"); return Ok(()); } else { - anyhow::bail!("Failed to stop attachment service database: {stop_status}") + anyhow::bail!("Failed to stop storage controller database: {stop_status}") } } @@ -368,7 +368,7 @@ impl AttachmentService { } } - /// Simple HTTP request wrapper for calling into attachment service + /// Simple HTTP request wrapper for calling into storage controller async fn dispatch( &self, method: hyper::Method, diff --git a/docs/authentication.md b/docs/authentication.md index faac7aa28e..522c5481b4 100644 --- a/docs/authentication.md +++ b/docs/authentication.md @@ -70,9 +70,9 @@ Should only be used e.g. for status check/tenant creation/list. Should only be used e.g. for status check. Currently also used for connection from any pageserver to any safekeeper. -"generations_api": Provides access to the upcall APIs served by the attachment service or the control plane. +"generations_api": Provides access to the upcall APIs served by the storage controller or the control plane. -"admin": Provides access to the control plane and admin APIs of the attachment service. +"admin": Provides access to the control plane and admin APIs of the storage controller. 
### CLI CLI generates a key pair during call to `neon_local init` with the following commands: diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 38e61239c5..c172354e9f 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -88,8 +88,6 @@ impl FromStr for NodeAvailability { } } -/// FIXME: this is a duplicate of the type in the attachment_service crate, because the -/// type needs to be defined with diesel traits in there. #[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)] pub enum NodeSchedulingPolicy { Active, diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 584d5fea48..234bfa8bf9 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1014,24 +1014,24 @@ class NeonEnv: self.initial_tenant = config.initial_tenant self.initial_timeline = config.initial_timeline - # Find two adjacent ports for attachment service and its postgres DB. This + # Find two adjacent ports for storage controller and its postgres DB. This # loop would eventually throw from get_port() if we run out of ports (extremely # unlikely): usually we find two adjacent free ports on the first iteration. while True: - self.attachment_service_port = self.port_distributor.get_port() - attachment_service_pg_port = self.port_distributor.get_port() - if attachment_service_pg_port == self.attachment_service_port + 1: + self.storage_controller_port = self.port_distributor.get_port() + storage_controller_pg_port = self.port_distributor.get_port() + if storage_controller_pg_port == self.storage_controller_port + 1: break # The URL for the pageserver to use as its control_plane_api config - self.control_plane_api: str = f"http://127.0.0.1:{self.attachment_service_port}/upcall/v1" - # The base URL of the attachment service - self.attachment_service_api: str = f"http://127.0.0.1:{self.attachment_service_port}" + self.control_plane_api: str = f"http://127.0.0.1:{self.storage_controller_port}/upcall/v1" + # The base URL of the storage controller + self.storage_controller_api: str = f"http://127.0.0.1:{self.storage_controller_port}" # For testing this with a fake HTTP server, enable passing through a URL from config self.control_plane_compute_hook_api = config.control_plane_compute_hook_api - self.attachment_service: NeonAttachmentService = NeonAttachmentService( + self.storage_controller: NeonStorageController = NeonStorageController( self, config.auth_enabled ) @@ -1113,16 +1113,16 @@ class NeonEnv: self.neon_cli.init(cfg, force=config.config_init_force) def start(self): - # Attachment service starts first, so that pageserver /re-attach calls don't + # storage controller starts first, so that pageserver /re-attach calls don't # bounce through retries on startup - self.attachment_service.start() + self.storage_controller.start() - def attachment_service_ready(): - assert self.attachment_service.ready() is True + def storage_controller_ready(): + assert self.storage_controller.ready() is True - # Wait for attachment service readiness to prevent unnecessary post start-up + # Wait for storage controller readiness to prevent unnecessary post start-up # reconcile. 
- wait_until(30, 1, attachment_service_ready) + wait_until(30, 1, storage_controller_ready) # Start up broker, pageserver and all safekeepers futs = [] @@ -1153,7 +1153,7 @@ class NeonEnv: if ps_assert_metric_no_errors: pageserver.assert_no_metric_errors() pageserver.stop(immediate=immediate) - self.attachment_service.stop(immediate=immediate) + self.storage_controller.stop(immediate=immediate) self.broker.stop(immediate=immediate) @property @@ -1188,9 +1188,9 @@ class NeonEnv: def get_tenant_pageserver(self, tenant_id: Union[TenantId, TenantShardId]): """ Get the NeonPageserver where this tenant shard is currently attached, according - to the attachment service. + to the storage controller. """ - meta = self.attachment_service.inspect(tenant_id) + meta = self.storage_controller.inspect(tenant_id) if meta is None: return None pageserver_id = meta[1] @@ -1697,12 +1697,12 @@ class NeonCli(AbstractNeonCli): res.check_returncode() return res - def attachment_service_start(self): - cmd = ["attachment_service", "start"] + def storage_controller_start(self): + cmd = ["storage_controller", "start"] return self.raw_cli(cmd) - def attachment_service_stop(self, immediate: bool): - cmd = ["attachment_service", "stop"] + def storage_controller_stop(self, immediate: bool): + cmd = ["storage_controller", "stop"] if immediate: cmd.extend(["-m", "immediate"]) return self.raw_cli(cmd) @@ -1942,14 +1942,14 @@ class Pagectl(AbstractNeonCli): return IndexPartDump.from_json(parsed) -class AttachmentServiceApiException(Exception): +class StorageControllerApiException(Exception): def __init__(self, message, status_code: int): super().__init__(message) self.message = message self.status_code = status_code -class NeonAttachmentService(MetricsGetter): +class NeonStorageController(MetricsGetter): def __init__(self, env: NeonEnv, auth_enabled: bool): self.env = env self.running = False @@ -1957,13 +1957,13 @@ class NeonAttachmentService(MetricsGetter): def start(self): assert not self.running - self.env.neon_cli.attachment_service_start() + self.env.neon_cli.storage_controller_start() self.running = True return self - def stop(self, immediate: bool = False) -> "NeonAttachmentService": + def stop(self, immediate: bool = False) -> "NeonStorageController": if self.running: - self.env.neon_cli.attachment_service_stop(immediate) + self.env.neon_cli.storage_controller_stop(immediate) self.running = False return self @@ -1976,22 +1976,22 @@ class NeonAttachmentService(MetricsGetter): msg = res.json()["msg"] except: # noqa: E722 msg = "" - raise AttachmentServiceApiException(msg, res.status_code) from e + raise StorageControllerApiException(msg, res.status_code) from e def pageserver_api(self) -> PageserverHttpClient: """ - The attachment service implements a subset of the pageserver REST API, for mapping + The storage controller implements a subset of the pageserver REST API, for mapping per-tenant actions into per-shard actions (e.g. timeline creation). Tests should invoke those functions via the HttpClient, as an implicit check that these APIs remain compatible. 
""" auth_token = None if self.auth_enabled: auth_token = self.env.auth_keys.generate_token(scope=TokenScope.PAGE_SERVER_API) - return PageserverHttpClient(self.env.attachment_service_port, lambda: True, auth_token) + return PageserverHttpClient(self.env.storage_controller_port, lambda: True, auth_token) def request(self, method, *args, **kwargs) -> requests.Response: resp = requests.request(method, *args, **kwargs) - NeonAttachmentService.raise_api_exception(resp) + NeonStorageController.raise_api_exception(resp) return resp @@ -2004,15 +2004,15 @@ class NeonAttachmentService(MetricsGetter): return headers def get_metrics(self) -> Metrics: - res = self.request("GET", f"{self.env.attachment_service_api}/metrics") + res = self.request("GET", f"{self.env.storage_controller_api}/metrics") return parse_metrics(res.text) def ready(self) -> bool: status = None try: - resp = self.request("GET", f"{self.env.attachment_service_api}/ready") + resp = self.request("GET", f"{self.env.storage_controller_api}/ready") status = resp.status_code - except AttachmentServiceApiException as e: + except StorageControllerApiException as e: status = e.status_code if status == 503: @@ -2027,7 +2027,7 @@ class NeonAttachmentService(MetricsGetter): ) -> int: response = self.request( "POST", - f"{self.env.attachment_service_api}/debug/v1/attach-hook", + f"{self.env.storage_controller_api}/debug/v1/attach-hook", json={"tenant_shard_id": str(tenant_shard_id), "node_id": pageserver_id}, headers=self.headers(TokenScope.ADMIN), ) @@ -2038,7 +2038,7 @@ class NeonAttachmentService(MetricsGetter): def attach_hook_drop(self, tenant_shard_id: Union[TenantId, TenantShardId]): self.request( "POST", - f"{self.env.attachment_service_api}/debug/v1/attach-hook", + f"{self.env.storage_controller_api}/debug/v1/attach-hook", json={"tenant_shard_id": str(tenant_shard_id), "node_id": None}, headers=self.headers(TokenScope.ADMIN), ) @@ -2049,7 +2049,7 @@ class NeonAttachmentService(MetricsGetter): """ response = self.request( "POST", - f"{self.env.attachment_service_api}/debug/v1/inspect", + f"{self.env.storage_controller_api}/debug/v1/inspect", json={"tenant_shard_id": str(tenant_shard_id)}, headers=self.headers(TokenScope.ADMIN), ) @@ -2070,7 +2070,7 @@ class NeonAttachmentService(MetricsGetter): log.info(f"node_register({body})") self.request( "POST", - f"{self.env.attachment_service_api}/control/v1/node", + f"{self.env.storage_controller_api}/control/v1/node", json=body, headers=self.headers(TokenScope.ADMIN), ) @@ -2078,7 +2078,7 @@ class NeonAttachmentService(MetricsGetter): def node_list(self): response = self.request( "GET", - f"{self.env.attachment_service_api}/control/v1/node", + f"{self.env.storage_controller_api}/control/v1/node", headers=self.headers(TokenScope.ADMIN), ) return response.json() @@ -2088,7 +2088,7 @@ class NeonAttachmentService(MetricsGetter): body["node_id"] = node_id self.request( "PUT", - f"{self.env.attachment_service_api}/control/v1/node/{node_id}/config", + f"{self.env.storage_controller_api}/control/v1/node/{node_id}/config", json=body, headers=self.headers(TokenScope.ADMIN), ) @@ -2118,7 +2118,7 @@ class NeonAttachmentService(MetricsGetter): response = self.request( "POST", - f"{self.env.attachment_service_api}/v1/tenant", + f"{self.env.storage_controller_api}/v1/tenant", json=body, headers=self.headers(TokenScope.PAGE_SERVER_API), ) @@ -2130,7 +2130,7 @@ class NeonAttachmentService(MetricsGetter): """ response = self.request( "GET", - 
f"{self.env.attachment_service_api}/control/v1/tenant/{tenant_id}/locate", + f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}/locate", headers=self.headers(TokenScope.ADMIN), ) body = response.json() @@ -2140,7 +2140,7 @@ class NeonAttachmentService(MetricsGetter): def tenant_shard_split(self, tenant_id: TenantId, shard_count: int) -> list[TenantShardId]: response = self.request( "PUT", - f"{self.env.attachment_service_api}/control/v1/tenant/{tenant_id}/shard_split", + f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}/shard_split", json={"new_shard_count": shard_count}, headers=self.headers(TokenScope.ADMIN), ) @@ -2152,7 +2152,7 @@ class NeonAttachmentService(MetricsGetter): def tenant_shard_migrate(self, tenant_shard_id: TenantShardId, dest_ps_id: int): self.request( "PUT", - f"{self.env.attachment_service_api}/control/v1/tenant/{tenant_shard_id}/migrate", + f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_shard_id}/migrate", json={"tenant_shard_id": str(tenant_shard_id), "node_id": dest_ps_id}, headers=self.headers(TokenScope.ADMIN), ) @@ -2165,12 +2165,12 @@ class NeonAttachmentService(MetricsGetter): """ self.request( "POST", - f"{self.env.attachment_service_api}/debug/v1/consistency_check", + f"{self.env.storage_controller_api}/debug/v1/consistency_check", headers=self.headers(TokenScope.ADMIN), ) - log.info("Attachment service passed consistency check") + log.info("storage controller passed consistency check") - def __enter__(self) -> "NeonAttachmentService": + def __enter__(self) -> "NeonStorageController": return self def __exit__( @@ -2401,7 +2401,7 @@ class NeonPageserver(PgProtocol): """ client = self.http_client() if generation is None: - generation = self.env.attachment_service.attach_hook_issue(tenant_id, self.id) + generation = self.env.storage_controller.attach_hook_issue(tenant_id, self.id) return client.tenant_attach( tenant_id, config, @@ -2410,14 +2410,14 @@ class NeonPageserver(PgProtocol): ) def tenant_detach(self, tenant_id: TenantId): - self.env.attachment_service.attach_hook_drop(tenant_id) + self.env.storage_controller.attach_hook_drop(tenant_id) client = self.http_client() return client.tenant_detach(tenant_id) def tenant_location_configure(self, tenant_id: TenantId, config: dict[str, Any], **kwargs): if config["mode"].startswith("Attached") and "generation" not in config: - config["generation"] = self.env.attachment_service.attach_hook_issue(tenant_id, self.id) + config["generation"] = self.env.storage_controller.attach_hook_issue(tenant_id, self.id) client = self.http_client() return client.tenant_location_conf(tenant_id, config, **kwargs) @@ -2441,14 +2441,14 @@ class NeonPageserver(PgProtocol): generation: Optional[int] = None, ) -> TenantId: if generation is None: - generation = self.env.attachment_service.attach_hook_issue(tenant_id, self.id) + generation = self.env.storage_controller.attach_hook_issue(tenant_id, self.id) client = self.http_client(auth_token=auth_token) return client.tenant_create(tenant_id, conf, generation=generation) def tenant_load(self, tenant_id: TenantId): client = self.http_client() return client.tenant_load( - tenant_id, generation=self.env.attachment_service.attach_hook_issue(tenant_id, self.id) + tenant_id, generation=self.env.storage_controller.attach_hook_issue(tenant_id, self.id) ) @@ -3907,7 +3907,7 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint psql_path = os.path.join(pg_bin.pg_bin_path, "psql") - pageserver_id = 
env.attachment_service.locate(endpoint.tenant_id)[0]["node_id"] + pageserver_id = env.storage_controller.locate(endpoint.tenant_id)[0]["node_id"] cmd = rf""" {psql_path} \ --no-psqlrc \ @@ -3994,7 +3994,7 @@ def tenant_get_shards( us to figure out the shards for a tenant. If the caller provides `pageserver_id`, it will be used for all shards, even - if the shard is indicated by attachment service to be on some other pageserver. + if the shard is indicated by storage controller to be on some other pageserver. Caller should over the response to apply their per-pageserver action to each shard @@ -4010,7 +4010,7 @@ def tenant_get_shards( TenantShardId.parse(s["shard_id"]), override_pageserver or env.get_pageserver(s["node_id"]), ) - for s in env.attachment_service.locate(tenant_id) + for s in env.storage_controller.locate(tenant_id) ] else: # Assume an unsharded tenant diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py index bbb4ccee5b..f47a3ea043 100644 --- a/test_runner/fixtures/pageserver/many_tenants.py +++ b/test_runner/fixtures/pageserver/many_tenants.py @@ -43,7 +43,7 @@ def single_timeline( log.info("detach template tenant form pageserver") env.pageserver.tenant_detach(template_tenant) env.pageserver.allowed_errors.append( - # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely + # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely ".*Dropped remote consistent LSN updates.*", ) diff --git a/test_runner/performance/pageserver/interactive/test_many_small_tenants.py b/test_runner/performance/pageserver/interactive/test_many_small_tenants.py index 3fb28ace46..0ff9c8fdaa 100644 --- a/test_runner/performance/pageserver/interactive/test_many_small_tenants.py +++ b/test_runner/performance/pageserver/interactive/test_many_small_tenants.py @@ -56,7 +56,7 @@ def setup_env( template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True) env.pageserver.tenant_detach(template_tenant) env.pageserver.allowed_errors.append( - # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely + # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely ".*Dropped remote consistent LSN updates.*", ) env.pageserver.tenant_attach(template_tenant, config) diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py index 921b7c5b76..c98fa44b1a 100644 --- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py +++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py @@ -92,7 +92,7 @@ def setup_tenant_template(env: NeonEnv, n_txns: int): template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True) env.pageserver.tenant_detach(template_tenant) env.pageserver.allowed_errors.append( - # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely + # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely ".*Dropped remote consistent LSN updates.*", ) env.pageserver.tenant_attach(template_tenant, config) diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py 
b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py index 8cd3569ea5..1a0012397c 100644 --- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py +++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py @@ -114,7 +114,7 @@ def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int): template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True) env.pageserver.tenant_detach(template_tenant) env.pageserver.allowed_errors.append( - # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely + # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely ".*Dropped remote consistent LSN updates.*", ) env.pageserver.tenant_attach(template_tenant, config) diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py index 72173dc2a7..9e3f602237 100644 --- a/test_runner/performance/test_bulk_insert.py +++ b/test_runner/performance/test_bulk_insert.py @@ -56,12 +56,12 @@ def measure_recovery_time(env: NeonCompare): # Delete the Tenant in the pageserver: this will drop local and remote layers, such that # when we "create" the Tenant again, we will replay the WAL from the beginning. # - # This is a "weird" thing to do, and can confuse the attachment service as we're re-using + # This is a "weird" thing to do, and can confuse the storage controller as we're re-using # the same tenant ID for a tenant that is logically different from the pageserver's point # of view, but the same as far as the safekeeper/WAL is concerned. To work around that, # we will explicitly create the tenant in the same generation that it was previously # attached in. - attach_status = env.env.attachment_service.inspect(tenant_shard_id=env.tenant) + attach_status = env.env.storage_controller.inspect(tenant_shard_id=env.tenant) assert attach_status is not None (attach_gen, _) = attach_status diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 7fbce6a10c..3058926b25 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -137,7 +137,7 @@ def test_no_config(positive_env: NeonEnv, content_type: Optional[str]): ps_http.tenant_detach(tenant_id) assert tenant_id not in [TenantId(t["id"]) for t in ps_http.tenant_list()] - body = {"generation": env.attachment_service.attach_hook_issue(tenant_id, env.pageserver.id)} + body = {"generation": env.storage_controller.attach_hook_issue(tenant_id, env.pageserver.id)} ps_http.post( f"{ps_http.base_url}/v1/tenant/{tenant_id}/attach", diff --git a/test_runner/regress/test_change_pageserver.py b/test_runner/regress/test_change_pageserver.py index adb67a579e..97ab69049d 100644 --- a/test_runner/regress/test_change_pageserver.py +++ b/test_runner/regress/test_change_pageserver.py @@ -85,9 +85,9 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder): # the endpoint. Whereas the previous reconfiguration was like a healthy migration, this # is more like what happens in an unexpected pageserver failure. 
# - # Since we're dual-attached, need to tip-off attachment service to treat the one we're + # Since we're dual-attached, need to tip-off storage controller to treat the one we're # about to start as the attached pageserver - env.attachment_service.attach_hook_issue(env.initial_tenant, env.pageservers[0].id) + env.storage_controller.attach_hook_issue(env.initial_tenant, env.pageservers[0].id) env.pageservers[0].start() env.pageservers[1].stop() @@ -97,9 +97,9 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder): assert fetchone() == (100000,) env.pageservers[0].stop() - # Since we're dual-attached, need to tip-off attachment service to treat the one we're + # Since we're dual-attached, need to tip-off storage controller to treat the one we're # about to start as the attached pageserver - env.attachment_service.attach_hook_issue(env.initial_tenant, env.pageservers[1].id) + env.storage_controller.attach_hook_issue(env.initial_tenant, env.pageservers[1].id) env.pageservers[1].start() # Test a (former) bug where a child process spins without updating its connection string diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 0ea76d447e..618ac63785 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -133,7 +133,7 @@ def test_create_snapshot( for sk in env.safekeepers: sk.stop() env.pageserver.stop() - env.attachment_service.stop() + env.storage_controller.stop() # Directory `compatibility_snapshot_dir` is uploaded to S3 in a workflow, keep the name in sync with it compatibility_snapshot_dir = ( diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index abdebb6d79..ca4295c5cb 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -159,7 +159,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): time.sleep(1.1) # so that we can use change in pre_stat.st_mtime to detect overwrites def get_generation_number(): - attachment = env.attachment_service.inspect(tenant_id) + attachment = env.storage_controller.inspect(tenant_id) assert attachment is not None return attachment[0] diff --git a/test_runner/regress/test_neon_cli.py b/test_runner/regress/test_neon_cli.py index 16d120e24a..cb69f0ef39 100644 --- a/test_runner/regress/test_neon_cli.py +++ b/test_runner/regress/test_neon_cli.py @@ -133,7 +133,7 @@ def test_cli_start_stop(neon_env_builder: NeonEnvBuilder): # Stop default ps/sk env.neon_cli.pageserver_stop(env.pageserver.id) env.neon_cli.safekeeper_stop() - env.neon_cli.attachment_service_stop(False) + env.neon_cli.storage_controller_stop(False) # Keep NeonEnv state up to date, it usually owns starting/stopping services env.pageserver.running = False @@ -175,7 +175,7 @@ def test_cli_start_stop_multi(neon_env_builder: NeonEnvBuilder): env.neon_cli.safekeeper_stop(neon_env_builder.safekeepers_id_start + 2) # Stop this to get out of the way of the following `start` - env.neon_cli.attachment_service_stop(False) + env.neon_cli.storage_controller_stop(False) # Default start res = env.neon_cli.raw_cli(["start"]) diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index e29db1e252..877deee08f 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -73,7 +73,7 @@ def check_client(env: NeonEnv, client: PageserverHttpClient): # create new tenant and check it is also there 
tenant_id = TenantId.generate() client.tenant_create( - tenant_id, generation=env.attachment_service.attach_hook_issue(tenant_id, env.pageserver.id) + tenant_id, generation=env.storage_controller.attach_hook_issue(tenant_id, env.pageserver.id) ) assert tenant_id in {TenantId(t["id"]) for t in client.tenant_list()} diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 89fc48a49f..d1acb9817e 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -203,7 +203,7 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): env.broker.try_start() for sk in env.safekeepers: sk.start() - env.attachment_service.start() + env.storage_controller.start() env.pageserver.start(overrides=('--pageserver-config-override=control_plane_api=""',)) @@ -285,7 +285,7 @@ def test_deferred_deletion(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_pageservers = 2 env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) - attached_to_id = env.attachment_service.locate(env.initial_tenant)[0]["node_id"] + attached_to_id = env.storage_controller.locate(env.initial_tenant)[0]["node_id"] main_pageserver = env.get_pageserver(attached_to_id) other_pageserver = [p for p in env.pageservers if p.id != attached_to_id][0] @@ -310,7 +310,7 @@ def test_deferred_deletion(neon_env_builder: NeonEnvBuilder): # Now advance the generation in the control plane: subsequent validations # from the running pageserver will fail. No more deletions should happen. - env.attachment_service.attach_hook_issue(env.initial_tenant, other_pageserver.id) + env.storage_controller.attach_hook_issue(env.initial_tenant, other_pageserver.id) generate_uploads_and_deletions(env, init=False, pageserver=main_pageserver) assert_deletion_queue(ps_http, lambda n: n > 0) @@ -366,7 +366,7 @@ def test_deletion_queue_recovery( neon_env_builder.num_pageservers = 2 env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) - attached_to_id = env.attachment_service.locate(env.initial_tenant)[0]["node_id"] + attached_to_id = env.storage_controller.locate(env.initial_tenant)[0]["node_id"] main_pageserver = env.get_pageserver(attached_to_id) other_pageserver = [p for p in env.pageservers if p.id != attached_to_id][0] @@ -428,7 +428,7 @@ def test_deletion_queue_recovery( if keep_attachment == KeepAttachment.LOSE: some_other_pageserver = other_pageserver.id - env.attachment_service.attach_hook_issue(env.initial_tenant, some_other_pageserver) + env.storage_controller.attach_hook_issue(env.initial_tenant, some_other_pageserver) main_pageserver.start() @@ -494,7 +494,7 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): ) # Simulate a major incident: the control plane goes offline - env.attachment_service.stop() + env.storage_controller.stop() # Remember how many validations had happened before the control plane went offline validated = get_deletion_queue_validated(ps_http) @@ -525,7 +525,7 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): assert get_deletion_queue_executed(ps_http) == 0 # When the control plane comes back up, normal service should resume - env.attachment_service.start() + env.storage_controller.start() ps_http.deletion_queue_flush(execute=True) assert get_deletion_queue_depth(ps_http) == 0 diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 8ba9d767dd..79145f61b3 100644 --- 
a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -157,7 +157,7 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): workload.churn_rows(rng.randint(128, 256), pageserver.id) workload.validate(pageserver.id) elif last_state_ps[0].startswith("Attached"): - # The `attachment_service` will only re-attach on startup when a pageserver was the + # The `storage_controller` will only re-attach on startup when a pageserver was the # holder of the latest generation: otherwise the pageserver will revert to detached # state if it was running attached with a stale generation last_state[pageserver.id] = ("Detached", None) @@ -182,12 +182,12 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): generation = last_state_ps[1] else: # Switch generations, while also jumping between attached states - generation = env.attachment_service.attach_hook_issue( + generation = env.storage_controller.attach_hook_issue( tenant_id, pageserver.id ) latest_attached = pageserver.id else: - generation = env.attachment_service.attach_hook_issue(tenant_id, pageserver.id) + generation = env.storage_controller.attach_hook_issue(tenant_id, pageserver.id) latest_attached = pageserver.id else: generation = None @@ -273,7 +273,7 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder): # Encourage the new location to download while still in secondary mode pageserver_b.http_client().tenant_secondary_download(tenant_id) - migrated_generation = env.attachment_service.attach_hook_issue(tenant_id, pageserver_b.id) + migrated_generation = env.storage_controller.attach_hook_issue(tenant_id, pageserver_b.id) log.info(f"Acquired generation {migrated_generation} for destination pageserver") assert migrated_generation == initial_generation + 1 @@ -436,7 +436,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): remote_storage_kind=RemoteStorageKind.MOCK_S3, ) env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) - assert env.attachment_service is not None + assert env.storage_controller is not None assert isinstance(env.pageserver_remote_storage, S3Storage) # Satisfy linter tenant_id = env.initial_tenant diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 06c13cc07d..05f769b0e3 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -169,7 +169,7 @@ def test_remote_storage_backup_and_restore( # Ensure that even though the tenant is broken, retrying the attachment fails with pytest.raises(Exception, match="Tenant state is Broken"): # Use same generation as in previous attempt - gen_state = env.attachment_service.inspect(tenant_id) + gen_state = env.storage_controller.inspect(tenant_id) assert gen_state is not None generation = gen_state[0] env.pageserver.tenant_attach(tenant_id, generation=generation) @@ -355,7 +355,7 @@ def test_remote_storage_upload_queue_retries( env.pageserver.stop(immediate=True) env.endpoints.stop_all() - # We are about to forcibly drop local dirs. Attachment service will increment generation in re-attach before + # We are about to forcibly drop local dirs. Storage controller will increment generation in re-attach before # we later increment when actually attaching it again, leading to skipping a generation and potentially getting # these warnings if there was a durable but un-executed deletion list at time of restart. 
env.pageserver.allowed_errors.extend( diff --git a/test_runner/regress/test_s3_restore.py b/test_runner/regress/test_s3_restore.py index aaa33f0bcb..611bd1c2a2 100644 --- a/test_runner/regress/test_s3_restore.py +++ b/test_runner/regress/test_s3_restore.py @@ -80,7 +80,7 @@ def test_tenant_s3_restore( assert ( ps_http.get_metric_value("pageserver_tenant_manager_slots") == 0 ), "tenant removed before we deletion was issued" - env.attachment_service.attach_hook_drop(tenant_id) + env.storage_controller.attach_hook_drop(tenant_id) tenant_path = env.pageserver.tenant_dir(tenant_id) assert not tenant_path.exists() @@ -103,7 +103,7 @@ def test_tenant_s3_restore( tenant_id, timestamp=ts_before_deletion, done_if_after=ts_after_deletion ) - generation = env.attachment_service.attach_hook_issue(tenant_id, env.pageserver.id) + generation = env.storage_controller.attach_hook_issue(tenant_id, env.pageserver.id) ps_http.tenant_attach(tenant_id, generation=generation) env.pageserver.quiesce_tenants() diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 57c8d1f849..1b96cd6a80 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -43,7 +43,7 @@ def test_sharding_smoke( tenant_id = env.initial_tenant pageservers = dict((int(p.id), p) for p in env.pageservers) - shards = env.attachment_service.locate(tenant_id) + shards = env.storage_controller.locate(tenant_id) def get_sizes(): sizes = {} @@ -86,7 +86,7 @@ def test_sharding_smoke( ) assert timelines == {env.initial_timeline, timeline_b} - env.attachment_service.consistency_check() + env.storage_controller.consistency_check() def test_sharding_split_unsharded( @@ -102,7 +102,7 @@ def test_sharding_split_unsharded( # Check that we created with an unsharded TenantShardId: this is the default, # but check it in case we change the default in future - assert env.attachment_service.inspect(TenantShardId(tenant_id, 0, 0)) is not None + assert env.storage_controller.inspect(TenantShardId(tenant_id, 0, 0)) is not None workload = Workload(env, tenant_id, timeline_id, branch_name="main") workload.init() @@ -110,15 +110,15 @@ def test_sharding_split_unsharded( workload.validate() # Split one shard into two - env.attachment_service.tenant_shard_split(tenant_id, shard_count=2) + env.storage_controller.tenant_shard_split(tenant_id, shard_count=2) # Check we got the shard IDs we expected - assert env.attachment_service.inspect(TenantShardId(tenant_id, 0, 2)) is not None - assert env.attachment_service.inspect(TenantShardId(tenant_id, 1, 2)) is not None + assert env.storage_controller.inspect(TenantShardId(tenant_id, 0, 2)) is not None + assert env.storage_controller.inspect(TenantShardId(tenant_id, 1, 2)) is not None workload.validate() - env.attachment_service.consistency_check() + env.storage_controller.consistency_check() def test_sharding_split_smoke( @@ -161,7 +161,7 @@ def test_sharding_split_smoke( workload.write_rows(256) # Note which pageservers initially hold a shard after tenant creation - pre_split_pageserver_ids = [loc["node_id"] for loc in env.attachment_service.locate(tenant_id)] + pre_split_pageserver_ids = [loc["node_id"] for loc in env.storage_controller.locate(tenant_id)] # For pageservers holding a shard, validate their ingest statistics # reflect a proper splitting of the WAL. 
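The `attach_hook_issue` calls in these tests all rely on the invariant spelled out in the comments above: every (re-)attachment bumps the tenant's generation in the storage controller, and only the pageserver holding the latest generation passes validation, which is why deferred deletions stop once another node is issued a newer generation. A toy model of that bookkeeping, purely illustrative and unrelated to the controller's real data structures:

```rust
use std::collections::HashMap;

// Toy model only: how an attach hook could hand out generations and how
// validation refuses pageservers that hold a stale one. Names are made up.
#[derive(Default)]
struct AttachState {
    // tenant -> (current generation, pageserver attached in that generation)
    attachments: HashMap<&'static str, (u32, u32)>,
}

impl AttachState {
    // Bump the generation and record the new rightful holder.
    fn attach_hook_issue(&mut self, tenant: &'static str, pageserver: u32) -> u32 {
        let entry = self.attachments.entry(tenant).or_insert((0, pageserver));
        entry.0 += 1;
        entry.1 = pageserver;
        entry.0
    }

    // Deletion validation: only the latest (generation, pageserver) pair may act.
    fn validate(&self, tenant: &str, pageserver: u32, generation: u32) -> bool {
        self.attachments
            .get(tenant)
            .map_or(false, |&(cur_gen, cur_ps)| cur_gen == generation && cur_ps == pageserver)
    }
}

fn main() {
    let mut state = AttachState::default();
    let gen1 = state.attach_hook_issue("tenant-a", 1);
    assert!(state.validate("tenant-a", 1, gen1));

    // Re-attaching elsewhere bumps the generation; the old holder's requests
    // stop validating, which is what the deferred-deletion tests count on.
    let gen2 = state.attach_hook_issue("tenant-a", 2);
    assert!(gen2 > gen1 && !state.validate("tenant-a", 1, gen1));
}
```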
@@ -213,9 +213,9 @@ def test_sharding_split_smoke( # Before split, old shards exist assert shards_on_disk(old_shard_ids) - env.attachment_service.tenant_shard_split(tenant_id, shard_count=split_shard_count) + env.storage_controller.tenant_shard_split(tenant_id, shard_count=split_shard_count) - post_split_pageserver_ids = [loc["node_id"] for loc in env.attachment_service.locate(tenant_id)] + post_split_pageserver_ids = [loc["node_id"] for loc in env.storage_controller.locate(tenant_id)] # We should have split into 8 shards, on the same 4 pageservers we started on. assert len(post_split_pageserver_ids) == split_shard_count assert len(set(post_split_pageserver_ids)) == shard_count @@ -261,7 +261,7 @@ def test_sharding_split_smoke( # Check that we didn't do any spurious reconciliations. # Total number of reconciles should have been one per original shard, plus # one for each shard that was migrated. - reconcile_ok = env.attachment_service.get_metric_value( + reconcile_ok = env.storage_controller.get_metric_value( "storage_controller_reconcile_complete_total", filter={"status": "ok"} ) assert reconcile_ok == shard_count + split_shard_count // 2 @@ -269,19 +269,19 @@ def test_sharding_split_smoke( # Check that no cancelled or errored reconciliations occurred: this test does no # failure injection and should run clean. assert ( - env.attachment_service.get_metric_value( + env.storage_controller.get_metric_value( "storage_controller_reconcile_complete_total", filter={"status": "cancel"} ) is None ) assert ( - env.attachment_service.get_metric_value( + env.storage_controller.get_metric_value( "storage_controller_reconcile_complete_total", filter={"status": "error"} ) is None ) - env.attachment_service.consistency_check() + env.storage_controller.consistency_check() # Validate pageserver state shards_exist: list[TenantShardId] = [] @@ -360,7 +360,7 @@ def test_sharding_ingest( huge_layer_count = 0 # Inspect the resulting layer map, count how many layers are undersized. 
- for shard in env.attachment_service.locate(tenant_id): + for shard in env.storage_controller.locate(tenant_id): pageserver = env.get_pageserver(shard["node_id"]) shard_id = shard["shard_id"] layer_map = pageserver.http_client().layer_map_info(shard_id, timeline_id) diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py index aecc244a47..6b7cd9d829 100644 --- a/test_runner/regress/test_sharding_service.py +++ b/test_runner/regress/test_sharding_service.py @@ -6,10 +6,10 @@ from typing import Any, Dict, List, Union import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( - AttachmentServiceApiException, NeonEnv, NeonEnvBuilder, PgBin, + StorageControllerApiException, TokenScope, ) from fixtures.pageserver.http import PageserverHttpClient @@ -36,7 +36,7 @@ from werkzeug.wrappers.response import Response def get_node_shard_counts(env: NeonEnv, tenant_ids): counts: defaultdict[str, int] = defaultdict(int) for tid in tenant_ids: - for shard in env.attachment_service.locate(tid): + for shard in env.storage_controller.locate(tid): counts[shard["node_id"]] += 1 return counts @@ -62,20 +62,20 @@ def test_sharding_service_smoke( # Start services by hand so that we can skip a pageserver (this will start + register later) env.broker.try_start() - env.attachment_service.start() + env.storage_controller.start() env.pageservers[0].start() env.pageservers[1].start() for sk in env.safekeepers: sk.start() # The pageservers we started should have registered with the sharding service on startup - nodes = env.attachment_service.node_list() + nodes = env.storage_controller.node_list() assert len(nodes) == 2 assert set(n["id"] for n in nodes) == {env.pageservers[0].id, env.pageservers[1].id} # Starting an additional pageserver should register successfully env.pageservers[2].start() - nodes = env.attachment_service.node_list() + nodes = env.storage_controller.node_list() assert len(nodes) == 3 assert set(n["id"] for n in nodes) == {ps.id for ps in env.pageservers} @@ -99,22 +99,22 @@ def test_sharding_service_smoke( # Creating and deleting timelines should work, using identical API to pageserver timeline_crud_tenant = next(iter(tenant_ids)) timeline_id = TimelineId.generate() - env.attachment_service.pageserver_api().timeline_create( + env.storage_controller.pageserver_api().timeline_create( pg_version=PgVersion.NOT_SET, tenant_id=timeline_crud_tenant, new_timeline_id=timeline_id ) - timelines = env.attachment_service.pageserver_api().timeline_list(timeline_crud_tenant) + timelines = env.storage_controller.pageserver_api().timeline_list(timeline_crud_tenant) assert len(timelines) == 2 assert timeline_id in set(TimelineId(t["timeline_id"]) for t in timelines) # virtual_ps_http.timeline_delete(tenant_id=timeline_crud_tenant, timeline_id=timeline_id) timeline_delete_wait_completed( - env.attachment_service.pageserver_api(), timeline_crud_tenant, timeline_id + env.storage_controller.pageserver_api(), timeline_crud_tenant, timeline_id ) - timelines = env.attachment_service.pageserver_api().timeline_list(timeline_crud_tenant) + timelines = env.storage_controller.pageserver_api().timeline_list(timeline_crud_tenant) assert len(timelines) == 1 assert timeline_id not in set(TimelineId(t["timeline_id"]) for t in timelines) # Marking a pageserver offline should migrate tenants away from it. 
- env.attachment_service.node_configure(env.pageservers[0].id, {"availability": "Offline"}) + env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Offline"}) def node_evacuated(node_id: int) -> None: counts = get_node_shard_counts(env, tenant_ids) @@ -124,7 +124,7 @@ def test_sharding_service_smoke( # Marking pageserver active should not migrate anything to it # immediately - env.attachment_service.node_configure(env.pageservers[0].id, {"availability": "Active"}) + env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Active"}) time.sleep(1) assert get_node_shard_counts(env, tenant_ids)[env.pageservers[0].id] == 0 @@ -144,13 +144,13 @@ def test_sharding_service_smoke( # Delete all the tenants for tid in tenant_ids: - tenant_delete_wait_completed(env.attachment_service.pageserver_api(), tid, 10) + tenant_delete_wait_completed(env.storage_controller.pageserver_api(), tid, 10) - env.attachment_service.consistency_check() + env.storage_controller.consistency_check() # Set a scheduling policy on one node, create all the tenants, observe # that the scheduling policy is respected. - env.attachment_service.node_configure(env.pageservers[1].id, {"scheduling": "Draining"}) + env.storage_controller.node_configure(env.pageservers[1].id, {"scheduling": "Draining"}) # Create some fresh tenants tenant_ids = set(TenantId.generate() for i in range(0, tenant_count)) @@ -163,7 +163,7 @@ def test_sharding_service_smoke( assert counts[env.pageservers[0].id] == tenant_shard_count // 2 assert counts[env.pageservers[2].id] == tenant_shard_count // 2 - env.attachment_service.consistency_check() + env.storage_controller.consistency_check() def test_node_status_after_restart( @@ -173,28 +173,28 @@ def test_node_status_after_restart( env = neon_env_builder.init_start() # Initially we have two online pageservers - nodes = env.attachment_service.node_list() + nodes = env.storage_controller.node_list() assert len(nodes) == 2 env.pageservers[1].stop() - env.attachment_service.stop() - env.attachment_service.start() + env.storage_controller.stop() + env.storage_controller.start() def is_ready(): - assert env.attachment_service.ready() is True + assert env.storage_controller.ready() is True wait_until(30, 1, is_ready) # We loaded nodes from database on restart - nodes = env.attachment_service.node_list() + nodes = env.storage_controller.node_list() assert len(nodes) == 2 # We should still be able to create a tenant, because the pageserver which is still online # should have had its availabilty state set to Active. 
- env.attachment_service.tenant_create(TenantId.generate()) + env.storage_controller.tenant_create(TenantId.generate()) - env.attachment_service.consistency_check() + env.storage_controller.consistency_check() def test_sharding_service_passthrough( @@ -208,9 +208,9 @@ def test_sharding_service_passthrough( neon_env_builder.num_pageservers = 2 env = neon_env_builder.init_start() - # We will talk to attachment service as if it was a pageserver, using the pageserver + # We will talk to storage controller as if it was a pageserver, using the pageserver # HTTP client - client = PageserverHttpClient(env.attachment_service_port, lambda: True) + client = PageserverHttpClient(env.storage_controller_port, lambda: True) timelines = client.timeline_list(tenant_id=env.initial_tenant) assert len(timelines) == 1 @@ -221,22 +221,22 @@ def test_sharding_service_passthrough( } assert status["state"]["slug"] == "Active" - env.attachment_service.consistency_check() + env.storage_controller.consistency_check() def test_sharding_service_restart(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() tenant_a = env.initial_tenant tenant_b = TenantId.generate() - env.attachment_service.tenant_create(tenant_b) + env.storage_controller.tenant_create(tenant_b) env.pageserver.tenant_detach(tenant_a) # TODO: extend this test to use multiple pageservers, and check that locations don't move around # on restart. - # Attachment service restart - env.attachment_service.stop() - env.attachment_service.start() + # Storage controller restart + env.storage_controller.stop() + env.storage_controller.start() observed = set(TenantId(tenant["id"]) for tenant in env.pageserver.http_client().tenant_list()) @@ -255,7 +255,7 @@ def test_sharding_service_restart(neon_env_builder: NeonEnvBuilder): assert tenant_a not in observed assert tenant_b in observed - env.attachment_service.consistency_check() + env.storage_controller.consistency_check() @pytest.mark.parametrize("warm_up", [True, False]) @@ -271,7 +271,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: # Start services by hand so that we can skip registration on one of the pageservers env = neon_env_builder.init_configs() env.broker.try_start() - env.attachment_service.start() + env.storage_controller.start() # This is the pageserver where we'll initially create the tenant. Run it in emergency # mode so that it doesn't talk to storage controller, and do not register it. 
@@ -286,12 +286,12 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: # will be attached after onboarding env.pageservers[1].start(register=True) dest_ps = env.pageservers[1] - virtual_ps_http = PageserverHttpClient(env.attachment_service_port, lambda: True) + virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) for sk in env.safekeepers: sk.start() - # Create a tenant directly via pageserver HTTP API, skipping the attachment service + # Create a tenant directly via pageserver HTTP API, skipping the storage controller tenant_id = TenantId.generate() generation = 123 origin_ps.http_client().tenant_create(tenant_id, generation=generation) @@ -324,7 +324,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: virtual_ps_http.tenant_secondary_download(tenant_id) - # Call into attachment service to onboard the tenant + # Call into storage controller to onboard the tenant generation += 1 virtual_ps_http.tenant_location_conf( tenant_id, @@ -347,7 +347,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: }, ) - # As if doing a live migration, call into the attachment service to + # As if doing a live migration, call into the storage controller to # set it to AttachedSingle: this is a no-op, but we test it because the # cloud control plane may call this for symmetry with live migration to # an individual pageserver @@ -375,8 +375,8 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: assert dest_tenants[0]["generation"] == generation + 1 # The onboarded tenant should survive a restart of sharding service - env.attachment_service.stop() - env.attachment_service.start() + env.storage_controller.stop() + env.storage_controller.start() # The onboarded tenant should surviev a restart of pageserver dest_ps.stop() @@ -407,7 +407,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: dest_tenant_conf_after = dest_ps.http_client().tenant_config(tenant_id) assert dest_tenant_conf_after.tenant_specific_overrides == modified_tenant_conf - env.attachment_service.consistency_check() + env.storage_controller.consistency_check() def test_sharding_service_compute_hook( @@ -419,7 +419,7 @@ def test_sharding_service_compute_hook( Test that the sharding service calls out to the configured HTTP endpoint on attachment changes """ - # We will run two pageserver to migrate and check that the attachment service sends notifications + # We will run two pageserver to migrate and check that the storage controller sends notifications # when migrating. 
neon_env_builder.num_pageservers = 2 (host, port) = httpserver_listen_address @@ -450,7 +450,7 @@ def test_sharding_service_compute_hook( } assert notifications[0] == expect - env.attachment_service.node_configure(env.pageservers[0].id, {"availability": "Offline"}) + env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Offline"}) def node_evacuated(node_id: int) -> None: counts = get_node_shard_counts(env, [env.initial_tenant]) @@ -473,8 +473,8 @@ def test_sharding_service_compute_hook( wait_until(20, 0.25, received_migration_notification) # When we restart, we should re-emit notifications for all tenants - env.attachment_service.stop() - env.attachment_service.start() + env.storage_controller.stop() + env.storage_controller.start() def received_restart_notification(): assert len(notifications) == 3 @@ -483,7 +483,7 @@ def test_sharding_service_compute_hook( wait_until(10, 1, received_restart_notification) # Splitting a tenant should cause its stripe size to become visible in the compute notification - env.attachment_service.tenant_shard_split(env.initial_tenant, shard_count=2) + env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=2) expect = { "tenant_id": str(env.initial_tenant), "stripe_size": 32768, @@ -499,7 +499,7 @@ def test_sharding_service_compute_hook( wait_until(10, 1, received_split_notification) - env.attachment_service.consistency_check() + env.storage_controller.consistency_check() def test_sharding_service_debug_apis(neon_env_builder: NeonEnvBuilder): @@ -512,55 +512,55 @@ def test_sharding_service_debug_apis(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() tenant_id = TenantId.generate() - env.attachment_service.tenant_create(tenant_id, shard_count=2, shard_stripe_size=8192) + env.storage_controller.tenant_create(tenant_id, shard_count=2, shard_stripe_size=8192) # Check that the consistency check passes on a freshly setup system - env.attachment_service.consistency_check() + env.storage_controller.consistency_check() - # These APIs are intentionally not implemented as methods on NeonAttachmentService, as + # These APIs are intentionally not implemented as methods on NeonStorageController, as # they're just for use in unanticipated circumstances. 
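For orientation, the debug endpoints exercised just below are ordinary authenticated HTTP calls. A standalone sketch of the same two read-only requests from Rust, assuming a locally running storage controller; the base URL and token are placeholders, and it presumes `reqwest` built with its `json` feature plus `serde_json`:

```rust
use reqwest::Client;
use serde_json::Value;

// Placeholder values: point these at a locally running storage controller.
const API: &str = "http://127.0.0.1:1234";
const ADMIN_TOKEN: &str = "<admin-scope JWT>";

#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    let client = Client::new();

    // List tenant shards known to the controller (a JSON array of shards).
    let tenants: Value = client
        .get(format!("{API}/debug/v1/tenant"))
        .bearer_auth(ADMIN_TOKEN)
        .send()
        .await?
        .json()
        .await?;
    println!("{} tenant shards", tenants.as_array().map_or(0, |a| a.len()));

    // Scheduler dump: nodes keyed by node id, each carrying a shard_count.
    let scheduler: Value = client
        .get(format!("{API}/debug/v1/scheduler"))
        .bearer_auth(ADMIN_TOKEN)
        .send()
        .await?
        .json()
        .await?;
    println!("{}", scheduler["nodes"]);

    Ok(())
}
```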
# Initial tenant (1 shard) and the one we just created (2 shards) should be visible - response = env.attachment_service.request( + response = env.storage_controller.request( "GET", - f"{env.attachment_service_api}/debug/v1/tenant", - headers=env.attachment_service.headers(TokenScope.ADMIN), + f"{env.storage_controller_api}/debug/v1/tenant", + headers=env.storage_controller.headers(TokenScope.ADMIN), ) assert len(response.json()) == 3 # Scheduler should report the expected nodes and shard counts - response = env.attachment_service.request( - "GET", f"{env.attachment_service_api}/debug/v1/scheduler" + response = env.storage_controller.request( + "GET", f"{env.storage_controller_api}/debug/v1/scheduler" ) # Two nodes, in a dict of node_id->node assert len(response.json()["nodes"]) == 2 assert sum(v["shard_count"] for v in response.json()["nodes"].values()) == 3 assert all(v["may_schedule"] for v in response.json()["nodes"].values()) - response = env.attachment_service.request( + response = env.storage_controller.request( "POST", - f"{env.attachment_service_api}/debug/v1/node/{env.pageservers[1].id}/drop", - headers=env.attachment_service.headers(TokenScope.ADMIN), + f"{env.storage_controller_api}/debug/v1/node/{env.pageservers[1].id}/drop", + headers=env.storage_controller.headers(TokenScope.ADMIN), ) - assert len(env.attachment_service.node_list()) == 1 + assert len(env.storage_controller.node_list()) == 1 - response = env.attachment_service.request( + response = env.storage_controller.request( "POST", - f"{env.attachment_service_api}/debug/v1/tenant/{tenant_id}/drop", - headers=env.attachment_service.headers(TokenScope.ADMIN), + f"{env.storage_controller_api}/debug/v1/tenant/{tenant_id}/drop", + headers=env.storage_controller.headers(TokenScope.ADMIN), ) # Tenant drop should be reflected in dump output - response = env.attachment_service.request( + response = env.storage_controller.request( "GET", - f"{env.attachment_service_api}/debug/v1/tenant", - headers=env.attachment_service.headers(TokenScope.ADMIN), + f"{env.storage_controller_api}/debug/v1/tenant", + headers=env.storage_controller.headers(TokenScope.ADMIN), ) assert len(response.json()) == 1 # Check that the 'drop' APIs didn't leave things in a state that would fail a consistency check: they're # meant to be unclean wrt the pageserver state, but not leave a broken storage controller behind. 
- env.attachment_service.consistency_check() + env.storage_controller.consistency_check() def test_sharding_service_s3_time_travel_recovery( @@ -584,10 +584,10 @@ def test_sharding_service_s3_time_travel_recovery( neon_env_builder.num_pageservers = 1 env = neon_env_builder.init_start() - virtual_ps_http = PageserverHttpClient(env.attachment_service_port, lambda: True) + virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) tenant_id = TenantId.generate() - env.attachment_service.tenant_create( + env.storage_controller.tenant_create( tenant_id, shard_count=2, shard_stripe_size=8192, @@ -595,7 +595,7 @@ def test_sharding_service_s3_time_travel_recovery( ) # Check that the consistency check passes - env.attachment_service.consistency_check() + env.storage_controller.consistency_check() branch_name = "main" timeline_id = env.neon_cli.create_timeline( @@ -670,28 +670,28 @@ def test_sharding_service_s3_time_travel_recovery( with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: endpoint.safe_psql("SELECT * FROM created_foo;") - env.attachment_service.consistency_check() + env.storage_controller.consistency_check() def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder): neon_env_builder.auth_enabled = True env = neon_env_builder.init_start() - svc = env.attachment_service - api = env.attachment_service_api + svc = env.storage_controller + api = env.storage_controller_api tenant_id = TenantId.generate() body: Dict[str, Any] = {"new_tenant_id": str(tenant_id)} # No token with pytest.raises( - AttachmentServiceApiException, + StorageControllerApiException, match="Unauthorized: missing authorization header", ): - svc.request("POST", f"{env.attachment_service_api}/v1/tenant", json=body) + svc.request("POST", f"{env.storage_controller_api}/v1/tenant", json=body) # Token with incorrect scope with pytest.raises( - AttachmentServiceApiException, + StorageControllerApiException, match="Forbidden: JWT authentication error", ): svc.request("POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.ADMIN)) @@ -703,14 +703,14 @@ def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder): # No token with pytest.raises( - AttachmentServiceApiException, + StorageControllerApiException, match="Unauthorized: missing authorization header", ): svc.request("GET", f"{api}/debug/v1/tenant") # Token with incorrect scope with pytest.raises( - AttachmentServiceApiException, + StorageControllerApiException, match="Forbidden: JWT authentication error", ): svc.request( @@ -719,14 +719,14 @@ def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder): # No token with pytest.raises( - AttachmentServiceApiException, + StorageControllerApiException, match="Unauthorized: missing authorization header", ): svc.request("POST", f"{api}/upcall/v1/re-attach") # Token with incorrect scope with pytest.raises( - AttachmentServiceApiException, + StorageControllerApiException, match="Forbidden: JWT authentication error", ): svc.request( @@ -743,7 +743,7 @@ def test_sharding_service_tenant_conf(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() tenant_id = env.initial_tenant - http = env.attachment_service.pageserver_api() + http = env.storage_controller.pageserver_api() default_value = "7days" new_value = "1h" @@ -769,4 +769,4 @@ def test_sharding_service_tenant_conf(neon_env_builder: NeonEnvBuilder): assert readback_ps.effective_config["pitr_interval"] == default_value assert "pitr_interval" not in readback_ps.tenant_specific_overrides - 
env.attachment_service.consistency_check() + env.storage_controller.consistency_check() diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index cbf7059c92..205ca18050 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -1011,7 +1011,7 @@ def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder): resp = client.tenant_status(eager_tenant) assert resp["state"]["slug"] == "Active" - gen = env.attachment_service.attach_hook_issue(eager_tenant, env.pageserver.id) + gen = env.storage_controller.attach_hook_issue(eager_tenant, env.pageserver.id) client.tenant_location_conf( eager_tenant, { @@ -1071,7 +1071,7 @@ def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_met # attach, it will consume the only permit because logical size calculation # is paused. - gen = env.attachment_service.attach_hook_issue(lazy_tenant, env.pageserver.id) + gen = env.storage_controller.attach_hook_issue(lazy_tenant, env.pageserver.id) client.tenant_location_conf( lazy_tenant, { From 09699d4bd883d9e1753dcff22406d8455b5f133e Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 12 Mar 2024 11:52:00 +0000 Subject: [PATCH 0385/1571] proxy: cancel http queries on timeout (#7031) ## Problem On HTTP query timeout, we should try and cancel the current in-flight SQL query. ## Summary of changes Trigger a cancellation command in postgres once the timeout is reach --- proxy/src/serverless/conn_pool.rs | 9 +- proxy/src/serverless/sql_over_http.rs | 313 +++++++++++++++++--------- test_runner/fixtures/neon_fixtures.py | 6 + test_runner/regress/test_proxy.py | 32 +++ 4 files changed, 242 insertions(+), 118 deletions(-) diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 73f213d074..901e30224b 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -612,13 +612,6 @@ impl Client { let inner = inner.as_mut().expect("client inner should not be removed"); (&mut inner.inner, Discard { pool, conn_info }) } - - pub fn check_idle(&mut self, status: ReadyForQueryStatus) { - self.inner().1.check_idle(status) - } - pub fn discard(&mut self) { - self.inner().1.discard() - } } impl Discard<'_, C> { @@ -739,7 +732,7 @@ mod tests { { let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); assert_eq!(0, pool.get_global_connections_count()); - client.discard(); + client.inner().1.discard(); // Discard should not add the connection from the pool. 
assert_eq!(0, pool.get_global_connections_count()); } diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 74af985211..20d9795b47 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -1,6 +1,10 @@ +use std::pin::pin; use std::sync::Arc; use anyhow::bail; +use futures::future::select; +use futures::future::try_join; +use futures::future::Either; use futures::StreamExt; use hyper::body::HttpBody; use hyper::header; @@ -11,13 +15,16 @@ use hyper::StatusCode; use hyper::{Body, HeaderMap, Request}; use serde_json::json; use serde_json::Value; -use tokio::try_join; +use tokio::time; use tokio_postgres::error::DbError; use tokio_postgres::error::ErrorPosition; +use tokio_postgres::error::SqlState; use tokio_postgres::GenericClient; use tokio_postgres::IsolationLevel; +use tokio_postgres::NoTls; use tokio_postgres::ReadyForQueryStatus; use tokio_postgres::Transaction; +use tokio_util::sync::CancellationToken; use tracing::error; use tracing::info; use url::Url; @@ -194,108 +201,111 @@ pub async fn handle( request: Request, backend: Arc, ) -> Result, ApiError> { - let result = tokio::time::timeout( - config.http_config.request_timeout, - handle_inner(config, &mut ctx, request, backend), - ) - .await; + let cancel = CancellationToken::new(); + let cancel2 = cancel.clone(); + let handle = tokio::spawn(async move { + time::sleep(config.http_config.request_timeout).await; + cancel2.cancel(); + }); + + let result = handle_inner(cancel, config, &mut ctx, request, backend).await; + handle.abort(); + let mut response = match result { - Ok(r) => match r { - Ok(r) => { - ctx.set_success(); - r + Ok(Ok(r)) => { + ctx.set_success(); + r + } + Err(e) => { + // TODO: ctx.set_error_kind(e.get_error_type()); + + let mut message = format!("{:?}", e); + let db_error = e + .downcast_ref::() + .and_then(|e| e.as_db_error()); + fn get<'a, T: serde::Serialize>( + db: Option<&'a DbError>, + x: impl FnOnce(&'a DbError) -> T, + ) -> Value { + db.map(x) + .and_then(|t| serde_json::to_value(t).ok()) + .unwrap_or_default() } - Err(e) => { - // TODO: ctx.set_error_kind(e.get_error_type()); - let mut message = format!("{:?}", e); - let db_error = e - .downcast_ref::() - .and_then(|e| e.as_db_error()); - fn get<'a, T: serde::Serialize>( - db: Option<&'a DbError>, - x: impl FnOnce(&'a DbError) -> T, - ) -> Value { - db.map(x) - .and_then(|t| serde_json::to_value(t).ok()) - .unwrap_or_default() - } - - if let Some(db_error) = db_error { - db_error.message().clone_into(&mut message); - } - - let position = db_error.and_then(|db| db.position()); - let (position, internal_position, internal_query) = match position { - Some(ErrorPosition::Original(position)) => ( - Value::String(position.to_string()), - Value::Null, - Value::Null, - ), - Some(ErrorPosition::Internal { position, query }) => ( - Value::Null, - Value::String(position.to_string()), - Value::String(query.clone()), - ), - None => (Value::Null, Value::Null, Value::Null), - }; - - let code = get(db_error, |db| db.code().code()); - let severity = get(db_error, |db| db.severity()); - let detail = get(db_error, |db| db.detail()); - let hint = get(db_error, |db| db.hint()); - let where_ = get(db_error, |db| db.where_()); - let table = get(db_error, |db| db.table()); - let column = get(db_error, |db| db.column()); - let schema = get(db_error, |db| db.schema()); - let datatype = get(db_error, |db| db.datatype()); - let constraint = get(db_error, |db| db.constraint()); - let file = get(db_error, 
|db| db.file()); - let line = get(db_error, |db| db.line().map(|l| l.to_string())); - let routine = get(db_error, |db| db.routine()); - - error!( - ?code, - "sql-over-http per-client task finished with an error: {e:#}" - ); - // TODO: this shouldn't always be bad request. - json_response( - StatusCode::BAD_REQUEST, - json!({ - "message": message, - "code": code, - "detail": detail, - "hint": hint, - "position": position, - "internalPosition": internal_position, - "internalQuery": internal_query, - "severity": severity, - "where": where_, - "table": table, - "column": column, - "schema": schema, - "dataType": datatype, - "constraint": constraint, - "file": file, - "line": line, - "routine": routine, - }), - )? + if let Some(db_error) = db_error { + db_error.message().clone_into(&mut message); } - }, - Err(_) => { + + let position = db_error.and_then(|db| db.position()); + let (position, internal_position, internal_query) = match position { + Some(ErrorPosition::Original(position)) => ( + Value::String(position.to_string()), + Value::Null, + Value::Null, + ), + Some(ErrorPosition::Internal { position, query }) => ( + Value::Null, + Value::String(position.to_string()), + Value::String(query.clone()), + ), + None => (Value::Null, Value::Null, Value::Null), + }; + + let code = get(db_error, |db| db.code().code()); + let severity = get(db_error, |db| db.severity()); + let detail = get(db_error, |db| db.detail()); + let hint = get(db_error, |db| db.hint()); + let where_ = get(db_error, |db| db.where_()); + let table = get(db_error, |db| db.table()); + let column = get(db_error, |db| db.column()); + let schema = get(db_error, |db| db.schema()); + let datatype = get(db_error, |db| db.datatype()); + let constraint = get(db_error, |db| db.constraint()); + let file = get(db_error, |db| db.file()); + let line = get(db_error, |db| db.line().map(|l| l.to_string())); + let routine = get(db_error, |db| db.routine()); + + error!( + ?code, + "sql-over-http per-client task finished with an error: {e:#}" + ); + // TODO: this shouldn't always be bad request. + json_response( + StatusCode::BAD_REQUEST, + json!({ + "message": message, + "code": code, + "detail": detail, + "hint": hint, + "position": position, + "internalPosition": internal_position, + "internalQuery": internal_query, + "severity": severity, + "where": where_, + "table": table, + "column": column, + "schema": schema, + "dataType": datatype, + "constraint": constraint, + "file": file, + "line": line, + "routine": routine, + }), + )? + } + Ok(Err(Cancelled())) => { // TODO: when http error classification is done, distinguish between // timeout on sql vs timeout in proxy/cplane // ctx.set_error_kind(crate::error::ErrorKind::RateLimit); let message = format!( - "HTTP-Connection timed out, execution time exceeded {} seconds", - config.http_config.request_timeout.as_secs() + "Query cancelled, runtime exceeded. SQL queries over HTTP must not exceed {} seconds of runtime. Please consider using our websocket based connections", + config.http_config.request_timeout.as_secs_f64() ); error!(message); json_response( - StatusCode::GATEWAY_TIMEOUT, - json!({ "message": message, "code": StatusCode::GATEWAY_TIMEOUT.as_u16() }), + StatusCode::BAD_REQUEST, + json!({ "message": message, "code": SqlState::PROTOCOL_VIOLATION.code() }), )? 
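The `Cancelled` arm above is reached because the handler no longer wraps the whole request in `tokio::time::timeout`; instead it spawns a deadline task that fires a `CancellationToken`, and each stage races its own future against `cancel.cancelled()` via `select` and `pin!`. A minimal self-contained sketch of that race, using the same crates the proxy already imports here (`futures`, `tokio`, `tokio-util`); the names are illustrative, not the proxy's actual types:

```rust
use std::pin::pin;
use std::time::Duration;

use futures::future::{select, Either};
use tokio_util::sync::CancellationToken;

struct Cancelled();

// Race `work` against the token: whichever finishes first wins.
async fn run_cancellable<F: std::future::Future>(
    cancel: &CancellationToken,
    work: F,
) -> Result<F::Output, Cancelled> {
    match select(pin!(work), pin!(cancel.cancelled())).await {
        Either::Left((value, _)) => Ok(value),
        Either::Right(((), _)) => Err(Cancelled()),
    }
}

#[tokio::main]
async fn main() {
    let cancel = CancellationToken::new();

    // Deadline task: after the request timeout, cancel whatever is still running.
    let deadline = cancel.clone();
    tokio::spawn(async move {
        tokio::time::sleep(Duration::from_secs(15)).await;
        deadline.cancel();
    });

    // A short piece of work finishes before the deadline and wins the race.
    let res = run_cancellable(&cancel, async {
        tokio::time::sleep(Duration::from_millis(10)).await;
        42
    })
    .await;
    assert!(matches!(res, Ok(42)));
}
```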
} }; @@ -307,12 +317,15 @@ pub async fn handle( Ok(response) } +struct Cancelled(); + async fn handle_inner( + cancel: CancellationToken, config: &'static ProxyConfig, ctx: &mut RequestMonitoring, request: Request, backend: Arc, -) -> anyhow::Result> { +) -> Result, Cancelled>, anyhow::Error> { let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE .with_label_values(&[ctx.protocol]) .guard(); @@ -389,7 +402,18 @@ async fn handle_inner( }; // Run both operations in parallel - let (payload, mut client) = try_join!(fetch_and_process_request, authenticate_and_connect)?; + let (payload, mut client) = match select( + try_join( + pin!(fetch_and_process_request), + pin!(authenticate_and_connect), + ), + pin!(cancel.cancelled()), + ) + .await + { + Either::Left((result, _cancelled)) => result?, + Either::Right((_cancelled, _)) => return Ok(Err(Cancelled())), + }; let mut response = Response::builder() .status(StatusCode::OK) @@ -401,19 +425,60 @@ async fn handle_inner( let mut size = 0; let result = match payload { Payload::Single(stmt) => { - let (status, results) = - query_to_json(&*client, stmt, &mut 0, raw_output, default_array_mode) - .await - .map_err(|e| { - client.discard(); - e - })?; - client.check_idle(status); - results + let mut size = 0; + let (inner, mut discard) = client.inner(); + let cancel_token = inner.cancel_token(); + let query = pin!(query_to_json( + &*inner, + stmt, + &mut size, + raw_output, + default_array_mode + )); + let cancelled = pin!(cancel.cancelled()); + let res = select(query, cancelled).await; + match res { + Either::Left((Ok((status, results)), _cancelled)) => { + discard.check_idle(status); + results + } + Either::Left((Err(e), _cancelled)) => { + discard.discard(); + return Err(e); + } + Either::Right((_cancelled, query)) => { + if let Err(err) = cancel_token.cancel_query(NoTls).await { + tracing::error!(?err, "could not cancel query"); + } + match time::timeout(time::Duration::from_millis(100), query).await { + Ok(Ok((status, results))) => { + discard.check_idle(status); + results + } + Ok(Err(error)) => { + let db_error = error + .downcast_ref::() + .and_then(|e| e.as_db_error()); + + // if errored for some other reason, it might not be safe to return + if !db_error.is_some_and(|e| *e.code() == SqlState::QUERY_CANCELED) { + discard.discard(); + } + + return Ok(Err(Cancelled())); + } + Err(_timeout) => { + discard.discard(); + return Ok(Err(Cancelled())); + } + } + } + } } Payload::Batch(statements) => { info!("starting transaction"); let (inner, mut discard) = client.inner(); + let cancel_token = inner.cancel_token(); let mut builder = inner.build_transaction(); if let Some(isolation_level) = txn_isolation_level { builder = builder.isolation_level(isolation_level); @@ -433,6 +498,7 @@ async fn handle_inner( })?; let results = match query_batch( + cancel.child_token(), &transaction, statements, &mut size, @@ -441,7 +507,7 @@ async fn handle_inner( ) .await { - Ok(results) => { + Ok(Ok(results)) => { info!("commit"); let status = transaction.commit().await.map_err(|e| { // if we cannot commit - for now don't return connection to pool @@ -452,6 +518,15 @@ async fn handle_inner( discard.check_idle(status); results } + Ok(Err(Cancelled())) => { + if let Err(err) = cancel_token.cancel_query(NoTls).await { + tracing::error!(?err, "could not cancel query"); + } + // TODO: after cancelling, wait to see if we can get a status. maybe the connection is still safe. 
+ discard.discard(); + + return Ok(Err(Cancelled())); + } Err(err) => { info!("rollback"); let status = transaction.rollback().await.map_err(|e| { @@ -499,26 +574,44 @@ async fn handle_inner( // moving this later in the stack is going to be a lot of effort and ehhhh metrics.record_egress(len as u64); - Ok(response) + Ok(Ok(response)) } async fn query_batch( + cancel: CancellationToken, transaction: &Transaction<'_>, queries: BatchQueryData, total_size: &mut usize, raw_output: bool, array_mode: bool, -) -> anyhow::Result> { +) -> anyhow::Result, Cancelled>> { let mut results = Vec::with_capacity(queries.queries.len()); let mut current_size = 0; for stmt in queries.queries { - // TODO: maybe we should check that the transaction bit is set here - let (_, values) = - query_to_json(transaction, stmt, &mut current_size, raw_output, array_mode).await?; - results.push(values); + let query = pin!(query_to_json( + transaction, + stmt, + &mut current_size, + raw_output, + array_mode + )); + let cancelled = pin!(cancel.cancelled()); + let res = select(query, cancelled).await; + match res { + // TODO: maybe we should check that the transaction bit is set here + Either::Left((Ok((_, values)), _cancelled)) => { + results.push(values); + } + Either::Left((Err(e), _cancelled)) => { + return Err(e); + } + Either::Right((_cancelled, _)) => { + return Ok(Err(Cancelled())); + } + } } *total_size += current_size; - Ok(results) + Ok(Ok(results)) } async fn query_to_json( diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 234bfa8bf9..b7196a2556 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2859,6 +2859,7 @@ class NeonProxy(PgProtocol): self.auth_backend = auth_backend self.metric_collection_endpoint = metric_collection_endpoint self.metric_collection_interval = metric_collection_interval + self.http_timeout_seconds = 15 self._popen: Optional[subprocess.Popen[bytes]] = None def start(self) -> NeonProxy: @@ -2897,6 +2898,7 @@ class NeonProxy(PgProtocol): *["--proxy", f"{self.host}:{self.proxy_port}"], *["--mgmt", f"{self.host}:{self.mgmt_port}"], *["--wss", f"{self.host}:{self.external_http_port}"], + *["--sql-over-http-timeout", f"{self.http_timeout_seconds}s"], *["-c", str(crt_path)], *["-k", str(key_path)], *self.auth_backend.extra_args(), @@ -2937,6 +2939,8 @@ class NeonProxy(PgProtocol): password = quote(kwargs["password"]) expected_code = kwargs.get("expected_code") + log.info(f"Executing http query: {query}") + connstr = f"postgresql://{user}:{password}@{self.domain}:{self.proxy_port}/postgres" response = requests.post( f"https://{self.domain}:{self.external_http_port}/sql", @@ -2959,6 +2963,8 @@ class NeonProxy(PgProtocol): password = kwargs["password"] expected_code = kwargs.get("expected_code") + log.info(f"Executing http2 query: {query}") + connstr = f"postgresql://{user}:{password}@{self.domain}:{self.proxy_port}/postgres" async with httpx.AsyncClient( http2=True, verify=str(self.test_output_dir / "proxy.crt") diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index 9905f120e1..078589d8eb 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -564,3 +564,35 @@ async def test_sql_over_http2(static_proxy: NeonProxy): "select 42 as answer", [], user="http", password="http", expected_code=200 ) assert resp["rows"] == [{"answer": 42}] + + +def test_sql_over_http_timeout_cancel(static_proxy: NeonProxy): + static_proxy.safe_psql("create role http 
with login password 'http' superuser") + + static_proxy.safe_psql("create table test_table ( id int primary key )") + + # insert into a table, with a unique constraint, after sleeping for n seconds + query = "WITH temp AS ( \ + SELECT pg_sleep($1) as sleep, $2::int as id \ + ) INSERT INTO test_table (id) SELECT id FROM temp" + + # expect to fail with timeout + res = static_proxy.http_query( + query, + [static_proxy.http_timeout_seconds + 1, 1], + user="http", + password="http", + expected_code=400, + ) + assert "Query cancelled, runtime exceeded" in res["message"], "HTTP query should time out" + + time.sleep(2) + + res = static_proxy.http_query(query, [1, 1], user="http", password="http", expected_code=200) + assert res["command"] == "INSERT", "HTTP query should insert" + assert res["rowCount"] == 1, "HTTP query should insert" + + res = static_proxy.http_query(query, [0, 1], user="http", password="http", expected_code=400) + assert ( + "duplicate key value violates unique constraint" in res["message"] + ), "HTTP query should conflict" From 580e136b2e67321970b95e0fb51d46d4a2bec550 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Tue, 12 Mar 2024 13:14:02 +0100 Subject: [PATCH 0386/1571] Forward all backpressure feedback to compute (#7079) Previously we aggregated ps_feedback on each safekeeper and sent it to walproposer with every AppendResponse. This PR changes it to send ps_feedback to walproposer right after receiving it from pageserver, without aggregating it in memory. Also contains some preparations for implementing backpressure support for sharding. --- libs/utils/src/pageserver_feedback.rs | 6 ++ safekeeper/src/metrics.rs | 30 ++++++- safekeeper/src/receive_wal.rs | 98 ++++++++++++++++++----- safekeeper/src/safekeeper.rs | 14 ++-- safekeeper/src/send_wal.rs | 109 ++++++++++---------------- safekeeper/src/timeline.rs | 22 +++--- 6 files changed, 172 insertions(+), 107 deletions(-) diff --git a/libs/utils/src/pageserver_feedback.rs b/libs/utils/src/pageserver_feedback.rs index c9fbdde928..bc8fa7362e 100644 --- a/libs/utils/src/pageserver_feedback.rs +++ b/libs/utils/src/pageserver_feedback.rs @@ -123,6 +123,12 @@ impl PageserverFeedback { rf.replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64); } } + b"shard_number" => { + let len = buf.get_i32(); + // TODO: this will be implemented in the next update, + // for now, we just skip the value. 
+ buf.advance(len as usize); + } _ => { let len = buf.get_i32(); warn!( diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index f12e079632..e541527b6a 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -140,6 +140,13 @@ pub static BROKER_ITERATION_TIMELINES: Lazy = Lazy::new(|| { ) .expect("Failed to register safekeeper_broker_iteration_timelines histogram vec") }); +pub static RECEIVED_PS_FEEDBACKS: Lazy = Lazy::new(|| { + register_int_counter!( + "safekeeper_received_ps_feedbacks_total", + "Number of pageserver feedbacks received" + ) + .expect("Failed to register safekeeper_received_ps_feedbacks_total counter") +}); pub const LABEL_UNKNOWN: &str = "unknown"; @@ -301,7 +308,8 @@ pub async fn time_io_closure>( #[derive(Clone)] pub struct FullTimelineInfo { pub ttid: TenantTimelineId, - pub ps_feedback: PageserverFeedback, + pub ps_feedback_count: u64, + pub last_ps_feedback: PageserverFeedback, pub wal_backup_active: bool, pub timeline_is_active: bool, pub num_computes: u32, @@ -327,6 +335,7 @@ pub struct TimelineCollector { remote_consistent_lsn: GenericGaugeVec, ps_last_received_lsn: GenericGaugeVec, feedback_last_time_seconds: GenericGaugeVec, + ps_feedback_count: GenericGaugeVec, timeline_active: GenericGaugeVec, wal_backup_active: GenericGaugeVec, connected_computes: IntGaugeVec, @@ -430,6 +439,15 @@ impl TimelineCollector { .unwrap(); descs.extend(feedback_last_time_seconds.desc().into_iter().cloned()); + let ps_feedback_count = GenericGaugeVec::new( + Opts::new( + "safekeeper_ps_feedback_count_total", + "Number of feedbacks received from the pageserver", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + let timeline_active = GenericGaugeVec::new( Opts::new( "safekeeper_timeline_active", @@ -538,6 +556,7 @@ impl TimelineCollector { remote_consistent_lsn, ps_last_received_lsn, feedback_last_time_seconds, + ps_feedback_count, timeline_active, wal_backup_active, connected_computes, @@ -570,6 +589,7 @@ impl Collector for TimelineCollector { self.remote_consistent_lsn.reset(); self.ps_last_received_lsn.reset(); self.feedback_last_time_seconds.reset(); + self.ps_feedback_count.reset(); self.timeline_active.reset(); self.wal_backup_active.reset(); self.connected_computes.reset(); @@ -646,9 +666,12 @@ impl Collector for TimelineCollector { self.ps_last_received_lsn .with_label_values(labels) - .set(tli.ps_feedback.last_received_lsn.0); + .set(tli.last_ps_feedback.last_received_lsn.0); + self.ps_feedback_count + .with_label_values(labels) + .set(tli.ps_feedback_count); if let Ok(unix_time) = tli - .ps_feedback + .last_ps_feedback .replytime .duration_since(SystemTime::UNIX_EPOCH) { @@ -679,6 +702,7 @@ impl Collector for TimelineCollector { mfs.extend(self.remote_consistent_lsn.collect()); mfs.extend(self.ps_last_received_lsn.collect()); mfs.extend(self.feedback_last_time_seconds.collect()); + mfs.extend(self.ps_feedback_count.collect()); mfs.extend(self.timeline_active.collect()); mfs.extend(self.wal_backup_active.collect()); mfs.extend(self.connected_computes.collect()); diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 9ce9b049ba..015b53bb2e 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -36,11 +36,15 @@ use tokio::time::Instant; use tracing::*; use utils::id::TenantTimelineId; use utils::lsn::Lsn; +use utils::pageserver_feedback::PageserverFeedback; + +const DEFAULT_FEEDBACK_CAPACITY: usize = 8; /// Registry of WalReceivers (compute connections). 
Timeline holds it (wrapped /// in Arc). pub struct WalReceivers { mutex: Mutex, + pageserver_feedback_tx: tokio::sync::broadcast::Sender, } /// Id under which walreceiver is registered in shmem. @@ -48,8 +52,12 @@ type WalReceiverId = usize; impl WalReceivers { pub fn new() -> Arc { + let (pageserver_feedback_tx, _) = + tokio::sync::broadcast::channel(DEFAULT_FEEDBACK_CAPACITY); + Arc::new(WalReceivers { mutex: Mutex::new(WalReceiversShared { slots: Vec::new() }), + pageserver_feedback_tx, }) } @@ -116,6 +124,12 @@ impl WalReceivers { let mut shared = self.mutex.lock(); shared.slots[id] = None; } + + /// Broadcast pageserver feedback to connected walproposers. + pub fn broadcast_pageserver_feedback(&self, feedback: PageserverFeedback) { + // Err means there is no subscribers, it is fine. + let _ = self.pageserver_feedback_tx.send(feedback); + } } /// Only a few connections are expected (normally one), so store in Vec. @@ -197,17 +211,28 @@ impl SafekeeperPostgresHandler { // sends, so this avoids deadlocks. let mut pgb_reader = pgb.split().context("START_WAL_PUSH split")?; let peer_addr = *pgb.get_peer_addr(); - let network_reader = NetworkReader { + let mut network_reader = NetworkReader { ttid: self.ttid, conn_id: self.conn_id, pgb_reader: &mut pgb_reader, peer_addr, acceptor_handle: &mut acceptor_handle, }; - let res = tokio::select! { - // todo: add read|write .context to these errors - r = network_reader.run(msg_tx, msg_rx, reply_tx) => r, - r = network_write(pgb, reply_rx) => r, + + // Read first message and create timeline if needed. + let res = network_reader.read_first_message().await; + + let res = if let Ok((tli, next_msg)) = res { + let pageserver_feedback_rx: tokio::sync::broadcast::Receiver = + tli.get_walreceivers().pageserver_feedback_tx.subscribe(); + + tokio::select! { + // todo: add read|write .context to these errors + r = network_reader.run(msg_tx, msg_rx, reply_tx, tli.clone(), next_msg) => r, + r = network_write(pgb, reply_rx, pageserver_feedback_rx) => r, + } + } else { + res.map(|_| ()) }; // Join pg backend back. @@ -251,12 +276,9 @@ struct NetworkReader<'a, IO> { } impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> { - async fn run( - self, - msg_tx: Sender, - msg_rx: Receiver, - reply_tx: Sender, - ) -> Result<(), CopyStreamHandlerEnd> { + async fn read_first_message( + &mut self, + ) -> Result<(Arc, ProposerAcceptorMessage), CopyStreamHandlerEnd> { // Receive information about server to create timeline, if not yet. 
let next_msg = read_message(self.pgb_reader).await?; let tli = match next_msg { @@ -278,9 +300,19 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> { ))) } }; + Ok((tli, next_msg)) + } + async fn run( + self, + msg_tx: Sender, + msg_rx: Receiver, + reply_tx: Sender, + tli: Arc, + next_msg: ProposerAcceptorMessage, + ) -> Result<(), CopyStreamHandlerEnd> { *self.acceptor_handle = Some(WalAcceptor::spawn( - tli.clone(), + tli, msg_rx, reply_tx, Some(self.conn_id), @@ -320,18 +352,46 @@ async fn read_network_loop( async fn network_write( pgb_writer: &mut PostgresBackend, mut reply_rx: Receiver, + mut pageserver_feedback_rx: tokio::sync::broadcast::Receiver, ) -> Result<(), CopyStreamHandlerEnd> { let mut buf = BytesMut::with_capacity(128); + // storing append_response to inject PageserverFeedback into it + let mut last_append_response = None; + loop { - match reply_rx.recv().await { - Some(msg) => { - buf.clear(); - msg.serialize(&mut buf)?; - pgb_writer.write_message(&BeMessage::CopyData(&buf)).await?; + // trying to read either AcceptorProposerMessage or PageserverFeedback + let msg = tokio::select! { + reply = reply_rx.recv() => { + if let Some(msg) = reply { + if let AcceptorProposerMessage::AppendResponse(append_response) = &msg { + last_append_response = Some(append_response.clone()); + } + Some(msg) + } else { + return Ok(()); // chan closed, WalAcceptor terminated + } } - None => return Ok(()), // chan closed, WalAcceptor terminated - } + + feedback = pageserver_feedback_rx.recv() => + match (feedback, &last_append_response) { + (Ok(feedback), Some(append_response)) => { + // clone AppendResponse and inject PageserverFeedback into it + let mut append_response = append_response.clone(); + append_response.pageserver_feedback = Some(feedback); + Some(AcceptorProposerMessage::AppendResponse(append_response)) + } + _ => None, + } + }; + + let Some(msg) = msg else { + continue; + }; + + buf.clear(); + msg.serialize(&mut buf)?; + pgb_writer.write_message(&BeMessage::CopyData(&buf)).await?; } } diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 84393d8dab..d7c8fa6955 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -321,7 +321,7 @@ pub struct AppendRequestHeader { } /// Report safekeeper state to proposer -#[derive(Debug, Serialize)] +#[derive(Debug, Serialize, Clone)] pub struct AppendResponse { // Current term of the safekeeper; if it is higher than proposer's, the // compute is out of date. @@ -334,7 +334,7 @@ pub struct AppendResponse { // a criterion for walproposer --sync mode exit pub commit_lsn: Lsn, pub hs_feedback: HotStandbyFeedback, - pub pageserver_feedback: PageserverFeedback, + pub pageserver_feedback: Option, } impl AppendResponse { @@ -344,7 +344,7 @@ impl AppendResponse { flush_lsn: Lsn(0), commit_lsn: Lsn(0), hs_feedback: HotStandbyFeedback::empty(), - pageserver_feedback: PageserverFeedback::empty(), + pageserver_feedback: None, } } } @@ -462,7 +462,11 @@ impl AcceptorProposerMessage { buf.put_u64_le(msg.hs_feedback.xmin); buf.put_u64_le(msg.hs_feedback.catalog_xmin); - msg.pageserver_feedback.serialize(buf); + // AsyncReadMessage in walproposer.c will not try to decode pageserver_feedback + // if it is not present. 
+ if let Some(ref msg) = msg.pageserver_feedback { + msg.serialize(buf); + } } } @@ -681,7 +685,7 @@ where commit_lsn: self.state.commit_lsn, // will be filled by the upper code to avoid bothering safekeeper hs_feedback: HotStandbyFeedback::empty(), - pageserver_feedback: PageserverFeedback::empty(), + pageserver_feedback: None, }; trace!("formed AppendResponse {:?}", ar); ar diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 4b887f36b7..7da5fd00b0 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -2,6 +2,8 @@ //! with the "START_REPLICATION" message, and registry of walsenders. use crate::handler::SafekeeperPostgresHandler; +use crate::metrics::RECEIVED_PS_FEEDBACKS; +use crate::receive_wal::WalReceivers; use crate::safekeeper::{Term, TermLsn}; use crate::timeline::Timeline; use crate::wal_service::ConnectionId; @@ -21,7 +23,7 @@ use utils::failpoint_support; use utils::id::TenantTimelineId; use utils::pageserver_feedback::PageserverFeedback; -use std::cmp::{max, min}; +use std::cmp::min; use std::net::SocketAddr; use std::str; use std::sync::Arc; @@ -90,12 +92,14 @@ pub struct StandbyFeedback { /// WalSenders registry. Timeline holds it (wrapped in Arc). pub struct WalSenders { mutex: Mutex, + walreceivers: Arc, } impl WalSenders { - pub fn new() -> Arc { + pub fn new(walreceivers: Arc) -> Arc { Arc::new(WalSenders { mutex: Mutex::new(WalSendersShared::new()), + walreceivers, }) } @@ -151,22 +155,29 @@ impl WalSenders { .min() } - /// Get aggregated pageserver feedback. - pub fn get_ps_feedback(self: &Arc) -> PageserverFeedback { - self.mutex.lock().agg_ps_feedback + /// Returns total counter of pageserver feedbacks received and last feedback. + pub fn get_ps_feedback_stats(self: &Arc) -> (u64, PageserverFeedback) { + let shared = self.mutex.lock(); + (shared.ps_feedback_counter, shared.last_ps_feedback) } - /// Get aggregated pageserver and hot standby feedback (we send them to compute). - pub fn get_feedbacks(self: &Arc) -> (PageserverFeedback, HotStandbyFeedback) { - let shared = self.mutex.lock(); - (shared.agg_ps_feedback, shared.agg_hs_feedback) + /// Get aggregated hot standby feedback (we send it to compute). + pub fn get_hotstandby(self: &Arc) -> HotStandbyFeedback { + self.mutex.lock().agg_hs_feedback } /// Record new pageserver feedback, update aggregated values. fn record_ps_feedback(self: &Arc, id: WalSenderId, feedback: &PageserverFeedback) { let mut shared = self.mutex.lock(); shared.get_slot_mut(id).feedback = ReplicationFeedback::Pageserver(*feedback); - shared.update_ps_feedback(); + shared.last_ps_feedback = *feedback; + shared.ps_feedback_counter += 1; + drop(shared); + + RECEIVED_PS_FEEDBACKS.inc(); + + // send feedback to connected walproposers + self.walreceivers.broadcast_pageserver_feedback(*feedback); } /// Record standby reply. 
@@ -222,8 +233,10 @@ impl WalSenders { struct WalSendersShared { // aggregated over all walsenders value agg_hs_feedback: HotStandbyFeedback, - // aggregated over all walsenders value - agg_ps_feedback: PageserverFeedback, + // last feedback ever received from any pageserver, empty if none + last_ps_feedback: PageserverFeedback, + // total counter of pageserver feedbacks received + ps_feedback_counter: u64, slots: Vec>, } @@ -231,7 +244,8 @@ impl WalSendersShared { fn new() -> Self { WalSendersShared { agg_hs_feedback: HotStandbyFeedback::empty(), - agg_ps_feedback: PageserverFeedback::empty(), + last_ps_feedback: PageserverFeedback::empty(), + ps_feedback_counter: 0, slots: Vec::new(), } } @@ -276,37 +290,6 @@ impl WalSendersShared { } self.agg_hs_feedback = agg; } - - /// Update aggregated pageserver feedback. LSNs (last_received, - /// disk_consistent, remote_consistent) and reply timestamp are just - /// maximized; timeline_size if taken from feedback with highest - /// last_received lsn. This is generally reasonable, but we might want to - /// implement other policies once multiple pageservers start to be actively - /// used. - fn update_ps_feedback(&mut self) { - let init = PageserverFeedback::empty(); - let acc = - self.slots - .iter() - .flatten() - .fold(init, |mut acc, ws_state| match ws_state.feedback { - ReplicationFeedback::Pageserver(feedback) => { - if feedback.last_received_lsn > acc.last_received_lsn { - acc.current_timeline_size = feedback.current_timeline_size; - } - acc.last_received_lsn = - max(feedback.last_received_lsn, acc.last_received_lsn); - acc.disk_consistent_lsn = - max(feedback.disk_consistent_lsn, acc.disk_consistent_lsn); - acc.remote_consistent_lsn = - max(feedback.remote_consistent_lsn, acc.remote_consistent_lsn); - acc.replytime = max(feedback.replytime, acc.replytime); - acc - } - ReplicationFeedback::Standby(_) => acc, - }); - self.agg_ps_feedback = acc; - } } // Serialized is used only for pretty printing in json. @@ -443,7 +426,7 @@ impl SafekeeperPostgresHandler { }; let mut reply_reader = ReplyReader { reader, - ws_guard, + ws_guard: ws_guard.clone(), tli, }; @@ -452,6 +435,18 @@ impl SafekeeperPostgresHandler { r = sender.run() => r, r = reply_reader.run() => r, }; + + let ws_state = ws_guard + .walsenders + .mutex + .lock() + .get_slot(ws_guard.id) + .clone(); + info!( + "finished streaming to {}, feedback={:?}", + ws_state.addr, ws_state.feedback, + ); + // Join pg backend back. pgb.unsplit(reply_reader.reader)?; @@ -733,7 +728,6 @@ async fn wait_for_lsn( #[cfg(test)] mod tests { - use postgres_protocol::PG_EPOCH; use utils::id::{TenantId, TimelineId}; use super::*; @@ -792,27 +786,4 @@ mod tests { wss.update_hs_feedback(); assert_eq!(wss.agg_hs_feedback.xmin, 42); } - - // form pageserver feedback with given last_record_lsn / tli size and the - // rest set to dummy values. 
- fn ps_feedback(current_timeline_size: u64, last_received_lsn: Lsn) -> ReplicationFeedback { - ReplicationFeedback::Pageserver(PageserverFeedback { - current_timeline_size, - last_received_lsn, - disk_consistent_lsn: Lsn::INVALID, - remote_consistent_lsn: Lsn::INVALID, - replytime: *PG_EPOCH, - }) - } - - // test that ps aggregation works as expected - #[test] - fn test_ps_feedback() { - let mut wss = WalSendersShared::new(); - push_feedback(&mut wss, ps_feedback(8, Lsn(42))); - push_feedback(&mut wss, ps_feedback(4, Lsn(84))); - wss.update_ps_feedback(); - assert_eq!(wss.agg_ps_feedback.current_timeline_size, 4); - assert_eq!(wss.agg_ps_feedback.last_received_lsn, Lsn(84)); - } } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 9b7ab14218..4901b86acf 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -402,6 +402,7 @@ impl Timeline { ))); let (cancellation_tx, cancellation_rx) = watch::channel(false); + let walreceivers = WalReceivers::new(); Ok(Timeline { ttid, wal_backup_launcher_tx, @@ -410,8 +411,8 @@ impl Timeline { term_flush_lsn_watch_tx, term_flush_lsn_watch_rx, mutex: Mutex::new(shared_state), - walsenders: WalSenders::new(), - walreceivers: WalReceivers::new(), + walsenders: WalSenders::new(walreceivers.clone()), + walreceivers, cancellation_rx, cancellation_tx, timeline_dir: conf.timeline_dir(&ttid), @@ -435,6 +436,7 @@ impl Timeline { let state = TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn); + let walreceivers = WalReceivers::new(); Ok(Timeline { ttid, wal_backup_launcher_tx, @@ -443,8 +445,8 @@ impl Timeline { term_flush_lsn_watch_tx, term_flush_lsn_watch_rx, mutex: Mutex::new(SharedState::create_new(conf, &ttid, state)?), - walsenders: WalSenders::new(), - walreceivers: WalReceivers::new(), + walsenders: WalSenders::new(walreceivers.clone()), + walreceivers, cancellation_rx, cancellation_tx, timeline_dir: conf.timeline_dir(&ttid), @@ -656,12 +658,9 @@ impl Timeline { let mut shared_state = self.write_shared_state().await; rmsg = shared_state.sk.process_msg(msg).await?; - // if this is AppendResponse, fill in proper pageserver and hot - // standby feedback. + // if this is AppendResponse, fill in proper hot standby feedback. if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg { - let (ps_feedback, hs_feedback) = self.walsenders.get_feedbacks(); - resp.hs_feedback = hs_feedback; - resp.pageserver_feedback = ps_feedback; + resp.hs_feedback = self.walsenders.get_hotstandby(); } commit_lsn = shared_state.sk.state.inmem.commit_lsn; @@ -898,12 +897,13 @@ impl Timeline { return None; } - let ps_feedback = self.walsenders.get_ps_feedback(); + let (ps_feedback_count, last_ps_feedback) = self.walsenders.get_ps_feedback_stats(); let state = self.write_shared_state().await; if state.active { Some(FullTimelineInfo { ttid: self.ttid, - ps_feedback, + ps_feedback_count, + last_ps_feedback, wal_backup_active: state.wal_backup_active, timeline_is_active: state.active, num_computes: self.walreceivers.get_num() as u32, From 1f7d54f9872482b4b181f93dee2e6d91173d0ef8 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 12 Mar 2024 13:05:40 +0000 Subject: [PATCH 0387/1571] proxy refactor tls listener (#7056) ## Problem Now that we have tls-listener vendored, we can refactor and remove a lot of bloated code and make the whole flow a bit simpler ## Summary of changes 1. Remove dead code 2. Move the error handling to inside the `TlsListener` accept() function 3. 
Extract the peer_addr from the PROXY protocol header and log it with errors --- proxy/src/protocol2.rs | 8 +- proxy/src/serverless.rs | 30 +-- proxy/src/serverless/tls_listener.rs | 321 +++++++-------------------- 3 files changed, 97 insertions(+), 262 deletions(-) diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index 3a7aabca32..f476cb9b37 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -17,7 +17,7 @@ use pin_project_lite::pin_project; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf}; use uuid::Uuid; -use crate::{metrics::NUM_CLIENT_CONNECTION_GAUGE, serverless::tls_listener::AsyncAccept}; +use crate::metrics::NUM_CLIENT_CONNECTION_GAUGE; pub struct ProxyProtocolAccept { pub incoming: AddrIncoming, @@ -331,15 +331,15 @@ impl AsyncRead for WithClientIp { } } -impl AsyncAccept for ProxyProtocolAccept { - type Connection = WithConnectionGuard>; +impl Accept for ProxyProtocolAccept { + type Conn = WithConnectionGuard>; type Error = io::Error; fn poll_accept( mut self: Pin<&mut Self>, cx: &mut Context<'_>, - ) -> Poll>> { + ) -> Poll>> { let conn = ready!(Pin::new(&mut self.incoming).poll_accept(cx)?); tracing::info!(protocol = self.protocol, "accepted new TCP connection"); let Some(conn) = conn else { diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index c81ae03b23..68f68eaba1 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -21,24 +21,19 @@ pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; use tokio_util::task::TaskTracker; use crate::context::RequestMonitoring; -use crate::metrics::TLS_HANDSHAKE_FAILURES; use crate::protocol2::{ProxyProtocolAccept, WithClientIp, WithConnectionGuard}; use crate::rate_limiter::EndpointRateLimiter; use crate::serverless::backend::PoolingBackend; use crate::{cancellation::CancellationHandler, config::ProxyConfig}; -use futures::StreamExt; use hyper::{ - server::{ - accept, - conn::{AddrIncoming, AddrStream}, - }, + server::conn::{AddrIncoming, AddrStream}, Body, Method, Request, Response, }; use std::convert::Infallible; use std::net::IpAddr; +use std::sync::Arc; use std::task::Poll; -use std::{future::ready, sync::Arc}; use tls_listener::TlsListener; use tokio::net::TcpListener; use tokio_util::sync::CancellationToken; @@ -105,19 +100,12 @@ pub async fn task_main( let ws_connections = tokio_util::task::task_tracker::TaskTracker::new(); ws_connections.close(); // allows `ws_connections.wait to complete` - let tls_listener = TlsListener::new(tls_acceptor, addr_incoming).filter(|conn| { - if let Err(err) = conn { - error!( - protocol = "http", - "failed to accept TLS connection: {err:?}" - ); - TLS_HANDSHAKE_FAILURES.inc(); - ready(false) - } else { - info!(protocol = "http", "accepted new TLS connection"); - ready(true) - } - }); + let tls_listener = TlsListener::new( + tls_acceptor, + addr_incoming, + "http", + config.handshake_timeout, + ); let make_svc = hyper::service::make_service_fn( |stream: &tokio_rustls::server::TlsStream< @@ -174,7 +162,7 @@ pub async fn task_main( }, ); - hyper::Server::builder(accept::from_stream(tls_listener)) + hyper::Server::builder(tls_listener) .serve(make_svc) .with_graceful_shutdown(cancellation_token.cancelled()) .await?; diff --git a/proxy/src/serverless/tls_listener.rs b/proxy/src/serverless/tls_listener.rs index 6196ff393c..cce02e3850 100644 --- a/proxy/src/serverless/tls_listener.rs +++ b/proxy/src/serverless/tls_listener.rs @@ -1,186 +1,110 @@ use std::{ + convert::Infallible, pin::Pin, task::{Context, Poll}, 
time::Duration, }; -use futures::{Future, Stream, StreamExt}; +use hyper::server::{accept::Accept, conn::AddrStream}; use pin_project_lite::pin_project; -use thiserror::Error; use tokio::{ io::{AsyncRead, AsyncWrite}, task::JoinSet, time::timeout, }; +use tokio_rustls::{server::TlsStream, TlsAcceptor}; +use tracing::{info, warn}; -/// Default timeout for the TLS handshake. -pub const DEFAULT_HANDSHAKE_TIMEOUT: Duration = Duration::from_secs(10); +use crate::{ + metrics::TLS_HANDSHAKE_FAILURES, + protocol2::{WithClientIp, WithConnectionGuard}, +}; -/// Trait for TLS implementation. -/// -/// Implementations are provided by the rustls and native-tls features. -pub trait AsyncTls: Clone { - /// The type of the TLS stream created from the underlying stream. - type Stream: Send + 'static; - /// Error type for completing the TLS handshake - type Error: std::error::Error + Send + 'static; - /// Type of the Future for the TLS stream that is accepted. - type AcceptFuture: Future> + Send + 'static; - - /// Accept a TLS connection on an underlying stream - fn accept(&self, stream: C) -> Self::AcceptFuture; +pin_project! { + /// Wraps a `Stream` of connections (such as a TCP listener) so that each connection is itself + /// encrypted using TLS. + pub(crate) struct TlsListener { + #[pin] + listener: A, + tls: TlsAcceptor, + waiting: JoinSet>>, + timeout: Duration, + protocol: &'static str, + } } -/// Asynchronously accept connections. -pub trait AsyncAccept { - /// The type of the connection that is accepted. - type Connection: AsyncRead + AsyncWrite; - /// The type of error that may be returned. - type Error; - - /// Poll to accept the next connection. - fn poll_accept( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll>>; - - /// Return a new `AsyncAccept` that stops accepting connections after - /// `ender` completes. - /// - /// Useful for graceful shutdown. - /// - /// See [examples/echo.rs](https://github.com/tmccombs/tls-listener/blob/main/examples/echo.rs) - /// for example of how to use. - fn until(self, ender: F) -> Until - where - Self: Sized, - { - Until { - acceptor: self, - ender, +impl TlsListener { + /// Create a `TlsListener` with default options. + pub(crate) fn new( + tls: TlsAcceptor, + listener: A, + protocol: &'static str, + timeout: Duration, + ) -> Self { + TlsListener { + listener, + tls, + waiting: JoinSet::new(), + timeout, + protocol, } } } -pin_project! { - /// - /// Wraps a `Stream` of connections (such as a TCP listener) so that each connection is itself - /// encrypted using TLS. - /// - /// It is similar to: - /// - /// ```ignore - /// tcpListener.and_then(|s| tlsAcceptor.accept(s)) - /// ``` - /// - /// except that it has the ability to accept multiple transport-level connections - /// simultaneously while the TLS handshake is pending for other connections. - /// - /// By default, if a client fails the TLS handshake, that is treated as an error, and the - /// `TlsListener` will return an `Err`. If the `TlsListener` is passed directly to a hyper - /// [`Server`][1], then an invalid handshake can cause the server to stop accepting connections. - /// See [`http-stream.rs`][2] or [`http-low-level`][3] examples, for examples of how to avoid this. - /// - /// Note that if the maximum number of pending connections is greater than 1, the resulting - /// [`T::Stream`][4] connections may come in a different order than the connections produced by the - /// underlying listener. 
- /// - /// [1]: https://docs.rs/hyper/latest/hyper/server/struct.Server.html - /// [2]: https://github.com/tmccombs/tls-listener/blob/main/examples/http-stream.rs - /// [3]: https://github.com/tmccombs/tls-listener/blob/main/examples/http-low-level.rs - /// [4]: AsyncTls::Stream - /// - #[allow(clippy::type_complexity)] - pub struct TlsListener> { - #[pin] - listener: A, - tls: T, - waiting: JoinSet, tokio::time::error::Elapsed>>, - timeout: Duration, - } -} - -/// Builder for `TlsListener`. -#[derive(Clone)] -pub struct Builder { - tls: T, - handshake_timeout: Duration, -} - -/// Wraps errors from either the listener or the TLS Acceptor -#[derive(Debug, Error)] -pub enum Error { - /// An error that arose from the listener ([AsyncAccept::Error]) - #[error("{0}")] - ListenerError(#[source] LE), - /// An error that occurred during the TLS accept handshake - #[error("{0}")] - TlsAcceptError(#[source] TE), -} - -impl TlsListener +impl Accept for TlsListener where - T: AsyncTls, -{ - /// Create a `TlsListener` with default options. - pub fn new(tls: T, listener: A) -> Self { - builder(tls).listen(listener) - } -} - -impl TlsListener -where - A: AsyncAccept, + A: Accept>>, A::Error: std::error::Error, - T: AsyncTls, + A::Conn: AsyncRead + AsyncWrite + Unpin + Send + 'static, { - /// Accept the next connection - /// - /// This is essentially an alias to `self.next()` with a more domain-appropriate name. - pub async fn accept(&mut self) -> Option<::Item> - where - Self: Unpin, - { - self.next().await - } + type Conn = TlsStream; - /// Replaces the Tls Acceptor configuration, which will be used for new connections. - /// - /// This can be used to change the certificate used at runtime. - pub fn replace_acceptor(&mut self, acceptor: T) { - self.tls = acceptor; - } + type Error = Infallible; - /// Replaces the Tls Acceptor configuration from a pinned reference to `Self`. - /// - /// This is useful if your listener is `!Unpin`. - /// - /// This can be used to change the certificate used at runtime. 
- pub fn replace_acceptor_pin(self: Pin<&mut Self>, acceptor: T) { - *self.project().tls = acceptor; - } -} - -impl Stream for TlsListener -where - A: AsyncAccept, - A::Error: std::error::Error, - T: AsyncTls, -{ - type Item = Result>; - - fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + fn poll_accept( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll>> { let mut this = self.project(); loop { match this.listener.as_mut().poll_accept(cx) { Poll::Pending => break, - Poll::Ready(Some(Ok(conn))) => { - this.waiting - .spawn(timeout(*this.timeout, this.tls.accept(conn))); + Poll::Ready(Some(Ok(mut conn))) => { + let t = *this.timeout; + let tls = this.tls.clone(); + let protocol = *this.protocol; + this.waiting.spawn(async move { + let peer_addr = match conn.inner.wait_for_addr().await { + Ok(Some(addr)) => addr, + Err(e) => { + tracing::error!("failed to accept TCP connection: invalid PROXY protocol V2 header: {e:#}"); + return None; + } + Ok(None) => conn.inner.inner.remote_addr() + }; + + let accept = tls.accept(conn); + match timeout(t, accept).await { + Ok(Ok(conn)) => Some(conn), + // The handshake failed, try getting another connection from the queue + Ok(Err(e)) => { + TLS_HANDSHAKE_FAILURES.inc(); + warn!(%peer_addr, protocol, "failed to accept TLS connection: {e:?}"); + None + } + // The handshake timed out, try getting another connection from the queue + Err(_) => { + TLS_HANDSHAKE_FAILURES.inc(); + warn!(%peer_addr, protocol, "failed to accept TLS connection: timeout"); + None + } + } + }); } Poll::Ready(Some(Err(e))) => { - return Poll::Ready(Some(Err(Error::ListenerError(e)))); + tracing::error!("error accepting TCP connection: {e}"); + continue; } Poll::Ready(None) => return Poll::Ready(None), } @@ -188,96 +112,19 @@ where loop { return match this.waiting.poll_join_next(cx) { - Poll::Ready(Some(Ok(Ok(conn)))) => { - Poll::Ready(Some(conn.map_err(Error::TlsAcceptError))) + Poll::Ready(Some(Ok(Some(conn)))) => { + info!(protocol = this.protocol, "accepted new TLS connection"); + Poll::Ready(Some(Ok(conn))) } - // The handshake timed out, try getting another connection from the queue - Poll::Ready(Some(Ok(Err(_)))) => continue, - // The handshake panicked - Poll::Ready(Some(Err(e))) if e.is_panic() => { - std::panic::resume_unwind(e.into_panic()) + // The handshake failed to complete, try getting another connection from the queue + Poll::Ready(Some(Ok(None))) => continue, + // The handshake panicked or was cancelled. ignore and get another connection + Poll::Ready(Some(Err(e))) => { + tracing::warn!("handshake aborted: {e}"); + continue; } - // The handshake was externally aborted - Poll::Ready(Some(Err(_))) => unreachable!("handshake tasks are never aborted"), _ => Poll::Pending, }; } } } - -impl AsyncTls for tokio_rustls::TlsAcceptor { - type Stream = tokio_rustls::server::TlsStream; - type Error = std::io::Error; - type AcceptFuture = tokio_rustls::Accept; - - fn accept(&self, conn: C) -> Self::AcceptFuture { - tokio_rustls::TlsAcceptor::accept(self, conn) - } -} - -impl Builder { - /// Set the timeout for handshakes. - /// - /// If a timeout takes longer than `timeout`, then the handshake will be - /// aborted and the underlying connection will be dropped. - /// - /// Defaults to `DEFAULT_HANDSHAKE_TIMEOUT`. - pub fn handshake_timeout(&mut self, timeout: Duration) -> &mut Self { - self.handshake_timeout = timeout; - self - } - - /// Create a `TlsListener` from the builder - /// - /// Actually build the `TlsListener`. 
The `listener` argument should be - /// an implementation of the `AsyncAccept` trait that accepts new connections - /// that the `TlsListener` will encrypt using TLS. - pub fn listen(&self, listener: A) -> TlsListener - where - T: AsyncTls, - { - TlsListener { - listener, - tls: self.tls.clone(), - waiting: JoinSet::new(), - timeout: self.handshake_timeout, - } - } -} - -/// Create a new Builder for a TlsListener -/// -/// `server_config` will be used to configure the TLS sessions. -pub fn builder(tls: T) -> Builder { - Builder { - tls, - handshake_timeout: DEFAULT_HANDSHAKE_TIMEOUT, - } -} - -pin_project! { - /// See [`AsyncAccept::until`] - pub struct Until { - #[pin] - acceptor: A, - #[pin] - ender: E, - } -} - -impl AsyncAccept for Until { - type Connection = A::Connection; - type Error = A::Error; - - fn poll_accept( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll>> { - let this = self.project(); - - match this.ender.poll(cx) { - Poll::Pending => this.acceptor.poll_accept(cx), - Poll::Ready(_) => Poll::Ready(None), - } - } -} From 7ae8364b0b0746b335f1d6e7c0d409fc1a236ffe Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 12 Mar 2024 14:47:12 +0000 Subject: [PATCH 0388/1571] storage controller: register nodes in re-attach request (#7040) ## Problem Currently we manually register nodes with the storage controller, and use a script during deploy to register with the cloud control plane. Rather than extend that script further, nodes should just register on startup. ## Summary of changes - Extend the re-attach request to include an optional NodeRegisterRequest - If the `register` field is set, handle it like a normal node registration before executing the normal re-attach work. - Update tests/neon_local that used to rely on doing an explicit register step that could be enabled/disabled. 
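
To make the new flow concrete, here is a minimal sketch (not the actual pageserver/controller code) of how a re-attach body with optional inline registration can be assembled from a `metadata.json` file. The struct names echo `ReAttachRequest`, `NodeRegisterRequest`, and `NodeMetadata` from the diff below, but the field sets are trimmed and plain integers stand in for the real id types; it assumes only `serde` (with the derive feature) and `serde_json` as dependencies.

```rust
use serde::{Deserialize, Serialize};

// Illustrative, trimmed-down stand-ins for the types this patch touches.
#[derive(Serialize)]
struct NodeRegisterRequest {
    node_id: u64,
    listen_pg_addr: String,
    listen_pg_port: u16,
    listen_http_addr: String,
    listen_http_port: u16,
}

#[derive(Serialize)]
struct ReAttachRequest {
    node_id: u64,
    // Omitted from the JSON body when no metadata file was found, so the
    // request stays identical to what older deployments already send.
    #[serde(skip_serializing_if = "Option::is_none")]
    register: Option<NodeRegisterRequest>,
}

#[derive(Deserialize)]
struct NodeMetadata {
    host: String,
    port: u16,
    http_host: String,
    http_port: u16,
}

fn build_re_attach_body(node_id: u64, metadata_json: Option<&str>) -> serde_json::Result<String> {
    // A missing metadata file is legal: some external script may still be
    // doing the registration, in which case we re-attach without it.
    let register = match metadata_json {
        Some(raw) => {
            let m: NodeMetadata = serde_json::from_str(raw)?;
            Some(NodeRegisterRequest {
                node_id,
                listen_pg_addr: m.host,
                listen_pg_port: m.port,
                listen_http_addr: m.http_host,
                listen_http_port: m.http_port,
            })
        }
        None => None,
    };
    serde_json::to_string(&ReAttachRequest { node_id, register })
}

fn main() -> serde_json::Result<()> {
    let metadata = r#"{"host":"localhost","port":5432,"http_host":"localhost","http_port":9898}"#;
    println!("{}", build_re_attach_body(1, Some(metadata))?);
    println!("{}", build_re_attach_body(1, None)?);
    Ok(())
}
```

Skipping the `register` field when it is `None` keeps the request wire-compatible with control planes that predate this change, which is why the real type in the diff also uses `skip_serializing_if`.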
--------- Co-authored-by: Christian Schwarz --- .../attachment_service/src/service.rs | 4 ++ control_plane/src/bin/neon_local.rs | 13 ++--- control_plane/src/endpoint.rs | 5 +- control_plane/src/pageserver.rs | 48 ++++++++-------- libs/pageserver_api/src/upcall_api.rs | 9 ++- pageserver/src/config.rs | 27 ++++++++- pageserver/src/control_plane_client.rs | 55 ++++++++++++++++++- pageserver/src/deletion_queue.rs | 5 +- pageserver/src/tenant/mgr.rs | 2 +- test_runner/fixtures/neon_fixtures.py | 20 ++++--- test_runner/regress/test_compatibility.py | 2 +- .../regress/test_pageserver_generations.py | 4 +- test_runner/regress/test_sharding_service.py | 3 +- 13 files changed, 145 insertions(+), 52 deletions(-) diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index 3f245b5255..a8498a39b5 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -922,6 +922,10 @@ impl Service { &self, reattach_req: ReAttachRequest, ) -> Result { + if let Some(register_req) = reattach_req.register { + self.node_register(register_req).await?; + } + // Take a re-attach as indication that the node is available: this is a precursor to proper // heartbeating in https://github.com/neondatabase/neon/issues/6844 self.node_configure(NodeConfigureRequest { diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 86b9c0085d..952229c4b7 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -1100,9 +1100,8 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result Result<()> { match sub_match.subcommand() { Some(("start", subcommand_args)) => { - let register = subcommand_args.get_one::("register").unwrap_or(&true); if let Err(e) = get_pageserver(env, subcommand_args)? 
- .start(&pageserver_config_overrides(subcommand_args), *register) + .start(&pageserver_config_overrides(subcommand_args)) .await { eprintln!("pageserver start failed: {e}"); @@ -1131,7 +1130,7 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> } if let Err(e) = pageserver - .start(&pageserver_config_overrides(subcommand_args), false) + .start(&pageserver_config_overrides(subcommand_args)) .await { eprintln!("pageserver start failed: {e}"); @@ -1293,7 +1292,7 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> for ps_conf in &env.pageservers { let pageserver = PageServerNode::from_env(env, ps_conf); if let Err(e) = pageserver - .start(&pageserver_config_overrides(sub_match), true) + .start(&pageserver_config_overrides(sub_match)) .await { eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e); @@ -1596,11 +1595,7 @@ fn cli() -> Command { .subcommand(Command::new("status")) .subcommand(Command::new("start") .about("Start local pageserver") - .arg(pageserver_config_args.clone()).arg(Arg::new("register") - .long("register") - .default_value("true").required(false) - .value_parser(value_parser!(bool)) - .value_name("register")) + .arg(pageserver_config_args.clone()) ) .subcommand(Command::new("stop") .about("Stop local pageserver") diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 646bc2e8bc..5206222961 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -774,7 +774,10 @@ impl Endpoint { spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize); } - let client = reqwest::Client::new(); + let client = reqwest::Client::builder() + .timeout(Duration::from_secs(30)) + .build() + .unwrap(); let response = client .post(format!( "http://{}:{}/configure", diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 021b9aca34..06ec942895 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -17,7 +17,6 @@ use std::time::Duration; use anyhow::{bail, Context}; use camino::Utf8PathBuf; use futures::SinkExt; -use pageserver_api::controller_api::NodeRegisterRequest; use pageserver_api::models::{ self, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, TimelineInfo, }; @@ -32,7 +31,6 @@ use utils::{ }; use crate::local_env::PageServerConf; -use crate::storage_controller::StorageController; use crate::{background_process, local_env::LocalEnv}; /// Directory within .neon which will be used by default for LocalFs remote storage. 
@@ -163,8 +161,8 @@ impl PageServerNode { .expect("non-Unicode path") } - pub async fn start(&self, config_overrides: &[&str], register: bool) -> anyhow::Result<()> { - self.start_node(config_overrides, false, register).await + pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> { + self.start_node(config_overrides, false).await } fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> { @@ -202,6 +200,28 @@ impl PageServerNode { String::from_utf8_lossy(&init_output.stderr), ); + // Write metadata file, used by pageserver on startup to register itself with + // the storage controller + let metadata_path = datadir.join("metadata.json"); + + let (_http_host, http_port) = + parse_host_port(&self.conf.listen_http_addr).expect("Unable to parse listen_http_addr"); + let http_port = http_port.unwrap_or(9898); + // Intentionally hand-craft JSON: this acts as an implicit format compat test + // in case the pageserver-side structure is edited, and reflects the real life + // situation: the metadata is written by some other script. + std::fs::write( + metadata_path, + serde_json::to_vec(&serde_json::json!({ + "host": "localhost", + "port": self.pg_connection_config.port(), + "http_host": "localhost", + "http_port": http_port, + })) + .unwrap(), + ) + .expect("Failed to write metadata file"); + Ok(()) } @@ -209,27 +229,7 @@ impl PageServerNode { &self, config_overrides: &[&str], update_config: bool, - register: bool, ) -> anyhow::Result<()> { - // Register the node with the storage controller before starting pageserver: pageserver must be registered to - // successfully call /re-attach and finish starting up. - if register { - let storage_controller = StorageController::from_env(&self.env); - let (pg_host, pg_port) = - parse_host_port(&self.conf.listen_pg_addr).expect("Unable to parse listen_pg_addr"); - let (http_host, http_port) = parse_host_port(&self.conf.listen_http_addr) - .expect("Unable to parse listen_http_addr"); - storage_controller - .node_register(NodeRegisterRequest { - node_id: self.conf.id, - listen_pg_addr: pg_host.to_string(), - listen_pg_port: pg_port.unwrap_or(5432), - listen_http_addr: http_host.to_string(), - listen_http_port: http_port.unwrap_or(80), - }) - .await?; - } - // TODO: using a thread here because start_process() is not async but we need to call check_status() let datadir = self.repo_path(); print!( diff --git a/libs/pageserver_api/src/upcall_api.rs b/libs/pageserver_api/src/upcall_api.rs index 0acc3a7bb0..5472948091 100644 --- a/libs/pageserver_api/src/upcall_api.rs +++ b/libs/pageserver_api/src/upcall_api.rs @@ -6,11 +6,18 @@ use serde::{Deserialize, Serialize}; use utils::id::NodeId; -use crate::shard::TenantShardId; +use crate::{controller_api::NodeRegisterRequest, shard::TenantShardId}; +/// Upcall message sent by the pageserver to the configured `control_plane_api` on +/// startup. #[derive(Serialize, Deserialize)] pub struct ReAttachRequest { pub node_id: NodeId, + + /// Optional inline self-registration: this is useful with the storage controller, + /// if the node already has a node_id set. 
+ #[serde(skip_serializing_if = "Option::is_none", default)] + pub register: Option, } #[derive(Serialize, Deserialize)] diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 4adcedafd1..845b20c8db 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -7,8 +7,9 @@ use anyhow::{anyhow, bail, ensure, Context, Result}; use pageserver_api::shard::TenantShardId; use remote_storage::{RemotePath, RemoteStorageConfig}; +use serde; use serde::de::IntoDeserializer; -use std::env; +use std::{collections::HashMap, env}; use storage_broker::Uri; use utils::crashsafe::path_with_suffix_extension; use utils::id::ConnectionId; @@ -304,6 +305,26 @@ impl BuilderValue { } } +// Certain metadata (e.g. externally-addressable name, AZ) is delivered +// as a separate structure. This information is not neeed by the pageserver +// itself, it is only used for registering the pageserver with the control +// plane and/or storage controller. +// +#[derive(serde::Deserialize)] +pub(crate) struct NodeMetadata { + #[serde(rename = "host")] + pub(crate) postgres_host: String, + #[serde(rename = "port")] + pub(crate) postgres_port: u16, + pub(crate) http_host: String, + pub(crate) http_port: u16, + + // Deployment tools may write fields to the metadata file beyond what we + // use in this type: this type intentionally only names fields that require. + #[serde(flatten)] + pub(crate) other: HashMap, +} + // needed to simplify config construction struct PageServerConfigBuilder { listen_pg_addr: BuilderValue, @@ -761,6 +782,10 @@ impl PageServerConf { self.workdir.join("deletion") } + pub fn metadata_path(&self) -> Utf8PathBuf { + self.workdir.join("metadata.json") + } + pub fn deletion_list_path(&self, sequence: u64) -> Utf8PathBuf { // Encode a version in the filename, so that if we ever switch away from JSON we can // increment this. 
diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs index 3fcf3a983b..1b3d76335d 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/control_plane_client.rs @@ -2,6 +2,7 @@ use std::collections::HashMap; use futures::Future; use pageserver_api::{ + controller_api::NodeRegisterRequest, shard::TenantShardId, upcall_api::{ ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse, @@ -12,7 +13,10 @@ use tokio_util::sync::CancellationToken; use url::Url; use utils::{backoff, generation::Generation, id::NodeId}; -use crate::config::PageServerConf; +use crate::{ + config::{NodeMetadata, PageServerConf}, + virtual_file::on_fatal_io_error, +}; /// The Pageserver's client for using the control plane API: this is a small subset /// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md) @@ -32,6 +36,7 @@ pub enum RetryForeverError { pub trait ControlPlaneGenerationsApi { fn re_attach( &self, + conf: &PageServerConf, ) -> impl Future, RetryForeverError>> + Send; fn validate( &self, @@ -110,13 +115,59 @@ impl ControlPlaneClient { impl ControlPlaneGenerationsApi for ControlPlaneClient { /// Block until we get a successful response, or error out if we are shut down - async fn re_attach(&self) -> Result, RetryForeverError> { + async fn re_attach( + &self, + conf: &PageServerConf, + ) -> Result, RetryForeverError> { let re_attach_path = self .base_url .join("re-attach") .expect("Failed to build re-attach path"); + + // Include registration content in the re-attach request if a metadata file is readable + let metadata_path = conf.metadata_path(); + let register = match tokio::fs::read_to_string(&metadata_path).await { + Ok(metadata_str) => match serde_json::from_str::(&metadata_str) { + Ok(m) => { + // Since we run one time at startup, be generous in our logging and + // dump all metadata. + tracing::info!( + "Loaded node metadata: postgres {}:{}, http {}:{}, other fields: {:?}", + m.postgres_host, + m.postgres_port, + m.http_host, + m.http_port, + m.other + ); + + Some(NodeRegisterRequest { + node_id: conf.id, + listen_pg_addr: m.postgres_host, + listen_pg_port: m.postgres_port, + listen_http_addr: m.http_host, + listen_http_port: m.http_port, + }) + } + Err(e) => { + tracing::error!("Unreadable metadata in {metadata_path}: {e}"); + None + } + }, + Err(e) => { + if e.kind() == std::io::ErrorKind::NotFound { + // This is legal: we may have been deployed with some external script + // doing registration for us. 
+ tracing::info!("Metadata file not found at {metadata_path}"); + } else { + on_fatal_io_error(&e, &format!("Loading metadata at {metadata_path}")) + } + None + } + }; + let request = ReAttachRequest { node_id: self.node_id, + register, }; fail::fail_point!("control-plane-client-re-attach"); diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 313eb2663d..b6aea8fae8 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -831,7 +831,10 @@ mod test { } impl ControlPlaneGenerationsApi for MockControlPlane { - async fn re_attach(&self) -> Result, RetryForeverError> { + async fn re_attach( + &self, + _conf: &PageServerConf, + ) -> Result, RetryForeverError> { unimplemented!() } async fn validate( diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index fc08b3c82e..38274448b3 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -295,7 +295,7 @@ async fn init_load_generations( } else if let Some(client) = ControlPlaneClient::new(conf, cancel) { info!("Calling control plane API to re-attach tenants"); // If we are configured to use the control plane API, then it is the source of truth for what tenants to load. - match client.re_attach().await { + match client.re_attach(conf).await { Ok(tenants) => tenants, Err(RetryForeverError::ShuttingDown) => { anyhow::bail!("Shut down while waiting for control plane re-attach response") diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b7196a2556..975c6d865b 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -519,9 +519,9 @@ class NeonEnvBuilder: self.env = NeonEnv(self) return self.env - def start(self): + def start(self, register_pageservers=False): assert self.env is not None, "environment is not already initialized, call init() first" - self.env.start() + self.env.start(register_pageservers=register_pageservers) def init_start( self, @@ -1112,7 +1112,7 @@ class NeonEnv: log.info(f"Config: {cfg}") self.neon_cli.init(cfg, force=config.config_init_force) - def start(self): + def start(self, register_pageservers=False): # storage controller starts first, so that pageserver /re-attach calls don't # bounce through retries on startup self.storage_controller.start() @@ -1124,6 +1124,11 @@ class NeonEnv: # reconcile. wait_until(30, 1, storage_controller_ready) + if register_pageservers: + # Special case for forward compat tests, this can be removed later. + for pageserver in self.pageservers: + self.storage_controller.node_register(pageserver) + # Start up broker, pageserver and all safekeepers futs = [] with concurrent.futures.ThreadPoolExecutor( @@ -1712,10 +1717,8 @@ class NeonCli(AbstractNeonCli): id: int, overrides: Tuple[str, ...] 
= (), extra_env_vars: Optional[Dict[str, str]] = None, - register: bool = True, ) -> "subprocess.CompletedProcess[str]": - register_str = "true" if register else "false" - start_args = ["pageserver", "start", f"--id={id}", *overrides, f"--register={register_str}"] + start_args = ["pageserver", "start", f"--id={id}", *overrides] storage = self.env.pageserver_remote_storage append_pageserver_param_overrides( params_to_update=start_args, @@ -2066,6 +2069,8 @@ class NeonStorageController(MetricsGetter): "node_id": int(node.id), "listen_http_addr": "localhost", "listen_http_port": node.service_port.http, + "listen_pg_addr": "localhost", + "listen_pg_port": node.service_port.pg, } log.info(f"node_register({body})") self.request( @@ -2233,7 +2238,6 @@ class NeonPageserver(PgProtocol): self, overrides: Tuple[str, ...] = (), extra_env_vars: Optional[Dict[str, str]] = None, - register: bool = True, ) -> "NeonPageserver": """ Start the page server. @@ -2243,7 +2247,7 @@ class NeonPageserver(PgProtocol): assert self.running is False self.env.neon_cli.pageserver_start( - self.id, overrides=overrides, extra_env_vars=extra_env_vars, register=register + self.id, overrides=overrides, extra_env_vars=extra_env_vars ) self.running = True return self diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 618ac63785..5f815d3e6c 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -242,7 +242,7 @@ def test_forward_compatibility( # everything else: our test code is written for latest CLI args. env.neon_local_binpath = neon_local_binpath - neon_env_builder.start() + neon_env_builder.start(register_pageservers=True) check_neon_works( env, diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index d1acb9817e..3ca13a904d 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -205,6 +205,9 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): sk.start() env.storage_controller.start() + # We will start a pageserver with no control_plane_api set, so it won't be able to self-register + env.storage_controller.node_register(env.pageserver) + env.pageserver.start(overrides=('--pageserver-config-override=control_plane_api=""',)) env.neon_cli.create_tenant( @@ -511,7 +514,6 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): env.pageserver.stop() # Non-immediate: implicitly checking that shutdown doesn't hang waiting for CP env.pageserver.start( overrides=("--pageserver-config-override=control_plane_emergency_mode=true",), - register=False, ) # The pageserver should provide service to clients diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py index 6b7cd9d829..7a0707b564 100644 --- a/test_runner/regress/test_sharding_service.py +++ b/test_runner/regress/test_sharding_service.py @@ -278,13 +278,12 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: env.pageservers[0].allowed_errors.append(".*Emergency mode!.*") env.pageservers[0].start( overrides=("--pageserver-config-override=control_plane_emergency_mode=true",), - register=False, ) origin_ps = env.pageservers[0] # This is the pageserver managed by the sharding service, where the tenant # will be attached after onboarding - env.pageservers[1].start(register=True) + env.pageservers[1].start() dest_ps = env.pageservers[1] 
virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) From bac06ea1accf54ae09c87cdd1f62e10565279b01 Mon Sep 17 00:00:00 2001 From: Jure Bajic Date: Tue, 12 Mar 2024 17:32:47 +0100 Subject: [PATCH 0389/1571] pageserver: fix read path max lsn bug (#7007) ## Summary of changes The problem it fixes is when `request_lsn` is `u64::MAX-1` the `cont_lsn` becomes `u64::MAX` which is the same as `prev_lsn` which stops the loop. Closes https://github.com/neondatabase/neon/issues/6812 --- pageserver/src/tenant.rs | 24 ++++++++++++++++++++---- pageserver/src/tenant/timeline.rs | 28 +++++++++++++++------------- 2 files changed, 35 insertions(+), 17 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 961995b2d6..f0996328c0 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -4625,10 +4625,7 @@ mod tests { drop(guard); // Pick a big LSN such that we query over all the changes. - // Technically, u64::MAX - 1 is the largest LSN supported by the read path, - // but there seems to be a bug on the non-vectored search path which surfaces - // in that case. - let reads_lsn = Lsn(u64::MAX - 1000); + let reads_lsn = Lsn(u64::MAX - 1); for read in reads { info!("Doing vectored read on {:?}", read); @@ -5145,4 +5142,23 @@ mod tests { Ok(()) } + + #[tokio::test] + async fn test_read_at_max_lsn() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_read_at_max_lsn")?; + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) + .await?; + + let lsn = Lsn(0x10); + bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?; + + let test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); + let read_lsn = Lsn(u64::MAX - 1); + + assert!(tline.get(test_key, read_lsn, &ctx).await.is_ok()); + + Ok(()) + } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index c017d30f45..a733a3b1a7 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2478,7 +2478,7 @@ impl Timeline { // 'prev_lsn' tracks the last LSN that we were at in our search. It's used // to check that each iteration make some progress, to break infinite // looping if something goes wrong. - let mut prev_lsn = Lsn(u64::MAX); + let mut prev_lsn = None; let mut result = ValueReconstructResult::Continue; let mut cont_lsn = Lsn(request_lsn.0 + 1); @@ -2498,18 +2498,20 @@ impl Timeline { MATERIALIZED_PAGE_CACHE_HIT.inc_by(1); return Ok(traversal_path); } - if prev_lsn <= cont_lsn { - // Didn't make any progress in last iteration. Error out to avoid - // getting stuck in the loop. - return Err(layer_traversal_error(format!( - "could not find layer with more data for key {} at LSN {}, request LSN {}, ancestor {}", - key, - Lsn(cont_lsn.0 - 1), - request_lsn, - timeline.ancestor_lsn - ), traversal_path)); + if let Some(prev) = prev_lsn { + if prev <= cont_lsn { + // Didn't make any progress in last iteration. Error out to avoid + // getting stuck in the loop. 
+ return Err(layer_traversal_error(format!( + "could not find layer with more data for key {} at LSN {}, request LSN {}, ancestor {}", + key, + Lsn(cont_lsn.0 - 1), + request_lsn, + timeline.ancestor_lsn + ), traversal_path)); + } } - prev_lsn = cont_lsn; + prev_lsn = Some(cont_lsn); } ValueReconstructResult::Missing => { return Err(layer_traversal_error( @@ -2539,7 +2541,7 @@ impl Timeline { timeline_owned = timeline.get_ready_ancestor_timeline(ctx).await?; timeline = &*timeline_owned; - prev_lsn = Lsn(u64::MAX); + prev_lsn = None; continue 'outer; } From 1b41db8bddfc1a89569346e1036df74f34454a4c Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 12 Mar 2024 20:41:08 +0000 Subject: [PATCH 0390/1571] pageserver: enable setting stripe size inline with split request. (#7093) ## Summary - Currently we can set stripe size at tenant creation, but it doesn't mean anything until we have multiple shards - When onboarding an existing tenant, it will always get a default shard stripe size, so we would like to be able to pick the actual stripe size at the point we split. ## Why do this inline with a split? The alternative to this change would be to have a separate endpoint on the storage controller for setting the stripe size on a tenant, and only permit writes to that endpoint when the tenant has only a single shard. That would work, but be a little bit more work for a client, and not appreciably simpler (instead of having a special argument to the split functions, we'd have a special separate endpoint, and a requirement that the controller must sync its config down to the pageserver before calling the split API). Either approach would work, but this one feels a bit more robust end-to-end: the split API is the _very last moment_ that the stripe size is mutable, so if we aim to set it before splitting, it makes sense to do it as part of the same operation. --- .../attachment_service/src/service.rs | 14 ++- control_plane/src/bin/neon_local.rs | 7 +- control_plane/src/storage_controller.rs | 8 +- libs/pageserver_api/src/models.rs | 7 ++ pageserver/src/http/routes.rs | 9 +- pageserver/src/tenant/mgr.rs | 28 +++++- test_runner/fixtures/neon_fixtures.py | 6 +- test_runner/fixtures/pageserver/http.py | 7 ++ test_runner/regress/test_sharding.py | 95 +++++++++++++++++++ 9 files changed, 168 insertions(+), 13 deletions(-) diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index a8498a39b5..ea301d0372 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -2222,7 +2222,18 @@ impl Service { // unwrap safety: we would have returned above if we didn't find at least one shard to split let old_shard_count = old_shard_count.unwrap(); - let shard_ident = shard_ident.unwrap(); + let shard_ident = if let Some(new_stripe_size) = split_req.new_stripe_size { + // This ShardIdentity will be used as the template for all children, so this implicitly + // applies the new stripe size to the children. 
+ let mut shard_ident = shard_ident.unwrap(); + if shard_ident.count.count() > 1 && shard_ident.stripe_size != new_stripe_size { + return Err(ApiError::BadRequest(anyhow::anyhow!("Attempted to change stripe size ({:?}->{new_stripe_size:?}) on a tenant with multiple shards", shard_ident.stripe_size))); + } + shard_ident.stripe_size = new_stripe_size; + shard_ident + } else { + shard_ident.unwrap() + }; let policy = policy.unwrap(); // FIXME: we have dropped self.inner lock, and not yet written anything to the database: another @@ -2314,6 +2325,7 @@ impl Service { *parent_id, TenantShardSplitRequest { new_shard_count: split_req.new_shard_count, + new_stripe_size: split_req.new_stripe_size, }, ) .await diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 952229c4b7..6c722f36b4 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -585,10 +585,14 @@ async fn handle_tenant( Some(("shard-split", matches)) => { let tenant_id = get_tenant_id(matches, env)?; let shard_count: u8 = matches.get_one::("shard-count").cloned().unwrap_or(0); + let shard_stripe_size: Option = matches + .get_one::>("shard-stripe-size") + .cloned() + .unwrap(); let storage_controller = StorageController::from_env(env); let result = storage_controller - .tenant_split(tenant_id, shard_count) + .tenant_split(tenant_id, shard_count, shard_stripe_size) .await?; println!( "Split tenant {} into shards {}", @@ -1585,6 +1589,7 @@ fn cli() -> Command { .about("Increase the number of shards in the tenant") .arg(tenant_id_arg.clone()) .arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)")) + .arg(Arg::new("shard-stripe-size").value_parser(value_parser!(u32)).long("shard-stripe-size").action(ArgAction::Set).help("Sharding stripe size in pages")) ) ) .subcommand( diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index c505e67770..d7673f1b26 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -10,7 +10,7 @@ use pageserver_api::{ TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo, }, - shard::TenantShardId, + shard::{ShardStripeSize, TenantShardId}, }; use pageserver_client::mgmt_api::ResponseErrorMessageExt; use postgres_backend::AuthType; @@ -496,11 +496,15 @@ impl StorageController { &self, tenant_id: TenantId, new_shard_count: u8, + new_stripe_size: Option, ) -> anyhow::Result { self.dispatch( Method::PUT, format!("control/v1/tenant/{tenant_id}/shard_split"), - Some(TenantShardSplitRequest { new_shard_count }), + Some(TenantShardSplitRequest { + new_shard_count, + new_stripe_size, + }), ) .await } diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index fe5bbd1c06..a96cc09158 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -198,6 +198,13 @@ pub struct TimelineCreateRequest { #[derive(Serialize, Deserialize)] pub struct TenantShardSplitRequest { pub new_shard_count: u8, + + // A tenant's stripe size is only meaningful the first time their shard count goes + // above 1: therefore during a split from 1->N shards, we may modify the stripe size. + // + // If this is set while the stripe count is being increased from an already >1 value, + // then the request will fail with 400. 
+ pub new_stripe_size: Option, } #[derive(Serialize, Deserialize)] diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index eafad9ab73..bb8b1bb7e5 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1151,7 +1151,12 @@ async fn tenant_shard_split_handler( let new_shards = state .tenant_manager - .shard_split(tenant_shard_id, ShardCount::new(req.new_shard_count), &ctx) + .shard_split( + tenant_shard_id, + ShardCount::new(req.new_shard_count), + req.new_stripe_size, + &ctx, + ) .await .map_err(ApiError::InternalServerError)?; @@ -2247,7 +2252,7 @@ pub fn make_router( .get("/v1/location_config", |r| { api_handler(r, list_location_config_handler) }) - .get("/v1/location_config/:tenant_id", |r| { + .get("/v1/location_config/:tenant_shard_id", |r| { api_handler(r, get_location_config_handler) }) .put( diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 38274448b3..26fcce1f38 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -6,7 +6,9 @@ use futures::stream::StreamExt; use itertools::Itertools; use pageserver_api::key::Key; use pageserver_api::models::ShardParameters; -use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, TenantShardId}; +use pageserver_api::shard::{ + ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId, +}; use rand::{distributions::Alphanumeric, Rng}; use std::borrow::Cow; use std::cmp::Ordering; @@ -1439,11 +1441,12 @@ impl TenantManager { &self, tenant_shard_id: TenantShardId, new_shard_count: ShardCount, + new_stripe_size: Option, ctx: &RequestContext, ) -> anyhow::Result> { let tenant = get_tenant(tenant_shard_id, true)?; - // Plan: identify what the new child shards will be + // Validate the incoming request if new_shard_count.count() <= tenant_shard_id.shard_count.count() { anyhow::bail!("Requested shard count is not an increase"); } @@ -1452,10 +1455,18 @@ impl TenantManager { anyhow::bail!("Requested split is not a power of two"); } - let parent_shard_identity = tenant.shard_identity; - let parent_tenant_conf = tenant.get_tenant_conf(); - let parent_generation = tenant.generation; + if let Some(new_stripe_size) = new_stripe_size { + if tenant.get_shard_stripe_size() != new_stripe_size + && tenant_shard_id.shard_count.count() > 1 + { + // This tenant already has multiple shards, it is illegal to try and change its stripe size + anyhow::bail!( + "Shard stripe size may not be modified once tenant has multiple shards" + ); + } + } + // Plan: identify what the new child shards will be let child_shards = tenant_shard_id.split(new_shard_count); tracing::info!( "Shard {} splits into: {}", @@ -1466,6 +1477,10 @@ impl TenantManager { .join(",") ); + let parent_shard_identity = tenant.shard_identity; + let parent_tenant_conf = tenant.get_tenant_conf(); + let parent_generation = tenant.generation; + // Phase 1: Write out child shards' remote index files, in the parent tenant's current generation if let Err(e) = tenant.split_prepare(&child_shards).await { // If [`Tenant::split_prepare`] fails, we must reload the tenant, because it might @@ -1515,6 +1530,9 @@ impl TenantManager { // Phase 3: Spawn the child shards for child_shard in &child_shards { let mut child_shard_identity = parent_shard_identity; + if let Some(new_stripe_size) = new_stripe_size { + child_shard_identity.stripe_size = new_stripe_size; + } child_shard_identity.count = child_shard.shard_count; child_shard_identity.number = child_shard.shard_number; diff --git 
a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 975c6d865b..b3f460c7fe 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2142,11 +2142,13 @@ class NeonStorageController(MetricsGetter): shards: list[dict[str, Any]] = body["shards"] return shards - def tenant_shard_split(self, tenant_id: TenantId, shard_count: int) -> list[TenantShardId]: + def tenant_shard_split( + self, tenant_id: TenantId, shard_count: int, shard_stripe_size: Optional[int] = None + ) -> list[TenantShardId]: response = self.request( "PUT", f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}/shard_split", - json={"new_shard_count": shard_count}, + json={"new_shard_count": shard_count, "new_stripe_size": shard_stripe_size}, headers=self.headers(TokenScope.ADMIN), ) body = response.json() diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index b8e20c451f..6e082374d7 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -318,6 +318,13 @@ class PageserverHttpClient(requests.Session, MetricsGetter): assert isinstance(res_json["tenant_shards"], list) return res_json + def tenant_get_location(self, tenant_id: TenantShardId): + res = self.get( + f"http://localhost:{self.port}/v1/location_config/{tenant_id}", + ) + self.verbose_error(res) + return res.json() + def tenant_delete(self, tenant_id: Union[TenantId, TenantShardId]): res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") self.verbose_error(res) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 1b96cd6a80..9309af066b 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -1,4 +1,5 @@ import os +from typing import Dict, List, Union import pytest from fixtures.log_helper import log @@ -8,7 +9,11 @@ from fixtures.neon_fixtures import ( ) from fixtures.remote_storage import s3_storage from fixtures.types import Lsn, TenantShardId, TimelineId +from fixtures.utils import wait_until from fixtures.workload import Workload +from pytest_httpserver import HTTPServer +from werkzeug.wrappers.request import Request +from werkzeug.wrappers.response import Response def test_sharding_smoke( @@ -310,6 +315,96 @@ def test_sharding_split_smoke( workload.validate() +@pytest.mark.parametrize("initial_stripe_size", [None, 65536]) +def test_sharding_split_stripe_size( + neon_env_builder: NeonEnvBuilder, + httpserver: HTTPServer, + httpserver_listen_address, + initial_stripe_size: int, +): + """ + Check that modifying stripe size inline with a shard split works as expected + """ + (host, port) = httpserver_listen_address + neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify" + neon_env_builder.num_pageservers = 1 + + # Set up fake HTTP notify endpoint: we will use this to validate that we receive + # the correct stripe size after split. 
+ notifications = [] + + def handler(request: Request): + log.info(f"Notify request: {request}") + notifications.append(request.json) + return Response(status=200) + + httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler) + + env = neon_env_builder.init_start( + initial_tenant_shard_count=1, initial_tenant_shard_stripe_size=initial_stripe_size + ) + tenant_id = env.initial_tenant + + assert len(notifications) == 1 + expect: Dict[str, Union[List[Dict[str, int]], str, None, int]] = { + "tenant_id": str(env.initial_tenant), + "stripe_size": None, + "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}], + } + assert notifications[0] == expect + + new_stripe_size = 2048 + env.storage_controller.tenant_shard_split( + tenant_id, shard_count=2, shard_stripe_size=new_stripe_size + ) + + # Check that we ended up with the stripe size that we expected, both on the pageserver + # and in the notifications to compute + assert len(notifications) == 2 + expect_after: Dict[str, Union[List[Dict[str, int]], str, None, int]] = { + "tenant_id": str(env.initial_tenant), + "stripe_size": new_stripe_size, + "shards": [ + {"node_id": int(env.pageservers[0].id), "shard_number": 0}, + {"node_id": int(env.pageservers[0].id), "shard_number": 1}, + ], + } + log.info(f"Got notification: {notifications[1]}") + assert notifications[1] == expect_after + + # Inspect the stripe size on the pageserver + shard_0_loc = ( + env.pageservers[0].http_client().tenant_get_location(TenantShardId(tenant_id, 0, 2)) + ) + assert shard_0_loc["shard_stripe_size"] == new_stripe_size + shard_1_loc = ( + env.pageservers[0].http_client().tenant_get_location(TenantShardId(tenant_id, 1, 2)) + ) + assert shard_1_loc["shard_stripe_size"] == new_stripe_size + + # Ensure stripe size survives a pageserver restart + env.pageservers[0].stop() + env.pageservers[0].start() + shard_0_loc = ( + env.pageservers[0].http_client().tenant_get_location(TenantShardId(tenant_id, 0, 2)) + ) + assert shard_0_loc["shard_stripe_size"] == new_stripe_size + shard_1_loc = ( + env.pageservers[0].http_client().tenant_get_location(TenantShardId(tenant_id, 1, 2)) + ) + assert shard_1_loc["shard_stripe_size"] == new_stripe_size + + # Ensure stripe size survives a storage controller restart + env.storage_controller.stop() + env.storage_controller.start() + + def assert_restart_notification(): + assert len(notifications) == 3 + assert notifications[2] == expect_after + + wait_until(10, 1, assert_restart_notification) + + @pytest.mark.skipif( # The quantity of data isn't huge, but debug can be _very_ slow, and the things we're # validating in this test don't benefit much from debug assertions. From 83855a907c93ff5c8435d4f1acf3e71a40f5c18f Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 13 Mar 2024 06:35:49 +0000 Subject: [PATCH 0391/1571] proxy http error classification (#7098) ## Problem Missing error classification for SQL-over-HTTP queries. Not respecting `UserFacingError` for SQL-over-HTTP queries. ## Summary of changes Adds error classification. Adds user facing errors. 
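To make the classification pattern concrete, here is a minimal, self-contained sketch. The trait names (`ReportableError`, `UserFacingError`) and the idea of mapping each variant to a metric kind plus a client-safe message mirror the diff below, but the error type, variants, and label strings here are invented for illustration and are not the proxy's real ones.

```rust
// Sketch only: the real proxy has more ErrorKind variants and error types.
#[derive(Debug, Clone, Copy)]
enum ErrorKind {
    User,
    Postgres,
    RateLimit,
}

impl ErrorKind {
    fn to_metric_label(self) -> &'static str {
        match self {
            ErrorKind::User => "user",
            ErrorKind::Postgres => "postgres",
            ErrorKind::RateLimit => "ratelimit",
        }
    }
}

/// Classifies an error for metrics and alerting.
trait ReportableError {
    fn get_error_kind(&self) -> ErrorKind;
}

/// Produces a message that is safe to forward to the client.
trait UserFacingError {
    fn to_string_client(&self) -> String;
}

#[derive(Debug)]
enum HttpQueryError {
    RequestTooLarge { max_bytes: usize },
    Cancelled,
    Postgres(String), // stand-in for a driver error
}

impl ReportableError for HttpQueryError {
    fn get_error_kind(&self) -> ErrorKind {
        match self {
            HttpQueryError::RequestTooLarge { .. } => ErrorKind::User,
            HttpQueryError::Cancelled => ErrorKind::RateLimit,
            HttpQueryError::Postgres(_) => ErrorKind::Postgres,
        }
    }
}

impl UserFacingError for HttpQueryError {
    fn to_string_client(&self) -> String {
        match self {
            HttpQueryError::RequestTooLarge { max_bytes } => {
                format!("request is too large (max is {max_bytes} bytes)")
            }
            HttpQueryError::Cancelled => "query was cancelled".to_string(),
            // A real implementation may forward sanitized driver details;
            // this sketch just returns a generic message.
            HttpQueryError::Postgres(_) => "query failed".to_string(),
        }
    }
}

fn main() {
    let err = HttpQueryError::RequestTooLarge { max_bytes: 1024 };
    println!(
        "kind={} client_message={}",
        err.get_error_kind().to_metric_label(),
        err.to_string_client()
    );
}
```

Splitting the two concerns across separate traits lets internal errors keep full detail for logs and metrics while the HTTP response only ever exposes `to_string_client()`.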
--- proxy/src/serverless/backend.rs | 25 +++ proxy/src/serverless/conn_pool.rs | 14 +- proxy/src/serverless/sql_over_http.rs | 239 +++++++++++++++++++------- 3 files changed, 204 insertions(+), 74 deletions(-) diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 2e63ad6c99..d0f155165d 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -12,6 +12,7 @@ use crate::{ CachedNodeInfo, }, context::RequestMonitoring, + error::{ErrorKind, ReportableError, UserFacingError}, proxy::connect_compute::ConnectMechanism, }; @@ -117,6 +118,30 @@ pub enum HttpConnError { WakeCompute(#[from] WakeComputeError), } +impl ReportableError for HttpConnError { + fn get_error_kind(&self) -> ErrorKind { + match self { + HttpConnError::ConnectionClosedAbruptly(_) => ErrorKind::Compute, + HttpConnError::ConnectionError(p) => p.get_error_kind(), + HttpConnError::GetAuthInfo(a) => a.get_error_kind(), + HttpConnError::AuthError(a) => a.get_error_kind(), + HttpConnError::WakeCompute(w) => w.get_error_kind(), + } + } +} + +impl UserFacingError for HttpConnError { + fn to_string_client(&self) -> String { + match self { + HttpConnError::ConnectionClosedAbruptly(_) => self.to_string(), + HttpConnError::ConnectionError(p) => p.to_string(), + HttpConnError::GetAuthInfo(c) => c.to_string_client(), + HttpConnError::AuthError(c) => c.to_string_client(), + HttpConnError::WakeCompute(c) => c.to_string_client(), + } + } +} + struct TokioMechanism { pool: Arc>, conn_info: ConnInfo, diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 901e30224b..c7e8eaef76 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -119,16 +119,12 @@ impl EndpointConnPool { } } - fn put( - pool: &RwLock, - conn_info: &ConnInfo, - client: ClientInner, - ) -> anyhow::Result<()> { + fn put(pool: &RwLock, conn_info: &ConnInfo, client: ClientInner) { let conn_id = client.conn_id; if client.is_closed() { info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed"); - return Ok(()); + return; } let global_max_conn = pool.read().global_pool_size_max_conns; if pool @@ -138,7 +134,7 @@ impl EndpointConnPool { >= global_max_conn { info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full"); - return Ok(()); + return; } // return connection to the pool @@ -172,8 +168,6 @@ impl EndpointConnPool { } else { info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}"); } - - Ok(()) } } @@ -653,7 +647,7 @@ impl Client { // return connection to the pool return Some(move || { let _span = current_span.enter(); - let _ = EndpointConnPool::put(&conn_pool, &conn_info, client); + EndpointConnPool::put(&conn_pool, &conn_info, client); }); } None diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 20d9795b47..86c278030f 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -1,11 +1,11 @@ use std::pin::pin; use std::sync::Arc; -use anyhow::bail; use futures::future::select; use futures::future::try_join; use futures::future::Either; use futures::StreamExt; +use futures::TryFutureExt; use hyper::body::HttpBody; use hyper::header; use hyper::http::HeaderName; @@ -37,9 +37,13 @@ use crate::auth::ComputeUserInfoParseError; use crate::config::ProxyConfig; use crate::config::TlsConfig; use crate::context::RequestMonitoring; +use crate::error::ErrorKind; +use 
crate::error::ReportableError; +use crate::error::UserFacingError; use crate::metrics::HTTP_CONTENT_LENGTH; use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE; use crate::proxy::NeonOptions; +use crate::serverless::backend::HttpConnError; use crate::DbName; use crate::RoleName; @@ -47,6 +51,7 @@ use super::backend::PoolingBackend; use super::conn_pool::ConnInfo; use super::json::json_to_pg_text; use super::json::pg_text_row_to_json; +use super::json::JsonConversionError; #[derive(serde::Deserialize)] #[serde(rename_all = "camelCase")] @@ -117,6 +122,18 @@ pub enum ConnInfoError { MalformedEndpoint, } +impl ReportableError for ConnInfoError { + fn get_error_kind(&self) -> ErrorKind { + ErrorKind::User + } +} + +impl UserFacingError for ConnInfoError { + fn to_string_client(&self) -> String { + self.to_string() + } +} + fn get_conn_info( ctx: &mut RequestMonitoring, headers: &HeaderMap, @@ -212,17 +229,41 @@ pub async fn handle( handle.abort(); let mut response = match result { - Ok(Ok(r)) => { + Ok(r) => { ctx.set_success(); r } - Err(e) => { - // TODO: ctx.set_error_kind(e.get_error_type()); + Err(e @ SqlOverHttpError::Cancelled(_)) => { + let error_kind = e.get_error_kind(); + ctx.set_error_kind(error_kind); - let mut message = format!("{:?}", e); - let db_error = e - .downcast_ref::() - .and_then(|e| e.as_db_error()); + let message = format!( + "Query cancelled, runtime exceeded. SQL queries over HTTP must not exceed {} seconds of runtime. Please consider using our websocket based connections", + config.http_config.request_timeout.as_secs_f64() + ); + + tracing::info!( + kind=error_kind.to_metric_label(), + error=%e, + msg=message, + "forwarding error to user" + ); + + json_response( + StatusCode::BAD_REQUEST, + json!({ "message": message, "code": SqlState::PROTOCOL_VIOLATION.code() }), + )? + } + Err(e) => { + let error_kind = e.get_error_kind(); + ctx.set_error_kind(error_kind); + + let mut message = e.to_string_client(); + let db_error = match &e { + SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(e)) + | SqlOverHttpError::Postgres(e) => e.as_db_error(), + _ => None, + }; fn get<'a, T: serde::Serialize>( db: Option<&'a DbError>, x: impl FnOnce(&'a DbError) -> T, @@ -265,10 +306,13 @@ pub async fn handle( let line = get(db_error, |db| db.line().map(|l| l.to_string())); let routine = get(db_error, |db| db.routine()); - error!( - ?code, - "sql-over-http per-client task finished with an error: {e:#}" + tracing::info!( + kind=error_kind.to_metric_label(), + error=%e, + msg=message, + "forwarding error to user" ); + // TODO: this shouldn't always be bad request. json_response( StatusCode::BAD_REQUEST, @@ -293,21 +337,6 @@ pub async fn handle( }), )? } - Ok(Err(Cancelled())) => { - // TODO: when http error classification is done, distinguish between - // timeout on sql vs timeout in proxy/cplane - // ctx.set_error_kind(crate::error::ErrorKind::RateLimit); - - let message = format!( - "Query cancelled, runtime exceeded. SQL queries over HTTP must not exceed {} seconds of runtime. Please consider using our websocket based connections", - config.http_config.request_timeout.as_secs_f64() - ); - error!(message); - json_response( - StatusCode::BAD_REQUEST, - json!({ "message": message, "code": SqlState::PROTOCOL_VIOLATION.code() }), - )? 
- } }; response.headers_mut().insert( @@ -317,7 +346,93 @@ pub async fn handle( Ok(response) } -struct Cancelled(); +#[derive(Debug, thiserror::Error)] +pub enum SqlOverHttpError { + #[error("{0}")] + ReadPayload(#[from] ReadPayloadError), + #[error("{0}")] + ConnectCompute(#[from] HttpConnError), + #[error("{0}")] + ConnInfo(#[from] ConnInfoError), + #[error("request is too large (max is {MAX_REQUEST_SIZE} bytes)")] + RequestTooLarge, + #[error("response is too large (max is {MAX_RESPONSE_SIZE} bytes)")] + ResponseTooLarge, + #[error("invalid isolation level")] + InvalidIsolationLevel, + #[error("{0}")] + Postgres(#[from] tokio_postgres::Error), + #[error("{0}")] + JsonConversion(#[from] JsonConversionError), + #[error("{0}")] + Cancelled(SqlOverHttpCancel), +} + +impl ReportableError for SqlOverHttpError { + fn get_error_kind(&self) -> ErrorKind { + match self { + SqlOverHttpError::ReadPayload(e) => e.get_error_kind(), + SqlOverHttpError::ConnectCompute(e) => e.get_error_kind(), + SqlOverHttpError::ConnInfo(e) => e.get_error_kind(), + SqlOverHttpError::RequestTooLarge => ErrorKind::User, + SqlOverHttpError::ResponseTooLarge => ErrorKind::User, + SqlOverHttpError::InvalidIsolationLevel => ErrorKind::User, + SqlOverHttpError::Postgres(p) => p.get_error_kind(), + SqlOverHttpError::JsonConversion(_) => ErrorKind::Postgres, + SqlOverHttpError::Cancelled(c) => c.get_error_kind(), + } + } +} + +impl UserFacingError for SqlOverHttpError { + fn to_string_client(&self) -> String { + match self { + SqlOverHttpError::ReadPayload(p) => p.to_string(), + SqlOverHttpError::ConnectCompute(c) => c.to_string_client(), + SqlOverHttpError::ConnInfo(c) => c.to_string_client(), + SqlOverHttpError::RequestTooLarge => self.to_string(), + SqlOverHttpError::ResponseTooLarge => self.to_string(), + SqlOverHttpError::InvalidIsolationLevel => self.to_string(), + SqlOverHttpError::Postgres(p) => p.to_string(), + SqlOverHttpError::JsonConversion(_) => "could not parse postgres response".to_string(), + SqlOverHttpError::Cancelled(_) => self.to_string(), + } + } +} + +#[derive(Debug, thiserror::Error)] +pub enum ReadPayloadError { + #[error("could not read the HTTP request body: {0}")] + Read(#[from] hyper::Error), + #[error("could not parse the HTTP request body: {0}")] + Parse(#[from] serde_json::Error), +} + +impl ReportableError for ReadPayloadError { + fn get_error_kind(&self) -> ErrorKind { + match self { + ReadPayloadError::Read(_) => ErrorKind::ClientDisconnect, + ReadPayloadError::Parse(_) => ErrorKind::User, + } + } +} + +#[derive(Debug, thiserror::Error)] +pub enum SqlOverHttpCancel { + #[error("query was cancelled")] + Postgres, + #[error("query was cancelled while stuck trying to connect to the database")] + Connect, +} + +impl ReportableError for SqlOverHttpCancel { + fn get_error_kind(&self) -> ErrorKind { + match self { + SqlOverHttpCancel::Postgres => ErrorKind::RateLimit, + SqlOverHttpCancel::Connect => ErrorKind::ServiceRateLimit, + } + } +} async fn handle_inner( cancel: CancellationToken, @@ -325,7 +440,7 @@ async fn handle_inner( ctx: &mut RequestMonitoring, request: Request, backend: Arc, -) -> Result, Cancelled>, anyhow::Error> { +) -> Result, SqlOverHttpError> { let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE .with_label_values(&[ctx.protocol]) .guard(); @@ -358,7 +473,7 @@ async fn handle_inner( b"ReadUncommitted" => IsolationLevel::ReadUncommitted, b"ReadCommitted" => IsolationLevel::ReadCommitted, b"RepeatableRead" => IsolationLevel::RepeatableRead, - _ => bail!("invalid isolation level"), + 
_ => return Err(SqlOverHttpError::InvalidIsolationLevel), }), None => None, }; @@ -376,19 +491,16 @@ async fn handle_inner( // we don't have a streaming request support yet so this is to prevent OOM // from a malicious user sending an extremely large request body if request_content_length > MAX_REQUEST_SIZE { - return Err(anyhow::anyhow!( - "request is too large (max is {MAX_REQUEST_SIZE} bytes)" - )); + return Err(SqlOverHttpError::RequestTooLarge); } let fetch_and_process_request = async { - let body = hyper::body::to_bytes(request.into_body()) - .await - .map_err(anyhow::Error::from)?; + let body = hyper::body::to_bytes(request.into_body()).await?; info!(length = body.len(), "request payload read"); let payload: Payload = serde_json::from_slice(&body)?; - Ok::(payload) // Adjust error type accordingly - }; + Ok::(payload) // Adjust error type accordingly + } + .map_err(SqlOverHttpError::from); let authenticate_and_connect = async { let keys = backend.authenticate(ctx, &conn_info).await?; @@ -398,8 +510,9 @@ async fn handle_inner( // not strictly necessary to mark success here, // but it's just insurance for if we forget it somewhere else ctx.latency_timer.success(); - Ok::<_, anyhow::Error>(client) - }; + Ok::<_, HttpConnError>(client) + } + .map_err(SqlOverHttpError::from); // Run both operations in parallel let (payload, mut client) = match select( @@ -412,7 +525,9 @@ async fn handle_inner( .await { Either::Left((result, _cancelled)) => result?, - Either::Right((_cancelled, _)) => return Ok(Err(Cancelled())), + Either::Right((_cancelled, _)) => { + return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Connect)) + } }; let mut response = Response::builder() @@ -456,20 +571,24 @@ async fn handle_inner( results } Ok(Err(error)) => { - let db_error = error - .downcast_ref::() - .and_then(|e| e.as_db_error()); + let db_error = match &error { + SqlOverHttpError::ConnectCompute( + HttpConnError::ConnectionError(e), + ) + | SqlOverHttpError::Postgres(e) => e.as_db_error(), + _ => None, + }; // if errored for some other reason, it might not be safe to return if !db_error.is_some_and(|e| *e.code() == SqlState::QUERY_CANCELED) { discard.discard(); } - return Ok(Err(Cancelled())); + return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)); } Err(_timeout) => { discard.discard(); - return Ok(Err(Cancelled())); + return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)); } } } @@ -507,7 +626,7 @@ async fn handle_inner( ) .await { - Ok(Ok(results)) => { + Ok(results) => { info!("commit"); let status = transaction.commit().await.map_err(|e| { // if we cannot commit - for now don't return connection to pool @@ -518,14 +637,14 @@ async fn handle_inner( discard.check_idle(status); results } - Ok(Err(Cancelled())) => { + Err(SqlOverHttpError::Cancelled(_)) => { if let Err(err) = cancel_token.cancel_query(NoTls).await { tracing::error!(?err, "could not cancel query"); } // TODO: after cancelling, wait to see if we can get a status. maybe the connection is still safe. 
discard.discard(); - return Ok(Err(Cancelled())); + return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)); } Err(err) => { info!("rollback"); @@ -541,16 +660,10 @@ async fn handle_inner( }; if txn_read_only { - response = response.header( - TXN_READ_ONLY.clone(), - HeaderValue::try_from(txn_read_only.to_string())?, - ); + response = response.header(TXN_READ_ONLY.clone(), &HEADER_VALUE_TRUE); } if txn_deferrable { - response = response.header( - TXN_DEFERRABLE.clone(), - HeaderValue::try_from(txn_deferrable.to_string())?, - ); + response = response.header(TXN_DEFERRABLE.clone(), &HEADER_VALUE_TRUE); } if let Some(txn_isolation_level) = txn_isolation_level_raw { response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level); @@ -574,7 +687,7 @@ async fn handle_inner( // moving this later in the stack is going to be a lot of effort and ehhhh metrics.record_egress(len as u64); - Ok(Ok(response)) + Ok(response) } async fn query_batch( @@ -584,7 +697,7 @@ async fn query_batch( total_size: &mut usize, raw_output: bool, array_mode: bool, -) -> anyhow::Result, Cancelled>> { +) -> Result, SqlOverHttpError> { let mut results = Vec::with_capacity(queries.queries.len()); let mut current_size = 0; for stmt in queries.queries { @@ -606,12 +719,12 @@ async fn query_batch( return Err(e); } Either::Right((_cancelled, _)) => { - return Ok(Err(Cancelled())); + return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)); } } } *total_size += current_size; - Ok(Ok(results)) + Ok(results) } async fn query_to_json( @@ -620,7 +733,7 @@ async fn query_to_json( current_size: &mut usize, raw_output: bool, default_array_mode: bool, -) -> anyhow::Result<(ReadyForQueryStatus, Value)> { +) -> Result<(ReadyForQueryStatus, Value), SqlOverHttpError> { info!("executing query"); let query_params = data.params; let mut row_stream = std::pin::pin!(client.query_raw_txt(&data.query, query_params).await?); @@ -637,9 +750,7 @@ async fn query_to_json( // we don't have a streaming response support yet so this is to prevent OOM // from a malicious query (eg a cross join) if *current_size > MAX_RESPONSE_SIZE { - return Err(anyhow::anyhow!( - "response is too large (max is {MAX_RESPONSE_SIZE} bytes)" - )); + return Err(SqlOverHttpError::ResponseTooLarge); } } From 0554bee02251ebf0bfdebf115a2ffc10c675782d Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Wed, 13 Mar 2024 15:45:19 +0400 Subject: [PATCH 0392/1571] proxy: Report warm cold start if connection is from the local cache (#7104) ## Problem * quotes in serialized string * no status if connection is from local cache ## Summary of changes * remove quotes * report warm if connection if from local cache --- proxy/src/console/provider/neon.rs | 5 ++++- proxy/src/context.rs | 4 ++++ proxy/src/context/parquet.rs | 14 ++++++++------ proxy/src/serverless/backend.rs | 2 ++ 4 files changed, 18 insertions(+), 7 deletions(-) diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index f3befa33e0..3088cffa57 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -6,7 +6,9 @@ use super::{ ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo, }; -use crate::{auth::backend::ComputeUserInfo, compute, http, scram}; +use crate::{ + auth::backend::ComputeUserInfo, compute, console::messages::ColdStartInfo, http, scram, +}; use crate::{ cache::Cached, context::RequestMonitoring, @@ -254,6 +256,7 @@ impl 
super::Api for Api { if permit.should_check_cache() { if let Some(cached) = self.caches.node_info.get(&key) { info!(key = &*key, "found cached compute node info"); + ctx.set_cold_start_info(ColdStartInfo::Warm); return Ok(cached); } } diff --git a/proxy/src/context.rs b/proxy/src/context.rs index 1b48e01358..40aa21083f 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -111,6 +111,10 @@ impl RequestMonitoring { ) } + pub fn set_cold_start_info(&mut self, info: ColdStartInfo) { + self.cold_start_info = Some(info); + } + pub fn set_project(&mut self, x: MetricsAuxInfo) { self.set_endpoint_id(x.endpoint_id); self.branch = Some(x.branch_id); diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 1b1274b196..ba144bb7ba 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -93,7 +93,7 @@ struct RequestData { /// Or if we make it to proxy_pass success: bool, /// Indicates if the cplane started the new compute node for this request. - cold_start_info: Option, + cold_start_info: Option<&'static str>, /// Tracks time from session start (HTTP request/libpq TCP handshake) /// Through to success/failure duration_us: u64, @@ -121,10 +121,12 @@ impl From for RequestData { region: value.region, error: value.error_kind.as_ref().map(|e| e.to_metric_label()), success: value.success, - cold_start_info: value - .cold_start_info - .as_ref() - .map(|x| serde_json::to_string(x).unwrap_or_default()), + cold_start_info: value.cold_start_info.as_ref().map(|x| match x { + crate::console::messages::ColdStartInfo::Unknown => "unknown", + crate::console::messages::ColdStartInfo::Warm => "warm", + crate::console::messages::ColdStartInfo::PoolHit => "pool_hit", + crate::console::messages::ColdStartInfo::PoolMiss => "pool_miss", + }), duration_us: SystemTime::from(value.first_packet) .elapsed() .unwrap_or_default() @@ -458,7 +460,7 @@ mod tests { region: "us-east-1", error: None, success: rng.gen(), - cold_start_info: Some("no".into()), + cold_start_info: Some("no"), duration_us: rng.gen_range(0..30_000_000), } } diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index d0f155165d..9b3ca8d447 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -9,6 +9,7 @@ use crate::{ config::ProxyConfig, console::{ errors::{GetAuthInfoError, WakeComputeError}, + messages::ColdStartInfo, CachedNodeInfo, }, context::RequestMonitoring, @@ -83,6 +84,7 @@ impl PoolingBackend { }; if let Some(client) = maybe_client { + ctx.set_cold_start_info(ColdStartInfo::Warm); return Ok(client); } let conn_id = uuid::Uuid::new_v4(); From b0aff04157866904e53f815e7fd389e2823abce9 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Wed, 13 Mar 2024 16:50:05 +0400 Subject: [PATCH 0393/1571] proxy: add new dimension to exclude cplane latency (#7011) ## Problem Currently cplane communication is a part of the latency monitoring. It doesn't allow to setup the proper alerting based on proxy latency. ## Summary of changes Added dimension to exclude cplane latency. 
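The mechanism behind the new dimension can be sketched as a small RAII pause guard. The sketch below is simplified and illustrative (plain `std::time`, `println!` instead of a histogram), but it mirrors the accumulate-and-subtract approach in the diff: time spent waiting on the client or on cplane is bucketed when the guard drops, and the final observation subtracts the excluded buckets.

```rust
use std::time::{Duration, Instant};

enum Waiting {
    Client,
    Cplane,
}

#[derive(Default)]
struct Accumulated {
    client: Duration,
    cplane: Duration,
}

struct LatencyTimer {
    start: Instant,
    accumulated: Accumulated,
}

struct Pause<'a> {
    timer: &'a mut LatencyTimer,
    started: Instant,
    waiting_for: Waiting,
}

impl LatencyTimer {
    fn new() -> Self {
        Self {
            start: Instant::now(),
            accumulated: Accumulated::default(),
        }
    }

    /// RAII guard: the paused interval is added to the matching bucket on drop.
    fn pause(&mut self, waiting_for: Waiting) -> Pause<'_> {
        Pause {
            timer: self,
            started: Instant::now(),
            waiting_for,
        }
    }

    /// Emit both views: latency excluding client waits, and excluding
    /// client + cplane waits (the new label dimension).
    fn report(&self) {
        let total = self.start.elapsed();
        let client = total.saturating_sub(self.accumulated.client);
        let client_and_cplane = client.saturating_sub(self.accumulated.cplane);
        println!("client={client:?} client_and_cplane={client_and_cplane:?}");
    }
}

impl Drop for Pause<'_> {
    fn drop(&mut self) {
        let dur = self.started.elapsed();
        match self.waiting_for {
            Waiting::Client => self.timer.accumulated.client += dur,
            Waiting::Cplane => self.timer.accumulated.cplane += dur,
        }
    }
}

fn main() {
    let mut timer = LatencyTimer::new();
    {
        // e.g. a wake_compute HTTP call to the control plane
        let _pause = timer.pause(Waiting::Cplane);
        std::thread::sleep(Duration::from_millis(20));
    }
    timer.report();
}
```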
--- proxy/src/auth/backend/hacks.rs | 13 +++-- proxy/src/auth/flow.rs | 2 +- proxy/src/console/provider/neon.rs | 4 ++ proxy/src/context.rs | 9 ++-- proxy/src/context/parquet.rs | 6 +-- proxy/src/metrics.rs | 79 +++++++++++++++++++++--------- proxy/src/proxy.rs | 2 +- 7 files changed, 79 insertions(+), 36 deletions(-) diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs index 26cf7a01f2..f7241be4a9 100644 --- a/proxy/src/auth/backend/hacks.rs +++ b/proxy/src/auth/backend/hacks.rs @@ -25,13 +25,16 @@ pub async fn authenticate_cleartext( ctx.set_auth_method(crate::context::AuthMethod::Cleartext); // pause the timer while we communicate with the client - let _paused = ctx.latency_timer.pause(); + let paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client); - let auth_outcome = AuthFlow::new(client) + let auth_flow = AuthFlow::new(client) .begin(auth::CleartextPassword(secret)) - .await? - .authenticate() .await?; + drop(paused); + // cleartext auth is only allowed to the ws/http protocol. + // If we're here, we already received the password in the first message. + // Scram protocol will be executed on the proxy side. + let auth_outcome = auth_flow.authenticate().await?; let keys = match auth_outcome { sasl::Outcome::Success(key) => key, @@ -56,7 +59,7 @@ pub async fn password_hack_no_authentication( ctx.set_auth_method(crate::context::AuthMethod::Cleartext); // pause the timer while we communicate with the client - let _paused = ctx.latency_timer.pause(); + let _paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client); let payload = AuthFlow::new(client) .begin(auth::PasswordHack) diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index dce73138c6..788381b6c0 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -143,7 +143,7 @@ impl AuthFlow<'_, S, Scram<'_>> { let Scram(secret, ctx) = self.state; // pause the timer while we communicate with the client - let _paused = ctx.latency_timer.pause(); + let _paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client); // Initial client message contains the chosen auth method's name. 
let msg = self.stream.read_password_message().await?; diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 3088cffa57..3b2e0cc204 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -74,7 +74,9 @@ impl Api { info!(url = request.url().as_str(), "sending http request"); let start = Instant::now(); + let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Cplane); let response = self.endpoint.execute(request).await?; + drop(pause); info!(duration = ?start.elapsed(), "received http response"); let body = match parse_body::(response).await { Ok(body) => body, @@ -134,7 +136,9 @@ impl Api { info!(url = request.url().as_str(), "sending http request"); let start = Instant::now(); + let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Cplane); let response = self.endpoint.execute(request).await?; + drop(pause); info!(duration = ?start.elapsed(), "received http response"); let body = parse_body::(response).await?; diff --git a/proxy/src/context.rs b/proxy/src/context.rs index 40aa21083f..7ca830cdb4 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -15,11 +15,12 @@ use crate::{ BranchId, DbName, EndpointId, ProjectId, RoleName, }; +use self::parquet::RequestData; + pub mod parquet; -static LOG_CHAN: OnceCell> = OnceCell::new(); +static LOG_CHAN: OnceCell> = OnceCell::new(); -#[derive(Clone)] /// Context data for a single request to connect to a database. /// /// This data should **not** be used for connection logic, only for observability and limiting purposes. @@ -46,7 +47,7 @@ pub struct RequestMonitoring { // extra // This sender is here to keep the request monitoring channel open while requests are taking place. - sender: Option>, + sender: Option>, pub latency_timer: LatencyTimer, } @@ -172,7 +173,7 @@ impl RequestMonitoring { impl Drop for RequestMonitoring { fn drop(&mut self) { if let Some(tx) = self.sender.take() { - let _: Result<(), _> = tx.send(self.clone()); + let _: Result<(), _> = tx.send(RequestData::from(&*self)); } } } diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index ba144bb7ba..a2be1c4186 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -74,7 +74,7 @@ pub(crate) const FAILED_UPLOAD_MAX_RETRIES: u32 = 10; // * after each rowgroup write, we check the length of the file and upload to s3 if large enough #[derive(parquet_derive::ParquetRecordWriter)] -struct RequestData { +pub struct RequestData { region: &'static str, protocol: &'static str, /// Must be UTC. 
The derive macro doesn't like the timezones @@ -99,8 +99,8 @@ struct RequestData { duration_us: u64, } -impl From for RequestData { - fn from(value: RequestMonitoring) -> Self { +impl From<&RequestMonitoring> for RequestData { + fn from(value: &RequestMonitoring) -> Self { Self { session_id: value.session_id, peer_addr: value.peer_addr.to_string(), diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 0477176c45..02ebcd6aaa 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -7,7 +7,7 @@ use ::metrics::{ use metrics::{register_int_counter, register_int_counter_pair, IntCounter, IntCounterPair}; use once_cell::sync::Lazy; -use tokio::time; +use tokio::time::{self, Instant}; pub static NUM_DB_CONNECTIONS_GAUGE: Lazy = Lazy::new(|| { register_int_counter_pair_vec!( @@ -46,9 +46,9 @@ pub static COMPUTE_CONNECTION_LATENCY: Lazy = Lazy::new(|| { register_histogram_vec!( "proxy_compute_connection_latency_seconds", "Time it took for proxy to establish a connection to the compute endpoint", - // http/ws/tcp, true/false, true/false, success/failure - // 3 * 2 * 2 * 2 = 24 counters - &["protocol", "cache_miss", "pool_miss", "outcome"], + // http/ws/tcp, true/false, true/false, success/failure, client/client_and_cplane + // 3 * 2 * 2 * 2 * 2 = 48 counters + &["protocol", "cache_miss", "pool_miss", "outcome", "excluded"], // largest bucket = 2^16 * 0.5ms = 32s exponential_buckets(0.0005, 2.0, 16).unwrap(), ) @@ -161,12 +161,26 @@ pub static NUM_CANCELLATION_REQUESTS: Lazy = Lazy::new(|| { .unwrap() }); -#[derive(Clone)] +pub enum Waiting { + Cplane, + Client, + Compute, +} + +#[derive(Default)] +struct Accumulated { + cplane: time::Duration, + client: time::Duration, + compute: time::Duration, +} + pub struct LatencyTimer { // time since the stopwatch was started - start: Option, + start: time::Instant, + // time since the stopwatch was stopped + stop: Option, // accumulated time on the stopwatch - pub accumulated: std::time::Duration, + accumulated: Accumulated, // label data protocol: &'static str, cache_miss: bool, @@ -176,13 +190,16 @@ pub struct LatencyTimer { pub struct LatencyTimerPause<'a> { timer: &'a mut LatencyTimer, + start: time::Instant, + waiting_for: Waiting, } impl LatencyTimer { pub fn new(protocol: &'static str) -> Self { Self { - start: Some(time::Instant::now()), - accumulated: std::time::Duration::ZERO, + start: time::Instant::now(), + stop: None, + accumulated: Accumulated::default(), protocol, cache_miss: false, // by default we don't do pooling @@ -192,11 +209,12 @@ impl LatencyTimer { } } - pub fn pause(&mut self) -> LatencyTimerPause<'_> { - // stop the stopwatch and record the time that we have accumulated - let start = self.start.take().expect("latency timer should be started"); - self.accumulated += start.elapsed(); - LatencyTimerPause { timer: self } + pub fn pause(&mut self, waiting_for: Waiting) -> LatencyTimerPause<'_> { + LatencyTimerPause { + timer: self, + start: Instant::now(), + waiting_for, + } } pub fn cache_miss(&mut self) { @@ -209,9 +227,7 @@ impl LatencyTimer { pub fn success(&mut self) { // stop the stopwatch and record the time that we have accumulated - if let Some(start) = self.start.take() { - self.accumulated += start.elapsed(); - } + self.stop = Some(time::Instant::now()); // success self.outcome = "success"; @@ -220,23 +236,42 @@ impl LatencyTimer { impl Drop for LatencyTimerPause<'_> { fn drop(&mut self) { - // start the stopwatch again - self.timer.start = Some(time::Instant::now()); + let dur = self.start.elapsed(); + match 
self.waiting_for { + Waiting::Cplane => self.timer.accumulated.cplane += dur, + Waiting::Client => self.timer.accumulated.client += dur, + Waiting::Compute => self.timer.accumulated.compute += dur, + } } } impl Drop for LatencyTimer { fn drop(&mut self) { - let duration = - self.start.map(|start| start.elapsed()).unwrap_or_default() + self.accumulated; + let duration = self + .stop + .unwrap_or_else(time::Instant::now) + .duration_since(self.start); + // Excluding cplane communication from the accumulated time. COMPUTE_CONNECTION_LATENCY .with_label_values(&[ self.protocol, bool_to_str(self.cache_miss), bool_to_str(self.pool_miss), self.outcome, + "client", ]) - .observe(duration.as_secs_f64()) + .observe((duration.saturating_sub(self.accumulated.client)).as_secs_f64()); + // Exclude client and cplane communication from the accumulated time. + let accumulated_total = self.accumulated.client + self.accumulated.cplane; + COMPUTE_CONNECTION_LATENCY + .with_label_values(&[ + self.protocol, + bool_to_str(self.cache_miss), + bool_to_str(self.pool_miss), + self.outcome, + "client_and_cplane", + ]) + .observe((duration.saturating_sub(accumulated_total)).as_secs_f64()); } } diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 7848fc2ac2..ab5bf5d494 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -248,7 +248,7 @@ pub async fn handle_client( let tls = config.tls_config.as_ref(); - let pause = ctx.latency_timer.pause(); + let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Client); let do_handshake = handshake(stream, mode.handshake_tls(tls)); let (mut stream, params) = match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? { From 8a53d576e685b700c498d50588b4c0224711bed2 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 13 Mar 2024 17:10:20 +0200 Subject: [PATCH 0394/1571] fix(metrics): time individual layer flush operations (#7109) Currently, the flushing operation could flush multiple frozen layers to the disk and store the aggregate time in the histogram. The result is a bimodal distribution with short and over 1000-second flushes. Change it so that we record how long one layer flush takes. --- pageserver/src/tenant/timeline.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index a733a3b1a7..f10df19b7b 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2967,7 +2967,6 @@ impl Timeline { } trace!("waking up"); - let timer = self.metrics.flush_time_histo.start_timer(); let flush_counter = *layer_flush_start_rx.borrow(); let result = loop { if self.cancel.is_cancelled() { @@ -2978,6 +2977,8 @@ impl Timeline { return; } + let timer = self.metrics.flush_time_histo.start_timer(); + let layer_to_flush = { let guard = self.layers.read().await; guard.layer_map().frozen_layers.front().cloned() @@ -2999,13 +3000,12 @@ impl Timeline { break err; } } + timer.stop_and_record(); }; // Notify any listeners that we're done let _ = self .layer_flush_done_tx .send_replace((flush_counter, result)); - - timer.stop_and_record(); } } @@ -3073,6 +3073,7 @@ impl Timeline { ctx: &RequestContext, ) -> Result<(), FlushLayerError> { debug_assert_current_span_has_tenant_and_timeline_id(); + // As a special case, when we have just imported an image into the repository, // instead of writing out a L0 delta layer, we directly write out image layer // files instead. 
This is possible as long as *all* the data imported into the From 5309711691325c274fc34994c240feaa529cbef5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 13 Mar 2024 17:30:29 +0100 Subject: [PATCH 0395/1571] Make tenant_id in TenantLocationConfigRequest optional (#7055) The `tenant_id` in `TenantLocationConfigRequest` in the `location_config` endpoint was only used in the storage controller/attachment service, and there it was only used for assertions and the creation part. --- control_plane/attachment_service/src/http.rs | 6 +++--- control_plane/attachment_service/src/service.rs | 13 +++++++------ libs/pageserver_api/src/models.rs | 2 +- pageserver/client/src/mgmt_api.rs | 2 +- pageserver/src/http/openapi_spec.yml | 7 ++++--- 5 files changed, 16 insertions(+), 14 deletions(-) diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs index 27ba5bdb65..515c287ea9 100644 --- a/control_plane/attachment_service/src/http.rs +++ b/control_plane/attachment_service/src/http.rs @@ -174,14 +174,14 @@ async fn handle_tenant_location_config( service: Arc, mut req: Request, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?; check_permissions(&req, Scope::PageServerApi)?; let config_req = json_request::(&mut req).await?; json_response( StatusCode::OK, service - .tenant_location_config(tenant_id, config_req) + .tenant_location_config(tenant_shard_id, config_req) .await?, ) } @@ -587,7 +587,7 @@ pub fn make_router( .get("/v1/tenant/:tenant_id/config", |r| { tenant_service_handler(r, handle_tenant_config_get) }) - .put("/v1/tenant/:tenant_id/location_config", |r| { + .put("/v1/tenant/:tenant_shard_id/location_config", |r| { tenant_service_handler(r, handle_tenant_location_config) }) .put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| { diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index ea301d0372..1c4ede3d9d 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -1262,6 +1262,7 @@ impl Service { let mut updates = Vec::new(); let mut locked = self.inner.write().unwrap(); let (nodes, tenants, _scheduler) = locked.parts_mut(); + let tenant_shard_id = TenantShardId::unsharded(tenant_id); // Use location config mode as an indicator of policy. let placement_policy = match req.config.mode { @@ -1326,12 +1327,10 @@ impl Service { TenantCreateOrUpdate::Create( // Synthesize a creation request TenantCreateRequest { - new_tenant_id: TenantShardId::unsharded(tenant_id), + new_tenant_id: tenant_shard_id, generation, shard_parameters: ShardParameters { - // Must preserve the incoming shard_count do distinguish unsharded (0) - // from single-sharded (1): this distinction appears in the S3 keys of the tenant. - count: req.tenant_id.shard_count, + count: tenant_shard_id.shard_count, // We only import un-sharded or single-sharded tenants, so stripe // size can be made up arbitrarily here. 
stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE, @@ -1360,15 +1359,17 @@ impl Service { /// - Call with mode Detached to switch to PolicyMode::Detached pub(crate) async fn tenant_location_config( &self, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, req: TenantLocationConfigRequest, ) -> Result { - if !req.tenant_id.is_unsharded() { + if !tenant_shard_id.is_unsharded() { return Err(ApiError::BadRequest(anyhow::anyhow!( "This API is for importing single-sharded or unsharded tenants" ))); } + let tenant_id = tenant_shard_id.tenant_id; + // First check if this is a creation or an update let create_or_update = self.tenant_location_config_prepare(tenant_id, req); diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index a96cc09158..3aa84f8903 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -426,7 +426,7 @@ pub struct StatusResponse { #[derive(Serialize, Deserialize, Debug)] #[serde(deny_unknown_fields)] pub struct TenantLocationConfigRequest { - pub tenant_id: TenantShardId, + pub tenant_id: Option, #[serde(flatten)] pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it } diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 732eb951c9..2f22ebd54d 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -257,7 +257,7 @@ impl Client { lazy: bool, ) -> Result<()> { let req_body = TenantLocationConfigRequest { - tenant_id: tenant_shard_id, + tenant_id: Some(tenant_shard_id), config, }; diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 6a070e2135..4823710fb5 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -567,9 +567,9 @@ paths: application/json: schema: $ref: "#/components/schemas/ServiceUnavailableError" - /v1/tenant/{tenant_id}/location_config: + /v1/tenant/{tenant_shard_id}/location_config: parameters: - - name: tenant_id + - name: tenant_shard_id in: path required: true schema: @@ -1367,10 +1367,11 @@ components: TenantLocationConfigRequest: type: object required: - - tenant_id + - mode properties: tenant_id: type: string + description: Not used, scheduled for removal. mode: type: string enum: ["AttachedSingle", "AttachedMulti", "AttachedStale", "Secondary", "Detached"] From 69338e53e3628cd0133b27d1c079f9deeaaea725 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 13 Mar 2024 18:49:17 +0100 Subject: [PATCH 0396/1571] throttling: fixup interactions with Timeline::get_vectored (#7089) ## Problem Before this PR, `Timeline::get_vectored` would be throttled twice if the sequential option was enabled or if validation was enabled. Also, `pageserver_get_vectored_seconds` included the time spent in the throttle, which turns out to be undesirable for what we use that metric for. ## Summary of changes Double-throttle: * Add `Timeline::get0` method which is unthrottled. * Use that method from within the `Timeline::get_vectored` code path. 
Metric: * return throttled time from `throttle()` method * deduct the value from the observed time * globally rate-limited logging of duration subtraction errors, like in all other places that do the throttled-time deduction from observations --- pageserver/src/bin/pageserver.rs | 2 ++ pageserver/src/metrics.rs | 2 +- pageserver/src/tenant/throttle.rs | 7 ++-- pageserver/src/tenant/timeline.rs | 59 +++++++++++++++++++++++++------ 4 files changed, 57 insertions(+), 13 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 2f172bd384..59750897ff 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -1,3 +1,5 @@ +#![recursion_limit = "300"] + //! Main entry point for the Page Server executable. use std::env::{var, VarError}; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 814b3e1f96..03537ddb05 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -167,7 +167,7 @@ impl GetVectoredLatency { pub(crate) static GET_VECTORED_LATENCY: Lazy = Lazy::new(|| { let inner = register_histogram_vec!( "pageserver_get_vectored_seconds", - "Time spent in get_vectored", + "Time spent in get_vectored, excluding time spent in timeline_get_throttle.", &["task_kind"], CRITICAL_OP_BUCKETS.into(), ) diff --git a/pageserver/src/tenant/throttle.rs b/pageserver/src/tenant/throttle.rs index 280773e9c3..f3f3d5e3ae 100644 --- a/pageserver/src/tenant/throttle.rs +++ b/pageserver/src/tenant/throttle.rs @@ -130,10 +130,10 @@ where self.inner.load().config.steady_rps() } - pub async fn throttle(&self, ctx: &RequestContext, key_count: usize) { + pub async fn throttle(&self, ctx: &RequestContext, key_count: usize) -> Option { let inner = self.inner.load_full(); // clones the `Inner` Arc if !inner.task_kinds.contains(ctx.task_kind()) { - return; + return None; }; let start = std::time::Instant::now(); let mut did_throttle = false; @@ -170,6 +170,9 @@ where }); } } + Some(wait_time) + } else { + None } } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f10df19b7b..d507a19de9 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -634,6 +634,8 @@ impl Timeline { /// If a remote layer file is needed, it is downloaded as part of this /// call. /// + /// This method enforces [`Self::timeline_get_throttle`] internally. + /// /// NOTE: It is considered an error to 'get' a key that doesn't exist. The /// abstraction above this needs to store suitable metadata to track what /// data exists with what keys, in separate metadata entries. If a @@ -644,18 +646,27 @@ impl Timeline { /// # Cancel-Safety /// /// This method is cancellation-safe. + #[inline(always)] pub(crate) async fn get( &self, key: Key, lsn: Lsn, ctx: &RequestContext, + ) -> Result { + self.timeline_get_throttle.throttle(ctx, 1).await; + self.get_impl(key, lsn, ctx).await + } + /// Not subject to [`Self::timeline_get_throttle`]. + async fn get_impl( + &self, + key: Key, + lsn: Lsn, + ctx: &RequestContext, ) -> Result { if !lsn.is_valid() { return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN"))); } - self.timeline_get_throttle.throttle(ctx, 1).await; - // This check is debug-only because of the cost of hashing, and because it's a double-check: we // already checked the key against the shard_identity when looking up the Timeline from // page_service. 
@@ -752,10 +763,6 @@ impl Timeline { return Err(GetVectoredError::Oversized(key_count)); } - self.timeline_get_throttle - .throttle(ctx, key_count as usize) - .await; - for range in &keyspace.ranges { let mut key = range.start; while key != range.end { @@ -772,11 +779,18 @@ impl Timeline { self.conf.get_vectored_impl ); - let _timer = crate::metrics::GET_VECTORED_LATENCY + let start = crate::metrics::GET_VECTORED_LATENCY .for_task_kind(ctx.task_kind()) - .map(|t| t.start_timer()); + .map(|metric| (metric, Instant::now())); - match self.conf.get_vectored_impl { + // start counting after throttle so that throttle time + // is always less than observation time + let throttled = self + .timeline_get_throttle + .throttle(ctx, key_count as usize) + .await; + + let res = match self.conf.get_vectored_impl { GetVectoredImpl::Sequential => { self.get_vectored_sequential_impl(keyspace, lsn, ctx).await } @@ -790,9 +804,33 @@ impl Timeline { vectored_res } + }; + + if let Some((metric, start)) = start { + let elapsed = start.elapsed(); + let ex_throttled = if let Some(throttled) = throttled { + elapsed.checked_sub(throttled) + } else { + Some(elapsed) + }; + + if let Some(ex_throttled) = ex_throttled { + metric.observe(ex_throttled.as_secs_f64()); + } else { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); + let mut rate_limit = LOGGED.lock().unwrap(); + rate_limit.call(|| { + warn!("error deducting time spent throttled; this message is logged at a global rate limit"); + }); + } } + + res } + /// Not subject to [`Self::timeline_get_throttle`]. pub(super) async fn get_vectored_sequential_impl( &self, keyspace: KeySpace, @@ -803,7 +841,7 @@ impl Timeline { for range in keyspace.ranges { let mut key = range.start; while key != range.end { - let block = self.get(key, lsn, ctx).await; + let block = self.get_impl(key, lsn, ctx).await; use PageReconstructError::*; match block { @@ -853,6 +891,7 @@ impl Timeline { Ok(results) } + /// Not subject to [`Self::timeline_get_throttle`]. pub(super) async fn validate_get_vectored_impl( &self, vectored_res: &Result>, GetVectoredError>, From 3bd6551b36be636c7497ee774c65718320093bc3 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 14 Mar 2024 08:20:56 +0000 Subject: [PATCH 0397/1571] proxy http cancellation safety (#7117) ## Problem hyper auto-cancels the request futures on connection close. `sql_over_http::handle` is not 'drop cancel safe', so we need to do some other work to make sure connections are queries in the right way. ## Summary of changes 1. tokio::spawn the request handler to resolve the initial cancel-safety issue 2. share a cancellation token, and cancel it when the request `Service` is dropped. 3. Add a new log span to be able to track the HTTP connection lifecycle. 
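The shape of the fix is roughly: spawn the handler so hyper's connection teardown can no longer drop it mid-await, and propagate cancellation explicitly through a shared `CancellationToken` armed by a `DropGuard`. Below is a minimal sketch of that pattern using `tokio` and `tokio-util`; the handler body and timings are made up for illustration and are not the proxy's actual code.

```rust
use std::time::Duration;
use tokio_util::sync::CancellationToken;

// Hypothetical handler: it only observes cancellation at an explicit,
// safe await point instead of being dropped mid-await by hyper.
async fn handle_request(cancel: CancellationToken) -> &'static str {
    tokio::select! {
        _ = cancel.cancelled() => "cancelled at a safe point",
        _ = tokio::time::sleep(Duration::from_secs(1)) => "completed",
    }
}

#[tokio::main]
async fn main() {
    // One token per connection; dropping the guard (as the per-connection
    // service does when the connection closes) cancels everything derived
    // from the token.
    let conn_token = CancellationToken::new();
    let conn_guard = conn_token.clone().drop_guard();

    // Spawning detaches the handler from hyper's request future, so a closed
    // connection can no longer cancel it at an arbitrary await point.
    let handler = tokio::spawn(handle_request(conn_token.child_token()));

    drop(conn_guard); // simulate the connection closing early
    println!("{}", handler.await.unwrap());
}
```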
--- proxy/src/protocol2.rs | 18 ++++++- proxy/src/serverless.rs | 74 +++++++++++++++++++-------- proxy/src/serverless/sql_over_http.rs | 2 +- proxy/src/serverless/tls_listener.rs | 29 ++++------- test_runner/fixtures/neon_fixtures.py | 2 + test_runner/regress/test_proxy.py | 36 +++++++++++++ 6 files changed, 120 insertions(+), 41 deletions(-) diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index f476cb9b37..700c8c8681 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -341,7 +341,14 @@ impl Accept for ProxyProtocolAccept { cx: &mut Context<'_>, ) -> Poll>> { let conn = ready!(Pin::new(&mut self.incoming).poll_accept(cx)?); - tracing::info!(protocol = self.protocol, "accepted new TCP connection"); + + let conn_id = uuid::Uuid::new_v4(); + let span = tracing::info_span!("http_conn", ?conn_id); + { + let _enter = span.enter(); + tracing::info!("accepted new TCP connection"); + } + let Some(conn) = conn else { return Poll::Ready(None); }; @@ -354,6 +361,7 @@ impl Accept for ProxyProtocolAccept { .with_label_values(&[self.protocol]) .guard(), )), + span, }))) } } @@ -364,6 +372,14 @@ pin_project! { pub inner: T, pub connection_id: Uuid, pub gauge: Mutex>, + pub span: tracing::Span, + } + + impl PinnedDrop for WithConnectionGuard { + fn drop(this: Pin<&mut Self>) { + let _enter = this.span.enter(); + tracing::info!("HTTP connection closed") + } } } diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index 68f68eaba1..be9f90acde 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -19,6 +19,7 @@ use rand::SeedableRng; pub use reqwest_middleware::{ClientWithMiddleware, Error}; pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; use tokio_util::task::TaskTracker; +use tracing::instrument::Instrumented; use crate::context::RequestMonitoring; use crate::protocol2::{ProxyProtocolAccept, WithClientIp, WithConnectionGuard}; @@ -30,13 +31,12 @@ use hyper::{ Body, Method, Request, Response, }; -use std::convert::Infallible; use std::net::IpAddr; use std::sync::Arc; use std::task::Poll; use tls_listener::TlsListener; use tokio::net::TcpListener; -use tokio_util::sync::CancellationToken; +use tokio_util::sync::{CancellationToken, DropGuard}; use tracing::{error, info, warn, Instrument}; use utils::http::{error::ApiError, json::json_response}; @@ -100,12 +100,7 @@ pub async fn task_main( let ws_connections = tokio_util::task::task_tracker::TaskTracker::new(); ws_connections.close(); // allows `ws_connections.wait to complete` - let tls_listener = TlsListener::new( - tls_acceptor, - addr_incoming, - "http", - config.handshake_timeout, - ); + let tls_listener = TlsListener::new(tls_acceptor, addr_incoming, config.handshake_timeout); let make_svc = hyper::service::make_service_fn( |stream: &tokio_rustls::server::TlsStream< @@ -121,6 +116,11 @@ pub async fn task_main( .take() .expect("gauge should be set on connection start"); + // Cancel all current inflight HTTP requests if the HTTP connection is closed. 
+ let http_cancellation_token = CancellationToken::new(); + let cancel_connection = http_cancellation_token.clone().drop_guard(); + + let span = conn.span.clone(); let client_addr = conn.inner.client_addr(); let remote_addr = conn.inner.inner.remote_addr(); let backend = backend.clone(); @@ -136,27 +136,43 @@ pub async fn task_main( Ok(MetricService::new( hyper::service::service_fn(move |req: Request| { let backend = backend.clone(); - let ws_connections = ws_connections.clone(); + let ws_connections2 = ws_connections.clone(); let endpoint_rate_limiter = endpoint_rate_limiter.clone(); let cancellation_handler = cancellation_handler.clone(); + let http_cancellation_token = http_cancellation_token.child_token(); - async move { - Ok::<_, Infallible>( - request_handler( + // `request_handler` is not cancel safe. It expects to be cancelled only at specific times. + // By spawning the future, we ensure it never gets cancelled until it decides to. + ws_connections.spawn( + async move { + // Cancel the current inflight HTTP request if the requets stream is closed. + // This is slightly different to `_cancel_connection` in that + // h2 can cancel individual requests with a `RST_STREAM`. + let _cancel_session = http_cancellation_token.clone().drop_guard(); + + let res = request_handler( req, config, backend, - ws_connections, + ws_connections2, cancellation_handler, peer_addr.ip(), endpoint_rate_limiter, + http_cancellation_token, ) .await - .map_or_else(|e| e.into_response(), |r| r), - ) - } + .map_or_else(|e| e.into_response(), |r| r); + + _cancel_session.disarm(); + + res + } + .in_current_span(), + ) }), gauge, + cancel_connection, + span, )) } }, @@ -176,11 +192,23 @@ pub async fn task_main( struct MetricService { inner: S, _gauge: IntCounterPairGuard, + _cancel: DropGuard, + span: tracing::Span, } impl MetricService { - fn new(inner: S, _gauge: IntCounterPairGuard) -> MetricService { - MetricService { inner, _gauge } + fn new( + inner: S, + _gauge: IntCounterPairGuard, + _cancel: DropGuard, + span: tracing::Span, + ) -> MetricService { + MetricService { + inner, + _gauge, + _cancel, + span, + } } } @@ -190,14 +218,16 @@ where { type Response = S::Response; type Error = S::Error; - type Future = S::Future; + type Future = Instrumented; fn poll_ready(&mut self, cx: &mut std::task::Context<'_>) -> Poll> { self.inner.poll_ready(cx) } fn call(&mut self, req: Request) -> Self::Future { - self.inner.call(req) + self.span + .in_scope(|| self.inner.call(req)) + .instrument(self.span.clone()) } } @@ -210,6 +240,8 @@ async fn request_handler( cancellation_handler: Arc, peer_addr: IpAddr, endpoint_rate_limiter: Arc, + // used to cancel in-flight HTTP requests. 
not used to cancel websockets + http_cancellation_token: CancellationToken, ) -> Result, ApiError> { let session_id = uuid::Uuid::new_v4(); @@ -253,7 +285,7 @@ async fn request_handler( let ctx = RequestMonitoring::new(session_id, peer_addr, "http", &config.region); let span = ctx.span.clone(); - sql_over_http::handle(config, ctx, request, backend) + sql_over_http::handle(config, ctx, request, backend, http_cancellation_token) .instrument(span) .await } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS { diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 86c278030f..f675375ff1 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -217,8 +217,8 @@ pub async fn handle( mut ctx: RequestMonitoring, request: Request, backend: Arc, + cancel: CancellationToken, ) -> Result, ApiError> { - let cancel = CancellationToken::new(); let cancel2 = cancel.clone(); let handle = tokio::spawn(async move { time::sleep(config.http_config.request_timeout).await; diff --git a/proxy/src/serverless/tls_listener.rs b/proxy/src/serverless/tls_listener.rs index cce02e3850..33f194dd59 100644 --- a/proxy/src/serverless/tls_listener.rs +++ b/proxy/src/serverless/tls_listener.rs @@ -13,7 +13,7 @@ use tokio::{ time::timeout, }; use tokio_rustls::{server::TlsStream, TlsAcceptor}; -use tracing::{info, warn}; +use tracing::{info, warn, Instrument}; use crate::{ metrics::TLS_HANDSHAKE_FAILURES, @@ -29,24 +29,17 @@ pin_project! { tls: TlsAcceptor, waiting: JoinSet>>, timeout: Duration, - protocol: &'static str, } } impl TlsListener { /// Create a `TlsListener` with default options. - pub(crate) fn new( - tls: TlsAcceptor, - listener: A, - protocol: &'static str, - timeout: Duration, - ) -> Self { + pub(crate) fn new(tls: TlsAcceptor, listener: A, timeout: Duration) -> Self { TlsListener { listener, tls, waiting: JoinSet::new(), timeout, - protocol, } } } @@ -73,7 +66,7 @@ where Poll::Ready(Some(Ok(mut conn))) => { let t = *this.timeout; let tls = this.tls.clone(); - let protocol = *this.protocol; + let span = conn.span.clone(); this.waiting.spawn(async move { let peer_addr = match conn.inner.wait_for_addr().await { Ok(Some(addr)) => addr, @@ -86,21 +79,24 @@ where let accept = tls.accept(conn); match timeout(t, accept).await { - Ok(Ok(conn)) => Some(conn), + Ok(Ok(conn)) => { + info!(%peer_addr, "accepted new TLS connection"); + Some(conn) + }, // The handshake failed, try getting another connection from the queue Ok(Err(e)) => { TLS_HANDSHAKE_FAILURES.inc(); - warn!(%peer_addr, protocol, "failed to accept TLS connection: {e:?}"); + warn!(%peer_addr, "failed to accept TLS connection: {e:?}"); None } // The handshake timed out, try getting another connection from the queue Err(_) => { TLS_HANDSHAKE_FAILURES.inc(); - warn!(%peer_addr, protocol, "failed to accept TLS connection: timeout"); + warn!(%peer_addr, "failed to accept TLS connection: timeout"); None } } - }); + }.instrument(span)); } Poll::Ready(Some(Err(e))) => { tracing::error!("error accepting TCP connection: {e}"); @@ -112,10 +108,7 @@ where loop { return match this.waiting.poll_join_next(cx) { - Poll::Ready(Some(Ok(Some(conn)))) => { - info!(protocol = this.protocol, "accepted new TLS connection"); - Poll::Ready(Some(Ok(conn))) - } + Poll::Ready(Some(Ok(Some(conn)))) => Poll::Ready(Some(Ok(conn))), // The handshake failed to complete, try getting another connection from the queue Poll::Ready(Some(Ok(None))) => continue, // The handshake panicked or 
was cancelled. ignore and get another connection diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b3f460c7fe..5b76e808d5 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2944,6 +2944,7 @@ class NeonProxy(PgProtocol): user = quote(kwargs["user"]) password = quote(kwargs["password"]) expected_code = kwargs.get("expected_code") + timeout = kwargs.get("timeout") log.info(f"Executing http query: {query}") @@ -2957,6 +2958,7 @@ class NeonProxy(PgProtocol): "Neon-Pool-Opt-In": "true", }, verify=str(self.test_output_dir / "proxy.crt"), + timeout=timeout, ) if expected_code is not None: diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index 078589d8eb..3e986a8f7b 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -596,3 +596,39 @@ def test_sql_over_http_timeout_cancel(static_proxy: NeonProxy): assert ( "duplicate key value violates unique constraint" in res["message"] ), "HTTP query should conflict" + + +def test_sql_over_http_connection_cancel(static_proxy: NeonProxy): + static_proxy.safe_psql("create role http with login password 'http' superuser") + + static_proxy.safe_psql("create table test_table ( id int primary key )") + + # insert into a table, with a unique constraint, after sleeping for n seconds + query = "WITH temp AS ( \ + SELECT pg_sleep($1) as sleep, $2::int as id \ + ) INSERT INTO test_table (id) SELECT id FROM temp" + + try: + # The request should complete before the proxy HTTP timeout triggers. + # Timeout and cancel the request on the client side before the query completes. + static_proxy.http_query( + query, + [static_proxy.http_timeout_seconds - 1, 1], + user="http", + password="http", + timeout=2, + ) + except requests.exceptions.ReadTimeout: + pass + + # wait until the query _would_ have been complete + time.sleep(static_proxy.http_timeout_seconds) + + res = static_proxy.http_query(query, [1, 1], user="http", password="http", expected_code=200) + assert res["command"] == "INSERT", "HTTP query should insert" + assert res["rowCount"] == 1, "HTTP query should insert" + + res = static_proxy.http_query(query, [0, 1], user="http", password="http", expected_code=400) + assert ( + "duplicate key value violates unique constraint" in res["message"] + ), "HTTP query should conflict" From 44f42627dd32f29a3be9cfcd2d8c487c89642dc8 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 14 Mar 2024 09:11:57 +0000 Subject: [PATCH 0398/1571] pageserver/controller: error handling for shard splitting (#7074) ## Problem Shard splits worked, but weren't safe against failures (e.g. node crash during split) yet. Related: #6676 ## Summary of changes - Introduce async rwlocks at the scope of Tenant and Node: - exclusive tenant lock is used to protect splits - exclusive node lock is used to protect new reconciliation process that happens when setting node active - exclusive locks used in both cases when doing persistent updates (e.g. node scheduling conf) where the update to DB & in-memory state needs to be atomic. - Add failpoints to shard splitting in control plane and pageserver code. - Implement error handling in control plane for shard splits: this detaches child chards and ensures parent shards are re-attached. 
- Crash-safety for storage controller restarts requires little effort: we already reconcile with nodes over a storage controller restart, so as long as we reset any incomplete splits in the DB on restart (added in this PR), things are implicitly cleaned up. - Implement reconciliation with offline nodes before they transition to active: - (in this context reconciliation means something like startup_reconcile, not literally the Reconciler) - This covers cases where split abort cannot reach a node to clean it up: the cleanup will eventually happen when the node is marked active, as part of reconciliation. - This also covers the case where a node was unavailable when the storage controller started, but becomes available later: previously this allowed it to skip the startup reconcile. - Storage controller now terminates on panics. We only use panics for true "should never happen" assertions, and these cases can leave us in an un-usable state if we keep running (e.g. panicking in a shard split). In the unlikely event that we get into a crashloop as a result, we'll rely on kubernetes to back us off. - Add `test_sharding_split_failures` which exercises a variety of failure cases during shard split. --- Cargo.lock | 2 + control_plane/attachment_service/Cargo.toml | 2 + control_plane/attachment_service/src/http.rs | 5 + .../attachment_service/src/id_lock_map.rs | 54 ++ control_plane/attachment_service/src/lib.rs | 1 + control_plane/attachment_service/src/main.rs | 6 + control_plane/attachment_service/src/node.rs | 35 +- .../attachment_service/src/persistence.rs | 78 ++ .../attachment_service/src/reconciler.rs | 56 +- .../attachment_service/src/service.rs | 787 +++++++++++++++--- .../attachment_service/src/tenant_state.rs | 7 +- pageserver/src/http/routes.rs | 10 + pageserver/src/tenant/mgr.rs | 56 +- test_runner/conftest.py | 1 + test_runner/fixtures/compute_reconfigure.py | 62 ++ test_runner/fixtures/neon_fixtures.py | 17 + test_runner/fixtures/workload.py | 76 +- test_runner/regress/test_sharding.py | 340 +++++++- 18 files changed, 1445 insertions(+), 150 deletions(-) create mode 100644 control_plane/attachment_service/src/id_lock_map.rs create mode 100644 test_runner/fixtures/compute_reconfigure.py diff --git a/Cargo.lock b/Cargo.lock index 7fd9053f62..45397eb4a2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -282,8 +282,10 @@ dependencies = [ "control_plane", "diesel", "diesel_migrations", + "fail", "futures", "git-version", + "hex", "humantime", "hyper", "metrics", diff --git a/control_plane/attachment_service/Cargo.toml b/control_plane/attachment_service/Cargo.toml index a5fad7216c..f78f56c480 100644 --- a/control_plane/attachment_service/Cargo.toml +++ b/control_plane/attachment_service/Cargo.toml @@ -19,8 +19,10 @@ aws-config.workspace = true aws-sdk-secretsmanager.workspace = true camino.workspace = true clap.workspace = true +fail.workspace = true futures.workspace = true git-version.workspace = true +hex.workspace = true hyper.workspace = true humantime.workspace = true once_cell.workspace = true diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs index 515c287ea9..d26652cc94 100644 --- a/control_plane/attachment_service/src/http.rs +++ b/control_plane/attachment_service/src/http.rs @@ -10,7 +10,9 @@ use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api; use std::sync::Arc; use std::time::{Duration, Instant}; +use tokio_util::sync::CancellationToken; use utils::auth::{Scope, SwappableJwtAuth}; +use 
utils::failpoint_support::failpoints_handler; use utils::http::endpoint::{auth_middleware, check_permission_with, request_span}; use utils::http::request::{must_get_query_param, parse_request_param}; use utils::id::{TenantId, TimelineId}; @@ -554,6 +556,9 @@ pub fn make_router( .post("/debug/v1/consistency_check", |r| { request_span(r, handle_consistency_check) }) + .put("/debug/v1/failpoints", |r| { + request_span(r, |r| failpoints_handler(r, CancellationToken::new())) + }) .get("/control/v1/tenant/:tenant_id/locate", |r| { tenant_service_handler(r, handle_tenant_locate) }) diff --git a/control_plane/attachment_service/src/id_lock_map.rs b/control_plane/attachment_service/src/id_lock_map.rs new file mode 100644 index 0000000000..b03700b50c --- /dev/null +++ b/control_plane/attachment_service/src/id_lock_map.rs @@ -0,0 +1,54 @@ +use std::{collections::HashMap, sync::Arc}; + +/// A map of locks covering some arbitrary identifiers. Useful if you have a collection of objects but don't +/// want to embed a lock in each one, or if your locking granularity is different to your object granularity. +/// For example, used in the storage controller where the objects are tenant shards, but sometimes locking +/// is needed at a tenant-wide granularity. +pub(crate) struct IdLockMap +where + T: Eq + PartialEq + std::hash::Hash, +{ + /// A synchronous lock for getting/setting the async locks that our callers will wait on. + entities: std::sync::Mutex>>>, +} + +impl IdLockMap +where + T: Eq + PartialEq + std::hash::Hash, +{ + pub(crate) fn shared( + &self, + key: T, + ) -> impl std::future::Future> { + let mut locked = self.entities.lock().unwrap(); + let entry = locked.entry(key).or_default(); + entry.clone().read_owned() + } + + pub(crate) fn exclusive( + &self, + key: T, + ) -> impl std::future::Future> { + let mut locked = self.entities.lock().unwrap(); + let entry = locked.entry(key).or_default(); + entry.clone().write_owned() + } + + /// Rather than building a lock guard that re-takes the [`Self::entities`] lock, we just do + /// periodic housekeeping to avoid the map growing indefinitely + pub(crate) fn housekeeping(&self) { + let mut locked = self.entities.lock().unwrap(); + locked.retain(|_k, lock| lock.try_write().is_err()) + } +} + +impl Default for IdLockMap +where + T: Eq + PartialEq + std::hash::Hash, +{ + fn default() -> Self { + Self { + entities: std::sync::Mutex::new(HashMap::new()), + } + } +} diff --git a/control_plane/attachment_service/src/lib.rs b/control_plane/attachment_service/src/lib.rs index 796b465c10..a017bc1ecc 100644 --- a/control_plane/attachment_service/src/lib.rs +++ b/control_plane/attachment_service/src/lib.rs @@ -4,6 +4,7 @@ use utils::seqwait::MonotonicCounter; mod auth; mod compute_hook; pub mod http; +mod id_lock_map; pub mod metrics; mod node; pub mod persistence; diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs index 333c3911e3..fb7b363c39 100644 --- a/control_plane/attachment_service/src/main.rs +++ b/control_plane/attachment_service/src/main.rs @@ -206,6 +206,12 @@ async fn migration_run(database_url: &str) -> anyhow::Result<()> { } fn main() -> anyhow::Result<()> { + let default_panic = std::panic::take_hook(); + std::panic::set_hook(Box::new(move |info| { + default_panic(info); + std::process::exit(1); + })); + tokio::runtime::Builder::new_current_thread() // We use spawn_blocking for database operations, so require approximately // as many blocking threads as we will open database connections. 
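As an illustrative aside (not part of the patch itself): a minimal, self-contained sketch of how the per-tenant locks in `id_lock_map.rs` above are meant to be used. `exclusive()` serializes rare persistent-state mutations such as tenant create/delete/split, while `shared()` lets operations like timeline CRUD proceed concurrently whenever no split holds the write lock. The simplified `TenantId` alias, the local `IdLockMap` copy and the `main` harness below are stand-ins for this sketch only.

use std::{
    collections::HashMap,
    sync::{Arc, Mutex},
};
use tokio::sync::RwLock;

// Stand-in for utils::id::TenantId, for this sketch only.
type TenantId = u64;

// Mirrors the IdLockMap above: a sync mutex guarding per-id async rwlocks.
#[derive(Default)]
struct IdLockMap {
    entities: Mutex<HashMap<TenantId, Arc<RwLock<()>>>>,
}

impl IdLockMap {
    fn entry(&self, key: TenantId) -> Arc<RwLock<()>> {
        self.entities.lock().unwrap().entry(key).or_default().clone()
    }
}

#[tokio::main]
async fn main() {
    let locks = Arc::new(IdLockMap::default());

    // A shard split holds the tenant's exclusive lock for its whole duration...
    let split_guard = locks.entry(42).write_owned().await;

    // ...so a concurrent timeline-CRUD style operation taking the shared lock
    // must wait until the split guard is dropped.
    let locks_clone = locks.clone();
    let timeline_op = tokio::spawn(async move {
        let _shared = locks_clone.entry(42).read_owned().await;
        println!("timeline operation may proceed");
    });

    drop(split_guard);
    timeline_op.await.unwrap();
}
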
diff --git a/control_plane/attachment_service/src/node.rs b/control_plane/attachment_service/src/node.rs index 27b03608fa..dda8a155c6 100644 --- a/control_plane/attachment_service/src/node.rs +++ b/control_plane/attachment_service/src/node.rs @@ -83,29 +83,38 @@ impl Node { } } - pub(crate) fn set_availability( - &mut self, - availability: NodeAvailability, - ) -> AvailabilityTransition { - use NodeAvailability::*; - let transition = match (self.availability, availability) { - (Offline, Active) => { + pub(crate) fn set_availability(&mut self, availability: NodeAvailability) { + match self.get_availability_transition(availability) { + AvailabilityTransition::ToActive => { // Give the node a new cancellation token, effectively resetting it to un-cancelled. Any // users of previously-cloned copies of the node will still see the old cancellation // state. For example, Reconcilers in flight will have to complete and be spawned // again to realize that the node has become available. self.cancel = CancellationToken::new(); - AvailabilityTransition::ToActive } - (Active, Offline) => { + AvailabilityTransition::ToOffline => { // Fire the node's cancellation token to cancel any in-flight API requests to it self.cancel.cancel(); - AvailabilityTransition::ToOffline } - _ => AvailabilityTransition::Unchanged, - }; + AvailabilityTransition::Unchanged => {} + } self.availability = availability; - transition + } + + /// Without modifying the availability of the node, convert the intended availability + /// into a description of the transition. + pub(crate) fn get_availability_transition( + &self, + availability: NodeAvailability, + ) -> AvailabilityTransition { + use AvailabilityTransition::*; + use NodeAvailability::*; + + match (self.availability, availability) { + (Offline, Active) => ToActive, + (Active, Offline) => ToOffline, + _ => Unchanged, + } } /// Whether we may send API requests to this node. diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs index aa08945834..3602cf8b1f 100644 --- a/control_plane/attachment_service/src/persistence.rs +++ b/control_plane/attachment_service/src/persistence.rs @@ -11,6 +11,9 @@ use diesel::prelude::*; use diesel::Connection; use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy}; use pageserver_api::models::TenantConfig; +use pageserver_api::shard::ShardConfigError; +use pageserver_api::shard::ShardIdentity; +use pageserver_api::shard::ShardStripeSize; use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId}; use serde::{Deserialize, Serialize}; use utils::generation::Generation; @@ -72,6 +75,14 @@ pub(crate) enum DatabaseError { Logical(String), } +#[must_use] +pub(crate) enum AbortShardSplitStatus { + /// We aborted the split in the database by reverting to the parent shards + Aborted, + /// The split had already been persisted. + Complete, +} + pub(crate) type DatabaseResult = Result; impl Persistence { @@ -570,6 +581,51 @@ impl Persistence { }) .await } + + /// Used when the remote part of a shard split failed: we will revert the database state to have only + /// the parent shards, with SplitState::Idle. 
+ pub(crate) async fn abort_shard_split( + &self, + split_tenant_id: TenantId, + new_shard_count: ShardCount, + ) -> DatabaseResult { + use crate::schema::tenant_shards::dsl::*; + self.with_conn(move |conn| -> DatabaseResult { + let aborted = conn.transaction(|conn| -> DatabaseResult { + // Clear the splitting state on parent shards + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .filter(shard_count.ne(new_shard_count.literal() as i32)) + .set((splitting.eq(0),)) + .execute(conn)?; + + // Parent shards are already gone: we cannot abort. + if updated == 0 { + return Ok(AbortShardSplitStatus::Complete); + } + + // Sanity check: if parent shards were present, their cardinality should + // be less than the number of child shards. + if updated >= new_shard_count.count() as usize { + return Err(DatabaseError::Logical(format!( + "Unexpected parent shard count {updated} while aborting split to \ + count {new_shard_count:?} on tenant {split_tenant_id}" + ))); + } + + // Erase child shards + diesel::delete(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .filter(shard_count.eq(new_shard_count.literal() as i32)) + .execute(conn)?; + + Ok(AbortShardSplitStatus::Aborted) + })?; + + Ok(aborted) + }) + .await + } } /// Parts of [`crate::tenant_state::TenantState`] that are stored durably @@ -604,6 +660,28 @@ pub(crate) struct TenantShardPersistence { pub(crate) config: String, } +impl TenantShardPersistence { + pub(crate) fn get_shard_identity(&self) -> Result { + if self.shard_count == 0 { + Ok(ShardIdentity::unsharded()) + } else { + Ok(ShardIdentity::new( + ShardNumber(self.shard_number as u8), + ShardCount::new(self.shard_count as u8), + ShardStripeSize(self.shard_stripe_size as u32), + )?) + } + } + + pub(crate) fn get_tenant_shard_id(&self) -> Result { + Ok(TenantShardId { + tenant_id: TenantId::from_str(self.tenant_id.as_str())?, + shard_number: ShardNumber(self.shard_number as u8), + shard_count: ShardCount::new(self.shard_count as u8), + }) + } +} + /// Parts of [`crate::node::Node`] that are stored durably #[derive(Serialize, Deserialize, Queryable, Selectable, Insertable, Eq, PartialEq)] #[diesel(table_name = crate::schema::nodes)] diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs index 603da9bf02..7f68a65c15 100644 --- a/control_plane/attachment_service/src/reconciler.rs +++ b/control_plane/attachment_service/src/reconciler.rs @@ -1,5 +1,6 @@ use crate::persistence::Persistence; use crate::service; +use hyper::StatusCode; use pageserver_api::models::{ LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, }; @@ -18,6 +19,8 @@ use crate::compute_hook::{ComputeHook, NotifyError}; use crate::node::Node; use crate::tenant_state::{IntentState, ObservedState, ObservedStateLocation}; +const DEFAULT_HEATMAP_PERIOD: &str = "60s"; + /// Object with the lifetime of the background reconcile task that is created /// for tenants which have a difference between their intent and observed states. 
pub(super) struct Reconciler { @@ -485,17 +488,29 @@ impl Reconciler { ) .await { - Some(Ok(observed)) => observed, + Some(Ok(observed)) => Some(observed), + Some(Err(mgmt_api::Error::ApiError(status, _msg))) + if status == StatusCode::NOT_FOUND => + { + None + } Some(Err(e)) => return Err(e.into()), None => return Err(ReconcileError::Cancel), }; tracing::info!("Scanned location configuration on {attached_node}: {observed_conf:?}"); - self.observed.locations.insert( - attached_node.get_id(), - ObservedStateLocation { - conf: observed_conf, - }, - ); + match observed_conf { + Some(conf) => { + // Pageserver returned a state: update it in observed. This may still be an indeterminate (None) state, + // if internally the pageserver's TenantSlot was being mutated (e.g. some long running API call is still running) + self.observed + .locations + .insert(attached_node.get_id(), ObservedStateLocation { conf }); + } + None => { + // Pageserver returned 404: we have confirmation that there is no state for this shard on that pageserver. + self.observed.locations.remove(&attached_node.get_id()); + } + } } Ok(()) @@ -525,7 +540,12 @@ impl Reconciler { ))); }; - let mut wanted_conf = attached_location_conf(generation, &self.shard, &self.config); + let mut wanted_conf = attached_location_conf( + generation, + &self.shard, + &self.config, + !self.intent.secondary.is_empty(), + ); match self.observed.locations.get(&node.get_id()) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => { // Nothing to do @@ -662,10 +682,26 @@ impl Reconciler { } } +/// We tweak the externally-set TenantConfig while configuring +/// locations, using our awareness of whether secondary locations +/// are in use to automatically enable/disable heatmap uploads. +fn ha_aware_config(config: &TenantConfig, has_secondaries: bool) -> TenantConfig { + let mut config = config.clone(); + if has_secondaries { + if config.heatmap_period.is_none() { + config.heatmap_period = Some(DEFAULT_HEATMAP_PERIOD.to_string()); + } + } else { + config.heatmap_period = None; + } + config +} + pub(crate) fn attached_location_conf( generation: Generation, shard: &ShardIdentity, config: &TenantConfig, + has_secondaries: bool, ) -> LocationConfig { LocationConfig { mode: LocationConfigMode::AttachedSingle, @@ -674,7 +710,7 @@ pub(crate) fn attached_location_conf( shard_number: shard.number.0, shard_count: shard.count.literal(), shard_stripe_size: shard.stripe_size.0, - tenant_conf: config.clone(), + tenant_conf: ha_aware_config(config, has_secondaries), } } @@ -689,6 +725,6 @@ pub(crate) fn secondary_location_conf( shard_number: shard.number.0, shard_count: shard.count.literal(), shard_stripe_size: shard.stripe_size.0, - tenant_conf: config.clone(), + tenant_conf: ha_aware_config(config, true), } } diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index 1c4ede3d9d..1b85081666 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -7,6 +7,7 @@ use std::{ time::{Duration, Instant}, }; +use crate::{id_lock_map::IdLockMap, persistence::AbortShardSplitStatus}; use anyhow::Context; use control_plane::storage_controller::{ AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse, @@ -36,6 +37,7 @@ use pageserver_api::{ }, }; use pageserver_client::mgmt_api; +use tokio::sync::OwnedRwLockWriteGuard; use tokio_util::sync::CancellationToken; use tracing::instrument; use utils::{ @@ -147,6 +149,18 @@ pub struct Service { 
compute_hook: Arc, result_tx: tokio::sync::mpsc::UnboundedSender, + // Channel for background cleanup from failed operations that require cleanup, such as shard split + abort_tx: tokio::sync::mpsc::UnboundedSender, + + // Locking on a tenant granularity (covers all shards in the tenant): + // - Take exclusively for rare operations that mutate the tenant's persistent state (e.g. create/delete/split) + // - Take in shared mode for operations that need the set of shards to stay the same to complete reliably (e.g. timeline CRUD) + tenant_op_locks: IdLockMap, + + // Locking for node-mutating operations: take exclusively for operations that modify the node's persistent state, or + // that transition it to/from Active. + node_op_locks: IdLockMap, + // Process shutdown will fire this token cancel: CancellationToken, @@ -174,6 +188,27 @@ enum TenantCreateOrUpdate { Update(Vec), } +/// When we tenant shard split operation fails, we may not be able to clean up immediately, because nodes +/// might not be available. We therefore use a queue of abort operations processed in the background. +struct TenantShardSplitAbort { + tenant_id: TenantId, + /// The target values from the request that failed + new_shard_count: ShardCount, + new_stripe_size: Option, + /// Until this abort op is complete, no other operations may be done on the tenant + _tenant_lock: tokio::sync::OwnedRwLockWriteGuard<()>, +} + +#[derive(thiserror::Error, Debug)] +enum TenantShardSplitAbortError { + #[error(transparent)] + Database(#[from] DatabaseError), + #[error(transparent)] + Remote(#[from] mgmt_api::Error), + #[error("Unavailable")] + Unavailable, +} + struct ShardUpdate { tenant_shard_id: TenantShardId, placement_policy: PlacementPolicy, @@ -627,8 +662,52 @@ impl Service { } } + async fn process_aborts( + &self, + mut abort_rx: tokio::sync::mpsc::UnboundedReceiver, + ) { + loop { + // Wait for the next result, or for cancellation + let op = tokio::select! { + r = abort_rx.recv() => { + match r { + Some(op) => {op}, + None => {break;} + } + } + _ = self.cancel.cancelled() => { + break; + } + }; + + // Retry until shutdown: we must keep this request object alive until it is properly + // processed, as it holds a lock guard that prevents other operations trying to do things + // to the tenant while it is in a weird part-split state. + while !self.cancel.is_cancelled() { + match self.abort_tenant_shard_split(&op).await { + Ok(_) => break, + Err(e) => { + tracing::warn!( + "Failed to abort shard split on {}, will retry: {e}", + op.tenant_id + ); + + // If a node is unavailable, we hope that it has been properly marked Offline + // when we retry, so that the abort op will succeed. If the abort op is failing + // for some other reason, we will keep retrying forever, or until a human notices + // and does something about it (either fixing a pageserver or restarting the controller). 
+ tokio::time::timeout(Duration::from_secs(5), self.cancel.cancelled()) + .await + .ok(); + } + } + } + } + } + pub async fn spawn(config: Config, persistence: Arc) -> anyhow::Result> { let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel(); + let (abort_tx, abort_rx) = tokio::sync::mpsc::unbounded_channel(); tracing::info!("Loading nodes from database..."); let nodes = persistence @@ -641,12 +720,62 @@ impl Service { tracing::info!("Loaded {} nodes from database.", nodes.len()); tracing::info!("Loading shards from database..."); - let tenant_shard_persistence = persistence.list_tenant_shards().await?; + let mut tenant_shard_persistence = persistence.list_tenant_shards().await?; tracing::info!( "Loaded {} shards from database.", tenant_shard_persistence.len() ); + // If any shard splits were in progress, reset the database state to abort them + let mut tenant_shard_count_min_max: HashMap = + HashMap::new(); + for tsp in &mut tenant_shard_persistence { + let shard = tsp.get_shard_identity()?; + let tenant_shard_id = tsp.get_tenant_shard_id()?; + let entry = tenant_shard_count_min_max + .entry(tenant_shard_id.tenant_id) + .or_insert_with(|| (shard.count, shard.count)); + entry.0 = std::cmp::min(entry.0, shard.count); + entry.1 = std::cmp::max(entry.1, shard.count); + } + + for (tenant_id, (count_min, count_max)) in tenant_shard_count_min_max { + if count_min != count_max { + // Aborting the split in the database and dropping the child shards is sufficient: the reconciliation in + // [`Self::startup_reconcile`] will implicitly drop the child shards on remote pageservers, or they'll + // be dropped later in [`Self::node_activate_reconcile`] if it isn't available right now. + tracing::info!("Aborting shard split {tenant_id} {count_min:?} -> {count_max:?}"); + let abort_status = persistence.abort_shard_split(tenant_id, count_max).await?; + + // We may never see the Complete status here: if the split was complete, we wouldn't have + // identified this tenant has having mismatching min/max counts. + assert!(matches!(abort_status, AbortShardSplitStatus::Aborted)); + + // Clear the splitting status in-memory, to reflect that we just aborted in the database + tenant_shard_persistence.iter_mut().for_each(|tsp| { + // Set idle split state on those shards that we will retain. + let tsp_tenant_id = TenantId::from_str(tsp.tenant_id.as_str()).unwrap(); + if tsp_tenant_id == tenant_id + && tsp.get_shard_identity().unwrap().count == count_min + { + tsp.splitting = SplitState::Idle; + } else if tsp_tenant_id == tenant_id { + // Leave the splitting state on the child shards: this will be used next to + // drop them. + tracing::info!( + "Shard {tsp_tenant_id} will be dropped after shard split abort", + ); + } + }); + + // Drop shards for this tenant which we didn't just mark idle (i.e. 
child shards of the aborted split) + tenant_shard_persistence.retain(|tsp| { + TenantId::from_str(tsp.tenant_id.as_str()).unwrap() != tenant_id + || tsp.splitting == SplitState::Idle + }); + } + } + let mut tenants = BTreeMap::new(); let mut scheduler = Scheduler::new(nodes.values()); @@ -676,21 +805,8 @@ impl Service { } } for tsp in tenant_shard_persistence { - let tenant_shard_id = TenantShardId { - tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?, - shard_number: ShardNumber(tsp.shard_number as u8), - shard_count: ShardCount::new(tsp.shard_count as u8), - }; - let shard_identity = if tsp.shard_count == 0 { - ShardIdentity::unsharded() - } else { - ShardIdentity::new( - ShardNumber(tsp.shard_number as u8), - ShardCount::new(tsp.shard_count as u8), - ShardStripeSize(tsp.shard_stripe_size as u32), - )? - }; - + let tenant_shard_id = tsp.get_tenant_shard_id()?; + let shard_identity = tsp.get_shard_identity()?; // We will populate intent properly later in [`Self::startup_reconcile`], initially populate // it with what we can infer: the node for which a generation was most recently issued. let mut intent = IntentState::new(); @@ -728,9 +844,12 @@ impl Service { persistence, compute_hook: Arc::new(ComputeHook::new(config)), result_tx, + abort_tx, startup_complete: startup_complete.clone(), cancel: CancellationToken::new(), gate: Gate::default(), + tenant_op_locks: Default::default(), + node_op_locks: Default::default(), }); let result_task_this = this.clone(); @@ -741,6 +860,33 @@ impl Service { } }); + tokio::task::spawn({ + let this = this.clone(); + async move { + // Block shutdown until we're done (we must respect self.cancel) + if let Ok(_gate) = this.gate.enter() { + this.process_aborts(abort_rx).await + } + } + }); + + tokio::task::spawn({ + let this = this.clone(); + async move { + if let Ok(_gate) = this.gate.enter() { + loop { + tokio::select! { + _ = this.cancel.cancelled() => { + break; + }, + _ = tokio::time::sleep(Duration::from_secs(60)) => {} + }; + this.tenant_op_locks.housekeeping(); + } + } + } + }); + tokio::task::spawn({ let this = this.clone(); // We will block the [`Service::startup_complete`] barrier until [`Self::startup_reconcile`] @@ -889,6 +1035,7 @@ impl Service { tenant_state.generation.unwrap(), &tenant_state.shard, &tenant_state.config, + false, )), }, )]); @@ -918,6 +1065,118 @@ impl Service { } } + // When the availability state of a node transitions to active, we must do a full reconciliation + // of LocationConfigs on that node. This is because while a node was offline: + // - we might have proceeded through startup_reconcile without checking for extraneous LocationConfigs on this node + // - aborting a tenant shard split might have left rogue child shards behind on this node. + // + // This function must complete _before_ setting a `Node` to Active: once it is set to Active, other + // Reconcilers might communicate with the node, and these must not overlap with the work we do in + // this function. + // + // The reconciliation logic in here is very similar to what [`Self::startup_reconcile`] does, but + // for written for a single node rather than as a batch job for all nodes. + #[tracing::instrument(skip_all, fields(node_id=%node.get_id()))] + async fn node_activate_reconcile( + &self, + mut node: Node, + _lock: &OwnedRwLockWriteGuard<()>, + ) -> Result<(), ApiError> { + // This Node is a mutable local copy: we will set it active so that we can use its + // API client to reconcile with the node. 
The Node in [`Self::nodes`] will get updated + // later. + node.set_availability(NodeAvailability::Active); + + let configs = match node + .with_client_retries( + |client| async move { client.list_location_config().await }, + &self.config.jwt_token, + 1, + 5, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await + { + None => { + // We're shutting down (the Node's cancellation token can't have fired, because + // we're the only scope that has a reference to it, and we didn't fire it). + return Err(ApiError::ShuttingDown); + } + Some(Err(e)) => { + // This node didn't succeed listing its locations: it may not proceed to active state + // as it is apparently unavailable. + return Err(ApiError::PreconditionFailed( + format!("Failed to query node location configs, cannot activate ({e})").into(), + )); + } + Some(Ok(configs)) => configs, + }; + tracing::info!("Loaded {} LocationConfigs", configs.tenant_shards.len()); + + let mut cleanup = Vec::new(); + { + let mut locked = self.inner.write().unwrap(); + + for (tenant_shard_id, observed_loc) in configs.tenant_shards { + let Some(tenant_state) = locked.tenants.get_mut(&tenant_shard_id) else { + cleanup.push(tenant_shard_id); + continue; + }; + tenant_state + .observed + .locations + .insert(node.get_id(), ObservedStateLocation { conf: observed_loc }); + } + } + + for tenant_shard_id in cleanup { + tracing::info!("Detaching {tenant_shard_id}"); + match node + .with_client_retries( + |client| async move { + let config = LocationConfig { + mode: LocationConfigMode::Detached, + generation: None, + secondary_conf: None, + shard_number: tenant_shard_id.shard_number.0, + shard_count: tenant_shard_id.shard_count.literal(), + shard_stripe_size: 0, + tenant_conf: models::TenantConfig::default(), + }; + client + .location_config(tenant_shard_id, config, None, false) + .await + }, + &self.config.jwt_token, + 1, + 5, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await + { + None => { + // We're shutting down (the Node's cancellation token can't have fired, because + // we're the only scope that has a reference to it, and we didn't fire it). + return Err(ApiError::ShuttingDown); + } + Some(Err(e)) => { + // Do not let the node proceed to Active state if it is not responsive to requests + // to detach. This could happen if e.g. a shutdown bug in the pageserver is preventing + // detach completing: we should not let this node back into the set of nodes considered + // okay for scheduling. 
+ return Err(ApiError::Conflict(format!( + "Node {node} failed to detach {tenant_shard_id}: {e}" + ))); + } + Some(Ok(_)) => {} + }; + } + + Ok(()) + } + pub(crate) async fn re_attach( &self, reattach_req: ReAttachRequest, @@ -926,15 +1185,6 @@ impl Service { self.node_register(register_req).await?; } - // Take a re-attach as indication that the node is available: this is a precursor to proper - // heartbeating in https://github.com/neondatabase/neon/issues/6844 - self.node_configure(NodeConfigureRequest { - node_id: reattach_req.node_id, - availability: Some(NodeAvailability::Active), - scheduling: None, - }) - .await?; - // Ordering: we must persist generation number updates before making them visible in the in-memory state let incremented_generations = self.persistence.re_attach(reattach_req.node_id).await?; @@ -946,6 +1196,7 @@ impl Service { // Apply the updated generation to our in-memory state let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, _scheduler) = locked.parts_mut(); let mut response = ReAttachResponse { tenants: Vec::new(), @@ -957,7 +1208,7 @@ impl Service { gen: new_gen.into().unwrap(), }); // Apply the new generation number to our in-memory state - let shard_state = locked.tenants.get_mut(&tenant_shard_id); + let shard_state = tenants.get_mut(&tenant_shard_id); let Some(shard_state) = shard_state else { // Not fatal. This edge case requires a re-attach to happen // between inserting a new tenant shard in to the database, and updating our in-memory @@ -1008,6 +1259,25 @@ impl Service { // request in flight over the network: TODO handle that by making location_conf API refuse // to go backward in generations. } + + // We consider a node Active once we have composed a re-attach response, but we + // do not call [`Self::node_activate_reconcile`]: the handling of the re-attach response + // implicitly synchronizes the LocationConfigs on the node. + // + // Setting a node active unblocks any Reconcilers that might write to the location config API, + // but those requests will not be accepted by the node until it has finished processing + // the re-attach response. 
+ if let Some(node) = nodes.get(&reattach_req.node_id) { + if !node.is_available() { + let mut new_nodes = (**nodes).clone(); + if let Some(node) = new_nodes.get_mut(&reattach_req.node_id) { + node.set_availability(NodeAvailability::Active); + } + let new_nodes = Arc::new(new_nodes); + *nodes = new_nodes; + } + } + Ok(response) } @@ -1048,6 +1318,12 @@ impl Service { &self, create_req: TenantCreateRequest, ) -> Result { + // Exclude any concurrent attempts to create/access the same tenant ID + let _tenant_lock = self + .tenant_op_locks + .exclusive(create_req.new_tenant_id.tenant_id) + .await; + let (response, waiters) = self.do_tenant_create(create_req).await?; self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await?; @@ -1362,16 +1638,20 @@ impl Service { tenant_shard_id: TenantShardId, req: TenantLocationConfigRequest, ) -> Result { + // We require an exclusive lock, because we are updating both persistent and in-memory state + let _tenant_lock = self + .tenant_op_locks + .exclusive(tenant_shard_id.tenant_id) + .await; + if !tenant_shard_id.is_unsharded() { return Err(ApiError::BadRequest(anyhow::anyhow!( "This API is for importing single-sharded or unsharded tenants" ))); } - let tenant_id = tenant_shard_id.tenant_id; - // First check if this is a creation or an update - let create_or_update = self.tenant_location_config_prepare(tenant_id, req); + let create_or_update = self.tenant_location_config_prepare(tenant_shard_id.tenant_id, req); let mut result = TenantLocationConfigResponse { shards: Vec::new(), @@ -1477,6 +1757,9 @@ impl Service { } pub(crate) async fn tenant_config_set(&self, req: TenantConfigRequest) -> Result<(), ApiError> { + // We require an exclusive lock, because we are updating persistent and in-memory state + let _tenant_lock = self.tenant_op_locks.exclusive(req.tenant_id).await; + let tenant_id = req.tenant_id; let config = req.config; @@ -1558,6 +1841,8 @@ impl Service { timestamp: Cow<'_, str>, done_if_after: Cow<'_, str>, ) -> Result<(), ApiError> { + let _tenant_lock = self.tenant_op_locks.exclusive(tenant_id).await; + let node = { let locked = self.inner.read().unwrap(); // Just a sanity check to prevent misuse: the API expects that the tenant is fully @@ -1643,6 +1928,8 @@ impl Service { &self, tenant_id: TenantId, ) -> Result<(), ApiError> { + let _tenant_lock = self.tenant_op_locks.shared(tenant_id).await; + // Acquire lock and yield the collection of shard-node tuples which we will send requests onward to let targets = { let locked = self.inner.read().unwrap(); @@ -1692,6 +1979,8 @@ impl Service { } pub(crate) async fn tenant_delete(&self, tenant_id: TenantId) -> Result { + let _tenant_lock = self.tenant_op_locks.exclusive(tenant_id).await; + self.ensure_attached_wait(tenant_id).await?; // TODO: refactor into helper @@ -1788,10 +2077,10 @@ impl Service { create_req.new_timeline_id, ); + let _tenant_lock = self.tenant_op_locks.shared(tenant_id).await; + self.ensure_attached_wait(tenant_id).await?; - // TODO: refuse to do this if shard splitting is in progress - // (https://github.com/neondatabase/neon/issues/6676) let mut targets = { let locked = self.inner.read().unwrap(); let mut targets = Vec::new(); @@ -1913,11 +2202,10 @@ impl Service { timeline_id: TimelineId, ) -> Result { tracing::info!("Deleting timeline {}/{}", tenant_id, timeline_id,); + let _tenant_lock = self.tenant_op_locks.shared(tenant_id).await; self.ensure_attached_wait(tenant_id).await?; - // TODO: refuse to do this if shard splitting is in progress - // 
(https://github.com/neondatabase/neon/issues/6676) let mut targets = { let locked = self.inner.read().unwrap(); let mut targets = Vec::new(); @@ -2106,10 +2394,306 @@ impl Service { }) } + #[instrument(skip_all, fields(tenant_id=%op.tenant_id))] + async fn abort_tenant_shard_split( + &self, + op: &TenantShardSplitAbort, + ) -> Result<(), TenantShardSplitAbortError> { + // Cleaning up a split: + // - Parent shards are not destroyed during a split, just detached. + // - Failed pageserver split API calls can leave the remote node with just the parent attached, + // just the children attached, or both. + // + // Therefore our work to do is to: + // 1. Clean up storage controller's internal state to just refer to parents, no children + // 2. Call out to pageservers to ensure that children are detached + // 3. Call out to pageservers to ensure that parents are attached. + // + // Crash safety: + // - If the storage controller stops running during this cleanup *after* clearing the splitting state + // from our database, then [`Self::startup_reconcile`] will regard child attachments as garbage + // and detach them. + // - TODO: If the storage controller stops running during this cleanup *before* clearing the splitting state + // from our database, then we will re-enter this cleanup routine on startup. + + let TenantShardSplitAbort { + tenant_id, + new_shard_count, + new_stripe_size, + .. + } = op; + + // First abort persistent state, if any exists. + match self + .persistence + .abort_shard_split(*tenant_id, *new_shard_count) + .await? + { + AbortShardSplitStatus::Aborted => { + // Proceed to roll back any child shards created on pageservers + } + AbortShardSplitStatus::Complete => { + // The split completed (we might hit that path if e.g. our database transaction + // to write the completion landed in the database, but we dropped connection + // before seeing the result). + // + // We must update in-memory state to reflect the successful split. + self.tenant_shard_split_commit_inmem( + *tenant_id, + *new_shard_count, + *new_stripe_size, + ); + return Ok(()); + } + } + + // Clean up in-memory state, and accumulate the list of child locations that need detaching + let detach_locations: Vec<(Node, TenantShardId)> = { + let mut detach_locations = Vec::new(); + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, _scheduler) = locked.parts_mut(); + + for (tenant_shard_id, shard) in + tenants.range_mut(TenantShardId::tenant_range(op.tenant_id)) + { + if shard.shard.count == op.new_shard_count { + // Surprising: the phase of [`Self::do_tenant_shard_split`] which inserts child shards in-memory + // is infallible, so if we got an error we shouldn't have got that far. 
+ tracing::warn!( + "During split abort, child shard {tenant_shard_id} found in-memory" + ); + continue; + } + + // Add the children of this shard to this list of things to detach + if let Some(node_id) = shard.intent.get_attached() { + for child_id in tenant_shard_id.split(*new_shard_count) { + detach_locations.push(( + nodes + .get(node_id) + .expect("Intent references nonexistent node") + .clone(), + child_id, + )); + } + } else { + tracing::warn!( + "During split abort, shard {tenant_shard_id} has no attached location" + ); + } + + tracing::info!("Restoring parent shard {tenant_shard_id}"); + shard.splitting = SplitState::Idle; + self.maybe_reconcile_shard(shard, nodes); + } + + // We don't expect any new_shard_count shards to exist here, but drop them just in case + tenants.retain(|_id, s| s.shard.count != *new_shard_count); + + detach_locations + }; + + for (node, child_id) in detach_locations { + if !node.is_available() { + // An unavailable node cannot be cleaned up now: to avoid blocking forever, we will permit this, and + // rely on the reconciliation that happens when a node transitions to Active to clean up. Since we have + // removed child shards from our in-memory state and database, the reconciliation will implicitly remove + // them from the node. + tracing::warn!("Node {node} unavailable, can't clean up during split abort. It will be cleaned up when it is reactivated."); + continue; + } + + // Detach the remote child. If the pageserver split API call is still in progress, this call will get + // a 503 and retry, up to our limit. + tracing::info!("Detaching {child_id} on {node}..."); + match node + .with_client_retries( + |client| async move { + let config = LocationConfig { + mode: LocationConfigMode::Detached, + generation: None, + secondary_conf: None, + shard_number: child_id.shard_number.0, + shard_count: child_id.shard_count.literal(), + // Stripe size and tenant config don't matter when detaching + shard_stripe_size: 0, + tenant_conf: TenantConfig::default(), + }; + + client.location_config(child_id, config, None, false).await + }, + &self.config.jwt_token, + 1, + 10, + Duration::from_secs(5), + &self.cancel, + ) + .await + { + Some(Ok(_)) => {} + Some(Err(e)) => { + // We failed to communicate with the remote node. This is problematic: we may be + // leaving it with a rogue child shard. + tracing::warn!( + "Failed to detach child {child_id} from node {node} during abort" + ); + return Err(e.into()); + } + None => { + // Cancellation: we were shutdown or the node went offline. Shutdown is fine, we'll + // clean up on restart. The node going offline requires a retry. + return Err(TenantShardSplitAbortError::Unavailable); + } + }; + } + + tracing::info!("Successfully aborted split"); + Ok(()) + } + + /// Infallible final stage of [`Self::tenant_shard_split`]: update the contents + /// of the tenant map to reflect the child shards that exist after the split. 
+ fn tenant_shard_split_commit_inmem( + &self, + tenant_id: TenantId, + new_shard_count: ShardCount, + new_stripe_size: Option, + ) -> ( + TenantShardSplitResponse, + Vec<(TenantShardId, NodeId, ShardStripeSize)>, + ) { + let mut response = TenantShardSplitResponse { + new_shards: Vec::new(), + }; + let mut child_locations = Vec::new(); + { + let mut locked = self.inner.write().unwrap(); + + let parent_ids = locked + .tenants + .range(TenantShardId::tenant_range(tenant_id)) + .map(|(shard_id, _)| *shard_id) + .collect::>(); + + let (_nodes, tenants, scheduler) = locked.parts_mut(); + for parent_id in parent_ids { + let child_ids = parent_id.split(new_shard_count); + + let (pageserver, generation, policy, parent_ident, config) = { + let mut old_state = tenants + .remove(&parent_id) + .expect("It was present, we just split it"); + + // A non-splitting state is impossible, because [`Self::tenant_shard_split`] holds + // a TenantId lock and passes it through to [`TenantShardSplitAbort`] in case of cleanup: + // nothing else can clear this. + assert!(matches!(old_state.splitting, SplitState::Splitting)); + + let old_attached = old_state.intent.get_attached().unwrap(); + old_state.intent.clear(scheduler); + let generation = old_state.generation.expect("Shard must have been attached"); + ( + old_attached, + generation, + old_state.policy, + old_state.shard, + old_state.config, + ) + }; + + for child in child_ids { + let mut child_shard = parent_ident; + child_shard.number = child.shard_number; + child_shard.count = child.shard_count; + if let Some(stripe_size) = new_stripe_size { + child_shard.stripe_size = stripe_size; + } + + let mut child_observed: HashMap = HashMap::new(); + child_observed.insert( + pageserver, + ObservedStateLocation { + conf: Some(attached_location_conf( + generation, + &child_shard, + &config, + matches!(policy, PlacementPolicy::Double(n) if n > 0), + )), + }, + ); + + let mut child_state = TenantState::new(child, child_shard, policy.clone()); + child_state.intent = IntentState::single(scheduler, Some(pageserver)); + child_state.observed = ObservedState { + locations: child_observed, + }; + child_state.generation = Some(generation); + child_state.config = config.clone(); + + // The child's TenantState::splitting is intentionally left at the default value of Idle, + // as at this point in the split process we have succeeded and this part is infallible: + // we will never need to do any special recovery from this state. + + child_locations.push((child, pageserver, child_shard.stripe_size)); + + if let Err(e) = child_state.schedule(scheduler) { + // This is not fatal, because we've implicitly already got an attached + // location for the child shard. Failure here just means we couldn't + // find a secondary (e.g. because cluster is overloaded). 
+ tracing::warn!("Failed to schedule child shard {child}: {e}"); + } + + tenants.insert(child, child_state); + response.new_shards.push(child); + } + } + + (response, child_locations) + } + } + pub(crate) async fn tenant_shard_split( &self, tenant_id: TenantId, split_req: TenantShardSplitRequest, + ) -> Result { + // TODO: return 503 if we get stuck waiting for this lock + // (issue https://github.com/neondatabase/neon/issues/7108) + let _tenant_lock = self.tenant_op_locks.exclusive(tenant_id).await; + + let new_shard_count = ShardCount::new(split_req.new_shard_count); + let new_stripe_size = split_req.new_stripe_size; + + let r = self.do_tenant_shard_split(tenant_id, split_req).await; + + match r { + Ok(r) => Ok(r), + Err(ApiError::BadRequest(_)) => { + // A request validation error does not require rollback: we rejected it before we started making any changes: just + // return the error + r + } + Err(e) => { + // General case error handling: split might be part-done, we must do work to abort it. + tracing::warn!("Enqueuing background abort of split on {tenant_id}"); + self.abort_tx + .send(TenantShardSplitAbort { + tenant_id, + new_shard_count, + new_stripe_size, + _tenant_lock, + }) + // Ignore error sending: that just means we're shutting down: aborts are ephemeral so it's fine to drop it. + .ok(); + Err(e) + } + } + } + + pub(crate) async fn do_tenant_shard_split( + &self, + tenant_id: TenantId, + split_req: TenantShardSplitRequest, ) -> Result { let mut policy = None; let mut shard_ident = None; @@ -2121,6 +2705,10 @@ impl Service { child_ids: Vec, } + fail::fail_point!("shard-split-validation", |_| Err(ApiError::BadRequest( + anyhow::anyhow!("failpoint") + ))); + // Validate input, and calculate which shards we will create let (old_shard_count, targets) = { @@ -2230,7 +2818,9 @@ impl Service { if shard_ident.count.count() > 1 && shard_ident.stripe_size != new_stripe_size { return Err(ApiError::BadRequest(anyhow::anyhow!("Attempted to change stripe size ({:?}->{new_stripe_size:?}) on a tenant with multiple shards", shard_ident.stripe_size))); } + shard_ident.stripe_size = new_stripe_size; + tracing::info!("applied stripe size {}", shard_ident.stripe_size.0); shard_ident } else { shard_ident.unwrap() @@ -2255,6 +2845,11 @@ impl Service { child_shard.number = child.shard_number; child_shard.count = child.shard_count; + tracing::info!( + "Create child shard persistence with stripe size {}", + shard_ident.stripe_size.0 + ); + this_child_tsps.push(TenantShardPersistence { tenant_id: child.tenant_id.to_string(), shard_number: child.shard_number.0 as i32, @@ -2293,6 +2888,9 @@ impl Service { _ => return Err(ApiError::InternalServerError(e.into())), } } + fail::fail_point!("shard-split-post-begin", |_| Err( + ApiError::InternalServerError(anyhow::anyhow!("failpoint")) + )); // Now that I have persisted the splitting state, apply it in-memory. This is infallible, so // callers may assume that if splitting is set in memory, then it was persisted, and if splitting @@ -2302,15 +2900,16 @@ impl Service { for target in &targets { if let Some(parent_shard) = locked.tenants.get_mut(&target.parent_id) { parent_shard.splitting = SplitState::Splitting; + // Put the observed state to None, to reflect that it is indeterminate once we start the + // split operation. + parent_shard + .observed + .locations + .insert(target.node.get_id(), ObservedStateLocation { conf: None }); } } } - // FIXME: we have now committed the shard split state to the database, so any subsequent - // failure needs to roll it back. 
We will later wrap this function in logic to roll back - // the split if it fails. - // (https://github.com/neondatabase/neon/issues/6676) - // TODO: issue split calls concurrently (this only matters once we're splitting // N>1 shards into M shards -- initially we're usually splitting 1 shard into N). @@ -2332,6 +2931,10 @@ impl Service { .await .map_err(|e| ApiError::Conflict(format!("Failed to split {}: {}", parent_id, e)))?; + fail::fail_point!("shard-split-post-remote", |_| Err(ApiError::Conflict( + "failpoint".to_string() + ))); + tracing::info!( "Split {} into {}", parent_id, @@ -2366,62 +2969,16 @@ impl Service { .complete_shard_split(tenant_id, old_shard_count) .await?; + fail::fail_point!("shard-split-post-complete", |_| Err( + ApiError::InternalServerError(anyhow::anyhow!("failpoint")) + )); + // Replace all the shards we just split with their children: this phase is infallible. - let mut response = TenantShardSplitResponse { - new_shards: Vec::new(), - }; - let mut child_locations = Vec::new(); - { - let mut locked = self.inner.write().unwrap(); - let (_nodes, tenants, scheduler) = locked.parts_mut(); - for target in targets { - let SplitTarget { - parent_id, - node: _node, - child_ids, - } = target; - let (pageserver, generation, config) = { - let mut old_state = tenants - .remove(&parent_id) - .expect("It was present, we just split it"); - let old_attached = old_state.intent.get_attached().unwrap(); - old_state.intent.clear(scheduler); - let generation = old_state.generation.expect("Shard must have been attached"); - (old_attached, generation, old_state.config.clone()) - }; - - for child in child_ids { - let mut child_shard = shard_ident; - child_shard.number = child.shard_number; - child_shard.count = child.shard_count; - - let mut child_observed: HashMap = HashMap::new(); - child_observed.insert( - pageserver, - ObservedStateLocation { - conf: Some(attached_location_conf(generation, &child_shard, &config)), - }, - ); - - let mut child_state = TenantState::new(child, child_shard, policy.clone()); - child_state.intent = IntentState::single(scheduler, Some(pageserver)); - child_state.observed = ObservedState { - locations: child_observed, - }; - child_state.generation = Some(generation); - child_state.config = config.clone(); - - // The child's TenantState::splitting is intentionally left at the default value of Idle, - // as at this point in the split process we have succeeded and this part is infallible: - // we will never need to do any special recovery from this state. 
- - child_locations.push((child, pageserver, child_shard.stripe_size)); - - tenants.insert(child, child_state); - response.new_shards.push(child); - } - } - } + let (response, child_locations) = self.tenant_shard_split_commit_inmem( + tenant_id, + ShardCount::new(split_req.new_shard_count), + split_req.new_stripe_size, + ); // Send compute notifications for all the new shards let mut failed_notifications = Vec::new(); @@ -2710,6 +3267,8 @@ impl Service { &self, register_req: NodeRegisterRequest, ) -> Result<(), ApiError> { + let _node_lock = self.node_op_locks.exclusive(register_req.node_id).await; + // Pre-check for an already-existing node { let locked = self.inner.read().unwrap(); @@ -2771,6 +3330,8 @@ impl Service { &self, config_req: NodeConfigureRequest, ) -> Result<(), ApiError> { + let _node_lock = self.node_op_locks.exclusive(config_req.node_id).await; + if let Some(scheduling) = config_req.scheduling { // Scheduling is a persistent part of Node: we must write updates to the database before // applying them in memory @@ -2779,6 +3340,37 @@ impl Service { .await?; } + // If we're activating a node, then before setting it active we must reconcile any shard locations + // on that node, in case it is out of sync, e.g. due to being unavailable during controller startup, + // by calling [`Self::node_activate_reconcile`] + // + // The transition we calculate here remains valid later in the function because we hold the op lock on the node: + // nothing else can mutate its availability while we run. + let availability_transition = if let Some(input_availability) = config_req.availability { + let (activate_node, availability_transition) = { + let locked = self.inner.read().unwrap(); + let Some(node) = locked.nodes.get(&config_req.node_id) else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Node {} not registered", config_req.node_id).into(), + )); + }; + + ( + node.clone(), + node.get_availability_transition(input_availability), + ) + }; + + if matches!(availability_transition, AvailabilityTransition::ToActive) { + self.node_activate_reconcile(activate_node, &_node_lock) + .await?; + } + availability_transition + } else { + AvailabilityTransition::Unchanged + }; + + // Apply changes from the request to our in-memory state for the Node let mut locked = self.inner.write().unwrap(); let (nodes, tenants, scheduler) = locked.parts_mut(); @@ -2790,11 +3382,9 @@ impl Service { )); }; - let availability_transition = if let Some(availability) = &config_req.availability { - node.set_availability(*availability) - } else { - AvailabilityTransition::Unchanged - }; + if let Some(availability) = &config_req.availability { + node.set_availability(*availability); + } if let Some(scheduling) = config_req.scheduling { node.set_scheduling(scheduling); @@ -2808,6 +3398,7 @@ impl Service { let new_nodes = Arc::new(new_nodes); + // Modify scheduling state for any Tenants that are affected by a change in the node's availability state. 
match availability_transition { AvailabilityTransition::ToOffline => { tracing::info!("Node {} transition to offline", config_req.node_id); diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs index 3c91e09ac3..39e557616d 100644 --- a/control_plane/attachment_service/src/tenant_state.rs +++ b/control_plane/attachment_service/src/tenant_state.rs @@ -577,7 +577,12 @@ impl TenantState { .generation .expect("Attempted to enter attached state without a generation"); - let wanted_conf = attached_location_conf(generation, &self.shard, &self.config); + let wanted_conf = attached_location_conf( + generation, + &self.shard, + &self.config, + !self.intent.secondary.is_empty(), + ); match self.observed.locations.get(&node_id) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {} Some(_) | None => { diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index bb8b1bb7e5..fc67f4cf8f 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2108,6 +2108,16 @@ where R: std::future::Future, ApiError>> + Send + 'static, H: FnOnce(Request, CancellationToken) -> R + Send + Sync + 'static, { + if request.uri() != &"/v1/failpoints".parse::().unwrap() { + fail::fail_point!("api-503", |_| Err(ApiError::ResourceUnavailable( + "failpoint".into() + ))); + + fail::fail_point!("api-500", |_| Err(ApiError::InternalServerError( + anyhow::anyhow!("failpoint") + ))); + } + // Spawn a new task to handle the request, to protect the handler from unexpected // async cancellations. Most pageserver functions are not async cancellation safe. // We arm a drop-guard, so that if Hyper drops the Future, we signal the task diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 26fcce1f38..7cf03d8fd6 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1443,6 +1443,35 @@ impl TenantManager { new_shard_count: ShardCount, new_stripe_size: Option, ctx: &RequestContext, + ) -> anyhow::Result> { + let r = self + .do_shard_split(tenant_shard_id, new_shard_count, new_stripe_size, ctx) + .await; + if r.is_err() { + // Shard splitting might have left the original shard in a partially shut down state (it + // stops the shard's remote timeline client). Reset it to ensure we leave things in + // a working state. + if self.get(tenant_shard_id).is_some() { + tracing::warn!("Resetting {tenant_shard_id} after shard split failure"); + if let Err(e) = self.reset_tenant(tenant_shard_id, false, ctx).await { + // Log this error because our return value will still be the original error, not this one. This is + // a severe error: if this happens, we might be leaving behind a tenant that is not fully functional + // (e.g. has uploads disabled). We can't do anything else: if reset fails then shutting the tenant down or + // setting it broken probably won't help either. 
+ tracing::error!("Failed to reset {tenant_shard_id}: {e}"); + } + } + } + + r + } + + pub(crate) async fn do_shard_split( + &self, + tenant_shard_id: TenantShardId, + new_shard_count: ShardCount, + new_stripe_size: Option, + ctx: &RequestContext, ) -> anyhow::Result> { let tenant = get_tenant(tenant_shard_id, true)?; @@ -1477,6 +1506,10 @@ impl TenantManager { .join(",") ); + fail::fail_point!("shard-split-pre-prepare", |_| Err(anyhow::anyhow!( + "failpoint" + ))); + let parent_shard_identity = tenant.shard_identity; let parent_tenant_conf = tenant.get_tenant_conf(); let parent_generation = tenant.generation; @@ -1490,6 +1523,10 @@ impl TenantManager { return Err(e); } + fail::fail_point!("shard-split-post-prepare", |_| Err(anyhow::anyhow!( + "failpoint" + ))); + self.resources.deletion_queue_client.flush_advisory(); // Phase 2: Put the parent shard to InProgress and grab a reference to the parent Tenant @@ -1511,11 +1548,16 @@ impl TenantManager { anyhow::bail!("Detached parent shard in the middle of split!") } }; - + fail::fail_point!("shard-split-pre-hardlink", |_| Err(anyhow::anyhow!( + "failpoint" + ))); // Optimization: hardlink layers from the parent into the children, so that they don't have to // re-download & duplicate the data referenced in their initial IndexPart self.shard_split_hardlink(parent, child_shards.clone()) .await?; + fail::fail_point!("shard-split-post-hardlink", |_| Err(anyhow::anyhow!( + "failpoint" + ))); // Take a snapshot of where the parent's WAL ingest had got to: we will wait for // child shards to reach this point. @@ -1555,6 +1597,10 @@ impl TenantManager { .await?; } + fail::fail_point!("shard-split-post-child-conf", |_| Err(anyhow::anyhow!( + "failpoint" + ))); + // Phase 4: wait for child chards WAL ingest to catch up to target LSN for child_shard_id in &child_shards { let child_shard_id = *child_shard_id; @@ -1587,6 +1633,10 @@ impl TenantManager { timeline.timeline_id, target_lsn ); + + fail::fail_point!("shard-split-lsn-wait", |_| Err(anyhow::anyhow!( + "failpoint" + ))); if let Err(e) = timeline.wait_lsn(*target_lsn, ctx).await { // Failure here might mean shutdown, in any case this part is an optimization // and we shouldn't hold up the split operation. 
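As an illustrative aside (not part of the patch itself): the failpoints sprinkled through the split path above follow the `fail` crate's usual pattern, sketched minimally below under the assumption that the crate's `failpoints` feature is enabled, as it is in this build. A `fail_point!` is inert until a test arms it over an HTTP failpoints endpoint (such as the `/debug/v1/failpoints` route added above for the storage controller); once armed, the closure runs and the function errors out at exactly that step. The `split_step` function and `main` harness are stand-ins for this sketch only.

// Stand-in for one of the instrumented steps in do_shard_split().
// Assumes the `fail` crate is built with its "failpoints" feature enabled.
fn split_step() -> anyhow::Result<()> {
    fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!(
        "failpoint"
    )));
    Ok(())
}

fn main() -> anyhow::Result<()> {
    // With no action configured, the failpoint is a no-op and the step succeeds.
    assert!(split_step().is_ok());

    // Arming the failpoint makes the same step return an error, which is what
    // drives the abort/rollback handling exercised by the new tests.
    fail::cfg("shard-split-pre-finish", "return(error)").unwrap();
    assert!(split_step().is_err());

    Ok(())
}
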
@@ -1632,6 +1682,10 @@ impl TenantManager { }, ); + fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!( + "failpoint" + ))); + parent_slot_guard.drop_old_value()?; // Phase 6: Release the InProgress on the parent shard diff --git a/test_runner/conftest.py b/test_runner/conftest.py index 200c9c3740..4b0c9ac71d 100644 --- a/test_runner/conftest.py +++ b/test_runner/conftest.py @@ -2,6 +2,7 @@ pytest_plugins = ( "fixtures.pg_version", "fixtures.parametrize", "fixtures.httpserver", + "fixtures.compute_reconfigure", "fixtures.neon_fixtures", "fixtures.benchmark_fixture", "fixtures.pg_stats", diff --git a/test_runner/fixtures/compute_reconfigure.py b/test_runner/fixtures/compute_reconfigure.py new file mode 100644 index 0000000000..9dd66fe636 --- /dev/null +++ b/test_runner/fixtures/compute_reconfigure.py @@ -0,0 +1,62 @@ +import concurrent.futures +from typing import Any + +import pytest +from werkzeug.wrappers.request import Request +from werkzeug.wrappers.response import Response + +from fixtures.log_helper import log +from fixtures.types import TenantId + + +class ComputeReconfigure: + def __init__(self, server): + self.server = server + self.control_plane_compute_hook_api = f"http://{server.host}:{server.port}/notify-attach" + self.workloads = {} + + def register_workload(self, workload): + self.workloads[workload.tenant_id] = workload + + +@pytest.fixture(scope="function") +def compute_reconfigure_listener(make_httpserver): + """ + This fixture exposes an HTTP listener for the storage controller to submit + compute notifications to us, instead of updating neon_local endpoints itself. + + Although storage controller can use neon_local directly, this causes problems when + the test is also concurrently modifying endpoints. Instead, configure storage controller + to send notifications up to this test code, which will route all endpoint updates + through Workload, which has a mutex to make concurrent updates safe. + """ + server = make_httpserver + + self = ComputeReconfigure(server) + + # Do neon_local endpoint reconfiguration in the background so that we can + # accept a healthy rate of calls into notify-attach. 
+ reconfigure_threads = concurrent.futures.ThreadPoolExecutor(max_workers=1) + + def handler(request: Request): + assert request.json is not None + body: dict[str, Any] = request.json + log.info(f"notify-attach request: {body}") + + try: + workload = self.workloads[TenantId(body["tenant_id"])] + except KeyError: + pass + else: + # This causes the endpoint to query storage controller for its location, which + # is redundant since we already have it here, but this avoids extending the + # neon_local CLI to take full lists of locations + reconfigure_threads.submit(lambda workload=workload: workload.reconfigure()) # type: ignore[no-any-return] + + return Response(status=200) + + self.server.expect_request("/notify-attach", method="PUT").respond_with_handler(handler) + + yield self + reconfigure_threads.shutdown() + server.clear() diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 5b76e808d5..16ebc19698 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2177,6 +2177,23 @@ class NeonStorageController(MetricsGetter): ) log.info("storage controller passed consistency check") + def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]): + if isinstance(config_strings, tuple): + pairs = [config_strings] + else: + pairs = config_strings + + log.info(f"Requesting config failpoints: {repr(pairs)}") + + res = self.request( + "PUT", + f"{self.env.storage_controller_api}/debug/v1/failpoints", + json=[{"name": name, "actions": actions} for name, actions in pairs], + headers=self.headers(TokenScope.ADMIN), + ) + log.info(f"Got failpoints request response code {res.status_code}") + res.raise_for_status() + def __enter__(self) -> "NeonStorageController": return self diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py index 1d5394dc1d..e852281fcf 100644 --- a/test_runner/fixtures/workload.py +++ b/test_runner/fixtures/workload.py @@ -1,3 +1,4 @@ +import threading from typing import Optional from fixtures.log_helper import log @@ -11,6 +12,10 @@ from fixtures.neon_fixtures import ( from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload from fixtures.types import TenantId, TimelineId +# neon_local doesn't handle creating/modifying endpoints concurrently, so we use a mutex +# to ensure we don't do that: this enables running lots of Workloads in parallel safely. +ENDPOINT_LOCK = threading.Lock() + class Workload: """ @@ -41,17 +46,30 @@ class Workload: self._endpoint: Optional[Endpoint] = None + def reconfigure(self): + """ + Request the endpoint to reconfigure based on location reported by storage controller + """ + if self._endpoint is not None: + with ENDPOINT_LOCK: + self._endpoint.reconfigure() + def endpoint(self, pageserver_id: Optional[int] = None) -> Endpoint: - if self._endpoint is None: - self._endpoint = self.env.endpoints.create( - self.branch_name, - tenant_id=self.tenant_id, - pageserver_id=pageserver_id, - endpoint_id="ep-workload", - ) - self._endpoint.start(pageserver_id=pageserver_id) - else: - self._endpoint.reconfigure(pageserver_id=pageserver_id) + # We may be running alongside other Workloads for different tenants. Full TTID is + # obnoxiously long for use here, but a cut-down version is still unique enough for tests. 
+ endpoint_id = f"ep-workload-{str(self.tenant_id)[0:4]}-{str(self.timeline_id)[0:4]}" + + with ENDPOINT_LOCK: + if self._endpoint is None: + self._endpoint = self.env.endpoints.create( + self.branch_name, + tenant_id=self.tenant_id, + pageserver_id=pageserver_id, + endpoint_id=endpoint_id, + ) + self._endpoint.start(pageserver_id=pageserver_id) + else: + self._endpoint.reconfigure(pageserver_id=pageserver_id) connstring = self._endpoint.safe_psql( "SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'" @@ -94,7 +112,7 @@ class Workload: else: return False - def churn_rows(self, n, pageserver_id: Optional[int] = None, upload=True): + def churn_rows(self, n, pageserver_id: Optional[int] = None, upload=True, ingest=True): assert self.expect_rows >= n max_iters = 10 @@ -132,22 +150,28 @@ class Workload: ] ) - for tenant_shard_id, pageserver in tenant_get_shards( - self.env, self.tenant_id, pageserver_id - ): - last_flush_lsn = wait_for_last_flush_lsn( - self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id - ) - ps_http = pageserver.http_client() - wait_for_last_record_lsn(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn) + if ingest: + # Wait for written data to be ingested by the pageserver + for tenant_shard_id, pageserver in tenant_get_shards( + self.env, self.tenant_id, pageserver_id + ): + last_flush_lsn = wait_for_last_flush_lsn( + self.env, + endpoint, + self.tenant_id, + self.timeline_id, + pageserver_id=pageserver_id, + ) + ps_http = pageserver.http_client() + wait_for_last_record_lsn(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn) - if upload: - # force a checkpoint to trigger upload - ps_http.timeline_checkpoint(tenant_shard_id, self.timeline_id) - wait_for_upload(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn) - log.info(f"Churn: waiting for remote LSN {last_flush_lsn}") - else: - log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}") + if upload: + # Wait for written data to be uploaded to S3 (force a checkpoint to trigger upload) + ps_http.timeline_checkpoint(tenant_shard_id, self.timeline_id) + wait_for_upload(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn) + log.info(f"Churn: waiting for remote LSN {last_flush_lsn}") + else: + log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}") def validate(self, pageserver_id: Optional[int] = None): endpoint = self.endpoint(pageserver_id) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 9309af066b..bdb9990a51 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -1,10 +1,14 @@ import os -from typing import Dict, List, Union +from typing import Dict, List, Optional, Union import pytest +import requests +from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.log_helper import log from fixtures.neon_fixtures import ( + NeonEnv, NeonEnvBuilder, + StorageControllerApiException, tenant_get_shards, ) from fixtures.remote_storage import s3_storage @@ -495,3 +499,337 @@ def test_sharding_ingest( # Each shard may emit up to one huge layer, because initdb ingest doesn't respect checkpoint_distance. 
assert huge_layer_count <= shard_count + + +class Failure: + pageserver_id: Optional[int] + + def apply(self, env: NeonEnv): + raise NotImplementedError() + + def clear(self, env: NeonEnv): + """ + Clear the failure, in a way that should enable the system to proceed + to a totally clean state (all nodes online and reconciled) + """ + raise NotImplementedError() + + def expect_available(self): + raise NotImplementedError() + + def can_mitigate(self): + """Whether Self.mitigate is available for use""" + return False + + def mitigate(self, env: NeonEnv): + """ + Mitigate the failure in a way that should allow shard split to + complete and service to resume, but does not guarantee to leave + the whole world in a clean state (e.g. an Offline node might have + junk LocationConfigs on it) + """ + raise NotImplementedError() + + def fails_forward(self, env: NeonEnv): + """ + If true, this failure results in a state that eventualy completes the split. + """ + return False + + def expect_exception(self): + """ + How do we expect a call to the split API to fail? + """ + return StorageControllerApiException + + +class PageserverFailpoint(Failure): + def __init__(self, failpoint, pageserver_id, mitigate): + self.failpoint = failpoint + self.pageserver_id = pageserver_id + self._mitigate = mitigate + + def apply(self, env: NeonEnv): + pageserver = env.get_pageserver(self.pageserver_id) + pageserver.allowed_errors.extend( + [".*failpoint.*", ".*Resetting.*after shard split failure.*"] + ) + pageserver.http_client().configure_failpoints((self.failpoint, "return(1)")) + + def clear(self, env: NeonEnv): + pageserver = env.get_pageserver(self.pageserver_id) + pageserver.http_client().configure_failpoints((self.failpoint, "off")) + if self._mitigate: + env.storage_controller.node_configure(self.pageserver_id, {"availability": "Active"}) + + def expect_available(self): + return True + + def can_mitigate(self): + return self._mitigate + + def mitigate(self, env): + env.storage_controller.node_configure(self.pageserver_id, {"availability": "Offline"}) + + +class StorageControllerFailpoint(Failure): + def __init__(self, failpoint, action): + self.failpoint = failpoint + self.pageserver_id = None + self.action = action + + def apply(self, env: NeonEnv): + env.storage_controller.configure_failpoints((self.failpoint, self.action)) + + def clear(self, env: NeonEnv): + if "panic" in self.action: + log.info("Restarting storage controller after panic") + env.storage_controller.stop() + env.storage_controller.start() + else: + env.storage_controller.configure_failpoints((self.failpoint, "off")) + + def expect_available(self): + # Controller panics _do_ leave pageservers available, but our test code relies + # on using the locate API to update configurations in Workload, so we must skip + # these actions when the controller has been panicked. + return "panic" not in self.action + + def can_mitigate(self): + return False + + def fails_forward(self, env): + # Edge case: the very last failpoint that simulates a DB connection error, where + # the abort path will fail-forward and result in a complete split. + fail_forward = self.failpoint == "shard-split-post-complete" + + # If the failure was a panic, then if we expect split to eventually (after restart) + # complete, we must restart before checking that. 
+ if fail_forward and "panic" in self.action: + log.info("Restarting storage controller after panic") + env.storage_controller.stop() + env.storage_controller.start() + + return fail_forward + + def expect_exception(self): + if "panic" in self.action: + return requests.exceptions.ConnectionError + else: + return StorageControllerApiException + + +class NodeKill(Failure): + def __init__(self, pageserver_id, mitigate): + self.pageserver_id = pageserver_id + self._mitigate = mitigate + + def apply(self, env: NeonEnv): + pageserver = env.get_pageserver(self.pageserver_id) + pageserver.stop(immediate=True) + + def clear(self, env: NeonEnv): + pageserver = env.get_pageserver(self.pageserver_id) + pageserver.start() + + def expect_available(self): + return False + + def mitigate(self, env): + env.storage_controller.node_configure(self.pageserver_id, {"availability": "Offline"}) + + +class CompositeFailure(Failure): + """ + Wrapper for failures in multiple components (e.g. a failpoint in the storage controller, *and* + stop a pageserver to interfere with rollback) + """ + + def __init__(self, failures: list[Failure]): + self.failures = failures + + self.pageserver_id = None + for f in failures: + if f.pageserver_id is not None: + self.pageserver_id = f.pageserver_id + break + + def apply(self, env: NeonEnv): + for f in self.failures: + f.apply(env) + + def clear(self, env): + for f in self.failures: + f.clear(env) + + def expect_available(self): + return all(f.expect_available() for f in self.failures) + + def mitigate(self, env): + for f in self.failures: + f.mitigate(env) + + def expect_exception(self): + expect = set(f.expect_exception() for f in self.failures) + + # We can't give a sensible response if our failures have different expectations + assert len(expect) == 1 + + return list(expect)[0] + + +@pytest.mark.parametrize( + "failure", + [ + PageserverFailpoint("api-500", 1, False), + NodeKill(1, False), + PageserverFailpoint("api-500", 1, True), + NodeKill(1, True), + PageserverFailpoint("shard-split-pre-prepare", 1, False), + PageserverFailpoint("shard-split-post-prepare", 1, False), + PageserverFailpoint("shard-split-pre-hardlink", 1, False), + PageserverFailpoint("shard-split-post-hardlink", 1, False), + PageserverFailpoint("shard-split-post-child-conf", 1, False), + PageserverFailpoint("shard-split-lsn-wait", 1, False), + PageserverFailpoint("shard-split-pre-finish", 1, False), + StorageControllerFailpoint("shard-split-validation", "return(1)"), + StorageControllerFailpoint("shard-split-post-begin", "return(1)"), + StorageControllerFailpoint("shard-split-post-remote", "return(1)"), + StorageControllerFailpoint("shard-split-post-complete", "return(1)"), + StorageControllerFailpoint("shard-split-validation", "panic(failpoint)"), + StorageControllerFailpoint("shard-split-post-begin", "panic(failpoint)"), + StorageControllerFailpoint("shard-split-post-remote", "panic(failpoint)"), + StorageControllerFailpoint("shard-split-post-complete", "panic(failpoint)"), + CompositeFailure( + [NodeKill(1, True), StorageControllerFailpoint("shard-split-post-begin", "return(1)")] + ), + CompositeFailure( + [NodeKill(1, False), StorageControllerFailpoint("shard-split-post-begin", "return(1)")] + ), + ], +) +def test_sharding_split_failures( + neon_env_builder: NeonEnvBuilder, + compute_reconfigure_listener: ComputeReconfigure, + failure: Failure, +): + neon_env_builder.num_pageservers = 4 + neon_env_builder.control_plane_compute_hook_api = ( + compute_reconfigure_listener.control_plane_compute_hook_api + ) 
+ initial_shard_count = 2 + split_shard_count = 4 + + env = neon_env_builder.init_start(initial_tenant_shard_count=initial_shard_count) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + for ps in env.pageservers: + # When we do node failures and abandon a shard, it will de-facto have old generation and + # thereby be unable to publish remote consistent LSN updates + ps.allowed_errors.append(".*Dropped remote consistent LSN updates.*") + + # Make sure the node we're failing has a shard on it, otherwise the test isn't testing anything + assert ( + failure.pageserver_id is None + or len( + env.get_pageserver(failure.pageserver_id) + .http_client() + .tenant_list_locations()["tenant_shards"] + ) + > 0 + ) + + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(100) + + # Put the environment into a failing state (exact meaning depends on `failure`) + failure.apply(env) + + with pytest.raises(failure.expect_exception()): + env.storage_controller.tenant_shard_split(tenant_id, shard_count=4) + + # We expect that the overall operation will fail, but some split requests + # will have succeeded: the net result should be to return to a clean state, including + # detaching any child shards. + def assert_rolled_back(exclude_ps_id=None) -> None: + count = 0 + for ps in env.pageservers: + if exclude_ps_id is not None and ps.id == exclude_ps_id: + continue + + locations = ps.http_client().tenant_list_locations()["tenant_shards"] + for loc in locations: + tenant_shard_id = TenantShardId.parse(loc[0]) + log.info(f"Shard {tenant_shard_id} seen on node {ps.id}") + assert tenant_shard_id.shard_count == initial_shard_count + count += 1 + assert count == initial_shard_count + + def assert_split_done(exclude_ps_id=None) -> None: + count = 0 + for ps in env.pageservers: + if exclude_ps_id is not None and ps.id == exclude_ps_id: + continue + + locations = ps.http_client().tenant_list_locations()["tenant_shards"] + for loc in locations: + tenant_shard_id = TenantShardId.parse(loc[0]) + log.info(f"Shard {tenant_shard_id} seen on node {ps.id}") + assert tenant_shard_id.shard_count == split_shard_count + count += 1 + assert count == split_shard_count + + def finish_split(): + # Having failed+rolled back, we should be able to split again + # No failures this time; it will succeed + env.storage_controller.tenant_shard_split(tenant_id, shard_count=split_shard_count) + + workload.churn_rows(10) + workload.validate() + + if failure.expect_available(): + # Even though the split failed partway through, this should not have interrupted + # clients. Disable waiting for pageservers in the workload helper, because our + # failpoints may prevent API access. + # This only applies for failure modes that leave pageserver page_service API available. 
+        workload.churn_rows(10, upload=False, ingest=False)
+        workload.validate()
+
+    if failure.fails_forward(env):
+        log.info("Fail-forward failure, checking split eventually completes...")
+        # A failure type which results in eventual completion of the split
+        wait_until(30, 1, assert_split_done)
+    elif failure.can_mitigate():
+        log.info("Mitigating failure...")
+        # Mitigation phase: we expect to be able to proceed with a successful shard split
+        failure.mitigate(env)
+
+        # The split should appear to be rolled back from the point of view of all pageservers
+        # apart from the one that is offline
+        wait_until(30, 1, lambda: assert_rolled_back(exclude_ps_id=failure.pageserver_id))
+
+        finish_split()
+        wait_until(30, 1, lambda: assert_split_done(exclude_ps_id=failure.pageserver_id))
+
+        # Having cleared the failure, everything should converge to a pristine state
+        failure.clear(env)
+        wait_until(30, 1, assert_split_done)
+    else:
+        # Once we restore the faulty pageserver's API to good health, rollback should
+        # eventually complete.
+        log.info("Clearing failure...")
+        failure.clear(env)
+
+        wait_until(30, 1, assert_rolled_back)
+
+        # Having rolled back, the tenant should be working
+        workload.churn_rows(10)
+        workload.validate()
+
+        # Splitting again should work, since we cleared the failure
+        finish_split()
+        assert_split_done()
+
+    env.storage_controller.consistency_check()

From 8075f0965af3bba94ac0bc874dcc14b068a62261 Mon Sep 17 00:00:00 2001
From: Christian Schwarz
Date: Thu, 14 Mar 2024 12:18:55 +0100
Subject: [PATCH 0399/1571] fix(test suite): `virtual_file_io_engine` and
 `get_vectored_impl` parametrization doesn't work (#7113)

# Problem

While investigating #7124, I noticed that the benchmark was always using the
`DEFAULT_*` `virtual_file_io_engine`, i.e., `tokio-epoll-uring` as of
https://github.com/neondatabase/neon/pull/7077.

The fundamental problem is that the `control_plane` code has its own view of
`PageServerConfig`, which, I believe, will always be a subset of the real
pageserver's `pageserver/src/config.rs`.

For the `virtual_file_io_engine` and `get_vectored_impl` parametrization of
the test suite, we were constructing a dict on the Python side that contained
these parameters and handing it to `control_plane::PageServerConfig`'s derived
`serde::Deserialize`. The default in serde is to ignore unknown fields, so the
Deserialize impl silently ignored them (a minimal sketch of this behavior is
included below). As a consequence, the fields weren't propagated to the
`pageserver --init` call, and the tests ended up using the
`pageserver/src/config.rs::DEFAULT_` values for the respective options all the
time.

Tests that explicitly used overrides in `env.pageserver.start()` and similar
were not affected by this. But it means that all the test-suite runs that
relied on parametrization didn't properly exercise the intended code path.

# Changes

- use `serde(deny_unknown_fields)` to expose the problem
  - With this change, the Python tests that override `virtual_file_io_engine`
    and `get_vectored_impl` fail on `pageserver --init`, exposing the problem.
- use destructuring to uncover the issue in the future
- fix the issue by adding the missing fields to the `control_plane` crate's
  `PageServerConf`
  - A better solution would be for control plane to re-use a struct provided
    by the pageserver crate, so that everything is in one place in
    `pageserver/src/config.rs`, but our config parsing code is (almost) beyond
    repair anyway.
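Illustration only (not part of the patch): a minimal, self-contained sketch of the serde behavior described above, assuming the `serde` crate (with the `derive` feature) and `serde_json` are available. `LenientConf`, `StrictConf`, and the JSON input are hypothetical stand-ins for the `control_plane` config type and the override dict built by the test suite.

```rust
use serde::Deserialize;

// Mirrors serde's default behavior: unknown fields are silently dropped.
#[derive(Deserialize, Debug)]
struct LenientConf {
    id: u64,
}

// With `deny_unknown_fields`, the same input becomes a hard error.
#[derive(Deserialize, Debug)]
#[serde(deny_unknown_fields)]
struct StrictConf {
    id: u64,
}

fn main() {
    let input = r#"{ "id": 1, "virtual_file_io_engine": "tokio-epoll-uring" }"#;

    // The unknown key is accepted and silently ignored.
    let lenient: LenientConf = serde_json::from_str(input).unwrap();
    println!("lenient: {lenient:?}");

    // Deserialization fails loudly, surfacing the unknown field.
    let strict: Result<StrictConf, _> = serde_json::from_str(input);
    assert!(strict.is_err());
}
```

The lenient variant mirrors how the parametrization overrides were silently dropped before this patch; `deny_unknown_fields` turns the same input into an explicit error instead.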
- fix the `pageserver_virtual_file_io_engine` to be responsive to the env var - => required to make parametrization work in benchmarks # Testing Before merging this PR, I re-ran the regression tests & CI with the full matrix of `virtual_file_io_engine` and `tokio-epoll-uring`, see https://github.com/neondatabase/neon/pull/7113/commits/9c7ea364e04835b894a33136ca26e0cdb8cd6e30 --- control_plane/src/local_env.rs | 8 +++++++- control_plane/src/pageserver.rs | 30 +++++++++++++++++++++-------- test_runner/fixtures/parametrize.py | 2 +- 3 files changed, 30 insertions(+), 10 deletions(-) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 2e64489432..c7f22cc8f8 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -114,7 +114,7 @@ impl NeonBroker { } #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] -#[serde(default)] +#[serde(default, deny_unknown_fields)] pub struct PageServerConf { // node id pub id: NodeId, @@ -126,6 +126,9 @@ pub struct PageServerConf { // auth type used for the PG and HTTP ports pub pg_auth_type: AuthType, pub http_auth_type: AuthType, + + pub(crate) virtual_file_io_engine: String, + pub(crate) get_vectored_impl: String, } impl Default for PageServerConf { @@ -136,6 +139,9 @@ impl Default for PageServerConf { listen_http_addr: String::new(), pg_auth_type: AuthType::Trust, http_auth_type: AuthType::Trust, + // FIXME: use the ones exposed by pageserver crate + virtual_file_io_engine: "tokio-epoll-uring".to_owned(), + get_vectored_impl: "sequential".to_owned(), } } } diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 06ec942895..ab2f80fb0c 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -78,18 +78,31 @@ impl PageServerNode { /// /// These all end up on the command line of the `pageserver` binary. fn neon_local_overrides(&self, cli_overrides: &[&str]) -> Vec { - let id = format!("id={}", self.conf.id); // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc. 
let pg_distrib_dir_param = format!( "pg_distrib_dir='{}'", self.env.pg_distrib_dir_raw().display() ); - let http_auth_type_param = format!("http_auth_type='{}'", self.conf.http_auth_type); - let listen_http_addr_param = format!("listen_http_addr='{}'", self.conf.listen_http_addr); + let PageServerConf { + id, + listen_pg_addr, + listen_http_addr, + pg_auth_type, + http_auth_type, + virtual_file_io_engine, + get_vectored_impl, + } = &self.conf; - let pg_auth_type_param = format!("pg_auth_type='{}'", self.conf.pg_auth_type); - let listen_pg_addr_param = format!("listen_pg_addr='{}'", self.conf.listen_pg_addr); + let id = format!("id={}", id); + + let http_auth_type_param = format!("http_auth_type='{}'", http_auth_type); + let listen_http_addr_param = format!("listen_http_addr='{}'", listen_http_addr); + + let pg_auth_type_param = format!("pg_auth_type='{}'", pg_auth_type); + let listen_pg_addr_param = format!("listen_pg_addr='{}'", listen_pg_addr); + let virtual_file_io_engine = format!("virtual_file_io_engine='{virtual_file_io_engine}'"); + let get_vectored_impl = format!("get_vectored_impl='{get_vectored_impl}'"); let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url()); @@ -101,6 +114,8 @@ impl PageServerNode { listen_http_addr_param, listen_pg_addr_param, broker_endpoint_param, + virtual_file_io_engine, + get_vectored_impl, ]; if let Some(control_plane_api) = &self.env.control_plane_api { @@ -111,7 +126,7 @@ impl PageServerNode { // Storage controller uses the same auth as pageserver: if JWT is enabled // for us, we will also need it to talk to them. - if matches!(self.conf.http_auth_type, AuthType::NeonJWT) { + if matches!(http_auth_type, AuthType::NeonJWT) { let jwt_token = self .env .generate_auth_token(&Claims::new(None, Scope::GenerationsApi)) @@ -129,8 +144,7 @@ impl PageServerNode { )); } - if self.conf.http_auth_type != AuthType::Trust || self.conf.pg_auth_type != AuthType::Trust - { + if *http_auth_type != AuthType::Trust || *pg_auth_type != AuthType::Trust { // Keys are generated in the toplevel repo dir, pageservers' workdirs // are one level below that, so refer to keys with ../ overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned()); diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py index b28da83508..c8ab550ad7 100644 --- a/test_runner/fixtures/parametrize.py +++ b/test_runner/fixtures/parametrize.py @@ -28,7 +28,7 @@ def platform() -> Optional[str]: @pytest.fixture(scope="function", autouse=True) def pageserver_virtual_file_io_engine() -> Optional[str]: - return None + return os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE") def pytest_generate_tests(metafunc: Metafunc): From 9fe0193e5154c7ac24093c02587c6669820c5fa6 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 14 Mar 2024 07:14:53 +0300 Subject: [PATCH 0400/1571] Bump vendor/postgres v15 v14. 
---
 vendor/postgres-v14   | 2 +-
 vendor/postgres-v15   | 2 +-
 vendor/revisions.json | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index b980d6f090..3b09894ddb 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit b980d6f090c676e55fb2c830fb2434f532f635c0
+Subproject commit 3b09894ddb8825b50c963942059eab1a2a0b0a89
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 56f32c0e73..80cef885ad 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 56f32c0e7330d17aaeee8bf211a73995180bd133
+Subproject commit 80cef885add1af6741aa31944c7d2c84d8f9098f
diff --git a/vendor/revisions.json b/vendor/revisions.json
index 1941c235ee..ae524d70b1 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,5 +1,5 @@
 {
   "postgres-v16": "90078947229aa7f9ac5f7ed4527b2c7386d5332b",
-  "postgres-v15": "56f32c0e7330d17aaeee8bf211a73995180bd133",
-  "postgres-v14": "b980d6f090c676e55fb2c830fb2434f532f635c0"
+  "postgres-v15": "80cef885add1af6741aa31944c7d2c84d8f9098f",
+  "postgres-v14": "3b09894ddb8825b50c963942059eab1a2a0b0a89"
 }

From 38767ace686d191f6ae3f01df6ad5d682976bde2 Mon Sep 17 00:00:00 2001
From: Vlad Lazar
Date: Thu, 14 Mar 2024 15:21:36 +0000
Subject: [PATCH 0401/1571] storage_controller: periodic pageserver heartbeats
 (#7092)

## Problem

If a pageserver was offline when the storage controller started, there was no
mechanism to update the storage controller state when the pageserver becomes
active.

## Summary of changes

* Add a heartbeater module. The heartbeater must be driven by an external loop.
* Integrate the heartbeater into the service.
  - Extend the types used by the service and scheduler to keep track of a
    node's utilisation score.
- Add a background loop to drive the heartbeater and update the state based on the deltas it generated - Do an initial round of heartbeats at start-up --- Cargo.lock | 1 + control_plane/Cargo.toml | 1 + .../attachment_service/src/heartbeater.rs | 227 ++++++++++++++ control_plane/attachment_service/src/http.rs | 11 +- control_plane/attachment_service/src/lib.rs | 1 + control_plane/attachment_service/src/main.rs | 10 +- control_plane/attachment_service/src/node.rs | 29 +- .../attachment_service/src/scheduler.rs | 34 ++- .../attachment_service/src/service.rs | 277 ++++++++++++++---- control_plane/src/storage_controller.rs | 7 + libs/pageserver_api/src/controller_api.rs | 62 +++- libs/pageserver_api/src/models/utilization.rs | 15 +- pageserver/client/src/mgmt_api.rs | 9 + pageserver/src/http/routes.rs | 4 + test_runner/fixtures/neon_fixtures.py | 8 + .../regress/test_pageserver_generations.py | 2 + test_runner/regress/test_sharding_service.py | 169 +++++++++++ 17 files changed, 779 insertions(+), 88 deletions(-) create mode 100644 control_plane/attachment_service/src/heartbeater.rs diff --git a/Cargo.lock b/Cargo.lock index 45397eb4a2..b8b276d74f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1346,6 +1346,7 @@ dependencies = [ "futures", "git-version", "hex", + "humantime", "hyper", "nix 0.27.1", "once_cell", diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 75e5dcb7f8..b544a8c587 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -12,6 +12,7 @@ clap.workspace = true comfy-table.workspace = true futures.workspace = true git-version.workspace = true +humantime.workspace = true nix.workspace = true once_cell.workspace = true postgres.workspace = true diff --git a/control_plane/attachment_service/src/heartbeater.rs b/control_plane/attachment_service/src/heartbeater.rs new file mode 100644 index 0000000000..e15de28920 --- /dev/null +++ b/control_plane/attachment_service/src/heartbeater.rs @@ -0,0 +1,227 @@ +use futures::{stream::FuturesUnordered, StreamExt}; +use std::{ + collections::HashMap, + sync::Arc, + time::{Duration, Instant}, +}; +use tokio_util::sync::CancellationToken; + +use pageserver_api::{ + controller_api::{NodeAvailability, UtilizationScore}, + models::PageserverUtilization, +}; + +use thiserror::Error; +use utils::id::NodeId; + +use crate::node::Node; + +struct HeartbeaterTask { + receiver: tokio::sync::mpsc::UnboundedReceiver, + cancel: CancellationToken, + + state: HashMap, + + max_unavailable_interval: Duration, + jwt_token: Option, +} + +#[derive(Debug, Clone)] +pub(crate) enum PageserverState { + Available { + last_seen_at: Instant, + utilization: PageserverUtilization, + }, + Offline, +} + +#[derive(Debug)] +pub(crate) struct AvailablityDeltas(pub Vec<(NodeId, PageserverState)>); + +#[derive(Debug, Error)] +pub(crate) enum HeartbeaterError { + #[error("Cancelled")] + Cancel, +} + +struct HeartbeatRequest { + pageservers: Arc>, + reply: tokio::sync::oneshot::Sender>, +} + +pub(crate) struct Heartbeater { + sender: tokio::sync::mpsc::UnboundedSender, +} + +impl Heartbeater { + pub(crate) fn new( + jwt_token: Option, + max_unavailable_interval: Duration, + cancel: CancellationToken, + ) -> Self { + let (sender, receiver) = tokio::sync::mpsc::unbounded_channel::(); + let mut heartbeater = + HeartbeaterTask::new(receiver, jwt_token, max_unavailable_interval, cancel); + tokio::task::spawn(async move { heartbeater.run().await }); + + Self { sender } + } + + pub(crate) async fn heartbeat( + &self, + pageservers: Arc>, + ) -> Result { + let 
(sender, receiver) = tokio::sync::oneshot::channel(); + self.sender + .send(HeartbeatRequest { + pageservers, + reply: sender, + }) + .unwrap(); + + receiver.await.unwrap() + } +} + +impl HeartbeaterTask { + fn new( + receiver: tokio::sync::mpsc::UnboundedReceiver, + jwt_token: Option, + max_unavailable_interval: Duration, + cancel: CancellationToken, + ) -> Self { + Self { + receiver, + cancel, + state: HashMap::new(), + max_unavailable_interval, + jwt_token, + } + } + + async fn run(&mut self) { + loop { + tokio::select! { + request = self.receiver.recv() => { + match request { + Some(req) => { + let res = self.heartbeat(req.pageservers).await; + req.reply.send(res).unwrap(); + }, + None => { return; } + } + }, + _ = self.cancel.cancelled() => return + } + } + } + + async fn heartbeat( + &mut self, + pageservers: Arc>, + ) -> Result { + let mut new_state = HashMap::new(); + + let mut heartbeat_futs = FuturesUnordered::new(); + for (node_id, node) in &*pageservers { + heartbeat_futs.push({ + let jwt_token = self.jwt_token.clone(); + let cancel = self.cancel.clone(); + + // Clone the node and mark it as available such that the request + // goes through to the pageserver even when the node is marked offline. + // This doesn't impact the availability observed by [`crate::service::Service`]. + let mut node = node.clone(); + node.set_availability(NodeAvailability::Active(UtilizationScore::worst())); + + async move { + let response = node + .with_client_retries( + |client| async move { client.get_utilization().await }, + &jwt_token, + 2, + 3, + Duration::from_secs(1), + &cancel, + ) + .await; + + let response = match response { + Some(r) => r, + None => { + // This indicates cancellation of the request. + // We ignore the node in this case. + return None; + } + }; + + let status = if let Ok(utilization) = response { + PageserverState::Available { + last_seen_at: Instant::now(), + utilization, + } + } else { + PageserverState::Offline + }; + + Some((*node_id, status)) + } + }); + + loop { + let maybe_status = tokio::select! { + next = heartbeat_futs.next() => { + match next { + Some(result) => result, + None => { break; } + } + }, + _ = self.cancel.cancelled() => { return Err(HeartbeaterError::Cancel); } + }; + + if let Some((node_id, status)) = maybe_status { + new_state.insert(node_id, status); + } + } + } + + let mut deltas = Vec::new(); + let now = Instant::now(); + for (node_id, ps_state) in new_state { + use std::collections::hash_map::Entry::*; + let entry = self.state.entry(node_id); + + let mut needs_update = false; + match entry { + Occupied(ref occ) => match (occ.get(), &ps_state) { + (PageserverState::Offline, PageserverState::Offline) => {} + (PageserverState::Available { last_seen_at, .. 
}, PageserverState::Offline) => { + if now - *last_seen_at >= self.max_unavailable_interval { + deltas.push((node_id, ps_state.clone())); + needs_update = true; + } + } + _ => { + deltas.push((node_id, ps_state.clone())); + needs_update = true; + } + }, + Vacant(_) => { + deltas.push((node_id, ps_state.clone())); + } + } + + match entry { + Occupied(mut occ) if needs_update => { + (*occ.get_mut()) = ps_state; + } + Vacant(vac) => { + vac.insert(ps_state); + } + _ => {} + } + } + + Ok(AvailablityDeltas(deltas)) + } +} diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs index d26652cc94..560a05e908 100644 --- a/control_plane/attachment_service/src/http.rs +++ b/control_plane/attachment_service/src/http.rs @@ -28,7 +28,7 @@ use utils::{ }; use pageserver_api::controller_api::{ - NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest, + NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest, }; use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest}; @@ -389,7 +389,14 @@ async fn handle_node_configure(mut req: Request) -> Result, json_response( StatusCode::OK, - state.service.node_configure(config_req).await?, + state + .service + .node_configure( + config_req.node_id, + config_req.availability.map(NodeAvailability::from), + config_req.scheduling, + ) + .await?, ) } diff --git a/control_plane/attachment_service/src/lib.rs b/control_plane/attachment_service/src/lib.rs index a017bc1ecc..4aff29f15b 100644 --- a/control_plane/attachment_service/src/lib.rs +++ b/control_plane/attachment_service/src/lib.rs @@ -3,6 +3,7 @@ use utils::seqwait::MonotonicCounter; mod auth; mod compute_hook; +mod heartbeater; pub mod http; mod id_lock_map; pub mod metrics; diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs index fb7b363c39..0a925a63f6 100644 --- a/control_plane/attachment_service/src/main.rs +++ b/control_plane/attachment_service/src/main.rs @@ -2,7 +2,7 @@ use anyhow::{anyhow, Context}; use attachment_service::http::make_router; use attachment_service::metrics::preinitialize_metrics; use attachment_service::persistence::Persistence; -use attachment_service::service::{Config, Service}; +use attachment_service::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT}; use aws_config::{BehaviorVersion, Region}; use camino::Utf8PathBuf; use clap::Parser; @@ -54,6 +54,10 @@ struct Cli { /// URL to connect to postgres, like postgresql://localhost:1234/attachment_service #[arg(long)] database_url: Option, + + /// Grace period before marking unresponsive pageserver offline + #[arg(long)] + max_unavailable_interval: Option, } /// Secrets may either be provided on the command line (for testing), or loaded from AWS SecretManager: this @@ -249,6 +253,10 @@ async fn async_main() -> anyhow::Result<()> { jwt_token: secrets.jwt_token, control_plane_jwt_token: secrets.control_plane_jwt_token, compute_hook_url: args.compute_hook_url, + max_unavailable_interval: args + .max_unavailable_interval + .map(humantime::Duration::into) + .unwrap_or(MAX_UNAVAILABLE_INTERVAL_DEFAULT), }; // After loading secrets & config, but before starting anything else, apply database migrations diff --git a/control_plane/attachment_service/src/node.rs b/control_plane/attachment_service/src/node.rs index dda8a155c6..4167782715 100644 --- a/control_plane/attachment_service/src/node.rs +++ b/control_plane/attachment_service/src/node.rs @@ -12,7 +12,7 @@ use serde::Serialize; use 
tokio_util::sync::CancellationToken; use utils::{backoff, id::NodeId}; -use crate::persistence::NodePersistence; +use crate::{persistence::NodePersistence, scheduler::MaySchedule}; /// Represents the in-memory description of a Node. /// @@ -111,8 +111,8 @@ impl Node { use NodeAvailability::*; match (self.availability, availability) { - (Offline, Active) => ToActive, - (Active, Offline) => ToOffline, + (Offline, Active(_)) => ToActive, + (Active(_), Offline) => ToOffline, _ => Unchanged, } } @@ -123,21 +123,21 @@ impl Node { // a reference to the original Node's cancellation status. Checking both of these results // in a "pessimistic" check where we will consider a Node instance unavailable if it was unavailable // when we cloned it, or if the original Node instance's cancellation token was fired. - matches!(self.availability, NodeAvailability::Active) && !self.cancel.is_cancelled() + matches!(self.availability, NodeAvailability::Active(_)) && !self.cancel.is_cancelled() } /// Is this node elegible to have work scheduled onto it? - pub(crate) fn may_schedule(&self) -> bool { - match self.availability { - NodeAvailability::Active => {} - NodeAvailability::Offline => return false, - } + pub(crate) fn may_schedule(&self) -> MaySchedule { + let score = match self.availability { + NodeAvailability::Active(score) => score, + NodeAvailability::Offline => return MaySchedule::No, + }; match self.scheduling { - NodeSchedulingPolicy::Active => true, - NodeSchedulingPolicy::Draining => false, - NodeSchedulingPolicy::Filling => true, - NodeSchedulingPolicy::Pause => false, + NodeSchedulingPolicy::Active => MaySchedule::Yes(score), + NodeSchedulingPolicy::Draining => MaySchedule::No, + NodeSchedulingPolicy::Filling => MaySchedule::Yes(score), + NodeSchedulingPolicy::Pause => MaySchedule::No, } } @@ -155,8 +155,7 @@ impl Node { listen_pg_addr, listen_pg_port, scheduling: NodeSchedulingPolicy::Filling, - // TODO: we shouldn't really call this Active until we've heartbeated it. - availability: NodeAvailability::Active, + availability: NodeAvailability::Offline, cancel: CancellationToken::new(), } } diff --git a/control_plane/attachment_service/src/scheduler.rs b/control_plane/attachment_service/src/scheduler.rs index 26a2707e8d..981ba26cce 100644 --- a/control_plane/attachment_service/src/scheduler.rs +++ b/control_plane/attachment_service/src/scheduler.rs @@ -1,4 +1,5 @@ use crate::{node::Node, tenant_state::TenantState}; +use pageserver_api::controller_api::UtilizationScore; use serde::Serialize; use std::collections::HashMap; use utils::{http::error::ApiError, id::NodeId}; @@ -19,15 +20,34 @@ impl From for ApiError { } #[derive(Serialize, Eq, PartialEq)] +pub enum MaySchedule { + Yes(UtilizationScore), + No, +} + +#[derive(Serialize)] struct SchedulerNode { /// How many shards are currently scheduled on this node, via their [`crate::tenant_state::IntentState`]. shard_count: usize, /// Whether this node is currently elegible to have new shards scheduled (this is derived /// from a node's availability state and scheduling policy). 
- may_schedule: bool, + may_schedule: MaySchedule, } +impl PartialEq for SchedulerNode { + fn eq(&self, other: &Self) -> bool { + let may_schedule_matches = matches!( + (&self.may_schedule, &other.may_schedule), + (MaySchedule::Yes(_), MaySchedule::Yes(_)) | (MaySchedule::No, MaySchedule::No) + ); + + may_schedule_matches && self.shard_count == other.shard_count + } +} + +impl Eq for SchedulerNode {} + /// This type is responsible for selecting which node is used when a tenant shard needs to choose a pageserver /// on which to run. /// @@ -186,13 +206,15 @@ impl Scheduler { return None; } + // TODO: When the utilization score returned by the pageserver becomes meaningful, + // schedule based on that instead of the shard count. let node = nodes .iter() .map(|node_id| { let may_schedule = self .nodes .get(node_id) - .map(|n| n.may_schedule) + .map(|n| n.may_schedule != MaySchedule::No) .unwrap_or(false); (*node_id, may_schedule) }) @@ -211,7 +233,7 @@ impl Scheduler { .nodes .iter() .filter_map(|(k, v)| { - if hard_exclude.contains(k) || !v.may_schedule { + if hard_exclude.contains(k) || v.may_schedule == MaySchedule::No { None } else { Some((*k, v.shard_count)) @@ -230,7 +252,7 @@ impl Scheduler { for (node_id, node) in &self.nodes { tracing::info!( "Node {node_id}: may_schedule={} shards={}", - node.may_schedule, + node.may_schedule != MaySchedule::No, node.shard_count ); } @@ -255,6 +277,7 @@ impl Scheduler { pub(crate) mod test_utils { use crate::node::Node; + use pageserver_api::controller_api::{NodeAvailability, UtilizationScore}; use std::collections::HashMap; use utils::id::NodeId; /// Test helper: synthesize the requested number of nodes, all in active state. @@ -264,13 +287,14 @@ pub(crate) mod test_utils { (1..n + 1) .map(|i| { (NodeId(i), { - let node = Node::new( + let mut node = Node::new( NodeId(i), format!("httphost-{i}"), 80 + i as u16, format!("pghost-{i}"), 5432 + i as u16, ); + node.set_availability(NodeAvailability::Active(UtilizationScore::worst())); assert!(node.is_available()); node }) diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index 1b85081666..ac61209c38 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -16,19 +16,13 @@ use diesel::result::DatabaseErrorKind; use futures::{stream::FuturesUnordered, StreamExt}; use hyper::StatusCode; use pageserver_api::{ - controller_api::{ - NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, PlacementPolicy, - TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse, - TenantShardMigrateRequest, TenantShardMigrateResponse, - }, - models::TenantConfigRequest, -}; -use pageserver_api::{ + controller_api::UtilizationScore, models::{ - self, LocationConfig, LocationConfigListResponse, LocationConfigMode, ShardParameters, - TenantConfig, TenantCreateRequest, TenantLocationConfigRequest, - TenantLocationConfigResponse, TenantShardLocation, TenantShardSplitRequest, - TenantShardSplitResponse, TenantTimeTravelRequest, TimelineCreateRequest, TimelineInfo, + self, LocationConfig, LocationConfigListResponse, LocationConfigMode, + PageserverUtilization, ShardParameters, TenantConfig, TenantCreateRequest, + TenantLocationConfigRequest, TenantLocationConfigResponse, TenantShardLocation, + TenantShardSplitRequest, TenantShardSplitResponse, TenantTimeTravelRequest, + TimelineCreateRequest, TimelineInfo, }, shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId}, 
upcall_api::{ @@ -36,6 +30,14 @@ use pageserver_api::{ ValidateResponse, ValidateResponseTenant, }, }; +use pageserver_api::{ + controller_api::{ + NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy, + TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse, + TenantShardMigrateRequest, TenantShardMigrateResponse, + }, + models::TenantConfigRequest, +}; use pageserver_client::mgmt_api; use tokio::sync::OwnedRwLockWriteGuard; use tokio_util::sync::CancellationToken; @@ -51,6 +53,7 @@ use utils::{ use crate::{ compute_hook::{self, ComputeHook}, + heartbeater::{Heartbeater, PageserverState}, node::{AvailabilityTransition, Node}, persistence::{split_state::SplitState, DatabaseError, Persistence, TenantShardPersistence}, reconciler::attached_location_conf, @@ -78,6 +81,8 @@ const INITIAL_GENERATION: Generation = Generation::new(0); /// up on unresponsive pageservers and proceed. pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); +pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30); + // Top level state available to all HTTP handlers struct ServiceState { tenants: BTreeMap, @@ -125,6 +130,11 @@ pub struct Config { /// (this URL points to the control plane in prod). If this is None, the compute hook will /// assume it is running in a test environment and try to update neon_local. pub compute_hook_url: Option, + + /// Grace period within which a pageserver does not respond to heartbeats, but is still + /// considered active. Once the grace period elapses, the next heartbeat failure will + /// mark the pagseserver offline. + pub max_unavailable_interval: Duration, } impl From for ApiError { @@ -149,6 +159,8 @@ pub struct Service { compute_hook: Arc, result_tx: tokio::sync::mpsc::UnboundedSender, + heartbeater: Heartbeater, + // Channel for background cleanup from failed operations that require cleanup, such as shard split abort_tx: tokio::sync::mpsc::UnboundedSender, @@ -232,8 +244,6 @@ impl Service { let mut observed: HashMap)>> = HashMap::new(); - let mut nodes_online = HashSet::new(); - // Startup reconciliation does I/O to other services: whether they // are responsive or not, we should aim to finish within our deadline, because: // - If we don't, a k8s readiness hook watching /ready will kill us. @@ -255,6 +265,9 @@ impl Service { let mut cleanup = Vec::new(); let node_listings = self.scan_node_locations(node_scan_deadline).await; + // Send initial heartbeat requests to nodes that replied to the location listing above. + let nodes_online = self.initial_heartbeat_round(node_listings.keys()).await; + for (node_id, list_response) in node_listings { let tenant_shards = list_response.tenant_shards; tracing::info!( @@ -262,7 +275,6 @@ impl Service { tenant_shards.len(), node_id ); - nodes_online.insert(node_id); for (tenant_shard_id, conf_opt) in tenant_shards { let shard_observations = observed.entry(tenant_shard_id).or_default(); @@ -281,8 +293,10 @@ impl Service { // Mark nodes online if they responded to us: nodes are offline by default after a restart. 
let mut new_nodes = (**nodes).clone(); for (node_id, node) in new_nodes.iter_mut() { - if nodes_online.contains(node_id) { - node.set_availability(NodeAvailability::Active); + if let Some(utilization) = nodes_online.get(node_id) { + node.set_availability(NodeAvailability::Active(UtilizationScore( + utilization.utilization_score, + ))); scheduler.node_upsert(node); } } @@ -371,6 +385,49 @@ impl Service { tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)"); } + async fn initial_heartbeat_round<'a>( + &self, + node_ids: impl Iterator, + ) -> HashMap { + assert!(!self.startup_complete.is_ready()); + + let all_nodes = { + let locked = self.inner.read().unwrap(); + locked.nodes.clone() + }; + + let mut nodes_to_heartbeat = HashMap::new(); + for node_id in node_ids { + match all_nodes.get(node_id) { + Some(node) => { + nodes_to_heartbeat.insert(*node_id, node.clone()); + } + None => { + tracing::warn!("Node {node_id} was removed during start-up"); + } + } + } + + let res = self + .heartbeater + .heartbeat(Arc::new(nodes_to_heartbeat)) + .await; + + let mut online_nodes = HashMap::new(); + if let Ok(deltas) = res { + for (node_id, status) in deltas.0 { + match status { + PageserverState::Available { utilization, .. } => { + online_nodes.insert(node_id, utilization); + } + PageserverState::Offline => {} + } + } + } + + online_nodes + } + /// Used during [`Self::startup_reconcile`]: issue GETs to all nodes concurrently, with a deadline. /// /// The result includes only nodes which responded within the deadline @@ -391,7 +448,7 @@ impl Service { node_list_futs.push({ async move { tracing::info!("Scanning shards on node {node}..."); - let timeout = Duration::from_secs(5); + let timeout = Duration::from_secs(1); let response = node .with_client_retries( |client| async move { client.list_location_config().await }, @@ -586,6 +643,56 @@ impl Service { } } } + #[instrument(skip_all)] + async fn spawn_heartbeat_driver(&self) { + self.startup_complete.clone().wait().await; + + const HEARTBEAT_INTERVAL: Duration = Duration::from_secs(5); + + let mut interval = tokio::time::interval(HEARTBEAT_INTERVAL); + while !self.cancel.is_cancelled() { + tokio::select! { + _ = interval.tick() => { } + _ = self.cancel.cancelled() => return + }; + + let nodes = { + let locked = self.inner.read().unwrap(); + locked.nodes.clone() + }; + + let res = self.heartbeater.heartbeat(nodes).await; + if let Ok(deltas) = res { + for (node_id, state) in deltas.0 { + let new_availability = match state { + PageserverState::Available { utilization, .. } => NodeAvailability::Active( + UtilizationScore(utilization.utilization_score), + ), + PageserverState::Offline => NodeAvailability::Offline, + }; + let res = self + .node_configure(node_id, Some(new_availability), None) + .await; + + match res { + Ok(()) => {} + Err(ApiError::NotFound(_)) => { + // This should be rare, but legitimate since the heartbeats are done + // on a snapshot of the nodes. 
+ tracing::info!("Node {} was not found after heartbeat round", node_id); + } + Err(err) => { + tracing::error!( + "Failed to update node {} after heartbeat round: {}", + node_id, + err + ); + } + } + } + } + } + } /// Apply the contents of a [`ReconcileResult`] to our in-memory state: if the reconciliation /// was successful, this will update the observed state of the tenant such that subsequent @@ -836,6 +943,12 @@ impl Service { let (startup_completion, startup_complete) = utils::completion::channel(); + let cancel = CancellationToken::new(); + let heartbeater = Heartbeater::new( + config.jwt_token.clone(), + config.max_unavailable_interval, + cancel.clone(), + ); let this = Arc::new(Self { inner: Arc::new(std::sync::RwLock::new(ServiceState::new( nodes, tenants, scheduler, @@ -844,9 +957,10 @@ impl Service { persistence, compute_hook: Arc::new(ComputeHook::new(config)), result_tx, + heartbeater, abort_tx, startup_complete: startup_complete.clone(), - cancel: CancellationToken::new(), + cancel, gate: Gate::default(), tenant_op_locks: Default::default(), node_op_locks: Default::default(), @@ -899,13 +1013,28 @@ impl Service { }; this.startup_reconcile().await; - drop(startup_completion); + } + }); + tokio::task::spawn({ + let this = this.clone(); + let startup_complete = startup_complete.clone(); + async move { + startup_complete.wait().await; this.background_reconcile().await; } }); + tokio::task::spawn({ + let this = this.clone(); + let startup_complete = startup_complete.clone(); + async move { + startup_complete.wait().await; + this.spawn_heartbeat_driver().await; + } + }); + Ok(this) } @@ -964,11 +1093,37 @@ impl Service { } let new_generation = if let Some(req_node_id) = attach_req.node_id { - Some( - self.persistence - .increment_generation(attach_req.tenant_shard_id, req_node_id) - .await?, - ) + let maybe_tenant_conf = { + let locked = self.inner.write().unwrap(); + locked + .tenants + .get(&attach_req.tenant_shard_id) + .map(|t| t.config.clone()) + }; + + match maybe_tenant_conf { + Some(conf) => { + let new_generation = self + .persistence + .increment_generation(attach_req.tenant_shard_id, req_node_id) + .await?; + + // Persist the placement policy update. This is required + // when we reattaching a detached tenant. + self.persistence + .update_tenant_shard( + attach_req.tenant_shard_id, + PlacementPolicy::Single, + conf, + None, + ) + .await?; + Some(new_generation) + } + None => { + anyhow::bail!("Attach hook handling raced with tenant removal") + } + } } else { self.persistence.detach(attach_req.tenant_shard_id).await?; None @@ -983,6 +1138,7 @@ impl Service { if let Some(new_generation) = new_generation { tenant_state.generation = Some(new_generation); + tenant_state.policy = PlacementPolicy::Single; } else { // This is a detach notification. We must update placement policy to avoid re-attaching // during background scheduling/reconciliation, or during storage controller restart. @@ -1085,7 +1241,7 @@ impl Service { // This Node is a mutable local copy: we will set it active so that we can use its // API client to reconcile with the node. The Node in [`Self::nodes`] will get updated // later. 
- node.set_availability(NodeAvailability::Active); + node.set_availability(NodeAvailability::Active(UtilizationScore::worst())); let configs = match node .with_client_retries( @@ -1196,7 +1352,7 @@ impl Service { // Apply the updated generation to our in-memory state let mut locked = self.inner.write().unwrap(); - let (nodes, tenants, _scheduler) = locked.parts_mut(); + let (nodes, tenants, scheduler) = locked.parts_mut(); let mut response = ReAttachResponse { tenants: Vec::new(), @@ -1271,7 +1427,8 @@ impl Service { if !node.is_available() { let mut new_nodes = (**nodes).clone(); if let Some(node) = new_nodes.get_mut(&reattach_req.node_id) { - node.set_availability(NodeAvailability::Active); + node.set_availability(NodeAvailability::Active(UtilizationScore::worst())); + scheduler.node_upsert(node); } let new_nodes = Arc::new(new_nodes); *nodes = new_nodes; @@ -3328,16 +3485,16 @@ impl Service { pub(crate) async fn node_configure( &self, - config_req: NodeConfigureRequest, + node_id: NodeId, + availability: Option, + scheduling: Option, ) -> Result<(), ApiError> { - let _node_lock = self.node_op_locks.exclusive(config_req.node_id).await; + let _node_lock = self.node_op_locks.exclusive(node_id).await; - if let Some(scheduling) = config_req.scheduling { + if let Some(scheduling) = scheduling { // Scheduling is a persistent part of Node: we must write updates to the database before // applying them in memory - self.persistence - .update_node(config_req.node_id, scheduling) - .await?; + self.persistence.update_node(node_id, scheduling).await?; } // If we're activating a node, then before setting it active we must reconcile any shard locations @@ -3346,12 +3503,12 @@ impl Service { // // The transition we calculate here remains valid later in the function because we hold the op lock on the node: // nothing else can mutate its availability while we run. - let availability_transition = if let Some(input_availability) = config_req.availability { + let availability_transition = if let Some(input_availability) = availability { let (activate_node, availability_transition) = { let locked = self.inner.read().unwrap(); - let Some(node) = locked.nodes.get(&config_req.node_id) else { + let Some(node) = locked.nodes.get(&node_id) else { return Err(ApiError::NotFound( - anyhow::anyhow!("Node {} not registered", config_req.node_id).into(), + anyhow::anyhow!("Node {} not registered", node_id).into(), )); }; @@ -3376,17 +3533,17 @@ impl Service { let mut new_nodes = (**nodes).clone(); - let Some(node) = new_nodes.get_mut(&config_req.node_id) else { + let Some(node) = new_nodes.get_mut(&node_id) else { return Err(ApiError::NotFound( anyhow::anyhow!("Node not registered").into(), )); }; - if let Some(availability) = &config_req.availability { + if let Some(availability) = &availability { node.set_availability(*availability); } - if let Some(scheduling) = config_req.scheduling { + if let Some(scheduling) = scheduling { node.set_scheduling(scheduling); // TODO: once we have a background scheduling ticker for fill/drain, kick it @@ -3401,25 +3558,23 @@ impl Service { // Modify scheduling state for any Tenants that are affected by a change in the node's availability state. 
match availability_transition { AvailabilityTransition::ToOffline => { - tracing::info!("Node {} transition to offline", config_req.node_id); + tracing::info!("Node {} transition to offline", node_id); let mut tenants_affected: usize = 0; for (tenant_shard_id, tenant_state) in tenants { - if let Some(observed_loc) = - tenant_state.observed.locations.get_mut(&config_req.node_id) - { + if let Some(observed_loc) = tenant_state.observed.locations.get_mut(&node_id) { // When a node goes offline, we set its observed configuration to None, indicating unknown: we will // not assume our knowledge of the node's configuration is accurate until it comes back online observed_loc.conf = None; } - if tenant_state.intent.demote_attached(config_req.node_id) { + if tenant_state.intent.demote_attached(node_id) { tenant_state.sequence = tenant_state.sequence.next(); match tenant_state.schedule(scheduler) { Err(e) => { // It is possible that some tenants will become unschedulable when too many pageservers // go offline: in this case there isn't much we can do other than make the issue observable. // TODO: give TenantState a scheduling error attribute to be queried later. - tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", config_req.node_id); + tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", node_id); } Ok(()) => { if self @@ -3435,17 +3590,15 @@ impl Service { tracing::info!( "Launched {} reconciler tasks for tenants affected by node {} going offline", tenants_affected, - config_req.node_id + node_id ) } AvailabilityTransition::ToActive => { - tracing::info!("Node {} transition to active", config_req.node_id); + tracing::info!("Node {} transition to active", node_id); // When a node comes back online, we must reconcile any tenant that has a None observed // location on the node. for tenant_state in locked.tenants.values_mut() { - if let Some(observed_loc) = - tenant_state.observed.locations.get_mut(&config_req.node_id) - { + if let Some(observed_loc) = tenant_state.observed.locations.get_mut(&node_id) { if observed_loc.conf.is_none() { self.maybe_reconcile_shard(tenant_state, &new_nodes); } @@ -3455,7 +3608,7 @@ impl Service { // TODO: in the background, we should balance work back onto this pageserver } AvailabilityTransition::Unchanged => { - tracing::info!("Node {} no change during config", config_req.node_id); + tracing::info!("Node {} no change during config", node_id); } } @@ -3534,17 +3687,23 @@ impl Service { ) } - /// Check all tenants for pending reconciliation work, and reconcile those in need + /// Check all tenants for pending reconciliation work, and reconcile those in need. + /// Additionally, reschedule tenants that require it. 
///
 /// Returns how many reconciliation tasks were started
 fn reconcile_all(&self) -> usize {
 let mut locked = self.inner.write().unwrap();
- let pageservers = locked.nodes.clone();
- locked
- .tenants
- .iter_mut()
- .filter_map(|(_tenant_shard_id, shard)| self.maybe_reconcile_shard(shard, &pageservers))
- .count()
+ let (nodes, tenants, _scheduler) = locked.parts_mut();
+ let pageservers = nodes.clone();
+
+ let mut reconciles_spawned = 0;
+ for (_tenant_shard_id, shard) in tenants.iter_mut() {
+ if self.maybe_reconcile_shard(shard, &pageservers).is_some() {
+ reconciles_spawned += 1;
+ }
+ }
+
+ reconciles_spawned
 }
 
 pub async fn shutdown(&self) {
diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs
index d7673f1b26..18014adba4 100644
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -38,6 +38,9 @@ const COMMAND: &str = "storage_controller";
 
 const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;
 
+// Use a shorter pageserver unavailability interval than the default to speed up tests.
+const NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10);
+
 #[derive(Serialize, Deserialize)]
 pub struct AttachHookRequest {
 pub tenant_shard_id: TenantShardId,
@@ -269,6 +272,8 @@ impl StorageController {
 // Run migrations on every startup, in case something changed.
 let database_url = self.setup_database().await?;
 
+ let max_unavailable: humantime::Duration = NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL.into();
+
 let mut args = vec![
 "-l",
 &self.listen,
 "-p",
 self.path.as_ref(),
 "--database-url",
 &database_url,
+ "--max-unavailable-interval",
+ &max_unavailable.to_string(),
 ]
 .into_iter()
 .map(|s| s.to_string())
diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs
index c172354e9f..6053e8b8ed 100644
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -35,7 +35,7 @@ pub struct NodeRegisterRequest {
 pub struct NodeConfigureRequest {
 pub node_id: NodeId,
 
- pub availability: Option<NodeAvailability>,
+ pub availability: Option<NodeAvailabilityWrapper>,
 pub scheduling: Option<NodeSchedulingPolicy>,
 }
 
@@ -66,22 +66,76 @@ pub struct TenantShardMigrateRequest {
 pub node_id: NodeId,
 }
 
-#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
+/// Utilisation score indicating how good a candidate a pageserver
+/// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`].
+/// Lower values are better.
+#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
+pub struct UtilizationScore(pub u64);
+
+impl UtilizationScore {
+ pub fn worst() -> Self {
+ UtilizationScore(u64::MAX)
+ }
+}
+
+#[derive(Serialize, Clone, Copy)]
+#[serde(into = "NodeAvailabilityWrapper")]
 pub enum NodeAvailability {
 // Normal, happy state
- Active,
+ Active(UtilizationScore),
 // Offline: Tenants shouldn't try to attach here, but they may assume that their
 // secondary locations on this node still exist. Newly added nodes are in this
 // state until we successfully contact them.
 Offline,
 }
 
+impl PartialEq for NodeAvailability {
+ fn eq(&self, other: &Self) -> bool {
+ use NodeAvailability::*;
+ matches!((self, other), (Active(_), Active(_)) | (Offline, Offline))
+ }
+}
+
+impl Eq for NodeAvailability {}
+
+// This wrapper provides serde functionality and it should only be used to
+// communicate with external callers which don't know or care about the
+// utilisation score of the pageserver it is targeting.
+#[derive(Serialize, Deserialize, Clone)]
+pub enum NodeAvailabilityWrapper {
+ Active,
+ Offline,
+}
+
+impl From<NodeAvailabilityWrapper> for NodeAvailability {
+ fn from(val: NodeAvailabilityWrapper) -> Self {
+ match val {
+ // Assume the worst utilisation score to begin with. It will later be updated by
+ // the heartbeats.
+ NodeAvailabilityWrapper::Active => NodeAvailability::Active(UtilizationScore::worst()),
+ NodeAvailabilityWrapper::Offline => NodeAvailability::Offline,
+ }
+ }
+}
+
+impl From<NodeAvailability> for NodeAvailabilityWrapper {
+ fn from(val: NodeAvailability) -> Self {
+ match val {
+ NodeAvailability::Active(_) => NodeAvailabilityWrapper::Active,
+ NodeAvailability::Offline => NodeAvailabilityWrapper::Offline,
+ }
+ }
+}
+
 impl FromStr for NodeAvailability {
 type Err = anyhow::Error;
 
 fn from_str(s: &str) -> Result<Self, Self::Err> {
 match s {
- "active" => Ok(Self::Active),
+ // This is used when parsing node configuration requests from neon-local.
+ // Assume the worst possible utilisation score
+ // and let it get updated via the heartbeats.
+ "active" => Ok(Self::Active(UtilizationScore::worst())),
 "offline" => Ok(Self::Offline),
 _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
 }
diff --git a/libs/pageserver_api/src/models/utilization.rs b/libs/pageserver_api/src/models/utilization.rs
index 7195a12395..f5984dff5d 100644
--- a/libs/pageserver_api/src/models/utilization.rs
+++ b/libs/pageserver_api/src/models/utilization.rs
@@ -7,7 +7,7 @@ use std::time::SystemTime;
 ///
 /// `format: int64` fields must use `ser_saturating_u63` because openapi generated clients might
 /// not handle full u64 values properly.
-#[derive(serde::Serialize, Debug)]
+#[derive(serde::Serialize, serde::Deserialize, Debug, Clone)]
 pub struct PageserverUtilization {
 /// Used disk space
 #[serde(serialize_with = "ser_saturating_u63")]
@@ -21,7 +21,10 @@ pub struct PageserverUtilization {
 /// When was this snapshot captured, pageserver local time.
 ///
 /// Use millis to give confidence that the value is regenerated often enough.
- #[serde(serialize_with = "ser_rfc3339_millis")]
+ #[serde(
+ serialize_with = "ser_rfc3339_millis",
+ deserialize_with = "deser_rfc3339_millis"
+ )]
 pub captured_at: SystemTime,
 }
 
@@ -32,6 +35,14 @@ fn ser_rfc3339_millis(
 serializer.collect_str(&humantime::format_rfc3339_millis(*ts))
 }
 
+fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<SystemTime, D::Error>
+where
+ D: serde::de::Deserializer<'de>,
+{
+ let s: String = serde::de::Deserialize::deserialize(deserializer)?;
+ humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom)
+}
+
 /// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients.
 ///
 /// Instead of newtype, use this because a newtype would get require handling deserializing values
diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs
index 2f22ebd54d..ed9f633253 100644
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -416,4 +416,13 @@ impl Client {
 .await
 .map_err(Error::ReceiveBody)
 }
+
+ pub async fn get_utilization(&self) -> Result<PageserverUtilization> {
+ let uri = format!("{}/v1/utilization", self.mgmt_api_endpoint);
+ self.get(uri)
+ .await?
+ .json()
+ .await
+ .map_err(Error::ReceiveBody)
+ }
 }
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index fc67f4cf8f..d6fe9f6055 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -2053,6 +2053,10 @@ async fn get_utilization(
 r: Request<Body>,
 _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
+ fail::fail_point!("get-utilization-http-handler", |_| {
+ Err(ApiError::ResourceUnavailable("failpoint".into()))
+ });
+
 // this probably could be completely public, but lets make that change later.
 check_permission(&r, None)?;
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 16ebc19698..70d3076371 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2088,6 +2088,14 @@ class NeonStorageController(MetricsGetter):
 )
 return response.json()
 
+ def tenant_list(self):
+ response = self.request(
+ "GET",
+ f"{self.env.storage_controller_api}/debug/v1/tenant",
+ headers=self.headers(TokenScope.ADMIN),
+ )
+ return response.json()
+
 def node_configure(self, node_id, body: dict[str, Any]):
 log.info(f"node_configure({node_id}, {body})")
 body["node_id"] = node_id
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py
index 3ca13a904d..56b4548b64 100644
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -209,10 +209,12 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
 env.storage_controller.node_register(env.pageserver)
 env.pageserver.start(overrides=('--pageserver-config-override=control_plane_api=""',))
+ env.storage_controller.node_configure(env.pageserver.id, {"availability": "Active"})
 
 env.neon_cli.create_tenant(
 tenant_id=env.initial_tenant, conf=TENANT_CONF, timeline_id=env.initial_timeline
 )
+
 generate_uploads_and_deletions(env, pageserver=env.pageserver)
 
 def parse_generation_suffix(key):
diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py
index 7a0707b564..27ea425bb1 100644
--- a/test_runner/regress/test_sharding_service.py
+++ b/test_runner/regress/test_sharding_service.py
@@ -769,3 +769,172 @@ def test_sharding_service_tenant_conf(neon_env_builder: NeonEnvBuilder):
 assert "pitr_interval" not in readback_ps.tenant_specific_overrides
 
 env.storage_controller.consistency_check()
+
+
+class Failure:
+ pageserver_id: int
+
+ def apply(self, env: NeonEnv):
+ raise NotImplementedError()
+
+ def clear(self, env: NeonEnv):
+ raise NotImplementedError()
+
+
+class NodeStop(Failure):
+ def __init__(self, pageserver_id, immediate):
+ self.pageserver_id = pageserver_id
+ self.immediate = immediate
+
+ def apply(self, env: NeonEnv):
+ pageserver = env.get_pageserver(self.pageserver_id)
+ pageserver.stop(immediate=self.immediate)
+
+ def clear(self, env: NeonEnv):
+ pageserver = env.get_pageserver(self.pageserver_id)
+ pageserver.start()
+
+
+class PageserverFailpoint(Failure):
+ def __init__(self, failpoint, pageserver_id):
+ self.failpoint = failpoint
+ self.pageserver_id = pageserver_id
+
+ def apply(self, env: NeonEnv):
+ pageserver = env.get_pageserver(self.pageserver_id)
+ pageserver.http_client().configure_failpoints((self.failpoint, "return(1)"))
+
+ def clear(self, env: NeonEnv):
+ pageserver = env.get_pageserver(self.pageserver_id)
+ pageserver.http_client().configure_failpoints((self.failpoint, "off"))
+
+
+def build_node_to_tenants_map(env: NeonEnv) -> dict[int,
list[TenantId]]: + tenants = env.storage_controller.tenant_list() + + node_to_tenants: dict[int, list[TenantId]] = {} + for t in tenants: + for node_id, loc_state in t["observed"]["locations"].items(): + if ( + loc_state is not None + and "conf" in loc_state + and loc_state["conf"] is not None + and loc_state["conf"]["mode"] == "AttachedSingle" + ): + crnt = node_to_tenants.get(int(node_id), []) + crnt.append(TenantId(t["tenant_shard_id"])) + node_to_tenants[int(node_id)] = crnt + + return node_to_tenants + + +@pytest.mark.parametrize( + "failure", + [ + NodeStop(pageserver_id=1, immediate=False), + NodeStop(pageserver_id=1, immediate=True), + PageserverFailpoint(pageserver_id=1, failpoint="get-utilization-http-handler"), + ], +) +def test_sharding_service_heartbeats( + neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, failure: Failure +): + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + env.start() + + # Initially we have two online pageservers + nodes = env.storage_controller.node_list() + assert len(nodes) == 2 + assert all([n["availability"] == "Active" for n in nodes]) + + # ... then we create two tenants and write some data into them + def create_tenant(tid: TenantId): + env.storage_controller.tenant_create(tid) + + branch_name = "main" + env.neon_cli.create_timeline( + branch_name, + tenant_id=tid, + ) + + with env.endpoints.create_start("main", tenant_id=tid) as endpoint: + run_pg_bench_small(pg_bin, endpoint.connstr()) + endpoint.safe_psql("CREATE TABLE created_foo(id integer);") + + tenant_ids = [TenantId.generate(), TenantId.generate()] + for tid in tenant_ids: + create_tenant(tid) + + # ... expecting that each tenant will be placed on a different node + def tenants_placed(): + node_to_tenants = build_node_to_tenants_map(env) + log.info(f"{node_to_tenants=}") + + # Check that all the tenants have been attached + assert sum((len(ts) for ts in node_to_tenants.values())) == len(tenant_ids) + # Check that each node got one tenant + assert all((len(ts) == 1 for ts in node_to_tenants.values())) + + wait_until(10, 1, tenants_placed) + + # ... then we apply the failure + offline_node_id = failure.pageserver_id + online_node_id = (set(range(1, len(env.pageservers) + 1)) - {offline_node_id}).pop() + env.get_pageserver(offline_node_id).allowed_errors.append( + # In the case of the failpoint failure, the impacted pageserver + # still believes it has the tenant attached since location + # config calls into it will fail due to being marked offline. + ".*Dropped remote consistent LSN updates.*", + ) + + failure.apply(env) + + # ... expecting the heartbeats to mark it offline + def node_offline(): + nodes = env.storage_controller.node_list() + log.info(f"{nodes=}") + target = next(n for n in nodes if n["id"] == offline_node_id) + assert target["availability"] == "Offline" + + # A node is considered offline if the last successful heartbeat + # was more than 10 seconds ago (hardcoded in the storage controller). + wait_until(20, 1, node_offline) + + # .. expecting the tenant on the offline node to be migrated + def tenant_migrated(): + node_to_tenants = build_node_to_tenants_map(env) + log.info(f"{node_to_tenants=}") + assert set(node_to_tenants[online_node_id]) == set(tenant_ids) + + wait_until(10, 1, tenant_migrated) + + # ... then we clear the failure + failure.clear(env) + + # ... 
expecting the offline node to become active again + def node_online(): + nodes = env.storage_controller.node_list() + target = next(n for n in nodes if n["id"] == offline_node_id) + assert target["availability"] == "Active" + + wait_until(10, 1, node_online) + + time.sleep(5) + + # ... then we create a new tenant + tid = TenantId.generate() + env.storage_controller.tenant_create(tid) + + # ... expecting it to be placed on the node that just came back online + tenants = env.storage_controller.tenant_list() + newest_tenant = next(t for t in tenants if t["tenant_shard_id"] == str(tid)) + locations = list(newest_tenant["observed"]["locations"].keys()) + locations = [int(node_id) for node_id in locations] + assert locations == [offline_node_id] + + # ... expecting the storage controller to reach a consistent state + def storage_controller_consistent(): + env.storage_controller.consistency_check() + + wait_until(10, 1, storage_controller_consistent) From 3d8830ac357a0b8d2084f1dfd6b0393ca912e74e Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 14 Mar 2024 16:47:32 +0000 Subject: [PATCH 0402/1571] test_runner: re-enable large slru benchmark (#7125) Previously disabled due to https://github.com/neondatabase/neon/issues/7006. --- .../pageserver/pagebench/test_large_slru_basebackup.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py index c98fa44b1a..324ef0d516 100644 --- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py +++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py @@ -1,6 +1,5 @@ import asyncio import json -import os from pathlib import Path from typing import Any, Dict, Tuple @@ -20,10 +19,6 @@ from performance.pageserver.util import ( @pytest.mark.parametrize("n_tenants", [10]) @pytest.mark.parametrize("get_vectored_impl", ["sequential", "vectored"]) @pytest.mark.timeout(1000) -@pytest.mark.skipif( - os.getenv("CI", "false") == "true", - reason="The test if flaky on CI: https://github.com/neondatabase/neon/issues/7006", -) def test_basebackup_with_high_slru_count( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, From 678ed39de2446bb02572bb7d9439f4a2fa31aeef Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 14 Mar 2024 16:48:38 +0000 Subject: [PATCH 0403/1571] storage controller: validate DNS of registering nodes (#7101) A node with a bad DNS configuration can register itself with the storage controller, and the controller will try and schedule work onto the node, but never succeed because it can't reach the node. The DNS case is a special case of asymmetric network issues. The general case isn't covered here -- but might make sense to tighten up after #6844 merges -- then we can avoid assuming a node is immediately available in re_attach. --- .../attachment_service/src/service.rs | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index ac61209c38..8439ea5567 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -3453,6 +3453,30 @@ impl Service { } } + // We do not require that a node is actually online when registered (it will start life + // with it's availability set to Offline), but we _do_ require that its DNS record exists. 
We're + // therefore not immune to asymmetric L3 connectivity issues, but we are protected against nodes + // that register themselves with a broken DNS config. We check only the HTTP hostname, because + // the postgres hostname might only be resolvable to clients (e.g. if we're on a different VPC than clients). + if tokio::net::lookup_host(format!( + "{}:{}", + register_req.listen_http_addr, register_req.listen_http_port + )) + .await + .is_err() + { + // If we have a transient DNS issue, it's up to the caller to retry their registration. Because + // we can't robustly distinguish between an intermittent issue and a totally bogus DNS situation, + // we return a soft 503 error, to encourage callers to retry past transient issues. + return Err(ApiError::ResourceUnavailable( + format!( + "Node {} tried to register with unknown DNS name '{}'", + register_req.node_id, register_req.listen_http_addr + ) + .into(), + )); + } + // Ordering: we must persist the new node _before_ adding it to in-memory state. // This ensures that before we use it for anything or expose it via any external // API, it is guaranteed to be available after a restart. From 58ef78cf4174fc1802e365f2d164976ce77cde7e Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 14 Mar 2024 20:49:42 +0200 Subject: [PATCH 0404/1571] doc(README): note cargo-nextest usage (#7122) We have been using #5681 for quite some time, and at least since #6931 the tests have assumed `cargo-nextest` to work around our use of global statics. Unlike the `cargo test`, the `cargo nextest run` runs each test as a separate process that can be timeouted. Add a mention of using `cargo-nextest` in the top-level README.md. Sub-crates can still declare they support `cargo test`, like `compute_tools/README.md` does. --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index c44ae695d6..00a90f4483 100644 --- a/README.md +++ b/README.md @@ -238,6 +238,14 @@ If you encounter errors during setting up the initial tenant, it's best to stop ## Running tests +### Rust unit tests + +We are using [`cargo-nextest`](https://nexte.st/) to run the tests in Github Workflows. +Some crates do not support running plain `cargo test` anymore, prefer `cargo nextest run` instead. +You can install `cargo-nextest` with `cargo install cargo-nextest`. + +### Integration tests + Ensure your dependencies are installed as described [here](https://github.com/neondatabase/neon#dependency-installation-notes). ```sh From 76c44dc140a61a359a1ce42f988074fdda910c3f Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Thu, 14 Mar 2024 15:45:38 -0400 Subject: [PATCH 0405/1571] spec: disable neon extension auto upgrade (#7128) This pull request disables neon extension auto upgrade to help the next compute image upgrade smooth. ## Summary of changes We have two places to auto-upgrade neon extension: during compute spec update, and when the compute node starts. The compute spec update logic is always there, and the compute node start logic is added in https://github.com/neondatabase/neon/pull/7029. In this pull request, we disable both of them, so that we can still roll back to an older version of compute before figuring out the best way of extension upgrade-downgrade. https://github.com/neondatabase/neon/issues/6936 We will enable auto-upgrade in the next release following this release. There are no other extension upgrades from release 4917 and therefore after this pull request, it would be safe to revert to release 4917. 
Impact:
* Projects created after unpinning the compute image -> if we need to roll back, **they will get stuck**, because the default neon extension version is 1.3. Need to manually pin the compute image version if such things happen.
* Projects already stuck on staging due to not being downgradeable -> I don't know their current status, maybe they are already running the latest compute image?
* Other projects -> can be rolled back to release 4917.

Signed-off-by: Alex Chi Z
---
 compute_tools/src/spec.rs | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index ba3a84cda8..3b596a88ff 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -743,19 +743,21 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
 // which may happen in two cases:
 // - extension was just installed
 // - extension was already installed and is up to date
- let query = "ALTER EXTENSION neon UPDATE";
- info!("update neon extension version with query: {}", query);
- client.simple_query(query)?;
+ // DISABLED due to compute node unpinning epic
+ // let query = "ALTER EXTENSION neon UPDATE";
+ // info!("update neon extension version with query: {}", query);
+ // client.simple_query(query)?;
 
 Ok(())
 }
 
 #[instrument(skip_all)]
-pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> {
- info!("handle neon extension upgrade");
- let query = "ALTER EXTENSION neon UPDATE";
- info!("update neon extension version with query: {}", query);
- client.simple_query(query)?;
+pub fn handle_neon_extension_upgrade(_client: &mut Client) -> Result<()> {
+ info!("handle neon extension upgrade (not really)");
+ // DISABLED due to compute node unpinning epic
+ // let query = "ALTER EXTENSION neon UPDATE";
+ // info!("update neon extension version with query: {}", query);
+ // client.simple_query(query)?;
 
 Ok(())
 }

From 49bc734e02c4c89911168d8a6ecb823afd676967 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate
Date: Fri, 15 Mar 2024 09:21:48 +0000
Subject: [PATCH 0406/1571] proxy: add websocket regression tests (#7121)

## Problem

We have no regression tests for the websocket flow

## Summary of changes

Add a hacky implementation of the postgres protocol over websockets just to
verify the protocol behaviour does not regress over time.
--- poetry.lock | 94 +++++++-- pyproject.toml | 1 + test_runner/regress/test_proxy_websockets.py | 189 +++++++++++++++++++ 3 files changed, 272 insertions(+), 12 deletions(-) create mode 100644 test_runner/regress/test_proxy_websockets.py diff --git a/poetry.lock b/poetry.lock index 832d7c4334..7b49daf42a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2182,7 +2182,6 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -2529,6 +2528,87 @@ docs = ["Sphinx (>=3.4)", "sphinx-rtd-theme (>=0.5)"] optional = ["python-socks", "wsaccel"] test = ["websockets"] +[[package]] +name = "websockets" +version = "12.0" +description = "An implementation of the WebSocket Protocol (RFC 6455 & 7692)" +optional = false +python-versions = ">=3.8" +files = [ + {file = "websockets-12.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d554236b2a2006e0ce16315c16eaa0d628dab009c33b63ea03f41c6107958374"}, + {file = "websockets-12.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2d225bb6886591b1746b17c0573e29804619c8f755b5598d875bb4235ea639be"}, + {file = "websockets-12.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:eb809e816916a3b210bed3c82fb88eaf16e8afcf9c115ebb2bacede1797d2547"}, + {file = "websockets-12.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c588f6abc13f78a67044c6b1273a99e1cf31038ad51815b3b016ce699f0d75c2"}, + {file = "websockets-12.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5aa9348186d79a5f232115ed3fa9020eab66d6c3437d72f9d2c8ac0c6858c558"}, + {file = "websockets-12.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6350b14a40c95ddd53e775dbdbbbc59b124a5c8ecd6fbb09c2e52029f7a9f480"}, + {file = "websockets-12.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:70ec754cc2a769bcd218ed8d7209055667b30860ffecb8633a834dde27d6307c"}, + {file = "websockets-12.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:6e96f5ed1b83a8ddb07909b45bd94833b0710f738115751cdaa9da1fb0cb66e8"}, + {file = "websockets-12.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:4d87be612cbef86f994178d5186add3d94e9f31cc3cb499a0482b866ec477603"}, + {file = "websockets-12.0-cp310-cp310-win32.whl", hash = "sha256:befe90632d66caaf72e8b2ed4d7f02b348913813c8b0a32fae1cc5fe3730902f"}, + {file = "websockets-12.0-cp310-cp310-win_amd64.whl", hash = "sha256:363f57ca8bc8576195d0540c648aa58ac18cf85b76ad5202b9f976918f4219cf"}, + {file = "websockets-12.0-cp311-cp311-macosx_10_9_universal2.whl", hash = 
"sha256:5d873c7de42dea355d73f170be0f23788cf3fa9f7bed718fd2830eefedce01b4"}, + {file = "websockets-12.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3f61726cae9f65b872502ff3c1496abc93ffbe31b278455c418492016e2afc8f"}, + {file = "websockets-12.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ed2fcf7a07334c77fc8a230755c2209223a7cc44fc27597729b8ef5425aa61a3"}, + {file = "websockets-12.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e332c210b14b57904869ca9f9bf4ca32f5427a03eeb625da9b616c85a3a506c"}, + {file = "websockets-12.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5693ef74233122f8ebab026817b1b37fe25c411ecfca084b29bc7d6efc548f45"}, + {file = "websockets-12.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e9e7db18b4539a29cc5ad8c8b252738a30e2b13f033c2d6e9d0549b45841c04"}, + {file = "websockets-12.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6e2df67b8014767d0f785baa98393725739287684b9f8d8a1001eb2839031447"}, + {file = "websockets-12.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:bea88d71630c5900690fcb03161ab18f8f244805c59e2e0dc4ffadae0a7ee0ca"}, + {file = "websockets-12.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:dff6cdf35e31d1315790149fee351f9e52978130cef6c87c4b6c9b3baf78bc53"}, + {file = "websockets-12.0-cp311-cp311-win32.whl", hash = "sha256:3e3aa8c468af01d70332a382350ee95f6986db479ce7af14d5e81ec52aa2b402"}, + {file = "websockets-12.0-cp311-cp311-win_amd64.whl", hash = "sha256:25eb766c8ad27da0f79420b2af4b85d29914ba0edf69f547cc4f06ca6f1d403b"}, + {file = "websockets-12.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0e6e2711d5a8e6e482cacb927a49a3d432345dfe7dea8ace7b5790df5932e4df"}, + {file = "websockets-12.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:dbcf72a37f0b3316e993e13ecf32f10c0e1259c28ffd0a85cee26e8549595fbc"}, + {file = "websockets-12.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:12743ab88ab2af1d17dd4acb4645677cb7063ef4db93abffbf164218a5d54c6b"}, + {file = "websockets-12.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7b645f491f3c48d3f8a00d1fce07445fab7347fec54a3e65f0725d730d5b99cb"}, + {file = "websockets-12.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9893d1aa45a7f8b3bc4510f6ccf8db8c3b62120917af15e3de247f0780294b92"}, + {file = "websockets-12.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1f38a7b376117ef7aff996e737583172bdf535932c9ca021746573bce40165ed"}, + {file = "websockets-12.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:f764ba54e33daf20e167915edc443b6f88956f37fb606449b4a5b10ba42235a5"}, + {file = "websockets-12.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:1e4b3f8ea6a9cfa8be8484c9221ec0257508e3a1ec43c36acdefb2a9c3b00aa2"}, + {file = "websockets-12.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:9fdf06fd06c32205a07e47328ab49c40fc1407cdec801d698a7c41167ea45113"}, + {file = "websockets-12.0-cp312-cp312-win32.whl", hash = "sha256:baa386875b70cbd81798fa9f71be689c1bf484f65fd6fb08d051a0ee4e79924d"}, + {file = "websockets-12.0-cp312-cp312-win_amd64.whl", hash = "sha256:ae0a5da8f35a5be197f328d4727dbcfafa53d1824fac3d96cdd3a642fe09394f"}, + {file = "websockets-12.0-cp38-cp38-macosx_10_9_universal2.whl", hash = 
"sha256:5f6ffe2c6598f7f7207eef9a1228b6f5c818f9f4d53ee920aacd35cec8110438"}, + {file = "websockets-12.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9edf3fc590cc2ec20dc9d7a45108b5bbaf21c0d89f9fd3fd1685e223771dc0b2"}, + {file = "websockets-12.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:8572132c7be52632201a35f5e08348137f658e5ffd21f51f94572ca6c05ea81d"}, + {file = "websockets-12.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:604428d1b87edbf02b233e2c207d7d528460fa978f9e391bd8aaf9c8311de137"}, + {file = "websockets-12.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1a9d160fd080c6285e202327aba140fc9a0d910b09e423afff4ae5cbbf1c7205"}, + {file = "websockets-12.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87b4aafed34653e465eb77b7c93ef058516cb5acf3eb21e42f33928616172def"}, + {file = "websockets-12.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:b2ee7288b85959797970114deae81ab41b731f19ebcd3bd499ae9ca0e3f1d2c8"}, + {file = "websockets-12.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:7fa3d25e81bfe6a89718e9791128398a50dec6d57faf23770787ff441d851967"}, + {file = "websockets-12.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:a571f035a47212288e3b3519944f6bf4ac7bc7553243e41eac50dd48552b6df7"}, + {file = "websockets-12.0-cp38-cp38-win32.whl", hash = "sha256:3c6cc1360c10c17463aadd29dd3af332d4a1adaa8796f6b0e9f9df1fdb0bad62"}, + {file = "websockets-12.0-cp38-cp38-win_amd64.whl", hash = "sha256:1bf386089178ea69d720f8db6199a0504a406209a0fc23e603b27b300fdd6892"}, + {file = "websockets-12.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:ab3d732ad50a4fbd04a4490ef08acd0517b6ae6b77eb967251f4c263011a990d"}, + {file = "websockets-12.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a1d9697f3337a89691e3bd8dc56dea45a6f6d975f92e7d5f773bc715c15dde28"}, + {file = "websockets-12.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1df2fbd2c8a98d38a66f5238484405b8d1d16f929bb7a33ed73e4801222a6f53"}, + {file = "websockets-12.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:23509452b3bc38e3a057382c2e941d5ac2e01e251acce7adc74011d7d8de434c"}, + {file = "websockets-12.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2e5fc14ec6ea568200ea4ef46545073da81900a2b67b3e666f04adf53ad452ec"}, + {file = "websockets-12.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:46e71dbbd12850224243f5d2aeec90f0aaa0f2dde5aeeb8fc8df21e04d99eff9"}, + {file = "websockets-12.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b81f90dcc6c85a9b7f29873beb56c94c85d6f0dac2ea8b60d995bd18bf3e2aae"}, + {file = "websockets-12.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:a02413bc474feda2849c59ed2dfb2cddb4cd3d2f03a2fedec51d6e959d9b608b"}, + {file = "websockets-12.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:bbe6013f9f791944ed31ca08b077e26249309639313fff132bfbf3ba105673b9"}, + {file = "websockets-12.0-cp39-cp39-win32.whl", hash = "sha256:cbe83a6bbdf207ff0541de01e11904827540aa069293696dd528a6640bd6a5f6"}, + {file = "websockets-12.0-cp39-cp39-win_amd64.whl", hash = "sha256:fc4e7fa5414512b481a2483775a8e8be7803a35b30ca805afa4998a84f9fd9e8"}, + {file = "websockets-12.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:248d8e2446e13c1d4326e0a6a4e9629cb13a11195051a73acf414812700badbd"}, + {file = 
"websockets-12.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f44069528d45a933997a6fef143030d8ca8042f0dfaad753e2906398290e2870"}, + {file = "websockets-12.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c4e37d36f0d19f0a4413d3e18c0d03d0c268ada2061868c1e6f5ab1a6d575077"}, + {file = "websockets-12.0-pp310-pypy310_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3d829f975fc2e527a3ef2f9c8f25e553eb7bc779c6665e8e1d52aa22800bb38b"}, + {file = "websockets-12.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:2c71bd45a777433dd9113847af751aae36e448bc6b8c361a566cb043eda6ec30"}, + {file = "websockets-12.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:0bee75f400895aef54157b36ed6d3b308fcab62e5260703add87f44cee9c82a6"}, + {file = "websockets-12.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:423fc1ed29f7512fceb727e2d2aecb952c46aa34895e9ed96071821309951123"}, + {file = "websockets-12.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:27a5e9964ef509016759f2ef3f2c1e13f403725a5e6a1775555994966a66e931"}, + {file = "websockets-12.0-pp38-pypy38_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c3181df4583c4d3994d31fb235dc681d2aaad744fbdbf94c4802485ececdecf2"}, + {file = "websockets-12.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:b067cb952ce8bf40115f6c19f478dc71c5e719b7fbaa511359795dfd9d1a6468"}, + {file = "websockets-12.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:00700340c6c7ab788f176d118775202aadea7602c5cc6be6ae127761c16d6b0b"}, + {file = "websockets-12.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e469d01137942849cff40517c97a30a93ae79917752b34029f0ec72df6b46399"}, + {file = "websockets-12.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffefa1374cd508d633646d51a8e9277763a9b78ae71324183693959cf94635a7"}, + {file = "websockets-12.0-pp39-pypy39_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba0cab91b3956dfa9f512147860783a1829a8d905ee218a9837c18f683239611"}, + {file = "websockets-12.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:2cb388a5bfb56df4d9a406783b7f9dbefb888c09b71629351cc6b036e9259370"}, + {file = "websockets-12.0-py3-none-any.whl", hash = "sha256:dc284bbc8d7c78a6c69e0c7325ab46ee5e40bb4d50e494d8131a07ef47500e9e"}, + {file = "websockets-12.0.tar.gz", hash = "sha256:81df9cbcbb6c260de1e007e58c011bfebe2dafc8435107b0537f393dd38c8b1b"}, +] + [[package]] name = "werkzeug" version = "3.0.1" @@ -2572,16 +2652,6 @@ files = [ {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"}, {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"}, {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"}, - {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"}, - {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = 
"sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"}, - {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"}, - {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"}, @@ -2819,4 +2889,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "af9d5b45310c12411bfe67cb9677d2236808d0780ca1bd81525d2763a928f7f9" +content-hash = "df7161da4fdc3cba0a445176fc9dda2a0e8a53e13a7aa8a864385ca259381b41" diff --git a/pyproject.toml b/pyproject.toml index 6dff112a5e..e347d47cbf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,7 @@ pytest-split = "^0.8.1" zstandard = "^0.21.0" httpx = {extras = ["http2"], version = "^0.26.0"} pytest-repeat = "^0.9.3" +websockets = "^12.0" [tool.poetry.group.dev.dependencies] mypy = "==1.3.0" diff --git a/test_runner/regress/test_proxy_websockets.py b/test_runner/regress/test_proxy_websockets.py new file mode 100644 index 0000000000..6d1cb9765a --- /dev/null +++ b/test_runner/regress/test_proxy_websockets.py @@ -0,0 +1,189 @@ +import ssl + +import pytest +import websockets +from fixtures.neon_fixtures import NeonProxy + + +@pytest.mark.asyncio +async def test_websockets(static_proxy: NeonProxy): + static_proxy.safe_psql("create user ws_auth with password 'ws' superuser") + + user = "ws_auth" + password = "ws" + + version = b"\x00\x03\x00\x00" + params = { + "user": user, + "database": "postgres", + "client_encoding": "UTF8", + } + + ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) + ssl_context.load_verify_locations(str(static_proxy.test_output_dir / "proxy.crt")) + + async with websockets.connect( + f"wss://{static_proxy.domain}:{static_proxy.external_http_port}/sql", + ssl=ssl_context, + ) as websocket: + startup_message = bytearray(version) + for key, value in params.items(): + startup_message.extend(key.encode("ascii")) + startup_message.extend(b"\0") + startup_message.extend(value.encode("ascii")) + startup_message.extend(b"\0") + 
startup_message.extend(b"\0") + length = (4 + len(startup_message)).to_bytes(4, byteorder="big") + + await websocket.send([length, startup_message]) + + startup_response = await websocket.recv() + assert isinstance(startup_response, bytes) + assert startup_response[0:1] == b"R", "should be authentication message" + assert startup_response[1:5] == b"\x00\x00\x00\x08", "should be 8 bytes long message" + assert startup_response[5:9] == b"\x00\x00\x00\x03", "should be cleartext" + + auth_message = password.encode("utf-8") + b"\0" + length = (4 + len(auth_message)).to_bytes(4, byteorder="big") + await websocket.send([b"p", length, auth_message]) + + auth_response = await websocket.recv() + assert isinstance(auth_response, bytes) + assert auth_response[0:1] == b"R", "should be authentication message" + assert auth_response[1:5] == b"\x00\x00\x00\x08", "should be 8 bytes long message" + assert auth_response[5:9] == b"\x00\x00\x00\x00", "should be authenticated" + + query_message = "SELECT 1".encode("utf-8") + b"\0" + length = (4 + len(query_message)).to_bytes(4, byteorder="big") + await websocket.send([b"Q", length, query_message]) + + query_response = await websocket.recv() + assert isinstance(query_response, bytes) + # 'T\x00\x00\x00!\x00\x01?column?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x17\x00\x04\xff\xff\xff\xff\x00\x00' + # 'D\x00\x00\x00\x0b\x00\x01\x00\x00\x00\x011' + # 'C\x00\x00\x00\rSELECT 1\x00' + # 'Z\x00\x00\x00\x05I' + + assert query_response[0:1] == b"T", "should be row description message" + row_description_len = int.from_bytes(query_response[1:5], byteorder="big") + 1 + row_description, query_response = ( + query_response[:row_description_len], + query_response[row_description_len:], + ) + assert row_description[5:7] == b"\x00\x01", "should have 1 column" + assert row_description[7:16] == b"?column?\0", "column should be named ?column?" 
+ assert row_description[22:26] == b"\x00\x00\x00\x17", "column should be an int4" + + assert query_response[0:1] == b"D", "should be data row message" + data_row_len = int.from_bytes(query_response[1:5], byteorder="big") + 1 + data_row, query_response = query_response[:data_row_len], query_response[data_row_len:] + assert ( + data_row == b"D\x00\x00\x00\x0b\x00\x01\x00\x00\x00\x011" + ), "should contain 1 column with text value 1" + + assert query_response[0:1] == b"C", "should be command complete message" + command_complete_len = int.from_bytes(query_response[1:5], byteorder="big") + 1 + command_complete, query_response = ( + query_response[:command_complete_len], + query_response[command_complete_len:], + ) + assert command_complete == b"C\x00\x00\x00\x0dSELECT 1\0" + + assert query_response[0:6] == b"Z\x00\x00\x00\x05I", "should be ready for query (idle)" + + # close + await websocket.send(b"X\x00\x00\x00\x04") + await websocket.wait_closed() + + +@pytest.mark.asyncio +async def test_websockets_pipelined(static_proxy: NeonProxy): + """ + Test whether we can send the startup + auth + query all in one go + """ + + static_proxy.safe_psql("create user ws_auth with password 'ws' superuser") + + user = "ws_auth" + password = "ws" + + version = b"\x00\x03\x00\x00" + params = { + "user": user, + "database": "postgres", + "client_encoding": "UTF8", + } + + ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) + ssl_context.load_verify_locations(str(static_proxy.test_output_dir / "proxy.crt")) + + async with websockets.connect( + f"wss://{static_proxy.domain}:{static_proxy.external_http_port}/sql", + ssl=ssl_context, + ) as websocket: + startup_message = bytearray(version) + for key, value in params.items(): + startup_message.extend(key.encode("ascii")) + startup_message.extend(b"\0") + startup_message.extend(value.encode("ascii")) + startup_message.extend(b"\0") + startup_message.extend(b"\0") + length0 = (4 + len(startup_message)).to_bytes(4, byteorder="big") + + auth_message = password.encode("utf-8") + b"\0" + length1 = (4 + len(auth_message)).to_bytes(4, byteorder="big") + query_message = "SELECT 1".encode("utf-8") + b"\0" + length2 = (4 + len(query_message)).to_bytes(4, byteorder="big") + await websocket.send( + [length0, startup_message, b"p", length1, auth_message, b"Q", length2, query_message] + ) + + startup_response = await websocket.recv() + assert isinstance(startup_response, bytes) + assert startup_response[0:1] == b"R", "should be authentication message" + assert startup_response[1:5] == b"\x00\x00\x00\x08", "should be 8 bytes long message" + assert startup_response[5:9] == b"\x00\x00\x00\x03", "should be cleartext" + + auth_response = await websocket.recv() + assert isinstance(auth_response, bytes) + assert auth_response[0:1] == b"R", "should be authentication message" + assert auth_response[1:5] == b"\x00\x00\x00\x08", "should be 8 bytes long message" + assert auth_response[5:9] == b"\x00\x00\x00\x00", "should be authenticated" + + query_response = await websocket.recv() + assert isinstance(query_response, bytes) + # 'T\x00\x00\x00!\x00\x01?column?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x17\x00\x04\xff\xff\xff\xff\x00\x00' + # 'D\x00\x00\x00\x0b\x00\x01\x00\x00\x00\x011' + # 'C\x00\x00\x00\rSELECT 1\x00' + # 'Z\x00\x00\x00\x05I' + + assert query_response[0:1] == b"T", "should be row description message" + row_description_len = int.from_bytes(query_response[1:5], byteorder="big") + 1 + row_description, query_response = ( + query_response[:row_description_len], + 
query_response[row_description_len:], + ) + assert row_description[5:7] == b"\x00\x01", "should have 1 column" + assert row_description[7:16] == b"?column?\0", "column should be named ?column?" + assert row_description[22:26] == b"\x00\x00\x00\x17", "column should be an int4" + + assert query_response[0:1] == b"D", "should be data row message" + data_row_len = int.from_bytes(query_response[1:5], byteorder="big") + 1 + data_row, query_response = query_response[:data_row_len], query_response[data_row_len:] + assert ( + data_row == b"D\x00\x00\x00\x0b\x00\x01\x00\x00\x00\x011" + ), "should contain 1 column with text value 1" + + assert query_response[0:1] == b"C", "should be command complete message" + command_complete_len = int.from_bytes(query_response[1:5], byteorder="big") + 1 + command_complete, query_response = ( + query_response[:command_complete_len], + query_response[command_complete_len:], + ) + assert command_complete == b"C\x00\x00\x00\x0dSELECT 1\0" + + assert query_response[0:6] == b"Z\x00\x00\x00\x05I", "should be ready for query (idle)" + + # close + await websocket.send(b"X\x00\x00\x00\x04") + await websocket.wait_closed() From 46098ea0ea8b0a1fa2f49df76229291bdbae43b0 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Fri, 15 Mar 2024 16:13:15 +0500 Subject: [PATCH 0407/1571] proxy: add more missing warm logging (#7133) ## Problem There is one more missing thing about cached connections for `cold_start_info`. ## Summary of changes Fix and add comments. --- proxy/src/console/provider/neon.rs | 3 +++ proxy/src/serverless/backend.rs | 1 + 2 files changed, 4 insertions(+) diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 3b2e0cc204..b36663518d 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -250,6 +250,8 @@ impl super::Api for Api { // which means that we might cache it to reduce the load and latency. if let Some(cached) = self.caches.node_info.get(&key) { info!(key = &*key, "found cached compute node info"); + info!("cold_start_info=warm"); + ctx.set_cold_start_info(ColdStartInfo::Warm); return Ok(cached); } @@ -260,6 +262,7 @@ impl super::Api for Api { if permit.should_check_cache() { if let Some(cached) = self.caches.node_info.get(&key) { info!(key = &*key, "found cached compute node info"); + info!("cold_start_info=warm"); ctx.set_cold_start_info(ColdStartInfo::Warm); return Ok(cached); } diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 9b3ca8d447..29ef641265 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -84,6 +84,7 @@ impl PoolingBackend { }; if let Some(client) = maybe_client { + info!("cold_start_info=warm"); ctx.set_cold_start_info(ColdStartInfo::Warm); return Ok(client); } From 23416cc3580ecb2d05333e3fc9431aa27cc663a7 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 15 Mar 2024 11:14:25 +0000 Subject: [PATCH 0408/1571] docs: sharding phase 1 RFC (#5432) We need to shard our Tenants to support larger databases without those large databases dominating our pageservers and/or requiring dedicated pageservers. This RFC aims to define an initial capability that will permit creating large-capacity databases using a static configuration defined at time of Tenant creation. Online re-sharding is deferred as future work, as is offloading layers for historical reads. 
However, both of these capabilities would be implementable without further changes to the control plane or compute: this RFC aims to define the cross-component work needed to bootstrap sharding end-to-end. --- docs/rfcs/031-sharding-static.md | 408 +++++++++++++++++++++++++++++++ 1 file changed, 408 insertions(+) create mode 100644 docs/rfcs/031-sharding-static.md diff --git a/docs/rfcs/031-sharding-static.md b/docs/rfcs/031-sharding-static.md new file mode 100644 index 0000000000..fe009b8660 --- /dev/null +++ b/docs/rfcs/031-sharding-static.md @@ -0,0 +1,408 @@ +# Sharding Phase 1: Static Key-space Sharding + +## Summary + +To enable databases with sizes approaching the capacity of a pageserver's disk, +it is necessary to break up the storage for the database, or _shard_ it. + +Sharding in general is a complex area. This RFC aims to define an initial +capability that will permit creating large-capacity databases using a static configuration +defined at time of Tenant creation. + +## Motivation + +Currently, all data for a Tenant, including all its timelines, is stored on a single +pageserver. The local storage required may be several times larger than the actual +database size, due to LSM write inflation. + +If a database is larger than what one pageserver can hold, then it becomes impossible +for the pageserver to hold it in local storage, as it must do to provide service to +clients. + +### Prior art + +In Neon: + +- Layer File Spreading: https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Konstantin-21fd9b11b618475da5f39c61dd8ab7a4 +- Layer File SPreading: https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Christian-eb6b64182a214e11b3fceceee688d843 +- Key Space partitioning: https://www.notion.so/neondatabase/One-Pager-Key-Space-Partitioning-Stas-8e3a28a600a04a25a68523f42a170677 + +Prior art in other distributed systems is too broad to capture here: pretty much +any scale out storage system does something like this. + +## Requirements + +- Enable creating a large (for example, 16TiB) database without requiring dedicated + pageserver nodes. +- Share read/write bandwidth costs for large databases across pageservers, as well + as storage capacity, in order to avoid large capacity databases acting as I/O hotspots + that disrupt service to other tenants. +- Our data distribution scheme should handle sparse/nonuniform keys well, since postgres + does not write out a single contiguous ranges of page numbers. + +_Note: the definition of 'large database' is arbitrary, but the lower bound is to ensure that a database +that a user might create on a current-gen enterprise SSD should also work well on +Neon. The upper bound is whatever postgres can handle: i.e. we must make sure that the +pageserver backend is not the limiting factor in the database size_. + +## Non Goals + +- Independently distributing timelines within the same tenant. If a tenant has many + timelines, then sharding may be a less efficient mechanism for distributing load than + sharing out timelines between pageservers. +- Distributing work in the LSN dimension: this RFC focuses on the Key dimension only, + based on the idea that separate mechanisms will make sense for each dimension. + +## Impacted Components + +pageserver, control plane, postgres/smgr + +## Terminology + +**Key**: a postgres page number, qualified by relation. In the sense that the pageserver is a versioned key-value store, +the page number is the key in that store. `Key` is a literal data type in existing code. 
+ +**LSN dimension**: this just means the range of LSNs (history), when talking about the range +of keys and LSNs as a two dimensional space. + +## Implementation + +### Key sharding vs. LSN sharding + +When we think of sharding across the two dimensional key/lsn space, this is an +opportunity to think about how the two dimensions differ: + +- Sharding the key space distributes the _write_ workload of ingesting data + and compacting. This work must be carefully managed so that exactly one + node owns a given key. +- Sharding the LSN space distributes the _historical read_ workload. This work + can be done by anyone without any special coordination, as long as they can + see the remote index and layers. + +The key sharding is the harder part, and also the more urgent one, to support larger +capacity databases. Because distributing historical LSN read work is a relatively +simpler problem that most users don't have, we defer it to future work. It is anticipated +that some quite simple P2P offload model will enable distributing work for historical +reads: a node which is low on space can call out to peer to ask it to download and +serve reads from a historical layer. + +### Key mapping scheme + +Having decided to focus on key sharding, we must next decide how we will map +keys to shards. It is proposed to use a "wide striping" approach, to obtain a good compromise +between data locality and avoiding entire large relations mapping to the same shard. + +We will define two spaces: + +- Key space: unsigned integer +- Shard space: integer from 0 to N-1, where we have N shards. + +### Key -> Shard mapping + +Keys are currently defined in the pageserver's getpage@lsn interface as follows: + +``` +pub struct Key { + pub field1: u8, + pub field2: u32, + pub field3: u32, + pub field4: u32, + pub field5: u8, + pub field6: u32, +} + + +fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key { + Key { + field1: 0x00, + field2: rel.spcnode, + field3: rel.dbnode, + field4: rel.relnode, + field5: rel.forknum, + field6: blknum, + } +} +``` + +_Note: keys for relation metadata are ignored here, as this data will be mirrored to all +shards. For distribution purposes, we only care about user data keys_ + +The properties we want from our Key->Shard mapping are: + +- Locality in `blknum`, such that adjacent `blknum` will usually map to + the same stripe and consequently land on the same shard, even though the overall + collection of blocks in a relation will be spread over many stripes and therefore + many shards. +- Avoid the same blknum on different relations landing on the same stripe, so that + with many small relations we do not end up aliasing data to the same stripe/shard. +- Avoid vulnerability to aliasing in the values of relation identity fields, such that + if there are patterns in the value of `relnode`, these do not manifest as patterns + in data placement. + +To accomplish this, the blknum is used to select a stripe, and stripes are +assigned to shards in a pseudorandom order via a hash. The motivation for +pseudo-random distribution (rather than sequential mapping of stripe to shard) +is to avoid I/O hotspots when sequentially reading multiple relations: we don't want +all relations' stripes to touch pageservers in the same order. + +To map a `Key` to a shard: + +- Hash the `Key` field 4 (relNode). +- Divide field 6 (`blknum`) field by the stripe size in pages, and combine the + hash of this with the hash from the previous step. 
+- The total hash modulo the shard count gives the shard holding this key (see the code sketch further below).
+
+Why don't we use the other fields in the Key?
+
+- We ignore `forknum` for key mapping, because it distinguishes different classes of data
+  in the same relation, and we would like to keep the data in a relation together.
+- We would like to use spcNode and dbNode, but cannot. Postgres database creation operations can refer to an existing database as a template, such that the created
+  database's blocks differ only by spcNode and dbNode from the original. To enable running
+  this type of creation without cross-pageserver communication, we must ensure that these
+  blocks map to the same shard -- we do this by excluding spcNode and dbNode from the hash.
+
+### Data placement examples
+
+For example, consider these extreme cases of postgres data layout for large databases, in a system with 8 shards
+and a stripe size of 32k pages:
+
+- A single large relation: `blknum` division will break the data up into 4096
+  stripes, which will be scattered across the shards.
+- 4096 relations of 32k pages each: each relation will map to exactly one stripe,
+  and that stripe will be placed according to the hash of key field 4. The
+  data placement will be statistically uniform across shards.
+
+Data placement will be more uneven on smaller databases:
+
+- A tenant with 2 shards and 2 relations of one stripe size each: there is a 50% chance
+  that both relations land on the same shard and no data lands on the other shard.
+- A tenant with 8 shards and one relation of size 12 stripes: 4 shards will have double
+  the data of the other four shards.
+
+These uneven cases for small amounts of data do not matter, as long as the stripe size
+is an order of magnitude smaller than the amount of data we are comfortable holding
+in a single shard: if our system handles shard sizes up to 10-100GB, then it is not an issue if
+a tenant has some shards with 256MB size and some shards with 512MB size, even though
+the standard deviation of shard size within the tenant is very high. Our key mapping
+scheme provides a statistical guarantee that as the tenant's overall data size increases,
+uniformity of placement will improve.
+
+### Important Types
+
+#### `ShardIdentity`
+
+Provides the information needed to know whether a particular key belongs
+to a particular shard:
+
+- Layout version
+- Stripe size
+- Shard count
+- Shard index
+
+This structure's size is constant. Note that if we had used a different key
+mapping scheme such as consistent hashing with explicit hash ranges assigned
+to each shard, then the ShardIdentity's size would grow with the shard count: the simpler
+key mapping scheme used here enables a small, fixed-size ShardIdentity.
+
+### Pageserver changes
+
+#### Structural
+
+Everywhere the Pageserver currently deals with Tenants, it will move to dealing with
+`TenantShard`s, which are just a `Tenant` plus a `ShardIdentity` telling it which part
+of the keyspace it owns. An un-sharded tenant is just a `TenantShard` whose `ShardIdentity`
+covers the whole keyspace.
+
+When the pageserver writes layers and index_part.json to remote storage, it must
+include the shard index & count in the name, to avoid collisions (the count is
+necessary for future-proofing: the count will vary over time). These keys
+will also include a generation number: the [generation numbers](025-generation-numbers.md) system will work
+exactly the same for TenantShards as it does for Tenants today: each shard will have
+its own generation number.
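+
+Before moving on to storage formats, here is a rough sketch of the Key -> Shard mapping
+described earlier. This is illustrative only, not the pageserver's actual implementation:
+the function names, signatures and the choice of hash function are placeholders.
+
+```
+use std::collections::hash_map::DefaultHasher;
+use std::hash::{Hash, Hasher};
+
+// stripe_size is expressed in pages; shard_count and shard_index are as in ShardIdentity.
+fn key_to_shard(relnode: u32, blknum: u32, stripe_size: u32, shard_count: u32) -> u32 {
+    let mut hasher = DefaultHasher::new();
+    // Hash the relation identity (field 4 / relnode). spcnode and dbnode are deliberately
+    // excluded, and forknum is ignored, per the rules above.
+    relnode.hash(&mut hasher);
+    // Hash the stripe index: blknum divided by the stripe size in pages.
+    (blknum / stripe_size).hash(&mut hasher);
+    // The combined hash modulo the shard count selects the shard.
+    (hasher.finish() % shard_count as u64) as u32
+}
+
+// A shard answers "is this key mine?" by comparing the mapping result to its own index.
+fn is_key_local(relnode: u32, blknum: u32, stripe_size: u32, shard_count: u32, shard_index: u32) -> bool {
+    key_to_shard(relnode, blknum, stripe_size, shard_count) == shard_index
+}
+```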
+ +#### Storage Format: Keys + +For tenants with >1 shard, layer files implicitly become sparse: within the key +range described in the layer name, the layer file for a shard will only hold the +content relevant to stripes assigned to the shard. + +For this reason, the LayerFileName within a tenant is no longer unique: different shards +may use the same LayerFileName to refer to different data. We may solve this simply +by including the shard number in the keys used for layers. + +The shard number will be included as a prefix (as part of tenant ID), like this: + +`pageserver/v1/tenants/-/timelines//-` + +`pageserver/v1/tenants/-/timelines//index_part.json-` + +Reasons for this particular format: + +- Use of a prefix is convenient for implementation (no need to carry the shard ID everywhere + we construct a layer file name), and enables efficient listing of index_parts within + a particular shard-timeline prefix. +- Including the shard _count_ as well as shard number means that in future when we implement + shard splitting, it will be possible for a parent shard and one of its children to write + the same layer file without a name collision. For example, a parent shard 0_1 might split + into two (0_2, 1_2), and in the process of splitting shard 0_2 could write a layer or index_part + that is distinct from what shard 0_1 would have written at the same place. + +In practice, we expect shard counts to be relatively small, so a `u8` will be sufficient, +and therefore the shard part of the path can be a fixed-length hex string like `{:02X}{:02X}`, +for example a single-shard tenant's prefix will be `0001`. + +For backward compatibility, we may define a special `ShardIdentity` that has shard_count==0, +and use this as a cue to construct paths with no prefix at all. + +#### Storage Format: Indices + +In the phase 1 described in this RFC, shards only reference layers they write themselves. However, +when we implement shard splitting in future, it will be useful to enable shards to reference layers +written by other shards (specifically the parent shard during a split), so that shards don't +have to exhaustively copy all data into their own shard-prefixed keys. + +To enable this, the `IndexPart` structure will be extended to store the (shard number, shard count) +tuple on each layer, such that it can construct paths for layers written by other shards. This +naturally raises the question of who "owns" such layers written by ancestral shards: this problem +will be addressed in phase 2. + +For backward compatibility, any index entry without shard information will be assumed to be +in the legacy shardidentity. + +#### WAL Ingest + +In Phase 1, all shards will subscribe to the safekeeper to download WAL content. They will filter +it down to the pages relevant to their shard: + +- For ordinary user data writes, only retain a write if it matches the ShardIdentity +- For metadata describing relations etc, all shards retain these writes. + +The pageservers must somehow give the safekeeper correct feedback on remote_consistent_lsn: +one solution here is for the 0th shard to periodically peek at the IndexParts for all the other shards, +and have only the 0th shard populate remote_consistent_lsn. However, this is relatively +expensive: if the safekeeper can be made shard-aware then it could be taught to use +the max() of all shards' remote_consistent_lsns to decide when to trim the WAL. + +#### Compaction/GC + +No changes needed. + +The pageserver doesn't have to do anything special during compaction +or GC. 
It is implicitly operating on the subset of keys that map to its ShardIdentity.
+This will result in sparse layer files, containing keys only in the stripes that this
+shard owns. Where optimizations currently exist in compaction for spotting "gaps" in
+the key range, these should be updated to ignore gaps that are due to sharding, to
+avoid spuriously splitting up layers into stripe-sized pieces.
+
+### Compute Endpoints
+
+Compute endpoints will need to:
+
+- Accept a vector of connection strings as part of their configuration from the control plane
+- Route pageserver requests by mapping each key to a shard (as described above) and using
+  that to pick the correct entry in the vector of connection strings.
+
+Doing this in compute rather than routing requests via a single pageserver is
+necessary to enable sharding tenants without adding latency from extra hops.
+
+### Control Plane
+
+Tenants, or _Projects_ in the control plane, will each own a set of TenantShards (a set of
+size 1 for small tenants). Logic for placement of tenant shards is just the same as the current logic for placing
+tenants.
+
+Tenant lifecycle operations like deletion will require fanning-out to all the shards
+in the tenant. The same goes for timeline creation and deletion: a timeline should
+not be considered created until it has been created in all shards.
+
+#### Selectively enabling sharding for large tenants
+
+Initially, we will explicitly enable sharding for large tenants only.
+
+In future, this explicit enablement hint will become optional when we implement automatic
+re-sharding of tenants.
+
+## Future Phases
+
+This section exists to indicate what will likely come next after this phase.
+
+Phases 2a and 2b are amenable to execution in parallel.
+
+### Phase 2a: WAL fan-out
+
+**Problem**: when all shards consume the whole WAL, the network bandwidth used
+for transmitting the WAL from safekeeper to pageservers is multiplied by a factor
+of the shard count.
+
+Network bandwidth is not our most pressing bottleneck, but it is likely to become
+a problem if we set a modest shard count (~8) on a significant number of tenants,
+especially as those larger tenants which we shard are also likely to have higher
+write bandwidth than average.
+
+### Phase 2b: Shard Splitting
+
+**Problem**: the number of shards in a tenant is defined at creation time and cannot
+be changed. This causes excessive sharding for most small tenants, and an upper
+bound on scale for very large tenants.
+
+To address this, a _splitting_ feature will later be added. One shard can split its
+data into a number of children by doing a special compaction operation to generate
+image layers broken up child-shard-wise, and then writing out an `index_part.json` for
+each child. This will then require external coordination (by the control plane) to
+safely attach these new child shards and then move them around to distribute work.
+The opposite _merging_ operation can also be imagined, but is unlikely to be implemented:
+once a Tenant has been sharded, the marginal efficiency benefit of merging is unlikely to justify
+the risk/complexity of implementing such a rarely-encountered scenario.
+
+### Phase N (future): distributed historical reads
+
+**Problem**: while sharding based on key is good for handling changes in overall
+database size, it is less suitable for spiky/unpredictable changes in the read
+workload to historical layers. Sudden increases in historical reads could result
+in sudden increases in local disk capacity required for a TenantShard.
+
+Example: the extreme case of this would be to run a tenant for a year, then create branches
+with ancestors at monthly intervals. This could lead to a sudden 12x inflation in
+the on-disk capacity footprint of a TenantShard, since it would be serving reads
+from all those disparate historical layers.
+
+If we can respond fast enough, then key-sharding a tenant more finely can help with
+this, but splitting may be a relatively expensive operation and the increased historical
+read load may be transient.
+
+A separate mechanism for handling heavy historical reads could be something like
+a gossip mechanism for pageservers to communicate
+about their workload, and then a getpageatlsn offload mechanism where one pageserver can
+ask another to go read the necessary layers from remote storage to serve the read. This
+requires relatively little coordination because it is read-only: any node can service any
+read. All reads to a particular shard would still flow through one node, but the
+disk capacity & I/O impact of servicing the read would be distributed.
+
+## FAQ/Alternatives
+
+### Why stripe the data, rather than using contiguous ranges of keyspace for each shard?
+
+When a database is growing under a write workload, writes may predominantly hit the
+end of the keyspace, creating a bandwidth hotspot on that shard. Similarly, if the user
+is intensively re-writing a particular relation and that relation lived on a single
+shard, we would not achieve our goal of distributing the write work across shards.
+
+### Why not proxy read requests through one pageserver, so that endpoints don't have to change?
+
+1. This would not achieve scale-out of network bandwidth: a busy tenant with a large
+   database would still cause a load hotspot on the pageserver routing its read requests.
+2. The additional hop through the "proxy" pageserver would add latency and overall
+   resource cost (CPU, network bandwidth).
+
+### Layer File Spreading: use one pageserver as the owner of a tenant, and have it spread out work on a per-layer basis to peers
+
+In this model, there would be no explicit sharding of work, but the pageserver to which
+a tenant is attached would not hold all layers on its disk: instead, it would call out
+to peers to have them store some layers, and call out to those peers to request reads
+in those layers.
+
+This mechanism will work well for distributing work in the LSN dimension, but in the key
+space dimension it has the major limitation of requiring one node to handle all
+incoming writes, and compactions. Even if the write workload for a large database
+fits in one pageserver, it will still be a hotspot and such tenants may still
+de-facto require their own pageserver.

From 6443dbef90d121ad44911962d3991fec5d31b873 Mon Sep 17 00:00:00 2001
From: John Spray
Date: Fri, 15 Mar 2024 13:18:12 +0000
Subject: [PATCH 0409/1571] tests: extend log allow list for
 test_sharding_split_failures (#7134)

Failure types that panic the storage controller can cause unlucky
pageservers to emit log warnings that they can't reach the generation
validation API:
https://neon-github-public-dev.s3.amazonaws.com/reports/main/8284495687/index.html

Tolerate this log message: it's an expected behavior.
--- test_runner/regress/test_sharding.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index bdb9990a51..e8511e428e 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -729,6 +729,10 @@ def test_sharding_split_failures( # thereby be unable to publish remote consistent LSN updates ps.allowed_errors.append(".*Dropped remote consistent LSN updates.*") + # If we're using a failure that will panic the storage controller, all background + # upcalls from the pageserver can fail + ps.allowed_errors.append(".*calling control plane generation validation API failed.*") + # Make sure the node we're failing has a shard on it, otherwise the test isn't testing anything assert ( failure.pageserver_id is None From 516f793ab4c15e840f7990ca84fe00f13b1382d5 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 15 Mar 2024 13:37:49 +0000 Subject: [PATCH 0410/1571] remote_storage: make last_modified and etag mandatory (#7126) ## Problem These fields were only optional for the convenience of the `local_fs` test helper -- real remote storage backends provide them. It complicated any code that actually wanted to use them for anything. ## Summary of changes - Make these fields non-optional - For azure/S3 it is an error if the server doesn't provide them - For local_fs, use random strings as etags and the file's mtime for last_modified. --- libs/remote_storage/Cargo.toml | 1 + libs/remote_storage/src/azure_blob.rs | 9 ++ libs/remote_storage/src/lib.rs | 4 +- libs/remote_storage/src/local_fs.rs | 162 ++++++++++++---------- libs/remote_storage/src/s3_bucket.rs | 14 +- libs/remote_storage/tests/test_real_s3.rs | 2 +- 6 files changed, 111 insertions(+), 81 deletions(-) diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 15f3cd3b80..4a53f485ca 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -18,6 +18,7 @@ camino.workspace = true humantime.workspace = true hyper = { workspace = true, features = ["stream"] } futures.workspace = true +rand.workspace = true serde.workspace = true serde_json.workspace = true tokio = { workspace = true, features = ["sync", "fs", "io-util"] } diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 12ec680cb6..1e337bc1e8 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -174,6 +174,15 @@ impl AzureBlobStorage { .map_err(|e| DownloadError::Other(e.into()))?; bufs.push(data); } + + if bufs.is_empty() { + return Err(DownloadError::Other(anyhow::anyhow!( + "Azure GET response contained no buffers" + ))); + } + let etag = etag.unwrap(); + let last_modified = last_modified.unwrap(); + Ok(Download { download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))), etag, diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index b0b69f9155..fd832eb94f 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -291,9 +291,9 @@ pub type DownloadStream = pub struct Download { pub download_stream: DownloadStream, /// The last time the file was modified (`last-modified` HTTP header) - pub last_modified: Option, + pub last_modified: SystemTime, /// A way to identify this specific version of the resource (`etag` HTTP header) - pub etag: Option, + pub etag: String, /// Extra key-value data, associated with the current remote file. 
pub metadata: Option, } diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 478ad81dc1..ea0756541b 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -10,7 +10,7 @@ use std::{ io::ErrorKind, num::NonZeroU32, pin::Pin, - time::{Duration, SystemTime}, + time::{Duration, SystemTime, UNIX_EPOCH}, }; use anyhow::{bail, ensure, Context}; @@ -406,35 +406,37 @@ impl RemoteStorage for LocalFs { cancel: &CancellationToken, ) -> Result { let target_path = from.with_base(&self.storage_root); - if file_exists(&target_path).map_err(DownloadError::BadInput)? { - let source = ReaderStream::new( - fs::OpenOptions::new() - .read(true) - .open(&target_path) - .await - .with_context(|| { - format!("Failed to open source file {target_path:?} to use in the download") - }) - .map_err(DownloadError::Other)?, - ); - let metadata = self - .read_storage_metadata(&target_path) + let file_metadata = file_metadata(&target_path).await?; + + let source = ReaderStream::new( + fs::OpenOptions::new() + .read(true) + .open(&target_path) .await - .map_err(DownloadError::Other)?; + .with_context(|| { + format!("Failed to open source file {target_path:?} to use in the download") + }) + .map_err(DownloadError::Other)?, + ); - let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone()); - let source = crate::support::DownloadStream::new(cancel_or_timeout, source); + let metadata = self + .read_storage_metadata(&target_path) + .await + .map_err(DownloadError::Other)?; - Ok(Download { - metadata, - last_modified: None, - etag: None, - download_stream: Box::pin(source), - }) - } else { - Err(DownloadError::NotFound) - } + let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone()); + let source = crate::support::DownloadStream::new(cancel_or_timeout, source); + + let etag = mock_etag(&file_metadata); + Ok(Download { + metadata, + last_modified: file_metadata + .modified() + .map_err(|e| DownloadError::Other(anyhow::anyhow!(e).context("Reading mtime")))?, + etag, + download_stream: Box::pin(source), + }) } async fn download_byte_range( @@ -452,50 +454,51 @@ impl RemoteStorage for LocalFs { return Err(DownloadError::Other(anyhow::anyhow!("Invalid range, start ({start_inclusive}) and end_exclusive ({end_exclusive:?}) difference is zero bytes"))); } } + let target_path = from.with_base(&self.storage_root); - if file_exists(&target_path).map_err(DownloadError::BadInput)? { - let mut source = tokio::fs::OpenOptions::new() - .read(true) - .open(&target_path) - .await - .with_context(|| { - format!("Failed to open source file {target_path:?} to use in the download") - }) - .map_err(DownloadError::Other)?; - - let len = source - .metadata() - .await - .context("query file length") - .map_err(DownloadError::Other)? 
- .len(); - - source - .seek(io::SeekFrom::Start(start_inclusive)) - .await - .context("Failed to seek to the range start in a local storage file") - .map_err(DownloadError::Other)?; - - let metadata = self - .read_storage_metadata(&target_path) - .await - .map_err(DownloadError::Other)?; - - let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive); - let source = ReaderStream::new(source); - - let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone()); - let source = crate::support::DownloadStream::new(cancel_or_timeout, source); - - Ok(Download { - metadata, - last_modified: None, - etag: None, - download_stream: Box::pin(source), + let file_metadata = file_metadata(&target_path).await?; + let mut source = tokio::fs::OpenOptions::new() + .read(true) + .open(&target_path) + .await + .with_context(|| { + format!("Failed to open source file {target_path:?} to use in the download") }) - } else { - Err(DownloadError::NotFound) - } + .map_err(DownloadError::Other)?; + + let len = source + .metadata() + .await + .context("query file length") + .map_err(DownloadError::Other)? + .len(); + + source + .seek(io::SeekFrom::Start(start_inclusive)) + .await + .context("Failed to seek to the range start in a local storage file") + .map_err(DownloadError::Other)?; + + let metadata = self + .read_storage_metadata(&target_path) + .await + .map_err(DownloadError::Other)?; + + let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive); + let source = ReaderStream::new(source); + + let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone()); + let source = crate::support::DownloadStream::new(cancel_or_timeout, source); + + let etag = mock_etag(&file_metadata); + Ok(Download { + metadata, + last_modified: file_metadata + .modified() + .map_err(|e| DownloadError::Other(anyhow::anyhow!(e).context("Reading mtime")))?, + etag, + download_stream: Box::pin(source), + }) } async fn delete(&self, path: &RemotePath, _cancel: &CancellationToken) -> anyhow::Result<()> { @@ -610,13 +613,22 @@ async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result< Ok(()) } -fn file_exists(file_path: &Utf8Path) -> anyhow::Result { - if file_path.exists() { - ensure!(file_path.is_file(), "file path '{file_path}' is not a file"); - Ok(true) - } else { - Ok(false) - } +async fn file_metadata(file_path: &Utf8Path) -> Result { + tokio::fs::metadata(&file_path).await.map_err(|e| { + if e.kind() == ErrorKind::NotFound { + DownloadError::NotFound + } else { + DownloadError::BadInput(e.into()) + } + }) +} + +// Use mtime as stand-in for ETag. We could calculate a meaningful one by md5'ing the contents of files we +// read, but that's expensive and the local_fs test helper's whole reason for existence is to run small tests +// quickly, with less overhead than using a mock S3 server. 
+fn mock_etag(meta: &std::fs::Metadata) -> String { + let mtime = meta.modified().expect("Filesystem mtime missing"); + format!("{}", mtime.duration_since(UNIX_EPOCH).unwrap().as_millis()) } #[cfg(test)] diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 438f45fbde..56bc32ebdd 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -35,8 +35,8 @@ use aws_sdk_s3::{ }; use aws_smithy_async::rt::sleep::TokioSleep; -use aws_smithy_types::byte_stream::ByteStream; use aws_smithy_types::{body::SdkBody, DateTime}; +use aws_smithy_types::{byte_stream::ByteStream, date_time::ConversionError}; use bytes::Bytes; use futures::stream::Stream; use hyper::Body; @@ -287,8 +287,16 @@ impl S3Bucket { let remaining = self.timeout.saturating_sub(started_at.elapsed()); let metadata = object_output.metadata().cloned().map(StorageMetadata); - let etag = object_output.e_tag; - let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok()); + let etag = object_output + .e_tag + .ok_or(DownloadError::Other(anyhow::anyhow!("Missing ETag header")))?; + let last_modified = object_output + .last_modified + .ok_or(DownloadError::Other(anyhow::anyhow!( + "Missing LastModified header" + )))? + .try_into() + .map_err(|e: ConversionError| DownloadError::Other(e.into()))?; let body = object_output.body; let body = ByteStreamAsStream::from(body); diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index d8b9824d99..bc5e40e70f 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -118,7 +118,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: // A little check to ensure that our clock is not too far off from the S3 clock { let dl = retry(|| ctx.client.download(&path2, &cancel)).await?; - let last_modified = dl.last_modified.unwrap(); + let last_modified = dl.last_modified; let half_wt = WAIT_TIME.mul_f32(0.5); let t0_hwt = t0 + half_wt; let t1_hwt = t1 - half_wt; From 22c26d610b0a0cd4c7d714c25b8db1d516e3cc7f Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 15 Mar 2024 15:23:05 +0000 Subject: [PATCH 0411/1571] pageserver: remove un-needed "uninit mark" (#5717) Switched the order; doing https://github.com/neondatabase/neon/pull/6139 first then can remove uninit marker after. ## Problem Previously, existence of a timeline directory was treated as evidence of the timeline's logical existence. That is no longer the case since we treat remote storage as the source of truth on each startup: we can therefore do without this mark file. The mark file had also been used as a pseudo-lock to guard against concurrent creations of the same TimelineId -- now that persistence is no longer required, this is a bit unwieldy. In #6139 the `Tenant::timelines_creating` was added to protect against concurrent creations on the same TimelineId, making the uninit mark file entirely redundant. ## Summary of changes - Code that writes & reads mark file is removed - Some nearby `pub` definitions are amended to `pub(crate)` - `test_duplicate_creation` is added to demonstrate that mutual exclusion of creations still works. 
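
The in-memory exclusion that replaces the uninit mark can be pictured with a small sketch.
This is an illustration of the guard pattern only (names and types here are made up for the
example, not the actual `TimelineCreateGuard`): an ID is reserved in a shared set for as long
as the guard is alive, and released on drop, so a concurrent creation attempt for the same ID
fails fast while a later retry can proceed.

```
use std::collections::HashSet;
use std::sync::{Arc, Mutex};

struct CreateGuard {
    creating: Arc<Mutex<HashSet<u64>>>,
    id: u64,
}

impl CreateGuard {
    // Reserve `id`; fails if another creation of the same id is already in flight.
    fn new(creating: Arc<Mutex<HashSet<u64>>>, id: u64) -> Result<Self, &'static str> {
        if !creating.lock().unwrap().insert(id) {
            return Err("creation of this ID is already in progress");
        }
        Ok(Self { creating, id })
    }
}

impl Drop for CreateGuard {
    fn drop(&mut self) {
        // Release the reservation so later creation attempts can proceed.
        self.creating.lock().unwrap().remove(&self.id);
    }
}
```

The real `TimelineCreateGuard` additionally checks the tenant's timelines map, so that an
already-existing timeline is reported as a conflict rather than re-created.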
--- pageserver/src/config.rs | 20 +-- pageserver/src/http/routes.rs | 6 +- pageserver/src/lib.rs | 16 +-- pageserver/src/tenant.rs | 122 +++++++----------- pageserver/src/tenant/timeline/uninit.rs | 98 ++++---------- .../fixtures/pageserver/allowed_errors.py | 1 - test_runner/fixtures/pageserver/http.py | 2 +- test_runner/regress/test_branching.py | 58 +++++++++ test_runner/regress/test_broken_timeline.py | 8 +- test_runner/regress/test_import.py | 1 - 10 files changed, 147 insertions(+), 185 deletions(-) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 845b20c8db..7dac0ab352 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -41,7 +41,7 @@ use crate::tenant::{ use crate::virtual_file; use crate::{ IGNORED_TENANT_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME, - TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX, + TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, }; use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP; @@ -845,18 +845,7 @@ impl PageServerConf { .join(timeline_id.to_string()) } - pub fn timeline_uninit_mark_file_path( - &self, - tenant_shard_id: TenantShardId, - timeline_id: TimelineId, - ) -> Utf8PathBuf { - path_with_suffix_extension( - self.timeline_path(&tenant_shard_id, &timeline_id), - TIMELINE_UNINIT_MARK_SUFFIX, - ) - } - - pub fn timeline_delete_mark_file_path( + pub(crate) fn timeline_delete_mark_file_path( &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, @@ -867,7 +856,10 @@ impl PageServerConf { ) } - pub fn tenant_deleted_mark_file_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf { + pub(crate) fn tenant_deleted_mark_file_path( + &self, + tenant_shard_id: &TenantShardId, + ) -> Utf8PathBuf { self.tenant_path(tenant_shard_id) .join(TENANT_DELETED_MARKER_FILE_NAME) } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index d6fe9f6055..7d3ede21ce 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -535,9 +535,9 @@ async fn timeline_create_handler( ) } Err( - tenant::CreateTimelineError::Conflict - | tenant::CreateTimelineError::AlreadyCreating, - ) => json_response(StatusCode::CONFLICT, ()), + e @ tenant::CreateTimelineError::Conflict + | e @ tenant::CreateTimelineError::AlreadyCreating, + ) => json_response(StatusCode::CONFLICT, HttpErrorBody::from_msg(e.to_string())), Err(tenant::CreateTimelineError::AncestorLsn(err)) => json_response( StatusCode::NOT_ACCEPTABLE, HttpErrorBody::from_msg(format!("{err:#}")), diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 02a690d4e1..b00db02a1c 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -114,27 +114,27 @@ pub const METADATA_FILE_NAME: &str = "metadata"; /// Per-tenant configuration file. /// Full path: `tenants//config`. -pub const TENANT_CONFIG_NAME: &str = "config"; +pub(crate) const TENANT_CONFIG_NAME: &str = "config"; /// Per-tenant configuration file. /// Full path: `tenants//config`. -pub const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1"; +pub(crate) const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1"; /// Per-tenant copy of their remote heatmap, downloaded into the local /// tenant path while in secondary mode. -pub const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json"; +pub(crate) const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json"; /// A suffix used for various temporary files. Any temporary files found in the /// data directory at pageserver startup can be automatically removed. 
-pub const TEMP_FILE_SUFFIX: &str = "___temp"; +pub(crate) const TEMP_FILE_SUFFIX: &str = "___temp"; /// A marker file to mark that a timeline directory was not fully initialized. /// If a timeline directory with this marker is encountered at pageserver startup, /// the timeline directory and the marker file are both removed. /// Full path: `tenants//timelines/___uninit`. -pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit"; +pub(crate) const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit"; -pub const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete"; +pub(crate) const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete"; /// A marker file to prevent pageserver from loading a certain tenant on restart. /// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding @@ -161,11 +161,11 @@ fn ends_with_suffix(path: &Utf8Path, suffix: &str) -> bool { // from the directory name. Instead create type "UninitMark(TimelineId)" and only parse it once // from the name. -pub fn is_uninit_mark(path: &Utf8Path) -> bool { +pub(crate) fn is_uninit_mark(path: &Utf8Path) -> bool { ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX) } -pub fn is_delete_mark(path: &Utf8Path) -> bool { +pub(crate) fn is_delete_mark(path: &Utf8Path) -> bool { ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX) } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index f0996328c0..ddfb47369b 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -55,8 +55,8 @@ use self::mgr::GetTenantError; use self::mgr::TenantsMap; use self::remote_timeline_client::upload::upload_index_part; use self::remote_timeline_client::RemoteTimelineClient; +use self::timeline::uninit::TimelineCreateGuard; use self::timeline::uninit::TimelineExclusionError; -use self::timeline::uninit::TimelineUninitMark; use self::timeline::uninit::UninitializedTimeline; use self::timeline::EvictionTaskTenantState; use self::timeline::TimelineResources; @@ -565,9 +565,8 @@ impl Tenant { // avoiding holding it across awaits let mut timelines_accessor = self.timelines.lock().unwrap(); match timelines_accessor.entry(timeline_id) { + // We should never try and load the same timeline twice during startup Entry::Occupied(_) => { - // The uninit mark file acts as a lock that prevents another task from - // initializing the timeline at the same time. unreachable!( "Timeline {tenant_id}/{timeline_id} already exists in the tenant map" ); @@ -1064,8 +1063,7 @@ impl Tenant { let entry_path = entry.path(); let purge = if crate::is_temporary(entry_path) - // TODO: uninit_mark isn't needed any more, since uninitialized timelines are already - // covered by the check that the timeline must exist in remote storage. + // TODO: remove uninit mark code (https://github.com/neondatabase/neon/issues/5718) || is_uninit_mark(entry_path) || crate::is_delete_mark(entry_path) { @@ -1298,11 +1296,6 @@ impl Tenant { /// Until that happens, the on-disk state is invalid (disk_consistent_lsn=Lsn(0)) /// and the timeline will fail to load at a restart. /// - /// That's why we add an uninit mark file, and wrap it together witht the Timeline - /// in-memory object into UninitializedTimeline. - /// Once the caller is done setting up the timeline, they should call - /// `UninitializedTimeline::initialize_with_lock` to remove the uninit mark. - /// /// For tests, use `DatadirModification::init_empty_test_timeline` + `commit` to setup the /// minimum amount of keys required to get a writable timeline. 
/// (Without it, `put` might fail due to `repartition` failing.) @@ -1318,7 +1311,9 @@ impl Tenant { "Cannot create empty timelines on inactive tenant" ); - let timeline_uninit_mark = self.create_timeline_uninit_mark(new_timeline_id)?; + // Protect against concurrent attempts to use this TimelineId + let create_guard = self.create_timeline_create_guard(new_timeline_id)?; + let new_metadata = TimelineMetadata::new( // Initialize disk_consistent LSN to 0, The caller must import some data to // make it valid, before calling finish_creation() @@ -1333,7 +1328,7 @@ impl Tenant { self.prepare_new_timeline( new_timeline_id, &new_metadata, - timeline_uninit_mark, + create_guard, initdb_lsn, None, ) @@ -1421,9 +1416,8 @@ impl Tenant { .map_err(|_| CreateTimelineError::ShuttingDown)?; // Get exclusive access to the timeline ID: this ensures that it does not already exist, - // and that no other creation attempts will be allowed in while we are working. The - // uninit_mark is a guard. - let uninit_mark = match self.create_timeline_uninit_mark(new_timeline_id) { + // and that no other creation attempts will be allowed in while we are working. + let create_guard = match self.create_timeline_create_guard(new_timeline_id) { Ok(m) => m, Err(TimelineExclusionError::AlreadyCreating) => { // Creation is in progress, we cannot create it again, and we cannot @@ -1466,6 +1460,8 @@ impl Tenant { } }; + pausable_failpoint!("timeline-creation-after-uninit"); + let loaded_timeline = match ancestor_timeline_id { Some(ancestor_timeline_id) => { let ancestor_timeline = self @@ -1513,7 +1509,7 @@ impl Tenant { &ancestor_timeline, new_timeline_id, ancestor_start_lsn, - uninit_mark, + create_guard, ctx, ) .await? @@ -1523,7 +1519,7 @@ impl Tenant { new_timeline_id, pg_version, load_existing_initdb, - uninit_mark, + create_guard, ctx, ) .await? 
@@ -2870,9 +2866,9 @@ impl Tenant { start_lsn: Option, ctx: &RequestContext, ) -> Result, CreateTimelineError> { - let uninit_mark = self.create_timeline_uninit_mark(dst_id).unwrap(); + let create_guard = self.create_timeline_create_guard(dst_id).unwrap(); let tl = self - .branch_timeline_impl(src_timeline, dst_id, start_lsn, uninit_mark, ctx) + .branch_timeline_impl(src_timeline, dst_id, start_lsn, create_guard, ctx) .await?; tl.set_state(TimelineState::Active); Ok(tl) @@ -2886,10 +2882,10 @@ impl Tenant { src_timeline: &Arc, dst_id: TimelineId, start_lsn: Option, - timeline_uninit_mark: TimelineUninitMark<'_>, + timeline_create_guard: TimelineCreateGuard<'_>, ctx: &RequestContext, ) -> Result, CreateTimelineError> { - self.branch_timeline_impl(src_timeline, dst_id, start_lsn, timeline_uninit_mark, ctx) + self.branch_timeline_impl(src_timeline, dst_id, start_lsn, timeline_create_guard, ctx) .await } @@ -2898,7 +2894,7 @@ impl Tenant { src_timeline: &Arc, dst_id: TimelineId, start_lsn: Option, - timeline_uninit_mark: TimelineUninitMark<'_>, + timeline_create_guard: TimelineCreateGuard<'_>, _ctx: &RequestContext, ) -> Result, CreateTimelineError> { let src_id = src_timeline.timeline_id; @@ -2982,7 +2978,7 @@ impl Tenant { .prepare_new_timeline( dst_id, &metadata, - timeline_uninit_mark, + timeline_create_guard, start_lsn + 1, Some(Arc::clone(src_timeline)), ) @@ -3014,12 +3010,12 @@ impl Tenant { load_existing_initdb: Option, ctx: &RequestContext, ) -> anyhow::Result> { - let uninit_mark = self.create_timeline_uninit_mark(timeline_id).unwrap(); + let create_guard = self.create_timeline_create_guard(timeline_id).unwrap(); self.bootstrap_timeline( timeline_id, pg_version, load_existing_initdb, - uninit_mark, + create_guard, ctx, ) .await @@ -3083,7 +3079,7 @@ impl Tenant { timeline_id: TimelineId, pg_version: u32, load_existing_initdb: Option, - timeline_uninit_mark: TimelineUninitMark<'_>, + timeline_create_guard: TimelineCreateGuard<'_>, ctx: &RequestContext, ) -> anyhow::Result> { // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/` @@ -3095,13 +3091,14 @@ impl Tenant { TEMP_FILE_SUFFIX, ); - // an uninit mark was placed before, nothing else can access this timeline files - // current initdb was not run yet, so remove whatever was left from the previous runs + // Remove whatever was left from the previous runs: safe because TimelineCreateGuard guarantees + // we won't race with other creations or existent timelines with the same path. if pgdata_path.exists() { fs::remove_dir_all(&pgdata_path).with_context(|| { format!("Failed to remove already existing initdb directory: {pgdata_path}") })?; } + // this new directory is very temporary, set to remove it immediately after bootstrap, we don't need it scopeguard::defer! { if let Err(e) = fs::remove_dir_all(&pgdata_path) { @@ -3178,7 +3175,7 @@ impl Tenant { .prepare_new_timeline( timeline_id, &new_metadata, - timeline_uninit_mark, + timeline_create_guard, pgdata_lsn, None, ) @@ -3250,13 +3247,12 @@ impl Tenant { /// /// An empty layer map is initialized, and new data and WAL can be imported starting /// at 'disk_consistent_lsn'. After any initial data has been imported, call - /// `finish_creation` to insert the Timeline into the timelines map and to remove the - /// uninit mark file. + /// `finish_creation` to insert the Timeline into the timelines map. 
async fn prepare_new_timeline<'a>( &'a self, new_timeline_id: TimelineId, new_metadata: &TimelineMetadata, - uninit_mark: TimelineUninitMark<'a>, + create_guard: TimelineCreateGuard<'a>, start_lsn: Lsn, ancestor: Option>, ) -> anyhow::Result { @@ -3279,9 +3275,12 @@ impl Tenant { timeline_struct.init_empty_layer_map(start_lsn); - if let Err(e) = self.create_timeline_files(&uninit_mark.timeline_path).await { + if let Err(e) = self + .create_timeline_files(&create_guard.timeline_path) + .await + { error!("Failed to create initial files for timeline {tenant_shard_id}/{new_timeline_id}, cleaning up: {e:?}"); - cleanup_timeline_directory(uninit_mark); + cleanup_timeline_directory(create_guard); return Err(e); } @@ -3292,41 +3291,31 @@ impl Tenant { Ok(UninitializedTimeline::new( self, new_timeline_id, - Some((timeline_struct, uninit_mark)), + Some((timeline_struct, create_guard)), )) } async fn create_timeline_files(&self, timeline_path: &Utf8Path) -> anyhow::Result<()> { crashsafe::create_dir(timeline_path).context("Failed to create timeline directory")?; - fail::fail_point!("after-timeline-uninit-mark-creation", |_| { - anyhow::bail!("failpoint after-timeline-uninit-mark-creation"); + fail::fail_point!("after-timeline-dir-creation", |_| { + anyhow::bail!("failpoint after-timeline-dir-creation"); }); Ok(()) } - /// Attempts to create an uninit mark file for the timeline initialization. - /// Bails, if the timeline is already loaded into the memory (i.e. initialized before), or the uninit mark file already exists. - /// - /// This way, we need to hold the timelines lock only for small amount of time during the mark check/creation per timeline init. - fn create_timeline_uninit_mark( + /// Get a guard that provides exclusive access to the timeline directory, preventing + /// concurrent attempts to create the same timeline. + fn create_timeline_create_guard( &self, timeline_id: TimelineId, - ) -> Result { + ) -> Result { let tenant_shard_id = self.tenant_shard_id; - let uninit_mark_path = self - .conf - .timeline_uninit_mark_file_path(tenant_shard_id, timeline_id); let timeline_path = self.conf.timeline_path(&tenant_shard_id, &timeline_id); - let uninit_mark = TimelineUninitMark::new( - self, - timeline_id, - uninit_mark_path.clone(), - timeline_path.clone(), - )?; + let create_guard = TimelineCreateGuard::new(self, timeline_id, timeline_path.clone())?; // At this stage, we have got exclusive access to in-memory state for this timeline ID // for creation. @@ -3342,23 +3331,7 @@ impl Tenant { ))); } - // Create the on-disk uninit mark _after_ the in-memory acquisition of the tenant ID: guarantees - // that during process runtime, colliding creations will be caught in-memory without getting - // as far as failing to write a file. - fs::OpenOptions::new() - .write(true) - .create_new(true) - .open(&uninit_mark_path) - .context("Failed to create uninit mark file") - .and_then(|_| { - crashsafe::fsync_file_and_parent(&uninit_mark_path) - .context("Failed to fsync uninit mark file") - }) - .with_context(|| { - format!("Failed to crate uninit mark for timeline {tenant_shard_id}/{timeline_id}") - })?; - - Ok(uninit_mark) + Ok(create_guard) } /// Gathers inputs from all of the timelines to produce a sizing model input. 
@@ -5099,15 +5072,15 @@ mod tests { } #[tokio::test] - async fn test_uninit_mark_crash() -> anyhow::Result<()> { - let name = "test_uninit_mark_crash"; + async fn test_create_guard_crash() -> anyhow::Result<()> { + let name = "test_create_guard_crash"; let harness = TenantHarness::create(name)?; { let (tenant, ctx) = harness.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx) .await?; - // Keeps uninit mark in place + // Leave the timeline ID in [`Tenant::timelines_creating`] to exclude attempting to create it again let raw_tline = tline.raw_timeline().unwrap(); raw_tline .shutdown() @@ -5135,11 +5108,6 @@ mod tests { .timeline_path(&tenant.tenant_shard_id, &TIMELINE_ID) .exists()); - assert!(!harness - .conf - .timeline_uninit_mark_file_path(tenant.tenant_shard_id, TIMELINE_ID) - .exists()); - Ok(()) } diff --git a/pageserver/src/tenant/timeline/uninit.rs b/pageserver/src/tenant/timeline/uninit.rs index 27d6fd9c28..e1034a9fe2 100644 --- a/pageserver/src/tenant/timeline/uninit.rs +++ b/pageserver/src/tenant/timeline/uninit.rs @@ -2,8 +2,8 @@ use std::{collections::hash_map::Entry, fs, sync::Arc}; use anyhow::Context; use camino::Utf8PathBuf; -use tracing::{error, info, info_span, warn}; -use utils::{crashsafe, fs_ext, id::TimelineId, lsn::Lsn}; +use tracing::{error, info, info_span}; +use utils::{fs_ext, id::TimelineId, lsn::Lsn}; use crate::{context::RequestContext, import_datadir, tenant::Tenant}; @@ -11,22 +11,22 @@ use super::Timeline; /// A timeline with some of its files on disk, being initialized. /// This struct ensures the atomicity of the timeline init: it's either properly created and inserted into pageserver's memory, or -/// its local files are removed. In the worst case of a crash, an uninit mark file is left behind, which causes the directory -/// to be removed on next restart. +/// its local files are removed. If we crash while this class exists, then the timeline's local +/// state is cleaned up during [`Tenant::clean_up_timelines`], because the timeline's content isn't in remote storage. /// /// The caller is responsible for proper timeline data filling before the final init. #[must_use] pub struct UninitializedTimeline<'t> { pub(crate) owning_tenant: &'t Tenant, timeline_id: TimelineId, - raw_timeline: Option<(Arc, TimelineUninitMark<'t>)>, + raw_timeline: Option<(Arc, TimelineCreateGuard<'t>)>, } impl<'t> UninitializedTimeline<'t> { pub(crate) fn new( owning_tenant: &'t Tenant, timeline_id: TimelineId, - raw_timeline: Option<(Arc, TimelineUninitMark<'t>)>, + raw_timeline: Option<(Arc, TimelineCreateGuard<'t>)>, ) -> Self { Self { owning_tenant, @@ -35,8 +35,7 @@ impl<'t> UninitializedTimeline<'t> { } } - /// Finish timeline creation: insert it into the Tenant's timelines map and remove the - /// uninit mark file. + /// Finish timeline creation: insert it into the Tenant's timelines map /// /// This function launches the flush loop if not already done. /// @@ -72,16 +71,9 @@ impl<'t> UninitializedTimeline<'t> { Entry::Vacant(v) => { // after taking here should be no fallible operations, because the drop guard will not // cleanup after and would block for example the tenant deletion - let (new_timeline, uninit_mark) = + let (new_timeline, _create_guard) = self.raw_timeline.take().expect("already checked"); - // this is the mutual exclusion between different retries to create the timeline; - // this should be an assertion. 
- uninit_mark.remove_uninit_mark().with_context(|| { - format!( - "Failed to remove uninit mark file for timeline {tenant_shard_id}/{timeline_id}" - ) - })?; v.insert(Arc::clone(&new_timeline)); new_timeline.maybe_spawn_flush_loop(); @@ -120,8 +112,7 @@ impl<'t> UninitializedTimeline<'t> { .await .context("Failed to flush after basebackup import")?; - // All the data has been imported. Insert the Timeline into the tenant's timelines - // map and remove the uninit mark file. + // All the data has been imported. Insert the Timeline into the tenant's timelines map let tl = self.finish_creation()?; tl.activate(broker_client, None, ctx); Ok(tl) @@ -143,37 +134,35 @@ impl<'t> UninitializedTimeline<'t> { impl Drop for UninitializedTimeline<'_> { fn drop(&mut self) { - if let Some((_, uninit_mark)) = self.raw_timeline.take() { + if let Some((_, create_guard)) = self.raw_timeline.take() { let _entered = info_span!("drop_uninitialized_timeline", tenant_id = %self.owning_tenant.tenant_shard_id.tenant_id, shard_id = %self.owning_tenant.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id).entered(); error!("Timeline got dropped without initializing, cleaning its files"); - cleanup_timeline_directory(uninit_mark); + cleanup_timeline_directory(create_guard); } } } -pub(crate) fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) { - let timeline_path = &uninit_mark.timeline_path; +pub(crate) fn cleanup_timeline_directory(create_guard: TimelineCreateGuard) { + let timeline_path = &create_guard.timeline_path; match fs_ext::ignore_absent_files(|| fs::remove_dir_all(timeline_path)) { Ok(()) => { - info!("Timeline dir {timeline_path:?} removed successfully, removing the uninit mark") + info!("Timeline dir {timeline_path:?} removed successfully") } Err(e) => { error!("Failed to clean up uninitialized timeline directory {timeline_path:?}: {e:?}") } } - drop(uninit_mark); // mark handles its deletion on drop, gets retained if timeline dir exists + // Having cleaned up, we can release this TimelineId in `[Tenant::timelines_creating]` to allow other + // timeline creation attempts under this TimelineId to proceed + drop(create_guard); } -/// An uninit mark file, created along the timeline dir to ensure the timeline either gets fully initialized and loaded into pageserver's memory, -/// or gets removed eventually. -/// -/// XXX: it's important to create it near the timeline dir, not inside it to ensure timeline dir gets removed first. +/// A guard for timeline creations in process: as long as this object exists, the timeline ID +/// is kept in `[Tenant::timelines_creating]` to exclude concurrent attempts to create the same timeline. #[must_use] -pub(crate) struct TimelineUninitMark<'t> { +pub(crate) struct TimelineCreateGuard<'t> { owning_tenant: &'t Tenant, timeline_id: TimelineId, - uninit_mark_deleted: bool, - uninit_mark_path: Utf8PathBuf, pub(crate) timeline_path: Utf8PathBuf, } @@ -190,11 +179,10 @@ pub(crate) enum TimelineExclusionError { Other(#[from] anyhow::Error), } -impl<'t> TimelineUninitMark<'t> { +impl<'t> TimelineCreateGuard<'t> { pub(crate) fn new( owning_tenant: &'t Tenant, timeline_id: TimelineId, - uninit_mark_path: Utf8PathBuf, timeline_path: Utf8PathBuf, ) -> Result { // Lock order: this is the only place we take both locks. 
During drop() we only @@ -214,56 +202,14 @@ impl<'t> TimelineUninitMark<'t> { Ok(Self { owning_tenant, timeline_id, - uninit_mark_deleted: false, - uninit_mark_path, timeline_path, }) } } - - fn remove_uninit_mark(mut self) -> anyhow::Result<()> { - if !self.uninit_mark_deleted { - self.delete_mark_file_if_present()?; - } - - Ok(()) - } - - fn delete_mark_file_if_present(&mut self) -> anyhow::Result<()> { - let uninit_mark_file = &self.uninit_mark_path; - let uninit_mark_parent = uninit_mark_file - .parent() - .with_context(|| format!("Uninit mark file {uninit_mark_file:?} has no parent"))?; - fs_ext::ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| { - format!("Failed to remove uninit mark file at path {uninit_mark_file:?}") - })?; - crashsafe::fsync(uninit_mark_parent).context("Failed to fsync uninit mark parent")?; - self.uninit_mark_deleted = true; - - Ok(()) - } } -impl Drop for TimelineUninitMark<'_> { +impl Drop for TimelineCreateGuard<'_> { fn drop(&mut self) { - if !self.uninit_mark_deleted { - if self.timeline_path.exists() { - error!( - "Uninit mark {} is not removed, timeline {} stays uninitialized", - self.uninit_mark_path, self.timeline_path - ) - } else { - // unblock later timeline creation attempts - warn!( - "Removing intermediate uninit mark file {}", - self.uninit_mark_path - ); - if let Err(e) = self.delete_mark_file_if_present() { - error!("Failed to remove the uninit mark file: {e}") - } - } - } - self.owning_tenant .timelines_creating .lock() diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index 8ff4341cc0..839d4166c7 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -55,7 +55,6 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( # FIXME: These need investigation ".*manual_gc.*is_shutdown_requested\\(\\) called in an unexpected task or thread.*", ".*tenant_list: timeline is not found in remote index while it is present in the tenants registry.*", - ".*Removing intermediate uninit mark file.*", # Tenant::delete_timeline() can cause any of the four following errors. 
# FIXME: we shouldn't be considering it an error: https://github.com/neondatabase/neon/issues/2946 ".*could not flush frozen layer.*queue is in state Stopped", # when schedule layer upload fails because queued got closed before compaction got killed diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 6e082374d7..4e355b73a9 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -34,7 +34,7 @@ class TimelineCreate406(PageserverApiException): class TimelineCreate409(PageserverApiException): def __init__(self, res: requests.Response): assert res.status_code == 409 - super().__init__("", res.status_code) + super().__init__(res.json()["msg"], res.status_code) @dataclass diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index 9a0b91b54e..2a7a3c41ac 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -347,6 +347,64 @@ def test_non_uploaded_branch_is_deleted_after_restart(neon_env_builder: NeonEnvB ps_http.timeline_detail(env.initial_tenant, branch_id) +def test_duplicate_creation(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_configs() + env.start() + env.pageserver.tenant_create(env.initial_tenant) + + success_timeline = TimelineId.generate() + log.info(f"Creating timeline {success_timeline}") + ps_http = env.pageserver.http_client() + success_result = ps_http.timeline_create( + env.pg_version, env.initial_tenant, success_timeline, timeout=60 + ) + + ps_http.configure_failpoints(("timeline-creation-after-uninit", "pause")) + + def start_creating_timeline(): + log.info(f"Creating (expect failure) timeline {env.initial_timeline}") + with pytest.raises(RequestException): + ps_http.timeline_create( + env.pg_version, env.initial_tenant, env.initial_timeline, timeout=60 + ) + + t = threading.Thread(target=start_creating_timeline) + try: + t.start() + + wait_until_paused(env, "timeline-creation-after-uninit") + + # While timeline creation is in progress, trying to create a timeline + # again with the same ID should return 409 + with pytest.raises( + PageserverApiException, match="creation of timeline with the given ID is in progress" + ): + ps_http.timeline_create( + env.pg_version, env.initial_tenant, env.initial_timeline, timeout=60 + ) + + # Creation of a timeline already successfully created is idempotent, and is not impeded by some + # other timeline creation with a different TimelineId being stuck. + repeat_result = ps_http.timeline_create( + env.pg_version, env.initial_tenant, success_timeline, timeout=60 + ) + assert repeat_result == success_result + finally: + env.pageserver.stop(immediate=True) + t.join() + + # now without a failpoint + env.pageserver.start() + + wait_until_tenant_active(ps_http, env.initial_tenant) + + with pytest.raises(PageserverApiException, match="not found"): + ps_http.timeline_detail(env.initial_tenant, env.initial_timeline) + + # The one successfully created timeline should still be there. 
+ assert len(ps_http.timeline_list(tenant_id=env.initial_tenant)) == 1 + + def wait_until_paused(env: NeonEnv, failpoint: str): found = False msg = f"at failpoint {failpoint}" diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index b046ed7f1b..804ad135ce 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -204,7 +204,7 @@ def test_timeline_init_break_before_checkpoint_recreate( assert timeline_id == new_timeline_id -def test_timeline_create_break_after_uninit_mark(neon_env_builder: NeonEnvBuilder): +def test_timeline_create_break_after_dir_creation(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() @@ -214,9 +214,9 @@ def test_timeline_create_break_after_uninit_mark(neon_env_builder: NeonEnvBuilde old_tenant_timelines = env.neon_cli.list_timelines(tenant_id) initial_timeline_dirs = [d for d in timelines_dir.iterdir()] - # Introduce failpoint when creating a new timeline uninit mark, before any other files were created - pageserver_http.configure_failpoints(("after-timeline-uninit-mark-creation", "return")) - with pytest.raises(Exception, match="after-timeline-uninit-mark-creation"): + # Introduce failpoint when creating a new timeline, right after creating its directory + pageserver_http.configure_failpoints(("after-timeline-dir-creation", "return")) + with pytest.raises(Exception, match="after-timeline-dir-creation"): _ = pageserver_http.timeline_create(PgVersion.NOT_SET, tenant_id, TimelineId.generate()) # Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally. diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index ec57860033..132427ba2d 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -90,7 +90,6 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build [ ".*error importing base backup .*", ".*Timeline got dropped without initializing, cleaning its files.*", - ".*Removing intermediate uninit mark file.*", ".*InternalServerError.*timeline not found.*", ".*InternalServerError.*Tenant .* not found.*", ".*InternalServerError.*Timeline .* not found.*", From bf187aa13f8527ea81de370d14984a6cc5889ecd Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 15 Mar 2024 17:30:13 +0200 Subject: [PATCH 0412/1571] fix(layer): metric miscalculations (#7137) Split off from #7030: - each early exit is counted as canceled init, even though it most likely was just `LayerInner::keep_resident` doing the no-download repair check - `downloaded_after` could had been accounted for multiple times, and also when repairing to match on-disk state Cc: #5331 --- pageserver/src/tenant/storage_layer/layer.rs | 63 ++++++++++++++------ 1 file changed, 45 insertions(+), 18 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 959065bc4c..0200ff8cf4 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -710,10 +710,6 @@ impl LayerInner { // disable any scheduled but not yet running eviction deletions for this let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed); - // count cancellations, which currently remain largely unexpected - let init_cancelled = - scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); - // no need to make the 
evict_and_wait wait for the actual download to complete drop(self.status.send(Status::Downloaded)); @@ -722,7 +718,9 @@ impl LayerInner { .upgrade() .ok_or_else(|| DownloadError::TimelineShutdown)?; - // FIXME: grab a gate + // count cancellations, which currently remain largely unexpected + let init_cancelled = + scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); let can_ever_evict = timeline.remote_client.as_ref().is_some(); @@ -731,9 +729,17 @@ impl LayerInner { let needs_download = self .needs_download() .await - .map_err(DownloadError::PreStatFailed)?; + .map_err(DownloadError::PreStatFailed); - let permit = if let Some(reason) = needs_download { + let needs_download = match needs_download { + Ok(reason) => reason, + Err(e) => { + scopeguard::ScopeGuard::into_inner(init_cancelled); + return Err(e); + } + }; + + let (permit, downloaded) = if let Some(reason) = needs_download { if let NeedsDownload::NotFile(ft) = reason { return Err(DownloadError::NotFile(ft)); } @@ -744,36 +750,59 @@ impl LayerInner { self.wanted_evicted.store(false, Ordering::Release); if !can_ever_evict { + scopeguard::ScopeGuard::into_inner(init_cancelled); return Err(DownloadError::NoRemoteStorage); } if let Some(ctx) = ctx { - self.check_expected_download(ctx)?; + let res = self.check_expected_download(ctx); + if let Err(e) = res { + scopeguard::ScopeGuard::into_inner(init_cancelled); + return Err(e); + } } if !allow_download { // this does look weird, but for LayerInner the "downloading" means also changing // internal once related state ... + scopeguard::ScopeGuard::into_inner(init_cancelled); return Err(DownloadError::DownloadRequired); } tracing::info!(%reason, "downloading on-demand"); - self.spawn_download_and_wait(timeline, permit).await? + let permit = self.spawn_download_and_wait(timeline, permit).await; + + let permit = match permit { + Ok(permit) => permit, + Err(e) => { + scopeguard::ScopeGuard::into_inner(init_cancelled); + return Err(e); + } + }; + + (permit, true) } else { // the file is present locally, probably by a previous but cancelled call to // get_or_maybe_download. alternatively we might be running without remote storage. 
LAYER_IMPL_METRICS.inc_init_needed_no_download(); - permit + (permit, false) }; - let since_last_eviction = - self.last_evicted_at.lock().unwrap().map(|ts| ts.elapsed()); - if let Some(since_last_eviction) = since_last_eviction { - // FIXME: this will not always be recorded correctly until #6028 (the no - // download needed branch above) - LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction); + scopeguard::ScopeGuard::into_inner(init_cancelled); + + if downloaded { + let since_last_eviction = self + .last_evicted_at + .lock() + .unwrap() + .take() + .map(|ts| ts.elapsed()); + + if let Some(since_last_eviction) = since_last_eviction { + LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction); + } } let res = Arc::new(DownloadedLayer { @@ -795,8 +824,6 @@ impl LayerInner { ); } - scopeguard::ScopeGuard::into_inner(init_cancelled); - Ok((ResidentOrWantedEvicted::Resident(res), permit)) } .instrument(tracing::info_span!("get_or_maybe_download", layer=%self)) From 59b6cce4189eab1f66c805dc6a8f73b9f37f7063 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 15 Mar 2024 17:54:28 +0200 Subject: [PATCH 0413/1571] heavier_once_cell: add detached init support (#7135) Aiming for the design where `heavier_once_cell::OnceCell` is initialized by a future factory lead to awkwardness with how `LayerInner::get_or_maybe_download` looks right now with the `loop`. The loop helps with two situations: - an eviction has been scheduled but has not yet happened, and a read access should cancel the eviction - a previous `LayerInner::get_or_maybe_download` that canceled a pending eviction was canceled leaving the `heavier_once_cell::OnceCell` uninitialized but needing repair by the next `LayerInner::get_or_maybe_download` By instead supporting detached initialization in `heavier_once_cell::OnceCell` via an `OnceCell::get_or_detached_init`, we can fix what the monolithic #7030 does: - spawned off download task initializes the `heavier_once_cell::OnceCell` regardless of the download starter being canceled - a canceled `LayerInner::get_or_maybe_download` no longer stops eviction but can win it if not canceled Split off from #7030. Cc: #5331 --- libs/utils/src/sync/heavier_once_cell.rs | 78 ++++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs index 703a6dfd52..a3aee45b58 100644 --- a/libs/utils/src/sync/heavier_once_cell.rs +++ b/libs/utils/src/sync/heavier_once_cell.rs @@ -110,6 +110,49 @@ impl OnceCell { } } + /// Returns a guard to an existing initialized value, or returns an unique initialization + /// permit which can be used to initialize this `OnceCell` using `OnceCell::set`. + pub async fn get_or_init_detached(&self) -> Result, InitPermit> { + // It looks like OnceCell::get_or_init could be implemented using this method instead of + // duplication. However, that makes the future be !Send due to possibly holding on to the + // MutexGuard over an await point. 
+ loop { + let sem = { + let guard = self.inner.lock().unwrap(); + if guard.value.is_some() { + return Ok(Guard(guard)); + } + guard.init_semaphore.clone() + }; + + { + let permit = { + // increment the count for the duration of queued + let _guard = CountWaitingInitializers::start(self); + sem.acquire().await + }; + + let Ok(permit) = permit else { + let guard = self.inner.lock().unwrap(); + if !Arc::ptr_eq(&sem, &guard.init_semaphore) { + // there was a take_and_deinit in between + continue; + } + assert!( + guard.value.is_some(), + "semaphore got closed, must be initialized" + ); + return Ok(Guard(guard)); + }; + + permit.forget(); + } + + let permit = InitPermit(sem); + return Err(permit); + } + } + /// Assuming a permit is held after previous call to [`Guard::take_and_deinit`], it can be used /// to complete initializing the inner value. /// @@ -481,4 +524,39 @@ mod tests { assert_eq!("t1", *cell.get().unwrap()); } + + #[tokio::test(start_paused = true)] + async fn detached_init_smoke() { + let target = OnceCell::default(); + + let Err(permit) = target.get_or_init_detached().await else { + unreachable!("it is not initialized") + }; + + tokio::time::timeout( + std::time::Duration::from_secs(3600 * 24 * 7 * 365), + target.get_or_init(|permit2| async { Ok::<_, Infallible>((11, permit2)) }), + ) + .await + .expect_err("should timeout since we are already holding the permit"); + + target.set(42, permit); + + let (_answer, permit) = { + let mut guard = target + .get_or_init(|permit| async { Ok::<_, Infallible>((11, permit)) }) + .await + .unwrap(); + + assert_eq!(*guard, 42); + + guard.take_and_deinit() + }; + + assert!(target.get().is_none()); + + target.set(11, permit); + + assert_eq!(*target.get().unwrap(), 11); + } } From 7d32af5ad5ec316761cb29d6a8141f4baf68735b Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 15 Mar 2024 15:57:01 +0000 Subject: [PATCH 0414/1571] .github: apply timeout to pytest `regress` (#7142) These test runs usually take 20-30 minutes. if something hangs, we see actions proceeding for several hours: it's more convenient to have them time out sooner so that we notice that something has hung faster. --- .github/workflows/build_and_test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 810c61de2d..2bcda7cc8e 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -461,6 +461,7 @@ jobs: - name: Pytest regression tests uses: ./.github/actions/run-python-test-set + timeout-minutes: 60 with: build_type: ${{ matrix.build_type }} test_selection: regress From 67522ce83d1b6b754e6e50dec79425ba13fdc993 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 15 Mar 2024 16:00:04 +0000 Subject: [PATCH 0415/1571] docs: shard splitting RFC (#6358) Extend the previous sharding RFC with functionality for dynamically splitting shards to increase the total shard count on existing tenants. 
--- docs/rfcs/032-shard-splitting.md | 479 +++++++++++++++++++++++++++++++ 1 file changed, 479 insertions(+) create mode 100644 docs/rfcs/032-shard-splitting.md diff --git a/docs/rfcs/032-shard-splitting.md b/docs/rfcs/032-shard-splitting.md new file mode 100644 index 0000000000..d5fbda8415 --- /dev/null +++ b/docs/rfcs/032-shard-splitting.md @@ -0,0 +1,479 @@ +# Shard splitting + +## Summary + +This RFC describes a new pageserver API for splitting an existing tenant shard into +multiple shards, and describes how to use this API to safely increase the total +shard count of a tenant. + +## Motivation + +In the [sharding RFC](031-sharding-static.md), a mechanism was introduced to scale +tenants beyond the capacity of a single pageserver by breaking up the key space +into stripes, and distributing these stripes across many pageservers. However, +the shard count was defined once at tenant creation time and not varied thereafter. + +In practice, the expected size of a database is rarely known at creation time, and +it is inefficient to enable sharding for very small tenants: we need to be +able to create a tenant with a small number of shards (such as 1), and later expand +when it becomes clear that the tenant has grown in size to a point where sharding +is beneficial. + +### Prior art + +Many distributed systems have the problem of choosing how many shards to create for +tenants that do not specify an expected size up-front. There are a couple of general +approaches: + +- Write to a key space in order, and start a new shard when the highest key advances + past some point. This doesn't work well for Neon, because we write to our key space + in many different contiguous ranges (per relation), rather than in one contiguous + range. To adapt to this kind of model, we would need a sharding scheme where each + relation had its own range of shards, which would be inefficient for the common + case of databases with many small relations. +- Monitor the system, and automatically re-shard at some size threshold. For + example in Ceph, the [pg_autoscaler](https://github.com/ceph/ceph/blob/49c27499af4ee9a90f69fcc6bf3597999d6efc7b/src/pybind/mgr/pg_autoscaler/module.py) + component monitors the size of each RADOS Pool, and adjusts the number of Placement + Groups (Ceph's shard equivalent). + +## Requirements + +- A configurable capacity limit per-shard is enforced. +- Changes in shard count do not interrupt service beyond requiring postgres + to reconnect (i.e. milliseconds). +- Human being does not have to choose shard count + +## Non Goals + +- Shard splitting is always a tenant-global operation: we will not enable splitting + one shard while leaving others intact. +- The inverse operation (shard merging) is not described in this RFC. This is a lower + priority than splitting, because databases grow more often than they shrink, and + a database with many shards will still work properly if the stored data shrinks, just + with slightly more overhead (e.g. redundant WAL replication) +- Shard splitting is only initiated based on capacity bounds, not load. Splitting + a tenant based on load will make sense for some medium-capacity, high-load workloads, + but is more complex to reason about and likely is not desirable until we have + shard merging to reduce the shard count again if the database becomes less busy. 
+ +## Impacted Components + +pageserver, storage controller + +(the _storage controller_ is the evolution of what was called `attachment_service` in our test environment) + +## Terminology + +**Parent** shards are the shards that exist before a split. **Child** shards are +the new shards created during a split. + +**Shard** is synonymous with _tenant shard_. + +**Shard Index** is the 2-tuple of shard number and shard count, written in +paths as {:02x}{:02x}, e.g. `0001`. + +## Background + +In the implementation section, a couple of existing aspects of sharding are important +to remember: + +- Shard identifiers contain the shard number and count, so that "shard 0 of 1" (`0001`) is + a distinct shard from "shard 0 of 2" (`0002`). This is the case in key paths, local + storage paths, and remote index metadata. +- Remote layer file paths contain the shard index of the shard that created them, and + remote indices contain the same index to enable building the layer file path. A shard's + index may reference layers that were created by another shard. +- Local tenant shard directories include the shard index. All layers downloaded by + a tenant shard are stored in this shard-prefixed path, even if those layers were + initially created by another shard: tenant shards do not read and write one another's + paths. +- The `Tenant` pageserver type represents one tenant _shard_, not the whole tenant. + This is for historical reasons and will be cleaned up in future, but the existing + name is used here to help comprehension when reading code. + +## Implementation + +Note: this section focuses on the correctness of the core split process. This will +be fairly inefficient in a naive implementation, and several important optimizations +are described in a later section. + +There are broadly two parts to the implementation: + +1. The pageserver split API, which splits one shard on one pageserver +2. The overall tenant split process, which is coordinated by the storage controller + and calls into the pageserver split API as needed. + +### Pageserver Split API + +The pageserver will expose a new API endpoint at `/v1/tenant/:tenant_shard_id/shard_split` +that takes the new total shard count in the body. + +The pageserver split API operates on one tenant shard, on one pageserver. External +coordination is required to use it safely; this is described in the later +'Split procedure' section. + +#### Preparation + +First identify the shard indices for the new child shards. These are deterministic, +calculated from the parent shard's index, and the number of children being created (this +is an input to the API, and validated to be a power of two). In a trivial example, splitting +0001 in two always results in 0002 and 0102. + +Child shard indices are chosen such that the children's parts of the keyspace will +be subsets of the parent's parts of the keyspace. + +#### Step 1: write new remote indices + +In remote storage, splitting is very simple: we may just write new index_part.json +objects for each child shard, containing exactly the same layers as the parent shard. + +The children will have more data than they need, but this avoids any exhaustive +re-writing or copying of layer files. + +The index key path includes a generation number: the parent shard's current +attached generation number will also be used for the child shards' indices.
This +makes the operation safely retryable: if everything crashes and restarts, we may +call the split API again on the parent shard, and the result will be some new remote +indices for the child shards, under a higher generation number. + +#### Step 2: start new `Tenant` objects + +A new `Tenant` object may be instantiated for each child shard, while the parent +shard still exists. When calling the tenant_spawn function for this object, +the remote index from step 1 will be read, and the child shard will start +to ingest WAL to catch up from whatever was in the remote storage at step 1. + +We now wait for child shards' WAL ingestion to catch up with the parent shard, +so that we can safely tear down the parent shard without risking an availability +gap to clients reading recent LSNs. + +#### Step 3: tear down parent `Tenant` object + +Once child shards are running and have caught up with WAL ingest, we no longer +need the parent shard. Note that clients may still be using it -- when we +shut it down, any page_service handlers will also shut down, causing clients +to disconnect. When the client reconnects, it will re-lookup the tenant, +and hit the child shard instead of the parent (shard lookup from page_service +should bias toward higher ShardCount shards). + +Note that at this stage the page service client has not yet been notified of +any split. In the trivial single split example: + +- Shard 0001 is gone: Tenant object torn down +- Shards 0002 and 0102 are running on the same pageserver where Shard 0001 used to live. +- Clients will continue to connect to that server thinking that shard 0001 is there, + and all requests will work, because any key that was in shard 0001 is definitely + available in either shard 0002 or shard 0102. +- Eventually, the storage controller (not the pageserver) will decide to migrate + some child shards away: at that point it will do a live migration, ensuring + that the client has an updated configuration before it detaches anything + from the original server. + +#### Complete + +When we send a 200 response to the split request, we are promising the caller: + +- That the child shards are persistent in remote storage +- That the parent shard has been shut down + +This enables the caller to proceed with the overall shard split operation, which +may involve other shards on other pageservers. + +### Storage Controller Split procedure + +Splitting a tenant requires calling the pageserver split API, and tracking +enough state to ensure recovery + completion in the event of any component (pageserver +or storage controller) crashing (or request timing out) during the split. + +1. call the split API on all existing shards. Ensure that the resulting + child shards are pinned to their pageservers until _all_ the split calls are done. + This pinning may be implemented as a "split bit" on the tenant shards, that + blocks any migrations, and also acts as a sign that if we restart, we must go + through some recovery steps to resume the split. +2. Once all the split calls are done, we may unpin the child shards (clear + the split bit). The split is now complete: subsequent steps are just migrations, + not strictly part of the split. +3. Try to schedule new pageserver locations for the child shards, using + a soft anti-affinity constraint to place shards from the same tenant onto different + pageservers. + +Updating computes about the new shard count is not necessary until we migrate +any of the child shards away from the parent's location. 
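+
+As a rough illustration of this ordering (the types and the `split_one_shard` callback
+below are hypothetical stand-ins, not the real storage controller code), the happy path
+of the coordinator could be sketched as:
+
+```rust
+struct TenantState {
+    split_in_progress: bool, // the "split bit"
+    parent_shards: Vec<u8>,  // shard numbers under the old shard count
+}
+
+fn run_split(
+    tenant: &mut TenantState,
+    new_shard_count: u8,
+    split_one_shard: &mut dyn FnMut(u8, u8) -> Result<(), String>,
+) -> Result<(), String> {
+    // 1. Set the split bit (persisted in the database in the real system) before
+    //    calling any pageserver, so a restart can detect the in-flight split and
+    //    either resume it or roll it back.
+    tenant.split_in_progress = true;
+
+    // 2. Call the split API on every parent shard. Child shards stay pinned to
+    //    the parent's pageserver while the split bit is set; any failure here
+    //    leads to rollback rather than completion.
+    for &shard_number in &tenant.parent_shards {
+        split_one_shard(shard_number, new_shard_count)?;
+    }
+
+    // 3. Only once every call has succeeded is the split complete; clearing the
+    //    bit unpins the children. Migrating them to other pageservers happens
+    //    afterwards and is not part of the split itself.
+    tenant.split_in_progress = false;
+    Ok(())
+}
+```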
+ +### Recovering from failures + +#### Rolling back an incomplete split + +An incomplete shard split may be rolled back quite simply, by attaching the parent shards to pageservers, +and detaching child shards. This will lose any WAL ingested into the children after the parents +were detached earlier, but the parents will catch up. + +No special pageserver API is needed for this. From the storage controllers point of view, the +procedure is: + +1. For all parent shards in the tenant, ensure they are attached +2. For all child shards, ensure they are not attached +3. Drop child shards from the storage controller's database, and clear the split bit on the parent shards. + +Any remote storage content for child shards is left behind. This is similar to other cases where +we may leave garbage objects in S3 (e.g. when we upload a layer but crash before uploading an +index that references it). Future online scrub/cleanup functionality can remove these objects, or +they will be removed when the tenant is deleted, as tenant deletion lists all objects in the prefix, +which would include any child shards that were rolled back. + +If any timelines had been created on child shards, they will be lost when rolling back. To mitigate +this, we will **block timeline creation during splitting**, so that we can safely roll back until +the split is complete, without risking losing timelines. + +Rolling back an incomplete split will happen automatically if a split fails due to some fatal +reason, and will not be accessible via an API: + +- A pageserver fails to complete its split API request after too many retries +- A pageserver returns a fatal unexpected error such as 400 or 500 +- The storage controller database returns a non-retryable error +- Some internal invariant is violated in the storage controller split code + +#### Rolling back a complete split + +A complete shard split may be rolled back similarly to an incomplete split, with the following +modifications: + +- The parent shards will no longer exist in the storage controller database, so these must + be re-synthesized somehow: the hard part of this is figuring the parent shards' generations. This + may be accomplished either by probing in S3, or by retaining some tombstone state for deleted + shards in the storage controller database. +- Any timelines that were created after the split complete will disappear when rolling back + to the tenant shards. For this reason, rolling back after a complete split should only + be done due to serious issues where loss of recently created timelines is acceptable, or + in cases where we have confirmed that no timelines were created in the intervening period. +- Parent shards' layers must not have been deleted: this property will come "for free" when + we first roll out sharding, by simply not implementing deletion of parent layers after + a split. When we do implement such deletion (see "Cleaning up parent-shard layers" in the + Optimizations section), it should apply a TTL to layers such that we have a + defined walltime window in which rollback will be possible. + +The storage controller will expose an API for rolling back a complete split, for use +in the field if we encounter some critical bug with a post-split tenant. + +#### Retrying API calls during Pageserver Restart + +When a pageserver restarts during a split API call, it may witness on-disk content for both parent and +child shards from an ongoing split. 
This does not intrinsically break anything, and the +pageserver may include all these shards in its `/re-attach` request to the storage controller. + +In order to support such restarts, it is important that the storage controller stores +persistent records of each child shard before it calls into a pageserver, as these child shards +may require generation increments via a `/re-attach` request. + +The pageserver restart will also result in a failed API call from the storage controller's point +of view. Recall that if _any_ pageserver fails to split, the overall split operation may not +complete, and all shards must remain pinned to their current pageserver locations until the +split is done. + +The pageserver API calls during splitting will retry on transient errors, so that +short availability gaps do not result in a failure of the overall operation. The +split in progress will be automatically rolled back if the threshold for API +retries is reached (e.g. if a pageserver stays offline for longer than a typical +restart). + +#### Rollback on Storage Controller Restart + +On startup, the storage controller will inspect the split bit for tenant shards that +it loads from the database. If any splits are in progress: + +- Database content will be reverted to the parent shards +- Child shards will be dropped from memory +- The parent and child shards will be included in the general startup reconciliation that + the storage controller does: any child shards will be detached from pageservers because + they don't exist in the storage controller's expected set of shards, and parent shards + will be attached if they aren't already. + +#### Storage controller API request failures/retries + +The split request handler will implement idempotency: if the [`Tenant`] requested to split +doesn't exist, we will check for the would-be child shards, and if they already exist, +we consider the request complete. + +If a request is retried while the original request is still underway, then the split +request handler will notice an InProgress marker in TenantManager, and return 503 +to encourage the client to back off and retry. This is the same as the general pageserver +API handling for calls that try to act on an InProgress shard. + +#### Compute start/restart during a split + +If a compute starts up during a split, it will be configured with the old sharding +configuration. This will work for reads irrespective of the progress of the split +as long as no child shards have been migrated away from their original location, and +this is guaranteed in the split procedure (see earlier section). + +#### Pageserver fails permanently during a split + +If a pageserver permanently fails (i.e. the storage controller availability state for it +goes to Offline) while a split is in progress, the splitting operation will roll back, and +during the rollback it will skip any API calls to the offline pageserver. If the offline +pageserver becomes available again, any stale locations will be cleaned up via the normal reconciliation process (the `/re-attach` API). + +### Handling secondary locations + +For correctness, it is not necessary to split secondary locations. We can simply detach +the secondary locations for parent shards, and then attach new secondary locations +for child shards. + +Clearly this is not optimal, as it will result in re-downloads of layer files that +were already present on disk.
See "Splitting secondary locations" + +### Conditions to trigger a split + +The pageserver will expose a new API for reporting on shards that are candidates +for split: this will return a top-N report of the largest tenant shards by +physical size (remote size). This should exclude any tenants that are already +at the maximum configured shard count. + +The API would look something like: +`/v1/top_n_tenant?shard_count_lt=8&sort_by=resident_size` + +The storage controller will poll that API across all pageservers it manages at some appropriate interval (e.g. 60 seconds). + +A split operation will be started when the tenant exceeds some threshold. This threshold +should be _less than_ how large we actually want shards to be, perhaps much less. That's to +minimize the amount of work involved in splitting -- if we want 100GiB shards, we shouldn't +wait for a tenant to exceed 100GiB before we split anything. Some data analysis of existing +tenant size distribution may be useful here: if we can make a statement like "usually, if +a tenant has exceeded 20GiB they're probably going to exceed 100GiB later", then we might +make our policy to split a tenant at 20GiB. + +The finest split we can do is by factors of two, but we can do higher-cardinality splits +too, and this will help to reduce the overhead of repeatedly re-splitting a tenant +as it grows. An example of a very simple heuristic for early deployment of the splitting +feature would be: "Split tenants into 8 shards when their physical size exceeds 64GiB": that +would give us two kinds of tenant (1 shard and 8 shards), and the confidence that once we had +split a tenant, it will not need re-splitting soon after. + +## Optimizations + +### Flush parent shard to remote storage during split + +Any data that is in WAL but not remote storage at time of split will need +to be replayed by child shards when they start for the first time. To minimize +this work, we may flush the parent shard to remote storage before writing the +remote indices for child shards. + +It is important that this flush is subject to some time bounds: we may be splitting +in response to a surge of write ingest, so it may be time-critical to split. A +few seconds to flush latest data should be sufficient to optimize common cases without +running the risk of holding up a split for a harmful length of time when a parent +shard is being written heavily. If the flush doesn't complete in time, we may proceed +to shut down the parent shard and carry on with the split. + +### Hard linking parent layers into child shard directories + +Before we start the Tenant objects for child shards, we may pre-populate their +local storage directories with hard links to the layer files already present +in the parent shard's local directory. When the child shard starts and downloads +its remote index, it will find all those layer files already present on local disk. + +This avoids wasting download capacity and makes splitting faster, but more importantly +it avoids taking up a factor of N more disk space when splitting 1 shard into N. + +This mechanism will work well in typical flows where shards are migrated away +promptly after a split, but for the general case including what happens when +layers are evicted and re-downloaded after a split, see the 'Proactive compaction' +section below. 
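+
+A minimal sketch of the hard-linking step (simplified directory layout and error
+handling; not the actual pageserver code) might look like:
+
+```rust
+use std::{fs, io, path::Path};
+
+/// Populate a child shard's directory with hard links to the parent shard's
+/// layer files, so the split consumes no additional disk capacity up front.
+fn hardlink_parent_layers(parent_dir: &Path, child_dir: &Path) -> io::Result<()> {
+    fs::create_dir_all(child_dir)?;
+    for entry in fs::read_dir(parent_dir)? {
+        let entry = entry?;
+        if entry.file_type()?.is_file() {
+            // A hard link shares the inode with the original layer file: both
+            // names refer to the same on-disk data, so no extra space is used
+            // until one of the copies is evicted and re-downloaded.
+            fs::hard_link(entry.path(), child_dir.join(entry.file_name()))?;
+        }
+    }
+    Ok(())
+}
+```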
+ +### Filtering during compaction + +Compaction, especially image layer generation, should skip any keys that are +present in a shard's layer files, but do not match the shard's ShardIdentity's +is_key_local() check. This avoids carrying around data for longer than necessary +in post-split compactions. + +This was already implemented in https://github.com/neondatabase/neon/pull/6246 + +### Proactive compaction + +In remote storage, there is little reason to rewrite any data on a shard split: +all the children can reference parent layers via the very cheap write of the child +index_part.json. + +In local storage, things are more nuanced. During the initial split there is no +capacity cost to duplicating parent layers, if we implement the hard linking +optimization described above. However, as soon as any layers are evicted from +local disk and re-downloaded, the downloaded layers will not be hard-links any more: +they'll have real capacity footprint. That isn't a problem if we migrate child shards +away from the parent node swiftly, but it risks a significant over-use of local disk +space if we do not. + +For example, if we did an 8-way split of a shard, and then _didn't_ migrate 7 of +the shards elsewhere, then churned all the layers in all the shards via eviction, +then we would blow up the storage capacity used on the node by 8x. If we're splitting +a 100GB shard, that could take the pageserver to the point of exhausting disk space. + +To avoid this scenario, we could implement a special compaction mode where we just +read historic layers, drop unwanted keys, and write back the layer file. This +is pretty expensive, but useful if we have split a large shard and are not going to +migrate the child shards away. + +The heuristic conditions for triggering such a compaction are: + +- A) eviction plus time: if a child shard + has existed for more than a time threshold, and has been requested to perform at least one eviction, then it becomes urgent for this child shard to execute a proactive compaction to reduce its storage footprint, at the cost of I/O load. +- B) resident size plus time: we may inspect the resident layers and calculate how + many of them include the overhead of storing pre-split keys. After some time + threshold (different to the one in case A) we still have such layers occupying + local disk space, then we should proactively compact them. + +### Cleaning up parent-shard layers + +It is functionally harmless to leave parent shard layers in remote storage indefinitely. +They would be cleaned up in the event of the tenant's deletion. + +As an optimization to avoid leaking remote storage capacity (which costs money), we may +lazily clean up parent shard layers once no child shards reference them. + +This may be done _very_ lazily: e.g. check every PITR interval. The cleanup procedure is: + +- list all the key prefixes beginning with the tenant ID, and select those shard prefixes + which do not belong to the most-recently-split set of shards (_ancestral shards_, i.e. `shard*count < max(shard_count) over all shards)`, and those shard prefixes which do have the latest shard count (_current shards_) +- If there are no _ancestral shard_ prefixes found, we have nothing to clean up and + may drop out now. +- find the latest-generation index for each _current shard_, read all and accumulate the set of layers belonging to ancestral shards referenced by these indices. 
+- for all ancestral shards, list objects in the prefix and delete any layer which was not + referenced by a current shard. + +If this cleanup is scheduled for 1-2 PITR periods after the split, there is a good chance that child shards will have written their own image layers covering the whole keyspace, such that all parent shard layers will be deletable. + +The cleanup may be done by the scrubber (external process), or we may choose to have +the zeroth shard in the latest generation do the work -- there is no obstacle to one shard +reading the other shard's indices at runtime, and we do not require visibility of the +latest index writes. + +Cleanup should be artificially delayed by some period (for example 24 hours) to ensure +that we retain the option to roll back a split in case of bugs. + +### Splitting secondary locations + +We may implement a pageserver API similar to the main splitting API, which does a simpler +operation for secondary locations: it would not write anything to S3, instead it would simply +create the child shard directory on local disk, hard link in directories from the parent, +and set up the in memory (TenantSlot) state for the children. + +Similar to attached locations, a subset of secondary locations will probably need re-locating +after the split is complete, to avoid leaving multiple child shards on the same pageservers, +where they may use excessive space for the tenant. + +## FAQ/Alternatives + +### What should the thresholds be set to? + +Shard size limit: the pre-sharding default capacity quota for databases was 200GiB, so this could be a starting point for the per-shard size limit. + +Max shard count: + +- The safekeeper overhead to sharding is currently O(N) network bandwidth because + the un-filtered WAL is sent to all shards. To avoid this growing out of control, + a limit of 8 shards should be temporarily imposed until WAL filtering is implemented + on the safekeeper. +- there is also little benefit to increasing the shard count beyond the number + of pageservers in a region. + +### Is it worth just rewriting all the data during a split to simplify reasoning about space? From bc1efa827f794fee9ac3755b4d51b344dd2cefbf Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 15 Mar 2024 16:07:36 +0000 Subject: [PATCH 0416/1571] pageserver: exclude gc_horizon from synthetic size calculation (#6407) ## Problem See: - https://github.com/neondatabase/neon/issues/6374 ## Summary of changes Whereas previously we calculated synthetic size from the gc_horizon or the pitr_interval (whichever is the lower LSN), now we ignore gc_horizon and exclusively start from the `pitr_interval`. This is a more generous calculation for billing, where we do not charge users for data retained due to gc_horizon. --- pageserver/src/tenant/size.rs | 8 +++- pageserver/src/tenant/timeline.rs | 5 ++- test_runner/regress/test_tenant_size.py | 54 +++++++++++++++++++------ 3 files changed, 53 insertions(+), 14 deletions(-) diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index e0b1652d98..ad79b74d8b 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -183,7 +183,13 @@ pub(super) async fn gather_inputs( // new gc run, which we have no control over. however differently from `Timeline::gc` // we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not // actually removing files. 
- let mut next_gc_cutoff = cmp::min(gc_info.horizon_cutoff, gc_info.pitr_cutoff); + // + // We only consider [`GcInfo::pitr_cutoff`], and not [`GcInfo::horizon_cutoff`], because from + // a user's perspective they have only requested retention up to the time bound (pitr_cutoff), rather + // than a space bound (horizon cutoff). This means that if someone drops a database and waits for their + // PITR interval, they will see synthetic size decrease, even if we are still storing data inside + // horizon_cutoff. + let mut next_gc_cutoff = gc_info.pitr_cutoff; // If the caller provided a shorter retention period, use that instead of the GC cutoff. let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d507a19de9..2ab7301cce 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3784,8 +3784,11 @@ impl Timeline { // The timestamp is in the future. That sounds impossible, // but what it really means is that there hasn't been // any commits since the cutoff timestamp. + // + // In this case we should use the LSN of the most recent commit, + // which is implicitly the last LSN in the log. debug!("future({})", lsn); - cutoff_horizon + self.get_last_record_lsn() } LsnForTimestamp::Past(lsn) => { debug!("past({})", lsn); diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 7cea301a9c..025cc930d7 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -1,3 +1,4 @@ +import os from pathlib import Path from typing import List, Tuple @@ -326,7 +327,7 @@ def test_only_heads_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Pa size_debug_file.write(size_debug) -@pytest.mark.xfail +@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build") def test_single_branch_get_tenant_size_grows( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, pg_version: PgVersion ): @@ -349,10 +350,21 @@ def test_single_branch_get_tenant_size_grows( # adjust the gc_horizon accordingly. 
if pg_version == PgVersion.V14: gc_horizon = 0x4A000 + elif pg_version == PgVersion.V15: + gc_horizon = 0x3BA00 + elif pg_version == PgVersion.V16: + gc_horizon = 210000 + else: + raise NotImplementedError(pg_version) - neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='0s', gc_period='0s', pitr_interval='0sec', gc_horizon={gc_horizon}}}" + tenant_config = { + "compaction_period": "0s", + "gc_period": "0s", + "pitr_interval": "0s", + "gc_horizon": gc_horizon, + } - env = neon_env_builder.init_start() + env = neon_env_builder.init_start(initial_tenant_conf=tenant_config) tenant_id = env.initial_tenant branch_name, timeline_id = env.neon_cli.list_timelines(tenant_id)[0] @@ -405,6 +417,7 @@ def test_single_branch_get_tenant_size_grows( current_lsn = after_lsn size_debug_file.write(size_debug) assert size > 0 + log.info(f"size: {size} at lsn {current_lsn}") return (current_lsn, size) with env.endpoints.create_start( @@ -492,24 +505,41 @@ def test_single_branch_get_tenant_size_grows( collected_responses.append(("DELETE", current_lsn, size)) + size_before_drop = get_current_consistent_size( + env, endpoint, size_debug_file, http_client, tenant_id, timeline_id + )[1] + with endpoint.cursor() as cur: cur.execute("DROP TABLE t0") + # Without setting a PITR interval, dropping the table doesn't reclaim any space + # from the user's point of view, because the DROP transaction is too small + # to fall out of gc_horizon. + (current_lsn, size) = get_current_consistent_size( + env, endpoint, size_debug_file, http_client, tenant_id, timeline_id + ) + prev_size = collected_responses[-1][2] + check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size) + + # Set a tiny PITR interval to allow the DROP to impact the synthetic size + # Because synthetic size calculation uses pitr interval when available, + # when our tenant is configured with a tiny pitr interval, dropping a table should + # cause synthetic size to go down immediately + tenant_config["pitr_interval"] = "1ms" + env.pageserver.http_client().set_tenant_config(tenant_id, tenant_config) + (current_lsn, size) = get_current_consistent_size( + env, endpoint, size_debug_file, http_client, tenant_id, timeline_id + ) + assert size < size_before_drop + # The size of the tenant should still be as large as before we dropped # the table, because the drop operation can still be undone in the PITR # defined by gc_horizon. - (current_lsn, size) = get_current_consistent_size( - env, endpoint, size_debug_file, http_client, tenant_id, timeline_id - ) - - prev_size = collected_responses[-1][2] - - check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size) - collected_responses.append(("DROP", current_lsn, size)) # Should have gone past gc_horizon, otherwise gc_horizon is too large - assert current_lsn - initdb_lsn > gc_horizon + bytes_written = current_lsn - initdb_lsn + assert bytes_written > gc_horizon # this isn't too many lines to forget for a while. 
observed while # developing these tests that locally the value is a bit more than what we From 60f30000ef4c9b4c06aa0ba0455af4c4f0ba3940 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 15 Mar 2024 18:46:04 +0100 Subject: [PATCH 0417/1571] tokio-epoll-uring: fallback to std-fs if not available & not explicitly requested (#7120) fixes https://github.com/neondatabase/neon/issues/7116 Changes: - refactor PageServerConfigBuilder: support not-set values - implement runtime feature test - use runtime feature test to determine `virtual_file_io_engine` if not explicitly configured in the config - log the effective engine at startup - drive-by: improve assertion messages in `test_pageserver_init_node_id` This needed a tiny bit of tokio-epoll-uring work, hence bumping it. Changelog: ``` git log --no-decorate --oneline --reverse 868d2c42b5d54ca82fead6e8f2f233b69a540d3e..342ddd197a060a8354e8f11f4d12994419fff939 c7a74c6 Bump mio from 0.8.8 to 0.8.11 4df3466 Bump mio from 0.8.8 to 0.8.11 (#47) 342ddd1 lifecycle: expose `LaunchResult` enum (#49) ``` --- Cargo.lock | 4 +- pageserver/src/bin/pageserver.rs | 3 + pageserver/src/config.rs | 235 ++++++++++----------- pageserver/src/virtual_file.rs | 2 + pageserver/src/virtual_file/io_engine.rs | 76 +++++++ test_runner/regress/test_pageserver_api.py | 15 +- 6 files changed, 195 insertions(+), 140 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b8b276d74f..99ba8b1cb3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5887,7 +5887,7 @@ dependencies = [ [[package]] name = "tokio-epoll-uring" version = "0.1.0" -source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#868d2c42b5d54ca82fead6e8f2f233b69a540d3e" +source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939" dependencies = [ "futures", "nix 0.26.4", @@ -6424,7 +6424,7 @@ dependencies = [ [[package]] name = "uring-common" version = "0.1.0" -source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#868d2c42b5d54ca82fead6e8f2f233b69a540d3e" +source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939" dependencies = [ "bytes", "io-uring", diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 59750897ff..6380a4c6c1 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -120,6 +120,9 @@ fn main() -> anyhow::Result<()> { &[("node_id", &conf.id.to_string())], ); + // after setting up logging, log the effective IO engine choice + info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); + let tenants_path = conf.tenants_path(); if !tenants_path.exists() { utils::crashsafe::create_dir_all(conf.tenants_path()) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 7dac0ab352..8ad9ade4a9 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -30,15 +30,14 @@ use utils::{ logging::LogFormat, }; -use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig; -use crate::tenant::config::TenantConf; use crate::tenant::config::TenantConfOpt; use crate::tenant::timeline::GetVectoredImpl; use crate::tenant::vectored_blob_io::MaxVectoredReadBytes; use crate::tenant::{ TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME, }; -use crate::virtual_file; +use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine}; +use crate::{tenant::config::TenantConf, virtual_file}; use 
crate::{ IGNORED_TENANT_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, @@ -291,16 +290,23 @@ pub static SAFEKEEPER_AUTH_TOKEN: OnceCell> = OnceCell::new(); // use dedicated enum for builder to better indicate the intention // and avoid possible confusion with nested options +#[derive(Clone, Default)] pub enum BuilderValue { Set(T), + #[default] NotSet, } -impl BuilderValue { - pub fn ok_or(self, err: E) -> Result { +impl BuilderValue { + pub fn ok_or(&self, field_name: &'static str, default: BuilderValue) -> anyhow::Result { match self { - Self::Set(v) => Ok(v), - Self::NotSet => Err(err), + Self::Set(v) => Ok(v.clone()), + Self::NotSet => match default { + BuilderValue::Set(v) => Ok(v.clone()), + BuilderValue::NotSet => { + anyhow::bail!("missing config value {field_name:?}") + } + }, } } } @@ -326,6 +332,7 @@ pub(crate) struct NodeMetadata { } // needed to simplify config construction +#[derive(Default)] struct PageServerConfigBuilder { listen_pg_addr: BuilderValue, @@ -393,8 +400,9 @@ struct PageServerConfigBuilder { validate_vectored_get: BuilderValue, } -impl Default for PageServerConfigBuilder { - fn default() -> Self { +impl PageServerConfigBuilder { + #[inline(always)] + fn default_values() -> Self { use self::BuilderValue::*; use defaults::*; Self { @@ -647,125 +655,96 @@ impl PageServerConfigBuilder { } pub fn build(self) -> anyhow::Result { - let concurrent_tenant_warmup = self - .concurrent_tenant_warmup - .ok_or(anyhow!("missing concurrent_tenant_warmup"))?; - let concurrent_tenant_size_logical_size_queries = self - .concurrent_tenant_size_logical_size_queries - .ok_or(anyhow!( - "missing concurrent_tenant_size_logical_size_queries" - ))?; - Ok(PageServerConf { - listen_pg_addr: self - .listen_pg_addr - .ok_or(anyhow!("missing listen_pg_addr"))?, - listen_http_addr: self - .listen_http_addr - .ok_or(anyhow!("missing listen_http_addr"))?, - availability_zone: self - .availability_zone - .ok_or(anyhow!("missing availability_zone"))?, - wait_lsn_timeout: self - .wait_lsn_timeout - .ok_or(anyhow!("missing wait_lsn_timeout"))?, - wal_redo_timeout: self - .wal_redo_timeout - .ok_or(anyhow!("missing wal_redo_timeout"))?, - superuser: self.superuser.ok_or(anyhow!("missing superuser"))?, - page_cache_size: self - .page_cache_size - .ok_or(anyhow!("missing page_cache_size"))?, - max_file_descriptors: self - .max_file_descriptors - .ok_or(anyhow!("missing max_file_descriptors"))?, - workdir: self.workdir.ok_or(anyhow!("missing workdir"))?, - pg_distrib_dir: self - .pg_distrib_dir - .ok_or(anyhow!("missing pg_distrib_dir"))?, - http_auth_type: self - .http_auth_type - .ok_or(anyhow!("missing http_auth_type"))?, - pg_auth_type: self.pg_auth_type.ok_or(anyhow!("missing pg_auth_type"))?, - auth_validation_public_key_path: self - .auth_validation_public_key_path - .ok_or(anyhow!("missing auth_validation_public_key_path"))?, - remote_storage_config: self - .remote_storage_config - .ok_or(anyhow!("missing remote_storage_config"))?, - id: self.id.ok_or(anyhow!("missing id"))?, - // TenantConf is handled separately - default_tenant_conf: TenantConf::default(), - broker_endpoint: self - .broker_endpoint - .ok_or(anyhow!("No broker endpoints provided"))?, - broker_keepalive_interval: self - .broker_keepalive_interval - .ok_or(anyhow!("No broker keepalive interval provided"))?, - log_format: self.log_format.ok_or(anyhow!("missing log_format"))?, - concurrent_tenant_warmup: ConfigurableSemaphore::new(concurrent_tenant_warmup), - 
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new( - concurrent_tenant_size_logical_size_queries, - ), - eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new( - concurrent_tenant_size_logical_size_queries, - ), - metric_collection_interval: self - .metric_collection_interval - .ok_or(anyhow!("missing metric_collection_interval"))?, - cached_metric_collection_interval: self - .cached_metric_collection_interval - .ok_or(anyhow!("missing cached_metric_collection_interval"))?, - metric_collection_endpoint: self - .metric_collection_endpoint - .ok_or(anyhow!("missing metric_collection_endpoint"))?, - synthetic_size_calculation_interval: self - .synthetic_size_calculation_interval - .ok_or(anyhow!("missing synthetic_size_calculation_interval"))?, - disk_usage_based_eviction: self - .disk_usage_based_eviction - .ok_or(anyhow!("missing disk_usage_based_eviction"))?, - test_remote_failures: self - .test_remote_failures - .ok_or(anyhow!("missing test_remote_failuers"))?, - ondemand_download_behavior_treat_error_as_warn: self - .ondemand_download_behavior_treat_error_as_warn - .ok_or(anyhow!( - "missing ondemand_download_behavior_treat_error_as_warn" - ))?, - background_task_maximum_delay: self - .background_task_maximum_delay - .ok_or(anyhow!("missing background_task_maximum_delay"))?, - control_plane_api: self - .control_plane_api - .ok_or(anyhow!("missing control_plane_api"))?, - control_plane_api_token: self - .control_plane_api_token - .ok_or(anyhow!("missing control_plane_api_token"))?, - control_plane_emergency_mode: self - .control_plane_emergency_mode - .ok_or(anyhow!("missing control_plane_emergency_mode"))?, - heatmap_upload_concurrency: self - .heatmap_upload_concurrency - .ok_or(anyhow!("missing heatmap_upload_concurrency"))?, - secondary_download_concurrency: self - .secondary_download_concurrency - .ok_or(anyhow!("missing secondary_download_concurrency"))?, - ingest_batch_size: self - .ingest_batch_size - .ok_or(anyhow!("missing ingest_batch_size"))?, - virtual_file_io_engine: self - .virtual_file_io_engine - .ok_or(anyhow!("missing virtual_file_io_engine"))?, - get_vectored_impl: self - .get_vectored_impl - .ok_or(anyhow!("missing get_vectored_impl"))?, - max_vectored_read_bytes: self - .max_vectored_read_bytes - .ok_or(anyhow!("missing max_vectored_read_bytes"))?, - validate_vectored_get: self - .validate_vectored_get - .ok_or(anyhow!("missing validate_vectored_get"))?, - }) + let default = Self::default_values(); + + macro_rules! 
conf { + (USING DEFAULT { $($field:ident,)* } CUSTOM LOGIC { $($custom_field:ident : $custom_value:expr,)* } ) => { + PageServerConf { + $( + $field: self.$field.ok_or(stringify!($field), default.$field)?, + )* + $( + $custom_field: $custom_value, + )* + } + }; + } + + Ok(conf!( + USING DEFAULT + { + listen_pg_addr, + listen_http_addr, + availability_zone, + wait_lsn_timeout, + wal_redo_timeout, + superuser, + page_cache_size, + max_file_descriptors, + workdir, + pg_distrib_dir, + http_auth_type, + pg_auth_type, + auth_validation_public_key_path, + remote_storage_config, + id, + broker_endpoint, + broker_keepalive_interval, + log_format, + metric_collection_interval, + cached_metric_collection_interval, + metric_collection_endpoint, + synthetic_size_calculation_interval, + disk_usage_based_eviction, + test_remote_failures, + ondemand_download_behavior_treat_error_as_warn, + background_task_maximum_delay, + control_plane_api, + control_plane_api_token, + control_plane_emergency_mode, + heatmap_upload_concurrency, + secondary_download_concurrency, + ingest_batch_size, + get_vectored_impl, + max_vectored_read_bytes, + validate_vectored_get, + } + CUSTOM LOGIC + { + // TenantConf is handled separately + default_tenant_conf: TenantConf::default(), + concurrent_tenant_warmup: ConfigurableSemaphore::new({ + self + .concurrent_tenant_warmup + .ok_or("concurrent_tenant_warmpup", + default.concurrent_tenant_warmup)? + }), + concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new( + self + .concurrent_tenant_size_logical_size_queries + .ok_or("concurrent_tenant_size_logical_size_queries", + default.concurrent_tenant_size_logical_size_queries.clone())? + ), + eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new( + // re-use `concurrent_tenant_size_logical_size_queries` + self + .concurrent_tenant_size_logical_size_queries + .ok_or("eviction_task_immitated_concurrent_logical_size_queries", + default.concurrent_tenant_size_logical_size_queries.clone())?, + ), + virtual_file_io_engine: match self.virtual_file_io_engine { + BuilderValue::Set(v) => v, + BuilderValue::NotSet => match crate::virtual_file::io_engine_feature_test().context("auto-detect virtual_file_io_engine")? { + io_engine::FeatureTestResult::PlatformPreferred(v) => v, // make no noise + io_engine::FeatureTestResult::Worse { engine, remark } => { + // TODO: bubble this up to the caller so we can tracing::warn! it. 
+ eprintln!("auto-detected IO engine is not platform-preferred: engine={engine:?} remark={remark:?}"); + engine + } + }, + }, + } + )) } } diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 6d4774cf75..ae44e9edc4 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -28,6 +28,8 @@ use tokio::time::Instant; pub use pageserver_api::models::virtual_file as api; pub(crate) mod io_engine; +pub use io_engine::feature_test as io_engine_feature_test; +pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult; mod metadata; mod open_options; pub(crate) use io_engine::IoEngineKind; diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index e369d28711..7f2342e76e 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -253,3 +253,79 @@ impl IoEngine { } } } + +pub enum FeatureTestResult { + PlatformPreferred(IoEngineKind), + Worse { + engine: IoEngineKind, + remark: String, + }, +} + +impl FeatureTestResult { + #[cfg(target_os = "linux")] + const PLATFORM_PREFERRED: IoEngineKind = IoEngineKind::TokioEpollUring; + #[cfg(not(target_os = "linux"))] + const PLATFORM_PREFERRED: IoEngineKind = IoEngineKind::StdFs; +} + +impl From for IoEngineKind { + fn from(val: FeatureTestResult) -> Self { + match val { + FeatureTestResult::PlatformPreferred(e) => e, + FeatureTestResult::Worse { engine, .. } => engine, + } + } +} + +/// Somewhat costly under the hood, do only once. +/// Panics if we can't set up the feature test. +pub fn feature_test() -> anyhow::Result { + std::thread::spawn(|| { + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + #[cfg(not(target_os = "linux"))] + { + return Ok(FeatureTestResult::PlatformPreferred( + FeatureTestResult::PLATFORM_DEFAULT, + )); + } + #[cfg(target_os = "linux")] + Ok(match rt.block_on(tokio_epoll_uring::System::launch()) { + Ok(_) => FeatureTestResult::PlatformPreferred({ + assert!(matches!( + IoEngineKind::TokioEpollUring, + FeatureTestResult::PLATFORM_PREFERRED + )); + FeatureTestResult::PLATFORM_PREFERRED + }), + Err(tokio_epoll_uring::LaunchResult::IoUringBuild(e)) => { + let remark = match e.raw_os_error() { + Some(nix::libc::EPERM) => { + // fall back + "creating tokio-epoll-uring fails with EPERM, assuming it's admin-disabled " + .to_string() + } + Some(nix::libc::EFAULT) => { + // fail feature test + anyhow::bail!( + "creating tokio-epoll-uring fails with EFAULT, might have corrupted memory" + ); + } + Some(_) | None => { + // fall back + format!("creating tokio-epoll-uring fails with error: {e:#}") + } + }; + FeatureTestResult::Worse { + engine: IoEngineKind::StdFs, + remark, + } + } + }) + }) + .join() + .unwrap() +} diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index 877deee08f..81aed704bb 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -37,23 +37,18 @@ def test_pageserver_init_node_id( assert ( bad_init.returncode == 1 ), "pageserver should not be able to init new config without the node id" - assert "missing id" in bad_init.stderr + assert 'missing config value "id"' in bad_init.stderr assert not pageserver_config.exists(), "config file should not be created after init error" - completed_init = run_pageserver( - ["--init", "-c", "id = 12345", "-c", f'pg_distrib_dir="{pg_distrib_dir}"'] - ) + good_init_cmd = ["--init", "-c", "id = 12345", 
"-c", f'pg_distrib_dir="{pg_distrib_dir}"'] + completed_init = run_pageserver(good_init_cmd) assert ( completed_init.returncode == 0 ), "pageserver should be able to create a new config with the node id given" assert pageserver_config.exists(), "config file should be created successfully" - bad_reinit = run_pageserver( - ["--init", "-c", "id = 12345", "-c", f'pg_distrib_dir="{pg_distrib_dir}"'] - ) - assert ( - bad_reinit.returncode == 1 - ), "pageserver should not be able to init new config without the node id" + bad_reinit = run_pageserver(good_init_cmd) + assert bad_reinit.returncode == 1, "pageserver refuses to init if already exists" assert "already exists, cannot init it" in bad_reinit.stderr bad_update = run_pageserver(["--update-config", "-c", "id = 3"]) From 1aa159accac6f80b2565cf79d30ac584959fe32a Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 15 Mar 2024 18:03:49 +0000 Subject: [PATCH 0418/1571] pageserver: cancellation for remote ops in tenant deletion on shutdown (#6105) ## Problem Tenant deletion had a couple of TODOs where we weren't using proper cancellation tokens that would have aborted the deletions during process shutdown. ## Summary of changes - Refactor enough that deletion/shutdown code has access to the TenantManager's cancellation toke - Use that cancellation token in tenant deletion instead of dummy tokens. --- pageserver/src/bin/pageserver.rs | 4 +- pageserver/src/lib.rs | 9 ++++- pageserver/src/task_mgr.rs | 4 +- pageserver/src/tenant/delete.rs | 21 +++++------ pageserver/src/tenant/mgr.rs | 42 +++++++++++++-------- test_runner/regress/test_tenant_delete.py | 4 +- test_runner/regress/test_timeline_delete.py | 6 +-- 7 files changed, 51 insertions(+), 39 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 6380a4c6c1..1fd7c775d5 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -317,6 +317,7 @@ fn start_pageserver( let http_listener = tcp_listener::bind(http_addr)?; let pg_addr = &conf.listen_pg_addr; + info!("Starting pageserver pg protocol handler on {pg_addr}"); let pageserver_listener = tcp_listener::bind(pg_addr)?; @@ -549,7 +550,7 @@ fn start_pageserver( let router_state = Arc::new( http::routes::State::new( conf, - tenant_manager, + tenant_manager.clone(), http_auth.clone(), remote_storage.clone(), broker_client.clone(), @@ -693,6 +694,7 @@ fn start_pageserver( let bg_remote_storage = remote_storage.clone(); let bg_deletion_queue = deletion_queue.clone(); BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver( + &tenant_manager, bg_remote_storage.map(|_| bg_deletion_queue), 0, )); diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index b00db02a1c..f947a75f61 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -31,6 +31,7 @@ pub mod walredo; use crate::task_mgr::TaskKind; use camino::Utf8Path; use deletion_queue::DeletionQueue; +use tenant::mgr::TenantManager; use tracing::info; /// Current storage format version @@ -53,7 +54,11 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]); pub use crate::metrics::preinitialize_metrics; #[tracing::instrument(skip_all, fields(%exit_code))] -pub async fn shutdown_pageserver(deletion_queue: Option, exit_code: i32) { +pub async fn shutdown_pageserver( + tenant_manager: &TenantManager, + deletion_queue: Option, + exit_code: i32, +) { use std::time::Duration; // Shut down the libpq endpoint task. This prevents new connections from // being accepted. 
@@ -67,7 +72,7 @@ pub async fn shutdown_pageserver(deletion_queue: Option, exit_cod // Shut down all the tenants. This flushes everything to disk and kills // the checkpoint and GC tasks. timed( - tenant::mgr::shutdown_all_tenants(), + tenant_manager.shutdown(), "shutdown all tenants", Duration::from_secs(5), ) diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 275a72c0b0..69e163effa 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -50,8 +50,6 @@ use once_cell::sync::Lazy; use utils::id::TimelineId; -use crate::shutdown_pageserver; - // // There are four runtimes: // @@ -453,7 +451,7 @@ async fn task_finish( } if shutdown_process { - shutdown_pageserver(None, 1).await; + std::process::exit(1); } } diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index ffb7206b1e..cab60c3111 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -296,6 +296,7 @@ impl DeleteTenantFlow { remote_storage: Option, tenants: &'static std::sync::RwLock, tenant: Arc, + cancel: &CancellationToken, ) -> Result<(), DeleteTenantError> { span::debug_assert_current_span_has_tenant_id(); @@ -303,7 +304,9 @@ impl DeleteTenantFlow { let mut guard = Self::prepare(&tenant).await?; - if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await { + if let Err(e) = + Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant, cancel).await + { tenant.set_broken(format!("{e:#}")).await; return Err(e); } @@ -322,6 +325,7 @@ impl DeleteTenantFlow { conf: &'static PageServerConf, remote_storage: Option<&GenericRemoteStorage>, tenant: &Tenant, + cancel: &CancellationToken, ) -> Result<(), DeleteTenantError> { guard.mark_in_progress()?; @@ -335,15 +339,9 @@ impl DeleteTenantFlow { // Though sounds scary, different mark name? // Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state. if let Some(remote_storage) = &remote_storage { - create_remote_delete_mark( - conf, - remote_storage, - &tenant.tenant_shard_id, - // Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token - &CancellationToken::new(), - ) - .await - .context("remote_mark")? + create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel) + .await + .context("remote_mark")? } fail::fail_point!("tenant-delete-before-create-local-mark", |_| { @@ -546,8 +544,7 @@ impl DeleteTenantFlow { conf, remote_storage.as_ref(), &tenant.tenant_shard_id, - // Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token - &CancellationToken::new(), + &task_mgr::shutdown_token(), ) .await?; diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 7cf03d8fd6..3aaab6e4ef 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -102,7 +102,7 @@ pub(crate) enum TenantsMap { /// [`init_tenant_mgr`] is done, all on-disk tenants have been loaded. /// New tenants can be added using [`tenant_map_acquire_slot`]. Open(BTreeMap), - /// The pageserver has entered shutdown mode via [`shutdown_all_tenants`]. + /// The pageserver has entered shutdown mode via [`TenantManager::shutdown`]. /// Existing tenants are still accessible, but no new tenants can be created. 
ShuttingDown(BTreeMap), } @@ -261,6 +261,12 @@ pub struct TenantManager { // See https://github.com/neondatabase/neon/issues/5796 tenants: &'static std::sync::RwLock, resources: TenantSharedResources, + + // Long-running operations that happen outside of a [`Tenant`] lifetime should respect this token. + // This is for edge cases like tenant deletion. In normal cases (within a Tenant lifetime), + // tenants have their own cancellation tokens, which we fire individually in [`Self::shutdown`], or + // when the tenant detaches. + cancel: CancellationToken, } fn emergency_generations( @@ -620,6 +626,7 @@ pub async fn init_tenant_mgr( conf, tenants: &TENANTS, resources, + cancel: CancellationToken::new(), }) } @@ -680,21 +687,6 @@ pub(crate) fn tenant_spawn( Ok(tenant) } -/// -/// Shut down all tenants. This runs as part of pageserver shutdown. -/// -/// NB: We leave the tenants in the map, so that they remain accessible through -/// the management API until we shut it down. If we removed the shut-down tenants -/// from the tenants map, the management API would return 404 for these tenants, -/// because TenantsMap::get() now returns `None`. -/// That could be easily misinterpreted by control plane, the consumer of the -/// management API. For example, it could attach the tenant on a different pageserver. -/// We would then be in split-brain once this pageserver restarts. -#[instrument(skip_all)] -pub(crate) async fn shutdown_all_tenants() { - shutdown_all_tenants0(&TENANTS).await -} - async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { let mut join_set = JoinSet::new(); @@ -1428,6 +1420,7 @@ impl TenantManager { self.resources.remote_storage.clone(), &TENANTS, tenant, + &self.cancel, ) .await; @@ -1817,6 +1810,23 @@ impl TenantManager { Ok(()) } + + /// + /// Shut down all tenants. This runs as part of pageserver shutdown. + /// + /// NB: We leave the tenants in the map, so that they remain accessible through + /// the management API until we shut it down. If we removed the shut-down tenants + /// from the tenants map, the management API would return 404 for these tenants, + /// because TenantsMap::get() now returns `None`. + /// That could be easily misinterpreted by control plane, the consumer of the + /// management API. For example, it could attach the tenant on a different pageserver. + /// We would then be in split-brain once this pageserver restarts. + #[instrument(skip_all)] + pub(crate) async fn shutdown(&self) { + self.cancel.cancel(); + + shutdown_all_tenants0(self.tenants).await + } } #[derive(Debug, thiserror::Error)] diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 52de889084..a164c7f60a 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -184,7 +184,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints( # allow errors caused by failpoints f".*failpoint: {failpoint}", # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped - ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", # We may leave some upload tasks in the queue. They're likely deletes. # For uploads we explicitly wait with `last_flush_lsn_upload` below. 
# So by ignoring these instead of waiting for empty upload queue @@ -327,7 +327,7 @@ def test_tenant_delete_is_resumed_on_attach( # From deletion polling f".*NotFound: tenant {env.initial_tenant}.*", # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped - ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", # error from http response is also logged ".*InternalServerError\\(Tenant is marked as deleted on remote storage.*", '.*shutdown_pageserver{exit_code=0}: stopping left-over name="remote upload".*', diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 96a5cc491a..0eb1327c9e 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -204,7 +204,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints( [ f".*{timeline_id}.*failpoint: {failpoint}", # It appears when we stopped flush loop during deletion and then pageserver is stopped - ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", # This happens when we fail before scheduling background operation. # Timeline is left in stopping state and retry tries to stop it again. ".*Ignoring new state, equal to the existing one: Stopping", @@ -398,7 +398,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild ".*failpoint: timeline-delete-before-rm", ".*Ignoring new state, equal to the existing one: Stopping", # this happens, because the stuck timeline is visible to shutdown - ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", ] ) @@ -809,7 +809,7 @@ def test_timeline_delete_resumed_on_attach( # allow errors caused by failpoints f".*failpoint: {failpoint}", # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped - ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", # error from http response is also logged ".*InternalServerError\\(Tenant is marked as deleted on remote storage.*", # Polling after attach may fail with this From ad6f538aefe3286d827663c2ebdc28f4c2ba9613 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 15 Mar 2024 19:57:05 +0100 Subject: [PATCH 0419/1571] tokio-epoll-uring: use it for on-demand downloads (#6992) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # Problem On-demand downloads are still using `tokio::fs`, which we know is inefficient. 
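Editor's note, to make the owned-buffer idea concrete before the change list: with io_uring-style IO the kernel may still be using the buffer after submission, so the write interface moves the buffer in and hands it back on completion instead of borrowing `&[u8]`. The trait and sink below are simplified stand-ins, not the exact signatures this patch adds (the real `OwnedAsyncWriter` is generic over `tokio_epoll_uring` buffer types):

```rust
/// Owned-buffer write: the buffer is moved into the call and returned when the
/// IO completes, so it can be reused without copying. Contrast with
/// `tokio::io::AsyncWrite`, which borrows `&[u8]` for the duration of the call.
trait OwnedWrite {
    async fn write_all(&mut self, buf: Vec<u8>) -> std::io::Result<(usize, Vec<u8>)>;
}

/// In-memory sink, analogous to the `impl OwnedAsyncWriter for Vec<u8>`
/// included later in this patch.
struct MemSink(Vec<u8>);

impl OwnedWrite for MemSink {
    async fn write_all(&mut self, buf: Vec<u8>) -> std::io::Result<(usize, Vec<u8>)> {
        self.0.extend_from_slice(&buf);
        Ok((buf.len(), buf)) // hand the buffer back to the caller for reuse
    }
}
```

The copy loop added in this patch streams download chunks through such a writer, batching small writes into `BUFFER_SIZE`-sized ones before they reach the file.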
# Changes - Add `pagebench ondemand-download-churn` to quantify on-demand download throughput - Requires dumping layer map, which required making `history_buffer` impl `Deserialize` - Implement an equivalent of `tokio::io::copy_buf` for owned buffers => `owned_buffers_io` module and children. - Make layer file download sensitive to `io_engine::get()`, using VirtualFile + above copy loop - For this, I had to move some code into the `retry_download`, e.g., `sync_all()` call. Drive-by: - fix missing escaping in `scripts/ps_ec2_setup_instance_store` - if we failed in retry_download to create a file, we'd try to remove it, encounter `NotFound`, and `abort()` the process using `on_fatal_io_error`. This PR adds treats `NotFound` as a success. # Testing Functional - The copy loop is generic & unit tested. Performance - Used the `ondemand-download-churn` benchmark to manually test against real S3. - Results (public Notion page): https://neondatabase.notion.site/Benchmarking-tokio-epoll-uring-on-demand-downloads-2024-04-15-newer-code-03c0fdc475c54492b44d9627b6e4e710?pvs=4 - Performance is equivalent at low concurrency. Jumpier situation at high concurrency, but, still less CPU / throughput with tokio-epoll-uring. - It’s a win. # Future Work Turn the manual performance testing described in the above results document into a performance regression test: https://github.com/neondatabase/neon/issues/7146 --- libs/pageserver_api/src/models.rs | 39 ++- libs/utils/src/history_buffer.rs | 39 ++- pageserver/client/src/mgmt_api.rs | 77 ++++- .../src/cmd/ondemand_download_churn.rs | 272 ++++++++++++++++++ pageserver/pagebench/src/main.rs | 3 + .../tenant/remote_timeline_client/download.rs | 178 ++++++++---- pageserver/src/tenant/storage_layer.rs | 5 +- pageserver/src/virtual_file.rs | 17 ++ pageserver/src/virtual_file/io_engine.rs | 75 ++--- .../util/size_tracking_writer.rs | 34 +++ .../virtual_file/owned_buffers_io/write.rs | 206 +++++++++++++ scripts/ps_ec2_setup_instance_store | 2 +- 12 files changed, 845 insertions(+), 102 deletions(-) create mode 100644 pageserver/pagebench/src/cmd/ondemand_download_churn.rs create mode 100644 pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs create mode 100644 pageserver/src/virtual_file/owned_buffers_io/write.rs diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 3aa84f8903..0d0702e38e 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -4,6 +4,7 @@ pub mod utilization; pub use utilization::PageserverUtilization; use std::{ + borrow::Cow, collections::HashMap, io::{BufRead, Read}, num::{NonZeroU64, NonZeroUsize}, @@ -577,7 +578,7 @@ pub struct TimelineInfo { pub walreceiver_status: String, } -#[derive(Debug, Clone, Serialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] pub struct LayerMapInfo { pub in_memory_layers: Vec, pub historic_layers: Vec, @@ -595,7 +596,7 @@ pub enum LayerAccessKind { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct LayerAccessStatFullDetails { pub when_millis_since_epoch: u64, - pub task_kind: &'static str, + pub task_kind: Cow<'static, str>, pub access_kind: LayerAccessKind, } @@ -654,23 +655,23 @@ impl LayerResidenceEvent { } } -#[derive(Debug, Clone, Serialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] pub struct LayerAccessStats { pub access_count_by_access_kind: HashMap, - pub task_kind_access_flag: Vec<&'static str>, + pub task_kind_access_flag: Vec>, pub first: Option, pub accesses_history: 
HistoryBufferWithDropCounter, pub residence_events_history: HistoryBufferWithDropCounter, } -#[derive(Debug, Clone, Serialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] #[serde(tag = "kind")] pub enum InMemoryLayerInfo { Open { lsn_start: Lsn }, Frozen { lsn_start: Lsn, lsn_end: Lsn }, } -#[derive(Debug, Clone, Serialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] #[serde(tag = "kind")] pub enum HistoricLayerInfo { Delta { @@ -692,6 +693,32 @@ pub enum HistoricLayerInfo { }, } +impl HistoricLayerInfo { + pub fn layer_file_name(&self) -> &str { + match self { + HistoricLayerInfo::Delta { + layer_file_name, .. + } => layer_file_name, + HistoricLayerInfo::Image { + layer_file_name, .. + } => layer_file_name, + } + } + pub fn is_remote(&self) -> bool { + match self { + HistoricLayerInfo::Delta { remote, .. } => *remote, + HistoricLayerInfo::Image { remote, .. } => *remote, + } + } + pub fn set_remote(&mut self, value: bool) { + let field = match self { + HistoricLayerInfo::Delta { remote, .. } => remote, + HistoricLayerInfo::Image { remote, .. } => remote, + }; + *field = value; + } +} + #[derive(Debug, Serialize, Deserialize)] pub struct DownloadRemoteLayersTaskSpawnRequest { pub max_concurrent_downloads: NonZeroUsize, diff --git a/libs/utils/src/history_buffer.rs b/libs/utils/src/history_buffer.rs index 1f07f5560f..bd35e2bad6 100644 --- a/libs/utils/src/history_buffer.rs +++ b/libs/utils/src/history_buffer.rs @@ -47,9 +47,10 @@ impl ops::Deref for HistoryBufferWithDropCounter { } } -#[derive(serde::Serialize)] +#[derive(serde::Serialize, serde::Deserialize)] struct SerdeRepr { buffer: Vec, + buffer_size: usize, drop_count: u64, } @@ -61,6 +62,7 @@ where let HistoryBufferWithDropCounter { buffer, drop_count } = value; SerdeRepr { buffer: buffer.iter().cloned().collect(), + buffer_size: L, drop_count: *drop_count, } } @@ -78,19 +80,52 @@ where } } +impl<'de, T, const L: usize> serde::de::Deserialize<'de> for HistoryBufferWithDropCounter +where + T: Clone + serde::Deserialize<'de>, +{ + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + let SerdeRepr { + buffer: des_buffer, + drop_count, + buffer_size, + } = SerdeRepr::::deserialize(deserializer)?; + if buffer_size != L { + use serde::de::Error; + return Err(D::Error::custom(format!( + "invalid buffer_size, expecting {L} got {buffer_size}" + ))); + } + let mut buffer = HistoryBuffer::new(); + buffer.extend(des_buffer); + Ok(HistoryBufferWithDropCounter { buffer, drop_count }) + } +} + #[cfg(test)] mod test { use super::HistoryBufferWithDropCounter; #[test] fn test_basics() { - let mut b = HistoryBufferWithDropCounter::<_, 2>::default(); + let mut b = HistoryBufferWithDropCounter::::default(); b.write(1); b.write(2); b.write(3); assert!(b.iter().any(|e| *e == 2)); assert!(b.iter().any(|e| *e == 3)); assert!(!b.iter().any(|e| *e == 1)); + + // round-trip serde + let round_tripped: HistoryBufferWithDropCounter = + serde_json::from_str(&serde_json::to_string(&b).unwrap()).unwrap(); + assert_eq!( + round_tripped.iter().cloned().collect::>(), + b.iter().cloned().collect::>() + ); } #[test] diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index ed9f633253..1a8f7e0524 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -169,7 +169,7 @@ impl Client { self.request(Method::GET, uri, ()).await } - async fn request( + async fn request_noerror( &self, method: Method, uri: U, @@ -181,7 +181,16 @@ impl Client { } else { req }; - let res = 
req.json(&body).send().await.map_err(Error::ReceiveBody)?; + req.json(&body).send().await.map_err(Error::ReceiveBody) + } + + async fn request( + &self, + method: Method, + uri: U, + body: B, + ) -> Result { + let res = self.request_noerror(method, uri, body).await?; let response = res.error_from_body().await?; Ok(response) } @@ -425,4 +434,68 @@ impl Client { .await .map_err(Error::ReceiveBody) } + + pub async fn layer_map_info( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}/layer", + self.mgmt_api_endpoint, tenant_shard_id, timeline_id, + ); + self.get(&uri) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + + pub async fn layer_evict( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + layer_file_name: &str, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}/layer/{}", + self.mgmt_api_endpoint, tenant_shard_id, timeline_id, layer_file_name + ); + let resp = self.request_noerror(Method::DELETE, &uri, ()).await?; + match resp.status() { + StatusCode::OK => Ok(true), + StatusCode::NOT_MODIFIED => Ok(false), + // TODO: dedupe this pattern / introduce separate error variant? + status => Err(match resp.json::().await { + Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg), + Err(_) => { + Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri)) + } + }), + } + } + + pub async fn layer_ondemand_download( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + layer_file_name: &str, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}/layer/{}", + self.mgmt_api_endpoint, tenant_shard_id, timeline_id, layer_file_name + ); + let resp = self.request_noerror(Method::GET, &uri, ()).await?; + match resp.status() { + StatusCode::OK => Ok(true), + StatusCode::NOT_MODIFIED => Ok(false), + // TODO: dedupe this pattern / introduce separate error variant? + status => Err(match resp.json::().await { + Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg), + Err(_) => { + Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri)) + } + }), + } + } } diff --git a/pageserver/pagebench/src/cmd/ondemand_download_churn.rs b/pageserver/pagebench/src/cmd/ondemand_download_churn.rs new file mode 100644 index 0000000000..197e782dca --- /dev/null +++ b/pageserver/pagebench/src/cmd/ondemand_download_churn.rs @@ -0,0 +1,272 @@ +use pageserver_api::{models::HistoricLayerInfo, shard::TenantShardId}; + +use pageserver_client::mgmt_api; +use rand::seq::SliceRandom; +use tracing::{debug, info}; +use utils::id::{TenantTimelineId, TimelineId}; + +use tokio::{ + sync::{mpsc, OwnedSemaphorePermit}, + task::JoinSet, +}; + +use std::{ + num::NonZeroUsize, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, + }, + time::{Duration, Instant}, +}; + +/// Evict & on-demand download random layers. +#[derive(clap::Parser)] +pub(crate) struct Args { + #[clap(long, default_value = "http://localhost:9898")] + mgmt_api_endpoint: String, + #[clap(long)] + pageserver_jwt: Option, + #[clap(long)] + runtime: Option, + #[clap(long, default_value = "1")] + tasks_per_target: NonZeroUsize, + #[clap(long, default_value = "1")] + concurrency_per_target: NonZeroUsize, + /// Probability for sending `latest=true` in the request (uniform distribution). 
+ #[clap(long)] + limit_to_first_n_targets: Option, + /// Before starting the benchmark, live-reconfigure the pageserver to use the given + /// [`pageserver_api::models::virtual_file::IoEngineKind`]. + #[clap(long)] + set_io_engine: Option, + targets: Option>, +} + +pub(crate) fn main(args: Args) -> anyhow::Result<()> { + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build()?; + let task = rt.spawn(main_impl(args)); + rt.block_on(task).unwrap().unwrap(); + Ok(()) +} + +#[derive(Debug, Default)] +struct LiveStats { + evictions: AtomicU64, + downloads: AtomicU64, + timeline_restarts: AtomicU64, +} + +impl LiveStats { + fn eviction_done(&self) { + self.evictions.fetch_add(1, Ordering::Relaxed); + } + fn download_done(&self) { + self.downloads.fetch_add(1, Ordering::Relaxed); + } + fn timeline_restart_done(&self) { + self.timeline_restarts.fetch_add(1, Ordering::Relaxed); + } +} + +async fn main_impl(args: Args) -> anyhow::Result<()> { + let args: &'static Args = Box::leak(Box::new(args)); + + let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( + args.mgmt_api_endpoint.clone(), + args.pageserver_jwt.as_deref(), + )); + + if let Some(engine_str) = &args.set_io_engine { + mgmt_api_client.put_io_engine(engine_str).await?; + } + + // discover targets + let timelines: Vec = crate::util::cli::targets::discover( + &mgmt_api_client, + crate::util::cli::targets::Spec { + limit_to_first_n_targets: args.limit_to_first_n_targets, + targets: args.targets.clone(), + }, + ) + .await?; + + let mut tasks = JoinSet::new(); + + let live_stats = Arc::new(LiveStats::default()); + tasks.spawn({ + let live_stats = Arc::clone(&live_stats); + async move { + let mut last_at = Instant::now(); + loop { + tokio::time::sleep_until((last_at + Duration::from_secs(1)).into()).await; + let now = Instant::now(); + let delta: Duration = now - last_at; + last_at = now; + + let LiveStats { + evictions, + downloads, + timeline_restarts, + } = &*live_stats; + let evictions = evictions.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64(); + let downloads = downloads.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64(); + let timeline_restarts = timeline_restarts.swap(0, Ordering::Relaxed); + info!("evictions={evictions:.2}/s downloads={downloads:.2}/s timeline_restarts={timeline_restarts}"); + } + } + }); + + for tl in timelines { + for _ in 0..args.tasks_per_target.get() { + tasks.spawn(timeline_actor( + args, + Arc::clone(&mgmt_api_client), + tl, + Arc::clone(&live_stats), + )); + } + } + + while let Some(res) = tasks.join_next().await { + res.unwrap(); + } + Ok(()) +} + +async fn timeline_actor( + args: &'static Args, + mgmt_api_client: Arc, + timeline: TenantTimelineId, + live_stats: Arc, +) { + // TODO: support sharding + let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id); + + struct Timeline { + joinset: JoinSet<()>, + layers: Vec>, + concurrency: Arc, + } + loop { + debug!("restarting timeline"); + let layer_map_info = mgmt_api_client + .layer_map_info(tenant_shard_id, timeline.timeline_id) + .await + .unwrap(); + let concurrency = Arc::new(tokio::sync::Semaphore::new( + args.concurrency_per_target.get(), + )); + + let mut joinset = JoinSet::new(); + let layers = layer_map_info + .historic_layers + .into_iter() + .map(|historic_layer| { + let (tx, rx) = mpsc::channel(1); + joinset.spawn(layer_actor( + tenant_shard_id, + timeline.timeline_id, + historic_layer, + rx, + Arc::clone(&mgmt_api_client), + Arc::clone(&live_stats), + )); + tx + }) + .collect::>(); + + 
let mut timeline = Timeline { + joinset, + layers, + concurrency, + }; + + live_stats.timeline_restart_done(); + + loop { + assert!(!timeline.joinset.is_empty()); + if let Some(res) = timeline.joinset.try_join_next() { + debug!(?res, "a layer actor exited, should not happen"); + timeline.joinset.shutdown().await; + break; + } + + let mut permit = Some( + Arc::clone(&timeline.concurrency) + .acquire_owned() + .await + .unwrap(), + ); + + loop { + let layer_tx = { + let mut rng = rand::thread_rng(); + timeline.layers.choose_mut(&mut rng).expect("no layers") + }; + match layer_tx.try_send(permit.take().unwrap()) { + Ok(_) => break, + Err(e) => match e { + mpsc::error::TrySendError::Full(back) => { + // TODO: retrying introduces bias away from slow downloaders + permit.replace(back); + } + mpsc::error::TrySendError::Closed(_) => panic!(), + }, + } + } + } + } +} + +async fn layer_actor( + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + mut layer: HistoricLayerInfo, + mut rx: mpsc::Receiver, + mgmt_api_client: Arc, + live_stats: Arc, +) { + #[derive(Clone, Copy)] + enum Action { + Evict, + OnDemandDownload, + } + + while let Some(_permit) = rx.recv().await { + let action = if layer.is_remote() { + Action::OnDemandDownload + } else { + Action::Evict + }; + + let did_it = match action { + Action::Evict => { + let did_it = mgmt_api_client + .layer_evict(tenant_shard_id, timeline_id, layer.layer_file_name()) + .await + .unwrap(); + live_stats.eviction_done(); + did_it + } + Action::OnDemandDownload => { + let did_it = mgmt_api_client + .layer_ondemand_download(tenant_shard_id, timeline_id, layer.layer_file_name()) + .await + .unwrap(); + live_stats.download_done(); + did_it + } + }; + if !did_it { + debug!("local copy of layer map appears out of sync, re-downloading"); + return; + } + debug!("did it"); + layer.set_remote(match action { + Action::Evict => true, + Action::OnDemandDownload => false, + }); + } +} diff --git a/pageserver/pagebench/src/main.rs b/pageserver/pagebench/src/main.rs index 5d688ed2d1..743102d853 100644 --- a/pageserver/pagebench/src/main.rs +++ b/pageserver/pagebench/src/main.rs @@ -16,6 +16,7 @@ mod util { mod cmd { pub(super) mod basebackup; pub(super) mod getpage_latest_lsn; + pub(super) mod ondemand_download_churn; pub(super) mod trigger_initial_size_calculation; } @@ -25,6 +26,7 @@ enum Args { Basebackup(cmd::basebackup::Args), GetPageLatestLsn(cmd::getpage_latest_lsn::Args), TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args), + OndemandDownloadChurn(cmd::ondemand_download_churn::Args), } fn main() { @@ -43,6 +45,7 @@ fn main() { Args::TriggerInitialSizeCalculation(args) => { cmd::trigger_initial_size_calculation::main(args) } + Args::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args), } .unwrap() } diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 6fff6e78e2..6ee8ad7155 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -23,7 +23,7 @@ use crate::tenant::storage_layer::LayerFileName; use crate::tenant::Generation; use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}; use crate::TEMP_FILE_SUFFIX; -use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode}; +use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode, RemotePath}; use utils::crashsafe::path_with_suffix_extension; use utils::id::TimelineId; @@ 
-73,55 +73,13 @@ pub async fn download_layer_file<'a>( // If pageserver crashes the temp file will be deleted on startup and re-downloaded. let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION); - let (mut destination_file, bytes_amount) = download_retry( - || async { - let destination_file = tokio::fs::File::create(&temp_file_path) - .await - .with_context(|| format!("create a destination file for layer '{temp_file_path}'")) - .map_err(DownloadError::Other)?; - - let download = storage.download(&remote_path, cancel).await?; - - let mut destination_file = - tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file); - - let mut reader = tokio_util::io::StreamReader::new(download.download_stream); - - let bytes_amount = tokio::io::copy_buf(&mut reader, &mut destination_file).await; - - match bytes_amount { - Ok(bytes_amount) => { - let destination_file = destination_file.into_inner(); - Ok((destination_file, bytes_amount)) - } - Err(e) => { - if let Err(e) = tokio::fs::remove_file(&temp_file_path).await { - on_fatal_io_error(&e, &format!("Removing temporary file {temp_file_path}")); - } - - Err(e.into()) - } - } - }, + let bytes_amount = download_retry( + || async { download_object(storage, &remote_path, &temp_file_path, cancel).await }, &format!("download {remote_path:?}"), cancel, ) .await?; - // Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that: - // A file will not be closed immediately when it goes out of scope if there are any IO operations - // that have not yet completed. To ensure that a file is closed immediately when it is dropped, - // you should call flush before dropping it. - // - // From the tokio code I see that it waits for pending operations to complete. There shouldt be any because - // we assume that `destination_file` file is fully written. I e there is no pending .write(...).await operations. - // But for additional safety lets check/wait for any pending operations. - destination_file - .flush() - .await - .with_context(|| format!("flush source file at {temp_file_path}")) - .map_err(DownloadError::Other)?; - let expected = layer_metadata.file_size(); if expected != bytes_amount { return Err(DownloadError::Other(anyhow!( @@ -129,14 +87,6 @@ pub async fn download_layer_file<'a>( ))); } - // not using sync_data because it can lose file size update - destination_file - .sync_all() - .await - .with_context(|| format!("failed to fsync source file at {temp_file_path}")) - .map_err(DownloadError::Other)?; - drop(destination_file); - fail::fail_point!("remote-storage-download-pre-rename", |_| { Err(DownloadError::Other(anyhow!( "remote-storage-download-pre-rename failpoint triggered" @@ -169,6 +119,128 @@ pub async fn download_layer_file<'a>( Ok(bytes_amount) } +/// Download the object `src_path` in the remote `storage` to local path `dst_path`. +/// +/// If Ok() is returned, the download succeeded and the inode & data have been made durable. +/// (Note that the directory entry for the inode is not made durable.) +/// The file size in bytes is returned. +/// +/// If Err() is returned, there was some error. The file at `dst_path` has been unlinked. +/// The unlinking has _not_ been made durable. 
+async fn download_object<'a>( + storage: &'a GenericRemoteStorage, + src_path: &RemotePath, + dst_path: &Utf8PathBuf, + cancel: &CancellationToken, +) -> Result { + let res = match crate::virtual_file::io_engine::get() { + crate::virtual_file::io_engine::IoEngine::NotSet => panic!("unset"), + crate::virtual_file::io_engine::IoEngine::StdFs => { + async { + let destination_file = tokio::fs::File::create(dst_path) + .await + .with_context(|| format!("create a destination file for layer '{dst_path}'")) + .map_err(DownloadError::Other)?; + + let download = storage.download(src_path, cancel).await?; + + let mut buf_writer = + tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file); + + let mut reader = tokio_util::io::StreamReader::new(download.download_stream); + + let bytes_amount = tokio::io::copy_buf(&mut reader, &mut buf_writer).await?; + buf_writer.flush().await?; + + let mut destination_file = buf_writer.into_inner(); + + // Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that: + // A file will not be closed immediately when it goes out of scope if there are any IO operations + // that have not yet completed. To ensure that a file is closed immediately when it is dropped, + // you should call flush before dropping it. + // + // From the tokio code I see that it waits for pending operations to complete. There shouldt be any because + // we assume that `destination_file` file is fully written. I e there is no pending .write(...).await operations. + // But for additional safety lets check/wait for any pending operations. + destination_file + .flush() + .await + .with_context(|| format!("flush source file at {dst_path}")) + .map_err(DownloadError::Other)?; + + // not using sync_data because it can lose file size update + destination_file + .sync_all() + .await + .with_context(|| format!("failed to fsync source file at {dst_path}")) + .map_err(DownloadError::Other)?; + + Ok(bytes_amount) + } + .await + } + #[cfg(target_os = "linux")] + crate::virtual_file::io_engine::IoEngine::TokioEpollUring => { + use crate::virtual_file::owned_buffers_io::{self, util::size_tracking_writer}; + async { + let destination_file = VirtualFile::create(dst_path) + .await + .with_context(|| format!("create a destination file for layer '{dst_path}'")) + .map_err(DownloadError::Other)?; + + let mut download = storage.download(src_path, cancel).await?; + + // TODO: use vectored write (writev) once supported by tokio-epoll-uring. + // There's chunks_vectored() on the stream. 
+ let (bytes_amount, destination_file) = async { + let size_tracking = size_tracking_writer::Writer::new(destination_file); + let mut buffered = owned_buffers_io::write::BufferedWriter::< + { super::BUFFER_SIZE }, + _, + >::new(size_tracking); + while let Some(res) = + futures::StreamExt::next(&mut download.download_stream).await + { + let chunk = match res { + Ok(chunk) => chunk, + Err(e) => return Err(e), + }; + buffered + .write_buffered(tokio_epoll_uring::BoundedBuf::slice_full(chunk)) + .await?; + } + let size_tracking = buffered.flush_and_into_inner().await?; + Ok(size_tracking.into_inner()) + } + .await?; + + // not using sync_data because it can lose file size update + destination_file + .sync_all() + .await + .with_context(|| format!("failed to fsync source file at {dst_path}")) + .map_err(DownloadError::Other)?; + + Ok(bytes_amount) + } + .await + } + }; + + // in case the download failed, clean up + match res { + Ok(bytes_amount) => Ok(bytes_amount), + Err(e) => { + if let Err(e) = tokio::fs::remove_file(dst_path).await { + if e.kind() != std::io::ErrorKind::NotFound { + on_fatal_io_error(&e, &format!("Removing temporary file {dst_path}")); + } + } + Err(e) + } + } +} + const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download"; pub(crate) fn is_temp_download_file(path: &Utf8Path) -> bool { diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 299950cc21..5c3bab9868 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -20,6 +20,7 @@ use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; use pageserver_api::models::{ LayerAccessKind, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus, }; +use std::borrow::Cow; use std::cmp::{Ordering, Reverse}; use std::collections::hash_map::Entry; use std::collections::{BinaryHeap, HashMap}; @@ -427,7 +428,7 @@ impl LayerAccessStatFullDetails { } = self; pageserver_api::models::LayerAccessStatFullDetails { when_millis_since_epoch: system_time_to_millis_since_epoch(when), - task_kind: task_kind.into(), // into static str, powered by strum_macros + task_kind: Cow::Borrowed(task_kind.into()), // into static str, powered by strum_macros access_kind: *access_kind, } } @@ -525,7 +526,7 @@ impl LayerAccessStats { .collect(), task_kind_access_flag: task_kind_flag .iter() - .map(|task_kind| task_kind.into()) // into static str, powered by strum_macros + .map(|task_kind| Cow::Borrowed(task_kind.into())) // into static str, powered by strum_macros .collect(), first: first_access.as_ref().map(|a| a.as_api_model()), accesses_history: last_accesses.map(|m| m.as_api_model()), diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index ae44e9edc4..dee36d8afd 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -36,6 +36,23 @@ pub(crate) use io_engine::IoEngineKind; pub(crate) use metadata::Metadata; pub(crate) use open_options::*; +#[cfg_attr(not(target_os = "linux"), allow(dead_code))] +pub(crate) mod owned_buffers_io { + //! Abstractions for IO with owned buffers. + //! + //! Not actually tied to [`crate::virtual_file`] specifically, but, it's the primary + //! reason we need this abstraction. + //! + //! Over time, this could move into the `tokio-epoll-uring` crate, maybe `uring-common`, + //! but for the time being we're proving out the primitives in the neon.git repo + //! for faster iteration. 
+ + pub(crate) mod write; + pub(crate) mod util { + pub(crate) mod size_tracking_writer; + } +} + /// /// A virtual file descriptor. You can use this just like std::fs::File, but internally /// the underlying file is closed if the system is low on file descriptors, diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index 7f2342e76e..55fa59e53b 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -282,49 +282,52 @@ impl From for IoEngineKind { /// Panics if we can't set up the feature test. pub fn feature_test() -> anyhow::Result { std::thread::spawn(|| { - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .unwrap(); + #[cfg(not(target_os = "linux"))] { - return Ok(FeatureTestResult::PlatformPreferred( - FeatureTestResult::PLATFORM_DEFAULT, - )); + Ok(FeatureTestResult::PlatformPreferred( + FeatureTestResult::PLATFORM_PREFERRED, + )) } #[cfg(target_os = "linux")] - Ok(match rt.block_on(tokio_epoll_uring::System::launch()) { - Ok(_) => FeatureTestResult::PlatformPreferred({ - assert!(matches!( - IoEngineKind::TokioEpollUring, + { + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + Ok(match rt.block_on(tokio_epoll_uring::System::launch()) { + Ok(_) => FeatureTestResult::PlatformPreferred({ + assert!(matches!( + IoEngineKind::TokioEpollUring, + FeatureTestResult::PLATFORM_PREFERRED + )); FeatureTestResult::PLATFORM_PREFERRED - )); - FeatureTestResult::PLATFORM_PREFERRED - }), - Err(tokio_epoll_uring::LaunchResult::IoUringBuild(e)) => { - let remark = match e.raw_os_error() { - Some(nix::libc::EPERM) => { - // fall back - "creating tokio-epoll-uring fails with EPERM, assuming it's admin-disabled " - .to_string() + }), + Err(tokio_epoll_uring::LaunchResult::IoUringBuild(e)) => { + let remark = match e.raw_os_error() { + Some(nix::libc::EPERM) => { + // fall back + "creating tokio-epoll-uring fails with EPERM, assuming it's admin-disabled " + .to_string() + } + Some(nix::libc::EFAULT) => { + // fail feature test + anyhow::bail!( + "creating tokio-epoll-uring fails with EFAULT, might have corrupted memory" + ); + } + Some(_) | None => { + // fall back + format!("creating tokio-epoll-uring fails with error: {e:#}") + } + }; + FeatureTestResult::Worse { + engine: IoEngineKind::StdFs, + remark, } - Some(nix::libc::EFAULT) => { - // fail feature test - anyhow::bail!( - "creating tokio-epoll-uring fails with EFAULT, might have corrupted memory" - ); - } - Some(_) | None => { - // fall back - format!("creating tokio-epoll-uring fails with error: {e:#}") - } - }; - FeatureTestResult::Worse { - engine: IoEngineKind::StdFs, - remark, } - } - }) + }) + } }) .join() .unwrap() diff --git a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs new file mode 100644 index 0000000000..7505b7487e --- /dev/null +++ b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs @@ -0,0 +1,34 @@ +use crate::virtual_file::{owned_buffers_io::write::OwnedAsyncWriter, VirtualFile}; +use tokio_epoll_uring::{BoundedBuf, IoBuf}; + +pub struct Writer { + dst: VirtualFile, + bytes_amount: u64, +} + +impl Writer { + pub fn new(dst: VirtualFile) -> Self { + Self { + dst, + bytes_amount: 0, + } + } + /// Returns the wrapped `VirtualFile` object as well as the number + /// of bytes that were written to it through this object. 
+    pub fn into_inner(self) -> (u64, VirtualFile) {
+        (self.bytes_amount, self.dst)
+    }
+}
+
+impl OwnedAsyncWriter for Writer {
+    #[inline(always)]
+    async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
+        &mut self,
+        buf: B,
+    ) -> std::io::Result<(usize, B::Buf)> {
+        let (buf, res) = self.dst.write_all(buf).await;
+        let nwritten = res?;
+        self.bytes_amount += u64::try_from(nwritten).unwrap();
+        Ok((nwritten, buf))
+    }
+}
diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs
new file mode 100644
index 0000000000..f1812d9b51
--- /dev/null
+++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs
@@ -0,0 +1,206 @@
+use bytes::BytesMut;
+use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice};
+
+/// A trait for doing owned-buffer write IO.
+/// Think [`tokio::io::AsyncWrite`] but with owned buffers.
+pub trait OwnedAsyncWriter {
+    async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
+        &mut self,
+        buf: B,
+    ) -> std::io::Result<(usize, B::Buf)>;
+}
+
+/// A wrapper around an [`OwnedAsyncWriter`] that batches smaller writes
+/// into `BUFFER_SIZE`-sized writes.
+///
+/// # Passthrough Of Large Writes
+///
+/// Buffered writes larger than the `BUFFER_SIZE` cause the internal
+/// buffer to be flushed, even if it is not full yet. Then, the large
+/// buffered write is passed through to the underlying [`OwnedAsyncWriter`].
+///
+/// This pass-through is generally beneficial for throughput, but if
+/// the storage backend of the [`OwnedAsyncWriter`] is a shared resource,
+/// unlimited large writes may cause latency or fairness issues.
+///
+/// In such cases, a different implementation that always buffers in memory
+/// may be preferable.
+pub struct BufferedWriter<const BUFFER_SIZE: usize, W> {
+    writer: W,
+    // invariant: always remains Some(buf)
+    // with buf.capacity() == BUFFER_SIZE except
+    // - while IO is ongoing => goes back to Some() once the IO completed successfully
+    // - after an IO error => stays `None` forever
+    // In these exceptional cases, it's `None`.
+ buf: Option, +} + +impl BufferedWriter +where + W: OwnedAsyncWriter, +{ + pub fn new(writer: W) -> Self { + Self { + writer, + buf: Some(BytesMut::with_capacity(BUFFER_SIZE)), + } + } + + pub async fn flush_and_into_inner(mut self) -> std::io::Result { + self.flush().await?; + let Self { buf, writer } = self; + assert!(buf.is_some()); + Ok(writer) + } + + pub async fn write_buffered(&mut self, chunk: Slice) -> std::io::Result<()> + where + B: IoBuf + Send, + { + // avoid memcpy for the middle of the chunk + if chunk.len() >= BUFFER_SIZE { + self.flush().await?; + // do a big write, bypassing `buf` + assert_eq!( + self.buf + .as_ref() + .expect("must not use after an error") + .len(), + 0 + ); + let chunk_len = chunk.len(); + let (nwritten, chunk) = self.writer.write_all(chunk).await?; + assert_eq!(nwritten, chunk_len); + drop(chunk); + return Ok(()); + } + // in-memory copy the < BUFFER_SIZED tail of the chunk + assert!(chunk.len() < BUFFER_SIZE); + let mut chunk = &chunk[..]; + while !chunk.is_empty() { + let buf = self.buf.as_mut().expect("must not use after an error"); + let need = BUFFER_SIZE - buf.len(); + let have = chunk.len(); + let n = std::cmp::min(need, have); + buf.extend_from_slice(&chunk[..n]); + chunk = &chunk[n..]; + if buf.len() >= BUFFER_SIZE { + assert_eq!(buf.len(), BUFFER_SIZE); + self.flush().await?; + } + } + assert!(chunk.is_empty(), "by now we should have drained the chunk"); + Ok(()) + } + + async fn flush(&mut self) -> std::io::Result<()> { + let buf = self.buf.take().expect("must not use after an error"); + if buf.is_empty() { + self.buf = Some(buf); + return std::io::Result::Ok(()); + } + let buf_len = buf.len(); + let (nwritten, mut buf) = self.writer.write_all(buf).await?; + assert_eq!(nwritten, buf_len); + buf.clear(); + self.buf = Some(buf); + Ok(()) + } +} + +impl OwnedAsyncWriter for Vec { + async fn write_all, Buf: IoBuf + Send>( + &mut self, + buf: B, + ) -> std::io::Result<(usize, B::Buf)> { + let nbytes = buf.bytes_init(); + if nbytes == 0 { + return Ok((0, Slice::into_inner(buf.slice_full()))); + } + let buf = buf.slice(0..nbytes); + self.extend_from_slice(&buf[..]); + Ok((buf.len(), Slice::into_inner(buf))) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[derive(Default)] + struct RecorderWriter { + writes: Vec>, + } + impl OwnedAsyncWriter for RecorderWriter { + async fn write_all, Buf: IoBuf + Send>( + &mut self, + buf: B, + ) -> std::io::Result<(usize, B::Buf)> { + let nbytes = buf.bytes_init(); + if nbytes == 0 { + self.writes.push(vec![]); + return Ok((0, Slice::into_inner(buf.slice_full()))); + } + let buf = buf.slice(0..nbytes); + self.writes.push(Vec::from(&buf[..])); + Ok((buf.len(), Slice::into_inner(buf))) + } + } + + macro_rules! 
write { + ($writer:ident, $data:literal) => {{ + $writer + .write_buffered(::bytes::Bytes::from_static($data).slice_full()) + .await?; + }}; + } + + #[tokio::test] + async fn test_buffered_writes_only() -> std::io::Result<()> { + let recorder = RecorderWriter::default(); + let mut writer = BufferedWriter::<2, _>::new(recorder); + write!(writer, b"a"); + write!(writer, b"b"); + write!(writer, b"c"); + write!(writer, b"d"); + write!(writer, b"e"); + let recorder = writer.flush_and_into_inner().await?; + assert_eq!( + recorder.writes, + vec![Vec::from(b"ab"), Vec::from(b"cd"), Vec::from(b"e")] + ); + Ok(()) + } + + #[tokio::test] + async fn test_passthrough_writes_only() -> std::io::Result<()> { + let recorder = RecorderWriter::default(); + let mut writer = BufferedWriter::<2, _>::new(recorder); + write!(writer, b"abc"); + write!(writer, b"de"); + write!(writer, b""); + write!(writer, b"fghijk"); + let recorder = writer.flush_and_into_inner().await?; + assert_eq!( + recorder.writes, + vec![Vec::from(b"abc"), Vec::from(b"de"), Vec::from(b"fghijk")] + ); + Ok(()) + } + + #[tokio::test] + async fn test_passthrough_write_with_nonempty_buffer() -> std::io::Result<()> { + let recorder = RecorderWriter::default(); + let mut writer = BufferedWriter::<2, _>::new(recorder); + write!(writer, b"a"); + write!(writer, b"bc"); + write!(writer, b"d"); + write!(writer, b"e"); + let recorder = writer.flush_and_into_inner().await?; + assert_eq!( + recorder.writes, + vec![Vec::from(b"a"), Vec::from(b"bc"), Vec::from(b"de")] + ); + Ok(()) + } +} diff --git a/scripts/ps_ec2_setup_instance_store b/scripts/ps_ec2_setup_instance_store index 4cca3a9857..1f88f252eb 100755 --- a/scripts/ps_ec2_setup_instance_store +++ b/scripts/ps_ec2_setup_instance_store @@ -40,7 +40,7 @@ To run your local neon.git build on the instance store volume, run the following commands from the top of the neon.git checkout # raise file descriptor limit of your shell and its child processes - sudo prlimit -p $$ --nofile=800000:800000 + sudo prlimit -p \$\$ --nofile=800000:800000 # test suite run export TEST_OUTPUT="$TEST_OUTPUT" From 9752ad8489896f9df9f5bf0c7b1d31ae3fbafd9a Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 15 Mar 2024 19:45:58 +0000 Subject: [PATCH 0420/1571] pageserver, controller: improve secondary download APIs for large shards (#7131) ## Problem The existing secondary download API relied on the caller to wait as long as it took to complete -- for large shards that could be a long time, so typical clients that might have a baked-in ~30s timeout would have a problem. ## Summary of changes - Take a `wait_ms` query parameter to instruct the pageserver how long to wait: if the download isn't complete in this duration, then 201 is returned instead of 200. - For both 200 and 201 responses, include response body describing download progress, in terms of layers and bytes. This is sufficient for the caller to track how much data is being transferred and log/present that status. - In storage controller live migrations, use this API to apply a much longer outer timeout, with smaller individual per-request timeouts, and log the progress of the downloads. 
- Add a test that injects layer download delays to exercise the new behavior --- control_plane/attachment_service/src/http.rs | 8 +- .../attachment_service/src/reconciler.rs | 93 +++++-- .../attachment_service/src/service.rs | 86 +++++-- control_plane/src/pageserver.rs | 7 - libs/pageserver_api/src/models.rs | 46 ++++ libs/remote_storage/src/azure_blob.rs | 4 +- libs/remote_storage/src/lib.rs | 5 +- libs/remote_storage/src/local_fs.rs | 5 +- libs/remote_storage/src/s3_bucket.rs | 3 +- pageserver/client/src/mgmt_api.rs | 23 +- pageserver/src/http/openapi_spec.yml | 47 ++++ pageserver/src/http/routes.rs | 41 ++- pageserver/src/tenant/secondary.rs | 15 +- pageserver/src/tenant/secondary/downloader.rs | 237 +++++++++++++++--- pageserver/src/tenant/secondary/heatmap.rs | 22 ++ test_runner/fixtures/neon_fixtures.py | 4 + test_runner/fixtures/pageserver/http.py | 10 +- .../regress/test_pageserver_secondary.py | 101 ++++++++ 18 files changed, 647 insertions(+), 110 deletions(-) diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs index 560a05e908..45ee354822 100644 --- a/control_plane/attachment_service/src/http.rs +++ b/control_plane/attachment_service/src/http.rs @@ -14,7 +14,7 @@ use tokio_util::sync::CancellationToken; use utils::auth::{Scope, SwappableJwtAuth}; use utils::failpoint_support::failpoints_handler; use utils::http::endpoint::{auth_middleware, check_permission_with, request_span}; -use utils::http::request::{must_get_query_param, parse_request_param}; +use utils::http::request::{must_get_query_param, parse_query_param, parse_request_param}; use utils::id::{TenantId, TimelineId}; use utils::{ @@ -248,8 +248,10 @@ async fn handle_tenant_secondary_download( req: Request, ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; - service.tenant_secondary_download(tenant_id).await?; - json_response(StatusCode::OK, ()) + let wait = parse_query_param(&req, "wait_ms")?.map(Duration::from_millis); + + let (status, progress) = service.tenant_secondary_download(tenant_id, wait).await?; + json_response(status, progress) } async fn handle_tenant_delete( diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs index 7f68a65c15..3bf23275bd 100644 --- a/control_plane/attachment_service/src/reconciler.rs +++ b/control_plane/attachment_service/src/reconciler.rs @@ -8,7 +8,7 @@ use pageserver_api::shard::{ShardIdentity, TenantShardId}; use pageserver_client::mgmt_api; use std::collections::HashMap; use std::sync::Arc; -use std::time::Duration; +use std::time::{Duration, Instant}; use tokio_util::sync::CancellationToken; use utils::generation::Generation; use utils::id::{NodeId, TimelineId}; @@ -258,22 +258,81 @@ impl Reconciler { tenant_shard_id: TenantShardId, node: &Node, ) -> Result<(), ReconcileError> { - match node - .with_client_retries( - |client| async move { client.tenant_secondary_download(tenant_shard_id).await }, - &self.service_config.jwt_token, - 1, - 1, - Duration::from_secs(60), - &self.cancel, - ) - .await - { - None => Err(ReconcileError::Cancel), - Some(Ok(_)) => Ok(()), - Some(Err(e)) => { - tracing::info!(" (skipping destination download: {})", e); - Ok(()) + // This is not the timeout for a request, but the total amount of time we're willing to wait + // for a secondary location to get up to date before + const TOTAL_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(300); + + // This the long-polling interval for the secondary download 
requests we send to destination pageserver + // during a migration. + const REQUEST_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(20); + + let started_at = Instant::now(); + + loop { + let (status, progress) = match node + .with_client_retries( + |client| async move { + client + .tenant_secondary_download( + tenant_shard_id, + Some(REQUEST_DOWNLOAD_TIMEOUT), + ) + .await + }, + &self.service_config.jwt_token, + 1, + 3, + REQUEST_DOWNLOAD_TIMEOUT * 2, + &self.cancel, + ) + .await + { + None => Err(ReconcileError::Cancel), + Some(Ok(v)) => Ok(v), + Some(Err(e)) => { + // Give up, but proceed: it's unfortunate if we couldn't freshen the destination before + // attaching, but we should not let an issue with a secondary location stop us proceeding + // with a live migration. + tracing::warn!("Failed to prepare by downloading layers on node {node}: {e})"); + return Ok(()); + } + }?; + + if status == StatusCode::OK { + tracing::info!( + "Downloads to {} complete: {}/{} layers, {}/{} bytes", + node, + progress.layers_downloaded, + progress.layers_total, + progress.bytes_downloaded, + progress.bytes_total + ); + return Ok(()); + } else if status == StatusCode::ACCEPTED { + let total_runtime = started_at.elapsed(); + if total_runtime > TOTAL_DOWNLOAD_TIMEOUT { + tracing::warn!("Timed out after {}ms downloading layers to {node}. Progress so far: {}/{} layers, {}/{} bytes", + total_runtime.as_millis(), + progress.layers_downloaded, + progress.layers_total, + progress.bytes_downloaded, + progress.bytes_total + ); + // Give up, but proceed: an incompletely warmed destination doesn't prevent migration working, + // it just makes the I/O performance for users less good. + return Ok(()); + } + + // Log and proceed around the loop to retry. We don't sleep between requests, because our HTTP call + // to the pageserver is a long-poll. 
+ tracing::info!( + "Downloads to {} not yet complete: {}/{} layers, {}/{} bytes", + node, + progress.layers_downloaded, + progress.layers_total, + progress.bytes_downloaded, + progress.bytes_total + ); } } } diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index 8439ea5567..29f87021b2 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -16,7 +16,15 @@ use diesel::result::DatabaseErrorKind; use futures::{stream::FuturesUnordered, StreamExt}; use hyper::StatusCode; use pageserver_api::{ - controller_api::UtilizationScore, + controller_api::{ + NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy, + TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse, + TenantShardMigrateRequest, TenantShardMigrateResponse, UtilizationScore, + }, + models::{SecondaryProgress, TenantConfigRequest}, +}; + +use pageserver_api::{ models::{ self, LocationConfig, LocationConfigListResponse, LocationConfigMode, PageserverUtilization, ShardParameters, TenantConfig, TenantCreateRequest, @@ -30,14 +38,6 @@ use pageserver_api::{ ValidateResponse, ValidateResponseTenant, }, }; -use pageserver_api::{ - controller_api::{ - NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy, - TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse, - TenantShardMigrateRequest, TenantShardMigrateResponse, - }, - models::TenantConfigRequest, -}; use pageserver_client::mgmt_api; use tokio::sync::OwnedRwLockWriteGuard; use tokio_util::sync::CancellationToken; @@ -2084,7 +2084,8 @@ impl Service { pub(crate) async fn tenant_secondary_download( &self, tenant_id: TenantId, - ) -> Result<(), ApiError> { + wait: Option, + ) -> Result<(StatusCode, SecondaryProgress), ApiError> { let _tenant_lock = self.tenant_op_locks.shared(tenant_id).await; // Acquire lock and yield the collection of shard-node tuples which we will send requests onward to @@ -2107,32 +2108,71 @@ impl Service { targets }; - // TODO: this API, and the underlying pageserver API, should take a timeout argument so that for long running - // downloads, they can return a clean 202 response instead of the HTTP client timing out. - // Issue concurrent requests to all shards' locations let mut futs = FuturesUnordered::new(); for (tenant_shard_id, node) in targets { let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref()); futs.push(async move { - let result = client.tenant_secondary_download(tenant_shard_id).await; - (result, node) + let result = client + .tenant_secondary_download(tenant_shard_id, wait) + .await; + (result, node, tenant_shard_id) }) } // Handle any errors returned by pageservers. This includes cases like this request racing with // a scheduling operation, such that the tenant shard we're calling doesn't exist on that pageserver any more, as // well as more general cases like 503s, 500s, or timeouts. - while let Some((result, node)) = futs.next().await { - let Err(e) = result else { continue }; - - // Secondary downloads are always advisory: if something fails, we nevertheless report success, so that whoever - // is calling us will proceed with whatever migration they're doing, albeit with a slightly less warm cache - // than they had hoped for. 
- tracing::warn!("Ignoring tenant secondary download error from pageserver {node}: {e}",); + let mut aggregate_progress = SecondaryProgress::default(); + let mut aggregate_status: Option = None; + let mut error: Option = None; + while let Some((result, node, tenant_shard_id)) = futs.next().await { + match result { + Err(e) => { + // Secondary downloads are always advisory: if something fails, we nevertheless report success, so that whoever + // is calling us will proceed with whatever migration they're doing, albeit with a slightly less warm cache + // than they had hoped for. + tracing::warn!("Secondary download error from pageserver {node}: {e}",); + error = Some(e) + } + Ok((status_code, progress)) => { + tracing::info!(%tenant_shard_id, "Shard status={status_code} progress: {progress:?}"); + aggregate_progress.layers_downloaded += progress.layers_downloaded; + aggregate_progress.layers_total += progress.layers_total; + aggregate_progress.bytes_downloaded += progress.bytes_downloaded; + aggregate_progress.bytes_total += progress.bytes_total; + aggregate_progress.heatmap_mtime = + std::cmp::max(aggregate_progress.heatmap_mtime, progress.heatmap_mtime); + aggregate_status = match aggregate_status { + None => Some(status_code), + Some(StatusCode::OK) => Some(status_code), + Some(cur) => { + // Other status codes (e.g. 202) -- do not overwrite. + Some(cur) + } + }; + } + } } - Ok(()) + // If any of the shards return 202, indicate our result as 202. + match aggregate_status { + None => { + match error { + Some(e) => { + // No successes, and an error: surface it + Err(ApiError::Conflict(format!("Error from pageserver: {e}"))) + } + None => { + // No shards found + Err(ApiError::NotFound( + anyhow::anyhow!("Tenant {} not found", tenant_id).into(), + )) + } + } + } + Some(aggregate_status) => Ok((aggregate_status, aggregate_progress)), + } } pub(crate) async fn tenant_delete(&self, tenant_id: TenantId) -> Result { diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index ab2f80fb0c..2603515681 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -568,13 +568,6 @@ impl PageServerNode { Ok(self.http_client.list_timelines(*tenant_shard_id).await?) } - pub async fn tenant_secondary_download(&self, tenant_id: &TenantShardId) -> anyhow::Result<()> { - Ok(self - .http_client - .tenant_secondary_download(*tenant_id) - .await?) - } - pub async fn timeline_create( &self, tenant_shard_id: TenantShardId, diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 0d0702e38e..aad4cc97fc 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -751,6 +751,52 @@ pub struct WalRedoManagerStatus { pub pid: Option, } +/// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating +/// a download job, timing out while waiting for it to run, and then inspecting this status to understand +/// what's happening. +#[derive(Default, Debug, Serialize, Deserialize, Clone)] +pub struct SecondaryProgress { + /// The remote storage LastModified time of the heatmap object we last downloaded. 
+ #[serde( + serialize_with = "opt_ser_rfc3339_millis", + deserialize_with = "opt_deser_rfc3339_millis" + )] + pub heatmap_mtime: Option, + + /// The number of layers currently on-disk + pub layers_downloaded: usize, + /// The number of layers in the most recently seen heatmap + pub layers_total: usize, + + /// The number of layer bytes currently on-disk + pub bytes_downloaded: u64, + /// The number of layer bytes in the most recently seen heatmap + pub bytes_total: u64, +} + +fn opt_ser_rfc3339_millis( + ts: &Option, + serializer: S, +) -> Result { + match ts { + Some(ts) => serializer.collect_str(&humantime::format_rfc3339_millis(*ts)), + None => serializer.serialize_none(), + } +} + +fn opt_deser_rfc3339_millis<'de, D>(deserializer: D) -> Result, D::Error> +where + D: serde::de::Deserializer<'de>, +{ + let s: Option = serde::de::Deserialize::deserialize(deserializer)?; + match s { + None => Ok(None), + Some(s) => humantime::parse_rfc3339(&s) + .map_err(serde::de::Error::custom) + .map(Some), + } +} + pub mod virtual_file { #[derive( Copy, diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 1e337bc1e8..5fff3e25c9 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -157,9 +157,8 @@ impl AzureBlobStorage { let mut bufs = Vec::new(); while let Some(part) = response.next().await { let part = part?; - let etag_str: &str = part.blob.properties.etag.as_ref(); if etag.is_none() { - etag = Some(etag.unwrap_or_else(|| etag_str.to_owned())); + etag = Some(part.blob.properties.etag); } if last_modified.is_none() { last_modified = Some(part.blob.properties.last_modified.into()); @@ -180,6 +179,7 @@ impl AzureBlobStorage { "Azure GET response contained no buffers" ))); } + // unwrap safety: if these were None, bufs would be empty and we would have returned an error already let etag = etag.unwrap(); let last_modified = last_modified.unwrap(); diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index fd832eb94f..ab2035f19a 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -42,6 +42,9 @@ pub use self::{ }; use s3_bucket::RequestKind; +/// Azure SDK's ETag type is a simple String wrapper: we use this internally instead of repeating it here. +pub use azure_core::Etag; + pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel}; /// Currently, sync happens with AWS S3, that has two limits on requests per second: @@ -293,7 +296,7 @@ pub struct Download { /// The last time the file was modified (`last-modified` HTTP header) pub last_modified: SystemTime, /// A way to identify this specific version of the resource (`etag` HTTP header) - pub etag: String, + pub etag: Etag, /// Extra key-value data, associated with the current remote file. 
pub metadata: Option, } diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index ea0756541b..313d8226b1 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -30,6 +30,7 @@ use crate::{ }; use super::{RemoteStorage, StorageMetadata}; +use crate::Etag; const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp"; @@ -626,9 +627,9 @@ async fn file_metadata(file_path: &Utf8Path) -> Result String { +fn mock_etag(meta: &std::fs::Metadata) -> Etag { let mtime = meta.modified().expect("Filesystem mtime missing"); - format!("{}", mtime.duration_since(UNIX_EPOCH).unwrap().as_millis()) + format!("{}", mtime.duration_since(UNIX_EPOCH).unwrap().as_millis()).into() } #[cfg(test)] diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 56bc32ebdd..1cb85cfb1b 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -289,7 +289,8 @@ impl S3Bucket { let metadata = object_output.metadata().cloned().map(StorageMetadata); let etag = object_output .e_tag - .ok_or(DownloadError::Other(anyhow::anyhow!("Missing ETag header")))?; + .ok_or(DownloadError::Other(anyhow::anyhow!("Missing ETag header")))? + .into(); let last_modified = object_output .last_modified .ok_or(DownloadError::Other(anyhow::anyhow!( diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 1a8f7e0524..ab55d2b0a3 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -249,13 +249,26 @@ impl Client { Ok(()) } - pub async fn tenant_secondary_download(&self, tenant_id: TenantShardId) -> Result<()> { - let uri = format!( + pub async fn tenant_secondary_download( + &self, + tenant_id: TenantShardId, + wait: Option, + ) -> Result<(StatusCode, SecondaryProgress)> { + let mut path = reqwest::Url::parse(&format!( "{}/v1/tenant/{}/secondary/download", self.mgmt_api_endpoint, tenant_id - ); - self.request(Method::POST, &uri, ()).await?; - Ok(()) + )) + .expect("Cannot build URL"); + + if let Some(wait) = wait { + path.query_pairs_mut() + .append_pair("wait_ms", &format!("{}", wait.as_millis())); + } + + let response = self.request(Method::POST, path, ()).await?; + let status = response.status(); + let progress: SecondaryProgress = response.json().await.map_err(Error::ReceiveBody)?; + Ok((status, progress)) } pub async fn location_config( diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 4823710fb5..0771229845 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -965,12 +965,28 @@ paths: required: true schema: type: string + - name: wait_ms + description: If set, we will wait this long for download to complete, and if it isn't complete then return 202 + in: query + required: false + schema: + type: integer post: description: | If the location is in secondary mode, download latest heatmap and layers responses: "200": description: Success + content: + application/json: + schema: + $ref: "#/components/schemas/SecondaryProgress" + "202": + description: Download has started but not yet finished + content: + application/json: + schema: + $ref: "#/components/schemas/SecondaryProgress" "500": description: Generic operation error content: @@ -1623,6 +1639,37 @@ components: Lower is better score for how good this pageserver would be for the next tenant. The default or maximum value can be returned in situations when a proper score cannot (yet) be calculated. 
+ SecondaryProgress: + type: object + required: + - heatmap_mtime + - layers_downloaded + - layers_total + - bytes_downloaded + - bytes_total + properties: + heatmap_mtime: + type: string + format: date-time + description: Modification time of the most recently downloaded layer heatmap (RFC 3339 format) + layers_downloaded: + type: integer + format: int64 + description: How many layers from the latest layer heatmap are present on disk + bytes_downloaded: + type: integer + format: int64 + description: How many bytes of layer content from the latest layer heatmap are present on disk + layers_total: + type: integer + format: int64 + description: How many layers were in the latest layer heatmap + bytes_total: + type: integer + format: int64 + description: How many bytes of layer content were in the latest layer heatmap + + Error: type: object required: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 7d3ede21ce..6d98d3f746 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1987,13 +1987,42 @@ async fn secondary_download_handler( ) -> Result, ApiError> { let state = get_state(&request); let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; - state - .secondary_controller - .download_tenant(tenant_shard_id) - .await - .map_err(ApiError::InternalServerError)?; + let wait = parse_query_param(&request, "wait_ms")?.map(Duration::from_millis); - json_response(StatusCode::OK, ()) + // We don't need this to issue the download request, but: + // - it enables us to cleanly return 404 if we get a request for an absent shard + // - we will use this to provide status feedback in the response + let Some(secondary_tenant) = state + .tenant_manager + .get_secondary_tenant_shard(tenant_shard_id) + else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Shard {} not found", tenant_shard_id).into(), + )); + }; + + let timeout = wait.unwrap_or(Duration::MAX); + + let status = match tokio::time::timeout( + timeout, + state.secondary_controller.download_tenant(tenant_shard_id), + ) + .await + { + // Download job ran to completion. + Ok(Ok(())) => StatusCode::OK, + // Edge case: downloads aren't usually fallible: things like a missing heatmap are considered + // okay. We could get an error here in the unlikely edge case that the tenant + // was detached between our check above and executing the download job. + Ok(Err(e)) => return Err(ApiError::InternalServerError(e)), + // A timeout is not an error: we have started the download, we're just not done + // yet. The caller will get a response body indicating status. + Err(_) => StatusCode::ACCEPTED, + }; + + let progress = secondary_tenant.progress.lock().unwrap().clone(); + + json_response(status, progress) } async fn handler_404(_: Request) -> Result, ApiError> { diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index 14e88b836e..19f36c722e 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -95,7 +95,11 @@ pub(crate) struct SecondaryTenant { shard_identity: ShardIdentity, tenant_conf: std::sync::Mutex, + // Internal state used by the Downloader. 
detail: std::sync::Mutex, + + // Public state indicating overall progress of downloads relative to the last heatmap seen + pub(crate) progress: std::sync::Mutex, } impl SecondaryTenant { @@ -118,6 +122,8 @@ impl SecondaryTenant { tenant_conf: std::sync::Mutex::new(tenant_conf), detail: std::sync::Mutex::new(SecondaryDetail::new(config.clone())), + + progress: std::sync::Mutex::default(), }) } @@ -247,9 +253,12 @@ impl SecondaryTenant { } /// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads, -/// and heatmap uploads. This is not a hot data path: it's primarily a hook for tests, -/// where we want to immediately upload/download for a particular tenant. In normal operation -/// uploads & downloads are autonomous and not driven by this interface. +/// and heatmap uploads. This is not a hot data path: it's used for: +/// - Live migrations, where we want to ensure a migration destination has the freshest possible +/// content before trying to cut over. +/// - Tests, where we want to immediately upload/download for a particular tenant. +/// +/// In normal operations, outside of migrations, uploads & downloads are autonomous and not driven by this interface. pub struct SecondaryController { upload_req_tx: tokio::sync::mpsc::Sender>, download_req_tx: tokio::sync::mpsc::Sender>, diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index b679077358..a595096133 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -41,14 +41,16 @@ use crate::tenant::{ use camino::Utf8PathBuf; use chrono::format::{DelayedFormat, StrftimeItems}; use futures::Future; +use pageserver_api::models::SecondaryProgress; use pageserver_api::shard::TenantShardId; use rand::Rng; -use remote_storage::{DownloadError, GenericRemoteStorage}; +use remote_storage::{DownloadError, Etag, GenericRemoteStorage}; use tokio_util::sync::CancellationToken; use tracing::{info_span, instrument, warn, Instrument}; use utils::{ - backoff, completion::Barrier, crashsafe::path_with_suffix_extension, fs_ext, id::TimelineId, + backoff, completion::Barrier, crashsafe::path_with_suffix_extension, failpoint_support, fs_ext, + id::TimelineId, }; use super::{ @@ -128,6 +130,7 @@ pub(super) struct SecondaryDetail { pub(super) config: SecondaryLocationConfig, last_download: Option, + last_etag: Option, next_download: Option, pub(super) timelines: HashMap, } @@ -138,11 +141,26 @@ fn strftime(t: &'_ SystemTime) -> DelayedFormat> { datetime.format("%d/%m/%Y %T") } +/// Information returned from download function when it detects the heatmap has changed +struct HeatMapModified { + etag: Etag, + last_modified: SystemTime, + bytes: Vec, +} + +enum HeatMapDownload { + // The heatmap's etag has changed: return the new etag, mtime and the body bytes + Modified(HeatMapModified), + // The heatmap's etag is unchanged + Unmodified, +} + impl SecondaryDetail { pub(super) fn new(config: SecondaryLocationConfig) -> Self { Self { config, last_download: None, + last_etag: None, next_download: None, timelines: HashMap::new(), } @@ -477,11 +495,31 @@ impl<'a> TenantDownloader<'a> { }; let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); + + // We will use the etag from last successful download to make the download conditional on changes + let last_etag = self + .secondary_state + .detail + .lock() + .unwrap() + .last_etag + .clone(); + // Download the tenant's heatmap - let heatmap_bytes = 
tokio::select!( - bytes = self.download_heatmap() => {bytes?}, + let HeatMapModified { + last_modified: heatmap_mtime, + etag: heatmap_etag, + bytes: heatmap_bytes, + } = match tokio::select!( + bytes = self.download_heatmap(last_etag.as_ref()) => {bytes?}, _ = self.secondary_state.cancel.cancelled() => return Ok(()) - ); + ) { + HeatMapDownload::Unmodified => { + tracing::info!("Heatmap unchanged since last successful download"); + return Ok(()); + } + HeatMapDownload::Modified(m) => m, + }; let heatmap = serde_json::from_slice::(&heatmap_bytes)?; @@ -498,6 +536,14 @@ impl<'a> TenantDownloader<'a> { tracing::debug!("Wrote local heatmap to {}", heatmap_path); + // Clean up any local layers that aren't in the heatmap. We do this first for all timelines, on the general + // principle that deletions should be done before writes wherever possible, and so that we can use this + // phase to initialize our SecondaryProgress. + { + *self.secondary_state.progress.lock().unwrap() = + self.prepare_timelines(&heatmap, heatmap_mtime).await?; + } + // Download the layers in the heatmap for timeline in heatmap.timelines { if self.secondary_state.cancel.is_cancelled() { @@ -515,30 +561,159 @@ impl<'a> TenantDownloader<'a> { .await?; } + // Only update last_etag after a full successful download: this way will not skip + // the next download, even if the heatmap's actual etag is unchanged. + self.secondary_state.detail.lock().unwrap().last_etag = Some(heatmap_etag); + Ok(()) } - async fn download_heatmap(&self) -> Result, UpdateError> { + /// Do any fast local cleanup that comes before the much slower process of downloading + /// layers from remote storage. In the process, initialize the SecondaryProgress object + /// that will later be updated incrementally as we download layers. 
+ async fn prepare_timelines( + &self, + heatmap: &HeatMapTenant, + heatmap_mtime: SystemTime, + ) -> Result { + let heatmap_stats = heatmap.get_stats(); + // We will construct a progress object, and then populate its initial "downloaded" numbers + // while iterating through local layer state in [`Self::prepare_timelines`] + let mut progress = SecondaryProgress { + layers_total: heatmap_stats.layers, + bytes_total: heatmap_stats.bytes, + heatmap_mtime: Some(heatmap_mtime), + layers_downloaded: 0, + bytes_downloaded: 0, + }; + // Accumulate list of things to delete while holding the detail lock, for execution after dropping the lock + let mut delete_layers = Vec::new(); + let mut delete_timelines = Vec::new(); + { + let mut detail = self.secondary_state.detail.lock().unwrap(); + for (timeline_id, timeline_state) in &mut detail.timelines { + let Some(heatmap_timeline_index) = heatmap + .timelines + .iter() + .position(|t| t.timeline_id == *timeline_id) + else { + // This timeline is no longer referenced in the heatmap: delete it locally + delete_timelines.push(*timeline_id); + continue; + }; + + let heatmap_timeline = heatmap.timelines.get(heatmap_timeline_index).unwrap(); + + let layers_in_heatmap = heatmap_timeline + .layers + .iter() + .map(|l| &l.name) + .collect::>(); + let layers_on_disk = timeline_state + .on_disk_layers + .iter() + .map(|l| l.0) + .collect::>(); + + let mut layer_count = layers_on_disk.len(); + let mut layer_byte_count: u64 = timeline_state + .on_disk_layers + .values() + .map(|l| l.metadata.file_size()) + .sum(); + + // Remove on-disk layers that are no longer present in heatmap + for layer in layers_on_disk.difference(&layers_in_heatmap) { + layer_count -= 1; + layer_byte_count -= timeline_state + .on_disk_layers + .get(layer) + .unwrap() + .metadata + .file_size(); + + delete_layers.push((*timeline_id, (*layer).clone())); + } + + progress.bytes_downloaded += layer_byte_count; + progress.layers_downloaded += layer_count; + } + } + + // Execute accumulated deletions + for (timeline_id, layer_name) in delete_layers { + let timeline_path = self + .conf + .timeline_path(self.secondary_state.get_tenant_shard_id(), &timeline_id); + let local_path = timeline_path.join(layer_name.to_string()); + tracing::info!(timeline_id=%timeline_id, "Removing secondary local layer {layer_name} because it's absent in heatmap",); + + tokio::fs::remove_file(&local_path) + .await + .or_else(fs_ext::ignore_not_found) + .maybe_fatal_err("Removing secondary layer")?; + + // Update in-memory housekeeping to reflect the absence of the deleted layer + let mut detail = self.secondary_state.detail.lock().unwrap(); + let Some(timeline_state) = detail.timelines.get_mut(&timeline_id) else { + continue; + }; + timeline_state.on_disk_layers.remove(&layer_name); + } + + for timeline_id in delete_timelines { + let timeline_path = self + .conf + .timeline_path(self.secondary_state.get_tenant_shard_id(), &timeline_id); + tracing::info!(timeline_id=%timeline_id, + "Timeline no longer in heatmap, removing from secondary location" + ); + tokio::fs::remove_dir_all(&timeline_path) + .await + .or_else(fs_ext::ignore_not_found) + .maybe_fatal_err("Removing secondary timeline")?; + } + + Ok(progress) + } + + /// Returns downloaded bytes if the etag differs from `prev_etag`, or None if the object + /// still matches `prev_etag`. 
+ async fn download_heatmap( + &self, + prev_etag: Option<&Etag>, + ) -> Result { debug_assert_current_span_has_tenant_id(); let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); - // TODO: make download conditional on ETag having changed since last download + // TODO: pull up etag check into the request, to do a conditional GET rather than + // issuing a GET and then maybe ignoring the response body // (https://github.com/neondatabase/neon/issues/6199) tracing::debug!("Downloading heatmap for secondary tenant",); let heatmap_path = remote_heatmap_path(tenant_shard_id); let cancel = &self.secondary_state.cancel; - let heatmap_bytes = backoff::retry( + backoff::retry( || async { let download = self .remote_storage .download(&heatmap_path, cancel) .await .map_err(UpdateError::from)?; - let mut heatmap_bytes = Vec::new(); - let mut body = tokio_util::io::StreamReader::new(download.download_stream); - let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?; - Ok(heatmap_bytes) + + if Some(&download.etag) == prev_etag { + Ok(HeatMapDownload::Unmodified) + } else { + let mut heatmap_bytes = Vec::new(); + let mut body = tokio_util::io::StreamReader::new(download.download_stream); + let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?; + SECONDARY_MODE.download_heatmap.inc(); + Ok(HeatMapDownload::Modified(HeatMapModified { + etag: download.etag, + last_modified: download.last_modified, + bytes: heatmap_bytes, + })) + } }, |e| matches!(e, UpdateError::NoData | UpdateError::Cancelled), FAILED_DOWNLOAD_WARN_THRESHOLD, @@ -548,11 +723,7 @@ impl<'a> TenantDownloader<'a> { ) .await .ok_or_else(|| UpdateError::Cancelled) - .and_then(|x| x)?; - - SECONDARY_MODE.download_heatmap.inc(); - - Ok(heatmap_bytes) + .and_then(|x| x) } async fn download_timeline(&self, timeline: HeatMapTimeline) -> Result<(), UpdateError> { @@ -593,27 +764,6 @@ impl<'a> TenantDownloader<'a> { } }; - let layers_in_heatmap = timeline - .layers - .iter() - .map(|l| &l.name) - .collect::>(); - let layers_on_disk = timeline_state - .on_disk_layers - .iter() - .map(|l| l.0) - .collect::>(); - - // Remove on-disk layers that are no longer present in heatmap - for layer in layers_on_disk.difference(&layers_in_heatmap) { - let local_path = timeline_path.join(layer.to_string()); - tracing::info!("Removing secondary local layer {layer} because it's absent in heatmap",); - tokio::fs::remove_file(&local_path) - .await - .or_else(fs_ext::ignore_not_found) - .maybe_fatal_err("Removing secondary layer")?; - } - // Download heatmap layers that are not present on local disk, or update their // access time if they are already present. 
for layer in timeline.layers { @@ -662,6 +812,12 @@ impl<'a> TenantDownloader<'a> { } } + // Failpoint for simulating slow remote storage + failpoint_support::sleep_millis_async!( + "secondary-layer-download-sleep", + &self.secondary_state.cancel + ); + // Note: no backoff::retry wrapper here because download_layer_file does its own retries internally let downloaded_bytes = match download_layer_file( self.conf, @@ -701,6 +857,11 @@ impl<'a> TenantDownloader<'a> { tokio::fs::remove_file(&local_path) .await .or_else(fs_ext::ignore_not_found)?; + } else { + tracing::info!("Downloaded layer {}, size {}", layer.name, downloaded_bytes); + let mut progress = self.secondary_state.progress.lock().unwrap(); + progress.bytes_downloaded += downloaded_bytes; + progress.layers_downloaded += 1; } SECONDARY_MODE.download_layer.inc(); diff --git a/pageserver/src/tenant/secondary/heatmap.rs b/pageserver/src/tenant/secondary/heatmap.rs index 99aaaeb8c8..73cdf6c6d4 100644 --- a/pageserver/src/tenant/secondary/heatmap.rs +++ b/pageserver/src/tenant/secondary/heatmap.rs @@ -62,3 +62,25 @@ impl HeatMapTimeline { } } } + +pub(crate) struct HeatMapStats { + pub(crate) bytes: u64, + pub(crate) layers: usize, +} + +impl HeatMapTenant { + pub(crate) fn get_stats(&self) -> HeatMapStats { + let mut stats = HeatMapStats { + bytes: 0, + layers: 0, + }; + for timeline in &self.timelines { + for layer in &timeline.layers { + stats.layers += 1; + stats.bytes += layer.metadata.file_size; + } + } + + stats + } +} diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 70d3076371..56b23cef59 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1525,6 +1525,7 @@ class NeonCli(AbstractNeonCli): conf: Optional[Dict[str, Any]] = None, shard_count: Optional[int] = None, shard_stripe_size: Optional[int] = None, + placement_policy: Optional[str] = None, set_default: bool = False, ) -> Tuple[TenantId, TimelineId]: """ @@ -1558,6 +1559,9 @@ class NeonCli(AbstractNeonCli): if shard_stripe_size is not None: args.extend(["--shard-stripe-size", str(shard_stripe_size)]) + if placement_policy is not None: + args.extend(["--placement-policy", str(placement_policy)]) + res = self.raw_cli(args) res.check_returncode() return tenant_id, timeline_id diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 4e355b73a9..99ec894106 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -357,9 +357,15 @@ class PageserverHttpClient(requests.Session, MetricsGetter): res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/heatmap_upload") self.verbose_error(res) - def tenant_secondary_download(self, tenant_id: Union[TenantId, TenantShardId]): - res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/secondary/download") + def tenant_secondary_download( + self, tenant_id: Union[TenantId, TenantShardId], wait_ms: Optional[int] = None + ) -> tuple[int, dict[Any, Any]]: + url = f"http://localhost:{self.port}/v1/tenant/{tenant_id}/secondary/download" + if wait_ms is not None: + url = url + f"?wait_ms={wait_ms}" + res = self.post(url) self.verbose_error(res) + return (res.status_code, res.json()) def set_tenant_config(self, tenant_id: Union[TenantId, TenantShardId], config: dict[str, Any]): assert "tenant_id" not in config.keys() diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 
79145f61b3..8ef75414a3 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -1,4 +1,5 @@ import json +import os import random from pathlib import Path from typing import Any, Dict, Optional @@ -553,3 +554,103 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ) ), ) + + +@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build") +@pytest.mark.parametrize("via_controller", [True, False]) +def test_slow_secondary_downloads(neon_env_builder: NeonEnvBuilder, via_controller: bool): + """ + Test use of secondary download API for slow downloads, where slow means either a healthy + system with a large capacity shard, or some unhealthy remote storage. + + The download API is meant to respect a client-supplied time limit, and return 200 or 202 + selectively based on whether the download completed. + """ + neon_env_builder.num_pageservers = 2 + neon_env_builder.enable_pageserver_remote_storage( + remote_storage_kind=RemoteStorageKind.MOCK_S3, + ) + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + + env.neon_cli.create_tenant( + tenant_id, timeline_id, conf=TENANT_CONF, placement_policy='{"Double":1}' + ) + + attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"] + ps_attached = env.get_pageserver(attached_to_id) + ps_secondary = next(p for p in env.pageservers if p != ps_attached) + + # Generate a bunch of small layers (we will apply a slowdown failpoint that works on a per-layer basis) + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(128) + ps_attached.http_client().timeline_checkpoint(tenant_id, timeline_id) + workload.write_rows(128) + ps_attached.http_client().timeline_checkpoint(tenant_id, timeline_id) + workload.write_rows(128) + ps_attached.http_client().timeline_checkpoint(tenant_id, timeline_id) + workload.write_rows(128) + ps_attached.http_client().timeline_checkpoint(tenant_id, timeline_id) + + # Expect lots of layers + assert len(list_layers(ps_attached, tenant_id, timeline_id)) > 10 + + # Simulate large data by making layer downloads artifically slow + for ps in env.pageservers: + ps.http_client().configure_failpoints([("secondary-layer-download-sleep", "return(1000)")]) + + # Upload a heatmap, so that secondaries have something to download + ps_attached.http_client().tenant_heatmap_upload(tenant_id) + + if via_controller: + http_client = env.storage_controller.pageserver_api() + http_client.tenant_location_conf( + tenant_id, + { + "mode": "Secondary", + "secondary_conf": {"warm": True}, + "tenant_conf": {}, + "generation": None, + }, + ) + else: + http_client = ps_secondary.http_client() + + # This has no chance to succeed: we have lots of layers and each one takes at least 1000ms + (status, progress_1) = http_client.tenant_secondary_download(tenant_id, wait_ms=4000) + assert status == 202 + assert progress_1["heatmap_mtime"] is not None + assert progress_1["layers_downloaded"] > 0 + assert progress_1["bytes_downloaded"] > 0 + assert progress_1["layers_total"] > progress_1["layers_downloaded"] + assert progress_1["bytes_total"] > progress_1["bytes_downloaded"] + + # Multiple polls should work: use a shorter wait period this time + (status, progress_2) = http_client.tenant_secondary_download(tenant_id, wait_ms=1000) + assert status == 202 + assert progress_2["heatmap_mtime"] is not None + assert 
progress_2["layers_downloaded"] > 0 + assert progress_2["bytes_downloaded"] > 0 + assert progress_2["layers_total"] > progress_2["layers_downloaded"] + assert progress_2["bytes_total"] > progress_2["bytes_downloaded"] + + # Progress should be >= the first poll: this can only go backward if we see a new heatmap, + # and the heatmap period on the attached node is much longer than the runtime of this test, so no + # new heatmap should have been uploaded. + assert progress_2["layers_downloaded"] >= progress_1["layers_downloaded"] + assert progress_2["bytes_downloaded"] >= progress_1["bytes_downloaded"] + assert progress_2["layers_total"] == progress_1["layers_total"] + assert progress_2["bytes_total"] == progress_1["bytes_total"] + + # Make downloads fast again: when the download completes within this last request, we + # get a 200 instead of a 202 + for ps in env.pageservers: + ps.http_client().configure_failpoints([("secondary-layer-download-sleep", "off")]) + (status, progress_3) = http_client.tenant_secondary_download(tenant_id, wait_ms=20000) + assert status == 200 + assert progress_3["heatmap_mtime"] is not None + assert progress_3["layers_total"] == progress_3["layers_downloaded"] + assert progress_3["bytes_total"] == progress_3["bytes_downloaded"] From 0694ee9531ffa2f4391cf0f27c89b5afab1ba052 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 15 Mar 2024 20:46:15 +0100 Subject: [PATCH 0421/1571] tokio-epoll-uring: retry on launch failures due to locked memory (#7141) refs https://github.com/neondatabase/neon/issues/7136 Problem ------- Before this PR, we were using `tokio_epoll_uring::thread_local_system()`, which panics on tokio_epoll_uring::System::launch() failure As we've learned in [the past](https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391), some older Linux kernels account io_uring instances as locked memory. And while we've raised the limit in prod considerably, we did hit it once on 2024-03-11 16:30 UTC. That was after we enabled tokio-epoll-uring fleet-wide, but before we had shipped release-5090 (c6ed86d3d0690b52e7014b6a696effa95714e8cb) which did away with the last mass-creation of tokio-epoll-uring instances as per commit 3da410c8fee05b0cd65a5c0b83fffa3d5680cd77 Author: Christian Schwarz Date: Tue Mar 5 10:03:54 2024 +0100 tokio-epoll-uring: use it on the layer-creating code paths (#6378) Nonetheless, it highlighted that panicking in this situation is probably not ideal, as it can leave the pageserver process in a semi-broken state. Further, due to low sampling rate of Prometheus metrics, we don't know much about the circumstances of this failure instance. Solution -------- This PR implements a custom thread_local_system() that is pageserver-aware and will do the following on failure: - dump relevant stats to `tracing!`, hopefully they will be useful to understand the circumstances better - if it's the locked memory failure (or any other ENOMEM): abort() the process - if it's ENOMEM, retry with exponential back-off, capped at 3s. - add metric counters so we can create an alert This makes sense in the production environment where we know that _usually_, there's ample locked memory allowance available, and we know the failure rate is rare. 
--- Cargo.lock | 2 + clippy.toml | 2 + pageserver/Cargo.toml | 1 + pageserver/src/metrics.rs | 27 ++- pageserver/src/virtual_file/io_engine.rs | 14 +- .../io_engine/tokio_epoll_uring_ext.rs | 194 ++++++++++++++++++ pageserver/src/virtual_file/open_options.rs | 2 +- workspace_hack/Cargo.toml | 2 + 8 files changed, 234 insertions(+), 10 deletions(-) create mode 100644 pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs diff --git a/Cargo.lock b/Cargo.lock index 99ba8b1cb3..022dc11f07 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3530,6 +3530,7 @@ dependencies = [ "postgres_connection", "postgres_ffi", "pq_proto", + "procfs", "rand 0.8.5", "regex", "remote_storage", @@ -6987,6 +6988,7 @@ dependencies = [ "axum", "base64 0.21.1", "base64ct", + "byteorder", "bytes", "cc", "chrono", diff --git a/clippy.toml b/clippy.toml index 5f7dc66152..4c0c04f9a1 100644 --- a/clippy.toml +++ b/clippy.toml @@ -2,6 +2,8 @@ disallowed-methods = [ "tokio::task::block_in_place", # Allow this for now, to deny it later once we stop using Handle::block_on completely # "tokio::runtime::Handle::block_on", + # use tokio_epoll_uring_ext instead + "tokio_epoll_uring::thread_local_system", ] disallowed-macros = [ diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 5adeaffe1a..2702a2040a 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -48,6 +48,7 @@ postgres.workspace = true postgres_backend.workspace = true postgres-protocol.workspace = true postgres-types.workspace = true +procfs.workspace = true rand.workspace = true regex.workspace = true scopeguard.workspace = true diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 03537ddb05..075bb76a1b 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -2465,7 +2465,8 @@ impl>, O, E> Future for MeasuredRemoteOp { } pub mod tokio_epoll_uring { - use metrics::UIntGauge; + use metrics::{register_int_counter, UIntGauge}; + use once_cell::sync::Lazy; pub struct Collector { descs: Vec, @@ -2473,15 +2474,13 @@ pub mod tokio_epoll_uring { systems_destroyed: UIntGauge, } - const NMETRICS: usize = 2; - impl metrics::core::Collector for Collector { fn desc(&self) -> Vec<&metrics::core::Desc> { self.descs.iter().collect() } fn collect(&self) -> Vec { - let mut mfs = Vec::with_capacity(NMETRICS); + let mut mfs = Vec::with_capacity(Self::NMETRICS); let tokio_epoll_uring::metrics::Metrics { systems_created, systems_destroyed, @@ -2495,6 +2494,8 @@ pub mod tokio_epoll_uring { } impl Collector { + const NMETRICS: usize = 2; + #[allow(clippy::new_without_default)] pub fn new() -> Self { let mut descs = Vec::new(); @@ -2528,6 +2529,22 @@ pub mod tokio_epoll_uring { } } } + + pub(crate) static THREAD_LOCAL_LAUNCH_SUCCESSES: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_tokio_epoll_uring_pageserver_thread_local_launch_success_count", + "Number of times where thread_local_system creation spanned multiple executor threads", + ) + .unwrap() + }); + + pub(crate) static THREAD_LOCAL_LAUNCH_FAILURES: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_tokio_epoll_uring_pageserver_thread_local_launch_failures_count", + "Number of times thread_local_system creation failed and was retried after back-off.", + ) + .unwrap() + }); } pub(crate) mod tenant_throttling { @@ -2656,6 +2673,8 @@ pub fn preinitialize_metrics() { &WALRECEIVER_BROKER_UPDATES, &WALRECEIVER_CANDIDATES_ADDED, &WALRECEIVER_CANDIDATES_REMOVED, + &tokio_epoll_uring::THREAD_LOCAL_LAUNCH_FAILURES, + 
&tokio_epoll_uring::THREAD_LOCAL_LAUNCH_SUCCESSES, ] .into_iter() .for_each(|c| { diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index 55fa59e53b..2dd0ce64d6 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -6,6 +6,10 @@ //! Initialize using [`init`]. //! //! Then use [`get`] and [`super::OpenOptions`]. +//! +//! + +pub(super) mod tokio_epoll_uring_ext; use tokio_epoll_uring::{IoBuf, Slice}; use tracing::Instrument; @@ -145,7 +149,7 @@ impl IoEngine { } #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { - let system = tokio_epoll_uring::thread_local_system().await; + let system = tokio_epoll_uring_ext::thread_local_system().await; let (resources, res) = system.read(file_guard, offset, buf).await; (resources, res.map_err(epoll_uring_error_to_std)) } @@ -160,7 +164,7 @@ impl IoEngine { } #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { - let system = tokio_epoll_uring::thread_local_system().await; + let system = tokio_epoll_uring_ext::thread_local_system().await; let (resources, res) = system.fsync(file_guard).await; (resources, res.map_err(epoll_uring_error_to_std)) } @@ -178,7 +182,7 @@ impl IoEngine { } #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { - let system = tokio_epoll_uring::thread_local_system().await; + let system = tokio_epoll_uring_ext::thread_local_system().await; let (resources, res) = system.fdatasync(file_guard).await; (resources, res.map_err(epoll_uring_error_to_std)) } @@ -197,7 +201,7 @@ impl IoEngine { } #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { - let system = tokio_epoll_uring::thread_local_system().await; + let system = tokio_epoll_uring_ext::thread_local_system().await; let (resources, res) = system.statx(file_guard).await; ( resources, @@ -220,7 +224,7 @@ impl IoEngine { } #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { - let system = tokio_epoll_uring::thread_local_system().await; + let system = tokio_epoll_uring_ext::thread_local_system().await; let (resources, res) = system.write(file_guard, offset, buf).await; (resources, res.map_err(epoll_uring_error_to_std)) } diff --git a/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs b/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs new file mode 100644 index 0000000000..c4b10f3a24 --- /dev/null +++ b/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs @@ -0,0 +1,194 @@ +//! Like [`::tokio_epoll_uring::thread_local_system()`], but with pageserver-specific +//! handling in case the instance can't launched. +//! +//! This is primarily necessary due to ENOMEM aka OutOfMemory errors during io_uring creation +//! on older kernels, such as some (but not all) older kernels in the Linux 5.10 series. +//! See for more details. 
+ +use std::sync::atomic::AtomicU32; +use std::sync::Arc; + +use tokio_util::sync::CancellationToken; +use tracing::{error, info, info_span, warn, Instrument}; +use utils::backoff::{DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; + +use tokio_epoll_uring::{System, SystemHandle}; + +use crate::virtual_file::on_fatal_io_error; + +use crate::metrics::tokio_epoll_uring as metrics; + +#[derive(Clone)] +struct ThreadLocalState(Arc); + +struct ThreadLocalStateInner { + cell: tokio::sync::OnceCell, + launch_attempts: AtomicU32, +} + +impl ThreadLocalState { + pub fn new() -> Self { + Self(Arc::new(ThreadLocalStateInner { + cell: tokio::sync::OnceCell::default(), + launch_attempts: AtomicU32::new(0), + })) + } + pub fn make_id_string(&self) -> String { + format!("0x{:p}", Arc::as_ptr(&self.0)) + } +} + +impl Drop for ThreadLocalState { + fn drop(&mut self) { + info!(parent: None, id=%self.make_id_string(), "tokio-epoll-uring_ext: ThreadLocalState is being dropped and id might be re-used in the future"); + } +} + +thread_local! { + static THREAD_LOCAL: ThreadLocalState = ThreadLocalState::new(); +} + +/// Panics if we cannot [`System::launch`]. +pub async fn thread_local_system() -> Handle { + let fake_cancel = CancellationToken::new(); + loop { + let thread_local_state = THREAD_LOCAL.with(|arc| arc.clone()); + let inner = &thread_local_state.0; + let get_or_init_res = inner + .cell + .get_or_try_init(|| async { + let attempt_no = inner + .launch_attempts + .fetch_add(1, std::sync::atomic::Ordering::Relaxed); + let span = info_span!("tokio_epoll_uring_ext::thread_local_system", thread_local=%thread_local_state.make_id_string(), %attempt_no); + async { + // Rate-limit retries per thread-local. + // NB: doesn't yield to executor at attempt_no=0. + utils::backoff::exponential_backoff( + attempt_no, + DEFAULT_BASE_BACKOFF_SECONDS, + DEFAULT_MAX_BACKOFF_SECONDS, + &fake_cancel, + ) + .await; + let res = System::launch() + // this might move us to another executor thread => loop outside the get_or_try_init, not inside it + .await; + match res { + Ok(system) => { + info!("successfully launched system"); + metrics::THREAD_LOCAL_LAUNCH_SUCCESSES.inc(); + Ok(system) + } + Err(tokio_epoll_uring::LaunchResult::IoUringBuild(e)) if e.kind() == std::io::ErrorKind::OutOfMemory => { + warn!("not enough locked memory to tokio-epoll-uring, will retry"); + info_span!("stats").in_scope(|| { + emit_launch_failure_process_stats(); + }); + metrics::THREAD_LOCAL_LAUNCH_FAILURES.inc(); + Err(()) + } + // abort the process instead of panicking because pageserver usually becomes half-broken if we panic somewhere. + // This is equivalent to a fatal IO error. 
+ Err(ref e @ tokio_epoll_uring::LaunchResult::IoUringBuild(ref inner)) => { + error!(error=%e, "failed to launch thread-local tokio-epoll-uring, this should not happen, aborting process"); + info_span!("stats").in_scope(|| { + emit_launch_failure_process_stats(); + }); + on_fatal_io_error(inner, "launch thread-local tokio-epoll-uring"); + }, + } + } + .instrument(span) + .await + }) + .await; + if get_or_init_res.is_ok() { + return Handle(thread_local_state); + } + } +} + +fn emit_launch_failure_process_stats() { + // tokio-epoll-uring stats + // vmlck + rlimit + // number of threads + // rss / system memory usage generally + + let tokio_epoll_uring::metrics::Metrics { + systems_created, + systems_destroyed, + } = tokio_epoll_uring::metrics::global(); + info!(systems_created, systems_destroyed, "tokio-epoll-uring"); + + match procfs::process::Process::myself() { + Ok(myself) => { + match myself.limits() { + Ok(limits) => { + info!(?limits.max_locked_memory, "/proc/self/limits"); + } + Err(error) => { + info!(%error, "no limit stats due to error"); + } + } + + match myself.status() { + Ok(status) => { + let procfs::process::Status { + vmsize, + vmlck, + vmpin, + vmrss, + rssanon, + rssfile, + rssshmem, + vmdata, + vmstk, + vmexe, + vmlib, + vmpte, + threads, + .. + } = status; + info!( + vmsize, + vmlck, + vmpin, + vmrss, + rssanon, + rssfile, + rssshmem, + vmdata, + vmstk, + vmexe, + vmlib, + vmpte, + threads, + "/proc/self/status" + ); + } + Err(error) => { + info!(%error, "no status status due to error"); + } + } + } + Err(error) => { + info!(%error, "no process stats due to error"); + } + }; +} + +#[derive(Clone)] +pub struct Handle(ThreadLocalState); + +impl std::ops::Deref for Handle { + type Target = SystemHandle; + + fn deref(&self) -> &Self::Target { + self.0 + .0 + .cell + .get() + .expect("must be already initialized when using this") + } +} diff --git a/pageserver/src/virtual_file/open_options.rs b/pageserver/src/virtual_file/open_options.rs index f75edb0bac..7f951270d1 100644 --- a/pageserver/src/virtual_file/open_options.rs +++ b/pageserver/src/virtual_file/open_options.rs @@ -98,7 +98,7 @@ impl OpenOptions { OpenOptions::StdFs(x) => x.open(path).map(|file| file.into()), #[cfg(target_os = "linux")] OpenOptions::TokioEpollUring(x) => { - let system = tokio_epoll_uring::thread_local_system().await; + let system = super::io_engine::tokio_epoll_uring_ext::thread_local_system().await; system.open(path, x).await.map_err(|e| match e { tokio_epoll_uring::Error::Op(e) => e, tokio_epoll_uring::Error::System(system) => { diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 8593b752c2..0646091006 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -24,6 +24,7 @@ aws-smithy-types = { version = "1", default-features = false, features = ["byte- axum = { version = "0.6", features = ["ws"] } base64 = { version = "0.21", features = ["alloc"] } base64ct = { version = "1", default-features = false, features = ["std"] } +byteorder = { version = "1", features = ["i128"] } bytes = { version = "1", features = ["serde"] } chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] } clap = { version = "4", features = ["derive", "string"] } @@ -86,6 +87,7 @@ zstd-sys = { version = "2", default-features = false, features = ["legacy", "std [build-dependencies] anyhow = { version = "1", features = ["backtrace"] } +byteorder = { version = "1", features = ["i128"] } bytes = { version = "1", features = ["serde"] } cc = { version = 
"1", default-features = false, features = ["parallel"] } chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] } From 5cec5cb3cf4c1fc3004ae6fc412f96dccd91014e Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 15 Mar 2024 20:48:51 +0100 Subject: [PATCH 0422/1571] fixup(#7120): the macOS code used an outdated constant name, broke the build (#7150) From 30a3d80d2fe881f375483080f1370913d47704bf Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 18 Mar 2024 11:28:45 +0200 Subject: [PATCH 0423/1571] build: make procfs linux only dependency (#7156) the dependency refuses to build on macos so builds on `main` are broken right now, including the `release` PR. --- Cargo.lock | 1 - pageserver/Cargo.toml | 4 +++- pageserver/src/virtual_file/io_engine.rs | 1 + workspace_hack/Cargo.toml | 2 -- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 022dc11f07..c4f925e3c7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6988,7 +6988,6 @@ dependencies = [ "axum", "base64 0.21.1", "base64ct", - "byteorder", "bytes", "cc", "chrono", diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 2702a2040a..f304294591 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -48,7 +48,6 @@ postgres.workspace = true postgres_backend.workspace = true postgres-protocol.workspace = true postgres-types.workspace = true -procfs.workspace = true rand.workspace = true regex.workspace = true scopeguard.workspace = true @@ -90,6 +89,9 @@ enumset = { workspace = true, features = ["serde"]} strum.workspace = true strum_macros.workspace = true +[target.'cfg(target_os = "linux")'.dependencies] +procfs.workspace = true + [dev-dependencies] criterion.workspace = true hex-literal.workspace = true diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index 2dd0ce64d6..7a27be2ca1 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -9,6 +9,7 @@ //! //! 
+#[cfg(target_os = "linux")] pub(super) mod tokio_epoll_uring_ext; use tokio_epoll_uring::{IoBuf, Slice}; diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 0646091006..8593b752c2 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -24,7 +24,6 @@ aws-smithy-types = { version = "1", default-features = false, features = ["byte- axum = { version = "0.6", features = ["ws"] } base64 = { version = "0.21", features = ["alloc"] } base64ct = { version = "1", default-features = false, features = ["std"] } -byteorder = { version = "1", features = ["i128"] } bytes = { version = "1", features = ["serde"] } chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] } clap = { version = "4", features = ["derive", "string"] } @@ -87,7 +86,6 @@ zstd-sys = { version = "2", default-features = false, features = ["legacy", "std [build-dependencies] anyhow = { version = "1", features = ["backtrace"] } -byteorder = { version = "1", features = ["i128"] } bytes = { version = "1", features = ["serde"] } cc = { version = "1", default-features = false, features = ["parallel"] } chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] } From 1d3ae57f18462e150a74d044c4f59c34a6c27e69 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 18 Mar 2024 10:37:20 +0000 Subject: [PATCH 0424/1571] pageserver: refactoring in TenantManager to reduce duplication (#6732) ## Problem Followup to https://github.com/neondatabase/neon/pull/6725 In that PR, code for purging local files from a tenant shard was duplicated. ## Summary of changes - Refactor detach code into TenantManager - `spawn_background_purge` method can now be common between detach and split operations --- pageserver/src/http/routes.rs | 37 ++--- pageserver/src/tenant/mgr.rs | 259 +++++++++++++++++----------------- 2 files changed, 151 insertions(+), 145 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 6d98d3f746..97ffb99465 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -885,14 +885,16 @@ async fn tenant_detach_handler( let state = get_state(&request); let conf = state.conf; - mgr::detach_tenant( - conf, - tenant_shard_id, - detach_ignored.unwrap_or(false), - &state.deletion_queue_client, - ) - .instrument(info_span!("tenant_detach", %tenant_id, shard_id=%tenant_shard_id.shard_slug())) - .await?; + state + .tenant_manager + .detach_tenant( + conf, + tenant_shard_id, + detach_ignored.unwrap_or(false), + &state.deletion_queue_client, + ) + .instrument(info_span!("tenant_detach", %tenant_id, shard_id=%tenant_shard_id.shard_slug())) + .await?; json_response(StatusCode::OK, ()) } @@ -1403,7 +1405,9 @@ async fn update_tenant_config_handler( TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?; let state = get_state(&request); - mgr::set_new_tenant_config(state.conf, tenant_conf, tenant_id) + state + .tenant_manager + .set_new_tenant_config(tenant_conf, tenant_id) .instrument(info_span!("tenant_config", %tenant_id)) .await?; @@ -1428,13 +1432,14 @@ async fn put_tenant_location_config_handler( // The `Detached` state is special, it doesn't upsert a tenant, it removes // its local disk content and drops it from memory. 
if let LocationConfigMode::Detached = request_data.config.mode { - if let Err(e) = - mgr::detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client) - .instrument(info_span!("tenant_detach", - tenant_id = %tenant_shard_id.tenant_id, - shard_id = %tenant_shard_id.shard_slug() - )) - .await + if let Err(e) = state + .tenant_manager + .detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client) + .instrument(info_span!("tenant_detach", + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug() + )) + .await { match e { TenantStateError::SlotError(TenantSlotError::NotFound(_)) => { diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 3aaab6e4ef..facaaa2ad7 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -633,7 +633,7 @@ pub async fn init_tenant_mgr( /// Wrapper for Tenant::spawn that checks invariants before running, and inserts /// a broken tenant in the map if Tenant::spawn fails. #[allow(clippy::too_many_arguments)] -pub(crate) fn tenant_spawn( +fn tenant_spawn( conf: &'static PageServerConf, tenant_shard_id: TenantShardId, tenant_path: &Utf8Path, @@ -825,40 +825,6 @@ pub(crate) enum SetNewTenantConfigError { Other(anyhow::Error), } -pub(crate) async fn set_new_tenant_config( - conf: &'static PageServerConf, - new_tenant_conf: TenantConfOpt, - tenant_id: TenantId, -) -> Result<(), SetNewTenantConfigError> { - // Legacy API: does not support sharding - let tenant_shard_id = TenantShardId::unsharded(tenant_id); - - info!("configuring tenant {tenant_id}"); - let tenant = get_tenant(tenant_shard_id, true)?; - - if !tenant.tenant_shard_id().shard_count.is_unsharded() { - // Note that we use ShardParameters::default below. - return Err(SetNewTenantConfigError::Other(anyhow::anyhow!( - "This API may only be used on single-sharded tenants, use the /location_config API for sharded tenants" - ))); - } - - // This is a legacy API that only operates on attached tenants: the preferred - // API to use is the location_config/ endpoint, which lets the caller provide - // the full LocationConf. - let location_conf = LocationConf::attached_single( - new_tenant_conf.clone(), - tenant.generation, - &ShardParameters::default(), - ); - - Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf) - .await - .map_err(SetNewTenantConfigError::Persist)?; - tenant.set_new_tenant_config(new_tenant_conf); - Ok(()) -} - #[derive(thiserror::Error, Debug)] pub(crate) enum UpsertLocationError { #[error("Bad config request: {0}")] @@ -1661,19 +1627,7 @@ impl TenantManager { let tmp_path = safe_rename_tenant_dir(&local_tenant_directory) .await .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?; - task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), - TaskKind::MgmtRequest, - None, - None, - "tenant_files_delete", - false, - async move { - fs::remove_dir_all(tmp_path.as_path()) - .await - .with_context(|| format!("tenant directory {:?} deletion", tmp_path)) - }, - ); + self.spawn_background_purge(tmp_path); fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!( "failpoint" @@ -1827,6 +1781,134 @@ impl TenantManager { shutdown_all_tenants0(self.tenants).await } + + /// When we have moved a tenant's content to a temporary directory, we may delete it lazily in + /// the background, and thereby avoid blocking any API requests on this deletion completing. 
+ fn spawn_background_purge(&self, tmp_path: Utf8PathBuf) { + // Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory. + // After a tenant is detached, there are no more task_mgr tasks for that tenant_id. + let task_tenant_id = None; + + task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), + TaskKind::MgmtRequest, + task_tenant_id, + None, + "tenant_files_delete", + false, + async move { + fs::remove_dir_all(tmp_path.as_path()) + .await + .with_context(|| format!("tenant directory {:?} deletion", tmp_path)) + }, + ); + } + + pub(crate) async fn detach_tenant( + &self, + conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, + detach_ignored: bool, + deletion_queue_client: &DeletionQueueClient, + ) -> Result<(), TenantStateError> { + let tmp_path = self + .detach_tenant0( + conf, + &TENANTS, + tenant_shard_id, + detach_ignored, + deletion_queue_client, + ) + .await?; + self.spawn_background_purge(tmp_path); + + Ok(()) + } + + async fn detach_tenant0( + &self, + conf: &'static PageServerConf, + tenants: &std::sync::RwLock, + tenant_shard_id: TenantShardId, + detach_ignored: bool, + deletion_queue_client: &DeletionQueueClient, + ) -> Result { + let tenant_dir_rename_operation = |tenant_id_to_clean: TenantShardId| async move { + let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean); + safe_rename_tenant_dir(&local_tenant_directory) + .await + .with_context(|| { + format!("local tenant directory {local_tenant_directory:?} rename") + }) + }; + + let removal_result = remove_tenant_from_memory( + tenants, + tenant_shard_id, + tenant_dir_rename_operation(tenant_shard_id), + ) + .await; + + // Flush pending deletions, so that they have a good chance of passing validation + // before this tenant is potentially re-attached elsewhere. + deletion_queue_client.flush_advisory(); + + // Ignored tenants are not present in memory and will bail the removal from memory operation. + // Before returning the error, check for ignored tenant removal case — we only need to clean its local files then. + if detach_ignored + && matches!( + removal_result, + Err(TenantStateError::SlotError(TenantSlotError::NotFound(_))) + ) + { + let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id); + if tenant_ignore_mark.exists() { + info!("Detaching an ignored tenant"); + let tmp_path = tenant_dir_rename_operation(tenant_shard_id) + .await + .with_context(|| { + format!("Ignored tenant {tenant_shard_id} local directory rename") + })?; + return Ok(tmp_path); + } + } + + removal_result + } + + pub(crate) async fn set_new_tenant_config( + &self, + new_tenant_conf: TenantConfOpt, + tenant_id: TenantId, + ) -> Result<(), SetNewTenantConfigError> { + // Legacy API: does not support sharding + let tenant_shard_id = TenantShardId::unsharded(tenant_id); + + info!("configuring tenant {tenant_id}"); + let tenant = get_tenant(tenant_shard_id, true)?; + + if !tenant.tenant_shard_id().shard_count.is_unsharded() { + // Note that we use ShardParameters::default below. + return Err(SetNewTenantConfigError::Other(anyhow::anyhow!( + "This API may only be used on single-sharded tenants, use the /location_config API for sharded tenants" + ))); + } + + // This is a legacy API that only operates on attached tenants: the preferred + // API to use is the location_config/ endpoint, which lets the caller provide + // the full LocationConf. 
+ let location_conf = LocationConf::attached_single( + new_tenant_conf.clone(), + tenant.generation, + &ShardParameters::default(), + ); + + Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &location_conf) + .await + .map_err(SetNewTenantConfigError::Persist)?; + tenant.set_new_tenant_config(new_tenant_conf); + Ok(()) + } } #[derive(Debug, thiserror::Error)] @@ -2028,87 +2110,6 @@ pub(crate) enum TenantStateError { Other(#[from] anyhow::Error), } -pub(crate) async fn detach_tenant( - conf: &'static PageServerConf, - tenant_shard_id: TenantShardId, - detach_ignored: bool, - deletion_queue_client: &DeletionQueueClient, -) -> Result<(), TenantStateError> { - let tmp_path = detach_tenant0( - conf, - &TENANTS, - tenant_shard_id, - detach_ignored, - deletion_queue_client, - ) - .await?; - // Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory. - // After a tenant is detached, there are no more task_mgr tasks for that tenant_id. - let task_tenant_id = None; - task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), - TaskKind::MgmtRequest, - task_tenant_id, - None, - "tenant_files_delete", - false, - async move { - fs::remove_dir_all(tmp_path.as_path()) - .await - .with_context(|| format!("tenant directory {:?} deletion", tmp_path)) - }, - ); - Ok(()) -} - -async fn detach_tenant0( - conf: &'static PageServerConf, - tenants: &std::sync::RwLock<TenantsMap>, - tenant_shard_id: TenantShardId, - detach_ignored: bool, - deletion_queue_client: &DeletionQueueClient, -) -> Result<Utf8PathBuf, TenantStateError> { - let tenant_dir_rename_operation = |tenant_id_to_clean: TenantShardId| async move { - let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean); - safe_rename_tenant_dir(&local_tenant_directory) - .await - .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename")) - }; - - let removal_result = remove_tenant_from_memory( - tenants, - tenant_shard_id, - tenant_dir_rename_operation(tenant_shard_id), - ) - .await; - - // Flush pending deletions, so that they have a good chance of passing validation - // before this tenant is potentially re-attached elsewhere. - deletion_queue_client.flush_advisory(); - - // Ignored tenants are not present in memory and will bail the removal from memory operation. - // Before returning the error, check for ignored tenant removal case — we only need to clean its local files then. - if detach_ignored - && matches!( - removal_result, - Err(TenantStateError::SlotError(TenantSlotError::NotFound(_))) - ) - { - let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id); - if tenant_ignore_mark.exists() { - info!("Detaching an ignored tenant"); - let tmp_path = tenant_dir_rename_operation(tenant_shard_id) - .await - .with_context(|| { - format!("Ignored tenant {tenant_shard_id} local directory rename") - })?; - return Ok(tmp_path); - } - } - - removal_result -} - pub(crate) async fn load_tenant( conf: &'static PageServerConf, tenant_id: TenantId, From db749914d852382bf3ee579105ecfb35b020f7e5 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 18 Mar 2024 13:29:20 +0100 Subject: [PATCH 0425/1571] fixup(#7141 / tokio_epoll_uring_ext): high frequency log message (#7160) The PR #7141 added log message ``` ThreadLocalState is being dropped and id might be re-used in the future ``` which was supposed to be emitted when the thread-local is destroyed. Instead, it was emitted on _each_ call to `thread_local_system()`, i.e., on each tokio-epoll-uring operation.
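For illustration only (a minimal, self-contained Rust sketch with made-up names, not the actual pageserver code): the log line belongs in `Drop` of the shared, reference-counted inner state, which runs once at thread-local teardown, whereas anything tied to the per-call handle runs on every operation:

```rust
use std::sync::Arc;

struct Inner(u64);

impl Drop for Inner {
    fn drop(&mut self) {
        // Runs once, when the last Arc clone goes away (thread-local teardown).
        println!("inner state {} dropped; id might be re-used in the future", self.0);
    }
}

#[derive(Clone)]
struct Handle(Arc<Inner>);
// If `Drop` were implemented on `Handle` instead, it would fire for every
// per-operation clone handed out by the accessor below.

thread_local! {
    static STATE: Handle = Handle(Arc::new(Inner(0)));
}

fn thread_local_system() -> Handle {
    STATE.with(|s| s.clone()) // called on every operation
}

fn main() {
    std::thread::spawn(|| {
        for _ in 0..3 {
            let _h = thread_local_system(); // no drop message here
        }
        // "inner state 0 dropped ..." is printed exactly once, at thread exit.
    })
    .join()
    .unwrap();
}
```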
Testing ------- Reproduced the issue locally and verified that this PR fixes the issue. --- .../io_engine/tokio_epoll_uring_ext.rs | 27 ++++++++++++++----- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs b/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs index c4b10f3a24..9b2efef5d4 100644 --- a/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs +++ b/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs @@ -6,7 +6,7 @@ //! See for more details. use std::sync::atomic::AtomicU32; -use std::sync::Arc; +use std::sync::{Arc, Weak}; use tokio_util::sync::CancellationToken; use tracing::{error, info, info_span, warn, Instrument}; @@ -24,23 +24,36 @@ struct ThreadLocalState(Arc); struct ThreadLocalStateInner { cell: tokio::sync::OnceCell, launch_attempts: AtomicU32, + weak_self: Weak, } impl ThreadLocalState { pub fn new() -> Self { - Self(Arc::new(ThreadLocalStateInner { + Self(Arc::new_cyclic(|weak| ThreadLocalStateInner { cell: tokio::sync::OnceCell::default(), launch_attempts: AtomicU32::new(0), + weak_self: Weak::clone(weak), })) } - pub fn make_id_string(&self) -> String { - format!("0x{:p}", Arc::as_ptr(&self.0)) - } } -impl Drop for ThreadLocalState { +impl ThreadLocalStateInner { + pub fn make_id_string(&self) -> String { + format!("0x{:p}", self.weak_self.as_ptr()) + } +} + +impl Drop for ThreadLocalStateInner { fn drop(&mut self) { - info!(parent: None, id=%self.make_id_string(), "tokio-epoll-uring_ext: ThreadLocalState is being dropped and id might be re-used in the future"); + info!(parent: None, id=%self.make_id_string(), "tokio_epoll_uring_ext: thread-local state is being dropped and id might be re-used in the future"); + } +} + +impl std::ops::Deref for ThreadLocalState { + type Target = ThreadLocalStateInner; + + fn deref(&self) -> &Self::Target { + &self.0 } } From 877fd144012d43b078772f90610b77eb80723c89 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 18 Mar 2024 16:27:53 +0200 Subject: [PATCH 0426/1571] fix: spanless log message (#7155) with `immediate_gc` the span only covered the `gc_iteration`, make it cover the whole needless spawned task, which also does waiting for layer drops and stray logging in tests. also clarify some comments while we are here. 
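As a minimal, self-contained sketch of the pattern (hypothetical example, not the pageserver task_mgr code; assumes the tokio, tracing, and tracing-subscriber crates): attaching the span to the entire spawned future keeps everything it awaits inside the span, not just the first call:

```rust
use tracing::{info, info_span, Instrument};

async fn gc_iteration() {
    info!("doing gc"); // only carries the span if the enclosing future is instrumented
}

#[tokio::main]
async fn main() {
    tracing_subscriber::fmt().init();
    let span = info_span!("manual_gc", timeline_id = 1);
    tokio::spawn(
        async move {
            gc_iteration().await;
            // With the span on the whole future, this follow-up waiting and
            // logging is covered too, so no spanless messages appear.
            info!("waiting for layer drops");
        }
        .instrument(span),
    )
    .await
    .unwrap();
}
```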
Fixes: #6910 --- pageserver/src/http/routes.rs | 3 +-- pageserver/src/tenant/mgr.rs | 12 +++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 97ffb99465..229f3ae98f 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1653,8 +1653,7 @@ async fn timeline_gc_handler( let gc_req: TimelineGcRequest = json_request(&mut request).await?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let wait_task_done = - mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx).await?; + let wait_task_done = mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx)?; let gc_result = wait_task_done .await .context("wait for gc task") diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index facaaa2ad7..f456ca3006 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -2730,7 +2730,7 @@ use { utils::http::error::ApiError, }; -pub(crate) async fn immediate_gc( +pub(crate) fn immediate_gc( tenant_shard_id: TenantShardId, timeline_id: TimelineId, gc_req: TimelineGcRequest, @@ -2752,6 +2752,8 @@ pub(crate) async fn immediate_gc( // Run in task_mgr to avoid race with tenant_detach operation let ctx = ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download); let (task_done, wait_task_done) = tokio::sync::oneshot::channel(); + let span = info_span!("manual_gc", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id); + // TODO: spawning is redundant now, need to hold the gate task_mgr::spawn( &tokio::runtime::Handle::current(), @@ -2766,16 +2768,15 @@ pub(crate) async fn immediate_gc( #[allow(unused_mut)] let mut result = tenant .gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx) - .instrument(info_span!("manual_gc", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id)) .await; // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it // better once the types support it. #[cfg(feature = "testing")] { + // we need to synchronize with drop completion for python tests without polling for + // log messages if let Ok(result) = result.as_mut() { - // why not futures unordered? it seems it needs very much the same task structure - // but would only run on single task. let mut js = tokio::task::JoinSet::new(); for layer in std::mem::take(&mut result.doomed_layers) { js.spawn(layer.wait_drop()); @@ -2791,7 +2792,7 @@ pub(crate) async fn immediate_gc( if let Some(rtc) = rtc { // layer drops schedule actions on remote timeline client to actually do the - // deletions; don't care just exit fast about the shutdown error + // deletions; don't care about the shutdown error, just exit fast drop(rtc.wait_completion().await); } } @@ -2802,6 +2803,7 @@ pub(crate) async fn immediate_gc( } Ok(()) } + .instrument(span) ); // drop the guard until after we've spawned the task so that timeline shutdown will wait for the task From 2bc2fd9cfd722b354a905df737cb92643aabfd13 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 18 Mar 2024 16:12:01 +0100 Subject: [PATCH 0427/1571] fixup(#7160 / tokio_epoll_uring_ext): double-panic caused by info! 
in thread-local's drop() (#7164) Manual testing of the changes in #7160 revealed that, if the thread-local destructor ever runs (it apparently doesn't in our test suite runs, otherwise #7160 would not have auto-merged), we can encounter an `abort()` due to a double-panic in the tracing code. This github comment here contains the stack trace: https://github.com/neondatabase/neon/pull/7160#issuecomment-2003778176 This PR reverts #7160 and uses an atomic counter to identify the thread-local in log messages, instead of the memory address of the thread local, which may be re-used. --- .../io_engine/tokio_epoll_uring_ext.rs | 29 +++++-------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs b/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs index 9b2efef5d4..6ea19d6b2d 100644 --- a/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs +++ b/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs @@ -5,8 +5,8 @@ //! on older kernels, such as some (but not all) older kernels in the Linux 5.10 series. //! See for more details. -use std::sync::atomic::AtomicU32; -use std::sync::{Arc, Weak}; +use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; +use std::sync::Arc; use tokio_util::sync::CancellationToken; use tracing::{error, info, info_span, warn, Instrument}; @@ -24,38 +24,25 @@ struct ThreadLocalState(Arc<ThreadLocalStateInner>); struct ThreadLocalStateInner { cell: tokio::sync::OnceCell, launch_attempts: AtomicU32, - weak_self: Weak<ThreadLocalStateInner>, + /// populated through fetch_add from [`THREAD_LOCAL_STATE_ID`] + thread_local_state_id: u64, } impl ThreadLocalState { pub fn new() -> Self { - Self(Arc::new_cyclic(|weak| ThreadLocalStateInner { + Self(Arc::new(ThreadLocalStateInner { cell: tokio::sync::OnceCell::default(), launch_attempts: AtomicU32::new(0), - weak_self: Weak::clone(weak), + thread_local_state_id: THREAD_LOCAL_STATE_ID.fetch_add(1, Ordering::Relaxed), })) } -} -impl ThreadLocalStateInner { pub fn make_id_string(&self) -> String { - format!("0x{:p}", self.weak_self.as_ptr()) + format!("{}", self.0.thread_local_state_id) } } -impl Drop for ThreadLocalStateInner { - fn drop(&mut self) { - info!(parent: None, id=%self.make_id_string(), "tokio_epoll_uring_ext: thread-local state is being dropped and id might be re-used in the future"); - } -} - -impl std::ops::Deref for ThreadLocalState { - type Target = ThreadLocalStateInner; - - fn deref(&self) -> &Self::Target { - &self.0 - } -} +static THREAD_LOCAL_STATE_ID: AtomicU64 = AtomicU64::new(0); thread_local! { static THREAD_LOCAL: ThreadLocalState = ThreadLocalState::new(); From ad5efb49ee9783f7c30ba38e60c40f6ab0761b89 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Mon, 18 Mar 2024 22:54:44 +0100 Subject: [PATCH 0428/1571] Support backpressure for sharding (#7100) Add shard_number to PageserverFeedback and parse it on the compute side. When compute receives a new ps_feedback, it calculates min LSNs among feedbacks from all shards, and uses those LSNs for backpressure. Add `test_sharding_backpressure` to verify that backpressure slows down compute to wait for the slowest shard.
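For illustration only (a simplified Rust sketch with made-up types; the real aggregation lives in the walproposer C code changed below): the compute keeps the latest feedback per shard and throttles against the minimum LSNs, i.e. the slowest shard:

```rust
#[derive(Clone, Copy)]
struct ShardFeedback {
    present: bool,
    last_received_lsn: u64,
    disk_consistent_lsn: u64,
    remote_consistent_lsn: u64,
}

/// Minimum of each LSN across all shards that have reported so far.
fn min_feedback(shards: &[ShardFeedback]) -> ShardFeedback {
    let mut min = ShardFeedback {
        present: true,
        last_received_lsn: u64::MAX,
        disk_consistent_lsn: u64::MAX,
        remote_consistent_lsn: u64::MAX,
    };
    for f in shards.iter().filter(|f| f.present) {
        min.last_received_lsn = min.last_received_lsn.min(f.last_received_lsn);
        min.disk_consistent_lsn = min.disk_consistent_lsn.min(f.disk_consistent_lsn);
        min.remote_consistent_lsn = min.remote_consistent_lsn.min(f.remote_consistent_lsn);
    }
    min
}

fn main() {
    let shards = [
        ShardFeedback { present: true, last_received_lsn: 100, disk_consistent_lsn: 90, remote_consistent_lsn: 80 },
        ShardFeedback { present: true, last_received_lsn: 40, disk_consistent_lsn: 30, remote_consistent_lsn: 20 }, // slow shard
    ];
    let m = min_feedback(&shards);
    // Backpressure is computed against the slowest shard: 40 / 30 / 20.
    println!("{} {} {}", m.last_received_lsn, m.disk_consistent_lsn, m.remote_consistent_lsn);
}
```

The sketch glosses over the unset case: the real code treats an LSN of 0 (`InvalidXLogRecPtr`) as "not reported yet" rather than starting from `u64::MAX`.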
--- libs/utils/src/pageserver_feedback.rs | 38 +++-- libs/walproposer/src/api_bindings.rs | 4 +- libs/walproposer/src/walproposer.rs | 2 +- .../walreceiver/walreceiver_connection.rs | 1 + pageserver/src/walingest.rs | 2 + pgxn/neon/walproposer.c | 120 +++++++-------- pgxn/neon/walproposer.h | 20 ++- pgxn/neon/walproposer_pg.c | 139 ++++++++++-------- .../tests/walproposer_sim/walproposer_api.rs | 26 +++- test_runner/fixtures/workload.py | 5 +- test_runner/regress/test_sharding.py | 128 ++++++++++++++++ 11 files changed, 336 insertions(+), 149 deletions(-) diff --git a/libs/utils/src/pageserver_feedback.rs b/libs/utils/src/pageserver_feedback.rs index bc8fa7362e..3ddfa44f41 100644 --- a/libs/utils/src/pageserver_feedback.rs +++ b/libs/utils/src/pageserver_feedback.rs @@ -29,12 +29,10 @@ pub struct PageserverFeedback { // Serialize with RFC3339 format. #[serde(with = "serde_systemtime")] pub replytime: SystemTime, + /// Used to track feedbacks from different shards. Always zero for unsharded tenants. + pub shard_number: u32, } -// NOTE: Do not forget to increment this number when adding new fields to PageserverFeedback. -// Do not remove previously available fields because this might be backwards incompatible. -pub const PAGESERVER_FEEDBACK_FIELDS_NUMBER: u8 = 5; - impl PageserverFeedback { pub fn empty() -> PageserverFeedback { PageserverFeedback { @@ -43,6 +41,7 @@ impl PageserverFeedback { remote_consistent_lsn: Lsn::INVALID, disk_consistent_lsn: Lsn::INVALID, replytime: *PG_EPOCH, + shard_number: 0, } } @@ -59,17 +58,26 @@ impl PageserverFeedback { // // TODO: change serialized fields names once all computes migrate to rename. pub fn serialize(&self, buf: &mut BytesMut) { - buf.put_u8(PAGESERVER_FEEDBACK_FIELDS_NUMBER); // # of keys + let buf_ptr = buf.len(); + buf.put_u8(0); // # of keys, will be filled later + let mut nkeys = 0; + + nkeys += 1; buf.put_slice(b"current_timeline_size\0"); buf.put_i32(8); buf.put_u64(self.current_timeline_size); + nkeys += 1; buf.put_slice(b"ps_writelsn\0"); buf.put_i32(8); buf.put_u64(self.last_received_lsn.0); + + nkeys += 1; buf.put_slice(b"ps_flushlsn\0"); buf.put_i32(8); buf.put_u64(self.disk_consistent_lsn.0); + + nkeys += 1; buf.put_slice(b"ps_applylsn\0"); buf.put_i32(8); buf.put_u64(self.remote_consistent_lsn.0); @@ -80,9 +88,19 @@ impl PageserverFeedback { .expect("failed to serialize pg_replytime earlier than PG_EPOCH") .as_micros() as i64; + nkeys += 1; buf.put_slice(b"ps_replytime\0"); buf.put_i32(8); buf.put_i64(timestamp); + + if self.shard_number > 0 { + nkeys += 1; + buf.put_slice(b"shard_number\0"); + buf.put_i32(4); + buf.put_u32(self.shard_number); + } + + buf[buf_ptr] = nkeys; } // Deserialize PageserverFeedback message @@ -125,9 +143,8 @@ impl PageserverFeedback { } b"shard_number" => { let len = buf.get_i32(); - // TODO: this will be implemented in the next update, - // for now, we just skip the value. 
- buf.advance(len as usize); + assert_eq!(len, 4); + rf.shard_number = buf.get_u32(); } _ => { let len = buf.get_i32(); @@ -200,10 +217,7 @@ mod tests { rf.serialize(&mut data); // Add an extra field to the buffer and adjust number of keys - if let Some(first) = data.first_mut() { - *first = PAGESERVER_FEEDBACK_FIELDS_NUMBER + 1; - } - + data[0] += 1; data.put_slice(b"new_field_one\0"); data.put_i32(8); data.put_u64(42); diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs index f5ed6ebb97..906302e46e 100644 --- a/libs/walproposer/src/api_bindings.rs +++ b/libs/walproposer/src/api_bindings.rs @@ -324,11 +324,11 @@ extern "C" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) { } } -extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer) { +extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, sk: *mut Safekeeper) { unsafe { let callback_data = (*(*wp).config).callback_data; let api = callback_data as *mut Box; - (*api).process_safekeeper_feedback(&mut (*wp)) + (*api).process_safekeeper_feedback(&mut (*wp), &mut (*sk)); } } diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs index 734967da3f..14cc3e05a2 100644 --- a/libs/walproposer/src/walproposer.rs +++ b/libs/walproposer/src/walproposer.rs @@ -142,7 +142,7 @@ pub trait ApiImpl { todo!() } - fn process_safekeeper_feedback(&mut self, _wp: &mut WalProposer) { + fn process_safekeeper_feedback(&mut self, _wp: &mut WalProposer, _sk: &mut Safekeeper) { todo!() } diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 8297ca6563..d9f780cfd1 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -448,6 +448,7 @@ pub(super) async fn handle_walreceiver_connection( disk_consistent_lsn, remote_consistent_lsn, replytime: ts, + shard_number: timeline.tenant_shard_id.shard_number.0 as u32, }; debug!("neon_status_update {status_update:?}"); diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 63a2b30d09..9c7e8748d5 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -109,6 +109,8 @@ impl WalIngest { self.checkpoint_modified = true; } + failpoint_support::sleep_millis_async!("wal-ingest-record-sleep"); + match decoded.xl_rmid { pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => { // Heap AM records need some special handling, because they modify VM pages diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 9ff0493352..d7987954d4 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -70,7 +70,7 @@ static bool SendAppendRequests(Safekeeper *sk); static bool RecvAppendResponses(Safekeeper *sk); static XLogRecPtr CalculateMinFlushLsn(WalProposer *wp); static XLogRecPtr GetAcknowledgedByQuorumWALPosition(WalProposer *wp); -static void HandleSafekeeperResponse(WalProposer *wp); +static void HandleSafekeeperResponse(WalProposer *wp, Safekeeper *sk); static bool AsyncRead(Safekeeper *sk, char **buf, int *buf_size); static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg); static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state); @@ -1405,7 +1405,6 @@ static bool RecvAppendResponses(Safekeeper *sk) { WalProposer *wp = sk->wp; - XLogRecPtr newCommitLsn; bool readAnything = false; while (true) @@ -1425,6 
+1424,8 @@ RecvAppendResponses(Safekeeper *sk) LSN_FORMAT_ARGS(sk->appendResponse.commitLsn), sk->host, sk->port); + readAnything = true; + if (sk->appendResponse.term > wp->propTerm) { /* @@ -1438,35 +1439,28 @@ RecvAppendResponses(Safekeeper *sk) sk->appendResponse.term, wp->propTerm); } - readAnything = true; + HandleSafekeeperResponse(wp, sk); } if (!readAnything) return sk->state == SS_ACTIVE; - /* update commit_lsn */ - newCommitLsn = GetAcknowledgedByQuorumWALPosition(wp); - /* - * Send the new value to all safekeepers. - */ - if (newCommitLsn > wp->commitLsn) - { - wp->commitLsn = newCommitLsn; - BroadcastAppendRequest(wp); - } - - HandleSafekeeperResponse(wp); - return sk->state == SS_ACTIVE; } +#define psfeedback_log(fmt, key, ...) \ + wp_log(DEBUG2, "ParsePageserverFeedbackMessage: %s " fmt, key, __VA_ARGS__) + /* Parse a PageserverFeedback message, or the PageserverFeedback part of an AppendResponse */ static void -ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, PageserverFeedback *rf) +ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, PageserverFeedback *ps_feedback) { uint8 nkeys; int i; - int32 len; + + /* initialize the struct before parsing */ + memset(ps_feedback, 0, sizeof(PageserverFeedback)); + ps_feedback->present = true; /* get number of custom keys */ nkeys = pq_getmsgbyte(reply_message); @@ -1474,66 +1468,52 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese for (i = 0; i < nkeys; i++) { const char *key = pq_getmsgstring(reply_message); + unsigned int value_len = pq_getmsgint(reply_message, sizeof(int32)); if (strcmp(key, "current_timeline_size") == 0) { - pq_getmsgint(reply_message, sizeof(int32)); - /* read value length */ - rf->currentClusterSize = pq_getmsgint64(reply_message); - wp_log(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu", - rf->currentClusterSize); + Assert(value_len == sizeof(int64)); + ps_feedback->currentClusterSize = pq_getmsgint64(reply_message); + psfeedback_log(UINT64_FORMAT, key, ps_feedback->currentClusterSize); } else if ((strcmp(key, "ps_writelsn") == 0) || (strcmp(key, "last_received_lsn") == 0)) { - pq_getmsgint(reply_message, sizeof(int32)); - /* read value length */ - rf->last_received_lsn = pq_getmsgint64(reply_message); - wp_log(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X", - LSN_FORMAT_ARGS(rf->last_received_lsn)); + Assert(value_len == sizeof(int64)); + ps_feedback->last_received_lsn = pq_getmsgint64(reply_message); + psfeedback_log("%X/%X", key, LSN_FORMAT_ARGS(ps_feedback->last_received_lsn)); } else if ((strcmp(key, "ps_flushlsn") == 0) || (strcmp(key, "disk_consistent_lsn") == 0)) { - pq_getmsgint(reply_message, sizeof(int32)); - /* read value length */ - rf->disk_consistent_lsn = pq_getmsgint64(reply_message); - wp_log(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X", - LSN_FORMAT_ARGS(rf->disk_consistent_lsn)); + Assert(value_len == sizeof(int64)); + ps_feedback->disk_consistent_lsn = pq_getmsgint64(reply_message); + psfeedback_log("%X/%X", key, LSN_FORMAT_ARGS(ps_feedback->disk_consistent_lsn)); } else if ((strcmp(key, "ps_applylsn") == 0) || (strcmp(key, "remote_consistent_lsn") == 0)) { - pq_getmsgint(reply_message, sizeof(int32)); - /* read value length */ - rf->remote_consistent_lsn = pq_getmsgint64(reply_message); - wp_log(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X", - LSN_FORMAT_ARGS(rf->remote_consistent_lsn)); + Assert(value_len == sizeof(int64)); + 
ps_feedback->remote_consistent_lsn = pq_getmsgint64(reply_message); + psfeedback_log("%X/%X", key, LSN_FORMAT_ARGS(ps_feedback->remote_consistent_lsn)); } else if ((strcmp(key, "ps_replytime") == 0) || (strcmp(key, "replytime") == 0)) { - pq_getmsgint(reply_message, sizeof(int32)); - /* read value length */ - rf->replytime = pq_getmsgint64(reply_message); - { - char *replyTimeStr; - - /* Copy because timestamptz_to_str returns a static buffer */ - replyTimeStr = pstrdup(timestamptz_to_str(rf->replytime)); - wp_log(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s", - rf->replytime, replyTimeStr); - - pfree(replyTimeStr); - } + Assert(value_len == sizeof(int64)); + ps_feedback->replytime = pq_getmsgint64(reply_message); + psfeedback_log("%s", key, timestamptz_to_str(ps_feedback->replytime)); + } + else if (strcmp(key, "shard_number") == 0) + { + Assert(value_len == sizeof(uint32)); + ps_feedback->shard_number = pq_getmsgint(reply_message, sizeof(uint32)); + psfeedback_log("%u", key, ps_feedback->shard_number); } else { - len = pq_getmsgint(reply_message, sizeof(int32)); - /* read value length */ - /* * Skip unknown keys to support backward compatibile protocol * changes */ - wp_log(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, len); - pq_getmsgbytes(reply_message, len); + wp_log(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, value_len); + pq_getmsgbytes(reply_message, value_len); }; } } @@ -1630,12 +1610,30 @@ GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn) return donor; } +/* + * Process AppendResponse message from safekeeper. + */ static void -HandleSafekeeperResponse(WalProposer *wp) +HandleSafekeeperResponse(WalProposer *wp, Safekeeper *sk) { XLogRecPtr candidateTruncateLsn; + XLogRecPtr newCommitLsn; - wp->api.process_safekeeper_feedback(wp); + newCommitLsn = GetAcknowledgedByQuorumWALPosition(wp); + if (newCommitLsn > wp->commitLsn) + { + wp->commitLsn = newCommitLsn; + /* Send new value to all safekeepers. */ + BroadcastAppendRequest(wp); + } + + /* + * Unlock syncrep waiters, update ps_feedback, CheckGracefulShutdown(). + * The last one will terminate the process if the shutdown is requested + * and WAL is committed by the quorum. BroadcastAppendRequest() should be + * called to notify safekeepers about the new commitLsn. 
+ */ + wp->api.process_safekeeper_feedback(wp, sk); /* * Try to advance truncateLsn -- the last record flushed to all @@ -1811,8 +1809,10 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) msg->hs.ts = pq_getmsgint64_le(&s); msg->hs.xmin.value = pq_getmsgint64_le(&s); msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s); - if (buf_size > APPENDRESPONSE_FIXEDPART_SIZE) - ParsePageserverFeedbackMessage(wp, &s, &msg->rf); + if (s.len > s.cursor) + ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback); + else + msg->ps_feedback.present = false; pq_getmsgend(&s); return true; } diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index bc674fd979..28585eb4e7 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -10,6 +10,7 @@ #include "libpqwalproposer.h" #include "neon_walreader.h" +#include "pagestore_client.h" #define SK_MAGIC 0xCafeCeefu #define SK_PROTOCOL_VERSION 2 @@ -269,6 +270,8 @@ typedef struct HotStandbyFeedback typedef struct PageserverFeedback { + /* true if AppendResponse contains this feedback */ + bool present; /* current size of the timeline on pageserver */ uint64 currentClusterSize; /* standby_status_update fields that safekeeper received from pageserver */ @@ -276,14 +279,21 @@ typedef struct PageserverFeedback XLogRecPtr disk_consistent_lsn; XLogRecPtr remote_consistent_lsn; TimestampTz replytime; + uint32 shard_number; } PageserverFeedback; typedef struct WalproposerShmemState { slock_t mutex; - PageserverFeedback feedback; term_t mineLastElectedTerm; pg_atomic_uint64 backpressureThrottlingTime; + + /* last feedback from each shard */ + PageserverFeedback shard_ps_feedback[MAX_SHARDS]; + int num_shards; + + /* aggregated feedback with min LSNs across shards */ + PageserverFeedback min_ps_feedback; } WalproposerShmemState; /* @@ -307,12 +317,12 @@ typedef struct AppendResponse /* Feedback received from pageserver includes standby_status_update fields */ /* and custom neon feedback. */ /* This part of the message is extensible. */ - PageserverFeedback rf; + PageserverFeedback ps_feedback; } AppendResponse; /* PageserverFeedback is extensible part of the message that is parsed separately */ /* Other fields are fixed part */ -#define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, rf) +#define APPENDRESPONSE_FIXEDPART_SIZE 56 struct WalProposer; typedef struct WalProposer WalProposer; @@ -560,11 +570,11 @@ typedef struct walproposer_api void (*finish_sync_safekeepers) (WalProposer *wp, XLogRecPtr lsn); /* - * Called after every new message from the safekeeper. Used to propagate + * Called after every AppendResponse from the safekeeper. Used to propagate * backpressure feedback and to confirm WAL persistence (has been commited * on the quorum of safekeepers). */ - void (*process_safekeeper_feedback) (WalProposer *wp); + void (*process_safekeeper_feedback) (WalProposer *wp, Safekeeper *sk); /* * Write a log message to the internal log processor. 
This is used only diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 8eec2f02c1..c46fd9b3ec 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -63,7 +63,6 @@ char *wal_acceptors_list = ""; int wal_acceptor_reconnect_timeout = 1000; int wal_acceptor_connection_timeout = 10000; -static AppendResponse quorumFeedback; static WalproposerShmemState *walprop_shared; static WalProposerConfig walprop_config; static XLogRecPtr sentPtr = InvalidXLogRecPtr; @@ -71,6 +70,10 @@ static const walproposer_api walprop_pg; static volatile sig_atomic_t got_SIGUSR2 = false; static bool reported_sigusr2 = false; +static XLogRecPtr standby_flush_lsn = InvalidXLogRecPtr; +static XLogRecPtr standby_apply_lsn = InvalidXLogRecPtr; +static HotStandbyFeedback agg_hs_feedback; + static void nwp_shmem_startup_hook(void); static void nwp_register_gucs(void); static void nwp_prepare_shmem(void); @@ -402,21 +405,58 @@ walprop_pg_get_shmem_state(WalProposer *wp) return walprop_shared; } -static void -replication_feedback_set(PageserverFeedback *rf) +/* + * Record new ps_feedback in the array with shards and update min_feedback. + */ +static PageserverFeedback +record_pageserver_feedback(PageserverFeedback *ps_feedback) { + PageserverFeedback min_feedback; + + Assert(ps_feedback->present); + Assert(ps_feedback->shard_number < MAX_SHARDS); + SpinLockAcquire(&walprop_shared->mutex); - memcpy(&walprop_shared->feedback, rf, sizeof(PageserverFeedback)); + + /* Update the number of shards */ + if (ps_feedback->shard_number + 1 > walprop_shared->num_shards) + walprop_shared->num_shards = ps_feedback->shard_number + 1; + + /* Update the feedback */ + memcpy(&walprop_shared->shard_ps_feedback[ps_feedback->shard_number], ps_feedback, sizeof(PageserverFeedback)); + + /* Calculate min LSNs */ + memcpy(&min_feedback, ps_feedback, sizeof(PageserverFeedback)); + for (int i = 0; i < walprop_shared->num_shards; i++) + { + PageserverFeedback *feedback = &walprop_shared->shard_ps_feedback[i]; + if (feedback->present) + { + if (min_feedback.last_received_lsn == InvalidXLogRecPtr || feedback->last_received_lsn < min_feedback.last_received_lsn) + min_feedback.last_received_lsn = feedback->last_received_lsn; + + if (min_feedback.disk_consistent_lsn == InvalidXLogRecPtr || feedback->disk_consistent_lsn < min_feedback.disk_consistent_lsn) + min_feedback.disk_consistent_lsn = feedback->disk_consistent_lsn; + + if (min_feedback.remote_consistent_lsn == InvalidXLogRecPtr || feedback->remote_consistent_lsn < min_feedback.remote_consistent_lsn) + min_feedback.remote_consistent_lsn = feedback->remote_consistent_lsn; + } + } + /* Copy min_feedback back to shmem */ + memcpy(&walprop_shared->min_ps_feedback, &min_feedback, sizeof(PageserverFeedback)); + SpinLockRelease(&walprop_shared->mutex); + + return min_feedback; } void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn) { SpinLockAcquire(&walprop_shared->mutex); - *writeLsn = walprop_shared->feedback.last_received_lsn; - *flushLsn = walprop_shared->feedback.disk_consistent_lsn; - *applyLsn = walprop_shared->feedback.remote_consistent_lsn; + *writeLsn = walprop_shared->min_ps_feedback.last_received_lsn; + *flushLsn = walprop_shared->min_ps_feedback.disk_consistent_lsn; + *applyLsn = walprop_shared->min_ps_feedback.remote_consistent_lsn; SpinLockRelease(&walprop_shared->mutex); } @@ -1869,39 +1909,6 @@ CheckGracefulShutdown(WalProposer *wp) } } -/* - * Choose most advanced PageserverFeedback and set it to *rf. 
- */ -static void -GetLatestNeonFeedback(PageserverFeedback *rf, WalProposer *wp) -{ - int latest_safekeeper = 0; - XLogRecPtr last_received_lsn = InvalidXLogRecPtr; - - for (int i = 0; i < wp->n_safekeepers; i++) - { - if (wp->safekeeper[i].appendResponse.rf.last_received_lsn > last_received_lsn) - { - latest_safekeeper = i; - last_received_lsn = wp->safekeeper[i].appendResponse.rf.last_received_lsn; - } - } - - rf->currentClusterSize = wp->safekeeper[latest_safekeeper].appendResponse.rf.currentClusterSize; - rf->last_received_lsn = wp->safekeeper[latest_safekeeper].appendResponse.rf.last_received_lsn; - rf->disk_consistent_lsn = wp->safekeeper[latest_safekeeper].appendResponse.rf.disk_consistent_lsn; - rf->remote_consistent_lsn = wp->safekeeper[latest_safekeeper].appendResponse.rf.remote_consistent_lsn; - rf->replytime = wp->safekeeper[latest_safekeeper].appendResponse.rf.replytime; - - wpg_log(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu," - " last_received_lsn %X/%X, disk_consistent_lsn %X/%X, remote_consistent_lsn %X/%X, replytime %lu", - rf->currentClusterSize, - LSN_FORMAT_ARGS(rf->last_received_lsn), - LSN_FORMAT_ARGS(rf->disk_consistent_lsn), - LSN_FORMAT_ARGS(rf->remote_consistent_lsn), - rf->replytime); -} - /* * Combine hot standby feedbacks from all safekeepers. */ @@ -1949,26 +1956,38 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp) * None of that is functional in sync-safekeepers. */ static void -walprop_pg_process_safekeeper_feedback(WalProposer *wp) +walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk) { - HotStandbyFeedback hsFeedback; - XLogRecPtr oldDiskConsistentLsn; + HotStandbyFeedback hsFeedback; + bool needToAdvanceSlot = false; if (wp->config->syncSafekeepers) return; - oldDiskConsistentLsn = quorumFeedback.rf.disk_consistent_lsn; - - /* Get PageserverFeedback fields from the most advanced safekeeper */ - GetLatestNeonFeedback(&quorumFeedback.rf, wp); - replication_feedback_set(&quorumFeedback.rf); - SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize); - - if (wp->commitLsn > quorumFeedback.flushLsn || oldDiskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn) + /* handle fresh ps_feedback */ + if (sk->appendResponse.ps_feedback.present) { - if (wp->commitLsn > quorumFeedback.flushLsn) - quorumFeedback.flushLsn = wp->commitLsn; + PageserverFeedback min_feedback = record_pageserver_feedback(&sk->appendResponse.ps_feedback); + /* Only one main shard sends non-zero currentClusterSize */ + if (sk->appendResponse.ps_feedback.currentClusterSize > 0) + SetZenithCurrentClusterSize(sk->appendResponse.ps_feedback.currentClusterSize); + + if (min_feedback.disk_consistent_lsn != standby_apply_lsn) + { + standby_apply_lsn = min_feedback.disk_consistent_lsn; + needToAdvanceSlot = true; + } + } + + if (wp->commitLsn > standby_flush_lsn) + { + standby_flush_lsn = wp->commitLsn; + needToAdvanceSlot = true; + } + + if (needToAdvanceSlot) + { /* * Advance the replication slot to commitLsn. WAL before it is * hardened and will be fetched from one of safekeepers by @@ -1977,23 +1996,23 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp) * Also wakes up syncrep waiters. */ ProcessStandbyReply( - /* write_lsn - This is what durably stored in WAL service. */ - quorumFeedback.flushLsn, - /* flush_lsn - This is what durably stored in WAL service. */ - quorumFeedback.flushLsn, + /* write_lsn - This is what durably stored in safekeepers quorum. 
*/ + standby_flush_lsn, + /* flush_lsn - This is what durably stored in safekeepers quorum. */ + standby_flush_lsn, /* * apply_lsn - This is what processed and durably saved at* * pageserver. */ - quorumFeedback.rf.disk_consistent_lsn, + standby_apply_lsn, walprop_pg_get_current_timestamp(wp), false); } CombineHotStanbyFeedbacks(&hsFeedback, wp); - if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &quorumFeedback.hs, sizeof hsFeedback) != 0) + if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &agg_hs_feedback, sizeof hsFeedback) != 0) { - quorumFeedback.hs = hsFeedback; + agg_hs_feedback = hsFeedback; ProcessStandbyHSFeedback(hsFeedback.ts, XidFromFullTransactionId(hsFeedback.xmin), EpochFromFullTransactionId(hsFeedback.xmin), diff --git a/safekeeper/tests/walproposer_sim/walproposer_api.rs b/safekeeper/tests/walproposer_sim/walproposer_api.rs index 5c79e9082b..42340ba1df 100644 --- a/safekeeper/tests/walproposer_sim/walproposer_api.rs +++ b/safekeeper/tests/walproposer_sim/walproposer_api.rs @@ -224,6 +224,16 @@ impl SimulationApi { }) .collect::>(); + let empty_feedback = PageserverFeedback { + present: false, + currentClusterSize: 0, + last_received_lsn: 0, + disk_consistent_lsn: 0, + remote_consistent_lsn: 0, + replytime: 0, + shard_number: 0, + }; + Self { os: args.os, safekeepers: RefCell::new(sk_conns), @@ -232,15 +242,11 @@ impl SimulationApi { last_logged_commit_lsn: 0, shmem: UnsafeCell::new(walproposer::bindings::WalproposerShmemState { mutex: 0, - feedback: PageserverFeedback { - currentClusterSize: 0, - last_received_lsn: 0, - disk_consistent_lsn: 0, - remote_consistent_lsn: 0, - replytime: 0, - }, mineLastElectedTerm: 0, backpressureThrottlingTime: pg_atomic_uint64 { value: 0 }, + shard_ps_feedback: [empty_feedback; 128], + num_shards: 0, + min_ps_feedback: empty_feedback, }), config: args.config, event_set: RefCell::new(None), @@ -598,7 +604,11 @@ impl ApiImpl for SimulationApi { } } - fn process_safekeeper_feedback(&mut self, wp: &mut walproposer::bindings::WalProposer) { + fn process_safekeeper_feedback( + &mut self, + wp: &mut walproposer::bindings::WalProposer, + _sk: &mut walproposer::bindings::Safekeeper, + ) { debug!("process_safekeeper_feedback, commit_lsn={}", wp.commitLsn); if wp.commitLsn > self.last_logged_commit_lsn { self.os.log_event(format!("commit_lsn;{}", wp.commitLsn)); diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py index e852281fcf..ab8717de54 100644 --- a/test_runner/fixtures/workload.py +++ b/test_runner/fixtures/workload.py @@ -1,5 +1,5 @@ import threading -from typing import Optional +from typing import Any, Optional from fixtures.log_helper import log from fixtures.neon_fixtures import ( @@ -32,6 +32,7 @@ class Workload: tenant_id: TenantId, timeline_id: TimelineId, branch_name: Optional[str] = None, + endpoint_opts: Optional[dict[str, Any]] = None, ): self.env = env self.tenant_id = tenant_id @@ -45,6 +46,7 @@ class Workload: self.churn_cursor = 0 self._endpoint: Optional[Endpoint] = None + self._endpoint_opts = endpoint_opts or {} def reconfigure(self): """ @@ -66,6 +68,7 @@ class Workload: tenant_id=self.tenant_id, pageserver_id=pageserver_id, endpoint_id=endpoint_id, + **self._endpoint_opts, ) self._endpoint.start(pageserver_id=pageserver_id) else: diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index e8511e428e..9e62933f7e 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -1,4 +1,5 @@ import os +import time from typing import Dict, 
List, Optional, Union import pytest @@ -837,3 +838,130 @@ def test_sharding_split_failures( assert_split_done() env.storage_controller.consistency_check() + + +def test_sharding_backpressure(neon_env_builder: NeonEnvBuilder): + """ + Check a scenario when one of the shards is much slower than others. + Without backpressure, this would lead to the slow shard falling behind + and eventually causing WAL timeouts. + """ + + shard_count = 4 + neon_env_builder.num_pageservers = shard_count + + # 256KiB stripes: enable getting some meaningful data distribution without + # writing large quantities of data in this test. The stripe size is given + # in number of 8KiB pages. + stripe_size = 32 + + env = neon_env_builder.init_start( + initial_tenant_shard_count=shard_count, initial_tenant_shard_stripe_size=stripe_size + ) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + pageservers = dict((int(p.id), p) for p in env.pageservers) + shards = env.storage_controller.locate(tenant_id) + + # Slow down one of the shards, around ~1MB/s + pageservers[4].http_client().configure_failpoints(("wal-ingest-record-sleep", "5%sleep(1)")) + + def shards_info(): + infos = [] + for shard in shards: + node_id = int(shard["node_id"]) + pageserver = pageservers[node_id] + shard_info = pageserver.http_client().timeline_detail(shard["shard_id"], timeline_id) + infos.append(shard_info) + last_record_lsn = shard_info["last_record_lsn"] + current_physical_size = shard_info["current_physical_size"] + log.info( + f"Shard on pageserver {node_id}: lsn={last_record_lsn}, size={current_physical_size}" + ) + return infos + + shards_info() + + workload = Workload( + env, + tenant_id, + timeline_id, + branch_name="main", + endpoint_opts={ + "config_lines": [ + # Tip: set to 100MB to make the test fail + "max_replication_write_lag=1MB", + ], + }, + ) + workload.init() + + endpoint = workload.endpoint() + + # on 2024-03-05, the default config on prod was [15MB, 10GB, null] + res = endpoint.safe_psql_many( + [ + "SHOW max_replication_write_lag", + "SHOW max_replication_flush_lag", + "SHOW max_replication_apply_lag", + ] + ) + log.info(f"backpressure config: {res}") + + last_flush_lsn = None + last_timestamp = None + + def update_write_lsn(): + nonlocal last_flush_lsn + nonlocal last_timestamp + + res = endpoint.safe_psql( + """ + SELECT + pg_wal_lsn_diff(pg_current_wal_flush_lsn(), received_lsn) as received_lsn_lag, + received_lsn, + pg_current_wal_flush_lsn() as flush_lsn, + neon.backpressure_throttling_time() as throttling_time + FROM neon.backpressure_lsns(); + """, + dbname="postgres", + )[0] + log.info( + f"received_lsn_lag = {res[0]}, received_lsn = {res[1]}, flush_lsn = {res[2]}, throttling_time = {res[3]}" + ) + + lsn = Lsn(res[2]) + now = time.time() + + if last_timestamp is not None: + delta = now - last_timestamp + delta_bytes = lsn - last_flush_lsn + avg_speed = delta_bytes / delta / 1024 / 1024 + log.info( + f"flush_lsn {lsn}, written {delta_bytes/1024}kb for {delta:.3f}s, avg_speed {avg_speed:.3f} MiB/s" + ) + + last_flush_lsn = lsn + last_timestamp = now + + update_write_lsn() + + workload.write_rows(4096, upload=False) + workload.write_rows(4096, upload=False) + workload.write_rows(4096, upload=False) + workload.write_rows(4096, upload=False) + workload.validate() + + update_write_lsn() + shards_info() + + for _write_iter in range(30): + # approximately 1MB of data + workload.write_rows(8000, upload=False) + update_write_lsn() + infos = shards_info() + min_lsn = min(Lsn(info["last_record_lsn"]) for 
info in infos) + max_lsn = max(Lsn(info["last_record_lsn"]) for info in infos) + diff = max_lsn - min_lsn + assert diff < 2 * 1024 * 1024, f"LSN diff={diff}, expected diff < 2MB due to backpressure" From 49be446d95482c31febacf6b87b11aa47afb2bc2 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 18 Mar 2024 22:57:32 +0000 Subject: [PATCH 0429/1571] async password validation (#7171) ## Problem password hashing can block main thread ## Summary of changes spawn_blocking the password hash call --- proxy/src/auth/backend.rs | 2 +- proxy/src/auth/flow.rs | 7 ++++--- proxy/src/scram.rs | 15 ++++++++------- proxy/src/scram/exchange.rs | 11 +++++++++-- proxy/src/serverless/backend.rs | 2 +- 5 files changed, 23 insertions(+), 14 deletions(-) diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 11af85caa4..bc307230dd 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -254,7 +254,7 @@ async fn authenticate_with_secret( config: &'static AuthenticationConfig, ) -> auth::Result { if let Some(password) = unauthenticated_password { - let auth_outcome = validate_password_and_exchange(&password, secret)?; + let auth_outcome = validate_password_and_exchange(&password, secret).await?; let keys = match auth_outcome { crate::sasl::Outcome::Success(key) => key, crate::sasl::Outcome::Failure(reason) => { diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 788381b6c0..f26dcb7c9a 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -126,7 +126,7 @@ impl AuthFlow<'_, S, CleartextPassword> { .strip_suffix(&[0]) .ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?; - let outcome = validate_password_and_exchange(password, self.state.0)?; + let outcome = validate_password_and_exchange(password, self.state.0).await?; if let sasl::Outcome::Success(_) = &outcome { self.stream.write_message_noflush(&Be::AuthenticationOk)?; @@ -180,7 +180,7 @@ impl AuthFlow<'_, S, Scram<'_>> { } } -pub(crate) fn validate_password_and_exchange( +pub(crate) async fn validate_password_and_exchange( password: &[u8], secret: AuthSecret, ) -> super::Result> { @@ -200,7 +200,8 @@ pub(crate) fn validate_password_and_exchange( &scram_secret, sasl_client, crate::config::TlsServerEndPoint::Undefined, - )?; + ) + .await?; let client_key = match outcome { sasl::Outcome::Success(client_key) => client_key, diff --git a/proxy/src/scram.rs b/proxy/src/scram.rs index a95e734d06..df4b3ec8d7 100644 --- a/proxy/src/scram.rs +++ b/proxy/src/scram.rs @@ -113,7 +113,7 @@ mod tests { ); } - fn run_round_trip_test(server_password: &str, client_password: &str) { + async fn run_round_trip_test(server_password: &str, client_password: &str) { let scram_secret = ServerSecret::build(server_password).unwrap(); let sasl_client = ScramSha256::new(client_password.as_bytes(), ChannelBinding::unsupported()); @@ -123,6 +123,7 @@ mod tests { sasl_client, crate::config::TlsServerEndPoint::Undefined, ) + .await .unwrap(); match outcome { @@ -131,14 +132,14 @@ mod tests { } } - #[test] - fn round_trip() { - run_round_trip_test("pencil", "pencil") + #[tokio::test] + async fn round_trip() { + run_round_trip_test("pencil", "pencil").await } - #[test] + #[tokio::test] #[should_panic(expected = "password doesn't match")] - fn failure() { - run_round_trip_test("pencil", "eraser") + async fn failure() { + run_round_trip_test("pencil", "eraser").await } } diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index 9af7db5201..16575d5d98 100644 --- a/proxy/src/scram/exchange.rs +++ 
b/proxy/src/scram/exchange.rs @@ -71,7 +71,7 @@ impl<'a> Exchange<'a> { } } -pub fn exchange( +pub async fn exchange( secret: &ServerSecret, mut client: ScramSha256, tls_server_end_point: config::TlsServerEndPoint, @@ -86,7 +86,14 @@ pub fn exchange( .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?; let sent = match init.transition(secret, &tls_server_end_point, client_first)? { Continue(sent, server_first) => { - client.update(server_first.as_bytes())?; + // `client.update` might perform `pbkdf2(pw)`, best to spawn it in a blocking thread. + // TODO(conrad): take this code from tokio-postgres and make an async-aware pbkdf2 impl + client = tokio::task::spawn_blocking(move || { + client.update(server_first.as_bytes())?; + Ok::(client) + }) + .await + .expect("should not panic while performing password hash")?; sent } Success(x, _) => match x {}, diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 29ef641265..72b55c45f0 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -50,7 +50,7 @@ impl PoolingBackend { } }; let auth_outcome = - crate::auth::validate_password_and_exchange(&conn_info.password, secret)?; + crate::auth::validate_password_and_exchange(&conn_info.password, secret).await?; let res = match auth_outcome { crate::sasl::Outcome::Success(key) => Ok(key), crate::sasl::Outcome::Failure(reason) => { From b80704cd34baae1746a98c43db8dcb672e08dcf5 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 19 Mar 2024 10:30:33 +0000 Subject: [PATCH 0430/1571] tests: log hygiene checks for storage controller (#6710) ## Problem As with the pageserver, we should fail tests that emit unexpected log errors/warnings. ## Summary of changes - Refactor existing log checks to be reusable - Run log checks for attachment_service - Add allow lists as needed. --- .../attachment_service/src/service.rs | 25 ++++++++++++- test_runner/fixtures/neon_fixtures.py | 26 ++++++------- .../fixtures/pageserver/allowed_errors.py | 10 +++++ test_runner/fixtures/utils.py | 37 +++++++++++++++++++ test_runner/regress/test_branch_and_gc.py | 12 +++--- test_runner/regress/test_branch_behind.py | 9 +++-- test_runner/regress/test_compatibility.py | 4 ++ test_runner/regress/test_sharding.py | 14 +++++++ test_runner/regress/test_sharding_service.py | 10 +++++ test_runner/regress/test_tenants.py | 4 +- 10 files changed, 126 insertions(+), 25 deletions(-) diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index 29f87021b2..addfd9c232 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -7,7 +7,9 @@ use std::{ time::{Duration, Instant}, }; -use crate::{id_lock_map::IdLockMap, persistence::AbortShardSplitStatus}; +use crate::{ + id_lock_map::IdLockMap, persistence::AbortShardSplitStatus, reconciler::ReconcileError, +}; use anyhow::Context; use control_plane::storage_controller::{ AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse, @@ -733,7 +735,19 @@ impl Service { tenant.waiter.advance(result.sequence); } Err(e) => { - tracing::warn!("Reconcile error: {}", e); + match e { + ReconcileError::Cancel => { + tracing::info!("Reconciler was cancelled"); + } + ReconcileError::Remote(mgmt_api::Error::Cancelled) => { + // This might be due to the reconciler getting cancelled, or it might + // be due to the `Node` being marked offline. 
+ tracing::info!("Reconciler cancelled during pageserver API call"); + } + _ => { + tracing::warn!("Reconcile error: {}", e); + } + } // Ordering: populate last_error before advancing error_seq, // so that waiters will see the correct error after waiting. @@ -3631,6 +3645,13 @@ impl Service { observed_loc.conf = None; } + if new_nodes.len() == 1 { + // Special case for single-node cluster: there is no point trying to reschedule + // any tenant shards: avoid doing so, in order to avoid spewing warnings about + // failures to schedule them. + continue; + } + if tenant_state.intent.demote_attached(node_id) { tenant_state.sequence = tenant_state.sequence.next(); match tenant_state.schedule(scheduler) { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 56b23cef59..3ecd343224 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -51,7 +51,7 @@ from fixtures.log_helper import log from fixtures.metrics import Metrics, MetricsGetter, parse_metrics from fixtures.pageserver.allowed_errors import ( DEFAULT_PAGESERVER_ALLOWED_ERRORS, - scan_pageserver_log_for_errors, + DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS, ) from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.types import IndexPartDump @@ -77,6 +77,7 @@ from fixtures.utils import ( ATTACHMENT_NAME_REGEX, allure_add_grafana_links, allure_attach_from_dir, + assert_no_errors, get_self_dir, subprocess_capture, wait_until, @@ -944,6 +945,8 @@ class NeonEnvBuilder: for pageserver in self.env.pageservers: pageserver.assert_no_errors() + self.env.storage_controller.assert_no_errors() + try: self.overlay_cleanup_teardown() except Exception as e: @@ -1961,6 +1964,7 @@ class NeonStorageController(MetricsGetter): self.env = env self.running = False self.auth_enabled = auth_enabled + self.allowed_errors: list[str] = DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS def start(self): assert not self.running @@ -1985,6 +1989,11 @@ class NeonStorageController(MetricsGetter): msg = "" raise StorageControllerApiException(msg, res.status_code) from e + def assert_no_errors(self): + assert_no_errors( + self.env.repo_dir / "storage_controller.log", "storage_controller", self.allowed_errors + ) + def pageserver_api(self) -> PageserverHttpClient: """ The storage controller implements a subset of the pageserver REST API, for mapping @@ -2357,18 +2366,9 @@ class NeonPageserver(PgProtocol): return self.env.repo_dir / f"pageserver_{self.id}" def assert_no_errors(self): - logfile = self.workdir / "pageserver.log" - if not logfile.exists(): - log.warning(f"Skipping log check: {logfile} does not exist") - return - - with logfile.open("r") as f: - errors = scan_pageserver_log_for_errors(f, self.allowed_errors) - - for _lineno, error in errors: - log.info(f"not allowed error: {error.strip()}") - - assert not errors + assert_no_errors( + self.workdir / "pageserver.log", f"pageserver_{self.id}", self.allowed_errors + ) def assert_no_metric_errors(self): """ diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index 839d4166c7..ec0f81b380 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -89,6 +89,16 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( ) +DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [ + # Many tests will take pageservers offline, resulting in log warnings on the controller + # failing to connect to them. 
+ ".*Call to node.*management API.*failed.*receive body.*", + ".*Call to node.*management API.*failed.*ReceiveBody.*", + # Many tests will start up with a node offline + ".*startup_reconcile: Could not scan node.*", +] + + def _check_allowed_errors(input): allowed_errors: List[str] = list(DEFAULT_PAGESERVER_ALLOWED_ERRORS) diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 7fc3bae3af..9365d65fc9 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -11,6 +11,7 @@ from typing import ( Any, Callable, Dict, + Iterable, List, Optional, Tuple, @@ -447,3 +448,39 @@ def humantime_to_ms(humantime: str) -> float: ) return round(total_ms, 3) + + +def scan_log_for_errors(input: Iterable[str], allowed_errors: List[str]) -> List[Tuple[int, str]]: + error_or_warn = re.compile(r"\s(ERROR|WARN)") + errors = [] + for lineno, line in enumerate(input, start=1): + if len(line) == 0: + continue + + if error_or_warn.search(line): + # Is this a torn log line? This happens when force-killing a process and restarting + # Example: "2023-10-25T09:38:31.752314Z WARN deletion executo2023-10-25T09:38:31.875947Z INFO version: git-env:0f9452f76e8ccdfc88291bccb3f53e3016f40192" + if re.match("\\d{4}-\\d{2}-\\d{2}T.+\\d{4}-\\d{2}-\\d{2}T.+INFO version.+", line): + continue + + # It's an ERROR or WARN. Is it in the allow-list? + for a in allowed_errors: + if re.match(a, line): + break + else: + errors.append((lineno, line)) + return errors + + +def assert_no_errors(log_file, service, allowed_errors): + if not log_file.exists(): + log.warning(f"Skipping {service} log check: {log_file} does not exist") + return + + with log_file.open("r") as f: + errors = scan_log_for_errors(f, allowed_errors) + + for _lineno, error in errors: + log.info(f"not allowed {service} error: {error.strip()}") + + assert not errors, f"Log errors on {service}: {errors[0]}" diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py index bdc944f352..ddd02238ea 100644 --- a/test_runner/regress/test_branch_and_gc.py +++ b/test_runner/regress/test_branch_and_gc.py @@ -120,12 +120,12 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv): env = neon_simple_env pageserver_http_client = env.pageserver.http_client() - env.pageserver.allowed_errors.extend( - [ - ".*invalid branch start lsn: less than latest GC cutoff.*", - ".*invalid branch start lsn: less than planned GC cutoff.*", - ] - ) + error_regexes = [ + ".*invalid branch start lsn: less than latest GC cutoff.*", + ".*invalid branch start lsn: less than planned GC cutoff.*", + ] + env.pageserver.allowed_errors.extend(error_regexes) + env.storage_controller.allowed_errors.extend(error_regexes) # Disable background GC but set the `pitr_interval` to be small, so GC can delete something tenant, _ = env.neon_cli.create_tenant( diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py index 46c74a26b8..b79cad979f 100644 --- a/test_runner/regress/test_branch_behind.py +++ b/test_runner/regress/test_branch_behind.py @@ -14,9 +14,12 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" env = neon_env_builder.init_start() - env.pageserver.allowed_errors.extend( - [".*invalid branch start lsn.*", ".*invalid start lsn .* for ancestor timeline.*"] - ) + error_regexes = [ + ".*invalid branch start lsn.*", + ".*invalid start lsn .* for ancestor timeline.*", + ] + 
env.pageserver.allowed_errors.extend(error_regexes) + env.storage_controller.allowed_errors.extend(error_regexes) # Branch at the point where only 100 rows were inserted branch_behind_timeline_id = env.neon_cli.create_branch("test_branch_behind") diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 5f815d3e6c..e0bb4c2062 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -238,6 +238,10 @@ def test_forward_compatibility( pg_distrib_dir=compatibility_postgres_distrib_dir, ) + # TODO: remove this workaround after release-5090 is no longer the most recent release. + # There was a bug in that code that generates a warning in the storage controller log. + env.storage_controller.allowed_errors.append(".*no tenant_shard_id specified.*") + # Use current neon_local even though we're using old binaries for # everything else: our test code is written for latest CLI args. env.neon_local_binpath = neon_local_binpath diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 9e62933f7e..3470d2e609 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -725,6 +725,20 @@ def test_sharding_split_failures( tenant_id = env.initial_tenant timeline_id = env.initial_timeline + env.storage_controller.allowed_errors.extend( + [ + # All split failures log a warning when then enqueue the abort operation + ".*Enqueuing background abort.*", + # We exercise failure cases where abort itself will also fail (node offline) + ".*abort_tenant_shard_split.*", + ".*Failed to abort.*", + # Tolerate any error lots that mention a failpoint + ".*failpoint.*", + # Node offline cases will fail to send requests + ".*Reconcile error: receive body: error sending request for url.*", + ] + ) + for ps in env.pageservers: # When we do node failures and abandon a shard, it will de-facto have old generation and # thereby be unable to publish remote consistent LSN updates diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py index 27ea425bb1..a6b0f76c96 100644 --- a/test_runner/regress/test_sharding_service.py +++ b/test_runner/regress/test_sharding_service.py @@ -177,6 +177,7 @@ def test_node_status_after_restart( assert len(nodes) == 2 env.pageservers[1].stop() + env.storage_controller.allowed_errors.extend([".*Could not scan node"]) env.storage_controller.stop() env.storage_controller.start() @@ -681,6 +682,9 @@ def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder): tenant_id = TenantId.generate() body: Dict[str, Any] = {"new_tenant_id": str(tenant_id)} + env.storage_controller.allowed_errors.append(".*Unauthorized.*") + env.storage_controller.allowed_errors.append(".*Forbidden.*") + # No token with pytest.raises( StorageControllerApiException, @@ -843,6 +847,12 @@ def test_sharding_service_heartbeats( env = neon_env_builder.init_configs() env.start() + # Default log allow list permits connection errors, but this test will use error responses on + # the utilization endpoint. 
+ env.storage_controller.allowed_errors.append( + ".*Call to node.*management API.*failed.*failpoint.*" + ) + # Initially we have two online pageservers nodes = env.storage_controller.node_list() assert len(nodes) == 2 diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 1e13a2f20f..f8701b65d7 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -36,7 +36,9 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv): ) [d for d in tenants_dir.iterdir()] - neon_simple_env.pageserver.allowed_errors.append(".*tenant-config-before-write.*") + error_regexes = [".*tenant-config-before-write.*"] + neon_simple_env.pageserver.allowed_errors.extend(error_regexes) + neon_simple_env.storage_controller.allowed_errors.extend(error_regexes) pageserver_http = neon_simple_env.pageserver.http_client() pageserver_http.configure_failpoints(("tenant-config-before-write", "return")) From a8384a074e658193d1c4005763153898358b9d18 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Tue, 19 Mar 2024 10:43:24 -0400 Subject: [PATCH 0431/1571] fixup(#7168): neon_local: use pageserver defaults for known but unspecified config overrides (#7166) e2e tests cannot run on macOS unless the file engine env var is supplied. ``` ./scripts/pytest test_runner/regress/test_neon_superuser.py -s ``` will fail with tokio-epoll-uring not supported. This is because we persist the file engine config by default. In this pull request, we only persist when someone specifies it, so that it can use the default platform-variant config in the page server. --------- Signed-off-by: Alex Chi Z --- control_plane/src/local_env.rs | 9 ++++----- control_plane/src/pageserver.rs | 12 ++++++++++-- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index c7f22cc8f8..bd3dbef453 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -127,8 +127,8 @@ pub struct PageServerConf { pub pg_auth_type: AuthType, pub http_auth_type: AuthType, - pub(crate) virtual_file_io_engine: String, - pub(crate) get_vectored_impl: String, + pub(crate) virtual_file_io_engine: Option, + pub(crate) get_vectored_impl: Option, } impl Default for PageServerConf { @@ -139,9 +139,8 @@ impl Default for PageServerConf { listen_http_addr: String::new(), pg_auth_type: AuthType::Trust, http_auth_type: AuthType::Trust, - // FIXME: use the ones exposed by pageserver crate - virtual_file_io_engine: "tokio-epoll-uring".to_owned(), - get_vectored_impl: "sequential".to_owned(), + virtual_file_io_engine: None, + get_vectored_impl: None, } } } diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 2603515681..c5eabc46db 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -101,8 +101,16 @@ impl PageServerNode { let pg_auth_type_param = format!("pg_auth_type='{}'", pg_auth_type); let listen_pg_addr_param = format!("listen_pg_addr='{}'", listen_pg_addr); - let virtual_file_io_engine = format!("virtual_file_io_engine='{virtual_file_io_engine}'"); - let get_vectored_impl = format!("get_vectored_impl='{get_vectored_impl}'"); + let virtual_file_io_engine = if let Some(virtual_file_io_engine) = virtual_file_io_engine { + format!("virtual_file_io_engine='{virtual_file_io_engine}'") + } else { + String::new() + }; + let get_vectored_impl = if let Some(get_vectored_impl) = get_vectored_impl { + format!("get_vectored_impl='{get_vectored_impl}'") + } else { + 
String::new() + }; let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url()); From 64c6dfd3e44c4550604e6b97678140afa93f4409 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Thu, 14 Mar 2024 14:35:34 -0500 Subject: [PATCH 0432/1571] Move functions for creating/extracting tarballs into utils Useful for other code paths which will handle zstd compression and decompression. --- Cargo.lock | 3 ++ libs/utils/Cargo.toml | 3 ++ libs/utils/src/lib.rs | 2 + libs/utils/src/zstd.rs | 78 ++++++++++++++++++++++++++++++++ pageserver/src/import_datadir.rs | 72 +---------------------------- pageserver/src/tenant.rs | 13 ++++-- 6 files changed, 97 insertions(+), 74 deletions(-) create mode 100644 libs/utils/src/zstd.rs diff --git a/Cargo.lock b/Cargo.lock index c4f925e3c7..70f427f97d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6468,6 +6468,7 @@ version = "0.1.0" dependencies = [ "anyhow", "arc-swap", + "async-compression", "async-trait", "bincode", "byteorder", @@ -6506,12 +6507,14 @@ dependencies = [ "thiserror", "tokio", "tokio-stream", + "tokio-tar", "tokio-util", "tracing", "tracing-error", "tracing-subscriber", "url", "uuid", + "walkdir", "workspace_hack", ] diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 983e94d963..c2d9d9d396 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -13,6 +13,7 @@ testing = ["fail/failpoints"] [dependencies] arc-swap.workspace = true sentry.workspace = true +async-compression.workspace = true async-trait.workspace = true anyhow.workspace = true bincode.workspace = true @@ -36,6 +37,7 @@ serde_json.workspace = true signal-hook.workspace = true thiserror.workspace = true tokio.workspace = true +tokio-tar.workspace = true tokio-util.workspace = true tracing.workspace = true tracing-error.workspace = true @@ -46,6 +48,7 @@ strum.workspace = true strum_macros.workspace = true url.workspace = true uuid.workspace = true +walkdir.workspace = true pq_proto.workspace = true postgres_connection.workspace = true diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 890061dc59..04ce0626c8 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -87,6 +87,8 @@ pub mod failpoint_support; pub mod yielding_loop; +pub mod zstd; + /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages /// /// we have several cases: diff --git a/libs/utils/src/zstd.rs b/libs/utils/src/zstd.rs new file mode 100644 index 0000000000..be2dcc00f5 --- /dev/null +++ b/libs/utils/src/zstd.rs @@ -0,0 +1,78 @@ +use std::io::SeekFrom; + +use anyhow::{Context, Result}; +use async_compression::{ + tokio::{bufread::ZstdDecoder, write::ZstdEncoder}, + zstd::CParameter, + Level, +}; +use camino::Utf8Path; +use nix::NixPath; +use tokio::{ + fs::{File, OpenOptions}, + io::AsyncBufRead, + io::AsyncSeekExt, + io::AsyncWriteExt, +}; +use tokio_tar::{Archive, Builder, HeaderMode}; +use walkdir::WalkDir; + +/// Creates a Zstandard tarball. 
+pub async fn create_zst_tarball(path: &Utf8Path, tarball: &Utf8Path) -> Result<(File, u64)> { + let file = OpenOptions::new() + .create(true) + .truncate(true) + .read(true) + .write(true) + .open(&tarball) + .await + .with_context(|| format!("tempfile creation {tarball}"))?; + + let mut paths = Vec::new(); + for entry in WalkDir::new(path) { + let entry = entry?; + let metadata = entry.metadata().expect("error getting dir entry metadata"); + // Also allow directories so that we also get empty directories + if !(metadata.is_file() || metadata.is_dir()) { + continue; + } + let path = entry.into_path(); + paths.push(path); + } + // Do a sort to get a more consistent listing + paths.sort_unstable(); + let zstd = ZstdEncoder::with_quality_and_params( + file, + Level::Default, + &[CParameter::enable_long_distance_matching(true)], + ); + let mut builder = Builder::new(zstd); + // Use reproducible header mode + builder.mode(HeaderMode::Deterministic); + for p in paths { + let rel_path = p.strip_prefix(path)?; + if rel_path.is_empty() { + // The top directory should not be compressed, + // the tar crate doesn't like that + continue; + } + builder.append_path_with_name(&p, rel_path).await?; + } + let mut zstd = builder.into_inner().await?; + zstd.shutdown().await?; + let mut compressed = zstd.into_inner(); + let compressed_len = compressed.metadata().await?.len(); + compressed.seek(SeekFrom::Start(0)).await?; + Ok((compressed, compressed_len)) +} + +/// Creates a Zstandard tarball. +pub async fn extract_zst_tarball( + path: &Utf8Path, + tarball: impl AsyncBufRead + Unpin, +) -> Result<()> { + let decoder = Box::pin(ZstdDecoder::new(tarball)); + let mut archive = Archive::new(decoder); + archive.unpack(path).await?; + Ok(()) +} diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index d66df36b3a..343dec2ca1 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -2,28 +2,20 @@ //! Import data and WAL from a PostgreSQL data directory and WAL segments into //! a neon Timeline. //! 
-use std::io::SeekFrom; use std::path::{Path, PathBuf}; use anyhow::{bail, ensure, Context, Result}; -use async_compression::tokio::bufread::ZstdDecoder; -use async_compression::{tokio::write::ZstdEncoder, zstd::CParameter, Level}; use bytes::Bytes; use camino::Utf8Path; use futures::StreamExt; -use nix::NixPath; -use tokio::fs::{File, OpenOptions}; -use tokio::io::{AsyncBufRead, AsyncRead, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}; +use tokio::io::{AsyncRead, AsyncReadExt}; use tokio_tar::Archive; -use tokio_tar::Builder; -use tokio_tar::HeaderMode; use tracing::*; use walkdir::WalkDir; use crate::context::RequestContext; use crate::metrics::WAL_INGEST; use crate::pgdatadir_mapping::*; -use crate::tenant::remote_timeline_client::INITDB_PATH; use crate::tenant::Timeline; use crate::walingest::WalIngest; use crate::walrecord::DecodedWALRecord; @@ -633,65 +625,3 @@ async fn read_all_bytes(reader: &mut (impl AsyncRead + Unpin)) -> Result reader.read_to_end(&mut buf).await?; Ok(Bytes::from(buf)) } - -pub async fn create_tar_zst(pgdata_path: &Utf8Path, tmp_path: &Utf8Path) -> Result<(File, u64)> { - let file = OpenOptions::new() - .create(true) - .truncate(true) - .read(true) - .write(true) - .open(&tmp_path) - .await - .with_context(|| format!("tempfile creation {tmp_path}"))?; - - let mut paths = Vec::new(); - for entry in WalkDir::new(pgdata_path) { - let entry = entry?; - let metadata = entry.metadata().expect("error getting dir entry metadata"); - // Also allow directories so that we also get empty directories - if !(metadata.is_file() || metadata.is_dir()) { - continue; - } - let path = entry.into_path(); - paths.push(path); - } - // Do a sort to get a more consistent listing - paths.sort_unstable(); - let zstd = ZstdEncoder::with_quality_and_params( - file, - Level::Default, - &[CParameter::enable_long_distance_matching(true)], - ); - let mut builder = Builder::new(zstd); - // Use reproducible header mode - builder.mode(HeaderMode::Deterministic); - for path in paths { - let rel_path = path.strip_prefix(pgdata_path)?; - if rel_path.is_empty() { - // The top directory should not be compressed, - // the tar crate doesn't like that - continue; - } - builder.append_path_with_name(&path, rel_path).await?; - } - let mut zstd = builder.into_inner().await?; - zstd.shutdown().await?; - let mut compressed = zstd.into_inner(); - let compressed_len = compressed.metadata().await?.len(); - const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024; - if compressed_len > INITDB_TAR_ZST_WARN_LIMIT { - warn!("compressed {INITDB_PATH} size of {compressed_len} is above limit {INITDB_TAR_ZST_WARN_LIMIT}."); - } - compressed.seek(SeekFrom::Start(0)).await?; - Ok((compressed, compressed_len)) -} - -pub async fn extract_tar_zst( - pgdata_path: &Utf8Path, - tar_zst: impl AsyncBufRead + Unpin, -) -> Result<()> { - let tar = Box::pin(ZstdDecoder::new(tar_zst)); - let mut archive = Archive::new(tar); - archive.unpack(pgdata_path).await?; - Ok(()) -} diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index ddfb47369b..7a6ddd6a4e 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -43,6 +43,8 @@ use utils::sync::gate::Gate; use utils::sync::gate::GateGuard; use utils::timeout::timeout_cancellable; use utils::timeout::TimeoutCancellableError; +use utils::zstd::create_zst_tarball; +use utils::zstd::extract_zst_tarball; use self::config::AttachedLocationConfig; use self::config::AttachmentMode; @@ -3042,8 +3044,13 @@ impl Tenant { } } - let (pgdata_zstd, tar_zst_size) = - 
import_datadir::create_tar_zst(pgdata_path, &temp_path).await?; + let (pgdata_zstd, tar_zst_size) = create_zst_tarball(pgdata_path, &temp_path).await?; + const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024; + if tar_zst_size > INITDB_TAR_ZST_WARN_LIMIT { + warn!( + "compressed {temp_path} size of {tar_zst_size} is above limit {INITDB_TAR_ZST_WARN_LIMIT}." + ); + } pausable_failpoint!("before-initdb-upload"); @@ -3143,7 +3150,7 @@ impl Tenant { let buf_read = BufReader::with_capacity(remote_timeline_client::BUFFER_SIZE, initdb_tar_zst); - import_datadir::extract_tar_zst(&pgdata_path, buf_read) + extract_zst_tarball(&pgdata_path, buf_read) .await .context("extract initdb tar")?; } else { From a5d5c2a6a0c0e9da4ccbcd8e44dc97559eeee8c9 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 19 Mar 2024 16:08:20 +0000 Subject: [PATCH 0433/1571] storage controller: tech debt (#7165) This is a mixed bag of changes split out for separate review while working on other things, and batched together to reduce load on CI runners. Each commits stands alone for review purposes: - do_tenant_shard_split was a long function and had a synchronous validation phase at the start that could readily be pulled out into a separate function. This also avoids the special casing of ApiError::BadRequest when deciding whether an abort is needed on errors - Add a 'describe' API (GET on tenant ID) that will enable storcon-cli to see what's going on with a tenant - the 'locate' API wasn't really meant for use in the field. It's for tests: demote it to the /debug/ prefix - The `Single` placement policy was a redundant duplicate of Double(0), and Double was a bad name. Rename it Attached. (https://github.com/neondatabase/neon/issues/7107) - Some neon_local commands were added for debug/demos, which are now replaced by commands in storcon-cli (#7114 ). Even though that's not merged yet, we don't need the neon_local ones any more. Closes https://github.com/neondatabase/neon/issues/7107 ## Backward compat of Single/Double -> `Attached(n)` change A database migration is used to convert any existing values. 
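For reviewers unfamiliar with serde's externally tagged enum encoding, the sketch below (an illustrative stand-in, assuming `serde` with the `derive` feature plus `serde_json`, and trimmed relative to the real `controller_api.rs` enum, which also derives `Clone`/`Eq`) shows why the old rows read `"Single"` / `{"Double":1}` while the new ones read `{"Attached":0}` / `{"Attached":1}` -- which is exactly the rewrite the up/down migrations perform:

```rust
// Minimal sketch, not the real `libs/pageserver_api/src/controller_api.rs` definition;
// it only demonstrates the serde encodings the migration rewrites.
use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize, Debug, PartialEq)]
enum PlacementPolicy {
    /// One attached pageserver plus `n` secondaries (replaces `Single` and `Double(n)`).
    Attached(usize),
    /// Secondary-only location, e.g. while onboarding a tenant.
    Secondary,
    /// No locations at all.
    Detached,
}

fn main() -> serde_json::Result<()> {
    // Newtype variants get the one-key-object form the migration writes...
    assert_eq!(
        serde_json::to_string(&PlacementPolicy::Attached(1))?,
        r#"{"Attached":1}"#
    );
    // ...and unit variants stay bare strings, as in the `"Detached"` round-trip test.
    assert_eq!(
        serde_json::to_string(&PlacementPolicy::Detached)?,
        r#""Detached""#
    );
    // Old rows stored `"Single"` / `{"Double":1}`, which the renamed enum no longer
    // accepts -- hence the SQL migration rewriting them in place.
    assert!(serde_json::from_str::<PlacementPolicy>(r#""Single""#).is_err());
    Ok(())
}
```

Because the variant tag is part of the stored JSON text, the conversion can be done with plain string-equality UPDATEs in the migration rather than touching any Rust code path.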
--- .../2024-03-18-184429_rename_policy/down.sql | 3 + .../2024-03-18-184429_rename_policy/up.sql | 3 + control_plane/attachment_service/src/http.rs | 19 +- .../attachment_service/src/persistence.rs | 13 +- .../attachment_service/src/reconciler.rs | 2 +- .../attachment_service/src/service.rs | 184 +++++++++++++----- .../attachment_service/src/tenant_state.rs | 21 +- control_plane/src/bin/neon_local.rs | 97 +-------- control_plane/src/storage_controller.rs | 2 +- libs/pageserver_api/src/controller_api.rs | 45 ++++- test_runner/fixtures/neon_fixtures.py | 15 +- test_runner/fixtures/types.py | 3 + .../regress/test_pageserver_secondary.py | 2 +- test_runner/regress/test_sharding.py | 4 +- 14 files changed, 206 insertions(+), 207 deletions(-) create mode 100644 control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/down.sql create mode 100644 control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/up.sql diff --git a/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/down.sql b/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/down.sql new file mode 100644 index 0000000000..897c7e0d01 --- /dev/null +++ b/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/down.sql @@ -0,0 +1,3 @@ + +UPDATE tenant_shards set placement_policy='{"Double": 1}' where placement_policy='{"Attached": 1}'; +UPDATE tenant_shards set placement_policy='"Single"' where placement_policy='{"Attached": 0}'; \ No newline at end of file diff --git a/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/up.sql b/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/up.sql new file mode 100644 index 0000000000..c898ac9aee --- /dev/null +++ b/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/up.sql @@ -0,0 +1,3 @@ + +UPDATE tenant_shards set placement_policy='{"Attached": 1}' where placement_policy='{"Double": 1}'; +UPDATE tenant_shards set placement_policy='{"Attached": 0}' where placement_policy='"Single"'; \ No newline at end of file diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs index 45ee354822..076b3a2f70 100644 --- a/control_plane/attachment_service/src/http.rs +++ b/control_plane/attachment_service/src/http.rs @@ -353,6 +353,16 @@ async fn handle_tenant_locate( json_response(StatusCode::OK, service.tenant_locate(tenant_id)?) } +async fn handle_tenant_describe( + service: Arc, + req: Request, +) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + json_response(StatusCode::OK, service.tenant_describe(tenant_id)?) 
+} + async fn handle_node_register(mut req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; @@ -559,6 +569,9 @@ pub fn make_router( request_span(r, handle_node_drop) }) .get("/debug/v1/tenant", |r| request_span(r, handle_tenants_dump)) + .get("/debug/v1/tenant/:tenant_id/locate", |r| { + tenant_service_handler(r, handle_tenant_locate) + }) .get("/debug/v1/scheduler", |r| { request_span(r, handle_scheduler_dump) }) @@ -568,9 +581,6 @@ pub fn make_router( .put("/debug/v1/failpoints", |r| { request_span(r, |r| failpoints_handler(r, CancellationToken::new())) }) - .get("/control/v1/tenant/:tenant_id/locate", |r| { - tenant_service_handler(r, handle_tenant_locate) - }) // Node operations .post("/control/v1/node", |r| { request_span(r, handle_node_register) @@ -586,6 +596,9 @@ pub fn make_router( .put("/control/v1/tenant/:tenant_id/shard_split", |r| { tenant_service_handler(r, handle_tenant_shard_split) }) + .get("/control/v1/tenant/:tenant_id", |r| { + tenant_service_handler(r, handle_tenant_describe) + }) // Tenant operations // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity. diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs index 3602cf8b1f..209d8ff075 100644 --- a/control_plane/attachment_service/src/persistence.rs +++ b/control_plane/attachment_service/src/persistence.rs @@ -211,15 +211,10 @@ impl Persistence { let mut decoded = serde_json::from_slice::(&bytes) .map_err(|e| DatabaseError::Logical(format!("Deserialization error: {e}")))?; - for (tenant_id, tenant) in &mut decoded.tenants { - // Backward compat: an old attachments.json from before PR #6251, replace - // empty strings with proper defaults. - if tenant.tenant_id.is_empty() { - tenant.tenant_id = tenant_id.to_string(); - tenant.config = serde_json::to_string(&TenantConfig::default()) - .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?; - tenant.placement_policy = serde_json::to_string(&PlacementPolicy::Single) - .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?; + for shard in decoded.tenants.values_mut() { + if shard.placement_policy == "\"Single\"" { + // Backward compat for test data after PR https://github.com/neondatabase/neon/pull/7165 + shard.placement_policy = "{\"Attached\":0}".to_string(); } } diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs index 3bf23275bd..f00f35c74b 100644 --- a/control_plane/attachment_service/src/reconciler.rs +++ b/control_plane/attachment_service/src/reconciler.rs @@ -475,7 +475,7 @@ impl Reconciler { } } - // Downgrade the origin to secondary. If the tenant's policy is PlacementPolicy::Single, then + // Downgrade the origin to secondary. If the tenant's policy is PlacementPolicy::Attached(0), then // this location will be deleted in the general case reconciliation that runs after this. 
let origin_secondary_conf = build_location_config( &self.shard, diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index addfd9c232..e38007c7af 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -20,8 +20,9 @@ use hyper::StatusCode; use pageserver_api::{ controller_api::{ NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy, - TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse, - TenantShardMigrateRequest, TenantShardMigrateResponse, UtilizationScore, + TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse, + TenantDescribeResponseShard, TenantLocateResponse, TenantShardMigrateRequest, + TenantShardMigrateResponse, UtilizationScore, }, models::{SecondaryProgress, TenantConfigRequest}, }; @@ -202,6 +203,29 @@ enum TenantCreateOrUpdate { Update(Vec), } +struct ShardSplitParams { + old_shard_count: ShardCount, + new_shard_count: ShardCount, + new_stripe_size: Option, + targets: Vec, + policy: PlacementPolicy, + shard_ident: ShardIdentity, +} + +// When preparing for a shard split, we may either choose to proceed with the split, +// or find that the work is already done and return NoOp. +enum ShardSplitAction { + Split(ShardSplitParams), + NoOp(TenantShardSplitResponse), +} + +// A parent shard which will be split +struct ShardSplitTarget { + parent_id: TenantShardId, + node: Node, + child_ids: Vec, +} + /// When we tenant shard split operation fails, we may not be able to clean up immediately, because nodes /// might not be available. We therefore use a queue of abort operations processed in the background. struct TenantShardSplitAbort { @@ -1071,7 +1095,7 @@ impl Service { shard_stripe_size: 0, generation: Some(0), generation_pageserver: None, - placement_policy: serde_json::to_string(&PlacementPolicy::Single).unwrap(), + placement_policy: serde_json::to_string(&PlacementPolicy::Attached(0)).unwrap(), config: serde_json::to_string(&TenantConfig::default()).unwrap(), splitting: SplitState::default(), }; @@ -1098,7 +1122,7 @@ impl Service { TenantState::new( attach_req.tenant_shard_id, ShardIdentity::unsharded(), - PlacementPolicy::Single, + PlacementPolicy::Attached(0), ), ); tracing::info!("Inserted shard {} in memory", attach_req.tenant_shard_id); @@ -1127,7 +1151,7 @@ impl Service { self.persistence .update_tenant_shard( attach_req.tenant_shard_id, - PlacementPolicy::Single, + PlacementPolicy::Attached(0), conf, None, ) @@ -1152,7 +1176,7 @@ impl Service { if let Some(new_generation) = new_generation { tenant_state.generation = Some(new_generation); - tenant_state.policy = PlacementPolicy::Single; + tenant_state.policy = PlacementPolicy::Attached(0); } else { // This is a detach notification. We must update placement policy to avoid re-attaching // during background scheduling/reconciliation, or during storage controller restart. @@ -1505,11 +1529,11 @@ impl Service { &self, create_req: TenantCreateRequest, ) -> Result<(TenantCreateResponse, Vec), ApiError> { - // As a default, single is convenient for tests that don't choose a policy. let placement_policy = create_req .placement_policy .clone() - .unwrap_or(PlacementPolicy::Single); + // As a default, zero secondaries is convenient for tests that don't choose a policy. + .unwrap_or(PlacementPolicy::Attached(0)); // This service expects to handle sharding itself: it is an error to try and directly create // a particular shard here. 
@@ -1719,11 +1743,11 @@ impl Service { | LocationConfigMode::AttachedSingle | LocationConfigMode::AttachedStale => { if nodes.len() > 1 { - PlacementPolicy::Double(1) + PlacementPolicy::Attached(1) } else { // Convenience for dev/test: if we just have one pageserver, import - // tenants into Single mode so that scheduling will succeed. - PlacementPolicy::Single + // tenants into non-HA mode so that scheduling will succeed. + PlacementPolicy::Attached(0) } } }; @@ -2541,9 +2565,6 @@ impl Service { let locked = self.inner.read().unwrap(); tracing::info!("Locating shards for tenant {tenant_id}"); - // Take a snapshot of pageservers - let pageservers = locked.nodes.clone(); - let mut result = Vec::new(); let mut shard_params: Option = None; @@ -2557,7 +2578,8 @@ impl Service { "Cannot locate a tenant that is not attached" )))?; - let node = pageservers + let node = locked + .nodes .get(&node_id) .expect("Pageservers may not be deleted while referenced"); @@ -2605,6 +2627,47 @@ impl Service { }) } + pub(crate) fn tenant_describe( + &self, + tenant_id: TenantId, + ) -> Result { + let locked = self.inner.read().unwrap(); + + let mut shard_zero = None; + let mut shards = Vec::new(); + + for (tenant_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id)) + { + if tenant_shard_id.is_zero() { + shard_zero = Some(shard); + } + + let response_shard = TenantDescribeResponseShard { + tenant_shard_id: *tenant_shard_id, + node_attached: *shard.intent.get_attached(), + node_secondary: shard.intent.get_secondary().to_vec(), + last_error: shard.last_error.lock().unwrap().clone(), + is_reconciling: shard.reconciler.is_some(), + is_pending_compute_notification: shard.pending_compute_notification, + is_splitting: matches!(shard.splitting, SplitState::Splitting), + }; + shards.push(response_shard); + } + + let Some(shard_zero) = shard_zero else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant {tenant_id} not found").into(), + )); + }; + + Ok(TenantDescribeResponse { + shards, + stripe_size: shard_zero.shard.stripe_size, + policy: shard_zero.policy.clone(), + config: shard_zero.config.clone(), + }) + } + #[instrument(skip_all, fields(tenant_id=%op.tenant_id))] async fn abort_tenant_shard_split( &self, @@ -2828,7 +2891,7 @@ impl Service { generation, &child_shard, &config, - matches!(policy, PlacementPolicy::Double(n) if n > 0), + matches!(policy, PlacementPolicy::Attached(n) if n > 0), )), }, ); @@ -2875,17 +2938,23 @@ impl Service { let new_shard_count = ShardCount::new(split_req.new_shard_count); let new_stripe_size = split_req.new_stripe_size; - let r = self.do_tenant_shard_split(tenant_id, split_req).await; + // Validate the request and construct parameters. This phase is fallible, but does not require + // rollback on errors, as it does no I/O and mutates no state. + let shard_split_params = match self.prepare_tenant_shard_split(tenant_id, split_req)? { + ShardSplitAction::NoOp(resp) => return Ok(resp), + ShardSplitAction::Split(params) => params, + }; + + // Execute this split: this phase mutates state and does remote I/O on pageservers. If it fails, + // we must roll back. + let r = self + .do_tenant_shard_split(tenant_id, shard_split_params) + .await; match r { Ok(r) => Ok(r), - Err(ApiError::BadRequest(_)) => { - // A request validation error does not require rollback: we rejected it before we started making any changes: just - // return the error - r - } Err(e) => { - // General case error handling: split might be part-done, we must do work to abort it. 
+ // Split might be part-done, we must do work to abort it. tracing::warn!("Enqueuing background abort of split on {tenant_id}"); self.abort_tx .send(TenantShardSplitAbort { @@ -2901,25 +2970,17 @@ impl Service { } } - pub(crate) async fn do_tenant_shard_split( + fn prepare_tenant_shard_split( &self, tenant_id: TenantId, split_req: TenantShardSplitRequest, - ) -> Result { - let mut policy = None; - let mut shard_ident = None; - - // A parent shard which will be split - struct SplitTarget { - parent_id: TenantShardId, - node: Node, - child_ids: Vec, - } - + ) -> Result { fail::fail_point!("shard-split-validation", |_| Err(ApiError::BadRequest( anyhow::anyhow!("failpoint") ))); + let mut policy = None; + let mut shard_ident = None; // Validate input, and calculate which shards we will create let (old_shard_count, targets) = { @@ -2995,7 +3056,7 @@ impl Service { // TODO: if any reconciliation is currently in progress for this shard, wait for it. - targets.push(SplitTarget { + targets.push(ShardSplitTarget { parent_id: *tenant_shard_id, node: node.clone(), child_ids: tenant_shard_id @@ -3005,9 +3066,9 @@ impl Service { if targets.is_empty() { if children_found.len() == split_req.new_shard_count as usize { - return Ok(TenantShardSplitResponse { + return Ok(ShardSplitAction::NoOp(TenantShardSplitResponse { new_shards: children_found, - }); + })); } else { // No shards found to split, and no existing children found: the // tenant doesn't exist at all. @@ -3038,12 +3099,36 @@ impl Service { }; let policy = policy.unwrap(); + Ok(ShardSplitAction::Split(ShardSplitParams { + old_shard_count, + new_shard_count: ShardCount::new(split_req.new_shard_count), + new_stripe_size: split_req.new_stripe_size, + targets, + policy, + shard_ident, + })) + } + + async fn do_tenant_shard_split( + &self, + tenant_id: TenantId, + params: ShardSplitParams, + ) -> Result { // FIXME: we have dropped self.inner lock, and not yet written anything to the database: another // request could occur here, deleting or mutating the tenant. begin_shard_split checks that the // parent shards exist as expected, but it would be neater to do the above pre-checks within the // same database transaction rather than pre-check in-memory and then maybe-fail the database write. // (https://github.com/neondatabase/neon/issues/6676) + let ShardSplitParams { + old_shard_count, + new_shard_count, + new_stripe_size, + targets, + policy, + shard_ident, + } = params; + // Before creating any new child shards in memory or on the pageservers, persist them: this // enables us to ensure that we will always be able to clean up if something goes wrong. This also // acts as the protection against two concurrent attempts to split: one of them will get a database @@ -3125,7 +3210,7 @@ impl Service { // N>1 shards into M shards -- initially we're usually splitting 1 shard into N). for target in &targets { - let SplitTarget { + let ShardSplitTarget { parent_id, node, child_ids, @@ -3135,8 +3220,8 @@ impl Service { .tenant_shard_split( *parent_id, TenantShardSplitRequest { - new_shard_count: split_req.new_shard_count, - new_stripe_size: split_req.new_stripe_size, + new_shard_count: new_shard_count.literal(), + new_stripe_size, }, ) .await @@ -3185,11 +3270,8 @@ impl Service { )); // Replace all the shards we just split with their children: this phase is infallible. 
- let (response, child_locations) = self.tenant_shard_split_commit_inmem( - tenant_id, - ShardCount::new(split_req.new_shard_count), - split_req.new_stripe_size, - ); + let (response, child_locations) = + self.tenant_shard_split_commit_inmem(tenant_id, new_shard_count, new_stripe_size); // Send compute notifications for all the new shards let mut failed_notifications = Vec::new(); @@ -3254,17 +3336,15 @@ impl Service { let old_attached = *shard.intent.get_attached(); match shard.policy { - PlacementPolicy::Single => { - shard.intent.clear_secondary(scheduler); - shard.intent.set_attached(scheduler, Some(migrate_req.node_id)); - } - PlacementPolicy::Double(_n) => { + PlacementPolicy::Attached(n) => { // If our new attached node was a secondary, it no longer should be. shard.intent.remove_secondary(scheduler, migrate_req.node_id); // If we were already attached to something, demote that to a secondary if let Some(old_attached) = old_attached { - shard.intent.push_secondary(scheduler, old_attached); + if n > 0 { + shard.intent.push_secondary(scheduler, old_attached); + } } shard.intent.set_attached(scheduler, Some(migrate_req.node_id)); diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs index 39e557616d..9dd368bf41 100644 --- a/control_plane/attachment_service/src/tenant_state.rs +++ b/control_plane/attachment_service/src/tenant_state.rs @@ -457,22 +457,7 @@ impl TenantState { // Add/remove nodes to fulfil policy use PlacementPolicy::*; match self.policy { - Single => { - // Should have exactly one attached, and zero secondaries - if !self.intent.secondary.is_empty() { - self.intent.clear_secondary(scheduler); - modified = true; - } - - let (modified_attached, _attached_node_id) = self.schedule_attached(scheduler)?; - modified |= modified_attached; - - if !self.intent.secondary.is_empty() { - self.intent.clear_secondary(scheduler); - modified = true; - } - } - Double(secondary_count) => { + Attached(secondary_count) => { let retain_secondaries = if self.intent.attached.is_none() && scheduler.node_preferred(&self.intent.secondary).is_some() { @@ -895,7 +880,7 @@ pub(crate) mod tests { let mut scheduler = Scheduler::new(nodes.values()); - let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1)); + let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1)); tenant_state .schedule(&mut scheduler) .expect("we have enough nodes, scheduling should work"); @@ -943,7 +928,7 @@ pub(crate) mod tests { let nodes = make_test_nodes(3); let mut scheduler = Scheduler::new(nodes.values()); - let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1)); + let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1)); tenant_state.observed.locations.insert( NodeId(3), diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 6c722f36b4..401feae706 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -437,7 +437,7 @@ async fn handle_tenant( let placement_policy = match create_match.get_one::("placement-policy") { Some(s) if !s.is_empty() => serde_json::from_str::(s)?, - _ => PlacementPolicy::Single, + _ => PlacementPolicy::Attached(0), }; let tenant_conf = PageServerNode::parse_config(tenant_conf)?; @@ -523,88 +523,6 @@ async fn handle_tenant( .with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?; println!("tenant {tenant_id} successfully configured on the pageserver"); } - 
Some(("migrate", matches)) => { - let tenant_shard_id = get_tenant_shard_id(matches, env)?; - let new_pageserver = get_pageserver(env, matches)?; - let new_pageserver_id = new_pageserver.conf.id; - - let storage_controller = StorageController::from_env(env); - storage_controller - .tenant_migrate(tenant_shard_id, new_pageserver_id) - .await?; - - println!("tenant {tenant_shard_id} migrated to {}", new_pageserver_id); - } - Some(("status", matches)) => { - let tenant_id = get_tenant_id(matches, env)?; - - let mut shard_table = comfy_table::Table::new(); - shard_table.set_header(["Shard", "Pageserver", "Physical Size"]); - - let mut tenant_synthetic_size = None; - - let storage_controller = StorageController::from_env(env); - for shard in storage_controller.tenant_locate(tenant_id).await?.shards { - let pageserver = - PageServerNode::from_env(env, env.get_pageserver_conf(shard.node_id)?); - - let size = pageserver - .http_client - .tenant_details(shard.shard_id) - .await? - .tenant_info - .current_physical_size - .unwrap(); - - shard_table.add_row([ - format!("{}", shard.shard_id.shard_slug()), - format!("{}", shard.node_id.0), - format!("{} MiB", size / (1024 * 1024)), - ]); - - if shard.shard_id.is_zero() { - tenant_synthetic_size = - Some(pageserver.tenant_synthetic_size(shard.shard_id).await?); - } - } - - let Some(synthetic_size) = tenant_synthetic_size else { - bail!("Shard 0 not found") - }; - - let mut tenant_table = comfy_table::Table::new(); - tenant_table.add_row(["Tenant ID".to_string(), tenant_id.to_string()]); - tenant_table.add_row([ - "Synthetic size".to_string(), - format!("{} MiB", synthetic_size.size.unwrap_or(0) / (1024 * 1024)), - ]); - - println!("{tenant_table}"); - println!("{shard_table}"); - } - Some(("shard-split", matches)) => { - let tenant_id = get_tenant_id(matches, env)?; - let shard_count: u8 = matches.get_one::("shard-count").cloned().unwrap_or(0); - let shard_stripe_size: Option = matches - .get_one::>("shard-stripe-size") - .cloned() - .unwrap(); - - let storage_controller = StorageController::from_env(env); - let result = storage_controller - .tenant_split(tenant_id, shard_count, shard_stripe_size) - .await?; - println!( - "Split tenant {} into shards {}", - tenant_id, - result - .new_shards - .iter() - .map(|s| format!("{:?}", s)) - .collect::>() - .join(",") - ); - } Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name), None => bail!("no tenant subcommand provided"), @@ -1578,19 +1496,6 @@ fn cli() -> Command { .subcommand(Command::new("config") .arg(tenant_id_arg.clone()) .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false))) - .subcommand(Command::new("migrate") - .about("Migrate a tenant from one pageserver to another") - .arg(tenant_id_arg.clone()) - .arg(pageserver_id_arg.clone())) - .subcommand(Command::new("status") - .about("Human readable summary of the tenant's shards and attachment locations") - .arg(tenant_id_arg.clone())) - .subcommand(Command::new("shard-split") - .about("Increase the number of shards in the tenant") - .arg(tenant_id_arg.clone()) - .arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)")) - .arg(Arg::new("shard-stripe-size").value_parser(value_parser!(u32)).long("shard-stripe-size").action(ArgAction::Set).help("Sharding stripe size in pages")) - ) ) .subcommand( Command::new("pageserver") diff --git a/control_plane/src/storage_controller.rs 
b/control_plane/src/storage_controller.rs index 18014adba4..e7697ecac8 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -475,7 +475,7 @@ impl StorageController { pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result { self.dispatch::<(), _>( Method::GET, - format!("control/v1/tenant/{tenant_id}/locate"), + format!("debug/v1/tenant/{tenant_id}/locate"), None, ) .await diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 6053e8b8ed..e33bd0f486 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -6,7 +6,10 @@ use std::str::FromStr; use serde::{Deserialize, Serialize}; use utils::id::NodeId; -use crate::{models::ShardParameters, shard::TenantShardId}; +use crate::{ + models::{ShardParameters, TenantConfig}, + shard::{ShardStripeSize, TenantShardId}, +}; #[derive(Serialize, Deserialize)] pub struct TenantCreateResponseShard { @@ -57,6 +60,31 @@ pub struct TenantLocateResponse { pub shard_params: ShardParameters, } +#[derive(Serialize, Deserialize)] +pub struct TenantDescribeResponse { + pub shards: Vec, + pub stripe_size: ShardStripeSize, + pub policy: PlacementPolicy, + pub config: TenantConfig, +} + +#[derive(Serialize, Deserialize)] +pub struct TenantDescribeResponseShard { + pub tenant_shard_id: TenantShardId, + + pub node_attached: Option, + pub node_secondary: Vec, + + pub last_error: String, + + /// A task is currently running to reconcile this tenant's intent state with the state on pageservers + pub is_reconciling: bool, + /// This shard failed in sending a compute notification to the cloud control plane, and a retry is pending. + pub is_pending_compute_notification: bool, + /// A shard split is currently underway + pub is_splitting: bool, +} + /// Explicitly migrating a particular shard is a low level operation /// TODO: higher level "Reschedule tenant" operation where the request /// specifies some constraints, e.g. asking it to get off particular node(s) @@ -181,11 +209,8 @@ impl From for String { /// to create secondary locations. #[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)] pub enum PlacementPolicy { - /// Cheapest way to attach a tenant: just one pageserver, no secondary - Single, - /// Production-ready way to attach a tenant: one attached pageserver and - /// some number of secondaries. - Double(usize), + /// Normal live state: one attached pageserver and zero or more secondaries. + Attached(usize), /// Create one secondary mode locations. This is useful when onboarding /// a tenant, or for an idle tenant that we might want to bring online quickly. 
Secondary, @@ -207,14 +232,14 @@ mod test { /// Check stability of PlacementPolicy's serialization #[test] fn placement_policy_encoding() -> anyhow::Result<()> { - let v = PlacementPolicy::Double(1); + let v = PlacementPolicy::Attached(1); let encoded = serde_json::to_string(&v)?; - assert_eq!(encoded, "{\"Double\":1}"); + assert_eq!(encoded, "{\"Attached\":1}"); assert_eq!(serde_json::from_str::(&encoded)?, v); - let v = PlacementPolicy::Single; + let v = PlacementPolicy::Detached; let encoded = serde_json::to_string(&v)?; - assert_eq!(encoded, "\"Single\""); + assert_eq!(encoded, "\"Detached\""); assert_eq!(serde_json::from_str::(&encoded)?, v); Ok(()) } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 3ecd343224..1d30c45278 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1892,19 +1892,6 @@ class NeonCli(AbstractNeonCli): return self.raw_cli(args, check_return_code=True) - def tenant_migrate( - self, tenant_shard_id: TenantShardId, new_pageserver: int, timeout_secs: Optional[int] - ): - args = [ - "tenant", - "migrate", - "--tenant-id", - str(tenant_shard_id), - "--id", - str(new_pageserver), - ] - return self.raw_cli(args, check_return_code=True, timeout=timeout_secs) - def start(self, check_return_code=True) -> "subprocess.CompletedProcess[str]": return self.raw_cli(["start"], check_return_code=check_return_code) @@ -2156,7 +2143,7 @@ class NeonStorageController(MetricsGetter): """ response = self.request( "GET", - f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}/locate", + f"{self.env.storage_controller_api}/debug/v1/tenant/{tenant_id}/locate", headers=self.headers(TokenScope.ADMIN), ) body = response.json() diff --git a/test_runner/fixtures/types.py b/test_runner/fixtures/types.py index ea648e460d..80c9b9ce9a 100644 --- a/test_runner/fixtures/types.py +++ b/test_runner/fixtures/types.py @@ -158,6 +158,9 @@ class TenantShardId: def __str__(self): return f"{self.tenant_id}-{self.shard_number:02x}{self.shard_count:02x}" + def __repr__(self): + return self.__str__() + def _tuple(self) -> tuple[TenantId, int, int]: return (self.tenant_id, self.shard_number, self.shard_count) diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 8ef75414a3..e664547b69 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -576,7 +576,7 @@ def test_slow_secondary_downloads(neon_env_builder: NeonEnvBuilder, via_controll timeline_id = TimelineId.generate() env.neon_cli.create_tenant( - tenant_id, timeline_id, conf=TENANT_CONF, placement_policy='{"Double":1}' + tenant_id, timeline_id, conf=TENANT_CONF, placement_policy='{"Attached":1}' ) attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"] diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 3470d2e609..cb58c640c3 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -264,7 +264,7 @@ def test_sharding_split_smoke( destination = migrate_to_pageserver_ids.pop() log.info(f"Migrating shard {migrate_shard} from {ps_id} to {destination}") - env.neon_cli.tenant_migrate(migrate_shard, destination, timeout_secs=10) + env.storage_controller.tenant_shard_migrate(migrate_shard, destination) workload.validate() @@ -299,7 +299,7 @@ def test_sharding_split_smoke( locations = pageserver.http_client().tenant_list_locations() 
shards_exist.extend(TenantShardId.parse(s[0]) for s in locations["tenant_shards"]) - log.info("Shards after split: {shards_exist}") + log.info(f"Shards after split: {shards_exist}") assert len(shards_exist) == split_shard_count # Ensure post-split pageserver locations survive a restart (i.e. the child shards From 4ba3f3518eddd8e5eebca90c564857e0d285932d Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 20 Mar 2024 10:24:59 +0000 Subject: [PATCH 0434/1571] test: fix on demand activation test flakyness (#7180) Warm-up (and the "tenant startup complete" metric update) happens in a background tokio task. The tenant map is eagerly updated (can happen before the task finishes). The test assumed that if the tenant map was updated, then the metric should reflect that. That's not the case, so we tweak the test to wait for the metric. Fixes https://github.com/neondatabase/neon/issues/7158 --- test_runner/regress/test_timeline_size.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 205ca18050..628c484fbd 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -20,6 +20,7 @@ from fixtures.neon_fixtures import ( VanillaPostgres, wait_for_last_flush_lsn, ) +from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import ( assert_tenant_state, timeline_delete_wait_completed, @@ -684,6 +685,13 @@ def assert_physical_size_invariants(sizes: TimelinePhysicalSizeValues): # XXX would be nice to assert layer file physical storage utilization here as well, but we can only do that for LocalFS +def wait_for_tenant_startup_completions(client: PageserverHttpClient, count: int): + def condition(): + assert client.get_metric_value("pageserver_tenant_startup_complete_total") == count + + wait_until(5, 1.0, condition) + + def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): """ Tenants warmuping up opportunistically will wait for one another's logical size calculations to complete @@ -767,10 +775,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): # That one that we successfully accessed is now Active expect_activated += 1 assert pageserver_http.tenant_status(tenant_id=stuck_tenant_id)["state"]["slug"] == "Active" - assert ( - pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") - == expect_activated - 1 - ) + wait_for_tenant_startup_completions(pageserver_http, count=expect_activated - 1) # The ones we didn't touch are still in Attaching assert ( @@ -790,10 +795,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): == n_tenants - expect_activated ) - assert ( - pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") - == expect_activated - 1 - ) + wait_for_tenant_startup_completions(pageserver_http, count=expect_activated - 1) # When we unblock logical size calculation, all tenants should proceed to active state via # the warmup route. 
@@ -813,7 +815,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): assert ( pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants ) - assert pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") == n_tenants + wait_for_tenant_startup_completions(pageserver_http, count=n_tenants) # Check that tenant deletion/detach proactively wakes tenants: this is done separately to the main # body of the test because it will disrupt tenant counts From 6d996427b19ae20b0be30651838586307537b2b4 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 20 Mar 2024 12:26:31 +0000 Subject: [PATCH 0435/1571] proxy: enable sha2 asm support (#7184) ## Problem faster sha2 hashing. ## Summary of changes enable asm feature for sha2. this feature will be default in sha2 0.11, so we might as well lean into it now. It provides a noticeable speed boost on macos aarch64. Haven't tested on x86 though --- Cargo.lock | 15 +++++++++++++-- proxy/Cargo.toml | 2 +- workspace_hack/Cargo.toml | 1 + 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 70f427f97d..cdbabf2f76 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5346,13 +5346,23 @@ checksum = "ae1a47186c03a32177042e55dbc5fd5aee900b8e0069a8d70fba96a9375cd012" [[package]] name = "sha2" -version = "0.10.6" +version = "0.10.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82e6b795fe2e3b1e845bafcb27aa35405c4d47cdfc92af5fc8d3002f76cebdc0" +checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8" dependencies = [ "cfg-if", "cpufeatures", "digest", + "sha2-asm", +] + +[[package]] +name = "sha2-asm" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f27ba7066011e3fb30d808b51affff34f0a66d3a03a58edd787c6e420e40e44e" +dependencies = [ + "cc", ] [[package]] @@ -7032,6 +7042,7 @@ dependencies = [ "scopeguard", "serde", "serde_json", + "sha2", "smallvec", "subtle", "syn 1.0.109", diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index d8112c8bf0..b3a5bf873e 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -59,7 +59,7 @@ rustls.workspace = true scopeguard.workspace = true serde.workspace = true serde_json.workspace = true -sha2.workspace = true +sha2 = { workspace = true, features = ["asm"] } smol_str.workspace = true smallvec.workspace = true socket2.workspace = true diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 8593b752c2..152c452dd4 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -64,6 +64,7 @@ rustls = { version = "0.21", features = ["dangerous_configuration"] } scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } serde_json = { version = "1", features = ["raw_value"] } +sha2 = { version = "0.10", features = ["asm"] } smallvec = { version = "1", default-features = false, features = ["write"] } subtle = { version = "2" } time = { version = "0.3", features = ["local-offset", "macros", "serde-well-known"] } From fb66a3dd857bec7f99bba9c5dd5ee80213761878 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 20 Mar 2024 16:08:03 +0200 Subject: [PATCH 0436/1571] fix: ResidentLayer::load_keys should not create INFO level span (#7174) Since #6115 with more often used get_value_reconstruct_data and friends, we should not have needless INFO level span creation near hot paths. 
In our prod configuration, INFO spans are always created, but in practice, very rarely anything at INFO level is logged underneath. `ResidentLayer::load_keys` is only used during compaction so it is not that hot, but this aligns the access paths and their span usage. PR changes the span level to debug to align with others, and adds the layer name to the error which was missing. Split off from #7030. --- pageserver/src/tenant/storage_layer/layer.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 0200ff8cf4..f37d7e6449 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1484,7 +1484,7 @@ impl ResidentLayer { } /// Loads all keys stored in the layer. Returns key, lsn and value size. - #[tracing::instrument(skip_all, fields(layer=%self))] + #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(layer=%self))] pub(crate) async fn load_keys<'a>( &'a self, ctx: &RequestContext, @@ -1504,9 +1504,9 @@ impl ResidentLayer { // while it's being held. delta_layer::DeltaLayerInner::load_keys(d, ctx) .await - .context("Layer index is corrupted") + .with_context(|| format!("Layer index is corrupted for {self}")) } - Image(_) => anyhow::bail!("cannot load_keys on a image layer"), + Image(_) => anyhow::bail!(format!("cannot load_keys on a image layer {self}")), } } From 3d16cda846f4e7b8e929c61db13093586dca93d1 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 20 Mar 2024 18:03:09 +0200 Subject: [PATCH 0437/1571] refactor(layer): use detached init (#7152) The second part of work towards fixing `Layer::keep_resident` so that it does not need to repair the internal state. #7135 added a nicer API for initialization. This PR uses it to remove a few indentation levels and the loop construction. The next PR #7175 will use the refactorings done in this PR, and always initialize the internal state after a download. Cc: #5331 --- pageserver/src/tenant/storage_layer/layer.rs | 329 +++++++++--------- .../src/tenant/storage_layer/layer/tests.rs | 3 + 2 files changed, 166 insertions(+), 166 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index f37d7e6449..eed423c3e6 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -702,181 +702,132 @@ impl LayerInner { allow_download: bool, ctx: Option<&RequestContext>, ) -> Result, DownloadError> { - let mut init_permit = None; + let (weak, permit) = { + let locked = self + .inner + .get_or_init_detached() + .await + .map(|mut guard| guard.get_and_upgrade().ok_or(guard)); - loop { - let download = move |permit| { - async move { - // disable any scheduled but not yet running eviction deletions for this - let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed); + match locked { + // this path could had been a RwLock::read + Ok(Ok((strong, upgraded))) if !upgraded => return Ok(strong), + Ok(Ok((strong, _))) => { + // when upgraded back, the Arc is still available, but + // previously a `evict_and_wait` was received. 
+ self.wanted_evicted.store(false, Ordering::Relaxed); - // no need to make the evict_and_wait wait for the actual download to complete + // error out any `evict_and_wait` drop(self.status.send(Status::Downloaded)); - - let timeline = self - .timeline - .upgrade() - .ok_or_else(|| DownloadError::TimelineShutdown)?; - - // count cancellations, which currently remain largely unexpected - let init_cancelled = - scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); - - let can_ever_evict = timeline.remote_client.as_ref().is_some(); - - // check if we really need to be downloaded; could have been already downloaded by a - // cancelled previous attempt. - let needs_download = self - .needs_download() - .await - .map_err(DownloadError::PreStatFailed); - - let needs_download = match needs_download { - Ok(reason) => reason, - Err(e) => { - scopeguard::ScopeGuard::into_inner(init_cancelled); - return Err(e); - } - }; - - let (permit, downloaded) = if let Some(reason) = needs_download { - if let NeedsDownload::NotFile(ft) = reason { - return Err(DownloadError::NotFile(ft)); - } - - // only reset this after we've decided we really need to download. otherwise it'd - // be impossible to mark cancelled downloads for eviction, like one could imagine - // we would like to do for prefetching which was not needed. - self.wanted_evicted.store(false, Ordering::Release); - - if !can_ever_evict { - scopeguard::ScopeGuard::into_inner(init_cancelled); - return Err(DownloadError::NoRemoteStorage); - } - - if let Some(ctx) = ctx { - let res = self.check_expected_download(ctx); - if let Err(e) = res { - scopeguard::ScopeGuard::into_inner(init_cancelled); - return Err(e); - } - } - - if !allow_download { - // this does look weird, but for LayerInner the "downloading" means also changing - // internal once related state ... - scopeguard::ScopeGuard::into_inner(init_cancelled); - return Err(DownloadError::DownloadRequired); - } - - tracing::info!(%reason, "downloading on-demand"); - - let permit = self.spawn_download_and_wait(timeline, permit).await; - - let permit = match permit { - Ok(permit) => permit, - Err(e) => { - scopeguard::ScopeGuard::into_inner(init_cancelled); - return Err(e); - } - }; - - (permit, true) - } else { - // the file is present locally, probably by a previous but cancelled call to - // get_or_maybe_download. alternatively we might be running without remote storage. 
- LAYER_IMPL_METRICS.inc_init_needed_no_download(); - - (permit, false) - }; - - scopeguard::ScopeGuard::into_inner(init_cancelled); - - if downloaded { - let since_last_eviction = self - .last_evicted_at - .lock() - .unwrap() - .take() - .map(|ts| ts.elapsed()); - - if let Some(since_last_eviction) = since_last_eviction { - LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction); - } - } - - let res = Arc::new(DownloadedLayer { - owner: Arc::downgrade(self), - kind: tokio::sync::OnceCell::default(), - version: next_version, - }); - - self.access_stats.record_residence_event( - LayerResidenceStatus::Resident, - LayerResidenceEventReason::ResidenceChange, - ); - - let waiters = self.inner.initializer_count(); - if waiters > 0 { - tracing::info!( - waiters, - "completing the on-demand download for other tasks" - ); - } - - Ok((ResidentOrWantedEvicted::Resident(res), permit)) - } - .instrument(tracing::info_span!("get_or_maybe_download", layer=%self)) - }; - - if let Some(init_permit) = init_permit.take() { - // use the already held initialization permit because it is impossible to hit the - // below paths anymore essentially limiting the max loop iterations to 2. - let (value, init_permit) = download(init_permit).await?; - let mut guard = self.inner.set(value, init_permit); - let (strong, _upgraded) = guard - .get_and_upgrade() - .expect("init creates strong reference, we held the init permit"); - return Ok(strong); - } - - let (weak, permit) = { - let mut locked = self.inner.get_or_init(download).await?; - - if let Some((strong, upgraded)) = locked.get_and_upgrade() { - if upgraded { - // when upgraded back, the Arc is still available, but - // previously a `evict_and_wait` was received. - self.wanted_evicted.store(false, Ordering::Relaxed); - - // error out any `evict_and_wait` - drop(self.status.send(Status::Downloaded)); - LAYER_IMPL_METRICS - .inc_eviction_cancelled(EvictionCancelled::UpgradedBackOnAccess); - } + LAYER_IMPL_METRICS + .inc_eviction_cancelled(EvictionCancelled::UpgradedBackOnAccess); return Ok(strong); - } else { + } + Ok(Err(mut guard)) => { // path to here: the evict_blocking is stuck on spawn_blocking queue. // // reset the contents, deactivating the eviction and causing a // EvictionCancelled::LostToDownload or EvictionCancelled::VersionCheckFailed. - locked.take_and_deinit() + let (weak, permit) = guard.take_and_deinit(); + (Some(weak), permit) } - }; - - // unlock first, then drop the weak, but because upgrade failed, we - // know it cannot be a problem. 
+ Err(permit) => (None, permit), + } + }; + if let Some(weak) = weak { + // only drop the weak after dropping the heavier_once_cell guard assert!( matches!(weak, ResidentOrWantedEvicted::WantedEvicted(..)), "unexpected {weak:?}, ResidentOrWantedEvicted::get_and_upgrade has a bug" ); - - init_permit = Some(permit); - - LAYER_IMPL_METRICS.inc_retried_get_or_maybe_download(); } + + async move { + // disable any scheduled but not yet running eviction deletions for this + let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed); + + // no need to make the evict_and_wait wait for the actual download to complete + drop(self.status.send(Status::Downloaded)); + + let timeline = self + .timeline + .upgrade() + .ok_or_else(|| DownloadError::TimelineShutdown)?; + + // count cancellations, which currently remain largely unexpected + let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); + + // check if we really need to be downloaded; could have been already downloaded by a + // cancelled previous attempt. + let needs_download = self + .needs_download() + .await + .map_err(DownloadError::PreStatFailed); + + let needs_download = match needs_download { + Ok(reason) => reason, + Err(e) => { + scopeguard::ScopeGuard::into_inner(init_cancelled); + return Err(e); + } + }; + + let Some(reason) = needs_download else { + scopeguard::ScopeGuard::into_inner(init_cancelled); + + // the file is present locally, probably by a previous but cancelled call to + // get_or_maybe_download. alternatively we might be running without remote storage. + LAYER_IMPL_METRICS.inc_init_needed_no_download(); + + let res = self.initialize_after_layer_is_on_disk(next_version, permit, false); + return Ok(res); + }; + + if let NeedsDownload::NotFile(ft) = reason { + scopeguard::ScopeGuard::into_inner(init_cancelled); + return Err(DownloadError::NotFile(ft)); + } + + // only reset this after we've decided we really need to download. otherwise it'd + // be impossible to mark cancelled downloads for eviction, like one could imagine + // we would like to do for prefetching which was not needed. + self.wanted_evicted.store(false, Ordering::Release); + + if timeline.remote_client.as_ref().is_none() { + scopeguard::ScopeGuard::into_inner(init_cancelled); + return Err(DownloadError::NoRemoteStorage); + } + + if let Some(ctx) = ctx { + let res = self.check_expected_download(ctx); + if let Err(e) = res { + scopeguard::ScopeGuard::into_inner(init_cancelled); + return Err(e); + } + } + + if !allow_download { + // this does look weird, but for LayerInner the "downloading" means also changing + // internal once related state ... + scopeguard::ScopeGuard::into_inner(init_cancelled); + return Err(DownloadError::DownloadRequired); + } + + tracing::info!(%reason, "downloading on-demand"); + + let permit = self.spawn_download_and_wait(timeline, permit).await; + + scopeguard::ScopeGuard::into_inner(init_cancelled); + + let permit = permit?; + + let res = self.initialize_after_layer_is_on_disk(next_version, permit, true); + Ok(res) + } + .instrument(tracing::info_span!("get_or_maybe_download", layer=%self)) + .await } /// Nag or fail per RequestContext policy @@ -1026,6 +977,59 @@ impl LayerInner { } } + /// Initializes the `Self::inner` to a "resident" state. + /// + /// Callers are assumed to ensure that the file is actually on disk with `Self::needs_download` + /// before calling this method. 
+ /// + /// If this method is ever made async, it needs to be cancellation safe so that no state + /// changes are made before we can write to the OnceCell in non-cancellable fashion. + fn initialize_after_layer_is_on_disk( + self: &Arc, + next_version: usize, + permit: heavier_once_cell::InitPermit, + downloaded: bool, + ) -> Arc { + debug_assert_current_span_has_tenant_and_timeline_id(); + + if downloaded { + let since_last_eviction = self + .last_evicted_at + .lock() + .unwrap() + .take() + .map(|ts| ts.elapsed()); + if let Some(since_last_eviction) = since_last_eviction { + // FIXME: this will not always be recorded correctly until #6028 (the no + // download needed branch above) + LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction); + } + } + + let res = Arc::new(DownloadedLayer { + owner: Arc::downgrade(self), + kind: tokio::sync::OnceCell::default(), + version: next_version, + }); + + // FIXME: this might now be double-accounted for !downloaded + self.access_stats.record_residence_event( + LayerResidenceStatus::Resident, + LayerResidenceEventReason::ResidenceChange, + ); + + let waiters = self.inner.initializer_count(); + if waiters > 0 { + tracing::info!(waiters, "completing the on-demand download for other tasks"); + } + + let value = ResidentOrWantedEvicted::Resident(res.clone()); + + self.inner.set(value, permit); + + res + } + async fn needs_download(&self) -> Result, std::io::Error> { match tokio::fs::metadata(&self.path).await { Ok(m) => Ok(self.is_file_present_and_good_size(&m).err()), @@ -1690,11 +1694,6 @@ impl LayerImplMetrics { self.rare_counters[RareEvent::RemoveOnDropFailed].inc(); } - /// Expected rare because requires a race with `evict_blocking` and `get_or_maybe_download`. - fn inc_retried_get_or_maybe_download(&self) { - self.rare_counters[RareEvent::RetriedGetOrMaybeDownload].inc(); - } - /// Expected rare because cancellations are unexpected, and failures are unexpected fn inc_download_failed_without_requester(&self) { self.rare_counters[RareEvent::DownloadFailedWithoutRequester].inc(); @@ -1779,7 +1778,6 @@ impl DeleteFailed { #[derive(enum_map::Enum)] enum RareEvent { RemoveOnDropFailed, - RetriedGetOrMaybeDownload, DownloadFailedWithoutRequester, UpgradedWantedEvicted, InitWithoutDownload, @@ -1793,7 +1791,6 @@ impl RareEvent { match self { RemoveOnDropFailed => "remove_on_drop_failed", - RetriedGetOrMaybeDownload => "retried_gomd", DownloadFailedWithoutRequester => "download_failed_without", UpgradedWantedEvicted => "raced_wanted_evicted", InitWithoutDownload => "init_needed_no_download", diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index b43534efd4..e7b2eb025a 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -254,6 +254,8 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() { let h = TenantHarness::create("residency_check_while_evict_and_wait_on_clogged_spawn_blocking") .unwrap(); let (tenant, ctx) = h.load().await; + let span = h.span(); + let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); let timeline = tenant .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) @@ -292,6 +294,7 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() { // because no actual eviction happened, we get to just reinitialize the DownloadedLayer layer .keep_resident() + .instrument(download_span) .await 
.expect("keep_resident should had reinitialized without downloading") .expect("ResidentLayer"); From 2726b1934ebd9d12d976ce9e9a41783d9ab238a8 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 20 Mar 2024 18:07:45 +0000 Subject: [PATCH 0438/1571] pageserver: extra debug for test_secondary_downloads failures (#7183) - Enable debug logs for this test - Add some debug logging detail in downloader.rs - Add an info-level message in scheduler.rs that makes it obvious if a command is waiting for an existing task rather than spawning a new one. --- pageserver/src/tenant/secondary/downloader.rs | 13 ++++++++++++- pageserver/src/tenant/secondary/scheduler.rs | 1 + test_runner/regress/test_pageserver_secondary.py | 4 ++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index a595096133..82af7ed83b 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -534,7 +534,11 @@ impl<'a> TenantDownloader<'a> { .await .maybe_fatal_err(&context_msg)?; - tracing::debug!("Wrote local heatmap to {}", heatmap_path); + tracing::debug!( + "Wrote local heatmap to {}, with {} timelines", + heatmap_path, + heatmap.timelines.len() + ); // Clean up any local layers that aren't in the heatmap. We do this first for all timelines, on the general // principle that deletions should be done before writes wherever possible, and so that we can use this @@ -547,6 +551,10 @@ impl<'a> TenantDownloader<'a> { // Download the layers in the heatmap for timeline in heatmap.timelines { if self.secondary_state.cancel.is_cancelled() { + tracing::debug!( + "Cancelled before downloading timeline {}", + timeline.timeline_id + ); return Ok(()); } @@ -764,10 +772,13 @@ impl<'a> TenantDownloader<'a> { } }; + tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len()); + // Download heatmap layers that are not present on local disk, or update their // access time if they are already present. for layer in timeline.layers { if self.secondary_state.cancel.is_cancelled() { + tracing::debug!("Cancelled -- dropping out of layer loop"); return Ok(()); } diff --git a/pageserver/src/tenant/secondary/scheduler.rs b/pageserver/src/tenant/secondary/scheduler.rs index 58bdb54161..3bd7be782e 100644 --- a/pageserver/src/tenant/secondary/scheduler.rs +++ b/pageserver/src/tenant/secondary/scheduler.rs @@ -300,6 +300,7 @@ where let tenant_shard_id = job.get_tenant_shard_id(); let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) { + tracing::info!("Command already running, waiting for it"); barrier } else { let running = self.spawn_now(job); diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index e664547b69..2e57136607 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -432,6 +432,10 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): - Eviction of layers on the attached location results in deletion on the secondary location as well. 
""" + + # For debug of https://github.com/neondatabase/neon/issues/6966 + neon_env_builder.rust_log_override = "DEBUG" + neon_env_builder.num_pageservers = 2 neon_env_builder.enable_pageserver_remote_storage( remote_storage_kind=RemoteStorageKind.MOCK_S3, From e961e0d3df1e7040221300fbb3d3e654257e4cad Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 20 Mar 2024 20:37:47 +0200 Subject: [PATCH 0439/1571] fix(Layer): always init after downloading in the spawned task (#7175) Before this PR, cancellation for `LayerInner::get_or_maybe_download` could occur so that we have downloaded the layer file in the filesystem, but because of the cancellation chance, we have not set the internal `LayerInner::inner` or initialized the state. With the detached init support introduced in #7135 and in place in #7152, we can now initialize the internal state after successfully downloading in the spawned task. The next PR will fix the remaining problems that this PR leaves: - `Layer::keep_resident` is still used because - `Layer::get_or_maybe_download` always cancels an eviction, even when canceled Split off from #7030. Stacked on top of #7152. Cc: #5331. --- pageserver/src/tenant/storage_layer/layer.rs | 350 +++++++++---------- 1 file changed, 171 insertions(+), 179 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index eed423c3e6..626fd69ef3 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -702,6 +702,11 @@ impl LayerInner { allow_download: bool, ctx: Option<&RequestContext>, ) -> Result, DownloadError> { + // get_or_init_detached can: + // - be fast (mutex lock) OR uncontested semaphore permit acquire + // - be slow (wait for semaphore permit or closing) + let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); + let (weak, permit) = { let locked = self .inner @@ -736,6 +741,8 @@ impl LayerInner { } }; + scopeguard::ScopeGuard::into_inner(init_cancelled); + if let Some(weak) = weak { // only drop the weak after dropping the heavier_once_cell guard assert!( @@ -744,86 +751,57 @@ impl LayerInner { ); } + let timeline = self + .timeline + .upgrade() + .ok_or_else(|| DownloadError::TimelineShutdown)?; + + // count cancellations, which currently remain largely unexpected + let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); + + // check if we really need to be downloaded; could have been already downloaded by a + // cancelled previous attempt. + let needs_download = self + .needs_download() + .await + .map_err(DownloadError::PreStatFailed); + + scopeguard::ScopeGuard::into_inner(init_cancelled); + + let needs_download = needs_download?; + + let Some(reason) = needs_download else { + // the file is present locally, probably by a previous but cancelled call to + // get_or_maybe_download. alternatively we might be running without remote storage. + LAYER_IMPL_METRICS.inc_init_needed_no_download(); + + return Ok(self.initialize_after_layer_is_on_disk(permit)); + }; + + if let NeedsDownload::NotFile(ft) = reason { + return Err(DownloadError::NotFile(ft)); + } + + if timeline.remote_client.as_ref().is_none() { + return Err(DownloadError::NoRemoteStorage); + } + + if let Some(ctx) = ctx { + self.check_expected_download(ctx)?; + } + + if !allow_download { + // this does look weird, but for LayerInner the "downloading" means also changing + // internal once related state ... 
+ return Err(DownloadError::DownloadRequired); + } + async move { - // disable any scheduled but not yet running eviction deletions for this - let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed); - - // no need to make the evict_and_wait wait for the actual download to complete - drop(self.status.send(Status::Downloaded)); - - let timeline = self - .timeline - .upgrade() - .ok_or_else(|| DownloadError::TimelineShutdown)?; - - // count cancellations, which currently remain largely unexpected - let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); - - // check if we really need to be downloaded; could have been already downloaded by a - // cancelled previous attempt. - let needs_download = self - .needs_download() - .await - .map_err(DownloadError::PreStatFailed); - - let needs_download = match needs_download { - Ok(reason) => reason, - Err(e) => { - scopeguard::ScopeGuard::into_inner(init_cancelled); - return Err(e); - } - }; - - let Some(reason) = needs_download else { - scopeguard::ScopeGuard::into_inner(init_cancelled); - - // the file is present locally, probably by a previous but cancelled call to - // get_or_maybe_download. alternatively we might be running without remote storage. - LAYER_IMPL_METRICS.inc_init_needed_no_download(); - - let res = self.initialize_after_layer_is_on_disk(next_version, permit, false); - return Ok(res); - }; - - if let NeedsDownload::NotFile(ft) = reason { - scopeguard::ScopeGuard::into_inner(init_cancelled); - return Err(DownloadError::NotFile(ft)); - } - - // only reset this after we've decided we really need to download. otherwise it'd - // be impossible to mark cancelled downloads for eviction, like one could imagine - // we would like to do for prefetching which was not needed. - self.wanted_evicted.store(false, Ordering::Release); - - if timeline.remote_client.as_ref().is_none() { - scopeguard::ScopeGuard::into_inner(init_cancelled); - return Err(DownloadError::NoRemoteStorage); - } - - if let Some(ctx) = ctx { - let res = self.check_expected_download(ctx); - if let Err(e) = res { - scopeguard::ScopeGuard::into_inner(init_cancelled); - return Err(e); - } - } - - if !allow_download { - // this does look weird, but for LayerInner the "downloading" means also changing - // internal once related state ... - scopeguard::ScopeGuard::into_inner(init_cancelled); - return Err(DownloadError::DownloadRequired); - } - tracing::info!(%reason, "downloading on-demand"); - let permit = self.spawn_download_and_wait(timeline, permit).await; - + let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); + let res = self.download_init_and_wait(timeline, permit).await?; scopeguard::ScopeGuard::into_inner(init_cancelled); - - let permit = permit?; - - let res = self.initialize_after_layer_is_on_disk(next_version, permit, true); Ok(res) } .instrument(tracing::info_span!("get_or_maybe_download", layer=%self)) @@ -857,11 +835,11 @@ impl LayerInner { } /// Actual download, at most one is executed at the time. 
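The function that follows runs the transfer in a spawned task and has the caller wait only on a oneshot channel, so a caller that gets cancelled abandons the wait, not the transfer; this patch additionally moves the post-download initialization into that same task. A minimal sketch of the shape, assuming `tokio` and with `expensive_work` as a stand-in for the real download:

```rust
/// Sketch: the spawned task owns the work; the caller merely awaits the
/// result. Dropping the returned future abandons the wait, not the work.
async fn run_detached() -> Result<u64, tokio::sync::oneshot::error::RecvError> {
    let (tx, rx) = tokio::sync::oneshot::channel();

    tokio::task::spawn(async move {
        let result = expensive_work().await;
        // The receiver may already be gone if the caller was cancelled;
        // the work has still completed and its side effects persist.
        let _ = tx.send(result);
    });

    rx.await
}

async fn expensive_work() -> u64 {
    tokio::time::sleep(std::time::Duration::from_millis(10)).await;
    42
}

#[tokio::main]
async fn main() {
    println!("{:?}", run_detached().await);
}
```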
- async fn spawn_download_and_wait( + async fn download_init_and_wait( self: &Arc, timeline: Arc, permit: heavier_once_cell::InitPermit, - ) -> Result { + ) -> Result, DownloadError> { debug_assert_current_span_has_tenant_and_timeline_id(); let (tx, rx) = tokio::sync::oneshot::channel(); @@ -873,66 +851,24 @@ impl LayerInner { .enter() .map_err(|_| DownloadError::DownloadCancelled)?; - tokio::task::spawn(async move { - + tokio::task::spawn( + async move { let _guard = guard; - let client = timeline - .remote_client - .as_ref() - .expect("checked above with have_remote_client"); + drop(this.status.send(Status::Downloaded)); - let result = client.download_layer_file( - &this.desc.filename(), - &this.metadata(), - &timeline.cancel - ) - .await; + let res = this.download_and_init(timeline, permit).await; - let result = match result { - Ok(size) => { - timeline.metrics.resident_physical_size_add(size); - Ok(()) - } - Err(e) => { - let consecutive_failures = - this.consecutive_failures.fetch_add(1, Ordering::Relaxed); - - let backoff = utils::backoff::exponential_backoff_duration_seconds( - consecutive_failures.min(u32::MAX as usize) as u32, - 1.5, - 60.0, - ); - - let backoff = std::time::Duration::from_secs_f64(backoff); - - tokio::select! { - _ = tokio::time::sleep(backoff) => {}, - _ = timeline.cancel.cancelled() => {}, - }; - - Err(e) - } - }; - - if let Err(res) = tx.send((result, permit)) { + if let Err(res) = tx.send(res) { match res { - (Ok(()), _) => { - // our caller is cancellation safe so this is fine; if someone - // else requests the layer, they'll find it already downloaded. - // - // See counter [`LayerImplMetrics::inc_init_needed_no_download`] - // - // FIXME(#6028): however, could be that we should consider marking the - // layer for eviction? alas, cannot: because only DownloadedLayer will - // handle that. - }, - (Err(e), _) => { - // our caller is cancellation safe, but we might be racing with - // another attempt to initialize. before we have cancellation - // token support: these attempts should converge regardless of - // their completion order. - tracing::error!("layer file download failed, and additionally failed to communicate this to caller: {e:?}"); + Ok(_res) => { + tracing::debug!("layer initialized, but caller has been cancelled"); + LAYER_IMPL_METRICS.inc_init_completed_without_requester(); + } + Err(e) => { + tracing::info!( + "layer file download failed, and caller has been cancelled: {e:?}" + ); LAYER_IMPL_METRICS.inc_download_failed_without_requester(); } } @@ -942,41 +878,100 @@ impl LayerInner { ); match rx.await { - Ok((Ok(()), permit)) => { - if let Some(reason) = self - .needs_download() - .await - .map_err(DownloadError::PostStatFailed)? - { - // this is really a bug in needs_download or remote timeline client - panic!("post-condition failed: needs_download returned {reason:?}"); - } - - self.consecutive_failures.store(0, Ordering::Relaxed); - tracing::info!(size=%self.desc.file_size, "on-demand download successful"); - - Ok(permit) - } - Ok((Err(e), _permit)) => { + Ok(Ok(res)) => Ok(res), + Ok(Err(e)) => { // sleep already happened in the spawned task, if it was not cancelled - let consecutive_failures = self.consecutive_failures.load(Ordering::Relaxed); - match e.downcast_ref::() { // If the download failed due to its cancellation token, // propagate the cancellation error upstream. 
Some(remote_storage::DownloadError::Cancelled) => { Err(DownloadError::DownloadCancelled) } - _ => { - tracing::error!(consecutive_failures, "layer file download failed: {e:#}"); - Err(DownloadError::DownloadFailed) - } + _ => Err(DownloadError::DownloadFailed), } } Err(_gone) => Err(DownloadError::DownloadCancelled), } } + async fn download_and_init( + self: &Arc, + timeline: Arc, + permit: heavier_once_cell::InitPermit, + ) -> anyhow::Result> { + let client = timeline + .remote_client + .as_ref() + .expect("checked before download_init_and_wait"); + + let result = client + .download_layer_file(&self.desc.filename(), &self.metadata(), &timeline.cancel) + .await; + + match result { + Ok(size) => { + assert_eq!(size, self.desc.file_size); + + match self.needs_download().await { + Ok(Some(reason)) => { + // this is really a bug in needs_download or remote timeline client + panic!("post-condition failed: needs_download returned {reason:?}"); + } + Ok(None) => { + // as expected + } + Err(e) => { + panic!("post-condition failed: needs_download errored: {e:?}"); + } + } + + tracing::info!(size=%self.desc.file_size, "on-demand download successful"); + timeline + .metrics + .resident_physical_size_add(self.desc.file_size); + self.consecutive_failures.store(0, Ordering::Relaxed); + + let since_last_eviction = self + .last_evicted_at + .lock() + .unwrap() + .take() + .map(|ts| ts.elapsed()); + if let Some(since_last_eviction) = since_last_eviction { + LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction); + } + + self.access_stats.record_residence_event( + LayerResidenceStatus::Resident, + LayerResidenceEventReason::ResidenceChange, + ); + + Ok(self.initialize_after_layer_is_on_disk(permit)) + } + Err(e) => { + let consecutive_failures = + 1 + self.consecutive_failures.fetch_add(1, Ordering::Relaxed); + + tracing::error!(consecutive_failures, "layer file download failed: {e:#}"); + + let backoff = utils::backoff::exponential_backoff_duration_seconds( + consecutive_failures.min(u32::MAX as usize) as u32, + 1.5, + 60.0, + ); + + let backoff = std::time::Duration::from_secs_f64(backoff); + + tokio::select! { + _ = tokio::time::sleep(backoff) => {}, + _ = timeline.cancel.cancelled() => {}, + }; + + Err(e) + } + } + } + /// Initializes the `Self::inner` to a "resident" state. /// /// Callers are assumed to ensure that the file is actually on disk with `Self::needs_download` @@ -986,25 +981,22 @@ impl LayerInner { /// changes are made before we can write to the OnceCell in non-cancellable fashion. fn initialize_after_layer_is_on_disk( self: &Arc, - next_version: usize, permit: heavier_once_cell::InitPermit, - downloaded: bool, ) -> Arc { debug_assert_current_span_has_tenant_and_timeline_id(); - if downloaded { - let since_last_eviction = self - .last_evicted_at - .lock() - .unwrap() - .take() - .map(|ts| ts.elapsed()); - if let Some(since_last_eviction) = since_last_eviction { - // FIXME: this will not always be recorded correctly until #6028 (the no - // download needed branch above) - LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction); - } - } + // disable any scheduled but not yet running eviction deletions for this + let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed); + + // only reset this after we've decided we really need to download. otherwise it'd + // be impossible to mark cancelled downloads for eviction, like one could imagine + // we would like to do for prefetching which was not needed. 
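The failure arm above sleeps for a capped exponential backoff before the next attempt, and does the sleep inside a `tokio::select!` against the timeline cancellation token so shutdown stays prompt. The exact formula lives in the repo's `utils::backoff` helper and is not reproduced here; the sketch below only shows the general shape, with the growth factor and cap as plain parameters:

```rust
/// Sketch of a capped exponential backoff; not the repo's exact formula.
fn backoff_seconds(consecutive_failures: u32, factor: f64, cap_seconds: f64) -> f64 {
    if consecutive_failures == 0 {
        return 0.0;
    }
    // clamp the exponent so the cast below can never misbehave
    factor
        .powi(consecutive_failures.min(1_000) as i32)
        .min(cap_seconds)
}

fn main() {
    for n in [0u32, 1, 2, 5, 10, 20] {
        println!("{n} consecutive failures -> sleep {:.1}s", backoff_seconds(n, 1.5, 60.0));
    }
}
```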
+ self.wanted_evicted.store(false, Ordering::Release); + + // re-send the notification we've already sent when we started to download, just so + // evict_and_wait does not need to wait for the download to complete. note that this is + // sent when initializing after finding the file on the disk. + drop(self.status.send(Status::Downloaded)); let res = Arc::new(DownloadedLayer { owner: Arc::downgrade(self), @@ -1012,15 +1004,9 @@ impl LayerInner { version: next_version, }); - // FIXME: this might now be double-accounted for !downloaded - self.access_stats.record_residence_event( - LayerResidenceStatus::Resident, - LayerResidenceEventReason::ResidenceChange, - ); - let waiters = self.inner.initializer_count(); if waiters > 0 { - tracing::info!(waiters, "completing the on-demand download for other tasks"); + tracing::info!(waiters, "completing layer init for other tasks"); } let value = ResidentOrWantedEvicted::Resident(res.clone()); @@ -1268,8 +1254,6 @@ pub(crate) enum DownloadError { DownloadCancelled, #[error("pre-condition: stat before download failed")] PreStatFailed(#[source] std::io::Error), - #[error("post-condition: stat after download failed")] - PostStatFailed(#[source] std::io::Error), } #[derive(Debug, PartialEq)] @@ -1694,6 +1678,12 @@ impl LayerImplMetrics { self.rare_counters[RareEvent::RemoveOnDropFailed].inc(); } + /// Expected rare just as cancellations are rare, but we could have cancellations separate from + /// the single caller which can start the download, so use this counter to separte them. + fn inc_init_completed_without_requester(&self) { + self.rare_counters[RareEvent::InitCompletedWithoutRequester].inc(); + } + /// Expected rare because cancellations are unexpected, and failures are unexpected fn inc_download_failed_without_requester(&self) { self.rare_counters[RareEvent::DownloadFailedWithoutRequester].inc(); @@ -1778,6 +1768,7 @@ impl DeleteFailed { #[derive(enum_map::Enum)] enum RareEvent { RemoveOnDropFailed, + InitCompletedWithoutRequester, DownloadFailedWithoutRequester, UpgradedWantedEvicted, InitWithoutDownload, @@ -1791,6 +1782,7 @@ impl RareEvent { match self { RemoveOnDropFailed => "remove_on_drop_failed", + InitCompletedWithoutRequester => "init_completed_without", DownloadFailedWithoutRequester => "download_failed_without", UpgradedWantedEvicted => "raced_wanted_evicted", InitWithoutDownload => "init_needed_no_download", From 34fa34d15c2a3fd13f3a475540991b5d9a63947a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 20 Mar 2024 19:39:46 +0100 Subject: [PATCH 0440/1571] Dump layer map json in test_gc_feedback.py (#7179) The layer map json is an interesting file for that test, so dump it to make debugging easier. 
--- test_runner/fixtures/pageserver/http.py | 11 +++++++++++ test_runner/performance/test_gc_feedback.py | 7 +++++++ 2 files changed, 18 insertions(+) diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 99ec894106..6aebfbc99c 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -626,6 +626,17 @@ class PageserverHttpClient(requests.Session, MetricsGetter): res_json = res.json() return res_json + def timeline_layer_map_info( + self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId + ): + log.info(f"Requesting layer map info of tenant {tenant_id}, timeline {timeline_id}") + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer", + ) + self.verbose_error(res) + res_json = res.json() + return res_json + def timeline_checkpoint( self, tenant_id: Union[TenantId, TenantShardId], diff --git a/test_runner/performance/test_gc_feedback.py b/test_runner/performance/test_gc_feedback.py index 48dd84fb06..be56203b26 100644 --- a/test_runner/performance/test_gc_feedback.py +++ b/test_runner/performance/test_gc_feedback.py @@ -1,3 +1,5 @@ +import json + import pytest from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker from fixtures.log_helper import log @@ -79,3 +81,8 @@ def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma zenbenchmark.record( "physical/logical ratio", physical_size / logical_size, "", MetricReport.LOWER_IS_BETTER ) + + layer_map_path = env.repo_dir / "layer-map.json" + log.info(f"Writing layer map to {layer_map_path}") + with layer_map_path.open("w") as f: + f.write(json.dumps(client.timeline_layer_map_info(tenant_id, timeline_id))) From 5f0d9f2360e10bb9e3edc4978eda898be62f9fcb Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 20 Mar 2024 14:40:48 -0400 Subject: [PATCH 0441/1571] fix: add safekeeper team to pgxn codeowners (#7170) `pgxn/` also contains WAL proposer code, so modifications to this directory should be able to be approved by the safekeeper team. Signed-off-by: Alex Chi Z --- CODEOWNERS | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CODEOWNERS b/CODEOWNERS index 5b601f0566..9a23e8c958 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,12 +1,13 @@ /compute_tools/ @neondatabase/control-plane @neondatabase/compute /control_plane/attachment_service @neondatabase/storage /libs/pageserver_api/ @neondatabase/storage -/libs/postgres_ffi/ @neondatabase/compute +/libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers /libs/remote_storage/ @neondatabase/storage /libs/safekeeper_api/ @neondatabase/safekeepers /libs/vm_monitor/ @neondatabase/autoscaling /pageserver/ @neondatabase/storage /pgxn/ @neondatabase/compute +/pgxn/neon/ @neondatabase/compute @neondatabase/safekeepers /proxy/ @neondatabase/proxy /safekeeper/ @neondatabase/safekeepers /vendor/ @neondatabase/compute From 55c4ef408b7e2305d1449c49d82d64ad095c949a Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 20 Mar 2024 15:22:25 -0400 Subject: [PATCH 0442/1571] safekeeper: correctly handle signals (#7167) errno is not preserved in the signal handler. This pull request fixes it. Maybe related: https://github.com/neondatabase/neon/issues/6969, but does not fix the flaky test problem. 
Signed-off-by: Alex Chi Z --- pgxn/neon/walproposer_pg.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index c46fd9b3ec..002bf4e2ce 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -549,9 +549,10 @@ walprop_pg_init_standalone_sync_safekeepers(void) static void walprop_sigusr2(SIGNAL_ARGS) { + int save_errno = errno; got_SIGUSR2 = true; - SetLatch(MyLatch); + errno = save_errno; } static void From 041b653a1a31c369b349b2a7799af04379bb583b Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 18 Mar 2024 15:34:16 -0500 Subject: [PATCH 0443/1571] Add state diagram for compute Models a compute's lifetime. --- compute_tools/README.md | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/compute_tools/README.md b/compute_tools/README.md index 22a7de7cb7..8d84031efc 100644 --- a/compute_tools/README.md +++ b/compute_tools/README.md @@ -32,6 +32,29 @@ compute_ctl -D /var/db/postgres/compute \ -b /usr/local/bin/postgres ``` +## State Diagram + +Computes can be in various states. Below is a diagram that details how a +compute moves between states. + +```mermaid +%% https://mermaid.js.org/syntax/stateDiagram.html +stateDiagram-v2 + [*] --> Empty : Compute spawned + Empty --> ConfigurationPending : Waiting for compute spec + ConfigurationPending --> Configuration : Received compute spec + Configuration --> Failed : Failed to configure the compute + Configuration --> Running : Compute has been configured + Empty --> Init : Compute spec is immediately available + Empty --> TerminationPending : Requested termination + Init --> Failed : Failed to start Postgres + Init --> Running : Started Postgres + Running --> TerminationPending : Requested termination + TerminationPending --> Terminated : Terminated compute + Failed --> [*] : Compute exited + Terminated --> [*] : Compute exited +``` + ## Tests Cargo formatter: From a95c41f463681eda15a89115f6f95aa20e55afa3 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 21 Mar 2024 00:42:38 +0200 Subject: [PATCH 0444/1571] fix(heavier_once_cell): take_and_deinit should take ownership (#7185) Small fix to remove confusing `mut` bindings. Builds upon #7175, split off from #7030. Cc: #5331. --- libs/utils/src/sync/heavier_once_cell.rs | 4 ++-- pageserver/src/tenant/storage_layer/layer.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs index a3aee45b58..8eee1f72a6 100644 --- a/libs/utils/src/sync/heavier_once_cell.rs +++ b/libs/utils/src/sync/heavier_once_cell.rs @@ -245,7 +245,7 @@ impl<'a, T> Guard<'a, T> { /// /// The permit will be on a semaphore part of the new internal value, and any following /// [`OnceCell::get_or_init`] will wait on it to complete. 
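The signature change just below is the core of this small patch: `take_and_deinit` now consumes the guard instead of borrowing it mutably, so callers cannot keep a `mut` binding around and touch a guard whose contents were already taken. A small stand-alone sketch of why by-value `self` is the safer shape (plain std, none of the repo's types):

```rust
struct Guard<T> {
    value: Option<T>,
}

impl<T> Guard<T> {
    /// Taking `mut self` moves the guard into the call; any later use of the
    /// binding is a compile-time error instead of a surprising runtime state.
    fn take_and_deinit(mut self) -> T {
        self.value.take().expect("a live guard always holds a value")
    }
}

fn main() {
    let guard = Guard { value: Some(42) };
    let v = guard.take_and_deinit();
    // guard.take_and_deinit(); // would not compile: `guard` has been moved
    println!("{v}");
}
```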
- pub fn take_and_deinit(&mut self) -> (T, InitPermit) { + pub fn take_and_deinit(mut self) -> (T, InitPermit) { let mut swapped = Inner::default(); let sem = swapped.init_semaphore.clone(); // acquire and forget right away, moving the control over to InitPermit @@ -543,7 +543,7 @@ mod tests { target.set(42, permit); let (_answer, permit) = { - let mut guard = target + let guard = target .get_or_init(|permit| async { Ok::<_, Infallible>((11, permit)) }) .await .unwrap(); diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 626fd69ef3..c503d0d454 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -729,7 +729,7 @@ impl LayerInner { return Ok(strong); } - Ok(Err(mut guard)) => { + Ok(Err(guard)) => { // path to here: the evict_blocking is stuck on spawn_blocking queue. // // reset the contents, deactivating the eviction and causing a @@ -1128,7 +1128,7 @@ impl LayerInner { let maybe_downloaded = self.inner.get(); let (_weak, permit) = match maybe_downloaded { - Some(mut guard) => { + Some(guard) => { if let ResidentOrWantedEvicted::WantedEvicted(_weak, version) = &*guard { if *version == only_version { guard.take_and_deinit() From 2206e14c261cb417a07e850a87fdf2b3cd9b07f8 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 21 Mar 2024 03:19:08 +0200 Subject: [PATCH 0445/1571] fix(layer): remove the need to repair internal state (#7030) ## Problem The current implementation of struct Layer supports canceled read requests, but those will leave the internal state such that a following `Layer::keep_resident` call will need to repair the state. In pathological cases seen during generation numbers resetting in staging or with too many in-progress on-demand downloads, this repair activity will need to wait for the download to complete, which stalls disk usage-based eviction. Similar stalls have been observed in staging near disk-full situations, where downloads failed because the disk was full. Fixes #6028 or the "layer is present on filesystem but not evictable" problems by: 1. not canceling pending evictions by a canceled `LayerInner::get_or_maybe_download` 2. completing post-download initialization of the `LayerInner::inner` from the download task Not canceling evictions above case (1) and always initializing (2) lead to plain `LayerInner::inner` always having the up-to-date information, which leads to the old `Layer::keep_resident` never having to wait for downloads to complete. Finally, the `Layer::keep_resident` is replaced with `Layer::is_likely_resident`. These fix #7145. ## Summary of changes - add a new test showing that a canceled get_or_maybe_download should not cancel the eviction - switch to using a `watch` internally rather than a `broadcast` to avoid hanging eviction while a download is ongoing - doc changes for new semantics and cleanup - fix `Layer::keep_resident` to use just `self.0.inner.get()` as truth as `Layer::is_likely_resident` - remove `LayerInner::wanted_evicted` boolean as no longer needed Builds upon: #7185. Cc: #5331. 
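One of the bullets above is the switch from a `broadcast` to a `tokio::sync::watch` channel for the layer status. The reason it helps: a watch receiver never lags and always exposes the latest value via `borrow_and_update`, which is what `evict_and_wait` needs, since it only cares where the layer ended up, not about every intermediate transition. A minimal sketch of that wait-with-timeout shape, with a simplified status enum standing in for the real one:

```rust
use std::time::Duration;
use tokio::sync::watch;

#[derive(Clone, Copy, Debug, PartialEq)]
enum Status {
    Resident,
    Downloading,
    Evicted,
}

/// Sketch: wait (bounded) for the next change and act on the *latest* value;
/// unlike a broadcast receiver, a watch receiver cannot fall behind.
async fn wait_for_eviction(
    rx: &mut watch::Receiver<Status>,
    timeout: Duration,
) -> Result<(), &'static str> {
    if *rx.borrow_and_update() == Status::Evicted {
        return Ok(());
    }
    let outcome = tokio::time::timeout(timeout, rx.changed()).await;
    match outcome {
        Err(_elapsed) => Err("timed out"),
        Ok(Err(_sender_dropped)) => Err("status sender gone"),
        Ok(Ok(())) => match *rx.borrow_and_update() {
            // evicted, or a new download superseded it: either way, stop waiting
            Status::Evicted | Status::Downloading => Ok(()),
            Status::Resident => Err("layer was accessed again"),
        },
    }
}

#[tokio::main]
async fn main() {
    let (tx, mut rx) = watch::channel(Status::Resident);
    tokio::spawn(async move {
        tx.send_replace(Status::Evicted);
    });
    println!("{:?}", wait_for_eviction(&mut rx, Duration::from_secs(1)).await);
}
```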
--- pageserver/src/tenant/mgr.rs | 7 +- pageserver/src/tenant/storage_layer/layer.rs | 636 +++++++++++++----- .../tenant/storage_layer/layer/failpoints.rs | 119 ++++ .../src/tenant/storage_layer/layer/tests.rs | 593 +++++++++++++--- pageserver/src/tenant/timeline.rs | 13 +- .../src/tenant/timeline/eviction_task.rs | 23 +- .../src/tenant/timeline/layer_manager.rs | 28 +- 7 files changed, 1088 insertions(+), 331 deletions(-) create mode 100644 pageserver/src/tenant/storage_layer/layer/failpoints.rs diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index f456ca3006..7e0092d5b6 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -2,7 +2,6 @@ //! page server. use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf}; -use futures::stream::StreamExt; use itertools::Itertools; use pageserver_api::key::Key; use pageserver_api::models::ShardParameters; @@ -1662,9 +1661,9 @@ impl TenantManager { .layers .read() .await - .resident_layers() - .collect::>() - .await; + .likely_resident_layers() + .collect::>(); + for layer in timeline_layers { let relative_path = layer .local_path() diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index c503d0d454..8ba37b5a86 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -32,6 +32,9 @@ use utils::generation::Generation; #[cfg(test)] mod tests; +#[cfg(test)] +mod failpoints; + /// A Layer contains all data in a "rectangle" consisting of a range of keys and /// range of LSNs. /// @@ -46,7 +49,41 @@ mod tests; /// An image layer is a snapshot of all the data in a key-range, at a single /// LSN. /// -/// This type models the on-disk layers, which can be evicted and on-demand downloaded. +/// This type models the on-disk layers, which can be evicted and on-demand downloaded. As a +/// general goal, read accesses should always win eviction and eviction should not wait for +/// download. +/// +/// ### State transitions +/// +/// The internal state of `Layer` is composed of most importantly the on-filesystem state and the +/// [`ResidentOrWantedEvicted`] enum. On-filesystem state can be either present (fully downloaded, +/// right size) or deleted. +/// +/// Reads will always win requests to evict until `wait_for_turn_and_evict` has acquired the +/// `heavier_once_cell::InitPermit` and has started to `evict_blocking`. Before the +/// `heavier_once_cell::InitPermit` has been acquired, any read request +/// (`get_or_maybe_download`) can "re-initialize" using the existing downloaded file and thus +/// cancelling the eviction. 
+/// +/// ```text +/// +-----------------+ get_or_maybe_download +--------------------------------+ +/// | not initialized |--------------------------->| Resident(Arc) | +/// | ENOENT | /->| | +/// +-----------------+ | +--------------------------------+ +/// ^ | | ^ +/// | get_or_maybe_download | | | get_or_maybe_download, either: +/// evict_blocking | /-------------------------/ | | - upgrade weak to strong +/// | | | | - re-initialize without download +/// | | evict_and_wait | | +/// +-----------------+ v | +/// | not initialized | on_downloaded_layer_drop +--------------------------------------+ +/// | file is present |<---------------------------| WantedEvicted(Weak) | +/// +-----------------+ +--------------------------------------+ +/// ``` +/// +/// ### Unsupported +/// +/// - Evicting by the operator deleting files from the filesystem /// /// [`InMemoryLayer`]: super::inmemory_layer::InMemoryLayer #[derive(Clone)] @@ -211,8 +248,7 @@ impl Layer { /// /// Timeout is mandatory, because waiting for eviction is only needed for our tests; eviction /// will happen regardless the future returned by this method completing unless there is a - /// read access (currently including [`Layer::keep_resident`]) before eviction gets to - /// complete. + /// read access before eviction gets to complete. /// /// Technically cancellation safe, but cancelling might shift the viewpoint of what generation /// of download-evict cycle on retry. @@ -307,21 +343,28 @@ impl Layer { /// Assuming the layer is already downloaded, returns a guard which will prohibit eviction /// while the guard exists. /// - /// Returns None if the layer is currently evicted. - pub(crate) async fn keep_resident(&self) -> anyhow::Result> { - let downloaded = match self.0.get_or_maybe_download(false, None).await { - Ok(d) => d, - // technically there are a lot of possible errors, but in practice it should only be - // DownloadRequired which is tripped up. could work to improve this situation - // statically later. - Err(DownloadError::DownloadRequired) => return Ok(None), - Err(e) => return Err(e.into()), - }; + /// Returns None if the layer is currently evicted or becoming evicted. + #[cfg(test)] + pub(crate) async fn keep_resident(&self) -> Option { + let downloaded = self.0.inner.get().and_then(|rowe| rowe.get())?; - Ok(Some(ResidentLayer { + Some(ResidentLayer { downloaded, owner: self.clone(), - })) + }) + } + + /// Weak indicator of is the layer resident or not. Good enough for eviction, which can deal + /// with `EvictionError::NotFound`. + /// + /// Returns `true` if this layer might be resident, or `false`, if it most likely evicted or + /// will be unless a read happens soon. + pub(crate) fn is_likely_resident(&self) -> bool { + self.0 + .inner + .get() + .map(|rowe| rowe.is_likely_resident()) + .unwrap_or(false) } /// Downloads if necessary and creates a guard, which will keep this layer from being evicted. @@ -371,11 +414,11 @@ impl Layer { /// separatedly. #[cfg(any(feature = "testing", test))] pub(crate) fn wait_drop(&self) -> impl std::future::Future + 'static { - let mut rx = self.0.status.subscribe(); + let mut rx = self.0.status.as_ref().unwrap().subscribe(); async move { loop { - if let Err(tokio::sync::broadcast::error::RecvError::Closed) = rx.recv().await { + if rx.changed().await.is_err() { break; } } @@ -397,6 +440,32 @@ enum ResidentOrWantedEvicted { } impl ResidentOrWantedEvicted { + /// Non-mutating access to the a DownloadedLayer, if possible. 
+ /// + /// This is not used on the read path (anything that calls + /// [`LayerInner::get_or_maybe_download`]) because it was decided that reads always win + /// evictions, and part of that winning is using [`ResidentOrWantedEvicted::get_and_upgrade`]. + #[cfg(test)] + fn get(&self) -> Option> { + match self { + ResidentOrWantedEvicted::Resident(strong) => Some(strong.clone()), + ResidentOrWantedEvicted::WantedEvicted(weak, _) => weak.upgrade(), + } + } + + /// Best-effort query for residency right now, not as strong guarantee as receiving a strong + /// reference from `ResidentOrWantedEvicted::get`. + fn is_likely_resident(&self) -> bool { + match self { + ResidentOrWantedEvicted::Resident(_) => true, + ResidentOrWantedEvicted::WantedEvicted(weak, _) => weak.strong_count() > 0, + } + } + + /// Upgrades any weak to strong if possible. + /// + /// Returns a strong reference if possible, along with a boolean telling if an upgrade + /// happened. fn get_and_upgrade(&mut self) -> Option<(Arc, bool)> { match self { ResidentOrWantedEvicted::Resident(strong) => Some((strong.clone(), false)), @@ -417,7 +486,7 @@ impl ResidentOrWantedEvicted { /// /// Returns `Some` if this was the first time eviction was requested. Care should be taken to /// drop the possibly last strong reference outside of the mutex of - /// heavier_once_cell::OnceCell. + /// [`heavier_once_cell::OnceCell`]. fn downgrade(&mut self) -> Option> { match self { ResidentOrWantedEvicted::Resident(strong) => { @@ -445,6 +514,9 @@ struct LayerInner { desc: PersistentLayerDesc, /// Timeline access is needed for remote timeline client and metrics. + /// + /// There should not be an access to timeline for any reason without entering the + /// [`Timeline::gate`] at the same time. timeline: Weak, /// Cached knowledge of [`Timeline::remote_client`] being `Some`. @@ -453,27 +525,38 @@ struct LayerInner { access_stats: LayerAccessStats, /// This custom OnceCell is backed by std mutex, but only held for short time periods. - /// Initialization and deinitialization are done while holding a permit. + /// + /// Filesystem changes (download, evict) are only done while holding a permit which the + /// `heavier_once_cell` provides. + /// + /// A number of fields in `Layer` are meant to only be updated when holding the InitPermit, but + /// possibly read while not holding it. inner: heavier_once_cell::OnceCell, /// Do we want to delete locally and remotely this when `LayerInner` is dropped wanted_deleted: AtomicBool, - /// Do we want to evict this layer as soon as possible? After being set to `true`, all accesses - /// will try to downgrade [`ResidentOrWantedEvicted`], which will eventually trigger - /// [`LayerInner::on_downloaded_layer_drop`]. - wanted_evicted: AtomicBool, - - /// Version is to make sure we will only evict a specific download of a file. + /// Version is to make sure we will only evict a specific initialization of the downloaded file. /// - /// Incremented for each download, stored in `DownloadedLayer::version` or + /// Incremented for each initialization, stored in `DownloadedLayer::version` or /// `ResidentOrWantedEvicted::WantedEvicted`. version: AtomicUsize, - /// Allow subscribing to when the layer actually gets evicted. - status: tokio::sync::broadcast::Sender, + /// Allow subscribing to when the layer actually gets evicted, a non-cancellable download + /// starts, or completes. + /// + /// Updates must only be posted while holding the InitPermit or the heavier_once_cell::Guard. 
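`is_likely_resident` above deliberately answers from the `Weak` alone: `Weak::strong_count` reports whether any strong reference still exists without creating one, i.e. without extending the value's lifetime, which is exactly the "good enough for eviction" signal the comments ask for. A tiny std-only sketch of the heuristic:

```rust
use std::sync::{Arc, Weak};

/// Sketch: a Weak can report whether the value is still alive without
/// upgrading to an Arc and thereby keeping the value alive.
fn likely_alive<T>(weak: &Weak<T>) -> bool {
    weak.strong_count() > 0
}

fn main() {
    let strong = Arc::new("resident layer");
    let weak = Arc::downgrade(&strong);
    assert!(likely_alive(&weak));
    drop(strong);
    assert!(!likely_alive(&weak)); // value dropped, only the Weak remains
}
```

It is only a hint: another thread may drop the last strong reference right after the check, which is why callers are expected to tolerate a stale answer.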
+ /// Holding the InitPermit is the only time we can do state transitions, but we also need to + /// cancel a pending eviction on upgrading a [`ResidentOrWantedEvicted::WantedEvicted`] back to + /// [`ResidentOrWantedEvicted::Resident`] on access. + /// + /// The sender is wrapped in an Option to facilitate moving it out on [`LayerInner::drop`]. + status: Option>, - /// Counter for exponential backoff with the download + /// Counter for exponential backoff with the download. + /// + /// This is atomic only for the purposes of having additional data only accessed while holding + /// the InitPermit. consecutive_failures: AtomicUsize, /// The generation of this Layer. @@ -491,7 +574,13 @@ struct LayerInner { /// a shard split since the layer was originally written. shard: ShardIndex, + /// When the Layer was last evicted but has not been downloaded since. + /// + /// This is used solely for updating metrics. See [`LayerImplMetrics::redownload_after`]. last_evicted_at: std::sync::Mutex>, + + #[cfg(test)] + failpoints: std::sync::Mutex>, } impl std::fmt::Display for LayerInner { @@ -508,16 +597,16 @@ impl AsLayerDesc for LayerInner { #[derive(Debug, Clone, Copy)] enum Status { + Resident, Evicted, - Downloaded, + Downloading, } impl Drop for LayerInner { fn drop(&mut self) { if !*self.wanted_deleted.get_mut() { - // should we try to evict if the last wish was for eviction? - // feels like there's some hazard of overcrowding near shutdown near by, but we don't - // run drops during shutdown (yet) + // should we try to evict if the last wish was for eviction? seems more like a hazard + // than a clear win. return; } @@ -528,9 +617,9 @@ impl Drop for LayerInner { let file_size = self.layer_desc().file_size; let timeline = self.timeline.clone(); let meta = self.metadata(); - let status = self.status.clone(); + let status = self.status.take(); - crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || { + Self::spawn_blocking(move || { let _g = span.entered(); // carry this until we are finished for [`Layer::wait_drop`] support @@ -605,12 +694,16 @@ impl LayerInner { .timeline_path(&timeline.tenant_shard_id, &timeline.timeline_id) .join(desc.filename().to_string()); - let (inner, version) = if let Some(inner) = downloaded { + let (inner, version, init_status) = if let Some(inner) = downloaded { let version = inner.version; let resident = ResidentOrWantedEvicted::Resident(inner); - (heavier_once_cell::OnceCell::new(resident), version) + ( + heavier_once_cell::OnceCell::new(resident), + version, + Status::Resident, + ) } else { - (heavier_once_cell::OnceCell::default(), 0) + (heavier_once_cell::OnceCell::default(), 0, Status::Evicted) }; LayerInner { @@ -621,14 +714,15 @@ impl LayerInner { have_remote_client: timeline.remote_client.is_some(), access_stats, wanted_deleted: AtomicBool::new(false), - wanted_evicted: AtomicBool::new(false), inner, version: AtomicUsize::new(version), - status: tokio::sync::broadcast::channel(1).0, + status: Some(tokio::sync::watch::channel(init_status).0), consecutive_failures: AtomicUsize::new(0), generation, shard, last_evicted_at: std::sync::Mutex::default(), + #[cfg(test)] + failpoints: Default::default(), } } @@ -644,20 +738,34 @@ impl LayerInner { /// Cancellation safe, however dropping the future and calling this method again might result /// in a new attempt to evict OR join the previously started attempt. 
+ #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, ret, err(level = tracing::Level::DEBUG), fields(layer=%self))] pub(crate) async fn evict_and_wait(&self, timeout: Duration) -> Result<(), EvictionError> { - use tokio::sync::broadcast::error::RecvError; - assert!(self.have_remote_client); - let mut rx = self.status.subscribe(); + let mut rx = self.status.as_ref().unwrap().subscribe(); + + { + let current = rx.borrow_and_update(); + match &*current { + Status::Resident => { + // we might get lucky and evict this; continue + } + Status::Evicted | Status::Downloading => { + // it is already evicted + return Err(EvictionError::NotFound); + } + } + } let strong = { match self.inner.get() { - Some(mut either) => { - self.wanted_evicted.store(true, Ordering::Relaxed); - either.downgrade() + Some(mut either) => either.downgrade(), + None => { + // we already have a scheduled eviction, which just has not gotten to run yet. + // it might still race with a read access, but that could also get cancelled, + // so let's say this is not evictable. + return Err(EvictionError::NotFound); } - None => return Err(EvictionError::NotFound), } }; @@ -673,26 +781,26 @@ impl LayerInner { LAYER_IMPL_METRICS.inc_started_evictions(); } - match tokio::time::timeout(timeout, rx.recv()).await { - Ok(Ok(Status::Evicted)) => Ok(()), - Ok(Ok(Status::Downloaded)) => Err(EvictionError::Downloaded), - Ok(Err(RecvError::Closed)) => { - unreachable!("sender cannot be dropped while we are in &self method") - } - Ok(Err(RecvError::Lagged(_))) => { - // this is quite unlikely, but we are blocking a lot in the async context, so - // we might be missing this because we are stuck on a LIFO slot on a thread - // which is busy blocking for a 1TB database create_image_layers. - // - // use however late (compared to the initial expressing of wanted) as the - // "outcome" now - LAYER_IMPL_METRICS.inc_broadcast_lagged(); - match self.inner.get() { - Some(_) => Err(EvictionError::Downloaded), - None => Ok(()), - } - } - Err(_timeout) => Err(EvictionError::Timeout), + let changed = rx.changed(); + let changed = tokio::time::timeout(timeout, changed).await; + + let Ok(changed) = changed else { + return Err(EvictionError::Timeout); + }; + + let _: () = changed.expect("cannot be closed, because we are holding a strong reference"); + + let current = rx.borrow_and_update(); + + match &*current { + // the easiest case + Status::Evicted => Ok(()), + // it surely was evicted in between, but then there was a new access now; we can't know + // if it'll succeed so lets just call it evicted + Status::Downloading => Ok(()), + // either the download which was started after eviction completed already, or it was + // never evicted + Status::Resident => Err(EvictionError::Downloaded), } } @@ -702,38 +810,38 @@ impl LayerInner { allow_download: bool, ctx: Option<&RequestContext>, ) -> Result, DownloadError> { - // get_or_init_detached can: - // - be fast (mutex lock) OR uncontested semaphore permit acquire - // - be slow (wait for semaphore permit or closing) - let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); - let (weak, permit) = { + // get_or_init_detached can: + // - be fast (mutex lock) OR uncontested semaphore permit acquire + // - be slow (wait for semaphore permit or closing) + let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); + let locked = self .inner .get_or_init_detached() .await .map(|mut guard| guard.get_and_upgrade().ok_or(guard)); + 
scopeguard::ScopeGuard::into_inner(init_cancelled); + match locked { // this path could had been a RwLock::read Ok(Ok((strong, upgraded))) if !upgraded => return Ok(strong), Ok(Ok((strong, _))) => { // when upgraded back, the Arc is still available, but - // previously a `evict_and_wait` was received. - self.wanted_evicted.store(false, Ordering::Relaxed); - - // error out any `evict_and_wait` - drop(self.status.send(Status::Downloaded)); + // previously a `evict_and_wait` was received. this is the only place when we + // send out an update without holding the InitPermit. + // + // note that we also have dropped the Guard; this is fine, because we just made + // a state change and are holding a strong reference to be returned. + self.status.as_ref().unwrap().send_replace(Status::Resident); LAYER_IMPL_METRICS .inc_eviction_cancelled(EvictionCancelled::UpgradedBackOnAccess); return Ok(strong); } Ok(Err(guard)) => { - // path to here: the evict_blocking is stuck on spawn_blocking queue. - // - // reset the contents, deactivating the eviction and causing a - // EvictionCancelled::LostToDownload or EvictionCancelled::VersionCheckFailed. + // path to here: we won the eviction, the file should still be on the disk. let (weak, permit) = guard.take_and_deinit(); (Some(weak), permit) } @@ -741,8 +849,6 @@ impl LayerInner { } }; - scopeguard::ScopeGuard::into_inner(init_cancelled); - if let Some(weak) = weak { // only drop the weak after dropping the heavier_once_cell guard assert!( @@ -759,8 +865,11 @@ impl LayerInner { // count cancellations, which currently remain largely unexpected let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); - // check if we really need to be downloaded; could have been already downloaded by a - // cancelled previous attempt. + // check if we really need to be downloaded: this can happen if a read access won the + // semaphore before eviction. + // + // if we are cancelled while doing this `stat` the `self.inner` will be uninitialized. a + // pending eviction will try to evict even upon finding an uninitialized `self.inner`. let needs_download = self .needs_download() .await @@ -771,13 +880,20 @@ impl LayerInner { let needs_download = needs_download?; let Some(reason) = needs_download else { - // the file is present locally, probably by a previous but cancelled call to - // get_or_maybe_download. alternatively we might be running without remote storage. + // the file is present locally because eviction has not had a chance to run yet + + #[cfg(test)] + self.failpoint(failpoints::FailpointKind::AfterDeterminingLayerNeedsNoDownload) + .await?; + LAYER_IMPL_METRICS.inc_init_needed_no_download(); return Ok(self.initialize_after_layer_is_on_disk(permit)); }; + // we must download; getting cancelled before spawning the download is not an issue as + // any still running eviction would not find anything to evict. + if let NeedsDownload::NotFile(ft) = reason { return Err(DownloadError::NotFile(ft)); } @@ -791,8 +907,7 @@ impl LayerInner { } if !allow_download { - // this does look weird, but for LayerInner the "downloading" means also changing - // internal once related state ... 
+ // this is only used from tests, but it is hard to test without the boolean return Err(DownloadError::DownloadRequired); } @@ -851,11 +966,22 @@ impl LayerInner { .enter() .map_err(|_| DownloadError::DownloadCancelled)?; - tokio::task::spawn( + Self::spawn( async move { let _guard = guard; - drop(this.status.send(Status::Downloaded)); + // now that we have commited to downloading, send out an update to: + // - unhang any pending eviction + // - break out of evict_and_wait + this.status + .as_ref() + .unwrap() + .send_replace(Status::Downloading); + + #[cfg(test)] + this.failpoint(failpoints::FailpointKind::WaitBeforeDownloading) + .await + .unwrap(); let res = this.download_and_init(timeline, permit).await; @@ -887,6 +1013,8 @@ impl LayerInner { Some(remote_storage::DownloadError::Cancelled) => { Err(DownloadError::DownloadCancelled) } + // FIXME: this is not embedding the error because historically it would had + // been output to compute, however that is no longer the case. _ => Err(DownloadError::DownloadFailed), } } @@ -985,18 +1113,9 @@ impl LayerInner { ) -> Arc { debug_assert_current_span_has_tenant_and_timeline_id(); - // disable any scheduled but not yet running eviction deletions for this + // disable any scheduled but not yet running eviction deletions for this initialization let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed); - - // only reset this after we've decided we really need to download. otherwise it'd - // be impossible to mark cancelled downloads for eviction, like one could imagine - // we would like to do for prefetching which was not needed. - self.wanted_evicted.store(false, Ordering::Release); - - // re-send the notification we've already sent when we started to download, just so - // evict_and_wait does not need to wait for the download to complete. note that this is - // sent when initializing after finding the file on the disk. - drop(self.status.send(Status::Downloaded)); + self.status.as_ref().unwrap().send_replace(Status::Resident); let res = Arc::new(DownloadedLayer { owner: Arc::downgrade(self), @@ -1049,9 +1168,11 @@ impl LayerInner { fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo { let layer_file_name = self.desc.filename().file_name(); - // this is not accurate: we could have the file locally but there was a cancellation - // and now we are not in sync, or we are currently downloading it. - let remote = self.inner.get().is_none(); + let resident = self + .inner + .get() + .map(|rowe| rowe.is_likely_resident()) + .unwrap_or(false); let access_stats = self.access_stats.as_api_model(reset); @@ -1063,7 +1184,7 @@ impl LayerInner { layer_file_size: self.desc.file_size, lsn_start: lsn_range.start, lsn_end: lsn_range.end, - remote, + remote: !resident, access_stats, } } else { @@ -1073,94 +1194,195 @@ impl LayerInner { layer_file_name, layer_file_size: self.desc.file_size, lsn_start: lsn, - remote, + remote: !resident, access_stats, } } } /// `DownloadedLayer` is being dropped, so it calls this method. 
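The drop hook that follows hands eviction off to a background task. Its previous incarnation used the classic downgrade-then-upgrade hand-off, which is worth keeping in mind while reading the diff: the queued job is given only a `Weak`, so it can never keep the owner alive, and it turns into a no-op if the owner is gone by the time the job runs. A minimal sketch of that hand-off, assuming `tokio` and a placeholder `Stuff` type:

```rust
use std::sync::{Arc, Weak};

struct Stuff;

impl Stuff {
    fn evict_blocking(&self) {
        println!("evicting");
    }
}

/// Sketch: the queued job holds only a Weak, so it cannot delay the owner's
/// drop, and it silently bails out if the owner was dropped in the meantime.
fn schedule_eviction(owner: &Arc<Stuff>) {
    let weak: Weak<Stuff> = Arc::downgrade(owner);
    tokio::task::spawn_blocking(move || {
        let Some(owner) = weak.upgrade() else {
            return; // owner already gone while we sat in the queue
        };
        owner.evict_blocking();
    });
}

#[tokio::main]
async fn main() {
    let owner = Arc::new(Stuff);
    schedule_eviction(&owner);
    // give the blocking pool a moment to run the job in this toy example
    tokio::time::sleep(std::time::Duration::from_millis(50)).await;
}
```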
- fn on_downloaded_layer_drop(self: Arc, version: usize) { - let evict = self.wanted_evicted.load(Ordering::Acquire); + fn on_downloaded_layer_drop(self: Arc, only_version: usize) { let can_evict = self.have_remote_client; - if can_evict && evict { - let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, %version); + // we cannot know without inspecting LayerInner::inner if we should evict or not, even + // though here it is very likely + let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, version=%only_version); - // downgrade for queueing, in case there's a tear down already ongoing we should not - // hold it alive. - let this = Arc::downgrade(&self); - drop(self); - - // NOTE: this scope *must* never call `self.inner.get` because evict_and_wait might - // drop while the `self.inner` is being locked, leading to a deadlock. - - crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || { - let _g = span.entered(); - - // if LayerInner is already dropped here, do nothing because the delete on drop - // has already ran while we were in queue - let Some(this) = this.upgrade() else { - LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone); - return; - }; - match this.evict_blocking(version) { - Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(), - Err(reason) => LAYER_IMPL_METRICS.inc_eviction_cancelled(reason), - } + if !can_evict { + // it would be nice to assert this case out, but we are in drop + span.in_scope(|| { + tracing::error!("bug in struct Layer: ResidentOrWantedEvicted has been downgraded while we have no remote storage"); }); + return; } + + // NOTE: this scope *must* never call `self.inner.get` because evict_and_wait might + // drop while the `self.inner` is being locked, leading to a deadlock. + + let start_evicting = async move { + #[cfg(test)] + self.failpoint(failpoints::FailpointKind::WaitBeforeStartingEvicting) + .await + .expect("failpoint should not have errored"); + + tracing::debug!("eviction started"); + + let res = self.wait_for_turn_and_evict(only_version).await; + // metrics: ignore the Ok branch, it is not done yet + if let Err(e) = res { + tracing::debug!(res=?Err::<(), _>(&e), "eviction completed"); + LAYER_IMPL_METRICS.inc_eviction_cancelled(e); + } + }; + + Self::spawn(start_evicting.instrument(span)); } - fn evict_blocking(&self, only_version: usize) -> Result<(), EvictionCancelled> { - // deleted or detached timeline, don't do anything. 
- let Some(timeline) = self.timeline.upgrade() else { - return Err(EvictionCancelled::TimelineGone); - }; + async fn wait_for_turn_and_evict( + self: Arc, + only_version: usize, + ) -> Result<(), EvictionCancelled> { + fn is_good_to_continue(status: &Status) -> Result<(), EvictionCancelled> { + use Status::*; + match status { + Resident => Ok(()), + Evicted => Err(EvictionCancelled::UnexpectedEvictedState), + Downloading => Err(EvictionCancelled::LostToDownload), + } + } + + let timeline = self + .timeline + .upgrade() + .ok_or(EvictionCancelled::TimelineGone)?; + + let mut rx = self + .status + .as_ref() + .expect("LayerInner cannot be dropped, holding strong ref") + .subscribe(); + + is_good_to_continue(&rx.borrow_and_update())?; let Ok(_gate) = timeline.gate.enter() else { return Err(EvictionCancelled::TimelineGone); }; - // to avoid starting a new download while we evict, keep holding on to the - // permit. - let _permit = { - let maybe_downloaded = self.inner.get(); + let permit = { + // we cannot just `std::fs::remove_file` because there might already be an + // get_or_maybe_download which will inspect filesystem and reinitialize. filesystem + // operations must be done while holding the heavier_once_cell::InitPermit + let mut wait = std::pin::pin!(self.inner.get_or_init_detached()); - let (_weak, permit) = match maybe_downloaded { - Some(guard) => { - if let ResidentOrWantedEvicted::WantedEvicted(_weak, version) = &*guard { - if *version == only_version { - guard.take_and_deinit() - } else { - // this was not for us; maybe there's another eviction job - // TODO: does it make any sense to stall here? unique versions do not - // matter, we only want to make sure not to evict a resident, which we - // are not doing. - return Err(EvictionCancelled::VersionCheckFailed); - } - } else { - return Err(EvictionCancelled::AlreadyReinitialized); + let waited = loop { + // we must race to the Downloading starting, otherwise we would have to wait until the + // completion of the download. waiting for download could be long and hinder our + // efforts to alert on "hanging" evictions. + tokio::select! { + res = &mut wait => break res, + _ = rx.changed() => { + is_good_to_continue(&rx.borrow_and_update())?; + // two possibilities for Status::Resident: + // - the layer was found locally from disk by a read + // - we missed a bunch of updates and now the layer is + // again downloaded -- assume we'll fail later on with + // version check or AlreadyReinitialized } } - None => { - // already deinitialized, perhaps get_or_maybe_download did this and is - // currently waiting to reinitialize it - return Err(EvictionCancelled::LostToDownload); + }; + + // re-check now that we have the guard or permit; all updates should have happened + // while holding the permit. + is_good_to_continue(&rx.borrow_and_update())?; + + // the term deinitialize is used here, because we clearing out the Weak will eventually + // lead to deallocating the reference counted value, and the value we + // `Guard::take_and_deinit` is likely to be the last because the Weak is never cloned. 
+ let (_weak, permit) = match waited { + Ok(guard) => { + match &*guard { + ResidentOrWantedEvicted::WantedEvicted(_weak, version) + if *version == only_version => + { + tracing::debug!(version, "deinitializing matching WantedEvicted"); + let (weak, permit) = guard.take_and_deinit(); + (Some(weak), permit) + } + ResidentOrWantedEvicted::WantedEvicted(_, version) => { + // if we were not doing the version check, we would need to try to + // upgrade the weak here to see if it really is dropped. version check + // is done instead assuming that it is cheaper. + tracing::debug!( + version, + only_version, + "version mismatch, not deinitializing" + ); + return Err(EvictionCancelled::VersionCheckFailed); + } + ResidentOrWantedEvicted::Resident(_) => { + return Err(EvictionCancelled::AlreadyReinitialized); + } + } + } + Err(permit) => { + tracing::debug!("continuing after cancelled get_or_maybe_download or eviction"); + (None, permit) } }; permit }; - // now accesses to inner.get_or_init wait on the semaphore or the `_permit` + let span = tracing::Span::current(); - self.access_stats.record_residence_event( - LayerResidenceStatus::Evicted, - LayerResidenceEventReason::ResidenceChange, - ); + let spawned_at = std::time::Instant::now(); - let res = match capture_mtime_and_remove(&self.path) { + // this is on purpose a detached spawn; we don't need to wait for it + // + // eviction completion reporting is the only thing hinging on this, and it can be just as + // well from a spawn_blocking thread. + // + // important to note that now that we've acquired the permit we have made sure the evicted + // file is either the exact `WantedEvicted` we wanted to evict, or uninitialized in case + // there are multiple evictions. The rest is not cancellable, and we've now commited to + // evicting. + // + // If spawn_blocking has a queue and maximum number of threads are in use, we could stall + // reads. We will need to add cancellation for that if necessary. + Self::spawn_blocking(move || { + let _span = span.entered(); + + let res = self.evict_blocking(&timeline, &permit); + + let waiters = self.inner.initializer_count(); + + if waiters > 0 { + LAYER_IMPL_METRICS.inc_evicted_with_waiters(); + } + + let completed_in = spawned_at.elapsed(); + LAYER_IMPL_METRICS.record_time_to_evict(completed_in); + + match res { + Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(), + Err(e) => LAYER_IMPL_METRICS.inc_eviction_cancelled(e), + } + + tracing::debug!(?res, elapsed_ms=%completed_in.as_millis(), %waiters, "eviction completed"); + }); + + Ok(()) + } + + /// This is blocking only to do just one spawn_blocking hop compared to multiple via tokio::fs. 
+ fn evict_blocking( + &self, + timeline: &Timeline, + _permit: &heavier_once_cell::InitPermit, + ) -> Result<(), EvictionCancelled> { + // now accesses to `self.inner.get_or_init*` wait on the semaphore or the `_permit` + + match capture_mtime_and_remove(&self.path) { Ok(local_layer_mtime) => { let duration = SystemTime::now().duration_since(local_layer_mtime); match duration { @@ -1184,33 +1406,60 @@ impl LayerInner { timeline .metrics .resident_physical_size_sub(self.desc.file_size); - - Ok(()) } Err(e) if e.kind() == std::io::ErrorKind::NotFound => { tracing::error!( layer_size = %self.desc.file_size, - "failed to evict layer from disk, it was already gone (metrics will be inaccurate)" + "failed to evict layer from disk, it was already gone" ); - Err(EvictionCancelled::FileNotFound) + return Err(EvictionCancelled::FileNotFound); } Err(e) => { + // FIXME: this should probably be an abort tracing::error!("failed to evict file from disk: {e:#}"); - Err(EvictionCancelled::RemoveFailed) + return Err(EvictionCancelled::RemoveFailed); } - }; + } - // we are still holding the permit, so no new spawn_download_and_wait can happen - drop(self.status.send(Status::Evicted)); + self.access_stats.record_residence_event( + LayerResidenceStatus::Evicted, + LayerResidenceEventReason::ResidenceChange, + ); + + self.status.as_ref().unwrap().send_replace(Status::Evicted); *self.last_evicted_at.lock().unwrap() = Some(std::time::Instant::now()); - res + Ok(()) } fn metadata(&self) -> LayerFileMetadata { LayerFileMetadata::new(self.desc.file_size, self.generation, self.shard) } + + /// Needed to use entered runtime in tests, but otherwise use BACKGROUND_RUNTIME. + /// + /// Synchronizing with spawned tasks is very complicated otherwise. + fn spawn(fut: F) + where + F: std::future::Future + Send + 'static, + { + #[cfg(test)] + tokio::task::spawn(fut); + #[cfg(not(test))] + crate::task_mgr::BACKGROUND_RUNTIME.spawn(fut); + } + + /// Needed to use entered runtime in tests, but otherwise use BACKGROUND_RUNTIME. 
+ fn spawn_blocking(f: F) + where + F: FnOnce() + Send + 'static, + { + #[cfg(test)] + tokio::task::spawn_blocking(f); + #[cfg(not(test))] + crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(f); + } } fn capture_mtime_and_remove(path: &Utf8Path) -> Result { @@ -1254,6 +1503,10 @@ pub(crate) enum DownloadError { DownloadCancelled, #[error("pre-condition: stat before download failed")] PreStatFailed(#[source] std::io::Error), + + #[cfg(test)] + #[error("failpoint: {0:?}")] + Failpoint(failpoints::FailpointKind), } #[derive(Debug, PartialEq)] @@ -1300,6 +1553,7 @@ impl Drop for DownloadedLayer { owner.on_downloaded_layer_drop(self.version); } else { // no need to do anything, we are shutting down + LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone); } } } @@ -1540,6 +1794,7 @@ pub(crate) struct LayerImplMetrics { rare_counters: enum_map::EnumMap, inits_cancelled: metrics::core::GenericCounter, redownload_after: metrics::Histogram, + time_to_evict: metrics::Histogram, } impl Default for LayerImplMetrics { @@ -1635,6 +1890,13 @@ impl Default for LayerImplMetrics { .unwrap() }; + let time_to_evict = metrics::register_histogram!( + "pageserver_layer_eviction_held_permit_seconds", + "Time eviction held the permit.", + vec![0.001, 0.010, 0.100, 0.500, 1.000, 5.000] + ) + .unwrap(); + Self { started_evictions, completed_evictions, @@ -1647,6 +1909,7 @@ impl Default for LayerImplMetrics { rare_counters, inits_cancelled, redownload_after, + time_to_evict, } } } @@ -1708,10 +1971,6 @@ impl LayerImplMetrics { self.rare_counters[RareEvent::PermanentLoadingFailure].inc(); } - fn inc_broadcast_lagged(&self) { - self.rare_counters[RareEvent::EvictAndWaitLagged].inc(); - } - fn inc_init_cancelled(&self) { self.inits_cancelled.inc() } @@ -1719,9 +1978,22 @@ impl LayerImplMetrics { fn record_redownloaded_after(&self, duration: std::time::Duration) { self.redownload_after.observe(duration.as_secs_f64()) } + + /// This would be bad if it ever happened, or mean extreme disk pressure. We should probably + /// instead cancel eviction if we would have read waiters. We cannot however separate reads + /// from other evictions, so this could have noise as well. + fn inc_evicted_with_waiters(&self) { + self.rare_counters[RareEvent::EvictedWithWaiters].inc(); + } + + /// Recorded at least initially as the permit is now acquired in async context before + /// spawn_blocking action. + fn record_time_to_evict(&self, duration: std::time::Duration) { + self.time_to_evict.observe(duration.as_secs_f64()) + } } -#[derive(enum_map::Enum)] +#[derive(Debug, Clone, Copy, enum_map::Enum)] enum EvictionCancelled { LayerGone, TimelineGone, @@ -1733,6 +2005,7 @@ enum EvictionCancelled { LostToDownload, /// After eviction, there was a new layer access which cancelled the eviction. 
UpgradedBackOnAccess, + UnexpectedEvictedState, } impl EvictionCancelled { @@ -1746,6 +2019,7 @@ impl EvictionCancelled { EvictionCancelled::AlreadyReinitialized => "already_reinitialized", EvictionCancelled::LostToDownload => "lost_to_download", EvictionCancelled::UpgradedBackOnAccess => "upgraded_back_on_access", + EvictionCancelled::UnexpectedEvictedState => "unexpected_evicted_state", } } } @@ -1773,7 +2047,7 @@ enum RareEvent { UpgradedWantedEvicted, InitWithoutDownload, PermanentLoadingFailure, - EvictAndWaitLagged, + EvictedWithWaiters, } impl RareEvent { @@ -1787,7 +2061,7 @@ impl RareEvent { UpgradedWantedEvicted => "raced_wanted_evicted", InitWithoutDownload => "init_needed_no_download", PermanentLoadingFailure => "permanent_loading_failure", - EvictAndWaitLagged => "broadcast_lagged", + EvictedWithWaiters => "evicted_with_waiters", } } } diff --git a/pageserver/src/tenant/storage_layer/layer/failpoints.rs b/pageserver/src/tenant/storage_layer/layer/failpoints.rs new file mode 100644 index 0000000000..6cedc41d98 --- /dev/null +++ b/pageserver/src/tenant/storage_layer/layer/failpoints.rs @@ -0,0 +1,119 @@ +//! failpoints for unit tests, implying `#[cfg(test)]`. +//! +//! These are not accessible over http. + +use super::*; + +impl Layer { + /// Enable a failpoint from a unit test. + pub(super) fn enable_failpoint(&self, failpoint: Failpoint) { + self.0.failpoints.lock().unwrap().push(failpoint); + } +} + +impl LayerInner { + /// Query if this failpoint is enabled, as in, arrive at a failpoint. + /// + /// Calls to this method need to be `#[cfg(test)]` guarded. + pub(super) async fn failpoint(&self, kind: FailpointKind) -> Result<(), FailpointHit> { + let fut = { + let mut fps = self.failpoints.lock().unwrap(); + // find the *last* failpoint for cases in which we need to use multiple for the same + // thing (two blocked evictions) + let fp = fps.iter_mut().rfind(|x| x.kind() == kind); + + let Some(fp) = fp else { + return Ok(()); + }; + + fp.hit() + }; + + fut.await + } +} + +#[derive(Debug, PartialEq, Eq)] +pub(crate) enum FailpointKind { + /// Failpoint acts as an accurate cancelled by drop here; see the only site of use. + AfterDeterminingLayerNeedsNoDownload, + /// Failpoint for stalling eviction starting + WaitBeforeStartingEvicting, + /// Failpoint hit in the spawned task + WaitBeforeDownloading, +} + +pub(crate) enum Failpoint { + AfterDeterminingLayerNeedsNoDownload, + WaitBeforeStartingEvicting( + Option, + utils::completion::Barrier, + ), + WaitBeforeDownloading( + Option, + utils::completion::Barrier, + ), +} + +impl Failpoint { + fn kind(&self) -> FailpointKind { + match self { + Failpoint::AfterDeterminingLayerNeedsNoDownload => { + FailpointKind::AfterDeterminingLayerNeedsNoDownload + } + Failpoint::WaitBeforeStartingEvicting(..) => FailpointKind::WaitBeforeStartingEvicting, + Failpoint::WaitBeforeDownloading(..) 
=> FailpointKind::WaitBeforeDownloading, + } + } + + fn hit(&mut self) -> impl std::future::Future> + 'static { + use futures::future::FutureExt; + + // use boxed futures to avoid Either hurdles + match self { + Failpoint::AfterDeterminingLayerNeedsNoDownload => { + let kind = self.kind(); + + async move { Err(FailpointHit(kind)) }.boxed() + } + Failpoint::WaitBeforeStartingEvicting(arrival, b) + | Failpoint::WaitBeforeDownloading(arrival, b) => { + // first one signals arrival + drop(arrival.take()); + + let b = b.clone(); + + async move { + tracing::trace!("waiting on a failpoint barrier"); + b.wait().await; + tracing::trace!("done waiting on a failpoint barrier"); + Ok(()) + } + .boxed() + } + } + } +} + +impl std::fmt::Display for FailpointKind { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Debug::fmt(self, f) + } +} + +#[derive(Debug)] +pub(crate) struct FailpointHit(FailpointKind); + +impl std::fmt::Display for FailpointHit { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Debug::fmt(self, f) + } +} + +impl std::error::Error for FailpointHit {} + +impl From for DownloadError { + fn from(value: FailpointHit) -> Self { + DownloadError::Failpoint(value.0) + } +} diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index e7b2eb025a..247ff123b5 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -1,14 +1,13 @@ -use futures::StreamExt; use pageserver_api::key::CONTROLFILE_KEY; use tokio::task::JoinSet; -use tracing::Instrument; use utils::{ completion::{self, Completion}, id::TimelineId, }; +use super::failpoints::{Failpoint, FailpointKind}; use super::*; -use crate::{context::DownloadBehavior, task_mgr::BACKGROUND_RUNTIME}; +use crate::context::DownloadBehavior; use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness}; /// Used in tests to advance a future to wanted await point, and not futher. @@ -21,7 +20,7 @@ const FOREVER: std::time::Duration = std::time::Duration::from_secs(ADVANCE.as_s /// Demonstrate the API and resident -> evicted -> resident -> deleted transitions. #[tokio::test] async fn smoke_test() { - let handle = BACKGROUND_RUNTIME.handle(); + let handle = tokio::runtime::Handle::current(); let h = TenantHarness::create("smoke_test").unwrap(); let span = h.span(); @@ -38,7 +37,7 @@ async fn smoke_test() { let layer = { let mut layers = { let layers = timeline.layers.read().await; - layers.resident_layers().collect::>().await + layers.likely_resident_layers().collect::>() }; assert_eq!(layers.len(), 1); @@ -88,7 +87,7 @@ async fn smoke_test() { // // ZERO for timeout does not work reliably, so first take up all spawn_blocking slots to // artificially slow it down. 
- let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await; + let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(&handle).await; match layer .evict_and_wait(std::time::Duration::ZERO) @@ -99,7 +98,7 @@ async fn smoke_test() { // expected, but note that the eviction is "still ongoing" helper.release().await; // exhaust spawn_blocking pool to ensure it is now complete - SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle) + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle) .await; } other => unreachable!("{other:?}"), @@ -108,7 +107,7 @@ async fn smoke_test() { // only way to query if a layer is resident is to acquire a ResidentLayer instance. // Layer::keep_resident never downloads, but it might initialize if the layer file is found // downloaded locally. - let none = layer.keep_resident().await.unwrap(); + let none = layer.keep_resident().await; assert!( none.is_none(), "Expected none, because eviction removed the local file, found: {none:?}" @@ -167,6 +166,7 @@ async fn smoke_test() { rtc.wait_completion().await.unwrap(); assert_eq!(rtc.get_remote_physical_size(), 0); + assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get()) } /// This test demonstrates a previous hang when a eviction and deletion were requested at the same @@ -174,7 +174,7 @@ async fn smoke_test() { #[tokio::test(start_paused = true)] async fn evict_and_wait_on_wanted_deleted() { // this is the runtime on which Layer spawns the blocking tasks on - let handle = BACKGROUND_RUNTIME.handle(); + let handle = tokio::runtime::Handle::current(); let h = TenantHarness::create("evict_and_wait_on_wanted_deleted").unwrap(); utils::logging::replace_panic_hook_with_tracing_panic_hook().forget(); @@ -188,7 +188,7 @@ async fn evict_and_wait_on_wanted_deleted() { let layer = { let mut layers = { let layers = timeline.layers.read().await; - layers.resident_layers().collect::>().await + layers.likely_resident_layers().collect::>() }; assert_eq!(layers.len(), 1); @@ -213,11 +213,11 @@ async fn evict_and_wait_on_wanted_deleted() { drop(resident); // make sure the eviction task gets to run - SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await; + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await; let resident = layer.keep_resident().await; assert!( - matches!(resident, Ok(None)), + resident.is_none(), "keep_resident should not have re-initialized: {resident:?}" ); @@ -235,24 +235,408 @@ async fn evict_and_wait_on_wanted_deleted() { layers.finish_gc_timeline(&[layer]); } - SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await; + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await; assert_eq!(1, LAYER_IMPL_METRICS.started_deletes.get()); assert_eq!(1, LAYER_IMPL_METRICS.completed_deletes.get()); assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get()); assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get()); + assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get()) } -/// This test shows that ensures we are able to read the layer while the layer eviction has been -/// started but not completed due to spawn_blocking pool being blocked. -/// -/// Here `Layer::keep_resident` is used to "simulate" reads, because it cannot download. 
-#[tokio::test(start_paused = true)] -async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() { - // this is the runtime on which Layer spawns the blocking tasks on - let handle = BACKGROUND_RUNTIME.handle(); - let h = TenantHarness::create("residency_check_while_evict_and_wait_on_clogged_spawn_blocking") +/// This test ensures we are able to read the layer while the layer eviction has been +/// started but not completed. +#[test] +fn read_wins_pending_eviction() { + let rt = tokio::runtime::Builder::new_current_thread() + .max_blocking_threads(1) + .enable_all() + .start_paused(true) + .build() .unwrap(); + + rt.block_on(async move { + // this is the runtime on which Layer spawns the blocking tasks on + let handle = tokio::runtime::Handle::current(); + let h = TenantHarness::create("read_wins_pending_eviction").unwrap(); + let (tenant, ctx) = h.load().await; + let span = h.span(); + let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); + + let timeline = tenant + .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .await + .unwrap(); + + let layer = { + let mut layers = { + let layers = timeline.layers.read().await; + layers.likely_resident_layers().collect::>() + }; + + assert_eq!(layers.len(), 1); + + layers.swap_remove(0) + }; + + // setup done + + let resident = layer.keep_resident().await.unwrap(); + + let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER)); + + // drive the future to await on the status channel + tokio::time::timeout(ADVANCE, &mut evict_and_wait) + .await + .expect_err("should had been a timeout since we are holding the layer resident"); + assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get()); + + let (completion, barrier) = utils::completion::channel(); + let (arrival, arrived_at_barrier) = utils::completion::channel(); + layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting( + Some(arrival), + barrier, + )); + + // now the eviction cannot proceed because the threads are consumed while completion exists + drop(resident); + arrived_at_barrier.wait().await; + assert!(!layer.is_likely_resident()); + + // because no actual eviction happened, we get to just reinitialize the DownloadedLayer + layer + .0 + .get_or_maybe_download(false, None) + .instrument(download_span) + .await + .expect("should had reinitialized without downloading"); + + assert!(layer.is_likely_resident()); + + // reinitialization notifies of new resident status, which should error out all evict_and_wait + let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait) + .await + .expect("no timeout, because get_or_maybe_download re-initialized") + .expect_err("eviction should not have succeeded because re-initialized"); + + // works as intended: evictions lose to "downloads" + assert!(matches!(e, EvictionError::Downloaded), "{e:?}"); + assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get()); + + // this is not wrong: the eviction is technically still "on the way" as it's still queued + // because of a failpoint + assert_eq!( + 0, + LAYER_IMPL_METRICS + .cancelled_evictions + .values() + .map(|ctr| ctr.get()) + .sum::() + ); + + drop(completion); + + tokio::time::sleep(ADVANCE).await; + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads0(&handle, 1) + .await; + + assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get()); + + // now we finally can observe the original eviction failing + // it would had been possible to observe it earlier, but here it is guaranteed to have + // 
happened. + assert_eq!( + 1, + LAYER_IMPL_METRICS + .cancelled_evictions + .values() + .map(|ctr| ctr.get()) + .sum::() + ); + + assert_eq!( + 1, + LAYER_IMPL_METRICS.cancelled_evictions[EvictionCancelled::AlreadyReinitialized].get() + ); + + assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get()) + }); +} + +/// Use failpoint to delay an eviction starting to get a VersionCheckFailed. +#[test] +fn multiple_pending_evictions_in_order() { + let name = "multiple_pending_evictions_in_order"; + let in_order = true; + multiple_pending_evictions_scenario(name, in_order); +} + +/// Use failpoint to reorder later eviction before first to get a UnexpectedEvictedState. +#[test] +fn multiple_pending_evictions_out_of_order() { + let name = "multiple_pending_evictions_out_of_order"; + let in_order = false; + multiple_pending_evictions_scenario(name, in_order); +} + +fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) { + let rt = tokio::runtime::Builder::new_current_thread() + .max_blocking_threads(1) + .enable_all() + .start_paused(true) + .build() + .unwrap(); + + rt.block_on(async move { + // this is the runtime on which Layer spawns the blocking tasks on + let handle = tokio::runtime::Handle::current(); + let h = TenantHarness::create(name).unwrap(); + let (tenant, ctx) = h.load().await; + let span = h.span(); + let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); + + let timeline = tenant + .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .await + .unwrap(); + + let layer = { + let mut layers = { + let layers = timeline.layers.read().await; + layers.likely_resident_layers().collect::>() + }; + + assert_eq!(layers.len(), 1); + + layers.swap_remove(0) + }; + + // setup done + + let resident = layer.keep_resident().await.unwrap(); + + let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER)); + + // drive the future to await on the status channel + tokio::time::timeout(ADVANCE, &mut evict_and_wait) + .await + .expect_err("should had been a timeout since we are holding the layer resident"); + assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get()); + + let (completion1, barrier) = utils::completion::channel(); + let mut completion1 = Some(completion1); + let (arrival, arrived_at_barrier) = utils::completion::channel(); + layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting( + Some(arrival), + barrier, + )); + + // now the eviction cannot proceed because we are simulating arbitrary long delay for the + // eviction task start. 
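+        // The failpoint plumbing above builds on `utils::completion::channel()`: it hands back a
+        // (Completion, Barrier) pair, and dropping the Completion side releases everyone awaiting
+        // `barrier.wait()`. A minimal sketch of that contract, as these tests rely on it (the
+        // variable names here are illustrative only):
+        //
+        //     let (completion, barrier) = utils::completion::channel();
+        //     let waiter = tokio::spawn(async move { barrier.wait().await });
+        //     drop(completion); // waiter is released
+        //     waiter.await.unwrap();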
+ drop(resident); + assert!(!layer.is_likely_resident()); + + arrived_at_barrier.wait().await; + + // because no actual eviction happened, we get to just reinitialize the DownloadedLayer + layer + .0 + .get_or_maybe_download(false, None) + .instrument(download_span) + .await + .expect("should had reinitialized without downloading"); + + assert!(layer.is_likely_resident()); + + // reinitialization notifies of new resident status, which should error out all evict_and_wait + let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait) + .await + .expect("no timeout, because get_or_maybe_download re-initialized") + .expect_err("eviction should not have succeeded because re-initialized"); + + // works as intended: evictions lose to "downloads" + assert!(matches!(e, EvictionError::Downloaded), "{e:?}"); + assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get()); + + // this is not wrong: the eviction is technically still "on the way" as it's still queued + // because of a failpoint + assert_eq!( + 0, + LAYER_IMPL_METRICS + .cancelled_evictions + .values() + .map(|ctr| ctr.get()) + .sum::() + ); + + assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get()); + + // configure another failpoint for the second eviction -- evictions are per initialization, + // so now that we've reinitialized the inner, we get to run two of them at the same time. + let (completion2, barrier) = utils::completion::channel(); + let (arrival, arrived_at_barrier) = utils::completion::channel(); + layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting( + Some(arrival), + barrier, + )); + + let mut second_eviction = std::pin::pin!(layer.evict_and_wait(FOREVER)); + + // advance to the wait on the queue + tokio::time::timeout(ADVANCE, &mut second_eviction) + .await + .expect_err("timeout because failpoint is blocking"); + + arrived_at_barrier.wait().await; + + assert_eq!(2, LAYER_IMPL_METRICS.started_evictions.get()); + + let mut release_earlier_eviction = |expected_reason| { + assert_eq!( + 0, + LAYER_IMPL_METRICS.cancelled_evictions[expected_reason].get(), + ); + + drop(completion1.take().unwrap()); + + let handle = &handle; + + async move { + tokio::time::sleep(ADVANCE).await; + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads0( + handle, 1, + ) + .await; + + assert_eq!( + 1, + LAYER_IMPL_METRICS.cancelled_evictions[expected_reason].get(), + ); + } + }; + + if in_order { + release_earlier_eviction(EvictionCancelled::VersionCheckFailed).await; + } + + // release the later eviction which is for the current version + drop(completion2); + tokio::time::sleep(ADVANCE).await; + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads0(&handle, 1) + .await; + + if !in_order { + release_earlier_eviction(EvictionCancelled::UnexpectedEvictedState).await; + } + + tokio::time::timeout(ADVANCE, &mut second_eviction) + .await + .expect("eviction goes through now that spawn_blocking is unclogged") + .expect("eviction should succeed, because version matches"); + + assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get()); + + // ensure the cancelled are unchanged + assert_eq!( + 1, + LAYER_IMPL_METRICS + .cancelled_evictions + .values() + .map(|ctr| ctr.get()) + .sum::() + ); + + assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get()) + }); +} + +/// The test ensures with a failpoint that a pending eviction is not cancelled by what is currently +/// a `Layer::keep_resident` call. 
+/// +/// This matters because cancelling the eviction would leave us in a state where the file is on +/// disk but the layer internal state says it has not been initialized. Futhermore, it allows us to +/// have non-repairing `Layer::is_likely_resident`. +#[tokio::test(start_paused = true)] +async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() { + let handle = tokio::runtime::Handle::current(); + let h = + TenantHarness::create("cancelled_get_or_maybe_download_does_not_cancel_eviction").unwrap(); + let (tenant, ctx) = h.load().await; + + let timeline = tenant + .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .await + .unwrap(); + + let layer = { + let mut layers = { + let layers = timeline.layers.read().await; + layers.likely_resident_layers().collect::>() + }; + + assert_eq!(layers.len(), 1); + + layers.swap_remove(0) + }; + + // this failpoint will simulate the `get_or_maybe_download` becoming cancelled (by returning an + // Err) at the right time as in "during" the `LayerInner::needs_download`. + layer.enable_failpoint(Failpoint::AfterDeterminingLayerNeedsNoDownload); + + let (completion, barrier) = utils::completion::channel(); + let (arrival, arrived_at_barrier) = utils::completion::channel(); + + layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting( + Some(arrival), + barrier, + )); + + tokio::time::timeout(ADVANCE, layer.evict_and_wait(FOREVER)) + .await + .expect_err("should had advanced to waiting on channel"); + + arrived_at_barrier.wait().await; + + // simulate a cancelled read which is cancelled before it gets to re-initialize + let e = layer + .0 + .get_or_maybe_download(false, None) + .await + .unwrap_err(); + assert!( + matches!( + e, + DownloadError::Failpoint(FailpointKind::AfterDeterminingLayerNeedsNoDownload) + ), + "{e:?}" + ); + + assert!( + layer.0.needs_download().await.unwrap().is_none(), + "file is still on disk" + ); + + // release the eviction task + drop(completion); + tokio::time::sleep(ADVANCE).await; + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await; + + // failpoint is still enabled, but it is not hit + let e = layer + .0 + .get_or_maybe_download(false, None) + .await + .unwrap_err(); + assert!(matches!(e, DownloadError::DownloadRequired), "{e:?}"); + + // failpoint is not counted as cancellation either + assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get()) +} + +#[tokio::test(start_paused = true)] +async fn evict_and_wait_does_not_wait_for_download() { + // let handle = tokio::runtime::Handle::current(); + let h = TenantHarness::create("evict_and_wait_does_not_wait_for_download").unwrap(); let (tenant, ctx) = h.load().await; let span = h.span(); let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); @@ -265,7 +649,7 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() { let layer = { let mut layers = { let layers = timeline.layers.read().await; - layers.resident_layers().collect::>().await + layers.likely_resident_layers().collect::>() }; assert_eq!(layers.len(), 1); @@ -273,91 +657,76 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() { layers.swap_remove(0) }; - // setup done - - let resident = layer.keep_resident().await.unwrap(); + // kind of forced setup: start an eviction but do not allow it progress until we are + // downloading + let (eviction_can_continue, barrier) = utils::completion::channel(); + let (arrival, eviction_arrived) = utils::completion::channel(); + 
layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting( + Some(arrival), + barrier, + )); let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER)); - // drive the future to await on the status channel + // use this once-awaited other_evict to synchronize with the eviction + let other_evict = layer.evict_and_wait(FOREVER); + tokio::time::timeout(ADVANCE, &mut evict_and_wait) .await - .expect_err("should had been a timeout since we are holding the layer resident"); - assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get()); + .expect_err("should had advanced"); + eviction_arrived.wait().await; + drop(eviction_can_continue); + other_evict.await.unwrap(); - // clog up BACKGROUND_RUNTIME spawn_blocking - let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await; + // now the layer is evicted, and the "evict_and_wait" is waiting on the receiver + assert!(!layer.is_likely_resident()); - // now the eviction cannot proceed because the threads are consumed while completion exists - drop(resident); + // following new evict_and_wait will fail until we've completed the download + let e = layer.evict_and_wait(FOREVER).await.unwrap_err(); + assert!(matches!(e, EvictionError::NotFound), "{e:?}"); - // because no actual eviction happened, we get to just reinitialize the DownloadedLayer - layer - .keep_resident() - .instrument(download_span) - .await - .expect("keep_resident should had reinitialized without downloading") - .expect("ResidentLayer"); + let (download_can_continue, barrier) = utils::completion::channel(); + let (arrival, _download_arrived) = utils::completion::channel(); + layer.enable_failpoint(Failpoint::WaitBeforeDownloading(Some(arrival), barrier)); - // because the keep_resident check alters wanted evicted without sending a message, we will - // never get completed - let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait) - .await - .expect("no timeout, because keep_resident re-initialized") - .expect_err("eviction should not have succeeded because re-initialized"); + let mut download = std::pin::pin!(layer + .0 + .get_or_maybe_download(true, None) + .instrument(download_span)); - // works as intended: evictions lose to "downloads" - assert!(matches!(e, EvictionError::Downloaded), "{e:?}"); - assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get()); - - // this is not wrong: the eviction is technically still "on the way" as it's still queued - // because spawn_blocking is clogged up - assert_eq!( - 0, - LAYER_IMPL_METRICS - .cancelled_evictions - .values() - .map(|ctr| ctr.get()) - .sum::() + assert!( + !layer.is_likely_resident(), + "during download layer is evicted" ); - let mut second_eviction = std::pin::pin!(layer.evict_and_wait(FOREVER)); - - // advance to the wait on the queue - tokio::time::timeout(ADVANCE, &mut second_eviction) + tokio::time::timeout(ADVANCE, &mut download) .await - .expect_err("timeout because spawn_blocking is clogged"); + .expect_err("should had timed out because of failpoint"); - // in this case we don't leak started evictions, but I think there is still a chance of that - // happening, because we could have upgrades race multiple evictions while only one of them - // happens? 
- assert_eq!(2, LAYER_IMPL_METRICS.started_evictions.get()); + // now we finally get to continue, and because the latest state is downloading, we deduce that + // original eviction succeeded + evict_and_wait.await.unwrap(); - helper.release().await; + // however a new evict_and_wait will fail + let e = layer.evict_and_wait(FOREVER).await.unwrap_err(); + assert!(matches!(e, EvictionError::NotFound), "{e:?}"); - // the second_eviction gets to run here - // - // synchronize to be *strictly* after the second_eviction spawn_blocking run - SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await; + assert!(!layer.is_likely_resident()); - tokio::time::timeout(ADVANCE, &mut second_eviction) - .await - .expect("eviction goes through now that spawn_blocking is unclogged") - .expect("eviction should succeed, because version matches"); + drop(download_can_continue); + download.await.expect("download should had succeeded"); + assert!(layer.is_likely_resident()); - assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get()); + // only now can we evict + layer.evict_and_wait(FOREVER).await.unwrap(); +} - // now we finally can observe the original spawn_blocking failing - // it would had been possible to observe it earlier, but here it is guaranteed to have - // happened. - assert_eq!( - 1, - LAYER_IMPL_METRICS - .cancelled_evictions - .values() - .map(|ctr| ctr.get()) - .sum::() - ); +#[test] +fn layer_size() { + assert_eq!(std::mem::size_of::(), 2040); + assert_eq!(std::mem::size_of::(), 104); + assert_eq!(std::mem::size_of::(), 2328); + // it also has the utf8 path } struct SpawnBlockingPoolHelper { @@ -374,31 +743,41 @@ impl SpawnBlockingPoolHelper { /// /// This should be no issue nowdays, because nextest runs each test in it's own process. async fn consume_all_spawn_blocking_threads(handle: &tokio::runtime::Handle) -> Self { - let (completion, barrier) = completion::channel(); - let (tx, mut rx) = tokio::sync::mpsc::channel(8); + let default_max_blocking_threads = 512; - let assumed_max_blocking_threads = 512; + Self::consume_all_spawn_blocking_threads0(handle, default_max_blocking_threads).await + } + + async fn consume_all_spawn_blocking_threads0( + handle: &tokio::runtime::Handle, + threads: usize, + ) -> Self { + assert_ne!(threads, 0); + + let (completion, barrier) = completion::channel(); + let (started, starts_completed) = completion::channel(); let mut blocking_tasks = JoinSet::new(); - for _ in 0..assumed_max_blocking_threads { + for _ in 0..threads { let barrier = barrier.clone(); - let tx = tx.clone(); + let started = started.clone(); blocking_tasks.spawn_blocking_on( move || { - tx.blocking_send(()).unwrap(); - drop(tx); + drop(started); tokio::runtime::Handle::current().block_on(barrier.wait()); }, handle, ); } + drop(started); + + starts_completed.wait().await; + drop(barrier); - for _ in 0..assumed_max_blocking_threads { - rx.recv().await.unwrap(); - } + tracing::trace!("consumed all threads"); SpawnBlockingPoolHelper { awaited_by_spawn_blocking_tasks: completion, @@ -418,13 +797,22 @@ impl SpawnBlockingPoolHelper { while let Some(res) = blocking_tasks.join_next().await { res.expect("none of the tasks should had panicked"); } + + tracing::trace!("released all threads"); } /// In the tests it is used as an easy way of making sure something scheduled on the target /// runtimes `spawn_blocking` has completed, because it must've been scheduled and completed /// before our tasks have a chance to schedule and complete. 
async fn consume_and_release_all_of_spawn_blocking_threads(handle: &tokio::runtime::Handle) { - Self::consume_all_spawn_blocking_threads(handle) + Self::consume_and_release_all_of_spawn_blocking_threads0(handle, 512).await + } + + async fn consume_and_release_all_of_spawn_blocking_threads0( + handle: &tokio::runtime::Handle, + threads: usize, + ) { + Self::consume_all_spawn_blocking_threads0(handle, threads) .await .release() .await @@ -438,7 +826,7 @@ fn spawn_blocking_pool_helper_actually_works() { // because the amount is not configurable for our helper, expect the same amount as // BACKGROUND_RUNTIME using the tokio defaults would have. let rt = tokio::runtime::Builder::new_current_thread() - .max_blocking_threads(512) + .max_blocking_threads(1) .enable_all() .build() .unwrap(); @@ -448,7 +836,8 @@ fn spawn_blocking_pool_helper_actually_works() { rt.block_on(async move { // this will not return until all threads are spun up and actually executing the code // waiting on `consumed` to be `SpawnBlockingPoolHelper::release`'d. - let consumed = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await; + let consumed = + SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads0(handle, 1).await; println!("consumed"); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 2ab7301cce..0b8222bca7 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -13,7 +13,6 @@ use bytes::Bytes; use camino::Utf8Path; use enumset::EnumSet; use fail::fail_point; -use futures::stream::StreamExt; use once_cell::sync::Lazy; use pageserver_api::{ key::AUX_FILES_KEY, @@ -2442,7 +2441,7 @@ impl Timeline { let guard = self.layers.read().await; - let resident = guard.resident_layers().map(|layer| { + let resident = guard.likely_resident_layers().map(|layer| { let last_activity_ts = layer.access_stats().latest_activity_or_now(); HeatMapLayer::new( @@ -2452,7 +2451,7 @@ impl Timeline { ) }); - let layers = resident.collect().await; + let layers = resident.collect(); Some(HeatMapTimeline::new(self.timeline_id, layers)) } @@ -4302,7 +4301,7 @@ impl Timeline { let mut max_layer_size: Option = None; let resident_layers = guard - .resident_layers() + .likely_resident_layers() .map(|layer| { let file_size = layer.layer_desc().file_size; max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size))); @@ -4315,8 +4314,7 @@ impl Timeline { relative_last_activity: finite_f32::FiniteF32::ZERO, } }) - .collect() - .await; + .collect(); DiskUsageEvictionInfo { max_layer_size, @@ -4713,7 +4711,6 @@ mod tests { .keep_resident() .await .expect("no download => no downloading errors") - .expect("should had been resident") .drop_eviction_guard(); let forever = std::time::Duration::from_secs(120); @@ -4724,7 +4721,7 @@ mod tests { let (first, second) = tokio::join!(first, second); let res = layer.keep_resident().await; - assert!(matches!(res, Ok(None)), "{res:?}"); + assert!(res.is_none(), "{res:?}"); match (first, second) { (Ok(()), Ok(())) => { diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index dd603135d2..dd769d4121 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -225,24 +225,18 @@ impl Timeline { { let guard = self.layers.read().await; let layers = guard.layer_map(); - for hist_layer in layers.iter_historic_layers() { - let hist_layer = guard.get_from_desc(&hist_layer); + for layer in 
layers.iter_historic_layers() { + let layer = guard.get_from_desc(&layer); // guard against eviction while we inspect it; it might be that eviction_task and // disk_usage_eviction_task both select the same layers to be evicted, and // seemingly free up double the space. both succeeding is of no consequence. - let guard = match hist_layer.keep_resident().await { - Ok(Some(l)) => l, - Ok(None) => continue, - Err(e) => { - // these should not happen, but we cannot make them statically impossible right - // now. - tracing::warn!(layer=%hist_layer, "failed to keep the layer resident: {e:#}"); - continue; - } - }; - let last_activity_ts = hist_layer.access_stats().latest_activity_or_now(); + if !layer.is_likely_resident() { + continue; + } + + let last_activity_ts = layer.access_stats().latest_activity_or_now(); let no_activity_for = match now.duration_since(last_activity_ts) { Ok(d) => d, @@ -265,9 +259,8 @@ impl Timeline { continue; } }; - let layer = guard.drop_eviction_guard(); + if no_activity_for > p.threshold { - // this could cause a lot of allocations in some cases js.spawn(async move { layer .evict_and_wait(std::time::Duration::from_secs(5)) diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index ebcdcfdb4d..d54dc1642c 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -1,5 +1,4 @@ use anyhow::{bail, ensure, Context, Result}; -use futures::StreamExt; use pageserver_api::shard::TenantShardId; use std::{collections::HashMap, sync::Arc}; use tracing::trace; @@ -241,29 +240,16 @@ impl LayerManager { layer.delete_on_drop(); } - pub(crate) fn resident_layers(&self) -> impl futures::stream::Stream + '_ { + pub(crate) fn likely_resident_layers(&self) -> impl Iterator + '_ { // for small layer maps, we most likely have all resident, but for larger more are likely // to be evicted assuming lots of layers correlated with longer lifespan. - let layers = self - .layer_map() - .iter_historic_layers() - .map(|desc| self.get_from_desc(&desc)); - - let layers = futures::stream::iter(layers); - - layers.filter_map(|layer| async move { - // TODO(#6028): this query does not really need to see the ResidentLayer - match layer.keep_resident().await { - Ok(Some(layer)) => Some(layer.drop_eviction_guard()), - Ok(None) => None, - Err(e) => { - // these should not happen, but we cannot make them statically impossible right - // now. - tracing::warn!(%layer, "failed to keep the layer resident: {e:#}"); - None - } - } + self.layer_map().iter_historic_layers().filter_map(|desc| { + self.layer_fmgr + .0 + .get(&desc.key()) + .filter(|l| l.is_likely_resident()) + .cloned() }) } From 94138c1a28e998b6e0d70f3b72dc170b2af34ca6 Mon Sep 17 00:00:00 2001 From: Jure Bajic Date: Thu, 21 Mar 2024 10:17:24 +0100 Subject: [PATCH 0446/1571] Enforce LSN ordering of batch entries (#7071) ## Summary of changes Enforce LSN ordering of batch entries. 
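A minimal sketch of the intended semantics (illustrative only; it restates the `VecMapOrdering` API added in `libs/utils/src/vec_map.rs` below, and the `u64` key type is just a stand-in):

```rust
use utils::vec_map::{VecMap, VecMapOrdering};

fn main() {
    // `Greater` keeps the previous behaviour: keys must be strictly increasing.
    let mut strict: VecMap<u64, ()> = VecMap::new(VecMapOrdering::Greater);
    strict.append(1, ()).unwrap();
    assert!(strict.append(1, ()).is_err()); // equal key rejected

    // `GreaterOrEqual` admits duplicate keys, which the LSN-ordered `put_batch`
    // below needs because several keys in one batch can share the same LSN.
    let mut relaxed: VecMap<u64, ()> = VecMap::new(VecMapOrdering::GreaterOrEqual);
    relaxed.append(1, ()).unwrap();
    relaxed.append(1, ()).unwrap(); // equal key accepted
}
```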
Closes https://github.com/neondatabase/neon/issues/6707 --- libs/utils/src/vec_map.rs | 218 +++++++++++++++++++++++----- pageserver/src/pgdatadir_mapping.rs | 14 +- pageserver/src/tenant/timeline.rs | 10 +- 3 files changed, 192 insertions(+), 50 deletions(-) diff --git a/libs/utils/src/vec_map.rs b/libs/utils/src/vec_map.rs index 9953b447c8..18b2af14f1 100644 --- a/libs/utils/src/vec_map.rs +++ b/libs/utils/src/vec_map.rs @@ -1,27 +1,60 @@ use std::{alloc::Layout, cmp::Ordering, ops::RangeBounds}; +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum VecMapOrdering { + Greater, + GreaterOrEqual, +} + /// Ordered map datastructure implemented in a Vec. /// Append only - can only add keys that are larger than the /// current max key. +/// Ordering can be adjusted using [`VecMapOrdering`] +/// during `VecMap` construction. #[derive(Clone, Debug)] -pub struct VecMap(Vec<(K, V)>); +pub struct VecMap { + data: Vec<(K, V)>, + ordering: VecMapOrdering, +} impl Default for VecMap { fn default() -> Self { - VecMap(Default::default()) + VecMap { + data: Default::default(), + ordering: VecMapOrdering::Greater, + } } } -#[derive(Debug)] -pub struct InvalidKey; +#[derive(thiserror::Error, Debug)] +pub enum VecMapError { + #[error("Key violates ordering constraint")] + InvalidKey, + #[error("Mismatched ordering constraints")] + ExtendOrderingError, +} impl VecMap { + pub fn new(ordering: VecMapOrdering) -> Self { + Self { + data: Vec::new(), + ordering, + } + } + + pub fn with_capacity(capacity: usize, ordering: VecMapOrdering) -> Self { + Self { + data: Vec::with_capacity(capacity), + ordering, + } + } + pub fn is_empty(&self) -> bool { - self.0.is_empty() + self.data.is_empty() } pub fn as_slice(&self) -> &[(K, V)] { - self.0.as_slice() + self.data.as_slice() } /// This function may panic if given a range where the lower bound is @@ -29,7 +62,7 @@ impl VecMap { pub fn slice_range>(&self, range: R) -> &[(K, V)] { use std::ops::Bound::*; - let binary_search = |k: &K| self.0.binary_search_by_key(&k, extract_key); + let binary_search = |k: &K| self.data.binary_search_by_key(&k, extract_key); let start_idx = match range.start_bound() { Unbounded => 0, @@ -41,7 +74,7 @@ impl VecMap { }; let end_idx = match range.end_bound() { - Unbounded => self.0.len(), + Unbounded => self.data.len(), Included(k) => match binary_search(k) { Ok(idx) => idx + 1, Err(idx) => idx, @@ -49,34 +82,30 @@ impl VecMap { Excluded(k) => binary_search(k).unwrap_or_else(std::convert::identity), }; - &self.0[start_idx..end_idx] + &self.data[start_idx..end_idx] } /// Add a key value pair to the map. - /// If `key` is less than or equal to the current maximum key - /// the pair will not be added and InvalidKey error will be returned. - pub fn append(&mut self, key: K, value: V) -> Result { - if let Some((last_key, _last_value)) = self.0.last() { - if &key <= last_key { - return Err(InvalidKey); - } - } + /// If `key` is not respective of the `self` ordering the + /// pair will not be added and `InvalidKey` error will be returned. + pub fn append(&mut self, key: K, value: V) -> Result { + self.validate_key_order(&key)?; let delta_size = self.instrument_vec_op(|vec| vec.push((key, value))); Ok(delta_size) } /// Update the maximum key value pair or add a new key value pair to the map. - /// If `key` is less than the current maximum key no updates or additions - /// will occur and InvalidKey error will be returned. 
+ /// If `key` is not respective of the `self` ordering no updates or additions + /// will occur and `InvalidKey` error will be returned. pub fn append_or_update_last( &mut self, key: K, mut value: V, - ) -> Result<(Option, usize), InvalidKey> { - if let Some((last_key, last_value)) = self.0.last_mut() { + ) -> Result<(Option, usize), VecMapError> { + if let Some((last_key, last_value)) = self.data.last_mut() { match key.cmp(last_key) { - Ordering::Less => return Err(InvalidKey), + Ordering::Less => return Err(VecMapError::InvalidKey), Ordering::Equal => { std::mem::swap(last_value, &mut value); const DELTA_SIZE: usize = 0; @@ -100,40 +129,67 @@ impl VecMap { V: Clone, { let split_idx = self - .0 + .data .binary_search_by_key(&cutoff, extract_key) .unwrap_or_else(std::convert::identity); ( - VecMap(self.0[..split_idx].to_vec()), - VecMap(self.0[split_idx..].to_vec()), + VecMap { + data: self.data[..split_idx].to_vec(), + ordering: self.ordering, + }, + VecMap { + data: self.data[split_idx..].to_vec(), + ordering: self.ordering, + }, ) } /// Move items from `other` to the end of `self`, leaving `other` empty. - /// If any keys in `other` is less than or equal to any key in `self`, - /// `InvalidKey` error will be returned and no mutation will occur. - pub fn extend(&mut self, other: &mut Self) -> Result { - let self_last_opt = self.0.last().map(extract_key); - let other_first_opt = other.0.last().map(extract_key); + /// If the `other` ordering is different from `self` ordering + /// `ExtendOrderingError` error will be returned. + /// If any keys in `other` is not respective of the ordering defined in + /// `self`, `InvalidKey` error will be returned and no mutation will occur. + pub fn extend(&mut self, other: &mut Self) -> Result { + if self.ordering != other.ordering { + return Err(VecMapError::ExtendOrderingError); + } - if let (Some(self_last), Some(other_first)) = (self_last_opt, other_first_opt) { - if self_last >= other_first { - return Err(InvalidKey); + let other_first_opt = other.data.last().map(extract_key); + if let Some(other_first) = other_first_opt { + self.validate_key_order(other_first)?; + } + + let delta_size = self.instrument_vec_op(|vec| vec.append(&mut other.data)); + Ok(delta_size) + } + + /// Validate the current last key in `self` and key being + /// inserted against the order defined in `self`. + fn validate_key_order(&self, key: &K) -> Result<(), VecMapError> { + if let Some(last_key) = self.data.last().map(extract_key) { + match (&self.ordering, &key.cmp(last_key)) { + (VecMapOrdering::Greater, Ordering::Less | Ordering::Equal) => { + return Err(VecMapError::InvalidKey); + } + (VecMapOrdering::Greater, Ordering::Greater) => {} + (VecMapOrdering::GreaterOrEqual, Ordering::Less) => { + return Err(VecMapError::InvalidKey); + } + (VecMapOrdering::GreaterOrEqual, Ordering::Equal | Ordering::Greater) => {} } } - let delta_size = self.instrument_vec_op(|vec| vec.append(&mut other.0)); - Ok(delta_size) + Ok(()) } /// Instrument an operation on the underlying [`Vec`]. /// Will panic if the operation decreases capacity. /// Returns the increase in memory usage caused by the op. 
fn instrument_vec_op(&mut self, op: impl FnOnce(&mut Vec<(K, V)>)) -> usize { - let old_cap = self.0.capacity(); - op(&mut self.0); - let new_cap = self.0.capacity(); + let old_cap = self.data.capacity(); + op(&mut self.data); + let new_cap = self.data.capacity(); match old_cap.cmp(&new_cap) { Ordering::Less => { @@ -145,6 +201,36 @@ impl VecMap { Ordering::Greater => panic!("VecMap capacity shouldn't ever decrease"), } } + + /// Similar to `from_iter` defined in `FromIter` trait except + /// that it accepts an [`VecMapOrdering`] + pub fn from_iter>(iter: I, ordering: VecMapOrdering) -> Self { + let iter = iter.into_iter(); + let initial_capacity = { + match iter.size_hint() { + (lower_bound, None) => lower_bound, + (_, Some(upper_bound)) => upper_bound, + } + }; + + let mut vec_map = VecMap::with_capacity(initial_capacity, ordering); + for (key, value) in iter { + vec_map + .append(key, value) + .expect("The passed collection needs to be sorted!"); + } + + vec_map + } +} + +impl IntoIterator for VecMap { + type Item = (K, V); + type IntoIter = std::vec::IntoIter<(K, V)>; + + fn into_iter(self) -> Self::IntoIter { + self.data.into_iter() + } } fn extract_key(entry: &(K, V)) -> &K { @@ -155,7 +241,7 @@ fn extract_key(entry: &(K, V)) -> &K { mod tests { use std::{collections::BTreeMap, ops::Bound}; - use super::VecMap; + use super::{VecMap, VecMapOrdering}; #[test] fn unbounded_range() { @@ -310,5 +396,59 @@ mod tests { left.extend(&mut one_map).unwrap_err(); assert_eq!(left.as_slice(), &[(0, ()), (1, ())]); assert_eq!(one_map.as_slice(), &[(1, ())]); + + let mut map_greater_or_equal = VecMap::new(VecMapOrdering::GreaterOrEqual); + map_greater_or_equal.append(2, ()).unwrap(); + map_greater_or_equal.append(2, ()).unwrap(); + + left.extend(&mut map_greater_or_equal).unwrap_err(); + assert_eq!(left.as_slice(), &[(0, ()), (1, ())]); + assert_eq!(map_greater_or_equal.as_slice(), &[(2, ()), (2, ())]); + } + + #[test] + fn extend_with_ordering() { + let mut left = VecMap::new(VecMapOrdering::GreaterOrEqual); + left.append(0, ()).unwrap(); + assert_eq!(left.as_slice(), &[(0, ())]); + + let mut greater_right = VecMap::new(VecMapOrdering::Greater); + greater_right.append(0, ()).unwrap(); + left.extend(&mut greater_right).unwrap_err(); + assert_eq!(left.as_slice(), &[(0, ())]); + + let mut greater_or_equal_right = VecMap::new(VecMapOrdering::GreaterOrEqual); + greater_or_equal_right.append(2, ()).unwrap(); + greater_or_equal_right.append(2, ()).unwrap(); + left.extend(&mut greater_or_equal_right).unwrap(); + assert_eq!(left.as_slice(), &[(0, ()), (2, ()), (2, ())]); + } + + #[test] + fn vec_map_from_sorted() { + let vec = vec![(1, ()), (2, ()), (3, ()), (6, ())]; + let vec_map = VecMap::from_iter(vec, VecMapOrdering::Greater); + assert_eq!(vec_map.as_slice(), &[(1, ()), (2, ()), (3, ()), (6, ())]); + + let vec = vec![(1, ()), (2, ()), (3, ()), (3, ()), (6, ()), (6, ())]; + let vec_map = VecMap::from_iter(vec, VecMapOrdering::GreaterOrEqual); + assert_eq!( + vec_map.as_slice(), + &[(1, ()), (2, ()), (3, ()), (3, ()), (6, ()), (6, ())] + ); + } + + #[test] + #[should_panic] + fn vec_map_from_unsorted_greater() { + let vec = vec![(1, ()), (2, ()), (2, ()), (3, ()), (6, ())]; + let _ = VecMap::from_iter(vec, VecMapOrdering::Greater); + } + + #[test] + #[should_panic] + fn vec_map_from_unsorted_greater_or_equal() { + let vec = vec![(1, ()), (2, ()), (3, ()), (6, ()), (5, ())]; + let _ = VecMap::from_iter(vec, VecMapOrdering::GreaterOrEqual); } } diff --git a/pageserver/src/pgdatadir_mapping.rs 
b/pageserver/src/pgdatadir_mapping.rs index 727650a5a5..6f7d74bdee 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -34,6 +34,7 @@ use strum::IntoEnumIterator; use tokio_util::sync::CancellationToken; use tracing::{debug, trace, warn}; use utils::bin_ser::DeserializeError; +use utils::vec_map::{VecMap, VecMapOrdering}; use utils::{bin_ser::BeSer, lsn::Lsn}; const MAX_AUX_FILE_DELTAS: usize = 1024; @@ -1546,12 +1547,13 @@ impl<'a> DatadirModification<'a> { if !self.pending_updates.is_empty() { // The put_batch call below expects expects the inputs to be sorted by Lsn, // so we do that first. - let lsn_ordered_batch: Vec<(Key, Lsn, Value)> = self - .pending_updates - .drain() - .map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (key, lsn, val))) - .kmerge_by(|lhs, rhs| lhs.1 .0 < rhs.1 .0) - .collect(); + let lsn_ordered_batch: VecMap = VecMap::from_iter( + self.pending_updates + .drain() + .map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (lsn, (key, val)))) + .kmerge_by(|lhs, rhs| lhs.0 < rhs.0), + VecMapOrdering::GreaterOrEqual, + ); writer.put_batch(lsn_ordered_batch, ctx).await?; } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 0b8222bca7..7523130f23 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -36,6 +36,7 @@ use tracing::*; use utils::{ bin_ser::BeSer, sync::gate::{Gate, GateGuard}, + vec_map::VecMap, }; use std::ops::{Deref, Range}; @@ -4616,16 +4617,15 @@ impl<'a> TimelineWriter<'a> { } } - /// Put a batch keys at the specified Lsns. + /// Put a batch of keys at the specified Lsns. /// - /// The batch should be sorted by Lsn such that it's safe - /// to roll the open layer mid batch. + /// The batch is sorted by Lsn (enforced by usage of [`utils::vec_map::VecMap`]. pub(crate) async fn put_batch( &mut self, - batch: Vec<(Key, Lsn, Value)>, + batch: VecMap, ctx: &RequestContext, ) -> anyhow::Result<()> { - for (key, lsn, val) in batch { + for (lsn, (key, val)) in batch { self.put(key, lsn, &val, ctx).await? 
} From 5ec6862bcf2437480964943a4bd1c5a059561693 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 21 Mar 2024 10:58:41 +0000 Subject: [PATCH 0447/1571] proxy: async aware password validation (#7176) ## Problem spawn_blocking in #7171 was a hack ## Summary of changes https://github.com/neondatabase/rust-postgres/pull/29 --- Cargo.lock | 11 ++++++----- proxy/src/proxy/tests.rs | 11 ++++++----- proxy/src/proxy/tests/mitm.rs | 4 ++-- proxy/src/scram.rs | 2 +- proxy/src/scram/exchange.rs | 9 +-------- proxy/src/scram/secret.rs | 6 ++---- 6 files changed, 18 insertions(+), 25 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cdbabf2f76..96edba7ae5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3901,7 +3901,7 @@ dependencies = [ [[package]] name = "postgres" version = "0.19.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "bytes", "fallible-iterator", @@ -3914,7 +3914,7 @@ dependencies = [ [[package]] name = "postgres-native-tls" version = "0.5.0" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "native-tls", "tokio", @@ -3925,7 +3925,7 @@ dependencies = [ [[package]] name = "postgres-protocol" version = "0.6.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "base64 0.20.0", "byteorder", @@ -3938,12 +3938,13 @@ dependencies = [ "rand 0.8.5", "sha2", "stringprep", + "tokio", ] [[package]] name = "postgres-types" version = "0.2.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "bytes", "fallible-iterator", @@ -5945,7 +5946,7 @@ dependencies = [ [[package]] name = "tokio-postgres" version = "0.7.7" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "async-trait", "byteorder", diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 5d0340e852..9c3be73612 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -135,9 +135,10 @@ impl TestAuth for NoAuth {} struct Scram(scram::ServerSecret); impl Scram { - fn new(password: &str) -> anyhow::Result { - let secret = - scram::ServerSecret::build(password).context("failed to generate scram secret")?; + async fn new(password: &str) -> anyhow::Result { + let secret = scram::ServerSecret::build(password) + .await + .context("failed to generate scram secret")?; Ok(Scram(secret)) } @@ -284,7 +285,7 @@ async fn scram_auth_good(#[case] password: &str) -> anyhow::Result<()> { let proxy = tokio::spawn(dummy_proxy( client, Some(server_config), - Scram::new(password)?, + Scram::new(password).await?, )); let (_client, _conn) = tokio_postgres::Config::new() @@ -308,7 +309,7 @@ async fn 
scram_auth_disable_channel_binding() -> anyhow::Result<()> { let proxy = tokio::spawn(dummy_proxy( client, Some(server_config), - Scram::new("password")?, + Scram::new("password").await?, )); let (_client, _conn) = tokio_postgres::Config::new() diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index e0c2d836f4..3b760e5dab 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -148,7 +148,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> { let proxy = tokio::spawn(dummy_proxy( client, Some(server_config), - Scram::new("password")?, + Scram::new("password").await?, )); let _client_err = tokio_postgres::Config::new() @@ -231,7 +231,7 @@ async fn connect_failure( let proxy = tokio::spawn(dummy_proxy( client, Some(server_config), - Scram::new("password")?, + Scram::new("password").await?, )); let _client_err = tokio_postgres::Config::new() diff --git a/proxy/src/scram.rs b/proxy/src/scram.rs index df4b3ec8d7..76541ae2f3 100644 --- a/proxy/src/scram.rs +++ b/proxy/src/scram.rs @@ -114,7 +114,7 @@ mod tests { } async fn run_round_trip_test(server_password: &str, client_password: &str) { - let scram_secret = ServerSecret::build(server_password).unwrap(); + let scram_secret = ServerSecret::build(server_password).await.unwrap(); let sasl_client = ScramSha256::new(client_password.as_bytes(), ChannelBinding::unsupported()); diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index 16575d5d98..51c0ba4e09 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -86,14 +86,7 @@ pub async fn exchange( .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?; let sent = match init.transition(secret, &tls_server_end_point, client_first)? { Continue(sent, server_first) => { - // `client.update` might perform `pbkdf2(pw)`, best to spawn it in a blocking thread. - // TODO(conrad): take this code from tokio-postgres and make an async-aware pbkdf2 impl - client = tokio::task::spawn_blocking(move || { - client.update(server_first.as_bytes())?; - Ok::(client) - }) - .await - .expect("should not panic while performing password hash")?; + client.update(server_first.as_bytes()).await?; sent } Success(x, _) => match x {}, diff --git a/proxy/src/scram/secret.rs b/proxy/src/scram/secret.rs index fb3c45816e..b46d8c3ab5 100644 --- a/proxy/src/scram/secret.rs +++ b/proxy/src/scram/secret.rs @@ -59,10 +59,8 @@ impl ServerSecret { /// Build a new server secret from the prerequisites. /// XXX: We only use this function in tests. #[cfg(test)] - pub fn build(password: &str) -> Option { - Self::parse(&postgres_protocol::password::scram_sha_256( - password.as_bytes(), - )) + pub async fn build(password: &str) -> Option { + Self::parse(&postgres_protocol::password::scram_sha_256(password.as_bytes()).await) } } From c75b58443069d74293d55a0ccb8f71a1b77f2770 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 21 Mar 2024 12:00:20 +0000 Subject: [PATCH 0448/1571] storage_controller: add metrics (#7178) ## Problem Storage controller had basically no metrics. ## Summary of changes 1. Migrate the existing metrics to use Conrad's [`measured`](https://docs.rs/measured/0.0.14/measured/) crate. 2. Add metrics for incoming http requests 3. Add metrics for outgoing http requests to the pageserver 4. Add metrics for outgoing pass through requests to the pageserver 5. 
Add metrics for database queries Note that the metrics response for the attachment service does not use chunked encoding like the rest of the metrics endpoints. Conrad has kindly extended the crate such that it can now be done. Let's leave it for a follow-up since the payload shouldn't be that big at this point. Fixes https://github.com/neondatabase/neon/issues/6875 --- Cargo.lock | 33 ++ Cargo.toml | 1 + control_plane/attachment_service/Cargo.toml | 4 + control_plane/attachment_service/src/http.rs | 264 +++++++++++++-- control_plane/attachment_service/src/lib.rs | 1 + .../attachment_service/src/metrics.rs | 304 ++++++++++++++++-- control_plane/attachment_service/src/node.rs | 14 +- .../src/pageserver_client.rs | 203 ++++++++++++ .../attachment_service/src/persistence.rs | 247 +++++++++----- .../attachment_service/src/reconciler.rs | 8 +- .../attachment_service/src/service.rs | 44 ++- .../attachment_service/src/tenant_state.rs | 44 +-- libs/utils/src/http/endpoint.rs | 3 +- pageserver/src/http/routes.rs | 2 + proxy/src/http/health_server.rs | 11 +- safekeeper/src/http/routes.rs | 3 +- test_runner/regress/test_sharding.py | 16 +- 17 files changed, 1004 insertions(+), 198 deletions(-) create mode 100644 control_plane/attachment_service/src/pageserver_client.rs diff --git a/Cargo.lock b/Cargo.lock index 96edba7ae5..dcef66c15d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -277,6 +277,7 @@ dependencies = [ "anyhow", "aws-config", "aws-sdk-secretsmanager", + "bytes", "camino", "clap", "control_plane", @@ -288,6 +289,8 @@ dependencies = [ "hex", "humantime", "hyper", + "lasso", + "measured", "metrics", "once_cell", "pageserver_api", @@ -295,6 +298,7 @@ dependencies = [ "postgres_connection", "r2d2", "reqwest", + "routerify", "serde", "serde_json", "thiserror", @@ -2880,6 +2884,35 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" +[[package]] +name = "measured" +version = "0.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f246648d027839a34b420e27c7de1165ace96e19ef894985d0a6ff89a7840a9f" +dependencies = [ + "bytes", + "hashbrown 0.14.0", + "itoa", + "lasso", + "measured-derive", + "memchr", + "parking_lot 0.12.1", + "rustc-hash", + "ryu", +] + +[[package]] +name = "measured-derive" +version = "0.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edaa5cc22d99d5d6d7d99c3b5b5f7e7f8034c22f1b5d62a1adecd2ed005d9b80" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.52", +] + [[package]] name = "memchr" version = "2.6.4" diff --git a/Cargo.toml b/Cargo.toml index 76f4ff041c..0f3dbd4987 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -101,6 +101,7 @@ lasso = "0.7" leaky-bucket = "1.0.1" libc = "0.2" md5 = "0.7.0" +measured = { version = "0.0.13", features=["default", "lasso"] } memoffset = "0.8" native-tls = "0.2" nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] } diff --git a/control_plane/attachment_service/Cargo.toml b/control_plane/attachment_service/Cargo.toml index f78f56c480..34882659e3 100644 --- a/control_plane/attachment_service/Cargo.toml +++ b/control_plane/attachment_service/Cargo.toml @@ -17,6 +17,7 @@ testing = [] anyhow.workspace = true aws-config.workspace = true aws-sdk-secretsmanager.workspace = true +bytes.workspace = true camino.workspace = true clap.workspace = true fail.workspace = true @@ -25,17 +26,20 @@ git-version.workspace = true hex.workspace = true 
hyper.workspace = true humantime.workspace = true +lasso.workspace = true once_cell.workspace = true pageserver_api.workspace = true pageserver_client.workspace = true postgres_connection.workspace = true reqwest.workspace = true +routerify.workspace = true serde.workspace = true serde_json.workspace = true thiserror.workspace = true tokio.workspace = true tokio-util.workspace = true tracing.workspace = true +measured.workspace = true diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] } diesel_migrations = { version = "2.1.0" } diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs index 076b3a2f70..036019cd38 100644 --- a/control_plane/attachment_service/src/http.rs +++ b/control_plane/attachment_service/src/http.rs @@ -1,5 +1,11 @@ +use crate::metrics::{ + HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, PageserverRequestLabelGroup, + METRICS_REGISTRY, +}; use crate::reconciler::ReconcileError; use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT}; +use futures::Future; +use hyper::header::CONTENT_TYPE; use hyper::{Body, Request, Response}; use hyper::{StatusCode, Uri}; use pageserver_api::models::{ @@ -34,6 +40,8 @@ use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest}; use control_plane::storage_controller::{AttachHookRequest, InspectRequest}; +use routerify::Middleware; + /// State available to HTTP request handlers #[derive(Clone)] pub struct HttpState { @@ -313,7 +321,7 @@ async fn handle_tenant_timeline_passthrough( tracing::info!("Proxying request for tenant {} ({})", tenant_id, path); // Find the node that holds shard zero - let (base_url, tenant_shard_id) = service.tenant_shard0_baseurl(tenant_id)?; + let (node, tenant_shard_id) = service.tenant_shard0_node(tenant_id)?; // Callers will always pass an unsharded tenant ID. Before proxying, we must // rewrite this to a shard-aware shard zero ID. @@ -322,12 +330,39 @@ async fn handle_tenant_timeline_passthrough( let tenant_shard_str = format!("{}", tenant_shard_id); let path = path.replace(&tenant_str, &tenant_shard_str); - let client = mgmt_api::Client::new(base_url, service.get_config().jwt_token.as_deref()); + let latency = &METRICS_REGISTRY + .metrics_group + .storage_controller_passthrough_request_latency; + + // This is a bit awkward. We remove the param from the request + // and join the words by '_' to get a label for the request. + let just_path = path.replace(&tenant_shard_str, ""); + let path_label = just_path + .split('/') + .filter(|token| !token.is_empty()) + .collect::>() + .join("_"); + let labels = PageserverRequestLabelGroup { + pageserver_id: &node.get_id().to_string(), + path: &path_label, + method: crate::metrics::Method::Get, + }; + + let _timer = latency.start_timer(labels.clone()); + + let client = mgmt_api::Client::new(node.base_url(), service.get_config().jwt_token.as_deref()); let resp = client.get_raw(path).await.map_err(|_e| // FIXME: give APiError a proper Unavailable variant. We return 503 here because // if we can't successfully send a request to the pageserver, we aren't available. 
ApiError::ShuttingDown)?; + if !resp.status().is_success() { + let error_counter = &METRICS_REGISTRY + .metrics_group + .storage_controller_passthrough_request_error; + error_counter.inc(labels); + } + // We have a reqest::Response, would like a http::Response let mut builder = hyper::Response::builder() .status(resp.status()) @@ -498,7 +533,11 @@ impl From for ApiError { /// Common wrapper for request handlers that call into Service and will operate on tenants: they must only /// be allowed to run if Service has finished its initial reconciliation. -async fn tenant_service_handler(request: Request, handler: H) -> R::Output +async fn tenant_service_handler( + request: Request, + handler: H, + request_name: RequestName, +) -> R::Output where R: std::future::Future, ApiError>> + Send + 'static, H: FnOnce(Arc, Request) -> R + Send + Sync + 'static, @@ -518,9 +557,10 @@ where )); } - request_span( + named_request_span( request, |request| async move { handler(service, request).await }, + request_name, ) .await } @@ -531,11 +571,98 @@ fn check_permissions(request: &Request, required_scope: Scope) -> Result<( }) } +#[derive(Clone, Debug)] +struct RequestMeta { + method: hyper::http::Method, + at: Instant, +} + +fn prologue_metrics_middleware( +) -> Middleware { + Middleware::pre(move |req| async move { + let meta = RequestMeta { + method: req.method().clone(), + at: Instant::now(), + }; + + req.set_context(meta); + + Ok(req) + }) +} + +fn epilogue_metrics_middleware( +) -> Middleware { + Middleware::post_with_info(move |resp, req_info| async move { + let request_name = match req_info.context::() { + Some(name) => name, + None => { + return Ok(resp); + } + }; + + if let Some(meta) = req_info.context::() { + let status = &crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_http_request_status; + let latency = &crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_http_request_latency; + + status.inc(HttpRequestStatusLabelGroup { + path: request_name.0, + method: meta.method.clone().into(), + status: crate::metrics::StatusCode(resp.status()), + }); + + latency.observe( + HttpRequestLatencyLabelGroup { + path: request_name.0, + method: meta.method.into(), + }, + meta.at.elapsed().as_secs_f64(), + ); + } + Ok(resp) + }) +} + +pub async fn measured_metrics_handler(_req: Request) -> Result, ApiError> { + pub const TEXT_FORMAT: &str = "text/plain; version=0.0.4"; + + let payload = crate::metrics::METRICS_REGISTRY.encode(); + let response = Response::builder() + .status(200) + .header(CONTENT_TYPE, TEXT_FORMAT) + .body(payload.into()) + .unwrap(); + + Ok(response) +} + +#[derive(Clone)] +struct RequestName(&'static str); + +async fn named_request_span( + request: Request, + handler: H, + name: RequestName, +) -> R::Output +where + R: Future, ApiError>> + Send + 'static, + H: FnOnce(Request) -> R + Send + Sync + 'static, +{ + request.set_context(name); + request_span(request, handler).await +} + pub fn make_router( service: Arc, auth: Option>, ) -> RouterBuilder { - let mut router = endpoint::make_router(); + let mut router = endpoint::make_router() + .middleware(prologue_metrics_middleware()) + .middleware(epilogue_metrics_middleware()); if auth.is_some() { router = router.middleware(auth_middleware(|request| { let state = get_state(request); @@ -544,99 +671,166 @@ pub fn make_router( } else { state.auth.as_deref() } - })) + })); } router .data(Arc::new(HttpState::new(service, auth))) + .get("/metrics", |r| { + named_request_span(r, measured_metrics_handler, 
RequestName("metrics")) + }) // Non-prefixed generic endpoints (status, metrics) - .get("/status", |r| request_span(r, handle_status)) - .get("/ready", |r| request_span(r, handle_ready)) + .get("/status", |r| { + named_request_span(r, handle_status, RequestName("status")) + }) + .get("/ready", |r| { + named_request_span(r, handle_ready, RequestName("ready")) + }) // Upcalls for the pageserver: point the pageserver's `control_plane_api` config to this prefix .post("/upcall/v1/re-attach", |r| { - request_span(r, handle_re_attach) + named_request_span(r, handle_re_attach, RequestName("upcall_v1_reattach")) + }) + .post("/upcall/v1/validate", |r| { + named_request_span(r, handle_validate, RequestName("upcall_v1_validate")) }) - .post("/upcall/v1/validate", |r| request_span(r, handle_validate)) // Test/dev/debug endpoints .post("/debug/v1/attach-hook", |r| { - request_span(r, handle_attach_hook) + named_request_span(r, handle_attach_hook, RequestName("debug_v1_attach_hook")) + }) + .post("/debug/v1/inspect", |r| { + named_request_span(r, handle_inspect, RequestName("debug_v1_inspect")) }) - .post("/debug/v1/inspect", |r| request_span(r, handle_inspect)) .post("/debug/v1/tenant/:tenant_id/drop", |r| { - request_span(r, handle_tenant_drop) + named_request_span(r, handle_tenant_drop, RequestName("debug_v1_tenant_drop")) }) .post("/debug/v1/node/:node_id/drop", |r| { - request_span(r, handle_node_drop) + named_request_span(r, handle_node_drop, RequestName("debug_v1_node_drop")) + }) + .get("/debug/v1/tenant", |r| { + named_request_span(r, handle_tenants_dump, RequestName("debug_v1_tenant")) }) - .get("/debug/v1/tenant", |r| request_span(r, handle_tenants_dump)) .get("/debug/v1/tenant/:tenant_id/locate", |r| { - tenant_service_handler(r, handle_tenant_locate) + tenant_service_handler( + r, + handle_tenant_locate, + RequestName("debug_v1_tenant_locate"), + ) }) .get("/debug/v1/scheduler", |r| { - request_span(r, handle_scheduler_dump) + named_request_span(r, handle_scheduler_dump, RequestName("debug_v1_scheduler")) }) .post("/debug/v1/consistency_check", |r| { - request_span(r, handle_consistency_check) + named_request_span( + r, + handle_consistency_check, + RequestName("debug_v1_consistency_check"), + ) }) .put("/debug/v1/failpoints", |r| { request_span(r, |r| failpoints_handler(r, CancellationToken::new())) }) // Node operations .post("/control/v1/node", |r| { - request_span(r, handle_node_register) + named_request_span(r, handle_node_register, RequestName("control_v1_node")) + }) + .get("/control/v1/node", |r| { + named_request_span(r, handle_node_list, RequestName("control_v1_node")) }) - .get("/control/v1/node", |r| request_span(r, handle_node_list)) .put("/control/v1/node/:node_id/config", |r| { - request_span(r, handle_node_configure) + named_request_span( + r, + handle_node_configure, + RequestName("control_v1_node_config"), + ) }) // Tenant Shard operations .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| { - tenant_service_handler(r, handle_tenant_shard_migrate) + tenant_service_handler( + r, + handle_tenant_shard_migrate, + RequestName("control_v1_tenant_migrate"), + ) }) .put("/control/v1/tenant/:tenant_id/shard_split", |r| { - tenant_service_handler(r, handle_tenant_shard_split) + tenant_service_handler( + r, + handle_tenant_shard_split, + RequestName("control_v1_tenant_shard_split"), + ) }) .get("/control/v1/tenant/:tenant_id", |r| { - tenant_service_handler(r, handle_tenant_describe) + tenant_service_handler( + r, + handle_tenant_describe, + 
RequestName("control_v1_tenant_describe"), + ) }) // Tenant operations // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity. .post("/v1/tenant", |r| { - tenant_service_handler(r, handle_tenant_create) + tenant_service_handler(r, handle_tenant_create, RequestName("v1_tenant")) }) .delete("/v1/tenant/:tenant_id", |r| { - tenant_service_handler(r, handle_tenant_delete) + tenant_service_handler(r, handle_tenant_delete, RequestName("v1_tenant")) }) .put("/v1/tenant/config", |r| { - tenant_service_handler(r, handle_tenant_config_set) + tenant_service_handler(r, handle_tenant_config_set, RequestName("v1_tenant_config")) }) .get("/v1/tenant/:tenant_id/config", |r| { - tenant_service_handler(r, handle_tenant_config_get) + tenant_service_handler(r, handle_tenant_config_get, RequestName("v1_tenant_config")) }) .put("/v1/tenant/:tenant_shard_id/location_config", |r| { - tenant_service_handler(r, handle_tenant_location_config) + tenant_service_handler( + r, + handle_tenant_location_config, + RequestName("v1_tenant_location_config"), + ) }) .put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| { - tenant_service_handler(r, handle_tenant_time_travel_remote_storage) + tenant_service_handler( + r, + handle_tenant_time_travel_remote_storage, + RequestName("v1_tenant_time_travel_remote_storage"), + ) }) .post("/v1/tenant/:tenant_id/secondary/download", |r| { - tenant_service_handler(r, handle_tenant_secondary_download) + tenant_service_handler( + r, + handle_tenant_secondary_download, + RequestName("v1_tenant_secondary_download"), + ) }) // Timeline operations .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| { - tenant_service_handler(r, handle_tenant_timeline_delete) + tenant_service_handler( + r, + handle_tenant_timeline_delete, + RequestName("v1_tenant_timeline"), + ) }) .post("/v1/tenant/:tenant_id/timeline", |r| { - tenant_service_handler(r, handle_tenant_timeline_create) + tenant_service_handler( + r, + handle_tenant_timeline_create, + RequestName("v1_tenant_timeline"), + ) }) // Tenant detail GET passthrough to shard zero .get("/v1/tenant/:tenant_id", |r| { - tenant_service_handler(r, handle_tenant_timeline_passthrough) + tenant_service_handler( + r, + handle_tenant_timeline_passthrough, + RequestName("v1_tenant_passthrough"), + ) }) // Timeline GET passthrough to shard zero. Note that the `*` in the URL is a wildcard: any future // timeline GET APIs will be implicitly included. 
.get("/v1/tenant/:tenant_id/timeline*", |r| { - tenant_service_handler(r, handle_tenant_timeline_passthrough) + tenant_service_handler( + r, + handle_tenant_timeline_passthrough, + RequestName("v1_tenant_timeline_passthrough"), + ) }) } diff --git a/control_plane/attachment_service/src/lib.rs b/control_plane/attachment_service/src/lib.rs index 4aff29f15b..8bcd5c0ac4 100644 --- a/control_plane/attachment_service/src/lib.rs +++ b/control_plane/attachment_service/src/lib.rs @@ -8,6 +8,7 @@ pub mod http; mod id_lock_map; pub mod metrics; mod node; +mod pageserver_client; pub mod persistence; mod reconciler; mod scheduler; diff --git a/control_plane/attachment_service/src/metrics.rs b/control_plane/attachment_service/src/metrics.rs index ffe093b9c8..ccf5e9b07c 100644 --- a/control_plane/attachment_service/src/metrics.rs +++ b/control_plane/attachment_service/src/metrics.rs @@ -1,32 +1,284 @@ -use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec}; +//! +//! This module provides metric definitions for the storage controller. +//! +//! All metrics are grouped in [`StorageControllerMetricGroup`]. [`StorageControllerMetrics`] holds +//! the mentioned metrics and their encoder. It's globally available via the [`METRICS_REGISTRY`] +//! constant. +//! +//! The rest of the code defines label group types and deals with converting outer types to labels. +//! +use bytes::Bytes; +use measured::{ + label::{LabelValue, StaticLabelSet}, + FixedCardinalityLabel, MetricGroup, +}; use once_cell::sync::Lazy; +use std::sync::Mutex; -pub(crate) struct ReconcilerMetrics { - pub(crate) spawned: IntCounter, - pub(crate) complete: IntCounterVec, -} +use crate::persistence::{DatabaseError, DatabaseOperation}; -impl ReconcilerMetrics { - // Labels used on [`Self::complete`] - pub(crate) const SUCCESS: &'static str = "ok"; - pub(crate) const ERROR: &'static str = "success"; - pub(crate) const CANCEL: &'static str = "cancel"; -} - -pub(crate) static RECONCILER: Lazy = Lazy::new(|| ReconcilerMetrics { - spawned: register_int_counter!( - "storage_controller_reconcile_spawn", - "Count of how many times we spawn a reconcile task", - ) - .expect("failed to define a metric"), - complete: register_int_counter_vec!( - "storage_controller_reconcile_complete", - "Reconciler tasks completed, broken down by success/failure/cancelled", - &["status"], - ) - .expect("failed to define a metric"), -}); +pub(crate) static METRICS_REGISTRY: Lazy = + Lazy::new(StorageControllerMetrics::default); pub fn preinitialize_metrics() { - Lazy::force(&RECONCILER); + Lazy::force(&METRICS_REGISTRY); +} + +pub(crate) struct StorageControllerMetrics { + pub(crate) metrics_group: StorageControllerMetricGroup, + encoder: Mutex, +} + +#[derive(measured::MetricGroup)] +pub(crate) struct StorageControllerMetricGroup { + /// Count of how many times we spawn a reconcile task + pub(crate) storage_controller_reconcile_spawn: measured::Counter, + /// Reconciler tasks completed, broken down by success/failure/cancelled + pub(crate) storage_controller_reconcile_complete: + measured::CounterVec, + + /// HTTP request status counters for handled requests + pub(crate) storage_controller_http_request_status: + measured::CounterVec, + /// HTTP request handler latency across all status codes + pub(crate) storage_controller_http_request_latency: + measured::HistogramVec, + + /// Count of HTTP requests to the pageserver that resulted in an error, + /// broken down by the pageserver node id, request name and method + pub(crate) 
storage_controller_pageserver_request_error: + measured::CounterVec, + + /// Latency of HTTP requests to the pageserver, broken down by pageserver + /// node id, request name and method. This include both successful and unsuccessful + /// requests. + pub(crate) storage_controller_pageserver_request_latency: + measured::HistogramVec, + + /// Count of pass-through HTTP requests to the pageserver that resulted in an error, + /// broken down by the pageserver node id, request name and method + pub(crate) storage_controller_passthrough_request_error: + measured::CounterVec, + + /// Latency of pass-through HTTP requests to the pageserver, broken down by pageserver + /// node id, request name and method. This include both successful and unsuccessful + /// requests. + pub(crate) storage_controller_passthrough_request_latency: + measured::HistogramVec, + + /// Count of errors in database queries, broken down by error type and operation. + pub(crate) storage_controller_database_query_error: + measured::CounterVec, + + /// Latency of database queries, broken down by operation. + pub(crate) storage_controller_database_query_latency: + measured::HistogramVec, +} + +impl StorageControllerMetrics { + pub(crate) fn encode(&self) -> Bytes { + let mut encoder = self.encoder.lock().unwrap(); + self.metrics_group.collect_into(&mut *encoder); + encoder.finish() + } +} + +impl Default for StorageControllerMetrics { + fn default() -> Self { + Self { + metrics_group: StorageControllerMetricGroup::new(), + encoder: Mutex::new(measured::text::TextEncoder::new()), + } + } +} + +impl StorageControllerMetricGroup { + pub(crate) fn new() -> Self { + Self { + storage_controller_reconcile_spawn: measured::Counter::new(), + storage_controller_reconcile_complete: measured::CounterVec::new( + ReconcileCompleteLabelGroupSet { + status: StaticLabelSet::new(), + }, + ), + storage_controller_http_request_status: measured::CounterVec::new( + HttpRequestStatusLabelGroupSet { + path: lasso::ThreadedRodeo::new(), + method: StaticLabelSet::new(), + status: StaticLabelSet::new(), + }, + ), + storage_controller_http_request_latency: measured::HistogramVec::new( + measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0), + ), + storage_controller_pageserver_request_error: measured::CounterVec::new( + PageserverRequestLabelGroupSet { + pageserver_id: lasso::ThreadedRodeo::new(), + path: lasso::ThreadedRodeo::new(), + method: StaticLabelSet::new(), + }, + ), + storage_controller_pageserver_request_latency: measured::HistogramVec::new( + measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0), + ), + storage_controller_passthrough_request_error: measured::CounterVec::new( + PageserverRequestLabelGroupSet { + pageserver_id: lasso::ThreadedRodeo::new(), + path: lasso::ThreadedRodeo::new(), + method: StaticLabelSet::new(), + }, + ), + storage_controller_passthrough_request_latency: measured::HistogramVec::new( + measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0), + ), + storage_controller_database_query_error: measured::CounterVec::new( + DatabaseQueryErrorLabelGroupSet { + operation: StaticLabelSet::new(), + error_type: StaticLabelSet::new(), + }, + ), + storage_controller_database_query_latency: measured::HistogramVec::new( + measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0), + ), + } + } +} + +#[derive(measured::LabelGroup)] +#[label(set = ReconcileCompleteLabelGroupSet)] +pub(crate) struct ReconcileCompleteLabelGroup { + pub(crate) status: ReconcileOutcome, +} + 
+#[derive(measured::LabelGroup)] +#[label(set = HttpRequestStatusLabelGroupSet)] +pub(crate) struct HttpRequestStatusLabelGroup<'a> { + #[label(dynamic_with = lasso::ThreadedRodeo)] + pub(crate) path: &'a str, + pub(crate) method: Method, + pub(crate) status: StatusCode, +} + +#[derive(measured::LabelGroup)] +#[label(set = HttpRequestLatencyLabelGroupSet)] +pub(crate) struct HttpRequestLatencyLabelGroup<'a> { + #[label(dynamic_with = lasso::ThreadedRodeo)] + pub(crate) path: &'a str, + pub(crate) method: Method, +} + +impl Default for HttpRequestLatencyLabelGroupSet { + fn default() -> Self { + Self { + path: lasso::ThreadedRodeo::new(), + method: StaticLabelSet::new(), + } + } +} + +#[derive(measured::LabelGroup, Clone)] +#[label(set = PageserverRequestLabelGroupSet)] +pub(crate) struct PageserverRequestLabelGroup<'a> { + #[label(dynamic_with = lasso::ThreadedRodeo)] + pub(crate) pageserver_id: &'a str, + #[label(dynamic_with = lasso::ThreadedRodeo)] + pub(crate) path: &'a str, + pub(crate) method: Method, +} + +impl Default for PageserverRequestLabelGroupSet { + fn default() -> Self { + Self { + pageserver_id: lasso::ThreadedRodeo::new(), + path: lasso::ThreadedRodeo::new(), + method: StaticLabelSet::new(), + } + } +} + +#[derive(measured::LabelGroup)] +#[label(set = DatabaseQueryErrorLabelGroupSet)] +pub(crate) struct DatabaseQueryErrorLabelGroup { + pub(crate) error_type: DatabaseErrorLabel, + pub(crate) operation: DatabaseOperation, +} + +#[derive(measured::LabelGroup)] +#[label(set = DatabaseQueryLatencyLabelGroupSet)] +pub(crate) struct DatabaseQueryLatencyLabelGroup { + pub(crate) operation: DatabaseOperation, +} + +#[derive(FixedCardinalityLabel)] +pub(crate) enum ReconcileOutcome { + #[label(rename = "ok")] + Success, + Error, + Cancel, +} + +#[derive(FixedCardinalityLabel, Clone)] +pub(crate) enum Method { + Get, + Put, + Post, + Delete, + Other, +} + +impl From for Method { + fn from(value: hyper::Method) -> Self { + if value == hyper::Method::GET { + Method::Get + } else if value == hyper::Method::PUT { + Method::Put + } else if value == hyper::Method::POST { + Method::Post + } else if value == hyper::Method::DELETE { + Method::Delete + } else { + Method::Other + } + } +} + +pub(crate) struct StatusCode(pub(crate) hyper::http::StatusCode); + +impl LabelValue for StatusCode { + fn visit(&self, v: V) -> V::Output { + v.write_int(self.0.as_u16() as u64) + } +} + +impl FixedCardinalityLabel for StatusCode { + fn cardinality() -> usize { + (100..1000).len() + } + + fn encode(&self) -> usize { + self.0.as_u16() as usize + } + + fn decode(value: usize) -> Self { + Self(hyper::http::StatusCode::from_u16(u16::try_from(value).unwrap()).unwrap()) + } +} + +#[derive(FixedCardinalityLabel)] +pub(crate) enum DatabaseErrorLabel { + Query, + Connection, + ConnectionPool, + Logical, +} + +impl DatabaseError { + pub(crate) fn error_label(&self) -> DatabaseErrorLabel { + match self { + Self::Query(_) => DatabaseErrorLabel::Query, + Self::Connection(_) => DatabaseErrorLabel::Connection, + Self::ConnectionPool(_) => DatabaseErrorLabel::ConnectionPool, + Self::Logical(_) => DatabaseErrorLabel::Logical, + } + } } diff --git a/control_plane/attachment_service/src/node.rs b/control_plane/attachment_service/src/node.rs index 4167782715..df40bff66f 100644 --- a/control_plane/attachment_service/src/node.rs +++ b/control_plane/attachment_service/src/node.rs @@ -12,7 +12,9 @@ use serde::Serialize; use tokio_util::sync::CancellationToken; use utils::{backoff, id::NodeId}; -use 
crate::{persistence::NodePersistence, scheduler::MaySchedule}; +use crate::{ + pageserver_client::PageserverClient, persistence::NodePersistence, scheduler::MaySchedule, +}; /// Represents the in-memory description of a Node. /// @@ -202,7 +204,7 @@ impl Node { cancel: &CancellationToken, ) -> Option> where - O: FnMut(mgmt_api::Client) -> F, + O: FnMut(PageserverClient) -> F, F: std::future::Future>, { fn is_fatal(e: &mgmt_api::Error) -> bool { @@ -224,8 +226,12 @@ impl Node { .build() .expect("Failed to construct HTTP client"); - let client = - mgmt_api::Client::from_client(http_client, self.base_url(), jwt.as_deref()); + let client = PageserverClient::from_client( + self.get_id(), + http_client, + self.base_url(), + jwt.as_deref(), + ); let node_cancel_fut = self.cancel.cancelled(); diff --git a/control_plane/attachment_service/src/pageserver_client.rs b/control_plane/attachment_service/src/pageserver_client.rs new file mode 100644 index 0000000000..8237229d7b --- /dev/null +++ b/control_plane/attachment_service/src/pageserver_client.rs @@ -0,0 +1,203 @@ +use pageserver_api::{ + models::{ + LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress, + TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo, + }, + shard::TenantShardId, +}; +use pageserver_client::mgmt_api::{Client, Result}; +use reqwest::StatusCode; +use utils::id::{NodeId, TimelineId}; + +/// Thin wrapper around [`pageserver_client::mgmt_api::Client`]. It allows the storage +/// controller to collect metrics in a non-intrusive manner. +#[derive(Debug, Clone)] +pub(crate) struct PageserverClient { + inner: Client, + node_id_label: String, +} + +macro_rules! measured_request { + ($name:literal, $method:expr, $node_id: expr, $invoke:expr) => {{ + let labels = crate::metrics::PageserverRequestLabelGroup { + pageserver_id: $node_id, + path: $name, + method: $method, + }; + + let latency = &crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_pageserver_request_latency; + let _timer_guard = latency.start_timer(labels.clone()); + + let res = $invoke; + + if res.is_err() { + let error_counters = &crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_pageserver_request_error; + error_counters.inc(labels) + } + + res + }}; +} + +impl PageserverClient { + pub(crate) fn new(node_id: NodeId, mgmt_api_endpoint: String, jwt: Option<&str>) -> Self { + Self { + inner: Client::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt), + node_id_label: node_id.0.to_string(), + } + } + + pub(crate) fn from_client( + node_id: NodeId, + raw_client: reqwest::Client, + mgmt_api_endpoint: String, + jwt: Option<&str>, + ) -> Self { + Self { + inner: Client::from_client(raw_client, mgmt_api_endpoint, jwt), + node_id_label: node_id.0.to_string(), + } + } + + pub(crate) async fn tenant_delete(&self, tenant_shard_id: TenantShardId) -> Result { + measured_request!( + "tenant", + crate::metrics::Method::Delete, + &self.node_id_label, + self.inner.tenant_delete(tenant_shard_id).await + ) + } + + pub(crate) async fn tenant_time_travel_remote_storage( + &self, + tenant_shard_id: TenantShardId, + timestamp: &str, + done_if_after: &str, + ) -> Result<()> { + measured_request!( + "tenant_time_travel_remote_storage", + crate::metrics::Method::Put, + &self.node_id_label, + self.inner + .tenant_time_travel_remote_storage(tenant_shard_id, timestamp, done_if_after) + .await + ) + } + + pub(crate) async fn tenant_secondary_download( + &self, + tenant_id: TenantShardId, 
+ wait: Option, + ) -> Result<(StatusCode, SecondaryProgress)> { + measured_request!( + "tenant_secondary_download", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner.tenant_secondary_download(tenant_id, wait).await + ) + } + + pub(crate) async fn location_config( + &self, + tenant_shard_id: TenantShardId, + config: LocationConfig, + flush_ms: Option, + lazy: bool, + ) -> Result<()> { + measured_request!( + "location_config", + crate::metrics::Method::Put, + &self.node_id_label, + self.inner + .location_config(tenant_shard_id, config, flush_ms, lazy) + .await + ) + } + + pub(crate) async fn list_location_config(&self) -> Result { + measured_request!( + "location_configs", + crate::metrics::Method::Get, + &self.node_id_label, + self.inner.list_location_config().await + ) + } + + pub(crate) async fn get_location_config( + &self, + tenant_shard_id: TenantShardId, + ) -> Result> { + measured_request!( + "location_config", + crate::metrics::Method::Get, + &self.node_id_label, + self.inner.get_location_config(tenant_shard_id).await + ) + } + + pub(crate) async fn timeline_create( + &self, + tenant_shard_id: TenantShardId, + req: &TimelineCreateRequest, + ) -> Result { + measured_request!( + "timeline", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner.timeline_create(tenant_shard_id, req).await + ) + } + + pub(crate) async fn timeline_delete( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + ) -> Result { + measured_request!( + "timeline", + crate::metrics::Method::Delete, + &self.node_id_label, + self.inner + .timeline_delete(tenant_shard_id, timeline_id) + .await + ) + } + + pub(crate) async fn tenant_shard_split( + &self, + tenant_shard_id: TenantShardId, + req: TenantShardSplitRequest, + ) -> Result { + measured_request!( + "tenant_shard_split", + crate::metrics::Method::Put, + &self.node_id_label, + self.inner.tenant_shard_split(tenant_shard_id, req).await + ) + } + + pub(crate) async fn timeline_list( + &self, + tenant_shard_id: &TenantShardId, + ) -> Result> { + measured_request!( + "timelines", + crate::metrics::Method::Get, + &self.node_id_label, + self.inner.timeline_list(tenant_shard_id).await + ) + } + + pub(crate) async fn get_utilization(&self) -> Result { + measured_request!( + "utilization", + crate::metrics::Method::Get, + &self.node_id_label, + self.inner.get_utilization().await + ) + } +} diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs index 209d8ff075..dafd52017b 100644 --- a/control_plane/attachment_service/src/persistence.rs +++ b/control_plane/attachment_service/src/persistence.rs @@ -19,6 +19,9 @@ use serde::{Deserialize, Serialize}; use utils::generation::Generation; use utils::id::{NodeId, TenantId}; +use crate::metrics::{ + DatabaseQueryErrorLabelGroup, DatabaseQueryLatencyLabelGroup, METRICS_REGISTRY, +}; use crate::node::Node; /// ## What do we store? 
@@ -75,6 +78,25 @@ pub(crate) enum DatabaseError { Logical(String), } +#[derive(measured::FixedCardinalityLabel, Clone)] +pub(crate) enum DatabaseOperation { + InsertNode, + UpdateNode, + DeleteNode, + ListNodes, + BeginShardSplit, + CompleteShardSplit, + AbortShardSplit, + Detach, + ReAttach, + IncrementGeneration, + ListTenantShards, + InsertTenantShards, + UpdateTenantShard, + DeleteTenant, + UpdateTenantConfig, +} + #[must_use] pub(crate) enum AbortShardSplitStatus { /// We aborted the split in the database by reverting to the parent shards @@ -115,6 +137,34 @@ impl Persistence { } } + /// Wraps `with_conn` in order to collect latency and error metrics + async fn with_measured_conn(&self, op: DatabaseOperation, func: F) -> DatabaseResult + where + F: Fn(&mut PgConnection) -> DatabaseResult + Send + 'static, + R: Send + 'static, + { + let latency = &METRICS_REGISTRY + .metrics_group + .storage_controller_database_query_latency; + let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { + operation: op.clone(), + }); + + let res = self.with_conn(func).await; + + if let Err(err) = &res { + let error_counter = &METRICS_REGISTRY + .metrics_group + .storage_controller_database_query_error; + error_counter.inc(DatabaseQueryErrorLabelGroup { + error_type: err.error_label(), + operation: op, + }) + } + + res + } + /// Call the provided function in a tokio blocking thread, with a Diesel database connection. async fn with_conn(&self, func: F) -> DatabaseResult where @@ -130,21 +180,27 @@ impl Persistence { /// When a node is first registered, persist it before using it for anything pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> { let np = node.to_persistent(); - self.with_conn(move |conn| -> DatabaseResult<()> { - diesel::insert_into(crate::schema::nodes::table) - .values(&np) - .execute(conn)?; - Ok(()) - }) + self.with_measured_conn( + DatabaseOperation::InsertNode, + move |conn| -> DatabaseResult<()> { + diesel::insert_into(crate::schema::nodes::table) + .values(&np) + .execute(conn)?; + Ok(()) + }, + ) .await } /// At startup, populate the list of nodes which our shards may be placed on pub(crate) async fn list_nodes(&self) -> DatabaseResult> { let nodes: Vec = self - .with_conn(move |conn| -> DatabaseResult<_> { - Ok(crate::schema::nodes::table.load::(conn)?) - }) + .with_measured_conn( + DatabaseOperation::ListNodes, + move |conn| -> DatabaseResult<_> { + Ok(crate::schema::nodes::table.load::(conn)?) + }, + ) .await?; tracing::info!("list_nodes: loaded {} nodes", nodes.len()); @@ -159,7 +215,7 @@ impl Persistence { ) -> DatabaseResult<()> { use crate::schema::nodes::dsl::*; let updated = self - .with_conn(move |conn| { + .with_measured_conn(DatabaseOperation::UpdateNode, move |conn| { let updated = diesel::update(nodes) .filter(node_id.eq(input_node_id.0 as i64)) .set((scheduling_policy.eq(String::from(input_scheduling)),)) @@ -181,9 +237,12 @@ impl Persistence { /// be enriched at runtime with state discovered on pageservers. pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult> { let loaded = self - .with_conn(move |conn| -> DatabaseResult<_> { - Ok(crate::schema::tenant_shards::table.load::(conn)?) - }) + .with_measured_conn( + DatabaseOperation::ListTenantShards, + move |conn| -> DatabaseResult<_> { + Ok(crate::schema::tenant_shards::table.load::(conn)?) 
+ }, + ) .await?; if loaded.is_empty() { @@ -260,17 +319,20 @@ impl Persistence { shards: Vec, ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_conn(move |conn| -> DatabaseResult<()> { - conn.transaction(|conn| -> QueryResult<()> { - for tenant in &shards { - diesel::insert_into(tenant_shards) - .values(tenant) - .execute(conn)?; - } + self.with_measured_conn( + DatabaseOperation::InsertTenantShards, + move |conn| -> DatabaseResult<()> { + conn.transaction(|conn| -> QueryResult<()> { + for tenant in &shards { + diesel::insert_into(tenant_shards) + .values(tenant) + .execute(conn)?; + } + Ok(()) + })?; Ok(()) - })?; - Ok(()) - }) + }, + ) .await } @@ -278,25 +340,31 @@ impl Persistence { /// the tenant from memory on this server. pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_conn(move |conn| -> DatabaseResult<()> { - diesel::delete(tenant_shards) - .filter(tenant_id.eq(del_tenant_id.to_string())) - .execute(conn)?; + self.with_measured_conn( + DatabaseOperation::DeleteTenant, + move |conn| -> DatabaseResult<()> { + diesel::delete(tenant_shards) + .filter(tenant_id.eq(del_tenant_id.to_string())) + .execute(conn)?; - Ok(()) - }) + Ok(()) + }, + ) .await } pub(crate) async fn delete_node(&self, del_node_id: NodeId) -> DatabaseResult<()> { use crate::schema::nodes::dsl::*; - self.with_conn(move |conn| -> DatabaseResult<()> { - diesel::delete(nodes) - .filter(node_id.eq(del_node_id.0 as i64)) - .execute(conn)?; + self.with_measured_conn( + DatabaseOperation::DeleteNode, + move |conn| -> DatabaseResult<()> { + diesel::delete(nodes) + .filter(node_id.eq(del_node_id.0 as i64)) + .execute(conn)?; - Ok(()) - }) + Ok(()) + }, + ) .await } @@ -310,7 +378,7 @@ impl Persistence { ) -> DatabaseResult> { use crate::schema::tenant_shards::dsl::*; let updated = self - .with_conn(move |conn| { + .with_measured_conn(DatabaseOperation::ReAttach, move |conn| { let rows_updated = diesel::update(tenant_shards) .filter(generation_pageserver.eq(node_id.0 as i64)) .set(generation.eq(generation + 1)) @@ -360,7 +428,7 @@ impl Persistence { ) -> anyhow::Result { use crate::schema::tenant_shards::dsl::*; let updated = self - .with_conn(move |conn| { + .with_measured_conn(DatabaseOperation::IncrementGeneration, move |conn| { let updated = diesel::update(tenant_shards) .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) @@ -404,7 +472,7 @@ impl Persistence { ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_conn(move |conn| { + self.with_measured_conn(DatabaseOperation::UpdateTenantShard, move |conn| { let query = diesel::update(tenant_shards) .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) @@ -445,7 +513,7 @@ impl Persistence { ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_conn(move |conn| { + self.with_measured_conn(DatabaseOperation::UpdateTenantConfig, move |conn| { diesel::update(tenant_shards) .filter(tenant_id.eq(input_tenant_id.to_string())) .set((config.eq(serde_json::to_string(&input_config).unwrap()),)) @@ -460,7 +528,7 @@ impl Persistence { pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> { use crate::schema::tenant_shards::dsl::*; - self.with_conn(move |conn| { + self.with_measured_conn(DatabaseOperation::Detach, move |conn| { let 
updated = diesel::update(tenant_shards) .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) @@ -490,7 +558,7 @@ impl Persistence { parent_to_children: Vec<(TenantShardId, Vec)>, ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_conn(move |conn| -> DatabaseResult<()> { + self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| -> DatabaseResult<()> { conn.transaction(|conn| -> DatabaseResult<()> { // Mark parent shards as splitting @@ -554,26 +622,29 @@ impl Persistence { old_shard_count: ShardCount, ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_conn(move |conn| -> DatabaseResult<()> { - conn.transaction(|conn| -> QueryResult<()> { - // Drop parent shards - diesel::delete(tenant_shards) - .filter(tenant_id.eq(split_tenant_id.to_string())) - .filter(shard_count.eq(old_shard_count.literal() as i32)) - .execute(conn)?; + self.with_measured_conn( + DatabaseOperation::CompleteShardSplit, + move |conn| -> DatabaseResult<()> { + conn.transaction(|conn| -> QueryResult<()> { + // Drop parent shards + diesel::delete(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .filter(shard_count.eq(old_shard_count.literal() as i32)) + .execute(conn)?; - // Clear sharding flag - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(split_tenant_id.to_string())) - .set((splitting.eq(0),)) - .execute(conn)?; - debug_assert!(updated > 0); + // Clear sharding flag + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .set((splitting.eq(0),)) + .execute(conn)?; + debug_assert!(updated > 0); + + Ok(()) + })?; Ok(()) - })?; - - Ok(()) - }) + }, + ) .await } @@ -585,40 +656,44 @@ impl Persistence { new_shard_count: ShardCount, ) -> DatabaseResult { use crate::schema::tenant_shards::dsl::*; - self.with_conn(move |conn| -> DatabaseResult { - let aborted = conn.transaction(|conn| -> DatabaseResult { - // Clear the splitting state on parent shards - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(split_tenant_id.to_string())) - .filter(shard_count.ne(new_shard_count.literal() as i32)) - .set((splitting.eq(0),)) - .execute(conn)?; + self.with_measured_conn( + DatabaseOperation::AbortShardSplit, + move |conn| -> DatabaseResult { + let aborted = + conn.transaction(|conn| -> DatabaseResult { + // Clear the splitting state on parent shards + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .filter(shard_count.ne(new_shard_count.literal() as i32)) + .set((splitting.eq(0),)) + .execute(conn)?; - // Parent shards are already gone: we cannot abort. - if updated == 0 { - return Ok(AbortShardSplitStatus::Complete); - } + // Parent shards are already gone: we cannot abort. + if updated == 0 { + return Ok(AbortShardSplitStatus::Complete); + } - // Sanity check: if parent shards were present, their cardinality should - // be less than the number of child shards. - if updated >= new_shard_count.count() as usize { - return Err(DatabaseError::Logical(format!( - "Unexpected parent shard count {updated} while aborting split to \ + // Sanity check: if parent shards were present, their cardinality should + // be less than the number of child shards. 
+ if updated >= new_shard_count.count() as usize { + return Err(DatabaseError::Logical(format!( + "Unexpected parent shard count {updated} while aborting split to \ count {new_shard_count:?} on tenant {split_tenant_id}" - ))); - } + ))); + } - // Erase child shards - diesel::delete(tenant_shards) - .filter(tenant_id.eq(split_tenant_id.to_string())) - .filter(shard_count.eq(new_shard_count.literal() as i32)) - .execute(conn)?; + // Erase child shards + diesel::delete(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .filter(shard_count.eq(new_shard_count.literal() as i32)) + .execute(conn)?; - Ok(AbortShardSplitStatus::Aborted) - })?; + Ok(AbortShardSplitStatus::Aborted) + })?; - Ok(aborted) - }) + Ok(aborted) + }, + ) .await } } diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs index f00f35c74b..32d2cb2643 100644 --- a/control_plane/attachment_service/src/reconciler.rs +++ b/control_plane/attachment_service/src/reconciler.rs @@ -1,3 +1,4 @@ +use crate::pageserver_client::PageserverClient; use crate::persistence::Persistence; use crate::service; use hyper::StatusCode; @@ -243,8 +244,11 @@ impl Reconciler { tenant_shard_id: TenantShardId, node: &Node, ) -> anyhow::Result> { - let client = - mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref()); + let client = PageserverClient::new( + node.get_id(), + node.base_url(), + self.service_config.jwt_token.as_deref(), + ); let timelines = client.timeline_list(&tenant_shard_id).await?; Ok(timelines diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index e38007c7af..98377cace6 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -27,6 +27,7 @@ use pageserver_api::{ models::{SecondaryProgress, TenantConfigRequest}, }; +use crate::pageserver_client::PageserverClient; use pageserver_api::{ models::{ self, LocationConfig, LocationConfigListResponse, LocationConfigMode, @@ -551,7 +552,11 @@ impl Service { break; } - let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref()); + let client = PageserverClient::new( + node.get_id(), + node.base_url(), + self.config.jwt_token.as_deref(), + ); match client .location_config( tenant_shard_id, @@ -2096,8 +2101,11 @@ impl Service { }) .collect::>(); for tenant_shard_id in shard_ids { - let client = - mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref()); + let client = PageserverClient::new( + node.get_id(), + node.base_url(), + self.config.jwt_token.as_deref(), + ); tracing::info!("Doing time travel recovery for shard {tenant_shard_id}",); @@ -2149,7 +2157,11 @@ impl Service { // Issue concurrent requests to all shards' locations let mut futs = FuturesUnordered::new(); for (tenant_shard_id, node) in targets { - let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref()); + let client = PageserverClient::new( + node.get_id(), + node.base_url(), + self.config.jwt_token.as_deref(), + ); futs.push(async move { let result = client .tenant_secondary_download(tenant_shard_id, wait) @@ -2242,7 +2254,11 @@ impl Service { // Phase 1: delete on the pageservers let mut any_pending = false; for (tenant_shard_id, node) in targets { - let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref()); + let client = PageserverClient::new( + node.get_id(), + node.base_url(), + 
self.config.jwt_token.as_deref(), + ); // TODO: this, like many other places, requires proper retry handling for 503, timeout: those should not // surface immediately as an error to our caller. let status = client.tenant_delete(tenant_shard_id).await.map_err(|e| { @@ -2354,7 +2370,7 @@ impl Service { tenant_shard_id, create_req.new_timeline_id, ); - let client = mgmt_api::Client::new(node.base_url(), jwt.as_deref()); + let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); client .timeline_create(tenant_shard_id, &create_req) @@ -2478,7 +2494,7 @@ impl Service { "Deleting timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}", ); - let client = mgmt_api::Client::new(node.base_url(), jwt.as_deref()); + let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); client .timeline_delete(tenant_shard_id, timeline_id) .await @@ -2519,11 +2535,11 @@ impl Service { } /// When you need to send an HTTP request to the pageserver that holds shard0 of a tenant, this - /// function looks it up and returns the url. If the tenant isn't found, returns Err(ApiError::NotFound) - pub(crate) fn tenant_shard0_baseurl( + /// function looks up and returns node. If the tenant isn't found, returns Err(ApiError::NotFound) + pub(crate) fn tenant_shard0_node( &self, tenant_id: TenantId, - ) -> Result<(String, TenantShardId), ApiError> { + ) -> Result<(Node, TenantShardId), ApiError> { let locked = self.inner.read().unwrap(); let Some((tenant_shard_id, shard)) = locked .tenants @@ -2555,7 +2571,7 @@ impl Service { ))); }; - Ok((node.base_url(), *tenant_shard_id)) + Ok((node.clone(), *tenant_shard_id)) } pub(crate) fn tenant_locate( @@ -3215,7 +3231,11 @@ impl Service { node, child_ids, } = target; - let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref()); + let client = PageserverClient::new( + node.get_id(), + node.base_url(), + self.config.jwt_token.as_deref(), + ); let response = client .tenant_shard_split( *parent_id, diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs index 9dd368bf41..83c921dc58 100644 --- a/control_plane/attachment_service/src/tenant_state.rs +++ b/control_plane/attachment_service/src/tenant_state.rs @@ -4,7 +4,10 @@ use std::{ time::Duration, }; -use crate::{metrics, persistence::TenantShardPersistence}; +use crate::{ + metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome}, + persistence::TenantShardPersistence, +}; use pageserver_api::controller_api::PlacementPolicy; use pageserver_api::{ models::{LocationConfig, LocationConfigMode, TenantConfig}, @@ -718,7 +721,10 @@ impl TenantState { let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq, tenant_id=%reconciler.tenant_shard_id.tenant_id, shard_id=%reconciler.tenant_shard_id.shard_slug()); - metrics::RECONCILER.spawned.inc(); + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_reconcile_spawn + .inc(); let result_tx = result_tx.clone(); let join_handle = tokio::task::spawn( async move { @@ -736,10 +742,12 @@ impl TenantState { // TODO: wrap all remote API operations in cancellation check // as well. 
if reconciler.cancel.is_cancelled() { - metrics::RECONCILER - .complete - .with_label_values(&[metrics::ReconcilerMetrics::CANCEL]) - .inc(); + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_reconcile_complete + .inc(ReconcileCompleteLabelGroup { + status: ReconcileOutcome::Cancel, + }); return; } @@ -754,18 +762,18 @@ impl TenantState { } // Update result counter - match &result { - Ok(_) => metrics::RECONCILER - .complete - .with_label_values(&[metrics::ReconcilerMetrics::SUCCESS]), - Err(ReconcileError::Cancel) => metrics::RECONCILER - .complete - .with_label_values(&[metrics::ReconcilerMetrics::CANCEL]), - Err(_) => metrics::RECONCILER - .complete - .with_label_values(&[metrics::ReconcilerMetrics::ERROR]), - } - .inc(); + let outcome_label = match &result { + Ok(_) => ReconcileOutcome::Success, + Err(ReconcileError::Cancel) => ReconcileOutcome::Cancel, + Err(_) => ReconcileOutcome::Error, + }; + + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_reconcile_complete + .inc(ReconcileCompleteLabelGroup { + status: outcome_label, + }); result_tx .send(ReconcileResult { diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index a60971abf0..f8a5f68131 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -245,7 +245,7 @@ impl std::io::Write for ChannelWriter { } } -async fn prometheus_metrics_handler(_req: Request) -> Result, ApiError> { +pub async fn prometheus_metrics_handler(_req: Request) -> Result, ApiError> { SERVE_METRICS_COUNT.inc(); let started_at = std::time::Instant::now(); @@ -367,7 +367,6 @@ pub fn make_router() -> RouterBuilder { .middleware(Middleware::post_with_info( add_request_id_header_to_response, )) - .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) .err_handler(route_error_handler) } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 229f3ae98f..26f23fb8c2 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -36,6 +36,7 @@ use tokio_util::sync::CancellationToken; use tracing::*; use utils::auth::JwtAuth; use utils::failpoint_support::failpoints_handler; +use utils::http::endpoint::prometheus_metrics_handler; use utils::http::endpoint::request_span; use utils::http::json::json_request_or_empty_body; use utils::http::request::{get_request_param, must_get_query_param, parse_query_param}; @@ -2266,6 +2267,7 @@ pub fn make_router( Ok(router .data(state) + .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) .get("/v1/status", |r| api_handler(r, status_handler)) .put("/v1/failpoints", |r| { testing_api_handler("manage failpoints", r, failpoints_handler) diff --git a/proxy/src/http/health_server.rs b/proxy/src/http/health_server.rs index 6186ddde0d..cbb17ebcb7 100644 --- a/proxy/src/http/health_server.rs +++ b/proxy/src/http/health_server.rs @@ -2,14 +2,21 @@ use anyhow::{anyhow, bail}; use hyper::{Body, Request, Response, StatusCode}; use std::{convert::Infallible, net::TcpListener}; use tracing::info; -use utils::http::{endpoint, error::ApiError, json::json_response, RouterBuilder, RouterService}; +use utils::http::{ + endpoint::{self, prometheus_metrics_handler, request_span}, + error::ApiError, + json::json_response, + RouterBuilder, RouterService, +}; async fn status_handler(_: Request) -> Result, ApiError> { json_response(StatusCode::OK, "") } fn make_router() -> RouterBuilder { - endpoint::make_router().get("/v1/status", status_handler) + endpoint::make_router() + .get("/metrics", |r| 
request_span(r, prometheus_metrics_handler)) + .get("/v1/status", status_handler) } pub async fn task_main(http_listener: TcpListener) -> anyhow::Result { diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index a0c0c7ca4c..9ce26e6c5d 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -20,7 +20,7 @@ use std::io::Write as _; use tokio::sync::mpsc; use tokio_stream::wrappers::ReceiverStream; use tracing::{info_span, Instrument}; -use utils::http::endpoint::{request_span, ChannelWriter}; +use utils::http::endpoint::{prometheus_metrics_handler, request_span, ChannelWriter}; use crate::debug_dump::TimelineDigestRequest; use crate::receive_wal::WalReceiverState; @@ -515,6 +515,7 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder router .data(Arc::new(conf)) .data(auth) + .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) .get("/v1/status", |r| request_span(r, status_handler)) .put("/v1/failpoints", |r| { request_span(r, move |r| async { diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index cb58c640c3..57b2b2b0a1 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -278,18 +278,14 @@ def test_sharding_split_smoke( # Check that no cancelled or errored reconciliations occurred: this test does no # failure injection and should run clean. - assert ( - env.storage_controller.get_metric_value( - "storage_controller_reconcile_complete_total", filter={"status": "cancel"} - ) - is None + cancelled_reconciles = env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "cancel"} ) - assert ( - env.storage_controller.get_metric_value( - "storage_controller_reconcile_complete_total", filter={"status": "error"} - ) - is None + errored_reconciles = env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "error"} ) + assert cancelled_reconciles is not None and int(cancelled_reconciles) == 0 + assert errored_reconciles is not None and int(errored_reconciles) == 0 env.storage_controller.consistency_check() From 59cdee749edcfde5e57bc1eeea7df25b6a0af485 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 21 Mar 2024 12:06:57 +0000 Subject: [PATCH 0449/1571] storage controller: fixes to secondary location handling (#7169) Stacks on: - https://github.com/neondatabase/neon/pull/7165 Fixes while working on background optimization of scheduling after a split: - When a tenant has secondary locations, we weren't detaching the parent shards' secondary locations when doing a split - When a reconciler detaches a location, it was feeding back a locationconf with `Detached` mode in its `observed` object, whereas it should omit that location. This could cause the background reconcile task to keep kicking off no-op reconcilers forever (harmless but annoying). - During shard split, we were scheduling secondary locations for the child shards, but no reconcile was run for these until the next time the background reconcile task ran. Creating these ASAP is useful, because they'll be used shortly after a shard split as the destination locations for migrating the new shards to different nodes. 
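For illustration, the second fix boils down to the following rule for the reconciler's `observed` map (a minimal standalone sketch with placeholder types, not the real `ObservedStateLocation`/reconciler code): a location that was successfully detached is removed from the map rather than being recorded with `Detached` mode.

```rust
use std::collections::HashMap;

// Placeholder types for this sketch; the real code uses NodeId,
// LocationConfigMode and ObservedStateLocation.
type NodeId = u64;

#[derive(Debug, PartialEq)]
enum Mode {
    AttachedSingle,
    Secondary,
    Detached,
}

type Observed = HashMap<NodeId, Mode>;

/// Record what a pageserver holds for one shard after a successful
/// location_config call. A Detached result means the node holds nothing,
/// so the entry is removed instead of being stored as "observed: Detached";
/// otherwise the background reconcile loop keeps seeing a diff and spawns
/// no-op reconcilers forever.
fn record_observed(observed: &mut Observed, node: NodeId, applied: Mode) {
    match applied {
        Mode::Detached => {
            observed.remove(&node);
        }
        other => {
            observed.insert(node, other);
        }
    }
}

fn main() {
    let mut observed = Observed::new();
    record_observed(&mut observed, 1, Mode::AttachedSingle);
    record_observed(&mut observed, 2, Mode::Secondary);
    record_observed(&mut observed, 2, Mode::Detached);
    assert_eq!(observed.len(), 1);
    assert_eq!(observed.get(&1), Some(&Mode::AttachedSingle));
    println!("observed after detach: {observed:?}");
}
```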
--- .../attachment_service/src/reconciler.rs | 22 ++- .../attachment_service/src/service.rs | 70 ++++++++- pageserver/src/tenant/secondary/downloader.rs | 6 +- test_runner/fixtures/neon_fixtures.py | 12 ++ test_runner/regress/test_sharding.py | 137 +++++++++++++----- 5 files changed, 202 insertions(+), 45 deletions(-) diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs index 32d2cb2643..a62357f9ac 100644 --- a/control_plane/attachment_service/src/reconciler.rs +++ b/control_plane/attachment_service/src/reconciler.rs @@ -118,6 +118,15 @@ impl Reconciler { flush_ms: Option, lazy: bool, ) -> Result<(), ReconcileError> { + if !node.is_available() && config.mode == LocationConfigMode::Detached { + // Attempts to detach from offline nodes may be imitated without doing I/O: a node which is offline + // will get fully reconciled wrt the shard's intent state when it is reactivated, irrespective of + // what we put into `observed`, in [`crate::service::Service::node_activate_reconcile`] + tracing::info!("Node {node} is unavailable during detach: proceeding anyway, it will be detached on next activation"); + self.observed.locations.remove(&node.get_id()); + return Ok(()); + } + self.observed .locations .insert(node.get_id(), ObservedStateLocation { conf: None }); @@ -150,9 +159,16 @@ impl Reconciler { }; tracing::info!("location_config({node}) complete: {:?}", config); - self.observed - .locations - .insert(node.get_id(), ObservedStateLocation { conf: Some(config) }); + match config.mode { + LocationConfigMode::Detached => { + self.observed.locations.remove(&node.get_id()); + } + _ => { + self.observed + .locations + .insert(node.get_id(), ObservedStateLocation { conf: Some(config) }); + } + } Ok(()) } diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index 98377cace6..c886afaf1c 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -210,6 +210,7 @@ struct ShardSplitParams { new_stripe_size: Option, targets: Vec, policy: PlacementPolicy, + config: TenantConfig, shard_ident: ShardIdentity, } @@ -2741,7 +2742,7 @@ impl Service { let detach_locations: Vec<(Node, TenantShardId)> = { let mut detach_locations = Vec::new(); let mut locked = self.inner.write().unwrap(); - let (nodes, tenants, _scheduler) = locked.parts_mut(); + let (nodes, tenants, scheduler) = locked.parts_mut(); for (tenant_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(op.tenant_id)) @@ -2774,6 +2775,13 @@ impl Service { tracing::info!("Restoring parent shard {tenant_shard_id}"); shard.splitting = SplitState::Idle; + if let Err(e) = shard.schedule(scheduler) { + // If this shard can't be scheduled now (perhaps due to offline nodes or + // capacity issues), that must not prevent us rolling back a split. In this + // case it should be eventually scheduled in the background. + tracing::warn!("Failed to schedule {tenant_shard_id} during shard abort: {e}") + } + self.maybe_reconcile_shard(shard, nodes); } @@ -2865,7 +2873,7 @@ impl Service { .map(|(shard_id, _)| *shard_id) .collect::>(); - let (_nodes, tenants, scheduler) = locked.parts_mut(); + let (nodes, tenants, scheduler) = locked.parts_mut(); for parent_id in parent_ids { let child_ids = parent_id.split(new_shard_count); @@ -2932,6 +2940,8 @@ impl Service { // find a secondary (e.g. because cluster is overloaded). 
tracing::warn!("Failed to schedule child shard {child}: {e}"); } + // In the background, attach secondary locations for the new shards + self.maybe_reconcile_shard(&mut child_state, nodes); tenants.insert(child, child_state); response.new_shards.push(child); @@ -2996,6 +3006,7 @@ impl Service { ))); let mut policy = None; + let mut config = None; let mut shard_ident = None; // Validate input, and calculate which shards we will create let (old_shard_count, targets) = @@ -3052,6 +3063,9 @@ impl Service { if shard_ident.is_none() { shard_ident = Some(shard.shard); } + if config.is_none() { + config = Some(shard.config.clone()); + } if tenant_shard_id.shard_count.count() == split_req.new_shard_count { tracing::info!( @@ -3070,8 +3084,6 @@ impl Service { .get(&node_id) .expect("Pageservers may not be deleted while referenced"); - // TODO: if any reconciliation is currently in progress for this shard, wait for it. - targets.push(ShardSplitTarget { parent_id: *tenant_shard_id, node: node.clone(), @@ -3114,6 +3126,7 @@ impl Service { shard_ident.unwrap() }; let policy = policy.unwrap(); + let config = config.unwrap(); Ok(ShardSplitAction::Split(ShardSplitParams { old_shard_count, @@ -3121,6 +3134,7 @@ impl Service { new_stripe_size: split_req.new_stripe_size, targets, policy, + config, shard_ident, })) } @@ -3140,11 +3154,49 @@ impl Service { old_shard_count, new_shard_count, new_stripe_size, - targets, + mut targets, policy, + config, shard_ident, } = params; + // Drop any secondary locations: pageservers do not support splitting these, and in any case the + // end-state for a split tenant will usually be to have secondary locations on different nodes. + // The reconciliation calls in this block also implicitly cancel+barrier wrt any ongoing reconciliation + // at the time of split. + let waiters = { + let mut locked = self.inner.write().unwrap(); + let mut waiters = Vec::new(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + for target in &mut targets { + let Some(shard) = tenants.get_mut(&target.parent_id) else { + // Paranoia check: this shouldn't happen: we have the oplock for this tenant ID. + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Shard {} not found", + target.parent_id + ))); + }; + + if shard.intent.get_attached() != &Some(target.node.get_id()) { + // Paranoia check: this shouldn't happen: we have the oplock for this tenant ID. + return Err(ApiError::Conflict(format!( + "Shard {} unexpectedly rescheduled during split", + target.parent_id + ))); + } + + // Irrespective of PlacementPolicy, clear secondary locations from intent + shard.intent.clear_secondary(scheduler); + + // Run Reconciler to execute detach fo secondary locations. + if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) { + waiters.push(waiter); + } + } + waiters + }; + self.await_waiters(waiters, RECONCILE_TIMEOUT).await?; + // Before creating any new child shards in memory or on the pageservers, persist them: this // enables us to ensure that we will always be able to clean up if something goes wrong. 
This also // acts as the protection against two concurrent attempts to split: one of them will get a database @@ -3173,8 +3225,7 @@ impl Service { generation: None, generation_pageserver: Some(target.node.get_id().0 as i64), placement_policy: serde_json::to_string(&policy).unwrap(), - // TODO: get the config out of the map - config: serde_json::to_string(&TenantConfig::default()).unwrap(), + config: serde_json::to_string(&config).unwrap(), splitting: SplitState::Splitting, }); } @@ -3363,6 +3414,11 @@ impl Service { // If we were already attached to something, demote that to a secondary if let Some(old_attached) = old_attached { if n > 0 { + // Remove other secondaries to make room for the location we'll demote + while shard.intent.get_secondary().len() >= n { + shard.intent.pop_secondary(scheduler); + } + shard.intent.push_secondary(scheduler, old_attached); } } diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 82af7ed83b..40f19e3b05 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -11,6 +11,7 @@ use crate::{ disk_usage_eviction_task::{ finite_f32, DiskUsageEvictionInfo, EvictionCandidate, EvictionLayer, EvictionSecondaryLayer, }, + is_temporary, metrics::SECONDARY_MODE, tenant::{ config::SecondaryLocationConfig, @@ -961,7 +962,10 @@ async fn init_timeline_state( // Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant. warn!(path=?dentry.path(), "found legacy metadata file, these should have been removed in load_tenant_config"); continue; - } else if crate::is_temporary(&file_path) || is_temp_download_file(&file_path) { + } else if crate::is_temporary(&file_path) + || is_temp_download_file(&file_path) + || is_temporary(&file_path) + { // Temporary files are frequently left behind from restarting during downloads tracing::info!("Cleaning up temporary file {file_path}"); if let Err(e) = tokio::fs::remove_file(&file_path) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 1d30c45278..f8994a8dcc 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2150,6 +2150,18 @@ class NeonStorageController(MetricsGetter): shards: list[dict[str, Any]] = body["shards"] return shards + def tenant_describe(self, tenant_id: TenantId): + """ + :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr: str, "listen_http_port: int} + """ + response = self.request( + "GET", + f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}", + headers=self.headers(TokenScope.ADMIN), + ) + response.raise_for_status() + return response.json() + def tenant_shard_split( self, tenant_id: TenantId, shard_count: int, shard_stripe_size: Optional[int] = None ) -> list[TenantShardId]: diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 57b2b2b0a1..e6318aff68 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -1,5 +1,6 @@ import os import time +from collections import defaultdict from typing import Dict, List, Optional, Union import pytest @@ -13,7 +14,7 @@ from fixtures.neon_fixtures import ( tenant_get_shards, ) from fixtures.remote_storage import s3_storage -from fixtures.types import Lsn, TenantShardId, TimelineId +from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.utils import 
wait_until from fixtures.workload import Workload from pytest_httpserver import HTTPServer @@ -159,11 +160,20 @@ def test_sharding_split_smoke( neon_env_builder.preserve_database_files = True - env = neon_env_builder.init_start( - initial_tenant_shard_count=shard_count, initial_tenant_shard_stripe_size=stripe_size + non_default_tenant_config = {"gc_horizon": 77 * 1024 * 1024} + + env = neon_env_builder.init_configs(True) + neon_env_builder.start() + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant( + tenant_id, + timeline_id, + shard_count=shard_count, + shard_stripe_size=stripe_size, + placement_policy='{"Attached": 1}', + conf=non_default_tenant_config, ) - tenant_id = env.initial_tenant - timeline_id = env.initial_timeline workload = Workload(env, tenant_id, timeline_id, branch_name="main") workload.init() @@ -223,6 +233,14 @@ def test_sharding_split_smoke( # Before split, old shards exist assert shards_on_disk(old_shard_ids) + # Before split, we have done one reconcile for each shard + assert ( + env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "ok"} + ) + == shard_count + ) + env.storage_controller.tenant_shard_split(tenant_id, shard_count=split_shard_count) post_split_pageserver_ids = [loc["node_id"] for loc in env.storage_controller.locate(tenant_id)] @@ -268,13 +286,20 @@ def test_sharding_split_smoke( workload.validate() - # Check that we didn't do any spurious reconciliations. - # Total number of reconciles should have been one per original shard, plus - # one for each shard that was migrated. + # Assert on how many reconciles happened during the process. This is something of an + # implementation detail, but it is useful to detect any bugs that might generate spurious + # extra reconcile iterations. + # + # We'll have: + # - shard_count reconciles for the original setup of the tenant + # - shard_count reconciles for detaching the original secondary locations during split + # - split_shard_count reconciles during shard splitting, for setting up secondaries. + # - shard_count reconciles for the migrations we did to move child shards away from their split location + expect_reconciles = shard_count * 2 + split_shard_count + shard_count reconcile_ok = env.storage_controller.get_metric_value( "storage_controller_reconcile_complete_total", filter={"status": "ok"} ) - assert reconcile_ok == shard_count + split_shard_count // 2 + assert reconcile_ok == expect_reconciles # Check that no cancelled or errored reconciliations occurred: this test does no # failure injection and should run clean. 
@@ -289,14 +314,34 @@ def test_sharding_split_smoke( env.storage_controller.consistency_check() - # Validate pageserver state - shards_exist: list[TenantShardId] = [] - for pageserver in env.pageservers: - locations = pageserver.http_client().tenant_list_locations() - shards_exist.extend(TenantShardId.parse(s[0]) for s in locations["tenant_shards"]) + def get_node_shard_counts(env: NeonEnv, tenant_ids): + total: defaultdict[int, int] = defaultdict(int) + attached: defaultdict[int, int] = defaultdict(int) + for tid in tenant_ids: + for shard in env.storage_controller.tenant_describe(tid)["shards"]: + log.info( + f"{shard['tenant_shard_id']}: attached={shard['node_attached']}, secondary={shard['node_secondary']} " + ) + for node in shard["node_secondary"]: + total[int(node)] += 1 + attached[int(shard["node_attached"])] += 1 + total[int(shard["node_attached"])] += 1 - log.info(f"Shards after split: {shards_exist}") - assert len(shards_exist) == split_shard_count + return total, attached + + def check_effective_tenant_config(): + # Expect our custom tenant configs to have survived the split + for shard in env.storage_controller.tenant_describe(tenant_id)["shards"]: + node = env.get_pageserver(int(shard["node_attached"])) + config = node.http_client().tenant_config(TenantShardId.parse(shard["tenant_shard_id"])) + for k, v in non_default_tenant_config.items(): + assert config.effective_config[k] == v + + # Validate pageserver state: expect every child shard to have an attached and secondary location + (total, attached) = get_node_shard_counts(env, tenant_ids=[tenant_id]) + assert sum(attached.values()) == split_shard_count + assert sum(total.values()) == split_shard_count * 2 + check_effective_tenant_config() # Ensure post-split pageserver locations survive a restart (i.e. 
the child shards # correctly wrote config to disk, and the storage controller responds correctly @@ -305,13 +350,11 @@ def test_sharding_split_smoke( pageserver.stop() pageserver.start() - shards_exist = [] - for pageserver in env.pageservers: - locations = pageserver.http_client().tenant_list_locations() - shards_exist.extend(TenantShardId.parse(s[0]) for s in locations["tenant_shards"]) - - log.info("Shards after restart: {shards_exist}") - assert len(shards_exist) == split_shard_count + # Validate pageserver state: expect every child shard to have an attached and secondary location + (total, attached) = get_node_shard_counts(env, tenant_ids=[tenant_id]) + assert sum(attached.values()) == split_shard_count + assert sum(total.values()) == split_shard_count * 2 + check_effective_tenant_config() workload.validate() @@ -717,9 +760,16 @@ def test_sharding_split_failures( initial_shard_count = 2 split_shard_count = 4 - env = neon_env_builder.init_start(initial_tenant_shard_count=initial_shard_count) - tenant_id = env.initial_tenant - timeline_id = env.initial_timeline + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + + # Create a tenant with secondary locations enabled + env.neon_cli.create_tenant( + tenant_id, timeline_id, shard_count=initial_shard_count, placement_policy='{"Attached":1}' + ) env.storage_controller.allowed_errors.extend( [ @@ -732,6 +782,8 @@ def test_sharding_split_failures( ".*failpoint.*", # Node offline cases will fail to send requests ".*Reconcile error: receive body: error sending request for url.*", + # Node offline cases will fail inside reconciler when detaching secondaries + ".*Reconcile error on shard.*: receive body: error sending request for url.*", ] ) @@ -769,7 +821,8 @@ def test_sharding_split_failures( # will have succeeded: the net result should be to return to a clean state, including # detaching any child shards. 
def assert_rolled_back(exclude_ps_id=None) -> None: - count = 0 + secondary_count = 0 + attached_count = 0 for ps in env.pageservers: if exclude_ps_id is not None and ps.id == exclude_ps_id: continue @@ -777,13 +830,25 @@ def test_sharding_split_failures( locations = ps.http_client().tenant_list_locations()["tenant_shards"] for loc in locations: tenant_shard_id = TenantShardId.parse(loc[0]) - log.info(f"Shard {tenant_shard_id} seen on node {ps.id}") + log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}") assert tenant_shard_id.shard_count == initial_shard_count - count += 1 - assert count == initial_shard_count + if loc[1]["mode"] == "Secondary": + secondary_count += 1 + else: + attached_count += 1 + + if exclude_ps_id is not None: + # For a node failure case, we expect there to be a secondary location + # scheduled on the offline node, so expect one fewer secondary in total + assert secondary_count == initial_shard_count - 1 + else: + assert secondary_count == initial_shard_count + + assert attached_count == initial_shard_count def assert_split_done(exclude_ps_id=None) -> None: - count = 0 + secondary_count = 0 + attached_count = 0 for ps in env.pageservers: if exclude_ps_id is not None and ps.id == exclude_ps_id: continue @@ -791,10 +856,14 @@ def test_sharding_split_failures( locations = ps.http_client().tenant_list_locations()["tenant_shards"] for loc in locations: tenant_shard_id = TenantShardId.parse(loc[0]) - log.info(f"Shard {tenant_shard_id} seen on node {ps.id}") + log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}") assert tenant_shard_id.shard_count == split_shard_count - count += 1 - assert count == split_shard_count + if loc[1]["mode"] == "Secondary": + secondary_count += 1 + else: + attached_count += 1 + assert attached_count == split_shard_count + assert secondary_count == split_shard_count def finish_split(): # Having failed+rolled back, we should be able to split again From bb47d536fb6e79865d9876f7ed7a46fa57e988a2 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 21 Mar 2024 12:56:13 +0000 Subject: [PATCH 0450/1571] pageserver: quieten log on shutdown-while-attaching (#7177) ## Problem If a shutdown happens when a tenant is attaching, we were logging at ERROR severity and with a backtrace. Yuck. ## Summary of changes - Pass a flag into `make_broken` to enable quietening this non-scary case. --- pageserver/src/tenant.rs | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 7a6ddd6a4e..1c66f99ece 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -678,9 +678,20 @@ impl Tenant { } // Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when tenant is in loading state. + enum BrokenVerbosity { + Error, + Info + } let make_broken = - |t: &Tenant, err: anyhow::Error| { - error!("attach failed, setting tenant state to Broken: {err:?}"); + |t: &Tenant, err: anyhow::Error, verbosity: BrokenVerbosity| { + match verbosity { + BrokenVerbosity::Info => { + info!("attach cancelled, setting tenant state to Broken: {err}"); + }, + BrokenVerbosity::Error => { + error!("attach failed, setting tenant state to Broken: {err:?}"); + } + } t.state.send_modify(|state| { // The Stopping case is for when we have passed control on to DeleteTenantFlow: // if it errors, we will call make_broken when tenant is already in Stopping. 
@@ -744,7 +755,7 @@ impl Tenant { // Make the tenant broken so that set_stopping will not hang waiting for it to leave // the Attaching state. This is an over-reaction (nothing really broke, the tenant is // just shutting down), but ensures progress. - make_broken(&tenant_clone, anyhow::anyhow!("Shut down while Attaching")); + make_broken(&tenant_clone, anyhow::anyhow!("Shut down while Attaching"), BrokenVerbosity::Info); return Ok(()); }, ) @@ -766,7 +777,7 @@ impl Tenant { match res { Ok(p) => Some(p), Err(e) => { - make_broken(&tenant_clone, anyhow::anyhow!(e)); + make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error); return Ok(()); } } @@ -790,7 +801,7 @@ impl Tenant { { Ok(should_resume_deletion) => should_resume_deletion, Err(err) => { - make_broken(&tenant_clone, anyhow::anyhow!(err)); + make_broken(&tenant_clone, anyhow::anyhow!(err), BrokenVerbosity::Error); return Ok(()); } } @@ -820,7 +831,7 @@ impl Tenant { .await; if let Err(e) = deleted { - make_broken(&tenant_clone, anyhow::anyhow!(e)); + make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error); } return Ok(()); @@ -841,7 +852,7 @@ impl Tenant { tenant_clone.activate(broker_client, None, &ctx); } Err(e) => { - make_broken(&tenant_clone, anyhow::anyhow!(e)); + make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error); } } From 06cb582d910f4949bcb3927ec40a7cba7a306ff3 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 21 Mar 2024 13:39:23 +0000 Subject: [PATCH 0451/1571] pageserver: extend /re-attach response to include tenant mode (#6941) This change improves the resilience of the system to unclean restarts. Previously, re-attach responses only included attached tenants - If the pageserver had local state for a secondary location, it would remain, but with no guarantee that it was still _meant_ to be there. After this change, the pageserver will only retain secondary locations if the /re-attach response indicates that they should still be there. - If the pageserver had local state for an attached location that was omitted from a re-attach response, it would be entirely detached. This is wasteful in a typical HA setup, where an offline node's tenants might have been re-attached elsewhere before it restarts, but the offline node's location should revert to a secondary location rather than being wiped. Including secondary tenants in the re-attach response enables the pageserver to avoid throwing away local state unnecessarily. In this PR: - The re-attach items are extended with a 'mode' field. - Storage controller populates 'mode' - Pageserver interprets it (default is attached if missing) to construct either a SecondaryTenant or a Tenant. - A new test exercises both cases. 
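Roughly, each entry of the response is interpreted on the pageserver as follows (standalone sketch with simplified placeholder types; the real mapping is `TenantStartupMode::from_reattach_tenant` in `pageserver/src/tenant/mgr.rs`):

```rust
// Simplified placeholders for LocationConfigMode and the startup decision.
enum Mode {
    AttachedSingle,
    AttachedMulti,
    AttachedStale,
    Secondary,
    Detached,
}

#[derive(Debug, PartialEq)]
enum StartupMode {
    Attached(u32), // generation number
    Secondary,
}

/// Interpret one tenant entry of the /re-attach response. Attached modes must
/// carry a generation; an attached entry without one is invalid and skipped.
/// `None` means "do not keep local state for this tenant".
fn startup_mode(mode: Mode, generation: Option<u32>) -> Option<StartupMode> {
    match (mode, generation) {
        (Mode::Detached, _) => None,
        (Mode::Secondary, _) => Some(StartupMode::Secondary),
        (Mode::AttachedSingle | Mode::AttachedMulti | Mode::AttachedStale, Some(g)) => {
            Some(StartupMode::Attached(g))
        }
        _ => None, // attached mode with a missing generation: ignore the entry
    }
}

fn main() {
    assert_eq!(startup_mode(Mode::Secondary, None), Some(StartupMode::Secondary));
    assert_eq!(startup_mode(Mode::AttachedSingle, Some(7)), Some(StartupMode::Attached(7)));
    assert_eq!(startup_mode(Mode::AttachedMulti, Some(3)), Some(StartupMode::Attached(3)));
    assert_eq!(startup_mode(Mode::AttachedStale, None), None);
    assert_eq!(startup_mode(Mode::Detached, Some(1)), None);
}
```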
--- .../attachment_service/src/service.rs | 114 ++++----- libs/pageserver_api/src/upcall_api.rs | 20 +- pageserver/src/control_plane_client.rs | 11 +- pageserver/src/deletion_queue.rs | 7 +- pageserver/src/tenant.rs | 7 + pageserver/src/tenant/config.rs | 5 +- pageserver/src/tenant/mgr.rs | 233 +++++++++++------- test_runner/regress/test_sharding_service.py | 64 ++++- 8 files changed, 305 insertions(+), 156 deletions(-) diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index c886afaf1c..aa930014b2 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -1394,7 +1394,8 @@ impl Service { incremented_generations.len() ); - // Apply the updated generation to our in-memory state + // Apply the updated generation to our in-memory state, and + // gather discover secondary locations. let mut locked = self.inner.write().unwrap(); let (nodes, tenants, scheduler) = locked.parts_mut(); @@ -1402,62 +1403,65 @@ impl Service { tenants: Vec::new(), }; - for (tenant_shard_id, new_gen) in incremented_generations { - response.tenants.push(ReAttachResponseTenant { - id: tenant_shard_id, - gen: new_gen.into().unwrap(), - }); - // Apply the new generation number to our in-memory state - let shard_state = tenants.get_mut(&tenant_shard_id); - let Some(shard_state) = shard_state else { - // Not fatal. This edge case requires a re-attach to happen - // between inserting a new tenant shard in to the database, and updating our in-memory - // state to know about the shard, _and_ that the state inserted to the database referenced - // a pageserver. Should never happen, but handle it rather than panicking, since it should - // be harmless. - tracing::error!( - "Shard {} is in database for node {} but not in-memory state", - tenant_shard_id, - reattach_req.node_id - ); - continue; - }; + // TODO: cancel/restart any running reconciliation for this tenant, it might be trying + // to call location_conf API with an old generation. Wait for cancellation to complete + // before responding to this request. Requires well implemented CancellationToken logic + // all the way to where we call location_conf. Even then, there can still be a location_conf + // request in flight over the network: TODO handle that by making location_conf API refuse + // to go backward in generations. - // If [`Persistence::re_attach`] selected this shard, it must have alread - // had a generation set. - debug_assert!(shard_state.generation.is_some()); - let Some(old_gen) = shard_state.generation else { - // Should never happen: would only return incremented generation - // for a tenant that already had a non-null generation. - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "Generation must be set while re-attaching" - ))); - }; - shard_state.generation = Some(std::cmp::max(old_gen, new_gen)); - if let Some(observed) = shard_state - .observed - .locations - .get_mut(&reattach_req.node_id) - { - if let Some(conf) = observed.conf.as_mut() { - conf.generation = new_gen.into(); + // Scan through all shards, applying updates for ones where we updated generation + // and identifying shards that intend to have a secondary location on this node. 
+ for (tenant_shard_id, shard) in tenants { + if let Some(new_gen) = incremented_generations.get(tenant_shard_id) { + let new_gen = *new_gen; + response.tenants.push(ReAttachResponseTenant { + id: *tenant_shard_id, + gen: Some(new_gen.into().unwrap()), + // A tenant is only put into multi or stale modes in the middle of a [`Reconciler::live_migrate`] + // execution. If a pageserver is restarted during that process, then the reconcile pass will + // fail, and start from scratch, so it doesn't make sense for us to try and preserve + // the stale/multi states at this point. + mode: LocationConfigMode::AttachedSingle, + }); + + shard.generation = std::cmp::max(shard.generation, Some(new_gen)); + if let Some(observed) = shard.observed.locations.get_mut(&reattach_req.node_id) { + // Why can we update `observed` even though we're not sure our response will be received + // by the pageserver? Because the pageserver will not proceed with startup until + // it has processed response: if it loses it, we'll see another request and increment + // generation again, avoiding any uncertainty about dirtiness of tenant's state. + if let Some(conf) = observed.conf.as_mut() { + conf.generation = new_gen.into(); + } + } else { + // This node has no observed state for the shard: perhaps it was offline + // when the pageserver restarted. Insert a None, so that the Reconciler + // will be prompted to learn the location's state before it makes changes. + shard + .observed + .locations + .insert(reattach_req.node_id, ObservedStateLocation { conf: None }); } - } else { - // This node has no observed state for the shard: perhaps it was offline - // when the pageserver restarted. Insert a None, so that the Reconciler - // will be prompted to learn the location's state before it makes changes. - shard_state - .observed - .locations - .insert(reattach_req.node_id, ObservedStateLocation { conf: None }); - } + } else if shard.intent.get_secondary().contains(&reattach_req.node_id) { + // Ordering: pageserver will not accept /location_config requests until it has + // finished processing the response from re-attach. So we can update our in-memory state + // now, and be confident that we are not stamping on the result of some later location config. + // TODO: however, we are not strictly ordered wrt ReconcileResults queue, + // so we might update observed state here, and then get over-written by some racing + // ReconcileResult. The impact is low however, since we have set state on pageserver something + // that matches intent, so worst case if we race then we end up doing a spurious reconcile. - // TODO: cancel/restart any running reconciliation for this tenant, it might be trying - // to call location_conf API with an old generation. Wait for cancellation to complete - // before responding to this request. Requires well implemented CancellationToken logic - // all the way to where we call location_conf. Even then, there can still be a location_conf - // request in flight over the network: TODO handle that by making location_conf API refuse - // to go backward in generations. + response.tenants.push(ReAttachResponseTenant { + id: *tenant_shard_id, + gen: None, + mode: LocationConfigMode::Secondary, + }); + + // We must not update observed, because we have no guarantee that our + // response will be received by the pageserver. This could leave it + // falsely dirty, but the resulting reconcile should be idempotent. 
+ } } // We consider a node Active once we have composed a re-attach response, but we @@ -3446,7 +3450,7 @@ impl Service { if let Some(waiter) = waiter { waiter.wait_timeout(RECONCILE_TIMEOUT).await?; } else { - tracing::warn!("Migration is a no-op"); + tracing::info!("Migration is a no-op"); } Ok(TenantShardMigrateResponse {}) diff --git a/libs/pageserver_api/src/upcall_api.rs b/libs/pageserver_api/src/upcall_api.rs index 5472948091..2e88836bd0 100644 --- a/libs/pageserver_api/src/upcall_api.rs +++ b/libs/pageserver_api/src/upcall_api.rs @@ -6,7 +6,9 @@ use serde::{Deserialize, Serialize}; use utils::id::NodeId; -use crate::{controller_api::NodeRegisterRequest, shard::TenantShardId}; +use crate::{ + controller_api::NodeRegisterRequest, models::LocationConfigMode, shard::TenantShardId, +}; /// Upcall message sent by the pageserver to the configured `control_plane_api` on /// startup. @@ -20,12 +22,20 @@ pub struct ReAttachRequest { pub register: Option, } -#[derive(Serialize, Deserialize)] -pub struct ReAttachResponseTenant { - pub id: TenantShardId, - pub gen: u32, +fn default_mode() -> LocationConfigMode { + LocationConfigMode::AttachedSingle } +#[derive(Serialize, Deserialize, Debug)] +pub struct ReAttachResponseTenant { + pub id: TenantShardId, + /// Mandatory if LocationConfigMode is None or set to an Attached* mode + pub gen: Option, + + /// Default value only for backward compat: this field should be set + #[serde(default = "default_mode")] + pub mode: LocationConfigMode, +} #[derive(Serialize, Deserialize)] pub struct ReAttachResponse { pub tenants: Vec, diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs index 1b3d76335d..42c800822b 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/control_plane_client.rs @@ -5,7 +5,8 @@ use pageserver_api::{ controller_api::NodeRegisterRequest, shard::TenantShardId, upcall_api::{ - ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse, + ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, + ValidateRequestTenant, ValidateResponse, }, }; use serde::{de::DeserializeOwned, Serialize}; @@ -37,7 +38,9 @@ pub trait ControlPlaneGenerationsApi { fn re_attach( &self, conf: &PageServerConf, - ) -> impl Future, RetryForeverError>> + Send; + ) -> impl Future< + Output = Result, RetryForeverError>, + > + Send; fn validate( &self, tenants: Vec<(TenantShardId, Generation)>, @@ -118,7 +121,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient { async fn re_attach( &self, conf: &PageServerConf, - ) -> Result, RetryForeverError> { + ) -> Result, RetryForeverError> { let re_attach_path = self .base_url .join("re-attach") @@ -181,7 +184,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient { Ok(response .tenants .into_iter() - .map(|t| (t.id, Generation::new(t.gen))) + .map(|rart| (rart.id, rart)) .collect::>()) } diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index b6aea8fae8..e3c11cb299 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -724,8 +724,8 @@ impl DeletionQueue { mod test { use camino::Utf8Path; use hex_literal::hex; - use pageserver_api::shard::ShardIndex; - use std::io::ErrorKind; + use pageserver_api::{shard::ShardIndex, upcall_api::ReAttachResponseTenant}; + use std::{io::ErrorKind, time::Duration}; use tracing::info; use remote_storage::{RemoteStorageConfig, RemoteStorageKind}; @@ -834,9 +834,10 @@ mod test { async fn re_attach( &self, _conf: 
&PageServerConf, - ) -> Result, RetryForeverError> { + ) -> Result, RetryForeverError> { unimplemented!() } + async fn validate( &self, tenants: Vec<(TenantShardId, Generation)>, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 1c66f99ece..fe48741a89 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -202,6 +202,13 @@ pub(super) struct AttachedTenantConf { } impl AttachedTenantConf { + fn new(tenant_conf: TenantConfOpt, location: AttachedLocationConfig) -> Self { + Self { + tenant_conf, + location, + } + } + fn try_from(location_conf: LocationConf) -> anyhow::Result { match &location_conf.mode { LocationMode::Attached(attach_conf) => Ok(Self { diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 57fc444cdd..53a8c97e23 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -196,16 +196,17 @@ impl LocationConf { /// For use when attaching/re-attaching: update the generation stored in this /// structure. If we were in a secondary state, promote to attached (posession /// of a fresh generation implies this). - pub(crate) fn attach_in_generation(&mut self, generation: Generation) { + pub(crate) fn attach_in_generation(&mut self, mode: AttachmentMode, generation: Generation) { match &mut self.mode { LocationMode::Attached(attach_conf) => { attach_conf.generation = generation; + attach_conf.attach_mode = mode; } LocationMode::Secondary(_) => { // We are promoted to attached by the control plane's re-attach response self.mode = LocationMode::Attached(AttachedLocationConfig { generation, - attach_mode: AttachmentMode::Single, + attach_mode: mode, }) } } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 7e0092d5b6..97a505ded9 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -4,10 +4,11 @@ use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf}; use itertools::Itertools; use pageserver_api::key::Key; -use pageserver_api::models::ShardParameters; +use pageserver_api::models::{LocationConfigMode, ShardParameters}; use pageserver_api::shard::{ ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId, }; +use pageserver_api::upcall_api::ReAttachResponseTenant; use rand::{distributions::Alphanumeric, Rng}; use std::borrow::Cow; use std::cmp::Ordering; @@ -124,6 +125,46 @@ pub(crate) enum ShardSelector { Page(Key), } +/// A convenience for use with the re_attach ControlPlaneClient function: rather +/// than the serializable struct, we build this enum that encapsulates +/// the invariant that attached tenants always have generations. +/// +/// This represents the subset of a LocationConfig that we receive during re-attach. +pub(crate) enum TenantStartupMode { + Attached((AttachmentMode, Generation)), + Secondary, +} + +impl TenantStartupMode { + /// Return the generation & mode that should be used when starting + /// this tenant. + /// + /// If this returns None, the re-attach struct is in an invalid state and + /// should be ignored in the response. 
+ fn from_reattach_tenant(rart: ReAttachResponseTenant) -> Option { + match (rart.mode, rart.gen) { + (LocationConfigMode::Detached, _) => None, + (LocationConfigMode::Secondary, _) => Some(Self::Secondary), + (LocationConfigMode::AttachedMulti, Some(g)) => { + Some(Self::Attached((AttachmentMode::Multi, Generation::new(g)))) + } + (LocationConfigMode::AttachedSingle, Some(g)) => { + Some(Self::Attached((AttachmentMode::Single, Generation::new(g)))) + } + (LocationConfigMode::AttachedStale, Some(g)) => { + Some(Self::Attached((AttachmentMode::Stale, Generation::new(g)))) + } + _ => { + tracing::warn!( + "Received invalid re-attach state for tenant {}: {rart:?}", + rart.id + ); + None + } + } + } +} + impl TenantsMap { /// Convenience function for typical usage, where we want to get a `Tenant` object, for /// working with attached tenants. If the TenantId is in the map but in Secondary state, @@ -270,7 +311,7 @@ pub struct TenantManager { fn emergency_generations( tenant_confs: &HashMap>, -) -> HashMap { +) -> HashMap { tenant_confs .iter() .filter_map(|(tid, lc)| { @@ -278,12 +319,15 @@ fn emergency_generations( Ok(lc) => lc, Err(_) => return None, }; - let gen = match &lc.mode { - LocationMode::Attached(alc) => Some(alc.generation), - LocationMode::Secondary(_) => None, - }; - - gen.map(|g| (*tid, g)) + Some(( + *tid, + match &lc.mode { + LocationMode::Attached(alc) => { + TenantStartupMode::Attached((alc.attach_mode, alc.generation)) + } + LocationMode::Secondary(_) => TenantStartupMode::Secondary, + }, + )) }) .collect() } @@ -293,7 +337,7 @@ async fn init_load_generations( tenant_confs: &HashMap>, resources: &TenantSharedResources, cancel: &CancellationToken, -) -> anyhow::Result>> { +) -> anyhow::Result>> { let generations = if conf.control_plane_emergency_mode { error!( "Emergency mode! Tenants will be attached unsafely using their last known generation" @@ -303,7 +347,12 @@ async fn init_load_generations( info!("Calling control plane API to re-attach tenants"); // If we are configured to use the control plane API, then it is the source of truth for what tenants to load. match client.re_attach(conf).await { - Ok(tenants) => tenants, + Ok(tenants) => tenants + .into_iter() + .flat_map(|(id, rart)| { + TenantStartupMode::from_reattach_tenant(rart).map(|tsm| (id, tsm)) + }) + .collect(), Err(RetryForeverError::ShuttingDown) => { anyhow::bail!("Shut down while waiting for control plane re-attach response") } @@ -321,9 +370,17 @@ async fn init_load_generations( // Must only do this if remote storage is enabled, otherwise deletion queue // is not running and channel push will fail. 
if resources.remote_storage.is_some() { - resources - .deletion_queue_client - .recover(generations.clone())?; + let attached_tenants = generations + .iter() + .flat_map(|(id, start_mode)| { + match start_mode { + TenantStartupMode::Attached((_mode, generation)) => Some(generation), + TenantStartupMode::Secondary => None, + } + .map(|gen| (*id, *gen)) + }) + .collect(); + resources.deletion_queue_client.recover(attached_tenants)?; } Ok(Some(generations)) @@ -489,9 +546,8 @@ pub async fn init_tenant_mgr( // Scan local filesystem for attached tenants let tenant_configs = init_load_tenant_configs(conf).await?; - // Determine which tenants are to be attached - let tenant_generations = - init_load_generations(conf, &tenant_configs, &resources, &cancel).await?; + // Determine which tenants are to be secondary or attached, and in which generation + let tenant_modes = init_load_generations(conf, &tenant_configs, &resources, &cancel).await?; tracing::info!( "Attaching {} tenants at startup, warming up {} at a time", @@ -521,97 +577,102 @@ pub async fn init_tenant_mgr( } }; - let generation = if let Some(generations) = &tenant_generations { + // FIXME: if we were attached, and get demoted to secondary on re-attach, we + // don't have a place to get a config. + // (https://github.com/neondatabase/neon/issues/5377) + const DEFAULT_SECONDARY_CONF: SecondaryLocationConfig = + SecondaryLocationConfig { warm: true }; + + // Update the location config according to the re-attach response + if let Some(tenant_modes) = &tenant_modes { // We have a generation map: treat it as the authority for whether // this tenant is really attached. - if let Some(gen) = generations.get(&tenant_shard_id) { - if let LocationMode::Attached(attached) = &location_conf.mode { - if attached.generation > *gen { + match tenant_modes.get(&tenant_shard_id) { + None => { + info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response"); + if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await { + error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), + "Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}", + ); + } + + // We deleted local content: move on to next tenant, don't try and spawn this one. + continue; + } + Some(TenantStartupMode::Secondary) => { + if !matches!(location_conf.mode, LocationMode::Secondary(_)) { + location_conf.mode = LocationMode::Secondary(DEFAULT_SECONDARY_CONF); + } + } + Some(TenantStartupMode::Attached((attach_mode, generation))) => { + let old_gen_higher = match &location_conf.mode { + LocationMode::Attached(AttachedLocationConfig { + generation: old_generation, + attach_mode: _attach_mode, + }) => { + if old_generation > generation { + Some(old_generation) + } else { + None + } + } + _ => None, + }; + if let Some(old_generation) = old_gen_higher { tracing::error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), - "Control plane gave decreasing generation ({gen:?}) in re-attach response for tenant that was attached in generation {:?}, demoting to secondary", - attached.generation + "Control plane gave decreasing generation ({generation:?}) in re-attach response for tenant that was attached in generation {:?}, demoting to secondary", + old_generation ); // We cannot safely attach this tenant given a bogus generation number, but let's avoid throwing away // local disk content: demote to secondary rather than detaching. 
- tenants.insert( - tenant_shard_id, - TenantSlot::Secondary(SecondaryTenant::new( - tenant_shard_id, - location_conf.shard, - location_conf.tenant_conf.clone(), - &SecondaryLocationConfig { warm: false }, - )), - ); + location_conf.mode = LocationMode::Secondary(DEFAULT_SECONDARY_CONF); + } else { + location_conf.attach_in_generation(*attach_mode, *generation); } } - *gen - } else { - match &location_conf.mode { - LocationMode::Secondary(secondary_config) => { - // We do not require the control plane's permission for secondary mode - // tenants, because they do no remote writes and hence require no - // generation number - info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Loaded tenant in secondary mode"); - tenants.insert( - tenant_shard_id, - TenantSlot::Secondary(SecondaryTenant::new( - tenant_shard_id, - location_conf.shard, - location_conf.tenant_conf, - secondary_config, - )), - ); - } - LocationMode::Attached(_) => { - // TODO: augment re-attach API to enable the control plane to - // instruct us about secondary attachments. That way, instead of throwing - // away local state, we can gracefully fall back to secondary here, if the control - // plane tells us so. - // (https://github.com/neondatabase/neon/issues/5377) - info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response"); - if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await { - error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), - "Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}", - ); - } - } - }; - - continue; } } else { // Legacy mode: no generation information, any tenant present // on local disk may activate info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Starting tenant in legacy mode, no generation",); - Generation::none() }; // Presence of a generation number implies attachment: attach the tenant // if it wasn't already, and apply the generation number. 
- location_conf.attach_in_generation(generation); Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?; let shard_identity = location_conf.shard; - match tenant_spawn( - conf, - tenant_shard_id, - &tenant_dir_path, - resources.clone(), - AttachedTenantConf::try_from(location_conf)?, - shard_identity, - Some(init_order.clone()), - &TENANTS, - SpawnMode::Lazy, - &ctx, - ) { - Ok(tenant) => { - tenants.insert(tenant_shard_id, TenantSlot::Attached(tenant)); + let slot = match location_conf.mode { + LocationMode::Attached(attached_conf) => { + match tenant_spawn( + conf, + tenant_shard_id, + &tenant_dir_path, + resources.clone(), + AttachedTenantConf::new(location_conf.tenant_conf, attached_conf), + shard_identity, + Some(init_order.clone()), + &TENANTS, + SpawnMode::Lazy, + &ctx, + ) { + Ok(tenant) => TenantSlot::Attached(tenant), + Err(e) => { + error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}"); + continue; + } + } } - Err(e) => { - error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}"); - } - } + LocationMode::Secondary(secondary_conf) => TenantSlot::Secondary(SecondaryTenant::new( + tenant_shard_id, + shard_identity, + location_conf.tenant_conf, + &secondary_conf, + )), + }; + + tenants.insert(tenant_shard_id, slot); } info!("Processed {} local tenants at startup", tenants.len()); @@ -2142,7 +2203,7 @@ pub(crate) async fn load_tenant( let mut location_conf = Tenant::load_tenant_config(conf, &tenant_shard_id).map_err(TenantMapInsertError::Other)?; - location_conf.attach_in_generation(generation); + location_conf.attach_in_generation(AttachmentMode::Single, generation); Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?; diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py index a6b0f76c96..b7488cadd6 100644 --- a/test_runner/regress/test_sharding_service.py +++ b/test_runner/regress/test_sharding_service.py @@ -23,7 +23,7 @@ from fixtures.pageserver.utils import ( ) from fixtures.pg_version import PgVersion from fixtures.remote_storage import RemoteStorageKind, s3_storage -from fixtures.types import TenantId, TimelineId +from fixtures.types import TenantId, TenantShardId, TimelineId from fixtures.utils import run_pg_bench_small, wait_until from mypy_boto3_s3.type_defs import ( ObjectTypeDef, @@ -948,3 +948,65 @@ def test_sharding_service_heartbeats( env.storage_controller.consistency_check() wait_until(10, 1, storage_controller_consistent) + + +def test_sharding_service_re_attach(neon_env_builder: NeonEnvBuilder): + """ + Exercise the behavior of the /re-attach endpoint on pageserver startup when + pageservers have a mixture of attached and secondary locations + """ + + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + env.start() + + # We'll have two tenants. 
+ tenant_a = TenantId.generate() + env.neon_cli.create_tenant(tenant_a, placement_policy='{"Attached":1}') + tenant_b = TenantId.generate() + env.neon_cli.create_tenant(tenant_b, placement_policy='{"Attached":1}') + + # Each pageserver will have one attached and one secondary location + env.storage_controller.tenant_shard_migrate( + TenantShardId(tenant_a, 0, 0), env.pageservers[0].id + ) + env.storage_controller.tenant_shard_migrate( + TenantShardId(tenant_b, 0, 0), env.pageservers[1].id + ) + + # Hard-fail a pageserver + victim_ps = env.pageservers[1] + survivor_ps = env.pageservers[0] + victim_ps.stop(immediate=True) + + # Heatbeater will notice it's offline, and consequently attachments move to the other pageserver + def failed_over(): + locations = survivor_ps.http_client().tenant_list_locations()["tenant_shards"] + log.info(f"locations: {locations}") + assert len(locations) == 2 + assert all(loc[1]["mode"] == "AttachedSingle" for loc in locations) + + # We could pre-empty this by configuring the node to Offline, but it's preferable to test + # the realistic path we would take when a node restarts uncleanly. + # The delay here will be ~NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL in neon_local + wait_until(30, 1, failed_over) + + reconciles_before_restart = env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "ok"} + ) + + # Restart the failed pageserver + victim_ps.start() + + # We expect that the re-attach call correctly tipped off the pageserver that its locations + # are all secondaries now. + locations = victim_ps.http_client().tenant_list_locations()["tenant_shards"] + assert len(locations) == 2 + assert all(loc[1]["mode"] == "Secondary" for loc in locations) + + # We expect that this situation resulted from the re_attach call, and not any explicit + # Reconciler runs: assert that the reconciliation count has not gone up since we restarted. + reconciles_after_restart = env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "ok"} + ) + assert reconciles_after_restart == reconciles_before_restart From d5304337cf2b15826f28e1de92e97d87ba620952 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 21 Mar 2024 13:54:06 +0000 Subject: [PATCH 0452/1571] proxy: simplify password validation (#7188) ## Problem for HTTP/WS/password hack flows we imitate SCRAM to validate passwords. This code was unnecessarily complicated. ## Summary of changes Copy in the `pbkdf2` and 'derive keys' steps from the `postgres_protocol` crate in our `rust-postgres` fork. Derive the `client_key`, `server_key` and `stored_key` from the password directly. Use constant time equality to compare the `stored_key` and `server_key` with the ones we are sent from cplane. 
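For illustration, the constant-time idea looks like this with the `subtle` crate (a minimal standalone sketch, not the proxy code itself: the real exchange also derives the keys via the PBKDF2 steps in the diff below and folds in the secret's `doomed` flag):

```rust
// Requires the `subtle` crate, the same dependency this patch adds.
use subtle::ConstantTimeEq;

/// Compare both derived keys against the expected values without an early
/// exit: both comparisons always run in constant time, so an attacker cannot
/// tell from timing which key (if any) was wrong.
fn keys_match(stored: &[u8], server: &[u8], want_stored: &[u8], want_server: &[u8]) -> bool {
    (stored.ct_eq(want_stored) & server.ct_eq(want_server)).into()
}

fn main() {
    let stored = [1u8; 32];
    let server = [2u8; 32];
    assert!(keys_match(&stored, &server, &stored, &server));
    assert!(!keys_match(&stored, &server, &stored, &stored));
}
```

Combining the comparisons with a bitwise operator on `Choice` instead of `&&` avoids short-circuiting, which is what keeps the check constant-time.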
--- Cargo.lock | 1 + Cargo.toml | 1 + proxy/Cargo.toml | 1 + proxy/src/auth/flow.rs | 9 +--- proxy/src/sasl.rs | 4 ++ proxy/src/scram.rs | 15 ++---- proxy/src/scram/exchange.rs | 95 +++++++++++++++++++++++++------------ 7 files changed, 76 insertions(+), 50 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index dcef66c15d..824cac13b3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4259,6 +4259,7 @@ dependencies = [ "smallvec", "smol_str", "socket2 0.5.5", + "subtle", "sync_wrapper", "task-local-extensions", "thiserror", diff --git a/Cargo.toml b/Cargo.toml index 0f3dbd4987..44e6ec9744 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -149,6 +149,7 @@ smol_str = { version = "0.2.0", features = ["serde"] } socket2 = "0.5" strum = "0.24" strum_macros = "0.24" +"subtle" = "2.5.0" svg_fmt = "0.4.1" sync_wrapper = "0.1.2" tar = "0.4" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index b3a5bf873e..93a1fe85db 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -63,6 +63,7 @@ sha2 = { workspace = true, features = ["asm"] } smol_str.workspace = true smallvec.workspace = true socket2.workspace = true +subtle.workspace = true sync_wrapper.workspace = true task-local-extensions.workspace = true thiserror.workspace = true diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index f26dcb7c9a..45bbad8cb2 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -194,14 +194,7 @@ pub(crate) async fn validate_password_and_exchange( } // perform scram authentication as both client and server to validate the keys AuthSecret::Scram(scram_secret) => { - use postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256}; - let sasl_client = ScramSha256::new(password, ChannelBinding::unsupported()); - let outcome = crate::scram::exchange( - &scram_secret, - sasl_client, - crate::config::TlsServerEndPoint::Undefined, - ) - .await?; + let outcome = crate::scram::exchange(&scram_secret, password).await?; let client_key = match outcome { sasl::Outcome::Success(client_key) => client_key, diff --git a/proxy/src/sasl.rs b/proxy/src/sasl.rs index 1cf8b53e11..0811416ca2 100644 --- a/proxy/src/sasl.rs +++ b/proxy/src/sasl.rs @@ -33,6 +33,9 @@ pub enum Error { #[error("Internal error: missing digest")] MissingBinding, + #[error("could not decode salt: {0}")] + Base64(#[from] base64::DecodeError), + #[error(transparent)] Io(#[from] io::Error), } @@ -55,6 +58,7 @@ impl ReportableError for Error { Error::ChannelBindingBadMethod(_) => crate::error::ErrorKind::User, Error::BadClientMessage(_) => crate::error::ErrorKind::User, Error::MissingBinding => crate::error::ErrorKind::Service, + Error::Base64(_) => crate::error::ErrorKind::ControlPlane, Error::Io(_) => crate::error::ErrorKind::ClientDisconnect, } } diff --git a/proxy/src/scram.rs b/proxy/src/scram.rs index 76541ae2f3..ed80675f8a 100644 --- a/proxy/src/scram.rs +++ b/proxy/src/scram.rs @@ -56,8 +56,6 @@ fn sha256<'a>(parts: impl IntoIterator) -> [u8; 32] { #[cfg(test)] mod tests { - use postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256}; - use crate::sasl::{Mechanism, Step}; use super::{Exchange, ServerSecret}; @@ -115,16 +113,9 @@ mod tests { async fn run_round_trip_test(server_password: &str, client_password: &str) { let scram_secret = ServerSecret::build(server_password).await.unwrap(); - let sasl_client = - ScramSha256::new(client_password.as_bytes(), ChannelBinding::unsupported()); - - let outcome = super::exchange( - &scram_secret, - sasl_client, - crate::config::TlsServerEndPoint::Undefined, - ) - .await - .unwrap(); + 
let outcome = super::exchange(&scram_secret, client_password.as_bytes()) + .await + .unwrap(); match outcome { crate::sasl::Outcome::Success(_) => {} diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index 51c0ba4e09..682cbe795f 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -2,7 +2,11 @@ use std::convert::Infallible; -use postgres_protocol::authentication::sasl::ScramSha256; +use hmac::{Hmac, Mac}; +use sha2::digest::FixedOutput; +use sha2::{Digest, Sha256}; +use subtle::{Choice, ConstantTimeEq}; +use tokio::task::yield_now; use super::messages::{ ClientFinalMessage, ClientFirstMessage, OwnedServerFirstMessage, SCRAM_RAW_NONCE_LEN, @@ -71,40 +75,71 @@ impl<'a> Exchange<'a> { } } +// copied from +async fn pbkdf2(str: &[u8], salt: &[u8], iterations: u32) -> [u8; 32] { + let hmac = Hmac::::new_from_slice(str).expect("HMAC is able to accept all key sizes"); + let mut prev = hmac + .clone() + .chain_update(salt) + .chain_update(1u32.to_be_bytes()) + .finalize() + .into_bytes(); + + let mut hi = prev; + + for i in 1..iterations { + prev = hmac.clone().chain_update(prev).finalize().into_bytes(); + + for (hi, prev) in hi.iter_mut().zip(prev) { + *hi ^= prev; + } + // yield every ~250us + // hopefully reduces tail latencies + if i % 1024 == 0 { + yield_now().await + } + } + + hi.into() +} + +// copied from +async fn derive_keys(password: &[u8], salt: &[u8], iterations: u32) -> ([u8; 32], [u8; 32]) { + let salted_password = pbkdf2(password, salt, iterations).await; + + let make_key = |name| { + let key = Hmac::::new_from_slice(&salted_password) + .expect("HMAC is able to accept all key sizes") + .chain_update(name) + .finalize(); + + <[u8; 32]>::from(key.into_bytes()) + }; + + (make_key(b"Client Key"), make_key(b"Server Key")) +} + pub async fn exchange( secret: &ServerSecret, - mut client: ScramSha256, - tls_server_end_point: config::TlsServerEndPoint, + password: &[u8], ) -> sasl::Result> { - use sasl::Step::*; + let salt = base64::decode(&secret.salt_base64)?; + let (client_key, server_key) = derive_keys(password, &salt, secret.iterations).await; + let stored_key: [u8; 32] = Sha256::default() + .chain_update(client_key) + .finalize_fixed() + .into(); - let init = SaslInitial { - nonce: rand::random, - }; + // constant time to not leak partial key match + let valid = stored_key.ct_eq(&secret.stored_key.as_bytes()) + | server_key.ct_eq(&secret.server_key.as_bytes()) + | Choice::from(secret.doomed as u8); - let client_first = std::str::from_utf8(client.message()) - .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?; - let sent = match init.transition(secret, &tls_server_end_point, client_first)? { - Continue(sent, server_first) => { - client.update(server_first.as_bytes()).await?; - sent - } - Success(x, _) => match x {}, - Failure(msg) => return Ok(sasl::Outcome::Failure(msg)), - }; - - let client_final = std::str::from_utf8(client.message()) - .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?; - let keys = match sent.transition(secret, &tls_server_end_point, client_final)? 
{ - Success(keys, server_final) => { - client.finish(server_final.as_bytes())?; - keys - } - Continue(x, _) => match x {}, - Failure(msg) => return Ok(sasl::Outcome::Failure(msg)), - }; - - Ok(sasl::Outcome::Success(keys)) + if valid.into() { + Ok(sasl::Outcome::Success(super::ScramKey::from(client_key))) + } else { + Ok(sasl::Outcome::Failure("password doesn't match")) + } } impl SaslInitial { From fb60278e0272d5981b40611ca2fb2b29c4404f5c Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 21 Mar 2024 15:24:56 +0100 Subject: [PATCH 0453/1571] walredo benchmark: throughput-oriented rewrite (#7190) See the updated `bench_walredo.rs` module comment. tl;dr: we measure avg latency of single redo operations issues against a single redo manager from N tokio tasks. part of https://github.com/neondatabase/neon/issues/6628 --- pageserver/benches/bench_walredo.rs | 345 ++++++++++++++-------------- 1 file changed, 172 insertions(+), 173 deletions(-) diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index 47c8bd75c6..3efad546a6 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -1,160 +1,156 @@ -//! Simple benchmarking around walredo. +//! Quantify a single walredo manager's throughput under N concurrent callers. //! -//! Right now they hope to just set a baseline. Later we can try to expand into latency and -//! throughput after figuring out the coordinated omission problems below. +//! The benchmark implementation ([`bench_impl`]) is parametrized by +//! - `redo_work` => [`Request::short_request`] or [`Request::medium_request`] +//! - `n_redos` => number of times the benchmark shell execute the `redo_work` +//! - `nclients` => number of clients (more on this shortly). //! -//! There are two sets of inputs; `short` and `medium`. They were collected on postgres v14 by -//! logging what happens when a sequential scan is requested on a small table, then picking out two -//! suitable from logs. +//! The benchmark impl sets up a multi-threaded tokio runtime with default parameters. +//! It spawns `nclients` times [`client`] tokio tasks. +//! Each task executes the `redo_work` `n_redos/nclients` times. //! +//! We exercise the following combinations: +//! - `redo_work = short / medium`` +//! - `nclients = [1, 2, 4, 8, 16, 32, 64, 128]` //! -//! Reference data (git blame to see commit) on an i3en.3xlarge -// ```text -//! short/short/1 time: [39.175 µs 39.348 µs 39.536 µs] -//! short/short/2 time: [51.227 µs 51.487 µs 51.755 µs] -//! short/short/4 time: [76.048 µs 76.362 µs 76.674 µs] -//! short/short/8 time: [128.94 µs 129.82 µs 130.74 µs] -//! short/short/16 time: [227.84 µs 229.00 µs 230.28 µs] -//! short/short/32 time: [455.97 µs 457.81 µs 459.90 µs] -//! short/short/64 time: [902.46 µs 904.84 µs 907.32 µs] -//! short/short/128 time: [1.7416 ms 1.7487 ms 1.7561 ms] -//! `` - -use std::sync::Arc; +//! We let `criterion` determine the `n_redos` using `iter_custom`. +//! The idea is that for each `(redo_work, nclients)` combination, +//! criterion will run the `bench_impl` multiple times with different `n_redos`. +//! The `bench_impl` reports the aggregate wall clock time from the clients' perspective. +//! Criterion will divide that by `n_redos` to compute the "time per iteration". +//! In our case, "time per iteration" means "time per redo_work execution". +//! +//! NB: the way by which `iter_custom` determines the "number of iterations" +//! is called sampling. Apparently the idea here is to detect outliers. +//! 
We're not sure whether the current choice of sampling method makes sense. +//! See https://bheisler.github.io/criterion.rs/book/user_guide/command_line_output.html#collecting-samples +//! +//! # Reference Numbers +//! +//! 2024-03-20 on i3en.3xlarge +//! +//! ```text +//! short/1 time: [26.483 µs 26.614 µs 26.767 µs] +//! short/2 time: [32.223 µs 32.465 µs 32.767 µs] +//! short/4 time: [47.203 µs 47.583 µs 47.984 µs] +//! short/8 time: [89.135 µs 89.612 µs 90.139 µs] +//! short/16 time: [190.12 µs 191.52 µs 192.88 µs] +//! short/32 time: [380.96 µs 382.63 µs 384.20 µs] +//! short/64 time: [736.86 µs 741.07 µs 745.03 µs] +//! short/128 time: [1.4106 ms 1.4206 ms 1.4294 ms] +//! medium/1 time: [111.81 µs 112.25 µs 112.79 µs] +//! medium/2 time: [158.26 µs 159.13 µs 160.21 µs] +//! medium/4 time: [334.65 µs 337.14 µs 340.07 µs] +//! medium/8 time: [675.32 µs 679.91 µs 685.25 µs] +//! medium/16 time: [1.2929 ms 1.2996 ms 1.3067 ms] +//! medium/32 time: [2.4295 ms 2.4461 ms 2.4623 ms] +//! medium/64 time: [4.3973 ms 4.4458 ms 4.4875 ms] +//! medium/128 time: [7.5955 ms 7.7847 ms 7.9481 ms] +//! ``` use bytes::{Buf, Bytes}; -use pageserver::{ - config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager, +use criterion::{BenchmarkId, Criterion}; +use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager}; +use pageserver_api::{key::Key, shard::TenantShardId}; +use std::{ + sync::Arc, + time::{Duration, Instant}, }; -use pageserver_api::shard::TenantShardId; -use tokio::task::JoinSet; +use tokio::{sync::Barrier, task::JoinSet}; use utils::{id::TenantId, lsn::Lsn}; -use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; +fn bench(c: &mut Criterion) { + { + let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; + for nclients in nclients { + let mut group = c.benchmark_group("short"); + group.bench_with_input( + BenchmarkId::from_parameter(nclients), + &nclients, + |b, nclients| { + let redo_work = Arc::new(Request::short_input()); + b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients)); + }, + ); + } + } -fn redo_scenarios(c: &mut Criterion) { - // logging should be enabled when adding more inputs, since walredo will only report malformed - // input to the stderr. - // utils::logging::init(utils::logging::LogFormat::Plain).unwrap(); + { + let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; + for nclients in nclients { + let mut group = c.benchmark_group("medium"); + group.bench_with_input( + BenchmarkId::from_parameter(nclients), + &nclients, + |b, nclients| { + let redo_work = Arc::new(Request::medium_input()); + b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients)); + }, + ); + } + } +} +criterion::criterion_group!(benches, bench); +criterion::criterion_main!(benches); +// Returns the sum of each client's wall-clock time spent executing their share of the n_redos. 
+fn bench_impl(redo_work: Arc, n_redos: u64, nclients: u64) -> Duration { let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap(); let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf()); let conf = Box::leak(Box::new(conf)); let tenant_shard_id = TenantShardId::unsharded(TenantId::generate()); - let manager = PostgresRedoManager::new(conf, tenant_shard_id); - - let manager = Arc::new(manager); - - { - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .unwrap(); - tracing::info!("executing first"); - rt.block_on(short().execute(&manager)).unwrap(); - tracing::info!("first executed"); - } - - let thread_counts = [1, 2, 4, 8, 16, 32, 64, 128]; - - let mut group = c.benchmark_group("short"); - group.sampling_mode(criterion::SamplingMode::Flat); - - for thread_count in thread_counts { - group.bench_with_input( - BenchmarkId::new("short", thread_count), - &thread_count, - |b, thread_count| { - add_multithreaded_walredo_requesters(b, *thread_count, &manager, short); - }, - ); - } - drop(group); - - let mut group = c.benchmark_group("medium"); - group.sampling_mode(criterion::SamplingMode::Flat); - - for thread_count in thread_counts { - group.bench_with_input( - BenchmarkId::new("medium", thread_count), - &thread_count, - |b, thread_count| { - add_multithreaded_walredo_requesters(b, *thread_count, &manager, medium); - }, - ); - } - drop(group); -} - -/// Sets up a multi-threaded tokio runtime with default worker thread count, -/// then, spawn `requesters` tasks that repeatedly: -/// - get input from `input_factor()` -/// - call `manager.request_redo()` with their input -/// -/// This stress-tests the scalability of a single walredo manager at high tokio-level concurrency. -/// -/// Using tokio's default worker thread count means the results will differ on machines -/// with different core countrs. We don't care about that, the performance will always -/// be different on different hardware. To compare performance of different software versions, -/// use the same hardware. 
-fn add_multithreaded_walredo_requesters( - b: &mut criterion::Bencher, - nrequesters: usize, - manager: &Arc, - input_factory: fn() -> Request, -) { - assert_ne!(nrequesters, 0); - let rt = tokio::runtime::Builder::new_multi_thread() .enable_all() .build() .unwrap(); - let barrier = Arc::new(tokio::sync::Barrier::new(nrequesters + 1)); + let start = Arc::new(Barrier::new(nclients as usize)); - let mut requesters = JoinSet::new(); - for _ in 0..nrequesters { - let _entered = rt.enter(); - let manager = manager.clone(); - let barrier = barrier.clone(); - requesters.spawn(async move { - loop { - let input = input_factory(); - barrier.wait().await; - let page = input.execute(&manager).await.unwrap(); - assert_eq!(page.remaining(), 8192); - barrier.wait().await; - } + let mut tasks = JoinSet::new(); + + let manager = PostgresRedoManager::new(conf, tenant_shard_id); + let manager = Arc::new(manager); + + for _ in 0..nclients { + rt.block_on(async { + tasks.spawn(client( + Arc::clone(&manager), + Arc::clone(&start), + Arc::clone(&redo_work), + // divide the amount of work equally among the clients + n_redos / nclients, + )) }); } - let do_one_iteration = || { - rt.block_on(async { - barrier.wait().await; - // wait for work to complete - barrier.wait().await; - }) - }; - - b.iter_batched( - || { - // warmup - do_one_iteration(); - }, - |()| { - // work loop - do_one_iteration(); - }, - criterion::BatchSize::PerIteration, - ); - - rt.block_on(requesters.shutdown()); + rt.block_on(async move { + let mut total_wallclock_time = std::time::Duration::from_millis(0); + while let Some(res) = tasks.join_next().await { + total_wallclock_time += res.unwrap(); + } + total_wallclock_time + }) } -criterion_group!(benches, redo_scenarios); -criterion_main!(benches); +async fn client( + mgr: Arc, + start: Arc, + redo_work: Arc, + n_redos: u64, +) -> Duration { + start.wait().await; + let start = Instant::now(); + for _ in 0..n_redos { + let page = redo_work.execute(&mgr).await.unwrap(); + assert_eq!(page.remaining(), 8192); + // The real pageserver will rarely if ever do 2 walredos in a row without + // yielding to the executor. + tokio::task::yield_now().await; + } + start.elapsed() +} macro_rules! lsn { ($input:expr) => {{ @@ -166,12 +162,46 @@ macro_rules! lsn { }}; } -/// Short payload, 1132 bytes. -// pg_records are copypasted from log, where they are put with Debug impl of Bytes, which uses \0 -// for null bytes. -#[allow(clippy::octal_escapes)] -fn short() -> Request { - Request { +/// Simple wrapper around `WalRedoManager::request_redo`. +/// +/// In benchmarks this is cloned around. +#[derive(Clone)] +struct Request { + key: Key, + lsn: Lsn, + base_img: Option<(Lsn, Bytes)>, + records: Vec<(Lsn, NeonWalRecord)>, + pg_version: u32, +} + +impl Request { + async fn execute(&self, manager: &PostgresRedoManager) -> anyhow::Result { + let Request { + key, + lsn, + base_img, + records, + pg_version, + } = self; + + // TODO: avoid these clones + manager + .request_redo(*key, *lsn, base_img.clone(), records.clone(), *pg_version) + .await + } + + fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord { + let rec = Bytes::from_static(bytes); + NeonWalRecord::Postgres { will_init, rec } + } + + /// Short payload, 1132 bytes. + // pg_records are copypasted from log, where they are put with Debug impl of Bytes, which uses \0 + // for null bytes. 
+ #[allow(clippy::octal_escapes)] + pub fn short_input() -> Request { + let pg_record = Self::pg_record; + Request { key: Key { field1: 0, field2: 1663, @@ -194,13 +224,14 @@ fn short() -> Request { ], pg_version: 14, } -} + } -/// Medium sized payload, serializes as 26393 bytes. -// see [`short`] -#[allow(clippy::octal_escapes)] -fn medium() -> Request { - Request { + /// Medium sized payload, serializes as 26393 bytes. + // see [`short`] + #[allow(clippy::octal_escapes)] + pub fn medium_input() -> Request { + let pg_record = Self::pg_record; + Request { key: Key { field1: 0, field2: 1663, @@ -442,37 +473,5 @@ fn medium() -> Request { ], pg_version: 14, } -} - -fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord { - let rec = Bytes::from_static(bytes); - NeonWalRecord::Postgres { will_init, rec } -} - -/// Simple wrapper around `WalRedoManager::request_redo`. -/// -/// In benchmarks this is cloned around. -#[derive(Clone)] -struct Request { - key: Key, - lsn: Lsn, - base_img: Option<(Lsn, Bytes)>, - records: Vec<(Lsn, NeonWalRecord)>, - pg_version: u32, -} - -impl Request { - async fn execute(self, manager: &PostgresRedoManager) -> anyhow::Result { - let Request { - key, - lsn, - base_img, - records, - pg_version, - } = self; - - manager - .request_redo(key, lsn, base_img, records, pg_version) - .await } } From 3ee34a3f26c232b48c1f386675d9d809869c7ba6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 22 Mar 2024 07:52:31 +0100 Subject: [PATCH 0454/1571] Update Rust to 1.77.0 (#7198) Release notes: https://blog.rust-lang.org/2024/03/21/Rust-1.77.0.html Thanks to #6886 the diff is reasonable, only for one new lint `clippy::suspicious_open_options`. I added `truncate()` calls to the places where it is obviously the right choice to me, and added allows everywhere else, leaving it for followups. I had to specify cargo install --locked because the build would fail otherwise. This was also recommended by upstream. 
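
For illustration, a minimal example of the pattern the new `clippy::suspicious_open_options` lint flags and the explicit fix (the path is just a placeholder):

```rust
use std::fs::OpenOptions;

fn main() -> std::io::Result<()> {
    // 1.77's clippy warns on `create(true)` + `write(true)` when nothing says
    // what should happen to an existing file's contents:
    //
    //     let _f = OpenOptions::new().create(true).write(true).open("example.dat")?;
    //
    // The fix is to state the intent explicitly (or add
    // `#[allow(clippy::suspicious_open_options)]` where the current behaviour
    // is deliberate, as done for the safekeeper call sites below):
    let _f = OpenOptions::new()
        .create(true)
        .write(true)
        .truncate(true)
        .open("example.dat")?;
    Ok(())
}
```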
--- Dockerfile.build-tools | 4 ++-- compute_tools/src/config.rs | 1 + libs/remote_storage/src/local_fs.rs | 1 + libs/utils/src/lock_file.rs | 1 + pageserver/src/tenant/delete.rs | 1 + rust-toolchain.toml | 2 +- safekeeper/src/copy_timeline.rs | 1 + safekeeper/src/wal_storage.rs | 1 + 8 files changed, 9 insertions(+), 3 deletions(-) diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index 3a452fec32..1ed6f87473 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -135,7 +135,7 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.76.0 +ENV RUSTC_VERSION=1.77.0 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \ @@ -149,7 +149,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux cargo install --git https://github.com/paritytech/cachepot && \ cargo install rustfilt && \ cargo install cargo-hakari && \ - cargo install cargo-deny && \ + cargo install cargo-deny --locked && \ cargo install cargo-hack && \ cargo install cargo-nextest && \ rm -rf /home/nonroot/.cargo/registry && \ diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 42b8480211..f1fd8637f5 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -17,6 +17,7 @@ pub fn line_in_file(path: &Path, line: &str) -> Result { .write(true) .create(true) .append(false) + .truncate(false) .open(path)?; let buf = io::BufReader::new(&file); let mut count: usize = 0; diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 313d8226b1..8cad863731 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -198,6 +198,7 @@ impl LocalFs { fs::OpenOptions::new() .write(true) .create(true) + .truncate(true) .open(&temp_file_path) .await .with_context(|| { diff --git a/libs/utils/src/lock_file.rs b/libs/utils/src/lock_file.rs index 987b9d9ad2..59c66ca757 100644 --- a/libs/utils/src/lock_file.rs +++ b/libs/utils/src/lock_file.rs @@ -63,6 +63,7 @@ impl UnwrittenLockFile { pub fn create_exclusive(lock_file_path: &Utf8Path) -> anyhow::Result { let lock_file = fs::OpenOptions::new() .create(true) // O_CREAT + .truncate(true) .write(true) .open(lock_file_path) .context("open lock file")?; diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index cab60c3111..7d37873a67 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -111,6 +111,7 @@ async fn create_local_delete_mark( let _ = std::fs::OpenOptions::new() .write(true) .create(true) + .truncate(true) .open(&marker_path) .with_context(|| format!("could not create delete marker file {marker_path:?}"))?; diff --git a/rust-toolchain.toml b/rust-toolchain.toml index b0949c32b1..50a5a4185b 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.76.0" +channel = "1.77.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. 
# https://rust-lang.github.io/rustup/concepts/profiles.html diff --git a/safekeeper/src/copy_timeline.rs b/safekeeper/src/copy_timeline.rs index 5bc877adbd..3023d4e2cb 100644 --- a/safekeeper/src/copy_timeline.rs +++ b/safekeeper/src/copy_timeline.rs @@ -225,6 +225,7 @@ async fn write_segment( assert!(from <= to); assert!(to <= wal_seg_size); + #[allow(clippy::suspicious_open_options)] let mut file = OpenOptions::new() .create(true) .write(true) diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 8bbd95e9e8..147f318b9f 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -221,6 +221,7 @@ impl PhysicalStorage { // half initialized segment, first bake it under tmp filename and // then rename. let tmp_path = self.timeline_dir.join("waltmp"); + #[allow(clippy::suspicious_open_options)] let mut file = OpenOptions::new() .create(true) .write(true) From 6770ddba2e24b81429abc68576f78ff06816edb2 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Fri, 22 Mar 2024 09:38:04 +0100 Subject: [PATCH 0455/1571] proxy: connect redis with AWS IAM (#7189) ## Problem Support of IAM Roles for Service Accounts for authentication. ## Summary of changes * Obtain aws 15m-long credentials * Retrieve redis password from credentials * Update every 1h to keep connection for more than 12h * For now allow to have different endpoints for pubsub/stream redis. TODOs: * PubSub doesn't support credentials refresh, consider using stream instead. * We need an AWS role for proxy to be able to connect to both: S3 and elasticache. Credentials obtaining and connection refresh was tested on xenon preview. https://github.com/neondatabase/cloud/issues/10365 --- Cargo.lock | 117 ++++++--- Cargo.toml | 6 +- proxy/Cargo.toml | 5 + proxy/src/bin/proxy.rs | 108 ++++++++- proxy/src/cancellation.rs | 147 ++++++------ proxy/src/config.rs | 1 + proxy/src/metrics.rs | 3 + proxy/src/proxy.rs | 12 +- proxy/src/proxy/passthrough.rs | 6 +- proxy/src/redis.rs | 4 +- proxy/src/redis/cancellation_publisher.rs | 167 +++++++++++++ .../connection_with_credentials_provider.rs | 225 ++++++++++++++++++ proxy/src/redis/elasticache.rs | 110 +++++++++ proxy/src/redis/notifications.rs | 62 ++--- proxy/src/redis/publisher.rs | 80 ------- proxy/src/serverless.rs | 7 +- proxy/src/serverless/websocket.rs | 4 +- workspace_hack/Cargo.toml | 3 +- 18 files changed, 803 insertions(+), 264 deletions(-) create mode 100644 proxy/src/redis/cancellation_publisher.rs create mode 100644 proxy/src/redis/connection_with_credentials_provider.rs create mode 100644 proxy/src/redis/elasticache.rs delete mode 100644 proxy/src/redis/publisher.rs diff --git a/Cargo.lock b/Cargo.lock index 824cac13b3..dcf1c49924 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -347,9 +347,9 @@ dependencies = [ [[package]] name = "aws-credential-types" -version = "1.1.4" +version = "1.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33cc49dcdd31c8b6e79850a179af4c367669150c7ac0135f176c61bec81a70f7" +checksum = "fa8587ae17c8e967e4b05a62d495be2fb7701bec52a97f7acfe8a29f938384c8" dependencies = [ "aws-smithy-async", "aws-smithy-runtime-api", @@ -359,9 +359,9 @@ dependencies = [ [[package]] name = "aws-runtime" -version = "1.1.4" +version = "1.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb031bff99877c26c28895766f7bb8484a05e24547e370768d6cc9db514662aa" +checksum = "b13dc54b4b49f8288532334bba8f87386a40571c47c37b1304979b556dc613c8" dependencies = 
[ "aws-credential-types", "aws-sigv4", @@ -381,6 +381,29 @@ dependencies = [ "uuid", ] +[[package]] +name = "aws-sdk-iam" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8ae76026bfb1b80a6aed0bb400c1139cd9c0563e26bce1986cd021c6a968c7b" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-query", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-smithy-xml", + "aws-types", + "http 0.2.9", + "once_cell", + "regex-lite", + "tracing", +] + [[package]] name = "aws-sdk-s3" version = "1.14.0" @@ -502,9 +525,9 @@ dependencies = [ [[package]] name = "aws-sigv4" -version = "1.1.4" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c371c6b0ac54d4605eb6f016624fb5c7c2925d315fdf600ac1bf21b19d5f1742" +checksum = "11d6f29688a4be9895c0ba8bef861ad0c0dac5c15e9618b9b7a6c233990fc263" dependencies = [ "aws-credential-types", "aws-smithy-eventstream", @@ -517,7 +540,7 @@ dependencies = [ "hex", "hmac", "http 0.2.9", - "http 1.0.0", + "http 1.1.0", "once_cell", "p256", "percent-encoding", @@ -531,9 +554,9 @@ dependencies = [ [[package]] name = "aws-smithy-async" -version = "1.1.4" +version = "1.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72ee2d09cce0ef3ae526679b522835d63e75fb427aca5413cd371e490d52dcc6" +checksum = "d26ea8fa03025b2face2b3038a63525a10891e3d8829901d502e5384a0d8cd46" dependencies = [ "futures-util", "pin-project-lite", @@ -574,9 +597,9 @@ dependencies = [ [[package]] name = "aws-smithy-http" -version = "0.60.4" +version = "0.60.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dab56aea3cd9e1101a0a999447fb346afb680ab1406cebc44b32346e25b4117d" +checksum = "3f10fa66956f01540051b0aa7ad54574640f748f9839e843442d99b970d3aff9" dependencies = [ "aws-smithy-eventstream", "aws-smithy-runtime-api", @@ -595,18 +618,18 @@ dependencies = [ [[package]] name = "aws-smithy-json" -version = "0.60.4" +version = "0.60.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd3898ca6518f9215f62678870064398f00031912390efd03f1f6ef56d83aa8e" +checksum = "4683df9469ef09468dad3473d129960119a0d3593617542b7d52086c8486f2d6" dependencies = [ "aws-smithy-types", ] [[package]] name = "aws-smithy-query" -version = "0.60.4" +version = "0.60.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bda4b1dfc9810e35fba8a620e900522cd1bd4f9578c446e82f49d1ce41d2e9f9" +checksum = "f2fbd61ceb3fe8a1cb7352e42689cec5335833cd9f94103a61e98f9bb61c64bb" dependencies = [ "aws-smithy-types", "urlencoding", @@ -614,9 +637,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.1.4" +version = "1.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fafdab38f40ad7816e7da5dec279400dd505160780083759f01441af1bbb10ea" +checksum = "ec81002d883e5a7fd2bb063d6fb51c4999eb55d404f4fff3dd878bf4733b9f01" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -639,14 +662,15 @@ dependencies = [ [[package]] name = "aws-smithy-runtime-api" -version = "1.1.4" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c18276dd28852f34b3bf501f4f3719781f4999a51c7bff1a5c6dc8c4529adc29" +checksum = "9acb931e0adaf5132de878f1398d83f8677f90ba70f01f65ff87f6d7244be1c5" dependencies = [ "aws-smithy-async", "aws-smithy-types", "bytes", "http 0.2.9", + "http 1.1.0", 
"pin-project-lite", "tokio", "tracing", @@ -655,9 +679,9 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.1.4" +version = "1.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb3e134004170d3303718baa2a4eb4ca64ee0a1c0a7041dca31b38be0fb414f3" +checksum = "abe14dceea1e70101d38fbf2a99e6a34159477c0fb95e68e05c66bd7ae4c3729" dependencies = [ "base64-simd", "bytes", @@ -678,18 +702,18 @@ dependencies = [ [[package]] name = "aws-smithy-xml" -version = "0.60.4" +version = "0.60.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8604a11b25e9ecaf32f9aa56b9fe253c5e2f606a3477f0071e96d3155a5ed218" +checksum = "872c68cf019c0e4afc5de7753c4f7288ce4b71663212771bf5e4542eb9346ca9" dependencies = [ "xmlparser", ] [[package]] name = "aws-types" -version = "1.1.4" +version = "1.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "789bbe008e65636fe1b6dbbb374c40c8960d1232b96af5ff4aec349f9c4accf4" +checksum = "0dbf2f3da841a8930f159163175cf6a3d16ddde517c1b0fba7aa776822800f40" dependencies = [ "aws-credential-types", "aws-smithy-async", @@ -2396,9 +2420,9 @@ dependencies = [ [[package]] name = "http" -version = "1.0.0" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b32afd38673a8016f7c9ae69e5af41a58f81b1d31689040f2f1959594ce194ea" +checksum = "21b9ddb458710bc376481b842f5da65cdf31522de232c1ca8146abce2a358258" dependencies = [ "bytes", "fnv", @@ -2498,7 +2522,7 @@ dependencies = [ "hyper", "log", "rustls 0.21.9", - "rustls-native-certs", + "rustls-native-certs 0.6.2", "tokio", "tokio-rustls 0.24.0", ] @@ -4199,6 +4223,10 @@ version = "0.1.0" dependencies = [ "anyhow", "async-trait", + "aws-config", + "aws-sdk-iam", + "aws-sigv4", + "aws-types", "base64 0.13.1", "bstr", "bytes", @@ -4216,6 +4244,7 @@ dependencies = [ "hex", "hmac", "hostname", + "http 1.1.0", "humantime", "hyper", "hyper-tungstenite", @@ -4431,9 +4460,9 @@ dependencies = [ [[package]] name = "redis" -version = "0.24.0" +version = "0.25.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c580d9cbbe1d1b479e8d67cf9daf6a62c957e6846048408b80b43ac3f6af84cd" +checksum = "71d64e978fd98a0e6b105d066ba4889a7301fca65aeac850a877d8797343feeb" dependencies = [ "async-trait", "bytes", @@ -4442,15 +4471,15 @@ dependencies = [ "itoa", "percent-encoding", "pin-project-lite", - "rustls 0.21.9", - "rustls-native-certs", - "rustls-pemfile 1.0.2", - "rustls-webpki 0.101.7", + "rustls 0.22.2", + "rustls-native-certs 0.7.0", + "rustls-pemfile 2.1.1", + "rustls-pki-types", "ryu", "sha1_smol", - "socket2 0.4.9", + "socket2 0.5.5", "tokio", - "tokio-rustls 0.24.0", + "tokio-rustls 0.25.0", "tokio-util", "url", ] @@ -4879,6 +4908,19 @@ dependencies = [ "security-framework", ] +[[package]] +name = "rustls-native-certs" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f1fb85efa936c42c6d5fc28d2629bb51e4b2f4b8a5211e297d599cc5a093792" +dependencies = [ + "openssl-probe", + "rustls-pemfile 2.1.1", + "rustls-pki-types", + "schannel", + "security-framework", +] + [[package]] name = "rustls-pemfile" version = "1.0.2" @@ -6146,7 +6188,7 @@ dependencies = [ "percent-encoding", "pin-project", "prost", - "rustls-native-certs", + "rustls-native-certs 0.6.2", "rustls-pemfile 1.0.2", "tokio", "tokio-rustls 0.24.0", @@ -7031,7 +7073,6 @@ dependencies = [ "aws-sigv4", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-runtime-api", "aws-smithy-types", "axum", 
"base64 0.21.1", diff --git a/Cargo.toml b/Cargo.toml index 44e6ec9744..2741bd046b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -53,9 +53,12 @@ async-trait = "0.1" aws-config = { version = "1.1.4", default-features = false, features=["rustls"] } aws-sdk-s3 = "1.14" aws-sdk-secretsmanager = { version = "1.14.0" } +aws-sdk-iam = "1.15.0" aws-smithy-async = { version = "1.1.4", default-features = false, features=["rt-tokio"] } aws-smithy-types = "1.1.4" aws-credential-types = "1.1.4" +aws-sigv4 = { version = "1.2.0", features = ["sign-http"] } +aws-types = "1.1.7" axum = { version = "0.6.20", features = ["ws"] } base64 = "0.13.0" bincode = "1.3" @@ -88,6 +91,7 @@ hex = "0.4" hex-literal = "0.4" hmac = "0.12.1" hostname = "0.3.1" +http = {version = "1.1.0", features = ["std"]} http-types = { version = "2", default-features = false } humantime = "2.1" humantime-serde = "1.1.1" @@ -121,7 +125,7 @@ procfs = "0.14" prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency prost = "0.11" rand = "0.8" -redis = { version = "0.24.0", features = ["tokio-rustls-comp", "keep-alive"] } +redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] } regex = "1.10.2" reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] } reqwest-tracing = { version = "0.4.7", features = ["opentelemetry_0_20"] } diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 93a1fe85db..3566d8b728 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -11,6 +11,10 @@ testing = [] [dependencies] anyhow.workspace = true async-trait.workspace = true +aws-config.workspace = true +aws-sdk-iam.workspace = true +aws-sigv4.workspace = true +aws-types.workspace = true base64.workspace = true bstr.workspace = true bytes = { workspace = true, features = ["serde"] } @@ -27,6 +31,7 @@ hashlink.workspace = true hex.workspace = true hmac.workspace = true hostname.workspace = true +http.workspace = true humantime.workspace = true hyper-tungstenite.workspace = true hyper.workspace = true diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index b3d4fc0411..d38439c2a0 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -1,3 +1,10 @@ +use aws_config::environment::EnvironmentVariableCredentialsProvider; +use aws_config::imds::credentials::ImdsCredentialsProvider; +use aws_config::meta::credentials::CredentialsProviderChain; +use aws_config::meta::region::RegionProviderChain; +use aws_config::profile::ProfileFileCredentialsProvider; +use aws_config::provider_config::ProviderConfig; +use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider; use futures::future::Either; use proxy::auth; use proxy::auth::backend::MaybeOwned; @@ -10,11 +17,14 @@ use proxy::config::ProjectInfoCacheOptions; use proxy::console; use proxy::context::parquet::ParquetUploadArgs; use proxy::http; +use proxy::metrics::NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT; use proxy::rate_limiter::EndpointRateLimiter; use proxy::rate_limiter::RateBucketInfo; use proxy::rate_limiter::RateLimiterConfig; +use proxy::redis::cancellation_publisher::RedisPublisherClient; +use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; +use proxy::redis::elasticache; use proxy::redis::notifications; -use proxy::redis::publisher::RedisPublisherClient; use proxy::serverless::GlobalConnPoolOptions; use proxy::usage_metrics; @@ -150,9 +160,24 @@ struct ProxyCliArgs { /// disable ip check for http requests. 
If it is too time consuming, it could be turned off. #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] disable_ip_check_for_http: bool, - /// redis url for notifications. + /// redis url for notifications (if empty, redis_host:port will be used for both notifications and streaming connections) #[clap(long)] redis_notifications: Option, + /// redis host for streaming connections (might be different from the notifications host) + #[clap(long)] + redis_host: Option, + /// redis port for streaming connections (might be different from the notifications host) + #[clap(long)] + redis_port: Option, + /// redis cluster name, used in aws elasticache + #[clap(long)] + redis_cluster_name: Option, + /// redis user_id, used in aws elasticache + #[clap(long)] + redis_user_id: Option, + /// aws region to retrieve credentials + #[clap(long, default_value_t = String::new())] + aws_region: String, /// cache for `project_info` (use `size=0` to disable) #[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)] project_info_cache: String, @@ -216,6 +241,61 @@ async fn main() -> anyhow::Result<()> { let config = build_config(&args)?; info!("Authentication backend: {}", config.auth_backend); + info!("Using region: {}", config.aws_region); + + let region_provider = RegionProviderChain::default_provider().or_else(&*config.aws_region); // Replace with your Redis region if needed + let provider_conf = + ProviderConfig::without_region().with_region(region_provider.region().await); + let aws_credentials_provider = { + // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" + CredentialsProviderChain::first_try("env", EnvironmentVariableCredentialsProvider::new()) + // uses "AWS_PROFILE" / `aws sso login --profile ` + .or_else( + "profile-sso", + ProfileFileCredentialsProvider::builder() + .configure(&provider_conf) + .build(), + ) + // uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME" + // needed to access remote extensions bucket + .or_else( + "token", + WebIdentityTokenCredentialsProvider::builder() + .configure(&provider_conf) + .build(), + ) + // uses imds v2 + .or_else("imds", ImdsCredentialsProvider::builder().build()) + }; + let elasticache_credentials_provider = Arc::new(elasticache::CredentialsProvider::new( + elasticache::AWSIRSAConfig::new( + config.aws_region.clone(), + args.redis_cluster_name, + args.redis_user_id, + ), + aws_credentials_provider, + )); + let redis_notifications_client = + match (args.redis_notifications, (args.redis_host, args.redis_port)) { + (Some(url), _) => { + info!("Starting redis notifications listener ({url})"); + Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url)) + } + (None, (Some(host), Some(port))) => Some( + ConnectionWithCredentialsProvider::new_with_credentials_provider( + host, + port, + elasticache_credentials_provider.clone(), + ), + ), + (None, (None, None)) => { + warn!("Redis is disabled"); + None + } + _ => { + bail!("redis-host and redis-port must be specified together"); + } + }; // Check that we can bind to address before further initialization let http_address: SocketAddr = args.http.parse()?; @@ -233,17 +313,22 @@ async fn main() -> anyhow::Result<()> { let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(&config.endpoint_rps_limit)); let cancel_map = CancelMap::default(); - let redis_publisher = match &args.redis_notifications { - Some(url) => Some(Arc::new(Mutex::new(RedisPublisherClient::new( - url, + 
+ // let redis_notifications_client = redis_notifications_client.map(|x| Box::leak(Box::new(x))); + let redis_publisher = match &redis_notifications_client { + Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new( + redis_publisher.clone(), args.region.clone(), &config.redis_rps_limit, )?))), None => None, }; - let cancellation_handler = Arc::new(CancellationHandler::new( + let cancellation_handler = Arc::new(CancellationHandler::< + Option>>, + >::new( cancel_map.clone(), redis_publisher, + NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT, )); // client facing tasks. these will exit on error or on cancellation @@ -290,17 +375,16 @@ async fn main() -> anyhow::Result<()> { if let auth::BackendType::Console(api, _) = &config.auth_backend { if let proxy::console::provider::ConsoleBackend::Console(api) = &**api { - let cache = api.caches.project_info.clone(); - if let Some(url) = args.redis_notifications { - info!("Starting redis notifications listener ({url})"); + if let Some(redis_notifications_client) = redis_notifications_client { + let cache = api.caches.project_info.clone(); maintenance_tasks.spawn(notifications::task_main( - url.to_owned(), + redis_notifications_client.clone(), cache.clone(), cancel_map.clone(), args.region.clone(), )); + maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); } - maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); } } @@ -445,8 +529,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { endpoint_rps_limit, redis_rps_limit, handshake_timeout: args.handshake_timeout, - // TODO: add this argument region: args.region.clone(), + aws_region: args.aws_region.clone(), })); Ok(config) diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index c9607909b3..8054f33b6c 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -1,4 +1,3 @@ -use async_trait::async_trait; use dashmap::DashMap; use pq_proto::CancelKeyData; use std::{net::SocketAddr, sync::Arc}; @@ -10,18 +9,26 @@ use tracing::info; use uuid::Uuid; use crate::{ - error::ReportableError, metrics::NUM_CANCELLATION_REQUESTS, - redis::publisher::RedisPublisherClient, + error::ReportableError, + metrics::NUM_CANCELLATION_REQUESTS, + redis::cancellation_publisher::{ + CancellationPublisher, CancellationPublisherMut, RedisPublisherClient, + }, }; pub type CancelMap = Arc>>; +pub type CancellationHandlerMain = CancellationHandler>>>; +pub type CancellationHandlerMainInternal = Option>>; /// Enables serving `CancelRequest`s. /// -/// If there is a `RedisPublisherClient` available, it will be used to publish the cancellation key to other proxy instances. -pub struct CancellationHandler { +/// If `CancellationPublisher` is available, cancel request will be used to publish the cancellation key to other proxy instances. +pub struct CancellationHandler
<P>
{ map: CancelMap, - redis_client: Option>>, + client: P, + /// This field used for the monitoring purposes. + /// Represents the source of the cancellation request. + from: &'static str, } #[derive(Debug, Error)] @@ -44,49 +51,9 @@ impl ReportableError for CancelError { } } -impl CancellationHandler { - pub fn new(map: CancelMap, redis_client: Option>>) -> Self { - Self { map, redis_client } - } - /// Cancel a running query for the corresponding connection. - pub async fn cancel_session( - &self, - key: CancelKeyData, - session_id: Uuid, - ) -> Result<(), CancelError> { - let from = "from_client"; - // NB: we should immediately release the lock after cloning the token. - let Some(cancel_closure) = self.map.get(&key).and_then(|x| x.clone()) else { - tracing::warn!("query cancellation key not found: {key}"); - if let Some(redis_client) = &self.redis_client { - NUM_CANCELLATION_REQUESTS - .with_label_values(&[from, "not_found"]) - .inc(); - info!("publishing cancellation key to Redis"); - match redis_client.lock().await.try_publish(key, session_id).await { - Ok(()) => { - info!("cancellation key successfuly published to Redis"); - } - Err(e) => { - tracing::error!("failed to publish a message: {e}"); - return Err(CancelError::IO(std::io::Error::new( - std::io::ErrorKind::Other, - e.to_string(), - ))); - } - } - } - return Ok(()); - }; - NUM_CANCELLATION_REQUESTS - .with_label_values(&[from, "found"]) - .inc(); - info!("cancelling query per user's request using key {key}"); - cancel_closure.try_cancel_query().await - } - +impl CancellationHandler
<P>
{ /// Run async action within an ephemeral session identified by [`CancelKeyData`]. - pub fn get_session(self: Arc) -> Session { + pub fn get_session(self: Arc) -> Session
<P>
{ // HACK: We'd rather get the real backend_pid but tokio_postgres doesn't // expose it and we don't want to do another roundtrip to query // for it. The client will be able to notice that this is not the @@ -112,9 +79,39 @@ impl CancellationHandler { cancellation_handler: self, } } + /// Try to cancel a running query for the corresponding connection. + /// If the cancellation key is not found, it will be published to Redis. + pub async fn cancel_session( + &self, + key: CancelKeyData, + session_id: Uuid, + ) -> Result<(), CancelError> { + // NB: we should immediately release the lock after cloning the token. + let Some(cancel_closure) = self.map.get(&key).and_then(|x| x.clone()) else { + tracing::warn!("query cancellation key not found: {key}"); + NUM_CANCELLATION_REQUESTS + .with_label_values(&[self.from, "not_found"]) + .inc(); + match self.client.try_publish(key, session_id).await { + Ok(()) => {} // do nothing + Err(e) => { + return Err(CancelError::IO(std::io::Error::new( + std::io::ErrorKind::Other, + e.to_string(), + ))); + } + } + return Ok(()); + }; + NUM_CANCELLATION_REQUESTS + .with_label_values(&[self.from, "found"]) + .inc(); + info!("cancelling query per user's request using key {key}"); + cancel_closure.try_cancel_query().await + } #[cfg(test)] - fn contains(&self, session: &Session) -> bool { + fn contains(&self, session: &Session
<P>
) -> bool { self.map.contains_key(&session.key) } @@ -124,31 +121,19 @@ impl CancellationHandler { } } -#[async_trait] -pub trait NotificationsCancellationHandler { - async fn cancel_session_no_publish(&self, key: CancelKeyData) -> Result<(), CancelError>; +impl CancellationHandler<()> { + pub fn new(map: CancelMap, from: &'static str) -> Self { + Self { + map, + client: (), + from, + } + } } -#[async_trait] -impl NotificationsCancellationHandler for CancellationHandler { - async fn cancel_session_no_publish(&self, key: CancelKeyData) -> Result<(), CancelError> { - let from = "from_redis"; - let cancel_closure = self.map.get(&key).and_then(|x| x.clone()); - match cancel_closure { - Some(cancel_closure) => { - NUM_CANCELLATION_REQUESTS - .with_label_values(&[from, "found"]) - .inc(); - cancel_closure.try_cancel_query().await - } - None => { - NUM_CANCELLATION_REQUESTS - .with_label_values(&[from, "not_found"]) - .inc(); - tracing::warn!("query cancellation key not found: {key}"); - Ok(()) - } - } +impl CancellationHandler>>> { + pub fn new(map: CancelMap, client: Option>>, from: &'static str) -> Self { + Self { map, client, from } } } @@ -178,14 +163,14 @@ impl CancelClosure { } /// Helper for registering query cancellation tokens. -pub struct Session { +pub struct Session
<P>
{ /// The user-facing key identifying this session. key: CancelKeyData, /// The [`CancelMap`] this session belongs to. - cancellation_handler: Arc, + cancellation_handler: Arc>, } -impl Session { +impl
<P> Session<P>
{ /// Store the cancel token for the given session. /// This enables query cancellation in `crate::proxy::prepare_client_connection`. pub fn enable_query_cancellation(&self, cancel_closure: CancelClosure) -> CancelKeyData { @@ -198,7 +183,7 @@ impl Session { } } -impl Drop for Session { +impl
<P> Drop for Session<P>
{ fn drop(&mut self) { self.cancellation_handler.map.remove(&self.key); info!("dropped query cancellation key {}", &self.key); @@ -207,14 +192,16 @@ impl Drop for Session { #[cfg(test)] mod tests { + use crate::metrics::NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS; + use super::*; #[tokio::test] async fn check_session_drop() -> anyhow::Result<()> { - let cancellation_handler = Arc::new(CancellationHandler { - map: CancelMap::default(), - redis_client: None, - }); + let cancellation_handler = Arc::new(CancellationHandler::<()>::new( + CancelMap::default(), + NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS, + )); let session = cancellation_handler.clone().get_session(); assert!(cancellation_handler.contains(&session)); diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 437ec9f401..45f8d76144 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -28,6 +28,7 @@ pub struct ProxyConfig { pub redis_rps_limit: Vec, pub region: String, pub handshake_timeout: Duration, + pub aws_region: String, } #[derive(Debug)] diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 02ebcd6aaa..eed45e421b 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -161,6 +161,9 @@ pub static NUM_CANCELLATION_REQUESTS: Lazy = Lazy::new(|| { .unwrap() }); +pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT: &str = "from_client"; +pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS: &str = "from_redis"; + pub enum Waiting { Cplane, Client, diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index ab5bf5d494..843bfc08cf 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -10,7 +10,7 @@ pub mod wake_compute; use crate::{ auth, - cancellation::{self, CancellationHandler}, + cancellation::{self, CancellationHandlerMain, CancellationHandlerMainInternal}, compute, config::{ProxyConfig, TlsConfig}, context::RequestMonitoring, @@ -62,7 +62,7 @@ pub async fn task_main( listener: tokio::net::TcpListener, cancellation_token: CancellationToken, endpoint_rate_limiter: Arc, - cancellation_handler: Arc, + cancellation_handler: Arc, ) -> anyhow::Result<()> { scopeguard::defer! { info!("proxy has shut down"); @@ -233,12 +233,12 @@ impl ReportableError for ClientRequestError { pub async fn handle_client( config: &'static ProxyConfig, ctx: &mut RequestMonitoring, - cancellation_handler: Arc, + cancellation_handler: Arc, stream: S, mode: ClientMode, endpoint_rate_limiter: Arc, conn_gauge: IntCounterPairGuard, -) -> Result>, ClientRequestError> { +) -> Result>, ClientRequestError> { info!("handling interactive connection from client"); let proto = ctx.protocol; @@ -338,9 +338,9 @@ pub async fn handle_client( /// Finish client connection initialization: confirm auth success, send params, etc. #[tracing::instrument(skip_all)] -async fn prepare_client_connection( +async fn prepare_client_connection
<P>
( node: &compute::PostgresConnection, - session: &cancellation::Session, + session: &cancellation::Session
<P>
, stream: &mut PqStream, ) -> Result<(), std::io::Error> { // Register compute's query cancellation token and produce a new, unique one. diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index b2f682fd2f..f6d4314391 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -55,17 +55,17 @@ pub async fn proxy_pass( Ok(()) } -pub struct ProxyPassthrough { +pub struct ProxyPassthrough { pub client: Stream, pub compute: PostgresConnection, pub aux: MetricsAuxInfo, pub req: IntCounterPairGuard, pub conn: IntCounterPairGuard, - pub cancel: cancellation::Session, + pub cancel: cancellation::Session
<P>
, } -impl ProxyPassthrough { +impl ProxyPassthrough { pub async fn proxy_pass(self) -> anyhow::Result<()> { let res = proxy_pass(self.client, self.compute.stream, self.aux).await; self.compute.cancel_closure.try_cancel_query().await?; diff --git a/proxy/src/redis.rs b/proxy/src/redis.rs index 35d6db074e..a322f0368c 100644 --- a/proxy/src/redis.rs +++ b/proxy/src/redis.rs @@ -1,2 +1,4 @@ +pub mod cancellation_publisher; +pub mod connection_with_credentials_provider; +pub mod elasticache; pub mod notifications; -pub mod publisher; diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs new file mode 100644 index 0000000000..d9efc3561b --- /dev/null +++ b/proxy/src/redis/cancellation_publisher.rs @@ -0,0 +1,167 @@ +use std::sync::Arc; + +use async_trait::async_trait; +use pq_proto::CancelKeyData; +use redis::AsyncCommands; +use tokio::sync::Mutex; +use uuid::Uuid; + +use crate::rate_limiter::{RateBucketInfo, RedisRateLimiter}; + +use super::{ + connection_with_credentials_provider::ConnectionWithCredentialsProvider, + notifications::{CancelSession, Notification, PROXY_CHANNEL_NAME}, +}; + +#[async_trait] +pub trait CancellationPublisherMut: Send + Sync + 'static { + async fn try_publish( + &mut self, + cancel_key_data: CancelKeyData, + session_id: Uuid, + ) -> anyhow::Result<()>; +} + +#[async_trait] +pub trait CancellationPublisher: Send + Sync + 'static { + async fn try_publish( + &self, + cancel_key_data: CancelKeyData, + session_id: Uuid, + ) -> anyhow::Result<()>; +} + +#[async_trait] +impl CancellationPublisherMut for () { + async fn try_publish( + &mut self, + _cancel_key_data: CancelKeyData, + _session_id: Uuid, + ) -> anyhow::Result<()> { + Ok(()) + } +} + +#[async_trait] +impl CancellationPublisher for P { + async fn try_publish( + &self, + _cancel_key_data: CancelKeyData, + _session_id: Uuid, + ) -> anyhow::Result<()> { + self.try_publish(_cancel_key_data, _session_id).await + } +} + +#[async_trait] +impl CancellationPublisher for Option
<P>
{ + async fn try_publish( + &self, + cancel_key_data: CancelKeyData, + session_id: Uuid, + ) -> anyhow::Result<()> { + if let Some(p) = self { + p.try_publish(cancel_key_data, session_id).await + } else { + Ok(()) + } + } +} + +#[async_trait] +impl CancellationPublisher for Arc> { + async fn try_publish( + &self, + cancel_key_data: CancelKeyData, + session_id: Uuid, + ) -> anyhow::Result<()> { + self.lock() + .await + .try_publish(cancel_key_data, session_id) + .await + } +} + +pub struct RedisPublisherClient { + client: ConnectionWithCredentialsProvider, + region_id: String, + limiter: RedisRateLimiter, +} + +impl RedisPublisherClient { + pub fn new( + client: ConnectionWithCredentialsProvider, + region_id: String, + info: &'static [RateBucketInfo], + ) -> anyhow::Result { + Ok(Self { + client, + region_id, + limiter: RedisRateLimiter::new(info), + }) + } + + async fn publish( + &mut self, + cancel_key_data: CancelKeyData, + session_id: Uuid, + ) -> anyhow::Result<()> { + let payload = serde_json::to_string(&Notification::Cancel(CancelSession { + region_id: Some(self.region_id.clone()), + cancel_key_data, + session_id, + }))?; + self.client.publish(PROXY_CHANNEL_NAME, payload).await?; + Ok(()) + } + pub async fn try_connect(&mut self) -> anyhow::Result<()> { + match self.client.connect().await { + Ok(()) => {} + Err(e) => { + tracing::error!("failed to connect to redis: {e}"); + return Err(e); + } + } + Ok(()) + } + async fn try_publish_internal( + &mut self, + cancel_key_data: CancelKeyData, + session_id: Uuid, + ) -> anyhow::Result<()> { + if !self.limiter.check() { + tracing::info!("Rate limit exceeded. Skipping cancellation message"); + return Err(anyhow::anyhow!("Rate limit exceeded")); + } + match self.publish(cancel_key_data, session_id).await { + Ok(()) => return Ok(()), + Err(e) => { + tracing::error!("failed to publish a message: {e}"); + } + } + tracing::info!("Publisher is disconnected. 
Reconnectiong..."); + self.try_connect().await?; + self.publish(cancel_key_data, session_id).await + } +} + +#[async_trait] +impl CancellationPublisherMut for RedisPublisherClient { + async fn try_publish( + &mut self, + cancel_key_data: CancelKeyData, + session_id: Uuid, + ) -> anyhow::Result<()> { + tracing::info!("publishing cancellation key to Redis"); + match self.try_publish_internal(cancel_key_data, session_id).await { + Ok(()) => { + tracing::info!("cancellation key successfuly published to Redis"); + Ok(()) + } + Err(e) => { + tracing::error!("failed to publish a message: {e}"); + Err(e) + } + } + } +} diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs new file mode 100644 index 0000000000..d183abb53a --- /dev/null +++ b/proxy/src/redis/connection_with_credentials_provider.rs @@ -0,0 +1,225 @@ +use std::{sync::Arc, time::Duration}; + +use futures::FutureExt; +use redis::{ + aio::{ConnectionLike, MultiplexedConnection}, + ConnectionInfo, IntoConnectionInfo, RedisConnectionInfo, RedisResult, +}; +use tokio::task::JoinHandle; +use tracing::{error, info}; + +use super::elasticache::CredentialsProvider; + +enum Credentials { + Static(ConnectionInfo), + Dynamic(Arc, redis::ConnectionAddr), +} + +impl Clone for Credentials { + fn clone(&self) -> Self { + match self { + Credentials::Static(info) => Credentials::Static(info.clone()), + Credentials::Dynamic(provider, addr) => { + Credentials::Dynamic(Arc::clone(provider), addr.clone()) + } + } + } +} + +/// A wrapper around `redis::MultiplexedConnection` that automatically refreshes the token. +/// Provides PubSub connection without credentials refresh. +pub struct ConnectionWithCredentialsProvider { + credentials: Credentials, + con: Option, + refresh_token_task: Option>, + mutex: tokio::sync::Mutex<()>, +} + +impl Clone for ConnectionWithCredentialsProvider { + fn clone(&self) -> Self { + Self { + credentials: self.credentials.clone(), + con: None, + refresh_token_task: None, + mutex: tokio::sync::Mutex::new(()), + } + } +} + +impl ConnectionWithCredentialsProvider { + pub fn new_with_credentials_provider( + host: String, + port: u16, + credentials_provider: Arc, + ) -> Self { + Self { + credentials: Credentials::Dynamic( + credentials_provider, + redis::ConnectionAddr::TcpTls { + host, + port, + insecure: false, + tls_params: None, + }, + ), + con: None, + refresh_token_task: None, + mutex: tokio::sync::Mutex::new(()), + } + } + + pub fn new_with_static_credentials(params: T) -> Self { + Self { + credentials: Credentials::Static(params.into_connection_info().unwrap()), + con: None, + refresh_token_task: None, + mutex: tokio::sync::Mutex::new(()), + } + } + + pub async fn connect(&mut self) -> anyhow::Result<()> { + let _guard = self.mutex.lock().await; + if let Some(con) = self.con.as_mut() { + match redis::cmd("PING").query_async(con).await { + Ok(()) => { + return Ok(()); + } + Err(e) => { + error!("Error during PING: {e:?}"); + } + } + } else { + info!("Connection is not established"); + } + info!("Establishing a new connection..."); + self.con = None; + if let Some(f) = self.refresh_token_task.take() { + f.abort() + } + let con = self + .get_client() + .await? 
+ .get_multiplexed_tokio_connection() + .await?; + if let Credentials::Dynamic(credentials_provider, _) = &self.credentials { + let credentials_provider = credentials_provider.clone(); + let con2 = con.clone(); + let f = tokio::spawn(async move { + let _ = Self::keep_connection(con2, credentials_provider).await; + }); + self.refresh_token_task = Some(f); + } + self.con = Some(con); + Ok(()) + } + + async fn get_connection_info(&self) -> anyhow::Result { + match &self.credentials { + Credentials::Static(info) => Ok(info.clone()), + Credentials::Dynamic(provider, addr) => { + let (username, password) = provider.provide_credentials().await?; + Ok(ConnectionInfo { + addr: addr.clone(), + redis: RedisConnectionInfo { + db: 0, + username: Some(username), + password: Some(password.clone()), + }, + }) + } + } + } + + async fn get_client(&self) -> anyhow::Result { + let client = redis::Client::open(self.get_connection_info().await?)?; + Ok(client) + } + + // PubSub does not support credentials refresh. + // Requires manual reconnection every 12h. + pub async fn get_async_pubsub(&self) -> anyhow::Result { + Ok(self.get_client().await?.get_async_pubsub().await?) + } + + // The connection lives for 12h. + // It can be prolonged with sending `AUTH` commands with the refreshed token. + // https://docs.aws.amazon.com/AmazonElastiCache/latest/red-ug/auth-iam.html#auth-iam-limits + async fn keep_connection( + mut con: MultiplexedConnection, + credentials_provider: Arc, + ) -> anyhow::Result<()> { + loop { + // The connection lives for 12h, for the sanity check we refresh it every hour. + tokio::time::sleep(Duration::from_secs(60 * 60)).await; + match Self::refresh_token(&mut con, credentials_provider.clone()).await { + Ok(()) => { + info!("Token refreshed"); + } + Err(e) => { + error!("Error during token refresh: {e:?}"); + } + } + } + } + async fn refresh_token( + con: &mut MultiplexedConnection, + credentials_provider: Arc, + ) -> anyhow::Result<()> { + let (user, password) = credentials_provider.provide_credentials().await?; + redis::cmd("AUTH") + .arg(user) + .arg(password) + .query_async(con) + .await?; + Ok(()) + } + /// Sends an already encoded (packed) command into the TCP socket and + /// reads the single response from it. + pub async fn send_packed_command(&mut self, cmd: &redis::Cmd) -> RedisResult { + // Clone connection to avoid having to lock the ArcSwap in write mode + let con = self.con.as_mut().ok_or(redis::RedisError::from(( + redis::ErrorKind::IoError, + "Connection not established", + )))?; + con.send_packed_command(cmd).await + } + + /// Sends multiple already encoded (packed) command into the TCP socket + /// and reads `count` responses from it. This is used to implement + /// pipelining. 
+ pub async fn send_packed_commands( + &mut self, + cmd: &redis::Pipeline, + offset: usize, + count: usize, + ) -> RedisResult> { + // Clone shared connection future to avoid having to lock the ArcSwap in write mode + let con = self.con.as_mut().ok_or(redis::RedisError::from(( + redis::ErrorKind::IoError, + "Connection not established", + )))?; + con.send_packed_commands(cmd, offset, count).await + } +} + +impl ConnectionLike for ConnectionWithCredentialsProvider { + fn req_packed_command<'a>( + &'a mut self, + cmd: &'a redis::Cmd, + ) -> redis::RedisFuture<'a, redis::Value> { + (async move { self.send_packed_command(cmd).await }).boxed() + } + + fn req_packed_commands<'a>( + &'a mut self, + cmd: &'a redis::Pipeline, + offset: usize, + count: usize, + ) -> redis::RedisFuture<'a, Vec> { + (async move { self.send_packed_commands(cmd, offset, count).await }).boxed() + } + + fn get_db(&self) -> i64 { + 0 + } +} diff --git a/proxy/src/redis/elasticache.rs b/proxy/src/redis/elasticache.rs new file mode 100644 index 0000000000..eded8250af --- /dev/null +++ b/proxy/src/redis/elasticache.rs @@ -0,0 +1,110 @@ +use std::time::{Duration, SystemTime}; + +use aws_config::meta::credentials::CredentialsProviderChain; +use aws_sdk_iam::config::ProvideCredentials; +use aws_sigv4::http_request::{ + self, SignableBody, SignableRequest, SignatureLocation, SigningSettings, +}; +use tracing::info; + +#[derive(Debug)] +pub struct AWSIRSAConfig { + region: String, + service_name: String, + cluster_name: String, + user_id: String, + token_ttl: Duration, + action: String, +} + +impl AWSIRSAConfig { + pub fn new(region: String, cluster_name: Option, user_id: Option) -> Self { + AWSIRSAConfig { + region, + service_name: "elasticache".to_string(), + cluster_name: cluster_name.unwrap_or_default(), + user_id: user_id.unwrap_or_default(), + // "The IAM authentication token is valid for 15 minutes" + // https://docs.aws.amazon.com/memorydb/latest/devguide/auth-iam.html#auth-iam-limits + token_ttl: Duration::from_secs(15 * 60), + action: "connect".to_string(), + } + } +} + +/// Credentials provider for AWS elasticache authentication. +/// +/// Official documentation: +/// +/// +/// Useful resources: +/// +pub struct CredentialsProvider { + config: AWSIRSAConfig, + credentials_provider: CredentialsProviderChain, +} + +impl CredentialsProvider { + pub fn new(config: AWSIRSAConfig, credentials_provider: CredentialsProviderChain) -> Self { + CredentialsProvider { + config, + credentials_provider, + } + } + pub async fn provide_credentials(&self) -> anyhow::Result<(String, String)> { + let aws_credentials = self + .credentials_provider + .provide_credentials() + .await? + .into(); + info!("AWS credentials successfully obtained"); + info!("Connecting to Redis with configuration: {:?}", self.config); + let mut settings = SigningSettings::default(); + settings.signature_location = SignatureLocation::QueryParams; + settings.expires_in = Some(self.config.token_ttl); + let signing_params = aws_sigv4::sign::v4::SigningParams::builder() + .identity(&aws_credentials) + .region(&self.config.region) + .name(&self.config.service_name) + .time(SystemTime::now()) + .settings(settings) + .build()? 
+ .into(); + let auth_params = [ + ("Action", &self.config.action), + ("User", &self.config.user_id), + ]; + let auth_params = url::form_urlencoded::Serializer::new(String::new()) + .extend_pairs(auth_params) + .finish(); + let auth_uri = http::Uri::builder() + .scheme("http") + .authority(self.config.cluster_name.as_bytes()) + .path_and_query(format!("/?{auth_params}")) + .build()?; + info!("{}", auth_uri); + + // Convert the HTTP request into a signable request + let signable_request = SignableRequest::new( + "GET", + auth_uri.to_string(), + std::iter::empty(), + SignableBody::Bytes(&[]), + )?; + + // Sign and then apply the signature to the request + let (si, _) = http_request::sign(signable_request, &signing_params)?.into_parts(); + let mut signable_request = http::Request::builder() + .method("GET") + .uri(auth_uri) + .body(())?; + si.apply_to_request_http1x(&mut signable_request); + Ok(( + self.config.user_id.clone(), + signable_request + .uri() + .to_string() + .replacen("http://", "", 1), + )) + } +} diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 6ae848c0d2..8b7e3e3419 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -6,11 +6,12 @@ use redis::aio::PubSub; use serde::{Deserialize, Serialize}; use uuid::Uuid; +use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; use crate::{ cache::project_info::ProjectInfoCache, - cancellation::{CancelMap, CancellationHandler, NotificationsCancellationHandler}, + cancellation::{CancelMap, CancellationHandler}, intern::{ProjectIdInt, RoleNameInt}, - metrics::REDIS_BROKEN_MESSAGES, + metrics::{NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS, REDIS_BROKEN_MESSAGES}, }; const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; @@ -18,23 +19,13 @@ pub(crate) const PROXY_CHANNEL_NAME: &str = "neondb-proxy-to-proxy-updates"; const RECONNECT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(20); const INVALIDATION_LAG: std::time::Duration = std::time::Duration::from_secs(20); -struct RedisConsumerClient { - client: redis::Client, -} - -impl RedisConsumerClient { - pub fn new(url: &str) -> anyhow::Result { - let client = redis::Client::open(url)?; - Ok(Self { client }) - } - async fn try_connect(&self) -> anyhow::Result { - let mut conn = self.client.get_async_connection().await?.into_pubsub(); - tracing::info!("subscribing to a channel `{CPLANE_CHANNEL_NAME}`"); - conn.subscribe(CPLANE_CHANNEL_NAME).await?; - tracing::info!("subscribing to a channel `{PROXY_CHANNEL_NAME}`"); - conn.subscribe(PROXY_CHANNEL_NAME).await?; - Ok(conn) - } +async fn try_connect(client: &ConnectionWithCredentialsProvider) -> anyhow::Result { + let mut conn = client.get_async_pubsub().await?; + tracing::info!("subscribing to a channel `{CPLANE_CHANNEL_NAME}`"); + conn.subscribe(CPLANE_CHANNEL_NAME).await?; + tracing::info!("subscribing to a channel `{PROXY_CHANNEL_NAME}`"); + conn.subscribe(PROXY_CHANNEL_NAME).await?; + Ok(conn) } #[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] @@ -80,21 +71,18 @@ where serde_json::from_str(&s).map_err(::custom) } -struct MessageHandler< - C: ProjectInfoCache + Send + Sync + 'static, - H: NotificationsCancellationHandler + Send + Sync + 'static, -> { +struct MessageHandler { cache: Arc, - cancellation_handler: Arc, + cancellation_handler: Arc>, region_id: String, } -impl< - C: ProjectInfoCache + Send + Sync + 'static, - H: NotificationsCancellationHandler + Send + Sync + 'static, - > MessageHandler -{ - pub fn 
new(cache: Arc, cancellation_handler: Arc, region_id: String) -> Self { +impl MessageHandler { + pub fn new( + cache: Arc, + cancellation_handler: Arc>, + region_id: String, + ) -> Self { Self { cache, cancellation_handler, @@ -139,7 +127,7 @@ impl< // This instance of cancellation_handler doesn't have a RedisPublisherClient so it can't publish the message. match self .cancellation_handler - .cancel_session_no_publish(cancel_session.cancel_key_data) + .cancel_session(cancel_session.cancel_key_data, uuid::Uuid::nil()) .await { Ok(()) => {} @@ -182,7 +170,7 @@ fn invalidate_cache(cache: Arc, msg: Notification) { /// Handle console's invalidation messages. #[tracing::instrument(name = "console_notifications", skip_all)] pub async fn task_main( - url: String, + redis: ConnectionWithCredentialsProvider, cache: Arc, cancel_map: CancelMap, region_id: String, @@ -193,13 +181,15 @@ where cache.enable_ttl(); let handler = MessageHandler::new( cache, - Arc::new(CancellationHandler::new(cancel_map, None)), + Arc::new(CancellationHandler::<()>::new( + cancel_map, + NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS, + )), region_id, ); loop { - let redis = RedisConsumerClient::new(&url)?; - let conn = match redis.try_connect().await { + let mut conn = match try_connect(&redis).await { Ok(conn) => { handler.disable_ttl(); conn @@ -212,7 +202,7 @@ where continue; } }; - let mut stream = conn.into_on_message(); + let mut stream = conn.on_message(); while let Some(msg) = stream.next().await { match handler.handle_message(msg).await { Ok(()) => {} diff --git a/proxy/src/redis/publisher.rs b/proxy/src/redis/publisher.rs deleted file mode 100644 index f85593afdd..0000000000 --- a/proxy/src/redis/publisher.rs +++ /dev/null @@ -1,80 +0,0 @@ -use pq_proto::CancelKeyData; -use redis::AsyncCommands; -use uuid::Uuid; - -use crate::rate_limiter::{RateBucketInfo, RedisRateLimiter}; - -use super::notifications::{CancelSession, Notification, PROXY_CHANNEL_NAME}; - -pub struct RedisPublisherClient { - client: redis::Client, - publisher: Option, - region_id: String, - limiter: RedisRateLimiter, -} - -impl RedisPublisherClient { - pub fn new( - url: &str, - region_id: String, - info: &'static [RateBucketInfo], - ) -> anyhow::Result { - let client = redis::Client::open(url)?; - Ok(Self { - client, - publisher: None, - region_id, - limiter: RedisRateLimiter::new(info), - }) - } - pub async fn try_publish( - &mut self, - cancel_key_data: CancelKeyData, - session_id: Uuid, - ) -> anyhow::Result<()> { - if !self.limiter.check() { - tracing::info!("Rate limit exceeded. Skipping cancellation message"); - return Err(anyhow::anyhow!("Rate limit exceeded")); - } - match self.publish(cancel_key_data, session_id).await { - Ok(()) => return Ok(()), - Err(e) => { - tracing::error!("failed to publish a message: {e}"); - self.publisher = None; - } - } - tracing::info!("Publisher is disconnected. 
Reconnectiong..."); - self.try_connect().await?; - self.publish(cancel_key_data, session_id).await - } - - async fn publish( - &mut self, - cancel_key_data: CancelKeyData, - session_id: Uuid, - ) -> anyhow::Result<()> { - let conn = self - .publisher - .as_mut() - .ok_or_else(|| anyhow::anyhow!("not connected"))?; - let payload = serde_json::to_string(&Notification::Cancel(CancelSession { - region_id: Some(self.region_id.clone()), - cancel_key_data, - session_id, - }))?; - conn.publish(PROXY_CHANNEL_NAME, payload).await?; - Ok(()) - } - pub async fn try_connect(&mut self) -> anyhow::Result<()> { - match self.client.get_async_connection().await { - Ok(conn) => { - self.publisher = Some(conn); - } - Err(e) => { - tracing::error!("failed to connect to redis: {e}"); - return Err(e.into()); - } - } - Ok(()) - } -} diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index be9f90acde..a2010fd613 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -21,11 +21,12 @@ pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; use tokio_util::task::TaskTracker; use tracing::instrument::Instrumented; +use crate::cancellation::CancellationHandlerMain; +use crate::config::ProxyConfig; use crate::context::RequestMonitoring; use crate::protocol2::{ProxyProtocolAccept, WithClientIp, WithConnectionGuard}; use crate::rate_limiter::EndpointRateLimiter; use crate::serverless::backend::PoolingBackend; -use crate::{cancellation::CancellationHandler, config::ProxyConfig}; use hyper::{ server::conn::{AddrIncoming, AddrStream}, Body, Method, Request, Response, @@ -47,7 +48,7 @@ pub async fn task_main( ws_listener: TcpListener, cancellation_token: CancellationToken, endpoint_rate_limiter: Arc, - cancellation_handler: Arc, + cancellation_handler: Arc, ) -> anyhow::Result<()> { scopeguard::defer! { info!("websocket server has shut down"); @@ -237,7 +238,7 @@ async fn request_handler( config: &'static ProxyConfig, backend: Arc, ws_connections: TaskTracker, - cancellation_handler: Arc, + cancellation_handler: Arc, peer_addr: IpAddr, endpoint_rate_limiter: Arc, // used to cancel in-flight HTTP requests. 
not used to cancel websockets diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index a72ede6d0a..ada6c974f4 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -1,5 +1,5 @@ use crate::{ - cancellation::CancellationHandler, + cancellation::CancellationHandlerMain, config::ProxyConfig, context::RequestMonitoring, error::{io_error, ReportableError}, @@ -134,7 +134,7 @@ pub async fn serve_websocket( config: &'static ProxyConfig, mut ctx: RequestMonitoring, websocket: HyperWebsocket, - cancellation_handler: Arc, + cancellation_handler: Arc, hostname: Option, endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 152c452dd4..7b8228a082 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -19,8 +19,7 @@ aws-runtime = { version = "1", default-features = false, features = ["event-stre aws-sigv4 = { version = "1", features = ["http0-compat", "sign-eventstream", "sigv4a"] } aws-smithy-async = { version = "1", default-features = false, features = ["rt-tokio"] } aws-smithy-http = { version = "0.60", default-features = false, features = ["event-stream"] } -aws-smithy-runtime-api = { version = "1", features = ["client", "http-02x", "http-auth"] } -aws-smithy-types = { version = "1", default-features = false, features = ["byte-stream-poll-next", "http-body-0-4-x", "rt-tokio"] } +aws-smithy-types = { version = "1", default-features = false, features = ["byte-stream-poll-next", "http-body-0-4-x", "rt-tokio", "test-util"] } axum = { version = "0.6", features = ["ws"] } base64 = { version = "0.21", features = ["alloc"] } base64ct = { version = "1", default-features = false, features = ["std"] } From 62b318c928f365827039022e900bd6c80928792e Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 22 Mar 2024 10:10:28 +0000 Subject: [PATCH 0456/1571] Fix ephemeral file warning on secondaries (#7201) A test was added which exercises secondary locations more, and there was a location in the secondary downloader that warned on ephemeral files. 
This was intended to be fixed in this faulty commit: https://github.com/neondatabase/neon/pull/7169/commits/8cea866adf15c3086dc16e5fa62f59d5604fdf1e --- pageserver/src/tenant/secondary/downloader.rs | 4 ++-- test_runner/regress/test_pageserver_metric_collection.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 40f19e3b05..8782a9f04e 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -11,11 +11,11 @@ use crate::{ disk_usage_eviction_task::{ finite_f32, DiskUsageEvictionInfo, EvictionCandidate, EvictionLayer, EvictionSecondaryLayer, }, - is_temporary, metrics::SECONDARY_MODE, tenant::{ config::SecondaryLocationConfig, debug_assert_current_span_has_tenant_and_timeline_id, + ephemeral_file::is_ephemeral_file, remote_timeline_client::{ index::LayerFileMetadata, is_temp_download_file, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, @@ -964,7 +964,7 @@ async fn init_timeline_state( continue; } else if crate::is_temporary(&file_path) || is_temp_download_file(&file_path) - || is_temporary(&file_path) + || is_ephemeral_file(file_name) { // Temporary files are frequently left behind from restarting during downloads tracing::info!("Cleaning up temporary file {file_path}"); diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py index 042961baa5..5799d11190 100644 --- a/test_runner/regress/test_pageserver_metric_collection.py +++ b/test_runner/regress/test_pageserver_metric_collection.py @@ -70,6 +70,7 @@ def test_metric_collection( # we have a fast rate of calculation, these can happen at shutdown ".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*", ".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes", + ".*metrics_collection: failed to upload to S3: Failed to upload data of length .* to storage path.*", ] ) From 77f3a30440aba4845da3a5203a2764fed4d96648 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 22 Mar 2024 13:31:10 +0000 Subject: [PATCH 0457/1571] proxy: unit tests for auth_quirks (#7199) ## Problem I noticed code coverage for auth_quirks was pretty bare ## Summary of changes Adds 3 happy path unit tests for auth_quirks * scram * cleartext (websockets) * cleartext (password hack) --- Cargo.lock | 1 + Cargo.toml | 1 + proxy/Cargo.toml | 1 + proxy/src/auth/backend.rs | 225 +++++++++++++++++++++++++++++ proxy/src/compute.rs | 11 +- proxy/src/console.rs | 2 +- proxy/src/console/provider.rs | 5 +- proxy/src/console/provider/mock.rs | 2 - proxy/src/console/provider/neon.rs | 2 - proxy/src/scram/exchange.rs | 28 ++-- proxy/src/scram/key.rs | 16 +- proxy/src/scram/messages.rs | 22 +++ proxy/src/scram/secret.rs | 7 + 13 files changed, 285 insertions(+), 38 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index dcf1c49924..6409c79ef9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4237,6 +4237,7 @@ dependencies = [ "consumption_metrics", "dashmap", "env_logger", + "fallible-iterator", "futures", "git-version", "hashbrown 0.13.2", diff --git a/Cargo.toml b/Cargo.toml index 2741bd046b..4dda63ff58 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -79,6 +79,7 @@ either = "1.8" enum-map = "2.4.2" enumset = "1.0.12" fail = "0.5.0" +fallible-iterator = "0.2" fs2 = "0.4.3" futures = "0.3" futures-core = "0.3" diff 
--git a/proxy/Cargo.toml b/proxy/Cargo.toml index 3566d8b728..57a2736d5b 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -97,6 +97,7 @@ workspace_hack.workspace = true [dev-dependencies] camino-tempfile.workspace = true +fallible-iterator.workspace = true rcgen.workspace = true rstest.workspace = true tokio-postgres-rustls.workspace = true diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index bc307230dd..04fe83d8eb 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -408,3 +408,228 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> { } } } + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use bytes::BytesMut; + use fallible_iterator::FallibleIterator; + use postgres_protocol::{ + authentication::sasl::{ChannelBinding, ScramSha256}, + message::{backend::Message as PgMessage, frontend}, + }; + use provider::AuthSecret; + use tokio::io::{AsyncRead, AsyncReadExt, AsyncWriteExt}; + + use crate::{ + auth::{ComputeUserInfoMaybeEndpoint, IpPattern}, + config::AuthenticationConfig, + console::{ + self, + provider::{self, CachedAllowedIps, CachedRoleSecret}, + CachedNodeInfo, + }, + context::RequestMonitoring, + proxy::NeonOptions, + scram::ServerSecret, + stream::{PqStream, Stream}, + }; + + use super::auth_quirks; + + struct Auth { + ips: Vec, + secret: AuthSecret, + } + + impl console::Api for Auth { + async fn get_role_secret( + &self, + _ctx: &mut RequestMonitoring, + _user_info: &super::ComputeUserInfo, + ) -> Result { + Ok(CachedRoleSecret::new_uncached(Some(self.secret.clone()))) + } + + async fn get_allowed_ips_and_secret( + &self, + _ctx: &mut RequestMonitoring, + _user_info: &super::ComputeUserInfo, + ) -> Result<(CachedAllowedIps, Option), console::errors::GetAuthInfoError> + { + Ok(( + CachedAllowedIps::new_uncached(Arc::new(self.ips.clone())), + Some(CachedRoleSecret::new_uncached(Some(self.secret.clone()))), + )) + } + + async fn wake_compute( + &self, + _ctx: &mut RequestMonitoring, + _user_info: &super::ComputeUserInfo, + ) -> Result { + unimplemented!() + } + } + + static CONFIG: &AuthenticationConfig = &AuthenticationConfig { + scram_protocol_timeout: std::time::Duration::from_secs(5), + }; + + async fn read_message(r: &mut (impl AsyncRead + Unpin), b: &mut BytesMut) -> PgMessage { + loop { + r.read_buf(&mut *b).await.unwrap(); + if let Some(m) = PgMessage::parse(&mut *b).unwrap() { + break m; + } + } + } + + #[tokio::test] + async fn auth_quirks_scram() { + let (mut client, server) = tokio::io::duplex(1024); + let mut stream = PqStream::new(Stream::from_raw(server)); + + let mut ctx = RequestMonitoring::test(); + let api = Auth { + ips: vec![], + secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), + }; + + let user_info = ComputeUserInfoMaybeEndpoint { + user: "conrad".into(), + endpoint_id: Some("endpoint".into()), + options: NeonOptions::default(), + }; + + let handle = tokio::spawn(async move { + let mut scram = ScramSha256::new(b"my-secret-password", ChannelBinding::unsupported()); + + let mut read = BytesMut::new(); + + // server should offer scram + match read_message(&mut client, &mut read).await { + PgMessage::AuthenticationSasl(a) => { + let options: Vec<&str> = a.mechanisms().collect().unwrap(); + assert_eq!(options, ["SCRAM-SHA-256"]); + } + _ => panic!("wrong message"), + } + + // client sends client-first-message + let mut write = BytesMut::new(); + frontend::sasl_initial_response("SCRAM-SHA-256", scram.message(), &mut write).unwrap(); + 
client.write_all(&write).await.unwrap(); + + // server response with server-first-message + match read_message(&mut client, &mut read).await { + PgMessage::AuthenticationSaslContinue(a) => { + scram.update(a.data()).await.unwrap(); + } + _ => panic!("wrong message"), + } + + // client response with client-final-message + write.clear(); + frontend::sasl_response(scram.message(), &mut write).unwrap(); + client.write_all(&write).await.unwrap(); + + // server response with server-final-message + match read_message(&mut client, &mut read).await { + PgMessage::AuthenticationSaslFinal(a) => { + scram.finish(a.data()).unwrap(); + } + _ => panic!("wrong message"), + } + }); + + let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, false, CONFIG) + .await + .unwrap(); + + handle.await.unwrap(); + } + + #[tokio::test] + async fn auth_quirks_cleartext() { + let (mut client, server) = tokio::io::duplex(1024); + let mut stream = PqStream::new(Stream::from_raw(server)); + + let mut ctx = RequestMonitoring::test(); + let api = Auth { + ips: vec![], + secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), + }; + + let user_info = ComputeUserInfoMaybeEndpoint { + user: "conrad".into(), + endpoint_id: Some("endpoint".into()), + options: NeonOptions::default(), + }; + + let handle = tokio::spawn(async move { + let mut read = BytesMut::new(); + let mut write = BytesMut::new(); + + // server should offer cleartext + match read_message(&mut client, &mut read).await { + PgMessage::AuthenticationCleartextPassword => {} + _ => panic!("wrong message"), + } + + // client responds with password + write.clear(); + frontend::password_message(b"my-secret-password", &mut write).unwrap(); + client.write_all(&write).await.unwrap(); + }); + + let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, CONFIG) + .await + .unwrap(); + + handle.await.unwrap(); + } + + #[tokio::test] + async fn auth_quirks_password_hack() { + let (mut client, server) = tokio::io::duplex(1024); + let mut stream = PqStream::new(Stream::from_raw(server)); + + let mut ctx = RequestMonitoring::test(); + let api = Auth { + ips: vec![], + secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), + }; + + let user_info = ComputeUserInfoMaybeEndpoint { + user: "conrad".into(), + endpoint_id: None, + options: NeonOptions::default(), + }; + + let handle = tokio::spawn(async move { + let mut read = BytesMut::new(); + + // server should offer cleartext + match read_message(&mut client, &mut read).await { + PgMessage::AuthenticationCleartextPassword => {} + _ => panic!("wrong message"), + } + + // client responds with password + let mut write = BytesMut::new(); + frontend::password_message(b"endpoint=my-endpoint;my-secret-password", &mut write) + .unwrap(); + client.write_all(&write).await.unwrap(); + }); + + let creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, CONFIG) + .await + .unwrap(); + + assert_eq!(creds.info.endpoint, "my-endpoint"); + + handle.await.unwrap(); + } +} diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index b61c1fb9ef..65153babcb 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -82,14 +82,13 @@ pub type ScramKeys = tokio_postgres::config::ScramKeys<32>; /// A config for establishing a connection to compute node. /// Eventually, `tokio_postgres` will be replaced with something better. /// Newtype allows us to implement methods on top of it. 
-#[derive(Clone)] -#[repr(transparent)] +#[derive(Clone, Default)] pub struct ConnCfg(Box); /// Creation and initialization routines. impl ConnCfg { pub fn new() -> Self { - Self(Default::default()) + Self::default() } /// Reuse password or auth keys from the other config. @@ -165,12 +164,6 @@ impl std::ops::DerefMut for ConnCfg { } } -impl Default for ConnCfg { - fn default() -> Self { - Self::new() - } -} - impl ConnCfg { /// Establish a raw TCP connection to the compute node. async fn connect_raw(&self, timeout: Duration) -> io::Result<(SocketAddr, TcpStream, &str)> { diff --git a/proxy/src/console.rs b/proxy/src/console.rs index fd3c46b946..ea95e83437 100644 --- a/proxy/src/console.rs +++ b/proxy/src/console.rs @@ -6,7 +6,7 @@ pub mod messages; /// Wrappers for console APIs and their mocks. pub mod provider; -pub use provider::{errors, Api, AuthSecret, CachedNodeInfo, NodeInfo}; +pub(crate) use provider::{errors, Api, AuthSecret, CachedNodeInfo, NodeInfo}; /// Various cache-related types. pub mod caches { diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 8609606273..69bfd6b045 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -14,7 +14,6 @@ use crate::{ context::RequestMonitoring, scram, EndpointCacheKey, ProjectId, }; -use async_trait::async_trait; use dashmap::DashMap; use std::{sync::Arc, time::Duration}; use tokio::sync::{OwnedSemaphorePermit, Semaphore}; @@ -326,8 +325,7 @@ pub type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc), } -#[async_trait] impl Api for ConsoleBackend { async fn get_role_secret( &self, diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs index 0579ef6fc4..b759c81373 100644 --- a/proxy/src/console/provider/mock.rs +++ b/proxy/src/console/provider/mock.rs @@ -8,7 +8,6 @@ use crate::console::provider::{CachedAllowedIps, CachedRoleSecret}; use crate::context::RequestMonitoring; use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl}; use crate::{auth::IpPattern, cache::Cached}; -use async_trait::async_trait; use futures::TryFutureExt; use std::{str::FromStr, sync::Arc}; use thiserror::Error; @@ -144,7 +143,6 @@ async fn get_execute_postgres_query( Ok(Some(entry)) } -#[async_trait] impl super::Api for Api { #[tracing::instrument(skip_all)] async fn get_role_secret( diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index b36663518d..89ebfa57f1 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -14,7 +14,6 @@ use crate::{ context::RequestMonitoring, metrics::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER}, }; -use async_trait::async_trait; use futures::TryFutureExt; use std::sync::Arc; use tokio::time::Instant; @@ -168,7 +167,6 @@ impl Api { } } -#[async_trait] impl super::Api for Api { #[tracing::instrument(skip_all)] async fn get_role_secret( diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index 682cbe795f..89dd33e59f 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -3,9 +3,7 @@ use std::convert::Infallible; use hmac::{Hmac, Mac}; -use sha2::digest::FixedOutput; -use sha2::{Digest, Sha256}; -use subtle::{Choice, ConstantTimeEq}; +use sha2::Sha256; use tokio::task::yield_now; use super::messages::{ @@ -13,6 +11,7 @@ use super::messages::{ }; use super::secret::ServerSecret; use super::signature::SignatureBuilder; +use super::ScramKey; use crate::config; use crate::sasl::{self, ChannelBinding, 
Error as SaslError}; @@ -104,7 +103,7 @@ async fn pbkdf2(str: &[u8], salt: &[u8], iterations: u32) -> [u8; 32] { } // copied from -async fn derive_keys(password: &[u8], salt: &[u8], iterations: u32) -> ([u8; 32], [u8; 32]) { +async fn derive_client_key(password: &[u8], salt: &[u8], iterations: u32) -> ScramKey { let salted_password = pbkdf2(password, salt, iterations).await; let make_key = |name| { @@ -116,7 +115,7 @@ async fn derive_keys(password: &[u8], salt: &[u8], iterations: u32) -> ([u8; 32] <[u8; 32]>::from(key.into_bytes()) }; - (make_key(b"Client Key"), make_key(b"Server Key")) + make_key(b"Client Key").into() } pub async fn exchange( @@ -124,21 +123,12 @@ pub async fn exchange( password: &[u8], ) -> sasl::Result> { let salt = base64::decode(&secret.salt_base64)?; - let (client_key, server_key) = derive_keys(password, &salt, secret.iterations).await; - let stored_key: [u8; 32] = Sha256::default() - .chain_update(client_key) - .finalize_fixed() - .into(); + let client_key = derive_client_key(password, &salt, secret.iterations).await; - // constant time to not leak partial key match - let valid = stored_key.ct_eq(&secret.stored_key.as_bytes()) - | server_key.ct_eq(&secret.server_key.as_bytes()) - | Choice::from(secret.doomed as u8); - - if valid.into() { - Ok(sasl::Outcome::Success(super::ScramKey::from(client_key))) - } else { + if secret.is_password_invalid(&client_key).into() { Ok(sasl::Outcome::Failure("password doesn't match")) + } else { + Ok(sasl::Outcome::Success(client_key)) } } @@ -220,7 +210,7 @@ impl SaslSentInner { .derive_client_key(&client_final_message.proof); // Auth fails either if keys don't match or it's pre-determined to fail. - if client_key.sha256() != secret.stored_key || secret.doomed { + if secret.is_password_invalid(&client_key).into() { return Ok(sasl::Step::Failure("password doesn't match")); } diff --git a/proxy/src/scram/key.rs b/proxy/src/scram/key.rs index 973126e729..32a3dbd203 100644 --- a/proxy/src/scram/key.rs +++ b/proxy/src/scram/key.rs @@ -1,17 +1,31 @@ //! Tools for client/server/stored key management. +use subtle::ConstantTimeEq; + /// Faithfully taken from PostgreSQL. pub const SCRAM_KEY_LEN: usize = 32; /// One of the keys derived from the user's password. /// We use the same structure for all keys, i.e. /// `ClientKey`, `StoredKey`, and `ServerKey`. 
-#[derive(Clone, Default, PartialEq, Eq, Debug)] +#[derive(Clone, Default, Eq, Debug)] #[repr(transparent)] pub struct ScramKey { bytes: [u8; SCRAM_KEY_LEN], } +impl PartialEq for ScramKey { + fn eq(&self, other: &Self) -> bool { + self.ct_eq(other).into() + } +} + +impl ConstantTimeEq for ScramKey { + fn ct_eq(&self, other: &Self) -> subtle::Choice { + self.bytes.ct_eq(&other.bytes) + } +} + impl ScramKey { pub fn sha256(&self) -> Self { super::sha256([self.as_ref()]).into() diff --git a/proxy/src/scram/messages.rs b/proxy/src/scram/messages.rs index b59baec508..f9372540ca 100644 --- a/proxy/src/scram/messages.rs +++ b/proxy/src/scram/messages.rs @@ -206,6 +206,28 @@ mod tests { } } + #[test] + fn parse_client_first_message_with_invalid_gs2_authz() { + assert!(ClientFirstMessage::parse("n,authzid,n=user,r=nonce").is_none()) + } + + #[test] + fn parse_client_first_message_with_extra_params() { + let msg = ClientFirstMessage::parse("n,,n=user,r=nonce,a=foo,b=bar,c=baz").unwrap(); + assert_eq!(msg.bare, "n=user,r=nonce,a=foo,b=bar,c=baz"); + assert_eq!(msg.username, "user"); + assert_eq!(msg.nonce, "nonce"); + assert_eq!(msg.cbind_flag, ChannelBinding::NotSupportedClient); + } + + #[test] + fn parse_client_first_message_with_extra_params_invalid() { + // must be of the form `=<...>` + assert!(ClientFirstMessage::parse("n,,n=user,r=nonce,abc=foo").is_none()); + assert!(ClientFirstMessage::parse("n,,n=user,r=nonce,1=foo").is_none()); + assert!(ClientFirstMessage::parse("n,,n=user,r=nonce,a").is_none()); + } + #[test] fn parse_client_final_message() { let input = [ diff --git a/proxy/src/scram/secret.rs b/proxy/src/scram/secret.rs index b46d8c3ab5..f3414cb8ec 100644 --- a/proxy/src/scram/secret.rs +++ b/proxy/src/scram/secret.rs @@ -1,5 +1,7 @@ //! Tools for SCRAM server secret management. +use subtle::{Choice, ConstantTimeEq}; + use super::base64_decode_array; use super::key::ScramKey; @@ -40,6 +42,11 @@ impl ServerSecret { Some(secret) } + pub fn is_password_invalid(&self, client_key: &ScramKey) -> Choice { + // constant time to not leak partial key match + client_key.sha256().ct_ne(&self.stored_key) | Choice::from(self.doomed as u8) + } + /// To avoid revealing information to an attacker, we use a /// mocked server secret even if the user doesn't exist. /// See `auth-scram.c : mock_scram_secret` for details. 
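A minimal, self-contained sketch of the constant-time check pattern this commit introduces in `proxy/src/scram/{key,secret}.rs`, assuming the `subtle` crate. The `Key` and `Secret` types below are illustrative stand-ins, not the proxy's real `ScramKey`/`ServerSecret`, and the real code hashes the client key with SHA-256 before comparing it to the stored key.

```rust
// Sketch only: shows how `subtle` is used to avoid leaking, via timing,
// which part of the password check failed.
use subtle::{Choice, ConstantTimeEq};

struct Key([u8; 32]);

impl ConstantTimeEq for Key {
    fn ct_eq(&self, other: &Self) -> Choice {
        // Compares every byte; no early exit on the first mismatch.
        self.0.ct_eq(&other.0)
    }
}

struct Secret {
    stored_key: Key,
    // True when the secret is a mock for a non-existent user.
    doomed: bool,
}

impl Secret {
    fn is_password_invalid(&self, client_key: &Key) -> Choice {
        // Bitwise OR on `Choice` (instead of a short-circuiting `||`)
        // keeps timing independent of which condition caused the failure.
        client_key.ct_ne(&self.stored_key) | Choice::from(self.doomed as u8)
    }
}

fn main() {
    let secret = Secret { stored_key: Key([7u8; 32]), doomed: false };
    assert!(!bool::from(secret.is_password_invalid(&Key([7u8; 32]))));
    assert!(bool::from(secret.is_password_invalid(&Key([0u8; 32]))));
}
```

The same reasoning explains why the patch replaces `ScramKey`'s derived `PartialEq` with one built on `ConstantTimeEq`: any equality check on key material goes through the non-short-circuiting path.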
From 2668a1dfabf703520b46726d73b4e924f9c9a5cd Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 22 Mar 2024 14:42:10 +0000 Subject: [PATCH 0458/1571] CI: deploy release version to a preprod region (#6811) ## Problem We want to deploy releases to a preprod region first to perform required checks ## Summary of changes - Deploy `release-XXX` / `release-proxy-YYY` docker tags to a preprod region --- .github/workflows/build_and_test.yml | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 2bcda7cc8e..d27713f083 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1121,10 +1121,16 @@ jobs: run: | if [[ "$GITHUB_REF_NAME" == "main" ]]; then gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false - - # TODO: move deployPreprodRegion to release (`"$GITHUB_REF_NAME" == "release"` block), once Staging support different compute tag prefixes for different regions - gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \ + -f deployPgSniRouter=false \ + -f deployProxy=false \ + -f deployStorage=true \ + -f deployStorageBroker=true \ + -f branch=main \ + -f dockerTag=${{needs.tag.outputs.build-tag}} \ + -f deployPreprodRegion=true + gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \ -f deployPgSniRouter=false \ -f deployProxy=false \ @@ -1133,6 +1139,15 @@ jobs: -f branch=main \ -f dockerTag=${{needs.tag.outputs.build-tag}} elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then + gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \ + -f deployPgSniRouter=true \ + -f deployProxy=true \ + -f deployStorage=false \ + -f deployStorageBroker=false \ + -f branch=main \ + -f dockerTag=${{needs.tag.outputs.build-tag}} \ + -f deployPreprodRegion=true + gh workflow --repo neondatabase/aws run deploy-proxy-prod.yml --ref main \ -f deployPgSniRouter=true \ -f deployProxy=true \ From 1787cf19e3f6fa67edbeb40faa9f0287e864db07 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 22 Mar 2024 14:52:14 +0000 Subject: [PATCH 0459/1571] pageserver: write consumption metrics to S3 (#7200) ## Problem The service that receives consumption metrics has lower availability than S3. Writing metrics to S3 improves their availability. Closes: https://github.com/neondatabase/cloud/issues/9824 ## Summary of changes - The same data as consumption metrics POST bodies is also compressed and written to an S3 object with a timestamp-formatted path. 
- Set `metric_collection_bucket` (same format as `remote_storage` config) to configure the location to write to --- pageserver/src/bin/pageserver.rs | 1 + pageserver/src/config.rs | 18 ++++++ pageserver/src/consumption_metrics.rs | 28 ++++++++- pageserver/src/consumption_metrics/upload.rs | 62 ++++++++++++++++++- .../test_pageserver_metric_collection.py | 28 ++++++++- 5 files changed, 131 insertions(+), 6 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 1fd7c775d5..ef616c0a39 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -615,6 +615,7 @@ fn start_pageserver( pageserver::consumption_metrics::collect_metrics( metric_collection_endpoint, + &conf.metric_collection_bucket, conf.metric_collection_interval, conf.cached_metric_collection_interval, conf.synthetic_size_calculation_interval, diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 8ad9ade4a9..a29719e36f 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -234,6 +234,7 @@ pub struct PageServerConf { // How often to send unchanged cached metrics to the metrics endpoint. pub cached_metric_collection_interval: Duration, pub metric_collection_endpoint: Option, + pub metric_collection_bucket: Option, pub synthetic_size_calculation_interval: Duration, pub disk_usage_based_eviction: Option, @@ -373,6 +374,7 @@ struct PageServerConfigBuilder { cached_metric_collection_interval: BuilderValue, metric_collection_endpoint: BuilderValue>, synthetic_size_calculation_interval: BuilderValue, + metric_collection_bucket: BuilderValue>, disk_usage_based_eviction: BuilderValue>, @@ -455,6 +457,8 @@ impl PageServerConfigBuilder { .expect("cannot parse default synthetic size calculation interval")), metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT), + metric_collection_bucket: Set(None), + disk_usage_based_eviction: Set(None), test_remote_failures: Set(0), @@ -586,6 +590,13 @@ impl PageServerConfigBuilder { self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint) } + pub fn metric_collection_bucket( + &mut self, + metric_collection_bucket: Option, + ) { + self.metric_collection_bucket = BuilderValue::Set(metric_collection_bucket) + } + pub fn synthetic_size_calculation_interval( &mut self, synthetic_size_calculation_interval: Duration, @@ -694,6 +705,7 @@ impl PageServerConfigBuilder { metric_collection_interval, cached_metric_collection_interval, metric_collection_endpoint, + metric_collection_bucket, synthetic_size_calculation_interval, disk_usage_based_eviction, test_remote_failures, @@ -942,6 +954,9 @@ impl PageServerConf { let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?; builder.metric_collection_endpoint(Some(endpoint)); }, + "metric_collection_bucket" => { + builder.metric_collection_bucket(RemoteStorageConfig::from_toml(item)?) 
+ } "synthetic_size_calculation_interval" => builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?), "test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?), @@ -1057,6 +1072,7 @@ impl PageServerConf { metric_collection_interval: Duration::from_secs(60), cached_metric_collection_interval: Duration::from_secs(60 * 60), metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, + metric_collection_bucket: None, synthetic_size_calculation_interval: Duration::from_secs(60), disk_usage_based_eviction: None, test_remote_failures: 0, @@ -1289,6 +1305,7 @@ background_task_maximum_delay = '334 s' defaults::DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL )?, metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, + metric_collection_bucket: None, synthetic_size_calculation_interval: humantime::parse_duration( defaults::DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL )?, @@ -1363,6 +1380,7 @@ background_task_maximum_delay = '334 s' metric_collection_interval: Duration::from_secs(222), cached_metric_collection_interval: Duration::from_secs(22200), metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?), + metric_collection_bucket: None, synthetic_size_calculation_interval: Duration::from_secs(333), disk_usage_based_eviction: None, test_remote_failures: 0, diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index c7f9d596c6..3429e3a0a6 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -7,6 +7,7 @@ use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError, Tena use camino::Utf8PathBuf; use consumption_metrics::EventType; use pageserver_api::models::TenantState; +use remote_storage::{GenericRemoteStorage, RemoteStorageConfig}; use reqwest::Url; use std::collections::HashMap; use std::sync::Arc; @@ -41,6 +42,7 @@ type Cache = HashMap; #[allow(clippy::too_many_arguments)] pub async fn collect_metrics( metric_collection_endpoint: &Url, + metric_collection_bucket: &Option, metric_collection_interval: Duration, _cached_metric_collection_interval: Duration, synthetic_size_calculation_interval: Duration, @@ -94,6 +96,20 @@ pub async fn collect_metrics( .build() .expect("Failed to create http client with timeout"); + let bucket_client = if let Some(bucket_config) = metric_collection_bucket { + match GenericRemoteStorage::from_config(bucket_config) { + Ok(client) => Some(client), + Err(e) => { + // Non-fatal error: if we were given an invalid config, we will proceed + // with sending metrics over the network, but not to S3. 
+ tracing::warn!("Invalid configuration for metric_collection_bucket: {e}"); + None + } + } + } else { + None + }; + let node_id = node_id.to_string(); loop { @@ -118,10 +134,18 @@ pub async fn collect_metrics( tracing::error!("failed to persist metrics to {path:?}: {e:#}"); } } + + if let Some(bucket_client) = &bucket_client { + let res = + upload::upload_metrics_bucket(bucket_client, &cancel, &node_id, &metrics).await; + if let Err(e) = res { + tracing::error!("failed to upload to S3: {e:#}"); + } + } }; let upload = async { - let res = upload::upload_metrics( + let res = upload::upload_metrics_http( &client, metric_collection_endpoint, &cancel, @@ -132,7 +156,7 @@ pub async fn collect_metrics( .await; if let Err(e) = res { // serialization error which should never happen - tracing::error!("failed to upload due to {e:#}"); + tracing::error!("failed to upload via HTTP due to {e:#}"); } }; diff --git a/pageserver/src/consumption_metrics/upload.rs b/pageserver/src/consumption_metrics/upload.rs index 6b840a3136..4e8283c3e4 100644 --- a/pageserver/src/consumption_metrics/upload.rs +++ b/pageserver/src/consumption_metrics/upload.rs @@ -1,4 +1,9 @@ +use std::time::SystemTime; + +use chrono::{DateTime, Utc}; use consumption_metrics::{Event, EventChunk, IdempotencyKey, CHUNK_SIZE}; +use remote_storage::{GenericRemoteStorage, RemotePath}; +use tokio::io::AsyncWriteExt; use tokio_util::sync::CancellationToken; use tracing::Instrument; @@ -13,8 +18,9 @@ struct Ids { pub(super) timeline_id: Option, } +/// Serialize and write metrics to an HTTP endpoint #[tracing::instrument(skip_all, fields(metrics_total = %metrics.len()))] -pub(super) async fn upload_metrics( +pub(super) async fn upload_metrics_http( client: &reqwest::Client, metric_collection_endpoint: &reqwest::Url, cancel: &CancellationToken, @@ -74,6 +80,60 @@ pub(super) async fn upload_metrics( Ok(()) } +/// Serialize and write metrics to a remote storage object +#[tracing::instrument(skip_all, fields(metrics_total = %metrics.len()))] +pub(super) async fn upload_metrics_bucket( + client: &GenericRemoteStorage, + cancel: &CancellationToken, + node_id: &str, + metrics: &[RawMetric], +) -> anyhow::Result<()> { + if metrics.is_empty() { + // Skip uploads if we have no metrics, so that readers don't have to handle the edge case + // of an empty object. 
+ return Ok(()); + } + + // Compose object path + let datetime: DateTime = SystemTime::now().into(); + let ts_prefix = datetime.format("year=%Y/month=%m/day=%d/%H:%M:%SZ"); + let path = RemotePath::from_string(&format!("{ts_prefix}_{node_id}.ndjson.gz"))?; + + // Set up a gzip writer into a buffer + let mut compressed_bytes: Vec = Vec::new(); + let compressed_writer = std::io::Cursor::new(&mut compressed_bytes); + let mut gzip_writer = async_compression::tokio::write::GzipEncoder::new(compressed_writer); + + // Serialize and write into compressed buffer + let started_at = std::time::Instant::now(); + for res in serialize_in_chunks(CHUNK_SIZE, metrics, node_id) { + let (_chunk, body) = res?; + gzip_writer.write_all(&body).await?; + } + gzip_writer.flush().await?; + gzip_writer.shutdown().await?; + let compressed_length = compressed_bytes.len(); + + // Write to remote storage + client + .upload_storage_object( + futures::stream::once(futures::future::ready(Ok(compressed_bytes.into()))), + compressed_length, + &path, + cancel, + ) + .await?; + let elapsed = started_at.elapsed(); + + tracing::info!( + compressed_length, + elapsed_ms = elapsed.as_millis(), + "write metrics bucket at {path}", + ); + + Ok(()) +} + // The return type is quite ugly, but we gain testability in isolation fn serialize_in_chunks<'a, F>( chunk_size: usize, diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py index 5799d11190..c34ef46d07 100644 --- a/test_runner/regress/test_pageserver_metric_collection.py +++ b/test_runner/regress/test_pageserver_metric_collection.py @@ -1,4 +1,6 @@ +import gzip import json +import os import time from dataclasses import dataclass from pathlib import Path @@ -10,7 +12,11 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, wait_for_last_flush_lsn, ) -from fixtures.remote_storage import RemoteStorageKind +from fixtures.remote_storage import ( + LocalFsStorage, + RemoteStorageKind, + remote_storage_to_toml_inline_table, +) from fixtures.types import TenantId, TimelineId from pytest_httpserver import HTTPServer from werkzeug.wrappers.request import Request @@ -40,6 +46,9 @@ def test_metric_collection( uploads.put((events, is_last == "true")) return Response(status=200) + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) + assert neon_env_builder.pageserver_remote_storage is not None + # Require collecting metrics frequently, since we change # the timeline and want something to be logged about it. # @@ -48,12 +57,11 @@ def test_metric_collection( neon_env_builder.pageserver_config_override = f""" metric_collection_interval="1s" metric_collection_endpoint="{metric_collection_endpoint}" + metric_collection_bucket={remote_storage_to_toml_inline_table(neon_env_builder.pageserver_remote_storage)} cached_metric_collection_interval="0s" synthetic_size_calculation_interval="3s" """ - neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - log.info(f"test_metric_collection endpoint is {metric_collection_endpoint}") # mock http server that returns OK for the metrics @@ -167,6 +175,20 @@ def test_metric_collection( httpserver.check() + # Check that at least one bucket output object is present, and that all + # can be decompressed and decoded. 
+ bucket_dumps = {} + assert isinstance(env.pageserver_remote_storage, LocalFsStorage) + for dirpath, _dirs, files in os.walk(env.pageserver_remote_storage.root): + for file in files: + file_path = os.path.join(dirpath, file) + log.info(file_path) + if file.endswith(".gz"): + bucket_dumps[file_path] = json.load(gzip.open(file_path)) + + assert len(bucket_dumps) >= 1 + assert all("events" in data for data in bucket_dumps.values()) + def test_metric_collection_cleans_up_tempfile( httpserver: HTTPServer, From 35f4c04c9b3ec6f0850d3835a0364439b9907f3f Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 22 Mar 2024 19:14:31 +0200 Subject: [PATCH 0460/1571] Remove Get/SetZenithCurrentClusterSize from Postgres core (#7196) ## Problem See https://neondb.slack.com/archives/C04DGM6SMTM/p1711003752072899 ## Summary of changes Move keeping of cluster size to neon extension --------- Co-authored-by: Konstantin Knizhnik --- pgxn/neon/neon.c | 2 +- pgxn/neon/neon.h | 2 ++ pgxn/neon/pagestore_smgr.c | 4 ++-- pgxn/neon/walproposer.h | 1 + pgxn/neon/walproposer_pg.c | 15 ++++++++++++++- .../tests/walproposer_sim/walproposer_api.rs | 1 + vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 6 +++--- 10 files changed, 27 insertions(+), 10 deletions(-) diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 6ede78a576..8d236144b5 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -312,7 +312,7 @@ pg_cluster_size(PG_FUNCTION_ARGS) { int64 size; - size = GetZenithCurrentClusterSize(); + size = GetNeonCurrentClusterSize(); if (size == 0) PG_RETURN_NULL(); diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h index a0f8c97497..5c653fc6c6 100644 --- a/pgxn/neon/neon.h +++ b/pgxn/neon/neon.h @@ -26,6 +26,8 @@ extern void pg_init_libpagestore(void); extern void pg_init_walproposer(void); extern uint64 BackpressureThrottlingTime(void); +extern void SetNeonCurrentClusterSize(uint64 size); +extern uint64 GetNeonCurrentClusterSize(void); extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn); extern void PGDLLEXPORT WalProposerSync(int argc, char *argv[]); diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 0256de2b9a..2d222e3c7c 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -1831,7 +1831,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT && !IsAutoVacuumWorkerProcess()) { - uint64 current_size = GetZenithCurrentClusterSize(); + uint64 current_size = GetNeonCurrentClusterSize(); if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024) ereport(ERROR, @@ -1912,7 +1912,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum, reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT && !IsAutoVacuumWorkerProcess()) { - uint64 current_size = GetZenithCurrentClusterSize(); + uint64 current_size = GetNeonCurrentClusterSize(); if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024) ereport(ERROR, diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 28585eb4e7..69a557fdf2 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -287,6 +287,7 @@ typedef struct WalproposerShmemState slock_t mutex; term_t mineLastElectedTerm; pg_atomic_uint64 backpressureThrottlingTime; + pg_atomic_uint64 currentClusterSize; /* last feedback from each shard */ PageserverFeedback shard_ps_feedback[MAX_SHARDS]; diff --git a/pgxn/neon/walproposer_pg.c 
b/pgxn/neon/walproposer_pg.c index 002bf4e2ce..7debb6325e 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -282,6 +282,7 @@ WalproposerShmemInit(void) memset(walprop_shared, 0, WalproposerShmemSize()); SpinLockInit(&walprop_shared->mutex); pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0); + pg_atomic_init_u64(&walprop_shared->currentClusterSize, 0); } LWLockRelease(AddinShmemInitLock); @@ -1972,7 +1973,7 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk) /* Only one main shard sends non-zero currentClusterSize */ if (sk->appendResponse.ps_feedback.currentClusterSize > 0) - SetZenithCurrentClusterSize(sk->appendResponse.ps_feedback.currentClusterSize); + SetNeonCurrentClusterSize(sk->appendResponse.ps_feedback.currentClusterSize); if (min_feedback.disk_consistent_lsn != standby_apply_lsn) { @@ -2094,6 +2095,18 @@ GetLogRepRestartLSN(WalProposer *wp) return lrRestartLsn; } +void SetNeonCurrentClusterSize(uint64 size) +{ + pg_atomic_write_u64(&walprop_shared->currentClusterSize, size); +} + +uint64 GetNeonCurrentClusterSize(void) +{ + return pg_atomic_read_u64(&walprop_shared->currentClusterSize); +} +uint64 GetNeonCurrentClusterSize(void); + + static const walproposer_api walprop_pg = { .get_shmem_state = walprop_pg_get_shmem_state, .start_streaming = walprop_pg_start_streaming, diff --git a/safekeeper/tests/walproposer_sim/walproposer_api.rs b/safekeeper/tests/walproposer_sim/walproposer_api.rs index 42340ba1df..c49495a4f3 100644 --- a/safekeeper/tests/walproposer_sim/walproposer_api.rs +++ b/safekeeper/tests/walproposer_sim/walproposer_api.rs @@ -244,6 +244,7 @@ impl SimulationApi { mutex: 0, mineLastElectedTerm: 0, backpressureThrottlingTime: pg_atomic_uint64 { value: 0 }, + currentClusterSize: pg_atomic_uint64 { value: 0 }, shard_ps_feedback: [empty_feedback; 128], num_shards: 0, min_ps_feedback: empty_feedback, diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 3b09894ddb..c5d920a7d9 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 3b09894ddb8825b50c963942059eab1a2a0b0a89 +Subproject commit c5d920a7d9e9cbeb62b6c46f292db08162763f68 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 80cef885ad..af9ab67bc8 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 80cef885add1af6741aa31944c7d2c84d8f9098f +Subproject commit af9ab67bc80afd94e4eb11c34f50c0a29c37eb1b diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 9007894722..111e82c45d 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 90078947229aa7f9ac5f7ed4527b2c7386d5332b +Subproject commit 111e82c45d79728fdd3a4816605378c3cc5cfe84 diff --git a/vendor/revisions.json b/vendor/revisions.json index ae524d70b1..18959f2ef2 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "postgres-v16": "90078947229aa7f9ac5f7ed4527b2c7386d5332b", - "postgres-v15": "80cef885add1af6741aa31944c7d2c84d8f9098f", - "postgres-v14": "3b09894ddb8825b50c963942059eab1a2a0b0a89" + "postgres-v16": "111e82c45d79728fdd3a4816605378c3cc5cfe84", + "postgres-v15": "af9ab67bc80afd94e4eb11c34f50c0a29c37eb1b", + "postgres-v14": "c5d920a7d9e9cbeb62b6c46f292db08162763f68" } From 643683f41a6b25865d516201fddbe03fae537077 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Fri, 22 Mar 2024 21:01:51 -0400 Subject: [PATCH 0461/1571] fixup(#7204 / postgres): revert `IsPrimaryAlive` checks (#7209) Fix #7204. 
https://github.com/neondatabase/postgres/pull/400 https://github.com/neondatabase/postgres/pull/401 https://github.com/neondatabase/postgres/pull/402 These commits never go into prod. Detailed investigation will be posted in another issue. Reverting the commits so that things can keep running in prod. This pull request adds the test to start two replicas. It fails on the current main https://github.com/neondatabase/neon/pull/7210 but passes in this pull request. --------- Signed-off-by: Alex Chi Z --- test_runner/regress/test_hot_standby.py | 18 ++++++++++++++++++ test_runner/regress/test_replication_start.py | 2 ++ vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 6 +++--- 6 files changed, 26 insertions(+), 6 deletions(-) diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index 0497e1965c..ac3315b86f 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -84,3 +84,21 @@ def test_hot_standby(neon_simple_env: NeonEnv): # clean up if slow_down_send: sk_http.configure_failpoints(("sk-send-wal-replica-sleep", "off")) + + +def test_2_replicas_start(neon_simple_env: NeonEnv): + env = neon_simple_env + + with env.endpoints.create_start( + branch_name="main", + endpoint_id="primary", + ) as primary: + time.sleep(1) + with env.endpoints.new_replica_start( + origin=primary, endpoint_id="secondary1" + ) as secondary1: + with env.endpoints.new_replica_start( + origin=primary, endpoint_id="secondary2" + ) as secondary2: + wait_replica_caughtup(primary, secondary1) + wait_replica_caughtup(primary, secondary2) diff --git a/test_runner/regress/test_replication_start.py b/test_runner/regress/test_replication_start.py index b4699c7be8..2360745990 100644 --- a/test_runner/regress/test_replication_start.py +++ b/test_runner/regress/test_replication_start.py @@ -1,7 +1,9 @@ +import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, wait_replica_caughtup +@pytest.mark.xfail def test_replication_start(neon_simple_env: NeonEnv): env = neon_simple_env diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index c5d920a7d9..748643b468 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit c5d920a7d9e9cbeb62b6c46f292db08162763f68 +Subproject commit 748643b4683e9fe3b105011a6ba8a687d032cd65 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index af9ab67bc8..e7651e79c0 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit af9ab67bc80afd94e4eb11c34f50c0a29c37eb1b +Subproject commit e7651e79c0c27fbddc3c724f5b9553222c28e395 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 111e82c45d..3946b2e2ea 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 111e82c45d79728fdd3a4816605378c3cc5cfe84 +Subproject commit 3946b2e2ea71d07af092099cb5bcae76a69b90d6 diff --git a/vendor/revisions.json b/vendor/revisions.json index 18959f2ef2..3c1b866137 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "postgres-v16": "111e82c45d79728fdd3a4816605378c3cc5cfe84", - "postgres-v15": "af9ab67bc80afd94e4eb11c34f50c0a29c37eb1b", - "postgres-v14": "c5d920a7d9e9cbeb62b6c46f292db08162763f68" + "postgres-v16": "3946b2e2ea71d07af092099cb5bcae76a69b90d6", + "postgres-v15": "e7651e79c0c27fbddc3c724f5b9553222c28e395", + "postgres-v14": "748643b4683e9fe3b105011a6ba8a687d032cd65" } From 72103d481d1b27d9ae18e14b83ab4c985c3d42cf 
Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Sat, 23 Mar 2024 06:36:58 +0000 Subject: [PATCH 0462/1571] proxy: fix stack overflow in cancel publisher (#7212) ## Problem stack overflow in blanket impl for `CancellationPublisher` ## Summary of changes Removes `async_trait` and fixes the impl order to make it non-recursive. --- proxy/src/cancellation.rs | 15 ++++++++++++++ proxy/src/redis/cancellation_publisher.rs | 24 +++++++++-------------- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 8054f33b6c..6151513614 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -211,4 +211,19 @@ mod tests { Ok(()) } + + #[tokio::test] + async fn cancel_session_noop_regression() { + let handler = CancellationHandler::<()>::new(Default::default(), "local"); + handler + .cancel_session( + CancelKeyData { + backend_pid: 0, + cancel_key: 0, + }, + Uuid::new_v4(), + ) + .await + .unwrap(); + } } diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs index d9efc3561b..422789813c 100644 --- a/proxy/src/redis/cancellation_publisher.rs +++ b/proxy/src/redis/cancellation_publisher.rs @@ -1,6 +1,5 @@ use std::sync::Arc; -use async_trait::async_trait; use pq_proto::CancelKeyData; use redis::AsyncCommands; use tokio::sync::Mutex; @@ -13,8 +12,8 @@ use super::{ notifications::{CancelSession, Notification, PROXY_CHANNEL_NAME}, }; -#[async_trait] pub trait CancellationPublisherMut: Send + Sync + 'static { + #[allow(async_fn_in_trait)] async fn try_publish( &mut self, cancel_key_data: CancelKeyData, @@ -22,8 +21,8 @@ pub trait CancellationPublisherMut: Send + Sync + 'static { ) -> anyhow::Result<()>; } -#[async_trait] pub trait CancellationPublisher: Send + Sync + 'static { + #[allow(async_fn_in_trait)] async fn try_publish( &self, cancel_key_data: CancelKeyData, @@ -31,10 +30,9 @@ pub trait CancellationPublisher: Send + Sync + 'static { ) -> anyhow::Result<()>; } -#[async_trait] -impl CancellationPublisherMut for () { +impl CancellationPublisher for () { async fn try_publish( - &mut self, + &self, _cancel_key_data: CancelKeyData, _session_id: Uuid, ) -> anyhow::Result<()> { @@ -42,18 +40,16 @@ impl CancellationPublisherMut for () { } } -#[async_trait] -impl CancellationPublisher for P { +impl CancellationPublisherMut for P { async fn try_publish( - &self, - _cancel_key_data: CancelKeyData, - _session_id: Uuid, + &mut self, + cancel_key_data: CancelKeyData, + session_id: Uuid, ) -> anyhow::Result<()> { - self.try_publish(_cancel_key_data, _session_id).await +

<P as CancellationPublisher>::try_publish(self, cancel_key_data, session_id).await } } -#[async_trait] impl<P: CancellationPublisher> CancellationPublisher for Option<P> { async fn try_publish( &self, @@ -68,7 +64,6 @@ impl<P: CancellationPublisher> CancellationPublisher for Option<P>
{ } } -#[async_trait] impl CancellationPublisher for Arc> { async fn try_publish( &self, @@ -145,7 +140,6 @@ impl RedisPublisherClient { } } -#[async_trait] impl CancellationPublisherMut for RedisPublisherClient { async fn try_publish( &mut self, From 3220f830b7fbb785d6db8a93775f46314f10a99b Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Sat, 23 Mar 2024 19:25:11 +0100 Subject: [PATCH 0463/1571] pageserver: use a single tokio runtime (#6555) Before this PR, each core had 3 executor threads from 3 different runtimes. With this PR, we just have one runtime, with one thread per core. Switching to a single tokio runtime should reduce that effective over-commit of CPU and in theory help with tail latencies -- iff all tokio tasks are well-behaved and yield to the runtime regularly. Are All Tasks Well-Behaved? Are We Ready? ----------------------------------------- Sadly there doesn't seem to be good out-of-the box tokio tooling to answer this question. We *believe* all tasks are well behaved in today's code base, as of the switch to `virtual_file_io_engine = "tokio-epoll-uring"` in production (https://github.com/neondatabase/aws/pull/1121). The only remaining executor-thread-blocking code is walredo and some filesystem namespace operations. Filesystem namespace operations work is being tracked in #6663 and not considered likely to actually block at this time. Regarding walredo, it currently does a blocking `poll` for read/write to the pipe file descriptors we use for IPC with the walredo process. There is an ongoing experiment to make walredo async (#6628), but it needs more time because there are surprisingly tricky trade-offs that are articulated in that PR's description (which itself is still WIP). What's relevant for *this* PR is that 1. walredo is always CPU-bound 2. production tail latencies for walredo request-response (`pageserver_wal_redo_seconds_bucket`) are - p90: with few exceptions, low hundreds of micro-seconds - p95: except on very packed pageservers, below 1ms - p99: all below 50ms, vast majority below 1ms - p99.9: almost all around 50ms, rarely at >= 70ms - [Dashboard Link](https://neonprod.grafana.net/d/edgggcrmki3uof/2024-03-walredo-latency?orgId=1&var-ds=ZNX49CDVz&var-pXX_by_instance=0.9&var-pXX_by_instance=0.99&var-pXX_by_instance=0.95&var-adhoc=instance%7C%21%3D%7Cpageserver-30.us-west-2.aws.neon.tech&var-per_instance_pXX_max_seconds=0.0005&from=1711049688777&to=1711136088777) The ones below 1ms are below our current threshold for when we start thinking about yielding to the executor. The tens of milliseconds stalls aren't great, but, not least because of the implicit overcommit of CPU by the three runtimes, we can't be sure whether these tens of milliseconds are inherently necessary to do the walredo work or whether we could be faster if there was less contention for CPU. On the first item (walredo being always CPU-bound work): it means that walredo processes will always compete with the executor threads. We could yield, using async walredo, but then we hit the trade-offs explained in that PR. tl;dr: the risk of stalling executor threads through blocking walredo seems low, and switching to one runtime cleans up one potential source for higher-than-necessary stall times (explained in the previous paragraphs). Code Changes ------------ - Remove the 3 different runtime definitions. - Add a new definition called `THE_RUNTIME`. - Use it in all places that previously used one of the 3 removed runtimes. - Remove the argument from `task_mgr`. 
- Fix failpoint usage where `pausable_failpoint!` should have been used. We encountered some actual failures because of this, e.g., hung `get_metric()` calls during test teardown that would client-timeout after 300s. As indicated by the comment above `THE_RUNTIME`, we could take this clean-up further. But before we create so much churn, let's first validate that there's no perf regression. Performance ----------- We will test this in staging using the various nightly benchmark runs. However, the worst-case impact of this change is likely compaction (=>image layer creation) competing with compute requests. Image layer creation work can't be easily generated & repeated quickly by pagebench. So, we'll simply watch getpage & basebackup tail latencies in staging. Additionally, I have done manual benchmarking using pagebench. Report: https://neondatabase.notion.site/2024-03-23-oneruntime-change-benchmarking-22a399c411e24399a73311115fb703ec?pvs=4 Tail latencies and throughput are marginally better (no regression = good). Except in a workload with 128 clients against one tenant. There, the p99.9 and p99.99 getpage latency is about 2x worse (at slightly lower throughput). A dip in throughput every 20s (compaction_period_ is clearly visible, and probably responsible for that worse tail latency. This has potential to improve with async walredo, and is an edge case workload anyway. Future Work ----------- 1. Once this change has shown satisfying results in production, change the codebase to use the ambient runtime instead of explicitly referencing `THE_RUNTIME`. 2. Have a mode where we run with a single-threaded runtime, so we uncover executor stalls more quickly. 3. Switch or write our own failpoints library that is async-native: https://github.com/neondatabase/neon/issues/7216 --- pageserver/src/bin/pageserver.rs | 82 +++++++++---------- pageserver/src/consumption_metrics.rs | 3 +- pageserver/src/control_plane_client.rs | 4 +- pageserver/src/disk_usage_eviction_task.rs | 3 +- pageserver/src/page_service.rs | 1 - pageserver/src/task_mgr.rs | 37 ++------- pageserver/src/tenant.rs | 2 +- pageserver/src/tenant/delete.rs | 1 - pageserver/src/tenant/mgr.rs | 4 - .../src/tenant/remote_timeline_client.rs | 11 --- pageserver/src/tenant/secondary.rs | 4 +- pageserver/src/tenant/storage_layer/layer.rs | 4 +- pageserver/src/tenant/tasks.rs | 6 +- pageserver/src/tenant/timeline.rs | 6 +- pageserver/src/tenant/timeline/delete.rs | 1 - .../src/tenant/timeline/eviction_task.rs | 3 +- pageserver/src/tenant/timeline/walreceiver.rs | 5 +- .../walreceiver/walreceiver_connection.rs | 18 ++-- test_runner/regress/test_backpressure.py | 2 +- test_runner/regress/test_timeline_size.py | 26 ++++-- 20 files changed, 92 insertions(+), 131 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index ef616c0a39..f4a231f217 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -15,9 +15,9 @@ use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp}; use pageserver::control_plane_client::ControlPlaneClient; use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task}; use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING}; -use pageserver::task_mgr::WALRECEIVER_RUNTIME; use pageserver::tenant::{secondary, TenantSharedResources}; use remote_storage::GenericRemoteStorage; +use tokio::signal::unix::SignalKind; use tokio::time::Instant; use tracing::*; @@ -28,7 +28,7 @@ use pageserver::{ 
deletion_queue::DeletionQueue, http, page_cache, page_service, task_mgr, task_mgr::TaskKind, - task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME}, + task_mgr::THE_RUNTIME, tenant::mgr, virtual_file, }; @@ -323,7 +323,7 @@ fn start_pageserver( // Launch broker client // The storage_broker::connect call needs to happen inside a tokio runtime thread. - let broker_client = WALRECEIVER_RUNTIME + let broker_client = THE_RUNTIME .block_on(async { // Note: we do not attempt connecting here (but validate endpoints sanity). storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval) @@ -391,7 +391,7 @@ fn start_pageserver( conf, ); if let Some(deletion_workers) = deletion_workers { - deletion_workers.spawn_with(BACKGROUND_RUNTIME.handle()); + deletion_workers.spawn_with(THE_RUNTIME.handle()); } // Up to this point no significant I/O has been done: this should have been fast. Record @@ -423,7 +423,7 @@ fn start_pageserver( // Scan the local 'tenants/' directory and start loading the tenants let deletion_queue_client = deletion_queue.new_client(); - let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr( + let tenant_manager = THE_RUNTIME.block_on(mgr::init_tenant_mgr( conf, TenantSharedResources { broker_client: broker_client.clone(), @@ -435,7 +435,7 @@ fn start_pageserver( ))?; let tenant_manager = Arc::new(tenant_manager); - BACKGROUND_RUNTIME.spawn({ + THE_RUNTIME.spawn({ let shutdown_pageserver = shutdown_pageserver.clone(); let drive_init = async move { // NOTE: unlike many futures in pageserver, this one is cancellation-safe @@ -545,7 +545,7 @@ fn start_pageserver( // Start up the service to handle HTTP mgmt API request. We created the // listener earlier already. { - let _rt_guard = MGMT_REQUEST_RUNTIME.enter(); + let _rt_guard = THE_RUNTIME.enter(); let router_state = Arc::new( http::routes::State::new( @@ -569,7 +569,6 @@ fn start_pageserver( .with_graceful_shutdown(task_mgr::shutdown_watcher()); task_mgr::spawn( - MGMT_REQUEST_RUNTIME.handle(), TaskKind::HttpEndpointListener, None, None, @@ -594,7 +593,6 @@ fn start_pageserver( let local_disk_storage = conf.workdir.join("last_consumption_metrics.json"); task_mgr::spawn( - crate::BACKGROUND_RUNTIME.handle(), TaskKind::MetricsCollection, None, None, @@ -643,7 +641,6 @@ fn start_pageserver( DownloadBehavior::Error, ); task_mgr::spawn( - COMPUTE_REQUEST_RUNTIME.handle(), TaskKind::LibpqEndpointListener, None, None, @@ -667,42 +664,37 @@ fn start_pageserver( let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard()); // All started up! Now just sit and wait for shutdown signal. - { - use signal_hook::consts::*; - let signal_handler = BACKGROUND_RUNTIME.spawn_blocking(move || { - let mut signals = - signal_hook::iterator::Signals::new([SIGINT, SIGTERM, SIGQUIT]).unwrap(); - return signals - .forever() - .next() - .expect("forever() never returns None unless explicitly closed"); - }); - let signal = BACKGROUND_RUNTIME - .block_on(signal_handler) - .expect("join error"); - match signal { - SIGQUIT => { - info!("Got signal {signal}. Terminating in immediate shutdown mode",); - std::process::exit(111); - } - SIGINT | SIGTERM => { - info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",); - // This cancels the `shutdown_pageserver` cancellation tree. - // Right now that tree doesn't reach very far, and `task_mgr` is used instead. - // The plan is to change that over time. 
- shutdown_pageserver.take(); - let bg_remote_storage = remote_storage.clone(); - let bg_deletion_queue = deletion_queue.clone(); - BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver( - &tenant_manager, - bg_remote_storage.map(|_| bg_deletion_queue), - 0, - )); - unreachable!() - } - _ => unreachable!(), - } + { + THE_RUNTIME.block_on(async move { + let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt()).unwrap(); + let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate()).unwrap(); + let mut sigquit = tokio::signal::unix::signal(SignalKind::quit()).unwrap(); + let signal = tokio::select! { + _ = sigquit.recv() => { + info!("Got signal SIGQUIT. Terminating in immediate shutdown mode",); + std::process::exit(111); + } + _ = sigint.recv() => { "SIGINT" }, + _ = sigterm.recv() => { "SIGTERM" }, + }; + + info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",); + + // This cancels the `shutdown_pageserver` cancellation tree. + // Right now that tree doesn't reach very far, and `task_mgr` is used instead. + // The plan is to change that over time. + shutdown_pageserver.take(); + let bg_remote_storage = remote_storage.clone(); + let bg_deletion_queue = deletion_queue.clone(); + pageserver::shutdown_pageserver( + &tenant_manager, + bg_remote_storage.map(|_| bg_deletion_queue), + 0, + ) + .await; + unreachable!() + }) } } diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index 3429e3a0a6..c82be8c581 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -1,7 +1,7 @@ //! Periodically collect consumption metrics for all active tenants //! and push them to a HTTP endpoint. use crate::context::{DownloadBehavior, RequestContext}; -use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; +use crate::task_mgr::{self, TaskKind}; use crate::tenant::tasks::BackgroundLoopKind; use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError, Tenant}; use camino::Utf8PathBuf; @@ -61,7 +61,6 @@ pub async fn collect_metrics( let worker_ctx = ctx.detached_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download); task_mgr::spawn( - BACKGROUND_RUNTIME.handle(), TaskKind::CalculateSyntheticSize, None, None, diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs index 42c800822b..55d80c2966 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/control_plane_client.rs @@ -173,8 +173,6 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient { register, }; - fail::fail_point!("control-plane-client-re-attach"); - let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?; tracing::info!( "Received re-attach response with {} tenants", @@ -210,7 +208,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient { .collect(), }; - fail::fail_point!("control-plane-client-validate"); + crate::tenant::pausable_failpoint!("control-plane-client-validate"); let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?; diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 92c1475aef..6b68acd1c7 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -59,7 +59,7 @@ use utils::{completion, id::TimelineId}; use crate::{ config::PageServerConf, metrics::disk_usage_based_eviction::METRICS, - task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, + task_mgr::{self, 
TaskKind}, tenant::{ self, mgr::TenantManager, @@ -202,7 +202,6 @@ pub fn launch_disk_usage_global_eviction_task( info!("launching disk usage based eviction task"); task_mgr::spawn( - BACKGROUND_RUNTIME.handle(), TaskKind::DiskUsageEviction, None, None, diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index f3ceb7d3e6..fa1a0f535b 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -180,7 +180,6 @@ pub async fn libpq_listener_main( // only deal with a particular timeline, but we don't know which one // yet. task_mgr::spawn( - &tokio::runtime::Handle::current(), TaskKind::PageRequestHandler, None, None, diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 69e163effa..2d97389982 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -98,42 +98,22 @@ use utils::id::TimelineId; // other operations, if the upload tasks e.g. get blocked on locks. It shouldn't // happen, but still. // -pub static COMPUTE_REQUEST_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("compute request worker") - .enable_all() - .build() - .expect("Failed to create compute request runtime") -}); -pub static MGMT_REQUEST_RUNTIME: Lazy = Lazy::new(|| { +/// The single tokio runtime used by all pageserver code. +/// In the past, we had multiple runtimes, and in the future we should weed out +/// remaining references to this global field and rely on ambient runtime instead, +/// i.e., use `tokio::spawn` instead of `THE_RUNTIME.spawn()`, etc. +pub static THE_RUNTIME: Lazy = Lazy::new(|| { tokio::runtime::Builder::new_multi_thread() - .thread_name("mgmt request worker") - .enable_all() - .build() - .expect("Failed to create mgmt request runtime") -}); - -pub static WALRECEIVER_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("walreceiver worker") - .enable_all() - .build() - .expect("Failed to create walreceiver runtime") -}); - -pub static BACKGROUND_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("background op worker") // if you change the number of worker threads please change the constant below .enable_all() .build() .expect("Failed to create background op runtime") }); -pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy = Lazy::new(|| { +pub(crate) static THE_RUNTIME_WORKER_THREADS: Lazy = Lazy::new(|| { // force init and thus panics - let _ = BACKGROUND_RUNTIME.handle(); + let _ = THE_RUNTIME.handle(); // replicates tokio-1.28.1::loom::sys::num_cpus which is not available publicly // tokio would had already panicked for parsing errors or NotUnicode // @@ -325,7 +305,6 @@ struct PageServerTask { /// Note: if shutdown_process_on_error is set to true failure /// of the task will lead to shutdown of entire process pub fn spawn( - runtime: &tokio::runtime::Handle, kind: TaskKind, tenant_shard_id: Option, timeline_id: Option, @@ -354,7 +333,7 @@ where let task_name = name.to_string(); let task_cloned = Arc::clone(&task); - let join_handle = runtime.spawn(task_wrapper( + let join_handle = THE_RUNTIME.spawn(task_wrapper( task_name, task_id, task_cloned, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index fe48741a89..7bd85b6fd5 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -144,6 +144,7 @@ macro_rules! 
pausable_failpoint { } }; } +pub(crate) use pausable_failpoint; pub mod blob_io; pub mod block_io; @@ -661,7 +662,6 @@ impl Tenant { let tenant_clone = Arc::clone(&tenant); let ctx = ctx.detached_child(TaskKind::Attach, DownloadBehavior::Warn); task_mgr::spawn( - &tokio::runtime::Handle::current(), TaskKind::Attach, Some(tenant_shard_id), None, diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index 7d37873a67..3866136dbd 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -482,7 +482,6 @@ impl DeleteTenantFlow { let tenant_shard_id = tenant.tenant_shard_id; task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), TaskKind::TimelineDeletionWorker, Some(tenant_shard_id), None, diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 97a505ded9..34ca43a173 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1850,7 +1850,6 @@ impl TenantManager { let task_tenant_id = None; task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), TaskKind::MgmtRequest, task_tenant_id, None, @@ -2816,15 +2815,12 @@ pub(crate) fn immediate_gc( // TODO: spawning is redundant now, need to hold the gate task_mgr::spawn( - &tokio::runtime::Handle::current(), TaskKind::GarbageCollector, Some(tenant_shard_id), Some(timeline_id), &format!("timeline_gc_handler garbage collection run for tenant {tenant_shard_id} timeline {timeline_id}"), false, async move { - fail::fail_point!("immediate_gc_task_pre"); - #[allow(unused_mut)] let mut result = tenant .gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 40be2ca8f3..c0a150eb0d 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -223,7 +223,6 @@ use crate::{ config::PageServerConf, task_mgr, task_mgr::TaskKind, - task_mgr::BACKGROUND_RUNTIME, tenant::metadata::TimelineMetadata, tenant::upload_queue::{ UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask, @@ -307,8 +306,6 @@ pub enum PersistIndexPartWithDeletedFlagError { pub struct RemoteTimelineClient { conf: &'static PageServerConf, - runtime: tokio::runtime::Handle, - tenant_shard_id: TenantShardId, timeline_id: TimelineId, generation: Generation, @@ -341,12 +338,6 @@ impl RemoteTimelineClient { ) -> RemoteTimelineClient { RemoteTimelineClient { conf, - runtime: if cfg!(test) { - // remote_timeline_client.rs tests rely on current-thread runtime - tokio::runtime::Handle::current() - } else { - BACKGROUND_RUNTIME.handle().clone() - }, tenant_shard_id, timeline_id, generation, @@ -1281,7 +1272,6 @@ impl RemoteTimelineClient { let tenant_shard_id = self.tenant_shard_id; let timeline_id = self.timeline_id; task_mgr::spawn( - &self.runtime, TaskKind::RemoteUploadTask, Some(self.tenant_shard_id), Some(self.timeline_id), @@ -1876,7 +1866,6 @@ mod tests { fn build_client(&self, generation: Generation) -> Arc { Arc::new(RemoteTimelineClient { conf: self.harness.conf, - runtime: tokio::runtime::Handle::current(), tenant_shard_id: self.harness.tenant_shard_id, timeline_id: TIMELINE_ID, generation, diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index 19f36c722e..b0babb1308 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -8,7 +8,7 @@ use std::{sync::Arc, time::SystemTime}; use crate::{ config::PageServerConf, 
disk_usage_eviction_task::DiskUsageEvictionInfo, - task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, + task_mgr::{self, TaskKind}, virtual_file::MaybeFatalIo, }; @@ -317,7 +317,6 @@ pub fn spawn_tasks( tokio::sync::mpsc::channel::>(16); task_mgr::spawn( - BACKGROUND_RUNTIME.handle(), TaskKind::SecondaryDownloads, None, None, @@ -338,7 +337,6 @@ pub fn spawn_tasks( ); task_mgr::spawn( - BACKGROUND_RUNTIME.handle(), TaskKind::SecondaryUploads, None, None, diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 8ba37b5a86..e101a40da4 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1447,7 +1447,7 @@ impl LayerInner { #[cfg(test)] tokio::task::spawn(fut); #[cfg(not(test))] - crate::task_mgr::BACKGROUND_RUNTIME.spawn(fut); + crate::task_mgr::THE_RUNTIME.spawn(fut); } /// Needed to use entered runtime in tests, but otherwise use BACKGROUND_RUNTIME. @@ -1458,7 +1458,7 @@ impl LayerInner { #[cfg(test)] tokio::task::spawn_blocking(f); #[cfg(not(test))] - crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(f); + crate::task_mgr::THE_RUNTIME.spawn_blocking(f); } } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index e4f5f75132..db32223a60 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -8,7 +8,7 @@ use std::time::{Duration, Instant}; use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics::TENANT_TASK_EVENTS; use crate::task_mgr; -use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; +use crate::task_mgr::TaskKind; use crate::tenant::throttle::Stats; use crate::tenant::timeline::CompactionError; use crate::tenant::{Tenant, TenantState}; @@ -18,7 +18,7 @@ use utils::{backoff, completion}; static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy = once_cell::sync::Lazy::new(|| { - let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS; + let total_threads = *crate::task_mgr::THE_RUNTIME_WORKER_THREADS; let permits = usize::max( 1, // while a lot of the work is done on spawn_blocking, we still do @@ -85,7 +85,6 @@ pub fn start_background_loops( ) { let tenant_shard_id = tenant.tenant_shard_id; task_mgr::spawn( - BACKGROUND_RUNTIME.handle(), TaskKind::Compaction, Some(tenant_shard_id), None, @@ -109,7 +108,6 @@ pub fn start_background_loops( }, ); task_mgr::spawn( - BACKGROUND_RUNTIME.handle(), TaskKind::GarbageCollector, Some(tenant_shard_id), None, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 7523130f23..289dee75ab 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1723,7 +1723,6 @@ impl Timeline { initdb_optimization_count: 0, }; task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::LayerFlushTask, Some(self.tenant_shard_id), Some(self.timeline_id), @@ -2086,7 +2085,6 @@ impl Timeline { DownloadBehavior::Download, ); task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::InitialLogicalSizeCalculation, Some(self.tenant_shard_id), Some(self.timeline_id), @@ -2264,7 +2262,6 @@ impl Timeline { DownloadBehavior::Download, ); task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::OndemandLogicalSizeCalculation, Some(self.tenant_shard_id), Some(self.timeline_id), @@ -3840,7 +3837,7 @@ impl Timeline { }; let timer = self.metrics.garbage_collect_histo.start_timer(); - fail_point!("before-timeline-gc"); + pausable_failpoint!("before-timeline-gc"); // Is 
the timeline being deleted? if self.is_stopping() { @@ -4151,7 +4148,6 @@ impl Timeline { let self_clone = Arc::clone(&self); let task_id = task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::DownloadAllRemoteLayers, Some(self.tenant_shard_id), Some(self.timeline_id), diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index a0c9d99196..d2272fc75f 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -443,7 +443,6 @@ impl DeleteTimelineFlow { let timeline_id = timeline.timeline_id; task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), TaskKind::TimelineDeletionWorker, Some(tenant_shard_id), Some(timeline_id), diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index dd769d4121..f84a4b0dac 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -28,7 +28,7 @@ use tracing::{debug, error, info, info_span, instrument, warn, Instrument}; use crate::{ context::{DownloadBehavior, RequestContext}, pgdatadir_mapping::CollectKeySpaceError, - task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, + task_mgr::{self, TaskKind}, tenant::{ tasks::BackgroundLoopKind, timeline::EvictionError, LogicalSizeCalculationCause, Tenant, }, @@ -56,7 +56,6 @@ impl Timeline { let self_clone = Arc::clone(self); let background_tasks_can_start = background_tasks_can_start.cloned(); task_mgr::spawn( - BACKGROUND_RUNTIME.handle(), TaskKind::Eviction, Some(self.tenant_shard_id), Some(self.timeline_id), diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index 2fab6722b8..3592dda8d7 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -24,7 +24,7 @@ mod connection_manager; mod walreceiver_connection; use crate::context::{DownloadBehavior, RequestContext}; -use crate::task_mgr::{self, TaskKind, WALRECEIVER_RUNTIME}; +use crate::task_mgr::{self, TaskKind}; use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::timeline::walreceiver::connection_manager::{ connection_manager_loop_step, ConnectionManagerState, @@ -82,7 +82,6 @@ impl WalReceiver { let loop_status = Arc::new(std::sync::RwLock::new(None)); let manager_status = Arc::clone(&loop_status); task_mgr::spawn( - WALRECEIVER_RUNTIME.handle(), TaskKind::WalReceiverManager, Some(timeline.tenant_shard_id), Some(timeline_id), @@ -181,7 +180,7 @@ impl TaskHandle { let (events_sender, events_receiver) = watch::channel(TaskStateUpdate::Started); let cancellation_clone = cancellation.clone(); - let join_handle = WALRECEIVER_RUNTIME.spawn(async move { + let join_handle = tokio::spawn(async move { events_sender.send(TaskStateUpdate::Started).ok(); task(events_sender, cancellation_clone).await // events_sender is dropped at some point during the .await above. 
diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index d9f780cfd1..cf87cc6ce0 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -11,7 +11,6 @@ use std::{ use anyhow::{anyhow, Context}; use bytes::BytesMut; use chrono::{NaiveDateTime, Utc}; -use fail::fail_point; use futures::StreamExt; use postgres::{error::SqlState, SimpleQueryMessage, SimpleQueryRow}; use postgres_ffi::WAL_SEGMENT_SIZE; @@ -27,9 +26,7 @@ use super::TaskStateUpdate; use crate::{ context::RequestContext, metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST}, - task_mgr, - task_mgr::TaskKind, - task_mgr::WALRECEIVER_RUNTIME, + task_mgr::{self, TaskKind}, tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo}, walingest::WalIngest, walrecord::DecodedWALRecord, @@ -163,7 +160,6 @@ pub(super) async fn handle_walreceiver_connection( ); let connection_cancellation = cancellation.clone(); task_mgr::spawn( - WALRECEIVER_RUNTIME.handle(), TaskKind::WalReceiverConnectionPoller, Some(timeline.tenant_shard_id), Some(timeline.timeline_id), @@ -329,7 +325,17 @@ pub(super) async fn handle_walreceiver_connection( filtered_records += 1; } - fail_point!("walreceiver-after-ingest"); + // don't simply use pausable_failpoint here because its spawn_blocking slows + // slows down the tests too much. + fail::fail_point!("walreceiver-after-ingest-blocking"); + if let Err(()) = (|| { + fail::fail_point!("walreceiver-after-ingest-pause-activate", |_| { + Err(()) + }); + Ok(()) + })() { + pausable_failpoint!("walreceiver-after-ingest-pause"); + } last_rec_lsn = lsn; diff --git a/test_runner/regress/test_backpressure.py b/test_runner/regress/test_backpressure.py index 819912dd05..af17a2e89d 100644 --- a/test_runner/regress/test_backpressure.py +++ b/test_runner/regress/test_backpressure.py @@ -116,7 +116,7 @@ def test_backpressure_received_lsn_lag(neon_env_builder: NeonEnvBuilder): # Configure failpoint to slow down walreceiver ingest with closing(env.pageserver.connect()) as psconn: with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur: - pscur.execute("failpoints walreceiver-after-ingest=sleep(20)") + pscur.execute("failpoints walreceiver-after-ingest-blocking=sleep(20)") # FIXME # Wait for the check thread to start diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 628c484fbd..efd257900d 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -931,7 +931,7 @@ def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder): env.pageserver.stop() env.pageserver.start( extra_env_vars={ - "FAILPOINTS": "initial-size-calculation-permit-pause=pause;walreceiver-after-ingest=pause" + "FAILPOINTS": "initial-size-calculation-permit-pause=pause;walreceiver-after-ingest-pause-activate=return(1);walreceiver-after-ingest-pause=pause" } ) @@ -953,7 +953,11 @@ def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder): assert details["current_logical_size_is_accurate"] is True client.configure_failpoints( - [("initial-size-calculation-permit-pause", "off"), ("walreceiver-after-ingest", "off")] + [ + ("initial-size-calculation-permit-pause", "off"), + ("walreceiver-after-ingest-pause-activate", "off"), + ("walreceiver-after-ingest-pause", 
"off"), + ] ) @@ -983,7 +987,7 @@ def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder): # pause at logical size calculation, also pause before walreceiver can give feedback so it will give priority to logical size calculation env.pageserver.start( extra_env_vars={ - "FAILPOINTS": "timeline-calculate-logical-size-pause=pause;walreceiver-after-ingest=pause" + "FAILPOINTS": "timeline-calculate-logical-size-pause=pause;walreceiver-after-ingest-pause-activate=return(1);walreceiver-after-ingest-pause=pause" } ) @@ -1029,7 +1033,11 @@ def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder): other_is_attaching() client.configure_failpoints( - [("timeline-calculate-logical-size-pause", "off"), ("walreceiver-after-ingest", "off")] + [ + ("timeline-calculate-logical-size-pause", "off"), + ("walreceiver-after-ingest-pause-activate", "off"), + ("walreceiver-after-ingest-pause", "off"), + ] ) @@ -1059,7 +1067,7 @@ def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_met # pause at logical size calculation, also pause before walreceiver can give feedback so it will give priority to logical size calculation env.pageserver.start( extra_env_vars={ - "FAILPOINTS": "timeline-calculate-logical-size-pause=pause;walreceiver-after-ingest=pause" + "FAILPOINTS": "timeline-calculate-logical-size-pause=pause;walreceiver-after-ingest-pause-activate=return(1);walreceiver-after-ingest-pause=pause" } ) @@ -1111,3 +1119,11 @@ def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_met delete_lazy_activating(lazy_tenant, env.pageserver, expect_attaching=True) else: raise RuntimeError(activation_method) + + client.configure_failpoints( + [ + ("timeline-calculate-logical-size-pause", "off"), + ("walreceiver-after-ingest-pause-activate", "off"), + ("walreceiver-after-ingest-pause", "off"), + ] + ) From 3a4ebfb95dd19a499b574c812f6c8cd4adebb172 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Mon, 25 Mar 2024 09:38:12 +0000 Subject: [PATCH 0464/1571] test: fix `test_pageserver_recovery` flakyness (#7207) ## Problem We recently introduced log file validation for the storage controller. The heartbeater will WARN when it fails for a node, hence the test fails. Closes https://github.com/neondatabase/neon/issues/7159 ## Summary of changes * Warn only once for each set of heartbeat retries * Allow list heartbeat warns --- control_plane/attachment_service/src/heartbeater.rs | 2 +- test_runner/regress/test_recovery.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/control_plane/attachment_service/src/heartbeater.rs b/control_plane/attachment_service/src/heartbeater.rs index e15de28920..7669680eb6 100644 --- a/control_plane/attachment_service/src/heartbeater.rs +++ b/control_plane/attachment_service/src/heartbeater.rs @@ -139,7 +139,7 @@ impl HeartbeaterTask { .with_client_retries( |client| async move { client.get_utilization().await }, &jwt_token, - 2, + 3, 3, Duration::from_secs(1), &cancel, diff --git a/test_runner/regress/test_recovery.py b/test_runner/regress/test_recovery.py index 6aac1e1d84..ab5c8be256 100644 --- a/test_runner/regress/test_recovery.py +++ b/test_runner/regress/test_recovery.py @@ -15,6 +15,13 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() env.pageserver.is_testing_enabled_or_skip() + # We expect the pageserver to exit, which will cause storage storage controller + # requests to fail and warn. 
+ env.storage_controller.allowed_errors.append(".*management API still failed.*") + env.storage_controller.allowed_errors.append( + ".*Reconcile error.*error sending request for url.*" + ) + # Create a branch for us env.neon_cli.create_branch("test_pageserver_recovery", "main") From 0099dfa56b1b24519b4948fe8705006c79b484a7 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 25 Mar 2024 11:52:33 +0000 Subject: [PATCH 0465/1571] storage controller: tighten up secrets handling (#7105) - Remove code for using AWS secrets manager, as we're deploying with k8s->env vars instead - Load each secret independently, so that one can mix CLI args with environment variables, rather than requiring that all secrets are loaded with the same mechanism. - Add a 'strict mode', enabled by default, which will refuse to start if secrets are not loaded. This avoids the risk of accidentially disabling auth by omitting the public key, for example --- Cargo.lock | 24 --- Cargo.toml | 1 - control_plane/attachment_service/Cargo.toml | 1 - control_plane/attachment_service/src/main.rs | 180 ++++++++---------- control_plane/src/storage_controller.rs | 1 + .../fixtures/pageserver/allowed_errors.py | 2 + 6 files changed, 81 insertions(+), 128 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6409c79ef9..45b802c54f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -276,7 +276,6 @@ version = "0.1.0" dependencies = [ "anyhow", "aws-config", - "aws-sdk-secretsmanager", "bytes", "camino", "clap", @@ -433,29 +432,6 @@ dependencies = [ "url", ] -[[package]] -name = "aws-sdk-secretsmanager" -version = "1.14.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a0b64e61e7d632d9df90a2e0f32630c68c24960cab1d27d848718180af883d3" -dependencies = [ - "aws-credential-types", - "aws-runtime", - "aws-smithy-async", - "aws-smithy-http", - "aws-smithy-json", - "aws-smithy-runtime", - "aws-smithy-runtime-api", - "aws-smithy-types", - "aws-types", - "bytes", - "fastrand 2.0.0", - "http 0.2.9", - "once_cell", - "regex-lite", - "tracing", -] - [[package]] name = "aws-sdk-sso" version = "1.12.0" diff --git a/Cargo.toml b/Cargo.toml index 4dda63ff58..309ebbe119 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -52,7 +52,6 @@ async-stream = "0.3" async-trait = "0.1" aws-config = { version = "1.1.4", default-features = false, features=["rustls"] } aws-sdk-s3 = "1.14" -aws-sdk-secretsmanager = { version = "1.14.0" } aws-sdk-iam = "1.15.0" aws-smithy-async = { version = "1.1.4", default-features = false, features=["rt-tokio"] } aws-smithy-types = "1.1.4" diff --git a/control_plane/attachment_service/Cargo.toml b/control_plane/attachment_service/Cargo.toml index 34882659e3..0201e0ed86 100644 --- a/control_plane/attachment_service/Cargo.toml +++ b/control_plane/attachment_service/Cargo.toml @@ -16,7 +16,6 @@ testing = [] [dependencies] anyhow.workspace = true aws-config.workspace = true -aws-sdk-secretsmanager.workspace = true bytes.workspace = true camino.workspace = true clap.workspace = true diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs index 0a925a63f6..bd8d7f5c59 100644 --- a/control_plane/attachment_service/src/main.rs +++ b/control_plane/attachment_service/src/main.rs @@ -3,7 +3,6 @@ use attachment_service::http::make_router; use attachment_service::metrics::preinitialize_metrics; use attachment_service::persistence::Persistence; use attachment_service::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT}; -use aws_config::{BehaviorVersion, Region}; use 
camino::Utf8PathBuf; use clap::Parser; use diesel::Connection; @@ -55,11 +54,31 @@ struct Cli { #[arg(long)] database_url: Option, + /// Flag to enable dev mode, which permits running without auth + #[arg(long, default_value = "false")] + dev: bool, + /// Grace period before marking unresponsive pageserver offline #[arg(long)] max_unavailable_interval: Option, } +enum StrictMode { + /// In strict mode, we will require that all secrets are loaded, i.e. security features + /// may not be implicitly turned off by omitting secrets in the environment. + Strict, + /// In dev mode, secrets are optional, and omitting a particular secret will implicitly + /// disable the auth related to it (e.g. no pageserver jwt key -> send unauthenticated + /// requests, no public key -> don't authenticate incoming requests). + Dev, +} + +impl Default for StrictMode { + fn default() -> Self { + Self::Strict + } +} + /// Secrets may either be provided on the command line (for testing), or loaded from AWS SecretManager: this /// type encapsulates the logic to decide which and do the loading. struct Secrets { @@ -70,13 +89,6 @@ struct Secrets { } impl Secrets { - const DATABASE_URL_SECRET: &'static str = "rds-neon-storage-controller-url"; - const PAGESERVER_JWT_TOKEN_SECRET: &'static str = - "neon-storage-controller-pageserver-jwt-token"; - const CONTROL_PLANE_JWT_TOKEN_SECRET: &'static str = - "neon-storage-controller-control-plane-jwt-token"; - const PUBLIC_KEY_SECRET: &'static str = "neon-storage-controller-public-key"; - const DATABASE_URL_ENV: &'static str = "DATABASE_URL"; const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN"; const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN"; @@ -87,111 +99,41 @@ impl Secrets { /// - Environment variables if DATABASE_URL is set. /// - AWS Secrets Manager secrets async fn load(args: &Cli) -> anyhow::Result { - match &args.database_url { - Some(url) => Self::load_cli(url, args), - None => match std::env::var(Self::DATABASE_URL_ENV) { - Ok(database_url) => Self::load_env(database_url), - Err(_) => Self::load_aws_sm().await, - }, - } - } - - fn load_env(database_url: String) -> anyhow::Result { - let public_key = match std::env::var(Self::PUBLIC_KEY_ENV) { - Ok(public_key) => Some(JwtAuth::from_key(public_key).context("Loading public key")?), - Err(_) => None, - }; - Ok(Self { - database_url, - public_key, - jwt_token: std::env::var(Self::PAGESERVER_JWT_TOKEN_ENV).ok(), - control_plane_jwt_token: std::env::var(Self::CONTROL_PLANE_JWT_TOKEN_ENV).ok(), - }) - } - - async fn load_aws_sm() -> anyhow::Result { - let Ok(region) = std::env::var("AWS_REGION") else { - anyhow::bail!("AWS_REGION is not set, cannot load secrets automatically: either set this, or use CLI args to supply secrets"); - }; - let config = aws_config::defaults(BehaviorVersion::v2023_11_09()) - .region(Region::new(region.clone())) - .load() - .await; - - let asm = aws_sdk_secretsmanager::Client::new(&config); - - let Some(database_url) = asm - .get_secret_value() - .secret_id(Self::DATABASE_URL_SECRET) - .send() - .await? - .secret_string() - .map(str::to_string) + let Some(database_url) = + Self::load_secret(&args.database_url, Self::DATABASE_URL_ENV).await else { anyhow::bail!( - "Database URL secret not found at {region}/{}", - Self::DATABASE_URL_SECRET + "Database URL is not set (set `--database-url`, or `DATABASE_URL` environment)" ) }; - let jwt_token = asm - .get_secret_value() - .secret_id(Self::PAGESERVER_JWT_TOKEN_SECRET) - .send() - .await? 
- .secret_string() - .map(str::to_string); - if jwt_token.is_none() { - tracing::warn!("No pageserver JWT token set: this will only work if authentication is disabled on the pageserver"); - } - - let control_plane_jwt_token = asm - .get_secret_value() - .secret_id(Self::CONTROL_PLANE_JWT_TOKEN_SECRET) - .send() - .await? - .secret_string() - .map(str::to_string); - if jwt_token.is_none() { - tracing::warn!("No control plane JWT token set: this will only work if authentication is disabled on the pageserver"); - } - - let public_key = asm - .get_secret_value() - .secret_id(Self::PUBLIC_KEY_SECRET) - .send() - .await? - .secret_string() - .map(str::to_string); - let public_key = match public_key { - Some(key) => Some(JwtAuth::from_key(key)?), - None => { - tracing::warn!( - "No public key set: inccoming HTTP requests will not be authenticated" - ); - None - } + let public_key = match Self::load_secret(&args.public_key, Self::PUBLIC_KEY_ENV).await { + Some(v) => Some(JwtAuth::from_key(v).context("Loading public key")?), + None => None, }; - Ok(Self { + let this = Self { database_url, public_key, - jwt_token, - control_plane_jwt_token, - }) + jwt_token: Self::load_secret(&args.jwt_token, Self::PAGESERVER_JWT_TOKEN_ENV).await, + control_plane_jwt_token: Self::load_secret( + &args.control_plane_jwt_token, + Self::CONTROL_PLANE_JWT_TOKEN_ENV, + ) + .await, + }; + + Ok(this) } - fn load_cli(database_url: &str, args: &Cli) -> anyhow::Result { - let public_key = match &args.public_key { - None => None, - Some(key) => Some(JwtAuth::from_key(key.clone()).context("Loading public key")?), - }; - Ok(Self { - database_url: database_url.to_owned(), - public_key, - jwt_token: args.jwt_token.clone(), - control_plane_jwt_token: args.control_plane_jwt_token.clone(), - }) + async fn load_secret(cli: &Option, env_name: &str) -> Option { + if let Some(v) = cli { + Some(v.clone()) + } else if let Ok(v) = std::env::var(env_name) { + Some(v) + } else { + None + } } } @@ -247,8 +189,42 @@ async fn async_main() -> anyhow::Result<()> { args.listen ); + let strict_mode = if args.dev { + StrictMode::Dev + } else { + StrictMode::Strict + }; + let secrets = Secrets::load(&args).await?; + // Validate required secrets and arguments are provided in strict mode + match strict_mode { + StrictMode::Strict + if (secrets.public_key.is_none() + || secrets.jwt_token.is_none() + || secrets.control_plane_jwt_token.is_none()) => + { + // Production systems should always have secrets configured: if public_key was not set + // then we would implicitly disable auth. + anyhow::bail!( + "Insecure config! One or more secrets is not set. This is only permitted in `--dev` mode" + ); + } + StrictMode::Strict if args.compute_hook_url.is_none() => { + // Production systems should always have a compute hook set, to prevent falling + // back to trying to use neon_local. 
+ anyhow::bail!( + "`--compute-hook-url` is not set: this is only permitted in `--dev` mode" + ); + } + StrictMode::Strict => { + tracing::info!("Starting in strict mode: configuration is OK.") + } + StrictMode::Dev => { + tracing::warn!("Starting in dev mode: this may be an insecure configuration.") + } + } + let config = Config { jwt_token: secrets.jwt_token, control_plane_jwt_token: secrets.control_plane_jwt_token, diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index e7697ecac8..7f2b973391 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -279,6 +279,7 @@ impl StorageController { &self.listen, "-p", self.path.as_ref(), + "--dev", "--database-url", &database_url, "--max-unavailable-interval", diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index ec0f81b380..d7f682dad3 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -96,6 +96,8 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [ ".*Call to node.*management API.*failed.*ReceiveBody.*", # Many tests will start up with a node offline ".*startup_reconcile: Could not scan node.*", + # Tests run in dev mode + ".*Starting in dev mode.*", ] From adb05262628a1d3259617a066eeb555d3075e4d2 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 25 Mar 2024 11:52:50 +0000 Subject: [PATCH 0466/1571] pageserver: track total ephemeral layer bytes (#7182) ## Problem Large quantities of ephemeral layer data can lead to excessive memory consumption (https://github.com/neondatabase/neon/issues/6939). We currently don't have a way to know how much ephemeral layer data is present on a pageserver. Before we can add new behaviors to proactively roll layers in response to too much ephemeral data, we must calculate that total. Related: https://github.com/neondatabase/neon/issues/6916 ## Summary of changes - Create GlobalResources and GlobalResourceUnits types, where timelines carry a GlobalResourceUnits in their TimelineWriterState. - Periodically update the size in GlobalResourceUnits: - During tick() - During layer roll - During put() if the latest value has drifted more than 10MB since our last update - Expose the value of the global ephemeral layer bytes counter as a prometheus metric. - Extend the lifetime of TimelineWriterState: - Instead of dropping it in TimelineWriter::drop, let it remain. - Drop TimelineWriterState in roll_layer: this drops our guard on the global byte count to reflect the fact that we're freezing the layer. - Ensure the validity of the later in the writer state by clearing the state in the same place we freeze layers, and asserting on the write-ability of the layer in `writer()` - Add a 'context' parameter to `get_open_layer_action` so that it can skip the prev_lsn==lsn check when called in tick() -- this is needed because now tick is called with a populated state, where prev_lsn==Some(lsn) is true for an idle timeline. 
- Extend layer rolling test to use this metric --- pageserver/src/metrics.rs | 8 ++ .../tenant/storage_layer/inmemory_layer.rs | 111 ++++++++++++++++++ pageserver/src/tenant/timeline.rs | 3 + test_runner/fixtures/pageserver/utils.py | 4 +- .../test_pageserver_small_inmemory_layers.py | 45 ++++++- 5 files changed, 162 insertions(+), 9 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 075bb76a1b..6de284ee8b 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -699,6 +699,14 @@ pub static STARTUP_IS_LOADING: Lazy = Lazy::new(|| { .expect("Failed to register pageserver_startup_is_loading") }); +pub(crate) static TIMELINE_EPHEMERAL_BYTES: Lazy = Lazy::new(|| { + register_uint_gauge!( + "pageserver_timeline_ephemeral_bytes", + "Total number of bytes in ephemeral layers, summed for all timelines. Approximate, lazily updated." + ) + .expect("Failed to register metric") +}); + /// Metrics related to the lifecycle of a [`crate::tenant::Tenant`] object: things /// like how long it took to load. /// diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 5f1db21d49..869d175d8d 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -23,8 +23,12 @@ use tracing::*; use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap}; // avoid binding to Write (conflicts with std::io::Write) // while being able to use std::fmt::Write's methods +use crate::metrics::TIMELINE_EPHEMERAL_BYTES; +use std::cmp::Ordering; use std::fmt::Write as _; use std::ops::Range; +use std::sync::atomic::Ordering as AtomicOrdering; +use std::sync::atomic::{AtomicU64, AtomicUsize}; use tokio::sync::{RwLock, RwLockWriteGuard}; use super::{ @@ -70,6 +74,8 @@ pub struct InMemoryLayerInner { /// Each serialized Value is preceded by a 'u32' length field. /// PerSeg::page_versions map stores offsets into this file. file: EphemeralFile, + + resource_units: GlobalResourceUnits, } impl std::fmt::Debug for InMemoryLayerInner { @@ -78,6 +84,101 @@ impl std::fmt::Debug for InMemoryLayerInner { } } +/// State shared by all in-memory (ephemeral) layers. Updated infrequently during background ticks in Timeline, +/// to minimize contention. +/// +/// This global state is used to implement behaviors that require a global view of the system, e.g. +/// rolling layers proactively to limit the total amount of dirty data. +struct GlobalResources { + // How many bytes are in all EphemeralFile objects + dirty_bytes: AtomicU64, + // How many layers are contributing to dirty_bytes + dirty_layers: AtomicUsize, +} + +// Per-timeline RAII struct for its contribution to [`GlobalResources`] +struct GlobalResourceUnits { + // How many dirty bytes have I added to the global dirty_bytes: this guard object is responsible + // for decrementing the global counter by this many bytes when dropped. + dirty_bytes: u64, +} + +impl GlobalResourceUnits { + // Hint for the layer append path to update us when the layer size differs from the last + // call to update_size by this much. If we don't reach this threshold, we'll still get + // updated when the Timeline "ticks" in the background. 
+ const MAX_SIZE_DRIFT: u64 = 10 * 1024 * 1024; + + fn new() -> Self { + GLOBAL_RESOURCES + .dirty_layers + .fetch_add(1, AtomicOrdering::Relaxed); + Self { dirty_bytes: 0 } + } + + /// Do not call this frequently: all timelines will write to these same global atomics, + /// so this is a relatively expensive operation. Wait at least a few seconds between calls. + fn publish_size(&mut self, size: u64) { + let new_global_dirty_bytes = match size.cmp(&self.dirty_bytes) { + Ordering::Equal => { + return; + } + Ordering::Greater => { + let delta = size - self.dirty_bytes; + let old = GLOBAL_RESOURCES + .dirty_bytes + .fetch_add(delta, AtomicOrdering::Relaxed); + old + delta + } + Ordering::Less => { + let delta = self.dirty_bytes - size; + let old = GLOBAL_RESOURCES + .dirty_bytes + .fetch_sub(delta, AtomicOrdering::Relaxed); + old - delta + } + }; + + // This is a sloppy update: concurrent updates to the counter will race, and the exact + // value of the metric might not be the exact latest value of GLOBAL_RESOURCES::dirty_bytes. + // That's okay: as long as the metric contains some recent value, it doesn't have to always + // be literally the last update. + TIMELINE_EPHEMERAL_BYTES.set(new_global_dirty_bytes); + + self.dirty_bytes = size; + } + + // Call publish_size if the input size differs from last published size by more than + // the drift limit + fn maybe_publish_size(&mut self, size: u64) { + let publish = match size.cmp(&self.dirty_bytes) { + Ordering::Equal => false, + Ordering::Greater => size - self.dirty_bytes > Self::MAX_SIZE_DRIFT, + Ordering::Less => self.dirty_bytes - size > Self::MAX_SIZE_DRIFT, + }; + + if publish { + self.publish_size(size); + } + } +} + +impl Drop for GlobalResourceUnits { + fn drop(&mut self) { + GLOBAL_RESOURCES + .dirty_layers + .fetch_sub(1, AtomicOrdering::Relaxed); + + // Subtract our contribution to the global total dirty bytes + self.publish_size(0); + } +} + +static GLOBAL_RESOURCES: GlobalResources = GlobalResources { + dirty_bytes: AtomicU64::new(0), + dirty_layers: AtomicUsize::new(0), +}; + impl InMemoryLayer { pub(crate) fn get_timeline_id(&self) -> TimelineId { self.timeline_id @@ -328,6 +429,7 @@ impl InMemoryLayer { inner: RwLock::new(InMemoryLayerInner { index: HashMap::new(), file, + resource_units: GlobalResourceUnits::new(), }), }) } @@ -378,9 +480,18 @@ impl InMemoryLayer { warn!("Key {} at {} already exists", key, lsn); } + let size = locked_inner.file.len(); + locked_inner.resource_units.maybe_publish_size(size); + Ok(()) } + pub(crate) async fn tick(&self) { + let mut inner = self.inner.write().await; + let size = inner.file.len(); + inner.resource_units.publish_size(size); + } + pub(crate) async fn put_tombstones(&self, _key_ranges: &[(Range, Lsn)]) -> Result<()> { // TODO: Currently, we just leak the storage for any deleted keys Ok(()) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 289dee75ab..15ffa72aaa 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4465,6 +4465,9 @@ impl<'a> TimelineWriter<'a> { let action = self.get_open_layer_action(last_record_lsn, 0); if action == OpenLayerAction::Roll { self.roll_layer(last_record_lsn).await?; + } else if let Some(writer_state) = &mut *self.write_guard { + // Periodic update of statistics + writer_state.open_layer.tick().await; } Ok(()) diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index cf64c86821..693771dd3d 100644 --- 
a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -62,9 +62,7 @@ def wait_for_upload( ) time.sleep(1) raise Exception( - "timed out while waiting for remote_consistent_lsn to reach {}, was {}".format( - lsn, current_lsn - ) + f"timed out while waiting for {tenant}/{timeline} remote_consistent_lsn to reach {lsn}, was {current_lsn}" ) diff --git a/test_runner/regress/test_pageserver_small_inmemory_layers.py b/test_runner/regress/test_pageserver_small_inmemory_layers.py index 5d55020e3c..714d1c1229 100644 --- a/test_runner/regress/test_pageserver_small_inmemory_layers.py +++ b/test_runner/regress/test_pageserver_small_inmemory_layers.py @@ -1,5 +1,4 @@ import asyncio -import time from typing import Tuple import pytest @@ -10,7 +9,7 @@ from fixtures.neon_fixtures import ( tenant_get_shards, ) from fixtures.pageserver.http import PageserverHttpClient -from fixtures.pageserver.utils import wait_for_last_record_lsn +from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import wait_until @@ -61,6 +60,15 @@ def wait_until_pageserver_is_caught_up( assert waited >= last_flush_lsn +def wait_until_pageserver_has_uploaded( + env: NeonEnv, last_flush_lsns: list[Tuple[TenantId, TimelineId, Lsn]] +): + for tenant, timeline, last_flush_lsn in last_flush_lsns: + shards = tenant_get_shards(env, tenant) + for tenant_shard_id, pageserver in shards: + wait_for_upload(pageserver.http_client(), tenant_shard_id, timeline, last_flush_lsn) + + def wait_for_wal_ingest_metric(pageserver_http: PageserverHttpClient) -> float: def query(): value = pageserver_http.get_metric_value("pageserver_wal_ingest_records_received_total") @@ -86,25 +94,50 @@ def test_pageserver_small_inmemory_layers( The workload creates a number of timelines and writes some data to each, but not enough to trigger flushes via the `checkpoint_distance` config. """ + + def get_dirty_bytes(): + v = ( + env.pageserver.http_client().get_metric_value("pageserver_timeline_ephemeral_bytes") + or 0 + ) + log.info(f"dirty_bytes: {v}") + return v + + def assert_dirty_bytes(v): + assert get_dirty_bytes() == v + env = neon_env_builder.init_configs() env.start() last_flush_lsns = asyncio.run(workload(env, TIMELINE_COUNT, ENTRIES_PER_TIMELINE)) wait_until_pageserver_is_caught_up(env, last_flush_lsns) + # We didn't write enough data to trigger a size-based checkpoint + assert get_dirty_bytes() > 0 + ps_http_client = env.pageserver.http_client() total_wal_ingested_before_restart = wait_for_wal_ingest_metric(ps_http_client) - log.info("Sleeping for checkpoint timeout ...") - time.sleep(CHECKPOINT_TIMEOUT_SECONDS + 5) + # Within ~ the checkpoint interval, all the ephemeral layers should be frozen and flushed, + # such that there are zero bytes of ephemeral layer left on the pageserver + log.info("Waiting for background checkpoints...") + wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(0)) # type: ignore + + # Zero ephemeral layer bytes does not imply that all the frozen layers were uploaded: they + # must be uploaded to remain visible to the pageserver after restart. + wait_until_pageserver_has_uploaded(env, last_flush_lsns) env.pageserver.restart(immediate=immediate_shutdown) wait_until_pageserver_is_caught_up(env, last_flush_lsns) + # Catching up with WAL ingest should have resulted in zero bytes of ephemeral layers, since + # we froze, flushed and uploaded everything before restarting. 
There can be no more WAL writes + # because we shut down compute endpoints before flushing. + assert get_dirty_bytes() == 0 + total_wal_ingested_after_restart = wait_for_wal_ingest_metric(ps_http_client) log.info(f"WAL ingested before restart: {total_wal_ingested_before_restart}") log.info(f"WAL ingested after restart: {total_wal_ingested_after_restart}") - leeway = total_wal_ingested_before_restart * 5 / 100 - assert total_wal_ingested_after_restart <= leeway + assert total_wal_ingested_after_restart == 0 From a6c1fdcaf657ad3de8cbdac514d44a9f1a0ecef8 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 15 Mar 2024 18:04:05 +0300 Subject: [PATCH 0467/1571] Try to fix test_crafted_wal_end flakiness. Postgres can always write some more WAL, so previous checks that WAL doesn't change after something had been crafted were wrong; remove them. Add comments here and there. should fix https://github.com/neondatabase/neon/issues/4691 --- .../wal_craft/src/bin/wal_craft.rs | 6 +- libs/postgres_ffi/wal_craft/src/lib.rs | 126 ++++++++---------- .../wal_craft/src/xlog_utils_test.rs | 35 +++-- 3 files changed, 77 insertions(+), 90 deletions(-) diff --git a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs index e87ca27e90..41afcea6c2 100644 --- a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs +++ b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs @@ -1,5 +1,6 @@ use anyhow::*; use clap::{value_parser, Arg, ArgMatches, Command}; +use postgres::Client; use std::{path::PathBuf, str::FromStr}; use wal_craft::*; @@ -8,8 +9,8 @@ fn main() -> Result<()> { .init(); let arg_matches = cli().get_matches(); - let wal_craft = |arg_matches: &ArgMatches, client| { - let (intermediate_lsns, end_of_wal_lsn) = match arg_matches + let wal_craft = |arg_matches: &ArgMatches, client: &mut Client| { + let intermediate_lsns = match arg_matches .get_one::("type") .map(|s| s.as_str()) .context("'type' is required")? @@ -25,6 +26,7 @@ fn main() -> Result<()> { LastWalRecordCrossingSegment::NAME => LastWalRecordCrossingSegment::craft(client)?, a => panic!("Unknown --type argument: {a}"), }; + let end_of_wal_lsn = client.pg_current_wal_insert_lsn()?; for lsn in intermediate_lsns { println!("intermediate_lsn = {lsn}"); } diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index 281a180e3b..23786e3b08 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -5,7 +5,6 @@ use postgres::types::PgLsn; use postgres::Client; use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ}; use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD}; -use std::cmp::Ordering; use std::path::{Path, PathBuf}; use std::process::Command; use std::time::{Duration, Instant}; @@ -232,59 +231,52 @@ pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> anyhow pub trait Crafter { const NAME: &'static str; - /// Generates WAL using the client `client`. Returns a pair of: - /// * A vector of some valid "interesting" intermediate LSNs which one may start reading from. - /// May include or exclude Lsn(0) and the end-of-wal. - /// * The expected end-of-wal LSN. - fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)>; + /// Generates WAL using the client `client`. Returns a vector of some valid + /// "interesting" intermediate LSNs which one may start reading from. + /// test_end_of_wal uses this to check various starting points. 
+ /// + /// Note that postgres is generally keen about writing some WAL. While we + /// try to disable it (autovacuum, big wal_writer_delay, etc) it is always + /// possible, e.g. xl_running_xacts are dumped each 15s. So checks about + /// stable WAL end would be flaky unless postgres is shut down. For this + /// reason returning potential end of WAL here is pointless. Most of the + /// time this doesn't happen though, so it is reasonable to create needed + /// WAL structure and immediately kill postgres like test_end_of_wal does. + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result>; } +/// Wraps some WAL craft function, providing current LSN to it before the +/// insertion and flushing WAL afterwards. Also pushes initial LSN to the +/// result. fn craft_internal( client: &mut C, - f: impl Fn(&mut C, PgLsn) -> anyhow::Result<(Vec, Option)>, -) -> anyhow::Result<(Vec, PgLsn)> { + f: impl Fn(&mut C, PgLsn) -> anyhow::Result>, +) -> anyhow::Result> { ensure_server_config(client)?; let initial_lsn = client.pg_current_wal_insert_lsn()?; info!("LSN initial = {}", initial_lsn); - let (mut intermediate_lsns, last_lsn) = f(client, initial_lsn)?; - let last_lsn = match last_lsn { - None => client.pg_current_wal_insert_lsn()?, - Some(last_lsn) => { - let insert_lsn = client.pg_current_wal_insert_lsn()?; - match last_lsn.cmp(&insert_lsn) { - Ordering::Less => bail!( - "Some records were inserted after the crafted WAL: {} vs {}", - last_lsn, - insert_lsn - ), - Ordering::Equal => last_lsn, - Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"), - } - } - }; + let mut intermediate_lsns = f(client, initial_lsn)?; if !intermediate_lsns.starts_with(&[initial_lsn]) { intermediate_lsns.insert(0, initial_lsn); } // Some records may be not flushed, e.g. non-transactional logical messages. + // + // Note: this is broken if pg_current_wal_insert_lsn is at page boundary + // because pg_current_wal_insert_lsn skips page headers. client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?; - match last_lsn.cmp(&client.pg_current_wal_flush_lsn()?) { - Ordering::Less => bail!("Some records were flushed after the crafted WAL"), - Ordering::Equal => {} - Ordering::Greater => bail!("Reported LSN is greater than flush_lsn"), - } - Ok((intermediate_lsns, last_lsn)) + Ok(intermediate_lsns) } pub struct Simple; impl Crafter for Simple { const NAME: &'static str = "simple"; - fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)> { + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result> { craft_internal(client, |client, _| { client.execute("CREATE table t(x int)", &[])?; - Ok((Vec::new(), None)) + Ok(Vec::new()) }) } } @@ -292,29 +284,36 @@ impl Crafter for Simple { pub struct LastWalRecordXlogSwitch; impl Crafter for LastWalRecordXlogSwitch { const NAME: &'static str = "last_wal_record_xlog_switch"; - fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)> { - // Do not use generate_internal because here we end up with flush_lsn exactly on + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result> { + // Do not use craft_internal because here we end up with flush_lsn exactly on // the segment boundary and insert_lsn after the initial page header, which is unusual. 
ensure_server_config(client)?; client.execute("CREATE table t(x int)", &[])?; let before_xlog_switch = client.pg_current_wal_insert_lsn()?; - let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0); - let next_segment = PgLsn::from(0x0200_0000); + // pg_switch_wal returns end of last record of the switched segment, + // i.e. end of SWITCH itself. + let xlog_switch_record_end: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0); + let before_xlog_switch_u64 = u64::from(before_xlog_switch); + let next_segment = PgLsn::from( + before_xlog_switch_u64 - (before_xlog_switch_u64 % WAL_SEGMENT_SIZE as u64) + + WAL_SEGMENT_SIZE as u64, + ); ensure!( - after_xlog_switch <= next_segment, - "XLOG_SWITCH message ended after the expected segment boundary: {} > {}", - after_xlog_switch, + xlog_switch_record_end <= next_segment, + "XLOG_SWITCH record ended after the expected segment boundary: {} > {}", + xlog_switch_record_end, next_segment ); - Ok((vec![before_xlog_switch, after_xlog_switch], next_segment)) + Ok(vec![before_xlog_switch, xlog_switch_record_end]) } } pub struct LastWalRecordXlogSwitchEndsOnPageBoundary; +/// Craft xlog SWITCH record ending at page boundary. impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary { const NAME: &'static str = "last_wal_record_xlog_switch_ends_on_page_boundary"; - fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)> { + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result> { // Do not use generate_internal because here we end up with flush_lsn exactly on // the segment boundary and insert_lsn after the initial page header, which is unusual. ensure_server_config(client)?; @@ -361,28 +360,29 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary { // Emit the XLOG_SWITCH let before_xlog_switch = client.pg_current_wal_insert_lsn()?; - let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0); + let xlog_switch_record_end: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0); let next_segment = PgLsn::from(0x0200_0000); ensure!( - after_xlog_switch < next_segment, - "XLOG_SWITCH message ended on or after the expected segment boundary: {} > {}", - after_xlog_switch, + xlog_switch_record_end < next_segment, + "XLOG_SWITCH record ended on or after the expected segment boundary: {} > {}", + xlog_switch_record_end, next_segment ); ensure!( - u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD, + u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD, "XLOG_SWITCH message ended not on page boundary: {}, offset = {}", - after_xlog_switch, - u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ + xlog_switch_record_end, + u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ ); - Ok((vec![before_xlog_switch, after_xlog_switch], next_segment)) + Ok(vec![before_xlog_switch, xlog_switch_record_end]) } } -fn craft_single_logical_message( +/// Write ~16MB logical message; it should cross WAL segment. 
+fn craft_seg_size_logical_message( client: &mut impl postgres::GenericClient, transactional: bool, -) -> anyhow::Result<(Vec, PgLsn)> { +) -> anyhow::Result> { craft_internal(client, |client, initial_lsn| { ensure!( initial_lsn < PgLsn::from(0x0200_0000 - 1024 * 1024), @@ -405,34 +405,24 @@ fn craft_single_logical_message( "Logical message crossed two segments" ); - if transactional { - // Transactional logical messages are part of a transaction, so the one above is - // followed by a small COMMIT record. - - let after_message_lsn = client.pg_current_wal_insert_lsn()?; - ensure!( - message_lsn < after_message_lsn, - "No record found after the emitted message" - ); - Ok((vec![message_lsn], Some(after_message_lsn))) - } else { - Ok((Vec::new(), Some(message_lsn))) - } + Ok(vec![message_lsn]) }) } pub struct WalRecordCrossingSegmentFollowedBySmallOne; impl Crafter for WalRecordCrossingSegmentFollowedBySmallOne { const NAME: &'static str = "wal_record_crossing_segment_followed_by_small_one"; - fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)> { - craft_single_logical_message(client, true) + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result> { + // Transactional message crossing WAL segment will be followed by small + // commit record. + craft_seg_size_logical_message(client, true) } } pub struct LastWalRecordCrossingSegment; impl Crafter for LastWalRecordCrossingSegment { const NAME: &'static str = "last_wal_record_crossing_segment"; - fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)> { - craft_single_logical_message(client, false) + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result> { + craft_seg_size_logical_message(client, false) } } diff --git a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs index 6ff4c563b2..496458b2e4 100644 --- a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs +++ b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs @@ -11,13 +11,15 @@ use utils::const_assert; use utils::lsn::Lsn; fn init_logging() { - let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or( - format!("crate=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace"), - )) + let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or(format!( + "crate=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace" + ))) .is_test(true) .try_init(); } +/// Test that find_end_of_wal returns the same results as pg_dump on various +/// WALs created by Crafter. fn test_end_of_wal(test_name: &str) { use crate::*; @@ -38,13 +40,13 @@ fn test_end_of_wal(test_name: &str) { } cfg.initdb().unwrap(); let srv = cfg.start_server().unwrap(); - let (intermediate_lsns, expected_end_of_wal_partial) = - C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap(); + let intermediate_lsns = C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap(); let intermediate_lsns: Vec = intermediate_lsns .iter() .map(|&lsn| u64::from(lsn).into()) .collect(); - let expected_end_of_wal: Lsn = u64::from(expected_end_of_wal_partial).into(); + // Kill postgres. Note that it might have inserted to WAL something after + // 'craft' did its job. 
srv.kill(); // Check find_end_of_wal on the initial WAL @@ -56,7 +58,7 @@ fn test_end_of_wal(test_name: &str) { .filter(|fname| IsXLogFileName(fname)) .max() .unwrap(); - check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal); + let expected_end_of_wal = find_pg_waldump_end_of_wal(&cfg, &last_segment); for start_lsn in intermediate_lsns .iter() .chain(std::iter::once(&expected_end_of_wal)) @@ -91,11 +93,7 @@ fn test_end_of_wal(test_name: &str) { } } -fn check_pg_waldump_end_of_wal( - cfg: &crate::Conf, - last_segment: &str, - expected_end_of_wal: Lsn, -) { +fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &str) -> Lsn { // Get the actual end of WAL by pg_waldump let waldump_output = cfg .pg_waldump("000000010000000000000001", last_segment) @@ -113,11 +111,8 @@ fn check_pg_waldump_end_of_wal( } }; let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap(); - info!( - "waldump erred on {}, expected wal end at {}", - waldump_wal_end, expected_end_of_wal - ); - assert_eq!(waldump_wal_end, expected_end_of_wal); + info!("waldump erred on {}", waldump_wal_end); + waldump_wal_end } fn check_end_of_wal( @@ -210,9 +205,9 @@ pub fn test_update_next_xid() { #[test] pub fn test_encode_logical_message() { let expected = [ - 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255, - 38, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114, - 101, 102, 105, 120, 0, 109, 101, 115, 115, 97, 103, 101, + 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255, 38, + 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114, 101, 102, + 105, 120, 0, 109, 101, 115, 115, 97, 103, 101, ]; let actual = encode_logical_message("prefix", "message"); assert_eq!(expected, actual[..]); From 271314230848a4f978aa6fa589bae8583b3e2740 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 25 Mar 2024 14:35:24 +0000 Subject: [PATCH 0468/1571] tests: stabilize compat tests (#7227) This test had two flaky failure modes: - pageserver log error for timeline not found: this resulted from changes for DR when timeline destroy/create was added, but endpoint was left running during that operation. - storage controller log error because the test was running for long enough that a background reconcile happened at almost the exact moment of test teardown, and our test fixtures tear down the pageservers before the controller. Closes: #7224 --- test_runner/fixtures/neon_fixtures.py | 6 +++++- test_runner/regress/test_compatibility.py | 9 ++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index f8994a8dcc..f22ce10c20 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1155,13 +1155,17 @@ class NeonEnv: After this method returns, there should be no child processes running. 
""" self.endpoints.stop_all() + + # Stop storage controller before pageservers: we don't want it to spuriously + # detect a pageserver "failure" during test teardown + self.storage_controller.stop(immediate=immediate) + for sk in self.safekeepers: sk.stop(immediate=immediate) for pageserver in self.pageservers: if ps_assert_metric_no_errors: pageserver.assert_no_metric_errors() pageserver.stop(immediate=immediate) - self.storage_controller.stop(immediate=immediate) self.broker.stop(immediate=immediate) @property diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index e0bb4c2062..5406acc005 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -267,9 +267,10 @@ def test_forward_compatibility( def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, repo_dir: Path): ep = env.endpoints.create_start("main") + connstr = ep.connstr() + pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version) - connstr = ep.connstr() pg_bin.run_capture( ["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump.sql'}"] ) @@ -286,6 +287,9 @@ def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, r timeline_id = env.initial_timeline pg_version = env.pg_version + # Stop endpoint while we recreate timeline + ep.stop() + try: pageserver_http.timeline_preserve_initdb_archive(tenant_id, timeline_id) except PageserverApiException as e: @@ -310,6 +314,9 @@ def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, r existing_initdb_timeline_id=timeline_id, ) + # Timeline exists again: restart the endpoint + ep.start() + pg_bin.run_capture( ["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump-from-wal.sql'}"] ) From d837ce0686046837f558d0202716c22937d6213b Mon Sep 17 00:00:00 2001 From: George Ma <164313692+availhang@users.noreply.github.com> Date: Mon, 25 Mar 2024 23:43:02 +0800 Subject: [PATCH 0469/1571] chore: remove repetitive words (#7206) Signed-off-by: availhang --- control_plane/src/background_process.rs | 2 +- control_plane/src/endpoint.rs | 2 +- libs/tenant_size_model/tests/tests.rs | 2 +- libs/vm_monitor/src/runner.rs | 2 +- pageserver/src/metrics.rs | 2 +- pageserver/src/virtual_file.rs | 2 +- test_runner/regress/test_auth.py | 2 +- test_runner/regress/test_remote_storage.py | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index 0e59b28230..2fced7d778 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -294,7 +294,7 @@ where // is in state 'taken' but the thread that would unlock it is // not there. // 2. A rust object that represented some external resource in the - // parent now got implicitly copied by the the fork, even though + // parent now got implicitly copied by the fork, even though // the object's type is not `Copy`. The parent program may use // non-copyability as way to enforce unique ownership of an // external resource in the typesystem. The fork breaks that diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 5206222961..03f7db99fb 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -12,7 +12,7 @@ //! //! The endpoint is managed by the `compute_ctl` binary. When an endpoint is //! started, we launch `compute_ctl` It synchronizes the safekeepers, downloads -//! 
the basebackup from the pageserver to initialize the the data directory, and +//! the basebackup from the pageserver to initialize the data directory, and //! finally launches the PostgreSQL process. It watches the PostgreSQL process //! until it exits. //! diff --git a/libs/tenant_size_model/tests/tests.rs b/libs/tenant_size_model/tests/tests.rs index 7660d41c56..0ffea0f2cd 100644 --- a/libs/tenant_size_model/tests/tests.rs +++ b/libs/tenant_size_model/tests/tests.rs @@ -247,7 +247,7 @@ fn scenario_4() { // // This is in total 5000 + 1000 + 5000 + 1000 = 12000 // - // (If we used the the method from the previous scenario, and + // (If we used the method from the previous scenario, and // kept only snapshot at the branch point, we'd need to keep // all the WAL between 10000-18000 on the main branch, so // the total size would be 5000 + 1000 + 8000 = 14000. The diff --git a/libs/vm_monitor/src/runner.rs b/libs/vm_monitor/src/runner.rs index ba37966476..ca02637ecf 100644 --- a/libs/vm_monitor/src/runner.rs +++ b/libs/vm_monitor/src/runner.rs @@ -69,7 +69,7 @@ pub struct Config { /// should be removed once we have a better solution there. sys_buffer_bytes: u64, - /// Minimum fraction of total system memory reserved *before* the the cgroup threshold; in + /// Minimum fraction of total system memory reserved *before* the cgroup threshold; in /// other words, providing a ceiling for the highest value of the threshold by enforcing that /// there's at least `cgroup_min_overhead_fraction` of the total memory remaining beyond the /// threshold. diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 6de284ee8b..cc661194e9 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -435,7 +435,7 @@ pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy = Lazy::new(|| static REMOTE_PHYSICAL_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_remote_physical_size", - "The size of the layer files present in the remote storage that are listed in the the remote index_part.json.", + "The size of the layer files present in the remote storage that are listed in the remote index_part.json.", // Corollary: If any files are missing from the index part, they won't be included here. &["tenant_id", "shard_id", "timeline_id"] ) diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index dee36d8afd..0cf6a0019b 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -782,7 +782,7 @@ where } } // NB: don't use `buf.is_empty()` here; it is from the - // `impl Deref for Slice { Target = [u8] }`; the the &[u8] + // `impl Deref for Slice { Target = [u8] }`; the &[u8] // returned by it only covers the initialized portion of `buf`. // Whereas we're interested in ensuring that we filled the entire // buffer that the user passed in. diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index ea88b5d8e9..bb622c0d59 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -105,7 +105,7 @@ def test_pageserver_multiple_keys(neon_env_builder: NeonEnvBuilder): # The neon_local tool generates one key pair at a hardcoded path by default. # As a preparation for our test, move the public key of the key pair into a # directory at the same location as the hardcoded path by: - # 1. moving the the file at `configured_pub_key_path` to a temporary location + # 1. moving the file at `configured_pub_key_path` to a temporary location # 2. 
creating a new directory at `configured_pub_key_path` # 3. moving the file from the temporary location into the newly created directory configured_pub_key_path = Path(env.repo_dir) / "auth_public_key.pem" diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 05f769b0e3..986d6c4dbf 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -838,7 +838,7 @@ def test_compaction_waits_for_upload( # upload_stuck_layers and the original initdb L0 client.timeline_checkpoint(tenant_id, timeline_id) - # as uploads are paused, the the upload_stuck_layers should still be with us + # as uploads are paused, the upload_stuck_layers should still be with us for name in upload_stuck_layers: path = env.pageserver.timeline_dir(tenant_id, timeline_id) / name assert path.exists(), "uploads are stuck still over compaction" From f72415e1fd952274f132a47baaddbf0a4ac912de Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 25 Mar 2024 18:42:18 +0100 Subject: [PATCH 0470/1571] refactor(remote_timeline_client): infallible stop() and shutdown() (#7234) preliminary refactoring for https://github.com/neondatabase/neon/pull/7233 part of #7062 --- pageserver/src/tenant.rs | 2 +- .../src/tenant/remote_timeline_client.rs | 77 ++++++++----------- pageserver/src/tenant/timeline.rs | 15 +--- pageserver/src/tenant/timeline/delete.rs | 18 +---- pageserver/src/tenant/upload_queue.rs | 14 +++- 5 files changed, 51 insertions(+), 75 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 7bd85b6fd5..b923e473ce 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2141,7 +2141,7 @@ impl Tenant { // Shut down the timeline's remote client: this means that the indices we write // for child shards will not be invalidated by the parent shard deleting layers. - tl_client.shutdown().await?; + tl_client.shutdown().await; // Download methods can still be used after shutdown, as they don't flow through the remote client's // queue. In principal the RemoteTimelineClient could provide this without downloading it, but this diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index c0a150eb0d..b4b3243d11 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -217,7 +217,7 @@ use crate::task_mgr::shutdown_token; use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::download::download_retry; use crate::tenant::storage_layer::AsLayerDesc; -use crate::tenant::upload_queue::Delete; +use crate::tenant::upload_queue::{Delete, UploadQueueStoppedDeletable}; use crate::tenant::TIMELINES_SEGMENT_NAME; use crate::{ config::PageServerConf, @@ -265,15 +265,6 @@ pub enum MaybeDeletedIndexPart { Deleted(IndexPart), } -/// Errors that can arise when calling [`RemoteTimelineClient::stop`]. -#[derive(Debug, thiserror::Error)] -pub enum StopError { - /// Returned if the upload queue was never initialized. - /// See [`RemoteTimelineClient::init_upload_queue`] and [`RemoteTimelineClient::init_upload_queue_for_empty_remote`]. 
- #[error("queue is not initialized")] - QueueUninitialized, -} - #[derive(Debug, thiserror::Error)] pub enum PersistIndexPartWithDeletedFlagError { #[error("another task is already setting the deleted_flag, started at {0:?}")] @@ -390,15 +381,10 @@ impl RemoteTimelineClient { "bug: it is responsibility of the caller to provide index part from MaybeDeletedIndexPart::Deleted" ))?; - { - let mut upload_queue = self.upload_queue.lock().unwrap(); - upload_queue.initialize_with_current_remote_index_part(index_part)?; - self.update_remote_physical_size_gauge(Some(index_part)); - } - // also locks upload queue, without dropping the guard above it will be a deadlock - self.stop().expect("initialized line above"); - let mut upload_queue = self.upload_queue.lock().unwrap(); + upload_queue.initialize_with_current_remote_index_part(index_part)?; + self.update_remote_physical_size_gauge(Some(index_part)); + self.stop_impl(&mut upload_queue); upload_queue .stopped_mut() @@ -412,7 +398,8 @@ impl RemoteTimelineClient { match &mut *self.upload_queue.lock().unwrap() { UploadQueue::Uninitialized => None, UploadQueue::Initialized(q) => q.get_last_remote_consistent_lsn_projected(), - UploadQueue::Stopped(q) => q + UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => None, + UploadQueue::Stopped(UploadQueueStopped::Deletable(q)) => q .upload_queue_for_deletion .get_last_remote_consistent_lsn_projected(), } @@ -422,7 +409,8 @@ impl RemoteTimelineClient { match &mut *self.upload_queue.lock().unwrap() { UploadQueue::Uninitialized => None, UploadQueue::Initialized(q) => Some(q.get_last_remote_consistent_lsn_visible()), - UploadQueue::Stopped(q) => Some( + UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => None, + UploadQueue::Stopped(UploadQueueStopped::Deletable(q)) => Some( q.upload_queue_for_deletion .get_last_remote_consistent_lsn_visible(), ), @@ -889,7 +877,7 @@ impl RemoteTimelineClient { /// Wait for all previously scheduled operations to complete, and then stop. /// /// Not cancellation safe - pub(crate) async fn shutdown(self: &Arc) -> Result<(), StopError> { + pub(crate) async fn shutdown(self: &Arc) { // On cancellation the queue is left in ackward state of refusing new operations but // proper stop is yet to be called. On cancel the original or some later task must call // `stop` or `shutdown`. @@ -900,8 +888,12 @@ impl RemoteTimelineClient { let fut = { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = match &mut *guard { - UploadQueue::Stopped(_) => return Ok(()), - UploadQueue::Uninitialized => return Err(StopError::QueueUninitialized), + UploadQueue::Stopped(_) => return, + UploadQueue::Uninitialized => { + // transition into Stopped state + self.stop_impl(&mut guard); + return; + } UploadQueue::Initialized(ref mut init) => init, }; @@ -933,7 +925,7 @@ impl RemoteTimelineClient { } } - self.stop() + self.stop(); } /// Set the deleted_at field in the remote index file. @@ -1314,12 +1306,7 @@ impl RemoteTimelineClient { // upload finishes or times out soon enough. if cancel.is_cancelled() { info!("upload task cancelled by shutdown request"); - match self.stop() { - Ok(()) => {} - Err(StopError::QueueUninitialized) => { - unreachable!("we never launch an upload task if the queue is uninitialized, and once it is initialized, we never go back") - } - } + self.stop(); return; } @@ -1574,17 +1561,23 @@ impl RemoteTimelineClient { /// In-progress operations will still be running after this function returns. 
/// Use `task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id))` /// to wait for them to complete, after calling this function. - pub(crate) fn stop(&self) -> Result<(), StopError> { + pub(crate) fn stop(&self) { // Whichever *task* for this RemoteTimelineClient grabs the mutex first will transition the queue // into stopped state, thereby dropping all off the queued *ops* which haven't become *tasks* yet. // The other *tasks* will come here and observe an already shut down queue and hence simply wrap up their business. let mut guard = self.upload_queue.lock().unwrap(); - match &mut *guard { - UploadQueue::Uninitialized => Err(StopError::QueueUninitialized), + self.stop_impl(&mut guard); + } + + fn stop_impl(&self, guard: &mut std::sync::MutexGuard) { + match &mut **guard { + UploadQueue::Uninitialized => { + info!("UploadQueue is in state Uninitialized, nothing to do"); + **guard = UploadQueue::Stopped(UploadQueueStopped::Uninitialized); + } UploadQueue::Stopped(_) => { // nothing to do info!("another concurrent task already shut down the queue"); - Ok(()) } UploadQueue::Initialized(initialized) => { info!("shutting down upload queue"); @@ -1617,11 +1610,13 @@ impl RemoteTimelineClient { }; let upload_queue = std::mem::replace( - &mut *guard, - UploadQueue::Stopped(UploadQueueStopped { - upload_queue_for_deletion, - deleted_at: SetDeletedFlagProgress::NotRunning, - }), + &mut **guard, + UploadQueue::Stopped(UploadQueueStopped::Deletable( + UploadQueueStoppedDeletable { + upload_queue_for_deletion, + deleted_at: SetDeletedFlagProgress::NotRunning, + }, + )), ); if let UploadQueue::Initialized(qi) = upload_queue { qi @@ -1650,10 +1645,6 @@ impl RemoteTimelineClient { // which is exactly what we want to happen. drop(op); } - - // We're done. - drop(guard); - Ok(()) } } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 15ffa72aaa..6c6bb4b788 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -54,6 +54,7 @@ use std::{ ops::ControlFlow, }; +use crate::deletion_queue::DeletionQueueClient; use crate::tenant::timeline::logical_size::CurrentLogicalSize; use crate::tenant::{ layer_map::{LayerMap, SearchResult}, @@ -64,7 +65,6 @@ use crate::{ disk_usage_eviction_task::DiskUsageEvictionInfo, pgdatadir_mapping::CollectKeySpaceError, }; -use crate::{deletion_queue::DeletionQueueClient, tenant::remote_timeline_client::StopError}; use crate::{ disk_usage_eviction_task::finite_f32, tenant::storage_layer::{ @@ -1241,11 +1241,7 @@ impl Timeline { // what is problematic is the shutting down of RemoteTimelineClient, because // obviously it does not make sense to stop while we wait for it, but what // about corner cases like s3 suddenly hanging up? - if let Err(e) = client.shutdown().await { - // Non-fatal. Shutdown is infallible. Failures to flush just mean that - // we have some extra WAL replay to do next time the timeline starts. 
- warn!("failed to flush to remote storage: {e:#}"); - } + client.shutdown().await; } } Err(e) => { @@ -1282,12 +1278,7 @@ impl Timeline { // Shut down remote timeline client: this gracefully moves its metadata into its Stopping state in // case our caller wants to use that for a deletion if let Some(remote_client) = self.remote_client.as_ref() { - match remote_client.stop() { - Ok(()) => {} - Err(StopError::QueueUninitialized) => { - // Shutting down during initialization is legal - } - } + remote_client.stop(); } tracing::debug!("Waiting for tasks..."); diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index d2272fc75f..e9afbfd8ba 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -16,9 +16,7 @@ use crate::{ tenant::{ debug_assert_current_span_has_tenant_and_timeline_id, metadata::TimelineMetadata, - remote_timeline_client::{ - self, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient, - }, + remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient}, CreateTimelineCause, DeleteTimelineError, Tenant, }, }; @@ -50,19 +48,7 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> { // Prevent new uploads from starting. if let Some(remote_client) = timeline.remote_client.as_ref() { - let res = remote_client.stop(); - match res { - Ok(()) => {} - Err(e) => match e { - remote_timeline_client::StopError::QueueUninitialized => { - // This case shouldn't happen currently because the - // load and attach code bails out if _any_ of the timeline fails to fetch its IndexPart. - // That is, before we declare the Tenant as Active. - // But we only allow calls to delete_timeline on Active tenants. - return Err(DeleteTimelineError::Other(anyhow::anyhow!("upload queue is uninitialized, likely the timeline was in Broken state prior to this call because it failed to fetch IndexPart during load or attach, check the logs"))); - } - }, - } + remote_client.stop(); } // Stop & wait for the remaining timeline tasks, including upload tasks. 
diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index a5516bb9a9..0bf4d1e599 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -121,11 +121,16 @@ pub(super) enum SetDeletedFlagProgress { Successful(NaiveDateTime), } -pub(super) struct UploadQueueStopped { +pub(super) struct UploadQueueStoppedDeletable { pub(super) upload_queue_for_deletion: UploadQueueInitialized, pub(super) deleted_at: SetDeletedFlagProgress, } +pub(super) enum UploadQueueStopped { + Deletable(UploadQueueStoppedDeletable), + Uninitialized, +} + #[derive(thiserror::Error, Debug)] pub(crate) enum NotInitialized { #[error("queue is in state Uninitialized")] @@ -249,12 +254,15 @@ impl UploadQueue { } } - pub(crate) fn stopped_mut(&mut self) -> anyhow::Result<&mut UploadQueueStopped> { + pub(crate) fn stopped_mut(&mut self) -> anyhow::Result<&mut UploadQueueStoppedDeletable> { match self { UploadQueue::Initialized(_) | UploadQueue::Uninitialized => { anyhow::bail!("queue is in state {}", self.as_str()) } - UploadQueue::Stopped(stopped) => Ok(stopped), + UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => { + anyhow::bail!("queue is in state Stopped(Uninitialized)") + } + UploadQueue::Stopped(UploadQueueStopped::Deletable(deletable)) => Ok(deletable), } } } From 6313f1fa7a36a91a83158a381bd850f0147cb772 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 26 Mar 2024 09:56:47 +0000 Subject: [PATCH 0471/1571] tests: tolerate transient unavailability in test_sharding_split_failures (#7223) ## Problem While most forms of split rollback don't interrupt clients, there are a couple of cases that do -- this interruption is brief, driven by the time it takes the controller to kick off Reconcilers during the async abort of the split, so it's operationally fine, but can trip up a test. - #7148 ## Summary of changes - Relax test check to require that the tenant is eventually available after split failure, rather than immediately. In the vast majority of cases this will pass on the first iteration. --- test_runner/regress/test_sharding.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index e6318aff68..9aebf16c68 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -874,11 +874,17 @@ def test_sharding_split_failures( workload.validate() if failure.expect_available(): - # Even though the split failed partway through, this should not have interrupted - # clients. Disable waiting for pageservers in the workload helper, because our - # failpoints may prevent API access. - # This only applies for failure modes that leave pageserver page_service API available. - workload.churn_rows(10, upload=False, ingest=False) + # Even though the split failed partway through, this should not leave the tenant in + # an unavailable state. + # - Disable waiting for pageservers in the workload helper, because our + # failpoints may prevent API access. This only applies for failure modes that + # leave pageserver page_service API available. + # - This is a wait_until because clients may see transient errors in some split error cases, + # e.g. 
while waiting for a storage controller to re-attach a parent shard if we failed + # inside the pageserver and the storage controller responds by detaching children and attaching + # parents concurrently (https://github.com/neondatabase/neon/issues/7148) + wait_until(10, 1, lambda: workload.churn_rows(10, upload=False, ingest=False)) # type: ignore + workload.validate() if failure.fails_forward(env): From 5dee58f492a80a76e1b01b32d4f02a42fba7cd47 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 26 Mar 2024 10:59:16 +0000 Subject: [PATCH 0472/1571] tests: wait for uploads in test_secondary_downloads (#7220) ## Problem - https://github.com/neondatabase/neon/issues/6966 This test occasionally failed with some layers unexpectedly not present on the secondary pageserver. The issue in that failure is the attached pageserver uploading heatmaps that refer to not-yet-uploaded layers. ## Summary of changes After uploading heatmap, drain upload queue on attached pageserver, to guarantee that all the layers referenced in the haetmap are uploaded. --- test_runner/regress/test_pageserver_secondary.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 2e57136607..25510c50e6 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -11,6 +11,7 @@ from fixtures.pageserver.utils import ( assert_prefix_empty, poll_for_remote_storage_iterations, tenant_delete_wait_completed, + wait_for_upload_queue_empty, ) from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage from fixtures.types import TenantId, TimelineId @@ -472,6 +473,10 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): log.info("Synchronizing after initial write...") ps_attached.http_client().tenant_heatmap_upload(tenant_id) + # Ensure that everything which appears in the heatmap is also present in S3: heatmap writers + # are allowed to upload heatmaps that reference layers which are only enqueued for upload + wait_for_upload_queue_empty(ps_attached.http_client(), tenant_id, timeline_id) + ps_secondary.http_client().tenant_secondary_download(tenant_id) assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( @@ -484,6 +489,11 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): workload.churn_rows(128, ps_attached.id) ps_attached.http_client().tenant_heatmap_upload(tenant_id) + + # Ensure that everything which appears in the heatmap is also present in S3: heatmap writers + # are allowed to upload heatmaps that reference layers which are only enqueued for upload + wait_for_upload_queue_empty(ps_attached.http_client(), tenant_id, timeline_id) + ps_secondary.http_client().tenant_secondary_download(tenant_id) assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( From 6c18109734e77bba7b93862d7f5ca54879bf5052 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Tue, 26 Mar 2024 12:27:48 +0100 Subject: [PATCH 0473/1571] proxy: reuse sess_id as request_id for the cplane requests (#7245) ## Problem https://github.com/neondatabase/cloud/issues/11599 ## Summary of changes Reuse the same sess_id for requests within the one session. TODO: get rid of `session_id` in query params. 
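For illustration, a minimal sketch of the pattern (simplified stand-ins for the proxy's `RequestMonitoring` and console client, with a made-up endpoint URL; assumes the `uuid` crate with the `v4` feature): the per-connection context mints one UUID when the session starts, and every control-plane call in that session serializes that same value as its `request_id`, so proxy and control-plane logs correlate without a fresh id per request.

```rust
// Sketch only: simplified stand-ins, not the proxy's real types.
use uuid::Uuid;

/// Created once per client session; its id is reused for every console call.
struct RequestMonitoring {
    session_id: Uuid,
}

impl RequestMonitoring {
    fn new() -> Self {
        Self {
            session_id: Uuid::new_v4(),
        }
    }
}

struct ConsoleClient;

impl ConsoleClient {
    /// Reuse the session id instead of generating a new `Uuid` per request.
    fn get_endpoint_info(&self, ctx: &RequestMonitoring) -> String {
        let request_id = ctx.session_id.to_string();
        // Example URL only, to show where the id ends up.
        format!("https://console.example.com/endpoint?request_id={request_id}")
    }
}

fn main() {
    let ctx = RequestMonitoring::new();
    let client = ConsoleClient;
    // Two requests in the same session carry the same request id.
    assert_eq!(
        client.get_endpoint_info(&ctx),
        client.get_endpoint_info(&ctx)
    );
}
```

The actual change below is just that swap: `uuid::Uuid::new_v4().to_string()` becomes `ctx.session_id.to_string()` in both console API calls.
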
--- proxy/src/console/provider/neon.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 89ebfa57f1..289b0c08f7 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -55,7 +55,7 @@ impl Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { - let request_id = uuid::Uuid::new_v4().to_string(); + let request_id = ctx.session_id.to_string(); let application_name = ctx.console_application_name(); async { let request = self @@ -112,7 +112,7 @@ impl Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { - let request_id = uuid::Uuid::new_v4().to_string(); + let request_id = ctx.session_id.to_string(); let application_name = ctx.console_application_name(); async { let mut request_builder = self From ad072de4209193fd21314cf7f03f14df4fa55eb1 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 26 Mar 2024 15:24:18 +0100 Subject: [PATCH 0474/1571] Revert "pageserver: use a single tokio runtime (#6555)" (#7246) --- pageserver/src/bin/pageserver.rs | 80 ++++++++++--------- pageserver/src/consumption_metrics.rs | 3 +- pageserver/src/control_plane_client.rs | 4 +- pageserver/src/disk_usage_eviction_task.rs | 3 +- pageserver/src/page_service.rs | 1 + pageserver/src/task_mgr.rs | 39 ++++++--- pageserver/src/tenant.rs | 2 +- pageserver/src/tenant/delete.rs | 1 + pageserver/src/tenant/mgr.rs | 4 + .../src/tenant/remote_timeline_client.rs | 11 +++ pageserver/src/tenant/secondary.rs | 4 +- pageserver/src/tenant/storage_layer/layer.rs | 4 +- pageserver/src/tenant/tasks.rs | 6 +- pageserver/src/tenant/timeline.rs | 6 +- pageserver/src/tenant/timeline/delete.rs | 1 + .../src/tenant/timeline/eviction_task.rs | 3 +- pageserver/src/tenant/timeline/walreceiver.rs | 5 +- .../walreceiver/walreceiver_connection.rs | 18 ++--- test_runner/regress/test_backpressure.py | 2 +- test_runner/regress/test_timeline_size.py | 26 ++---- 20 files changed, 131 insertions(+), 92 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index f4a231f217..ef616c0a39 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -15,9 +15,9 @@ use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp}; use pageserver::control_plane_client::ControlPlaneClient; use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task}; use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING}; +use pageserver::task_mgr::WALRECEIVER_RUNTIME; use pageserver::tenant::{secondary, TenantSharedResources}; use remote_storage::GenericRemoteStorage; -use tokio::signal::unix::SignalKind; use tokio::time::Instant; use tracing::*; @@ -28,7 +28,7 @@ use pageserver::{ deletion_queue::DeletionQueue, http, page_cache, page_service, task_mgr, task_mgr::TaskKind, - task_mgr::THE_RUNTIME, + task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME}, tenant::mgr, virtual_file, }; @@ -323,7 +323,7 @@ fn start_pageserver( // Launch broker client // The storage_broker::connect call needs to happen inside a tokio runtime thread. - let broker_client = THE_RUNTIME + let broker_client = WALRECEIVER_RUNTIME .block_on(async { // Note: we do not attempt connecting here (but validate endpoints sanity). 
storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval) @@ -391,7 +391,7 @@ fn start_pageserver( conf, ); if let Some(deletion_workers) = deletion_workers { - deletion_workers.spawn_with(THE_RUNTIME.handle()); + deletion_workers.spawn_with(BACKGROUND_RUNTIME.handle()); } // Up to this point no significant I/O has been done: this should have been fast. Record @@ -423,7 +423,7 @@ fn start_pageserver( // Scan the local 'tenants/' directory and start loading the tenants let deletion_queue_client = deletion_queue.new_client(); - let tenant_manager = THE_RUNTIME.block_on(mgr::init_tenant_mgr( + let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr( conf, TenantSharedResources { broker_client: broker_client.clone(), @@ -435,7 +435,7 @@ fn start_pageserver( ))?; let tenant_manager = Arc::new(tenant_manager); - THE_RUNTIME.spawn({ + BACKGROUND_RUNTIME.spawn({ let shutdown_pageserver = shutdown_pageserver.clone(); let drive_init = async move { // NOTE: unlike many futures in pageserver, this one is cancellation-safe @@ -545,7 +545,7 @@ fn start_pageserver( // Start up the service to handle HTTP mgmt API request. We created the // listener earlier already. { - let _rt_guard = THE_RUNTIME.enter(); + let _rt_guard = MGMT_REQUEST_RUNTIME.enter(); let router_state = Arc::new( http::routes::State::new( @@ -569,6 +569,7 @@ fn start_pageserver( .with_graceful_shutdown(task_mgr::shutdown_watcher()); task_mgr::spawn( + MGMT_REQUEST_RUNTIME.handle(), TaskKind::HttpEndpointListener, None, None, @@ -593,6 +594,7 @@ fn start_pageserver( let local_disk_storage = conf.workdir.join("last_consumption_metrics.json"); task_mgr::spawn( + crate::BACKGROUND_RUNTIME.handle(), TaskKind::MetricsCollection, None, None, @@ -641,6 +643,7 @@ fn start_pageserver( DownloadBehavior::Error, ); task_mgr::spawn( + COMPUTE_REQUEST_RUNTIME.handle(), TaskKind::LibpqEndpointListener, None, None, @@ -664,37 +667,42 @@ fn start_pageserver( let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard()); // All started up! Now just sit and wait for shutdown signal. - { - THE_RUNTIME.block_on(async move { - let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt()).unwrap(); - let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate()).unwrap(); - let mut sigquit = tokio::signal::unix::signal(SignalKind::quit()).unwrap(); - let signal = tokio::select! { - _ = sigquit.recv() => { - info!("Got signal SIGQUIT. Terminating in immediate shutdown mode",); - std::process::exit(111); - } - _ = sigint.recv() => { "SIGINT" }, - _ = sigterm.recv() => { "SIGTERM" }, - }; + use signal_hook::consts::*; + let signal_handler = BACKGROUND_RUNTIME.spawn_blocking(move || { + let mut signals = + signal_hook::iterator::Signals::new([SIGINT, SIGTERM, SIGQUIT]).unwrap(); + return signals + .forever() + .next() + .expect("forever() never returns None unless explicitly closed"); + }); + let signal = BACKGROUND_RUNTIME + .block_on(signal_handler) + .expect("join error"); + match signal { + SIGQUIT => { + info!("Got signal {signal}. Terminating in immediate shutdown mode",); + std::process::exit(111); + } + SIGINT | SIGTERM => { + info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",); - info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",); - - // This cancels the `shutdown_pageserver` cancellation tree. - // Right now that tree doesn't reach very far, and `task_mgr` is used instead. - // The plan is to change that over time. 
- shutdown_pageserver.take(); - let bg_remote_storage = remote_storage.clone(); - let bg_deletion_queue = deletion_queue.clone(); - pageserver::shutdown_pageserver( - &tenant_manager, - bg_remote_storage.map(|_| bg_deletion_queue), - 0, - ) - .await; - unreachable!() - }) + // This cancels the `shutdown_pageserver` cancellation tree. + // Right now that tree doesn't reach very far, and `task_mgr` is used instead. + // The plan is to change that over time. + shutdown_pageserver.take(); + let bg_remote_storage = remote_storage.clone(); + let bg_deletion_queue = deletion_queue.clone(); + BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver( + &tenant_manager, + bg_remote_storage.map(|_| bg_deletion_queue), + 0, + )); + unreachable!() + } + _ => unreachable!(), + } } } diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index c82be8c581..3429e3a0a6 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -1,7 +1,7 @@ //! Periodically collect consumption metrics for all active tenants //! and push them to a HTTP endpoint. use crate::context::{DownloadBehavior, RequestContext}; -use crate::task_mgr::{self, TaskKind}; +use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; use crate::tenant::tasks::BackgroundLoopKind; use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError, Tenant}; use camino::Utf8PathBuf; @@ -61,6 +61,7 @@ pub async fn collect_metrics( let worker_ctx = ctx.detached_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download); task_mgr::spawn( + BACKGROUND_RUNTIME.handle(), TaskKind::CalculateSyntheticSize, None, None, diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs index 55d80c2966..42c800822b 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/control_plane_client.rs @@ -173,6 +173,8 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient { register, }; + fail::fail_point!("control-plane-client-re-attach"); + let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?; tracing::info!( "Received re-attach response with {} tenants", @@ -208,7 +210,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient { .collect(), }; - crate::tenant::pausable_failpoint!("control-plane-client-validate"); + fail::fail_point!("control-plane-client-validate"); let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?; diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 6b68acd1c7..92c1475aef 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -59,7 +59,7 @@ use utils::{completion, id::TimelineId}; use crate::{ config::PageServerConf, metrics::disk_usage_based_eviction::METRICS, - task_mgr::{self, TaskKind}, + task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, tenant::{ self, mgr::TenantManager, @@ -202,6 +202,7 @@ pub fn launch_disk_usage_global_eviction_task( info!("launching disk usage based eviction task"); task_mgr::spawn( + BACKGROUND_RUNTIME.handle(), TaskKind::DiskUsageEviction, None, None, diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index fa1a0f535b..f3ceb7d3e6 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -180,6 +180,7 @@ pub async fn libpq_listener_main( // only deal with a particular timeline, but we don't know which one // yet. 
task_mgr::spawn( + &tokio::runtime::Handle::current(), TaskKind::PageRequestHandler, None, None, diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 2d97389982..69e163effa 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -98,22 +98,42 @@ use utils::id::TimelineId; // other operations, if the upload tasks e.g. get blocked on locks. It shouldn't // happen, but still. // - -/// The single tokio runtime used by all pageserver code. -/// In the past, we had multiple runtimes, and in the future we should weed out -/// remaining references to this global field and rely on ambient runtime instead, -/// i.e., use `tokio::spawn` instead of `THE_RUNTIME.spawn()`, etc. -pub static THE_RUNTIME: Lazy = Lazy::new(|| { +pub static COMPUTE_REQUEST_RUNTIME: Lazy = Lazy::new(|| { tokio::runtime::Builder::new_multi_thread() + .thread_name("compute request worker") + .enable_all() + .build() + .expect("Failed to create compute request runtime") +}); + +pub static MGMT_REQUEST_RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name("mgmt request worker") + .enable_all() + .build() + .expect("Failed to create mgmt request runtime") +}); + +pub static WALRECEIVER_RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name("walreceiver worker") + .enable_all() + .build() + .expect("Failed to create walreceiver runtime") +}); + +pub static BACKGROUND_RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name("background op worker") // if you change the number of worker threads please change the constant below .enable_all() .build() .expect("Failed to create background op runtime") }); -pub(crate) static THE_RUNTIME_WORKER_THREADS: Lazy = Lazy::new(|| { +pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy = Lazy::new(|| { // force init and thus panics - let _ = THE_RUNTIME.handle(); + let _ = BACKGROUND_RUNTIME.handle(); // replicates tokio-1.28.1::loom::sys::num_cpus which is not available publicly // tokio would had already panicked for parsing errors or NotUnicode // @@ -305,6 +325,7 @@ struct PageServerTask { /// Note: if shutdown_process_on_error is set to true failure /// of the task will lead to shutdown of entire process pub fn spawn( + runtime: &tokio::runtime::Handle, kind: TaskKind, tenant_shard_id: Option, timeline_id: Option, @@ -333,7 +354,7 @@ where let task_name = name.to_string(); let task_cloned = Arc::clone(&task); - let join_handle = THE_RUNTIME.spawn(task_wrapper( + let join_handle = runtime.spawn(task_wrapper( task_name, task_id, task_cloned, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index b923e473ce..dcf9b1a605 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -144,7 +144,6 @@ macro_rules! 
pausable_failpoint { } }; } -pub(crate) use pausable_failpoint; pub mod blob_io; pub mod block_io; @@ -662,6 +661,7 @@ impl Tenant { let tenant_clone = Arc::clone(&tenant); let ctx = ctx.detached_child(TaskKind::Attach, DownloadBehavior::Warn); task_mgr::spawn( + &tokio::runtime::Handle::current(), TaskKind::Attach, Some(tenant_shard_id), None, diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index 3866136dbd..7d37873a67 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -482,6 +482,7 @@ impl DeleteTenantFlow { let tenant_shard_id = tenant.tenant_shard_id; task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), TaskKind::TimelineDeletionWorker, Some(tenant_shard_id), None, diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 34ca43a173..97a505ded9 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1850,6 +1850,7 @@ impl TenantManager { let task_tenant_id = None; task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), TaskKind::MgmtRequest, task_tenant_id, None, @@ -2815,12 +2816,15 @@ pub(crate) fn immediate_gc( // TODO: spawning is redundant now, need to hold the gate task_mgr::spawn( + &tokio::runtime::Handle::current(), TaskKind::GarbageCollector, Some(tenant_shard_id), Some(timeline_id), &format!("timeline_gc_handler garbage collection run for tenant {tenant_shard_id} timeline {timeline_id}"), false, async move { + fail::fail_point!("immediate_gc_task_pre"); + #[allow(unused_mut)] let mut result = tenant .gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index b4b3243d11..cbd942d706 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -223,6 +223,7 @@ use crate::{ config::PageServerConf, task_mgr, task_mgr::TaskKind, + task_mgr::BACKGROUND_RUNTIME, tenant::metadata::TimelineMetadata, tenant::upload_queue::{ UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask, @@ -297,6 +298,8 @@ pub enum PersistIndexPartWithDeletedFlagError { pub struct RemoteTimelineClient { conf: &'static PageServerConf, + runtime: tokio::runtime::Handle, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, generation: Generation, @@ -329,6 +332,12 @@ impl RemoteTimelineClient { ) -> RemoteTimelineClient { RemoteTimelineClient { conf, + runtime: if cfg!(test) { + // remote_timeline_client.rs tests rely on current-thread runtime + tokio::runtime::Handle::current() + } else { + BACKGROUND_RUNTIME.handle().clone() + }, tenant_shard_id, timeline_id, generation, @@ -1264,6 +1273,7 @@ impl RemoteTimelineClient { let tenant_shard_id = self.tenant_shard_id; let timeline_id = self.timeline_id; task_mgr::spawn( + &self.runtime, TaskKind::RemoteUploadTask, Some(self.tenant_shard_id), Some(self.timeline_id), @@ -1857,6 +1867,7 @@ mod tests { fn build_client(&self, generation: Generation) -> Arc { Arc::new(RemoteTimelineClient { conf: self.harness.conf, + runtime: tokio::runtime::Handle::current(), tenant_shard_id: self.harness.tenant_shard_id, timeline_id: TIMELINE_ID, generation, diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index b0babb1308..19f36c722e 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -8,7 +8,7 @@ use std::{sync::Arc, time::SystemTime}; use crate::{ config::PageServerConf, 
disk_usage_eviction_task::DiskUsageEvictionInfo, - task_mgr::{self, TaskKind}, + task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, virtual_file::MaybeFatalIo, }; @@ -317,6 +317,7 @@ pub fn spawn_tasks( tokio::sync::mpsc::channel::>(16); task_mgr::spawn( + BACKGROUND_RUNTIME.handle(), TaskKind::SecondaryDownloads, None, None, @@ -337,6 +338,7 @@ pub fn spawn_tasks( ); task_mgr::spawn( + BACKGROUND_RUNTIME.handle(), TaskKind::SecondaryUploads, None, None, diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index e101a40da4..8ba37b5a86 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1447,7 +1447,7 @@ impl LayerInner { #[cfg(test)] tokio::task::spawn(fut); #[cfg(not(test))] - crate::task_mgr::THE_RUNTIME.spawn(fut); + crate::task_mgr::BACKGROUND_RUNTIME.spawn(fut); } /// Needed to use entered runtime in tests, but otherwise use BACKGROUND_RUNTIME. @@ -1458,7 +1458,7 @@ impl LayerInner { #[cfg(test)] tokio::task::spawn_blocking(f); #[cfg(not(test))] - crate::task_mgr::THE_RUNTIME.spawn_blocking(f); + crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(f); } } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index db32223a60..e4f5f75132 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -8,7 +8,7 @@ use std::time::{Duration, Instant}; use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics::TENANT_TASK_EVENTS; use crate::task_mgr; -use crate::task_mgr::TaskKind; +use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; use crate::tenant::throttle::Stats; use crate::tenant::timeline::CompactionError; use crate::tenant::{Tenant, TenantState}; @@ -18,7 +18,7 @@ use utils::{backoff, completion}; static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy = once_cell::sync::Lazy::new(|| { - let total_threads = *crate::task_mgr::THE_RUNTIME_WORKER_THREADS; + let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS; let permits = usize::max( 1, // while a lot of the work is done on spawn_blocking, we still do @@ -85,6 +85,7 @@ pub fn start_background_loops( ) { let tenant_shard_id = tenant.tenant_shard_id; task_mgr::spawn( + BACKGROUND_RUNTIME.handle(), TaskKind::Compaction, Some(tenant_shard_id), None, @@ -108,6 +109,7 @@ pub fn start_background_loops( }, ); task_mgr::spawn( + BACKGROUND_RUNTIME.handle(), TaskKind::GarbageCollector, Some(tenant_shard_id), None, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 6c6bb4b788..0b8cdac1cc 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1714,6 +1714,7 @@ impl Timeline { initdb_optimization_count: 0, }; task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::LayerFlushTask, Some(self.tenant_shard_id), Some(self.timeline_id), @@ -2076,6 +2077,7 @@ impl Timeline { DownloadBehavior::Download, ); task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::InitialLogicalSizeCalculation, Some(self.tenant_shard_id), Some(self.timeline_id), @@ -2253,6 +2255,7 @@ impl Timeline { DownloadBehavior::Download, ); task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::OndemandLogicalSizeCalculation, Some(self.tenant_shard_id), Some(self.timeline_id), @@ -3828,7 +3831,7 @@ impl Timeline { }; let timer = self.metrics.garbage_collect_histo.start_timer(); - pausable_failpoint!("before-timeline-gc"); + fail_point!("before-timeline-gc"); // Is 
the timeline being deleted? if self.is_stopping() { @@ -4139,6 +4142,7 @@ impl Timeline { let self_clone = Arc::clone(&self); let task_id = task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::DownloadAllRemoteLayers, Some(self.tenant_shard_id), Some(self.timeline_id), diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index e9afbfd8ba..ab0a88c764 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -429,6 +429,7 @@ impl DeleteTimelineFlow { let timeline_id = timeline.timeline_id; task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), TaskKind::TimelineDeletionWorker, Some(tenant_shard_id), Some(timeline_id), diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index f84a4b0dac..dd769d4121 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -28,7 +28,7 @@ use tracing::{debug, error, info, info_span, instrument, warn, Instrument}; use crate::{ context::{DownloadBehavior, RequestContext}, pgdatadir_mapping::CollectKeySpaceError, - task_mgr::{self, TaskKind}, + task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, tenant::{ tasks::BackgroundLoopKind, timeline::EvictionError, LogicalSizeCalculationCause, Tenant, }, @@ -56,6 +56,7 @@ impl Timeline { let self_clone = Arc::clone(self); let background_tasks_can_start = background_tasks_can_start.cloned(); task_mgr::spawn( + BACKGROUND_RUNTIME.handle(), TaskKind::Eviction, Some(self.tenant_shard_id), Some(self.timeline_id), diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index 3592dda8d7..2fab6722b8 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -24,7 +24,7 @@ mod connection_manager; mod walreceiver_connection; use crate::context::{DownloadBehavior, RequestContext}; -use crate::task_mgr::{self, TaskKind}; +use crate::task_mgr::{self, TaskKind, WALRECEIVER_RUNTIME}; use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::timeline::walreceiver::connection_manager::{ connection_manager_loop_step, ConnectionManagerState, @@ -82,6 +82,7 @@ impl WalReceiver { let loop_status = Arc::new(std::sync::RwLock::new(None)); let manager_status = Arc::clone(&loop_status); task_mgr::spawn( + WALRECEIVER_RUNTIME.handle(), TaskKind::WalReceiverManager, Some(timeline.tenant_shard_id), Some(timeline_id), @@ -180,7 +181,7 @@ impl TaskHandle { let (events_sender, events_receiver) = watch::channel(TaskStateUpdate::Started); let cancellation_clone = cancellation.clone(); - let join_handle = tokio::spawn(async move { + let join_handle = WALRECEIVER_RUNTIME.spawn(async move { events_sender.send(TaskStateUpdate::Started).ok(); task(events_sender, cancellation_clone).await // events_sender is dropped at some point during the .await above. 
diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index cf87cc6ce0..d9f780cfd1 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -11,6 +11,7 @@ use std::{ use anyhow::{anyhow, Context}; use bytes::BytesMut; use chrono::{NaiveDateTime, Utc}; +use fail::fail_point; use futures::StreamExt; use postgres::{error::SqlState, SimpleQueryMessage, SimpleQueryRow}; use postgres_ffi::WAL_SEGMENT_SIZE; @@ -26,7 +27,9 @@ use super::TaskStateUpdate; use crate::{ context::RequestContext, metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST}, - task_mgr::{self, TaskKind}, + task_mgr, + task_mgr::TaskKind, + task_mgr::WALRECEIVER_RUNTIME, tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo}, walingest::WalIngest, walrecord::DecodedWALRecord, @@ -160,6 +163,7 @@ pub(super) async fn handle_walreceiver_connection( ); let connection_cancellation = cancellation.clone(); task_mgr::spawn( + WALRECEIVER_RUNTIME.handle(), TaskKind::WalReceiverConnectionPoller, Some(timeline.tenant_shard_id), Some(timeline.timeline_id), @@ -325,17 +329,7 @@ pub(super) async fn handle_walreceiver_connection( filtered_records += 1; } - // don't simply use pausable_failpoint here because its spawn_blocking slows - // slows down the tests too much. - fail::fail_point!("walreceiver-after-ingest-blocking"); - if let Err(()) = (|| { - fail::fail_point!("walreceiver-after-ingest-pause-activate", |_| { - Err(()) - }); - Ok(()) - })() { - pausable_failpoint!("walreceiver-after-ingest-pause"); - } + fail_point!("walreceiver-after-ingest"); last_rec_lsn = lsn; diff --git a/test_runner/regress/test_backpressure.py b/test_runner/regress/test_backpressure.py index af17a2e89d..819912dd05 100644 --- a/test_runner/regress/test_backpressure.py +++ b/test_runner/regress/test_backpressure.py @@ -116,7 +116,7 @@ def test_backpressure_received_lsn_lag(neon_env_builder: NeonEnvBuilder): # Configure failpoint to slow down walreceiver ingest with closing(env.pageserver.connect()) as psconn: with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur: - pscur.execute("failpoints walreceiver-after-ingest-blocking=sleep(20)") + pscur.execute("failpoints walreceiver-after-ingest=sleep(20)") # FIXME # Wait for the check thread to start diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index efd257900d..628c484fbd 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -931,7 +931,7 @@ def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder): env.pageserver.stop() env.pageserver.start( extra_env_vars={ - "FAILPOINTS": "initial-size-calculation-permit-pause=pause;walreceiver-after-ingest-pause-activate=return(1);walreceiver-after-ingest-pause=pause" + "FAILPOINTS": "initial-size-calculation-permit-pause=pause;walreceiver-after-ingest=pause" } ) @@ -953,11 +953,7 @@ def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder): assert details["current_logical_size_is_accurate"] is True client.configure_failpoints( - [ - ("initial-size-calculation-permit-pause", "off"), - ("walreceiver-after-ingest-pause-activate", "off"), - ("walreceiver-after-ingest-pause", "off"), - ] + [("initial-size-calculation-permit-pause", "off"), ("walreceiver-after-ingest", 
"off")] ) @@ -987,7 +983,7 @@ def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder): # pause at logical size calculation, also pause before walreceiver can give feedback so it will give priority to logical size calculation env.pageserver.start( extra_env_vars={ - "FAILPOINTS": "timeline-calculate-logical-size-pause=pause;walreceiver-after-ingest-pause-activate=return(1);walreceiver-after-ingest-pause=pause" + "FAILPOINTS": "timeline-calculate-logical-size-pause=pause;walreceiver-after-ingest=pause" } ) @@ -1033,11 +1029,7 @@ def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder): other_is_attaching() client.configure_failpoints( - [ - ("timeline-calculate-logical-size-pause", "off"), - ("walreceiver-after-ingest-pause-activate", "off"), - ("walreceiver-after-ingest-pause", "off"), - ] + [("timeline-calculate-logical-size-pause", "off"), ("walreceiver-after-ingest", "off")] ) @@ -1067,7 +1059,7 @@ def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_met # pause at logical size calculation, also pause before walreceiver can give feedback so it will give priority to logical size calculation env.pageserver.start( extra_env_vars={ - "FAILPOINTS": "timeline-calculate-logical-size-pause=pause;walreceiver-after-ingest-pause-activate=return(1);walreceiver-after-ingest-pause=pause" + "FAILPOINTS": "timeline-calculate-logical-size-pause=pause;walreceiver-after-ingest=pause" } ) @@ -1119,11 +1111,3 @@ def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_met delete_lazy_activating(lazy_tenant, env.pageserver, expect_attaching=True) else: raise RuntimeError(activation_method) - - client.configure_failpoints( - [ - ("timeline-calculate-logical-size-pause", "off"), - ("walreceiver-after-ingest-pause-activate", "off"), - ("walreceiver-after-ingest-pause", "off"), - ] - ) From de03742ca33ac5881b7639b7cc863c80e0830c53 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 26 Mar 2024 14:35:36 +0000 Subject: [PATCH 0475/1571] pageserver: drop layer map lock in Timeline::get (#7217) ## Problem We currently hold the layer map read lock while doing IO on the read path. This is not required for correctness. ## Summary of changes Drop the layer map lock after figuring out which layer we wish to read from. Why is this correct: * `Layer` models the lifecycle of an on disk layer. In the event the layer is removed from local disk, it will be on demand downloaded * `InMemoryLayer` holds the `EphemeralFile` which wraps the on disk file. As long as the `InMemoryLayer` is in scope, it's safe to read from it. Related https://github.com/neondatabase/neon/issues/6833 --- pageserver/src/tenant/timeline.rs | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 0b8cdac1cc..8b6e93d500 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2587,6 +2587,10 @@ impl Timeline { // Get all the data needed to reconstruct the page version from this layer. // But if we have an older cached page image, no need to go past that. 
let lsn_floor = max(cached_lsn + 1, start_lsn); + + let open_layer = open_layer.clone(); + drop(guard); + result = match open_layer .get_value_reconstruct_data( key, @@ -2604,10 +2608,7 @@ impl Timeline { traversal_path.push(( result, cont_lsn, - Box::new({ - let open_layer = Arc::clone(open_layer); - move || open_layer.traversal_id() - }), + Box::new(move || open_layer.traversal_id()), )); continue 'outer; } @@ -2617,6 +2618,10 @@ impl Timeline { if cont_lsn > start_lsn { //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display()); let lsn_floor = max(cached_lsn + 1, start_lsn); + + let frozen_layer = frozen_layer.clone(); + drop(guard); + result = match frozen_layer .get_value_reconstruct_data( key, @@ -2634,10 +2639,7 @@ impl Timeline { traversal_path.push(( result, cont_lsn, - Box::new({ - let frozen_layer = Arc::clone(frozen_layer); - move || frozen_layer.traversal_id() - }), + Box::new(move || frozen_layer.traversal_id()), )); continue 'outer; } @@ -2645,6 +2647,8 @@ impl Timeline { if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) { let layer = guard.get_from_desc(&layer); + drop(guard); + // Get all the data needed to reconstruct the page version from this layer. // But if we have an older cached page image, no need to go past that. let lsn_floor = max(cached_lsn + 1, lsn_floor); From 3426619a7949dd46652ef89e7b81cbac15a4fbe1 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 26 Mar 2024 15:10:15 +0000 Subject: [PATCH 0476/1571] test_runner/performance: skip test_bulk_insert (#7238) ## Problem `test_bulk_insert` becomes too slow, and it fails constantly: https://github.com/neondatabase/neon/issues/7124 ## Summary of changes - Skip `test_bulk_insert` until it's fixed --- test_runner/performance/test_bulk_insert.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py index 9e3f602237..1df3f2f5f1 100644 --- a/test_runner/performance/test_bulk_insert.py +++ b/test_runner/performance/test_bulk_insert.py @@ -1,5 +1,6 @@ from contextlib import closing +import pytest from fixtures.benchmark_fixture import MetricReport from fixtures.compare_fixtures import NeonCompare, PgCompare from fixtures.pageserver.utils import wait_tenant_status_404 @@ -17,6 +18,7 @@ from fixtures.types import Lsn # 3. Disk space used # 4. Peak memory usage # +@pytest.mark.skip("See https://github.com/neondatabase/neon/issues/7124") def test_bulk_insert(neon_with_baseline: PgCompare): env = neon_with_baseline From 8dfe3a070cd04dd2310ed07e1f38f4257dd43cd8 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 26 Mar 2024 15:20:05 +0000 Subject: [PATCH 0477/1571] pageserver: return 429 on timeline creation in progress (#7225) ## Problem Currently, we return 409 (Conflict) in two cases: - Temporary: Timeline creation cannot proceed because another timeline with the same ID is being created - Permanent: Timeline creation cannot proceed because another timeline exists with different parameters but the same ID. Callers which time out a request and retry should be able to distinguish these cases. 
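As a minimal sketch of the intended caller behaviour, assuming a hypothetical `send_request()` helper that performs the creation POST and returns the HTTP status code (names here are illustrative, not part of the pageserver API):

```python
import time


def create_timeline_with_retries(send_request, max_attempts=10, backoff_s=0.5):
    """Hypothetical retry loop: send_request() issues the creation POST and returns the status code."""
    for _ in range(max_attempts):
        status = send_request()
        if status == 201:
            # Created, or already existed with matching parameters.
            return "created"
        elif status == 429:
            # Another creation for this ID is still in progress: back off and retry.
            time.sleep(backoff_s)
        elif status == 409:
            # Same ID exists with different parameters: retrying will never succeed.
            raise RuntimeError("timeline exists with different parameters")
        else:
            raise RuntimeError(f"unexpected status {status}")
    raise TimeoutError("creation still in progress after retries")
```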
Closes: #7208 ## Summary of changes - Expose `AlreadyCreating` errors as 429 instead of 409 --- pageserver/src/http/openapi_spec.yml | 10 ++++++++-- pageserver/src/http/routes.rs | 11 +++++++---- test_runner/regress/test_tenants.py | 3 +++ 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 0771229845..bb477f89c5 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -1038,7 +1038,7 @@ paths: format: hex responses: "201": - description: TimelineInfo + description: Timeline was created, or already existed with matching parameters content: application/json: schema: @@ -1068,11 +1068,17 @@ paths: schema: $ref: "#/components/schemas/Error" "409": - description: Timeline already exists, creation skipped + description: Timeline already exists, with different parameters. Creation cannot proceed. content: application/json: schema: $ref: "#/components/schemas/ConflictError" + "429": + description: A creation request was sent for the same Timeline Id while a creation was already in progress. Back off and retry. + content: + application/json: + schema: + $ref: "#/components/schemas/Error" "500": description: Generic operation error content: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 26f23fb8c2..3cc92216ed 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -535,10 +535,13 @@ async fn timeline_create_handler( HttpErrorBody::from_msg("Tenant shutting down".to_string()), ) } - Err( - e @ tenant::CreateTimelineError::Conflict - | e @ tenant::CreateTimelineError::AlreadyCreating, - ) => json_response(StatusCode::CONFLICT, HttpErrorBody::from_msg(e.to_string())), + Err(e @ tenant::CreateTimelineError::Conflict) => { + json_response(StatusCode::CONFLICT, HttpErrorBody::from_msg(e.to_string())) + } + Err(e @ tenant::CreateTimelineError::AlreadyCreating) => json_response( + StatusCode::TOO_MANY_REQUESTS, + HttpErrorBody::from_msg(e.to_string()), + ), Err(tenant::CreateTimelineError::AncestorLsn(err)) => json_response( StatusCode::NOT_ACCEPTABLE, HttpErrorBody::from_msg(format!("{err:#}")), diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index f8701b65d7..2832304dcc 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -389,6 +389,9 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder): if e.status_code == 409: log.info(f"delay_ms={delay_ms} 409") pass + elif e.status_code == 429: + log.info(f"delay_ms={delay_ms} 429") + pass elif e.status_code == 400: if "is less than existing" in e.message: # We send creation requests very close together in time: it is expected that these From 47d2b3a4830f6d5ecb84086e785ec0f913390176 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 26 Mar 2024 15:45:32 +0000 Subject: [PATCH 0478/1571] pageserver: limit total ephemeral layer bytes (#7218) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem Follows: https://github.com/neondatabase/neon/pull/7182 - Sufficient concurrent writes could OOM a pageserver from the size of indices on all the InMemoryLayer instances. - Enforcement of checkpoint_period only happened if there were some writes. Closes: https://github.com/neondatabase/neon/issues/6916 ## Summary of changes - Add `ephemeral_bytes_per_memory_kb` config property. 
This controls the ratio of ephemeral layer capacity to memory capacity. The weird unit is to enable making the ratio less than 1:1 (set this property to 1024 to use 1MB of ephemeral layers for every 1MB of RAM, set it smaller to get a fraction). - Implement background layer rolling checks in Timeline::compaction_iteration -- this ensures we apply layer rolling policy in the absence of writes. - During background checks, if the total ephemeral layer size has exceeded the limit, then roll layers whose size is greater than the mean size of all ephemeral layers. - Remove the tick() path from walreceiver: it isn't needed any more now that we do equivalent checks from compaction_iteration. - Add tests for the above. --------- Co-authored-by: Arpad Müller --- Cargo.lock | 1 + pageserver/Cargo.toml | 1 + pageserver/src/config.rs | 25 ++ pageserver/src/tenant/mgr.rs | 14 + pageserver/src/tenant/storage_layer.rs | 2 +- .../tenant/storage_layer/inmemory_layer.rs | 40 ++- pageserver/src/tenant/timeline.rs | 218 ++++++++------ .../walreceiver/walreceiver_connection.rs | 11 - .../regress/test_pageserver_layer_rolling.py | 275 ++++++++++++++++++ .../test_pageserver_small_inmemory_layers.py | 143 --------- 10 files changed, 483 insertions(+), 247 deletions(-) create mode 100644 test_runner/regress/test_pageserver_layer_rolling.py delete mode 100644 test_runner/regress/test_pageserver_small_inmemory_layers.py diff --git a/Cargo.lock b/Cargo.lock index 45b802c54f..c1c245fa9c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3581,6 +3581,7 @@ dependencies = [ "strum_macros", "svg_fmt", "sync_wrapper", + "sysinfo", "tenant_size_model", "thiserror", "tokio", diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index f304294591..7a11610a91 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -59,6 +59,7 @@ signal-hook.workspace = true smallvec = { workspace = true, features = ["write"] } svg_fmt.workspace = true sync_wrapper.workspace = true +sysinfo.workspace = true tokio-tar.workspace = true thiserror.workspace = true tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] } diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index a29719e36f..1837da34ce 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -95,6 +95,8 @@ pub mod defaults { pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true; + pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0; + /// /// Default built-in configuration file. /// @@ -156,6 +158,8 @@ pub mod defaults { #heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY} #secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY} +#ephemeral_bytes_per_memory_kb = {DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB} + [remote_storage] "# @@ -279,6 +283,13 @@ pub struct PageServerConf { pub max_vectored_read_bytes: MaxVectoredReadBytes, pub validate_vectored_get: bool, + + /// How many bytes of ephemeral layer content will we allow per kilobyte of RAM. When this + /// is exceeded, we start proactively closing ephemeral layers to limit the total amount + /// of ephemeral data. + /// + /// Setting this to zero disables limits on total ephemeral layer size. 
+ pub ephemeral_bytes_per_memory_kb: usize, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -400,6 +411,8 @@ struct PageServerConfigBuilder { max_vectored_read_bytes: BuilderValue, validate_vectored_get: BuilderValue, + + ephemeral_bytes_per_memory_kb: BuilderValue, } impl PageServerConfigBuilder { @@ -486,6 +499,7 @@ impl PageServerConfigBuilder { NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(), )), validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET), + ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), } } } @@ -665,6 +679,10 @@ impl PageServerConfigBuilder { self.validate_vectored_get = BuilderValue::Set(value); } + pub fn get_ephemeral_bytes_per_memory_kb(&mut self, value: usize) { + self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value); + } + pub fn build(self) -> anyhow::Result { let default = Self::default_values(); @@ -720,6 +738,7 @@ impl PageServerConfigBuilder { get_vectored_impl, max_vectored_read_bytes, validate_vectored_get, + ephemeral_bytes_per_memory_kb, } CUSTOM LOGIC { @@ -1010,6 +1029,9 @@ impl PageServerConf { "validate_vectored_get" => { builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?) } + "ephemeral_bytes_per_memory_kb" => { + builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize) + } _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -1091,6 +1113,7 @@ impl PageServerConf { .expect("Invalid default constant"), ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, + ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, } } } @@ -1328,6 +1351,7 @@ background_task_maximum_delay = '334 s' .expect("Invalid default constant") ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, + ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB }, "Correct defaults should be used when no config values are provided" ); @@ -1399,6 +1423,7 @@ background_task_maximum_delay = '334 s' .expect("Invalid default constant") ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, + ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB }, "Should be able to parse all basic config values correctly" ); diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 97a505ded9..8e3eae7cf6 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -16,6 +16,7 @@ use std::collections::{BTreeMap, HashMap}; use std::ops::Deref; use std::sync::Arc; use std::time::{Duration, Instant}; +use sysinfo::SystemExt; use tokio::fs; use utils::timeout::{timeout_cancellable, TimeoutCancellableError}; @@ -43,6 +44,7 @@ use crate::tenant::config::{ }; use crate::tenant::delete::DeleteTenantFlow; use crate::tenant::span::debug_assert_current_span_has_tenant_id; +use crate::tenant::storage_layer::inmemory_layer; use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState}; use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TEMP_FILE_SUFFIX}; @@ -543,6 +545,18 @@ pub async fn init_tenant_mgr( let ctx = RequestContext::todo_child(TaskKind::Startup, DownloadBehavior::Warn); + // Initialize dynamic limits that depend on system resources + let system_memory = + sysinfo::System::new_with_specifics(sysinfo::RefreshKind::new().with_memory()) + .total_memory(); + let max_ephemeral_layer_bytes = + conf.ephemeral_bytes_per_memory_kb as u64 * (system_memory / 
1024); + tracing::info!("Initialized ephemeral layer size limit to {max_ephemeral_layer_bytes}, for {system_memory} bytes of memory"); + inmemory_layer::GLOBAL_RESOURCES.max_dirty_bytes.store( + max_ephemeral_layer_bytes, + std::sync::atomic::Ordering::Relaxed, + ); + // Scan local filesystem for attached tenants let tenant_configs = init_load_tenant_configs(conf).await?; diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 5c3bab9868..f44a92a2d7 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -3,7 +3,7 @@ pub mod delta_layer; mod filename; pub mod image_layer; -mod inmemory_layer; +pub(crate) mod inmemory_layer; pub(crate) mod layer; mod layer_desc; diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 869d175d8d..628f12065f 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -89,7 +89,10 @@ impl std::fmt::Debug for InMemoryLayerInner { /// /// This global state is used to implement behaviors that require a global view of the system, e.g. /// rolling layers proactively to limit the total amount of dirty data. -struct GlobalResources { +pub(crate) struct GlobalResources { + // Limit on how high dirty_bytes may grow before we start freezing layers to reduce it. + // Zero means unlimited. + pub(crate) max_dirty_bytes: AtomicU64, // How many bytes are in all EphemeralFile objects dirty_bytes: AtomicU64, // How many layers are contributing to dirty_bytes @@ -118,11 +121,12 @@ impl GlobalResourceUnits { /// Do not call this frequently: all timelines will write to these same global atomics, /// so this is a relatively expensive operation. Wait at least a few seconds between calls. - fn publish_size(&mut self, size: u64) { + /// + /// Returns the effective layer size limit that should be applied, if any, to keep + /// the total number of dirty bytes below the configured maximum. + fn publish_size(&mut self, size: u64) -> Option { let new_global_dirty_bytes = match size.cmp(&self.dirty_bytes) { - Ordering::Equal => { - return; - } + Ordering::Equal => GLOBAL_RESOURCES.dirty_bytes.load(AtomicOrdering::Relaxed), Ordering::Greater => { let delta = size - self.dirty_bytes; let old = GLOBAL_RESOURCES @@ -146,6 +150,21 @@ impl GlobalResourceUnits { TIMELINE_EPHEMERAL_BYTES.set(new_global_dirty_bytes); self.dirty_bytes = size; + + let max_dirty_bytes = GLOBAL_RESOURCES + .max_dirty_bytes + .load(AtomicOrdering::Relaxed); + if max_dirty_bytes > 0 && new_global_dirty_bytes > max_dirty_bytes { + // Set the layer file limit to the average layer size: this implies that all above-average + // sized layers will be elegible for freezing. They will be frozen in the order they + // next enter publish_size. 
+ Some( + new_global_dirty_bytes + / GLOBAL_RESOURCES.dirty_layers.load(AtomicOrdering::Relaxed) as u64, + ) + } else { + None + } } // Call publish_size if the input size differs from last published size by more than @@ -174,7 +193,8 @@ impl Drop for GlobalResourceUnits { } } -static GLOBAL_RESOURCES: GlobalResources = GlobalResources { +pub(crate) static GLOBAL_RESOURCES: GlobalResources = GlobalResources { + max_dirty_bytes: AtomicU64::new(0), dirty_bytes: AtomicU64::new(0), dirty_layers: AtomicUsize::new(0), }; @@ -194,6 +214,10 @@ impl InMemoryLayer { } } + pub(crate) fn try_len(&self) -> Option { + self.inner.try_read().map(|i| i.file.len()).ok() + } + pub(crate) fn assert_writable(&self) { assert!(self.end_lsn.get().is_none()); } @@ -486,10 +510,10 @@ impl InMemoryLayer { Ok(()) } - pub(crate) async fn tick(&self) { + pub(crate) async fn tick(&self) -> Option { let mut inner = self.inner.write().await; let size = inner.file.len(); - inner.resource_units.publish_size(size); + inner.resource_units.publish_size(size) } pub(crate) async fn put_tombstones(&self, _key_ranges: &[(Range, Lsn)]) -> Result<()> { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 8b6e93d500..38292b6d78 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -19,7 +19,7 @@ use pageserver_api::{ keyspace::KeySpaceAccum, models::{ CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, - EvictionPolicy, LayerMapInfo, TimelineState, + EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, TimelineState, }, reltag::BlockNumber, shard::{ShardIdentity, TenantShardId}, @@ -1142,6 +1142,79 @@ impl Timeline { self.flush_frozen_layers_and_wait().await } + /// If there is no writer, and conditions for rolling the latest layer are met, then freeze it. + /// + /// This is for use in background housekeeping, to provide guarantees of layers closing eventually + /// even if there are no ongoing writes to drive that. + async fn maybe_freeze_ephemeral_layer(&self) { + let Ok(_write_guard) = self.write_lock.try_lock() else { + // If the write lock is held, there is an active wal receiver: rolling open layers + // is their responsibility while they hold this lock. + return; + }; + + let Ok(layers_guard) = self.layers.try_read() else { + // Don't block if the layer lock is busy + return; + }; + + let Some(open_layer) = &layers_guard.layer_map().open_layer else { + // No open layer, no work to do. + return; + }; + + let Some(current_size) = open_layer.try_len() else { + // Unexpected: since we hold the write guard, nobody else should be writing to this layer, so + // read lock to get size should always succeed. + tracing::warn!("Lock conflict while reading size of open layer"); + return; + }; + + let current_lsn = self.get_last_record_lsn(); + + let checkpoint_distance_override = open_layer.tick().await; + + if let Some(size_override) = checkpoint_distance_override { + if current_size > size_override { + // This is not harmful, but it only happens in relatively rare cases where + // time-based checkpoints are not happening fast enough to keep the amount of + // ephemeral data within configured limits. It's a sign of stress on the system. 
+ tracing::info!("Early-rolling open layer at size {current_size} (limit {size_override}) due to dirty data pressure"); + } + } + + let checkpoint_distance = + checkpoint_distance_override.unwrap_or(self.get_checkpoint_distance()); + + if self.should_roll( + current_size, + current_size, + checkpoint_distance, + self.get_last_record_lsn(), + self.last_freeze_at.load(), + *self.last_freeze_ts.read().unwrap(), + ) { + match open_layer.info() { + InMemoryLayerInfo::Frozen { lsn_start, lsn_end } => { + // We may reach this point if the layer was already frozen by not yet flushed: flushing + // happens asynchronously in the background. + tracing::debug!( + "Not freezing open layer, it's already frozen ({lsn_start}..{lsn_end})" + ); + } + InMemoryLayerInfo::Open { .. } => { + // Upgrade to a write lock and freeze the layer + drop(layers_guard); + let mut layers_guard = self.layers.write().await; + layers_guard + .try_freeze_in_memory_layer(current_lsn, &self.last_freeze_at) + .await; + } + } + self.flush_frozen_layers(); + } + } + /// Outermost timeline compaction operation; downloads needed layers. pub(crate) async fn compact( self: &Arc, @@ -1164,6 +1237,11 @@ impl Timeline { (guard, permit) }; + // Prior to compaction, check if an open ephemeral layer should be closed: this provides + // background enforcement of checkpoint interval if there is no active WAL receiver, to avoid keeping + // an ephemeral layer open forever when idle. + self.maybe_freeze_ephemeral_layer().await; + // this wait probably never needs any "long time spent" logging, because we already nag if // compaction task goes over it's period (20s) which is quite often in production. let (_guard, _permit) = tokio::select! { @@ -1434,6 +1512,53 @@ impl Timeline { Err(EvictionError::Timeout) => Ok(Some(false)), } } + + fn should_roll( + &self, + layer_size: u64, + projected_layer_size: u64, + checkpoint_distance: u64, + projected_lsn: Lsn, + last_freeze_at: Lsn, + last_freeze_ts: Instant, + ) -> bool { + let distance = projected_lsn.widening_sub(last_freeze_at); + + // Rolling the open layer can be triggered by: + // 1. The distance from the last LSN we rolled at. This bounds the amount of WAL that + // the safekeepers need to store. For sharded tenants, we multiply by shard count to + // account for how writes are distributed across shards: we expect each node to consume + // 1/count of the LSN on average. + // 2. The size of the currently open layer. + // 3. The time since the last roll. It helps safekeepers to regard pageserver as caught + // up and suspend activity. + if distance >= checkpoint_distance as i128 * self.shard_identity.count.count() as i128 { + info!( + "Will roll layer at {} with layer size {} due to LSN distance ({})", + projected_lsn, layer_size, distance + ); + + true + } else if projected_layer_size >= checkpoint_distance { + info!( + "Will roll layer at {} with layer size {} due to layer size ({})", + projected_lsn, layer_size, projected_layer_size + ); + + true + } else if distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() { + info!( + "Will roll layer at {} with layer size {} due to time since last flush ({:?})", + projected_lsn, + layer_size, + last_freeze_ts.elapsed() + ); + + true + } else { + false + } + } } /// Number of times we will compute partition within a checkpoint distance. @@ -4455,52 +4580,6 @@ impl<'a> TimelineWriter<'a> { res } - /// "Tick" the timeline writer: it will roll the open layer if required - /// and do nothing else. 
- pub(crate) async fn tick(&mut self) -> anyhow::Result<()> { - self.open_layer_if_present().await?; - - let last_record_lsn = self.get_last_record_lsn(); - let action = self.get_open_layer_action(last_record_lsn, 0); - if action == OpenLayerAction::Roll { - self.roll_layer(last_record_lsn).await?; - } else if let Some(writer_state) = &mut *self.write_guard { - // Periodic update of statistics - writer_state.open_layer.tick().await; - } - - Ok(()) - } - - /// Populate the timeline writer state only if an in-memory layer - /// is already open. - async fn open_layer_if_present(&mut self) -> anyhow::Result<()> { - assert!(self.write_guard.is_none()); - - let open_layer = { - let guard = self.layers.read().await; - let layers = guard.layer_map(); - match layers.open_layer { - Some(ref open_layer) => open_layer.clone(), - None => { - return Ok(()); - } - } - }; - - let initial_size = open_layer.size().await?; - let last_freeze_at = self.last_freeze_at.load(); - let last_freeze_ts = *self.last_freeze_ts.read().unwrap(); - self.write_guard.replace(TimelineWriterState::new( - open_layer, - initial_size, - last_freeze_at, - last_freeze_ts, - )); - - Ok(()) - } - async fn handle_open_layer_action( &mut self, at: Lsn, @@ -4572,43 +4651,14 @@ impl<'a> TimelineWriter<'a> { return OpenLayerAction::None; } - let distance = lsn.widening_sub(state.cached_last_freeze_at); - let proposed_open_layer_size = state.current_size + new_value_size; - - // Rolling the open layer can be triggered by: - // 1. The distance from the last LSN we rolled at. This bounds the amount of WAL that - // the safekeepers need to store. For sharded tenants, we multiply by shard count to - // account for how writes are distributed across shards: we expect each node to consume - // 1/count of the LSN on average. - // 2. The size of the currently open layer. - // 3. The time since the last roll. It helps safekeepers to regard pageserver as caught - // up and suspend activity. - if distance - >= self.get_checkpoint_distance() as i128 * self.shard_identity.count.count() as i128 - { - info!( - "Will roll layer at {} with layer size {} due to LSN distance ({})", - lsn, state.current_size, distance - ); - - OpenLayerAction::Roll - } else if proposed_open_layer_size >= self.get_checkpoint_distance() { - info!( - "Will roll layer at {} with layer size {} due to layer size ({})", - lsn, state.current_size, proposed_open_layer_size - ); - - OpenLayerAction::Roll - } else if distance > 0 - && state.cached_last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() - { - info!( - "Will roll layer at {} with layer size {} due to time since last flush ({:?})", - lsn, - state.current_size, - state.cached_last_freeze_ts.elapsed() - ); - + if self.tl.should_roll( + state.current_size, + state.current_size + new_value_size, + self.get_checkpoint_distance(), + lsn, + state.cached_last_freeze_at, + state.cached_last_freeze_ts, + ) { OpenLayerAction::Roll } else { OpenLayerAction::None diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index d9f780cfd1..00a9dbd760 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -389,17 +389,6 @@ pub(super) async fn handle_walreceiver_connection( } } - { - // This is a hack. 
It piggybacks on the keepalive messages sent by the - // safekeeper in order to enforce `checkpoint_timeout` on the currently - // open layer. This hack doesn't provide a bound on the total size of - // in-memory layers on a pageserver. See https://github.com/neondatabase/neon/issues/6916. - let mut writer = timeline.writer().await; - if let Err(err) = writer.tick().await { - warn!("Timeline writer tick failed: {err}"); - } - } - if let Some(last_lsn) = status_update { let timeline_remote_consistent_lsn = timeline .get_remote_consistent_lsn_visible() diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py new file mode 100644 index 0000000000..c7e1e88468 --- /dev/null +++ b/test_runner/regress/test_pageserver_layer_rolling.py @@ -0,0 +1,275 @@ +import asyncio +import os +from typing import Tuple + +import psutil +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + tenant_get_shards, +) +from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload +from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.utils import wait_until + +TIMELINE_COUNT = 10 +ENTRIES_PER_TIMELINE = 10_000 +CHECKPOINT_TIMEOUT_SECONDS = 60 + + +async def run_worker(env: NeonEnv, tenant_conf, entries: int) -> Tuple[TenantId, TimelineId, Lsn]: + tenant, timeline = env.neon_cli.create_tenant(conf=tenant_conf) + with env.endpoints.create_start("main", tenant_id=tenant) as ep: + conn = await ep.connect_async() + try: + await conn.execute("CREATE TABLE IF NOT EXISTS t(key serial primary key, value text)") + await conn.execute( + f"INSERT INTO t SELECT i, CONCAT('payload_', i) FROM generate_series(0,{entries}) as i" + ) + finally: + await conn.close(timeout=10) + + last_flush_lsn = Lsn(ep.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + return tenant, timeline, last_flush_lsn + + +async def workload( + env: NeonEnv, tenant_conf, timelines: int, entries: int +) -> list[Tuple[TenantId, TimelineId, Lsn]]: + workers = [asyncio.create_task(run_worker(env, tenant_conf, entries)) for _ in range(timelines)] + return await asyncio.gather(*workers) + + +def wait_until_pageserver_is_caught_up( + env: NeonEnv, last_flush_lsns: list[Tuple[TenantId, TimelineId, Lsn]] +): + for tenant, timeline, last_flush_lsn in last_flush_lsns: + shards = tenant_get_shards(env, tenant) + for tenant_shard_id, pageserver in shards: + waited = wait_for_last_record_lsn( + pageserver.http_client(), tenant_shard_id, timeline, last_flush_lsn + ) + assert waited >= last_flush_lsn + + +def wait_until_pageserver_has_uploaded( + env: NeonEnv, last_flush_lsns: list[Tuple[TenantId, TimelineId, Lsn]] +): + for tenant, timeline, last_flush_lsn in last_flush_lsns: + shards = tenant_get_shards(env, tenant) + for tenant_shard_id, pageserver in shards: + wait_for_upload(pageserver.http_client(), tenant_shard_id, timeline, last_flush_lsn) + + +def wait_for_wal_ingest_metric(pageserver_http: PageserverHttpClient) -> float: + def query(): + value = pageserver_http.get_metric_value("pageserver_wal_ingest_records_received_total") + assert value is not None + return value + + # The metric gets initialised on the first update. + # Retry a few times, but return 0 if it's stable. 
+ try: + return float(wait_until(3, 0.5, query)) + except Exception: + return 0 + + +def get_dirty_bytes(env): + v = env.pageserver.http_client().get_metric_value("pageserver_timeline_ephemeral_bytes") or 0 + log.info(f"dirty_bytes: {v}") + return v + + +def assert_dirty_bytes(env, v): + assert get_dirty_bytes(env) == v + + +def assert_dirty_bytes_nonzero(env): + assert get_dirty_bytes(env) > 0 + + +@pytest.mark.parametrize("immediate_shutdown", [True, False]) +def test_pageserver_small_inmemory_layers( + neon_env_builder: NeonEnvBuilder, immediate_shutdown: bool +): + """ + Test that open layers get flushed after the `checkpoint_timeout` config + and do not require WAL reingest upon restart. + + The workload creates a number of timelines and writes some data to each, + but not enough to trigger flushes via the `checkpoint_distance` config. + """ + tenant_conf = { + # Large `checkpoint_distance` effectively disables size + # based checkpointing. + "checkpoint_distance": f"{2 * 1024 ** 3}", + "checkpoint_timeout": f"{CHECKPOINT_TIMEOUT_SECONDS}s", + "compaction_period": "1s", + } + + env = neon_env_builder.init_configs() + env.start() + + last_flush_lsns = asyncio.run(workload(env, tenant_conf, TIMELINE_COUNT, ENTRIES_PER_TIMELINE)) + wait_until_pageserver_is_caught_up(env, last_flush_lsns) + + # We didn't write enough data to trigger a size-based checkpoint: we should see dirty data. + wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) # type: ignore + + ps_http_client = env.pageserver.http_client() + total_wal_ingested_before_restart = wait_for_wal_ingest_metric(ps_http_client) + + # Within ~ the checkpoint interval, all the ephemeral layers should be frozen and flushed, + # such that there are zero bytes of ephemeral layer left on the pageserver + log.info("Waiting for background checkpoints...") + wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0)) # type: ignore + + # Zero ephemeral layer bytes does not imply that all the frozen layers were uploaded: they + # must be uploaded to remain visible to the pageserver after restart. + wait_until_pageserver_has_uploaded(env, last_flush_lsns) + + env.pageserver.restart(immediate=immediate_shutdown) + wait_until_pageserver_is_caught_up(env, last_flush_lsns) + + # Catching up with WAL ingest should have resulted in zero bytes of ephemeral layers, since + # we froze, flushed and uploaded everything before restarting. There can be no more WAL writes + # because we shut down compute endpoints before flushing. + assert get_dirty_bytes(env) == 0 + + total_wal_ingested_after_restart = wait_for_wal_ingest_metric(ps_http_client) + + log.info(f"WAL ingested before restart: {total_wal_ingested_before_restart}") + log.info(f"WAL ingested after restart: {total_wal_ingested_after_restart}") + + assert total_wal_ingested_after_restart == 0 + + +def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder): + """ + Test that `checkpoint_timeout` is enforced even if there is no safekeeper input. + """ + tenant_conf = { + # Large `checkpoint_distance` effectively disables size + # based checkpointing. 
+ "checkpoint_distance": f"{2 * 1024 ** 3}", + "checkpoint_timeout": f"{CHECKPOINT_TIMEOUT_SECONDS}s", + "compaction_period": "1s", + } + + env = neon_env_builder.init_configs() + env.start() + + last_flush_lsns = asyncio.run(workload(env, tenant_conf, TIMELINE_COUNT, ENTRIES_PER_TIMELINE)) + wait_until_pageserver_is_caught_up(env, last_flush_lsns) + + # We didn't write enough data to trigger a size-based checkpoint: we should see dirty data. + wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) # type: ignore + + # Stop the safekeepers, so that we cannot have any more WAL receiver connections + for sk in env.safekeepers: + sk.stop() + + # We should have got here fast enough that we didn't hit the background interval yet, + # and the teardown of SK connections shouldn't prompt any layer freezing. + assert get_dirty_bytes(env) > 0 + + # Within ~ the checkpoint interval, all the ephemeral layers should be frozen and flushed, + # such that there are zero bytes of ephemeral layer left on the pageserver + log.info("Waiting for background checkpoints...") + wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0)) # type: ignore + + +@pytest.mark.skipif( + # We have to use at least ~100MB of data to hit the lowest limit we can configure, which is + # prohibitively slow in debug mode + os.getenv("BUILD_TYPE") == "debug", + reason="Avoid running bulkier ingest tests in debug mode", +) +def test_total_size_limit(neon_env_builder: NeonEnvBuilder): + """ + Test that checkpoints are done based on total ephemeral layer size, even if no one timeline is + individually exceeding checkpoint thresholds. + """ + + system_memory = psutil.virtual_memory().total + + # The smallest total size limit we can configure is 1/1024th of the system memory (e.g. 128MB on + # a system with 128GB of RAM). We will then write enough data to violate this limit. 
+ max_dirty_data = 128 * 1024 * 1024 + ephemeral_bytes_per_memory_kb = (max_dirty_data * 1024) // system_memory + assert ephemeral_bytes_per_memory_kb > 0 + + neon_env_builder.pageserver_config_override = f""" + ephemeral_bytes_per_memory_kb={ephemeral_bytes_per_memory_kb} + """ + + compaction_period_s = 10 + + tenant_conf = { + # Large space + time thresholds: effectively disable these limits + "checkpoint_distance": f"{1024 ** 4}", + "checkpoint_timeout": "3600s", + "compaction_period": f"{compaction_period_s}s", + } + + env = neon_env_builder.init_configs() + env.start() + + timeline_count = 10 + + # This is about 2MiB of data per timeline + entries_per_timeline = 100_000 + + last_flush_lsns = asyncio.run(workload(env, tenant_conf, timeline_count, entries_per_timeline)) + wait_until_pageserver_is_caught_up(env, last_flush_lsns) + + total_bytes_ingested = 0 + for tenant, timeline, last_flush_lsn in last_flush_lsns: + http_client = env.pageserver.http_client() + initdb_lsn = Lsn(http_client.timeline_detail(tenant, timeline)["initdb_lsn"]) + total_bytes_ingested += last_flush_lsn - initdb_lsn + + log.info(f"Ingested {total_bytes_ingested} bytes since initdb (vs max dirty {max_dirty_data})") + assert total_bytes_ingested > max_dirty_data + + # Expected end state: the total physical size of all the tenants is in excess of the max dirty + # data, but the total amount of dirty data is less than the limit: this demonstrates that we + # have exceeded the threshold but then rolled layers in response + def get_total_historic_layers(): + total_ephemeral_layers = 0 + total_historic_bytes = 0 + for tenant, timeline, _last_flush_lsn in last_flush_lsns: + http_client = env.pageserver.http_client() + initdb_lsn = Lsn(http_client.timeline_detail(tenant, timeline)["initdb_lsn"]) + layer_map = http_client.layer_map_info(tenant, timeline) + total_historic_bytes += sum( + layer.layer_file_size + for layer in layer_map.historic_layers + if layer.layer_file_size is not None and Lsn(layer.lsn_start) > initdb_lsn + ) + total_ephemeral_layers += len(layer_map.in_memory_layers) + + log.info( + f"Total historic layer bytes: {total_historic_bytes} ({total_ephemeral_layers} ephemeral layers)" + ) + + return total_historic_bytes + + def assert_bytes_rolled(): + assert total_bytes_ingested - get_total_historic_layers() <= max_dirty_data + + # Wait until enough layers have rolled that the amount of dirty data is under the threshold. + # We do this indirectly via layer maps, rather than the dirty bytes metric, to avoid false-passing + # if that metric isn't updated quickly enough to reflect the dirty bytes exceeding the limit. 
+ wait_until(compaction_period_s * 2, 1, assert_bytes_rolled) + + # The end state should also have the reported metric under the limit + def assert_dirty_data_limited(): + dirty_bytes = get_dirty_bytes(env) + assert dirty_bytes < max_dirty_data + + wait_until(compaction_period_s * 2, 1, lambda: assert_dirty_data_limited()) # type: ignore diff --git a/test_runner/regress/test_pageserver_small_inmemory_layers.py b/test_runner/regress/test_pageserver_small_inmemory_layers.py deleted file mode 100644 index 714d1c1229..0000000000 --- a/test_runner/regress/test_pageserver_small_inmemory_layers.py +++ /dev/null @@ -1,143 +0,0 @@ -import asyncio -from typing import Tuple - -import pytest -from fixtures.log_helper import log -from fixtures.neon_fixtures import ( - NeonEnv, - NeonEnvBuilder, - tenant_get_shards, -) -from fixtures.pageserver.http import PageserverHttpClient -from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload -from fixtures.types import Lsn, TenantId, TimelineId -from fixtures.utils import wait_until - -TIMELINE_COUNT = 10 -ENTRIES_PER_TIMELINE = 10_000 -CHECKPOINT_TIMEOUT_SECONDS = 60 - -TENANT_CONF = { - # Large `checkpoint_distance` effectively disables size - # based checkpointing. - "checkpoint_distance": f"{2 * 1024 ** 3}", - "checkpoint_timeout": f"{CHECKPOINT_TIMEOUT_SECONDS}s", -} - - -async def run_worker(env: NeonEnv, entries: int) -> Tuple[TenantId, TimelineId, Lsn]: - tenant, timeline = env.neon_cli.create_tenant(conf=TENANT_CONF) - with env.endpoints.create_start("main", tenant_id=tenant) as ep: - conn = await ep.connect_async() - try: - await conn.execute("CREATE TABLE IF NOT EXISTS t(key serial primary key, value text)") - await conn.execute( - f"INSERT INTO t SELECT i, CONCAT('payload_', i) FROM generate_series(0,{entries}) as i" - ) - finally: - await conn.close(timeout=10) - - last_flush_lsn = Lsn(ep.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) - return tenant, timeline, last_flush_lsn - - -async def workload( - env: NeonEnv, timelines: int, entries: int -) -> list[Tuple[TenantId, TimelineId, Lsn]]: - workers = [asyncio.create_task(run_worker(env, entries)) for _ in range(timelines)] - return await asyncio.gather(*workers) - - -def wait_until_pageserver_is_caught_up( - env: NeonEnv, last_flush_lsns: list[Tuple[TenantId, TimelineId, Lsn]] -): - for tenant, timeline, last_flush_lsn in last_flush_lsns: - shards = tenant_get_shards(env, tenant) - for tenant_shard_id, pageserver in shards: - waited = wait_for_last_record_lsn( - pageserver.http_client(), tenant_shard_id, timeline, last_flush_lsn - ) - assert waited >= last_flush_lsn - - -def wait_until_pageserver_has_uploaded( - env: NeonEnv, last_flush_lsns: list[Tuple[TenantId, TimelineId, Lsn]] -): - for tenant, timeline, last_flush_lsn in last_flush_lsns: - shards = tenant_get_shards(env, tenant) - for tenant_shard_id, pageserver in shards: - wait_for_upload(pageserver.http_client(), tenant_shard_id, timeline, last_flush_lsn) - - -def wait_for_wal_ingest_metric(pageserver_http: PageserverHttpClient) -> float: - def query(): - value = pageserver_http.get_metric_value("pageserver_wal_ingest_records_received_total") - assert value is not None - return value - - # The metric gets initialised on the first update. - # Retry a few times, but return 0 if it's stable. 
- try: - return float(wait_until(3, 0.5, query)) - except Exception: - return 0 - - -@pytest.mark.parametrize("immediate_shutdown", [True, False]) -def test_pageserver_small_inmemory_layers( - neon_env_builder: NeonEnvBuilder, immediate_shutdown: bool -): - """ - Test that open layers get flushed after the `checkpoint_timeout` config - and do not require WAL reingest upon restart. - - The workload creates a number of timelines and writes some data to each, - but not enough to trigger flushes via the `checkpoint_distance` config. - """ - - def get_dirty_bytes(): - v = ( - env.pageserver.http_client().get_metric_value("pageserver_timeline_ephemeral_bytes") - or 0 - ) - log.info(f"dirty_bytes: {v}") - return v - - def assert_dirty_bytes(v): - assert get_dirty_bytes() == v - - env = neon_env_builder.init_configs() - env.start() - - last_flush_lsns = asyncio.run(workload(env, TIMELINE_COUNT, ENTRIES_PER_TIMELINE)) - wait_until_pageserver_is_caught_up(env, last_flush_lsns) - - # We didn't write enough data to trigger a size-based checkpoint - assert get_dirty_bytes() > 0 - - ps_http_client = env.pageserver.http_client() - total_wal_ingested_before_restart = wait_for_wal_ingest_metric(ps_http_client) - - # Within ~ the checkpoint interval, all the ephemeral layers should be frozen and flushed, - # such that there are zero bytes of ephemeral layer left on the pageserver - log.info("Waiting for background checkpoints...") - wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(0)) # type: ignore - - # Zero ephemeral layer bytes does not imply that all the frozen layers were uploaded: they - # must be uploaded to remain visible to the pageserver after restart. - wait_until_pageserver_has_uploaded(env, last_flush_lsns) - - env.pageserver.restart(immediate=immediate_shutdown) - wait_until_pageserver_is_caught_up(env, last_flush_lsns) - - # Catching up with WAL ingest should have resulted in zero bytes of ephemeral layers, since - # we froze, flushed and uploaded everything before restarting. There can be no more WAL writes - # because we shut down compute endpoints before flushing. - assert get_dirty_bytes() == 0 - - total_wal_ingested_after_restart = wait_for_wal_ingest_metric(ps_http_client) - - log.info(f"WAL ingested before restart: {total_wal_ingested_before_restart}") - log.info(f"WAL ingested after restart: {total_wal_ingested_after_restart}") - - assert total_wal_ingested_after_restart == 0 From b3bb1d1cad76f1a6cddf4c94d240705f8d58c427 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 26 Mar 2024 16:57:35 +0000 Subject: [PATCH 0479/1571] storage controller: make direct tenant creation more robust (#7247) ## Problem - Creations were not idempotent (unique key violation) - Creations waited for reconciliation, which control plane blocks while an operation is in flight ## Summary of changes - Handle unique key constraint violation as an OK situation: if we're creating the same tenant ID and shard count, it's reasonable to assume this is a duplicate creation. 
- Make the wait for reconcile during creation tolerate failures: this is similar to location_conf, where the cloud control plane blocks our notification calls until it is done with calling into our API (in future this constraint is expected to relax as the cloud control plane learns to run multiple operations concurrently for a tenant) --- .../attachment_service/src/service.rs | 31 +++++++++++++++---- test_runner/fixtures/neon_fixtures.py | 3 ++ test_runner/regress/test_sharding_service.py | 5 +++ 3 files changed, 33 insertions(+), 6 deletions(-) diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index aa930014b2..925910253b 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -1523,6 +1523,8 @@ impl Service { &self, create_req: TenantCreateRequest, ) -> Result { + let tenant_id = create_req.new_tenant_id.tenant_id; + // Exclude any concurrent attempts to create/access the same tenant ID let _tenant_lock = self .tenant_op_locks @@ -1531,7 +1533,12 @@ impl Service { let (response, waiters) = self.do_tenant_create(create_req).await?; - self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await?; + if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await { + // Avoid deadlock: reconcile may fail while notifying compute, if the cloud control plane refuses to + // accept compute notifications while it is in the process of creating. Reconciliation will + // be retried in the background. + tracing::warn!(%tenant_id, "Reconcile not done yet while creating tenant ({e})"); + } Ok(response) } @@ -1610,13 +1617,25 @@ impl Service { splitting: SplitState::default(), }) .collect(); - self.persistence + + match self + .persistence .insert_tenant_shards(persist_tenant_shards) .await - .map_err(|e| { - // TODO: distinguish primary key constraint (idempotent, OK), from other errors - ApiError::InternalServerError(anyhow::anyhow!(e)) - })?; + { + Ok(_) => {} + Err(DatabaseError::Query(diesel::result::Error::DatabaseError( + DatabaseErrorKind::UniqueViolation, + _, + ))) => { + // Unique key violation: this is probably a retry. Because the shard count is part of the unique key, + // if we see a unique key violation it means that the creation request's shard count matches the previous + // creation's shard count. + tracing::info!("Tenant shards already present in database, proceeding with idempotent creation..."); + } + // Any other database error is unexpected and a bug. 
+ Err(e) => return Err(ApiError::InternalServerError(anyhow::anyhow!(e))), + }; let (waiters, response_shards) = { let mut locked = self.inner.write().unwrap(); diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index f22ce10c20..3d60f9bef5 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2126,6 +2126,8 @@ class NeonStorageController(MetricsGetter): shard_params = {"count": shard_count} if shard_stripe_size is not None: shard_params["stripe_size"] = shard_stripe_size + else: + shard_params["stripe_size"] = 32768 body["shard_parameters"] = shard_params @@ -2139,6 +2141,7 @@ class NeonStorageController(MetricsGetter): json=body, headers=self.headers(TokenScope.PAGE_SERVER_API), ) + response.raise_for_status() log.info(f"tenant_create success: {response.json()}") def locate(self, tenant_id: TenantId) -> list[dict[str, Any]]: diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py index b7488cadd6..fc6c137667 100644 --- a/test_runner/regress/test_sharding_service.py +++ b/test_runner/regress/test_sharding_service.py @@ -89,6 +89,11 @@ def test_sharding_service_smoke( for tid in tenant_ids: env.neon_cli.create_tenant(tid, shard_count=shards_per_tenant) + # Repeating a creation should be idempotent (we are just testing it doesn't return an error) + env.storage_controller.tenant_create( + tenant_id=next(iter(tenant_ids)), shard_count=shards_per_tenant + ) + for node_id, count in get_node_shard_counts(env, tenant_ids).items(): # we used a multiple of pagservers for the total shard count, # so expect equal number on all pageservers From 6814bb4b59809b7d08657fd57a05b6f6dbf7a409 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 26 Mar 2024 17:44:18 +0000 Subject: [PATCH 0480/1571] tests: add a log allow list to stabilize benchmarks (#7251) ## Problem https://github.com/neondatabase/neon/pull/7227 destabilized various tests in the performance suite, with log errors during shutdown. It's because we switched shutdown order to stop the storage controller before the pageservers. ## Summary of changes - Tolerate "connection failed" errors from pageservers trying to validation their deletion queue. --- test_runner/fixtures/pageserver/allowed_errors.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index d7f682dad3..8b895dcd92 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -86,6 +86,9 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( # This is especially pronounced in tests that set small checkpoint # distances. ".*Flushed oversized open layer with size.*", + # During teardown, we stop the storage controller before the pageservers, so pageservers + # can experience connection errors doing background deletion queue work. + ".*WARN deletion backend: calling control plane generation validation API failed.*Connection refused.*", ) From b3b7ce457cdb5d0f6aa9d01cb3aaedf16c6d51c3 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 26 Mar 2024 18:29:08 +0000 Subject: [PATCH 0481/1571] pageserver: remove bare mgr::get_tenant, mgr::list_tenants (#7237) ## Problem This is a refactor. 
This PR was a precursor to a much smaller change https://github.com/neondatabase/neon/commit/e5bd602dc11dc0fbc483ce0abaf486b3407f42dc, where as I was writing it I found that we were not far from getting rid of the last non-deprecated code paths that use `mgr::` scoped functions to get at the TenantManager state. We're almost done cleaning this up as per https://github.com/neondatabase/neon/issues/5796. The only significant remaining mgr:: item is `get_active_tenant_with_timeout`, which is page_service's path for fetching tenants. ## Summary of changes - Remove the bool argument to get_attached_tenant_shard: this was almost always false from API use cases, and in cases when it was true, it was readily replacable with an explicit check of the returned tenant's status. - Rather than letting the timeline eviction task query any tenant it likes via `mgr::`, pass an `Arc` into the task. This is still an ugly circular reference, but should eventually go away: either when we switch to exclusively using disk usage eviction, or when we change metadata storage to avoid the need to imitate layer accesses. - Convert all the mgr::get_tenant call sites to use TenantManager::get_attached_tenant_shard - Move list_tenants into TenantManager. --- pageserver/src/bin/pageserver.rs | 54 +++--- pageserver/src/consumption_metrics.rs | 42 +++-- pageserver/src/consumption_metrics/metrics.rs | 7 +- pageserver/src/disk_usage_eviction_task.rs | 13 +- pageserver/src/http/routes.rs | 110 +++++++----- pageserver/src/page_service.rs | 1 + pageserver/src/tenant.rs | 18 +- pageserver/src/tenant/mgr.rs | 168 ++++-------------- .../src/tenant/secondary/heatmap_uploader.rs | 6 +- pageserver/src/tenant/timeline.rs | 3 +- .../src/tenant/timeline/eviction_task.rs | 37 ++-- pageserver/src/tenant/timeline/uninit.rs | 3 +- .../regress/test_pageserver_secondary.py | 2 + 13 files changed, 221 insertions(+), 243 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index ef616c0a39..c80230d4d7 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -600,33 +600,37 @@ fn start_pageserver( None, "consumption metrics collection", true, - async move { - // first wait until background jobs are cleared to launch. - // - // this is because we only process active tenants and timelines, and the - // Timeline::get_current_logical_size will spawn the logical size calculation, - // which will not be rate-limited. - let cancel = task_mgr::shutdown_token(); + { + let tenant_manager = tenant_manager.clone(); + async move { + // first wait until background jobs are cleared to launch. + // + // this is because we only process active tenants and timelines, and the + // Timeline::get_current_logical_size will spawn the logical size calculation, + // which will not be rate-limited. + let cancel = task_mgr::shutdown_token(); - tokio::select! { - _ = cancel.cancelled() => { return Ok(()); }, - _ = background_jobs_barrier.wait() => {} - }; + tokio::select! 
{ + _ = cancel.cancelled() => { return Ok(()); }, + _ = background_jobs_barrier.wait() => {} + }; - pageserver::consumption_metrics::collect_metrics( - metric_collection_endpoint, - &conf.metric_collection_bucket, - conf.metric_collection_interval, - conf.cached_metric_collection_interval, - conf.synthetic_size_calculation_interval, - conf.id, - local_disk_storage, - cancel, - metrics_ctx, - ) - .instrument(info_span!("metrics_collection")) - .await?; - Ok(()) + pageserver::consumption_metrics::collect_metrics( + tenant_manager, + metric_collection_endpoint, + &conf.metric_collection_bucket, + conf.metric_collection_interval, + conf.cached_metric_collection_interval, + conf.synthetic_size_calculation_interval, + conf.id, + local_disk_storage, + cancel, + metrics_ctx, + ) + .instrument(info_span!("metrics_collection")) + .await?; + Ok(()) + } }, ); } diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index 3429e3a0a6..f5540e896f 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -3,7 +3,9 @@ use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; use crate::tenant::tasks::BackgroundLoopKind; -use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError, Tenant}; +use crate::tenant::{ + mgr::TenantManager, LogicalSizeCalculationCause, PageReconstructError, Tenant, +}; use camino::Utf8PathBuf; use consumption_metrics::EventType; use pageserver_api::models::TenantState; @@ -41,6 +43,7 @@ type Cache = HashMap; /// Main thread that serves metrics collection #[allow(clippy::too_many_arguments)] pub async fn collect_metrics( + tenant_manager: Arc, metric_collection_endpoint: &Url, metric_collection_bucket: &Option, metric_collection_interval: Duration, @@ -67,15 +70,19 @@ pub async fn collect_metrics( None, "synthetic size calculation", false, - async move { - calculate_synthetic_size_worker( - synthetic_size_calculation_interval, - &cancel, - &worker_ctx, - ) - .instrument(info_span!("synthetic_size_worker")) - .await?; - Ok(()) + { + let tenant_manager = tenant_manager.clone(); + async move { + calculate_synthetic_size_worker( + tenant_manager, + synthetic_size_calculation_interval, + &cancel, + &worker_ctx, + ) + .instrument(info_span!("synthetic_size_worker")) + .await?; + Ok(()) + } }, ); @@ -116,7 +123,7 @@ pub async fn collect_metrics( let started_at = Instant::now(); // these are point in time, with variable "now" - let metrics = metrics::collect_all_metrics(&cached_metrics, &ctx).await; + let metrics = metrics::collect_all_metrics(&tenant_manager, &cached_metrics, &ctx).await; let metrics = Arc::new(metrics); @@ -271,6 +278,7 @@ async fn reschedule( /// Caclculate synthetic size for each active tenant async fn calculate_synthetic_size_worker( + tenant_manager: Arc, synthetic_size_calculation_interval: Duration, cancel: &CancellationToken, ctx: &RequestContext, @@ -283,7 +291,7 @@ async fn calculate_synthetic_size_worker( loop { let started_at = Instant::now(); - let tenants = match mgr::list_tenants().await { + let tenants = match tenant_manager.list_tenants() { Ok(tenants) => tenants, Err(e) => { warn!("cannot get tenant list: {e:#}"); @@ -302,10 +310,14 @@ async fn calculate_synthetic_size_worker( continue; } - let Ok(tenant) = mgr::get_tenant(tenant_shard_id, true) else { + let Ok(tenant) = tenant_manager.get_attached_tenant_shard(tenant_shard_id) else { continue; }; + if !tenant.is_active() { + continue; + } + // there is never 
any reason to exit calculate_synthetic_size_worker following any // return value -- we don't need to care about shutdown because no tenant is found when // pageserver is shut down. @@ -343,9 +355,7 @@ async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &Re }; // this error can be returned if timeline is shutting down, but it does not - // mean the synthetic size worker should terminate. we do not need any checks - // in this function because `mgr::get_tenant` will error out after shutdown has - // progressed to shutting down tenants. + // mean the synthetic size worker should terminate. let shutting_down = matches!( e.downcast_ref::(), Some(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_)) diff --git a/pageserver/src/consumption_metrics/metrics.rs b/pageserver/src/consumption_metrics/metrics.rs index 26b299a71d..6740c1360b 100644 --- a/pageserver/src/consumption_metrics/metrics.rs +++ b/pageserver/src/consumption_metrics/metrics.rs @@ -1,3 +1,4 @@ +use crate::tenant::mgr::TenantManager; use crate::{context::RequestContext, tenant::timeline::logical_size::CurrentLogicalSize}; use chrono::{DateTime, Utc}; use consumption_metrics::EventType; @@ -181,6 +182,7 @@ impl MetricsKey { } pub(super) async fn collect_all_metrics( + tenant_manager: &Arc, cached_metrics: &Cache, ctx: &RequestContext, ) -> Vec { @@ -188,7 +190,7 @@ pub(super) async fn collect_all_metrics( let started_at = std::time::Instant::now(); - let tenants = match crate::tenant::mgr::list_tenants().await { + let tenants = match tenant_manager.list_tenants() { Ok(tenants) => tenants, Err(err) => { tracing::error!("failed to list tenants: {:?}", err); @@ -200,7 +202,8 @@ pub(super) async fn collect_all_metrics( if state != TenantState::Active || !id.is_zero() { None } else { - crate::tenant::mgr::get_tenant(id, true) + tenant_manager + .get_attached_tenant_shard(id) .ok() .map(|tenant| (id.tenant_id, tenant)) } diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 92c1475aef..6248424cee 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -61,7 +61,6 @@ use crate::{ metrics::disk_usage_based_eviction::METRICS, task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, tenant::{ - self, mgr::TenantManager, remote_timeline_client::LayerFileMetadata, secondary::SecondaryTenant, @@ -814,8 +813,8 @@ async fn collect_eviction_candidates( const LOG_DURATION_THRESHOLD: std::time::Duration = std::time::Duration::from_secs(10); // get a snapshot of the list of tenants - let tenants = tenant::mgr::list_tenants() - .await + let tenants = tenant_manager + .list_tenants() .context("get list of tenants")?; // TODO: avoid listing every layer in every tenant: this loop can block the executor, @@ -827,8 +826,12 @@ async fn collect_eviction_candidates( if cancel.is_cancelled() { return Ok(EvictionCandidates::Cancelled); } - let tenant = match tenant::mgr::get_tenant(tenant_id, true) { - Ok(tenant) => tenant, + let tenant = match tenant_manager.get_attached_tenant_shard(tenant_id) { + Ok(tenant) if tenant.is_active() => tenant, + Ok(_) => { + debug!(tenant_id=%tenant_id.tenant_id, shard_id=%tenant_id.shard_slug(), "Tenant shard is not active"); + continue; + } Err(e) => { // this can happen if tenant has lifecycle transition after we fetched it debug!("failed to get tenant: {e:#}"); diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 3cc92216ed..759a1b25ee 100644 --- 
a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -49,8 +49,8 @@ use crate::task_mgr::TaskKind; use crate::tenant::config::{LocationConf, TenantConfOpt}; use crate::tenant::mgr::GetActiveTenantError; use crate::tenant::mgr::{ - GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError, - TenantSlotError, TenantSlotUpsertError, TenantStateError, + GetTenantError, TenantManager, TenantMapError, TenantMapInsertError, TenantSlotError, + TenantSlotUpsertError, TenantStateError, }; use crate::tenant::mgr::{TenantSlot, UpsertLocationError}; use crate::tenant::remote_timeline_client; @@ -249,16 +249,11 @@ impl From for ApiError { fn from(tse: GetTenantError) -> ApiError { match tse { GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()), - GetTenantError::Broken(reason) => { - ApiError::InternalServerError(anyhow!("tenant is broken: {}", reason)) - } GetTenantError::NotActive(_) => { // Why is this not `ApiError::NotFound`? // Because we must be careful to never return 404 for a tenant if it does // in fact exist locally. If we did, the caller could draw the conclusion // that it can attach the tenant to another PS and we'd be in split-brain. - // - // (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls). ApiError::ResourceUnavailable("Tenant not yet active".into()) } GetTenantError::MapState(e) => ApiError::ResourceUnavailable(format!("{e}").into()), @@ -269,6 +264,9 @@ impl From for ApiError { impl From for ApiError { fn from(e: GetActiveTenantError) -> ApiError { match e { + GetActiveTenantError::Broken(reason) => { + ApiError::InternalServerError(anyhow!("tenant is broken: {}", reason)) + } GetActiveTenantError::WillNotBecomeActive(_) => ApiError::Conflict(format!("{}", e)), GetActiveTenantError::Cancelled => ApiError::ShuttingDown, GetActiveTenantError::NotFound(gte) => gte.into(), @@ -279,19 +277,6 @@ impl From for ApiError { } } -impl From for ApiError { - fn from(e: SetNewTenantConfigError) -> ApiError { - match e { - SetNewTenantConfigError::GetTenant(tid) => { - ApiError::NotFound(anyhow!("tenant {}", tid).into()) - } - e @ (SetNewTenantConfigError::Persist(_) | SetNewTenantConfigError::Other(_)) => { - ApiError::InternalServerError(anyhow::Error::new(e)) - } - } - } -} - impl From for ApiError { fn from(value: crate::tenant::DeleteTimelineError) -> Self { use crate::tenant::DeleteTimelineError::*; @@ -495,7 +480,7 @@ async fn timeline_create_handler( async { let tenant = state .tenant_manager - .get_attached_tenant_shard(tenant_shard_id, false)?; + .get_attached_tenant_shard(tenant_shard_id)?; tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; @@ -584,7 +569,7 @@ async fn timeline_list_handler( let response_data = async { let tenant = state .tenant_manager - .get_attached_tenant_shard(tenant_shard_id, false)?; + .get_attached_tenant_shard(tenant_shard_id)?; tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; @@ -622,6 +607,7 @@ async fn timeline_preserve_initdb_handler( let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); // Part of the process for disaster recovery from safekeeper-stored WAL: // If we don't recover into a new timeline but want to keep the timeline ID, @@ -629,7 +615,9 @@ async fn timeline_preserve_initdb_handler( // location where 
timeline recreation cand find it. async { - let tenant = mgr::get_tenant(tenant_shard_id, false)?; + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; let timeline = tenant .get_timeline(timeline_id, false) @@ -671,7 +659,7 @@ async fn timeline_detail_handler( let timeline_info = async { let tenant = state .tenant_manager - .get_attached_tenant_shard(tenant_shard_id, false)?; + .get_attached_tenant_shard(tenant_shard_id)?; tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; @@ -858,7 +846,7 @@ async fn timeline_delete_handler( let tenant = state .tenant_manager - .get_attached_tenant_shard(tenant_shard_id, false) + .get_attached_tenant_shard(tenant_shard_id) .map_err(|e| { match e { // GetTenantError has a built-in conversion to ApiError, but in this context we don't @@ -976,10 +964,11 @@ async fn tenant_list_handler( _cancel: CancellationToken, ) -> Result, ApiError> { check_permission(&request, None)?; + let state = get_state(&request); - let response_data = mgr::list_tenants() - .instrument(info_span!("tenant_list")) - .await + let response_data = state + .tenant_manager + .list_tenants() .map_err(|_| { ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".into()) })? @@ -1002,9 +991,12 @@ async fn tenant_status( ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); let tenant_info = async { - let tenant = mgr::get_tenant(tenant_shard_id, false)?; + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; // Calculate total physical size of all timelines let mut current_physical_size = 0; @@ -1077,9 +1069,7 @@ async fn tenant_size_handler( let inputs_only: Option = parse_query_param(&request, "inputs_only")?; let retention_period: Option = parse_query_param(&request, "retention_period")?; let headers = request.headers(); - - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let tenant = mgr::get_tenant(tenant_shard_id, true)?; + let state = get_state(&request); if !tenant_shard_id.is_zero() { return Err(ApiError::BadRequest(anyhow!( @@ -1087,6 +1077,12 @@ async fn tenant_size_handler( ))); } + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + // this can be long operation let inputs = tenant .gather_size_inputs( @@ -1155,10 +1151,15 @@ async fn tenant_shard_split_handler( let state = get_state(&request); let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + let new_shards = state .tenant_manager .shard_split( - tenant_shard_id, + tenant, ShardCount::new(req.new_shard_count), req.new_stripe_size, &ctx, @@ -1376,8 +1377,11 @@ async fn get_tenant_config_handler( ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); - let tenant = mgr::get_tenant(tenant_shard_id, false)?; + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; let response = HashMap::from([ ( @@ -1405,15 
+1409,31 @@ async fn update_tenant_config_handler( let tenant_id = request_data.tenant_id; check_permission(&request, Some(tenant_id))?; - let tenant_conf = + let new_tenant_conf = TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?; let state = get_state(&request); - state + + let tenant_shard_id = TenantShardId::unsharded(tenant_id); + + let tenant = state .tenant_manager - .set_new_tenant_config(tenant_conf, tenant_id) - .instrument(info_span!("tenant_config", %tenant_id)) - .await?; + .get_attached_tenant_shard(tenant_shard_id)?; + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + + // This is a legacy API that only operates on attached tenants: the preferred + // API to use is the location_config/ endpoint, which lets the caller provide + // the full LocationConf. + let location_conf = LocationConf::attached_single( + new_tenant_conf.clone(), + tenant.get_generation(), + &ShardParameters::default(), + ); + + crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf) + .await + .map_err(ApiError::InternalServerError)?; + tenant.set_new_tenant_config(new_tenant_conf); json_response(StatusCode::OK, ()) } @@ -1637,10 +1657,12 @@ async fn handle_tenant_break( ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?; - let tenant = crate::tenant::mgr::get_tenant(tenant_shard_id, true) - .map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?; - - tenant.set_broken("broken from test".to_owned()).await; + let state = get_state(&r); + state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)? + .set_broken("broken from test".to_owned()) + .await; json_response(StatusCode::OK, ()) } @@ -1884,7 +1906,7 @@ async fn active_timeline_of_active_tenant( tenant_shard_id: TenantShardId, timeline_id: TimelineId, ) -> Result, ApiError> { - let tenant = tenant_manager.get_attached_tenant_shard(tenant_shard_id, false)?; + let tenant = tenant_manager.get_attached_tenant_shard(tenant_shard_id)?; tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index f3ceb7d3e6..3d622f1871 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -760,6 +760,7 @@ impl PageServerHandler { let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &tenant.cancel))); timeline .import_basebackup_from_tar( + tenant.clone(), &mut copyin_reader, base_lsn, self.broker_client.clone(), diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index dcf9b1a605..792d9e548d 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1411,7 +1411,7 @@ impl Tenant { /// the same timeline ID already exists, returns CreateTimelineError::AlreadyExists. 
#[allow(clippy::too_many_arguments)] pub(crate) async fn create_timeline( - &self, + self: &Arc, new_timeline_id: TimelineId, ancestor_timeline_id: Option, mut ancestor_start_lsn: Option, @@ -1559,7 +1559,7 @@ impl Tenant { })?; } - loaded_timeline.activate(broker_client, None, ctx); + loaded_timeline.activate(self.clone(), broker_client, None, ctx); Ok(loaded_timeline) } @@ -1731,7 +1731,12 @@ impl Tenant { let mut activated_timelines = 0; for timeline in timelines_to_activate { - timeline.activate(broker_client.clone(), background_jobs_can_start, ctx); + timeline.activate( + self.clone(), + broker_client.clone(), + background_jobs_can_start, + ctx, + ); activated_timelines += 1; } @@ -2063,7 +2068,12 @@ impl Tenant { TenantState::Active { .. } => { return Ok(()); } - TenantState::Broken { .. } | TenantState::Stopping { .. } => { + TenantState::Broken { reason, .. } => { + // This is fatal, and reported distinctly from the general case of "will never be active" because + // it's logically a 500 to external API users (broken is always a bug). + return Err(GetActiveTenantError::Broken(reason)); + } + TenantState::Stopping { .. } => { // There's no chance the tenant can transition back into ::Active return Err(GetActiveTenantError::WillNotBecomeActive(current_state)); } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 8e3eae7cf6..f01fb9791c 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -4,7 +4,7 @@ use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf}; use itertools::Itertools; use pageserver_api::key::Key; -use pageserver_api::models::{LocationConfigMode, ShardParameters}; +use pageserver_api::models::LocationConfigMode; use pageserver_api::shard::{ ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId, }; @@ -40,7 +40,6 @@ use crate::metrics::{TENANT, TENANT_MANAGER as METRICS}; use crate::task_mgr::{self, TaskKind}; use crate::tenant::config::{ AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig, - TenantConfOpt, }; use crate::tenant::delete::DeleteTenantFlow; use crate::tenant::span::debug_assert_current_span_has_tenant_id; @@ -889,16 +888,6 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { // caller will log how long we took } -#[derive(Debug, thiserror::Error)] -pub(crate) enum SetNewTenantConfigError { - #[error(transparent)] - GetTenant(#[from] GetTenantError), - #[error(transparent)] - Persist(anyhow::Error), - #[error(transparent)] - Other(anyhow::Error), -} - #[derive(thiserror::Error, Debug)] pub(crate) enum UpsertLocationError { #[error("Bad config request: {0}")] @@ -924,32 +913,21 @@ impl TenantManager { self.conf } - /// Gets the attached tenant from the in-memory data, erroring if it's absent, in secondary mode, or is not fitting to the query. - /// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants. + /// Gets the attached tenant from the in-memory data, erroring if it's absent, in secondary mode, or currently + /// undergoing a state change (i.e. slot is InProgress). + /// + /// The return Tenant is not guaranteed to be active: check its status after obtaing it, or + /// use [`Tenant::wait_to_become_active`] before using it if you will do I/O on it. 
pub(crate) fn get_attached_tenant_shard( &self, tenant_shard_id: TenantShardId, - active_only: bool, ) -> Result, GetTenantError> { let locked = self.tenants.read().unwrap(); let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)?; match peek_slot { - Some(TenantSlot::Attached(tenant)) => match tenant.current_state() { - TenantState::Broken { - reason, - backtrace: _, - } if active_only => Err(GetTenantError::Broken(reason)), - TenantState::Active => Ok(Arc::clone(tenant)), - _ => { - if active_only { - Err(GetTenantError::NotActive(tenant_shard_id)) - } else { - Ok(Arc::clone(tenant)) - } - } - }, + Some(TenantSlot::Attached(tenant)) => Ok(Arc::clone(tenant)), Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_shard_id)), None | Some(TenantSlot::Secondary(_)) => { Err(GetTenantError::NotFound(tenant_shard_id.tenant_id)) @@ -1442,7 +1420,8 @@ impl TenantManager { .wait_to_become_active(activation_timeout) .await .map_err(|e| match e { - GetActiveTenantError::WillNotBecomeActive(_) => { + GetActiveTenantError::WillNotBecomeActive(_) + | GetActiveTenantError::Broken(_) => { DeleteTenantError::InvalidState(tenant.current_state()) } GetActiveTenantError::Cancelled => DeleteTenantError::Cancelled, @@ -1469,29 +1448,30 @@ impl TenantManager { result } - #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), new_shard_count=%new_shard_count.literal()))] + #[instrument(skip_all, fields(tenant_id=%tenant.get_tenant_shard_id().tenant_id, shard_id=%tenant.get_tenant_shard_id().shard_slug(), new_shard_count=%new_shard_count.literal()))] pub(crate) async fn shard_split( &self, - tenant_shard_id: TenantShardId, + tenant: Arc, new_shard_count: ShardCount, new_stripe_size: Option, ctx: &RequestContext, ) -> anyhow::Result> { + let tenant_shard_id = *tenant.get_tenant_shard_id(); let r = self - .do_shard_split(tenant_shard_id, new_shard_count, new_stripe_size, ctx) + .do_shard_split(tenant, new_shard_count, new_stripe_size, ctx) .await; if r.is_err() { // Shard splitting might have left the original shard in a partially shut down state (it // stops the shard's remote timeline client). Reset it to ensure we leave things in // a working state. if self.get(tenant_shard_id).is_some() { - tracing::warn!("Resetting {tenant_shard_id} after shard split failure"); + tracing::warn!("Resetting after shard split failure"); if let Err(e) = self.reset_tenant(tenant_shard_id, false, ctx).await { // Log this error because our return value will still be the original error, not this one. This is // a severe error: if this happens, we might be leaving behind a tenant that is not fully functional // (e.g. has uploads disabled). We can't do anything else: if reset fails then shutting the tenant down or // setting it broken probably won't help either. 
- tracing::error!("Failed to reset {tenant_shard_id}: {e}"); + tracing::error!("Failed to reset: {e}"); } } } @@ -1501,12 +1481,12 @@ impl TenantManager { pub(crate) async fn do_shard_split( &self, - tenant_shard_id: TenantShardId, + tenant: Arc, new_shard_count: ShardCount, new_stripe_size: Option, ctx: &RequestContext, ) -> anyhow::Result> { - let tenant = get_tenant(tenant_shard_id, true)?; + let tenant_shard_id = *tenant.get_tenant_shard_id(); // Validate the incoming request if new_shard_count.count() <= tenant_shard_id.shard_count.count() { @@ -1552,7 +1532,6 @@ impl TenantManager { // If [`Tenant::split_prepare`] fails, we must reload the tenant, because it might // have been left in a partially-shut-down state. tracing::warn!("Failed to prepare for split: {e}, reloading Tenant before returning"); - self.reset_tenant(tenant_shard_id, false, ctx).await?; return Err(e); } @@ -1950,38 +1929,23 @@ impl TenantManager { removal_result } - pub(crate) async fn set_new_tenant_config( + pub(crate) fn list_tenants( &self, - new_tenant_conf: TenantConfOpt, - tenant_id: TenantId, - ) -> Result<(), SetNewTenantConfigError> { - // Legacy API: does not support sharding - let tenant_shard_id = TenantShardId::unsharded(tenant_id); - - info!("configuring tenant {tenant_id}"); - let tenant = get_tenant(tenant_shard_id, true)?; - - if !tenant.tenant_shard_id().shard_count.is_unsharded() { - // Note that we use ShardParameters::default below. - return Err(SetNewTenantConfigError::Other(anyhow::anyhow!( - "This API may only be used on single-sharded tenants, use the /location_config API for sharded tenants" - ))); - } - - // This is a legacy API that only operates on attached tenants: the preferred - // API to use is the location_config/ endpoint, which lets the caller provide - // the full LocationConf. - let location_conf = LocationConf::attached_single( - new_tenant_conf.clone(), - tenant.generation, - &ShardParameters::default(), - ); - - Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &location_conf) - .await - .map_err(SetNewTenantConfigError::Persist)?; - tenant.set_new_tenant_config(new_tenant_conf); - Ok(()) + ) -> Result, TenantMapListError> { + let tenants = TENANTS.read().unwrap(); + let m = match &*tenants { + TenantsMap::Initializing => return Err(TenantMapListError::Initializing), + TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m, + }; + Ok(m.iter() + .filter_map(|(id, tenant)| match tenant { + TenantSlot::Attached(tenant) => { + Some((*id, tenant.current_state(), tenant.generation())) + } + TenantSlot::Secondary(_) => None, + TenantSlot::InProgress(_) => None, + }) + .collect()) } } @@ -1994,51 +1958,12 @@ pub(crate) enum GetTenantError { #[error("Tenant {0} is not active")] NotActive(TenantShardId), - /// Broken is logically a subset of NotActive, but a distinct error is useful as - /// NotActive is usually a retryable state for API purposes, whereas Broken - /// is a stuck error state - #[error("Tenant is broken: {0}")] - Broken(String), // Initializing or shutting down: cannot authoritatively say whether we have this tenant #[error("Tenant map is not available: {0}")] MapState(#[from] TenantMapError), } -/// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query. -/// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants. -/// -/// This method is cancel-safe. 
-pub(crate) fn get_tenant( - tenant_shard_id: TenantShardId, - active_only: bool, -) -> Result, GetTenantError> { - let locked = TENANTS.read().unwrap(); - - let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)?; - - match peek_slot { - Some(TenantSlot::Attached(tenant)) => match tenant.current_state() { - TenantState::Broken { - reason, - backtrace: _, - } if active_only => Err(GetTenantError::Broken(reason)), - TenantState::Active => Ok(Arc::clone(tenant)), - _ => { - if active_only { - Err(GetTenantError::NotActive(tenant_shard_id)) - } else { - Ok(Arc::clone(tenant)) - } - } - }, - Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_shard_id)), - None | Some(TenantSlot::Secondary(_)) => { - Err(GetTenantError::NotFound(tenant_shard_id.tenant_id)) - } - } -} - #[derive(thiserror::Error, Debug)] pub(crate) enum GetActiveTenantError { /// We may time out either while TenantSlot is InProgress, or while the Tenant @@ -2062,6 +1987,12 @@ pub(crate) enum GetActiveTenantError { /// Tenant exists, but is in a state that cannot become active (e.g. Stopping, Broken) #[error("will not become active. Current state: {0}")] WillNotBecomeActive(TenantState), + + /// Broken is logically a subset of WillNotBecomeActive, but a distinct error is useful as + /// WillNotBecomeActive is a permitted error under some circumstances, whereas broken should + /// never happen. + #[error("Tenant is broken: {0}")] + Broken(String), } /// Get a [`Tenant`] in its active state. If the tenant_id is currently in [`TenantSlot::InProgress`] @@ -2281,27 +2212,6 @@ pub(crate) enum TenantMapListError { Initializing, } -/// -/// Get list of tenants, for the mgmt API -/// -pub(crate) async fn list_tenants( -) -> Result, TenantMapListError> { - let tenants = TENANTS.read().unwrap(); - let m = match &*tenants { - TenantsMap::Initializing => return Err(TenantMapListError::Initializing), - TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m, - }; - Ok(m.iter() - .filter_map(|(id, tenant)| match tenant { - TenantSlot::Attached(tenant) => { - Some((*id, tenant.current_state(), tenant.generation())) - } - TenantSlot::Secondary(_) => None, - TenantSlot::InProgress(_) => None, - }) - .collect()) -} - #[derive(Debug, thiserror::Error)] pub(crate) enum TenantMapInsertError { #[error(transparent)] diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs index a8b05f4c0e..39d088ffc3 100644 --- a/pageserver/src/tenant/secondary/heatmap_uploader.rs +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -9,6 +9,7 @@ use crate::{ metrics::SECONDARY_MODE, tenant::{ config::AttachmentMode, + mgr::GetTenantError, mgr::TenantManager, remote_timeline_client::remote_heatmap_path, span::debug_assert_current_span_has_tenant_id, @@ -292,8 +293,11 @@ impl JobGenerator "Starting heatmap write on command"); let tenant = self .tenant_manager - .get_attached_tenant_shard(*tenant_shard_id, true) + .get_attached_tenant_shard(*tenant_shard_id) .map_err(|e| anyhow::anyhow!(e))?; + if !tenant.is_active() { + return Err(GetTenantError::NotActive(*tenant_shard_id).into()); + } Ok(UploadPending { // Ignore our state for last digest: this forces an upload even if nothing has changed diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 38292b6d78..bc3fc1df1f 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1274,6 +1274,7 @@ impl Timeline { pub(crate) fn activate( 
self: &Arc, + parent: Arc, broker_client: BrokerClientChannel, background_jobs_can_start: Option<&completion::Barrier>, ctx: &RequestContext, @@ -1284,7 +1285,7 @@ impl Timeline { } self.launch_wal_receiver(ctx, broker_client); self.set_state(TimelineState::Active); - self.launch_eviction_task(background_jobs_can_start); + self.launch_eviction_task(parent, background_jobs_can_start); } /// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index dd769d4121..ebcd70bd39 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -51,6 +51,7 @@ pub struct EvictionTaskTenantState { impl Timeline { pub(super) fn launch_eviction_task( self: &Arc, + parent: Arc, background_tasks_can_start: Option<&completion::Barrier>, ) { let self_clone = Arc::clone(self); @@ -72,14 +73,14 @@ impl Timeline { _ = completion::Barrier::maybe_wait(background_tasks_can_start) => {} }; - self_clone.eviction_task(cancel).await; + self_clone.eviction_task(parent, cancel).await; Ok(()) }, ); } #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))] - async fn eviction_task(self: Arc, cancel: CancellationToken) { + async fn eviction_task(self: Arc, tenant: Arc, cancel: CancellationToken) { use crate::tenant::tasks::random_init_delay; // acquire the gate guard only once within a useful span @@ -103,7 +104,7 @@ impl Timeline { loop { let policy = self.get_eviction_policy(); let cf = self - .eviction_iteration(&policy, &cancel, &guard, &ctx) + .eviction_iteration(&tenant, &policy, &cancel, &guard, &ctx) .await; match cf { @@ -123,6 +124,7 @@ impl Timeline { #[instrument(skip_all, fields(policy_kind = policy.discriminant_str()))] async fn eviction_iteration( self: &Arc, + tenant: &Tenant, policy: &EvictionPolicy, cancel: &CancellationToken, gate: &GateGuard, @@ -137,7 +139,7 @@ impl Timeline { } EvictionPolicy::LayerAccessThreshold(p) => { match self - .eviction_iteration_threshold(p, cancel, gate, ctx) + .eviction_iteration_threshold(tenant, p, cancel, gate, ctx) .await { ControlFlow::Break(()) => return ControlFlow::Break(()), @@ -146,7 +148,11 @@ impl Timeline { (p.period, p.threshold) } EvictionPolicy::OnlyImitiate(p) => { - if self.imitiate_only(p, cancel, gate, ctx).await.is_break() { + if self + .imitiate_only(tenant, p, cancel, gate, ctx) + .await + .is_break() + { return ControlFlow::Break(()); } (p.period, p.threshold) @@ -175,6 +181,7 @@ impl Timeline { async fn eviction_iteration_threshold( self: &Arc, + tenant: &Tenant, p: &EvictionPolicyLayerAccessThreshold, cancel: &CancellationToken, gate: &GateGuard, @@ -193,7 +200,10 @@ impl Timeline { _ = self.cancel.cancelled() => return ControlFlow::Break(()), }; - match self.imitate_layer_accesses(p, cancel, gate, ctx).await { + match self + .imitate_layer_accesses(tenant, p, cancel, gate, ctx) + .await + { ControlFlow::Break(()) => return ControlFlow::Break(()), ControlFlow::Continue(()) => (), } @@ -315,6 +325,7 @@ impl Timeline { /// disk usage based eviction task. 
async fn imitiate_only( self: &Arc, + tenant: &Tenant, p: &EvictionPolicyLayerAccessThreshold, cancel: &CancellationToken, gate: &GateGuard, @@ -331,7 +342,8 @@ impl Timeline { _ = self.cancel.cancelled() => return ControlFlow::Break(()), }; - self.imitate_layer_accesses(p, cancel, gate, ctx).await + self.imitate_layer_accesses(tenant, p, cancel, gate, ctx) + .await } /// If we evict layers but keep cached values derived from those layers, then @@ -361,6 +373,7 @@ impl Timeline { #[instrument(skip_all)] async fn imitate_layer_accesses( &self, + tenant: &Tenant, p: &EvictionPolicyLayerAccessThreshold, cancel: &CancellationToken, gate: &GateGuard, @@ -396,17 +409,11 @@ impl Timeline { // Make one of the tenant's timelines draw the short straw and run the calculation. // The others wait until the calculation is done so that they take into account the // imitated accesses that the winner made. - let tenant = match crate::tenant::mgr::get_tenant(self.tenant_shard_id, true) { - Ok(t) => t, - Err(_) => { - return ControlFlow::Break(()); - } - }; let mut state = tenant.eviction_task_tenant_state.lock().await; match state.last_layer_access_imitation { Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ } _ => { - self.imitate_synthetic_size_calculation_worker(&tenant, cancel, ctx) + self.imitate_synthetic_size_calculation_worker(tenant, cancel, ctx) .await; state.last_layer_access_imitation = Some(tokio::time::Instant::now()); } @@ -480,7 +487,7 @@ impl Timeline { #[instrument(skip_all)] async fn imitate_synthetic_size_calculation_worker( &self, - tenant: &Arc, + tenant: &Tenant, cancel: &CancellationToken, ctx: &RequestContext, ) { diff --git a/pageserver/src/tenant/timeline/uninit.rs b/pageserver/src/tenant/timeline/uninit.rs index e1034a9fe2..2b60e670ea 100644 --- a/pageserver/src/tenant/timeline/uninit.rs +++ b/pageserver/src/tenant/timeline/uninit.rs @@ -86,6 +86,7 @@ impl<'t> UninitializedTimeline<'t> { /// Prepares timeline data by loading it from the basebackup archive. pub(crate) async fn import_basebackup_from_tar( self, + tenant: Arc, copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin), base_lsn: Lsn, broker_client: storage_broker::BrokerClientChannel, @@ -114,7 +115,7 @@ impl<'t> UninitializedTimeline<'t> { // All the data has been imported. Insert the Timeline into the tenant's timelines map let tl = self.finish_creation()?; - tl.activate(broker_client, None, ctx); + tl.activate(tenant, broker_client, None, ctx); Ok(tl) } diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 25510c50e6..ca6f77c75f 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -90,6 +90,8 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): # this shutdown case is logged at WARN severity by the time it bubbles up to logical size calculation code # WARN ...: initial size calculation failed: downloading failed, possibly for shutdown ".*downloading failed, possibly for shutdown", + # {tenant_id=... timeline_id=...}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1664/0/1260 blkno=0 req_lsn=0/149F0D8}: error reading relation or page version: Not found: will not become active. 
Current state: Stopping\n' + ".*page_service.*will not become active.*", ] ) From 12512f31736a5c5b3d3973c5c5cfd43dd58acb3d Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 26 Mar 2024 19:31:19 +0000 Subject: [PATCH 0482/1571] add authentication rate limiting (#6865) ## Problem https://github.com/neondatabase/cloud/issues/9642 ## Summary of changes 1. Make `EndpointRateLimiter` generic, renamed as `BucketRateLimiter` 2. Add support for claiming multiple tokens at once 3. Add `AuthRateLimiter` alias. 4. Check `(Endpoint, IP)` pair during authentication, weighted by how many hashes proxy would be doing. TODO: handle ipv6 subnets. will do this in a separate PR. --- libs/metrics/src/hll.rs | 2 +- proxy/src/auth/backend.rs | 90 ++++++++++++++++---- proxy/src/bin/proxy.rs | 13 ++- proxy/src/cache/common.rs | 10 +++ proxy/src/cache/project_info.rs | 30 ++----- proxy/src/config.rs | 8 +- proxy/src/metrics.rs | 22 ++++- proxy/src/proxy.rs | 2 +- proxy/src/proxy/tests.rs | 10 +-- proxy/src/rate_limiter.rs | 2 +- proxy/src/rate_limiter/limiter.rs | 132 +++++++++++++++++++++--------- proxy/src/scram/secret.rs | 12 +-- proxy/src/serverless/backend.rs | 7 +- 13 files changed, 241 insertions(+), 99 deletions(-) diff --git a/libs/metrics/src/hll.rs b/libs/metrics/src/hll.rs index 46a623b0e2..dfb4461ce9 100644 --- a/libs/metrics/src/hll.rs +++ b/libs/metrics/src/hll.rs @@ -40,7 +40,7 @@ macro_rules! register_hll { }}; ($N:literal, $NAME:expr, $HELP:expr $(,)?) => {{ - $crate::register_hll!($N, $crate::opts!($NAME, $HELP), $LABELS_NAMES) + $crate::register_hll!($N, $crate::opts!($NAME, $HELP)) }}; } diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 04fe83d8eb..e421798067 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -12,6 +12,8 @@ use crate::console::errors::GetAuthInfoError; use crate::console::provider::{CachedRoleSecret, ConsoleBackend}; use crate::console::{AuthSecret, NodeInfo}; use crate::context::RequestMonitoring; +use crate::intern::EndpointIdInt; +use crate::metrics::{AUTH_RATE_LIMIT_HITS, ENDPOINTS_AUTH_RATE_LIMITED}; use crate::proxy::connect_compute::ComputeConnectBackend; use crate::proxy::NeonOptions; use crate::stream::Stream; @@ -28,7 +30,7 @@ use crate::{ use crate::{scram, EndpointCacheKey, EndpointId, RoleName}; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::info; +use tracing::{info, warn}; /// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality pub enum MaybeOwned<'a, T> { @@ -174,6 +176,52 @@ impl TryFrom for ComputeUserInfo { } } +impl AuthenticationConfig { + pub fn check_rate_limit( + &self, + + ctx: &mut RequestMonitoring, + secret: AuthSecret, + endpoint: &EndpointId, + is_cleartext: bool, + ) -> auth::Result { + // we have validated the endpoint exists, so let's intern it. + let endpoint_int = EndpointIdInt::from(endpoint); + + // only count the full hash count if password hack or websocket flow. + // in other words, if proxy needs to run the hashing + let password_weight = if is_cleartext { + match &secret { + #[cfg(any(test, feature = "testing"))] + AuthSecret::Md5(_) => 1, + AuthSecret::Scram(s) => s.iterations + 1, + } + } else { + // validating scram takes just 1 hmac_sha_256 operation. 
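+ // (e.g. a cleartext flow against a 4096-iteration SCRAM secret would weigh 4097 tokens, vs. 1 here)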
+ 1 + }; + + let limit_not_exceeded = self + .rate_limiter + .check((endpoint_int, ctx.peer_addr), password_weight); + + if !limit_not_exceeded { + warn!( + enabled = self.rate_limiter_enabled, + "rate limiting authentication" + ); + AUTH_RATE_LIMIT_HITS.inc(); + ENDPOINTS_AUTH_RATE_LIMITED.measure(endpoint); + + if self.rate_limiter_enabled { + return Err(auth::AuthError::too_many_connections()); + } + } + + Ok(secret) + } +} + /// True to its name, this function encapsulates our current auth trade-offs. /// Here, we choose the appropriate auth flow based on circumstances. /// @@ -214,14 +262,24 @@ async fn auth_quirks( Some(secret) => secret, None => api.get_role_secret(ctx, &info).await?, }; + let (cached_entry, secret) = cached_secret.take_value(); + + let secret = match secret { + Some(secret) => config.check_rate_limit( + ctx, + secret, + &info.endpoint, + unauthenticated_password.is_some() || allow_cleartext, + )?, + None => { + // If we don't have an authentication secret, we mock one to + // prevent malicious probing (possible due to missing protocol steps). + // This mocked secret will never lead to successful authentication. + info!("authentication info not found, mocking it"); + AuthSecret::Scram(scram::ServerSecret::mock(rand::random())) + } + }; - let secret = cached_secret.value.clone().unwrap_or_else(|| { - // If we don't have an authentication secret, we mock one to - // prevent malicious probing (possible due to missing protocol steps). - // This mocked secret will never lead to successful authentication. - info!("authentication info not found, mocking it"); - AuthSecret::Scram(scram::ServerSecret::mock(&info.user, rand::random())) - }); match authenticate_with_secret( ctx, secret, @@ -237,7 +295,7 @@ async fn auth_quirks( Err(e) => { if e.is_auth_failed() { // The password could have been changed, so we invalidate the cache. 
- cached_secret.invalidate(); + cached_entry.invalidate(); } Err(e) } @@ -415,6 +473,7 @@ mod tests { use bytes::BytesMut; use fallible_iterator::FallibleIterator; + use once_cell::sync::Lazy; use postgres_protocol::{ authentication::sasl::{ChannelBinding, ScramSha256}, message::{backend::Message as PgMessage, frontend}, @@ -432,6 +491,7 @@ mod tests { }, context::RequestMonitoring, proxy::NeonOptions, + rate_limiter::{AuthRateLimiter, RateBucketInfo}, scram::ServerSecret, stream::{PqStream, Stream}, }; @@ -473,9 +533,11 @@ mod tests { } } - static CONFIG: &AuthenticationConfig = &AuthenticationConfig { + static CONFIG: Lazy = Lazy::new(|| AuthenticationConfig { scram_protocol_timeout: std::time::Duration::from_secs(5), - }; + rate_limiter_enabled: true, + rate_limiter: AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET), + }); async fn read_message(r: &mut (impl AsyncRead + Unpin), b: &mut BytesMut) -> PgMessage { loop { @@ -544,7 +606,7 @@ mod tests { } }); - let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, false, CONFIG) + let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, false, &CONFIG) .await .unwrap(); @@ -584,7 +646,7 @@ mod tests { client.write_all(&write).await.unwrap(); }); - let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, CONFIG) + let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, &CONFIG) .await .unwrap(); @@ -624,7 +686,7 @@ mod tests { client.write_all(&write).await.unwrap(); }); - let creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, CONFIG) + let creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, &CONFIG) .await .unwrap(); diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index d38439c2a0..88b847f5f1 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -18,6 +18,7 @@ use proxy::console; use proxy::context::parquet::ParquetUploadArgs; use proxy::http; use proxy::metrics::NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT; +use proxy::rate_limiter::AuthRateLimiter; use proxy::rate_limiter::EndpointRateLimiter; use proxy::rate_limiter::RateBucketInfo; use proxy::rate_limiter::RateLimiterConfig; @@ -141,10 +142,16 @@ struct ProxyCliArgs { /// /// Provided in the form '@'. /// Can be given multiple times for different bucket sizes. - #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)] endpoint_rps_limit: Vec, + /// Whether the auth rate limiter actually takes effect (for testing) + #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + auth_rate_limit_enabled: bool, + /// Authentication rate limiter max number of hashes per second. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)] + auth_rate_limit: Vec, /// Redis rate limiter max number of requests per second. - #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)] redis_rps_limit: Vec, /// Initial limit for dynamic rate limiter. Makes sense only if `rate_limit_algorithm` is *not* `None`. 
#[clap(long, default_value_t = 100)] @@ -510,6 +517,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { }; let authentication_config = AuthenticationConfig { scram_protocol_timeout: args.scram_protocol_timeout, + rate_limiter_enabled: args.auth_rate_limit_enabled, + rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()), }; let mut endpoint_rps_limit = args.endpoint_rps_limit.clone(); diff --git a/proxy/src/cache/common.rs b/proxy/src/cache/common.rs index 2af6a70e90..bc1c37512b 100644 --- a/proxy/src/cache/common.rs +++ b/proxy/src/cache/common.rs @@ -43,6 +43,16 @@ impl Cached { Self { token: None, value } } + pub fn take_value(self) -> (Cached, V) { + ( + Cached { + token: self.token, + value: (), + }, + self.value, + ) + } + /// Drop this entry from a cache if it's still there. pub fn invalidate(self) -> V { if let Some((cache, info)) = &self.token { diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index 6e3eb8c1b0..5a3660520b 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -373,10 +373,7 @@ mod tests { let endpoint_id = "endpoint".into(); let user1: RoleName = "user1".into(); let user2: RoleName = "user2".into(); - let secret1 = Some(AuthSecret::Scram(ServerSecret::mock( - user1.as_str(), - [1; 32], - ))); + let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32]))); let secret2 = None; let allowed_ips = Arc::new(vec![ "127.0.0.1".parse().unwrap(), @@ -395,10 +392,7 @@ mod tests { // Shouldn't add more than 2 roles. let user3: RoleName = "user3".into(); - let secret3 = Some(AuthSecret::Scram(ServerSecret::mock( - user3.as_str(), - [3; 32], - ))); + let secret3 = Some(AuthSecret::Scram(ServerSecret::mock([3; 32]))); cache.insert_role_secret(&project_id, &endpoint_id, &user3, secret3.clone()); assert!(cache.get_role_secret(&endpoint_id, &user3).is_none()); @@ -431,14 +425,8 @@ mod tests { let endpoint_id = "endpoint".into(); let user1: RoleName = "user1".into(); let user2: RoleName = "user2".into(); - let secret1 = Some(AuthSecret::Scram(ServerSecret::mock( - user1.as_str(), - [1; 32], - ))); - let secret2 = Some(AuthSecret::Scram(ServerSecret::mock( - user2.as_str(), - [2; 32], - ))); + let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32]))); + let secret2 = Some(AuthSecret::Scram(ServerSecret::mock([2; 32]))); let allowed_ips = Arc::new(vec![ "127.0.0.1".parse().unwrap(), "127.0.0.2".parse().unwrap(), @@ -486,14 +474,8 @@ mod tests { let endpoint_id = "endpoint".into(); let user1: RoleName = "user1".into(); let user2: RoleName = "user2".into(); - let secret1 = Some(AuthSecret::Scram(ServerSecret::mock( - user1.as_str(), - [1; 32], - ))); - let secret2 = Some(AuthSecret::Scram(ServerSecret::mock( - user2.as_str(), - [2; 32], - ))); + let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32]))); + let secret2 = Some(AuthSecret::Scram(ServerSecret::mock([2; 32]))); let allowed_ips = Arc::new(vec![ "127.0.0.1".parse().unwrap(), "127.0.0.2".parse().unwrap(), diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 45f8d76144..361c3ef519 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,4 +1,8 @@ -use crate::{auth, rate_limiter::RateBucketInfo, serverless::GlobalConnPoolOptions}; +use crate::{ + auth, + rate_limiter::{AuthRateLimiter, RateBucketInfo}, + serverless::GlobalConnPoolOptions, +}; use anyhow::{bail, ensure, Context, Ok}; use itertools::Itertools; use rustls::{ @@ -50,6 +54,8 @@ pub struct HttpConfig { pub struct 
AuthenticationConfig { pub scram_protocol_timeout: tokio::time::Duration, + pub rate_limiter_enabled: bool, + pub rate_limiter: AuthRateLimiter, } impl TlsConfig { diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index eed45e421b..4172dc19da 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -4,7 +4,10 @@ use ::metrics::{ register_int_gauge_vec, Histogram, HistogramVec, HyperLogLogVec, IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, }; -use metrics::{register_int_counter, register_int_counter_pair, IntCounter, IntCounterPair}; +use metrics::{ + register_hll, register_int_counter, register_int_counter_pair, HyperLogLog, IntCounter, + IntCounterPair, +}; use once_cell::sync::Lazy; use tokio::time::{self, Instant}; @@ -358,3 +361,20 @@ pub static TLS_HANDSHAKE_FAILURES: Lazy = Lazy::new(|| { ) .unwrap() }); + +pub static ENDPOINTS_AUTH_RATE_LIMITED: Lazy> = Lazy::new(|| { + register_hll!( + 32, + "proxy_endpoints_auth_rate_limits", + "Number of endpoints affected by authentication rate limits", + ) + .unwrap() +}); + +pub static AUTH_RATE_LIMIT_HITS: Lazy = Lazy::new(|| { + register_int_counter!( + "proxy_requests_auth_rate_limits_total", + "Number of connection requests affected by authentication rate limits", + ) + .unwrap() +}); diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 843bfc08cf..6051c0a812 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -280,7 +280,7 @@ pub async fn handle_client( // check rate limit if let Some(ep) = user_info.get_endpoint() { - if !endpoint_rate_limiter.check(ep) { + if !endpoint_rate_limiter.check(ep, 1) { return stream .throw_error(auth::AuthError::too_many_connections()) .await?; diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 9c3be73612..a4051447c1 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -142,8 +142,8 @@ impl Scram { Ok(Scram(secret)) } - fn mock(user: &str) -> Self { - Scram(scram::ServerSecret::mock(user, rand::random())) + fn mock() -> Self { + Scram(scram::ServerSecret::mock(rand::random())) } } @@ -330,11 +330,7 @@ async fn scram_auth_mock() -> anyhow::Result<()> { let (client_config, server_config) = generate_tls_config("generic-project-name.localhost", "localhost")?; - let proxy = tokio::spawn(dummy_proxy( - client, - Some(server_config), - Scram::mock("user"), - )); + let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), Scram::mock())); use rand::{distributions::Alphanumeric, Rng}; let password: String = rand::thread_rng() diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs index f0da4ead23..13dffffca0 100644 --- a/proxy/src/rate_limiter.rs +++ b/proxy/src/rate_limiter.rs @@ -4,4 +4,4 @@ mod limiter; pub use aimd::Aimd; pub use limit_algorithm::{AimdConfig, Fixed, RateLimitAlgorithm, RateLimiterConfig}; pub use limiter::Limiter; -pub use limiter::{EndpointRateLimiter, RateBucketInfo, RedisRateLimiter}; +pub use limiter::{AuthRateLimiter, EndpointRateLimiter, RateBucketInfo, RedisRateLimiter}; diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index 3181060e2f..f590896dd9 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -1,6 +1,8 @@ use std::{ + borrow::Cow, collections::hash_map::RandomState, - hash::BuildHasher, + hash::{BuildHasher, Hash}, + net::IpAddr, sync::{ atomic::{AtomicUsize, Ordering}, Arc, Mutex, @@ -15,7 +17,7 @@ use tokio::sync::{Mutex as AsyncMutex, Semaphore, SemaphorePermit}; use tokio::time::{timeout, Duration, Instant}; use 
tracing::info; -use crate::EndpointId; +use crate::{intern::EndpointIdInt, EndpointId}; use super::{ limit_algorithm::{LimitAlgorithm, Sample}, @@ -49,11 +51,11 @@ impl RedisRateLimiter { .data .iter_mut() .zip(self.info) - .all(|(bucket, info)| bucket.should_allow_request(info, now)); + .all(|(bucket, info)| bucket.should_allow_request(info, now, 1)); if should_allow_request { // only increment the bucket counts if the request will actually be accepted - self.data.iter_mut().for_each(RateBucket::inc); + self.data.iter_mut().for_each(|b| b.inc(1)); } should_allow_request @@ -71,9 +73,14 @@ impl RedisRateLimiter { // saw SNI, before doing TLS handshake. User-side error messages in that case // does not look very nice (`SSL SYSCALL error: Undefined error: 0`), so for now // I went with a more expensive way that yields user-friendlier error messages. -pub struct EndpointRateLimiter { - map: DashMap, Hasher>, - info: &'static [RateBucketInfo], +pub type EndpointRateLimiter = BucketRateLimiter; + +// This can't be just per IP because that would limit some PaaS that share IP addresses +pub type AuthRateLimiter = BucketRateLimiter<(EndpointIdInt, IpAddr), StdRng, RandomState>; + +pub struct BucketRateLimiter { + map: DashMap, Hasher>, + info: Cow<'static, [RateBucketInfo]>, access_count: AtomicUsize, rand: Mutex, } @@ -85,9 +92,9 @@ struct RateBucket { } impl RateBucket { - fn should_allow_request(&mut self, info: &RateBucketInfo, now: Instant) -> bool { + fn should_allow_request(&mut self, info: &RateBucketInfo, now: Instant, n: u32) -> bool { if now - self.start < info.interval { - self.count < info.max_rpi + self.count + n <= info.max_rpi } else { // bucket expired, reset self.count = 0; @@ -97,8 +104,8 @@ impl RateBucket { } } - fn inc(&mut self) { - self.count += 1; + fn inc(&mut self, n: u32) { + self.count += n; } } @@ -111,7 +118,7 @@ pub struct RateBucketInfo { impl std::fmt::Display for RateBucketInfo { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let rps = self.max_rpi * 1000 / self.interval.as_millis() as u32; + let rps = (self.max_rpi as u64) * 1000 / self.interval.as_millis() as u64; write!(f, "{rps}@{}", humantime::format_duration(self.interval)) } } @@ -136,12 +143,25 @@ impl std::str::FromStr for RateBucketInfo { } impl RateBucketInfo { - pub const DEFAULT_SET: [Self; 3] = [ + pub const DEFAULT_ENDPOINT_SET: [Self; 3] = [ Self::new(300, Duration::from_secs(1)), Self::new(200, Duration::from_secs(60)), Self::new(100, Duration::from_secs(600)), ]; + /// All of these are per endpoint-ip pair. + /// Context: 4096 rounds of pbkdf2 take about 1ms of cpu time to execute (1 milli-cpu-second or 1mcpus). + /// + /// First bucket: 300mcpus total per endpoint-ip pair + /// * 1228800 requests per second with 1 hash rounds. (endpoint rate limiter will catch this first) + /// * 300 requests per second with 4096 hash rounds. + /// * 2 requests per second with 600000 hash rounds. + pub const DEFAULT_AUTH_SET: [Self; 3] = [ + Self::new(300 * 4096, Duration::from_secs(1)), + Self::new(200 * 4096, Duration::from_secs(60)), + Self::new(100 * 4096, Duration::from_secs(600)), + ]; + pub fn validate(info: &mut [Self]) -> anyhow::Result<()> { info.sort_unstable_by_key(|info| info.interval); let invalid = info @@ -150,7 +170,7 @@ impl RateBucketInfo { .find(|(a, b)| a.max_rpi > b.max_rpi); if let Some((a, b)) = invalid { bail!( - "invalid endpoint RPS limits. {b} allows fewer requests per bucket than {a} ({} vs {})", + "invalid bucket RPS limits. 
{b} allows fewer requests per bucket than {a} ({} vs {})", b.max_rpi, a.max_rpi, ); @@ -162,19 +182,24 @@ impl RateBucketInfo { pub const fn new(max_rps: u32, interval: Duration) -> Self { Self { interval, - max_rpi: max_rps * interval.as_millis() as u32 / 1000, + max_rpi: ((max_rps as u64) * (interval.as_millis() as u64) / 1000) as u32, } } } -impl EndpointRateLimiter { - pub fn new(info: &'static [RateBucketInfo]) -> Self { +impl BucketRateLimiter { + pub fn new(info: impl Into>) -> Self { Self::new_with_rand_and_hasher(info, StdRng::from_entropy(), RandomState::new()) } } -impl EndpointRateLimiter { - fn new_with_rand_and_hasher(info: &'static [RateBucketInfo], rand: R, hasher: S) -> Self { +impl BucketRateLimiter { + fn new_with_rand_and_hasher( + info: impl Into>, + rand: R, + hasher: S, + ) -> Self { + let info = info.into(); info!(buckets = ?info, "endpoint rate limiter"); Self { info, @@ -185,7 +210,7 @@ impl EndpointRateLimiter { } /// Check that number of connections to the endpoint is below `max_rps` rps. - pub fn check(&self, endpoint: EndpointId) -> bool { + pub fn check(&self, key: K, n: u32) -> bool { // do a partial GC every 2k requests. This cleans up ~ 1/64th of the map. // worst case memory usage is about: // = 2 * 2048 * 64 * (48B + 72B) @@ -195,7 +220,7 @@ impl EndpointRateLimiter { } let now = Instant::now(); - let mut entry = self.map.entry(endpoint).or_insert_with(|| { + let mut entry = self.map.entry(key).or_insert_with(|| { vec![ RateBucket { start: now, @@ -207,12 +232,12 @@ impl EndpointRateLimiter { let should_allow_request = entry .iter_mut() - .zip(self.info) - .all(|(bucket, info)| bucket.should_allow_request(info, now)); + .zip(&*self.info) + .all(|(bucket, info)| bucket.should_allow_request(info, now, n)); if should_allow_request { // only increment the bucket counts if the request will actually be accepted - entry.iter_mut().for_each(RateBucket::inc); + entry.iter_mut().for_each(|b| b.inc(n)); } should_allow_request @@ -223,7 +248,7 @@ impl EndpointRateLimiter { /// But that way deletion does not aquire mutex on each entry access. pub fn do_gc(&self) { info!( - "cleaning up endpoint rate limiter, current size = {}", + "cleaning up bucket rate limiter, current size = {}", self.map.len() ); let n = self.map.shards().len(); @@ -534,7 +559,7 @@ mod tests { use rustc_hash::FxHasher; use tokio::time; - use super::{EndpointRateLimiter, Limiter, Outcome}; + use super::{BucketRateLimiter, EndpointRateLimiter, Limiter, Outcome}; use crate::{ rate_limiter::{RateBucketInfo, RateLimitAlgorithm}, EndpointId, @@ -672,12 +697,12 @@ mod tests { #[test] fn default_rate_buckets() { - let mut defaults = RateBucketInfo::DEFAULT_SET; + let mut defaults = RateBucketInfo::DEFAULT_ENDPOINT_SET; RateBucketInfo::validate(&mut defaults[..]).unwrap(); } #[test] - #[should_panic = "invalid endpoint RPS limits. 10@10s allows fewer requests per bucket than 300@1s (100 vs 300)"] + #[should_panic = "invalid bucket RPS limits. 
10@10s allows fewer requests per bucket than 300@1s (100 vs 300)"] fn rate_buckets_validate() { let mut rates: Vec = ["300@1s", "10@10s"] .into_iter() @@ -693,42 +718,42 @@ mod tests { .map(|s| s.parse().unwrap()) .collect(); RateBucketInfo::validate(&mut rates).unwrap(); - let limiter = EndpointRateLimiter::new(Vec::leak(rates)); + let limiter = EndpointRateLimiter::new(rates); let endpoint = EndpointId::from("ep-my-endpoint-1234"); time::pause(); for _ in 0..100 { - assert!(limiter.check(endpoint.clone())); + assert!(limiter.check(endpoint.clone(), 1)); } // more connections fail - assert!(!limiter.check(endpoint.clone())); + assert!(!limiter.check(endpoint.clone(), 1)); // fail even after 500ms as it's in the same bucket time::advance(time::Duration::from_millis(500)).await; - assert!(!limiter.check(endpoint.clone())); + assert!(!limiter.check(endpoint.clone(), 1)); // after a full 1s, 100 requests are allowed again time::advance(time::Duration::from_millis(500)).await; for _ in 1..6 { - for _ in 0..100 { - assert!(limiter.check(endpoint.clone())); + for _ in 0..50 { + assert!(limiter.check(endpoint.clone(), 2)); } time::advance(time::Duration::from_millis(1000)).await; } // more connections after 600 will exceed the 20rps@30s limit - assert!(!limiter.check(endpoint.clone())); + assert!(!limiter.check(endpoint.clone(), 1)); // will still fail before the 30 second limit time::advance(time::Duration::from_millis(30_000 - 6_000 - 1)).await; - assert!(!limiter.check(endpoint.clone())); + assert!(!limiter.check(endpoint.clone(), 1)); // after the full 30 seconds, 100 requests are allowed again time::advance(time::Duration::from_millis(1)).await; for _ in 0..100 { - assert!(limiter.check(endpoint.clone())); + assert!(limiter.check(endpoint.clone(), 1)); } } @@ -738,14 +763,41 @@ mod tests { let rand = rand::rngs::StdRng::from_seed([1; 32]); let hasher = BuildHasherDefault::::default(); - let limiter = EndpointRateLimiter::new_with_rand_and_hasher( - &RateBucketInfo::DEFAULT_SET, + let limiter = BucketRateLimiter::new_with_rand_and_hasher( + &RateBucketInfo::DEFAULT_ENDPOINT_SET, rand, hasher, ); for i in 0..1_000_000 { - limiter.check(format!("{i}").into()); + limiter.check(i, 1); } assert!(limiter.map.len() < 150_000); } + + #[test] + fn test_default_auth_set() { + // these values used to exceed u32::MAX + assert_eq!( + RateBucketInfo::DEFAULT_AUTH_SET, + [ + RateBucketInfo { + interval: Duration::from_secs(1), + max_rpi: 300 * 4096, + }, + RateBucketInfo { + interval: Duration::from_secs(60), + max_rpi: 200 * 4096 * 60, + }, + RateBucketInfo { + interval: Duration::from_secs(600), + max_rpi: 100 * 4096 * 600, + } + ] + ); + + for x in RateBucketInfo::DEFAULT_AUTH_SET { + let y = x.to_string().parse().unwrap(); + assert_eq!(x, y); + } + } } diff --git a/proxy/src/scram/secret.rs b/proxy/src/scram/secret.rs index f3414cb8ec..44c4f9e44a 100644 --- a/proxy/src/scram/secret.rs +++ b/proxy/src/scram/secret.rs @@ -50,13 +50,13 @@ impl ServerSecret { /// To avoid revealing information to an attacker, we use a /// mocked server secret even if the user doesn't exist. /// See `auth-scram.c : mock_scram_secret` for details. - pub fn mock(user: &str, nonce: [u8; 32]) -> Self { - // Refer to `auth-scram.c : scram_mock_salt`. 
- let mocked_salt = super::sha256([user.as_bytes(), &nonce]); - + pub fn mock(nonce: [u8; 32]) -> Self { Self { - iterations: 4096, - salt_base64: base64::encode(mocked_salt), + // this doesn't reveal much information as we're going to use + // iteration count 1 for our generated passwords going forward. + // PG16 users can set iteration count=1 already today. + iterations: 1, + salt_base64: base64::encode(nonce), stored_key: ScramKey::default(), server_key: ScramKey::default(), doomed: true, diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 72b55c45f0..f10779d7ba 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -42,7 +42,12 @@ impl PoolingBackend { }; let secret = match cached_secret.value.clone() { - Some(secret) => secret, + Some(secret) => self.config.authentication_config.check_rate_limit( + ctx, + secret, + &user_info.endpoint, + true, + )?, None => { // If we don't have an authentication secret, for the http flow we can just return an error. info!("authentication info not found"); From cdf12ed008c27fa7d59e296c498ce34ce681bddb Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 27 Mar 2024 12:04:31 +0100 Subject: [PATCH 0483/1571] fix(walreceiver): Timeline::shutdown can leave a dangling handle_walreceiver_connection tokio task (#7235) # Problem As pointed out through doc-comments in this PR, `drop_old_connection` is not cancellation-safe. This means we can leave a `handle_walreceiver_connection` tokio task dangling during Timeline shutdown. More details described in the corresponding issue #7062. # Solution Don't cancel-by-drop the `connection_manager_loop_step` from the `tokio::select!()` in the task_mgr task. Instead, transform the code to use a `CancellationToken` --- specifically, `task_mgr::shutdown_token()` --- and make code responsive to it. The `drop_old_connection()` is still not cancellation-safe and also doesn't get a cancellation token, because there's no point inside the function where we could return early if cancellation were requested using a token. We rely on the `handle_walreceiver_connection` to be sensitive to the `TaskHandle`s cancellation token (argument name: `cancellation`). Currently it checks for `cancellation` on each WAL message. It is probably also sensitive to `Timeline::cancel` because ultimately all that `handle_walreceiver_connection` does is interact with the `Timeline`. 
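To make the new pattern concrete, here is a minimal, self-contained sketch of token-driven cancellation versus cancel-by-drop. It is illustrative only: the names (`manager_loop_step`, `Cancelled`) are invented for the sketch, it assumes the `tokio` and `tokio-util` crates, and it is not the pageserver's actual `task_mgr` code.

```rust
use std::time::Duration;
use tokio_util::sync::CancellationToken;

struct Cancelled;

/// One step of a manager loop. It observes the cancellation token itself,
/// so the caller never has to cancel it by dropping its future mid-await.
async fn manager_loop_step(cancel: &CancellationToken) -> Result<(), Cancelled> {
    tokio::select! {
        _ = cancel.cancelled() => Err(Cancelled),
        _ = tokio::time::sleep(Duration::from_millis(100)) => Ok(()),
    }
}

#[tokio::main]
async fn main() {
    let cancel = CancellationToken::new();

    // Cancel-by-drop (the old pattern), sketched in comments only:
    //
    //     tokio::select! {
    //         _ = shutdown_watcher() => { /* loop-step future dropped mid-await;
    //                                        work it spawned may be left dangling */ }
    //         _ = manager_loop_step_without_token() => {}
    //     }

    // Token-driven cancellation (the new pattern): the step returns on its own
    // when the token fires, so the cleanup after the loop always runs.
    let task = tokio::spawn({
        let cancel = cancel.clone();
        async move {
            while !cancel.is_cancelled() {
                if manager_loop_step(&cancel).await.is_err() {
                    break;
                }
            }
            // cleanup that must not be skipped (e.g. shutting down sub-tasks)
        }
    });

    cancel.cancel();
    task.await.unwrap();
}
```

The property the real code relies on is exactly the one the sketch shows: the loop body always runs to completion, including its cleanup, instead of being dropped at an arbitrary await point.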
In summary, the above means that the following code (which is found in `Timeline::shutdown`) now might **take longer**, but actually ensures that all `handle_walreceiver_connection` tasks are finished: ```rust task_mgr::shutdown_tasks( Some(TaskKind::WalReceiverManager), Some(self.tenant_shard_id), Some(self.timeline_id) ) ``` # Refs refs #7062 --- pageserver/src/tenant/timeline/walreceiver.rs | 37 +++++---- .../walreceiver/connection_manager.rs | 76 +++++++++++++------ 2 files changed, 70 insertions(+), 43 deletions(-) diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index 2fab6722b8..f1b62067f9 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -33,11 +33,9 @@ use crate::tenant::timeline::walreceiver::connection_manager::{ use pageserver_api::shard::TenantShardId; use std::future::Future; use std::num::NonZeroU64; -use std::ops::ControlFlow; use std::sync::Arc; use std::time::Duration; use storage_broker::BrokerClientChannel; -use tokio::select; use tokio::sync::watch; use tokio_util::sync::CancellationToken; use tracing::*; @@ -91,31 +89,27 @@ impl WalReceiver { async move { debug_assert_current_span_has_tenant_and_timeline_id(); debug!("WAL receiver manager started, connecting to broker"); + let cancel = task_mgr::shutdown_token(); let mut connection_manager_state = ConnectionManagerState::new( timeline, conf, ); - loop { - select! { - _ = task_mgr::shutdown_watcher() => { - trace!("WAL receiver shutdown requested, shutting down"); + while !cancel.is_cancelled() { + let loop_step_result = connection_manager_loop_step( + &mut broker_client, + &mut connection_manager_state, + &walreceiver_ctx, + &cancel, + &loop_status, + ).await; + match loop_step_result { + Ok(()) => continue, + Err(_cancelled) => { + trace!("Connection manager loop ended, shutting down"); break; - }, - loop_step_result = connection_manager_loop_step( - &mut broker_client, - &mut connection_manager_state, - &walreceiver_ctx, - &loop_status, - ) => match loop_step_result { - ControlFlow::Continue(()) => continue, - ControlFlow::Break(()) => { - trace!("Connection manager loop ended, shutting down"); - break; - } - }, + } } } - connection_manager_state.shutdown().await; *loop_status.write().unwrap() = None; Ok(()) @@ -197,6 +191,9 @@ impl TaskHandle { } } + /// # Cancel-Safety + /// + /// Cancellation-safe. 
async fn next_task_event(&mut self) -> TaskEvent { match self.events_receiver.changed().await { Ok(()) => TaskEvent::Update((self.events_receiver.borrow()).clone()), diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index cf6dee114f..030d24a017 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -17,7 +17,7 @@ use crate::metrics::{ WALRECEIVER_ACTIVE_MANAGERS, WALRECEIVER_BROKER_UPDATES, WALRECEIVER_CANDIDATES_ADDED, WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES, }; -use crate::task_mgr::{shutdown_token, TaskKind}; +use crate::task_mgr::TaskKind; use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline}; use anyhow::Context; use chrono::{NaiveDateTime, Utc}; @@ -27,7 +27,7 @@ use storage_broker::proto::SafekeeperTimelineInfo; use storage_broker::proto::SubscribeSafekeeperInfoRequest; use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; use storage_broker::{BrokerClientChannel, Code, Streaming}; -use tokio::select; +use tokio_util::sync::CancellationToken; use tracing::*; use postgres_connection::PgConnectionConfig; @@ -45,27 +45,33 @@ use super::{ TaskEvent, TaskHandle, }; +pub(crate) struct Cancelled; + /// Attempts to subscribe for timeline updates, pushed by safekeepers into the broker. /// Based on the updates, desides whether to start, keep or stop a WAL receiver task. /// If storage broker subscription is cancelled, exits. +/// +/// # Cancel-Safety +/// +/// Not cancellation-safe. Use `cancel` token to request cancellation. pub(super) async fn connection_manager_loop_step( broker_client: &mut BrokerClientChannel, connection_manager_state: &mut ConnectionManagerState, ctx: &RequestContext, + cancel: &CancellationToken, manager_status: &std::sync::RwLock>, -) -> ControlFlow<(), ()> { - match connection_manager_state - .timeline - .wait_to_become_active(ctx) - .await - { +) -> Result<(), Cancelled> { + match tokio::select! { + _ = cancel.cancelled() => { return Err(Cancelled); }, + st = connection_manager_state.timeline.wait_to_become_active(ctx) => { st } + } { Ok(()) => {} Err(new_state) => { debug!( ?new_state, "state changed, stopping wal connection manager loop" ); - return ControlFlow::Break(()); + return Err(Cancelled); } } @@ -86,7 +92,7 @@ pub(super) async fn connection_manager_loop_step( // Subscribe to the broker updates. Stream shares underlying TCP connection // with other streams on this client (other connection managers). When // object goes out of scope, stream finishes in drop() automatically. 
- let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id).await; + let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id, cancel).await?; debug!("Subscribed for broker timeline updates"); loop { @@ -94,6 +100,7 @@ pub(super) async fn connection_manager_loop_step( // These things are happening concurrently: // + // - cancellation request // - keep receiving WAL on the current connection // - if the shared state says we need to change connection, disconnect and return // - this runs in a separate task and we receive updates via a watch channel @@ -101,7 +108,11 @@ pub(super) async fn connection_manager_loop_step( // - receive updates from broker // - this might change the current desired connection // - timeline state changes to something that does not allow walreceiver to run concurrently - select! { + + // NB: make sure each of the select expressions are cancellation-safe + // (no need for arms to be cancellation-safe). + tokio::select! { + _ = cancel.cancelled() => { return Err(Cancelled); } Some(wal_connection_update) = async { match connection_manager_state.wal_connection.as_mut() { Some(wal_connection) => Some(wal_connection.connection_task.next_task_event().await), @@ -133,7 +144,7 @@ pub(super) async fn connection_manager_loop_step( }, // Got a new update from the broker - broker_update = broker_subscription.message() => { + broker_update = broker_subscription.message() /* TODO: review cancellation-safety */ => { match broker_update { Ok(Some(broker_update)) => connection_manager_state.register_timeline_update(broker_update), Err(status) => { @@ -147,16 +158,17 @@ pub(super) async fn connection_manager_loop_step( warn!("broker subscription failed: {status}"); } } - return ControlFlow::Continue(()); + return Ok(()); } Ok(None) => { error!("broker subscription stream ended"); // can't happen - return ControlFlow::Continue(()); + return Ok(()); } } }, new_event = async { + // Reminder: this match arm needs to be cancellation-safe. loop { if connection_manager_state.timeline.current_state() == TimelineState::Loading { warn!("wal connection manager should only be launched after timeline has become active"); @@ -182,11 +194,11 @@ pub(super) async fn connection_manager_loop_step( } } => match new_event { ControlFlow::Continue(()) => { - return ControlFlow::Continue(()); + return Ok(()); } ControlFlow::Break(()) => { debug!("Timeline is no longer active, stopping wal connection manager loop"); - return ControlFlow::Break(()); + return Err(Cancelled); } }, @@ -218,16 +230,15 @@ pub(super) async fn connection_manager_loop_step( async fn subscribe_for_timeline_updates( broker_client: &mut BrokerClientChannel, id: TenantTimelineId, -) -> Streaming { + cancel: &CancellationToken, +) -> Result, Cancelled> { let mut attempt = 0; - let cancel = shutdown_token(); - loop { exponential_backoff( attempt, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, - &cancel, + cancel, ) .await; attempt += 1; @@ -241,9 +252,14 @@ async fn subscribe_for_timeline_updates( subscription_key: Some(key), }; - match broker_client.subscribe_safekeeper_info(request).await { + match { + tokio::select! 
{ + r = broker_client.subscribe_safekeeper_info(request) => { r } + _ = cancel.cancelled() => { return Err(Cancelled); } + } + } { Ok(resp) => { - return resp.into_inner(); + return Ok(resp.into_inner()); } Err(e) => { // Safekeeper nodes can stop pushing timeline updates to the broker, when no new writes happen and @@ -486,6 +502,10 @@ impl ConnectionManagerState { /// Drops the current connection (if any) and updates retry timeout for the next /// connection attempt to the same safekeeper. + /// + /// # Cancel-Safety + /// + /// Not cancellation-safe. async fn drop_old_connection(&mut self, needs_shutdown: bool) { let wal_connection = match self.wal_connection.take() { Some(wal_connection) => wal_connection, @@ -493,7 +513,14 @@ impl ConnectionManagerState { }; if needs_shutdown { - wal_connection.connection_task.shutdown().await; + wal_connection + .connection_task + .shutdown() + // This here is why this function isn't cancellation-safe. + // If we got cancelled here, then self.wal_connection is already None and we lose track of the task. + // Even if our caller diligently calls Self::shutdown(), it will find a self.wal_connection=None + // and thus be ineffective. + .await; } let retry = self @@ -838,6 +865,9 @@ impl ConnectionManagerState { } } + /// # Cancel-Safety + /// + /// Not cancellation-safe. pub(super) async fn shutdown(mut self) { if let Some(wal_connection) = self.wal_connection.take() { wal_connection.connection_task.shutdown().await; From 7f9cc1bd5ee4e5c99298d2e0c4bd709c3fa7bcc2 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 27 Mar 2024 13:10:37 +0000 Subject: [PATCH 0484/1571] CI(trigger-e2e-tests): set e2e-platforms (#7229) ## Problem We don't want to run an excessive e2e test suite on neonvm if there are no relevant changes. 
## Summary of changes - Check PR diff and if there are no relevant compute changes (in `vendor/`, `pgxn/`, `libs/vm_monitor` or `Dockerfile.compute-node` - Switch job from `small` to `ubuntu-latest` runner to make it possible to use GitHub CLI --- .github/workflows/trigger-e2e-tests.yml | 90 ++++++++++++++----------- 1 file changed, 52 insertions(+), 38 deletions(-) diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml index ae34cbffe0..7111ee37fa 100644 --- a/.github/workflows/trigger-e2e-tests.yml +++ b/.github/workflows/trigger-e2e-tests.yml @@ -62,14 +62,14 @@ jobs: trigger-e2e-tests: needs: [ tag ] - runs-on: [ self-hosted, gen3, small ] + runs-on: ubuntu-latest env: TAG: ${{ needs.tag.outputs.build-tag }} - container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned - options: --init steps: - name: check if ecr image are present + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} run: | for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text) @@ -79,41 +79,55 @@ jobs: fi done - - name: Set PR's status to pending and request a remote CI test + - name: Set e2e-platforms + id: e2e-platforms + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | - # For pull requests, GH Actions set "github.sha" variable to point at a fake merge commit - # but we need to use a real sha of a latest commit in the PR's branch for the e2e job, - # to place a job run status update later. - COMMIT_SHA=${{ github.event.pull_request.head.sha }} - # For non-PR kinds of runs, the above will produce an empty variable, pick the original sha value for those - COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}} + # Default set of platforms to run e2e tests on + platforms='["docker", "k8s"]' - REMOTE_REPO="${{ github.repository_owner }}/cloud" + # If the PR changes vendor/, pgxn/ or libs/vm_monitor/ directories, or Dockerfile.compute-node, add k8s-neonvm to the list of platforms. + # If the workflow run is not a pull request, add k8s-neonvm to the list. + if [ "$GITHUB_EVENT_NAME" == "pull_request" ]; then + for f in $(gh api "/repos/${GITHUB_REPOSITORY}/pulls/${PR_NUMBER}/files" --paginate --jq '.[].filename'); do + case "$f" in + vendor/*|pgxn/*|libs/vm_monitor/*|Dockerfile.compute-node) + platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique') + ;; + *) + # no-op + ;; + esac + done + else + platforms=$(echo "${platforms}" | jq --compact-output '. 
+= ["k8s-neonvm"] | unique') + fi - curl -f -X POST \ - https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \ - -H "Accept: application/vnd.github.v3+json" \ - --user "${{ secrets.CI_ACCESS_TOKEN }}" \ - --data \ - "{ - \"state\": \"pending\", - \"context\": \"neon-cloud-e2e\", - \"description\": \"[$REMOTE_REPO] Remote CI job is about to start\" - }" + echo "e2e-platforms=${platforms}" | tee -a $GITHUB_OUTPUT - curl -f -X POST \ - https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \ - -H "Accept: application/vnd.github.v3+json" \ - --user "${{ secrets.CI_ACCESS_TOKEN }}" \ - --data \ - "{ - \"ref\": \"main\", - \"inputs\": { - \"ci_job_name\": \"neon-cloud-e2e\", - \"commit_hash\": \"$COMMIT_SHA\", - \"remote_repo\": \"${{ github.repository }}\", - \"storage_image_tag\": \"${TAG}\", - \"compute_image_tag\": \"${TAG}\", - \"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\" - } - }" + - name: Set PR's status to pending and request a remote CI test + env: + E2E_PLATFORMS: ${{ steps.e2e-platforms.outputs.e2e-platforms }} + COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} + run: | + REMOTE_REPO="${GITHUB_REPOSITORY_OWNER}/cloud" + + gh api "/repos/${GITHUB_REPOSITORY}/statuses/${COMMIT_SHA}" \ + --method POST \ + --raw-field "state=pending" \ + --raw-field "description=[$REMOTE_REPO] Remote CI job is about to start" \ + --raw-field "context=neon-cloud-e2e" + + gh workflow --repo ${REMOTE_REPO} \ + run testing.yml \ + --ref "main" \ + --raw-field "ci_job_name=neon-cloud-e2e" \ + --raw-field "commit_hash=$COMMIT_SHA" \ + --raw-field "remote_repo=${GITHUB_REPOSITORY}" \ + --raw-field "storage_image_tag=${TAG}" \ + --raw-field "compute_image_tag=${TAG}" \ + --raw-field "concurrency_group=${E2E_CONCURRENCY_GROUP}" \ + --raw-field "e2e-platforms=${E2E_PLATFORMS}" From 24c5a5ac165bf863797356aff67ef64a84cf0e58 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Wed, 27 Mar 2024 10:07:51 -0800 Subject: [PATCH 0485/1571] Revert "Revoke REPLICATION" (#7261) Reverts neondatabase/neon#7052 --- compute_tools/src/spec.rs | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 3b596a88ff..4006062fc2 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -302,9 +302,9 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { RoleAction::Create => { // This branch only runs when roles are created through the console, so it is // safe to add more permissions here. BYPASSRLS and REPLICATION are inherited - // from neon_superuser. (NOTE: REPLICATION has been removed from here for now). + // from neon_superuser. let mut query: String = format!( - "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser", + "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser", name.pg_quote() ); info!("running role create query: '{}'", &query); @@ -806,19 +806,8 @@ $$;"#, "", "", "", + "", // Add new migrations below. 
- r#" -DO $$ -DECLARE - role_name TEXT; -BEGIN - FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE - LOOP - RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name); - EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION'; - END LOOP; -END -$$;"#, ]; let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration"; From 63b2060aef39da8e9eb00cda72ff1e99eed2a74d Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 28 Mar 2024 08:16:05 +0200 Subject: [PATCH 0486/1571] Drop connections with all shards invoplved in prefetch in case of error (#7249) ## Problem See https://github.com/neondatabase/cloud/issues/11559 If we have multiple shards, we need to reset connections to all shards involved in prefetch (having active prefetch requests) if connection with any of them is lost. ## Summary of changes In `prefetch_on_ps_disconnect` drop connection to all shards with active page requests. ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist Co-authored-by: Konstantin Knizhnik --- pgxn/neon/libpagestore.c | 36 ++++++++++++++++++++++++++---------- pgxn/neon/pagestore_client.h | 1 + pgxn/neon/pagestore_smgr.c | 8 ++++++++ 3 files changed, 35 insertions(+), 10 deletions(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index e31de3c6b5..1bc8a2e87c 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -111,6 +111,7 @@ static PageServer page_servers[MAX_SHARDS]; static bool pageserver_flush(shardno_t shard_no); static void pageserver_disconnect(shardno_t shard_no); +static void pageserver_disconnect_shard(shardno_t shard_no); static bool PagestoreShmemIsValid(void) @@ -487,9 +488,31 @@ retry: return ret; } - +/* + * Reset prefetch and drop connection to the shard. + * It also drops connection to all other shards involved in prefetch. + */ static void pageserver_disconnect(shardno_t shard_no) +{ + if (page_servers[shard_no].conn) + { + /* + * If the connection to any pageserver is lost, we throw away the + * whole prefetch queue, even for other pageservers. It should not + * cause big problems, because connection loss is supposed to be a + * rare event. + */ + prefetch_on_ps_disconnect(); + } + pageserver_disconnect_shard(shard_no); +} + +/* + * Disconnect from specified shard + */ +static void +pageserver_disconnect_shard(shardno_t shard_no) { /* * If anything goes wrong while we were sending a request, it's not clear @@ -503,14 +526,6 @@ pageserver_disconnect(shardno_t shard_no) neon_shard_log(shard_no, LOG, "dropping connection to page server due to error"); PQfinish(page_servers[shard_no].conn); page_servers[shard_no].conn = NULL; - - /* - * If the connection to any pageserver is lost, we throw away the - * whole prefetch queue, even for other pageservers. It should not - * cause big problems, because connection loss is supposed to be a - * rare event. 
- */ - prefetch_on_ps_disconnect(); } if (page_servers[shard_no].wes != NULL) { @@ -676,7 +691,8 @@ page_server_api api = { .send = pageserver_send, .flush = pageserver_flush, - .receive = pageserver_receive + .receive = pageserver_receive, + .disconnect = pageserver_disconnect_shard }; static bool diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 2889ffacae..44ae766f76 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -180,6 +180,7 @@ typedef struct bool (*send) (shardno_t shard_no, NeonRequest * request); NeonResponse *(*receive) (shardno_t shard_no); bool (*flush) (shardno_t shard_no); + void (*disconnect) (shardno_t shard_no); } page_server_api; extern void prefetch_on_ps_disconnect(void); diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 2d222e3c7c..ecc8ddb384 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -613,6 +613,14 @@ prefetch_on_ps_disconnect(void) Assert(slot->status == PRFS_REQUESTED); Assert(slot->my_ring_index == ring_index); + /* + * Drop connection to all shards which have prefetch requests. + * It is not a problem to call disconnect multiple times on the same connection + * because disconnect implementation in libpagestore.c will check if connection + * is alive and do nothing of connection was already dropped. + */ + page_server->disconnect(slot->shard_no); + /* clean up the request */ slot->status = PRFS_TAG_REMAINS; MyPState->n_requests_inflight -= 1; From 5928f6709c4957f723d6dbe5c789040696023f98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 28 Mar 2024 13:48:47 +0100 Subject: [PATCH 0487/1571] Support compaction_threshold=1 for tiered compaction (#7257) Many tests like `test_live_migration` or `test_timeline_deletion_with_files_stuck_in_upload_queue` set `compaction_threshold` to 1, to create a lot of changes/updates. The compaction threshold was passed as `fanout` parameter to the tiered_compaction function, which didn't support values of 1 however. Now we change the assert to support it, while still retaining the exponential nature of the increase in range in terms of lsn that a layer is responsible for. A large chunk of the failures in #6964 was due to hitting this issue that we now resolved. Part of #6768. 
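As a concrete illustration of the change (a standalone sketch with made-up numbers, not the `compact_tiered` code itself): clamping the exponent base to `fanout.max(2)` keeps the per-level target height growing geometrically even when `fanout = 1`, whereas the old `assert!(fanout >= 2)` would simply panic for that setting.

```rust
/// Sketch of how the per-level target height now evolves for a given fanout.
fn level_target_heights(target_file_size: u64, fanout: u64, levels: u32) -> Vec<u64> {
    assert!(fanout >= 1, "fanout needs to be at least 1 but is {fanout}");
    let exp_base = fanout.max(2);
    let mut height = target_file_size;
    let mut heights = Vec::new();
    for _ in 0..levels {
        heights.push(height);
        height = height.saturating_mul(exp_base);
    }
    heights
}

fn main() {
    // fanout = 1: heights still double per level instead of staying flat (or panicking).
    assert_eq!(level_target_heights(8, 1, 4), vec![8, 16, 32, 64]);
    // fanout = 4: the base is the fanout itself.
    assert_eq!(level_target_heights(8, 4, 4), vec![8, 32, 128, 512]);
}
```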
--- pageserver/compaction/src/compact_tiered.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pageserver/compaction/src/compact_tiered.rs b/pageserver/compaction/src/compact_tiered.rs index 60fc7ac925..5261746b22 100644 --- a/pageserver/compaction/src/compact_tiered.rs +++ b/pageserver/compaction/src/compact_tiered.rs @@ -43,7 +43,8 @@ pub async fn compact_tiered( fanout: u64, ctx: &E::RequestContext, ) -> anyhow::Result<()> { - assert!(fanout >= 2); + assert!(fanout >= 1, "fanout needs to be at least 1 but is {fanout}"); + let exp_base = fanout.max(2); // Start at L0 let mut current_level_no = 0; let mut current_level_target_height = target_file_size; @@ -106,7 +107,7 @@ pub async fn compact_tiered( break; } current_level_no += 1; - current_level_target_height = current_level_target_height.saturating_mul(fanout); + current_level_target_height = current_level_target_height.saturating_mul(exp_base); } Ok(()) } From 6633332e6746c8533d13d67edf2fb9f76beb4979 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 28 Mar 2024 14:19:25 +0000 Subject: [PATCH 0488/1571] storage controller: tenant scheduling policy (#7262) ## Problem In the event of bugs with scheduling or reconciliation, we need to be able to switch this off at a per-tenant granularity. This is intended to mitigate risk of issues with https://github.com/neondatabase/neon/pull/7181, which makes scheduling more involved. Closes: #7103 ## Summary of changes - Introduce a scheduling policy per tenant, with API to set it - Refactor persistent.rs helpers for updating tenants to be more general - Add tests --- .../down.sql | 3 + .../2024-03-27-133204_tenant_policies/up.sql | 2 + control_plane/attachment_service/src/http.rs | 37 ++++- .../attachment_service/src/persistence.rs | 92 ++++++------ .../attachment_service/src/schema.rs | 1 + .../attachment_service/src/service.rs | 136 ++++++++++++++---- .../attachment_service/src/tenant_state.rs | 98 ++++++++++++- libs/pageserver_api/src/controller_api.rs | 32 +++++ test_runner/fixtures/neon_fixtures.py | 31 ++++ test_runner/regress/test_sharding_service.py | 95 ++++++++++++ 10 files changed, 448 insertions(+), 79 deletions(-) create mode 100644 control_plane/attachment_service/migrations/2024-03-27-133204_tenant_policies/down.sql create mode 100644 control_plane/attachment_service/migrations/2024-03-27-133204_tenant_policies/up.sql diff --git a/control_plane/attachment_service/migrations/2024-03-27-133204_tenant_policies/down.sql b/control_plane/attachment_service/migrations/2024-03-27-133204_tenant_policies/down.sql new file mode 100644 index 0000000000..33c06dc03d --- /dev/null +++ b/control_plane/attachment_service/migrations/2024-03-27-133204_tenant_policies/down.sql @@ -0,0 +1,3 @@ +-- This file should undo anything in `up.sql` + +ALTER TABLE tenant_shards drop scheduling_policy; \ No newline at end of file diff --git a/control_plane/attachment_service/migrations/2024-03-27-133204_tenant_policies/up.sql b/control_plane/attachment_service/migrations/2024-03-27-133204_tenant_policies/up.sql new file mode 100644 index 0000000000..aa00f0d2ca --- /dev/null +++ b/control_plane/attachment_service/migrations/2024-03-27-133204_tenant_policies/up.sql @@ -0,0 +1,2 @@ + +ALTER TABLE tenant_shards add scheduling_policy VARCHAR NOT NULL DEFAULT '"Active"'; diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs index 036019cd38..1f3f78bffa 100644 --- a/control_plane/attachment_service/src/http.rs +++ 
b/control_plane/attachment_service/src/http.rs @@ -34,7 +34,8 @@ use utils::{ }; use pageserver_api::controller_api::{ - NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest, + NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantPolicyRequest, + TenantShardMigrateRequest, }; use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest}; @@ -478,6 +479,22 @@ async fn handle_tenant_shard_migrate( ) } +async fn handle_tenant_update_policy(mut req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let update_req = json_request::(&mut req).await?; + let state = get_state(&req); + + json_response( + StatusCode::OK, + state + .service + .tenant_update_policy(tenant_id, update_req) + .await?, + ) +} + async fn handle_tenant_drop(req: Request) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; @@ -509,6 +526,14 @@ async fn handle_consistency_check(req: Request) -> Result, json_response(StatusCode::OK, state.service.consistency_check().await?) } +async fn handle_reconcile_all(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + + json_response(StatusCode::OK, state.service.reconcile_all_now().await?) +} + /// Status endpoint is just used for checking that our HTTP listener is up async fn handle_status(_req: Request) -> Result, ApiError> { json_response(StatusCode::OK, ()) @@ -726,6 +751,9 @@ pub fn make_router( RequestName("debug_v1_consistency_check"), ) }) + .post("/debug/v1/reconcile_all", |r| { + request_span(r, handle_reconcile_all) + }) .put("/debug/v1/failpoints", |r| { request_span(r, |r| failpoints_handler(r, CancellationToken::new())) }) @@ -765,6 +793,13 @@ pub fn make_router( RequestName("control_v1_tenant_describe"), ) }) + .put("/control/v1/tenant/:tenant_id/policy", |r| { + named_request_span( + r, + handle_tenant_update_policy, + RequestName("control_v1_tenant_policy"), + ) + }) // Tenant operations // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity. diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs index dafd52017b..d60392bdbc 100644 --- a/control_plane/attachment_service/src/persistence.rs +++ b/control_plane/attachment_service/src/persistence.rs @@ -9,6 +9,7 @@ use camino::Utf8PathBuf; use diesel::pg::PgConnection; use diesel::prelude::*; use diesel::Connection; +use pageserver_api::controller_api::ShardSchedulingPolicy; use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy}; use pageserver_api::models::TenantConfig; use pageserver_api::shard::ShardConfigError; @@ -107,6 +108,12 @@ pub(crate) enum AbortShardSplitStatus { pub(crate) type DatabaseResult = Result; +/// Some methods can operate on either a whole tenant or a single shard +pub(crate) enum TenantFilter { + Tenant(TenantId), + Shard(TenantShardId), +} + impl Persistence { // The default postgres connection limit is 100. We use up to 99, to leave one free for a human admin under // normal circumstances. This assumes we have exclusive use of the database cluster to which we connect. 
@@ -140,7 +147,7 @@ impl Persistence { /// Wraps `with_conn` in order to collect latency and error metrics async fn with_measured_conn(&self, op: DatabaseOperation, func: F) -> DatabaseResult where - F: Fn(&mut PgConnection) -> DatabaseResult + Send + 'static, + F: FnOnce(&mut PgConnection) -> DatabaseResult + Send + 'static, R: Send + 'static, { let latency = &METRICS_REGISTRY @@ -168,7 +175,7 @@ impl Persistence { /// Call the provided function in a tokio blocking thread, with a Diesel database connection. async fn with_conn(&self, func: F) -> DatabaseResult where - F: Fn(&mut PgConnection) -> DatabaseResult + Send + 'static, + F: FnOnce(&mut PgConnection) -> DatabaseResult + Send + 'static, R: Send + 'static, { let mut conn = self.connection_pool.get()?; @@ -275,6 +282,11 @@ impl Persistence { // Backward compat for test data after PR https://github.com/neondatabase/neon/pull/7165 shard.placement_policy = "{\"Attached\":0}".to_string(); } + + if shard.scheduling_policy.is_empty() { + shard.scheduling_policy = + serde_json::to_string(&ShardSchedulingPolicy::default()).unwrap(); + } } let tenants: Vec = decoded.tenants.into_values().collect(); @@ -465,59 +477,45 @@ impl Persistence { /// that we only do the first time a tenant is set to an attached policy via /location_config. pub(crate) async fn update_tenant_shard( &self, - tenant_shard_id: TenantShardId, - input_placement_policy: PlacementPolicy, - input_config: TenantConfig, + tenant: TenantFilter, + input_placement_policy: Option, + input_config: Option, input_generation: Option, + input_scheduling_policy: Option, ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; self.with_measured_conn(DatabaseOperation::UpdateTenantShard, move |conn| { - let query = diesel::update(tenant_shards) - .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)); + let query = match tenant { + TenantFilter::Shard(tenant_shard_id) => diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .into_boxed(), + TenantFilter::Tenant(input_tenant_id) => diesel::update(tenant_shards) + .filter(tenant_id.eq(input_tenant_id.to_string())) + .into_boxed(), + }; - if let Some(input_generation) = input_generation { - // Update includes generation column - query - .set(( - generation.eq(Some(input_generation.into().unwrap() as i32)), - placement_policy - .eq(serde_json::to_string(&input_placement_policy).unwrap()), - config.eq(serde_json::to_string(&input_config).unwrap()), - )) - .execute(conn)?; - } else { - // Update does not include generation column - query - .set(( - placement_policy - .eq(serde_json::to_string(&input_placement_policy).unwrap()), - config.eq(serde_json::to_string(&input_config).unwrap()), - )) - .execute(conn)?; + #[derive(AsChangeset)] + #[diesel(table_name = crate::schema::tenant_shards)] + struct ShardUpdate { + generation: Option, + placement_policy: Option, + config: Option, + scheduling_policy: Option, } - Ok(()) - }) - .await?; + let update = ShardUpdate { + generation: input_generation.map(|g| g.into().unwrap() as i32), + placement_policy: input_placement_policy + .map(|p| serde_json::to_string(&p).unwrap()), + config: input_config.map(|c| serde_json::to_string(&c).unwrap()), + scheduling_policy: 
input_scheduling_policy + .map(|p| serde_json::to_string(&p).unwrap()), + }; - Ok(()) - } - - pub(crate) async fn update_tenant_config( - &self, - input_tenant_id: TenantId, - input_config: TenantConfig, - ) -> DatabaseResult<()> { - use crate::schema::tenant_shards::dsl::*; - - self.with_measured_conn(DatabaseOperation::UpdateTenantConfig, move |conn| { - diesel::update(tenant_shards) - .filter(tenant_id.eq(input_tenant_id.to_string())) - .set((config.eq(serde_json::to_string(&input_config).unwrap()),)) - .execute(conn)?; + query.set(update).execute(conn)?; Ok(()) }) @@ -728,6 +726,8 @@ pub(crate) struct TenantShardPersistence { pub(crate) splitting: SplitState, #[serde(default)] pub(crate) config: String, + #[serde(default)] + pub(crate) scheduling_policy: String, } impl TenantShardPersistence { diff --git a/control_plane/attachment_service/src/schema.rs b/control_plane/attachment_service/src/schema.rs index 76e4e56a66..ff37d0fe77 100644 --- a/control_plane/attachment_service/src/schema.rs +++ b/control_plane/attachment_service/src/schema.rs @@ -22,6 +22,7 @@ diesel::table! { placement_policy -> Varchar, splitting -> Int2, config -> Text, + scheduling_policy -> Varchar, } } diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index 925910253b..cceecebb7f 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -8,7 +8,9 @@ use std::{ }; use crate::{ - id_lock_map::IdLockMap, persistence::AbortShardSplitStatus, reconciler::ReconcileError, + id_lock_map::IdLockMap, + persistence::{AbortShardSplitStatus, TenantFilter}, + reconciler::ReconcileError, }; use anyhow::Context; use control_plane::storage_controller::{ @@ -20,9 +22,10 @@ use hyper::StatusCode; use pageserver_api::{ controller_api::{ NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy, - TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse, - TenantDescribeResponseShard, TenantLocateResponse, TenantShardMigrateRequest, - TenantShardMigrateResponse, UtilizationScore, + ShardSchedulingPolicy, TenantCreateResponse, TenantCreateResponseShard, + TenantDescribeResponse, TenantDescribeResponseShard, TenantLocateResponse, + TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse, + UtilizationScore, }, models::{SecondaryProgress, TenantConfigRequest}, }; @@ -51,7 +54,6 @@ use utils::{ generation::Generation, http::error::ApiError, id::{NodeId, TenantId, TimelineId}, - seqwait::SeqWait, sync::gate::Gate, }; @@ -66,7 +68,6 @@ use crate::{ IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError, ReconcilerWaiter, TenantState, }, - Sequence, }; // For operations that should be quick, like attaching a new tenant @@ -957,30 +958,14 @@ impl Service { } for tsp in tenant_shard_persistence { let tenant_shard_id = tsp.get_tenant_shard_id()?; - let shard_identity = tsp.get_shard_identity()?; + // We will populate intent properly later in [`Self::startup_reconcile`], initially populate // it with what we can infer: the node for which a generation was most recently issued. 
let mut intent = IntentState::new(); if let Some(generation_pageserver) = tsp.generation_pageserver { intent.set_attached(&mut scheduler, Some(NodeId(generation_pageserver as u64))); } - - let new_tenant = TenantState { - tenant_shard_id, - shard: shard_identity, - sequence: Sequence::initial(), - generation: tsp.generation.map(|g| Generation::new(g as u32)), - policy: serde_json::from_str(&tsp.placement_policy).unwrap(), - intent, - observed: ObservedState::new(), - config: serde_json::from_str(&tsp.config).unwrap(), - reconciler: None, - splitting: tsp.splitting, - waiter: Arc::new(SeqWait::new(Sequence::initial())), - error_waiter: Arc::new(SeqWait::new(Sequence::initial())), - last_error: Arc::default(), - pending_compute_notification: false, - }; + let new_tenant = TenantState::from_persistent(tsp, intent)?; tenants.insert(tenant_shard_id, new_tenant); } @@ -1104,6 +1089,8 @@ impl Service { placement_policy: serde_json::to_string(&PlacementPolicy::Attached(0)).unwrap(), config: serde_json::to_string(&TenantConfig::default()).unwrap(), splitting: SplitState::default(), + scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default()) + .unwrap(), }; match self.persistence.insert_tenant_shards(vec![tsp]).await { @@ -1156,9 +1143,10 @@ impl Service { // when we reattaching a detached tenant. self.persistence .update_tenant_shard( - attach_req.tenant_shard_id, - PlacementPolicy::Attached(0), - conf, + TenantFilter::Shard(attach_req.tenant_shard_id), + Some(PlacementPolicy::Attached(0)), + Some(conf), + None, None, ) .await?; @@ -1615,6 +1603,8 @@ impl Service { placement_policy: serde_json::to_string(&placement_policy).unwrap(), config: serde_json::to_string(&create_req.config).unwrap(), splitting: SplitState::default(), + scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default()) + .unwrap(), }) .collect(); @@ -1907,10 +1897,11 @@ impl Service { { self.persistence .update_tenant_shard( - *tenant_shard_id, - placement_policy.clone(), - tenant_config.clone(), + TenantFilter::Shard(*tenant_shard_id), + Some(placement_policy.clone()), + Some(tenant_config.clone()), *generation, + None, ) .await?; } @@ -1988,7 +1979,13 @@ impl Service { let config = req.config; self.persistence - .update_tenant_config(req.tenant_id, config.clone()) + .update_tenant_shard( + TenantFilter::Tenant(req.tenant_id), + None, + Some(config.clone()), + None, + None, + ) .await?; let waiters = { @@ -2341,6 +2338,57 @@ impl Service { Ok(StatusCode::NOT_FOUND) } + /// Naming: this configures the storage controller's policies for a tenant, whereas [`Self::tenant_config_set`] is "set the TenantConfig" + /// for a tenant. 
The TenantConfig is passed through to pageservers, whereas this function modifies + /// the tenant's policies (configuration) within the storage controller + pub(crate) async fn tenant_update_policy( + &self, + tenant_id: TenantId, + req: TenantPolicyRequest, + ) -> Result<(), ApiError> { + // We require an exclusive lock, because we are updating persistent and in-memory state + let _tenant_lock = self.tenant_op_locks.exclusive(tenant_id).await; + + let TenantPolicyRequest { + placement, + scheduling, + } = req; + + self.persistence + .update_tenant_shard( + TenantFilter::Tenant(tenant_id), + placement.clone(), + None, + None, + scheduling, + ) + .await?; + + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { + if let Some(placement) = &placement { + shard.policy = placement.clone(); + + tracing::info!(tenant_id=%shard_id.tenant_id, shard_id=%shard_id.shard_slug(), + "Updated placement policy to {placement:?}"); + } + + if let Some(scheduling) = &scheduling { + shard.set_scheduling_policy(*scheduling); + + tracing::info!(tenant_id=%shard_id.tenant_id, shard_id=%shard_id.shard_slug(), + "Updated scheduling policy to {scheduling:?}"); + } + + // In case scheduling is being switched back on, try it now. + shard.schedule(scheduler).ok(); + self.maybe_reconcile_shard(shard, nodes); + } + + Ok(()) + } + pub(crate) async fn tenant_timeline_create( &self, tenant_id: TenantId, @@ -3250,6 +3298,10 @@ impl Service { placement_policy: serde_json::to_string(&policy).unwrap(), config: serde_json::to_string(&config).unwrap(), splitting: SplitState::Splitting, + + // Scheduling policies do not carry through to children + scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default()) + .unwrap(), }); } @@ -3970,6 +4022,28 @@ impl Service { reconciles_spawned } + /// Useful for tests: run whatever work a background [`Self::reconcile_all`] would have done, but + /// also wait for any generated Reconcilers to complete. Calling this until it returns zero should + /// put the system into a quiescent state where future background reconciliations won't do anything. 
+ pub(crate) async fn reconcile_all_now(&self) -> Result { + self.reconcile_all(); + + let waiters = { + let mut waiters = Vec::new(); + let locked = self.inner.read().unwrap(); + for (_tenant_shard_id, shard) in locked.tenants.iter() { + if let Some(waiter) = shard.get_waiter() { + waiters.push(waiter); + } + } + waiters + }; + + let waiter_count = waiters.len(); + self.await_waiters(waiters, RECONCILE_TIMEOUT).await?; + Ok(waiter_count) + } + pub async fn shutdown(&self) { // Note that this already stops processing any results from reconciles: so // we do not expect that our [`TenantState`] objects will reach a neat diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs index 83c921dc58..3dc3483e09 100644 --- a/control_plane/attachment_service/src/tenant_state.rs +++ b/control_plane/attachment_service/src/tenant_state.rs @@ -8,7 +8,7 @@ use crate::{ metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome}, persistence::TenantShardPersistence, }; -use pageserver_api::controller_api::PlacementPolicy; +use pageserver_api::controller_api::{PlacementPolicy, ShardSchedulingPolicy}; use pageserver_api::{ models::{LocationConfig, LocationConfigMode, TenantConfig}, shard::{ShardIdentity, TenantShardId}, @@ -116,6 +116,10 @@ pub(crate) struct TenantState { /// sending it. This is the mechanism by which compute notifications are included in the scope /// of state that we publish externally in an eventually consistent way. pub(crate) pending_compute_notification: bool, + + // Support/debug tool: if something is going wrong or flapping with scheduling, this may + // be set to a non-active state to avoid making changes while the issue is fixed. + scheduling_policy: ShardSchedulingPolicy, } #[derive(Default, Clone, Debug, Serialize)] @@ -370,6 +374,7 @@ impl TenantState { error_waiter: Arc::new(SeqWait::new(Sequence(0))), last_error: Arc::default(), pending_compute_notification: false, + scheduling_policy: ShardSchedulingPolicy::default(), } } @@ -453,6 +458,16 @@ impl TenantState { // TODO: respect the splitting bit on tenants: if they are currently splitting then we may not // change their attach location. + match self.scheduling_policy { + ShardSchedulingPolicy::Active | ShardSchedulingPolicy::Essential => {} + ShardSchedulingPolicy::Pause | ShardSchedulingPolicy::Stop => { + // Warn to make it obvious why other things aren't happening/working, if we skip scheduling + tracing::warn!(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), + "Scheduling is disabled by policy {:?}", self.scheduling_policy); + return Ok(()); + } + } + // Build the set of pageservers already in use by this tenant, to avoid scheduling // more work on the same pageservers we're already using. let mut modified = false; @@ -668,6 +683,19 @@ impl TenantState { } } + // Pre-checks done: finally check whether we may actually do the work + match self.scheduling_policy { + ShardSchedulingPolicy::Active + | ShardSchedulingPolicy::Essential + | ShardSchedulingPolicy::Pause => {} + ShardSchedulingPolicy::Stop => { + // We only reach this point if there is work to do and we're going to skip + // doing it: warn it obvious why this tenant isn't doing what it ought to. 
+ tracing::warn!("Skipping reconcile for policy {:?}", self.scheduling_policy); + return None; + } + } + // Build list of nodes from which the reconciler should detach let mut detach = Vec::new(); for node_id in self.observed.locations.keys() { @@ -804,6 +832,22 @@ impl TenantState { }) } + /// Get a waiter for any reconciliation in flight, but do not start reconciliation + /// if it is not already running + pub(crate) fn get_waiter(&self) -> Option { + if self.reconciler.is_some() { + Some(ReconcilerWaiter { + tenant_shard_id: self.tenant_shard_id, + seq_wait: self.waiter.clone(), + error_seq_wait: self.error_waiter.clone(), + error: self.last_error.clone(), + seq: self.sequence, + }) + } else { + None + } + } + /// Called when a ReconcileResult has been emitted and the service is updating /// our state: if the result is from a sequence >= my ReconcileHandle, then drop /// the handle to indicate there is no longer a reconciliation in progress. @@ -829,6 +873,36 @@ impl TenantState { debug_assert!(!self.intent.all_pageservers().contains(&node_id)); } + pub(crate) fn set_scheduling_policy(&mut self, p: ShardSchedulingPolicy) { + self.scheduling_policy = p; + } + + pub(crate) fn from_persistent( + tsp: TenantShardPersistence, + intent: IntentState, + ) -> anyhow::Result { + let tenant_shard_id = tsp.get_tenant_shard_id()?; + let shard_identity = tsp.get_shard_identity()?; + + Ok(Self { + tenant_shard_id, + shard: shard_identity, + sequence: Sequence::initial(), + generation: tsp.generation.map(|g| Generation::new(g as u32)), + policy: serde_json::from_str(&tsp.placement_policy).unwrap(), + intent, + observed: ObservedState::new(), + config: serde_json::from_str(&tsp.config).unwrap(), + reconciler: None, + splitting: tsp.splitting, + waiter: Arc::new(SeqWait::new(Sequence::initial())), + error_waiter: Arc::new(SeqWait::new(Sequence::initial())), + last_error: Arc::default(), + pending_compute_notification: false, + scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(), + }) + } + pub(crate) fn to_persistent(&self) -> TenantShardPersistence { TenantShardPersistence { tenant_id: self.tenant_shard_id.tenant_id.to_string(), @@ -840,6 +914,7 @@ impl TenantState { placement_policy: serde_json::to_string(&self.policy).unwrap(), config: serde_json::to_string(&self.config).unwrap(), splitting: SplitState::default(), + scheduling_policy: serde_json::to_string(&self.scheduling_policy).unwrap(), } } } @@ -980,4 +1055,25 @@ pub(crate) mod tests { tenant_state.intent.clear(&mut scheduler); Ok(()) } + + #[test] + fn scheduling_mode() -> anyhow::Result<()> { + let nodes = make_test_nodes(3); + let mut scheduler = Scheduler::new(nodes.values()); + + let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1)); + + // In pause mode, schedule() shouldn't do anything + tenant_state.scheduling_policy = ShardSchedulingPolicy::Pause; + assert!(tenant_state.schedule(&mut scheduler).is_ok()); + assert!(tenant_state.intent.all_pageservers().is_empty()); + + // In active mode, schedule() works + tenant_state.scheduling_policy = ShardSchedulingPolicy::Active; + assert!(tenant_state.schedule(&mut scheduler).is_ok()); + assert!(!tenant_state.intent.all_pageservers().is_empty()); + + tenant_state.intent.clear(&mut scheduler); + Ok(()) + } } diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index e33bd0f486..dcf9e38106 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -42,6 +42,12 @@ pub 
struct NodeConfigureRequest { pub scheduling: Option, } +#[derive(Serialize, Deserialize)] +pub struct TenantPolicyRequest { + pub placement: Option, + pub scheduling: Option, +} + #[derive(Serialize, Deserialize, Debug)] pub struct TenantLocateResponseShard { pub shard_id: TenantShardId, @@ -170,6 +176,32 @@ impl FromStr for NodeAvailability { } } +#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)] +pub enum ShardSchedulingPolicy { + // Normal mode: the tenant's scheduled locations may be updated at will, including + // for non-essential optimization. + Active, + + // Disable optimizations, but permit scheduling when necessary to fulfil the PlacementPolicy. + // For example, this still permits a node's attachment location to change to a secondary in + // response to a node failure, or to assign a new secondary if a node was removed. + Essential, + + // No scheduling: leave the shard running wherever it currently is. Even if the shard is + // unavailable, it will not be rescheduled to another node. + Pause, + + // No reconciling: we will make no location_conf API calls to pageservers at all. If the + // shard is unavailable, it stays that way. If a node fails, this shard doesn't get failed over. + Stop, +} + +impl Default for ShardSchedulingPolicy { + fn default() -> Self { + Self::Active + } +} + #[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)] pub enum NodeSchedulingPolicy { Active, diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 3d60f9bef5..d0519d3406 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2116,6 +2116,7 @@ class NeonStorageController(MetricsGetter): shard_count: Optional[int] = None, shard_stripe_size: Optional[int] = None, tenant_config: Optional[Dict[Any, Any]] = None, + placement_policy: Optional[str] = None, ): """ Use this rather than pageserver_api() when you need to include shard parameters @@ -2135,6 +2136,8 @@ class NeonStorageController(MetricsGetter): for k, v in tenant_config.items(): body[k] = v + body["placement_policy"] = placement_policy + response = self.request( "POST", f"{self.env.storage_controller_api}/v1/tenant", @@ -2193,6 +2196,34 @@ class NeonStorageController(MetricsGetter): log.info(f"Migrated tenant {tenant_shard_id} to pageserver {dest_ps_id}") assert self.env.get_tenant_pageserver(tenant_shard_id).id == dest_ps_id + def tenant_policy_update(self, tenant_id: TenantId, body: dict[str, Any]): + log.info(f"tenant_policy_update({tenant_id}, {body})") + self.request( + "PUT", + f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}/policy", + json=body, + headers=self.headers(TokenScope.ADMIN), + ) + + def reconcile_all(self): + r = self.request( + "POST", + f"{self.env.storage_controller_api}/debug/v1/reconcile_all", + headers=self.headers(TokenScope.ADMIN), + ) + r.raise_for_status() + n = r.json() + log.info(f"reconcile_all waited for {n} shards") + return n + + def reconcile_until_idle(self, timeout_secs=30): + start_at = time.time() + n = 1 + while n > 0: + n = self.reconcile_all() + if time.time() - start_at > timeout_secs: + raise RuntimeError("Timeout in reconcile_until_idle") + def consistency_check(self): """ Throw an exception if the service finds any inconsistencies in its state diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py index fc6c137667..c33d2ca0da 100644 --- a/test_runner/regress/test_sharding_service.py +++ 
b/test_runner/regress/test_sharding_service.py @@ -1015,3 +1015,98 @@ def test_sharding_service_re_attach(neon_env_builder: NeonEnvBuilder): "storage_controller_reconcile_complete_total", filter={"status": "ok"} ) assert reconciles_after_restart == reconciles_before_restart + + +def test_storage_controller_shard_scheduling_policy(neon_env_builder: NeonEnvBuilder): + """ + Check that emergency hooks for disabling rogue tenants' reconcilers work as expected. + """ + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + + env.storage_controller.allowed_errors.extend( + [ + # We will intentionally cause reconcile errors + ".*Reconcile error.*", + # Message from using a scheduling policy + ".*Scheduling is disabled by policy.*", + ".*Skipping reconcile for policy.*", + # Message from a node being offline + ".*Call to node .* management API .* failed", + ] + ) + + # Stop pageserver so that reconcile cannot complete + env.pageserver.stop() + + env.storage_controller.tenant_create(tenant_id, placement_policy="Detached") + + # Try attaching it: we should see reconciles failing + env.storage_controller.tenant_policy_update( + tenant_id, + { + "placement": {"Attached": 0}, + }, + ) + + def reconcile_errors() -> int: + return int( + env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "error"} + ) + or 0 + ) + + def reconcile_ok() -> int: + return int( + env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "ok"} + ) + or 0 + ) + + def assert_errors_gt(n) -> int: + e = reconcile_errors() + assert e > n + return e + + errs = wait_until(10, 1, lambda: assert_errors_gt(0)) + + # Try reconciling again, it should fail again + with pytest.raises(StorageControllerApiException): + env.storage_controller.reconcile_all() + errs = wait_until(10, 1, lambda: assert_errors_gt(errs)) + + # Configure the tenant to disable reconciles + env.storage_controller.tenant_policy_update( + tenant_id, + { + "scheduling": "Stop", + }, + ) + + # Try reconciling again, it should not cause an error (silently skip) + env.storage_controller.reconcile_all() + assert reconcile_errors() == errs + + # Start the pageserver and re-enable reconciles + env.pageserver.start() + env.storage_controller.tenant_policy_update( + tenant_id, + { + "scheduling": "Active", + }, + ) + + def assert_ok_gt(n) -> int: + o = reconcile_ok() + assert o > n + return o + + # We should see a successful reconciliation + wait_until(10, 1, lambda: assert_ok_gt(0)) + + # And indeed the tenant should be attached + assert len(env.pageserver.http_client().tenant_list_locations()["tenant_shards"]) == 1 From 25c4b676e07d582866dade5b8cbda085c0630b68 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 28 Mar 2024 14:27:15 +0000 Subject: [PATCH 0489/1571] pageserver: fix oversized key on vectored read (#7259) ## Problem During this week's deployment we observed panics due to the blobs for certain keys not fitting in the vectored read buffers. The likely cause of this is a bloated AUX_FILE_KEY caused by logical replication. ## Summary of changes This pr fixes the issue by allocating a buffer big enough to fit the widest read. It also has the benefit of saving space if all keys in the read have blobs smaller than the max vectored read size. If the soft limit for the max size of a vectored read is violated, we print a warning which includes the offending key and lsn. 
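The sizing rule itself is small; a minimal sketch, assuming the planned reads have already been reduced to their sizes in bytes (the helper name and signature here are illustrative, not the actual code):

```rust
/// Pick a read buffer size: the widest planned read wins, falling back to the
/// configured soft maximum when nothing is planned. Oversized reads are allowed
/// but warned about (the real code includes the offending key@lsn in the warning).
fn read_buffer_size(planned_read_sizes: &[usize], soft_max: usize) -> usize {
    let Some(&largest) = planned_read_sizes.iter().max() else {
        return soft_max;
    };
    if largest > soft_max {
        eprintln!("oversized vectored read: {largest} > {soft_max}");
    }
    // Also saves memory: the buffer is smaller than soft_max when every blob is small.
    largest
}
```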
A randomised (but deterministic) end to end test is also added for vectored reads on the delta layer. --- .../src/tenant/storage_layer/delta_layer.rs | 268 +++++++++++++++++- .../src/tenant/storage_layer/image_layer.rs | 21 +- pageserver/src/tenant/storage_layer/layer.rs | 12 + pageserver/src/tenant/vectored_blob_io.rs | 2 +- 4 files changed, 298 insertions(+), 5 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index b7132ee3bf..466d95f46d 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -47,6 +47,7 @@ use anyhow::{anyhow, bail, ensure, Context, Result}; use bytes::BytesMut; use camino::{Utf8Path, Utf8PathBuf}; use futures::StreamExt; +use itertools::Itertools; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::LayerAccessKind; use pageserver_api::shard::TenantShardId; @@ -946,6 +947,34 @@ impl DeltaLayerInner { Ok(planner.finish()) } + fn get_min_read_buffer_size( + planned_reads: &[VectoredRead], + read_size_soft_max: usize, + ) -> usize { + let Some(largest_read) = planned_reads.iter().max_by_key(|read| read.size()) else { + return read_size_soft_max; + }; + + let largest_read_size = largest_read.size(); + if largest_read_size > read_size_soft_max { + // If the read is oversized, it should only contain one key. + let offenders = largest_read + .blobs_at + .as_slice() + .iter() + .map(|(_, blob_meta)| format!("{}@{}", blob_meta.key, blob_meta.lsn)) + .join(", "); + tracing::warn!( + "Oversized vectored read ({} > {}) for keys {}", + largest_read_size, + read_size_soft_max, + offenders + ); + } + + largest_read_size + } + async fn do_reads_and_update_state( &self, reads: Vec, @@ -959,7 +988,8 @@ impl DeltaLayerInner { .expect("Layer is loaded with max vectored bytes config") .0 .into(); - let mut buf = Some(BytesMut::with_capacity(max_vectored_read_bytes)); + let buf_size = Self::get_min_read_buffer_size(&reads, max_vectored_read_bytes); + let mut buf = Some(BytesMut::with_capacity(buf_size)); // Note that reads are processed in reverse order (from highest key+lsn). // This is the order that `ReconstructState` requires such that it can @@ -986,7 +1016,7 @@ impl DeltaLayerInner { // We have "lost" the buffer since the lower level IO api // doesn't return the buffer on error. Allocate a new one. 
- buf = Some(BytesMut::with_capacity(max_vectored_read_bytes)); + buf = Some(BytesMut::with_capacity(buf_size)); continue; } @@ -1210,9 +1240,16 @@ impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for Del mod test { use std::collections::BTreeMap; + use itertools::MinMaxResult; + use rand::prelude::{SeedableRng, SliceRandom, StdRng}; + use rand::RngCore; + use super::*; use crate::{ - context::DownloadBehavior, task_mgr::TaskKind, tenant::disk_btree::tests::TestDisk, + context::DownloadBehavior, + task_mgr::TaskKind, + tenant::{disk_btree::tests::TestDisk, harness::TenantHarness}, + DEFAULT_PG_VERSION, }; /// Construct an index for a fictional delta layer and and then @@ -1332,4 +1369,229 @@ mod test { assert_eq!(planned_blobs, expected_blobs); } + + mod constants { + use utils::lsn::Lsn; + + /// Offset used by all lsns in this test + pub(super) const LSN_OFFSET: Lsn = Lsn(0x08); + /// Number of unique keys including in the test data + pub(super) const KEY_COUNT: u8 = 60; + /// Max number of different lsns for each key + pub(super) const MAX_ENTRIES_PER_KEY: u8 = 20; + /// Possible value sizes for each key along with a probability weight + pub(super) const VALUE_SIZES: [(usize, u8); 3] = [(100, 2), (1024, 2), (1024 * 1024, 1)]; + /// Probability that there will be a gap between the current key and the next one (33.3%) + pub(super) const KEY_GAP_CHANGES: [(bool, u8); 2] = [(true, 1), (false, 2)]; + /// The minimum size of a key range in all the generated reads + pub(super) const MIN_RANGE_SIZE: i128 = 10; + /// The number of ranges included in each vectored read + pub(super) const RANGES_COUNT: u8 = 2; + /// The number of vectored reads performed + pub(super) const READS_COUNT: u8 = 100; + /// Soft max size of a vectored read. 
Will be violated if we have to read keys + /// with values larger than the limit + pub(super) const MAX_VECTORED_READ_BYTES: usize = 64 * 1024; + } + + struct Entry { + key: Key, + lsn: Lsn, + value: Vec, + } + + fn generate_entries(rng: &mut StdRng) -> Vec { + let mut current_key = Key::MIN; + + let mut entries = Vec::new(); + for _ in 0..constants::KEY_COUNT { + let count = rng.gen_range(1..constants::MAX_ENTRIES_PER_KEY); + let mut lsns_iter = + std::iter::successors(Some(Lsn(constants::LSN_OFFSET.0 + 0x08)), |lsn| { + Some(Lsn(lsn.0 + 0x08)) + }); + let mut lsns = Vec::new(); + while lsns.len() < count as usize { + let take = rng.gen_bool(0.5); + let lsn = lsns_iter.next().unwrap(); + if take { + lsns.push(lsn); + } + } + + for lsn in lsns { + let size = constants::VALUE_SIZES + .choose_weighted(rng, |item| item.1) + .unwrap() + .0; + let mut buf = vec![0; size]; + rng.fill_bytes(&mut buf); + + entries.push(Entry { + key: current_key, + lsn, + value: buf, + }) + } + + let gap = constants::KEY_GAP_CHANGES + .choose_weighted(rng, |item| item.1) + .unwrap() + .0; + if gap { + current_key = current_key.add(2); + } else { + current_key = current_key.add(1); + } + } + + entries + } + + struct EntriesMeta { + key_range: Range, + lsn_range: Range, + index: BTreeMap<(Key, Lsn), Vec>, + } + + fn get_entries_meta(entries: &[Entry]) -> EntriesMeta { + let key_range = match entries.iter().minmax_by_key(|e| e.key) { + MinMaxResult::MinMax(min, max) => min.key..max.key.next(), + _ => panic!("More than one entry is always expected"), + }; + + let lsn_range = match entries.iter().minmax_by_key(|e| e.lsn) { + MinMaxResult::MinMax(min, max) => min.lsn..Lsn(max.lsn.0 + 1), + _ => panic!("More than one entry is always expected"), + }; + + let mut index = BTreeMap::new(); + for entry in entries.iter() { + index.insert((entry.key, entry.lsn), entry.value.clone()); + } + + EntriesMeta { + key_range, + lsn_range, + index, + } + } + + fn pick_random_keyspace(rng: &mut StdRng, key_range: &Range) -> KeySpace { + let start = key_range.start.to_i128(); + let end = key_range.end.to_i128(); + + let mut keyspace = KeySpace::default(); + + for _ in 0..constants::RANGES_COUNT { + let mut range: Option> = Option::default(); + while range.is_none() || keyspace.overlaps(range.as_ref().unwrap()) { + let range_start = rng.gen_range(start..end); + let range_end_offset = range_start + constants::MIN_RANGE_SIZE; + if range_end_offset >= end { + range = Some(Key::from_i128(range_start)..Key::from_i128(end)); + } else { + let range_end = rng.gen_range((range_start + constants::MIN_RANGE_SIZE)..end); + range = Some(Key::from_i128(range_start)..Key::from_i128(range_end)); + } + } + keyspace.ranges.push(range.unwrap()); + } + + keyspace + } + + #[tokio::test] + async fn test_delta_layer_vectored_read_end_to_end() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_delta_layer_oversized_vectored_read")?; + let (tenant, ctx) = harness.load().await; + + let timeline_id = TimelineId::generate(); + let timeline = tenant + .create_test_timeline(timeline_id, constants::LSN_OFFSET, DEFAULT_PG_VERSION, &ctx) + .await?; + + tracing::info!("Generating test data ..."); + + let rng = &mut StdRng::seed_from_u64(0); + let entries = generate_entries(rng); + let entries_meta = get_entries_meta(&entries); + + tracing::info!("Done generating {} entries", entries.len()); + + tracing::info!("Writing test data to delta layer ..."); + let mut writer = DeltaLayerWriter::new( + harness.conf, + timeline_id, + harness.tenant_shard_id, + 
entries_meta.key_range.start, + entries_meta.lsn_range.clone(), + ) + .await?; + + for entry in entries { + let (_, res) = writer + .put_value_bytes(entry.key, entry.lsn, entry.value, false) + .await; + res?; + } + + let resident = writer.finish(entries_meta.key_range.end, &timeline).await?; + + let inner = resident.get_inner_delta(&ctx).await?; + + let file_size = inner.file.metadata().await?.len(); + tracing::info!( + "Done writing test data to delta layer. Resulting file size is: {}", + file_size + ); + + for i in 0..constants::READS_COUNT { + tracing::info!("Doing vectored read {}/{}", i + 1, constants::READS_COUNT); + + let block_reader = FileBlockReader::new(&inner.file, inner.file_id); + let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + inner.index_start_blk, + inner.index_root_blk, + block_reader, + ); + + let planner = VectoredReadPlanner::new(constants::MAX_VECTORED_READ_BYTES); + let mut reconstruct_state = ValuesReconstructState::new(); + let keyspace = pick_random_keyspace(rng, &entries_meta.key_range); + let data_end_offset = inner.index_start_blk as u64 * PAGE_SZ as u64; + + let vectored_reads = DeltaLayerInner::plan_reads( + keyspace.clone(), + entries_meta.lsn_range.clone(), + data_end_offset, + index_reader, + planner, + &mut reconstruct_state, + &ctx, + ) + .await?; + + let vectored_blob_reader = VectoredBlobReader::new(&inner.file); + let buf_size = DeltaLayerInner::get_min_read_buffer_size( + &vectored_reads, + constants::MAX_VECTORED_READ_BYTES, + ); + let mut buf = Some(BytesMut::with_capacity(buf_size)); + + for read in vectored_reads { + let blobs_buf = vectored_blob_reader + .read_blobs(&read, buf.take().expect("Should have a buffer")) + .await?; + for meta in blobs_buf.blobs.iter() { + let value = &blobs_buf.buf[meta.start..meta.end]; + assert_eq!(value, entries_meta.index[&(meta.meta.key, meta.meta.lsn)]); + } + + buf = Some(blobs_buf.buf); + } + } + + Ok(()) + } } diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 14c79e413c..5b44d2bc2c 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -44,6 +44,7 @@ use anyhow::{anyhow, bail, ensure, Context, Result}; use bytes::{Bytes, BytesMut}; use camino::{Utf8Path, Utf8PathBuf}; use hex; +use itertools::Itertools; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::LayerAccessKind; use pageserver_api::shard::TenantShardId; @@ -540,7 +541,25 @@ impl ImageLayerInner { let vectored_blob_reader = VectoredBlobReader::new(&self.file); for read in reads.into_iter() { - let buf = BytesMut::with_capacity(max_vectored_read_bytes); + let buf_size = read.size(); + + if buf_size > max_vectored_read_bytes { + // If the read is oversized, it should only contain one key. 
+ let offenders = read + .blobs_at + .as_slice() + .iter() + .map(|(_, blob_meta)| format!("{}@{}", blob_meta.key, blob_meta.lsn)) + .join(", "); + tracing::warn!( + "Oversized vectored read ({} > {}) for keys {}", + buf_size, + max_vectored_read_bytes, + offenders + ); + } + + let buf = BytesMut::with_capacity(buf_size); let res = vectored_blob_reader.read_blobs(&read, buf).await; match res { diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 8ba37b5a86..27e60f783c 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1759,6 +1759,18 @@ impl ResidentLayer { pub(crate) fn metadata(&self) -> LayerFileMetadata { self.owner.metadata() } + + #[cfg(test)] + pub(crate) async fn get_inner_delta<'a>( + &'a self, + ctx: &RequestContext, + ) -> anyhow::Result<&'a delta_layer::DeltaLayerInner> { + let owner = &self.owner.0; + match self.downloaded.get(owner, ctx).await? { + LayerKind::Delta(d) => Ok(d), + LayerKind::Image(_) => Err(anyhow::anyhow!("Expected a delta layer")), + } + } } impl AsLayerDesc for ResidentLayer { diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 805f70b23b..3a6950cf88 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -61,7 +61,7 @@ pub struct VectoredRead { } impl VectoredRead { - fn size(&self) -> usize { + pub fn size(&self) -> usize { (self.end - self.start) as usize } } From be1d8fc4f73718afc919276701a9b180c809161f Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Thu, 28 Mar 2024 11:24:36 -0400 Subject: [PATCH 0490/1571] fix: drop replication slot causes postgres stuck on exit (#7192) Fix https://github.com/neondatabase/neon/issues/6969 Ref https://github.com/neondatabase/postgres/pull/395 https://github.com/neondatabase/postgres/pull/396 Postgres will stuck on exit if the replication slot is not dropped before shutting down. This is caused by Neon's custom WAL record to record replication slots. The pull requests in the postgres repo fixes the problem, and this pull request bumps the postgres commit. 
--------- Signed-off-by: Alex Chi Z --- .../regress/test_logical_replication.py | 64 +++++++++++++++++++ vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/revisions.json | 4 +- 4 files changed, 68 insertions(+), 4 deletions(-) diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index 3f4ca8070d..1bac528397 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -364,3 +364,67 @@ def test_slots_and_branching(neon_simple_env: NeonEnv): # Check that we can create slot with the same name ws_cur = ws_branch.connect().cursor() ws_cur.execute("select pg_create_logical_replication_slot('my_slot', 'pgoutput')") + + +def test_replication_shutdown(neon_simple_env: NeonEnv): + # Ensure Postgres can exit without stuck when a replication job is active + neon extension installed + env = neon_simple_env + env.neon_cli.create_branch("test_replication_shutdown_publisher", "empty") + pub = env.endpoints.create("test_replication_shutdown_publisher") + + env.neon_cli.create_branch("test_replication_shutdown_subscriber") + sub = env.endpoints.create("test_replication_shutdown_subscriber") + + pub.respec(skip_pg_catalog_updates=False) + pub.start() + + sub.respec(skip_pg_catalog_updates=False) + sub.start() + + pub.wait_for_migrations() + sub.wait_for_migrations() + + with pub.cursor() as cur: + cur.execute( + "CREATE ROLE mr_whiskers WITH PASSWORD 'cat' LOGIN INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser" + ) + cur.execute("CREATE DATABASE neondb WITH OWNER mr_whiskers") + cur.execute("GRANT ALL PRIVILEGES ON DATABASE neondb TO neon_superuser") + + # If we don't do this, creating the subscription will fail later on PG16 + pub.edit_hba(["host all mr_whiskers 0.0.0.0/0 md5"]) + + with sub.cursor() as cur: + cur.execute( + "CREATE ROLE mr_whiskers WITH PASSWORD 'cat' LOGIN INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser" + ) + cur.execute("CREATE DATABASE neondb WITH OWNER mr_whiskers") + cur.execute("GRANT ALL PRIVILEGES ON DATABASE neondb TO neon_superuser") + + with pub.cursor(dbname="neondb", user="mr_whiskers", password="cat") as cur: + cur.execute("CREATE PUBLICATION pub FOR ALL TABLES") + cur.execute("CREATE TABLE t (a int)") + cur.execute("INSERT INTO t VALUES (10), (20)") + cur.execute("SELECT * from t") + res = cur.fetchall() + assert [r[0] for r in res] == [10, 20] + + with sub.cursor(dbname="neondb", user="mr_whiskers", password="cat") as cur: + cur.execute("CREATE TABLE t (a int)") + + pub_conn = f"host=localhost port={pub.pg_port} dbname=neondb user=mr_whiskers password=cat" + query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_conn}' PUBLICATION pub" + log.info(f"Creating subscription: {query}") + cur.execute(query) + + with pub.cursor(dbname="neondb", user="mr_whiskers", password="cat") as pcur: + pcur.execute("INSERT INTO t VALUES (30), (40)") + + def check_that_changes_propagated(): + cur.execute("SELECT * FROM t") + res = cur.fetchall() + log.info(res) + assert len(res) == 4 + assert [r[0] for r in res] == [10, 20, 30, 40] + + wait_until(10, 0.5, check_that_changes_propagated) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 748643b468..a7b4c66156 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 748643b4683e9fe3b105011a6ba8a687d032cd65 +Subproject commit a7b4c66156bce00afa60e5592d4284ba9e40b4cf diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 
e7651e79c0..64b8c7bccc 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit e7651e79c0c27fbddc3c724f5b9553222c28e395 +Subproject commit 64b8c7bccc6b77e04795e2d4cf6ad82dc8d987ed diff --git a/vendor/revisions.json b/vendor/revisions.json index 3c1b866137..75dc095168 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { "postgres-v16": "3946b2e2ea71d07af092099cb5bcae76a69b90d6", - "postgres-v15": "e7651e79c0c27fbddc3c724f5b9553222c28e395", - "postgres-v14": "748643b4683e9fe3b105011a6ba8a687d032cd65" + "postgres-v15": "64b8c7bccc6b77e04795e2d4cf6ad82dc8d987ed", + "postgres-v14": "a7b4c66156bce00afa60e5592d4284ba9e40b4cf" } From 722f271f6eb339f3bf5ce72e78608f2e6e527b63 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 28 Mar 2024 15:28:58 +0000 Subject: [PATCH 0491/1571] Specify caller in 'unexpected response from page server' error (#7272) Tiny improvement for log messages to investigate https://github.com/neondatabase/cloud/issues/11559 --- pgxn/neon/pagestore_smgr.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index ecc8ddb384..b33cfab2bb 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -1688,7 +1688,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) break; default: - neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_exists", resp->tag); } pfree(resp); return exists; @@ -2224,7 +2224,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, ((NeonErrorResponse *) resp)->message))); break; default: - neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_read_at_lsn", resp->tag); } /* buffer was used, clean up for later reuse */ @@ -2497,7 +2497,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) break; default: - neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_nblocks", resp->tag); } update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks); @@ -2552,7 +2552,7 @@ neon_dbsize(Oid dbNode) break; default: - neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_dbsize", resp->tag); } neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes", @@ -2857,7 +2857,7 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf break; default: - neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_read_slru_segment", resp->tag); } pfree(resp); From c52b80b930f0cb7106f5474a70bdcea4b5883579 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 28 Mar 2024 16:51:45 +0000 Subject: [PATCH 0492/1571] CI(deploy): Do not deploy storage controller to preprod for proxy releases (#7269) ## Problem Proxy release to a preprod automatically triggers a deployment of storage controller (`deployStorageController=true` by default) ## Summary of changes - Set `deployStorageController=false` for proxy releases to preprod - Set explicitly `deployStorageController=true` for storage releases to preprod 
and prod --- .github/workflows/build_and_test.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index d27713f083..36922d5294 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1127,6 +1127,7 @@ jobs: -f deployProxy=false \ -f deployStorage=true \ -f deployStorageBroker=true \ + -f deployStorageController=true \ -f branch=main \ -f dockerTag=${{needs.tag.outputs.build-tag}} \ -f deployPreprodRegion=true @@ -1136,6 +1137,7 @@ jobs: -f deployProxy=false \ -f deployStorage=true \ -f deployStorageBroker=true \ + -f deployStorageController=true \ -f branch=main \ -f dockerTag=${{needs.tag.outputs.build-tag}} elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then @@ -1144,6 +1146,7 @@ jobs: -f deployProxy=true \ -f deployStorage=false \ -f deployStorageBroker=false \ + -f deployStorageController=false \ -f branch=main \ -f dockerTag=${{needs.tag.outputs.build-tag}} \ -f deployPreprodRegion=true From 90be79fcf5fa94d81254a79e4555248bc8c68fa2 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Thu, 28 Mar 2024 13:22:35 -0400 Subject: [PATCH 0493/1571] spec: allow neon extension auto-upgrade + softfail upgrade (#7231) reverts https://github.com/neondatabase/neon/pull/7128, unblocks https://github.com/neondatabase/cloud/issues/10742 --------- Signed-off-by: Alex Chi Z --- compute_tools/src/spec.rs | 23 ++++++++------- test_runner/regress/test_neon_extension.py | 34 ++++++++++++++++++++++ 2 files changed, 47 insertions(+), 10 deletions(-) diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 4006062fc2..5643634633 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -743,21 +743,24 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> { // which may happen in two cases: // - extension was just installed // - extension was already installed and is up to date - // DISABLED due to compute node unpinning epic - // let query = "ALTER EXTENSION neon UPDATE"; - // info!("update neon extension version with query: {}", query); - // client.simple_query(query)?; + let query = "ALTER EXTENSION neon UPDATE"; + info!("update neon extension version with query: {}", query); + if let Err(e) = client.simple_query(query) { + error!( + "failed to upgrade neon extension during `handle_extension_neon`: {}", + e + ); + } Ok(()) } #[instrument(skip_all)] -pub fn handle_neon_extension_upgrade(_client: &mut Client) -> Result<()> { - info!("handle neon extension upgrade (not really)"); - // DISABLED due to compute node unpinning epic - // let query = "ALTER EXTENSION neon UPDATE"; - // info!("update neon extension version with query: {}", query); - // client.simple_query(query)?; +pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> { + info!("handle neon extension upgrade"); + let query = "ALTER EXTENSION neon UPDATE"; + info!("update neon extension version with query: {}", query); + client.simple_query(query)?; Ok(()) } diff --git a/test_runner/regress/test_neon_extension.py b/test_runner/regress/test_neon_extension.py index e31e1cab51..39b4865026 100644 --- a/test_runner/regress/test_neon_extension.py +++ b/test_runner/regress/test_neon_extension.py @@ -1,3 +1,4 @@ +import time from contextlib import closing from fixtures.log_helper import log @@ -43,6 +44,12 @@ def test_neon_extension_compatibility(neon_env_builder: NeonEnvBuilder): with closing(endpoint_main.connect()) as conn: with conn.cursor() as cur: + cur.execute("SELECT 
extversion from pg_extension where extname='neon'") + # IMPORTANT: + # If the version has changed, the test should be updated. + # Ensure that the default version is also updated in the neon.control file + assert cur.fetchone() == ("1.3",) + cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE") all_versions = ["1.3", "1.2", "1.1", "1.0"] current_version = "1.3" for idx, begin_version in enumerate(all_versions): @@ -60,3 +67,30 @@ def test_neon_extension_compatibility(neon_env_builder: NeonEnvBuilder): cur.execute( f"ALTER EXTENSION neon UPDATE TO '{begin_version}'; -- {target_version}->{begin_version}" ) + + +# Verify that the neon extension can be auto-upgraded to the latest version. +def test_neon_extension_auto_upgrade(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + env.neon_cli.create_branch("test_neon_extension_auto_upgrade") + + endpoint_main = env.endpoints.create("test_neon_extension_auto_upgrade") + # don't skip pg_catalog updates - it runs CREATE EXTENSION neon + endpoint_main.respec(skip_pg_catalog_updates=False) + endpoint_main.start() + + with closing(endpoint_main.connect()) as conn: + with conn.cursor() as cur: + cur.execute("ALTER EXTENSION neon UPDATE TO '1.0';") + cur.execute("SELECT extversion from pg_extension where extname='neon'") + assert cur.fetchone() == ("1.0",) # Ensure the extension gets downgraded + + endpoint_main.stop() + time.sleep(1) + endpoint_main.start() + time.sleep(1) + + with closing(endpoint_main.connect()) as conn: + with conn.cursor() as cur: + cur.execute("SELECT extversion from pg_extension where extname='neon'") + assert cur.fetchone() != ("1.0",) # Ensure the extension gets upgraded From 39d1818ae982f1c703a481e510dbefd92d614fde Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 28 Mar 2024 17:38:08 +0000 Subject: [PATCH 0494/1571] storage controller: be more tolerant of control plane blocking notifications (#7268) ## Problem - Control plane can deadlock if it calls into a function that requires reconciliation to complete, while refusing compute notification hooks API calls. ## Summary of changes - Fail faster in the notify path in 438 errors: these were originally expected to be transient, but in practice it's more common that a 438 results from an operation blocking on the currently API call, rather than something happening in the background. - In ensure_attached, relax the condition for spawning a reconciler: instead of just the general maybe_reconcile path, do a pre-check that skips trying to reconcile if the shard appears to be attached. This avoids doing work in cases where the tenant is attached, but is dirty from a reconciliation point of view, e.g. due to a failed compute notification. 
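The pre-check boils down to something like the sketch below (simplified: the real code consults the shard's intent and `stably_attached()`; the free function and generic `Id` are illustrative assumptions):

```rust
/// Decide whether ensure_attached needs to spawn (and wait on) a reconciler.
/// `Id` stands in for the real node identifier type.
fn needs_reconcile_for_attach<Id: PartialEq>(
    intent_attached: Option<Id>,  // where we want the shard attached
    stably_attached: Option<Id>,  // where it is attached and observed right now
) -> bool {
    match (intent_attached, stably_attached) {
        // Attached on the intended node already: leave secondary locations and
        // compute notifications to the background reconciler.
        (Some(want), Some(have)) if want == have => false,
        // Not attached yet, or attached on the wrong node: reconcile now.
        _ => true,
    }
}
```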
--- .../attachment_service/src/compute_hook.rs | 17 +++++++------ .../attachment_service/src/service.rs | 21 +++++++++++++--- test_runner/regress/test_sharding_service.py | 25 +++++++++++++++++-- 3 files changed, 50 insertions(+), 13 deletions(-) diff --git a/control_plane/attachment_service/src/compute_hook.rs b/control_plane/attachment_service/src/compute_hook.rs index bebc62ac2f..1a8dc6b86d 100644 --- a/control_plane/attachment_service/src/compute_hook.rs +++ b/control_plane/attachment_service/src/compute_hook.rs @@ -14,7 +14,6 @@ use utils::{ use crate::service::Config; -const BUSY_DELAY: Duration = Duration::from_secs(1); const SLOWDOWN_DELAY: Duration = Duration::from_secs(5); pub(crate) const API_CONCURRENCY: usize = 32; @@ -280,11 +279,10 @@ impl ComputeHook { Err(NotifyError::SlowDown) } StatusCode::LOCKED => { - // Delay our retry if busy: the usual fast exponential backoff in backoff::retry - // is not appropriate - tokio::time::timeout(BUSY_DELAY, cancel.cancelled()) - .await - .ok(); + // We consider this fatal, because it's possible that the operation blocking the control one is + // also the one that is waiting for this reconcile. We should let the reconciler calling + // this hook fail, to give control plane a chance to un-lock. + tracing::info!("Control plane reports tenant is locked, dropping out of notify"); Err(NotifyError::Busy) } StatusCode::SERVICE_UNAVAILABLE @@ -306,7 +304,12 @@ impl ComputeHook { let client = reqwest::Client::new(); backoff::retry( || self.do_notify_iteration(&client, url, &reconfigure_request, cancel), - |e| matches!(e, NotifyError::Fatal(_) | NotifyError::Unexpected(_)), + |e| { + matches!( + e, + NotifyError::Fatal(_) | NotifyError::Unexpected(_) | NotifyError::Busy + ) + }, 3, 10, "Send compute notification", diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index cceecebb7f..fe2358abae 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -3936,9 +3936,6 @@ impl Service { /// Helper for methods that will try and call pageserver APIs for /// a tenant, such as timeline CRUD: they cannot proceed unless the tenant /// is attached somewhere. - /// - /// TODO: this doesn't actually ensure attached unless the PlacementPolicy is - /// an attached policy. We should error out if it isn't. fn ensure_attached_schedule( &self, mut locked: std::sync::RwLockWriteGuard<'_, ServiceState>, @@ -3947,10 +3944,26 @@ impl Service { let mut waiters = Vec::new(); let (nodes, tenants, scheduler) = locked.parts_mut(); - for (_tenant_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { + for (tenant_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { shard.schedule(scheduler)?; + // The shard's policies may not result in an attached location being scheduled: this + // is an error because our caller needs it attached somewhere. + if shard.intent.get_attached().is_none() { + return Err(anyhow::anyhow!( + "Tenant {tenant_id} not scheduled to be attached" + )); + }; + + if shard.stably_attached().is_some() { + // We do not require the shard to be totally up to date on reconciliation: we just require + // that it has been attached on the intended node. Other dirty state such as unattached secondary + // locations, or compute hook notifications can be ignored. 
+ continue; + } + if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) { + tracing::info!("Waiting for shard {tenant_shard_id} to reconcile, in order to ensure it is attached"); waiters.push(waiter); } } diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py index c33d2ca0da..5a86e03d2b 100644 --- a/test_runner/regress/test_sharding_service.py +++ b/test_runner/regress/test_sharding_service.py @@ -433,10 +433,13 @@ def test_sharding_service_compute_hook( # Set up fake HTTP notify endpoint notifications = [] + handle_params = {"status": 200} + def handler(request: Request): - log.info(f"Notify request: {request}") + status = handle_params["status"] + log.info(f"Notify request[{status}]: {request}") notifications.append(request.json) - return Response(status=200) + return Response(status=status) httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler) @@ -504,6 +507,24 @@ def test_sharding_service_compute_hook( wait_until(10, 1, received_split_notification) + # If the compute hook is unavailable, that should not block creating a tenant and + # creating a timeline. This simulates a control plane refusing to accept notifications + handle_params["status"] = 423 + degraded_tenant_id = TenantId.generate() + degraded_timeline_id = TimelineId.generate() + env.storage_controller.tenant_create(degraded_tenant_id) + env.storage_controller.pageserver_api().timeline_create( + PgVersion.NOT_SET, degraded_tenant_id, degraded_timeline_id + ) + + # Ensure we hit the handler error path + env.storage_controller.allowed_errors.append( + ".*Failed to notify compute of attached pageserver.*tenant busy.*" + ) + env.storage_controller.allowed_errors.append(".*Reconcile error.*tenant busy.*") + assert notifications[-1] is not None + assert notifications[-1]["tenant_id"] == str(degraded_tenant_id) + env.storage_controller.consistency_check() From 090123a4292d56c811a39a7a59a918b7114fd85f Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 28 Mar 2024 17:44:55 +0000 Subject: [PATCH 0495/1571] pageserver: check for new image layers based on ingested WAL (#7230) ## Problem Part of the legacy (but current) compaction algorithm is to find a stack of overlapping delta layers which will be turned into an image layer. This operation is exponential in terms of the number of matching layers and we do it roughly every 20 seconds. ## Summary of changes Only check if a new image layer is required if we've ingested a certain amount of WAL since the last check. The amount of wal is expressed in terms of multiples of checkpoint distance, with the intuition being that that there's little point doing the check if we only have two new L1 layers (not enough to create a new image). 
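The gate is a plain distance comparison; a minimal sketch, with plain integers standing in for `Lsn` and illustrative names:

```rust
/// Only re-run the (expensive) image layer check once `check_threshold`
/// multiples of the checkpoint distance have been ingested since the last check.
fn should_check_for_new_image_layers(
    last_checked_at: u64,     // LSN at which the previous check ran
    current_lsn: u64,         // LSN we are compacting up to
    checkpoint_distance: u64, // tenant's checkpoint_distance setting
    check_threshold: u64,     // image_layer_creation_check_threshold, e.g. 2
) -> bool {
    current_lsn.saturating_sub(last_checked_at) >= check_threshold * checkpoint_distance
}
```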
--- control_plane/src/pageserver.rs | 10 ++++++ libs/pageserver_api/src/models.rs | 1 + pageserver/src/tenant.rs | 3 ++ pageserver/src/tenant/config.rs | 15 +++++++++ pageserver/src/tenant/timeline.rs | 31 +++++++++++++++++++ .../regress/test_attach_tenant_config.py | 1 + test_runner/regress/test_layer_eviction.py | 1 + .../regress/test_layers_from_future.py | 1 + test_runner/regress/test_ondemand_download.py | 5 ++- .../regress/test_pageserver_generations.py | 1 + test_runner/regress/test_remote_storage.py | 1 + 11 files changed, 69 insertions(+), 1 deletion(-) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index c5eabc46db..abf815f07a 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -389,6 +389,10 @@ impl PageServerNode { .remove("image_creation_threshold") .map(|x| x.parse::()) .transpose()?, + image_layer_creation_check_threshold: settings + .remove("image_layer_creation_check_threshold") + .map(|x| x.parse::()) + .transpose()?, pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()), walreceiver_connect_timeout: settings .remove("walreceiver_connect_timeout") @@ -501,6 +505,12 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'image_creation_threshold' as non zero integer")?, + image_layer_creation_check_threshold: settings + .remove("image_layer_creation_check_threshold") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'image_creation_check_threshold' as integer")?, + pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()), walreceiver_connect_timeout: settings .remove("walreceiver_connect_timeout") diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index aad4cc97fc..ad4ca6710d 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -301,6 +301,7 @@ pub struct TenantConfig { pub heatmap_period: Option, pub lazy_slru_download: Option, pub timeline_get_throttle: Option, + pub image_layer_creation_check_threshold: Option, } #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 792d9e548d..0806ef0cf4 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3653,6 +3653,9 @@ pub(crate) mod harness { heatmap_period: Some(tenant_conf.heatmap_period), lazy_slru_download: Some(tenant_conf.lazy_slru_download), timeline_get_throttle: Some(tenant_conf.timeline_get_throttle), + image_layer_creation_check_threshold: Some( + tenant_conf.image_layer_creation_check_threshold, + ), } } } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 53a8c97e23..a2bb479f63 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -57,6 +57,9 @@ pub mod defaults { // throughputs up to 1GiB/s per timeline. pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024; pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour"; + // By default ingest enough WAL for two new L0 layers before checking if new image + // image layers should be created. + pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2; pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; } @@ -362,6 +365,10 @@ pub struct TenantConf { pub lazy_slru_download: bool, pub timeline_get_throttle: pageserver_api::models::ThrottleConfig, + + // How much WAL must be ingested before checking again whether a new image layer is required. 
+ // Expresed in multiples of checkpoint distance. + pub image_layer_creation_check_threshold: u8, } /// Same as TenantConf, but this struct preserves the information about @@ -454,6 +461,9 @@ pub struct TenantConfOpt { #[serde(skip_serializing_if = "Option::is_none")] pub timeline_get_throttle: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + pub image_layer_creation_check_threshold: Option, } impl TenantConfOpt { @@ -508,6 +518,9 @@ impl TenantConfOpt { .timeline_get_throttle .clone() .unwrap_or(global_conf.timeline_get_throttle), + image_layer_creation_check_threshold: self + .image_layer_creation_check_threshold + .unwrap_or(global_conf.image_layer_creation_check_threshold), } } } @@ -548,6 +561,7 @@ impl Default for TenantConf { heatmap_period: Duration::ZERO, lazy_slru_download: false, timeline_get_throttle: crate::tenant::throttle::Config::disabled(), + image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD, } } } @@ -621,6 +635,7 @@ impl From for models::TenantConfig { heatmap_period: value.heatmap_period.map(humantime), lazy_slru_download: value.lazy_slru_download, timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from), + image_layer_creation_check_threshold: value.image_layer_creation_check_threshold, } } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index bc3fc1df1f..f3565c1fb3 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -309,6 +309,8 @@ pub struct Timeline { /// Configuration: how often should the partitioning be recalculated. repartition_threshold: u64, + last_image_layer_creation_check_at: AtomicLsn, + /// Current logical size of the "datadir", at the last LSN. current_logical_size: LogicalSize, @@ -1632,6 +1634,15 @@ impl Timeline { .unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold) } + fn get_image_layer_creation_check_threshold(&self) -> u8 { + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + tenant_conf.image_layer_creation_check_threshold.unwrap_or( + self.conf + .default_tenant_conf + .image_layer_creation_check_threshold, + ) + } + pub(super) fn tenant_conf_updated(&self) { // NB: Most tenant conf options are read by background loops, so, // changes will automatically be picked up. @@ -1769,6 +1780,7 @@ impl Timeline { }, partitioning: tokio::sync::Mutex::new((KeyPartitioning::new(), Lsn(0))), repartition_threshold: 0, + last_image_layer_creation_check_at: AtomicLsn::new(0), last_received_wal: Mutex::new(None), rel_size_cache: RwLock::new(HashMap::new()), @@ -1797,6 +1809,7 @@ impl Timeline { }; result.repartition_threshold = result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE; + result .metrics .last_record_gauge @@ -3501,6 +3514,24 @@ impl Timeline { // Is it time to create a new image layer for the given partition? async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool { + let last = self.last_image_layer_creation_check_at.load(); + if lsn != Lsn(0) { + let distance = lsn + .checked_sub(last) + .expect("Attempt to compact with LSN going backwards"); + + let min_distance = self.get_image_layer_creation_check_threshold() as u64 + * self.get_checkpoint_distance(); + + // Skip the expensive delta layer counting below if we've not ingested + // sufficient WAL since the last check. 
+ if distance.0 < min_distance { + return false; + } + } + + self.last_image_layer_creation_check_at.store(lsn); + let threshold = self.get_image_creation_threshold(); let guard = self.layers.read().await; diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 3058926b25..909d25980b 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -189,6 +189,7 @@ def test_fully_custom_config(positive_env: NeonEnv): }, "trace_read_requests": True, "walreceiver_connect_timeout": "13m", + "image_layer_creation_check_threshold": 1, } ps_http = env.pageserver.http_client() diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py index 7bbc0cc160..fefb30bbdd 100644 --- a/test_runner/regress/test_layer_eviction.py +++ b/test_runner/regress/test_layer_eviction.py @@ -165,6 +165,7 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder): "compaction_threshold": "3", # "image_creation_threshold": set at runtime "compaction_target_size": f"{128 * (1024**2)}", # make it so that we only have 1 partition => image coverage for delta layers => enables gc of delta layers + "image_layer_creation_check_threshold": "0", # always check if a new image layer can be created } def tenant_update_config(changes): diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index ca4295c5cb..f311a8bf2c 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -53,6 +53,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): "checkpoint_timeout": "24h", # something we won't reach "checkpoint_distance": f"{50 * (1024**2)}", # something we won't reach, we checkpoint manually "image_creation_threshold": "100", # we want to control when image is created + "image_layer_creation_check_threshold": "0", "compaction_threshold": f"{l0_l1_threshold}", "compaction_target_size": f"{128 * (1024**3)}", # make it so that we only have 1 partition => image coverage for delta layers => enables gc of delta layers } diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 914f068afb..ba0d53704b 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -568,6 +568,8 @@ def test_compaction_downloads_on_demand_with_image_creation(neon_env_builder: Ne "image_creation_threshold": 100, # repartitioning parameter, unused "compaction_target_size": 128 * 1024**2, + # Always check if a new image layer can be created + "image_layer_creation_check_threshold": 0, # pitr_interval and gc_horizon are not interesting because we dont run gc } @@ -632,7 +634,8 @@ def test_compaction_downloads_on_demand_with_image_creation(neon_env_builder: Ne # threshold to expose image creation to downloading all of the needed # layers -- threshold of 2 would sound more reasonable, but keeping it as 1 # to be less flaky - env.neon_cli.config_tenant(tenant_id, {"image_creation_threshold": "1"}) + conf["image_creation_threshold"] = "1" + env.neon_cli.config_tenant(tenant_id, {k: str(v) for k, v in conf.items()}) pageserver_http.timeline_compact(tenant_id, timeline_id) layers = pageserver_http.layer_map_info(tenant_id, timeline_id) diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 56b4548b64..41fa03cdf8 100644 --- 
a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -53,6 +53,7 @@ TENANT_CONF = { "compaction_period": "0s", # create image layers eagerly, so that GC can remove some layers "image_creation_threshold": "1", + "image_layer_creation_check_threshold": "0", } diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 986d6c4dbf..47200a856e 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -245,6 +245,7 @@ def test_remote_storage_upload_queue_retries( "compaction_period": "0s", # create image layers eagerly, so that GC can remove some layers "image_creation_threshold": "1", + "image_layer_creation_check_threshold": "0", } ) From 63213fc814624145bab00aefc9c9d4ee167b27bb Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 28 Mar 2024 18:48:52 +0000 Subject: [PATCH 0496/1571] storage controller: scheduling optimization for sharded tenants (#7181) ## Problem - When we scheduled locations, we were doing it without any context about other shards in the same tenant - After a shard split, there wasn't an automatic mechanism to migrate the attachments away from the split location - After a shard split and the migration away from the split location, there wasn't an automatic mechanism to pick new secondary locations so that the end state has no concentration of locations on the nodes where the split happened. Partially completes: https://github.com/neondatabase/neon/issues/7139 ## Summary of changes - Scheduler now takes a `ScheduleContext` object that can be populated with information about other shards - During tenant creation and shard split, we incrementally build up the ScheduleContext, updating it for each shard as we proceed. - When scheduling new locations, the ScheduleContext is used to apply a soft anti-affinity to nodes where a tenant already has shards. - The background reconciler task now has an extra phase `optimize_all`, which runs only if the primary `reconcile_all` phase didn't generate any work. The separation is that `reconcile_all` is needed for availability, but optimize_all is purely "nice to have" work to balance work across the nodes better. - optimize_all calls into two new TenantState methods called optimize_attachment and optimize_secondary, which seek out opportunities to improve placment: - optimize_attachment: if the node where we're currently attached has an excess of attached shard locations for this tenant compared with the node where we have a secondary location, then cut over to the secondary location. - optimize_secondary: if the node holding our secondary location has an excessive number of locations for this tenant compared with some other node where we don't currently have a location, then create a new secondary location on that other node. - a new debug API endpoint is provided to run background tasks on-demand. This returns a number of reconciliations in progress, so callers can keep calling until they get a `0` to advance the system to its final state without waiting for many iterations of the background task. Optimization is run at an implicitly low priority by: - Omitting the phase entirely if reconcile_all has work to do - Skipping optimization of any tenant that has reconciles in flight - Limiting the total number of optimizations that will be run from one call to optimize_all to a constant (currently 2). 
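In code, that gating has roughly the following shape (a condensed sketch of the background loop and `optimize_all` from the diff below, not new behaviour):

    // Background task: availability work first, optimization only when idle.
    let reconciles_spawned = self.reconcile_all();
    if reconciles_spawned == 0 {
        // optimize_all() itself skips tenants with reconciles in flight and
        // applies at most MAX_OPTIMIZATIONS_PER_PASS (currently 2) changes.
        self.optimize_all();
    }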
The idea of that low priority execution is to minimize the operational risk that optimization work overloads any part of the system. It happens to also make the system easier to observe and debug, as we avoid running large numbers of concurrent changes. Eventually we may relax these limitations: there is no correctness problem with optimizing lots of tenants concurrently, and optimizing multiple shards in one tenant just requires housekeeping changes to update ShardContext with the result of one optimization before proceeding to the next shard. --- .../attachment_service/src/metrics.rs | 4 + .../attachment_service/src/reconciler.rs | 1 + .../attachment_service/src/scheduler.rs | 117 ++++- .../attachment_service/src/service.rs | 203 +++++++- .../attachment_service/src/tenant_state.rs | 455 +++++++++++++++++- test_runner/regress/test_sharding.py | 64 ++- 6 files changed, 780 insertions(+), 64 deletions(-) diff --git a/control_plane/attachment_service/src/metrics.rs b/control_plane/attachment_service/src/metrics.rs index ccf5e9b07c..cabf416b9f 100644 --- a/control_plane/attachment_service/src/metrics.rs +++ b/control_plane/attachment_service/src/metrics.rs @@ -37,6 +37,9 @@ pub(crate) struct StorageControllerMetricGroup { pub(crate) storage_controller_reconcile_complete: measured::CounterVec, + /// Count of how many times we make an optimization change to a tenant's scheduling + pub(crate) storage_controller_schedule_optimization: measured::Counter, + /// HTTP request status counters for handled requests pub(crate) storage_controller_http_request_status: measured::CounterVec, @@ -101,6 +104,7 @@ impl StorageControllerMetricGroup { status: StaticLabelSet::new(), }, ), + storage_controller_schedule_optimization: measured::Counter::new(), storage_controller_http_request_status: measured::CounterVec::new( HttpRequestStatusLabelGroupSet { path: lasso::ThreadedRodeo::new(), diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs index a62357f9ac..72eb8faccb 100644 --- a/control_plane/attachment_service/src/reconciler.rs +++ b/control_plane/attachment_service/src/reconciler.rs @@ -487,6 +487,7 @@ impl Reconciler { while let Err(e) = self.compute_notify().await { match e { NotifyError::Fatal(_) => return Err(ReconcileError::Notify(e)), + NotifyError::ShuttingDown => return Err(ReconcileError::Cancel), _ => { tracing::warn!( "Live migration blocked by compute notification error, retrying: {e}" diff --git a/control_plane/attachment_service/src/scheduler.rs b/control_plane/attachment_service/src/scheduler.rs index 981ba26cce..782189d11f 100644 --- a/control_plane/attachment_service/src/scheduler.rs +++ b/control_plane/attachment_service/src/scheduler.rs @@ -58,6 +58,70 @@ pub(crate) struct Scheduler { nodes: HashMap, } +/// Score for soft constraint scheduling: lower scores are preferred to higher scores. +/// +/// For example, we may set an affinity score based on the number of shards from the same +/// tenant already on a node, to implicitly prefer to balance out shards. +#[derive(Copy, Clone, Debug, Eq, PartialEq, PartialOrd, Ord)] +pub(crate) struct AffinityScore(pub(crate) usize); + +impl AffinityScore { + /// If we have no anti-affinity at all toward a node, this is its score. It means + /// the scheduler has a free choice amongst nodes with this score, and may pick a node + /// based on other information such as total utilization. 
+ pub(crate) const FREE: Self = Self(0); + + pub(crate) fn inc(&mut self) { + self.0 += 1; + } +} + +impl std::ops::Add for AffinityScore { + type Output = Self; + + fn add(self, rhs: Self) -> Self::Output { + Self(self.0 + rhs.0) + } +} + +// For carrying state between multiple calls to [`TenantState::schedule`], e.g. when calling +// it for many shards in the same tenant. +#[derive(Debug, Default)] +pub(crate) struct ScheduleContext { + /// Sparse map of nodes: omitting a node implicitly makes its affinity [`AffinityScore::FREE`] + pub(crate) nodes: HashMap, + + /// Specifically how many _attached_ locations are on each node + pub(crate) attached_nodes: HashMap, +} + +impl ScheduleContext { + /// Input is a list of nodes we would like to avoid using again within this context. The more + /// times a node is passed into this call, the less inclined we are to use it. + pub(crate) fn avoid(&mut self, nodes: &[NodeId]) { + for node_id in nodes { + let entry = self.nodes.entry(*node_id).or_insert(AffinityScore::FREE); + entry.inc() + } + } + + pub(crate) fn push_attached(&mut self, node_id: NodeId) { + let entry = self.attached_nodes.entry(node_id).or_default(); + *entry += 1; + } + + pub(crate) fn get_node_affinity(&self, node_id: NodeId) -> AffinityScore { + self.nodes + .get(&node_id) + .copied() + .unwrap_or(AffinityScore::FREE) + } + + pub(crate) fn get_node_attachments(&self, node_id: NodeId) -> usize { + self.attached_nodes.get(&node_id).copied().unwrap_or(0) + } +} + impl Scheduler { pub(crate) fn new<'a>(nodes: impl Iterator) -> Self { let mut scheduler_nodes = HashMap::new(); @@ -224,27 +288,47 @@ impl Scheduler { node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None }) } - pub(crate) fn schedule_shard(&self, hard_exclude: &[NodeId]) -> Result { + /// hard_exclude: it is forbidden to use nodes in this list, typically becacuse they + /// are already in use by this shard -- we use this to avoid picking the same node + /// as both attached and secondary location. This is a hard constraint: if we cannot + /// find any nodes that aren't in this list, then we will return a [`ScheduleError::ImpossibleConstraint`]. + /// + /// context: we prefer to avoid using nodes identified in the context, according + /// to their anti-affinity score. We use this to prefeer to avoid placing shards in + /// the same tenant on the same node. This is a soft constraint: the context will never + /// cause us to fail to schedule a shard. + pub(crate) fn schedule_shard( + &self, + hard_exclude: &[NodeId], + context: &ScheduleContext, + ) -> Result { if self.nodes.is_empty() { return Err(ScheduleError::NoPageservers); } - let mut tenant_counts: Vec<(NodeId, usize)> = self + let mut scores: Vec<(NodeId, AffinityScore, usize)> = self .nodes .iter() .filter_map(|(k, v)| { if hard_exclude.contains(k) || v.may_schedule == MaySchedule::No { None } else { - Some((*k, v.shard_count)) + Some(( + *k, + context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE), + v.shard_count, + )) } }) .collect(); - // Sort by tenant count. Nodes with the same tenant count are sorted by ID. - tenant_counts.sort_by_key(|i| (i.1, i.0)); + // Sort by, in order of precedence: + // 1st: Affinity score. We should never pick a higher-score node if a lower-score node is available + // 2nd: Utilization. Within nodes with the same affinity, use the least loaded nodes. + // 3rd: Node ID. This is a convenience to make selection deterministic in tests and empty systems. 
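+ // For example (hypothetical numbers): given candidates (node 1, AffinityScore(2), 10 shards),
+ // (node 2, AffinityScore(1), 50 shards) and (node 3, AffinityScore(1), 8 shards), the
+ // ascending sort places node 3 first: affinity dominates, and the tie between nodes 2
+ // and 3 is broken by the lower shard count.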
+ scores.sort_by_key(|i| (i.1, i.2, i.0)); - if tenant_counts.is_empty() { + if scores.is_empty() { // After applying constraints, no pageservers were left. We log some detail about // the state of nodes to help understand why this happened. This is not logged as an error because // it is legitimately possible for enough nodes to be Offline to prevent scheduling a shard. @@ -260,10 +344,11 @@ impl Scheduler { return Err(ScheduleError::ImpossibleConstraint); } - let node_id = tenant_counts.first().unwrap().0; + // Lowest score wins + let node_id = scores.first().unwrap().0; tracing::info!( - "scheduler selected node {node_id} (elegible nodes {:?}, exclude: {hard_exclude:?})", - tenant_counts.iter().map(|i| i.0 .0).collect::>() + "scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?})", + scores.iter().map(|i| i.0 .0).collect::>() ); // Note that we do not update shard count here to reflect the scheduling: that @@ -271,6 +356,12 @@ impl Scheduler { Ok(node_id) } + + /// Unit test access to internal state + #[cfg(test)] + pub(crate) fn get_node_shard_count(&self, node_id: NodeId) -> usize { + self.nodes.get(&node_id).unwrap().shard_count + } } #[cfg(test)] @@ -316,15 +407,17 @@ mod tests { let mut t1_intent = IntentState::new(); let mut t2_intent = IntentState::new(); - let scheduled = scheduler.schedule_shard(&[])?; + let context = ScheduleContext::default(); + + let scheduled = scheduler.schedule_shard(&[], &context)?; t1_intent.set_attached(&mut scheduler, Some(scheduled)); - let scheduled = scheduler.schedule_shard(&[])?; + let scheduled = scheduler.schedule_shard(&[], &context)?; t2_intent.set_attached(&mut scheduler, Some(scheduled)); assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1); assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1); - let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers())?; + let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers(), &context)?; t1_intent.push_secondary(&mut scheduler, scheduled); assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1); diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index fe2358abae..7502d9d186 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -11,6 +11,7 @@ use crate::{ id_lock_map::IdLockMap, persistence::{AbortShardSplitStatus, TenantFilter}, reconciler::ReconcileError, + scheduler::ScheduleContext, }; use anyhow::Context; use control_plane::storage_controller::{ @@ -345,9 +346,15 @@ impl Service { } // Populate each tenant's intent state + let mut schedule_context = ScheduleContext::default(); for (tenant_shard_id, tenant_state) in tenants.iter_mut() { + if tenant_shard_id.shard_number == ShardNumber(0) { + // Reset scheduling context each time we advance to the next Tenant + schedule_context = ScheduleContext::default(); + } + tenant_state.intent_from_observed(scheduler); - if let Err(e) = tenant_state.schedule(scheduler) { + if let Err(e) = tenant_state.schedule(scheduler, &mut schedule_context) { // Non-fatal error: we are unable to properly schedule the tenant, perhaps because // not enough pageservers are available. The tenant may well still be available // to clients. @@ -671,7 +678,13 @@ impl Service { let mut interval = tokio::time::interval(BACKGROUND_RECONCILE_PERIOD); while !self.cancel.is_cancelled() { tokio::select! 
{ - _ = interval.tick() => { self.reconcile_all(); } + _ = interval.tick() => { + let reconciles_spawned = self.reconcile_all(); + if reconciles_spawned == 0 { + // Run optimizer only when we didn't find any other work to do + self.optimize_all(); + } + } _ = self.cancel.cancelled() => return } } @@ -1627,6 +1640,8 @@ impl Service { Err(e) => return Err(ApiError::InternalServerError(anyhow::anyhow!(e))), }; + let mut schedule_context = ScheduleContext::default(); + let (waiters, response_shards) = { let mut locked = self.inner.write().unwrap(); let (nodes, tenants, scheduler) = locked.parts_mut(); @@ -1648,11 +1663,14 @@ impl Service { // attached and secondary locations (independently) away frorm those // pageservers also holding a shard for this tenant. - entry.get_mut().schedule(scheduler).map_err(|e| { - ApiError::Conflict(format!( - "Failed to schedule shard {tenant_shard_id}: {e}" - )) - })?; + entry + .get_mut() + .schedule(scheduler, &mut schedule_context) + .map_err(|e| { + ApiError::Conflict(format!( + "Failed to schedule shard {tenant_shard_id}: {e}" + )) + })?; if let Some(node_id) = entry.get().intent.get_attached() { let generation = entry @@ -1680,7 +1698,7 @@ impl Service { state.generation = initial_generation; state.config = create_req.config.clone(); - if let Err(e) = state.schedule(scheduler) { + if let Err(e) = state.schedule(scheduler, &mut schedule_context) { schcedule_error = Some(e); } @@ -1888,6 +1906,7 @@ impl Service { // Persist updates // Ordering: write to the database before applying changes in-memory, so that // we will not appear time-travel backwards on a restart. + let mut schedule_context = ScheduleContext::default(); for ShardUpdate { tenant_shard_id, placement_policy, @@ -1935,7 +1954,7 @@ impl Service { shard.generation = Some(generation); } - shard.schedule(scheduler)?; + shard.schedule(scheduler, &mut schedule_context)?; let maybe_waiter = self.maybe_reconcile_shard(shard, nodes); if let Some(waiter) = maybe_waiter { @@ -2095,7 +2114,7 @@ impl Service { let scheduler = &locked.scheduler; // Right now we only perform the operation on a single node without parallelization // TODO fan out the operation to multiple nodes for better performance - let node_id = scheduler.schedule_shard(&[])?; + let node_id = scheduler.schedule_shard(&[], &ScheduleContext::default())?; let node = locked .nodes .get(&node_id) @@ -2364,6 +2383,7 @@ impl Service { ) .await?; + let mut schedule_context = ScheduleContext::default(); let mut locked = self.inner.write().unwrap(); let (nodes, tenants, scheduler) = locked.parts_mut(); for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { @@ -2382,7 +2402,7 @@ impl Service { } // In case scheduling is being switched back on, try it now. - shard.schedule(scheduler).ok(); + shard.schedule(scheduler, &mut schedule_context).ok(); self.maybe_reconcile_shard(shard, nodes); } @@ -2846,7 +2866,7 @@ impl Service { tracing::info!("Restoring parent shard {tenant_shard_id}"); shard.splitting = SplitState::Idle; - if let Err(e) = shard.schedule(scheduler) { + if let Err(e) = shard.schedule(scheduler, &mut ScheduleContext::default()) { // If this shard can't be scheduled now (perhaps due to offline nodes or // capacity issues), that must not prevent us rolling back a split. In this // case it should be eventually scheduled in the background. 
@@ -2970,6 +2990,7 @@ impl Service { ) }; + let mut schedule_context = ScheduleContext::default(); for child in child_ids { let mut child_shard = parent_ident; child_shard.number = child.shard_number; @@ -3005,7 +3026,7 @@ impl Service { child_locations.push((child, pageserver, child_shard.stripe_size)); - if let Err(e) = child_state.schedule(scheduler) { + if let Err(e) = child_state.schedule(scheduler, &mut schedule_context) { // This is not fatal, because we've implicitly already got an attached // location for the child shard. Failure here just means we couldn't // find a secondary (e.g. because cluster is overloaded). @@ -3869,6 +3890,7 @@ impl Service { AvailabilityTransition::ToOffline => { tracing::info!("Node {} transition to offline", node_id); let mut tenants_affected: usize = 0; + for (tenant_shard_id, tenant_state) in tenants { if let Some(observed_loc) = tenant_state.observed.locations.get_mut(&node_id) { // When a node goes offline, we set its observed configuration to None, indicating unknown: we will @@ -3885,7 +3907,13 @@ impl Service { if tenant_state.intent.demote_attached(node_id) { tenant_state.sequence = tenant_state.sequence.next(); - match tenant_state.schedule(scheduler) { + + // TODO: populate a ScheduleContext including all shards in the same tenant_id (only matters + // for tenants without secondary locations: if they have a secondary location, then this + // schedule() call is just promoting an existing secondary) + let mut schedule_context = ScheduleContext::default(); + + match tenant_state.schedule(scheduler, &mut schedule_context) { Err(e) => { // It is possible that some tenants will become unschedulable when too many pageservers // go offline: in this case there isn't much we can do other than make the issue observable. @@ -3944,8 +3972,9 @@ impl Service { let mut waiters = Vec::new(); let (nodes, tenants, scheduler) = locked.parts_mut(); + let mut schedule_context = ScheduleContext::default(); for (tenant_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { - shard.schedule(scheduler)?; + shard.schedule(scheduler, &mut schedule_context)?; // The shard's policies may not result in an attached location being scheduled: this // is an error because our caller needs it attached somewhere. @@ -4025,8 +4054,144 @@ impl Service { let (nodes, tenants, _scheduler) = locked.parts_mut(); let pageservers = nodes.clone(); + let mut schedule_context = ScheduleContext::default(); + let mut reconciles_spawned = 0; - for (_tenant_shard_id, shard) in tenants.iter_mut() { + for (tenant_shard_id, shard) in tenants.iter_mut() { + if tenant_shard_id.is_zero() { + schedule_context = ScheduleContext::default(); + } + + // Eventual consistency: if an earlier reconcile job failed, and the shard is still + // dirty, spawn another rone + if self.maybe_reconcile_shard(shard, &pageservers).is_some() { + reconciles_spawned += 1; + } + + schedule_context.avoid(&shard.intent.all_pageservers()); + } + + reconciles_spawned + } + + /// `optimize` in this context means identifying shards which have valid scheduled locations, but + /// could be scheduled somewhere better: + /// - Cutting over to a secondary if the node with the secondary is more lightly loaded + /// * e.g. after a node fails then recovers, to move some work back to it + /// - Cutting over to a secondary if it improves the spread of shard attachments within a tenant + /// * e.g. 
after a shard split, the initial attached locations will all be on the node where + /// we did the split, but are probably better placed elsewhere. + /// - Creating new secondary locations if it improves the spreading of a sharded tenant + /// * e.g. after a shard split, some locations will be on the same node (where the split + /// happened), and will probably be better placed elsewhere. + /// + /// To put it more briefly: whereas the scheduler respects soft constraints in a ScheduleContext at + /// the time of scheduling, this function looks for cases where a better-scoring location is available + /// according to those same soft constraints. + fn optimize_all(&self) -> usize { + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + let pageservers = nodes.clone(); + + let mut schedule_context = ScheduleContext::default(); + + let mut reconciles_spawned = 0; + + let mut tenant_shards: Vec<&TenantState> = Vec::new(); + + // Limit on how many shards' optmizations each call to this function will execute. Combined + // with the frequency of background calls, this acts as an implicit rate limit that runs a small + // trickle of optimizations in the background, rather than executing a large number in parallel + // when a change occurs. + const MAX_OPTIMIZATIONS_PER_PASS: usize = 2; + + let mut work = Vec::new(); + + for (tenant_shard_id, shard) in tenants.iter() { + if tenant_shard_id.is_zero() { + // Reset accumulators on the first shard in a tenant + schedule_context = ScheduleContext::default(); + tenant_shards.clear(); + } + + if work.len() >= MAX_OPTIMIZATIONS_PER_PASS { + break; + } + + match shard.get_scheduling_policy() { + ShardSchedulingPolicy::Active => { + // Ok to do optimization + } + ShardSchedulingPolicy::Essential + | ShardSchedulingPolicy::Pause + | ShardSchedulingPolicy::Stop => { + // Policy prevents optimizing this shard. + continue; + } + } + + // Accumulate the schedule context for all the shards in a tenant: we must have + // the total view of all shards before we can try to optimize any of them. + schedule_context.avoid(&shard.intent.all_pageservers()); + if let Some(attached) = shard.intent.get_attached() { + schedule_context.push_attached(*attached); + } + tenant_shards.push(shard); + + // Once we have seen the last shard in the tenant, proceed to search across all shards + // in the tenant for optimizations + if shard.shard.number.0 == shard.shard.count.count() - 1 { + if tenant_shards.iter().any(|s| s.reconciler.is_some()) { + // Do not start any optimizations while another change to the tenant is ongoing: this + // is not necessary for correctness, but simplifies operations and implicitly throttles + // optimization changes to happen in a "trickle" over time. + continue; + } + + if tenant_shards.iter().any(|s| { + !matches!(s.splitting, SplitState::Idle) + || matches!(s.policy, PlacementPolicy::Detached) + }) { + // Never attempt to optimize a tenant that is currently being split, or + // a tenant that is meant to be detached + continue; + } + + // TODO: optimization calculations are relatively expensive: create some fast-path for + // the common idle case (avoiding the search on tenants that we have recently checked) + + for shard in &tenant_shards { + if let Some(optimization) = + // If idle, maybe ptimize attachments: if a shard has a secondary location that is preferable to + // its primary location based on soft constraints, cut it over. 
+ shard.optimize_attachment(nodes, &schedule_context) + { + work.push((shard.tenant_shard_id, optimization)); + break; + } else if let Some(optimization) = + // If idle, maybe optimize secondary locations: if a shard has a secondary location that would be + // better placed on another node, based on ScheduleContext, then adjust it. This + // covers cases like after a shard split, where we might have too many shards + // in the same tenant with secondary locations on the node where they originally split. + shard.optimize_secondary(scheduler, &schedule_context) + { + work.push((shard.tenant_shard_id, optimization)); + break; + } + + // TODO: extend this mechanism to prefer attaching on nodes with fewer attached + // tenants (i.e. extend schedule state to distinguish attached from secondary counts), + // for the total number of attachments on a node (not just within a tenant.) + } + } + } + + for (tenant_shard_id, optimization) in work { + let shard = tenants + .get_mut(&tenant_shard_id) + .expect("We held lock from place we got this ID"); + shard.apply_optimization(scheduler, optimization); + if self.maybe_reconcile_shard(shard, &pageservers).is_some() { reconciles_spawned += 1; } @@ -4039,7 +4204,11 @@ impl Service { /// also wait for any generated Reconcilers to complete. Calling this until it returns zero should /// put the system into a quiescent state where future background reconciliations won't do anything. pub(crate) async fn reconcile_all_now(&self) -> Result { - self.reconcile_all(); + let reconciles_spawned = self.reconcile_all(); + if reconciles_spawned == 0 { + // Only optimize when we are otherwise idle + self.optimize_all(); + } let waiters = { let mut waiters = Vec::new(); diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs index 3dc3483e09..6717b8e178 100644 --- a/control_plane/attachment_service/src/tenant_state.rs +++ b/control_plane/attachment_service/src/tenant_state.rs @@ -7,6 +7,7 @@ use std::{ use crate::{ metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome}, persistence::TenantShardPersistence, + scheduler::{AffinityScore, MaySchedule, ScheduleContext}, }; use pageserver_api::controller_api::{PlacementPolicy, ShardSchedulingPolicy}; use pageserver_api::{ @@ -250,8 +251,13 @@ impl IntentState { impl Drop for IntentState { fn drop(&mut self) { - // Must clear before dropping, to avoid leaving stale refcounts in the Scheduler - debug_assert!(self.attached.is_none() && self.secondary.is_empty()); + // Must clear before dropping, to avoid leaving stale refcounts in the Scheduler. + // We do not check this while panicking, to avoid polluting unit test failures or + // other assertions with this assertion's output. It's still wrong to leak these, + // but if we already have a panic then we don't need to independently flag this case. 
+ if !(std::thread::panicking()) { + debug_assert!(self.attached.is_none() && self.secondary.is_empty()); + } } } @@ -296,6 +302,26 @@ pub enum ReconcileWaitError { Failed(TenantShardId, String), } +#[derive(Eq, PartialEq, Debug)] +pub(crate) struct ReplaceSecondary { + old_node_id: NodeId, + new_node_id: NodeId, +} + +#[derive(Eq, PartialEq, Debug)] +pub(crate) struct MigrateAttachment { + old_attached_node_id: NodeId, + new_attached_node_id: NodeId, +} + +#[derive(Eq, PartialEq, Debug)] +pub(crate) enum ScheduleOptimization { + // Replace one of our secondary locations with a different node + ReplaceSecondary(ReplaceSecondary), + // Migrate attachment to an existing secondary location + MigrateAttachment(MigrateAttachment), +} + impl ReconcilerWaiter { pub(crate) async fn wait_timeout(&self, timeout: Duration) -> Result<(), ReconcileWaitError> { tokio::select! { @@ -430,6 +456,7 @@ impl TenantState { fn schedule_attached( &mut self, scheduler: &mut Scheduler, + context: &ScheduleContext, ) -> Result<(bool, NodeId), ScheduleError> { // No work to do if we already have an attached tenant if let Some(node_id) = self.intent.attached { @@ -443,14 +470,33 @@ impl TenantState { Ok((true, promote_secondary)) } else { // Pick a fresh node: either we had no secondaries or none were schedulable - let node_id = scheduler.schedule_shard(&self.intent.secondary)?; + let node_id = scheduler.schedule_shard(&self.intent.secondary, context)?; tracing::debug!("Selected {} as attached", node_id); self.intent.set_attached(scheduler, Some(node_id)); Ok((true, node_id)) } } - pub(crate) fn schedule(&mut self, scheduler: &mut Scheduler) -> Result<(), ScheduleError> { + pub(crate) fn schedule( + &mut self, + scheduler: &mut Scheduler, + context: &mut ScheduleContext, + ) -> Result<(), ScheduleError> { + let r = self.do_schedule(scheduler, context); + + context.avoid(&self.intent.all_pageservers()); + if let Some(attached) = self.intent.get_attached() { + context.push_attached(*attached); + } + + r + } + + pub(crate) fn do_schedule( + &mut self, + scheduler: &mut Scheduler, + context: &ScheduleContext, + ) -> Result<(), ScheduleError> { // TODO: before scheduling new nodes, check if any existing content in // self.intent refers to pageservers that are offline, and pick other // pageservers if so. @@ -494,12 +540,13 @@ impl TenantState { } // Should have exactly one attached, and N secondaries - let (modified_attached, attached_node_id) = self.schedule_attached(scheduler)?; + let (modified_attached, attached_node_id) = + self.schedule_attached(scheduler, context)?; modified |= modified_attached; let mut used_pageservers = vec![attached_node_id]; while self.intent.secondary.len() < secondary_count { - let node_id = scheduler.schedule_shard(&used_pageservers)?; + let node_id = scheduler.schedule_shard(&used_pageservers, context)?; self.intent.push_secondary(scheduler, node_id); used_pageservers.push(node_id); modified = true; @@ -512,7 +559,7 @@ impl TenantState { modified = true; } else if self.intent.secondary.is_empty() { // Populate secondary by scheduling a fresh node - let node_id = scheduler.schedule_shard(&[])?; + let node_id = scheduler.schedule_shard(&[], context)?; self.intent.push_secondary(scheduler, node_id); modified = true; } @@ -539,6 +586,167 @@ impl TenantState { Ok(()) } + /// Optimize attachments: if a shard has a secondary location that is preferable to + /// its primary location based on soft constraints, switch that secondary location + /// to be attached. 
+ #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] + pub(crate) fn optimize_attachment( + &self, + nodes: &HashMap, + schedule_context: &ScheduleContext, + ) -> Option { + let attached = (*self.intent.get_attached())?; + if self.intent.secondary.is_empty() { + // We can only do useful work if we have both attached and secondary locations: this + // function doesn't schedule new locations, only swaps between attached and secondaries. + return None; + } + + let current_affinity_score = schedule_context.get_node_affinity(attached); + let current_attachment_count = schedule_context.get_node_attachments(attached); + + // Generate score for each node, dropping any un-schedulable nodes. + let all_pageservers = self.intent.all_pageservers(); + let mut scores = all_pageservers + .iter() + .flat_map(|node_id| { + if matches!( + nodes + .get(node_id) + .map(|n| n.may_schedule()) + .unwrap_or(MaySchedule::No), + MaySchedule::No + ) { + None + } else { + let affinity_score = schedule_context.get_node_affinity(*node_id); + let attachment_count = schedule_context.get_node_attachments(*node_id); + Some((*node_id, affinity_score, attachment_count)) + } + }) + .collect::>(); + + // Sort precedence: + // 1st - prefer nodes with the lowest total affinity score + // 2nd - prefer nodes with the lowest number of attachments in this context + // 3rd - if all else is equal, sort by node ID for determinism in tests. + scores.sort_by_key(|i| (i.1, i.2, i.0)); + + if let Some((preferred_node, preferred_affinity_score, preferred_attachment_count)) = + scores.first() + { + if attached != *preferred_node { + // The best alternative must be more than 1 better than us, otherwise we could end + // up flapping back next time we're called (e.g. there's no point migrating from + // a location with score 1 to a score zero, because on next location the situation + // would be the same, but in reverse). + if current_affinity_score > *preferred_affinity_score + AffinityScore(1) + || current_attachment_count > *preferred_attachment_count + 1 + { + tracing::info!( + "Identified optimization: migrate attachment {attached}->{preferred_node} (secondaries {:?})", + self.intent.get_secondary() + ); + return Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment { + old_attached_node_id: attached, + new_attached_node_id: *preferred_node, + })); + } + } else { + tracing::debug!( + "Node {} is already preferred (score {:?})", + preferred_node, + preferred_affinity_score + ); + } + } + + // Fall-through: we didn't find an optimization + None + } + + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] + pub(crate) fn optimize_secondary( + &self, + scheduler: &Scheduler, + schedule_context: &ScheduleContext, + ) -> Option { + if self.intent.secondary.is_empty() { + // We can only do useful work if we have both attached and secondary locations: this + // function doesn't schedule new locations, only swaps between attached and secondaries. + return None; + } + + for secondary in self.intent.get_secondary() { + let Some(affinity_score) = schedule_context.nodes.get(secondary) else { + // We're already on a node unaffected any affinity constraints, + // so we won't change it. + continue; + }; + + // Let the scheduler suggest a node, where it would put us if we were scheduling afresh + // This implicitly limits the choice to nodes that are available, and prefers nodes + // with lower utilization. 
+ let Ok(candidate_node) = + scheduler.schedule_shard(&self.intent.all_pageservers(), schedule_context) + else { + // A scheduling error means we have no possible candidate replacements + continue; + }; + + let candidate_affinity_score = schedule_context + .nodes + .get(&candidate_node) + .unwrap_or(&AffinityScore::FREE); + + // The best alternative must be more than 1 better than us, otherwise we could end + // up flapping back next time we're called. + if *candidate_affinity_score + AffinityScore(1) < *affinity_score { + // If some other node is available and has a lower score than this node, then + // that other node is a good place to migrate to. + tracing::info!( + "Identified optimization: replace secondary {secondary}->{candidate_node} (current secondaries {:?})", + self.intent.get_secondary() + ); + return Some(ScheduleOptimization::ReplaceSecondary(ReplaceSecondary { + old_node_id: *secondary, + new_node_id: candidate_node, + })); + } + } + + None + } + + pub(crate) fn apply_optimization( + &mut self, + scheduler: &mut Scheduler, + optimization: ScheduleOptimization, + ) { + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_schedule_optimization + .inc(); + + match optimization { + ScheduleOptimization::MigrateAttachment(MigrateAttachment { + old_attached_node_id, + new_attached_node_id, + }) => { + self.intent.demote_attached(old_attached_node_id); + self.intent + .promote_attached(scheduler, new_attached_node_id); + } + ScheduleOptimization::ReplaceSecondary(ReplaceSecondary { + old_node_id, + new_node_id, + }) => { + self.intent.remove_secondary(scheduler, old_node_id); + self.intent.push_secondary(scheduler, new_node_id); + } + } + } + /// Query whether the tenant's observed state for attached node matches its intent state, and if so, /// yield the node ID. This is appropriate for emitting compute hook notifications: we are checking that /// the node in question is not only where we intend to attach, but that the tenant is indeed already attached there. @@ -877,6 +1085,10 @@ impl TenantState { self.scheduling_policy = p; } + pub(crate) fn get_scheduling_policy(&self) -> &ShardSchedulingPolicy { + &self.scheduling_policy + } + pub(crate) fn from_persistent( tsp: TenantShardPersistence, intent: IntentState, @@ -953,6 +1165,32 @@ pub(crate) mod tests { ) } + fn make_test_tenant(policy: PlacementPolicy, shard_count: ShardCount) -> Vec { + let tenant_id = TenantId::generate(); + + (0..shard_count.count()) + .map(|i| { + let shard_number = ShardNumber(i); + + let tenant_shard_id = TenantShardId { + tenant_id, + shard_number, + shard_count, + }; + TenantState::new( + tenant_shard_id, + ShardIdentity::new( + shard_number, + shard_count, + pageserver_api::shard::ShardStripeSize(32768), + ) + .unwrap(), + policy.clone(), + ) + }) + .collect() + } + /// Test the scheduling behaviors used when a tenant configured for HA is subject /// to nodes being marked offline. 
#[test] @@ -962,10 +1200,11 @@ pub(crate) mod tests { let mut nodes = make_test_nodes(3); let mut scheduler = Scheduler::new(nodes.values()); + let mut context = ScheduleContext::default(); let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1)); tenant_state - .schedule(&mut scheduler) + .schedule(&mut scheduler, &mut context) .expect("we have enough nodes, scheduling should work"); // Expect to initially be schedule on to different nodes @@ -991,7 +1230,7 @@ pub(crate) mod tests { // Scheduling the node should promote the still-available secondary node to attached tenant_state - .schedule(&mut scheduler) + .schedule(&mut scheduler, &mut context) .expect("active nodes are available"); assert_eq!(tenant_state.intent.attached.unwrap(), secondary_node_id); @@ -1065,15 +1304,209 @@ pub(crate) mod tests { // In pause mode, schedule() shouldn't do anything tenant_state.scheduling_policy = ShardSchedulingPolicy::Pause; - assert!(tenant_state.schedule(&mut scheduler).is_ok()); + assert!(tenant_state + .schedule(&mut scheduler, &mut ScheduleContext::default()) + .is_ok()); assert!(tenant_state.intent.all_pageservers().is_empty()); // In active mode, schedule() works tenant_state.scheduling_policy = ShardSchedulingPolicy::Active; - assert!(tenant_state.schedule(&mut scheduler).is_ok()); + assert!(tenant_state + .schedule(&mut scheduler, &mut ScheduleContext::default()) + .is_ok()); assert!(!tenant_state.intent.all_pageservers().is_empty()); tenant_state.intent.clear(&mut scheduler); Ok(()) } + + #[test] + fn optimize_attachment() -> anyhow::Result<()> { + let nodes = make_test_nodes(3); + let mut scheduler = Scheduler::new(nodes.values()); + + let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1)); + let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1)); + + // Initially: both nodes attached on shard 1, and both have secondary locations + // on different nodes. + shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1))); + shard_a.intent.push_secondary(&mut scheduler, NodeId(2)); + shard_b.intent.set_attached(&mut scheduler, Some(NodeId(1))); + shard_b.intent.push_secondary(&mut scheduler, NodeId(3)); + + let mut schedule_context = ScheduleContext::default(); + schedule_context.avoid(&shard_a.intent.all_pageservers()); + schedule_context.push_attached(shard_a.intent.get_attached().unwrap()); + schedule_context.avoid(&shard_b.intent.all_pageservers()); + schedule_context.push_attached(shard_b.intent.get_attached().unwrap()); + + let optimization_a = shard_a.optimize_attachment(&nodes, &schedule_context); + + // Either shard should recognize that it has the option to switch to a secondary location where there + // would be no other shards from the same tenant, and request to do so. + assert_eq!( + optimization_a, + Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment { + old_attached_node_id: NodeId(1), + new_attached_node_id: NodeId(2) + })) + ); + + // Note that these optimizing two shards in the same tenant with the same ScheduleContext is + // mutually exclusive (the optimization of one invalidates the stats) -- it is the responsibility + // of [`Service::optimize_all`] to avoid trying + // to do optimizations for multiple shards in the same tenant at the same time. 
Generating + // both optimizations is just done for test purposes + let optimization_b = shard_b.optimize_attachment(&nodes, &schedule_context); + assert_eq!( + optimization_b, + Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment { + old_attached_node_id: NodeId(1), + new_attached_node_id: NodeId(3) + })) + ); + + // Applying these optimizations should result in the end state proposed + shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap()); + assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(2))); + assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(1)]); + shard_b.apply_optimization(&mut scheduler, optimization_b.unwrap()); + assert_eq!(shard_b.intent.get_attached(), &Some(NodeId(3))); + assert_eq!(shard_b.intent.get_secondary(), &vec![NodeId(1)]); + + shard_a.intent.clear(&mut scheduler); + shard_b.intent.clear(&mut scheduler); + + Ok(()) + } + + #[test] + fn optimize_secondary() -> anyhow::Result<()> { + let nodes = make_test_nodes(4); + let mut scheduler = Scheduler::new(nodes.values()); + + let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1)); + let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1)); + + // Initially: both nodes attached on shard 1, and both have secondary locations + // on different nodes. + shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1))); + shard_a.intent.push_secondary(&mut scheduler, NodeId(3)); + shard_b.intent.set_attached(&mut scheduler, Some(NodeId(2))); + shard_b.intent.push_secondary(&mut scheduler, NodeId(3)); + + let mut schedule_context = ScheduleContext::default(); + schedule_context.avoid(&shard_a.intent.all_pageservers()); + schedule_context.push_attached(shard_a.intent.get_attached().unwrap()); + schedule_context.avoid(&shard_b.intent.all_pageservers()); + schedule_context.push_attached(shard_b.intent.get_attached().unwrap()); + + let optimization_a = shard_a.optimize_secondary(&scheduler, &schedule_context); + + // Since there is a node with no locations available, the node with two locations for the + // same tenant should generate an optimization to move one away + assert_eq!( + optimization_a, + Some(ScheduleOptimization::ReplaceSecondary(ReplaceSecondary { + old_node_id: NodeId(3), + new_node_id: NodeId(4) + })) + ); + + shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap()); + assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(1))); + assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(4)]); + + shard_a.intent.clear(&mut scheduler); + shard_b.intent.clear(&mut scheduler); + + Ok(()) + } + + // Optimize til quiescent: this emulates what Service::optimize_all does, when + // called repeatedly in the background. 
+ fn optimize_til_idle( + nodes: &HashMap, + scheduler: &mut Scheduler, + shards: &mut [TenantState], + ) { + let mut loop_n = 0; + loop { + let mut schedule_context = ScheduleContext::default(); + let mut any_changed = false; + + for shard in shards.iter() { + schedule_context.avoid(&shard.intent.all_pageservers()); + if let Some(attached) = shard.intent.get_attached() { + schedule_context.push_attached(*attached); + } + } + + for shard in shards.iter_mut() { + let optimization = shard.optimize_attachment(nodes, &schedule_context); + if let Some(optimization) = optimization { + shard.apply_optimization(scheduler, optimization); + any_changed = true; + break; + } + + let optimization = shard.optimize_secondary(scheduler, &schedule_context); + if let Some(optimization) = optimization { + shard.apply_optimization(scheduler, optimization); + any_changed = true; + break; + } + } + + if !any_changed { + break; + } + + // Assert no infinite loop + loop_n += 1; + assert!(loop_n < 1000); + } + } + + /// Test the balancing behavior of shard scheduling: that it achieves a balance, and + /// that it converges. + #[test] + fn optimize_add_nodes() -> anyhow::Result<()> { + let nodes = make_test_nodes(4); + + // Only show the scheduler a couple of nodes + let mut scheduler = Scheduler::new([].iter()); + scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap()); + scheduler.node_upsert(nodes.get(&NodeId(2)).unwrap()); + + let mut shards = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4)); + let mut schedule_context = ScheduleContext::default(); + for shard in &mut shards { + assert!(shard + .schedule(&mut scheduler, &mut schedule_context) + .is_ok()); + } + + // We should see equal number of locations on the two nodes. + assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 4); + assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 4); + + // Add another two nodes: we should see the shards spread out when their optimize + // methods are called + scheduler.node_upsert(nodes.get(&NodeId(3)).unwrap()); + scheduler.node_upsert(nodes.get(&NodeId(4)).unwrap()); + optimize_til_idle(&nodes, &mut scheduler, &mut shards); + + assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 2); + assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2); + assert_eq!(scheduler.get_node_shard_count(NodeId(3)), 2); + assert_eq!(scheduler.get_node_shard_count(NodeId(4)), 2); + + for shard in shards.iter_mut() { + shard.intent.clear(&mut scheduler); + } + + Ok(()) + } } diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 9aebf16c68..2699654f80 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -146,7 +146,7 @@ def test_sharding_split_smoke( # 8 shards onto separate pageservers shard_count = 4 split_shard_count = 8 - neon_env_builder.num_pageservers = split_shard_count + neon_env_builder.num_pageservers = split_shard_count * 2 # 1MiB stripes: enable getting some meaningful data distribution without # writing large quantities of data in this test. The stripe size is given @@ -174,6 +174,7 @@ def test_sharding_split_smoke( placement_policy='{"Attached": 1}', conf=non_default_tenant_config, ) + workload = Workload(env, tenant_id, timeline_id, branch_name="main") workload.init() @@ -252,6 +253,10 @@ def test_sharding_split_smoke( # The old parent shards should no longer exist on disk assert not shards_on_disk(old_shard_ids) + # Enough background reconciliations should result in the shards being properly distributed. 
+ # Run this before the workload, because its LSN-waiting code presumes stable locations. + env.storage_controller.reconcile_until_idle() + workload.validate() workload.churn_rows(256) @@ -265,27 +270,6 @@ def test_sharding_split_smoke( pageserver.http_client().timeline_gc(tenant_shard_id, timeline_id, None) workload.validate() - migrate_to_pageserver_ids = list( - set(p.id for p in env.pageservers) - set(pre_split_pageserver_ids) - ) - assert len(migrate_to_pageserver_ids) == split_shard_count - shard_count - - # Migrate shards away from the node where the split happened - for ps_id in pre_split_pageserver_ids: - shards_here = [ - tenant_shard_id - for (tenant_shard_id, pageserver) in all_shards - if pageserver.id == ps_id - ] - assert len(shards_here) == 2 - migrate_shard = shards_here[0] - destination = migrate_to_pageserver_ids.pop() - - log.info(f"Migrating shard {migrate_shard} from {ps_id} to {destination}") - env.storage_controller.tenant_shard_migrate(migrate_shard, destination) - - workload.validate() - # Assert on how many reconciles happened during the process. This is something of an # implementation detail, but it is useful to detect any bugs that might generate spurious # extra reconcile iterations. @@ -294,8 +278,9 @@ def test_sharding_split_smoke( # - shard_count reconciles for the original setup of the tenant # - shard_count reconciles for detaching the original secondary locations during split # - split_shard_count reconciles during shard splitting, for setting up secondaries. - # - shard_count reconciles for the migrations we did to move child shards away from their split location - expect_reconciles = shard_count * 2 + split_shard_count + shard_count + # - shard_count of the child shards will need to fail over to their secondaries + # - shard_count of the child shard secondary locations will get moved to emptier nodes + expect_reconciles = shard_count * 2 + split_shard_count + shard_count * 2 reconcile_ok = env.storage_controller.get_metric_value( "storage_controller_reconcile_complete_total", filter={"status": "ok"} ) @@ -343,6 +328,31 @@ def test_sharding_split_smoke( assert sum(total.values()) == split_shard_count * 2 check_effective_tenant_config() + # More specific check: that we are fully balanced. This is deterministic because + # the order in which we consider shards for optimization is deterministic, and the + # order of preference of nodes is also deterministic (lower node IDs win). + log.info(f"total: {total}") + assert total == { + 1: 1, + 2: 1, + 3: 1, + 4: 1, + 5: 1, + 6: 1, + 7: 1, + 8: 1, + 9: 1, + 10: 1, + 11: 1, + 12: 1, + 13: 1, + 14: 1, + 15: 1, + 16: 1, + } + log.info(f"attached: {attached}") + assert attached == {1: 1, 2: 1, 3: 1, 5: 1, 6: 1, 7: 1, 9: 1, 11: 1} + # Ensure post-split pageserver locations survive a restart (i.e. 
the child shards # correctly wrote config to disk, and the storage controller responds correctly # to /re-attach) @@ -401,6 +411,7 @@ def test_sharding_split_stripe_size( env.storage_controller.tenant_shard_split( tenant_id, shard_count=2, shard_stripe_size=new_stripe_size ) + env.storage_controller.reconcile_until_idle() # Check that we ended up with the stripe size that we expected, both on the pageserver # and in the notifications to compute @@ -869,6 +880,7 @@ def test_sharding_split_failures( # Having failed+rolled back, we should be able to split again # No failures this time; it will succeed env.storage_controller.tenant_shard_split(tenant_id, shard_count=split_shard_count) + env.storage_controller.reconcile_until_idle(timeout_secs=30) workload.churn_rows(10) workload.validate() @@ -922,6 +934,10 @@ def test_sharding_split_failures( finish_split() assert_split_done() + # Having completed the split, pump the background reconciles to ensure that + # the scheduler reaches an idle state + env.storage_controller.reconcile_until_idle(timeout_secs=30) + env.storage_controller.consistency_check() From 7ddc7b4990a31a39886e3ecaa9c0d79f4e20e6df Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Fri, 29 Mar 2024 12:11:17 -0400 Subject: [PATCH 0497/1571] neonvm: add LFC approximate working set size to metrics (#7252) ref https://github.com/neondatabase/autoscaling/pull/878 ref https://github.com/neondatabase/autoscaling/issues/872 Add `approximate_working_set_size` to sql exporter so that autoscaling can use it in the future. --------- Signed-off-by: Alex Chi Z Co-authored-by: Peter Bendel --- vm-image-spec.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 5b93088303..c760744491 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -187,6 +187,14 @@ files: query: | select sum(pg_database_size(datname)) as total from pg_database; + - metric_name: lfc_approximate_working_set_size + type: gauge + help: 'Approximate working set size in pages of 8192 bytes' + key_labels: + values: [approximate_working_set_size] + query: | + select neon.approximate_working_set_size(false) as approximate_working_set_size; + build: | # Build cgroup-tools # From 3ab9f56f5fbbfae0626e8a5a8e41b1ca6e73e204 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Fri, 29 Mar 2024 13:59:30 -0400 Subject: [PATCH 0498/1571] fixup(#7278/compute_ctl): remote extension download permission (#7280) Fix #7278 ## Summary of changes * Explicitly create the extension download directory and assign correct permissoins. * Fix the problem that the extension download failure will cause all future downloads to fail. 
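The second bullet is the subtle one: the old code wrote the progress entry with `true` (finished) regardless of the outcome, which is what let one failure poison later attempts. Schematically, the corrected flow looks like this (simplified from the diff below; the helper name stands in for the inline download call, and the locking around the map is omitted):

    // Only remember the archive as downloaded if the download succeeded;
    // on failure, leave no record so that a later request retries it.
    let download_size = do_extension_download(&ext_archive_name).await;
    if download_size.is_ok() {
        ext_download_progress.insert(ext_archive_name.to_string(), (download_start, true));
    }
    download_size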
Signed-off-by: Alex Chi Z --- Dockerfile.compute-node | 3 +++ compute_tools/src/compute.rs | 10 ++++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index c73b9ce5c9..bd4534ce1d 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -944,6 +944,9 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl +# Create remote extension download directory +RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions + # Install: # libreadline8 for psql # libicu67, locales for collations (including ICU and plpgsql_check) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 0fa315682d..88dc4aca2b 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -1262,10 +1262,12 @@ LIMIT 100", .await .map_err(DownloadError::Other); - self.ext_download_progress - .write() - .expect("bad lock") - .insert(ext_archive_name.to_string(), (download_start, true)); + if download_size.is_ok() { + self.ext_download_progress + .write() + .expect("bad lock") + .insert(ext_archive_name.to_string(), (download_start, true)); + } download_size } From 8ee54ffd3020fba9c5027345018a19d727214842 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 2 Apr 2024 10:12:54 +0100 Subject: [PATCH 0499/1571] update tokio 1.37 (#7276) ## Problem ## Summary of changes `cargo update -p tokio`. The only risky change I could see is the `tokio::io::split` moving from a spin-lock to a mutex but I think that's ok. 
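For context, `tokio::io::split` is the helper that turns a single bidirectional stream into independently usable read and write halves; the two halves coordinate access to the shared stream through an internal lock, which is the code path the note above refers to. A minimal, illustrative usage of the affected pattern (not code from this repository):

    use tokio::io::{split, AsyncReadExt, AsyncWriteExt};
    use tokio::net::TcpStream;

    async fn ping(addr: &str) -> std::io::Result<()> {
        let stream = TcpStream::connect(addr).await?;
        // The halves may be driven from different tasks; synchronizing their
        // access to the shared stream is internal to tokio.
        let (mut rd, mut wr) = split(stream);

        let writer = tokio::spawn(async move { wr.write_all(b"ping").await });

        let mut buf = [0u8; 4];
        rd.read_exact(&mut buf).await?;
        writer.await.expect("writer task panicked")?;
        Ok(())
    }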
--- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c1c245fa9c..7200fb7968 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5934,9 +5934,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.36.0" +version = "1.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931" +checksum = "1adbebffeca75fcfd058afa480fb6c0b81e165a0323f9c9d39c9697e37c46787" dependencies = [ "backtrace", "bytes", From 3b95e8072ab4a46c619d2eb0e320ce91869e9737 Mon Sep 17 00:00:00 2001 From: macdoos <127897805+macdoos@users.noreply.github.com> Date: Tue, 2 Apr 2024 15:32:14 +0200 Subject: [PATCH 0500/1571] test_runner: replace all `.format()` with f-strings (#7194) --- pyproject.toml | 1 + scripts/export_import_between_pageservers.py | 24 +++++------- test_runner/fixtures/benchmark_fixture.py | 12 +++--- test_runner/fixtures/neon_fixtures.py | 10 ++--- test_runner/fixtures/pageserver/utils.py | 6 +-- .../pagebench/test_large_slru_basebackup.py | 24 ++++++------ .../performance/test_branch_creation.py | 2 +- test_runner/regress/test_branching.py | 6 +-- test_runner/regress/test_large_schema.py | 4 +- test_runner/regress/test_layer_bloating.py | 4 +- .../regress/test_pageserver_generations.py | 1 - test_runner/regress/test_read_validation.py | 38 +++++++------------ test_runner/regress/test_wal_acceptor.py | 12 +++--- .../regress/test_wal_acceptor_async.py | 10 ++--- 14 files changed, 65 insertions(+), 89 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e347d47cbf..156f135062 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -94,4 +94,5 @@ select = [ "I", # isort "W", # pycodestyle "B", # bugbear + "UP032", # f-string ] diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py index 980f343047..84b69cb36a 100755 --- a/scripts/export_import_between_pageservers.py +++ b/scripts/export_import_between_pageservers.py @@ -64,14 +64,14 @@ def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str: Returns basepath for files with captured output. 
""" assert isinstance(cmd, list) - base = os.path.basename(cmd[0]) + "_{}".format(global_counter()) + base = f"{os.path.basename(cmd[0])}_{global_counter()}" basepath = os.path.join(capture_dir, base) stdout_filename = basepath + ".stdout" stderr_filename = basepath + ".stderr" with open(stdout_filename, "w") as stdout_f: with open(stderr_filename, "w") as stderr_f: - print('(capturing output to "{}.stdout")'.format(base)) + print(f'(capturing output to "{base}.stdout")') subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f) return basepath @@ -82,11 +82,9 @@ class PgBin: def __init__(self, log_dir: Path, pg_distrib_dir, pg_version): self.log_dir = log_dir - self.pg_bin_path = os.path.join(str(pg_distrib_dir), "v{}".format(pg_version), "bin") + self.pg_bin_path = os.path.join(str(pg_distrib_dir), f"v{pg_version}", "bin") self.env = os.environ.copy() - self.env["LD_LIBRARY_PATH"] = os.path.join( - str(pg_distrib_dir), "v{}".format(pg_version), "lib" - ) + self.env["LD_LIBRARY_PATH"] = os.path.join(str(pg_distrib_dir), f"v{pg_version}", "lib") def _fixpath(self, command: List[str]): if "/" not in command[0]: @@ -110,7 +108,7 @@ class PgBin: """ self._fixpath(command) - print('Running command "{}"'.format(" ".join(command))) + print(f'Running command "{" ".join(command)}"') env = self._build_env(env) subprocess.run(command, env=env, cwd=cwd, check=True) @@ -128,7 +126,7 @@ class PgBin: """ self._fixpath(command) - print('Running command "{}"'.format(" ".join(command))) + print(f'Running command "{" ".join(command)}"') env = self._build_env(env) return subprocess_capture( str(self.log_dir), command, env=env, cwd=cwd, check=True, **kwargs @@ -300,7 +298,7 @@ class NeonPageserverHttpClient(requests.Session): def lsn_to_hex(num: int) -> str: """Convert lsn from int to standard hex notation.""" - return "{:X}/{:X}".format(num >> 32, num & 0xFFFFFFFF) + return f"{num >> 32:X}/{num & 0xFFFFFFFF:X}" def lsn_from_hex(lsn_hex: str) -> int: @@ -331,16 +329,12 @@ def wait_for_upload( if current_lsn >= lsn: return print( - "waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format( - lsn_to_hex(lsn), lsn_to_hex(current_lsn), i + 1 - ) + f"waiting for remote_consistent_lsn to reach {lsn_to_hex(lsn)}, now {lsn_to_hex(current_lsn)}, iteration {i + 1}" ) time.sleep(1) raise Exception( - "timed out while waiting for remote_consistent_lsn to reach {}, was {}".format( - lsn_to_hex(lsn), lsn_to_hex(current_lsn) - ) + f"timed out while waiting for remote_consistent_lsn to reach {lsn_to_hex(lsn)}, was {lsn_to_hex(current_lsn)}" ) diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index e7959c1764..c32748f6f0 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -482,20 +482,18 @@ def pytest_terminal_summary( terminalreporter.section("Benchmark results", "-") is_header_printed = True - terminalreporter.write( - "{}.{}: ".format(test_report.head_line, recorded_property["name"]) - ) + terminalreporter.write(f"{test_report.head_line}.{recorded_property['name']}: ") unit = recorded_property["unit"] value = recorded_property["value"] if unit == "MB": - terminalreporter.write("{0:,.0f}".format(value), green=True) + terminalreporter.write(f"{value:,.0f}", green=True) elif unit in ("s", "ms") and isinstance(value, float): - terminalreporter.write("{0:,.3f}".format(value), green=True) + terminalreporter.write(f"{value:,.3f}", green=True) elif isinstance(value, float): - 
terminalreporter.write("{0:,.4f}".format(value), green=True) + terminalreporter.write(f"{value:,.4f}", green=True) else: terminalreporter.write(str(value), green=True) - terminalreporter.line(" {}".format(unit)) + terminalreporter.line(f" {unit}") result_entry.append(recorded_property) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index d0519d3406..67560a1017 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3605,7 +3605,7 @@ class Safekeeper: return self def stop(self, immediate: bool = False) -> "Safekeeper": - log.info("Stopping safekeeper {}".format(self.id)) + log.info(f"Stopping safekeeper {self.id}") self.env.neon_cli.safekeeper_stop(self.id, immediate) self.running = False return self @@ -4037,13 +4037,13 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint for f in mismatch: f1 = os.path.join(endpoint.pgdata_dir, f) f2 = os.path.join(restored_dir_path, f) - stdout_filename = "{}.filediff".format(f2) + stdout_filename = f"{f2}.filediff" with open(stdout_filename, "w") as stdout_f: - subprocess.run("xxd -b {} > {}.hex ".format(f1, f1), shell=True) - subprocess.run("xxd -b {} > {}.hex ".format(f2, f2), shell=True) + subprocess.run(f"xxd -b {f1} > {f1}.hex ", shell=True) + subprocess.run(f"xxd -b {f2} > {f2}.hex ", shell=True) - cmd = "diff {}.hex {}.hex".format(f1, f2) + cmd = f"diff {f1}.hex {f2}.hex" subprocess.run([cmd], stdout=stdout_f, shell=True) assert (mismatch, error) == ([], []) diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 693771dd3d..4b0dd7a815 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -204,13 +204,11 @@ def wait_for_last_record_lsn( return current_lsn if i % 10 == 0: log.info( - "{}/{} waiting for last_record_lsn to reach {}, now {}, iteration {}".format( - tenant, timeline, lsn, current_lsn, i + 1 - ) + f"{tenant}/{timeline} waiting for last_record_lsn to reach {lsn}, now {current_lsn}, iteration {i + 1}" ) time.sleep(0.1) raise Exception( - "timed out while waiting for last_record_lsn to reach {}, was {}".format(lsn, current_lsn) + f"timed out while waiting for last_record_lsn to reach {lsn}, was {current_lsn}" ) diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py index 324ef0d516..b66db4d0ab 100644 --- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py +++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py @@ -125,19 +125,19 @@ async def run_update_loop_worker(ep: Endpoint, n_txns: int, idx: int): await conn.execute(f"ALTER TABLE {table} SET (autovacuum_enabled = false)") await conn.execute(f"INSERT INTO {table} VALUES (1, 0)") await conn.execute( + f""" + CREATE PROCEDURE updating{table}() as + $$ + DECLARE + i integer; + BEGIN + FOR i IN 1..{n_txns} LOOP + UPDATE {table} SET x = x + 1 WHERE pk=1; + COMMIT; + END LOOP; + END + $$ LANGUAGE plpgsql """ - CREATE PROCEDURE updating{0}() as - $$ - DECLARE - i integer; - BEGIN - FOR i IN 1..{1} LOOP - UPDATE {0} SET x = x + 1 WHERE pk=1; - COMMIT; - END LOOP; - END - $$ LANGUAGE plpgsql - """.format(table, n_txns) ) await conn.execute("SET statement_timeout=0") await conn.execute(f"call updating{table}()") diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index 
9777bf6748..54905759bd 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -78,7 +78,7 @@ def test_branch_creation_heavy_write(neon_compare: NeonCompare, n_branches: int) p = random.randint(0, i) timer = timeit.default_timer() - env.neon_cli.create_branch("b{}".format(i + 1), "b{}".format(p), tenant_id=tenant) + env.neon_cli.create_branch(f"b{i + 1}", f"b{p}", tenant_id=tenant) dur = timeit.default_timer() - timer log.info(f"Creating branch b{i+1} took {dur}s") diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index 2a7a3c41ac..5b69649007 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -84,11 +84,11 @@ def test_branching_with_pgbench( threads = [] if ty == "cascade": - env.neon_cli.create_branch("b{}".format(i + 1), "b{}".format(i), tenant_id=tenant) + env.neon_cli.create_branch(f"b{i + 1}", f"b{i}", tenant_id=tenant) else: - env.neon_cli.create_branch("b{}".format(i + 1), "b0", tenant_id=tenant) + env.neon_cli.create_branch(f"b{i + 1}", "b0", tenant_id=tenant) - endpoints.append(env.endpoints.create_start("b{}".format(i + 1), tenant_id=tenant)) + endpoints.append(env.endpoints.create_start(f"b{i + 1}", tenant_id=tenant)) threads.append( threading.Thread(target=run_pgbench, args=(endpoints[-1].connstr(),), daemon=True) diff --git a/test_runner/regress/test_large_schema.py b/test_runner/regress/test_large_schema.py index b6ac1aa41f..c5d5b5fe64 100644 --- a/test_runner/regress/test_large_schema.py +++ b/test_runner/regress/test_large_schema.py @@ -74,8 +74,8 @@ def test_large_schema(neon_env_builder: NeonEnvBuilder): cur.execute("select * from pg_depend order by refclassid, refobjid, refobjsubid") # Check layer file sizes - timeline_path = "{}/tenants/{}/timelines/{}/".format( - env.pageserver.workdir, env.initial_tenant, env.initial_timeline + timeline_path = ( + f"{env.pageserver.workdir}/tenants/{env.initial_tenant}/timelines/{env.initial_timeline}/" ) for filename in os.listdir(timeline_path): if filename.startswith("00000"): diff --git a/test_runner/regress/test_layer_bloating.py b/test_runner/regress/test_layer_bloating.py index 2fdee89389..77dc8a35b5 100644 --- a/test_runner/regress/test_layer_bloating.py +++ b/test_runner/regress/test_layer_bloating.py @@ -57,9 +57,7 @@ def test_layer_bloating(neon_simple_env: NeonEnv, vanilla_pg): time.sleep(10) # Check layer file sizes - timeline_path = "{}/tenants/{}/timelines/{}/".format( - env.pageserver.workdir, env.initial_tenant, timeline - ) + timeline_path = f"{env.pageserver.workdir}/tenants/{env.initial_tenant}/timelines/{timeline}/" log.info(f"Check {timeline_path}") for filename in os.listdir(timeline_path): if filename.startswith("00000"): diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 41fa03cdf8..4767f2edb1 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -9,7 +9,6 @@ of the pageserver are: - Updates to remote_consistent_lsn may only be made visible after validating generation """ - import enum import re import time diff --git a/test_runner/regress/test_read_validation.py b/test_runner/regress/test_read_validation.py index effb7e83f9..868b80a561 100644 --- a/test_runner/regress/test_read_validation.py +++ b/test_runner/regress/test_read_validation.py @@ -22,7 +22,7 @@ def test_read_validation(neon_simple_env: NeonEnv): with 
closing(endpoint.connect()) as con: with con.cursor() as c: for e in extensions: - c.execute("create extension if not exists {};".format(e)) + c.execute(f"create extension if not exists {e};") c.execute("create table foo (c int) with (autovacuum_enabled = false)") c.execute("insert into foo values (1)") @@ -42,14 +42,12 @@ def test_read_validation(neon_simple_env: NeonEnv): log.info("Test table is populated, validating buffer cache") cache_entries = query_scalar( - c, "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode) + c, f"select count(*) from pg_buffercache where relfilenode = {relfilenode}" ) assert cache_entries > 0, "No buffers cached for the test relation" c.execute( - "select reltablespace, reldatabase, relfilenode from pg_buffercache where relfilenode = {}".format( - relfilenode - ) + f"select reltablespace, reldatabase, relfilenode from pg_buffercache where relfilenode = {relfilenode}" ) reln = c.fetchone() assert reln is not None @@ -59,22 +57,20 @@ def test_read_validation(neon_simple_env: NeonEnv): c.execute("select clear_buffer_cache()") cache_entries = query_scalar( - c, "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode) + c, f"select count(*) from pg_buffercache where relfilenode = {relfilenode}" ) assert cache_entries == 0, "Failed to clear buffer cache" log.info("Cache is clear, reading stale page version") c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '{}'))".format( - first[0] - ) + f"select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '{first[0]}'))" ) direct_first = c.fetchone() assert first == direct_first, "Failed fetch page at historic lsn" cache_entries = query_scalar( - c, "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode) + c, f"select count(*) from pg_buffercache where relfilenode = {relfilenode}" ) assert cache_entries == 0, "relation buffers detected after invalidation" @@ -87,7 +83,7 @@ def test_read_validation(neon_simple_env: NeonEnv): assert second == direct_latest, "Failed fetch page at latest lsn" cache_entries = query_scalar( - c, "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode) + c, f"select count(*) from pg_buffercache where relfilenode = {relfilenode}" ) assert cache_entries == 0, "relation buffers detected after invalidation" @@ -96,9 +92,7 @@ def test_read_validation(neon_simple_env: NeonEnv): ) c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn( {}, {}, {}, 0, 0, '{}' ))".format( - reln[0], reln[1], reln[2], first[0] - ) + f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, '{first[0]}'))" ) direct_first = c.fetchone() assert first == direct_first, "Failed fetch page at historic lsn using oid" @@ -108,9 +102,7 @@ def test_read_validation(neon_simple_env: NeonEnv): ) c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn( {}, {}, {}, 0, 0, NULL ))".format( - reln[0], reln[1], reln[2] - ) + f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, NULL))" ) direct_latest = c.fetchone() assert second == direct_latest, "Failed fetch page at latest lsn" @@ -122,9 +114,7 @@ def test_read_validation(neon_simple_env: NeonEnv): ) c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn( {}, {}, {}, 0, 0, '{}' ))".format( - reln[0], reln[1], reln[2], first[0] - ) + f"select lsn, lower, upper from 
page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, '{first[0]}'))" ) direct_first = c.fetchone() assert first == direct_first, "Failed fetch page at historic lsn using oid" @@ -134,7 +124,7 @@ def test_read_validation(neon_simple_env: NeonEnv): c.execute("select * from page_header(get_raw_page('foo', 'main', 0));") raise AssertionError("query should have failed") except UndefinedTable as e: - log.info("Caught an expected failure: {}".format(e)) + log.info(f"Caught an expected failure: {e}") def test_read_validation_neg(neon_simple_env: NeonEnv): @@ -148,7 +138,7 @@ def test_read_validation_neg(neon_simple_env: NeonEnv): with closing(endpoint.connect()) as con: with con.cursor() as c: for e in extensions: - c.execute("create extension if not exists {};".format(e)) + c.execute(f"create extension if not exists {e};") log.info("read a page of a missing relation") try: @@ -157,7 +147,7 @@ def test_read_validation_neg(neon_simple_env: NeonEnv): ) raise AssertionError("query should have failed") except UndefinedTable as e: - log.info("Caught an expected failure: {}".format(e)) + log.info(f"Caught an expected failure: {e}") c.execute("create table foo (c int) with (autovacuum_enabled = false)") c.execute("insert into foo values (1)") @@ -169,7 +159,7 @@ def test_read_validation_neg(neon_simple_env: NeonEnv): ) raise AssertionError("query should have failed") except IoError as e: - log.info("Caught an expected failure: {}".format(e)) + log.info(f"Caught an expected failure: {e}") log.info("Pass NULL as an input") expected = (None, None, None) diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 2cac58dc1a..ac1a747df3 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -103,9 +103,7 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): n_timelines = 3 - branch_names = [ - "test_safekeepers_many_timelines_{}".format(tlin) for tlin in range(n_timelines) - ] + branch_names = [f"test_safekeepers_many_timelines_{tlin}" for tlin in range(n_timelines)] # pageserver, safekeeper operate timelines via their ids (can be represented in hex as 'ad50847381e248feaac9876cc71ae418') # that's not really human readable, so the branch names are introduced in Neon CLI. 
# Neon CLI stores its branch <-> timeline mapping in its internals, @@ -1136,13 +1134,13 @@ def cmp_sk_wal(sks: List[Safekeeper], tenant_id: TenantId, timeline_id: Timeline for f in mismatch: f1 = os.path.join(sk0.timeline_dir(tenant_id, timeline_id), f) f2 = os.path.join(sk.timeline_dir(tenant_id, timeline_id), f) - stdout_filename = "{}.filediff".format(f2) + stdout_filename = f"{f2}.filediff" with open(stdout_filename, "w") as stdout_f: - subprocess.run("xxd {} > {}.hex ".format(f1, f1), shell=True) - subprocess.run("xxd {} > {}.hex ".format(f2, f2), shell=True) + subprocess.run(f"xxd {f1} > {f1}.hex ", shell=True) + subprocess.run(f"xxd {f2} > {f2}.hex ", shell=True) - cmd = "diff {}.hex {}.hex".format(f1, f2) + cmd = f"diff {f1}.hex {f2}.hex" subprocess.run([cmd], stdout=stdout_f, shell=True) assert (mismatch, not_regular) == ( diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index 720633189e..b5e8eea237 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -76,20 +76,20 @@ class WorkerStats(object): self.counters[worker_id] += 1 def check_progress(self): - log.debug("Workers progress: {}".format(self.counters)) + log.debug(f"Workers progress: {self.counters}") # every worker should finish at least one tx assert all(cnt > 0 for cnt in self.counters) progress = sum(self.counters) - log.info("All workers made {} transactions".format(progress)) + log.info(f"All workers made {progress} transactions") async def run_random_worker( stats: WorkerStats, endpoint: Endpoint, worker_id, n_accounts, max_transfer ): pg_conn = await endpoint.connect_async() - log.debug("Started worker {}".format(worker_id)) + log.debug(f"Started worker {worker_id}") while stats.running: from_uid = random.randint(0, n_accounts - 1) @@ -99,9 +99,9 @@ async def run_random_worker( await bank_transfer(pg_conn, from_uid, to_uid, amount) stats.inc_progress(worker_id) - log.debug("Executed transfer({}) {} => {}".format(amount, from_uid, to_uid)) + log.debug(f"Executed transfer({amount}) {from_uid} => {to_uid}") - log.debug("Finished worker {}".format(worker_id)) + log.debug(f"Finished worker {worker_id}") await pg_conn.close() From 90a8ff55fa135e86d3cf56cea83f8f92b211799b Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 2 Apr 2024 14:39:24 +0100 Subject: [PATCH 0501/1571] CI(benchmarking): Add Sharded Tenant for pgbench (#7186) ## Problem During Nightly Benchmarks, we want to collect pgbench results for sharded tenants as well. 
## Summary of changes - Add pre-created sharded project for pgbench --- .github/workflows/benchmarking.yml | 58 ++++++++++++++++++++---------- 1 file changed, 39 insertions(+), 19 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 2e56bf909f..1eaf05cd54 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -147,15 +147,16 @@ jobs: "neonvm-captest-new" ], "db_size": [ "10gb" ], - "include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" }, - { "platform": "neon-captest-new", "db_size": "50gb" }, - { "platform": "neonvm-captest-freetier", "db_size": "3gb" }, - { "platform": "neonvm-captest-new", "db_size": "50gb" }] + "include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" }, + { "platform": "neon-captest-new", "db_size": "50gb" }, + { "platform": "neonvm-captest-freetier", "db_size": "3gb" }, + { "platform": "neonvm-captest-new", "db_size": "50gb" }, + { "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }] }' if [ "$(date +%A)" = "Saturday" ]; then matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"}, - { "platform": "rds-aurora", "db_size": "50gb"}]') + { "platform": "rds-aurora", "db_size": "50gb"}]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT @@ -171,7 +172,7 @@ jobs: if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" }, - { "platform": "rds-aurora" }]') + { "platform": "rds-aurora" }]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT @@ -190,7 +191,7 @@ jobs: if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" }, - { "platform": "rds-aurora", "scale": "10" }]') + { "platform": "rds-aurora", "scale": "10" }]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT @@ -253,6 +254,9 @@ jobs: neon-captest-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} ;; + neonvm-captest-sharding-reuse) + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }} + ;; neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier) CONNSTR=${{ steps.create-neon-project.outputs.dsn }} ;; @@ -270,11 +274,15 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERY="SELECT version();" + QUERIES=("SELECT version()") if [[ "${PLATFORM}" = "neon"* ]]; then - QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;" + QUERIES+=("SHOW neon.tenant_id") + QUERIES+=("SHOW neon.timeline_id") fi - psql ${CONNSTR} -c "${QUERY}" + + for q in "${QUERIES[@]}"; do + psql ${CONNSTR} -c "${q}" + done - name: Benchmark init uses: ./.github/actions/run-python-test-set @@ -401,11 +409,15 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERY="SELECT version();" + QUERIES=("SELECT version()") if [[ "${PLATFORM}" = "neon"* ]]; then - QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;" + QUERIES+=("SHOW neon.tenant_id") + QUERIES+=("SHOW neon.timeline_id") fi - psql ${CONNSTR} -c "${QUERY}" + + for q in "${QUERIES[@]}"; do + psql ${CONNSTR} -c "${q}" + done - name: ClickBench benchmark uses: ./.github/actions/run-python-test-set @@ -507,11 +519,15 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERY="SELECT version();" + QUERIES=("SELECT version()") if [[ "${PLATFORM}" = "neon"* ]]; then - QUERY="${QUERY} 
SHOW neon.tenant_id; SHOW neon.timeline_id;" + QUERIES+=("SHOW neon.tenant_id") + QUERIES+=("SHOW neon.timeline_id") fi - psql ${CONNSTR} -c "${QUERY}" + + for q in "${QUERIES[@]}"; do + psql ${CONNSTR} -c "${q}" + done - name: Run TPC-H benchmark uses: ./.github/actions/run-python-test-set @@ -597,11 +613,15 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERY="SELECT version();" + QUERIES=("SELECT version()") if [[ "${PLATFORM}" = "neon"* ]]; then - QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;" + QUERIES+=("SHOW neon.tenant_id") + QUERIES+=("SHOW neon.timeline_id") fi - psql ${CONNSTR} -c "${QUERY}" + + for q in "${QUERIES[@]}"; do + psql ${CONNSTR} -c "${q}" + done - name: Run user examples uses: ./.github/actions/run-python-test-set From a5777bab09468358ec7f2e5e55bb52e0f68c2740 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 2 Apr 2024 16:46:24 +0100 Subject: [PATCH 0502/1571] tests: clean up compat test workarounds (#7097) - Cleanup from https://github.com/neondatabase/neon/pull/7040#discussion_r1521120263 -- in that PR, we needed to let compat tests manually register a node, because it would run an old binary that doesn't self-register. - Cleanup vectored get config workaround - Cleanup a log allow list for which the underlying log noise has been fixed. --- test_runner/fixtures/neon_fixtures.py | 13 ++++--------- test_runner/regress/test_compatibility.py | 10 +--------- 2 files changed, 5 insertions(+), 18 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 67560a1017..0e4a58c099 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -520,9 +520,9 @@ class NeonEnvBuilder: self.env = NeonEnv(self) return self.env - def start(self, register_pageservers=False): + def start(self): assert self.env is not None, "environment is not already initialized, call init() first" - self.env.start(register_pageservers=register_pageservers) + self.env.start() def init_start( self, @@ -1115,8 +1115,8 @@ class NeonEnv: log.info(f"Config: {cfg}") self.neon_cli.init(cfg, force=config.config_init_force) - def start(self, register_pageservers=False): - # storage controller starts first, so that pageserver /re-attach calls don't + def start(self): + # Storage controller starts first, so that pageserver /re-attach calls don't # bounce through retries on startup self.storage_controller.start() @@ -1127,11 +1127,6 @@ class NeonEnv: # reconcile. wait_until(30, 1, storage_controller_ready) - if register_pageservers: - # Special case for forward compat tests, this can be removed later. 
- for pageserver in self.pageservers: - self.storage_controller.node_register(pageserver) - # Start up broker, pageserver and all safekeepers futs = [] with concurrent.futures.ThreadPoolExecutor( diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 5406acc005..ddad98a5fa 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -226,10 +226,6 @@ def test_forward_compatibility( ) try: - # TODO: remove this once the previous pageserrver version understands - # the 'get_vectored_impl' config - neon_env_builder.pageserver_get_vectored_impl = None - neon_env_builder.num_safekeepers = 3 neon_local_binpath = neon_env_builder.neon_binpath env = neon_env_builder.from_repo_dir( @@ -238,15 +234,11 @@ def test_forward_compatibility( pg_distrib_dir=compatibility_postgres_distrib_dir, ) - # TODO: remove this workaround after release-5090 is no longer the most recent release. - # There was a bug in that code that generates a warning in the storage controller log. - env.storage_controller.allowed_errors.append(".*no tenant_shard_id specified.*") - # Use current neon_local even though we're using old binaries for # everything else: our test code is written for latest CLI args. env.neon_local_binpath = neon_local_binpath - neon_env_builder.start(register_pageservers=True) + neon_env_builder.start() check_neon_works( env, From 9957c6a9a08e3cd02b23c89b540c0492dced5451 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 2 Apr 2024 17:16:15 +0100 Subject: [PATCH 0503/1571] pageserver: drop the layer map lock after planning reads (#7215) ## Problem The vectored read path holds the layer map lock while visiting a timeline. ## Summary of changes * Rework the fringe order to hold `Layer` on `Arc` handles instead of descriptions that are resolved by the layer map at the time of read. Note that previously `get_values_reconstruct_data` was implemented for the layer description which already knew the lsn range for the read. Now it is implemented on the new `ReadableLayer` handle and needs to get the lsn range as an argument. * Drop the layer map lock after updating the fringe. Related https://github.com/neondatabase/neon/issues/6833 --- pageserver/src/tenant/ephemeral_file.rs | 4 + pageserver/src/tenant/layer_map.rs | 60 +------ pageserver/src/tenant/storage_layer.rs | 146 +++++++++--------- .../tenant/storage_layer/inmemory_layer.rs | 12 +- pageserver/src/tenant/timeline.rs | 53 ++++--- 5 files changed, 125 insertions(+), 150 deletions(-) diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index e48b9e83bd..b27230db03 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -72,6 +72,10 @@ impl EphemeralFile { self.len } + pub(crate) fn id(&self) -> page_cache::FileId { + self.page_cache_file_id + } + pub(crate) async fn read_blk( &self, blknum: u32, diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index b8ed69052f..4c4cd90c99 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -346,35 +346,6 @@ where } } -#[derive(PartialEq, Eq, Hash, Debug, Clone)] -pub enum InMemoryLayerHandle { - Open { - lsn_floor: Lsn, - end_lsn: Lsn, - }, - Frozen { - idx: usize, - lsn_floor: Lsn, - end_lsn: Lsn, - }, -} - -impl InMemoryLayerHandle { - pub fn get_lsn_floor(&self) -> Lsn { - match self { - InMemoryLayerHandle::Open { lsn_floor, .. 
} => *lsn_floor, - InMemoryLayerHandle::Frozen { lsn_floor, .. } => *lsn_floor, - } - } - - pub fn get_end_lsn(&self) -> Lsn { - match self { - InMemoryLayerHandle::Open { end_lsn, .. } => *end_lsn, - InMemoryLayerHandle::Frozen { end_lsn, .. } => *end_lsn, - } - } -} - impl LayerMap { /// /// Find the latest layer (by lsn.end) that covers the given @@ -576,41 +547,18 @@ impl LayerMap { self.historic.iter() } - /// Get a handle for the first in memory layer that matches the provided predicate. - /// The handle should be used with [`Self::get_in_memory_layer`] to retrieve the actual layer. - /// - /// Note: [`Self::find_in_memory_layer`] and [`Self::get_in_memory_layer`] should be called during - /// the same exclusive region established by holding the layer manager lock. - pub fn find_in_memory_layer(&self, mut pred: Pred) -> Option + /// Get a ref counted pointer for the first in memory layer that matches the provided predicate. + pub fn find_in_memory_layer(&self, mut pred: Pred) -> Option> where Pred: FnMut(&Arc) -> bool, { if let Some(open) = &self.open_layer { if pred(open) { - return Some(InMemoryLayerHandle::Open { - lsn_floor: open.get_lsn_range().start, - end_lsn: open.get_lsn_range().end, - }); + return Some(open.clone()); } } - let pos = self.frozen_layers.iter().rev().position(pred); - pos.map(|rev_idx| { - let idx = self.frozen_layers.len() - 1 - rev_idx; - InMemoryLayerHandle::Frozen { - idx, - lsn_floor: self.frozen_layers[idx].get_lsn_range().start, - end_lsn: self.frozen_layers[idx].get_lsn_range().end, - } - }) - } - - /// Get the layer pointed to by the provided handle. - pub fn get_in_memory_layer(&self, handle: &InMemoryLayerHandle) -> Option> { - match handle { - InMemoryLayerHandle::Open { .. } => self.open_layer.clone(), - InMemoryLayerHandle::Frozen { idx, .. } => self.frozen_layers.get(*idx).cloned(), - } + self.frozen_layers.iter().rfind(|l| pred(l)).cloned() } /// diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index f44a92a2d7..9a2b086828 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -25,7 +25,7 @@ use std::cmp::{Ordering, Reverse}; use std::collections::hash_map::Entry; use std::collections::{BinaryHeap, HashMap}; use std::ops::Range; -use std::sync::Mutex; +use std::sync::{Arc, Mutex}; use std::time::{Duration, SystemTime, UNIX_EPOCH}; use tracing::warn; use utils::history_buffer::HistoryBufferWithDropCounter; @@ -41,8 +41,8 @@ pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey}; pub(crate) use layer::{EvictionError, Layer, ResidentLayer}; -use super::layer_map::InMemoryLayerHandle; -use super::timeline::layer_manager::LayerManager; +use self::inmemory_layer::InMemoryLayerFileId; + use super::timeline::GetVectoredError; use super::PageReconstructError; @@ -204,23 +204,30 @@ impl Default for ValuesReconstructState { } } -/// Description of layer to be read - the layer map can turn -/// this description into the actual layer. -#[derive(PartialEq, Eq, Hash, Debug, Clone)] -pub(crate) enum ReadableLayerDesc { - Persistent { - desc: PersistentLayerDesc, - lsn_range: Range, - }, - InMemory { - handle: InMemoryLayerHandle, - lsn_ceil: Lsn, - }, +/// A key that uniquely identifies a layer in a timeline +#[derive(Debug, PartialEq, Eq, Clone, Hash)] +pub(crate) enum LayerId { + PersitentLayerId(PersistentLayerKey), + InMemoryLayerId(InMemoryLayerFileId), } -/// Wraper for 'ReadableLayerDesc' sorted by Lsn +/// Layer wrapper for the read path. 
Note that it is valid +/// to use these layers even after external operations have +/// been performed on them (compaction, freeze, etc.). #[derive(Debug)] -struct ReadableLayerDescOrdered(ReadableLayerDesc); +pub(crate) enum ReadableLayer { + PersistentLayer(Layer), + InMemoryLayer(Arc), +} + +/// A partial description of a read to be done. +#[derive(Debug, Clone)] +struct ReadDesc { + /// An id used to resolve the readable layer within the fringe + layer_id: LayerId, + /// Lsn range for the read, used for selecting the next read + lsn_range: Range, +} /// Data structure which maintains a fringe of layers for the /// read path. The fringe is the set of layers which intersects @@ -231,41 +238,64 @@ struct ReadableLayerDescOrdered(ReadableLayerDesc); /// a two layer indexing scheme. #[derive(Debug)] pub(crate) struct LayerFringe { - layers_by_lsn: BinaryHeap, - layers: HashMap, + planned_reads_by_lsn: BinaryHeap, + layers: HashMap, +} + +#[derive(Debug)] +struct LayerKeyspace { + layer: ReadableLayer, + target_keyspace: KeySpace, } impl LayerFringe { pub(crate) fn new() -> Self { LayerFringe { - layers_by_lsn: BinaryHeap::new(), + planned_reads_by_lsn: BinaryHeap::new(), layers: HashMap::new(), } } - pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayerDesc, KeySpace)> { - let handle = match self.layers_by_lsn.pop() { - Some(h) => h, + pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayer, KeySpace, Range)> { + let read_desc = match self.planned_reads_by_lsn.pop() { + Some(desc) => desc, None => return None, }; - let removed = self.layers.remove_entry(&handle.0); + let removed = self.layers.remove_entry(&read_desc.layer_id); match removed { - Some((layer, keyspace)) => Some((layer, keyspace)), + Some(( + _, + LayerKeyspace { + layer, + target_keyspace, + }, + )) => Some((layer, target_keyspace, read_desc.lsn_range)), None => unreachable!("fringe internals are always consistent"), } } - pub(crate) fn update(&mut self, layer: ReadableLayerDesc, keyspace: KeySpace) { - let entry = self.layers.entry(layer.clone()); + pub(crate) fn update( + &mut self, + layer: ReadableLayer, + keyspace: KeySpace, + lsn_range: Range, + ) { + let layer_id = layer.id(); + let entry = self.layers.entry(layer_id.clone()); match entry { Entry::Occupied(mut entry) => { - entry.get_mut().merge(&keyspace); + entry.get_mut().target_keyspace.merge(&keyspace); } Entry::Vacant(entry) => { - self.layers_by_lsn - .push(ReadableLayerDescOrdered(entry.key().clone())); - entry.insert(keyspace); + self.planned_reads_by_lsn.push(ReadDesc { + lsn_range, + layer_id: layer_id.clone(), + }); + entry.insert(LayerKeyspace { + layer, + target_keyspace: keyspace, + }); } } } @@ -277,77 +307,55 @@ impl Default for LayerFringe { } } -impl Ord for ReadableLayerDescOrdered { +impl Ord for ReadDesc { fn cmp(&self, other: &Self) -> Ordering { - let ord = self.0.get_lsn_ceil().cmp(&other.0.get_lsn_ceil()); + let ord = self.lsn_range.end.cmp(&other.lsn_range.end); if ord == std::cmp::Ordering::Equal { - self.0 - .get_lsn_floor() - .cmp(&other.0.get_lsn_floor()) - .reverse() + self.lsn_range.start.cmp(&other.lsn_range.start).reverse() } else { ord } } } -impl PartialOrd for ReadableLayerDescOrdered { +impl PartialOrd for ReadDesc { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } -impl PartialEq for ReadableLayerDescOrdered { +impl PartialEq for ReadDesc { fn eq(&self, other: &Self) -> bool { - self.0.get_lsn_floor() == other.0.get_lsn_floor() - && self.0.get_lsn_ceil() == other.0.get_lsn_ceil() + 
self.lsn_range == other.lsn_range } } -impl Eq for ReadableLayerDescOrdered {} +impl Eq for ReadDesc {} -impl ReadableLayerDesc { - pub(crate) fn get_lsn_floor(&self) -> Lsn { +impl ReadableLayer { + pub(crate) fn id(&self) -> LayerId { match self { - ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.start, - ReadableLayerDesc::InMemory { handle, .. } => handle.get_lsn_floor(), - } - } - - pub(crate) fn get_lsn_ceil(&self) -> Lsn { - match self { - ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.end, - ReadableLayerDesc::InMemory { lsn_ceil, .. } => *lsn_ceil, + Self::PersistentLayer(layer) => LayerId::PersitentLayerId(layer.layer_desc().key()), + Self::InMemoryLayer(layer) => LayerId::InMemoryLayerId(layer.file_id()), } } pub(crate) async fn get_values_reconstruct_data( &self, - layer_manager: &LayerManager, keyspace: KeySpace, + lsn_range: Range, reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result<(), GetVectoredError> { match self { - ReadableLayerDesc::Persistent { desc, lsn_range } => { - let layer = layer_manager.get_from_desc(desc); + ReadableLayer::PersistentLayer(layer) => { layer - .get_values_reconstruct_data( - keyspace, - lsn_range.clone(), - reconstruct_state, - ctx, - ) + .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx) .await } - ReadableLayerDesc::InMemory { handle, lsn_ceil } => { - let layer = layer_manager - .layer_map() - .get_in_memory_layer(handle) - .unwrap(); - + ReadableLayer::InMemoryLayer(layer) => { layer - .get_values_reconstruct_data(keyspace, *lsn_ceil, reconstruct_state, ctx) + .get_values_reconstruct_data(keyspace, lsn_range.end, reconstruct_state, ctx) .await } } diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 628f12065f..43942ba2db 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -12,7 +12,7 @@ use crate::tenant::ephemeral_file::EphemeralFile; use crate::tenant::storage_layer::ValueReconstructResult; use crate::tenant::timeline::GetVectoredError; use crate::tenant::{PageReconstructError, Timeline}; -use crate::walrecord; +use crate::{page_cache, walrecord}; use anyhow::{anyhow, ensure, Result}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::InMemoryLayerInfo; @@ -36,10 +36,14 @@ use super::{ ValuesReconstructState, }; +#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)] +pub(crate) struct InMemoryLayerFileId(page_cache::FileId); + pub struct InMemoryLayer { conf: &'static PageServerConf, tenant_shard_id: TenantShardId, timeline_id: TimelineId, + file_id: InMemoryLayerFileId, /// This layer contains all the changes from 'start_lsn'. The /// start is inclusive. 
@@ -200,6 +204,10 @@ pub(crate) static GLOBAL_RESOURCES: GlobalResources = GlobalResources { }; impl InMemoryLayer { + pub(crate) fn file_id(&self) -> InMemoryLayerFileId { + self.file_id + } + pub(crate) fn get_timeline_id(&self) -> TimelineId { self.timeline_id } @@ -443,8 +451,10 @@ impl InMemoryLayer { trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"); let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?; + let key = InMemoryLayerFileId(file.id()); Ok(InMemoryLayer { + file_id: key, conf, timeline_id, tenant_shard_id, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f3565c1fb3..8ee9b9dbd2 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -118,11 +118,11 @@ use self::layer_manager::LayerManager; use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; -use super::remote_timeline_client::RemoteTimelineClient; +use super::config::TenantConf; use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline}; -use super::{config::TenantConf, storage_layer::ReadableLayerDesc}; use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; +use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer}; #[derive(Debug, PartialEq, Eq, Clone, Copy)] pub(super) enum FlushLoopState { @@ -2905,16 +2905,6 @@ impl Timeline { let mut completed_keyspace = KeySpace::default(); - // Hold the layer map whilst visiting the timeline to prevent - // compaction, eviction and flushes from rendering the layers unreadable. - // - // TODO: Do we actually need to do this? In theory holding on - // to [`tenant::storage_layer::Layer`] should be enough. However, - // [`Timeline::get`] also holds the lock during IO, so more investigation - // is needed. - let guard = timeline.layers.read().await; - let layers = guard.layer_map(); - loop { if cancel.is_cancelled() { return Err(GetVectoredError::Cancelled); @@ -2924,6 +2914,9 @@ impl Timeline { unmapped_keyspace.remove_overlapping_with(&keys_done_last_step); completed_keyspace.merge(&keys_done_last_step); + let guard = timeline.layers.read().await; + let layers = guard.layer_map(); + let in_memory_layer = layers.find_in_memory_layer(|l| { let start_lsn = l.get_lsn_range().start; cont_lsn > start_lsn @@ -2931,12 +2924,11 @@ impl Timeline { match in_memory_layer { Some(l) => { + let lsn_range = l.get_lsn_range().start..cont_lsn; fringe.update( - ReadableLayerDesc::InMemory { - handle: l, - lsn_ceil: cont_lsn, - }, + ReadableLayer::InMemoryLayer(l), unmapped_keyspace.clone(), + lsn_range, ); } None => { @@ -2948,30 +2940,43 @@ impl Timeline { .into_iter() .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| { ( - ReadableLayerDesc::Persistent { - desc: (*layer).clone(), - lsn_range: lsn_floor..cont_lsn, - }, + ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)), keyspace_accum.to_keyspace(), + lsn_floor..cont_lsn, ) }) - .for_each(|(layer, keyspace)| fringe.update(layer, keyspace)); + .for_each(|(layer, keyspace, lsn_range)| { + fringe.update(layer, keyspace, lsn_range) + }); } } } - if let Some((layer_to_read, keyspace_to_read)) = fringe.next_layer() { + // It's safe to drop the layer map lock after planning the next round of reads. 
+ // The fringe keeps readable handles for the layers which are safe to read even + // if layers were compacted or flushed. + // + // The more interesting consideration is: "Why is the read algorithm still correct + // if the layer map changes while it is operating?". Doing a vectored read on a + // timeline boils down to pushing an imaginary lsn boundary downwards for each range + // covered by the read. The layer map tells us how to move the lsn downwards for a + // range at *a particular point in time*. It is fine for the answer to be different + // at two different time points. + drop(guard); + + if let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() { + let next_cont_lsn = lsn_range.start; layer_to_read .get_values_reconstruct_data( - &guard, keyspace_to_read.clone(), + lsn_range, reconstruct_state, ctx, ) .await?; unmapped_keyspace = keyspace_to_read; - cont_lsn = layer_to_read.get_lsn_floor(); + cont_lsn = next_cont_lsn; } else { break; } From 582cec53c5a783c0fcff811aa86572cd27a4f65f Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Tue, 2 Apr 2024 21:46:23 +0200 Subject: [PATCH 0504/1571] proxy: upload consumption events to S3 (#7213) ## Problem If vector is unavailable, we are missing consumption events. https://github.com/neondatabase/cloud/issues/9826 ## Summary of changes Added integration with the consumption bucket. --- Cargo.lock | 1 + proxy/Cargo.toml | 1 + proxy/src/bin/proxy.rs | 29 +- proxy/src/config.rs | 17 ++ proxy/src/context/parquet.rs | 16 +- proxy/src/proxy/passthrough.rs | 2 +- proxy/src/serverless/sql_over_http.rs | 1 + proxy/src/usage_metrics.rs | 382 +++++++++++++++++++++----- 8 files changed, 372 insertions(+), 77 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7200fb7968..92c07b0c6f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4199,6 +4199,7 @@ name = "proxy" version = "0.1.0" dependencies = [ "anyhow", + "async-compression", "async-trait", "aws-config", "aws-sdk-iam", diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 57a2736d5b..b327890be2 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -10,6 +10,7 @@ testing = [] [dependencies] anyhow.workspace = true +async-compression.workspace = true async-trait.workspace = true aws-config.workspace = true aws-sdk-iam.workspace = true diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 88b847f5f1..56a3ef79cd 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -10,6 +10,7 @@ use proxy::auth; use proxy::auth::backend::MaybeOwned; use proxy::cancellation::CancelMap; use proxy::cancellation::CancellationHandler; +use proxy::config::remote_storage_from_toml; use proxy::config::AuthenticationConfig; use proxy::config::CacheOptions; use proxy::config::HttpConfig; @@ -191,6 +192,19 @@ struct ProxyCliArgs { #[clap(flatten)] parquet_upload: ParquetUploadArgs, + + /// interval for backup metric collection + #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)] + metric_backup_collection_interval: std::time::Duration, + /// remote storage configuration for backup metric collection + /// Encoded as toml (same format as pageservers), eg + /// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}` + #[clap(long, default_value = "{}")] + metric_backup_collection_remote_storage: String, + /// chunk size for backup metric collection + /// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression. 
+ #[clap(long, default_value = "4194304")] + metric_backup_collection_chunk_size: usize, } #[derive(clap::Args, Clone, Copy, Debug)] @@ -372,12 +386,17 @@ async fn main() -> anyhow::Result<()> { // maintenance tasks. these never return unless there's an error let mut maintenance_tasks = JoinSet::new(); - maintenance_tasks.spawn(proxy::handle_signals(cancellation_token)); + maintenance_tasks.spawn(proxy::handle_signals(cancellation_token.clone())); maintenance_tasks.spawn(http::health_server::task_main(http_listener)); maintenance_tasks.spawn(console::mgmt::task_main(mgmt_listener)); if let Some(metrics_config) = &config.metric_collection { + // TODO: Add gc regardles of the metric collection being enabled. maintenance_tasks.spawn(usage_metrics::task_main(metrics_config)); + client_tasks.spawn(usage_metrics::task_backup( + &metrics_config.backup_metric_collection_config, + cancellation_token, + )); } if let auth::BackendType::Console(api, _) = &config.auth_backend { @@ -434,6 +453,13 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { if args.allow_self_signed_compute { warn!("allowing self-signed compute certificates"); } + let backup_metric_collection_config = config::MetricBackupCollectionConfig { + interval: args.metric_backup_collection_interval, + remote_storage_config: remote_storage_from_toml( + &args.metric_backup_collection_remote_storage, + )?, + chunk_size: args.metric_backup_collection_chunk_size, + }; let metric_collection = match ( &args.metric_collection_endpoint, @@ -442,6 +468,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { (Some(endpoint), Some(interval)) => Some(config::MetricCollectionConfig { endpoint: endpoint.parse()?, interval: humantime::parse_duration(interval)?, + backup_metric_collection_config, }), (None, None) => None, _ => bail!( diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 361c3ef519..fc490c7348 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -5,6 +5,7 @@ use crate::{ }; use anyhow::{bail, ensure, Context, Ok}; use itertools::Itertools; +use remote_storage::RemoteStorageConfig; use rustls::{ crypto::ring::sign, pki_types::{CertificateDer, PrivateKeyDer}, @@ -39,6 +40,7 @@ pub struct ProxyConfig { pub struct MetricCollectionConfig { pub endpoint: reqwest::Url, pub interval: Duration, + pub backup_metric_collection_config: MetricBackupCollectionConfig, } pub struct TlsConfig { @@ -311,6 +313,21 @@ impl CertResolver { } } +#[derive(Debug)] +pub struct MetricBackupCollectionConfig { + pub interval: Duration, + pub remote_storage_config: OptRemoteStorageConfig, + pub chunk_size: usize, +} + +/// Hack to avoid clap being smarter. If you don't use this type alias, clap assumes more about the optional state and you get +/// runtime type errors from the value parser we use. +pub type OptRemoteStorageConfig = Option; + +pub fn remote_storage_from_toml(s: &str) -> anyhow::Result { + RemoteStorageConfig::from_toml(&s.parse()?) +} + /// Helper for cmdline cache options parsing. 
#[derive(Debug)] pub struct CacheOptions { diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index a2be1c4186..04e5695255 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -13,12 +13,14 @@ use parquet::{ }, record::RecordWriter, }; -use remote_storage::{GenericRemoteStorage, RemotePath, RemoteStorageConfig, TimeoutOrCancel}; +use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use tokio::{sync::mpsc, time}; use tokio_util::sync::CancellationToken; use tracing::{debug, info, Span}; use utils::backoff; +use crate::config::{remote_storage_from_toml, OptRemoteStorageConfig}; + use super::{RequestMonitoring, LOG_CHAN}; #[derive(clap::Args, Clone, Debug)] @@ -50,21 +52,13 @@ pub struct ParquetUploadArgs { parquet_upload_compression: Compression, } -/// Hack to avoid clap being smarter. If you don't use this type alias, clap assumes more about the optional state and you get -/// runtime type errors from the value parser we use. -type OptRemoteStorageConfig = Option; - -fn remote_storage_from_toml(s: &str) -> anyhow::Result { - RemoteStorageConfig::from_toml(&s.parse()?) -} - // Occasional network issues and such can cause remote operations to fail, and // that's expected. If a upload fails, we log it at info-level, and retry. // But after FAILED_UPLOAD_WARN_THRESHOLD retries, we start to log it at WARN // level instead, as repeated failures can mean a more serious problem. If it // fails more than FAILED_UPLOAD_RETRIES times, we give up -pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3; -pub(crate) const FAILED_UPLOAD_MAX_RETRIES: u32 = 10; +pub const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3; +pub const FAILED_UPLOAD_MAX_RETRIES: u32 = 10; // the parquet crate leaves a lot to be desired... // what follows is an attempt to write parquet files with minimal allocs. diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index f6d4314391..cf53c6e673 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -4,7 +4,7 @@ use crate::{ console::messages::MetricsAuxInfo, metrics::NUM_BYTES_PROXIED_COUNTER, stream::Stream, - usage_metrics::{Ids, USAGE_METRICS}, + usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS}, }; use metrics::IntCounterPairGuard; use tokio::io::{AsyncRead, AsyncWrite}; diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index f675375ff1..d5f2fea487 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -44,6 +44,7 @@ use crate::metrics::HTTP_CONTENT_LENGTH; use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE; use crate::proxy::NeonOptions; use crate::serverless::backend::HttpConnError; +use crate::usage_metrics::MetricCounterRecorder; use crate::DbName; use crate::RoleName; diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index d75aedf89b..2ad0883fb0 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -1,20 +1,34 @@ //! Periodically collect proxy consumption metrics //! and push them to a HTTP endpoint. 
-use crate::{config::MetricCollectionConfig, http, BranchId, EndpointId}; -use chrono::{DateTime, Utc}; +use crate::{ + config::{MetricBackupCollectionConfig, MetricCollectionConfig}, + context::parquet::{FAILED_UPLOAD_MAX_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD}, + http, BranchId, EndpointId, +}; +use anyhow::Context; +use async_compression::tokio::write::GzipEncoder; +use bytes::Bytes; +use chrono::{DateTime, Datelike, Timelike, Utc}; use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE}; use dashmap::{mapref::entry::Entry, DashMap}; +use futures::future::select; use once_cell::sync::Lazy; +use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use serde::{Deserialize, Serialize}; use std::{ convert::Infallible, + pin::pin, sync::{ atomic::{AtomicU64, AtomicUsize, Ordering}, Arc, }, time::Duration, }; +use tokio::io::AsyncWriteExt; +use tokio_util::sync::CancellationToken; use tracing::{error, info, instrument, trace}; +use utils::backoff; +use uuid::{NoContext, Timestamp}; const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client"; @@ -33,19 +47,93 @@ pub struct Ids { pub branch_id: BranchId, } +pub trait MetricCounterRecorder { + /// Record that some bytes were sent from the proxy to the client + fn record_egress(&self, bytes: u64); + /// Record that some connections were opened + fn record_connection(&self, count: usize); +} + +trait MetricCounterReporter { + fn get_metrics(&mut self) -> (u64, usize); + fn move_metrics(&self) -> (u64, usize); +} + #[derive(Debug)] -pub struct MetricCounter { +struct MetricBackupCounter { transmitted: AtomicU64, opened_connections: AtomicUsize, } -impl MetricCounter { - /// Record that some bytes were sent from the proxy to the client - pub fn record_egress(&self, bytes: u64) { +impl MetricCounterRecorder for MetricBackupCounter { + fn record_egress(&self, bytes: u64) { self.transmitted.fetch_add(bytes, Ordering::AcqRel); } + fn record_connection(&self, count: usize) { + self.opened_connections.fetch_add(count, Ordering::AcqRel); + } +} + +impl MetricCounterReporter for MetricBackupCounter { + fn get_metrics(&mut self) -> (u64, usize) { + ( + *self.transmitted.get_mut(), + *self.opened_connections.get_mut(), + ) + } + fn move_metrics(&self) -> (u64, usize) { + ( + self.transmitted.swap(0, Ordering::AcqRel), + self.opened_connections.swap(0, Ordering::AcqRel), + ) + } +} + +#[derive(Debug)] +pub struct MetricCounter { + transmitted: AtomicU64, + opened_connections: AtomicUsize, + backup: Arc, +} + +impl MetricCounterRecorder for MetricCounter { + /// Record that some bytes were sent from the proxy to the client + fn record_egress(&self, bytes: u64) { + self.transmitted.fetch_add(bytes, Ordering::AcqRel); + self.backup.record_egress(bytes); + } + + /// Record that some connections were opened + fn record_connection(&self, count: usize) { + self.opened_connections.fetch_add(count, Ordering::AcqRel); + self.backup.record_connection(count); + } +} + +impl MetricCounterReporter for MetricCounter { + fn get_metrics(&mut self) -> (u64, usize) { + ( + *self.transmitted.get_mut(), + *self.opened_connections.get_mut(), + ) + } + fn move_metrics(&self) -> (u64, usize) { + ( + self.transmitted.swap(0, Ordering::AcqRel), + self.opened_connections.swap(0, Ordering::AcqRel), + ) + } +} + +trait Clearable { /// extract the value that should be reported + fn should_report(self: &Arc) -> Option; + /// Determine whether the counter should be cleared from the global map. 
+ fn should_clear(self: &mut Arc) -> bool; +} + +impl Clearable for C { fn should_report(self: &Arc) -> Option { // heuristic to see if the branch is still open // if a clone happens while we are observing, the heuristic will be incorrect. @@ -54,13 +142,12 @@ impl MetricCounter { // However, for the strong count to be 1 it must have occured that at one instant // all the endpoints were closed, so missing a report because the endpoints are closed is valid. let is_open = Arc::strong_count(self) > 1; - let opened = self.opened_connections.swap(0, Ordering::AcqRel); // update cached metrics eagerly, even if they can't get sent // (to avoid sending the same metrics twice) // see the relevant discussion on why to do so even if the status is not success: // https://github.com/neondatabase/neon/pull/4563#discussion_r1246710956 - let value = self.transmitted.swap(0, Ordering::AcqRel); + let (value, opened) = self.move_metrics(); // Our only requirement is that we report in every interval if there was an open connection // if there were no opened connections since, then we don't need to report @@ -70,15 +157,12 @@ impl MetricCounter { Some(value) } } - - /// Determine whether the counter should be cleared from the global map. fn should_clear(self: &mut Arc) -> bool { // we can't clear this entry if it's acquired elsewhere let Some(counter) = Arc::get_mut(self) else { return false; }; - let opened = *counter.opened_connections.get_mut(); - let value = *counter.transmitted.get_mut(); + let (opened, value) = counter.get_metrics(); // clear if there's no data to report value == 0 && opened == 0 } @@ -90,11 +174,26 @@ type FastHasher = std::hash::BuildHasherDefault; #[derive(Default)] pub struct Metrics { endpoints: DashMap, FastHasher>, + backup_endpoints: DashMap, FastHasher>, } impl Metrics { /// Register a new byte metrics counter for this endpoint pub fn register(&self, ids: Ids) -> Arc { + let backup = if let Some(entry) = self.backup_endpoints.get(&ids) { + entry.clone() + } else { + self.backup_endpoints + .entry(ids.clone()) + .or_insert_with(|| { + Arc::new(MetricBackupCounter { + transmitted: AtomicU64::new(0), + opened_connections: AtomicUsize::new(0), + }) + }) + .clone() + }; + let entry = if let Some(entry) = self.endpoints.get(&ids) { entry.clone() } else { @@ -104,12 +203,13 @@ impl Metrics { Arc::new(MetricCounter { transmitted: AtomicU64::new(0), opened_connections: AtomicUsize::new(0), + backup: backup.clone(), }) }) .clone() }; - entry.opened_connections.fetch_add(1, Ordering::AcqRel); + entry.record_connection(1); entry } } @@ -132,7 +232,7 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result anyhow::Result, - now: DateTime, -) { - info!( - "starting collect_metrics_iteration. 
metric_collection_endpoint: {}", - metric_collection_endpoint - ); - +fn collect_and_clear_metrics( + endpoints: &DashMap, FastHasher>, +) -> Vec<(Ids, u64)> { let mut metrics_to_clear = Vec::new(); - let metrics_to_send: Vec<(Ids, u64)> = metrics - .endpoints + let metrics_to_send: Vec<(Ids, u64)> = endpoints .iter() .filter_map(|counter| { let key = counter.key().clone(); @@ -173,33 +261,71 @@ async fn collect_metrics_iteration( }) .collect(); + for metric in metrics_to_clear { + match endpoints.entry(metric) { + Entry::Occupied(mut counter) => { + if counter.get_mut().should_clear() { + counter.remove_entry(); + } + } + Entry::Vacant(_) => {} + } + } + metrics_to_send +} + +fn create_event_chunks<'a>( + metrics_to_send: &'a [(Ids, u64)], + hostname: &'a str, + prev: DateTime, + now: DateTime, + chunk_size: usize, +) -> impl Iterator>> + 'a { + // Split into chunks of 1000 metrics to avoid exceeding the max request size + metrics_to_send + .chunks(chunk_size) + .map(move |chunk| EventChunk { + events: chunk + .iter() + .map(|(ids, value)| Event { + kind: EventType::Incremental { + start_time: prev, + stop_time: now, + }, + metric: PROXY_IO_BYTES_PER_CLIENT, + idempotency_key: idempotency_key(hostname), + value: *value, + extra: ids.clone(), + }) + .collect(), + }) +} + +#[instrument(skip_all)] +async fn collect_metrics_iteration( + endpoints: &DashMap, FastHasher>, + client: &http::ClientWithMiddleware, + metric_collection_endpoint: &reqwest::Url, + hostname: &str, + prev: DateTime, + now: DateTime, +) { + info!( + "starting collect_metrics_iteration. metric_collection_endpoint: {}", + metric_collection_endpoint + ); + + let metrics_to_send = collect_and_clear_metrics(endpoints); + if metrics_to_send.is_empty() { trace!("no new metrics to send"); } // Send metrics. - // Split into chunks of 1000 metrics to avoid exceeding the max request size - for chunk in metrics_to_send.chunks(CHUNK_SIZE) { - let events = chunk - .iter() - .map(|(ids, value)| Event { - kind: EventType::Incremental { - start_time: prev, - stop_time: now, - }, - metric: PROXY_IO_BYTES_PER_CLIENT, - idempotency_key: idempotency_key(hostname), - value: *value, - extra: Ids { - endpoint_id: ids.endpoint_id.clone(), - branch_id: ids.branch_id.clone(), - }, - }) - .collect(); - + for chunk in create_event_chunks(&metrics_to_send, hostname, prev, now, CHUNK_SIZE) { let res = client .post(metric_collection_endpoint.clone()) - .json(&EventChunk { events }) + .json(&chunk) .send() .await; @@ -213,23 +339,142 @@ async fn collect_metrics_iteration( if !res.status().is_success() { error!("metrics endpoint refused the sent metrics: {:?}", res); - for metric in chunk.iter().filter(|(_, value)| *value > (1u64 << 40)) { + for metric in chunk.events.iter().filter(|e| e.value > (1u64 << 40)) { // Report if the metric value is suspiciously large error!("potentially abnormal metric value: {:?}", metric); } } } +} - for metric in metrics_to_clear { - match metrics.endpoints.entry(metric) { - Entry::Occupied(mut counter) => { - if counter.get_mut().should_clear() { - counter.remove_entry(); - } - } - Entry::Vacant(_) => {} +pub async fn task_backup( + backup_config: &MetricBackupCollectionConfig, + cancellation_token: CancellationToken, +) -> anyhow::Result<()> { + info!("metrics backup config: {backup_config:?}"); + scopeguard::defer! { + info!("metrics backup has shut down"); + } + // Even if the remote storage is not configured, we still want to clear the metrics. 
+ let storage = backup_config + .remote_storage_config + .as_ref() + .map(|config| GenericRemoteStorage::from_config(config).context("remote storage init")) + .transpose()?; + let mut ticker = tokio::time::interval(backup_config.interval); + let mut prev = Utc::now(); + let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned(); + loop { + select(pin!(ticker.tick()), pin!(cancellation_token.cancelled())).await; + let now = Utc::now(); + collect_metrics_backup_iteration( + &USAGE_METRICS.backup_endpoints, + &storage, + &hostname, + prev, + now, + backup_config.chunk_size, + ) + .await; + + prev = now; + if cancellation_token.is_cancelled() { + info!("metrics backup has been cancelled"); + break; } } + Ok(()) +} + +#[instrument(skip_all)] +async fn collect_metrics_backup_iteration( + endpoints: &DashMap, FastHasher>, + storage: &Option, + hostname: &str, + prev: DateTime, + now: DateTime, + chunk_size: usize, +) { + let year = now.year(); + let month = now.month(); + let day = now.day(); + let hour = now.hour(); + let minute = now.minute(); + let second = now.second(); + let cancel = CancellationToken::new(); + + info!("starting collect_metrics_backup_iteration"); + + let metrics_to_send = collect_and_clear_metrics(endpoints); + + if metrics_to_send.is_empty() { + trace!("no new metrics to send"); + } + + // Send metrics. + for chunk in create_event_chunks(&metrics_to_send, hostname, prev, now, chunk_size) { + let real_now = Utc::now(); + let id = uuid::Uuid::new_v7(Timestamp::from_unix( + NoContext, + real_now.second().into(), + real_now.nanosecond(), + )); + let path = format!("year={year:04}/month={month:02}/day={day:02}/{hour:02}:{minute:02}:{second:02}Z_{id}.json.gz"); + let remote_path = match RemotePath::from_string(&path) { + Ok(remote_path) => remote_path, + Err(e) => { + error!("failed to create remote path from str {path}: {:?}", e); + continue; + } + }; + + let res = upload_events_chunk(storage, chunk, &remote_path, &cancel).await; + + if let Err(e) = res { + error!( + "failed to upload consumption events to remote storage: {:?}", + e + ); + } + } +} + +async fn upload_events_chunk( + storage: &Option, + chunk: EventChunk<'_, Event>, + remote_path: &RemotePath, + cancel: &CancellationToken, +) -> anyhow::Result<()> { + let storage = match storage { + Some(storage) => storage, + None => { + error!("no remote storage configured"); + return Ok(()); + } + }; + let data = serde_json::to_vec(&chunk).context("serialize metrics")?; + let mut encoder = GzipEncoder::new(Vec::new()); + encoder.write_all(&data).await.context("compress metrics")?; + encoder.shutdown().await.context("compress metrics")?; + let compressed_data: Bytes = encoder.get_ref().clone().into(); + backoff::retry( + || async { + let stream = futures::stream::once(futures::future::ready(Ok(compressed_data.clone()))); + storage + .upload(stream, data.len(), remote_path, None, cancel) + .await + }, + TimeoutOrCancel::caused_by_cancel, + FAILED_UPLOAD_WARN_THRESHOLD, + FAILED_UPLOAD_MAX_RETRIES, + "request_data_upload", + cancel, + ) + .await + .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) + .and_then(|x| x) + .context("request_data_upload")?; + Ok(()) } #[cfg(test)] @@ -248,7 +493,7 @@ mod tests { }; use url::Url; - use super::{collect_metrics_iteration, Ids, Metrics}; + use super::*; use crate::{http, rate_limiter::RateLimiterConfig}; #[tokio::test] @@ -284,18 +529,19 @@ mod tests { let now = Utc::now(); // no counters have been registered - collect_metrics_iteration(&metrics, &client, 
&endpoint, "foo", now, now).await; + collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await; let r = std::mem::take(&mut *reports2.lock().unwrap()); assert!(r.is_empty()); // register a new counter + let counter = metrics.register(Ids { endpoint_id: "e1".into(), branch_id: "b1".into(), }); // the counter should be observed despite 0 egress - collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await; + collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await; let r = std::mem::take(&mut *reports2.lock().unwrap()); assert_eq!(r.len(), 1); assert_eq!(r[0].events.len(), 1); @@ -305,7 +551,7 @@ mod tests { counter.record_egress(1); // egress should be observered - collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await; + collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await; let r = std::mem::take(&mut *reports2.lock().unwrap()); assert_eq!(r.len(), 1); assert_eq!(r[0].events.len(), 1); @@ -315,11 +561,19 @@ mod tests { drop(counter); // we do not observe the counter - collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await; + collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await; let r = std::mem::take(&mut *reports2.lock().unwrap()); assert!(r.is_empty()); // counter is unregistered assert!(metrics.endpoints.is_empty()); + + collect_metrics_backup_iteration(&metrics.backup_endpoints, &None, "foo", now, now, 1000) + .await; + assert!(!metrics.backup_endpoints.is_empty()); + collect_metrics_backup_iteration(&metrics.backup_endpoints, &None, "foo", now, now, 1000) + .await; + // backup counter is unregistered after the second iteration + assert!(metrics.backup_endpoints.is_empty()); } } From 6e3834d506e8b443a95890b59f5851397b563f35 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 3 Apr 2024 11:07:56 +0100 Subject: [PATCH 0505/1571] controller: add `storcon-cli` (#7114) ## Problem During incidents, we may need to quickly access the storage controller's API without trying API client code or crafting `curl` CLIs on the fly. A basic CLI client is needed for this. ## Summary of changes - Update storage controller node listing API to only use public types in controller_api.rs - Add a storage controller API for listing tenants - Add a basic test that the CLI can list and modify nodes and tenants. 
--- Cargo.lock | 21 + Cargo.toml | 1 + control_plane/attachment_service/Cargo.toml | 1 + control_plane/attachment_service/src/http.rs | 17 +- control_plane/attachment_service/src/node.rs | 16 +- .../attachment_service/src/service.rs | 67 +- control_plane/src/bin/neon_local.rs | 25 +- control_plane/storcon_cli/Cargo.toml | 23 + control_plane/storcon_cli/src/main.rs | 587 ++++++++++++++++++ libs/pageserver_api/src/controller_api.rs | 42 +- test_runner/regress/test_sharding_service.py | 89 ++- 11 files changed, 822 insertions(+), 67 deletions(-) create mode 100644 control_plane/storcon_cli/Cargo.toml create mode 100644 control_plane/storcon_cli/src/main.rs diff --git a/Cargo.lock b/Cargo.lock index 92c07b0c6f..ecc69f7048 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -288,6 +288,7 @@ dependencies = [ "hex", "humantime", "hyper", + "itertools", "lasso", "measured", "metrics", @@ -5622,6 +5623,26 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "storcon_cli" +version = "0.1.0" +dependencies = [ + "anyhow", + "clap", + "comfy-table", + "hyper", + "pageserver_api", + "pageserver_client", + "reqwest", + "serde", + "serde_json", + "thiserror", + "tokio", + "tracing", + "utils", + "workspace_hack", +] + [[package]] name = "stringprep" version = "0.1.2" diff --git a/Cargo.toml b/Cargo.toml index 309ebbe119..9f24176c65 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,6 +4,7 @@ members = [ "compute_tools", "control_plane", "control_plane/attachment_service", + "control_plane/storcon_cli", "pageserver", "pageserver/compaction", "pageserver/ctl", diff --git a/control_plane/attachment_service/Cargo.toml b/control_plane/attachment_service/Cargo.toml index 0201e0ed86..595b091df4 100644 --- a/control_plane/attachment_service/Cargo.toml +++ b/control_plane/attachment_service/Cargo.toml @@ -25,6 +25,7 @@ git-version.workspace = true hex.workspace = true hyper.workspace = true humantime.workspace = true +itertools.workspace = true lasso.workspace = true once_cell.workspace = true pageserver_api.workspace = true diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs index 1f3f78bffa..03883f0ca2 100644 --- a/control_plane/attachment_service/src/http.rs +++ b/control_plane/attachment_service/src/http.rs @@ -399,6 +399,15 @@ async fn handle_tenant_describe( json_response(StatusCode::OK, service.tenant_describe(tenant_id)?) } +async fn handle_tenant_list( + service: Arc, + req: Request, +) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + json_response(StatusCode::OK, service.tenant_list()) +} + async fn handle_node_register(mut req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; @@ -412,7 +421,10 @@ async fn handle_node_list(req: Request) -> Result, ApiError check_permissions(&req, Scope::Admin)?; let state = get_state(&req); - json_response(StatusCode::OK, state.service.node_list().await?) 
+ let nodes = state.service.node_list().await?; + let api_nodes = nodes.into_iter().map(|n| n.describe()).collect::>(); + + json_response(StatusCode::OK, api_nodes) } async fn handle_node_drop(req: Request) -> Result, ApiError> { @@ -793,6 +805,9 @@ pub fn make_router( RequestName("control_v1_tenant_describe"), ) }) + .get("/control/v1/tenant", |r| { + tenant_service_handler(r, handle_tenant_list, RequestName("control_v1_tenant_list")) + }) .put("/control/v1/tenant/:tenant_id/policy", |r| { named_request_span( r, diff --git a/control_plane/attachment_service/src/node.rs b/control_plane/attachment_service/src/node.rs index df40bff66f..7ba6828deb 100644 --- a/control_plane/attachment_service/src/node.rs +++ b/control_plane/attachment_service/src/node.rs @@ -3,7 +3,8 @@ use std::{str::FromStr, time::Duration}; use hyper::StatusCode; use pageserver_api::{ controller_api::{ - NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, TenantLocateResponseShard, + NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy, + TenantLocateResponseShard, }, shard::TenantShardId, }; @@ -256,6 +257,19 @@ impl Node { ) .await } + + /// Generate the simplified API-friendly description of a node's state + pub(crate) fn describe(&self) -> NodeDescribeResponse { + NodeDescribeResponse { + id: self.id, + availability: self.availability.into(), + scheduling: self.scheduling, + listen_http_addr: self.listen_http_addr.clone(), + listen_http_port: self.listen_http_port, + listen_pg_addr: self.listen_pg_addr.clone(), + listen_pg_port: self.listen_pg_port, + } + } } impl std::fmt::Display for Node { diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index 7502d9d186..0b67e30b96 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -20,6 +20,7 @@ use control_plane::storage_controller::{ use diesel::result::DatabaseErrorKind; use futures::{stream::FuturesUnordered, StreamExt}; use hyper::StatusCode; +use itertools::Itertools; use pageserver_api::{ controller_api::{ NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy, @@ -2735,47 +2736,73 @@ impl Service { }) } - pub(crate) fn tenant_describe( + /// Returns None if the input iterator of shards does not include a shard with number=0 + fn tenant_describe_impl<'a>( &self, - tenant_id: TenantId, - ) -> Result { - let locked = self.inner.read().unwrap(); - + shards: impl Iterator, + ) -> Option { let mut shard_zero = None; - let mut shards = Vec::new(); + let mut describe_shards = Vec::new(); - for (tenant_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id)) - { - if tenant_shard_id.is_zero() { + for shard in shards { + if shard.tenant_shard_id.is_zero() { shard_zero = Some(shard); } - let response_shard = TenantDescribeResponseShard { - tenant_shard_id: *tenant_shard_id, + describe_shards.push(TenantDescribeResponseShard { + tenant_shard_id: shard.tenant_shard_id, node_attached: *shard.intent.get_attached(), node_secondary: shard.intent.get_secondary().to_vec(), last_error: shard.last_error.lock().unwrap().clone(), is_reconciling: shard.reconciler.is_some(), is_pending_compute_notification: shard.pending_compute_notification, is_splitting: matches!(shard.splitting, SplitState::Splitting), - }; - shards.push(response_shard); + scheduling_policy: *shard.get_scheduling_policy(), + }) } - let Some(shard_zero) = shard_zero else { - return Err(ApiError::NotFound( - 
anyhow::anyhow!("Tenant {tenant_id} not found").into(), - )); - }; + let shard_zero = shard_zero?; - Ok(TenantDescribeResponse { - shards, + Some(TenantDescribeResponse { + tenant_id: shard_zero.tenant_shard_id.tenant_id, + shards: describe_shards, stripe_size: shard_zero.shard.stripe_size, policy: shard_zero.policy.clone(), config: shard_zero.config.clone(), }) } + pub(crate) fn tenant_describe( + &self, + tenant_id: TenantId, + ) -> Result { + let locked = self.inner.read().unwrap(); + + self.tenant_describe_impl( + locked + .tenants + .range(TenantShardId::tenant_range(tenant_id)) + .map(|(_k, v)| v), + ) + .ok_or_else(|| ApiError::NotFound(anyhow::anyhow!("Tenant {tenant_id} not found").into())) + } + + pub(crate) fn tenant_list(&self) -> Vec { + let locked = self.inner.read().unwrap(); + + let mut result = Vec::new(); + for (_tenant_id, tenant_shards) in + &locked.tenants.iter().group_by(|(id, _shard)| id.tenant_id) + { + result.push( + self.tenant_describe_impl(tenant_shards.map(|(_k, v)| v)) + .expect("Groups are always non-empty"), + ); + } + + result + } + #[instrument(skip_all, fields(tenant_id=%op.tenant_id))] async fn abort_tenant_shard_split( &self, diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 401feae706..56495dd2da 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -14,9 +14,7 @@ use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR}; use control_plane::safekeeper::SafekeeperNode; use control_plane::storage_controller::StorageController; use control_plane::{broker, local_env}; -use pageserver_api::controller_api::{ - NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, PlacementPolicy, -}; +use pageserver_api::controller_api::PlacementPolicy; use pageserver_api::models::{ ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo, }; @@ -1060,21 +1058,6 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> } } - Some(("set-state", subcommand_args)) => { - let pageserver = get_pageserver(env, subcommand_args)?; - let scheduling = subcommand_args.get_one("scheduling"); - let availability = subcommand_args.get_one("availability"); - - let storage_controller = StorageController::from_env(env); - storage_controller - .node_configure(NodeConfigureRequest { - node_id: pageserver.conf.id, - scheduling: scheduling.cloned(), - availability: availability.cloned(), - }) - .await?; - } - Some(("status", subcommand_args)) => { match get_pageserver(env, subcommand_args)?.check_status().await { Ok(_) => println!("Page server is up and running"), @@ -1515,12 +1498,6 @@ fn cli() -> Command { .about("Restart local pageserver") .arg(pageserver_config_args.clone()) ) - .subcommand(Command::new("set-state") - .arg(Arg::new("availability").value_parser(value_parser!(NodeAvailability)).long("availability").action(ArgAction::Set).help("Availability state: offline,active")) - .arg(Arg::new("scheduling").value_parser(value_parser!(NodeSchedulingPolicy)).long("scheduling").action(ArgAction::Set).help("Scheduling state: draining,pause,filling,active")) - .about("Set scheduling or availability state of pageserver node") - .arg(pageserver_config_args.clone()) - ) ) .subcommand( Command::new("storage_controller") diff --git a/control_plane/storcon_cli/Cargo.toml b/control_plane/storcon_cli/Cargo.toml new file mode 100644 index 0000000000..61eb7fa4e4 --- /dev/null +++ b/control_plane/storcon_cli/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = 
"storcon_cli" +version = "0.1.0" +edition.workspace = true +license.workspace = true + + +[dependencies] +anyhow.workspace = true +clap.workspace = true +comfy-table.workspace = true +hyper.workspace = true +pageserver_api.workspace = true +pageserver_client.workspace = true +reqwest.workspace = true +serde.workspace = true +serde_json = { workspace = true, features = ["raw_value"] } +thiserror.workspace = true +tokio.workspace = true +tracing.workspace = true +utils.workspace = true +workspace_hack.workspace = true + diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs new file mode 100644 index 0000000000..f72bc9a2a9 --- /dev/null +++ b/control_plane/storcon_cli/src/main.rs @@ -0,0 +1,587 @@ +use std::{collections::HashMap, str::FromStr}; + +use clap::{Parser, Subcommand}; +use hyper::Method; +use pageserver_api::{ + controller_api::{ + NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy, + TenantDescribeResponse, TenantPolicyRequest, + }, + models::{ + ShardParameters, TenantConfig, TenantConfigRequest, TenantCreateRequest, + TenantShardSplitRequest, TenantShardSplitResponse, + }, + shard::{ShardStripeSize, TenantShardId}, +}; +use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt}; +use reqwest::Url; +use serde::{de::DeserializeOwned, Serialize}; +use utils::id::{NodeId, TenantId}; + +use pageserver_api::controller_api::{ + NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy, + TenantLocateResponse, TenantShardMigrateRequest, TenantShardMigrateResponse, +}; + +#[derive(Subcommand, Debug)] +enum Command { + /// Register a pageserver with the storage controller. This shouldn't usually be necessary, + /// since pageservers auto-register when they start up + NodeRegister { + #[arg(long)] + node_id: NodeId, + + #[arg(long)] + listen_pg_addr: String, + #[arg(long)] + listen_pg_port: u16, + + #[arg(long)] + listen_http_addr: String, + #[arg(long)] + listen_http_port: u16, + }, + + /// Modify a node's configuration in the storage controller + NodeConfigure { + #[arg(long)] + node_id: NodeId, + + /// Availability is usually auto-detected based on heartbeats. Set 'offline' here to + /// manually mark a node offline + #[arg(long)] + availability: Option, + /// Scheduling policy controls whether tenant shards may be scheduled onto this node. + #[arg(long)] + scheduling: Option, + }, + /// Modify a tenant's policies in the storage controller + TenantPolicy { + #[arg(long)] + tenant_id: TenantId, + /// Placement policy controls whether a tenant is `detached`, has only a secondary location (`secondary`), + /// or is in the normal attached state with N secondary locations (`attached:N`) + #[arg(long)] + placement: Option, + /// Scheduling policy enables pausing the controller's scheduling activity involving this tenant. `active` is normal, + /// `essential` disables optimization scheduling changes, `pause` disables all scheduling changes, and `stop` prevents + /// all reconciliation activity including for scheduling changes already made. `pause` and `stop` can make a tenant + /// unavailable, and are only for use in emergencies. + #[arg(long)] + scheduling: Option, + }, + /// List nodes known to the storage controller + Nodes {}, + /// List tenants known to the storage controller + Tenants {}, + /// Create a new tenant in the storage controller, and by extension on pageservers. 
+ TenantCreate { + #[arg(long)] + tenant_id: TenantId, + }, + /// Delete a tenant in the storage controller, and by extension on pageservers. + TenantDelete { + #[arg(long)] + tenant_id: TenantId, + }, + /// Split an existing tenant into a higher number of shards than its current shard count. + TenantShardSplit { + #[arg(long)] + tenant_id: TenantId, + #[arg(long)] + shard_count: u8, + /// Optional, in 8kiB pages. e.g. set 2048 for 16MB stripes. + #[arg(long)] + stripe_size: Option, + }, + /// Migrate the attached location for a tenant shard to a specific pageserver. + TenantShardMigrate { + #[arg(long)] + tenant_shard_id: TenantShardId, + #[arg(long)] + node: NodeId, + }, + /// Modify the pageserver tenant configuration of a tenant: this is the configuration structure + /// that is passed through to pageservers, and does not affect storage controller behavior. + TenantConfig { + #[arg(long)] + tenant_id: TenantId, + #[arg(long)] + config: String, + }, + /// Attempt to balance the locations for a tenant across pageservers. This is a client-side + /// alternative to the storage controller's scheduling optimization behavior. + TenantScatter { + #[arg(long)] + tenant_id: TenantId, + }, + /// Print details about a particular tenant, including all its shards' states. + TenantDescribe { + #[arg(long)] + tenant_id: TenantId, + }, +} + +#[derive(Parser)] +#[command( + author, + version, + about, + long_about = "CLI for Storage Controller Support/Debug" +)] +#[command(arg_required_else_help(true))] +struct Cli { + #[arg(long)] + /// URL to storage controller. e.g. http://127.0.0.1:1234 when using `neon_local` + api: Url, + + #[arg(long)] + /// JWT token for authenticating with storage controller. Depending on the API used, this + /// should have either `pageserverapi` or `admin` scopes: for convenience, you should mint + /// a token with both scopes to use with this tool. 
+ jwt: Option, + + #[command(subcommand)] + command: Command, +} + +#[derive(Debug, Clone)] +struct PlacementPolicyArg(PlacementPolicy); + +impl FromStr for PlacementPolicyArg { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + match s { + "detached" => Ok(Self(PlacementPolicy::Detached)), + "secondary" => Ok(Self(PlacementPolicy::Secondary)), + _ if s.starts_with("attached:") => { + let mut splitter = s.split(':'); + let _prefix = splitter.next().unwrap(); + match splitter.next().and_then(|s| s.parse::().ok()) { + Some(n) => Ok(Self(PlacementPolicy::Attached(n))), + None => Err(anyhow::anyhow!( + "Invalid format '{s}', a valid example is 'attached:1'" + )), + } + } + _ => Err(anyhow::anyhow!( + "Unknown placement policy '{s}', try detached,secondary,attached:" + )), + } + } +} + +#[derive(Debug, Clone)] +struct ShardSchedulingPolicyArg(ShardSchedulingPolicy); + +impl FromStr for ShardSchedulingPolicyArg { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + match s { + "active" => Ok(Self(ShardSchedulingPolicy::Active)), + "essential" => Ok(Self(ShardSchedulingPolicy::Essential)), + "pause" => Ok(Self(ShardSchedulingPolicy::Pause)), + "stop" => Ok(Self(ShardSchedulingPolicy::Stop)), + _ => Err(anyhow::anyhow!( + "Unknown scheduling policy '{s}', try active,essential,pause,stop" + )), + } + } +} + +#[derive(Debug, Clone)] +struct NodeAvailabilityArg(NodeAvailabilityWrapper); + +impl FromStr for NodeAvailabilityArg { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + match s { + "active" => Ok(Self(NodeAvailabilityWrapper::Active)), + "offline" => Ok(Self(NodeAvailabilityWrapper::Offline)), + _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")), + } + } +} + +struct Client { + base_url: Url, + jwt_token: Option, + client: reqwest::Client, +} + +impl Client { + fn new(base_url: Url, jwt_token: Option) -> Self { + Self { + base_url, + jwt_token, + client: reqwest::ClientBuilder::new() + .build() + .expect("Failed to construct http client"), + } + } + + /// Simple HTTP request wrapper for calling into attachment service + async fn dispatch( + &self, + method: hyper::Method, + path: String, + body: Option, + ) -> mgmt_api::Result + where + RQ: Serialize + Sized, + RS: DeserializeOwned + Sized, + { + // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out + // for general purpose API access. 
+ let url = Url::from_str(&format!( + "http://{}:{}/{path}", + self.base_url.host_str().unwrap(), + self.base_url.port().unwrap() + )) + .unwrap(); + + let mut builder = self.client.request(method, url); + if let Some(body) = body { + builder = builder.json(&body) + } + if let Some(jwt_token) = &self.jwt_token { + builder = builder.header( + reqwest::header::AUTHORIZATION, + format!("Bearer {jwt_token}"), + ); + } + + let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?; + let response = response.error_from_body().await?; + + response + .json() + .await + .map_err(pageserver_client::mgmt_api::Error::ReceiveBody) + } +} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + let cli = Cli::parse(); + + let storcon_client = Client::new(cli.api.clone(), cli.jwt.clone()); + + let mut trimmed = cli.api.to_string(); + trimmed.pop(); + let vps_client = mgmt_api::Client::new(trimmed, cli.jwt.as_deref()); + + match cli.command { + Command::NodeRegister { + node_id, + listen_pg_addr, + listen_pg_port, + listen_http_addr, + listen_http_port, + } => { + storcon_client + .dispatch::<_, ()>( + Method::POST, + "control/v1/node".to_string(), + Some(NodeRegisterRequest { + node_id, + listen_pg_addr, + listen_pg_port, + listen_http_addr, + listen_http_port, + }), + ) + .await?; + } + Command::TenantCreate { tenant_id } => { + vps_client + .tenant_create(&TenantCreateRequest { + new_tenant_id: TenantShardId::unsharded(tenant_id), + generation: None, + shard_parameters: ShardParameters::default(), + placement_policy: Some(PlacementPolicy::Attached(1)), + config: TenantConfig::default(), + }) + .await?; + } + Command::TenantDelete { tenant_id } => { + let status = vps_client + .tenant_delete(TenantShardId::unsharded(tenant_id)) + .await?; + tracing::info!("Delete status: {}", status); + } + Command::Nodes {} => { + let resp = storcon_client + .dispatch::<(), Vec>( + Method::GET, + "control/v1/node".to_string(), + None, + ) + .await?; + let mut table = comfy_table::Table::new(); + table.set_header(["Id", "Hostname", "Scheduling", "Availability"]); + for node in resp { + table.add_row([ + format!("{}", node.id), + node.listen_http_addr, + format!("{:?}", node.scheduling), + format!("{:?}", node.availability), + ]); + } + println!("{table}"); + } + Command::NodeConfigure { + node_id, + availability, + scheduling, + } => { + let req = NodeConfigureRequest { + node_id, + availability: availability.map(|a| a.0), + scheduling, + }; + storcon_client + .dispatch::<_, ()>( + Method::PUT, + format!("control/v1/node/{node_id}/config"), + Some(req), + ) + .await?; + } + Command::Tenants {} => { + let resp = storcon_client + .dispatch::<(), Vec>( + Method::GET, + "control/v1/tenant".to_string(), + None, + ) + .await?; + let mut table = comfy_table::Table::new(); + table.set_header([ + "TenantId", + "ShardCount", + "StripeSize", + "Placement", + "Scheduling", + ]); + for tenant in resp { + let shard_zero = tenant.shards.into_iter().next().unwrap(); + table.add_row([ + format!("{}", tenant.tenant_id), + format!("{}", shard_zero.tenant_shard_id.shard_count.literal()), + format!("{:?}", tenant.stripe_size), + format!("{:?}", tenant.policy), + format!("{:?}", shard_zero.scheduling_policy), + ]); + } + + println!("{table}"); + } + Command::TenantPolicy { + tenant_id, + placement, + scheduling, + } => { + let req = TenantPolicyRequest { + scheduling: scheduling.map(|s| s.0), + placement: placement.map(|p| p.0), + }; + storcon_client + .dispatch::<_, ()>( + Method::PUT, + 
format!("control/v1/tenant/{tenant_id}/policy"), + Some(req), + ) + .await?; + } + Command::TenantShardSplit { + tenant_id, + shard_count, + stripe_size, + } => { + let req = TenantShardSplitRequest { + new_shard_count: shard_count, + new_stripe_size: stripe_size.map(ShardStripeSize), + }; + + let response = storcon_client + .dispatch::( + Method::PUT, + format!("control/v1/tenant/{tenant_id}/shard_split"), + Some(req), + ) + .await?; + println!( + "Split tenant {} into {} shards: {}", + tenant_id, + shard_count, + response + .new_shards + .iter() + .map(|s| format!("{:?}", s)) + .collect::>() + .join(",") + ); + } + Command::TenantShardMigrate { + tenant_shard_id, + node, + } => { + let req = TenantShardMigrateRequest { + tenant_shard_id, + node_id: node, + }; + + storcon_client + .dispatch::( + Method::PUT, + format!("control/v1/tenant/{tenant_shard_id}/migrate"), + Some(req), + ) + .await?; + } + Command::TenantConfig { tenant_id, config } => { + let tenant_conf = serde_json::from_str(&config)?; + + vps_client + .tenant_config(&TenantConfigRequest { + tenant_id, + config: tenant_conf, + }) + .await?; + } + Command::TenantScatter { tenant_id } => { + // Find the shards + let locate_response = storcon_client + .dispatch::<(), TenantLocateResponse>( + Method::GET, + format!("control/v1/tenant/{tenant_id}/locate"), + None, + ) + .await?; + let shards = locate_response.shards; + + let mut node_to_shards: HashMap> = HashMap::new(); + let shard_count = shards.len(); + for s in shards { + let entry = node_to_shards.entry(s.node_id).or_default(); + entry.push(s.shard_id); + } + + // Load list of available nodes + let nodes_resp = storcon_client + .dispatch::<(), Vec>( + Method::GET, + "control/v1/node".to_string(), + None, + ) + .await?; + + for node in nodes_resp { + if matches!(node.availability, NodeAvailabilityWrapper::Active) { + node_to_shards.entry(node.id).or_default(); + } + } + + let max_shard_per_node = shard_count / node_to_shards.len(); + + loop { + let mut migrate_shard = None; + for shards in node_to_shards.values_mut() { + if shards.len() > max_shard_per_node { + // Pick the emptiest + migrate_shard = Some(shards.pop().unwrap()); + } + } + let Some(migrate_shard) = migrate_shard else { + break; + }; + + // Pick the emptiest node to migrate to + let mut destinations = node_to_shards + .iter() + .map(|(k, v)| (k, v.len())) + .collect::>(); + destinations.sort_by_key(|i| i.1); + let (destination_node, destination_count) = *destinations.first().unwrap(); + if destination_count + 1 > max_shard_per_node { + // Even the emptiest destination doesn't have space: we're done + break; + } + let destination_node = *destination_node; + + node_to_shards + .get_mut(&destination_node) + .unwrap() + .push(migrate_shard); + + println!("Migrate {} -> {} ...", migrate_shard, destination_node); + + storcon_client + .dispatch::( + Method::PUT, + format!("control/v1/tenant/{migrate_shard}/migrate"), + Some(TenantShardMigrateRequest { + tenant_shard_id: migrate_shard, + node_id: destination_node, + }), + ) + .await?; + println!("Migrate {} -> {} OK", migrate_shard, destination_node); + } + + // Spread the shards across the nodes + } + Command::TenantDescribe { tenant_id } => { + let describe_response = storcon_client + .dispatch::<(), TenantDescribeResponse>( + Method::GET, + format!("control/v1/tenant/{tenant_id}"), + None, + ) + .await?; + let shards = describe_response.shards; + let mut table = comfy_table::Table::new(); + table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]); + 
for shard in shards { + let secondary = shard + .node_secondary + .iter() + .map(|n| format!("{}", n)) + .collect::>() + .join(","); + + let mut status_parts = Vec::new(); + if shard.is_reconciling { + status_parts.push("reconciling"); + } + + if shard.is_pending_compute_notification { + status_parts.push("pending_compute"); + } + + if shard.is_splitting { + status_parts.push("splitting"); + } + let status = status_parts.join(","); + + table.add_row([ + format!("{}", shard.tenant_shard_id), + shard + .node_attached + .map(|n| format!("{}", n)) + .unwrap_or(String::new()), + secondary, + shard.last_error, + status, + ]); + } + println!("{table}"); + } + } + + Ok(()) +} diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index dcf9e38106..be24d452b6 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -4,7 +4,7 @@ use std::str::FromStr; /// API (`/control/v1` prefix). Implemented by the server /// in [`attachment_service::http`] use serde::{Deserialize, Serialize}; -use utils::id::NodeId; +use utils::id::{NodeId, TenantId}; use crate::{ models::{ShardParameters, TenantConfig}, @@ -68,12 +68,27 @@ pub struct TenantLocateResponse { #[derive(Serialize, Deserialize)] pub struct TenantDescribeResponse { + pub tenant_id: TenantId, pub shards: Vec, pub stripe_size: ShardStripeSize, pub policy: PlacementPolicy, pub config: TenantConfig, } +#[derive(Serialize, Deserialize)] +pub struct NodeDescribeResponse { + pub id: NodeId, + + pub availability: NodeAvailabilityWrapper, + pub scheduling: NodeSchedulingPolicy, + + pub listen_http_addr: String, + pub listen_http_port: u16, + + pub listen_pg_addr: String, + pub listen_pg_port: u16, +} + #[derive(Serialize, Deserialize)] pub struct TenantDescribeResponseShard { pub tenant_shard_id: TenantShardId, @@ -89,6 +104,8 @@ pub struct TenantDescribeResponseShard { pub is_pending_compute_notification: bool, /// A shard split is currently underway pub is_splitting: bool, + + pub scheduling_policy: ShardSchedulingPolicy, } /// Explicitly migrating a particular shard is a low level operation @@ -103,7 +120,7 @@ pub struct TenantShardMigrateRequest { /// Utilisation score indicating how good a candidate a pageserver /// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`]. /// Lower values are better. -#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord)] +#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Debug)] pub struct UtilizationScore(pub u64); impl UtilizationScore { @@ -112,7 +129,7 @@ impl UtilizationScore { } } -#[derive(Serialize, Clone, Copy)] +#[derive(Serialize, Deserialize, Clone, Copy, Debug)] #[serde(into = "NodeAvailabilityWrapper")] pub enum NodeAvailability { // Normal, happy state @@ -135,7 +152,7 @@ impl Eq for NodeAvailability {} // This wrapper provides serde functionality and it should only be used to // communicate with external callers which don't know or care about the // utilisation score of the pageserver it is targeting. -#[derive(Serialize, Deserialize, Clone)] +#[derive(Serialize, Deserialize, Clone, Copy, Debug)] pub enum NodeAvailabilityWrapper { Active, Offline, @@ -161,21 +178,6 @@ impl From for NodeAvailabilityWrapper { } } -impl FromStr for NodeAvailability { - type Err = anyhow::Error; - - fn from_str(s: &str) -> Result { - match s { - // This is used when parsing node configuration requests from neon-local. 
- // Assume the worst possible utilisation score - // and let it get updated via the heartbeats. - "active" => Ok(Self::Active(UtilizationScore::worst())), - "offline" => Ok(Self::Offline), - _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")), - } - } -} - #[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)] pub enum ShardSchedulingPolicy { // Normal mode: the tenant's scheduled locations may be updated at will, including @@ -202,7 +204,7 @@ impl Default for ShardSchedulingPolicy { } } -#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)] +#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)] pub enum NodeSchedulingPolicy { Active, Filling, diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py index 5a86e03d2b..7df0b58596 100644 --- a/test_runner/regress/test_sharding_service.py +++ b/test_runner/regress/test_sharding_service.py @@ -1,3 +1,4 @@ +import json import time from collections import defaultdict from datetime import datetime, timezone @@ -24,7 +25,7 @@ from fixtures.pageserver.utils import ( from fixtures.pg_version import PgVersion from fixtures.remote_storage import RemoteStorageKind, s3_storage from fixtures.types import TenantId, TenantShardId, TimelineId -from fixtures.utils import run_pg_bench_small, wait_until +from fixtures.utils import run_pg_bench_small, subprocess_capture, wait_until from mypy_boto3_s3.type_defs import ( ObjectTypeDef, ) @@ -1131,3 +1132,89 @@ def test_storage_controller_shard_scheduling_policy(neon_env_builder: NeonEnvBui # And indeed the tenant should be attached assert len(env.pageserver.http_client().tenant_list_locations()["tenant_shards"]) == 1 + + +def test_storcon_cli(neon_env_builder: NeonEnvBuilder): + """ + The storage controller command line interface (storcon-cli) is an internal tool. Most tests + just use the APIs directly: this test exercises some basics of the CLI as a regression test + that the client remains usable as the server evolves. 
+ """ + output_dir = neon_env_builder.test_output_dir + shard_count = 4 + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + base_args = [env.neon_binpath / "storcon_cli", "--api", env.storage_controller_api] + + def storcon_cli(args): + """ + CLI wrapper: returns stdout split into a list of non-empty strings + """ + (output_path, stdout, status_code) = subprocess_capture( + output_dir, + [str(s) for s in base_args + args], + echo_stderr=True, + echo_stdout=True, + env={}, + check=False, + capture_stdout=True, + timeout=10, + ) + if status_code: + log.warning(f"Command {args} failed") + log.warning(f"Output at: {output_path}") + + raise RuntimeError("CLI failure (check logs for stderr)") + + assert stdout is not None + return [line.strip() for line in stdout.split("\n") if line.strip()] + + # List nodes + node_lines = storcon_cli(["nodes"]) + # Table header, footer, and one line of data + assert len(node_lines) == 5 + assert "localhost" in node_lines[3] + + # Pause scheduling onto a node + storcon_cli(["node-configure", "--node-id", "1", "--scheduling", "pause"]) + assert "Pause" in storcon_cli(["nodes"])[3] + + # Make a node offline + storcon_cli(["node-configure", "--node-id", "1", "--availability", "offline"]) + assert "Offline" in storcon_cli(["nodes"])[3] + + # List tenants + tenant_lines = storcon_cli(["tenants"]) + assert len(tenant_lines) == 5 + assert str(env.initial_tenant) in tenant_lines[3] + + env.storage_controller.allowed_errors.append(".*Scheduling is disabled by policy.*") + + # Describe a tenant + tenant_lines = storcon_cli(["tenant-describe", "--tenant-id", str(env.initial_tenant)]) + assert len(tenant_lines) == 3 + shard_count * 2 + assert str(env.initial_tenant) in tenant_lines[3] + + # Pause changes on a tenant + storcon_cli(["tenant-policy", "--tenant-id", str(env.initial_tenant), "--scheduling", "stop"]) + assert "Stop" in storcon_cli(["tenants"])[3] + + # Change a tenant's placement + storcon_cli( + ["tenant-policy", "--tenant-id", str(env.initial_tenant), "--placement", "secondary"] + ) + assert "Secondary" in storcon_cli(["tenants"])[3] + + # Modify a tenant's config + storcon_cli( + [ + "tenant-config", + "--tenant-id", + str(env.initial_tenant), + "--config", + json.dumps({"pitr_interval": "1m"}), + ] + ) + + # Quiesce any background reconciliation before doing consistency check + env.storage_controller.reconcile_until_idle(timeout_secs=10) + env.storage_controller.consistency_check() From d8da51e78a5664da12e794e7af22b3bb5930cb77 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 3 Apr 2024 11:23:26 +0100 Subject: [PATCH 0506/1571] remove http timeout (#7291) ## Problem https://github.com/neondatabase/cloud/issues/11051 additionally, I felt like the http logic was a bit complex. ## Summary of changes 1. Removes timeout for HTTP requests. 2. Split out header parsing to a `HttpHeaders` type. 3. Moved db client handling to `QueryData::process` and `BatchQueryData::process` to simplify the logic of `handle_inner` a bit. 
--- proxy/src/metrics.rs | 13 +- proxy/src/serverless/sql_over_http.rs | 372 +++++++++++++++----------- test_runner/regress/test_proxy.py | 32 --- 3 files changed, 217 insertions(+), 200 deletions(-) diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 4172dc19da..9da1fdc02f 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -117,12 +117,15 @@ pub static ALLOWED_IPS_NUMBER: Lazy = Lazy::new(|| { .unwrap() }); -pub static HTTP_CONTENT_LENGTH: Lazy = Lazy::new(|| { - register_histogram!( +pub static HTTP_CONTENT_LENGTH: Lazy = Lazy::new(|| { + register_histogram_vec!( "proxy_http_conn_content_length_bytes", - "Time it took for proxy to establish a connection to the compute endpoint", - // largest bucket = 3^16 * 0.05ms = 2.15s - exponential_buckets(8.0, 2.0, 20).unwrap() + "Number of bytes the HTTP response content consumes", + // request/response + &["direction"], + // smallest bucket = 16 bytes + // largest bucket = 4^12 * 16 bytes = 256MB + exponential_buckets(16.0, 4.0, 12).unwrap() ) .unwrap() }); diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index d5f2fea487..00dffd5784 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -42,6 +42,7 @@ use crate::error::ReportableError; use crate::error::UserFacingError; use crate::metrics::HTTP_CONTENT_LENGTH; use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE; +use crate::proxy::run_until_cancelled; use crate::proxy::NeonOptions; use crate::serverless::backend::HttpConnError; use crate::usage_metrics::MetricCounterRecorder; @@ -49,6 +50,7 @@ use crate::DbName; use crate::RoleName; use super::backend::PoolingBackend; +use super::conn_pool::Client; use super::conn_pool::ConnInfo; use super::json::json_to_pg_text; use super::json::pg_text_row_to_json; @@ -220,14 +222,7 @@ pub async fn handle( backend: Arc, cancel: CancellationToken, ) -> Result, ApiError> { - let cancel2 = cancel.clone(); - let handle = tokio::spawn(async move { - time::sleep(config.http_config.request_timeout).await; - cancel2.cancel(); - }); - let result = handle_inner(cancel, config, &mut ctx, request, backend).await; - handle.abort(); let mut response = match result { Ok(r) => { @@ -238,10 +233,7 @@ pub async fn handle( let error_kind = e.get_error_kind(); ctx.set_error_kind(error_kind); - let message = format!( - "Query cancelled, runtime exceeded. SQL queries over HTTP must not exceed {} seconds of runtime. Please consider using our websocket based connections", - config.http_config.request_timeout.as_secs_f64() - ); + let message = "Query cancelled, connection was terminated"; tracing::info!( kind=error_kind.to_metric_label(), @@ -435,6 +427,63 @@ impl ReportableError for SqlOverHttpCancel { } } +#[derive(Clone, Copy, Debug)] +struct HttpHeaders { + raw_output: bool, + default_array_mode: bool, + txn_isolation_level: Option, + txn_read_only: bool, + txn_deferrable: bool, +} + +impl HttpHeaders { + fn try_parse(headers: &hyper::http::HeaderMap) -> Result { + // Determine the output options. Default behaviour is 'false'. Anything that is not + // strictly 'true' assumed to be false. 
+ let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE); + let default_array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE); + + // isolation level, read only and deferrable + let txn_isolation_level = match headers.get(&TXN_ISOLATION_LEVEL) { + Some(x) => Some( + map_header_to_isolation_level(x).ok_or(SqlOverHttpError::InvalidIsolationLevel)?, + ), + None => None, + }; + + let txn_read_only = headers.get(&TXN_READ_ONLY) == Some(&HEADER_VALUE_TRUE); + let txn_deferrable = headers.get(&TXN_DEFERRABLE) == Some(&HEADER_VALUE_TRUE); + + Ok(Self { + raw_output, + default_array_mode, + txn_isolation_level, + txn_read_only, + txn_deferrable, + }) + } +} + +fn map_header_to_isolation_level(level: &HeaderValue) -> Option { + match level.as_bytes() { + b"Serializable" => Some(IsolationLevel::Serializable), + b"ReadUncommitted" => Some(IsolationLevel::ReadUncommitted), + b"ReadCommitted" => Some(IsolationLevel::ReadCommitted), + b"RepeatableRead" => Some(IsolationLevel::RepeatableRead), + _ => None, + } +} + +fn map_isolation_level_to_headers(level: IsolationLevel) -> Option { + match level { + IsolationLevel::ReadUncommitted => Some(HeaderValue::from_static("ReadUncommitted")), + IsolationLevel::ReadCommitted => Some(HeaderValue::from_static("ReadCommitted")), + IsolationLevel::RepeatableRead => Some(HeaderValue::from_static("RepeatableRead")), + IsolationLevel::Serializable => Some(HeaderValue::from_static("Serializable")), + _ => None, + } +} + async fn handle_inner( cancel: CancellationToken, config: &'static ProxyConfig, @@ -451,43 +500,26 @@ async fn handle_inner( // Determine the destination and connection params // let headers = request.headers(); + // TLS config should be there. let conn_info = get_conn_info(ctx, headers, config.tls_config.as_ref().unwrap())?; info!(user = conn_info.user_info.user.as_str(), "credentials"); - // Determine the output options. Default behaviour is 'false'. Anything that is not - // strictly 'true' assumed to be false. 
- let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE); - let default_array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE); - // Allow connection pooling only if explicitly requested // or if we have decided that http pool is no longer opt-in let allow_pool = !config.http_config.pool_options.opt_in || headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE); - // isolation level, read only and deferrable - - let txn_isolation_level_raw = headers.get(&TXN_ISOLATION_LEVEL).cloned(); - let txn_isolation_level = match txn_isolation_level_raw { - Some(ref x) => Some(match x.as_bytes() { - b"Serializable" => IsolationLevel::Serializable, - b"ReadUncommitted" => IsolationLevel::ReadUncommitted, - b"ReadCommitted" => IsolationLevel::ReadCommitted, - b"RepeatableRead" => IsolationLevel::RepeatableRead, - _ => return Err(SqlOverHttpError::InvalidIsolationLevel), - }), - None => None, - }; - - let txn_read_only = headers.get(&TXN_READ_ONLY) == Some(&HEADER_VALUE_TRUE); - let txn_deferrable = headers.get(&TXN_DEFERRABLE) == Some(&HEADER_VALUE_TRUE); + let parsed_headers = HttpHeaders::try_parse(headers)?; let request_content_length = match request.body().size_hint().upper() { Some(v) => v, None => MAX_REQUEST_SIZE + 1, }; info!(request_content_length, "request size in bytes"); - HTTP_CONTENT_LENGTH.observe(request_content_length as f64); + HTTP_CONTENT_LENGTH + .with_label_values(&["request"]) + .observe(request_content_length as f64); // we don't have a streaming request support yet so this is to prevent OOM // from a malicious user sending an extremely large request body @@ -515,20 +547,18 @@ async fn handle_inner( } .map_err(SqlOverHttpError::from); - // Run both operations in parallel - let (payload, mut client) = match select( + let (payload, mut client) = match run_until_cancelled( + // Run both operations in parallel try_join( pin!(fetch_and_process_request), pin!(authenticate_and_connect), ), - pin!(cancel.cancelled()), + &cancel, ) .await { - Either::Left((result, _cancelled)) => result?, - Either::Right((_cancelled, _)) => { - return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Connect)) - } + Some(result) => result?, + None => return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Connect)), }; let mut response = Response::builder() @@ -538,95 +568,143 @@ async fn handle_inner( // // Now execute the query and return the result // - let mut size = 0; let result = match payload { - Payload::Single(stmt) => { - let mut size = 0; - let (inner, mut discard) = client.inner(); - let cancel_token = inner.cancel_token(); - let query = pin!(query_to_json( - &*inner, - stmt, - &mut size, - raw_output, - default_array_mode - )); - let cancelled = pin!(cancel.cancelled()); - let res = select(query, cancelled).await; - match res { - Either::Left((Ok((status, results)), _cancelled)) => { - discard.check_idle(status); - results - } - Either::Left((Err(e), _cancelled)) => { - discard.discard(); - return Err(e); - } - Either::Right((_cancelled, query)) => { - if let Err(err) = cancel_token.cancel_query(NoTls).await { - tracing::error!(?err, "could not cancel query"); - } - match time::timeout(time::Duration::from_millis(100), query).await { - Ok(Ok((status, results))) => { - discard.check_idle(status); - results - } - Ok(Err(error)) => { - let db_error = match &error { - SqlOverHttpError::ConnectCompute( - HttpConnError::ConnectionError(e), - ) - | SqlOverHttpError::Postgres(e) => e.as_db_error(), - _ => None, - }; - - // if errored for some other reason, it might not 
be safe to return - if !db_error.is_some_and(|e| *e.code() == SqlState::QUERY_CANCELED) { - discard.discard(); - } - - return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)); - } - Err(_timeout) => { - discard.discard(); - return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)); - } - } - } - } - } + Payload::Single(stmt) => stmt.process(cancel, &mut client, parsed_headers).await?, Payload::Batch(statements) => { - info!("starting transaction"); - let (inner, mut discard) = client.inner(); - let cancel_token = inner.cancel_token(); - let mut builder = inner.build_transaction(); - if let Some(isolation_level) = txn_isolation_level { - builder = builder.isolation_level(isolation_level); + if parsed_headers.txn_read_only { + response = response.header(TXN_READ_ONLY.clone(), &HEADER_VALUE_TRUE); } - if txn_read_only { - builder = builder.read_only(true); + if parsed_headers.txn_deferrable { + response = response.header(TXN_DEFERRABLE.clone(), &HEADER_VALUE_TRUE); } - if txn_deferrable { - builder = builder.deferrable(true); - } - - let transaction = builder.start().await.map_err(|e| { - // if we cannot start a transaction, we should return immediately - // and not return to the pool. connection is clearly broken - discard.discard(); - e - })?; - - let results = match query_batch( - cancel.child_token(), - &transaction, - statements, - &mut size, - raw_output, - default_array_mode, - ) - .await + if let Some(txn_isolation_level) = parsed_headers + .txn_isolation_level + .and_then(map_isolation_level_to_headers) { + response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level); + } + + statements + .process(cancel, &mut client, parsed_headers) + .await? + } + }; + + let metrics = client.metrics(); + + // how could this possibly fail + let body = serde_json::to_string(&result).expect("json serialization should not fail"); + let len = body.len(); + let response = response + .body(Body::from(body)) + // only fails if invalid status code or invalid header/values are given. + // these are not user configurable so it cannot fail dynamically + .expect("building response payload should not fail"); + + // count the egress bytes - we miss the TLS and header overhead but oh well... + // moving this later in the stack is going to be a lot of effort and ehhhh + metrics.record_egress(len as u64); + HTTP_CONTENT_LENGTH + .with_label_values(&["response"]) + .observe(len as f64); + + Ok(response) +} + +impl QueryData { + async fn process( + self, + cancel: CancellationToken, + client: &mut Client, + parsed_headers: HttpHeaders, + ) -> Result { + let (inner, mut discard) = client.inner(); + let cancel_token = inner.cancel_token(); + + let res = match select( + pin!(query_to_json(&*inner, self, &mut 0, parsed_headers)), + pin!(cancel.cancelled()), + ) + .await + { + // The query successfully completed. + Either::Left((Ok((status, results)), __not_yet_cancelled)) => { + discard.check_idle(status); + Ok(results) + } + // The query failed with an error + Either::Left((Err(e), __not_yet_cancelled)) => { + discard.discard(); + return Err(e); + } + // The query was cancelled. + Either::Right((_cancelled, query)) => { + if let Err(err) = cancel_token.cancel_query(NoTls).await { + tracing::error!(?err, "could not cancel query"); + } + // wait for the query cancellation + match time::timeout(time::Duration::from_millis(100), query).await { + // query successed before it was cancelled. 
+ Ok(Ok((status, results))) => { + discard.check_idle(status); + Ok(results) + } + // query failed or was cancelled. + Ok(Err(error)) => { + let db_error = match &error { + SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(e)) + | SqlOverHttpError::Postgres(e) => e.as_db_error(), + _ => None, + }; + + // if errored for some other reason, it might not be safe to return + if !db_error.is_some_and(|e| *e.code() == SqlState::QUERY_CANCELED) { + discard.discard(); + } + + Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)) + } + Err(_timeout) => { + discard.discard(); + Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)) + } + } + } + }; + res + } +} + +impl BatchQueryData { + async fn process( + self, + cancel: CancellationToken, + client: &mut Client, + parsed_headers: HttpHeaders, + ) -> Result { + info!("starting transaction"); + let (inner, mut discard) = client.inner(); + let cancel_token = inner.cancel_token(); + let mut builder = inner.build_transaction(); + if let Some(isolation_level) = parsed_headers.txn_isolation_level { + builder = builder.isolation_level(isolation_level); + } + if parsed_headers.txn_read_only { + builder = builder.read_only(true); + } + if parsed_headers.txn_deferrable { + builder = builder.deferrable(true); + } + + let transaction = builder.start().await.map_err(|e| { + // if we cannot start a transaction, we should return immediately + // and not return to the pool. connection is clearly broken + discard.discard(); + e + })?; + + let results = + match query_batch(cancel.child_token(), &transaction, self, parsed_headers).await { Ok(results) => { info!("commit"); let status = transaction.commit().await.map_err(|e| { @@ -660,44 +738,15 @@ async fn handle_inner( } }; - if txn_read_only { - response = response.header(TXN_READ_ONLY.clone(), &HEADER_VALUE_TRUE); - } - if txn_deferrable { - response = response.header(TXN_DEFERRABLE.clone(), &HEADER_VALUE_TRUE); - } - if let Some(txn_isolation_level) = txn_isolation_level_raw { - response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level); - } - json!({ "results": results }) - } - }; - - let metrics = client.metrics(); - - // how could this possibly fail - let body = serde_json::to_string(&result).expect("json serialization should not fail"); - let len = body.len(); - let response = response - .body(Body::from(body)) - // only fails if invalid status code or invalid header/values are given. - // these are not user configurable so it cannot fail dynamically - .expect("building response payload should not fail"); - - // count the egress bytes - we miss the TLS and header overhead but oh well... 
- // moving this later in the stack is going to be a lot of effort and ehhhh - metrics.record_egress(len as u64); - - Ok(response) + Ok(json!({ "results": results })) + } } async fn query_batch( cancel: CancellationToken, transaction: &Transaction<'_>, queries: BatchQueryData, - total_size: &mut usize, - raw_output: bool, - array_mode: bool, + parsed_headers: HttpHeaders, ) -> Result, SqlOverHttpError> { let mut results = Vec::with_capacity(queries.queries.len()); let mut current_size = 0; @@ -706,8 +755,7 @@ async fn query_batch( transaction, stmt, &mut current_size, - raw_output, - array_mode + parsed_headers, )); let cancelled = pin!(cancel.cancelled()); let res = select(query, cancelled).await; @@ -724,7 +772,6 @@ async fn query_batch( } } } - *total_size += current_size; Ok(results) } @@ -732,8 +779,7 @@ async fn query_to_json( client: &T, data: QueryData, current_size: &mut usize, - raw_output: bool, - default_array_mode: bool, + parsed_headers: HttpHeaders, ) -> Result<(ReadyForQueryStatus, Value), SqlOverHttpError> { info!("executing query"); let query_params = data.params; @@ -793,12 +839,12 @@ async fn query_to_json( columns.push(client.get_type(c.type_oid()).await?); } - let array_mode = data.array_mode.unwrap_or(default_array_mode); + let array_mode = data.array_mode.unwrap_or(parsed_headers.default_array_mode); // convert rows to JSON let rows = rows .iter() - .map(|row| pg_text_row_to_json(row, &columns, raw_output, array_mode)) + .map(|row| pg_text_row_to_json(row, &columns, parsed_headers.raw_output, array_mode)) .collect::, _>>()?; // resulting JSON format is based on the format of node-postgres result diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index 3e986a8f7b..f446f4f200 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -566,38 +566,6 @@ async def test_sql_over_http2(static_proxy: NeonProxy): assert resp["rows"] == [{"answer": 42}] -def test_sql_over_http_timeout_cancel(static_proxy: NeonProxy): - static_proxy.safe_psql("create role http with login password 'http' superuser") - - static_proxy.safe_psql("create table test_table ( id int primary key )") - - # insert into a table, with a unique constraint, after sleeping for n seconds - query = "WITH temp AS ( \ - SELECT pg_sleep($1) as sleep, $2::int as id \ - ) INSERT INTO test_table (id) SELECT id FROM temp" - - # expect to fail with timeout - res = static_proxy.http_query( - query, - [static_proxy.http_timeout_seconds + 1, 1], - user="http", - password="http", - expected_code=400, - ) - assert "Query cancelled, runtime exceeded" in res["message"], "HTTP query should time out" - - time.sleep(2) - - res = static_proxy.http_query(query, [1, 1], user="http", password="http", expected_code=200) - assert res["command"] == "INSERT", "HTTP query should insert" - assert res["rowCount"] == 1, "HTTP query should insert" - - res = static_proxy.http_query(query, [0, 1], user="http", password="http", expected_code=400) - assert ( - "duplicate key value violates unique constraint" in res["message"] - ), "HTTP query should conflict" - - def test_sql_over_http_connection_cancel(static_proxy: NeonProxy): static_proxy.safe_psql("create role http with login password 'http' superuser") From bc05d7eb9c0dd228e34477b5916ce43680eeecb3 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 3 Apr 2024 11:23:44 +0100 Subject: [PATCH 0507/1571] pageserver: even more debug for test_secondary_downloads (#7295) The latest failures of test_secondary_downloads are spooky: 
layers are missing on disk according to the test, but present according to the pageserver logs: - Make the pageserver assert that layers are really present on disk and log the full path (debug mode only) - Make the test dump a full listing on failure of the assert that failed the last two times Related: #6966 --- pageserver/src/tenant/secondary/downloader.rs | 29 +++++++++++++++++++ .../regress/test_pageserver_secondary.py | 16 ++++++++-- 2 files changed, 42 insertions(+), 3 deletions(-) diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 8782a9f04e..530e1a3244 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -786,6 +786,35 @@ impl<'a> TenantDownloader<'a> { // Existing on-disk layers: just update their access time. if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) { tracing::debug!("Layer {} is already on disk", layer.name); + + if cfg!(debug_assertions) { + // Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think + // are already present on disk are really there. + let local_path = self + .conf + .timeline_path(tenant_shard_id, &timeline.timeline_id) + .join(layer.name.file_name()); + match tokio::fs::metadata(&local_path).await { + Ok(meta) => { + tracing::debug!( + "Layer {} present at {}, size {}", + layer.name, + local_path, + meta.len(), + ); + } + Err(e) => { + tracing::warn!( + "Layer {} not found at {} ({})", + layer.name, + local_path, + e + ); + debug_assert!(false); + } + } + } + if on_disk.metadata != LayerFileMetadata::from(&layer.metadata) || on_disk.access_time != layer.access_time { diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index ca6f77c75f..345abdc072 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -498,9 +498,19 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ps_secondary.http_client().tenant_secondary_download(tenant_id) - assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( - ps_secondary, tenant_id, timeline_id - ) + try: + assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( + ps_secondary, tenant_id, timeline_id + ) + except: + # Do a full listing of the secondary location on errors, to help debug of + # https://github.com/neondatabase/neon/issues/6966 + timeline_path = ps_secondary.timeline_dir(tenant_id, timeline_id) + for path, _dirs, files in os.walk(timeline_path): + for f in files: + log.info(f"Secondary file: {os.path.join(path, f)}") + + raise # FIXME: this sleep is needed to avoid on-demand promotion of the layers we evict, while # walreceiver is still doing something. From 3de416a016a1fd34a3e49390ca0b8e2deed66665 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 3 Apr 2024 12:28:04 +0200 Subject: [PATCH 0508/1571] refactor(walreceiver): eliminate task_mgr usage (#7260) We want to move the code base away from task_mgr. This PR refactors the walreceiver code such that it doesn't use `task_mgr` anymore. # Background As a reminder, there are three tasks in a Timeline that's ingesting WAL. `WalReceiverManager`, `WalReceiverConnectionHandler`, and `WalReceiverConnectionPoller`. See the documentation in `task_mgr.rs` for how they interact. Before this PR, cancellation was requested through task_mgr::shutdown_token() and `TaskHandle::shutdown`. 
Wait-for-task-finish was implemented using a mixture of `task_mgr::shutdown_tasks` and `TaskHandle::shutdown`. This drawing might help: image # Changes For cancellation, the entire WalReceiver task tree now has a `child_token()` of `Timeline::cancel`. The `TaskHandle` no longer is a cancellation root. This means that `Timeline::cancel.cancel()` is propagated. For wait-for-task-finish, all three tasks in the task tree hold the `Timeline::gate` open until they exit. The downside of using the `Timeline::gate` is that we can no longer wait for just the walreceiver to shut down, which is particularly relevant for `Timeline::flush_and_shutdown`. Effectively, it means that we might ingest more WAL while the `freeze_and_flush()` call is ongoing. Also, drive-by-fix the assertiosn around task kinds in `wait_lsn`. The check for `WalReceiverConnectionHandler` was ineffective because that never was a task_mgr task, but a TaskHandle task. Refine the assertion to check whether we would wait, and only fail in that case. # Alternatives I contemplated (ab-)using the `Gate` by having a separate `Gate` for `struct WalReceiver`. All the child tasks would use _that_ gate instead of `Timeline::gate`. And `struct WalReceiver` itself would hold an `Option` of the `Timeline::gate`. Then we could have a `WalReceiver::stop` function that closes the WalReceiver's gate, then drops the `WalReceiver::Option`. However, such design would mean sharing the WalReceiver's `Gate` in an `Arc`, which seems awkward. A proper abstraction would be to make gates hierarchical, analogous to CancellationToken. In the end, @jcsp and I talked it over and we determined that it's not worth the effort at this time. # Refs part of #7062 --- libs/utils/src/seqwait.rs | 12 ++++ pageserver/src/page_service.rs | 24 ++++++- pageserver/src/task_mgr.rs | 8 +-- pageserver/src/tenant.rs | 2 +- pageserver/src/tenant/mgr.rs | 9 ++- pageserver/src/tenant/timeline.rs | 68 ++++++++++++------- pageserver/src/tenant/timeline/delete.rs | 18 +---- pageserver/src/tenant/timeline/walreceiver.rs | 53 +++++++-------- .../walreceiver/connection_manager.rs | 44 ++++++++++-- .../walreceiver/walreceiver_connection.rs | 34 ++++++---- 10 files changed, 174 insertions(+), 98 deletions(-) diff --git a/libs/utils/src/seqwait.rs b/libs/utils/src/seqwait.rs index b7301776eb..0544c5be03 100644 --- a/libs/utils/src/seqwait.rs +++ b/libs/utils/src/seqwait.rs @@ -182,6 +182,18 @@ where } } + /// Check if [`Self::wait_for`] or [`Self::wait_for_timeout`] would wait if called with `num`. + pub fn would_wait_for(&self, num: V) -> Result<(), V> { + let internal = self.internal.lock().unwrap(); + let cnt = internal.current.cnt_value(); + drop(internal); + if cnt >= num { + Ok(()) + } else { + Err(cnt) + } + } + /// Register and return a channel that will be notified when a number arrives, /// or None, if it has already arrived. fn queue_for_wait(&self, num: V) -> Result>, SeqWaitError> { diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 3d622f1871..3b9a30ba4c 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -876,7 +876,13 @@ impl PageServerHandler { if lsn <= last_record_lsn { lsn = last_record_lsn; } else { - timeline.wait_lsn(lsn, ctx).await?; + timeline + .wait_lsn( + lsn, + crate::tenant::timeline::WaitLsnWaiter::PageService, + ctx, + ) + .await?; // Since we waited for 'lsn' to arrive, that is now the last // record LSN. 
(Or close enough for our purposes; the // last-record LSN can advance immediately after we return @@ -888,7 +894,13 @@ impl PageServerHandler { "invalid LSN(0) in request".into(), )); } - timeline.wait_lsn(lsn, ctx).await?; + timeline + .wait_lsn( + lsn, + crate::tenant::timeline::WaitLsnWaiter::PageService, + ctx, + ) + .await?; } if lsn < **latest_gc_cutoff_lsn { @@ -1215,7 +1227,13 @@ impl PageServerHandler { if let Some(lsn) = lsn { // Backup was requested at a particular LSN. Wait for it to arrive. info!("waiting for {}", lsn); - timeline.wait_lsn(lsn, ctx).await?; + timeline + .wait_lsn( + lsn, + crate::tenant::timeline::WaitLsnWaiter::PageService, + ctx, + ) + .await?; timeline .check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn) .context("invalid basebackup lsn")?; diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 69e163effa..0cc5611a12 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -214,13 +214,12 @@ pub enum TaskKind { /// Internally, `Client` hands over requests to the `Connection` object. /// The `Connection` object is responsible for speaking the wire protocol. /// - /// Walreceiver uses its own abstraction called `TaskHandle` to represent the activity of establishing and handling a connection. - /// That abstraction doesn't use `task_mgr`. + /// Walreceiver uses a legacy abstraction called `TaskHandle` to represent the activity of establishing and handling a connection. /// The `WalReceiverManager` task ensures that this `TaskHandle` task does not outlive the `WalReceiverManager` task. /// For the `RequestContext` that we hand to the TaskHandle, we use the [`WalReceiverConnectionHandler`] task kind. /// - /// Once the connection is established, the `TaskHandle` task creates a - /// [`WalReceiverConnectionPoller`] task_mgr task that is responsible for polling + /// Once the connection is established, the `TaskHandle` task spawns a + /// [`WalReceiverConnectionPoller`] task that is responsible for polling /// the `Connection` object. /// A `CancellationToken` created by the `TaskHandle` task ensures /// that the [`WalReceiverConnectionPoller`] task will cancel soon after as the `TaskHandle` is dropped. @@ -230,7 +229,6 @@ pub enum TaskKind { WalReceiverManager, /// The `TaskHandle` task that executes `handle_walreceiver_connection`. - /// Not a `task_mgr` task, but we use this `TaskKind` for its `RequestContext`. /// See the comment on [`WalReceiverManager`]. /// /// [`WalReceiverManager`]: Self::WalReceiverManager diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 0806ef0cf4..1fb92a50fe 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1515,7 +1515,7 @@ impl Tenant { // sizes etc. and that would get confused if the previous page versions // are not in the repository yet. 
ancestor_timeline - .wait_lsn(*lsn, ctx) + .wait_lsn(*lsn, timeline::WaitLsnWaiter::Tenant, ctx) .await .map_err(|e| match e { e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => { diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index f01fb9791c..ab2ef4fa79 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1649,7 +1649,14 @@ impl TenantManager { fail::fail_point!("shard-split-lsn-wait", |_| Err(anyhow::anyhow!( "failpoint" ))); - if let Err(e) = timeline.wait_lsn(*target_lsn, ctx).await { + if let Err(e) = timeline + .wait_lsn( + *target_lsn, + crate::tenant::timeline::WaitLsnWaiter::Tenant, + ctx, + ) + .await + { // Failure here might mean shutdown, in any case this part is an optimization // and we shouldn't hold up the split operation. tracing::warn!( diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 8ee9b9dbd2..a801c64382 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -612,6 +612,12 @@ pub enum GetVectoredImpl { Vectored, } +pub(crate) enum WaitLsnWaiter<'a> { + Timeline(&'a Timeline), + Tenant, + PageService, +} + /// Public interface functions impl Timeline { /// Get the LSN where this branch was created @@ -1060,7 +1066,8 @@ impl Timeline { pub(crate) async fn wait_lsn( &self, lsn: Lsn, - _ctx: &RequestContext, /* Prepare for use by cancellation */ + who_is_waiting: WaitLsnWaiter<'_>, + ctx: &RequestContext, /* Prepare for use by cancellation */ ) -> Result<(), WaitLsnError> { if self.cancel.is_cancelled() { return Err(WaitLsnError::Shutdown); @@ -1068,20 +1075,28 @@ impl Timeline { return Err(WaitLsnError::BadState); } - // This should never be called from the WAL receiver, because that could lead - // to a deadlock. 
- debug_assert!( - task_mgr::current_task_kind() != Some(TaskKind::WalReceiverManager), - "wait_lsn cannot be called in WAL receiver" - ); - debug_assert!( - task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionHandler), - "wait_lsn cannot be called in WAL receiver" - ); - debug_assert!( - task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionPoller), - "wait_lsn cannot be called in WAL receiver" - ); + if cfg!(debug_assertions) { + match ctx.task_kind() { + TaskKind::WalReceiverManager + | TaskKind::WalReceiverConnectionHandler + | TaskKind::WalReceiverConnectionPoller => { + let is_myself = match who_is_waiting { + WaitLsnWaiter::Timeline(waiter) => Weak::ptr_eq(&waiter.myself, &self.myself), + WaitLsnWaiter::Tenant | WaitLsnWaiter::PageService => unreachable!("tenant or page_service context are not expected to have task kind {:?}", ctx.task_kind()), + }; + if is_myself { + if let Err(current) = self.last_record_lsn.would_wait_for(lsn) { + // walingest is the only one that can advance last_record_lsn; it should make sure to never reach here + panic!("this timeline's walingest task is calling wait_lsn({lsn}) but we only have last_record_lsn={current}; would deadlock"); + } + } else { + // if another timeline's is waiting for us, there's no deadlock risk because + // our walreceiver task can make progress independent of theirs + } + } + _ => {} + } + } let _timer = crate::metrics::WAIT_LSN_TIME.start_timer(); @@ -1297,15 +1312,18 @@ impl Timeline { pub(crate) async fn flush_and_shutdown(&self) { debug_assert_current_span_has_tenant_and_timeline_id(); - // Stop ingesting data, so that we are not still writing to an InMemoryLayer while - // trying to flush - tracing::debug!("Waiting for WalReceiverManager..."); - task_mgr::shutdown_tasks( - Some(TaskKind::WalReceiverManager), - Some(self.tenant_shard_id), - Some(self.timeline_id), - ) - .await; + // Stop ingesting data. Walreceiver only provides cancellation but no + // "wait until gone", because it uses the Timeline::gate. So, only + // after the self.gate.close() in self.shutdown() below will we know for + // sure that no walreceiver tasks are left. + // This means that we might still be ingesting data during the call to + // `self.freeze_and_flush()` below. That's not ideal, but, we don't have + // the concept of a ChildGuard, which is what we'd need to properly model + // early shutdown of the walreceiver task sub-tree before the other + // Timeline task sub-trees. 
+ if let Some(walreceiver) = self.walreceiver.lock().unwrap().take() { + walreceiver.cancel(); + } // Since we have shut down WAL ingest, we should not let anyone start waiting for the LSN to advance self.last_record_lsn.shutdown(); @@ -3054,7 +3072,7 @@ impl Timeline { } } ancestor - .wait_lsn(self.ancestor_lsn, ctx) + .wait_lsn(self.ancestor_lsn, WaitLsnWaiter::Timeline(self), ctx) .await .map_err(|e| match e { e @ WaitLsnError::Timeout(_) => GetReadyAncestorError::AncestorLsnTimeout(e), diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index ab0a88c764..c7f815d179 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -6,7 +6,7 @@ use std::{ use anyhow::Context; use pageserver_api::{models::TimelineState, shard::TenantShardId}; use tokio::sync::OwnedMutexGuard; -use tracing::{debug, error, info, instrument, Instrument}; +use tracing::{error, info, instrument, Instrument}; use utils::{crashsafe, fs_ext, id::TimelineId}; use crate::{ @@ -30,22 +30,6 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> { tracing::debug!("Cancelling CancellationToken"); timeline.cancel.cancel(); - // Stop the walreceiver first. - debug!("waiting for wal receiver to shutdown"); - let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() }; - if let Some(walreceiver) = maybe_started_walreceiver { - walreceiver.stop().await; - } - debug!("wal receiver shutdown confirmed"); - - // Shut down the layer flush task before the remote client, as one depends on the other - task_mgr::shutdown_tasks( - Some(TaskKind::LayerFlushTask), - Some(timeline.tenant_shard_id), - Some(timeline.timeline_id), - ) - .await; - // Prevent new uploads from starting. if let Some(remote_client) = timeline.remote_client.as_ref() { remote_client.stop(); diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index f1b62067f9..a085154a5a 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -24,13 +24,12 @@ mod connection_manager; mod walreceiver_connection; use crate::context::{DownloadBehavior, RequestContext}; -use crate::task_mgr::{self, TaskKind, WALRECEIVER_RUNTIME}; +use crate::task_mgr::{TaskKind, WALRECEIVER_RUNTIME}; use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::timeline::walreceiver::connection_manager::{ connection_manager_loop_step, ConnectionManagerState, }; -use pageserver_api::shard::TenantShardId; use std::future::Future; use std::num::NonZeroU64; use std::sync::Arc; @@ -40,8 +39,6 @@ use tokio::sync::watch; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::id::TimelineId; - use self::connection_manager::ConnectionManagerStatus; use super::Timeline; @@ -60,9 +57,10 @@ pub struct WalReceiverConf { } pub struct WalReceiver { - tenant_shard_id: TenantShardId, - timeline_id: TimelineId, manager_status: Arc>>, + /// All task spawned by [`WalReceiver::start`] and its children are sensitive to this token. + /// It's a child token of [`Timeline`] so that timeline shutdown can cancel WalReceiver tasks early for `freeze_and_flush=true`. 
+ cancel: CancellationToken, } impl WalReceiver { @@ -76,23 +74,23 @@ impl WalReceiver { let timeline_id = timeline.timeline_id; let walreceiver_ctx = ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error); - let loop_status = Arc::new(std::sync::RwLock::new(None)); let manager_status = Arc::clone(&loop_status); - task_mgr::spawn( - WALRECEIVER_RUNTIME.handle(), - TaskKind::WalReceiverManager, - Some(timeline.tenant_shard_id), - Some(timeline_id), - &format!("walreceiver for timeline {tenant_shard_id}/{timeline_id}"), - false, + let cancel = timeline.cancel.child_token(); + WALRECEIVER_RUNTIME.spawn({ + let cancel = cancel.clone(); async move { debug_assert_current_span_has_tenant_and_timeline_id(); + // acquire timeline gate so we know the task doesn't outlive the Timeline + let Ok(_guard) = timeline.gate.enter() else { + debug!("WAL receiver manager could not enter the gate timeline gate, it's closed already"); + return; + }; debug!("WAL receiver manager started, connecting to broker"); - let cancel = task_mgr::shutdown_token(); let mut connection_manager_state = ConnectionManagerState::new( timeline, conf, + cancel.clone(), ); while !cancel.is_cancelled() { let loop_step_result = connection_manager_loop_step( @@ -112,25 +110,22 @@ impl WalReceiver { } connection_manager_state.shutdown().await; *loop_status.write().unwrap() = None; - Ok(()) + debug!("task exits"); } .instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), timeline_id = %timeline_id)) - ); + }); Self { - tenant_shard_id, - timeline_id, manager_status, + cancel, } } - pub async fn stop(self) { - task_mgr::shutdown_tasks( - Some(TaskKind::WalReceiverManager), - Some(self.tenant_shard_id), - Some(self.timeline_id), - ) - .await; + #[instrument(skip_all, level = tracing::Level::DEBUG)] + pub fn cancel(&self) { + debug_assert_current_span_has_tenant_and_timeline_id(); + debug!("cancelling walreceiver tasks"); + self.cancel.cancel(); } pub(crate) fn status(&self) -> Option { @@ -164,14 +159,18 @@ enum TaskStateUpdate { impl TaskHandle { /// Initializes the task, starting it immediately after the creation. + /// + /// The second argument to `task` is a child token of `cancel_parent` ([`CancellationToken::child_token`]). + /// It being a child token enables us to provide a [`Self::shutdown`] method. fn spawn( + cancel_parent: &CancellationToken, task: impl FnOnce(watch::Sender>, CancellationToken) -> Fut + Send + 'static, ) -> Self where Fut: Future> + Send, E: Send + Sync + 'static, { - let cancellation = CancellationToken::new(); + let cancellation = cancel_parent.child_token(); let (events_sender, events_receiver) = watch::channel(TaskStateUpdate::Started); let cancellation_clone = cancellation.clone(); diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 030d24a017..dae31934ad 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -280,6 +280,8 @@ pub(super) struct ConnectionManagerState { id: TenantTimelineId, /// Use pageserver data about the timeline to filter out some of the safekeepers. timeline: Arc, + /// Child token of [`super::WalReceiver::cancel`], inherited to all tasks we spawn. + cancel: CancellationToken, conf: WalReceiverConf, /// Current connection to safekeeper for WAL streaming. 
wal_connection: Option, @@ -402,7 +404,11 @@ struct BrokerSkTimeline { } impl ConnectionManagerState { - pub(super) fn new(timeline: Arc, conf: WalReceiverConf) -> Self { + pub(super) fn new( + timeline: Arc, + conf: WalReceiverConf, + cancel: CancellationToken, + ) -> Self { let id = TenantTimelineId { tenant_id: timeline.tenant_shard_id.tenant_id, timeline_id: timeline.timeline_id, @@ -410,6 +416,7 @@ impl ConnectionManagerState { Self { id, timeline, + cancel, conf, wal_connection: None, wal_stream_candidates: HashMap::new(), @@ -417,6 +424,22 @@ impl ConnectionManagerState { } } + fn spawn( + &self, + task: impl FnOnce( + tokio::sync::watch::Sender>, + CancellationToken, + ) -> Fut + + Send + + 'static, + ) -> TaskHandle + where + Fut: std::future::Future> + Send, + { + // TODO: get rid of TaskHandle + super::TaskHandle::spawn(&self.cancel, task) + } + /// Shuts down the current connection (if any) and immediately starts another one with the given connection string. async fn change_connection(&mut self, new_sk: NewWalConnectionCandidate, ctx: &RequestContext) { WALRECEIVER_SWITCHES @@ -435,7 +458,7 @@ impl ConnectionManagerState { ); let span = info_span!("connection", %node_id); - let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| { + let connection_handle = self.spawn(move |events_sender, cancellation| { async move { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -463,6 +486,12 @@ impl ConnectionManagerState { info!("walreceiver connection handling ended: {e}"); Ok(()) } + WalReceiverError::ClosedGate => { + info!( + "walreceiver connection handling ended because of closed gate" + ); + Ok(()) + } WalReceiverError::Other(e) => { // give out an error to have task_mgr give it a really verbose logging if cancellation.is_cancelled() { @@ -1016,7 +1045,7 @@ mod tests { sk_id: connected_sk_id, availability_zone: None, status: connection_status, - connection_task: TaskHandle::spawn(move |sender, _| async move { + connection_task: state.spawn(move |sender, _| async move { sender .send(TaskStateUpdate::Progress(connection_status)) .ok(); @@ -1184,7 +1213,7 @@ mod tests { sk_id: connected_sk_id, availability_zone: None, status: connection_status, - connection_task: TaskHandle::spawn(move |sender, _| async move { + connection_task: state.spawn(move |sender, _| async move { sender .send(TaskStateUpdate::Progress(connection_status)) .ok(); @@ -1251,7 +1280,7 @@ mod tests { sk_id: NodeId(1), availability_zone: None, status: connection_status, - connection_task: TaskHandle::spawn(move |sender, _| async move { + connection_task: state.spawn(move |sender, _| async move { sender .send(TaskStateUpdate::Progress(connection_status)) .ok(); @@ -1315,7 +1344,7 @@ mod tests { sk_id: NodeId(1), availability_zone: None, status: connection_status, - connection_task: TaskHandle::spawn(move |_, _| async move { Ok(()) }), + connection_task: state.spawn(move |_, _| async move { Ok(()) }), discovered_new_wal: Some(NewCommittedWAL { discovered_at: time_over_threshold, lsn: new_lsn, @@ -1371,6 +1400,7 @@ mod tests { timeline_id: TIMELINE_ID, }, timeline, + cancel: CancellationToken::new(), conf: WalReceiverConf { wal_connect_timeout: Duration::from_secs(1), lagging_wal_timeout: Duration::from_secs(1), @@ -1414,7 +1444,7 @@ mod tests { sk_id: connected_sk_id, availability_zone: None, status: connection_status, - connection_task: TaskHandle::spawn(move |sender, _| async move { + connection_task: state.spawn(move |sender, _| async move { sender 
.send(TaskStateUpdate::Progress(connection_status)) .ok(); diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 00a9dbd760..a7cb19c2a0 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -27,7 +27,6 @@ use super::TaskStateUpdate; use crate::{ context::RequestContext, metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST}, - task_mgr, task_mgr::TaskKind, task_mgr::WALRECEIVER_RUNTIME, tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo}, @@ -37,8 +36,8 @@ use crate::{ use postgres_backend::is_expected_io_error; use postgres_connection::PgConnectionConfig; use postgres_ffi::waldecoder::WalStreamDecoder; -use utils::pageserver_feedback::PageserverFeedback; use utils::{id::NodeId, lsn::Lsn}; +use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError}; /// Status of the connection. #[derive(Debug, Clone, Copy)] @@ -68,6 +67,7 @@ pub(super) enum WalReceiverError { SuccessfulCompletion(String), /// Generic error Other(anyhow::Error), + ClosedGate, } impl From for WalReceiverError { @@ -119,6 +119,16 @@ pub(super) async fn handle_walreceiver_connection( ) -> Result<(), WalReceiverError> { debug_assert_current_span_has_tenant_and_timeline_id(); + // prevent timeline shutdown from finishing until we have exited + let _guard = timeline.gate.enter().map_err(|e| match e { + GateError::GateClosed => WalReceiverError::ClosedGate, + })?; + // This function spawns a side-car task (WalReceiverConnectionPoller). + // Get its gate guard now as well. + let poller_guard = timeline.gate.enter().map_err(|e| match e { + GateError::GateClosed => WalReceiverError::ClosedGate, + })?; + WALRECEIVER_STARTED_CONNECTIONS.inc(); // Connect to the database in replication mode. @@ -156,22 +166,19 @@ pub(super) async fn handle_walreceiver_connection( } // The connection object performs the actual communication with the database, - // so spawn it off to run on its own. + // so spawn it off to run on its own. It shouldn't outlive this function, but, + // due to lack of async drop, we can't enforce that. However, we ensure that + // 1. it is sensitive to `cancellation` and + // 2. holds the Timeline gate open so that after timeline shutdown, + // we know this task is gone. let _connection_ctx = ctx.detached_child( TaskKind::WalReceiverConnectionPoller, ctx.download_behavior(), ); let connection_cancellation = cancellation.clone(); - task_mgr::spawn( - WALRECEIVER_RUNTIME.handle(), - TaskKind::WalReceiverConnectionPoller, - Some(timeline.tenant_shard_id), - Some(timeline.timeline_id), - "walreceiver connection", - false, + WALRECEIVER_RUNTIME.spawn( async move { debug_assert_current_span_has_tenant_and_timeline_id(); - select! { connection_result = connection => match connection_result { Ok(()) => debug!("Walreceiver db connection closed"), @@ -182,6 +189,9 @@ pub(super) async fn handle_walreceiver_connection( // with a similar error. 
}, WalReceiverError::SuccessfulCompletion(_) => {} + WalReceiverError::ClosedGate => { + // doesn't happen at runtime + } WalReceiverError::Other(err) => { warn!("Connection aborted: {err:#}") } @@ -190,7 +200,7 @@ pub(super) async fn handle_walreceiver_connection( }, _ = connection_cancellation.cancelled() => debug!("Connection cancelled"), } - Ok(()) + drop(poller_guard); } // Enrich the log lines emitted by this closure with meaningful context. // TODO: technically, this task outlives the surrounding function, so, the From d443d07518cbce7a825c4663b43c896935c23a00 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 3 Apr 2024 13:30:14 +0300 Subject: [PATCH 0509/1571] wal_ingest: global counter for bytes received (#7240) Fixes #7102 by adding a metric for global total received WAL bytes: `pageserver_wal_ingest_bytes_received`. --- pageserver/src/metrics.rs | 6 ++++++ .../tenant/timeline/walreceiver/walreceiver_connection.rs | 1 + 2 files changed, 7 insertions(+) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index cc661194e9..ab9a2e8509 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1483,12 +1483,18 @@ pub(crate) static DELETION_QUEUE: Lazy = Lazy::new(|| { }); pub(crate) struct WalIngestMetrics { + pub(crate) bytes_received: IntCounter, pub(crate) records_received: IntCounter, pub(crate) records_committed: IntCounter, pub(crate) records_filtered: IntCounter, } pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| WalIngestMetrics { + bytes_received: register_int_counter!( + "pageserver_wal_ingest_bytes_received", + "Bytes of WAL ingested from safekeepers", + ) + .unwrap(), records_received: register_int_counter!( "pageserver_wal_ingest_records_received", "Number of WAL records received from safekeepers" diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index a7cb19c2a0..3f3419e886 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -313,6 +313,7 @@ pub(super) async fn handle_walreceiver_connection( trace!("received XLogData between {startlsn} and {endlsn}"); + WAL_INGEST.bytes_received.inc_by(data.len() as u64); waldecoder.feed_bytes(data); { From 944313ffe1a1bca9482e82c2dd6f609034e540e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 3 Apr 2024 13:42:45 +0200 Subject: [PATCH 0510/1571] Schedule image layer uploads in tiered compaction (#7282) Tiered compaction hasn't scheduled the upload of image layers. In the `test_gc_feedback.py` test this has caused warnings like with tiered compaction: ``` INFO request[...] Deleting layer [...] not found in latest_files list, never uploaded? ``` Which caused errors like: ``` ERROR layer_delete[...] 
was unlinked but was not dangling ``` Fixes #7244 --- pageserver/src/tenant/timeline.rs | 18 ++++++++++++++++++ pageserver/src/tenant/timeline/compaction.rs | 17 +++++------------ 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index a801c64382..16cec6805c 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3896,6 +3896,24 @@ impl Timeline { Ok(()) } + /// Schedules the uploads of the given image layers + fn upload_new_image_layers( + self: &Arc, + new_images: impl IntoIterator, + ) -> anyhow::Result<()> { + let Some(remote_client) = &self.remote_client else { + return Ok(()); + }; + for layer in new_images { + remote_client.schedule_layer_file_upload(layer)?; + } + // should any new image layer been created, not uploading index_part will + // result in a mismatch between remote_physical_size and layermap calculated + // size, which will fail some tests, but should not be an issue otherwise. + remote_client.schedule_index_upload_for_file_changes()?; + Ok(()) + } + /// Update information about which layer files need to be retained on /// garbage collection. This is separate from actually performing the GC, /// and is updated more frequently, so that compaction can remove obsolete diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 74b75dabf0..ab001bf10d 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -125,18 +125,8 @@ impl Timeline { ) .await .map_err(anyhow::Error::from)?; - if let Some(remote_client) = &self.remote_client { - for layer in layers { - remote_client.schedule_layer_file_upload(layer)?; - } - } - if let Some(remote_client) = &self.remote_client { - // should any new image layer been created, not uploading index_part will - // result in a mismatch between remote_physical_size and layermap calculated - // size, which will fail some tests, but should not be an issue otherwise. - remote_client.schedule_index_upload_for_file_changes()?; - } + self.upload_new_image_layers(layers)?; } Err(err) => { // no partitioning? This is normal, if the timeline was just created @@ -818,7 +808,10 @@ impl TimelineAdaptor { self.timeline .finish_compact_batch(&self.new_deltas, &self.new_images, &layers_to_delete) .await?; - self.new_images.clear(); + + self.timeline + .upload_new_image_layers(std::mem::take(&mut self.new_images))?; + self.new_deltas.clear(); self.layers_to_delete.clear(); Ok(()) From 8b10407be41758f9defff2a830904be8531a7830 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 3 Apr 2024 14:53:43 +0100 Subject: [PATCH 0511/1571] pageserver: on-demand activation of tenant on GET tenant status (#7250) ## Problem (Follows https://github.com/neondatabase/neon/pull/7237) Some API users will query a tenant to wait for it to activate. Currently, we return the current status of the tenant, whatever that may be. Under heavy load, a pageserver starting up might take a long time to activate such a tenant. ## Summary of changes - In `tenant_status` handler, call wait_to_become_active on the tenant. If the tenant is currently waiting for activation, this causes it to skip the queue, similiar to other API handlers that require an active tenant, like timeline creation. This avoids external services waiting a long time for activation when polling GET /v1/tenant/. 
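
For reference, a minimal test-side usage sketch of the fixture change below; the `pageserver` and `tenant_id` names are assumptions standing in for whatever the surrounding test provides, and the fixture keeps the old observe-only behaviour as its default:

```python
# Sketch only: assumes a `pageserver` fixture object and a `tenant_id`
# from the enclosing test.
http = pageserver.http_client()

# Observe the current state without nudging activation
# (the fixture sends the testing-only ?activate=false query parameter).
info = http.tenant_status(tenant_id)

# Opt into the new production default: the GET itself hints the pageserver
# to activate the tenant, letting it skip the activation queue.
info = http.tenant_status(tenant_id, activate=True)
```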
--- pageserver/src/http/routes.rs | 15 +++++++++++++++ test_runner/fixtures/pageserver/http.py | 17 +++++++++++++++-- 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 759a1b25ee..47d8ae1148 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -993,11 +993,26 @@ async fn tenant_status( check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); + // In tests, sometimes we want to query the state of a tenant without auto-activating it if it's currently waiting. + let activate = true; + #[cfg(feature = "testing")] + let activate = parse_query_param(&request, "activate")?.unwrap_or(activate); + let tenant_info = async { let tenant = state .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; + if activate { + // This is advisory: we prefer to let the tenant activate on-demand when this function is + // called, but it is still valid to return 200 and describe the current state of the tenant + // if it doesn't make it into an active state. + tenant + .wait_to_become_active(ACTIVE_TENANT_TIMEOUT) + .await + .ok(); + } + // Calculate total physical size of all timelines let mut current_physical_size = 0; for timeline in tenant.list_timelines().iter() { diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 6aebfbc99c..d3bf46b2e8 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -341,8 +341,21 @@ class PageserverHttpClient(requests.Session, MetricsGetter): res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/ignore") self.verbose_error(res) - def tenant_status(self, tenant_id: Union[TenantId, TenantShardId]) -> Dict[Any, Any]: - res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") + def tenant_status( + self, tenant_id: Union[TenantId, TenantShardId], activate: bool = False + ) -> Dict[Any, Any]: + """ + :activate: hint the server not to accelerate activation of this tenant in response + to this query. False by default for tests, because they generally want to observed the + system rather than interfering with it. This is true by default on the server side, + because in the field if the control plane is GET'ing a tenant it's a sign that it wants + to do something with it. + """ + params = {} + if not activate: + params["activate"] = "false" + + res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}", params=params) self.verbose_error(res) res_json = res.json() assert isinstance(res_json, dict) From 3f77f26aa29a0a250a494346fed2f294d690aa46 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Wed, 3 Apr 2024 17:20:51 +0200 Subject: [PATCH 0512/1571] Upload partial segments (#6530) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for backing up partial segments to remote storage. Disabled by default, can be enabled with `--partial-backup-enabled`. Safekeeper timeline has a background task which is subscribed to `commit_lsn` and `flush_lsn` updates. After the partial segment was updated (`flush_lsn` was changed), the segment will be uploaded to S3 in about 15 minutes. The filename format for partial segments is `Segment_Term_Flush_Commit_skNN.partial`, where: - `Segment` – the segment name, like `000000010000000000000001` - `Term` – current term - `Flush` – flush_lsn in hex format `{:016X}`, e.g. 
`00000000346BC568` - `Commit` – commit_lsn in the same hex format - `NN` – safekeeper_id, like `1` The full object name example: `000000010000000000000002_2_0000000002534868_0000000002534410_sk1.partial` Each safekeeper will keep info about remote partial segments in its control file. Code updates state in the control file before doing any S3 operations. This way control file stores information about all potentially existing remote partial segments and can clean them up after uploading a newer version. Closes #6336 --- libs/remote_storage/src/lib.rs | 10 + safekeeper/Cargo.toml | 1 + safekeeper/src/bin/safekeeper.rs | 13 +- safekeeper/src/control_file.rs | 2 +- safekeeper/src/control_file_upgrade.rs | 72 ++++ safekeeper/src/lib.rs | 6 + safekeeper/src/metrics.rs | 15 + safekeeper/src/safekeeper.rs | 3 + safekeeper/src/state.rs | 13 +- safekeeper/src/timeline.rs | 7 +- safekeeper/src/wal_backup.rs | 56 ++- safekeeper/src/wal_backup_partial.rs | 396 ++++++++++++++++++ .../tests/walproposer_sim/safekeeper.rs | 2 + test_runner/regress/test_compatibility.py | 3 + .../regress/test_wal_acceptor_async.py | 6 +- 15 files changed, 587 insertions(+), 18 deletions(-) create mode 100644 safekeeper/src/wal_backup_partial.rs diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index ab2035f19a..e708854be2 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -565,6 +565,16 @@ impl GenericRemoteStorage { #[derive(Debug, Clone, PartialEq, Eq)] pub struct StorageMetadata(HashMap); +impl From<[(&str, &str); N]> for StorageMetadata { + fn from(arr: [(&str, &str); N]) -> Self { + let map: HashMap = arr + .iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect(); + Self(map) + } +} + /// External backup storage configuration, enough for creating a client for that storage. #[derive(Debug, Clone, PartialEq, Eq)] pub struct RemoteStorageConfig { diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index cb4a1def1f..c8b732fee1 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -33,6 +33,7 @@ once_cell.workspace = true parking_lot.workspace = true postgres.workspace = true postgres-protocol.workspace = true +rand.workspace = true regex.workspace = true scopeguard.workspace = true reqwest = { workspace = true, features = ["json"] } diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 3c4c81e499..e53ccaeb3d 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -28,7 +28,7 @@ use utils::pid_file; use metrics::set_build_info_metric; use safekeeper::defaults::{ DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES, - DEFAULT_PG_LISTEN_ADDR, + DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, }; use safekeeper::wal_service; use safekeeper::GlobalTimelines; @@ -170,6 +170,13 @@ struct Args { /// still needed for existing replication connection. #[arg(long)] walsenders_keep_horizon: bool, + /// Enable partial backup. If disabled, safekeeper will not upload partial + /// segments to remote storage. + #[arg(long)] + partial_backup_enabled: bool, + /// Controls how long backup will wait until uploading the partial segment. + #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_PARTIAL_BACKUP_TIMEOUT, verbatim_doc_comment)] + partial_backup_timeout: Duration, } // Like PathBufValueParser, but allows empty string. 
@@ -300,6 +307,8 @@ async fn main() -> anyhow::Result<()> { http_auth, current_thread_runtime: args.current_thread_runtime, walsenders_keep_horizon: args.walsenders_keep_horizon, + partial_backup_enabled: args.partial_backup_enabled, + partial_backup_timeout: args.partial_backup_timeout, }; // initialize sentry if SENTRY_DSN is provided @@ -365,6 +374,8 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100); + wal_backup::init_remote_storage(&conf); + // Keep handles to main tasks to die if any of them disappears. let mut tasks_handles: FuturesUnordered> = FuturesUnordered::new(); diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index d822c87c0e..fe9f2e6899 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -20,7 +20,7 @@ use utils::{bin_ser::LeSer, id::TenantTimelineId}; use crate::SafeKeeperConf; pub const SK_MAGIC: u32 = 0xcafeceefu32; -pub const SK_FORMAT_VERSION: u32 = 7; +pub const SK_FORMAT_VERSION: u32 = 8; // contains persistent metadata for safekeeper const CONTROL_FILE_NAME: &str = "safekeeper.control"; diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index 2fd719326d..8f4dfe9b43 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -2,6 +2,7 @@ use crate::{ safekeeper::{AcceptorState, PgUuid, ServerInfo, Term, TermHistory, TermLsn}, state::{PersistedPeers, TimelinePersistentState}, + wal_backup_partial, }; use anyhow::{bail, Result}; use pq_proto::SystemId; @@ -138,6 +139,50 @@ pub struct SafeKeeperStateV4 { pub peers: PersistedPeers, } +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct SafeKeeperStateV7 { + #[serde(with = "hex")] + pub tenant_id: TenantId, + #[serde(with = "hex")] + pub timeline_id: TimelineId, + /// persistent acceptor state + pub acceptor_state: AcceptorState, + /// information about server + pub server: ServerInfo, + /// Unique id of the last *elected* proposer we dealt with. Not needed + /// for correctness, exists for monitoring purposes. + #[serde(with = "hex")] + pub proposer_uuid: PgUuid, + /// Since which LSN this timeline generally starts. Safekeeper might have + /// joined later. + pub timeline_start_lsn: Lsn, + /// Since which LSN safekeeper has (had) WAL for this timeline. + /// All WAL segments next to one containing local_start_lsn are + /// filled with data from the beginning. + pub local_start_lsn: Lsn, + /// Part of WAL acknowledged by quorum *and available locally*. Always points + /// to record boundary. + pub commit_lsn: Lsn, + /// LSN that points to the end of the last backed up segment. Useful to + /// persist to avoid finding out offloading progress on boot. + pub backup_lsn: Lsn, + /// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn + /// of last record streamed to everyone). Persisting it helps skipping + /// recovery in walproposer, generally we compute it from peers. In + /// walproposer proto called 'truncate_lsn'. Updates are currently drived + /// only by walproposer. + pub peer_horizon_lsn: Lsn, + /// LSN of the oldest known checkpoint made by pageserver and successfully + /// pushed to s3. We don't remove WAL beyond it. Persisted only for + /// informational purposes, we receive it from pageserver (or broker). + pub remote_consistent_lsn: Lsn, + // Peers and their state as we remember it. 
Knowing peers themselves is + // fundamental; but state is saved here only for informational purposes and + // obviously can be stale. (Currently not saved at all, but let's provision + // place to have less file version upgrades). + pub peers: PersistedPeers, +} + pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result { // migrate to storing full term history if version == 1 { @@ -167,6 +212,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result Result Result Result Result>, pub current_thread_runtime: bool, pub walsenders_keep_horizon: bool, + pub partial_backup_enabled: bool, + pub partial_backup_timeout: Duration, } impl SafeKeeperConf { @@ -123,6 +127,8 @@ impl SafeKeeperConf { max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES, current_thread_runtime: false, walsenders_keep_horizon: false, + partial_backup_enabled: false, + partial_backup_timeout: Duration::from_secs(0), } } } diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index e541527b6a..28ae042bb3 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -147,6 +147,21 @@ pub static RECEIVED_PS_FEEDBACKS: Lazy = Lazy::new(|| { ) .expect("Failed to register safekeeper_received_ps_feedbacks_total counter") }); +pub static PARTIAL_BACKUP_UPLOADS: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "safekeeper_partial_backup_uploads_total", + "Number of partial backup uploads to the S3", + &["result"] + ) + .expect("Failed to register safekeeper_partial_backup_uploads_total counter") +}); +pub static PARTIAL_BACKUP_UPLOADED_BYTES: Lazy = Lazy::new(|| { + register_int_counter!( + "safekeeper_partial_backup_uploaded_bytes_total", + "Number of bytes uploaded to the S3 during partial backup" + ) + .expect("Failed to register safekeeper_partial_backup_uploaded_bytes_total counter") +}); pub const LABEL_UNKNOWN: &str = "unknown"; diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index d7c8fa6955..f2ee0403eb 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -1221,6 +1221,7 @@ mod tests { commit_lsn: Lsn(1234567600), }, )]), + partial_backup: crate::wal_backup_partial::State::default(), }; let ser = state.ser().unwrap(); @@ -1266,6 +1267,8 @@ mod tests { 0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x70, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, 0xb0, 0x01, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, + // partial_backup + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ]; assert_eq!(Hex(&ser), Hex(&expected)); diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs index 82f7954051..be5e516296 100644 --- a/safekeeper/src/state.rs +++ b/safekeeper/src/state.rs @@ -13,6 +13,7 @@ use utils::{ use crate::{ control_file, safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, TermHistory}, + wal_backup_partial::{self}, }; /// Persistent information stored on safekeeper node about timeline. @@ -54,11 +55,14 @@ pub struct TimelinePersistentState { /// pushed to s3. We don't remove WAL beyond it. Persisted only for /// informational purposes, we receive it from pageserver (or broker). pub remote_consistent_lsn: Lsn, - // Peers and their state as we remember it. Knowing peers themselves is - // fundamental; but state is saved here only for informational purposes and - // obviously can be stale. (Currently not saved at all, but let's provision - // place to have less file version upgrades). + /// Peers and their state as we remember it. 
Knowing peers themselves is + /// fundamental; but state is saved here only for informational purposes and + /// obviously can be stale. (Currently not saved at all, but let's provision + /// place to have less file version upgrades). pub peers: PersistedPeers, + /// Holds names of partial segments uploaded to remote storage. Used to + /// clean up old objects without leaving garbage in remote storage. + pub partial_backup: wal_backup_partial::State, } #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] @@ -93,6 +97,7 @@ impl TimelinePersistentState { .map(|p| (*p, PersistedPeerInfo::new())) .collect(), ), + partial_backup: wal_backup_partial::State::default(), } } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 4901b86acf..64f764f191 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -38,7 +38,7 @@ use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION}; use crate::metrics::FullTimelineInfo; use crate::wal_storage::Storage as wal_storage_iface; -use crate::{debug_dump, wal_storage}; +use crate::{debug_dump, wal_backup_partial, wal_storage}; use crate::{GlobalTimelines, SafeKeeperConf}; /// Things safekeeper should know about timeline state on peers. @@ -503,6 +503,9 @@ impl Timeline { if conf.peer_recovery_enabled { tokio::spawn(recovery_main(self.clone(), conf.clone())); } + if conf.is_wal_backup_enabled() && conf.partial_backup_enabled { + tokio::spawn(wal_backup_partial::main_task(self.clone(), conf.clone())); + } } /// Delete timeline from disk completely, by removing timeline directory. @@ -667,8 +670,8 @@ impl Timeline { term_flush_lsn = TermLsn::from((shared_state.sk.get_term(), shared_state.sk.flush_lsn())); } - self.commit_lsn_watch_tx.send(commit_lsn)?; self.term_flush_lsn_watch_tx.send(term_flush_lsn)?; + self.commit_lsn_watch_tx.send(commit_lsn)?; Ok(rmsg) } diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 944d80f777..e3f6a606a0 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -18,7 +18,7 @@ use std::time::Duration; use postgres_ffi::v14::xlog_utils::XLogSegNoOffsetToRecPtr; use postgres_ffi::XLogFileName; use postgres_ffi::{XLogSegNo, PG_TLI}; -use remote_storage::{GenericRemoteStorage, RemotePath}; +use remote_storage::{GenericRemoteStorage, RemotePath, StorageMetadata}; use tokio::fs::File; use tokio::select; @@ -180,6 +180,16 @@ fn get_configured_remote_storage() -> &'static GenericRemoteStorage { .unwrap() } +pub fn init_remote_storage(conf: &SafeKeeperConf) { + // TODO: refactor REMOTE_STORAGE to avoid using global variables, and provide + // dependencies to all tasks instead. + REMOTE_STORAGE.get_or_init(|| { + conf.remote_storage + .as_ref() + .map(|c| GenericRemoteStorage::from_config(c).expect("failed to create remote storage")) + }); +} + const CHECK_TASKS_INTERVAL_MSEC: u64 = 1000; /// Sits on wal_backup_launcher_rx and starts/stops per timeline wal backup @@ -194,14 +204,6 @@ pub async fn wal_backup_launcher_task_main( conf.remote_storage ); - let conf_ = conf.clone(); - REMOTE_STORAGE.get_or_init(|| { - conf_ - .remote_storage - .as_ref() - .map(|c| GenericRemoteStorage::from_config(c).expect("failed to create remote storage")) - }); - // Presence in this map means launcher is aware s3 offloading is needed for // the timeline, but task is started only if it makes sense for to offload // from this safekeeper. 
@@ -518,6 +520,35 @@ async fn backup_object( .await } +pub(crate) async fn backup_partial_segment( + source_file: &Utf8Path, + target_file: &RemotePath, + size: usize, +) -> Result<()> { + let storage = get_configured_remote_storage(); + + let file = File::open(&source_file) + .await + .with_context(|| format!("Failed to open file {source_file:?} for wal backup"))?; + + // limiting the file to read only the first `size` bytes + let limited_file = tokio::io::AsyncReadExt::take(file, size as u64); + + let file = tokio_util::io::ReaderStream::with_capacity(limited_file, BUFFER_SIZE); + + let cancel = CancellationToken::new(); + + storage + .upload( + file, + size, + target_file, + Some(StorageMetadata::from([("sk_type", "partial_segment")])), + &cancel, + ) + .await +} + pub async fn read_object( file_path: &RemotePath, offset: u64, @@ -604,6 +635,13 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> { Ok(()) } +/// Used by wal_backup_partial. +pub async fn delete_objects(paths: &[RemotePath]) -> Result<()> { + let cancel = CancellationToken::new(); // not really used + let storage = get_configured_remote_storage(); + storage.delete_objects(paths, &cancel).await +} + /// Copy segments from one timeline to another. Used in copy_timeline. pub async fn copy_s3_segments( wal_seg_size: usize, diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs new file mode 100644 index 0000000000..a535c814ea --- /dev/null +++ b/safekeeper/src/wal_backup_partial.rs @@ -0,0 +1,396 @@ +//! Safekeeper timeline has a background task which is subscribed to `commit_lsn` +//! and `flush_lsn` updates. After the partial segment was updated (`flush_lsn` +//! was changed), the segment will be uploaded to S3 in about 15 minutes. +//! +//! The filename format for partial segments is +//! `Segment_Term_Flush_Commit_skNN.partial`, where: +//! - `Segment` – the segment name, like `000000010000000000000001` +//! - `Term` – current term +//! - `Flush` – flush_lsn in hex format `{:016X}`, e.g. `00000000346BC568` +//! - `Commit` – commit_lsn in the same hex format +//! - `NN` – safekeeper_id, like `1` +//! +//! The full object name example: +//! `000000010000000000000002_2_0000000002534868_0000000002534410_sk1.partial` +//! +//! Each safekeeper will keep info about remote partial segments in its control +//! file. Code updates state in the control file before doing any S3 operations. +//! This way control file stores information about all potentially existing +//! remote partial segments and can clean them up after uploading a newer version. 
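(Editorial aside, not part of the patch.) The naming scheme described in the module comment above can be sanity-checked in a few lines of Rust; the literal values are taken from the example object name given there, and the format string mirrors the `remote_segment_name` helper added further down in this file:

```rust
fn main() {
    // Segment_Term_Flush_Commit_skNN.partial, with both LSNs rendered as {:016X}.
    let name = format!(
        "{}_{}_{:016X}_{:016X}_sk{}.partial",
        "000000010000000000000002", // segment file name
        2,                          // term
        0x0000_0000_0253_4868_u64,  // flush_lsn
        0x0000_0000_0253_4410_u64,  // commit_lsn
        1,                          // safekeeper id
    );
    assert_eq!(
        name,
        "000000010000000000000002_2_0000000002534868_0000000002534410_sk1.partial"
    );
}
```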
+ +use std::sync::Arc; + +use camino::Utf8PathBuf; +use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; +use rand::Rng; +use remote_storage::RemotePath; +use serde::{Deserialize, Serialize}; + +use tracing::{debug, error, info, instrument}; +use utils::lsn::Lsn; + +use crate::{ + metrics::{PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS}, + safekeeper::Term, + timeline::Timeline, + wal_backup, SafeKeeperConf, +}; + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub enum UploadStatus { + /// Upload is in progress + InProgress, + /// Upload is finished + Uploaded, + /// Deletion is in progress + Deleting, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct PartialRemoteSegment { + pub status: UploadStatus, + pub name: String, + pub commit_lsn: Lsn, + pub flush_lsn: Lsn, + pub term: Term, +} + +impl PartialRemoteSegment { + fn eq_without_status(&self, other: &Self) -> bool { + self.name == other.name + && self.commit_lsn == other.commit_lsn + && self.flush_lsn == other.flush_lsn + && self.term == other.term + } +} + +// NB: these structures are a part of a control_file, you can't change them without +// changing the control file format version. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)] +pub struct State { + pub segments: Vec, +} + +impl State { + /// Find an Uploaded segment. There should be only one Uploaded segment at a time. + fn uploaded_segment(&self) -> Option { + self.segments + .iter() + .find(|seg| seg.status == UploadStatus::Uploaded) + .cloned() + } +} + +struct PartialBackup { + wal_seg_size: usize, + tli: Arc, + conf: SafeKeeperConf, + local_prefix: Utf8PathBuf, + remote_prefix: Utf8PathBuf, + + state: State, +} + +// Read-only methods for getting segment names +impl PartialBackup { + fn segno(&self, lsn: Lsn) -> XLogSegNo { + lsn.segment_number(self.wal_seg_size) + } + + fn segment_name(&self, segno: u64) -> String { + XLogFileName(PG_TLI, segno, self.wal_seg_size) + } + + fn remote_segment_name( + &self, + segno: u64, + term: u64, + commit_lsn: Lsn, + flush_lsn: Lsn, + ) -> String { + format!( + "{}_{}_{:016X}_{:016X}_sk{}.partial", + self.segment_name(segno), + term, + flush_lsn.0, + commit_lsn.0, + self.conf.my_id.0, + ) + } + + fn local_segment_name(&self, segno: u64) -> String { + format!("{}.partial", self.segment_name(segno)) + } +} + +impl PartialBackup { + /// Takes a lock to read actual safekeeper state and returns a segment that should be uploaded. + async fn prepare_upload(&self) -> PartialRemoteSegment { + // this operation takes a lock to get the actual state + let sk_info = self.tli.get_safekeeper_info(&self.conf).await; + let flush_lsn = Lsn(sk_info.flush_lsn); + let commit_lsn = Lsn(sk_info.commit_lsn); + let term = sk_info.term; + let segno = self.segno(flush_lsn); + + let name = self.remote_segment_name(segno, term, commit_lsn, flush_lsn); + + PartialRemoteSegment { + status: UploadStatus::InProgress, + name, + commit_lsn, + flush_lsn, + term, + } + } + + /// Reads segment from disk and uploads it to the remote storage. + async fn upload_segment(&mut self, prepared: PartialRemoteSegment) -> anyhow::Result<()> { + let flush_lsn = prepared.flush_lsn; + let segno = self.segno(flush_lsn); + + // We're going to backup bytes from the start of the segment up to flush_lsn. 
+ let backup_bytes = flush_lsn.segment_offset(self.wal_seg_size); + + let local_path = self.local_prefix.join(self.local_segment_name(segno)); + let remote_path = RemotePath::new(self.remote_prefix.join(&prepared.name).as_ref())?; + + // Upload first `backup_bytes` bytes of the segment to the remote storage. + wal_backup::backup_partial_segment(&local_path, &remote_path, backup_bytes).await?; + PARTIAL_BACKUP_UPLOADED_BYTES.inc_by(backup_bytes as u64); + + // We uploaded the segment, now let's verify that the data is still actual. + // If the term changed, we cannot guarantee the validity of the uploaded data. + // If the term is the same, we know the data is not corrupted. + let sk_info = self.tli.get_safekeeper_info(&self.conf).await; + if sk_info.term != prepared.term { + anyhow::bail!("term changed during upload"); + } + assert!(prepared.commit_lsn <= Lsn(sk_info.commit_lsn)); + assert!(prepared.flush_lsn <= Lsn(sk_info.flush_lsn)); + + Ok(()) + } + + /// Write new state to disk. If in-memory and on-disk states diverged, returns an error. + async fn commit_state(&mut self, new_state: State) -> anyhow::Result<()> { + self.tli + .map_control_file(|cf| { + if cf.partial_backup != self.state { + let memory = self.state.clone(); + self.state = cf.partial_backup.clone(); + anyhow::bail!( + "partial backup state diverged, memory={:?}, disk={:?}", + memory, + cf.partial_backup + ); + } + + cf.partial_backup = new_state.clone(); + Ok(()) + }) + .await?; + // update in-memory state + self.state = new_state; + Ok(()) + } + + /// Upload the latest version of the partial segment and garbage collect older versions. + #[instrument(name = "upload", skip_all, fields(name = %prepared.name))] + async fn do_upload(&mut self, prepared: &PartialRemoteSegment) -> anyhow::Result<()> { + info!("starting upload {:?}", prepared); + + let state_0 = self.state.clone(); + let state_1 = { + let mut state = state_0.clone(); + state.segments.push(prepared.clone()); + state + }; + + // we're going to upload a new segment, let's write it to disk to make GC later + self.commit_state(state_1).await?; + + self.upload_segment(prepared.clone()).await?; + + let state_2 = { + let mut state = state_0.clone(); + for seg in state.segments.iter_mut() { + seg.status = UploadStatus::Deleting; + } + let mut actual_remote_segment = prepared.clone(); + actual_remote_segment.status = UploadStatus::Uploaded; + state.segments.push(actual_remote_segment); + state + }; + + // we've uploaded new segment, it's actual, all other segments should be GCed + self.commit_state(state_2).await?; + self.gc().await?; + + Ok(()) + } + + /// Delete all non-Uploaded segments from the remote storage. There should be only one + /// Uploaded segment at a time. 
+ #[instrument(name = "gc", skip_all)] + async fn gc(&mut self) -> anyhow::Result<()> { + let mut segments_to_delete = vec![]; + + let new_segments: Vec = self + .state + .segments + .iter() + .filter_map(|seg| { + if seg.status == UploadStatus::Uploaded { + Some(seg.clone()) + } else { + segments_to_delete.push(seg.name.clone()); + None + } + }) + .collect(); + + info!("deleting objects: {:?}", segments_to_delete); + let mut objects_to_delete = vec![]; + for seg in segments_to_delete.iter() { + let remote_path = RemotePath::new(self.remote_prefix.join(seg).as_ref())?; + objects_to_delete.push(remote_path); + } + + // removing segments from remote storage + wal_backup::delete_objects(&objects_to_delete).await?; + + // now we can update the state on disk + let new_state = { + let mut state = self.state.clone(); + state.segments = new_segments; + state + }; + self.commit_state(new_state).await?; + + Ok(()) + } +} + +#[instrument(name = "Partial backup", skip_all, fields(ttid = %tli.ttid))] +pub async fn main_task(tli: Arc, conf: SafeKeeperConf) { + debug!("started"); + let await_duration = conf.partial_backup_timeout; + + let mut cancellation_rx = match tli.get_cancellation_rx() { + Ok(rx) => rx, + Err(_) => { + info!("timeline canceled during task start"); + return; + } + }; + + // sleep for random time to avoid thundering herd + { + let randf64 = rand::thread_rng().gen_range(0.0..1.0); + let sleep_duration = await_duration.mul_f64(randf64); + tokio::time::sleep(sleep_duration).await; + } + + let (_, persistent_state) = tli.get_state().await; + let mut commit_lsn_rx = tli.get_commit_lsn_watch_rx(); + let mut flush_lsn_rx = tli.get_term_flush_lsn_watch_rx(); + let wal_seg_size = tli.get_wal_seg_size().await; + + let local_prefix = tli.timeline_dir.clone(); + let remote_prefix = match tli.timeline_dir.strip_prefix(&conf.workdir) { + Ok(path) => path.to_owned(), + Err(e) => { + error!("failed to strip workspace dir prefix: {:?}", e); + return; + } + }; + + let mut backup = PartialBackup { + wal_seg_size, + tli, + state: persistent_state.partial_backup, + conf, + local_prefix, + remote_prefix, + }; + + debug!("state: {:?}", backup.state); + + 'outer: loop { + // wait until we have something to upload + let uploaded_segment = backup.state.uploaded_segment(); + if let Some(seg) = &uploaded_segment { + // if we already uploaded something, wait until we have something new + while flush_lsn_rx.borrow().lsn == seg.flush_lsn + && *commit_lsn_rx.borrow() == seg.commit_lsn + && flush_lsn_rx.borrow().term == seg.term + { + tokio::select! { + _ = cancellation_rx.changed() => { + info!("timeline canceled"); + return; + } + _ = commit_lsn_rx.changed() => {} + _ = flush_lsn_rx.changed() => {} + } + } + } + + // fixing the segno and waiting some time to prevent reuploading the same segment too often + let pending_segno = backup.segno(flush_lsn_rx.borrow().lsn); + let timeout = tokio::time::sleep(await_duration); + tokio::pin!(timeout); + let mut timeout_expired = false; + + // waiting until timeout expires OR segno changes + 'inner: loop { + tokio::select! 
{ + _ = cancellation_rx.changed() => { + info!("timeline canceled"); + return; + } + _ = commit_lsn_rx.changed() => {} + _ = flush_lsn_rx.changed() => { + let segno = backup.segno(flush_lsn_rx.borrow().lsn); + if segno != pending_segno { + // previous segment is no longer partial, aborting the wait + break 'inner; + } + } + _ = &mut timeout => { + // timeout expired, now we are ready for upload + timeout_expired = true; + break 'inner; + } + } + } + + if !timeout_expired { + // likely segno has changed, let's try again in the next iteration + continue 'outer; + } + + let prepared = backup.prepare_upload().await; + if let Some(seg) = &uploaded_segment { + if seg.eq_without_status(&prepared) { + // we already uploaded this segment, nothing to do + continue 'outer; + } + } + + match backup.do_upload(&prepared).await { + Ok(()) => { + debug!( + "uploaded {} up to flush_lsn {}", + prepared.name, prepared.flush_lsn + ); + PARTIAL_BACKUP_UPLOADS.with_label_values(&["ok"]).inc(); + } + Err(e) => { + info!("failed to upload {}: {:#}", prepared.name, e); + PARTIAL_BACKUP_UPLOADS.with_label_values(&["error"]).inc(); + } + } + } +} diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index e3aaf5d391..bc21c4d765 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -176,6 +176,8 @@ pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { http_auth: None, current_thread_runtime: false, walsenders_keep_horizon: false, + partial_backup_enabled: false, + partial_backup_timeout: Duration::from_secs(0), }; let mut global = GlobalMap::new(disk, conf.clone())?; diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index ddad98a5fa..208263a22a 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -192,6 +192,9 @@ def test_backward_compatibility( assert not breaking_changes_allowed, "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage" +# Forward compatibility is broken due to https://github.com/neondatabase/neon/pull/6530 +# The test is disabled until the next release deployment +@pytest.mark.xfail @check_ondisk_data_compatibility_if_enabled @pytest.mark.xdist_group("compatibility") @pytest.mark.order(after="test_create_snapshot") diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index b5e8eea237..5902eb3217 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -10,6 +10,7 @@ import pytest import toml from fixtures.log_helper import getLogger from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder, Safekeeper +from fixtures.remote_storage import RemoteStorageKind from fixtures.types import Lsn, TenantId, TimelineId log = getLogger("root.safekeeper_async") @@ -199,7 +200,9 @@ async def run_restarts_under_load( # assert that at least one transaction has completed in every worker stats.check_progress() - victim.start() + # testing #6530, temporary here + # TODO: remove afer partial backup is enabled by default + victim.start(extra_opts=["--partial-backup-enabled", "--partial-backup-timeout=2s"]) log.info("Iterations are finished, exiting coroutines...") stats.running = False @@ -213,6 +216,7 @@ async def run_restarts_under_load( # Restart acceptors one by one, while executing and validating bank transactions 
def test_restarts_under_load(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 + neon_env_builder.enable_safekeeper_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() env.neon_cli.create_branch("test_safekeepers_restarts_under_load") From 36b875388f7e3fa6d37b4e90b74600526465b2ae Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 3 Apr 2024 16:46:25 +0100 Subject: [PATCH 0513/1571] pageserver: replace the locked tenant config with arcsawps (#7292) ## Problem For reasons unrelated to this PR, I would like to make use of the tenant conf in the `InMemoryLayer`. Previously, this was not possible without copying and manually updating the copy to keep it in sync with updates. ## Summary of Changes: Replace the `Arc>` with `Arc>` (how many `Arc(s)` can one fit in a type?). The most interesting part of this change is the updating of the tenant config (`set_new_tenant_config` and `set_new_location_config`). In theory, these two may race, although the storage controller should prevent this via the tenant exclusive op lock. Particular care has been taken to not "lose" a location config update by using the read-copy-update approach when updating only the config. --- pageserver/src/tenant.rs | 76 ++++++++++++++++++------------- pageserver/src/tenant/timeline.rs | 68 +++++++++++++++------------ 2 files changed, 83 insertions(+), 61 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 1fb92a50fe..1ee810614e 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -12,6 +12,7 @@ //! use anyhow::{bail, Context}; +use arc_swap::ArcSwap; use camino::Utf8Path; use camino::Utf8PathBuf; use enumset::EnumSet; @@ -98,7 +99,7 @@ use std::ops::Bound::Included; use std::sync::atomic::AtomicU64; use std::sync::atomic::Ordering; use std::sync::Arc; -use std::sync::{Mutex, RwLock}; +use std::sync::Mutex; use std::time::{Duration, Instant}; use crate::span; @@ -260,7 +261,7 @@ pub struct Tenant { // We keep TenantConfOpt sturct here to preserve the information // about parameters that are not set. // This is necessary to allow global config updates. - tenant_conf: Arc>, + tenant_conf: Arc>, tenant_shard_id: TenantShardId, @@ -1606,7 +1607,7 @@ impl Tenant { ); { - let conf = self.tenant_conf.read().unwrap(); + let conf = self.tenant_conf.load(); if !conf.location.may_delete_layers_hint() { info!("Skipping GC in location state {:?}", conf.location); @@ -1633,7 +1634,7 @@ impl Tenant { } { - let conf = self.tenant_conf.read().unwrap(); + let conf = self.tenant_conf.load(); if !conf.location.may_delete_layers_hint() || !conf.location.may_upload_layers_hint() { info!("Skipping compaction in location state {:?}", conf.location); return Ok(()); @@ -2082,14 +2083,14 @@ impl Tenant { } pub(crate) fn get_attach_mode(&self) -> AttachmentMode { - self.tenant_conf.read().unwrap().location.attach_mode + self.tenant_conf.load().location.attach_mode } /// For API access: generate a LocationConfig equivalent to the one that would be used to /// create a Tenant in the same state. Do not use this in hot paths: it's for relatively /// rare external API calls, like a reconciliation at startup. 
pub(crate) fn get_location_conf(&self) -> models::LocationConfig { - let conf = self.tenant_conf.read().unwrap(); + let conf = self.tenant_conf.load(); let location_config_mode = match conf.location.attach_mode { AttachmentMode::Single => models::LocationConfigMode::AttachedSingle, @@ -2236,7 +2237,7 @@ where impl Tenant { pub fn tenant_specific_overrides(&self) -> TenantConfOpt { - self.tenant_conf.read().unwrap().tenant_conf.clone() + self.tenant_conf.load().tenant_conf.clone() } pub fn effective_config(&self) -> TenantConf { @@ -2245,84 +2246,84 @@ impl Tenant { } pub fn get_checkpoint_distance(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .checkpoint_distance .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance) } pub fn get_checkpoint_timeout(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .checkpoint_timeout .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout) } pub fn get_compaction_target_size(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .compaction_target_size .unwrap_or(self.conf.default_tenant_conf.compaction_target_size) } pub fn get_compaction_period(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .compaction_period .unwrap_or(self.conf.default_tenant_conf.compaction_period) } pub fn get_compaction_threshold(&self) -> usize { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .compaction_threshold .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } pub fn get_gc_horizon(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .gc_horizon .unwrap_or(self.conf.default_tenant_conf.gc_horizon) } pub fn get_gc_period(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .gc_period .unwrap_or(self.conf.default_tenant_conf.gc_period) } pub fn get_image_creation_threshold(&self) -> usize { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .image_creation_threshold .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold) } pub fn get_pitr_interval(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .pitr_interval .unwrap_or(self.conf.default_tenant_conf.pitr_interval) } pub fn get_trace_read_requests(&self) -> bool { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .trace_read_requests .unwrap_or(self.conf.default_tenant_conf.trace_read_requests) } pub fn get_min_resident_size_override(&self) -> Option { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = 
self.tenant_conf.load().tenant_conf.clone(); tenant_conf .min_resident_size_override .or(self.conf.default_tenant_conf.min_resident_size_override) } pub fn get_heatmap_period(&self) -> Option { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); let heatmap_period = tenant_conf .heatmap_period .unwrap_or(self.conf.default_tenant_conf.heatmap_period); @@ -2334,26 +2335,40 @@ impl Tenant { } pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) { - self.tenant_conf.write().unwrap().tenant_conf = new_tenant_conf; - self.tenant_conf_updated(); + // Use read-copy-update in order to avoid overwriting the location config + // state if this races with [`Tenant::set_new_location_config`]. Note that + // this race is not possible if both request types come from the storage + // controller (as they should!) because an exclusive op lock is required + // on the storage controller side. + self.tenant_conf.rcu(|inner| { + Arc::new(AttachedTenantConf { + tenant_conf: new_tenant_conf.clone(), + location: inner.location, + }) + }); + + self.tenant_conf_updated(&new_tenant_conf); // Don't hold self.timelines.lock() during the notifies. // There's no risk of deadlock right now, but there could be if we consolidate // mutexes in struct Timeline in the future. let timelines = self.list_timelines(); for timeline in timelines { - timeline.tenant_conf_updated(); + timeline.tenant_conf_updated(&new_tenant_conf); } } pub(crate) fn set_new_location_config(&self, new_conf: AttachedTenantConf) { - *self.tenant_conf.write().unwrap() = new_conf; - self.tenant_conf_updated(); + let new_tenant_conf = new_conf.tenant_conf.clone(); + + self.tenant_conf.store(Arc::new(new_conf)); + + self.tenant_conf_updated(&new_tenant_conf); // Don't hold self.timelines.lock() during the notifies. // There's no risk of deadlock right now, but there could be if we consolidate // mutexes in struct Timeline in the future. 
let timelines = self.list_timelines(); for timeline in timelines { - timeline.tenant_conf_updated(); + timeline.tenant_conf_updated(&new_tenant_conf); } } @@ -2367,11 +2382,8 @@ impl Tenant { .unwrap_or(psconf.default_tenant_conf.timeline_get_throttle.clone()) } - pub(crate) fn tenant_conf_updated(&self) { - let conf = { - let guard = self.tenant_conf.read().unwrap(); - Self::get_timeline_get_throttle_config(self.conf, &guard.tenant_conf) - }; + pub(crate) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) { + let conf = Self::get_timeline_get_throttle_config(self.conf, new_conf); self.timeline_get_throttle.reconfigure(conf) } @@ -2519,7 +2531,7 @@ impl Tenant { Tenant::get_timeline_get_throttle_config(conf, &attached_conf.tenant_conf), &crate::metrics::tenant_throttling::TIMELINE_GET, )), - tenant_conf: Arc::new(RwLock::new(attached_conf)), + tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)), } } @@ -3505,7 +3517,7 @@ impl Tenant { } pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt { - self.tenant_conf.read().unwrap().tenant_conf.clone() + self.tenant_conf.load().tenant_conf.clone() } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 16cec6805c..11d0c7763e 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -9,6 +9,7 @@ pub mod uninit; mod walreceiver; use anyhow::{anyhow, bail, ensure, Context, Result}; +use arc_swap::ArcSwap; use bytes::Bytes; use camino::Utf8Path; use enumset::EnumSet; @@ -183,7 +184,7 @@ pub(crate) struct AuxFilesState { pub struct Timeline { conf: &'static PageServerConf, - tenant_conf: Arc>, + tenant_conf: Arc>, myself: Weak, @@ -1588,57 +1589,65 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10; // Private functions impl Timeline { pub(crate) fn get_lazy_slru_download(&self) -> bool { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load(); tenant_conf + .tenant_conf .lazy_slru_download .unwrap_or(self.conf.default_tenant_conf.lazy_slru_download) } fn get_checkpoint_distance(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load(); tenant_conf + .tenant_conf .checkpoint_distance .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance) } fn get_checkpoint_timeout(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load(); tenant_conf + .tenant_conf .checkpoint_timeout .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout) } fn get_compaction_target_size(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load(); tenant_conf + .tenant_conf .compaction_target_size .unwrap_or(self.conf.default_tenant_conf.compaction_target_size) } fn get_compaction_threshold(&self) -> usize { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load(); tenant_conf + .tenant_conf .compaction_threshold .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } fn get_image_creation_threshold(&self) -> usize { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load(); tenant_conf + .tenant_conf .image_creation_threshold .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold) } fn get_compaction_algorithm(&self) -> CompactionAlgorithm { - let 
tenant_conf = &self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = &self.tenant_conf.load(); tenant_conf + .tenant_conf .compaction_algorithm .unwrap_or(self.conf.default_tenant_conf.compaction_algorithm) } fn get_eviction_policy(&self) -> EvictionPolicy { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load(); tenant_conf + .tenant_conf .eviction_policy .unwrap_or(self.conf.default_tenant_conf.eviction_policy) } @@ -1653,22 +1662,25 @@ impl Timeline { } fn get_image_layer_creation_check_threshold(&self) -> u8 { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); - tenant_conf.image_layer_creation_check_threshold.unwrap_or( - self.conf - .default_tenant_conf - .image_layer_creation_check_threshold, - ) + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf + .image_layer_creation_check_threshold + .unwrap_or( + self.conf + .default_tenant_conf + .image_layer_creation_check_threshold, + ) } - pub(super) fn tenant_conf_updated(&self) { + pub(super) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) { // NB: Most tenant conf options are read by background loops, so, // changes will automatically be picked up. // The threshold is embedded in the metric. So, we need to update it. { let new_threshold = Self::get_evictions_low_residence_duration_metric_threshold( - &self.tenant_conf.read().unwrap().tenant_conf, + new_conf, &self.conf.default_tenant_conf, ); @@ -1695,7 +1707,7 @@ impl Timeline { #[allow(clippy::too_many_arguments)] pub(super) fn new( conf: &'static PageServerConf, - tenant_conf: Arc>, + tenant_conf: Arc>, metadata: &TimelineMetadata, ancestor: Option>, timeline_id: TimelineId, @@ -1714,14 +1726,13 @@ impl Timeline { let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0); let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(()))); - let tenant_conf_guard = tenant_conf.read().unwrap(); - - let evictions_low_residence_duration_metric_threshold = + let evictions_low_residence_duration_metric_threshold = { + let loaded_tenant_conf = tenant_conf.load(); Self::get_evictions_low_residence_duration_metric_threshold( - &tenant_conf_guard.tenant_conf, + &loaded_tenant_conf.tenant_conf, &conf.default_tenant_conf, - ); - drop(tenant_conf_guard); + ) + }; Arc::new_cyclic(|myself| { let mut result = Timeline { @@ -1904,20 +1915,19 @@ impl Timeline { self.timeline_id, self.tenant_shard_id ); - let tenant_conf_guard = self.tenant_conf.read().unwrap(); - let wal_connect_timeout = tenant_conf_guard + let tenant_conf = self.tenant_conf.load(); + let wal_connect_timeout = tenant_conf .tenant_conf .walreceiver_connect_timeout .unwrap_or(self.conf.default_tenant_conf.walreceiver_connect_timeout); - let lagging_wal_timeout = tenant_conf_guard + let lagging_wal_timeout = tenant_conf .tenant_conf .lagging_wal_timeout .unwrap_or(self.conf.default_tenant_conf.lagging_wal_timeout); - let max_lsn_wal_lag = tenant_conf_guard + let max_lsn_wal_lag = tenant_conf .tenant_conf .max_lsn_wal_lag .unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag); - drop(tenant_conf_guard); let mut guard = self.walreceiver.lock().unwrap(); assert!( From b30b15e7cbc90ade8cba8dea337c6c6ac9f6ed00 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 3 Apr 2024 17:49:54 +0200 Subject: [PATCH 0514/1571] refactor(Timeline::shutdown): rely more on Timeline::cancel; use it from deletion code path (#7233) This PR is a fallout from work on #7062. 
# Changes - Unify the freeze-and-flush and hard shutdown code paths into a single method `Timeline::shutdown` that takes the shutdown mode as an argument. - Replace `freeze_and_flush` bool arg in callers with that mode argument, makes them more expressive. - Switch timeline deletion to use `Timeline::shutdown` instead of its own slightly-out-of-sync copy. - Remove usage of `task_mgr::shutdown_watcher` / `task_mgr::shutdown_token` where possible # Future Work Do we really need the freeze_and_flush? If we could get rid of it, then there'd be no need for a specific shutdown order. Also, if you undo this patch's changes to the `eviction_task.rs` and enable RUST_LOG=debug, it's easy to see that we do leave some task hanging that logs under span `Connection{...}` at debug level. I think it's a pre-existing issue; it's probably a broker client task. --- pageserver/src/tenant.rs | 21 +-- pageserver/src/tenant/delete.rs | 7 +- pageserver/src/tenant/mgr.rs | 17 +- .../src/tenant/remote_timeline_client.rs | 2 +- pageserver/src/tenant/timeline.rs | 173 +++++++++++------- pageserver/src/tenant/timeline/delete.rs | 46 +---- .../src/tenant/timeline/eviction_task.rs | 13 +- 7 files changed, 140 insertions(+), 139 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 1ee810614e..17ff033e00 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1783,7 +1783,7 @@ impl Tenant { async fn shutdown( &self, shutdown_progress: completion::Barrier, - freeze_and_flush: bool, + shutdown_mode: timeline::ShutdownMode, ) -> Result<(), completion::Barrier> { span::debug_assert_current_span_has_tenant_id(); @@ -1830,16 +1830,8 @@ impl Tenant { timelines.values().for_each(|timeline| { let timeline = Arc::clone(timeline); let timeline_id = timeline.timeline_id; - - let span = - tracing::info_span!("timeline_shutdown", %timeline_id, ?freeze_and_flush); - js.spawn(async move { - if freeze_and_flush { - timeline.flush_and_shutdown().instrument(span).await - } else { - timeline.shutdown().instrument(span).await - } - }); + let span = tracing::info_span!("timeline_shutdown", %timeline_id, ?shutdown_mode); + js.spawn(async move { timeline.shutdown(shutdown_mode).instrument(span).await }); }) }; // test_long_timeline_create_then_tenant_delete is leaning on this message @@ -3866,6 +3858,7 @@ mod tests { use hex_literal::hex; use pageserver_api::keyspace::KeySpace; use rand::{thread_rng, Rng}; + use tests::timeline::ShutdownMode; static TEST_KEY: Lazy = Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001"))); @@ -4311,7 +4304,7 @@ mod tests { make_some_layers(tline.as_ref(), Lsn(0x8000), &ctx).await?; // so that all uploads finish & we can call harness.load() below again tenant - .shutdown(Default::default(), true) + .shutdown(Default::default(), ShutdownMode::FreezeAndFlush) .instrument(harness.span()) .await .ok() @@ -4352,7 +4345,7 @@ mod tests { // so that all uploads finish & we can call harness.load() below again tenant - .shutdown(Default::default(), true) + .shutdown(Default::default(), ShutdownMode::FreezeAndFlush) .instrument(harness.span()) .await .ok() @@ -5133,7 +5126,7 @@ mod tests { // Leave the timeline ID in [`Tenant::timelines_creating`] to exclude attempting to create it again let raw_tline = tline.raw_timeline().unwrap(); raw_tline - .shutdown() + .shutdown(super::timeline::ShutdownMode::Hard) .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id, shard_id=%raw_tline.tenant_shard_id.shard_slug(), timeline_id=%TIMELINE_ID)) 
.await; std::mem::forget(tline); diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index 7d37873a67..d1881f3897 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -14,7 +14,10 @@ use crate::{ config::PageServerConf, context::RequestContext, task_mgr::{self, TaskKind}, - tenant::mgr::{TenantSlot, TenantsMapRemoveResult}, + tenant::{ + mgr::{TenantSlot, TenantsMapRemoveResult}, + timeline::ShutdownMode, + }, }; use super::{ @@ -463,7 +466,7 @@ impl DeleteTenantFlow { // tenant.shutdown // Its also bad that we're holding tenants.read here. // TODO relax set_stopping to be idempotent? - if tenant.shutdown(progress, false).await.is_err() { + if tenant.shutdown(progress, ShutdownMode::Hard).await.is_err() { return Err(DeleteTenantError::Other(anyhow::anyhow!( "tenant shutdown is already in progress" ))); diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index ab2ef4fa79..b1b46d487b 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -44,6 +44,7 @@ use crate::tenant::config::{ use crate::tenant::delete::DeleteTenantFlow; use crate::tenant::span::debug_assert_current_span_has_tenant_id; use crate::tenant::storage_layer::inmemory_layer; +use crate::tenant::timeline::ShutdownMode; use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState}; use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TEMP_FILE_SUFFIX}; @@ -783,11 +784,9 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { shutdown_state.insert(tenant_shard_id, TenantSlot::Attached(t.clone())); join_set.spawn( async move { - let freeze_and_flush = true; - let res = { let (_guard, shutdown_progress) = completion::channel(); - t.shutdown(shutdown_progress, freeze_and_flush).await + t.shutdown(shutdown_progress, ShutdownMode::FreezeAndFlush).await }; if let Err(other_progress) = res { @@ -1107,7 +1106,7 @@ impl TenantManager { }; info!("Shutting down attached tenant"); - match tenant.shutdown(progress, false).await { + match tenant.shutdown(progress, ShutdownMode::Hard).await { Ok(()) => {} Err(barrier) => { info!("Shutdown already in progress, waiting for it to complete"); @@ -1223,7 +1222,7 @@ impl TenantManager { TenantSlot::Attached(tenant) => { let (_guard, progress) = utils::completion::channel(); info!("Shutting down just-spawned tenant, because tenant manager is shut down"); - match tenant.shutdown(progress, false).await { + match tenant.shutdown(progress, ShutdownMode::Hard).await { Ok(()) => { info!("Finished shutting down just-spawned tenant"); } @@ -1273,7 +1272,7 @@ impl TenantManager { }; let (_guard, progress) = utils::completion::channel(); - match tenant.shutdown(progress, false).await { + match tenant.shutdown(progress, ShutdownMode::Hard).await { Ok(()) => { slot_guard.drop_old_value()?; } @@ -1677,7 +1676,7 @@ impl TenantManager { // Phase 5: Shut down the parent shard, and erase it from disk let (_guard, progress) = completion::channel(); - match parent.shutdown(progress, false).await { + match parent.shutdown(progress, ShutdownMode::Hard).await { Ok(()) => {} Err(other) => { other.wait().await; @@ -2664,11 +2663,11 @@ where let attached_tenant = match slot_guard.get_old_value() { Some(TenantSlot::Attached(tenant)) => { // whenever we remove a tenant from memory, we don't want to flush and wait for upload - let freeze_and_flush = false; + let shutdown_mode = ShutdownMode::Hard; // shutdown is sure to transition tenant to stopping, and wait for all tasks to 
complete, so // that we can continue safely to cleanup. - match tenant.shutdown(progress, freeze_and_flush).await { + match tenant.shutdown(progress, shutdown_mode).await { Ok(()) => {} Err(_other) => { // if pageserver shutdown or other detach/ignore is already ongoing, we don't want to diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index cbd942d706..13fcd1a5e8 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1569,7 +1569,7 @@ impl RemoteTimelineClient { /// Use [`RemoteTimelineClient::shutdown`] for graceful stop. /// /// In-progress operations will still be running after this function returns. - /// Use `task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id))` + /// Use `task_mgr::shutdown_tasks(Some(TaskKind::RemoteUploadTask), Some(self.tenant_shard_id), Some(timeline_id))` /// to wait for them to complete, after calling this function. pub(crate) fn stop(&self) { // Whichever *task* for this RemoteTimelineClient grabs the mutex first will transition the queue diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 11d0c7763e..c5eda44b7d 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -619,6 +619,19 @@ pub(crate) enum WaitLsnWaiter<'a> { PageService, } +/// Argument to [`Timeline::shutdown`]. +#[derive(Debug, Clone, Copy)] +pub(crate) enum ShutdownMode { + /// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then + /// also to remote storage. This method can easily take multiple seconds for a busy timeline. + /// + /// While we are flushing, we continue to accept read I/O for LSNs ingested before + /// the call to [`Timeline::shutdown`]. + FreezeAndFlush, + /// Shut down immediately, without waiting for any open layers to flush. + Hard, +} + /// Public interface functions impl Timeline { /// Get the LSN where this branch was created @@ -1306,86 +1319,119 @@ impl Timeline { self.launch_eviction_task(parent, background_jobs_can_start); } - /// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then - /// also to remote storage. This method can easily take multiple seconds for a busy timeline. + /// After this function returns, there are no timeline-scoped tasks are left running. /// - /// While we are flushing, we continue to accept read I/O. - pub(crate) async fn flush_and_shutdown(&self) { + /// The preferred pattern for is: + /// - in any spawned tasks, keep Timeline::guard open + Timeline::cancel / child token + /// - if early shutdown (not just cancellation) of a sub-tree of tasks is required, + /// go the extra mile and keep track of JoinHandles + /// - Keep track of JoinHandles using a passed-down `Arc>>` or similar, + /// instead of spawning directly on a runtime. It is a more composable / testable pattern. + /// + /// For legacy reasons, we still have multiple tasks spawned using + /// `task_mgr::spawn(X, Some(tenant_id), Some(timeline_id))`. + /// We refer to these as "timeline-scoped task_mgr tasks". + /// Some of these tasks are already sensitive to Timeline::cancel while others are + /// not sensitive to Timeline::cancel and instead respect [`task_mgr::shutdown_token`] + /// or [`task_mgr::shutdown_watcher`]. + /// We want to gradually convert the code base away from these. 
+ /// + /// Here is an inventory of timeline-scoped task_mgr tasks that are still sensitive to + /// `task_mgr::shutdown_{token,watcher}` (there are also tenant-scoped and global-scoped + /// ones that aren't mentioned here): + /// - [`TaskKind::TimelineDeletionWorker`] + /// - NB: also used for tenant deletion + /// - [`TaskKind::RemoteUploadTask`]` + /// - [`TaskKind::InitialLogicalSizeCalculation`] + /// - [`TaskKind::DownloadAllRemoteLayers`] (can we get rid of it?) + // Inventory of timeline-scoped task_mgr tasks that use spawn but aren't sensitive: + /// - [`TaskKind::Eviction`] + /// - [`TaskKind::LayerFlushTask`] + /// - [`TaskKind::OndemandLogicalSizeCalculation`] + /// - [`TaskKind::GarbageCollector`] (immediate_gc is timeline-scoped) + pub(crate) async fn shutdown(&self, mode: ShutdownMode) { debug_assert_current_span_has_tenant_and_timeline_id(); - // Stop ingesting data. Walreceiver only provides cancellation but no - // "wait until gone", because it uses the Timeline::gate. So, only - // after the self.gate.close() in self.shutdown() below will we know for - // sure that no walreceiver tasks are left. - // This means that we might still be ingesting data during the call to - // `self.freeze_and_flush()` below. That's not ideal, but, we don't have - // the concept of a ChildGuard, which is what we'd need to properly model - // early shutdown of the walreceiver task sub-tree before the other - // Timeline task sub-trees. - if let Some(walreceiver) = self.walreceiver.lock().unwrap().take() { + let try_freeze_and_flush = match mode { + ShutdownMode::FreezeAndFlush => true, + ShutdownMode::Hard => false, + }; + + // Regardless of whether we're going to try_freeze_and_flush + // or not, stop ingesting any more data. Walreceiver only provides + // cancellation but no "wait until gone", because it uses the Timeline::gate. + // So, only after the self.gate.close() below will we know for sure that + // no walreceiver tasks are left. + // For `try_freeze_and_flush=true`, this means that we might still be ingesting + // data during the call to `self.freeze_and_flush()` below. + // That's not ideal, but, we don't have the concept of a ChildGuard, + // which is what we'd need to properly model early shutdown of the walreceiver + // task sub-tree before the other Timeline task sub-trees. + let walreceiver = self.walreceiver.lock().unwrap().take(); + tracing::debug!( + is_some = walreceiver.is_some(), + "Waiting for WalReceiverManager..." + ); + if let Some(walreceiver) = walreceiver { walreceiver.cancel(); } - - // Since we have shut down WAL ingest, we should not let anyone start waiting for the LSN to advance + // ... and inform any waiters for newer LSNs that there won't be any. self.last_record_lsn.shutdown(); - // now all writers to InMemory layer are gone, do the final flush if requested - match self.freeze_and_flush().await { - Ok(_) => { - // drain the upload queue - if let Some(client) = self.remote_client.as_ref() { - // if we did not wait for completion here, it might be our shutdown process - // didn't wait for remote uploads to complete at all, as new tasks can forever - // be spawned. - // - // what is problematic is the shutting down of RemoteTimelineClient, because - // obviously it does not make sense to stop while we wait for it, but what - // about corner cases like s3 suddenly hanging up? 
- client.shutdown().await; + if try_freeze_and_flush { + // we shut down walreceiver above, so, we won't add anything more + // to the InMemoryLayer; freeze it and wait for all frozen layers + // to reach the disk & upload queue, then shut the upload queue and + // wait for it to drain. + match self.freeze_and_flush().await { + Ok(_) => { + // drain the upload queue + if let Some(client) = self.remote_client.as_ref() { + // if we did not wait for completion here, it might be our shutdown process + // didn't wait for remote uploads to complete at all, as new tasks can forever + // be spawned. + // + // what is problematic is the shutting down of RemoteTimelineClient, because + // obviously it does not make sense to stop while we wait for it, but what + // about corner cases like s3 suddenly hanging up? + client.shutdown().await; + } + } + Err(e) => { + // Non-fatal. Shutdown is infallible. Failures to flush just mean that + // we have some extra WAL replay to do next time the timeline starts. + warn!("failed to freeze and flush: {e:#}"); } } - Err(e) => { - // Non-fatal. Shutdown is infallible. Failures to flush just mean that - // we have some extra WAL replay to do next time the timeline starts. - warn!("failed to freeze and flush: {e:#}"); - } } - self.shutdown().await; - } - - /// Shut down immediately, without waiting for any open layers to flush to disk. This is a subset of - /// the graceful [`Timeline::flush_and_shutdown`] function. - pub(crate) async fn shutdown(&self) { - debug_assert_current_span_has_tenant_and_timeline_id(); - // Signal any subscribers to our cancellation token to drop out tracing::debug!("Cancelling CancellationToken"); self.cancel.cancel(); - // Page request handlers might be waiting for LSN to advance: they do not respect Timeline::cancel - // while doing so. - self.last_record_lsn.shutdown(); - - // Shut down the layer flush task before the remote client, as one depends on the other - task_mgr::shutdown_tasks( - Some(TaskKind::LayerFlushTask), - Some(self.tenant_shard_id), - Some(self.timeline_id), - ) - .await; - - // Shut down remote timeline client: this gracefully moves its metadata into its Stopping state in - // case our caller wants to use that for a deletion + // Transition the remote_client into a state where it's only useful for timeline deletion. + // (The deletion use case is why we can't just hook up remote_client to Self::cancel).) if let Some(remote_client) = self.remote_client.as_ref() { remote_client.stop(); + // As documented in remote_client.stop()'s doc comment, it's our responsibility + // to shut down the upload queue tasks. + // TODO: fix that, task management should be encapsulated inside remote_client. + task_mgr::shutdown_tasks( + Some(TaskKind::RemoteUploadTask), + Some(self.tenant_shard_id), + Some(self.timeline_id), + ) + .await; } + // TODO: work toward making this a no-op. See this funciton's doc comment for more context. tracing::debug!("Waiting for tasks..."); - task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), Some(self.timeline_id)).await; - // Finally wait until any gate-holders are complete + // Finally wait until any gate-holders are complete. + // + // TODO: once above shutdown_tasks is a no-op, we can close the gate before calling shutdown_tasks + // and use a TBD variant of shutdown_tasks that asserts that there were no tasks left. 
self.gate.close().await; self.metrics.shutdown(); @@ -2475,10 +2521,6 @@ impl Timeline { debug!("cancelling logical size calculation for timeline shutdown"); calculation.await } - _ = task_mgr::shutdown_watcher() => { - debug!("cancelling logical size calculation for task shutdown"); - calculation.await - } } } @@ -3162,16 +3204,11 @@ impl Timeline { loop { tokio::select! { _ = self.cancel.cancelled() => { - info!("shutting down layer flush task"); - break; - }, - _ = task_mgr::shutdown_watcher() => { - info!("shutting down layer flush task"); + info!("shutting down layer flush task due to Timeline::cancel"); break; }, _ = layer_flush_start_rx.changed() => {} } - trace!("waking up"); let flush_counter = *layer_flush_start_rx.borrow(); let result = loop { diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index c7f815d179..af10c1c84b 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -14,7 +14,6 @@ use crate::{ deletion_queue::DeletionQueueClient, task_mgr::{self, TaskKind}, tenant::{ - debug_assert_current_span_has_tenant_and_timeline_id, metadata::TimelineMetadata, remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient}, CreateTimelineCause, DeleteTimelineError, Tenant, @@ -23,42 +22,6 @@ use crate::{ use super::{Timeline, TimelineResources}; -/// Now that the Timeline is in Stopping state, request all the related tasks to shut down. -async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> { - debug_assert_current_span_has_tenant_and_timeline_id(); - // Notify any timeline work to drop out of loops/requests - tracing::debug!("Cancelling CancellationToken"); - timeline.cancel.cancel(); - - // Prevent new uploads from starting. - if let Some(remote_client) = timeline.remote_client.as_ref() { - remote_client.stop(); - } - - // Stop & wait for the remaining timeline tasks, including upload tasks. - // NB: This and other delete_timeline calls do not run as a task_mgr task, - // so, they are not affected by this shutdown_tasks() call. - info!("waiting for timeline tasks to shutdown"); - task_mgr::shutdown_tasks( - None, - Some(timeline.tenant_shard_id), - Some(timeline.timeline_id), - ) - .await; - - fail::fail_point!("timeline-delete-before-index-deleted-at", |_| { - Err(anyhow::anyhow!( - "failpoint: timeline-delete-before-index-deleted-at" - ))? - }); - - tracing::debug!("Waiting for gate..."); - timeline.gate.close().await; - tracing::debug!("Shutdown complete"); - - Ok(()) -} - /// Mark timeline as deleted in S3 so we won't pick it up next time /// during attach or pageserver restart. /// See comment in persist_index_part_with_deleted_flag. @@ -252,7 +215,14 @@ impl DeleteTimelineFlow { guard.mark_in_progress()?; - stop_tasks(&timeline).await?; + // Now that the Timeline is in Stopping state, request all the related tasks to shut down. + timeline.shutdown(super::ShutdownMode::Hard).await; + + fail::fail_point!("timeline-delete-before-index-deleted-at", |_| { + Err(anyhow::anyhow!( + "failpoint: timeline-delete-before-index-deleted-at" + ))? 
+ }); set_deleted_in_remote_index(&timeline).await?; diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index ebcd70bd39..522c5b57de 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -67,20 +67,19 @@ impl Timeline { ), false, async move { - let cancel = task_mgr::shutdown_token(); tokio::select! { - _ = cancel.cancelled() => { return Ok(()); } + _ = self_clone.cancel.cancelled() => { return Ok(()); } _ = completion::Barrier::maybe_wait(background_tasks_can_start) => {} }; - self_clone.eviction_task(parent, cancel).await; + self_clone.eviction_task(parent).await; Ok(()) }, ); } #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))] - async fn eviction_task(self: Arc, tenant: Arc, cancel: CancellationToken) { + async fn eviction_task(self: Arc, tenant: Arc) { use crate::tenant::tasks::random_init_delay; // acquire the gate guard only once within a useful span @@ -95,7 +94,7 @@ impl Timeline { EvictionPolicy::OnlyImitiate(lat) => lat.period, EvictionPolicy::NoEviction => Duration::from_secs(10), }; - if random_init_delay(period, &cancel).await.is_err() { + if random_init_delay(period, &self.cancel).await.is_err() { return; } } @@ -104,13 +103,13 @@ impl Timeline { loop { let policy = self.get_eviction_policy(); let cf = self - .eviction_iteration(&tenant, &policy, &cancel, &guard, &ctx) + .eviction_iteration(&tenant, &policy, &self.cancel, &guard, &ctx) .await; match cf { ControlFlow::Break(()) => break, ControlFlow::Continue(sleep_until) => { - if tokio::time::timeout_at(sleep_until, cancel.cancelled()) + if tokio::time::timeout_at(sleep_until, self.cancel.cancelled()) .await .is_ok() { From 40852b955d5d35cd70a229f2639658c4eab1f867 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 4 Apr 2024 09:55:43 +0100 Subject: [PATCH 0515/1571] update ordered-multimap (#7306) ## Problem ordered-multimap was yanked ## Summary of changes `cargo update -p ordered-multimap` --- Cargo.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ecc69f7048..7fef2ebf22 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2235,9 +2235,9 @@ dependencies = [ [[package]] name = "h2" -version = "0.3.24" +version = "0.3.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb2c4422095b67ee78da96fbb51a4cc413b3b25883c7717ff7ca1ab31022c9c9" +checksum = "81fe527a889e1532da5c525686d96d4c2e74cdd345badf8dfef9f6b39dd5f5e8" dependencies = [ "bytes", "fnv", @@ -3436,9 +3436,9 @@ dependencies = [ [[package]] name = "ordered-multimap" -version = "0.7.1" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4d6a8c22fc714f0c2373e6091bf6f5e9b37b1bc0b1184874b7e0a4e303d318f" +checksum = "49203cdcae0030493bad186b28da2fa25645fa276a51b6fec8010d281e02ef79" dependencies = [ "dlv-list", "hashbrown 0.14.0", From c5f64fe54fb3329d950a39a03f14d17918f936b2 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 4 Apr 2024 10:45:14 +0100 Subject: [PATCH 0516/1571] tests: reinstate some syntethic size tests (#7294) ## Problem `test_empty_tenant_size` was marked `xfail` and a few other tests were skipped. ## Summary of changes Stabilise `test_empty_tenant_size`. This test attempted to disable checkpointing for the postgres instance and expected that the synthetic size remains stable for an empty tenant. 
When debugging I noticed that postgres *was* issuing a checkpoint after the transaction in the test (perhaps something changed since the test was introduced). Hence, I relaxed the size check to allow for the checkpoint key written on the pageserver. Also removed the checks for synthetic size inputs since the expected values differ between postgres versions. Closes https://github.com/neondatabase/neon/issues/7138 --- test_runner/regress/test_tenant_size.py | 77 ++++++------------------- 1 file changed, 17 insertions(+), 60 deletions(-) diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 025cc930d7..4c8fd4b0e5 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -20,9 +20,10 @@ from fixtures.pg_version import PgVersion from fixtures.types import Lsn, TenantId, TimelineId -@pytest.mark.xfail -def test_empty_tenant_size(neon_simple_env: NeonEnv, test_output_dir: Path): - env = neon_simple_env +def test_empty_tenant_size(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_configs() + env.start() + (tenant_id, _) = env.neon_cli.create_tenant() http_client = env.pageserver.http_client() initial_size = http_client.tenant_size(tenant_id) @@ -35,66 +36,25 @@ def test_empty_tenant_size(neon_simple_env: NeonEnv, test_output_dir: Path): branch_name, main_timeline_id = env.neon_cli.list_timelines(tenant_id)[0] assert branch_name == main_branch_name - with env.endpoints.create_start( + endpoint = env.endpoints.create_start( main_branch_name, tenant_id=tenant_id, config_lines=["autovacuum=off", "checkpoint_timeout=10min"], - ) as endpoint: - with endpoint.cursor() as cur: - cur.execute("SELECT 1") - row = cur.fetchone() - assert row is not None - assert row[0] == 1 - size = http_client.tenant_size(tenant_id) - # we've disabled the autovacuum and checkpoint - # so background processes should not change the size. - # If this test will flake we should probably loosen the check - assert ( - size == initial_size - ), f"starting idle compute should not change the tenant size (Currently {size}, expected {initial_size})" + ) - # the size should be the same, until we increase the size over the - # gc_horizon - size, inputs = http_client.tenant_size_and_modelinputs(tenant_id) - assert ( - size == initial_size - ), f"tenant_size should not be affected by shutdown of compute (Currently {size}, expected {initial_size})" + with endpoint.cursor() as cur: + cur.execute("SELECT 1") + row = cur.fetchone() + assert row is not None + assert row[0] == 1 - expected_inputs = { - "segments": [ - { - "segment": {"parent": None, "lsn": 23694408, "size": 25362432, "needed": True}, - "timeline_id": f"{main_timeline_id}", - "kind": "BranchStart", - }, - { - "segment": {"parent": 0, "lsn": 23694528, "size": None, "needed": True}, - "timeline_id": f"{main_timeline_id}", - "kind": "BranchEnd", - }, - ], - "timeline_inputs": [ - { - "timeline_id": f"{main_timeline_id}", - "ancestor_id": None, - "ancestor_lsn": "0/0", - "last_record": "0/1698CC0", - "latest_gc_cutoff": "0/1698C48", - "horizon_cutoff": "0/0", - "pitr_cutoff": "0/0", - "next_gc_cutoff": "0/0", - "retention_param_cutoff": None, - } - ], - } - expected_inputs = mask_model_inputs(expected_inputs) - actual_inputs = mask_model_inputs(inputs) + # The transaction above will make the compute generate a checkpoint. + # In turn, the pageserver persists the checkpoint. This should only be + # one key with a size of a couple hundred bytes. 
+ wait_for_last_flush_lsn(env, endpoint, tenant_id, main_timeline_id) + size = http_client.tenant_size(tenant_id) - assert expected_inputs == actual_inputs - - size_debug_file = open(test_output_dir / "size_debug.html", "w") - size_debug = http_client.tenant_size_debug(tenant_id) - size_debug_file.write(size_debug) + assert size >= initial_size and size - initial_size < 1024 def test_branched_empty_timeline_size(neon_simple_env: NeonEnv, test_output_dir: Path): @@ -190,7 +150,6 @@ def test_branched_from_many_empty_parents_size(neon_simple_env: NeonEnv, test_ou size_debug_file.write(size_debug) -@pytest.mark.skip("This should work, but is left out because assumed covered by other tests") def test_branch_point_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path): """ gc_horizon = 15 @@ -233,7 +192,6 @@ def test_branch_point_within_horizon(neon_simple_env: NeonEnv, test_output_dir: size_debug_file.write(size_debug) -@pytest.mark.skip("This should work, but is left out because assumed covered by other tests") def test_parent_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path): """ gc_horizon = 5 @@ -282,7 +240,6 @@ def test_parent_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path): size_debug_file.write(size_debug) -@pytest.mark.skip("This should work, but is left out because assumed covered by other tests") def test_only_heads_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path): """ gc_horizon = small From ae15acdee7d435d8fc61036227dde02ca7fa7462 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 4 Apr 2024 13:28:22 +0300 Subject: [PATCH 0517/1571] Fix bug in prefetch cleanup (#7277) ## Problem Running test_pageserver_restarts_under_workload in POR #7275 I get the following assertion failure in prefetch: ``` #5 0x00005587220d4bf0 in ExceptionalCondition ( conditionName=0x7fbf24d003c8 "(ring_index) < MyPState->ring_unused && (ring_index) >= MyPState->ring_last", fileName=0x7fbf24d00240 "/home/knizhnik/neon.main//pgxn/neon/pagestore_smgr.c", lineNumber=644) at /home/knizhnik/neon.main//vendor/postgres-v16/src/backend/utils/error/assert.c:66 #6 0x00007fbf24cebc9b in prefetch_set_unused (ring_index=1509) at /home/knizhnik/neon.main//pgxn/neon/pagestore_smgr.c:644 #7 0x00007fbf24cec613 in prefetch_register_buffer (tag=..., force_latest=0x0, force_lsn=0x0) at /home/knizhnik/neon.main//pgxn/neon/pagestore_smgr.c:891 #8 0x00007fbf24cef21e in neon_prefetch (reln=0x5587233b7388, forknum=MAIN_FORKNUM, blocknum=14110) at /home/knizhnik/neon.main//pgxn/neon/pagestore_smgr.c:2055 (gdb) p ring_index $1 = 1509 (gdb) p MyPState->ring_unused $2 = 1636 (gdb) p MyPState->ring_last $3 = 1636 ``` ## Summary of changes Check status of `prefetch_wait_for` ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik --- pgxn/neon/libpagestore.c | 21 +++++++++++---------- pgxn/neon/pagestore_smgr.c | 18 +++++++++++------- 2 files changed, 22 insertions(+), 17 deletions(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 1bc8a2e87c..2276b4e807 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -495,16 +495,17 @@ retry: static void pageserver_disconnect(shardno_t shard_no) { - if (page_servers[shard_no].conn) - { - /* - * If the connection to any pageserver is lost, we throw away the - * whole prefetch queue, even for other pageservers. It should not - * cause big problems, because connection loss is supposed to be a - * rare event. - */ - prefetch_on_ps_disconnect(); - } + /* + * If the connection to any pageserver is lost, we throw away the + * whole prefetch queue, even for other pageservers. It should not + * cause big problems, because connection loss is supposed to be a + * rare event. + * + * Prefetch state should be reset even if page_servers[shard_no].conn == NULL, + * because prefetch request may be registered before connection is established. + */ + prefetch_on_ps_disconnect(); + pageserver_disconnect_shard(shard_no); } diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index b33cfab2bb..57a16e00ca 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -641,13 +641,12 @@ prefetch_on_ps_disconnect(void) static inline void prefetch_set_unused(uint64 ring_index) { - PrefetchRequest *slot = GetPrfSlot(ring_index); + PrefetchRequest *slot; if (ring_index < MyPState->ring_last) return; /* Should already be unused */ - Assert(MyPState->ring_unused > ring_index); - + slot = GetPrfSlot(ring_index); if (slot->status == PRFS_UNUSED) return; @@ -806,7 +805,8 @@ Retry: { if (*force_lsn > slot->effective_request_lsn) { - prefetch_wait_for(ring_index); + if (!prefetch_wait_for(ring_index)) + goto Retry; prefetch_set_unused(ring_index); entry = NULL; } @@ -821,7 +821,8 @@ Retry: { if (*force_lsn != slot->effective_request_lsn) { - prefetch_wait_for(ring_index); + if (!prefetch_wait_for(ring_index)) + goto Retry; prefetch_set_unused(ring_index); entry = NULL; } @@ -887,7 +888,8 @@ Retry: { case PRFS_REQUESTED: Assert(MyPState->ring_receive == cleanup_index); - prefetch_wait_for(cleanup_index); + if (!prefetch_wait_for(cleanup_index)) + goto Retry; prefetch_set_unused(cleanup_index); break; case PRFS_RECEIVED: @@ -2140,6 +2142,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, /* * Try to find prefetched page in the list of received pages. */ + Retry: entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag); if (entry != NULL) @@ -2161,7 +2164,8 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, */ if (slot->status == PRFS_REQUESTED) { - prefetch_wait_for(slot->my_ring_index); + if (!prefetch_wait_for(slot->my_ring_index)) + goto Retry; } /* drop caches */ prefetch_set_unused(slot->my_ring_index); From 7ce613354e5230ab51a81ddb092c52d9e13810f3 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Thu, 4 Apr 2024 12:29:10 +0200 Subject: [PATCH 0518/1571] Fix length (#7308) ## Problem Bug ## Summary of changes Use `compressed_data.len()` instead of `data.len()`. 
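The invariant behind the one-word fix: the size passed alongside a streamed body must describe exactly the bytes the stream will yield, i.e. the compressed buffer, not the buffer it was derived from. A self-contained sketch of that invariant (names and payload are illustrative, not the proxy's real upload API):

```rust
// Illustrative only: whatever length is declared up front must equal the bytes
// the stream actually yields, otherwise the upload disagrees with its metadata.
use bytes::Bytes;
use futures::StreamExt;

async fn declared_len_matches_streamed_len() {
    let compressed_data = Bytes::from_static(b"...compressed payload...");
    let stream = futures::stream::once(futures::future::ready(Ok::<_, std::io::Error>(
        compressed_data.clone(),
    )));

    let declared_len = compressed_data.len(); // not the pre-compression buffer's len()
    let streamed_len: usize = stream
        .filter_map(|chunk| async move { chunk.ok().map(|b| b.len()) })
        .fold(0usize, |acc, n| async move { acc + n })
        .await;

    assert_eq!(declared_len, streamed_len);
}
```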
--- proxy/src/usage_metrics.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index 2ad0883fb0..b21056735d 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -461,7 +461,7 @@ async fn upload_events_chunk( || async { let stream = futures::stream::once(futures::future::ready(Ok(compressed_data.clone()))); storage - .upload(stream, data.len(), remote_path, None, cancel) + .upload(stream, compressed_data.len(), remote_path, None, cancel) .await }, TimeoutOrCancel::caused_by_cancel, From 375e15815c2d4adc6b435dafeb1218ad47c28a6a Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 4 Apr 2024 12:22:08 +0100 Subject: [PATCH 0519/1571] storage controller: grant 'admin' access to all APIs (#7307) ## Problem Currently, using `storcon-cli` requires user to select a token with either `pageserverapi` or `admin` scope depending on which endpoint they're using. ## Summary of changes - In check_permissions, permit access with the admin scope even if the required scope is missing. The effect is that an endpoint that required `pageserverapi` now accepts either `pageserverapi` or `admin`, and for the CLI one can simply use an `admin` scope token for everything. --- control_plane/attachment_service/src/http.rs | 10 +++++++++- test_runner/regress/test_sharding_service.py | 7 ++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs index 03883f0ca2..c59bcaa174 100644 --- a/control_plane/attachment_service/src/http.rs +++ b/control_plane/attachment_service/src/http.rs @@ -602,9 +602,17 @@ where .await } +/// Check if the required scope is held in the request's token, or if the request has +/// a token with 'admin' scope then always permit it. fn check_permissions(request: &Request, required_scope: Scope) -> Result<(), ApiError> { check_permission_with(request, |claims| { - crate::auth::check_permission(claims, required_scope) + match crate::auth::check_permission(claims, required_scope) { + Err(e) => match crate::auth::check_permission(claims, Scope::Admin) { + Ok(()) => Ok(()), + Err(_) => Err(e), + }, + Ok(()) => Ok(()), + } }) } diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py index 7df0b58596..233d3b9603 100644 --- a/test_runner/regress/test_sharding_service.py +++ b/test_runner/regress/test_sharding_service.py @@ -724,13 +724,18 @@ def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder): StorageControllerApiException, match="Forbidden: JWT authentication error", ): - svc.request("POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.ADMIN)) + svc.request( + "POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.SAFEKEEPER_DATA) + ) # Token with correct scope svc.request( "POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.PAGE_SERVER_API) ) + # Token with admin scope should also be permitted + svc.request("POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.ADMIN)) + # No token with pytest.raises( StorageControllerApiException, From 9d754e984f81dbaaf996f2f19e5756847dc8f508 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 4 Apr 2024 13:41:04 +0100 Subject: [PATCH 0520/1571] storage_controller: setup sentry reporting (#7311) ## Problem No alerting for storage controller is in place. ## Summary of changes Set up sentry for the storage controller. 
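For orientation, this is the underlying `sentry` crate idiom that such a wrapper typically builds on; a rough sketch only, not Neon's actual `utils::sentry_init` implementation (the DSN source and options here are assumptions):

```rust
// Rough sketch of plain `sentry` crate usage. The returned guard flushes pending
// events when dropped, so it must stay alive for the whole process lifetime;
// that is why it is bound in main() before the async runtime is built.
fn main() {
    let _sentry_guard = sentry::init((
        std::env::var("SENTRY_DSN").unwrap_or_default(), // empty DSN => reporting disabled
        sentry::ClientOptions {
            release: sentry::release_name!(),
            ..Default::default()
        },
    ));

    // ... construct the tokio runtime and run the service, as in the diff below ...
}
```

Initializing before the runtime also means panics during startup should still be captured and reported.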
--- control_plane/attachment_service/src/main.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs index bd8d7f5c59..5150468537 100644 --- a/control_plane/attachment_service/src/main.rs +++ b/control_plane/attachment_service/src/main.rs @@ -13,6 +13,7 @@ use tokio_util::sync::CancellationToken; use utils::auth::{JwtAuth, SwappableJwtAuth}; use utils::logging::{self, LogFormat}; +use utils::sentry_init::init_sentry; use utils::{project_build_tag, project_git_version, tcp_listener}; project_git_version!(GIT_VERSION); @@ -158,6 +159,8 @@ fn main() -> anyhow::Result<()> { std::process::exit(1); })); + let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); + tokio::runtime::Builder::new_current_thread() // We use spawn_blocking for database operations, so require approximately // as many blocking threads as we will open database connections. From 4810c22607ee020ddbb1408032aaf0f0d35bc6ca Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 4 Apr 2024 17:54:14 +0200 Subject: [PATCH 0521/1571] fix(walredo spawn): coalescing stalls other executors std::sync::RwLock (#7310) part of #6628 Before this PR, we used a std::sync::RwLock to coalesce multiple callers on one walredo spawning. One thread would win the write lock and others would queue up either at the read() or write() lock call. In a scenario where a compute initiates multiple getpage requests from different Postgres backends (= different page_service conns), and we don't have a walredo process around, this means all these page_service handler tasks will enter the spawning code path, one of them will do the spawning, and the others will stall their respective executor thread because they do a blocking read()/write() lock call. I don't know exactly how bad the impact is in reality because posix_spawn uses CLONE_VFORK under the hood, which means that the entire parent process stalls anyway until the child does `exec`, which in turn resumes the parent. But, anyway, we won't know until we fix this issue. And, there's definitely a future way out of stalling the pageserver on posix_spawn, namely, forking template walredo processes that fork again when they need to be per-tenant. This idea is tracked in https://github.com/neondatabase/neon/issues/7320. Changes ------- This PR fixes that scenario by switching to use `heavier_once_cell` for coalescing. There is a comment on the struct field that explains it in a bit more nuance. ### Alternative Design An alternative would be to use tokio::sync::RwLock. I did this in the first commit in this PR branch, before switching to `heavier_once_cell`. Performance ----------- I re-ran the `bench_walredo` and updated the results, showing that the changes are neglible. For the record, the earlier commit in this PR branch that uses `tokio::sync::RwLock` also has updated benchmark numbers, and the results / kinds of tiny regression were equivalent to `heavier_once_cell`. Note that the above doesn't measure performance on the cold path, i.e., when we need to launch the process and coalesce. We don't have a benchmark for that, and I don't expect any significant changes. We have metrics and we log spawn latency, so, we can monitor it in staging & prod. Risks ----- As "usual", replacing a std::sync primitive with something that yields to the executor risks exposing concurrency that was previously implicitly limited to the number of executor threads. This would be the first one for walredo. 
The risk is that we get descheduled while the reconstruct data is already there. That could pile up reconstruct data. In practice, I think the risk is low because once we get scheduled again, we'll likely have a walredo process ready, and there is no further await point until walredo is complete and the reconstruct data has been dropped. This will change with async walredo PR #6548, and I'm well aware of it in that PR. --- pageserver/benches/bench_walredo.rs | 34 +++---- pageserver/src/walredo.rs | 136 +++++++++++++++------------- 2 files changed, 88 insertions(+), 82 deletions(-) diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index 3efad546a6..ffe607be4b 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -27,25 +27,25 @@ //! //! # Reference Numbers //! -//! 2024-03-20 on i3en.3xlarge +//! 2024-04-04 on i3en.3xlarge //! //! ```text -//! short/1 time: [26.483 µs 26.614 µs 26.767 µs] -//! short/2 time: [32.223 µs 32.465 µs 32.767 µs] -//! short/4 time: [47.203 µs 47.583 µs 47.984 µs] -//! short/8 time: [89.135 µs 89.612 µs 90.139 µs] -//! short/16 time: [190.12 µs 191.52 µs 192.88 µs] -//! short/32 time: [380.96 µs 382.63 µs 384.20 µs] -//! short/64 time: [736.86 µs 741.07 µs 745.03 µs] -//! short/128 time: [1.4106 ms 1.4206 ms 1.4294 ms] -//! medium/1 time: [111.81 µs 112.25 µs 112.79 µs] -//! medium/2 time: [158.26 µs 159.13 µs 160.21 µs] -//! medium/4 time: [334.65 µs 337.14 µs 340.07 µs] -//! medium/8 time: [675.32 µs 679.91 µs 685.25 µs] -//! medium/16 time: [1.2929 ms 1.2996 ms 1.3067 ms] -//! medium/32 time: [2.4295 ms 2.4461 ms 2.4623 ms] -//! medium/64 time: [4.3973 ms 4.4458 ms 4.4875 ms] -//! medium/128 time: [7.5955 ms 7.7847 ms 7.9481 ms] +//! short/1 time: [25.925 µs 26.060 µs 26.209 µs] +//! short/2 time: [31.277 µs 31.483 µs 31.722 µs] +//! short/4 time: [45.496 µs 45.831 µs 46.182 µs] +//! short/8 time: [84.298 µs 84.920 µs 85.566 µs] +//! short/16 time: [185.04 µs 186.41 µs 187.88 µs] +//! short/32 time: [385.01 µs 386.77 µs 388.70 µs] +//! short/64 time: [770.24 µs 773.04 µs 776.04 µs] +//! short/128 time: [1.5017 ms 1.5064 ms 1.5113 ms] +//! medium/1 time: [106.65 µs 107.20 µs 107.85 µs] +//! medium/2 time: [153.28 µs 154.24 µs 155.56 µs] +//! medium/4 time: [325.67 µs 327.01 µs 328.71 µs] +//! medium/8 time: [646.82 µs 650.17 µs 653.91 µs] +//! medium/16 time: [1.2645 ms 1.2701 ms 1.2762 ms] +//! medium/32 time: [2.4409 ms 2.4550 ms 2.4692 ms] +//! medium/64 time: [4.6814 ms 4.7114 ms 4.7408 ms] +//! medium/128 time: [8.7790 ms 8.9037 ms 9.0282 ms] //! ``` use bytes::{Buf, Bytes}; diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 0004f4f3c9..ca41a576fd 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -36,11 +36,12 @@ use bytes::{Bytes, BytesMut}; use pageserver_api::key::key_to_rel_block; use pageserver_api::models::WalRedoManagerStatus; use pageserver_api::shard::TenantShardId; -use std::sync::{Arc, RwLock}; +use std::sync::Arc; use std::time::Duration; use std::time::Instant; use tracing::*; use utils::lsn::Lsn; +use utils::sync::heavier_once_cell; /// /// This is the real implementation that uses a Postgres process to @@ -53,7 +54,19 @@ pub struct PostgresRedoManager { tenant_shard_id: TenantShardId, conf: &'static PageServerConf, last_redo_at: std::sync::Mutex>, - redo_process: RwLock>>, + /// The current [`process::WalRedoProcess`] that is used by new redo requests. 
+ /// We use [`heavier_once_cell`] for coalescing the spawning, but the redo + /// requests don't use the [`heavier_once_cell::Guard`] to keep ahold of the + /// their process object; we use [`Arc::clone`] for that. + /// This is primarily because earlier implementations that didn't use [`heavier_once_cell`] + /// had that behavior; it's probably unnecessary. + /// The only merit of it is that if one walredo process encounters an error, + /// it can take it out of rotation (= using [`heavier_once_cell::Guard::take_and_deinit`]. + /// and retry redo, thereby starting the new process, while other redo tasks might + /// still be using the old redo process. But, those other tasks will most likely + /// encounter an error as well, and errors are an unexpected condition anyway. + /// So, probably we could get rid of the `Arc` in the future. + redo_process: heavier_once_cell::OnceCell>, } /// @@ -101,6 +114,7 @@ impl PostgresRedoManager { self.conf.wal_redo_timeout, pg_version, ) + .await }; img = Some(result?); @@ -121,6 +135,7 @@ impl PostgresRedoManager { self.conf.wal_redo_timeout, pg_version, ) + .await } } @@ -134,7 +149,7 @@ impl PostgresRedoManager { chrono::Utc::now().checked_sub_signed(chrono::Duration::from_std(age).ok()?) }) }, - pid: self.redo_process.read().unwrap().as_ref().map(|p| p.id()), + pid: self.redo_process.get().map(|p| p.id()), }) } } @@ -152,7 +167,7 @@ impl PostgresRedoManager { tenant_shard_id, conf, last_redo_at: std::sync::Mutex::default(), - redo_process: RwLock::new(None), + redo_process: heavier_once_cell::OnceCell::default(), } } @@ -164,8 +179,7 @@ impl PostgresRedoManager { if let Some(last_redo_at) = *g { if last_redo_at.elapsed() >= idle_timeout { drop(g); - let mut guard = self.redo_process.write().unwrap(); - *guard = None; + drop(self.redo_process.get().map(|guard| guard.take_and_deinit())); } } } @@ -174,8 +188,11 @@ impl PostgresRedoManager { /// /// Process one request for WAL redo using wal-redo postgres /// + /// # Cancel-Safety + /// + /// Cancellation safe. 
#[allow(clippy::too_many_arguments)] - fn apply_batch_postgres( + async fn apply_batch_postgres( &self, key: Key, lsn: Lsn, @@ -191,42 +208,31 @@ impl PostgresRedoManager { const MAX_RETRY_ATTEMPTS: u32 = 1; let mut n_attempts = 0u32; loop { - // launch the WAL redo process on first use - let proc: Arc = { - let proc_guard = self.redo_process.read().unwrap(); - match &*proc_guard { - None => { - // "upgrade" to write lock to launch the process - drop(proc_guard); - let mut proc_guard = self.redo_process.write().unwrap(); - match &*proc_guard { - None => { - let start = Instant::now(); - let proc = Arc::new( - process::WalRedoProcess::launch( - self.conf, - self.tenant_shard_id, - pg_version, - ) - .context("launch walredo process")?, - ); - let duration = start.elapsed(); - WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM - .observe(duration.as_secs_f64()); - info!( - duration_ms = duration.as_millis(), - pid = proc.id(), - "launched walredo process" - ); - *proc_guard = Some(Arc::clone(&proc)); - proc - } - Some(proc) => Arc::clone(proc), - } + let proc: Arc = + match self.redo_process.get_or_init_detached().await { + Ok(guard) => Arc::clone(&guard), + Err(permit) => { + // don't hold poison_guard, the launch code can bail + let start = Instant::now(); + let proc = Arc::new( + process::WalRedoProcess::launch( + self.conf, + self.tenant_shard_id, + pg_version, + ) + .context("launch walredo process")?, + ); + let duration = start.elapsed(); + WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64()); + info!( + duration_ms = duration.as_millis(), + pid = proc.id(), + "launched walredo process" + ); + self.redo_process.set(Arc::clone(&proc), permit); + proc } - Some(proc) => Arc::clone(proc), - } - }; + }; let started_at = std::time::Instant::now(); @@ -272,34 +278,34 @@ impl PostgresRedoManager { n_attempts, e, ); - // Avoid concurrent callers hitting the same issue. - // We can't prevent it from happening because we want to enable parallelism. - { - let mut guard = self.redo_process.write().unwrap(); - match &*guard { - Some(current_field_value) => { - if Arc::ptr_eq(current_field_value, &proc) { - // We're the first to observe an error from `proc`, it's our job to take it out of rotation. - *guard = None; - } - } - None => { - // Another thread was faster to observe the error, and already took the process out of rotation. - } - } - } + // Avoid concurrent callers hitting the same issue by taking `proc` out of the rotation. + // Note that there may be other tasks concurrent with us that also hold `proc`. + // We have to deal with that here. + // Also read the doc comment on field `self.redo_process`. + // // NB: there may still be other concurrent threads using `proc`. // The last one will send SIGKILL when the underlying Arc reaches refcount 0. - // NB: it's important to drop(proc) after drop(guard). Otherwise we'd keep - // holding the lock while waiting for the process to exit. - // NB: the drop impl blocks the current threads with a wait() system call for - // the child process. We dropped the `guard` above so that other threads aren't - // affected. But, it's good that the current thread _does_ block to wait. - // If we instead deferred the waiting into the background / to tokio, it could - // happen that if walredo always fails immediately, we spawn processes faster + // + // NB: the drop impl blocks the dropping thread with a wait() system call for + // the child process. 
In some ways the blocking is actually good: if we + // deferred the waiting into the background / to tokio if we used `tokio::process`, + // it could happen that if walredo always fails immediately, we spawn processes faster // than we can SIGKILL & `wait` for them to exit. By doing it the way we do here, // we limit this risk of run-away to at most $num_runtimes * $num_executor_threads. // This probably needs revisiting at some later point. + match self.redo_process.get() { + None => (), + Some(guard) => { + if Arc::ptr_eq(&proc, &*guard) { + // We're the first to observe an error from `proc`, it's our job to take it out of rotation. + guard.take_and_deinit(); + } else { + // Another task already spawned another redo process (further up in this method) + // and put it into `redo_process`. Do nothing, our view of the world is behind. + } + } + } + // The last task that does this `drop()` of `proc` will do a blocking `wait()` syscall. drop(proc); } else if n_attempts != 0 { info!(n_attempts, "retried walredo succeeded"); From 862a6b701883de4b74771b6bccc485ccdcdee1e2 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 4 Apr 2024 17:51:44 +0100 Subject: [PATCH 0522/1571] pageserver: timeout on deletion queue flush in timeline deletion (#7315) Some time ago, we had an issue where a deletion queue hang was also causing timeline deletions to hang. This was unnecessary because the timeline deletion doesn't _need_ to flush the deletion queue, it just does it as a pleasantry to make the behavior easier to understand and test. In this PR, we wrap the flush calls in a 10 second timeout (typically the flush takes milliseconds) so that in the event of issues with the deletion queue, timeline deletions are slower but not entirely blocked. Closes: https://github.com/neondatabase/neon/issues/6440 --- .../src/tenant/remote_timeline_client.rs | 31 +++++++++++++++++-- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 13fcd1a5e8..9b1b5e7ed5 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -200,6 +200,7 @@ use utils::backoff::{ use std::collections::{HashMap, VecDeque}; use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Mutex}; +use std::time::Duration; use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use std::ops::DerefMut; @@ -207,7 +208,7 @@ use tracing::{debug, error, info, instrument, warn}; use tracing::{info_span, Instrument}; use utils::lsn::Lsn; -use crate::deletion_queue::DeletionQueueClient; +use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError}; use crate::metrics::{ MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics, RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES, @@ -261,6 +262,10 @@ pub(crate) const INITDB_PRESERVED_PATH: &str = "initdb-preserved.tar.zst"; /// Default buffer size when interfacing with [`tokio::fs::File`]. pub(crate) const BUFFER_SIZE: usize = 32 * 1024; +/// Doing non-essential flushes of deletion queue is subject to this timeout, after +/// which we warn and skip. 
+const DELETION_QUEUE_FLUSH_TIMEOUT: Duration = Duration::from_secs(10); + pub enum MaybeDeletedIndexPart { IndexPart(IndexPart), Deleted(IndexPart), @@ -1050,6 +1055,26 @@ impl RemoteTimelineClient { Ok(()) } + async fn flush_deletion_queue(&self) -> Result<(), DeletionQueueError> { + match tokio::time::timeout( + DELETION_QUEUE_FLUSH_TIMEOUT, + self.deletion_queue_client.flush_immediate(), + ) + .await + { + Ok(result) => result, + Err(_timeout) => { + // Flushing remote deletions is not mandatory: we flush here to make the system easier to test, and + // to ensure that _usually_ objects are really gone after a DELETE is acked. However, in case of deletion + // queue issues (https://github.com/neondatabase/neon/issues/6440), we don't want to wait indefinitely here. + tracing::warn!( + "Timed out waiting for deletion queue flush, acking deletion anyway" + ); + Ok(()) + } + } + } + /// Prerequisites: UploadQueue should be in stopped state and deleted_at should be successfuly set. /// The function deletes layer files one by one, then lists the prefix to see if we leaked something /// deletes leaked files if any and proceeds with deletion of index file at the end. @@ -1099,7 +1124,7 @@ impl RemoteTimelineClient { // Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't // taking the burden of listing all the layers that we already know we should delete. - self.deletion_queue_client.flush_immediate().await?; + self.flush_deletion_queue().await?; let cancel = shutdown_token(); @@ -1173,7 +1198,7 @@ impl RemoteTimelineClient { // Timeline deletion is rare and we have probably emitted a reasonably number of objects: wait // for a flush to a persistent deletion list so that we may be sure deletion will occur. - self.deletion_queue_client.flush_immediate().await?; + self.flush_deletion_queue().await?; fail::fail_point!("timeline-delete-after-index-delete", |_| { Err(anyhow::anyhow!( From ac7fc6110bba250f17b494c604b717cf69e09ef1 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 4 Apr 2024 17:54:38 +0100 Subject: [PATCH 0523/1571] pageserver: handle WAL gaps on sharded tenants (#6788) ## Problem In the test for https://github.com/neondatabase/neon/pull/6776, a test cases uses tiny layer sizes and tiny stripe sizes. This hits a scenario where a shard's checkpoint interval spans a region where none of the content in the WAL is ingested by this shard. Since there is no layer to flush, we do not advance disk_consistent_lsn, and this causes the test to fail while waiting for LSN to advance. ## Summary of changes - Pass an LSN through `layer_flush_start_tx`. This is the LSN to which we have frozen at the time we ask the flush to flush layers frozen up to this point. - In the layer flush task, if the layers we flush do not reach `frozen_to_lsn`, then advance disk_consistent_lsn up to this point. - In `maybe_freeze_ephemeral_layer`, handle the case where last_record_lsn has advanced without writing a layer file: this ensures that disk_consistent_lsn and remote_consistent_lsn advance anyway. The net effect is that the disk_consistent_lsn is allowed to advance past regions in the WAL where a shard ingests no data, and that we uphold our guarantee that remote_consistent_lsn always eventually reaches the tip of the WAL. The case of no layer at all is hard to test at present due to >0 shards being polluted with SLRU writes, but I have tested it locally with a branch that disables SLRU writes on shards >0. 
We can tighten up the testing on this in future as/when we refine shard filtering (currently shards >0 need the SLRU because they use it to figure out cutoff in GC using timestamp-to-lsn). --- pageserver/src/tenant/timeline.rs | 141 +++++++++++++++--- .../src/tenant/timeline/layer_manager.rs | 8 +- test_runner/fixtures/workload.py | 5 + test_runner/regress/test_sharding.py | 102 ++++++++++++- 4 files changed, 225 insertions(+), 31 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index c5eda44b7d..d3c8c5f66c 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -282,10 +282,12 @@ pub struct Timeline { pub(super) flush_loop_state: Mutex, /// layer_flush_start_tx can be used to wake up the layer-flushing task. - /// The value is a counter, incremented every time a new flush cycle is requested. - /// The flush cycle counter is sent back on the layer_flush_done channel when - /// the flush finishes. You can use that to wait for the flush to finish. - layer_flush_start_tx: tokio::sync::watch::Sender, + /// - The u64 value is a counter, incremented every time a new flush cycle is requested. + /// The flush cycle counter is sent back on the layer_flush_done channel when + /// the flush finishes. You can use that to wait for the flush to finish. + /// - The LSN is updated to max() of its current value and the latest disk_consistent_lsn + /// read by whoever sends an update + layer_flush_start_tx: tokio::sync::watch::Sender<(u64, Lsn)>, /// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel layer_flush_done_tx: tokio::sync::watch::Sender<(u64, Result<(), FlushLayerError>)>, @@ -1169,8 +1171,8 @@ impl Timeline { /// Flush to disk all data that was written with the put_* functions #[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))] pub(crate) async fn freeze_and_flush(&self) -> anyhow::Result<()> { - self.freeze_inmem_layer(false).await; - self.flush_frozen_layers_and_wait().await + let to_lsn = self.freeze_inmem_layer(false).await; + self.flush_frozen_layers_and_wait(to_lsn).await } /// If there is no writer, and conditions for rolling the latest layer are met, then freeze it. @@ -1190,7 +1192,39 @@ impl Timeline { }; let Some(open_layer) = &layers_guard.layer_map().open_layer else { - // No open layer, no work to do. + // If there is no open layer, we have no layer freezing to do. However, we might need to generate + // some updates to disk_consistent_lsn and remote_consistent_lsn, in case we ingested some WAL regions + // that didn't result in writes to this shard. + + // Must not hold the layers lock while waiting for a flush. + drop(layers_guard); + + let last_record_lsn = self.get_last_record_lsn(); + let disk_consistent_lsn = self.get_disk_consistent_lsn(); + if last_record_lsn > disk_consistent_lsn { + // We have no open layer, but disk_consistent_lsn is behind the last record: this indicates + // we are a sharded tenant and have skipped some WAL + let last_freeze_ts = *self.last_freeze_ts.read().unwrap(); + if last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() { + // This should be somewhat rare, so we log it at INFO level. + // + // We checked for checkpoint timeout so that a shard without any + // data ingested (yet) doesn't write a remote index as soon as it + // sees its LSN advance: we only do this if we've been layer-less + // for some time. 
+ tracing::info!( + "Advancing disk_consistent_lsn past WAL ingest gap {} -> {}", + disk_consistent_lsn, + last_record_lsn + ); + + // The flush loop will update remote consistent LSN as well as disk consistent LSN. + self.flush_frozen_layers_and_wait(last_record_lsn) + .await + .ok(); + } + } + return; }; @@ -1769,7 +1803,7 @@ impl Timeline { let disk_consistent_lsn = metadata.disk_consistent_lsn(); let (state, _) = watch::channel(state); - let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0); + let (layer_flush_start_tx, _) = tokio::sync::watch::channel((0, disk_consistent_lsn)); let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(()))); let evictions_low_residence_duration_metric_threshold = { @@ -3174,7 +3208,9 @@ impl Timeline { self.last_record_lsn.advance(new_lsn); } - async fn freeze_inmem_layer(&self, write_lock_held: bool) { + /// Whether there was a layer to freeze or not, return the value of get_last_record_lsn + /// before we attempted the freeze: this guarantees that ingested data is frozen up to this lsn (inclusive). + async fn freeze_inmem_layer(&self, write_lock_held: bool) -> Lsn { // Freeze the current open in-memory layer. It will be written to disk on next // iteration. @@ -3184,7 +3220,9 @@ impl Timeline { Some(self.write_lock.lock().await) }; - self.freeze_inmem_layer_at(self.get_last_record_lsn()).await; + let to_lsn = self.get_last_record_lsn(); + self.freeze_inmem_layer_at(to_lsn).await; + to_lsn } async fn freeze_inmem_layer_at(&self, at: Lsn) { @@ -3197,7 +3235,7 @@ impl Timeline { /// Layer flusher task's main loop. async fn flush_loop( self: &Arc, - mut layer_flush_start_rx: tokio::sync::watch::Receiver, + mut layer_flush_start_rx: tokio::sync::watch::Receiver<(u64, Lsn)>, ctx: &RequestContext, ) { info!("started flush loop"); @@ -3210,7 +3248,11 @@ impl Timeline { _ = layer_flush_start_rx.changed() => {} } trace!("waking up"); - let flush_counter = *layer_flush_start_rx.borrow(); + let (flush_counter, frozen_to_lsn) = *layer_flush_start_rx.borrow(); + + // The highest LSN to which we flushed in the loop over frozen layers + let mut flushed_to_lsn = Lsn(0); + let result = loop { if self.cancel.is_cancelled() { info!("dropping out of flush loop for timeline shutdown"); @@ -3231,7 +3273,9 @@ impl Timeline { break Ok(()); }; match self.flush_frozen_layer(layer_to_flush, ctx).await { - Ok(()) => {} + Ok(this_layer_to_lsn) => { + flushed_to_lsn = std::cmp::max(flushed_to_lsn, this_layer_to_lsn); + } Err(FlushLayerError::Cancelled) => { info!("dropping out of flush loop for timeline shutdown"); return; @@ -3240,11 +3284,36 @@ impl Timeline { FlushLayerError::Other(_) | FlushLayerError::CreateImageLayersError(_), ) => { error!("could not flush frozen layer: {err:?}"); - break err; + break err.map(|_| ()); } } timer.stop_and_record(); }; + + // Unsharded tenants should never advance their LSN beyond the end of the + // highest layer they write: such gaps between layer data and the frozen LSN + // are only legal on sharded tenants. + debug_assert!( + self.shard_identity.count.count() > 1 + || flushed_to_lsn >= frozen_to_lsn + || !flushed_to_lsn.is_valid() + ); + + if flushed_to_lsn < frozen_to_lsn && self.shard_identity.count.count() > 1 { + // If our layer flushes didn't carry disk_consistent_lsn up to the `to_lsn` advertised + // to us via layer_flush_start_rx, then advance it here. + // + // This path is only taken for tenants with multiple shards: single sharded tenants should + // never encounter a gap in the wal. 
+ let old_disk_consistent_lsn = self.disk_consistent_lsn.load(); + tracing::debug!("Advancing disk_consistent_lsn across layer gap {old_disk_consistent_lsn}->{frozen_to_lsn}"); + if self.set_disk_consistent_lsn(frozen_to_lsn) { + if let Err(e) = self.schedule_uploads(frozen_to_lsn, vec![]) { + tracing::warn!("Failed to schedule metadata upload after updating disk_consistent_lsn: {e}"); + } + } + } + // Notify any listeners that we're done let _ = self .layer_flush_done_tx @@ -3252,7 +3321,13 @@ impl Timeline { } } - async fn flush_frozen_layers_and_wait(&self) -> anyhow::Result<()> { + /// Request the flush loop to write out all frozen layers up to `to_lsn` as Delta L0 files to disk. + /// The caller is responsible for the freezing, e.g., [`Self::freeze_inmem_layer`]. + /// + /// `last_record_lsn` may be higher than the highest LSN of a frozen layer: if this is the case, + /// it means no data will be written between the top of the highest frozen layer and to_lsn, + /// e.g. because this tenant shard has ingested up to to_lsn and not written any data locally for that part of the WAL. + async fn flush_frozen_layers_and_wait(&self, last_record_lsn: Lsn) -> anyhow::Result<()> { let mut rx = self.layer_flush_done_tx.subscribe(); // Increment the flush cycle counter and wake up the flush task. @@ -3266,9 +3341,10 @@ impl Timeline { anyhow::bail!("cannot flush frozen layers when flush_loop is not running, state is {flush_loop_state:?}") } - self.layer_flush_start_tx.send_modify(|counter| { + self.layer_flush_start_tx.send_modify(|(counter, lsn)| { my_flush_request = *counter + 1; *counter = my_flush_request; + *lsn = std::cmp::max(last_record_lsn, *lsn); }); loop { @@ -3305,16 +3381,22 @@ impl Timeline { } fn flush_frozen_layers(&self) { - self.layer_flush_start_tx.send_modify(|val| *val += 1); + self.layer_flush_start_tx.send_modify(|(counter, lsn)| { + *counter += 1; + + *lsn = std::cmp::max(*lsn, Lsn(self.last_freeze_at.load().0 - 1)); + }); } /// Flush one frozen in-memory layer to disk, as a new delta layer. + /// + /// Return value is the last lsn (inclusive) of the layer that was frozen. #[instrument(skip_all, fields(layer=%frozen_layer))] async fn flush_frozen_layer( self: &Arc, frozen_layer: Arc, ctx: &RequestContext, - ) -> Result<(), FlushLayerError> { + ) -> Result { debug_assert_current_span_has_tenant_and_timeline_id(); // As a special case, when we have just imported an image into the repository, @@ -3389,7 +3471,6 @@ impl Timeline { } let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1); - let old_disk_consistent_lsn = self.disk_consistent_lsn.load(); // The new on-disk layers are now in the layer map. We can remove the // in-memory layer from the map now. The flushed layer is stored in @@ -3403,10 +3484,7 @@ impl Timeline { guard.finish_flush_l0_layer(delta_layer_to_add.as_ref(), &frozen_layer, &self.metrics); - if disk_consistent_lsn != old_disk_consistent_lsn { - assert!(disk_consistent_lsn > old_disk_consistent_lsn); - self.disk_consistent_lsn.store(disk_consistent_lsn); - + if self.set_disk_consistent_lsn(disk_consistent_lsn) { // Schedule remote uploads that will reflect our new disk_consistent_lsn self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?; } @@ -3423,7 +3501,22 @@ impl Timeline { // This failpoint is used by another test case `test_pageserver_recovery`. 
fail_point!("flush-frozen-exit"); - Ok(()) + Ok(Lsn(lsn_range.end.0 - 1)) + } + + /// Return true if the value changed + /// + /// This function must only be used from the layer flush task, and may not be called concurrently. + fn set_disk_consistent_lsn(&self, new_value: Lsn) -> bool { + // We do a simple load/store cycle: that's why this function isn't safe for concurrent use. + let old_value = self.disk_consistent_lsn.load(); + if new_value != old_value { + assert!(new_value >= old_value); + self.disk_consistent_lsn.store(new_value); + true + } else { + false + } } /// Update metadata file diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index d54dc1642c..64edcc5e40 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -120,9 +120,10 @@ impl LayerManager { /// Called from `freeze_inmem_layer`, returns true if successfully frozen. pub(crate) async fn try_freeze_in_memory_layer( &mut self, - Lsn(last_record_lsn): Lsn, + lsn: Lsn, last_freeze_at: &AtomicLsn, ) { + let Lsn(last_record_lsn) = lsn; let end_lsn = Lsn(last_record_lsn + 1); if let Some(open_layer) = &self.layer_map.open_layer { @@ -135,8 +136,11 @@ impl LayerManager { self.layer_map.frozen_layers.push_back(open_layer_rc); self.layer_map.open_layer = None; self.layer_map.next_open_layer_at = Some(end_lsn); - last_freeze_at.store(end_lsn); } + + // Even if there was no layer to freeze, advance last_freeze_at to last_record_lsn+1: this + // accounts for regions in the LSN range where we might have ingested no data due to sharding. + last_freeze_at.store(end_lsn); } /// Add image layers to the layer map, called from `create_image_layers`. diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py index ab8717de54..4ebc02e6fd 100644 --- a/test_runner/fixtures/workload.py +++ b/test_runner/fixtures/workload.py @@ -85,6 +85,11 @@ class Workload: if self._endpoint is not None: self._endpoint.stop() + def stop(self): + if self._endpoint is not None: + self._endpoint.stop() + self._endpoint = None + def init(self, pageserver_id: Optional[int] = None): endpoint = self.endpoint(pageserver_id) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 2699654f80..bca11bbbe7 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -11,7 +11,9 @@ from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, StorageControllerApiException, + last_flush_lsn_upload, tenant_get_shards, + wait_for_last_flush_lsn, ) from fixtures.remote_storage import s3_storage from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId @@ -466,13 +468,11 @@ def test_sharding_split_stripe_size( os.getenv("BUILD_TYPE") == "debug", reason="Avoid running bulkier ingest tests in debug mode", ) -def test_sharding_ingest( +def test_sharding_ingest_layer_sizes( neon_env_builder: NeonEnvBuilder, ): """ - Check behaviors related to ingest: - - That we generate properly sized layers - - TODO: that updates to remote_consistent_lsn are made correctly via safekeepers + Check that when ingesting data to a sharded tenant, we properly respect layer size limts. 
""" # Set a small stripe size and checkpoint distance, so that we can exercise rolling logic @@ -503,6 +503,7 @@ def test_sharding_ingest( workload.write_rows(4096, upload=False) workload.write_rows(4096, upload=False) workload.write_rows(4096, upload=False) + workload.validate() small_layer_count = 0 @@ -515,7 +516,9 @@ def test_sharding_ingest( shard_id = shard["shard_id"] layer_map = pageserver.http_client().layer_map_info(shard_id, timeline_id) - for layer in layer_map.historic_layers: + historic_layers = sorted(layer_map.historic_layers, key=lambda layer: layer.lsn_start) + + for layer in historic_layers: assert layer.layer_file_size is not None if layer.layer_file_size < expect_layer_size // 2: classification = "Small" @@ -552,6 +555,93 @@ def test_sharding_ingest( assert huge_layer_count <= shard_count +def test_sharding_ingest_gaps( + neon_env_builder: NeonEnvBuilder, +): + """ + Check ingest behavior when the incoming data results in some shards having gaps where + no data is ingested: they should advance their disk_consistent_lsn and remote_consistent_lsn + even if they aren't writing out layers. + """ + + # Set a small stripe size and checkpoint distance, so that we can exercise rolling logic + # without writing a lot of data. + expect_layer_size = 131072 + checkpoint_interval_secs = 5 + TENANT_CONF = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": f"{expect_layer_size}", + "compaction_target_size": f"{expect_layer_size}", + # Set a short checkpoint interval as we will wait for uploads to happen + "checkpoint_timeout": f"{checkpoint_interval_secs}s", + # Background checkpointing is done from compaction loop, so set that interval short too + "compaction_period": "1s", + } + shard_count = 4 + neon_env_builder.num_pageservers = shard_count + env = neon_env_builder.init_start( + initial_tenant_conf=TENANT_CONF, + initial_tenant_shard_count=shard_count, + initial_tenant_shard_stripe_size=128, + ) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + # Just a few writes: we aim to produce a situation where some shards are skipping + # ingesting some records and thereby won't have layer files that advance their + # consistent LSNs, to exercise the code paths that explicitly handle this case by + # advancing consistent LSNs in the background if there is no open layer. + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(128, upload=False) + workload.churn_rows(128, upload=False) + + # Checkpoint, so that we won't get a background checkpoint happening during the next step + workload.endpoint().safe_psql("checkpoint") + # Freeze + flush, so that subsequent writes will start from a position of no open layers + last_flush_lsn_upload(env, workload.endpoint(), tenant_id, timeline_id) + + # This write is tiny: at least some of the shards should find they don't have any + # data to ingest. This will exercise how they handle that. 
+ workload.churn_rows(1, upload=False) + + # The LSN that has reached pageservers, but may not have been flushed to historic layers yet + expect_lsn = wait_for_last_flush_lsn(env, workload.endpoint(), tenant_id, timeline_id) + + # Don't leave the endpoint running, we don't want it writing in the background + workload.stop() + + log.info(f"Waiting for shards' consistent LSNs to reach {expect_lsn}") + + shards = tenant_get_shards(env, tenant_id, None) + + def assert_all_disk_consistent(): + """ + Assert that all the shards' disk_consistent_lsns have reached expect_lsn + """ + for tenant_shard_id, pageserver in shards: + timeline_detail = pageserver.http_client().timeline_detail(tenant_shard_id, timeline_id) + log.info(f"{tenant_shard_id} (ps {pageserver.id}) detail: {timeline_detail}") + assert Lsn(timeline_detail["disk_consistent_lsn"]) >= expect_lsn + + # We set a short checkpoint timeout: expect things to get frozen+flushed within that + wait_until(checkpoint_interval_secs * 3, 1, assert_all_disk_consistent) + + def assert_all_remote_consistent(): + """ + Assert that all the shards' remote_consistent_lsns have reached expect_lsn + """ + for tenant_shard_id, pageserver in shards: + timeline_detail = pageserver.http_client().timeline_detail(tenant_shard_id, timeline_id) + log.info(f"{tenant_shard_id} (ps {pageserver.id}) detail: {timeline_detail}") + assert Lsn(timeline_detail["remote_consistent_lsn"]) >= expect_lsn + + # We set a short checkpoint timeout: expect things to get frozen+flushed within that + wait_until(checkpoint_interval_secs * 3, 1, assert_all_remote_consistent) + + workload.validate() + + class Failure: pageserver_id: Optional[int] @@ -795,6 +885,8 @@ def test_sharding_split_failures( ".*Reconcile error: receive body: error sending request for url.*", # Node offline cases will fail inside reconciler when detaching secondaries ".*Reconcile error on shard.*: receive body: error sending request for url.*", + # While parent shard's client is stopped during split, flush loop updating LSNs will emit this warning + ".*Failed to schedule metadata upload after updating disk_consistent_lsn.*", ] ) From e17bc6afb4a2fd08ea3698a23d19f53d1bb86b1d Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 4 Apr 2024 18:23:45 +0100 Subject: [PATCH 0524/1571] pageserver: update mgmt_api to use TenantShardId (#7313) ## Problem The API client was written around the same time as some of the server APIs changed from TenantId to TenantShardId Closes: https://github.com/neondatabase/neon/issues/6154 ## Summary of changes - Refactor mgmt_api timeline_info and keyspace methods to use TenantShardId to match the server This doesn't make pagebench sharding aware, but it paves the way to do so later. 
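For unsharded tenants the call-site change is mechanical: wrap the plain `TenantId` before talking to the client. A minimal fragment (matching the updated method signature, error handling elided):

```rust
// `TenantShardId::unsharded` wraps a plain TenantId in its unsharded form,
// which is what the pageserver's management API now expects in the URL path.
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
let info = mgmt_api_client
    .timeline_info(tenant_shard_id, timeline_id, ForceAwaitLogicalSize::No)
    .await?;
```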
--- pageserver/client/src/mgmt_api.rs | 8 ++++---- pageserver/pagebench/src/cmd/basebackup.rs | 3 ++- pageserver/pagebench/src/cmd/getpage_latest_lsn.rs | 6 +++++- .../src/cmd/trigger_initial_size_calculation.rs | 13 +++++++++++-- 4 files changed, 22 insertions(+), 8 deletions(-) diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index ab55d2b0a3..3c9982ffb8 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -128,12 +128,12 @@ impl Client { pub async fn timeline_info( &self, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, force_await_logical_size: ForceAwaitLogicalSize, ) -> Result { let uri = format!( - "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}", + "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}", self.mgmt_api_endpoint ); @@ -151,11 +151,11 @@ impl Client { pub async fn keyspace( &self, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, ) -> Result { let uri = format!( - "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/keyspace", + "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/keyspace", self.mgmt_api_endpoint ); self.get(&uri) diff --git a/pageserver/pagebench/src/cmd/basebackup.rs b/pageserver/pagebench/src/cmd/basebackup.rs index 55844be041..3ae6d99aa7 100644 --- a/pageserver/pagebench/src/cmd/basebackup.rs +++ b/pageserver/pagebench/src/cmd/basebackup.rs @@ -1,4 +1,5 @@ use anyhow::Context; +use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api::ForceAwaitLogicalSize; use pageserver_client::page_service::BasebackupRequest; @@ -95,7 +96,7 @@ async fn main_impl( let timeline = *timeline; let info = mgmt_api_client .timeline_info( - timeline.tenant_id, + TenantShardId::unsharded(timeline.tenant_id), timeline.timeline_id, ForceAwaitLogicalSize::No, ) diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index 2838511a77..c3d8e61a2c 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -4,6 +4,7 @@ use pageserver_api::key::{is_rel_block_key, key_to_rel_block, Key}; use pageserver_api::keyspace::KeySpaceAccum; use pageserver_api::models::PagestreamGetPageRequest; +use pageserver_api::shard::TenantShardId; use tokio_util::sync::CancellationToken; use utils::id::TenantTimelineId; use utils::lsn::Lsn; @@ -173,7 +174,10 @@ async fn main_impl( let timeline = *timeline; async move { let partitioning = mgmt_api_client - .keyspace(timeline.tenant_id, timeline.timeline_id) + .keyspace( + TenantShardId::unsharded(timeline.tenant_id), + timeline.timeline_id, + ) .await?; let lsn = partitioning.at_lsn; let start = Instant::now(); diff --git a/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs b/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs index 98938d780a..f07beeecfd 100644 --- a/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs +++ b/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs @@ -1,6 +1,7 @@ use std::sync::Arc; use humantime::Duration; +use pageserver_api::shard::TenantShardId; use tokio::task::JoinSet; use utils::id::TenantTimelineId; @@ -59,7 +60,11 @@ async fn main_impl(args: Args) -> anyhow::Result<()> { let mgmt_api_client = Arc::clone(&mgmt_api_client); js.spawn(async move { let info = mgmt_api_client - .timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes) + .timeline_info( + 
TenantShardId::unsharded(tl.tenant_id), + tl.timeline_id, + ForceAwaitLogicalSize::Yes, + ) .await .unwrap(); @@ -74,7 +79,11 @@ async fn main_impl(args: Args) -> anyhow::Result<()> { while !info.current_logical_size_is_accurate { ticker.tick().await; info = mgmt_api_client - .timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes) + .timeline_info( + TenantShardId::unsharded(tl.tenant_id), + tl.timeline_id, + ForceAwaitLogicalSize::Yes, + ) .await .unwrap(); } From 0c6367a7325ab5ff9ebf889578aa91e07ceb3c9c Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 4 Apr 2024 18:34:05 +0100 Subject: [PATCH 0525/1571] storage controller: fix repeated location_conf returning no shards (#7314) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem When a location_conf request was repeated with no changes, we failed to build the list of shards in the result. ## Summary of changes Remove conditional that only generated a list of updates if something had really changed. This does some redundant database updates, but it is preferable to having a whole separate code path for no-op changes. --------- Co-authored-by: Arpad Müller --- .../attachment_service/src/service.rs | 21 +++++++++---------- test_runner/fixtures/pageserver/http.py | 1 + test_runner/regress/test_sharding_service.py | 12 +++++++---- 3 files changed, 19 insertions(+), 15 deletions(-) diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index 0b67e30b96..0f87a8ab05 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -1763,6 +1763,9 @@ impl Service { /// Part of [`Self::tenant_location_config`]: dissect an incoming location config request, /// and transform it into either a tenant creation of a series of shard updates. + /// + /// If the incoming request makes no changes, a [`TenantCreateOrUpdate::Update`] result will + /// still be returned. 
fn tenant_location_config_prepare( &self, tenant_id: TenantId, @@ -1810,17 +1813,12 @@ impl Service { _ => None, }; - if shard.policy != placement_policy - || shard.config != req.config.tenant_conf - || set_generation.is_some() - { - updates.push(ShardUpdate { - tenant_shard_id: *shard_id, - placement_policy: placement_policy.clone(), - tenant_config: req.config.tenant_conf.clone(), - generation: set_generation, - }); - } + updates.push(ShardUpdate { + tenant_shard_id: *shard_id, + placement_policy: placement_policy.clone(), + tenant_config: req.config.tenant_conf.clone(), + generation: set_generation, + }); } if create { @@ -1849,6 +1847,7 @@ impl Service { }, ) } else { + assert!(!updates.is_empty()); TenantCreateOrUpdate::Update(updates) } } diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index d3bf46b2e8..b899b0dac8 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -308,6 +308,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): params=params, ) self.verbose_error(res) + return res.json() def tenant_list_locations(self): res = self.get( diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py index 233d3b9603..3248afae15 100644 --- a/test_runner/regress/test_sharding_service.py +++ b/test_runner/regress/test_sharding_service.py @@ -303,7 +303,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: origin_ps.http_client().tenant_create(tenant_id, generation=generation) # As if doing a live migration, first configure origin into stale mode - origin_ps.http_client().tenant_location_conf( + r = origin_ps.http_client().tenant_location_conf( tenant_id, { "mode": "AttachedStale", @@ -312,6 +312,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: "generation": generation, }, ) + assert len(r["shards"]) == 1 if warm_up: origin_ps.http_client().tenant_heatmap_upload(tenant_id) @@ -332,7 +333,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: # Call into storage controller to onboard the tenant generation += 1 - virtual_ps_http.tenant_location_conf( + r = virtual_ps_http.tenant_location_conf( tenant_id, { "mode": "AttachedMulti", @@ -341,6 +342,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: "generation": generation, }, ) + assert len(r["shards"]) == 1 # As if doing a live migration, detach the original pageserver origin_ps.http_client().tenant_location_conf( @@ -357,7 +359,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: # set it to AttachedSingle: this is a no-op, but we test it because the # cloud control plane may call this for symmetry with live migration to # an individual pageserver - virtual_ps_http.tenant_location_conf( + r = virtual_ps_http.tenant_location_conf( tenant_id, { "mode": "AttachedSingle", @@ -366,6 +368,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: "generation": generation, }, ) + assert len(r["shards"]) == 1 # We should see the tenant is now attached to the pageserver managed # by the sharding service @@ -396,7 +399,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: # The generation has moved on since we onboarded assert generation != dest_tenant_before_conf_change["generation"] - virtual_ps_http.tenant_location_conf( + r = virtual_ps_http.tenant_location_conf( tenant_id, { 
"mode": "AttachedSingle", @@ -406,6 +409,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: "generation": generation, }, ) + assert len(r["shards"]) == 1 dest_tenant_after_conf_change = dest_ps.http_client().tenant_status(tenant_id) assert ( dest_tenant_after_conf_change["generation"] == dest_tenant_before_conf_change["generation"] From 6019ccef06c75cf89eb271bffba27495d05b1940 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 5 Apr 2024 11:44:15 +0100 Subject: [PATCH 0526/1571] tests: extend log allow list in test_storcon_cli (#7321) This test was occasionally flaky: it already allowed the log for the scheduler complaining about Stop state, but not the log for maybe_reconcile complaining. --- test_runner/regress/test_sharding_service.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py index 3248afae15..b7d97fd107 100644 --- a/test_runner/regress/test_sharding_service.py +++ b/test_runner/regress/test_sharding_service.py @@ -1196,7 +1196,10 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder): assert len(tenant_lines) == 5 assert str(env.initial_tenant) in tenant_lines[3] - env.storage_controller.allowed_errors.append(".*Scheduling is disabled by policy.*") + # Setting scheduling policies intentionally result in warnings, they're for rare use. + env.storage_controller.allowed_errors.extend( + [".*Skipping reconcile for policy.*", ".*Scheduling is disabled by policy.*"] + ) # Describe a tenant tenant_lines = storcon_cli(["tenant-describe", "--tenant-id", str(env.initial_tenant)]) From 8ceb4f0a6994849524c5091ee374db94b7f49eb9 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Fri, 5 Apr 2024 12:48:08 +0200 Subject: [PATCH 0527/1571] Fix partial zero segment upload (#7318) Found these logs on staging safekeepers: ``` INFO Partial backup{ttid=X/Y}: failed to upload 000000010000000000000000_173_0000000000000000_0000000000000000_sk56.partial: Failed to open file "/storage/safekeeper/data/X/Y/000000010000000000000000.partial" for wal backup: No such file or directory (os error 2) INFO Partial backup{ttid=X/Y}:upload{name=000000010000000000000000_173_0000000000000000_0000000000000000_sk56.partial}: starting upload PartialRemoteSegment { status: InProgress, name: "000000010000000000000000_173_0000000000000000_0000000000000000_sk56.partial", commit_lsn: 0/0, flush_lsn: 0/0, term: 173 } ``` This is because partial backup tries to upload zero segment when there is no data in timeline. This PR fixes this bug introduced in #6530. --- safekeeper/src/wal_backup_partial.rs | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index a535c814ea..200096ac5c 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -337,6 +337,17 @@ pub async fn main_task(tli: Arc, conf: SafeKeeperConf) { } } + // if we don't have any data and zero LSNs, wait for something + while flush_lsn_rx.borrow().lsn == Lsn(0) { + tokio::select! 
{ + _ = cancellation_rx.changed() => { + info!("timeline canceled"); + return; + } + _ = flush_lsn_rx.changed() => {} + } + } + // fixing the segno and waiting some time to prevent reuploading the same segment too often let pending_segno = backup.segno(flush_lsn_rx.borrow().lsn); let timeout = tokio::time::sleep(await_duration); From 0fa517eb809cadcc2718c8fbd1daff235bab30f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 5 Apr 2024 15:53:29 +0200 Subject: [PATCH 0528/1571] Update test-context dependency to 0.3 (#7303) Updates the `test-context` dev-dependency of the `remote_storage` crate to 0.3. This removes a lot of `async_trait` instances. Related earlier work: #6305, #6464 --- Cargo.lock | 12 ++++++------ Cargo.toml | 2 +- libs/remote_storage/tests/test_real_azure.rs | 3 --- libs/remote_storage/tests/test_real_s3.rs | 3 --- 4 files changed, 7 insertions(+), 13 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7fef2ebf22..d413641c3f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5799,23 +5799,23 @@ dependencies = [ [[package]] name = "test-context" -version = "0.1.4" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "055831a02a4f5aa28fede67f2902014273eb8c21b958ac5ebbd59b71ef30dbc3" +checksum = "6676ab8513edfd2601a108621103fdb45cac9098305ca25ec93f7023b06b05d9" dependencies = [ - "async-trait", "futures", "test-context-macros", ] [[package]] name = "test-context-macros" -version = "0.1.4" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8901a55b0a7a06ebc4a674dcca925170da8e613fa3b163a1df804ed10afb154d" +checksum = "78ea17a2dc368aeca6f554343ced1b1e31f76d63683fa8016e5844bd7a5144a1" dependencies = [ + "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.52", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 9f24176c65..510c702290 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -159,7 +159,7 @@ svg_fmt = "0.4.1" sync_wrapper = "0.1.2" tar = "0.4" task-local-extensions = "0.1.4" -test-context = "0.1" +test-context = "0.3" thiserror = "1.0" tikv-jemallocator = "0.5" tikv-jemalloc-ctl = "0.5" diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs index 6adddf52a9..6aa02868e6 100644 --- a/libs/remote_storage/tests/test_real_azure.rs +++ b/libs/remote_storage/tests/test_real_azure.rs @@ -57,7 +57,6 @@ enum MaybeEnabledStorage { Disabled, } -#[async_trait::async_trait] impl AsyncTestContext for MaybeEnabledStorage { async fn setup() -> Self { ensure_logging_ready(); @@ -86,7 +85,6 @@ struct AzureWithTestBlobs { remote_blobs: HashSet, } -#[async_trait::async_trait] impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs { async fn setup() -> Self { ensure_logging_ready(); @@ -148,7 +146,6 @@ struct AzureWithSimpleTestBlobs { remote_blobs: HashSet, } -#[async_trait::async_trait] impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs { async fn setup() -> Self { ensure_logging_ready(); diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index bc5e40e70f..c5d5216f00 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -219,7 +219,6 @@ enum MaybeEnabledStorage { Disabled, } -#[async_trait::async_trait] impl AsyncTestContext for MaybeEnabledStorage { async fn setup() -> Self { ensure_logging_ready(); @@ -248,7 +247,6 @@ struct S3WithTestBlobs { remote_blobs: HashSet, } -#[async_trait::async_trait] impl AsyncTestContext 
for MaybeEnabledStorageWithTestBlobs { async fn setup() -> Self { ensure_logging_ready(); @@ -310,7 +308,6 @@ struct S3WithSimpleTestBlobs { remote_blobs: HashSet, } -#[async_trait::async_trait] impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs { async fn setup() -> Self { ensure_logging_ready(); From 55da8eff4ff9c26e9458f4dc4ee82ff67c422383 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 5 Apr 2024 16:14:50 +0100 Subject: [PATCH 0529/1571] proxy: report metrics based on cold start info (#7324) ## Problem Would be nice to have a bit more info on cold start metrics. ## Summary of changes * Change connect compute latency to include `cold_start_info`. * Update `ColdStartInfo` to include HttpPoolHit and WarmCached. * Several changes to make more use of interned strings --- proxy/src/auth/backend/link.rs | 3 +- proxy/src/bin/pg_sni_router.rs | 8 ++- proxy/src/cache/project_info.rs | 98 +++++++++++++++++++++--------- proxy/src/compute.rs | 1 + proxy/src/console/messages.rs | 49 +++++++++++---- proxy/src/console/provider.rs | 5 +- proxy/src/console/provider/mock.rs | 15 ++++- proxy/src/console/provider/neon.rs | 39 ++++++------ proxy/src/context.rs | 34 ++++++----- proxy/src/context/parquet.rs | 69 ++++++++++----------- proxy/src/metrics.rs | 51 +++++++++------- proxy/src/proxy/connect_compute.rs | 2 - proxy/src/proxy/passthrough.rs | 4 +- proxy/src/proxy/tests.rs | 10 ++- proxy/src/serverless/backend.rs | 8 +-- proxy/src/serverless/conn_pool.rs | 25 +++++--- proxy/src/usage_metrics.rs | 13 ++-- 17 files changed, 274 insertions(+), 160 deletions(-) diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index 7db76f3d9e..415a4b7d85 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -102,8 +102,7 @@ pub(super) async fn authenticate( ctx.set_user(db_info.user.into()); ctx.set_project(db_info.aux.clone()); - let cold_start_info = db_info.aux.cold_start_info.clone().unwrap_or_default(); - info!(?cold_start_info, "woken up a compute node"); + info!("woken up a compute node"); // Backwards compatibility. pg_sni_proxy uses "--" in domain names // while direct connections do not. 
Once we migrate to pg_sni_proxy diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 385f7820cb..c28814b1c8 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -10,6 +10,7 @@ use itertools::Itertools; use proxy::config::TlsServerEndPoint; use proxy::context::RequestMonitoring; use proxy::proxy::run_until_cancelled; +use proxy::{BranchId, EndpointId, ProjectId}; use rustls::pki_types::PrivateKeyDer; use tokio::net::TcpListener; @@ -269,7 +270,12 @@ async fn handle_client( let client = tokio::net::TcpStream::connect(destination).await?; - let metrics_aux: MetricsAuxInfo = Default::default(); + let metrics_aux: MetricsAuxInfo = MetricsAuxInfo { + endpoint_id: (&EndpointId::from("")).into(), + project_id: (&ProjectId::from("")).into(), + branch_id: (&BranchId::from("")).into(), + cold_start_info: proxy::console::messages::ColdStartInfo::Unknown, + }; // doesn't yet matter as pg-sni-router doesn't report analytics logs ctx.set_success(); diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index 5a3660520b..d8a1d261ce 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -16,7 +16,7 @@ use crate::{ config::ProjectInfoCacheOptions, console::AuthSecret, intern::{EndpointIdInt, ProjectIdInt, RoleNameInt}, - EndpointId, ProjectId, RoleName, + EndpointId, RoleName, }; use super::{Cache, Cached}; @@ -214,14 +214,11 @@ impl ProjectInfoCacheImpl { } pub fn insert_role_secret( &self, - project_id: &ProjectId, - endpoint_id: &EndpointId, - role_name: &RoleName, + project_id: ProjectIdInt, + endpoint_id: EndpointIdInt, + role_name: RoleNameInt, secret: Option, ) { - let project_id = ProjectIdInt::from(project_id); - let endpoint_id = EndpointIdInt::from(endpoint_id); - let role_name = RoleNameInt::from(role_name); if self.cache.len() >= self.config.size { // If there are too many entries, wait until the next gc cycle. return; @@ -234,12 +231,10 @@ impl ProjectInfoCacheImpl { } pub fn insert_allowed_ips( &self, - project_id: &ProjectId, - endpoint_id: &EndpointId, + project_id: ProjectIdInt, + endpoint_id: EndpointIdInt, allowed_ips: Arc>, ) { - let project_id = ProjectIdInt::from(project_id); - let endpoint_id = EndpointIdInt::from(endpoint_id); if self.cache.len() >= self.config.size { // If there are too many entries, wait until the next gc cycle. 
return; @@ -358,7 +353,7 @@ impl Cache for ProjectInfoCacheImpl { #[cfg(test)] mod tests { use super::*; - use crate::scram::ServerSecret; + use crate::{scram::ServerSecret, ProjectId}; #[tokio::test] async fn test_project_info_cache_settings() { @@ -369,8 +364,8 @@ mod tests { ttl: Duration::from_secs(1), gc_interval: Duration::from_secs(600), }); - let project_id = "project".into(); - let endpoint_id = "endpoint".into(); + let project_id: ProjectId = "project".into(); + let endpoint_id: EndpointId = "endpoint".into(); let user1: RoleName = "user1".into(); let user2: RoleName = "user2".into(); let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32]))); @@ -379,9 +374,23 @@ mod tests { "127.0.0.1".parse().unwrap(), "127.0.0.2".parse().unwrap(), ]); - cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone()); - cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone()); - cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone()); + cache.insert_role_secret( + (&project_id).into(), + (&endpoint_id).into(), + (&user1).into(), + secret1.clone(), + ); + cache.insert_role_secret( + (&project_id).into(), + (&endpoint_id).into(), + (&user2).into(), + secret2.clone(), + ); + cache.insert_allowed_ips( + (&project_id).into(), + (&endpoint_id).into(), + allowed_ips.clone(), + ); let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap(); assert!(cached.cached()); @@ -393,7 +402,12 @@ mod tests { // Shouldn't add more than 2 roles. let user3: RoleName = "user3".into(); let secret3 = Some(AuthSecret::Scram(ServerSecret::mock([3; 32]))); - cache.insert_role_secret(&project_id, &endpoint_id, &user3, secret3.clone()); + cache.insert_role_secret( + (&project_id).into(), + (&endpoint_id).into(), + (&user3).into(), + secret3.clone(), + ); assert!(cache.get_role_secret(&endpoint_id, &user3).is_none()); let cached = cache.get_allowed_ips(&endpoint_id).unwrap(); @@ -421,8 +435,8 @@ mod tests { cache.clone().disable_ttl(); tokio::time::advance(Duration::from_secs(2)).await; - let project_id = "project".into(); - let endpoint_id = "endpoint".into(); + let project_id: ProjectId = "project".into(); + let endpoint_id: EndpointId = "endpoint".into(); let user1: RoleName = "user1".into(); let user2: RoleName = "user2".into(); let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32]))); @@ -431,9 +445,23 @@ mod tests { "127.0.0.1".parse().unwrap(), "127.0.0.2".parse().unwrap(), ]); - cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone()); - cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone()); - cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone()); + cache.insert_role_secret( + (&project_id).into(), + (&endpoint_id).into(), + (&user1).into(), + secret1.clone(), + ); + cache.insert_role_secret( + (&project_id).into(), + (&endpoint_id).into(), + (&user2).into(), + secret2.clone(), + ); + cache.insert_allowed_ips( + (&project_id).into(), + (&endpoint_id).into(), + allowed_ips.clone(), + ); tokio::time::advance(Duration::from_secs(2)).await; // Nothing should be invalidated. 
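The tests above intern the string-typed IDs with `(&id).into()` at every call site. A minimal caller-side sketch, not part of this patch, assuming the `From<&ProjectId> for ProjectIdInt` (and sibling) conversions used throughout this diff and the `Option<AuthSecret>` / `Arc<Vec<IpPattern>>` parameter types that the listing above elides:

```rust
// Approximate imports within the proxy crate.
use std::sync::Arc;
use crate::auth::IpPattern;
use crate::cache::project_info::ProjectInfoCacheImpl;
use crate::console::AuthSecret;
use crate::intern::{EndpointIdInt, ProjectIdInt, RoleNameInt};
use crate::{EndpointId, ProjectId, RoleName};

// Intern each ID once, then reuse the Copy-able handles for every insert.
fn cache_auth_info(
    cache: &ProjectInfoCacheImpl,
    project_id: &ProjectId,
    endpoint_id: &EndpointId,
    role_name: &RoleName,
    secret: Option<AuthSecret>,
    allowed_ips: Arc<Vec<IpPattern>>,
) {
    let project: ProjectIdInt = project_id.into();
    let endpoint: EndpointIdInt = endpoint_id.into();
    let role: RoleNameInt = role_name.into();
    cache.insert_role_secret(project, endpoint, role, secret);
    cache.insert_allowed_ips(project, endpoint, allowed_ips);
}
```

This mirrors the production call sites in `neon.rs` below, which bind `let ep_int = ep.into();` once and reuse it for both inserts.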
@@ -470,8 +498,8 @@ mod tests { gc_interval: Duration::from_secs(600), })); - let project_id = "project".into(); - let endpoint_id = "endpoint".into(); + let project_id: ProjectId = "project".into(); + let endpoint_id: EndpointId = "endpoint".into(); let user1: RoleName = "user1".into(); let user2: RoleName = "user2".into(); let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32]))); @@ -480,10 +508,20 @@ mod tests { "127.0.0.1".parse().unwrap(), "127.0.0.2".parse().unwrap(), ]); - cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone()); + cache.insert_role_secret( + (&project_id).into(), + (&endpoint_id).into(), + (&user1).into(), + secret1.clone(), + ); cache.clone().disable_ttl(); tokio::time::advance(Duration::from_millis(100)).await; - cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone()); + cache.insert_role_secret( + (&project_id).into(), + (&endpoint_id).into(), + (&user2).into(), + secret2.clone(), + ); // Added before ttl was disabled + ttl should be still cached. let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap(); @@ -497,7 +535,11 @@ mod tests { assert!(cache.get_role_secret(&endpoint_id, &user2).is_none()); // Added after ttl was disabled + ttl should not be cached. - cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone()); + cache.insert_allowed_ips( + (&project_id).into(), + (&endpoint_id).into(), + allowed_ips.clone(), + ); let cached = cache.get_allowed_ips(&endpoint_id).unwrap(); assert!(!cached.cached()); diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 65153babcb..ee33b97fbd 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -276,6 +276,7 @@ impl ConnCfg { let stream = connection.stream.into_inner(); info!( + cold_start_info = ctx.cold_start_info.as_str(), "connected to compute node at {host} ({socket_addr}) sslmode={:?}", self.0.get_ssl_mode() ); diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs index 102076f2c6..45161f5ac8 100644 --- a/proxy/src/console/messages.rs +++ b/proxy/src/console/messages.rs @@ -3,7 +3,7 @@ use std::fmt; use crate::auth::IpPattern; -use crate::{BranchId, EndpointId, ProjectId}; +use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}; /// Generic error response with human-readable description. /// Note that we can't always present it to user as is. @@ -18,7 +18,7 @@ pub struct ConsoleError { pub struct GetRoleSecret { pub role_secret: Box, pub allowed_ips: Option>, - pub project_id: Option, + pub project_id: Option, } // Manually implement debug to omit sensitive info. @@ -93,22 +93,47 @@ impl fmt::Debug for DatabaseInfo { /// Various labels for prometheus metrics. /// Also known as `ProxyMetricsAuxInfo` in the console. 
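+/// All IDs are stored as interned handles, and `cold_start_info` falls back to
+/// `ColdStartInfo::Unknown` when the console response omits the field.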
-#[derive(Debug, Deserialize, Clone, Default)] +#[derive(Debug, Deserialize, Clone)] pub struct MetricsAuxInfo { - pub endpoint_id: EndpointId, - pub project_id: ProjectId, - pub branch_id: BranchId, - pub cold_start_info: Option, + pub endpoint_id: EndpointIdInt, + pub project_id: ProjectIdInt, + pub branch_id: BranchIdInt, + #[serde(default)] + pub cold_start_info: ColdStartInfo, } -#[derive(Debug, Default, Serialize, Deserialize, Clone)] +#[derive(Debug, Default, Serialize, Deserialize, Clone, Copy)] #[serde(rename_all = "snake_case")] pub enum ColdStartInfo { #[default] - Unknown = 0, - Warm = 1, - PoolHit = 2, - PoolMiss = 3, + Unknown, + /// Compute was already running + Warm, + #[serde(rename = "pool_hit")] + /// Compute was not running but there was an available VM + VmPoolHit, + #[serde(rename = "pool_miss")] + /// Compute was not running and there were no VMs available + VmPoolMiss, + + // not provided by control plane + /// Connection available from HTTP pool + HttpPoolHit, + /// Cached connection info + WarmCached, +} + +impl ColdStartInfo { + pub fn as_str(&self) -> &'static str { + match self { + ColdStartInfo::Unknown => "unknown", + ColdStartInfo::Warm => "warm", + ColdStartInfo::VmPoolHit => "pool_hit", + ColdStartInfo::VmPoolMiss => "pool_miss", + ColdStartInfo::HttpPoolHit => "http_pool_hit", + ColdStartInfo::WarmCached => "warm_cached", + } + } } #[cfg(test)] diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 69bfd6b045..f7d621fb12 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -12,7 +12,8 @@ use crate::{ compute, config::{CacheOptions, ProjectInfoCacheOptions}, context::RequestMonitoring, - scram, EndpointCacheKey, ProjectId, + intern::ProjectIdInt, + scram, EndpointCacheKey, }; use dashmap::DashMap; use std::{sync::Arc, time::Duration}; @@ -271,7 +272,7 @@ pub struct AuthInfo { /// List of IP addresses allowed for the autorization. pub allowed_ips: Vec, /// Project ID. This is used for cache invalidation. - pub project_id: Option, + pub project_id: Option, } /// Info for establishing a connection to a compute node. 
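The `ColdStartInfo` rework above keeps the control plane's existing wire names (`pool_hit`, `pool_miss`) via explicit serde renames, while `HttpPoolHit` and `WarmCached` are only ever set inside the proxy. A small sketch of the resulting (de)serialization behaviour, not part of this patch and assuming `serde_json` is available as elsewhere in the crate:

```rust
#[cfg(test)]
mod cold_start_info_wire_format {
    use super::ColdStartInfo;

    #[test]
    fn renames_preserve_wire_names() {
        // The console still sends "pool_hit" / "pool_miss"; they now map to
        // the Vm* variants, and `as_str` round-trips the same strings.
        let v: ColdStartInfo = serde_json::from_str(r#""pool_hit""#).unwrap();
        assert_eq!(v.as_str(), "pool_hit");
        let v: ColdStartInfo = serde_json::from_str(r#""pool_miss""#).unwrap();
        assert_eq!(v.as_str(), "pool_miss");
        // A missing `cold_start_info` field deserializes to the default.
        assert_eq!(ColdStartInfo::default().as_str(), "unknown");
    }
}
```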
diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs index b759c81373..cfe491f2aa 100644 --- a/proxy/src/console/provider/mock.rs +++ b/proxy/src/console/provider/mock.rs @@ -4,10 +4,16 @@ use super::{ errors::{ApiError, GetAuthInfoError, WakeComputeError}, AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo, }; -use crate::console::provider::{CachedAllowedIps, CachedRoleSecret}; use crate::context::RequestMonitoring; use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl}; use crate::{auth::IpPattern, cache::Cached}; +use crate::{ + console::{ + messages::MetricsAuxInfo, + provider::{CachedAllowedIps, CachedRoleSecret}, + }, + BranchId, EndpointId, ProjectId, +}; use futures::TryFutureExt; use std::{str::FromStr, sync::Arc}; use thiserror::Error; @@ -114,7 +120,12 @@ impl Api { let node = NodeInfo { config, - aux: Default::default(), + aux: MetricsAuxInfo { + endpoint_id: (&EndpointId::from("endpoint")).into(), + project_id: (&ProjectId::from("project")).into(), + branch_id: (&BranchId::from("branch")).into(), + cold_start_info: crate::console::messages::ColdStartInfo::Warm, + }, allow_self_signed_compute: false, }; diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 289b0c08f7..1a3e2ca795 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -181,15 +181,16 @@ impl super::Api for Api { } let auth_info = self.do_get_auth_info(ctx, user_info).await?; if let Some(project_id) = auth_info.project_id { + let ep_int = ep.into(); self.caches.project_info.insert_role_secret( - &project_id, - ep, - user, + project_id, + ep_int, + user.into(), auth_info.secret.clone(), ); self.caches.project_info.insert_allowed_ips( - &project_id, - ep, + project_id, + ep_int, Arc::new(auth_info.allowed_ips), ); ctx.set_project_id(project_id); @@ -217,15 +218,16 @@ impl super::Api for Api { let allowed_ips = Arc::new(auth_info.allowed_ips); let user = &user_info.user; if let Some(project_id) = auth_info.project_id { + let ep_int = ep.into(); self.caches.project_info.insert_role_secret( - &project_id, - ep, - user, + project_id, + ep_int, + user.into(), auth_info.secret.clone(), ); self.caches .project_info - .insert_allowed_ips(&project_id, ep, allowed_ips.clone()); + .insert_allowed_ips(project_id, ep_int, allowed_ips.clone()); ctx.set_project_id(project_id); } Ok(( @@ -248,8 +250,7 @@ impl super::Api for Api { // which means that we might cache it to reduce the load and latency. 
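+        // A cache hit now reports the `cold_start_info` stored in the cached
+        // aux data (set to `WarmCached` below), instead of hard-coding `Warm`.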
if let Some(cached) = self.caches.node_info.get(&key) { info!(key = &*key, "found cached compute node info"); - info!("cold_start_info=warm"); - ctx.set_cold_start_info(ColdStartInfo::Warm); + ctx.set_project(cached.aux.clone()); return Ok(cached); } @@ -260,17 +261,21 @@ impl super::Api for Api { if permit.should_check_cache() { if let Some(cached) = self.caches.node_info.get(&key) { info!(key = &*key, "found cached compute node info"); - info!("cold_start_info=warm"); - ctx.set_cold_start_info(ColdStartInfo::Warm); + ctx.set_project(cached.aux.clone()); return Ok(cached); } } - let node = self.do_wake_compute(ctx, user_info).await?; + let mut node = self.do_wake_compute(ctx, user_info).await?; ctx.set_project(node.aux.clone()); - let cold_start_info = node.aux.cold_start_info.clone().unwrap_or_default(); - info!(?cold_start_info, "woken up a compute node"); - let (_, cached) = self.caches.node_info.insert(key.clone(), node); + let cold_start_info = node.aux.cold_start_info; + info!("woken up a compute node"); + + // store the cached node as 'warm' + node.aux.cold_start_info = ColdStartInfo::WarmCached; + let (_, mut cached) = self.caches.node_info.insert(key.clone(), node); + cached.aux.cold_start_info = cold_start_info; + info!(key = &*key, "created a cache entry for compute node info"); Ok(cached) diff --git a/proxy/src/context.rs b/proxy/src/context.rs index 7ca830cdb4..fec95f4722 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -11,8 +11,9 @@ use uuid::Uuid; use crate::{ console::messages::{ColdStartInfo, MetricsAuxInfo}, error::ErrorKind, + intern::{BranchIdInt, ProjectIdInt}, metrics::{LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND}, - BranchId, DbName, EndpointId, ProjectId, RoleName, + DbName, EndpointId, RoleName, }; use self::parquet::RequestData; @@ -34,8 +35,8 @@ pub struct RequestMonitoring { pub span: Span, // filled in as they are discovered - project: Option, - branch: Option, + project: Option, + branch: Option, endpoint_id: Option, dbname: Option, user: Option, @@ -43,7 +44,7 @@ pub struct RequestMonitoring { error_kind: Option, pub(crate) auth_method: Option, success: bool, - cold_start_info: Option, + pub(crate) cold_start_info: ColdStartInfo, // extra // This sender is here to keep the request monitoring channel open while requests are taking place. 
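With `cold_start_info` stored directly on `RequestMonitoring`, the compute-connection latency metric is labelled from it rather than from the old `cache_miss`/`pool_miss` flags (3 protocols, 6 cold-start states, 2 outcomes, 2 `excluded` values: the 72 series noted in `metrics.rs`). A rough sketch of the intended flow, not part of this patch; the context constructor is omitted because its signature is not shown here:

```rust
// Sketch: a connection path drives the new fields like this.
fn record_connection(ctx: &mut RequestMonitoring, aux: MetricsAuxInfo) {
    // Records endpoint/branch/project once and forwards `aux.cold_start_info`
    // into the latency timer via `set_cold_start_info`.
    ctx.set_project(aux);
    // On a successful connect the timer is observed under
    // {protocol, cold_start_info, outcome, excluded}.
    ctx.latency_timer.success();
}
```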
@@ -92,7 +93,7 @@ impl RequestMonitoring { error_kind: None, auth_method: None, success: false, - cold_start_info: None, + cold_start_info: ColdStartInfo::Unknown, sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()), latency_timer: LatencyTimer::new(protocol), @@ -113,26 +114,31 @@ impl RequestMonitoring { } pub fn set_cold_start_info(&mut self, info: ColdStartInfo) { - self.cold_start_info = Some(info); + self.cold_start_info = info; + self.latency_timer.cold_start_info(info); } pub fn set_project(&mut self, x: MetricsAuxInfo) { - self.set_endpoint_id(x.endpoint_id); + if self.endpoint_id.is_none() { + self.set_endpoint_id(x.endpoint_id.as_str().into()) + } self.branch = Some(x.branch_id); self.project = Some(x.project_id); - self.cold_start_info = x.cold_start_info; + self.set_cold_start_info(x.cold_start_info); } - pub fn set_project_id(&mut self, project_id: ProjectId) { + pub fn set_project_id(&mut self, project_id: ProjectIdInt) { self.project = Some(project_id); } pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) { - self.span.record("ep", display(&endpoint_id)); - crate::metrics::CONNECTING_ENDPOINTS - .with_label_values(&[self.protocol]) - .measure(&endpoint_id); - self.endpoint_id = Some(endpoint_id); + if self.endpoint_id.is_none() { + self.span.record("ep", display(&endpoint_id)); + crate::metrics::CONNECTING_ENDPOINTS + .with_label_values(&[self.protocol]) + .measure(&endpoint_id); + self.endpoint_id = Some(endpoint_id); + } } pub fn set_application(&mut self, app: Option) { diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 04e5695255..eb77409429 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -87,7 +87,7 @@ pub struct RequestData { /// Or if we make it to proxy_pass success: bool, /// Indicates if the cplane started the new compute node for this request. 
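+    /// Stored as its string form (`ColdStartInfo::as_str`), `"unknown"` when never set.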
- cold_start_info: Option<&'static str>, + cold_start_info: &'static str, /// Tracks time from session start (HTTP request/libpq TCP handshake) /// Through to success/failure duration_us: u64, @@ -115,12 +115,7 @@ impl From<&RequestMonitoring> for RequestData { region: value.region, error: value.error_kind.as_ref().map(|e| e.to_metric_label()), success: value.success, - cold_start_info: value.cold_start_info.as_ref().map(|x| match x { - crate::console::messages::ColdStartInfo::Unknown => "unknown", - crate::console::messages::ColdStartInfo::Warm => "warm", - crate::console::messages::ColdStartInfo::PoolHit => "pool_hit", - crate::console::messages::ColdStartInfo::PoolMiss => "pool_miss", - }), + cold_start_info: value.cold_start_info.as_str(), duration_us: SystemTime::from(value.first_packet) .elapsed() .unwrap_or_default() @@ -454,7 +449,7 @@ mod tests { region: "us-east-1", error: None, success: rng.gen(), - cold_start_info: Some("no"), + cold_start_info: "no", duration_us: rng.gen_range(0..30_000_000), } } @@ -524,15 +519,15 @@ mod tests { assert_eq!( file_stats, [ - (1314406, 3, 6000), - (1314399, 3, 6000), - (1314459, 3, 6000), - (1314416, 3, 6000), - (1314546, 3, 6000), - (1314388, 3, 6000), - (1314180, 3, 6000), - (1314416, 3, 6000), - (438359, 1, 2000) + (1314385, 3, 6000), + (1314378, 3, 6000), + (1314438, 3, 6000), + (1314395, 3, 6000), + (1314525, 3, 6000), + (1314367, 3, 6000), + (1314159, 3, 6000), + (1314395, 3, 6000), + (438352, 1, 2000) ] ); @@ -562,11 +557,11 @@ mod tests { assert_eq!( file_stats, [ - (1220668, 5, 10000), - (1226818, 5, 10000), - (1228612, 5, 10000), - (1227974, 5, 10000), - (1219252, 5, 10000) + (1220633, 5, 10000), + (1226783, 5, 10000), + (1228577, 5, 10000), + (1227939, 5, 10000), + (1219217, 5, 10000) ] ); @@ -598,11 +593,11 @@ mod tests { assert_eq!( file_stats, [ - (1206315, 5, 10000), - (1206046, 5, 10000), - (1206339, 5, 10000), - (1206327, 5, 10000), - (1206582, 5, 10000) + (1206280, 5, 10000), + (1206011, 5, 10000), + (1206304, 5, 10000), + (1206292, 5, 10000), + (1206547, 5, 10000) ] ); @@ -627,15 +622,15 @@ mod tests { assert_eq!( file_stats, [ - (1314406, 3, 6000), - (1314399, 3, 6000), - (1314459, 3, 6000), - (1314416, 3, 6000), - (1314546, 3, 6000), - (1314388, 3, 6000), - (1314180, 3, 6000), - (1314416, 3, 6000), - (438359, 1, 2000) + (1314385, 3, 6000), + (1314378, 3, 6000), + (1314438, 3, 6000), + (1314395, 3, 6000), + (1314525, 3, 6000), + (1314367, 3, 6000), + (1314159, 3, 6000), + (1314395, 3, 6000), + (438352, 1, 2000) ] ); @@ -672,7 +667,7 @@ mod tests { // files are smaller than the size threshold, but they took too long to fill so were flushed early assert_eq!( file_stats, - [(658837, 2, 3001), (658551, 2, 3000), (658347, 2, 2999)] + [(658823, 2, 3001), (658537, 2, 3000), (658333, 2, 2999)] ); tmpdir.close().unwrap(); diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 9da1fdc02f..59ee899c08 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -12,6 +12,8 @@ use metrics::{ use once_cell::sync::Lazy; use tokio::time::{self, Instant}; +use crate::console::messages::ColdStartInfo; + pub static NUM_DB_CONNECTIONS_GAUGE: Lazy = Lazy::new(|| { register_int_counter_pair_vec!( "proxy_opened_db_connections_total", @@ -50,8 +52,8 @@ pub static COMPUTE_CONNECTION_LATENCY: Lazy = Lazy::new(|| { "proxy_compute_connection_latency_seconds", "Time it took for proxy to establish a connection to the compute endpoint", // http/ws/tcp, true/false, true/false, success/failure, client/client_and_cplane - // 3 * 2 * 2 * 2 * 2 = 48 
counters - &["protocol", "cache_miss", "pool_miss", "outcome", "excluded"], + // 3 * 6 * 2 * 2 = 72 counters + &["protocol", "cold_start_info", "outcome", "excluded"], // largest bucket = 2^16 * 0.5ms = 32s exponential_buckets(0.0005, 2.0, 16).unwrap(), ) @@ -183,6 +185,20 @@ struct Accumulated { compute: time::Duration, } +enum Outcome { + Success, + Failed, +} + +impl Outcome { + fn as_str(&self) -> &'static str { + match self { + Outcome::Success => "success", + Outcome::Failed => "failed", + } + } +} + pub struct LatencyTimer { // time since the stopwatch was started start: time::Instant, @@ -192,9 +208,8 @@ pub struct LatencyTimer { accumulated: Accumulated, // label data protocol: &'static str, - cache_miss: bool, - pool_miss: bool, - outcome: &'static str, + cold_start_info: ColdStartInfo, + outcome: Outcome, } pub struct LatencyTimerPause<'a> { @@ -210,11 +225,9 @@ impl LatencyTimer { stop: None, accumulated: Accumulated::default(), protocol, - cache_miss: false, - // by default we don't do pooling - pool_miss: true, + cold_start_info: ColdStartInfo::Unknown, // assume failed unless otherwise specified - outcome: "failed", + outcome: Outcome::Failed, } } @@ -226,12 +239,8 @@ impl LatencyTimer { } } - pub fn cache_miss(&mut self) { - self.cache_miss = true; - } - - pub fn pool_hit(&mut self) { - self.pool_miss = false; + pub fn cold_start_info(&mut self, cold_start_info: ColdStartInfo) { + self.cold_start_info = cold_start_info; } pub fn success(&mut self) { @@ -239,7 +248,7 @@ impl LatencyTimer { self.stop = Some(time::Instant::now()); // success - self.outcome = "success"; + self.outcome = Outcome::Success; } } @@ -264,9 +273,8 @@ impl Drop for LatencyTimer { COMPUTE_CONNECTION_LATENCY .with_label_values(&[ self.protocol, - bool_to_str(self.cache_miss), - bool_to_str(self.pool_miss), - self.outcome, + self.cold_start_info.as_str(), + self.outcome.as_str(), "client", ]) .observe((duration.saturating_sub(self.accumulated.client)).as_secs_f64()); @@ -275,9 +283,8 @@ impl Drop for LatencyTimer { COMPUTE_CONNECTION_LATENCY .with_label_values(&[ self.protocol, - bool_to_str(self.cache_miss), - bool_to_str(self.pool_miss), - self.outcome, + self.cold_start_info.as_str(), + self.outcome.as_str(), "client_and_cplane", ]) .observe((duration.saturating_sub(accumulated_total)).as_secs_f64()); diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index c76e2ff6d9..4c0d68ce0b 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -87,7 +87,6 @@ impl ConnectMechanism for TcpMechanism<'_> { } /// Try to connect to the compute node, retrying if necessary. -/// This function might update `node_info`, so we take it by `&mut`. 
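+/// Cold-start information is now recorded on the request context when the compute is
+/// woken, which replaces the explicit `cache_miss` flag on the latency timer.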
#[tracing::instrument(skip_all)] pub async fn connect_to_compute( ctx: &mut RequestMonitoring, @@ -132,7 +131,6 @@ where } else { // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node info!("compute node's state has likely changed; requesting a wake-up"); - ctx.latency_timer.cache_miss(); let old_node_info = invalidate_cache(node_info); let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?; node_info.reuse_settings(old_node_info); diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index cf53c6e673..c81a1a8292 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -19,8 +19,8 @@ pub async fn proxy_pass( aux: MetricsAuxInfo, ) -> anyhow::Result<()> { let usage = USAGE_METRICS.register(Ids { - endpoint_id: aux.endpoint_id.clone(), - branch_id: aux.branch_id.clone(), + endpoint_id: aux.endpoint_id, + branch_id: aux.branch_id, }); let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]); diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index a4051447c1..71d85e106d 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -12,11 +12,12 @@ use crate::auth::backend::{ }; use crate::config::CertResolver; use crate::console::caches::NodeInfoCache; +use crate::console::messages::MetricsAuxInfo; use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend}; use crate::console::{self, CachedNodeInfo, NodeInfo}; use crate::error::ErrorKind; use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT}; -use crate::{http, sasl, scram}; +use crate::{http, sasl, scram, BranchId, EndpointId, ProjectId}; use anyhow::{bail, Context}; use async_trait::async_trait; use rstest::rstest; @@ -512,7 +513,12 @@ impl TestBackend for TestConnectMechanism { fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeInfo { let node = NodeInfo { config: compute::ConnCfg::new(), - aux: Default::default(), + aux: MetricsAuxInfo { + endpoint_id: (&EndpointId::from("endpoint")).into(), + project_id: (&ProjectId::from("project")).into(), + branch_id: (&BranchId::from("branch")).into(), + cold_start_info: crate::console::messages::ColdStartInfo::Warm, + }, allow_self_signed_compute: false, }; let (_, node) = cache.insert("key".into(), node); diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index f10779d7ba..8aa5ad4e8a 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -9,7 +9,6 @@ use crate::{ config::ProxyConfig, console::{ errors::{GetAuthInfoError, WakeComputeError}, - messages::ColdStartInfo, CachedNodeInfo, }, context::RequestMonitoring, @@ -57,7 +56,10 @@ impl PoolingBackend { let auth_outcome = crate::auth::validate_password_and_exchange(&conn_info.password, secret).await?; let res = match auth_outcome { - crate::sasl::Outcome::Success(key) => Ok(key), + crate::sasl::Outcome::Success(key) => { + info!("user successfully authenticated"); + Ok(key) + } crate::sasl::Outcome::Failure(reason) => { info!("auth backend failed with an error: {reason}"); Err(AuthError::auth_failed(&*conn_info.user_info.user)) @@ -89,8 +91,6 @@ impl PoolingBackend { }; if let Some(client) = maybe_client { - info!("cold_start_info=warm"); - ctx.set_cold_start_info(ColdStartInfo::Warm); return Ok(client); } let conn_id = uuid::Uuid::new_v4(); diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index c7e8eaef76..35311facb8 100644 --- 
a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -17,7 +17,7 @@ use tokio::time::Instant; use tokio_postgres::tls::NoTlsStream; use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket}; -use crate::console::messages::MetricsAuxInfo; +use crate::console::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{ENDPOINT_POOLS, GC_LATENCY, NUM_OPEN_CLIENTS_IN_HTTP_POOL}; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; use crate::{ @@ -383,9 +383,12 @@ impl GlobalConnPool { "pid", &tracing::field::display(client.inner.get_process_id()), ); - info!("pool: reusing connection '{conn_info}'"); + info!( + cold_start_info = ColdStartInfo::HttpPoolHit.as_str(), + "pool: reusing connection '{conn_info}'" + ); client.session.send(ctx.session_id)?; - ctx.latency_timer.pool_hit(); + ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit); ctx.latency_timer.success(); return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool))); } @@ -454,8 +457,9 @@ pub fn poll_client( let (tx, mut rx) = tokio::sync::watch::channel(session_id); let span = info_span!(parent: None, "connection", %conn_id); + let cold_start_info = ctx.cold_start_info; span.in_scope(|| { - info!(%conn_info, %session_id, "new connection"); + info!(cold_start_info = cold_start_info.as_str(), %conn_info, %session_id, "new connection"); }); let pool = match conn_info.endpoint_cache_key() { Some(endpoint) => Arc::downgrade(&global_pool.get_or_create_endpoint_pool(&endpoint)), @@ -565,8 +569,8 @@ impl Client { pub fn metrics(&self) -> Arc { let aux = &self.inner.as_ref().unwrap().aux; USAGE_METRICS.register(Ids { - endpoint_id: aux.endpoint_id.clone(), - branch_id: aux.branch_id.clone(), + endpoint_id: aux.endpoint_id, + branch_id: aux.branch_id, }) } } @@ -666,6 +670,8 @@ impl Drop for Client { mod tests { use std::{mem, sync::atomic::AtomicBool}; + use crate::{BranchId, EndpointId, ProjectId}; + use super::*; struct MockClient(Arc); @@ -691,7 +697,12 @@ mod tests { ClientInner { inner: client, session: tokio::sync::watch::Sender::new(uuid::Uuid::new_v4()), - aux: Default::default(), + aux: MetricsAuxInfo { + endpoint_id: (&EndpointId::from("endpoint")).into(), + project_id: (&ProjectId::from("project")).into(), + branch_id: (&BranchId::from("branch")).into(), + cold_start_info: crate::console::messages::ColdStartInfo::Warm, + }, conn_id: uuid::Uuid::new_v4(), } } diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index b21056735d..5ffbf95c07 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -3,7 +3,8 @@ use crate::{ config::{MetricBackupCollectionConfig, MetricCollectionConfig}, context::parquet::{FAILED_UPLOAD_MAX_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD}, - http, BranchId, EndpointId, + http, + intern::{BranchIdInt, EndpointIdInt}, }; use anyhow::Context; use async_compression::tokio::write::GzipEncoder; @@ -43,8 +44,8 @@ const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60); /// because we enrich the event with project_id in the control-plane endpoint. 
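+/// The interned ID handles keep this key small and cheap to hash and clone.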
#[derive(Eq, Hash, PartialEq, Serialize, Deserialize, Debug, Clone)] pub struct Ids { - pub endpoint_id: EndpointId, - pub branch_id: BranchId, + pub endpoint_id: EndpointIdInt, + pub branch_id: BranchIdInt, } pub trait MetricCounterRecorder { @@ -494,7 +495,7 @@ mod tests { use url::Url; use super::*; - use crate::{http, rate_limiter::RateLimiterConfig}; + use crate::{http, rate_limiter::RateLimiterConfig, BranchId, EndpointId}; #[tokio::test] async fn metrics() { @@ -536,8 +537,8 @@ mod tests { // register a new counter let counter = metrics.register(Ids { - endpoint_id: "e1".into(), - branch_id: "b1".into(), + endpoint_id: (&EndpointId::from("e1")).into(), + branch_id: (&BranchId::from("b1")).into(), }); // the counter should be observed despite 0 egress From 66fc465484326f5a87760797715b0bb4959da38d Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 5 Apr 2024 16:18:00 +0100 Subject: [PATCH 0530/1571] Clean up 'attachment service' names to storage controller (#7326) The binary etc were renamed some time ago, but the path in the source tree remained "attachment_service" to avoid disruption to ongoing PRs. There aren't any big PRs out right now, so it's a good time to cut over. - Rename `attachment_service` to `storage_controller` - Move it to the top level for symmetry with `storage_broker` & to avoid mixing the non-prod neon_local stuff (`control_plane/`) with the storage controller which is a production component. --- .dockerignore | 1 + CODEOWNERS | 2 +- Cargo.lock | 78 +++++++++---------- Cargo.toml | 2 +- control_plane/storcon_cli/src/main.rs | 2 +- diesel.toml | 4 +- docs/sourcetree.md | 5 ++ libs/pageserver_api/src/controller_api.rs | 2 +- .../Cargo.toml | 10 +-- .../migrations/.keep | 0 .../down.sql | 0 .../up.sql | 0 .../down.sql | 0 .../up.sql | 0 .../2024-01-07-212945_create_nodes/down.sql | 0 .../2024-01-07-212945_create_nodes/up.sql | 0 .../down.sql | 0 .../2024-02-29-094122_generations_null/up.sql | 0 .../2024-03-18-184429_rename_policy/down.sql | 0 .../2024-03-18-184429_rename_policy/up.sql | 0 .../down.sql | 0 .../2024-03-27-133204_tenant_policies/up.sql | 0 .../src/auth.rs | 0 .../src/compute_hook.rs | 0 .../src/heartbeater.rs | 0 .../src/http.rs | 0 .../src/id_lock_map.rs | 0 .../src/lib.rs | 0 .../src/main.rs | 10 +-- .../src/metrics.rs | 0 .../src/node.rs | 0 .../src/pageserver_client.rs | 0 .../src/persistence.rs | 0 .../src/persistence/split_state.rs | 0 .../src/reconciler.rs | 0 .../src/scheduler.rs | 0 .../src/schema.rs | 0 .../src/service.rs | 0 .../src/tenant_state.rs | 0 ..._service.py => test_storage_controller.py} | 24 +++--- 40 files changed, 73 insertions(+), 67 deletions(-) rename {control_plane/attachment_service => storage_controller}/Cargo.toml (83%) rename {control_plane/attachment_service => storage_controller}/migrations/.keep (100%) rename {control_plane/attachment_service => storage_controller}/migrations/00000000000000_diesel_initial_setup/down.sql (100%) rename {control_plane/attachment_service => storage_controller}/migrations/00000000000000_diesel_initial_setup/up.sql (100%) rename {control_plane/attachment_service => storage_controller}/migrations/2024-01-07-211257_create_tenant_shards/down.sql (100%) rename {control_plane/attachment_service => storage_controller}/migrations/2024-01-07-211257_create_tenant_shards/up.sql (100%) rename {control_plane/attachment_service => storage_controller}/migrations/2024-01-07-212945_create_nodes/down.sql (100%) rename {control_plane/attachment_service => 
storage_controller}/migrations/2024-01-07-212945_create_nodes/up.sql (100%) rename {control_plane/attachment_service => storage_controller}/migrations/2024-02-29-094122_generations_null/down.sql (100%) rename {control_plane/attachment_service => storage_controller}/migrations/2024-02-29-094122_generations_null/up.sql (100%) rename {control_plane/attachment_service => storage_controller}/migrations/2024-03-18-184429_rename_policy/down.sql (100%) rename {control_plane/attachment_service => storage_controller}/migrations/2024-03-18-184429_rename_policy/up.sql (100%) rename {control_plane/attachment_service => storage_controller}/migrations/2024-03-27-133204_tenant_policies/down.sql (100%) rename {control_plane/attachment_service => storage_controller}/migrations/2024-03-27-133204_tenant_policies/up.sql (100%) rename {control_plane/attachment_service => storage_controller}/src/auth.rs (100%) rename {control_plane/attachment_service => storage_controller}/src/compute_hook.rs (100%) rename {control_plane/attachment_service => storage_controller}/src/heartbeater.rs (100%) rename {control_plane/attachment_service => storage_controller}/src/http.rs (100%) rename {control_plane/attachment_service => storage_controller}/src/id_lock_map.rs (100%) rename {control_plane/attachment_service => storage_controller}/src/lib.rs (100%) rename {control_plane/attachment_service => storage_controller}/src/main.rs (97%) rename {control_plane/attachment_service => storage_controller}/src/metrics.rs (100%) rename {control_plane/attachment_service => storage_controller}/src/node.rs (100%) rename {control_plane/attachment_service => storage_controller}/src/pageserver_client.rs (100%) rename {control_plane/attachment_service => storage_controller}/src/persistence.rs (100%) rename {control_plane/attachment_service => storage_controller}/src/persistence/split_state.rs (100%) rename {control_plane/attachment_service => storage_controller}/src/reconciler.rs (100%) rename {control_plane/attachment_service => storage_controller}/src/scheduler.rs (100%) rename {control_plane/attachment_service => storage_controller}/src/schema.rs (100%) rename {control_plane/attachment_service => storage_controller}/src/service.rs (100%) rename {control_plane/attachment_service => storage_controller}/src/tenant_state.rs (100%) rename test_runner/regress/{test_sharding_service.py => test_storage_controller.py} (98%) diff --git a/.dockerignore b/.dockerignore index 8b378b5dab..f7a6232ba1 100644 --- a/.dockerignore +++ b/.dockerignore @@ -22,6 +22,7 @@ !s3_scrubber/ !safekeeper/ !storage_broker/ +!storage_controller/ !trace/ !vendor/postgres-*/ !workspace_hack/ diff --git a/CODEOWNERS b/CODEOWNERS index 9a23e8c958..af2fa6088e 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,5 +1,5 @@ /compute_tools/ @neondatabase/control-plane @neondatabase/compute -/control_plane/attachment_service @neondatabase/storage +/storage_controller @neondatabase/storage /libs/pageserver_api/ @neondatabase/storage /libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers /libs/remote_storage/ @neondatabase/storage diff --git a/Cargo.lock b/Cargo.lock index d413641c3f..dae406e4ae 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -270,45 +270,6 @@ dependencies = [ "critical-section", ] -[[package]] -name = "attachment_service" -version = "0.1.0" -dependencies = [ - "anyhow", - "aws-config", - "bytes", - "camino", - "clap", - "control_plane", - "diesel", - "diesel_migrations", - "fail", - "futures", - "git-version", - "hex", - "humantime", - "hyper", - "itertools", - 
"lasso", - "measured", - "metrics", - "once_cell", - "pageserver_api", - "pageserver_client", - "postgres_connection", - "r2d2", - "reqwest", - "routerify", - "serde", - "serde_json", - "thiserror", - "tokio", - "tokio-util", - "tracing", - "utils", - "workspace_hack", -] - [[package]] name = "autocfg" version = "1.1.0" @@ -5623,6 +5584,45 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "storage_controller" +version = "0.1.0" +dependencies = [ + "anyhow", + "aws-config", + "bytes", + "camino", + "clap", + "control_plane", + "diesel", + "diesel_migrations", + "fail", + "futures", + "git-version", + "hex", + "humantime", + "hyper", + "itertools", + "lasso", + "measured", + "metrics", + "once_cell", + "pageserver_api", + "pageserver_client", + "postgres_connection", + "r2d2", + "reqwest", + "routerify", + "serde", + "serde_json", + "thiserror", + "tokio", + "tokio-util", + "tracing", + "utils", + "workspace_hack", +] + [[package]] name = "storcon_cli" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 510c702290..3c6077648e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,6 @@ resolver = "2" members = [ "compute_tools", "control_plane", - "control_plane/attachment_service", "control_plane/storcon_cli", "pageserver", "pageserver/compaction", @@ -13,6 +12,7 @@ members = [ "proxy", "safekeeper", "storage_broker", + "storage_controller", "s3_scrubber", "workspace_hack", "trace", diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index f72bc9a2a9..2edd09eac1 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -223,7 +223,7 @@ impl Client { } } - /// Simple HTTP request wrapper for calling into attachment service + /// Simple HTTP request wrapper for calling into storage controller async fn dispatch( &self, method: hyper::Method, diff --git a/diesel.toml b/diesel.toml index 30ed4444d7..558c54a1e1 100644 --- a/diesel.toml +++ b/diesel.toml @@ -2,8 +2,8 @@ # see https://diesel.rs/guides/configuring-diesel-cli [print_schema] -file = "control_plane/attachment_service/src/schema.rs" +file = "storage_controller/src/schema.rs" custom_type_derives = ["diesel::query_builder::QueryId"] [migrations_directory] -dir = "control_plane/attachment_service/migrations" +dir = "storage_controller/migrations" diff --git a/docs/sourcetree.md b/docs/sourcetree.md index 12fa80349e..3732bfdab2 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -7,6 +7,11 @@ Below you will find a brief overview of each subdir in the source tree in alphab Neon storage broker, providing messaging between safekeepers and pageservers. [storage_broker.md](./storage_broker.md) +`storage_controller`: + +Neon storage controller, manages a cluster of pageservers and exposes an API that enables +managing a many-sharded tenant as a single entity. + `/control_plane`: Local control plane. diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index be24d452b6..1278f17ad2 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -2,7 +2,7 @@ use std::str::FromStr; /// Request/response types for the storage controller /// API (`/control/v1` prefix). 
Implemented by the server -/// in [`attachment_service::http`] +/// in [`storage_controller::http`] use serde::{Deserialize, Serialize}; use utils::id::{NodeId, TenantId}; diff --git a/control_plane/attachment_service/Cargo.toml b/storage_controller/Cargo.toml similarity index 83% rename from control_plane/attachment_service/Cargo.toml rename to storage_controller/Cargo.toml index 595b091df4..165cafaf4e 100644 --- a/control_plane/attachment_service/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "attachment_service" +name = "storage_controller" version = "0.1.0" edition.workspace = true license.workspace = true @@ -45,8 +45,8 @@ diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] } diesel_migrations = { version = "2.1.0" } r2d2 = { version = "0.8.10" } -utils = { path = "../../libs/utils/" } -metrics = { path = "../../libs/metrics/" } -control_plane = { path = ".." } -workspace_hack = { version = "0.1", path = "../../workspace_hack" } +utils = { path = "../libs/utils/" } +metrics = { path = "../libs/metrics/" } +control_plane = { path = "../control_plane" } +workspace_hack = { version = "0.1", path = "../workspace_hack" } diff --git a/control_plane/attachment_service/migrations/.keep b/storage_controller/migrations/.keep similarity index 100% rename from control_plane/attachment_service/migrations/.keep rename to storage_controller/migrations/.keep diff --git a/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/down.sql b/storage_controller/migrations/00000000000000_diesel_initial_setup/down.sql similarity index 100% rename from control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/down.sql rename to storage_controller/migrations/00000000000000_diesel_initial_setup/down.sql diff --git a/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/up.sql b/storage_controller/migrations/00000000000000_diesel_initial_setup/up.sql similarity index 100% rename from control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/up.sql rename to storage_controller/migrations/00000000000000_diesel_initial_setup/up.sql diff --git a/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/down.sql b/storage_controller/migrations/2024-01-07-211257_create_tenant_shards/down.sql similarity index 100% rename from control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/down.sql rename to storage_controller/migrations/2024-01-07-211257_create_tenant_shards/down.sql diff --git a/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql b/storage_controller/migrations/2024-01-07-211257_create_tenant_shards/up.sql similarity index 100% rename from control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql rename to storage_controller/migrations/2024-01-07-211257_create_tenant_shards/up.sql diff --git a/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/down.sql b/storage_controller/migrations/2024-01-07-212945_create_nodes/down.sql similarity index 100% rename from control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/down.sql rename to storage_controller/migrations/2024-01-07-212945_create_nodes/down.sql diff --git a/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/up.sql b/storage_controller/migrations/2024-01-07-212945_create_nodes/up.sql similarity index 100% rename from 
control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/up.sql rename to storage_controller/migrations/2024-01-07-212945_create_nodes/up.sql diff --git a/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/down.sql b/storage_controller/migrations/2024-02-29-094122_generations_null/down.sql similarity index 100% rename from control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/down.sql rename to storage_controller/migrations/2024-02-29-094122_generations_null/down.sql diff --git a/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/up.sql b/storage_controller/migrations/2024-02-29-094122_generations_null/up.sql similarity index 100% rename from control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/up.sql rename to storage_controller/migrations/2024-02-29-094122_generations_null/up.sql diff --git a/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/down.sql b/storage_controller/migrations/2024-03-18-184429_rename_policy/down.sql similarity index 100% rename from control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/down.sql rename to storage_controller/migrations/2024-03-18-184429_rename_policy/down.sql diff --git a/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/up.sql b/storage_controller/migrations/2024-03-18-184429_rename_policy/up.sql similarity index 100% rename from control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/up.sql rename to storage_controller/migrations/2024-03-18-184429_rename_policy/up.sql diff --git a/control_plane/attachment_service/migrations/2024-03-27-133204_tenant_policies/down.sql b/storage_controller/migrations/2024-03-27-133204_tenant_policies/down.sql similarity index 100% rename from control_plane/attachment_service/migrations/2024-03-27-133204_tenant_policies/down.sql rename to storage_controller/migrations/2024-03-27-133204_tenant_policies/down.sql diff --git a/control_plane/attachment_service/migrations/2024-03-27-133204_tenant_policies/up.sql b/storage_controller/migrations/2024-03-27-133204_tenant_policies/up.sql similarity index 100% rename from control_plane/attachment_service/migrations/2024-03-27-133204_tenant_policies/up.sql rename to storage_controller/migrations/2024-03-27-133204_tenant_policies/up.sql diff --git a/control_plane/attachment_service/src/auth.rs b/storage_controller/src/auth.rs similarity index 100% rename from control_plane/attachment_service/src/auth.rs rename to storage_controller/src/auth.rs diff --git a/control_plane/attachment_service/src/compute_hook.rs b/storage_controller/src/compute_hook.rs similarity index 100% rename from control_plane/attachment_service/src/compute_hook.rs rename to storage_controller/src/compute_hook.rs diff --git a/control_plane/attachment_service/src/heartbeater.rs b/storage_controller/src/heartbeater.rs similarity index 100% rename from control_plane/attachment_service/src/heartbeater.rs rename to storage_controller/src/heartbeater.rs diff --git a/control_plane/attachment_service/src/http.rs b/storage_controller/src/http.rs similarity index 100% rename from control_plane/attachment_service/src/http.rs rename to storage_controller/src/http.rs diff --git a/control_plane/attachment_service/src/id_lock_map.rs b/storage_controller/src/id_lock_map.rs similarity index 100% rename from control_plane/attachment_service/src/id_lock_map.rs rename to storage_controller/src/id_lock_map.rs diff 
--git a/control_plane/attachment_service/src/lib.rs b/storage_controller/src/lib.rs similarity index 100% rename from control_plane/attachment_service/src/lib.rs rename to storage_controller/src/lib.rs diff --git a/control_plane/attachment_service/src/main.rs b/storage_controller/src/main.rs similarity index 97% rename from control_plane/attachment_service/src/main.rs rename to storage_controller/src/main.rs index 5150468537..3c03d6efe8 100644 --- a/control_plane/attachment_service/src/main.rs +++ b/storage_controller/src/main.rs @@ -1,13 +1,13 @@ use anyhow::{anyhow, Context}; -use attachment_service::http::make_router; -use attachment_service::metrics::preinitialize_metrics; -use attachment_service::persistence::Persistence; -use attachment_service::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT}; use camino::Utf8PathBuf; use clap::Parser; use diesel::Connection; use metrics::launch_timestamp::LaunchTimestamp; use std::sync::Arc; +use storage_controller::http::make_router; +use storage_controller::metrics::preinitialize_metrics; +use storage_controller::persistence::Persistence; +use storage_controller::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT}; use tokio::signal::unix::SignalKind; use tokio_util::sync::CancellationToken; use utils::auth::{JwtAuth, SwappableJwtAuth}; @@ -51,7 +51,7 @@ struct Cli { #[arg(short, long)] path: Option, - /// URL to connect to postgres, like postgresql://localhost:1234/attachment_service + /// URL to connect to postgres, like postgresql://localhost:1234/storage_controller #[arg(long)] database_url: Option, diff --git a/control_plane/attachment_service/src/metrics.rs b/storage_controller/src/metrics.rs similarity index 100% rename from control_plane/attachment_service/src/metrics.rs rename to storage_controller/src/metrics.rs diff --git a/control_plane/attachment_service/src/node.rs b/storage_controller/src/node.rs similarity index 100% rename from control_plane/attachment_service/src/node.rs rename to storage_controller/src/node.rs diff --git a/control_plane/attachment_service/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs similarity index 100% rename from control_plane/attachment_service/src/pageserver_client.rs rename to storage_controller/src/pageserver_client.rs diff --git a/control_plane/attachment_service/src/persistence.rs b/storage_controller/src/persistence.rs similarity index 100% rename from control_plane/attachment_service/src/persistence.rs rename to storage_controller/src/persistence.rs diff --git a/control_plane/attachment_service/src/persistence/split_state.rs b/storage_controller/src/persistence/split_state.rs similarity index 100% rename from control_plane/attachment_service/src/persistence/split_state.rs rename to storage_controller/src/persistence/split_state.rs diff --git a/control_plane/attachment_service/src/reconciler.rs b/storage_controller/src/reconciler.rs similarity index 100% rename from control_plane/attachment_service/src/reconciler.rs rename to storage_controller/src/reconciler.rs diff --git a/control_plane/attachment_service/src/scheduler.rs b/storage_controller/src/scheduler.rs similarity index 100% rename from control_plane/attachment_service/src/scheduler.rs rename to storage_controller/src/scheduler.rs diff --git a/control_plane/attachment_service/src/schema.rs b/storage_controller/src/schema.rs similarity index 100% rename from control_plane/attachment_service/src/schema.rs rename to storage_controller/src/schema.rs diff --git 
a/control_plane/attachment_service/src/service.rs b/storage_controller/src/service.rs similarity index 100% rename from control_plane/attachment_service/src/service.rs rename to storage_controller/src/service.rs diff --git a/control_plane/attachment_service/src/tenant_state.rs b/storage_controller/src/tenant_state.rs similarity index 100% rename from control_plane/attachment_service/src/tenant_state.rs rename to storage_controller/src/tenant_state.rs diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_storage_controller.py similarity index 98% rename from test_runner/regress/test_sharding_service.py rename to test_runner/regress/test_storage_controller.py index b7d97fd107..405aa22831 100644 --- a/test_runner/regress/test_sharding_service.py +++ b/test_runner/regress/test_storage_controller.py @@ -42,11 +42,11 @@ def get_node_shard_counts(env: NeonEnv, tenant_ids): return counts -def test_sharding_service_smoke( +def test_storage_controller_smoke( neon_env_builder: NeonEnvBuilder, ): """ - Test the basic lifecycle of a sharding service: + Test the basic lifecycle of a storage controller: - Restarting - Restarting a pageserver - Creating and deleting tenants and timelines @@ -204,7 +204,7 @@ def test_node_status_after_restart( env.storage_controller.consistency_check() -def test_sharding_service_passthrough( +def test_storage_controller_passthrough( neon_env_builder: NeonEnvBuilder, ): """ @@ -231,7 +231,7 @@ def test_sharding_service_passthrough( env.storage_controller.consistency_check() -def test_sharding_service_restart(neon_env_builder: NeonEnvBuilder): +def test_storage_controller_restart(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() tenant_a = env.initial_tenant tenant_b = TenantId.generate() @@ -266,7 +266,7 @@ def test_sharding_service_restart(neon_env_builder: NeonEnvBuilder): @pytest.mark.parametrize("warm_up", [True, False]) -def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: bool): +def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: bool): """ We onboard tenants to the sharding service by treating it as a 'virtual pageserver' which provides the /location_config API. This is similar to creating a tenant, @@ -420,7 +420,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: env.storage_controller.consistency_check() -def test_sharding_service_compute_hook( +def test_storage_controller_compute_hook( httpserver: HTTPServer, neon_env_builder: NeonEnvBuilder, httpserver_listen_address, @@ -533,7 +533,7 @@ def test_sharding_service_compute_hook( env.storage_controller.consistency_check() -def test_sharding_service_debug_apis(neon_env_builder: NeonEnvBuilder): +def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder): """ Verify that occasional-use debug APIs work as expected. This is a lightweight test that just hits the endpoints to check that they don't bitrot. 
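The onboarding test above describes the storage controller as a "virtual pageserver" that accepts the same /location_config calls a real pageserver does. For orientation, here is a sketch of the kind of payload involved; the field names mirror the `LocationConfig` values used further down in these patches, but the types and variants shown are simplified stand-ins rather than the real `pageserver_api` definitions.

```rust
// Illustrative only: simplified stand-ins, not the pageserver_api definitions.
#[allow(dead_code)]
#[derive(Debug)]
enum LocationConfigMode {
    AttachedSingle,
    AttachedMulti,
    AttachedStale,
    Secondary,
    Detached,
}

#[allow(dead_code)]
#[derive(Debug)]
struct LocationConfig {
    mode: LocationConfigMode,
    generation: Option<u32>,
    secondary_conf: Option<()>, // placeholder for secondary-mode settings
    shard_number: u8,
    shard_count: u8,
    shard_stripe_size: u32,
    // tenant_conf omitted in this sketch
}

fn main() {
    // Shard 0 of a single-shard tenant being attached with generation 1.
    let conf = LocationConfig {
        mode: LocationConfigMode::AttachedSingle,
        generation: Some(1),
        secondary_conf: None,
        shard_number: 0,
        shard_count: 1,
        shard_stripe_size: 32768,
    };
    println!("{conf:?}");
}
```

The `AttachedMulti` and `AttachedStale` variants appear in the tenant shard tests later in this series; everything else here is for illustration.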
@@ -594,7 +594,7 @@ def test_sharding_service_debug_apis(neon_env_builder: NeonEnvBuilder): env.storage_controller.consistency_check() -def test_sharding_service_s3_time_travel_recovery( +def test_storage_controller_s3_time_travel_recovery( neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, ): @@ -704,7 +704,7 @@ def test_sharding_service_s3_time_travel_recovery( env.storage_controller.consistency_check() -def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder): +def test_storage_controller_auth(neon_env_builder: NeonEnvBuilder): neon_env_builder.auth_enabled = True env = neon_env_builder.init_start() svc = env.storage_controller @@ -773,7 +773,7 @@ def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder): ) -def test_sharding_service_tenant_conf(neon_env_builder: NeonEnvBuilder): +def test_storage_controller_tenant_conf(neon_env_builder: NeonEnvBuilder): """ Validate the pageserver-compatible API endpoints for setting and getting tenant conf, without supplying the whole LocationConf. @@ -876,7 +876,7 @@ def build_node_to_tenants_map(env: NeonEnv) -> dict[int, list[TenantId]]: PageserverFailpoint(pageserver_id=1, failpoint="get-utilization-http-handler"), ], ) -def test_sharding_service_heartbeats( +def test_storage_controller_heartbeats( neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, failure: Failure ): neon_env_builder.num_pageservers = 2 @@ -986,7 +986,7 @@ def test_sharding_service_heartbeats( wait_until(10, 1, storage_controller_consistent) -def test_sharding_service_re_attach(neon_env_builder: NeonEnvBuilder): +def test_storage_controller_re_attach(neon_env_builder: NeonEnvBuilder): """ Exercise the behavior of the /re-attach endpoint on pageserver startup when pageservers have a mixture of attached and secondary locations From ec01292b55389be73c9a7013ed79d49dd4610cee Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 5 Apr 2024 17:29:53 +0100 Subject: [PATCH 0531/1571] storage controller: rename TenantState to TenantShard (#7329) This is a widely used type that had a misleading name: it's not the total state of a tenant, but represents one shard.
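To make the rename concrete: the controller keeps one record per shard, keyed by `TenantShardId` (tenant id plus shard number and count), so `TenantShard` names what each value actually is. Below is a minimal sketch of that shape, with placeholder types standing in for the real `TenantId`, `NodeId` and `ShardIdentity`; it is not the code from this patch.

```rust
// Minimal sketch with placeholder types, not the real definitions from tenant_shard.rs.
use std::collections::BTreeMap;

#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone, Debug)]
struct TenantShardId {
    tenant_id: u128, // stand-in for TenantId
    shard_number: u8,
    shard_count: u8,
}

struct TenantShard {
    tenant_shard_id: TenantShardId,
    attached_to: Option<u64>, // stand-in for Option<NodeId>; intent/observed/policy omitted
}

// One entry per shard, so a 4-shard tenant has four independent records.
type ShardMap = BTreeMap<TenantShardId, TenantShard>;

fn main() {
    let mut shards: ShardMap = BTreeMap::new();
    for shard_number in 0..4 {
        let id = TenantShardId {
            tenant_id: 1,
            shard_number,
            shard_count: 4,
        };
        shards.insert(id, TenantShard { tenant_shard_id: id, attached_to: None });
    }
    assert_eq!(shards.len(), 4);
    for shard in shards.values() {
        println!("{:?} attached to {:?}", shard.tenant_shard_id, shard.attached_to);
    }
}
```

The real type additionally carries intent, observed state, generation and placement policy, as the `tenant_shard.rs` diff below shows.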
--- storage_controller/src/lib.rs | 2 +- storage_controller/src/persistence.rs | 2 +- storage_controller/src/reconciler.rs | 10 +- storage_controller/src/scheduler.rs | 10 +- storage_controller/src/service.rs | 120 +++++++++--------- .../src/{tenant_state.rs => tenant_shard.rs} | 88 ++++++------- 6 files changed, 116 insertions(+), 116 deletions(-) rename storage_controller/src/{tenant_state.rs => tenant_shard.rs} (96%) diff --git a/storage_controller/src/lib.rs b/storage_controller/src/lib.rs index 8bcd5c0ac4..2ea490a14b 100644 --- a/storage_controller/src/lib.rs +++ b/storage_controller/src/lib.rs @@ -14,7 +14,7 @@ mod reconciler; mod scheduler; mod schema; pub mod service; -mod tenant_state; +mod tenant_shard; #[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone, Serialize)] struct Sequence(u64); diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index d60392bdbc..55fbfd10bc 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -696,7 +696,7 @@ impl Persistence { } } -/// Parts of [`crate::tenant_state::TenantState`] that are stored durably +/// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably #[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)] #[diesel(table_name = crate::schema::tenant_shards)] pub(crate) struct TenantShardPersistence { diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 72eb8faccb..49cfaad569 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -18,14 +18,14 @@ use utils::sync::gate::GateGuard; use crate::compute_hook::{ComputeHook, NotifyError}; use crate::node::Node; -use crate::tenant_state::{IntentState, ObservedState, ObservedStateLocation}; +use crate::tenant_shard::{IntentState, ObservedState, ObservedStateLocation}; const DEFAULT_HEATMAP_PERIOD: &str = "60s"; /// Object with the lifetime of the background reconcile task that is created /// for tenants which have a difference between their intent and observed states. pub(super) struct Reconciler { - /// See [`crate::tenant_state::TenantState`] for the meanings of these fields: they are a snapshot + /// See [`crate::tenant_shard::TenantShard`] for the meanings of these fields: they are a snapshot /// of a tenant's state from when we spawned a reconcile task. pub(super) tenant_shard_id: TenantShardId, pub(crate) shard: ShardIdentity, @@ -48,11 +48,11 @@ pub(super) struct Reconciler { /// To avoid stalling if the cloud control plane is unavailable, we may proceed /// past failures in [`ComputeHook::notify`], but we _must_ remember that we failed - /// so that we can set [`crate::tenant_state::TenantState::pending_compute_notification`] to ensure a later retry. + /// so that we can set [`crate::tenant_shard::TenantShard::pending_compute_notification`] to ensure a later retry. pub(crate) compute_notify_failure: bool, /// A means to abort background reconciliation: it is essential to - /// call this when something changes in the original TenantState that + /// call this when something changes in the original TenantShard that /// will make this reconciliation impossible or unnecessary, for /// example when a pageserver node goes offline, or the PlacementPolicy for /// the tenant is changed. 
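The doc comments above capture the controller's core model: each shard has an intent (where it should be attached) and an observed state (what the pageservers last reported), and a `Reconciler` task only exists while the two differ. The following is a rough sketch of that comparison using invented stand-in types; the real decision lives in `TenantShard::maybe_reconcile` and is considerably richer.

```rust
// Rough sketch only: invented stand-in types, not the real IntentState /
// ObservedState definitions from tenant_shard.rs.
use std::collections::HashMap;

type NodeId = u64;

struct LocationConf {
    attached: bool,
}

struct IntentState {
    attached: Option<NodeId>,
}

struct ObservedState {
    // None means "config unknown", e.g. the node went offline since we last heard from it.
    locations: HashMap<NodeId, Option<LocationConf>>,
}

/// Decide whether a background Reconciler is needed for one shard.
fn needs_reconcile(intent: &IntentState, observed: &ObservedState) -> bool {
    // Any location with an unknown config must be re-checked.
    if observed.locations.values().any(|conf| conf.is_none()) {
        return true;
    }
    match intent.attached {
        // The intended node must actually be observed as attached.
        Some(node) => !matches!(
            observed.locations.get(&node),
            Some(Some(conf)) if conf.attached
        ),
        // No intended attachment: reconcile if anything is still attached somewhere.
        None => observed
            .locations
            .values()
            .any(|conf| matches!(conf, Some(c) if c.attached)),
    }
}

fn main() {
    let intent = IntentState { attached: Some(1) };

    // Node 1's config is unknown (it just went offline): reconcile.
    let unknown = ObservedState {
        locations: HashMap::from([(1, None)]),
    };
    assert!(needs_reconcile(&intent, &unknown));

    // Node 1 is observed attached, matching the intent: nothing to do.
    let settled = ObservedState {
        locations: HashMap::from([(1, Some(LocationConf { attached: true }))]),
    };
    assert!(!needs_reconcile(&intent, &settled));
}
```

Treating an unknown observed config as "must reconcile" is what the node-offline handling relies on further down in this patch: marking a node offline clears its observed config, which forces a later re-check once the node returns.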
@@ -66,7 +66,7 @@ pub(super) struct Reconciler { pub(crate) persistence: Arc, } -/// This is a snapshot of [`crate::tenant_state::IntentState`], but it does not do any +/// This is a snapshot of [`crate::tenant_shard::IntentState`], but it does not do any /// reference counting for Scheduler. The IntentState is what the scheduler works with, /// and the TargetState is just the instruction for a particular Reconciler run. #[derive(Debug)] diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index 782189d11f..862ac0cbfe 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -1,4 +1,4 @@ -use crate::{node::Node, tenant_state::TenantState}; +use crate::{node::Node, tenant_shard::TenantShard}; use pageserver_api::controller_api::UtilizationScore; use serde::Serialize; use std::collections::HashMap; @@ -27,7 +27,7 @@ pub enum MaySchedule { #[derive(Serialize)] struct SchedulerNode { - /// How many shards are currently scheduled on this node, via their [`crate::tenant_state::IntentState`]. + /// How many shards are currently scheduled on this node, via their [`crate::tenant_shard::IntentState`]. shard_count: usize, /// Whether this node is currently elegible to have new shards scheduled (this is derived @@ -84,7 +84,7 @@ impl std::ops::Add for AffinityScore { } } -// For carrying state between multiple calls to [`TenantState::schedule`], e.g. when calling +// For carrying state between multiple calls to [`TenantShard::schedule`], e.g. when calling // it for many shards in the same tenant. #[derive(Debug, Default)] pub(crate) struct ScheduleContext { @@ -147,7 +147,7 @@ impl Scheduler { pub(crate) fn consistency_check<'a>( &self, nodes: impl Iterator, - shards: impl Iterator, + shards: impl Iterator, ) -> anyhow::Result<()> { let mut expect_nodes: HashMap = HashMap::new(); for node in nodes { @@ -398,7 +398,7 @@ pub(crate) mod test_utils { mod tests { use super::*; - use crate::tenant_state::IntentState; + use crate::tenant_shard::IntentState; #[test] fn scheduler_basic() -> anyhow::Result<()> { let nodes = test_utils::make_test_nodes(2); diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 0f87a8ab05..010558b797 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -66,9 +66,9 @@ use crate::{ persistence::{split_state::SplitState, DatabaseError, Persistence, TenantShardPersistence}, reconciler::attached_location_conf, scheduler::Scheduler, - tenant_state::{ + tenant_shard::{ IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError, - ReconcilerWaiter, TenantState, + ReconcilerWaiter, TenantShard, }, }; @@ -92,7 +92,7 @@ pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30); // Top level state available to all HTTP handlers struct ServiceState { - tenants: BTreeMap, + tenants: BTreeMap, nodes: Arc>, @@ -102,7 +102,7 @@ struct ServiceState { impl ServiceState { fn new( nodes: HashMap, - tenants: BTreeMap, + tenants: BTreeMap, scheduler: Scheduler, ) -> Self { Self { @@ -116,7 +116,7 @@ impl ServiceState { &mut self, ) -> ( &mut Arc>, - &mut BTreeMap, + &mut BTreeMap, &mut Scheduler, ) { (&mut self.nodes, &mut self.tenants, &mut self.scheduler) @@ -335,11 +335,11 @@ impl Service { for (tenant_shard_id, shard_observations) in observed { for (node_id, observed_loc) in shard_observations { - let Some(tenant_state) = tenants.get_mut(&tenant_shard_id) else { + let Some(tenant_shard) = 
tenants.get_mut(&tenant_shard_id) else { cleanup.push((tenant_shard_id, node_id)); continue; }; - tenant_state + tenant_shard .observed .locations .insert(node_id, ObservedStateLocation { conf: observed_loc }); @@ -348,14 +348,14 @@ impl Service { // Populate each tenant's intent state let mut schedule_context = ScheduleContext::default(); - for (tenant_shard_id, tenant_state) in tenants.iter_mut() { + for (tenant_shard_id, tenant_shard) in tenants.iter_mut() { if tenant_shard_id.shard_number == ShardNumber(0) { // Reset scheduling context each time we advance to the next Tenant schedule_context = ScheduleContext::default(); } - tenant_state.intent_from_observed(scheduler); - if let Err(e) = tenant_state.schedule(scheduler, &mut schedule_context) { + tenant_shard.intent_from_observed(scheduler); + if let Err(e) = tenant_shard.schedule(scheduler, &mut schedule_context) { // Non-fatal error: we are unable to properly schedule the tenant, perhaps because // not enough pageservers are available. The tenant may well still be available // to clients. @@ -364,11 +364,11 @@ impl Service { // If we're both intending and observed to be attached at a particular node, we will // emit a compute notification for this. In the case where our observed state does not // yet match our intent, we will eventually reconcile, and that will emit a compute notification. - if let Some(attached_at) = tenant_state.stably_attached() { + if let Some(attached_at) = tenant_shard.stably_attached() { compute_notifications.push(( *tenant_shard_id, attached_at, - tenant_state.shard.stripe_size, + tenant_shard.shard.stripe_size, )); } } @@ -743,7 +743,7 @@ impl Service { /// Apply the contents of a [`ReconcileResult`] to our in-memory state: if the reconciliation /// was successful, this will update the observed state of the tenant such that subsequent - /// calls to [`TenantState::maybe_reconcile`] will do nothing. + /// calls to [`TenantShard::maybe_reconcile`] will do nothing. #[instrument(skip_all, fields( tenant_id=%result.tenant_shard_id.tenant_id, shard_id=%result.tenant_shard_id.shard_slug(), sequence=%result.sequence @@ -761,10 +761,10 @@ impl Service { tenant.generation = std::cmp::max(tenant.generation, result.generation); // If the reconciler signals that it failed to notify compute, set this state on - // the shard so that a future [`TenantState::maybe_reconcile`] will try again. + // the shard so that a future [`TenantShard::maybe_reconcile`] will try again. tenant.pending_compute_notification = result.pending_compute_notification; - // Let the TenantState know it is idle. + // Let the TenantShard know it is idle. 
tenant.reconcile_complete(result.sequence); match result.result { @@ -979,7 +979,7 @@ impl Service { if let Some(generation_pageserver) = tsp.generation_pageserver { intent.set_attached(&mut scheduler, Some(NodeId(generation_pageserver as u64))); } - let new_tenant = TenantState::from_persistent(tsp, intent)?; + let new_tenant = TenantShard::from_persistent(tsp, intent)?; tenants.insert(tenant_shard_id, new_tenant); } @@ -1126,7 +1126,7 @@ impl Service { let mut locked = self.inner.write().unwrap(); locked.tenants.insert( attach_req.tenant_shard_id, - TenantState::new( + TenantShard::new( attach_req.tenant_shard_id, ShardIdentity::unsharded(), PlacementPolicy::Attached(0), @@ -1178,32 +1178,32 @@ impl Service { let mut locked = self.inner.write().unwrap(); let (_nodes, tenants, scheduler) = locked.parts_mut(); - let tenant_state = tenants + let tenant_shard = tenants .get_mut(&attach_req.tenant_shard_id) .expect("Checked for existence above"); if let Some(new_generation) = new_generation { - tenant_state.generation = Some(new_generation); - tenant_state.policy = PlacementPolicy::Attached(0); + tenant_shard.generation = Some(new_generation); + tenant_shard.policy = PlacementPolicy::Attached(0); } else { // This is a detach notification. We must update placement policy to avoid re-attaching // during background scheduling/reconciliation, or during storage controller restart. assert!(attach_req.node_id.is_none()); - tenant_state.policy = PlacementPolicy::Detached; + tenant_shard.policy = PlacementPolicy::Detached; } if let Some(attaching_pageserver) = attach_req.node_id.as_ref() { tracing::info!( tenant_id = %attach_req.tenant_shard_id, ps_id = %attaching_pageserver, - generation = ?tenant_state.generation, + generation = ?tenant_shard.generation, "issuing", ); - } else if let Some(ps_id) = tenant_state.intent.get_attached() { + } else if let Some(ps_id) = tenant_shard.intent.get_attached() { tracing::info!( tenant_id = %attach_req.tenant_shard_id, %ps_id, - generation = ?tenant_state.generation, + generation = ?tenant_shard.generation, "dropping", ); } else { @@ -1211,14 +1211,14 @@ impl Service { tenant_id = %attach_req.tenant_shard_id, "no-op: tenant already has no pageserver"); } - tenant_state + tenant_shard .intent .set_attached(scheduler, attach_req.node_id); tracing::info!( "attach_hook: tenant {} set generation {:?}, pageserver {}", attach_req.tenant_shard_id, - tenant_state.generation, + tenant_shard.generation, // TODO: this is an odd number of 0xf's attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff)) ); @@ -1230,36 +1230,36 @@ impl Service { #[cfg(feature = "testing")] { if let Some(node_id) = attach_req.node_id { - tenant_state.observed.locations = HashMap::from([( + tenant_shard.observed.locations = HashMap::from([( node_id, ObservedStateLocation { conf: Some(attached_location_conf( - tenant_state.generation.unwrap(), - &tenant_state.shard, - &tenant_state.config, + tenant_shard.generation.unwrap(), + &tenant_shard.shard, + &tenant_shard.config, false, )), }, )]); } else { - tenant_state.observed.locations.clear(); + tenant_shard.observed.locations.clear(); } } Ok(AttachHookResponse { gen: attach_req .node_id - .map(|_| tenant_state.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap()), + .map(|_| tenant_shard.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap()), }) } pub(crate) fn inspect(&self, inspect_req: InspectRequest) -> InspectResponse { let 
locked = self.inner.read().unwrap(); - let tenant_state = locked.tenants.get(&inspect_req.tenant_shard_id); + let tenant_shard = locked.tenants.get(&inspect_req.tenant_shard_id); InspectResponse { - attachment: tenant_state.and_then(|s| { + attachment: tenant_shard.and_then(|s| { s.intent .get_attached() .map(|ps| (s.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap(), ps)) @@ -1321,11 +1321,11 @@ impl Service { let mut locked = self.inner.write().unwrap(); for (tenant_shard_id, observed_loc) in configs.tenant_shards { - let Some(tenant_state) = locked.tenants.get_mut(&tenant_shard_id) else { + let Some(tenant_shard) = locked.tenants.get_mut(&tenant_shard_id) else { cleanup.push(tenant_shard_id); continue; }; - tenant_state + tenant_shard .observed .locations .insert(node.get_id(), ObservedStateLocation { conf: observed_loc }); @@ -1496,13 +1496,13 @@ impl Service { }; for req_tenant in validate_req.tenants { - if let Some(tenant_state) = locked.tenants.get(&req_tenant.id) { - let valid = tenant_state.generation == Some(Generation::new(req_tenant.gen)); + if let Some(tenant_shard) = locked.tenants.get(&req_tenant.id) { + let valid = tenant_shard.generation == Some(Generation::new(req_tenant.gen)); tracing::info!( "handle_validate: {}(gen {}): valid={valid} (latest {:?})", req_tenant.id, req_tenant.gen, - tenant_state.generation + tenant_shard.generation ); response.tenants.push(ValidateResponseTenant { id: req_tenant.id, @@ -1688,7 +1688,7 @@ impl Service { continue; } Entry::Vacant(entry) => { - let state = entry.insert(TenantState::new( + let state = entry.insert(TenantShard::new( tenant_shard_id, ShardIdentity::from_params( tenant_shard_id.shard_number, @@ -2738,7 +2738,7 @@ impl Service { /// Returns None if the input iterator of shards does not include a shard with number=0 fn tenant_describe_impl<'a>( &self, - shards: impl Iterator, + shards: impl Iterator, ) -> Option { let mut shard_zero = None; let mut describe_shards = Vec::new(); @@ -3038,7 +3038,7 @@ impl Service { }, ); - let mut child_state = TenantState::new(child, child_shard, policy.clone()); + let mut child_state = TenantShard::new(child, child_shard, policy.clone()); child_state.intent = IntentState::single(scheduler, Some(pageserver)); child_state.observed = ObservedState { locations: child_observed, @@ -3046,7 +3046,7 @@ impl Service { child_state.generation = Some(generation); child_state.config = config.clone(); - // The child's TenantState::splitting is intentionally left at the default value of Idle, + // The child's TenantShard::splitting is intentionally left at the default value of Idle, // as at this point in the split process we have succeeded and this part is infallible: // we will never need to do any special recovery from this state. @@ -3595,8 +3595,8 @@ impl Service { Ok(()) } - /// For debug/support: a full JSON dump of TenantStates. Returns a response so that - /// we don't have to make TenantState clonable in the return path. + /// For debug/support: a full JSON dump of TenantShards. Returns a response so that + /// we don't have to make TenantShard clonable in the return path. pub(crate) fn tenants_dump(&self) -> Result, ApiError> { let serialized = { let locked = self.inner.read().unwrap(); @@ -3700,7 +3700,7 @@ impl Service { } /// For debug/support: a JSON dump of the [`Scheduler`]. Returns a response so that - /// we don't have to make TenantState clonable in the return path. 
+ /// we don't have to make TenantShard clonable in the return path. pub(crate) fn scheduler_dump(&self) -> Result, ApiError> { let serialized = { let locked = self.inner.read().unwrap(); @@ -3917,8 +3917,8 @@ impl Service { tracing::info!("Node {} transition to offline", node_id); let mut tenants_affected: usize = 0; - for (tenant_shard_id, tenant_state) in tenants { - if let Some(observed_loc) = tenant_state.observed.locations.get_mut(&node_id) { + for (tenant_shard_id, tenant_shard) in tenants { + if let Some(observed_loc) = tenant_shard.observed.locations.get_mut(&node_id) { // When a node goes offline, we set its observed configuration to None, indicating unknown: we will // not assume our knowledge of the node's configuration is accurate until it comes back online observed_loc.conf = None; @@ -3931,24 +3931,24 @@ impl Service { continue; } - if tenant_state.intent.demote_attached(node_id) { - tenant_state.sequence = tenant_state.sequence.next(); + if tenant_shard.intent.demote_attached(node_id) { + tenant_shard.sequence = tenant_shard.sequence.next(); // TODO: populate a ScheduleContext including all shards in the same tenant_id (only matters // for tenants without secondary locations: if they have a secondary location, then this // schedule() call is just promoting an existing secondary) let mut schedule_context = ScheduleContext::default(); - match tenant_state.schedule(scheduler, &mut schedule_context) { + match tenant_shard.schedule(scheduler, &mut schedule_context) { Err(e) => { // It is possible that some tenants will become unschedulable when too many pageservers // go offline: in this case there isn't much we can do other than make the issue observable. - // TODO: give TenantState a scheduling error attribute to be queried later. + // TODO: give TenantShard a scheduling error attribute to be queried later. tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", node_id); } Ok(()) => { if self - .maybe_reconcile_shard(tenant_state, &new_nodes) + .maybe_reconcile_shard(tenant_shard, &new_nodes) .is_some() { tenants_affected += 1; @@ -3967,10 +3967,10 @@ impl Service { tracing::info!("Node {} transition to active", node_id); // When a node comes back online, we must reconcile any tenant that has a None observed // location on the node. - for tenant_state in locked.tenants.values_mut() { - if let Some(observed_loc) = tenant_state.observed.locations.get_mut(&node_id) { + for tenant_shard in locked.tenants.values_mut() { + if let Some(observed_loc) = tenant_shard.observed.locations.get_mut(&node_id) { if observed_loc.conf.is_none() { - self.maybe_reconcile_shard(tenant_state, &new_nodes); + self.maybe_reconcile_shard(tenant_shard, &new_nodes); } } } @@ -4053,11 +4053,11 @@ impl Service { Ok(()) } - /// Convenience wrapper around [`TenantState::maybe_reconcile`] that provides + /// Convenience wrapper around [`TenantShard::maybe_reconcile`] that provides /// all the references to parts of Self that are needed fn maybe_reconcile_shard( &self, - shard: &mut TenantState, + shard: &mut TenantShard, nodes: &Arc>, ) -> Option { shard.maybe_reconcile( @@ -4123,7 +4123,7 @@ impl Service { let mut reconciles_spawned = 0; - let mut tenant_shards: Vec<&TenantState> = Vec::new(); + let mut tenant_shards: Vec<&TenantShard> = Vec::new(); // Limit on how many shards' optmizations each call to this function will execute. 
Combined // with the frequency of background calls, this acts as an implicit rate limit that runs a small @@ -4254,7 +4254,7 @@ impl Service { pub async fn shutdown(&self) { // Note that this already stops processing any results from reconciles: so - // we do not expect that our [`TenantState`] objects will reach a neat + // we do not expect that our [`TenantShard`] objects will reach a neat // final state. self.cancel.cancel(); diff --git a/storage_controller/src/tenant_state.rs b/storage_controller/src/tenant_shard.rs similarity index 96% rename from storage_controller/src/tenant_state.rs rename to storage_controller/src/tenant_shard.rs index 6717b8e178..58b8ef8d5d 100644 --- a/storage_controller/src/tenant_state.rs +++ b/storage_controller/src/tenant_shard.rs @@ -50,7 +50,7 @@ where /// This struct implement Serialize for debugging purposes, but is _not_ persisted /// itself: see [`crate::persistence`] for the subset of tenant shard state that is persisted. #[derive(Serialize)] -pub(crate) struct TenantState { +pub(crate) struct TenantShard { pub(crate) tenant_shard_id: TenantShardId, pub(crate) shard: ShardIdentity, @@ -354,7 +354,7 @@ pub(crate) struct ReconcilerHandle { } /// When a reconcile task completes, it sends this result object -/// to be applied to the primary TenantState. +/// to be applied to the primary TenantShard. pub(crate) struct ReconcileResult { pub(crate) sequence: Sequence, /// On errors, `observed` should be treated as an incompleted description @@ -367,7 +367,7 @@ pub(crate) struct ReconcileResult { pub(crate) generation: Option, pub(crate) observed: ObservedState, - /// Set [`TenantState::pending_compute_notification`] from this flag + /// Set [`TenantShard::pending_compute_notification`] from this flag pub(crate) pending_compute_notification: bool, } @@ -379,7 +379,7 @@ impl ObservedState { } } -impl TenantState { +impl TenantShard { pub(crate) fn new( tenant_shard_id: TenantShardId, shard: ShardIdentity, @@ -1143,7 +1143,7 @@ pub(crate) mod tests { use super::*; - fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantState { + fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantShard { let tenant_id = TenantId::generate(); let shard_number = ShardNumber(0); let shard_count = ShardCount::new(1); @@ -1153,7 +1153,7 @@ pub(crate) mod tests { shard_number, shard_count, }; - TenantState::new( + TenantShard::new( tenant_shard_id, ShardIdentity::new( shard_number, @@ -1165,7 +1165,7 @@ pub(crate) mod tests { ) } - fn make_test_tenant(policy: PlacementPolicy, shard_count: ShardCount) -> Vec { + fn make_test_tenant(policy: PlacementPolicy, shard_count: ShardCount) -> Vec { let tenant_id = TenantId::generate(); (0..shard_count.count()) @@ -1177,7 +1177,7 @@ pub(crate) mod tests { shard_number, shard_count, }; - TenantState::new( + TenantShard::new( tenant_shard_id, ShardIdentity::new( shard_number, @@ -1202,24 +1202,24 @@ pub(crate) mod tests { let mut scheduler = Scheduler::new(nodes.values()); let mut context = ScheduleContext::default(); - let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1)); - tenant_state + let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1)); + tenant_shard .schedule(&mut scheduler, &mut context) .expect("we have enough nodes, scheduling should work"); // Expect to initially be schedule on to different nodes - assert_eq!(tenant_state.intent.secondary.len(), 1); - assert!(tenant_state.intent.attached.is_some()); + assert_eq!(tenant_shard.intent.secondary.len(), 1); + 
assert!(tenant_shard.intent.attached.is_some()); - let attached_node_id = tenant_state.intent.attached.unwrap(); - let secondary_node_id = *tenant_state.intent.secondary.iter().last().unwrap(); + let attached_node_id = tenant_shard.intent.attached.unwrap(); + let secondary_node_id = *tenant_shard.intent.secondary.iter().last().unwrap(); assert_ne!(attached_node_id, secondary_node_id); // Notifying the attached node is offline should demote it to a secondary - let changed = tenant_state.intent.demote_attached(attached_node_id); + let changed = tenant_shard.intent.demote_attached(attached_node_id); assert!(changed); - assert!(tenant_state.intent.attached.is_none()); - assert_eq!(tenant_state.intent.secondary.len(), 2); + assert!(tenant_shard.intent.attached.is_none()); + assert_eq!(tenant_shard.intent.secondary.len(), 2); // Update the scheduler state to indicate the node is offline nodes @@ -1229,18 +1229,18 @@ pub(crate) mod tests { scheduler.node_upsert(nodes.get(&attached_node_id).unwrap()); // Scheduling the node should promote the still-available secondary node to attached - tenant_state + tenant_shard .schedule(&mut scheduler, &mut context) .expect("active nodes are available"); - assert_eq!(tenant_state.intent.attached.unwrap(), secondary_node_id); + assert_eq!(tenant_shard.intent.attached.unwrap(), secondary_node_id); // The original attached node should have been retained as a secondary assert_eq!( - *tenant_state.intent.secondary.iter().last().unwrap(), + *tenant_shard.intent.secondary.iter().last().unwrap(), attached_node_id ); - tenant_state.intent.clear(&mut scheduler); + tenant_shard.intent.clear(&mut scheduler); Ok(()) } @@ -1250,48 +1250,48 @@ pub(crate) mod tests { let nodes = make_test_nodes(3); let mut scheduler = Scheduler::new(nodes.values()); - let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1)); + let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1)); - tenant_state.observed.locations.insert( + tenant_shard.observed.locations.insert( NodeId(3), ObservedStateLocation { conf: Some(LocationConfig { mode: LocationConfigMode::AttachedMulti, generation: Some(2), secondary_conf: None, - shard_number: tenant_state.shard.number.0, - shard_count: tenant_state.shard.count.literal(), - shard_stripe_size: tenant_state.shard.stripe_size.0, + shard_number: tenant_shard.shard.number.0, + shard_count: tenant_shard.shard.count.literal(), + shard_stripe_size: tenant_shard.shard.stripe_size.0, tenant_conf: TenantConfig::default(), }), }, ); - tenant_state.observed.locations.insert( + tenant_shard.observed.locations.insert( NodeId(2), ObservedStateLocation { conf: Some(LocationConfig { mode: LocationConfigMode::AttachedStale, generation: Some(1), secondary_conf: None, - shard_number: tenant_state.shard.number.0, - shard_count: tenant_state.shard.count.literal(), - shard_stripe_size: tenant_state.shard.stripe_size.0, + shard_number: tenant_shard.shard.number.0, + shard_count: tenant_shard.shard.count.literal(), + shard_stripe_size: tenant_shard.shard.stripe_size.0, tenant_conf: TenantConfig::default(), }), }, ); - tenant_state.intent_from_observed(&mut scheduler); + tenant_shard.intent_from_observed(&mut scheduler); // The highest generationed attached location gets used as attached - assert_eq!(tenant_state.intent.attached, Some(NodeId(3))); + assert_eq!(tenant_shard.intent.attached, Some(NodeId(3))); // Other locations get used as secondary - assert_eq!(tenant_state.intent.secondary, vec![NodeId(2)]); + 
assert_eq!(tenant_shard.intent.secondary, vec![NodeId(2)]); - scheduler.consistency_check(nodes.values(), [&tenant_state].into_iter())?; + scheduler.consistency_check(nodes.values(), [&tenant_shard].into_iter())?; - tenant_state.intent.clear(&mut scheduler); + tenant_shard.intent.clear(&mut scheduler); Ok(()) } @@ -1300,23 +1300,23 @@ pub(crate) mod tests { let nodes = make_test_nodes(3); let mut scheduler = Scheduler::new(nodes.values()); - let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1)); + let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1)); // In pause mode, schedule() shouldn't do anything - tenant_state.scheduling_policy = ShardSchedulingPolicy::Pause; - assert!(tenant_state + tenant_shard.scheduling_policy = ShardSchedulingPolicy::Pause; + assert!(tenant_shard .schedule(&mut scheduler, &mut ScheduleContext::default()) .is_ok()); - assert!(tenant_state.intent.all_pageservers().is_empty()); + assert!(tenant_shard.intent.all_pageservers().is_empty()); // In active mode, schedule() works - tenant_state.scheduling_policy = ShardSchedulingPolicy::Active; - assert!(tenant_state + tenant_shard.scheduling_policy = ShardSchedulingPolicy::Active; + assert!(tenant_shard .schedule(&mut scheduler, &mut ScheduleContext::default()) .is_ok()); - assert!(!tenant_state.intent.all_pageservers().is_empty()); + assert!(!tenant_shard.intent.all_pageservers().is_empty()); - tenant_state.intent.clear(&mut scheduler); + tenant_shard.intent.clear(&mut scheduler); Ok(()) } @@ -1429,7 +1429,7 @@ pub(crate) mod tests { fn optimize_til_idle( nodes: &HashMap, scheduler: &mut Scheduler, - shards: &mut [TenantState], + shards: &mut [TenantShard], ) { let mut loop_n = 0; loop { From 534c099b42f9282cbb2494e771c8492d4d59e702 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 5 Apr 2024 18:01:31 +0100 Subject: [PATCH 0532/1571] tests: improve stability of `test_deletion_queue_recovery` (#7325) ## Problem As https://github.com/neondatabase/neon/issues/6092 points out, this test was (ab)using a failpoint!() with 'pause', which was occasionally causing index uploads to get hung on a stuck executor thread, resulting in timeouts waiting for remote_consistent_lsn. That is one of several failure modes, but by far the most frequent. ## Summary of changes - Replace the failpoint! with a `sleep_millis_async`, which is not only async but also supports clean shutdown. 
- Improve debugging: log the consistent LSN when scheduling an index upload - Tidy: remove an unnecessary checkpoint in the test code, where last_flush_lsn_upload had just been called (this does a checkpoint internally) --- pageserver/src/control_plane_client.rs | 7 +++++-- pageserver/src/tenant/remote_timeline_client.rs | 6 +++--- test_runner/regress/test_pageserver_generations.py | 6 ++---- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs index 42c800822b..f0ed46ce23 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/control_plane_client.rs @@ -12,7 +12,7 @@ use pageserver_api::{ use serde::{de::DeserializeOwned, Serialize}; use tokio_util::sync::CancellationToken; use url::Url; -use utils::{backoff, generation::Generation, id::NodeId}; +use utils::{backoff, failpoint_support, generation::Generation, id::NodeId}; use crate::{ config::{NodeMetadata, PageServerConf}, @@ -210,7 +210,10 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient { .collect(), }; - fail::fail_point!("control-plane-client-validate"); + failpoint_support::sleep_millis_async!("control-plane-client-validate-sleep", &self.cancel); + if self.cancel.is_cancelled() { + return Err(RetryForeverError::ShuttingDown); + } let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?; diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 9b1b5e7ed5..3879135f26 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -593,14 +593,14 @@ impl RemoteTimelineClient { upload_queue: &mut UploadQueueInitialized, metadata: TimelineMetadata, ) { + let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn(); + info!( - "scheduling metadata upload with {} files ({} changed)", + "scheduling metadata upload up to consistent LSN {disk_consistent_lsn} with {} files ({} changed)", upload_queue.latest_files.len(), upload_queue.latest_files_changes_since_metadata_upload_scheduled, ); - let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn(); - let index_part = IndexPart::new( upload_queue.latest_files.clone(), disk_consistent_lsn, diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 4767f2edb1..7020a61b2f 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -111,7 +111,6 @@ def generate_uploads_and_deletions( last_flush_lsn_upload( env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id ) - ps_http.timeline_checkpoint(tenant_id, timeline_id) # Compaction should generate some GC-elegible layers for i in range(0, 2): @@ -385,9 +384,8 @@ def test_deletion_queue_recovery( if validate_before == ValidateBefore.NO_VALIDATE: failpoints.append( # Prevent deletion lists from being validated, we will test that they are - # dropped properly during recovery. 'pause' is okay here because we kill - # the pageserver with immediate=true - ("control-plane-client-validate", "pause") + # dropped properly during recovery. 
This is such a long sleep as to be equivalent to "never" + ("control-plane-client-validate", "return(3600000)") ) ps_http.configure_failpoints(failpoints) From 4fc95d2d71c4a3c31d5769762266be2b851d3f7b Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 5 Apr 2024 18:07:35 +0100 Subject: [PATCH 0533/1571] pageserver: apply shard filtering to blocks ingested during initdb (#7319) ## Problem Ingest filtering wasn't being applied to timeline creations, so a timeline created on a sharded tenant would use 20MB+ on each shard (each shard got a full copy). This didn't break anything, but is inefficient and leaves the system in a harder-to-validate state where shards initially have some data that they will eventually drop during compaction. Closes: https://github.com/neondatabase/neon/issues/6649 ## Summary of changes - in `import_rel`, filter block-by-block with is_key_local - During test_sharding_smoke, check that per-shard physical sizes are as expected - Also extend the test to check deletion works as expected (this was an outstanding tech debt task) --- pageserver/src/import_datadir.rs | 6 +++- test_runner/fixtures/workload.py | 6 +++- test_runner/regress/test_sharding.py | 43 ++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 2 deletions(-) diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 343dec2ca1..ed409d3130 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -8,6 +8,7 @@ use anyhow::{bail, ensure, Context, Result}; use bytes::Bytes; use camino::Utf8Path; use futures::StreamExt; +use pageserver_api::key::rel_block_to_key; use tokio::io::{AsyncRead, AsyncReadExt}; use tokio_tar::Archive; use tracing::*; @@ -170,7 +171,10 @@ async fn import_rel( let r = reader.read_exact(&mut buf).await; match r { Ok(_) => { - modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?; + let key = rel_block_to_key(rel, blknum); + if modification.tline.get_shard_identity().is_key_local(&key) { + modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?; + } } // TODO: UnexpectedEof is expected diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py index 4ebc02e6fd..364b8a1cf0 100644 --- a/test_runner/fixtures/workload.py +++ b/test_runner/fixtures/workload.py @@ -81,9 +81,13 @@ class Workload: return self._endpoint - def __del__(self): + def stop(self): if self._endpoint is not None: self._endpoint.stop() + self._endpoint = None + + def __del__(self): + self.stop() def stop(self): if self._endpoint is not None: diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index bca11bbbe7..bfaab9125f 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -10,11 +10,13 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, + S3Scrubber, StorageControllerApiException, last_flush_lsn_upload, tenant_get_shards, wait_for_last_flush_lsn, ) +from fixtures.pageserver.utils import assert_prefix_empty, assert_prefix_not_empty from fixtures.remote_storage import s3_storage from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.utils import wait_until @@ -69,6 +71,15 @@ def test_sharding_smoke( log.info(f"sizes = {sizes}") return sizes + # The imported initdb for timeline creation should + # not be fully imported on every shard. 
We use a 1MB strripe size so expect + # pretty good distribution: no one shard should have more than half the data + sizes = get_sizes() + physical_initdb_total = sum(sizes.values()) + expect_initdb_size = 20 * 1024 * 1024 + assert physical_initdb_total > expect_initdb_size + assert all(s < expect_initdb_size // 2 for s in sizes.values()) + # Test that timeline creation works on a sharded tenant timeline_b = env.neon_cli.create_branch("branch_b", tenant_id=tenant_id) @@ -101,6 +112,38 @@ def test_sharding_smoke( env.storage_controller.consistency_check() + # Validate that deleting a sharded tenant removes all files in the prefix + + # Before deleting, stop the client and check we have some objects to delete + workload.stop() + assert_prefix_not_empty( + neon_env_builder.pageserver_remote_storage, + prefix="/".join( + ( + "tenants", + str(tenant_id), + ) + ), + ) + + # Check the scrubber isn't confused by sharded content, then disable + # it during teardown because we'll have deleted by then + S3Scrubber(neon_env_builder).scan_metadata() + neon_env_builder.scrub_on_exit = False + + env.storage_controller.pageserver_api().tenant_delete(tenant_id) + assert_prefix_empty( + neon_env_builder.pageserver_remote_storage, + prefix="/".join( + ( + "tenants", + str(tenant_id), + ) + ), + ) + + env.storage_controller.consistency_check() + def test_sharding_split_unsharded( neon_env_builder: NeonEnvBuilder, From edcaae6290034db41a701f01fda7002001d663e8 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 5 Apr 2024 21:11:04 +0200 Subject: [PATCH 0534/1571] fixup: PR #7319 defined workload.py `def stop()` twice (#7333) Somehow it made it through CI. --- test_runner/fixtures/workload.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py index 364b8a1cf0..c44628ce06 100644 --- a/test_runner/fixtures/workload.py +++ b/test_runner/fixtures/workload.py @@ -89,11 +89,6 @@ class Workload: def __del__(self): self.stop() - def stop(self): - if self._endpoint is not None: - self._endpoint.stop() - self._endpoint = None - def init(self, pageserver_id: Optional[int] = None): endpoint = self.endpoint(pageserver_id) From 74b2314a5d6f7ce2baf2951962ec04136caa5111 Mon Sep 17 00:00:00 2001 From: John Spray Date: Sat, 6 Apr 2024 20:51:59 +0100 Subject: [PATCH 0535/1571] control_plane: revise compute_hook locking (don't serialise all calls) (#7088) ## Problem - Previously, an async mutex was held for the duration of `ComputeHook::notify`. This served multiple purposes: - Ensure updates to a given tenant are sent in the proper order - Prevent concurrent calls into neon_local endpoint updates in test environments (neon_local is not safe to call concurrently) - Protect the inner ComputeHook::state hashmap that is used to calculate when to send notifications. This worked, but had the major downside that while we're waiting for a compute hook request to the control plane to succeed, we can't notify about any other tenants. Notifications block progress of live migrations, so this is a problem. ## Summary of changes - Protect `ComputeHook::state` with a sync lock instead of an async lock - Use a separate async lock ( `ComputeHook::neon_local_lock` ) for preventing concurrent calls into neon_local, and only take this in the neon_local code path. - Add per-tenant async locks in ShardedComputeHookTenant, and use these to ensure that only one remote notification can be sent at once per tenant. 
If several shards update concurrently, their updates will be coalesced. - Add an explicit semaphore that limits concurrency of calls into the cloud control plane. --- storage_controller/src/compute_hook.rs | 277 ++++++++++++++++++------- 1 file changed, 197 insertions(+), 80 deletions(-) diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index 1a8dc6b86d..eb0c4472e4 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -1,3 +1,4 @@ +use std::sync::Arc; use std::{collections::HashMap, time::Duration}; use control_plane::endpoint::{ComputeControlPlane, EndpointStatus}; @@ -18,14 +19,26 @@ const SLOWDOWN_DELAY: Duration = Duration::from_secs(5); pub(crate) const API_CONCURRENCY: usize = 32; +struct UnshardedComputeHookTenant { + // Which node is this tenant attached to + node_id: NodeId, + + // Must hold this lock to send a notification. + send_lock: Arc>>, +} struct ShardedComputeHookTenant { stripe_size: ShardStripeSize, shard_count: ShardCount, shards: Vec<(ShardNumber, NodeId)>, + + // Must hold this lock to send a notification. The contents represent + // the last successfully sent notification, and are used to coalesce multiple + // updates by only sending when there is a chance since our last successful send. + send_lock: Arc>>, } enum ComputeHookTenant { - Unsharded(NodeId), + Unsharded(UnshardedComputeHookTenant), Sharded(ShardedComputeHookTenant), } @@ -37,9 +50,20 @@ impl ComputeHookTenant { shards: vec![(tenant_shard_id.shard_number, node_id)], stripe_size, shard_count: tenant_shard_id.shard_count, + send_lock: Arc::default(), }) } else { - Self::Unsharded(node_id) + Self::Unsharded(UnshardedComputeHookTenant { + node_id, + send_lock: Arc::default(), + }) + } + } + + fn get_send_lock(&self) -> &Arc>> { + match self { + Self::Unsharded(unsharded_tenant) => &unsharded_tenant.send_lock, + Self::Sharded(sharded_tenant) => &sharded_tenant.send_lock, } } @@ -52,8 +76,8 @@ impl ComputeHookTenant { node_id: NodeId, ) { match self { - Self::Unsharded(existing_node_id) if tenant_shard_id.shard_count.count() == 1 => { - *existing_node_id = node_id + Self::Unsharded(unsharded_tenant) if tenant_shard_id.shard_count.count() == 1 => { + unsharded_tenant.node_id = node_id } Self::Sharded(sharded_tenant) if sharded_tenant.stripe_size == stripe_size @@ -80,14 +104,14 @@ impl ComputeHookTenant { } } -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)] struct ComputeHookNotifyRequestShard { node_id: NodeId, shard_number: ShardNumber, } /// Request body that we send to the control plane to notify it of where a tenant is attached -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)] struct ComputeHookNotifyRequest { tenant_id: TenantId, stripe_size: Option, @@ -120,14 +144,44 @@ pub(crate) enum NotifyError { Fatal(StatusCode), } +enum MaybeSendResult { + // Please send this request while holding the lock, and if you succeed then write + // the request into the lock. 
+ Transmit( + ( + ComputeHookNotifyRequest, + tokio::sync::OwnedMutexGuard>, + ), + ), + // Something requires sending, but you must wait for a current sender then call again + AwaitLock(Arc>>), + // Nothing requires sending + Noop, +} + impl ComputeHookTenant { - fn maybe_reconfigure(&self, tenant_id: TenantId) -> Option { - match self { - Self::Unsharded(node_id) => Some(ComputeHookNotifyRequest { + fn maybe_send( + &self, + tenant_id: TenantId, + lock: Option>>, + ) -> MaybeSendResult { + let locked = match lock { + Some(already_locked) => already_locked, + None => { + // Lock order: this _must_ be only a try_lock, because we are called inside of the [`ComputeHook::state`] lock. + let Ok(locked) = self.get_send_lock().clone().try_lock_owned() else { + return MaybeSendResult::AwaitLock(self.get_send_lock().clone()); + }; + locked + } + }; + + let request = match self { + Self::Unsharded(unsharded_tenant) => Some(ComputeHookNotifyRequest { tenant_id, shards: vec![ComputeHookNotifyRequestShard { shard_number: ShardNumber(0), - node_id: *node_id, + node_id: unsharded_tenant.node_id, }], stripe_size: None, }), @@ -151,12 +205,25 @@ impl ComputeHookTenant { // Sharded tenant doesn't yet have information for all its shards tracing::info!( - "ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})", + "ComputeHookTenant::maybe_send: not enough shards ({}/{})", sharded_tenant.shards.len(), sharded_tenant.shard_count.count() ); None } + }; + + match request { + None => { + // Not yet ready to emit a notification + tracing::info!("Tenant isn't yet ready to emit a notification"); + MaybeSendResult::Noop + } + Some(request) if Some(&request) == locked.as_ref() => { + // No change from the last value successfully sent + MaybeSendResult::Noop + } + Some(request) => MaybeSendResult::Transmit((request, locked)), } } } @@ -166,8 +233,15 @@ impl ComputeHookTenant { /// the compute connection string. pub(super) struct ComputeHook { config: Config, - state: tokio::sync::Mutex>, + state: std::sync::Mutex>, authorization_header: Option, + + // Concurrency limiter, so that we do not overload the cloud control plane when updating + // large numbers of tenants (e.g. 
when failing over after a node failure) + api_concurrency: tokio::sync::Semaphore, + + // This lock is only used in testing enviroments, to serialize calls into neon_lock + neon_local_lock: tokio::sync::Mutex<()>, } impl ComputeHook { @@ -181,14 +255,20 @@ impl ComputeHook { state: Default::default(), config, authorization_header, + neon_local_lock: Default::default(), + api_concurrency: tokio::sync::Semaphore::new(API_CONCURRENCY), } } /// For test environments: use neon_local's LocalEnv to update compute async fn do_notify_local( &self, - reconfigure_request: ComputeHookNotifyRequest, + reconfigure_request: &ComputeHookNotifyRequest, ) -> anyhow::Result<()> { + // neon_local updates are not safe to call concurrently, use a lock to serialize + // all calls to this function + let _locked = self.neon_local_lock.lock().await; + let env = match LocalEnv::load_config() { Ok(e) => e, Err(e) => { @@ -205,7 +285,7 @@ impl ComputeHook { } = reconfigure_request; let compute_pageservers = shards - .into_iter() + .iter() .map(|shard| { let ps_conf = env .get_pageserver_conf(shard.node_id) @@ -217,10 +297,10 @@ impl ComputeHook { .collect::>(); for (endpoint_name, endpoint) in &cplane.endpoints { - if endpoint.tenant_id == tenant_id && endpoint.status() == EndpointStatus::Running { + if endpoint.tenant_id == *tenant_id && endpoint.status() == EndpointStatus::Running { tracing::info!("Reconfiguring endpoint {}", endpoint_name,); endpoint - .reconfigure(compute_pageservers.clone(), stripe_size) + .reconfigure(compute_pageservers.clone(), *stripe_size) .await?; } } @@ -298,12 +378,23 @@ impl ComputeHook { async fn do_notify( &self, url: &String, - reconfigure_request: ComputeHookNotifyRequest, + reconfigure_request: &ComputeHookNotifyRequest, cancel: &CancellationToken, ) -> Result<(), NotifyError> { let client = reqwest::Client::new(); + + // We hold these semaphore units across all retries, rather than only across each + // HTTP request: this is to preserve fairness and avoid a situation where a retry might + // time out waiting for a semaphore. + let _units = self + .api_concurrency + .acquire() + .await + // Interpret closed semaphore as shutdown + .map_err(|_| NotifyError::ShuttingDown)?; + backoff::retry( - || self.do_notify_iteration(&client, url, &reconfigure_request, cancel), + || self.do_notify_iteration(&client, url, reconfigure_request, cancel), |e| { matches!( e, @@ -343,42 +434,70 @@ impl ComputeHook { stripe_size: ShardStripeSize, cancel: &CancellationToken, ) -> Result<(), NotifyError> { - let mut locked = self.state.lock().await; + let maybe_send_result = { + let mut state_locked = self.state.lock().unwrap(); - use std::collections::hash_map::Entry; - let tenant = match locked.entry(tenant_shard_id.tenant_id) { - Entry::Vacant(e) => e.insert(ComputeHookTenant::new( - tenant_shard_id, - stripe_size, - node_id, - )), - Entry::Occupied(e) => { - let tenant = e.into_mut(); - tenant.update(tenant_shard_id, stripe_size, node_id); - tenant + use std::collections::hash_map::Entry; + let tenant = match state_locked.entry(tenant_shard_id.tenant_id) { + Entry::Vacant(e) => e.insert(ComputeHookTenant::new( + tenant_shard_id, + stripe_size, + node_id, + )), + Entry::Occupied(e) => { + let tenant = e.into_mut(); + tenant.update(tenant_shard_id, stripe_size, node_id); + tenant + } + }; + tenant.maybe_send(tenant_shard_id.tenant_id, None) + }; + + // Process result: we may get an update to send, or we may have to wait for a lock + // before trying again. 
+ let (request, mut send_lock_guard) = match maybe_send_result { + MaybeSendResult::Noop => { + return Ok(()); } + MaybeSendResult::AwaitLock(send_lock) => { + let send_locked = send_lock.lock_owned().await; + + // Lock order: maybe_send is called within the `[Self::state]` lock, and takes the send lock, but here + // we have acquired the send lock and take `[Self::state]` lock. This is safe because maybe_send only uses + // try_lock. + let state_locked = self.state.lock().unwrap(); + let Some(tenant) = state_locked.get(&tenant_shard_id.tenant_id) else { + return Ok(()); + }; + match tenant.maybe_send(tenant_shard_id.tenant_id, Some(send_locked)) { + MaybeSendResult::AwaitLock(_) => { + unreachable!("We supplied lock guard") + } + MaybeSendResult::Noop => { + return Ok(()); + } + MaybeSendResult::Transmit((request, lock)) => (request, lock), + } + } + MaybeSendResult::Transmit((request, lock)) => (request, lock), }; - let reconfigure_request = tenant.maybe_reconfigure(tenant_shard_id.tenant_id); - let Some(reconfigure_request) = reconfigure_request else { - // The tenant doesn't yet have pageservers for all its shards: we won't notify anything - // until it does. - tracing::info!("Tenant isn't yet ready to emit a notification"); - return Ok(()); - }; - - if let Some(notify_url) = &self.config.compute_hook_url { - self.do_notify(notify_url, reconfigure_request, cancel) - .await + let result = if let Some(notify_url) = &self.config.compute_hook_url { + self.do_notify(notify_url, &request, cancel).await } else { - self.do_notify_local(reconfigure_request) - .await - .map_err(|e| { - // This path is for testing only, so munge the error into our prod-style error type. - tracing::error!("Local notification hook failed: {e}"); - NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR) - }) + self.do_notify_local(&request).await.map_err(|e| { + // This path is for testing only, so munge the error into our prod-style error type. + tracing::error!("Local notification hook failed: {e}"); + NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR) + }) + }; + + if result.is_ok() { + // Before dropping the send lock, stash the request we just sent so that + // subsequent callers can avoid redundantly re-sending the same thing. + *send_lock_guard = Some(request); } + result } } @@ -402,21 +521,22 @@ pub(crate) mod tests { NodeId(1), ); - // An unsharded tenant is always ready to emit a notification - assert!(tenant_state.maybe_reconfigure(tenant_id).is_some()); - assert_eq!( - tenant_state - .maybe_reconfigure(tenant_id) - .unwrap() - .shards - .len(), - 1 - ); - assert!(tenant_state - .maybe_reconfigure(tenant_id) - .unwrap() - .stripe_size - .is_none()); + // An unsharded tenant is always ready to emit a notification, but won't + // send the same one twice + let send_result = tenant_state.maybe_send(tenant_id, None); + let MaybeSendResult::Transmit((request, mut guard)) = send_result else { + anyhow::bail!("Wrong send result"); + }; + assert_eq!(request.shards.len(), 1); + assert!(request.stripe_size.is_none()); + + // Simulate successful send + *guard = Some(request); + drop(guard); + + // Try asking again: this should be a no-op + let send_result = tenant_state.maybe_send(tenant_id, None); + assert!(matches!(send_result, MaybeSendResult::Noop)); // Writing the first shard of a multi-sharded situation (i.e. 
in a split) // resets the tenant state and puts it in an non-notifying state (need to @@ -430,7 +550,10 @@ pub(crate) mod tests { ShardStripeSize(32768), NodeId(1), ); - assert!(tenant_state.maybe_reconfigure(tenant_id).is_none()); + assert!(matches!( + tenant_state.maybe_send(tenant_id, None), + MaybeSendResult::Noop + )); // Writing the second shard makes it ready to notify tenant_state.update( @@ -443,22 +566,16 @@ pub(crate) mod tests { NodeId(1), ); - assert!(tenant_state.maybe_reconfigure(tenant_id).is_some()); - assert_eq!( - tenant_state - .maybe_reconfigure(tenant_id) - .unwrap() - .shards - .len(), - 2 - ); - assert_eq!( - tenant_state - .maybe_reconfigure(tenant_id) - .unwrap() - .stripe_size, - Some(ShardStripeSize(32768)) - ); + let send_result = tenant_state.maybe_send(tenant_id, None); + let MaybeSendResult::Transmit((request, mut guard)) = send_result else { + anyhow::bail!("Wrong send result"); + }; + assert_eq!(request.shards.len(), 2); + assert_eq!(request.stripe_size, Some(ShardStripeSize(32768))); + + // Simulate successful send + *guard = Some(request); + drop(guard); Ok(()) } From 0788760451619d408cf1550e47e722dc2f794c46 Mon Sep 17 00:00:00 2001 From: John Spray Date: Sun, 7 Apr 2024 22:21:18 +0100 Subject: [PATCH 0536/1571] tests: further stabilize test_deletion_queue_recovery (#7335) This is the other main failure mode called out in #6092 , that the test can shut down the pageserver while it has "future layers" in the index, and that this results in unexpected stats after restart. We can avoid this nondeterminism by shutting down the endpoint, flushing everything from SK to PS, checkpointing, and then waiting for that final LSN to be uploaded. This is more heavyweight than most of our tests require, but useful in the case of tests that expect a particular behavior after restart wrt layer deletions. --- test_runner/regress/test_pageserver_generations.py | 13 +++++++++++++ test_runner/regress/test_storage_controller.py | 9 ++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 7020a61b2f..67f68a62af 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -22,6 +22,7 @@ from fixtures.neon_fixtures import ( NeonPageserver, PgBin, S3Scrubber, + flush_ep_to_pageserver, last_flush_lsn_upload, ) from fixtures.pageserver.http import PageserverApiException @@ -30,6 +31,7 @@ from fixtures.pageserver.utils import ( list_prefix, wait_for_last_record_lsn, wait_for_upload, + wait_for_upload_queue_empty, ) from fixtures.remote_storage import ( RemoteStorageKind, @@ -120,6 +122,17 @@ def generate_uploads_and_deletions( print_gc_result(gc_result) assert gc_result["layers_removed"] > 0 + # Stop endpoint and flush all data to pageserver, then checkpoint it: this + # ensures that the pageserver is in a fully idle state: there will be no more + # background ingest, no more uploads pending, and therefore no non-determinism + # in subsequent actions like pageserver restarts. 
+ final_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id, pageserver.id) + ps_http.timeline_checkpoint(tenant_id, timeline_id) + # Finish uploads + wait_for_upload(ps_http, tenant_id, timeline_id, final_lsn) + # Finish all remote writes (including deletions) + wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id) + def read_all( env: NeonEnv, tenant_id: Optional[TenantId] = None, timeline_id: Optional[TimelineId] = None diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 405aa22831..840f354142 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -1187,7 +1187,14 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder): storcon_cli(["node-configure", "--node-id", "1", "--scheduling", "pause"]) assert "Pause" in storcon_cli(["nodes"])[3] - # Make a node offline + # We will simulate a node death and then marking it offline + env.pageservers[0].stop(immediate=True) + # Sleep to make it unlikely that the controller's heartbeater will race handling + # a /utilization response internally, such that it marks the node back online. IRL + # there would always be a longer delay than this before a node failing and a human + # intervening. + time.sleep(2) + storcon_cli(["node-configure", "--node-id", "1", "--availability", "offline"]) assert "Offline" in storcon_cli(["nodes"])[3] From 21b3e1d13b33765bbb1832c0e6894ef6c340a301 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 8 Apr 2024 09:01:38 +0300 Subject: [PATCH 0537/1571] fix(utilization): return used as does df (#7337) We can currently underflow `pageserver_resident_physical_size_global`, so the used disk bytes would show `u63::MAX` by mistake. The assumption of the API (and the documented behavior) was to give the layer files disk usage. Switch to reporting numbers that match `df` output. Fixes: #7336 --- pageserver/src/http/openapi_spec.yml | 2 +- pageserver/src/utilization.rs | 16 ++++++++++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index bb477f89c5..2713309824 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -1629,7 +1629,7 @@ components: type: integer format: int64 minimum: 0 - description: The amount of disk space currently utilized by layer files. + description: The amount of disk space currently used. 
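An aside on the accounting switch just above: matching `df` means computing used space from the filesystem's own block counters instead of from a pageserver metric. A minimal sketch of that computation, assuming the `nix` crate's statvfs wrapper with its `fs` feature (the same calls the patched `utilization.rs` makes); this is illustrative only, not the patch code:

```rust
use std::path::Path;

use nix::sys::statvfs::statvfs;

/// Hedged sketch: report (used, free) bytes the way `df` does.
fn disk_usage(path: &Path) -> nix::Result<(u64, u64)> {
    let vfs = statvfs(path)?;
    // f_frsize is the unit the block counters are expressed in; fall back to
    // f_bsize if the filesystem reports 0 for it.
    let blocksz: u64 = if vfs.fragment_size() > 0 {
        vfs.fragment_size() as u64
    } else {
        vfs.block_size() as u64
    };
    // used = total blocks minus free blocks (not "available", to match df).
    let used = vfs.blocks().saturating_sub(vfs.blocks_free()) as u64 * blocksz;
    let free = vfs.blocks_available() as u64 * blocksz;
    Ok((used, free))
}

fn main() -> nix::Result<()> {
    let (used, free) = disk_usage(Path::new("/"))?;
    println!("used={used} free={free}");
    Ok(())
}
```

`df` subtracts free blocks rather than available blocks, which is why the patch uses `blocks_free` and reserves `blocks_available` for the free-space figure.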
free_space_bytes: type: integer format: int64 diff --git a/pageserver/src/utilization.rs b/pageserver/src/utilization.rs index 830c9897ca..5eccf185ac 100644 --- a/pageserver/src/utilization.rs +++ b/pageserver/src/utilization.rs @@ -15,11 +15,23 @@ pub(crate) fn regenerate(tenants_path: &Path) -> anyhow::Result 0 { + statvfs.fragment_size() + } else { + statvfs.block_size() + }; #[cfg_attr(not(target_os = "macos"), allow(clippy::unnecessary_cast))] let free = statvfs.blocks_available() as u64 * blocksz; - let used = crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.get(); + + #[cfg_attr(not(target_os = "macos"), allow(clippy::unnecessary_cast))] + let used = statvfs + .blocks() + // use blocks_free instead of available here to match df in case someone compares + .saturating_sub(statvfs.blocks_free()) as u64 + * blocksz; + let captured_at = std::time::SystemTime::now(); let doc = PageserverUtilization { From 2d3c9f0d43758fbd3da8d4a1dc5d039545b39ef9 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 8 Apr 2024 11:35:32 +0200 Subject: [PATCH 0538/1571] refactor(pageserver): use tokio::signal instead of spawn_blocking (#7332) It's just unnecessary to use spawn_blocking there, and with https://github.com/neondatabase/neon/pull/7331 , it will result in really just one executor thread when enabling one-runtime with current_thread executor. --- pageserver/src/bin/pageserver.rs | 66 +++++++++++++++----------------- 1 file changed, 31 insertions(+), 35 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index c80230d4d7..0903b206ff 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -18,6 +18,7 @@ use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING}; use pageserver::task_mgr::WALRECEIVER_RUNTIME; use pageserver::tenant::{secondary, TenantSharedResources}; use remote_storage::GenericRemoteStorage; +use tokio::signal::unix::SignalKind; use tokio::time::Instant; use tracing::*; @@ -671,42 +672,37 @@ fn start_pageserver( let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard()); // All started up! Now just sit and wait for shutdown signal. - { - use signal_hook::consts::*; - let signal_handler = BACKGROUND_RUNTIME.spawn_blocking(move || { - let mut signals = - signal_hook::iterator::Signals::new([SIGINT, SIGTERM, SIGQUIT]).unwrap(); - return signals - .forever() - .next() - .expect("forever() never returns None unless explicitly closed"); - }); - let signal = BACKGROUND_RUNTIME - .block_on(signal_handler) - .expect("join error"); - match signal { - SIGQUIT => { - info!("Got signal {signal}. Terminating in immediate shutdown mode",); - std::process::exit(111); - } - SIGINT | SIGTERM => { - info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",); - // This cancels the `shutdown_pageserver` cancellation tree. - // Right now that tree doesn't reach very far, and `task_mgr` is used instead. - // The plan is to change that over time. 
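As an aside, the tokio-native pattern this patch moves to can be exercised standalone. A minimal sketch (not the patch code, and assuming tokio's `rt`, `macros` and `signal` features):

```rust
use tokio::signal::unix::{signal, SignalKind};

// Wait for the first of several Unix signals directly on the async runtime,
// with no spawn_blocking and no dedicated signal-handling thread.
#[tokio::main(flavor = "current_thread")]
async fn main() -> std::io::Result<()> {
    let mut sigint = signal(SignalKind::interrupt())?;
    let mut sigterm = signal(SignalKind::terminate())?;
    let mut sigquit = signal(SignalKind::quit())?;
    let which = tokio::select! {
        _ = sigint.recv() => "SIGINT",
        _ = sigterm.recv() => "SIGTERM",
        _ = sigquit.recv() => "SIGQUIT",
    };
    println!("got {which}, shutting down");
    Ok(())
}
```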
- shutdown_pageserver.take(); - let bg_remote_storage = remote_storage.clone(); - let bg_deletion_queue = deletion_queue.clone(); - BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver( - &tenant_manager, - bg_remote_storage.map(|_| bg_deletion_queue), - 0, - )); - unreachable!() - } - _ => unreachable!(), - } + { + BACKGROUND_RUNTIME.block_on(async move { + let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt()).unwrap(); + let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate()).unwrap(); + let mut sigquit = tokio::signal::unix::signal(SignalKind::quit()).unwrap(); + let signal = tokio::select! { + _ = sigquit.recv() => { + info!("Got signal SIGQUIT. Terminating in immediate shutdown mode",); + std::process::exit(111); + } + _ = sigint.recv() => { "SIGINT" }, + _ = sigterm.recv() => { "SIGTERM" }, + }; + + info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",); + + // This cancels the `shutdown_pageserver` cancellation tree. + // Right now that tree doesn't reach very far, and `task_mgr` is used instead. + // The plan is to change that over time. + shutdown_pageserver.take(); + let bg_remote_storage = remote_storage.clone(); + let bg_deletion_queue = deletion_queue.clone(); + pageserver::shutdown_pageserver( + &tenant_manager, + bg_remote_storage.map(|_| bg_deletion_queue), + 0, + ) + .await; + unreachable!() + }) } } From 47b705cffe0e13182ec41df8da518f310444c8d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 8 Apr 2024 14:59:08 +0200 Subject: [PATCH 0539/1571] Remove async_trait from CompactionDeltaLayer (#7342) Removes usage of async_trait from the `CompactionDeltaLayer` trait. Split off from #7301 Related earlier work: https://github.com/neondatabase/neon/pull/6305, https://github.com/neondatabase/neon/pull/6464, https://github.com/neondatabase/neon/pull/7303 --- Cargo.lock | 1 - pageserver/compaction/Cargo.toml | 1 - pageserver/compaction/src/helpers.rs | 2 +- pageserver/compaction/src/interface.rs | 7 ++----- pageserver/compaction/src/simulator.rs | 2 -- pageserver/src/tenant/timeline/compaction.rs | 2 -- 6 files changed, 3 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index dae406e4ae..67054cf2c7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3616,7 +3616,6 @@ dependencies = [ "anyhow", "async-compression", "async-stream", - "async-trait", "byteorder", "bytes", "chrono", diff --git a/pageserver/compaction/Cargo.toml b/pageserver/compaction/Cargo.toml index 47f318db63..0fd1d81845 100644 --- a/pageserver/compaction/Cargo.toml +++ b/pageserver/compaction/Cargo.toml @@ -11,7 +11,6 @@ default = [] anyhow.workspace = true async-compression.workspace = true async-stream.workspace = true -async-trait.workspace = true byteorder.workspace = true bytes.workspace = true chrono = { workspace = true, features = ["serde"] } diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs index 22a410b4af..9de6363d6e 100644 --- a/pageserver/compaction/src/helpers.rs +++ b/pageserver/compaction/src/helpers.rs @@ -180,7 +180,7 @@ where match top.deref_mut() { LazyLoadLayer::Unloaded(ref mut l) => { let fut = l.load_keys(this.ctx); - this.load_future.set(Some(fut)); + this.load_future.set(Some(Box::pin(fut))); continue; } LazyLoadLayer::Loaded(ref mut entries) => { diff --git a/pageserver/compaction/src/interface.rs b/pageserver/compaction/src/interface.rs index 2bb2e749c0..5dc62e506f 100644 --- a/pageserver/compaction/src/interface.rs +++ 
b/pageserver/compaction/src/interface.rs @@ -3,7 +3,6 @@ //! //! All the heavy lifting is done by the create_image and create_delta //! functions that the implementor provides. -use async_trait::async_trait; use futures::Future; use pageserver_api::{key::Key, keyspace::key_range_size}; use std::ops::Range; @@ -141,18 +140,16 @@ pub trait CompactionLayer { fn is_delta(&self) -> bool; } - -#[async_trait] pub trait CompactionDeltaLayer: CompactionLayer { type DeltaEntry<'a>: CompactionDeltaEntry<'a, E::Key> where Self: 'a; /// Return all keys in this delta layer. - async fn load_keys<'a>( + fn load_keys<'a>( &self, ctx: &E::RequestContext, - ) -> anyhow::Result>>; + ) -> impl Future>>> + Send; } pub trait CompactionImageLayer: CompactionLayer {} diff --git a/pageserver/compaction/src/simulator.rs b/pageserver/compaction/src/simulator.rs index def7983e75..6c00df3a65 100644 --- a/pageserver/compaction/src/simulator.rs +++ b/pageserver/compaction/src/simulator.rs @@ -2,7 +2,6 @@ mod draw; use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp}; -use async_trait::async_trait; use futures::StreamExt; use rand::Rng; use tracing::info; @@ -139,7 +138,6 @@ impl interface::CompactionLayer for Arc { } } -#[async_trait] impl interface::CompactionDeltaLayer for Arc { type DeltaEntry<'a> = MockRecord; diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index ab001bf10d..8075775bbc 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -12,7 +12,6 @@ use super::layer_manager::LayerManager; use super::{CompactFlags, DurationRecorder, RecordedDuration, Timeline}; use anyhow::{anyhow, Context}; -use async_trait::async_trait; use enumset::EnumSet; use fail::fail_point; use itertools::Itertools; @@ -1122,7 +1121,6 @@ impl CompactionLayer for ResidentDeltaLayer { } } -#[async_trait] impl CompactionDeltaLayer for ResidentDeltaLayer { type DeltaEntry<'a> = DeltaEntry<'a>; From 1081a4d2462d324961604b9114def1efea096f44 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 8 Apr 2024 16:27:08 +0200 Subject: [PATCH 0540/1571] pageserver: option to run with just one tokio runtime (#7331) This PR is an off-by-default revision v2 of the (since-reverted) PR #6555 / commit `3220f830b7fbb785d6db8a93775f46314f10a99b`. See that PR for details on why running with a single runtime is desirable and why we should be ready. We reverted #6555 because it showed regressions in prodlike cloudbench, see the revert commit message `ad072de4209193fd21314cf7f03f14df4fa55eb1` for more context. This PR makes it an opt-in choice via an env var. The default is to use the 4 separate runtimes that we have today, there shouldn't be any performance change. I tested manually that the env var & added metric works. ``` # undefined env var => no change to before this PR, uses 4 runtimes ./target/debug/neon_local start # defining the env var enables one-runtime mode, value defines that one runtime's configuration NEON_PAGESERVER_USE_ONE_RUNTIME=current_thread ./target/debug/neon_local start NEON_PAGESERVER_USE_ONE_RUNTIME=multi_thread:1 ./target/debug/neon_local start NEON_PAGESERVER_USE_ONE_RUNTIME=multi_thread:2 ./target/debug/neon_local start NEON_PAGESERVER_USE_ONE_RUNTIME=multi_thread:default ./target/debug/neon_local start ``` I want to use this change to do more manualy testing and potentially testing in staging. Future Work ----------- Testing / deployment ergonomics would be better if this were a variable in `pageserver.toml`. 
It can be done, but, I don't need it right now, so let's stick with the env var. --- control_plane/src/background_process.rs | 14 ++- libs/utils/src/env.rs | 21 ++++ libs/utils/src/lib.rs | 2 + pageserver/src/metrics.rs | 21 ++++ pageserver/src/task_mgr.rs | 149 +++++++++++++++++------- pageserver/src/tenant/tasks.rs | 3 +- 6 files changed, 169 insertions(+), 41 deletions(-) create mode 100644 libs/utils/src/env.rs diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index 2fced7d778..94666f2870 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -86,7 +86,10 @@ where .stdout(process_log_file) .stderr(same_file_for_stderr) .args(args); - let filled_cmd = fill_remote_storage_secrets_vars(fill_rust_env_vars(background_command)); + + let filled_cmd = fill_env_vars_prefixed_neon(fill_remote_storage_secrets_vars( + fill_rust_env_vars(background_command), + )); filled_cmd.envs(envs); let pid_file_to_check = match &initial_pid_file { @@ -268,6 +271,15 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command { cmd } +fn fill_env_vars_prefixed_neon(mut cmd: &mut Command) -> &mut Command { + for (var, val) in std::env::vars() { + if var.starts_with("NEON_PAGESERVER_") { + cmd = cmd.env(var, val); + } + } + cmd +} + /// Add a `pre_exec` to the cmd that, inbetween fork() and exec(), /// 1. Claims a pidfile with a fcntl lock on it and /// 2. Sets up the pidfile's file descriptor so that it (and the lock) diff --git a/libs/utils/src/env.rs b/libs/utils/src/env.rs new file mode 100644 index 0000000000..b3e326bfd0 --- /dev/null +++ b/libs/utils/src/env.rs @@ -0,0 +1,21 @@ +//! Wrapper around `std::env::var` for parsing environment variables. + +use std::{fmt::Display, str::FromStr}; + +pub fn var(varname: &str) -> Option +where + V: FromStr, + E: Display, +{ + match std::env::var(varname) { + Ok(s) => Some( + s.parse() + .map_err(|e| format!("failed to parse env var {varname}: {e:#}")) + .unwrap(), + ), + Err(std::env::VarError::NotPresent) => None, + Err(std::env::VarError::NotUnicode(_)) => { + panic!("env var {varname} is not unicode") + } + } +} diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 04ce0626c8..cd5075613e 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -89,6 +89,8 @@ pub mod yielding_loop; pub mod zstd; +pub mod env; + /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages /// /// we have several cases: diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index ab9a2e8509..3160f204e2 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -2100,6 +2100,7 @@ pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) { use futures::Future; use pin_project_lite::pin_project; use std::collections::HashMap; +use std::num::NonZeroUsize; use std::pin::Pin; use std::sync::{Arc, Mutex}; use std::task::{Context, Poll}; @@ -2669,6 +2670,26 @@ pub(crate) mod disk_usage_based_eviction { pub(crate) static METRICS: Lazy = Lazy::new(Metrics::default); } +static TOKIO_EXECUTOR_THREAD_COUNT: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_tokio_executor_thread_configured_count", + "Total number of configued tokio executor threads in the process. 
+ The `setup` label denotes whether we're running with multiple runtimes or a single runtime.", + &["setup"], + ) + .unwrap() +}); + +pub(crate) fn set_tokio_runtime_setup(setup: &str, num_threads: NonZeroUsize) { + static SERIALIZE: std::sync::Mutex<()> = std::sync::Mutex::new(()); + let _guard = SERIALIZE.lock().unwrap(); + TOKIO_EXECUTOR_THREAD_COUNT.reset(); + TOKIO_EXECUTOR_THREAD_COUNT + .get_metric_with_label_values(&[setup]) + .unwrap() + .set(u64::try_from(num_threads.get()).unwrap()); +} + pub fn preinitialize_metrics() { // Python tests need these and on some we do alerting. // diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 0cc5611a12..9a1e354ecf 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -33,13 +33,14 @@ use std::collections::HashMap; use std::fmt; use std::future::Future; +use std::num::NonZeroUsize; use std::panic::AssertUnwindSafe; +use std::str::FromStr; use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Arc, Mutex}; use futures::FutureExt; use pageserver_api::shard::TenantShardId; -use tokio::runtime::Runtime; use tokio::task::JoinHandle; use tokio::task_local; use tokio_util::sync::CancellationToken; @@ -48,8 +49,11 @@ use tracing::{debug, error, info, warn}; use once_cell::sync::Lazy; +use utils::env; use utils::id::TimelineId; +use crate::metrics::set_tokio_runtime_setup; + // // There are four runtimes: // @@ -98,52 +102,119 @@ use utils::id::TimelineId; // other operations, if the upload tasks e.g. get blocked on locks. It shouldn't // happen, but still. // -pub static COMPUTE_REQUEST_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("compute request worker") - .enable_all() - .build() - .expect("Failed to create compute request runtime") -}); -pub static MGMT_REQUEST_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("mgmt request worker") - .enable_all() - .build() - .expect("Failed to create mgmt request runtime") -}); - -pub static WALRECEIVER_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("walreceiver worker") - .enable_all() - .build() - .expect("Failed to create walreceiver runtime") -}); - -pub static BACKGROUND_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("background op worker") - // if you change the number of worker threads please change the constant below - .enable_all() - .build() - .expect("Failed to create background op runtime") -}); - -pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy = Lazy::new(|| { - // force init and thus panics - let _ = BACKGROUND_RUNTIME.handle(); +pub(crate) static TOKIO_WORKER_THREADS: Lazy = Lazy::new(|| { // replicates tokio-1.28.1::loom::sys::num_cpus which is not available publicly // tokio would had already panicked for parsing errors or NotUnicode // // this will be wrong if any of the runtimes gets their worker threads configured to something // else, but that has not been needed in a long time. 
- std::env::var("TOKIO_WORKER_THREADS") - .map(|s| s.parse::().unwrap()) - .unwrap_or_else(|_e| usize::max(2, num_cpus::get())) + NonZeroUsize::new( + std::env::var("TOKIO_WORKER_THREADS") + .map(|s| s.parse::().unwrap()) + .unwrap_or_else(|_e| usize::max(2, num_cpus::get())), + ) + .expect("the max() ensures that this is not zero") }); +enum TokioRuntimeMode { + SingleThreaded, + MultiThreaded { num_workers: NonZeroUsize }, +} + +impl FromStr for TokioRuntimeMode { + type Err = String; + + fn from_str(s: &str) -> Result { + match s { + "current_thread" => Ok(TokioRuntimeMode::SingleThreaded), + s => match s.strip_prefix("multi_thread:") { + Some("default") => Ok(TokioRuntimeMode::MultiThreaded { + num_workers: *TOKIO_WORKER_THREADS, + }), + Some(suffix) => { + let num_workers = suffix.parse::().map_err(|e| { + format!( + "invalid number of multi-threaded runtime workers ({suffix:?}): {e}", + ) + })?; + Ok(TokioRuntimeMode::MultiThreaded { num_workers }) + } + None => Err(format!("invalid runtime config: {s:?}")), + }, + } + } +} + +static ONE_RUNTIME: Lazy> = Lazy::new(|| { + let thread_name = "pageserver-tokio"; + let Some(mode) = env::var("NEON_PAGESERVER_USE_ONE_RUNTIME") else { + // If the env var is not set, leave this static as None. + set_tokio_runtime_setup( + "multiple-runtimes", + NUM_MULTIPLE_RUNTIMES + .checked_mul(*TOKIO_WORKER_THREADS) + .unwrap(), + ); + return None; + }; + Some(match mode { + TokioRuntimeMode::SingleThreaded => { + set_tokio_runtime_setup("one-runtime-single-threaded", NonZeroUsize::new(1).unwrap()); + tokio::runtime::Builder::new_current_thread() + .thread_name(thread_name) + .enable_all() + .build() + .expect("failed to create one single runtime") + } + TokioRuntimeMode::MultiThreaded { num_workers } => { + set_tokio_runtime_setup("one-runtime-multi-threaded", num_workers); + tokio::runtime::Builder::new_multi_thread() + .thread_name(thread_name) + .enable_all() + .worker_threads(num_workers.get()) + .build() + .expect("failed to create one multi-threaded runtime") + } + }) +}); + +/// Declare a lazy static variable named `$varname` that will resolve +/// to a tokio runtime handle. If the env var `NEON_PAGESERVER_USE_ONE_RUNTIME` +/// is set, this will resolve to `ONE_RUNTIME`. Otherwise, the macro invocation +/// declares a separate runtime and the lazy static variable `$varname` +/// will resolve to that separate runtime. +/// +/// The result is is that `$varname.spawn()` will use `ONE_RUNTIME` if +/// `NEON_PAGESERVER_USE_ONE_RUNTIME` is set, and will use the separate runtime +/// otherwise. +macro_rules! pageserver_runtime { + ($varname:ident, $name:literal) => { + pub static $varname: Lazy<&'static tokio::runtime::Runtime> = Lazy::new(|| { + if let Some(runtime) = &*ONE_RUNTIME { + return runtime; + } + static RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name($name) + .worker_threads(TOKIO_WORKER_THREADS.get()) + .enable_all() + .build() + .expect(std::concat!("Failed to create runtime ", $name)) + }); + &*RUNTIME + }); + }; +} + +pageserver_runtime!(COMPUTE_REQUEST_RUNTIME, "compute request worker"); +pageserver_runtime!(MGMT_REQUEST_RUNTIME, "mgmt request worker"); +pageserver_runtime!(WALRECEIVER_RUNTIME, "walreceiver worker"); +pageserver_runtime!(BACKGROUND_RUNTIME, "background op worker"); +// Bump this number when adding a new pageserver_runtime! 
+// SAFETY: it's obviously correct +const NUM_MULTIPLE_RUNTIMES: NonZeroUsize = unsafe { NonZeroUsize::new_unchecked(4) }; + #[derive(Debug, Clone, Copy)] pub struct PageserverTaskId(u64); diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index e4f5f75132..74ed677ffe 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -18,7 +18,7 @@ use utils::{backoff, completion}; static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy = once_cell::sync::Lazy::new(|| { - let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS; + let total_threads = task_mgr::TOKIO_WORKER_THREADS.get(); let permits = usize::max( 1, // while a lot of the work is done on spawn_blocking, we still do @@ -72,6 +72,7 @@ pub(crate) async fn concurrent_background_tasks_rate_limit_permit( loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation ); + // TODO: assert that we run on BACKGROUND_RUNTIME; requires tokio_unstable Handle::id(); match CONCURRENT_BACKGROUND_TASKS.acquire().await { Ok(permit) => permit, Err(_closed) => unreachable!("we never close the semaphore"), From a306d0a54b0e579556893c0344a27664e39e54a1 Mon Sep 17 00:00:00 2001 From: Kevin Mingtarja <69668484+kevinmingtarja@users.noreply.github.com> Date: Mon, 8 Apr 2024 22:53:07 +0800 Subject: [PATCH 0541/1571] implement Serialize/Deserialize for SystemTime with RFC3339 format (#7203) ## Problem We have two places that use a helper (`ser_rfc3339_millis`) to get serde to stringify SystemTimes into the desired format. ## Summary of changes Created a new module `utils::serde_system_time` and inside it a wrapper type `SystemTime` for `std::time::SystemTime` that serializes/deserializes to the RFC3339 format. This new type is then used in the two places that were previously using the helper for serialization, thereby eliminating the need to decorate structs. Closes #7151. --- Cargo.lock | 1 + libs/pageserver_api/src/models.rs | 30 +--------- libs/pageserver_api/src/models/utilization.rs | 25 ++------- libs/utils/Cargo.toml | 1 + libs/utils/src/lib.rs | 1 + libs/utils/src/serde_system_time.rs | 55 +++++++++++++++++++ pageserver/src/tenant/secondary/downloader.rs | 4 +- pageserver/src/utilization.rs | 2 +- 8 files changed, 67 insertions(+), 52 deletions(-) create mode 100644 libs/utils/src/serde_system_time.rs diff --git a/Cargo.lock b/Cargo.lock index 67054cf2c7..66ff3dedb7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6569,6 +6569,7 @@ dependencies = [ "heapless", "hex", "hex-literal", + "humantime", "hyper", "jsonwebtoken", "leaky-bucket", diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index ad4ca6710d..b4909f247f 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -20,6 +20,7 @@ use utils::{ history_buffer::HistoryBufferWithDropCounter, id::{NodeId, TenantId, TimelineId}, lsn::Lsn, + serde_system_time, }; use crate::controller_api::PlacementPolicy; @@ -758,11 +759,7 @@ pub struct WalRedoManagerStatus { #[derive(Default, Debug, Serialize, Deserialize, Clone)] pub struct SecondaryProgress { /// The remote storage LastModified time of the heatmap object we last downloaded. 
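As context for the field change below: both the old serde helpers and the new wrapper delegate to `humantime`, so the wire format stays a millisecond-precision RFC3339 string. A small sketch of that round trip, assuming only the `humantime` crate that this patch adds to `utils`:

```rust
use std::time::{Duration, SystemTime};

fn main() {
    // The same epoch value the existing utilization test uses.
    let t = SystemTime::UNIX_EPOCH + Duration::from_secs(1_708_509_779);
    // Renders as "2024-02-21T10:02:59.000Z".
    let s = humantime::format_rfc3339_millis(t).to_string();
    // Parsing recovers the same instant; only sub-millisecond precision is lost.
    let parsed = humantime::parse_rfc3339(&s).unwrap();
    assert_eq!(parsed, t);
    println!("{s}");
}
```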
- #[serde( - serialize_with = "opt_ser_rfc3339_millis", - deserialize_with = "opt_deser_rfc3339_millis" - )] - pub heatmap_mtime: Option, + pub heatmap_mtime: Option, /// The number of layers currently on-disk pub layers_downloaded: usize, @@ -775,29 +772,6 @@ pub struct SecondaryProgress { pub bytes_total: u64, } -fn opt_ser_rfc3339_millis( - ts: &Option, - serializer: S, -) -> Result { - match ts { - Some(ts) => serializer.collect_str(&humantime::format_rfc3339_millis(*ts)), - None => serializer.serialize_none(), - } -} - -fn opt_deser_rfc3339_millis<'de, D>(deserializer: D) -> Result, D::Error> -where - D: serde::de::Deserializer<'de>, -{ - let s: Option = serde::de::Deserialize::deserialize(deserializer)?; - match s { - None => Ok(None), - Some(s) => humantime::parse_rfc3339(&s) - .map_err(serde::de::Error::custom) - .map(Some), - } -} - pub mod virtual_file { #[derive( Copy, diff --git a/libs/pageserver_api/src/models/utilization.rs b/libs/pageserver_api/src/models/utilization.rs index f5984dff5d..e88cab5d6a 100644 --- a/libs/pageserver_api/src/models/utilization.rs +++ b/libs/pageserver_api/src/models/utilization.rs @@ -1,4 +1,4 @@ -use std::time::SystemTime; +use utils::serde_system_time::SystemTime; /// Pageserver current utilization and scoring for how good candidate the pageserver would be for /// the next tenant. @@ -21,28 +21,9 @@ pub struct PageserverUtilization { /// When was this snapshot captured, pageserver local time. /// /// Use millis to give confidence that the value is regenerated often enough. - #[serde( - serialize_with = "ser_rfc3339_millis", - deserialize_with = "deser_rfc3339_millis" - )] pub captured_at: SystemTime, } -fn ser_rfc3339_millis( - ts: &SystemTime, - serializer: S, -) -> Result { - serializer.collect_str(&humantime::format_rfc3339_millis(*ts)) -} - -fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result -where - D: serde::de::Deserializer<'de>, -{ - let s: String = serde::de::Deserialize::deserialize(deserializer)?; - humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom) -} - /// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients. /// /// Instead of newtype, use this because a newtype would get require handling deserializing values @@ -69,7 +50,9 @@ mod tests { disk_usage_bytes: u64::MAX, free_space_bytes: 0, utilization_score: u64::MAX, - captured_at: SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779), + captured_at: SystemTime( + std::time::SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779), + ), }; let s = serde_json::to_string(&doc).unwrap(); diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index c2d9d9d396..a6a081c5c1 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -22,6 +22,7 @@ camino.workspace = true chrono.workspace = true heapless.workspace = true hex = { workspace = true, features = ["serde"] } +humantime.workspace = true hyper = { workspace = true, features = ["full"] } fail.workspace = true futures = { workspace = true} diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index cd5075613e..b09350d11e 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -63,6 +63,7 @@ pub mod measured_stream; pub mod serde_percent; pub mod serde_regex; +pub mod serde_system_time; pub mod pageserver_feedback; diff --git a/libs/utils/src/serde_system_time.rs b/libs/utils/src/serde_system_time.rs new file mode 100644 index 0000000000..b0f6934e87 --- /dev/null +++ b/libs/utils/src/serde_system_time.rs @@ -0,0 +1,55 @@ +//! 
A `serde::{Deserialize,Serialize}` type for SystemTime with RFC3339 format and millisecond precision. + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, serde::Serialize, serde::Deserialize)] +#[serde(transparent)] +pub struct SystemTime( + #[serde( + deserialize_with = "deser_rfc3339_millis", + serialize_with = "ser_rfc3339_millis" + )] + pub std::time::SystemTime, +); + +fn ser_rfc3339_millis( + ts: &std::time::SystemTime, + serializer: S, +) -> Result { + serializer.collect_str(&humantime::format_rfc3339_millis(*ts)) +} + +fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result +where + D: serde::de::Deserializer<'de>, +{ + let s: String = serde::de::Deserialize::deserialize(deserializer)?; + humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom) +} + +#[cfg(test)] +mod tests { + use super::*; + + /// Helper function to make a SystemTime have millisecond precision by truncating additional nanoseconds. + fn to_millisecond_precision(time: SystemTime) -> SystemTime { + match time.0.duration_since(std::time::SystemTime::UNIX_EPOCH) { + Ok(duration) => { + let total_millis = duration.as_secs() * 1_000 + u64::from(duration.subsec_millis()); + SystemTime( + std::time::SystemTime::UNIX_EPOCH + + std::time::Duration::from_millis(total_millis), + ) + } + Err(_) => time, + } + } + + #[test] + fn test_serialize_deserialize() { + let input = SystemTime(std::time::SystemTime::now()); + let expected_serialized = format!("\"{}\"", humantime::format_rfc3339_millis(input.0)); + let serialized = serde_json::to_string(&input).unwrap(); + assert_eq!(expected_serialized, serialized); + let deserialized: SystemTime = serde_json::from_str(&expected_serialized).unwrap(); + assert_eq!(to_millisecond_precision(input), deserialized); + } +} diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 530e1a3244..5b29c126d1 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -51,7 +51,7 @@ use tokio_util::sync::CancellationToken; use tracing::{info_span, instrument, warn, Instrument}; use utils::{ backoff, completion::Barrier, crashsafe::path_with_suffix_extension, failpoint_support, fs_ext, - id::TimelineId, + id::TimelineId, serde_system_time, }; use super::{ @@ -591,7 +591,7 @@ impl<'a> TenantDownloader<'a> { let mut progress = SecondaryProgress { layers_total: heatmap_stats.layers, bytes_total: heatmap_stats.bytes, - heatmap_mtime: Some(heatmap_mtime), + heatmap_mtime: Some(serde_system_time::SystemTime(heatmap_mtime)), layers_downloaded: 0, bytes_downloaded: 0, }; diff --git a/pageserver/src/utilization.rs b/pageserver/src/utilization.rs index 5eccf185ac..e6c835aa75 100644 --- a/pageserver/src/utilization.rs +++ b/pageserver/src/utilization.rs @@ -41,7 +41,7 @@ pub(crate) fn regenerate(tenants_path: &Path) -> anyhow::Result Date: Mon, 8 Apr 2024 19:01:41 +0100 Subject: [PATCH 0542/1571] update measured with some more convenient features (#7334) ## Problem Some awkwardness in the measured API. Missing process metrics. ## Summary of changes Update measured to use the new convenience setup features. Added measured-process lib. 
Added measured support for libmetrics --- Cargo.lock | 175 +++++++++++++++++++++----- Cargo.toml | 3 +- libs/metrics/Cargo.toml | 2 + libs/metrics/src/lib.rs | 146 ++++++++++++++++++++- storage_controller/src/http.rs | 18 ++- storage_controller/src/main.rs | 8 +- storage_controller/src/metrics.rs | 120 +++++------------- storage_controller/src/persistence.rs | 6 +- workspace_hack/Cargo.toml | 5 +- 9 files changed, 345 insertions(+), 138 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 66ff3dedb7..a7e29b1de3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1124,7 +1124,7 @@ version = "4.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "191d9573962933b4027f932c600cd252ce27a8ad5979418fe78e43c07996f27b" dependencies = [ - "heck", + "heck 0.4.1", "proc-macro2", "quote", "syn 2.0.52", @@ -1462,12 +1462,9 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.15" +version = "0.8.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c063cd8cc95f5c377ed0d4b49a4b21f632396ff690e8470c29b3359b346984b" -dependencies = [ - "cfg-if", -] +checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" [[package]] name = "crossterm" @@ -1840,23 +1837,12 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "errno" -version = "0.3.1" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" +checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" dependencies = [ - "errno-dragonfly", - "libc", - "windows-sys 0.48.0", -] - -[[package]] -name = "errno-dragonfly" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" -dependencies = [ - "cc", "libc", + "windows-sys 0.52.0", ] [[package]] @@ -2294,6 +2280,12 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + [[package]] name = "hermit-abi" version = "0.3.3" @@ -2794,6 +2786,12 @@ version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" +[[package]] +name = "linux-raw-sys" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" + [[package]] name = "lock_api" version = "0.4.10" @@ -2848,11 +2846,12 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" [[package]] name = "measured" -version = "0.0.13" +version = "0.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f246648d027839a34b420e27c7de1165ace96e19ef894985d0a6ff89a7840a9f" +checksum = "3cbf033874bea03565f2449572c8640ca37ec26300455faf36001f24755da452" dependencies = [ "bytes", + "crossbeam-utils", "hashbrown 0.14.0", "itoa", "lasso", @@ -2865,16 +2864,27 @@ dependencies = [ [[package]] name = "measured-derive" -version = "0.0.13" +version = "0.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"edaa5cc22d99d5d6d7d99c3b5b5f7e7f8034c22f1b5d62a1adecd2ed005d9b80" +checksum = "be9e29b682b38f8af2a89f960455054ab1a9f5a06822f6f3500637ad9fa57def" dependencies = [ - "heck", + "heck 0.5.0", "proc-macro2", "quote", "syn 2.0.52", ] +[[package]] +name = "measured-process" +version = "0.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a20849acdd04c5d6a88f565559044546904648a1842a2937cfff0b48b4ca7ef2" +dependencies = [ + "libc", + "measured", + "procfs 0.16.0", +] + [[package]] name = "memchr" version = "2.6.4" @@ -2914,8 +2924,10 @@ version = "0.1.0" dependencies = [ "chrono", "libc", + "measured", + "measured-process", "once_cell", - "procfs", + "procfs 0.14.2", "prometheus", "rand 0.8.5", "rand_distr", @@ -3525,7 +3537,7 @@ dependencies = [ "postgres_connection", "postgres_ffi", "pq_proto", - "procfs", + "procfs 0.14.2", "rand 0.8.5", "regex", "remote_storage", @@ -4085,6 +4097,29 @@ dependencies = [ "rustix 0.36.16", ] +[[package]] +name = "procfs" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4" +dependencies = [ + "bitflags 2.4.1", + "hex", + "lazy_static", + "procfs-core", + "rustix 0.38.28", +] + +[[package]] +name = "procfs-core" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29" +dependencies = [ + "bitflags 2.4.1", + "hex", +] + [[package]] name = "prometheus" version = "0.13.3" @@ -4097,7 +4132,7 @@ dependencies = [ "libc", "memchr", "parking_lot 0.12.1", - "procfs", + "procfs 0.14.2", "thiserror", ] @@ -4118,7 +4153,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "119533552c9a7ffacc21e099c24a0ac8bb19c2a2a3f363de84cd9b844feab270" dependencies = [ "bytes", - "heck", + "heck 0.4.1", "itertools", "lazy_static", "log", @@ -4810,6 +4845,19 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "rustix" +version = "0.38.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72e572a5e8ca657d7366229cdde4bd14c4eb5499a9573d4d366fe1b599daa316" +dependencies = [ + "bitflags 2.4.1", + "errno", + "libc", + "linux-raw-sys 0.4.13", + "windows-sys 0.52.0", +] + [[package]] name = "rustls" version = "0.21.9" @@ -5670,7 +5718,7 @@ version = "0.24.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59" dependencies = [ - "heck", + "heck 0.4.1", "proc-macro2", "quote", "rustversion", @@ -6930,6 +6978,15 @@ dependencies = [ "windows-targets 0.48.0", ] +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.4", +] + [[package]] name = "windows-targets" version = "0.42.2" @@ -6960,6 +7017,21 @@ dependencies = [ "windows_x86_64_msvc 0.48.0", ] +[[package]] +name = "windows-targets" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" +dependencies = [ + "windows_aarch64_gnullvm 0.52.4", + "windows_aarch64_msvc 0.52.4", + "windows_i686_gnu 0.52.4", + "windows_i686_msvc 0.52.4", + "windows_x86_64_gnu 0.52.4", + "windows_x86_64_gnullvm 0.52.4", + "windows_x86_64_msvc 0.52.4", +] + [[package]] name = 
"windows_aarch64_gnullvm" version = "0.42.2" @@ -6972,6 +7044,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" + [[package]] name = "windows_aarch64_msvc" version = "0.42.2" @@ -6984,6 +7062,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" + [[package]] name = "windows_i686_gnu" version = "0.42.2" @@ -6996,6 +7080,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" +[[package]] +name = "windows_i686_gnu" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" + [[package]] name = "windows_i686_msvc" version = "0.42.2" @@ -7008,6 +7098,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" +[[package]] +name = "windows_i686_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" + [[package]] name = "windows_x86_64_gnu" version = "0.42.2" @@ -7020,6 +7116,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" + [[package]] name = "windows_x86_64_gnullvm" version = "0.42.2" @@ -7032,6 +7134,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" + [[package]] name = "windows_x86_64_msvc" version = "0.42.2" @@ -7044,6 +7152,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" + [[package]] name = "winnow" version = "0.4.6" @@ -7092,7 +7206,6 @@ dependencies = [ "futures-sink", "futures-util", "getrandom 0.2.11", - "hashbrown 0.13.2", "hashbrown 0.14.0", "hex", "hmac", diff --git a/Cargo.toml b/Cargo.toml index 3c6077648e..5db6b7016a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -106,7 +106,8 @@ lasso = "0.7" leaky-bucket = "1.0.1" libc = "0.2" md5 = "0.7.0" 
-measured = { version = "0.0.13", features=["default", "lasso"] } +measured = { version = "0.0.20", features=["lasso"] } +measured-process = { version = "0.0.20" } memoffset = "0.8" native-tls = "0.2" nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] } diff --git a/libs/metrics/Cargo.toml b/libs/metrics/Cargo.toml index f6a49a0166..0bd804051c 100644 --- a/libs/metrics/Cargo.toml +++ b/libs/metrics/Cargo.toml @@ -10,11 +10,13 @@ libc.workspace = true once_cell.workspace = true chrono.workspace = true twox-hash.workspace = true +measured.workspace = true workspace_hack.workspace = true [target.'cfg(target_os = "linux")'.dependencies] procfs.workspace = true +measured-process.workspace = true [dev-dependencies] rand = "0.8" diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 22b0a18933..6cff28c0ca 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -4,6 +4,17 @@ //! a default registry. #![deny(clippy::undocumented_unsafe_blocks)] +use measured::{ + label::{LabelGroupVisitor, LabelName, NoLabels}, + metric::{ + counter::CounterState, + gauge::GaugeState, + group::{Encoding, MetricValue}, + name::{MetricName, MetricNameEncoder}, + MetricEncoding, MetricFamilyEncoding, + }, + FixedCardinalityLabel, LabelGroup, MetricGroup, +}; use once_cell::sync::Lazy; use prometheus::core::{ Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec, @@ -11,6 +22,7 @@ use prometheus::core::{ pub use prometheus::opts; pub use prometheus::register; pub use prometheus::Error; +use prometheus::Registry; pub use prometheus::{core, default_registry, proto}; pub use prometheus::{exponential_buckets, linear_buckets}; pub use prometheus::{register_counter_vec, Counter, CounterVec}; @@ -23,7 +35,6 @@ pub use prometheus::{register_int_counter_vec, IntCounterVec}; pub use prometheus::{register_int_gauge, IntGauge}; pub use prometheus::{register_int_gauge_vec, IntGaugeVec}; pub use prometheus::{Encoder, TextEncoder}; -use prometheus::{Registry, Result}; pub mod launch_timestamp; mod wrappers; @@ -59,7 +70,7 @@ static INTERNAL_REGISTRY: Lazy = Lazy::new(Registry::new); /// Register a collector in the internal registry. MUST be called before the first call to `gather()`. /// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector /// while holding the lock. 
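A minimal sketch of the ordering constraint documented above, using plain `prometheus` types as a stand-in (assumption: the real code funnels everything through `register_internal` and the internal registry rather than a local one):

```rust
use prometheus::{IntCounter, Opts, Registry};

fn main() -> prometheus::Result<()> {
    let registry = Registry::new();
    let requests = IntCounter::with_opts(Opts::new("requests_total", "Total requests"))?;
    // Register everything up front; a collector that tried to register from
    // inside its own collect() would block on the lock that gather() holds.
    registry.register(Box::new(requests.clone()))?;
    requests.inc();
    let families = registry.gather();
    assert_eq!(families.len(), 1);
    Ok(())
}
```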
-pub fn register_internal(c: Box) -> Result<()> { +pub fn register_internal(c: Box) -> prometheus::Result<()> { INTERNAL_REGISTRY.register(c) } @@ -96,6 +107,127 @@ pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[ 0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5, ]; +pub struct BuildInfo { + pub revision: &'static str, + pub build_tag: &'static str, +} + +// todo: allow label group without the set +impl LabelGroup for BuildInfo { + fn visit_values(&self, v: &mut impl LabelGroupVisitor) { + const REVISION: &LabelName = LabelName::from_str("revision"); + v.write_value(REVISION, &self.revision); + const BUILD_TAG: &LabelName = LabelName::from_str("build_tag"); + v.write_value(BUILD_TAG, &self.build_tag); + } +} + +impl MetricFamilyEncoding for BuildInfo +where + GaugeState: MetricEncoding, +{ + fn collect_family_into( + &self, + name: impl measured::metric::name::MetricNameEncoder, + enc: &mut T, + ) -> Result<(), T::Err> { + enc.write_help(&name, "Build/version information")?; + GaugeState::write_type(&name, enc)?; + GaugeState { + count: std::sync::atomic::AtomicI64::new(1), + } + .collect_into(&(), self, name, enc) + } +} + +#[derive(MetricGroup)] +#[metric(new(build_info: BuildInfo))] +pub struct NeonMetrics { + #[cfg(target_os = "linux")] + #[metric(namespace = "process")] + #[metric(init = measured_process::ProcessCollector::for_self())] + process: measured_process::ProcessCollector, + + #[metric(namespace = "libmetrics")] + #[metric(init = LibMetrics::new(build_info))] + libmetrics: LibMetrics, +} + +#[derive(MetricGroup)] +#[metric(new(build_info: BuildInfo))] +pub struct LibMetrics { + #[metric(init = build_info)] + build_info: BuildInfo, + + #[metric(flatten)] + rusage: Rusage, + + serve_count: CollectionCounter, +} + +fn write_gauge( + x: i64, + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut Enc, +) -> Result<(), Enc::Err> { + enc.write_metric_value(name, labels, MetricValue::Int(x)) +} + +#[derive(Default)] +struct Rusage; + +#[derive(FixedCardinalityLabel, Clone, Copy)] +#[label(singleton = "io_operation")] +enum IoOp { + Read, + Write, +} + +impl MetricGroup for Rusage +where + GaugeState: MetricEncoding, +{ + fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> { + const DISK_IO: &MetricName = MetricName::from_str("disk_io_bytes_total"); + const MAXRSS: &MetricName = MetricName::from_str("maxrss_kb"); + + let ru = get_rusage_stats(); + + enc.write_help( + DISK_IO, + "Bytes written and read from disk, grouped by the operation (read|write)", + )?; + GaugeState::write_type(DISK_IO, enc)?; + write_gauge(ru.ru_inblock * BYTES_IN_BLOCK, IoOp::Read, DISK_IO, enc)?; + write_gauge(ru.ru_oublock * BYTES_IN_BLOCK, IoOp::Write, DISK_IO, enc)?; + + enc.write_help(MAXRSS, "Memory usage (Maximum Resident Set Size)")?; + GaugeState::write_type(MAXRSS, enc)?; + write_gauge(ru.ru_maxrss, IoOp::Read, MAXRSS, enc)?; + + Ok(()) + } +} + +#[derive(Default)] +struct CollectionCounter(CounterState); + +impl MetricFamilyEncoding for CollectionCounter +where + CounterState: MetricEncoding, +{ + fn collect_family_into( + &self, + name: impl measured::metric::name::MetricNameEncoder, + enc: &mut T, + ) -> Result<(), T::Err> { + self.0.inc(); + enc.write_help(&name, "Number of metric requests made")?; + self.0.collect_into(&(), NoLabels, name, enc) + } +} + pub fn set_build_info_metric(revision: &str, build_tag: &str) { let metric = register_int_gauge_vec!( "libmetrics_build_info", @@ -105,6 +237,7 @@ pub fn set_build_info_metric(revision: 
&str, build_tag: &str) { .expect("Failed to register build info metric"); metric.with_label_values(&[revision, build_tag]).set(1); } +const BYTES_IN_BLOCK: i64 = 512; // Records I/O stats in a "cross-platform" way. // Compiles both on macOS and Linux, but current macOS implementation always returns 0 as values for I/O stats. @@ -117,7 +250,6 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) { fn update_rusage_metrics() { let rusage_stats = get_rusage_stats(); - const BYTES_IN_BLOCK: i64 = 512; DISK_IO_BYTES .with_label_values(&["read"]) .set(rusage_stats.ru_inblock * BYTES_IN_BLOCK); @@ -151,6 +283,7 @@ macro_rules! register_int_counter_pair_vec { } }}; } + /// Create an [`IntCounterPair`] and registers to default registry. #[macro_export(local_inner_macros)] macro_rules! register_int_counter_pair { @@ -188,7 +321,10 @@ impl GenericCounterPairVec
<P>
{ /// /// An error is returned if the number of label values is not the same as the /// number of VariableLabels in Desc. - pub fn get_metric_with_label_values(&self, vals: &[&str]) -> Result> { + pub fn get_metric_with_label_values( + &self, + vals: &[&str], + ) -> prometheus::Result> { Ok(GenericCounterPair { inc: self.inc.get_metric_with_label_values(vals)?, dec: self.dec.get_metric_with_label_values(vals)?, @@ -201,7 +337,7 @@ impl GenericCounterPairVec

`] for the given slice - /// of label values (same order as the VariableLabels in Desc). If that combination of - /// label values is accessed for the first time, a new [`HyperLogLog
<N>
`] is created. - /// - /// An error is returned if the number of label values is not the same as the - /// number of VariableLabels in Desc. - pub fn get_metric_with_label_values( - &self, - vals: &[&str], - ) -> prometheus::Result> { - self.core.get_metric_with_label_values(vals) - } - - /// `with_label_values` works as `get_metric_with_label_values`, but panics if an error - /// occurs. - pub fn with_label_values(&self, vals: &[&str]) -> HyperLogLog { - self.get_metric_with_label_values(vals).unwrap() - } +impl MetricType for HyperLogLogState { + type Metadata = (); } -impl HyperLogLogVecCore { - pub fn get_metric_with_label_values( - &self, - vals: &[&str], - ) -> prometheus::Result> { - let h = self.hash_label_values(vals)?; - - if let Some(metric) = self.children.read().unwrap().get(&h).cloned() { - return Ok(metric); - } - - self.get_or_create_metric(h, vals) - } - - pub(crate) fn hash_label_values(&self, vals: &[&str]) -> prometheus::Result { - if vals.len() != self.desc.variable_labels.len() { - return Err(prometheus::Error::InconsistentCardinality { - expect: self.desc.variable_labels.len(), - got: vals.len(), - }); - } - - let mut h = xxh3::Hash64::default(); - for val in vals { - h.write(val.as_bytes()); - } - - Ok(h.finish()) - } - - fn get_or_create_metric( - &self, - hash: u64, - label_values: &[&str], - ) -> prometheus::Result> { - let mut children = self.children.write().unwrap(); - // Check exist first. - if let Some(metric) = children.get(&hash).cloned() { - return Ok(metric); - } - - let metric = HyperLogLog::with_opts_and_label_values(&self.opts, label_values)?; - children.insert(hash, metric.clone()); - Ok(metric) - } -} - -/// HLL is a probabilistic cardinality measure. -/// -/// How to use this time-series for a metric name `my_metrics_total_hll`: -/// -/// ```promql -/// # harmonic mean -/// 1 / ( -/// sum ( -/// 2 ^ -( -/// # HLL merge operation -/// max (my_metrics_total_hll{}) by (hll_shard, other_labels...) -/// ) -/// ) without (hll_shard) -/// ) -/// * alpha -/// * shards_count -/// * shards_count -/// ``` -/// -/// If you want an estimate over time, you can use the following query: -/// -/// ```promql -/// # harmonic mean -/// 1 / ( -/// sum ( -/// 2 ^ -( -/// # HLL merge operation -/// max ( -/// max_over_time(my_metrics_total_hll{}[$__rate_interval]) -/// ) by (hll_shard, other_labels...) -/// ) -/// ) without (hll_shard) -/// ) -/// * alpha -/// * shards_count -/// * shards_count -/// ``` -/// -/// In the case of low cardinality, you might want to use the linear counting approximation: -/// -/// ```promql -/// # LinearCounting(m, V) = m log (m / V) -/// shards_count * ln(shards_count / -/// # calculate V = how many shards contain a 0 -/// count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard) -/// ) -/// ``` -/// -/// See for estimates on alpha -#[derive(Clone)] -pub struct HyperLogLog { - core: Arc>, -} - -impl HyperLogLog { - /// Create a [`HyperLogLog`] with the `name` and `help` arguments. - pub fn new, S2: Into>(name: S1, help: S2) -> prometheus::Result { - assert!(N.is_power_of_two()); - let opts = Opts::new(name, help); - Self::with_opts(opts) - } - - /// Create a [`HyperLogLog`] with the `opts` options. 
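To make the PromQL above concrete, here is an illustrative sketch of the same harmonic-mean estimate computed directly from one 32-register sample. The α ≈ 0.697 constant is the standard HyperLogLog bias term for 32 registers and is an assumption here, not something this diff pins down:

```rust
/// Illustrative only, not part of the patch: recover a cardinality estimate
/// from a single sample of the 32 HLL registers exported by the metric.
fn hll_estimate(registers: &[u8; 32]) -> f64 {
    const M: f64 = 32.0; // number of shards/registers
    const ALPHA: f64 = 0.697; // standard bias constant for m = 32 (assumed)
    // Harmonic mean of 2^register over all registers, as in the PromQL above.
    let denom: f64 = registers.iter().map(|&r| 2f64.powi(-(r as i32))).sum();
    ALPHA * M * M / denom
}

fn main() {
    // All-zero registers estimate ~22 rather than 0, which is why the doc
    // comment suggests a linear-counting correction for small cardinalities.
    println!("{:.1}", hll_estimate(&[0u8; 32]));
    // A sample where every shard recorded rank 5 estimates ~714.
    println!("{:.1}", hll_estimate(&[5u8; 32]));
}
```

Merging samples from several label sets before estimating is an element-wise max over the registers, which is what the `max(...) by (hll_shard)` step in the queries above performs.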
- pub fn with_opts(opts: Opts) -> prometheus::Result { - Self::with_opts_and_label_values(&opts, &[]) - } - - fn with_opts_and_label_values(opts: &Opts, label_values: &[&str]) -> prometheus::Result { - let desc = opts.describe()?; - let labels = make_label_pairs(&desc, label_values)?; - - let v = HyperLogLogCore { - shards: [0; N].map(AtomicU8::new), - desc, - labels, - }; - Ok(Self { core: Arc::new(v) }) - } - +impl HyperLogLogState { pub fn measure(&self, item: &impl Hash) { // changing the hasher will break compatibility with previous measurements. self.record(BuildHasherDefault::::default().hash_one(item)); @@ -299,42 +126,11 @@ impl HyperLogLog { let p = N.ilog2() as u8; let j = hash & (N as u64 - 1); let rho = (hash >> p).leading_zeros() as u8 + 1 - p; - self.core.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed); - } -} - -struct HyperLogLogCore { - shards: [AtomicU8; N], - desc: core::Desc, - labels: Vec, -} - -impl core::Collector for HyperLogLog { - fn desc(&self) -> Vec<&core::Desc> { - vec![&self.core.desc] + self.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed); } - fn collect(&self) -> Vec { - let mut m = proto::MetricFamily::default(); - m.set_name(self.core.desc.fq_name.clone()); - m.set_help(self.core.desc.help.clone()); - m.set_field_type(proto::MetricType::GAUGE); - - let mut metrics = Vec::new(); - self.core.collect_into(&mut metrics); - m.set_metric(metrics); - - vec![m] - } -} - -impl HyperLogLogCore { - fn collect_into(&self, metrics: &mut Vec) { - self.shards.iter().enumerate().for_each(|(i, x)| { - let mut shard_label = proto::LabelPair::default(); - shard_label.set_name("hll_shard".to_owned()); - shard_label.set_value(format!("{i}")); - + fn take_sample(&self) -> [u8; N] { + self.shards.each_ref().map(|x| { // We reset the counter to 0 so we can perform a cardinality measure over any time slice in prometheus. // This seems like it would be a race condition, @@ -344,85 +140,90 @@ impl HyperLogLogCore { // TODO: maybe we shouldn't reset this on every collect, instead, only after a time window. // this would mean that a dev port-forwarding the metrics url won't break the sampling. 
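A tiny standalone sketch of the sample-and-reset step the comment above describes (illustrative only, plain `std` atomics):

```rust
use std::sync::atomic::{AtomicU8, Ordering};

fn main() {
    let shard = AtomicU8::new(7);
    // The scraper takes the current register value and clears it in one atomic
    // step, so a concurrent fetch_max lands either in this sample or in the
    // next window, never in both.
    let sampled = shard.swap(0, Ordering::Relaxed);
    assert_eq!(sampled, 7);
    assert_eq!(shard.load(Ordering::Relaxed), 0);
}
```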
- let v = x.swap(0, std::sync::atomic::Ordering::Relaxed); - - let mut m = proto::Metric::default(); - let mut c = proto::Gauge::default(); - c.set_value(v as f64); - m.set_gauge(c); - - let mut labels = Vec::with_capacity(self.labels.len() + 1); - labels.extend_from_slice(&self.labels); - labels.push(shard_label); - - m.set_label(labels); - metrics.push(m); + x.swap(0, std::sync::atomic::Ordering::Relaxed) }) } } - -fn make_label_pairs( - desc: &core::Desc, - label_values: &[&str], -) -> prometheus::Result> { - if desc.variable_labels.len() != label_values.len() { - return Err(prometheus::Error::InconsistentCardinality { - expect: desc.variable_labels.len(), - got: label_values.len(), - }); +impl measured::metric::MetricEncoding> + for HyperLogLogState +{ + fn write_type( + name: impl MetricNameEncoder, + enc: &mut TextEncoder, + ) -> Result<(), std::io::Error> { + enc.write_type(&name, measured::text::MetricType::Gauge) } + fn collect_into( + &self, + _: &(), + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut TextEncoder, + ) -> Result<(), std::io::Error> { + struct I64(i64); + impl LabelValue for I64 { + fn visit(&self, v: V) -> V::Output { + v.write_int(self.0) + } + } - let total_len = desc.variable_labels.len() + desc.const_label_pairs.len(); - if total_len == 0 { - return Ok(vec![]); - } + struct HllShardLabel { + hll_shard: i64, + } - if desc.variable_labels.is_empty() { - return Ok(desc.const_label_pairs.clone()); - } + impl LabelGroup for HllShardLabel { + fn visit_values(&self, v: &mut impl LabelGroupVisitor) { + const LE: &LabelName = LabelName::from_str("hll_shard"); + v.write_value(LE, &I64(self.hll_shard)); + } + } - let mut label_pairs = Vec::with_capacity(total_len); - for (i, n) in desc.variable_labels.iter().enumerate() { - let mut label_pair = proto::LabelPair::default(); - label_pair.set_name(n.clone()); - label_pair.set_value(label_values[i].to_owned()); - label_pairs.push(label_pair); + self.take_sample() + .into_iter() + .enumerate() + .try_for_each(|(hll_shard, val)| { + enc.write_metric_value( + name.by_ref(), + labels.by_ref().compose_with(HllShardLabel { + hll_shard: hll_shard as i64, + }), + MetricValue::Int(val as i64), + ) + }) } - - for label_pair in &desc.const_label_pairs { - label_pairs.push(label_pair.clone()); - } - label_pairs.sort(); - Ok(label_pairs) } #[cfg(test)] mod tests { use std::collections::HashSet; - use prometheus::{proto, Opts}; + use measured::{label::StaticLabelSet, FixedCardinalityLabel}; use rand::{rngs::StdRng, Rng, SeedableRng}; use rand_distr::{Distribution, Zipf}; use crate::HyperLogLogVec; - fn collect(hll: &HyperLogLogVec<32>) -> Vec { - let mut metrics = vec![]; - hll.core - .children - .read() - .unwrap() - .values() - .for_each(|c| c.core.collect_into(&mut metrics)); - metrics + #[derive(FixedCardinalityLabel, Clone, Copy)] + #[label(singleton = "x")] + enum Label { + A, + B, } - fn get_cardinality(metrics: &[proto::Metric], filter: impl Fn(&proto::Metric) -> bool) -> f64 { + + fn collect(hll: &HyperLogLogVec, 32>) -> ([u8; 32], [u8; 32]) { + // cannot go through the `hll.collect_family_into` interface yet... + // need to see if I can fix the conflicting impls problem in measured. 
+ ( + hll.get_metric(hll.with_labels(Label::A)).take_sample(), + hll.get_metric(hll.with_labels(Label::B)).take_sample(), + ) + } + + fn get_cardinality(samples: &[[u8; 32]]) -> f64 { let mut buckets = [0.0; 32]; - for metric in metrics.chunks_exact(32) { - if filter(&metric[0]) { - for (i, m) in metric.iter().enumerate() { - buckets[i] = f64::max(buckets[i], m.get_gauge().get_value()); - } + for &sample in samples { + for (i, m) in sample.into_iter().enumerate() { + buckets[i] = f64::max(buckets[i], m as f64); } } @@ -437,7 +238,7 @@ mod tests { } fn test_cardinality(n: usize, dist: impl Distribution) -> ([usize; 3], [f64; 3]) { - let hll = HyperLogLogVec::<32>::new(Opts::new("foo", "bar"), &["x"]).unwrap(); + let hll = HyperLogLogVec::, 32>::new(); let mut iter = StdRng::seed_from_u64(0x2024_0112).sample_iter(dist); let mut set_a = HashSet::new(); @@ -445,18 +246,20 @@ mod tests { for x in iter.by_ref().take(n) { set_a.insert(x.to_bits()); - hll.with_label_values(&["a"]).measure(&x.to_bits()); + hll.get_metric(hll.with_labels(Label::A)) + .measure(&x.to_bits()); } for x in iter.by_ref().take(n) { set_b.insert(x.to_bits()); - hll.with_label_values(&["b"]).measure(&x.to_bits()); + hll.get_metric(hll.with_labels(Label::B)) + .measure(&x.to_bits()); } let merge = &set_a | &set_b; - let metrics = collect(&hll); - let len = get_cardinality(&metrics, |_| true); - let len_a = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "a"); - let len_b = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "b"); + let (a, b) = collect(&hll); + let len = get_cardinality(&[a, b]); + let len_a = get_cardinality(&[a]); + let len_b = get_cardinality(&[b]); ([merge.len(), set_a.len(), set_b.len()], [len, len_a, len_b]) } diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 6cff28c0ca..2cf3cdeaa7 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -5,7 +5,7 @@ #![deny(clippy::undocumented_unsafe_blocks)] use measured::{ - label::{LabelGroupVisitor, LabelName, NoLabels}, + label::{LabelGroupSet, LabelGroupVisitor, LabelName, NoLabels}, metric::{ counter::CounterState, gauge::GaugeState, @@ -40,7 +40,7 @@ pub mod launch_timestamp; mod wrappers; pub use wrappers::{CountedReader, CountedWriter}; mod hll; -pub use hll::{HyperLogLog, HyperLogLogVec}; +pub use hll::{HyperLogLog, HyperLogLogState, HyperLogLogVec}; #[cfg(target_os = "linux")] pub mod more_process_metrics; @@ -421,3 +421,171 @@ pub type IntCounterPair = GenericCounterPair; /// A guard for [`IntCounterPair`] that will decrement the gauge on drop pub type IntCounterPairGuard = GenericCounterPairGuard; + +pub trait CounterPairAssoc { + const INC_NAME: &'static MetricName; + const DEC_NAME: &'static MetricName; + + const INC_HELP: &'static str; + const DEC_HELP: &'static str; + + type LabelGroupSet: LabelGroupSet; +} + +pub struct CounterPairVec { + vec: measured::metric::MetricVec, +} + +impl Default for CounterPairVec +where + A::LabelGroupSet: Default, +{ + fn default() -> Self { + Self { + vec: Default::default(), + } + } +} + +impl CounterPairVec { + pub fn guard( + &self, + labels: ::Group<'_>, + ) -> MeasuredCounterPairGuard<'_, A> { + let id = self.vec.with_labels(labels); + self.vec.get_metric(id).inc.inc(); + MeasuredCounterPairGuard { vec: &self.vec, id } + } + pub fn inc(&self, labels: ::Group<'_>) { + let id = self.vec.with_labels(labels); + self.vec.get_metric(id).inc.inc(); + } + pub fn dec(&self, labels: ::Group<'_>) { + let id = self.vec.with_labels(labels); + 
self.vec.get_metric(id).dec.inc(); + } + pub fn remove_metric( + &self, + labels: ::Group<'_>, + ) -> Option { + let id = self.vec.with_labels(labels); + self.vec.remove_metric(id) + } +} + +impl ::measured::metric::group::MetricGroup for CounterPairVec +where + T: ::measured::metric::group::Encoding, + A: CounterPairAssoc, + ::measured::metric::counter::CounterState: ::measured::metric::MetricEncoding, +{ + fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> { + // write decrement first to avoid a race condition where inc - dec < 0 + T::write_help(enc, A::DEC_NAME, A::DEC_HELP)?; + self.vec + .collect_family_into(A::DEC_NAME, &mut Dec(&mut *enc))?; + + T::write_help(enc, A::INC_NAME, A::INC_HELP)?; + self.vec + .collect_family_into(A::INC_NAME, &mut Inc(&mut *enc))?; + + Ok(()) + } +} + +#[derive(MetricGroup, Default)] +pub struct MeasuredCounterPairState { + pub inc: CounterState, + pub dec: CounterState, +} + +impl measured::metric::MetricType for MeasuredCounterPairState { + type Metadata = (); +} + +pub struct MeasuredCounterPairGuard<'a, A: CounterPairAssoc> { + vec: &'a measured::metric::MetricVec, + id: measured::metric::LabelId, +} + +impl Drop for MeasuredCounterPairGuard<'_, A> { + fn drop(&mut self) { + self.vec.get_metric(self.id).dec.inc(); + } +} + +/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the inc counter to the inner encoder. +struct Inc(T); +/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the dec counter to the inner encoder. +struct Dec(T); + +impl Encoding for Inc { + type Err = T::Err; + + fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> { + self.0.write_help(name, help) + } + + fn write_metric_value( + &mut self, + name: impl MetricNameEncoder, + labels: impl LabelGroup, + value: MetricValue, + ) -> Result<(), Self::Err> { + self.0.write_metric_value(name, labels, value) + } +} + +impl MetricEncoding> for MeasuredCounterPairState +where + CounterState: MetricEncoding, +{ + fn write_type(name: impl MetricNameEncoder, enc: &mut Inc) -> Result<(), T::Err> { + CounterState::write_type(name, &mut enc.0) + } + fn collect_into( + &self, + metadata: &(), + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut Inc, + ) -> Result<(), T::Err> { + self.inc.collect_into(metadata, labels, name, &mut enc.0) + } +} + +impl Encoding for Dec { + type Err = T::Err; + + fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> { + self.0.write_help(name, help) + } + + fn write_metric_value( + &mut self, + name: impl MetricNameEncoder, + labels: impl LabelGroup, + value: MetricValue, + ) -> Result<(), Self::Err> { + self.0.write_metric_value(name, labels, value) + } +} + +/// Write the dec counter to the encoder +impl MetricEncoding> for MeasuredCounterPairState +where + CounterState: MetricEncoding, +{ + fn write_type(name: impl MetricNameEncoder, enc: &mut Dec) -> Result<(), T::Err> { + CounterState::write_type(name, &mut enc.0) + } + fn collect_into( + &self, + metadata: &(), + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut Dec, + ) -> Result<(), T::Err> { + self.dec.collect_into(metadata, labels, name, &mut enc.0) + } +} diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 12bd67ea36..6b8f2ecbf4 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -44,6 +44,7 @@ ipnet.workspace = true itertools.workspace = true lasso = { workspace = true, features = ["multi-threaded"] } md5.workspace = true +measured 
= { workspace = true, features = ["lasso"] } metrics.workspace = true once_cell.workspace = true opentelemetry.workspace = true diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index e421798067..229d499e30 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -13,7 +13,7 @@ use crate::console::provider::{CachedRoleSecret, ConsoleBackend}; use crate::console::{AuthSecret, NodeInfo}; use crate::context::RequestMonitoring; use crate::intern::EndpointIdInt; -use crate::metrics::{AUTH_RATE_LIMIT_HITS, ENDPOINTS_AUTH_RATE_LIMITED}; +use crate::metrics::Metrics; use crate::proxy::connect_compute::ComputeConnectBackend; use crate::proxy::NeonOptions; use crate::stream::Stream; @@ -210,8 +210,12 @@ impl AuthenticationConfig { enabled = self.rate_limiter_enabled, "rate limiting authentication" ); - AUTH_RATE_LIMIT_HITS.inc(); - ENDPOINTS_AUTH_RATE_LIMITED.measure(endpoint); + Metrics::get().proxy.requests_auth_rate_limits_total.inc(); + Metrics::get() + .proxy + .endpoints_auth_rate_limits + .get_metric() + .measure(endpoint); if self.rate_limiter_enabled { return Err(auth::AuthError::too_many_connections()); diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 89773aa1ff..783a1a5a21 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -4,7 +4,7 @@ use crate::{ auth::password_hack::parse_endpoint_param, context::RequestMonitoring, error::{ReportableError, UserFacingError}, - metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, + metrics::{Metrics, SniKind}, proxy::NeonOptions, serverless::SERVERLESS_DRIVER_SNI, EndpointId, RoleName, @@ -144,21 +144,22 @@ impl ComputeUserInfoMaybeEndpoint { ctx.set_endpoint_id(ep.clone()); } + let metrics = Metrics::get(); info!(%user, "credentials"); if sni.is_some() { info!("Connection with sni"); - NUM_CONNECTION_ACCEPTED_BY_SNI - .with_label_values(&["sni"]) - .inc(); + metrics.proxy.accepted_connections_by_sni.inc(SniKind::Sni); } else if endpoint.is_some() { - NUM_CONNECTION_ACCEPTED_BY_SNI - .with_label_values(&["no_sni"]) - .inc(); + metrics + .proxy + .accepted_connections_by_sni + .inc(SniKind::NoSni); info!("Connection without sni"); } else { - NUM_CONNECTION_ACCEPTED_BY_SNI - .with_label_values(&["password_hack"]) - .inc(); + metrics + .proxy + .accepted_connections_by_sni + .inc(SniKind::PasswordHack); info!("Connection with password hack"); } diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index c28814b1c8..58737efe46 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -176,7 +176,12 @@ async fn task_main( .context("failed to set socket option")?; info!(%peer_addr, "serving"); - let ctx = RequestMonitoring::new(session_id, peer_addr.ip(), "sni_router", "sni"); + let ctx = RequestMonitoring::new( + session_id, + peer_addr.ip(), + proxy::metrics::Protocol::SniRouter, + "sni", + ); handle_client(ctx, dest_suffix, tls_config, tls_server_end_point, socket).await } .unwrap_or_else(|e| { diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 56a3ef79cd..3392c21075 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -18,7 +18,8 @@ use proxy::config::ProjectInfoCacheOptions; use proxy::console; use proxy::context::parquet::ParquetUploadArgs; use proxy::http; -use proxy::metrics::NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT; +use proxy::http::health_server::AppMetrics; +use proxy::metrics::Metrics; use proxy::rate_limiter::AuthRateLimiter; use proxy::rate_limiter::EndpointRateLimiter; use 
proxy::rate_limiter::RateBucketInfo; @@ -249,14 +250,18 @@ async fn main() -> anyhow::Result<()> { info!("Version: {GIT_VERSION}"); info!("Build_tag: {BUILD_TAG}"); - ::metrics::set_build_info_metric(GIT_VERSION, BUILD_TAG); + let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo { + revision: GIT_VERSION, + build_tag: BUILD_TAG, + }); - match proxy::jemalloc::MetricRecorder::new(prometheus::default_registry()) { - Ok(t) => { - t.start(); + let jemalloc = match proxy::jemalloc::MetricRecorder::new() { + Ok(t) => Some(t), + Err(e) => { + tracing::error!(error = ?e, "could not start jemalloc metrics loop"); + None } - Err(e) => tracing::error!(error = ?e, "could not start jemalloc metrics loop"), - } + }; let args = ProxyCliArgs::parse(); let config = build_config(&args)?; @@ -349,7 +354,7 @@ async fn main() -> anyhow::Result<()> { >::new( cancel_map.clone(), redis_publisher, - NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT, + proxy::metrics::CancellationSource::FromClient, )); // client facing tasks. these will exit on error or on cancellation @@ -387,7 +392,14 @@ async fn main() -> anyhow::Result<()> { // maintenance tasks. these never return unless there's an error let mut maintenance_tasks = JoinSet::new(); maintenance_tasks.spawn(proxy::handle_signals(cancellation_token.clone())); - maintenance_tasks.spawn(http::health_server::task_main(http_listener)); + maintenance_tasks.spawn(http::health_server::task_main( + http_listener, + AppMetrics { + jemalloc, + neon_metrics, + proxy: proxy::metrics::Metrics::get(), + }, + )); maintenance_tasks.spawn(console::mgmt::task_main(mgmt_listener)); if let Some(metrics_config) = &config.metric_collection { @@ -507,8 +519,14 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { } = args.wake_compute_lock.parse()?; info!(permits, shards, ?epoch, "Using NodeLocks (wake_compute)"); let locks = Box::leak(Box::new( - console::locks::ApiLocks::new("wake_compute_lock", permits, shards, timeout) - .unwrap(), + console::locks::ApiLocks::new( + "wake_compute_lock", + permits, + shards, + timeout, + &Metrics::get().wake_compute_lock, + ) + .unwrap(), )); tokio::spawn(locks.garbage_collect_worker(epoch)); diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 6151513614..34512e9f5b 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -10,7 +10,7 @@ use uuid::Uuid; use crate::{ error::ReportableError, - metrics::NUM_CANCELLATION_REQUESTS, + metrics::{CancellationRequest, CancellationSource, Metrics}, redis::cancellation_publisher::{ CancellationPublisher, CancellationPublisherMut, RedisPublisherClient, }, @@ -28,7 +28,7 @@ pub struct CancellationHandler
<P>
{ client: P, /// This field used for the monitoring purposes. /// Represents the source of the cancellation request. - from: &'static str, + from: CancellationSource, } #[derive(Debug, Error)] @@ -89,9 +89,13 @@ impl CancellationHandler
<P>
{ // NB: we should immediately release the lock after cloning the token. let Some(cancel_closure) = self.map.get(&key).and_then(|x| x.clone()) else { tracing::warn!("query cancellation key not found: {key}"); - NUM_CANCELLATION_REQUESTS - .with_label_values(&[self.from, "not_found"]) - .inc(); + Metrics::get() + .proxy + .cancellation_requests_total + .inc(CancellationRequest { + source: self.from, + kind: crate::metrics::CancellationOutcome::NotFound, + }); match self.client.try_publish(key, session_id).await { Ok(()) => {} // do nothing Err(e) => { @@ -103,9 +107,13 @@ impl CancellationHandler
<P>
{ } return Ok(()); }; - NUM_CANCELLATION_REQUESTS - .with_label_values(&[self.from, "found"]) - .inc(); + Metrics::get() + .proxy + .cancellation_requests_total + .inc(CancellationRequest { + source: self.from, + kind: crate::metrics::CancellationOutcome::Found, + }); info!("cancelling query per user's request using key {key}"); cancel_closure.try_cancel_query().await } @@ -122,7 +130,7 @@ impl CancellationHandler
<P>
{ } impl CancellationHandler<()> { - pub fn new(map: CancelMap, from: &'static str) -> Self { + pub fn new(map: CancelMap, from: CancellationSource) -> Self { Self { map, client: (), @@ -132,7 +140,7 @@ impl CancellationHandler<()> { } impl CancellationHandler>>> { - pub fn new(map: CancelMap, client: Option>>, from: &'static str) -> Self { + pub fn new(map: CancelMap, client: Option>>, from: CancellationSource) -> Self { Self { map, client, from } } } @@ -192,15 +200,13 @@ impl
<P>
Drop for Session
<P>
{ #[cfg(test)] mod tests { - use crate::metrics::NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS; - use super::*; #[tokio::test] async fn check_session_drop() -> anyhow::Result<()> { let cancellation_handler = Arc::new(CancellationHandler::<()>::new( CancelMap::default(), - NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS, + CancellationSource::FromRedis, )); let session = cancellation_handler.clone().get_session(); @@ -214,7 +220,7 @@ mod tests { #[tokio::test] async fn cancel_session_noop_regression() { - let handler = CancellationHandler::<()>::new(Default::default(), "local"); + let handler = CancellationHandler::<()>::new(Default::default(), CancellationSource::Local); handler .cancel_session( CancelKeyData { diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index ee33b97fbd..149a619316 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -4,12 +4,11 @@ use crate::{ console::{errors::WakeComputeError, messages::MetricsAuxInfo}, context::RequestMonitoring, error::{ReportableError, UserFacingError}, - metrics::NUM_DB_CONNECTIONS_GAUGE, + metrics::{Metrics, NumDbConnectionsGuard}, proxy::neon_option, }; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; -use metrics::IntCounterPairGuard; use pq_proto::StartupMessageParams; use std::{io, net::SocketAddr, time::Duration}; use thiserror::Error; @@ -249,7 +248,7 @@ pub struct PostgresConnection { /// Labels for proxy's metrics. pub aux: MetricsAuxInfo, - _guage: IntCounterPairGuard, + _guage: NumDbConnectionsGuard<'static>, } impl ConnCfg { @@ -295,9 +294,7 @@ impl ConnCfg { params, cancel_closure, aux, - _guage: NUM_DB_CONNECTIONS_GAUGE - .with_label_values(&[ctx.protocol]) - .guard(), + _guage: Metrics::get().proxy.db_connections.guard(ctx.protocol), }; Ok(connection) diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs index 45161f5ac8..9869b95768 100644 --- a/proxy/src/console/messages.rs +++ b/proxy/src/console/messages.rs @@ -1,3 +1,4 @@ +use measured::FixedCardinalityLabel; use serde::{Deserialize, Serialize}; use std::fmt; @@ -102,7 +103,7 @@ pub struct MetricsAuxInfo { pub cold_start_info: ColdStartInfo, } -#[derive(Debug, Default, Serialize, Deserialize, Clone, Copy)] +#[derive(Debug, Default, Serialize, Deserialize, Clone, Copy, FixedCardinalityLabel)] #[serde(rename_all = "snake_case")] pub enum ColdStartInfo { #[default] @@ -110,9 +111,11 @@ pub enum ColdStartInfo { /// Compute was already running Warm, #[serde(rename = "pool_hit")] + #[label(rename = "pool_hit")] /// Compute was not running but there was an available VM VmPoolHit, #[serde(rename = "pool_miss")] + #[label(rename = "pool_miss")] /// Compute was not running and there were no VMs available VmPoolMiss, diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index f7d621fb12..b9502f0722 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -13,6 +13,7 @@ use crate::{ config::{CacheOptions, ProjectInfoCacheOptions}, context::RequestMonitoring, intern::ProjectIdInt, + metrics::ApiLockMetrics, scram, EndpointCacheKey, }; use dashmap::DashMap; @@ -441,10 +442,7 @@ pub struct ApiLocks { node_locks: DashMap>, permits: usize, timeout: Duration, - registered: prometheus::IntCounter, - unregistered: prometheus::IntCounter, - reclamation_lag: prometheus::Histogram, - lock_acquire_lag: prometheus::Histogram, + metrics: &'static ApiLockMetrics, } impl ApiLocks { @@ -453,54 +451,14 @@ impl ApiLocks { permits: usize, shards: usize, timeout: Duration, + metrics: &'static ApiLockMetrics, ) 
-> prometheus::Result { - let registered = prometheus::IntCounter::with_opts( - prometheus::Opts::new( - "semaphores_registered", - "Number of semaphores registered in this api lock", - ) - .namespace(name), - )?; - prometheus::register(Box::new(registered.clone()))?; - let unregistered = prometheus::IntCounter::with_opts( - prometheus::Opts::new( - "semaphores_unregistered", - "Number of semaphores unregistered in this api lock", - ) - .namespace(name), - )?; - prometheus::register(Box::new(unregistered.clone()))?; - let reclamation_lag = prometheus::Histogram::with_opts( - prometheus::HistogramOpts::new( - "reclamation_lag_seconds", - "Time it takes to reclaim unused semaphores in the api lock", - ) - .namespace(name) - // 1us -> 65ms - // benchmarks on my mac indicate it's usually in the range of 256us and 512us - .buckets(prometheus::exponential_buckets(1e-6, 2.0, 16)?), - )?; - prometheus::register(Box::new(reclamation_lag.clone()))?; - let lock_acquire_lag = prometheus::Histogram::with_opts( - prometheus::HistogramOpts::new( - "semaphore_acquire_seconds", - "Time it takes to reclaim unused semaphores in the api lock", - ) - .namespace(name) - // 0.1ms -> 6s - .buckets(prometheus::exponential_buckets(1e-4, 2.0, 16)?), - )?; - prometheus::register(Box::new(lock_acquire_lag.clone()))?; - Ok(Self { name, node_locks: DashMap::with_shard_amount(shards), permits, timeout, - lock_acquire_lag, - registered, - unregistered, - reclamation_lag, + metrics, }) } @@ -520,7 +478,7 @@ impl ApiLocks { self.node_locks .entry(key.clone()) .or_insert_with(|| { - self.registered.inc(); + self.metrics.semaphores_registered.inc(); Arc::new(Semaphore::new(self.permits)) }) .clone() @@ -528,8 +486,9 @@ impl ApiLocks { }; let permit = tokio::time::timeout_at(now + self.timeout, semaphore.acquire_owned()).await; - self.lock_acquire_lag - .observe((Instant::now() - now).as_secs_f64()); + self.metrics + .semaphore_acquire_seconds + .observe(now.elapsed().as_secs_f64()); Ok(WakeComputePermit { permit: Some(permit??), @@ -554,13 +513,13 @@ impl ApiLocks { "performing epoch reclamation on api lock" ); let mut lock = shard.write(); - let timer = self.reclamation_lag.start_timer(); + let timer = self.metrics.reclamation_lag_seconds.start_timer(); let count = lock .extract_if(|_, semaphore| Arc::strong_count(semaphore.get_mut()) == 1) .count(); drop(lock); - self.unregistered.inc_by(count as u64); - timer.observe_duration() + self.metrics.semaphores_unregistered.inc_by(count as u64); + timer.observe(); } } } diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 1a3e2ca795..9ac1900324 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -7,13 +7,14 @@ use super::{ NodeInfo, }; use crate::{ - auth::backend::ComputeUserInfo, compute, console::messages::ColdStartInfo, http, scram, -}; -use crate::{ - cache::Cached, - context::RequestMonitoring, - metrics::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER}, + auth::backend::ComputeUserInfo, + compute, + console::messages::ColdStartInfo, + http, + metrics::{CacheOutcome, Metrics}, + scram, }; +use crate::{cache::Cached, context::RequestMonitoring}; use futures::TryFutureExt; use std::sync::Arc; use tokio::time::Instant; @@ -95,7 +96,10 @@ impl Api { Some(secret) }; let allowed_ips = body.allowed_ips.unwrap_or_default(); - ALLOWED_IPS_NUMBER.observe(allowed_ips.len() as f64); + Metrics::get() + .proxy + .allowed_ips_number + .observe(allowed_ips.len() as f64); Ok(AuthInfo { secret, allowed_ips, @@ 
-206,14 +210,16 @@ impl super::Api for Api { ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { let ep = &user_info.endpoint; if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(ep) { - ALLOWED_IPS_BY_CACHE_OUTCOME - .with_label_values(&["hit"]) - .inc(); + Metrics::get() + .proxy + .allowed_ips_cache_misses + .inc(CacheOutcome::Hit); return Ok((allowed_ips, None)); } - ALLOWED_IPS_BY_CACHE_OUTCOME - .with_label_values(&["miss"]) - .inc(); + Metrics::get() + .proxy + .allowed_ips_cache_misses + .inc(CacheOutcome::Miss); let auth_info = self.do_get_auth_info(ctx, user_info).await?; let allowed_ips = Arc::new(auth_info.allowed_ips); let user = &user_info.user; diff --git a/proxy/src/context.rs b/proxy/src/context.rs index fec95f4722..0094235921 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -12,7 +12,7 @@ use crate::{ console::messages::{ColdStartInfo, MetricsAuxInfo}, error::ErrorKind, intern::{BranchIdInt, ProjectIdInt}, - metrics::{LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND}, + metrics::{LatencyTimer, Metrics, Protocol}, DbName, EndpointId, RoleName, }; @@ -29,7 +29,7 @@ static LOG_CHAN: OnceCell> = OnceCell::ne pub struct RequestMonitoring { pub peer_addr: IpAddr, pub session_id: Uuid, - pub protocol: &'static str, + pub protocol: Protocol, first_packet: chrono::DateTime, region: &'static str, pub span: Span, @@ -65,7 +65,7 @@ impl RequestMonitoring { pub fn new( session_id: Uuid, peer_addr: IpAddr, - protocol: &'static str, + protocol: Protocol, region: &'static str, ) -> Self { let span = info_span!( @@ -102,7 +102,7 @@ impl RequestMonitoring { #[cfg(test)] pub fn test() -> Self { - RequestMonitoring::new(Uuid::now_v7(), [127, 0, 0, 1].into(), "test", "test") + RequestMonitoring::new(Uuid::now_v7(), [127, 0, 0, 1].into(), Protocol::Tcp, "test") } pub fn console_application_name(&self) -> String { @@ -134,9 +134,9 @@ impl RequestMonitoring { pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) { if self.endpoint_id.is_none() { self.span.record("ep", display(&endpoint_id)); - crate::metrics::CONNECTING_ENDPOINTS - .with_label_values(&[self.protocol]) - .measure(&endpoint_id); + let metric = &Metrics::get().proxy.connecting_endpoints; + let label = metric.with_labels(self.protocol); + metric.get_metric(label).measure(&endpoint_id); self.endpoint_id = Some(endpoint_id); } } @@ -158,13 +158,11 @@ impl RequestMonitoring { } pub fn set_error_kind(&mut self, kind: ErrorKind) { - ERROR_BY_KIND - .with_label_values(&[kind.to_metric_label()]) - .inc(); + Metrics::get().proxy.errors_total.inc(kind); if let Some(ep) = &self.endpoint_id { - ENDPOINT_ERRORS_BY_KIND - .with_label_values(&[kind.to_metric_label()]) - .measure(ep); + let metric = &Metrics::get().proxy.endpoints_affected_by_errors; + let label = metric.with_labels(kind); + metric.get_metric(label).measure(ep); } self.error_kind = Some(kind); } diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index eb77409429..e061216d15 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -111,7 +111,7 @@ impl From<&RequestMonitoring> for RequestData { super::AuthMethod::ScramSha256Plus => "scram_sha_256_plus", super::AuthMethod::Cleartext => "cleartext", }), - protocol: value.protocol, + protocol: value.protocol.as_str(), region: value.region, error: value.error_kind.as_ref().map(|e| e.to_metric_label()), success: value.success, diff --git a/proxy/src/error.rs b/proxy/src/error.rs index 4614f3913d..fdfe50a494 100644 --- a/proxy/src/error.rs +++ 
b/proxy/src/error.rs @@ -1,5 +1,7 @@ use std::{error::Error as StdError, fmt, io}; +use measured::FixedCardinalityLabel; + /// Upcast (almost) any error into an opaque [`io::Error`]. pub fn io_error(e: impl Into>) -> io::Error { io::Error::new(io::ErrorKind::Other, e) @@ -29,24 +31,29 @@ pub trait UserFacingError: ReportableError { } } -#[derive(Copy, Clone, Debug, Eq, PartialEq)] +#[derive(Copy, Clone, Debug, Eq, PartialEq, FixedCardinalityLabel)] +#[label(singleton = "type")] pub enum ErrorKind { /// Wrong password, unknown endpoint, protocol violation, etc... User, /// Network error between user and proxy. Not necessarily user error + #[label(rename = "clientdisconnect")] ClientDisconnect, /// Proxy self-imposed user rate limits + #[label(rename = "ratelimit")] RateLimit, /// Proxy self-imposed service-wise rate limits + #[label(rename = "serviceratelimit")] ServiceRateLimit, /// internal errors Service, /// Error communicating with control plane + #[label(rename = "controlplane")] ControlPlane, /// Postgres error diff --git a/proxy/src/http.rs b/proxy/src/http.rs index 59e1492ed4..95ca0ccd5c 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -13,7 +13,11 @@ pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; use tokio::time::Instant; use tracing::trace; -use crate::{metrics::CONSOLE_REQUEST_LATENCY, rate_limiter, url::ApiUrl}; +use crate::{ + metrics::{ConsoleRequest, Metrics}, + rate_limiter, + url::ApiUrl, +}; use reqwest_middleware::RequestBuilder; /// This is the preferred way to create new http clients, @@ -90,13 +94,14 @@ impl Endpoint { /// Execute a [request](reqwest::Request). pub async fn execute(&self, request: Request) -> Result { - let path = request.url().path().to_string(); - let start = Instant::now(); - let res = self.client.execute(request).await; - CONSOLE_REQUEST_LATENCY - .with_label_values(&[&path]) - .observe(start.elapsed().as_secs_f64()); - res + let _timer = Metrics::get() + .proxy + .console_request_latency + .start_timer(ConsoleRequest { + request: request.url().path(), + }); + + self.client.execute(request).await } } diff --git a/proxy/src/http/health_server.rs b/proxy/src/http/health_server.rs index cbb17ebcb7..cae9eb5b97 100644 --- a/proxy/src/http/health_server.rs +++ b/proxy/src/http/health_server.rs @@ -1,30 +1,49 @@ use anyhow::{anyhow, bail}; -use hyper::{Body, Request, Response, StatusCode}; -use std::{convert::Infallible, net::TcpListener}; -use tracing::info; +use hyper::{header::CONTENT_TYPE, Body, Request, Response, StatusCode}; +use measured::{text::BufferedTextEncoder, MetricGroup}; +use metrics::NeonMetrics; +use std::{ + convert::Infallible, + net::TcpListener, + sync::{Arc, Mutex}, +}; +use tracing::{info, info_span}; use utils::http::{ - endpoint::{self, prometheus_metrics_handler, request_span}, + endpoint::{self, request_span}, error::ApiError, json::json_response, RouterBuilder, RouterService, }; +use crate::jemalloc; + async fn status_handler(_: Request) -> Result, ApiError> { json_response(StatusCode::OK, "") } -fn make_router() -> RouterBuilder { +fn make_router(metrics: AppMetrics) -> RouterBuilder { + let state = Arc::new(Mutex::new(PrometheusHandler { + encoder: BufferedTextEncoder::new(), + metrics, + })); + endpoint::make_router() - .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) + .get("/metrics", move |r| { + let state = state.clone(); + request_span(r, move |b| prometheus_metrics_handler(b, state)) + }) .get("/v1/status", status_handler) } -pub async fn 
task_main(http_listener: TcpListener) -> anyhow::Result { +pub async fn task_main( + http_listener: TcpListener, + metrics: AppMetrics, +) -> anyhow::Result { scopeguard::defer! { info!("http has shut down"); } - let service = || RouterService::new(make_router().build()?); + let service = || RouterService::new(make_router(metrics).build()?); hyper::Server::from_tcp(http_listener)? .serve(service().map_err(|e| anyhow!(e))?) @@ -32,3 +51,57 @@ pub async fn task_main(http_listener: TcpListener) -> anyhow::Result bail!("hyper server without shutdown handling cannot shutdown successfully"); } + +struct PrometheusHandler { + encoder: BufferedTextEncoder, + metrics: AppMetrics, +} + +#[derive(MetricGroup)] +pub struct AppMetrics { + #[metric(namespace = "jemalloc")] + pub jemalloc: Option, + #[metric(flatten)] + pub neon_metrics: NeonMetrics, + #[metric(flatten)] + pub proxy: &'static crate::metrics::Metrics, +} + +async fn prometheus_metrics_handler( + _req: Request, + state: Arc>, +) -> Result, ApiError> { + let started_at = std::time::Instant::now(); + + let span = info_span!("blocking"); + let body = tokio::task::spawn_blocking(move || { + let _span = span.entered(); + + let mut state = state.lock().unwrap(); + let PrometheusHandler { encoder, metrics } = &mut *state; + + metrics + .collect_group_into(&mut *encoder) + .unwrap_or_else(|infallible| match infallible {}); + + let body = encoder.finish(); + + tracing::info!( + bytes = body.len(), + elapsed_ms = started_at.elapsed().as_millis(), + "responded /metrics" + ); + + body + }) + .await + .unwrap(); + + let response = Response::builder() + .status(200) + .header(CONTENT_TYPE, "text/plain; version=0.0.4") + .body(Body::from(body)) + .unwrap(); + + Ok(response) +} diff --git a/proxy/src/jemalloc.rs b/proxy/src/jemalloc.rs index ed20798d56..3243e6a140 100644 --- a/proxy/src/jemalloc.rs +++ b/proxy/src/jemalloc.rs @@ -1,27 +1,45 @@ -use std::time::Duration; +use std::marker::PhantomData; -use metrics::IntGauge; -use prometheus::{register_int_gauge_with_registry, Registry}; +use measured::{ + label::NoLabels, + metric::{ + gauge::GaugeState, group::Encoding, group::MetricValue, name::MetricNameEncoder, + MetricEncoding, MetricFamilyEncoding, MetricType, + }, + text::TextEncoder, + LabelGroup, MetricGroup, +}; use tikv_jemalloc_ctl::{config, epoch, epoch_mib, stats, version}; pub struct MetricRecorder { epoch: epoch_mib, - active: stats::active_mib, - active_gauge: IntGauge, - allocated: stats::allocated_mib, - allocated_gauge: IntGauge, - mapped: stats::mapped_mib, - mapped_gauge: IntGauge, - metadata: stats::metadata_mib, - metadata_gauge: IntGauge, - resident: stats::resident_mib, - resident_gauge: IntGauge, - retained: stats::retained_mib, - retained_gauge: IntGauge, + inner: Metrics, +} + +#[derive(MetricGroup)] +struct Metrics { + active_bytes: JemallocGaugeFamily, + allocated_bytes: JemallocGaugeFamily, + mapped_bytes: JemallocGaugeFamily, + metadata_bytes: JemallocGaugeFamily, + resident_bytes: JemallocGaugeFamily, + retained_bytes: JemallocGaugeFamily, +} + +impl MetricGroup for MetricRecorder +where + Metrics: MetricGroup, +{ + fn collect_group_into(&self, enc: &mut Enc) -> Result<(), Enc::Err> { + if self.epoch.advance().is_ok() { + self.inner.collect_group_into(enc)?; + } + Ok(()) + } } impl MetricRecorder { - pub fn new(registry: &Registry) -> Result { + pub fn new() -> Result { tracing::info!( config = config::malloc_conf::read()?, version = version::read()?, @@ -30,71 +48,69 @@ impl MetricRecorder { Ok(Self { epoch: epoch::mib()?, 
- active: stats::active::mib()?, - active_gauge: register_int_gauge_with_registry!( - "jemalloc_active_bytes", - "Total number of bytes in active pages allocated by the process", - registry - )?, - allocated: stats::allocated::mib()?, - allocated_gauge: register_int_gauge_with_registry!( - "jemalloc_allocated_bytes", - "Total number of bytes allocated by the process", - registry - )?, - mapped: stats::mapped::mib()?, - mapped_gauge: register_int_gauge_with_registry!( - "jemalloc_mapped_bytes", - "Total number of bytes in active extents mapped by the allocator", - registry - )?, - metadata: stats::metadata::mib()?, - metadata_gauge: register_int_gauge_with_registry!( - "jemalloc_metadata_bytes", - "Total number of bytes dedicated to jemalloc metadata", - registry - )?, - resident: stats::resident::mib()?, - resident_gauge: register_int_gauge_with_registry!( - "jemalloc_resident_bytes", - "Total number of bytes in physically resident data pages mapped by the allocator", - registry - )?, - retained: stats::retained::mib()?, - retained_gauge: register_int_gauge_with_registry!( - "jemalloc_retained_bytes", - "Total number of bytes in virtual memory mappings that were retained rather than being returned to the operating system", - registry - )?, - }) - } - - fn _poll(&self) -> Result<(), anyhow::Error> { - self.epoch.advance()?; - self.active_gauge.set(self.active.read()? as i64); - self.allocated_gauge.set(self.allocated.read()? as i64); - self.mapped_gauge.set(self.mapped.read()? as i64); - self.metadata_gauge.set(self.metadata.read()? as i64); - self.resident_gauge.set(self.resident.read()? as i64); - self.retained_gauge.set(self.retained.read()? as i64); - Ok(()) - } - - #[inline] - pub fn poll(&self) { - if let Err(error) = self._poll() { - tracing::warn!(%error, "Failed to poll jemalloc stats"); - } - } - - pub fn start(self) -> tokio::task::JoinHandle<()> { - tokio::task::spawn(async move { - let mut interval = tokio::time::interval(Duration::from_secs(15)); - interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); - loop { - self.poll(); - interval.tick().await; - } + inner: Metrics { + active_bytes: JemallocGaugeFamily(stats::active::mib()?), + allocated_bytes: JemallocGaugeFamily(stats::allocated::mib()?), + mapped_bytes: JemallocGaugeFamily(stats::mapped::mib()?), + metadata_bytes: JemallocGaugeFamily(stats::metadata::mib()?), + resident_bytes: JemallocGaugeFamily(stats::resident::mib()?), + retained_bytes: JemallocGaugeFamily(stats::retained::mib()?), + }, }) } } + +struct JemallocGauge(PhantomData); + +impl Default for JemallocGauge { + fn default() -> Self { + JemallocGauge(PhantomData) + } +} +impl MetricType for JemallocGauge { + type Metadata = T; +} + +struct JemallocGaugeFamily(T); +impl MetricFamilyEncoding for JemallocGaugeFamily +where + JemallocGauge: MetricEncoding, +{ + fn collect_family_into(&self, name: impl MetricNameEncoder, enc: &mut T) -> Result<(), T::Err> { + JemallocGauge::write_type(&name, enc)?; + JemallocGauge(PhantomData).collect_into(&self.0, NoLabels, name, enc) + } +} + +macro_rules! 
jemalloc_gauge { + ($stat:ident, $mib:ident) => { + impl MetricEncoding> for JemallocGauge { + fn write_type( + name: impl MetricNameEncoder, + enc: &mut TextEncoder, + ) -> Result<(), std::io::Error> { + GaugeState::write_type(name, enc) + } + + fn collect_into( + &self, + mib: &stats::$mib, + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut TextEncoder, + ) -> Result<(), std::io::Error> { + if let Ok(v) = mib.read() { + enc.write_metric_value(name, labels, MetricValue::Int(v as i64))?; + } + Ok(()) + } + } + }; +} + +jemalloc_gauge!(active, active_mib); +jemalloc_gauge!(allocated, allocated_mib); +jemalloc_gauge!(mapped, mapped_mib); +jemalloc_gauge!(metadata, metadata_mib); +jemalloc_gauge!(resident, resident_mib); +jemalloc_gauge!(retained, retained_mib); diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 59ee899c08..78840f5983 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -1,176 +1,356 @@ -use ::metrics::{ - exponential_buckets, register_histogram, register_histogram_vec, register_hll_vec, - register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, - register_int_gauge_vec, Histogram, HistogramVec, HyperLogLogVec, IntCounterPairVec, - IntCounterVec, IntGauge, IntGaugeVec, -}; -use metrics::{ - register_hll, register_int_counter, register_int_counter_pair, HyperLogLog, IntCounter, - IntCounterPair, -}; +use std::sync::OnceLock; + +use lasso::ThreadedRodeo; +use measured::{ + label::StaticLabelSet, + metric::{histogram::Thresholds, name::MetricName}, + Counter, CounterVec, FixedCardinalityLabel, Gauge, GaugeVec, Histogram, HistogramVec, + LabelGroup, MetricGroup, +}; +use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec}; -use once_cell::sync::Lazy; use tokio::time::{self, Instant}; use crate::console::messages::ColdStartInfo; -pub static NUM_DB_CONNECTIONS_GAUGE: Lazy = Lazy::new(|| { - register_int_counter_pair_vec!( - "proxy_opened_db_connections_total", - "Number of opened connections to a database.", - "proxy_closed_db_connections_total", - "Number of closed connections to a database.", - &["protocol"], - ) - .unwrap() -}); +#[derive(MetricGroup)] +pub struct Metrics { + #[metric(namespace = "proxy")] + pub proxy: ProxyMetrics, -pub static NUM_CLIENT_CONNECTION_GAUGE: Lazy = Lazy::new(|| { - register_int_counter_pair_vec!( - "proxy_opened_client_connections_total", - "Number of opened connections from a client.", - "proxy_closed_client_connections_total", - "Number of closed connections from a client.", - &["protocol"], - ) - .unwrap() -}); + #[metric(namespace = "wake_compute_lock")] + pub wake_compute_lock: ApiLockMetrics, -pub static NUM_CONNECTION_REQUESTS_GAUGE: Lazy = Lazy::new(|| { - register_int_counter_pair_vec!( - "proxy_accepted_connections_total", - "Number of client connections accepted.", - "proxy_closed_connections_total", - "Number of client connections closed.", - &["protocol"], - ) - .unwrap() -}); + // the one metric not called proxy_.... 
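    // (it keeps the exact name the old prometheus gauge was registered under,
    // `semaphore_control_plane_limit`, rather than gaining a `proxy_` prefix —
    // presumably so the exported series name stays unchanged)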
+ pub semaphore_control_plane_limit: GaugeVec>, +} -pub static COMPUTE_CONNECTION_LATENCY: Lazy = Lazy::new(|| { - register_histogram_vec!( - "proxy_compute_connection_latency_seconds", - "Time it took for proxy to establish a connection to the compute endpoint", - // http/ws/tcp, true/false, true/false, success/failure, client/client_and_cplane - // 3 * 6 * 2 * 2 = 72 counters - &["protocol", "cold_start_info", "outcome", "excluded"], - // largest bucket = 2^16 * 0.5ms = 32s - exponential_buckets(0.0005, 2.0, 16).unwrap(), - ) - .unwrap() -}); +impl Metrics { + pub fn get() -> &'static Self { + static SELF: OnceLock = OnceLock::new(); + SELF.get_or_init(|| Metrics { + proxy: ProxyMetrics::default(), + wake_compute_lock: ApiLockMetrics::new(), + semaphore_control_plane_limit: GaugeVec::default(), + }) + } +} -pub static CONSOLE_REQUEST_LATENCY: Lazy = Lazy::new(|| { - register_histogram_vec!( - "proxy_console_request_latency", - "Time it took for proxy to establish a connection to the compute endpoint", - // proxy_wake_compute/proxy_get_role_info - &["request"], +#[derive(MetricGroup)] +#[metric(new())] +pub struct ProxyMetrics { + #[metric(flatten)] + pub db_connections: CounterPairVec, + #[metric(flatten)] + pub client_connections: CounterPairVec, + #[metric(flatten)] + pub connection_requests: CounterPairVec, + #[metric(flatten)] + pub http_endpoint_pools: HttpEndpointPools, + + /// Time it took for proxy to establish a connection to the compute endpoint. + // largest bucket = 2^16 * 0.5ms = 32s + #[metric(metadata = Thresholds::exponential_buckets(0.0005, 2.0))] + pub compute_connection_latency_seconds: HistogramVec, + + /// Time it took for proxy to receive a response from control plane. + #[metric( // largest bucket = 2^16 * 0.2ms = 13s - exponential_buckets(0.0002, 2.0, 16).unwrap(), - ) - .unwrap() -}); + metadata = Thresholds::exponential_buckets(0.0002, 2.0), + )] + pub console_request_latency: HistogramVec, -pub static ALLOWED_IPS_BY_CACHE_OUTCOME: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_allowed_ips_cache_misses", - "Number of cache hits/misses for allowed ips", - // hit/miss - &["outcome"], - ) - .unwrap() -}); + /// Time it takes to acquire a token to call console plane. + // largest bucket = 3^16 * 0.05ms = 2.15s + #[metric(metadata = Thresholds::exponential_buckets(0.00005, 3.0))] + pub control_plane_token_acquire_seconds: Histogram<16>, -pub static RATE_LIMITER_ACQUIRE_LATENCY: Lazy = Lazy::new(|| { - register_histogram!( - "proxy_control_plane_token_acquire_seconds", - "Time it took for proxy to establish a connection to the compute endpoint", - // largest bucket = 3^16 * 0.05ms = 2.15s - exponential_buckets(0.00005, 3.0, 16).unwrap(), - ) - .unwrap() -}); + /// Size of the HTTP request body lengths. + // smallest bucket = 16 bytes + // largest bucket = 4^12 * 16 bytes = 256MB + #[metric(metadata = Thresholds::exponential_buckets(16.0, 4.0))] + pub http_conn_content_length_bytes: HistogramVec, 12>, -pub static RATE_LIMITER_LIMIT: Lazy = Lazy::new(|| { - register_int_gauge_vec!( - "semaphore_control_plane_limit", - "Current limit of the semaphore control plane", - &["limit"], // 2 counters - ) - .unwrap() -}); + /// Time it takes to reclaim unused connection pools. 
+ #[metric(metadata = Thresholds::exponential_buckets(1e-6, 2.0))] + pub http_pool_reclaimation_lag_seconds: Histogram<16>, -pub static NUM_CONNECTION_ACCEPTED_BY_SNI: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_accepted_connections_by_sni", - "Number of connections (per sni).", - &["kind"], - ) - .unwrap() -}); + /// Number of opened connections to a database. + pub http_pool_opened_connections: Gauge, -pub static ALLOWED_IPS_NUMBER: Lazy = Lazy::new(|| { - register_histogram!( - "proxy_allowed_ips_number", - "Number of allowed ips", - vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0], - ) - .unwrap() -}); + /// Number of cache hits/misses for allowed ips. + pub allowed_ips_cache_misses: CounterVec>, -pub static HTTP_CONTENT_LENGTH: Lazy = Lazy::new(|| { - register_histogram_vec!( - "proxy_http_conn_content_length_bytes", - "Number of bytes the HTTP response content consumes", - // request/response - &["direction"], - // smallest bucket = 16 bytes - // largest bucket = 4^12 * 16 bytes = 256MB - exponential_buckets(16.0, 4.0, 12).unwrap() - ) - .unwrap() -}); + /// Number of allowed ips + #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))] + pub allowed_ips_number: Histogram<10>, -pub static GC_LATENCY: Lazy = Lazy::new(|| { - register_histogram!( - "proxy_http_pool_reclaimation_lag_seconds", - "Time it takes to reclaim unused connection pools", - // 1us -> 65ms - exponential_buckets(1e-6, 2.0, 16).unwrap(), - ) - .unwrap() -}); + /// Number of connections (per sni). + pub accepted_connections_by_sni: CounterVec>, -pub static ENDPOINT_POOLS: Lazy = Lazy::new(|| { - register_int_counter_pair!( - "proxy_http_pool_endpoints_registered_total", - "Number of endpoints we have registered pools for", - "proxy_http_pool_endpoints_unregistered_total", - "Number of endpoints we have unregistered pools for", - ) - .unwrap() -}); + /// Number of connection failures (per kind). + pub connection_failures_total: CounterVec>, -pub static NUM_OPEN_CLIENTS_IN_HTTP_POOL: Lazy = Lazy::new(|| { - register_int_gauge!( - "proxy_http_pool_opened_connections", - "Number of opened connections to a database.", - ) - .unwrap() -}); + /// Number of wake-up failures (per kind). + pub connection_failures_breakdown: CounterVec, -pub static NUM_CANCELLATION_REQUESTS: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_cancellation_requests_total", - "Number of cancellation requests (per found/not_found).", - &["source", "kind"], - ) - .unwrap() -}); + /// Number of bytes sent/received between all clients and backends. + pub io_bytes: CounterVec>, -pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT: &str = "from_client"; -pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS: &str = "from_redis"; + /// Number of errors by a given classification. + pub errors_total: CounterVec>, + + /// Number of cancellation requests (per found/not_found). 
+ pub cancellation_requests_total: CounterVec, + + /// Number of errors by a given classification + pub redis_errors_total: CounterVec, + + /// Number of TLS handshake failures + pub tls_handshake_failures: Counter, + + /// Number of connection requests affected by authentication rate limits + pub requests_auth_rate_limits_total: Counter, + + /// HLL approximate cardinality of endpoints that are connecting + pub connecting_endpoints: HyperLogLogVec, 32>, + + /// Number of endpoints affected by errors of a given classification + pub endpoints_affected_by_errors: HyperLogLogVec, 32>, + + /// Number of endpoints affected by authentication rate limits + pub endpoints_auth_rate_limits: HyperLogLog<32>, +} + +#[derive(MetricGroup)] +#[metric(new())] +pub struct ApiLockMetrics { + /// Number of semaphores registered in this api lock + pub semaphores_registered: Counter, + /// Number of semaphores unregistered in this api lock + pub semaphores_unregistered: Counter, + /// Time it takes to reclaim unused semaphores in the api lock + #[metric(metadata = Thresholds::exponential_buckets(1e-6, 2.0))] + pub reclamation_lag_seconds: Histogram<16>, + /// Time it takes to acquire a semaphore lock + #[metric(metadata = Thresholds::exponential_buckets(1e-4, 2.0))] + pub semaphore_acquire_seconds: Histogram<16>, +} + +impl Default for ProxyMetrics { + fn default() -> Self { + Self::new() + } +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "direction")] +pub enum HttpDirection { + Request, + Response, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "direction")] +pub enum Direction { + Tx, + Rx, +} + +#[derive(FixedCardinalityLabel, Clone, Copy, Debug)] +#[label(singleton = "protocol")] +pub enum Protocol { + Http, + Ws, + Tcp, + SniRouter, +} + +impl Protocol { + pub fn as_str(&self) -> &'static str { + match self { + Protocol::Http => "http", + Protocol::Ws => "ws", + Protocol::Tcp => "tcp", + Protocol::SniRouter => "sni_router", + } + } +} + +impl std::fmt::Display for Protocol { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(self.as_str()) + } +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +pub enum Bool { + True, + False, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "outcome")] +pub enum Outcome { + Success, + Failed, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "outcome")] +pub enum CacheOutcome { + Hit, + Miss, +} + +#[derive(LabelGroup)] +#[label(set = ConsoleRequestSet)] +pub struct ConsoleRequest<'a> { + #[label(dynamic_with = ThreadedRodeo, default)] + pub request: &'a str, +} + +#[derive(MetricGroup, Default)] +pub struct HttpEndpointPools { + /// Number of endpoints we have registered pools for + pub http_pool_endpoints_registered_total: Counter, + /// Number of endpoints we have unregistered pools for + pub http_pool_endpoints_unregistered_total: Counter, +} + +pub struct HttpEndpointPoolsGuard<'a> { + dec: &'a Counter, +} + +impl Drop for HttpEndpointPoolsGuard<'_> { + fn drop(&mut self) { + self.dec.inc(); + } +} + +impl HttpEndpointPools { + pub fn guard(&self) -> HttpEndpointPoolsGuard { + self.http_pool_endpoints_registered_total.inc(); + HttpEndpointPoolsGuard { + dec: &self.http_pool_endpoints_unregistered_total, + } + } +} +pub struct NumDbConnectionsGauge; +impl CounterPairAssoc for NumDbConnectionsGauge { + const INC_NAME: &'static MetricName = MetricName::from_str("opened_db_connections_total"); + const DEC_NAME: &'static MetricName = 
MetricName::from_str("closed_db_connections_total"); + const INC_HELP: &'static str = "Number of opened connections to a database."; + const DEC_HELP: &'static str = "Number of closed connections to a database."; + type LabelGroupSet = StaticLabelSet; +} +pub type NumDbConnectionsGuard<'a> = metrics::MeasuredCounterPairGuard<'a, NumDbConnectionsGauge>; + +pub struct NumClientConnectionsGauge; +impl CounterPairAssoc for NumClientConnectionsGauge { + const INC_NAME: &'static MetricName = MetricName::from_str("opened_client_connections_total"); + const DEC_NAME: &'static MetricName = MetricName::from_str("closed_client_connections_total"); + const INC_HELP: &'static str = "Number of opened connections from a client."; + const DEC_HELP: &'static str = "Number of closed connections from a client."; + type LabelGroupSet = StaticLabelSet; +} +pub type NumClientConnectionsGuard<'a> = + metrics::MeasuredCounterPairGuard<'a, NumClientConnectionsGauge>; + +pub struct NumConnectionRequestsGauge; +impl CounterPairAssoc for NumConnectionRequestsGauge { + const INC_NAME: &'static MetricName = MetricName::from_str("accepted_connections_total"); + const DEC_NAME: &'static MetricName = MetricName::from_str("closed_connections_total"); + const INC_HELP: &'static str = "Number of client connections accepted."; + const DEC_HELP: &'static str = "Number of client connections closed."; + type LabelGroupSet = StaticLabelSet; +} +pub type NumConnectionRequestsGuard<'a> = + metrics::MeasuredCounterPairGuard<'a, NumConnectionRequestsGauge>; + +#[derive(LabelGroup)] +#[label(set = ComputeConnectionLatencySet)] +pub struct ComputeConnectionLatencyGroup { + protocol: Protocol, + cold_start_info: ColdStartInfo, + outcome: ConnectOutcome, + excluded: LatencyExclusions, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +pub enum LatencyExclusions { + Client, + ClientAndCplane, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "limit")] +pub enum RateLimit { + Actual, + Expected, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "kind")] +pub enum SniKind { + Sni, + NoSni, + PasswordHack, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "kind")] +pub enum ConnectionFailureKind { + ComputeCached, + ComputeUncached, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "kind")] +pub enum WakeupFailureKind { + BadComputeAddress, + ApiTransportError, + QuotaExceeded, + ApiConsoleLocked, + ApiConsoleBadRequest, + ApiConsoleOtherServerError, + ApiConsoleOtherError, + TimeoutError, +} + +#[derive(LabelGroup)] +#[label(set = ConnectionFailuresBreakdownSet)] +pub struct ConnectionFailuresBreakdownGroup { + pub kind: WakeupFailureKind, + pub retry: Bool, +} + +#[derive(LabelGroup, Copy, Clone)] +#[label(set = RedisErrorsSet)] +pub struct RedisErrors<'a> { + #[label(dynamic_with = ThreadedRodeo, default)] + pub channel: &'a str, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +pub enum CancellationSource { + FromClient, + FromRedis, + Local, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +pub enum CancellationOutcome { + NotFound, + Found, +} + +#[derive(LabelGroup)] +#[label(set = CancellationRequestSet)] +pub struct CancellationRequest { + pub source: CancellationSource, + pub kind: CancellationOutcome, +} pub enum Waiting { Cplane, @@ -185,20 +365,6 @@ struct Accumulated { compute: time::Duration, } -enum Outcome { - Success, - Failed, -} - -impl Outcome { - fn as_str(&self) -> &'static str { - match self { - Outcome::Success => 
"success", - Outcome::Failed => "failed", - } - } -} - pub struct LatencyTimer { // time since the stopwatch was started start: time::Instant, @@ -207,9 +373,9 @@ pub struct LatencyTimer { // accumulated time on the stopwatch accumulated: Accumulated, // label data - protocol: &'static str, + protocol: Protocol, cold_start_info: ColdStartInfo, - outcome: Outcome, + outcome: ConnectOutcome, } pub struct LatencyTimerPause<'a> { @@ -219,7 +385,7 @@ pub struct LatencyTimerPause<'a> { } impl LatencyTimer { - pub fn new(protocol: &'static str) -> Self { + pub fn new(protocol: Protocol) -> Self { Self { start: time::Instant::now(), stop: None, @@ -227,7 +393,7 @@ impl LatencyTimer { protocol, cold_start_info: ColdStartInfo::Unknown, // assume failed unless otherwise specified - outcome: Outcome::Failed, + outcome: ConnectOutcome::Failed, } } @@ -248,7 +414,7 @@ impl LatencyTimer { self.stop = Some(time::Instant::now()); // success - self.outcome = Outcome::Success; + self.outcome = ConnectOutcome::Success; } } @@ -263,128 +429,54 @@ impl Drop for LatencyTimerPause<'_> { } } +#[derive(FixedCardinalityLabel, Clone, Copy, Debug)] +enum ConnectOutcome { + Success, + Failed, +} + impl Drop for LatencyTimer { fn drop(&mut self) { let duration = self .stop .unwrap_or_else(time::Instant::now) .duration_since(self.start); - // Excluding cplane communication from the accumulated time. - COMPUTE_CONNECTION_LATENCY - .with_label_values(&[ - self.protocol, - self.cold_start_info.as_str(), - self.outcome.as_str(), - "client", - ]) - .observe((duration.saturating_sub(self.accumulated.client)).as_secs_f64()); + + let metric = &Metrics::get().proxy.compute_connection_latency_seconds; + + // Excluding client communication from the accumulated time. + metric.observe( + ComputeConnectionLatencyGroup { + protocol: self.protocol, + cold_start_info: self.cold_start_info, + outcome: self.outcome, + excluded: LatencyExclusions::Client, + }, + duration + .saturating_sub(self.accumulated.client) + .as_secs_f64(), + ); + // Exclude client and cplane communication from the accumulated time. 
let accumulated_total = self.accumulated.client + self.accumulated.cplane; - COMPUTE_CONNECTION_LATENCY - .with_label_values(&[ - self.protocol, - self.cold_start_info.as_str(), - self.outcome.as_str(), - "client_and_cplane", - ]) - .observe((duration.saturating_sub(accumulated_total)).as_secs_f64()); + metric.observe( + ComputeConnectionLatencyGroup { + protocol: self.protocol, + cold_start_info: self.cold_start_info, + outcome: self.outcome, + excluded: LatencyExclusions::ClientAndCplane, + }, + duration.saturating_sub(accumulated_total).as_secs_f64(), + ); } } -pub static NUM_CONNECTION_FAILURES: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_connection_failures_total", - "Number of connection failures (per kind).", - &["kind"], - ) - .unwrap() -}); - -pub static NUM_WAKEUP_FAILURES: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_connection_failures_breakdown", - "Number of wake-up failures (per kind).", - &["retry", "kind"], - ) - .unwrap() -}); - -pub static NUM_BYTES_PROXIED_COUNTER: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_io_bytes", - "Number of bytes sent/received between all clients and backends.", - &["direction"], - ) - .unwrap() -}); - -pub const fn bool_to_str(x: bool) -> &'static str { - if x { - "true" - } else { - "false" +impl From for Bool { + fn from(value: bool) -> Self { + if value { + Bool::True + } else { + Bool::False + } } } - -pub static CONNECTING_ENDPOINTS: Lazy> = Lazy::new(|| { - register_hll_vec!( - 32, - "proxy_connecting_endpoints", - "HLL approximate cardinality of endpoints that are connecting", - &["protocol"], - ) - .unwrap() -}); - -pub static ERROR_BY_KIND: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_errors_total", - "Number of errors by a given classification", - &["type"], - ) - .unwrap() -}); - -pub static ENDPOINT_ERRORS_BY_KIND: Lazy> = Lazy::new(|| { - register_hll_vec!( - 32, - "proxy_endpoints_affected_by_errors", - "Number of endpoints affected by errors of a given classification", - &["type"], - ) - .unwrap() -}); - -pub static REDIS_BROKEN_MESSAGES: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_redis_errors_total", - "Number of errors by a given classification", - &["channel"], - ) - .unwrap() -}); - -pub static TLS_HANDSHAKE_FAILURES: Lazy = Lazy::new(|| { - register_int_counter!( - "proxy_tls_handshake_failures", - "Number of TLS handshake failures", - ) - .unwrap() -}); - -pub static ENDPOINTS_AUTH_RATE_LIMITED: Lazy> = Lazy::new(|| { - register_hll!( - 32, - "proxy_endpoints_auth_rate_limits", - "Number of endpoints affected by authentication rate limits", - ) - .unwrap() -}); - -pub static AUTH_RATE_LIMIT_HITS: Lazy = Lazy::new(|| { - register_int_counter!( - "proxy_requests_auth_rate_limits_total", - "Number of connection requests affected by authentication rate limits", - ) - .unwrap() -}); diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 6051c0a812..5598215b6b 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -15,7 +15,7 @@ use crate::{ config::{ProxyConfig, TlsConfig}, context::RequestMonitoring, error::ReportableError, - metrics::{NUM_CLIENT_CONNECTION_GAUGE, NUM_CONNECTION_REQUESTS_GAUGE}, + metrics::{Metrics, NumClientConnectionsGuard}, protocol2::WithClientIp, proxy::handshake::{handshake, HandshakeData}, rate_limiter::EndpointRateLimiter, @@ -24,7 +24,6 @@ use crate::{ }; use futures::TryFutureExt; use itertools::Itertools; -use metrics::IntCounterPairGuard; use once_cell::sync::OnceCell; use pq_proto::{BeMessage as Be, 
StartupMessageParams}; use regex::Regex; @@ -79,9 +78,10 @@ pub async fn task_main( { let (socket, peer_addr) = accept_result?; - let conn_gauge = NUM_CLIENT_CONNECTION_GAUGE - .with_label_values(&["tcp"]) - .guard(); + let conn_gauge = Metrics::get() + .proxy + .client_connections + .guard(crate::metrics::Protocol::Tcp); let session_id = uuid::Uuid::new_v4(); let cancellation_handler = Arc::clone(&cancellation_handler); @@ -113,7 +113,12 @@ pub async fn task_main( }, }; - let mut ctx = RequestMonitoring::new(session_id, peer_addr, "tcp", &config.region); + let mut ctx = RequestMonitoring::new( + session_id, + peer_addr, + crate::metrics::Protocol::Tcp, + &config.region, + ); let span = ctx.span.clone(); let res = handle_client( @@ -237,14 +242,17 @@ pub async fn handle_client( stream: S, mode: ClientMode, endpoint_rate_limiter: Arc, - conn_gauge: IntCounterPairGuard, + conn_gauge: NumClientConnectionsGuard<'static>, ) -> Result>, ClientRequestError> { - info!("handling interactive connection from client"); + info!( + protocol = %ctx.protocol, + "handling interactive connection from client" + ); + let metrics = &Metrics::get().proxy; let proto = ctx.protocol; - let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE - .with_label_values(&[proto]) - .guard(); + // let _client_gauge = metrics.client_connections.guard(proto); + let _request_gauge = metrics.connection_requests.guard(proto); let tls = config.tls_config.as_ref(); diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 4c0d68ce0b..33f394c550 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -4,7 +4,7 @@ use crate::{ console::{self, errors::WakeComputeError, CachedNodeInfo, NodeInfo}, context::RequestMonitoring, error::ReportableError, - metrics::NUM_CONNECTION_FAILURES, + metrics::{ConnectionFailureKind, Metrics}, proxy::{ retry::{retry_after, ShouldRetry}, wake_compute::wake_compute, @@ -27,10 +27,10 @@ pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> NodeInfo { warn!("invalidating stalled compute node info cache entry"); } let label = match is_cached { - true => "compute_cached", - false => "compute_uncached", + true => ConnectionFailureKind::ComputeCached, + false => ConnectionFailureKind::ComputeUncached, }; - NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc(); + Metrics::get().proxy.connection_failures_total.inc(label); node_info.invalidate() } diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index c81a1a8292..62de79946f 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -2,11 +2,10 @@ use crate::{ cancellation, compute::PostgresConnection, console::messages::MetricsAuxInfo, - metrics::NUM_BYTES_PROXIED_COUNTER, + metrics::{Direction, Metrics, NumClientConnectionsGuard, NumConnectionRequestsGuard}, stream::Stream, usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS}, }; -use metrics::IntCounterPairGuard; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; use utils::measured_stream::MeasuredStream; @@ -23,24 +22,25 @@ pub async fn proxy_pass( branch_id: aux.branch_id, }); - let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]); + let metrics = &Metrics::get().proxy.io_bytes; + let m_sent = metrics.with_labels(Direction::Tx); let mut client = MeasuredStream::new( client, |_| {}, |cnt| { // Number of bytes we sent to the client (outbound). 
- m_sent.inc_by(cnt as u64); + metrics.get_metric(m_sent).inc_by(cnt as u64); usage.record_egress(cnt as u64); }, ); - let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["rx"]); + let m_recv = metrics.with_labels(Direction::Rx); let mut compute = MeasuredStream::new( compute, |_| {}, |cnt| { // Number of bytes the client sent to the compute node (inbound). - m_recv.inc_by(cnt as u64); + metrics.get_metric(m_recv).inc_by(cnt as u64); }, ); @@ -60,8 +60,8 @@ pub struct ProxyPassthrough { pub compute: PostgresConnection, pub aux: MetricsAuxInfo, - pub req: IntCounterPairGuard, - pub conn: IntCounterPairGuard, + pub req: NumConnectionRequestsGuard<'static>, + pub conn: NumClientConnectionsGuard<'static>, pub cancel: cancellation::Session
, } diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index bfe4b7ec3a..f8154b1a94 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -1,6 +1,6 @@ use crate::console::{errors::WakeComputeError, provider::CachedNodeInfo}; use crate::context::RequestMonitoring; -use crate::metrics::{bool_to_str, NUM_WAKEUP_FAILURES}; +use crate::metrics::{ConnectionFailuresBreakdownGroup, Metrics, WakeupFailureKind}; use crate::proxy::retry::retry_after; use hyper::StatusCode; use std::ops::ControlFlow; @@ -57,39 +57,46 @@ pub fn handle_try_wake( fn report_error(e: &WakeComputeError, retry: bool) { use crate::console::errors::ApiError; - let retry = bool_to_str(retry); let kind = match e { - WakeComputeError::BadComputeAddress(_) => "bad_compute_address", - WakeComputeError::ApiError(ApiError::Transport(_)) => "api_transport_error", + WakeComputeError::BadComputeAddress(_) => WakeupFailureKind::BadComputeAddress, + WakeComputeError::ApiError(ApiError::Transport(_)) => WakeupFailureKind::ApiTransportError, WakeComputeError::ApiError(ApiError::Console { status: StatusCode::LOCKED, ref text, }) if text.contains("written data quota exceeded") || text.contains("the limit for current plan reached") => { - "quota_exceeded" + WakeupFailureKind::QuotaExceeded } WakeComputeError::ApiError(ApiError::Console { status: StatusCode::UNPROCESSABLE_ENTITY, ref text, }) if text.contains("compute time quota of non-primary branches is exceeded") => { - "quota_exceeded" + WakeupFailureKind::QuotaExceeded } WakeComputeError::ApiError(ApiError::Console { status: StatusCode::LOCKED, .. - }) => "api_console_locked", + }) => WakeupFailureKind::ApiConsoleLocked, WakeComputeError::ApiError(ApiError::Console { status: StatusCode::BAD_REQUEST, .. - }) => "api_console_bad_request", + }) => WakeupFailureKind::ApiConsoleBadRequest, WakeComputeError::ApiError(ApiError::Console { status, .. }) if status.is_server_error() => { - "api_console_other_server_error" + WakeupFailureKind::ApiConsoleOtherServerError } - WakeComputeError::ApiError(ApiError::Console { .. }) => "api_console_other_error", - WakeComputeError::TimeoutError => "timeout_error", + WakeComputeError::ApiError(ApiError::Console { .. 
}) => { + WakeupFailureKind::ApiConsoleOtherError + } + WakeComputeError::TimeoutError => WakeupFailureKind::TimeoutError, }; - NUM_WAKEUP_FAILURES.with_label_values(&[retry, kind]).inc(); + Metrics::get() + .proxy + .connection_failures_breakdown + .inc(ConnectionFailuresBreakdownGroup { + kind, + retry: retry.into(), + }); } diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index f590896dd9..aba5120f38 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -17,7 +17,13 @@ use tokio::sync::{Mutex as AsyncMutex, Semaphore, SemaphorePermit}; use tokio::time::{timeout, Duration, Instant}; use tracing::info; -use crate::{intern::EndpointIdInt, EndpointId}; +use crate::{ + intern::EndpointIdInt, + { + metrics::{Metrics, RateLimit}, + EndpointId, + }, +}; use super::{ limit_algorithm::{LimitAlgorithm, Sample}, @@ -457,12 +463,9 @@ impl Limiter { } new_limit }; - crate::metrics::RATE_LIMITER_LIMIT - .with_label_values(&["expected"]) - .set(new_limit as i64); - crate::metrics::RATE_LIMITER_LIMIT - .with_label_values(&["actual"]) - .set(actual_limit as i64); + let metric = &Metrics::get().semaphore_control_plane_limit; + metric.set(RateLimit::Expected, new_limit as i64); + metric.set(RateLimit::Actual, actual_limit as i64); self.limits.store(new_limit, Ordering::Release); #[cfg(test)] if let Some(n) = &self.notifier { @@ -519,7 +522,10 @@ impl reqwest_middleware::Middleware for Limiter { extensions: &mut task_local_extensions::Extensions, next: reqwest_middleware::Next<'_>, ) -> reqwest_middleware::Result { - let start = Instant::now(); + let timer = Metrics::get() + .proxy + .control_plane_token_acquire_seconds + .start_timer(); let token = self .acquire_timeout(self.config.timeout) .await @@ -533,8 +539,12 @@ impl reqwest_middleware::Middleware for Limiter { .into(), ) })?; - info!(duration = ?start.elapsed(), "waiting for token to connect to the control plane"); - crate::metrics::RATE_LIMITER_ACQUIRE_LATENCY.observe(start.elapsed().as_secs_f64()); + let duration = timer.observe(); + info!( + ?duration, + "waiting for token to connect to the control plane" + ); + match next.run(req, extensions).await { Ok(response) => { self.release(token, Some(Outcome::from_reqwest_response(&response))) diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 8b7e3e3419..5a38530faf 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -11,7 +11,7 @@ use crate::{ cache::project_info::ProjectInfoCache, cancellation::{CancelMap, CancellationHandler}, intern::{ProjectIdInt, RoleNameInt}, - metrics::{NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS, REDIS_BROKEN_MESSAGES}, + metrics::{Metrics, RedisErrors}, }; const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; @@ -104,9 +104,9 @@ impl MessageHandler { let msg: Notification = match serde_json::from_str(&payload) { Ok(msg) => msg, Err(e) => { - REDIS_BROKEN_MESSAGES - .with_label_values(&[msg.get_channel_name()]) - .inc(); + Metrics::get().proxy.redis_errors_total.inc(RedisErrors { + channel: msg.get_channel_name(), + }); tracing::error!("broken message: {e}"); return Ok(()); } @@ -183,7 +183,7 @@ where cache, Arc::new(CancellationHandler::<()>::new( cancel_map, - NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS, + crate::metrics::CancellationSource::FromRedis, )), region_id, ); diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index f275caa7eb..24c94fadd8 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ 
-32,7 +32,7 @@ use tokio_util::task::TaskTracker; use crate::cancellation::CancellationHandlerMain; use crate::config::ProxyConfig; use crate::context::RequestMonitoring; -use crate::metrics::{NUM_CLIENT_CONNECTION_GAUGE, TLS_HANDSHAKE_FAILURES}; +use crate::metrics::Metrics; use crate::protocol2::WithClientIp; use crate::proxy::run_until_cancelled; use crate::rate_limiter::EndpointRateLimiter; @@ -156,9 +156,10 @@ async fn connection_handler( ) { let session_id = uuid::Uuid::new_v4(); - let _gauge = NUM_CLIENT_CONNECTION_GAUGE - .with_label_values(&["http"]) - .guard(); + let _gauge = Metrics::get() + .proxy + .client_connections + .guard(crate::metrics::Protocol::Http); // handle PROXY protocol let mut conn = WithClientIp::new(conn); @@ -181,13 +182,13 @@ async fn connection_handler( } // The handshake failed Ok(Err(e)) => { - TLS_HANDSHAKE_FAILURES.inc(); + Metrics::get().proxy.tls_handshake_failures.inc(); warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}"); return; } // The handshake timed out Err(e) => { - TLS_HANDSHAKE_FAILURES.inc(); + Metrics::get().proxy.tls_handshake_failures.inc(); warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}"); return; } @@ -274,7 +275,13 @@ async fn request_handler( // Check if the request is a websocket upgrade request. if hyper_tungstenite::is_upgrade_request(&request) { - let ctx = RequestMonitoring::new(session_id, peer_addr, "ws", &config.region); + let ctx = RequestMonitoring::new( + session_id, + peer_addr, + crate::metrics::Protocol::Ws, + &config.region, + ); + let span = ctx.span.clone(); info!(parent: &span, "performing websocket upgrade"); @@ -302,7 +309,12 @@ async fn request_handler( // Return the response so the spawned future can continue. Ok(response) } else if request.uri().path() == "/sql" && *request.method() == Method::POST { - let ctx = RequestMonitoring::new(session_id, peer_addr, "http", &config.region); + let ctx = RequestMonitoring::new( + session_id, + peer_addr, + crate::metrics::Protocol::Http, + &config.region, + ); let span = ctx.span.clone(); sql_over_http::handle(config, ctx, request, backend, http_cancellation_token) diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 35311facb8..131f088880 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -1,6 +1,5 @@ use dashmap::DashMap; use futures::{future::poll_fn, Future}; -use metrics::IntCounterPairGuard; use parking_lot::RwLock; use rand::Rng; use smallvec::SmallVec; @@ -18,11 +17,10 @@ use tokio_postgres::tls::NoTlsStream; use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket}; use crate::console::messages::{ColdStartInfo, MetricsAuxInfo}; -use crate::metrics::{ENDPOINT_POOLS, GC_LATENCY, NUM_OPEN_CLIENTS_IN_HTTP_POOL}; +use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; use crate::{ - auth::backend::ComputeUserInfo, context::RequestMonitoring, metrics::NUM_DB_CONNECTIONS_GAUGE, - DbName, EndpointCacheKey, RoleName, + auth::backend::ComputeUserInfo, context::RequestMonitoring, DbName, EndpointCacheKey, RoleName, }; use tracing::{debug, error, warn, Span}; @@ -78,7 +76,7 @@ pub struct EndpointConnPool { pools: HashMap<(DbName, RoleName), DbUserConnPool>, total_conns: usize, max_conns: usize, - _guard: IntCounterPairGuard, + _guard: HttpEndpointPoolsGuard<'static>, global_connections_count: Arc, global_pool_size_max_conns: usize, } @@ -110,7 +108,11 @@ impl EndpointConnPool { let removed 
= old_len - new_len; if removed > 0 { global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); - NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(removed as i64); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(removed as i64); } *total_conns -= removed; removed > 0 @@ -156,7 +158,11 @@ impl EndpointConnPool { pool.total_conns += 1; pool.global_connections_count .fetch_add(1, atomic::Ordering::Relaxed); - NUM_OPEN_CLIENTS_IN_HTTP_POOL.inc(); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .inc(); } pool.total_conns @@ -176,7 +182,11 @@ impl Drop for EndpointConnPool { if self.total_conns > 0 { self.global_connections_count .fetch_sub(self.total_conns, atomic::Ordering::Relaxed); - NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(self.total_conns as i64); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(self.total_conns as i64); } } } @@ -215,7 +225,11 @@ impl DbUserConnPool { removed += 1; } global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); - NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(removed as i64); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(removed as i64); conn } } @@ -303,7 +317,10 @@ impl GlobalConnPool { // acquire a random shard lock let mut shard = self.global_pool.shards()[shard].write(); - let timer = GC_LATENCY.start_timer(); + let timer = Metrics::get() + .proxy + .http_pool_reclaimation_lag_seconds + .start_timer(); let current_len = shard.len(); let mut clients_removed = 0; shard.retain(|endpoint, x| { @@ -331,7 +348,7 @@ impl GlobalConnPool { let new_len = shard.len(); drop(shard); - timer.observe_duration(); + timer.observe(); // Do logging outside of the lock. if clients_removed > 0 { @@ -339,7 +356,11 @@ impl GlobalConnPool { .global_connections_count .fetch_sub(clients_removed, atomic::Ordering::Relaxed) - clients_removed; - NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(clients_removed as i64); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(clients_removed as i64); info!("pool: performed global pool gc. 
removed {clients_removed} clients, total number of clients in pool is {size}"); } let removed = current_len - new_len; @@ -410,7 +431,7 @@ impl GlobalConnPool { pools: HashMap::new(), total_conns: 0, max_conns: self.config.pool_options.max_conns_per_endpoint, - _guard: ENDPOINT_POOLS.guard(), + _guard: Metrics::get().proxy.http_endpoint_pools.guard(), global_connections_count: self.global_connections_count.clone(), global_pool_size_max_conns: self.config.pool_options.max_total_conns, })); @@ -450,9 +471,7 @@ pub fn poll_client( conn_id: uuid::Uuid, aux: MetricsAuxInfo, ) -> Client { - let conn_gauge = NUM_DB_CONNECTIONS_GAUGE - .with_label_values(&[ctx.protocol]) - .guard(); + let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol); let mut session_id = ctx.session_id; let (tx, mut rx) = tokio::sync::watch::channel(session_id); diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 7f7f93988c..a66edb2c66 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -43,8 +43,8 @@ use crate::context::RequestMonitoring; use crate::error::ErrorKind; use crate::error::ReportableError; use crate::error::UserFacingError; -use crate::metrics::HTTP_CONTENT_LENGTH; -use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE; +use crate::metrics::HttpDirection; +use crate::metrics::Metrics; use crate::proxy::run_until_cancelled; use crate::proxy::NeonOptions; use crate::serverless::backend::HttpConnError; @@ -494,10 +494,11 @@ async fn handle_inner( request: Request, backend: Arc, ) -> Result>, SqlOverHttpError> { - let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE - .with_label_values(&[ctx.protocol]) - .guard(); - info!("handling interactive connection from client"); + let _requeset_gauge = Metrics::get().proxy.connection_requests.guard(ctx.protocol); + info!( + protocol = %ctx.protocol, + "handling interactive connection from client" + ); // // Determine the destination and connection params @@ -520,9 +521,10 @@ async fn handle_inner( None => MAX_REQUEST_SIZE + 1, }; info!(request_content_length, "request size in bytes"); - HTTP_CONTENT_LENGTH - .with_label_values(&["request"]) - .observe(request_content_length as f64); + Metrics::get() + .proxy + .http_conn_content_length_bytes + .observe(HttpDirection::Request, request_content_length as f64); // we don't have a streaming request support yet so this is to prevent OOM // from a malicious user sending an extremely large request body @@ -607,9 +609,10 @@ async fn handle_inner( // count the egress bytes - we miss the TLS and header overhead but oh well... 
// moving this later in the stack is going to be a lot of effort and ehhhh metrics.record_egress(len as u64); - HTTP_CONTENT_LENGTH - .with_label_values(&["response"]) - .observe(len as f64); + Metrics::get() + .proxy + .http_conn_content_length_bytes + .observe(HttpDirection::Response, len as f64); Ok(response) } diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index ada6c974f4..d054877126 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -3,7 +3,7 @@ use crate::{ config::ProxyConfig, context::RequestMonitoring, error::{io_error, ReportableError}, - metrics::NUM_CLIENT_CONNECTION_GAUGE, + metrics::Metrics, proxy::{handle_client, ClientMode}, rate_limiter::EndpointRateLimiter, }; @@ -139,9 +139,10 @@ pub async fn serve_websocket( endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { let websocket = websocket.await?; - let conn_gauge = NUM_CLIENT_CONNECTION_GAUGE - .with_label_values(&["ws"]) - .guard(); + let conn_gauge = Metrics::get() + .proxy + .client_connections + .guard(crate::metrics::Protocol::Ws); let res = handle_client( config, diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index b6b7a85659..fdd2be3ee5 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -1,6 +1,6 @@ use crate::config::TlsServerEndPoint; use crate::error::{ErrorKind, ReportableError, UserFacingError}; -use crate::metrics::TLS_HANDSHAKE_FAILURES; +use crate::metrics::Metrics; use bytes::BytesMut; use pq_proto::framed::{ConnectionError, Framed}; @@ -228,7 +228,7 @@ impl Stream { Stream::Raw { raw } => Ok(tokio_rustls::TlsAcceptor::from(cfg) .accept(raw) .await - .inspect_err(|_| TLS_HANDSHAKE_FAILURES.inc())?), + .inspect_err(|_| Metrics::get().proxy.tls_handshake_failures.inc())?), Stream::Tls { .. } => Err(StreamUpgradeError::AlreadyTls), } } From 40f15c31235242ffdefc8b3662ba252cec55377e Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Thu, 11 Apr 2024 20:24:34 +0200 Subject: [PATCH 0557/1571] Read cplane events from regional redis (#7352) ## Problem Actually read redis events. ## Summary of changes This is revert of https://github.com/neondatabase/neon/pull/7350 + fixes. * Fixed events parsing * Added timeout after connection failure * Separated regional and global redis clients. 
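Editor's note — a rough sketch of the client split described in the summary above (illustrative only: the `Client` enum and `pick_clients` name below are stand-ins invented for this note, not the patch's `ConnectionWithCredentialsProvider` or its real setup code). The regional client is built from `redis-host`/`redis-port`, and the notifications listener reuses it unless an explicit `redis-notifications` URL is given:

```rust
use anyhow::bail;

/// Stand-in for the two ways a Redis connection can be configured
/// (a static URL vs. a credentials-provider-backed host/port pair).
#[derive(Clone, Debug)]
enum Client {
    Static(String),
    CredentialsProvider { host: String, port: u16 },
}

/// Sketch of the selection logic: returns (regional, notifications).
fn pick_clients(
    notifications_url: Option<String>,
    host: Option<String>,
    port: Option<u16>,
) -> anyhow::Result<(Option<Client>, Option<Client>)> {
    let regional = match (host, port) {
        (Some(host), Some(port)) => Some(Client::CredentialsProvider { host, port }),
        (None, None) => None,
        // mirrors the patch: host and port must be specified together
        _ => bail!("redis-host and redis-port must be specified together"),
    };
    // The notifications listener prefers a dedicated URL; otherwise it falls
    // back to the regional client, and if that is also absent it stays disabled.
    let notifications = match notifications_url {
        Some(url) => Some(Client::Static(url)),
        None => regional.clone(),
    };
    Ok((regional, notifications))
}
```

Per the diff below, the cancellation publisher and the new endpoint-cache reader always use the regional client, while only the notifications listener may be pointed at a separate static URL.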
--- proxy/src/auth/backend.rs | 4 +- proxy/src/bin/proxy.rs | 61 +++-- proxy/src/cache.rs | 1 + proxy/src/cache/endpoints.rs | 226 ++++++++++++++++++ proxy/src/config.rs | 74 ++++++ proxy/src/console/provider.rs | 17 +- proxy/src/console/provider/neon.rs | 47 ++-- proxy/src/context.rs | 22 +- proxy/src/intern.rs | 15 ++ proxy/src/lib.rs | 37 +++ proxy/src/metrics.rs | 13 +- proxy/src/proxy.rs | 4 +- proxy/src/rate_limiter.rs | 2 +- proxy/src/rate_limiter/limiter.rs | 10 +- proxy/src/redis/cancellation_publisher.rs | 6 +- .../regress/test_proxy_rate_limiter.py | 84 ------- 16 files changed, 479 insertions(+), 144 deletions(-) create mode 100644 proxy/src/cache/endpoints.rs delete mode 100644 test_runner/regress/test_proxy_rate_limiter.py diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 229d499e30..ab5dd4544b 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -27,7 +27,7 @@ use crate::{ }, stream, url, }; -use crate::{scram, EndpointCacheKey, EndpointId, RoleName}; +use crate::{scram, EndpointCacheKey, EndpointId, Normalize, RoleName}; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, warn}; @@ -186,7 +186,7 @@ impl AuthenticationConfig { is_cleartext: bool, ) -> auth::Result { // we have validated the endpoint exists, so let's intern it. - let endpoint_int = EndpointIdInt::from(endpoint); + let endpoint_int = EndpointIdInt::from(endpoint.normalize()); // only count the full hash count if password hack or websocket flow. // in other words, if proxy needs to run the hashing diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 3392c21075..2e749fc7e8 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -190,7 +190,9 @@ struct ProxyCliArgs { /// cache for `project_info` (use `size=0` to disable) #[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)] project_info_cache: String, - + /// cache for all valid endpoints + #[clap(long, default_value = config::EndpointCacheConfig::CACHE_DEFAULT_OPTIONS)] + endpoint_cache_config: String, #[clap(flatten)] parquet_upload: ParquetUploadArgs, @@ -301,27 +303,27 @@ async fn main() -> anyhow::Result<()> { ), aws_credentials_provider, )); - let redis_notifications_client = - match (args.redis_notifications, (args.redis_host, args.redis_port)) { - (Some(url), _) => { - info!("Starting redis notifications listener ({url})"); - Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url)) - } - (None, (Some(host), Some(port))) => Some( - ConnectionWithCredentialsProvider::new_with_credentials_provider( - host, - port, - elasticache_credentials_provider.clone(), - ), + let regional_redis_client = match (args.redis_host, args.redis_port) { + (Some(host), Some(port)) => Some( + ConnectionWithCredentialsProvider::new_with_credentials_provider( + host, + port, + elasticache_credentials_provider.clone(), ), - (None, (None, None)) => { - warn!("Redis is disabled"); - None - } - _ => { - bail!("redis-host and redis-port must be specified together"); - } - }; + ), + (None, None) => { + warn!("Redis events from console are disabled"); + None + } + _ => { + bail!("redis-host and redis-port must be specified together"); + } + }; + let redis_notifications_client = if let Some(url) = args.redis_notifications { + Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url)) + } else { + regional_redis_client.clone() + }; // Check that we can bind to address before further initialization let http_address: SocketAddr = 
args.http.parse()?; @@ -340,8 +342,7 @@ async fn main() -> anyhow::Result<()> { let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(&config.endpoint_rps_limit)); let cancel_map = CancelMap::default(); - // let redis_notifications_client = redis_notifications_client.map(|x| Box::leak(Box::new(x))); - let redis_publisher = match &redis_notifications_client { + let redis_publisher = match ®ional_redis_client { Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new( redis_publisher.clone(), args.region.clone(), @@ -416,13 +417,18 @@ async fn main() -> anyhow::Result<()> { if let Some(redis_notifications_client) = redis_notifications_client { let cache = api.caches.project_info.clone(); maintenance_tasks.spawn(notifications::task_main( - redis_notifications_client.clone(), + redis_notifications_client, cache.clone(), cancel_map.clone(), args.region.clone(), )); maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); } + if let Some(regional_redis_client) = regional_redis_client { + let cache = api.caches.endpoints_cache.clone(); + let con = regional_redis_client; + maintenance_tasks.spawn(async move { cache.do_read(con).await }); + } } } @@ -501,14 +507,18 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; let project_info_cache_config: ProjectInfoCacheOptions = args.project_info_cache.parse()?; + let endpoint_cache_config: config::EndpointCacheConfig = + args.endpoint_cache_config.parse()?; info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); info!( "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" ); + info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); let caches = Box::leak(Box::new(console::caches::ApiCaches::new( wake_compute_cache_config, project_info_cache_config, + endpoint_cache_config, ))); let config::WakeComputeLockOptions { @@ -524,11 +534,12 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { permits, shards, timeout, + epoch, &Metrics::get().wake_compute_lock, ) .unwrap(), )); - tokio::spawn(locks.garbage_collect_worker(epoch)); + tokio::spawn(locks.garbage_collect_worker()); let url = args.auth_endpoint.parse()?; let endpoint = http::Endpoint::new(url, http::new_client(rate_limiter_config)); diff --git a/proxy/src/cache.rs b/proxy/src/cache.rs index fc5f416395..d1d4087241 100644 --- a/proxy/src/cache.rs +++ b/proxy/src/cache.rs @@ -1,4 +1,5 @@ pub mod common; +pub mod endpoints; pub mod project_info; mod timed_lru; diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs new file mode 100644 index 0000000000..f3f9e9395f --- /dev/null +++ b/proxy/src/cache/endpoints.rs @@ -0,0 +1,226 @@ +use std::{ + convert::Infallible, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, +}; + +use dashmap::DashSet; +use redis::{ + streams::{StreamReadOptions, StreamReadReply}, + AsyncCommands, FromRedisValue, Value, +}; +use serde::Deserialize; +use tokio::sync::Mutex; + +use crate::{ + config::EndpointCacheConfig, + context::RequestMonitoring, + intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}, + metrics::{Metrics, RedisErrors}, + rate_limiter::GlobalRateLimiter, + redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider, + EndpointId, +}; + +#[derive(Deserialize, Debug, Clone)] +pub struct ControlPlaneEventKey { + endpoint_created: Option, + branch_created: Option, + 
project_created: Option, +} +#[derive(Deserialize, Debug, Clone)] +struct EndpointCreated { + endpoint_id: String, +} +#[derive(Deserialize, Debug, Clone)] +struct BranchCreated { + branch_id: String, +} +#[derive(Deserialize, Debug, Clone)] +struct ProjectCreated { + project_id: String, +} + +pub struct EndpointsCache { + config: EndpointCacheConfig, + endpoints: DashSet, + branches: DashSet, + projects: DashSet, + ready: AtomicBool, + limiter: Arc>, +} + +impl EndpointsCache { + pub fn new(config: EndpointCacheConfig) -> Self { + Self { + limiter: Arc::new(Mutex::new(GlobalRateLimiter::new( + config.limiter_info.clone(), + ))), + config, + endpoints: DashSet::new(), + branches: DashSet::new(), + projects: DashSet::new(), + ready: AtomicBool::new(false), + } + } + pub async fn is_valid(&self, ctx: &mut RequestMonitoring, endpoint: &EndpointId) -> bool { + if !self.ready.load(Ordering::Acquire) { + return true; + } + // If cache is disabled, just collect the metrics and return. + if self.config.disable_cache { + ctx.set_rejected(self.should_reject(endpoint)); + return true; + } + // If the limiter allows, we don't need to check the cache. + if self.limiter.lock().await.check() { + return true; + } + let rejected = self.should_reject(endpoint); + ctx.set_rejected(rejected); + !rejected + } + fn should_reject(&self, endpoint: &EndpointId) -> bool { + if endpoint.is_endpoint() { + !self.endpoints.contains(&EndpointIdInt::from(endpoint)) + } else if endpoint.is_branch() { + !self + .branches + .contains(&BranchIdInt::from(&endpoint.as_branch())) + } else { + !self + .projects + .contains(&ProjectIdInt::from(&endpoint.as_project())) + } + } + fn insert_event(&self, key: ControlPlaneEventKey) { + // Do not do normalization here, we expect the events to be normalized. + if let Some(endpoint_created) = key.endpoint_created { + self.endpoints + .insert(EndpointIdInt::from(&endpoint_created.endpoint_id.into())); + } + if let Some(branch_created) = key.branch_created { + self.branches + .insert(BranchIdInt::from(&branch_created.branch_id.into())); + } + if let Some(project_created) = key.project_created { + self.projects + .insert(ProjectIdInt::from(&project_created.project_id.into())); + } + } + pub async fn do_read( + &self, + mut con: ConnectionWithCredentialsProvider, + ) -> anyhow::Result { + let mut last_id = "0-0".to_string(); + loop { + self.ready.store(false, Ordering::Release); + if let Err(e) = con.connect().await { + tracing::error!("error connecting to redis: {:?}", e); + continue; + } + if let Err(e) = self.read_from_stream(&mut con, &mut last_id).await { + tracing::error!("error reading from redis: {:?}", e); + } + tokio::time::sleep(self.config.retry_interval).await; + } + } + async fn read_from_stream( + &self, + con: &mut ConnectionWithCredentialsProvider, + last_id: &mut String, + ) -> anyhow::Result<()> { + tracing::info!("reading endpoints/branches/projects from redis"); + self.batch_read( + con, + StreamReadOptions::default().count(self.config.initial_batch_size), + last_id, + true, + ) + .await?; + tracing::info!("ready to filter user requests"); + self.ready.store(true, Ordering::Release); + self.batch_read( + con, + StreamReadOptions::default() + .count(self.config.default_batch_size) + .block(self.config.xread_timeout.as_millis() as usize), + last_id, + false, + ) + .await + } + fn parse_key_value(value: &Value) -> anyhow::Result { + let s: String = FromRedisValue::from_redis_value(value)?; + Ok(serde_json::from_str(&s)?) 
+ } + async fn batch_read( + &self, + conn: &mut ConnectionWithCredentialsProvider, + opts: StreamReadOptions, + last_id: &mut String, + return_when_finish: bool, + ) -> anyhow::Result<()> { + let mut total: usize = 0; + loop { + let mut res: StreamReadReply = conn + .xread_options(&[&self.config.stream_name], &[last_id.as_str()], &opts) + .await?; + + if res.keys.is_empty() { + if return_when_finish { + anyhow::bail!( + "Redis stream {} is empty, cannot be used to filter endpoints", + self.config.stream_name + ); + } + // If we are not returning when finish, we should wait for more data. + continue; + } + if res.keys.len() != 1 { + anyhow::bail!("Cannot read from redis stream {}", self.config.stream_name); + } + + let res = res.keys.pop().expect("Checked length above"); + let len = res.ids.len(); + for x in res.ids { + total += 1; + for (_, v) in x.map { + let key = match Self::parse_key_value(&v) { + Ok(x) => x, + Err(e) => { + Metrics::get().proxy.redis_errors_total.inc(RedisErrors { + channel: &self.config.stream_name, + }); + tracing::error!("error parsing value {v:?}: {e:?}"); + continue; + } + }; + self.insert_event(key); + } + if total.is_power_of_two() { + tracing::debug!("endpoints read {}", total); + } + *last_id = x.id; + } + if return_when_finish && len <= self.config.default_batch_size { + break; + } + } + tracing::info!("read {} endpoints/branches/projects from redis", total); + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::ControlPlaneEventKey; + + #[test] + fn test() { + let s = "{\"branch_created\":null,\"endpoint_created\":{\"endpoint_id\":\"ep-rapid-thunder-w0qqw2q9\"},\"project_created\":null,\"type\":\"endpoint_created\"}"; + let _: ControlPlaneEventKey = serde_json::from_str(s).unwrap(); + } +} diff --git a/proxy/src/config.rs b/proxy/src/config.rs index fc490c7348..b4b2ce8dbd 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -313,6 +313,80 @@ impl CertResolver { } } +#[derive(Debug)] +pub struct EndpointCacheConfig { + /// Batch size to receive all endpoints on the startup. + pub initial_batch_size: usize, + /// Batch size to receive endpoints. + pub default_batch_size: usize, + /// Timeouts for the stream read operation. + pub xread_timeout: Duration, + /// Stream name to read from. + pub stream_name: String, + /// Limiter info (to distinguish when to enable cache). + pub limiter_info: Vec, + /// Disable cache. + /// If true, cache is ignored, but reports all statistics. + pub disable_cache: bool, + /// Retry interval for the stream read operation. + pub retry_interval: Duration, +} + +impl EndpointCacheConfig { + /// Default options for [`crate::console::provider::NodeInfoCache`]. + /// Notice that by default the limiter is empty, which means that cache is disabled. + pub const CACHE_DEFAULT_OPTIONS: &'static str = + "initial_batch_size=1000,default_batch_size=10,xread_timeout=5m,stream_name=controlPlane,disable_cache=true,limiter_info=1000@1s,retry_interval=1s"; + + /// Parse cache options passed via cmdline. + /// Example: [`Self::CACHE_DEFAULT_OPTIONS`]. 
+ fn parse(options: &str) -> anyhow::Result { + let mut initial_batch_size = None; + let mut default_batch_size = None; + let mut xread_timeout = None; + let mut stream_name = None; + let mut limiter_info = vec![]; + let mut disable_cache = false; + let mut retry_interval = None; + + for option in options.split(',') { + let (key, value) = option + .split_once('=') + .with_context(|| format!("bad key-value pair: {option}"))?; + + match key { + "initial_batch_size" => initial_batch_size = Some(value.parse()?), + "default_batch_size" => default_batch_size = Some(value.parse()?), + "xread_timeout" => xread_timeout = Some(humantime::parse_duration(value)?), + "stream_name" => stream_name = Some(value.to_string()), + "limiter_info" => limiter_info.push(RateBucketInfo::from_str(value)?), + "disable_cache" => disable_cache = value.parse()?, + "retry_interval" => retry_interval = Some(humantime::parse_duration(value)?), + unknown => bail!("unknown key: {unknown}"), + } + } + RateBucketInfo::validate(&mut limiter_info)?; + + Ok(Self { + initial_batch_size: initial_batch_size.context("missing `initial_batch_size`")?, + default_batch_size: default_batch_size.context("missing `default_batch_size`")?, + xread_timeout: xread_timeout.context("missing `xread_timeout`")?, + stream_name: stream_name.context("missing `stream_name`")?, + disable_cache, + limiter_info, + retry_interval: retry_interval.context("missing `retry_interval`")?, + }) + } +} + +impl FromStr for EndpointCacheConfig { + type Err = anyhow::Error; + + fn from_str(options: &str) -> Result { + let error = || format!("failed to parse endpoint cache options '{options}'"); + Self::parse(options).with_context(error) + } +} #[derive(Debug)] pub struct MetricBackupCollectionConfig { pub interval: Duration, diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index b9502f0722..3fa7221f98 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -8,9 +8,9 @@ use crate::{ backend::{ComputeCredentialKeys, ComputeUserInfo}, IpPattern, }, - cache::{project_info::ProjectInfoCacheImpl, Cached, TimedLru}, + cache::{endpoints::EndpointsCache, project_info::ProjectInfoCacheImpl, Cached, TimedLru}, compute, - config::{CacheOptions, ProjectInfoCacheOptions}, + config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}, context::RequestMonitoring, intern::ProjectIdInt, metrics::ApiLockMetrics, @@ -417,12 +417,15 @@ pub struct ApiCaches { pub node_info: NodeInfoCache, /// Cache which stores project_id -> endpoint_ids mapping. pub project_info: Arc, + /// List of all valid endpoints. 
+ pub endpoints_cache: Arc, } impl ApiCaches { pub fn new( wake_compute_cache_config: CacheOptions, project_info_cache_config: ProjectInfoCacheOptions, + endpoint_cache_config: EndpointCacheConfig, ) -> Self { Self { node_info: NodeInfoCache::new( @@ -432,6 +435,7 @@ impl ApiCaches { true, ), project_info: Arc::new(ProjectInfoCacheImpl::new(project_info_cache_config)), + endpoints_cache: Arc::new(EndpointsCache::new(endpoint_cache_config)), } } } @@ -442,6 +446,7 @@ pub struct ApiLocks { node_locks: DashMap>, permits: usize, timeout: Duration, + epoch: std::time::Duration, metrics: &'static ApiLockMetrics, } @@ -451,6 +456,7 @@ impl ApiLocks { permits: usize, shards: usize, timeout: Duration, + epoch: std::time::Duration, metrics: &'static ApiLockMetrics, ) -> prometheus::Result { Ok(Self { @@ -458,6 +464,7 @@ impl ApiLocks { node_locks: DashMap::with_shard_amount(shards), permits, timeout, + epoch, metrics, }) } @@ -495,12 +502,12 @@ impl ApiLocks { }) } - pub async fn garbage_collect_worker(&self, epoch: std::time::Duration) { + pub async fn garbage_collect_worker(&self) { if self.permits == 0 { return; } - - let mut interval = tokio::time::interval(epoch / (self.node_locks.shards().len()) as u32); + let mut interval = + tokio::time::interval(self.epoch / (self.node_locks.shards().len()) as u32); loop { for (i, shard) in self.node_locks.shards().iter().enumerate() { interval.tick().await; diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 9ac1900324..138acdf578 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -12,7 +12,7 @@ use crate::{ console::messages::ColdStartInfo, http, metrics::{CacheOutcome, Metrics}, - scram, + scram, Normalize, }; use crate::{cache::Cached, context::RequestMonitoring}; use futures::TryFutureExt; @@ -24,7 +24,7 @@ use tracing::{error, info, info_span, warn, Instrument}; pub struct Api { endpoint: http::Endpoint, pub caches: &'static ApiCaches, - locks: &'static ApiLocks, + pub locks: &'static ApiLocks, jwt: String, } @@ -56,6 +56,15 @@ impl Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { + if !self + .caches + .endpoints_cache + .is_valid(ctx, &user_info.endpoint.normalize()) + .await + { + info!("endpoint is not valid, skipping the request"); + return Ok(AuthInfo::default()); + } let request_id = ctx.session_id.to_string(); let application_name = ctx.console_application_name(); async { @@ -82,7 +91,9 @@ impl Api { Ok(body) => body, // Error 404 is special: it's ok not to have a secret. 
Err(e) => match e.http_status_code() { - Some(http::StatusCode::NOT_FOUND) => return Ok(AuthInfo::default()), + Some(http::StatusCode::NOT_FOUND) => { + return Ok(AuthInfo::default()); + } _otherwise => return Err(e.into()), }, }; @@ -178,23 +189,27 @@ impl super::Api for Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { - let ep = &user_info.endpoint; + let normalized_ep = &user_info.endpoint.normalize(); let user = &user_info.user; - if let Some(role_secret) = self.caches.project_info.get_role_secret(ep, user) { + if let Some(role_secret) = self + .caches + .project_info + .get_role_secret(normalized_ep, user) + { return Ok(role_secret); } let auth_info = self.do_get_auth_info(ctx, user_info).await?; if let Some(project_id) = auth_info.project_id { - let ep_int = ep.into(); + let normalized_ep_int = normalized_ep.into(); self.caches.project_info.insert_role_secret( project_id, - ep_int, + normalized_ep_int, user.into(), auth_info.secret.clone(), ); self.caches.project_info.insert_allowed_ips( project_id, - ep_int, + normalized_ep_int, Arc::new(auth_info.allowed_ips), ); ctx.set_project_id(project_id); @@ -208,8 +223,8 @@ impl super::Api for Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { - let ep = &user_info.endpoint; - if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(ep) { + let normalized_ep = &user_info.endpoint.normalize(); + if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(normalized_ep) { Metrics::get() .proxy .allowed_ips_cache_misses @@ -224,16 +239,18 @@ impl super::Api for Api { let allowed_ips = Arc::new(auth_info.allowed_ips); let user = &user_info.user; if let Some(project_id) = auth_info.project_id { - let ep_int = ep.into(); + let normalized_ep_int = normalized_ep.into(); self.caches.project_info.insert_role_secret( project_id, - ep_int, + normalized_ep_int, user.into(), auth_info.secret.clone(), ); - self.caches - .project_info - .insert_allowed_ips(project_id, ep_int, allowed_ips.clone()); + self.caches.project_info.insert_allowed_ips( + project_id, + normalized_ep_int, + allowed_ips.clone(), + ); ctx.set_project_id(project_id); } Ok(( diff --git a/proxy/src/context.rs b/proxy/src/context.rs index 0094235921..dc475d57ed 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -12,7 +12,7 @@ use crate::{ console::messages::{ColdStartInfo, MetricsAuxInfo}, error::ErrorKind, intern::{BranchIdInt, ProjectIdInt}, - metrics::{LatencyTimer, Metrics, Protocol}, + metrics::{ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol}, DbName, EndpointId, RoleName, }; @@ -50,6 +50,8 @@ pub struct RequestMonitoring { // This sender is here to keep the request monitoring channel open while requests are taking place. sender: Option>, pub latency_timer: LatencyTimer, + // Whether proxy decided that it's not a valid endpoint end rejected it before going to cplane. 
+ rejected: bool, } #[derive(Clone, Debug)] @@ -93,6 +95,7 @@ impl RequestMonitoring { error_kind: None, auth_method: None, success: false, + rejected: false, cold_start_info: ColdStartInfo::Unknown, sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()), @@ -113,6 +116,10 @@ impl RequestMonitoring { ) } + pub fn set_rejected(&mut self, rejected: bool) { + self.rejected = rejected; + } + pub fn set_cold_start_info(&mut self, info: ColdStartInfo) { self.cold_start_info = info; self.latency_timer.cold_start_info(info); @@ -176,6 +183,19 @@ impl RequestMonitoring { impl Drop for RequestMonitoring { fn drop(&mut self) { + let outcome = if self.success { + ConnectOutcome::Success + } else { + ConnectOutcome::Failed + }; + Metrics::get() + .proxy + .invalid_endpoints_total + .inc(InvalidEndpointsGroup { + protocol: self.protocol, + rejected: self.rejected.into(), + outcome, + }); if let Some(tx) = self.sender.take() { let _: Result<(), _> = tx.send(RequestData::from(&*self)); } diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs index a6519bdff9..e38135dd22 100644 --- a/proxy/src/intern.rs +++ b/proxy/src/intern.rs @@ -160,6 +160,11 @@ impl From<&EndpointId> for EndpointIdInt { EndpointIdTag::get_interner().get_or_intern(value) } } +impl From for EndpointIdInt { + fn from(value: EndpointId) -> Self { + EndpointIdTag::get_interner().get_or_intern(&value) + } +} #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub struct BranchIdTag; @@ -175,6 +180,11 @@ impl From<&BranchId> for BranchIdInt { BranchIdTag::get_interner().get_or_intern(value) } } +impl From for BranchIdInt { + fn from(value: BranchId) -> Self { + BranchIdTag::get_interner().get_or_intern(&value) + } +} #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub struct ProjectIdTag; @@ -190,6 +200,11 @@ impl From<&ProjectId> for ProjectIdInt { ProjectIdTag::get_interner().get_or_intern(value) } } +impl From for ProjectIdInt { + fn from(value: ProjectId) -> Self { + ProjectIdTag::get_interner().get_or_intern(&value) + } +} #[cfg(test)] mod tests { diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index da7c7f3ed2..3f6d985fe8 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -127,6 +127,24 @@ macro_rules! smol_str_wrapper { }; } +const POOLER_SUFFIX: &str = "-pooler"; + +pub trait Normalize { + fn normalize(&self) -> Self; +} + +impl + From> Normalize for S { + fn normalize(&self) -> Self { + if self.as_ref().ends_with(POOLER_SUFFIX) { + let mut s = self.as_ref().to_string(); + s.truncate(s.len() - POOLER_SUFFIX.len()); + s.into() + } else { + self.clone() + } + } +} + // 90% of role name strings are 20 characters or less. smol_str_wrapper!(RoleName); // 50% of endpoint strings are 23 characters or less. @@ -140,3 +158,22 @@ smol_str_wrapper!(ProjectId); smol_str_wrapper!(EndpointCacheKey); smol_str_wrapper!(DbName); + +// Endpoints are a bit tricky. Rare they might be branches or projects. 
+impl EndpointId { + pub fn is_endpoint(&self) -> bool { + self.0.starts_with("ep-") + } + pub fn is_branch(&self) -> bool { + self.0.starts_with("br-") + } + pub fn is_project(&self) -> bool { + !self.is_endpoint() && !self.is_branch() + } + pub fn as_branch(&self) -> BranchId { + BranchId(self.0.clone()) + } + pub fn as_project(&self) -> ProjectId { + ProjectId(self.0.clone()) + } +} diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 78840f5983..b96950b0a2 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -120,6 +120,9 @@ pub struct ProxyMetrics { /// Number of endpoints affected by authentication rate limits pub endpoints_auth_rate_limits: HyperLogLog<32>, + + /// Number of invalid endpoints (per protocol, per rejected). + pub invalid_endpoints_total: CounterVec, } #[derive(MetricGroup)] @@ -430,7 +433,7 @@ impl Drop for LatencyTimerPause<'_> { } #[derive(FixedCardinalityLabel, Clone, Copy, Debug)] -enum ConnectOutcome { +pub enum ConnectOutcome { Success, Failed, } @@ -480,3 +483,11 @@ impl From for Bool { } } } + +#[derive(LabelGroup)] +#[label(set = InvalidEndpointsSet)] +pub struct InvalidEndpointsGroup { + pub protocol: Protocol, + pub rejected: Bool, + pub outcome: ConnectOutcome, +} diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 5598215b6b..42fb10b326 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -20,7 +20,7 @@ use crate::{ proxy::handshake::{handshake, HandshakeData}, rate_limiter::EndpointRateLimiter, stream::{PqStream, Stream}, - EndpointCacheKey, + EndpointCacheKey, Normalize, }; use futures::TryFutureExt; use itertools::Itertools; @@ -288,7 +288,7 @@ pub async fn handle_client( // check rate limit if let Some(ep) = user_info.get_endpoint() { - if !endpoint_rate_limiter.check(ep, 1) { + if !endpoint_rate_limiter.check(ep.normalize(), 1) { return stream .throw_error(auth::AuthError::too_many_connections()) .await?; diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs index 13dffffca0..a3b83e5e50 100644 --- a/proxy/src/rate_limiter.rs +++ b/proxy/src/rate_limiter.rs @@ -4,4 +4,4 @@ mod limiter; pub use aimd::Aimd; pub use limit_algorithm::{AimdConfig, Fixed, RateLimitAlgorithm, RateLimiterConfig}; pub use limiter::Limiter; -pub use limiter::{AuthRateLimiter, EndpointRateLimiter, RateBucketInfo, RedisRateLimiter}; +pub use limiter::{AuthRateLimiter, EndpointRateLimiter, GlobalRateLimiter, RateBucketInfo}; diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index aba5120f38..7e9370f606 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -30,13 +30,13 @@ use super::{ RateLimiterConfig, }; -pub struct RedisRateLimiter { +pub struct GlobalRateLimiter { data: Vec, - info: &'static [RateBucketInfo], + info: Vec, } -impl RedisRateLimiter { - pub fn new(info: &'static [RateBucketInfo]) -> Self { +impl GlobalRateLimiter { + pub fn new(info: Vec) -> Self { Self { data: vec![ RateBucket { @@ -56,7 +56,7 @@ impl RedisRateLimiter { let should_allow_request = self .data .iter_mut() - .zip(self.info) + .zip(&self.info) .all(|(bucket, info)| bucket.should_allow_request(info, now, 1)); if should_allow_request { diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs index 422789813c..7baf104374 100644 --- a/proxy/src/redis/cancellation_publisher.rs +++ b/proxy/src/redis/cancellation_publisher.rs @@ -5,7 +5,7 @@ use redis::AsyncCommands; use tokio::sync::Mutex; use uuid::Uuid; -use crate::rate_limiter::{RateBucketInfo, 
RedisRateLimiter}; +use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo}; use super::{ connection_with_credentials_provider::ConnectionWithCredentialsProvider, @@ -80,7 +80,7 @@ impl CancellationPublisher for Arc> { pub struct RedisPublisherClient { client: ConnectionWithCredentialsProvider, region_id: String, - limiter: RedisRateLimiter, + limiter: GlobalRateLimiter, } impl RedisPublisherClient { @@ -92,7 +92,7 @@ impl RedisPublisherClient { Ok(Self { client, region_id, - limiter: RedisRateLimiter::new(info), + limiter: GlobalRateLimiter::new(info.into()), }) } diff --git a/test_runner/regress/test_proxy_rate_limiter.py b/test_runner/regress/test_proxy_rate_limiter.py deleted file mode 100644 index f39f0cad07..0000000000 --- a/test_runner/regress/test_proxy_rate_limiter.py +++ /dev/null @@ -1,84 +0,0 @@ -import asyncio -import time -from pathlib import Path -from typing import Iterator - -import pytest -from fixtures.neon_fixtures import ( - PSQL, - NeonProxy, -) -from fixtures.port_distributor import PortDistributor -from pytest_httpserver import HTTPServer -from werkzeug.wrappers.response import Response - - -def waiting_handler(status_code: int) -> Response: - # wait more than timeout to make sure that both (two) connections are open. - # It would be better to use a barrier here, but I don't know how to do that together with pytest-httpserver. - time.sleep(2) - return Response(status=status_code) - - -@pytest.fixture(scope="function") -def proxy_with_rate_limit( - port_distributor: PortDistributor, - neon_binpath: Path, - httpserver_listen_address, - test_output_dir: Path, -) -> Iterator[NeonProxy]: - """Neon proxy that routes directly to vanilla postgres.""" - - proxy_port = port_distributor.get_port() - mgmt_port = port_distributor.get_port() - http_port = port_distributor.get_port() - external_http_port = port_distributor.get_port() - (host, port) = httpserver_listen_address - endpoint = f"http://{host}:{port}/billing/api/v1/usage_events" - - with NeonProxy( - neon_binpath=neon_binpath, - test_output_dir=test_output_dir, - proxy_port=proxy_port, - http_port=http_port, - mgmt_port=mgmt_port, - external_http_port=external_http_port, - auth_backend=NeonProxy.Console(endpoint, fixed_rate_limit=5), - ) as proxy: - proxy.start() - yield proxy - - -@pytest.mark.asyncio -async def test_proxy_rate_limit( - httpserver: HTTPServer, - proxy_with_rate_limit: NeonProxy, -): - uri = "/billing/api/v1/usage_events/proxy_get_role_secret" - # mock control plane service - httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( - lambda _: Response(status=200) - ) - httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( - lambda _: waiting_handler(429) - ) - httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( - lambda _: waiting_handler(500) - ) - - psql = PSQL(host=proxy_with_rate_limit.host, port=proxy_with_rate_limit.proxy_port) - f = await psql.run("select 42;") - await proxy_with_rate_limit.find_auth_link(uri, f) - # Limit should be 2. - - # Run two queries in parallel. - f1, f2 = await asyncio.gather(psql.run("select 42;"), psql.run("select 42;")) - await proxy_with_rate_limit.find_auth_link(uri, f1) - await proxy_with_rate_limit.find_auth_link(uri, f2) - - # Now limit should be 0. - f = await psql.run("select 42;") - await proxy_with_rate_limit.find_auth_link(uri, f) - - # There last query shouldn't reach the http-server. 
- assert httpserver.assertions == [] From e92fb94149967d5eca3eccddcdd718149d3d7031 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 11 Apr 2024 21:55:05 +0100 Subject: [PATCH 0558/1571] proxy: fix overloaded db connection closure (#7364) ## Problem possible for the database connections to not close in time. ## Summary of changes force the closing of connections if the client has hung up --- proxy/src/serverless/conn_pool.rs | 36 +++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 131f088880..798e488509 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -15,6 +15,7 @@ use std::{ use tokio::time::Instant; use tokio_postgres::tls::NoTlsStream; use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket}; +use tokio_util::sync::CancellationToken; use crate::console::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; @@ -488,15 +489,32 @@ pub fn poll_client( let db_user = conn_info.db_and_user(); let idle = global_pool.get_idle_timeout(); + let cancel = CancellationToken::new(); + let cancelled = cancel.clone().cancelled_owned(); + tokio::spawn( async move { let _conn_gauge = conn_gauge; let mut idle_timeout = pin!(tokio::time::sleep(idle)); + let mut cancelled = pin!(cancelled); + poll_fn(move |cx| { - if matches!(rx.has_changed(), Ok(true)) { - session_id = *rx.borrow_and_update(); - info!(%session_id, "changed session"); - idle_timeout.as_mut().reset(Instant::now() + idle); + if cancelled.as_mut().poll(cx).is_ready() { + info!("connection dropped"); + return Poll::Ready(()) + } + + match rx.has_changed() { + Ok(true) => { + session_id = *rx.borrow_and_update(); + info!(%session_id, "changed session"); + idle_timeout.as_mut().reset(Instant::now() + idle); + } + Err(_) => { + info!("connection dropped"); + return Poll::Ready(()) + } + _ => {} } // 5 minute idle connection timeout @@ -551,6 +569,7 @@ pub fn poll_client( let inner = ClientInner { inner: client, session: tx, + cancel, aux, conn_id, }; @@ -560,10 +579,18 @@ pub fn poll_client( struct ClientInner { inner: C, session: tokio::sync::watch::Sender, + cancel: CancellationToken, aux: MetricsAuxInfo, conn_id: uuid::Uuid, } +impl Drop for ClientInner { + fn drop(&mut self) { + // on client drop, tell the conn to shut down + self.cancel.cancel(); + } +} + pub trait ClientInnerExt: Sync + Send + 'static { fn is_closed(&self) -> bool; fn get_process_id(&self) -> i32; @@ -716,6 +743,7 @@ mod tests { ClientInner { inner: client, session: tokio::sync::watch::Sender::new(uuid::Uuid::new_v4()), + cancel: CancellationToken::new(), aux: MetricsAuxInfo { endpoint_id: (&EndpointId::from("endpoint")).into(), project_id: (&ProjectId::from("project")).into(), From 94505fd67288e0301c32763348c7b75f0b63e514 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 11 Apr 2024 23:35:30 +0100 Subject: [PATCH 0559/1571] CI: speed up Allure reports upload (#7362) ## Problem `create-test-report` job takes more than 8 minutes, the longest step is uploading Allure report to S3: Before: ``` + aws s3 cp --recursive --only-show-errors /tmp/pr-7362-1712847045/report s3://neon-github-public-dev/reports/pr-7362/8647730612 real 6m10.572s user 6m37.717s sys 1m9.429s ``` After: ``` + s5cmd --log error cp '/tmp/pr-7362-1712858221/report/*' s3://neon-github-public-dev/reports/pr-7362/8650636861/ real 0m9.698s user 1m9.438s sys 0m6.419s ``` ## Summary of changes 
- Add `s5cmd`(https://github.com/peak/s5cmd) to build-tools image - Use `s5cmd` instead of `aws s3` for uploading Allure reports --- .github/actions/allure-report-generate/action.yml | 2 +- Dockerfile.build-tools | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index 1ecb5ecc7e..f84beff20c 100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -150,7 +150,7 @@ runs: # Use aws s3 cp (instead of aws s3 sync) to keep files from previous runs to make old URLs work, # and to keep files on the host to upload them to the database - time aws s3 cp --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}" + time s5cmd --log error cp "${WORKDIR}/report/*" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}/" # Generate redirect cat < ${WORKDIR}/index.html diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index 1ed6f87473..a082f15c34 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -58,6 +58,12 @@ RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v$ && mv protoc/include/google /usr/local/include/google \ && rm -rf protoc.zip protoc +# s5cmd +ENV S5CMD_VERSION=2.2.2 +RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/s5cmd_${S5CMD_VERSION}_Linux-$(uname -m | sed 's/x86_64/64bit/g' | sed 's/aarch64/arm64/g').tar.gz" | tar zxvf - s5cmd \ + && chmod +x s5cmd \ + && mv s5cmd /usr/local/bin/s5cmd + # LLVM ENV LLVM_VERSION=17 RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ From e8338c60f9c048e27c38fb8212ac96b542cbfcff Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Thu, 11 Apr 2024 23:42:18 -0500 Subject: [PATCH 0560/1571] Fix typo in pg_ctl shutdown mode (#7365) The allowed modes as of Postgres 17 are: smart, fast, and immediate. $ cargo neon stop Finished dev [unoptimized + debuginfo] target(s) in 0.24s Running `target/debug/neon_local stop` postgres stop failed: pg_ctl failed, exit code: exit status: 1, stdout: , stderr: pg_ctl: unrecognized shutdown mode "fast " Try "pg_ctl --help" for more information. --- control_plane/src/bin/neon_local.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 56495dd2da..68a5474c87 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -1231,7 +1231,7 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { match ComputeControlPlane::load(env.clone()) { Ok(cplane) => { for (_k, node) in cplane.endpoints { - if let Err(e) = node.stop(if immediate { "immediate" } else { "fast " }, false) { + if let Err(e) = node.stop(if immediate { "immediate" } else { "fast" }, false) { eprintln!("postgres stop failed: {e:#}"); } } From 5288f9621e2c84e912ca972e3a7bbf597884be49 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 12 Apr 2024 10:15:40 +0100 Subject: [PATCH 0561/1571] build(deps): bump idna from 3.3 to 3.7 (#7367) --- poetry.lock | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index 7b49daf42a..aca88073a8 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. 
+# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. [[package]] name = "aiohttp" @@ -1191,13 +1191,13 @@ files = [ [[package]] name = "idna" -version = "3.3" +version = "3.7" description = "Internationalized Domain Names in Applications (IDNA)" optional = false python-versions = ">=3.5" files = [ - {file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"}, - {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"}, + {file = "idna-3.7-py3-none-any.whl", hash = "sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0"}, + {file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"}, ] [[package]] @@ -2182,6 +2182,7 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -2652,6 +2653,16 @@ files = [ {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"}, {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"}, {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"}, + {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"}, + {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = 
"sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"}, + {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"}, + {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"}, From 83cdbbb89aa939a54c8388cfc4b0294831626467 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 15 Apr 2024 13:50:26 +0300 Subject: [PATCH 0562/1571] pageserver: improve readability of shard.rs (#7330) No functional changes, this is a comments/naming PR. While merging sharding changes, some cleanup of the shard.rs types was deferred. In this PR: - Rename `is_zero` to `is_shard_zero` to make clear that this method doesn't literally mean that the entire object is zeros, just that it refers to the 0th shard in a tenant. - Pull definitions of types to the top of shard.rs and add a big comment giving an overview of which type is for what. Closes: https://github.com/neondatabase/neon/issues/6072 --- libs/pageserver_api/src/shard.rs | 149 +++++++++++------- pageserver/src/consumption_metrics.rs | 2 +- pageserver/src/consumption_metrics/metrics.rs | 2 +- pageserver/src/http/routes.rs | 6 +- pageserver/src/metrics.rs | 2 +- pageserver/src/tenant.rs | 4 +- .../tenant/remote_timeline_client/upload.rs | 2 +- pageserver/src/tenant/timeline.rs | 6 +- .../src/tenant/timeline/eviction_task.rs | 2 +- .../walreceiver/walreceiver_connection.rs | 2 +- pageserver/src/walingest.rs | 2 +- storage_controller/src/service.rs | 6 +- 12 files changed, 114 insertions(+), 71 deletions(-) diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index a2a9165184..c293ad705b 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -8,12 +8,89 @@ use hex::FromHex; use serde::{Deserialize, Serialize}; use utils::id::TenantId; +/// See docs/rfcs/031-sharding-static.md for an overview of sharding. +/// +/// This module contains a variety of types used to represent the concept of sharding +/// a Neon tenant across multiple physical shards. Since there are quite a few of these, +/// we provide an summary here. +/// +/// Types used to describe shards: +/// - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value +/// which identifies a tenant which is not shard-aware. This means its storage paths do not include +/// a shard suffix. +/// - [`ShardNumber`] is simply the zero-based index of a shard within a tenant. +/// - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId` +/// without the tenant ID. This is useful for things that are implicitly scoped to a particular +/// tenant, such as layer files. +/// - [`ShardIdentity`]` is the full description of a particular shard's parameters, in sufficient +/// detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read. +/// - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as +/// four hex digits. An unsharded tenant is `0000`. 
+/// - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant +/// +/// Types used to describe the parameters for data distribution in a sharded tenant: +/// - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across +/// multiple shards. Its value is given in 8kiB pages. +/// - [`ShardLayout`] describes the data distribution scheme, and at time of writing is +/// always zero: this is provided for future upgrades that might introduce different +/// data distribution schemes. +/// +/// Examples: +/// - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000 +/// - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001 +/// - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive), +/// and their slugs are 0004, 0104, 0204, and 0304. + #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] pub struct ShardNumber(pub u8); #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] pub struct ShardCount(u8); +/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant, +/// when we need to know which shard we're dealing with, but do not need to know the full +/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know +/// the fully qualified TenantShardId. +#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] +pub struct ShardIndex { + pub shard_number: ShardNumber, + pub shard_count: ShardCount, +} + +/// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`], +/// and to check whether that [`ShardNumber`] is the same as the current shard. +#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] +pub struct ShardIdentity { + pub number: ShardNumber, + pub count: ShardCount, + pub stripe_size: ShardStripeSize, + layout: ShardLayout, +} + +/// Formatting helper, for generating the `shard_id` label in traces. +struct ShardSlug<'a>(&'a TenantShardId); + +/// TenantShardId globally identifies a particular shard in a particular tenant. +/// +/// These are written as `-`, for example: +/// # The second shard in a two-shard tenant +/// 072f1291a5310026820b2fe4b2968934-0102 +/// +/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without +/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables +/// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`]. +/// +/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs, +/// is both forward and backward compatible with TenantId: a legacy TenantId can be +/// decoded as a TenantShardId, and when re-encoded it will be parseable +/// as a TenantId. +#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] +pub struct TenantShardId { + pub tenant_id: TenantId, + pub shard_number: ShardNumber, + pub shard_count: ShardCount, +} + impl ShardCount { pub const MAX: Self = Self(u8::MAX); @@ -38,6 +115,7 @@ impl ShardCount { self.0 } + /// pub fn is_unsharded(&self) -> bool { self.0 == 0 } @@ -53,33 +131,6 @@ impl ShardNumber { pub const MAX: Self = Self(u8::MAX); } -/// TenantShardId identify the units of work for the Pageserver. 
-/// -/// These are written as `-`, for example: -/// -/// # The second shard in a two-shard tenant -/// 072f1291a5310026820b2fe4b2968934-0102 -/// -/// Historically, tenants could not have multiple shards, and were identified -/// by TenantId. To support this, TenantShardId has a special legacy -/// mode where `shard_count` is equal to zero: this represents a single-sharded -/// tenant which should be written as a TenantId with no suffix. -/// -/// The human-readable encoding of TenantShardId, such as used in API URLs, -/// is both forward and backward compatible: a legacy TenantId can be -/// decoded as a TenantShardId, and when re-encoded it will be parseable -/// as a TenantId. -/// -/// Note that the binary encoding is _not_ backward compatible, because -/// at the time sharding is introduced, there are no existing binary structures -/// containing TenantId that we need to handle. -#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] -pub struct TenantShardId { - pub tenant_id: TenantId, - pub shard_number: ShardNumber, - pub shard_count: ShardCount, -} - impl TenantShardId { pub fn unsharded(tenant_id: TenantId) -> Self { Self { @@ -111,10 +162,13 @@ impl TenantShardId { } /// Convenience for code that has special behavior on the 0th shard. - pub fn is_zero(&self) -> bool { + pub fn is_shard_zero(&self) -> bool { self.shard_number == ShardNumber(0) } + /// The "unsharded" value is distinct from simply having a single shard: it represents + /// a tenant which is not shard-aware at all, and whose storage paths will not include + /// a shard suffix. pub fn is_unsharded(&self) -> bool { self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded() } @@ -150,9 +204,6 @@ impl TenantShardId { } } -/// Formatting helper -struct ShardSlug<'a>(&'a TenantShardId); - impl<'a> std::fmt::Display for ShardSlug<'a> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( @@ -222,16 +273,6 @@ impl From<[u8; 18]> for TenantShardId { } } -/// For use within the context of a particular tenant, when we need to know which -/// shard we're dealing with, but do not need to know the full ShardIdentity (because -/// we won't be doing any page->shard mapping), and do not need to know the fully qualified -/// TenantShardId. -#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] -pub struct ShardIndex { - pub shard_number: ShardNumber, - pub shard_count: ShardCount, -} - impl ShardIndex { pub fn new(number: ShardNumber, count: ShardCount) -> Self { Self { @@ -246,6 +287,9 @@ impl ShardIndex { } } + /// The "unsharded" value is distinct from simply having a single shard: it represents + /// a tenant which is not shard-aware at all, and whose storage paths will not include + /// a shard suffix. pub fn is_unsharded(&self) -> bool { self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0) } @@ -313,6 +357,8 @@ impl Serialize for TenantShardId { if serializer.is_human_readable() { serializer.collect_str(self) } else { + // Note: while human encoding of [`TenantShardId`] is backward and forward + // compatible, this binary encoding is not. let mut packed: [u8; 18] = [0; 18]; packed[0..16].clone_from_slice(&self.tenant_id.as_arr()); packed[16] = self.shard_number.0; @@ -390,16 +436,6 @@ const LAYOUT_BROKEN: ShardLayout = ShardLayout(255); /// Default stripe size in pages: 256MiB divided by 8kiB page size. 
const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8); -/// The ShardIdentity contains the information needed for one member of map -/// to resolve a key to a shard, and then check whether that shard is ==self. -#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] -pub struct ShardIdentity { - pub number: ShardNumber, - pub count: ShardCount, - pub stripe_size: ShardStripeSize, - layout: ShardLayout, -} - #[derive(thiserror::Error, Debug, PartialEq, Eq)] pub enum ShardConfigError { #[error("Invalid shard count")] @@ -439,6 +475,9 @@ impl ShardIdentity { } } + /// The "unsharded" value is distinct from simply having a single shard: it represents + /// a tenant which is not shard-aware at all, and whose storage paths will not include + /// a shard suffix. pub fn is_unsharded(&self) -> bool { self.number == ShardNumber(0) && self.count == ShardCount(0) } @@ -487,6 +526,8 @@ impl ShardIdentity { } /// Return true if the key should be ingested by this shard + /// + /// Shards must ingest _at least_ keys which return true from this check. pub fn is_key_local(&self, key: &Key) -> bool { assert!(!self.is_broken()); if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) { @@ -497,7 +538,9 @@ impl ShardIdentity { } /// Return true if the key should be discarded if found in this shard's - /// data store, e.g. during compaction after a split + /// data store, e.g. during compaction after a split. + /// + /// Shards _may_ drop keys which return false here, but are not obliged to. pub fn is_key_disposable(&self, key: &Key) -> bool { if key_is_shard0(key) { // Q: Why can't we dispose of shard0 content if we're not shard 0? @@ -523,7 +566,7 @@ impl ShardIdentity { /// Convenience for checking if this identity is the 0th shard in a tenant, /// for special cases on shard 0 such as ingesting relation sizes. - pub fn is_zero(&self) -> bool { + pub fn is_shard_zero(&self) -> bool { self.number == ShardNumber(0) } } diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index f5540e896f..62bbde42f4 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -304,7 +304,7 @@ async fn calculate_synthetic_size_worker( continue; } - if !tenant_shard_id.is_zero() { + if !tenant_shard_id.is_shard_zero() { // We only send consumption metrics from shard 0, so don't waste time calculating // synthetic size on other shards. 
continue; diff --git a/pageserver/src/consumption_metrics/metrics.rs b/pageserver/src/consumption_metrics/metrics.rs index 6740c1360b..7ba2d04c4f 100644 --- a/pageserver/src/consumption_metrics/metrics.rs +++ b/pageserver/src/consumption_metrics/metrics.rs @@ -199,7 +199,7 @@ pub(super) async fn collect_all_metrics( }; let tenants = futures::stream::iter(tenants).filter_map(|(id, state, _)| async move { - if state != TenantState::Active || !id.is_zero() { + if state != TenantState::Active || !id.is_shard_zero() { None } else { tenant_manager diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 47d8ae1148..0b8c991f11 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -696,7 +696,7 @@ async fn get_lsn_by_timestamp_handler( check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); - if !tenant_shard_id.is_zero() { + if !tenant_shard_id.is_shard_zero() { // Requires SLRU contents, which are only stored on shard zero return Err(ApiError::BadRequest(anyhow!( "Size calculations are only available on shard zero" @@ -747,7 +747,7 @@ async fn get_timestamp_of_lsn_handler( check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); - if !tenant_shard_id.is_zero() { + if !tenant_shard_id.is_shard_zero() { // Requires SLRU contents, which are only stored on shard zero return Err(ApiError::BadRequest(anyhow!( "Size calculations are only available on shard zero" @@ -1086,7 +1086,7 @@ async fn tenant_size_handler( let headers = request.headers(); let state = get_state(&request); - if !tenant_shard_id.is_zero() { + if !tenant_shard_id.is_shard_zero() { return Err(ApiError::BadRequest(anyhow!( "Size calculations are only available on shard zero" ))); diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 3160f204e2..6755c15c30 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -2089,7 +2089,7 @@ impl TimelineMetrics { pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) { // Only shard zero deals in synthetic sizes - if tenant_shard_id.is_zero() { + if tenant_shard_id.is_shard_zero() { let tid = tenant_shard_id.tenant_id.to_string(); let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]); } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 17ff033e00..2eac1247f7 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3190,7 +3190,7 @@ impl Tenant { run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel).await?; // Upload the created data dir to S3 - if self.tenant_shard_id().is_zero() { + if self.tenant_shard_id().is_shard_zero() { self.upload_initdb(&timelines_path, &pgdata_path, &timeline_id) .await?; } @@ -3437,7 +3437,7 @@ impl Tenant { .store(size, Ordering::Relaxed); // Only shard zero should be calculating synthetic sizes - debug_assert!(self.shard_identity.is_zero()); + debug_assert!(self.shard_identity.is_shard_zero()); TENANT_SYNTHETIC_SIZE_METRIC .get_metric_with_label_values(&[&self.tenant_shard_id.tenant_id.to_string()]) diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index 137fe48b73..0227331953 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -167,7 +167,7 @@ pub(crate) async fn time_travel_recover_tenant( let warn_after = 3; let max_attempts = 10; let mut prefixes = Vec::with_capacity(2); - if 
tenant_shard_id.is_zero() { + if tenant_shard_id.is_shard_zero() { // Also recover the unsharded prefix for a shard of zero: // - if the tenant is totally unsharded, the unsharded prefix contains all the data // - if the tenant is sharded, we still want to recover the initdb data, but we only diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d046a60af4..46b3d41e2b 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1344,7 +1344,7 @@ impl Timeline { background_jobs_can_start: Option<&completion::Barrier>, ctx: &RequestContext, ) { - if self.tenant_shard_id.is_zero() { + if self.tenant_shard_id.is_shard_zero() { // Logical size is only maintained accurately on shard zero. self.spawn_initial_logical_size_computation_task(ctx); } @@ -2237,7 +2237,7 @@ impl Timeline { priority: GetLogicalSizePriority, ctx: &RequestContext, ) -> logical_size::CurrentLogicalSize { - if !self.tenant_shard_id.is_zero() { + if !self.tenant_shard_id.is_shard_zero() { // Logical size is only accurately maintained on shard zero: when called elsewhere, for example // when HTTP API is serving a GET for timeline zero, return zero return logical_size::CurrentLogicalSize::Approximate(logical_size::Approximate::zero()); @@ -2533,7 +2533,7 @@ impl Timeline { crate::span::debug_assert_current_span_has_tenant_and_timeline_id(); // We should never be calculating logical sizes on shard !=0, because these shards do not have // accurate relation sizes, and they do not emit consumption metrics. - debug_assert!(self.tenant_shard_id.is_zero()); + debug_assert!(self.tenant_shard_id.is_shard_zero()); let guard = self .gate diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 522c5b57de..304d0d60ee 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -378,7 +378,7 @@ impl Timeline { gate: &GateGuard, ctx: &RequestContext, ) -> ControlFlow<()> { - if !self.tenant_shard_id.is_zero() { + if !self.tenant_shard_id.is_shard_zero() { // Shards !=0 do not maintain accurate relation sizes, and do not need to calculate logical size // for consumption metrics (consumption metrics are only sent from shard 0). We may therefore // skip imitating logical size accesses for eviction purposes. diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 3f3419e886..c6ee6b90c4 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -427,7 +427,7 @@ pub(super) async fn handle_walreceiver_connection( // Send the replication feedback message. // Regular standby_status_update fields are put into this message. - let current_timeline_size = if timeline.tenant_shard_id.is_zero() { + let current_timeline_size = if timeline.tenant_shard_id.is_shard_zero() { timeline .get_current_logical_size( crate::tenant::timeline::GetLogicalSizePriority::User, diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 9c7e8748d5..4f83b118ae 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -403,7 +403,7 @@ impl WalIngest { ); if !key_is_local { - if self.shard.is_zero() { + if self.shard.is_shard_zero() { // Shard 0 tracks relation sizes. 
Although we will not store this block, we will observe // its blkno in case it implicitly extends a relation. self.observe_decoded_block(modification, blk, ctx).await?; diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 010558b797..4ee189dac9 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -2744,7 +2744,7 @@ impl Service { let mut describe_shards = Vec::new(); for shard in shards { - if shard.tenant_shard_id.is_zero() { + if shard.tenant_shard_id.is_shard_zero() { shard_zero = Some(shard); } @@ -4084,7 +4084,7 @@ impl Service { let mut reconciles_spawned = 0; for (tenant_shard_id, shard) in tenants.iter_mut() { - if tenant_shard_id.is_zero() { + if tenant_shard_id.is_shard_zero() { schedule_context = ScheduleContext::default(); } @@ -4134,7 +4134,7 @@ impl Service { let mut work = Vec::new(); for (tenant_shard_id, shard) in tenants.iter() { - if tenant_shard_id.is_zero() { + if tenant_shard_id.is_shard_zero() { // Reset accumulators on the first shard in a tenant schedule_context = ScheduleContext::default(); tenant_shards.clear(); From f752c40f58dc854a9b0ba9a03164e8d91e95b5b3 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 15 Apr 2024 16:05:44 +0300 Subject: [PATCH 0563/1571] storage release: stop using no-op deployProxy / deployPgSniRouter (#7382) As of https://github.com/neondatabase/aws/pull/1264 these options are no-ops. This PR unblocks removal of the variables in https://github.com/neondatabase/aws/pull/1263 --- .github/workflows/build_and_test.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 36922d5294..1d35fa9223 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1133,8 +1133,6 @@ jobs: -f deployPreprodRegion=true gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \ - -f deployPgSniRouter=false \ - -f deployProxy=false \ -f deployStorage=true \ -f deployStorageBroker=true \ -f deployStorageController=true \ From 110282ee7ea43f1aef4164fa947382d9801e11a0 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Mon, 15 Apr 2024 20:21:50 +0200 Subject: [PATCH 0564/1571] proxy: Exclude private ip errors from recorded metrics (#7389) ## Problem Right now we record errors from internal VPC. ## Summary of changes * Exclude it from the metrics. 
* Simplify pg-sni-router --- proxy/src/bin/pg_sni_router.rs | 27 +++++++++++++-------------- proxy/src/context.rs | 12 +++++++++++- proxy/src/proxy.rs | 4 +++- proxy/src/proxy/copy_bidirectional.rs | 2 +- proxy/src/proxy/handshake.rs | 5 ++++- proxy/src/proxy/tests.rs | 2 +- proxy/src/proxy/tests/mitm.rs | 5 ++++- proxy/src/serverless.rs | 12 ++++++++++-- proxy/src/stream.rs | 12 ++++++++++-- 9 files changed, 57 insertions(+), 24 deletions(-) diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 58737efe46..7a693002a8 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -9,15 +9,13 @@ use futures::future::Either; use itertools::Itertools; use proxy::config::TlsServerEndPoint; use proxy::context::RequestMonitoring; -use proxy::proxy::run_until_cancelled; -use proxy::{BranchId, EndpointId, ProjectId}; +use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled}; use rustls::pki_types::PrivateKeyDer; use tokio::net::TcpListener; use anyhow::{anyhow, bail, ensure, Context}; use clap::Arg; use futures::TryFutureExt; -use proxy::console::messages::MetricsAuxInfo; use proxy::stream::{PqStream, Stream}; use tokio::io::{AsyncRead, AsyncWrite}; @@ -204,6 +202,7 @@ async fn task_main( const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; async fn ssl_handshake( + ctx: &mut RequestMonitoring, raw_stream: S, tls_config: Arc, tls_server_end_point: TlsServerEndPoint, @@ -233,7 +232,10 @@ async fn ssl_handshake( } Ok(Stream::Tls { - tls: Box::new(raw.upgrade(tls_config).await?), + tls: Box::new( + raw.upgrade(tls_config, !ctx.has_private_peer_addr()) + .await?, + ), tls_server_end_point, }) } @@ -256,7 +258,7 @@ async fn handle_client( tls_server_end_point: TlsServerEndPoint, stream: impl AsyncRead + AsyncWrite + Unpin, ) -> anyhow::Result<()> { - let tls_stream = ssl_handshake(stream, tls_config, tls_server_end_point).await?; + let mut tls_stream = ssl_handshake(&mut ctx, stream, tls_config, tls_server_end_point).await?; // Cut off first part of the SNI domain // We receive required destination details in the format of @@ -273,18 +275,15 @@ async fn handle_client( info!("destination: {}", destination); - let client = tokio::net::TcpStream::connect(destination).await?; - - let metrics_aux: MetricsAuxInfo = MetricsAuxInfo { - endpoint_id: (&EndpointId::from("")).into(), - project_id: (&ProjectId::from("")).into(), - branch_id: (&BranchId::from("")).into(), - cold_start_info: proxy::console::messages::ColdStartInfo::Unknown, - }; + let mut client = tokio::net::TcpStream::connect(destination).await?; // doesn't yet matter as pg-sni-router doesn't report analytics logs ctx.set_success(); ctx.log(); - proxy::proxy::passthrough::proxy_pass(tls_stream, client, metrics_aux).await + // Starting from here we only proxy the client's traffic. + info!("performing the proxy pass..."); + let _ = copy_bidirectional_client_compute(&mut tls_stream, &mut client).await?; + + Ok(()) } diff --git a/proxy/src/context.rs b/proxy/src/context.rs index dc475d57ed..d7b5be5534 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -164,8 +164,18 @@ impl RequestMonitoring { self.auth_method = Some(auth_method); } + pub fn has_private_peer_addr(&self) -> bool { + match self.peer_addr { + IpAddr::V4(ip) => ip.is_private(), + _ => false, + } + } + pub fn set_error_kind(&mut self, kind: ErrorKind) { - Metrics::get().proxy.errors_total.inc(kind); + // Do not record errors from the private address to metrics. 
+ if !self.has_private_peer_addr() { + Metrics::get().proxy.errors_total.inc(kind); + } if let Some(ep) = &self.endpoint_id { let metric = &Metrics::get().proxy.endpoints_affected_by_errors; let label = metric.with_labels(kind); diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 42fb10b326..f80ced91c8 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -7,6 +7,7 @@ pub mod handshake; pub mod passthrough; pub mod retry; pub mod wake_compute; +pub use copy_bidirectional::copy_bidirectional_client_compute; use crate::{ auth, @@ -256,8 +257,9 @@ pub async fn handle_client( let tls = config.tls_config.as_ref(); + let record_handshake_error = !ctx.has_private_peer_addr(); let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Client); - let do_handshake = handshake(stream, mode.handshake_tls(tls)); + let do_handshake = handshake(stream, mode.handshake_tls(tls), record_handshake_error); let (mut stream, params) = match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? { HandshakeData::Startup(stream, params) => (stream, params), diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs index 684be74f9a..4b09ebd8dc 100644 --- a/proxy/src/proxy/copy_bidirectional.rs +++ b/proxy/src/proxy/copy_bidirectional.rs @@ -41,7 +41,7 @@ where } #[tracing::instrument(skip_all)] -pub(super) async fn copy_bidirectional_client_compute( +pub async fn copy_bidirectional_client_compute( client: &mut Client, compute: &mut Compute, ) -> Result<(u64, u64), std::io::Error> diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs index 4665e07d23..dd935cc245 100644 --- a/proxy/src/proxy/handshake.rs +++ b/proxy/src/proxy/handshake.rs @@ -63,6 +63,7 @@ pub enum HandshakeData { pub async fn handshake( stream: S, mut tls: Option<&TlsConfig>, + record_handshake_error: bool, ) -> Result, HandshakeError> { // Client may try upgrading to each protocol only once let (mut tried_ssl, mut tried_gss) = (false, false); @@ -95,7 +96,9 @@ pub async fn handshake( if !read_buf.is_empty() { return Err(HandshakeError::EarlyData); } - let tls_stream = raw.upgrade(tls.to_server_config()).await?; + let tls_stream = raw + .upgrade(tls.to_server_config(), record_handshake_error) + .await?; let (_, tls_server_end_point) = tls .cert_resolver diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 71d85e106d..849e9bd33c 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -175,7 +175,7 @@ async fn dummy_proxy( auth: impl TestAuth + Send, ) -> anyhow::Result<()> { let client = WithClientIp::new(client); - let mut stream = match handshake(client, tls.as_ref()).await? { + let mut stream = match handshake(client, tls.as_ref(), false).await? 
{ HandshakeData::Startup(stream, _) => stream, HandshakeData::Cancel(_) => bail!("cancellation not supported"), }; diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index 3b760e5dab..cbfc9f1358 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -34,7 +34,10 @@ async fn proxy_mitm( tokio::spawn(async move { // begin handshake with end_server let end_server = connect_tls(server2, client_config2.make_tls_connect().unwrap()).await; - let (end_client, startup) = match handshake(client1, Some(&server_config1)).await.unwrap() { + let (end_client, startup) = match handshake(client1, Some(&server_config1), false) + .await + .unwrap() + { HandshakeData::Startup(stream, params) => (stream, params), HandshakeData::Cancel(_) => panic!("cancellation not supported"), }; diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index 24c94fadd8..f3c42cdb01 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -172,6 +172,10 @@ async fn connection_handler( }; let peer_addr = peer.unwrap_or(peer_addr).ip(); + let has_private_peer_addr = match peer_addr { + IpAddr::V4(ip) => ip.is_private(), + _ => false, + }; info!(?session_id, %peer_addr, "accepted new TCP connection"); // try upgrade to TLS, but with a timeout. @@ -182,13 +186,17 @@ async fn connection_handler( } // The handshake failed Ok(Err(e)) => { - Metrics::get().proxy.tls_handshake_failures.inc(); + if !has_private_peer_addr { + Metrics::get().proxy.tls_handshake_failures.inc(); + } warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}"); return; } // The handshake timed out Err(e) => { - Metrics::get().proxy.tls_handshake_failures.inc(); + if !has_private_peer_addr { + Metrics::get().proxy.tls_handshake_failures.inc(); + } warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}"); return; } diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index fdd2be3ee5..690e92ffb1 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -223,12 +223,20 @@ pub enum StreamUpgradeError { impl Stream { /// If possible, upgrade raw stream into a secure TLS-based stream. - pub async fn upgrade(self, cfg: Arc) -> Result, StreamUpgradeError> { + pub async fn upgrade( + self, + cfg: Arc, + record_handshake_error: bool, + ) -> Result, StreamUpgradeError> { match self { Stream::Raw { raw } => Ok(tokio_rustls::TlsAcceptor::from(cfg) .accept(raw) .await - .inspect_err(|_| Metrics::get().proxy.tls_handshake_failures.inc())?), + .inspect_err(|_| { + if record_handshake_error { + Metrics::get().proxy.tls_handshake_failures.inc() + } + })?), Stream::Tls { .. } => Err(StreamUpgradeError::AlreadyTls), } } From 2d5a8462c8093fb7db7e15cea68c6d740818c39c Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 15 Apr 2024 22:14:42 +0200 Subject: [PATCH 0565/1571] add `async` walredo mode (disabled-by-default, opt-in via config) (#6548) Before this PR, the `nix::poll::poll` call would stall the executor. This PR refactors the `walredo::process` module to allow for different implementations, and adds a new `async` implementation which uses `tokio::process::ChildStd{in,out}` for IPC. The `sync` variant remains the default for now; we'll do more testing in staging and gradual rollout to prod using the config variable. Performance ----------- I updated `bench_walredo.rs`, demonstrating that a single `async`-based walredo manager used by N=1...128 tokio tasks has lower latency and higher throughput. 
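For orientation, the async mode boils down to driving the walredo child's piped stdin/stdout through `tokio::io` instead of blocking syscalls. A minimal sketch under assumed names (the real code lives in the new `walredo/process/process_impl/process_async.rs` and speaks the framed walredo protocol, not raw byte buffers):

```
use tokio::io::{AsyncReadExt, AsyncWriteExt};
use tokio::process::{Child, Command};

// Spawn the redo child with piped stdin/stdout (binary path is illustrative).
fn spawn_redo_child(path: &str) -> std::io::Result<Child> {
    Command::new(path)
        .stdin(std::process::Stdio::piped())
        .stdout(std::process::Stdio::piped())
        .spawn()
}

// One request/response roundtrip: awaiting the pipe I/O yields the executor
// thread back to tokio instead of stalling it the way nix::poll::poll did.
async fn roundtrip(
    child: &mut Child,
    request: &[u8],
    page_size: usize,
) -> std::io::Result<Vec<u8>> {
    child.stdin.as_mut().expect("piped").write_all(request).await?;
    let mut page = vec![0u8; page_size];
    child.stdout.as_mut().expect("piped").read_exact(&mut page).await?;
    Ok(page)
}
```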
I further did manual less-micro-benchmarking in the real pageserver binary. Methodology & results are published here: https://neondatabase.notion.site/2024-04-08-async-walredo-benchmarking-8c0ed3cc8d364a44937c4cb50b6d7019?pvs=4 tl;dr: - use pagebench against a pageserver patched to answer getpage request & small-enough working set to fit into PS PageCache / kernel page cache. - compare knee in the latency/throughput curve - N tenants, each 1 pagebench clients - sync better throughput at N < 30, async better at higher N - async generally noticable but not much worse p99.X tail latencies - eyeballing CPU efficiency in htop, `async` seems significantly more CPU efficient at ca N=[0.5*ncpus, 1.5*ncpus], worse than `sync` outside of that band Mental Model For Walredo & Scheduler Interactions ------------------------------------------------- Walredo is CPU-/DRAM-only work. This means that as soon as the Pageserver writes to the pipe, the walredo process becomes runnable. To the Linux kernel scheduler, the `$ncpus` executor threads and the walredo process thread are just `struct task_struct`, and it will divide CPU time fairly among them. In `sync` mode, there are always `$ncpus` runnable `struct task_struct` because the executor thread blocks while `walredo` runs, and the executor thread becomes runnable when the `walredo` process is done handling the request. In `async` mode, the executor threads remain runnable unless there are no more runnable tokio tasks, which is unlikely in a production pageserver. The above means that in `sync` mode, there is an implicit concurrency limit on concurrent walredo requests (`$num_runtimes * $num_executor_threads_per_runtime`). And executor threads do not compete in the Linux kernel scheduler for CPU time, due to the blocked-runnable-ping-pong. In `async` mode, there is no concurrency limit, and the walredo tasks compete with the executor threads for CPU time in the kernel scheduler. If we're not CPU-bound, `async` has a pipelining and hence throughput advantage over `sync` because one executor thread can continue processing requests while a walredo request is in flight. If we're CPU-bound, under a fair CPU scheduler, the *fixed* number of executor threads has to share CPU time with the aggregate of walredo processes. It's trivial to reason about this in `sync` mode due to the blocked-runnable-ping-pong. In `async` mode, at 100% CPU, the system arrives at some (potentially sub-optiomal) equilibrium where the executor threads get just enough CPU time to fill up the remaining CPU time with runnable walredo process. Why `async` mode Doesn't Limit Walredo Concurrency -------------------------------------------------- To control that equilibrium in `async` mode, one may add a tokio semaphore to limit the number of in-flight walredo requests. However, the placement of such a semaphore is non-trivial because it means that tasks queuing up behind it hold on to their request-scoped allocations. In the case of walredo, that might be the entire reconstruct data. We don't limit the number of total inflight Timeline::get (we only throttle admission). So, that queue might lead to an OOM. The alternative is to acquire the semaphore permit *before* collecting reconstruct data. However, what if we need to on-demand download? A combination of semaphores might help: one for reconstruct data, one for walredo. The reconstruct data semaphore permit is dropped after acquiring the walredo semaphore permit. 
This scheme effectively enables both a limit on in-flight reconstruct data and walredo concurrency. However, sizing the amount of permits for the semaphores is tricky: - Reconstruct data retrieval is a mix of disk IO and CPU work. - If we need to do on-demand downloads, it's network IO + disk IO + CPU work. - At this time, we have no good data on how the wall clock time is distributed. It turns out that, in my benchmarking, the system worked fine without a semaphore. So, we're shipping async walredo without one for now. Future Work ----------- We will do more testing of `async` mode and gradual rollout to prod using the config flag. Once that is done, we'll remove `sync` mode to avoid the temporary code duplication introduced by this PR. The flag will be removed. The `wait()` for the child process to exit is still synchronous; the comment [here]( https://github.com/neondatabase/neon/blob/655d3b64681b6562530665c9ab5f2f806f30ad01/pageserver/src/walredo.rs#L294-L306) is still a valid argument in favor of that. The `sync` mode had another implicit advantage: from tokio's perspective, the calling task was using up coop budget. But with `async` mode, that's no longer the case -- to tokio, the writes to the child process pipe look like IO. We could/should inform tokio about the CPU time budget consumed by the task to achieve fairness similar to `sync`. However, the [runtime function for this is `tokio_unstable`](`https://docs.rs/tokio/latest/tokio/task/fn.consume_budget.html). Refs ---- refs #6628 refs https://github.com/neondatabase/neon/issues/2975 --- libs/pageserver_api/src/models.rs | 10 +- libs/utils/src/lib.rs | 2 + libs/utils/src/poison.rs | 121 +++++ pageserver/benches/bench_walredo.rs | 147 ++++-- pageserver/src/bin/pageserver.rs | 1 + pageserver/src/config.rs | 25 +- pageserver/src/metrics.rs | 23 + pageserver/src/tenant.rs | 2 +- pageserver/src/walredo.rs | 65 +-- pageserver/src/walredo/process.rs | 435 +++--------------- .../process/process_impl/process_async.rs | 374 +++++++++++++++ .../process/process_impl/process_std.rs | 405 ++++++++++++++++ test_runner/regress/test_pageserver_config.py | 35 ++ 13 files changed, 1187 insertions(+), 458 deletions(-) create mode 100644 libs/utils/src/poison.rs create mode 100644 pageserver/src/walredo/process/process_impl/process_async.rs create mode 100644 pageserver/src/walredo/process/process_impl/process_std.rs create mode 100644 test_runner/regress/test_pageserver_config.py diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index b4909f247f..f441d1ff1a 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -747,10 +747,18 @@ pub struct TimelineGcRequest { pub gc_horizon: Option, } +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct WalRedoManagerProcessStatus { + pub pid: u32, + /// The strum-generated `into::<&'static str>()` for `pageserver::walredo::ProcessKind`. + /// `ProcessKind` are a transitory thing, so, they have no enum representation in `pageserver_api`. + pub kind: Cow<'static, str>, +} + #[derive(Debug, Clone, Serialize, Deserialize)] pub struct WalRedoManagerStatus { pub last_redo_at: Option>, - pub pid: Option, + pub process: Option, } /// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. 
initiating diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index b09350d11e..2953f0aad4 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -92,6 +92,8 @@ pub mod zstd; pub mod env; +pub mod poison; + /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages /// /// we have several cases: diff --git a/libs/utils/src/poison.rs b/libs/utils/src/poison.rs new file mode 100644 index 0000000000..0bf5664f47 --- /dev/null +++ b/libs/utils/src/poison.rs @@ -0,0 +1,121 @@ +//! Protect a piece of state from reuse after it is left in an inconsistent state. +//! +//! # Example +//! +//! ``` +//! # tokio_test::block_on(async { +//! use utils::poison::Poison; +//! use std::time::Duration; +//! +//! struct State { +//! clean: bool, +//! } +//! let state = tokio::sync::Mutex::new(Poison::new("mystate", State { clean: true })); +//! +//! let mut mutex_guard = state.lock().await; +//! let mut poison_guard = mutex_guard.check_and_arm()?; +//! let state = poison_guard.data_mut(); +//! state.clean = false; +//! // If we get cancelled at this await point, subsequent check_and_arm() calls will fail. +//! tokio::time::sleep(Duration::from_secs(10)).await; +//! state.clean = true; +//! poison_guard.disarm(); +//! # Ok::<(), utils::poison::Error>(()) +//! # }); +//! ``` + +use tracing::warn; + +pub struct Poison { + what: &'static str, + state: State, + data: T, +} + +#[derive(Clone, Copy)] +enum State { + Clean, + Armed, + Poisoned { at: chrono::DateTime }, +} + +impl Poison { + /// We log `what` `warning!` level if the [`Guard`] gets dropped without being [`Guard::disarm`]ed. + pub fn new(what: &'static str, data: T) -> Self { + Self { + what, + state: State::Clean, + data, + } + } + + /// Check for poisoning and return a [`Guard`] that provides access to the wrapped state. + pub fn check_and_arm(&mut self) -> Result, Error> { + match self.state { + State::Clean => { + self.state = State::Armed; + Ok(Guard(self)) + } + State::Armed => unreachable!("transient state"), + State::Poisoned { at } => Err(Error::Poisoned { + what: self.what, + at, + }), + } + } +} + +/// Use [`Self::data`] and [`Self::data_mut`] to access the wrapped state. +/// Once modifications are done, use [`Self::disarm`]. +/// If [`Guard`] gets dropped instead of calling [`Self::disarm`], the state is poisoned +/// and subsequent calls to [`Poison::check_and_arm`] will fail with an error. 
+pub struct Guard<'a, T>(&'a mut Poison); + +impl<'a, T> Guard<'a, T> { + pub fn data(&self) -> &T { + &self.0.data + } + pub fn data_mut(&mut self) -> &mut T { + &mut self.0.data + } + + pub fn disarm(self) { + match self.0.state { + State::Clean => unreachable!("we set it to Armed in check_and_arm()"), + State::Armed => { + self.0.state = State::Clean; + } + State::Poisoned { at } => { + unreachable!("we fail check_and_arm() if it's in that state: {at}") + } + } + } +} + +impl<'a, T> Drop for Guard<'a, T> { + fn drop(&mut self) { + match self.0.state { + State::Clean => { + // set by disarm() + } + State::Armed => { + // still armed => poison it + let at = chrono::Utc::now(); + self.0.state = State::Poisoned { at }; + warn!(at=?at, "poisoning {}", self.0.what); + } + State::Poisoned { at } => { + unreachable!("we fail check_and_arm() if it's in that state: {at}") + } + } + } +} + +#[derive(thiserror::Error, Debug)] +pub enum Error { + #[error("poisoned at {at}: {what}")] + Poisoned { + what: &'static str, + at: chrono::DateTime, + }, +} diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index ffe607be4b..5b871c5d5e 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -27,30 +27,50 @@ //! //! # Reference Numbers //! -//! 2024-04-04 on i3en.3xlarge +//! 2024-04-15 on i3en.3xlarge //! //! ```text -//! short/1 time: [25.925 µs 26.060 µs 26.209 µs] -//! short/2 time: [31.277 µs 31.483 µs 31.722 µs] -//! short/4 time: [45.496 µs 45.831 µs 46.182 µs] -//! short/8 time: [84.298 µs 84.920 µs 85.566 µs] -//! short/16 time: [185.04 µs 186.41 µs 187.88 µs] -//! short/32 time: [385.01 µs 386.77 µs 388.70 µs] -//! short/64 time: [770.24 µs 773.04 µs 776.04 µs] -//! short/128 time: [1.5017 ms 1.5064 ms 1.5113 ms] -//! medium/1 time: [106.65 µs 107.20 µs 107.85 µs] -//! medium/2 time: [153.28 µs 154.24 µs 155.56 µs] -//! medium/4 time: [325.67 µs 327.01 µs 328.71 µs] -//! medium/8 time: [646.82 µs 650.17 µs 653.91 µs] -//! medium/16 time: [1.2645 ms 1.2701 ms 1.2762 ms] -//! medium/32 time: [2.4409 ms 2.4550 ms 2.4692 ms] -//! medium/64 time: [4.6814 ms 4.7114 ms 4.7408 ms] -//! medium/128 time: [8.7790 ms 8.9037 ms 9.0282 ms] +//! async-short/1 time: [24.584 µs 24.737 µs 24.922 µs] +//! async-short/2 time: [33.479 µs 33.660 µs 33.888 µs] +//! async-short/4 time: [42.713 µs 43.046 µs 43.440 µs] +//! async-short/8 time: [71.814 µs 72.478 µs 73.240 µs] +//! async-short/16 time: [132.73 µs 134.45 µs 136.22 µs] +//! async-short/32 time: [258.31 µs 260.73 µs 263.27 µs] +//! async-short/64 time: [511.61 µs 514.44 µs 517.51 µs] +//! async-short/128 time: [992.64 µs 998.23 µs 1.0042 ms] +//! async-medium/1 time: [110.11 µs 110.50 µs 110.96 µs] +//! async-medium/2 time: [153.06 µs 153.85 µs 154.99 µs] +//! async-medium/4 time: [317.51 µs 319.92 µs 322.85 µs] +//! async-medium/8 time: [638.30 µs 644.68 µs 652.12 µs] +//! async-medium/16 time: [1.2651 ms 1.2773 ms 1.2914 ms] +//! async-medium/32 time: [2.5117 ms 2.5410 ms 2.5720 ms] +//! async-medium/64 time: [4.8088 ms 4.8555 ms 4.9047 ms] +//! async-medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms] +//! sync-short/1 time: [25.503 µs 25.626 µs 25.771 µs] +//! sync-short/2 time: [30.850 µs 31.013 µs 31.208 µs] +//! sync-short/4 time: [45.543 µs 45.856 µs 46.193 µs] +//! sync-short/8 time: [84.114 µs 84.639 µs 85.220 µs] +//! sync-short/16 time: [185.22 µs 186.15 µs 187.13 µs] +//! sync-short/32 time: [377.43 µs 378.87 µs 380.46 µs] +//! 
sync-short/64 time: [756.49 µs 759.04 µs 761.70 µs] +//! sync-short/128 time: [1.4825 ms 1.4874 ms 1.4923 ms] +//! sync-medium/1 time: [105.66 µs 106.01 µs 106.43 µs] +//! sync-medium/2 time: [153.10 µs 153.84 µs 154.72 µs] +//! sync-medium/4 time: [327.13 µs 329.44 µs 332.27 µs] +//! sync-medium/8 time: [654.26 µs 658.73 µs 663.63 µs] +//! sync-medium/16 time: [1.2682 ms 1.2748 ms 1.2816 ms] +//! sync-medium/32 time: [2.4456 ms 2.4595 ms 2.4731 ms] +//! sync-medium/64 time: [4.6523 ms 4.6890 ms 4.7256 ms] +//! sync-medium/128 time: [8.7215 ms 8.8323 ms 8.9344 ms] //! ``` use bytes::{Buf, Bytes}; use criterion::{BenchmarkId, Criterion}; -use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager}; +use pageserver::{ + config::PageServerConf, + walrecord::NeonWalRecord, + walredo::{PostgresRedoManager, ProcessKind}, +}; use pageserver_api::{key::Key, shard::TenantShardId}; use std::{ sync::Arc, @@ -60,33 +80,39 @@ use tokio::{sync::Barrier, task::JoinSet}; use utils::{id::TenantId, lsn::Lsn}; fn bench(c: &mut Criterion) { - { - let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; - for nclients in nclients { - let mut group = c.benchmark_group("short"); - group.bench_with_input( - BenchmarkId::from_parameter(nclients), - &nclients, - |b, nclients| { - let redo_work = Arc::new(Request::short_input()); - b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients)); - }, - ); + for process_kind in &[ProcessKind::Async, ProcessKind::Sync] { + { + let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; + for nclients in nclients { + let mut group = c.benchmark_group(format!("{process_kind}-short")); + group.bench_with_input( + BenchmarkId::from_parameter(nclients), + &nclients, + |b, nclients| { + let redo_work = Arc::new(Request::short_input()); + b.iter_custom(|iters| { + bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients) + }); + }, + ); + } } - } - { - let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; - for nclients in nclients { - let mut group = c.benchmark_group("medium"); - group.bench_with_input( - BenchmarkId::from_parameter(nclients), - &nclients, - |b, nclients| { - let redo_work = Arc::new(Request::medium_input()); - b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients)); - }, - ); + { + let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; + for nclients in nclients { + let mut group = c.benchmark_group(format!("{process_kind}-medium")); + group.bench_with_input( + BenchmarkId::from_parameter(nclients), + &nclients, + |b, nclients| { + let redo_work = Arc::new(Request::medium_input()); + b.iter_custom(|iters| { + bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients) + }); + }, + ); + } } } } @@ -94,10 +120,16 @@ criterion::criterion_group!(benches, bench); criterion::criterion_main!(benches); // Returns the sum of each client's wall-clock time spent executing their share of the n_redos. 
-fn bench_impl(redo_work: Arc, n_redos: u64, nclients: u64) -> Duration { +fn bench_impl( + process_kind: ProcessKind, + redo_work: Arc, + n_redos: u64, + nclients: u64, +) -> Duration { let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap(); - let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf()); + let mut conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf()); + conf.walredo_process_kind = process_kind; let conf = Box::leak(Box::new(conf)); let tenant_shard_id = TenantShardId::unsharded(TenantId::generate()); @@ -113,25 +145,40 @@ fn bench_impl(redo_work: Arc, n_redos: u64, nclients: u64) -> Duration let manager = PostgresRedoManager::new(conf, tenant_shard_id); let manager = Arc::new(manager); + // divide the amount of work equally among the clients. + let nredos_per_client = n_redos / nclients; for _ in 0..nclients { rt.block_on(async { tasks.spawn(client( Arc::clone(&manager), Arc::clone(&start), Arc::clone(&redo_work), - // divide the amount of work equally among the clients - n_redos / nclients, + nredos_per_client, )) }); } - rt.block_on(async move { - let mut total_wallclock_time = std::time::Duration::from_millis(0); + let elapsed = rt.block_on(async move { + let mut total_wallclock_time = Duration::ZERO; while let Some(res) = tasks.join_next().await { total_wallclock_time += res.unwrap(); } total_wallclock_time - }) + }); + + // consistency check to ensure process kind setting worked + if nredos_per_client > 0 { + assert_eq!( + manager + .status() + .process + .map(|p| p.kind) + .expect("the benchmark work causes a walredo process to be spawned"), + std::borrow::Cow::Borrowed(process_kind.into()) + ); + } + + elapsed } async fn client( diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 0903b206ff..41835f9843 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -285,6 +285,7 @@ fn start_pageserver( )) .unwrap(); pageserver::preinitialize_metrics(); + pageserver::metrics::wal_redo::set_process_kind_metric(conf.walredo_process_kind); // If any failpoints were set from FAILPOINTS environment variable, // print them to the log for debugging purposes diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 1837da34ce..e10db2b853 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -97,6 +97,8 @@ pub mod defaults { pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0; + pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "sync"; + /// /// Default built-in configuration file. /// @@ -140,6 +142,8 @@ pub mod defaults { #validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}' +#walredo_process_kind = '{DEFAULT_WALREDO_PROCESS_KIND}' + [tenant_config] #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} @@ -290,6 +294,8 @@ pub struct PageServerConf { /// /// Setting this to zero disables limits on total ephemeral layer size. 
pub ephemeral_bytes_per_memory_kb: usize, + + pub walredo_process_kind: crate::walredo::ProcessKind, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -413,6 +419,8 @@ struct PageServerConfigBuilder { validate_vectored_get: BuilderValue, ephemeral_bytes_per_memory_kb: BuilderValue, + + walredo_process_kind: BuilderValue, } impl PageServerConfigBuilder { @@ -500,6 +508,8 @@ impl PageServerConfigBuilder { )), validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET), ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), + + walredo_process_kind: Set(DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap()), } } } @@ -683,6 +693,10 @@ impl PageServerConfigBuilder { self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value); } + pub fn get_walredo_process_kind(&mut self, value: crate::walredo::ProcessKind) { + self.walredo_process_kind = BuilderValue::Set(value); + } + pub fn build(self) -> anyhow::Result { let default = Self::default_values(); @@ -739,6 +753,7 @@ impl PageServerConfigBuilder { max_vectored_read_bytes, validate_vectored_get, ephemeral_bytes_per_memory_kb, + walredo_process_kind, } CUSTOM LOGIC { @@ -1032,6 +1047,9 @@ impl PageServerConf { "ephemeral_bytes_per_memory_kb" => { builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize) } + "walredo_process_kind" => { + builder.get_walredo_process_kind(parse_toml_from_str("walredo_process_kind", item)?) + } _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -1114,6 +1132,7 @@ impl PageServerConf { ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, + walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(), } } } @@ -1351,7 +1370,8 @@ background_task_maximum_delay = '334 s' .expect("Invalid default constant") ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, - ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB + ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, + walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(), }, "Correct defaults should be used when no config values are provided" ); @@ -1423,7 +1443,8 @@ background_task_maximum_delay = '334 s' .expect("Invalid default constant") ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, - ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB + ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, + walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(), }, "Should be able to parse all basic config values correctly" ); diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 6755c15c30..be61a755ff 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1819,6 +1819,29 @@ impl Default for WalRedoProcessCounters { pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy = Lazy::new(WalRedoProcessCounters::default); +#[cfg(not(test))] +pub mod wal_redo { + use super::*; + + static PROCESS_KIND: Lazy> = Lazy::new(|| { + std::sync::Mutex::new( + register_uint_gauge_vec!( + "pageserver_wal_redo_process_kind", + "The configured process kind for walredo", + &["kind"], + ) + .unwrap(), + ) + }); + + pub fn set_process_kind_metric(kind: crate::walredo::ProcessKind) { + // use guard to avoid races around the next two steps + let guard = 
PROCESS_KIND.lock().unwrap(); + guard.reset(); + guard.with_label_values(&[&format!("{kind}")]).set(1); + } +} + /// Similar to `prometheus::HistogramTimer` but does not record on drop. pub(crate) struct StorageTimeMetricsTimer { metrics: StorageTimeMetrics, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 2eac1247f7..35ea037a55 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -386,7 +386,7 @@ impl WalRedoManager { pub(crate) fn status(&self) -> Option { match self { - WalRedoManager::Prod(m) => m.status(), + WalRedoManager::Prod(m) => Some(m.status()), #[cfg(test)] WalRedoManager::Test(_) => None, } diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index ca41a576fd..9776d4ce88 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -20,6 +20,7 @@ /// Process lifecycle and abstracction for the IPC protocol. mod process; +pub use process::Kind as ProcessKind; /// Code to apply [`NeonWalRecord`]s. pub(crate) mod apply_neon; @@ -34,7 +35,7 @@ use crate::walrecord::NeonWalRecord; use anyhow::Context; use bytes::{Bytes, BytesMut}; use pageserver_api::key::key_to_rel_block; -use pageserver_api::models::WalRedoManagerStatus; +use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus}; use pageserver_api::shard::TenantShardId; use std::sync::Arc; use std::time::Duration; @@ -54,7 +55,7 @@ pub struct PostgresRedoManager { tenant_shard_id: TenantShardId, conf: &'static PageServerConf, last_redo_at: std::sync::Mutex>, - /// The current [`process::WalRedoProcess`] that is used by new redo requests. + /// The current [`process::Process`] that is used by new redo requests. /// We use [`heavier_once_cell`] for coalescing the spawning, but the redo /// requests don't use the [`heavier_once_cell::Guard`] to keep ahold of the /// their process object; we use [`Arc::clone`] for that. @@ -66,7 +67,7 @@ pub struct PostgresRedoManager { /// still be using the old redo process. But, those other tasks will most likely /// encounter an error as well, and errors are an unexpected condition anyway. /// So, probably we could get rid of the `Arc` in the future. - redo_process: heavier_once_cell::OnceCell>, + redo_process: heavier_once_cell::OnceCell>, } /// @@ -139,8 +140,8 @@ impl PostgresRedoManager { } } - pub(crate) fn status(&self) -> Option { - Some(WalRedoManagerStatus { + pub fn status(&self) -> WalRedoManagerStatus { + WalRedoManagerStatus { last_redo_at: { let at = *self.last_redo_at.lock().unwrap(); at.and_then(|at| { @@ -149,8 +150,14 @@ impl PostgresRedoManager { chrono::Utc::now().checked_sub_signed(chrono::Duration::from_std(age).ok()?) 
}) }, - pid: self.redo_process.get().map(|p| p.id()), - }) + process: self + .redo_process + .get() + .map(|p| WalRedoManagerProcessStatus { + pid: p.id(), + kind: std::borrow::Cow::Borrowed(p.kind().into()), + }), + } } } @@ -208,37 +215,33 @@ impl PostgresRedoManager { const MAX_RETRY_ATTEMPTS: u32 = 1; let mut n_attempts = 0u32; loop { - let proc: Arc = - match self.redo_process.get_or_init_detached().await { - Ok(guard) => Arc::clone(&guard), - Err(permit) => { - // don't hold poison_guard, the launch code can bail - let start = Instant::now(); - let proc = Arc::new( - process::WalRedoProcess::launch( - self.conf, - self.tenant_shard_id, - pg_version, - ) + let proc: Arc = match self.redo_process.get_or_init_detached().await { + Ok(guard) => Arc::clone(&guard), + Err(permit) => { + // don't hold poison_guard, the launch code can bail + let start = Instant::now(); + let proc = Arc::new( + process::Process::launch(self.conf, self.tenant_shard_id, pg_version) .context("launch walredo process")?, - ); - let duration = start.elapsed(); - WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64()); - info!( - duration_ms = duration.as_millis(), - pid = proc.id(), - "launched walredo process" - ); - self.redo_process.set(Arc::clone(&proc), permit); - proc - } - }; + ); + let duration = start.elapsed(); + WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64()); + info!( + duration_ms = duration.as_millis(), + pid = proc.id(), + "launched walredo process" + ); + self.redo_process.set(Arc::clone(&proc), permit); + proc + } + }; let started_at = std::time::Instant::now(); // Relational WAL records are applied using wal-redo-postgres let result = proc .apply_wal_records(rel, blknum, &base_img, records, wal_redo_timeout) + .await .context("apply_wal_records"); let duration = started_at.elapsed(); diff --git a/pageserver/src/walredo/process.rs b/pageserver/src/walredo/process.rs index bcbb263663..ad6b4e5fe9 100644 --- a/pageserver/src/walredo/process.rs +++ b/pageserver/src/walredo/process.rs @@ -1,186 +1,67 @@ -use self::no_leak_child::NoLeakChild; -use crate::{ - config::PageServerConf, - metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER}, - walrecord::NeonWalRecord, -}; -use anyhow::Context; +use std::time::Duration; + use bytes::Bytes; -use nix::poll::{PollFd, PollFlags}; use pageserver_api::{reltag::RelTag, shard::TenantShardId}; -use postgres_ffi::BLCKSZ; -use std::os::fd::AsRawFd; -#[cfg(feature = "testing")] -use std::sync::atomic::AtomicUsize; -use std::{ - collections::VecDeque, - io::{Read, Write}, - process::{ChildStdin, ChildStdout, Command, Stdio}, - sync::{Mutex, MutexGuard}, - time::Duration, -}; -use tracing::{debug, error, instrument, Instrument}; -use utils::{lsn::Lsn, nonblock::set_nonblock}; +use utils::lsn::Lsn; + +use crate::{config::PageServerConf, walrecord::NeonWalRecord}; mod no_leak_child; /// The IPC protocol that pageserver and walredo process speak over their shared pipe. mod protocol; -pub struct WalRedoProcess { - #[allow(dead_code)] - conf: &'static PageServerConf, - tenant_shard_id: TenantShardId, - // Some() on construction, only becomes None on Drop. - child: Option, - stdout: Mutex, - stdin: Mutex, - /// Counter to separate same sized walredo inputs failing at the same millisecond. 
- #[cfg(feature = "testing")] - dump_sequence: AtomicUsize, +mod process_impl { + pub(super) mod process_async; + pub(super) mod process_std; } -struct ProcessInput { - stdin: ChildStdin, - n_requests: usize, +#[derive( + Clone, + Copy, + Debug, + PartialEq, + Eq, + strum_macros::EnumString, + strum_macros::Display, + strum_macros::IntoStaticStr, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, +)] +#[strum(serialize_all = "kebab-case")] +#[repr(u8)] +pub enum Kind { + Sync, + Async, } -struct ProcessOutput { - stdout: ChildStdout, - pending_responses: VecDeque>, - n_processed_responses: usize, +pub(crate) enum Process { + Sync(process_impl::process_std::WalRedoProcess), + Async(process_impl::process_async::WalRedoProcess), } -impl WalRedoProcess { - // - // Start postgres binary in special WAL redo mode. - // - #[instrument(skip_all,fields(pg_version=pg_version))] - pub(crate) fn launch( +impl Process { + #[inline(always)] + pub fn launch( conf: &'static PageServerConf, tenant_shard_id: TenantShardId, pg_version: u32, ) -> anyhow::Result { - crate::span::debug_assert_current_span_has_tenant_id(); - - let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible. - let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?; - - use no_leak_child::NoLeakChildCommandExt; - // Start postgres itself - let child = Command::new(pg_bin_dir_path.join("postgres")) - // the first arg must be --wal-redo so the child process enters into walredo mode - .arg("--wal-redo") - // the child doesn't process this arg, but, having it in the argv helps indentify the - // walredo process for a particular tenant when debugging a pagserver - .args(["--tenant-shard-id", &format!("{tenant_shard_id}")]) - .stdin(Stdio::piped()) - .stderr(Stdio::piped()) - .stdout(Stdio::piped()) - .env_clear() - .env("LD_LIBRARY_PATH", &pg_lib_dir_path) - .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) - // NB: The redo process is not trusted after we sent it the first - // walredo work. Before that, it is trusted. Specifically, we trust - // it to - // 1. close all file descriptors except stdin, stdout, stderr because - // pageserver might not be 100% diligent in setting FD_CLOEXEC on all - // the files it opens, and - // 2. to use seccomp to sandbox itself before processing the first - // walredo request. - .spawn_no_leak_child(tenant_shard_id) - .context("spawn process")?; - WAL_REDO_PROCESS_COUNTERS.started.inc(); - let mut child = scopeguard::guard(child, |child| { - error!("killing wal-redo-postgres process due to a problem during launch"); - child.kill_and_wait(WalRedoKillCause::Startup); - }); - - let stdin = child.stdin.take().unwrap(); - let stdout = child.stdout.take().unwrap(); - let stderr = child.stderr.take().unwrap(); - let stderr = tokio::process::ChildStderr::from_std(stderr) - .context("convert to tokio::ChildStderr")?; - macro_rules! set_nonblock_or_log_err { - ($file:ident) => {{ - let res = set_nonblock($file.as_raw_fd()); - if let Err(e) = &res { - error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed"); - } - res - }}; - } - set_nonblock_or_log_err!(stdin)?; - set_nonblock_or_log_err!(stdout)?; - - // all fallible operations post-spawn are complete, so get rid of the guard - let child = scopeguard::ScopeGuard::into_inner(child); - - tokio::spawn( - async move { - scopeguard::defer! 
{ - debug!("wal-redo-postgres stderr_logger_task finished"); - crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc(); - } - debug!("wal-redo-postgres stderr_logger_task started"); - crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc(); - - use tokio::io::AsyncBufReadExt; - let mut stderr_lines = tokio::io::BufReader::new(stderr); - let mut buf = Vec::new(); - let res = loop { - buf.clear(); - // TODO we don't trust the process to cap its stderr length. - // Currently it can do unbounded Vec allocation. - match stderr_lines.read_until(b'\n', &mut buf).await { - Ok(0) => break Ok(()), // eof - Ok(num_bytes) => { - let output = String::from_utf8_lossy(&buf[..num_bytes]); - error!(%output, "received output"); - } - Err(e) => { - break Err(e); - } - } - }; - match res { - Ok(()) => (), - Err(e) => { - error!(error=?e, "failed to read from walredo stderr"); - } - } - }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version)) - ); - - Ok(Self { - conf, - tenant_shard_id, - child: Some(child), - stdin: Mutex::new(ProcessInput { - stdin, - n_requests: 0, - }), - stdout: Mutex::new(ProcessOutput { - stdout, - pending_responses: VecDeque::new(), - n_processed_responses: 0, - }), - #[cfg(feature = "testing")] - dump_sequence: AtomicUsize::default(), + Ok(match conf.walredo_process_kind { + Kind::Sync => Self::Sync(process_impl::process_std::WalRedoProcess::launch( + conf, + tenant_shard_id, + pg_version, + )?), + Kind::Async => Self::Async(process_impl::process_async::WalRedoProcess::launch( + conf, + tenant_shard_id, + pg_version, + )?), }) } - pub(crate) fn id(&self) -> u32 { - self.child - .as_ref() - .expect("must not call this during Drop") - .id() - } - - // Apply given WAL records ('records') over an old page image. Returns - // new page image. - // - #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))] - pub(crate) fn apply_wal_records( + #[inline(always)] + pub(crate) async fn apply_wal_records( &self, rel: RelTag, blknum: u32, @@ -188,221 +69,29 @@ impl WalRedoProcess { records: &[(Lsn, NeonWalRecord)], wal_redo_timeout: Duration, ) -> anyhow::Result { - let tag = protocol::BufferTag { rel, blknum }; - let input = self.stdin.lock().unwrap(); - - // Serialize all the messages to send the WAL redo process first. - // - // This could be problematic if there are millions of records to replay, - // but in practice the number of records is usually so small that it doesn't - // matter, and it's better to keep this code simple. - // - // Most requests start with a before-image with BLCKSZ bytes, followed by - // by some other WAL records. Start with a buffer that can hold that - // comfortably. 
- let mut writebuf: Vec = Vec::with_capacity((BLCKSZ as usize) * 3); - protocol::build_begin_redo_for_block_msg(tag, &mut writebuf); - if let Some(img) = base_img { - protocol::build_push_page_msg(tag, img, &mut writebuf); - } - for (lsn, rec) in records.iter() { - if let NeonWalRecord::Postgres { - will_init: _, - rec: postgres_rec, - } = rec - { - protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf); - } else { - anyhow::bail!("tried to pass neon wal record to postgres WAL redo"); + match self { + Process::Sync(p) => { + p.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout) + .await } - } - protocol::build_get_page_msg(tag, &mut writebuf); - WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64); - - let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout); - - if res.is_err() { - // not all of these can be caused by this particular input, however these are so rare - // in tests so capture all. - self.record_and_log(&writebuf); - } - - res - } - - fn apply_wal_records0( - &self, - writebuf: &[u8], - input: MutexGuard, - wal_redo_timeout: Duration, - ) -> anyhow::Result { - let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small. - let mut nwrite = 0usize; - - while nwrite < writebuf.len() { - let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)]; - let n = loop { - match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) { - Err(nix::errno::Errno::EINTR) => continue, - res => break res, - } - }?; - - if n == 0 { - anyhow::bail!("WAL redo timed out"); + Process::Async(p) => { + p.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout) + .await } - - // If 'stdin' is writeable, do write. - let in_revents = stdin_pollfds[0].revents().unwrap(); - if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() { - nwrite += proc.stdin.write(&writebuf[nwrite..])?; - } - if in_revents.contains(PollFlags::POLLHUP) { - // We still have more data to write, but the process closed the pipe. - anyhow::bail!("WAL redo process closed its stdin unexpectedly"); - } - } - let request_no = proc.n_requests; - proc.n_requests += 1; - drop(proc); - - // To improve walredo performance we separate sending requests and receiving - // responses. Them are protected by different mutexes (output and input). - // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process - // then there is not warranty that T1 will first granted output mutex lock. - // To address this issue we maintain number of sent requests, number of processed - // responses and ring buffer with pending responses. After sending response - // (under input mutex), threads remembers request number. Then it releases - // input mutex, locks output mutex and fetch in ring buffer all responses until - // its stored request number. The it takes correspondent element from - // pending responses ring buffer and truncate all empty elements from the front, - // advancing processed responses number. - - let mut output = self.stdout.lock().unwrap(); - let n_processed_responses = output.n_processed_responses; - while n_processed_responses + output.pending_responses.len() <= request_no { - // We expect the WAL redo process to respond with an 8k page image. We read it - // into this buffer. 
- let mut resultbuf = vec![0; BLCKSZ.into()]; - let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far - while nresult < BLCKSZ.into() { - let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)]; - // We do two things simultaneously: reading response from stdout - // and forward any logging information that the child writes to its stderr to the page server's log. - let n = loop { - match nix::poll::poll( - &mut stdout_pollfds[..], - wal_redo_timeout.as_millis() as i32, - ) { - Err(nix::errno::Errno::EINTR) => continue, - res => break res, - } - }?; - - if n == 0 { - anyhow::bail!("WAL redo timed out"); - } - - // If we have some data in stdout, read it to the result buffer. - let out_revents = stdout_pollfds[0].revents().unwrap(); - if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() { - nresult += output.stdout.read(&mut resultbuf[nresult..])?; - } - if out_revents.contains(PollFlags::POLLHUP) { - anyhow::bail!("WAL redo process closed its stdout unexpectedly"); - } - } - output - .pending_responses - .push_back(Some(Bytes::from(resultbuf))); - } - // Replace our request's response with None in `pending_responses`. - // Then make space in the ring buffer by clearing out any seqence of contiguous - // `None`'s from the front of `pending_responses`. - // NB: We can't pop_front() because other requests' responses because another - // requester might have grabbed the output mutex before us: - // T1: grab input mutex - // T1: send request_no 23 - // T1: release input mutex - // T2: grab input mutex - // T2: send request_no 24 - // T2: release input mutex - // T2: grab output mutex - // T2: n_processed_responses + output.pending_responses.len() <= request_no - // 23 0 24 - // T2: enters poll loop that reads stdout - // T2: put response for 23 into pending_responses - // T2: put response for 24 into pending_resposnes - // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back - // T2: takes its response_24 - // pending_responses now looks like this: Front Some(response_23) None Back - // T2: does the while loop below - // pending_responses now looks like this: Front Some(response_23) None Back - // T2: releases output mutex - // T1: grabs output mutex - // T1: n_processed_responses + output.pending_responses.len() > request_no - // 23 2 23 - // T1: skips poll loop that reads stdout - // T1: takes its response_23 - // pending_responses now looks like this: Front None None Back - // T2: does the while loop below - // pending_responses now looks like this: Front Back - // n_processed_responses now has value 25 - let res = output.pending_responses[request_no - n_processed_responses] - .take() - .expect("we own this request_no, nobody else is supposed to take it"); - while let Some(front) = output.pending_responses.front() { - if front.is_none() { - output.pending_responses.pop_front(); - output.n_processed_responses += 1; - } else { - break; - } - } - Ok(res) - } - - #[cfg(feature = "testing")] - fn record_and_log(&self, writebuf: &[u8]) { - use std::sync::atomic::Ordering; - - let millis = std::time::SystemTime::now() - .duration_since(std::time::SystemTime::UNIX_EPOCH) - .unwrap() - .as_millis(); - - let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed); - - // these files will be collected to an allure report - let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len()); - - let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename); - - let res = 
std::fs::OpenOptions::new() - .write(true) - .create_new(true) - .read(true) - .open(path) - .and_then(|mut f| f.write_all(writebuf)); - - // trip up allowed_errors - if let Err(e) = res { - tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}"); - } else { - tracing::error!(filename, "erroring walredo input saved"); } } - #[cfg(not(feature = "testing"))] - fn record_and_log(&self, _: &[u8]) {} -} + pub(crate) fn id(&self) -> u32 { + match self { + Process::Sync(p) => p.id(), + Process::Async(p) => p.id(), + } + } -impl Drop for WalRedoProcess { - fn drop(&mut self) { - self.child - .take() - .expect("we only do this once") - .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop); - // no way to wait for stderr_logger_task from Drop because that is async only + pub(crate) fn kind(&self) -> Kind { + match self { + Process::Sync(_) => Kind::Sync, + Process::Async(_) => Kind::Async, + } } } diff --git a/pageserver/src/walredo/process/process_impl/process_async.rs b/pageserver/src/walredo/process/process_impl/process_async.rs new file mode 100644 index 0000000000..262858b033 --- /dev/null +++ b/pageserver/src/walredo/process/process_impl/process_async.rs @@ -0,0 +1,374 @@ +use self::no_leak_child::NoLeakChild; +use crate::{ + config::PageServerConf, + metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER}, + walrecord::NeonWalRecord, + walredo::process::{no_leak_child, protocol}, +}; +use anyhow::Context; +use bytes::Bytes; +use pageserver_api::{reltag::RelTag, shard::TenantShardId}; +use postgres_ffi::BLCKSZ; +#[cfg(feature = "testing")] +use std::sync::atomic::AtomicUsize; +use std::{ + collections::VecDeque, + process::{Command, Stdio}, + time::Duration, +}; +use tokio::io::{AsyncReadExt, AsyncWriteExt}; +use tracing::{debug, error, instrument, Instrument}; +use utils::{lsn::Lsn, poison::Poison}; + +pub struct WalRedoProcess { + #[allow(dead_code)] + conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, + // Some() on construction, only becomes None on Drop. + child: Option, + stdout: tokio::sync::Mutex>, + stdin: tokio::sync::Mutex>, + /// Counter to separate same sized walredo inputs failing at the same millisecond. + #[cfg(feature = "testing")] + dump_sequence: AtomicUsize, +} + +struct ProcessInput { + stdin: tokio::process::ChildStdin, + n_requests: usize, +} + +struct ProcessOutput { + stdout: tokio::process::ChildStdout, + pending_responses: VecDeque>, + n_processed_responses: usize, +} + +impl WalRedoProcess { + // + // Start postgres binary in special WAL redo mode. + // + #[instrument(skip_all,fields(pg_version=pg_version))] + pub(crate) fn launch( + conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, + pg_version: u32, + ) -> anyhow::Result { + crate::span::debug_assert_current_span_has_tenant_id(); + + let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible. 
+ let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?; + + use no_leak_child::NoLeakChildCommandExt; + // Start postgres itself + let child = Command::new(pg_bin_dir_path.join("postgres")) + // the first arg must be --wal-redo so the child process enters into walredo mode + .arg("--wal-redo") + // the child doesn't process this arg, but, having it in the argv helps indentify the + // walredo process for a particular tenant when debugging a pagserver + .args(["--tenant-shard-id", &format!("{tenant_shard_id}")]) + .stdin(Stdio::piped()) + .stderr(Stdio::piped()) + .stdout(Stdio::piped()) + .env_clear() + .env("LD_LIBRARY_PATH", &pg_lib_dir_path) + .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) + // NB: The redo process is not trusted after we sent it the first + // walredo work. Before that, it is trusted. Specifically, we trust + // it to + // 1. close all file descriptors except stdin, stdout, stderr because + // pageserver might not be 100% diligent in setting FD_CLOEXEC on all + // the files it opens, and + // 2. to use seccomp to sandbox itself before processing the first + // walredo request. + .spawn_no_leak_child(tenant_shard_id) + .context("spawn process")?; + WAL_REDO_PROCESS_COUNTERS.started.inc(); + let mut child = scopeguard::guard(child, |child| { + error!("killing wal-redo-postgres process due to a problem during launch"); + child.kill_and_wait(WalRedoKillCause::Startup); + }); + + let stdin = child.stdin.take().unwrap(); + let stdout = child.stdout.take().unwrap(); + let stderr = child.stderr.take().unwrap(); + let stderr = tokio::process::ChildStderr::from_std(stderr) + .context("convert to tokio::ChildStderr")?; + let stdin = + tokio::process::ChildStdin::from_std(stdin).context("convert to tokio::ChildStdin")?; + let stdout = tokio::process::ChildStdout::from_std(stdout) + .context("convert to tokio::ChildStdout")?; + + // all fallible operations post-spawn are complete, so get rid of the guard + let child = scopeguard::ScopeGuard::into_inner(child); + + tokio::spawn( + async move { + scopeguard::defer! { + debug!("wal-redo-postgres stderr_logger_task finished"); + crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc(); + } + debug!("wal-redo-postgres stderr_logger_task started"); + crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc(); + + use tokio::io::AsyncBufReadExt; + let mut stderr_lines = tokio::io::BufReader::new(stderr); + let mut buf = Vec::new(); + let res = loop { + buf.clear(); + // TODO we don't trust the process to cap its stderr length. + // Currently it can do unbounded Vec allocation. 
+ match stderr_lines.read_until(b'\n', &mut buf).await { + Ok(0) => break Ok(()), // eof + Ok(num_bytes) => { + let output = String::from_utf8_lossy(&buf[..num_bytes]); + error!(%output, "received output"); + } + Err(e) => { + break Err(e); + } + } + }; + match res { + Ok(()) => (), + Err(e) => { + error!(error=?e, "failed to read from walredo stderr"); + } + } + }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version)) + ); + + Ok(Self { + conf, + tenant_shard_id, + child: Some(child), + stdin: tokio::sync::Mutex::new(Poison::new( + "stdin", + ProcessInput { + stdin, + n_requests: 0, + }, + )), + stdout: tokio::sync::Mutex::new(Poison::new( + "stdout", + ProcessOutput { + stdout, + pending_responses: VecDeque::new(), + n_processed_responses: 0, + }, + )), + #[cfg(feature = "testing")] + dump_sequence: AtomicUsize::default(), + }) + } + + pub(crate) fn id(&self) -> u32 { + self.child + .as_ref() + .expect("must not call this during Drop") + .id() + } + + /// Apply given WAL records ('records') over an old page image. Returns + /// new page image. + /// + /// # Cancel-Safety + /// + /// Cancellation safe. + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))] + pub(crate) async fn apply_wal_records( + &self, + rel: RelTag, + blknum: u32, + base_img: &Option, + records: &[(Lsn, NeonWalRecord)], + wal_redo_timeout: Duration, + ) -> anyhow::Result { + let tag = protocol::BufferTag { rel, blknum }; + + // Serialize all the messages to send the WAL redo process first. + // + // This could be problematic if there are millions of records to replay, + // but in practice the number of records is usually so small that it doesn't + // matter, and it's better to keep this code simple. + // + // Most requests start with a before-image with BLCKSZ bytes, followed by + // by some other WAL records. Start with a buffer that can hold that + // comfortably. + let mut writebuf: Vec = Vec::with_capacity((BLCKSZ as usize) * 3); + protocol::build_begin_redo_for_block_msg(tag, &mut writebuf); + if let Some(img) = base_img { + protocol::build_push_page_msg(tag, img, &mut writebuf); + } + for (lsn, rec) in records.iter() { + if let NeonWalRecord::Postgres { + will_init: _, + rec: postgres_rec, + } = rec + { + protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf); + } else { + anyhow::bail!("tried to pass neon wal record to postgres WAL redo"); + } + } + protocol::build_get_page_msg(tag, &mut writebuf); + WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64); + + let Ok(res) = + tokio::time::timeout(wal_redo_timeout, self.apply_wal_records0(&writebuf)).await + else { + anyhow::bail!("WAL redo timed out"); + }; + + if res.is_err() { + // not all of these can be caused by this particular input, however these are so rare + // in tests so capture all. + self.record_and_log(&writebuf); + } + + res + } + + /// # Cancel-Safety + /// + /// When not polled to completion (e.g. because in `tokio::select!` another + /// branch becomes ready before this future), concurrent and subsequent + /// calls may fail due to [`utils::poison::Poison::check_and_arm`] calls. + /// Dispose of this process instance and create a new one. 
+ async fn apply_wal_records0(&self, writebuf: &[u8]) -> anyhow::Result { + let request_no = { + let mut lock_guard = self.stdin.lock().await; + let mut poison_guard = lock_guard.check_and_arm()?; + let input = poison_guard.data_mut(); + input + .stdin + .write_all(writebuf) + .await + .context("write to walredo stdin")?; + let request_no = input.n_requests; + input.n_requests += 1; + poison_guard.disarm(); + request_no + }; + + // To improve walredo performance we separate sending requests and receiving + // responses. Them are protected by different mutexes (output and input). + // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process + // then there is not warranty that T1 will first granted output mutex lock. + // To address this issue we maintain number of sent requests, number of processed + // responses and ring buffer with pending responses. After sending response + // (under input mutex), threads remembers request number. Then it releases + // input mutex, locks output mutex and fetch in ring buffer all responses until + // its stored request number. The it takes correspondent element from + // pending responses ring buffer and truncate all empty elements from the front, + // advancing processed responses number. + + let mut lock_guard = self.stdout.lock().await; + let mut poison_guard = lock_guard.check_and_arm()?; + let output = poison_guard.data_mut(); + let n_processed_responses = output.n_processed_responses; + while n_processed_responses + output.pending_responses.len() <= request_no { + // We expect the WAL redo process to respond with an 8k page image. We read it + // into this buffer. + let mut resultbuf = vec![0; BLCKSZ.into()]; + output + .stdout + .read_exact(&mut resultbuf) + .await + .context("read walredo stdout")?; + output + .pending_responses + .push_back(Some(Bytes::from(resultbuf))); + } + // Replace our request's response with None in `pending_responses`. + // Then make space in the ring buffer by clearing out any seqence of contiguous + // `None`'s from the front of `pending_responses`. 
+ // NB: We can't pop_front() because other requests' responses because another + // requester might have grabbed the output mutex before us: + // T1: grab input mutex + // T1: send request_no 23 + // T1: release input mutex + // T2: grab input mutex + // T2: send request_no 24 + // T2: release input mutex + // T2: grab output mutex + // T2: n_processed_responses + output.pending_responses.len() <= request_no + // 23 0 24 + // T2: enters poll loop that reads stdout + // T2: put response for 23 into pending_responses + // T2: put response for 24 into pending_resposnes + // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back + // T2: takes its response_24 + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: does the while loop below + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: releases output mutex + // T1: grabs output mutex + // T1: n_processed_responses + output.pending_responses.len() > request_no + // 23 2 23 + // T1: skips poll loop that reads stdout + // T1: takes its response_23 + // pending_responses now looks like this: Front None None Back + // T2: does the while loop below + // pending_responses now looks like this: Front Back + // n_processed_responses now has value 25 + let res = output.pending_responses[request_no - n_processed_responses] + .take() + .expect("we own this request_no, nobody else is supposed to take it"); + while let Some(front) = output.pending_responses.front() { + if front.is_none() { + output.pending_responses.pop_front(); + output.n_processed_responses += 1; + } else { + break; + } + } + poison_guard.disarm(); + Ok(res) + } + + #[cfg(feature = "testing")] + fn record_and_log(&self, writebuf: &[u8]) { + use std::sync::atomic::Ordering; + + let millis = std::time::SystemTime::now() + .duration_since(std::time::SystemTime::UNIX_EPOCH) + .unwrap() + .as_millis(); + + let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed); + + // these files will be collected to an allure report + let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len()); + + let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename); + + use std::io::Write; + let res = std::fs::OpenOptions::new() + .write(true) + .create_new(true) + .read(true) + .open(path) + .and_then(|mut f| f.write_all(writebuf)); + + // trip up allowed_errors + if let Err(e) = res { + tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}"); + } else { + tracing::error!(filename, "erroring walredo input saved"); + } + } + + #[cfg(not(feature = "testing"))] + fn record_and_log(&self, _: &[u8]) {} +} + +impl Drop for WalRedoProcess { + fn drop(&mut self) { + self.child + .take() + .expect("we only do this once") + .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop); + // no way to wait for stderr_logger_task from Drop because that is async only + } +} diff --git a/pageserver/src/walredo/process/process_impl/process_std.rs b/pageserver/src/walredo/process/process_impl/process_std.rs new file mode 100644 index 0000000000..e7a6c263c9 --- /dev/null +++ b/pageserver/src/walredo/process/process_impl/process_std.rs @@ -0,0 +1,405 @@ +use self::no_leak_child::NoLeakChild; +use crate::{ + config::PageServerConf, + metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER}, + walrecord::NeonWalRecord, + walredo::process::{no_leak_child, protocol}, +}; +use anyhow::Context; +use bytes::Bytes; +use nix::poll::{PollFd, 
PollFlags}; +use pageserver_api::{reltag::RelTag, shard::TenantShardId}; +use postgres_ffi::BLCKSZ; +use std::os::fd::AsRawFd; +#[cfg(feature = "testing")] +use std::sync::atomic::AtomicUsize; +use std::{ + collections::VecDeque, + io::{Read, Write}, + process::{ChildStdin, ChildStdout, Command, Stdio}, + sync::{Mutex, MutexGuard}, + time::Duration, +}; +use tracing::{debug, error, instrument, Instrument}; +use utils::{lsn::Lsn, nonblock::set_nonblock}; + +pub struct WalRedoProcess { + #[allow(dead_code)] + conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, + // Some() on construction, only becomes None on Drop. + child: Option, + stdout: Mutex, + stdin: Mutex, + /// Counter to separate same sized walredo inputs failing at the same millisecond. + #[cfg(feature = "testing")] + dump_sequence: AtomicUsize, +} + +struct ProcessInput { + stdin: ChildStdin, + n_requests: usize, +} + +struct ProcessOutput { + stdout: ChildStdout, + pending_responses: VecDeque>, + n_processed_responses: usize, +} + +impl WalRedoProcess { + // + // Start postgres binary in special WAL redo mode. + // + #[instrument(skip_all,fields(pg_version=pg_version))] + pub(crate) fn launch( + conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, + pg_version: u32, + ) -> anyhow::Result { + crate::span::debug_assert_current_span_has_tenant_id(); + + let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible. + let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?; + + use no_leak_child::NoLeakChildCommandExt; + // Start postgres itself + let child = Command::new(pg_bin_dir_path.join("postgres")) + // the first arg must be --wal-redo so the child process enters into walredo mode + .arg("--wal-redo") + // the child doesn't process this arg, but, having it in the argv helps indentify the + // walredo process for a particular tenant when debugging a pagserver + .args(["--tenant-shard-id", &format!("{tenant_shard_id}")]) + .stdin(Stdio::piped()) + .stderr(Stdio::piped()) + .stdout(Stdio::piped()) + .env_clear() + .env("LD_LIBRARY_PATH", &pg_lib_dir_path) + .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) + // NB: The redo process is not trusted after we sent it the first + // walredo work. Before that, it is trusted. Specifically, we trust + // it to + // 1. close all file descriptors except stdin, stdout, stderr because + // pageserver might not be 100% diligent in setting FD_CLOEXEC on all + // the files it opens, and + // 2. to use seccomp to sandbox itself before processing the first + // walredo request. + .spawn_no_leak_child(tenant_shard_id) + .context("spawn process")?; + WAL_REDO_PROCESS_COUNTERS.started.inc(); + let mut child = scopeguard::guard(child, |child| { + error!("killing wal-redo-postgres process due to a problem during launch"); + child.kill_and_wait(WalRedoKillCause::Startup); + }); + + let stdin = child.stdin.take().unwrap(); + let stdout = child.stdout.take().unwrap(); + let stderr = child.stderr.take().unwrap(); + let stderr = tokio::process::ChildStderr::from_std(stderr) + .context("convert to tokio::ChildStderr")?; + macro_rules! 
set_nonblock_or_log_err { + ($file:ident) => {{ + let res = set_nonblock($file.as_raw_fd()); + if let Err(e) = &res { + error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed"); + } + res + }}; + } + set_nonblock_or_log_err!(stdin)?; + set_nonblock_or_log_err!(stdout)?; + + // all fallible operations post-spawn are complete, so get rid of the guard + let child = scopeguard::ScopeGuard::into_inner(child); + + tokio::spawn( + async move { + scopeguard::defer! { + debug!("wal-redo-postgres stderr_logger_task finished"); + crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc(); + } + debug!("wal-redo-postgres stderr_logger_task started"); + crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc(); + + use tokio::io::AsyncBufReadExt; + let mut stderr_lines = tokio::io::BufReader::new(stderr); + let mut buf = Vec::new(); + let res = loop { + buf.clear(); + // TODO we don't trust the process to cap its stderr length. + // Currently it can do unbounded Vec allocation. + match stderr_lines.read_until(b'\n', &mut buf).await { + Ok(0) => break Ok(()), // eof + Ok(num_bytes) => { + let output = String::from_utf8_lossy(&buf[..num_bytes]); + error!(%output, "received output"); + } + Err(e) => { + break Err(e); + } + } + }; + match res { + Ok(()) => (), + Err(e) => { + error!(error=?e, "failed to read from walredo stderr"); + } + } + }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version)) + ); + + Ok(Self { + conf, + tenant_shard_id, + child: Some(child), + stdin: Mutex::new(ProcessInput { + stdin, + n_requests: 0, + }), + stdout: Mutex::new(ProcessOutput { + stdout, + pending_responses: VecDeque::new(), + n_processed_responses: 0, + }), + #[cfg(feature = "testing")] + dump_sequence: AtomicUsize::default(), + }) + } + + pub(crate) fn id(&self) -> u32 { + self.child + .as_ref() + .expect("must not call this during Drop") + .id() + } + + // Apply given WAL records ('records') over an old page image. Returns + // new page image. + // + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))] + pub(crate) async fn apply_wal_records( + &self, + rel: RelTag, + blknum: u32, + base_img: &Option, + records: &[(Lsn, NeonWalRecord)], + wal_redo_timeout: Duration, + ) -> anyhow::Result { + let tag = protocol::BufferTag { rel, blknum }; + let input = self.stdin.lock().unwrap(); + + // Serialize all the messages to send the WAL redo process first. + // + // This could be problematic if there are millions of records to replay, + // but in practice the number of records is usually so small that it doesn't + // matter, and it's better to keep this code simple. + // + // Most requests start with a before-image with BLCKSZ bytes, followed by + // by some other WAL records. Start with a buffer that can hold that + // comfortably. 
+ let mut writebuf: Vec = Vec::with_capacity((BLCKSZ as usize) * 3); + protocol::build_begin_redo_for_block_msg(tag, &mut writebuf); + if let Some(img) = base_img { + protocol::build_push_page_msg(tag, img, &mut writebuf); + } + for (lsn, rec) in records.iter() { + if let NeonWalRecord::Postgres { + will_init: _, + rec: postgres_rec, + } = rec + { + protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf); + } else { + anyhow::bail!("tried to pass neon wal record to postgres WAL redo"); + } + } + protocol::build_get_page_msg(tag, &mut writebuf); + WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64); + + let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout); + + if res.is_err() { + // not all of these can be caused by this particular input, however these are so rare + // in tests so capture all. + self.record_and_log(&writebuf); + } + + res + } + + fn apply_wal_records0( + &self, + writebuf: &[u8], + input: MutexGuard, + wal_redo_timeout: Duration, + ) -> anyhow::Result { + let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small. + let mut nwrite = 0usize; + + while nwrite < writebuf.len() { + let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)]; + let n = loop { + match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) { + Err(nix::errno::Errno::EINTR) => continue, + res => break res, + } + }?; + + if n == 0 { + anyhow::bail!("WAL redo timed out"); + } + + // If 'stdin' is writeable, do write. + let in_revents = stdin_pollfds[0].revents().unwrap(); + if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() { + nwrite += proc.stdin.write(&writebuf[nwrite..])?; + } + if in_revents.contains(PollFlags::POLLHUP) { + // We still have more data to write, but the process closed the pipe. + anyhow::bail!("WAL redo process closed its stdin unexpectedly"); + } + } + let request_no = proc.n_requests; + proc.n_requests += 1; + drop(proc); + + // To improve walredo performance we separate sending requests and receiving + // responses. Them are protected by different mutexes (output and input). + // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process + // then there is not warranty that T1 will first granted output mutex lock. + // To address this issue we maintain number of sent requests, number of processed + // responses and ring buffer with pending responses. After sending response + // (under input mutex), threads remembers request number. Then it releases + // input mutex, locks output mutex and fetch in ring buffer all responses until + // its stored request number. The it takes correspondent element from + // pending responses ring buffer and truncate all empty elements from the front, + // advancing processed responses number. + + let mut output = self.stdout.lock().unwrap(); + let n_processed_responses = output.n_processed_responses; + while n_processed_responses + output.pending_responses.len() <= request_no { + // We expect the WAL redo process to respond with an 8k page image. We read it + // into this buffer. + let mut resultbuf = vec![0; BLCKSZ.into()]; + let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far + while nresult < BLCKSZ.into() { + let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)]; + // We do two things simultaneously: reading response from stdout + // and forward any logging information that the child writes to its stderr to the page server's log. 
+ let n = loop { + match nix::poll::poll( + &mut stdout_pollfds[..], + wal_redo_timeout.as_millis() as i32, + ) { + Err(nix::errno::Errno::EINTR) => continue, + res => break res, + } + }?; + + if n == 0 { + anyhow::bail!("WAL redo timed out"); + } + + // If we have some data in stdout, read it to the result buffer. + let out_revents = stdout_pollfds[0].revents().unwrap(); + if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() { + nresult += output.stdout.read(&mut resultbuf[nresult..])?; + } + if out_revents.contains(PollFlags::POLLHUP) { + anyhow::bail!("WAL redo process closed its stdout unexpectedly"); + } + } + output + .pending_responses + .push_back(Some(Bytes::from(resultbuf))); + } + // Replace our request's response with None in `pending_responses`. + // Then make space in the ring buffer by clearing out any seqence of contiguous + // `None`'s from the front of `pending_responses`. + // NB: We can't pop_front() because other requests' responses because another + // requester might have grabbed the output mutex before us: + // T1: grab input mutex + // T1: send request_no 23 + // T1: release input mutex + // T2: grab input mutex + // T2: send request_no 24 + // T2: release input mutex + // T2: grab output mutex + // T2: n_processed_responses + output.pending_responses.len() <= request_no + // 23 0 24 + // T2: enters poll loop that reads stdout + // T2: put response for 23 into pending_responses + // T2: put response for 24 into pending_resposnes + // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back + // T2: takes its response_24 + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: does the while loop below + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: releases output mutex + // T1: grabs output mutex + // T1: n_processed_responses + output.pending_responses.len() > request_no + // 23 2 23 + // T1: skips poll loop that reads stdout + // T1: takes its response_23 + // pending_responses now looks like this: Front None None Back + // T2: does the while loop below + // pending_responses now looks like this: Front Back + // n_processed_responses now has value 25 + let res = output.pending_responses[request_no - n_processed_responses] + .take() + .expect("we own this request_no, nobody else is supposed to take it"); + while let Some(front) = output.pending_responses.front() { + if front.is_none() { + output.pending_responses.pop_front(); + output.n_processed_responses += 1; + } else { + break; + } + } + Ok(res) + } + + #[cfg(feature = "testing")] + fn record_and_log(&self, writebuf: &[u8]) { + use std::sync::atomic::Ordering; + + let millis = std::time::SystemTime::now() + .duration_since(std::time::SystemTime::UNIX_EPOCH) + .unwrap() + .as_millis(); + + let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed); + + // these files will be collected to an allure report + let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len()); + + let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename); + + let res = std::fs::OpenOptions::new() + .write(true) + .create_new(true) + .read(true) + .open(path) + .and_then(|mut f| f.write_all(writebuf)); + + // trip up allowed_errors + if let Err(e) = res { + tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}"); + } else { + tracing::error!(filename, "erroring walredo input saved"); + } + } + + #[cfg(not(feature = "testing"))] + 
fn record_and_log(&self, _: &[u8]) {} +} + +impl Drop for WalRedoProcess { + fn drop(&mut self) { + self.child + .take() + .expect("we only do this once") + .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop); + // no way to wait for stderr_logger_task from Drop because that is async only + } +} diff --git a/test_runner/regress/test_pageserver_config.py b/test_runner/regress/test_pageserver_config.py new file mode 100644 index 0000000000..c04348b488 --- /dev/null +++ b/test_runner/regress/test_pageserver_config.py @@ -0,0 +1,35 @@ +import pytest +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + last_flush_lsn_upload, +) + + +@pytest.mark.parametrize("kind", ["sync", "async"]) +def test_walredo_process_kind_config(neon_env_builder: NeonEnvBuilder, kind: str): + neon_env_builder.pageserver_config_override = f"walredo_process_kind = '{kind}'" + # ensure it starts + env = neon_env_builder.init_start() + # ensure the metric is set + ps_http = env.pageserver.http_client() + metrics = ps_http.get_metrics() + samples = metrics.query_all("pageserver_wal_redo_process_kind") + assert [(s.labels, s.value) for s in samples] == [({"kind": kind}, 1)] + # ensure default tenant's config kind matches + # => write some data to force-spawn walredo + ep = env.endpoints.create_start("main") + with ep.connect() as conn: + with conn.cursor() as cur: + cur.execute("create table foo(bar text)") + cur.execute("insert into foo select from generate_series(1, 100)") + last_flush_lsn_upload(env, ep, env.initial_tenant, env.initial_timeline) + ep.stop() + ep.start() + with ep.connect() as conn: + with conn.cursor() as cur: + cur.execute("select count(*) from foo") + [(count,)] = cur.fetchall() + assert count == 100 + + status = ps_http.tenant_status(env.initial_tenant) + assert status["walredo"]["process"]["kind"] == kind From 3366cd34bacfbd2dab57378494eee0d3a21d3079 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 16 Apr 2024 11:39:18 +0300 Subject: [PATCH 0566/1571] pageserver: return ACCEPTED when deletion already in flight (#7384) ## Problem test_sharding_smoke recently got an added section that checks deletion of a sharded tenant. The storage controller does a retry loop for deletion, waiting for a 404 response. When deletion is a bit slow (debug builds), the retry of deletion was getting a 500 response -- this caused the test to become flaky (example failure: https://neon-github-public-dev.s3.amazonaws.com/reports/release-proxy/8659801445/index.html#testresult/b4cbf5b58190f60e/retries) There was a false comment in the code: ``` match tenant.current_state() { TenantState::Broken { .. } | TenantState::Stopping { .. } => { - // If a tenant is broken or stopping, DeleteTenantFlow can - // handle it: broken tenants proceed to delete, stopping tenants - // are checked for deletion already in progress. ``` If the tenant is stopping, DeleteTenantFlow does not in fact handle it, but returns a 500-yielding errror. ## Summary of changes Before calling into DeleteTenantFlow, if the tenant is in stopping|broken state then return 202 if a deletion is in progress. This makes the API friendlier for retries. The historic AlreadyInProgress (409) response still exists for if we enter DeleteTenantFlow and unexpectedly see the tenant stopping. That should go away when we implement #5080 . For the moment, callers that handle 409s should continue to do so. 
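
For illustration only, a minimal sketch (not code in this PR) of the caller-side retry loop these response semantics are meant to support; `delete_tenant` is an assumed closure that issues the DELETE request and returns the HTTP status code:

```rust
use std::time::Duration;

/// Hypothetical polling helper: keep issuing DELETE until the tenant is gone.
/// 202 means deletion was accepted and is still running in the background;
/// 404 means the tenant has been fully deleted.
async fn wait_for_tenant_deletion<F, Fut>(mut delete_tenant: F) -> anyhow::Result<()>
where
    F: FnMut() -> Fut,
    Fut: std::future::Future<Output = anyhow::Result<u16>>,
{
    loop {
        match delete_tenant().await? {
            404 => return Ok(()), // deletion finished
            202 => {}             // still in flight: fall through and retry
            status => anyhow::bail!("unexpected status {status} while deleting tenant"),
        }
        tokio::time::sleep(Duration::from_millis(500)).await;
    }
}
```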
--- pageserver/src/tenant/delete.rs | 5 +++++ pageserver/src/tenant/mgr.rs | 12 ++++++++--- test_runner/fixtures/neon_fixtures.py | 4 +++- test_runner/regress/test_tenant_delete.py | 26 ++++++++++++----------- 4 files changed, 31 insertions(+), 16 deletions(-) diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index d1881f3897..33d0f677e5 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -436,6 +436,11 @@ impl DeleteTenantFlow { .await } + /// Check whether background deletion of this tenant is currently in progress + pub(crate) fn is_in_progress(tenant: &Tenant) -> bool { + tenant.delete_progress.try_lock().is_err() + } + async fn prepare( tenant: &Arc, ) -> Result, DeleteTenantError> { diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index b1b46d487b..73967f2949 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1410,9 +1410,15 @@ impl TenantManager { match tenant.current_state() { TenantState::Broken { .. } | TenantState::Stopping { .. } => { - // If a tenant is broken or stopping, DeleteTenantFlow can - // handle it: broken tenants proceed to delete, stopping tenants - // are checked for deletion already in progress. + // If deletion is already in progress, return success (the semantics of this + // function are to rerturn success afterr deletion is spawned in background). + // Otherwise fall through and let [`DeleteTenantFlow`] handle this state. + if DeleteTenantFlow::is_in_progress(&tenant) { + // The `delete_progress` lock is held: deletion is already happening + // in the bacckground + slot_guard.revert(); + return Ok(()); + } } _ => { tenant diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 0e4a58c099..c2c661088b 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2449,10 +2449,12 @@ class NeonPageserver(PgProtocol): if cur_line_no < skip_until_line_no: cur_line_no += 1 continue - if contains_re.search(line): + elif contains_re.search(line): # found it! cur_line_no += 1 return (line, LogCursor(cur_line_no)) + else: + cur_line_no += 1 return None def tenant_attach( diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index a164c7f60a..c115c0375b 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -469,7 +469,8 @@ def test_tenant_delete_concurrent( ): """ Validate that concurrent delete requests to the same tenant behave correctly: - exactly one should succeed. + exactly one should execute: the rest should give 202 responses but not start + another deletion. This is a reproducer for https://github.com/neondatabase/neon/issues/5936 """ @@ -484,14 +485,10 @@ def test_tenant_delete_concurrent( run_pg_bench_small(pg_bin, endpoint.connstr()) last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) - CONFLICT_MESSAGE = "Precondition failed: Invalid state Stopping. 
Expected Active or Broken" - env.pageserver.allowed_errors.extend( [ # lucky race with stopping from flushing a layer we fail to schedule any uploads ".*layer flush task.+: could not flush frozen layer: update_metadata_file", - # Errors logged from our 4xx requests - f".*{CONFLICT_MESSAGE}.*", ] ) @@ -507,7 +504,7 @@ def test_tenant_delete_concurrent( return ps_http.tenant_delete(tenant_id) def hit_remove_failpoint(): - env.pageserver.assert_log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}") + return env.pageserver.assert_log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}")[1] def hit_run_failpoint(): env.pageserver.assert_log_contains(f"at failpoint {BEFORE_RUN_FAILPOINT}") @@ -518,11 +515,14 @@ def test_tenant_delete_concurrent( # Wait until the first request completes its work and is blocked on removing # the TenantSlot from tenant manager. - wait_until(100, 0.1, hit_remove_failpoint) + log_cursor = wait_until(100, 0.1, hit_remove_failpoint) + assert log_cursor is not None - # Start another request: this should fail when it sees a tenant in Stopping state - with pytest.raises(PageserverApiException, match=CONFLICT_MESSAGE): - ps_http.tenant_delete(tenant_id) + # Start another request: this should succeed without actually entering the deletion code + ps_http.tenant_delete(tenant_id) + assert not env.pageserver.log_contains( + f"at failpoint {BEFORE_RUN_FAILPOINT}", offset=log_cursor + ) # Start another background request, which will pause after acquiring a TenantSlotGuard # but before completing. @@ -539,8 +539,10 @@ def test_tenant_delete_concurrent( # Permit the duplicate background request to run to completion and fail. ps_http.configure_failpoints((BEFORE_RUN_FAILPOINT, "off")) - with pytest.raises(PageserverApiException, match=CONFLICT_MESSAGE): - background_4xx_req.result(timeout=10) + background_4xx_req.result(timeout=10) + assert not env.pageserver.log_contains( + f"at failpoint {BEFORE_RUN_FAILPOINT}", offset=log_cursor + ) # Physical deletion should have happened assert_prefix_empty( From 926662eb7ca12956d7210c97f28ba744b43aa30f Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 16 Apr 2024 13:41:48 +0100 Subject: [PATCH 0567/1571] storage_controller: suppress misleading log (#7395) ## Problem - https://github.com/neondatabase/neon/issues/7355 The optimize_secondary function calls schedule_shard to check for improvements, but if there are exactly the same number of nodes as there are replicas of the shard, it emits some scary looking logs about no nodes being elegible. Closes https://github.com/neondatabase/neon/issues/7355 ## Summary of changes - Add a mode to SchedulingContext that controls logging: this should be useful in future any time we add a log to the scheduling path, to avoid it becoming a source of spam when the scheduler is called during optimization. 
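
The diff below adds the mode to the scheduler itself; as a condensed sketch of the pattern (hypothetical, simplified types rather than the actual `ScheduleContext`), the gate looks roughly like this:

```rust
/// Simplified stand-in for the new scheduling mode: it only exists to decide
/// whether a noisy scheduling outcome is worth logging.
#[derive(Debug, Default)]
enum ScheduleMode {
    /// A real placement attempt: failures here are operationally interesting.
    #[default]
    Normal,
    /// A "would we place this better elsewhere?" probe made during optimization:
    /// finding no better candidate is expected, so stay quiet.
    Speculative,
}

fn report_no_candidates(mode: &ScheduleMode) {
    if !matches!(mode, ScheduleMode::Speculative) {
        tracing::info!("Scheduling failure: no eligible pageservers after applying constraints");
    }
}
```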
--- storage_controller/src/scheduler.rs | 43 ++++++++++++++++++++++------- storage_controller/src/service.rs | 3 +- 2 files changed, 35 insertions(+), 11 deletions(-) diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index 862ac0cbfe..3ff0d87988 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -84,6 +84,20 @@ impl std::ops::Add for AffinityScore { } } +/// Hint for whether this is a sincere attempt to schedule, or a speculative +/// check for where we _would_ schedule (done during optimization) +#[derive(Debug)] +pub(crate) enum ScheduleMode { + Normal, + Speculative, +} + +impl Default for ScheduleMode { + fn default() -> Self { + Self::Normal + } +} + // For carrying state between multiple calls to [`TenantShard::schedule`], e.g. when calling // it for many shards in the same tenant. #[derive(Debug, Default)] @@ -93,6 +107,8 @@ pub(crate) struct ScheduleContext { /// Specifically how many _attached_ locations are on each node pub(crate) attached_nodes: HashMap, + + pub(crate) mode: ScheduleMode, } impl ScheduleContext { @@ -329,27 +345,34 @@ impl Scheduler { scores.sort_by_key(|i| (i.1, i.2, i.0)); if scores.is_empty() { - // After applying constraints, no pageservers were left. We log some detail about - // the state of nodes to help understand why this happened. This is not logged as an error because - // it is legitimately possible for enough nodes to be Offline to prevent scheduling a shard. - tracing::info!("Scheduling failure, while excluding {hard_exclude:?}, node states:"); - for (node_id, node) in &self.nodes { + // After applying constraints, no pageservers were left. + if !matches!(context.mode, ScheduleMode::Speculative) { + // If this was not a speculative attempt, log details to understand why we couldn't + // schedule: this may help an engineer understand if some nodes are marked offline + // in a way that's preventing progress. tracing::info!( - "Node {node_id}: may_schedule={} shards={}", - node.may_schedule != MaySchedule::No, - node.shard_count + "Scheduling failure, while excluding {hard_exclude:?}, node states:" ); + for (node_id, node) in &self.nodes { + tracing::info!( + "Node {node_id}: may_schedule={} shards={}", + node.may_schedule != MaySchedule::No, + node.shard_count + ); + } } - return Err(ScheduleError::ImpossibleConstraint); } // Lowest score wins let node_id = scores.first().unwrap().0; - tracing::info!( + + if !matches!(context.mode, ScheduleMode::Speculative) { + tracing::info!( "scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?})", scores.iter().map(|i| i.0 .0).collect::>() ); + } // Note that we do not update shard count here to reflect the scheduling: that // is IntentState's job when the scheduled location is used. 
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 4ee189dac9..0565f8e7b4 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -11,7 +11,7 @@ use crate::{ id_lock_map::IdLockMap, persistence::{AbortShardSplitStatus, TenantFilter}, reconciler::ReconcileError, - scheduler::ScheduleContext, + scheduler::{ScheduleContext, ScheduleMode}, }; use anyhow::Context; use control_plane::storage_controller::{ @@ -4137,6 +4137,7 @@ impl Service { if tenant_shard_id.is_shard_zero() { // Reset accumulators on the first shard in a tenant schedule_context = ScheduleContext::default(); + schedule_context.mode = ScheduleMode::Speculative; tenant_shards.clear(); } From e5c50bb12b8013fd671052084b02626e02081c27 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 16 Apr 2024 15:16:34 +0100 Subject: [PATCH 0568/1571] proxy: rate limit authentication by masked IPv6. (#7316) ## Problem Many users have access to ipv6 subnets (eg a /64). That gives them 2^64 addresses to play with ## Summary of changes Truncate the address to /64 to reduce the attack surface. Todo: ~~Will NAT64 be an issue here? AFAIU they put the IPv4 address at the end of the IPv6 address. By truncating we will lose all that detail.~~ It's the same problem as a host sharing IPv6 addresses between clients. I don't think it's up to us to solve. If a customer is getting DDoSed, then they likely need to arrange a dedicated IP with us. --- proxy/src/auth/backend.rs | 112 +++++++++++++++++++++++--- proxy/src/bin/proxy.rs | 6 +- proxy/src/config.rs | 5 +- proxy/src/rate_limiter.rs | 2 +- proxy/src/rate_limiter/limiter.rs | 51 +----------- proxy/src/serverless/backend.rs | 4 +- proxy/src/serverless/sql_over_http.rs | 4 +- 7 files changed, 118 insertions(+), 66 deletions(-) diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index ab5dd4544b..3795e3b608 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -2,8 +2,15 @@ mod classic; mod hacks; mod link; +use std::net::IpAddr; +use std::sync::Arc; +use std::time::Duration; + +use ipnet::{Ipv4Net, Ipv6Net}; pub use link::LinkAuthError; +use tokio::io::{AsyncRead, AsyncWrite}; use tokio_postgres::config::AuthKeys; +use tracing::{info, warn}; use crate::auth::credentials::check_peer_addr_is_in_list; use crate::auth::validate_password_and_exchange; @@ -16,6 +23,7 @@ use crate::intern::EndpointIdInt; use crate::metrics::Metrics; use crate::proxy::connect_compute::ComputeConnectBackend; use crate::proxy::NeonOptions; +use crate::rate_limiter::{BucketRateLimiter, RateBucketInfo}; use crate::stream::Stream; use crate::{ auth::{self, ComputeUserInfoMaybeEndpoint}, @@ -28,9 +36,6 @@ use crate::{ stream, url, }; use crate::{scram, EndpointCacheKey, EndpointId, Normalize, RoleName}; -use std::sync::Arc; -use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::{info, warn}; /// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality pub enum MaybeOwned<'a, T> { @@ -176,11 +181,45 @@ impl TryFrom for ComputeUserInfo { } } +#[derive(PartialEq, PartialOrd, Hash, Eq, Ord, Debug, Copy, Clone)] +pub struct MaskedIp(IpAddr); + +impl MaskedIp { + fn new(value: IpAddr, prefix: u8) -> Self { + match value { + IpAddr::V4(v4) => Self(IpAddr::V4( + Ipv4Net::new(v4, prefix).map_or(v4, |x| x.trunc().addr()), + )), + IpAddr::V6(v6) => Self(IpAddr::V6( + Ipv6Net::new(v6, prefix).map_or(v6, |x| x.trunc().addr()), + )), + } + } +} + +// This can't be just per IP because that would 
limit some PaaS that share IP addresses +pub type AuthRateLimiter = BucketRateLimiter<(EndpointIdInt, MaskedIp)>; + +impl RateBucketInfo { + /// All of these are per endpoint-maskedip pair. + /// Context: 4096 rounds of pbkdf2 take about 1ms of cpu time to execute (1 milli-cpu-second or 1mcpus). + /// + /// First bucket: 1000mcpus total per endpoint-ip pair + /// * 4096000 requests per second with 1 hash rounds. + /// * 1000 requests per second with 4096 hash rounds. + /// * 6.8 requests per second with 600000 hash rounds. + pub const DEFAULT_AUTH_SET: [Self; 3] = [ + Self::new(1000 * 4096, Duration::from_secs(1)), + Self::new(600 * 4096, Duration::from_secs(60)), + Self::new(300 * 4096, Duration::from_secs(600)), + ]; +} + impl AuthenticationConfig { pub fn check_rate_limit( &self, - ctx: &mut RequestMonitoring, + config: &AuthenticationConfig, secret: AuthSecret, endpoint: &EndpointId, is_cleartext: bool, @@ -201,9 +240,13 @@ impl AuthenticationConfig { 1 }; - let limit_not_exceeded = self - .rate_limiter - .check((endpoint_int, ctx.peer_addr), password_weight); + let limit_not_exceeded = self.rate_limiter.check( + ( + endpoint_int, + MaskedIp::new(ctx.peer_addr, config.rate_limit_ip_subnet), + ), + password_weight, + ); if !limit_not_exceeded { warn!( @@ -271,6 +314,7 @@ async fn auth_quirks( let secret = match secret { Some(secret) => config.check_rate_limit( ctx, + config, secret, &info.endpoint, unauthenticated_password.is_some() || allow_cleartext, @@ -473,7 +517,7 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> { #[cfg(test)] mod tests { - use std::sync::Arc; + use std::{net::IpAddr, sync::Arc, time::Duration}; use bytes::BytesMut; use fallible_iterator::FallibleIterator; @@ -486,7 +530,7 @@ mod tests { use tokio::io::{AsyncRead, AsyncReadExt, AsyncWriteExt}; use crate::{ - auth::{ComputeUserInfoMaybeEndpoint, IpPattern}, + auth::{backend::MaskedIp, ComputeUserInfoMaybeEndpoint, IpPattern}, config::AuthenticationConfig, console::{ self, @@ -495,12 +539,12 @@ mod tests { }, context::RequestMonitoring, proxy::NeonOptions, - rate_limiter::{AuthRateLimiter, RateBucketInfo}, + rate_limiter::RateBucketInfo, scram::ServerSecret, stream::{PqStream, Stream}, }; - use super::auth_quirks; + use super::{auth_quirks, AuthRateLimiter}; struct Auth { ips: Vec, @@ -541,6 +585,7 @@ mod tests { scram_protocol_timeout: std::time::Duration::from_secs(5), rate_limiter_enabled: true, rate_limiter: AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET), + rate_limit_ip_subnet: 64, }); async fn read_message(r: &mut (impl AsyncRead + Unpin), b: &mut BytesMut) -> PgMessage { @@ -552,6 +597,51 @@ mod tests { } } + #[test] + fn masked_ip() { + let ip_a = IpAddr::V4([127, 0, 0, 1].into()); + let ip_b = IpAddr::V4([127, 0, 0, 2].into()); + let ip_c = IpAddr::V4([192, 168, 1, 101].into()); + let ip_d = IpAddr::V4([192, 168, 1, 102].into()); + let ip_e = IpAddr::V6("abcd:abcd:abcd:abcd:abcd:abcd:abcd:abcd".parse().unwrap()); + let ip_f = IpAddr::V6("abcd:abcd:abcd:abcd:1234:abcd:abcd:abcd".parse().unwrap()); + + assert_ne!(MaskedIp::new(ip_a, 64), MaskedIp::new(ip_b, 64)); + assert_ne!(MaskedIp::new(ip_a, 32), MaskedIp::new(ip_b, 32)); + assert_eq!(MaskedIp::new(ip_a, 30), MaskedIp::new(ip_b, 30)); + assert_eq!(MaskedIp::new(ip_c, 30), MaskedIp::new(ip_d, 30)); + + assert_ne!(MaskedIp::new(ip_e, 128), MaskedIp::new(ip_f, 128)); + assert_eq!(MaskedIp::new(ip_e, 64), MaskedIp::new(ip_f, 64)); + } + + #[test] + fn test_default_auth_rate_limit_set() { + // these values used to exceed 
u32::MAX + assert_eq!( + RateBucketInfo::DEFAULT_AUTH_SET, + [ + RateBucketInfo { + interval: Duration::from_secs(1), + max_rpi: 1000 * 4096, + }, + RateBucketInfo { + interval: Duration::from_secs(60), + max_rpi: 600 * 4096 * 60, + }, + RateBucketInfo { + interval: Duration::from_secs(600), + max_rpi: 300 * 4096 * 600, + } + ] + ); + + for x in RateBucketInfo::DEFAULT_AUTH_SET { + let y = x.to_string().parse().unwrap(); + assert_eq!(x, y); + } + } + #[tokio::test] async fn auth_quirks_scram() { let (mut client, server) = tokio::io::duplex(1024); diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 2e749fc7e8..06ada991f3 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -7,6 +7,7 @@ use aws_config::provider_config::ProviderConfig; use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider; use futures::future::Either; use proxy::auth; +use proxy::auth::backend::AuthRateLimiter; use proxy::auth::backend::MaybeOwned; use proxy::cancellation::CancelMap; use proxy::cancellation::CancellationHandler; @@ -20,7 +21,6 @@ use proxy::context::parquet::ParquetUploadArgs; use proxy::http; use proxy::http::health_server::AppMetrics; use proxy::metrics::Metrics; -use proxy::rate_limiter::AuthRateLimiter; use proxy::rate_limiter::EndpointRateLimiter; use proxy::rate_limiter::RateBucketInfo; use proxy::rate_limiter::RateLimiterConfig; @@ -152,6 +152,9 @@ struct ProxyCliArgs { /// Authentication rate limiter max number of hashes per second. #[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)] auth_rate_limit: Vec, + /// The IP subnet to use when considering whether two IP addresses are considered the same. + #[clap(long, default_value_t = 64)] + auth_rate_limit_ip_subnet: u8, /// Redis rate limiter max number of requests per second. 
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)] redis_rps_limit: Vec, @@ -575,6 +578,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { scram_protocol_timeout: args.scram_protocol_timeout, rate_limiter_enabled: args.auth_rate_limit_enabled, rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()), + rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet, }; let mut endpoint_rps_limit = args.endpoint_rps_limit.clone(); diff --git a/proxy/src/config.rs b/proxy/src/config.rs index b4b2ce8dbd..7b4c02393b 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,6 +1,6 @@ use crate::{ - auth, - rate_limiter::{AuthRateLimiter, RateBucketInfo}, + auth::{self, backend::AuthRateLimiter}, + rate_limiter::RateBucketInfo, serverless::GlobalConnPoolOptions, }; use anyhow::{bail, ensure, Context, Ok}; @@ -58,6 +58,7 @@ pub struct AuthenticationConfig { pub scram_protocol_timeout: tokio::time::Duration, pub rate_limiter_enabled: bool, pub rate_limiter: AuthRateLimiter, + pub rate_limit_ip_subnet: u8, } impl TlsConfig { diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs index a3b83e5e50..2a7297ef81 100644 --- a/proxy/src/rate_limiter.rs +++ b/proxy/src/rate_limiter.rs @@ -4,4 +4,4 @@ mod limiter; pub use aimd::Aimd; pub use limit_algorithm::{AimdConfig, Fixed, RateLimitAlgorithm, RateLimiterConfig}; pub use limiter::Limiter; -pub use limiter::{AuthRateLimiter, EndpointRateLimiter, GlobalRateLimiter, RateBucketInfo}; +pub use limiter::{BucketRateLimiter, EndpointRateLimiter, GlobalRateLimiter, RateBucketInfo}; diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index 7e9370f606..a0a4e82fe5 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -2,7 +2,6 @@ use std::{ borrow::Cow, collections::hash_map::RandomState, hash::{BuildHasher, Hash}, - net::IpAddr, sync::{ atomic::{AtomicUsize, Ordering}, Arc, Mutex, @@ -18,11 +17,8 @@ use tokio::time::{timeout, Duration, Instant}; use tracing::info; use crate::{ - intern::EndpointIdInt, - { - metrics::{Metrics, RateLimit}, - EndpointId, - }, + metrics::{Metrics, RateLimit}, + EndpointId, }; use super::{ @@ -81,9 +77,6 @@ impl GlobalRateLimiter { // I went with a more expensive way that yields user-friendlier error messages. pub type EndpointRateLimiter = BucketRateLimiter; -// This can't be just per IP because that would limit some PaaS that share IP addresses -pub type AuthRateLimiter = BucketRateLimiter<(EndpointIdInt, IpAddr), StdRng, RandomState>; - pub struct BucketRateLimiter { map: DashMap, Hasher>, info: Cow<'static, [RateBucketInfo]>, @@ -155,19 +148,6 @@ impl RateBucketInfo { Self::new(100, Duration::from_secs(600)), ]; - /// All of these are per endpoint-ip pair. - /// Context: 4096 rounds of pbkdf2 take about 1ms of cpu time to execute (1 milli-cpu-second or 1mcpus). - /// - /// First bucket: 300mcpus total per endpoint-ip pair - /// * 1228800 requests per second with 1 hash rounds. (endpoint rate limiter will catch this first) - /// * 300 requests per second with 4096 hash rounds. - /// * 2 requests per second with 600000 hash rounds. 
- pub const DEFAULT_AUTH_SET: [Self; 3] = [ - Self::new(300 * 4096, Duration::from_secs(1)), - Self::new(200 * 4096, Duration::from_secs(60)), - Self::new(100 * 4096, Duration::from_secs(600)), - ]; - pub fn validate(info: &mut [Self]) -> anyhow::Result<()> { info.sort_unstable_by_key(|info| info.interval); let invalid = info @@ -783,31 +763,4 @@ mod tests { } assert!(limiter.map.len() < 150_000); } - - #[test] - fn test_default_auth_set() { - // these values used to exceed u32::MAX - assert_eq!( - RateBucketInfo::DEFAULT_AUTH_SET, - [ - RateBucketInfo { - interval: Duration::from_secs(1), - max_rpi: 300 * 4096, - }, - RateBucketInfo { - interval: Duration::from_secs(60), - max_rpi: 200 * 4096 * 60, - }, - RateBucketInfo { - interval: Duration::from_secs(600), - max_rpi: 100 * 4096 * 600, - } - ] - ); - - for x in RateBucketInfo::DEFAULT_AUTH_SET { - let y = x.to_string().parse().unwrap(); - assert_eq!(x, y); - } - } } diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 8aa5ad4e8a..e74c63599a 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -6,7 +6,7 @@ use tracing::{field::display, info}; use crate::{ auth::{backend::ComputeCredentials, check_peer_addr_is_in_list, AuthError}, compute, - config::ProxyConfig, + config::{AuthenticationConfig, ProxyConfig}, console::{ errors::{GetAuthInfoError, WakeComputeError}, CachedNodeInfo, @@ -27,6 +27,7 @@ impl PoolingBackend { pub async fn authenticate( &self, ctx: &mut RequestMonitoring, + config: &AuthenticationConfig, conn_info: &ConnInfo, ) -> Result { let user_info = conn_info.user_info.clone(); @@ -43,6 +44,7 @@ impl PoolingBackend { let secret = match cached_secret.value.clone() { Some(secret) => self.config.authentication_config.check_rate_limit( ctx, + config, secret, &user_info.endpoint, true, diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index a66edb2c66..e856053a7e 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -541,7 +541,9 @@ async fn handle_inner( .map_err(SqlOverHttpError::from); let authenticate_and_connect = async { - let keys = backend.authenticate(ctx, &conn_info).await?; + let keys = backend + .authenticate(ctx, &config.authentication_config, &conn_info) + .await?; let client = backend .connect_to_compute(ctx, conn_info, keys, !allow_pool) .await?; From 1c012958c7b350eacf94ce631e271ef7afd2a575 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 16 Apr 2024 16:24:09 +0100 Subject: [PATCH 0569/1571] pageserver/http: remove status code boilerplate from swagger spec (#7385) ## Problem We specify a bunch of possible error codes in the pageserver api swagger spec. This is error prone and annoying to work with. https://github.com/neondatabase/cloud/pull/11907 introduced generic error handling on the control plane side, so we can now clean up the spec. ## Summary of changes * Remove generic error codes from swagger spec * Update a couple route handlers which would previously return an error without a `msg` field in the response body. 
Tested via https://github.com/neondatabase/cloud/pull/12340 Related https://github.com/neondatabase/cloud/issues/7238 --- pageserver/src/http/openapi_spec.yml | 615 +-------------------------- pageserver/src/http/routes.rs | 10 +- 2 files changed, 11 insertions(+), 614 deletions(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 2713309824..d89f949688 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -58,24 +58,6 @@ paths: responses: "200": description: The reload completed successfully. - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error (also hits if no keys were found) - content: - application/json: - schema: - $ref: "#/components/schemas/Error" /v1/tenant/{tenant_id}: parameters: @@ -93,62 +75,14 @@ paths: application/json: schema: $ref: "#/components/schemas/TenantInfo" - "400": - description: Error when no tenant id found in path or no timeline id - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" delete: description: | Attempts to delete specified tenant. 500, 503 and 409 errors should be retried until 404 is retrieved. 404 means that deletion successfully finished" responses: - "400": - description: Error when no tenant id found in path - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" "404": - description: Tenant not found + description: Tenant not found. This is the success path. content: application/json: schema: @@ -165,18 +99,6 @@ paths: application/json: schema: $ref: "#/components/schemas/PreconditionFailedError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/time_travel_remote_storage: parameters: @@ -206,36 +128,6 @@ paths: application/json: schema: type: string - "400": - description: Error when no tenant id found in path or invalid timestamp - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/timeline: parameters: @@ -255,36 +147,6 @@ paths: type: array items: $ref: "#/components/schemas/TimelineInfo" - "400": - description: Error when no tenant id found in path - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/timeline/{timeline_id}: @@ -309,60 +171,12 @@ paths: application/json: schema: $ref: "#/components/schemas/TimelineInfo" - "400": - description: Error when no tenant id found in path or no timeline id - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" delete: description: "Attempts to delete specified timeline. 500 and 409 errors should be retried" responses: - "400": - description: Error when no tenant id found in path or no timeline id - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" "404": - description: Timeline not found + description: Timeline not found. This is the success path. 
content: application/json: schema: @@ -379,18 +193,6 @@ paths: application/json: schema: $ref: "#/components/schemas/PreconditionFailedError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_timestamp_of_lsn: parameters: @@ -423,36 +225,6 @@ paths: schema: type: string format: date-time - "400": - description: Error when no tenant id found in path, no timeline id or invalid timestamp - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "404": - description: Timeline not found, or there is no timestamp information for the given lsn - content: - application/json: - schema: - $ref: "#/components/schemas/NotFoundError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp: parameters: @@ -484,36 +256,6 @@ paths: application/json: schema: $ref: "#/components/schemas/LsnByTimestampResponse" - "400": - description: Error when no tenant id found in path, no timeline id or invalid timestamp - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc: parameters: @@ -537,36 +279,6 @@ paths: application/json: schema: type: string - "400": - description: Error when no tenant id found in path, no timeline id or invalid timestamp - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_shard_id}/location_config: parameters: - name: tenant_shard_id @@ -628,24 +340,6 @@ paths: application/json: schema: $ref: "#/components/schemas/TenantLocationConfigResponse" - "503": - description: Tenant's state cannot be changed right now. Wait a few seconds and retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" "409": description: | The tenant is already known to Pageserver in some way, @@ -662,12 +356,6 @@ paths: application/json: schema: $ref: "#/components/schemas/ConflictError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" /v1/tenant/{tenant_id}/ignore: parameters: - name: tenant_id @@ -684,36 +372,6 @@ paths: responses: "200": description: Tenant ignored - "400": - description: Error when no tenant id found in path parameters - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/load: @@ -740,36 +398,6 @@ paths: responses: "202": description: Tenant scheduled to load successfully - "400": - description: Error when no tenant id found in path parameters - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive: parameters: @@ -790,37 +418,6 @@ paths: responses: "202": description: Tenant scheduled to load successfully - "404": - description: No tenant or timeline found for the specified ids - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" - /v1/tenant/{tenant_id}/synthetic_size: parameters: @@ -839,31 +436,8 @@ paths: application/json: schema: $ref: "#/components/schemas/SyntheticSizeResponse" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" + # This route has no handler. TODO: remove? /v1/tenant/{tenant_id}/size: parameters: - name: tenant_id @@ -945,18 +519,6 @@ paths: responses: "200": description: Success - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_shard_id}/secondary/download: parameters: @@ -987,20 +549,6 @@ paths: application/json: schema: $ref: "#/components/schemas/SecondaryProgress" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" - - /v1/tenant/{tenant_id}/timeline/: parameters: @@ -1043,24 +591,6 @@ paths: application/json: schema: $ref: "#/components/schemas/TimelineInfo" - "400": - description: Malformed timeline create request - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" "406": description: Permanently unsatisfiable request, don't retry. content: @@ -1079,18 +609,6 @@ paths: application/json: schema: $ref: "#/components/schemas/Error" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/: get: @@ -1104,30 +622,6 @@ paths: type: array items: $ref: "#/components/schemas/TenantInfo" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" post: description: | @@ -1148,43 +642,12 @@ paths: application/json: schema: type: string - "400": - description: Malformed tenant create request - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" "409": description: Tenant already exists, creation skipped content: application/json: schema: $ref: "#/components/schemas/ConflictError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" - /v1/tenant/config: put: @@ -1206,36 +669,6 @@ paths: type: array items: $ref: "#/components/schemas/TenantInfo" - "400": - description: Malformed tenant config request - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/config/: parameters: @@ -1255,42 +688,6 @@ paths: application/json: schema: $ref: "#/components/schemas/TenantConfigResponse" - "400": - description: Malformed get tenanant config request - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "404": - description: Tenand or timeline were not found - content: - application/json: - schema: - $ref: "#/components/schemas/NotFoundError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/utilization: get: @@ -1304,12 +701,6 @@ paths: application/json: schema: $ref: "#/components/schemas/PageserverUtilization" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" components: securitySchemes: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 0b8c991f11..20258dd950 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -457,8 +457,12 @@ async fn reload_auth_validation_keys_handler( json_response(StatusCode::OK, ()) } Err(e) => { + let err_msg = "Error reloading public keys"; warn!("Error reloading public keys from {key_path:?}: {e:}"); - json_response(StatusCode::INTERNAL_SERVER_ERROR, ()) + json_response( + StatusCode::INTERNAL_SERVER_ERROR, + HttpErrorBody::from_msg(err_msg.to_string()), + ) } } } @@ -772,7 +776,9 @@ async fn get_timestamp_of_lsn_handler( let time = format_rfc3339(postgres_ffi::from_pg_timestamp(time)).to_string(); json_response(StatusCode::OK, time) } - None => json_response(StatusCode::NOT_FOUND, ()), + None => Err(ApiError::NotFound( + anyhow::anyhow!("Timestamp for lsn {} not found", lsn).into(), + )), } } From 9e567d9814d139698dae041db849d201717ef58d Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 17 Apr 2024 09:10:01 +0300 Subject: [PATCH 0570/1571] feat(neon_local): support listen addr for safekeeper (#7328) Leftover from my LFC benchmarks. Safekeepers only listen on `127.0.0.1` for `neon_local`. This pull request adds support for listening on other address. To specify a custom address, modify `.neon/config`. ``` [[safekeepers]] listen_addr = "192.168.?.?" ``` Endpoints created by neon_local still use 127.0.0.1 and I will fix them later. I didn't fix it in the same pull request because my benchmark setting does not use neon_local to create compute nodes so I don't know how to fix it yet -- maybe replacing a few `127.0.0.1`s. 
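For illustration, a fuller hypothetical `.neon/config` entry might look like the snippet below; the `id` and port values are placeholders, and only `listen_addr` is the new field.

```
# Hypothetical example; id and port values are placeholders.
[[safekeepers]]
id = 1
pg_port = 5454
http_port = 7676
listen_addr = "192.168.1.10"
```

Leaving `listen_addr` out keeps the previous behaviour of binding to 127.0.0.1.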
Signed-off-by: Alex Chi Z --- control_plane/src/local_env.rs | 2 ++ control_plane/src/safekeeper.rs | 21 ++++++++++++++------- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index bd3dbef453..38b7fffd09 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -156,6 +156,7 @@ pub struct SafekeeperConf { pub remote_storage: Option, pub backup_threads: Option, pub auth_enabled: bool, + pub listen_addr: Option, } impl Default for SafekeeperConf { @@ -169,6 +170,7 @@ impl Default for SafekeeperConf { remote_storage: None, backup_threads: None, auth_enabled: false, + listen_addr: None, } } } diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 6ac71dfe51..d62a2e80b5 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -70,24 +70,31 @@ pub struct SafekeeperNode { pub pg_connection_config: PgConnectionConfig, pub env: LocalEnv, pub http_client: reqwest::Client, + pub listen_addr: String, pub http_base_url: String, } impl SafekeeperNode { pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode { + let listen_addr = if let Some(ref listen_addr) = conf.listen_addr { + listen_addr.clone() + } else { + "127.0.0.1".to_string() + }; SafekeeperNode { id: conf.id, conf: conf.clone(), - pg_connection_config: Self::safekeeper_connection_config(conf.pg_port), + pg_connection_config: Self::safekeeper_connection_config(&listen_addr, conf.pg_port), env: env.clone(), http_client: reqwest::Client::new(), - http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port), + http_base_url: format!("http://{}:{}/v1", listen_addr, conf.http_port), + listen_addr, } } /// Construct libpq connection string for connecting to this safekeeper. - fn safekeeper_connection_config(port: u16) -> PgConnectionConfig { - PgConnectionConfig::new_host_port(url::Host::parse("127.0.0.1").unwrap(), port) + fn safekeeper_connection_config(addr: &str, port: u16) -> PgConnectionConfig { + PgConnectionConfig::new_host_port(url::Host::parse(addr).unwrap(), port) } pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf { @@ -111,8 +118,8 @@ impl SafekeeperNode { ); io::stdout().flush().unwrap(); - let listen_pg = format!("127.0.0.1:{}", self.conf.pg_port); - let listen_http = format!("127.0.0.1:{}", self.conf.http_port); + let listen_pg = format!("{}:{}", self.listen_addr, self.conf.pg_port); + let listen_http = format!("{}:{}", self.listen_addr, self.conf.http_port); let id = self.id; let datadir = self.datadir_path(); @@ -139,7 +146,7 @@ impl SafekeeperNode { availability_zone, ]; if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port { - let listen_pg_tenant_only = format!("127.0.0.1:{}", pg_tenant_only_port); + let listen_pg_tenant_only = format!("{}:{}", self.listen_addr, pg_tenant_only_port); args.extend(["--listen-pg-tenant-only".to_owned(), listen_pg_tenant_only]); } if !self.conf.sync { From cb4b40f9c1afb6fe1dbf19691845dd65b187929e Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 17 Apr 2024 09:11:04 +0300 Subject: [PATCH 0571/1571] chore(compute_ctl): add error context to apply_spec (#7374) Make it faster to identify which part of apply spec goes wrong by adding an error context. 
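For readers unfamiliar with the idiom, the change relies on `anyhow::Context` to label each step. A minimal sketch follows, with simplified stand-in steps and a made-up error, not the real compute_tools code:

```
use anyhow::{Context, Result};

// Stand-in steps; the real code wraps handle_roles, handle_databases, etc.
fn step_one() -> Result<()> {
    Ok(())
}

fn step_two() -> Result<()> {
    Err(anyhow::anyhow!("connection reset"))
}

fn apply_config() -> Result<()> {
    // Each step gets a label, so a failure names the phase that broke.
    step_one().context("apply_config step_one")?;
    step_two().context("apply_config step_two")?;
    Ok(())
}

fn main() {
    if let Err(e) = apply_config() {
        // "{:#}" prints the whole context chain on one line, e.g.
        // "apply_config step_two: connection reset".
        eprintln!("error: {:#}", e);
    }
}
```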
Signed-off-by: Alex Chi Z --- compute_tools/src/compute.rs | 39 ++++++++++++++++--------- compute_tools/src/spec.rs | 55 +++++++++++++++++++++++------------- 2 files changed, 60 insertions(+), 34 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 88dc4aca2b..40060f4117 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -818,9 +818,15 @@ impl ComputeNode { Client::connect(zenith_admin_connstr.as_str(), NoTls) .context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?; // Disable forwarding so that users don't get a cloud_admin role - client.simple_query("SET neon.forward_ddl = false")?; - client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?; - client.simple_query("GRANT zenith_admin TO cloud_admin")?; + + let mut func = || { + client.simple_query("SET neon.forward_ddl = false")?; + client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?; + client.simple_query("GRANT zenith_admin TO cloud_admin")?; + Ok::<_, anyhow::Error>(()) + }; + func().context("apply_config setup cloud_admin")?; + drop(client); // reconnect with connstring with expected name @@ -832,24 +838,29 @@ impl ComputeNode { }; // Disable DDL forwarding because control plane already knows about these roles/databases. - client.simple_query("SET neon.forward_ddl = false")?; + client + .simple_query("SET neon.forward_ddl = false") + .context("apply_config SET neon.forward_ddl = false")?; // Proceed with post-startup configuration. Note, that order of operations is important. let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec; - create_neon_superuser(spec, &mut client)?; - cleanup_instance(&mut client)?; - handle_roles(spec, &mut client)?; - handle_databases(spec, &mut client)?; - handle_role_deletions(spec, connstr.as_str(), &mut client)?; + create_neon_superuser(spec, &mut client).context("apply_config create_neon_superuser")?; + cleanup_instance(&mut client).context("apply_config cleanup_instance")?; + handle_roles(spec, &mut client).context("apply_config handle_roles")?; + handle_databases(spec, &mut client).context("apply_config handle_databases")?; + handle_role_deletions(spec, connstr.as_str(), &mut client) + .context("apply_config handle_role_deletions")?; handle_grants( spec, &mut client, connstr.as_str(), self.has_feature(ComputeFeature::AnonExtension), - )?; - handle_extensions(spec, &mut client)?; - handle_extension_neon(&mut client)?; - create_availability_check_data(&mut client)?; + ) + .context("apply_config handle_grants")?; + handle_extensions(spec, &mut client).context("apply_config handle_extensions")?; + handle_extension_neon(&mut client).context("apply_config handle_extension_neon")?; + create_availability_check_data(&mut client) + .context("apply_config create_availability_check_data")?; // 'Close' connection drop(client); @@ -857,7 +868,7 @@ impl ComputeNode { // Run migrations separately to not hold up cold starts thread::spawn(move || { let mut client = Client::connect(connstr.as_str(), NoTls)?; - handle_migrations(&mut client) + handle_migrations(&mut client).context("apply_config handle_migrations") }); Ok(()) } diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 5643634633..269177ee16 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -2,7 +2,7 @@ use std::fs::File; use std::path::Path; use std::str::FromStr; -use anyhow::{anyhow, bail, Result}; +use anyhow::{anyhow, bail, 
Context, Result}; use postgres::config::Config; use postgres::{Client, NoTls}; use reqwest::StatusCode; @@ -698,7 +698,8 @@ pub fn handle_grants( // it is important to run this after all grants if enable_anon_extension { - handle_extension_anon(spec, &db.owner, &mut db_client, false)?; + handle_extension_anon(spec, &db.owner, &mut db_client, false) + .context("handle_grants handle_extension_anon")?; } } @@ -813,28 +814,36 @@ $$;"#, // Add new migrations below. ]; - let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration"; - client.simple_query(query)?; + let mut func = || { + let query = "CREATE SCHEMA IF NOT EXISTS neon_migration"; + client.simple_query(query)?; - query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)"; - client.simple_query(query)?; + let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)"; + client.simple_query(query)?; - query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING"; - client.simple_query(query)?; + let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING"; + client.simple_query(query)?; - query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin"; - client.simple_query(query)?; + let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin"; + client.simple_query(query)?; - query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC"; - client.simple_query(query)?; + let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC"; + client.simple_query(query)?; + Ok::<_, anyhow::Error>(()) + }; + func().context("handle_migrations prepare")?; - query = "SELECT id FROM neon_migration.migration_id"; - let row = client.query_one(query, &[])?; + let query = "SELECT id FROM neon_migration.migration_id"; + let row = client + .query_one(query, &[]) + .context("handle_migrations get migration_id")?; let mut current_migration: usize = row.get::<&str, i64>("id") as usize; let starting_migration_id = current_migration; - query = "BEGIN"; - client.simple_query(query)?; + let query = "BEGIN"; + client + .simple_query(query) + .context("handle_migrations begin")?; while current_migration < migrations.len() { let migration = &migrations[current_migration]; @@ -842,7 +851,9 @@ $$;"#, info!("Skip migration id={}", current_migration); } else { info!("Running migration:\n{}\n", migration); - client.simple_query(migration)?; + client.simple_query(migration).with_context(|| { + format!("handle_migrations current_migration={}", current_migration) + })?; } current_migration += 1; } @@ -850,10 +861,14 @@ $$;"#, "UPDATE neon_migration.migration_id SET id={}", migrations.len() ); - client.simple_query(&setval)?; + client + .simple_query(&setval) + .context("handle_migrations update id")?; - query = "COMMIT"; - client.simple_query(query)?; + let query = "COMMIT"; + client + .simple_query(query) + .context("handle_migrations commit")?; info!( "Ran {} migrations", From 41bb1e42b8aa6152d2f27c8f6535ce54748ef61e Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 17 Apr 2024 11:50:58 +0300 Subject: [PATCH 0572/1571] CI(check-build-tools-image): fix getting build-tools image tag (#7402) ## Problem For PRs, by default, we check out a phantom merge commit (merge a branch into the main), but using a real branches head when finding `build-tools` image tag. 
## Summary of changes - Change `COMMIT_SHA` to use `${{ github.sha }}` instead of `${{ github.event.pull_request.head.sha }}` for PRs ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --- .github/workflows/check-build-tools-image.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/check-build-tools-image.yml b/.github/workflows/check-build-tools-image.yml index 28646dfc19..a1e22cf93f 100644 --- a/.github/workflows/check-build-tools-image.yml +++ b/.github/workflows/check-build-tools-image.yml @@ -28,7 +28,9 @@ jobs: - name: Get build-tools image tag for the current commit id: get-build-tools-tag env: - COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} + # Usually, for COMMIT_SHA, we use `github.event.pull_request.head.sha || github.sha`, but here, even for PRs, + # we want to use `github.sha` i.e. point to a phantom merge commit to determine the image tag correctly. + COMMIT_SHA: ${{ github.sha }} GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | LAST_BUILD_TOOLS_SHA=$( From 13b9135d4eba2533d817ade229a2daf66f5f5eba Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Wed, 17 Apr 2024 11:11:49 +0200 Subject: [PATCH 0573/1571] proxy: Cleanup unused rate limiter (#7400) ## Problem There is an unused dead code. ## Summary of changes Let's remove it. In case we would need it in the future, we can always return it back. Also removed cli arguments. They shouldn't be used by anyone but us. --- proxy/src/bin/proxy.rs | 26 +- proxy/src/http.rs | 4 +- proxy/src/metrics.rs | 15 +- proxy/src/rate_limiter.rs | 5 - proxy/src/rate_limiter/aimd.rs | 166 --------- proxy/src/rate_limiter/limit_algorithm.rs | 98 ----- proxy/src/rate_limiter/limiter.rs | 428 +--------------------- proxy/src/usage_metrics.rs | 4 +- 8 files changed, 16 insertions(+), 730 deletions(-) delete mode 100644 proxy/src/rate_limiter/aimd.rs delete mode 100644 proxy/src/rate_limiter/limit_algorithm.rs diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 06ada991f3..cefab870cc 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -23,7 +23,6 @@ use proxy::http::health_server::AppMetrics; use proxy::metrics::Metrics; use proxy::rate_limiter::EndpointRateLimiter; use proxy::rate_limiter::RateBucketInfo; -use proxy::rate_limiter::RateLimiterConfig; use proxy::redis::cancellation_publisher::RedisPublisherClient; use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; use proxy::redis::elasticache; @@ -132,14 +131,8 @@ struct ProxyCliArgs { #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] require_client_ip: bool, /// Disable dynamic rate limiter and store the metrics to ensure its production behaviour. 
- #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] disable_dynamic_rate_limiter: bool, - /// Rate limit algorithm. Makes sense only if `disable_rate_limiter` is `false`. - #[clap(value_enum, long, default_value_t = proxy::rate_limiter::RateLimitAlgorithm::Aimd)] - rate_limit_algorithm: proxy::rate_limiter::RateLimitAlgorithm, - /// Timeout for rate limiter. If it didn't manage to aquire a permit in this time, it will return an error. - #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] - rate_limiter_timeout: tokio::time::Duration, /// Endpoint rate limiter max number of requests per second. /// /// Provided in the form '@'. @@ -158,11 +151,6 @@ struct ProxyCliArgs { /// Redis rate limiter max number of requests per second. #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)] redis_rps_limit: Vec, - /// Initial limit for dynamic rate limiter. Makes sense only if `rate_limit_algorithm` is *not* `None`. - #[clap(long, default_value_t = 100)] - initial_limit: usize, - #[clap(flatten)] - aimd_config: proxy::rate_limiter::AimdConfig, /// cache for `allowed_ips` (use `size=0` to disable) #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] allowed_ips_cache: String, @@ -497,13 +485,9 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { and metric-collection-interval must be specified" ), }; - let rate_limiter_config = RateLimiterConfig { - disable: args.disable_dynamic_rate_limiter, - algorithm: args.rate_limit_algorithm, - timeout: args.rate_limiter_timeout, - initial_limit: args.initial_limit, - aimd_config: Some(args.aimd_config), - }; + if !args.disable_dynamic_rate_limiter { + bail!("dynamic rate limiter should be disabled"); + } let auth_backend = match &args.auth_backend { AuthBackend::Console => { @@ -545,7 +529,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { tokio::spawn(locks.garbage_collect_worker()); let url = args.auth_endpoint.parse()?; - let endpoint = http::Endpoint::new(url, http::new_client(rate_limiter_config)); + let endpoint = http::Endpoint::new(url, http::new_client()); let api = console::provider::neon::Api::new(endpoint, caches, locks); let api = console::provider::ConsoleBackend::Console(api); diff --git a/proxy/src/http.rs b/proxy/src/http.rs index 95ca0ccd5c..e20488e23c 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -15,7 +15,6 @@ use tracing::trace; use crate::{ metrics::{ConsoleRequest, Metrics}, - rate_limiter, url::ApiUrl, }; use reqwest_middleware::RequestBuilder; @@ -23,7 +22,7 @@ use reqwest_middleware::RequestBuilder; /// This is the preferred way to create new http clients, /// because it takes care of observability (OpenTelemetry). /// We deliberately don't want to replace this with a public static. 
-pub fn new_client(rate_limiter_config: rate_limiter::RateLimiterConfig) -> ClientWithMiddleware { +pub fn new_client() -> ClientWithMiddleware { let client = reqwest::ClientBuilder::new() .dns_resolver(Arc::new(GaiResolver::default())) .connection_verbose(true) @@ -32,7 +31,6 @@ pub fn new_client(rate_limiter_config: rate_limiter::RateLimiterConfig) -> Clien reqwest_middleware::ClientBuilder::new(client) .with(reqwest_tracing::TracingMiddleware::default()) - .with(rate_limiter::Limiter::new(rate_limiter_config)) .build() } diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index b96950b0a2..3a4e54aea0 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -4,8 +4,8 @@ use lasso::ThreadedRodeo; use measured::{ label::StaticLabelSet, metric::{histogram::Thresholds, name::MetricName}, - Counter, CounterVec, FixedCardinalityLabel, Gauge, GaugeVec, Histogram, HistogramVec, - LabelGroup, MetricGroup, + Counter, CounterVec, FixedCardinalityLabel, Gauge, Histogram, HistogramVec, LabelGroup, + MetricGroup, }; use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec}; @@ -20,9 +20,6 @@ pub struct Metrics { #[metric(namespace = "wake_compute_lock")] pub wake_compute_lock: ApiLockMetrics, - - // the one metric not called proxy_.... - pub semaphore_control_plane_limit: GaugeVec>, } impl Metrics { @@ -31,7 +28,6 @@ impl Metrics { SELF.get_or_init(|| Metrics { proxy: ProxyMetrics::default(), wake_compute_lock: ApiLockMetrics::new(), - semaphore_control_plane_limit: GaugeVec::default(), }) } } @@ -286,13 +282,6 @@ pub enum LatencyExclusions { ClientAndCplane, } -#[derive(FixedCardinalityLabel, Copy, Clone)] -#[label(singleton = "limit")] -pub enum RateLimit { - Actual, - Expected, -} - #[derive(FixedCardinalityLabel, Copy, Clone)] #[label(singleton = "kind")] pub enum SniKind { diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs index 2a7297ef81..c542267547 100644 --- a/proxy/src/rate_limiter.rs +++ b/proxy/src/rate_limiter.rs @@ -1,7 +1,2 @@ -mod aimd; -mod limit_algorithm; mod limiter; -pub use aimd::Aimd; -pub use limit_algorithm::{AimdConfig, Fixed, RateLimitAlgorithm, RateLimiterConfig}; -pub use limiter::Limiter; pub use limiter::{BucketRateLimiter, EndpointRateLimiter, GlobalRateLimiter, RateBucketInfo}; diff --git a/proxy/src/rate_limiter/aimd.rs b/proxy/src/rate_limiter/aimd.rs deleted file mode 100644 index 2c14a54a6c..0000000000 --- a/proxy/src/rate_limiter/aimd.rs +++ /dev/null @@ -1,166 +0,0 @@ -use std::usize; - -use async_trait::async_trait; - -use super::limit_algorithm::{AimdConfig, LimitAlgorithm, Sample}; - -use super::limiter::Outcome; - -/// Loss-based congestion avoidance. -/// -/// Additive-increase, multiplicative decrease. -/// -/// Adds available currency when: -/// 1. no load-based errors are observed, and -/// 2. the utilisation of the current limit is high. -/// -/// Reduces available concurrency by a factor when load-based errors are detected. 
-pub struct Aimd { - min_limit: usize, - max_limit: usize, - decrease_factor: f32, - increase_by: usize, - min_utilisation_threshold: f32, -} - -impl Aimd { - pub fn new(config: AimdConfig) -> Self { - Self { - min_limit: config.aimd_min_limit, - max_limit: config.aimd_max_limit, - decrease_factor: config.aimd_decrease_factor, - increase_by: config.aimd_increase_by, - min_utilisation_threshold: config.aimd_min_utilisation_threshold, - } - } -} - -#[async_trait] -impl LimitAlgorithm for Aimd { - async fn update(&mut self, old_limit: usize, sample: Sample) -> usize { - use Outcome::*; - match sample.outcome { - Success => { - let utilisation = sample.in_flight as f32 / old_limit as f32; - - if utilisation > self.min_utilisation_threshold { - let limit = old_limit + self.increase_by; - limit.clamp(self.min_limit, self.max_limit) - } else { - old_limit - } - } - Overload => { - let limit = old_limit as f32 * self.decrease_factor; - - // Floor instead of round, so the limit reduces even with small numbers. - // E.g. round(2 * 0.9) = 2, but floor(2 * 0.9) = 1 - let limit = limit.floor() as usize; - - limit.clamp(self.min_limit, self.max_limit) - } - } - } -} - -#[cfg(test)] -mod tests { - use std::sync::Arc; - - use tokio::sync::Notify; - - use super::*; - - use crate::rate_limiter::{Limiter, RateLimiterConfig}; - - #[tokio::test] - async fn should_decrease_limit_on_overload() { - let config = RateLimiterConfig { - initial_limit: 10, - aimd_config: Some(AimdConfig { - aimd_decrease_factor: 0.5, - ..Default::default() - }), - disable: false, - ..Default::default() - }; - - let release_notifier = Arc::new(Notify::new()); - - let limiter = Limiter::new(config).with_release_notifier(release_notifier.clone()); - - let token = limiter.try_acquire().unwrap(); - limiter.release(token, Some(Outcome::Overload)).await; - release_notifier.notified().await; - assert_eq!(limiter.state().limit(), 5, "overload: decrease"); - } - - #[tokio::test] - async fn should_increase_limit_on_success_when_using_gt_util_threshold() { - let config = RateLimiterConfig { - initial_limit: 4, - aimd_config: Some(AimdConfig { - aimd_decrease_factor: 0.5, - aimd_min_utilisation_threshold: 0.5, - aimd_increase_by: 1, - ..Default::default() - }), - disable: false, - ..Default::default() - }; - - let limiter = Limiter::new(config); - - let token = limiter.try_acquire().unwrap(); - let _token = limiter.try_acquire().unwrap(); - let _token = limiter.try_acquire().unwrap(); - - limiter.release(token, Some(Outcome::Success)).await; - assert_eq!(limiter.state().limit(), 5, "success: increase"); - } - - #[tokio::test] - async fn should_not_change_limit_on_success_when_using_lt_util_threshold() { - let config = RateLimiterConfig { - initial_limit: 4, - aimd_config: Some(AimdConfig { - aimd_decrease_factor: 0.5, - aimd_min_utilisation_threshold: 0.5, - ..Default::default() - }), - disable: false, - ..Default::default() - }; - - let limiter = Limiter::new(config); - - let token = limiter.try_acquire().unwrap(); - - limiter.release(token, Some(Outcome::Success)).await; - assert_eq!( - limiter.state().limit(), - 4, - "success: ignore when < half limit" - ); - } - - #[tokio::test] - async fn should_not_change_limit_when_no_outcome() { - let config = RateLimiterConfig { - initial_limit: 10, - aimd_config: Some(AimdConfig { - aimd_decrease_factor: 0.5, - aimd_min_utilisation_threshold: 0.5, - ..Default::default() - }), - disable: false, - ..Default::default() - }; - - let limiter = Limiter::new(config); - - let token = 
limiter.try_acquire().unwrap(); - limiter.release(token, None).await; - assert_eq!(limiter.state().limit(), 10, "ignore"); - } -} diff --git a/proxy/src/rate_limiter/limit_algorithm.rs b/proxy/src/rate_limiter/limit_algorithm.rs deleted file mode 100644 index 5cd2d5ebb7..0000000000 --- a/proxy/src/rate_limiter/limit_algorithm.rs +++ /dev/null @@ -1,98 +0,0 @@ -//! Algorithms for controlling concurrency limits. -use async_trait::async_trait; -use std::time::Duration; - -use super::{limiter::Outcome, Aimd}; - -/// An algorithm for controlling a concurrency limit. -#[async_trait] -pub trait LimitAlgorithm: Send + Sync + 'static { - /// Update the concurrency limit in response to a new job completion. - async fn update(&mut self, old_limit: usize, sample: Sample) -> usize; -} - -/// The result of a job (or jobs), including the [Outcome] (loss) and latency (delay). -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct Sample { - pub(crate) latency: Duration, - /// Jobs in flight when the sample was taken. - pub(crate) in_flight: usize, - pub(crate) outcome: Outcome, -} - -#[derive(Clone, Copy, Debug, Default, clap::ValueEnum)] -pub enum RateLimitAlgorithm { - Fixed, - #[default] - Aimd, -} - -pub struct Fixed; - -#[async_trait] -impl LimitAlgorithm for Fixed { - async fn update(&mut self, old_limit: usize, _sample: Sample) -> usize { - old_limit - } -} - -#[derive(Clone, Copy, Debug)] -pub struct RateLimiterConfig { - pub disable: bool, - pub algorithm: RateLimitAlgorithm, - pub timeout: Duration, - pub initial_limit: usize, - pub aimd_config: Option, -} - -impl RateLimiterConfig { - pub fn create_rate_limit_algorithm(self) -> Box { - match self.algorithm { - RateLimitAlgorithm::Fixed => Box::new(Fixed), - RateLimitAlgorithm::Aimd => Box::new(Aimd::new(self.aimd_config.unwrap())), // For aimd algorithm config is mandatory. - } - } -} - -impl Default for RateLimiterConfig { - fn default() -> Self { - Self { - disable: true, - algorithm: RateLimitAlgorithm::Aimd, - timeout: Duration::from_secs(1), - initial_limit: 100, - aimd_config: Some(AimdConfig::default()), - } - } -} - -#[derive(clap::Parser, Clone, Copy, Debug)] -pub struct AimdConfig { - /// Minimum limit for AIMD algorithm. Makes sense only if `rate_limit_algorithm` is `Aimd`. - #[clap(long, default_value_t = 1)] - pub aimd_min_limit: usize, - /// Maximum limit for AIMD algorithm. Makes sense only if `rate_limit_algorithm` is `Aimd`. - #[clap(long, default_value_t = 1500)] - pub aimd_max_limit: usize, - /// Increase AIMD increase by value in case of success. Makes sense only if `rate_limit_algorithm` is `Aimd`. - #[clap(long, default_value_t = 10)] - pub aimd_increase_by: usize, - /// Decrease AIMD decrease by value in case of timout/429. Makes sense only if `rate_limit_algorithm` is `Aimd`. - #[clap(long, default_value_t = 0.9)] - pub aimd_decrease_factor: f32, - /// A threshold below which the limit won't be increased. Makes sense only if `rate_limit_algorithm` is `Aimd`. 
- #[clap(long, default_value_t = 0.8)] - pub aimd_min_utilisation_threshold: f32, -} - -impl Default for AimdConfig { - fn default() -> Self { - Self { - aimd_min_limit: 1, - aimd_max_limit: 1500, - aimd_increase_by: 10, - aimd_decrease_factor: 0.9, - aimd_min_utilisation_threshold: 0.8, - } - } -} diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index a0a4e82fe5..3796b22ae9 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -4,7 +4,7 @@ use std::{ hash::{BuildHasher, Hash}, sync::{ atomic::{AtomicUsize, Ordering}, - Arc, Mutex, + Mutex, }, }; @@ -12,19 +12,10 @@ use anyhow::bail; use dashmap::DashMap; use itertools::Itertools; use rand::{rngs::StdRng, Rng, SeedableRng}; -use tokio::sync::{Mutex as AsyncMutex, Semaphore, SemaphorePermit}; -use tokio::time::{timeout, Duration, Instant}; +use tokio::time::{Duration, Instant}; use tracing::info; -use crate::{ - metrics::{Metrics, RateLimit}, - EndpointId, -}; - -use super::{ - limit_algorithm::{LimitAlgorithm, Sample}, - RateLimiterConfig, -}; +use crate::EndpointId; pub struct GlobalRateLimiter { data: Vec, @@ -245,423 +236,16 @@ impl BucketRateLimiter { } } -/// Limits the number of concurrent jobs. -/// -/// Concurrency is limited through the use of [Token]s. Acquire a token to run a job, and release the -/// token once the job is finished. -/// -/// The limit will be automatically adjusted based on observed latency (delay) and/or failures -/// caused by overload (loss). -pub struct Limiter { - limit_algo: AsyncMutex>, - semaphore: std::sync::Arc, - config: RateLimiterConfig, - - // ONLY WRITE WHEN LIMIT_ALGO IS LOCKED - limits: AtomicUsize, - - // ONLY USE ATOMIC ADD/SUB - in_flight: Arc, - - #[cfg(test)] - notifier: Option>, -} - -/// A concurrency token, required to run a job. -/// -/// Release the token back to the [Limiter] after the job is complete. -#[derive(Debug)] -pub struct Token<'t> { - permit: Option>, - start: Instant, - in_flight: Arc, -} - -/// A snapshot of the state of the [Limiter]. -/// -/// Not guaranteed to be consistent under high concurrency. -#[derive(Debug, Clone, Copy)] -pub struct LimiterState { - limit: usize, - in_flight: usize, -} - -/// Whether a job succeeded or failed as a result of congestion/overload. -/// -/// Errors not considered to be caused by overload should be ignored. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum Outcome { - /// The job succeeded, or failed in a way unrelated to overload. - Success, - /// The job failed because of overload, e.g. it timed out or an explicit backpressure signal - /// was observed. - Overload, -} - -impl Outcome { - fn from_reqwest_error(error: &reqwest_middleware::Error) -> Self { - match error { - reqwest_middleware::Error::Middleware(_) => Outcome::Success, - reqwest_middleware::Error::Reqwest(e) => { - if let Some(status) = e.status() { - if status.is_server_error() - || reqwest::StatusCode::TOO_MANY_REQUESTS.as_u16() == status - { - Outcome::Overload - } else { - Outcome::Success - } - } else { - Outcome::Success - } - } - } - } - fn from_reqwest_response(response: &reqwest::Response) -> Self { - if response.status().is_server_error() - || response.status() == reqwest::StatusCode::TOO_MANY_REQUESTS - { - Outcome::Overload - } else { - Outcome::Success - } - } -} - -impl Limiter { - /// Create a limiter with a given limit control algorithm. 
- pub fn new(config: RateLimiterConfig) -> Self { - assert!(config.initial_limit > 0); - Self { - limit_algo: AsyncMutex::new(config.create_rate_limit_algorithm()), - semaphore: Arc::new(Semaphore::new(config.initial_limit)), - config, - limits: AtomicUsize::new(config.initial_limit), - in_flight: Arc::new(AtomicUsize::new(0)), - #[cfg(test)] - notifier: None, - } - } - // pub fn new(limit_algorithm: T, timeout: Duration, initial_limit: usize) -> Self { - // assert!(initial_limit > 0); - - // Self { - // limit_algo: AsyncMutex::new(limit_algorithm), - // semaphore: Arc::new(Semaphore::new(initial_limit)), - // timeout, - // limits: AtomicUsize::new(initial_limit), - // in_flight: Arc::new(AtomicUsize::new(0)), - // #[cfg(test)] - // notifier: None, - // } - // } - - /// In some cases [Token]s are acquired asynchronously when updating the limit. - #[cfg(test)] - pub fn with_release_notifier(mut self, n: std::sync::Arc) -> Self { - self.notifier = Some(n); - self - } - - /// Try to immediately acquire a concurrency [Token]. - /// - /// Returns `None` if there are none available. - pub fn try_acquire(&self) -> Option { - let result = if self.config.disable { - // If the rate limiter is disabled, we can always acquire a token. - Some(Token::new(None, self.in_flight.clone())) - } else { - self.semaphore - .try_acquire() - .map(|permit| Token::new(Some(permit), self.in_flight.clone())) - .ok() - }; - if result.is_some() { - self.in_flight.fetch_add(1, Ordering::AcqRel); - } - result - } - - /// Try to acquire a concurrency [Token], waiting for `duration` if there are none available. - /// - /// Returns `None` if there are none available after `duration`. - pub async fn acquire_timeout(&self, duration: Duration) -> Option> { - info!("acquiring token: {:?}", self.semaphore.available_permits()); - let result = if self.config.disable { - // If the rate limiter is disabled, we can always acquire a token. - Some(Token::new(None, self.in_flight.clone())) - } else { - match timeout(duration, self.semaphore.acquire()).await { - Ok(maybe_permit) => maybe_permit - .map(|permit| Token::new(Some(permit), self.in_flight.clone())) - .ok(), - Err(_) => None, - } - }; - if result.is_some() { - self.in_flight.fetch_add(1, Ordering::AcqRel); - } - result - } - - /// Return the concurrency [Token], along with the outcome of the job. - /// - /// The [Outcome] of the job, and the time taken to perform it, may be used - /// to update the concurrency limit. - /// - /// Set the outcome to `None` to ignore the job. - pub async fn release(&self, mut token: Token<'_>, outcome: Option) { - tracing::info!("outcome is {:?}", outcome); - let in_flight = self.in_flight.load(Ordering::Acquire); - let old_limit = self.limits.load(Ordering::Acquire); - let available = if self.config.disable { - 0 // This is not used in the algorithm and can be anything. If the config disable it makes sense to set it to 0. 
- } else { - self.semaphore.available_permits() - }; - let total = in_flight + available; - - let mut algo = self.limit_algo.lock().await; - - let new_limit = if let Some(outcome) = outcome { - let sample = Sample { - latency: token.start.elapsed(), - in_flight, - outcome, - }; - algo.update(old_limit, sample).await - } else { - old_limit - }; - tracing::info!("new limit is {}", new_limit); - let actual_limit = if new_limit < total { - token.forget(); - total.saturating_sub(1) - } else { - if !self.config.disable { - self.semaphore.add_permits(new_limit.saturating_sub(total)); - } - new_limit - }; - let metric = &Metrics::get().semaphore_control_plane_limit; - metric.set(RateLimit::Expected, new_limit as i64); - metric.set(RateLimit::Actual, actual_limit as i64); - self.limits.store(new_limit, Ordering::Release); - #[cfg(test)] - if let Some(n) = &self.notifier { - n.notify_one(); - } - } - - /// The current state of the limiter. - pub fn state(&self) -> LimiterState { - let limit = self.limits.load(Ordering::Relaxed); - let in_flight = self.in_flight.load(Ordering::Relaxed); - LimiterState { limit, in_flight } - } -} - -impl<'t> Token<'t> { - fn new(permit: Option>, in_flight: Arc) -> Self { - Self { - permit, - start: Instant::now(), - in_flight, - } - } - - pub fn forget(&mut self) { - if let Some(permit) = self.permit.take() { - permit.forget(); - } - } -} - -impl Drop for Token<'_> { - fn drop(&mut self) { - self.in_flight.fetch_sub(1, Ordering::AcqRel); - } -} - -impl LimiterState { - /// The current concurrency limit. - pub fn limit(&self) -> usize { - self.limit - } - /// The number of jobs in flight. - pub fn in_flight(&self) -> usize { - self.in_flight - } -} - -#[async_trait::async_trait] -impl reqwest_middleware::Middleware for Limiter { - async fn handle( - &self, - req: reqwest::Request, - extensions: &mut task_local_extensions::Extensions, - next: reqwest_middleware::Next<'_>, - ) -> reqwest_middleware::Result { - let timer = Metrics::get() - .proxy - .control_plane_token_acquire_seconds - .start_timer(); - let token = self - .acquire_timeout(self.config.timeout) - .await - .ok_or_else(|| { - reqwest_middleware::Error::Middleware( - // TODO: Should we map it into user facing errors? 
- crate::console::errors::ApiError::Console { - status: crate::http::StatusCode::TOO_MANY_REQUESTS, - text: "Too many requests".into(), - } - .into(), - ) - })?; - let duration = timer.observe(); - info!( - ?duration, - "waiting for token to connect to the control plane" - ); - - match next.run(req, extensions).await { - Ok(response) => { - self.release(token, Some(Outcome::from_reqwest_response(&response))) - .await; - Ok(response) - } - Err(e) => { - self.release(token, Some(Outcome::from_reqwest_error(&e))) - .await; - Err(e) - } - } - } -} - #[cfg(test)] mod tests { - use std::{hash::BuildHasherDefault, pin::pin, task::Context, time::Duration}; + use std::{hash::BuildHasherDefault, time::Duration}; - use futures::{task::noop_waker_ref, Future}; use rand::SeedableRng; use rustc_hash::FxHasher; use tokio::time; - use super::{BucketRateLimiter, EndpointRateLimiter, Limiter, Outcome}; - use crate::{ - rate_limiter::{RateBucketInfo, RateLimitAlgorithm}, - EndpointId, - }; - - #[tokio::test] - async fn it_works() { - let config = super::RateLimiterConfig { - algorithm: RateLimitAlgorithm::Fixed, - timeout: Duration::from_secs(1), - initial_limit: 10, - disable: false, - ..Default::default() - }; - let limiter = Limiter::new(config); - - let token = limiter.try_acquire().unwrap(); - - limiter.release(token, Some(Outcome::Success)).await; - - assert_eq!(limiter.state().limit(), 10); - } - - #[tokio::test] - async fn is_fair() { - let config = super::RateLimiterConfig { - algorithm: RateLimitAlgorithm::Fixed, - timeout: Duration::from_secs(1), - initial_limit: 1, - disable: false, - ..Default::default() - }; - let limiter = Limiter::new(config); - - // === TOKEN 1 === - let token1 = limiter.try_acquire().unwrap(); - - let mut token2_fut = pin!(limiter.acquire_timeout(Duration::from_secs(1))); - assert!( - token2_fut - .as_mut() - .poll(&mut Context::from_waker(noop_waker_ref())) - .is_pending(), - "token is acquired by token1" - ); - - let mut token3_fut = pin!(limiter.acquire_timeout(Duration::from_secs(1))); - assert!( - token3_fut - .as_mut() - .poll(&mut Context::from_waker(noop_waker_ref())) - .is_pending(), - "token is acquired by token1" - ); - - limiter.release(token1, Some(Outcome::Success)).await; - // === END TOKEN 1 === - - // === TOKEN 2 === - assert!( - limiter.try_acquire().is_none(), - "token is acquired by token2" - ); - - assert!( - token3_fut - .as_mut() - .poll(&mut Context::from_waker(noop_waker_ref())) - .is_pending(), - "token is acquired by token2" - ); - - let token2 = token2_fut.await.unwrap(); - - limiter.release(token2, Some(Outcome::Success)).await; - // === END TOKEN 2 === - - // === TOKEN 3 === - assert!( - limiter.try_acquire().is_none(), - "token is acquired by token3" - ); - - let token3 = token3_fut.await.unwrap(); - limiter.release(token3, Some(Outcome::Success)).await; - // === END TOKEN 3 === - - // === TOKEN 4 === - let token4 = limiter.try_acquire().unwrap(); - limiter.release(token4, Some(Outcome::Success)).await; - } - - #[tokio::test] - async fn disable() { - let config = super::RateLimiterConfig { - algorithm: RateLimitAlgorithm::Fixed, - timeout: Duration::from_secs(1), - initial_limit: 1, - disable: true, - ..Default::default() - }; - let limiter = Limiter::new(config); - - // === TOKEN 1 === - let token1 = limiter.try_acquire().unwrap(); - let token2 = limiter.try_acquire().unwrap(); - let state = limiter.state(); - assert_eq!(state.limit(), 1); - assert_eq!(state.in_flight(), 2); // For disabled limiter, it's expected. 
- limiter.release(token1, None).await; - limiter.release(token2, None).await; - } + use super::{BucketRateLimiter, EndpointRateLimiter}; + use crate::{rate_limiter::RateBucketInfo, EndpointId}; #[test] fn rate_bucket_rpi() { diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index 5ffbf95c07..56ed2145dc 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -495,7 +495,7 @@ mod tests { use url::Url; use super::*; - use crate::{http, rate_limiter::RateLimiterConfig, BranchId, EndpointId}; + use crate::{http, BranchId, EndpointId}; #[tokio::test] async fn metrics() { @@ -525,7 +525,7 @@ mod tests { tokio::spawn(server); let metrics = Metrics::default(); - let client = http::new_client(RateLimiterConfig::default()); + let client = http::new_client(); let endpoint = Url::parse(&format!("http://{addr}")).unwrap(); let now = Utc::now(); From e49e931bc44c0ebe52a90db865b64c87f3281c92 Mon Sep 17 00:00:00 2001 From: Jure Bajic Date: Wed, 17 Apr 2024 11:23:55 +0200 Subject: [PATCH 0574/1571] Add for `add-help-for-timeline-arg` for `timeline` command (#7361) ## Problem When calling `./neon_local timeline` a confusing error message pops up: `command failed: no tenant subcommand provided` ## Summary of changes Add `add-help-for-timeline-arg` for timeline commands so when no argument for the timeline is provided help is printed. --- control_plane/src/bin/neon_local.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 68a5474c87..7f8f6d21e0 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -1417,6 +1417,7 @@ fn cli() -> Command { .subcommand( Command::new("timeline") .about("Manage timelines") + .arg_required_else_help(true) .subcommand(Command::new("list") .about("List all timelines, available to this pageserver") .arg(tenant_id_arg.clone())) From 3023de156e35db166d8d24a4d298f36f558593eb Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 17 Apr 2024 11:32:07 +0100 Subject: [PATCH 0575/1571] pageserver: demote range end fallback log (#7403) ## Problem This trace is emitted whenever a vectored read touches the end of a delta layer file. It's a perfectly normal case, but I expected it to be more rare when implementing the code. ## Summary of changes Demote log to debug. --- pageserver/src/tenant/storage_layer/delta_layer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 466d95f46d..255855a246 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -939,7 +939,7 @@ impl DeltaLayerInner { } if !range_end_handled { - tracing::info!("Handling range end fallback at {}", data_end_offset); + tracing::debug!("Handling range end fallback at {}", data_end_offset); planner.handle_range_end(data_end_offset); } } From fd49005cb3016da98e6f0f6305549a601e7ebc7b Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Wed, 17 Apr 2024 13:33:31 +0200 Subject: [PATCH 0576/1571] proxy: Improve logging (#7405) ## Problem It's unclear from logs what's going on with the regional redis. ## Summary of changes Make logs better. 
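The core of the change is attaching a named tracing span to the background task that reads the endpoints stream, so every log line it emits carries that context. A minimal sketch of the pattern, assuming the `tracing` and `tokio` crates (the task body and span name here are illustrative, not the proxy's actual code):

```rust
use tracing::{info, info_span, Instrument};

// Stand-in for the cache reader loop; any log emitted inside the
// instrumented future inherits the span and its fields.
async fn read_endpoints_stream() {
    info!("reading endpoint stream from redis");
}

fn spawn_cache_reader() {
    // Create the span once and attach it to the whole future with
    // `.instrument(span)`; "endpoints_cache" then shows up on every
    // log line the spawned task produces.
    let span = info_span!("endpoints_cache");
    tokio::spawn(read_endpoints_stream().instrument(span));
}
```

The patch also issues an explicit PING after (re)establishing the regional redis connection and logs the outcome, so a broken connection is visible in the logs rather than failing silently later.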
--- proxy/src/bin/proxy.rs | 4 +++- proxy/src/cache/endpoints.rs | 9 ++++++++- proxy/src/context.rs | 17 +++++++++++++++-- .../connection_with_credentials_provider.rs | 16 ++++++++++++++-- 4 files changed, 40 insertions(+), 6 deletions(-) diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index cefab870cc..71283dd606 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -42,6 +42,7 @@ use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::info; use tracing::warn; +use tracing::Instrument; use utils::{project_build_tag, project_git_version, sentry_init::init_sentry}; project_git_version!(GIT_VERSION); @@ -418,7 +419,8 @@ async fn main() -> anyhow::Result<()> { if let Some(regional_redis_client) = regional_redis_client { let cache = api.caches.endpoints_cache.clone(); let con = regional_redis_client; - maintenance_tasks.spawn(async move { cache.do_read(con).await }); + let span = tracing::info_span!("endpoints_cache"); + maintenance_tasks.spawn(async move { cache.do_read(con).await }.instrument(span)); } } } diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index f3f9e9395f..72543c6408 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -13,6 +13,7 @@ use redis::{ }; use serde::Deserialize; use tokio::sync::Mutex; +use tracing::info; use crate::{ config::EndpointCacheConfig, @@ -71,7 +72,9 @@ impl EndpointsCache { } // If cache is disabled, just collect the metrics and return. if self.config.disable_cache { - ctx.set_rejected(self.should_reject(endpoint)); + let rejected = self.should_reject(endpoint); + ctx.set_rejected(rejected); + info!(?rejected, "check endpoint is valid, disabled cache"); return true; } // If the limiter allows, we don't need to check the cache. 
@@ -79,6 +82,7 @@ impl EndpointsCache { return true; } let rejected = self.should_reject(endpoint); + info!(?rejected, "check endpoint is valid, enabled cache"); ctx.set_rejected(rejected); !rejected } @@ -171,6 +175,9 @@ impl EndpointsCache { if res.keys.is_empty() { if return_when_finish { + if total != 0 { + break; + } anyhow::bail!( "Redis stream {} is empty, cannot be used to filter endpoints", self.config.stream_name diff --git a/proxy/src/context.rs b/proxy/src/context.rs index d7b5be5534..95c74e6cca 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -5,7 +5,7 @@ use once_cell::sync::OnceCell; use smol_str::SmolStr; use std::net::IpAddr; use tokio::sync::mpsc; -use tracing::{field::display, info_span, Span}; +use tracing::{field::display, info, info_span, Span}; use uuid::Uuid; use crate::{ @@ -198,12 +198,25 @@ impl Drop for RequestMonitoring { } else { ConnectOutcome::Failed }; + let rejected = self.rejected; + let ep = self + .endpoint_id + .as_ref() + .map(|x| x.as_str()) + .unwrap_or_default(); + // This makes sense only if cache is disabled + info!( + ?ep, + ?outcome, + ?rejected, + "check endpoint is valid with outcome" + ); Metrics::get() .proxy .invalid_endpoints_total .inc(InvalidEndpointsGroup { protocol: self.protocol, - rejected: self.rejected.into(), + rejected: rejected.into(), outcome, }); if let Some(tx) = self.sender.take() { diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs index d183abb53a..3a90d911c2 100644 --- a/proxy/src/redis/connection_with_credentials_provider.rs +++ b/proxy/src/redis/connection_with_credentials_provider.rs @@ -77,10 +77,14 @@ impl ConnectionWithCredentialsProvider { } } + async fn ping(con: &mut MultiplexedConnection) -> RedisResult<()> { + redis::cmd("PING").query_async(con).await + } + pub async fn connect(&mut self) -> anyhow::Result<()> { let _guard = self.mutex.lock().await; if let Some(con) = self.con.as_mut() { - match redis::cmd("PING").query_async(con).await { + match Self::ping(con).await { Ok(()) => { return Ok(()); } @@ -96,7 +100,7 @@ impl ConnectionWithCredentialsProvider { if let Some(f) = self.refresh_token_task.take() { f.abort() } - let con = self + let mut con = self .get_client() .await? .get_multiplexed_tokio_connection() @@ -109,6 +113,14 @@ impl ConnectionWithCredentialsProvider { }); self.refresh_token_task = Some(f); } + match Self::ping(&mut con).await { + Ok(()) => { + info!("Connection succesfully established"); + } + Err(e) => { + error!("Connection is broken. Error during PING: {e:?}"); + } + } self.con = Some(con); Ok(()) } From d5708e74357ca19146098770895356326542306e Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Wed, 17 Apr 2024 14:16:11 +0200 Subject: [PATCH 0577/1571] proxy: Record role to span (#7407) ## Problem ## Summary of changes Add dbrole to span. 
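This follows the usual `tracing` pattern of declaring the field as empty when the span is created and recording it once the value is known. A rough sketch under that assumption (the type and field names below are placeholders, not the proxy's actual structs):

```rust
use tracing::{field::display, info_span, Span};

// Placeholder for the per-request context that owns the span.
struct RequestContext {
    span: Span,
}

impl RequestContext {
    fn new(session_id: &str) -> Self {
        // `tracing::field::Empty` reserves the field so it can be filled in later.
        let span = info_span!("request", %session_id, role = tracing::field::Empty);
        Self { span }
    }

    fn set_user(&mut self, user: &str) {
        // Backfill the reserved field; events emitted in this span now carry it.
        self.span.record("role", display(user));
    }
}
```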
--- proxy/src/context.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/proxy/src/context.rs b/proxy/src/context.rs index 95c74e6cca..8cd3024fcf 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -76,6 +76,7 @@ impl RequestMonitoring { ?session_id, %peer_addr, ep = tracing::field::Empty, + role = tracing::field::Empty, ); Self { @@ -157,6 +158,7 @@ impl RequestMonitoring { } pub fn set_user(&mut self, user: RoleName) { + self.span.record("role", display(&user)); self.user = Some(user); } From a54ea8fb1cd26396a06d2fd715bcf19b8b7a7226 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 18 Apr 2024 06:00:33 +0100 Subject: [PATCH 0578/1571] proxy: move endpoint rate limiter (#7413) ## Problem ## Summary of changes Rate limit for wake_compute calls --- proxy/src/bin/proxy.rs | 12 +++++------- proxy/src/config.rs | 1 - proxy/src/console/provider.rs | 6 ++++++ proxy/src/console/provider/neon.rs | 12 ++++++++++++ proxy/src/proxy.rs | 16 +--------------- proxy/src/proxy/wake_compute.rs | 1 + proxy/src/rate_limiter/limiter.rs | 26 +++++++++++--------------- proxy/src/serverless.rs | 18 +++--------------- proxy/src/serverless/websocket.rs | 3 --- 9 files changed, 39 insertions(+), 56 deletions(-) diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 71283dd606..b54f8c131c 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -331,7 +331,6 @@ async fn main() -> anyhow::Result<()> { let proxy_listener = TcpListener::bind(proxy_address).await?; let cancellation_token = CancellationToken::new(); - let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(&config.endpoint_rps_limit)); let cancel_map = CancelMap::default(); let redis_publisher = match ®ional_redis_client { @@ -357,7 +356,6 @@ async fn main() -> anyhow::Result<()> { config, proxy_listener, cancellation_token.clone(), - endpoint_rate_limiter.clone(), cancellation_handler.clone(), )); @@ -372,7 +370,6 @@ async fn main() -> anyhow::Result<()> { config, serverless_listener, cancellation_token.clone(), - endpoint_rate_limiter.clone(), cancellation_handler.clone(), )); } @@ -533,7 +530,11 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { let url = args.auth_endpoint.parse()?; let endpoint = http::Endpoint::new(url, http::new_client()); - let api = console::provider::neon::Api::new(endpoint, caches, locks); + let mut endpoint_rps_limit = args.endpoint_rps_limit.clone(); + RateBucketInfo::validate(&mut endpoint_rps_limit)?; + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(endpoint_rps_limit)); + let api = + console::provider::neon::Api::new(endpoint, caches, locks, endpoint_rate_limiter); let api = console::provider::ConsoleBackend::Console(api); auth::BackendType::Console(MaybeOwned::Owned(api), ()) } @@ -567,8 +568,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet, }; - let mut endpoint_rps_limit = args.endpoint_rps_limit.clone(); - RateBucketInfo::validate(&mut endpoint_rps_limit)?; let mut redis_rps_limit = args.redis_rps_limit.clone(); RateBucketInfo::validate(&mut redis_rps_limit)?; @@ -581,7 +580,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { authentication_config, require_client_ip: args.require_client_ip, disable_ip_check_for_http: args.disable_ip_check_for_http, - endpoint_rps_limit, redis_rps_limit, handshake_timeout: args.handshake_timeout, region: args.region.clone(), diff --git a/proxy/src/config.rs 
b/proxy/src/config.rs index 7b4c02393b..f9519c7645 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -29,7 +29,6 @@ pub struct ProxyConfig { pub authentication_config: AuthenticationConfig, pub require_client_ip: bool, pub disable_ip_check_for_http: bool, - pub endpoint_rps_limit: Vec, pub redis_rps_limit: Vec, pub region: String, pub handshake_timeout: Duration, diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 3fa7221f98..aa1800a9da 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -208,6 +208,9 @@ pub mod errors { #[error(transparent)] ApiError(ApiError), + #[error("Too many connections attempts")] + TooManyConnections, + #[error("Timeout waiting to acquire wake compute lock")] TimeoutError, } @@ -240,6 +243,8 @@ pub mod errors { // However, API might return a meaningful error. ApiError(e) => e.to_string_client(), + TooManyConnections => self.to_string(), + TimeoutError => "timeout while acquiring the compute resource lock".to_owned(), } } @@ -250,6 +255,7 @@ pub mod errors { match self { WakeComputeError::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane, WakeComputeError::ApiError(e) => e.get_error_kind(), + WakeComputeError::TooManyConnections => crate::error::ErrorKind::RateLimit, WakeComputeError::TimeoutError => crate::error::ErrorKind::ServiceRateLimit, } } diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 138acdf578..58b2a1570c 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -12,6 +12,7 @@ use crate::{ console::messages::ColdStartInfo, http, metrics::{CacheOutcome, Metrics}, + rate_limiter::EndpointRateLimiter, scram, Normalize, }; use crate::{cache::Cached, context::RequestMonitoring}; @@ -25,6 +26,7 @@ pub struct Api { endpoint: http::Endpoint, pub caches: &'static ApiCaches, pub locks: &'static ApiLocks, + pub endpoint_rate_limiter: Arc, jwt: String, } @@ -34,6 +36,7 @@ impl Api { endpoint: http::Endpoint, caches: &'static ApiCaches, locks: &'static ApiLocks, + endpoint_rate_limiter: Arc, ) -> Self { let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") { Ok(v) => v, @@ -43,6 +46,7 @@ impl Api { endpoint, caches, locks, + endpoint_rate_limiter, jwt, } } @@ -277,6 +281,14 @@ impl super::Api for Api { return Ok(cached); } + // check rate limit + if !self + .endpoint_rate_limiter + .check(user_info.endpoint.normalize().into(), 1) + { + return Err(WakeComputeError::TooManyConnections); + } + let permit = self.locks.get_wake_compute_permit(&key).await?; // after getting back a permit - it's possible the cache was filled diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index f80ced91c8..4321bad968 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -19,9 +19,8 @@ use crate::{ metrics::{Metrics, NumClientConnectionsGuard}, protocol2::WithClientIp, proxy::handshake::{handshake, HandshakeData}, - rate_limiter::EndpointRateLimiter, stream::{PqStream, Stream}, - EndpointCacheKey, Normalize, + EndpointCacheKey, }; use futures::TryFutureExt; use itertools::Itertools; @@ -61,7 +60,6 @@ pub async fn task_main( config: &'static ProxyConfig, listener: tokio::net::TcpListener, cancellation_token: CancellationToken, - endpoint_rate_limiter: Arc, cancellation_handler: Arc, ) -> anyhow::Result<()> { scopeguard::defer! 
{ @@ -86,7 +84,6 @@ pub async fn task_main( let session_id = uuid::Uuid::new_v4(); let cancellation_handler = Arc::clone(&cancellation_handler); - let endpoint_rate_limiter = endpoint_rate_limiter.clone(); tracing::info!(protocol = "tcp", %session_id, "accepted new TCP connection"); @@ -128,7 +125,6 @@ pub async fn task_main( cancellation_handler, socket, ClientMode::Tcp, - endpoint_rate_limiter, conn_gauge, ) .instrument(span.clone()) @@ -242,7 +238,6 @@ pub async fn handle_client( cancellation_handler: Arc, stream: S, mode: ClientMode, - endpoint_rate_limiter: Arc, conn_gauge: NumClientConnectionsGuard<'static>, ) -> Result>, ClientRequestError> { info!( @@ -288,15 +283,6 @@ pub async fn handle_client( Err(e) => stream.throw_error(e).await?, }; - // check rate limit - if let Some(ep) = user_info.get_endpoint() { - if !endpoint_rate_limiter.check(ep.normalize(), 1) { - return stream - .throw_error(auth::AuthError::too_many_connections()) - .await?; - } - } - let user = user_info.get_user().to_owned(); let user_info = match user_info .authenticate( diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index f8154b1a94..fe228ab33d 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -90,6 +90,7 @@ fn report_error(e: &WakeComputeError, retry: bool) { WakeComputeError::ApiError(ApiError::Console { .. }) => { WakeupFailureKind::ApiConsoleOtherError } + WakeComputeError::TooManyConnections => WakeupFailureKind::ApiConsoleLocked, WakeComputeError::TimeoutError => WakeupFailureKind::TimeoutError, }; Metrics::get() diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index 3796b22ae9..5ba2c36436 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -15,7 +15,7 @@ use rand::{rngs::StdRng, Rng, SeedableRng}; use tokio::time::{Duration, Instant}; use tracing::info; -use crate::EndpointId; +use crate::intern::EndpointIdInt; pub struct GlobalRateLimiter { data: Vec, @@ -61,12 +61,7 @@ impl GlobalRateLimiter { // Purposefully ignore user name and database name as clients can reconnect // with different names, so we'll end up sending some http requests to // the control plane. -// -// We also may save quite a lot of CPU (I think) by bailing out right after we -// saw SNI, before doing TLS handshake. User-side error messages in that case -// does not look very nice (`SSL SYSCALL error: Undefined error: 0`), so for now -// I went with a more expensive way that yields user-friendlier error messages. 
-pub type EndpointRateLimiter = BucketRateLimiter; +pub type EndpointRateLimiter = BucketRateLimiter; pub struct BucketRateLimiter { map: DashMap, Hasher>, @@ -245,7 +240,7 @@ mod tests { use tokio::time; use super::{BucketRateLimiter, EndpointRateLimiter}; - use crate::{rate_limiter::RateBucketInfo, EndpointId}; + use crate::{intern::EndpointIdInt, rate_limiter::RateBucketInfo, EndpointId}; #[test] fn rate_bucket_rpi() { @@ -295,39 +290,40 @@ mod tests { let limiter = EndpointRateLimiter::new(rates); let endpoint = EndpointId::from("ep-my-endpoint-1234"); + let endpoint = EndpointIdInt::from(endpoint); time::pause(); for _ in 0..100 { - assert!(limiter.check(endpoint.clone(), 1)); + assert!(limiter.check(endpoint, 1)); } // more connections fail - assert!(!limiter.check(endpoint.clone(), 1)); + assert!(!limiter.check(endpoint, 1)); // fail even after 500ms as it's in the same bucket time::advance(time::Duration::from_millis(500)).await; - assert!(!limiter.check(endpoint.clone(), 1)); + assert!(!limiter.check(endpoint, 1)); // after a full 1s, 100 requests are allowed again time::advance(time::Duration::from_millis(500)).await; for _ in 1..6 { for _ in 0..50 { - assert!(limiter.check(endpoint.clone(), 2)); + assert!(limiter.check(endpoint, 2)); } time::advance(time::Duration::from_millis(1000)).await; } // more connections after 600 will exceed the 20rps@30s limit - assert!(!limiter.check(endpoint.clone(), 1)); + assert!(!limiter.check(endpoint, 1)); // will still fail before the 30 second limit time::advance(time::Duration::from_millis(30_000 - 6_000 - 1)).await; - assert!(!limiter.check(endpoint.clone(), 1)); + assert!(!limiter.check(endpoint, 1)); // after the full 30 seconds, 100 requests are allowed again time::advance(time::Duration::from_millis(1)).await; for _ in 0..100 { - assert!(limiter.check(endpoint.clone(), 1)); + assert!(limiter.check(endpoint, 1)); } } diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index f3c42cdb01..b0f4026c76 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -35,7 +35,6 @@ use crate::context::RequestMonitoring; use crate::metrics::Metrics; use crate::protocol2::WithClientIp; use crate::proxy::run_until_cancelled; -use crate::rate_limiter::EndpointRateLimiter; use crate::serverless::backend::PoolingBackend; use crate::serverless::http_util::{api_error_into_response, json_response}; @@ -53,7 +52,6 @@ pub async fn task_main( config: &'static ProxyConfig, ws_listener: TcpListener, cancellation_token: CancellationToken, - endpoint_rate_limiter: Arc, cancellation_handler: Arc, ) -> anyhow::Result<()> { scopeguard::defer! { @@ -117,7 +115,6 @@ pub async fn task_main( backend.clone(), connections.clone(), cancellation_handler.clone(), - endpoint_rate_limiter.clone(), cancellation_token.clone(), server.clone(), tls_acceptor.clone(), @@ -147,7 +144,6 @@ async fn connection_handler( backend: Arc, connections: TaskTracker, cancellation_handler: Arc, - endpoint_rate_limiter: Arc, cancellation_token: CancellationToken, server: Builder, tls_acceptor: TlsAcceptor, @@ -231,7 +227,6 @@ async fn connection_handler( cancellation_handler.clone(), session_id, peer_addr, - endpoint_rate_limiter.clone(), http_request_token, ) .in_current_span() @@ -270,7 +265,6 @@ async fn request_handler( cancellation_handler: Arc, session_id: uuid::Uuid, peer_addr: IpAddr, - endpoint_rate_limiter: Arc, // used to cancel in-flight HTTP requests. 
not used to cancel websockets http_cancellation_token: CancellationToken, ) -> Result>, ApiError> { @@ -298,15 +292,9 @@ async fn request_handler( ws_connections.spawn( async move { - if let Err(e) = websocket::serve_websocket( - config, - ctx, - websocket, - cancellation_handler, - host, - endpoint_rate_limiter, - ) - .await + if let Err(e) = + websocket::serve_websocket(config, ctx, websocket, cancellation_handler, host) + .await { error!("error in websocket connection: {e:#}"); } diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index d054877126..eddd278b7d 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -5,7 +5,6 @@ use crate::{ error::{io_error, ReportableError}, metrics::Metrics, proxy::{handle_client, ClientMode}, - rate_limiter::EndpointRateLimiter, }; use bytes::{Buf, Bytes}; use futures::{Sink, Stream}; @@ -136,7 +135,6 @@ pub async fn serve_websocket( websocket: HyperWebsocket, cancellation_handler: Arc, hostname: Option, - endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { let websocket = websocket.await?; let conn_gauge = Metrics::get() @@ -150,7 +148,6 @@ pub async fn serve_websocket( cancellation_handler, WebSocketRw::new(websocket), ClientMode::Websockets { hostname }, - endpoint_rate_limiter, conn_gauge, ) .await; From 5191f6ef0e381887981d40e4f8001ff63c9abc8e Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Thu, 18 Apr 2024 07:09:12 +0200 Subject: [PATCH 0579/1571] proxy: Record only valid rejected events (#7415) ## Problem Sometimes rejected metric might record invalid events. ## Summary of changes * Only record it `rejected` was explicitly set. * Change order in logs. * Report metrics if not under high-load. --- proxy/src/cache/endpoints.rs | 18 +++++-------- proxy/src/context.rs | 49 ++++++++++++++++++------------------ 2 files changed, 31 insertions(+), 36 deletions(-) diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index 72543c6408..2aa1986d5e 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -70,20 +70,14 @@ impl EndpointsCache { if !self.ready.load(Ordering::Acquire) { return true; } - // If cache is disabled, just collect the metrics and return. - if self.config.disable_cache { - let rejected = self.should_reject(endpoint); - ctx.set_rejected(rejected); - info!(?rejected, "check endpoint is valid, disabled cache"); - return true; - } - // If the limiter allows, we don't need to check the cache. - if self.limiter.lock().await.check() { - return true; - } let rejected = self.should_reject(endpoint); - info!(?rejected, "check endpoint is valid, enabled cache"); ctx.set_rejected(rejected); + info!(?rejected, "check endpoint is valid, disabled cache"); + // If cache is disabled, just collect the metrics and return or + // If the limiter allows, we don't need to check the cache. + if self.config.disable_cache || self.limiter.lock().await.check() { + return true; + } !rejected } fn should_reject(&self, endpoint: &EndpointId) -> bool { diff --git a/proxy/src/context.rs b/proxy/src/context.rs index 8cd3024fcf..17b82c08aa 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -51,7 +51,7 @@ pub struct RequestMonitoring { sender: Option>, pub latency_timer: LatencyTimer, // Whether proxy decided that it's not a valid endpoint end rejected it before going to cplane. 
- rejected: bool, + rejected: Option, } #[derive(Clone, Debug)] @@ -96,7 +96,7 @@ impl RequestMonitoring { error_kind: None, auth_method: None, success: false, - rejected: false, + rejected: None, cold_start_info: ColdStartInfo::Unknown, sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()), @@ -118,7 +118,7 @@ impl RequestMonitoring { } pub fn set_rejected(&mut self, rejected: bool) { - self.rejected = rejected; + self.rejected = Some(rejected); } pub fn set_cold_start_info(&mut self, info: ColdStartInfo) { @@ -200,27 +200,28 @@ impl Drop for RequestMonitoring { } else { ConnectOutcome::Failed }; - let rejected = self.rejected; - let ep = self - .endpoint_id - .as_ref() - .map(|x| x.as_str()) - .unwrap_or_default(); - // This makes sense only if cache is disabled - info!( - ?ep, - ?outcome, - ?rejected, - "check endpoint is valid with outcome" - ); - Metrics::get() - .proxy - .invalid_endpoints_total - .inc(InvalidEndpointsGroup { - protocol: self.protocol, - rejected: rejected.into(), - outcome, - }); + if let Some(rejected) = self.rejected { + let ep = self + .endpoint_id + .as_ref() + .map(|x| x.as_str()) + .unwrap_or_default(); + // This makes sense only if cache is disabled + info!( + ?outcome, + ?rejected, + ?ep, + "check endpoint is valid with outcome" + ); + Metrics::get() + .proxy + .invalid_endpoints_total + .inc(InvalidEndpointsGroup { + protocol: self.protocol, + rejected: rejected.into(), + outcome, + }); + } if let Some(tx) = self.sender.take() { let _: Result<(), _> = tx.send(RequestData::from(&*self)); } From 8d0f7017678b1c54f415da9de212d2749e6af9b2 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 18 Apr 2024 10:43:04 +0300 Subject: [PATCH 0580/1571] feat: copy delta layer prefix or "truncate" (#7228) For "timeline ancestor merge" or "timeline detach," we need to "cut" delta layers at particular LSN. The name "truncate" is not used as it would imply that a layer file changes, instead of what happens: we copy keys with Lsn less than a "cut point". Cc: #6994 Add the "copy delta layer prefix" operation to DeltaLayerInner, re-using some of the vectored read internals. The code is `cfg(test)` until it will be used later with a more complete integration test. --- pageserver/src/repository.rs | 138 ++++++ .../src/tenant/storage_layer/delta_layer.rs | 445 +++++++++++++++++- pageserver/src/tenant/storage_layer/layer.rs | 42 +- pageserver/src/tenant/vectored_blob_io.rs | 25 +- pageserver/src/walrecord.rs | 1 + 5 files changed, 632 insertions(+), 19 deletions(-) diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 9959d105eb..0a9ac50aad 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -33,6 +33,52 @@ impl Value { } } +#[cfg(test)] +#[derive(Debug, PartialEq)] +pub(crate) enum InvalidInput { + TooShortValue, + TooShortPostgresRecord, +} + +/// We could have a ValueRef where everything is `serde(borrow)`. Before implementing that, lets +/// use this type for querying if a slice looks some particular way. +#[cfg(test)] +pub(crate) struct ValueBytes; + +#[cfg(test)] +impl ValueBytes { + pub(crate) fn will_init(raw: &[u8]) -> Result { + if raw.len() < 12 { + return Err(InvalidInput::TooShortValue); + } + + let value_discriminator = &raw[0..4]; + + if value_discriminator == [0, 0, 0, 0] { + // Value::Image always initializes + return Ok(true); + } + + if value_discriminator != [0, 0, 0, 1] { + // not a Value::WalRecord(..) 
+ return Ok(false); + } + + let walrecord_discriminator = &raw[4..8]; + + if walrecord_discriminator != [0, 0, 0, 0] { + // only NeonWalRecord::Postgres can have will_init + return Ok(false); + } + + if raw.len() < 17 { + return Err(InvalidInput::TooShortPostgresRecord); + } + + Ok(raw[8] == 1) + } +} + #[cfg(test)] mod test { use super::*; @@ -70,6 +116,8 @@ mod test { ]; roundtrip!(image, expected); + + assert!(ValueBytes::will_init(&expected).unwrap()); } #[test] @@ -93,6 +141,96 @@ mod test { ]; roundtrip!(rec, expected); + + assert!(ValueBytes::will_init(&expected).unwrap()); + } + + #[test] + fn bytes_inspection_too_short_image() { + let rec = Value::Image(Bytes::from_static(b"")); + + #[rustfmt::skip] + let expected = [ + // top level discriminator of 4 bytes + 0x00, 0x00, 0x00, 0x00, + // 8 byte length + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ]; + + roundtrip!(rec, expected); + + assert!(ValueBytes::will_init(&expected).unwrap()); + assert_eq!(expected.len(), 12); + for len in 0..12 { + assert_eq!( + ValueBytes::will_init(&expected[..len]).unwrap_err(), + InvalidInput::TooShortValue + ); + } + } + + #[test] + fn bytes_inspection_too_short_postgres_record() { + let rec = NeonWalRecord::Postgres { + will_init: false, + rec: Bytes::from_static(b""), + }; + let rec = Value::WalRecord(rec); + + #[rustfmt::skip] + let expected = [ + // flattened discriminator of total 8 bytes + 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x00, + // will_init + 0x00, + // 8 byte length + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ]; + + roundtrip!(rec, expected); + + assert!(!ValueBytes::will_init(&expected).unwrap()); + assert_eq!(expected.len(), 17); + for len in 12..17 { + assert_eq!( + ValueBytes::will_init(&expected[..len]).unwrap_err(), + InvalidInput::TooShortPostgresRecord + ) + } + for len in 0..12 { + assert_eq!( + ValueBytes::will_init(&expected[..len]).unwrap_err(), + InvalidInput::TooShortValue + ) + } + } + + #[test] + fn clear_visibility_map_flags_example() { + let rec = NeonWalRecord::ClearVisibilityMapFlags { + new_heap_blkno: Some(0x11), + old_heap_blkno: None, + flags: 0x03, + }; + let rec = Value::WalRecord(rec); + + #[rustfmt::skip] + let expected = [ + // discriminators + 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x01, + // Some == 1 followed by 4 bytes + 0x01, 0x00, 0x00, 0x00, 0x11, + // None == 0 + 0x00, + // flags + 0x03 + ]; + + roundtrip!(rec, expected); + + assert!(!ValueBytes::will_init(&expected).unwrap()); } } diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 255855a246..c5b5e5c98f 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -20,8 +20,8 @@ //! 000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051 //! ``` //! -//! Every delta file consists of three parts: "summary", "index", and -//! "values". The summary is a fixed size header at the beginning of the file, +//! Every delta file consists of three parts: "summary", "values", and +//! "index". The summary is a fixed size header at the beginning of the file, //! and it contains basic information about the layer, and offsets to the other //! parts. The "index" is a B-tree, mapping from Key and LSN to an offset in the //! "values" part. 
The actual page images and WAL records are stored in the @@ -863,7 +863,7 @@ impl DeltaLayerInner { .into(), ); - let data_end_offset = self.index_start_blk as u64 * PAGE_SZ as u64; + let data_end_offset = self.index_start_offset(); let reads = Self::plan_reads( keyspace, @@ -1103,11 +1103,195 @@ impl DeltaLayerInner { if let Some(last) = all_keys.last_mut() { // Last key occupies all space till end of value storage, // which corresponds to beginning of the index - last.size = self.index_start_blk as u64 * PAGE_SZ as u64 - last.size; + last.size = self.index_start_offset() - last.size; } Ok(all_keys) } + /// Using the given writer, write out a truncated version, where LSNs higher than the + /// truncate_at are missing. + #[cfg(test)] + pub(super) async fn copy_prefix( + &self, + writer: &mut DeltaLayerWriter, + truncate_at: Lsn, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + use crate::tenant::vectored_blob_io::{ + BlobMeta, VectoredReadBuilder, VectoredReadExtended, + }; + use futures::stream::TryStreamExt; + + #[derive(Debug)] + enum Item { + Actual(Key, Lsn, BlobRef), + Sentinel, + } + + impl From for Option<(Key, Lsn, BlobRef)> { + fn from(value: Item) -> Self { + match value { + Item::Actual(key, lsn, blob) => Some((key, lsn, blob)), + Item::Sentinel => None, + } + } + } + + impl Item { + fn offset(&self) -> Option { + match self { + Item::Actual(_, _, blob) => Some(*blob), + Item::Sentinel => None, + } + } + + fn is_last(&self) -> bool { + matches!(self, Item::Sentinel) + } + } + + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + self.index_start_blk, + self.index_root_blk, + block_reader, + ); + + let stream = self.stream_index_forwards(&tree_reader, &[0u8; DELTA_KEY_SIZE], ctx); + let stream = stream.map_ok(|(key, lsn, pos)| Item::Actual(key, lsn, pos)); + // put in a sentinel value for getting the end offset for last item, and not having to + // repeat the whole read part + let stream = stream.chain(futures::stream::once(futures::future::ready(Ok( + Item::Sentinel, + )))); + let mut stream = std::pin::pin!(stream); + + let mut prev: Option<(Key, Lsn, BlobRef)> = None; + + let mut read_builder: Option = None; + + let max_read_size = self + .max_vectored_read_bytes + .map(|x| x.0.get()) + .unwrap_or(8192); + + let mut buffer = Some(BytesMut::with_capacity(max_read_size)); + + // FIXME: buffering of DeltaLayerWriter + let mut per_blob_copy = Vec::new(); + + while let Some(item) = stream.try_next().await? 
{ + tracing::debug!(?item, "popped"); + let offset = item + .offset() + .unwrap_or(BlobRef::new(self.index_start_offset(), false)); + + let actionable = if let Some((key, lsn, start_offset)) = prev.take() { + let end_offset = offset; + + Some((BlobMeta { key, lsn }, start_offset..end_offset)) + } else { + None + }; + + let is_last = item.is_last(); + + prev = Option::from(item); + + let actionable = actionable.filter(|x| x.0.lsn < truncate_at); + + let builder = if let Some((meta, offsets)) = actionable { + // extend or create a new builder + if read_builder + .as_mut() + .map(|x| x.extend(offsets.start.pos(), offsets.end.pos(), meta)) + .unwrap_or(VectoredReadExtended::No) + == VectoredReadExtended::Yes + { + None + } else { + read_builder.replace(VectoredReadBuilder::new( + offsets.start.pos(), + offsets.end.pos(), + meta, + max_read_size, + )) + } + } else { + // nothing to do, except perhaps flush any existing for the last element + None + }; + + // flush the possible older builder and also the new one if the item was the last one + let builders = builder.into_iter(); + let builders = if is_last { + builders.chain(read_builder.take()) + } else { + builders.chain(None) + }; + + for builder in builders { + let read = builder.build(); + + let reader = VectoredBlobReader::new(&self.file); + + let mut buf = buffer.take().unwrap(); + + buf.clear(); + buf.reserve(read.size()); + let res = reader.read_blobs(&read, buf).await?; + + for blob in res.blobs { + let key = blob.meta.key; + let lsn = blob.meta.lsn; + let data = &res.buf[blob.start..blob.end]; + + #[cfg(debug_assertions)] + Value::des(data) + .with_context(|| { + format!( + "blob failed to deserialize for {}@{}, {}..{}: {:?}", + blob.meta.key, + blob.meta.lsn, + blob.start, + blob.end, + utils::Hex(data) + ) + }) + .unwrap(); + + // is it an image or will_init walrecord? + // FIXME: this could be handled by threading the BlobRef to the + // VectoredReadBuilder + let will_init = crate::repository::ValueBytes::will_init(data) + .inspect_err(|_e| { + #[cfg(feature = "testing")] + tracing::error!(data=?utils::Hex(data), err=?_e, "failed to parse will_init out of serialized value"); + }) + .unwrap_or(false); + + per_blob_copy.clear(); + per_blob_copy.extend_from_slice(data); + + let (tmp, res) = writer + .put_value_bytes(key, lsn, std::mem::take(&mut per_blob_copy), will_init) + .await; + per_blob_copy = tmp; + res?; + } + + buffer = Some(res.buf); + } + } + + assert!( + read_builder.is_none(), + "with the sentinel above loop should had handled all" + ); + + Ok(()) + } + pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> { println!( "index_start_blk: {}, root {}", @@ -1177,6 +1361,44 @@ impl DeltaLayerInner { Ok(()) } + + #[cfg(test)] + fn stream_index_forwards<'a, R>( + &'a self, + reader: &'a DiskBtreeReader, + start: &'a [u8; DELTA_KEY_SIZE], + ctx: &'a RequestContext, + ) -> impl futures::stream::Stream< + Item = Result<(Key, Lsn, BlobRef), crate::tenant::disk_btree::DiskBtreeError>, + > + 'a + where + R: BlockReader, + { + use futures::stream::TryStreamExt; + let stream = reader.get_stream_from(start, ctx); + stream.map_ok(|(key, value)| { + let key = DeltaKey::from_slice(&key); + let (key, lsn) = (key.key(), key.lsn()); + let offset = BlobRef(value); + + (key, lsn, offset) + }) + } + + /// The file offset to the first block of index. + /// + /// The file structure is summary, values, and index. We often need this for the size of last blob. 
+ fn index_start_offset(&self) -> u64 { + let offset = self.index_start_blk as u64 * PAGE_SZ as u64; + let bref = BlobRef(offset); + tracing::debug!( + index_start_blk = self.index_start_blk, + offset, + pos = bref.pos(), + "index_start_offset" + ); + offset + } } /// A set of data associated with a delta layer key and its value @@ -1538,7 +1760,7 @@ mod test { let resident = writer.finish(entries_meta.key_range.end, &timeline).await?; - let inner = resident.get_inner_delta(&ctx).await?; + let inner = resident.as_delta(&ctx).await?; let file_size = inner.file.metadata().await?.len(); tracing::info!( @@ -1594,4 +1816,217 @@ mod test { Ok(()) } + + #[tokio::test] + async fn copy_delta_prefix_smoke() { + use crate::walrecord::NeonWalRecord; + use bytes::Bytes; + + let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke").unwrap(); + let (tenant, ctx) = h.load().await; + let ctx = &ctx; + let timeline = tenant + .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, ctx) + .await + .unwrap(); + + let initdb_layer = timeline + .layers + .read() + .await + .likely_resident_layers() + .next() + .unwrap(); + + { + let mut writer = timeline.writer().await; + + let data = [ + (0x20, 12, Value::Image(Bytes::from_static(b"foobar"))), + ( + 0x30, + 12, + Value::WalRecord(NeonWalRecord::Postgres { + will_init: false, + rec: Bytes::from_static(b"1"), + }), + ), + ( + 0x40, + 12, + Value::WalRecord(NeonWalRecord::Postgres { + will_init: true, + rec: Bytes::from_static(b"2"), + }), + ), + // build an oversized value so we cannot extend and existing read over + // this + ( + 0x50, + 12, + Value::WalRecord(NeonWalRecord::Postgres { + will_init: true, + rec: { + let mut buf = + vec![0u8; tenant.conf.max_vectored_read_bytes.0.get() + 1024]; + buf.iter_mut() + .enumerate() + .for_each(|(i, slot)| *slot = (i % 256) as u8); + Bytes::from(buf) + }, + }), + ), + // because the oversized read cannot be extended further, we are sure to exercise the + // builder created on the last round with this: + ( + 0x60, + 12, + Value::WalRecord(NeonWalRecord::Postgres { + will_init: true, + rec: Bytes::from_static(b"3"), + }), + ), + ( + 0x60, + 9, + Value::Image(Bytes::from_static(b"something for a different key")), + ), + ]; + + let mut last_lsn = None; + + for (lsn, key, value) in data { + let key = Key::from_i128(key); + writer.put(key, Lsn(lsn), &value, ctx).await.unwrap(); + last_lsn = Some(lsn); + } + + writer.finish_write(Lsn(last_lsn.unwrap())); + } + timeline.freeze_and_flush().await.unwrap(); + + let new_layer = timeline + .layers + .read() + .await + .likely_resident_layers() + .find(|x| x != &initdb_layer) + .unwrap(); + + // create a copy for the timeline, so we don't overwrite the file + let branch = tenant + .branch_timeline_test(&timeline, TimelineId::generate(), None, ctx) + .await + .unwrap(); + + assert_eq!(branch.get_ancestor_lsn(), Lsn(0x60)); + + // truncating at 0x61 gives us a full copy, otherwise just go backwards until there's just + // a single key + + for truncate_at in [0x61, 0x51, 0x41, 0x31, 0x21] { + let truncate_at = Lsn(truncate_at); + + let mut writer = DeltaLayerWriter::new( + tenant.conf, + branch.timeline_id, + tenant.tenant_shard_id, + Key::MIN, + Lsn(0x11)..truncate_at, + ) + .await + .unwrap(); + + let new_layer = new_layer.download_and_keep_resident().await.unwrap(); + + new_layer + .copy_delta_prefix(&mut writer, truncate_at, ctx) + .await + .unwrap(); + + let copied_layer = writer.finish(Key::MAX, &branch).await.unwrap(); + + 
copied_layer.as_delta(ctx).await.unwrap(); + + assert_keys_and_values_eq( + new_layer.as_delta(ctx).await.unwrap(), + copied_layer.as_delta(ctx).await.unwrap(), + truncate_at, + ctx, + ) + .await; + } + } + + async fn assert_keys_and_values_eq( + source: &DeltaLayerInner, + truncated: &DeltaLayerInner, + truncated_at: Lsn, + ctx: &RequestContext, + ) { + use futures::future::ready; + use futures::stream::TryStreamExt; + + let start_key = [0u8; DELTA_KEY_SIZE]; + + let source_reader = FileBlockReader::new(&source.file, source.file_id); + let source_tree = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + source.index_start_blk, + source.index_root_blk, + &source_reader, + ); + let source_stream = source.stream_index_forwards(&source_tree, &start_key, ctx); + let source_stream = source_stream.filter(|res| match res { + Ok((_, lsn, _)) => ready(lsn < &truncated_at), + _ => ready(true), + }); + let mut source_stream = std::pin::pin!(source_stream); + + let truncated_reader = FileBlockReader::new(&truncated.file, truncated.file_id); + let truncated_tree = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + truncated.index_start_blk, + truncated.index_root_blk, + &truncated_reader, + ); + let truncated_stream = truncated.stream_index_forwards(&truncated_tree, &start_key, ctx); + let mut truncated_stream = std::pin::pin!(truncated_stream); + + let mut scratch_left = Vec::new(); + let mut scratch_right = Vec::new(); + + loop { + let (src, truncated) = (source_stream.try_next(), truncated_stream.try_next()); + let (src, truncated) = tokio::try_join!(src, truncated).unwrap(); + + if src.is_none() { + assert!(truncated.is_none()); + break; + } + + let (src, truncated) = (src.unwrap(), truncated.unwrap()); + + // because we've filtered the source with Lsn, we should always have the same keys from both. + assert_eq!(src.0, truncated.0); + assert_eq!(src.1, truncated.1); + + // if this is needed for something else, just drop this assert. + assert!( + src.2.pos() >= truncated.2.pos(), + "value position should not go backwards {} vs. {}", + src.2.pos(), + truncated.2.pos() + ); + + scratch_left.clear(); + let src_cursor = source_reader.block_cursor(); + let left = src_cursor.read_blob_into_buf(src.2.pos(), &mut scratch_left, ctx); + scratch_right.clear(); + let trunc_cursor = truncated_reader.block_cursor(); + let right = trunc_cursor.read_blob_into_buf(truncated.2.pos(), &mut scratch_right, ctx); + + tokio::try_join!(left, right).unwrap(); + + assert_eq!(utils::Hex(&scratch_left), utils::Hex(&scratch_right)); + } + } } diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 27e60f783c..291da0f645 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -116,6 +116,12 @@ impl AsLayerDesc for Layer { } } +impl PartialEq for Layer { + fn eq(&self, other: &Self) -> bool { + Arc::as_ptr(&self.0) == Arc::as_ptr(&other.0) + } +} + impl Layer { /// Creates a layer value for a file we know to not be resident. pub(crate) fn for_evicted( @@ -1752,6 +1758,28 @@ impl ResidentLayer { } } + /// FIXME: truncate is bad name because we are not truncating anything, but copying the + /// filtered parts. + #[cfg(test)] + pub(super) async fn copy_delta_prefix( + &self, + writer: &mut super::delta_layer::DeltaLayerWriter, + truncate_at: Lsn, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + use LayerKind::*; + + let owner = &self.owner.0; + + match self.downloaded.get(owner, ctx).await? 
{ + Delta(ref d) => d + .copy_prefix(writer, truncate_at, ctx) + .await + .with_context(|| format!("truncate {self}")), + Image(_) => anyhow::bail!(format!("cannot truncate image layer {self}")), + } + } + pub(crate) fn local_path(&self) -> &Utf8Path { &self.owner.0.path } @@ -1761,14 +1789,14 @@ impl ResidentLayer { } #[cfg(test)] - pub(crate) async fn get_inner_delta<'a>( - &'a self, + pub(crate) async fn as_delta( + &self, ctx: &RequestContext, - ) -> anyhow::Result<&'a delta_layer::DeltaLayerInner> { - let owner = &self.owner.0; - match self.downloaded.get(owner, ctx).await? { - LayerKind::Delta(d) => Ok(d), - LayerKind::Image(_) => Err(anyhow::anyhow!("Expected a delta layer")), + ) -> anyhow::Result<&delta_layer::DeltaLayerInner> { + use LayerKind::*; + match self.downloaded.get(&self.owner.0, ctx).await? { + Delta(ref d) => Ok(d), + Image(_) => Err(anyhow::anyhow!("image layer")), } } } diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 3a6950cf88..91934d5e0e 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -61,18 +61,18 @@ pub struct VectoredRead { } impl VectoredRead { - pub fn size(&self) -> usize { + pub(crate) fn size(&self) -> usize { (self.end - self.start) as usize } } #[derive(Eq, PartialEq)] -enum VectoredReadExtended { +pub(crate) enum VectoredReadExtended { Yes, No, } -struct VectoredReadBuilder { +pub(crate) struct VectoredReadBuilder { start: u64, end: u64, blobs_at: VecMap, @@ -80,7 +80,17 @@ struct VectoredReadBuilder { } impl VectoredReadBuilder { - fn new(start_offset: u64, end_offset: u64, meta: BlobMeta, max_read_size: usize) -> Self { + /// Start building a new vectored read. + /// + /// Note that by design, this does not check against reading more than `max_read_size` to + /// support reading larger blobs than the configuration value. The builder will be single use + /// however after that. + pub(crate) fn new( + start_offset: u64, + end_offset: u64, + meta: BlobMeta, + max_read_size: usize, + ) -> Self { let mut blobs_at = VecMap::default(); blobs_at .append(start_offset, meta) @@ -97,7 +107,8 @@ impl VectoredReadBuilder { /// Attempt to extend the current read with a new blob if the start /// offset matches with the current end of the vectored read /// and the resuting size is below the max read size - fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended { + pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended { + tracing::trace!(start, end, "trying to extend"); let size = (end - start) as usize; if self.end == start && self.size() + size <= self.max_read_size { self.end = end; @@ -111,11 +122,11 @@ impl VectoredReadBuilder { VectoredReadExtended::No } - fn size(&self) -> usize { + pub(crate) fn size(&self) -> usize { (self.end - self.start) as usize } - fn build(self) -> VectoredRead { + pub(crate) fn build(self) -> VectoredRead { VectoredRead { start: self.start, end: self.end, diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index ae2d996879..02f6f49694 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -55,6 +55,7 @@ impl NeonWalRecord { /// Does replaying this WAL record initialize the page from scratch, or does /// it need to be applied over the previous image of the page? 
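/// For example, a record carrying a full page image can be replayed on its own
/// (`will_init == true`), while an incremental record must be applied on top of an
/// earlier page image (`will_init == false`).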
pub fn will_init(&self) -> bool { + // If you change this function, you'll also need to change ValueBytes::will_init match self { NeonWalRecord::Postgres { will_init, rec: _ } => *will_init, From 637ad4a6380000ad5af17726deccea6bc963efab Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 18 Apr 2024 13:16:03 +0100 Subject: [PATCH 0581/1571] pageserver: fix secondary download scheduling (#7396) ## Problem Some tenants were observed to stop doing downloads after some time ## Summary of changes - Fix a rogue `<` that was incorrectly scheduling work when `now` was _before_ the scheduling target, rather than after. This usually resulted in too-frequent execution, but could also result in never executing, if the current time has advanced ahead of `next_download` at the time we call `schedule()`. - Fix in-memory list of timelines not being amended after timeline deletion: the resulted in repeated harmless logs about the timeline being removed, and redundant calls to remove_dir_all for the timeline path. - Add a log at startup to make it easier to see a particular tenant starting in secondary mode (this is for parity with the logging that exists when spawning an attached tenant). Previously searching on tenant ID didn't provide a clear signal as to how the tenant was started during pageserver start. - Add a test that exercises secondary downloads using the background scheduling, whereas existing tests were using the API hook to invoke download directly. --- pageserver/src/metrics.rs | 6 +- pageserver/src/tenant/mgr.rs | 19 ++-- pageserver/src/tenant/secondary/downloader.rs | 11 ++- .../regress/test_pageserver_secondary.py | 86 +++++++++++++++++++ 4 files changed, 112 insertions(+), 10 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index be61a755ff..e6db95082b 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1518,7 +1518,8 @@ pub(crate) struct SecondaryModeMetrics { pub(crate) download_heatmap: IntCounter, pub(crate) download_layer: IntCounter, } -pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| SecondaryModeMetrics { +pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| { + SecondaryModeMetrics { upload_heatmap: register_int_counter!( "pageserver_secondary_upload_heatmap", "Number of heatmaps written to remote storage by attached tenants" @@ -1536,7 +1537,7 @@ pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| Seco .expect("failed to define a metric"), download_heatmap: register_int_counter!( "pageserver_secondary_download_heatmap", - "Number of downloads of heatmaps by secondary mode locations" + "Number of downloads of heatmaps by secondary mode locations, including when it hasn't changed" ) .expect("failed to define a metric"), download_layer: register_int_counter!( @@ -1544,6 +1545,7 @@ pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| Seco "Number of downloads of layers by secondary mode locations" ) .expect("failed to define a metric"), +} }); #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 73967f2949..2c9476ba0a 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -678,12 +678,19 @@ pub async fn init_tenant_mgr( } } } - LocationMode::Secondary(secondary_conf) => TenantSlot::Secondary(SecondaryTenant::new( - tenant_shard_id, - shard_identity, - location_conf.tenant_conf, - &secondary_conf, - )), + LocationMode::Secondary(secondary_conf) => { + info!( + tenant_id = %tenant_shard_id.tenant_id, + shard_id = 
%tenant_shard_id.shard_slug(), + "Starting secondary tenant" + ); + TenantSlot::Secondary(SecondaryTenant::new( + tenant_shard_id, + shard_identity, + location_conf.tenant_conf, + &secondary_conf, + )) + } }; tenants.insert(tenant_shard_id, slot); diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 5b29c126d1..67f866cb7b 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -312,7 +312,7 @@ impl JobGenerator next_download { Some(PendingDownload { secondary_state: secondary_tenant, last_download, @@ -647,6 +647,12 @@ impl<'a> TenantDownloader<'a> { progress.bytes_downloaded += layer_byte_count; progress.layers_downloaded += layer_count; } + + for delete_timeline in &delete_timelines { + // We haven't removed from disk yet, but optimistically remove from in-memory state: if removal + // from disk fails that will be a fatal error. + detail.timelines.remove(delete_timeline); + } } // Execute accumulated deletions @@ -710,13 +716,14 @@ impl<'a> TenantDownloader<'a> { .await .map_err(UpdateError::from)?; + SECONDARY_MODE.download_heatmap.inc(); + if Some(&download.etag) == prev_etag { Ok(HeatMapDownload::Unmodified) } else { let mut heatmap_bytes = Vec::new(); let mut body = tokio_util::io::StreamReader::new(download.download_stream); let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?; - SECONDARY_MODE.download_heatmap.inc(); Ok(HeatMapDownload::Modified(HeatMapModified { etag: download.etag, last_modified: download.last_modified, diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 345abdc072..8f194e5dda 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -1,6 +1,7 @@ import json import os import random +import time from pathlib import Path from typing import Any, Dict, Optional @@ -582,6 +583,91 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ) +def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): + """ + Slow test that runs in realtime, checks that the background scheduling of secondary + downloads happens as expected. + """ + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + env.start() + + # Create this many tenants, each with two timelines + tenant_count = 4 + tenant_timelines = {} + + # This mirrors a constant in `downloader.rs` + freshen_interval_secs = 60 + + for _i in range(0, tenant_count): + tenant_id = TenantId.generate() + timeline_a = TimelineId.generate() + timeline_b = TimelineId.generate() + env.neon_cli.create_tenant( + tenant_id, + timeline_a, + placement_policy='{"Attached":1}', + # Run with a low heatmap period so that we can avoid having to do synthetic API calls + # to trigger the upload promptly. + conf={"heatmap_period": "1s"}, + ) + env.neon_cli.create_timeline("main2", tenant_id, timeline_b) + + tenant_timelines[tenant_id] = [timeline_a, timeline_b] + + t_start = time.time() + + # Wait long enough that the background downloads should happen; we expect all the inital layers + # of all the initial timelines to show up on the secondary location of each tenant. 
+ time.sleep(freshen_interval_secs * 1.5) + + for tenant_id, timelines in tenant_timelines.items(): + attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"] + ps_attached = env.get_pageserver(attached_to_id) + # We only have two: the other one must be secondary + ps_secondary = next(p for p in env.pageservers if p != ps_attached) + + for timeline_id in timelines: + log.info(f"Checking for secondary timeline {timeline_id} on node {ps_secondary.id}") + # One or more layers should be present for all timelines + assert list_layers(ps_secondary, tenant_id, timeline_id) + + # Delete the second timeline: this should be reflected later on the secondary + env.storage_controller.pageserver_api().timeline_delete(tenant_id, timelines[1]) + + # Wait long enough for the secondary locations to see the deletion + time.sleep(freshen_interval_secs * 1.5) + + for tenant_id, timelines in tenant_timelines.items(): + attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"] + ps_attached = env.get_pageserver(attached_to_id) + # We only have two: the other one must be secondary + ps_secondary = next(p for p in env.pageservers if p != ps_attached) + + # This one was not deleted + assert list_layers(ps_secondary, tenant_id, timelines[0]) + + # This one was deleted + assert not list_layers(ps_secondary, tenant_id, timelines[1]) + + t_end = time.time() + + # Measure how many heatmap downloads we did in total: this checks that we succeeded with + # proper scheduling, and not some bug that just runs downloads in a loop. + total_heatmap_downloads = 0 + for ps in env.pageservers: + v = ps.http_client().get_metric_value("pageserver_secondary_download_heatmap_total") + assert v is not None + total_heatmap_downloads += int(v) + + download_rate = (total_heatmap_downloads / tenant_count) / (t_end - t_start) + + expect_download_rate = 1.0 / freshen_interval_secs + log.info(f"Download rate: {download_rate * 60}/min vs expected {expect_download_rate * 60}/min") + + assert download_rate < expect_download_rate * 2 + + @pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build") @pytest.mark.parametrize("via_controller", [True, False]) def test_slow_secondary_downloads(neon_env_builder: NeonEnvBuilder, via_controller: bool): From 0d8e68003a9ef5bb628a245a66b915322824dd44 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 18 Apr 2024 14:45:25 +0100 Subject: [PATCH 0582/1571] Add a docs page for storage controller (#7392) ## Problem External contributors need information on how to use the storage controller. ## Summary of changes - Background content on what the storage controller is. - Deployment information on how to use it. This is not super-detailed, but should be enough for a well motivated third party to get started, with an occasional peek at the code. --- docs/storage_controller.md | 150 +++++++++++++++++++++++++++++++++++++ 1 file changed, 150 insertions(+) create mode 100644 docs/storage_controller.md diff --git a/docs/storage_controller.md b/docs/storage_controller.md new file mode 100644 index 0000000000..4cb796edaa --- /dev/null +++ b/docs/storage_controller.md @@ -0,0 +1,150 @@ +# Storage Controller + +## Concepts + +The storage controller sits between administrative API clients and pageservers, and handles the details of mapping tenants to pageserver tenant shards. For example, creating a tenant is one API call to the storage controller, +which is mapped into many API calls to many pageservers (for multiple shards, and for secondary locations). 
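For illustration, that fan-out can be pictured as a loop over the tenant's shards, issuing one pageserver API call per shard. The sketch below is not the controller's actual code: the shard-id format, the endpoint path, and the use of the `reqwest` and `anyhow` crates are assumptions made for the example.

```
// Sketch only: a tenant-level operation repeated once per shard, against whichever
// pageserver currently hosts that shard. `node_for_shard` is a hypothetical lookup.
async fn fan_out_per_shard(
    client: &reqwest::Client,
    tenant_id: &str,
    shard_count: u8,
    node_for_shard: impl Fn(u8) -> String,
) -> anyhow::Result<()> {
    for shard_number in 0..shard_count {
        // Shard addressing assumed here: "<tenant_id>-<shard_number><shard_count>" in hex.
        let tenant_shard_id = format!("{tenant_id}-{shard_number:02x}{shard_count:02x}");
        // Stand-in for whatever per-shard pageserver call the operation needs.
        let url = format!(
            "http://{}/v1/tenant/{}/timeline",
            node_for_shard(shard_number),
            tenant_shard_id
        );
        // A failure on any shard fails the tenant-level operation as a whole.
        client.get(&url).send().await?.error_for_status()?;
    }
    Ok(())
}
```

The important point is only the shape: one logical request in, N per-shard requests out, with the shard-to-pageserver mapping coming from the controller's in-memory state.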
+ +It implements a pageserver-compatible API that may be used for CRUD operations on tenants and timelines, translating these requests into appropriate operations on the shards within a tenant, which may be on many different pageservers. Using this API, the storage controller may be used in the same way as the pageserver's administrative HTTP API, hiding +the underlying details of how data is spread across multiple nodes. + +The storage controller also manages generations, high availability (via secondary locations) and live migrations for tenants under its management. This is done with a reconciliation loop pattern, where tenants have an “intent” state and a “reconcile” task that tries to make the outside world match the intent. + +## APIs + +The storage controller’s HTTP server implements four logically separate APIs: + +- `/v1/...` path is the pageserver-compatible API. This has to be at the path root because that’s where clients expect to find it on a pageserver. +- `/control/v1/...` path is the storage controller’s API, which enables operations such as registering and management pageservers, or executing shard splits. +- `/debug/v1/...` path contains endpoints which are either exclusively used in tests, or are for use by engineers when supporting a deployed system. +- `/upcall/v1/...` path contains endpoints that are called by pageservers. This includes the `/re-attach` and `/validate` APIs used by pageservers + to ensure data safety with generation numbers. + +The API is authenticated with a JWT token, and tokens must have scope `pageserverapi` (i.e. the same scope as pageservers’ APIs). + +See the `http.rs` file in the source for where the HTTP APIs are implemented. + +## Database + +The storage controller uses a postgres database to persist a subset of its state. Note that the storage controller does _not_ keep all its state in the database: this is a design choice to enable most operations to be done efficiently in memory, rather than having to read from the database. See `persistence.rs` for a more comprehensive comment explaining what we do and do not persist: a useful metaphor is that we persist objects like tenants and nodes, but we do not +persist the _relationships_ between them: the attachment state of a tenant's shards to nodes is kept in memory and +rebuilt on startup. + +The file `[persistence.rs](http://persistence.rs)` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why. + +The `diesel` crate is used for defining models & migrations. + +Running a local cluster with `cargo neon` automatically starts a vanilla postgress process to host the storage controller’s database. + +### Diesel tip: migrations + +If you need to modify the database schema, here’s how to create a migration: + +- Install the diesel CLI with `cargo install diesel_cli` +- Use `diesel migration generate ` to create a new migration +- Populate the SQL files in the `migrations/` subdirectory +- Use `DATABASE_URL=... diesel migration run` to apply the migration you just wrote: this will update the `[schema.rs](http://schema.rs)` file automatically. 
+ - This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/attachment_service` +- Commit the migration files and the changes to schema.rs +- If you need to iterate, you can rewind migrations with `diesel migration revert -a` and then `diesel migration run` again. +- The migrations are build into the storage controller binary, and automatically run at startup after it is deployed, so once you’ve committed a migration no further steps are needed. + +## storcon_cli + +The `storcon_cli` tool enables interactive management of the storage controller. This is usually +only necessary for debug, but may also be used to manage nodes (e.g. marking a node as offline). + +`storcon_cli --help` includes details on commands. + +# Deploying + +This section is aimed at engineers deploying the storage controller outside of Neon's cloud platform, as +part of a self-hosted system. + +_General note: since the default `neon_local` environment includes a storage controller, this is a useful +reference when figuring out deployment._ + +## Database + +It is **essential** that the database used by the storage controller is durable (**do not store it on ephemeral +local disk**). This database contains pageserver generation numbers, which are essential to data safety on the pageserver. + +The resource requirements for the database are very low: a single CPU core and 1GiB of memory should work well for most deployments. The physical size of the database is typically under a gigabyte. + +Set the URL to the database using the `--database-url` CLI option. + +There is no need to run migrations manually: the storage controller automatically applies migrations +when it starts up. + +## Configure pageservers to use the storage controller + +1. The pageserver `control_plane_api` and `control_plane_api_token` should be set in the `pageserver.toml` file. The API setting should + point to the "upcall" prefix, for example `http://127.0.0.1:1234/upcall/v1/` is used in neon_local clusters. +2. Create a `metadata.json` file in the same directory as `pageserver.toml`: this enables the pageserver to automatically register itself + with the storage controller when it starts up. See the example below for the format of this file. + +### Example `metadata.json` + +``` +{"host":"acmehost.localdomain","http_host":"acmehost.localdomain","http_port":9898,"port":64000} +``` + +- `port` and `host` refer to the _postgres_ port and host, and these must be accessible from wherever + postgres runs. +- `http_port` and `http_host` refer to the pageserver's HTTP api, this must be accessible from where + the storage controller runs. + +## Handle compute notifications. + +The storage controller independently moves tenant attachments between pageservers in response to +changes such as a pageserver node becoming unavailable, or the tenant's shard count changing. To enable +postgres clients to handle such changes, the storage controller calls an API hook when a tenant's pageserver +location changes. + +The hook is configured using the storage controller's `--compute-hook-url` CLI option. If the hook requires +JWT auth, the token may be provided with `--control-plane-jwt-token`. The hook will be invoked with a `PUT` request. + +In the Neon cloud service, this hook is implemented by Neon's internal cloud control plane. 
In `neon_local` systems +the storage controller integrates directly with neon_local to reconfigure local postgres processes instead of calling +the compute hook. + +When implementing an on-premise Neon deployment, you must implement a service that handles the compute hook. This is not complicated: +the request body has format of the `ComputeHookNotifyRequest` structure, provided below for convenience. + +``` +struct ComputeHookNotifyRequestShard { + node_id: NodeId, + shard_number: ShardNumber, +} + +struct ComputeHookNotifyRequest { + tenant_id: TenantId, + stripe_size: Option, + shards: Vec, +} +``` + +When a notification is received: + +1. Modify postgres configuration for this tenant: + + - set `neon.pageserver_connstr` to a comma-separated list of postgres connection strings to pageservers according to the `shards` list. The + shards identified by `NodeId` must be converted to the address+port of the node. + - if stripe_size is not None, set `neon.stripe_size` to this value + +2. Send SIGHUP to postgres to reload configuration +3. Respond with 200 to the notification request. Do not return success if postgres was not updated: if an error is returned, the controller + will retry the notification until it succeeds.. + +### Example notification body + +``` +{ + "tenant_id": "1f359dd625e519a1a4e8d7509690f6fc", + "stripe_size": 32768, + "shards": [ + {"node_id": 344, "shard_number": 0}, + {"node_id": 722, "shard_number": 1}, + ], +} +``` From 3df67bf4d7d23a074cd0e45104e86ebc36315242 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 18 Apr 2024 18:27:58 +0300 Subject: [PATCH 0583/1571] fix(Layer): metric regression with too many canceled evictions (#7363) #7030 introduced an annoying papercut, deeming a failure to acquire a strong reference to `LayerInner` from `DownloadedLayer::drop` as a canceled eviction. Most of the time, it wasn't that, but just timeline deletion or tenant detach with the layer not wanting to be deleted or evicted. When a Layer is dropped as part of a normal shutdown, the `Layer` is dropped first, and the `DownloadedLayer` the second. Because of this, we cannot detect eviction being canceled from the `DownloadedLayer::drop`. We can detect it from `LayerInner::drop`, which this PR adds. Test case is added which before had 1 started eviction, 2 canceled. Now it accurately finds 1 started, 1 canceled. --- libs/utils/src/sync/heavier_once_cell.rs | 51 +++++++++- pageserver/src/tenant/storage_layer/layer.rs | 16 ++- .../src/tenant/storage_layer/layer/tests.rs | 97 +++++++++++++++++++ 3 files changed, 155 insertions(+), 9 deletions(-) diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs index 8eee1f72a6..1abd3d9861 100644 --- a/libs/utils/src/sync/heavier_once_cell.rs +++ b/libs/utils/src/sync/heavier_once_cell.rs @@ -192,6 +192,14 @@ impl OnceCell { } } + /// Like [`Guard::take_and_deinit`], but will return `None` if this OnceCell was never + /// initialized. + pub fn take_and_deinit(&mut self) -> Option<(T, InitPermit)> { + let inner = self.inner.get_mut().unwrap(); + + inner.take_and_deinit() + } + /// Return the number of [`Self::get_or_init`] calls waiting for initialization to complete. pub fn initializer_count(&self) -> usize { self.initializers.load(Ordering::Relaxed) @@ -246,15 +254,23 @@ impl<'a, T> Guard<'a, T> { /// The permit will be on a semaphore part of the new internal value, and any following /// [`OnceCell::get_or_init`] will wait on it to complete. 
pub fn take_and_deinit(mut self) -> (T, InitPermit) { + self.0 + .take_and_deinit() + .expect("guard is not created unless value has been initialized") + } +} + +impl Inner { + pub fn take_and_deinit(&mut self) -> Option<(T, InitPermit)> { + let value = self.value.take()?; + let mut swapped = Inner::default(); let sem = swapped.init_semaphore.clone(); // acquire and forget right away, moving the control over to InitPermit sem.try_acquire().expect("we just created this").forget(); - std::mem::swap(&mut *self.0, &mut swapped); - swapped - .value - .map(|v| (v, InitPermit(sem))) - .expect("guard is not created unless value has been initialized") + let permit = InitPermit(sem); + std::mem::swap(self, &mut swapped); + Some((value, permit)) } } @@ -263,6 +279,13 @@ impl<'a, T> Guard<'a, T> { /// On drop, this type will return the permit. pub struct InitPermit(Arc); +impl std::fmt::Debug for InitPermit { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let ptr = Arc::as_ptr(&self.0) as *const (); + f.debug_tuple("InitPermit").field(&ptr).finish() + } +} + impl Drop for InitPermit { fn drop(&mut self) { assert_eq!( @@ -559,4 +582,22 @@ mod tests { assert_eq!(*target.get().unwrap(), 11); } + + #[tokio::test] + async fn take_and_deinit_on_mut() { + use std::convert::Infallible; + + let mut target = OnceCell::::default(); + assert!(target.take_and_deinit().is_none()); + + target + .get_or_init(|permit| async move { Ok::<_, Infallible>((42, permit)) }) + .await + .unwrap(); + + let again = target.take_and_deinit(); + assert!(matches!(again, Some((42, _))), "{again:?}"); + + assert!(target.take_and_deinit().is_none()); + } } diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 291da0f645..e55299f0fa 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -610,9 +610,17 @@ enum Status { impl Drop for LayerInner { fn drop(&mut self) { + // if there was a pending eviction, mark it cancelled here to balance metrics + if let Some((ResidentOrWantedEvicted::WantedEvicted(..), _)) = self.inner.take_and_deinit() + { + // eviction has already been started + LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone); + + // eviction request is intentionally not honored as no one is present to wait for it + // and we could be delaying shutdown for nothing. + } + if !*self.wanted_deleted.get_mut() { - // should we try to evict if the last wish was for eviction? seems more like a hazard - // than a clear win. return; } @@ -1558,8 +1566,8 @@ impl Drop for DownloadedLayer { if let Some(owner) = self.owner.upgrade() { owner.on_downloaded_layer_drop(self.version); } else { - // no need to do anything, we are shutting down - LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone); + // Layer::drop will handle cancelling the eviction; because of drop order and + // `DownloadedLayer` never leaking, we cannot know here if eviction was requested. 
} } } diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index 247ff123b5..f0697fdf28 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -721,6 +721,103 @@ async fn evict_and_wait_does_not_wait_for_download() { layer.evict_and_wait(FOREVER).await.unwrap(); } +/// Asserts that there is no miscalculation when Layer is dropped while it is being kept resident, +/// which is the last value. +/// +/// Also checks that the same does not happen on a non-evicted layer (regression test). +#[tokio::test(start_paused = true)] +async fn eviction_cancellation_on_drop() { + use crate::repository::Value; + use bytes::Bytes; + + // this is the runtime on which Layer spawns the blocking tasks on + let handle = tokio::runtime::Handle::current(); + + let h = TenantHarness::create("eviction_cancellation_on_drop").unwrap(); + utils::logging::replace_panic_hook_with_tracing_panic_hook().forget(); + let (tenant, ctx) = h.load().await; + + let timeline = tenant + .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .await + .unwrap(); + + { + // create_test_timeline wrote us one layer, write another + let mut writer = timeline.writer().await; + writer + .put( + Key::from_i128(5), + Lsn(0x20), + &Value::Image(Bytes::from_static(b"this does not matter either")), + &ctx, + ) + .await + .unwrap(); + + writer.finish_write(Lsn(0x20)); + } + + timeline.freeze_and_flush().await.unwrap(); + + // wait for the upload to complete so our Arc::strong_count assertion holds + timeline + .remote_client + .as_ref() + .unwrap() + .wait_completion() + .await + .unwrap(); + + let (evicted_layer, not_evicted) = { + let mut layers = { + let mut guard = timeline.layers.write().await; + let layers = guard.likely_resident_layers().collect::>(); + // remove the layers from layermap + guard.finish_gc_timeline(&layers); + + layers + }; + + assert_eq!(layers.len(), 2); + + (layers.pop().unwrap(), layers.pop().unwrap()) + }; + + let victims = [(evicted_layer, true), (not_evicted, false)]; + + for (victim, evict) in victims { + let resident = victim.keep_resident().await.unwrap(); + drop(victim); + + assert_eq!(Arc::strong_count(&resident.owner.0), 1); + + if evict { + let evict_and_wait = resident.owner.evict_and_wait(FOREVER); + + // drive the future to await on the status channel, and then drop it + tokio::time::timeout(ADVANCE, evict_and_wait) + .await + .expect_err("should had been a timeout since we are holding the layer resident"); + } + + // 1 == we only evict one of the layers + assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get()); + + drop(resident); + + // run any spawned + tokio::time::sleep(ADVANCE).await; + + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await; + + assert_eq!( + 1, + LAYER_IMPL_METRICS.cancelled_evictions[EvictionCancelled::LayerGone].get() + ); + } +} + #[test] fn layer_size() { assert_eq!(std::mem::size_of::(), 2040); From 681a04d2874514a2fae4fd0a11114ecb48c42280 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 18 Apr 2024 16:47:34 +0000 Subject: [PATCH 0584/1571] build(deps): bump aiohttp from 3.9.2 to 3.9.4 (#7429) --- poetry.lock | 156 ++++++++++++++++++++++++------------------------- pyproject.toml | 2 +- 2 files changed, 79 insertions(+), 79 deletions(-) diff --git a/poetry.lock b/poetry.lock index aca88073a8..6ed64d28fc 100644 --- a/poetry.lock +++ 
b/poetry.lock @@ -2,87 +2,87 @@ [[package]] name = "aiohttp" -version = "3.9.2" +version = "3.9.4" description = "Async http client/server framework (asyncio)" optional = false python-versions = ">=3.8" files = [ - {file = "aiohttp-3.9.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:772fbe371788e61c58d6d3d904268e48a594ba866804d08c995ad71b144f94cb"}, - {file = "aiohttp-3.9.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:edd4f1af2253f227ae311ab3d403d0c506c9b4410c7fc8d9573dec6d9740369f"}, - {file = "aiohttp-3.9.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cfee9287778399fdef6f8a11c9e425e1cb13cc9920fd3a3df8f122500978292b"}, - {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3cc158466f6a980a6095ee55174d1de5730ad7dec251be655d9a6a9dd7ea1ff9"}, - {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:54ec82f45d57c9a65a1ead3953b51c704f9587440e6682f689da97f3e8defa35"}, - {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:abeb813a18eb387f0d835ef51f88568540ad0325807a77a6e501fed4610f864e"}, - {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc91d07280d7d169f3a0f9179d8babd0ee05c79d4d891447629ff0d7d8089ec2"}, - {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b65e861f4bebfb660f7f0f40fa3eb9f2ab9af10647d05dac824390e7af8f75b7"}, - {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:04fd8ffd2be73d42bcf55fd78cde7958eeee6d4d8f73c3846b7cba491ecdb570"}, - {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:3d8d962b439a859b3ded9a1e111a4615357b01620a546bc601f25b0211f2da81"}, - {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:8ceb658afd12b27552597cf9a65d9807d58aef45adbb58616cdd5ad4c258c39e"}, - {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:0e4ee4df741670560b1bc393672035418bf9063718fee05e1796bf867e995fad"}, - {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:2dec87a556f300d3211decf018bfd263424f0690fcca00de94a837949fbcea02"}, - {file = "aiohttp-3.9.2-cp310-cp310-win32.whl", hash = "sha256:3e1a800f988ce7c4917f34096f81585a73dbf65b5c39618b37926b1238cf9bc4"}, - {file = "aiohttp-3.9.2-cp310-cp310-win_amd64.whl", hash = "sha256:ea510718a41b95c236c992b89fdfc3d04cc7ca60281f93aaada497c2b4e05c46"}, - {file = "aiohttp-3.9.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6aaa6f99256dd1b5756a50891a20f0d252bd7bdb0854c5d440edab4495c9f973"}, - {file = "aiohttp-3.9.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a27d8c70ad87bcfce2e97488652075a9bdd5b70093f50b10ae051dfe5e6baf37"}, - {file = "aiohttp-3.9.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:54287bcb74d21715ac8382e9de146d9442b5f133d9babb7e5d9e453faadd005e"}, - {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5bb3d05569aa83011fcb346b5266e00b04180105fcacc63743fc2e4a1862a891"}, - {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c8534e7d69bb8e8d134fe2be9890d1b863518582f30c9874ed7ed12e48abe3c4"}, - {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4bd9d5b989d57b41e4ff56ab250c5ddf259f32db17159cce630fd543376bd96b"}, - {file = 
"aiohttp-3.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa6904088e6642609981f919ba775838ebf7df7fe64998b1a954fb411ffb4663"}, - {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bda42eb410be91b349fb4ee3a23a30ee301c391e503996a638d05659d76ea4c2"}, - {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:193cc1ccd69d819562cc7f345c815a6fc51d223b2ef22f23c1a0f67a88de9a72"}, - {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:b9f1cb839b621f84a5b006848e336cf1496688059d2408e617af33e3470ba204"}, - {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:d22a0931848b8c7a023c695fa2057c6aaac19085f257d48baa24455e67df97ec"}, - {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:4112d8ba61fbd0abd5d43a9cb312214565b446d926e282a6d7da3f5a5aa71d36"}, - {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c4ad4241b52bb2eb7a4d2bde060d31c2b255b8c6597dd8deac2f039168d14fd7"}, - {file = "aiohttp-3.9.2-cp311-cp311-win32.whl", hash = "sha256:ee2661a3f5b529f4fc8a8ffee9f736ae054adfb353a0d2f78218be90617194b3"}, - {file = "aiohttp-3.9.2-cp311-cp311-win_amd64.whl", hash = "sha256:4deae2c165a5db1ed97df2868ef31ca3cc999988812e82386d22937d9d6fed52"}, - {file = "aiohttp-3.9.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:6f4cdba12539215aaecf3c310ce9d067b0081a0795dd8a8805fdb67a65c0572a"}, - {file = "aiohttp-3.9.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:84e843b33d5460a5c501c05539809ff3aee07436296ff9fbc4d327e32aa3a326"}, - {file = "aiohttp-3.9.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8008d0f451d66140a5aa1c17e3eedc9d56e14207568cd42072c9d6b92bf19b52"}, - {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:61c47ab8ef629793c086378b1df93d18438612d3ed60dca76c3422f4fbafa792"}, - {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bc71f748e12284312f140eaa6599a520389273174b42c345d13c7e07792f4f57"}, - {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a1c3a4d0ab2f75f22ec80bca62385db2e8810ee12efa8c9e92efea45c1849133"}, - {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9a87aa0b13bbee025faa59fa58861303c2b064b9855d4c0e45ec70182bbeba1b"}, - {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e2cc0d04688b9f4a7854c56c18aa7af9e5b0a87a28f934e2e596ba7e14783192"}, - {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:1956e3ac376b1711c1533266dec4efd485f821d84c13ce1217d53e42c9e65f08"}, - {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:114da29f39eccd71b93a0fcacff178749a5c3559009b4a4498c2c173a6d74dff"}, - {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:3f17999ae3927d8a9a823a1283b201344a0627272f92d4f3e3a4efe276972fe8"}, - {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:f31df6a32217a34ae2f813b152a6f348154f948c83213b690e59d9e84020925c"}, - {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:7a75307ffe31329928a8d47eae0692192327c599113d41b278d4c12b54e1bd11"}, - {file = "aiohttp-3.9.2-cp312-cp312-win32.whl", hash = "sha256:972b63d589ff8f305463593050a31b5ce91638918da38139b9d8deaba9e0fed7"}, - {file = 
"aiohttp-3.9.2-cp312-cp312-win_amd64.whl", hash = "sha256:200dc0246f0cb5405c80d18ac905c8350179c063ea1587580e3335bfc243ba6a"}, - {file = "aiohttp-3.9.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:158564d0d1020e0d3fe919a81d97aadad35171e13e7b425b244ad4337fc6793a"}, - {file = "aiohttp-3.9.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:da1346cd0ccb395f0ed16b113ebb626fa43b7b07fd7344fce33e7a4f04a8897a"}, - {file = "aiohttp-3.9.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:eaa9256de26ea0334ffa25f1913ae15a51e35c529a1ed9af8e6286dd44312554"}, - {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1543e7fb00214fb4ccead42e6a7d86f3bb7c34751ec7c605cca7388e525fd0b4"}, - {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:186e94570433a004e05f31f632726ae0f2c9dee4762a9ce915769ce9c0a23d89"}, - {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d52d20832ac1560f4510d68e7ba8befbc801a2b77df12bd0cd2bcf3b049e52a4"}, - {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c45e4e815ac6af3b72ca2bde9b608d2571737bb1e2d42299fc1ffdf60f6f9a1"}, - {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aa906b9bdfd4a7972dd0628dbbd6413d2062df5b431194486a78f0d2ae87bd55"}, - {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:68bbee9e17d66f17bb0010aa15a22c6eb28583edcc8b3212e2b8e3f77f3ebe2a"}, - {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:4c189b64bd6d9a403a1a3f86a3ab3acbc3dc41a68f73a268a4f683f89a4dec1f"}, - {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:8a7876f794523123bca6d44bfecd89c9fec9ec897a25f3dd202ee7fc5c6525b7"}, - {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:d23fba734e3dd7b1d679b9473129cd52e4ec0e65a4512b488981a56420e708db"}, - {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b141753be581fab842a25cb319f79536d19c2a51995d7d8b29ee290169868eab"}, - {file = "aiohttp-3.9.2-cp38-cp38-win32.whl", hash = "sha256:103daf41ff3b53ba6fa09ad410793e2e76c9d0269151812e5aba4b9dd674a7e8"}, - {file = "aiohttp-3.9.2-cp38-cp38-win_amd64.whl", hash = "sha256:328918a6c2835861ff7afa8c6d2c70c35fdaf996205d5932351bdd952f33fa2f"}, - {file = "aiohttp-3.9.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5264d7327c9464786f74e4ec9342afbbb6ee70dfbb2ec9e3dfce7a54c8043aa3"}, - {file = "aiohttp-3.9.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:07205ae0015e05c78b3288c1517afa000823a678a41594b3fdc870878d645305"}, - {file = "aiohttp-3.9.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ae0a1e638cffc3ec4d4784b8b4fd1cf28968febc4bd2718ffa25b99b96a741bd"}, - {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d43302a30ba1166325974858e6ef31727a23bdd12db40e725bec0f759abce505"}, - {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:16a967685907003765855999af11a79b24e70b34dc710f77a38d21cd9fc4f5fe"}, - {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6fa3ee92cd441d5c2d07ca88d7a9cef50f7ec975f0117cd0c62018022a184308"}, - {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b500c5ad9c07639d48615a770f49618130e61be36608fc9bc2d9bae31732b8f"}, - {file = 
"aiohttp-3.9.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c07327b368745b1ce2393ae9e1aafed7073d9199e1dcba14e035cc646c7941bf"}, - {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:cc7d6502c23a0ec109687bf31909b3fb7b196faf198f8cff68c81b49eb316ea9"}, - {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:07be2be7071723c3509ab5c08108d3a74f2181d4964e869f2504aaab68f8d3e8"}, - {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:122468f6fee5fcbe67cb07014a08c195b3d4c41ff71e7b5160a7bcc41d585a5f"}, - {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:00a9abcea793c81e7f8778ca195a1714a64f6d7436c4c0bb168ad2a212627000"}, - {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:7a9825fdd64ecac5c670234d80bb52bdcaa4139d1f839165f548208b3779c6c6"}, - {file = "aiohttp-3.9.2-cp39-cp39-win32.whl", hash = "sha256:5422cd9a4a00f24c7244e1b15aa9b87935c85fb6a00c8ac9b2527b38627a9211"}, - {file = "aiohttp-3.9.2-cp39-cp39-win_amd64.whl", hash = "sha256:7d579dcd5d82a86a46f725458418458fa43686f6a7b252f2966d359033ffc8ab"}, - {file = "aiohttp-3.9.2.tar.gz", hash = "sha256:b0ad0a5e86ce73f5368a164c10ada10504bf91869c05ab75d982c6048217fbf7"}, + {file = "aiohttp-3.9.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:76d32588ef7e4a3f3adff1956a0ba96faabbdee58f2407c122dd45aa6e34f372"}, + {file = "aiohttp-3.9.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:56181093c10dbc6ceb8a29dfeea1e815e1dfdc020169203d87fd8d37616f73f9"}, + {file = "aiohttp-3.9.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c7a5b676d3c65e88b3aca41816bf72831898fcd73f0cbb2680e9d88e819d1e4d"}, + {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d1df528a85fb404899d4207a8d9934cfd6be626e30e5d3a5544a83dbae6d8a7e"}, + {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f595db1bceabd71c82e92df212dd9525a8a2c6947d39e3c994c4f27d2fe15b11"}, + {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c0b09d76e5a4caac3d27752027fbd43dc987b95f3748fad2b924a03fe8632ad"}, + {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:689eb4356649ec9535b3686200b231876fb4cab4aca54e3bece71d37f50c1d13"}, + {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a3666cf4182efdb44d73602379a66f5fdfd5da0db5e4520f0ac0dcca644a3497"}, + {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b65b0f8747b013570eea2f75726046fa54fa8e0c5db60f3b98dd5d161052004a"}, + {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:a1885d2470955f70dfdd33a02e1749613c5a9c5ab855f6db38e0b9389453dce7"}, + {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:0593822dcdb9483d41f12041ff7c90d4d1033ec0e880bcfaf102919b715f47f1"}, + {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:47f6eb74e1ecb5e19a78f4a4228aa24df7fbab3b62d4a625d3f41194a08bd54f"}, + {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c8b04a3dbd54de6ccb7604242fe3ad67f2f3ca558f2d33fe19d4b08d90701a89"}, + {file = "aiohttp-3.9.4-cp310-cp310-win32.whl", hash = "sha256:8a78dfb198a328bfb38e4308ca8167028920fb747ddcf086ce706fbdd23b2926"}, + {file = "aiohttp-3.9.4-cp310-cp310-win_amd64.whl", hash = 
"sha256:e78da6b55275987cbc89141a1d8e75f5070e577c482dd48bd9123a76a96f0bbb"}, + {file = "aiohttp-3.9.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c111b3c69060d2bafc446917534150fd049e7aedd6cbf21ba526a5a97b4402a5"}, + {file = "aiohttp-3.9.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:efbdd51872cf170093998c87ccdf3cb5993add3559341a8e5708bcb311934c94"}, + {file = "aiohttp-3.9.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7bfdb41dc6e85d8535b00d73947548a748e9534e8e4fddd2638109ff3fb081df"}, + {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2bd9d334412961125e9f68d5b73c1d0ab9ea3f74a58a475e6b119f5293eee7ba"}, + {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:35d78076736f4a668d57ade00c65d30a8ce28719d8a42471b2a06ccd1a2e3063"}, + {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:824dff4f9f4d0f59d0fa3577932ee9a20e09edec8a2f813e1d6b9f89ced8293f"}, + {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:52b8b4e06fc15519019e128abedaeb56412b106ab88b3c452188ca47a25c4093"}, + {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eae569fb1e7559d4f3919965617bb39f9e753967fae55ce13454bec2d1c54f09"}, + {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:69b97aa5792428f321f72aeb2f118e56893371f27e0b7d05750bcad06fc42ca1"}, + {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4d79aad0ad4b980663316f26d9a492e8fab2af77c69c0f33780a56843ad2f89e"}, + {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:d6577140cd7db19e430661e4b2653680194ea8c22c994bc65b7a19d8ec834403"}, + {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:9860d455847cd98eb67897f5957b7cd69fbcb436dd3f06099230f16a66e66f79"}, + {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:69ff36d3f8f5652994e08bd22f093e11cfd0444cea310f92e01b45a4e46b624e"}, + {file = "aiohttp-3.9.4-cp311-cp311-win32.whl", hash = "sha256:e27d3b5ed2c2013bce66ad67ee57cbf614288bda8cdf426c8d8fe548316f1b5f"}, + {file = "aiohttp-3.9.4-cp311-cp311-win_amd64.whl", hash = "sha256:d6a67e26daa686a6fbdb600a9af8619c80a332556245fa8e86c747d226ab1a1e"}, + {file = "aiohttp-3.9.4-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:c5ff8ff44825736a4065d8544b43b43ee4c6dd1530f3a08e6c0578a813b0aa35"}, + {file = "aiohttp-3.9.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d12a244627eba4e9dc52cbf924edef905ddd6cafc6513849b4876076a6f38b0e"}, + {file = "aiohttp-3.9.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:dcad56c8d8348e7e468899d2fb3b309b9bc59d94e6db08710555f7436156097f"}, + {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f7e69a7fd4b5ce419238388e55abd220336bd32212c673ceabc57ccf3d05b55"}, + {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4870cb049f10d7680c239b55428916d84158798eb8f353e74fa2c98980dcc0b"}, + {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b2feaf1b7031ede1bc0880cec4b0776fd347259a723d625357bb4b82f62687b"}, + {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:939393e8c3f0a5bcd33ef7ace67680c318dc2ae406f15e381c0054dd658397de"}, + {file = 
"aiohttp-3.9.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d2334e387b2adcc944680bebcf412743f2caf4eeebd550f67249c1c3696be04"}, + {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e0198ea897680e480845ec0ffc5a14e8b694e25b3f104f63676d55bf76a82f1a"}, + {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:e40d2cd22914d67c84824045861a5bb0fb46586b15dfe4f046c7495bf08306b2"}, + {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:aba80e77c227f4234aa34a5ff2b6ff30c5d6a827a91d22ff6b999de9175d71bd"}, + {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:fb68dc73bc8ac322d2e392a59a9e396c4f35cb6fdbdd749e139d1d6c985f2527"}, + {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:f3460a92638dce7e47062cf088d6e7663adb135e936cb117be88d5e6c48c9d53"}, + {file = "aiohttp-3.9.4-cp312-cp312-win32.whl", hash = "sha256:32dc814ddbb254f6170bca198fe307920f6c1308a5492f049f7f63554b88ef36"}, + {file = "aiohttp-3.9.4-cp312-cp312-win_amd64.whl", hash = "sha256:63f41a909d182d2b78fe3abef557fcc14da50c7852f70ae3be60e83ff64edba5"}, + {file = "aiohttp-3.9.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:c3770365675f6be220032f6609a8fbad994d6dcf3ef7dbcf295c7ee70884c9af"}, + {file = "aiohttp-3.9.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:305edae1dea368ce09bcb858cf5a63a064f3bff4767dec6fa60a0cc0e805a1d3"}, + {file = "aiohttp-3.9.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6f121900131d116e4a93b55ab0d12ad72573f967b100e49086e496a9b24523ea"}, + {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b71e614c1ae35c3d62a293b19eface83d5e4d194e3eb2fabb10059d33e6e8cbf"}, + {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:419f009fa4cfde4d16a7fc070d64f36d70a8d35a90d71aa27670bba2be4fd039"}, + {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7b39476ee69cfe64061fd77a73bf692c40021f8547cda617a3466530ef63f947"}, + {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b33f34c9c7decdb2ab99c74be6443942b730b56d9c5ee48fb7df2c86492f293c"}, + {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c78700130ce2dcebb1a8103202ae795be2fa8c9351d0dd22338fe3dac74847d9"}, + {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:268ba22d917655d1259af2d5659072b7dc11b4e1dc2cb9662fdd867d75afc6a4"}, + {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:17e7c051f53a0d2ebf33013a9cbf020bb4e098c4bc5bce6f7b0c962108d97eab"}, + {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:7be99f4abb008cb38e144f85f515598f4c2c8932bf11b65add0ff59c9c876d99"}, + {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:d58a54d6ff08d2547656356eea8572b224e6f9bbc0cf55fa9966bcaac4ddfb10"}, + {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:7673a76772bda15d0d10d1aa881b7911d0580c980dbd16e59d7ba1422b2d83cd"}, + {file = "aiohttp-3.9.4-cp38-cp38-win32.whl", hash = "sha256:e4370dda04dc8951012f30e1ce7956a0a226ac0714a7b6c389fb2f43f22a250e"}, + {file = "aiohttp-3.9.4-cp38-cp38-win_amd64.whl", hash = "sha256:eb30c4510a691bb87081192a394fb661860e75ca3896c01c6d186febe7c88530"}, + {file = "aiohttp-3.9.4-cp39-cp39-macosx_10_9_universal2.whl", hash = 
"sha256:84e90494db7df3be5e056f91412f9fa9e611fbe8ce4aaef70647297f5943b276"}, + {file = "aiohttp-3.9.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7d4845f8501ab28ebfdbeab980a50a273b415cf69e96e4e674d43d86a464df9d"}, + {file = "aiohttp-3.9.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:69046cd9a2a17245c4ce3c1f1a4ff8c70c7701ef222fce3d1d8435f09042bba1"}, + {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b73a06bafc8dcc508420db43b4dd5850e41e69de99009d0351c4f3007960019"}, + {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:418bb0038dfafeac923823c2e63226179976c76f981a2aaad0ad5d51f2229bca"}, + {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:71a8f241456b6c2668374d5d28398f8e8cdae4cce568aaea54e0f39359cd928d"}, + {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:935c369bf8acc2dc26f6eeb5222768aa7c62917c3554f7215f2ead7386b33748"}, + {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74e4e48c8752d14ecfb36d2ebb3d76d614320570e14de0a3aa7a726ff150a03c"}, + {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:916b0417aeddf2c8c61291238ce25286f391a6acb6f28005dd9ce282bd6311b6"}, + {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9b6787b6d0b3518b2ee4cbeadd24a507756ee703adbac1ab6dc7c4434b8c572a"}, + {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:221204dbda5ef350e8db6287937621cf75e85778b296c9c52260b522231940ed"}, + {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:10afd99b8251022ddf81eaed1d90f5a988e349ee7d779eb429fb07b670751e8c"}, + {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:2506d9f7a9b91033201be9ffe7d89c6a54150b0578803cce5cb84a943d075bc3"}, + {file = "aiohttp-3.9.4-cp39-cp39-win32.whl", hash = "sha256:e571fdd9efd65e86c6af2f332e0e95dad259bfe6beb5d15b3c3eca3a6eb5d87b"}, + {file = "aiohttp-3.9.4-cp39-cp39-win_amd64.whl", hash = "sha256:7d29dd5319d20aa3b7749719ac9685fbd926f71ac8c77b2477272725f882072d"}, + {file = "aiohttp-3.9.4.tar.gz", hash = "sha256:6ff71ede6d9a5a58cfb7b6fffc83ab5d4a63138276c771ac91ceaaddf5459644"}, ] [package.dependencies] @@ -2900,4 +2900,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "df7161da4fdc3cba0a445176fc9dda2a0e8a53e13a7aa8a864385ca259381b41" +content-hash = "b3452b50901123fd5f2c385ce8a0c1c492296393b8a7926a322b6df0ea3ac572" diff --git a/pyproject.toml b/pyproject.toml index 156f135062..aadcf26818 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,7 @@ psutil = "^5.9.4" types-psutil = "^5.9.5.12" types-toml = "^0.10.8.6" pytest-httpserver = "^1.0.8" -aiohttp = "3.9.2" +aiohttp = "3.9.4" pytest-rerunfailures = "^13.0" types-pytest-lazy-fixture = "^0.6.3.3" pytest-split = "^0.8.1" From 6eb946e2ded051d2d8f6b2c545d67288212e6dab Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 18 Apr 2024 18:40:30 +0100 Subject: [PATCH 0585/1571] pageserver: fix cont lsn jump on vectored read path (#7412) ## Problem Vectored read path may return an image that's newer than the request lsn under certain circumstances. 
``` LSN ^ | | 500 | ------------------------- -> branch point 400 | X 300 | X 200 | ------------------------------------> requested lsn 100 | X |---------------------------------> Key Legend: * X - page images ``` The vectored read path inspects each ancestor timeline one by one starting from the current one. When moving into the ancestor timeline, the current code resets the current search lsn (called `cont_lsn` in code) to the lsn of the ancestor timeline ([here](https://github.com/neondatabase/neon/blob/d5708e74357ca19146098770895356326542306e/pageserver/src/tenant/timeline.rs#L2971)). For instance, if the request lsn was 200, we would: 1. Look into the current timeline and find nothing for the key 2. Descend into the ancestor timeline and set `cont_lsn=500` 3. Return the page image at LSN 400 Myself and Christian find it very unlikely for this to have happened in prod since the vectored read path is always used at the last record lsn. This issue was found by a regress test during the work to migrate get page handling to use the vectored implementation. I've applied my fix to that wip branch and it fixed the issue. ## Summary of changes The fix is to set the current search lsn to the min between the requested LSN and the ancestor lsn. Hence, at step 2 above we would set the current search lsn to 200 and ignore the images above that. A test illustrating the bug is also included. Fails without the patch and passes with it. --- pageserver/src/tenant.rs | 164 +++++++++++++++++++++++++++++- pageserver/src/tenant/timeline.rs | 3 +- 2 files changed, 165 insertions(+), 2 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 35ea037a55..ff17400d45 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3848,6 +3848,8 @@ pub(crate) mod harness { #[cfg(test)] mod tests { + use std::collections::BTreeMap; + use super::*; use crate::keyspace::KeySpaceAccum; use crate::repository::{Key, Value}; @@ -3858,7 +3860,7 @@ mod tests { use hex_literal::hex; use pageserver_api::keyspace::KeySpace; use rand::{thread_rng, Rng}; - use tests::timeline::ShutdownMode; + use tests::timeline::{GetVectoredError, ShutdownMode}; static TEST_KEY: Lazy = Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001"))); @@ -4794,6 +4796,166 @@ mod tests { Ok(()) } + // Test that vectored get descends into ancestor timelines correctly and + // does not return an image that's newer than requested. + // + // The diagram below ilustrates an interesting case. We have a parent timeline + // (top of the Lsn range) and a child timeline. The request key cannot be reconstructed + // from the child timeline, so the parent timeline must be visited. When advacing into + // the child timeline, the read path needs to remember what the requested Lsn was in + // order to avoid returning an image that's too new. The test below constructs such + // a timeline setup and does a few queries around the Lsn of each page image. 
+ // ``` + // LSN + // ^ + // | + // | + // 500 | --------------------------------------> branch point + // 400 | X + // 300 | X + // 200 | --------------------------------------> requested lsn + // 100 | X + // |---------------------------------------> Key + // | + // ------> requested key + // + // Legend: + // * X - page images + // ``` + #[tokio::test] + async fn test_get_vectored_ancestor_descent() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_get_vectored_on_lsn_axis")?; + let (tenant, ctx) = harness.load().await; + + let start_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); + let end_key = start_key.add(1000); + let child_gap_at_key = start_key.add(500); + let mut parent_gap_lsns: BTreeMap = BTreeMap::new(); + + let mut current_lsn = Lsn(0x10); + + let timeline_id = TimelineId::generate(); + let parent_timeline = tenant + .create_test_timeline(timeline_id, current_lsn, DEFAULT_PG_VERSION, &ctx) + .await?; + + current_lsn += 0x100; + + for _ in 0..3 { + let mut key = start_key; + while key < end_key { + current_lsn += 0x10; + + let image_value = format!("{} at {}", child_gap_at_key, current_lsn); + + let mut writer = parent_timeline.writer().await; + writer + .put( + key, + current_lsn, + &Value::Image(test_img(&image_value)), + &ctx, + ) + .await?; + writer.finish_write(current_lsn); + + if key == child_gap_at_key { + parent_gap_lsns.insert(current_lsn, image_value); + } + + key = key.next(); + } + + parent_timeline.freeze_and_flush().await?; + } + + let child_timeline_id = TimelineId::generate(); + + let child_timeline = tenant + .branch_timeline_test(&parent_timeline, child_timeline_id, Some(current_lsn), &ctx) + .await?; + + let mut key = start_key; + while key < end_key { + if key == child_gap_at_key { + key = key.next(); + continue; + } + + current_lsn += 0x10; + + let mut writer = child_timeline.writer().await; + writer + .put( + key, + current_lsn, + &Value::Image(test_img(&format!("{} at {}", key, current_lsn))), + &ctx, + ) + .await?; + writer.finish_write(current_lsn); + + key = key.next(); + } + + child_timeline.freeze_and_flush().await?; + + let lsn_offsets: [i64; 5] = [-10, -1, 0, 1, 10]; + let mut query_lsns = Vec::new(); + for image_lsn in parent_gap_lsns.keys().rev() { + for offset in lsn_offsets { + query_lsns.push(Lsn(image_lsn + .0 + .checked_add_signed(offset) + .expect("Shouldn't overflow"))); + } + } + + for query_lsn in query_lsns { + let results = child_timeline + .get_vectored_impl( + KeySpace { + ranges: vec![child_gap_at_key..child_gap_at_key.next()], + }, + query_lsn, + &ctx, + ) + .await; + + let expected_item = parent_gap_lsns + .iter() + .rev() + .find(|(lsn, _)| **lsn <= query_lsn); + + info!( + "Doing vectored read at LSN {}. Expecting image to be: {:?}", + query_lsn, expected_item + ); + + match expected_item { + Some((_, img_value)) => { + let key_results = results.expect("No vectored get error expected"); + let key_result = &key_results[&child_gap_at_key]; + let returned_img = key_result + .as_ref() + .expect("No page reconstruct error expected"); + + info!( + "Vectored read at LSN {} returned image {}", + query_lsn, + std::str::from_utf8(returned_img)? 
+ ); + assert_eq!(*returned_img, test_img(img_value)); + } + None => { + assert!(matches!(results, Err(GetVectoredError::MissingKey(_)))); + } + } + } + + Ok(()) + } + #[tokio::test] async fn test_random_updates() -> anyhow::Result<()> { let harness = TenantHarness::create("test_random_updates")?; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 46b3d41e2b..3f2d807ce8 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2968,7 +2968,8 @@ impl Timeline { break; } - cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1); + // Take the min to avoid reconstructing a page with data newer than request Lsn. + cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1)); timeline_owned = timeline .get_ready_ancestor_timeline(ctx) .await From 98be8b94308135c19e49696141b41e86d90cb973 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 19 Apr 2024 12:32:58 +0100 Subject: [PATCH 0586/1571] storcon_cli: `tenant-warmup` command (#7432) ## Problem When we migrate a large existing tenant, we would like to be able to ensure it has pre-loaded layers onto a pageserver managed by the storage controller. ## Summary of changes - Add `storcon_cli tenant-warmup`, which configures the tenant into PlacementPolicy::Secondary (unless it's already attached), and then polls the secondary download API reporting progress. - Extend a test case to check that when onboarding with a secondary location pre-created, we properly use that location for our first attachment. --- control_plane/storcon_cli/src/main.rs | 102 +++++++++++++++++- .../regress/test_storage_controller.py | 25 ++++- 2 files changed, 120 insertions(+), 7 deletions(-) diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 2edd09eac1..b3d1f0be05 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -1,15 +1,15 @@ -use std::{collections::HashMap, str::FromStr}; +use std::{collections::HashMap, str::FromStr, time::Duration}; use clap::{Parser, Subcommand}; -use hyper::Method; +use hyper::{Method, StatusCode}; use pageserver_api::{ controller_api::{ NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy, TenantDescribeResponse, TenantPolicyRequest, }, models::{ - ShardParameters, TenantConfig, TenantConfigRequest, TenantCreateRequest, - TenantShardSplitRequest, TenantShardSplitResponse, + LocationConfigSecondary, ShardParameters, TenantConfig, TenantConfigRequest, + TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse, }, shard::{ShardStripeSize, TenantShardId}, }; @@ -120,6 +120,12 @@ enum Command { #[arg(long)] tenant_id: TenantId, }, + /// For a tenant which hasn't been onboarded to the storage controller yet, add it in secondary + /// mode so that it can warm up content on a pageserver. + TenantWarmup { + #[arg(long)] + tenant_id: TenantId, + }, } #[derive(Parser)] @@ -581,6 +587,94 @@ async fn main() -> anyhow::Result<()> { } println!("{table}"); } + Command::TenantWarmup { tenant_id } => { + let describe_response = storcon_client + .dispatch::<(), TenantDescribeResponse>( + Method::GET, + format!("control/v1/tenant/{tenant_id}"), + None, + ) + .await; + match describe_response { + Ok(describe) => { + if matches!(describe.policy, PlacementPolicy::Secondary) { + // Fine: it's already known to controller in secondary mode: calling + // again to put it into secondary mode won't cause problems. 
+ } else { + anyhow::bail!("Tenant already present with policy {:?}", describe.policy); + } + } + Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _)) => { + // Fine: this tenant isn't know to the storage controller yet. + } + Err(e) => { + // Unexpected API error + return Err(e.into()); + } + } + + vps_client + .location_config( + TenantShardId::unsharded(tenant_id), + pageserver_api::models::LocationConfig { + mode: pageserver_api::models::LocationConfigMode::Secondary, + generation: None, + secondary_conf: Some(LocationConfigSecondary { warm: true }), + shard_number: 0, + shard_count: 0, + shard_stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE.0, + tenant_conf: TenantConfig::default(), + }, + None, + true, + ) + .await?; + + let describe_response = storcon_client + .dispatch::<(), TenantDescribeResponse>( + Method::GET, + format!("control/v1/tenant/{tenant_id}"), + None, + ) + .await?; + + let secondary_ps_id = describe_response + .shards + .first() + .unwrap() + .node_secondary + .first() + .unwrap(); + + println!("Tenant {tenant_id} warming up on pageserver {secondary_ps_id}"); + loop { + let (status, progress) = vps_client + .tenant_secondary_download( + TenantShardId::unsharded(tenant_id), + Some(Duration::from_secs(10)), + ) + .await?; + println!( + "Progress: {}/{} layers, {}/{} bytes", + progress.layers_downloaded, + progress.layers_total, + progress.bytes_downloaded, + progress.bytes_total + ); + match status { + StatusCode::OK => { + println!("Download complete"); + break; + } + StatusCode::ACCEPTED => { + // Loop + } + _ => { + anyhow::bail!("Unexpected download status: {status}"); + } + } + } + } } Ok(()) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 840f354142..b4b23745f8 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -273,7 +273,8 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up but imports the generation number. 
""" - neon_env_builder.num_pageservers = 2 + # One pageserver to simulate legacy environment, two to be managed by storage controller + neon_env_builder.num_pageservers = 3 # Start services by hand so that we can skip registration on one of the pageservers env = neon_env_builder.init_configs() @@ -288,10 +289,10 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up ) origin_ps = env.pageservers[0] - # This is the pageserver managed by the sharding service, where the tenant + # These are the pageservers managed by the sharding service, where the tenant # will be attached after onboarding env.pageservers[1].start() - dest_ps = env.pageservers[1] + env.pageservers[2].start() virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) for sk in env.safekeepers: @@ -330,6 +331,9 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up ) virtual_ps_http.tenant_secondary_download(tenant_id) + warm_up_ps = env.storage_controller.tenant_describe(tenant_id)["shards"][0][ + "node_secondary" + ][0] # Call into storage controller to onboard the tenant generation += 1 @@ -344,6 +348,18 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up ) assert len(r["shards"]) == 1 + describe = env.storage_controller.tenant_describe(tenant_id)["shards"][0] + dest_ps_id = describe["node_attached"] + dest_ps = env.get_pageserver(dest_ps_id) + if warm_up: + # The storage controller should have attached the tenant to the same placce + # it had a secondary location, otherwise there was no point warming it up + assert dest_ps_id == warm_up_ps + + # It should have been given a new secondary location as well + assert len(describe["node_secondary"]) == 1 + assert describe["node_secondary"][0] != warm_up_ps + # As if doing a live migration, detach the original pageserver origin_ps.http_client().tenant_location_conf( tenant_id, @@ -415,6 +431,9 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up dest_tenant_after_conf_change["generation"] == dest_tenant_before_conf_change["generation"] ) dest_tenant_conf_after = dest_ps.http_client().tenant_config(tenant_id) + + # Storage controller auto-sets heatmap period, ignore it for the comparison + del dest_tenant_conf_after.tenant_specific_overrides["heatmap_period"] assert dest_tenant_conf_after.tenant_specific_overrides == modified_tenant_conf env.storage_controller.consistency_check() From e8a98adcd0a06a8c50c3483d7109e252f4d4d4e0 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Sun, 21 Apr 2024 11:56:34 +0100 Subject: [PATCH 0587/1571] CI: downgrade docker/setup-buildx-action to v2 - Cleanup part for `docker/setup-buildx-action` started to fail with the following error (for no obvious reason): ``` /nvme/actions-runner/_work/_actions/docker/setup-buildx-action/v3/webpack:/docker-setup-buildx/node_modules/@actions/cache/lib/cache.js:175 throw new Error(`Path Validation Error: Path(s) specified in the action for caching do(es) not exist, hence no cache is being saved.`); ^ Error: Path Validation Error: Path(s) specified in the action for caching do(es) not exist, hence no cache is being saved. 
at Object.rejected (/nvme/actions-runner/_work/_actions/docker/setup-buildx-action/v3/webpack:/docker-setup-buildx/node_modules/@actions/cache/lib/cache.js:175:1) at Generator.next () at fulfilled (/nvme/actions-runner/_work/_actions/docker/setup-buildx-action/v3/webpack:/docker-setup-buildx/node_modules/@actions/cache/lib/cache.js:29:1) ``` - Downgrade `docker/setup-buildx-action` from v3 to v2 --- .github/workflows/build_and_test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 1d35fa9223..c395b36c21 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -735,7 +735,7 @@ jobs: run: | mkdir -p .docker-custom echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV - - uses: docker/setup-buildx-action@v3 + - uses: docker/setup-buildx-action@v2 - uses: docker/login-action@v3 with: @@ -792,7 +792,7 @@ jobs: run: | mkdir -p .docker-custom echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV - - uses: docker/setup-buildx-action@v3 + - uses: docker/setup-buildx-action@v2 with: # Disable parallelism for docker buildkit. # As we already build everything with `make -j$(nproc)`, running it in additional level of parallelisam blows up the Runner. From 0d21187322591412fbf7309d9e8780d660a9bf60 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 20 Apr 2024 12:37:58 +0300 Subject: [PATCH 0588/1571] update rustls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem `cargo deny check` is complaining about our rustls versions, causing CI to fail: ``` error[vulnerability]: `rustls::ConnectionCommon::complete_io` could fall into an infinite loop based on network input ┌─ /__w/neon/neon/Cargo.lock:395:1 │ 395 │ rustls 0.21.9 registry+https://github.com/rust-lang/crates.io-index │ ------------------------------------------------------------------- security vulnerability detected │ = ID: RUSTSEC-2024-0336 = Advisory: https://rustsec.org/advisories/RUSTSEC-2024-0336 = If a `close_notify` alert is received during a handshake, `complete_io` does not terminate. Callers which do not call `complete_io` are not affected. `rustls-tokio` and `rustls-ffi` do not call `complete_io` and are not affected. `rustls::Stream` and `rustls::StreamOwned` types use `complete_io` and are affected. = Announcement: https://github.com/rustls/rustls/security/advisories/GHSA-6g7w-8wpp-frhj = Solution: Upgrade to >=0.23.5 OR >=0.22.4, <0.23.0 OR >=0.21.11, <0.22.0 (try `cargo update -p rustls`) error[vulnerability]: `rustls::ConnectionCommon::complete_io` could fall into an infinite loop based on network input ┌─ /__w/neon/neon/Cargo.lock:396:1 │ 396 │ rustls 0.22.2 registry+https://github.com/rust-lang/crates.io-index │ ------------------------------------------------------------------- security vulnerability detected │ = ID: RUSTSEC-2024-0336 = Advisory: https://rustsec.org/advisories/RUSTSEC-2024-0336 = If a `close_notify` alert is received during a handshake, `complete_io` does not terminate. Callers which do not call `complete_io` are not affected. `rustls-tokio` and `rustls-ffi` do not call `complete_io` and are not affected. `rustls::Stream` and `rustls::StreamOwned` types use `complete_io` and are affected. 
= Announcement: https://github.com/rustls/rustls/security/advisories/GHSA-6g7w-8wpp-frhj = Solution: Upgrade to >=0.23.5 OR >=0.22.4, <0.23.0 OR >=0.21.11, <0.22.0 (try `cargo update -p rustls`) ``` ## Summary of changes `cargo update -p rustls@0.21.9 -p rustls@0.22.2` --- Cargo.lock | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6faf4b72f0..76183bdaab 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -599,7 +599,7 @@ dependencies = [ "once_cell", "pin-project-lite", "pin-utils", - "rustls 0.21.9", + "rustls 0.21.11", "tokio", "tracing", ] @@ -2519,7 +2519,7 @@ dependencies = [ "http 0.2.9", "hyper 0.14.26", "log", - "rustls 0.21.9", + "rustls 0.21.11", "rustls-native-certs 0.6.2", "tokio", "tokio-rustls 0.24.0", @@ -4059,7 +4059,7 @@ dependencies = [ "futures", "once_cell", "pq_proto", - "rustls 0.22.2", + "rustls 0.22.4", "rustls-pemfile 2.1.1", "serde", "thiserror", @@ -4350,7 +4350,7 @@ dependencies = [ "routerify", "rstest", "rustc-hash", - "rustls 0.22.2", + "rustls 0.22.4", "rustls-pemfile 2.1.1", "scopeguard", "serde", @@ -4542,7 +4542,7 @@ dependencies = [ "itoa", "percent-encoding", "pin-project-lite", - "rustls 0.22.2", + "rustls 0.22.4", "rustls-native-certs 0.7.0", "rustls-pemfile 2.1.1", "rustls-pki-types", @@ -4696,7 +4696,7 @@ dependencies = [ "once_cell", "percent-encoding", "pin-project-lite", - "rustls 0.21.9", + "rustls 0.21.11", "rustls-pemfile 1.0.2", "serde", "serde_json", @@ -4956,9 +4956,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.21.9" +version = "0.21.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "629648aced5775d558af50b2b4c7b02983a04b312126d45eeead26e7caa498b9" +checksum = "7fecbfb7b1444f477b345853b1fce097a2c6fb637b2bfb87e6bc5db0f043fae4" dependencies = [ "log", "ring 0.17.6", @@ -4968,9 +4968,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.22.2" +version = "0.22.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e87c9956bd9807afa1f77e0f7594af32566e830e088a5576d27c5b6f30f49d41" +checksum = "bf4ef73721ac7bcd79b2b315da7779d8fc09718c6b3d2d1b2d94850eb8c18432" dependencies = [ "log", "ring 0.17.6", @@ -5282,7 +5282,7 @@ checksum = "2e95efd0cefa32028cdb9766c96de71d96671072f9fb494dc9fb84c0ef93e52b" dependencies = [ "httpdate", "reqwest", - "rustls 0.21.9", + "rustls 0.21.11", "sentry-backtrace", "sentry-contexts", "sentry-core", @@ -6193,7 +6193,7 @@ checksum = "0ea13f22eda7127c827983bdaf0d7fff9df21c8817bab02815ac277a21143677" dependencies = [ "futures", "ring 0.17.6", - "rustls 0.22.2", + "rustls 0.22.4", "tokio", "tokio-postgres", "tokio-rustls 0.25.0", @@ -6206,7 +6206,7 @@ version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0d409377ff5b1e3ca6437aa86c1eb7d40c134bfec254e44c830defa92669db5" dependencies = [ - "rustls 0.21.9", + "rustls 0.21.11", "tokio", ] @@ -6216,7 +6216,7 @@ version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "775e0c0f0adb3a2f22a00c4745d728b479985fc15ee7ca6a2608388c5569860f" dependencies = [ - "rustls 0.22.2", + "rustls 0.22.4", "rustls-pki-types", "tokio", ] @@ -6677,7 +6677,7 @@ dependencies = [ "base64 0.21.1", "log", "once_cell", - "rustls 0.21.9", + "rustls 0.21.11", "rustls-webpki 0.100.2", "url", "webpki-roots 0.23.1", @@ -7354,7 +7354,7 @@ dependencies = [ "regex-automata 0.4.3", "regex-syntax 0.8.2", "reqwest", - "rustls 0.21.9", + "rustls 0.21.11", "scopeguard", "serde", 
"serde_json", From 35e9fb360b4a0c51a88f98ffaf1c252f2f0850a5 Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Sun, 21 Apr 2024 17:35:01 -0700 Subject: [PATCH 0589/1571] Bump vm-builder v0.23.2 -> v0.28.1 (#7433) Only one relevant change, from v0.28.0: - neondatabase/autoscaling#887 Double-checked with `git log neonvm/tools/vm-builder`. --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index c395b36c21..a7e108fac4 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -865,7 +865,7 @@ jobs: run: shell: sh -eu {0} env: - VM_BUILDER_VERSION: v0.23.2 + VM_BUILDER_VERSION: v0.28.1 steps: - name: Checkout From 3a673dce67f0d5d9ab2163e9f4bd818bbc4b5375 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 22 Apr 2024 10:58:10 +0300 Subject: [PATCH 0590/1571] Make test less sensitive to exact WAL positions (#7436) As noted in the comment, the craft_internal() function fails if the inserted WAL happens to land at page boundary. I bumped into that with PR #7377; it changed the arguments of a few SQL functions in neon_test_utils extension, which changed the WAL positions slightly, and caused a test failure. --- libs/postgres_ffi/src/lib.rs | 4 +++- libs/postgres_ffi/wal_craft/src/lib.rs | 22 +++++++++++++++++----- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index aa6845b9b1..0d6986778a 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -118,7 +118,9 @@ pub use v14::bindings::{TimeLineID, TimestampTz, XLogRecPtr, XLogSegNo}; // Likewise for these, although the assumption that these don't change is a little more iffy. pub use v14::bindings::{MultiXactOffset, MultiXactStatus}; pub use v14::bindings::{PageHeaderData, XLogRecord}; -pub use v14::xlog_utils::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD}; +pub use v14::xlog_utils::{ + XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD, +}; pub use v14::bindings::{CheckPoint, ControlFileData}; diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index 23786e3b08..223ff08e8d 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -4,7 +4,9 @@ use log::*; use postgres::types::PgLsn; use postgres::Client; use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ}; -use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD}; +use postgres_ffi::{ + XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD, +}; use std::path::{Path, PathBuf}; use std::process::Command; use std::time::{Duration, Instant}; @@ -262,11 +264,21 @@ fn craft_internal( intermediate_lsns.insert(0, initial_lsn); } - // Some records may be not flushed, e.g. non-transactional logical messages. + // Some records may be not flushed, e.g. non-transactional logical messages. Flush now. // - // Note: this is broken if pg_current_wal_insert_lsn is at page boundary - // because pg_current_wal_insert_lsn skips page headers. - client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?; + // If the previous WAL record ended exactly at page boundary, pg_current_wal_insert_lsn + // returns the position just after the page header on the next page. That's where the next + // record will be inserted. 
But the page header hasn't actually been written to the WAL + // yet, and if you try to flush it, you get a "request to flush past end of generated WAL" + // error. Because of that, if the insert location is just after a page header, back off to + // previous page boundary. + let mut lsn = u64::from(client.pg_current_wal_insert_lsn()?); + if lsn % WAL_SEGMENT_SIZE as u64 == XLOG_SIZE_OF_XLOG_LONG_PHD as u64 { + lsn -= XLOG_SIZE_OF_XLOG_LONG_PHD as u64; + } else if lsn % XLOG_BLCKSZ as u64 == XLOG_SIZE_OF_XLOG_SHORT_PHD as u64 { + lsn -= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64; + } + client.execute("select neon_xlogflush($1)", &[&PgLsn::from(lsn)])?; Ok(intermediate_lsns) } From 00d9c2d9a81491e1d159c85f6cd129b13755f9f8 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 22 Apr 2024 10:58:28 +0300 Subject: [PATCH 0591/1571] Make another walcraft test more robust (#7439) There were two issues with the test at page boundaries: 1. If the first logical message with 10 bytes payload crossed a page boundary, the calculated 'base_size' was too large because it included the page header. 2. If it was inserted near the end of a page so that there was not enough room for another one, we did "remaining_lsn += XLOG_BLCKSZ" but that didn't take into account the page headers either. As a result, the test would fail if the WAL insert position at the beginning of the test was too close to the end of a WAL page. Fix the calculations by repeating the 10-byte logical message if the starting position is not suitable. I bumped into this with PR #7377; it changed the arguments of a few SQL functions in neon_test_utils extension, which changed the WAL positions slightly, and caused a test failure. This is similar to https://github.com/neondatabase/neon/pull/7436, but for different test. --- libs/postgres_ffi/wal_craft/src/lib.rs | 63 +++++++++++++++----------- 1 file changed, 37 insertions(+), 26 deletions(-) diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index 223ff08e8d..262068cbda 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -332,38 +332,49 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary { client.execute("CREATE table t(x int)", &[])?; - // Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary. - // We will use logical message as the padding. We start with detecting how much WAL - // it takes for one logical message, considering all alignments and headers. - let base_wal_advance = { + // Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary. We + // will use carefully-sized logical messages to advance WAL insert location such + // that there is just enough space on the page for the XLOG_SWITCH record. + loop { + // We start with measuring how much WAL it takes for one logical message, + // considering all alignments and headers. let before_lsn = client.pg_current_wal_insert_lsn()?; - // Small non-empty message bigger than few bytes is more likely than an empty - // message to have the same format as the big padding message. client.execute( "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', 10))", &[], )?; - // The XLOG_SWITCH record has no data => its size is exactly XLOG_SIZE_OF_XLOG_RECORD. - (u64::from(client.pg_current_wal_insert_lsn()?) - u64::from(before_lsn)) as usize - + XLOG_SIZE_OF_XLOG_RECORD - }; - let mut remaining_lsn = - XLOG_BLCKSZ - u64::from(client.pg_current_wal_insert_lsn()?) 
as usize % XLOG_BLCKSZ; - if remaining_lsn < base_wal_advance { - remaining_lsn += XLOG_BLCKSZ; + let after_lsn = client.pg_current_wal_insert_lsn()?; + + // Did the record cross a page boundary? If it did, start over. Crossing a + // page boundary adds to the apparent size of the record because of the page + // header, which throws off the calculation. + if u64::from(before_lsn) / XLOG_BLCKSZ as u64 + != u64::from(after_lsn) / XLOG_BLCKSZ as u64 + { + continue; + } + // base_size is the size of a logical message without the payload + let base_size = u64::from(after_lsn) - u64::from(before_lsn) - 10; + + // Is there enough space on the page for another logical message and an + // XLOG_SWITCH? If not, start over. + let page_remain = XLOG_BLCKSZ as u64 - u64::from(after_lsn) % XLOG_BLCKSZ as u64; + if page_remain < base_size - XLOG_SIZE_OF_XLOG_RECORD as u64 { + continue; + } + + // We will write another logical message, such that after the logical message + // record, there will be space for exactly one XLOG_SWITCH. How large should + // the logical message's payload be? An XLOG_SWITCH record has no data => its + // size is exactly XLOG_SIZE_OF_XLOG_RECORD. + let repeats = page_remain - base_size - XLOG_SIZE_OF_XLOG_RECORD as u64; + + client.execute( + "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))", + &[&(repeats as i32)], + )?; + break; } - let repeats = 10 + remaining_lsn - base_wal_advance; - info!( - "current_wal_insert_lsn={}, remaining_lsn={}, base_wal_advance={}, repeats={}", - client.pg_current_wal_insert_lsn()?, - remaining_lsn, - base_wal_advance, - repeats - ); - client.execute( - "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))", - &[&(repeats as i32)], - )?; info!( "current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}", client.pg_current_wal_insert_lsn()?, From b91c58a8bf8b3e11451220fe3bb2a4479023fa45 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 22 Apr 2024 11:57:14 +0300 Subject: [PATCH 0592/1571] refactor(Timeline): simpler metadata updates (#7422) Currently, any `Timeline::schedule_uploads` will generate a fresh `TimelineMetadata` instead of updating the values, which it means to update. This makes it impossible for #6994 to work while `Timeline` receives layer flushes by overwriting any configured new `ancestor_timeline_id` and possible `ancestor_lsn`. The solution is to only make full `TimelineMetadata` "updates" from one place: branching. At runtime, update only the three fields, same as before in `Timeline::schedule_updates`. --- pageserver/src/tenant.rs | 5 +-- pageserver/src/tenant/metadata.rs | 27 ++++++++++++++++ .../src/tenant/remote_timeline_client.rs | 31 ++++++++++++++++--- pageserver/src/tenant/timeline.rs | 17 +++------- 4 files changed, 61 insertions(+), 19 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index ff17400d45..15be6df637 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -559,9 +559,10 @@ impl Tenant { // By doing what we do here, the index part upload is retried. // If control plane retries timeline creation in the meantime, the mgmt API handler // for timeline creation will coalesce on the upload we queue here. + // FIXME: this branch should be dead code as we no longer write local metadata. 
let rtc = timeline.remote_client.as_ref().unwrap(); rtc.init_upload_queue_for_empty_remote(&metadata)?; - rtc.schedule_index_upload_for_metadata_update(&metadata)?; + rtc.schedule_index_upload_for_full_metadata_update(&metadata)?; } timeline @@ -3027,7 +3028,7 @@ impl Tenant { // See also https://github.com/neondatabase/neon/issues/3865 if let Some(remote_client) = new_timeline.remote_client.as_ref() { remote_client - .schedule_index_upload_for_metadata_update(&metadata) + .schedule_index_upload_for_full_metadata_update(&metadata) .context("branch initial metadata upload")?; } diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 1736950d1f..39da713479 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -235,6 +235,12 @@ impl TimelineMetadata { let bytes = instance.to_bytes().unwrap(); Self::from_bytes(&bytes).unwrap() } + + pub(crate) fn apply(&mut self, update: &MetadataUpdate) { + self.body.disk_consistent_lsn = update.disk_consistent_lsn; + self.body.prev_record_lsn = update.prev_record_lsn; + self.body.latest_gc_cutoff_lsn = update.latest_gc_cutoff_lsn; + } } impl<'de> Deserialize<'de> for TimelineMetadata { @@ -259,6 +265,27 @@ impl Serialize for TimelineMetadata { } } +/// Parts of the metadata which are regularly modified. +pub(crate) struct MetadataUpdate { + disk_consistent_lsn: Lsn, + prev_record_lsn: Option, + latest_gc_cutoff_lsn: Lsn, +} + +impl MetadataUpdate { + pub(crate) fn new( + disk_consistent_lsn: Lsn, + prev_record_lsn: Option, + latest_gc_cutoff_lsn: Lsn, + ) -> Self { + Self { + disk_consistent_lsn, + prev_record_lsn, + latest_gc_cutoff_lsn, + } + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 3879135f26..1fa3badefb 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -236,6 +236,7 @@ use utils::id::{TenantId, TimelineId}; use self::index::IndexPart; +use super::metadata::MetadataUpdate; use super::storage_layer::{Layer, LayerFileName, ResidentLayer}; use super::upload_queue::SetDeletedFlagProgress; use super::Generation; @@ -536,9 +537,10 @@ impl RemoteTimelineClient { // Upload operations. // - /// /// Launch an index-file upload operation in the background, with - /// updated metadata. + /// fully updated metadata. + /// + /// This should only be used to upload initial metadata to remote storage. /// /// The upload will be added to the queue immediately, but it /// won't be performed until all previously scheduled layer file @@ -550,7 +552,7 @@ impl RemoteTimelineClient { /// If there were any changes to the list of files, i.e. if any /// layer file uploads were scheduled, since the last index file /// upload, those will be included too. - pub fn schedule_index_upload_for_metadata_update( + pub fn schedule_index_upload_for_full_metadata_update( self: &Arc, metadata: &TimelineMetadata, ) -> anyhow::Result<()> { @@ -566,6 +568,27 @@ impl RemoteTimelineClient { Ok(()) } + /// Launch an index-file upload operation in the background, with only parts of the metadata + /// updated. + /// + /// This is the regular way of updating metadata on layer flushes or Gc. + /// + /// Using this lighter update mechanism allows for reparenting and detaching without changes to + /// `index_part.json`, while being more clear on what values update regularly. 
+ pub(crate) fn schedule_index_upload_for_metadata_update( + self: &Arc, + update: &MetadataUpdate, + ) -> anyhow::Result<()> { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + + upload_queue.latest_metadata.apply(update); + + self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone()); + + Ok(()) + } + /// /// Launch an index-file upload operation in the background, if necessary. /// @@ -2024,7 +2047,7 @@ mod tests { // Schedule upload of index. Check that it is queued let metadata = dummy_metadata(Lsn(0x20)); client - .schedule_index_upload_for_metadata_update(&metadata) + .schedule_index_upload_for_full_metadata_update(&metadata) .unwrap(); { let mut guard = client.upload_queue.lock().unwrap(); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 3f2d807ce8..8e815ddae8 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3525,7 +3525,7 @@ impl Timeline { &self, disk_consistent_lsn: Lsn, layers_to_upload: impl IntoIterator, - ) -> anyhow::Result { + ) -> anyhow::Result<()> { // We can only save a valid 'prev_record_lsn' value on disk if we // flushed *all* in-memory changes to disk. We only track // 'prev_record_lsn' in memory for the latest processed record, so we @@ -3542,19 +3542,10 @@ impl Timeline { None }; - let ancestor_timeline_id = self - .ancestor_timeline - .as_ref() - .map(|ancestor| ancestor.timeline_id); - - let metadata = TimelineMetadata::new( + let update = crate::tenant::metadata::MetadataUpdate::new( disk_consistent_lsn, ondisk_prev_record_lsn, - ancestor_timeline_id, - self.ancestor_lsn, *self.latest_gc_cutoff_lsn.read(), - self.initdb_lsn, - self.pg_version, ); fail_point!("checkpoint-before-saving-metadata", |x| bail!( @@ -3566,10 +3557,10 @@ impl Timeline { for layer in layers_to_upload { remote_client.schedule_layer_file_upload(layer)?; } - remote_client.schedule_index_upload_for_metadata_update(&metadata)?; + remote_client.schedule_index_upload_for_metadata_update(&update)?; } - Ok(metadata) + Ok(()) } pub(crate) async fn preserve_initdb_archive(&self) -> anyhow::Result<()> { From 47addc15f182cd1823cc4b7713117376823d281e Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 22 Apr 2024 13:04:37 +0300 Subject: [PATCH 0593/1571] relaxation: allow using layers across timelines (#7453) Before, we asserted that a layer would only be loaded by the timeline that initially created it. Now, with the ancestor detach, we will want to utilize remote copy as much as possible, so we will need to open other timeline layers as our own. Cc: #6994 --- pageserver/src/tenant/storage_layer/delta_layer.rs | 3 +++ pageserver/src/tenant/storage_layer/image_layer.rs | 2 ++ 2 files changed, 5 insertions(+) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index c5b5e5c98f..a4b2b4f840 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -728,6 +728,9 @@ impl DeltaLayerInner { // production code path expected_summary.index_start_blk = actual_summary.index_start_blk; expected_summary.index_root_blk = actual_summary.index_root_blk; + // mask out the timeline_id, but still require the layers to be from the same tenant + expected_summary.timeline_id = actual_summary.timeline_id; + if actual_summary != expected_summary { bail!( "in-file summary does not match expected summary. 
actual = {:?} expected = {:?}", diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 5b44d2bc2c..6f46a0203b 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -396,6 +396,8 @@ impl ImageLayerInner { // production code path expected_summary.index_start_blk = actual_summary.index_start_blk; expected_summary.index_root_blk = actual_summary.index_root_blk; + // mask out the timeline_id, but still require the layers to be from the same tenant + expected_summary.timeline_id = actual_summary.timeline_id; if actual_summary != expected_summary { bail!( From 6a5650d40c82496ea5d3fc7b870cf0e6e130e91f Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Mon, 22 Apr 2024 13:37:22 +0200 Subject: [PATCH 0594/1571] proxy: Make retries configurable and record it. (#7438) ## Problem Currently we cannot configure retries, also, we don't really have visibility of what's going on there. ## Summary of changes * Added cli params * Improved logging * Decrease the number of retries: it feels like most of retries doesn't help. Once there would be better errors handling, we can increase it back. --- proxy/src/bin/proxy.rs | 10 ++++ proxy/src/config.rs | 55 ++++++++++++++++++ proxy/src/metrics.rs | 17 ++++++ proxy/src/proxy.rs | 2 + proxy/src/proxy/connect_compute.rs | 40 ++++++++++++-- proxy/src/proxy/retry.rs | 18 +++--- proxy/src/proxy/tests.rs | 89 +++++++++++++++++++++++------- proxy/src/proxy/wake_compute.rs | 36 ++++++++++-- proxy/src/serverless/backend.rs | 2 + 9 files changed, 226 insertions(+), 43 deletions(-) diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index b54f8c131c..7df320fd42 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -200,6 +200,12 @@ struct ProxyCliArgs { /// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression. #[clap(long, default_value = "4194304")] metric_backup_collection_chunk_size: usize, + /// Whether to retry the connection to the compute node + #[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)] + connect_to_compute_retry: String, + /// Whether to retry the wake_compute request + #[clap(long, default_value = config::RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)] + wake_compute_retry: String, } #[derive(clap::Args, Clone, Copy, Debug)] @@ -584,6 +590,10 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { handshake_timeout: args.handshake_timeout, region: args.region.clone(), aws_region: args.aws_region.clone(), + wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?, + connect_to_compute_retry_config: config::RetryConfig::parse( + &args.connect_to_compute_retry, + )?, })); Ok(config) diff --git a/proxy/src/config.rs b/proxy/src/config.rs index f9519c7645..ae7606e5d4 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -33,6 +33,8 @@ pub struct ProxyConfig { pub region: String, pub handshake_timeout: Duration, pub aws_region: String, + pub wake_compute_retry_config: RetryConfig, + pub connect_to_compute_retry_config: RetryConfig, } #[derive(Debug)] @@ -517,6 +519,59 @@ impl FromStr for ProjectInfoCacheOptions { } } +/// This is a config for connect to compute and wake compute. +#[derive(Clone, Copy, Debug)] +pub struct RetryConfig { + /// Number of times we should retry. 
+ pub max_retries: u32, + /// Retry duration is base_delay * backoff_factor ^ n, where n starts at 0 + pub base_delay: tokio::time::Duration, + /// Exponential base for retry wait duration + pub backoff_factor: f64, +} + +impl RetryConfig { + /// Default options for RetryConfig. + + /// Total delay for 4 retries with 1s base delay and 2.0 backoff factor is 7s. + pub const CONNECT_TO_COMPUTE_DEFAULT_VALUES: &'static str = + "num_retries=4,base_retry_wait_duration=1s,retry_wait_exponent_base=2.0"; + /// Total delay for 4 retries with 1s base delay and 2.0 backoff factor is 7s. + /// Cplane has timeout of 60s on each request. + pub const WAKE_COMPUTE_DEFAULT_VALUES: &'static str = + "num_retries=4,base_retry_wait_duration=1s,retry_wait_exponent_base=2.0"; + + /// Parse retry options passed via cmdline. + /// Example: [`Self::CONNECT_TO_COMPUTE_DEFAULT_VALUES`]. + pub fn parse(options: &str) -> anyhow::Result { + let mut num_retries = None; + let mut base_retry_wait_duration = None; + let mut retry_wait_exponent_base = None; + + for option in options.split(',') { + let (key, value) = option + .split_once('=') + .with_context(|| format!("bad key-value pair: {option}"))?; + + match key { + "num_retries" => num_retries = Some(value.parse()?), + "base_retry_wait_duration" => { + base_retry_wait_duration = Some(humantime::parse_duration(value)?) + } + "retry_wait_exponent_base" => retry_wait_exponent_base = Some(value.parse()?), + unknown => bail!("unknown key: {unknown}"), + } + } + + Ok(Self { + max_retries: num_retries.context("missing `num_retries`")?, + base_delay: base_retry_wait_duration.context("missing `base_retry_wait_duration`")?, + backoff_factor: retry_wait_exponent_base + .context("missing `retry_wait_exponent_base`")?, + }) + } +} + /// Helper for cmdline cache options parsing. pub struct WakeComputeLockOptions { /// The number of shards the lock map should have diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 3a4e54aea0..530350008c 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -119,6 +119,10 @@ pub struct ProxyMetrics { /// Number of invalid endpoints (per protocol, per rejected). pub invalid_endpoints_total: CounterVec, + + /// Number of retries (per outcome, per retry_type). 
+ #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]))] + pub retries_metric: HistogramVec, } #[derive(MetricGroup)] @@ -480,3 +484,16 @@ pub struct InvalidEndpointsGroup { pub rejected: Bool, pub outcome: ConnectOutcome, } + +#[derive(LabelGroup)] +#[label(set = RetriesMetricSet)] +pub struct RetriesMetricGroup { + pub outcome: ConnectOutcome, + pub retry_type: RetryType, +} + +#[derive(FixedCardinalityLabel, Clone, Copy, Debug)] +pub enum RetryType { + WakeCompute, + ConnectToCompute, +} diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 4321bad968..a4554eef38 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -308,6 +308,8 @@ pub async fn handle_client( &TcpMechanism { params: ¶ms }, &user_info, mode.allow_self_signed_compute(config), + config.wake_compute_retry_config, + config.connect_to_compute_retry_config, ) .or_else(|e| stream.throw_error(e)) .await?; diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 33f394c550..8a220aaa0c 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -1,10 +1,11 @@ use crate::{ auth::backend::ComputeCredentialKeys, compute::{self, PostgresConnection}, + config::RetryConfig, console::{self, errors::WakeComputeError, CachedNodeInfo, NodeInfo}, context::RequestMonitoring, error::ReportableError, - metrics::{ConnectionFailureKind, Metrics}, + metrics::{ConnectOutcome, ConnectionFailureKind, Metrics, RetriesMetricGroup, RetryType}, proxy::{ retry::{retry_after, ShouldRetry}, wake_compute::wake_compute, @@ -93,19 +94,23 @@ pub async fn connect_to_compute( mechanism: &M, user_info: &B, allow_self_signed_compute: bool, + wake_compute_retry_config: RetryConfig, + connect_to_compute_retry_config: RetryConfig, ) -> Result where M::ConnectError: ShouldRetry + std::fmt::Debug, M::Error: From, { let mut num_retries = 0; - let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?; + let mut node_info = + wake_compute(&mut num_retries, ctx, user_info, wake_compute_retry_config).await?; if let Some(keys) = user_info.get_keys() { node_info.set_keys(keys); } node_info.allow_self_signed_compute = allow_self_signed_compute; // let mut node_info = credentials.get_node_info(ctx, user_info).await?; mechanism.update_connect_config(&mut node_info.config); + let retry_type = RetryType::ConnectToCompute; // try once let err = match mechanism @@ -114,6 +119,13 @@ where { Ok(res) => { ctx.latency_timer.success(); + Metrics::get().proxy.retries_metric.observe( + RetriesMetricGroup { + outcome: ConnectOutcome::Success, + retry_type, + }, + num_retries.into(), + ); return Ok(res); } Err(e) => e, @@ -124,7 +136,7 @@ where let node_info = if !node_info.cached() { // If we just recieved this from cplane and dodn't get it from cache, we shouldn't retry. // Do not need to retrieve a new node_info, just return the old one. 
- if !err.should_retry(num_retries) { + if !err.should_retry(num_retries, connect_to_compute_retry_config) { return Err(err.into()); } node_info @@ -132,7 +144,8 @@ where // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node info!("compute node's state has likely changed; requesting a wake-up"); let old_node_info = invalidate_cache(node_info); - let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?; + let mut node_info = + wake_compute(&mut num_retries, ctx, user_info, wake_compute_retry_config).await?; node_info.reuse_settings(old_node_info); mechanism.update_connect_config(&mut node_info.config); @@ -151,19 +164,34 @@ where { Ok(res) => { ctx.latency_timer.success(); + Metrics::get().proxy.retries_metric.observe( + RetriesMetricGroup { + outcome: ConnectOutcome::Success, + retry_type, + }, + num_retries.into(), + ); + info!(?num_retries, "connected to compute node after"); return Ok(res); } Err(e) => { - let retriable = e.should_retry(num_retries); + let retriable = e.should_retry(num_retries, connect_to_compute_retry_config); if !retriable { error!(error = ?e, num_retries, retriable, "couldn't connect to compute node"); + Metrics::get().proxy.retries_metric.observe( + RetriesMetricGroup { + outcome: ConnectOutcome::Failed, + retry_type, + }, + num_retries.into(), + ); return Err(e.into()); } warn!(error = ?e, num_retries, retriable, "couldn't connect to compute node"); } } - let wait_duration = retry_after(num_retries); + let wait_duration = retry_after(num_retries, connect_to_compute_retry_config); num_retries += 1; time::sleep(wait_duration).await; diff --git a/proxy/src/proxy/retry.rs b/proxy/src/proxy/retry.rs index a85ed380b0..082e06caa3 100644 --- a/proxy/src/proxy/retry.rs +++ b/proxy/src/proxy/retry.rs @@ -1,18 +1,12 @@ -use crate::compute; +use crate::{compute, config::RetryConfig}; use std::{error::Error, io}; use tokio::time; -/// Number of times we should retry the `/proxy_wake_compute` http request. 
-/// Retry duration is BASE_RETRY_WAIT_DURATION * RETRY_WAIT_EXPONENT_BASE ^ n, where n starts at 0 -pub const NUM_RETRIES_CONNECT: u32 = 16; -const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(25); -const RETRY_WAIT_EXPONENT_BASE: f64 = std::f64::consts::SQRT_2; - pub trait ShouldRetry { fn could_retry(&self) -> bool; - fn should_retry(&self, num_retries: u32) -> bool { + fn should_retry(&self, num_retries: u32, config: RetryConfig) -> bool { match self { - _ if num_retries >= NUM_RETRIES_CONNECT => false, + _ if num_retries >= config.max_retries => false, err => err.could_retry(), } } @@ -63,6 +57,8 @@ impl ShouldRetry for compute::ConnectionError { } } -pub fn retry_after(num_retries: u32) -> time::Duration { - BASE_RETRY_WAIT_DURATION.mul_f64(RETRY_WAIT_EXPONENT_BASE.powi((num_retries as i32) - 1)) +pub fn retry_after(num_retries: u32, config: RetryConfig) -> time::Duration { + config + .base_delay + .mul_f64(config.backoff_factor.powi((num_retries as i32) - 1)) } diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 849e9bd33c..e0ec90cb44 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -10,13 +10,13 @@ use super::*; use crate::auth::backend::{ ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, MaybeOwned, TestBackend, }; -use crate::config::CertResolver; +use crate::config::{CertResolver, RetryConfig}; use crate::console::caches::NodeInfoCache; use crate::console::messages::MetricsAuxInfo; use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend}; use crate::console::{self, CachedNodeInfo, NodeInfo}; use crate::error::ErrorKind; -use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT}; +use crate::proxy::retry::retry_after; use crate::{http, sasl, scram, BranchId, EndpointId, ProjectId}; use anyhow::{bail, Context}; use async_trait::async_trait; @@ -361,11 +361,15 @@ async fn scram_auth_mock() -> anyhow::Result<()> { #[test] fn connect_compute_total_wait() { let mut total_wait = tokio::time::Duration::ZERO; - for num_retries in 1..NUM_RETRIES_CONNECT { - total_wait += retry_after(num_retries); + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + for num_retries in 1..config.max_retries { + total_wait += retry_after(num_retries, config); } - assert!(total_wait < tokio::time::Duration::from_secs(12)); - assert!(total_wait > tokio::time::Duration::from_secs(10)); + assert!(f64::abs(total_wait.as_secs_f64() - 15.0) < 0.1); } #[derive(Clone, Copy, Debug)] @@ -549,7 +553,12 @@ async fn connect_to_compute_success() { let mut ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, &user_info, false) + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ -562,7 +571,12 @@ async fn connect_to_compute_retry() { let mut ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, &user_info, false) + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute(&mut ctx, &mechanism, &user_info, false, 
config, config) .await .unwrap(); mechanism.verify(); @@ -576,7 +590,12 @@ async fn connect_to_compute_non_retry_1() { let mut ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Fail]); let user_info = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, &user_info, false) + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) .await .unwrap_err(); mechanism.verify(); @@ -590,7 +609,12 @@ async fn connect_to_compute_non_retry_2() { let mut ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Fail, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, &user_info, false) + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ -600,17 +624,32 @@ async fn connect_to_compute_non_retry_2() { #[tokio::test] async fn connect_to_compute_non_retry_3() { let _ = env_logger::try_init(); - assert_eq!(NUM_RETRIES_CONNECT, 16); + tokio::time::pause(); use ConnectAction::*; let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![ - Wake, Retry, Wake, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, - Retry, Retry, Retry, Retry, Retry, /* the 17th time */ Retry, - ]); + let mechanism = + TestConnectMechanism::new(vec![Wake, Retry, Wake, Retry, Retry, Retry, Retry, Retry]); let user_info = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, &user_info, false) - .await - .unwrap_err(); + let wake_compute_retry_config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 1, + backoff_factor: 2.0, + }; + let connect_to_compute_retry_config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute( + &mut ctx, + &mechanism, + &user_info, + false, + wake_compute_retry_config, + connect_to_compute_retry_config, + ) + .await + .unwrap_err(); mechanism.verify(); } @@ -622,7 +661,12 @@ async fn wake_retry() { let mut ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![WakeRetry, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, &user_info, false) + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ -636,7 +680,12 @@ async fn wake_non_retry() { let mut ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![WakeRetry, WakeFail]); let user_info = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, &user_info, false) + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) .await .unwrap_err(); mechanism.verify(); diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index fe228ab33d..cfedf0e98a 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -1,10 +1,14 @@ +use 
crate::config::RetryConfig; use crate::console::{errors::WakeComputeError, provider::CachedNodeInfo}; use crate::context::RequestMonitoring; -use crate::metrics::{ConnectionFailuresBreakdownGroup, Metrics, WakeupFailureKind}; +use crate::metrics::{ + ConnectOutcome, ConnectionFailuresBreakdownGroup, Metrics, RetriesMetricGroup, RetryType, + WakeupFailureKind, +}; use crate::proxy::retry::retry_after; use hyper::StatusCode; use std::ops::ControlFlow; -use tracing::{error, warn}; +use tracing::{error, info, warn}; use super::connect_compute::ComputeConnectBackend; use super::retry::ShouldRetry; @@ -13,23 +17,42 @@ pub async fn wake_compute( num_retries: &mut u32, ctx: &mut RequestMonitoring, api: &B, + config: RetryConfig, ) -> Result { + let retry_type = RetryType::WakeCompute; loop { let wake_res = api.wake_compute(ctx).await; - match handle_try_wake(wake_res, *num_retries) { + match handle_try_wake(wake_res, *num_retries, config) { Err(e) => { error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node"); report_error(&e, false); + Metrics::get().proxy.retries_metric.observe( + RetriesMetricGroup { + outcome: ConnectOutcome::Failed, + retry_type, + }, + (*num_retries).into(), + ); return Err(e); } Ok(ControlFlow::Continue(e)) => { warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node"); report_error(&e, true); } - Ok(ControlFlow::Break(n)) => return Ok(n), + Ok(ControlFlow::Break(n)) => { + Metrics::get().proxy.retries_metric.observe( + RetriesMetricGroup { + outcome: ConnectOutcome::Success, + retry_type, + }, + (*num_retries).into(), + ); + info!(?num_retries, "compute node woken up after"); + return Ok(n); + } } - let wait_duration = retry_after(*num_retries); + let wait_duration = retry_after(*num_retries, config); *num_retries += 1; tokio::time::sleep(wait_duration).await; } @@ -42,10 +65,11 @@ pub async fn wake_compute( pub fn handle_try_wake( result: Result, num_retries: u32, + config: RetryConfig, ) -> Result, WakeComputeError> { match result { Err(err) => match &err { - WakeComputeError::ApiError(api) if api.should_retry(num_retries) => { + WakeComputeError::ApiError(api) if api.should_retry(num_retries, config) => { Ok(ControlFlow::Continue(err)) } _ => Err(err), diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index e74c63599a..b91c0e62ed 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -108,6 +108,8 @@ impl PoolingBackend { }, &backend, false, // do not allow self signed compute for http flow + self.config.wake_compute_retry_config, + self.config.connect_to_compute_retry_config, ) .await } From 0bd16182f7b2e7abedbb218238d83928f67607bc Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 22 Apr 2024 12:47:24 +0100 Subject: [PATCH 0595/1571] pageserver: fix unlogged relations with sharding (#7454) ## Problem - #7451 INIT_FORKNUM blocks must be stored on shard 0 to enable including them in basebackup. This issue can be missed in simple tests because creating an unlogged table isn't sufficient -- to repro I had to create an _index_ on an unlogged table (then restart the endpoint). Closes: #7451 ## Summary of changes - Add a reproducer for the issue. - Tweak the condition for `key_is_shard0` to include anything that isn't a normal relation block _and_ any normal relation block whose forknum is INIT_FORKNUM. - To enable existing databases to recover from the issue, add a special case that omits relations if they were stored on the wrong INITFORK. 
This enables postgres to start and the user to drop the table and recreate it. --- libs/pageserver_api/src/shard.rs | 27 +++++++++++++++++- pageserver/src/basebackup.rs | 17 +++++++++-- test_runner/regress/test_sharding.py | 42 ++++++++++++++++++++++++++++ 3 files changed, 83 insertions(+), 3 deletions(-) diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index c293ad705b..6a8a5cc8f3 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -5,6 +5,7 @@ use crate::{ models::ShardParameters, }; use hex::FromHex; +use postgres_ffi::relfile_utils::INIT_FORKNUM; use serde::{Deserialize, Serialize}; use utils::id::TenantId; @@ -537,6 +538,24 @@ impl ShardIdentity { } } + /// Special case for issue `` + /// + /// When we fail to read a forknum block, this function tells us whether we may ignore the error + /// as a symptom of that issue. + pub fn is_key_buggy_forknum(&self, key: &Key) -> bool { + if !is_rel_block_key(key) || key.field5 != INIT_FORKNUM { + return false; + } + + let mut hash = murmurhash32(key.field4); + hash = hash_combine(hash, murmurhash32(key.field6 / self.stripe_size.0)); + let mapped_shard = ShardNumber((hash % self.count.0 as u32) as u8); + + // The key may be affected by issue #7454: it is an initfork and it would not + // have mapped to shard 0 until we fixed that issue. + mapped_shard != ShardNumber(0) + } + /// Return true if the key should be discarded if found in this shard's /// data store, e.g. during compaction after a split. /// @@ -649,7 +668,13 @@ fn key_is_shard0(key: &Key) -> bool { // relation pages are distributed to shards other than shard zero. Everything else gets // stored on shard 0. This guarantees that shard 0 can independently serve basebackup // requests, and any request other than those for particular blocks in relations. - !is_rel_block_key(key) + // + // The only exception to this rule is "initfork" data -- this relates to postgres's UNLOGGED table + // type. These are special relations, usually with only 0 or 1 blocks, and we store them on shard 0 + // because they must be included in basebackups. + let is_initfork = key.field5 == INIT_FORKNUM; + + !is_rel_block_key(key) || is_initfork } /// Provide the same result as the function in postgres `hashfn.h` with the same name diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 0479d05f8f..107758f385 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -13,7 +13,7 @@ use anyhow::{anyhow, bail, ensure, Context}; use bytes::{BufMut, Bytes, BytesMut}; use fail::fail_point; -use pageserver_api::key::{key_to_slru_block, Key}; +use pageserver_api::key::{key_to_slru_block, rel_block_to_key, Key}; use postgres_ffi::pg_constants; use std::fmt::Write as FmtWrite; use std::time::SystemTime; @@ -297,7 +297,20 @@ where if rel.forknum == INIT_FORKNUM { // I doubt we need _init fork itself, but having it at least // serves as a marker relation is unlogged. - self.add_rel(rel, rel).await?; + if let Err(_e) = self.add_rel(rel, rel).await { + if self + .timeline + .get_shard_identity() + .is_key_buggy_forknum(&rel_block_to_key(rel, 0x0)) + { + // Workaround https://github.com/neondatabase/neon/issues/7451 -- if we have an unlogged relation + // whose INIT_FORKNUM is not correctly on shard zero, then omit it in the basebackup. This allows + // postgres to start up. The relation won't work, but it will be possible to DROP TABLE on it and + // recreate. 
+ tracing::warn!("Omitting relation {rel} for issue #7451: drop and recreate this unlogged relation"); + continue; + } + }; self.add_rel(rel, rel.with_forknum(MAIN_FORKNUM)).await?; continue; } diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index bfaab9125f..101d2620b0 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -1201,3 +1201,45 @@ def test_sharding_backpressure(neon_env_builder: NeonEnvBuilder): max_lsn = max(Lsn(info["last_record_lsn"]) for info in infos) diff = max_lsn - min_lsn assert diff < 2 * 1024 * 1024, f"LSN diff={diff}, expected diff < 2MB due to backpressure" + + +def test_sharding_unlogged_relation(neon_env_builder: NeonEnvBuilder): + """ + Check that an unlogged relation is handled properly on a sharded tenant + + Reproducer for https://github.com/neondatabase/neon/issues/7451 + """ + + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + neon_env_builder.start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant(tenant_id, timeline_id, shard_count=8) + + # We will create many tables to ensure it's overwhelmingly likely that at least one + # of them doesn't land on shard 0 + table_names = [f"my_unlogged_{i}" for i in range(0, 16)] + + with env.endpoints.create_start("main", tenant_id=tenant_id) as ep: + for table_name in table_names: + ep.safe_psql(f"CREATE UNLOGGED TABLE {table_name} (id integer, value varchar(64));") + ep.safe_psql(f"INSERT INTO {table_name} VALUES (1, 'foo')") + result = ep.safe_psql(f"SELECT * from {table_name};") + assert result == [(1, "foo")] + ep.safe_psql(f"CREATE INDEX ON {table_name} USING btree (value);") + + wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id) + + with env.endpoints.create_start("main", tenant_id=tenant_id) as ep: + for table_name in table_names: + # Check that table works: we can select and insert + result = ep.safe_psql(f"SELECT * from {table_name};") + assert result == [] + ep.safe_psql(f"INSERT INTO {table_name} VALUES (2, 'bar');") + result = ep.safe_psql(f"SELECT * from {table_name};") + assert result == [(2, "bar")] + + # Ensure that post-endpoint-restart modifications are ingested happily by pageserver + wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id) From 139d1346d5aed41e1cf1479343943f9bf3670794 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 22 Apr 2024 14:55:17 +0200 Subject: [PATCH 0596/1571] pagectl draw-timeline-dir: include layer file name as an SVG comment (#7455) fixes https://github.com/neondatabase/neon/issues/7452 Also, drive-by improve the usage instructions with commands I found useful during that incident. The patch in the fork of `svg_fmt` is [being upstreamed](https://github.com/nical/rust_debug/pull/4), but, in the meantime, let's commit what we have because it was useful during the incident. 
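For context, a minimal sketch of what the change amounts to at the call site (not part of this patch): it uses only the builder calls already visible in the diff below plus the forked crate's new `comment` method, whose exact signature is an assumption here. It assumes the forked `svg_fmt` from the `Cargo.toml` change below is on the dependency list, and the coordinates and layer file name are made up for illustration.

```rust
use svg_fmt::{rectangle, rgb, BeginSvg, EndSvg, Stroke};

fn main() {
    // Illustrative layer file name; in the real tool this comes from stdin.
    let layer_file_name =
        "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9";

    println!("{}", BeginSvg { w: 20.0, h: 20.0 });
    // One rectangle per layer file; the file name rides along into the SVG
    // output via the forked crate's `comment` builder, so it can be recovered
    // later (e.g. in Inkscape's XML editor, see the module docs below).
    println!(
        "    {}",
        rectangle(1.0, 1.0, 10.0, 5.0)
            .fill(rgb(200, 200, 200))
            .stroke(Stroke::Color(rgb(0, 0, 0), 0.1))
            .border_radius(0.4)
            .comment(layer_file_name)
    );
    println!("{}", EndSvg);
}
```

The sketch is only meant to show why the fork is needed: stock `svg_fmt` 0.4 has no way to attach the file name to a rectangle, which is exactly the gap that made layer identification painful during the incident.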
--- Cargo.lock | 3 +- Cargo.toml | 3 +- pageserver/ctl/src/draw_timeline_dir.rs | 73 ++++++++++++++++++++----- 3 files changed, 63 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 76183bdaab..cff07239e7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5830,8 +5830,7 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" [[package]] name = "svg_fmt" version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f83ba502a3265efb76efb89b0a2f7782ad6f2675015d4ce37e4b547dda42b499" +source = "git+https://github.com/neondatabase/fork--nical--rust_debug?branch=neon#b9501105e746629004bc6d0473639320939dbe10" [[package]] name = "syn" diff --git a/Cargo.toml b/Cargo.toml index 8310d2d522..677eaa9ce4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -157,7 +157,8 @@ socket2 = "0.5" strum = "0.24" strum_macros = "0.24" "subtle" = "2.5.0" -svg_fmt = "0.4.1" +# https://github.com/nical/rust_debug/pull/4 +svg_fmt = { git = "https://github.com/neondatabase/fork--nical--rust_debug", branch = "neon" } sync_wrapper = "0.1.2" tar = "0.4" task-local-extensions = "0.1.4" diff --git a/pageserver/ctl/src/draw_timeline_dir.rs b/pageserver/ctl/src/draw_timeline_dir.rs index 0e77ef0563..9a556cb3d4 100644 --- a/pageserver/ctl/src/draw_timeline_dir.rs +++ b/pageserver/ctl/src/draw_timeline_dir.rs @@ -9,18 +9,45 @@ //! Coordinates in both axis are compressed for better readability. //! (see ) //! -//! Example use: +//! The plain text API was chosen so that we can easily work with filenames from various +//! sources; see the Usage section below for examples. +//! +//! # Usage +//! +//! ## Producing the SVG +//! //! ```bash -//! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \ -//! $ grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg -//! $ firefox out.svg +//! +//! # local timeline dir +//! ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \ +//! grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg +//! +//! # Layer map dump from `/v1/tenant/$TENANT/timeline/$TIMELINE/layer` +//! (jq -r '.historic_layers[] | .layer_file_name' | cargo run -p pagectl draw-timeline) < layer-map.json > out.svg +//! +//! # From an `index_part.json` in S3 +//! (jq -r '.layer_metadata | keys[]' | cargo run -p pagectl draw-timeline ) < index_part.json-00000016 > out.svg +//! //! ``` //! -//! This API was chosen so that we can easily work with filenames extracted from ssh, -//! or from pageserver log files. +//! ## Viewing //! -//! TODO Consider shipping this as a grafana panel plugin: -//! +//! **Inkscape** is better than the built-in viewers in browsers. +//! +//! After selecting a layer file rectangle, use "Open XML Editor" (Ctrl|Cmd + Shift + X) +//! to see the layer file name in the comment field. +//! +//! ```bash +//! +//! # Linux +//! inkscape out.svg +//! +//! # macOS +//! /Applications/Inkscape.app/Contents/MacOS/inkscape out.svg +//! +//! ``` +//! 
+ use anyhow::Result; use pageserver::repository::Key; use pageserver::METADATA_FILE_NAME; @@ -65,7 +92,12 @@ fn parse_filename(name: &str) -> (Range, Range) { pub fn main() -> Result<()> { // Parse layer filenames from stdin - let mut ranges: Vec<(Range, Range)> = vec![]; + struct Layer { + filename: String, + key_range: Range, + lsn_range: Range, + } + let mut files: Vec = vec![]; let stdin = io::stdin(); for line in stdin.lock().lines() { let line = line.unwrap(); @@ -76,14 +108,23 @@ pub fn main() -> Result<()> { // Don't try and parse "metadata" like a key-lsn range continue; } - let range = parse_filename(filename); - ranges.push(range); + let (key_range, lsn_range) = parse_filename(filename); + files.push(Layer { + filename: filename.to_owned(), + key_range, + lsn_range, + }); } // Collect all coordinates let mut keys: Vec = vec![]; let mut lsns: Vec = vec![]; - for (keyr, lsnr) in &ranges { + for Layer { + key_range: keyr, + lsn_range: lsnr, + .. + } in &files + { keys.push(keyr.start); keys.push(keyr.end); lsns.push(lsnr.start); @@ -107,7 +148,12 @@ pub fn main() -> Result<()> { h: stretch * lsn_map.len() as f32 } ); - for (keyr, lsnr) in &ranges { + for Layer { + filename, + key_range: keyr, + lsn_range: lsnr, + } in &files + { let key_start = *key_map.get(&keyr.start).unwrap(); let key_end = *key_map.get(&keyr.end).unwrap(); let key_diff = key_end - key_start; @@ -151,6 +197,7 @@ pub fn main() -> Result<()> { .fill(fill) .stroke(Stroke::Color(rgb(0, 0, 0), 0.1)) .border_radius(0.4) + .comment(filename) ); } println!("{}", EndSvg); From 25d9dc6eaf9803675bd694a6d5f107947c8c24aa Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Mon, 22 Apr 2024 15:40:35 +0100 Subject: [PATCH 0597/1571] chore(pageserver): separate missing key error (#7393) As part of https://github.com/neondatabase/neon/pull/7375 and to improve the current vectored get implementation, we separate the missing key error out. This also saves us several Box allocations in the get page implementation. ## Summary of changes * Create a caching field of layer traversal id for each of the layer. * Remove box allocations for layer traversal id retrieval and implement MissingKey error message as before. This should be a little bit faster. * Do not format error message until `Display`. * For in-mem layer, the descriptor is different before/after frozen. I'm using once lock for that. 
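To make the last two bullets concrete, here is a minimal, self-contained sketch of the set-once caching pattern (illustrative names only, not the actual pageserver types; the real change stores `Arc<str>` ids on the in-memory and persistent layers as shown in the diff below):

```rust
use std::sync::{Arc, OnceLock};

/// Illustrative stand-in for a layer: the traversal id is pre-formatted,
/// and the frozen variant is set exactly once.
struct TraversalIds {
    open_id: Arc<str>,
    frozen_id: OnceLock<Arc<str>>,
}

impl TraversalIds {
    fn new(timeline: &str, start_lsn: u64) -> Self {
        Self {
            // Formatted eagerly, so the read path never allocates.
            open_id: format!("timeline {timeline} in-memory {start_lsn:016X}-MAX").into(),
            frozen_id: OnceLock::new(),
        }
    }

    /// Called when the layer is frozen; the frozen id may only be set once.
    fn freeze(&self, timeline: &str, start_lsn: u64, end_lsn: u64) {
        let id = format!("timeline {timeline} in-memory {start_lsn:016X}-{end_lsn:016X}");
        self.frozen_id.set(id.into()).expect("set only once");
    }

    /// Cheap on the hot path: clone an `Arc`, no formatting, no boxing.
    fn traversal_id(&self) -> Arc<str> {
        Arc::clone(self.frozen_id.get().unwrap_or(&self.open_id))
    }
}

fn main() {
    let ids = TraversalIds::new("a1b2", 0x10);
    println!("{}", ids.traversal_id()); // ends in -MAX while still open
    ids.freeze("a1b2", 0x10, 0x20);
    println!("{}", ids.traversal_id()); // frozen range after freeze()
}
```

This is why the `TraversalId` alias in the diff becomes an `Arc<str>` instead of a boxed closure: recording a traversal step is reduced to a pointer clone.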
--------- Signed-off-by: Alex Chi Z --- pageserver/src/http/routes.rs | 3 + pageserver/src/pgdatadir_mapping.rs | 12 +- .../tenant/storage_layer/inmemory_layer.rs | 45 ++++- pageserver/src/tenant/storage_layer/layer.rs | 8 + .../src/tenant/storage_layer/layer/tests.rs | 4 +- pageserver/src/tenant/timeline.rs | 165 ++++++++++-------- 6 files changed, 157 insertions(+), 80 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 20258dd950..81508965b4 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -160,6 +160,9 @@ impl From for ApiError { fn from(pre: PageReconstructError) -> ApiError { match pre { PageReconstructError::Other(pre) => ApiError::InternalServerError(pre), + PageReconstructError::MissingKey(e) => { + ApiError::InternalServerError(anyhow::anyhow!("{e}")) + } PageReconstructError::Cancelled => { ApiError::InternalServerError(anyhow::anyhow!("request was cancelled")) } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 6f7d74bdee..351a766b10 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1446,10 +1446,14 @@ impl<'a> DatadirModification<'a> { // reset the map. return Err(e.into()); } - // FIXME: PageReconstructError doesn't have an explicit variant for key-not-found, so - // we are assuming that all _other_ possible errors represents a missing key. If some - // other error occurs, we may incorrectly reset the map of aux files. - Err(PageReconstructError::Other(_) | PageReconstructError::WalRedo(_)) => { + // Note: we added missing key error variant in https://github.com/neondatabase/neon/pull/7393 but + // the original code assumes all other errors are missing keys. Therefore, we keep the code path + // the same for now, though in theory, we should only match the `MissingKey` variant. + Err( + PageReconstructError::Other(_) + | PageReconstructError::WalRedo(_) + | PageReconstructError::MissingKey { .. }, + ) => { // Key is missing, we must insert an image as the basis for subsequent deltas. let mut dir = AuxFilesDirectory { diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 29751641b4..a86d0d48c5 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -26,7 +26,7 @@ use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap}; // while being able to use std::fmt::Write's methods use crate::metrics::TIMELINE_EPHEMERAL_BYTES; use std::cmp::Ordering; -use std::fmt::Write as _; +use std::fmt::Write; use std::ops::Range; use std::sync::atomic::Ordering as AtomicOrdering; use std::sync::atomic::{AtomicU64, AtomicUsize}; @@ -54,6 +54,12 @@ pub struct InMemoryLayer { /// Writes are only allowed when this is `None`. end_lsn: OnceLock, + /// Used for traversal path. Cached representation of the in-memory layer before frozen. + local_path_str: Arc, + + /// Used for traversal path. Cached representation of the in-memory layer after frozen. + frozen_local_path_str: OnceLock>, + opened_at: Instant, /// The above fields never change, except for `end_lsn`, which is only set once. 
@@ -241,6 +247,12 @@ impl InMemoryLayer { self.start_lsn..self.end_lsn_or_max() } + pub(crate) fn local_path_str(&self) -> &Arc { + self.frozen_local_path_str + .get() + .unwrap_or(&self.local_path_str) + } + /// debugging function to print out the contents of the layer /// /// this is likely completly unused @@ -430,10 +442,24 @@ impl InMemoryLayer { } } +fn inmem_layer_display(mut f: impl Write, start_lsn: Lsn, end_lsn: Lsn) -> std::fmt::Result { + write!(f, "inmem-{:016X}-{:016X}", start_lsn.0, end_lsn.0) +} + +fn inmem_layer_log_display( + mut f: impl Write, + timeline: TimelineId, + start_lsn: Lsn, + end_lsn: Lsn, +) -> std::fmt::Result { + write!(f, "timeline {} in-memory ", timeline)?; + inmem_layer_display(f, start_lsn, end_lsn) +} + impl std::fmt::Display for InMemoryLayer { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let end_lsn = self.end_lsn_or_max(); - write!(f, "inmem-{:016X}-{:016X}", self.start_lsn.0, end_lsn.0) + inmem_layer_display(f, self.start_lsn, end_lsn) } } @@ -458,6 +484,12 @@ impl InMemoryLayer { Ok(InMemoryLayer { file_id: key, + local_path_str: { + let mut buf = String::new(); + inmem_layer_log_display(&mut buf, timeline_id, start_lsn, Lsn::MAX).unwrap(); + buf.into() + }, + frozen_local_path_str: OnceLock::new(), conf, timeline_id, tenant_shard_id, @@ -552,6 +584,15 @@ impl InMemoryLayer { ); self.end_lsn.set(end_lsn).expect("end_lsn set only once"); + self.frozen_local_path_str + .set({ + let mut buf = String::new(); + inmem_layer_log_display(&mut buf, self.get_timeline_id(), self.start_lsn, end_lsn) + .unwrap(); + buf.into() + }) + .expect("frozen_local_path_str set only once"); + for vec_map in inner.index.values() { for (lsn, _pos) in vec_map.as_slice() { assert!(*lsn < end_lsn); diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index e55299f0fa..316a11f8cc 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -395,6 +395,10 @@ impl Layer { &self.0.path } + pub(crate) fn local_path_str(&self) -> &Arc { + &self.0.path_str + } + pub(crate) fn metadata(&self) -> LayerFileMetadata { self.0.metadata() } @@ -517,6 +521,9 @@ struct LayerInner { /// Full path to the file; unclear if this should exist anymore. path: Utf8PathBuf, + /// String representation of the full path, used for traversal id. + path_str: Arc, + desc: PersistentLayerDesc, /// Timeline access is needed for remote timeline client and metrics. @@ -722,6 +729,7 @@ impl LayerInner { LayerInner { conf, + path_str: path.to_string().into(), path, desc, timeline: Arc::downgrade(timeline), diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index f0697fdf28..52f62faa8d 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -818,11 +818,13 @@ async fn eviction_cancellation_on_drop() { } } +/// A test case to remind you the cost of these structures. You can bump the size limit +/// below if it is really necessary to add more fields to the structures. 
#[test] fn layer_size() { assert_eq!(std::mem::size_of::(), 2040); assert_eq!(std::mem::size_of::(), 104); - assert_eq!(std::mem::size_of::(), 2328); + assert_eq!(std::mem::size_of::(), 2344); // it also has the utf8 path } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 8e815ddae8..e707c3b244 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -23,7 +23,7 @@ use pageserver_api::{ EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, TimelineState, }, reltag::BlockNumber, - shard::{ShardIdentity, TenantShardId}, + shard::{ShardIdentity, ShardNumber, TenantShardId}, }; use rand::Rng; use serde_with::serde_as; @@ -428,6 +428,62 @@ pub(crate) enum PageReconstructError { /// An error happened replaying WAL records #[error(transparent)] WalRedo(anyhow::Error), + + #[error("{0}")] + MissingKey(MissingKeyError), +} + +#[derive(Debug)] +pub struct MissingKeyError { + stuck_at_lsn: bool, + key: Key, + shard: ShardNumber, + cont_lsn: Lsn, + request_lsn: Lsn, + ancestor_lsn: Option, + traversal_path: Vec, + backtrace: Option, +} + +impl std::fmt::Display for MissingKeyError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if self.stuck_at_lsn { + // Records are found in this timeline but no image layer or initial delta record was found. + write!( + f, + "could not find layer with more data for key {} (shard {:?}) at LSN {}, request LSN {}", + self.key, self.shard, self.cont_lsn, self.request_lsn + )?; + if let Some(ref ancestor_lsn) = self.ancestor_lsn { + write!(f, ", ancestor {}", ancestor_lsn)?; + } + } else { + // No records in this timeline. + write!( + f, + "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}", + self.key, self.shard, self.cont_lsn, self.request_lsn + )?; + } + + if !self.traversal_path.is_empty() { + writeln!(f)?; + } + + for (r, c, l) in &self.traversal_path { + writeln!( + f, + "layer traversal: result {:?}, cont_lsn {}, layer: {}", + r, c, l, + )?; + } + + if let Some(ref backtrace) = self.backtrace { + write!(f, "\n{}", backtrace)?; + } + + Ok(()) + } } impl PageReconstructError { @@ -439,6 +495,7 @@ impl PageReconstructError { AncestorLsnTimeout(_) => false, Cancelled | AncestorStopping(_) => true, WalRedo(_) => false, + MissingKey { .. } => false, } } } @@ -753,7 +810,7 @@ impl Timeline { writeln!( msg, "- layer traversal: result {res:?}, cont_lsn {cont_lsn}, layer: {}", - layer(), + layer, ) .expect("string grows") }); @@ -872,9 +929,11 @@ impl Timeline { Err(Cancelled | AncestorStopping(_)) => { return Err(GetVectoredError::Cancelled) } - Err(Other(err)) if err.to_string().contains("could not find data for key") => { - return Err(GetVectoredError::MissingKey(key)) - } + // we only capture stuck_at_lsn=false now until we figure out https://github.com/neondatabase/neon/issues/7380 + Err(MissingKey(MissingKeyError { + stuck_at_lsn: false, + .. 
+ })) => return Err(GetVectoredError::MissingKey(key)), _ => { values.insert(key, block); key = key.next(); @@ -2692,7 +2751,7 @@ impl Timeline { } } -type TraversalId = String; +type TraversalId = Arc; trait TraversalLayerExt { fn traversal_id(&self) -> TraversalId; @@ -2700,13 +2759,13 @@ trait TraversalLayerExt { impl TraversalLayerExt for Layer { fn traversal_id(&self) -> TraversalId { - self.local_path().to_string() + Arc::clone(self.local_path_str()) } } impl TraversalLayerExt for Arc { fn traversal_id(&self) -> TraversalId { - format!("timeline {} in-memory {self}", self.get_timeline_id()) + Arc::clone(self.local_path_str()) } } @@ -2775,32 +2834,35 @@ impl Timeline { if prev <= cont_lsn { // Didn't make any progress in last iteration. Error out to avoid // getting stuck in the loop. - return Err(layer_traversal_error(format!( - "could not find layer with more data for key {} at LSN {}, request LSN {}, ancestor {}", + return Err(PageReconstructError::MissingKey(MissingKeyError { + stuck_at_lsn: true, key, - Lsn(cont_lsn.0 - 1), + shard: self.shard_identity.get_shard_number(&key), + cont_lsn: Lsn(cont_lsn.0 - 1), request_lsn, - timeline.ancestor_lsn - ), traversal_path)); + ancestor_lsn: Some(timeline.ancestor_lsn), + traversal_path, + backtrace: None, + })); } } prev_lsn = Some(cont_lsn); } ValueReconstructResult::Missing => { - return Err(layer_traversal_error( - if cfg!(test) { - format!( - "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}\n{}", - key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(), - ) - } else { - format!( - "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}", - key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn - ) - }, + return Err(PageReconstructError::MissingKey(MissingKeyError { + stuck_at_lsn: false, + key, + shard: self.shard_identity.get_shard_number(&key), + cont_lsn, + request_lsn, + ancestor_lsn: None, traversal_path, - )); + backtrace: if cfg!(test) { + Some(std::backtrace::Backtrace::force_capture()) + } else { + None + }, + })); } } @@ -2848,11 +2910,7 @@ impl Timeline { }; cont_lsn = lsn_floor; // metrics: open_layer does not count as fs access, so we are not updating `read_count` - traversal_path.push(( - result, - cont_lsn, - Box::new(move || open_layer.traversal_id()), - )); + traversal_path.push((result, cont_lsn, open_layer.traversal_id())); continue 'outer; } } @@ -2879,11 +2937,7 @@ impl Timeline { }; cont_lsn = lsn_floor; // metrics: open_layer does not count as fs access, so we are not updating `read_count` - traversal_path.push(( - result, - cont_lsn, - Box::new(move || frozen_layer.traversal_id()), - )); + traversal_path.push((result, cont_lsn, frozen_layer.traversal_id())); continue 'outer; } } @@ -2904,14 +2958,7 @@ impl Timeline { }; cont_lsn = lsn_floor; *read_count += 1; - traversal_path.push(( - result, - cont_lsn, - Box::new({ - let layer = layer.to_owned(); - move || layer.traversal_id() - }), - )); + traversal_path.push((result, cont_lsn, layer.traversal_id())); continue 'outer; } else if timeline.ancestor_timeline.is_some() { // Nothing on this timeline. Traverse to parent @@ -4656,35 +4703,7 @@ impl Timeline { } } -type TraversalPathItem = ( - ValueReconstructResult, - Lsn, - Box TraversalId>, -); - -/// Helper function for get_reconstruct_data() to add the path of layers traversed -/// to an error, as anyhow context information. 
-fn layer_traversal_error(msg: String, path: Vec) -> PageReconstructError { - // We want the original 'msg' to be the outermost context. The outermost context - // is the most high-level information, which also gets propagated to the client. - let mut msg_iter = path - .into_iter() - .map(|(r, c, l)| { - format!( - "layer traversal: result {:?}, cont_lsn {}, layer: {}", - r, - c, - l(), - ) - }) - .chain(std::iter::once(msg)); - // Construct initial message from the first traversed layer - let err = anyhow!(msg_iter.next().unwrap()); - - // Append all subsequent traversals, and the error message 'msg', as contexts. - let msg = msg_iter.fold(err, |err, msg| err.context(msg)); - PageReconstructError::from(msg) -} +type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId); struct TimelineWriterState { open_layer: Arc, From e69ff3fc00ab8be31e8f69eb3726da1b83d84180 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 22 Apr 2024 19:40:08 +0300 Subject: [PATCH 0598/1571] Refactor updating relation size cache on reads (#7376) Instead of trusting that a request with latest == true means that the request LSN was at least last_record_lsn, remember explicitly when the relation cache was initialized. Incidentally, this allows updating the relation size cache also on reads from read-only endpoints, when the endpoint is at a relatively recent LSN (more recent than the end of the timeline when the timeline was loaded in the pageserver). Add a comment to wait_or_get_last_lsn() that it might be better to use an older LSN when possible. Note that doing that would be unsafe, without the relation cache changes in this commit! --- pageserver/src/page_service.rs | 5 +++++ pageserver/src/pgdatadir_mapping.rs | 29 +++++++++++++++-------------- pageserver/src/tenant/timeline.rs | 17 +++++++++++++++-- 3 files changed, 35 insertions(+), 16 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 3b9a30ba4c..62782d8dd3 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -874,6 +874,11 @@ impl PageServerHandler { // walsender completes the authentication and starts streaming the // WAL. if lsn <= last_record_lsn { + // It might be better to use max(lsn, latest_gc_cutoff_lsn) instead + // last_record_lsn. That would give the same result, since we know + // that there haven't been modifications since 'lsn'. Using an older + // LSN might be faster, because that could allow skipping recent + // layers when finding the page. lsn = last_record_lsn; } else { timeline diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 351a766b10..4a9682dcac 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -252,16 +252,8 @@ impl Timeline { let mut buf = version.get(self, key, ctx).await?; let nblocks = buf.get_u32_le(); - if latest { - // Update relation size cache only if "latest" flag is set. - // This flag is set by compute when it is working with most recent version of relation. - // Typically master compute node always set latest=true. - // Please notice, that even if compute node "by mistake" specifies old LSN but set - // latest=true, then it can not cause cache corruption, because with latest=true - // pageserver choose max(request_lsn, last_written_lsn) and so cached value will be - // associated with most recent value of LSN. 
- self.update_cached_rel_size(tag, version.get_lsn(), nblocks); - } + self.update_cached_rel_size(tag, version.get_lsn(), nblocks); + Ok(nblocks) } @@ -817,7 +809,7 @@ impl Timeline { /// Get cached size of relation if it not updated after specified LSN pub fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option { let rel_size_cache = self.rel_size_cache.read().unwrap(); - if let Some((cached_lsn, nblocks)) = rel_size_cache.get(tag) { + if let Some((cached_lsn, nblocks)) = rel_size_cache.map.get(tag) { if lsn >= *cached_lsn { return Some(*nblocks); } @@ -828,7 +820,16 @@ impl Timeline { /// Update cached relation size if there is no more recent update pub fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) { let mut rel_size_cache = self.rel_size_cache.write().unwrap(); - match rel_size_cache.entry(tag) { + + if lsn < rel_size_cache.complete_as_of { + // Do not cache old values. It's safe to cache the size on read, as long as + // the read was at an LSN since we started the WAL ingestion. Reasoning: we + // never evict values from the cache, so if the relation size changed after + // 'lsn', the new value is already in the cache. + return; + } + + match rel_size_cache.map.entry(tag) { hash_map::Entry::Occupied(mut entry) => { let cached_lsn = entry.get_mut(); if lsn >= cached_lsn.0 { @@ -844,13 +845,13 @@ impl Timeline { /// Store cached relation size pub fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) { let mut rel_size_cache = self.rel_size_cache.write().unwrap(); - rel_size_cache.insert(tag, (lsn, nblocks)); + rel_size_cache.map.insert(tag, (lsn, nblocks)); } /// Remove cached relation size pub fn remove_cached_rel_size(&self, tag: &RelTag) { let mut rel_size_cache = self.rel_size_cache.write().unwrap(); - rel_size_cache.remove(tag); + rel_size_cache.map.remove(tag); } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index e707c3b244..fa7d219fb0 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -182,6 +182,16 @@ pub(crate) struct AuxFilesState { pub(crate) n_deltas: usize, } +/// The relation size cache caches relation sizes at the end of the timeline. It speeds up WAL +/// ingestion considerably, because WAL ingestion needs to check on most records if the record +/// implicitly extends the relation. At startup, `complete_as_of` is initialized to the current end +/// of the timeline (disk_consistent_lsn). It's used on reads of relation sizes to check if the +/// value can be used to also update the cache, see [`Timeline::update_cached_rel_size`]. 
+pub(crate) struct RelSizeCache { + pub(crate) complete_as_of: Lsn, + pub(crate) map: HashMap, +} + pub struct Timeline { conf: &'static PageServerConf, tenant_conf: Arc>, @@ -324,7 +334,7 @@ pub struct Timeline { pub walreceiver: Mutex>, /// Relation size cache - pub rel_size_cache: RwLock>, + pub(crate) rel_size_cache: RwLock, download_all_remote_layers_task_info: RwLock>, @@ -1951,7 +1961,10 @@ impl Timeline { last_image_layer_creation_check_at: AtomicLsn::new(0), last_received_wal: Mutex::new(None), - rel_size_cache: RwLock::new(HashMap::new()), + rel_size_cache: RwLock::new(RelSizeCache { + complete_as_of: disk_consistent_lsn, + map: HashMap::new(), + }), download_all_remote_layers_task_info: RwLock::new(None), From d551bfee091abed46152f26c06e86a216ab8ac08 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 23 Apr 2024 11:36:56 +0100 Subject: [PATCH 0599/1571] pageserver: remove import/export script previously used for breaking format changes (#7458) ## Problem The `export_import_between_pageservers` script us to do major storage format changes in the past. If we have to do such breaking changes in the future this approach wouldn't be suitable because: 1. It doesn't scale to the current size of the fleet 2. It loses history ## Summary of changes Remove the script and its associated test. Keep `fullbasebackup` and friends because it's useful for debugging. Closes https://github.com/neondatabase/cloud/issues/11648 --- pageserver/src/page_service.rs | 4 + scripts/export_import_between_pageservers.py | 730 ------------------ test_runner/regress/test_tenant_relocation.py | 49 +- 3 files changed, 8 insertions(+), 775 deletions(-) delete mode 100755 scripts/export_import_between_pageservers.py diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 62782d8dd3..fa6b81ac72 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1206,6 +1206,10 @@ impl PageServerHandler { )) } + /// Note on "fullbackup": + /// Full basebackups should only be used for debugging purposes. + /// Originally, it was introduced to enable breaking storage format changes, + /// but that is not applicable anymore. #[allow(clippy::too_many_arguments)] #[instrument(skip_all, fields(shard_id, ?lsn, ?prev_lsn, %full_backup))] async fn handle_basebackup_request( diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py deleted file mode 100755 index 84b69cb36a..0000000000 --- a/scripts/export_import_between_pageservers.py +++ /dev/null @@ -1,730 +0,0 @@ -# -# Script to export tenants from one pageserver and import them into another page server. -# -# Outline of steps: -# 1. Get `(last_lsn, prev_lsn)` from old pageserver -# 2. Get `fullbackup` from old pageserver, which creates a basebackup tar file -# 3. This tar file might be missing relation files for empty relations, if the pageserver -# is old enough (we didn't always store those). So to recreate them, we start a local -# vanilla postgres on this basebackup and ask it what relations should exist, then touch -# any missing files and re-pack the tar. -# TODO This functionality is no longer needed, so we can delete it later if we don't -# end up using the same utils for the pg 15 upgrade. Not sure. -# 4. We import the patched basebackup into a new pageserver -# 5. We export again via fullbackup, now from the new pageserver and compare the returned -# tar file with the one we imported. 
This confirms that we imported everything that was -# exported, but doesn't guarantee correctness (what if we didn't **export** everything -# initially?) -# 6. We wait for the new pageserver's remote_consistent_lsn to catch up -# -# For more context on how to use this, see: -# https://www.notion.so/neondatabase/Storage-format-migration-9a8eba33ccf8417ea8cf50e6a0c542cf - -import argparse -import os -import shutil -import subprocess -import tempfile -import time -import uuid -from contextlib import closing -from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, cast - -import psycopg2 -import requests -from psycopg2.extensions import connection as PgConnection -from psycopg2.extensions import parse_dsn - -############################################### -### client-side utils copied from test fixtures -############################################### - -Env = Dict[str, str] - -_global_counter = 0 - - -def global_counter() -> int: - """A really dumb global counter. - This is useful for giving output files a unique number, so if we run the - same command multiple times we can keep their output separate. - """ - global _global_counter - _global_counter += 1 - return _global_counter - - -def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str: - """Run a process and capture its output - Output will go to files named "cmd_NNN.stdout" and "cmd_NNN.stderr" - where "cmd" is the name of the program and NNN is an incrementing - counter. - If those files already exist, we will overwrite them. - Returns basepath for files with captured output. - """ - assert isinstance(cmd, list) - base = f"{os.path.basename(cmd[0])}_{global_counter()}" - basepath = os.path.join(capture_dir, base) - stdout_filename = basepath + ".stdout" - stderr_filename = basepath + ".stderr" - - with open(stdout_filename, "w") as stdout_f: - with open(stderr_filename, "w") as stderr_f: - print(f'(capturing output to "{base}.stdout")') - subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f) - - return basepath - - -class PgBin: - """A helper class for executing postgres binaries""" - - def __init__(self, log_dir: Path, pg_distrib_dir, pg_version): - self.log_dir = log_dir - self.pg_bin_path = os.path.join(str(pg_distrib_dir), f"v{pg_version}", "bin") - self.env = os.environ.copy() - self.env["LD_LIBRARY_PATH"] = os.path.join(str(pg_distrib_dir), f"v{pg_version}", "lib") - - def _fixpath(self, command: List[str]): - if "/" not in command[0]: - command[0] = os.path.join(self.pg_bin_path, command[0]) - - def _build_env(self, env_add: Optional[Env]) -> Env: - if env_add is None: - return self.env - env = self.env.copy() - env.update(env_add) - return env - - def run(self, command: List[str], env: Optional[Env] = None, cwd: Optional[str] = None): - """ - Run one of the postgres binaries. - The command should be in list form, e.g. ['pgbench', '-p', '55432'] - All the necessary environment variables will be set. - If the first argument (the command name) doesn't include a path (no '/' - characters present), then it will be edited to include the correct path. - If you want stdout/stderr captured to files, use `run_capture` instead. 
- """ - - self._fixpath(command) - print(f'Running command "{" ".join(command)}"') - env = self._build_env(env) - subprocess.run(command, env=env, cwd=cwd, check=True) - - def run_capture( - self, - command: List[str], - env: Optional[Env] = None, - cwd: Optional[str] = None, - **kwargs: Any, - ) -> str: - """ - Run one of the postgres binaries, with stderr and stdout redirected to a file. - This is just like `run`, but for chatty programs. Returns basepath for files - with captured output. - """ - - self._fixpath(command) - print(f'Running command "{" ".join(command)}"') - env = self._build_env(env) - return subprocess_capture( - str(self.log_dir), command, env=env, cwd=cwd, check=True, **kwargs - ) - - -class PgProtocol: - """Reusable connection logic""" - - def __init__(self, **kwargs): - self.default_options = kwargs - - def conn_options(self, **kwargs): - conn_options = self.default_options.copy() - if "dsn" in kwargs: - conn_options.update(parse_dsn(kwargs["dsn"])) - conn_options.update(kwargs) - - # Individual statement timeout in seconds. 2 minutes should be - # enough for our tests, but if you need a longer, you can - # change it by calling "SET statement_timeout" after - # connecting. - conn_options["options"] = f"-cstatement_timeout=120s {conn_options.get('options', '')}" - - return conn_options - - # autocommit=True here by default because that's what we need most of the time - def connect(self, autocommit=True, **kwargs) -> PgConnection: - """ - Connect to the node. - Returns psycopg2's connection object. - This method passes all extra params to connstr. - """ - conn: PgConnection = psycopg2.connect(**self.conn_options(**kwargs)) - - # WARNING: this setting affects *all* tests! - conn.autocommit = autocommit - return conn - - def safe_psql(self, query: str, **kwargs: Any) -> List[Tuple[Any, ...]]: - """ - Execute query against the node and return all rows. - This method passes all extra params to connstr. - """ - return self.safe_psql_many([query], **kwargs)[0] - - def safe_psql_many(self, queries: List[str], **kwargs: Any) -> List[List[Tuple[Any, ...]]]: - """ - Execute queries against the node and return all rows. - This method passes all extra params to connstr. 
- """ - result: List[List[Any]] = [] - with closing(self.connect(**kwargs)) as conn: - with conn.cursor() as cur: - for query in queries: - print(f"Executing query: {query}") - cur.execute(query) - - if cur.description is None: - result.append([]) # query didn't return data - else: - result.append(cast(List[Any], cur.fetchall())) - return result - - -class VanillaPostgres(PgProtocol): - def __init__(self, pgdatadir: Path, pg_bin: PgBin, port: int, init=True): - super().__init__(host="localhost", port=port, dbname="postgres") - self.pgdatadir = pgdatadir - self.pg_bin = pg_bin - self.running = False - if init: - self.pg_bin.run_capture(["initdb", "-D", str(pgdatadir)]) - self.configure([f"port = {port}\n"]) - - def configure(self, options: List[str]): - """Append lines into postgresql.conf file.""" - assert not self.running - with open(os.path.join(self.pgdatadir, "postgresql.conf"), "a") as conf_file: - conf_file.write("\n".join(options)) - - def start(self, log_path: Optional[str] = None): - assert not self.running - self.running = True - - log_path = log_path or os.path.join(self.pgdatadir, "pg.log") - - self.pg_bin.run_capture( - ["pg_ctl", "-w", "-D", str(self.pgdatadir), "-l", log_path, "start"] - ) - - def stop(self): - assert self.running - self.running = False - self.pg_bin.run_capture(["pg_ctl", "-w", "-D", str(self.pgdatadir), "stop"]) - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc, tb): - if self.running: - self.stop() - - -class NeonPageserverApiException(Exception): - pass - - -class NeonPageserverHttpClient(requests.Session): - def __init__(self, host, port): - super().__init__() - self.host = host - self.port = port - - def verbose_error(self, res: requests.Response): - try: - res.raise_for_status() - except requests.RequestException as e: - try: - msg = res.json()["msg"] - except: # noqa: E722 - msg = "" - raise NeonPageserverApiException(msg) from e - - def check_status(self): - self.get(f"http://{self.host}:{self.port}/v1/status").raise_for_status() - - def tenant_list(self): - res = self.get(f"http://{self.host}:{self.port}/v1/tenant") - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, list) - return res_json - - def tenant_create(self, new_tenant_id: uuid.UUID, ok_if_exists): - res = self.post( - f"http://{self.host}:{self.port}/v1/tenant", - json={"new_tenant_id": new_tenant_id.hex, "generation": 1}, - ) - - if res.status_code == 409: - if ok_if_exists: - print(f"could not create tenant: already exists for id {new_tenant_id}") - else: - res.raise_for_status() - elif res.status_code == 201: - print(f"created tenant {new_tenant_id}") - else: - self.verbose_error(res) - - return new_tenant_id - - def timeline_list(self, tenant_id: uuid.UUID): - res = self.get(f"http://{self.host}:{self.port}/v1/tenant/{tenant_id.hex}/timeline") - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, list) - return res_json - - def timeline_detail(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID) -> Dict[Any, Any]: - res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}?include-non-incremental-logical-size=true" - ) - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, dict) - return res_json - - -def lsn_to_hex(num: int) -> str: - """Convert lsn from int to standard hex notation.""" - return f"{num >> 32:X}/{num & 0xFFFFFFFF:X}" - - -def lsn_from_hex(lsn_hex: str) -> int: - """Convert lsn from hex notation to int.""" - left, right = 
lsn_hex.split("/") - return (int(left, 16) << 32) + int(right, 16) - - -def remote_consistent_lsn( - pageserver_http_client: NeonPageserverHttpClient, tenant: uuid.UUID, timeline: uuid.UUID -) -> int: - detail = pageserver_http_client.timeline_detail(tenant, timeline) - - lsn_str = detail["remote_consistent_lsn"] - assert isinstance(lsn_str, str) - return lsn_from_hex(lsn_str) - - -def wait_for_upload( - pageserver_http_client: NeonPageserverHttpClient, - tenant: uuid.UUID, - timeline: uuid.UUID, - lsn: int, -): - """waits for local timeline upload up to specified lsn""" - for i in range(10): - current_lsn = remote_consistent_lsn(pageserver_http_client, tenant, timeline) - if current_lsn >= lsn: - return - print( - f"waiting for remote_consistent_lsn to reach {lsn_to_hex(lsn)}, now {lsn_to_hex(current_lsn)}, iteration {i + 1}" - ) - time.sleep(1) - - raise Exception( - f"timed out while waiting for remote_consistent_lsn to reach {lsn_to_hex(lsn)}, was {lsn_to_hex(current_lsn)}" - ) - - -############## -# End of utils -############## - - -def pack_base(log_dir, restored_dir, output_tar): - """Create tar file from basebackup, being careful to produce relative filenames.""" - tmp_tar_name = "tmp.tar" - tmp_tar_path = os.path.join(restored_dir, tmp_tar_name) - cmd = ["tar", "-cf", tmp_tar_name] + os.listdir(restored_dir) - # We actually cd into the dir and call tar from there. If we call tar from - # outside we won't encode filenames as relative, and they won't parse well - # on import. - subprocess_capture(log_dir, cmd, cwd=restored_dir) - shutil.move(tmp_tar_path, output_tar) - - -def reconstruct_paths(log_dir, pg_bin, base_tar, port: int): - """Reconstruct what relation files should exist in the datadir by querying postgres.""" - with tempfile.TemporaryDirectory() as restored_dir: - # Unpack the base tar - subprocess_capture(log_dir, ["tar", "-xf", base_tar, "-C", restored_dir]) - - # Start a vanilla postgres from the given datadir and query it to find - # what relfiles should exist, but possibly don't. 
- with VanillaPostgres(Path(restored_dir), pg_bin, port, init=False) as vanilla_pg: - vanilla_pg.configure([f"port={port}"]) - vanilla_pg.start(log_path=os.path.join(log_dir, "tmp_pg.log")) - - # Create database based on template0 because we can't connect to template0 - query = "create database template0copy template template0" - vanilla_pg.safe_psql(query, user="cloud_admin") - vanilla_pg.safe_psql("CHECKPOINT", user="cloud_admin") - - # Get all databases - query = "select oid, datname from pg_database" - oid_dbname_pairs = vanilla_pg.safe_psql(query, user="cloud_admin") - template0_oid = [ - oid for (oid, database) in oid_dbname_pairs if database == "template0" - ][0] - - # Get rel paths for each database - for oid, database in oid_dbname_pairs: - if database == "template0": - # We can't connect to template0 - continue - - query = "select relname, pg_relation_filepath(oid) from pg_class" - result = vanilla_pg.safe_psql(query, user="cloud_admin", dbname=database) - for _relname, filepath in result: - if filepath is not None: - if database == "template0copy": - # Add all template0copy paths to template0 - prefix = f"base/{oid}/" - if filepath.startswith(prefix): - suffix = filepath[len(prefix) :] - yield f"base/{template0_oid}/{suffix}" - elif filepath.startswith("global"): - print(f"skipping {database} global file {filepath}") - else: - raise AssertionError - else: - yield filepath - - -def touch_missing_rels(log_dir, corrupt_tar, output_tar, paths): - """Add the appropriate empty files to a basebadkup tar.""" - with tempfile.TemporaryDirectory() as restored_dir: - # Unpack the base tar - subprocess_capture(log_dir, ["tar", "-xf", corrupt_tar, "-C", restored_dir]) - - # Touch files that don't exist - for path in paths: - absolute_path = os.path.join(restored_dir, path) - exists = os.path.exists(absolute_path) - if not exists: - print(f"File {absolute_path} didn't exist. Creating..") - Path(absolute_path).touch() - - # Repackage - pack_base(log_dir, restored_dir, output_tar) - - -# HACK This is a workaround for exporting from old pageservers that -# can't export empty relations. In this case we need to start -# a vanilla postgres from the exported datadir, and query it -# to see what empty relations are missing, and then create -# those empty files before importing. 
-def add_missing_rels(base_tar, output_tar, log_dir, pg_bin, tmp_pg_port: int): - reconstructed_paths = set(reconstruct_paths(log_dir, pg_bin, base_tar, tmp_pg_port)) - touch_missing_rels(log_dir, base_tar, output_tar, reconstructed_paths) - - -def get_rlsn(pageserver_connstr, tenant_id, timeline_id): - with closing(psycopg2.connect(pageserver_connstr)) as conn: - conn.autocommit = True - with conn.cursor() as cur: - cmd = f"get_last_record_rlsn {tenant_id} {timeline_id}" - cur.execute(cmd) - res = cur.fetchone() - assert res is not None - prev_lsn = res[0] - last_lsn = res[1] - - return last_lsn, prev_lsn - - -def import_timeline( - args, - psql_path, - pageserver_connstr, - pageserver_http, - tenant_id, - timeline_id, - last_lsn, - prev_lsn, - tar_filename, - pg_version, -): - # Import timelines to new pageserver - import_cmd = f"import basebackup {tenant_id} {timeline_id} {last_lsn} {last_lsn} {pg_version}" - full_cmd = rf"""cat {tar_filename} | {psql_path} {pageserver_connstr} -c '{import_cmd}' """ - - stderr_filename2 = os.path.join(args.work_dir, f"import_{tenant_id}_{timeline_id}.stderr") - stdout_filename = os.path.join(args.work_dir, f"import_{tenant_id}_{timeline_id}.stdout") - - print(f"Running: {full_cmd}") - - with open(stdout_filename, "w") as stdout_f: - with open(stderr_filename2, "w") as stderr_f: - print(f"(capturing output to {stdout_filename})") - pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version) - subprocess.run( - full_cmd, - stdout=stdout_f, - stderr=stderr_f, - env=pg_bin._build_env(None), - shell=True, - check=True, - ) - - print("Done import") - - # Wait until pageserver persists the files - wait_for_upload( - pageserver_http, uuid.UUID(tenant_id), uuid.UUID(timeline_id), lsn_from_hex(last_lsn) - ) - - -def export_timeline( - args, - psql_path, - pageserver_connstr, - tenant_id, - timeline_id, - last_lsn, - prev_lsn, - tar_filename, - pg_version, -): - # Choose filenames - incomplete_filename = tar_filename + ".incomplete" - stderr_filename = os.path.join(args.work_dir, f"{tenant_id}_{timeline_id}.stderr") - - # Construct export command - query = f"fullbackup {tenant_id} {timeline_id} {last_lsn} {prev_lsn}" - cmd = [psql_path, "--no-psqlrc", pageserver_connstr, "-c", query] - - # Run export command - print(f"Running: {cmd}") - with open(incomplete_filename, "w") as stdout_f: - with open(stderr_filename, "w") as stderr_f: - print(f"(capturing output to {incomplete_filename})") - pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version) - subprocess.run( - cmd, stdout=stdout_f, stderr=stderr_f, env=pg_bin._build_env(None), check=True - ) - - # Add missing rels - pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version) - add_missing_rels(incomplete_filename, tar_filename, args.work_dir, pg_bin, args.tmp_pg_port) - - # Log more info - file_size = os.path.getsize(tar_filename) - print(f"Done export: {tar_filename}, size {file_size}") - - -def main(args: argparse.Namespace): - # any psql version will do here. 
use current DEFAULT_PG_VERSION = 15 - psql_path = str(Path(args.pg_distrib_dir) / "v15" / "bin" / "psql") - - old_pageserver_host = args.old_pageserver_host - new_pageserver_host = args.new_pageserver_host - - old_http_client = NeonPageserverHttpClient(old_pageserver_host, args.old_pageserver_http_port) - old_http_client.check_status() - old_pageserver_connstr = f"postgresql://{old_pageserver_host}:{args.old_pageserver_pg_port}" - - new_http_client = NeonPageserverHttpClient(new_pageserver_host, args.new_pageserver_http_port) - new_http_client.check_status() - new_pageserver_connstr = f"postgresql://{new_pageserver_host}:{args.new_pageserver_pg_port}" - - for tenant_id in args.tenants: - print(f"Tenant: {tenant_id}") - timelines = old_http_client.timeline_list(uuid.UUID(tenant_id)) - print(f"Timelines: {timelines}") - - # Create tenant in new pageserver - if args.only_import is False and not args.timelines: - new_http_client.tenant_create(uuid.UUID(tenant_id), args.ok_if_exists) - - for timeline in timelines: - # Skip timelines we don't need to export - if args.timelines and timeline["timeline_id"] not in args.timelines: - print(f"Skipping timeline {timeline['timeline_id']}") - continue - - # Choose filenames - tar_filename = os.path.join( - args.work_dir, f"{timeline['tenant_id']}_{timeline['timeline_id']}.tar" - ) - - pg_version = timeline["pg_version"] - - # Export timeline from old pageserver - if args.only_import is False: - last_lsn, prev_lsn = get_rlsn( - old_pageserver_connstr, - timeline["tenant_id"], - timeline["timeline_id"], - ) - export_timeline( - args, - psql_path, - old_pageserver_connstr, - timeline["tenant_id"], - timeline["timeline_id"], - last_lsn, - prev_lsn, - tar_filename, - pg_version, - ) - - # Import into new pageserver - import_timeline( - args, - psql_path, - new_pageserver_connstr, - new_http_client, - timeline["tenant_id"], - timeline["timeline_id"], - last_lsn, - prev_lsn, - tar_filename, - pg_version, - ) - - # Re-export and compare - re_export_filename = tar_filename + ".reexport" - export_timeline( - args, - psql_path, - new_pageserver_connstr, - timeline["tenant_id"], - timeline["timeline_id"], - last_lsn, - prev_lsn, - re_export_filename, - pg_version, - ) - - # Check the size is the same - old_size = (os.path.getsize(tar_filename),) - new_size = (os.path.getsize(re_export_filename),) - if old_size != new_size: - raise AssertionError(f"Sizes don't match old: {old_size} new: {new_size}") - - -def non_zero_tcp_port(arg: Any): - port = int(arg) - if port < 1 or port > 65535: - raise argparse.ArgumentTypeError(f"invalid tcp port: {arg}") - return port - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--tenant-id", - dest="tenants", - required=True, - nargs="+", - help="Id of the tenant to migrate. You can pass multiple arguments", - ) - parser.add_argument( - "--timeline-id", - dest="timelines", - required=False, - nargs="+", - help="Id of the timeline to migrate. You can pass multiple arguments", - ) - parser.add_argument( - "--from-host", - dest="old_pageserver_host", - required=True, - help="Host of the pageserver to migrate data from", - ) - parser.add_argument( - "--from-http-port", - dest="old_pageserver_http_port", - required=False, - type=int, - default=9898, - help="HTTP port of the pageserver to migrate data from. 
Default: 9898", - ) - parser.add_argument( - "--from-pg-port", - dest="old_pageserver_pg_port", - required=False, - type=int, - default=6400, - help="pg port of the pageserver to migrate data from. Default: 6400", - ) - parser.add_argument( - "--to-host", - dest="new_pageserver_host", - required=True, - help="Host of the pageserver to migrate data to", - ) - parser.add_argument( - "--to-http-port", - dest="new_pageserver_http_port", - required=False, - default=9898, - type=int, - help="HTTP port of the pageserver to migrate data to. Default: 9898", - ) - parser.add_argument( - "--to-pg-port", - dest="new_pageserver_pg_port", - required=False, - default=6400, - type=int, - help="pg port of the pageserver to migrate data to. Default: 6400", - ) - parser.add_argument( - "--ignore-tenant-exists", - dest="ok_if_exists", - required=False, - help="Ignore error if we are trying to create the tenant that already exists. It can be dangerous if existing tenant already contains some data.", - ) - parser.add_argument( - "--pg-distrib-dir", - dest="pg_distrib_dir", - required=False, - default="/usr/local/", - help="Path where postgres binaries are installed. Default: /usr/local/", - ) - parser.add_argument( - "--psql-path", - dest="psql_path", - required=False, - default="/usr/local/v14/bin/psql", - help="Path to the psql binary. Default: /usr/local/v14/bin/psql", - ) - parser.add_argument( - "--only-import", - dest="only_import", - required=False, - default=False, - action="store_true", - help="Skip export and tenant creation part", - ) - parser.add_argument( - "--work-dir", - dest="work_dir", - required=True, - default=False, - help="directory where temporary tar files are stored", - ) - parser.add_argument( - "--tmp-pg-port", - dest="tmp_pg_port", - required=False, - default=55439, - type=non_zero_tcp_port, - help="localhost port to use for temporary postgres instance", - ) - args = parser.parse_args() - main(args) diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index 9def3ad1c2..68d9d9a660 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -16,7 +16,6 @@ from fixtures.pageserver.utils import ( wait_for_upload, wait_tenant_status_404, ) -from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import ( LocalFsStorage, RemoteStorageKind, @@ -24,7 +23,6 @@ from fixtures.remote_storage import ( from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import ( query_scalar, - subprocess_capture, wait_until, ) @@ -184,20 +182,14 @@ def post_migration_check(endpoint: Endpoint, sum_before_migration: int, old_loca # A minor migration involves no storage breaking changes. # It is done by attaching the tenant to a new pageserver. "minor", - # A major migration involves exporting a postgres datadir - # basebackup and importing it into the new pageserver. - # This kind of migration can tolerate breaking changes - # to storage format - "major", + # In the unlikely and unfortunate event that we have to break + # the storage format, extend this test with the param below. 
+ # "major", ], ) @pytest.mark.parametrize("with_load", ["with_load", "without_load"]) def test_tenant_relocation( neon_env_builder: NeonEnvBuilder, - port_distributor: PortDistributor, - test_output_dir: Path, - neon_binpath: Path, - base_dir: Path, method: str, with_load: str, ): @@ -299,40 +291,7 @@ def test_tenant_relocation( current_lsn=current_lsn_second, ) - # Migrate either by attaching from s3 or import/export basebackup - if method == "major": - cmd = [ - "poetry", - "run", - "python", - str(base_dir / "scripts/export_import_between_pageservers.py"), - "--tenant-id", - str(tenant_id), - "--from-host", - "localhost", - "--from-http-port", - str(origin_http.port), - "--from-pg-port", - str(origin_ps.service_port.pg), - "--to-host", - "localhost", - "--to-http-port", - str(destination_http.port), - "--to-pg-port", - str(destination_ps.service_port.pg), - "--pg-distrib-dir", - str(neon_env_builder.pg_distrib_dir), - "--work-dir", - str(test_output_dir), - "--tmp-pg-port", - str(port_distributor.get_port()), - ] - subprocess_capture(test_output_dir, cmd, check=True) - - destination_ps.allowed_errors.append( - ".*ignored .* unexpected bytes after the tar archive.*" - ) - elif method == "minor": + if method == "minor": # call to attach timeline to new pageserver destination_ps.tenant_attach(tenant_id) From fa12d6023781e3d3972e77a8cc4be58bc24dd810 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 23 Apr 2024 13:42:58 +0200 Subject: [PATCH 0600/1571] Don't pass tenant_id in location_config requests from storage controller (#7476) Tested this locally via a simple patch, the `tenant_id` is now gone from the json. Follow-up of #7055, prerequisite for #7469. --- libs/pageserver_api/src/models.rs | 1 + pageserver/client/src/mgmt_api.rs | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index f441d1ff1a..e334a68a1e 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -429,6 +429,7 @@ pub struct StatusResponse { #[derive(Serialize, Deserialize, Debug)] #[serde(deny_unknown_fields)] pub struct TenantLocationConfigRequest { + #[serde(skip_serializing_if = "Option::is_none")] pub tenant_id: Option, #[serde(flatten)] pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 3c9982ffb8..892e6c2758 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -279,7 +279,7 @@ impl Client { lazy: bool, ) -> Result<()> { let req_body = TenantLocationConfigRequest { - tenant_id: Some(tenant_shard_id), + tenant_id: None, config, }; From a9fda8c8327b39c9d543bf22c02186c279cc152a Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 23 Apr 2024 14:03:33 +0100 Subject: [PATCH 0601/1571] pageserver: fix vectored read aux key handling (#7404) ## Problem Vectored get would descend into ancestor timelines for aux files. This is not the behaviour of the legacy read path and blocks cutting over to the vectored read path. Fixes https://github.com/neondatabase/neon/issues/7379 ## Summary of Changes Treat non inherited keys specially in vectored get. At the point when we want to descend into the ancestor mark all pending non inherited keys as errored out at the key level. Note that this diverges from the standard vectored get behaviour for missing keys which is a top level error. 
This divergence is required to avoid blocking compaction in case such an error is encountered when compaction aux files keys. I'm pretty sure the bug I just described predates the vectored get implementation, but it's still worth fixing. --- libs/pageserver_api/src/key.rs | 8 ++-- libs/pageserver_api/src/keyspace.rs | 53 ++++++++++++++++++++++++--- pageserver/src/tenant.rs | 57 +++++++++++++++++++++++++++++ pageserver/src/tenant/timeline.rs | 45 ++++++++++++++++++++++- 4 files changed, 152 insertions(+), 11 deletions(-) diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 852670af2c..1d66dd8878 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -48,11 +48,11 @@ impl Key { } } - pub fn next(&self) -> Key { + pub const fn next(&self) -> Key { self.add(1) } - pub fn add(&self, x: u32) -> Key { + pub const fn add(&self, x: u32) -> Key { let mut key = *self; let r = key.field6.overflowing_add(x); @@ -475,12 +475,14 @@ pub const AUX_FILES_KEY: Key = Key { // Reverse mappings for a few Keys. // These are needed by WAL redo manager. +pub const NON_INHERITED_RANGE: Range = AUX_FILES_KEY..AUX_FILES_KEY.next(); + // AUX_FILES currently stores only data for logical replication (slots etc), and // we don't preserve these on a branch because safekeepers can't follow timeline // switch (and generally it likely should be optional), so ignore these. #[inline(always)] pub fn is_inherited_key(key: Key) -> bool { - key != AUX_FILES_KEY + !NON_INHERITED_RANGE.contains(&key) } #[inline(always)] diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index 05fa4562e1..78e4a3d735 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -94,12 +94,13 @@ impl KeySpace { /// Remove all keys in `other` from `self`. /// This can involve splitting or removing of existing ranges. - pub fn remove_overlapping_with(&mut self, other: &KeySpace) { + /// Returns the removed keyspace + pub fn remove_overlapping_with(&mut self, other: &KeySpace) -> KeySpace { let (self_start, self_end) = match (self.start(), self.end()) { (Some(start), Some(end)) => (start, end), _ => { // self is empty - return; + return KeySpace::default(); } }; @@ -112,30 +113,37 @@ impl KeySpace { .skip_while(|range| self_start >= range.end) .take_while(|range| self_end > range.start); + let mut removed_accum = KeySpaceRandomAccum::new(); for range in other_ranges { while let Some(overlap_at) = self.overlaps_at(range) { let overlapped = self.ranges[overlap_at].clone(); if overlapped.start < range.start && overlapped.end <= range.end { // Higher part of the range is completely overlapped. + removed_accum.add_range(range.start..self.ranges[overlap_at].end); self.ranges[overlap_at].end = range.start; } if overlapped.start >= range.start && overlapped.end > range.end { // Lower part of the range is completely overlapped. + removed_accum.add_range(self.ranges[overlap_at].start..range.end); self.ranges[overlap_at].start = range.end; } if overlapped.start < range.start && overlapped.end > range.end { // Middle part of the range is overlapped. 
+ removed_accum.add_range(range.clone()); self.ranges[overlap_at].end = range.start; self.ranges .insert(overlap_at + 1, range.end..overlapped.end); } if overlapped.start >= range.start && overlapped.end <= range.end { // Whole range is overlapped + removed_accum.add_range(self.ranges[overlap_at].clone()); self.ranges.remove(overlap_at); } } } + + removed_accum.to_keyspace() } pub fn start(&self) -> Option { @@ -553,7 +561,16 @@ mod tests { Key::from_i128(11)..Key::from_i128(13), ], }; - key_space1.remove_overlapping_with(&key_space2); + let removed = key_space1.remove_overlapping_with(&key_space2); + let removed_expected = KeySpace { + ranges: vec![ + Key::from_i128(2)..Key::from_i128(3), + Key::from_i128(6)..Key::from_i128(7), + Key::from_i128(11)..Key::from_i128(12), + ], + }; + assert_eq!(removed, removed_expected); + assert_eq!( key_space1.ranges, vec![ @@ -583,7 +600,17 @@ mod tests { Key::from_i128(14)..Key::from_i128(17), ], }; - key_space1.remove_overlapping_with(&key_space2); + + let removed = key_space1.remove_overlapping_with(&key_space2); + let removed_expected = KeySpace { + ranges: vec![ + Key::from_i128(3)..Key::from_i128(5), + Key::from_i128(8)..Key::from_i128(10), + Key::from_i128(14)..Key::from_i128(15), + ], + }; + assert_eq!(removed, removed_expected); + assert_eq!( key_space1.ranges, vec![ @@ -610,7 +637,11 @@ mod tests { Key::from_i128(15)..Key::from_i128(17), ], }; - key_space1.remove_overlapping_with(&key_space2); + + let removed = key_space1.remove_overlapping_with(&key_space2); + let removed_expected = KeySpace::default(); + assert_eq!(removed, removed_expected); + assert_eq!( key_space1.ranges, vec![ @@ -637,7 +668,17 @@ mod tests { let key_space2 = KeySpace { ranges: vec![Key::from_i128(9)..Key::from_i128(19)], }; - key_space1.remove_overlapping_with(&key_space2); + + let removed = key_space1.remove_overlapping_with(&key_space2); + let removed_expected = KeySpace { + ranges: vec![ + Key::from_i128(9)..Key::from_i128(10), + Key::from_i128(12)..Key::from_i128(15), + Key::from_i128(17)..Key::from_i128(19), + ], + }; + assert_eq!(removed, removed_expected); + assert_eq!( key_space1.ranges, vec![ diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 15be6df637..098bad71fb 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3859,6 +3859,7 @@ mod tests { use crate::DEFAULT_PG_VERSION; use bytes::BytesMut; use hex_literal::hex; + use pageserver_api::key::NON_INHERITED_RANGE; use pageserver_api::keyspace::KeySpace; use rand::{thread_rng, Rng}; use tests::timeline::{GetVectoredError, ShutdownMode}; @@ -4658,6 +4659,62 @@ mod tests { Ok(()) } + #[tokio::test] + async fn test_get_vectored_aux_files() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_get_vectored_aux_files")?; + + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx) + .await?; + let tline = tline.raw_timeline().unwrap(); + + let mut modification = tline.begin_modification(Lsn(0x1000)); + modification.put_file("foo/bar1", b"content1", &ctx).await?; + modification.set_lsn(Lsn(0x1008))?; + modification.put_file("foo/bar2", b"content2", &ctx).await?; + modification.commit(&ctx).await?; + + let child_timeline_id = TimelineId::generate(); + tenant + .branch_timeline_test( + tline, + child_timeline_id, + Some(tline.get_last_record_lsn()), + &ctx, + ) + .await?; + + let child_timeline = tenant + .get_timeline(child_timeline_id, true) + .expect("Should have the branched 
timeline"); + + let aux_keyspace = KeySpace { + ranges: vec![NON_INHERITED_RANGE], + }; + let read_lsn = child_timeline.get_last_record_lsn(); + + let vectored_res = child_timeline + .get_vectored_impl(aux_keyspace.clone(), read_lsn, &ctx) + .await; + + child_timeline + .validate_get_vectored_impl(&vectored_res, aux_keyspace, read_lsn, &ctx) + .await; + + let images = vectored_res?; + let mut key = NON_INHERITED_RANGE.start; + while key < NON_INHERITED_RANGE.end { + assert!(matches!( + images[&key], + Err(PageReconstructError::MissingKey(_)) + )); + key = key.next(); + } + + Ok(()) + } + // Test that vectored get handles layer gaps correctly // by advancing into the next ancestor timeline if required. // diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index fa7d219fb0..fb5ee0a8fa 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -16,7 +16,7 @@ use enumset::EnumSet; use fail::fail_point; use once_cell::sync::Lazy; use pageserver_api::{ - key::AUX_FILES_KEY, + key::{AUX_FILES_KEY, NON_INHERITED_RANGE}, keyspace::KeySpaceAccum, models::{ CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, @@ -943,7 +943,13 @@ impl Timeline { Err(MissingKey(MissingKeyError { stuck_at_lsn: false, .. - })) => return Err(GetVectoredError::MissingKey(key)), + })) if !NON_INHERITED_RANGE.contains(&key) => { + // The vectored read path handles non inherited keys specially. + // If such a a key cannot be reconstructed from the current timeline, + // the vectored read path returns a key level error as opposed to a top + // level error. + return Err(GetVectoredError::MissingKey(key)); + } _ => { values.insert(key, block); key = key.next(); @@ -3024,6 +3030,41 @@ impl Timeline { .await?; keyspace.remove_overlapping_with(&completed); + + // Do not descend into the ancestor timeline for aux files. + // We don't return a blanket [`GetVectoredError::MissingKey`] to avoid + // stalling compaction. + // TODO(chi): this will need to be updated for aux files v2 storage + if keyspace.overlaps(&NON_INHERITED_RANGE) { + let removed = keyspace.remove_overlapping_with(&KeySpace { + ranges: vec![NON_INHERITED_RANGE], + }); + + for range in removed.ranges { + let mut key = range.start; + while key < range.end { + reconstruct_state.on_key_error( + key, + PageReconstructError::MissingKey(MissingKeyError { + stuck_at_lsn: false, + key, + shard: self.shard_identity.get_shard_number(&key), + cont_lsn, + request_lsn, + ancestor_lsn: None, + traversal_path: Vec::default(), + backtrace: if cfg!(test) { + Some(std::backtrace::Backtrace::force_capture()) + } else { + None + }, + }), + ); + key = key.next(); + } + } + } + if keyspace.total_size() == 0 || timeline.ancestor_timeline.is_none() { break; } From 28e7fa98c4d8f8ef96fd2931f03543f8e06a2389 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 23 Apr 2024 14:05:02 +0100 Subject: [PATCH 0602/1571] pageserver: add read depth metrics and test (#7464) ## Problem We recently went through an incident where compaction was inhibited by a bug. We didn't observe this until quite late because we did not have alerting on deep reads. ## Summary of changes + Tweak an existing metric that tracks the depth of a read on the non-vectored read path: * Give it a better name * Track all layers * Larger buckets + Add a similar metric for the vectored read path + Add a compaction smoke test which uses these metrics. This test would have caught the compaction issue mentioned earlier. 
Related https://github.com/neondatabase/neon/issues/7428 --- pageserver/src/metrics.rs | 20 ++++-- pageserver/src/tenant/storage_layer.rs | 10 +++ pageserver/src/tenant/timeline.rs | 15 ++++- test_runner/fixtures/metrics.py | 2 +- test_runner/regress/test_compaction.py | 93 ++++++++++++++++++++++++++ 5 files changed, 131 insertions(+), 9 deletions(-) create mode 100644 test_runner/regress/test_compaction.py diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index e6db95082b..66bf21ddec 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -86,11 +86,20 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub(crate) static READ_NUM_FS_LAYERS: Lazy = Lazy::new(|| { +pub(crate) static READ_NUM_LAYERS_VISITED: Lazy = Lazy::new(|| { register_histogram!( - "pageserver_read_num_fs_layers", - "Number of persistent layers accessed for processing a read request, including those in the cache", - vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0], + "pageserver_layers_visited_per_read_global", + "Number of layers visited to reconstruct one key", + vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0], + ) + .expect("failed to define a metric") +}); + +pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_layers_visited_per_vectored_read_global", + "Average number of layers visited to reconstruct one key", + vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0], ) .expect("failed to define a metric") }); @@ -2771,7 +2780,8 @@ pub fn preinitialize_metrics() { // histograms [ - &READ_NUM_FS_LAYERS, + &READ_NUM_LAYERS_VISITED, + &VEC_READ_NUM_LAYERS_VISITED, &WAIT_LSN_TIME, &WAL_REDO_TIME, &WAL_REDO_RECORDS_HISTOGRAM, diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 9a2b086828..9ddd916700 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -118,6 +118,7 @@ pub(crate) struct ValuesReconstructState { pub(crate) keys: HashMap>, keys_done: KeySpaceRandomAccum, + layers_visited: u32, } impl ValuesReconstructState { @@ -125,6 +126,7 @@ impl ValuesReconstructState { Self { keys: HashMap::new(), keys_done: KeySpaceRandomAccum::new(), + layers_visited: 0, } } @@ -138,6 +140,14 @@ impl ValuesReconstructState { } } + pub(crate) fn on_layer_visited(&mut self) { + self.layers_visited += 1; + } + + pub(crate) fn get_layers_visited(&self) -> u32 { + self.layers_visited + } + /// Update the state collected for a given key. /// Returns true if this was the last value needed for the key and false otherwise. /// diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index fb5ee0a8fa..2fbe3c63a2 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -973,6 +973,7 @@ impl Timeline { .await?; let mut results: BTreeMap> = BTreeMap::new(); + let layers_visited = reconstruct_state.get_layers_visited(); for (key, res) in reconstruct_state.keys { match res { Err(err) => { @@ -987,6 +988,12 @@ impl Timeline { } } + // Note that this is an approximation. Tracking the exact number of layers visited + // per key requires virtually unbounded memory usage and is inefficient + // (i.e. 
segment tree tracking each range queried from a layer) + crate::metrics::VEC_READ_NUM_LAYERS_VISITED + .observe(layers_visited as f64 / results.len() as f64); + Ok(results) } @@ -2813,7 +2820,7 @@ impl Timeline { let mut timeline = self; let mut read_count = scopeguard::guard(0, |cnt| { - crate::metrics::READ_NUM_FS_LAYERS.observe(cnt as f64) + crate::metrics::READ_NUM_LAYERS_VISITED.observe(cnt as f64) }); // For debugging purposes, collect the path of layers that we traversed @@ -2928,7 +2935,7 @@ impl Timeline { Err(e) => return Err(PageReconstructError::from(e)), }; cont_lsn = lsn_floor; - // metrics: open_layer does not count as fs access, so we are not updating `read_count` + *read_count += 1; traversal_path.push((result, cont_lsn, open_layer.traversal_id())); continue 'outer; } @@ -2955,7 +2962,7 @@ impl Timeline { Err(e) => return Err(PageReconstructError::from(e)), }; cont_lsn = lsn_floor; - // metrics: open_layer does not count as fs access, so we are not updating `read_count` + *read_count += 1; traversal_path.push((result, cont_lsn, frozen_layer.traversal_id())); continue 'outer; } @@ -3183,6 +3190,8 @@ impl Timeline { unmapped_keyspace = keyspace_to_read; cont_lsn = next_cont_lsn; + + reconstruct_state.on_layer_visited(); } else { break; } diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index c615dd154f..7d34e12ca3 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -129,7 +129,7 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = ( "pageserver_getpage_reconstruct_seconds_sum", *[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]], *histogram("pageserver_smgr_query_seconds_global"), - *histogram("pageserver_read_num_fs_layers"), + *histogram("pageserver_layers_visited_per_read_global"), *histogram("pageserver_getpage_get_reconstruct_data_seconds"), *histogram("pageserver_wait_lsn_seconds"), *histogram("pageserver_remote_operation_seconds"), diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py new file mode 100644 index 0000000000..37b87b92a9 --- /dev/null +++ b/test_runner/regress/test_compaction.py @@ -0,0 +1,93 @@ +import os + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.workload import Workload + +AGGRESIVE_COMPACTION_TENANT_CONF = { + # Disable gc and compaction. The test runs compaction manually. + "gc_period": "0s", + "compaction_period": "0s", + # Small checkpoint distance to create many layers + "checkpoint_distance": 1024**2, + # Compact small layers + "compaction_target_size": 1024**2, + "image_creation_threshold": 2, + # INC-186: remove when merging the fix + "image_layer_creation_check_threshold": 0, +} + + +@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build") +def test_pageserver_compaction_smoke(neon_env_builder: NeonEnvBuilder): + """ + This is a smoke test that compaction kicks in. The workload repeatedly churns + a small number of rows and manually instructs the pageserver to run compaction + between iterations. At the end of the test validate that the average number of + layers visited to gather reconstruct data for a given key is within the empirically + observed bounds. + """ + + # Effectively disable the page cache to rely only on image layers + # to shorten reads. 
+ neon_env_builder.pageserver_config_override = """ +page_cache_size=10 +""" + + env = neon_env_builder.init_start(initial_tenant_conf=AGGRESIVE_COMPACTION_TENANT_CONF) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + row_count = 10000 + churn_rounds = 100 + + ps_http = env.pageserver.http_client() + + workload = Workload(env, tenant_id, timeline_id) + workload.init(env.pageserver.id) + + log.info("Writing initial data ...") + workload.write_rows(row_count, env.pageserver.id) + + for i in range(1, churn_rounds + 1): + if i % 10 == 0: + log.info(f"Running churn round {i}/{churn_rounds} ...") + + workload.churn_rows(row_count, env.pageserver.id) + ps_http.timeline_compact(tenant_id, timeline_id) + + log.info("Validating at workload end ...") + workload.validate(env.pageserver.id) + + log.info("Checking layer access metrics ...") + + layer_access_metric_names = [ + "pageserver_layers_visited_per_read_global_sum", + "pageserver_layers_visited_per_read_global_count", + "pageserver_layers_visited_per_read_global_bucket", + "pageserver_layers_visited_per_vectored_read_global_sum", + "pageserver_layers_visited_per_vectored_read_global_count", + "pageserver_layers_visited_per_vectored_read_global_bucket", + ] + + metrics = env.pageserver.http_client().get_metrics() + for name in layer_access_metric_names: + layer_access_metrics = metrics.query_all(name) + log.info(f"Got metrics: {layer_access_metrics}") + + non_vectored_sum = metrics.query_one("pageserver_layers_visited_per_read_global_sum") + non_vectored_count = metrics.query_one("pageserver_layers_visited_per_read_global_count") + non_vectored_average = non_vectored_sum.value / non_vectored_count.value + + vectored_sum = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_sum") + vectored_count = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_count") + vectored_average = vectored_sum.value / vectored_count.value + + log.info(f"{non_vectored_average=} {vectored_average=}") + + # The upper bound for average number of layer visits below (8) + # was chosen empirically for this workload. + assert non_vectored_average < 8 + assert vectored_average < 8 From 8426fb886bcb19e509b2d4d40a0682316163685f Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 23 Apr 2024 14:20:12 +0100 Subject: [PATCH 0603/1571] storage_controller: wait for db on startup (#7479) ## Problem In some dev/test environments, there aren't health checks to guarantee the database is available before starting the controller. This creates friction for the developer. 
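The change boils down to a bounded retry loop around connection establishment. A minimal sketch of that shape (names and the error type are illustrative; the actual implementation is `Persistence::await_connection` in the diff, which is async and uses `tokio::time::sleep`):

```rust
use std::time::{Duration, Instant};

/// Keep retrying `connect` until it succeeds or `timeout` elapses,
/// sleeping briefly between attempts.
fn wait_for<E>(mut connect: impl FnMut() -> Result<(), E>, timeout: Duration) -> Result<(), E> {
    let started_at = Instant::now();
    loop {
        match connect() {
            Ok(()) => return Ok(()),
            // Give up once the deadline has passed, surfacing the last error.
            Err(e) if started_at.elapsed() > timeout => return Err(e),
            Err(_) => std::thread::sleep(Duration::from_millis(100)),
        }
    }
}
```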
## Summary of changes - Wait up to 5 seconds for the database to become available on startup --- storage_controller/src/main.rs | 3 +++ storage_controller/src/persistence.rs | 26 ++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 6466b9f7a3..ca55d6c593 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -5,6 +5,7 @@ use diesel::Connection; use metrics::launch_timestamp::LaunchTimestamp; use metrics::BuildInfo; use std::sync::Arc; +use std::time::Duration; use storage_controller::http::make_router; use storage_controller::metrics::preinitialize_metrics; use storage_controller::persistence::Persistence; @@ -245,6 +246,8 @@ async fn async_main() -> anyhow::Result<()> { }; // After loading secrets & config, but before starting anything else, apply database migrations + Persistence::await_connection(&secrets.database_url, Duration::from_secs(5)).await?; + migration_run(&secrets.database_url) .await .context("Running database migrations")?; diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 5312e1e218..dca37166ba 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -2,6 +2,7 @@ pub(crate) mod split_state; use std::collections::HashMap; use std::str::FromStr; use std::time::Duration; +use std::time::Instant; use self::split_state::SplitState; use camino::Utf8Path; @@ -144,6 +145,31 @@ impl Persistence { } } + /// A helper for use during startup, where we would like to tolerate concurrent restarts of the + /// database and the storage controller, therefore the database might not be available right away + pub async fn await_connection( + database_url: &str, + timeout: Duration, + ) -> Result<(), diesel::ConnectionError> { + let started_at = Instant::now(); + loop { + match PgConnection::establish(database_url) { + Ok(_) => { + tracing::info!("Connected to database."); + return Ok(()); + } + Err(e) => { + if started_at.elapsed() > timeout { + return Err(e); + } else { + tracing::info!("Database not yet available, waiting... ({e})"); + tokio::time::sleep(Duration::from_millis(100)).await; + } + } + } + } + } + /// Wraps `with_conn` in order to collect latency and error metrics async fn with_measured_conn(&self, op: DatabaseOperation, func: F) -> DatabaseResult where From 89f023e6b0d18f39b08197d0db9875aa1fe924ed Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Tue, 23 Apr 2024 11:16:04 -0400 Subject: [PATCH 0604/1571] feat(pageserver): add metadata key range and aux key encoding (#7401) Extracted from https://github.com/neondatabase/neon/pull/7375. We assume everything >= 0x80 are metadata keys. AUX file keys are part of the metadata keys, and we use `0x90` as the prefix for AUX file keys. The AUX file encoding is described in the code comment. We use xxhash128 as the hash algorithm. It seems to be portable according to the introduction, > xxHash is an Extremely fast Hash algorithm, processing at RAM speed limits. Code is highly portable, and produces hashes identical across all platforms (little / big endian). ...though whether the Rust version follows the same convention is unknown and might need manual review of the library. Anyways, we can always change the hash algorithm before rolling it out in staging/end-user, and I made a quick decision to use xxhash here because it generates 128b hash + portable. We can save the discussion of which hash algorithm to use later. 
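Concretely, the encoding packs a 16-byte metadata key as one AUX prefix byte (`0x82`, the constant actually added below), two directory-prefix bytes, and the first 13 bytes of the big-endian 128-bit hash. A minimal standalone sketch (same `twox_hash` call as the patch; the helper name and comments are illustrative restatements of the code added below):

```rust
/// [0x82, dir1, dir2, hash[0..13]] -- the layout used for AUX file keys.
/// The real encoder is `aux_file::encode_aux_file_key`, which also maps
/// known paths such as `pg_logical/mappings/` to their directory bytes.
fn aux_key_bytes(dir_level1: u8, dir_level2: u8, file_name: &[u8]) -> [u8; 16] {
    let hash = twox_hash::xxh3::hash128(file_name).to_be_bytes();
    let mut key = [0u8; 16];
    key[0] = 0x82; // AUX_KEY_PREFIX: everything >= 0x80 is a metadata key
    key[1] = dir_level1; // e.g. 0x01 for pg_logical/
    key[2] = dir_level2; // e.g. 0x01 for mappings/
    key[3..16].copy_from_slice(&hash[0..13]);
    key
}
```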
--------- Signed-off-by: Alex Chi Z --- Cargo.lock | 1 + libs/pageserver_api/src/key.rs | 93 +++++++++++++++++++++++ pageserver/Cargo.toml | 1 + pageserver/src/aux_file.rs | 112 ++++++++++++++++++++++++++++ pageserver/src/lib.rs | 1 + pageserver/src/pgdatadir_mapping.rs | 2 +- 6 files changed, 209 insertions(+), 1 deletion(-) create mode 100644 pageserver/src/aux_file.rs diff --git a/Cargo.lock b/Cargo.lock index cff07239e7..85a59ec0ed 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3658,6 +3658,7 @@ dependencies = [ "tokio-util", "toml_edit", "tracing", + "twox-hash", "url", "utils", "walkdir", diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 1d66dd8878..01919e8325 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -1,8 +1,10 @@ use anyhow::{bail, Result}; use byteorder::{ByteOrder, BE}; +use bytes::BufMut; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::{Oid, TransactionId}; use serde::{Deserialize, Serialize}; +use std::ops::RangeInclusive; use std::{fmt, ops::Range}; use crate::reltag::{BlockNumber, RelTag, SlruKind}; @@ -21,9 +23,81 @@ pub struct Key { pub field6: u32, } +/// The storage key size. pub const KEY_SIZE: usize = 18; +/// The metadata key size. 2B fewer than the storage key size because field2 is not fully utilized. +/// See [`Key::to_i128`] for more information on the encoding. +pub const METADATA_KEY_SIZE: usize = 16; + +/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x80 is a metadata key. +pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x80; + +/// The (reserved) key prefix of relation sizes. +pub const RELATION_SIZE_PREFIX: u8 = 0x81; + +/// The key prefix of AUX file keys. +pub const AUX_KEY_PREFIX: u8 = 0x82; + +/// Check if the key falls in the range of metadata keys. +pub const fn is_metadata_key_slice(key: &[u8]) -> bool { + key[0] >= METADATA_KEY_BEGIN_PREFIX +} + impl Key { + /// Check if the key falls in the range of metadata keys. + pub const fn is_metadata_key(&self) -> bool { + self.field1 >= METADATA_KEY_BEGIN_PREFIX + } + + /// Encode a metadata key to a storage key. + pub fn from_metadata_key_fixed_size(key: &[u8; METADATA_KEY_SIZE]) -> Self { + assert!(is_metadata_key_slice(key), "key not in metadata key range"); + Key { + field1: key[0], + field2: u16::from_be_bytes(key[1..3].try_into().unwrap()) as u32, + field3: u32::from_be_bytes(key[3..7].try_into().unwrap()), + field4: u32::from_be_bytes(key[7..11].try_into().unwrap()), + field5: key[11], + field6: u32::from_be_bytes(key[12..16].try_into().unwrap()), + } + } + + /// Encode a metadata key to a storage key. + pub fn from_metadata_key(key: &[u8]) -> Self { + Self::from_metadata_key_fixed_size(key.try_into().expect("expect 16 byte metadata key")) + } + + /// Extract a metadata key to a writer. The result should always be 16 bytes. + pub fn extract_metadata_key_to_writer(&self, mut writer: impl BufMut) { + writer.put_u8(self.field1); + assert!(self.field2 <= 0xFFFF); + writer.put_u16(self.field2 as u16); + writer.put_u32(self.field3); + writer.put_u32(self.field4); + writer.put_u8(self.field5); + writer.put_u32(self.field6); + } + + /// Get the range of metadata keys. 
+ pub fn metadata_key_range() -> RangeInclusive { + Key { + field1: METADATA_KEY_BEGIN_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + }..=Key { + field1: u8::MAX, + field2: u16::MAX as u32, + field3: u32::MAX, + field4: u32::MAX, + field5: u8::MAX, + field6: u32::MAX, + } + } + /// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish. /// As long as Neon does not support tablespace (because of lack of access to local file system), /// we can assume that only some predefined namespace OIDs are used which can fit in u16 @@ -81,6 +155,8 @@ impl Key { key } + /// Convert a 18B slice to a key. This function should not be used for metadata keys because field2 is handled differently. + /// Use [`Key::from_metadata_key`] instead. pub fn from_slice(b: &[u8]) -> Self { Key { field1: b[0], @@ -92,6 +168,8 @@ impl Key { } } + /// Convert a key to a 18B slice. This function should not be used for metadata keys because field2 is handled differently. + /// Use [`Key::extract_metadata_key_to_writer`] instead. pub fn write_to_byte_slice(&self, buf: &mut [u8]) { buf[0] = self.field1; BE::write_u32(&mut buf[1..5], self.field2); @@ -558,11 +636,14 @@ impl std::str::FromStr for Key { mod tests { use std::str::FromStr; + use crate::key::is_metadata_key_slice; use crate::key::Key; use rand::Rng; use rand::SeedableRng; + use super::AUX_KEY_PREFIX; + #[test] fn display_fromstr_bijection() { let mut rng = rand::rngs::StdRng::seed_from_u64(42); @@ -578,4 +659,16 @@ mod tests { assert_eq!(key, Key::from_str(&format!("{key}")).unwrap()); } + + #[test] + fn test_metadata_keys() { + let mut metadata_key = vec![AUX_KEY_PREFIX]; + metadata_key.extend_from_slice(&[0xFF; 15]); + let encoded_key = Key::from_metadata_key(&metadata_key); + let mut output_key = Vec::new(); + encoded_key.extract_metadata_key_to_writer(&mut output_key); + assert_eq!(metadata_key, output_key); + assert!(encoded_key.is_metadata_key()); + assert!(is_metadata_key_slice(&metadata_key)); + } } diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 7a11610a91..4335f38f1e 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -70,6 +70,7 @@ tokio-stream.workspace = true tokio-util.workspace = true toml_edit = { workspace = true, features = [ "serde" ] } tracing.workspace = true +twox-hash.workspace = true url.workspace = true walkdir.workspace = true metrics.workspace = true diff --git a/pageserver/src/aux_file.rs b/pageserver/src/aux_file.rs new file mode 100644 index 0000000000..aba4ccf19d --- /dev/null +++ b/pageserver/src/aux_file.rs @@ -0,0 +1,112 @@ +use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE}; +use tracing::warn; + +/// Create a metadata key from a hash, encoded as [AUX_KEY_PREFIX, 2B directory prefix, first 13B of 128b xxhash]. +fn aux_hash_to_metadata_key(dir_level1: u8, dir_level2: u8, data: &[u8]) -> Key { + let mut key = [0; METADATA_KEY_SIZE]; + let hash = twox_hash::xxh3::hash128(data).to_be_bytes(); + key[0] = AUX_KEY_PREFIX; + key[1] = dir_level1; + key[2] = dir_level2; + key[3..16].copy_from_slice(&hash[0..13]); + Key::from_metadata_key_fixed_size(&key) +} + +const AUX_DIR_PG_LOGICAL: u8 = 0x01; +const AUX_DIR_PG_REPLSLOT: u8 = 0x02; +const AUX_DIR_PG_UNKNOWN: u8 = 0xFF; + +/// Encode the aux file into a fixed-size key. +/// +/// The first byte is the AUX key prefix. We use the next 2 bytes of the key for the directory / aux file type. +/// We have one-to-one mapping for each of the aux file that we support. 
We hash the remaining part of the path +/// (usually a single file name, or several components) into 13-byte hash. The way we determine the 2-byte prefix +/// is roughly based on the first two components of the path, one unique number for one component. +/// +/// * pg_logical/mappings -> 0x0101 +/// * pg_logical/snapshots -> 0x0102 +/// * pg_logical/replorigin_checkpoint -> 0x0103 +/// * pg_logical/others -> 0x01FF +/// * pg_replslot/ -> 0x0201 +/// * others -> 0xFFFF +/// +/// If you add new AUX files to this function, please also add a test case to `test_encoding_portable`. +/// The new file type must have never been written to the storage before. Otherwise, there could be data +/// corruptions as the new file belongs to a new prefix but it might have been stored under the `others` prefix. +pub fn encode_aux_file_key(path: &str) -> Key { + if let Some(fname) = path.strip_prefix("pg_logical/mappings/") { + aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x01, fname.as_bytes()) + } else if let Some(fname) = path.strip_prefix("pg_logical/snapshots/") { + aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x02, fname.as_bytes()) + } else if path == "pg_logical/replorigin_checkpoint" { + aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x03, b"") + } else if let Some(fname) = path.strip_prefix("pg_logical/") { + if cfg!(debug_assertions) { + warn!( + "unsupported pg_logical aux file type: {}, putting to 0x01FF, would affect path scanning", + path + ); + } + aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0xFF, fname.as_bytes()) + } else if let Some(fname) = path.strip_prefix("pg_replslot/") { + aux_hash_to_metadata_key(AUX_DIR_PG_REPLSLOT, 0x01, fname.as_bytes()) + } else { + if cfg!(debug_assertions) { + warn!( + "unsupported aux file type: {}, putting to 0xFFFF, would affect path scanning", + path + ); + } + aux_hash_to_metadata_key(AUX_DIR_PG_UNKNOWN, 0xFF, path.as_bytes()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_hash_portable() { + // AUX file encoding requires the hash to be portable across all platforms. This test case checks + // if the algorithm produces the same hash across different environments. + assert_eq!( + 305317690835051308206966631765527126151, + twox_hash::xxh3::hash128("test1".as_bytes()) + ); + assert_eq!( + 85104974691013376326742244813280798847, + twox_hash::xxh3::hash128("test/test2".as_bytes()) + ); + assert_eq!(0, twox_hash::xxh3::hash128("".as_bytes())); + } + + #[test] + fn test_encoding_portable() { + // To correct retrieve AUX files, the generated keys for the same file must be the same for all versions + // of the page server. 
+ assert_eq!( + "8200000101E5B20C5F8DD5AA3289D6D9EAFA", + encode_aux_file_key("pg_logical/mappings/test1").to_string() + ); + assert_eq!( + "820000010239AAC544893139B26F501B97E6", + encode_aux_file_key("pg_logical/snapshots/test2").to_string() + ); + assert_eq!( + "820000010300000000000000000000000000", + encode_aux_file_key("pg_logical/replorigin_checkpoint").to_string() + ); + assert_eq!( + "82000001FF8635AF2134B7266EC5B4189FD6", + encode_aux_file_key("pg_logical/unsupported").to_string() + ); + assert_eq!( + "8200000201772D0E5D71DE14DA86142A1619", + encode_aux_file_key("pg_replslot/test3").to_string() + ); + assert_eq!( + "820000FFFF1866EBEB53B807B26A2416F317", + encode_aux_file_key("other_file_not_supported").to_string() + ); + } +} diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index f947a75f61..930700e50c 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -12,6 +12,7 @@ pub mod disk_usage_eviction_task; pub mod http; pub mod import_datadir; pub use pageserver_api::keyspace; +pub mod aux_file; pub mod metrics; pub mod page_cache; pub mod page_service; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 4a9682dcac..c733b38acb 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1402,7 +1402,7 @@ impl<'a> DatadirModification<'a> { let n_files; let mut aux_files = self.tline.aux_files.lock().await; if let Some(mut dir) = aux_files.dir.take() { - // We already updated aux files in `self`: emit a delta and update our latest value + // We already updated aux files in `self`: emit a delta and update our latest value. dir.upsert(file_path.clone(), content.clone()); n_files = dir.files.len(); if aux_files.n_deltas == MAX_AUX_FILE_DELTAS { From e22c072064ac32e4d9af7e6813beeb392f6d5ffe Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 23 Apr 2024 16:24:51 +0100 Subject: [PATCH 0605/1571] remote_storage: fix prefix handling in remote storage & clean up (#7431) ## Problem Split off from https://github.com/neondatabase/neon/pull/7399, which is the first piece of code that does a WithDelimiter object listing using a prefix that isn't a full directory name. ## Summary of changes - Revise list function to not append a `/` to the prefix -- prefixes don't have to end with a slash. - Fix local_fs implementation of list to not assume that WithDelimiter case will always use a directory as a prerfix. 
- Remove `list_files`, `list_prefixes` wrappers, as they add little value and obscure the underlying list function -- we need callers to understand the semantics of what they're really calling (listobjectsv2) --- libs/remote_storage/src/lib.rs | 94 ++---- libs/remote_storage/src/local_fs.rs | 292 +++++++++++------- libs/remote_storage/src/s3_bucket.rs | 44 ++- libs/remote_storage/src/simulate_failures.rs | 21 -- libs/remote_storage/tests/common/tests.rs | 51 ++- libs/remote_storage/tests/test_real_azure.rs | 4 - libs/remote_storage/tests/test_real_s3.rs | 21 +- .../src/tenant/remote_timeline_client.rs | 16 +- .../tenant/remote_timeline_client/download.rs | 11 +- safekeeper/src/wal_backup.rs | 19 +- 10 files changed, 305 insertions(+), 268 deletions(-) diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index e708854be2..14c391ca53 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -134,6 +134,11 @@ impl RemotePath { pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Utf8Path, std::path::StripPrefixError> { self.0.strip_prefix(&p.0) } + + pub fn add_trailing_slash(&self) -> Self { + // Unwrap safety inputs are guararnteed to be valid UTF-8 + Self(format!("{}/", self.0).try_into().unwrap()) + } } /// We don't need callers to be able to pass arbitrary delimiters: just control @@ -157,47 +162,21 @@ pub struct Listing { /// providing basic CRUD operations for storage files. #[allow(async_fn_in_trait)] pub trait RemoteStorage: Send + Sync + 'static { - /// Lists all top level subdirectories for a given prefix - /// Note: here we assume that if the prefix is passed it was obtained via remote_object_id - /// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS) - /// so this method doesnt need to. - async fn list_prefixes( - &self, - prefix: Option<&RemotePath>, - cancel: &CancellationToken, - ) -> Result, DownloadError> { - let result = self - .list(prefix, ListingMode::WithDelimiter, None, cancel) - .await? - .prefixes; - Ok(result) - } - /// Lists all files in directory "recursively" - /// (not really recursively, because AWS has a flat namespace) - /// Note: This is subtely different than list_prefixes, - /// because it is for listing files instead of listing - /// names sharing common prefixes. - /// For example, - /// list_files("foo/bar") = ["foo/bar/cat123.txt", - /// "foo/bar/cat567.txt", "foo/bar/dog123.txt", "foo/bar/dog456.txt"] - /// whereas, - /// list_prefixes("foo/bar/") = ["cat", "dog"] - /// See `test_real_s3.rs` for more details. + /// List objects in remote storage, with semantics matching AWS S3's ListObjectsV2. + /// (see ``) + /// + /// Note that the prefix is relative to any `prefix_in_bucket` configured for the client, not + /// from the absolute root of the bucket. + /// + /// `mode` configures whether to use a delimiter. Without a delimiter all keys + /// within the prefix are listed in the `keys` of the result. With a delimiter, any "directories" at the top level of + /// the prefix are returned in the `prefixes` of the result, and keys in the top level of the prefix are + /// returned in `keys` (). + /// + /// `max_keys` controls the maximum number of keys that will be returned. If this is None, this function + /// will iteratively call listobjects until it runs out of keys. Note that this is not safe to use on + /// unlimted size buckets, as the full list of objects is allocated into a monolithic data structure. 
/// - /// max_keys limits max number of keys returned; None means unlimited. - async fn list_files( - &self, - prefix: Option<&RemotePath>, - max_keys: Option, - cancel: &CancellationToken, - ) -> Result, DownloadError> { - let result = self - .list(prefix, ListingMode::NoDelimiter, max_keys, cancel) - .await? - .keys; - Ok(result) - } - async fn list( &self, prefix: Option<&RemotePath>, @@ -336,41 +315,6 @@ impl GenericRemoteStorage> { } } - // A function for listing all the files in a "directory" - // Example: - // list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"] - // - // max_keys limits max number of keys returned; None means unlimited. - pub async fn list_files( - &self, - folder: Option<&RemotePath>, - max_keys: Option, - cancel: &CancellationToken, - ) -> Result, DownloadError> { - match self { - Self::LocalFs(s) => s.list_files(folder, max_keys, cancel).await, - Self::AwsS3(s) => s.list_files(folder, max_keys, cancel).await, - Self::AzureBlob(s) => s.list_files(folder, max_keys, cancel).await, - Self::Unreliable(s) => s.list_files(folder, max_keys, cancel).await, - } - } - - // lists common *prefixes*, if any of files - // Example: - // list_prefixes("foo123","foo567","bar123","bar432") = ["foo", "bar"] - pub async fn list_prefixes( - &self, - prefix: Option<&RemotePath>, - cancel: &CancellationToken, - ) -> Result, DownloadError> { - match self { - Self::LocalFs(s) => s.list_prefixes(prefix, cancel).await, - Self::AwsS3(s) => s.list_prefixes(prefix, cancel).await, - Self::AzureBlob(s) => s.list_prefixes(prefix, cancel).await, - Self::Unreliable(s) => s.list_prefixes(prefix, cancel).await, - } - } - /// See [`RemoteStorage::upload`] pub async fn upload( &self, diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 8cad863731..1f7bcfc982 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -5,11 +5,9 @@ //! volume is mounted to the local FS. use std::{ - borrow::Cow, - future::Future, + collections::HashSet, io::ErrorKind, num::NonZeroU32, - pin::Pin, time::{Duration, SystemTime, UNIX_EPOCH}, }; @@ -22,11 +20,11 @@ use tokio::{ io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}, }; use tokio_util::{io::ReaderStream, sync::CancellationToken}; -use tracing::*; -use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty}; +use utils::crashsafe::path_with_suffix_extension; use crate::{ Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel, + REMOTE_STORAGE_PREFIX_SEPARATOR, }; use super::{RemoteStorage, StorageMetadata}; @@ -93,7 +91,47 @@ impl LocalFs { #[cfg(test)] async fn list_all(&self) -> anyhow::Result> { - Ok(get_all_files(&self.storage_root, true) + use std::{future::Future, pin::Pin}; + fn get_all_files<'a, P>( + directory_path: P, + ) -> Pin>> + Send + Sync + 'a>> + where + P: AsRef + Send + Sync + 'a, + { + Box::pin(async move { + let directory_path = directory_path.as_ref(); + if directory_path.exists() { + if directory_path.is_dir() { + let mut paths = Vec::new(); + let mut dir_contents = fs::read_dir(directory_path).await?; + while let Some(dir_entry) = dir_contents.next_entry().await? 
{ + let file_type = dir_entry.file_type().await?; + let entry_path = + Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| { + anyhow::Error::msg(format!( + "non-Unicode path: {}", + pb.to_string_lossy() + )) + })?; + if file_type.is_symlink() { + tracing::debug!("{entry_path:?} is a symlink, skipping") + } else if file_type.is_dir() { + paths.extend(get_all_files(&entry_path).await?.into_iter()) + } else { + paths.push(entry_path); + } + } + Ok(paths) + } else { + bail!("Path {directory_path:?} is not a directory") + } + } else { + Ok(Vec::new()) + } + }) + } + + Ok(get_all_files(&self.storage_root) .await? .into_iter() .map(|path| { @@ -120,6 +158,14 @@ impl LocalFs { // S3 object list prefixes can be arbitrary strings, but when reading // the local filesystem we need a directory to start calling read_dir on. let mut initial_dir = full_path.clone(); + + // If there's no trailing slash, we have to start looking from one above: even if + // `initial_dir` is a directory, we should still list any prefixes in the parent + // that start with the same string. + if !full_path.to_string().ends_with('/') { + initial_dir.pop(); + } + loop { // Did we make it to the root? if initial_dir.parent().is_none() { @@ -295,61 +341,66 @@ impl RemoteStorage for LocalFs { let op = async { let mut result = Listing::default(); - if let ListingMode::NoDelimiter = mode { - let keys = self - .list_recursive(prefix) - .await - .map_err(DownloadError::Other)?; - - result.keys = keys - .into_iter() - .filter(|k| { - let path = k.with_base(&self.storage_root); - !path.is_dir() - }) - .collect(); - - if let Some(max_keys) = max_keys { - result.keys.truncate(max_keys.get() as usize); - } - - return Ok(result); - } - - let path = match prefix { - Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)), - None => Cow::Borrowed(&self.storage_root), - }; - - let prefixes_to_filter = get_all_files(path.as_ref(), false) + // Filter out directories: in S3 directories don't exist, only the keys within them do. + let keys = self + .list_recursive(prefix) .await .map_err(DownloadError::Other)?; + let keys = keys + .into_iter() + .filter(|k| { + let path = k.with_base(&self.storage_root); + !path.is_dir() + }) + .collect(); - // filter out empty directories to mirror s3 behavior. - for prefix in prefixes_to_filter { - if prefix.is_dir() - && is_directory_empty(&prefix) - .await - .map_err(DownloadError::Other)? - { - continue; - } - - let stripped = prefix - .strip_prefix(&self.storage_root) - .context("Failed to strip prefix") - .and_then(RemotePath::new) - .expect( - "We list files for storage root, hence should be able to remote the prefix", - ); - - if prefix.is_dir() { - result.prefixes.push(stripped); - } else { - result.keys.push(stripped); + if let ListingMode::NoDelimiter = mode { + result.keys = keys; + } else { + let mut prefixes = HashSet::new(); + for key in keys { + // If the part after the prefix includes a "/", take only the first part and put it in `prefixes`. + let relative_key = if let Some(prefix) = prefix { + let mut prefix = prefix.clone(); + // We only strip the dirname of the prefix, so that when we strip it from the start of keys we + // end up with full file/dir names. 
+ let prefix_full_local_path = prefix.with_base(&self.storage_root); + let has_slash = prefix.0.to_string().ends_with('/'); + let strip_prefix = if prefix_full_local_path.is_dir() && has_slash { + prefix + } else { + prefix.0.pop(); + prefix + }; + + RemotePath::new(key.strip_prefix(&strip_prefix).unwrap()).unwrap() + } else { + key + }; + + let relative_key = format!("{}", relative_key); + if relative_key.contains(REMOTE_STORAGE_PREFIX_SEPARATOR) { + let first_part = relative_key + .split(REMOTE_STORAGE_PREFIX_SEPARATOR) + .next() + .unwrap() + .to_owned(); + prefixes.insert(first_part); + } else { + result + .keys + .push(RemotePath::from_string(&relative_key).unwrap()); + } } + result.prefixes = prefixes + .into_iter() + .map(|s| RemotePath::from_string(&s).unwrap()) + .collect(); } + if let Some(max_keys) = max_keys { + result.keys.truncate(max_keys.get() as usize); + } Ok(result) }; @@ -560,50 +611,6 @@ fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf { path_with_suffix_extension(original_path, "metadata") } -fn get_all_files<'a, P>( - directory_path: P, - recursive: bool, -) -> Pin>> + Send + Sync + 'a>> -where - P: AsRef + Send + Sync + 'a, -{ - Box::pin(async move { - let directory_path = directory_path.as_ref(); - if directory_path.exists() { - if directory_path.is_dir() { - let mut paths = Vec::new(); - let mut dir_contents = fs::read_dir(directory_path).await?; - while let Some(dir_entry) = dir_contents.next_entry().await? { - let file_type = dir_entry.file_type().await?; - let entry_path = - Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| { - anyhow::Error::msg(format!( - "non-Unicode path: {}", - pb.to_string_lossy() - )) - })?; - if file_type.is_symlink() { - debug!("{entry_path:?} is a symlink, skipping") - } else if file_type.is_dir() { - if recursive { - paths.extend(get_all_files(&entry_path, true).await?.into_iter()) - } else { - paths.push(entry_path) - } - } else { - paths.push(entry_path); - } - } - Ok(paths) - } else { - bail!("Path {directory_path:?} is not a directory") - } - } else { - Ok(Vec::new()) - } - }) -} - async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<()> { let target_dir = match target_file_path.parent() { Some(parent_dir) => parent_dir, @@ -923,13 +930,18 @@ mod fs_tests { // No delimiter: should recursively list everything let (storage, cancel) = create_storage()?; let child = upload_dummy_file(&storage, "grandparent/parent/child", None, &cancel).await?; + let child_sibling = + upload_dummy_file(&storage, "grandparent/parent/child_sibling", None, &cancel).await?; let uncle = upload_dummy_file(&storage, "grandparent/uncle", None, &cancel).await?; let listing = storage .list(None, ListingMode::NoDelimiter, None, &cancel) .await?; assert!(listing.prefixes.is_empty()); - assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec()); + assert_eq!( + listing.keys.into_iter().collect::>(), + HashSet::from([uncle.clone(), child.clone(), child_sibling.clone()]) + ); // Delimiter: should only go one deep let listing = storage @@ -942,7 +954,25 @@ mod fs_tests { ); assert!(listing.keys.is_empty()); - // Delimiter & prefix + // Delimiter & prefix with a trailing slash + let listing = storage + .list( + Some(&RemotePath::from_string("timelines/some_timeline/grandparent/").unwrap()), + ListingMode::WithDelimiter, + None, + &cancel, + ) + .await?; + assert_eq!( + listing.keys, + [RemotePath::from_string("uncle").unwrap()].to_vec() + ); + assert_eq!( + listing.prefixes, + 
[RemotePath::from_string("parent").unwrap()].to_vec() + ); + + // Delimiter and prefix without a trailing slash let listing = storage .list( Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()), @@ -951,12 +981,66 @@ mod fs_tests { &cancel, ) .await?; + assert_eq!(listing.keys, [].to_vec()); assert_eq!( listing.prefixes, - [RemotePath::from_string("timelines/some_timeline/grandparent/parent").unwrap()] - .to_vec() + [RemotePath::from_string("grandparent").unwrap()].to_vec() + ); + + // Delimiter and prefix that's partway through a path component + let listing = storage + .list( + Some(&RemotePath::from_string("timelines/some_timeline/grandp").unwrap()), + ListingMode::WithDelimiter, + None, + &cancel, + ) + .await?; + assert_eq!(listing.keys, [].to_vec()); + assert_eq!( + listing.prefixes, + [RemotePath::from_string("grandparent").unwrap()].to_vec() + ); + + Ok(()) + } + + #[tokio::test] + async fn list_part_component() -> anyhow::Result<()> { + // No delimiter: should recursively list everything + let (storage, cancel) = create_storage()?; + + // Imitates what happens in a tenant path when we have an unsharded path and a sharded path, and do a listing + // of the unsharded path: although there is a "directory" at the unsharded path, it should be handled as + // a freeform prefix. + let _child_a = + upload_dummy_file(&storage, "grandparent/tenant-01/child", None, &cancel).await?; + let _child_b = + upload_dummy_file(&storage, "grandparent/tenant/child", None, &cancel).await?; + + // Delimiter and prefix that's partway through a path component + let listing = storage + .list( + Some( + &RemotePath::from_string("timelines/some_timeline/grandparent/tenant").unwrap(), + ), + ListingMode::WithDelimiter, + None, + &cancel, + ) + .await?; + assert_eq!(listing.keys, [].to_vec()); + + let mut found_prefixes = listing.prefixes.clone(); + found_prefixes.sort(); + assert_eq!( + found_prefixes, + [ + RemotePath::from_string("tenant").unwrap(), + RemotePath::from_string("tenant-01").unwrap(), + ] + .to_vec() ); - assert_eq!(listing.keys, [uncle.clone()].to_vec()); Ok(()) } diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 1cb85cfb1b..8091681221 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -178,10 +178,7 @@ impl S3Bucket { pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String { assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR); - let path_string = path - .get_path() - .as_str() - .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR); + let path_string = path.get_path().as_str(); match &self.prefix_in_bucket { Some(prefix) => prefix.clone() + "/" + path_string, None => path_string.to_string(), @@ -471,16 +468,11 @@ impl RemoteStorage for S3Bucket { // get the passed prefix or if it is not set use prefix_in_bucket value let list_prefix = prefix .map(|p| self.relative_path_to_s3_object(p)) - .or_else(|| self.prefix_in_bucket.clone()) - .map(|mut p| { - // required to end with a separator - // otherwise request will return only the entry of a prefix - if matches!(mode, ListingMode::WithDelimiter) - && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) - { - p.push(REMOTE_STORAGE_PREFIX_SEPARATOR); - } - p + .or_else(|| { + self.prefix_in_bucket.clone().map(|mut s| { + s.push(REMOTE_STORAGE_PREFIX_SEPARATOR); + s + }) }); let _permit = self.permit(kind, cancel).await?; @@ -549,11 +541,15 @@ impl RemoteStorage for S3Bucket { } } - result.prefixes.extend( - 
prefixes - .iter() - .filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))), - ); + // S3 gives us prefixes like "foo/", we return them like "foo" + result.prefixes.extend(prefixes.iter().filter_map(|o| { + Some( + self.s3_object_to_relative_path( + o.prefix()? + .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR), + ), + ) + })); continuation_token = match response.next_continuation_token { Some(new_token) => Some(new_token), @@ -1050,22 +1046,22 @@ mod tests { Some("/test/prefix/"), ]; let expected_outputs = [ - vec!["", "some/path", "some/path"], - vec!["/", "/some/path", "/some/path"], + vec!["", "some/path", "some/path/"], + vec!["/", "/some/path", "/some/path/"], vec![ "test/prefix/", "test/prefix/some/path", - "test/prefix/some/path", + "test/prefix/some/path/", ], vec![ "test/prefix/", "test/prefix/some/path", - "test/prefix/some/path", + "test/prefix/some/path/", ], vec![ "test/prefix/", "test/prefix/some/path", - "test/prefix/some/path", + "test/prefix/some/path/", ], ]; diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index f5344d3ae2..c467a2d196 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -107,27 +107,6 @@ impl UnreliableWrapper { type VoidStorage = crate::LocalFs; impl RemoteStorage for UnreliableWrapper { - async fn list_prefixes( - &self, - prefix: Option<&RemotePath>, - cancel: &CancellationToken, - ) -> Result, DownloadError> { - self.attempt(RemoteOp::ListPrefixes(prefix.cloned())) - .map_err(DownloadError::Other)?; - self.inner.list_prefixes(prefix, cancel).await - } - - async fn list_files( - &self, - folder: Option<&RemotePath>, - max_keys: Option, - cancel: &CancellationToken, - ) -> Result, DownloadError> { - self.attempt(RemoteOp::ListPrefixes(folder.cloned())) - .map_err(DownloadError::Other)?; - self.inner.list_files(folder, max_keys, cancel).await - } - async fn list( &self, prefix: Option<&RemotePath>, diff --git a/libs/remote_storage/tests/common/tests.rs b/libs/remote_storage/tests/common/tests.rs index 72f6f956e0..673151c8ef 100644 --- a/libs/remote_storage/tests/common/tests.rs +++ b/libs/remote_storage/tests/common/tests.rs @@ -1,5 +1,6 @@ use anyhow::Context; use camino::Utf8Path; +use remote_storage::ListingMode; use remote_storage::RemotePath; use std::sync::Arc; use std::{collections::HashSet, num::NonZeroU32}; @@ -54,9 +55,9 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix)) .context("common_prefix construction")?; let root_remote_prefixes = test_client - .list_prefixes(None, &cancel) - .await - .context("client list root prefixes failure")? + .list(None, ListingMode::WithDelimiter, None, &cancel) + .await? + .prefixes .into_iter() .collect::>(); assert_eq!( @@ -65,9 +66,14 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a ); let nested_remote_prefixes = test_client - .list_prefixes(Some(&base_prefix), &cancel) - .await - .context("client list nested prefixes failure")? + .list( + Some(&base_prefix.add_trailing_slash()), + ListingMode::WithDelimiter, + None, + &cancel, + ) + .await? 
+ .prefixes .into_iter() .collect::>(); let remote_only_prefixes = nested_remote_prefixes @@ -90,11 +96,13 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a /// /// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`] /// Then performs the following queries: -/// 1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt` -/// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt` +/// 1. `list(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt` +/// 2. `list("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt` #[test_context(MaybeEnabledStorageWithSimpleTestBlobs)] #[tokio::test] -async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> anyhow::Result<()> { +async fn list_no_delimiter_works( + ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs, +) -> anyhow::Result<()> { let ctx = match ctx { MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx, MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()), @@ -107,29 +115,36 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a let base_prefix = RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?; let root_files = test_client - .list_files(None, None, &cancel) + .list(None, ListingMode::NoDelimiter, None, &cancel) .await .context("client list root files failure")? + .keys .into_iter() .collect::>(); assert_eq!( root_files, ctx.remote_blobs.clone(), - "remote storage list_files on root mismatches with the uploads." + "remote storage list on root mismatches with the uploads." ); // Test that max_keys limit works. In total there are about 21 files (see // upload_simple_remote_data call in test_real_s3.rs). let limited_root_files = test_client - .list_files(None, Some(NonZeroU32::new(2).unwrap()), &cancel) + .list( + None, + ListingMode::NoDelimiter, + Some(NonZeroU32::new(2).unwrap()), + &cancel, + ) .await .context("client list root files failure")?; - assert_eq!(limited_root_files.len(), 2); + assert_eq!(limited_root_files.keys.len(), 2); let nested_remote_files = test_client - .list_files(Some(&base_prefix), None, &cancel) + .list(Some(&base_prefix), ListingMode::NoDelimiter, None, &cancel) .await .context("client list nested files failure")? + .keys .into_iter() .collect::>(); let trim_remote_blobs: HashSet<_> = ctx @@ -141,7 +156,7 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a .collect(); assert_eq!( nested_remote_files, trim_remote_blobs, - "remote storage list_files on subdirrectory mismatches with the uploads." + "remote storage list on subdirrectory mismatches with the uploads." ); Ok(()) } @@ -199,7 +214,11 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<( ctx.client.delete_objects(&[path1, path2], &cancel).await?; - let prefixes = ctx.client.list_prefixes(None, &cancel).await?; + let prefixes = ctx + .client + .list(None, ListingMode::WithDelimiter, None, &cancel) + .await? 
+ .prefixes; assert_eq!(prefixes.len(), 1); diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs index 6aa02868e6..cd0b2be4b5 100644 --- a/libs/remote_storage/tests/test_real_azure.rs +++ b/libs/remote_storage/tests/test_real_azure.rs @@ -132,10 +132,6 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs { } } -// NOTE: the setups for the list_prefixes test and the list_files test are very similar -// However, they are not idential. The list_prefixes function is concerned with listing prefixes, -// whereas the list_files function is concerned with listing files. -// See `RemoteStorage::list_files` documentation for more details enum MaybeEnabledStorageWithSimpleTestBlobs { Enabled(AzureWithSimpleTestBlobs), Disabled, diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index c5d5216f00..01f6a532d6 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -12,8 +12,8 @@ use anyhow::Context; use camino::Utf8Path; use futures_util::StreamExt; use remote_storage::{ - DownloadError, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, - S3Config, + DownloadError, GenericRemoteStorage, ListingMode, RemotePath, RemoteStorageConfig, + RemoteStorageKind, S3Config, }; use test_context::test_context; use test_context::AsyncTestContext; @@ -75,11 +75,14 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: client: &Arc, cancel: &CancellationToken, ) -> anyhow::Result> { - Ok(retry(|| client.list_files(None, None, cancel)) - .await - .context("list root files failure")? - .into_iter() - .collect::>()) + Ok( + retry(|| client.list(None, ListingMode::NoDelimiter, None, cancel)) + .await + .context("list root files failure")? + .keys + .into_iter() + .collect::>(), + ) } let cancel = CancellationToken::new(); @@ -294,10 +297,6 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs { } } -// NOTE: the setups for the list_prefixes test and the list_files test are very similar -// However, they are not idential. The list_prefixes function is concerned with listing prefixes, -// whereas the list_files function is concerned with listing files. 
-// See `RemoteStorage::list_files` documentation for more details enum MaybeEnabledStorageWithSimpleTestBlobs { Enabled(S3WithSimpleTestBlobs), Disabled, diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 1fa3badefb..d02f00adad 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -202,7 +202,9 @@ use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Mutex}; use std::time::Duration; -use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath, TimeoutOrCancel}; +use remote_storage::{ + DownloadError, GenericRemoteStorage, ListingMode, RemotePath, TimeoutOrCancel, +}; use std::ops::DerefMut; use tracing::{debug, error, info, instrument, warn}; use tracing::{info_span, Instrument}; @@ -1145,7 +1147,7 @@ impl RemoteTimelineClient { // and retry will arrive to different pageserver there wont be any traces of it on remote storage let timeline_storage_path = remote_timeline_path(&self.tenant_shard_id, &self.timeline_id); - // Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't + // Execute all pending deletions, so that when we proceed to do a listing below, we aren't // taking the burden of listing all the layers that we already know we should delete. self.flush_deletion_queue().await?; @@ -1154,14 +1156,20 @@ impl RemoteTimelineClient { let remaining = download_retry( || async { self.storage_impl - .list_files(Some(&timeline_storage_path), None, &cancel) + .list( + Some(&timeline_storage_path), + ListingMode::NoDelimiter, + None, + &cancel, + ) .await }, "list remaining files", &cancel, ) .await - .context("list files remaining files")?; + .context("list files remaining files")? + .keys; // We will delete the current index_part object last, since it acts as a deletion // marker via its deleted_at attribute diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 6ee8ad7155..84692aa577 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -258,7 +258,7 @@ pub async fn list_remote_timelines( tenant_shard_id: TenantShardId, cancel: CancellationToken, ) -> anyhow::Result<(HashSet, HashSet)> { - let remote_path = remote_timelines_path(&tenant_shard_id); + let remote_path = remote_timelines_path(&tenant_shard_id).add_trailing_slash(); fail::fail_point!("storage-sync-list-remote-timelines", |_| { anyhow::bail!("storage-sync-list-remote-timelines"); @@ -417,11 +417,16 @@ pub(super) async fn download_index_part( let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none()); let indices = download_retry( - || async { storage.list_files(Some(&index_prefix), None, cancel).await }, + || async { + storage + .list(Some(&index_prefix), ListingMode::NoDelimiter, None, cancel) + .await + }, "list index_part files", cancel, ) - .await?; + .await? + .keys; // General case logic for which index to use: the latest index whose generation // is <= our own. 
See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index e3f6a606a0..e496f07114 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -18,7 +18,7 @@ use std::time::Duration; use postgres_ffi::v14::xlog_utils::XLogSegNoOffsetToRecPtr; use postgres_ffi::XLogFileName; use postgres_ffi::{XLogSegNo, PG_TLI}; -use remote_storage::{GenericRemoteStorage, RemotePath, StorageMetadata}; +use remote_storage::{GenericRemoteStorage, ListingMode, RemotePath, StorageMetadata}; use tokio::fs::File; use tokio::select; @@ -601,12 +601,18 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> { backoff::retry( || async { // Do list-delete in batch_size batches to make progress even if there a lot of files. - // Alternatively we could make list_files return iterator, but it is more complicated and + // Alternatively we could make remote storage list return iterator, but it is more complicated and // I'm not sure deleting while iterating is expected in s3. loop { let files = storage - .list_files(Some(&remote_path), Some(batch_size), &cancel) - .await?; + .list( + Some(&remote_path), + ListingMode::NoDelimiter, + Some(batch_size), + &cancel, + ) + .await? + .keys; if files.is_empty() { return Ok(()); // done } @@ -666,8 +672,9 @@ pub async fn copy_s3_segments( let cancel = CancellationToken::new(); let files = storage - .list_files(Some(&remote_path), None, &cancel) - .await?; + .list(Some(&remote_path), ListingMode::NoDelimiter, None, &cancel) + .await? + .keys; let uploaded_segments = &files .iter() From ee9ec26808d71e441b7d0c96bf9a046ced831f88 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 23 Apr 2024 17:16:17 +0100 Subject: [PATCH 0606/1571] pageserver: change pitr_interval=0 behavior (#7423) ## Problem We already made a change in #6407 to make pitr_interval authoritative for synthetic size calculations (do not charge users for data retained due to gc_horizon), but that change didn't cover the case where someone entirely disables time-based retention by setting pitr_interval=0 Relates to: https://github.com/neondatabase/neon/issues/6374 ## Summary of changes When pitr_interval is zero, do not set `pitr_cutoff` based on gc_horizon. gc_horizon is still enforced, but separately (its value is passed separately, there was never a need to claim pitr_cutoff to gc_horizon) ## More detail ### Issue 1 Before this PR, we would skip the update_gc_info for timelines with last_record_lsn() < gc_horizon. Let's call such timelines "tiny". The rationale for that presumably was that we can't GC anything in the tiny timelines, why bother to call update_gc_info(). However, synthetic size calculation relies on up-to-date update_gc_info() data. Before this PR, tiny timelines would never get an updated GcInfo::pitr_horizon (it remained Lsn(0)). Even on projects with pitr_interval=0d. With this PR, update_gc_info is always called, hence GcInfo::pitr_horizon is always updated, thereby providing synthetic size calculation with up-to-data data. ### Issue 2 Before this PR, regardless of whether the timeline is "tiny" or not, GcInfo::pitr_horizon was clamped to at least last_record_lsn - gc_horizon, even if the pitr window in terms of LSN range was shorter (=less than) the gc_horizon. With this PR, that clamping is removed, so, for pitr_interval=0, the pitr_horizon = last_record_lsn. 
--- pageserver/src/tenant.rs | 29 +++++----- pageserver/src/tenant/timeline.rs | 5 +- test_runner/regress/test_tenant_size.py | 71 +++++-------------------- 3 files changed, 30 insertions(+), 75 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 098bad71fb..15350e93e9 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2870,20 +2870,23 @@ impl Tenant { } } - if let Some(cutoff) = timeline.get_last_record_lsn().checked_sub(horizon) { - let branchpoints: Vec = all_branchpoints - .range(( - Included((timeline_id, Lsn(0))), - Included((timeline_id, Lsn(u64::MAX))), - )) - .map(|&x| x.1) - .collect(); - timeline - .update_gc_info(branchpoints, cutoff, pitr, cancel, ctx) - .await?; + let cutoff = timeline + .get_last_record_lsn() + .checked_sub(horizon) + .unwrap_or(Lsn(0)); - gc_timelines.push(timeline); - } + let branchpoints: Vec = all_branchpoints + .range(( + Included((timeline_id, Lsn(0))), + Included((timeline_id, Lsn(u64::MAX))), + )) + .map(|&x| x.1) + .collect(); + timeline + .update_gc_info(branchpoints, cutoff, pitr, cancel, ctx) + .await?; + + gc_timelines.push(timeline); } drop(gc_cs); Ok(gc_timelines) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 2fbe3c63a2..22b8a17874 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4244,9 +4244,8 @@ impl Timeline { *self.get_latest_gc_cutoff_lsn() } } else { - // No time-based retention was configured. Set time-based cutoff to - // same as LSN based. - cutoff_horizon + // No time-based retention was configured. Interpret this as "keep no history". + self.get_last_record_lsn() }; // Grab the lock and update the values diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 4c8fd4b0e5..a588f6ab53 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -292,33 +292,12 @@ def test_single_branch_get_tenant_size_grows( Operate on single branch reading the tenants size after each transaction. """ - # Disable automatic gc and compaction. - # The pitr_interval here is quite problematic, so we cannot really use it. - # it'd have to be calibrated per test executing env. - - # there was a bug which was hidden if the create table and first batch of - # inserts is larger than gc_horizon. for example 0x20000 here hid the fact - # that there next_gc_cutoff could be smaller than initdb_lsn, which will - # obviously lead to issues when calculating the size. - gc_horizon = 0x3BA00 - - # it's a bit of a hack, but different versions of postgres have different - # amount of WAL generated for the same amount of data. so we need to - # adjust the gc_horizon accordingly. 
- if pg_version == PgVersion.V14: - gc_horizon = 0x4A000 - elif pg_version == PgVersion.V15: - gc_horizon = 0x3BA00 - elif pg_version == PgVersion.V16: - gc_horizon = 210000 - else: - raise NotImplementedError(pg_version) - + # Disable automatic compaction and GC, and set a long PITR interval: we will expect + # size to always increase with writes as all writes remain within the PITR tenant_config = { "compaction_period": "0s", "gc_period": "0s", - "pitr_interval": "0s", - "gc_horizon": gc_horizon, + "pitr_interval": "3600s", } env = neon_env_builder.init_start(initial_tenant_conf=tenant_config) @@ -332,18 +311,6 @@ def test_single_branch_get_tenant_size_grows( size_debug_file = open(test_output_dir / "size_debug.html", "w") - def check_size_change( - current_lsn: Lsn, initdb_lsn: Lsn, gc_horizon: int, size: int, prev_size: int - ): - if current_lsn - initdb_lsn >= gc_horizon: - assert ( - size >= prev_size - ), f"tenant_size may grow or not grow, because we only add gc_horizon amount of WAL to initial snapshot size (Currently at: {current_lsn}, Init at: {initdb_lsn})" - else: - assert ( - size > prev_size - ), f"tenant_size should grow, because we continue to add WAL to initial snapshot size (Currently at: {current_lsn}, Init at: {initdb_lsn})" - def get_current_consistent_size( env: NeonEnv, endpoint: Endpoint, @@ -412,14 +379,6 @@ def test_single_branch_get_tenant_size_grows( ) prev_size = collected_responses[-1][2] - - # branch start shouldn't be past gc_horizon yet - # thus the size should grow as we insert more data - # "gc_horizon" is tuned so that it kicks in _after_ the - # insert phase, but before the update phase ends. - assert ( - current_lsn - initdb_lsn <= gc_horizon - ), "Tuning of GC window is likely out-of-date" assert size > prev_size collected_responses.append(("INSERT", current_lsn, size)) @@ -439,8 +398,7 @@ def test_single_branch_get_tenant_size_grows( ) prev_size = collected_responses[-1][2] - - check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size) + assert size > prev_size collected_responses.append(("UPDATE", current_lsn, size)) @@ -457,8 +415,7 @@ def test_single_branch_get_tenant_size_grows( ) prev_size = collected_responses[-1][2] - - check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size) + assert size > prev_size collected_responses.append(("DELETE", current_lsn, size)) @@ -469,20 +426,20 @@ def test_single_branch_get_tenant_size_grows( with endpoint.cursor() as cur: cur.execute("DROP TABLE t0") - # Without setting a PITR interval, dropping the table doesn't reclaim any space - # from the user's point of view, because the DROP transaction is too small - # to fall out of gc_horizon. + # Dropping the table doesn't reclaim any space + # from the user's point of view, because the DROP transaction is still + # within pitr_interval. 
(current_lsn, size) = get_current_consistent_size(
         env, endpoint, size_debug_file, http_client, tenant_id, timeline_id
     )
     prev_size = collected_responses[-1][2]
-    check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size)
+    assert size >= prev_size
+    prev_size = size
 
-    # Set a tiny PITR interval to allow the DROP to impact the synthetic size
+    # Set a zero PITR interval to allow the DROP to impact the synthetic size
     # Because synthetic size calculation uses pitr interval when available,
     # when our tenant is configured with a tiny pitr interval, dropping a table should
     # cause synthetic size to go down immediately
-    tenant_config["pitr_interval"] = "1ms"
+    tenant_config["pitr_interval"] = "0s"
     env.pageserver.http_client().set_tenant_config(tenant_id, tenant_config)
     (current_lsn, size) = get_current_consistent_size(
         env, endpoint, size_debug_file, http_client, tenant_id, timeline_id
     )
@@ -494,10 +451,6 @@ def test_single_branch_get_tenant_size_grows(
     # defined by gc_horizon.
     collected_responses.append(("DROP", current_lsn, size))
 
-    # Should have gone past gc_horizon, otherwise gc_horizon is too large
-    bytes_written = current_lsn - initdb_lsn
-    assert bytes_written > gc_horizon
-
     # this isn't too many lines to forget for a while. observed while
     # developing these tests that locally the value is a bit more than what we
     # get in the ci.

From 18fd73d84afd1086414ba9fae1d08c16660809ad Mon Sep 17 00:00:00 2001
From: Arpad Müller
Date: Wed, 24 Apr 2024 00:46:48 +0200
Subject: [PATCH 0607/1571] get_lsn_by_timestamp: clamp commit_lsn to be >= min_lsn (#7488)

There was an edge case where `get_lsn_by_timestamp`/`find_lsn_for_timestamp` could have returned an lsn that is before the limits we enforce: when we did find SLRU entries with timestamps before the one we search for.

The API contract of `get_lsn_by_timestamp` is to not return something before the ancestor lsn.

cc https://neondb.slack.com/archives/C03F5SM1N02/p1713871064147029
---
 pageserver/src/pgdatadir_mapping.rs | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index c733b38acb..2c98c0b6c8 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -448,6 +448,11 @@ impl Timeline {
             // include physical changes from later commits that will be marked
             // as aborted, and will need to be vacuumed away.
             let commit_lsn = Lsn((low - 1) * 8);
+            // This maxing operation is for the edge case that the search above did
+            // set found_smaller to true but never increased the lsn. Then, low is
+            // still the old min_lsn, and the subtraction above could possibly give
+            // a value below the ancestor_lsn.
+            let commit_lsn = commit_lsn.max(min_lsn);
             match (found_smaller, found_larger) {
                 (false, false) => {
                     // This can happen if no commit records have been processed yet, e.g.

From a60035b23a2f05e512036131f5aef506e583c213 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen
Date: Wed, 24 Apr 2024 11:38:59 +0300
Subject: [PATCH 0608/1571] fix: avoid starving background task permits in eviction task (#7471)

As seen in a recent incident, eviction tasks can cause pageserver-wide permit starvation on the background task semaphore when synthetic size calculation takes a long time for a tenant that has more timelines than we have permits, or when multiple tenants with slow synthetic size calculations together have more timelines than permits.

Metric links can be found in the internal [slack thread].
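To make the failure mode concrete, here is a minimal sketch of the problematic pattern (the names and types are illustrative, not the real pageserver ones): a unit of the shared background-task semaphore is acquired first and then kept alive while the task waits on a per-tenant lock that can be held for the whole duration of a synthetic size calculation, so a handful of slow tenants can pin every permit.

```rust
use std::sync::Arc;
use tokio::sync::{Mutex, Semaphore};

// Sketch of the pre-fix behaviour: the background-loop permit stays held
// across a potentially very long wait on the per-tenant state.
async fn eviction_iteration(background_permits: Arc<Semaphore>, tenant_state: Arc<Mutex<()>>) {
    // One unit of the global background-task concurrency limit.
    let _permit = background_permits
        .acquire_owned()
        .await
        .expect("semaphore closed");
    // The permit is still held here; if this lock is contended for minutes,
    // this task occupies one background-task slot for minutes as well.
    let _tenant_state = tenant_state.lock().await;
    // ... imitate layer accesses, evict layers, etc. ...
}
```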
As a solution, release the permit while waiting for the state guarding the synthetic size calculation. This will most likely hurt the eviction task eviction performance, but that does not matter because we are hoping to get away from it using OnlyImitiate policy anyway and rely solely on disk usage-based eviction. [slack thread]: https://neondb.slack.com/archives/C06UEMLK7FE/p1713810505587809?thread_ts=1713468604.508969&cid=C06UEMLK7FE --- pageserver/src/tenant/tasks.rs | 2 +- .../src/tenant/timeline/eviction_task.rs | 68 ++++++++++++------- 2 files changed, 43 insertions(+), 27 deletions(-) diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 74ed677ffe..41b77c1f4a 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -62,7 +62,7 @@ impl BackgroundLoopKind { pub(crate) async fn concurrent_background_tasks_rate_limit_permit( loop_kind: BackgroundLoopKind, _ctx: &RequestContext, -) -> impl Drop { +) -> tokio::sync::SemaphorePermit<'static> { let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE .with_label_values(&[loop_kind.as_static_str()]) .guard(); diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 304d0d60ee..3567761b9a 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -188,24 +188,10 @@ impl Timeline { ) -> ControlFlow<()> { let now = SystemTime::now(); - let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit( - BackgroundLoopKind::Eviction, - ctx, - ); + let permit = self.acquire_imitation_permit(cancel, ctx).await?; - let _permit = tokio::select! { - permit = acquire_permit => permit, - _ = cancel.cancelled() => return ControlFlow::Break(()), - _ = self.cancel.cancelled() => return ControlFlow::Break(()), - }; - - match self - .imitate_layer_accesses(tenant, p, cancel, gate, ctx) - .await - { - ControlFlow::Break(()) => return ControlFlow::Break(()), - ControlFlow::Continue(()) => (), - } + self.imitate_layer_accesses(tenant, p, cancel, gate, permit, ctx) + .await?; #[derive(Debug, Default)] struct EvictionStats { @@ -330,19 +316,27 @@ impl Timeline { gate: &GateGuard, ctx: &RequestContext, ) -> ControlFlow<()> { + let permit = self.acquire_imitation_permit(cancel, ctx).await?; + + self.imitate_layer_accesses(tenant, p, cancel, gate, permit, ctx) + .await + } + + async fn acquire_imitation_permit( + &self, + cancel: &CancellationToken, + ctx: &RequestContext, + ) -> ControlFlow<(), tokio::sync::SemaphorePermit<'static>> { let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit( BackgroundLoopKind::Eviction, ctx, ); - let _permit = tokio::select! { - permit = acquire_permit => permit, - _ = cancel.cancelled() => return ControlFlow::Break(()), - _ = self.cancel.cancelled() => return ControlFlow::Break(()), - }; - - self.imitate_layer_accesses(tenant, p, cancel, gate, ctx) - .await + tokio::select! 
{ + permit = acquire_permit => ControlFlow::Continue(permit), + _ = cancel.cancelled() => ControlFlow::Break(()), + _ = self.cancel.cancelled() => ControlFlow::Break(()), + } } /// If we evict layers but keep cached values derived from those layers, then @@ -376,6 +370,7 @@ impl Timeline { p: &EvictionPolicyLayerAccessThreshold, cancel: &CancellationToken, gate: &GateGuard, + permit: tokio::sync::SemaphorePermit<'static>, ctx: &RequestContext, ) -> ControlFlow<()> { if !self.tenant_shard_id.is_shard_zero() { @@ -408,7 +403,28 @@ impl Timeline { // Make one of the tenant's timelines draw the short straw and run the calculation. // The others wait until the calculation is done so that they take into account the // imitated accesses that the winner made. - let mut state = tenant.eviction_task_tenant_state.lock().await; + let (mut state, _permit) = { + if let Ok(locked) = tenant.eviction_task_tenant_state.try_lock() { + (locked, permit) + } else { + // we might need to wait for a long time here in case of pathological synthetic + // size calculation performance + drop(permit); + let locked = tokio::select! { + locked = tenant.eviction_task_tenant_state.lock() => locked, + _ = self.cancel.cancelled() => { + return ControlFlow::Break(()) + }, + _ = cancel.cancelled() => { + return ControlFlow::Break(()) + } + }; + // then reacquire -- this will be bad if there is a lot of traffic, but because we + // released the permit, the overall latency will be much better. + let permit = self.acquire_imitation_permit(cancel, ctx).await?; + (locked, permit) + } + }; match state.last_layer_access_imitation { Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ } _ => { From 5dda371c2b75213bb3fa286cc7ba612980379613 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Wed, 24 Apr 2024 15:13:18 +0200 Subject: [PATCH 0609/1571] Fix a bug with retries (#7494) ## Problem ## Summary of changes By default, it's 5s retry. 
--- proxy/src/bin/proxy.rs | 7 +++++-- proxy/src/cache/endpoints.rs | 12 ++++++++++-- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 7df320fd42..760ccf40d4 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -403,7 +403,7 @@ async fn main() -> anyhow::Result<()> { maintenance_tasks.spawn(usage_metrics::task_main(metrics_config)); client_tasks.spawn(usage_metrics::task_backup( &metrics_config.backup_metric_collection_config, - cancellation_token, + cancellation_token.clone(), )); } @@ -423,7 +423,10 @@ async fn main() -> anyhow::Result<()> { let cache = api.caches.endpoints_cache.clone(); let con = regional_redis_client; let span = tracing::info_span!("endpoints_cache"); - maintenance_tasks.spawn(async move { cache.do_read(con).await }.instrument(span)); + maintenance_tasks.spawn( + async move { cache.do_read(con, cancellation_token.clone()).await } + .instrument(span), + ); } } } diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index 2aa1986d5e..02511e6ff7 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -4,6 +4,7 @@ use std::{ atomic::{AtomicBool, Ordering}, Arc, }, + time::Duration, }; use dashmap::DashSet; @@ -13,6 +14,7 @@ use redis::{ }; use serde::Deserialize; use tokio::sync::Mutex; +use tokio_util::sync::CancellationToken; use tracing::info; use crate::{ @@ -111,16 +113,22 @@ impl EndpointsCache { pub async fn do_read( &self, mut con: ConnectionWithCredentialsProvider, + cancellation_token: CancellationToken, ) -> anyhow::Result { let mut last_id = "0-0".to_string(); loop { - self.ready.store(false, Ordering::Release); if let Err(e) = con.connect().await { tracing::error!("error connecting to redis: {:?}", e); - continue; + self.ready.store(false, Ordering::Release); } if let Err(e) = self.read_from_stream(&mut con, &mut last_id).await { tracing::error!("error reading from redis: {:?}", e); + self.ready.store(false, Ordering::Release); + } + if cancellation_token.is_cancelled() { + info!("cancellation token is cancelled, exiting"); + tokio::time::sleep(Duration::from_secs(60 * 60 * 24 * 7)).await; + // 1 week. } tokio::time::sleep(self.config.retry_interval).await; } From 2a3a8ee31d5ddf98a8b1e335034ddbdd2818dc12 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 24 Apr 2024 14:52:46 +0100 Subject: [PATCH 0610/1571] pageserver: publish the same metrics from both read paths (#7486) ## Problem Vectored and non-vectored read paths don't publish the same set of metrics. Metrics parity is needed for coalescing the read paths. ## Summary of changes * Publish reconstruct time and fetching data for reconstruct time from the vectored read path * Remove pageserver_getpage_reconstruct_seconds{res="err"} - wasn't used anyway --- pageserver/src/metrics.rs | 52 ++++++++++++++++++++++++------- pageserver/src/tenant/timeline.rs | 22 +++++++++++-- 2 files changed, 59 insertions(+), 15 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 66bf21ddec..6ce7f286b3 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -105,31 +105,39 @@ pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy = Lazy::new(|| { }); // Metrics collected on operations on the storage repository. 
+#[derive( + Clone, Copy, enum_map::Enum, strum_macros::EnumString, strum_macros::Display, IntoStaticStr, +)] +pub(crate) enum GetKind { + Singular, + Vectored, +} pub(crate) struct ReconstructTimeMetrics { - ok: Histogram, - err: Histogram, + singular: Histogram, + vectored: Histogram, } pub(crate) static RECONSTRUCT_TIME: Lazy = Lazy::new(|| { let inner = register_histogram_vec!( "pageserver_getpage_reconstruct_seconds", "Time spent in reconstruct_value (reconstruct a page from deltas)", - &["result"], + &["get_kind"], CRITICAL_OP_BUCKETS.into(), ) .expect("failed to define a metric"); + ReconstructTimeMetrics { - ok: inner.get_metric_with_label_values(&["ok"]).unwrap(), - err: inner.get_metric_with_label_values(&["err"]).unwrap(), + singular: inner.with_label_values(&[GetKind::Singular.into()]), + vectored: inner.with_label_values(&[GetKind::Vectored.into()]), } }); impl ReconstructTimeMetrics { - pub(crate) fn for_result(&self, result: &Result) -> &Histogram { - match result { - Ok(_) => &self.ok, - Err(_) => &self.err, + pub(crate) fn for_get_kind(&self, get_kind: GetKind) -> &Histogram { + match get_kind { + GetKind::Singular => &self.singular, + GetKind::Vectored => &self.vectored, } } } @@ -142,13 +150,33 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy = Lazy::n .expect("failed to define a metric") }); -pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy = Lazy::new(|| { - register_histogram!( +pub(crate) struct ReconstructDataTimeMetrics { + singular: Histogram, + vectored: Histogram, +} + +impl ReconstructDataTimeMetrics { + pub(crate) fn for_get_kind(&self, get_kind: GetKind) -> &Histogram { + match get_kind { + GetKind::Singular => &self.singular, + GetKind::Vectored => &self.vectored, + } + } +} + +pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy = Lazy::new(|| { + let inner = register_histogram_vec!( "pageserver_getpage_get_reconstruct_data_seconds", "Time spent in get_reconstruct_value_data", + &["get_kind"], CRITICAL_OP_BUCKETS.into(), ) - .expect("failed to define a metric") + .expect("failed to define a metric"); + + ReconstructDataTimeMetrics { + singular: inner.with_label_values(&[GetKind::Singular.into()]), + vectored: inner.with_label_values(&[GetKind::Vectored.into()]), + } }); pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy = Lazy::new(|| { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 22b8a17874..11d96bf1a6 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -86,7 +86,7 @@ use crate::{ use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace}; use crate::metrics::{ - TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT, + GetKind, TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT, }; use crate::pgdatadir_mapping::CalculateLogicalSizeError; use crate::tenant::config::TenantConfOpt; @@ -797,7 +797,9 @@ impl Timeline { img: cached_page_img, }; - let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME.start_timer(); + let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME + .for_get_kind(GetKind::Singular) + .start_timer(); let path = self .get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx) .await?; @@ -807,7 +809,7 @@ impl Timeline { let res = self.reconstruct_value(key, lsn, reconstruct_state).await; let elapsed = start.elapsed(); crate::metrics::RECONSTRUCT_TIME - .for_result(&res) + .for_get_kind(GetKind::Singular) .observe(elapsed.as_secs_f64()); if cfg!(feature = "testing") && 
res.is_err() { @@ -969,9 +971,22 @@ impl Timeline { ) -> Result>, GetVectoredError> { let mut reconstruct_state = ValuesReconstructState::new(); + let get_kind = if keyspace.total_size() == 1 { + GetKind::Singular + } else { + GetKind::Vectored + }; + + let get_data_timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME + .for_get_kind(get_kind) + .start_timer(); self.get_vectored_reconstruct_data(keyspace, lsn, &mut reconstruct_state, ctx) .await?; + get_data_timer.stop_and_record(); + let reconstruct_timer = crate::metrics::RECONSTRUCT_TIME + .for_get_kind(get_kind) + .start_timer(); let mut results: BTreeMap> = BTreeMap::new(); let layers_visited = reconstruct_state.get_layers_visited(); for (key, res) in reconstruct_state.keys { @@ -987,6 +1002,7 @@ impl Timeline { } } } + reconstruct_timer.stop_and_record(); // Note that this is an approximation. Tracking the exact number of layers visited // per key requires virtually unbounded memory usage and is inefficient From c12861cccda7c8bc7b57260843102c09be58f733 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 24 Apr 2024 15:36:23 +0100 Subject: [PATCH 0611/1571] pageserver: finish vectored get early (#7490) ## Problem If the previous step of the vectored left no further keyspace to investigate (i.e. keyspace remains empty after removing keys completed in the previous step), then we'd still grab the layers lock, potentially add an in-mem layer to the fringe and at some further point read its index without reading any values from it. ## Summary of changes If there's nothing left in the current keyspace, then skip the search and just select the next item from the fringe as usual. When running `test_pg_regress[release-pg16]` with the vectored read path for singular gets this improved perf drastically (see PR cover letter). ## Correctness Since no keys remained from the previous range (i.e. we are on a leaf node) there's nothing that search can find in deeper nodes. --- libs/pageserver_api/src/keyspace.rs | 4 ++ pageserver/src/tenant/timeline.rs | 92 +++++++++++++++-------------- 2 files changed, 53 insertions(+), 43 deletions(-) diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index 78e4a3d735..a9e19e8cc7 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -162,6 +162,10 @@ impl KeySpace { .sum() } + pub fn is_empty(&self) -> bool { + self.total_size() == 0 + } + fn overlaps_at(&self, range: &Range) -> Option { match self.ranges.binary_search_by_key(&range.end, |r| r.start) { Ok(0) => None, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 11d96bf1a6..703654a37c 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3143,55 +3143,61 @@ impl Timeline { unmapped_keyspace.remove_overlapping_with(&keys_done_last_step); completed_keyspace.merge(&keys_done_last_step); - let guard = timeline.layers.read().await; - let layers = guard.layer_map(); + // Do not descent any further if the last layer we visited + // completed all keys in the keyspace it inspected. This is not + // required for correctness, but avoids visiting extra layers + // which turns out to be a perf bottleneck in some cases. 
+ if !unmapped_keyspace.is_empty() { + let guard = timeline.layers.read().await; + let layers = guard.layer_map(); - let in_memory_layer = layers.find_in_memory_layer(|l| { - let start_lsn = l.get_lsn_range().start; - cont_lsn > start_lsn - }); + let in_memory_layer = layers.find_in_memory_layer(|l| { + let start_lsn = l.get_lsn_range().start; + cont_lsn > start_lsn + }); - match in_memory_layer { - Some(l) => { - let lsn_range = l.get_lsn_range().start..cont_lsn; - fringe.update( - ReadableLayer::InMemoryLayer(l), - unmapped_keyspace.clone(), - lsn_range, - ); - } - None => { - for range in unmapped_keyspace.ranges.iter() { - let results = layers.range_search(range.clone(), cont_lsn); + match in_memory_layer { + Some(l) => { + let lsn_range = l.get_lsn_range().start..cont_lsn; + fringe.update( + ReadableLayer::InMemoryLayer(l), + unmapped_keyspace.clone(), + lsn_range, + ); + } + None => { + for range in unmapped_keyspace.ranges.iter() { + let results = layers.range_search(range.clone(), cont_lsn); - results - .found - .into_iter() - .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| { - ( - ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)), - keyspace_accum.to_keyspace(), - lsn_floor..cont_lsn, - ) - }) - .for_each(|(layer, keyspace, lsn_range)| { - fringe.update(layer, keyspace, lsn_range) - }); + results + .found + .into_iter() + .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| { + ( + ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)), + keyspace_accum.to_keyspace(), + lsn_floor..cont_lsn, + ) + }) + .for_each(|(layer, keyspace, lsn_range)| { + fringe.update(layer, keyspace, lsn_range) + }); + } } } - } - // It's safe to drop the layer map lock after planning the next round of reads. - // The fringe keeps readable handles for the layers which are safe to read even - // if layers were compacted or flushed. - // - // The more interesting consideration is: "Why is the read algorithm still correct - // if the layer map changes while it is operating?". Doing a vectored read on a - // timeline boils down to pushing an imaginary lsn boundary downwards for each range - // covered by the read. The layer map tells us how to move the lsn downwards for a - // range at *a particular point in time*. It is fine for the answer to be different - // at two different time points. - drop(guard); + // It's safe to drop the layer map lock after planning the next round of reads. + // The fringe keeps readable handles for the layers which are safe to read even + // if layers were compacted or flushed. + // + // The more interesting consideration is: "Why is the read algorithm still correct + // if the layer map changes while it is operating?". Doing a vectored read on a + // timeline boils down to pushing an imaginary lsn boundary downwards for each range + // covered by the read. The layer map tells us how to move the lsn downwards for a + // range at *a particular point in time*. It is fine for the answer to be different + // at two different time points. + drop(guard); + } if let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() { let next_cont_lsn = lsn_range.start; From 447a063f3c6583ed8e1946900493c1343b1daaef Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 24 Apr 2024 11:09:23 -0400 Subject: [PATCH 0612/1571] fix(metrics): correct maxrss metrics on macos (#7487) macOS max_rss is in bytes, while Linux is in kilobytes. 
https://stackoverflow.com/a/59915669 --------- Signed-off-by: Alex Chi Z --- libs/metrics/src/lib.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 2cf3cdeaa7..8e0dbe6ce4 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -256,7 +256,16 @@ fn update_rusage_metrics() { DISK_IO_BYTES .with_label_values(&["write"]) .set(rusage_stats.ru_oublock * BYTES_IN_BLOCK); - MAXRSS_KB.set(rusage_stats.ru_maxrss); + + // On macOS, the unit of maxrss is bytes; on Linux, it's kilobytes. https://stackoverflow.com/a/59915669 + #[cfg(target_os = "macos")] + { + MAXRSS_KB.set(rusage_stats.ru_maxrss / 1024); + } + #[cfg(not(target_os = "macos"))] + { + MAXRSS_KB.set(rusage_stats.ru_maxrss); + } } fn get_rusage_stats() -> libc::rusage { From c18d3340b5e3c978a81c3db8b6f1e83cd9087e8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 24 Apr 2024 18:48:25 +0200 Subject: [PATCH 0613/1571] Ability to specify the upload_storage_class in S3 bucket configuration (#7461) Currently we move data to the intended storage class via lifecycle rules, but those are a daily batch job so data first spends up to a day in standard storage. Therefore, make it possible to specify the storage class used for uploads to S3 so that the data doesn't have to be migrated automatically. The advantage of this is that it gives cleaner billing reports. Part of https://github.com/neondatabase/cloud/issues/11348 --- libs/remote_storage/src/lib.rs | 15 +++++++++++++++ libs/remote_storage/src/s3_bucket.rs | 8 +++++++- libs/remote_storage/tests/test_real_s3.rs | 1 + pageserver/src/config.rs | 1 + proxy/src/context/parquet.rs | 1 + 5 files changed, 25 insertions(+), 1 deletion(-) diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 14c391ca53..32bc71c513 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -21,11 +21,13 @@ use std::{ fmt::Debug, num::{NonZeroU32, NonZeroUsize}, pin::Pin, + str::FromStr, sync::Arc, time::{Duration, SystemTime}, }; use anyhow::{bail, Context}; +use aws_sdk_s3::types::StorageClass; use camino::{Utf8Path, Utf8PathBuf}; use bytes::Bytes; @@ -563,6 +565,7 @@ pub struct S3Config { /// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details. pub concurrency_limit: NonZeroUsize, pub max_keys_per_list_response: Option, + pub upload_storage_class: Option, } impl Debug for S3Config { @@ -691,6 +694,18 @@ impl RemoteStorageConfig { endpoint, concurrency_limit, max_keys_per_list_response, + upload_storage_class: toml + .get("upload_storage_class") + .map(|prefix_in_bucket| -> anyhow::Result<_> { + let s = parse_toml_string("upload_storage_class", prefix_in_bucket)?; + let storage_class = StorageClass::from_str(&s).expect("infallible"); + #[allow(deprecated)] + if matches!(storage_class, StorageClass::Unknown(_)) { + bail!("Specified storage class unknown to SDK: '{s}'. 
Allowed values: {:?}", StorageClass::values()); + } + Ok(storage_class) + }) + .transpose()?, }) } (_, _, _, Some(_), None) => { diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 8091681221..c0b89cee2a 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -30,7 +30,7 @@ use aws_sdk_s3::{ config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep}, error::SdkError, operation::get_object::GetObjectError, - types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion}, + types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass}, Client, }; use aws_smithy_async::rt::sleep::TokioSleep; @@ -62,6 +62,7 @@ pub struct S3Bucket { bucket_name: String, prefix_in_bucket: Option, max_keys_per_list_response: Option, + upload_storage_class: Option, concurrency_limiter: ConcurrencyLimiter, // Per-request timeout. Accessible for tests. pub timeout: Duration, @@ -154,6 +155,7 @@ impl S3Bucket { max_keys_per_list_response: aws_config.max_keys_per_list_response, prefix_in_bucket, concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()), + upload_storage_class: aws_config.upload_storage_class.clone(), timeout, }) } @@ -582,6 +584,7 @@ impl RemoteStorage for S3Bucket { .bucket(self.bucket_name.clone()) .key(self.relative_path_to_s3_object(to)) .set_metadata(metadata.map(|m| m.0)) + .set_storage_class(self.upload_storage_class.clone()) .content_length(from_size_bytes.try_into()?) .body(bytes_stream) .send(); @@ -633,6 +636,7 @@ impl RemoteStorage for S3Bucket { .copy_object() .bucket(self.bucket_name.clone()) .key(self.relative_path_to_s3_object(to)) + .set_storage_class(self.upload_storage_class.clone()) .copy_source(copy_source) .send(); @@ -890,6 +894,7 @@ impl RemoteStorage for S3Bucket { .copy_object() .bucket(self.bucket_name.clone()) .key(key) + .set_storage_class(self.upload_storage_class.clone()) .copy_source(&source_id) .send(); @@ -1073,6 +1078,7 @@ mod tests { endpoint: None, concurrency_limit: NonZeroUsize::new(100).unwrap(), max_keys_per_list_response: Some(5), + upload_storage_class: None, }; let storage = S3Bucket::new(&config, std::time::Duration::ZERO).expect("remote storage init"); diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index 01f6a532d6..a273abe867 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -380,6 +380,7 @@ fn create_s3_client( endpoint: None, concurrency_limit: NonZeroUsize::new(100).unwrap(), max_keys_per_list_response, + upload_storage_class: None, }), timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }; diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index e10db2b853..10d5a22797 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -1557,6 +1557,7 @@ broker_endpoint = '{broker_endpoint}' endpoint: Some(endpoint.clone()), concurrency_limit: s3_concurrency_limit, max_keys_per_list_response: None, + upload_storage_class: None, }), timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }, diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index e061216d15..9600321937 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -413,6 +413,7 @@ mod tests { ) .unwrap(), max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, + upload_storage_class: None, }), timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }) From e8814b6f81388d389f629a80f3620de99283a79c 
Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 25 Apr 2024 10:46:07 +0100 Subject: [PATCH 0614/1571] controller: limit Reconciler concurrency (#7493) ## Problem Storage controller memory can spike very high if we have many tenants and they all try to reconcile at the same time. Related: - https://github.com/neondatabase/neon/issues/7463 - https://github.com/neondatabase/neon/issues/7460 Not closing those issues in this PR, because the test coverage for them will be in https://github.com/neondatabase/neon/pull/7475 ## Summary of changes - Add a CLI arg `--reconciler-concurrency`, defaulted to 128 - Add a semaphore to Service with this many units - In `maybe_reconcile_shard`, try to acquire semaphore unit. If we can't get one, return a ReconcileWaiter for a future sequence number, and push the TenantShardId onto a channel of delayed IDs. - In `process_result`, consume from the channel of delayed IDs if there are semaphore units available and call maybe_reconcile_shard again for these delayed shards. This has been tested in https://github.com/neondatabase/neon/pull/7475, but will land that PR separately because it contains other changes & needs the test stabilizing. This change is worth merging sooner, because it fixes a practical issue with larger shard counts. --- storage_controller/src/main.rs | 11 +- storage_controller/src/reconciler.rs | 17 ++++ storage_controller/src/service.rs | 124 +++++++++++++++++++++-- storage_controller/src/tenant_shard.rs | 133 ++++++++++++++++++------- 4 files changed, 238 insertions(+), 47 deletions(-) diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index ca55d6c593..d84803733a 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -9,7 +9,9 @@ use std::time::Duration; use storage_controller::http::make_router; use storage_controller::metrics::preinitialize_metrics; use storage_controller::persistence::Persistence; -use storage_controller::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT}; +use storage_controller::service::{ + Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, +}; use tokio::signal::unix::SignalKind; use tokio_util::sync::CancellationToken; use utils::auth::{JwtAuth, SwappableJwtAuth}; @@ -64,6 +66,10 @@ struct Cli { /// Grace period before marking unresponsive pageserver offline #[arg(long)] max_unavailable_interval: Option, + + /// Maximum number of reconcilers that may run in parallel + #[arg(long)] + reconciler_concurrency: Option, } enum StrictMode { @@ -243,6 +249,9 @@ async fn async_main() -> anyhow::Result<()> { .max_unavailable_interval .map(humantime::Duration::into) .unwrap_or(MAX_UNAVAILABLE_INTERVAL_DEFAULT), + reconciler_concurrency: args + .reconciler_concurrency + .unwrap_or(RECONCILER_CONCURRENCY_DEFAULT), }; // After loading secrets & config, but before starting anything else, apply database migrations diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 49cfaad569..28801ede6e 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -51,6 +51,10 @@ pub(super) struct Reconciler { /// so that we can set [`crate::tenant_shard::TenantShard::pending_compute_notification`] to ensure a later retry. pub(crate) compute_notify_failure: bool, + /// Reconciler is responsible for keeping alive semaphore units that limit concurrency on how many + /// we will spawn. 
+ pub(crate) _resource_units: ReconcileUnits, + /// A means to abort background reconciliation: it is essential to /// call this when something changes in the original TenantShard that /// will make this reconciliation impossible or unnecessary, for @@ -66,6 +70,19 @@ pub(super) struct Reconciler { pub(crate) persistence: Arc, } +/// RAII resource units granted to a Reconciler, which it should keep alive until it finishes doing I/O +pub(crate) struct ReconcileUnits { + _sem_units: tokio::sync::OwnedSemaphorePermit, +} + +impl ReconcileUnits { + pub(crate) fn new(sem_units: tokio::sync::OwnedSemaphorePermit) -> Self { + Self { + _sem_units: sem_units, + } + } +} + /// This is a snapshot of [`crate::tenant_shard::IntentState`], but it does not do any /// reference counting for Scheduler. The IntentState is what the scheduler works with, /// and the TargetState is just the instruction for a particular Reconciler run. diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 0565f8e7b4..2e6f3750e7 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -10,8 +10,9 @@ use std::{ use crate::{ id_lock_map::IdLockMap, persistence::{AbortShardSplitStatus, TenantFilter}, - reconciler::ReconcileError, + reconciler::{ReconcileError, ReconcileUnits}, scheduler::{ScheduleContext, ScheduleMode}, + tenant_shard::ReconcileNeeded, }; use anyhow::Context; use control_plane::storage_controller::{ @@ -48,7 +49,7 @@ use pageserver_api::{ }, }; use pageserver_client::mgmt_api; -use tokio::sync::OwnedRwLockWriteGuard; +use tokio::sync::{mpsc::error::TrySendError, OwnedRwLockWriteGuard}; use tokio_util::sync::CancellationToken; use tracing::instrument; use utils::{ @@ -90,6 +91,13 @@ pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30); +pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128; + +// Depth of the channel used to enqueue shards for reconciliation when they can't do it immediately. +// This channel is finite-size to avoid using excessive memory if we get into a state where reconciles are finishing more slowly +// than they're being pushed onto the queue. +const MAX_DELAYED_RECONCILES: usize = 10000; + // Top level state available to all HTTP handlers struct ServiceState { tenants: BTreeMap, @@ -97,6 +105,9 @@ struct ServiceState { nodes: Arc>, scheduler: Scheduler, + + /// Queue of tenants who are waiting for concurrency limits to permit them to reconcile + delayed_reconcile_rx: tokio::sync::mpsc::Receiver, } impl ServiceState { @@ -104,11 +115,13 @@ impl ServiceState { nodes: HashMap, tenants: BTreeMap, scheduler: Scheduler, + delayed_reconcile_rx: tokio::sync::mpsc::Receiver, ) -> Self { Self { tenants, nodes: Arc::new(nodes), scheduler, + delayed_reconcile_rx, } } @@ -142,6 +155,9 @@ pub struct Config { /// considered active. Once the grace period elapses, the next heartbeat failure will /// mark the pagseserver offline. pub max_unavailable_interval: Duration, + + /// How many Reconcilers may be spawned concurrently + pub reconciler_concurrency: usize, } impl From for ApiError { @@ -180,6 +196,17 @@ pub struct Service { // that transition it to/from Active. 
node_op_locks: IdLockMap, + // Limit how many Reconcilers we will spawn concurrently + reconciler_concurrency: Arc, + + /// Queue of tenants who are waiting for concurrency limits to permit them to reconcile + /// Send into this queue to promptly attempt to reconcile this shard next time units are available. + /// + /// Note that this state logically lives inside ServiceInner, but carrying Sender here makes the code simpler + /// by avoiding needing a &mut ref to something inside the ServiceInner. This could be optimized to + /// use a VecDeque instead of a channel to reduce synchronization overhead, at the cost of some code complexity. + delayed_reconcile_tx: tokio::sync::mpsc::Sender, + // Process shutdown will fire this token cancel: CancellationToken, @@ -742,8 +769,9 @@ impl Service { } /// Apply the contents of a [`ReconcileResult`] to our in-memory state: if the reconciliation - /// was successful, this will update the observed state of the tenant such that subsequent - /// calls to [`TenantShard::maybe_reconcile`] will do nothing. + /// was successful and intent hasn't changed since the Reconciler was spawned, this will update + /// the observed state of the tenant such that subsequent calls to [`TenantShard::get_reconcile_needed`] + /// will indicate that reconciliation is not needed. #[instrument(skip_all, fields( tenant_id=%result.tenant_shard_id.tenant_id, shard_id=%result.tenant_shard_id.shard_slug(), sequence=%result.sequence @@ -804,6 +832,21 @@ impl Service { } } } + + // Maybe some other work can proceed now that this job finished. + if self.reconciler_concurrency.available_permits() > 0 { + while let Ok(tenant_shard_id) = locked.delayed_reconcile_rx.try_recv() { + let (nodes, tenants, _scheduler) = locked.parts_mut(); + if let Some(shard) = tenants.get_mut(&tenant_shard_id) { + shard.delayed_reconcile = false; + self.maybe_reconcile_shard(shard, nodes); + } + + if self.reconciler_concurrency.available_permits() == 0 { + break; + } + } + } } async fn process_results( @@ -986,6 +1029,9 @@ impl Service { let (startup_completion, startup_complete) = utils::completion::channel(); + let (delayed_reconcile_tx, delayed_reconcile_rx) = + tokio::sync::mpsc::channel(MAX_DELAYED_RECONCILES); + let cancel = CancellationToken::new(); let heartbeater = Heartbeater::new( config.jwt_token.clone(), @@ -994,13 +1040,20 @@ impl Service { ); let this = Arc::new(Self { inner: Arc::new(std::sync::RwLock::new(ServiceState::new( - nodes, tenants, scheduler, + nodes, + tenants, + scheduler, + delayed_reconcile_rx, ))), config: config.clone(), persistence, - compute_hook: Arc::new(ComputeHook::new(config)), + compute_hook: Arc::new(ComputeHook::new(config.clone())), result_tx, heartbeater, + reconciler_concurrency: Arc::new(tokio::sync::Semaphore::new( + config.reconciler_concurrency, + )), + delayed_reconcile_tx, abort_tx, startup_complete: startup_complete.clone(), cancel, @@ -1535,7 +1588,7 @@ impl Service { let (response, waiters) = self.do_tenant_create(create_req).await?; - if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await { + if let Err(e) = self.await_waiters(waiters, RECONCILE_TIMEOUT).await { // Avoid deadlock: reconcile may fail while notifying compute, if the cloud control plane refuses to // accept compute notifications while it is in the process of creating. Reconciliation will // be retried in the background. 
@@ -4053,20 +4106,64 @@ impl Service { Ok(()) } - /// Convenience wrapper around [`TenantShard::maybe_reconcile`] that provides - /// all the references to parts of Self that are needed + /// Wrap [`TenantShard`] reconciliation methods with acquisition of [`Gate`] and [`ReconcileUnits`], fn maybe_reconcile_shard( &self, shard: &mut TenantShard, nodes: &Arc>, ) -> Option { - shard.maybe_reconcile( + let reconcile_needed = shard.get_reconcile_needed(nodes); + + match reconcile_needed { + ReconcileNeeded::No => return None, + ReconcileNeeded::WaitExisting(waiter) => return Some(waiter), + ReconcileNeeded::Yes => { + // Fall through to try and acquire units for spawning reconciler + } + }; + + let units = match self.reconciler_concurrency.clone().try_acquire_owned() { + Ok(u) => ReconcileUnits::new(u), + Err(_) => { + tracing::info!(tenant_id=%shard.tenant_shard_id.tenant_id, shard_id=%shard.tenant_shard_id.shard_slug(), + "Concurrency limited: enqueued for reconcile later"); + if !shard.delayed_reconcile { + match self.delayed_reconcile_tx.try_send(shard.tenant_shard_id) { + Err(TrySendError::Closed(_)) => { + // Weird mid-shutdown case? + } + Err(TrySendError::Full(_)) => { + // It is safe to skip sending our ID in the channel: we will eventually get retried by the background reconcile task. + tracing::warn!( + "Many shards are waiting to reconcile: delayed_reconcile queue is full" + ); + } + Ok(()) => { + shard.delayed_reconcile = true; + } + } + } + + // We won't spawn a reconciler, but we will construct a waiter that waits for the shard's sequence + // number to advance. When this function is eventually called again and succeeds in getting units, + // it will spawn a reconciler that makes this waiter complete. + return Some(shard.future_reconcile_waiter()); + } + }; + + let Ok(gate_guard) = self.gate.enter() else { + // Gate closed: we're shutting down, drop out. + return None; + }; + + shard.spawn_reconciler( &self.result_tx, nodes, &self.compute_hook, &self.config, &self.persistence, - &self.gate, + units, + gate_guard, &self.cancel, ) } @@ -4088,6 +4185,11 @@ impl Service { schedule_context = ScheduleContext::default(); } + // Skip checking if this shard is already enqueued for reconciliation + if shard.delayed_reconcile { + continue; + } + // Eventual consistency: if an earlier reconcile job failed, and the shard is still // dirty, spawn another rone if self.maybe_reconcile_shard(shard, &pageservers).is_some() { diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 58b8ef8d5d..d69260b9e7 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -7,6 +7,7 @@ use std::{ use crate::{ metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome}, persistence::TenantShardPersistence, + reconciler::ReconcileUnits, scheduler::{AffinityScore, MaySchedule, ScheduleContext}, }; use pageserver_api::controller_api::{PlacementPolicy, ShardSchedulingPolicy}; @@ -22,7 +23,7 @@ use utils::{ generation::Generation, id::NodeId, seqwait::{SeqWait, SeqWaitError}, - sync::gate::Gate, + sync::gate::GateGuard, }; use crate::{ @@ -95,6 +96,10 @@ pub(crate) struct TenantShard { /// reconciliation, and timeline creation. pub(crate) splitting: SplitState, + /// If a tenant was enqueued for later reconcile due to hitting concurrency limit, this flag + /// is set. This flag is cleared when the tenant is popped off the delay queue. 
+ pub(crate) delayed_reconcile: bool, + /// Optionally wait for reconciliation to complete up to a particular /// sequence number. #[serde(skip)] @@ -113,8 +118,8 @@ pub(crate) struct TenantShard { pub(crate) last_error: std::sync::Arc>, /// If we have a pending compute notification that for some reason we weren't able to send, - /// set this to true. If this is set, calls to [`Self::maybe_reconcile`] will run a task to retry - /// sending it. This is the mechanism by which compute notifications are included in the scope + /// set this to true. If this is set, calls to [`Self::get_reconcile_needed`] will return Yes + /// and trigger a Reconciler run. This is the mechanism by which compute notifications are included in the scope /// of state that we publish externally in an eventually consistent way. pub(crate) pending_compute_notification: bool, @@ -353,6 +358,17 @@ pub(crate) struct ReconcilerHandle { cancel: CancellationToken, } +pub(crate) enum ReconcileNeeded { + /// shard either doesn't need reconciliation, or is forbidden from spawning a reconciler + /// in its current state (e.g. shard split in progress, or ShardSchedulingPolicy forbids it) + No, + /// shard has a reconciler running, and its intent hasn't changed since that one was + /// spawned: wait for the existing reconciler rather than spawning a new one. + WaitExisting(ReconcilerWaiter), + /// shard needs reconciliation: call into [`TenantShard::spawn_reconciler`] + Yes, +} + /// When a reconcile task completes, it sends this result object /// to be applied to the primary TenantShard. pub(crate) struct ReconcileResult { @@ -396,6 +412,7 @@ impl TenantShard { reconciler: None, splitting: SplitState::Idle, sequence: Sequence(1), + delayed_reconcile: false, waiter: Arc::new(SeqWait::new(Sequence(0))), error_waiter: Arc::new(SeqWait::new(Sequence(0))), last_error: Arc::default(), @@ -831,16 +848,10 @@ impl TenantShard { #[allow(clippy::too_many_arguments)] #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] - pub(crate) fn maybe_reconcile( + pub(crate) fn get_reconcile_needed( &mut self, - result_tx: &tokio::sync::mpsc::UnboundedSender, pageservers: &Arc>, - compute_hook: &Arc, - service_config: &service::Config, - persistence: &Arc, - gate: &Gate, - cancel: &CancellationToken, - ) -> Option { + ) -> ReconcileNeeded { // If there are any ambiguous observed states, and the nodes they refer to are available, // we should reconcile to clean them up. let mut dirty_observed = false; @@ -863,7 +874,7 @@ impl TenantShard { if !do_reconcile { tracing::info!("Not dirty, no reconciliation needed."); - return None; + return ReconcileNeeded::No; } // If we are currently splitting, then never start a reconciler task: the splitting logic @@ -871,7 +882,7 @@ impl TenantShard { // up top, so that we only log this message if we would otherwise have done a reconciliation. if !matches!(self.splitting, SplitState::Idle) { tracing::info!("Refusing to reconcile, splitting in progress"); - return None; + return ReconcileNeeded::No; } // Reconcile already in flight for the current sequence? 
@@ -881,7 +892,7 @@ impl TenantShard { "Reconciliation already in progress for sequence {:?}", self.sequence, ); - return Some(ReconcilerWaiter { + return ReconcileNeeded::WaitExisting(ReconcilerWaiter { tenant_shard_id: self.tenant_shard_id, seq_wait: self.waiter.clone(), error_seq_wait: self.error_waiter.clone(), @@ -900,10 +911,67 @@ impl TenantShard { // We only reach this point if there is work to do and we're going to skip // doing it: warn it obvious why this tenant isn't doing what it ought to. tracing::warn!("Skipping reconcile for policy {:?}", self.scheduling_policy); - return None; + return ReconcileNeeded::No; } } + ReconcileNeeded::Yes + } + + /// Ensure the sequence number is set to a value where waiting for this value will make us wait + /// for the next reconcile: i.e. it is ahead of all completed or running reconcilers. + /// + /// Constructing a ReconcilerWaiter with the resulting sequence number gives the property + /// that the waiter will not complete until some future Reconciler is constructed and run. + fn ensure_sequence_ahead(&mut self) { + // Find the highest sequence for which a Reconciler has previously run or is currently + // running + let max_seen = std::cmp::max( + self.reconciler + .as_ref() + .map(|r| r.sequence) + .unwrap_or(Sequence(0)), + std::cmp::max(self.waiter.load(), self.error_waiter.load()), + ); + + if self.sequence <= max_seen { + self.sequence = max_seen.next(); + } + } + + /// Create a waiter that will wait for some future Reconciler that hasn't been spawned yet. + /// + /// This is appropriate when you can't spawn a recociler (e.g. due to resource limits), but + /// you would like to wait until one gets spawned in the background. + pub(crate) fn future_reconcile_waiter(&mut self) -> ReconcilerWaiter { + self.ensure_sequence_ahead(); + + ReconcilerWaiter { + tenant_shard_id: self.tenant_shard_id, + seq_wait: self.waiter.clone(), + error_seq_wait: self.error_waiter.clone(), + error: self.last_error.clone(), + seq: self.sequence, + } + } + + #[allow(clippy::too_many_arguments)] + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] + pub(crate) fn spawn_reconciler( + &mut self, + result_tx: &tokio::sync::mpsc::UnboundedSender, + pageservers: &Arc>, + compute_hook: &Arc, + service_config: &service::Config, + persistence: &Arc, + units: ReconcileUnits, + gate_guard: GateGuard, + cancel: &CancellationToken, + ) -> Option { + // Reconcile in flight for a stale sequence? Our sequence's task will wait for it before + // doing our sequence's work. + let old_handle = self.reconciler.take(); + // Build list of nodes from which the reconciler should detach let mut detach = Vec::new(); for node_id in self.observed.locations.keys() { @@ -919,18 +987,9 @@ impl TenantShard { } } - // Reconcile in flight for a stale sequence? Our sequence's task will wait for it before - // doing our sequence's work. - let old_handle = self.reconciler.take(); - - let Ok(gate_guard) = gate.enter() else { - // Shutting down, don't start a reconciler - return None; - }; - // Advance the sequence before spawning a reconciler, so that sequence waiters // can distinguish between before+after the reconcile completes. 
- self.sequence = self.sequence.next(); + self.ensure_sequence_ahead(); let reconciler_cancel = cancel.child_token(); let reconciler_intent = TargetState::from_intent(pageservers, &self.intent); @@ -945,6 +1004,7 @@ impl TenantShard { compute_hook: compute_hook.clone(), service_config: service_config.clone(), _gate_guard: gate_guard, + _resource_units: units, cancel: reconciler_cancel.clone(), persistence: persistence.clone(), compute_notify_failure: false, @@ -1011,16 +1071,18 @@ impl TenantShard { status: outcome_label, }); - result_tx - .send(ReconcileResult { - sequence: reconcile_seq, - result, - tenant_shard_id: reconciler.tenant_shard_id, - generation: reconciler.generation, - observed: reconciler.observed, - pending_compute_notification: reconciler.compute_notify_failure, - }) - .ok(); + // Constructing result implicitly drops Reconciler, freeing any ReconcileUnits before the Service might + // try and schedule more work in response to our result. + let result = ReconcileResult { + sequence: reconcile_seq, + result, + tenant_shard_id: reconciler.tenant_shard_id, + generation: reconciler.generation, + observed: reconciler.observed, + pending_compute_notification: reconciler.compute_notify_failure, + }; + + result_tx.send(result).ok(); } .instrument(reconciler_span), ); @@ -1111,6 +1173,7 @@ impl TenantShard { error_waiter: Arc::new(SeqWait::new(Sequence::initial())), last_error: Arc::default(), pending_compute_notification: false, + delayed_reconcile: false, scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(), }) } From cdccab4bd9f39c4f491df2e3165b8ebc0af3e4bb Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 25 Apr 2024 11:14:04 +0100 Subject: [PATCH 0615/1571] reduce complexity of proxy protocol parse (#7078) ## Problem The `WithClientIp` AsyncRead/Write abstraction never filled me with much joy. I would just rather read the protocol header once and then get the remaining buf and reader. ## Summary of changes * Replace `WithClientIp::wait_for_addr` with `read_proxy_protocol`. * Replace `WithClientIp` with `ChainRW`. * Optimise `ChainRW` to make the standard path more optimal. --- proxy/src/protocol2.rs | 427 +++++++++++++++------------------------ proxy/src/proxy.rs | 14 +- proxy/src/proxy/tests.rs | 2 +- proxy/src/serverless.rs | 7 +- 4 files changed, 168 insertions(+), 282 deletions(-) diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index 70f9b4bfab..1dd4563514 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -1,42 +1,26 @@ //! Proxy Protocol V2 implementation use std::{ - future::{poll_fn, Future}, io, net::SocketAddr, - pin::{pin, Pin}, - task::{ready, Context, Poll}, + pin::Pin, + task::{Context, Poll}, }; -use bytes::{Buf, BytesMut}; -use hyper::server::conn::AddrIncoming; +use bytes::BytesMut; use pin_project_lite::pin_project; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf}; -pub struct ProxyProtocolAccept { - pub incoming: AddrIncoming, - pub protocol: &'static str, -} - pin_project! 
{ - pub struct WithClientIp { + /// A chained [`AsyncRead`] with [`AsyncWrite`] passthrough + pub struct ChainRW { #[pin] pub inner: T, buf: BytesMut, - tlv_bytes: u16, - state: ProxyParse, } } -#[derive(Clone, PartialEq, Debug)] -enum ProxyParse { - NotStarted, - - Finished(SocketAddr), - None, -} - -impl AsyncWrite for WithClientIp { +impl AsyncWrite for ChainRW { #[inline] fn poll_write( self: Pin<&mut Self>, @@ -71,267 +55,174 @@ impl AsyncWrite for WithClientIp { } } -impl WithClientIp { - pub fn new(inner: T) -> Self { - WithClientIp { - inner, - buf: BytesMut::with_capacity(128), - tlv_bytes: 0, - state: ProxyParse::NotStarted, - } - } - - pub fn client_addr(&self) -> Option { - match self.state { - ProxyParse::Finished(socket) => Some(socket), - _ => None, - } - } -} - -impl WithClientIp { - pub async fn wait_for_addr(&mut self) -> io::Result> { - match self.state { - ProxyParse::NotStarted => { - let mut pin = Pin::new(&mut *self); - let addr = poll_fn(|cx| pin.as_mut().poll_client_ip(cx)).await?; - match addr { - Some(addr) => self.state = ProxyParse::Finished(addr), - None => self.state = ProxyParse::None, - } - Ok(addr) - } - ProxyParse::Finished(addr) => Ok(Some(addr)), - ProxyParse::None => Ok(None), - } - } -} - /// Proxy Protocol Version 2 Header const HEADER: [u8; 12] = [ 0x0D, 0x0A, 0x0D, 0x0A, 0x00, 0x0D, 0x0A, 0x51, 0x55, 0x49, 0x54, 0x0A, ]; -impl WithClientIp { - /// implementation of - /// Version 2 (Binary Format) - fn poll_client_ip( - mut self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll>> { - // The binary header format starts with a constant 12 bytes block containing the protocol signature : - // \x0D \x0A \x0D \x0A \x00 \x0D \x0A \x51 \x55 \x49 \x54 \x0A - while self.buf.len() < 16 { - let mut this = self.as_mut().project(); - let bytes_read = pin!(this.inner.read_buf(this.buf)).poll(cx)?; +pub async fn read_proxy_protocol( + mut read: T, +) -> std::io::Result<(ChainRW, Option)> { + let mut buf = BytesMut::with_capacity(128); + while buf.len() < 16 { + let bytes_read = read.read_buf(&mut buf).await?; - // exit for bad header - let len = usize::min(self.buf.len(), HEADER.len()); - if self.buf[..len] != HEADER[..len] { - return Poll::Ready(Ok(None)); - } - - // if no more bytes available then exit - if ready!(bytes_read) == 0 { - return Poll::Ready(Ok(None)); - }; + // exit for bad header + let len = usize::min(buf.len(), HEADER.len()); + if buf[..len] != HEADER[..len] { + return Ok((ChainRW { inner: read, buf }, None)); } - // The next byte (the 13th one) is the protocol version and command. - // The highest four bits contains the version. As of this specification, it must - // always be sent as \x2 and the receiver must only accept this value. - let vc = self.buf[12]; - let version = vc >> 4; - let command = vc & 0b1111; - if version != 2 { - return Poll::Ready(Err(io::Error::new( + // if no more bytes available then exit + if bytes_read == 0 { + return Ok((ChainRW { inner: read, buf }, None)); + }; + } + + let header = buf.split_to(16); + + // The next byte (the 13th one) is the protocol version and command. + // The highest four bits contains the version. As of this specification, it must + // always be sent as \x2 and the receiver must only accept this value. + let vc = header[12]; + let version = vc >> 4; + let command = vc & 0b1111; + if version != 2 { + return Err(io::Error::new( + io::ErrorKind::Other, + "invalid proxy protocol version. 
expected version 2", + )); + } + match command { + // the connection was established on purpose by the proxy + // without being relayed. The connection endpoints are the sender and the + // receiver. Such connections exist when the proxy sends health-checks to the + // server. The receiver must accept this connection as valid and must use the + // real connection endpoints and discard the protocol block including the + // family which is ignored. + 0 => {} + // the connection was established on behalf of another node, + // and reflects the original connection endpoints. The receiver must then use + // the information provided in the protocol block to get original the address. + 1 => {} + // other values are unassigned and must not be emitted by senders. Receivers + // must drop connections presenting unexpected values here. + _ => { + return Err(io::Error::new( io::ErrorKind::Other, - "invalid proxy protocol version. expected version 2", - ))); + "invalid proxy protocol command. expected local (0) or proxy (1)", + )) } - match command { - // the connection was established on purpose by the proxy - // without being relayed. The connection endpoints are the sender and the - // receiver. Such connections exist when the proxy sends health-checks to the - // server. The receiver must accept this connection as valid and must use the - // real connection endpoints and discard the protocol block including the - // family which is ignored. - 0 => {} - // the connection was established on behalf of another node, - // and reflects the original connection endpoints. The receiver must then use - // the information provided in the protocol block to get original the address. - 1 => {} - // other values are unassigned and must not be emitted by senders. Receivers - // must drop connections presenting unexpected values here. - _ => { - return Poll::Ready(Err(io::Error::new( - io::ErrorKind::Other, - "invalid proxy protocol command. expected local (0) or proxy (1)", - ))) - } - }; + }; - // The 14th byte contains the transport protocol and address family. The highest 4 - // bits contain the address family, the lowest 4 bits contain the protocol. - let ft = self.buf[13]; - let address_length = match ft { - // - \x11 : TCP over IPv4 : the forwarded connection uses TCP over the AF_INET - // protocol family. Address length is 2*4 + 2*2 = 12 bytes. - // - \x12 : UDP over IPv4 : the forwarded connection uses UDP over the AF_INET - // protocol family. Address length is 2*4 + 2*2 = 12 bytes. - 0x11 | 0x12 => 12, - // - \x21 : TCP over IPv6 : the forwarded connection uses TCP over the AF_INET6 - // protocol family. Address length is 2*16 + 2*2 = 36 bytes. - // - \x22 : UDP over IPv6 : the forwarded connection uses UDP over the AF_INET6 - // protocol family. Address length is 2*16 + 2*2 = 36 bytes. - 0x21 | 0x22 => 36, - // unspecified or unix stream. ignore the addresses - _ => 0, - }; + // The 14th byte contains the transport protocol and address family. The highest 4 + // bits contain the address family, the lowest 4 bits contain the protocol. + let ft = header[13]; + let address_length = match ft { + // - \x11 : TCP over IPv4 : the forwarded connection uses TCP over the AF_INET + // protocol family. Address length is 2*4 + 2*2 = 12 bytes. + // - \x12 : UDP over IPv4 : the forwarded connection uses UDP over the AF_INET + // protocol family. Address length is 2*4 + 2*2 = 12 bytes. + 0x11 | 0x12 => 12, + // - \x21 : TCP over IPv6 : the forwarded connection uses TCP over the AF_INET6 + // protocol family. 
Address length is 2*16 + 2*2 = 36 bytes. + // - \x22 : UDP over IPv6 : the forwarded connection uses UDP over the AF_INET6 + // protocol family. Address length is 2*16 + 2*2 = 36 bytes. + 0x21 | 0x22 => 36, + // unspecified or unix stream. ignore the addresses + _ => 0, + }; - // The 15th and 16th bytes is the address length in bytes in network endian order. - // It is used so that the receiver knows how many address bytes to skip even when - // it does not implement the presented protocol. Thus the length of the protocol - // header in bytes is always exactly 16 + this value. When a sender presents a - // LOCAL connection, it should not present any address so it sets this field to - // zero. Receivers MUST always consider this field to skip the appropriate number - // of bytes and must not assume zero is presented for LOCAL connections. When a - // receiver accepts an incoming connection showing an UNSPEC address family or - // protocol, it may or may not decide to log the address information if present. - let remaining_length = u16::from_be_bytes(self.buf[14..16].try_into().unwrap()); - if remaining_length < address_length { - return Poll::Ready(Err(io::Error::new( - io::ErrorKind::Other, - "invalid proxy protocol length. not enough to fit requested IP addresses", - ))); + // The 15th and 16th bytes is the address length in bytes in network endian order. + // It is used so that the receiver knows how many address bytes to skip even when + // it does not implement the presented protocol. Thus the length of the protocol + // header in bytes is always exactly 16 + this value. When a sender presents a + // LOCAL connection, it should not present any address so it sets this field to + // zero. Receivers MUST always consider this field to skip the appropriate number + // of bytes and must not assume zero is presented for LOCAL connections. When a + // receiver accepts an incoming connection showing an UNSPEC address family or + // protocol, it may or may not decide to log the address information if present. + let remaining_length = u16::from_be_bytes(header[14..16].try_into().unwrap()); + if remaining_length < address_length { + return Err(io::Error::new( + io::ErrorKind::Other, + "invalid proxy protocol length. not enough to fit requested IP addresses", + )); + } + drop(header); + + while buf.len() < remaining_length as usize { + if read.read_buf(&mut buf).await? == 0 { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "stream closed while waiting for proxy protocol addresses", + )); } - - while self.buf.len() < 16 + address_length as usize { - let mut this = self.as_mut().project(); - if ready!(pin!(this.inner.read_buf(this.buf)).poll(cx)?) == 0 { - return Poll::Ready(Err(io::Error::new( - io::ErrorKind::UnexpectedEof, - "stream closed while waiting for proxy protocol addresses", - ))); - } - } - - let this = self.as_mut().project(); - - // we are sure this is a proxy protocol v2 entry and we have read all the bytes we need - // discard the header we have parsed - this.buf.advance(16); - - // Starting from the 17th byte, addresses are presented in network byte order. 
- // The address order is always the same : - // - source layer 3 address in network byte order - // - destination layer 3 address in network byte order - // - source layer 4 address if any, in network byte order (port) - // - destination layer 4 address if any, in network byte order (port) - let addresses = this.buf.split_to(address_length as usize); - let socket = match address_length { - 12 => { - let src_addr: [u8; 4] = addresses[0..4].try_into().unwrap(); - let src_port = u16::from_be_bytes(addresses[8..10].try_into().unwrap()); - Some(SocketAddr::from((src_addr, src_port))) - } - 36 => { - let src_addr: [u8; 16] = addresses[0..16].try_into().unwrap(); - let src_port = u16::from_be_bytes(addresses[32..34].try_into().unwrap()); - Some(SocketAddr::from((src_addr, src_port))) - } - _ => None, - }; - - *this.tlv_bytes = remaining_length - address_length; - self.as_mut().skip_tlv_inner(); - - Poll::Ready(Ok(socket)) } - #[cold] - fn read_ip(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - let ip = ready!(self.as_mut().poll_client_ip(cx)?); - match ip { - Some(x) => *self.as_mut().project().state = ProxyParse::Finished(x), - None => *self.as_mut().project().state = ProxyParse::None, + // Starting from the 17th byte, addresses are presented in network byte order. + // The address order is always the same : + // - source layer 3 address in network byte order + // - destination layer 3 address in network byte order + // - source layer 4 address if any, in network byte order (port) + // - destination layer 4 address if any, in network byte order (port) + let addresses = buf.split_to(remaining_length as usize); + let socket = match address_length { + 12 => { + let src_addr: [u8; 4] = addresses[0..4].try_into().unwrap(); + let src_port = u16::from_be_bytes(addresses[8..10].try_into().unwrap()); + Some(SocketAddr::from((src_addr, src_port))) } - Poll::Ready(Ok(())) - } + 36 => { + let src_addr: [u8; 16] = addresses[0..16].try_into().unwrap(); + let src_port = u16::from_be_bytes(addresses[32..34].try_into().unwrap()); + Some(SocketAddr::from((src_addr, src_port))) + } + _ => None, + }; - #[cold] - fn skip_tlv(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - let mut this = self.as_mut().project(); - // we know that this.buf is empty - debug_assert_eq!(this.buf.len(), 0); - - this.buf.reserve((*this.tlv_bytes).clamp(0, 1024) as usize); - ready!(pin!(this.inner.read_buf(this.buf)).poll(cx)?); - self.skip_tlv_inner(); - - Poll::Ready(Ok(())) - } - - fn skip_tlv_inner(self: Pin<&mut Self>) { - let tlv_bytes_read = match u16::try_from(self.buf.len()) { - // we read more than u16::MAX therefore we must have read the full tlv_bytes - Err(_) => self.tlv_bytes, - // we might not have read the full tlv bytes yet - Ok(n) => u16::min(n, self.tlv_bytes), - }; - let this = self.project(); - *this.tlv_bytes -= tlv_bytes_read; - this.buf.advance(tlv_bytes_read as usize); - } + Ok((ChainRW { inner: read, buf }, socket)) } -impl AsyncRead for WithClientIp { +impl AsyncRead for ChainRW { #[inline] fn poll_read( - mut self: Pin<&mut Self>, + self: Pin<&mut Self>, cx: &mut Context<'_>, buf: &mut ReadBuf<'_>, ) -> Poll> { - // I'm assuming these 3 comparisons will be easy to branch predict. - // especially with the cold attributes - // which should make this read wrapper almost invisible - - if let ProxyParse::NotStarted = self.state { - ready!(self.as_mut().read_ip(cx)?); - } - - while self.tlv_bytes > 0 { - ready!(self.as_mut().skip_tlv(cx)?) 
- } - - let this = self.project(); - if this.buf.is_empty() { - this.inner.poll_read(cx, buf) + if self.buf.is_empty() { + self.project().inner.poll_read(cx, buf) } else { - // we know that tlv_bytes is 0 - debug_assert_eq!(*this.tlv_bytes, 0); - - let write = usize::min(this.buf.len(), buf.remaining()); - let slice = this.buf.split_to(write).freeze(); - buf.put_slice(&slice); - - // reset the allocation so it can be freed - if this.buf.is_empty() { - *this.buf = BytesMut::new(); - } - - Poll::Ready(Ok(())) + self.read_from_buf(buf) } } } +impl ChainRW { + #[cold] + fn read_from_buf(self: Pin<&mut Self>, buf: &mut ReadBuf<'_>) -> Poll> { + debug_assert!(!self.buf.is_empty()); + let this = self.project(); + + let write = usize::min(this.buf.len(), buf.remaining()); + let slice = this.buf.split_to(write).freeze(); + buf.put_slice(&slice); + + // reset the allocation so it can be freed + if this.buf.is_empty() { + *this.buf = BytesMut::new(); + } + + Poll::Ready(Ok(())) + } +} + #[cfg(test)] mod tests { - use std::pin::pin; - use tokio::io::AsyncReadExt; - use crate::protocol2::{ProxyParse, WithClientIp}; + use crate::protocol2::read_proxy_protocol; #[tokio::test] async fn test_ipv4() { @@ -353,16 +244,15 @@ mod tests { let extra_data = [0x55; 256]; - let mut read = pin!(WithClientIp::new(header.chain(extra_data.as_slice()))); + let (mut read, addr) = read_proxy_protocol(header.chain(extra_data.as_slice())) + .await + .unwrap(); let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, extra_data); - assert_eq!( - read.state, - ProxyParse::Finished(([127, 0, 0, 1], 65535).into()) - ); + assert_eq!(addr, Some(([127, 0, 0, 1], 65535).into())); } #[tokio::test] @@ -385,17 +275,17 @@ mod tests { let extra_data = [0x55; 256]; - let mut read = pin!(WithClientIp::new(header.chain(extra_data.as_slice()))); + let (mut read, addr) = read_proxy_protocol(header.chain(extra_data.as_slice())) + .await + .unwrap(); let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, extra_data); assert_eq!( - read.state, - ProxyParse::Finished( - ([15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0], 257).into() - ) + addr, + Some(([15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0], 257).into()) ); } @@ -403,24 +293,24 @@ mod tests { async fn test_invalid() { let data = [0x55; 256]; - let mut read = pin!(WithClientIp::new(data.as_slice())); + let (mut read, addr) = read_proxy_protocol(data.as_slice()).await.unwrap(); let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, data); - assert_eq!(read.state, ProxyParse::None); + assert_eq!(addr, None); } #[tokio::test] async fn test_short() { let data = [0x55; 10]; - let mut read = pin!(WithClientIp::new(data.as_slice())); + let (mut read, addr) = read_proxy_protocol(data.as_slice()).await.unwrap(); let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, data); - assert_eq!(read.state, ProxyParse::None); + assert_eq!(addr, None); } #[tokio::test] @@ -446,15 +336,14 @@ mod tests { let extra_data = [0xaa; 256]; - let mut read = pin!(WithClientIp::new(header.chain(extra_data.as_slice()))); + let (mut read, addr) = read_proxy_protocol(header.chain(extra_data.as_slice())) + .await + .unwrap(); let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, extra_data); - assert_eq!( - read.state, - ProxyParse::Finished(([55, 56, 57, 58], 65535).into()) - ); + assert_eq!(addr, Some(([55, 56, 57, 58], 65535).into())); } } diff --git 
a/proxy/src/proxy.rs b/proxy/src/proxy.rs index a4554eef38..ddae6536fb 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -17,7 +17,7 @@ use crate::{ context::RequestMonitoring, error::ReportableError, metrics::{Metrics, NumClientConnectionsGuard}, - protocol2::WithClientIp, + protocol2::read_proxy_protocol, proxy::handshake::{handshake, HandshakeData}, stream::{PqStream, Stream}, EndpointCacheKey, @@ -88,20 +88,18 @@ pub async fn task_main( tracing::info!(protocol = "tcp", %session_id, "accepted new TCP connection"); connections.spawn(async move { - let mut socket = WithClientIp::new(socket); - let mut peer_addr = peer_addr.ip(); - match socket.wait_for_addr().await { - Ok(Some(addr)) => peer_addr = addr.ip(), + let (socket, peer_addr) = match read_proxy_protocol(socket).await{ + Ok((socket, Some(addr))) => (socket, addr.ip()), Err(e) => { error!("per-client task finished with an error: {e:#}"); return; } - Ok(None) if config.require_client_ip => { + Ok((_socket, None)) if config.require_client_ip => { error!("missing required client IP"); return; } - Ok(None) => {} - } + Ok((socket, None)) => (socket, peer_addr.ip()) + }; match socket.inner.set_nodelay(true) { Ok(()) => {}, diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index e0ec90cb44..ad48af0093 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -174,7 +174,7 @@ async fn dummy_proxy( tls: Option, auth: impl TestAuth + Send, ) -> anyhow::Result<()> { - let client = WithClientIp::new(client); + let (client, _) = read_proxy_protocol(client).await?; let mut stream = match handshake(client, tls.as_ref(), false).await? { HandshakeData::Startup(stream, _) => stream, HandshakeData::Cancel(_) => bail!("cancellation not supported"), diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index b0f4026c76..1a0d1f7b0e 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -33,7 +33,7 @@ use crate::cancellation::CancellationHandlerMain; use crate::config::ProxyConfig; use crate::context::RequestMonitoring; use crate::metrics::Metrics; -use crate::protocol2::WithClientIp; +use crate::protocol2::read_proxy_protocol; use crate::proxy::run_until_cancelled; use crate::serverless::backend::PoolingBackend; use crate::serverless::http_util::{api_error_into_response, json_response}; @@ -158,9 +158,8 @@ async fn connection_handler( .guard(crate::metrics::Protocol::Http); // handle PROXY protocol - let mut conn = WithClientIp::new(conn); - let peer = match conn.wait_for_addr().await { - Ok(peer) => peer, + let (conn, peer) = match read_proxy_protocol(conn).await { + Ok(c) => c, Err(e) => { tracing::error!(?session_id, %peer_addr, "failed to accept TCP connection: invalid PROXY protocol V2 header: {e:#}"); return; From a3d62b31bbafdf15ce6c83ea7bcd594f5870193a Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Thu, 25 Apr 2024 13:16:27 +0200 Subject: [PATCH 0616/1571] Update connect to compute and wake compute retry configs (#7509) ## Problem ## Summary of changes Decrease waiting time --- proxy/src/config.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/proxy/src/config.rs b/proxy/src/config.rs index ae7606e5d4..a32ab8c43c 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -533,13 +533,13 @@ pub struct RetryConfig { impl RetryConfig { /// Default options for RetryConfig. - /// Total delay for 4 retries with 1s base delay and 2.0 backoff factor is 7s. 
+ /// Total delay for 8 retries with 100ms base delay and 1.6 backoff factor is about 7s. pub const CONNECT_TO_COMPUTE_DEFAULT_VALUES: &'static str = - "num_retries=4,base_retry_wait_duration=1s,retry_wait_exponent_base=2.0"; - /// Total delay for 4 retries with 1s base delay and 2.0 backoff factor is 7s. - /// Cplane has timeout of 60s on each request. + "num_retries=8,base_retry_wait_duration=100ms,retry_wait_exponent_base=1.6"; + /// Total delay for 8 retries with 100ms base delay and 1.6 backoff factor is about 7s. + /// Cplane has timeout of 60s on each request. 8m7s in total. pub const WAKE_COMPUTE_DEFAULT_VALUES: &'static str = - "num_retries=4,base_retry_wait_duration=1s,retry_wait_exponent_base=2.0"; + "num_retries=8,base_retry_wait_duration=100ms,retry_wait_exponent_base=1.6"; /// Parse retry options passed via cmdline. /// Example: [`Self::CONNECT_TO_COMPUTE_DEFAULT_VALUES`]. From b1d47f39117ed55dfcee7c8afe0b7c32f0336b8e Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Thu, 25 Apr 2024 13:38:51 +0200 Subject: [PATCH 0617/1571] proxy: Fix cancellations (#7510) ## Problem Cancellations were published to the channel, that was never read. ## Summary of changes Fallback to global redis publishing. --- proxy/src/bin/proxy.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 760ccf40d4..a1b4c21947 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -339,7 +339,7 @@ async fn main() -> anyhow::Result<()> { let cancel_map = CancelMap::default(); - let redis_publisher = match ®ional_redis_client { + let redis_publisher = match &redis_notifications_client { Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new( redis_publisher.clone(), args.region.clone(), From e4a279db132b532c31da97daf09bd133f6c70bcc Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 25 Apr 2024 13:29:17 +0100 Subject: [PATCH 0618/1571] pageserver: coalesce read paths (#7477) ## Problem We are currently supporting two read paths. No bueno. ## Summary of changes High level: use vectored read path to serve get page requests - gated by `get_impl` config Low level: 1. Add ps config, `get_impl` to specify which read path to use when serving get page requests 2. Fix base cached image handling for the vectored read path. This was subtly broken: previously we would not mark keys that went past their cached lsn as complete. This is a self standing change which could be its own PR, but I've included it here because writing separate tests for it is tricky. 3. Fork get page to use either the legacy or vectored implementation 4. Validate the use of vectored read path when serving get page requests against the legacy implementation. Controlled by `validate_vectored_get` ps config. 5. Use the vectored read path to serve get page requests in tests (with validation). ## Note Since the vectored read path does not go through the page cache to read buffers, this change also amounts to a removal of the buffer page cache. Materialized page cache is still used. 
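For illustration, a minimal sketch of the gating described above, using simplified stand-in types and functions rather than the real `Timeline`/`PageServerConf` signatures (which appear in the diff below). The names `serve_get_page`, `get_legacy` and `get_vectored` are placeholders, not the actual method names:

```rust
// Simplified stand-ins for the real config and read paths; see the diff
// below for the actual `GetImpl` enum and `Timeline::get` implementation.
#[derive(Clone, Copy)]
enum GetImpl {
    Legacy,
    Vectored,
}

struct Conf {
    get_impl: GetImpl,
    validate_vectored_get: bool,
}

type Page = Vec<u8>;

// Stand-in for the legacy single-key read path.
fn get_legacy(_key: u64) -> Result<Page, String> {
    Ok(vec![0u8; 8192])
}

// Stand-in for the vectored read path over a key range.
fn get_vectored(keys: std::ops::Range<u64>) -> Result<Vec<(u64, Page)>, String> {
    Ok(keys.map(|k| (k, vec![0u8; 8192])).collect())
}

fn serve_get_page(conf: &Conf, key: u64) -> Result<Page, String> {
    match conf.get_impl {
        GetImpl::Legacy => get_legacy(key),
        GetImpl::Vectored => {
            // A singular get becomes a one-key keyspace; the single result
            // is popped back out afterwards.
            let mut results = get_vectored(key..key + 1)?;
            let (_, page) = results
                .pop()
                .ok_or_else(|| format!("no value returned for key {key}"))?;
            if conf.validate_vectored_get {
                // While the new path is rolled out, cross-check it against
                // the legacy implementation (the patch does this via its own
                // validation helper rather than an assertion).
                debug_assert_eq!(get_legacy(key).ok().as_deref(), Some(page.as_slice()));
            }
            Ok(page)
        }
    }
}
```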
--- .github/workflows/build_and_test.yml | 1 + control_plane/src/local_env.rs | 2 + control_plane/src/pageserver.rs | 7 + libs/pageserver_api/src/keyspace.rs | 5 + pageserver/src/bin/pageserver.rs | 4 +- pageserver/src/config.rs | 22 ++- pageserver/src/tenant.rs | 20 ++- pageserver/src/tenant/storage_layer.rs | 36 +++- .../src/tenant/storage_layer/delta_layer.rs | 12 +- .../tenant/storage_layer/inmemory_layer.rs | 2 + pageserver/src/tenant/storage_layer/layer.rs | 6 + pageserver/src/tenant/timeline.rs | 157 +++++++++++++++--- test_runner/fixtures/neon_fixtures.py | 7 + test_runner/regress/test_broken_timeline.py | 9 +- test_runner/regress/test_compatibility.py | 5 + test_runner/regress/test_local_file_cache.py | 11 +- test_runner/regress/test_lsn_mapping.py | 10 +- test_runner/regress/test_pg_regress.py | 6 + 18 files changed, 277 insertions(+), 45 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index a7e108fac4..65b573663a 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -477,6 +477,7 @@ jobs: BUILD_TAG: ${{ needs.tag.outputs.build-tag }} PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring PAGESERVER_GET_VECTORED_IMPL: vectored + PAGESERVER_GET_IMPL: vectored # Temporary disable this step until we figure out why it's so flaky # Ref https://github.com/neondatabase/neon/issues/4540 diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 38b7fffd09..2168d4b944 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -129,6 +129,7 @@ pub struct PageServerConf { pub(crate) virtual_file_io_engine: Option, pub(crate) get_vectored_impl: Option, + pub(crate) get_impl: Option, } impl Default for PageServerConf { @@ -141,6 +142,7 @@ impl Default for PageServerConf { http_auth_type: AuthType::Trust, virtual_file_io_engine: None, get_vectored_impl: None, + get_impl: None, } } } diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index abf815f07a..adac7d7bb5 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -92,6 +92,7 @@ impl PageServerNode { http_auth_type, virtual_file_io_engine, get_vectored_impl, + get_impl, } = &self.conf; let id = format!("id={}", id); @@ -111,6 +112,11 @@ impl PageServerNode { } else { String::new() }; + let get_impl = if let Some(get_impl) = get_impl { + format!("get_impl='{get_impl}'") + } else { + String::new() + }; let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url()); @@ -124,6 +130,7 @@ impl PageServerNode { broker_endpoint_param, virtual_file_io_engine, get_vectored_impl, + get_impl, ]; if let Some(control_plane_api) = &self.env.control_plane_api { diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index a9e19e8cc7..f73648d306 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -182,6 +182,11 @@ impl KeySpace { pub fn overlaps(&self, range: &Range) -> bool { self.overlaps_at(range).is_some() } + + /// Check if the keyspace contains a key + pub fn contains(&self, key: &Key) -> bool { + self.overlaps(&(*key..key.next())) + } } /// diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 41835f9843..1345223a43 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -121,8 +121,10 @@ fn main() -> anyhow::Result<()> { &[("node_id", &conf.id.to_string())], ); - // after setting up logging, 
log the effective IO engine choice + // after setting up logging, log the effective IO engine choice and read path implementations info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); + info!(?conf.get_impl, "starting with get page implementation"); + info!(?conf.get_vectored_impl, "starting with vectored get page implementation"); let tenants_path = conf.tenants_path(); if !tenants_path.exists() { diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 10d5a22797..96fff1f0c0 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -30,9 +30,9 @@ use utils::{ logging::LogFormat, }; -use crate::tenant::config::TenantConfOpt; use crate::tenant::timeline::GetVectoredImpl; use crate::tenant::vectored_blob_io::MaxVectoredReadBytes; +use crate::tenant::{config::TenantConfOpt, timeline::GetImpl}; use crate::tenant::{ TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME, }; @@ -91,6 +91,8 @@ pub mod defaults { pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential"; + pub const DEFAULT_GET_IMPL: &str = "legacy"; + pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true; @@ -138,6 +140,8 @@ pub mod defaults { #get_vectored_impl = '{DEFAULT_GET_VECTORED_IMPL}' +#get_impl = '{DEFAULT_GET_IMPL}' + #max_vectored_read_bytes = '{DEFAULT_MAX_VECTORED_READ_BYTES}' #validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}' @@ -284,6 +288,8 @@ pub struct PageServerConf { pub get_vectored_impl: GetVectoredImpl, + pub get_impl: GetImpl, + pub max_vectored_read_bytes: MaxVectoredReadBytes, pub validate_vectored_get: bool, @@ -414,6 +420,8 @@ struct PageServerConfigBuilder { get_vectored_impl: BuilderValue, + get_impl: BuilderValue, + max_vectored_read_bytes: BuilderValue, validate_vectored_get: BuilderValue, @@ -503,6 +511,7 @@ impl PageServerConfigBuilder { virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()), get_vectored_impl: Set(DEFAULT_GET_VECTORED_IMPL.parse().unwrap()), + get_impl: Set(DEFAULT_GET_IMPL.parse().unwrap()), max_vectored_read_bytes: Set(MaxVectoredReadBytes( NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(), )), @@ -681,6 +690,10 @@ impl PageServerConfigBuilder { self.get_vectored_impl = BuilderValue::Set(value); } + pub fn get_impl(&mut self, value: GetImpl) { + self.get_impl = BuilderValue::Set(value); + } + pub fn get_max_vectored_read_bytes(&mut self, value: MaxVectoredReadBytes) { self.max_vectored_read_bytes = BuilderValue::Set(value); } @@ -750,6 +763,7 @@ impl PageServerConfigBuilder { secondary_download_concurrency, ingest_batch_size, get_vectored_impl, + get_impl, max_vectored_read_bytes, validate_vectored_get, ephemeral_bytes_per_memory_kb, @@ -1035,6 +1049,9 @@ impl PageServerConf { "get_vectored_impl" => { builder.get_vectored_impl(parse_toml_from_str("get_vectored_impl", item)?) } + "get_impl" => { + builder.get_impl(parse_toml_from_str("get_impl", item)?) + } "max_vectored_read_bytes" => { let bytes = parse_toml_u64("max_vectored_read_bytes", item)? 
as usize; builder.get_max_vectored_read_bytes( @@ -1126,6 +1143,7 @@ impl PageServerConf { ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(), + get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(), max_vectored_read_bytes: MaxVectoredReadBytes( NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) .expect("Invalid default constant"), @@ -1365,6 +1383,7 @@ background_task_maximum_delay = '334 s' ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(), + get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(), max_vectored_read_bytes: MaxVectoredReadBytes( NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) .expect("Invalid default constant") @@ -1438,6 +1457,7 @@ background_task_maximum_delay = '334 s' ingest_batch_size: 100, virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(), + get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(), max_vectored_read_bytes: MaxVectoredReadBytes( NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) .expect("Invalid default constant") diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 15350e93e9..ff6194ab00 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3865,6 +3865,7 @@ mod tests { use pageserver_api::key::NON_INHERITED_RANGE; use pageserver_api::keyspace::KeySpace; use rand::{thread_rng, Rng}; + use tests::storage_layer::ValuesReconstructState; use tests::timeline::{GetVectoredError, ShutdownMode}; static TEST_KEY: Lazy = @@ -4653,7 +4654,9 @@ mod tests { for read in reads { info!("Doing vectored read on {:?}", read); - let vectored_res = tline.get_vectored_impl(read.clone(), reads_lsn, &ctx).await; + let vectored_res = tline + .get_vectored_impl(read.clone(), reads_lsn, ValuesReconstructState::new(), &ctx) + .await; tline .validate_get_vectored_impl(&vectored_res, read, reads_lsn, &ctx) .await; @@ -4698,7 +4701,12 @@ mod tests { let read_lsn = child_timeline.get_last_record_lsn(); let vectored_res = child_timeline - .get_vectored_impl(aux_keyspace.clone(), read_lsn, &ctx) + .get_vectored_impl( + aux_keyspace.clone(), + read_lsn, + ValuesReconstructState::new(), + &ctx, + ) .await; child_timeline @@ -4846,7 +4854,12 @@ mod tests { ranges: vec![key_near_gap..gap_at_key.next(), key_near_end..current_key], }; let results = child_timeline - .get_vectored_impl(read.clone(), current_lsn, &ctx) + .get_vectored_impl( + read.clone(), + current_lsn, + ValuesReconstructState::new(), + &ctx, + ) .await?; for (key, img_res) in results { @@ -4979,6 +4992,7 @@ mod tests { ranges: vec![child_gap_at_key..child_gap_at_key.next()], }, query_lsn, + ValuesReconstructState::new(), &ctx, ) .await; diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 9ddd916700..4f1b56ef9f 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -148,6 +148,29 @@ impl ValuesReconstructState { self.layers_visited } + /// This function is called after reading a keyspace from a layer. + /// It checks if the read path has now moved past the cached Lsn for any keys. 
+ /// + /// Implementation note: We intentionally iterate over the keys for which we've + /// already collected some reconstruct data. This avoids scaling complexity with + /// the size of the search space. + pub(crate) fn on_lsn_advanced(&mut self, keyspace: &KeySpace, advanced_to: Lsn) { + for (key, value) in self.keys.iter_mut() { + if !keyspace.contains(key) { + continue; + } + + if let Ok(state) = value { + if state.situation != ValueReconstructSituation::Complete + && state.get_cached_lsn() >= Some(advanced_to) + { + state.situation = ValueReconstructSituation::Complete; + self.keys_done.add_key(*key); + } + } + } + } + /// Update the state collected for a given key. /// Returns true if this was the last value needed for the key and false otherwise. /// @@ -172,11 +195,18 @@ impl ValuesReconstructState { true } Value::WalRecord(rec) => { - let reached_cache = - state.get_cached_lsn().map(|clsn| clsn + 1) == Some(lsn); + debug_assert!( + Some(lsn) > state.get_cached_lsn(), + "Attempt to collect a record below cached LSN for walredo: {} < {}", + lsn, + state + .get_cached_lsn() + .expect("Assertion can only fire if a cached lsn is present") + ); + let will_init = rec.will_init(); state.records.push((lsn, rec)); - will_init || reached_cache + will_init } }, }; diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index a4b2b4f840..a9f8404158 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -217,6 +217,7 @@ pub struct DeltaLayerInner { // values copied from summary index_start_blk: u32, index_root_blk: u32, + lsn_range: Range, file: VirtualFile, file_id: FileId, @@ -745,6 +746,7 @@ impl DeltaLayerInner { file_id, index_start_blk: actual_summary.index_start_blk, index_root_blk: actual_summary.index_root_blk, + lsn_range: actual_summary.lsn_range, max_vectored_read_bytes, })) } @@ -869,7 +871,7 @@ impl DeltaLayerInner { let data_end_offset = self.index_start_offset(); let reads = Self::plan_reads( - keyspace, + &keyspace, lsn_range, data_end_offset, index_reader, @@ -883,11 +885,13 @@ impl DeltaLayerInner { self.do_reads_and_update_state(reads, reconstruct_state) .await; + reconstruct_state.on_lsn_advanced(&keyspace, self.lsn_range.start); + Ok(()) } async fn plan_reads( - keyspace: KeySpace, + keyspace: &KeySpace, lsn_range: Range, data_end_offset: u64, index_reader: DiskBtreeReader, @@ -1535,7 +1539,7 @@ mod test { // Plan and validate let vectored_reads = DeltaLayerInner::plan_reads( - keyspace.clone(), + &keyspace, lsn_range.clone(), disk_offset, reader, @@ -1787,7 +1791,7 @@ mod test { let data_end_offset = inner.index_start_blk as u64 * PAGE_SZ as u64; let vectored_reads = DeltaLayerInner::plan_reads( - keyspace.clone(), + &keyspace, entries_meta.lsn_range.clone(), data_end_offset, index_reader, diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index a86d0d48c5..5939b969d6 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -438,6 +438,8 @@ impl InMemoryLayer { } } + reconstruct_state.on_lsn_advanced(&keyspace, self.start_lsn); + Ok(()) } } diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 316a11f8cc..ee9de8de09 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -336,6 +336,12 @@ 
impl Layer { .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx) .instrument(tracing::debug_span!("get_values_reconstruct_data", layer=%self)) .await + .map_err(|err| match err { + GetVectoredError::Other(err) => GetVectoredError::Other( + err.context(format!("get_values_reconstruct_data for layer {self}")), + ), + err => err, + }) } /// Download the layer if evicted. diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 703654a37c..f1387e10ac 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -119,8 +119,8 @@ use self::layer_manager::LayerManager; use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; -use super::config::TenantConf; use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline}; +use super::{config::TenantConf, storage_layer::VectoredValueReconstructState}; use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer}; @@ -653,6 +653,19 @@ impl From for CreateImageLayersError { } } +impl From for PageReconstructError { + fn from(e: GetVectoredError) -> Self { + match e { + GetVectoredError::Cancelled => PageReconstructError::Cancelled, + GetVectoredError::InvalidLsn(_) => PageReconstructError::Other(anyhow!("Invalid LSN")), + err @ GetVectoredError::Oversized(_) => PageReconstructError::Other(err.into()), + err @ GetVectoredError::MissingKey(_) => PageReconstructError::Other(err.into()), + GetVectoredError::GetReadyAncestorError(err) => PageReconstructError::from(err), + GetVectoredError::Other(err) => PageReconstructError::Other(err), + } + } +} + impl From for PageReconstructError { fn from(e: GetReadyAncestorError) -> Self { use GetReadyAncestorError::*; @@ -682,6 +695,23 @@ pub enum GetVectoredImpl { Vectored, } +#[derive( + Eq, + PartialEq, + Debug, + Copy, + Clone, + strum_macros::EnumString, + strum_macros::Display, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, +)] +#[strum(serialize_all = "kebab-case")] +pub enum GetImpl { + Legacy, + Vectored, +} + pub(crate) enum WaitLsnWaiter<'a> { Timeline(&'a Timeline), Tenant, @@ -743,16 +773,6 @@ impl Timeline { key: Key, lsn: Lsn, ctx: &RequestContext, - ) -> Result { - self.timeline_get_throttle.throttle(ctx, 1).await; - self.get_impl(key, lsn, ctx).await - } - /// Not subject to [`Self::timeline_get_throttle`]. - async fn get_impl( - &self, - key: Key, - lsn: Lsn, - ctx: &RequestContext, ) -> Result { if !lsn.is_valid() { return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN"))); @@ -763,13 +783,7 @@ impl Timeline { // page_service. debug_assert!(!self.shard_identity.is_key_disposable(&key)); - // XXX: structured stats collection for layer eviction here. - trace!( - "get page request for {}@{} from task kind {:?}", - key, - lsn, - ctx.task_kind() - ); + self.timeline_get_throttle.throttle(ctx, 1).await; // Check the page cache. We will get back the most recent page with lsn <= `lsn`. 
// The cached image can be returned directly if there is no WAL between the cached image @@ -792,10 +806,81 @@ impl Timeline { None => None, }; - let mut reconstruct_state = ValueReconstructState { - records: Vec::new(), - img: cached_page_img, - }; + match self.conf.get_impl { + GetImpl::Legacy => { + let reconstruct_state = ValueReconstructState { + records: Vec::new(), + img: cached_page_img, + }; + + self.get_impl(key, lsn, reconstruct_state, ctx).await + } + GetImpl::Vectored => { + let keyspace = KeySpace { + ranges: vec![key..key.next()], + }; + + // Initialise the reconstruct state for the key with the cache + // entry returned above. + let mut reconstruct_state = ValuesReconstructState::new(); + let mut key_state = VectoredValueReconstructState::default(); + key_state.img = cached_page_img; + reconstruct_state.keys.insert(key, Ok(key_state)); + + let vectored_res = self + .get_vectored_impl(keyspace.clone(), lsn, reconstruct_state, ctx) + .await; + + if self.conf.validate_vectored_get { + self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx) + .await; + } + + let key_value = vectored_res?.pop_first(); + match key_value { + Some((got_key, value)) => { + if got_key != key { + error!( + "Expected {}, but singular vectored get returned {}", + key, got_key + ); + Err(PageReconstructError::Other(anyhow!( + "Singular vectored get returned wrong key" + ))) + } else { + value + } + } + None => { + error!( + "Expected {}, but singular vectored get returned nothing", + key + ); + Err(PageReconstructError::Other(anyhow!( + "Singular vectored get did not return a value for {}", + key + ))) + } + } + } + } + } + + /// Not subject to [`Self::timeline_get_throttle`]. + async fn get_impl( + &self, + key: Key, + lsn: Lsn, + mut reconstruct_state: ValueReconstructState, + ctx: &RequestContext, + ) -> Result { + // XXX: structured stats collection for layer eviction here. + trace!( + "get page request for {}@{} from task kind {:?}", + key, + lsn, + ctx.task_kind() + ); let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME .for_get_kind(GetKind::Singular) @@ -888,7 +973,9 @@ impl Timeline { self.get_vectored_sequential_impl(keyspace, lsn, ctx).await } GetVectoredImpl::Vectored => { - let vectored_res = self.get_vectored_impl(keyspace.clone(), lsn, ctx).await; + let vectored_res = self + .get_vectored_impl(keyspace.clone(), lsn, ValuesReconstructState::new(), ctx) + .await; if self.conf.validate_vectored_get { self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx) @@ -934,7 +1021,9 @@ impl Timeline { for range in keyspace.ranges { let mut key = range.start; while key != range.end { - let block = self.get_impl(key, lsn, ctx).await; + let block = self + .get_impl(key, lsn, ValueReconstructState::default(), ctx) + .await; use PageReconstructError::*; match block { @@ -952,6 +1041,23 @@ impl Timeline { // level error. return Err(GetVectoredError::MissingKey(key)); } + Err(Other(err)) + if err + .to_string() + .contains("downloading evicted layer file failed") => + { + return Err(GetVectoredError::Other(err)) + } + Err(Other(err)) + if err + .chain() + .any(|cause| cause.to_string().contains("layer loading failed")) => + { + // The intent here is to achieve error parity with the vectored read path. + // When vectored read fails to load a layer it fails the whole read, hence + // we mimic this behaviour here to keep the validation happy. 
+ return Err(GetVectoredError::Other(err)); + } _ => { values.insert(key, block); key = key.next(); @@ -967,10 +1073,9 @@ impl Timeline { &self, keyspace: KeySpace, lsn: Lsn, + mut reconstruct_state: ValuesReconstructState, ctx: &RequestContext, ) -> Result>, GetVectoredError> { - let mut reconstruct_state = ValuesReconstructState::new(); - let get_kind = if keyspace.total_size() == 1 { GetKind::Singular } else { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index c2c661088b..fcd33bb66a 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -507,6 +507,11 @@ class NeonEnvBuilder: self.pageserver_get_vectored_impl = "vectored" log.debug('Overriding pageserver get_vectored_impl config to "vectored"') + self.pageserver_get_impl: Optional[str] = None + if os.getenv("PAGESERVER_GET_IMPL", "") == "vectored": + self.pageserver_get_impl = "vectored" + log.debug('Overriding pageserver get_impl config to "vectored"') + assert test_name.startswith( "test_" ), "Unexpectedly instantiated from outside a test function" @@ -1078,6 +1083,8 @@ class NeonEnv: ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine if config.pageserver_get_vectored_impl is not None: ps_cfg["get_vectored_impl"] = config.pageserver_get_vectored_impl + if config.pageserver_get_impl is not None: + ps_cfg["get_impl"] = config.pageserver_get_impl # Create a corresponding NeonPageserver object self.pageservers.append( diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 804ad135ce..1279c1bf81 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -17,11 +17,16 @@ from fixtures.types import TenantId, TimelineId # Test restarting page server, while safekeeper and compute node keep # running. def test_local_corruption(neon_env_builder: NeonEnvBuilder): + if neon_env_builder.pageserver_get_impl == "vectored": + reconstruct_function_name = "get_values_reconstruct_data" + else: + reconstruct_function_name = "get_value_reconstruct_data" + env = neon_env_builder.init_start() env.pageserver.allowed_errors.extend( [ - ".*get_value_reconstruct_data for layer .*", + f".*{reconstruct_function_name} for layer .*", ".*could not find data for key.*", ".*is not active. Current state: Broken.*", ".*will not become active. Current state: Broken.*", @@ -84,7 +89,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): # (We don't check layer file contents on startup, when loading the timeline) # # This will change when we implement checksums for layers - with pytest.raises(Exception, match="get_value_reconstruct_data for layer ") as err: + with pytest.raises(Exception, match=f"{reconstruct_function_name} for layer ") as err: pg2.start() log.info( f"As expected, compute startup failed for timeline {tenant2}/{timeline2} with corrupt layers: {err}" diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index ddad98a5fa..2a371eae72 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -226,6 +226,11 @@ def test_forward_compatibility( ) try: + # Previous version neon_local and pageserver are not aware + # of the new config. 
+ # TODO: remove this once the code reaches main + neon_env_builder.pageserver_get_impl = None + neon_env_builder.num_safekeepers = 3 neon_local_binpath = neon_env_builder.neon_binpath env = neon_env_builder.from_repo_dir( diff --git a/test_runner/regress/test_local_file_cache.py b/test_runner/regress/test_local_file_cache.py index 38f2034c18..76c6581448 100644 --- a/test_runner/regress/test_local_file_cache.py +++ b/test_runner/regress/test_local_file_cache.py @@ -4,16 +4,21 @@ import threading import time from typing import List -from fixtures.neon_fixtures import NeonEnv +from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnvBuilder from fixtures.utils import query_scalar -def test_local_file_cache_unlink(neon_simple_env: NeonEnv): - env = neon_simple_env +def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder, build_type: str): + if build_type == "debug": + # Disable vectored read path cross validation since it makes the test time out. + neon_env_builder.pageserver_config_override = "validate_vectored_get=false" + + env = neon_env_builder.init_start() cache_dir = os.path.join(env.repo_dir, "file_cache") os.mkdir(cache_dir) + env.neon_cli.create_branch("empty", ancestor_branch_name=DEFAULT_BRANCH_NAME) env.neon_cli.create_branch("test_local_file_cache_unlink", "empty") endpoint = env.endpoints.create_start( diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index 5813231aab..37676ab0d4 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -1,3 +1,4 @@ +import re import time from datetime import datetime, timedelta, timezone @@ -109,6 +110,11 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): # Test pageserver get_timestamp_of_lsn API def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder): + if neon_env_builder.pageserver_get_impl == "vectored": + key_not_found_error = r".*Requested key.*not found,*" + else: + key_not_found_error = r".*could not find data for key.*" + env = neon_env_builder.init_start() new_timeline_id = env.neon_cli.create_branch("test_ts_of_lsn_api") @@ -177,8 +183,8 @@ def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder): raise RuntimeError("there should have been an 'could not find data for key' error") except PageserverApiException as error: assert error.status_code == 500 - assert str(error).startswith("could not find data for key") - env.pageserver.allowed_errors.append(".*could not find data for key.*") + assert re.match(key_not_found_error, str(error)) + env.pageserver.allowed_errors.append(key_not_found_error) # Probe a bunch of timestamps in the valid range step_size = 100 diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index e4219ec7a6..2b1b7fff34 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -18,6 +18,7 @@ from fixtures.remote_storage import s3_storage def test_pg_regress( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, + build_type: str, pg_bin, capsys, base_dir: Path, @@ -30,6 +31,11 @@ def test_pg_regress( """ if shard_count is not None: neon_env_builder.num_pageservers = shard_count + + if build_type == "debug": + # Disable vectored read path cross validation since it makes the test time out. 
+ neon_env_builder.pageserver_config_override = "validate_vectored_get=false" + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) neon_env_builder.enable_scrub_on_exit() env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) From 5357f401831a42c7f11adc141ce78d7e795e3bc9 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Thu, 25 Apr 2024 17:26:18 +0200 Subject: [PATCH 0619/1571] proxy: Workaround switch to the regional redis (#7513) ## Problem Start switching from the global redis to the regional one ## Summary of changes * Publish cancellations to the regional redis * Listen notifications from both: global and regional --- proxy/src/bin/proxy.rs | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index a1b4c21947..39f6bc8b6d 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -339,7 +339,7 @@ async fn main() -> anyhow::Result<()> { let cancel_map = CancelMap::default(); - let redis_publisher = match &redis_notifications_client { + let redis_publisher = match ®ional_redis_client { Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new( redis_publisher.clone(), args.region.clone(), @@ -409,15 +409,28 @@ async fn main() -> anyhow::Result<()> { if let auth::BackendType::Console(api, _) = &config.auth_backend { if let proxy::console::provider::ConsoleBackend::Console(api) = &**api { - if let Some(redis_notifications_client) = redis_notifications_client { - let cache = api.caches.project_info.clone(); - maintenance_tasks.spawn(notifications::task_main( - redis_notifications_client, - cache.clone(), - cancel_map.clone(), - args.region.clone(), - )); - maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); + match (redis_notifications_client, regional_redis_client.clone()) { + (None, None) => {} + (client1, client2) => { + let cache = api.caches.project_info.clone(); + if let Some(client) = client1 { + maintenance_tasks.spawn(notifications::task_main( + client, + cache.clone(), + cancel_map.clone(), + args.region.clone(), + )); + } + if let Some(client) = client2 { + maintenance_tasks.spawn(notifications::task_main( + client, + cache.clone(), + cancel_map.clone(), + args.region.clone(), + )); + } + maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); + } } if let Some(regional_redis_client) = regional_redis_client { let cache = api.caches.endpoints_cache.clone(); From c59abedd85b81d832225a2490ba066e0c6993fc9 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Thu, 25 Apr 2024 12:39:27 -0400 Subject: [PATCH 0620/1571] chore(pageserver): temporary metrics on ingestion time (#7515) As a follow-up on https://github.com/neondatabase/neon/pull/7467, also measure the ingestion operation speed. 
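For reference, the measurement follows the usual Prometheus histogram-timer pattern. Below is a minimal sketch written against the upstream `prometheus` and `once_cell` crates (the pageserver goes through its own `metrics` wrapper, and the real metric also passes an explicit bucket configuration); only the metric name and help text are taken from this change, the rest is illustrative:

    use once_cell::sync::Lazy;
    use prometheus::{register_histogram, Histogram};

    // Histogram registered once, mirroring the `time_spent_on_ingest`
    // field added to WalIngestMetrics in this patch.
    static WAL_INGEST_SECONDS: Lazy<Histogram> = Lazy::new(|| {
        register_histogram!(
            "pageserver_wal_ingest_put_value_seconds",
            "Actual time spent on ingesting a record"
        )
        .expect("failed to define a metric")
    });

    fn commit_with_timing(commit: impl FnOnce()) {
        // start_timer() returns a guard measuring wall-clock time;
        // observe_duration() records the elapsed time into the histogram.
        let timer = WAL_INGEST_SECONDS.start_timer();
        commit();
        timer.observe_duration();
    }

The patch itself wraps DatadirModification::commit() with exactly this start_timer()/observe_duration() pair.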
Signed-off-by: Alex Chi Z --- pageserver/src/metrics.rs | 66 ++++++++++++++++------------- pageserver/src/pgdatadir_mapping.rs | 5 +++ 2 files changed, 42 insertions(+), 29 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 6ce7f286b3..e4b314f805 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1519,35 +1519,6 @@ pub(crate) static DELETION_QUEUE: Lazy = Lazy::new(|| { } }); -pub(crate) struct WalIngestMetrics { - pub(crate) bytes_received: IntCounter, - pub(crate) records_received: IntCounter, - pub(crate) records_committed: IntCounter, - pub(crate) records_filtered: IntCounter, -} - -pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| WalIngestMetrics { - bytes_received: register_int_counter!( - "pageserver_wal_ingest_bytes_received", - "Bytes of WAL ingested from safekeepers", - ) - .unwrap(), - records_received: register_int_counter!( - "pageserver_wal_ingest_records_received", - "Number of WAL records received from safekeepers" - ) - .expect("failed to define a metric"), - records_committed: register_int_counter!( - "pageserver_wal_ingest_records_committed", - "Number of WAL records which resulted in writes to pageserver storage" - ) - .expect("failed to define a metric"), - records_filtered: register_int_counter!( - "pageserver_wal_ingest_records_filtered", - "Number of WAL records filtered out due to sharding" - ) - .expect("failed to define a metric"), -}); pub(crate) struct SecondaryModeMetrics { pub(crate) upload_heatmap: IntCounter, pub(crate) upload_heatmap_errors: IntCounter, @@ -1749,6 +1720,43 @@ macro_rules! redo_bytes_histogram_count_buckets { }; } +pub(crate) struct WalIngestMetrics { + pub(crate) bytes_received: IntCounter, + pub(crate) records_received: IntCounter, + pub(crate) records_committed: IntCounter, + pub(crate) records_filtered: IntCounter, + pub(crate) time_spent_on_ingest: Histogram, +} + +pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| WalIngestMetrics { + bytes_received: register_int_counter!( + "pageserver_wal_ingest_bytes_received", + "Bytes of WAL ingested from safekeepers", + ) + .unwrap(), + records_received: register_int_counter!( + "pageserver_wal_ingest_records_received", + "Number of WAL records received from safekeepers" + ) + .expect("failed to define a metric"), + records_committed: register_int_counter!( + "pageserver_wal_ingest_records_committed", + "Number of WAL records which resulted in writes to pageserver storage" + ) + .expect("failed to define a metric"), + records_filtered: register_int_counter!( + "pageserver_wal_ingest_records_filtered", + "Number of WAL records filtered out due to sharding" + ) + .expect("failed to define a metric"), + time_spent_on_ingest: register_histogram!( + "pageserver_wal_ingest_put_value_seconds", + "Actual time spent on ingesting a record", + redo_histogram_time_buckets!(), + ) + .expect("failed to define a metric"), +}); + pub(crate) static WAL_REDO_TIME: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wal_redo_seconds", diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 2c98c0b6c8..ed1d737583 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -9,6 +9,7 @@ use super::tenant::{PageReconstructError, Timeline}; use crate::context::RequestContext; use crate::keyspace::{KeySpace, KeySpaceAccum}; +use crate::metrics::WAL_INGEST; use crate::repository::*; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; use 
crate::walrecord::NeonWalRecord; @@ -1551,6 +1552,8 @@ impl<'a> DatadirModification<'a> { pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> { let mut writer = self.tline.writer().await; + let timer = WAL_INGEST.time_spent_on_ingest.start_timer(); + let pending_nblocks = self.pending_nblocks; self.pending_nblocks = 0; @@ -1590,6 +1593,8 @@ impl<'a> DatadirModification<'a> { writer.update_directory_entries_count(kind, count as u64); } + timer.observe_duration(); + Ok(()) } From 04a682021f34a39a2e1ba36ec8e9e7cf1d911a9c Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 25 Apr 2024 19:45:36 +0300 Subject: [PATCH 0621/1571] Remove the now-unused 'latest' arguments (#7377) The 'latest' argument was passed to the functions in pgdatadir_mapping.rs to know when they can update the relsize cache. Commit e69ff3fc00 changed how the relsize cache is updated, making the 'latest' argument unused. --- pageserver/src/basebackup.rs | 4 +- pageserver/src/page_service.rs | 14 +--- pageserver/src/pgdatadir_mapping.rs | 14 ++-- pageserver/src/walingest.rs | 110 +++++++++++++--------------- 4 files changed, 63 insertions(+), 79 deletions(-) diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 107758f385..ba047745f1 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -376,7 +376,7 @@ where async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> { let nblocks = self .timeline - .get_rel_size(src, Version::Lsn(self.lsn), false, self.ctx) + .get_rel_size(src, Version::Lsn(self.lsn), self.ctx) .await?; // If the relation is empty, create an empty file @@ -397,7 +397,7 @@ where for blknum in startblk..endblk { let img = self .timeline - .get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), false, self.ctx) + .get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), self.ctx) .await?; segment_data.extend_from_slice(&img[..]); } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index fa6b81ac72..69475c2dc7 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -936,7 +936,7 @@ impl PageServerHandler { .await?; let exists = timeline - .get_rel_exists(req.rel, Version::Lsn(lsn), req.latest, ctx) + .get_rel_exists(req.rel, Version::Lsn(lsn), ctx) .await?; Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse { @@ -964,7 +964,7 @@ impl PageServerHandler { .await?; let n_blocks = timeline - .get_rel_size(req.rel, Version::Lsn(lsn), req.latest, ctx) + .get_rel_size(req.rel, Version::Lsn(lsn), ctx) .await?; Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse { @@ -992,13 +992,7 @@ impl PageServerHandler { .await?; let total_blocks = timeline - .get_db_size( - DEFAULTTABLESPACE_OID, - req.dbnode, - Version::Lsn(lsn), - req.latest, - ctx, - ) + .get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, Version::Lsn(lsn), ctx) .await?; let db_size = total_blocks as i64 * BLCKSZ as i64; @@ -1170,7 +1164,7 @@ impl PageServerHandler { .await?; let page = timeline - .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), req.latest, ctx) + .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), ctx) .await?; Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse { diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index ed1d737583..14bcc50e7e 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -176,7 +176,6 @@ impl Timeline { tag: RelTag, blknum: BlockNumber, version: Version<'_>, - 
latest: bool, ctx: &RequestContext, ) -> Result { if tag.relnode == 0 { @@ -185,7 +184,7 @@ impl Timeline { )); } - let nblocks = self.get_rel_size(tag, version, latest, ctx).await?; + let nblocks = self.get_rel_size(tag, version, ctx).await?; if blknum >= nblocks { debug!( "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page", @@ -207,7 +206,6 @@ impl Timeline { spcnode: Oid, dbnode: Oid, version: Version<'_>, - latest: bool, ctx: &RequestContext, ) -> Result { let mut total_blocks = 0; @@ -215,7 +213,7 @@ impl Timeline { let rels = self.list_rels(spcnode, dbnode, version, ctx).await?; for rel in rels { - let n_blocks = self.get_rel_size(rel, version, latest, ctx).await?; + let n_blocks = self.get_rel_size(rel, version, ctx).await?; total_blocks += n_blocks as usize; } Ok(total_blocks) @@ -226,7 +224,6 @@ impl Timeline { &self, tag: RelTag, version: Version<'_>, - latest: bool, ctx: &RequestContext, ) -> Result { if tag.relnode == 0 { @@ -240,7 +237,7 @@ impl Timeline { } if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM) - && !self.get_rel_exists(tag, version, latest, ctx).await? + && !self.get_rel_exists(tag, version, ctx).await? { // FIXME: Postgres sometimes calls smgrcreate() to create // FSM, and smgrnblocks() on it immediately afterwards, @@ -263,7 +260,6 @@ impl Timeline { &self, tag: RelTag, version: Version<'_>, - _latest: bool, ctx: &RequestContext, ) -> Result { if tag.relnode == 0 { @@ -1095,7 +1091,7 @@ impl<'a> DatadirModification<'a> { ) -> anyhow::Result<()> { let total_blocks = self .tline - .get_db_size(spcnode, dbnode, Version::Modified(self), true, ctx) + .get_db_size(spcnode, dbnode, Version::Modified(self), ctx) .await?; // Remove entry from dbdir @@ -1194,7 +1190,7 @@ impl<'a> DatadirModification<'a> { anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode); if self .tline - .get_rel_exists(rel, Version::Modified(self), true, ctx) + .get_rel_exists(rel, Version::Modified(self), ctx) .await? { let size_key = rel_size_to_key(rel); diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 4f83b118ae..79f075b877 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1034,7 +1034,7 @@ impl WalIngest { let nblocks = modification .tline - .get_rel_size(src_rel, Version::Modified(modification), true, ctx) + .get_rel_size(src_rel, Version::Modified(modification), ctx) .await?; let dst_rel = RelTag { spcnode: tablespace_id, @@ -1068,13 +1068,7 @@ impl WalIngest { let content = modification .tline - .get_rel_page_at_lsn( - src_rel, - blknum, - Version::Modified(modification), - true, - ctx, - ) + .get_rel_page_at_lsn(src_rel, blknum, Version::Modified(modification), ctx) .await?; modification.put_rel_page_image(dst_rel, blknum, content)?; num_blocks_copied += 1; @@ -1242,7 +1236,7 @@ impl WalIngest { }; if modification .tline - .get_rel_exists(rel, Version::Modified(modification), true, ctx) + .get_rel_exists(rel, Version::Modified(modification), ctx) .await? { self.put_rel_drop(modification, rel, ctx).await?; @@ -1541,7 +1535,7 @@ impl WalIngest { nblocks } else if !modification .tline - .get_rel_exists(rel, Version::Modified(modification), true, ctx) + .get_rel_exists(rel, Version::Modified(modification), ctx) .await? { // create it with 0 size initially, the logic below will extend it @@ -1553,7 +1547,7 @@ impl WalIngest { } else { modification .tline - .get_rel_size(rel, Version::Modified(modification), true, ctx) + .get_rel_size(rel, Version::Modified(modification), ctx) .await? 
}; @@ -1650,14 +1644,14 @@ async fn get_relsize( ) -> anyhow::Result { let nblocks = if !modification .tline - .get_rel_exists(rel, Version::Modified(modification), true, ctx) + .get_rel_exists(rel, Version::Modified(modification), ctx) .await? { 0 } else { modification .tline - .get_rel_size(rel, Version::Modified(modification), true, ctx) + .get_rel_size(rel, Version::Modified(modification), ctx) .await? }; Ok(nblocks) @@ -1732,29 +1726,29 @@ mod tests { // The relation was created at LSN 2, not visible at LSN 1 yet. assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) .await?, false ); assert!(tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) .await .is_err()); assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) .await?, 1 ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx) .await?, 3 ); @@ -1762,46 +1756,46 @@ mod tests { // Check page contents at each LSN assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), &ctx) .await?, test_img("foo blk 0 at 2") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), &ctx) .await?, test_img("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), &ctx) .await?, test_img("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), &ctx) .await?, test_img("foo blk 1 at 4") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), &ctx) .await?, test_img("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x50)), &ctx) .await?, test_img("foo blk 1 at 4") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), &ctx) .await?, test_img("foo blk 2 at 5") ); @@ -1817,19 +1811,19 @@ mod tests { // Check reported size and contents after truncation assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), &ctx) .await?, 2 ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), &ctx) .await?, test_img("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), &ctx) .await?, test_img("foo blk 1 at 4") ); @@ -1837,13 +1831,13 @@ mod tests { // should still see the truncated block with older LSN assert_eq!( tline - 
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx) .await?, 3 ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), &ctx) .await?, test_img("foo blk 2 at 5") ); @@ -1856,7 +1850,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x68)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x68)), &ctx) .await?, 0 ); @@ -1869,19 +1863,19 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x70)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x70)), &ctx) .await?, 2 ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x70)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x70)), &ctx) .await?, ZERO_PAGE ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), &ctx) .await?, test_img("foo blk 1") ); @@ -1894,21 +1888,21 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx) .await?, 1501 ); for blk in 2..1500 { assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blk, Version::Lsn(Lsn(0x80)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blk, Version::Lsn(Lsn(0x80)), &ctx) .await?, ZERO_PAGE ); } assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), &ctx) .await?, test_img("foo blk 1500") ); @@ -1935,13 +1929,13 @@ mod tests { // Check that rel exists and size is correct assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) .await?, 1 ); @@ -1954,7 +1948,7 @@ mod tests { // Check that rel is not visible anymore assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x30)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x30)), &ctx) .await?, false ); @@ -1972,13 +1966,13 @@ mod tests { // Check that rel exists and size is correct assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x40)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x40)), &ctx) .await?, 1 ); @@ -2011,24 +2005,24 @@ mod tests { // The relation was created at LSN 20, not visible at LSN 1 yet. 
assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) .await?, false ); assert!(tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) .await .is_err()); assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) .await?, relsize ); @@ -2039,7 +2033,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), &ctx) .await?, test_img(&data) ); @@ -2056,7 +2050,7 @@ mod tests { // Check reported size and contents after truncation assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), &ctx) .await?, 1 ); @@ -2066,7 +2060,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), &ctx) .await?, test_img(&data) ); @@ -2075,7 +2069,7 @@ mod tests { // should still see all blocks with older LSN assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx) .await?, relsize ); @@ -2084,7 +2078,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), &ctx) .await?, test_img(&data) ); @@ -2104,13 +2098,13 @@ mod tests { assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx) .await?, relsize ); @@ -2120,7 +2114,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), &ctx) .await?, test_img(&data) ); @@ -2154,7 +2148,7 @@ mod tests { assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx) .await?, RELSEG_SIZE + 1 ); @@ -2168,7 +2162,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx) .await?, RELSEG_SIZE ); @@ -2183,7 +2177,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx) .await?, RELSEG_SIZE - 1 ); @@ -2201,7 +2195,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx) .await?, size as BlockNumber ); From 4917f52c8850ac77cc8a42f9916435f5da18f2f4 Mon Sep 17 00:00:00 2001 From: 
Heikki Linnakangas Date: Thu, 25 Apr 2024 19:45:42 +0300 Subject: [PATCH 0622/1571] Server support for new pagestream protocol version (#7377) In the old protocol version, the client sent with each request: - latest: bool. If true, the client requested the latest page version, and the 'lsn' was just a hint of when the page was last modified - lsn: Lsn, the page version to return This protocol didn't allow requesting a page at a particular non-latest LSN and *also* sending a hint on when the page was last modified. That put a read only compute into an awkward position where it had to either request each page at the replay-LSN, which could be very close to the last LSN written in the primary and therefore require the pageserver to wait for it to arrive, or an older LSN which could already be garbage collected in the pageserver, resulting in an error. The new protocol version fixes that by allowing a read only compute to send both LSNs. To use the new protocol version, use "pagestream_v2" command instead of just "pagestream". The old protocol version is still supported, for compatibility with old computes (and in fact there is no client support yet, it is added by the next commit). --- libs/pageserver_api/src/models.rs | 143 +++++++---- pageserver/client/src/page_service.rs | 2 +- .../pagebench/src/cmd/getpage_latest_lsn.rs | 8 +- pageserver/src/page_service.rs | 233 +++++++++++------- trace/src/main.rs | 8 +- 5 files changed, 254 insertions(+), 140 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index e334a68a1e..4ce1ecde26 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -848,39 +848,72 @@ impl TryFrom for PagestreamBeMessageTag { } } +// In the V2 protocol version, a GetPage request contains two LSN values: +// +// request_lsn: Get the page version at this point in time. Lsn::Max is a special value that means +// "get the latest version present". It's used by the primary server, which knows that no one else +// is writing WAL. 'not_modified_since' must be set to a proper value even if request_lsn is +// Lsn::Max. Standby servers use the current replay LSN as the request LSN. +// +// not_modified_since: Hint to the pageserver that the client knows that the page has not been +// modified between 'not_modified_since' and the request LSN. It's always correct to set +// 'not_modified_since equal' to 'request_lsn' (unless Lsn::Max is used as the 'request_lsn'), but +// passing an earlier LSN can speed up the request, by allowing the pageserver to process the +// request without waiting for 'request_lsn' to arrive. +// +// The legacy V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was +// sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since' value, and +// 'latest' was set to true. The V2 interface was added because there was no correct way for a +// standby to request a page at a particular non-latest LSN, and also include the +// 'not_modified_since' hint. That led to an awkward choice of either using an old LSN in the +// request, if the standby knows that the page hasn't been modified since, and risk getting an error +// if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which could +// require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The new V2 +// interface allows sending both LSNs, and let the pageserver do the right thing. 
There is no +// difference in the responses between V1 and V2. +// +// The Request structs below reflect the V2 interface. If V1 is used, the parse function +// maps the old format requests to the new format. +// +#[derive(Clone, Copy)] +pub enum PagestreamProtocolVersion { + V1, + V2, +} + #[derive(Debug, PartialEq, Eq)] pub struct PagestreamExistsRequest { - pub latest: bool, - pub lsn: Lsn, + pub request_lsn: Lsn, + pub not_modified_since: Lsn, pub rel: RelTag, } #[derive(Debug, PartialEq, Eq)] pub struct PagestreamNblocksRequest { - pub latest: bool, - pub lsn: Lsn, + pub request_lsn: Lsn, + pub not_modified_since: Lsn, pub rel: RelTag, } #[derive(Debug, PartialEq, Eq)] pub struct PagestreamGetPageRequest { - pub latest: bool, - pub lsn: Lsn, + pub request_lsn: Lsn, + pub not_modified_since: Lsn, pub rel: RelTag, pub blkno: u32, } #[derive(Debug, PartialEq, Eq)] pub struct PagestreamDbSizeRequest { - pub latest: bool, - pub lsn: Lsn, + pub request_lsn: Lsn, + pub not_modified_since: Lsn, pub dbnode: u32, } #[derive(Debug, PartialEq, Eq)] pub struct PagestreamGetSlruSegmentRequest { - pub latest: bool, - pub lsn: Lsn, + pub request_lsn: Lsn, + pub not_modified_since: Lsn, pub kind: u8, pub segno: u32, } @@ -927,14 +960,16 @@ pub struct TenantHistorySize { } impl PagestreamFeMessage { + /// Serialize a compute -> pageserver message. This is currently only used in testing + /// tools. Always uses protocol version 2. pub fn serialize(&self) -> Bytes { let mut bytes = BytesMut::new(); match self { Self::Exists(req) => { bytes.put_u8(0); - bytes.put_u8(u8::from(req.latest)); - bytes.put_u64(req.lsn.0); + bytes.put_u64(req.request_lsn.0); + bytes.put_u64(req.not_modified_since.0); bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.dbnode); bytes.put_u32(req.rel.relnode); @@ -943,8 +978,8 @@ impl PagestreamFeMessage { Self::Nblocks(req) => { bytes.put_u8(1); - bytes.put_u8(u8::from(req.latest)); - bytes.put_u64(req.lsn.0); + bytes.put_u64(req.request_lsn.0); + bytes.put_u64(req.not_modified_since.0); bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.dbnode); bytes.put_u32(req.rel.relnode); @@ -953,8 +988,8 @@ impl PagestreamFeMessage { Self::GetPage(req) => { bytes.put_u8(2); - bytes.put_u8(u8::from(req.latest)); - bytes.put_u64(req.lsn.0); + bytes.put_u64(req.request_lsn.0); + bytes.put_u64(req.not_modified_since.0); bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.dbnode); bytes.put_u32(req.rel.relnode); @@ -964,15 +999,15 @@ impl PagestreamFeMessage { Self::DbSize(req) => { bytes.put_u8(3); - bytes.put_u8(u8::from(req.latest)); - bytes.put_u64(req.lsn.0); + bytes.put_u64(req.request_lsn.0); + bytes.put_u64(req.not_modified_since.0); bytes.put_u32(req.dbnode); } Self::GetSlruSegment(req) => { bytes.put_u8(4); - bytes.put_u8(u8::from(req.latest)); - bytes.put_u64(req.lsn.0); + bytes.put_u64(req.request_lsn.0); + bytes.put_u64(req.not_modified_since.0); bytes.put_u8(req.kind); bytes.put_u32(req.segno); } @@ -981,18 +1016,40 @@ impl PagestreamFeMessage { bytes.into() } - pub fn parse(body: &mut R) -> anyhow::Result { - // TODO these gets can fail - + pub fn parse( + body: &mut R, + protocol_version: PagestreamProtocolVersion, + ) -> anyhow::Result { // these correspond to the NeonMessageTag enum in pagestore_client.h // // TODO: consider using protobuf or serde bincode for less error prone // serialization. 
let msg_tag = body.read_u8()?; + + let (request_lsn, not_modified_since) = match protocol_version { + PagestreamProtocolVersion::V2 => ( + Lsn::from(body.read_u64::()?), + Lsn::from(body.read_u64::()?), + ), + PagestreamProtocolVersion::V1 => { + // In the old protocol, each message starts with a boolean 'latest' flag, + // followed by 'lsn'. Convert that to the two LSNs, 'request_lsn' and + // 'not_modified_since', used in the new protocol version. + let latest = body.read_u8()? != 0; + let request_lsn = Lsn::from(body.read_u64::()?); + if latest { + (Lsn::MAX, request_lsn) // get latest version + } else { + (request_lsn, request_lsn) // get version at specified LSN + } + } + }; + + // The rest of the messages are the same between V1 and V2 match msg_tag { 0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest { - latest: body.read_u8()? != 0, - lsn: Lsn::from(body.read_u64::()?), + request_lsn, + not_modified_since, rel: RelTag { spcnode: body.read_u32::()?, dbnode: body.read_u32::()?, @@ -1001,8 +1058,8 @@ impl PagestreamFeMessage { }, })), 1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { - latest: body.read_u8()? != 0, - lsn: Lsn::from(body.read_u64::()?), + request_lsn, + not_modified_since, rel: RelTag { spcnode: body.read_u32::()?, dbnode: body.read_u32::()?, @@ -1011,8 +1068,8 @@ impl PagestreamFeMessage { }, })), 2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest { - latest: body.read_u8()? != 0, - lsn: Lsn::from(body.read_u64::()?), + request_lsn, + not_modified_since, rel: RelTag { spcnode: body.read_u32::()?, dbnode: body.read_u32::()?, @@ -1022,14 +1079,14 @@ impl PagestreamFeMessage { blkno: body.read_u32::()?, })), 3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { - latest: body.read_u8()? != 0, - lsn: Lsn::from(body.read_u64::()?), + request_lsn, + not_modified_since, dbnode: body.read_u32::()?, })), 4 => Ok(PagestreamFeMessage::GetSlruSegment( PagestreamGetSlruSegmentRequest { - latest: body.read_u8()? 
!= 0, - lsn: Lsn::from(body.read_u64::()?), + request_lsn, + not_modified_since, kind: body.read_u8()?, segno: body.read_u32::()?, }, @@ -1157,8 +1214,8 @@ mod tests { // Test serialization/deserialization of PagestreamFeMessage let messages = vec![ PagestreamFeMessage::Exists(PagestreamExistsRequest { - latest: true, - lsn: Lsn(4), + request_lsn: Lsn(4), + not_modified_since: Lsn(3), rel: RelTag { forknum: 1, spcnode: 2, @@ -1167,8 +1224,8 @@ mod tests { }, }), PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { - latest: false, - lsn: Lsn(4), + request_lsn: Lsn(4), + not_modified_since: Lsn(4), rel: RelTag { forknum: 1, spcnode: 2, @@ -1177,8 +1234,8 @@ mod tests { }, }), PagestreamFeMessage::GetPage(PagestreamGetPageRequest { - latest: true, - lsn: Lsn(4), + request_lsn: Lsn(4), + not_modified_since: Lsn(3), rel: RelTag { forknum: 1, spcnode: 2, @@ -1188,14 +1245,16 @@ mod tests { blkno: 7, }), PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { - latest: true, - lsn: Lsn(4), + request_lsn: Lsn(4), + not_modified_since: Lsn(3), dbnode: 7, }), ]; for msg in messages { let bytes = msg.serialize(); - let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap(); + let reconstructed = + PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V2) + .unwrap(); assert!(msg == reconstructed); } } diff --git a/pageserver/client/src/page_service.rs b/pageserver/client/src/page_service.rs index 49175b3b90..f9507fc47a 100644 --- a/pageserver/client/src/page_service.rs +++ b/pageserver/client/src/page_service.rs @@ -60,7 +60,7 @@ impl Client { ) -> anyhow::Result { let copy_both: tokio_postgres::CopyBothDuplex = self .client - .copy_both_simple(&format!("pagestream {tenant_id} {timeline_id}")) + .copy_both_simple(&format!("pagestream_v2 {tenant_id} {timeline_id}")) .await?; let Client { cancel_on_client_drop, diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index c3d8e61a2c..5043a207fc 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -312,8 +312,12 @@ async fn main_impl( let (rel_tag, block_no) = key_to_rel_block(key).expect("we filter non-rel-block keys out above"); PagestreamGetPageRequest { - latest: rng.gen_bool(args.req_latest_probability), - lsn: r.timeline_lsn, + request_lsn: if rng.gen_bool(args.req_latest_probability) { + Lsn::MAX + } else { + r.timeline_lsn + }, + not_modified_since: r.timeline_lsn, rel: rel_tag, blkno: block_no, } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 69475c2dc7..96d2397c94 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1,13 +1,5 @@ -// //! The Page Service listens for client connections and serves their GetPage@LSN //! requests. -// -// It is possible to connect here using usual psql/pgbench/libpq. Following -// commands are supported now: -// *status* -- show actual info about this pageserver, -// *pagestream* -- enter mode where smgr and pageserver talk with their -// custom protocol. 
-// use anyhow::Context; use async_compression::tokio::write::GzipEncoder; @@ -23,7 +15,7 @@ use pageserver_api::models::{ PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse, PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse, PagestreamGetSlruSegmentRequest, PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest, - PagestreamNblocksResponse, + PagestreamNblocksResponse, PagestreamProtocolVersion, }; use pageserver_api::shard::ShardIndex; use pageserver_api::shard::ShardNumber; @@ -551,6 +543,7 @@ impl PageServerHandler { pgb: &mut PostgresBackend, tenant_id: TenantId, timeline_id: TimelineId, + protocol_version: PagestreamProtocolVersion, ctx: RequestContext, ) -> Result<(), QueryError> where @@ -613,14 +606,15 @@ impl PageServerHandler { t.trace(©_data_bytes) } - let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?; + let neon_fe_msg = + PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?; // TODO: We could create a new per-request context here, with unique ID. // Currently we use the same per-timeline context for all requests let (response, span) = match neon_fe_msg { PagestreamFeMessage::Exists(req) => { - let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.lsn); + let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn); ( self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx) .instrument(span.clone()) @@ -629,7 +623,7 @@ impl PageServerHandler { ) } PagestreamFeMessage::Nblocks(req) => { - let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.lsn); + let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn); ( self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx) .instrument(span.clone()) @@ -639,7 +633,7 @@ impl PageServerHandler { } PagestreamFeMessage::GetPage(req) => { // shard_id is filled in by the handler - let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn); + let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.request_lsn); ( self.handle_get_page_at_lsn_request(tenant_id, timeline_id, &req, &ctx) .instrument(span.clone()) @@ -648,7 +642,7 @@ impl PageServerHandler { ) } PagestreamFeMessage::DbSize(req) => { - let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.lsn); + let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn); ( self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx) .instrument(span.clone()) @@ -657,7 +651,7 @@ impl PageServerHandler { ) } PagestreamFeMessage::GetSlruSegment(req) => { - let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.lsn); + let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn); ( self.handle_get_slru_segment_request(tenant_id, timeline_id, &req, &ctx) .instrument(span.clone()) @@ -838,83 +832,80 @@ impl PageServerHandler { /// Helper function to handle the LSN from client request. /// /// Each GetPage (and Exists and Nblocks) request includes information about - /// which version of the page is being requested. 
The client can request the - /// latest version of the page, or the version that's valid at a particular - /// LSN. The primary compute node will always request the latest page - /// version, while a standby will request a version at the LSN that it's - /// currently caught up to. + /// which version of the page is being requested. The primary compute node + /// will always request the latest page version, by setting 'request_lsn' to + /// the last inserted or flushed WAL position, while a standby will request + /// a version at the LSN that it's currently caught up to. /// /// In either case, if the page server hasn't received the WAL up to the /// requested LSN yet, we will wait for it to arrive. The return value is /// the LSN that should be used to look up the page versions. + /// + /// In addition to the request LSN, each request carries another LSN, + /// 'not_modified_since', which is a hint to the pageserver that the client + /// knows that the page has not been modified between 'not_modified_since' + /// and the request LSN. This allows skipping the wait, as long as the WAL + /// up to 'not_modified_since' has arrived. If the client doesn't have any + /// information about when the page was modified, it will use + /// not_modified_since == lsn. If the client lies and sends a too low + /// not_modified_hint such that there are in fact later page versions, the + /// behavior is undefined: the pageserver may return any of the page versions + /// or an error. async fn wait_or_get_last_lsn( timeline: &Timeline, - mut lsn: Lsn, - latest: bool, + request_lsn: Lsn, + not_modified_since: Lsn, latest_gc_cutoff_lsn: &RcuReadGuard, ctx: &RequestContext, ) -> Result { - if latest { - // Latest page version was requested. If LSN is given, it is a hint - // to the page server that there have been no modifications to the - // page after that LSN. If we haven't received WAL up to that point, - // wait until it arrives. - let last_record_lsn = timeline.get_last_record_lsn(); + let last_record_lsn = timeline.get_last_record_lsn(); - // Note: this covers the special case that lsn == Lsn(0). That - // special case means "return the latest version whatever it is", - // and it's used for bootstrapping purposes, when the page server is - // connected directly to the compute node. That is needed because - // when you connect to the compute node, to receive the WAL, the - // walsender process will do a look up in the pg_authid catalog - // table for authentication. That poses a deadlock problem: the - // catalog table lookup will send a GetPage request, but the GetPage - // request will block in the page server because the recent WAL - // hasn't been received yet, and it cannot be received until the - // walsender completes the authentication and starts streaming the - // WAL. - if lsn <= last_record_lsn { - // It might be better to use max(lsn, latest_gc_cutoff_lsn) instead - // last_record_lsn. That would give the same result, since we know - // that there haven't been modifications since 'lsn'. Using an older - // LSN might be faster, because that could allow skipping recent - // layers when finding the page. 
- lsn = last_record_lsn; + // Sanity check the request + if request_lsn < not_modified_since { + return Err(PageStreamError::BadRequest( + format!( + "invalid request with request LSN {} and not_modified_since {}", + request_lsn, not_modified_since, + ) + .into(), + )); + } + + if request_lsn < **latest_gc_cutoff_lsn { + // Check explicitly for INVALID just to get a less scary error message if the + // request is obviously bogus + return Err(if request_lsn == Lsn::INVALID { + PageStreamError::BadRequest("invalid LSN(0) in request".into()) } else { - timeline - .wait_lsn( - lsn, - crate::tenant::timeline::WaitLsnWaiter::PageService, - ctx, - ) - .await?; - // Since we waited for 'lsn' to arrive, that is now the last - // record LSN. (Or close enough for our purposes; the - // last-record LSN can advance immediately after we return - // anyway) - } - } else { - if lsn == Lsn(0) { - return Err(PageStreamError::BadRequest( - "invalid LSN(0) in request".into(), - )); - } + PageStreamError::BadRequest(format!( + "tried to request a page version that was garbage collected. requested at {} gc cutoff {}", + request_lsn, **latest_gc_cutoff_lsn + ).into()) + }); + } + + // Wait for WAL up to 'not_modified_since' to arrive, if necessary + if not_modified_since > last_record_lsn { timeline .wait_lsn( - lsn, + not_modified_since, crate::tenant::timeline::WaitLsnWaiter::PageService, ctx, ) .await?; + // Since we waited for 'not_modified_since' to arrive, that is now the last + // record LSN. (Or close enough for our purposes; the last-record LSN can + // advance immediately after we return anyway) + Ok(not_modified_since) + } else { + // It might be better to use max(not_modified_since, latest_gc_cutoff_lsn) + // here instead. That would give the same result, since we know that there + // haven't been any modifications since 'not_modified_since'. Using an older + // LSN might be faster, because that could allow skipping recent layers when + // finding the page. However, we have historically used 'last_record_lsn', so + // stick to that for now. + Ok(std::cmp::min(last_record_lsn, request_lsn)) } - - if lsn < **latest_gc_cutoff_lsn { - return Err(PageStreamError::BadRequest(format!( - "tried to request a page version that was garbage collected. 
requested at {} gc cutoff {}", - lsn, **latest_gc_cutoff_lsn - ).into())); - } - Ok(lsn) } #[instrument(skip_all, fields(shard_id))] @@ -931,9 +922,14 @@ impl PageServerHandler { .start_timer(metrics::SmgrQueryType::GetRelExists, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = - Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) - .await?; + let lsn = Self::wait_or_get_last_lsn( + timeline, + req.request_lsn, + req.not_modified_since, + &latest_gc_cutoff_lsn, + ctx, + ) + .await?; let exists = timeline .get_rel_exists(req.rel, Version::Lsn(lsn), ctx) @@ -959,9 +955,14 @@ impl PageServerHandler { .start_timer(metrics::SmgrQueryType::GetRelSize, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = - Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) - .await?; + let lsn = Self::wait_or_get_last_lsn( + timeline, + req.request_lsn, + req.not_modified_since, + &latest_gc_cutoff_lsn, + ctx, + ) + .await?; let n_blocks = timeline .get_rel_size(req.rel, Version::Lsn(lsn), ctx) @@ -987,9 +988,14 @@ impl PageServerHandler { .start_timer(metrics::SmgrQueryType::GetDbSize, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = - Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) - .await?; + let lsn = Self::wait_or_get_last_lsn( + timeline, + req.request_lsn, + req.not_modified_since, + &latest_gc_cutoff_lsn, + ctx, + ) + .await?; let total_blocks = timeline .get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, Version::Lsn(lsn), ctx) @@ -1159,9 +1165,14 @@ impl PageServerHandler { .start_timer(metrics::SmgrQueryType::GetPageAtLsn, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = - Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) - .await?; + let lsn = Self::wait_or_get_last_lsn( + timeline, + req.request_lsn, + req.not_modified_since, + &latest_gc_cutoff_lsn, + ctx, + ) + .await?; let page = timeline .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), ctx) @@ -1187,9 +1198,14 @@ impl PageServerHandler { .start_timer(metrics::SmgrQueryType::GetSlruSegment, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = - Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) - .await?; + let lsn = Self::wait_or_get_last_lsn( + timeline, + req.request_lsn, + req.not_modified_since, + &latest_gc_cutoff_lsn, + ctx, + ) + .await?; let kind = SlruKind::from_repr(req.kind) .ok_or(PageStreamError::BadRequest("invalid SLRU kind".into()))?; @@ -1407,7 +1423,34 @@ where let ctx = self.connection_ctx.attached_child(); debug!("process query {query_string:?}"); - if query_string.starts_with("pagestream ") { + if query_string.starts_with("pagestream_v2 ") { + let (_, params_raw) = query_string.split_at("pagestream_v2 ".len()); + let params = params_raw.split(' ').collect::>(); + if params.len() != 2 { + return Err(QueryError::Other(anyhow::anyhow!( + "invalid param number for pagestream command" + ))); + } + let tenant_id = TenantId::from_str(params[0]) + .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; + let timeline_id = TimelineId::from_str(params[1]) + .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; + + tracing::Span::current() + .record("tenant_id", field::display(tenant_id)) + .record("timeline_id", field::display(timeline_id)); + + 
self.check_permission(Some(tenant_id))?; + + self.handle_pagerequests( + pgb, + tenant_id, + timeline_id, + PagestreamProtocolVersion::V2, + ctx, + ) + .await?; + } else if query_string.starts_with("pagestream ") { let (_, params_raw) = query_string.split_at("pagestream ".len()); let params = params_raw.split(' ').collect::>(); if params.len() != 2 { @@ -1426,8 +1469,14 @@ where self.check_permission(Some(tenant_id))?; - self.handle_pagerequests(pgb, tenant_id, timeline_id, ctx) - .await?; + self.handle_pagerequests( + pgb, + tenant_id, + timeline_id, + PagestreamProtocolVersion::V1, + ctx, + ) + .await?; } else if query_string.starts_with("basebackup ") { let (_, params_raw) = query_string.split_at("basebackup ".len()); let params = params_raw.split_whitespace().collect::>(); diff --git a/trace/src/main.rs b/trace/src/main.rs index 4605c124e9..049f922b6f 100644 --- a/trace/src/main.rs +++ b/trace/src/main.rs @@ -7,7 +7,9 @@ use std::{ io::BufReader, }; -use pageserver_api::models::{PagestreamFeMessage, PagestreamGetPageRequest}; +use pageserver_api::models::{ + PagestreamFeMessage, PagestreamGetPageRequest, PagestreamProtocolVersion, +}; use utils::id::{ConnectionId, TenantId, TimelineId}; use clap::{Parser, Subcommand}; @@ -56,7 +58,7 @@ fn analyze_trace(mut reader: R) { let mut prev: Option = None; // Compute stats - while let Ok(msg) = PagestreamFeMessage::parse(&mut reader) { + while let Ok(msg) = PagestreamFeMessage::parse(&mut reader, PagestreamProtocolVersion::V2) { match msg { PagestreamFeMessage::Exists(_) => {} PagestreamFeMessage::Nblocks(_) => {} @@ -89,7 +91,7 @@ fn analyze_trace(mut reader: R) { } fn dump_trace(mut reader: R) { - while let Ok(msg) = PagestreamFeMessage::parse(&mut reader) { + while let Ok(msg) = PagestreamFeMessage::parse(&mut reader, PagestreamProtocolVersion::V2) { println!("{msg:?}"); } } From a2a44ea213905ecd0f20b38f41a5725138214ee0 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 25 Apr 2024 19:45:45 +0300 Subject: [PATCH 0623/1571] Refactor how the request LSNs are tracked in compute (#7377) Instead of thinking in terms of 'latest' and 'lsn' of the request, each request has two LSNs: the request LSN and 'not_modified_since' LSN. The request is nominally made at the request LSN, that determines what page version we want to see. But as a hint, we also include 'not_modified_since'. It tells the pageserver that the page has not been modified since that LSN, which allows the pageserver to skip waiting for newer WAL to arrive, and could allow more optimizations in the future. Refactor the internal functions to calculate the request LSN to calculate both LSNs. Sending two LSNs to the pageserver requires using the new protocol version 2. The previous commit added the server support for it, but we still default to the old protocol for compatibility with old pageservers. The 'neon.protocol_version' GUC can be used to use the new protocol. The new protocol addresses one cause of issue #6211, although you can still get the same error if you have a standby that is lagging behind so that the page version it needs is genuinely GC'd away. 
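As a rough sketch of the LSN selection described above: the real implementation is the C code in pgxn/neon/pagestore_smgr.c (around neon_get_request_lsn); the Rust below is only illustrative, and the type and helper names are made up.

    #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
    struct Lsn(u64);

    impl Lsn {
        /// Special value meaning "give me the latest page version".
        const MAX: Lsn = Lsn(u64::MAX);
    }

    /// Pick (request_lsn, not_modified_since) for a GetPage request.
    fn request_lsns(is_primary: bool, replay_lsn: Lsn, last_written_lsn: Lsn) -> (Lsn, Lsn) {
        if is_primary {
            // The primary knows no one else writes WAL, so it asks for the
            // latest version and passes the last-written LSN it tracks for
            // the page as the "not modified since" hint.
            (Lsn::MAX, last_written_lsn)
        } else {
            // A standby asks for the version at its current replay LSN and
            // still passes the hint, clamped so that it never exceeds the
            // request LSN.
            (replay_lsn, std::cmp::min(last_written_lsn, replay_lsn))
        }
    }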
--- pgxn/neon/libpagestore.c | 26 +- pgxn/neon/pagestore_client.h | 34 +- pgxn/neon/pagestore_smgr.c | 438 +++++++++++------- pgxn/neon_test_utils/Makefile | 2 +- ...tils--1.0.sql => neon_test_utils--1.1.sql} | 4 +- pgxn/neon_test_utils/neon_test_utils.control | 2 +- pgxn/neon_test_utils/neontest.c | 29 +- test_runner/regress/test_read_validation.py | 40 +- test_runner/regress/test_vm_bits.py | 4 +- 9 files changed, 377 insertions(+), 202 deletions(-) rename pgxn/neon_test_utils/{neon_test_utils--1.0.sql => neon_test_utils--1.1.sql} (89%) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 2276b4e807..b7b1e7ccbf 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -49,6 +49,8 @@ char *neon_auth_token; int readahead_buffer_size = 128; int flush_every_n_requests = 8; +int neon_protocol_version = 1; + static int n_reconnect_attempts = 0; static int max_reconnect_attempts = 60; static int stripe_size; @@ -379,7 +381,17 @@ pageserver_connect(shardno_t shard_no, int elevel) pfree(msg); return false; } - query = psprintf("pagestream %s %s", neon_tenant, neon_timeline); + switch (neon_protocol_version) + { + case 2: + query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline); + break; + case 1: + query = psprintf("pagestream %s %s", neon_tenant, neon_timeline); + break; + default: + elog(ERROR, "unexpected neon_protocol_version %d", neon_protocol_version); + } ret = PQsendQuery(conn, query); pfree(query); if (ret != 1) @@ -440,7 +452,7 @@ pageserver_connect(shardno_t shard_no, int elevel) return false; } - neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s'", connstr); + neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s' with protocol version %d", connstr, neon_protocol_version); page_servers[shard_no].conn = conn; page_servers[shard_no].wes = wes; @@ -844,6 +856,16 @@ pg_init_libpagestore(void) PGC_USERSET, 0, /* no flags required */ NULL, (GucIntAssignHook) &readahead_buffer_resize, NULL); + DefineCustomIntVariable("neon.protocol_version", + "Version of compute<->page server protocol", + NULL, + &neon_protocol_version, + 1, /* default to old protocol for now */ + 1, /* min */ + 2, /* max */ + PGC_SU_BACKEND, + 0, /* no flags required */ + NULL, NULL, NULL); relsize_hash_init(); diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 44ae766f76..7709ab9d42 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -69,18 +69,33 @@ typedef enum { SLRU_MULTIXACT_OFFSETS } SlruKind; -/* - * supertype of all the Neon*Request structs below +/*-- + * supertype of all the Neon*Request structs below. * - * If 'latest' is true, we are requesting the latest page version, and 'lsn' - * is just a hint to the server that we know there are no versions of the page - * (or relation size, for exists/nblocks requests) later than the 'lsn'. + * All requests contain two LSNs: + * + * lsn: request page (or relation size, etc) at this LSN + * not_modified_since: Hint that the page hasn't been modified between + * this LSN and the request LSN (`lsn`). + * + * To request the latest version of a page, you can use MAX_LSN as the request + * LSN. + * + * If you don't know any better, you can always set 'not_modified_since' equal + * to 'lsn', but providing a lower value can speed up processing the request + * in the pageserver, as it doesn't need to wait for the WAL to arrive, and it + * can skip traversing through recent layers which we know to not contain any + * versions for the requested page. 
+ * + * These structs describe the V2 of these requests. The old V1 protocol contained + * just one LSN and a boolean 'latest' flag. If the neon_protocol_version GUC is + * set to 1, we will convert these to the V1 requests before sending. */ typedef struct { NeonMessageTag tag; - bool latest; /* if true, request latest page version */ - XLogRecPtr lsn; /* request page version @ this LSN */ + XLogRecPtr lsn; + XLogRecPtr not_modified_since; } NeonRequest; typedef struct @@ -193,6 +208,7 @@ extern int readahead_buffer_size; extern char *neon_timeline; extern char *neon_tenant; extern int32 max_cluster_size; +extern int neon_protocol_version; extern shardno_t get_shard_number(BufferTag* tag); @@ -225,14 +241,14 @@ extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum, extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer); extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer); + XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer); extern void neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); #else extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void *buffer); extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, void *buffer); + XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer); extern void neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync); #endif diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 57a16e00ca..44ecdbd9aa 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -168,8 +168,8 @@ typedef enum PrefetchStatus typedef struct PrefetchRequest { BufferTag buftag; /* must be first entry in the struct */ - XLogRecPtr effective_request_lsn; - XLogRecPtr actual_request_lsn; + XLogRecPtr request_lsn; + XLogRecPtr not_modified_since; NeonResponse *response; /* may be null */ PrefetchStatus status; shardno_t shard_no; @@ -269,19 +269,19 @@ static PrefetchState *MyPState; ) \ ) -static XLogRecPtr prefetch_lsn = 0; - static bool compact_prefetch_buffers(void); static void consume_prefetch_responses(void); -static uint64 prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn); +static uint64 prefetch_register_buffer(BufferTag tag, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since); static bool prefetch_read(PrefetchRequest *slot); -static void prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn); +static void prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since); static bool prefetch_wait_for(uint64 ring_index); static void prefetch_cleanup_trailing_unused(void); static inline void prefetch_set_unused(uint64 ring_index); -static XLogRecPtr neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, - ForkNumber forknum, BlockNumber blkno); +static void neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, + XLogRecPtr *request_lsn, XLogRecPtr *not_modified_since); +static bool neon_prefetch_response_usable(XLogRecPtr request_lsn, XLogRecPtr not_modified_since, + PrefetchRequest *slot); static bool compact_prefetch_buffers(void) @@ -338,8 +338,8 @@ 
compact_prefetch_buffers(void) target_slot->shard_no = source_slot->shard_no; target_slot->status = source_slot->status; target_slot->response = source_slot->response; - target_slot->effective_request_lsn = source_slot->effective_request_lsn; - target_slot->actual_request_lsn = source_slot->actual_request_lsn; + target_slot->request_lsn = source_slot->request_lsn; + target_slot->not_modified_since = source_slot->not_modified_since; target_slot->my_ring_index = empty_ring_index; prfh_delete(MyPState->prf_hash, source_slot); @@ -358,7 +358,8 @@ compact_prefetch_buffers(void) }; source_slot->response = NULL; source_slot->my_ring_index = 0; - source_slot->effective_request_lsn = 0; + source_slot->request_lsn = InvalidXLogRecPtr; + source_slot->not_modified_since = InvalidXLogRecPtr; /* update bookkeeping */ n_moved++; @@ -683,56 +684,39 @@ prefetch_set_unused(uint64 ring_index) compact_prefetch_buffers(); } +/* + * Send one prefetch request to the pageserver. To wait for the response, call + * prefetch_wait_for(). + */ static void -prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn) +prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since) { bool found; NeonGetPageRequest request = { .req.tag = T_NeonGetPageRequest, - .req.latest = false, - .req.lsn = 0, + /* lsn and not_modified_since are filled in below */ .rinfo = BufTagGetNRelFileInfo(slot->buftag), .forknum = slot->buftag.forkNum, .blkno = slot->buftag.blockNum, }; - if (force_lsn && force_latest) + Assert(((force_request_lsn != NULL) == (force_not_modified_since != NULL))); + + if (force_request_lsn) { - request.req.lsn = *force_lsn; - request.req.latest = *force_latest; - slot->actual_request_lsn = slot->effective_request_lsn = *force_lsn; + request.req.lsn = *force_request_lsn; + request.req.not_modified_since = *force_not_modified_since; } else { - XLogRecPtr lsn = neon_get_request_lsn( - &request.req.latest, - BufTagGetNRelFileInfo(slot->buftag), - slot->buftag.forkNum, - slot->buftag.blockNum - ); - - /* - * Note: effective_request_lsn is potentially higher than the - * requested LSN, but still correct: - * - * We know there are no changes between the actual requested LSN and - * the value of effective_request_lsn: If there were, the page would - * have been in cache and evicted between those LSN values, which then - * would have had to result in a larger request LSN for this page. - * - * It is possible that a concurrent backend loads the page, modifies - * it and then evicts it again, but the LSN of that eviction cannot be - * smaller than the current WAL insert/redo pointer, which is already - * larger than this prefetch_lsn. So in any case, that would - * invalidate this cache. - * - * The best LSN to use for effective_request_lsn would be - * XLogCtl->Insert.RedoRecPtr, but that's expensive to access. 
- */ - slot->actual_request_lsn = request.req.lsn = lsn; - prefetch_lsn = Max(prefetch_lsn, lsn); - slot->effective_request_lsn = prefetch_lsn; + neon_get_request_lsn(BufTagGetNRelFileInfo(slot->buftag), + slot->buftag.forkNum, + slot->buftag.blockNum, + &request.req.lsn, + &request.req.not_modified_since); } + slot->request_lsn = request.req.lsn; + slot->not_modified_since = request.req.not_modified_since; Assert(slot->response == NULL); Assert(slot->my_ring_index == MyPState->ring_unused); @@ -749,7 +733,6 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force /* update slot state */ slot->status = PRFS_REQUESTED; - prfh_insert(MyPState->prf_hash, slot, &found); Assert(!found); } @@ -759,22 +742,25 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force * * Register that we may want the contents of BufferTag in the near future. * - * If force_latest and force_lsn are not NULL, those values are sent to the - * pageserver. If they are NULL, we utilize the lastWrittenLsn -infrastructure - * to fill in these values manually. + * If force_request_lsn and force_not_modified_since are not NULL, those + * values are sent to the pageserver. If they are NULL, we utilize the + * lastWrittenLsn -infrastructure to fill them in. * * NOTE: this function may indirectly update MyPState->pfs_hash; which * invalidates any active pointers into the hash table. */ static uint64 -prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn) +prefetch_register_buffer(BufferTag tag, XLogRecPtr *force_request_lsn, + XLogRecPtr *force_not_modified_since) { uint64 ring_index; PrefetchRequest req; PrefetchRequest *slot; PrfHashEntry *entry; + Assert(((force_request_lsn != NULL) == (force_not_modified_since != NULL))); + /* use an intermediate PrefetchRequest struct to ensure correct alignment */ req.buftag = tag; Retry: @@ -792,40 +778,19 @@ Retry: Assert(BUFFERTAGS_EQUAL(slot->buftag, tag)); /* - * If we want a specific lsn, we do not accept requests that were made - * with a potentially different LSN. + * If the caller specified a request LSN to use, only accept prefetch + * responses that satisfy that request. 
*/ - if (force_latest && force_lsn) + if (force_request_lsn) { - /* - * if we want the latest version, any effective_request_lsn < - * request lsn is OK - */ - if (*force_latest) + if (!neon_prefetch_response_usable(*force_request_lsn, + *force_not_modified_since, slot)) { - if (*force_lsn > slot->effective_request_lsn) - { - if (!prefetch_wait_for(ring_index)) - goto Retry; - prefetch_set_unused(ring_index); - entry = NULL; - } - - } - - /* - * if we don't want the latest version, only accept requests with - * the exact same LSN - */ - else - { - if (*force_lsn != slot->effective_request_lsn) - { - if (!prefetch_wait_for(ring_index)) - goto Retry; - prefetch_set_unused(ring_index); - entry = NULL; - } + /* Wait for the old request to finish and discard it */ + if (!prefetch_wait_for(ring_index)) + goto Retry; + prefetch_set_unused(ring_index); + entry = NULL; } } @@ -921,7 +886,7 @@ Retry: slot->shard_no = get_shard_number(&tag); slot->my_ring_index = ring_index; - prefetch_do_request(slot, force_latest, force_lsn); + prefetch_do_request(slot, force_request_lsn, force_not_modified_since); Assert(slot->status == PRFS_REQUESTED); Assert(MyPState->ring_last <= ring_index && ring_index < MyPState->ring_unused); @@ -950,7 +915,7 @@ page_server_request(void const *req) BufferTag tag = {0}; shardno_t shard_no; - switch (((NeonRequest *) req)->tag) + switch (messageTag(req)) { case T_NeonExistsRequest: CopyNRelFileInfoToBufTag(tag, ((NeonExistsRequest *) req)->rinfo); @@ -966,11 +931,10 @@ page_server_request(void const *req) tag.blockNum = ((NeonGetPageRequest *) req)->blkno; break; default: - neon_log(ERROR, "Unexpected request tag: %d", ((NeonRequest *) req)->tag); + neon_log(ERROR, "Unexpected request tag: %d", messageTag(req)); } shard_no = get_shard_number(&tag); - /* * Current sharding model assumes that all metadata is present only at shard 0. * We still need to call get_shard_no() to check if shard map is up-to-date. @@ -997,8 +961,52 @@ nm_pack_request(NeonRequest *msg) StringInfoData s; initStringInfo(&s); - pq_sendbyte(&s, msg->tag); + if (neon_protocol_version >= 2) + { + pq_sendbyte(&s, msg->tag); + pq_sendint64(&s, msg->lsn); + pq_sendint64(&s, msg->not_modified_since); + } + else + { + bool latest; + XLogRecPtr lsn; + + /* + * In primary, we always request the latest page version. + */ + if (!RecoveryInProgress()) + { + latest = true; + lsn = msg->not_modified_since; + } + else + { + /* + * In the protocol V1, we cannot represent that we want to read + * page at LSN X, and we know that it hasn't been modified since + * Y. We can either use 'not_modified_lsn' as the request LSN, and + * risk getting an error if that LSN is too old and has already + * fallen out of the pageserver's GC horizon, or we can send + * 'request_lsn', causing the pageserver to possibly wait for the + * recent WAL to arrive unnecessarily. Or something in between. We + * choose to use the old LSN and risk GC errors, because that's + * what we've done historically. 
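For reference, the two request headers produced by this function can be summarized as follows (an editorial summary derived from the packing code in this hunk, not text from the patch):

    /*
     * V2 request header: tag (byte) | lsn (int64) | not_modified_since (int64)
     * V1 request header: tag (byte) | latest (byte) | lsn (int64)
     *
     * The per-request payload that follows (rinfo, forknum, blkno, ...) is
     * identical in both protocol versions.
     */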
+ */ + latest = false; + lsn = msg->not_modified_since; + } + + pq_sendbyte(&s, msg->tag); + pq_sendbyte(&s, latest); + pq_sendint64(&s, lsn); + } + + /* + * The rest of the request messages are the same between protocol V1 and + * V2 + */ switch (messageTag(msg)) { /* pagestore_client -> pagestore */ @@ -1006,8 +1014,6 @@ nm_pack_request(NeonRequest *msg) { NeonExistsRequest *msg_req = (NeonExistsRequest *) msg; - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); @@ -1019,8 +1025,6 @@ nm_pack_request(NeonRequest *msg) { NeonNblocksRequest *msg_req = (NeonNblocksRequest *) msg; - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); @@ -1032,8 +1036,6 @@ nm_pack_request(NeonRequest *msg) { NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg; - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); pq_sendint32(&s, msg_req->dbNode); break; @@ -1042,8 +1044,6 @@ nm_pack_request(NeonRequest *msg) { NeonGetPageRequest *msg_req = (NeonGetPageRequest *) msg; - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); @@ -1057,8 +1057,6 @@ nm_pack_request(NeonRequest *msg) { NeonGetSlruSegmentRequest *msg_req = (NeonGetSlruSegmentRequest *) msg; - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); pq_sendbyte(&s, msg_req->kind); pq_sendint32(&s, msg_req->segno); @@ -1209,7 +1207,7 @@ nm_to_string(NeonMessage *msg) appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1222,7 +1220,7 @@ nm_to_string(NeonMessage *msg) appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1236,7 +1234,7 @@ nm_to_string(NeonMessage *msg) appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1247,7 +1245,7 @@ nm_to_string(NeonMessage *msg) appendStringInfoString(&s, "{\"type\": \"NeonDbSizeRequest\""); appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbNode); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", 
LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1259,7 +1257,7 @@ nm_to_string(NeonMessage *msg) appendStringInfo(&s, ", \"kind\": %u", msg_req->kind); appendStringInfo(&s, ", \"segno\": %u", msg_req->segno); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1531,44 +1529,38 @@ nm_adjust_lsn(XLogRecPtr lsn) /* * Return LSN for requesting pages and number of blocks from page server */ -static XLogRecPtr -neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno) +static void +neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, + XLogRecPtr *request_lsn, XLogRecPtr *not_modified_since) { - XLogRecPtr lsn; + XLogRecPtr last_written_lsn; + + last_written_lsn = GetLastWrittenLSN(rinfo, forknum, blkno); + last_written_lsn = nm_adjust_lsn(last_written_lsn); + Assert(last_written_lsn != InvalidXLogRecPtr); if (RecoveryInProgress()) { - /* - * We don't know if WAL has been generated but not yet replayed, so - * we're conservative in our estimates about latest pages. - */ - *latest = false; + /* Request the page at the last replayed LSN. */ + *request_lsn = GetXLogReplayRecPtr(NULL); + *not_modified_since = last_written_lsn; + Assert(last_written_lsn <= *request_lsn); - /* - * Get the last written LSN of this page. - */ - lsn = GetLastWrittenLSN(rinfo, forknum, blkno); - lsn = nm_adjust_lsn(lsn); - - neon_log(DEBUG1, "neon_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ", - (uint32) ((lsn) >> 32), (uint32) (lsn)); + neon_log(DEBUG1, "neon_get_request_lsn request lsn %X/%X, not_modified_since %X/%X", + LSN_FORMAT_ARGS(*request_lsn), LSN_FORMAT_ARGS(*not_modified_since)); } else { XLogRecPtr flushlsn; /* - * Use the latest LSN that was evicted from the buffer cache. Any - * pages modified by later WAL records must still in the buffer cache, - * so our request cannot concern those. + * Use the latest LSN that was evicted from the buffer cache as the + * 'not_modified_since' hint. Any pages modified by later WAL records + * must still in the buffer cache, so our request cannot concern + * those. */ - *latest = true; - lsn = GetLastWrittenLSN(rinfo, forknum, blkno); - Assert(lsn != InvalidXLogRecPtr); neon_log(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ", - (uint32) ((lsn) >> 32), (uint32) (lsn)); - - lsn = nm_adjust_lsn(lsn); + LSN_FORMAT_ARGS(last_written_lsn)); /* * Is it possible that the last-written LSN is ahead of last flush @@ -1583,16 +1575,109 @@ neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, Block #else flushlsn = GetFlushRecPtr(); #endif - if (lsn > flushlsn) + if (last_written_lsn > flushlsn) { neon_log(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X", - (uint32) (lsn >> 32), (uint32) lsn, - (uint32) (flushlsn >> 32), (uint32) flushlsn); - XLogFlush(lsn); + LSN_FORMAT_ARGS(last_written_lsn), + LSN_FORMAT_ARGS(flushlsn)); + XLogFlush(last_written_lsn); + flushlsn = last_written_lsn; } + + /* + * Request the latest version of the page. 
The most up-to-date request + * LSN we could use would be the current insert LSN, but to avoid the + * overhead of looking it up, use 'flushlsn' instead. This relies on + * the assumption that if the page was modified since the last WAL + * flush, it should still be in the buffer cache, and we wouldn't be + * requesting it. + */ + *request_lsn = flushlsn; + *not_modified_since = last_written_lsn; + } +} + +/* + * neon_prefetch_response_usable -- Can a new request be satisfied by old one? + * + * This is used to check if the response to a prefetch request can be used to + * satisfy a page read now. + */ +static bool +neon_prefetch_response_usable(XLogRecPtr request_lsn, XLogRecPtr not_modified_since, + PrefetchRequest *slot) +{ + /* sanity check the LSN's on the old and the new request */ + Assert(request_lsn >= not_modified_since); + Assert(slot->request_lsn >= slot->not_modified_since); + Assert(slot->status != PRFS_UNUSED); + + /* + * The new request's LSN should never be older than the old one. This + * could be an Assert, except that for testing purposes, we do provide an + * interface in neon_test_utils to fetch pages at arbitary LSNs, which + * violates this. + * + * Similarly, the not_modified_since value calculated for a page should + * never move backwards. This assumption is a bit fragile; if we updated + * the last-written cache when we read in a page, for example, then it + * might. But as the code stands, it should not. + * + * (If two backends issue a request at the same time, they might race and + * calculate LSNs "out of order" with each other, but the prefetch queue + * is backend-private at the moment.) + */ + if (request_lsn < slot->request_lsn || not_modified_since < slot->not_modified_since) + { + ereport(LOG, + (errcode(ERRCODE_IO_ERROR), + errmsg(NEON_TAG "request with unexpected LSN after prefetch"), + errdetail("Request %X/%X not_modified_since %X/%X, prefetch %X/%X not_modified_since %X/%X)", + LSN_FORMAT_ARGS(request_lsn), LSN_FORMAT_ARGS(not_modified_since), + LSN_FORMAT_ARGS(slot->request_lsn), LSN_FORMAT_ARGS(slot->not_modified_since)))); + return false; } - return lsn; + /*--- + * Each request to the pageserver carries two LSN values: + * `not_modified_since` and `request_lsn`. The (not_modified_since, + * request_lsn] range of each request is effectively a claim that the page + * has not been modified between those LSNs. If the range of the old + * request in the queue overlaps with the new request, we know that the + * page hasn't been modified in the union of the ranges. We can use the + * response to old request to satisfy the new request in that case. For + * example: + * + * 100 500 + * Old request: +--------+ + * + * 400 800 + * New request: +--------+ + * + * The old request claims that the page was not modified between LSNs 100 + * and 500, and the second claims that it was not modified between 400 and + * 800. Together they mean that the page was not modified between 100 and + * 800. Therefore the response to the old request is also valid for the + * new request. + * + * This logic also holds at the boundary case that the old request's LSN + * matches the new request's not_modified_since LSN exactly: + * + * 100 500 + * Old request: +--------+ + * + * 500 900 + * New request: +--------+ + * + * The response to the old request is the page as it was at LSN 500, and + * the page hasn't been changed in the range (500, 900], therefore the + * response is valid also for the new request. 
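The reasoning above boils down to an interval-overlap test. A minimal self-contained sketch (illustrative only: plain uint64_t stands in for XLogRecPtr and the function name is invented; the real function below only needs the final comparison, because its earlier checks already guarantee the other operand):

    #include <stdbool.h>
    #include <stdint.h>

    /* Each request claims "page unchanged over (not_modified_since, request_lsn]".
     * An old response can serve a new request iff the two ranges overlap or touch,
     * because then the page is known to be unchanged over their union. */
    static bool
    prefetch_ranges_compatible(uint64_t new_not_modified_since, uint64_t new_request_lsn,
                               uint64_t old_not_modified_since, uint64_t old_request_lsn)
    {
        return new_request_lsn >= old_not_modified_since &&
               new_not_modified_since <= old_request_lsn;
    }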
+ */ + + /* this follows from the checks above */ + Assert(request_lsn >= slot->not_modified_since); + + return not_modified_since <= slot->request_lsn; } /* @@ -1604,8 +1689,8 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) bool exists; NeonResponse *resp; BlockNumber n_blocks; - bool latest; XLogRecPtr request_lsn; + XLogRecPtr not_modified_since; switch (reln->smgr_relpersistence) { @@ -1660,12 +1745,13 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) return false; } - request_lsn = neon_get_request_lsn(&latest, InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO); + neon_get_request_lsn(InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO, + &request_lsn, ¬_modified_since); { NeonExistsRequest request = { .req.tag = T_NeonExistsRequest, - .req.latest = latest, .req.lsn = request_lsn, + .req.not_modified_since = not_modified_since, .rinfo = InfoFromSMgrRel(reln), .forknum = forkNum}; @@ -2102,10 +2188,10 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum, void #if PG_MAJORVERSION_NUM < 16 neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer) + XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer) #else neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, void *buffer) + XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer) #endif { NeonResponse *resp; @@ -2148,15 +2234,16 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, if (entry != NULL) { slot = entry->slot; - if (slot->effective_request_lsn >= request_lsn) + if (neon_prefetch_response_usable(request_lsn, not_modified_since, slot)) { ring_index = slot->my_ring_index; pgBufferUsage.prefetch.hits += 1; } - else /* the current prefetch LSN is not large - * enough, so drop the prefetch */ + else { /* + * Cannot use this prefetch, discard it + * * We can't drop cache for not-yet-received requested items. 
It is * unlikely this happens, but it can happen if prefetch distance * is large enough and a backend didn't consume all prefetch @@ -2181,8 +2268,8 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, { pgBufferUsage.prefetch.misses += 1; - ring_index = prefetch_register_buffer(buftag, &request_latest, - &request_lsn); + ring_index = prefetch_register_buffer(buftag, &request_lsn, + ¬_modified_since); slot = GetPrfSlot(ring_index); } else @@ -2246,8 +2333,8 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer) #endif { - bool latest; XLogRecPtr request_lsn; + XLogRecPtr not_modified_since; switch (reln->smgr_relpersistence) { @@ -2272,8 +2359,9 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer return; } - request_lsn = neon_get_request_lsn(&latest, InfoFromSMgrRel(reln), forkNum, blkno); - neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsn, latest, buffer); + neon_get_request_lsn(InfoFromSMgrRel(reln), forkNum, blkno, + &request_lsn, ¬_modified_since); + neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsn, not_modified_since, buffer); #ifdef DEBUG_COMPARE_LOCAL if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) @@ -2442,8 +2530,8 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) { NeonResponse *resp; BlockNumber n_blocks; - bool latest; XLogRecPtr request_lsn; + XLogRecPtr not_modified_since; switch (reln->smgr_relpersistence) { @@ -2470,12 +2558,13 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) return n_blocks; } - request_lsn = neon_get_request_lsn(&latest, InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO); + neon_get_request_lsn(InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO, + &request_lsn, ¬_modified_since); { NeonNblocksRequest request = { .req.tag = T_NeonNblocksRequest, - .req.latest = latest, .req.lsn = request_lsn, + .req.not_modified_since = not_modified_since, .rinfo = InfoFromSMgrRel(reln), .forknum = forknum, }; @@ -2523,16 +2612,17 @@ neon_dbsize(Oid dbNode) { NeonResponse *resp; int64 db_size; - XLogRecPtr request_lsn; - bool latest; + XLogRecPtr request_lsn, + not_modified_since; NRelFileInfo dummy_node = {0}; - request_lsn = neon_get_request_lsn(&latest, dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO); + neon_get_request_lsn(dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO, + &request_lsn, ¬_modified_since); { NeonDbSizeRequest request = { .req.tag = T_NeonDbSizeRequest, - .req.latest = latest, .req.lsn = request_lsn, + .req.not_modified_since = not_modified_since, .dbNode = dbNode, }; @@ -2605,7 +2695,6 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) * the most recently inserted WAL record's LSN. */ lsn = GetXLogInsertRecPtr(); - lsn = nm_adjust_lsn(lsn); /* @@ -2805,14 +2894,33 @@ neon_end_unlogged_build(SMgrRelation reln) static int neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buffer) { - XLogRecPtr request_lsn; - /* - * GetRedoStartLsn() returns LSN of basebackup. - * We need to download SLRU segments only once after node startup, - * then SLRUs are maintained locally. - */ - request_lsn = GetRedoStartLsn(); + XLogRecPtr request_lsn, + not_modified_since; + + if (RecoveryInProgress()) + { + request_lsn = GetXLogReplayRecPtr(NULL); + if (request_lsn == InvalidXLogRecPtr) + { + /* + * This happens in neon startup, we start up without replaying any + * records. 
+ */ + request_lsn = GetRedoStartLsn(); + } + } + else + request_lsn = GetXLogInsertRecPtr(); request_lsn = nm_adjust_lsn(request_lsn); + + /* + * GetRedoStartLsn() returns LSN of basebackup. We know that the SLRU + * segment has not changed since the basebackup, because in order to + * modify it, we would have had to download it already. And once + * downloaded, we never evict SLRU segments from local disk. + */ + not_modified_since = GetRedoStartLsn(); + SlruKind kind; if (STRPREFIX(path, "pg_xact")) @@ -2827,8 +2935,8 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf NeonResponse *resp; NeonGetSlruSegmentRequest request = { .req.tag = T_NeonGetSlruSegmentRequest, - .req.latest = false, .req.lsn = request_lsn, + .req.not_modified_since = not_modified_since, .kind = kind, .segno = segno @@ -2956,6 +3064,9 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, { BlockNumber relsize; + /* This is only used in WAL replay */ + Assert(RecoveryInProgress()); + /* Extend the relation if we know its size */ if (get_cached_relsize(rinfo, forknum, &relsize)) { @@ -2974,14 +3085,13 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, * This length is later reused when we open the smgr to read the * block, which is fine and expected. */ - NeonResponse *response; NeonNblocksResponse *nbresponse; NeonNblocksRequest request = { .req = (NeonRequest) { - .lsn = end_recptr, - .latest = false, .tag = T_NeonNblocksRequest, + .lsn = end_recptr, + .not_modified_since = end_recptr, }, .rinfo = rinfo, .forknum = forknum, diff --git a/pgxn/neon_test_utils/Makefile b/pgxn/neon_test_utils/Makefile index 9c774ec185..1ee87357e5 100644 --- a/pgxn/neon_test_utils/Makefile +++ b/pgxn/neon_test_utils/Makefile @@ -7,7 +7,7 @@ OBJS = \ neontest.o EXTENSION = neon_test_utils -DATA = neon_test_utils--1.0.sql +DATA = neon_test_utils--1.1.sql PGFILEDESC = "neon_test_utils - helpers for neon testing and debugging" PG_CONFIG = pg_config diff --git a/pgxn/neon_test_utils/neon_test_utils--1.0.sql b/pgxn/neon_test_utils/neon_test_utils--1.1.sql similarity index 89% rename from pgxn/neon_test_utils/neon_test_utils--1.0.sql rename to pgxn/neon_test_utils/neon_test_utils--1.1.sql index 23340e352e..534784f319 100644 --- a/pgxn/neon_test_utils/neon_test_utils--1.0.sql +++ b/pgxn/neon_test_utils/neon_test_utils--1.1.sql @@ -31,12 +31,12 @@ AS 'MODULE_PATHNAME', 'clear_buffer_cache' LANGUAGE C STRICT PARALLEL UNSAFE; -CREATE FUNCTION get_raw_page_at_lsn(relname text, forkname text, blocknum int8, lsn pg_lsn) +CREATE FUNCTION get_raw_page_at_lsn(relname text, forkname text, blocknum int8, request_lsn pg_lsn, not_modified_since pg_lsn) RETURNS bytea AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn' LANGUAGE C PARALLEL UNSAFE; -CREATE FUNCTION get_raw_page_at_lsn(tbspc oid, db oid, relfilenode oid, forknum int8, blocknum int8, lsn pg_lsn) +CREATE FUNCTION get_raw_page_at_lsn(tbspc oid, db oid, relfilenode oid, forknum int8, blocknum int8, request_lsn pg_lsn, not_modified_since pg_lsn) RETURNS bytea AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn_ex' LANGUAGE C PARALLEL UNSAFE; diff --git a/pgxn/neon_test_utils/neon_test_utils.control b/pgxn/neon_test_utils/neon_test_utils.control index 5219571f11..5f6d640835 100644 --- a/pgxn/neon_test_utils/neon_test_utils.control +++ b/pgxn/neon_test_utils/neon_test_utils.control @@ -1,6 +1,6 @@ # neon_test_utils extension comment = 'helpers for neon testing and debugging' -default_version = '1.0' +default_version = '1.1' 
module_pathname = '$libdir/neon_test_utils' relocatable = true trusted = true diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c index 82ce5be9f6..677006923d 100644 --- a/pgxn/neon_test_utils/neontest.c +++ b/pgxn/neon_test_utils/neontest.c @@ -48,10 +48,10 @@ PG_FUNCTION_INFO_V1(neon_xlogflush); */ #if PG_MAJORVERSION_NUM < 16 typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer); + XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer); #else typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, void *buffer); + XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer); #endif static neon_read_at_lsn_type neon_read_at_lsn_ptr; @@ -299,8 +299,11 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS) text *forkname; uint32 blkno; - bool request_latest = PG_ARGISNULL(3); - uint64 read_lsn = request_latest ? GetXLogInsertRecPtr() : PG_GETARG_INT64(3); + XLogRecPtr request_lsn; + XLogRecPtr not_modified_since; + + if (PG_NARGS() != 5) + elog(ERROR, "unexpected number of arguments in SQL function signature"); if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2)) PG_RETURN_NULL(); @@ -309,6 +312,9 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS) forkname = PG_GETARG_TEXT_PP(1); blkno = PG_GETARG_UINT32(2); + request_lsn = PG_ARGISNULL(3) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(3); + not_modified_since = PG_ARGISNULL(4) ? request_lsn : PG_GETARG_LSN(4); + if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), @@ -361,7 +367,7 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS) SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); raw_page_data = VARDATA(raw_page); - neon_read_at_lsn(InfoFromRelation(rel), forknum, blkno, read_lsn, request_latest, raw_page_data); + neon_read_at_lsn(InfoFromRelation(rel), forknum, blkno, request_lsn, not_modified_since, raw_page_data); relation_close(rel, AccessShareLock); @@ -380,6 +386,9 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) { char *raw_page_data; + if (PG_NARGS() != 7) + elog(ERROR, "unexpected number of arguments in SQL function signature"); + if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), @@ -403,18 +412,20 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) }; ForkNumber forknum = PG_GETARG_UINT32(3); - uint32 blkno = PG_GETARG_UINT32(4); - bool request_latest = PG_ARGISNULL(5); - uint64 read_lsn = request_latest ? GetXLogInsertRecPtr() : PG_GETARG_INT64(5); + XLogRecPtr request_lsn; + XLogRecPtr not_modified_since; /* Initialize buffer to copy to */ bytea *raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ); + request_lsn = PG_ARGISNULL(5) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(5); + not_modified_since = PG_ARGISNULL(6) ? 
request_lsn : PG_GETARG_LSN(6); + SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); raw_page_data = VARDATA(raw_page); - neon_read_at_lsn(rinfo, forknum, blkno, read_lsn, request_latest, raw_page_data); + neon_read_at_lsn(rinfo, forknum, blkno, request_lsn, not_modified_since, raw_page_data); PG_RETURN_BYTEA_P(raw_page); } } diff --git a/test_runner/regress/test_read_validation.py b/test_runner/regress/test_read_validation.py index 868b80a561..2437c8f806 100644 --- a/test_runner/regress/test_read_validation.py +++ b/test_runner/regress/test_read_validation.py @@ -17,7 +17,14 @@ def test_read_validation(neon_simple_env: NeonEnv): env = neon_simple_env env.neon_cli.create_branch("test_read_validation", "empty") - endpoint = env.endpoints.create_start("test_read_validation") + endpoint = env.endpoints.create_start( + "test_read_validation", + # Use protocol version 2, because the code that constructs the V1 messages + # assumes that a primary always wants to read the latest version of a page, + # and therefore doesn't work with the test functions below to read an older + # page version. + config_lines=["neon.protocol_version=2"], + ) with closing(endpoint.connect()) as con: with con.cursor() as c: @@ -64,7 +71,7 @@ def test_read_validation(neon_simple_env: NeonEnv): log.info("Cache is clear, reading stale page version") c.execute( - f"select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '{first[0]}'))" + f"select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '{first[0]}', NULL))" ) direct_first = c.fetchone() assert first == direct_first, "Failed fetch page at historic lsn" @@ -77,7 +84,7 @@ def test_read_validation(neon_simple_env: NeonEnv): log.info("Cache is clear, reading latest page version without cache") c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, NULL))" + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, NULL, NULL))" ) direct_latest = c.fetchone() assert second == direct_latest, "Failed fetch page at latest lsn" @@ -92,7 +99,7 @@ def test_read_validation(neon_simple_env: NeonEnv): ) c.execute( - f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, '{first[0]}'))" + f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, '{first[0]}', NULL))" ) direct_first = c.fetchone() assert first == direct_first, "Failed fetch page at historic lsn using oid" @@ -102,7 +109,7 @@ def test_read_validation(neon_simple_env: NeonEnv): ) c.execute( - f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, NULL))" + f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, NULL, NULL))" ) direct_latest = c.fetchone() assert second == direct_latest, "Failed fetch page at latest lsn" @@ -114,7 +121,7 @@ def test_read_validation(neon_simple_env: NeonEnv): ) c.execute( - f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, '{first[0]}'))" + f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, '{first[0]}', NULL))" ) direct_first = c.fetchone() assert first == direct_first, "Failed fetch page at historic lsn using oid" @@ -133,7 +140,14 @@ def test_read_validation_neg(neon_simple_env: NeonEnv): env.pageserver.allowed_errors.append(".*invalid LSN\\(0\\) in request.*") - endpoint = 
env.endpoints.create_start("test_read_validation_neg") + endpoint = env.endpoints.create_start( + "test_read_validation_neg", + # Use protocol version 2, because the code that constructs the V1 messages + # assumes that a primary always wants to read the latest version of a page, + # and therefore doesn't work with the test functions below to read an older + # page version. + config_lines=["neon.protocol_version=2"], + ) with closing(endpoint.connect()) as con: with con.cursor() as c: @@ -143,7 +157,7 @@ def test_read_validation_neg(neon_simple_env: NeonEnv): log.info("read a page of a missing relation") try: c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('Unknown', 'main', 0, '0/0'))" + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('Unknown', 'main', 0, '0/0', NULL))" ) raise AssertionError("query should have failed") except UndefinedTable as e: @@ -155,7 +169,7 @@ def test_read_validation_neg(neon_simple_env: NeonEnv): log.info("read a page at lsn 0") try: c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '0/0'))" + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '0/0', NULL))" ) raise AssertionError("query should have failed") except IoError as e: @@ -164,22 +178,22 @@ def test_read_validation_neg(neon_simple_env: NeonEnv): log.info("Pass NULL as an input") expected = (None, None, None) c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn(NULL, 'main', 0, '0/0'))" + "select lsn, lower, upper from page_header(get_raw_page_at_lsn(NULL, 'main', 0, '0/0', NULL))" ) assert c.fetchone() == expected, "Expected null output" c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', NULL, 0, '0/0'))" + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', NULL, 0, '0/0', NULL))" ) assert c.fetchone() == expected, "Expected null output" c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', NULL, '0/0'))" + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', NULL, '0/0', NULL))" ) assert c.fetchone() == expected, "Expected null output" # This check is currently failing, reading beyond EOF is returning a 0-page log.info("Read beyond EOF") c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 1, NULL))" + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 1, NULL, NULL))" ) diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py index eff103ca09..06f2a8befd 100644 --- a/test_runner/regress/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -173,7 +173,9 @@ def test_vm_bit_clear_on_heap_lock(neon_env_builder: NeonEnvBuilder): # which changes the LSN on the page. 
cur.execute("select get_raw_page( 'vmtest_lock', 'vm', 0 )") vm_page_in_cache = (cur.fetchall()[0][0])[8:100].hex() - cur.execute("select get_raw_page_at_lsn( 'vmtest_lock', 'vm', 0, pg_current_wal_insert_lsn() )") + cur.execute( + "select get_raw_page_at_lsn( 'vmtest_lock', 'vm', 0, pg_current_wal_insert_lsn(), NULL )" + ) vm_page_at_pageserver = (cur.fetchall()[0][0])[8:100].hex() assert vm_page_at_pageserver == vm_page_in_cache From 0397427dcf9de7d16ede744700b6d87c84ebfd46 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 25 Apr 2024 19:45:48 +0300 Subject: [PATCH 0624/1571] Add test for SLRU download (#7377) Before PR #7377, on-demand SLRU download always used the basebackup's LSN in the SLRU download, but that LSN might get garbage-collected away in the pageserver. We should request the latest LSN, like with GetPage requests, with the LSN just indicating that we know that the page hasn't been changed since the LSN (since the basebackup in this case). Add test to demonstrate the problem. Without the fix, it fails with "tried to request a page version that was garbage collected" error from the pageserver. I wrote this test as part of earlier PR #6693, but that fell through the cracks and was never applied. PR #7377 superseded the fix from that older PR, but the test is still valid. --- .../regress/test_ondemand_slru_download.py | 131 ++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 test_runner/regress/test_ondemand_slru_download.py diff --git a/test_runner/regress/test_ondemand_slru_download.py b/test_runner/regress/test_ondemand_slru_download.py new file mode 100644 index 0000000000..0b36b32552 --- /dev/null +++ b/test_runner/regress/test_ondemand_slru_download.py @@ -0,0 +1,131 @@ +from typing import Optional + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder, tenant_get_shards +from fixtures.types import Lsn +from fixtures.utils import query_scalar + + +# +# Test on-demand download of the pg_xact SLRUs +# +@pytest.mark.parametrize("shard_count", [None, 4]) +def test_ondemand_download_pg_xact(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]): + if shard_count is not None: + neon_env_builder.num_pageservers = shard_count + + tenant_conf = { + "lazy_slru_download": "true", + # set PITR interval to be small, so we can do GC + "pitr_interval": "0 s", + } + env = neon_env_builder.init_start( + initial_tenant_conf=tenant_conf, initial_tenant_shard_count=shard_count + ) + + timeline_id = env.initial_timeline + tenant_id = env.initial_tenant + endpoint = env.endpoints.create_start("main") + + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + cur.execute("CREATE EXTENSION neon_test_utils") + + # Create a test table + cur.execute("CREATE TABLE clogtest (id integer)") + cur.execute("INSERT INTO clogtest VALUES (1)") + + # Consume a lot of XIDs, to create more pg_xact segments + for _ in range(1000): + cur.execute("select test_consume_xids(10000);") + cur.execute("INSERT INTO clogtest VALUES (2)") + for _ in range(1000): + cur.execute("select test_consume_xids(10000);") + cur.execute("INSERT INTO clogtest VALUES (2)") + for _ in range(1000): + cur.execute("select test_consume_xids(10000);") + cur.execute("INSERT INTO clogtest VALUES (3)") + + # Restart postgres. After restart, the new instance will download the + # pg_xact segments lazily. 
+ endpoint.stop() + endpoint.start() + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + # Consume more WAL, so that the pageserver can compact and GC older data, + # including the LSN that we started the new endpoint at, + cur.execute("CREATE TABLE anothertable (i int, t text)") + cur.execute( + "INSERT INTO anothertable SELECT g, 'long string to consume some space' || g FROM generate_series(1, 10000) g" + ) + + # Run GC + shards = tenant_get_shards(env, tenant_id, None) + for tenant_shard_id, pageserver in shards: + client = pageserver.http_client() + client.timeline_checkpoint(tenant_shard_id, timeline_id) + client.timeline_compact(tenant_shard_id, timeline_id) + client.timeline_gc(tenant_shard_id, timeline_id, 0) + + # Test that this can still on-demand download the old pg_xact segments + cur.execute("select xmin, xmax, * from clogtest") + tup = cur.fetchall() + log.info(f"tuples = {tup}") + + +@pytest.mark.parametrize("shard_count", [None, 4]) +def test_ondemand_download_replica(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]): + if shard_count is not None: + neon_env_builder.num_pageservers = shard_count + + tenant_conf = { + "lazy_slru_download": "true", + } + env = neon_env_builder.init_start( + initial_tenant_conf=tenant_conf, initial_tenant_shard_count=shard_count + ) + + endpoint = env.endpoints.create_start("main") + + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + cur.execute("CREATE EXTENSION neon_test_utils") + + # Create a test table + cur.execute("CREATE TABLE clogtest (id integer)") + cur.execute("INSERT INTO clogtest VALUES (1)") + + # Consume a lot of XIDs, to create more pg_xact segments + for _ in range(1000): + cur.execute("select test_consume_xids(10000);") + + # Open a new connection and insert another row, but leave + # the transaction open + pg_conn2 = endpoint.connect() + cur2 = pg_conn2.cursor() + cur2.execute("BEGIN") + cur2.execute("INSERT INTO clogtest VALUES (2)") + + # Another insert on the first connection, which is committed. + for _ in range(1000): + cur.execute("select test_consume_xids(10000);") + cur.execute("INSERT INTO clogtest VALUES (3)") + + # Start standby at this point in time + lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_insert_lsn()")) + endpoint_at_lsn = env.endpoints.create_start( + branch_name="main", endpoint_id="ep-at-lsn", lsn=lsn + ) + + # Commit transaction 2, after the standby was launched. + cur2.execute("COMMIT") + + # The replica should not see transaction 2 as committed. 
+ conn_replica = endpoint_at_lsn.connect() + cur_replica = conn_replica.cursor() + cur_replica.execute("SELECT * FROM clogtest") + assert cur_replica.fetchall() == [(1,), (3,)] From ca8fca0e9ff87b0dfdc776fd1806bd04238954a4 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 25 Apr 2024 19:45:52 +0300 Subject: [PATCH 0625/1571] Add test to demonstrate the problem with protocol version 1 (#7377) --- test_runner/regress/test_hot_standby.py | 79 ++++++++++++++++++++++++- 1 file changed, 78 insertions(+), 1 deletion(-) diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index ac3315b86f..179cc273ec 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -3,7 +3,7 @@ import re import time from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, wait_replica_caughtup +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, tenant_get_shards, wait_replica_caughtup # Check for corrupted WAL messages which might otherwise go unnoticed if @@ -102,3 +102,80 @@ def test_2_replicas_start(neon_simple_env: NeonEnv): ) as secondary2: wait_replica_caughtup(primary, secondary1) wait_replica_caughtup(primary, secondary2) + + +# We had an issue that a standby server made GetPage requests with an +# old LSN, based on the last-written LSN cache, to avoid waits in the +# pageserver. However, requesting a page with a very old LSN, such +# that the GC horizon has already advanced past it, results in an +# error from the pageserver: +# "Bad request: tried to request a page version that was garbage collected" +# +# To avoid that, the compute<-> pageserver protocol was updated so +# that that the standby now sends two LSNs, the old last-written LSN +# and the current replay LSN. +# +# https://github.com/neondatabase/neon/issues/6211 +def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder): + tenant_conf = { + # set PITR interval to be small, so we can do GC + "pitr_interval": "0 s", + } + env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf) + timeline_id = env.initial_timeline + tenant_id = env.initial_tenant + + with env.endpoints.create_start( + branch_name="main", + endpoint_id="primary", + ) as primary: + with env.endpoints.new_replica_start( + origin=primary, + endpoint_id="secondary", + # Protocol version 2 was introduced to fix the issue + # that this test exercises. With protocol version 1 it + # fails. + config_lines=["neon.protocol_version=2"], + ) as secondary: + p_cur = primary.connect().cursor() + p_cur.execute("CREATE EXTENSION neon_test_utils") + p_cur.execute("CREATE TABLE test (id int primary key) WITH (autovacuum_enabled=false)") + p_cur.execute("INSERT INTO test SELECT generate_series(1, 10000) AS g") + + wait_replica_caughtup(primary, secondary) + + s_cur = secondary.connect().cursor() + + s_cur.execute("SELECT 1 WHERE pg_is_in_recovery()") + res = s_cur.fetchone() + assert res is not None + + s_cur.execute("SELECT COUNT(*) FROM test") + res = s_cur.fetchone() + assert res[0] == 10000 + + # Clear the cache in the standby, so that when we + # re-execute the query, it will make GetPage + # requests. This does not clear the last-written LSN cache + # so we still remember the LSNs of the pages. + s_cur.execute("SELECT clear_buffer_cache()") + + # Do other stuff on the primary, to advance the WAL + p_cur.execute("CREATE TABLE test2 AS SELECT generate_series(1, 1000000) AS g") + + # Run GC. 
The PITR interval is very small, so this advances the GC cutoff LSN + # very close to the primary's current insert LSN. + shards = tenant_get_shards(env, tenant_id, None) + for tenant_shard_id, pageserver in shards: + client = pageserver.http_client() + client.timeline_checkpoint(tenant_shard_id, timeline_id) + client.timeline_compact(tenant_shard_id, timeline_id) + client.timeline_gc(tenant_shard_id, timeline_id, 0) + + # Re-execute the query. The GetPage requests that this + # generates use old not_modified_since LSNs, older than + # the GC cutoff, but new request LSNs. (In protocol + # version 1 there was only one LSN, and this failed.) + s_cur.execute("SELECT COUNT(*) FROM test") + res = s_cur.fetchone() + assert res[0] == 10000 From d63185fa6c05dc7ba5dba8d11bb84788c50e288f Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 26 Apr 2024 09:15:59 +0100 Subject: [PATCH 0626/1571] storage controller: log hygiene & better error type (#7508) These are testability/logging improvements spun off from #7475 - Don't log warnings for shutdown errors in compute hook - Revise logging around heartbeats and reconcile_all so that we aren't emitting such a large volume of INFO messages under normal quite conditions. - Clean up the `last_error` of TenantShard to hold a ReconcileError instead of a String, and use that properly typed error to suppress reconciler cancel errors during reconcile_all_now. This is important for tests that iteratively call that, as otherwise they would get 500 errors when some reconciler in flight was cancelled (perhaps due to a state change on the tenant shard starting a new reconciler). --- storage_controller/src/heartbeater.rs | 13 +++++++++ storage_controller/src/reconciler.rs | 5 +++- storage_controller/src/service.rs | 35 ++++++++++++++++++---- storage_controller/src/tenant_shard.rs | 40 +++++++++++++++++++------- 4 files changed, 76 insertions(+), 17 deletions(-) diff --git a/storage_controller/src/heartbeater.rs b/storage_controller/src/heartbeater.rs index 7669680eb6..1ef97e78eb 100644 --- a/storage_controller/src/heartbeater.rs +++ b/storage_controller/src/heartbeater.rs @@ -184,6 +184,19 @@ impl HeartbeaterTask { } } } + tracing::info!( + "Heartbeat round complete for {} nodes, {} offline", + new_state.len(), + new_state + .values() + .filter(|s| match s { + PageserverState::Available { .. } => { + false + } + PageserverState::Offline => true, + }) + .count() + ); let mut deltas = Vec::new(); let now = Instant::now(); diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 28801ede6e..f38905b424 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -767,7 +767,10 @@ impl Reconciler { // It is up to the caller whether they want to drop out on this error, but they don't have to: // in general we should avoid letting unavailability of the cloud control plane stop us from // making progress. - tracing::warn!("Failed to notify compute of attached pageserver {node}: {e}"); + if !matches!(e, NotifyError::ShuttingDown) { + tracing::warn!("Failed to notify compute of attached pageserver {node}: {e}"); + } + // Set this flag so that in our ReconcileResult we will set the flag on the shard that it // needs to retry at some point. 
self.compute_notify_failure = true; diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 2e6f3750e7..952664e339 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -824,8 +824,7 @@ impl Service { // Ordering: populate last_error before advancing error_seq, // so that waiters will see the correct error after waiting. - *(tenant.last_error.lock().unwrap()) = format!("{e}"); - tenant.error_waiter.advance(result.sequence); + tenant.set_last_error(result.sequence, e); for (node_id, o) in result.observed.locations { tenant.observed.locations.insert(node_id, o); @@ -2805,7 +2804,14 @@ impl Service { tenant_shard_id: shard.tenant_shard_id, node_attached: *shard.intent.get_attached(), node_secondary: shard.intent.get_secondary().to_vec(), - last_error: shard.last_error.lock().unwrap().clone(), + last_error: shard + .last_error + .lock() + .unwrap() + .as_ref() + .map(|e| format!("{e}")) + .unwrap_or("".to_string()) + .clone(), is_reconciling: shard.reconciler.is_some(), is_pending_compute_notification: shard.pending_compute_notification, is_splitting: matches!(shard.splitting, SplitState::Splitting), @@ -4031,7 +4037,7 @@ impl Service { // TODO: in the background, we should balance work back onto this pageserver } AvailabilityTransition::Unchanged => { - tracing::info!("Node {} no change during config", node_id); + tracing::debug!("Node {} no change during config", node_id); } } @@ -4351,7 +4357,26 @@ impl Service { }; let waiter_count = waiters.len(); - self.await_waiters(waiters, RECONCILE_TIMEOUT).await?; + match self.await_waiters(waiters, RECONCILE_TIMEOUT).await { + Ok(()) => {} + Err(ReconcileWaitError::Failed(_, reconcile_error)) + if matches!(*reconcile_error, ReconcileError::Cancel) => + { + // Ignore reconciler cancel errors: this reconciler might have shut down + // because some other change superceded it. We will return a nonzero number, + // so the caller knows they might have to call again to quiesce the system. + } + Err(e) => { + return Err(e); + } + }; + + tracing::info!( + "{} reconciles in reconcile_all, {} waiters", + reconciles_spawned, + waiter_count + ); + Ok(waiter_count) } diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index d69260b9e7..7b11dfe64d 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -38,12 +38,18 @@ use crate::{ }; /// Serialization helper -fn read_mutex_content(v: &std::sync::Mutex, serializer: S) -> Result +fn read_last_error(v: &std::sync::Mutex>, serializer: S) -> Result where S: serde::ser::Serializer, - T: Clone + std::fmt::Display, + T: std::fmt::Display, { - serializer.collect_str(&v.lock().unwrap()) + serializer.collect_str( + &v.lock() + .unwrap() + .as_ref() + .map(|e| format!("{e}")) + .unwrap_or("".to_string()), + ) } /// In-memory state for a particular tenant shard. @@ -111,11 +117,15 @@ pub(crate) struct TenantShard { #[serde(skip)] pub(crate) error_waiter: std::sync::Arc>, - /// The most recent error from a reconcile on this tenant + /// The most recent error from a reconcile on this tenant. This is a nested Arc + /// because: + /// - ReconcileWaiters need to Arc-clone the overall object to read it later + /// - ReconcileWaitError needs to use an `Arc` because we can construct + /// many waiters for one shard, and the underlying error types are not Clone. /// TODO: generalize to an array of recent events /// TOOD: use a ArcSwap instead of mutex for faster reads? 
- #[serde(serialize_with = "read_mutex_content")] - pub(crate) last_error: std::sync::Arc>, + #[serde(serialize_with = "read_last_error")] + pub(crate) last_error: std::sync::Arc>>>, /// If we have a pending compute notification that for some reason we weren't able to send, /// set this to true. If this is set, calls to [`Self::get_reconcile_needed`] will return Yes @@ -293,18 +303,18 @@ pub(crate) struct ReconcilerWaiter { seq_wait: std::sync::Arc>, error_seq_wait: std::sync::Arc>, - error: std::sync::Arc>, + error: std::sync::Arc>>>, seq: Sequence, } #[derive(thiserror::Error, Debug)] -pub enum ReconcileWaitError { +pub(crate) enum ReconcileWaitError { #[error("Timeout waiting for shard {0}")] Timeout(TenantShardId), #[error("shutting down")] Shutdown, #[error("Reconcile error on shard {0}: {1}")] - Failed(TenantShardId, String), + Failed(TenantShardId, Arc), } #[derive(Eq, PartialEq, Debug)] @@ -342,7 +352,8 @@ impl ReconcilerWaiter { SeqWaitError::Timeout => unreachable!() })?; - return Err(ReconcileWaitError::Failed(self.tenant_shard_id, self.error.lock().unwrap().clone())) + return Err(ReconcileWaitError::Failed(self.tenant_shard_id, + self.error.lock().unwrap().clone().expect("If error_seq_wait was advanced error was set").clone())) } } @@ -873,7 +884,7 @@ impl TenantShard { active_nodes_dirty || dirty_observed || self.pending_compute_notification; if !do_reconcile { - tracing::info!("Not dirty, no reconciliation needed."); + tracing::debug!("Not dirty, no reconciliation needed."); return ReconcileNeeded::No; } @@ -1151,6 +1162,13 @@ impl TenantShard { &self.scheduling_policy } + pub(crate) fn set_last_error(&mut self, sequence: Sequence, error: ReconcileError) { + // Ordering: always set last_error before advancing sequence, so that sequence + // waiters are guaranteed to see a Some value when they see an error. + *(self.last_error.lock().unwrap()) = Some(Arc::new(error)); + self.error_waiter.advance(sequence); + } + pub(crate) fn from_persistent( tsp: TenantShardPersistence, intent: IntentState, From 70f4a16a05a5512c250102600f7900169b15c56d Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 26 Apr 2024 10:30:20 +0200 Subject: [PATCH 0627/1571] refactor(owned_buffers_io::BufferedWriter): be generic over the type of buffer (#7482) --- .../tenant/remote_timeline_client/download.rs | 9 +- .../virtual_file/owned_buffers_io/write.rs | 147 +++++++++++++----- 2 files changed, 110 insertions(+), 46 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 84692aa577..7bf2d2de10 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -7,6 +7,7 @@ use std::collections::HashSet; use std::future::Future; use anyhow::{anyhow, Context}; +use bytes::BytesMut; use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::shard::TenantShardId; use tokio::fs::{self, File, OpenOptions}; @@ -194,10 +195,10 @@ async fn download_object<'a>( // There's chunks_vectored() on the stream. 
let (bytes_amount, destination_file) = async { let size_tracking = size_tracking_writer::Writer::new(destination_file); - let mut buffered = owned_buffers_io::write::BufferedWriter::< - { super::BUFFER_SIZE }, - _, - >::new(size_tracking); + let mut buffered = owned_buffers_io::write::BufferedWriter::::new( + size_tracking, + BytesMut::with_capacity(super::BUFFER_SIZE), + ); while let Some(res) = futures::StreamExt::next(&mut download.download_stream).await { diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index f1812d9b51..6b3a02c71a 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -10,14 +10,14 @@ pub trait OwnedAsyncWriter { ) -> std::io::Result<(usize, B::Buf)>; } -/// A wrapper aorund an [`OwnedAsyncWriter`] that batches smaller writers -/// into `BUFFER_SIZE`-sized writes. +/// A wrapper aorund an [`OwnedAsyncWriter`] that uses a [`Buffer`] to batch +/// small writes into larger writes of size [`Buffer::cap`]. /// /// # Passthrough Of Large Writers /// -/// Buffered writes larger than the `BUFFER_SIZE` cause the internal -/// buffer to be flushed, even if it is not full yet. Then, the large -/// buffered write is passed through to the unerlying [`OwnedAsyncWriter`]. +/// Calls to [`BufferedWriter::write_buffered`] that are larger than [`Buffer::cap`] +/// cause the internal buffer to be flushed prematurely so that the large +/// buffered write is passed through to the underlying [`OwnedAsyncWriter`]. /// /// This pass-through is generally beneficial for throughput, but if /// the storage backend of the [`OwnedAsyncWriter`] is a shared resource, @@ -25,24 +25,25 @@ pub trait OwnedAsyncWriter { /// /// In such cases, a different implementation that always buffers in memory /// may be preferable. -pub struct BufferedWriter { +pub struct BufferedWriter { writer: W, - // invariant: always remains Some(buf) - // with buf.capacity() == BUFFER_SIZE except - // - while IO is ongoing => goes back to Some() once the IO completed successfully - // - after an IO error => stays `None` forever - // In these exceptional cases, it's `None`. - buf: Option, + /// invariant: always remains Some(buf) except + /// - while IO is ongoing => goes back to Some() once the IO completed successfully + /// - after an IO error => stays `None` forever + /// In these exceptional cases, it's `None`. 
+ buf: Option, } -impl BufferedWriter +impl BufferedWriter where + B: Buffer + Send, + Buf: IoBuf + Send, W: OwnedAsyncWriter, { - pub fn new(writer: W) -> Self { + pub fn new(writer: W, buf: B) -> Self { Self { writer, - buf: Some(BytesMut::with_capacity(BUFFER_SIZE)), + buf: Some(buf), } } @@ -53,61 +54,121 @@ where Ok(writer) } - pub async fn write_buffered(&mut self, chunk: Slice) -> std::io::Result<()> + #[inline(always)] + fn buf(&self) -> &B { + self.buf + .as_ref() + .expect("must not use after we returned an error") + } + + pub async fn write_buffered(&mut self, chunk: Slice) -> std::io::Result<(usize, S)> where - B: IoBuf + Send, + S: IoBuf + Send, { + let chunk_len = chunk.len(); // avoid memcpy for the middle of the chunk - if chunk.len() >= BUFFER_SIZE { + if chunk.len() >= self.buf().cap() { self.flush().await?; // do a big write, bypassing `buf` assert_eq!( self.buf .as_ref() .expect("must not use after an error") - .len(), + .pending(), 0 ); - let chunk_len = chunk.len(); let (nwritten, chunk) = self.writer.write_all(chunk).await?; assert_eq!(nwritten, chunk_len); - drop(chunk); - return Ok(()); + return Ok((nwritten, chunk)); } // in-memory copy the < BUFFER_SIZED tail of the chunk - assert!(chunk.len() < BUFFER_SIZE); - let mut chunk = &chunk[..]; - while !chunk.is_empty() { + assert!(chunk.len() < self.buf().cap()); + let mut slice = &chunk[..]; + while !slice.is_empty() { let buf = self.buf.as_mut().expect("must not use after an error"); - let need = BUFFER_SIZE - buf.len(); - let have = chunk.len(); + let need = buf.cap() - buf.pending(); + let have = slice.len(); let n = std::cmp::min(need, have); - buf.extend_from_slice(&chunk[..n]); - chunk = &chunk[n..]; - if buf.len() >= BUFFER_SIZE { - assert_eq!(buf.len(), BUFFER_SIZE); + buf.extend_from_slice(&slice[..n]); + slice = &slice[n..]; + if buf.pending() >= buf.cap() { + assert_eq!(buf.pending(), buf.cap()); self.flush().await?; } } - assert!(chunk.is_empty(), "by now we should have drained the chunk"); - Ok(()) + assert!(slice.is_empty(), "by now we should have drained the chunk"); + Ok((chunk_len, chunk.into_inner())) } async fn flush(&mut self) -> std::io::Result<()> { let buf = self.buf.take().expect("must not use after an error"); - if buf.is_empty() { + let buf_len = buf.pending(); + if buf_len == 0 { self.buf = Some(buf); - return std::io::Result::Ok(()); + return Ok(()); } - let buf_len = buf.len(); - let (nwritten, mut buf) = self.writer.write_all(buf).await?; + let (nwritten, io_buf) = self.writer.write_all(buf.flush()).await?; assert_eq!(nwritten, buf_len); - buf.clear(); - self.buf = Some(buf); + self.buf = Some(Buffer::reuse_after_flush(io_buf)); Ok(()) } } +/// A [`Buffer`] is used by [`BufferedWriter`] to batch smaller writes into larger ones. +pub trait Buffer { + type IoBuf: IoBuf; + + /// Capacity of the buffer. Must not change over the lifetime `self`.` + fn cap(&self) -> usize; + + /// Add data to the buffer. + /// Panics if there is not enough room to accomodate `other`'s content, i.e., + /// panics if `other.len() > self.cap() - self.pending()`. + fn extend_from_slice(&mut self, other: &[u8]); + + /// Number of bytes in the buffer. + fn pending(&self) -> usize; + + /// Turns `self` into a [`tokio_epoll_uring::Slice`] of the pending data + /// so we can use [`tokio_epoll_uring`] to write it to disk. + fn flush(self) -> Slice; + + /// After the write to disk is done and we have gotten back the slice, + /// [`BufferedWriter`] uses this method to re-use the io buffer. 
+ fn reuse_after_flush(iobuf: Self::IoBuf) -> Self; +} + +impl Buffer for BytesMut { + type IoBuf = BytesMut; + + #[inline(always)] + fn cap(&self) -> usize { + self.capacity() + } + + fn extend_from_slice(&mut self, other: &[u8]) { + BytesMut::extend_from_slice(self, other) + } + + #[inline(always)] + fn pending(&self) -> usize { + self.len() + } + + fn flush(self) -> Slice { + if self.is_empty() { + return self.slice_full(); + } + let len = self.len(); + self.slice(0..len) + } + + fn reuse_after_flush(mut iobuf: BytesMut) -> Self { + iobuf.clear(); + iobuf + } +} + impl OwnedAsyncWriter for Vec { async fn write_all, Buf: IoBuf + Send>( &mut self, @@ -125,6 +186,8 @@ impl OwnedAsyncWriter for Vec { #[cfg(test)] mod tests { + use bytes::BytesMut; + use super::*; #[derive(Default)] @@ -158,7 +221,7 @@ mod tests { #[tokio::test] async fn test_buffered_writes_only() -> std::io::Result<()> { let recorder = RecorderWriter::default(); - let mut writer = BufferedWriter::<2, _>::new(recorder); + let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); write!(writer, b"a"); write!(writer, b"b"); write!(writer, b"c"); @@ -175,7 +238,7 @@ mod tests { #[tokio::test] async fn test_passthrough_writes_only() -> std::io::Result<()> { let recorder = RecorderWriter::default(); - let mut writer = BufferedWriter::<2, _>::new(recorder); + let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); write!(writer, b"abc"); write!(writer, b"de"); write!(writer, b""); @@ -191,7 +254,7 @@ mod tests { #[tokio::test] async fn test_passthrough_write_with_nonempty_buffer() -> std::io::Result<()> { let recorder = RecorderWriter::default(); - let mut writer = BufferedWriter::<2, _>::new(recorder); + let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); write!(writer, b"a"); write!(writer, b"bc"); write!(writer, b"d"); From bf369f4268f839b5228dd1d65d822280d50401c8 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 26 Apr 2024 11:19:41 +0200 Subject: [PATCH 0628/1571] refactor(owned_buffer_io::util::size_tracking_writer): make generic over underlying writer (#7483) part of https://github.com/neondatabase/neon/issues/7124 --- .../tenant/remote_timeline_client/download.rs | 1 + pageserver/src/virtual_file.rs | 12 +++++++++++ .../util/size_tracking_writer.rs | 21 +++++++++++-------- 3 files changed, 25 insertions(+), 9 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 7bf2d2de10..3744eecab5 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -183,6 +183,7 @@ async fn download_object<'a>( #[cfg(target_os = "linux")] crate::virtual_file::io_engine::IoEngine::TokioEpollUring => { use crate::virtual_file::owned_buffers_io::{self, util::size_tracking_writer}; + use bytes::BytesMut; async { let destination_file = VirtualFile::create(dst_path) .await diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 0cf6a0019b..1d43a94568 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -32,6 +32,7 @@ pub use io_engine::feature_test as io_engine_feature_test; pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult; mod metadata; mod open_options; +use self::owned_buffers_io::write::OwnedAsyncWriter; pub(crate) use io_engine::IoEngineKind; pub(crate) use metadata::Metadata; pub(crate) use open_options::*; @@ -1083,6 +1084,17 @@ 
impl Drop for VirtualFile { } } +impl OwnedAsyncWriter for VirtualFile { + #[inline(always)] + async fn write_all, Buf: IoBuf + Send>( + &mut self, + buf: B, + ) -> std::io::Result<(usize, B::Buf)> { + let (buf, res) = VirtualFile::write_all(self, buf).await; + res.map(move |v| (v, buf)) + } +} + impl OpenFiles { fn new(num_slots: usize) -> OpenFiles { let mut slots = Box::new(Vec::with_capacity(num_slots)); diff --git a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs index 7505b7487e..edb11c5f4c 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs @@ -1,33 +1,36 @@ -use crate::virtual_file::{owned_buffers_io::write::OwnedAsyncWriter, VirtualFile}; +use crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter; use tokio_epoll_uring::{BoundedBuf, IoBuf}; -pub struct Writer { - dst: VirtualFile, +pub struct Writer { + dst: W, bytes_amount: u64, } -impl Writer { - pub fn new(dst: VirtualFile) -> Self { +impl Writer { + pub fn new(dst: W) -> Self { Self { dst, bytes_amount: 0, } } + /// Returns the wrapped `VirtualFile` object as well as the number /// of bytes that were written to it through this object. - pub fn into_inner(self) -> (u64, VirtualFile) { + pub fn into_inner(self) -> (u64, W) { (self.bytes_amount, self.dst) } } -impl OwnedAsyncWriter for Writer { +impl OwnedAsyncWriter for Writer +where + W: OwnedAsyncWriter, +{ #[inline(always)] async fn write_all, Buf: IoBuf + Send>( &mut self, buf: B, ) -> std::io::Result<(usize, B::Buf)> { - let (buf, res) = self.dst.write_all(buf).await; - let nwritten = res?; + let (nwritten, buf) = self.dst.write_all(buf).await?; self.bytes_amount += u64::try_from(nwritten).unwrap(); Ok((nwritten, buf)) } From dbb0c967d5fb5104847fb71e8d783ebeae3e7ff2 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 26 Apr 2024 13:01:26 +0200 Subject: [PATCH 0629/1571] refactor(ephemeral_file): reuse owned_buffers_io::BufferedWriter (#7484) part of https://github.com/neondatabase/neon/issues/7124 Changes ------- This PR replaces the `EphemeralFile::write_blob`-specifc `struct Writer` with re-use of `owned_buffers_io::write::BufferedWriter`. 
Further, it restructures the code to cleanly separate * the high-level aspect of EphemeralFile's write_blob / read_blk API * the page-caching aspect * the aspect of IO * performing buffered write IO to an underlying VirtualFile * serving reads from either the VirtualFile or the buffer if it hasn't been flushed yet * the annoying "feature" that reads past the end of the written range are allowed and expected to return zeroed memory, as long as one remains within one PAGE_SZ --- pageserver/src/task_mgr.rs | 2 + pageserver/src/tenant/ephemeral_file.rs | 223 ++---------------- .../src/tenant/ephemeral_file/page_caching.rs | 218 +++++++++++++++++ .../ephemeral_file/zero_padded_read_write.rs | 125 ++++++++++ .../zero_padded_read_write/zero_padded.rs | 108 +++++++++ .../tenant/remote_timeline_client/download.rs | 1 - .../tenant/storage_layer/inmemory_layer.rs | 2 +- pageserver/src/virtual_file.rs | 1 - .../util/size_tracking_writer.rs | 8 + .../virtual_file/owned_buffers_io/write.rs | 58 +++++ 10 files changed, 538 insertions(+), 208 deletions(-) create mode 100644 pageserver/src/tenant/ephemeral_file/page_caching.rs create mode 100644 pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs create mode 100644 pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 9a1e354ecf..b76105399b 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -361,6 +361,8 @@ pub enum TaskKind { DebugTool, + EphemeralFilePreWarmPageCache, + #[cfg(test)] UnitTest, } diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index b27230db03..96efd13c1b 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -3,36 +3,26 @@ use crate::config::PageServerConf; use crate::context::RequestContext; -use crate::page_cache::{self, PAGE_SZ}; +use crate::page_cache; use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader}; use crate::virtual_file::{self, VirtualFile}; -use bytes::BytesMut; use camino::Utf8PathBuf; use pageserver_api::shard::TenantShardId; -use std::cmp::min; -use std::io::{self, ErrorKind}; -use std::ops::DerefMut; +use std::io; use std::sync::atomic::AtomicU64; -use tracing::*; use utils::id::TimelineId; pub struct EphemeralFile { - page_cache_file_id: page_cache::FileId, - _tenant_shard_id: TenantShardId, _timeline_id: TimelineId, - file: VirtualFile, - len: u64, - /// An ephemeral file is append-only. - /// We keep the last page, which can still be modified, in [`Self::mutable_tail`]. - /// The other pages, which can no longer be modified, are accessed through the page cache. - /// - /// None <=> IO is ongoing. - /// Size is fixed to PAGE_SZ at creation time and must not be changed. 
- mutable_tail: Option, + + rw: page_caching::RW, } +mod page_caching; +mod zero_padded_read_write; + impl EphemeralFile { pub async fn create( conf: &PageServerConf, @@ -59,21 +49,18 @@ impl EphemeralFile { .await?; Ok(EphemeralFile { - page_cache_file_id: page_cache::next_file_id(), _tenant_shard_id: tenant_shard_id, _timeline_id: timeline_id, - file, - len: 0, - mutable_tail: Some(BytesMut::zeroed(PAGE_SZ)), + rw: page_caching::RW::new(file), }) } pub(crate) fn len(&self) -> u64 { - self.len + self.rw.bytes_written() } - pub(crate) fn id(&self) -> page_cache::FileId { - self.page_cache_file_id + pub(crate) fn page_cache_file_id(&self) -> page_cache::FileId { + self.rw.page_cache_file_id() } pub(crate) async fn read_blk( @@ -81,182 +68,30 @@ impl EphemeralFile { blknum: u32, ctx: &RequestContext, ) -> Result { - let flushed_blknums = 0..self.len / PAGE_SZ as u64; - if flushed_blknums.contains(&(blknum as u64)) { - let cache = page_cache::get(); - match cache - .read_immutable_buf(self.page_cache_file_id, blknum, ctx) - .await - .map_err(|e| { - std::io::Error::new( - std::io::ErrorKind::Other, - // order path before error because error is anyhow::Error => might have many contexts - format!( - "ephemeral file: read immutable page #{}: {}: {:#}", - blknum, self.file.path, e, - ), - ) - })? { - page_cache::ReadBufResult::Found(guard) => { - return Ok(BlockLease::PageReadGuard(guard)) - } - page_cache::ReadBufResult::NotFound(write_guard) => { - let write_guard = self - .file - .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64) - .await?; - let read_guard = write_guard.mark_valid(); - return Ok(BlockLease::PageReadGuard(read_guard)); - } - }; - } else { - debug_assert_eq!(blknum as u64, self.len / PAGE_SZ as u64); - Ok(BlockLease::EphemeralFileMutableTail( - self.mutable_tail - .as_deref() - .expect("we're not doing IO, it must be Some()") - .try_into() - .expect("we ensure that it's always PAGE_SZ"), - )) - } + self.rw.read_blk(blknum, ctx).await } pub(crate) async fn write_blob( &mut self, srcbuf: &[u8], - ctx: &RequestContext, + _ctx: &RequestContext, ) -> Result { - struct Writer<'a> { - ephemeral_file: &'a mut EphemeralFile, - /// The block to which the next [`push_bytes`] will write. - blknum: u32, - /// The offset inside the block identified by [`blknum`] to which [`push_bytes`] will write. - off: usize, - } - impl<'a> Writer<'a> { - fn new(ephemeral_file: &'a mut EphemeralFile) -> io::Result> { - Ok(Writer { - blknum: (ephemeral_file.len / PAGE_SZ as u64) as u32, - off: (ephemeral_file.len % PAGE_SZ as u64) as usize, - ephemeral_file, - }) - } - #[inline(always)] - async fn push_bytes( - &mut self, - src: &[u8], - ctx: &RequestContext, - ) -> Result<(), io::Error> { - let mut src_remaining = src; - while !src_remaining.is_empty() { - let dst_remaining = &mut self - .ephemeral_file - .mutable_tail - .as_deref_mut() - .expect("IO is not yet ongoing")[self.off..]; - let n = min(dst_remaining.len(), src_remaining.len()); - dst_remaining[..n].copy_from_slice(&src_remaining[..n]); - self.off += n; - src_remaining = &src_remaining[n..]; - if self.off == PAGE_SZ { - let mutable_tail = std::mem::take(&mut self.ephemeral_file.mutable_tail) - .expect("IO is not yet ongoing"); - let (mutable_tail, res) = self - .ephemeral_file - .file - .write_all_at(mutable_tail, self.blknum as u64 * PAGE_SZ as u64) - .await; - // TODO: If we panic before we can put the mutable_tail back, subsequent calls will fail. - // I.e., the IO isn't retryable if we panic. 
- self.ephemeral_file.mutable_tail = Some(mutable_tail); - match res { - Ok(_) => { - // Pre-warm the page cache with what we just wrote. - // This isn't necessary for coherency/correctness, but it's how we've always done it. - let cache = page_cache::get(); - match cache - .read_immutable_buf( - self.ephemeral_file.page_cache_file_id, - self.blknum, - ctx, - ) - .await - { - Ok(page_cache::ReadBufResult::Found(_guard)) => { - // This function takes &mut self, so, it shouldn't be possible to reach this point. - unreachable!("we just wrote blknum {} and this function takes &mut self, so, no concurrent read_blk is possible", self.blknum); - } - Ok(page_cache::ReadBufResult::NotFound(mut write_guard)) => { - let buf: &mut [u8] = write_guard.deref_mut(); - debug_assert_eq!(buf.len(), PAGE_SZ); - buf.copy_from_slice( - self.ephemeral_file - .mutable_tail - .as_deref() - .expect("IO is not ongoing"), - ); - let _ = write_guard.mark_valid(); - // pre-warm successful - } - Err(e) => { - error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}"); - // fail gracefully, it's not the end of the world if we can't pre-warm the cache here - } - } - // Zero the buffer for re-use. - // Zeroing is critical for correcntess because the write_blob code below - // and similarly read_blk expect zeroed pages. - self.ephemeral_file - .mutable_tail - .as_deref_mut() - .expect("IO is not ongoing") - .fill(0); - // This block is done, move to next one. - self.blknum += 1; - self.off = 0; - } - Err(e) => { - return Err(std::io::Error::new( - ErrorKind::Other, - // order error before path because path is long and error is short - format!( - "ephemeral_file: write_blob: write-back full tail blk #{}: {:#}: {}", - self.blknum, - e, - self.ephemeral_file.file.path, - ), - )); - } - } - } - } - Ok(()) - } - } - - let pos = self.len; - let mut writer = Writer::new(self)?; + let pos = self.rw.bytes_written(); // Write the length field if srcbuf.len() < 0x80 { // short one-byte length header let len_buf = [srcbuf.len() as u8]; - writer.push_bytes(&len_buf, ctx).await?; + + self.rw.write_all_borrowed(&len_buf).await?; } else { let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32); len_buf[0] |= 0x80; - writer.push_bytes(&len_buf, ctx).await?; + self.rw.write_all_borrowed(&len_buf).await?; } // Write the payload - writer.push_bytes(srcbuf, ctx).await?; - - if srcbuf.len() < 0x80 { - self.len += 1; - } else { - self.len += 4; - } - self.len += srcbuf.len() as u64; + self.rw.write_all_borrowed(srcbuf).await?; Ok(pos) } @@ -271,28 +106,6 @@ pub fn is_ephemeral_file(filename: &str) -> bool { } } -impl Drop for EphemeralFile { - fn drop(&mut self) { - // There might still be pages in the [`crate::page_cache`] for this file. - // We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed. - - // unlink the file - let res = std::fs::remove_file(&self.file.path); - if let Err(e) = res { - if e.kind() != std::io::ErrorKind::NotFound { - // just never log the not found errors, we cannot do anything for them; on detach - // the tenant directory is already gone. 
- // - // not found files might also be related to https://github.com/neondatabase/neon/issues/2442 - error!( - "could not remove ephemeral file '{}': {}", - self.file.path, e - ); - } - } - } -} - impl BlockReader for EphemeralFile { fn block_cursor(&self) -> super::block_io::BlockCursor<'_> { BlockCursor::new(super::block_io::BlockReaderRef::EphemeralFile(self)) diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs new file mode 100644 index 0000000000..934400e5be --- /dev/null +++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs @@ -0,0 +1,218 @@ +//! Wrapper around [`super::zero_padded_read_write::RW`] that uses the +//! [`crate::page_cache`] to serve reads that need to go to the underlying [`VirtualFile`]. + +use crate::context::RequestContext; +use crate::page_cache::{self, PAGE_SZ}; +use crate::tenant::block_io::BlockLease; +use crate::virtual_file::VirtualFile; + +use once_cell::sync::Lazy; +use std::io::{self, ErrorKind}; +use tokio_epoll_uring::BoundedBuf; +use tracing::*; + +use super::zero_padded_read_write; + +/// See module-level comment. +pub struct RW { + page_cache_file_id: page_cache::FileId, + rw: super::zero_padded_read_write::RW, +} + +impl RW { + pub fn new(file: VirtualFile) -> Self { + let page_cache_file_id = page_cache::next_file_id(); + Self { + page_cache_file_id, + rw: super::zero_padded_read_write::RW::new(PreWarmingWriter::new( + page_cache_file_id, + file, + )), + } + } + + pub fn page_cache_file_id(&self) -> page_cache::FileId { + self.page_cache_file_id + } + + pub(crate) async fn write_all_borrowed(&mut self, srcbuf: &[u8]) -> Result { + // It doesn't make sense to proactively fill the page cache on the Pageserver write path + // because Compute is unlikely to access recently written data. + self.rw.write_all_borrowed(srcbuf).await + } + + pub(crate) fn bytes_written(&self) -> u64 { + self.rw.bytes_written() + } + + pub(crate) async fn read_blk( + &self, + blknum: u32, + ctx: &RequestContext, + ) -> Result { + match self.rw.read_blk(blknum).await? { + zero_padded_read_write::ReadResult::NeedsReadFromWriter { writer } => { + let cache = page_cache::get(); + match cache + .read_immutable_buf(self.page_cache_file_id, blknum, ctx) + .await + .map_err(|e| { + std::io::Error::new( + std::io::ErrorKind::Other, + // order path before error because error is anyhow::Error => might have many contexts + format!( + "ephemeral file: read immutable page #{}: {}: {:#}", + blknum, + self.rw.as_writer().file.path, + e, + ), + ) + })? { + page_cache::ReadBufResult::Found(guard) => { + return Ok(BlockLease::PageReadGuard(guard)) + } + page_cache::ReadBufResult::NotFound(write_guard) => { + let write_guard = writer + .file + .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64) + .await?; + let read_guard = write_guard.mark_valid(); + return Ok(BlockLease::PageReadGuard(read_guard)); + } + } + } + zero_padded_read_write::ReadResult::ServedFromZeroPaddedMutableTail { buffer } => { + Ok(BlockLease::EphemeralFileMutableTail(buffer)) + } + } + } +} + +impl Drop for RW { + fn drop(&mut self) { + // There might still be pages in the [`crate::page_cache`] for this file. + // We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed. 
+ + // unlink the file + let res = std::fs::remove_file(&self.rw.as_writer().file.path); + if let Err(e) = res { + if e.kind() != std::io::ErrorKind::NotFound { + // just never log the not found errors, we cannot do anything for them; on detach + // the tenant directory is already gone. + // + // not found files might also be related to https://github.com/neondatabase/neon/issues/2442 + error!( + "could not remove ephemeral file '{}': {}", + self.rw.as_writer().file.path, + e + ); + } + } + } +} + +struct PreWarmingWriter { + nwritten_blocks: u32, + page_cache_file_id: page_cache::FileId, + file: VirtualFile, +} + +impl PreWarmingWriter { + fn new(page_cache_file_id: page_cache::FileId, file: VirtualFile) -> Self { + Self { + nwritten_blocks: 0, + page_cache_file_id, + file, + } + } +} + +impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmingWriter { + async fn write_all< + B: tokio_epoll_uring::BoundedBuf, + Buf: tokio_epoll_uring::IoBuf + Send, + >( + &mut self, + buf: B, + ) -> std::io::Result<(usize, B::Buf)> { + let buf = buf.slice(..); + let saved_bounds = buf.bounds(); // save for reconstructing the Slice from iobuf after the IO is done + let check_bounds_stuff_works = if cfg!(test) && cfg!(debug_assertions) { + Some(buf.to_vec()) + } else { + None + }; + let buflen = buf.len(); + assert_eq!( + buflen % PAGE_SZ, + 0, + "{buflen} ; we know TAIL_SZ is a PAGE_SZ multiple, and write_buffered_borrowed is used" + ); + + // Do the IO. + let iobuf = match self.file.write_all(buf).await { + (iobuf, Ok(nwritten)) => { + assert_eq!(nwritten, buflen); + iobuf + } + (_, Err(e)) => { + return Err(std::io::Error::new( + ErrorKind::Other, + // order error before path because path is long and error is short + format!( + "ephemeral_file: write_blob: write-back tail self.nwritten_blocks={}, buflen={}, {:#}: {}", + self.nwritten_blocks, buflen, e, self.file.path, + ), + )); + } + }; + + // Reconstruct the Slice (the write path consumed the Slice and returned us the underlying IoBuf) + let buf = tokio_epoll_uring::Slice::from_buf_bounds(iobuf, saved_bounds); + if let Some(check_bounds_stuff_works) = check_bounds_stuff_works { + assert_eq!(&check_bounds_stuff_works, &*buf); + } + + // Pre-warm page cache with the contents. + // At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming + // benefits the code that writes InMemoryLayer=>L0 layers. + let nblocks = buflen / PAGE_SZ; + let nblocks32 = u32::try_from(nblocks).unwrap(); + let cache = page_cache::get(); + static CTX: Lazy = Lazy::new(|| { + RequestContext::new( + crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache, + crate::context::DownloadBehavior::Error, + ) + }); + for blknum_in_buffer in 0..nblocks { + let blk_in_buffer = &buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ]; + let blknum = self + .nwritten_blocks + .checked_add(blknum_in_buffer as u32) + .unwrap(); + match cache + .read_immutable_buf(self.page_cache_file_id, blknum, &CTX) + .await + { + Err(e) => { + error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}"); + // fail gracefully, it's not the end of the world if we can't pre-warm the cache here + } + Ok(v) => match v { + page_cache::ReadBufResult::Found(_guard) => { + // This function takes &mut self, so, it shouldn't be possible to reach this point. 
+ unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \ + and this function takes &mut self, so, no concurrent read_blk is possible"); + } + page_cache::ReadBufResult::NotFound(mut write_guard) => { + write_guard.copy_from_slice(blk_in_buffer); + let _ = write_guard.mark_valid(); + } + }, + } + } + self.nwritten_blocks = self.nwritten_blocks.checked_add(nblocks32).unwrap(); + Ok((buflen, buf.into_inner())) + } +} diff --git a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs new file mode 100644 index 0000000000..34944b1072 --- /dev/null +++ b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs @@ -0,0 +1,125 @@ +//! The heart of how [`super::EphemeralFile`] does its reads and writes. +//! +//! # Writes +//! +//! [`super::EphemeralFile`] writes small, borrowed buffers using [`RW::write_all_borrowed`]. +//! The [`RW`] batches these into [`TAIL_SZ`] bigger writes, using [`owned_buffers_io::write::BufferedWriter`]. +//! +//! # Reads +//! +//! [`super::EphemeralFile`] always reads full [`PAGE_SZ`]ed blocks using [`RW::read_blk`]. +//! +//! The [`RW`] serves these reads either from the buffered writer's in-memory buffer +//! or redirects the caller to read from the underlying [`OwnedAsyncWriter`] +//! if the read is for the prefix that has already been flushed. +//! +//! # Current Usage +//! +//! The current user of this module is [`super::page_caching::RW`]. + +mod zero_padded; + +use crate::{ + page_cache::PAGE_SZ, + virtual_file::owned_buffers_io::{ + self, + write::{Buffer, OwnedAsyncWriter}, + }, +}; + +const TAIL_SZ: usize = PAGE_SZ; + +/// See module-level comment. +pub struct RW { + buffered_writer: owned_buffers_io::write::BufferedWriter< + zero_padded::Buffer, + owned_buffers_io::util::size_tracking_writer::Writer, + >, +} + +pub enum ReadResult<'a, W> { + NeedsReadFromWriter { writer: &'a W }, + ServedFromZeroPaddedMutableTail { buffer: &'a [u8; PAGE_SZ] }, +} + +impl RW +where + W: OwnedAsyncWriter, +{ + pub fn new(writer: W) -> Self { + let bytes_flushed_tracker = + owned_buffers_io::util::size_tracking_writer::Writer::new(writer); + let buffered_writer = owned_buffers_io::write::BufferedWriter::new( + bytes_flushed_tracker, + zero_padded::Buffer::default(), + ); + Self { buffered_writer } + } + + pub(crate) fn as_writer(&self) -> &W { + self.buffered_writer.as_inner().as_inner() + } + + pub async fn write_all_borrowed(&mut self, buf: &[u8]) -> std::io::Result { + self.buffered_writer.write_buffered_borrowed(buf).await + } + + pub fn bytes_written(&self) -> u64 { + let flushed_offset = self.buffered_writer.as_inner().bytes_written(); + let buffer: &zero_padded::Buffer = self.buffered_writer.inspect_buffer(); + flushed_offset + u64::try_from(buffer.pending()).unwrap() + } + + pub(crate) async fn read_blk(&self, blknum: u32) -> Result, std::io::Error> { + let flushed_offset = self.buffered_writer.as_inner().bytes_written(); + let buffer: &zero_padded::Buffer = self.buffered_writer.inspect_buffer(); + let buffered_offset = flushed_offset + u64::try_from(buffer.pending()).unwrap(); + let read_offset = (blknum as u64) * (PAGE_SZ as u64); + + // The trailing page ("block") might only be partially filled, + // yet the blob_io code relies on us to return a full PAGE_SZed slice anyway. + // Moreover, it has to be zero-padded, because when we still had + // a write-back page cache, it provided pre-zeroed pages, and blob_io came to rely on it. 
+ // DeltaLayer probably has the same issue, not sure why it needs no special treatment. + // => check here that the read doesn't go beyond this potentially trailing + // => the zero-padding is done in the `else` branch below + let blocks_written = if buffered_offset % (PAGE_SZ as u64) == 0 { + buffered_offset / (PAGE_SZ as u64) + } else { + (buffered_offset / (PAGE_SZ as u64)) + 1 + }; + if (blknum as u64) >= blocks_written { + return Err(std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!("read past end of ephemeral_file: read=0x{read_offset:x} buffered=0x{buffered_offset:x} flushed=0x{flushed_offset}"))); + } + + // assertions for the `if-else` below + assert_eq!( + flushed_offset % (TAIL_SZ as u64), 0, + "we only use write_buffered_borrowed to write to the buffered writer, so it's guaranteed that flushes happen buffer.cap()-sized chunks" + ); + assert_eq!( + flushed_offset % (PAGE_SZ as u64), + 0, + "the logic below can't handle if the page is spread across the flushed part and the buffer" + ); + + if read_offset < flushed_offset { + assert!(read_offset + (PAGE_SZ as u64) <= flushed_offset); + Ok(ReadResult::NeedsReadFromWriter { + writer: self.as_writer(), + }) + } else { + let read_offset_in_buffer = read_offset + .checked_sub(flushed_offset) + .expect("would have taken `if` branch instead of this one"); + let read_offset_in_buffer = usize::try_from(read_offset_in_buffer).unwrap(); + let zero_padded_slice = buffer.as_zero_padded_slice(); + let page = &zero_padded_slice[read_offset_in_buffer..(read_offset_in_buffer + PAGE_SZ)]; + Ok(ReadResult::ServedFromZeroPaddedMutableTail { + buffer: page + .try_into() + .expect("the slice above got it as page-size slice"), + }) + } + } +} diff --git a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs new file mode 100644 index 0000000000..f90291bbf8 --- /dev/null +++ b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs @@ -0,0 +1,108 @@ +//! A [`crate::virtual_file::owned_buffers_io::write::Buffer`] whose +//! unwritten range is guaranteed to be zero-initialized. +//! This is used by [`crate::tenant::ephemeral_file::zero_padded_read_write::RW::read_blk`] +//! to serve page-sized reads of the trailing page when the trailing page has only been partially filled. + +use std::mem::MaybeUninit; + +/// See module-level comment. 
+pub struct Buffer { + allocation: Box<[u8; N]>, + written: usize, +} + +impl Default for Buffer { + fn default() -> Self { + Self { + allocation: Box::new( + // SAFETY: zeroed memory is a valid [u8; N] + unsafe { MaybeUninit::zeroed().assume_init() }, + ), + written: 0, + } + } +} + +impl Buffer { + #[inline(always)] + fn invariants(&self) { + // don't check by default, unoptimized is too expensive even for debug mode + if false { + debug_assert!(self.written <= N, "{}", self.written); + debug_assert!(self.allocation[self.written..N].iter().all(|v| *v == 0)); + } + } + + pub fn as_zero_padded_slice(&self) -> &[u8; N] { + &self.allocation + } +} + +impl crate::virtual_file::owned_buffers_io::write::Buffer for Buffer { + type IoBuf = Self; + + fn cap(&self) -> usize { + self.allocation.len() + } + + fn extend_from_slice(&mut self, other: &[u8]) { + self.invariants(); + let remaining = self.allocation.len() - self.written; + if other.len() > remaining { + panic!("calling extend_from_slice() with insufficient remaining capacity"); + } + self.allocation[self.written..(self.written + other.len())].copy_from_slice(other); + self.written += other.len(); + self.invariants(); + } + + fn pending(&self) -> usize { + self.written + } + + fn flush(self) -> tokio_epoll_uring::Slice { + self.invariants(); + let written = self.written; + tokio_epoll_uring::BoundedBuf::slice(self, 0..written) + } + + fn reuse_after_flush(iobuf: Self::IoBuf) -> Self { + let Self { + mut allocation, + written, + } = iobuf; + allocation[0..written].fill(0); + let new = Self { + allocation, + written: 0, + }; + new.invariants(); + new + } +} + +/// We have this trait impl so that the `flush` method in the `Buffer` impl above can produce a +/// [`tokio_epoll_uring::BoundedBuf::slice`] of the [`Self::written`] range of the data. +/// +/// Remember that bytes_init is generally _not_ a tracker of the amount +/// of valid data in the io buffer; we use `Slice` for that. +/// The `IoBuf` is _only_ for keeping track of uninitialized memory, a bit like MaybeUninit. +/// +/// SAFETY: +/// +/// The [`Self::allocation`] is stable becauses boxes are stable. +/// The memory is zero-initialized, so, bytes_init is always N. +unsafe impl tokio_epoll_uring::IoBuf for Buffer { + fn stable_ptr(&self) -> *const u8 { + self.allocation.as_ptr() + } + + fn bytes_init(&self) -> usize { + // Yes, N, not self.written; Read the full comment of this impl block! 
+ N + } + + fn bytes_total(&self) -> usize { + N + } +} diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 3744eecab5..1852e4b4ff 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -7,7 +7,6 @@ use std::collections::HashSet; use std::future::Future; use anyhow::{anyhow, Context}; -use bytes::BytesMut; use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::shard::TenantShardId; use tokio::fs::{self, File, OpenOptions}; diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 5939b969d6..8ec4d61434 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -482,7 +482,7 @@ impl InMemoryLayer { trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"); let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?; - let key = InMemoryLayerFileId(file.id()); + let key = InMemoryLayerFileId(file.page_cache_file_id()); Ok(InMemoryLayer { file_id: key, diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 1d43a94568..6127b35079 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -37,7 +37,6 @@ pub(crate) use io_engine::IoEngineKind; pub(crate) use metadata::Metadata; pub(crate) use open_options::*; -#[cfg_attr(not(target_os = "linux"), allow(dead_code))] pub(crate) mod owned_buffers_io { //! Abstractions for IO with owned buffers. //! diff --git a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs index edb11c5f4c..107ada4c13 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs @@ -14,6 +14,14 @@ impl Writer { } } + pub fn bytes_written(&self) -> u64 { + self.bytes_amount + } + + pub fn as_inner(&self) -> &W { + &self.dst + } + /// Returns the wrapped `VirtualFile` object as well as the number /// of bytes that were written to it through this object. pub fn into_inner(self) -> (u64, W) { diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index 6b3a02c71a..d419f02f2d 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -47,6 +47,15 @@ where } } + pub fn as_inner(&self) -> &W { + &self.writer + } + + /// Panics if used after any of the write paths returned an error + pub fn inspect_buffer(&self) -> &B { + self.buf() + } + pub async fn flush_and_into_inner(mut self) -> std::io::Result { self.flush().await?; let Self { buf, writer } = self; @@ -100,6 +109,28 @@ where Ok((chunk_len, chunk.into_inner())) } + /// Strictly less performant variant of [`Self::write_buffered`] that allows writing borrowed data. + /// + /// It is less performant because we always have to copy the borrowed data into the internal buffer + /// before we can do the IO. The [`Self::write_buffered`] can avoid this, which is more performant + /// for large writes. 
+ pub async fn write_buffered_borrowed(&mut self, mut chunk: &[u8]) -> std::io::Result { + let chunk_len = chunk.len(); + while !chunk.is_empty() { + let buf = self.buf.as_mut().expect("must not use after an error"); + let need = buf.cap() - buf.pending(); + let have = chunk.len(); + let n = std::cmp::min(need, have); + buf.extend_from_slice(&chunk[..n]); + chunk = &chunk[n..]; + if buf.pending() >= buf.cap() { + assert_eq!(buf.pending(), buf.cap()); + self.flush().await?; + } + } + Ok(chunk_len) + } + async fn flush(&mut self) -> std::io::Result<()> { let buf = self.buf.take().expect("must not use after an error"); let buf_len = buf.pending(); @@ -266,4 +297,31 @@ mod tests { ); Ok(()) } + + #[tokio::test] + async fn test_write_all_borrowed_always_goes_through_buffer() -> std::io::Result<()> { + let recorder = RecorderWriter::default(); + let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); + + writer.write_buffered_borrowed(b"abc").await?; + writer.write_buffered_borrowed(b"d").await?; + writer.write_buffered_borrowed(b"e").await?; + writer.write_buffered_borrowed(b"fg").await?; + writer.write_buffered_borrowed(b"hi").await?; + writer.write_buffered_borrowed(b"j").await?; + writer.write_buffered_borrowed(b"klmno").await?; + + let recorder = writer.flush_and_into_inner().await?; + assert_eq!( + recorder.writes, + { + let expect: &[&[u8]] = &[b"ab", b"cd", b"ef", b"gh", b"ij", b"kl", b"mn", b"o"]; + expect + } + .iter() + .map(|v| v[..].to_vec()) + .collect::>() + ); + Ok(()) + } } From f1de18f1c9057510fb34d8241011a35d0f249d50 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 26 Apr 2024 12:15:05 +0100 Subject: [PATCH 0630/1571] Remove unused import (#7519) Linter error from a merge collision From ed577727936b18479a6d04c2449bb77eb8245e19 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 26 Apr 2024 13:34:28 +0200 Subject: [PATCH 0631/1571] perf!: use larger buffers for blob_io and ephemeral_file (#7485) part of https://github.com/neondatabase/neon/issues/7124 # Problem (Re-stating the problem from #7124 for posterity) The `test_bulk_ingest` benchmark shows about 2x lower throughput with `tokio-epoll-uring` compared to `std-fs`. That's why we temporarily disabled it in #7238. The reason for this regression is that the benchmark runs on a system without memory pressure and thus std-fs writes don't block on disk IO but only copy the data into the kernel page cache. `tokio-epoll-uring` cannot beat that at this time, and possibly never. (However, under memory pressure, std-fs would stall the executor thread on kernel page cache writeback disk IO. That's why we want to use `tokio-epoll-uring`. And we likely want to use O_DIRECT in the future, at which point std-fs becomes an absolute show-stopper.) More elaborate analysis: https://neondatabase.notion.site/Why-test_bulk_ingest-is-slower-with-tokio-epoll-uring-918c5e619df045a7bd7b5f806cfbd53f?pvs=4 # Changes This PR increases the buffer size of `blob_io` and `EphemeralFile` from PAGE_SZ=8k to 64k. Longer-term, we probably want to do double-buffering / pipelined IO. # Resource Usage We currently do not flush the buffer when freezing the InMemoryLayer. That means a single Timeline can have multiple 64k buffers alive, esp if flushing is slow. This poses an OOM risk. We should either bound the number of frozen layers (https://github.com/neondatabase/neon/issues/7317). Or we should change the freezing code to flush the buffer and drop the allocation. However, that's future work. 
# Performance (Measurements done on i3en.3xlarge.) The `test_bulk_insert.py` is too noisy, even with instance storage. It varies by 30-40%. I suspect that's due to compaction. Raising amount of data by 10x doesn't help with the noisiness.) So, I used the `bench_ingest` from @jcsp 's #7409 . Specifically, the `ingest-small-values/ingest 128MB/100b seq` and `ingest-small-values/ingest 128MB/100b seq, no delta` benchmarks. | | | seq | seq, no delta | |-----|-------------------|-----|---------------| | 8k | std-fs | 55 | 165 | | 8k | tokio-epoll-uring | 37 | 107 | | 64k | std-fs | 55 | 180 | | 64k | tokio-epoll-uring | 48 | 164 | The `8k` is from before this PR, the `64k` is with this PR. The values are the throughput reported by the benchmark (MiB/s). We see that this PR gets `tokio-epoll-uring` from 67% to 87% of `std-fs` performance in the `seq` benchmark. Notably, `seq` appears to hit some other bottleneck at `55 MiB/s`. CC'ing #7418 due to the apparent bottlenecks in writing delta layers. For `seq, no delta`, this PR gets `tokio-epoll-uring` from 64% to 91% of `std-fs` performance. --- pageserver/src/tenant/blob_io.rs | 2 +- pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 0d33100ead..6e90b3e8ff 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -121,7 +121,7 @@ impl BlobWriter { self.offset } - const CAPACITY: usize = if BUFFERED { PAGE_SZ } else { 0 }; + const CAPACITY: usize = if BUFFERED { 64 * 1024 } else { 0 }; /// Writes the given buffer directly to the underlying `VirtualFile`. /// You need to make sure that the internal buffer is empty, otherwise diff --git a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs index 34944b1072..4159b5820a 100644 --- a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs +++ b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs @@ -27,7 +27,7 @@ use crate::{ }, }; -const TAIL_SZ: usize = PAGE_SZ; +const TAIL_SZ: usize = 64 * 1024; /// See module-level comment. pub struct RW { From af43f78561cb8603e0b864cbfb18f5324155b613 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 26 Apr 2024 14:53:05 +0100 Subject: [PATCH 0632/1571] pageserver: fix image layer creation check that inhibited compaction (#7420) ## Problem PR #7230 attempted to introduce a WAL ingest threshold for checking whether enough deltas are stacked to warrant creating a new image layer. However, this check was incorrectly performed at the compaction partition level instead of the timeline level. Hence, it inhibited GC for any keys outside of the first partition. ## Summary of Changes Hoist the check up to the timeline level. --- pageserver/src/tenant/timeline.rs | 48 ++++++++++++++++---------- test_runner/regress/test_compaction.py | 2 -- 2 files changed, 29 insertions(+), 21 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f1387e10ac..eb72ce9629 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3906,24 +3906,6 @@ impl Timeline { // Is it time to create a new image layer for the given partition? 
async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool { - let last = self.last_image_layer_creation_check_at.load(); - if lsn != Lsn(0) { - let distance = lsn - .checked_sub(last) - .expect("Attempt to compact with LSN going backwards"); - - let min_distance = self.get_image_layer_creation_check_threshold() as u64 - * self.get_checkpoint_distance(); - - // Skip the expensive delta layer counting below if we've not ingested - // sufficient WAL since the last check. - if distance.0 < min_distance { - return false; - } - } - - self.last_image_layer_creation_check_at.store(lsn); - let threshold = self.get_image_creation_threshold(); let guard = self.layers.read().await; @@ -3995,9 +3977,37 @@ impl Timeline { // image layers <100000000..100000099> and <200000000..200000199> are not completely covering it. let mut start = Key::MIN; + let check_for_image_layers = { + let last_checks_at = self.last_image_layer_creation_check_at.load(); + let distance = lsn + .checked_sub(last_checks_at) + .expect("Attempt to compact with LSN going backwards"); + let min_distance = self.get_image_layer_creation_check_threshold() as u64 + * self.get_checkpoint_distance(); + + // Skip the expensive delta layer counting if this timeline has not ingested sufficient + // WAL since the last check. + distance.0 >= min_distance + }; + + if check_for_image_layers { + self.last_image_layer_creation_check_at.store(lsn); + } + for partition in partitioning.parts.iter() { let img_range = start..partition.ranges.last().unwrap().end; - if !force && !self.time_for_new_image_layer(partition, lsn).await { + + let do_it = if force { + true + } else if check_for_image_layers { + // [`Self::time_for_new_image_layer`] is CPU expensive, + // so skip if we've not collected enough WAL since the last time + self.time_for_new_image_layer(partition, lsn).await + } else { + false + }; + + if !do_it { start = img_range.end; continue; } diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 37b87b92a9..3902819d3d 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -14,8 +14,6 @@ AGGRESIVE_COMPACTION_TENANT_CONF = { # Compact small layers "compaction_target_size": 1024**2, "image_creation_threshold": 2, - # INC-186: remove when merging the fix - "image_layer_creation_check_threshold": 0, } From 39427925c2f9fa6966aec9da66408aa134d30ab4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 26 Apr 2024 16:23:25 +0200 Subject: [PATCH 0633/1571] Return Past instead of Present or Future when commit_lsn < min_lsn (#7520) Implements an approach different from the one #7488 chose: We now return `past` instead of `present` (or`future`) when encountering the edge case where commit_lsn < min_lsn. In my opinion, both `past` and `present` are correct responses, but past is slightly better as the lsn returned by `present` with #7488 is one too "new". In practice, this shouldn't matter much, but shrug. 
We agreed in slack that this is the better approach:
https://neondb.slack.com/archives/C03F5SM1N02/p1713871064147029
---
 pageserver/src/pgdatadir_mapping.rs | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 14bcc50e7e..c76c2d5451 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -445,11 +445,6 @@ impl Timeline {
         // include physical changes from later commits that will be marked
         // as aborted, and will need to be vacuumed away.
         let commit_lsn = Lsn((low - 1) * 8);
-        // This maxing operation is for the edge case that the search above did
-        // set found_smaller to true but it never increased the lsn. Then, low
-        // is still the old min_lsn the subtraction above could possibly give a value
-        // below the anchestor_lsn.
-        let commit_lsn = commit_lsn.max(min_lsn);
         match (found_smaller, found_larger) {
             (false, false) => {
                 // This can happen if no commit records have been processed yet, e.g.
@@ -460,6 +455,12 @@ impl Timeline {
                 // Didn't find any commit timestamps smaller than the request
                 Ok(LsnForTimestamp::Past(min_lsn))
             }
+            (true, _) if commit_lsn < min_lsn => {
+                // the search above did set found_smaller to true but it never increased the lsn.
+                // Then, low is still the old min_lsn, and the subtraction above gave a value
+                // below the min_lsn. We should never do that.
+                Ok(LsnForTimestamp::Past(min_lsn))
+            }
             (true, false) => {
                 // Only found commits with timestamps smaller than the request.
                 // It's still a valid case for branch creation, return it.

From dbe0aa653ac2d0c3ef0a8087b7ab8878d1e59c9a Mon Sep 17 00:00:00 2001
From: Alex Chi Z
Date: Fri, 26 Apr 2024 11:48:47 -0400
Subject: [PATCH 0634/1571] feat(pageserver): add aux-file-v2 flag on tenant
 level (#7505)

Changing metadata format is not easy. This pull request adds a
tenant-level flag that controls whether aux file v2 is enabled. As long
as we don't roll this out to users and can guarantee that our staging
projects persist the tenant config correctly, we can test the aux file
v2 changes by setting this flag.

Previous discussion at https://github.com/neondatabase/neon/pull/7424.
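The flag resolves like any other per-tenant override: an unset value falls
back to the pageserver-wide default, which stays `false`, so existing tenants
keep the legacy aux file format unless they are explicitly switched. A minimal
sketch of that resolution (the helper name below is illustrative, not code
from this patch; the real fallback lives in `TenantConfOpt` and the new
`Timeline::get_switch_to_aux_file_v2` accessor):

```rust
/// Illustrative helper: a per-tenant `Option<bool>` override falls back to the
/// global default, which this patch sets to `false`.
fn effective_switch_to_aux_file_v2(per_tenant: Option<bool>, global_default: bool) -> bool {
    per_tenant.unwrap_or(global_default)
}

fn main() {
    // No per-tenant override: the tenant stays on the legacy aux file format.
    assert!(!effective_switch_to_aux_file_v2(None, false));
    // Tenant explicitly opted in by setting `switch_to_aux_file_v2` to true in its config.
    assert!(effective_switch_to_aux_file_v2(Some(true), false));
}
```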
Signed-off-by: Alex Chi Z --- control_plane/src/pageserver.rs | 10 ++++++++++ libs/pageserver_api/src/models.rs | 1 + pageserver/src/tenant.rs | 1 + pageserver/src/tenant/config.rs | 13 +++++++++++++ pageserver/src/tenant/timeline.rs | 9 +++++++++ test_runner/regress/test_attach_tenant_config.py | 1 + 6 files changed, 35 insertions(+) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index adac7d7bb5..0699e47866 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -441,6 +441,11 @@ impl PageServerNode { .map(serde_json::from_str) .transpose() .context("parse `timeline_get_throttle` from json")?, + switch_to_aux_file_v2: settings + .remove("switch_to_aux_file_v2") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'switch_to_aux_file_v2' as bool")?, }; if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") @@ -559,6 +564,11 @@ impl PageServerNode { .map(serde_json::from_str) .transpose() .context("parse `timeline_get_throttle` from json")?, + switch_to_aux_file_v2: settings + .remove("switch_to_aux_file_v2") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'switch_to_aux_file_v2' as bool")?, } }; diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 4ce1ecde26..e2acde6139 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -303,6 +303,7 @@ pub struct TenantConfig { pub lazy_slru_download: Option, pub timeline_get_throttle: Option, pub image_layer_creation_check_threshold: Option, + pub switch_to_aux_file_v2: Option, } #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index ff6194ab00..32c0606fc2 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3664,6 +3664,7 @@ pub(crate) mod harness { image_layer_creation_check_threshold: Some( tenant_conf.image_layer_creation_check_threshold, ), + switch_to_aux_file_v2: Some(tenant_conf.switch_to_aux_file_v2), } } } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index a2bb479f63..9975c9edbc 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -369,6 +369,10 @@ pub struct TenantConf { // How much WAL must be ingested before checking again whether a new image layer is required. // Expresed in multiples of checkpoint distance. pub image_layer_creation_check_threshold: u8, + + /// Switch to aux file v2. Switching this flag requires the user has not written any aux file into + /// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions. 
+    pub switch_to_aux_file_v2: bool,
 }
 
 /// Same as TenantConf, but this struct preserves the information about
@@ -464,6 +468,10 @@ pub struct TenantConfOpt {
 
     #[serde(skip_serializing_if = "Option::is_none")]
     pub image_layer_creation_check_threshold: Option<u8>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[serde(default)]
+    pub switch_to_aux_file_v2: Option<bool>,
 }
 
 impl TenantConfOpt {
@@ -521,6 +529,9 @@ impl TenantConfOpt {
             image_layer_creation_check_threshold: self
                 .image_layer_creation_check_threshold
                 .unwrap_or(global_conf.image_layer_creation_check_threshold),
+            switch_to_aux_file_v2: self
+                .switch_to_aux_file_v2
+                .unwrap_or(global_conf.switch_to_aux_file_v2),
         }
     }
 }
@@ -562,6 +573,7 @@ impl Default for TenantConf {
             lazy_slru_download: false,
             timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
             image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
+            switch_to_aux_file_v2: false,
         }
     }
 }
@@ -636,6 +648,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
             lazy_slru_download: value.lazy_slru_download,
             timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
             image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
+            switch_to_aux_file_v2: value.switch_to_aux_file_v2,
         }
     }
 }
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index eb72ce9629..a05e0da260 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1871,6 +1871,15 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10;
 
 // Private functions
 impl Timeline {
+    #[allow(dead_code)]
+    pub(crate) fn get_switch_to_aux_file_v2(&self) -> bool {
+        let tenant_conf = self.tenant_conf.load();
+        tenant_conf
+            .tenant_conf
+            .switch_to_aux_file_v2
+            .unwrap_or(self.conf.default_tenant_conf.switch_to_aux_file_v2)
+    }
+
     pub(crate) fn get_lazy_slru_download(&self) -> bool {
         let tenant_conf = self.tenant_conf.load();
         tenant_conf
diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py
index 909d25980b..59461cc095 100644
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -190,6 +190,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
         "trace_read_requests": True,
         "walreceiver_connect_timeout": "13m",
         "image_layer_creation_check_threshold": 1,
+        "switch_to_aux_file_v2": True,
     }
 
     ps_http = env.pageserver.http_client()

From ee3437cbd8d539d00cc0789b7314d8a995668a9d Mon Sep 17 00:00:00 2001
From: Alex Chi Z
Date: Fri, 26 Apr 2024 13:35:01 -0400
Subject: [PATCH 0635/1571] chore(pageserver): shrink aux keyspace to 0x60-0x7F
 (#7502)

Extracted from https://github.com/neondatabase/neon/pull/7468, part of
https://github.com/neondatabase/neon/issues/7462.

In the pageserver, we use i128 (instead of u128) as the integer
representation of the key, which means the highest bit of the key must
not be 1. This constrains our key prefixes to <= 0x7F.

This also fixes a bug in `to_i128` that dropped the highest 4 bits of
the first key field; we now keep 3 of them and drop only the sign bit.
On top of that, we shrink the metadata keyspace to 0x60-0x7F for now;
once we add support for u128, we can use a larger metadata keyspace.
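To make the masking change concrete: with the old 4-bit mask in `to_i128`, a
prefix in the new metadata range could not survive a `to_i128`/`from_i128`
round trip, while the 7-bit mask preserves it and drops only the sign bit. A
small standalone illustration (not code from this patch), using the
`AUX_KEY_PREFIX` value 0x62 introduced here:

```rust
fn main() {
    let aux_prefix: u8 = 0x62; // AUX_KEY_PREFIX in this PR
    // The old to_i128 kept only the low 4 bits of field1: the prefix collapses to 0x02.
    assert_eq!(aux_prefix & 0x0f, 0x02);
    // The new to_i128 keeps 7 bits and drops only the sign bit: the prefix survives.
    assert_eq!(aux_prefix & 0x7f, 0x62);
    // Hence metadata key prefixes must fit in 7 bits, i.e. stay below 0x80.
}
```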
--------- Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/key.rs | 53 ++++++++++++++++++++--------- libs/pageserver_api/src/keyspace.rs | 16 ++++++--- pageserver/src/aux_file.rs | 12 +++---- 3 files changed, 53 insertions(+), 28 deletions(-) diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 01919e8325..ea6115853e 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -4,7 +4,6 @@ use bytes::BufMut; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::{Oid, TransactionId}; use serde::{Deserialize, Serialize}; -use std::ops::RangeInclusive; use std::{fmt, ops::Range}; use crate::reltag::{BlockNumber, RelTag, SlruKind}; @@ -30,24 +29,25 @@ pub const KEY_SIZE: usize = 18; /// See [`Key::to_i128`] for more information on the encoding. pub const METADATA_KEY_SIZE: usize = 16; -/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x80 is a metadata key. -pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x80; +/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x40 is a metadata key. +pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x60; +pub const METADATA_KEY_END_PREFIX: u8 = 0x7F; /// The (reserved) key prefix of relation sizes. -pub const RELATION_SIZE_PREFIX: u8 = 0x81; +pub const RELATION_SIZE_PREFIX: u8 = 0x61; /// The key prefix of AUX file keys. -pub const AUX_KEY_PREFIX: u8 = 0x82; +pub const AUX_KEY_PREFIX: u8 = 0x62; /// Check if the key falls in the range of metadata keys. pub const fn is_metadata_key_slice(key: &[u8]) -> bool { - key[0] >= METADATA_KEY_BEGIN_PREFIX + key[0] >= METADATA_KEY_BEGIN_PREFIX && key[0] < METADATA_KEY_END_PREFIX } impl Key { /// Check if the key falls in the range of metadata keys. pub const fn is_metadata_key(&self) -> bool { - self.field1 >= METADATA_KEY_BEGIN_PREFIX + self.field1 >= METADATA_KEY_BEGIN_PREFIX && self.field1 < METADATA_KEY_END_PREFIX } /// Encode a metadata key to a storage key. @@ -80,7 +80,7 @@ impl Key { } /// Get the range of metadata keys. - pub fn metadata_key_range() -> RangeInclusive { + pub fn metadata_key_range() -> Range { Key { field1: METADATA_KEY_BEGIN_PREFIX, field2: 0, @@ -88,13 +88,32 @@ impl Key { field4: 0, field5: 0, field6: 0, - }..=Key { - field1: u8::MAX, - field2: u16::MAX as u32, - field3: u32::MAX, - field4: u32::MAX, - field5: u8::MAX, - field6: u32::MAX, + }..Key { + field1: METADATA_KEY_END_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + } + } + + /// Get the range of aux keys. 
+ pub fn metadata_aux_key_range() -> Range { + Key { + field1: AUX_KEY_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + }..Key { + field1: AUX_KEY_PREFIX + 1, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, } } @@ -103,7 +122,7 @@ impl Key { /// we can assume that only some predefined namespace OIDs are used which can fit in u16 pub fn to_i128(&self) -> i128 { assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222); - (((self.field1 & 0xf) as i128) << 120) + (((self.field1 & 0x7F) as i128) << 120) | (((self.field2 & 0xFFFF) as i128) << 104) | ((self.field3 as i128) << 72) | ((self.field4 as i128) << 40) @@ -113,7 +132,7 @@ impl Key { pub const fn from_i128(x: i128) -> Self { Key { - field1: ((x >> 120) & 0xf) as u8, + field1: ((x >> 120) & 0x7F) as u8, field2: ((x >> 104) & 0xFFFF) as u32, field3: (x >> 72) as u32, field4: (x >> 40) as u32, diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index f73648d306..eed4835f25 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -15,7 +15,13 @@ pub struct KeySpace { } impl KeySpace { - /// + /// Create a key space with a single range. + pub fn single(key_range: Range) -> Self { + Self { + ranges: vec![key_range], + } + } + /// Partition a key space into roughly chunks of roughly 'target_size' bytes /// in each partition. /// @@ -64,6 +70,10 @@ impl KeySpace { KeyPartitioning { parts } } + pub fn is_empty(&self) -> bool { + self.total_size() == 0 + } + /// Merge another keyspace into the current one. /// Note: the keyspaces must not ovelap (enforced via assertions) pub fn merge(&mut self, other: &KeySpace) { @@ -162,10 +172,6 @@ impl KeySpace { .sum() } - pub fn is_empty(&self) -> bool { - self.total_size() == 0 - } - fn overlaps_at(&self, range: &Range) -> Option { match self.ranges.binary_search_by_key(&range.end, |r| r.start) { Ok(0) => None, diff --git a/pageserver/src/aux_file.rs b/pageserver/src/aux_file.rs index aba4ccf19d..a343acaf7a 100644 --- a/pageserver/src/aux_file.rs +++ b/pageserver/src/aux_file.rs @@ -85,27 +85,27 @@ mod tests { // To correct retrieve AUX files, the generated keys for the same file must be the same for all versions // of the page server. 
assert_eq!( - "8200000101E5B20C5F8DD5AA3289D6D9EAFA", + "6200000101E5B20C5F8DD5AA3289D6D9EAFA", encode_aux_file_key("pg_logical/mappings/test1").to_string() ); assert_eq!( - "820000010239AAC544893139B26F501B97E6", + "620000010239AAC544893139B26F501B97E6", encode_aux_file_key("pg_logical/snapshots/test2").to_string() ); assert_eq!( - "820000010300000000000000000000000000", + "620000010300000000000000000000000000", encode_aux_file_key("pg_logical/replorigin_checkpoint").to_string() ); assert_eq!( - "82000001FF8635AF2134B7266EC5B4189FD6", + "62000001FF8635AF2134B7266EC5B4189FD6", encode_aux_file_key("pg_logical/unsupported").to_string() ); assert_eq!( - "8200000201772D0E5D71DE14DA86142A1619", + "6200000201772D0E5D71DE14DA86142A1619", encode_aux_file_key("pg_replslot/test3").to_string() ); assert_eq!( - "820000FFFF1866EBEB53B807B26A2416F317", + "620000FFFF1866EBEB53B807B26A2416F317", encode_aux_file_key("other_file_not_supported").to_string() ); } From 75b4440d0786b4f53c5ca26e9c7ed8b88bc4b40b Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Fri, 26 Apr 2024 17:09:51 -0400 Subject: [PATCH 0636/1571] fix(virtual_file): compile warnings on macos (#7525) starting at commit https://github.com/neondatabase/neon/commit/dbb0c967d5fb5104847fb71e8d783ebeae3e7ff2, macOS reports warning for a few functions in the virtual file module. Signed-off-by: Alex Chi Z --- .../virtual_file/owned_buffers_io/util/size_tracking_writer.rs | 1 + pageserver/src/virtual_file/owned_buffers_io/write.rs | 2 ++ 2 files changed, 3 insertions(+) diff --git a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs index 107ada4c13..c2817699c3 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs @@ -24,6 +24,7 @@ impl Writer { /// Returns the wrapped `VirtualFile` object as well as the number /// of bytes that were written to it through this object. + #[cfg_attr(target_os = "macos", allow(dead_code))] pub fn into_inner(self) -> (u64, W) { (self.bytes_amount, self.dst) } diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index d419f02f2d..738a642332 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -56,6 +56,7 @@ where self.buf() } + #[cfg_attr(target_os = "macos", allow(dead_code))] pub async fn flush_and_into_inner(mut self) -> std::io::Result { self.flush().await?; let Self { buf, writer } = self; @@ -70,6 +71,7 @@ where .expect("must not use after we returned an error") } + #[cfg_attr(target_os = "macos", allow(dead_code))] pub async fn write_buffered(&mut self, chunk: Slice) -> std::io::Result<(usize, S)> where S: IoBuf + Send, From 3695a1efa1c88c3b98106f5a2a8e74d655e467b1 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 29 Apr 2024 07:14:53 +0300 Subject: [PATCH 0637/1571] metrics: record time to update gc info as a per timeline metric (#7473) We know that updating gc info can take a very long time from [recent incident], and holding `Tenant::gc_cs` affects many per-tenant operations in the system. We need a direct way to observe the time it takes. The solution is to add metrics so that we know when this happens: - 2 new per-timeline metric - 1 new global histogram Verified that the buckets are okay-ish in [dashboard]. 
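As a rough illustration of the drop-guard idea behind the new timer (a standalone sketch; the real code uses `StorageTimeMetricsTimer::record_on_drop` and observes a histogram rather than printing):

```rust
use std::time::Instant;

// Record-on-drop: the observation happens in Drop, so an early `?`/return still records.
struct RecordOnDrop {
    name: &'static str,
    start: Instant,
}

impl Drop for RecordOnDrop {
    fn drop(&mut self) {
        // The real timer calls histogram.observe(elapsed) here.
        println!("{} took {:?}", self.name, self.start.elapsed());
    }
}

fn update_gc_info_sketch(fail_early: bool) -> Result<(), &'static str> {
    let _timer = RecordOnDrop { name: "update gc info", start: Instant::now() };
    if fail_early {
        return Err("cancelled"); // still recorded, via Drop
    }
    Ok(())
}

fn main() {
    let _ = update_gc_info_sketch(true);
    let _ = update_gc_info_sketch(false);
}
```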
In our current state, we will see a lot more of `Inf,` but that is probably okay; at least we can learn which timelines are having issues. Can we afford to add these metrics? A bit unclear, see [another dashboard] with top pageserver `/metrics` response sizes. [dashboard]: https://neonprod.grafana.net/d/b7a5a5e2-1276-4bb0-9e3a-b4528adb6eb6/storage-operations-histograms-in-prod?orgId=1&var-datasource=ZNX49CDVz&var-instance=All&var-operation=All&from=now-7d&to=now [another dashboard]: https://neonprod.grafana.net/d/MQx4SN-Vk/metric-sizes-on-prod-and-some-correlations?orgId=1 [recent incident]: https://neondb.slack.com/archives/C06UEMLK7FE/p1713817696580119?thread_ts=1713468604.508969&cid=C06UEMLK7FE --- pageserver/src/metrics.rs | 27 +++++++++++++++++++++++++++ pageserver/src/tenant.rs | 6 +++++- pageserver/src/tenant/size.rs | 3 +++ pageserver/src/tenant/timeline.rs | 6 ++++++ 4 files changed, 41 insertions(+), 1 deletion(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index e4b314f805..d3c8c423e4 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -51,6 +51,9 @@ pub(crate) enum StorageTimeOperation { #[strum(serialize = "gc")] Gc, + #[strum(serialize = "update gc info")] + UpdateGcInfo, + #[strum(serialize = "create tenant")] CreateTenant, } @@ -1910,6 +1913,22 @@ impl StorageTimeMetricsTimer { self.metrics.timeline_count.inc(); self.metrics.global_histogram.observe(duration); } + + /// Turns this timer into a timer, which will always record -- usually this means recording + /// regardless an early `?` path was taken in a function. + pub(crate) fn record_on_drop(self) -> AlwaysRecordingStorageTimeMetricsTimer { + AlwaysRecordingStorageTimeMetricsTimer(Some(self)) + } +} + +pub(crate) struct AlwaysRecordingStorageTimeMetricsTimer(Option); + +impl Drop for AlwaysRecordingStorageTimeMetricsTimer { + fn drop(&mut self) { + if let Some(inner) = self.0.take() { + inner.stop_and_record(); + } + } } /// Timing facilities for an globally histogrammed metric, which is supported by per tenant and @@ -1970,6 +1989,7 @@ pub(crate) struct TimelineMetrics { pub imitate_logical_size_histo: StorageTimeMetrics, pub load_layer_map_histo: StorageTimeMetrics, pub garbage_collect_histo: StorageTimeMetrics, + pub update_gc_info_histo: StorageTimeMetrics, pub last_record_gauge: IntGauge, resident_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size @@ -2030,6 +2050,12 @@ impl TimelineMetrics { &shard_id, &timeline_id, ); + let update_gc_info_histo = StorageTimeMetrics::new( + StorageTimeOperation::UpdateGcInfo, + &tenant_id, + &shard_id, + &timeline_id, + ); let last_record_gauge = LAST_RECORD_LSN .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); @@ -2072,6 +2098,7 @@ impl TimelineMetrics { logical_size_histo, imitate_logical_size_histo, garbage_collect_histo, + update_gc_info_histo, load_layer_map_histo, last_record_gauge, resident_physical_size_gauge, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 32c0606fc2..02ce65922e 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3402,7 +3402,11 @@ impl Tenant { // is in progress (which is not a common case). // // See more for on the issue #2748 condenced out of the initial PR review. - let mut shared_cache = self.cached_logical_sizes.lock().await; + let mut shared_cache = tokio::select! 
{ + locked = self.cached_logical_sizes.lock() => locked, + _ = cancel.cancelled() => anyhow::bail!("cancelled"), + _ = self.cancel.cancelled() => anyhow::bail!("tenant is shutting down"), + }; size::gather_inputs( self, diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index ad79b74d8b..f521dfa55d 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -118,6 +118,9 @@ pub(super) async fn gather_inputs( ctx: &RequestContext, ) -> anyhow::Result { // refresh is needed to update gc related pitr_cutoff and horizon_cutoff + // + // FIXME: if a single timeline is deleted while refresh gc info is ongoing, we will fail the + // whole computation. It does not make sense from the billing perspective. tenant .refresh_gc_info(cancel, ctx) .await diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index a05e0da260..c10adf4c22 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4345,6 +4345,12 @@ impl Timeline { cancel: &CancellationToken, ctx: &RequestContext, ) -> anyhow::Result<()> { + let _timer = self + .metrics + .update_gc_info_histo + .start_timer() + .record_on_drop(); + // First, calculate pitr_cutoff_timestamp and then convert it to LSN. // // Some unit tests depend on garbage-collection working even when From b655c7030ff2172e32f0c6e2e056183aa3b70d81 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 29 Apr 2024 08:52:18 +0100 Subject: [PATCH 0638/1571] neon_local: add "tenant import" (#7399) ## Problem Sometimes we have test data in the form of S3 contents that we would like to run live in a neon_local environment. ## Summary of changes - Add a storage controller API that imports an existing tenant. Currently this is equivalent to doing a create with a high generation number, but in future this would be something smarter to probe S3 to find the shards in a tenant and find generation numbers. - Add a `neon_local` command that invokes the import API, and then inspects timelines in the newly attached tenant to create matching branches. 
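For intuition, a minimal sketch of the generation-recovery idea (simplified stand-ins for `Generation` and the remote-storage scan result; the real import also probes shard counts and stripe size, as the diff below shows):

```rust
// Pick the attach generation for an imported tenant: take the highest generation
// observed across all index_part objects found in remote storage, so the new
// attachment is guaranteed to read the most recent indices.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
struct Generation(u32);

fn generation_for_import(found_index_generations: &[Generation]) -> Option<Generation> {
    found_index_generations.iter().copied().max()
}

fn main() {
    let found = [Generation(3), Generation(7), Generation(5)];
    assert_eq!(generation_for_import(&found), Some(Generation(7)));
}
```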
--- control_plane/src/bin/neon_local.rs | 50 +++++++ control_plane/src/storage_controller.rs | 10 ++ libs/pageserver_api/src/models.rs | 11 ++ libs/utils/src/generation.rs | 2 + pageserver/client/src/mgmt_api.rs | 13 ++ pageserver/src/http/routes.rs | 82 +++++++++++ pageserver/src/tenant.rs | 12 +- .../src/tenant/remote_timeline_client.rs | 11 +- .../tenant/remote_timeline_client/download.rs | 80 ++++++----- storage_controller/src/http.rs | 19 +++ storage_controller/src/pageserver_client.rs | 17 ++- storage_controller/src/service.rs | 130 ++++++++++++++++-- test_runner/fixtures/neon_fixtures.py | 12 ++ .../regress/test_storage_controller.py | 83 +++++++++++ test_runner/regress/test_tenant_detach.py | 2 +- 15 files changed, 481 insertions(+), 53 deletions(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 7f8f6d21e0..1a9e9a1e6a 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -417,6 +417,54 @@ async fn handle_tenant( println!("{} {:?}", t.id, t.state); } } + Some(("import", import_match)) => { + let tenant_id = parse_tenant_id(import_match)?.unwrap_or_else(TenantId::generate); + + let storage_controller = StorageController::from_env(env); + let create_response = storage_controller.tenant_import(tenant_id).await?; + + let shard_zero = create_response + .shards + .first() + .expect("Import response omitted shards"); + + let attached_pageserver_id = shard_zero.node_id; + let pageserver = + PageServerNode::from_env(env, env.get_pageserver_conf(attached_pageserver_id)?); + + println!( + "Imported tenant {tenant_id}, attached to pageserver {attached_pageserver_id}" + ); + + let timelines = pageserver + .http_client + .list_timelines(shard_zero.shard_id) + .await?; + + // Pick a 'main' timeline that has no ancestors, the rest will get arbitrary names + let main_timeline = timelines + .iter() + .find(|t| t.ancestor_timeline_id.is_none()) + .expect("No timelines found") + .timeline_id; + + let mut branch_i = 0; + for timeline in timelines.iter() { + let branch_name = if timeline.timeline_id == main_timeline { + "main".to_string() + } else { + branch_i += 1; + format!("branch_{branch_i}") + }; + + println!( + "Importing timeline {tenant_id}/{} as branch {branch_name}", + timeline.timeline_id + ); + + env.register_branch_mapping(branch_name, tenant_id, timeline.timeline_id)?; + } + } Some(("create", create_match)) => { let tenant_conf: HashMap<_, _> = create_match .get_many::("config") @@ -1480,6 +1528,8 @@ fn cli() -> Command { .subcommand(Command::new("config") .arg(tenant_id_arg.clone()) .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false))) + .subcommand(Command::new("import").arg(tenant_id_arg.clone().required(true)) + .about("Import a tenant that is present in remote storage, and create branches for its timelines")) ) .subcommand( Command::new("pageserver") diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 7f2b973391..dbb4475ae8 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -472,6 +472,16 @@ impl StorageController { .await } + #[instrument(skip(self))] + pub async fn tenant_import(&self, tenant_id: TenantId) -> anyhow::Result { + self.dispatch::<(), TenantCreateResponse>( + Method::POST, + format!("debug/v1/tenant/{tenant_id}/import"), + None, + ) + .await + } + #[instrument(skip(self))] pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result { 
self.dispatch::<(), _>( diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index e2acde6139..c752799c4c 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -782,6 +782,17 @@ pub struct SecondaryProgress { pub bytes_total: u64, } +#[derive(Serialize, Deserialize, Debug)] +pub struct TenantScanRemoteStorageShard { + pub tenant_shard_id: TenantShardId, + pub generation: Option, +} + +#[derive(Serialize, Deserialize, Debug, Default)] +pub struct TenantScanRemoteStorageResponse { + pub shards: Vec, +} + pub mod virtual_file { #[derive( Copy, diff --git a/libs/utils/src/generation.rs b/libs/utils/src/generation.rs index af15cee924..b703e883de 100644 --- a/libs/utils/src/generation.rs +++ b/libs/utils/src/generation.rs @@ -34,6 +34,8 @@ pub enum Generation { /// scenarios where pageservers might otherwise issue conflicting writes to /// remote storage impl Generation { + pub const MAX: Self = Self::Valid(u32::MAX); + /// Create a new Generation that represents a legacy key format with /// no generation suffix pub fn none() -> Self { diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 892e6c2758..012cb1a662 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -243,6 +243,19 @@ impl Client { Ok(()) } + pub async fn tenant_scan_remote_storage( + &self, + tenant_id: TenantId, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{tenant_id}/scan_remote_storage", + self.mgmt_api_endpoint + ); + let response = self.request(Method::GET, &uri, ()).await?; + let body = response.json().await.map_err(Error::ReceiveBody)?; + Ok(body) + } + pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> { let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint); self.request(Method::PUT, &uri, req).await?; diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 81508965b4..9a280c2e0c 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -19,6 +19,8 @@ use pageserver_api::models::LocationConfigListResponse; use pageserver_api::models::ShardParameters; use pageserver_api::models::TenantDetails; use pageserver_api::models::TenantLocationConfigResponse; +use pageserver_api::models::TenantScanRemoteStorageResponse; +use pageserver_api::models::TenantScanRemoteStorageShard; use pageserver_api::models::TenantShardLocation; use pageserver_api::models::TenantShardSplitRequest; use pageserver_api::models::TenantShardSplitResponse; @@ -29,6 +31,7 @@ use pageserver_api::models::{ }; use pageserver_api::shard::ShardCount; use pageserver_api::shard::TenantShardId; +use remote_storage::DownloadError; use remote_storage::GenericRemoteStorage; use remote_storage::TimeTravelError; use tenant_size_model::{SizeResult, StorageModel}; @@ -54,6 +57,9 @@ use crate::tenant::mgr::{ }; use crate::tenant::mgr::{TenantSlot, UpsertLocationError}; use crate::tenant::remote_timeline_client; +use crate::tenant::remote_timeline_client::download_index_part; +use crate::tenant::remote_timeline_client::list_remote_tenant_shards; +use crate::tenant::remote_timeline_client::list_remote_timelines; use crate::tenant::secondary::SecondaryController; use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; @@ -2035,6 +2041,79 @@ async fn secondary_upload_handler( json_response(StatusCode::OK, ()) } +async fn tenant_scan_remote_handler( + request: Request, + cancel: CancellationToken, +) -> Result, 
ApiError> { + let state = get_state(&request); + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + + let Some(remote_storage) = state.remote_storage.as_ref() else { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Remote storage not configured" + ))); + }; + + let mut response = TenantScanRemoteStorageResponse::default(); + + let (shards, _other_keys) = + list_remote_tenant_shards(remote_storage, tenant_id, cancel.clone()) + .await + .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; + + for tenant_shard_id in shards { + let (timeline_ids, _other_keys) = + list_remote_timelines(remote_storage, tenant_shard_id, cancel.clone()) + .await + .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; + + let mut generation = Generation::none(); + for timeline_id in timeline_ids { + match download_index_part( + remote_storage, + &tenant_shard_id, + &timeline_id, + Generation::MAX, + &cancel, + ) + .instrument(info_span!("download_index_part", + tenant_id=%tenant_shard_id.tenant_id, + shard_id=%tenant_shard_id.shard_slug(), + %timeline_id)) + .await + { + Ok((index_part, index_generation)) => { + tracing::info!("Found timeline {tenant_shard_id}/{timeline_id} metadata (gen {index_generation:?}, {} layers, {} consistent LSN)", + index_part.layer_metadata.len(), index_part.get_disk_consistent_lsn()); + generation = std::cmp::max(generation, index_generation); + } + Err(DownloadError::NotFound) => { + // This is normal for tenants that were created with multiple shards: they have an unsharded path + // containing the timeline's initdb tarball but no index. Otherwise it is a bit strange. + tracing::info!("Timeline path {tenant_shard_id}/{timeline_id} exists in remote storage but has no index, skipping"); + continue; + } + Err(e) => { + return Err(ApiError::InternalServerError(anyhow::anyhow!(e))); + } + }; + } + + response.shards.push(TenantScanRemoteStorageShard { + tenant_shard_id, + generation: generation.into(), + }); + } + + if response.shards.is_empty() { + return Err(ApiError::NotFound( + anyhow::anyhow!("No shards found for tenant ID {tenant_id}").into(), + )); + } + + json_response(StatusCode::OK, response) +} + async fn secondary_download_handler( request: Request, _cancel: CancellationToken, @@ -2431,6 +2510,9 @@ pub fn make_router( .post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| { api_handler(r, secondary_upload_handler) }) + .get("/v1/tenant/:tenant_id/scan_remote_storage", |r| { + api_handler(r, tenant_scan_remote_handler) + }) .put("/v1/disk_usage_eviction/run", |r| { api_handler(r, disk_usage_eviction_run) }) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 02ce65922e..cb3e36efb3 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -888,7 +888,7 @@ impl Tenant { #[instrument(skip_all)] pub(crate) async fn preload( - self: &Arc, + self: &Arc, remote_storage: &GenericRemoteStorage, cancel: CancellationToken, ) -> anyhow::Result { @@ -918,9 +918,13 @@ impl Tenant { Ok(TenantPreload { deleting, - timelines: self - .load_timeline_metadata(remote_timeline_ids, remote_storage, cancel) - .await?, + timelines: Self::load_timeline_metadata( + self, + remote_timeline_ids, + remote_storage, + cancel, + ) + .await?, }) } diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index d02f00adad..c0767345ca 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -243,7 +243,9 @@ use 
super::storage_layer::{Layer, LayerFileName, ResidentLayer}; use super::upload_queue::SetDeletedFlagProgress; use super::Generation; -pub(crate) use download::{is_temp_download_file, list_remote_timelines}; +pub(crate) use download::{ + download_index_part, is_temp_download_file, list_remote_tenant_shards, list_remote_timelines, +}; pub(crate) use index::LayerFileMetadata; // Occasional network issues and such can cause remote operations to fail, and @@ -472,7 +474,7 @@ impl RemoteTimelineClient { }, ); - let index_part = download::download_index_part( + let (index_part, _index_generation) = download::download_index_part( &self.storage_impl, &self.tenant_shard_id, &self.timeline_id, @@ -1716,6 +1718,11 @@ impl RemoteTimelineClient { } } +pub fn remote_tenant_path(tenant_shard_id: &TenantShardId) -> RemotePath { + let path = format!("tenants/{tenant_shard_id}"); + RemotePath::from_string(&path).expect("Failed to construct path") +} + pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath { let path = format!("tenants/{tenant_shard_id}/{TIMELINES_SEGMENT_NAME}"); RemotePath::from_string(&path).expect("Failed to construct path") diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 1852e4b4ff..250354ac20 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -5,6 +5,7 @@ use std::collections::HashSet; use std::future::Future; +use std::str::FromStr; use anyhow::{anyhow, Context}; use camino::{Utf8Path, Utf8PathBuf}; @@ -25,13 +26,13 @@ use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}; use crate::TEMP_FILE_SUFFIX; use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode, RemotePath}; use utils::crashsafe::path_with_suffix_extension; -use utils::id::TimelineId; +use utils::id::{TenantId, TimelineId}; use super::index::{IndexPart, LayerFileMetadata}; use super::{ parse_remote_index_path, remote_index_path, remote_initdb_archive_path, - remote_initdb_preserved_archive_path, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, - INITDB_PATH, + remote_initdb_preserved_archive_path, remote_tenant_path, FAILED_DOWNLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, INITDB_PATH, }; /// @@ -253,42 +254,31 @@ pub(crate) fn is_temp_download_file(path: &Utf8Path) -> bool { } } -/// List timelines of given tenant in remote storage -pub async fn list_remote_timelines( +async fn list_identifiers( storage: &GenericRemoteStorage, - tenant_shard_id: TenantShardId, + prefix: RemotePath, cancel: CancellationToken, -) -> anyhow::Result<(HashSet, HashSet)> { - let remote_path = remote_timelines_path(&tenant_shard_id).add_trailing_slash(); - - fail::fail_point!("storage-sync-list-remote-timelines", |_| { - anyhow::bail!("storage-sync-list-remote-timelines"); - }); - +) -> anyhow::Result<(HashSet, HashSet)> +where + T: FromStr + Eq + std::hash::Hash, +{ let listing = download_retry_forever( - || { - storage.list( - Some(&remote_path), - ListingMode::WithDelimiter, - None, - &cancel, - ) - }, - &format!("list timelines for {tenant_shard_id}"), + || storage.list(Some(&prefix), ListingMode::WithDelimiter, None, &cancel), + &format!("list identifiers in prefix {prefix}"), &cancel, ) .await?; - let mut timeline_ids = HashSet::new(); + let mut parsed_ids = HashSet::new(); let mut other_prefixes = HashSet::new(); - for timeline_remote_storage_key in listing.prefixes { - let object_name = 
timeline_remote_storage_key.object_name().ok_or_else(|| { - anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_shard_id}") + for id_remote_storage_key in listing.prefixes { + let object_name = id_remote_storage_key.object_name().ok_or_else(|| { + anyhow::anyhow!("failed to get object name for key {id_remote_storage_key}") })?; - match object_name.parse::() { - Ok(t) => timeline_ids.insert(t), + match object_name.parse::() { + Ok(t) => parsed_ids.insert(t), Err(_) => other_prefixes.insert(object_name.to_string()), }; } @@ -300,7 +290,31 @@ pub async fn list_remote_timelines( other_prefixes.insert(object_name.to_string()); } - Ok((timeline_ids, other_prefixes)) + Ok((parsed_ids, other_prefixes)) +} + +/// List shards of given tenant in remote storage +pub(crate) async fn list_remote_tenant_shards( + storage: &GenericRemoteStorage, + tenant_id: TenantId, + cancel: CancellationToken, +) -> anyhow::Result<(HashSet, HashSet)> { + let remote_path = remote_tenant_path(&TenantShardId::unsharded(tenant_id)); + list_identifiers::(storage, remote_path, cancel).await +} + +/// List timelines of given tenant shard in remote storage +pub async fn list_remote_timelines( + storage: &GenericRemoteStorage, + tenant_shard_id: TenantShardId, + cancel: CancellationToken, +) -> anyhow::Result<(HashSet, HashSet)> { + fail::fail_point!("storage-sync-list-remote-timelines", |_| { + anyhow::bail!("storage-sync-list-remote-timelines"); + }); + + let remote_path = remote_timelines_path(&tenant_shard_id).add_trailing_slash(); + list_identifiers::(storage, remote_path, cancel).await } async fn do_download_index_part( @@ -309,7 +323,7 @@ async fn do_download_index_part( timeline_id: &TimelineId, index_generation: Generation, cancel: &CancellationToken, -) -> Result { +) -> Result<(IndexPart, Generation), DownloadError> { let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation); let index_part_bytes = download_retry_forever( @@ -334,7 +348,7 @@ async fn do_download_index_part( .with_context(|| format!("deserialize index part file at {remote_path:?}")) .map_err(DownloadError::Other)?; - Ok(index_part) + Ok((index_part, index_generation)) } /// index_part.json objects are suffixed with a generation number, so we cannot @@ -343,13 +357,13 @@ async fn do_download_index_part( /// In this function we probe for the most recent index in a generation <= our current generation. /// See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md #[tracing::instrument(skip_all, fields(generation=?my_generation))] -pub(super) async fn download_index_part( +pub(crate) async fn download_index_part( storage: &GenericRemoteStorage, tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, my_generation: Generation, cancel: &CancellationToken, -) -> Result { +) -> Result<(IndexPart, Generation), DownloadError> { debug_assert_current_span_has_tenant_and_timeline_id(); if my_generation.is_none() { diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 2e83bbc5ed..09a25a5be0 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -522,6 +522,18 @@ async fn handle_tenant_drop(req: Request) -> Result, ApiErr json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?) 
} +async fn handle_tenant_import(req: Request) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + + let state = get_state(&req); + + json_response( + StatusCode::OK, + state.service.tenant_import(tenant_id).await?, + ) +} + async fn handle_tenants_dump(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; @@ -759,6 +771,13 @@ pub fn make_router( .post("/debug/v1/node/:node_id/drop", |r| { named_request_span(r, handle_node_drop, RequestName("debug_v1_node_drop")) }) + .post("/debug/v1/tenant/:tenant_id/import", |r| { + named_request_span( + r, + handle_tenant_import, + RequestName("debug_v1_tenant_import"), + ) + }) .get("/debug/v1/tenant", |r| { named_request_span(r, handle_tenants_dump, RequestName("debug_v1_tenant")) }) diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index 8237229d7b..0cea205599 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -1,13 +1,14 @@ use pageserver_api::{ models::{ LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress, - TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo, + TenantScanRemoteStorageResponse, TenantShardSplitRequest, TenantShardSplitResponse, + TimelineCreateRequest, TimelineInfo, }, shard::TenantShardId, }; use pageserver_client::mgmt_api::{Client, Result}; use reqwest::StatusCode; -use utils::id::{NodeId, TimelineId}; +use utils::id::{NodeId, TenantId, TimelineId}; /// Thin wrapper around [`pageserver_client::mgmt_api::Client`]. It allows the storage /// controller to collect metrics in a non-intrusive manner. @@ -88,6 +89,18 @@ impl PageserverClient { ) } + pub(crate) async fn tenant_scan_remote_storage( + &self, + tenant_id: TenantId, + ) -> Result { + measured_request!( + "tenant_scan_remote_storage", + crate::metrics::Method::Get, + &self.node_id_label, + self.inner.tenant_scan_remote_storage(tenant_id).await + ) + } + pub(crate) async fn tenant_secondary_download( &self, tenant_id: TenantShardId, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 952664e339..df1008b64e 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -110,6 +110,42 @@ struct ServiceState { delayed_reconcile_rx: tokio::sync::mpsc::Receiver, } +/// Transform an error from a pageserver into an error to return to callers of a storage +/// controller API. 
+fn passthrough_api_error(node: &Node, e: mgmt_api::Error) -> ApiError { + match e { + mgmt_api::Error::ReceiveErrorBody(str) => { + // Presume errors receiving body are connectivity/availability issues + ApiError::ResourceUnavailable( + format!("{node} error receiving error body: {str}").into(), + ) + } + mgmt_api::Error::ReceiveBody(str) => { + // Presume errors receiving body are connectivity/availability issues + ApiError::ResourceUnavailable(format!("{node} error receiving body: {str}").into()) + } + mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, msg) => { + ApiError::NotFound(anyhow::anyhow!(format!("{node}: {msg}")).into()) + } + mgmt_api::Error::ApiError(StatusCode::SERVICE_UNAVAILABLE, msg) => { + ApiError::ResourceUnavailable(format!("{node}: {msg}").into()) + } + mgmt_api::Error::ApiError(status @ StatusCode::UNAUTHORIZED, msg) + | mgmt_api::Error::ApiError(status @ StatusCode::FORBIDDEN, msg) => { + // Auth errors talking to a pageserver are not auth errors for the caller: they are + // internal server errors, showing that something is wrong with the pageserver or + // storage controller's auth configuration. + ApiError::InternalServerError(anyhow::anyhow!("{node} {status}: {msg}")) + } + mgmt_api::Error::ApiError(status, msg) => { + // Presume general case of pageserver API errors is that we tried to do something + // that can't be done right now. + ApiError::Conflict(format!("{node} {status}: {status} {msg}")) + } + mgmt_api::Error::Cancelled => ApiError::ShuttingDown, + } +} + impl ServiceState { fn new( nodes: HashMap, @@ -2519,17 +2555,7 @@ impl Service { client .timeline_create(tenant_shard_id, &create_req) .await - .map_err(|e| match e { - mgmt_api::Error::ApiError(status, msg) - if status == StatusCode::INTERNAL_SERVER_ERROR - || status == StatusCode::NOT_ACCEPTABLE => - { - // TODO: handle more error codes, e.g. 503 should be passed through. Make a general wrapper - // for pass-through API calls. - ApiError::InternalServerError(anyhow::anyhow!(msg)) - } - _ => ApiError::Conflict(format!("Failed to create timeline: {e}")), - }) + .map_err(|e| passthrough_api_error(&node, e)) } // Because the caller might not provide an explicit LSN, we must do the creation first on a single shard, and then @@ -3654,6 +3680,88 @@ impl Service { Ok(()) } + /// This is for debug/support only: assuming tenant data is already present in S3, we "create" a + /// tenant with a very high generation number so that it will see the existing data. + pub(crate) async fn tenant_import( + &self, + tenant_id: TenantId, + ) -> Result { + // Pick an arbitrary available pageserver to use for scanning the tenant in remote storage + let maybe_node = { + self.inner + .read() + .unwrap() + .nodes + .values() + .find(|n| n.is_available()) + .cloned() + }; + let Some(node) = maybe_node else { + return Err(ApiError::BadRequest(anyhow::anyhow!("No nodes available"))); + }; + + let client = PageserverClient::new( + node.get_id(), + node.base_url(), + self.config.jwt_token.as_deref(), + ); + + let scan_result = client + .tenant_scan_remote_storage(tenant_id) + .await + .map_err(|e| passthrough_api_error(&node, e))?; + + // A post-split tenant may contain a mixture of shard counts in remote storage: pick the highest count. 
+ let Some(shard_count) = scan_result + .shards + .iter() + .map(|s| s.tenant_shard_id.shard_count) + .max() + else { + return Err(ApiError::NotFound( + anyhow::anyhow!("No shards found").into(), + )); + }; + + // Ideally we would set each newly imported shard's generation independently, but for correctness it is sufficient + // to + let generation = scan_result + .shards + .iter() + .map(|s| s.generation) + .max() + .expect("We already validated >0 shards"); + + // FIXME: we have no way to recover the shard stripe size from contents of remote storage: this will + // only work if they were using the default stripe size. + let stripe_size = ShardParameters::DEFAULT_STRIPE_SIZE; + + let (response, waiters) = self + .do_tenant_create(TenantCreateRequest { + new_tenant_id: TenantShardId::unsharded(tenant_id), + generation, + + shard_parameters: ShardParameters { + count: shard_count, + stripe_size, + }, + placement_policy: Some(PlacementPolicy::Attached(0)), // No secondaries, for convenient debug/hacking + + // There is no way to know what the tenant's config was: revert to defaults + config: TenantConfig::default(), + }) + .await?; + + if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await { + // Since this is a debug/support operation, all kinds of weird issues are possible (e.g. this + // tenant doesn't exist in the control plane), so don't fail the request if it can't fully + // reconcile, as reconciliation includes notifying compute. + tracing::warn!(%tenant_id, "Reconcile not done yet while importing tenant ({e})"); + } + + Ok(response) + } + /// For debug/support: a full JSON dump of TenantShards. Returns a response so that /// we don't have to make TenantShard clonable in the return path. pub(crate) fn tenants_dump(&self) -> Result, ApiError> { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index fcd33bb66a..a94732a682 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1575,6 +1575,11 @@ class NeonCli(AbstractNeonCli): res.check_returncode() return tenant_id, timeline_id + def import_tenant(self, tenant_id: TenantId): + args = ["tenant", "import", "--tenant-id", str(tenant_id)] + res = self.raw_cli(args) + res.check_returncode() + def set_default(self, tenant_id: TenantId): """ Update default tenant for future operations that require tenant_id. 
@@ -2207,6 +2212,13 @@ class NeonStorageController(MetricsGetter): headers=self.headers(TokenScope.ADMIN), ) + def tenant_import(self, tenant_id: TenantId): + self.request( + "POST", + f"{self.env.storage_controller_api}/debug/v1/tenant/{tenant_id}/import", + headers=self.headers(TokenScope.ADMIN), + ) + def reconcile_all(self): r = self.request( "POST", diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index b4b23745f8..bc1f8776b3 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -26,6 +26,7 @@ from fixtures.pg_version import PgVersion from fixtures.remote_storage import RemoteStorageKind, s3_storage from fixtures.types import TenantId, TenantShardId, TimelineId from fixtures.utils import run_pg_bench_small, subprocess_capture, wait_until +from fixtures.workload import Workload from mypy_boto3_s3.type_defs import ( ObjectTypeDef, ) @@ -1256,3 +1257,85 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder): # Quiesce any background reconciliation before doing consistency check env.storage_controller.reconcile_until_idle(timeout_secs=10) env.storage_controller.consistency_check() + + +@pytest.mark.parametrize("remote_storage", [RemoteStorageKind.LOCAL_FS, s3_storage()]) +@pytest.mark.parametrize("shard_count", [None, 4]) +def test_tenant_import(neon_env_builder: NeonEnvBuilder, shard_count, remote_storage): + """ + Tenant import is a support/debug tool for recovering a tenant from remote storage + if we don't have any metadata for it in the storage controller. + """ + + # This test is parametrized on remote storage because it exercises the relatively rare + # code path of listing with a prefix that is not a directory name: this helps us notice + # quickly if local_fs or s3_bucket implementations diverge. + neon_env_builder.enable_pageserver_remote_storage(remote_storage) + + # Use multiple pageservers because some test helpers assume single sharded tenants + # if there is only one pageserver. 
+ neon_env_builder.num_pageservers = 2 + + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + tenant_id = env.initial_tenant + + # Create a second timeline to ensure that import finds both + timeline_a = env.initial_timeline + timeline_b = env.neon_cli.create_branch("branch_b", tenant_id=tenant_id) + + workload_a = Workload(env, tenant_id, timeline_a, branch_name="main") + workload_a.init() + + workload_b = Workload(env, tenant_id, timeline_b, branch_name="branch_b") + workload_b.init() + + # Write some data + workload_a.write_rows(72) + expect_rows_a = workload_a.expect_rows + workload_a.stop() + del workload_a + + # Bump generation to make sure generation recovery works properly + for pageserver in env.pageservers: + pageserver.stop() + pageserver.start() + + # Write some data in the higher generation into the other branch + workload_b.write_rows(107) + expect_rows_b = workload_b.expect_rows + workload_b.stop() + del workload_b + + # Detach from pageservers + env.storage_controller.tenant_policy_update( + tenant_id, + { + "placement": "Detached", + }, + ) + env.storage_controller.reconcile_until_idle(timeout_secs=10) + + # Force-drop it from the storage controller + env.storage_controller.request( + "POST", + f"{env.storage_controller_api}/debug/v1/tenant/{tenant_id}/drop", + headers=env.storage_controller.headers(TokenScope.ADMIN), + ) + + # Now import it again + env.neon_cli.import_tenant(tenant_id) + + # Check we found the shards + describe = env.storage_controller.tenant_describe(tenant_id) + literal_shard_count = 1 if shard_count is None else shard_count + assert len(describe["shards"]) == literal_shard_count + + # Check the data is still there: this implicitly proves that we recovered generation numbers + # properly, for the timeline which was written to after a generation bump. + for timeline, branch, expect_rows in [ + (timeline_a, "main", expect_rows_a), + (timeline_b, "branch_1", expect_rows_b), + ]: + workload = Workload(env, tenant_id, timeline, branch_name=branch) + workload.expect_rows = expect_rows + workload.validate() diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index d3f24cb06e..0ba0108651 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -132,7 +132,7 @@ def test_tenant_reattach(neon_env_builder: NeonEnvBuilder, mode: str): assert query_scalar(cur, "SELECT count(*) FROM t") == 100000 # Check that we had to retry the downloads - assert env.pageserver.log_contains(".*list timelines.*failed, will retry.*") + assert env.pageserver.log_contains(".*list identifiers.*failed, will retry.*") assert env.pageserver.log_contains(".*download.*failed, will retry.*") From 84914434e3fc63a26b817ba3fe8c2f0c8e545ea8 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 29 Apr 2024 09:59:22 +0100 Subject: [PATCH 0639/1571] storage controller: send startup compute notifications in background (#7495) ## Problem Previously, we try to send compute notifications in startup_reconcile before completing that function, with a time limit. Any notifications that don't happen within the time limit result in tenants having their `pending_compute_notification` flag set, which causes them to spawn a Reconciler next time the background reconciler loop runs. 
This causes two problems: - Spawning a lot of reconcilers after startup caused a spike in memory (this is addressed in https://github.com/neondatabase/neon/pull/7493) - After https://github.com/neondatabase/neon/pull/7493, spawning lots of reconcilers will block some other operations, e.g. a tenant creation might fail due to lack of reconciler semaphore units while the controller is busy running all the Reconcilers for its startup compute notifications. When the code was first written, ComputeHook didn't have internal ordering logic to ensure that notifications for a shard were sent in the right order. Since that was added in https://github.com/neondatabase/neon/pull/7088, we can use it to avoid waiting for notifications to complete in startup_reconcile. Related to: https://github.com/neondatabase/neon/issues/7460 ## Summary of changes - Add a `notify_background` method to ComputeHook. - Call this from startup_reconcile instead of doing notifications inline - Process completions from `notify_background` in `process_results`, and if a notification failed then set the `pending_compute_notification` flag on the shard. The result is that we will only spawn lots of Reconcilers if the compute notifications _fail_, not just because they take some significant amount of time. Test coverage for this case is in https://github.com/neondatabase/neon/pull/7475 --- storage_controller/src/compute_hook.rs | 158 +++++++++++++++++++------ storage_controller/src/service.rs | 158 ++++++++++--------------- 2 files changed, 183 insertions(+), 133 deletions(-) diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index 1ed8998713..44a156a5ec 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -3,11 +3,13 @@ use std::{collections::HashMap, time::Duration}; use control_plane::endpoint::{ComputeControlPlane, EndpointStatus}; use control_plane::local_env::LocalEnv; +use futures::StreamExt; use hyper::{Method, StatusCode}; use pageserver_api::shard::{ShardCount, ShardNumber, ShardStripeSize, TenantShardId}; use postgres_connection::parse_host_port; use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; +use tracing::{info_span, Instrument}; use utils::{ backoff::{self}, id::{NodeId, TenantId}, @@ -420,48 +422,37 @@ impl ComputeHook { .and_then(|x| x) } - /// Call this to notify the compute (postgres) tier of new pageservers to use - /// for a tenant. notify() is called by each shard individually, and this function - /// will decide whether an update to the tenant is sent. An update is sent on the - /// condition that: - /// - We know a pageserver for every shard. - /// - All the shards have the same shard_count (i.e. we are not mid-split) - /// - /// Cancellation token enables callers to drop out, e.g. if calling from a Reconciler - /// that is cancelled. - /// - /// This function is fallible, including in the case that the control plane is transiently - /// unavailable. A limited number of retries are done internally to efficiently hide short unavailability - /// periods, but we don't retry forever. The **caller** is responsible for handling failures and - /// ensuring that they eventually call again to ensure that the compute is eventually notified of - /// the proper pageserver nodes for a tenant. 
- #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), node_id))] - pub(super) async fn notify( + /// Synchronous phase: update the per-tenant state for the next intended notification + fn notify_prepare( &self, tenant_shard_id: TenantShardId, node_id: NodeId, stripe_size: ShardStripeSize, + ) -> MaybeSendResult { + let mut state_locked = self.state.lock().unwrap(); + + use std::collections::hash_map::Entry; + let tenant = match state_locked.entry(tenant_shard_id.tenant_id) { + Entry::Vacant(e) => e.insert(ComputeHookTenant::new( + tenant_shard_id, + stripe_size, + node_id, + )), + Entry::Occupied(e) => { + let tenant = e.into_mut(); + tenant.update(tenant_shard_id, stripe_size, node_id); + tenant + } + }; + tenant.maybe_send(tenant_shard_id.tenant_id, None) + } + + async fn notify_execute( + &self, + maybe_send_result: MaybeSendResult, + tenant_shard_id: TenantShardId, cancel: &CancellationToken, ) -> Result<(), NotifyError> { - let maybe_send_result = { - let mut state_locked = self.state.lock().unwrap(); - - use std::collections::hash_map::Entry; - let tenant = match state_locked.entry(tenant_shard_id.tenant_id) { - Entry::Vacant(e) => e.insert(ComputeHookTenant::new( - tenant_shard_id, - stripe_size, - node_id, - )), - Entry::Occupied(e) => { - let tenant = e.into_mut(); - tenant.update(tenant_shard_id, stripe_size, node_id); - tenant - } - }; - tenant.maybe_send(tenant_shard_id.tenant_id, None) - }; - // Process result: we may get an update to send, or we may have to wait for a lock // before trying again. let (request, mut send_lock_guard) = match maybe_send_result { @@ -469,7 +460,12 @@ impl ComputeHook { return Ok(()); } MaybeSendResult::AwaitLock(send_lock) => { - let send_locked = send_lock.lock_owned().await; + let send_locked = tokio::select! { + guard = send_lock.lock_owned() => {guard}, + _ = cancel.cancelled() => { + return Err(NotifyError::ShuttingDown) + } + }; // Lock order: maybe_send is called within the `[Self::state]` lock, and takes the send lock, but here // we have acquired the send lock and take `[Self::state]` lock. This is safe because maybe_send only uses @@ -508,6 +504,94 @@ impl ComputeHook { } result } + + /// Infallible synchronous fire-and-forget version of notify(), that sends its results to + /// a channel. Something should consume the channel and arrange to try notifying again + /// if something failed. + pub(super) fn notify_background( + self: &Arc, + notifications: Vec<(TenantShardId, NodeId, ShardStripeSize)>, + result_tx: tokio::sync::mpsc::Sender>, + cancel: &CancellationToken, + ) { + let mut maybe_sends = Vec::new(); + for (tenant_shard_id, node_id, stripe_size) in notifications { + let maybe_send_result = self.notify_prepare(tenant_shard_id, node_id, stripe_size); + maybe_sends.push((tenant_shard_id, maybe_send_result)) + } + + let this = self.clone(); + let cancel = cancel.clone(); + + tokio::task::spawn(async move { + // Construct an async stream of futures to invoke the compute notify function: we do this + // in order to subsequently use .buffered() on the stream to execute with bounded parallelism. The + // ComputeHook semaphore already limits concurrency, but this way we avoid constructing+polling lots of futures which + // would mostly just be waiting on that semaphore. 
+ let mut stream = futures::stream::iter(maybe_sends) + .map(|(tenant_shard_id, maybe_send_result)| { + let this = this.clone(); + let cancel = cancel.clone(); + + async move { + this + .notify_execute(maybe_send_result, tenant_shard_id, &cancel) + .await.map_err(|e| (tenant_shard_id, e)) + }.instrument(info_span!( + "notify_background", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug() + )) + }) + .buffered(API_CONCURRENCY); + + loop { + tokio::select! { + next = stream.next() => { + match next { + Some(r) => { + result_tx.send(r).await.ok(); + }, + None => { + tracing::info!("Finished sending background compute notifications"); + break; + } + } + }, + _ = cancel.cancelled() => { + tracing::info!("Shutdown while running background compute notifications"); + break; + } + }; + } + }); + } + + /// Call this to notify the compute (postgres) tier of new pageservers to use + /// for a tenant. notify() is called by each shard individually, and this function + /// will decide whether an update to the tenant is sent. An update is sent on the + /// condition that: + /// - We know a pageserver for every shard. + /// - All the shards have the same shard_count (i.e. we are not mid-split) + /// + /// Cancellation token enables callers to drop out, e.g. if calling from a Reconciler + /// that is cancelled. + /// + /// This function is fallible, including in the case that the control plane is transiently + /// unavailable. A limited number of retries are done internally to efficiently hide short unavailability + /// periods, but we don't retry forever. The **caller** is responsible for handling failures and + /// ensuring that they eventually call again to ensure that the compute is eventually notified of + /// the proper pageserver nodes for a tenant. + #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), node_id))] + pub(super) async fn notify( + &self, + tenant_shard_id: TenantShardId, + node_id: NodeId, + stripe_size: ShardStripeSize, + cancel: &CancellationToken, + ) -> Result<(), NotifyError> { + let maybe_send_result = self.notify_prepare(tenant_shard_id, node_id, stripe_size); + self.notify_execute(maybe_send_result, tenant_shard_id, cancel) + .await + } } #[cfg(test)] diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index df1008b64e..882562d99f 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -8,6 +8,7 @@ use std::{ }; use crate::{ + compute_hook::NotifyError, id_lock_map::IdLockMap, persistence::{AbortShardSplitStatus, TenantFilter}, reconciler::{ReconcileError, ReconcileUnits}, @@ -61,7 +62,7 @@ use utils::{ }; use crate::{ - compute_hook::{self, ComputeHook}, + compute_hook::ComputeHook, heartbeater::{Heartbeater, PageserverState}, node::{AvailabilityTransition, Node}, persistence::{split_state::SplitState, DatabaseError, Persistence, TenantShardPersistence}, @@ -332,7 +333,12 @@ impl Service { /// Called once on startup, this function attempts to contact all pageservers to build an up-to-date /// view of the world, and determine which pageservers are responsive. 
#[instrument(skip_all)] - async fn startup_reconcile(self: &Arc) { + async fn startup_reconcile( + self: &Arc, + bg_compute_notify_result_tx: tokio::sync::mpsc::Sender< + Result<(), (TenantShardId, NotifyError)>, + >, + ) { // For all tenant shards, a vector of observed states on nodes (where None means // indeterminate, same as in [`ObservedStateLocation`]) let mut observed: HashMap)>> = @@ -351,10 +357,6 @@ impl Service { .checked_add(STARTUP_RECONCILE_TIMEOUT / 2) .expect("Reconcile timeout is a modest constant"); - let compute_notify_deadline = start_at - .checked_add((STARTUP_RECONCILE_TIMEOUT / 4) * 3) - .expect("Reconcile timeout is a modest constant"); - // Accumulate a list of any tenant locations that ought to be detached let mut cleanup = Vec::new(); @@ -380,6 +382,7 @@ impl Service { let mut compute_notifications = Vec::new(); // Populate intent and observed states for all tenants, based on reported state on pageservers + tracing::info!("Populating tenant shards' states from initial pageserver scan..."); let shard_count = { let mut locked = self.inner.write().unwrap(); let (nodes, tenants, scheduler) = locked.parts_mut(); @@ -446,28 +449,27 @@ impl Service { // Emit compute hook notifications for all tenants which are already stably attached. Other tenants // will emit compute hook notifications when they reconcile. // - // Ordering: we must complete these notification attempts before doing any other reconciliation for the - // tenants named here, because otherwise our calls to notify() might race with more recent values - // generated by reconciliation. - let notify_failures = self - .compute_notify_many(compute_notifications, compute_notify_deadline) - .await; - - // Compute notify is fallible. If it fails here, do not delay overall startup: set the - // flag on these shards that they have a pending notification. - // Update tenant state for any that failed to do their initial compute notify, so that they'll retry later. - { - let mut locked = self.inner.write().unwrap(); - for tenant_shard_id in notify_failures.into_iter() { - if let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) { - shard.pending_compute_notification = true; - } - } - } + // Ordering: our calls to notify_background synchronously establish a relative order for these notifications vs. any later + // calls into the ComputeHook for the same tenant: we can leave these to run to completion in the background and any later + // calls will be correctly ordered wrt these. + // + // Concurrency: we call notify_background for all tenants, which will create O(N) tokio tasks, but almost all of them + // will just wait on the ComputeHook::API_CONCURRENCY semaphore immediately, so very cheap until they get that semaphore + // unit and start doing I/O. + tracing::info!( + "Sending {} compute notifications", + compute_notifications.len() + ); + self.compute_hook.notify_background( + compute_notifications, + bg_compute_notify_result_tx.clone(), + &self.cancel, + ); // Finally, now that the service is up and running, launch reconcile operations for any tenants // which require it: under normal circumstances this should only include tenants that were in some // transient state before we restarted, or any tenants whose compute hooks failed above. + tracing::info!("Checking for shards in need of reconciliation..."); let reconcile_tasks = self.reconcile_all(); // We will not wait for these reconciliation tasks to run here: we're now done with startup and // normal operations may proceed. 
@@ -508,6 +510,7 @@ impl Service { } } + tracing::info!("Sending initial heartbeats..."); let res = self .heartbeater .heartbeat(Arc::new(nodes_to_heartbeat)) @@ -544,6 +547,7 @@ impl Service { let mut node_list_futs = FuturesUnordered::new(); + tracing::info!("Scanning shards on {} nodes...", nodes.len()); for node in nodes.values() { node_list_futs.push({ async move { @@ -663,72 +667,6 @@ impl Service { } } - /// Used during [`Self::startup_reconcile`]: issue many concurrent compute notifications. - /// - /// Returns a set of any shards for which notifications where not acked within the deadline. - async fn compute_notify_many( - &self, - notifications: Vec<(TenantShardId, NodeId, ShardStripeSize)>, - deadline: Instant, - ) -> HashSet { - let attempt_shards = notifications.iter().map(|i| i.0).collect::>(); - let mut success_shards = HashSet::new(); - - // Construct an async stream of futures to invoke the compute notify function: we do this - // in order to subsequently use .buffered() on the stream to execute with bounded parallelism. - let mut stream = futures::stream::iter(notifications.into_iter()) - .map(|(tenant_shard_id, node_id, stripe_size)| { - let compute_hook = self.compute_hook.clone(); - let cancel = self.cancel.clone(); - async move { - if let Err(e) = compute_hook - .notify(tenant_shard_id, node_id, stripe_size, &cancel) - .await - { - tracing::error!( - %tenant_shard_id, - %node_id, - "Failed to notify compute on startup for shard: {e}" - ); - None - } else { - Some(tenant_shard_id) - } - } - }) - .buffered(compute_hook::API_CONCURRENCY); - - loop { - tokio::select! { - next = stream.next() => { - match next { - Some(Some(success_shard)) => { - // A notification succeeded - success_shards.insert(success_shard); - }, - Some(None) => { - // A notification that failed - }, - None => { - tracing::info!("Successfully sent all compute notifications"); - break; - } - } - }, - _ = tokio::time::sleep(deadline.duration_since(Instant::now())) => { - // Give up sending any that didn't succeed yet - tracing::info!("Reached deadline while sending compute notifications"); - break; - } - }; - } - - attempt_shards - .difference(&success_shards) - .cloned() - .collect() - } - /// Long running background task that periodically wakes up and looks for shards that need /// reconciliation. Reconciliation is fallible, so any reconciliation tasks that fail during /// e.g. a tenant create/attach/migrate must eventually be retried: this task is responsible @@ -887,23 +825,45 @@ impl Service { async fn process_results( &self, mut result_rx: tokio::sync::mpsc::UnboundedReceiver, + mut bg_compute_hook_result_rx: tokio::sync::mpsc::Receiver< + Result<(), (TenantShardId, NotifyError)>, + >, ) { loop { // Wait for the next result, or for cancellation - let result = tokio::select! { + tokio::select! 
{ r = result_rx.recv() => { match r { - Some(result) => {result}, + Some(result) => {self.process_result(result);}, None => {break;} } } + _ = async{ + match bg_compute_hook_result_rx.recv().await { + Some(result) => { + if let Err((tenant_shard_id, notify_error)) = result { + tracing::warn!("Marking shard {tenant_shard_id} for notification retry, due to error {notify_error}"); + let mut locked = self.inner.write().unwrap(); + if let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) { + shard.pending_compute_notification = true; + } + + } + }, + None => { + // This channel is dead, but we don't want to terminate the outer loop{}: just wait for shutdown + self.cancel.cancelled().await; + } + } + } => {}, _ = self.cancel.cancelled() => { break; } }; - - self.process_result(result); } + + // We should only fall through on shutdown + assert!(self.cancel.is_cancelled()); } async fn process_aborts( @@ -1064,6 +1024,10 @@ impl Service { let (startup_completion, startup_complete) = utils::completion::channel(); + // This channel is continuously consumed by process_results, so doesn't need to be very large. + let (bg_compute_notify_result_tx, bg_compute_notify_result_rx) = + tokio::sync::mpsc::channel(512); + let (delayed_reconcile_tx, delayed_reconcile_rx) = tokio::sync::mpsc::channel(MAX_DELAYED_RECONCILES); @@ -1101,7 +1065,9 @@ impl Service { tokio::task::spawn(async move { // Block shutdown until we're done (we must respect self.cancel) if let Ok(_gate) = result_task_this.gate.enter() { - result_task_this.process_results(result_rx).await + result_task_this + .process_results(result_rx, bg_compute_notify_result_rx) + .await } }); @@ -1143,7 +1109,7 @@ impl Service { return; }; - this.startup_reconcile().await; + this.startup_reconcile(bg_compute_notify_result_tx).await; drop(startup_completion); } }); From 24ce878039fbf7b45b18cbcf4c7617b779338d2e Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Mon, 29 Apr 2024 11:49:42 +0200 Subject: [PATCH 0640/1571] proxy: Exclude compute and retries (#7529) ## Problem Alerts fire if the connection to the compute is slow. ## Summary of changes Exclude compute connection time and retry back-off time from the reported latencies.
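For context on the mechanism the diff below introduces: the latency timer hands out an RAII "pause" guard while the proxy waits on the compute (or sleeps between retries); the guard's `Drop` adds the elapsed time to a per-cause accumulator, and that accumulated time is subtracted when the connection latency is reported. Below is a minimal, self-contained sketch of the pattern — the names and shapes are simplified stand-ins, not the proxy's actual `LatencyTimer`/`Waiting` types, and the real code reports into labelled metrics rather than returning a `Duration`:

```rust
use std::time::{Duration, Instant};

/// What the timer is currently waiting on (illustrative subset).
#[derive(Clone, Copy)]
enum Waiting {
    Client,
    Cplane,
    Compute,
    RetryTimeout,
}

#[derive(Default)]
struct Accumulated {
    client: Duration,
    cplane: Duration,
    compute: Duration,
    retry: Duration,
}

struct LatencyTimer {
    start: Instant,
    accumulated: Accumulated,
}

/// RAII guard: time between `pause()` and drop is charged to one bucket.
struct Pause<'a> {
    timer: &'a mut LatencyTimer,
    started: Instant,
    waiting: Waiting,
}

impl LatencyTimer {
    fn new() -> Self {
        Self { start: Instant::now(), accumulated: Accumulated::default() }
    }

    fn pause(&mut self, waiting: Waiting) -> Pause<'_> {
        Pause { started: Instant::now(), waiting, timer: self }
    }

    /// Total elapsed time minus everything that was excluded via `pause()`.
    fn excluding_all(&self) -> Duration {
        let a = &self.accumulated;
        let excluded = a.client + a.cplane + a.compute + a.retry;
        self.start.elapsed().saturating_sub(excluded)
    }
}

impl Drop for Pause<'_> {
    fn drop(&mut self) {
        let dur = self.started.elapsed();
        match self.waiting {
            Waiting::Client => self.timer.accumulated.client += dur,
            Waiting::Cplane => self.timer.accumulated.cplane += dur,
            Waiting::Compute => self.timer.accumulated.compute += dur,
            Waiting::RetryTimeout => self.timer.accumulated.retry += dur,
        }
    }
}

fn main() {
    let mut timer = LatencyTimer::new();
    {
        // Stand-in for the connect/retry-sleep calls that the diff wraps in pauses.
        let _pause = timer.pause(Waiting::Compute);
        std::thread::sleep(Duration::from_millis(20));
    }
    println!("latency excluding compute wait: {:?}", timer.excluding_all());
}
```

The diff applies exactly this idea at each wait point — around the raw TCP/TLS connect to the compute and around the retry back-off sleeps — so those spans land in the new `ClientCplaneCompute` and `ClientCplaneComputeRetry` exclusion buckets.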
--- proxy/src/compute.rs | 4 ++++ proxy/src/metrics.rs | 33 ++++++++++++++++++++++++++++++ proxy/src/proxy/connect_compute.rs | 4 ++++ proxy/src/proxy/wake_compute.rs | 4 ++++ proxy/src/serverless/backend.rs | 2 ++ 5 files changed, 47 insertions(+) diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 149a619316..44d85c2952 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -260,7 +260,9 @@ impl ConnCfg { aux: MetricsAuxInfo, timeout: Duration, ) -> Result { + let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); let (socket_addr, stream, host) = self.connect_raw(timeout).await?; + drop(pause); let tls_connector = native_tls::TlsConnector::builder() .danger_accept_invalid_certs(allow_self_signed_compute) @@ -270,7 +272,9 @@ impl ConnCfg { let tls = MakeTlsConnect::::make_tls_connect(&mut mk_tls, host)?; // connect_raw() will not use TLS if sslmode is "disable" + let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); let (client, connection) = self.0.connect_raw(stream, tls).await?; + drop(pause); tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id())); let stream = connection.stream.into_inner(); diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 530350008c..c129ece059 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -284,6 +284,8 @@ pub struct ComputeConnectionLatencyGroup { pub enum LatencyExclusions { Client, ClientAndCplane, + ClientCplaneCompute, + ClientCplaneComputeRetry, } #[derive(FixedCardinalityLabel, Copy, Clone)] @@ -352,6 +354,7 @@ pub enum Waiting { Cplane, Client, Compute, + RetryTimeout, } #[derive(Default)] @@ -359,6 +362,7 @@ struct Accumulated { cplane: time::Duration, client: time::Duration, compute: time::Duration, + retry: time::Duration, } pub struct LatencyTimer { @@ -421,6 +425,7 @@ impl Drop for LatencyTimerPause<'_> { Waiting::Cplane => self.timer.accumulated.cplane += dur, Waiting::Client => self.timer.accumulated.client += dur, Waiting::Compute => self.timer.accumulated.compute += dur, + Waiting::RetryTimeout => self.timer.accumulated.retry += dur, } } } @@ -464,6 +469,34 @@ impl Drop for LatencyTimer { }, duration.saturating_sub(accumulated_total).as_secs_f64(), ); + + // Exclude client cplane, compue communication from the accumulated time. + let accumulated_total = + self.accumulated.client + self.accumulated.cplane + self.accumulated.compute; + metric.observe( + ComputeConnectionLatencyGroup { + protocol: self.protocol, + cold_start_info: self.cold_start_info, + outcome: self.outcome, + excluded: LatencyExclusions::ClientCplaneCompute, + }, + duration.saturating_sub(accumulated_total).as_secs_f64(), + ); + + // Exclude client cplane, compue, retry communication from the accumulated time. 
+ let accumulated_total = self.accumulated.client + + self.accumulated.cplane + + self.accumulated.compute + + self.accumulated.retry; + metric.observe( + ComputeConnectionLatencyGroup { + protocol: self.protocol, + cold_start_info: self.cold_start_info, + outcome: self.outcome, + excluded: LatencyExclusions::ClientCplaneComputeRetry, + }, + duration.saturating_sub(accumulated_total).as_secs_f64(), + ); } } diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 8a220aaa0c..f561085588 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -194,6 +194,10 @@ where let wait_duration = retry_after(num_retries, connect_to_compute_retry_config); num_retries += 1; + let pause = ctx + .latency_timer + .pause(crate::metrics::Waiting::RetryTimeout); time::sleep(wait_duration).await; + drop(pause); } } diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index cfedf0e98a..cb9a293413 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -54,7 +54,11 @@ pub async fn wake_compute( let wait_duration = retry_after(*num_retries, config); *num_retries += 1; + let pause = ctx + .latency_timer + .pause(crate::metrics::Waiting::RetryTimeout); tokio::time::sleep(wait_duration).await; + drop(pause); } } diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index b91c0e62ed..c89ebc3251 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -179,7 +179,9 @@ impl ConnectMechanism for TokioMechanism { .dbname(&self.conn_info.dbname) .connect_timeout(timeout); + let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); let (client, connection) = config.connect(tokio_postgres::NoTls).await?; + drop(pause); tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id())); Ok(poll_client( From 2226acef7ca147276dab2bc3eea94958fbc03036 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 29 Apr 2024 13:16:00 +0100 Subject: [PATCH 0641/1571] s3_scrubber: add `tenant-snapshot` (#7444) ## Problem Downloading tenant data for analysis/debug with `aws s3 cp` works well for small tenants, but for larger tenants it is unlikely that one ends up with an index that matches layer files, due to the time taken to download. ## Summary of changes - Add a `tenant-snapshot` command to the scrubber, which reads timeline indices and then downloads the layers referenced in the index, even if they were deleted. The result is a snapshot of the tenant's remote storage state that should be usable when imported (#7399 ). 
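One mechanism in the diff below deserves a note up front: when a tenant has been shard-split, a child shard's `index_part.json` can still reference layer files that live under an ancestor shard's prefix. The snapshot command therefore partitions each index into layers the shard owns and layers inherited from ancestors, and downloads the ancestor layers once at the end. Here is a rough sketch of that partitioning step, using simplified stand-in types rather than the real `LayerFileName`/`IndexLayerMetadata`:

```rust
use std::collections::HashMap;

// Simplified stand-ins for the scrubber's types; illustrative only.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
struct ShardIndex {
    shard_number: u8,
    shard_count: u8,
}

#[derive(Clone, PartialEq, Eq, Hash, Debug)]
struct LayerName(String);

#[derive(Clone, PartialEq, Debug)]
struct LayerMeta {
    shard: ShardIndex,
    file_size: u64,
}

/// Split one timeline index into layers owned by this shard and layers that
/// still live under an ancestor shard (possible after a shard split). Ancestor
/// layers are grouped per ancestor so each one is downloaded exactly once.
fn partition_layers(
    this_shard: ShardIndex,
    index: HashMap<LayerName, LayerMeta>,
) -> (
    Vec<(LayerName, LayerMeta)>,
    HashMap<ShardIndex, HashMap<LayerName, LayerMeta>>,
) {
    let mut owned = Vec::new();
    let mut ancestors: HashMap<ShardIndex, HashMap<LayerName, LayerMeta>> = HashMap::new();
    for (name, meta) in index {
        if meta.shard.shard_count != this_shard.shard_count {
            ancestors.entry(meta.shard).or_default().insert(name, meta);
        } else {
            owned.push((name, meta));
        }
    }
    (owned, ancestors)
}

fn main() {
    let this_shard = ShardIndex { shard_number: 0, shard_count: 4 };
    let mut index = HashMap::new();
    index.insert(
        LayerName("layer-written-after-split".into()),
        LayerMeta { shard: this_shard, file_size: 1024 },
    );
    index.insert(
        LayerName("layer-inherited-from-parent".into()),
        LayerMeta {
            shard: ShardIndex { shard_number: 0, shard_count: 0 },
            file_size: 2048,
        },
    );
    let (owned, ancestors) = partition_layers(this_shard, index);
    println!("owned: {owned:?}");
    println!("ancestor-owned: {ancestors:?}");
}
```

The other notable detail in the implementation is that the download path also consults S3 object versions, so a layer that has since been deleted (e.g. by GC) can still be fetched into the snapshot.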
--- Cargo.lock | 1 + s3_scrubber/Cargo.toml | 1 + s3_scrubber/src/lib.rs | 71 +++++- s3_scrubber/src/main.rs | 21 ++ s3_scrubber/src/metadata_stream.rs | 58 ++++- s3_scrubber/src/tenant_snapshot.rs | 293 ++++++++++++++++++++++++ test_runner/fixtures/neon_fixtures.py | 31 ++- test_runner/fixtures/remote_storage.py | 8 +- test_runner/fixtures/types.py | 6 +- test_runner/regress/test_s3_scrubber.py | 111 +++++++++ 10 files changed, 586 insertions(+), 15 deletions(-) create mode 100644 s3_scrubber/src/tenant_snapshot.rs create mode 100644 test_runner/regress/test_s3_scrubber.py diff --git a/Cargo.lock b/Cargo.lock index 85a59ec0ed..a130988409 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5085,6 +5085,7 @@ dependencies = [ "aws-smithy-async", "bincode", "bytes", + "camino", "chrono", "clap", "crc32c", diff --git a/s3_scrubber/Cargo.toml b/s3_scrubber/Cargo.toml index 4d136472e0..0ee9112010 100644 --- a/s3_scrubber/Cargo.toml +++ b/s3_scrubber/Cargo.toml @@ -25,6 +25,7 @@ async-stream.workspace = true tokio-stream.workspace = true futures-util.workspace = true itertools.workspace = true +camino.workspace = true tokio = { workspace = true, features = ["macros", "rt-multi-thread"] } chrono = { workspace = true, default-features = false, features = ["clock", "serde"] } diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs index d2842877d0..90d58a3bc2 100644 --- a/s3_scrubber/src/lib.rs +++ b/s3_scrubber/src/lib.rs @@ -5,6 +5,7 @@ pub mod cloud_admin_api; pub mod garbage; pub mod metadata_stream; pub mod scan_metadata; +pub mod tenant_snapshot; use std::env; use std::fmt::Display; @@ -23,17 +24,18 @@ use aws_sdk_s3::config::{AsyncSleep, Region, SharedAsyncSleep}; use aws_sdk_s3::{Client, Config}; use aws_smithy_async::rt::sleep::TokioSleep; +use camino::{Utf8Path, Utf8PathBuf}; use clap::ValueEnum; use pageserver::tenant::TENANTS_SEGMENT_NAME; use pageserver_api::shard::TenantShardId; use reqwest::Url; use serde::{Deserialize, Serialize}; -use std::io::IsTerminal; use tokio::io::AsyncReadExt; use tracing::error; use tracing_appender::non_blocking::WorkerGuard; use tracing_subscriber::{fmt, prelude::*, EnvFilter}; -use utils::id::TimelineId; +use utils::fs_ext; +use utils::id::{TenantId, TimelineId}; const MAX_RETRIES: usize = 20; const CLOUD_ADMIN_API_TOKEN_ENV_VAR: &str = "CLOUD_ADMIN_API_TOKEN"; @@ -147,6 +149,23 @@ impl RootTarget { self.tenants_root().with_sub_segment(&tenant_id.to_string()) } + pub(crate) fn tenant_shards_prefix(&self, tenant_id: &TenantId) -> S3Target { + // Only pageserver remote storage contains tenant-shards + assert!(matches!(self, Self::Pageserver(_))); + let Self::Pageserver(root) = self else { + panic!(); + }; + + S3Target { + bucket_name: root.bucket_name.clone(), + prefix_in_bucket: format!( + "{}/{TENANTS_SEGMENT_NAME}/{tenant_id}", + root.prefix_in_bucket + ), + delimiter: root.delimiter.clone(), + } + } + pub fn timelines_root(&self, tenant_id: &TenantShardId) -> S3Target { match self { Self::Pageserver(_) => self.tenant_root(tenant_id).with_sub_segment("timelines"), @@ -240,7 +259,6 @@ pub fn init_logging(file_name: &str) -> WorkerGuard { .with_ansi(false) .with_writer(file_writer); let stderr_logs = fmt::Layer::new() - .with_ansi(std::io::stderr().is_terminal()) .with_target(false) .with_writer(std::io::stderr); tracing_subscriber::registry() @@ -396,3 +414,50 @@ async fn download_object_with_retries( anyhow::bail!("Failed to download objects with key {key} {MAX_RETRIES} times") } + +async fn download_object_to_file( + s3_client: &Client, + bucket_name: 
&str, + key: &str, + version_id: Option<&str>, + local_path: &Utf8Path, +) -> anyhow::Result<()> { + let tmp_path = Utf8PathBuf::from(format!("{local_path}.tmp")); + for _ in 0..MAX_RETRIES { + tokio::fs::remove_file(&tmp_path) + .await + .or_else(fs_ext::ignore_not_found)?; + + let mut file = tokio::fs::File::create(&tmp_path) + .await + .context("Opening output file")?; + + let request = s3_client.get_object().bucket(bucket_name).key(key); + + let request = match version_id { + Some(version_id) => request.version_id(version_id), + None => request, + }; + + let response_stream = match request.send().await { + Ok(response) => response, + Err(e) => { + error!( + "Failed to download object for key {key} version {}: {e:#}", + version_id.unwrap_or("") + ); + tokio::time::sleep(Duration::from_secs(1)).await; + continue; + } + }; + + let mut read_stream = response_stream.body.into_async_read(); + + tokio::io::copy(&mut read_stream, &mut file).await?; + + tokio::fs::rename(&tmp_path, local_path).await?; + return Ok(()); + } + + anyhow::bail!("Failed to download objects with key {key} {MAX_RETRIES} times") +} diff --git a/s3_scrubber/src/main.rs b/s3_scrubber/src/main.rs index 957213856b..88ba9bfa61 100644 --- a/s3_scrubber/src/main.rs +++ b/s3_scrubber/src/main.rs @@ -1,9 +1,12 @@ +use camino::Utf8PathBuf; use pageserver_api::shard::TenantShardId; use s3_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; use s3_scrubber::scan_metadata::scan_metadata; +use s3_scrubber::tenant_snapshot::SnapshotDownloader; use s3_scrubber::{init_logging, BucketConfig, ConsoleConfig, NodeKind, TraversingDepth}; use clap::{Parser, Subcommand}; +use utils::id::TenantId; #[derive(Parser)] #[command(author, version, about, long_about = None)] @@ -38,6 +41,14 @@ enum Command { #[arg(long = "tenant-id", num_args = 0..)] tenant_ids: Vec, }, + TenantSnapshot { + #[arg(long = "tenant-id")] + tenant_id: TenantId, + #[arg(long = "concurrency", short = 'j', default_value_t = 8)] + concurrency: usize, + #[arg(short, long)] + output_path: Utf8PathBuf, + }, } #[tokio::main] @@ -50,6 +61,7 @@ async fn main() -> anyhow::Result<()> { Command::ScanMetadata { .. } => "scan", Command::FindGarbage { .. } => "find-garbage", Command::PurgeGarbage { .. } => "purge-garbage", + Command::TenantSnapshot { .. 
} => "tenant-snapshot", }; let _guard = init_logging(&format!( "{}_{}_{}_{}.log", @@ -102,5 +114,14 @@ async fn main() -> anyhow::Result<()> { Command::PurgeGarbage { input_path, mode } => { purge_garbage(input_path, mode, !cli.delete).await } + Command::TenantSnapshot { + tenant_id, + output_path, + concurrency, + } => { + let downloader = + SnapshotDownloader::new(bucket_config, tenant_id, output_path, concurrency)?; + downloader.download().await + } } } diff --git a/s3_scrubber/src/metadata_stream.rs b/s3_scrubber/src/metadata_stream.rs index 073f37f319..b192e0be2e 100644 --- a/s3_scrubber/src/metadata_stream.rs +++ b/s3_scrubber/src/metadata_stream.rs @@ -5,7 +5,7 @@ use tokio_stream::Stream; use crate::{list_objects_with_retries, RootTarget, S3Target, TenantShardTimelineId}; use pageserver_api::shard::TenantShardId; -use utils::id::TimelineId; +use utils::id::{TenantId, TimelineId}; /// Given an S3 bucket, output a stream of TenantIds discovered via ListObjectsv2 pub fn stream_tenants<'a>( @@ -45,6 +45,62 @@ pub fn stream_tenants<'a>( } } +pub async fn stream_tenant_shards<'a>( + s3_client: &'a Client, + target: &'a RootTarget, + tenant_id: TenantId, +) -> anyhow::Result> + 'a> { + let mut tenant_shard_ids: Vec> = Vec::new(); + let mut continuation_token = None; + let shards_target = target.tenant_shards_prefix(&tenant_id); + + loop { + tracing::info!("Listing in {}", shards_target.prefix_in_bucket); + let fetch_response = + list_objects_with_retries(s3_client, &shards_target, continuation_token.clone()).await; + let fetch_response = match fetch_response { + Err(e) => { + tenant_shard_ids.push(Err(e)); + break; + } + Ok(r) => r, + }; + + let new_entry_ids = fetch_response + .common_prefixes() + .iter() + .filter_map(|prefix| prefix.prefix()) + .filter_map(|prefix| -> Option<&str> { + prefix + .strip_prefix(&target.tenants_root().prefix_in_bucket)? + .strip_suffix('/') + }) + .map(|entry_id_str| { + let first_part = entry_id_str.split('/').next().unwrap(); + + first_part + .parse::() + .with_context(|| format!("Incorrect entry id str: {first_part}")) + }); + + for i in new_entry_ids { + tenant_shard_ids.push(i); + } + + match fetch_response.next_continuation_token { + Some(new_token) => continuation_token = Some(new_token), + None => break, + } + } + + Ok(stream! { + for i in tenant_shard_ids { + let id = i?; + yield Ok(id); + } + }) +} + /// Given a TenantShardId, output a stream of the timelines within that tenant, discovered /// using ListObjectsv2. The listing is done before the stream is built, so that this /// function can be used to generate concurrency on a stream using buffer_unordered. 
diff --git a/s3_scrubber/src/tenant_snapshot.rs b/s3_scrubber/src/tenant_snapshot.rs new file mode 100644 index 0000000000..4eccad381b --- /dev/null +++ b/s3_scrubber/src/tenant_snapshot.rs @@ -0,0 +1,293 @@ +use std::collections::HashMap; +use std::sync::Arc; + +use crate::checks::{list_timeline_blobs, BlobDataParseResult, S3TimelineBlobData}; +use crate::metadata_stream::{stream_tenant_shards, stream_tenant_timelines}; +use crate::{ + download_object_to_file, init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId, +}; +use anyhow::Context; +use async_stream::stream; +use aws_sdk_s3::Client; +use camino::Utf8PathBuf; +use futures::{StreamExt, TryStreamExt}; +use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata; +use pageserver::tenant::storage_layer::LayerFileName; +use pageserver::tenant::IndexPart; +use pageserver_api::shard::TenantShardId; +use utils::generation::Generation; +use utils::id::TenantId; + +pub struct SnapshotDownloader { + s3_client: Arc, + s3_root: RootTarget, + bucket_config: BucketConfig, + tenant_id: TenantId, + output_path: Utf8PathBuf, + concurrency: usize, +} + +impl SnapshotDownloader { + pub fn new( + bucket_config: BucketConfig, + tenant_id: TenantId, + output_path: Utf8PathBuf, + concurrency: usize, + ) -> anyhow::Result { + let (s3_client, s3_root) = init_remote(bucket_config.clone(), NodeKind::Pageserver)?; + Ok(Self { + s3_client, + s3_root, + bucket_config, + tenant_id, + output_path, + concurrency, + }) + } + + async fn download_layer( + &self, + ttid: TenantShardTimelineId, + layer_name: LayerFileName, + layer_metadata: IndexLayerMetadata, + ) -> anyhow::Result<(LayerFileName, IndexLayerMetadata)> { + // Note this is local as in a local copy of S3 data, not local as in the pageserver's local format. They use + // different layer names (remote-style has the generation suffix) + let local_path = self.output_path.join(format!( + "{}/timelines/{}/{}{}", + ttid.tenant_shard_id, + ttid.timeline_id, + layer_name.file_name(), + layer_metadata.generation.get_suffix() + )); + + // We should only be called for layers that are owned by the input TTID + assert_eq!(layer_metadata.shard, ttid.tenant_shard_id.to_index()); + + // Assumption: we always write layer files atomically, and layer files are immutable. Therefore if the file + // already exists on local disk, we assume it is fully correct and skip it. + if tokio::fs::try_exists(&local_path).await? { + tracing::debug!("{} already exists", local_path); + return Ok((layer_name, layer_metadata)); + } else { + tracing::debug!("{} requires download...", local_path); + + let timeline_root = self.s3_root.timeline_root(&ttid); + let remote_layer_path = format!( + "{}{}{}", + timeline_root.prefix_in_bucket, + layer_name.file_name(), + layer_metadata.generation.get_suffix() + ); + + // List versions: the object might be deleted. 
+ let versions = self + .s3_client + .list_object_versions() + .bucket(self.bucket_config.bucket.clone()) + .prefix(&remote_layer_path) + .send() + .await?; + let Some(version) = versions.versions.as_ref().and_then(|v| v.first()) else { + return Err(anyhow::anyhow!("No versions found for {remote_layer_path}")); + }; + download_object_to_file( + &self.s3_client, + &self.bucket_config.bucket, + &remote_layer_path, + version.version_id.as_deref(), + &local_path, + ) + .await?; + + tracing::debug!("Downloaded successfully to {local_path}"); + } + + Ok((layer_name, layer_metadata)) + } + + /// Download many layers belonging to the same TTID, with some concurrency + async fn download_layers( + &self, + ttid: TenantShardTimelineId, + layers: Vec<(LayerFileName, IndexLayerMetadata)>, + ) -> anyhow::Result<()> { + let layer_count = layers.len(); + tracing::info!("Downloading {} layers for timeline {ttid}...", layer_count); + let layers_stream = stream! { + for (layer_name, layer_metadata) in layers { + yield self.download_layer(ttid, layer_name, layer_metadata); + } + }; + + tokio::fs::create_dir_all(self.output_path.join(format!( + "{}/timelines/{}", + ttid.tenant_shard_id, ttid.timeline_id + ))) + .await?; + + let layer_results = layers_stream.buffered(self.concurrency); + let mut layer_results = std::pin::pin!(layer_results); + + let mut err = None; + let mut download_count = 0; + while let Some(i) = layer_results.next().await { + download_count += 1; + match i { + Ok((layer_name, layer_metadata)) => { + tracing::info!( + "[{download_count}/{layer_count}] OK: {} bytes {ttid} {}", + layer_metadata.file_size, + layer_name.file_name() + ); + } + Err(e) => { + // Warn and continue: we will download what we can + tracing::warn!("Download error: {e}"); + err = Some(e); + } + } + } + if let Some(e) = err { + tracing::warn!("Some errors occurred downloading {ttid} layers, last error: {e}"); + Err(e) + } else { + Ok(()) + } + } + + async fn download_timeline( + &self, + ttid: TenantShardTimelineId, + index_part: IndexPart, + index_part_generation: Generation, + ancestor_layers: &mut HashMap< + TenantShardTimelineId, + HashMap, + >, + ) -> anyhow::Result<()> { + let index_bytes = serde_json::to_string(&index_part).unwrap(); + + let layers = index_part + .layer_metadata + .into_iter() + .filter_map(|(layer_name, layer_metadata)| { + if layer_metadata.shard.shard_count != ttid.tenant_shard_id.shard_count { + // Accumulate ancestor layers for later download + let ancestor_ttid = TenantShardTimelineId::new( + TenantShardId { + tenant_id: ttid.tenant_shard_id.tenant_id, + shard_number: layer_metadata.shard.shard_number, + shard_count: layer_metadata.shard.shard_count, + }, + ttid.timeline_id, + ); + let ancestor_ttid_layers = ancestor_layers.entry(ancestor_ttid).or_default(); + use std::collections::hash_map::Entry; + match ancestor_ttid_layers.entry(layer_name) { + Entry::Occupied(entry) => { + // Descendent shards that reference a layer from an ancestor should always have matching metadata, + // as their siblings, because it is read atomically during a shard split. 
+ assert_eq!(entry.get(), &layer_metadata); + } + Entry::Vacant(entry) => { + entry.insert(layer_metadata); + } + } + None + } else { + Some((layer_name, layer_metadata)) + } + }) + .collect(); + + let download_result = self.download_layers(ttid, layers).await; + + // Write index last, once all the layers it references are downloaded + let local_index_path = self.output_path.join(format!( + "{}/timelines/{}/index_part.json{}", + ttid.tenant_shard_id, + ttid.timeline_id, + index_part_generation.get_suffix() + )); + tokio::fs::write(&local_index_path, index_bytes) + .await + .context("writing index")?; + + download_result + } + + pub async fn download(&self) -> anyhow::Result<()> { + let (s3_client, target) = init_remote(self.bucket_config.clone(), NodeKind::Pageserver)?; + + // Generate a stream of TenantShardId + let shards = stream_tenant_shards(&s3_client, &target, self.tenant_id).await?; + let shards: Vec = shards.try_collect().await?; + + // Only read from shards that have the highest count: avoids redundantly downloading + // from ancestor shards. + let Some(shard_count) = shards.iter().map(|s| s.shard_count).max() else { + anyhow::bail!("No shards found"); + }; + + // We will build a collection of layers in anccestor shards to download (this will only + // happen if this tenant has been split at some point) + let mut ancestor_layers: HashMap< + TenantShardTimelineId, + HashMap, + > = Default::default(); + + for shard in shards.into_iter().filter(|s| s.shard_count == shard_count) { + // Generate a stream of TenantTimelineId + let timelines = stream_tenant_timelines(&s3_client, &self.s3_root, shard).await?; + + // Generate a stream of S3TimelineBlobData + async fn load_timeline_index( + s3_client: &Client, + target: &RootTarget, + ttid: TenantShardTimelineId, + ) -> anyhow::Result<(TenantShardTimelineId, S3TimelineBlobData)> { + let data = list_timeline_blobs(s3_client, ttid, target).await?; + Ok((ttid, data)) + } + let timelines = timelines.map_ok(|ttid| load_timeline_index(&s3_client, &target, ttid)); + let mut timelines = std::pin::pin!(timelines.try_buffered(8)); + + while let Some(i) = timelines.next().await { + let (ttid, data) = i?; + match data.blob_data { + BlobDataParseResult::Parsed { + index_part, + index_part_generation, + s3_layers: _, + } => { + self.download_timeline( + ttid, + index_part, + index_part_generation, + &mut ancestor_layers, + ) + .await + .context("Downloading timeline")?; + } + BlobDataParseResult::Relic => {} + BlobDataParseResult::Incorrect(_) => { + tracing::error!("Bad metadata in timeline {ttid}"); + } + }; + } + } + + for (ttid, layers) in ancestor_layers.into_iter() { + tracing::info!( + "Downloading {} layers from ancvestor timeline {ttid}...", + layers.len() + ); + + self.download_layers(ttid, layers.into_iter().collect()) + .await?; + } + + Ok(()) + } +} diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index a94732a682..07db355d98 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2310,20 +2310,24 @@ class NeonPageserver(PgProtocol): # The entries in the list are regular experessions. 
self.allowed_errors: List[str] = list(DEFAULT_PAGESERVER_ALLOWED_ERRORS) - def timeline_dir(self, tenant_id: TenantId, timeline_id: Optional[TimelineId] = None) -> Path: + def timeline_dir( + self, + tenant_shard_id: Union[TenantId, TenantShardId], + timeline_id: Optional[TimelineId] = None, + ) -> Path: """Get a timeline directory's path based on the repo directory of the test environment""" if timeline_id is None: - return self.tenant_dir(tenant_id) / "timelines" - return self.tenant_dir(tenant_id) / "timelines" / str(timeline_id) + return self.tenant_dir(tenant_shard_id) / "timelines" + return self.tenant_dir(tenant_shard_id) / "timelines" / str(timeline_id) def tenant_dir( self, - tenant_id: Optional[TenantId] = None, + tenant_shard_id: Optional[Union[TenantId, TenantShardId]] = None, ) -> Path: """Get a tenant directory's path based on the repo directory of the test environment""" - if tenant_id is None: + if tenant_shard_id is None: return self.workdir / "tenants" - return self.workdir / "tenants" / str(tenant_id) + return self.workdir / "tenants" / str(tenant_shard_id) def start( self, @@ -2510,8 +2514,10 @@ class NeonPageserver(PgProtocol): client = self.http_client() return client.tenant_location_conf(tenant_id, config, **kwargs) - def read_tenant_location_conf(self, tenant_id: TenantId) -> dict[str, Any]: - path = self.tenant_dir(tenant_id) / "config-v1" + def read_tenant_location_conf( + self, tenant_shard_id: Union[TenantId, TenantShardId] + ) -> dict[str, Any]: + path = self.tenant_dir(tenant_shard_id) / "config-v1" log.info(f"Reading location conf from {path}") bytes = open(path, "r").read() try: @@ -3715,7 +3721,7 @@ class S3Scrubber: log.warning(f"Scrub environment: {env}") log.warning(f"Output at: {output_path}") - raise RuntimeError("Remote storage scrub failed") + raise RuntimeError(f"Scrubber failed while running {args}") assert stdout is not None return stdout @@ -3730,6 +3736,13 @@ class S3Scrubber: log.error(stdout) raise + def tenant_snapshot(self, tenant_id: TenantId, output_path: Path): + stdout = self.scrubber_cli( + ["tenant-snapshot", "--tenant-id", str(tenant_id), "--output-path", str(output_path)], + timeout=30, + ) + log.info(f"tenant-snapshot output: {stdout}") + def _get_test_dir(request: FixtureRequest, top_output_dir: Path, prefix: str) -> Path: """Compute the path to a working directory for an individual test.""" diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index 60591d8d46..83f9f26837 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -252,8 +252,11 @@ class S3Storage: log.info(f"deleted {cnt} objects from remote storage") + def tenants_path(self) -> str: + return f"{self.prefix_in_bucket}/tenants" + def tenant_path(self, tenant_id: TenantId) -> str: - return f"{self.prefix_in_bucket}/tenants/{tenant_id}" + return f"{self.tenants_path()}/{tenant_id}" def heatmap_key(self, tenant_id: TenantId) -> str: return f"{self.tenant_path(tenant_id)}/{TENANT_HEATMAP_FILE_NAME}" @@ -262,6 +265,9 @@ class S3Storage: r = self.client.get_object(Bucket=self.bucket_name, Key=self.heatmap_key(tenant_id)) return json.loads(r["Body"].read().decode("utf-8")) + def mock_remote_tenant_path(self, tenant_id: TenantId): + assert self.real is False + RemoteStorage = Union[LocalFsStorage, S3Storage] diff --git a/test_runner/fixtures/types.py b/test_runner/fixtures/types.py index 80c9b9ce9a..b5458b5c26 100644 --- a/test_runner/fixtures/types.py +++ b/test_runner/fixtures/types.py @@ 
-156,7 +156,11 @@ class TenantShardId: raise ValueError(f"Invalid TenantShardId '{input}'") def __str__(self): - return f"{self.tenant_id}-{self.shard_number:02x}{self.shard_count:02x}" + if self.shard_count > 0: + return f"{self.tenant_id}-{self.shard_number:02x}{self.shard_count:02x}" + else: + # Unsharded case: equivalent of Rust TenantShardId::unsharded(tenant_id) + return str(self.tenant_id) def __repr__(self): return self.__str__() diff --git a/test_runner/regress/test_s3_scrubber.py b/test_runner/regress/test_s3_scrubber.py new file mode 100644 index 0000000000..018c1637d0 --- /dev/null +++ b/test_runner/regress/test_s3_scrubber.py @@ -0,0 +1,111 @@ +import os +import shutil +from typing import Optional + +import pytest +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + S3Scrubber, +) +from fixtures.remote_storage import S3Storage, s3_storage +from fixtures.types import TenantShardId +from fixtures.workload import Workload + + +@pytest.mark.parametrize("shard_count", [None, 4]) +def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]): + """ + Test the `tenant-snapshot` subcommand, which grabs data from remote storage + + This is only a support/debug tool, but worth testing to ensure the tool does not regress. + """ + + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.num_pageservers = shard_count if shard_count is not None else 1 + + env = neon_env_builder.init_start() + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + branch = "main" + + # Do some work + workload = Workload(env, tenant_id, timeline_id, branch) + workload.init() + + # Multiple write/flush passes to generate multiple layers + for _n in range(0, 3): + workload.write_rows(128) + + # Do some more work after a restart, so that we have multiple generations + for pageserver in env.pageservers: + pageserver.stop() + pageserver.start() + + for _n in range(0, 3): + workload.write_rows(128) + + # If we're doing multiple shards, split: this is important to exercise + # the scrubber's ability to understand the references from child shards to parent shard's layers + if shard_count is not None: + tenant_shard_ids = env.storage_controller.tenant_shard_split( + tenant_id, shard_count=shard_count + ) + + # Write after shard split: this will result in shards containing a mixture of owned + # and parent layers in their index. 
+ workload.write_rows(128) + else: + tenant_shard_ids = [TenantShardId(tenant_id, 0, 0)] + + output_path = neon_env_builder.test_output_dir / "snapshot" + os.makedirs(output_path) + + scrubber = S3Scrubber(neon_env_builder) + scrubber.tenant_snapshot(tenant_id, output_path) + + assert len(os.listdir(output_path)) > 0 + + workload.stop() + + # Stop pageservers + for pageserver in env.pageservers: + pageserver.stop() + + # Drop all shards' local storage + for tenant_shard_id in tenant_shard_ids: + pageserver = env.get_tenant_pageserver(tenant_shard_id) + shutil.rmtree(pageserver.timeline_dir(tenant_shard_id, timeline_id)) + + # Replace remote storage contents with the snapshot we downloaded + assert isinstance(env.pageserver_remote_storage, S3Storage) + + remote_tenant_path = env.pageserver_remote_storage.tenant_path(tenant_id) + + # Delete current remote storage contents + bucket = env.pageserver_remote_storage.bucket_name + remote_client = env.pageserver_remote_storage.client + deleted = 0 + for object in remote_client.list_objects_v2(Bucket=bucket, Prefix=remote_tenant_path)[ + "Contents" + ]: + key = object["Key"] + remote_client.delete_object(Key=key, Bucket=bucket) + deleted += 1 + assert deleted > 0 + + # Upload from snapshot + for root, _dirs, files in os.walk(output_path): + for file in files: + full_local_path = os.path.join(root, file) + full_remote_path = ( + env.pageserver_remote_storage.tenants_path() + + "/" + + full_local_path.removeprefix(f"{output_path}/") + ) + remote_client.upload_file(full_local_path, bucket, full_remote_path) + + for pageserver in env.pageservers: + pageserver.start() + + # Check we can read everything + workload.validate() From 90cadfa986327d6ae29bfef32a6a60d67f19c845 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Mon, 29 Apr 2024 14:26:21 +0200 Subject: [PATCH 0642/1571] proxy: Adjust retry wake compute (#7537) ## Problem Right now we always do retry wake compute. ## Summary of changes Create a list of errors when we could avoid needless retries. --- proxy/src/proxy/connect_compute.rs | 9 +++++++- proxy/src/proxy/retry.rs | 34 ++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index f561085588..da6223209f 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -133,10 +133,17 @@ where error!(error = ?err, "could not connect to compute node"); - let node_info = if !node_info.cached() { + let node_info = if !node_info.cached() || !err.should_retry_database_address() { // If we just recieved this from cplane and dodn't get it from cache, we shouldn't retry. // Do not need to retrieve a new node_info, just return the old one. 
if !err.should_retry(num_retries, connect_to_compute_retry_config) { + Metrics::get().proxy.retries_metric.observe( + RetriesMetricGroup { + outcome: ConnectOutcome::Failed, + retry_type, + }, + num_retries.into(), + ); return Err(err.into()); } node_info diff --git a/proxy/src/proxy/retry.rs b/proxy/src/proxy/retry.rs index 082e06caa3..36a05ba190 100644 --- a/proxy/src/proxy/retry.rs +++ b/proxy/src/proxy/retry.rs @@ -10,6 +10,9 @@ pub trait ShouldRetry { err => err.could_retry(), } } + fn should_retry_database_address(&self) -> bool { + true + } } impl ShouldRetry for io::Error { @@ -33,6 +36,21 @@ impl ShouldRetry for tokio_postgres::error::DbError { | &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION, ) } + fn should_retry_database_address(&self) -> bool { + use tokio_postgres::error::SqlState; + // Here are errors that happens after the user successfully authenticated to the database. + // TODO: there are pgbouncer errors that should be retried, but they are not listed here. + !matches!( + self.code(), + &SqlState::TOO_MANY_CONNECTIONS + | &SqlState::OUT_OF_MEMORY + | &SqlState::SYNTAX_ERROR + | &SqlState::T_R_SERIALIZATION_FAILURE + | &SqlState::INVALID_CATALOG_NAME + | &SqlState::INVALID_SCHEMA_NAME + | &SqlState::INVALID_PARAMETER_VALUE + ) + } } impl ShouldRetry for tokio_postgres::Error { @@ -45,6 +63,15 @@ impl ShouldRetry for tokio_postgres::Error { false } } + fn should_retry_database_address(&self) -> bool { + if let Some(io_err) = self.source().and_then(|x| x.downcast_ref()) { + io::Error::should_retry_database_address(io_err) + } else if let Some(db_err) = self.source().and_then(|x| x.downcast_ref()) { + tokio_postgres::error::DbError::should_retry_database_address(db_err) + } else { + true + } + } } impl ShouldRetry for compute::ConnectionError { @@ -55,6 +82,13 @@ impl ShouldRetry for compute::ConnectionError { _ => false, } } + fn should_retry_database_address(&self) -> bool { + match self { + compute::ConnectionError::Postgres(err) => err.should_retry_database_address(), + compute::ConnectionError::CouldNotConnect(err) => err.should_retry_database_address(), + _ => true, + } + } } pub fn retry_after(num_retries: u32, config: RetryConfig) -> time::Duration { From 1684bbf16255a5cffd06ca03d9abe1124745b964 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Mon, 29 Apr 2024 15:22:13 +0200 Subject: [PATCH 0643/1571] proxy: Create disconnect events (#7535) ## Problem It's not possible to get the duration of the session from proxy events. ## Summary of changes * Added a separate events folder in s3, to record disconnect events. * Disconnect events are exactly the same as normal events, but also have `disconnect_timestamp` field not empty. * @oruen suggested to fill it with the same information as the original events to avoid potentially heavy joins. --- proxy/src/bin/pg_sni_router.rs | 2 +- proxy/src/context.rs | 33 ++++++++-- proxy/src/context/parquet.rs | 104 +++++++++++++++++++++--------- proxy/src/proxy.rs | 4 +- proxy/src/serverless/websocket.rs | 4 +- 5 files changed, 102 insertions(+), 45 deletions(-) diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 7a693002a8..fb16b76567 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -279,7 +279,7 @@ async fn handle_client( // doesn't yet matter as pg-sni-router doesn't report analytics logs ctx.set_success(); - ctx.log(); + ctx.log_connect(); // Starting from here we only proxy the client's traffic. 
info!("performing the proxy pass..."); diff --git a/proxy/src/context.rs b/proxy/src/context.rs index 17b82c08aa..dfd3ef108e 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -20,7 +20,8 @@ use self::parquet::RequestData; pub mod parquet; -static LOG_CHAN: OnceCell> = OnceCell::new(); +pub static LOG_CHAN: OnceCell> = OnceCell::new(); +pub static LOG_CHAN_DISCONNECT: OnceCell> = OnceCell::new(); /// Context data for a single request to connect to a database. /// @@ -49,9 +50,12 @@ pub struct RequestMonitoring { // extra // This sender is here to keep the request monitoring channel open while requests are taking place. sender: Option>, + // This sender is only used to log the length of session in case of success. + disconnect_sender: Option>, pub latency_timer: LatencyTimer, // Whether proxy decided that it's not a valid endpoint end rejected it before going to cplane. rejected: Option, + disconnect_timestamp: Option>, } #[derive(Clone, Debug)] @@ -100,7 +104,9 @@ impl RequestMonitoring { cold_start_info: ColdStartInfo::Unknown, sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()), + disconnect_sender: LOG_CHAN_DISCONNECT.get().and_then(|tx| tx.upgrade()), latency_timer: LatencyTimer::new(protocol), + disconnect_timestamp: None, } } @@ -190,11 +196,7 @@ impl RequestMonitoring { self.success = true; } - pub fn log(self) {} -} - -impl Drop for RequestMonitoring { - fn drop(&mut self) { + pub fn log_connect(&mut self) { let outcome = if self.success { ConnectOutcome::Success } else { @@ -226,4 +228,23 @@ impl Drop for RequestMonitoring { let _: Result<(), _> = tx.send(RequestData::from(&*self)); } } + + fn log_disconnect(&mut self) { + // If we are here, it's guaranteed that the user successfully connected to the endpoint. + // Here we log the length of the session. + self.disconnect_timestamp = Some(Utc::now()); + if let Some(tx) = self.disconnect_sender.take() { + let _: Result<(), _> = tx.send(RequestData::from(&*self)); + } + } +} + +impl Drop for RequestMonitoring { + fn drop(&mut self) { + if self.sender.is_some() { + self.log_connect(); + } else { + self.log_disconnect(); + } + } } diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 9600321937..8104fe6087 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -19,7 +19,10 @@ use tokio_util::sync::CancellationToken; use tracing::{debug, info, Span}; use utils::backoff; -use crate::config::{remote_storage_from_toml, OptRemoteStorageConfig}; +use crate::{ + config::{remote_storage_from_toml, OptRemoteStorageConfig}, + context::LOG_CHAN_DISCONNECT, +}; use super::{RequestMonitoring, LOG_CHAN}; @@ -31,6 +34,9 @@ pub struct ParquetUploadArgs { #[clap(long, default_value = "{}", value_parser = remote_storage_from_toml)] parquet_upload_remote_storage: OptRemoteStorageConfig, + #[clap(long, default_value = "{}", value_parser = remote_storage_from_toml)] + parquet_upload_disconnect_events_remote_storage: OptRemoteStorageConfig, + /// How many rows to include in a row group #[clap(long, default_value_t = 8192)] parquet_upload_row_group_size: usize, @@ -91,6 +97,8 @@ pub struct RequestData { /// Tracks time from session start (HTTP request/libpq TCP handshake) /// Through to success/failure duration_us: u64, + /// If the session was successful after the disconnect, will be created one more event with filled `disconnect_timestamp`. 
+ disconnect_timestamp: Option, } impl From<&RequestMonitoring> for RequestData { @@ -120,6 +128,7 @@ impl From<&RequestMonitoring> for RequestData { .elapsed() .unwrap_or_default() .as_micros() as u64, // 584 millenia... good enough + disconnect_timestamp: value.disconnect_timestamp.map(|x| x.naive_utc()), } } } @@ -141,8 +150,9 @@ pub async fn worker( LOG_CHAN.set(tx.downgrade()).unwrap(); // setup row stream that will close on cancellation + let cancellation_token2 = cancellation_token.clone(); tokio::spawn(async move { - cancellation_token.cancelled().await; + cancellation_token2.cancelled().await; // dropping this sender will cause the channel to close only once // all the remaining inflight requests have been completed. drop(tx); @@ -167,9 +177,38 @@ pub async fn worker( test_remote_failures: 0, }; - worker_inner(storage, rx, parquet_config).await + // TODO(anna): consider moving this to a separate function. + if let Some(disconnect_events_storage_config) = + config.parquet_upload_disconnect_events_remote_storage + { + let (tx_disconnect, mut rx_disconnect) = mpsc::unbounded_channel(); + LOG_CHAN_DISCONNECT.set(tx_disconnect.downgrade()).unwrap(); + + // setup row stream that will close on cancellation + tokio::spawn(async move { + cancellation_token.cancelled().await; + // dropping this sender will cause the channel to close only once + // all the remaining inflight requests have been completed. + drop(tx_disconnect); + }); + let rx_disconnect = futures::stream::poll_fn(move |cx| rx_disconnect.poll_recv(cx)); + let rx_disconnect = rx_disconnect.map(RequestData::from); + + let storage_disconnect = + GenericRemoteStorage::from_config(&disconnect_events_storage_config) + .context("remote storage for disconnect events init")?; + let parquet_config_disconnect = parquet_config.clone(); + tokio::try_join!( + worker_inner(storage, rx, parquet_config), + worker_inner(storage_disconnect, rx_disconnect, parquet_config_disconnect) + ) + .map(|_| ()) + } else { + worker_inner(storage, rx, parquet_config).await + } } +#[derive(Clone, Debug)] struct ParquetConfig { propeties: WriterPropertiesPtr, rows_per_group: usize, @@ -452,6 +491,7 @@ mod tests { success: rng.gen(), cold_start_info: "no", duration_us: rng.gen_range(0..30_000_000), + disconnect_timestamp: None, } } @@ -520,15 +560,15 @@ mod tests { assert_eq!( file_stats, [ - (1314385, 3, 6000), - (1314378, 3, 6000), - (1314438, 3, 6000), - (1314395, 3, 6000), - (1314525, 3, 6000), - (1314367, 3, 6000), - (1314159, 3, 6000), - (1314395, 3, 6000), - (438352, 1, 2000) + (1315008, 3, 6000), + (1315001, 3, 6000), + (1315061, 3, 6000), + (1315018, 3, 6000), + (1315148, 3, 6000), + (1314990, 3, 6000), + (1314782, 3, 6000), + (1315018, 3, 6000), + (438575, 1, 2000) ] ); @@ -558,11 +598,11 @@ mod tests { assert_eq!( file_stats, [ - (1220633, 5, 10000), - (1226783, 5, 10000), - (1228577, 5, 10000), - (1227939, 5, 10000), - (1219217, 5, 10000) + (1221738, 5, 10000), + (1227888, 5, 10000), + (1229682, 5, 10000), + (1229044, 5, 10000), + (1220322, 5, 10000) ] ); @@ -594,11 +634,11 @@ mod tests { assert_eq!( file_stats, [ - (1206280, 5, 10000), - (1206011, 5, 10000), - (1206304, 5, 10000), - (1206292, 5, 10000), - (1206547, 5, 10000) + (1207385, 5, 10000), + (1207116, 5, 10000), + (1207409, 5, 10000), + (1207397, 5, 10000), + (1207652, 5, 10000) ] ); @@ -623,15 +663,15 @@ mod tests { assert_eq!( file_stats, [ - (1314385, 3, 6000), - (1314378, 3, 6000), - (1314438, 3, 6000), - (1314395, 3, 6000), - (1314525, 3, 6000), - (1314367, 3, 6000), - (1314159, 3, 
6000), - (1314395, 3, 6000), - (438352, 1, 2000) + (1315008, 3, 6000), + (1315001, 3, 6000), + (1315061, 3, 6000), + (1315018, 3, 6000), + (1315148, 3, 6000), + (1314990, 3, 6000), + (1314782, 3, 6000), + (1315018, 3, 6000), + (438575, 1, 2000) ] ); @@ -668,7 +708,7 @@ mod tests { // files are smaller than the size threshold, but they took too long to fill so were flushed early assert_eq!( file_stats, - [(658823, 2, 3001), (658537, 2, 3000), (658333, 2, 2999)] + [(659240, 2, 3001), (658954, 2, 3000), (658750, 2, 2999)] ); tmpdir.close().unwrap(); diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index ddae6536fb..33d73eb675 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -132,16 +132,14 @@ pub async fn task_main( Err(e) => { // todo: log and push to ctx the error kind ctx.set_error_kind(e.get_error_kind()); - ctx.log(); error!(parent: &span, "per-client task finished with an error: {e:#}"); } Ok(None) => { ctx.set_success(); - ctx.log(); } Ok(Some(p)) => { ctx.set_success(); - ctx.log(); + ctx.log_connect(); match p.proxy_pass().instrument(span.clone()).await { Ok(()) => {} Err(e) => { diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index eddd278b7d..b6cd85af73 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -156,17 +156,15 @@ pub async fn serve_websocket( Err(e) => { // todo: log and push to ctx the error kind ctx.set_error_kind(e.get_error_kind()); - ctx.log(); Err(e.into()) } Ok(None) => { ctx.set_success(); - ctx.log(); Ok(()) } Ok(Some(p)) => { ctx.set_success(); - ctx.log(); + ctx.log_connect(); p.proxy_pass().await } } From 1f417af9fd7e43de192dcd536d1ff0bab5b85f80 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Mon, 29 Apr 2024 17:26:35 +0100 Subject: [PATCH 0644/1571] pagserver: use vectored read path in benchmarks (#7498) ## Problem Benchmarks don't use the vectored read path. ## Summary of changes * Update the benchmarks to use the vectored read path for both singular and vectored gets. 
* Disable validation for the benchmarks --- .github/workflows/build_and_test.yml | 4 ++++ control_plane/src/local_env.rs | 2 ++ control_plane/src/pageserver.rs | 7 +++++++ test_runner/fixtures/neon_fixtures.py | 7 +++++++ test_runner/regress/test_compatibility.py | 3 ++- 5 files changed, 22 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 65b573663a..606564f209 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -478,6 +478,7 @@ jobs: PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring PAGESERVER_GET_VECTORED_IMPL: vectored PAGESERVER_GET_IMPL: vectored + PAGESERVER_VALIDATE_VEC_GET: true # Temporary disable this step until we figure out why it's so flaky # Ref https://github.com/neondatabase/neon/issues/4540 @@ -557,6 +558,9 @@ jobs: PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}" PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring + PAGESERVER_GET_VECTORED_IMPL: vectored + PAGESERVER_GET_IMPL: vectored + PAGESERVER_VALIDATE_VEC_GET: false # XXX: no coverage data handling here, since benchmarks are run on release builds, # while coverage is currently collected for the debug ones diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 2168d4b944..8cbda528a7 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -130,6 +130,7 @@ pub struct PageServerConf { pub(crate) virtual_file_io_engine: Option, pub(crate) get_vectored_impl: Option, pub(crate) get_impl: Option, + pub(crate) validate_vectored_get: Option, } impl Default for PageServerConf { @@ -143,6 +144,7 @@ impl Default for PageServerConf { virtual_file_io_engine: None, get_vectored_impl: None, get_impl: None, + validate_vectored_get: None, } } } diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 0699e47866..52accc5890 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -93,6 +93,7 @@ impl PageServerNode { virtual_file_io_engine, get_vectored_impl, get_impl, + validate_vectored_get, } = &self.conf; let id = format!("id={}", id); @@ -117,6 +118,11 @@ impl PageServerNode { } else { String::new() }; + let validate_vectored_get = if let Some(validate_vectored_get) = validate_vectored_get { + format!("validate_vectored_get={validate_vectored_get}") + } else { + String::new() + }; let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url()); @@ -131,6 +137,7 @@ impl PageServerNode { virtual_file_io_engine, get_vectored_impl, get_impl, + validate_vectored_get, ]; if let Some(control_plane_api) = &self.env.control_plane_api { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 07db355d98..abe2718a49 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -512,6 +512,11 @@ class NeonEnvBuilder: self.pageserver_get_impl = "vectored" log.debug('Overriding pageserver get_impl config to "vectored"') + self.pageserver_validate_vectored_get: Optional[bool] = None + if (validate := os.getenv("PAGESERVER_VALIDATE_VEC_GET")) is not None: + self.pageserver_validate_vectored_get = bool(validate) + log.debug(f'Overriding pageserver validate_vectored_get config to "{validate}"') + assert test_name.startswith( "test_" ), "Unexpectedly instantiated from outside a test function" @@ -1085,6 +1090,8 @@ class NeonEnv: 
ps_cfg["get_vectored_impl"] = config.pageserver_get_vectored_impl if config.pageserver_get_impl is not None: ps_cfg["get_impl"] = config.pageserver_get_impl + if config.pageserver_validate_vectored_get is not None: + ps_cfg["validate_vectored_get"] = config.pageserver_validate_vectored_get # Create a corresponding NeonPageserver object self.pageservers.append( diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 2a371eae72..e1ccb3e0c6 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -228,8 +228,9 @@ def test_forward_compatibility( try: # Previous version neon_local and pageserver are not aware # of the new config. - # TODO: remove this once the code reaches main + # TODO: remove these once the previous version of neon local supports them neon_env_builder.pageserver_get_impl = None + neon_env_builder.pageserver_validate_vectored_get = None neon_env_builder.num_safekeepers = 3 neon_local_binpath = neon_env_builder.neon_binpath From 89cae64e38a68045b1f748d5b15d5cd607c9958a Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Mon, 29 Apr 2024 12:33:01 -0400 Subject: [PATCH 0645/1571] chore(vm-image): specify sql exporter listen port (#7526) Extracted from https://github.com/neondatabase/neon/pull/7514, 9399 is the default port. We want to specify it b/c we will start a second sql exporter for autoscaling agent soon. Signed-off-by: Alex Chi Z --- vm-image-spec.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index c760744491..061ff38722 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -16,7 +16,7 @@ commands: - name: sql-exporter user: nobody sysvInitAction: respawn - shell: '/bin/sql_exporter -config.file=/etc/sql_exporter.yml' + shell: '/bin/sql_exporter -config.file=/etc/sql_exporter.yml -web.listen-address=:9399' shutdownHook: | su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10' files: From af7cca494930bad73ddd3f8eb21289000ddeb3ac Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Mon, 29 Apr 2024 17:35:08 +0100 Subject: [PATCH 0646/1571] pageserver: tweak vec get validation for ancestor lsn wait (#7533) ## Problem Sequential get runs after vectored get, so it is possible for the later to time out while waiting for its ancestor's Lsn to become ready and for the former to succeed (it essentially has a doubled wait time). ## Summary of Changes Relax the validation to allow for such rare cases. --- pageserver/src/tenant/timeline.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index c10adf4c22..108acd3925 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1149,6 +1149,11 @@ impl Timeline { panic!(concat!("Sequential get failed with {}, but vectored get did not", " - keyspace={:?} lsn={}"), seq_err, keyspace, lsn) }, + (Ok(_), Err(GetVectoredError::GetReadyAncestorError(GetReadyAncestorError::AncestorLsnTimeout(_)))) => { + // Sequential get runs after vectored get, so it is possible for the later + // to time out while waiting for its ancestor's Lsn to become ready and for the + // former to succeed (it essentially has a doubled wait time). 
+ }, (Ok(_), Err(vec_err)) => { panic!(concat!("Vectored get failed with {}, but sequential get did not", " - keyspace={:?} lsn={}"), From cddafc79e1d528e35cd9d2b5308aea2138790af1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 29 Apr 2024 19:02:53 +0200 Subject: [PATCH 0647/1571] Update azure_* crates to 0.19 (#7539) Updates the four azure SDK crates used by remote_storage to 0.19. --- Cargo.lock | 65 +++++++++++++++++++++++++++--------------------------- Cargo.toml | 8 +++---- 2 files changed, 36 insertions(+), 37 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a130988409..de548bb2de 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -722,9 +722,9 @@ dependencies = [ [[package]] name = "azure_core" -version = "0.18.0" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6218987c374650fdad0b476bfc675729762c28dfb35f58608a38a2b1ea337dd" +checksum = "70fd680c0d0424a518229b1150922f92653ba2ac933aa000abc8bf1ca08105f7" dependencies = [ "async-trait", "base64 0.21.1", @@ -752,9 +752,9 @@ dependencies = [ [[package]] name = "azure_identity" -version = "0.18.1" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e1eacc4f7fb2a73d57c39139d0fc3aed78435606055779ddaef4b43cdf919a8" +checksum = "a6d2060f5b2e1c664026ca4edd561306c473be887c1f7a81f10bf06f9b71c63f" dependencies = [ "async-lock", "async-trait", @@ -772,9 +772,9 @@ dependencies = [ [[package]] name = "azure_storage" -version = "0.18.0" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ade8f2653e408de88b9eafec9f48c3c26b94026375e88adbd34523a7dd9795a1" +checksum = "15d3da73bfa09350e1bd6ae2a260806fcf90048c7e78cd2d8f88be60b19a7266" dependencies = [ "RustyXML", "async-lock", @@ -791,9 +791,9 @@ dependencies = [ [[package]] name = "azure_storage_blobs" -version = "0.18.0" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "025701c7cc5b523100f0f3b2b01723564ec5a86c03236521c06826337047e872" +checksum = "149c21834a4105d761e3dd33d91c2a3064acc05a3c978848ea8089102ae45c94" dependencies = [ "RustyXML", "azure_core", @@ -812,9 +812,9 @@ dependencies = [ [[package]] name = "azure_svc_blobstorage" -version = "0.18.0" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76051e5bb67cea1055abe5e530a0878feac7e0ab4cbbcb4a6adc953a58993389" +checksum = "88c888b7bf522d5405218b8613bf0fae7ddaae6ef3bf4ad42ae005993c96ab8b" dependencies = [ "azure_core", "bytes", @@ -2763,9 +2763,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.63" +version = "0.3.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f37a4a5928311ac501dee68b3c7613a1037d0edb30c8e5427bd832d55d1b790" +checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" dependencies = [ "wasm-bindgen", ] @@ -6413,11 +6413,10 @@ dependencies = [ [[package]] name = "tracing" -version = "0.1.37" +version = "0.1.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8" +checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" dependencies = [ - "cfg-if", "log", "pin-project-lite", "tracing-attributes", @@ -6437,9 +6436,9 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.24" +version = "0.1.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74" +checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", @@ -6448,9 +6447,9 @@ dependencies = [ [[package]] name = "tracing-core" -version = "0.1.31" +version = "0.1.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0955b8137a1df6f1a2e9a37d8a6656291ff0297c1a97c24e0d8425fe2312f79a" +checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" dependencies = [ "once_cell", "valuable", @@ -6905,9 +6904,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.86" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5bba0e8cb82ba49ff4e229459ff22a191bbe9a1cb3a341610c9c33efc27ddf73" +checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8" dependencies = [ "cfg-if", "wasm-bindgen-macro", @@ -6915,9 +6914,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.86" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19b04bc93f9d6bdee709f6bd2118f57dd6679cf1176a1af464fca3ab0d66d8fb" +checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da" dependencies = [ "bumpalo", "log", @@ -6930,9 +6929,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.36" +version = "0.4.42" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d1985d03709c53167ce907ff394f5316aa22cb4e12761295c5dc57dacb6297e" +checksum = "76bc14366121efc8dbb487ab05bcc9d346b3b5ec0eaa76e46594cabbe51762c0" dependencies = [ "cfg-if", "js-sys", @@ -6942,9 +6941,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.86" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14d6b024f1a526bb0234f52840389927257beb670610081360e5a03c5df9c258" +checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -6952,9 +6951,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.86" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e128beba882dd1eb6200e1dc92ae6c5dbaa4311aa7bb211ca035779e5efc39f8" +checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", @@ -6965,9 +6964,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.86" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed9d5b4305409d1fc9482fee2d7f9bcbf24b3972bf59817ef757e23982242a93" +checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" [[package]] name = "wasm-streams" @@ -6999,9 +6998,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.63" +version = "0.3.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bdd9ef4e984da1187bf8110c5cf5b845fbc87a23602cdf912386a76fcd3a7c2" +checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef" dependencies = [ "js-sys", "wasm-bindgen", diff --git a/Cargo.toml b/Cargo.toml index 677eaa9ce4..92dcc254d4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -45,10 +45,10 @@ anyhow = { version = "1.0", features = ["backtrace"] } arc-swap = "1.6" async-compression = { version = "0.4.0", 
features = ["tokio", "gzip", "zstd"] } atomic-take = "1.1.0" -azure_core = "0.18" -azure_identity = "0.18" -azure_storage = "0.18" -azure_storage_blobs = "0.18" +azure_core = "0.19" +azure_identity = "0.19" +azure_storage = "0.19" +azure_storage_blobs = "0.19" flate2 = "1.0.26" async-stream = "0.3" async-trait = "0.1" From 11945e64ecec437caf5840edfa7a31ac765ce5e1 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Mon, 29 Apr 2024 13:16:42 -0400 Subject: [PATCH 0648/1571] chore(pageserver): improve in-memory layer vectored get (#7467) previously in https://github.com/neondatabase/neon/pull/7375, we observed that for in-memory layers, we will need to iterate every key in the key space in order to get the result. The operation can be more efficient if we use BTreeMap as the in-memory layer representation, even if we are doing vectored get in a dense keyspace. Imagine a case that the in-memory layer covers a very little part of the keyspace, and most of the keys need to be found in lower layers. Using a BTreeMap can significantly reduce probes for nonexistent keys. ## Summary of changes * Use BTreeMap as in-memory layer representation. * Optimize the vectored get flow to utilize the range scan functionality of BTreeMap. Signed-off-by: Alex Chi Z --- .../tenant/storage_layer/inmemory_layer.rs | 50 +++++++------------ 1 file changed, 18 insertions(+), 32 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 8ec4d61434..5fb5d231c7 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -17,7 +17,7 @@ use anyhow::{anyhow, ensure, Result}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; -use std::collections::{BinaryHeap, HashMap, HashSet}; +use std::collections::{BTreeMap, BinaryHeap, HashSet}; use std::sync::{Arc, OnceLock}; use std::time::Instant; use tracing::*; @@ -78,10 +78,10 @@ impl std::fmt::Debug for InMemoryLayer { } pub struct InMemoryLayerInner { - /// All versions of all pages in the layer are kept here. Indexed + /// All versions of all pages in the layer are kept here. Indexed /// by block number and LSN. The value is an offset into the /// ephemeral file where the page version is stored. - index: HashMap>, + index: BTreeMap>, /// The values are stored in a serialized format in this file. /// Each serialized Value is preceded by a 'u32' length field. 
@@ -384,25 +384,20 @@ impl InMemoryLayer { let mut planned_block_reads = BinaryHeap::new(); for range in keyspace.ranges.iter() { - let mut key = range.start; - while key < range.end { - if let Some(vec_map) = inner.index.get(&key) { - let lsn_range = match reconstruct_state.get_cached_lsn(&key) { - Some(cached_lsn) => (cached_lsn + 1)..end_lsn, - None => self.start_lsn..end_lsn, - }; + for (key, vec_map) in inner.index.range(range.start..range.end) { + let lsn_range = match reconstruct_state.get_cached_lsn(key) { + Some(cached_lsn) => (cached_lsn + 1)..end_lsn, + None => self.start_lsn..end_lsn, + }; - let slice = vec_map.slice_range(lsn_range); - for (entry_lsn, pos) in slice.iter().rev() { - planned_block_reads.push(BlockRead { - key, - lsn: *entry_lsn, - block_offset: *pos, - }); - } + let slice = vec_map.slice_range(lsn_range); + for (entry_lsn, pos) in slice.iter().rev() { + planned_block_reads.push(BlockRead { + key: *key, + lsn: *entry_lsn, + block_offset: *pos, + }); } - - key = key.next(); } } @@ -499,7 +494,7 @@ impl InMemoryLayer { end_lsn: OnceLock::new(), opened_at: Instant::now(), inner: RwLock::new(InMemoryLayerInner { - index: HashMap::new(), + index: BTreeMap::new(), file, resource_units: GlobalResourceUnits::new(), }), @@ -636,26 +631,17 @@ impl InMemoryLayer { let cursor = inner.file.block_cursor(); - // Sort the keys because delta layer writer expects them sorted. - // - // NOTE: this sort can take up significant time if the layer has millions of - // keys. To speed up all the comparisons we convert the key to i128 and - // keep the value as a reference. - let mut keys: Vec<_> = inner.index.iter().map(|(k, m)| (k.to_i128(), m)).collect(); - keys.sort_unstable_by_key(|k| k.0); - let ctx = RequestContextBuilder::extend(ctx) .page_content_kind(PageContentKind::InMemoryLayer) .build(); - for (key, vec_map) in keys.iter() { - let key = Key::from_i128(*key); + for (key, vec_map) in inner.index.iter() { // Write all page versions for (lsn, pos) in vec_map.as_slice() { cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?; let will_init = Value::des(&buf)?.will_init(); let res; (buf, res) = delta_layer_writer - .put_value_bytes(key, *lsn, buf, will_init) + .put_value_bytes(*key, *lsn, buf, will_init) .await; res?; } From 574645412b376fac11125e9960f432ed0c99a44c Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 29 Apr 2024 18:46:46 +0100 Subject: [PATCH 0649/1571] pageserver: shard-aware keyspace partitioning (#6778) ## Problem Followup to https://github.com/neondatabase/neon/pull/6776 While #6776 makes compaction safe on sharded tenants, the logic for keyspace partitioning remains inefficient: it assumes that the size of data on a pageserver can be calculated simply as the range between start and end of a Range -- this is not the case in sharded tenants, where data within a range belongs to a variety of shards. Closes: https://github.com/neondatabase/neon/issues/6774 ## Summary of changes I experimented with using a sharding-aware range type in KeySpace to replace all the Range uses, but the impact on other code was quite large (many places use the ranges), and not all of them need this property of being able to approximate the physical size of data within a key range. So I compromised on expressing this as a ShardedRange type, but only using that type selctively: during keyspace repartition, and in tiered compaction when accumulating key ranges. 
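For orientation, the intended call pattern for the new type is roughly the following (a minimal sketch against the API added in `keyspace.rs`; `plan_chunks` is just an illustrative wrapper, not a function in this patch — the real call sites are `KeySpace::partition` and the tiered-compaction planner below):

    use std::ops::Range;

    use pageserver_api::{key::Key, keyspace::ShardedRange, shard::ShardIdentity};

    // Split `range` into chunks of at most `target_nblocks` shard-local pages.
    // Each returned pair is (shard-local page count, sub-range); a range that owns
    // no local pages comes back as a single fragment with a count of 0.
    fn plan_chunks(
        range: Range<Key>,
        shard_identity: &ShardIdentity,
        target_nblocks: u32,
    ) -> Vec<(u32, Range<Key>)> {
        ShardedRange::new(range, shard_identity).fragment(target_nblocks)
    }

Accumulating the returned per-chunk counts (or `ShardedRange::page_count`) instead of the raw key-range width is what keeps size estimates honest on shards that own only a fraction of a striped range.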
- keyspace partitioning methods take sharding parameters as an input - new `ShardedRange` type wraps a Range and a shard identity - ShardedRange::page_count is the shard-aware replacement for key_range_size - Callers that don't need to be shard-aware (e.g. vectored get code that just wants to count the number of keys in a keyspace) can use ShardedRange::raw_size to get the faster, shard-naive code (same as old `key_range_size`) - Compaction code is updated to carry a shard identity so that it can use shard aware calculations - Unit tests for the new fragmentation logic. - Add a test for compaction on sharded tenants, that validates that we generate appropriately sized image layers (this fails before fixing keyspace partitioning) --- libs/pageserver_api/src/keyspace.rs | 744 ++++++++++++++++-- libs/pageserver_api/src/shard.rs | 2 +- pageserver/compaction/src/compact_tiered.rs | 22 +- pageserver/compaction/src/helpers.rs | 11 +- pageserver/compaction/src/interface.rs | 10 +- pageserver/compaction/src/simulator.rs | 8 +- pageserver/src/basebackup.rs | 5 +- .../tenant/storage_layer/inmemory_layer.rs | 2 +- pageserver/src/tenant/timeline.rs | 12 +- pageserver/src/tenant/timeline/compaction.rs | 6 +- test_runner/regress/test_compaction.py | 101 +++ 11 files changed, 841 insertions(+), 82 deletions(-) diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index eed4835f25..4283da18ab 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -1,7 +1,10 @@ use postgres_ffi::BLCKSZ; use std::ops::Range; -use crate::key::Key; +use crate::{ + key::Key, + shard::{ShardCount, ShardIdentity}, +}; use itertools::Itertools; /// @@ -14,6 +17,234 @@ pub struct KeySpace { pub ranges: Vec>, } +/// Represents a contiguous half-open range of the keyspace, masked according to a particular +/// ShardNumber's stripes: within this range of keys, only some "belong" to the current +/// shard. +/// +/// When we iterate over keys within this object, we will skip any keys that don't belong +/// to this shard. +/// +/// The start + end keys may not belong to the shard: these specify where layer files should +/// start + end, but we will never actually read/write those keys. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct ShardedRange<'a> { + pub shard_identity: &'a ShardIdentity, + pub range: Range, +} + +// Calculate the size of a range within the blocks of the same relation, or spanning only the +// top page in the previous relation's space. +fn contiguous_range_len(range: &Range) -> u32 { + debug_assert!(is_contiguous_range(range)); + if range.start.field6 == 0xffffffff { + range.end.field6 + 1 + } else { + range.end.field6 - range.start.field6 + } +} + +/// Return true if this key range includes only keys in the same relation's data blocks, or +/// just spanning one relation and the logical size (0xffffffff) block of the relation before it. +/// +/// Contiguous in this context means we know the keys are in use _somewhere_, but it might not +/// be on our shard. Later in ShardedRange we do the extra work to figure out how much +/// of a given contiguous range is present on one shard. +/// +/// This matters, because: +/// - Within such ranges, keys are used contiguously. Outside such ranges it is sparse. +/// - Within such ranges, we may calculate distances using simple subtraction of field6. 
+fn is_contiguous_range(range: &Range) -> bool { + range.start.field1 == range.end.field1 + && range.start.field2 == range.end.field2 + && range.start.field3 == range.end.field3 + && range.start.field4 == range.end.field4 + && (range.start.field5 == range.end.field5 + || (range.start.field6 == 0xffffffff && range.start.field5 + 1 == range.end.field5)) +} + +impl<'a> ShardedRange<'a> { + pub fn new(range: Range, shard_identity: &'a ShardIdentity) -> Self { + Self { + shard_identity, + range, + } + } + + /// Break up this range into chunks, each of which has at least one local key in it if the + /// total range has at least one local key. + pub fn fragment(self, target_nblocks: u32) -> Vec<(u32, Range)> { + // Optimization for single-key case (e.g. logical size keys) + if self.range.end == self.range.start.add(1) { + return vec![( + if self.shard_identity.is_key_disposable(&self.range.start) { + 0 + } else { + 1 + }, + self.range, + )]; + } + + if !is_contiguous_range(&self.range) { + // Ranges that span relations are not fragmented. We only get these ranges as a result + // of operations that act on existing layers, so we trust that the existing range is + // reasonably small. + return vec![(u32::MAX, self.range)]; + } + + let mut fragments: Vec<(u32, Range)> = Vec::new(); + + let mut cursor = self.range.start; + while cursor < self.range.end { + let advance_by = self.distance_to_next_boundary(cursor); + let is_fragment_disposable = self.shard_identity.is_key_disposable(&cursor); + + // If the previous fragment is undersized, then we seek to consume enough + // blocks to complete it. + let (want_blocks, merge_last_fragment) = match fragments.last_mut() { + Some(frag) if frag.0 < target_nblocks => (target_nblocks - frag.0, Some(frag)), + Some(frag) => { + // Prev block is complete, want the full number. + ( + target_nblocks, + if is_fragment_disposable { + // If this current range will be empty (not shard-local data), we will merge into previous + Some(frag) + } else { + None + }, + ) + } + None => { + // First iteration, want the full number + (target_nblocks, None) + } + }; + + let advance_by = if is_fragment_disposable { + advance_by + } else { + std::cmp::min(advance_by, want_blocks) + }; + + let next_cursor = cursor.add(advance_by); + + let this_frag = ( + if is_fragment_disposable { + 0 + } else { + advance_by + }, + cursor..next_cursor, + ); + cursor = next_cursor; + + if let Some(last_fragment) = merge_last_fragment { + // Previous fragment was short or this one is empty, merge into it + last_fragment.0 += this_frag.0; + last_fragment.1.end = this_frag.1.end; + } else { + fragments.push(this_frag); + } + } + + fragments + } + + /// Estimate the physical pages that are within this range, on this shard. This returns + /// u32::MAX if the range spans relations: this return value should be interpreted as "large". 
+ pub fn page_count(&self) -> u32 { + // Special cases for single keys like logical sizes + if self.range.end == self.range.start.add(1) { + return if self.shard_identity.is_key_disposable(&self.range.start) { + 0 + } else { + 1 + }; + } + + // We can only do an authentic calculation of contiguous key ranges + if !is_contiguous_range(&self.range) { + return u32::MAX; + } + + // Special case for single sharded tenants: our logical and physical sizes are the same + if self.shard_identity.count < ShardCount::new(2) { + return contiguous_range_len(&self.range); + } + + // Normal path: step through stripes and part-stripes in the range, evaluate whether each one belongs + // to Self, and add the stripe's block count to our total if so. + let mut result: u64 = 0; + let mut cursor = self.range.start; + while cursor < self.range.end { + // Count up to the next stripe_size boundary or end of range + let advance_by = self.distance_to_next_boundary(cursor); + + // If this blocks in this stripe belong to us, add them to our count + if !self.shard_identity.is_key_disposable(&cursor) { + result += advance_by as u64; + } + + cursor = cursor.add(advance_by); + } + + if result > u32::MAX as u64 { + u32::MAX + } else { + result as u32 + } + } + + /// Advance the cursor to the next potential fragment boundary: this is either + /// a stripe boundary, or the end of the range. + fn distance_to_next_boundary(&self, cursor: Key) -> u32 { + let distance_to_range_end = contiguous_range_len(&(cursor..self.range.end)); + + if self.shard_identity.count < ShardCount::new(2) { + // Optimization: don't bother stepping through stripes if the tenant isn't sharded. + return distance_to_range_end; + } + + if cursor.field6 == 0xffffffff { + // We are wrapping from one relation's logical size to the next relation's first data block + return 1; + } + + let stripe_index = cursor.field6 / self.shard_identity.stripe_size.0; + let stripe_remainder = self.shard_identity.stripe_size.0 + - (cursor.field6 - stripe_index * self.shard_identity.stripe_size.0); + + if cfg!(debug_assertions) { + // We should never overflow field5 and field6 -- our callers check this earlier + // and would have returned their u32::MAX cases if the input range violated this. + let next_cursor = cursor.add(stripe_remainder); + debug_assert!( + next_cursor.field1 == cursor.field1 + && next_cursor.field2 == cursor.field2 + && next_cursor.field3 == cursor.field3 + && next_cursor.field4 == cursor.field4 + && next_cursor.field5 == cursor.field5 + ) + } + + std::cmp::min(stripe_remainder, distance_to_range_end) + } + + /// Whereas `page_count` estimates the number of pages physically in this range on this shard, + /// this function simply calculates the number of pages in the space, without accounting for those + /// pages that would not actually be stored on this node. + /// + /// Don't use this function in code that works with physical entities like layer files. + fn raw_size(range: &Range) -> u32 { + if is_contiguous_range(range) { + contiguous_range_len(range) + } else { + u32::MAX + } + } +} + impl KeySpace { /// Create a key space with a single range. pub fn single(key_range: Range) -> Self { @@ -25,39 +256,36 @@ impl KeySpace { /// Partition a key space into roughly chunks of roughly 'target_size' bytes /// in each partition. /// - pub fn partition(&self, target_size: u64) -> KeyPartitioning { + pub fn partition(&self, shard_identity: &ShardIdentity, target_size: u64) -> KeyPartitioning { // Assume that each value is 8k in size. 
- let target_nblocks = (target_size / BLCKSZ as u64) as usize; + let target_nblocks = (target_size / BLCKSZ as u64) as u32; let mut parts = Vec::new(); let mut current_part = Vec::new(); let mut current_part_size: usize = 0; for range in &self.ranges { - // If appending the next contiguous range in the keyspace to the current - // partition would cause it to be too large, start a new partition. - let this_size = key_range_size(range) as usize; - if current_part_size + this_size > target_nblocks && !current_part.is_empty() { - parts.push(KeySpace { - ranges: current_part, - }); - current_part = Vec::new(); - current_part_size = 0; - } + // While doing partitioning, wrap the range in ShardedRange so that our size calculations + // will respect shard striping rather than assuming all keys within a range are present. + let range = ShardedRange::new(range.clone(), shard_identity); - // If the next range is larger than 'target_size', split it into - // 'target_size' chunks. - let mut remain_size = this_size; - let mut start = range.start; - while remain_size > target_nblocks { - let next = start.add(target_nblocks as u32); - parts.push(KeySpace { - ranges: vec![start..next], - }); - start = next; - remain_size -= target_nblocks + // Chunk up the range into parts that each contain up to target_size local blocks + for (frag_on_shard_size, frag_range) in range.fragment(target_nblocks) { + // If appending the next contiguous range in the keyspace to the current + // partition would cause it to be too large, and our current partition + // covers at least one block that is physically present in this shard, + // then start a new partition + if current_part_size + frag_on_shard_size as usize > target_nblocks as usize + && current_part_size > 0 + { + parts.push(KeySpace { + ranges: current_part, + }); + current_part = Vec::new(); + current_part_size = 0; + } + current_part.push(frag_range.start..frag_range.end); + current_part_size += frag_on_shard_size as usize; } - current_part.push(start..range.end); - current_part_size += remain_size; } // add last partition that wasn't full yet. @@ -71,7 +299,7 @@ impl KeySpace { } pub fn is_empty(&self) -> bool { - self.total_size() == 0 + self.total_raw_size() == 0 } /// Merge another keyspace into the current one. @@ -164,11 +392,11 @@ impl KeySpace { self.ranges.last().map(|range| range.end) } - #[allow(unused)] - pub fn total_size(&self) -> usize { + /// The size of the keyspace in pages, before accounting for sharding + pub fn total_raw_size(&self) -> usize { self.ranges .iter() - .map(|range| key_range_size(range) as usize) + .map(|range| ShardedRange::raw_size(range) as usize) .sum() } @@ -242,7 +470,7 @@ impl KeySpaceAccum { #[inline(always)] pub fn add_range(&mut self, range: Range) { - self.size += key_range_size(&range) as u64; + self.size += ShardedRange::raw_size(&range) as u64; match self.accum.as_mut() { Some(accum) => { @@ -274,7 +502,9 @@ impl KeySpaceAccum { std::mem::take(self).to_keyspace() } - pub fn size(&self) -> u64 { + // The total number of keys in this object, ignoring any sharding effects that might cause some of + // the keys to be omitted in storage on this shard. 
+ pub fn raw_size(&self) -> u64 { self.size } } @@ -330,36 +560,19 @@ impl KeySpaceRandomAccum { } } -#[inline(always)] -pub fn key_range_size(key_range: &Range) -> u32 { - let start = key_range.start; - let end = key_range.end; - - if end.field1 != start.field1 - || end.field2 != start.field2 - || end.field3 != start.field3 - || end.field4 != start.field4 - { - return u32::MAX; - } - - let start = (start.field5 as u64) << 32 | start.field6 as u64; - let end = (end.field5 as u64) << 32 | end.field6 as u64; - - let diff = end - start; - if diff > u32::MAX as u64 { - u32::MAX - } else { - diff as u32 - } -} - pub fn singleton_range(key: Key) -> Range { key..key.next() } #[cfg(test)] mod tests { + use rand::{RngCore, SeedableRng}; + + use crate::{ + models::ShardParameters, + shard::{ShardCount, ShardNumber}, + }; + use super::*; use std::fmt::Write; @@ -402,14 +615,17 @@ mod tests { accum.add_range(range.clone()); } - let expected_size: u64 = ranges.iter().map(|r| key_range_size(r) as u64).sum(); - assert_eq!(accum.size(), expected_size); + let expected_size: u64 = ranges + .iter() + .map(|r| ShardedRange::raw_size(r) as u64) + .sum(); + assert_eq!(accum.raw_size(), expected_size); assert_ks_eq(&accum.consume_keyspace(), ranges.clone()); - assert_eq!(accum.size(), 0); + assert_eq!(accum.raw_size(), 0); assert_ks_eq(&accum.consume_keyspace(), vec![]); - assert_eq!(accum.size(), 0); + assert_eq!(accum.raw_size(), 0); for range in &ranges { accum.add_range(range.clone()); @@ -706,4 +922,412 @@ mod tests { ] ); } + #[test] + fn sharded_range_relation_gap() { + let shard_identity = ShardIdentity::new( + ShardNumber(0), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + + let range = ShardedRange::new( + Range { + start: Key::from_hex("000000067F00000005000040100300000000").unwrap(), + end: Key::from_hex("000000067F00000005000040130000004000").unwrap(), + }, + &shard_identity, + ); + + // Key range spans relations, expect MAX + assert_eq!(range.page_count(), u32::MAX); + } + + #[test] + fn shard_identity_keyspaces_single_key() { + let shard_identity = ShardIdentity::new( + ShardNumber(1), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + + let range = ShardedRange::new( + Range { + start: Key::from_hex("000000067f000000010000007000ffffffff").unwrap(), + end: Key::from_hex("000000067f00000001000000700100000000").unwrap(), + }, + &shard_identity, + ); + // Single-key range on logical size key + assert_eq!(range.page_count(), 1); + } + + /// Test the helper that we use to identify ranges which go outside the data blocks of a single relation + #[test] + fn contiguous_range_check() { + assert!(!is_contiguous_range( + &(Key::from_hex("000000067f00000001000004df00fffffffe").unwrap() + ..Key::from_hex("000000067f00000001000004df0100000003").unwrap()) + ),); + + // The ranges goes all the way up to the 0xffffffff, including it: this is + // not considered a rel block range because 0xffffffff stores logical sizes, + // not blocks. 
+ assert!(!is_contiguous_range( + &(Key::from_hex("000000067f00000001000004df00fffffffe").unwrap() + ..Key::from_hex("000000067f00000001000004df0100000000").unwrap()) + ),); + + // Keys within the normal data region of a relation + assert!(is_contiguous_range( + &(Key::from_hex("000000067f00000001000004df0000000000").unwrap() + ..Key::from_hex("000000067f00000001000004df0000000080").unwrap()) + ),); + + // The logical size key of one forkno, then some blocks in the next + assert!(is_contiguous_range( + &(Key::from_hex("000000067f00000001000004df00ffffffff").unwrap() + ..Key::from_hex("000000067f00000001000004df0100000080").unwrap()) + ),); + } + + #[test] + fn shard_identity_keyspaces_forkno_gap() { + let shard_identity = ShardIdentity::new( + ShardNumber(1), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + + let range = ShardedRange::new( + Range { + start: Key::from_hex("000000067f00000001000004df00fffffffe").unwrap(), + end: Key::from_hex("000000067f00000001000004df0100000003").unwrap(), + }, + &shard_identity, + ); + + // Range spanning the end of one forkno and the start of the next: we do not attempt to + // calculate a valid size, because we have no way to know if they keys between start + // and end are actually in use. + assert_eq!(range.page_count(), u32::MAX); + } + + #[test] + fn shard_identity_keyspaces_one_relation() { + for shard_number in 0..4 { + let shard_identity = ShardIdentity::new( + ShardNumber(shard_number), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + + let range = ShardedRange::new( + Range { + start: Key::from_hex("000000067f00000001000000ae0000000000").unwrap(), + end: Key::from_hex("000000067f00000001000000ae0000000001").unwrap(), + }, + &shard_identity, + ); + + // Very simple case: range covering block zero of one relation, where that block maps to shard zero + if shard_number == 0 { + assert_eq!(range.page_count(), 1); + } else { + // Other shards should perceive the range's size as zero + assert_eq!(range.page_count(), 0); + } + } + } + + /// Test helper: construct a ShardedRange and call fragment() on it, returning + /// the total page count in the range and the fragments. 
+ fn do_fragment( + range_start: Key, + range_end: Key, + shard_identity: &ShardIdentity, + target_nblocks: u32, + ) -> (u32, Vec<(u32, Range)>) { + let range = ShardedRange::new( + Range { + start: range_start, + end: range_end, + }, + shard_identity, + ); + + let page_count = range.page_count(); + let fragments = range.fragment(target_nblocks); + + // Invariant: we always get at least one fragment + assert!(!fragments.is_empty()); + + // Invariant: the first/last fragment start/end should equal the input start/end + assert_eq!(fragments.first().unwrap().1.start, range_start); + assert_eq!(fragments.last().unwrap().1.end, range_end); + + if page_count > 0 { + // Invariant: every fragment must contain at least one shard-local page, if the + // total range contains at least one shard-local page + let all_nonzero = fragments.iter().all(|f| f.0 > 0); + if !all_nonzero { + eprintln!("Found a zero-length fragment: {:?}", fragments); + } + assert!(all_nonzero); + } else { + // A range with no shard-local pages should always be returned as a single fragment + assert_eq!(fragments, vec![(0, range_start..range_end)]); + } + + // Invariant: fragments must be ordered and non-overlapping + let mut last: Option> = None; + for frag in &fragments { + if let Some(last) = last { + assert!(frag.1.start >= last.end); + assert!(frag.1.start > last.start); + } + last = Some(frag.1.clone()) + } + + // Invariant: fragments respect target_nblocks + for frag in &fragments { + assert!(frag.0 == u32::MAX || frag.0 <= target_nblocks); + } + + (page_count, fragments) + } + + /// Really simple tests for fragment(), on a range that just contains a single stripe + /// for a single tenant. + #[test] + fn sharded_range_fragment_simple() { + let shard_identity = ShardIdentity::new( + ShardNumber(0), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + + // A range which we happen to know covers exactly one stripe which belongs to this shard + let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap(); + let input_end = Key::from_hex("000000067f00000001000000ae0000008000").unwrap(); + + // Ask for stripe_size blocks, we get the whole stripe + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 32768), + (32768, vec![(32768, input_start..input_end)]) + ); + + // Ask for more, we still get the whole stripe + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 10000000), + (32768, vec![(32768, input_start..input_end)]) + ); + + // Ask for target_nblocks of half the stripe size, we get two halves + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 16384), + ( + 32768, + vec![ + (16384, input_start..input_start.add(16384)), + (16384, input_start.add(16384)..input_end) + ] + ) + ); + } + + #[test] + fn sharded_range_fragment_multi_stripe() { + let shard_identity = ShardIdentity::new( + ShardNumber(0), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + + // A range which covers multiple stripes, exactly one of which belongs to the current shard. + let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap(); + let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap(); + // Ask for all the blocks, get a fragment that covers the whole range but reports + // its size to be just the blocks belonging to our shard. 
+ assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 131072), + (32768, vec![(32768, input_start..input_end)]) + ); + + // Ask for a sub-stripe quantity + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 16000), + ( + 32768, + vec![ + (16000, input_start..input_start.add(16000)), + (16000, input_start.add(16000)..input_start.add(32000)), + (768, input_start.add(32000)..input_end), + ] + ) + ); + + // Try on a range that starts slightly after our owned stripe + assert_eq!( + do_fragment(input_start.add(1), input_end, &shard_identity, 131072), + (32767, vec![(32767, input_start.add(1)..input_end)]) + ); + } + + /// Test our calculations work correctly when we start a range from the logical size key of + /// a previous relation. + #[test] + fn sharded_range_fragment_starting_from_logical_size() { + let input_start = Key::from_hex("000000067f00000001000000ae00ffffffff").unwrap(); + let input_end = Key::from_hex("000000067f00000001000000ae0100008000").unwrap(); + + // Shard 0 owns the first stripe in the relation, and the preceding logical size is shard local too + let shard_identity = ShardIdentity::new( + ShardNumber(0), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 0x10000), + (0x8001, vec![(0x8001, input_start..input_end)]) + ); + + // Shard 1 does not own the first stripe in the relation, but it does own the logical size (all shards + // store all logical sizes) + let shard_identity = ShardIdentity::new( + ShardNumber(1), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 0x10000), + (0x1, vec![(0x1, input_start..input_end)]) + ); + } + + /// Test that ShardedRange behaves properly when used on un-sharded data + #[test] + fn sharded_range_fragment_unsharded() { + let shard_identity = ShardIdentity::unsharded(); + + let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap(); + let input_end = Key::from_hex("000000067f00000001000000ae0000010000").unwrap(); + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 0x8000), + ( + 0x10000, + vec![ + (0x8000, input_start..input_start.add(0x8000)), + (0x8000, input_start.add(0x8000)..input_start.add(0x10000)) + ] + ) + ); + } + + #[test] + fn sharded_range_fragment_cross_relation() { + let shard_identity = ShardIdentity::unsharded(); + + // A range that spans relations: expect fragmentation to give up and return a u32::MAX size + let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap(); + let input_end = Key::from_hex("000000068f00000001000000ae0000010000").unwrap(); + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 0x8000), + (u32::MAX, vec![(u32::MAX, input_start..input_end),]) + ); + + // Same, but using a sharded identity + let shard_identity = ShardIdentity::new( + ShardNumber(0), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 0x8000), + (u32::MAX, vec![(u32::MAX, input_start..input_end),]) + ); + } + + #[test] + fn sharded_range_fragment_tiny_nblocks() { + let shard_identity = ShardIdentity::unsharded(); + + // A range that spans relations: expect fragmentation to give up and return a u32::MAX size + let input_start = Key::from_hex("000000067F00000001000004E10000000000").unwrap(); + let input_end = 
Key::from_hex("000000067F00000001000004E10000000038").unwrap(); + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 16), + ( + 0x38, + vec![ + (16, input_start..input_start.add(16)), + (16, input_start.add(16)..input_start.add(32)), + (16, input_start.add(32)..input_start.add(48)), + (8, input_start.add(48)..input_end), + ] + ) + ); + } + + #[test] + fn sharded_range_fragment_fuzz() { + // Use a fixed seed: we don't want to explicitly pick values, but we do want + // the test to be reproducible. + let mut prng = rand::rngs::StdRng::seed_from_u64(0xdeadbeef); + + for _i in 0..1000 { + let shard_identity = if prng.next_u32() % 2 == 0 { + ShardIdentity::unsharded() + } else { + let shard_count = prng.next_u32() % 127 + 1; + ShardIdentity::new( + ShardNumber((prng.next_u32() % shard_count) as u8), + ShardCount::new(shard_count as u8), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap() + }; + + let target_nblocks = prng.next_u32() % 65536 + 1; + + let start_offset = prng.next_u32() % 16384; + + // Try ranges up to 4GiB in size, that are always at least 1 + let range_size = prng.next_u32() % 8192 + 1; + + // A range that spans relations: expect fragmentation to give up and return a u32::MAX size + let input_start = Key::from_hex("000000067F00000001000004E10000000000") + .unwrap() + .add(start_offset); + let input_end = input_start.add(range_size); + + // This test's main success conditions are the invariants baked into do_fragment + let (_total_size, fragments) = + do_fragment(input_start, input_end, &shard_identity, target_nblocks); + + // Pick a random key within the range and check it appears in the output + let example_key = input_start.add(prng.next_u32() % range_size); + + // Panic on unwrap if it isn't found + let example_key_frag = fragments + .iter() + .find(|f| f.1.contains(&example_key)) + .unwrap(); + + // Check that the fragment containing our random key has a nonzero size if + // that key is shard-local + let example_key_local = !shard_identity.is_key_disposable(&example_key); + if example_key_local { + assert!(example_key_frag.0 > 0); + } + } + } } diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 6a8a5cc8f3..2d7f6772b2 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -451,7 +451,7 @@ impl ShardIdentity { /// An identity with number=0 count=0 is a "none" identity, which represents legacy /// tenants. Modern single-shard tenants should not use this: they should /// have number=0 count=1. - pub fn unsharded() -> Self { + pub const fn unsharded() -> Self { Self { number: ShardNumber(0), count: ShardCount(0), diff --git a/pageserver/compaction/src/compact_tiered.rs b/pageserver/compaction/src/compact_tiered.rs index 5261746b22..137b93055a 100644 --- a/pageserver/compaction/src/compact_tiered.rs +++ b/pageserver/compaction/src/compact_tiered.rs @@ -18,6 +18,7 @@ //! database size. For example, if the logical database size is 10 GB, we would //! generate new image layers every 10 GB of WAL. 
use futures::StreamExt; +use pageserver_api::shard::ShardIdentity; use tracing::{debug, info}; use std::collections::{HashSet, VecDeque}; @@ -125,6 +126,7 @@ async fn compact_level( } let mut state = LevelCompactionState { + shard_identity: *executor.get_shard_identity(), target_file_size, _lsn_range: lsn_range.clone(), layers: layer_fragments, @@ -164,6 +166,8 @@ struct LevelCompactionState<'a, E> where E: CompactionJobExecutor, { + shard_identity: ShardIdentity, + // parameters target_file_size: u64, @@ -366,6 +370,7 @@ where .executor .get_keyspace(&job.key_range, job.lsn_range.end, ctx) .await?, + &self.shard_identity, ) * 8192; let wal_size = job @@ -430,7 +435,7 @@ where keyspace, self.target_file_size / 8192, ); - while let Some(key_range) = window.choose_next_image() { + while let Some(key_range) = window.choose_next_image(&self.shard_identity) { new_jobs.push(CompactionJob:: { key_range, lsn_range: job.lsn_range.clone(), @@ -623,7 +628,12 @@ impl KeyspaceWindowPos { } // Advance the cursor until it reaches 'target_keysize'. - fn advance_until_size(&mut self, w: &KeyspaceWindowHead, max_size: u64) { + fn advance_until_size( + &mut self, + w: &KeyspaceWindowHead, + max_size: u64, + shard_identity: &ShardIdentity, + ) { while self.accum_keysize < max_size && !self.reached_end(w) { let curr_range = &w.keyspace[self.keyspace_idx]; if self.end_key < curr_range.start { @@ -632,7 +642,7 @@ impl KeyspaceWindowPos { } // We're now within 'curr_range'. Can we advance past it completely? - let distance = K::key_range_size(&(self.end_key..curr_range.end)); + let distance = K::key_range_size(&(self.end_key..curr_range.end), shard_identity); if (self.accum_keysize + distance as u64) < max_size { // oh yeah, it fits self.end_key = curr_range.end; @@ -641,7 +651,7 @@ impl KeyspaceWindowPos { } else { // advance within the range let skip_key = self.end_key.skip_some(); - let distance = K::key_range_size(&(self.end_key..skip_key)); + let distance = K::key_range_size(&(self.end_key..skip_key), shard_identity); if (self.accum_keysize + distance as u64) < max_size { self.end_key = skip_key; self.accum_keysize += distance as u64; @@ -677,7 +687,7 @@ where } } - fn choose_next_image(&mut self) -> Option> { + fn choose_next_image(&mut self, shard_identity: &ShardIdentity) -> Option> { if self.start_pos.keyspace_idx == self.head.keyspace.len() { // we've reached the end return None; @@ -687,6 +697,7 @@ where next_pos.advance_until_size( &self.head, self.start_pos.accum_keysize + self.head.target_keysize, + shard_identity, ); // See if we can gobble up the rest of the keyspace if we stretch out the layer, up to @@ -695,6 +706,7 @@ where end_pos.advance_until_size( &self.head, self.start_pos.accum_keysize + (self.head.target_keysize * 5 / 4), + shard_identity, ); if end_pos.reached_end(&self.head) { // gobble up any unused keyspace between the last used key and end of the range diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs index 9de6363d6e..1b80373ba7 100644 --- a/pageserver/compaction/src/helpers.rs +++ b/pageserver/compaction/src/helpers.rs @@ -5,6 +5,7 @@ use crate::interface::*; use futures::future::BoxFuture; use futures::{Stream, StreamExt}; use itertools::Itertools; +use pageserver_api::shard::ShardIdentity; use pin_project_lite::pin_project; use std::collections::BinaryHeap; use std::collections::VecDeque; @@ -13,11 +14,17 @@ use std::ops::{DerefMut, Range}; use std::pin::Pin; use std::task::{ready, Poll}; -pub fn keyspace_total_size(keyspace: 
&CompactionKeySpace) -> u64 +pub fn keyspace_total_size( + keyspace: &CompactionKeySpace, + shard_identity: &ShardIdentity, +) -> u64 where K: CompactionKey, { - keyspace.iter().map(|r| K::key_range_size(r) as u64).sum() + keyspace + .iter() + .map(|r| K::key_range_size(r, shard_identity) as u64) + .sum() } pub fn overlaps_with(a: &Range, b: &Range) -> bool { diff --git a/pageserver/compaction/src/interface.rs b/pageserver/compaction/src/interface.rs index 5dc62e506f..35519b5d0a 100644 --- a/pageserver/compaction/src/interface.rs +++ b/pageserver/compaction/src/interface.rs @@ -4,7 +4,7 @@ //! All the heavy lifting is done by the create_image and create_delta //! functions that the implementor provides. use futures::Future; -use pageserver_api::{key::Key, keyspace::key_range_size}; +use pageserver_api::{key::Key, keyspace::ShardedRange, shard::ShardIdentity}; use std::ops::Range; use utils::lsn::Lsn; @@ -32,6 +32,8 @@ pub trait CompactionJobExecutor { // Functions that the planner uses to support its decisions // ---- + fn get_shard_identity(&self) -> &ShardIdentity; + /// Return all layers that overlap the given bounding box. fn get_layers( &mut self, @@ -98,7 +100,7 @@ pub trait CompactionKey: std::cmp::Ord + Clone + Copy + std::fmt::Display { /// /// This returns u32, for compatibility with Repository::key. If the /// distance is larger, return u32::MAX. - fn key_range_size(key_range: &Range) -> u32; + fn key_range_size(key_range: &Range, shard_identity: &ShardIdentity) -> u32; // return "self + 1" fn next(&self) -> Self; @@ -113,8 +115,8 @@ impl CompactionKey for Key { const MIN: Self = Self::MIN; const MAX: Self = Self::MAX; - fn key_range_size(r: &std::ops::Range) -> u32 { - key_range_size(r) + fn key_range_size(r: &std::ops::Range, shard_identity: &ShardIdentity) -> u32 { + ShardedRange::new(r.clone(), shard_identity).page_count() } fn next(&self) -> Key { (self as &Key).next() diff --git a/pageserver/compaction/src/simulator.rs b/pageserver/compaction/src/simulator.rs index 6c00df3a65..3543df64fa 100644 --- a/pageserver/compaction/src/simulator.rs +++ b/pageserver/compaction/src/simulator.rs @@ -3,6 +3,7 @@ mod draw; use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp}; use futures::StreamExt; +use pageserver_api::shard::ShardIdentity; use rand::Rng; use tracing::info; @@ -71,7 +72,7 @@ impl interface::CompactionKey for Key { const MIN: Self = u64::MIN; const MAX: Self = u64::MAX; - fn key_range_size(key_range: &Range) -> u32 { + fn key_range_size(key_range: &Range, _shard_identity: &ShardIdentity) -> u32 { std::cmp::min(key_range.end - key_range.start, u32::MAX as u64) as u32 } @@ -434,6 +435,11 @@ impl interface::CompactionJobExecutor for MockTimeline { type ImageLayer = Arc; type RequestContext = MockRequestContext; + fn get_shard_identity(&self) -> &ShardIdentity { + static IDENTITY: ShardIdentity = ShardIdentity::unsharded(); + &IDENTITY + } + async fn get_layers( &mut self, key_range: &Range, diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index ba047745f1..8c51e93643 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -263,7 +263,10 @@ where .timeline .get_slru_keyspace(Version::Lsn(self.lsn), self.ctx) .await? 
- .partition(Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64); + .partition( + self.timeline.get_shard_identity(), + Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64, + ); let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar); diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 5fb5d231c7..1a85481e97 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -401,7 +401,7 @@ impl InMemoryLayer { } } - let keyspace_size = keyspace.total_size(); + let keyspace_size = keyspace.total_raw_size(); let mut completed_keys = HashSet::new(); while completed_keys.len() < keyspace_size && !planned_block_reads.is_empty() { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 108acd3925..c5068386d6 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -936,7 +936,7 @@ impl Timeline { return Err(GetVectoredError::InvalidLsn(lsn)); } - let key_count = keyspace.total_size().try_into().unwrap(); + let key_count = keyspace.total_raw_size().try_into().unwrap(); if key_count > Timeline::MAX_GET_VECTORED_KEYS { return Err(GetVectoredError::Oversized(key_count)); } @@ -1076,7 +1076,7 @@ impl Timeline { mut reconstruct_state: ValuesReconstructState, ctx: &RequestContext, ) -> Result>, GetVectoredError> { - let get_kind = if keyspace.total_size() == 1 { + let get_kind = if keyspace.total_raw_size() == 1 { GetKind::Singular } else { GetKind::Vectored @@ -3207,7 +3207,7 @@ impl Timeline { } } - if keyspace.total_size() == 0 || timeline.ancestor_timeline.is_none() { + if keyspace.total_raw_size() == 0 || timeline.ancestor_timeline.is_none() { break; } @@ -3220,7 +3220,7 @@ impl Timeline { timeline = &*timeline_owned; } - if keyspace.total_size() != 0 { + if keyspace.total_raw_size() != 0 { return Err(GetVectoredError::MissingKey(keyspace.start().unwrap())); } @@ -3911,7 +3911,7 @@ impl Timeline { } let keyspace = self.collect_keyspace(lsn, ctx).await?; - let partitioning = keyspace.partition(partition_size); + let partitioning = keyspace.partition(&self.shard_identity, partition_size); *partitioning_guard = (partitioning, lsn); @@ -4064,7 +4064,7 @@ impl Timeline { key = key.next(); // Maybe flush `key_rest_accum` - if key_request_accum.size() >= Timeline::MAX_GET_VECTORED_KEYS + if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS || last_key_in_range { let results = self diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 8075775bbc..b92832a3de 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -15,7 +15,7 @@ use anyhow::{anyhow, Context}; use enumset::EnumSet; use fail::fail_point; use itertools::Itertools; -use pageserver_api::shard::TenantShardId; +use pageserver_api::shard::{ShardIdentity, TenantShardId}; use tokio_util::sync::CancellationToken; use tracing::{debug, info, info_span, trace, warn, Instrument}; use utils::id::TimelineId; @@ -831,6 +831,10 @@ impl CompactionJobExecutor for TimelineAdaptor { type RequestContext = crate::context::RequestContext; + fn get_shard_identity(&self) -> &ShardIdentity { + self.timeline.get_shard_identity() + } + async fn get_layers( &mut self, key_range: &Range, diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 3902819d3d..43a3323462 100644 --- 
a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -1,4 +1,6 @@ +import json import os +from typing import Optional import pytest from fixtures.log_helper import log @@ -89,3 +91,102 @@ page_cache_size=10 # was chosen empirically for this workload. assert non_vectored_average < 8 assert vectored_average < 8 + + +# Stripe sizes in number of pages. +TINY_STRIPES = 16 +LARGE_STRIPES = 32768 + + +@pytest.mark.parametrize( + "shard_count,stripe_size", [(None, None), (4, TINY_STRIPES), (4, LARGE_STRIPES)] +) +def test_sharding_compaction( + neon_env_builder: NeonEnvBuilder, stripe_size: int, shard_count: Optional[int] +): + """ + Use small stripes, small layers, and small compaction thresholds to exercise how compaction + and image layer generation interacts with sharding. + + We are looking for bugs that might emerge from the way sharding uses sparse layer files that + only contain some of the keys in the key range covered by the layer, such as errors estimating + the size of layers that might result in too-small layer files. + """ + + compaction_target_size = 128 * 1024 + + TENANT_CONF = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": f"{128 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{compaction_target_size}", + # no PITR horizon, we specify the horizon when we request on-demand GC + "pitr_interval": "0s", + # disable background compaction and GC. We invoke it manually when we want it to happen. + "gc_period": "0s", + "compaction_period": "0s", + # create image layers eagerly: we want to exercise image layer creation in this test. + "image_creation_threshold": "1", + "image_layer_creation_check_threshold": 0, + } + + neon_env_builder.num_pageservers = 1 if shard_count is None else shard_count + env = neon_env_builder.init_start( + initial_tenant_conf=TENANT_CONF, + initial_tenant_shard_count=shard_count, + initial_tenant_shard_stripe_size=stripe_size, + ) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(64) + for _i in range(0, 10): + # Each of these does some writes then a checkpoint: because we set image_creation_threshold to 1, + # these should result in image layers each time we write some data into a shard, and also shards + # recieving less data hitting their "empty image layer" path (wherre they should skip writing the layer, + # rather than asserting) + workload.churn_rows(64) + + # Assert that we got some image layers: this is important because this test's purpose is to exercise the sharding changes + # to Timeline::create_image_layers, so if we weren't creating any image layers we wouldn't be doing our job. 
+ shard_has_image_layers = [] + for shard in env.storage_controller.locate(tenant_id): + pageserver = env.get_pageserver(shard["node_id"]) + shard_id = shard["shard_id"] + layer_map = pageserver.http_client().layer_map_info(shard_id, timeline_id) + image_layer_sizes = {} + for layer in layer_map.historic_layers: + if layer.kind == "Image": + image_layer_sizes[layer.layer_file_name] = layer.layer_file_size + + # Pageserver should assert rather than emit an empty layer file, but double check here + assert layer.layer_file_size is not None + assert layer.layer_file_size > 0 + + shard_has_image_layers.append(len(image_layer_sizes) > 1) + log.info(f"Shard {shard_id} image layer sizes: {json.dumps(image_layer_sizes, indent=2)}") + + if stripe_size == TINY_STRIPES: + # Checking the average size validates that our keyspace partitioning is properly respecting sharding: if + # it was not, we would tend to get undersized layers because the partitioning would overestimate the physical + # data in a keyrange. + # + # We only do this check with tiny stripes, because large stripes may not give all shards enough + # data to have statistically significant image layers + avg_size = sum(v for v in image_layer_sizes.values()) / len(image_layer_sizes) # type: ignore + log.info(f"Shard {shard_id} average image layer size: {avg_size}") + assert avg_size > compaction_target_size / 2 + + if stripe_size == TINY_STRIPES: + # Expect writes were scattered across all pageservers: they should all have compacted some image layers + assert all(shard_has_image_layers) + else: + # With large stripes, it is expected that most of our writes went to one pageserver, so we just require + # that at least one of them has some image layers. + assert any(shard_has_image_layers) + + # Assert that everything is still readable + workload.validate() From 577982b7782aceaa0782ef4295663d72d39b09aa Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 30 Apr 2024 11:04:54 +0100 Subject: [PATCH 0650/1571] pageserver: remove workarounds from #7454 (#7550) PR #7454 included a workaround that let any existing bugged databases start up. Having used that already, we may now Closes: https://github.com/neondatabase/neon/issues/7480 --- libs/pageserver_api/src/shard.rs | 18 ------------------ pageserver/src/basebackup.rs | 17 ++--------------- 2 files changed, 2 insertions(+), 33 deletions(-) diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 2d7f6772b2..d769b2fd2f 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -538,24 +538,6 @@ impl ShardIdentity { } } - /// Special case for issue `` - /// - /// When we fail to read a forknum block, this function tells us whether we may ignore the error - /// as a symptom of that issue. - pub fn is_key_buggy_forknum(&self, key: &Key) -> bool { - if !is_rel_block_key(key) || key.field5 != INIT_FORKNUM { - return false; - } - - let mut hash = murmurhash32(key.field4); - hash = hash_combine(hash, murmurhash32(key.field6 / self.stripe_size.0)); - let mapped_shard = ShardNumber((hash % self.count.0 as u32) as u8); - - // The key may be affected by issue #7454: it is an initfork and it would not - // have mapped to shard 0 until we fixed that issue. - mapped_shard != ShardNumber(0) - } - /// Return true if the key should be discarded if found in this shard's /// data store, e.g. during compaction after a split. 
/// diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 8c51e93643..53abd8bfb9 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -13,7 +13,7 @@ use anyhow::{anyhow, bail, ensure, Context}; use bytes::{BufMut, Bytes, BytesMut}; use fail::fail_point; -use pageserver_api::key::{key_to_slru_block, rel_block_to_key, Key}; +use pageserver_api::key::{key_to_slru_block, Key}; use postgres_ffi::pg_constants; use std::fmt::Write as FmtWrite; use std::time::SystemTime; @@ -300,20 +300,7 @@ where if rel.forknum == INIT_FORKNUM { // I doubt we need _init fork itself, but having it at least // serves as a marker relation is unlogged. - if let Err(_e) = self.add_rel(rel, rel).await { - if self - .timeline - .get_shard_identity() - .is_key_buggy_forknum(&rel_block_to_key(rel, 0x0)) - { - // Workaround https://github.com/neondatabase/neon/issues/7451 -- if we have an unlogged relation - // whose INIT_FORKNUM is not correctly on shard zero, then omit it in the basebackup. This allows - // postgres to start up. The relation won't work, but it will be possible to DROP TABLE on it and - // recreate. - tracing::warn!("Omitting relation {rel} for issue #7451: drop and recreate this unlogged relation"); - continue; - } - }; + self.add_rel(rel, rel).await?; self.add_rel(rel, rel.with_forknum(MAIN_FORKNUM)).await?; continue; } From 84b6b95783eaecea06b40e2e87ddcdd70aa9e504 Mon Sep 17 00:00:00 2001 From: Cihan Demirci <128653800+fcdm@users.noreply.github.com> Date: Tue, 30 Apr 2024 14:17:01 +0100 Subject: [PATCH 0651/1571] docs: fix unintentional file link (#7506) Not sure if this should actually be a link pointing to the `persistence.rs` file but following the conventions of the rest of the file, change `persistence.rs` reference to simply be a file name mention. --- docs/storage_controller.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/storage_controller.md b/docs/storage_controller.md index 4cb796edaa..daf4d0c8b7 100644 --- a/docs/storage_controller.md +++ b/docs/storage_controller.md @@ -30,7 +30,7 @@ The storage controller uses a postgres database to persist a subset of its state persist the _relationships_ between them: the attachment state of a tenant's shards to nodes is kept in memory and rebuilt on startup. -The file `[persistence.rs](http://persistence.rs)` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why. +The file `persistence.rs` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why. The `diesel` crate is used for defining models & migrations. From 45c625fb349c3dbe711e5868bfa389da298bc960 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Tue, 30 Apr 2024 09:39:10 -0400 Subject: [PATCH 0652/1571] feat(pageserver): separate sparse and dense keyspace (#7503) extracted (and tested) from https://github.com/neondatabase/neon/pull/7468, part of https://github.com/neondatabase/neon/issues/7462. The current codebase assumes the keyspace is dense -- which means that if we have a keyspace of 0x00-0x100, we assume every key (e.g., 0x00, 0x01, 0x02, ...) exists in the storage engine. However, the assumption does not hold any more in metadata keyspace. The metadata keyspace is sparse. It is impossible to do per-key check. Ideally, we should not have the assumption of dense keyspace at all, but this would incur a lot of refactors. 
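To make the problem concrete: the dense assumption is what lets read paths walk a range one key at a time (the pageserver has several loops of exactly this shape). A minimal illustrative sketch, not code added by this patch:

    use std::ops::Range;

    use pageserver_api::key::Key;

    // Fine for a dense range: (almost) every key visited actually exists.
    fn visit_each_key(range: Range<Key>, mut visit: impl FnMut(Key)) {
        let mut key = range.start;
        while key < range.end {
            visit(key);
            key = key.next();
        }
    }
    // For the sparse metadata keyspace, the distance from range.start to range.end can be
    // astronomically larger than the number of keys actually present, so per-key iteration
    // and per-key existence checks stop being feasible.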
Therefore, we split the keyspaces we have to dense/sparse and handle them differently in the code for now. At some point in the future, we should assume all keyspaces are sparse. ## Summary of changes * Split collect_keyspace to return dense+sparse keyspace. * Do not allow generating image layers for sparse keyspace (for now -- will fix this next week, we need image layers anyways). * Generate delta layers for sparse keyspace. --------- Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/keyspace.rs | 27 ++ .../pageserver_api/src/models/partitioning.rs | 14 +- pageserver/src/http/routes.rs | 6 +- pageserver/src/pgdatadir_mapping.rs | 12 +- pageserver/src/tenant/layer_map.rs | 1 + .../tenant/storage_layer/inmemory_layer.rs | 24 +- pageserver/src/tenant/timeline.rs | 251 ++++++++++++------ pageserver/src/tenant/timeline/compaction.rs | 37 ++- 8 files changed, 269 insertions(+), 103 deletions(-) diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index 4283da18ab..a9ad3aca18 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -17,6 +17,10 @@ pub struct KeySpace { pub ranges: Vec>, } +/// A wrapper type for sparse keyspaces. +#[derive(Clone, Debug, Default, PartialEq, Eq)] +pub struct SparseKeySpace(pub KeySpace); + /// Represents a contiguous half-open range of the keyspace, masked according to a particular /// ShardNumber's stripes: within this range of keys, only some "belong" to the current /// shard. @@ -435,10 +439,33 @@ pub struct KeyPartitioning { pub parts: Vec, } +/// Represents a partitioning of the sparse key space. +#[derive(Clone, Debug, Default)] +pub struct SparseKeyPartitioning { + pub parts: Vec, +} + impl KeyPartitioning { pub fn new() -> Self { KeyPartitioning { parts: Vec::new() } } + + /// Convert a key partitioning to a sparse partition. + pub fn into_sparse(self) -> SparseKeyPartitioning { + SparseKeyPartitioning { + parts: self.parts.into_iter().map(SparseKeySpace).collect(), + } + } +} + +impl SparseKeyPartitioning { + /// Note: use this function with caution. Attempt to handle a sparse keyspace in the same way as a dense keyspace will + /// cause long/dead loops. 
+ pub fn into_dense(self) -> KeyPartitioning { + KeyPartitioning { + parts: self.parts.into_iter().map(|x| x.0).collect(), + } + } } /// diff --git a/libs/pageserver_api/src/models/partitioning.rs b/libs/pageserver_api/src/models/partitioning.rs index 0d287f7be0..f6644be635 100644 --- a/libs/pageserver_api/src/models/partitioning.rs +++ b/libs/pageserver_api/src/models/partitioning.rs @@ -1,9 +1,11 @@ use utils::lsn::Lsn; +use crate::keyspace::SparseKeySpace; + #[derive(Debug, PartialEq, Eq)] pub struct Partitioning { pub keys: crate::keyspace::KeySpace, - + pub sparse_keys: crate::keyspace::SparseKeySpace, pub at_lsn: Lsn, } @@ -32,6 +34,8 @@ impl serde::Serialize for Partitioning { let mut map = serializer.serialize_map(Some(2))?; map.serialize_key("keys")?; map.serialize_value(&KeySpace(&self.keys))?; + map.serialize_key("sparse_keys")?; + map.serialize_value(&KeySpace(&self.sparse_keys.0))?; map.serialize_key("at_lsn")?; map.serialize_value(&WithDisplay(&self.at_lsn))?; map.end() @@ -99,6 +103,7 @@ impl<'a> serde::Deserialize<'a> for Partitioning { #[derive(serde::Deserialize)] struct De { keys: KeySpace, + sparse_keys: KeySpace, #[serde_as(as = "serde_with::DisplayFromStr")] at_lsn: Lsn, } @@ -107,6 +112,7 @@ impl<'a> serde::Deserialize<'a> for Partitioning { Ok(Self { at_lsn: de.at_lsn, keys: de.keys.0, + sparse_keys: SparseKeySpace(de.sparse_keys.0), }) } } @@ -133,6 +139,12 @@ mod tests { "030000000000000000000000000000000003" ] ], + "sparse_keys": [ + [ + "620000000000000000000000000000000000", + "620000000000000000000000000000000003" + ] + ], "at_lsn": "0/2240160" } "#; diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 9a280c2e0c..ae1e7aac78 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1918,12 +1918,14 @@ async fn timeline_collect_keyspace( let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn()); - let keys = timeline + let (dense_ks, sparse_ks) = timeline .collect_keyspace(at_lsn, &ctx) .await .map_err(|e| ApiError::InternalServerError(e.into()))?; - let res = pageserver_api::models::partitioning::Partitioning { keys, at_lsn }; + // This API is currently used by pagebench. Pagebench will iterate all keys within the keyspace. + // Therefore, we split dense/sparse keys in this API. + let res = pageserver_api::models::partitioning::Partitioning { keys: dense_ks, sparse_keys: sparse_ks, at_lsn }; json_response(StatusCode::OK, res) } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index c76c2d5451..015191b875 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -23,6 +23,7 @@ use pageserver_api::key::{ slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY, }; +use pageserver_api::keyspace::SparseKeySpace; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::BLCKSZ; @@ -730,11 +731,13 @@ impl Timeline { /// Get a KeySpace that covers all the Keys that are in use at the given LSN. /// Anything that's not listed maybe removed from the underlying storage (from /// that LSN forwards). 
+ /// + /// The return value is (dense keyspace, sparse keyspace). pub(crate) async fn collect_keyspace( &self, lsn: Lsn, ctx: &RequestContext, - ) -> Result { + ) -> Result<(KeySpace, SparseKeySpace), CollectKeySpaceError> { // Iterate through key ranges, greedily packing them into partitions let mut result = KeySpaceAccum::new(); @@ -806,7 +809,12 @@ impl Timeline { if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() { result.add_key(AUX_FILES_KEY); } - Ok(result.to_keyspace()) + + Ok(( + result.to_keyspace(), + /* AUX sparse key space */ + SparseKeySpace(KeySpace::single(Key::metadata_aux_key_range())), + )) } /// Get cached size of relation if it not updated after specified LSN diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 4c4cd90c99..3c4de8fe4d 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -916,6 +916,7 @@ mod tests { assert_eq!(lhs, rhs); } + #[cfg(test)] fn brute_force_range_search( layer_map: &LayerMap, key_range: Range, diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 1a85481e97..a2ae8ec29d 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -597,14 +597,17 @@ impl InMemoryLayer { } } - /// Write this frozen in-memory layer to disk. + /// Write this frozen in-memory layer to disk. If `key_range` is set, the delta + /// layer will only contain the key range the user specifies, and may return `None` + /// if there are no matching keys. /// /// Returns a new delta layer with all the same data as this in-memory layer pub(crate) async fn write_to_disk( &self, timeline: &Arc, ctx: &RequestContext, - ) -> Result { + key_range: Option>, + ) -> Result> { // Grab the lock in read-mode. We hold it over the I/O, but because this // layer is not writeable anymore, no one should be trying to acquire the // write lock on it, so we shouldn't block anyone. 
There's one exception @@ -618,6 +621,21 @@ impl InMemoryLayer { let end_lsn = *self.end_lsn.get().unwrap(); + let keys: Vec<_> = if let Some(key_range) = key_range { + inner + .index + .iter() + .filter(|(k, _)| key_range.contains(k)) + .map(|(k, m)| (k.to_i128(), m)) + .collect() + } else { + inner.index.iter().map(|(k, m)| (k.to_i128(), m)).collect() + }; + + if keys.is_empty() { + return Ok(None); + } + let mut delta_layer_writer = DeltaLayerWriter::new( self.conf, self.timeline_id, @@ -649,6 +667,6 @@ impl InMemoryLayer { // MAX is used here because we identify L0 layers by full key range let delta_layer = delta_layer_writer.finish(Key::MAX, timeline).await?; - Ok(delta_layer) + Ok(Some(delta_layer)) } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index c5068386d6..2a2c5d4ee5 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -17,7 +17,7 @@ use fail::fail_point; use once_cell::sync::Lazy; use pageserver_api::{ key::{AUX_FILES_KEY, NON_INHERITED_RANGE}, - keyspace::KeySpaceAccum, + keyspace::{KeySpaceAccum, SparseKeyPartitioning}, models::{ CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, TimelineState, @@ -55,7 +55,6 @@ use std::{ ops::ControlFlow, }; -use crate::deletion_queue::DeletionQueueClient; use crate::tenant::timeline::logical_size::CurrentLogicalSize; use crate::tenant::{ layer_map::{LayerMap, SearchResult}, @@ -66,6 +65,7 @@ use crate::{ disk_usage_eviction_task::DiskUsageEvictionInfo, pgdatadir_mapping::CollectKeySpaceError, }; +use crate::{deletion_queue::DeletionQueueClient, metrics::GetKind}; use crate::{ disk_usage_eviction_task::finite_f32, tenant::storage_layer::{ @@ -86,7 +86,7 @@ use crate::{ use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace}; use crate::metrics::{ - GetKind, TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT, + TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT, }; use crate::pgdatadir_mapping::CalculateLogicalSizeError; use crate::tenant::config::TenantConfOpt; @@ -137,6 +137,25 @@ pub(super) enum FlushLoopState { Exited, } +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub enum ImageLayerCreationMode { + /// Try to create image layers based on `time_for_new_image_layer`. Used in compaction code path. + Try, + /// Force creating the image layers if possible. For now, no image layers will be created + /// for metadata keys. Used in compaction code path with force flag enabled. + Force, + /// Initial ingestion of the data, and no data should be dropped in this function. This + /// means that no metadata keys should be included in the partitions. Used in flush frozen layer + /// code path. + Initial, +} + +impl std::fmt::Display for ImageLayerCreationMode { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:?}", self) + } +} + /// Wrapper for key range to provide reverse ordering by range length for BinaryHeap #[derive(Debug, Clone, PartialEq, Eq)] pub(crate) struct Hole { @@ -317,7 +336,7 @@ pub struct Timeline { pub initdb_lsn: Lsn, /// When did we last calculate the partitioning? - partitioning: tokio::sync::Mutex<(KeyPartitioning, Lsn)>, + partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>, /// Configuration: how often should the partitioning be recalculated. 
repartition_threshold: u64, @@ -2104,7 +2123,10 @@ impl Timeline { // initial logical size is 0. LogicalSize::empty_initial() }, - partitioning: tokio::sync::Mutex::new((KeyPartitioning::new(), Lsn(0))), + partitioning: tokio::sync::Mutex::new(( + (KeyPartitioning::new(), KeyPartitioning::new().into_sparse()), + Lsn(0), + )), repartition_threshold: 0, last_image_layer_creation_check_at: AtomicLsn::new(0), @@ -3106,7 +3128,6 @@ impl Timeline { if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) { let layer = guard.get_from_desc(&layer); drop(guard); - // Get all the data needed to reconstruct the page version from this layer. // But if we have an older cached page image, no need to go past that. let lsn_floor = max(cached_lsn + 1, lsn_floor); @@ -3227,7 +3248,7 @@ impl Timeline { Ok(()) } - /// Collect the reconstruct data for a ketspace from the specified timeline. + /// Collect the reconstruct data for a keyspace from the specified timeline. /// /// Maintain a fringe [`LayerFringe`] which tracks all the layers that intersect /// the current keyspace. The current keyspace of the search at any given timeline @@ -3656,66 +3677,103 @@ impl Timeline { // files instead. This is possible as long as *all* the data imported into the // repository have the same LSN. let lsn_range = frozen_layer.get_lsn_range(); - let (layers_to_upload, delta_layer_to_add) = - if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) { - #[cfg(test)] - match &mut *self.flush_loop_state.lock().unwrap() { - FlushLoopState::NotStarted | FlushLoopState::Exited => { - panic!("flush loop not running") - } - FlushLoopState::Running { - initdb_optimization_count, - .. - } => { + + // Whether to directly create image layers for this flush, or flush them as delta layers + let create_image_layer = + lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1); + + #[cfg(test)] + { + match &mut *self.flush_loop_state.lock().unwrap() { + FlushLoopState::NotStarted | FlushLoopState::Exited => { + panic!("flush loop not running") + } + FlushLoopState::Running { + expect_initdb_optimization, + initdb_optimization_count, + .. + } => { + if create_image_layer { *initdb_optimization_count += 1; - } - } - // Note: The 'ctx' in use here has DownloadBehavior::Error. We should not - // require downloading anything during initial import. - let (partitioning, _lsn) = self - .repartition( - self.initdb_lsn, - self.get_compaction_target_size(), - EnumSet::empty(), - ctx, - ) - .await?; - - if self.cancel.is_cancelled() { - return Err(FlushLayerError::Cancelled); - } - - // For image layers, we add them immediately into the layer map. - ( - self.create_image_layers(&partitioning, self.initdb_lsn, true, ctx) - .await?, - None, - ) - } else { - #[cfg(test)] - match &mut *self.flush_loop_state.lock().unwrap() { - FlushLoopState::NotStarted | FlushLoopState::Exited => { - panic!("flush loop not running") - } - FlushLoopState::Running { - expect_initdb_optimization, - .. - } => { + } else { assert!(!*expect_initdb_optimization, "expected initdb optimization"); } } - // Normal case, write out a L0 delta layer file. - // `create_delta_layer` will not modify the layer map. - // We will remove frozen layer and add delta layer in one atomic operation later. 
- let layer = self.create_delta_layer(&frozen_layer, ctx).await?; - ( - // FIXME: even though we have a single image and single delta layer assumption - // we push them to vec - vec![layer.clone()], - Some(layer), + } + } + + let (layers_to_upload, delta_layer_to_add) = if create_image_layer { + // Note: The 'ctx' in use here has DownloadBehavior::Error. We should not + // require downloading anything during initial import. + let ((rel_partition, metadata_partition), _lsn) = self + .repartition( + self.initdb_lsn, + self.get_compaction_target_size(), + EnumSet::empty(), + ctx, ) + .await?; + + if self.cancel.is_cancelled() { + return Err(FlushLayerError::Cancelled); + } + + // For metadata, always create delta layers. + let delta_layer = if !metadata_partition.parts.is_empty() { + assert_eq!( + metadata_partition.parts.len(), + 1, + "currently sparse keyspace should only contain a single aux file keyspace" + ); + let metadata_keyspace = &metadata_partition.parts[0]; + assert_eq!( + metadata_keyspace.0.ranges.len(), + 1, + "aux file keyspace should be a single range" + ); + self.create_delta_layer( + &frozen_layer, + ctx, + Some(metadata_keyspace.0.ranges[0].clone()), + ) + .await? + } else { + None }; + // For image layers, we add them immediately into the layer map. + let mut layers_to_upload = Vec::new(); + layers_to_upload.extend( + self.create_image_layers( + &rel_partition, + self.initdb_lsn, + ImageLayerCreationMode::Initial, + ctx, + ) + .await?, + ); + + if let Some(delta_layer) = delta_layer { + layers_to_upload.push(delta_layer.clone()); + (layers_to_upload, Some(delta_layer)) + } else { + (layers_to_upload, None) + } + } else { + // Normal case, write out a L0 delta layer file. + // `create_delta_layer` will not modify the layer map. + // We will remove frozen layer and add delta layer in one atomic operation later. + let Some(layer) = self.create_delta_layer(&frozen_layer, ctx, None).await? else { + panic!("delta layer cannot be empty if no filter is applied"); + }; + ( + // FIXME: even though we have a single image and single delta layer assumption + // we push them to vec + vec![layer.clone()], + Some(layer), + ) + }; + pausable_failpoint!("flush-layer-cancel-after-writing-layer-out-pausable"); if self.cancel.is_cancelled() { @@ -3835,12 +3893,18 @@ impl Timeline { self: &Arc, frozen_layer: &Arc, ctx: &RequestContext, - ) -> anyhow::Result { + key_range: Option>, + ) -> anyhow::Result> { let self_clone = Arc::clone(self); let frozen_layer = Arc::clone(frozen_layer); let ctx = ctx.attached_child(); let work = async move { - let new_delta = frozen_layer.write_to_disk(&self_clone, &ctx).await?; + let Some(new_delta) = frozen_layer + .write_to_disk(&self_clone, &ctx, key_range) + .await? + else { + return Ok(None); + }; // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes. // We just need to fsync the directory in which these inodes are linked, // which we know to be the timeline directory. @@ -3859,7 +3923,7 @@ impl Timeline { .sync_all() .await .fatal_err("VirtualFile::sync_all timeline dir"); - anyhow::Ok(new_delta) + anyhow::Ok(Some(new_delta)) }; // Before tokio-epoll-uring, we ran write_to_disk & the sync_all inside spawn_blocking. // Preserve that behavior to maintain the same behavior for `virtual_file_io_engine=std-fs`. 
@@ -3886,19 +3950,20 @@ impl Timeline { partition_size: u64, flags: EnumSet, ctx: &RequestContext, - ) -> anyhow::Result<(KeyPartitioning, Lsn)> { + ) -> anyhow::Result<((KeyPartitioning, SparseKeyPartitioning), Lsn)> { let Ok(mut partitioning_guard) = self.partitioning.try_lock() else { // NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline. // The other is the initdb optimization in flush_frozen_layer, used by `boostrap_timeline`, which runs before `.activate()` // and hence before the compaction task starts. anyhow::bail!("repartition() called concurrently, this should not happen"); }; - if lsn < partitioning_guard.1 { + let ((dense_partition, sparse_partition), partition_lsn) = &*partitioning_guard; + if lsn < *partition_lsn { anyhow::bail!("repartition() called with LSN going backwards, this should not happen"); } - let distance = lsn.0 - partitioning_guard.1 .0; - if partitioning_guard.1 != Lsn(0) + let distance = lsn.0 - partition_lsn.0; + if *partition_lsn != Lsn(0) && distance <= self.repartition_threshold && !flags.contains(CompactFlags::ForceRepartition) { @@ -3907,13 +3972,18 @@ impl Timeline { threshold = self.repartition_threshold, "no repartitioning needed" ); - return Ok((partitioning_guard.0.clone(), partitioning_guard.1)); + return Ok(( + (dense_partition.clone(), sparse_partition.clone()), + *partition_lsn, + )); } - let keyspace = self.collect_keyspace(lsn, ctx).await?; - let partitioning = keyspace.partition(&self.shard_identity, partition_size); - - *partitioning_guard = (partitioning, lsn); + let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?; + let dense_partitioning = dense_ks.partition(&self.shard_identity, partition_size); + let sparse_partitioning = SparseKeyPartitioning { + parts: vec![sparse_ks], + }; // no partitioning for metadata keys for now + *partitioning_guard = ((dense_partitioning, sparse_partitioning), lsn); Ok((partitioning_guard.0.clone(), partitioning_guard.1)) } @@ -3969,12 +4039,12 @@ impl Timeline { false } - #[tracing::instrument(skip_all, fields(%lsn, %force))] + #[tracing::instrument(skip_all, fields(%lsn, %mode))] async fn create_image_layers( self: &Arc, partitioning: &KeyPartitioning, lsn: Lsn, - force: bool, + mode: ImageLayerCreationMode, ctx: &RequestContext, ) -> Result, CreateImageLayersError> { let timer = self.metrics.create_images_time_histo.start_timer(); @@ -4011,19 +4081,26 @@ impl Timeline { for partition in partitioning.parts.iter() { let img_range = start..partition.ranges.last().unwrap().end; - let do_it = if force { - true - } else if check_for_image_layers { - // [`Self::time_for_new_image_layer`] is CPU expensive, - // so skip if we've not collected enough WAL since the last time - self.time_for_new_image_layer(partition, lsn).await - } else { - false - }; - - if !do_it { - start = img_range.end; - continue; + if partition.overlaps(&Key::metadata_key_range()) { + // TODO(chi): The next patch will correctly create image layers for metadata keys, and it would be a + // rather big change. Keep this patch small for now. + match mode { + ImageLayerCreationMode::Force | ImageLayerCreationMode::Try => { + // skip image layer creation anyways for metadata keys. 
+ start = img_range.end; + continue; + } + ImageLayerCreationMode::Initial => { + return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers"))); + } + } + } else if let ImageLayerCreationMode::Try = mode { + // check_for_image_layers = false -> skip + // check_for_image_layers = true -> check time_for_new_image_layer -> skip/generate + if !check_for_image_layers || !self.time_for_new_image_layer(partition, lsn).await { + start = img_range.end; + continue; + } } let mut image_layer_writer = ImageLayerWriter::new( diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index b92832a3de..6ea37bf793 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -9,7 +9,7 @@ use std::ops::{Deref, Range}; use std::sync::Arc; use super::layer_manager::LayerManager; -use super::{CompactFlags, DurationRecorder, RecordedDuration, Timeline}; +use super::{CompactFlags, DurationRecorder, ImageLayerCreationMode, RecordedDuration, Timeline}; use anyhow::{anyhow, Context}; use enumset::EnumSet; @@ -102,7 +102,7 @@ impl Timeline { ) .await { - Ok((partitioning, lsn)) => { + Ok(((dense_partitioning, sparse_partitioning), lsn)) => { // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them let image_ctx = RequestContextBuilder::extend(ctx) .access_stats_behavior(AccessStatsBehavior::Skip) @@ -115,17 +115,37 @@ impl Timeline { // 3. Create new image layers for partitions that have been modified // "enough". - let layers = self + let dense_layers = self .create_image_layers( - &partitioning, + &dense_partitioning, lsn, - flags.contains(CompactFlags::ForceImageLayerCreation), + if flags.contains(CompactFlags::ForceImageLayerCreation) { + ImageLayerCreationMode::Force + } else { + ImageLayerCreationMode::Try + }, &image_ctx, ) .await .map_err(anyhow::Error::from)?; - self.upload_new_image_layers(layers)?; + // For now, nothing will be produced... + let sparse_layers = self + .create_image_layers( + &sparse_partitioning.clone().into_dense(), + lsn, + if flags.contains(CompactFlags::ForceImageLayerCreation) { + ImageLayerCreationMode::Force + } else { + ImageLayerCreationMode::Try + }, + &image_ctx, + ) + .await + .map_err(anyhow::Error::from)?; + assert!(sparse_layers.is_empty()); + + self.upload_new_image_layers(dense_layers)?; } Err(err) => { // no partitioning? This is normal, if the timeline was just created @@ -758,8 +778,9 @@ impl Timeline { return Err(CompactionError::ShuttingDown); } - let keyspace = self.collect_keyspace(end_lsn, ctx).await?; - let mut adaptor = TimelineAdaptor::new(self, (end_lsn, keyspace)); + let (dense_ks, _sparse_ks) = self.collect_keyspace(end_lsn, ctx).await?; + // TODO(chi): ignore sparse_keyspace for now, compact it in the future. + let mut adaptor = TimelineAdaptor::new(self, (end_lsn, dense_ks)); pageserver_compaction::compact_tiered::compact_tiered( &mut adaptor, From eb53345d48b14d2ad474a8983a09c42d82ca2e5d Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 30 Apr 2024 15:16:15 +0100 Subject: [PATCH 0653/1571] pageserver: reduce runtime of init_tenant_mgr (#7553) ## Problem `init_tenant_mgr` blocks the rest of pageserver startup, including starting the admin API. This was noticeable in #7475 , where the init_tenant_mgr runtime could be long enough to trip the controller's 30 second heartbeat timeout. 
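The core of the `mgr.rs` change below is to stop awaiting the per-tenant config
writes one by one and instead run them as a bounded-concurrency batch; a rough,
self-contained sketch of that pattern (the config write itself is stubbed out
here as a sleep):

```
// Sketch only: the real code builds futures around Tenant::persist_tenant_config.
use futures::StreamExt;
use std::time::Duration;

#[tokio::main]
async fn main() {
    // One future per tenant shard; nothing runs until the stream polls it.
    let config_write_futs = (0..100u32).map(|shard| async move {
        tokio::time::sleep(Duration::from_millis(10)).await; // stand-in for the write
        (shard, Ok::<(), std::io::Error>(()))
    });

    // At most 16 writes in flight at a time, instead of 100 sequential awaits.
    let results: Vec<_> = futures::stream::iter(config_write_futs)
        .buffer_unordered(16)
        .collect()
        .await;

    // Errors writing configs stay fatal, as in init_tenant_mgr.
    for (shard, result) in results {
        result.unwrap_or_else(|e| panic!("config write failed for shard {shard}: {e}"));
    }
}
```
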
## Summary of changes - When detaching tenants during startup, spawn the background deletes as background tasks instead of doing them inline - Write all configs before spawning any tenants, so that the config writes aren't fighting tenants for system resources - Write configs with some concurrency (16) rather than writing them all sequentially. --- pageserver/src/tenant/mgr.rs | 105 +++++++++++++++++++++++------------ 1 file changed, 68 insertions(+), 37 deletions(-) diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 2c9476ba0a..006d501daa 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -2,6 +2,7 @@ //! page server. use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf}; +use futures::StreamExt; use itertools::Itertools; use pageserver_api::key::Key; use pageserver_api::models::LocationConfigMode; @@ -253,17 +254,15 @@ impl TenantsMap { } } +/// Precursor to deletion of a tenant dir: we do a fast rename to a tmp path, and then +/// the slower actual deletion in the background. +/// /// This is "safe" in that that it won't leave behind a partially deleted directory /// at the original path, because we rename with TEMP_FILE_SUFFIX before starting deleting /// the contents. /// /// This is pageserver-specific, as it relies on future processes after a crash to check /// for TEMP_FILE_SUFFIX when loading things. -async fn safe_remove_tenant_dir_all(path: impl AsRef) -> std::io::Result<()> { - let tmp_path = safe_rename_tenant_dir(path).await?; - fs::remove_dir_all(tmp_path).await -} - async fn safe_rename_tenant_dir(path: impl AsRef) -> std::io::Result { let parent = path .as_ref() @@ -286,6 +285,28 @@ async fn safe_rename_tenant_dir(path: impl AsRef) -> std::io::Result> = Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing)); @@ -570,7 +591,11 @@ pub async fn init_tenant_mgr( ); TENANT.startup_scheduled.inc_by(tenant_configs.len() as u64); - // Construct `Tenant` objects and start them running + // Accumulate futures for writing tenant configs, so that we can execute in parallel + let mut config_write_futs = Vec::new(); + + // Update the location configs according to the re-attach response and persist them to disk + tracing::info!("Updating {} location configs", tenant_configs.len()); for (tenant_shard_id, location_conf) in tenant_configs { let tenant_dir_path = conf.tenant_path(&tenant_shard_id); @@ -597,18 +622,22 @@ pub async fn init_tenant_mgr( const DEFAULT_SECONDARY_CONF: SecondaryLocationConfig = SecondaryLocationConfig { warm: true }; - // Update the location config according to the re-attach response if let Some(tenant_modes) = &tenant_modes { // We have a generation map: treat it as the authority for whether // this tenant is really attached. 
match tenant_modes.get(&tenant_shard_id) { None => { info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response"); - if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await { - error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), - "Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}", - ); - } + + match safe_rename_tenant_dir(&tenant_dir_path).await { + Ok(tmp_path) => { + spawn_background_purge(tmp_path); + } + Err(e) => { + error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), + "Failed to move detached tenant directory '{tenant_dir_path}': {e:?}"); + } + }; // We deleted local content: move on to next tenant, don't try and spawn this one. continue; @@ -654,8 +683,32 @@ pub async fn init_tenant_mgr( // Presence of a generation number implies attachment: attach the tenant // if it wasn't already, and apply the generation number. - Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?; + config_write_futs.push(async move { + let r = Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await; + (tenant_shard_id, location_conf, r) + }); + } + // Execute config writes with concurrency, to avoid bottlenecking on local FS write latency + tracing::info!( + "Writing {} location config files...", + config_write_futs.len() + ); + let config_write_results = futures::stream::iter(config_write_futs) + .buffer_unordered(16) + .collect::>() + .await; + + tracing::info!( + "Spawning {} tenant shard locations...", + config_write_results.len() + ); + // For those shards that have live configurations, construct `Tenant` or `SecondaryTenant` objects and start them running + for (tenant_shard_id, location_conf, config_write_result) in config_write_results { + // Errors writing configs are fatal + config_write_result?; + + let tenant_dir_path = conf.tenant_path(&tenant_shard_id); let shard_identity = location_conf.shard; let slot = match location_conf.mode { LocationMode::Attached(attached_conf) => { @@ -1699,7 +1752,7 @@ impl TenantManager { let tmp_path = safe_rename_tenant_dir(&local_tenant_directory) .await .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?; - self.spawn_background_purge(tmp_path); + spawn_background_purge(tmp_path); fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!( "failpoint" @@ -1854,28 +1907,6 @@ impl TenantManager { shutdown_all_tenants0(self.tenants).await } - /// When we have moved a tenant's content to a temporary directory, we may delete it lazily in - /// the background, and thereby avoid blocking any API requests on this deletion completing. - fn spawn_background_purge(&self, tmp_path: Utf8PathBuf) { - // Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory. - // After a tenant is detached, there are no more task_mgr tasks for that tenant_id. 
- let task_tenant_id = None; - - task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), - TaskKind::MgmtRequest, - task_tenant_id, - None, - "tenant_files_delete", - false, - async move { - fs::remove_dir_all(tmp_path.as_path()) - .await - .with_context(|| format!("tenant directory {:?} deletion", tmp_path)) - }, - ); - } - pub(crate) async fn detach_tenant( &self, conf: &'static PageServerConf, @@ -1892,7 +1923,7 @@ impl TenantManager { deletion_queue_client, ) .await?; - self.spawn_background_purge(tmp_path); + spawn_background_purge(tmp_path); Ok(()) } From 010f0a310a83b5ab7101165ade9f3284a69a4bfc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 30 Apr 2024 16:52:54 +0200 Subject: [PATCH 0654/1571] Make test_random_updates and test_read_at_max_lsn compatible with new compaction (#7551) Makes two of the tests work with the tiered compaction that I had to ignore in #7283. The issue was that tiered compaction actually created image layers, but the keys didn't appear in them as `collect_keyspace` didn't include them. Not a compaction problem, but due to how the test is structured. Fixes #7287 --- pageserver/src/tenant.rs | 66 +++++++++++++++++++++++++++++++++------- 1 file changed, 55 insertions(+), 11 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index cb3e36efb3..05ceff2b59 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3873,6 +3873,7 @@ mod tests { use hex_literal::hex; use pageserver_api::key::NON_INHERITED_RANGE; use pageserver_api::keyspace::KeySpace; + use pageserver_api::models::CompactionAlgorithm; use rand::{thread_rng, Rng}; use tests::storage_layer::ValuesReconstructState; use tests::timeline::{GetVectoredError, ShutdownMode}; @@ -4512,11 +4513,23 @@ mod tests { } async fn bulk_insert_compact_gc( + timeline: Arc, + ctx: &RequestContext, + lsn: Lsn, + repeat: usize, + key_count: usize, + ) -> anyhow::Result<()> { + let compact = true; + bulk_insert_maybe_compact_gc(timeline, ctx, lsn, repeat, key_count, compact).await + } + + async fn bulk_insert_maybe_compact_gc( timeline: Arc, ctx: &RequestContext, mut lsn: Lsn, repeat: usize, key_count: usize, + compact: bool, ) -> anyhow::Result<()> { let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let mut blknum = 0; @@ -4557,9 +4570,11 @@ mod tests { ) .await?; timeline.freeze_and_flush().await?; - timeline - .compact(&CancellationToken::new(), EnumSet::empty(), ctx) - .await?; + if compact { + timeline + .compact(&CancellationToken::new(), EnumSet::empty(), ctx) + .await?; + } timeline.gc().await?; } @@ -5042,7 +5057,22 @@ mod tests { #[tokio::test] async fn test_random_updates() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_random_updates")?; + let names_algorithms = [ + ("test_random_updates_legacy", CompactionAlgorithm::Legacy), + ("test_random_updates_tiered", CompactionAlgorithm::Tiered), + ]; + for (name, algorithm) in names_algorithms { + test_random_updates_algorithm(name, algorithm).await?; + } + Ok(()) + } + + async fn test_random_updates_algorithm( + name: &'static str, + compaction_algorithm: CompactionAlgorithm, + ) -> anyhow::Result<()> { + let mut harness = TenantHarness::create(name)?; + harness.tenant_conf.compaction_algorithm = compaction_algorithm; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) @@ -5107,7 +5137,7 @@ mod tests { ); } - // Perform a cycle of flush, compact, and GC + // Perform a 
cycle of flush, and GC let cutoff = tline.get_last_record_lsn(); tline .update_gc_info( @@ -5119,9 +5149,6 @@ mod tests { ) .await?; tline.freeze_and_flush().await?; - tline - .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) - .await?; tline.gc().await?; } @@ -5402,19 +5429,36 @@ mod tests { #[tokio::test] async fn test_read_at_max_lsn() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_read_at_max_lsn")?; + let names_algorithms = [ + ("test_read_at_max_lsn_legacy", CompactionAlgorithm::Legacy), + ("test_read_at_max_lsn_tiered", CompactionAlgorithm::Tiered), + ]; + for (name, algorithm) in names_algorithms { + test_read_at_max_lsn_algorithm(name, algorithm).await?; + } + Ok(()) + } + + async fn test_read_at_max_lsn_algorithm( + name: &'static str, + compaction_algorithm: CompactionAlgorithm, + ) -> anyhow::Result<()> { + let mut harness = TenantHarness::create(name)?; + harness.tenant_conf.compaction_algorithm = compaction_algorithm; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; let lsn = Lsn(0x10); - bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?; + let compact = false; + bulk_insert_maybe_compact_gc(tline.clone(), &ctx, lsn, 50, 10000, compact).await?; let test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let read_lsn = Lsn(u64::MAX - 1); - assert!(tline.get(test_key, read_lsn, &ctx).await.is_ok()); + let result = tline.get(test_key, read_lsn, &ctx).await; + assert!(result.is_ok(), "result is not Ok: {}", result.unwrap_err()); Ok(()) } From 3da54e6d90c7befdff50df48206fa441c24b6e94 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 3 Apr 2024 15:46:54 +0300 Subject: [PATCH 0655/1571] s3_scrubber: implement scan-metadata for safekeepers. It works by listing postgres table with memory dump of safekeepers state. s3 contents for each timeline are checked then against timeline_start_lsn and backup_lsn. If inconsistency is found, before complaining timeline (branch) is checked at control plane; it might have been deleted between the dump take and s3 check. 
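To make the check concrete, here is a self-contained sketch of how the expected
segment set is derived from those two LSNs (helper names are simplified
stand-ins for `Lsn::segment_number` and `postgres_ffi::XLogFileName`, which the
new `scan_safekeeper_metadata.rs` below uses for the real thing):

```
// Every 16 MiB WAL segment between timeline_start_lsn and backup_lsn is
// expected to exist in the bucket; whatever is not found when listing the
// timeline's prefix under wal/ is reported as missing (unless the branch
// turns out to be deleted in the control plane).
const WAL_SEGSIZE: u64 = 16 * 1024 * 1024;
const PG_TLI: u64 = 1;

fn segment_number(lsn: u64) -> u64 {
    lsn / WAL_SEGSIZE
}

/// Standard 24-hex-digit WAL file name: timeline id, then the high and low
/// parts of the segment number, 8 hex digits each.
fn xlog_file_name(tli: u64, segno: u64) -> String {
    let segs_per_xlog_id = 0x1_0000_0000u64 / WAL_SEGSIZE;
    format!("{:08X}{:08X}{:08X}", tli, segno / segs_per_xlog_id, segno % segs_per_xlog_id)
}

fn main() {
    let timeline_start_lsn: u64 = 0x0100_0000; // 16 MiB
    let backup_lsn: u64 = 0x0400_0000; // 64 MiB

    let first = segment_number(timeline_start_lsn);
    let last = segment_number(backup_lsn);
    let expected: Vec<String> = (first..last).map(|s| xlog_file_name(PG_TLI, s)).collect();

    println!("expecting {} segment files: {:?}", expected.len(), expected);
}
```
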
--- Cargo.lock | 21 ++ Cargo.toml | 2 +- s3_scrubber/Cargo.toml | 4 + s3_scrubber/README.md | 10 +- s3_scrubber/src/lib.rs | 16 +- s3_scrubber/src/main.rs | 109 ++++++-- ...etadata.rs => scan_pageserver_metadata.rs} | 0 s3_scrubber/src/scan_safekeeper_metadata.rs | 234 ++++++++++++++++++ test_runner/fixtures/neon_fixtures.py | 4 +- 9 files changed, 363 insertions(+), 37 deletions(-) rename s3_scrubber/src/{scan_metadata.rs => scan_pageserver_metadata.rs} (100%) create mode 100644 s3_scrubber/src/scan_safekeeper_metadata.rs diff --git a/Cargo.lock b/Cargo.lock index de548bb2de..f2f06210cf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3184,6 +3184,16 @@ dependencies = [ "winapi", ] +[[package]] +name = "nu-ansi-term" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" +dependencies = [ + "overload", + "winapi", +] + [[package]] name = "num" version = "0.4.1" @@ -3520,6 +3530,12 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a" +[[package]] +name = "overload" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" + [[package]] name = "p256" version = "0.11.1" @@ -5095,8 +5111,11 @@ dependencies = [ "hex", "histogram", "itertools", + "native-tls", "pageserver", "pageserver_api", + "postgres-native-tls", + "postgres_ffi", "rand 0.8.5", "remote_storage", "reqwest", @@ -5105,6 +5124,7 @@ dependencies = [ "serde_with", "thiserror", "tokio", + "tokio-postgres", "tokio-rustls 0.25.0", "tokio-stream", "tracing", @@ -6507,6 +6527,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77" dependencies = [ "matchers", + "nu-ansi-term", "once_cell", "regex", "serde", diff --git a/Cargo.toml b/Cargo.toml index 92dcc254d4..32a0bc23e6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -180,7 +180,7 @@ tonic = {version = "0.9", features = ["tls", "tls-roots"]} tracing = "0.1" tracing-error = "0.2.0" tracing-opentelemetry = "0.20.0" -tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] } +tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] } twox-hash = { version = "1.6.3", default-features = false } url = "2.2" urlencoding = "2.1" diff --git a/s3_scrubber/Cargo.toml b/s3_scrubber/Cargo.toml index 0ee9112010..37124e6caf 100644 --- a/s3_scrubber/Cargo.toml +++ b/s3_scrubber/Cargo.toml @@ -22,7 +22,11 @@ serde_with.workspace = true workspace_hack.workspace = true utils.workspace = true async-stream.workspace = true +native-tls.workspace = true +postgres-native-tls.workspace = true +postgres_ffi.workspace = true tokio-stream.workspace = true +tokio-postgres.workspace = true futures-util.workspace = true itertools.workspace = true camino.workspace = true diff --git a/s3_scrubber/README.md b/s3_scrubber/README.md index 2f21b9f191..c1deab8852 100644 --- a/s3_scrubber/README.md +++ b/s3_scrubber/README.md @@ -67,10 +67,12 @@ the purge command will log all the keys that it would have deleted. #### `scan-metadata` -Walk objects in a pageserver S3 bucket, and report statistics on the contents. 
+Walk objects in a pageserver or safekeeper S3 bucket, and report statistics on the contents and checking consistency. +Errors are logged to stderr and summary to stdout. +For pageserver: ``` -env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata +env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata --node-kind pageserver Timelines: 31106 With errors: 3 @@ -82,6 +84,10 @@ Layer size bytes: min 24576, 1% 36879, 10% 36879, 50% 61471, 90% 44695551, 99% 2 Timeline layer count: min 1, 1% 3, 10% 6, 50% 16, 90% 25, 99% 39, max 1053 ``` +For safekeepers, dump_db_connstr and dump_db_table must be +specified; they should point to table with debug dump which will be used +to list timelines and find their backup and start LSNs. + ## Cleaning up running pageservers If S3 state is altered first manually, pageserver in-memory state will contain wrong data about S3 state, and tenants/timelines may get recreated on S3 (due to any layer upload due to compaction, pageserver restart, etc.). So before proceeding, for tenants/timelines which are already deleted in the console, we must remove these from pageservers. diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs index 90d58a3bc2..43be258150 100644 --- a/s3_scrubber/src/lib.rs +++ b/s3_scrubber/src/lib.rs @@ -4,7 +4,8 @@ pub mod checks; pub mod cloud_admin_api; pub mod garbage; pub mod metadata_stream; -pub mod scan_metadata; +pub mod scan_pageserver_metadata; +pub mod scan_safekeeper_metadata; pub mod tenant_snapshot; use std::env; @@ -141,12 +142,17 @@ impl RootTarget { pub fn tenants_root(&self) -> S3Target { match self { Self::Pageserver(root) => root.with_sub_segment(TENANTS_SEGMENT_NAME), - Self::Safekeeper(root) => root.with_sub_segment("wal"), + Self::Safekeeper(root) => root.clone(), } } pub fn tenant_root(&self, tenant_id: &TenantShardId) -> S3Target { - self.tenants_root().with_sub_segment(&tenant_id.to_string()) + match self { + Self::Pageserver(_) => self.tenants_root().with_sub_segment(&tenant_id.to_string()), + Self::Safekeeper(_) => self + .tenants_root() + .with_sub_segment(&tenant_id.tenant_id.to_string()), + } } pub(crate) fn tenant_shards_prefix(&self, tenant_id: &TenantId) -> S3Target { @@ -337,9 +343,7 @@ fn init_remote( }), NodeKind::Safekeeper => RootTarget::Safekeeper(S3Target { bucket_name: bucket_config.bucket, - prefix_in_bucket: bucket_config - .prefix_in_bucket - .unwrap_or("safekeeper/v1".to_string()), + prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or("wal".to_string()), delimiter, }), }; diff --git a/s3_scrubber/src/main.rs b/s3_scrubber/src/main.rs index 88ba9bfa61..e49c280b99 100644 --- a/s3_scrubber/src/main.rs +++ b/s3_scrubber/src/main.rs @@ -1,9 +1,13 @@ +use anyhow::bail; use camino::Utf8PathBuf; use pageserver_api::shard::TenantShardId; use s3_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; -use s3_scrubber::scan_metadata::scan_metadata; +use s3_scrubber::scan_pageserver_metadata::scan_metadata; use s3_scrubber::tenant_snapshot::SnapshotDownloader; -use s3_scrubber::{init_logging, BucketConfig, ConsoleConfig, NodeKind, TraversingDepth}; +use s3_scrubber::{ + init_logging, scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig, + NodeKind, TraversingDepth, +}; use clap::{Parser, Subcommand}; use 
utils::id::TenantId; @@ -35,11 +39,20 @@ enum Command { #[arg(short, long, default_value_t = PurgeMode::DeletedOnly)] mode: PurgeMode, }, + #[command(verbatim_doc_comment)] ScanMetadata { + #[arg(short, long)] + node_kind: NodeKind, #[arg(short, long, default_value_t = false)] json: bool, #[arg(long = "tenant-id", num_args = 0..)] tenant_ids: Vec, + #[arg(long, default_value = None)] + /// For safekeeper node_kind only, points to db with debug dump + dump_db_connstr: Option, + /// For safekeeper node_kind only, table in the db with debug dump + #[arg(long, default_value = None)] + dump_db_table: Option, }, TenantSnapshot { #[arg(long = "tenant-id")] @@ -72,33 +85,75 @@ async fn main() -> anyhow::Result<()> { )); match cli.command { - Command::ScanMetadata { json, tenant_ids } => { - match scan_metadata(bucket_config.clone(), tenant_ids).await { - Err(e) => { - tracing::error!("Failed: {e}"); - Err(e) + Command::ScanMetadata { + json, + tenant_ids, + node_kind, + dump_db_connstr, + dump_db_table, + } => { + if let NodeKind::Safekeeper = node_kind { + let dump_db_connstr = + dump_db_connstr.ok_or(anyhow::anyhow!("dump_db_connstr not specified"))?; + let dump_db_table = + dump_db_table.ok_or(anyhow::anyhow!("dump_db_table not specified"))?; + + let summary = scan_safekeeper_metadata( + bucket_config.clone(), + tenant_ids.iter().map(|tshid| tshid.tenant_id).collect(), + dump_db_connstr, + dump_db_table, + ) + .await?; + if json { + println!("{}", serde_json::to_string(&summary).unwrap()) + } else { + println!("{}", summary.summary_string()); } - Ok(summary) => { - if json { - println!("{}", serde_json::to_string(&summary).unwrap()) - } else { - println!("{}", summary.summary_string()); + if summary.is_fatal() { + bail!("Fatal scrub errors detected"); + } + if summary.is_empty() { + // Strictly speaking an empty bucket is a valid bucket, but if someone ran the + // scrubber they were likely expecting to scan something, and if we see no timelines + // at all then it's likely due to some configuration issues like a bad prefix + bail!( + "No timelines found in bucket {} prefix {}", + bucket_config.bucket, + bucket_config + .prefix_in_bucket + .unwrap_or("".to_string()) + ); + } + Ok(()) + } else { + match scan_metadata(bucket_config.clone(), tenant_ids).await { + Err(e) => { + tracing::error!("Failed: {e}"); + Err(e) } - if summary.is_fatal() { - Err(anyhow::anyhow!("Fatal scrub errors detected")) - } else if summary.is_empty() { - // Strictly speaking an empty bucket is a valid bucket, but if someone ran the - // scrubber they were likely expecting to scan something, and if we see no timelines - // at all then it's likely due to some configuration issues like a bad prefix - Err(anyhow::anyhow!( - "No timelines found in bucket {} prefix {}", - bucket_config.bucket, - bucket_config - .prefix_in_bucket - .unwrap_or("".to_string()) - )) - } else { - Ok(()) + Ok(summary) => { + if json { + println!("{}", serde_json::to_string(&summary).unwrap()) + } else { + println!("{}", summary.summary_string()); + } + if summary.is_fatal() { + Err(anyhow::anyhow!("Fatal scrub errors detected")) + } else if summary.is_empty() { + // Strictly speaking an empty bucket is a valid bucket, but if someone ran the + // scrubber they were likely expecting to scan something, and if we see no timelines + // at all then it's likely due to some configuration issues like a bad prefix + Err(anyhow::anyhow!( + "No timelines found in bucket {} prefix {}", + bucket_config.bucket, + bucket_config + .prefix_in_bucket + 
.unwrap_or("".to_string()) + )) + } else { + Ok(()) + } } } } diff --git a/s3_scrubber/src/scan_metadata.rs b/s3_scrubber/src/scan_pageserver_metadata.rs similarity index 100% rename from s3_scrubber/src/scan_metadata.rs rename to s3_scrubber/src/scan_pageserver_metadata.rs diff --git a/s3_scrubber/src/scan_safekeeper_metadata.rs b/s3_scrubber/src/scan_safekeeper_metadata.rs new file mode 100644 index 0000000000..f56bc165db --- /dev/null +++ b/s3_scrubber/src/scan_safekeeper_metadata.rs @@ -0,0 +1,234 @@ +use std::{collections::HashSet, str::FromStr}; + +use aws_sdk_s3::Client; +use futures::stream::{StreamExt, TryStreamExt}; +use pageserver_api::shard::TenantShardId; +use postgres_ffi::{XLogFileName, PG_TLI}; +use serde::Serialize; +use tokio_postgres::types::PgLsn; +use tracing::{error, info, trace}; +use utils::{ + id::{TenantId, TenantTimelineId, TimelineId}, + lsn::Lsn, +}; + +use crate::{ + cloud_admin_api::CloudAdminApiClient, init_remote, metadata_stream::stream_listing, + BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId, +}; + +/// Generally we should ask safekeepers, but so far we use everywhere default 16MB. +const WAL_SEGSIZE: usize = 16 * 1024 * 1024; + +#[derive(Serialize)] +pub struct MetadataSummary { + timeline_count: usize, + with_errors: HashSet, + deleted_count: usize, +} + +impl MetadataSummary { + fn new() -> Self { + Self { + timeline_count: 0, + with_errors: HashSet::new(), + deleted_count: 0, + } + } + + pub fn summary_string(&self) -> String { + format!( + "timeline_count: {}, with_errors: {}", + self.timeline_count, + self.with_errors.len() + ) + } + + pub fn is_empty(&self) -> bool { + self.timeline_count == 0 + } + + pub fn is_fatal(&self) -> bool { + !self.with_errors.is_empty() + } +} + +/// Scan the safekeeper metadata in an S3 bucket, reporting errors and +/// statistics. +/// +/// It works by listing timelines along with timeline_start_lsn and backup_lsn +/// in debug dump in dump_db_table and verifying its s3 contents. If some WAL +/// segments are missing, before complaining control plane is queried to check if +/// the project wasn't deleted in the meanwhile. +pub async fn scan_safekeeper_metadata( + bucket_config: BucketConfig, + tenant_ids: Vec, + dump_db_connstr: String, + dump_db_table: String, +) -> anyhow::Result { + info!( + "checking bucket {}, region {}, dump_db_table {}", + bucket_config.bucket, bucket_config.region, dump_db_table + ); + // Use the native TLS implementation (Neon requires TLS) + let tls_connector = + postgres_native_tls::MakeTlsConnector::new(native_tls::TlsConnector::new().unwrap()); + let (client, connection) = tokio_postgres::connect(&dump_db_connstr, tls_connector).await?; + // The connection object performs the actual communication with the database, + // so spawn it off to run on its own. 
+ tokio::spawn(async move { + if let Err(e) = connection.await { + eprintln!("connection error: {}", e); + } + }); + + let tenant_filter_clause = if !tenant_ids.is_empty() { + format!( + "and tenant_id in ({})", + tenant_ids + .iter() + .map(|t| format!("'{}'", t)) + .collect::>() + .join(", ") + ) + } else { + "".to_owned() + }; + let query = format!( + "select tenant_id, timeline_id, min(timeline_start_lsn), max(backup_lsn) from \"{}\" where not is_cancelled {} group by tenant_id, timeline_id;", + dump_db_table, tenant_filter_clause, + ); + info!("query is {}", query); + let timelines = client.query(&query, &[]).await?; + info!("loaded {} timelines", timelines.len()); + + let (s3_client, target) = init_remote(bucket_config, NodeKind::Safekeeper)?; + let console_config = ConsoleConfig::from_env()?; + let cloud_admin_api_client = CloudAdminApiClient::new(console_config); + + let checks = futures::stream::iter(timelines.iter().map(Ok)).map_ok(|row| { + let tenant_id = TenantId::from_str(row.get(0)).expect("failed to parse tenant_id"); + let timeline_id = TimelineId::from_str(row.get(1)).expect("failed to parse tenant_id"); + let timeline_start_lsn_pg: PgLsn = row.get(2); + let timeline_start_lsn: Lsn = Lsn(u64::from(timeline_start_lsn_pg)); + let backup_lsn_pg: PgLsn = row.get(3); + let backup_lsn: Lsn = Lsn(u64::from(backup_lsn_pg)); + let ttid = TenantTimelineId::new(tenant_id, timeline_id); + check_timeline( + &s3_client, + &target, + &cloud_admin_api_client, + ttid, + timeline_start_lsn, + backup_lsn, + ) + }); + // Run multiple check_timeline's concurrently. + const CONCURRENCY: usize = 32; + let mut timelines = checks.try_buffered(CONCURRENCY); + + let mut summary = MetadataSummary::new(); + while let Some(r) = timelines.next().await { + let res = r?; + summary.timeline_count += 1; + if !res.is_ok { + summary.with_errors.insert(res.ttid); + } + if res.is_deleted { + summary.deleted_count += 1; + } + } + + Ok(summary) +} + +struct TimelineCheckResult { + ttid: TenantTimelineId, + is_ok: bool, + is_deleted: bool, // timeline is deleted in cplane +} + +/// List s3 and check that is has all expected WAL for the ttid. Consistency +/// errors are logged to stderr; returns Ok(true) if timeline is consistent, +/// Ok(false) if not, Err if failed to check. +async fn check_timeline( + s3_client: &Client, + root: &RootTarget, + api_client: &CloudAdminApiClient, + ttid: TenantTimelineId, + timeline_start_lsn: Lsn, + backup_lsn: Lsn, +) -> anyhow::Result { + trace!( + "checking ttid {}, should contain WAL [{}-{}]", + ttid, + timeline_start_lsn, + backup_lsn + ); + // calculate expected segfiles + let expected_first_segno = timeline_start_lsn.segment_number(WAL_SEGSIZE); + let expected_last_segno = backup_lsn.segment_number(WAL_SEGSIZE); + let mut expected_segfiles: HashSet = HashSet::from_iter( + (expected_first_segno..expected_last_segno) + .map(|segno| XLogFileName(PG_TLI, segno, WAL_SEGSIZE)), + ); + let expected_files_num = expected_segfiles.len(); + trace!("expecting {} files", expected_segfiles.len(),); + + // now list s3 and check if it misses something + let ttshid = + TenantShardTimelineId::new(TenantShardId::unsharded(ttid.tenant_id), ttid.timeline_id); + let mut timeline_dir_target = root.timeline_root(&ttshid); + // stream_listing yields only common_prefixes if delimiter is not empty, but + // we need files, so unset it. 
+ timeline_dir_target.delimiter = String::new(); + + let mut stream = std::pin::pin!(stream_listing(s3_client, &timeline_dir_target)); + while let Some(obj) = stream.next().await { + let obj = obj?; + let key = obj.key(); + + let seg_name = key + .strip_prefix(&timeline_dir_target.prefix_in_bucket) + .expect("failed to extract segment name"); + expected_segfiles.remove(seg_name); + } + if !expected_segfiles.is_empty() { + // Before complaining check cplane, probably timeline is already deleted. + let bdata = api_client.find_timeline_branch(ttid.timeline_id).await?; + let deleted = match bdata { + Some(bdata) => bdata.deleted, + None => { + // note: should be careful with selecting proper cplane address + info!("ttid {} not found, assuming it is deleted", ttid); + true + } + }; + if deleted { + // ok, branch is deleted + return Ok(TimelineCheckResult { + ttid, + is_ok: true, + is_deleted: true, + }); + } + error!( + "ttid {}: missing {} files out of {}, timeline_start_lsn {}, wal_backup_lsn {}", + ttid, + expected_segfiles.len(), + expected_files_num, + timeline_start_lsn, + backup_lsn, + ); + return Ok(TimelineCheckResult { + ttid, + is_ok: false, + is_deleted: false, + }); + } + Ok(TimelineCheckResult { + ttid, + is_ok: true, + is_deleted: false, + }) +} diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index abe2718a49..fa83ebdccb 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3734,7 +3734,9 @@ class S3Scrubber: return stdout def scan_metadata(self) -> Any: - stdout = self.scrubber_cli(["scan-metadata", "--json"], timeout=30) + stdout = self.scrubber_cli( + ["scan-metadata", "--node-kind", "pageserver", "--json"], timeout=30 + ) try: return json.loads(stdout) From ea37234cccb6448383bbb7d76e315a7db1af3125 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 5 Apr 2024 14:29:35 +0300 Subject: [PATCH 0656/1571] s3_scrubber: revive garbage collection for safekeepers. - pageserver_id in project details is now is optional, fix it - add active_timeline_count guard/stat similar to active_tenant_count - fix safekeeper prefix - count and log deleted keys --- s3_scrubber/src/cloud_admin_api.rs | 2 +- s3_scrubber/src/garbage.rs | 54 +++++++++++++++++++++++++++++- s3_scrubber/src/lib.rs | 2 +- s3_scrubber/src/metadata_stream.rs | 4 +-- 4 files changed, 57 insertions(+), 5 deletions(-) diff --git a/s3_scrubber/src/cloud_admin_api.rs b/s3_scrubber/src/cloud_admin_api.rs index 45cac23690..66ca2f7180 100644 --- a/s3_scrubber/src/cloud_admin_api.rs +++ b/s3_scrubber/src/cloud_admin_api.rs @@ -137,7 +137,7 @@ pub struct ProjectData { pub region_id: String, pub platform_id: String, pub user_id: String, - pub pageserver_id: u64, + pub pageserver_id: Option, #[serde(deserialize_with = "from_nullable_id")] pub tenant: TenantId, pub safekeepers: Vec, diff --git a/s3_scrubber/src/garbage.rs b/s3_scrubber/src/garbage.rs index 7a08dffc66..de3b16b49b 100644 --- a/s3_scrubber/src/garbage.rs +++ b/s3_scrubber/src/garbage.rs @@ -60,6 +60,7 @@ pub struct GarbageList { /// see garbage, we saw some active tenants too. This protects against classes of bugs /// in the scrubber that might otherwise generate a "deleted all" result. 
active_tenant_count: usize, + active_timeline_count: usize, } impl GarbageList { @@ -67,6 +68,7 @@ impl GarbageList { Self { items: Vec::new(), active_tenant_count: 0, + active_timeline_count: 0, node_kind, bucket_config, } @@ -221,6 +223,7 @@ async fn find_garbage_inner( } else { tracing::debug!("Tenant {tenant_shard_id} is active"); active_tenants.push(tenant_shard_id); + garbage.active_tenant_count = active_tenants.len(); } counter += 1; @@ -271,15 +274,29 @@ async fn find_garbage_inner( std::pin::pin!(timelines_checked.try_buffer_unordered(CONSOLE_CONCURRENCY)); // Update the GarbageList with any timelines which appear not to exist. + let mut active_timelines: Vec = vec![]; while let Some(result) = timelines_checked.next().await { let (ttid, console_result) = result?; if garbage.maybe_append(GarbageEntity::Timeline(ttid), console_result) { tracing::debug!("Timeline {ttid} is garbage"); } else { tracing::debug!("Timeline {ttid} is active"); + active_timelines.push(ttid); + garbage.active_timeline_count = active_timelines.len(); } } + let num_garbage_timelines = garbage + .items + .iter() + .filter(|g| matches!(g.entity, GarbageEntity::Timeline(_))) + .count(); + tracing::info!( + "Found {}/{} garbage timelines in active tenants", + num_garbage_timelines, + active_timelines.len(), + ); + Ok(garbage) } @@ -344,16 +361,22 @@ pub async fn get_timeline_objects( const MAX_KEYS_PER_DELETE: usize = 1000; /// Drain a buffer of keys into DeleteObjects requests +/// +/// If `drain` is true, drains keys completely; otherwise stops when < +/// MAX_KEYS_PER_DELETE keys are left. +/// `num_deleted` returns number of deleted keys. async fn do_delete( s3_client: &Arc, bucket_name: &str, keys: &mut Vec, dry_run: bool, drain: bool, + progress_tracker: &mut DeletionProgressTracker, ) -> anyhow::Result<()> { while (!keys.is_empty() && drain) || (keys.len() >= MAX_KEYS_PER_DELETE) { let request_keys = keys.split_off(keys.len() - (std::cmp::min(MAX_KEYS_PER_DELETE, keys.len()))); + let num_deleted = request_keys.len(); if dry_run { tracing::info!("Dry-run deletion of objects: "); for k in request_keys { @@ -368,12 +391,30 @@ async fn do_delete( .send() .await .context("DeleteObjects request")?; + progress_tracker.register(num_deleted); } } Ok(()) } +/// Simple tracker reporting each 10k deleted keys. 
+#[derive(Default)] +struct DeletionProgressTracker { + num_deleted: usize, + last_reported_num_deleted: usize, +} + +impl DeletionProgressTracker { + fn register(&mut self, n: usize) { + self.num_deleted += n; + if self.num_deleted - self.last_reported_num_deleted > 10000 { + tracing::info!("progress: deleted {} keys", self.num_deleted); + self.last_reported_num_deleted = self.num_deleted; + } + } +} + pub async fn purge_garbage( input_path: String, mode: PurgeMode, @@ -394,6 +435,14 @@ pub async fn purge_garbage( if garbage_list.active_tenant_count == 0 { anyhow::bail!("Refusing to purge a garbage list that reports 0 active tenants"); } + if garbage_list + .items + .iter() + .any(|g| matches!(g.entity, GarbageEntity::Timeline(_))) + && garbage_list.active_timeline_count == 0 + { + anyhow::bail!("Refusing to purge a garbage list containing garbage timelines that reports 0 active timelines"); + } let filtered_items = garbage_list .items @@ -429,6 +478,7 @@ pub async fn purge_garbage( std::pin::pin!(get_objects_results.try_buffer_unordered(S3_CONCURRENCY)); let mut objects_to_delete = Vec::new(); + let mut progress_tracker = DeletionProgressTracker::default(); while let Some(result) = get_objects_results.next().await { let mut object_list = result?; objects_to_delete.append(&mut object_list); @@ -439,6 +489,7 @@ pub async fn purge_garbage( &mut objects_to_delete, dry_run, false, + &mut progress_tracker, ) .await?; } @@ -450,10 +501,11 @@ pub async fn purge_garbage( &mut objects_to_delete, dry_run, true, + &mut progress_tracker, ) .await?; - tracing::info!("Fell through"); + tracing::info!("{} keys deleted in total", progress_tracker.num_deleted); Ok(()) } diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs index 43be258150..78ad9d0da7 100644 --- a/s3_scrubber/src/lib.rs +++ b/s3_scrubber/src/lib.rs @@ -343,7 +343,7 @@ fn init_remote( }), NodeKind::Safekeeper => RootTarget::Safekeeper(S3Target { bucket_name: bucket_config.bucket, - prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or("wal".to_string()), + prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or("wal/".to_string()), delimiter, }), }; diff --git a/s3_scrubber/src/metadata_stream.rs b/s3_scrubber/src/metadata_stream.rs index b192e0be2e..c05874f556 100644 --- a/s3_scrubber/src/metadata_stream.rs +++ b/s3_scrubber/src/metadata_stream.rs @@ -114,7 +114,7 @@ pub async fn stream_tenant_timelines<'a>( let timelines_target = target.timelines_root(&tenant); loop { - tracing::info!("Listing in {}", tenant); + tracing::debug!("Listing in {}", tenant); let fetch_response = list_objects_with_retries(s3_client, &timelines_target, continuation_token.clone()) .await; @@ -151,7 +151,7 @@ pub async fn stream_tenant_timelines<'a>( } } - tracing::info!("Yielding for {}", tenant); + tracing::debug!("Yielding for {}", tenant); Ok(stream! { for i in timeline_ids { let id = i?; From 7434674d86d8064122b9cc80529ca989ec3b0a88 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 5 Apr 2024 20:25:09 +0300 Subject: [PATCH 0657/1571] Decrease CONSOLE_CONCURRENCY. Last run with 128 created too much load on cplane. --- s3_scrubber/src/garbage.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/s3_scrubber/src/garbage.rs b/s3_scrubber/src/garbage.rs index de3b16b49b..91f5fa4334 100644 --- a/s3_scrubber/src/garbage.rs +++ b/s3_scrubber/src/garbage.rs @@ -121,7 +121,10 @@ pub async fn find_garbage( const S3_CONCURRENCY: usize = 32; // How many concurrent API requests to make to the console API. 
-const CONSOLE_CONCURRENCY: usize = 128; +// +// Be careful increasing this; roughly we shouldn't have more than ~100 rps. It +// would be better to implement a real rps limiter. +const CONSOLE_CONCURRENCY: usize = 16; struct ConsoleCache { /// Set of tenants found in the control plane API From 9f792f9c0b9758320848a6aeb7e720af6d3eafdf Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 8 Apr 2024 07:56:04 +0300 Subject: [PATCH 0658/1571] Recheck tenant_id in find_timeline_branch. As it turns out we have at least one case of the same timeline_id in different projects. --- s3_scrubber/src/cloud_admin_api.rs | 36 +++++++++++++-------- s3_scrubber/src/garbage.rs | 2 +- s3_scrubber/src/scan_safekeeper_metadata.rs | 4 ++- 3 files changed, 27 insertions(+), 15 deletions(-) diff --git a/s3_scrubber/src/cloud_admin_api.rs b/s3_scrubber/src/cloud_admin_api.rs index 66ca2f7180..d35dc7e3ca 100644 --- a/s3_scrubber/src/cloud_admin_api.rs +++ b/s3_scrubber/src/cloud_admin_api.rs @@ -155,7 +155,7 @@ pub struct ProjectData { pub maintenance_set: Option, } -#[derive(Debug, serde::Deserialize)] +#[derive(Debug, Clone, serde::Deserialize)] pub struct BranchData { pub id: BranchId, pub created_at: DateTime<Utc>, @@ -327,6 +327,7 @@ impl CloudAdminApiClient { pub async fn find_timeline_branch( &self, + tenant_id: TenantId, timeline_id: TimelineId, ) -> Result<Option<BranchData>, Error> { let _permit = self @@ -359,19 +360,28 @@ impl CloudAdminApiClient { ErrorKind::BodyRead(e), ) })?; - match response.data.len() { - 0 => Ok(None), - 1 => Ok(Some( - response - .data - .into_iter() - .next() - .expect("Should have exactly one element"), - )), - too_many => Err(Error::new( - format!("Find branch for timeline returned {too_many} branches instead of 0 or 1"), + let mut branches: Vec<BranchData> = response.data.into_iter().collect(); + // Normally timeline_id is unique. However, we do have at least one case + // of the same timeline_id in two different projects, apparently after + // manual recovery. So always recheck project_id (discovered through + // tenant_id). + let project_data = match self.find_tenant_project(tenant_id).await? { + Some(pd) => pd, + None => return Ok(None), + }; + branches.retain(|b| b.project_id == project_data.id); + if branches.len() < 2 { + Ok(branches.first().cloned()) + } else { + Err(Error::new( + format!( + "Find branch for timeline {}/{} returned {} branches instead of 0 or 1", + tenant_id, + timeline_id, + branches.len() + ), ErrorKind::UnexpectedState, - )), + )) } } diff --git a/s3_scrubber/src/garbage.rs b/s3_scrubber/src/garbage.rs index 91f5fa4334..ce0ff10ec6 100644 --- a/s3_scrubber/src/garbage.rs +++ b/s3_scrubber/src/garbage.rs @@ -267,7 +267,7 @@ async fn find_garbage_inner( let api_client = cloud_admin_api_client.clone(); async move { api_client - .find_timeline_branch(ttid.timeline_id) + .find_timeline_branch(ttid.tenant_shard_id.tenant_id, ttid.timeline_id) .await .map_err(|e| anyhow::anyhow!(e)) .map(|r| (ttid, r)) diff --git a/s3_scrubber/src/scan_safekeeper_metadata.rs b/s3_scrubber/src/scan_safekeeper_metadata.rs index f56bc165db..73dd49ceb5 100644 --- a/s3_scrubber/src/scan_safekeeper_metadata.rs +++ b/s3_scrubber/src/scan_safekeeper_metadata.rs @@ -195,7 +195,9 @@ async fn check_timeline( } if !expected_segfiles.is_empty() { // Before complaining check cplane, probably timeline is already deleted.
- let bdata = api_client.find_timeline_branch(ttid.timeline_id).await?; + let bdata = api_client + .find_timeline_branch(ttid.tenant_id, ttid.timeline_id) + .await?; let deleted = match bdata { Some(bdata) => bdata.deleted, None => { From 4ac4b2159838f9b98d766d53a5f876fedb94c2e4 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 9 Apr 2024 07:18:26 +0300 Subject: [PATCH 0659/1571] Add retries to cloud_admin client. --- Cargo.lock | 1 + s3_scrubber/Cargo.toml | 1 + s3_scrubber/src/cloud_admin_api.rs | 189 ++++++++++++++++------------- 3 files changed, 108 insertions(+), 83 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f2f06210cf..2b100560dd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5127,6 +5127,7 @@ dependencies = [ "tokio-postgres", "tokio-rustls 0.25.0", "tokio-stream", + "tokio-util", "tracing", "tracing-appender", "tracing-subscriber", diff --git a/s3_scrubber/Cargo.toml b/s3_scrubber/Cargo.toml index 37124e6caf..dd5d453a2b 100644 --- a/s3_scrubber/Cargo.toml +++ b/s3_scrubber/Cargo.toml @@ -27,6 +27,7 @@ postgres-native-tls.workspace = true postgres_ffi.workspace = true tokio-stream.workspace = true tokio-postgres.workspace = true +tokio-util = { workspace = true } futures-util.workspace = true itertools.workspace = true camino.workspace = true diff --git a/s3_scrubber/src/cloud_admin_api.rs b/s3_scrubber/src/cloud_admin_api.rs index d35dc7e3ca..70b108cf23 100644 --- a/s3_scrubber/src/cloud_admin_api.rs +++ b/s3_scrubber/src/cloud_admin_api.rs @@ -1,11 +1,13 @@ -use std::time::Duration; - use chrono::{DateTime, Utc}; +use futures::Future; use hex::FromHex; + use reqwest::{header, Client, StatusCode, Url}; use serde::Deserialize; use tokio::sync::Semaphore; +use tokio_util::sync::CancellationToken; +use utils::backoff; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; @@ -210,30 +212,39 @@ impl CloudAdminApiClient { .await .expect("Semaphore is not closed"); - let response = self - .http_client - .get(self.append_url("/projects")) - .query(&[ - ("tenant_id", tenant_id.to_string()), - ("show_deleted", "true".to_string()), - ]) - .header(header::ACCEPT, "application/json") - .bearer_auth(&self.token) - .send() - .await - .map_err(|e| { - Error::new( - "Find project for tenant".to_string(), - ErrorKind::RequestSend(e), - ) - })?; + let response = CloudAdminApiClient::with_retries( + || async { + let response = self + .http_client + .get(self.append_url("/projects")) + .query(&[ + ("tenant_id", tenant_id.to_string()), + ("show_deleted", "true".to_string()), + ]) + .header(header::ACCEPT, "application/json") + .bearer_auth(&self.token) + .send() + .await + .map_err(|e| { + Error::new( + "Find project for tenant".to_string(), + ErrorKind::RequestSend(e), + ) + })?; + + let response: AdminApiResponse> = + response.json().await.map_err(|e| { + Error::new( + "Find project for tenant".to_string(), + ErrorKind::BodyRead(e), + ) + })?; + Ok(response) + }, + "find_tenant_project", + ) + .await?; - let response: AdminApiResponse> = response.json().await.map_err(|e| { - Error::new( - "Find project for tenant".to_string(), - ErrorKind::BodyRead(e), - ) - })?; match response.data.len() { 0 => Ok(None), 1 => Ok(Some( @@ -261,42 +272,34 @@ impl CloudAdminApiClient { const PAGINATION_LIMIT: usize = 512; let mut result: Vec = Vec::with_capacity(PAGINATION_LIMIT); loop { - let response = self - .http_client - .get(self.append_url("/projects")) - .query(&[ - ("show_deleted", "false".to_string()), - ("limit", format!("{PAGINATION_LIMIT}")), - ("offset", 
format!("{pagination_offset}")), - ]) - .header(header::ACCEPT, "application/json") - .bearer_auth(&self.token) - .send() - .await - .map_err(|e| { - Error::new( - "List active projects".to_string(), - ErrorKind::RequestSend(e), - ) - })?; + let response_bytes = CloudAdminApiClient::with_retries( + || async { + let response = self + .http_client + .get(self.append_url("/projects")) + .query(&[ + ("show_deleted", "false".to_string()), + ("limit", format!("{PAGINATION_LIMIT}")), + ("offset", format!("{pagination_offset}")), + ]) + .header(header::ACCEPT, "application/json") + .bearer_auth(&self.token) + .send() + .await + .map_err(|e| { + Error::new( + "List active projects".to_string(), + ErrorKind::RequestSend(e), + ) + })?; - match response.status() { - StatusCode::OK => {} - StatusCode::SERVICE_UNAVAILABLE | StatusCode::TOO_MANY_REQUESTS => { - tokio::time::sleep(Duration::from_millis(500)).await; - continue; - } - _status => { - return Err(Error::new( - "List active projects".to_string(), - ErrorKind::ResponseStatus(response.status()), - )) - } - } - - let response_bytes = response.bytes().await.map_err(|e| { - Error::new("List active projects".to_string(), ErrorKind::BodyRead(e)) - })?; + response.bytes().await.map_err(|e| { + Error::new("List active projects".to_string(), ErrorKind::BodyRead(e)) + }) + }, + "list_projects", + ) + .await?; let decode_result = serde_json::from_slice::>>(&response_bytes); @@ -336,30 +339,39 @@ impl CloudAdminApiClient { .await .expect("Semaphore is not closed"); - let response = self - .http_client - .get(self.append_url("/branches")) - .query(&[ - ("timeline_id", timeline_id.to_string()), - ("show_deleted", "true".to_string()), - ]) - .header(header::ACCEPT, "application/json") - .bearer_auth(&self.token) - .send() - .await - .map_err(|e| { - Error::new( - "Find branch for timeline".to_string(), - ErrorKind::RequestSend(e), - ) - })?; + let response = CloudAdminApiClient::with_retries( + || async { + let response = self + .http_client + .get(self.append_url("/branches")) + .query(&[ + ("timeline_id", timeline_id.to_string()), + ("show_deleted", "true".to_string()), + ]) + .header(header::ACCEPT, "application/json") + .bearer_auth(&self.token) + .send() + .await + .map_err(|e| { + Error::new( + "Find branch for timeline".to_string(), + ErrorKind::RequestSend(e), + ) + })?; + + let response: AdminApiResponse> = + response.json().await.map_err(|e| { + Error::new( + "Find branch for timeline".to_string(), + ErrorKind::BodyRead(e), + ) + })?; + Ok(response) + }, + "find_timeline_branch", + ) + .await?; - let response: AdminApiResponse> = response.json().await.map_err(|e| { - Error::new( - "Find branch for timeline".to_string(), - ErrorKind::BodyRead(e), - ) - })?; let mut branches: Vec = response.data.into_iter().collect(); // Normally timeline_id is unique. 
However, we do have at least one case // of the same timeline_id in two different projects, apparently after @@ -542,4 +554,15 @@ impl CloudAdminApiClient { .parse() .unwrap_or_else(|e| panic!("Could not append {subpath} to base url: {e}")) } + + async fn with_retries(op: O, description: &str) -> Result + where + O: FnMut() -> F, + F: Future>, + { + let cancel = CancellationToken::new(); // not really used + backoff::retry(op, |_| false, 1, 20, description, &cancel) + .await + .expect("cancellations are disabled") + } } From 3a2f10712ad557c978f966579c9bfa89ad6f4bae Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 10 Apr 2024 22:52:57 +0300 Subject: [PATCH 0660/1571] Add more context to s3 listing error. --- s3_scrubber/src/lib.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs index 78ad9d0da7..e976e66748 100644 --- a/s3_scrubber/src/lib.rs +++ b/s3_scrubber/src/lib.rs @@ -368,7 +368,10 @@ async fn list_objects_with_retries( { Ok(response) => return Ok(response), Err(e) => { - error!("list_objects_v2 query failed: {e}"); + error!( + "list_objects_v2 query failed: {e}, bucket_name={}, prefix={}, delimiter={}", + s3_target.bucket_name, s3_target.prefix_in_bucket, s3_target.delimiter + ); tokio::time::sleep(Duration::from_secs(1)).await; } } From a74b60066c7e0d4679d0d2ae7cfce6cd2f488e6e Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 30 Apr 2024 16:21:54 +0100 Subject: [PATCH 0661/1571] storage controller: test for large shard counts (#7475) ## Problem Storage controller was observed to have unexpectedly large memory consumption when loaded with many thousands of shards. This was recently fixed: - https://github.com/neondatabase/neon/pull/7493 ...but we need a general test that the controller is well behaved with thousands of shards. Closes: https://github.com/neondatabase/neon/issues/7460 Closes: https://github.com/neondatabase/neon/issues/7463 ## Summary of changes - Add test test_storage_controller_many_tenants to exercise the system's behaviour with a more substantial workload. This test measures memory consumption and reproduces #7460 before the other changes in this PR. - Tweak reconcile_all's return value to make it nonzero if it spawns no reconcilers, but _would_ have spawned some reconcilers if they weren't blocked by the reconcile concurrency limit. This makes the test's reconcile_until_idle behave as expected (i.e. not complete until the system is nice and calm). - Fix an issue where tenant migrations would leave a spurious secondary location when migrated to some location that was not already their secondary (this was an existing low-impact bug that tripped up the test's consistency checks). On the test with 8000 shards, the resident memory per shard is about 20KiB. This is not really per-shard memory: the primary source of memory growth is the number of concurrent network/db clients we create. With 8000 shards, the test takes 125s to run on my workstation. 
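As an illustration of the new return-value contract (this sketch is not part of the patch; the helper name, timeout and backoff constants are invented, and `reconcile_all` is passed in as a closure standing in for `Service::reconcile_all`), a caller that wants "fully reconciled" must poll until the count reaches zero, because a nonzero result may now mean "work is queued behind the concurrency limit" rather than "reconcilers were spawned":

```
use std::time::{Duration, Instant};

// Poll until reconcile_all() reports 0, i.e. nothing was spawned *and*
// nothing is waiting behind the reconciler concurrency limit.
async fn wait_until_idle(
    mut reconcile_all: impl FnMut() -> usize,
    timeout: Duration,
) -> anyhow::Result<()> {
    let started = Instant::now();
    let mut delay = Duration::from_millis(500);
    loop {
        if reconcile_all() == 0 {
            return Ok(()); // fully reconciled
        }
        if started.elapsed() > timeout {
            anyhow::bail!("timed out waiting for reconciles to settle");
        }
        // Back off between polls so delayed reconcilers get a chance to drain,
        // mirroring the Python reconcile_until_idle helper changed below.
        tokio::time::sleep(delay).await;
        delay = (delay * 2).min(Duration::from_secs(5));
    }
}
```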
--- Cargo.lock | 1 + control_plane/Cargo.toml | 1 + control_plane/src/bin/neon_local.rs | 4 +- control_plane/src/local_env.rs | 28 +++ control_plane/src/storage_controller.rs | 14 +- storage_controller/src/service.rs | 18 +- storage_controller/src/tenant_shard.rs | 4 +- test_runner/fixtures/compute_reconfigure.py | 11 + test_runner/fixtures/neon_fixtures.py | 34 ++- .../test_storage_controller_scale.py | 198 ++++++++++++++++++ 10 files changed, 292 insertions(+), 21 deletions(-) create mode 100644 test_runner/performance/test_storage_controller_scale.py diff --git a/Cargo.lock b/Cargo.lock index 2b100560dd..e4bf71c64f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1319,6 +1319,7 @@ dependencies = [ "git-version", "hex", "humantime", + "humantime-serde", "hyper 0.14.26", "nix 0.27.1", "once_cell", diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index b544a8c587..2ce041068e 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -17,6 +17,7 @@ nix.workspace = true once_cell.workspace = true postgres.workspace = true hex.workspace = true +humantime-serde.workspace = true hyper.workspace = true regex.workspace = true reqwest = { workspace = true, features = ["blocking", "json"] } diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 1a9e9a1e6a..bdd64c8687 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -1554,8 +1554,8 @@ fn cli() -> Command { Command::new("storage_controller") .arg_required_else_help(true) .about("Manage storage_controller") - .subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone())) - .subcommand(Command::new("stop").about("Stop local pageserver") + .subcommand(Command::new("start").about("Start storage controller")) + .subcommand(Command::new("stop").about("Stop storage controller") .arg(stop_mode_arg.clone())) ) .subcommand( diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 8cbda528a7..59b587389c 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -17,6 +17,7 @@ use std::net::Ipv4Addr; use std::net::SocketAddr; use std::path::{Path, PathBuf}; use std::process::{Command, Stdio}; +use std::time::Duration; use utils::{ auth::{encode_from_key_file, Claims}, id::{NodeId, TenantId, TenantTimelineId, TimelineId}, @@ -66,6 +67,10 @@ pub struct LocalEnv { pub broker: NeonBroker, + // Configuration for the storage controller (1 per neon_local environment) + #[serde(default)] + pub storage_controller: NeonStorageControllerConf, + /// This Vec must always contain at least one pageserver pub pageservers: Vec, @@ -98,6 +103,29 @@ pub struct NeonBroker { pub listen_addr: SocketAddr, } +/// Broker config for cluster internal communication. +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] +#[serde(default)] +pub struct NeonStorageControllerConf { + /// Heartbeat timeout before marking a node offline + #[serde(with = "humantime_serde")] + pub max_unavailable: Duration, +} + +impl NeonStorageControllerConf { + // Use a shorter pageserver unavailability interval than the default to speed up tests. + const DEFAULT_MAX_UNAVAILABLE_INTERVAL: std::time::Duration = + std::time::Duration::from_secs(10); +} + +impl Default for NeonStorageControllerConf { + fn default() -> Self { + Self { + max_unavailable: Self::DEFAULT_MAX_UNAVAILABLE_INTERVAL, + } + } +} + // Dummy Default impl to satisfy Deserialize derive. 
impl Default for NeonBroker { fn default() -> Self { diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index dbb4475ae8..b919b14758 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -1,4 +1,7 @@ -use crate::{background_process, local_env::LocalEnv}; +use crate::{ + background_process, + local_env::{LocalEnv, NeonStorageControllerConf}, +}; use camino::{Utf8Path, Utf8PathBuf}; use hyper::Method; use pageserver_api::{ @@ -32,15 +35,13 @@ pub struct StorageController { public_key: Option, postgres_port: u16, client: reqwest::Client, + config: NeonStorageControllerConf, } const COMMAND: &str = "storage_controller"; const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16; -// Use a shorter pageserver unavailability interval than the default to speed up tests. -const NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10); - #[derive(Serialize, Deserialize)] pub struct AttachHookRequest { pub tenant_shard_id: TenantShardId, @@ -135,6 +136,7 @@ impl StorageController { client: reqwest::ClientBuilder::new() .build() .expect("Failed to construct http client"), + config: env.storage_controller.clone(), } } @@ -272,8 +274,6 @@ impl StorageController { // Run migrations on every startup, in case something changed. let database_url = self.setup_database().await?; - let max_unavailable: humantime::Duration = NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL.into(); - let mut args = vec![ "-l", &self.listen, @@ -283,7 +283,7 @@ impl StorageController { "--database-url", &database_url, "--max-unavailable-interval", - &max_unavailable.to_string(), + &humantime::Duration::from(self.config.max_unavailable).to_string(), ] .into_iter() .map(|s| s.to_string()) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 882562d99f..186a820adf 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -90,7 +90,11 @@ const INITIAL_GENERATION: Generation = Generation::new(0); /// up on unresponsive pageservers and proceed. pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); -pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30); +/// How long a node may be unresponsive to heartbeats before we declare it offline. +/// This must be long enough to cover node restarts as well as normal operations: in future +/// it should be separated into distinct timeouts for startup vs. normal operation +/// (``) +pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(300); pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128; @@ -4251,7 +4255,9 @@ impl Service { /// Check all tenants for pending reconciliation work, and reconcile those in need. /// Additionally, reschedule tenants that require it. /// - /// Returns how many reconciliation tasks were started + /// Returns how many reconciliation tasks were started, or `1` if no reconciles were + /// spawned but some _would_ have been spawned if `reconciler_concurrency` units where + /// available. A return value of 0 indicates that everything is fully reconciled already. 
fn reconcile_all(&self) -> usize { let mut locked = self.inner.write().unwrap(); let (nodes, tenants, _scheduler) = locked.parts_mut(); @@ -4266,7 +4272,11 @@ impl Service { } // Skip checking if this shard is already enqueued for reconciliation - if shard.delayed_reconcile { + if shard.delayed_reconcile && self.reconciler_concurrency.available_permits() == 0 { + // If there is something delayed, then return a nonzero count so that + // callers like reconcile_all_now do not incorrectly get the impression + // that the system is in a quiescent state. + reconciles_spawned = std::cmp::max(1, reconciles_spawned); continue; } @@ -4451,7 +4461,7 @@ impl Service { waiter_count ); - Ok(waiter_count) + Ok(std::cmp::max(waiter_count, reconciles_spawned)) } pub async fn shutdown(&self) { diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 7b11dfe64d..ffbf2c4b7a 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -952,8 +952,8 @@ impl TenantShard { /// Create a waiter that will wait for some future Reconciler that hasn't been spawned yet. /// - /// This is appropriate when you can't spawn a recociler (e.g. due to resource limits), but - /// you would like to wait until one gets spawned in the background. + /// This is appropriate when you can't spawn a reconciler (e.g. due to resource limits), but + /// you would like to wait on the next reconciler that gets spawned in the background. pub(crate) fn future_reconcile_waiter(&mut self) -> ReconcilerWaiter { self.ensure_sequence_ahead(); diff --git a/test_runner/fixtures/compute_reconfigure.py b/test_runner/fixtures/compute_reconfigure.py index 9dd66fe636..a883d94f73 100644 --- a/test_runner/fixtures/compute_reconfigure.py +++ b/test_runner/fixtures/compute_reconfigure.py @@ -14,10 +14,18 @@ class ComputeReconfigure: self.server = server self.control_plane_compute_hook_api = f"http://{server.host}:{server.port}/notify-attach" self.workloads = {} + self.on_notify = None def register_workload(self, workload): self.workloads[workload.tenant_id] = workload + def register_on_notify(self, fn): + """ + Add some extra work during a notification, like sleeping to slow things down, or + logging what was notified. 
+ """ + self.on_notify = fn + @pytest.fixture(scope="function") def compute_reconfigure_listener(make_httpserver): @@ -43,6 +51,9 @@ def compute_reconfigure_listener(make_httpserver): body: dict[str, Any] = request.json log.info(f"notify-attach request: {body}") + if self.on_notify is not None: + self.on_notify(body) + try: workload = self.workloads[TenantId(body["tenant_id"])] except KeyError: diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index fa83ebdccb..fbd1e22aa9 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -499,6 +499,7 @@ class NeonEnvBuilder: self.config_init_force: Optional[str] = None self.top_output_dir = top_output_dir self.control_plane_compute_hook_api: Optional[str] = None + self.storage_controller_config: Optional[dict[Any, Any]] = None self.pageserver_virtual_file_io_engine: Optional[str] = pageserver_virtual_file_io_engine @@ -1021,6 +1022,7 @@ class NeonEnv: self.pg_distrib_dir = config.pg_distrib_dir self.endpoint_counter = 0 self.pageserver_config_override = config.pageserver_config_override + self.storage_controller_config = config.storage_controller_config # generate initial tenant ID here instead of letting 'neon init' generate it, # so that we don't need to dig it out of the config file afterwards. @@ -1066,6 +1068,9 @@ class NeonEnv: if self.control_plane_compute_hook_api is not None: cfg["control_plane_compute_hook_api"] = self.control_plane_compute_hook_api + if self.storage_controller_config is not None: + cfg["storage_controller"] = self.storage_controller_config + # Create config for pageserver http_auth_type = "NeonJWT" if config.auth_enabled else "Trust" pg_auth_type = "NeonJWT" if config.auth_enabled else "Trust" @@ -1134,12 +1139,9 @@ class NeonEnv: # bounce through retries on startup self.storage_controller.start() - def storage_controller_ready(): - assert self.storage_controller.ready() is True - # Wait for storage controller readiness to prevent unnecessary post start-up # reconcile. 
- wait_until(30, 1, storage_controller_ready) + self.storage_controller.wait_until_ready() # Start up broker, pageserver and all safekeepers futs = [] @@ -2043,6 +2045,15 @@ class NeonStorageController(MetricsGetter): else: raise RuntimeError(f"Unexpected status {status} from readiness endpoint") + def wait_until_ready(self): + t1 = time.time() + + def storage_controller_ready(): + assert self.ready() is True + + wait_until(30, 1, storage_controller_ready) + return time.time() - t1 + def attach_hook_issue( self, tenant_shard_id: Union[TenantId, TenantShardId], pageserver_id: int ) -> int: @@ -2130,7 +2141,7 @@ class NeonStorageController(MetricsGetter): shard_count: Optional[int] = None, shard_stripe_size: Optional[int] = None, tenant_config: Optional[Dict[Any, Any]] = None, - placement_policy: Optional[str] = None, + placement_policy: Optional[Union[Dict[Any, Any] | str]] = None, ): """ Use this rather than pageserver_api() when you need to include shard parameters @@ -2240,10 +2251,21 @@ class NeonStorageController(MetricsGetter): def reconcile_until_idle(self, timeout_secs=30): start_at = time.time() n = 1 + delay_sec = 0.5 + delay_max = 5 while n > 0: n = self.reconcile_all() - if time.time() - start_at > timeout_secs: + if n == 0: + break + elif time.time() - start_at > timeout_secs: raise RuntimeError("Timeout in reconcile_until_idle") + else: + # Don't call again right away: if we're waiting for many reconciles that + # are blocked on the concurrency limit, it slows things down to call + # reconcile_all frequently. + time.sleep(delay_sec) + delay_sec *= 2 + delay_sec = min(delay_sec, delay_max) def consistency_check(self): """ diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py new file mode 100644 index 0000000000..17dc96dabe --- /dev/null +++ b/test_runner/performance/test_storage_controller_scale.py @@ -0,0 +1,198 @@ +import concurrent.futures +import random +import time + +import pytest +from fixtures.compute_reconfigure import ComputeReconfigure +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnvBuilder, +) +from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pg_version import PgVersion +from fixtures.types import TenantId, TenantShardId, TimelineId + + +@pytest.mark.timeout(3600) # super long running test: should go down as we optimize +def test_storage_controller_many_tenants( + neon_env_builder: NeonEnvBuilder, compute_reconfigure_listener: ComputeReconfigure +): + """ + Check that we cope well with a not-totally-trivial number of tenants. + + This is checking for: + - Obvious concurrency bugs from issuing many tenant creations/modifications + concurrently. + - Obvious scaling bugs like O(N^2) scaling that would be so slow that even + a basic test starts failing from slowness. + + This is _not_ a comprehensive scale test: just a basic sanity check that + we don't fall over for a thousand shards. + """ + + neon_env_builder.num_pageservers = 5 + neon_env_builder.storage_controller_config = { + # Default neon_local uses a small timeout: use a longer one to tolerate longer pageserver restarts. + # TODO: tune this down as restarts get faster (https://github.com/neondatabase/neon/pull/7553), to + # guard against regressions in restart time. 
+ "max_unavailable": "300s" + } + neon_env_builder.control_plane_compute_hook_api = ( + compute_reconfigure_listener.control_plane_compute_hook_api + ) + + # A small sleep on each call into the notify hook, to simulate the latency of doing a database write + compute_reconfigure_listener.register_on_notify(lambda body: time.sleep(0.01)) + + env = neon_env_builder.init_start() + + # We will intentionally stress reconciler concurrrency, which triggers a warning when lots + # of shards are hitting the delayed path. + env.storage_controller.allowed_errors.append(".*Many shards are waiting to reconcile") + + for ps in env.pageservers: + # This can happen because when we do a loop over all pageservers and mark them offline/active, + # reconcilers might get cancelled, and the next reconcile can follow a not-so-elegant path of + # bumping generation before other attachments are detached. + # + # We could clean this up by making reconcilers respect the .observed of their predecessor, if + # we spawn with a wait for the predecessor. + ps.allowed_errors.append(".*Dropped remote consistent LSN updates.*") + + # Storage controller is allowed to drop pageserver requests when the cancellation token + # for a Reconciler fires. + ps.allowed_errors.append(".*request was dropped before completing.*") + + # Total tenants + tenant_count = 4000 + + # Shards per tenant + shard_count = 2 + stripe_size = 1024 + + tenants = set(TenantId.generate() for _i in range(0, tenant_count)) + + virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) + + def check_memory(): + # Shards should be cheap_ in memory, as we will have very many of them + expect_memory_per_shard = 128 * 1024 + + rss = env.storage_controller.get_metric_value("process_resident_memory_bytes") + assert rss is not None + log.info(f"Resident memory: {rss} ({ rss / (shard_count * tenant_count)} per shard)") + assert rss < expect_memory_per_shard * shard_count * tenant_count + + # We use a fixed seed to make the test somewhat reproducible: we want a randomly + # chosen order in the sense that it's arbitrary, but not in the sense that it should change every run. + rng = random.Random(1234) + + # Issue more concurrent operations than the storage controller's reconciler concurrency semaphore + # permits, to ensure that we are exercising stressing that. + api_concurrency = 135 + + # We will create tenants directly via API, not via neon_local, to avoid any false + # serialization of operations in neon_local (it e.g. 
loads/saves a config file on each call) + with concurrent.futures.ThreadPoolExecutor(max_workers=api_concurrency) as executor: + futs = [] + t1 = time.time() + for tenant_id in tenants: + f = executor.submit( + env.storage_controller.tenant_create, + tenant_id, + shard_count, + stripe_size, + placement_policy={"Attached": 1}, + ) + futs.append(f) + + # Wait for creations to finish + for f in futs: + f.result() + log.info( + f"Created {len(tenants)} tenants in {time.time() - t1}, {len(tenants) / (time.time() - t1)}/s" + ) + + run_ops = api_concurrency * 4 + assert run_ops < len(tenants) + op_tenants = list(tenants)[0:run_ops] + + # Generate a mixture of operations and dispatch them all concurrently + futs = [] + for tenant_id in op_tenants: + op = rng.choice([0, 1, 2]) + if op == 0: + # A fan-out write operation to all shards in a tenant (timeline creation) + f = executor.submit( + virtual_ps_http.timeline_create, + PgVersion.NOT_SET, + tenant_id, + TimelineId.generate(), + ) + elif op == 1: + # A reconciler operation: migrate a shard. + shard_number = rng.randint(0, shard_count - 1) + tenant_shard_id = TenantShardId(tenant_id, shard_number, shard_count) + dest_ps_id = rng.choice([ps.id for ps in env.pageservers]) + f = executor.submit( + env.storage_controller.tenant_shard_migrate, tenant_shard_id, dest_ps_id + ) + elif op == 2: + # A passthrough read to shard zero + f = executor.submit(virtual_ps_http.tenant_status, tenant_id) + + futs.append(f) + + # Wait for mixed ops to finish + for f in futs: + f.result() + + # Consistency check is safe here: all the previous operations waited for reconcile before completing + env.storage_controller.consistency_check() + check_memory() + + # This loop waits for reconcile_all to indicate no pending work, and then calls it once more to time + # how long the call takes when idle: this iterates over shards while doing no I/O and should be reliably fast: if + # it isn't, that's a sign that we have made some algorithmic mistake (e.g. O(N**2) scheduling) + # + # We do not require that the system is quiescent already here, although at present in this point in the test + # that may be the case. + while True: + t1 = time.time() + reconcilers = env.storage_controller.reconcile_all() + if reconcilers == 0: + # Time how long a no-op background reconcile takes: this measures how long it takes to + # loop over all the shards looking for work to do. + runtime = time.time() - t1 + log.info(f"No-op call to reconcile_all took {runtime}s") + assert runtime < 1 + break + + # Restart the storage controller + env.storage_controller.stop() + env.storage_controller.start() + + # See how long the controller takes to pass its readiness check. This should be fast because + # all the nodes are online: offline pageservers are the only thing that's allowed to delay + # startup. + readiness_period = env.storage_controller.wait_until_ready() + assert readiness_period < 5 + + # Consistency check is safe here: the storage controller's restart should not have caused any reconcilers + # to run, as it was in a stable state before restart. If it did, that's a bug. + env.storage_controller.consistency_check() + check_memory() + + # Restart pageservers: this exercises the /re-attach API + for pageserver in env.pageservers: + pageserver.stop() + pageserver.start() + + # Consistency check is safe here: restarting pageservers should not have caused any Reconcilers to spawn, + # as they were not offline long enough to trigger any scheduling changes. 
+ env.storage_controller.consistency_check() + check_memory() + + # Stop the storage controller before tearing down fixtures, because it otherwise might log + # errors trying to call our `ComputeReconfigure`. + env.storage_controller.stop() From e018cac1f714626b1dca7eeab8eab0951cbfaed2 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 30 Apr 2024 18:00:24 +0100 Subject: [PATCH 0662/1571] tests: tweak log allow list in test_sharding_split_failures (#7549) ## Problem This test became flaky recently with failures like: ``` AssertionError: Log errors on storage_controller: (129, '2024-04-29T16:41:03.591506Z ERROR request{method=PUT path=/control/v1/tenant/b38c0447fbdbcf4e1c023f00b0f7c221/shard_split request_id=34df4975-2ef3-4ed8-b167-2956650e365c}: Error processing HTTP request: InternalServerError(Reconcile error on shard b38c0447fbdbcf4e1c023f00b0f7c221-0002: Cancelled\n') ``` Likely due to #7508 changing how errors are reported from Reconcilers. ## Summary of changes - Tolerate `Reconcile error.*Cancelled` log errors --- test_runner/regress/test_sharding.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 101d2620b0..bae5945338 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -928,6 +928,8 @@ def test_sharding_split_failures( ".*Reconcile error: receive body: error sending request for url.*", # Node offline cases will fail inside reconciler when detaching secondaries ".*Reconcile error on shard.*: receive body: error sending request for url.*", + # Node offline cases may eventually cancel reconcilers when the heartbeater realizes nodes are offline + ".*Reconcile error.*Cancelled.*", # While parent shard's client is stopped during split, flush loop updating LSNs will emit this warning ".*Failed to schedule metadata upload after updating disk_consistent_lsn.*", ] From fcbe60f43691b05d064b4b781e01e50ffb985d55 Mon Sep 17 00:00:00 2001 From: Andrew Rudenko Date: Tue, 30 Apr 2024 19:44:02 +0200 Subject: [PATCH 0663/1571] Makefile: DISABLE_HOMEBREW variable (#7556) ## Problem The current Makefile assumes that homebrew is used on macos. There are other ways to install dependencies on MacOS (nix, macports, "manually"). It would be great to allow the one who wants to use other options to disable homebrew integration. ## Summary of changes It adds DISABLE_HOMEBREW variable that if set skips extra homebrew-specific configuration steps. 
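As a usage note (hypothetical invocation, not part of the patch): a nix or MacPorts user would build with something like `DISABLE_HOMEBREW=1 make`, supplying any OpenSSL/ICU paths their toolchain needs themselves; any non-empty value works, since the Makefile's `ifndef` check only tests whether the variable carries a value.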
--- Makefile | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/Makefile b/Makefile index f13f080f1a..5e2b3c4367 100644 --- a/Makefile +++ b/Makefile @@ -25,14 +25,16 @@ ifeq ($(UNAME_S),Linux) # Seccomp BPF is only available for Linux PG_CONFIGURE_OPTS += --with-libseccomp else ifeq ($(UNAME_S),Darwin) - # macOS with brew-installed openssl requires explicit paths - # It can be configured with OPENSSL_PREFIX variable - OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3) - PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib - PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig - # macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure - # brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage - EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/: + ifndef DISABLE_HOMEBREW + # macOS with brew-installed openssl requires explicit paths + # It can be configured with OPENSSL_PREFIX variable + OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3) + PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib + PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig + # macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure + # brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage + EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/: + endif endif # Use -C option so that when PostgreSQL "make install" installs the From 50a45e67dc295f01c32a397a1951205666406b3f Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Tue, 30 Apr 2024 21:50:03 +0300 Subject: [PATCH 0664/1571] Discover safekeepers via broker request (#7279) We had an incident where pageserver requests timed out because pageserver couldn't fetch WAL from safekeepers. This incident was caused by a bug in safekeeper logic for timeline activation, which prevented pageserver from finding safekeepers. This bug was since fixed, but there is still a chance of a similar bug in the future due to overall complexity. We add a new broker message to "signal interest" for timeline. This signal will be sent by pageservers `wait_lsn`, and safekeepers will receive this signal to start broadcasting broker messages. Then every broker subscriber will be able to find the safekeepers and connect to them (to start fetching WAL). This feature is not limited to pageservers and any service that wants to download WAL from safekeepers will be able to use this discovery request. This commit changes pageserver's connection_manager (walreceiver) to send a SafekeeperDiscoveryRequest when there is no information about safekeepers present in memory. Current implementation will send these requests only if there is an active wait_lsn() call and no more often than once per 10 seconds. Add `test_broker_discovery` to test this: safekeepers started with `--disable-periodic-broker-push` will not push info to broker so that pageserver must use a discovery to start fetching WAL. Add task_stats in safekeepers broker module to log a warning if there is no message received from the broker for the last 10 seconds. 
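The send conditions described above reduce to a small predicate. The sketch below is a hypothetical distillation (the function and parameter names are invented, and the 10-second interval actually comes from the pageserver's `lagging_wal_timeout` setting in this patch):

```
use std::time::{Duration, Instant};

// When would the connection manager publish a SafekeeperDiscoveryRequest?
// Only when it has no WAL connection, knows of no safekeeper candidates,
// someone is blocked in wait_lsn(), and it hasn't asked too recently.
fn should_send_discovery(
    has_wal_connection: bool,
    candidate_count: usize,
    wait_lsn_active: bool,
    last_sent: Option<Instant>,
    min_interval: Duration, // e.g. 10s
) -> bool {
    !has_wal_connection
        && candidate_count == 0
        && wait_lsn_active
        && last_sent.map_or(true, |t| t.elapsed() >= min_interval)
}
```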
Closes #5471 --------- Co-authored-by: Christian Schwarz --- libs/utils/src/seqwait.rs | 126 +++++++++---- pageserver/src/tenant/timeline.rs | 6 + .../walreceiver/connection_manager.rs | 164 ++++++++++++++--- safekeeper/src/bin/safekeeper.rs | 5 + safekeeper/src/broker.rs | 166 +++++++++++++++++- safekeeper/src/lib.rs | 2 + .../tests/walproposer_sim/safekeeper.rs | 1 + storage_broker/src/bin/storage_broker.rs | 12 +- test_runner/regress/test_wal_acceptor.py | 48 ++++- 9 files changed, 464 insertions(+), 66 deletions(-) diff --git a/libs/utils/src/seqwait.rs b/libs/utils/src/seqwait.rs index 0544c5be03..375b227b99 100644 --- a/libs/utils/src/seqwait.rs +++ b/libs/utils/src/seqwait.rs @@ -2,11 +2,10 @@ use std::cmp::{Eq, Ordering}; use std::collections::BinaryHeap; -use std::fmt::Debug; use std::mem; use std::sync::Mutex; use std::time::Duration; -use tokio::sync::watch::{channel, Receiver, Sender}; +use tokio::sync::watch::{self, channel}; use tokio::time::timeout; /// An error happened while waiting for a number @@ -35,23 +34,73 @@ pub trait MonotonicCounter { fn cnt_value(&self) -> V; } -/// Internal components of a `SeqWait` -struct SeqWaitInt +/// Heap of waiters, lowest numbers pop first. +struct Waiters where - S: MonotonicCounter, V: Ord, { - waiters: BinaryHeap>, - current: S, - shutdown: bool, + heap: BinaryHeap>, + /// Number of the first waiter in the heap, or None if there are no waiters. + status_channel: watch::Sender>, +} + +impl Waiters +where + V: Ord + Copy, +{ + fn new() -> Self { + Waiters { + heap: BinaryHeap::new(), + status_channel: channel(None).0, + } + } + + /// `status_channel` contains the number of the first waiter in the heap. + /// This function should be called whenever waiters heap changes. + fn update_status(&self) { + let first_waiter = self.heap.peek().map(|w| w.wake_num); + let _ = self.status_channel.send_replace(first_waiter); + } + + /// Add new waiter to the heap, return a channel that will be notified when the number arrives. + fn add(&mut self, num: V) -> watch::Receiver<()> { + let (tx, rx) = channel(()); + self.heap.push(Waiter { + wake_num: num, + wake_channel: tx, + }); + self.update_status(); + rx + } + + /// Pop all waiters <= num from the heap. Collect channels in a vector, + /// so that caller can wake them up. + fn pop_leq(&mut self, num: V) -> Vec> { + let mut wake_these = Vec::new(); + while let Some(n) = self.heap.peek() { + if n.wake_num > num { + break; + } + wake_these.push(self.heap.pop().unwrap().wake_channel); + } + self.update_status(); + wake_these + } + + /// Used on shutdown to efficiently drop all waiters. + fn take_all(&mut self) -> BinaryHeap> { + let heap = mem::take(&mut self.heap); + self.update_status(); + heap + } } struct Waiter where T: Ord, { - wake_num: T, // wake me when this number arrives ... - wake_channel: Sender<()>, // ... by sending a message to this channel + wake_num: T, // wake me when this number arrives ... + wake_channel: watch::Sender<()>, // ... by sending a message to this channel } // BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here @@ -76,6 +125,17 @@ impl PartialEq for Waiter { impl Eq for Waiter {} +/// Internal components of a `SeqWait` +struct SeqWaitInt +where + S: MonotonicCounter, + V: Ord, +{ + waiters: Waiters, + current: S, + shutdown: bool, +} + /// A tool for waiting on a sequence number /// /// This provides a way to wait the arrival of a number. 
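To show how the new waiter-status channel is meant to be consumed, here is a hypothetical consumer (not part of the patch; `log_waiter_status` and the `u64` sequence type are invented for the example) watching the receiver returned by `SeqWait::status_receiver()`, the same mechanism the pageserver's `subscribe_for_wait_lsn_updates()` builds on later in this patch:

```
use tokio::sync::watch;

// The receiver yields Some(num) while the lowest waiter is blocked on `num`,
// and None once nobody is waiting.
async fn log_waiter_status(mut rx: watch::Receiver<Option<u64>>) {
    loop {
        match *rx.borrow_and_update() {
            Some(num) => println!("first waiter is blocked on {num}"),
            None => println!("no waiters"),
        }
        if rx.changed().await.is_err() {
            break; // sender side (the SeqWait) was dropped
        }
    }
}
```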
@@ -108,7 +168,7 @@ where /// Create a new `SeqWait`, initialized to a particular number pub fn new(starting_num: S) -> Self { let internal = SeqWaitInt { - waiters: BinaryHeap::new(), + waiters: Waiters::new(), current: starting_num, shutdown: false, }; @@ -128,9 +188,8 @@ where // Block any future waiters from starting internal.shutdown = true; - // This will steal the entire waiters map. - // When we drop it all waiters will be woken. - mem::take(&mut internal.waiters) + // Take all waiters to drop them later. + internal.waiters.take_all() // Drop the lock as we exit this scope. }; @@ -196,7 +255,7 @@ where /// Register and return a channel that will be notified when a number arrives, /// or None, if it has already arrived. - fn queue_for_wait(&self, num: V) -> Result>, SeqWaitError> { + fn queue_for_wait(&self, num: V) -> Result>, SeqWaitError> { let mut internal = self.internal.lock().unwrap(); if internal.current.cnt_value() >= num { return Ok(None); @@ -205,12 +264,8 @@ where return Err(SeqWaitError::Shutdown); } - // Create a new channel. - let (tx, rx) = channel(()); - internal.waiters.push(Waiter { - wake_num: num, - wake_channel: tx, - }); + // Add waiter channel to the queue. + let rx = internal.waiters.add(num); // Drop the lock as we exit this scope. Ok(Some(rx)) } @@ -231,16 +286,8 @@ where } internal.current.cnt_advance(num); - // Pop all waiters <= num from the heap. Collect them in a vector, and - // wake them up after releasing the lock. - let mut wake_these = Vec::new(); - while let Some(n) = internal.waiters.peek() { - if n.wake_num > num { - break; - } - wake_these.push(internal.waiters.pop().unwrap().wake_channel); - } - wake_these + // Pop all waiters <= num from the heap. + internal.waiters.pop_leq(num) }; for tx in wake_these { @@ -255,6 +302,23 @@ where pub fn load(&self) -> S { self.internal.lock().unwrap().current } + + /// Get a Receiver for the current status. + /// + /// The current status is the number of the first waiter in the queue, + /// or None if there are no waiters. + /// + /// This receiver will be notified whenever the status changes. + /// It is useful for receiving notifications when the first waiter + /// starts waiting for a number, or when there are no more waiters left. + pub fn status_receiver(&self) -> watch::Receiver> { + self.internal + .lock() + .unwrap() + .waiters + .status_channel + .subscribe() + } } #[cfg(test)] diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 2a2c5d4ee5..5537505749 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1253,6 +1253,12 @@ impl Timeline { self.last_record_lsn.load() } + /// Subscribe to callers of wait_lsn(). The value of the channel is None if there are no + /// wait_lsn() calls in progress, and Some(Lsn) if there is an active waiter for wait_lsn(). 
+ pub(crate) fn subscribe_for_wait_lsn_updates(&self) -> watch::Receiver> { + self.last_record_lsn.status_receiver() + } + pub(crate) fn get_disk_consistent_lsn(&self) -> Lsn { self.disk_consistent_lsn.load() } diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index dae31934ad..7ef063c4e5 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -22,10 +22,12 @@ use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeli use anyhow::Context; use chrono::{NaiveDateTime, Utc}; use pageserver_api::models::TimelineState; -use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey; -use storage_broker::proto::SafekeeperTimelineInfo; -use storage_broker::proto::SubscribeSafekeeperInfoRequest; + use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; +use storage_broker::proto::{ + FilterTenantTimelineId, MessageType, SafekeeperDiscoveryRequest, SafekeeperDiscoveryResponse, + SubscribeByFilterRequest, TypeSubscription, TypedMessage, +}; use storage_broker::{BrokerClientChannel, Code, Streaming}; use tokio_util::sync::CancellationToken; use tracing::*; @@ -89,6 +91,14 @@ pub(super) async fn connection_manager_loop_step( .timeline .subscribe_for_state_updates(); + let mut wait_lsn_status = connection_manager_state + .timeline + .subscribe_for_wait_lsn_updates(); + + // TODO: create a separate config option for discovery request interval + let discovery_request_interval = connection_manager_state.conf.lagging_wal_timeout; + let mut last_discovery_ts: Option = None; + // Subscribe to the broker updates. Stream shares underlying TCP connection // with other streams on this client (other connection managers). When // object goes out of scope, stream finishes in drop() automatically. @@ -97,10 +107,12 @@ pub(super) async fn connection_manager_loop_step( loop { let time_until_next_retry = connection_manager_state.time_until_next_retry(); + let any_activity = connection_manager_state.wal_connection.is_some() + || !connection_manager_state.wal_stream_candidates.is_empty(); // These things are happening concurrently: // - // - cancellation request + // - cancellation request // - keep receiving WAL on the current connection // - if the shared state says we need to change connection, disconnect and return // - this runs in a separate task and we receive updates via a watch channel @@ -108,6 +120,7 @@ pub(super) async fn connection_manager_loop_step( // - receive updates from broker // - this might change the current desired connection // - timeline state changes to something that does not allow walreceiver to run concurrently + // - if there's no connection and no candidates, try to send a discovery request // NB: make sure each of the select expressions are cancellation-safe // (no need for arms to be cancellation-safe). @@ -214,6 +227,65 @@ pub(super) async fn connection_manager_loop_step( } } } => debug!("Waking up for the next retry after waiting for {time_until_next_retry:?}"), + + Some(()) = async { + // Reminder: this match arm needs to be cancellation-safe. + // Calculating time needed to wait until sending the next discovery request. + // Current implementation is conservative and sends discovery requests only when there are no candidates. + + if any_activity { + // No need to send discovery requests if there is an active connection or candidates. 
+ return None; + } + + // Waiting for an active wait_lsn request. + while wait_lsn_status.borrow().is_none() { + if wait_lsn_status.changed().await.is_err() { + // wait_lsn_status channel was closed, exiting + warn!("wait_lsn_status channel was closed in connection_manager_loop_step"); + return None; + } + } + + // All preconditions met, preparing to send a discovery request. + let now = std::time::Instant::now(); + let next_discovery_ts = last_discovery_ts + .map(|ts| ts + discovery_request_interval) + .unwrap_or_else(|| now); + + if next_discovery_ts > now { + // Prevent sending discovery requests too frequently. + tokio::time::sleep(next_discovery_ts - now).await; + } + + let tenant_timeline_id = Some(ProtoTenantTimelineId { + tenant_id: id.tenant_id.as_ref().to_owned(), + timeline_id: id.timeline_id.as_ref().to_owned(), + }); + let request = SafekeeperDiscoveryRequest { tenant_timeline_id }; + let msg = TypedMessage { + r#type: MessageType::SafekeeperDiscoveryRequest as i32, + safekeeper_timeline_info: None, + safekeeper_discovery_request: Some(request), + safekeeper_discovery_response: None, + }; + + last_discovery_ts = Some(std::time::Instant::now()); + debug!("No active connection and no candidates, sending discovery request to the broker"); + + // Cancellation safety: we want to send a message to the broker, but publish_one() + // function can get cancelled by the other select! arm. This is absolutely fine, because + // we just want to receive broker updates and discovery is not important if we already + // receive updates. + // + // It is possible that `last_discovery_ts` will be updated, but the message will not be sent. + // This is totally fine because of the reason above. + + // This is a fire-and-forget request, we don't care about the response + let _ = broker_client.publish_one(msg).await; + debug!("Discovery request sent to the broker"); + None + } => {} } if let Some(new_candidate) = connection_manager_state.next_connection_candidate() { @@ -231,7 +303,7 @@ async fn subscribe_for_timeline_updates( broker_client: &mut BrokerClientChannel, id: TenantTimelineId, cancel: &CancellationToken, -) -> Result, Cancelled> { +) -> Result, Cancelled> { let mut attempt = 0; loop { exponential_backoff( @@ -244,17 +316,27 @@ async fn subscribe_for_timeline_updates( attempt += 1; // subscribe to the specific timeline - let key = SubscriptionKey::TenantTimelineId(ProtoTenantTimelineId { - tenant_id: id.tenant_id.as_ref().to_owned(), - timeline_id: id.timeline_id.as_ref().to_owned(), - }); - let request = SubscribeSafekeeperInfoRequest { - subscription_key: Some(key), + let request = SubscribeByFilterRequest { + types: vec![ + TypeSubscription { + r#type: MessageType::SafekeeperTimelineInfo as i32, + }, + TypeSubscription { + r#type: MessageType::SafekeeperDiscoveryResponse as i32, + }, + ], + tenant_timeline_id: Some(FilterTenantTimelineId { + enabled: true, + tenant_timeline_id: Some(ProtoTenantTimelineId { + tenant_id: id.tenant_id.as_ref().to_owned(), + timeline_id: id.timeline_id.as_ref().to_owned(), + }), + }), }; match { tokio::select! { - r = broker_client.subscribe_safekeeper_info(request) => { r } + r = broker_client.subscribe_by_filter(request) => { r } _ = cancel.cancelled() => { return Err(Cancelled); } } } { @@ -398,7 +480,7 @@ struct RetryInfo { /// Data about the timeline to connect to, received from the broker. 
#[derive(Debug, Clone)] struct BrokerSkTimeline { - timeline: SafekeeperTimelineInfo, + timeline: SafekeeperDiscoveryResponse, /// Time at which the data was fetched from the broker last time, to track the stale data. latest_update: NaiveDateTime, } @@ -606,7 +688,41 @@ impl ConnectionManagerState { } /// Adds another broker timeline into the state, if its more recent than the one already added there for the same key. - fn register_timeline_update(&mut self, timeline_update: SafekeeperTimelineInfo) { + fn register_timeline_update(&mut self, typed_msg: TypedMessage) { + let mut is_discovery = false; + let timeline_update = match typed_msg.r#type() { + MessageType::SafekeeperTimelineInfo => { + let info = match typed_msg.safekeeper_timeline_info { + Some(info) => info, + None => { + warn!("bad proto message from broker: no safekeeper_timeline_info"); + return; + } + }; + SafekeeperDiscoveryResponse { + safekeeper_id: info.safekeeper_id, + tenant_timeline_id: info.tenant_timeline_id, + commit_lsn: info.commit_lsn, + safekeeper_connstr: info.safekeeper_connstr, + availability_zone: info.availability_zone, + } + } + MessageType::SafekeeperDiscoveryResponse => { + is_discovery = true; + match typed_msg.safekeeper_discovery_response { + Some(response) => response, + None => { + warn!("bad proto message from broker: no safekeeper_discovery_response"); + return; + } + } + } + _ => { + // unexpected message + return; + } + }; + WALRECEIVER_BROKER_UPDATES.inc(); let new_safekeeper_id = NodeId(timeline_update.safekeeper_id); @@ -619,7 +735,11 @@ impl ConnectionManagerState { ); if old_entry.is_none() { - info!("New SK node was added: {new_safekeeper_id}"); + info!( + ?is_discovery, + %new_safekeeper_id, + "New SK node was added", + ); WALRECEIVER_CANDIDATES_ADDED.inc(); } } @@ -818,7 +938,7 @@ impl ConnectionManagerState { fn select_connection_candidate( &self, node_to_omit: Option, - ) -> Option<(NodeId, &SafekeeperTimelineInfo, PgConnectionConfig)> { + ) -> Option<(NodeId, &SafekeeperDiscoveryResponse, PgConnectionConfig)> { self.applicable_connection_candidates() .filter(|&(sk_id, _, _)| Some(sk_id) != node_to_omit) .max_by_key(|(_, info, _)| info.commit_lsn) @@ -828,7 +948,7 @@ impl ConnectionManagerState { /// Some safekeepers are filtered by the retry cooldown. fn applicable_connection_candidates( &self, - ) -> impl Iterator { + ) -> impl Iterator { let now = Utc::now().naive_utc(); self.wal_stream_candidates @@ -968,19 +1088,11 @@ mod tests { latest_update: NaiveDateTime, ) -> BrokerSkTimeline { BrokerSkTimeline { - timeline: SafekeeperTimelineInfo { + timeline: SafekeeperDiscoveryResponse { safekeeper_id: 0, tenant_timeline_id: None, - term: 0, - last_log_term: 0, - flush_lsn: 0, commit_lsn, - backup_lsn: 0, - remote_consistent_lsn: 0, - peer_horizon_lsn: 0, - local_start_lsn: 0, safekeeper_connstr: safekeeper_connstr.to_owned(), - http_connstr: safekeeper_connstr.to_owned(), availability_zone: None, }, latest_update, diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index e53ccaeb3d..09c565ce71 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -177,6 +177,10 @@ struct Args { /// Controls how long backup will wait until uploading the partial segment. #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_PARTIAL_BACKUP_TIMEOUT, verbatim_doc_comment)] partial_backup_timeout: Duration, + /// Disable task to push messages to broker every second. Supposed to + /// be used in tests. 
+ #[arg(long)] + disable_periodic_broker_push: bool, } // Like PathBufValueParser, but allows empty string. @@ -309,6 +313,7 @@ async fn main() -> anyhow::Result<()> { walsenders_keep_horizon: args.walsenders_keep_horizon, partial_backup_enabled: args.partial_backup_enabled, partial_backup_timeout: args.partial_backup_timeout, + disable_periodic_broker_push: args.disable_periodic_broker_push, }; // initialize sentry if SENTRY_DSN is provided diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index 2b1db2714b..98f58d3e49 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -10,11 +10,20 @@ use anyhow::Result; use storage_broker::parse_proto_ttid; use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey; +use storage_broker::proto::FilterTenantTimelineId; +use storage_broker::proto::MessageType; +use storage_broker::proto::SafekeeperDiscoveryResponse; +use storage_broker::proto::SubscribeByFilterRequest; use storage_broker::proto::SubscribeSafekeeperInfoRequest; +use storage_broker::proto::TypeSubscription; +use storage_broker::proto::TypedMessage; use storage_broker::Request; +use std::sync::atomic::AtomicU64; +use std::sync::Arc; use std::time::Duration; use std::time::Instant; +use std::time::UNIX_EPOCH; use tokio::task::JoinHandle; use tokio::time::sleep; use tracing::*; @@ -31,6 +40,12 @@ const PUSH_INTERVAL_MSEC: u64 = 1000; /// Push once in a while data about all active timelines to the broker. async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { + if conf.disable_periodic_broker_push { + info!("broker push_loop is disabled, doing nothing..."); + futures::future::pending::<()>().await; // sleep forever + return Ok(()); + } + let mut client = storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?; let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC); @@ -75,7 +90,7 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { } /// Subscribe and fetch all the interesting data from the broker. -async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { +async fn pull_loop(conf: SafeKeeperConf, stats: Arc) -> Result<()> { let mut client = storage_broker::connect(conf.broker_endpoint, conf.broker_keepalive_interval)?; // TODO: subscribe only to local timelines instead of all @@ -94,6 +109,8 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { let err_counter = BROKER_PULLED_UPDATES.with_label_values(&["error"]); while let Some(msg) = stream.message().await? { + stats.update_pulled(); + let proto_ttid = msg .tenant_timeline_id .as_ref() @@ -119,12 +136,93 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { bail!("end of stream"); } +/// Process incoming discover requests. This is done in a separate task to avoid +/// interfering with the normal pull/push loops. +async fn discover_loop(conf: SafeKeeperConf, stats: Arc) -> Result<()> { + let mut client = + storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?; + + let request = SubscribeByFilterRequest { + types: vec![TypeSubscription { + r#type: MessageType::SafekeeperDiscoveryRequest as i32, + }], + tenant_timeline_id: Some(FilterTenantTimelineId { + enabled: false, + tenant_timeline_id: None, + }), + }; + + let mut stream = client + .subscribe_by_filter(request) + .await + .context("subscribe_by_filter request failed")? 
+ .into_inner(); + + let discover_counter = BROKER_PULLED_UPDATES.with_label_values(&["discover"]); + + while let Some(typed_msg) = stream.message().await? { + stats.update_pulled(); + + match typed_msg.r#type() { + MessageType::SafekeeperDiscoveryRequest => { + let msg = typed_msg + .safekeeper_discovery_request + .expect("proto type mismatch from broker message"); + + let proto_ttid = msg + .tenant_timeline_id + .as_ref() + .ok_or_else(|| anyhow!("missing tenant_timeline_id"))?; + let ttid = parse_proto_ttid(proto_ttid)?; + if let Ok(tli) = GlobalTimelines::get(ttid) { + // we received a discovery request for a timeline we know about + discover_counter.inc(); + + // create and reply with discovery response + let sk_info = tli.get_safekeeper_info(&conf).await; + let response = SafekeeperDiscoveryResponse { + safekeeper_id: sk_info.safekeeper_id, + tenant_timeline_id: sk_info.tenant_timeline_id, + commit_lsn: sk_info.commit_lsn, + safekeeper_connstr: sk_info.safekeeper_connstr, + availability_zone: sk_info.availability_zone, + }; + + // note this is a blocking call + client + .publish_one(TypedMessage { + r#type: MessageType::SafekeeperDiscoveryResponse as i32, + safekeeper_timeline_info: None, + safekeeper_discovery_request: None, + safekeeper_discovery_response: Some(response), + }) + .await?; + } + } + + _ => { + warn!( + "unexpected message type i32 {}, {:?}", + typed_msg.r#type, + typed_msg.r#type() + ); + } + } + } + bail!("end of stream"); +} + pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> { info!("started, broker endpoint {:?}", conf.broker_endpoint); let mut ticker = tokio::time::interval(Duration::from_millis(RETRY_INTERVAL_MSEC)); let mut push_handle: Option>> = None; let mut pull_handle: Option>> = None; + let mut discover_handle: Option>> = None; + + let stats = Arc::new(BrokerStats::new()); + let stats_task = task_stats(stats.clone()); + tokio::pin!(stats_task); // Selecting on JoinHandles requires some squats; is there a better way to // reap tasks individually? @@ -153,13 +251,77 @@ pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> { }; pull_handle = None; }, + res = async { discover_handle.as_mut().unwrap().await }, if discover_handle.is_some() => { + // was it panic or normal error? + match res { + Ok(res_internal) => if let Err(err_inner) = res_internal { + warn!("discover task failed: {:?}", err_inner); + } + Err(err_outer) => { warn!("discover task panicked: {:?}", err_outer) } + }; + discover_handle = None; + }, _ = ticker.tick() => { if push_handle.is_none() { push_handle = Some(tokio::spawn(push_loop(conf.clone()))); } if pull_handle.is_none() { - pull_handle = Some(tokio::spawn(pull_loop(conf.clone()))); + pull_handle = Some(tokio::spawn(pull_loop(conf.clone(), stats.clone()))); } + if discover_handle.is_none() { + discover_handle = Some(tokio::spawn(discover_loop(conf.clone(), stats.clone()))); + } + }, + _ = &mut stats_task => {} + } + } +} + +struct BrokerStats { + /// Timestamp of the last received message from the broker. + last_pulled_ts: AtomicU64, +} + +impl BrokerStats { + fn new() -> Self { + BrokerStats { + last_pulled_ts: AtomicU64::new(0), + } + } + + fn now_millis() -> u64 { + std::time::SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("time is before epoch") + .as_millis() as u64 + } + + /// Update last_pulled timestamp to current time. 
+ fn update_pulled(&self) { + self.last_pulled_ts + .store(Self::now_millis(), std::sync::atomic::Ordering::Relaxed); + } +} + +/// Periodically write to logs if there are issues with receiving data from the broker. +async fn task_stats(stats: Arc) { + let warn_duration = Duration::from_secs(10); + let mut ticker = tokio::time::interval(warn_duration); + + loop { + tokio::select! { + _ = ticker.tick() => { + let last_pulled = stats.last_pulled_ts.load(std::sync::atomic::Ordering::SeqCst); + if last_pulled == 0 { + // no broker updates yet + continue; + } + + let now = BrokerStats::now_millis(); + if now > last_pulled && now - last_pulled > warn_duration.as_millis() as u64 { + let ts = chrono::NaiveDateTime::from_timestamp_millis(last_pulled as i64).expect("invalid timestamp"); + info!("no broker updates for some time, last update: {:?}", ts); + } } } } diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 9b4d4dbb38..543714a54e 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -83,6 +83,7 @@ pub struct SafeKeeperConf { pub walsenders_keep_horizon: bool, pub partial_backup_enabled: bool, pub partial_backup_timeout: Duration, + pub disable_periodic_broker_push: bool, } impl SafeKeeperConf { @@ -129,6 +130,7 @@ impl SafeKeeperConf { walsenders_keep_horizon: false, partial_backup_enabled: false, partial_backup_timeout: Duration::from_secs(0), + disable_periodic_broker_push: false, } } } diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index bc21c4d765..27e2a4453b 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -178,6 +178,7 @@ pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { walsenders_keep_horizon: false, partial_backup_enabled: false, partial_backup_timeout: Duration::from_secs(0), + disable_periodic_broker_push: false, }; let mut global = GlobalMap::new(disk, conf.clone())?; diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 4e5f8ed724..8c88b61abc 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -196,8 +196,13 @@ impl SubscriptionKey { /// Parse from FilterTenantTimelineId pub fn from_proto_filter_tenant_timeline_id( - f: &FilterTenantTimelineId, + opt: Option<&FilterTenantTimelineId>, ) -> Result { + if opt.is_none() { + return Ok(SubscriptionKey::All); + } + + let f = opt.unwrap(); if !f.enabled { return Ok(SubscriptionKey::All); } @@ -534,10 +539,7 @@ impl BrokerService for Broker { .remote_addr() .expect("TCPConnectInfo inserted by handler"); let proto_filter = request.into_inner(); - let ttid_filter = proto_filter - .tenant_timeline_id - .as_ref() - .ok_or_else(|| Status::new(Code::InvalidArgument, "missing tenant_timeline_id"))?; + let ttid_filter = proto_filter.tenant_timeline_id.as_ref(); let sub_key = SubscriptionKey::from_proto_filter_tenant_timeline_id(ttid_filter)?; let types_set = proto_filter diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index ac1a747df3..967d133e18 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1828,7 +1828,7 @@ def test_idle_reconnections(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() tenant_id = env.initial_tenant - timeline_id = env.neon_cli.create_branch("test_sk_auth_restart_endpoint") + timeline_id = env.neon_cli.create_branch("test_idle_reconnections") def 
collect_stats() -> Dict[str, float]: # we need to collect safekeeper_pg_queries_received_total metric from all safekeepers @@ -1859,7 +1859,7 @@ def test_idle_reconnections(neon_env_builder: NeonEnvBuilder): collect_stats() - endpoint = env.endpoints.create_start("test_sk_auth_restart_endpoint") + endpoint = env.endpoints.create_start("test_idle_reconnections") # just write something to the timeline endpoint.safe_psql("create table t(i int)") collect_stats() @@ -2007,3 +2007,47 @@ def test_patch_control_file(neon_env_builder: NeonEnvBuilder): ) log.info(f"dump_control_file response: {res}") assert res["timelines"][0]["control_file"]["timeline_start_lsn"] == "0/1" + + +# Test disables periodic pushes from safekeeper to the broker and checks that +# pageserver can still discover safekeepers with discovery requests. +def test_broker_discovery(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + neon_env_builder.enable_safekeeper_remote_storage(RemoteStorageKind.LOCAL_FS) + env = neon_env_builder.init_start() + + env.neon_cli.create_branch("test_broker_discovery") + + endpoint = env.endpoints.create_start( + "test_broker_discovery", + config_lines=["shared_buffers=1MB"], + ) + endpoint.safe_psql("create table t(i int, payload text)") + # Install extension containing function needed to clear buffer + endpoint.safe_psql("CREATE EXTENSION neon_test_utils") + + def do_something(): + time.sleep(1) + # generate some data to commit WAL on safekeepers + endpoint.safe_psql("insert into t select generate_series(1,100), 'action'") + # clear the buffers + endpoint.safe_psql("select clear_buffer_cache()") + # read data to fetch pages from pageserver + endpoint.safe_psql("select sum(i) from t") + + do_something() + do_something() + + for sk in env.safekeepers: + # Disable periodic broker push, so pageserver won't be able to discover + # safekeepers without sending a discovery request + sk.stop().start(extra_opts=["--disable-periodic-broker-push"]) + + do_something() + do_something() + + # restart pageserver and check how everything works + env.pageserver.stop().start() + + do_something() + do_something() From 26e6ff8ba61c896cae9fd35c1683b0126203f345 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 1 May 2024 11:44:42 -0400 Subject: [PATCH 0665/1571] chore(pageserver): concise error message for layer traversal (#7565) Instead of showing the full path of layer traversal, we now only show tenant (in tracing context)+timeline+filename. --------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant/storage_layer/layer.rs | 10 +++++----- pageserver/src/tenant/timeline.rs | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index ee9de8de09..2b6934fcee 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -401,8 +401,8 @@ impl Layer { &self.0.path } - pub(crate) fn local_path_str(&self) -> &Arc { - &self.0.path_str + pub(crate) fn debug_str(&self) -> &Arc { + &self.0.debug_str } pub(crate) fn metadata(&self) -> LayerFileMetadata { @@ -527,8 +527,8 @@ struct LayerInner { /// Full path to the file; unclear if this should exist anymore. path: Utf8PathBuf, - /// String representation of the full path, used for traversal id. - path_str: Arc, + /// String representation of the layer, used for traversal id. 
+ debug_str: Arc, desc: PersistentLayerDesc, @@ -735,7 +735,7 @@ impl LayerInner { LayerInner { conf, - path_str: path.to_string().into(), + debug_str: { format!("timelines/{}/{}", timeline.timeline_id, desc.filename()).into() }, path, desc, timeline: Arc::downgrade(timeline), diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 5537505749..cda873d649 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2948,7 +2948,7 @@ trait TraversalLayerExt { impl TraversalLayerExt for Layer { fn traversal_id(&self) -> TraversalId { - Arc::clone(self.local_path_str()) + Arc::clone(self.debug_str()) } } From 5558457c84c2cb2c948989a2ac4139322dce50e3 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 1 May 2024 12:31:59 -0400 Subject: [PATCH 0666/1571] chore(pageserver): categorize basebackup errors (#7523) close https://github.com/neondatabase/neon/issues/7391 ## Summary of changes Categorize basebackup error into two types: server error and client error. This makes it easier to set up alerts. --------- Signed-off-by: Alex Chi Z --- pageserver/src/basebackup.rs | 197 ++++++++++++++++++++++++--------- pageserver/src/page_service.rs | 28 ++++- 2 files changed, 166 insertions(+), 59 deletions(-) diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 53abd8bfb9..58b18dae7d 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -10,7 +10,7 @@ //! This module is responsible for creation of such tarball //! from data stored in object storage. //! -use anyhow::{anyhow, bail, ensure, Context}; +use anyhow::{anyhow, Context}; use bytes::{BufMut, Bytes, BytesMut}; use fail::fail_point; use pageserver_api::key::{key_to_slru_block, Key}; @@ -38,6 +38,14 @@ use postgres_ffi::PG_TLI; use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE}; use utils::lsn::Lsn; +#[derive(Debug, thiserror::Error)] +pub enum BasebackupError { + #[error("basebackup pageserver error {0:#}")] + Server(#[from] anyhow::Error), + #[error("basebackup client error {0:#}")] + Client(#[source] io::Error), +} + /// Create basebackup with non-rel data in it. /// Only include relational data if 'full_backup' is true. 
/// @@ -53,7 +61,7 @@ pub async fn send_basebackup_tarball<'a, W>( prev_lsn: Option, full_backup: bool, ctx: &'a RequestContext, -) -> anyhow::Result<()> +) -> Result<(), BasebackupError> where W: AsyncWrite + Send + Sync + Unpin, { @@ -92,8 +100,10 @@ where // Consolidate the derived and the provided prev_lsn values let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn { - if backup_prev != Lsn(0) { - ensure!(backup_prev == provided_prev_lsn); + if backup_prev != Lsn(0) && backup_prev != provided_prev_lsn { + return Err(BasebackupError::Server(anyhow!( + "backup_prev {backup_prev} != provided_prev_lsn {provided_prev_lsn}" + ))); } provided_prev_lsn } else { @@ -159,15 +169,26 @@ where } } - async fn add_block(&mut self, key: &Key, block: Bytes) -> anyhow::Result<()> { + async fn add_block(&mut self, key: &Key, block: Bytes) -> Result<(), BasebackupError> { let (kind, segno, _) = key_to_slru_block(*key)?; match kind { SlruKind::Clog => { - ensure!(block.len() == BLCKSZ as usize || block.len() == BLCKSZ as usize + 8); + if !(block.len() == BLCKSZ as usize || block.len() == BLCKSZ as usize + 8) { + return Err(BasebackupError::Server(anyhow!( + "invalid SlruKind::Clog record: block.len()={}", + block.len() + ))); + } } SlruKind::MultiXactMembers | SlruKind::MultiXactOffsets => { - ensure!(block.len() == BLCKSZ as usize); + if block.len() != BLCKSZ as usize { + return Err(BasebackupError::Server(anyhow!( + "invalid {:?} record: block.len()={}", + kind, + block.len() + ))); + } } } @@ -194,12 +215,15 @@ where Ok(()) } - async fn flush(&mut self) -> anyhow::Result<()> { + async fn flush(&mut self) -> Result<(), BasebackupError> { let nblocks = self.buf.len() / BLCKSZ as usize; let (kind, segno) = self.current_segment.take().unwrap(); let segname = format!("{}/{:>04X}", kind.to_str(), segno); let header = new_tar_header(&segname, self.buf.len() as u64)?; - self.ar.append(&header, self.buf.as_slice()).await?; + self.ar + .append(&header, self.buf.as_slice()) + .await + .map_err(BasebackupError::Client)?; self.total_blocks += nblocks; debug!("Added to basebackup slru {} relsize {}", segname, nblocks); @@ -209,7 +233,7 @@ where Ok(()) } - async fn finish(mut self) -> anyhow::Result<()> { + async fn finish(mut self) -> Result<(), BasebackupError> { let res = if self.current_segment.is_none() || self.buf.is_empty() { Ok(()) } else { @@ -226,7 +250,7 @@ impl<'a, W> Basebackup<'a, W> where W: AsyncWrite + Send + Sync + Unpin, { - async fn send_tarball(mut self) -> anyhow::Result<()> { + async fn send_tarball(mut self) -> Result<(), BasebackupError> { // TODO include checksum let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup; @@ -262,7 +286,8 @@ where let slru_partitions = self .timeline .get_slru_keyspace(Version::Lsn(self.lsn), self.ctx) - .await? + .await + .map_err(|e| BasebackupError::Server(e.into()))? 
.partition( self.timeline.get_shard_identity(), Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64, @@ -271,10 +296,15 @@ where let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar); for part in slru_partitions.parts { - let blocks = self.timeline.get_vectored(part, self.lsn, self.ctx).await?; + let blocks = self + .timeline + .get_vectored(part, self.lsn, self.ctx) + .await + .map_err(|e| BasebackupError::Server(e.into()))?; for (key, block) in blocks { - slru_builder.add_block(&key, block?).await?; + let block = block.map_err(|e| BasebackupError::Server(e.into()))?; + slru_builder.add_block(&key, block).await?; } } slru_builder.finish().await?; @@ -282,8 +312,11 @@ where let mut min_restart_lsn: Lsn = Lsn::MAX; // Create tablespace directories - for ((spcnode, dbnode), has_relmap_file) in - self.timeline.list_dbdirs(self.lsn, self.ctx).await? + for ((spcnode, dbnode), has_relmap_file) in self + .timeline + .list_dbdirs(self.lsn, self.ctx) + .await + .map_err(|e| BasebackupError::Server(e.into()))? { self.add_dbdir(spcnode, dbnode, has_relmap_file).await?; @@ -292,7 +325,8 @@ where let rels = self .timeline .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) - .await?; + .await + .map_err(|e| BasebackupError::Server(e.into()))?; for &rel in rels.iter() { // Send init fork as main fork to provide well formed empty // contents of UNLOGGED relations. Postgres copies it in @@ -315,7 +349,12 @@ where } } - for (path, content) in self.timeline.list_aux_files(self.lsn, self.ctx).await? { + for (path, content) in self + .timeline + .list_aux_files(self.lsn, self.ctx) + .await + .map_err(|e| BasebackupError::Server(e.into()))? + { if path.starts_with("pg_replslot") { let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN; let restart_lsn = Lsn(u64::from_le_bytes( @@ -346,34 +385,41 @@ where for xid in self .timeline .list_twophase_files(self.lsn, self.ctx) - .await? + .await + .map_err(|e| BasebackupError::Server(e.into()))? { self.add_twophase_file(xid).await?; } fail_point!("basebackup-before-control-file", |_| { - bail!("failpoint basebackup-before-control-file") + Err(BasebackupError::Server(anyhow!( + "failpoint basebackup-before-control-file" + ))) }); // Generate pg_control and bootstrap WAL segment. self.add_pgcontrol_file().await?; - self.ar.finish().await?; + self.ar.finish().await.map_err(BasebackupError::Client)?; debug!("all tarred up!"); Ok(()) } /// Add contents of relfilenode `src`, naming it as `dst`. 
- async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> { + async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> Result<(), BasebackupError> { let nblocks = self .timeline .get_rel_size(src, Version::Lsn(self.lsn), self.ctx) - .await?; + .await + .map_err(|e| BasebackupError::Server(e.into()))?; // If the relation is empty, create an empty file if nblocks == 0 { let file_name = dst.to_segfile_name(0); let header = new_tar_header(&file_name, 0)?; - self.ar.append(&header, &mut io::empty()).await?; + self.ar + .append(&header, &mut io::empty()) + .await + .map_err(BasebackupError::Client)?; return Ok(()); } @@ -388,13 +434,17 @@ where let img = self .timeline .get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), self.ctx) - .await?; + .await + .map_err(|e| BasebackupError::Server(e.into()))?; segment_data.extend_from_slice(&img[..]); } let file_name = dst.to_segfile_name(seg as u32); let header = new_tar_header(&file_name, segment_data.len() as u64)?; - self.ar.append(&header, segment_data.as_slice()).await?; + self.ar + .append(&header, segment_data.as_slice()) + .await + .map_err(BasebackupError::Client)?; seg += 1; startblk = endblk; @@ -414,20 +464,22 @@ where spcnode: u32, dbnode: u32, has_relmap_file: bool, - ) -> anyhow::Result<()> { + ) -> Result<(), BasebackupError> { let relmap_img = if has_relmap_file { let img = self .timeline .get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) - .await?; + .await + .map_err(|e| BasebackupError::Server(e.into()))?; - ensure!( - img.len() - == dispatch_pgversion!( - self.timeline.pg_version, - pgv::bindings::SIZEOF_RELMAPFILE - ) - ); + if img.len() + != dispatch_pgversion!(self.timeline.pg_version, pgv::bindings::SIZEOF_RELMAPFILE) + { + return Err(BasebackupError::Server(anyhow!( + "img.len() != SIZE_OF_RELMAPFILE, img.len()={}", + img.len(), + ))); + } Some(img) } else { @@ -440,14 +492,20 @@ where ver => format!("{ver}\x0A"), }; let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?; - self.ar.append(&header, pg_version_str.as_bytes()).await?; + self.ar + .append(&header, pg_version_str.as_bytes()) + .await + .map_err(BasebackupError::Client)?; info!("timeline.pg_version {}", self.timeline.pg_version); if let Some(img) = relmap_img { // filenode map for global tablespace let header = new_tar_header("global/pg_filenode.map", img.len() as u64)?; - self.ar.append(&header, &img[..]).await?; + self.ar + .append(&header, &img[..]) + .await + .map_err(BasebackupError::Client)?; } else { warn!("global/pg_filenode.map is missing"); } @@ -466,18 +524,26 @@ where && self .timeline .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) - .await? + .await + .map_err(|e| BasebackupError::Server(e.into()))? 
.is_empty() { return Ok(()); } // User defined tablespaces are not supported - ensure!(spcnode == DEFAULTTABLESPACE_OID); + if spcnode != DEFAULTTABLESPACE_OID { + return Err(BasebackupError::Server(anyhow!( + "spcnode != DEFAULTTABLESPACE_OID, spcnode={spcnode}" + ))); + } // Append dir path for each database let path = format!("base/{}", dbnode); let header = new_tar_header_dir(&path)?; - self.ar.append(&header, &mut io::empty()).await?; + self.ar + .append(&header, &mut io::empty()) + .await + .map_err(BasebackupError::Client)?; if let Some(img) = relmap_img { let dst_path = format!("base/{}/PG_VERSION", dbnode); @@ -487,11 +553,17 @@ where ver => format!("{ver}\x0A"), }; let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?; - self.ar.append(&header, pg_version_str.as_bytes()).await?; + self.ar + .append(&header, pg_version_str.as_bytes()) + .await + .map_err(BasebackupError::Client)?; let relmap_path = format!("base/{}/pg_filenode.map", dbnode); let header = new_tar_header(&relmap_path, img.len() as u64)?; - self.ar.append(&header, &img[..]).await?; + self.ar + .append(&header, &img[..]) + .await + .map_err(BasebackupError::Client)?; } }; Ok(()) @@ -500,11 +572,12 @@ where // // Extract twophase state files // - async fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> { + async fn add_twophase_file(&mut self, xid: TransactionId) -> Result<(), BasebackupError> { let img = self .timeline .get_twophase_file(xid, self.lsn, self.ctx) - .await?; + .await + .map_err(|e| BasebackupError::Server(e.into()))?; let mut buf = BytesMut::new(); buf.extend_from_slice(&img[..]); @@ -512,7 +585,10 @@ where buf.put_u32_le(crc); let path = format!("pg_twophase/{:>08X}", xid); let header = new_tar_header(&path, buf.len() as u64)?; - self.ar.append(&header, &buf[..]).await?; + self.ar + .append(&header, &buf[..]) + .await + .map_err(BasebackupError::Client)?; Ok(()) } @@ -521,24 +597,28 @@ where // Add generated pg_control file and bootstrap WAL segment. // Also send zenith.signal file with extra bootstrap data. 
// - async fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> { + async fn add_pgcontrol_file(&mut self) -> Result<(), BasebackupError> { // add zenith.signal file let mut zenith_signal = String::new(); if self.prev_record_lsn == Lsn(0) { if self.lsn == self.timeline.get_ancestor_lsn() { - write!(zenith_signal, "PREV LSN: none")?; + write!(zenith_signal, "PREV LSN: none") + .map_err(|e| BasebackupError::Server(e.into()))?; } else { - write!(zenith_signal, "PREV LSN: invalid")?; + write!(zenith_signal, "PREV LSN: invalid") + .map_err(|e| BasebackupError::Server(e.into()))?; } } else { - write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)?; + write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn) + .map_err(|e| BasebackupError::Server(e.into()))?; } self.ar .append( &new_tar_header("zenith.signal", zenith_signal.len() as u64)?, zenith_signal.as_bytes(), ) - .await?; + .await + .map_err(BasebackupError::Client)?; let checkpoint_bytes = self .timeline @@ -560,7 +640,10 @@ where //send pg_control let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?; - self.ar.append(&header, &pg_control_bytes[..]).await?; + self.ar + .append(&header, &pg_control_bytes[..]) + .await + .map_err(BasebackupError::Client)?; //send wal segment let segno = self.lsn.segment_number(WAL_SEGMENT_SIZE); @@ -575,8 +658,16 @@ where self.lsn, ) .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?; - ensure!(wal_seg.len() == WAL_SEGMENT_SIZE); - self.ar.append(&header, &wal_seg[..]).await?; + if wal_seg.len() != WAL_SEGMENT_SIZE { + return Err(BasebackupError::Server(anyhow!( + "wal_seg.len() != WAL_SEGMENT_SIZE, wal_seg.len()={}", + wal_seg.len() + ))); + } + self.ar + .append(&header, &wal_seg[..]) + .await + .map_err(BasebackupError::Client)?; Ok(()) } } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 96d2397c94..f6b251283c 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -48,6 +48,7 @@ use utils::{ use crate::auth::check_permission; use crate::basebackup; +use crate::basebackup::BasebackupError; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::import_datadir::import_wal_from_tar; @@ -1236,6 +1237,13 @@ impl PageServerHandler { where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { + fn map_basebackup_error(err: BasebackupError) -> QueryError { + match err { + BasebackupError::Client(e) => QueryError::Disconnected(ConnectionError::Io(e)), + BasebackupError::Server(e) => QueryError::Other(e), + } + } + let started = std::time::Instant::now(); // check that the timeline exists @@ -1261,7 +1269,8 @@ impl PageServerHandler { let lsn_awaited_after = started.elapsed(); // switch client to COPYOUT - pgb.write_message_noflush(&BeMessage::CopyOutResponse)?; + pgb.write_message_noflush(&BeMessage::CopyOutResponse) + .map_err(QueryError::Disconnected)?; self.flush_cancellable(pgb, &timeline.cancel).await?; // Send a tarball of the latest layer on the timeline. 
Compress if not @@ -1276,7 +1285,8 @@ impl PageServerHandler { full_backup, ctx, ) - .await?; + .await + .map_err(map_basebackup_error)?; } else { let mut writer = pgb.copyout_writer(); if gzip { @@ -1297,9 +1307,13 @@ impl PageServerHandler { full_backup, ctx, ) - .await?; + .await + .map_err(map_basebackup_error)?; // shutdown the encoder to ensure the gzip footer is written - encoder.shutdown().await?; + encoder + .shutdown() + .await + .map_err(|e| QueryError::Disconnected(ConnectionError::Io(e)))?; } else { basebackup::send_basebackup_tarball( &mut writer, @@ -1309,11 +1323,13 @@ impl PageServerHandler { full_backup, ctx, ) - .await?; + .await + .map_err(map_basebackup_error)?; } } - pgb.write_message_noflush(&BeMessage::CopyDone)?; + pgb.write_message_noflush(&BeMessage::CopyDone) + .map_err(QueryError::Disconnected)?; self.flush_cancellable(pgb, &timeline.cancel).await?; let basebackup_after = started From d43d77389e3d38408ec74d7f30b243d1c181569b Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Wed, 1 May 2024 21:36:50 -0700 Subject: [PATCH 0667/1571] Add retry loops and bump test timeout in test_pageserver_connection_stress (#7281) --- test_runner/regress/test_bad_connection.py | 23 ++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/test_runner/regress/test_bad_connection.py b/test_runner/regress/test_bad_connection.py index c808fa0f54..82a3a05c2b 100644 --- a/test_runner/regress/test_bad_connection.py +++ b/test_runner/regress/test_bad_connection.py @@ -1,10 +1,13 @@ import random import time +import psycopg2.errors +import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder +@pytest.mark.timeout(600) def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() env.pageserver.allowed_errors.append(".*simulated connection error.*") @@ -20,12 +23,20 @@ def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder): pg_conn = endpoint.connect() cur = pg_conn.cursor() + def execute_retry_on_timeout(query): + while True: + try: + cur.execute(query) + return + except psycopg2.errors.QueryCanceled: + log.info(f"Query '{query}' timed out - retrying") + # Create table, and insert some rows. Make it big enough that it doesn't fit in # shared_buffers, otherwise the SELECT after restart will just return answer # from shared_buffers without hitting the page server, which defeats the point # of this test. 
- cur.execute("CREATE TABLE foo (t text)") - cur.execute( + execute_retry_on_timeout("CREATE TABLE foo (t text)") + execute_retry_on_timeout( """ INSERT INTO foo SELECT 'long string to consume some space' || g @@ -34,7 +45,7 @@ def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder): ) # Verify that the table is larger than shared_buffers - cur.execute( + execute_retry_on_timeout( """ select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size from pg_settings where name = 'shared_buffers' @@ -45,16 +56,16 @@ def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder): log.info(f"shared_buffers is {row[0]}, table size {row[1]}") assert int(row[0]) < int(row[1]) - cur.execute("SELECT count(*) FROM foo") + execute_retry_on_timeout("SELECT count(*) FROM foo") assert cur.fetchone() == (100000,) end_time = time.time() + 30 times_executed = 0 while time.time() < end_time: if random.random() < 0.5: - cur.execute("INSERT INTO foo VALUES ('stas'), ('heikki')") + execute_retry_on_timeout("INSERT INTO foo VALUES ('stas'), ('heikki')") else: - cur.execute("SELECT t FROM foo ORDER BY RANDOM() LIMIT 10") + execute_retry_on_timeout("SELECT t FROM foo ORDER BY RANDOM() LIMIT 10") cur.fetchall() times_executed += 1 log.info(f"Workload executed {times_executed} times") From cb4b4750badbbe02a2b8000f0df3a490cc3664c1 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 2 May 2024 10:16:04 +0100 Subject: [PATCH 0668/1571] update to reqwest 0.12 (#7561) ## Problem #7557 ## Summary of changes --- Cargo.lock | 282 +++++++++++++++++------- Cargo.toml | 13 +- control_plane/src/storage_controller.rs | 4 +- control_plane/storcon_cli/src/main.rs | 5 +- pageserver/src/control_plane_client.rs | 2 +- proxy/Cargo.toml | 5 +- proxy/src/http.rs | 17 +- proxy/src/proxy/wake_compute.rs | 2 +- storage_controller/Cargo.toml | 2 +- storage_controller/src/compute_hook.rs | 36 ++- storage_controller/src/http.rs | 25 ++- storage_controller/src/node.rs | 2 +- storage_controller/src/reconciler.rs | 2 +- storage_controller/src/service.rs | 2 +- workspace_hack/Cargo.toml | 3 +- 15 files changed, 273 insertions(+), 129 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e4bf71c64f..775a0d977d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -595,7 +595,7 @@ dependencies = [ "http 0.2.9", "http-body 0.4.5", "hyper 0.14.26", - "hyper-rustls", + "hyper-rustls 0.24.0", "once_cell", "pin-project-lite", "pin-utils", @@ -684,7 +684,7 @@ dependencies = [ "http-body 0.4.5", "hyper 0.14.26", "itoa", - "matchit", + "matchit 0.7.0", "memchr", "mime", "percent-encoding", @@ -740,7 +740,7 @@ dependencies = [ "pin-project", "quick-xml", "rand 0.8.5", - "reqwest", + "reqwest 0.11.19", "rustc_version", "serde", "serde_json", @@ -865,6 +865,12 @@ version = "0.21.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f1e31e207a6b8fb791a38ea3105e6cb541f55e4d029902d3039a4ad07cc4105" +[[package]] +name = "base64" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9475866fec1451be56a3c2400fd081ff546538961565ccb5b7142cbd22bc7a51" + [[package]] name = "base64-simd" version = "0.8.0" @@ -1210,7 +1216,7 @@ dependencies = [ "postgres", "regex", "remote_storage", - "reqwest", + "reqwest 0.12.4", "rust-ini", "serde", "serde_json", @@ -1329,7 +1335,7 @@ dependencies = [ "postgres_backend", "postgres_connection", "regex", - "reqwest", + "reqwest 0.12.4", "safekeeper_api", "scopeguard", "serde", @@ -2363,6 +2369,17 
@@ dependencies = [ "winapi", ] +[[package]] +name = "hostname" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9c7c7c8ac16c798734b8a24560c1362120597c40d5e1459f09498f8f6c8f2ba" +dependencies = [ + "cfg-if", + "libc", + "windows 0.52.0", +] + [[package]] name = "http" version = "0.2.9" @@ -2509,6 +2526,7 @@ dependencies = [ "pin-project-lite", "smallvec", "tokio", + "want", ] [[package]] @@ -2526,6 +2544,23 @@ dependencies = [ "tokio-rustls 0.24.0", ] +[[package]] +name = "hyper-rustls" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0bea761b46ae2b24eb4aef630d8d1c398157b6fc29e6350ecf090a0b70c952c" +dependencies = [ + "futures-util", + "http 1.1.0", + "hyper 1.2.0", + "hyper-util", + "rustls 0.22.4", + "rustls-pki-types", + "tokio", + "tokio-rustls 0.25.0", + "tower-service", +] + [[package]] name = "hyper-timeout" version = "0.4.1" @@ -2573,6 +2608,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca38ef113da30126bbff9cd1705f9273e15d45498615d138b0c20279ac7a76aa" dependencies = [ "bytes", + "futures-channel", "futures-util", "http 1.1.0", "http-body 1.0.0", @@ -2580,6 +2616,9 @@ dependencies = [ "pin-project-lite", "socket2 0.5.5", "tokio", + "tower", + "tower-service", + "tracing", ] [[package]] @@ -2593,7 +2632,7 @@ dependencies = [ "iana-time-zone-haiku", "js-sys", "wasm-bindgen", - "windows", + "windows 0.48.0", ] [[package]] @@ -2916,6 +2955,12 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b87248edafb776e59e6ee64a79086f65890d3510f2c656c000bf2a7e8a0aea40" +[[package]] +name = "matchit" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "540f1c43aed89909c0cc0cc604e3bb2f7e7a341a3728a9e6cfe760e733cd11ed" + [[package]] name = "md-5" version = "0.10.5" @@ -3049,16 +3094,6 @@ version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" -[[package]] -name = "mime_guess" -version = "2.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4192263c238a5f0d0c6bfd21f336a313a4ce1c450542449ca191bb657b4642ef" -dependencies = [ - "mime", - "unicase", -] - [[package]] name = "minimal-lexical" version = "0.2.1" @@ -3402,7 +3437,7 @@ dependencies = [ "bytes", "http 0.2.9", "opentelemetry_api", - "reqwest", + "reqwest 0.11.19", ] [[package]] @@ -3420,7 +3455,7 @@ dependencies = [ "opentelemetry_api", "opentelemetry_sdk", "prost", - "reqwest", + "reqwest 0.11.19", "thiserror", "tokio", "tonic", @@ -3649,7 +3684,7 @@ dependencies = [ "rand 0.8.5", "regex", "remote_storage", - "reqwest", + "reqwest 0.12.4", "rpds", "scopeguard", "serde", @@ -3719,7 +3754,7 @@ dependencies = [ "futures", "pageserver_api", "postgres", - "reqwest", + "reqwest 0.12.4", "serde", "thiserror", "tokio", @@ -4328,7 +4363,7 @@ dependencies = [ "hashlink", "hex", "hmac", - "hostname", + "hostname 0.3.1", "http 1.1.0", "http-body-util", "humantime", @@ -4361,7 +4396,7 @@ dependencies = [ "redis", "regex", "remote_storage", - "reqwest", + "reqwest 0.12.4", "reqwest-middleware", "reqwest-retry", "reqwest-tracing", @@ -4388,6 +4423,7 @@ dependencies = [ "tokio-postgres-rustls", "tokio-rustls 0.25.0", "tokio-util", + "tower-service", "tracing", "tracing-opentelemetry", "tracing-subscriber", @@ -4703,69 +4739,106 @@ dependencies = [ "http 0.2.9", "http-body 0.4.5", "hyper 0.14.26", - 
"hyper-rustls", "hyper-tls", "ipnet", "js-sys", "log", "mime", - "mime_guess", "native-tls", "once_cell", "percent-encoding", "pin-project-lite", - "rustls 0.21.11", - "rustls-pemfile 1.0.2", "serde", "serde_json", "serde_urlencoded", "tokio", "tokio-native-tls", - "tokio-rustls 0.24.0", "tokio-util", "tower-service", "url", "wasm-bindgen", "wasm-bindgen-futures", - "wasm-streams", + "wasm-streams 0.3.0", "web-sys", - "webpki-roots 0.25.2", - "winreg", + "winreg 0.50.0", +] + +[[package]] +name = "reqwest" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "566cafdd92868e0939d3fb961bd0dc25fcfaaed179291093b3d43e6b3150ea10" +dependencies = [ + "base64 0.22.0", + "bytes", + "futures-channel", + "futures-core", + "futures-util", + "http 1.1.0", + "http-body 1.0.0", + "http-body-util", + "hyper 1.2.0", + "hyper-rustls 0.26.0", + "hyper-util", + "ipnet", + "js-sys", + "log", + "mime", + "once_cell", + "percent-encoding", + "pin-project-lite", + "rustls 0.22.4", + "rustls-pemfile 2.1.1", + "rustls-pki-types", + "serde", + "serde_json", + "serde_urlencoded", + "sync_wrapper", + "tokio", + "tokio-rustls 0.25.0", + "tokio-util", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "wasm-streams 0.4.0", + "web-sys", + "webpki-roots 0.26.1", + "winreg 0.52.0", ] [[package]] name = "reqwest-middleware" -version = "0.2.2" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4531c89d50effe1fac90d095c8b133c20c5c714204feee0bfc3fd158e784209d" +checksum = "0209efb52486ad88136190094ee214759ef7507068b27992256ed6610eb71a01" dependencies = [ "anyhow", "async-trait", - "http 0.2.9", - "reqwest", + "http 1.1.0", + "reqwest 0.12.4", "serde", - "task-local-extensions", "thiserror", + "tower-service", ] [[package]] name = "reqwest-retry" -version = "0.2.2" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48d0fd6ef4c6d23790399fe15efc8d12cd9f3d4133958f9bd7801ee5cbaec6c4" +checksum = "40f342894422862af74c50e1e9601cf0931accc9c6981e5eb413c46603b616b5" dependencies = [ "anyhow", "async-trait", "chrono", "futures", "getrandom 0.2.11", - "http 0.2.9", - "hyper 0.14.26", + "http 1.1.0", + "hyper 1.2.0", "parking_lot 0.11.2", - "reqwest", + "reqwest 0.12.4", "reqwest-middleware", "retry-policies", - "task-local-extensions", "tokio", "tracing", "wasm-timer", @@ -4773,27 +4846,27 @@ dependencies = [ [[package]] name = "reqwest-tracing" -version = "0.4.7" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a0152176687dd5cfe7f507ac1cb1a491c679cfe483afd133a7db7aaea818bb3" +checksum = "b253954a1979e02eabccd7e9c3d61d8f86576108baa160775e7f160bb4e800a3" dependencies = [ "anyhow", "async-trait", "getrandom 0.2.11", - "matchit", + "http 1.1.0", + "matchit 0.8.2", "opentelemetry", - "reqwest", + "reqwest 0.12.4", "reqwest-middleware", - "task-local-extensions", "tracing", "tracing-opentelemetry", ] [[package]] name = "retry-policies" -version = "0.1.2" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e09bbcb5003282bcb688f0bae741b278e9c7e8f378f561522c9806c58e075d9b" +checksum = "493b4243e32d6eedd29f9a398896e35c6943a123b55eec97dcaee98310d25810" dependencies = [ "anyhow", "chrono", @@ -5119,7 +5192,7 @@ dependencies = [ "postgres_ffi", "rand 0.8.5", "remote_storage", - "reqwest", + "reqwest 0.12.4", "serde", "serde_json", "serde_with", @@ -5170,7 +5243,7 @@ dependencies = [ "rand 0.8.5", 
"regex", "remote_storage", - "reqwest", + "reqwest 0.12.4", "safekeeper_api", "scopeguard", "sd-notify", @@ -5300,12 +5373,12 @@ checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed" [[package]] name = "sentry" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e95efd0cefa32028cdb9766c96de71d96671072f9fb494dc9fb84c0ef93e52b" +checksum = "00421ed8fa0c995f07cde48ba6c89e80f2b312f74ff637326f392fbfd23abe02" dependencies = [ "httpdate", - "reqwest", + "reqwest 0.12.4", "rustls 0.21.11", "sentry-backtrace", "sentry-contexts", @@ -5319,9 +5392,9 @@ dependencies = [ [[package]] name = "sentry-backtrace" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ac2bac6f310c4c4c4bb094d1541d32ae497f8c5c23405e85492cefdfe0971a9" +checksum = "a79194074f34b0cbe5dd33896e5928bbc6ab63a889bd9df2264af5acb186921e" dependencies = [ "backtrace", "once_cell", @@ -5331,11 +5404,11 @@ dependencies = [ [[package]] name = "sentry-contexts" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c3e17295cecdbacf66c5bd38d6e1147e09e1e9d824d2d5341f76638eda02a3a" +checksum = "eba8870c5dba2bfd9db25c75574a11429f6b95957b0a78ac02e2970dd7a5249a" dependencies = [ - "hostname", + "hostname 0.4.0", "libc", "os_info", "rustc_version", @@ -5345,9 +5418,9 @@ dependencies = [ [[package]] name = "sentry-core" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8339474f587f36cb110fa1ed1b64229eea6d47b0b886375579297b7e47aeb055" +checksum = "46a75011ea1c0d5c46e9e57df03ce81f5c7f0a9e199086334a1f9c0a541e0826" dependencies = [ "once_cell", "rand 0.8.5", @@ -5358,9 +5431,9 @@ dependencies = [ [[package]] name = "sentry-panic" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "875b69f506da75bd664029eafb05f8934297d2990192896d17325f066bd665b7" +checksum = "2eaa3ecfa3c8750c78dcfd4637cfa2598b95b52897ed184b4dc77fcf7d95060d" dependencies = [ "sentry-backtrace", "sentry-core", @@ -5368,9 +5441,9 @@ dependencies = [ [[package]] name = "sentry-tracing" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89feead9bdd116f8035e89567651340fc382db29240b6c55ef412078b08d1aa3" +checksum = "f715932bf369a61b7256687c6f0554141b7ce097287e30e3f7ed6e9de82498fe" dependencies = [ "sentry-backtrace", "sentry-core", @@ -5380,13 +5453,13 @@ dependencies = [ [[package]] name = "sentry-types" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99dc599bd6646884fc403d593cdcb9816dd67c50cff3271c01ff123617908dcd" +checksum = "4519c900ce734f7a0eb7aba0869dfb225a7af8820634a7dd51449e3b093cfb7c" dependencies = [ "debugid", - "getrandom 0.2.11", "hex", + "rand 0.8.5", "serde", "serde_json", "thiserror", @@ -5778,7 +5851,7 @@ dependencies = [ "pageserver_client", "postgres_connection", "r2d2", - "reqwest", + "reqwest 0.12.4", "routerify", "serde", "serde_json", @@ -5800,7 +5873,7 @@ dependencies = [ "hyper 0.14.26", "pageserver_api", "pageserver_client", - "reqwest", + "reqwest 0.12.4", "serde", "serde_json", "thiserror", @@ -6500,12 +6573,14 @@ dependencies = [ [[package]] name = "tracing-opentelemetry" -version = "0.20.0" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum 
= "fc09e402904a5261e42cf27aea09ccb7d5318c6717a9eec3d8e2e65c56b18f19" +checksum = "75327c6b667828ddc28f5e3f169036cb793c3f588d83bf0f262a7f062ffed3c8" dependencies = [ "once_cell", "opentelemetry", + "opentelemetry_sdk", + "smallvec", "tracing", "tracing-core", "tracing-log", @@ -6551,7 +6626,7 @@ dependencies = [ "opentelemetry", "opentelemetry-otlp", "opentelemetry-semantic-conventions", - "reqwest", + "reqwest 0.12.4", "tokio", "tracing", "tracing-opentelemetry", @@ -6637,15 +6712,6 @@ dependencies = [ "libc", ] -[[package]] -name = "unicase" -version = "2.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6" -dependencies = [ - "version_check", -] - [[package]] name = "unicode-bidi" version = "0.3.13" @@ -7004,6 +7070,19 @@ dependencies = [ "web-sys", ] +[[package]] +name = "wasm-streams" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b65dc4c90b63b118468cf747d8bf3566c1913ef60be765b5730ead9e0a3ba129" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + [[package]] name = "wasm-timer" version = "0.2.5" @@ -7044,6 +7123,15 @@ version = "0.25.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "14247bb57be4f377dfb94c72830b8ce8fc6beac03cf4bf7b9732eadd414123fc" +[[package]] +name = "webpki-roots" +version = "0.26.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3de34ae270483955a94f4b21bdaaeb83d508bb84a01435f393818edb0012009" +dependencies = [ + "rustls-pki-types", +] + [[package]] name = "which" version = "4.4.0" @@ -7095,6 +7183,25 @@ dependencies = [ "windows-targets 0.48.0", ] +[[package]] +name = "windows" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e48a53791691ab099e5e2ad123536d0fff50652600abaf43bbf952894110d0be" +dependencies = [ + "windows-core", + "windows-targets 0.52.4", +] + +[[package]] +name = "windows-core" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" +dependencies = [ + "windows-targets 0.52.4", +] + [[package]] name = "windows-sys" version = "0.42.0" @@ -7327,6 +7434,16 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "winreg" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a277a57398d4bfa075df44f501a17cfdf8542d224f0d36095a2adc7aee4ef0a5" +dependencies = [ + "cfg-if", + "windows-sys 0.48.0", +] + [[package]] name = "workspace_hack" version = "0.1.0" @@ -7376,7 +7493,8 @@ dependencies = [ "regex", "regex-automata 0.4.3", "regex-syntax 0.8.2", - "reqwest", + "reqwest 0.11.19", + "reqwest 0.12.4", "rustls 0.21.11", "scopeguard", "serde", diff --git a/Cargo.toml b/Cargo.toml index 32a0bc23e6..a6d406dc2f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -130,10 +130,10 @@ prost = "0.11" rand = "0.8" redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] } regex = "1.10.2" -reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] } -reqwest-tracing = { version = "0.4.7", features = ["opentelemetry_0_20"] } -reqwest-middleware = "0.2.0" -reqwest-retry = "0.2.2" +reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] } +reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_20"] } +reqwest-middleware = "0.3.0" 
+reqwest-retry = "0.5" routerify = "3" rpds = "0.13" rustc-hash = "1.1.0" @@ -143,7 +143,7 @@ rustls-split = "0.3" scopeguard = "1.1" sysinfo = "0.29.2" sd-notify = "0.4.1" -sentry = { version = "0.31", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] } +sentry = { version = "0.32", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] } serde = { version = "1.0", features = ["derive"] } serde_json = "1" serde_path_to_error = "0.1" @@ -177,9 +177,10 @@ tokio-util = { version = "0.7.10", features = ["io", "rt"] } toml = "0.7" toml_edit = "0.19" tonic = {version = "0.9", features = ["tls", "tls-roots"]} +tower-service = "0.3.2" tracing = "0.1" tracing-error = "0.2.0" -tracing-opentelemetry = "0.20.0" +tracing-opentelemetry = "0.21.0" tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] } twox-hash = { version = "1.6.3", default-features = false } url = "2.2" diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index b919b14758..f1c43f4036 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -3,7 +3,6 @@ use crate::{ local_env::{LocalEnv, NeonStorageControllerConf}, }; use camino::{Utf8Path, Utf8PathBuf}; -use hyper::Method; use pageserver_api::{ controller_api::{ NodeConfigureRequest, NodeRegisterRequest, TenantCreateResponse, TenantLocateResponse, @@ -17,6 +16,7 @@ use pageserver_api::{ }; use pageserver_client::mgmt_api::ResponseErrorMessageExt; use postgres_backend::AuthType; +use reqwest::Method; use serde::{de::DeserializeOwned, Deserialize, Serialize}; use std::{fs, str::FromStr}; use tokio::process::Command; @@ -379,7 +379,7 @@ impl StorageController { /// Simple HTTP request wrapper for calling into storage controller async fn dispatch( &self, - method: hyper::Method, + method: reqwest::Method, path: String, body: Option, ) -> anyhow::Result diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index b3d1f0be05..c19bc96cdb 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -1,7 +1,6 @@ use std::{collections::HashMap, str::FromStr, time::Duration}; use clap::{Parser, Subcommand}; -use hyper::{Method, StatusCode}; use pageserver_api::{ controller_api::{ NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy, @@ -14,7 +13,7 @@ use pageserver_api::{ shard::{ShardStripeSize, TenantShardId}, }; use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt}; -use reqwest::Url; +use reqwest::{Method, StatusCode, Url}; use serde::{de::DeserializeOwned, Serialize}; use utils::id::{NodeId, TenantId}; @@ -232,7 +231,7 @@ impl Client { /// Simple HTTP request wrapper for calling into storage controller async fn dispatch( &self, - method: hyper::Method, + method: Method, path: String, body: Option, ) -> mgmt_api::Result diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs index f0ed46ce23..db0032891e 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/control_plane_client.rs @@ -65,7 +65,7 @@ impl ControlPlaneClient { let mut client = reqwest::ClientBuilder::new(); if let Some(jwt) = &conf.control_plane_api_token { - let mut headers = hyper::HeaderMap::new(); + let mut headers = reqwest::header::HeaderMap::new(); headers.insert( "Authorization", format!("Bearer {}", 
jwt.get_contents()).parse().unwrap(), diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 6b8f2ecbf4..0e8d03906b 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -59,8 +59,8 @@ prometheus.workspace = true rand.workspace = true regex.workspace = true remote_storage = { version = "0.1", path = "../libs/remote_storage/" } -reqwest = { workspace = true, features = ["json"] } -reqwest-middleware.workspace = true +reqwest.workspace = true +reqwest-middleware = { workspace = true, features = ["json"] } reqwest-retry.workspace = true reqwest-tracing.workspace = true routerify.workspace = true @@ -84,6 +84,7 @@ tokio-postgres.workspace = true tokio-rustls.workspace = true tokio-util.workspace = true tokio = { workspace = true, features = ["signal"] } +tower-service.workspace = true tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true tracing-utils.workspace = true diff --git a/proxy/src/http.rs b/proxy/src/http.rs index e20488e23c..fc7400869f 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -4,7 +4,7 @@ pub mod health_server; -use std::{sync::Arc, time::Duration}; +use std::{str::FromStr, sync::Arc, time::Duration}; use futures::FutureExt; pub use reqwest::{Request, Response, StatusCode}; @@ -103,12 +103,12 @@ impl Endpoint { } } -/// https://docs.rs/reqwest/0.11.18/src/reqwest/dns/gai.rs.html -use hyper::{ - client::connect::dns::{GaiResolver as HyperGaiResolver, Name}, - service::Service, +use hyper_util::client::legacy::connect::dns::{ + GaiResolver as HyperGaiResolver, Name as HyperName, }; -use reqwest::dns::{Addrs, Resolve, Resolving}; +use reqwest::dns::{Addrs, Name, Resolve, Resolving}; +/// https://docs.rs/reqwest/0.11.18/src/reqwest/dns/gai.rs.html +use tower_service::Service; #[derive(Debug)] pub struct GaiResolver(HyperGaiResolver); @@ -121,11 +121,12 @@ impl Default for GaiResolver { impl Resolve for GaiResolver { fn resolve(&self, name: Name) -> Resolving { let this = &mut self.0.clone(); + let hyper_name = HyperName::from_str(name.as_str()).expect("name should be valid"); let start = Instant::now(); Box::pin( - Service::::call(this, name.clone()).map(move |result| { + Service::::call(this, hyper_name).map(move |result| { let resolve_duration = start.elapsed(); - trace!(duration = ?resolve_duration, addr = %name, "resolve host complete"); + trace!(duration = ?resolve_duration, addr = %name.as_str(), "resolve host complete"); result .map(|addrs| -> Addrs { Box::new(addrs) }) .map_err(|err| -> Box { Box::new(err) }) diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index cb9a293413..3d9e94dd72 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -6,7 +6,7 @@ use crate::metrics::{ WakeupFailureKind, }; use crate::proxy::retry::retry_after; -use hyper::StatusCode; +use hyper1::StatusCode; use std::ops::ControlFlow; use tracing::{error, info, warn}; diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index 165cafaf4e..789420f2b0 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -31,7 +31,7 @@ once_cell.workspace = true pageserver_api.workspace = true pageserver_client.workspace = true postgres_connection.workspace = true -reqwest.workspace = true +reqwest = { workspace = true, features = ["stream"] } routerify.workspace = true serde.workspace = true serde_json.workspace = true diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index 44a156a5ec..9d326ef82d 100644 --- 
a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -4,7 +4,7 @@ use std::{collections::HashMap, time::Duration}; use control_plane::endpoint::{ComputeControlPlane, EndpointStatus}; use control_plane::local_env::LocalEnv; use futures::StreamExt; -use hyper::{Method, StatusCode}; +use hyper::StatusCode; use pageserver_api::shard::{ShardCount, ShardNumber, ShardStripeSize, TenantShardId}; use postgres_connection::parse_host_port; use serde::{Deserialize, Serialize}; @@ -328,7 +328,7 @@ impl ComputeHook { reconfigure_request: &ComputeHookNotifyRequest, cancel: &CancellationToken, ) -> Result<(), NotifyError> { - let req = self.client.request(Method::PUT, url); + let req = self.client.request(reqwest::Method::PUT, url); let req = if let Some(value) = &self.authorization_header { req.header(reqwest::header::AUTHORIZATION, value) } else { @@ -347,8 +347,10 @@ impl ComputeHook { }; // Treat all 2xx responses as success - if response.status() >= StatusCode::OK && response.status() < StatusCode::MULTIPLE_CHOICES { - if response.status() != StatusCode::OK { + if response.status() >= reqwest::StatusCode::OK + && response.status() < reqwest::StatusCode::MULTIPLE_CHOICES + { + if response.status() != reqwest::StatusCode::OK { // Non-200 2xx response: it doesn't make sense to retry, but this is unexpected, so // log a warning. tracing::warn!( @@ -362,7 +364,7 @@ impl ComputeHook { // Error response codes match response.status() { - StatusCode::TOO_MANY_REQUESTS => { + reqwest::StatusCode::TOO_MANY_REQUESTS => { // TODO: 429 handling should be global: set some state visible to other requests // so that they will delay before starting, rather than all notifications trying // once before backing off. @@ -371,20 +373,30 @@ impl ComputeHook { .ok(); Err(NotifyError::SlowDown) } - StatusCode::LOCKED => { + reqwest::StatusCode::LOCKED => { // We consider this fatal, because it's possible that the operation blocking the control one is // also the one that is waiting for this reconcile. We should let the reconciler calling // this hook fail, to give control plane a chance to un-lock. 
tracing::info!("Control plane reports tenant is locked, dropping out of notify"); Err(NotifyError::Busy) } - StatusCode::SERVICE_UNAVAILABLE - | StatusCode::GATEWAY_TIMEOUT - | StatusCode::BAD_GATEWAY => Err(NotifyError::Unavailable(response.status())), - StatusCode::BAD_REQUEST | StatusCode::UNAUTHORIZED | StatusCode::FORBIDDEN => { - Err(NotifyError::Fatal(response.status())) + reqwest::StatusCode::SERVICE_UNAVAILABLE => { + Err(NotifyError::Unavailable(StatusCode::SERVICE_UNAVAILABLE)) } - _ => Err(NotifyError::Unexpected(response.status())), + reqwest::StatusCode::GATEWAY_TIMEOUT => { + Err(NotifyError::Unavailable(StatusCode::GATEWAY_TIMEOUT)) + } + reqwest::StatusCode::BAD_GATEWAY => { + Err(NotifyError::Unavailable(StatusCode::BAD_GATEWAY)) + } + + reqwest::StatusCode::BAD_REQUEST => Err(NotifyError::Fatal(StatusCode::BAD_REQUEST)), + reqwest::StatusCode::UNAUTHORIZED => Err(NotifyError::Fatal(StatusCode::UNAUTHORIZED)), + reqwest::StatusCode::FORBIDDEN => Err(NotifyError::Fatal(StatusCode::FORBIDDEN)), + status => Err(NotifyError::Unexpected( + hyper::StatusCode::from_u16(status.as_u16()) + .unwrap_or(StatusCode::INTERNAL_SERVER_ERROR), + )), } } diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 09a25a5be0..f9a79afb0d 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -4,6 +4,7 @@ use crate::metrics::{ }; use crate::reconciler::ReconcileError; use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT}; +use anyhow::Context; use futures::Future; use hyper::header::CONTENT_TYPE; use hyper::{Body, Request, Response}; @@ -258,6 +259,12 @@ async fn handle_tenant_time_travel_remote_storage( json_response(StatusCode::OK, ()) } +fn map_reqwest_hyper_status(status: reqwest::StatusCode) -> Result { + hyper::StatusCode::from_u16(status.as_u16()) + .context("invalid status code") + .map_err(ApiError::InternalServerError) +} + async fn handle_tenant_secondary_download( service: Arc, req: Request, @@ -266,7 +273,7 @@ async fn handle_tenant_secondary_download( let wait = parse_query_param(&req, "wait_ms")?.map(Duration::from_millis); let (status, progress) = service.tenant_secondary_download(tenant_id, wait).await?; - json_response(status, progress) + json_response(map_reqwest_hyper_status(status)?, progress) } async fn handle_tenant_delete( @@ -277,7 +284,10 @@ async fn handle_tenant_delete( check_permissions(&req, Scope::PageServerApi)?; deletion_wrapper(service, move |service| async move { - service.tenant_delete(tenant_id).await + service + .tenant_delete(tenant_id) + .await + .and_then(map_reqwest_hyper_status) }) .await } @@ -308,7 +318,10 @@ async fn handle_tenant_timeline_delete( let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; deletion_wrapper(service, move |service| async move { - service.tenant_timeline_delete(tenant_id, timeline_id).await + service + .tenant_timeline_delete(tenant_id, timeline_id) + .await + .and_then(map_reqwest_hyper_status) }) .await } @@ -371,11 +384,9 @@ async fn handle_tenant_timeline_passthrough( } // We have a reqest::Response, would like a http::Response - let mut builder = hyper::Response::builder() - .status(resp.status()) - .version(resp.version()); + let mut builder = hyper::Response::builder().status(map_reqwest_hyper_status(resp.status())?); for (k, v) in resp.headers() { - builder = builder.header(k, v); + builder = builder.header(k.as_str(), v.as_bytes()); } let response = builder diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index 
7ba6828deb..7b5513c908 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -1,6 +1,5 @@ use std::{str::FromStr, time::Duration}; -use hyper::StatusCode; use pageserver_api::{ controller_api::{ NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy, @@ -9,6 +8,7 @@ use pageserver_api::{ shard::TenantShardId, }; use pageserver_client::mgmt_api; +use reqwest::StatusCode; use serde::Serialize; use tokio_util::sync::CancellationToken; use utils::{backoff, id::NodeId}; diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index f38905b424..fe97f724c1 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -1,12 +1,12 @@ use crate::pageserver_client::PageserverClient; use crate::persistence::Persistence; use crate::service; -use hyper::StatusCode; use pageserver_api::models::{ LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, }; use pageserver_api::shard::{ShardIdentity, TenantShardId}; use pageserver_client::mgmt_api; +use reqwest::StatusCode; use std::collections::HashMap; use std::sync::Arc; use std::time::{Duration, Instant}; diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 186a820adf..f26122e646 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -21,7 +21,6 @@ use control_plane::storage_controller::{ }; use diesel::result::DatabaseErrorKind; use futures::{stream::FuturesUnordered, StreamExt}; -use hyper::StatusCode; use itertools::Itertools; use pageserver_api::{ controller_api::{ @@ -33,6 +32,7 @@ use pageserver_api::{ }, models::{SecondaryProgress, TenantConfigRequest}, }; +use reqwest::StatusCode; use crate::pageserver_client::PageserverClient; use pageserver_api::{ diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index d6e2cc2996..a225984688 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -57,7 +57,8 @@ rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } regex-syntax = { version = "0.8" } -reqwest = { version = "0.11", default-features = false, features = ["blocking", "default-tls", "json", "multipart", "rustls-tls", "stream"] } +reqwest-5ef9efb8ec2df382 = { package = "reqwest", version = "0.12", default-features = false, features = ["blocking", "json", "rustls-tls", "stream"] } +reqwest-a6292c17cd707f01 = { package = "reqwest", version = "0.11", default-features = false, features = ["blocking", "default-tls", "stream"] } rustls = { version = "0.21", features = ["dangerous_configuration"] } scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } From 25af32e8345d04db3ea26617771caae54be767da Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Thu, 2 May 2024 11:50:11 +0200 Subject: [PATCH 0669/1571] proxy: keep track of the number of events from redis by type. (#7582) ## Problem It is unclear how the messages that the proxy consumes from Redis are distributed across event types. ## Summary of changes Add a counter per event type.
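For illustration, here is a minimal, self-contained sketch of the per-event-type counter pattern this change applies. The proxy's real implementation goes through its metrics framework (`Metrics::get().proxy.redis_events_count.inc(RedisEventsCount::...)` on a labeled counter vector); the type and function names below are illustrative assumptions, not the proxy's actual API.

```rust
// Sketch only: one atomic counter per fixed-cardinality event type.
// The real code registers a labeled CounterVec in the proxy metrics
// registry instead of hand-rolling atomics like this.
use std::sync::atomic::{AtomicU64, Ordering};

#[derive(Clone, Copy)]
enum RedisEvent {
    EndpointCreated,
    BranchCreated,
    ProjectCreated,
    CancelSession,
    PasswordUpdate,
    AllowedIpsUpdate,
}

const NUM_EVENTS: usize = 6;

#[derive(Default)]
struct RedisEventCounters {
    // Index by the enum discriminant; cheap and lock-free to increment.
    counts: [AtomicU64; NUM_EVENTS],
}

impl RedisEventCounters {
    fn inc(&self, event: RedisEvent) {
        self.counts[event as usize].fetch_add(1, Ordering::Relaxed);
    }

    fn get(&self, event: RedisEvent) -> u64 {
        self.counts[event as usize].load(Ordering::Relaxed)
    }
}

fn main() {
    let metrics = RedisEventCounters::default();
    metrics.inc(RedisEvent::CancelSession);
    metrics.inc(RedisEvent::AllowedIpsUpdate);
    assert_eq!(metrics.get(RedisEvent::CancelSession), 1);
}
```

Because the label set is a small, fixed enum, each increment is a single indexed counter bump, and the metrics endpoint can expose one time series per event type.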
--- proxy/src/cache/endpoints.rs | 14 +++++++++++++- proxy/src/metrics.rs | 14 ++++++++++++++ proxy/src/redis/notifications.rs | 17 ++++++++++++++++- 3 files changed, 43 insertions(+), 2 deletions(-) diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index 02511e6ff7..4bc10a6020 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -21,7 +21,7 @@ use crate::{ config::EndpointCacheConfig, context::RequestMonitoring, intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}, - metrics::{Metrics, RedisErrors}, + metrics::{Metrics, RedisErrors, RedisEventsCount}, rate_limiter::GlobalRateLimiter, redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider, EndpointId, @@ -100,14 +100,26 @@ impl EndpointsCache { if let Some(endpoint_created) = key.endpoint_created { self.endpoints .insert(EndpointIdInt::from(&endpoint_created.endpoint_id.into())); + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::EndpointCreated); } if let Some(branch_created) = key.branch_created { self.branches .insert(BranchIdInt::from(&branch_created.branch_id.into())); + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::BranchCreated); } if let Some(project_created) = key.project_created { self.projects .insert(ProjectIdInt::from(&project_created.project_id.into())); + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::ProjectCreated); } } pub async fn do_read( diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index c129ece059..4a54857012 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -123,6 +123,9 @@ pub struct ProxyMetrics { /// Number of retries (per outcome, per retry_type). #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]))] pub retries_metric: HistogramVec, + + /// Number of events consumed from redis (per event type). + pub redis_events_count: CounterVec>, } #[derive(MetricGroup)] @@ -530,3 +533,14 @@ pub enum RetryType { WakeCompute, ConnectToCompute, } + +#[derive(FixedCardinalityLabel, Clone, Copy, Debug)] +#[label(singleton = "event")] +pub enum RedisEventsCount { + EndpointCreated, + BranchCreated, + ProjectCreated, + CancelSession, + PasswordUpdate, + AllowedIpsUpdate, +} diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 5a38530faf..ba4dfb755e 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -11,7 +11,7 @@ use crate::{ cache::project_info::ProjectInfoCache, cancellation::{CancelMap, CancellationHandler}, intern::{ProjectIdInt, RoleNameInt}, - metrics::{Metrics, RedisErrors}, + metrics::{Metrics, RedisErrors, RedisEventsCount}, }; const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; @@ -118,6 +118,10 @@ impl MessageHandler { "session_id", &tracing::field::display(cancel_session.session_id), ); + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::CancelSession); if let Some(cancel_region) = cancel_session.region_id { // If the message is not for this region, ignore it. if cancel_region != self.region_id { @@ -138,6 +142,17 @@ impl MessageHandler { } _ => { invalidate_cache(self.cache.clone(), msg.clone()); + if matches!(msg, AllowedIpsUpdate { .. }) { + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::AllowedIpsUpdate); + } else if matches!(msg, PasswordUpdate { .. 
}) { + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::PasswordUpdate); + } // It might happen that the invalid entry is on the way to be cached. // To make sure that the entry is invalidated, let's repeat the invalidation in INVALIDATION_LAG seconds. // TODO: include the version (or the timestamp) in the message and invalidate only if the entry is cached before the message. From 69bf1bae7def8a3f86572f5dd34ab4069614b87b Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 2 May 2024 12:52:30 +0100 Subject: [PATCH 0670/1571] Fix usage of pg_waldump --ignore option (#7578) Previously, the --ignore option was only used when reading from a single file. With this PR pg_waldump -i is enough to open any neon WAL segments --- test_runner/regress/test_pg_waldump.py | 46 ++++++++++++++++++++++++++ vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 6 ++-- 5 files changed, 52 insertions(+), 6 deletions(-) create mode 100644 test_runner/regress/test_pg_waldump.py diff --git a/test_runner/regress/test_pg_waldump.py b/test_runner/regress/test_pg_waldump.py new file mode 100644 index 0000000000..1973aa5952 --- /dev/null +++ b/test_runner/regress/test_pg_waldump.py @@ -0,0 +1,46 @@ +import os + +from fixtures.neon_fixtures import NeonEnv, PgBin +from fixtures.utils import subprocess_capture + + +# Simple test to check that pg_waldump works with neon WAL files +def test_pg_waldump(neon_simple_env: NeonEnv, test_output_dir, pg_bin: PgBin): + env = neon_simple_env + env.neon_cli.create_branch("test_pg_waldump", "empty") + endpoint = env.endpoints.create_start("test_pg_waldump") + + cur = endpoint.connect().cursor() + cur.execute( + """ + BEGIN; + CREATE TABLE t1(i int primary key, n_updated int); + INSERT INTO t1 select g, 0 from generate_series(1, 50) g; + ROLLBACK; + """ + ) + + cur.execute( + """ + BEGIN; + CREATE TABLE t1(i int primary key, n_updated int); + INSERT INTO t1 select g, 0 from generate_series(1, 50) g; + COMMIT; + """ + ) + + # stop the endpoint to make sure that WAL files are flushed and won't change + endpoint.stop() + + assert endpoint.pgdata_dir + wal_path = os.path.join(endpoint.pgdata_dir, "pg_wal/000000010000000000000001") + pg_waldump_path = os.path.join(pg_bin.pg_bin_path, "pg_waldump") + + # use special --ignore option to ignore the validation checks in pg_waldump + # this is necessary, because neon WAL files contain gap at the beginning + output_path, _, _ = subprocess_capture(test_output_dir, [pg_waldump_path, "--ignore", wal_path]) + + with open(f"{output_path}.stdout", "r") as f: + stdout = f.read() + assert "ABORT" in stdout + assert "COMMIT" in stdout diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index d9149dc59a..d6f7e2c604 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit d9149dc59abcbeeb26293707509aef51752db28f +Subproject commit d6f7e2c604bfc7cbc4c46bcea0a8e800f4bc778a diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 85d809c124..f0d6b0ef75 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 85d809c124a898847a97d66a211f7d5ef4f8e0cb +Subproject commit f0d6b0ef7581bd78011832e23d8420a7d2c8a83a diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 261497dd63..8ef3c33aa0 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 261497dd63ace434045058b1453bcbaaa83f23e5 +Subproject commit 8ef3c33aa01631e17cb24a122776349fcc777b46 diff --git 
a/vendor/revisions.json b/vendor/revisions.json index dfc0aa04c3..a353fde8fd 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "postgres-v16": "261497dd63ace434045058b1453bcbaaa83f23e5", - "postgres-v15": "85d809c124a898847a97d66a211f7d5ef4f8e0cb", - "postgres-v14": "d9149dc59abcbeeb26293707509aef51752db28f" + "postgres-v16": "8ef3c33aa01631e17cb24a122776349fcc777b46", + "postgres-v15": "f0d6b0ef7581bd78011832e23d8420a7d2c8a83a", + "postgres-v14": "d6f7e2c604bfc7cbc4c46bcea0a8e800f4bc778a" } From f656db09a4c0bc65fc249fd63c2d5c276f1860fa Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Thu, 2 May 2024 09:19:45 -0400 Subject: [PATCH 0671/1571] fix(pageserver): properly propagate missing key error for vectored get (#7569) Some part of the code requires missing key error to be propagated to the code path correctly (i.e., aux key range scan). Currently, it's an anyhow error. * remove `stuck_lsn` from the missing key error. * as a result, when matching missing key, we do not distinguish the case `stuck_lsn = false/true`. * vectored get now use the unified missing key error. --------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline.rs | 55 +++++++++++-------------- test_runner/regress/test_lsn_mapping.py | 5 +-- 2 files changed, 25 insertions(+), 35 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index cda873d649..3c0a300a9a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -464,7 +464,6 @@ pub(crate) enum PageReconstructError { #[derive(Debug)] pub struct MissingKeyError { - stuck_at_lsn: bool, key: Key, shard: ShardNumber, cont_lsn: Lsn, @@ -476,23 +475,13 @@ pub struct MissingKeyError { impl std::fmt::Display for MissingKeyError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - if self.stuck_at_lsn { - // Records are found in this timeline but no image layer or initial delta record was found. - write!( - f, - "could not find layer with more data for key {} (shard {:?}) at LSN {}, request LSN {}", - self.key, self.shard, self.cont_lsn, self.request_lsn - )?; - if let Some(ref ancestor_lsn) = self.ancestor_lsn { - write!(f, ", ancestor {}", ancestor_lsn)?; - } - } else { - // No records in this timeline. 
- write!( - f, - "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}", - self.key, self.shard, self.cont_lsn, self.request_lsn - )?; + write!( + f, + "could not find data for key {} (shard {:?}) at LSN {}, request LSN {}", + self.key, self.shard, self.cont_lsn, self.request_lsn + )?; + if let Some(ref ancestor_lsn) = self.ancestor_lsn { + write!(f, ", ancestor {}", ancestor_lsn)?; } if !self.traversal_path.is_empty() { @@ -568,8 +557,8 @@ pub(crate) enum GetVectoredError { #[error("Requested at invalid LSN: {0}")] InvalidLsn(Lsn), - #[error("Requested key {0} not found")] - MissingKey(Key), + #[error("Requested key not found: {0}")] + MissingKey(MissingKeyError), #[error(transparent)] GetReadyAncestorError(GetReadyAncestorError), @@ -678,7 +667,7 @@ impl From for PageReconstructError { GetVectoredError::Cancelled => PageReconstructError::Cancelled, GetVectoredError::InvalidLsn(_) => PageReconstructError::Other(anyhow!("Invalid LSN")), err @ GetVectoredError::Oversized(_) => PageReconstructError::Other(err.into()), - err @ GetVectoredError::MissingKey(_) => PageReconstructError::Other(err.into()), + GetVectoredError::MissingKey(err) => PageReconstructError::MissingKey(err), GetVectoredError::GetReadyAncestorError(err) => PageReconstructError::from(err), GetVectoredError::Other(err) => PageReconstructError::Other(err), } @@ -1050,15 +1039,12 @@ impl Timeline { return Err(GetVectoredError::Cancelled) } // we only capture stuck_at_lsn=false now until we figure out https://github.com/neondatabase/neon/issues/7380 - Err(MissingKey(MissingKeyError { - stuck_at_lsn: false, - .. - })) if !NON_INHERITED_RANGE.contains(&key) => { + Err(MissingKey(err)) if !NON_INHERITED_RANGE.contains(&key) => { // The vectored read path handles non inherited keys specially. // If such a a key cannot be reconstructed from the current timeline, // the vectored read path returns a key level error as opposed to a top // level error. - return Err(GetVectoredError::MissingKey(key)); + return Err(GetVectoredError::MissingKey(err)); } Err(Other(err)) if err @@ -1154,7 +1140,7 @@ impl Timeline { match (lhs, rhs) { (Oversized(l), Oversized(r)) => l == r, (InvalidLsn(l), InvalidLsn(r)) => l == r, - (MissingKey(l), MissingKey(r)) => l == r, + (MissingKey(l), MissingKey(r)) => l.key == r.key, (GetReadyAncestorError(_), GetReadyAncestorError(_)) => true, (Other(_), Other(_)) => true, _ => false, @@ -3024,7 +3010,6 @@ impl Timeline { // Didn't make any progress in last iteration. Error out to avoid // getting stuck in the loop. 
return Err(PageReconstructError::MissingKey(MissingKeyError { - stuck_at_lsn: true, key, shard: self.shard_identity.get_shard_number(&key), cont_lsn: Lsn(cont_lsn.0 - 1), @@ -3039,7 +3024,6 @@ impl Timeline { } ValueReconstructResult::Missing => { return Err(PageReconstructError::MissingKey(MissingKeyError { - stuck_at_lsn: false, key, shard: self.shard_identity.get_shard_number(&key), cont_lsn, @@ -3215,7 +3199,6 @@ impl Timeline { reconstruct_state.on_key_error( key, PageReconstructError::MissingKey(MissingKeyError { - stuck_at_lsn: false, key, shard: self.shard_identity.get_shard_number(&key), cont_lsn, @@ -3248,7 +3231,17 @@ impl Timeline { } if keyspace.total_raw_size() != 0 { - return Err(GetVectoredError::MissingKey(keyspace.start().unwrap())); + return Err(GetVectoredError::MissingKey(MissingKeyError { + key: keyspace.start().unwrap(), /* better if we can store the full keyspace */ + shard: self + .shard_identity + .get_shard_number(&keyspace.start().unwrap()), + cont_lsn, + request_lsn, + ancestor_lsn: Some(timeline.ancestor_lsn), + traversal_path: vec![], + backtrace: None, + })); } Ok(()) diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index 37676ab0d4..5c99ca6733 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -110,10 +110,7 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): # Test pageserver get_timestamp_of_lsn API def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder): - if neon_env_builder.pageserver_get_impl == "vectored": - key_not_found_error = r".*Requested key.*not found,*" - else: - key_not_found_error = r".*could not find data for key.*" + key_not_found_error = r".*could not find data for key.*" env = neon_env_builder.init_start() From ab95942fc25fa1c6bfd6f3041f16a868e8d86dcf Mon Sep 17 00:00:00 2001 From: Matt Podraza <19386552+mattpodraza@users.noreply.github.com> Date: Thu, 2 May 2024 17:19:51 +0200 Subject: [PATCH 0672/1571] storage controller: make the initial database wait configurable (#7591) This allows passing a humantime string in the CLI to configure the initial wait for the database. It defaults to the previously hard-coded value of 5 seconds. --- storage_controller/src/main.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index d84803733a..f1454af533 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -5,7 +5,6 @@ use diesel::Connection; use metrics::launch_timestamp::LaunchTimestamp; use metrics::BuildInfo; use std::sync::Arc; -use std::time::Duration; use storage_controller::http::make_router; use storage_controller::metrics::preinitialize_metrics; use storage_controller::persistence::Persistence; @@ -70,6 +69,10 @@ struct Cli { /// Maximum number of reconcilers that may run in parallel #[arg(long)] reconciler_concurrency: Option, + + /// How long to wait for the initial database connection to be available. 
+ #[arg(long, default_value = "5s")] + db_connect_timeout: humantime::Duration, } enum StrictMode { @@ -255,7 +258,7 @@ async fn async_main() -> anyhow::Result<()> { }; // After loading secrets & config, but before starting anything else, apply database migrations - Persistence::await_connection(&secrets.database_url, Duration::from_secs(5)).await?; + Persistence::await_connection(&secrets.database_url, args.db_connect_timeout.into()).await?; migration_run(&secrets.database_url) .await From 4b55dad813a2dd23d4e653e656ecdc53068d5ef0 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Thu, 2 May 2024 12:43:36 -0400 Subject: [PATCH 0673/1571] vm-image: add sqlexporter for autoscaling metrics (#7514) As discussed in https://github.com/neondatabase/autoscaling/pull/895, we want to have a separate sql_exporter for simple metrics to avoid overload the database because the autoscaling agent needs to scrape at a higher interval. The new exporter is exposed at port 9499. I didn't do any testing for this pull request but given it's just a configuration change I assume this works. Signed-off-by: Alex Chi Z --- vm-image-spec.yaml | 96 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 95 insertions(+), 1 deletion(-) diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 061ff38722..3ccdf5cc64 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -17,6 +17,10 @@ commands: user: nobody sysvInitAction: respawn shell: '/bin/sql_exporter -config.file=/etc/sql_exporter.yml -web.listen-address=:9399' + - name: sql-exporter-autoscaling + user: nobody + sysvInitAction: respawn + shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499' shutdownHook: | su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10' files: @@ -88,6 +92,41 @@ files: # Glob patterns are supported (see for syntax). collector_files: - "neon_collector.yml" + - filename: sql_exporter_autoscaling.yml + content: | + # Configuration for sql_exporter for autoscaling-agent + # Global defaults. + global: + # If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s. + scrape_timeout: 10s + # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first. + scrape_timeout_offset: 500ms + # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape. + min_interval: 0s + # Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections, + # as will concurrent scrapes. + max_connections: 1 + # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should + # always be the same as max_connections. + max_idle_connections: 1 + # Maximum number of maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse. + # If 0, connections are not closed due to a connection's age. + max_connection_lifetime: 5m + + # The target to monitor and the collectors to execute on it. + target: + # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL) + # the schema gets dropped or replaced to match the driver expected DSN format. + data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable' + + # Collectors (referenced by name) to execute on the target. + # Glob patterns are supported (see for syntax). 
+ collectors: [neon_collector_autoscaling] + + # Collector files specifies a list of globs. One collector definition is read from each matching file. + # Glob patterns are supported (see for syntax). + collector_files: + - "neon_collector_autoscaling.yml" - filename: neon_collector.yml content: | collector_name: neon_collector @@ -194,6 +233,57 @@ files: values: [approximate_working_set_size] query: | select neon.approximate_working_set_size(false) as approximate_working_set_size; + - filename: neon_collector_autoscaling.yml + content: | + collector_name: neon_collector_autoscaling + metrics: + - metric_name: lfc_misses + type: gauge + help: 'lfc_misses' + key_labels: + values: [lfc_misses] + query: | + select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses'; + + - metric_name: lfc_used + type: gauge + help: 'LFC chunks used (chunk = 1MB)' + key_labels: + values: [lfc_used] + query: | + select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used'; + + - metric_name: lfc_hits + type: gauge + help: 'lfc_hits' + key_labels: + values: [lfc_hits] + query: | + select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits'; + + - metric_name: lfc_writes + type: gauge + help: 'lfc_writes' + key_labels: + values: [lfc_writes] + query: | + select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes'; + + - metric_name: lfc_cache_size_limit + type: gauge + help: 'LFC cache size limit in bytes' + key_labels: + values: [lfc_cache_size_limit] + query: | + select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit; + + - metric_name: lfc_approximate_working_set_size + type: gauge + help: 'Approximate working set size in pages of 8192 bytes' + key_labels: + values: [approximate_working_set_size] + query: | + select neon.approximate_working_set_size(false) as approximate_working_set_size; build: | # Build cgroup-tools @@ -267,13 +357,17 @@ merge: | COPY pgbouncer.ini /etc/pgbouncer.ini COPY sql_exporter.yml /etc/sql_exporter.yml COPY neon_collector.yml /etc/neon_collector.yml + COPY sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml + COPY neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml RUN set -e \ && chown postgres:postgres /etc/pgbouncer.ini \ && chmod 0666 /etc/pgbouncer.ini \ && chmod 0644 /etc/cgconfig.conf \ && chmod 0644 /etc/sql_exporter.yml \ - && chmod 0644 /etc/neon_collector.yml + && chmod 0644 /etc/neon_collector.yml \ + && chmod 0644 /etc/sql_exporter_autoscaling.yml \ + && chmod 0644 /etc/neon_collector_autoscaling.yml COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/ COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/ From 45ec8688ea27cbad9789aac934a23069cbe95595 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 2 May 2024 18:58:10 +0200 Subject: [PATCH 0674/1571] chore(pageserver): plumb through RequestContext to VirtualFile write methods (#7566) This PR introduces no functional changes. The read path will be done separately. 
refs https://github.com/neondatabase/neon/issues/6107 refs https://github.com/neondatabase/neon/issues/7386 --- pageserver/src/task_mgr.rs | 2 + pageserver/src/tenant/blob_io.rs | 31 +++++---- pageserver/src/tenant/ephemeral_file.rs | 8 +-- .../src/tenant/ephemeral_file/page_caching.rs | 11 +++- .../ephemeral_file/zero_padded_read_write.rs | 9 ++- .../src/tenant/remote_timeline_client.rs | 3 + .../tenant/remote_timeline_client/download.rs | 10 ++- pageserver/src/tenant/secondary.rs | 8 ++- pageserver/src/tenant/secondary/downloader.rs | 18 +++-- .../src/tenant/storage_layer/delta_layer.rs | 64 +++++++++++++----- .../src/tenant/storage_layer/image_layer.rs | 33 +++++++--- .../tenant/storage_layer/inmemory_layer.rs | 4 +- pageserver/src/tenant/storage_layer/layer.rs | 25 +++++-- pageserver/src/tenant/timeline.rs | 4 +- pageserver/src/tenant/timeline/compaction.rs | 18 +++-- pageserver/src/virtual_file.rs | 32 ++++++--- .../util/size_tracking_writer.rs | 5 +- .../virtual_file/owned_buffers_io/write.rs | 66 ++++++++++++------- 18 files changed, 246 insertions(+), 105 deletions(-) diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index b76105399b..0c245580ee 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -363,6 +363,8 @@ pub enum TaskKind { EphemeralFilePreWarmPageCache, + LayerDownload, + #[cfg(test)] UnitTest, } diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 6e90b3e8ff..1dc451f5c9 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -130,8 +130,9 @@ impl BlobWriter { async fn write_all_unbuffered, Buf: IoBuf + Send>( &mut self, src_buf: B, + ctx: &RequestContext, ) -> (B::Buf, Result<(), Error>) { - let (src_buf, res) = self.inner.write_all(src_buf).await; + let (src_buf, res) = self.inner.write_all(src_buf, ctx).await; let nbytes = match res { Ok(nbytes) => nbytes, Err(e) => return (src_buf, Err(e)), @@ -142,9 +143,9 @@ impl BlobWriter { #[inline(always)] /// Flushes the internal buffer to the underlying `VirtualFile`. 
- pub async fn flush_buffer(&mut self) -> Result<(), Error> { + pub async fn flush_buffer(&mut self, ctx: &RequestContext) -> Result<(), Error> { let buf = std::mem::take(&mut self.buf); - let (mut buf, res) = self.inner.write_all(buf).await; + let (mut buf, res) = self.inner.write_all(buf, ctx).await; res?; buf.clear(); self.buf = buf; @@ -165,10 +166,11 @@ impl BlobWriter { async fn write_all, Buf: IoBuf + Send>( &mut self, src_buf: B, + ctx: &RequestContext, ) -> (B::Buf, Result<(), Error>) { if !BUFFERED { assert!(self.buf.is_empty()); - return self.write_all_unbuffered(src_buf).await; + return self.write_all_unbuffered(src_buf, ctx).await; } let remaining = Self::CAPACITY - self.buf.len(); let src_buf_len = src_buf.bytes_init(); @@ -183,7 +185,7 @@ impl BlobWriter { } // Then, if the buffer is full, flush it out if self.buf.len() == Self::CAPACITY { - if let Err(e) = self.flush_buffer().await { + if let Err(e) = self.flush_buffer(ctx).await { return (Slice::into_inner(src_buf), Err(e)); } } @@ -199,7 +201,7 @@ impl BlobWriter { assert_eq!(copied, src_buf.len()); Slice::into_inner(src_buf) } else { - let (src_buf, res) = self.write_all_unbuffered(src_buf).await; + let (src_buf, res) = self.write_all_unbuffered(src_buf, ctx).await; if let Err(e) = res { return (src_buf, Err(e)); } @@ -216,6 +218,7 @@ impl BlobWriter { pub async fn write_blob, Buf: IoBuf + Send>( &mut self, srcbuf: B, + ctx: &RequestContext, ) -> (B::Buf, Result) { let offset = self.offset; @@ -227,7 +230,7 @@ impl BlobWriter { if len < 128 { // Short blob. Write a 1-byte length header io_buf.put_u8(len as u8); - self.write_all(io_buf).await + self.write_all(io_buf, ctx).await } else { // Write a 4-byte length header if len > 0x7fff_ffff { @@ -242,7 +245,7 @@ impl BlobWriter { let mut len_buf = (len as u32).to_be_bytes(); len_buf[0] |= 0x80; io_buf.extend_from_slice(&len_buf[..]); - self.write_all(io_buf).await + self.write_all(io_buf, ctx).await } } .await; @@ -251,7 +254,7 @@ impl BlobWriter { Ok(_) => (), Err(e) => return (Slice::into_inner(srcbuf.slice(..)), Err(e)), } - let (srcbuf, res) = self.write_all(srcbuf).await; + let (srcbuf, res) = self.write_all(srcbuf, ctx).await; (srcbuf, res.map(|_| offset)) } } @@ -261,8 +264,8 @@ impl BlobWriter { /// /// This function flushes the internal buffer before giving access /// to the underlying `VirtualFile`. 
- pub async fn into_inner(mut self) -> Result { - self.flush_buffer().await?; + pub async fn into_inner(mut self, ctx: &RequestContext) -> Result { + self.flush_buffer(ctx).await?; Ok(self.inner) } @@ -299,16 +302,16 @@ mod tests { let file = VirtualFile::create(pathbuf.as_path()).await?; let mut wtr = BlobWriter::::new(file, 0); for blob in blobs.iter() { - let (_, res) = wtr.write_blob(blob.clone()).await; + let (_, res) = wtr.write_blob(blob.clone(), &ctx).await; let offs = res?; offsets.push(offs); } // Write out one page worth of zeros so that we can // read again with read_blk - let (_, res) = wtr.write_blob(vec![0; PAGE_SZ]).await; + let (_, res) = wtr.write_blob(vec![0; PAGE_SZ], &ctx).await; let offs = res?; println!("Writing final blob at offs={offs}"); - wtr.flush_buffer().await?; + wtr.flush_buffer(&ctx).await?; } let file = VirtualFile::open(pathbuf.as_path()).await?; diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 96efd13c1b..8b815a1885 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -74,7 +74,7 @@ impl EphemeralFile { pub(crate) async fn write_blob( &mut self, srcbuf: &[u8], - _ctx: &RequestContext, + ctx: &RequestContext, ) -> Result { let pos = self.rw.bytes_written(); @@ -83,15 +83,15 @@ impl EphemeralFile { // short one-byte length header let len_buf = [srcbuf.len() as u8]; - self.rw.write_all_borrowed(&len_buf).await?; + self.rw.write_all_borrowed(&len_buf, ctx).await?; } else { let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32); len_buf[0] |= 0x80; - self.rw.write_all_borrowed(&len_buf).await?; + self.rw.write_all_borrowed(&len_buf, ctx).await?; } // Write the payload - self.rw.write_all_borrowed(srcbuf).await?; + self.rw.write_all_borrowed(srcbuf, ctx).await?; Ok(pos) } diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs index 934400e5be..42def8858e 100644 --- a/pageserver/src/tenant/ephemeral_file/page_caching.rs +++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs @@ -35,10 +35,14 @@ impl RW { self.page_cache_file_id } - pub(crate) async fn write_all_borrowed(&mut self, srcbuf: &[u8]) -> Result { + pub(crate) async fn write_all_borrowed( + &mut self, + srcbuf: &[u8], + ctx: &RequestContext, + ) -> Result { // It doesn't make sense to proactively fill the page cache on the Pageserver write path // because Compute is unlikely to access recently written data. - self.rw.write_all_borrowed(srcbuf).await + self.rw.write_all_borrowed(srcbuf, ctx).await } pub(crate) fn bytes_written(&self) -> u64 { @@ -134,6 +138,7 @@ impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmi >( &mut self, buf: B, + ctx: &RequestContext, ) -> std::io::Result<(usize, B::Buf)> { let buf = buf.slice(..); let saved_bounds = buf.bounds(); // save for reconstructing the Slice from iobuf after the IO is done @@ -150,7 +155,7 @@ impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmi ); // Do the IO. 
- let iobuf = match self.file.write_all(buf).await { + let iobuf = match self.file.write_all(buf, ctx).await { (iobuf, Ok(nwritten)) => { assert_eq!(nwritten, buflen); iobuf diff --git a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs index 4159b5820a..b37eafb52c 100644 --- a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs +++ b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs @@ -20,6 +20,7 @@ mod zero_padded; use crate::{ + context::RequestContext, page_cache::PAGE_SZ, virtual_file::owned_buffers_io::{ self, @@ -60,8 +61,12 @@ where self.buffered_writer.as_inner().as_inner() } - pub async fn write_all_borrowed(&mut self, buf: &[u8]) -> std::io::Result { - self.buffered_writer.write_buffered_borrowed(buf).await + pub async fn write_all_borrowed( + &mut self, + buf: &[u8], + ctx: &RequestContext, + ) -> std::io::Result { + self.buffered_writer.write_buffered_borrowed(buf, ctx).await } pub fn bytes_written(&self) -> u64 { diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index c0767345ca..a54e93c96b 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -210,6 +210,7 @@ use tracing::{debug, error, info, instrument, warn}; use tracing::{info_span, Instrument}; use utils::lsn::Lsn; +use crate::context::RequestContext; use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError}; use crate::metrics::{ MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics, @@ -505,6 +506,7 @@ impl RemoteTimelineClient { layer_file_name: &LayerFileName, layer_metadata: &LayerFileMetadata, cancel: &CancellationToken, + ctx: &RequestContext, ) -> anyhow::Result { let downloaded_size = { let _unfinished_gauge_guard = self.metrics.call_begin( @@ -522,6 +524,7 @@ impl RemoteTimelineClient { layer_file_name, layer_metadata, cancel, + ctx, ) .measure_remote_op( RemoteOpFileKind::Layer, diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 250354ac20..345a12aa86 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -18,6 +18,7 @@ use tracing::warn; use utils::backoff; use crate::config::PageServerConf; +use crate::context::RequestContext; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; use crate::tenant::storage_layer::LayerFileName; @@ -40,6 +41,7 @@ use super::{ /// in the metadata. (In the future, we might do more cross-checks, like CRC validation) /// /// Returns the size of the downloaded file. 
+#[allow(clippy::too_many_arguments)] pub async fn download_layer_file<'a>( conf: &'static PageServerConf, storage: &'a GenericRemoteStorage, @@ -48,6 +50,7 @@ pub async fn download_layer_file<'a>( layer_file_name: &'a LayerFileName, layer_metadata: &'a LayerFileMetadata, cancel: &CancellationToken, + ctx: &RequestContext, ) -> Result { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -75,7 +78,7 @@ pub async fn download_layer_file<'a>( let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION); let bytes_amount = download_retry( - || async { download_object(storage, &remote_path, &temp_file_path, cancel).await }, + || async { download_object(storage, &remote_path, &temp_file_path, cancel, ctx).await }, &format!("download {remote_path:?}"), cancel, ) @@ -133,6 +136,7 @@ async fn download_object<'a>( src_path: &RemotePath, dst_path: &Utf8PathBuf, cancel: &CancellationToken, + ctx: &RequestContext, ) -> Result { let res = match crate::virtual_file::io_engine::get() { crate::virtual_file::io_engine::IoEngine::NotSet => panic!("unset"), @@ -208,10 +212,10 @@ async fn download_object<'a>( Err(e) => return Err(e), }; buffered - .write_buffered(tokio_epoll_uring::BoundedBuf::slice_full(chunk)) + .write_buffered(tokio_epoll_uring::BoundedBuf::slice_full(chunk), ctx) .await?; } - let size_tracking = buffered.flush_and_into_inner().await?; + let size_tracking = buffered.flush_and_into_inner(ctx).await?; Ok(size_tracking.into_inner()) } .await?; diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index 19f36c722e..5c46df268a 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -7,6 +7,7 @@ use std::{sync::Arc, time::SystemTime}; use crate::{ config::PageServerConf, + context::RequestContext, disk_usage_eviction_task::DiskUsageEvictionInfo, task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, virtual_file::MaybeFatalIo, @@ -316,9 +317,13 @@ pub fn spawn_tasks( let (upload_req_tx, upload_req_rx) = tokio::sync::mpsc::channel::>(16); + let downloader_task_ctx = RequestContext::new( + TaskKind::SecondaryDownloads, + crate::context::DownloadBehavior::Download, + ); task_mgr::spawn( BACKGROUND_RUNTIME.handle(), - TaskKind::SecondaryDownloads, + downloader_task_ctx.task_kind(), None, None, "secondary tenant downloads", @@ -330,6 +335,7 @@ pub fn spawn_tasks( download_req_rx, bg_jobs_clone, cancel_clone, + downloader_task_ctx, ) .await; diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 67f866cb7b..8a987b5ade 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -8,6 +8,7 @@ use std::{ use crate::{ config::PageServerConf, + context::RequestContext, disk_usage_eviction_task::{ finite_f32, DiskUsageEvictionInfo, EvictionCandidate, EvictionLayer, EvictionSecondaryLayer, }, @@ -74,12 +75,14 @@ pub(super) async fn downloader_task( command_queue: tokio::sync::mpsc::Receiver>, background_jobs_can_start: Barrier, cancel: CancellationToken, + root_ctx: RequestContext, ) { let concurrency = tenant_manager.get_conf().secondary_download_concurrency; let generator = SecondaryDownloader { tenant_manager, remote_storage, + root_ctx, }; let mut scheduler = Scheduler::new(generator, concurrency); @@ -92,6 +95,7 @@ pub(super) async fn downloader_task( struct SecondaryDownloader { tenant_manager: Arc, remote_storage: GenericRemoteStorage, + root_ctx: RequestContext, } #[derive(Debug, Clone)] @@ -367,11 
+371,12 @@ impl JobGenerator { @@ -485,7 +490,7 @@ impl<'a> TenantDownloader<'a> { } } - async fn download(&self) -> Result<(), UpdateError> { + async fn download(&self, ctx: &RequestContext) -> Result<(), UpdateError> { debug_assert_current_span_has_tenant_id(); // For the duration of a download, we must hold the SecondaryTenant::gate, to ensure @@ -560,7 +565,7 @@ impl<'a> TenantDownloader<'a> { } let timeline_id = timeline.timeline_id; - self.download_timeline(timeline) + self.download_timeline(timeline, ctx) .instrument(tracing::info_span!( "secondary_download_timeline", tenant_id=%tenant_shard_id.tenant_id, @@ -742,7 +747,11 @@ impl<'a> TenantDownloader<'a> { .and_then(|x| x) } - async fn download_timeline(&self, timeline: HeatMapTimeline) -> Result<(), UpdateError> { + async fn download_timeline( + &self, + timeline: HeatMapTimeline, + ctx: &RequestContext, + ) -> Result<(), UpdateError> { debug_assert_current_span_has_tenant_and_timeline_id(); let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); let timeline_path = self @@ -875,6 +884,7 @@ impl<'a> TenantDownloader<'a> { &layer.name, &LayerFileMetadata::from(&layer.metadata), &self.secondary_state.cancel, + ctx, ) .await { diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index a9f8404158..b5538dff3a 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -428,9 +428,15 @@ impl DeltaLayerWriterInner { /// /// The values must be appended in key, lsn order. /// - async fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> { + async fn put_value( + &mut self, + key: Key, + lsn: Lsn, + val: Value, + ctx: &RequestContext, + ) -> anyhow::Result<()> { let (_, res) = self - .put_value_bytes(key, lsn, Value::ser(&val)?, val.will_init()) + .put_value_bytes(key, lsn, Value::ser(&val)?, val.will_init(), ctx) .await; res } @@ -441,9 +447,10 @@ impl DeltaLayerWriterInner { lsn: Lsn, val: Vec, will_init: bool, + ctx: &RequestContext, ) -> (Vec, anyhow::Result<()>) { assert!(self.lsn_range.start <= lsn); - let (val, res) = self.blob_writer.write_blob(val).await; + let (val, res) = self.blob_writer.write_blob(val, ctx).await; let off = match res { Ok(off) => off, Err(e) => return (val, Err(anyhow::anyhow!(e))), @@ -463,18 +470,23 @@ impl DeltaLayerWriterInner { /// /// Finish writing the delta layer. 
/// - async fn finish(self, key_end: Key, timeline: &Arc) -> anyhow::Result { + async fn finish( + self, + key_end: Key, + timeline: &Arc, + ctx: &RequestContext, + ) -> anyhow::Result { let index_start_blk = ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; - let mut file = self.blob_writer.into_inner().await?; + let mut file = self.blob_writer.into_inner(ctx).await?; // Write out the index let (index_root_blk, block_buf) = self.tree.finish()?; file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64)) .await?; for buf in block_buf.blocks { - let (_buf, res) = file.write_all(buf).await; + let (_buf, res) = file.write_all(buf, ctx).await; res?; } assert!(self.lsn_range.start < self.lsn_range.end); @@ -494,7 +506,7 @@ impl DeltaLayerWriterInner { // TODO: could use smallvec here but it's a pain with Slice Summary::ser_into(&summary, &mut buf)?; file.seek(SeekFrom::Start(0)).await?; - let (_buf, res) = file.write_all(buf).await; + let (_buf, res) = file.write_all(buf, ctx).await; res?; let metadata = file @@ -592,8 +604,18 @@ impl DeltaLayerWriter { /// /// The values must be appended in key, lsn order. /// - pub async fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> { - self.inner.as_mut().unwrap().put_value(key, lsn, val).await + pub async fn put_value( + &mut self, + key: Key, + lsn: Lsn, + val: Value, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + self.inner + .as_mut() + .unwrap() + .put_value(key, lsn, val, ctx) + .await } pub async fn put_value_bytes( @@ -602,11 +624,12 @@ impl DeltaLayerWriter { lsn: Lsn, val: Vec, will_init: bool, + ctx: &RequestContext, ) -> (Vec, anyhow::Result<()>) { self.inner .as_mut() .unwrap() - .put_value_bytes(key, lsn, val, will_init) + .put_value_bytes(key, lsn, val, will_init, ctx) .await } @@ -621,10 +644,11 @@ impl DeltaLayerWriter { mut self, key_end: Key, timeline: &Arc, + ctx: &RequestContext, ) -> anyhow::Result { let inner = self.inner.take().unwrap(); let temp_path = inner.path.clone(); - let result = inner.finish(key_end, timeline).await; + let result = inner.finish(key_end, timeline, ctx).await; // The delta layer files can sometimes be really large. Clean them up. 
if result.is_err() { tracing::warn!( @@ -692,7 +716,7 @@ impl DeltaLayer { // TODO: could use smallvec here, but it's a pain with Slice Summary::ser_into(&new_summary, &mut buf).context("serialize")?; file.seek(SeekFrom::Start(0)).await?; - let (_buf, res) = file.write_all(buf).await; + let (_buf, res) = file.write_all(buf, ctx).await; res?; Ok(()) } @@ -1281,7 +1305,13 @@ impl DeltaLayerInner { per_blob_copy.extend_from_slice(data); let (tmp, res) = writer - .put_value_bytes(key, lsn, std::mem::take(&mut per_blob_copy), will_init) + .put_value_bytes( + key, + lsn, + std::mem::take(&mut per_blob_copy), + will_init, + ctx, + ) .await; per_blob_copy = tmp; res?; @@ -1760,12 +1790,14 @@ mod test { for entry in entries { let (_, res) = writer - .put_value_bytes(entry.key, entry.lsn, entry.value, false) + .put_value_bytes(entry.key, entry.lsn, entry.value, false, &ctx) .await; res?; } - let resident = writer.finish(entries_meta.key_range.end, &timeline).await?; + let resident = writer + .finish(entries_meta.key_range.end, &timeline, &ctx) + .await?; let inner = resident.as_delta(&ctx).await?; @@ -1951,7 +1983,7 @@ mod test { .await .unwrap(); - let copied_layer = writer.finish(Key::MAX, &branch).await.unwrap(); + let copied_layer = writer.finish(Key::MAX, &branch, ctx).await.unwrap(); copied_layer.as_delta(ctx).await.unwrap(); diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 6f46a0203b..1477a1fc33 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -357,7 +357,7 @@ impl ImageLayer { // TODO: could use smallvec here but it's a pain with Slice Summary::ser_into(&new_summary, &mut buf).context("serialize")?; file.seek(SeekFrom::Start(0)).await?; - let (_buf, res) = file.write_all(buf).await; + let (_buf, res) = file.write_all(buf, ctx).await; res?; Ok(()) } @@ -677,9 +677,14 @@ impl ImageLayerWriterInner { /// /// The page versions must be appended in blknum order. /// - async fn put_image(&mut self, key: Key, img: Bytes) -> anyhow::Result<()> { + async fn put_image( + &mut self, + key: Key, + img: Bytes, + ctx: &RequestContext, + ) -> anyhow::Result<()> { ensure!(self.key_range.contains(&key)); - let (_img, res) = self.blob_writer.write_blob(img).await; + let (_img, res) = self.blob_writer.write_blob(img, ctx).await; // TODO: re-use the buffer for `img` further upstack let off = res?; @@ -693,7 +698,11 @@ impl ImageLayerWriterInner { /// /// Finish writing the image layer. /// - async fn finish(self, timeline: &Arc) -> anyhow::Result { + async fn finish( + self, + timeline: &Arc, + ctx: &RequestContext, + ) -> anyhow::Result { let index_start_blk = ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; @@ -704,7 +713,7 @@ impl ImageLayerWriterInner { .await?; let (index_root_blk, block_buf) = self.tree.finish()?; for buf in block_buf.blocks { - let (_buf, res) = file.write_all(buf).await; + let (_buf, res) = file.write_all(buf, ctx).await; res?; } @@ -724,7 +733,7 @@ impl ImageLayerWriterInner { // TODO: could use smallvec here but it's a pain with Slice Summary::ser_into(&summary, &mut buf)?; file.seek(SeekFrom::Start(0)).await?; - let (_buf, res) = file.write_all(buf).await; + let (_buf, res) = file.write_all(buf, ctx).await; res?; let metadata = file @@ -806,8 +815,13 @@ impl ImageLayerWriter { /// /// The page versions must be appended in blknum order. 
/// - pub async fn put_image(&mut self, key: Key, img: Bytes) -> anyhow::Result<()> { - self.inner.as_mut().unwrap().put_image(key, img).await + pub async fn put_image( + &mut self, + key: Key, + img: Bytes, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + self.inner.as_mut().unwrap().put_image(key, img, ctx).await } /// @@ -816,8 +830,9 @@ impl ImageLayerWriter { pub(crate) async fn finish( mut self, timeline: &Arc, + ctx: &RequestContext, ) -> anyhow::Result { - self.inner.take().unwrap().finish(timeline).await + self.inner.take().unwrap().finish(timeline, ctx).await } } diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index a2ae8ec29d..4dacbec2f3 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -659,14 +659,14 @@ impl InMemoryLayer { let will_init = Value::des(&buf)?.will_init(); let res; (buf, res) = delta_layer_writer - .put_value_bytes(*key, *lsn, buf, will_init) + .put_value_bytes(*key, *lsn, buf, will_init, &ctx) .await; res?; } } // MAX is used here because we identify L0 layers by full key range - let delta_layer = delta_layer_writer.finish(Key::MAX, timeline).await?; + let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, &ctx).await?; Ok(Some(delta_layer)) } } diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 2b6934fcee..ebc0cbf9a4 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -14,9 +14,10 @@ use utils::lsn::Lsn; use utils::sync::heavier_once_cell; use crate::config::PageServerConf; -use crate::context::RequestContext; +use crate::context::{DownloadBehavior, RequestContext}; use crate::repository::Key; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::task_mgr::TaskKind; use crate::tenant::timeline::GetVectoredError; use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline}; @@ -939,11 +940,20 @@ impl LayerInner { return Err(DownloadError::DownloadRequired); } + let download_ctx = ctx + .map(|ctx| ctx.detached_child(TaskKind::LayerDownload, DownloadBehavior::Download)) + .unwrap_or(RequestContext::new( + TaskKind::LayerDownload, + DownloadBehavior::Download, + )); + async move { tracing::info!(%reason, "downloading on-demand"); let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); - let res = self.download_init_and_wait(timeline, permit).await?; + let res = self + .download_init_and_wait(timeline, permit, download_ctx) + .await?; scopeguard::ScopeGuard::into_inner(init_cancelled); Ok(res) } @@ -982,6 +992,7 @@ impl LayerInner { self: &Arc, timeline: Arc, permit: heavier_once_cell::InitPermit, + ctx: RequestContext, ) -> Result, DownloadError> { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -1011,7 +1022,7 @@ impl LayerInner { .await .unwrap(); - let res = this.download_and_init(timeline, permit).await; + let res = this.download_and_init(timeline, permit, &ctx).await; if let Err(res) = tx.send(res) { match res { @@ -1054,6 +1065,7 @@ impl LayerInner { self: &Arc, timeline: Arc, permit: heavier_once_cell::InitPermit, + ctx: &RequestContext, ) -> anyhow::Result> { let client = timeline .remote_client @@ -1061,7 +1073,12 @@ impl LayerInner { .expect("checked before download_init_and_wait"); let result = client - .download_layer_file(&self.desc.filename(), &self.metadata(), &timeline.cancel) + 
.download_layer_file( + &self.desc.filename(), + &self.metadata(), + &timeline.cancel, + ctx, + ) .await; match result { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 3c0a300a9a..22bfa53445 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4179,7 +4179,7 @@ impl Timeline { }; // Write all the keys we just read into our new image layer. - image_layer_writer.put_image(img_key, img).await?; + image_layer_writer.put_image(img_key, img, ctx).await?; wrote_keys = true; } } @@ -4190,7 +4190,7 @@ impl Timeline { // Normal path: we have written some data into the new image layer for this // partition, so flush it to disk. start = img_range.end; - let image_layer = image_layer_writer.finish(self).await?; + let image_layer = image_layer_writer.finish(self, ctx).await?; image_layers.push(image_layer); } else { // Special case: the image layer may be empty if this is a sharded tenant and the diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 6ea37bf793..1088101a13 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -520,7 +520,7 @@ impl Timeline { writer .take() .unwrap() - .finish(prev_key.unwrap().next(), self) + .finish(prev_key.unwrap().next(), self, ctx) .await?, ); writer = None; @@ -562,7 +562,11 @@ impl Timeline { ); } - writer.as_mut().unwrap().put_value(key, lsn, value).await?; + writer + .as_mut() + .unwrap() + .put_value(key, lsn, value, ctx) + .await?; } else { debug!( "Dropping key {} during compaction (it belongs on shard {:?})", @@ -578,7 +582,7 @@ impl Timeline { prev_key = Some(key); } if let Some(writer) = writer { - new_layers.push(writer.finish(prev_key.unwrap().next(), self).await?); + new_layers.push(writer.finish(prev_key.unwrap().next(), self, ctx).await?); } // Sync layers @@ -972,7 +976,7 @@ impl CompactionJobExecutor for TimelineAdaptor { let value = val.load(ctx).await?; - writer.put_value(key, lsn, value).await?; + writer.put_value(key, lsn, value, ctx).await?; prev = Some((key, lsn)); } @@ -988,7 +992,7 @@ impl CompactionJobExecutor for TimelineAdaptor { }); let new_delta_layer = writer - .finish(prev.unwrap().0.next(), &self.timeline) + .finish(prev.unwrap().0.next(), &self.timeline, ctx) .await?; self.new_deltas.push(new_delta_layer); @@ -1058,11 +1062,11 @@ impl TimelineAdaptor { } } }; - image_layer_writer.put_image(key, img).await?; + image_layer_writer.put_image(key, img, ctx).await?; key = key.next(); } } - let image_layer = image_layer_writer.finish(&self.timeline).await?; + let image_layer = image_layer_writer.finish(&self.timeline, ctx).await?; self.new_images.push(image_layer); diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 6127b35079..a17488a286 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -10,6 +10,7 @@ //! This is similar to PostgreSQL's virtual file descriptor facility in //! src/backend/storage/file/fd.c //! 
+use crate::context::RequestContext; use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC}; use crate::page_cache::PageWriteGuard; @@ -615,6 +616,7 @@ impl VirtualFile { &self, buf: B, mut offset: u64, + ctx: &RequestContext, ) -> (B::Buf, Result<(), Error>) { let buf_len = buf.bytes_init(); if buf_len == 0 { @@ -623,7 +625,7 @@ impl VirtualFile { let mut buf = buf.slice(0..buf_len); while !buf.is_empty() { let res; - (buf, res) = self.write_at(buf, offset).await; + (buf, res) = self.write_at(buf, offset, ctx).await; match res { Ok(0) => { return ( @@ -652,6 +654,7 @@ impl VirtualFile { pub async fn write_all, Buf: IoBuf + Send>( &mut self, buf: B, + ctx: &RequestContext, ) -> (B::Buf, Result) { let nbytes = buf.bytes_init(); if nbytes == 0 { @@ -660,7 +663,7 @@ impl VirtualFile { let mut buf = buf.slice(0..nbytes); while !buf.is_empty() { let res; - (buf, res) = self.write(buf).await; + (buf, res) = self.write(buf, ctx).await; match res { Ok(0) => { return ( @@ -684,9 +687,10 @@ impl VirtualFile { async fn write( &mut self, buf: Slice, + ctx: &RequestContext, ) -> (Slice, Result) { let pos = self.pos; - let (buf, res) = self.write_at(buf, pos).await; + let (buf, res) = self.write_at(buf, pos, ctx).await; let n = match res { Ok(n) => n, Err(e) => return (buf, Err(e)), @@ -724,6 +728,7 @@ impl VirtualFile { &self, buf: Slice, offset: u64, + _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */ ) -> (Slice, Result) { let file_guard = match self.lock_file().await { Ok(file_guard) => file_guard, @@ -1088,8 +1093,9 @@ impl OwnedAsyncWriter for VirtualFile { async fn write_all, Buf: IoBuf + Send>( &mut self, buf: B, + ctx: &RequestContext, ) -> std::io::Result<(usize, B::Buf)> { - let (buf, res) = VirtualFile::write_all(self, buf).await; + let (buf, res) = VirtualFile::write_all(self, buf, ctx).await; res.map(move |v| (v, buf)) } } @@ -1146,6 +1152,9 @@ fn get_open_files() -> &'static OpenFiles { #[cfg(test)] mod tests { + use crate::context::DownloadBehavior; + use crate::task_mgr::TaskKind; + use super::*; use rand::seq::SliceRandom; use rand::thread_rng; @@ -1177,10 +1186,11 @@ mod tests { &self, buf: B, offset: u64, + ctx: &RequestContext, ) -> Result<(), Error> { match self { MaybeVirtualFile::VirtualFile(file) => { - let (_buf, res) = file.write_all_at(buf, offset).await; + let (_buf, res) = file.write_all_at(buf, offset, ctx).await; res } MaybeVirtualFile::File(file) => { @@ -1201,10 +1211,11 @@ mod tests { async fn write_all, Buf: IoBuf + Send>( &mut self, buf: B, + ctx: &RequestContext, ) -> Result<(), Error> { match self { MaybeVirtualFile::VirtualFile(file) => { - let (_buf, res) = file.write_all(buf).await; + let (_buf, res) = file.write_all(buf, ctx).await; res.map(|_| ()) } MaybeVirtualFile::File(file) => { @@ -1275,6 +1286,7 @@ mod tests { OF: Fn(Utf8PathBuf, OpenOptions) -> FT, FT: Future>, { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); let testdir = crate::config::PageServerConf::test_repo_dir(testname); std::fs::create_dir_all(&testdir)?; @@ -1288,7 +1300,7 @@ mod tests { .to_owned(), ) .await?; - file_a.write_all(b"foobar".to_vec()).await?; + file_a.write_all(b"foobar".to_vec(), &ctx).await?; // cannot read from a file opened in write-only mode let _ = file_a.read_string().await.unwrap_err(); @@ -1297,7 +1309,7 @@ mod tests { let mut file_a = openfunc(path_a, OpenOptions::new().read(true).to_owned()).await?; // cannot write to a file opened in read-only mode - let _ = 
file_a.write_all(b"bar".to_vec()).await.unwrap_err(); + let _ = file_a.write_all(b"bar".to_vec(), &ctx).await.unwrap_err(); // Try simple read assert_eq!("foobar", file_a.read_string().await?); @@ -1339,8 +1351,8 @@ mod tests { .to_owned(), ) .await?; - file_b.write_all_at(b"BAR".to_vec(), 3).await?; - file_b.write_all_at(b"FOO".to_vec(), 0).await?; + file_b.write_all_at(b"BAR".to_vec(), 3, &ctx).await?; + file_b.write_all_at(b"FOO".to_vec(), 0, &ctx).await?; assert_eq!(file_b.read_string_at(2, 3).await?, "OBA"); diff --git a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs index c2817699c3..55b1d0b46b 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs @@ -1,4 +1,4 @@ -use crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter; +use crate::{context::RequestContext, virtual_file::owned_buffers_io::write::OwnedAsyncWriter}; use tokio_epoll_uring::{BoundedBuf, IoBuf}; pub struct Writer { @@ -38,8 +38,9 @@ where async fn write_all, Buf: IoBuf + Send>( &mut self, buf: B, + ctx: &RequestContext, ) -> std::io::Result<(usize, B::Buf)> { - let (nwritten, buf) = self.dst.write_all(buf).await?; + let (nwritten, buf) = self.dst.write_all(buf, ctx).await?; self.bytes_amount += u64::try_from(nwritten).unwrap(); Ok((nwritten, buf)) } diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index 738a642332..ac5169508f 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -1,12 +1,15 @@ use bytes::BytesMut; use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; +use crate::context::RequestContext; + /// A trait for doing owned-buffer write IO. /// Think [`tokio::io::AsyncWrite`] but with owned buffers. 
pub trait OwnedAsyncWriter { async fn write_all, Buf: IoBuf + Send>( &mut self, buf: B, + ctx: &RequestContext, ) -> std::io::Result<(usize, B::Buf)>; } @@ -57,8 +60,9 @@ where } #[cfg_attr(target_os = "macos", allow(dead_code))] - pub async fn flush_and_into_inner(mut self) -> std::io::Result { - self.flush().await?; + pub async fn flush_and_into_inner(mut self, ctx: &RequestContext) -> std::io::Result { + self.flush(ctx).await?; + let Self { buf, writer } = self; assert!(buf.is_some()); Ok(writer) @@ -72,14 +76,18 @@ where } #[cfg_attr(target_os = "macos", allow(dead_code))] - pub async fn write_buffered(&mut self, chunk: Slice) -> std::io::Result<(usize, S)> + pub async fn write_buffered( + &mut self, + chunk: Slice, + ctx: &RequestContext, + ) -> std::io::Result<(usize, S)> where S: IoBuf + Send, { let chunk_len = chunk.len(); // avoid memcpy for the middle of the chunk if chunk.len() >= self.buf().cap() { - self.flush().await?; + self.flush(ctx).await?; // do a big write, bypassing `buf` assert_eq!( self.buf @@ -88,7 +96,7 @@ where .pending(), 0 ); - let (nwritten, chunk) = self.writer.write_all(chunk).await?; + let (nwritten, chunk) = self.writer.write_all(chunk, ctx).await?; assert_eq!(nwritten, chunk_len); return Ok((nwritten, chunk)); } @@ -104,7 +112,7 @@ where slice = &slice[n..]; if buf.pending() >= buf.cap() { assert_eq!(buf.pending(), buf.cap()); - self.flush().await?; + self.flush(ctx).await?; } } assert!(slice.is_empty(), "by now we should have drained the chunk"); @@ -116,7 +124,11 @@ where /// It is less performant because we always have to copy the borrowed data into the internal buffer /// before we can do the IO. The [`Self::write_buffered`] can avoid this, which is more performant /// for large writes. - pub async fn write_buffered_borrowed(&mut self, mut chunk: &[u8]) -> std::io::Result { + pub async fn write_buffered_borrowed( + &mut self, + mut chunk: &[u8], + ctx: &RequestContext, + ) -> std::io::Result { let chunk_len = chunk.len(); while !chunk.is_empty() { let buf = self.buf.as_mut().expect("must not use after an error"); @@ -127,20 +139,20 @@ where chunk = &chunk[n..]; if buf.pending() >= buf.cap() { assert_eq!(buf.pending(), buf.cap()); - self.flush().await?; + self.flush(ctx).await?; } } Ok(chunk_len) } - async fn flush(&mut self) -> std::io::Result<()> { + async fn flush(&mut self, ctx: &RequestContext) -> std::io::Result<()> { let buf = self.buf.take().expect("must not use after an error"); let buf_len = buf.pending(); if buf_len == 0 { self.buf = Some(buf); return Ok(()); } - let (nwritten, io_buf) = self.writer.write_all(buf.flush()).await?; + let (nwritten, io_buf) = self.writer.write_all(buf.flush(), ctx).await?; assert_eq!(nwritten, buf_len); self.buf = Some(Buffer::reuse_after_flush(io_buf)); Ok(()) @@ -206,6 +218,7 @@ impl OwnedAsyncWriter for Vec { async fn write_all, Buf: IoBuf + Send>( &mut self, buf: B, + _: &RequestContext, ) -> std::io::Result<(usize, B::Buf)> { let nbytes = buf.bytes_init(); if nbytes == 0 { @@ -222,6 +235,8 @@ mod tests { use bytes::BytesMut; use super::*; + use crate::context::{DownloadBehavior, RequestContext}; + use crate::task_mgr::TaskKind; #[derive(Default)] struct RecorderWriter { @@ -231,6 +246,7 @@ mod tests { async fn write_all, Buf: IoBuf + Send>( &mut self, buf: B, + _: &RequestContext, ) -> std::io::Result<(usize, B::Buf)> { let nbytes = buf.bytes_init(); if nbytes == 0 { @@ -243,10 +259,14 @@ mod tests { } } + fn test_ctx() -> RequestContext { + RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error) + 
} + macro_rules! write { ($writer:ident, $data:literal) => {{ $writer - .write_buffered(::bytes::Bytes::from_static($data).slice_full()) + .write_buffered(::bytes::Bytes::from_static($data).slice_full(), &test_ctx()) .await?; }}; } @@ -260,7 +280,7 @@ mod tests { write!(writer, b"c"); write!(writer, b"d"); write!(writer, b"e"); - let recorder = writer.flush_and_into_inner().await?; + let recorder = writer.flush_and_into_inner(&test_ctx()).await?; assert_eq!( recorder.writes, vec![Vec::from(b"ab"), Vec::from(b"cd"), Vec::from(b"e")] @@ -276,7 +296,7 @@ mod tests { write!(writer, b"de"); write!(writer, b""); write!(writer, b"fghijk"); - let recorder = writer.flush_and_into_inner().await?; + let recorder = writer.flush_and_into_inner(&test_ctx()).await?; assert_eq!( recorder.writes, vec![Vec::from(b"abc"), Vec::from(b"de"), Vec::from(b"fghijk")] @@ -292,7 +312,7 @@ mod tests { write!(writer, b"bc"); write!(writer, b"d"); write!(writer, b"e"); - let recorder = writer.flush_and_into_inner().await?; + let recorder = writer.flush_and_into_inner(&test_ctx()).await?; assert_eq!( recorder.writes, vec![Vec::from(b"a"), Vec::from(b"bc"), Vec::from(b"de")] @@ -302,18 +322,20 @@ mod tests { #[tokio::test] async fn test_write_all_borrowed_always_goes_through_buffer() -> std::io::Result<()> { + let ctx = test_ctx(); + let ctx = &ctx; let recorder = RecorderWriter::default(); let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); - writer.write_buffered_borrowed(b"abc").await?; - writer.write_buffered_borrowed(b"d").await?; - writer.write_buffered_borrowed(b"e").await?; - writer.write_buffered_borrowed(b"fg").await?; - writer.write_buffered_borrowed(b"hi").await?; - writer.write_buffered_borrowed(b"j").await?; - writer.write_buffered_borrowed(b"klmno").await?; + writer.write_buffered_borrowed(b"abc", ctx).await?; + writer.write_buffered_borrowed(b"d", ctx).await?; + writer.write_buffered_borrowed(b"e", ctx).await?; + writer.write_buffered_borrowed(b"fg", ctx).await?; + writer.write_buffered_borrowed(b"hi", ctx).await?; + writer.write_buffered_borrowed(b"j", ctx).await?; + writer.write_buffered_borrowed(b"klmno", ctx).await?; - let recorder = writer.flush_and_into_inner().await?; + let recorder = writer.flush_and_into_inner(ctx).await?; assert_eq!( recorder.writes, { From 7a49e5d5c21aeefcba4aa0a1135069fa6a4e8de0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 2 May 2024 20:18:13 +0200 Subject: [PATCH 0675/1571] Remove tenant_id from TenantLocationConfigRequest (#7469) Follow-up of #7055 and #7476 to remove `tenant_id` from `TenantLocationConfigRequest` completely. All components of our system should now not specify the `tenant_id`. 
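For illustration, the call site in `pageserver/client/src/mgmt_api.rs` now reduces to roughly the following (a condensed sketch; `endpoint`, `tenant_shard_id` and `config` stand in for the surrounding client state):

```rust
// Sketch only: the tenant is identified solely by the URL path, and the body
// is just the flattened LocationConfig -- there is no tenant_id field anymore.
let req_body = TenantLocationConfigRequest { config };
let path = format!("{endpoint}/v1/tenant/{tenant_shard_id}/location_config");
```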
cc https://github.com/neondatabase/cloud/pull/11791 --- Cargo.lock | 21 +++++++++++---------- libs/pageserver_api/src/models.rs | 2 -- pageserver/client/src/mgmt_api.rs | 5 +---- pageserver/src/http/openapi_spec.yml | 3 --- test_runner/fixtures/pageserver/http.py | 1 - 5 files changed, 12 insertions(+), 20 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 775a0d977d..1db41cd755 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -867,9 +867,9 @@ checksum = "3f1e31e207a6b8fb791a38ea3105e6cb541f55e4d029902d3039a4ad07cc4105" [[package]] name = "base64" -version = "0.22.0" +version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9475866fec1451be56a3c2400fd081ff546538961565ccb5b7142cbd22bc7a51" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" [[package]] name = "base64-simd" @@ -4769,7 +4769,7 @@ version = "0.12.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "566cafdd92868e0939d3fb961bd0dc25fcfaaed179291093b3d43e6b3150ea10" dependencies = [ - "base64 0.22.0", + "base64 0.22.1", "bytes", "futures-channel", "futures-core", @@ -5927,7 +5927,7 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" [[package]] name = "svg_fmt" version = "0.4.2" -source = "git+https://github.com/neondatabase/fork--nical--rust_debug?branch=neon#b9501105e746629004bc6d0473639320939dbe10" +source = "git+https://github.com/neondatabase/fork--nical--rust_debug?branch=neon#c1820b28664b5df68de7f043fccf2ed5d67b6ae8" [[package]] name = "syn" @@ -6508,10 +6508,11 @@ dependencies = [ [[package]] name = "tracing" -version = "0.1.40" +version = "0.1.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" +checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8" dependencies = [ + "cfg-if", "log", "pin-project-lite", "tracing-attributes", @@ -6531,9 +6532,9 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.27" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" +checksum = "0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74" dependencies = [ "proc-macro2", "quote", @@ -6542,9 +6543,9 @@ dependencies = [ [[package]] name = "tracing-core" -version = "0.1.32" +version = "0.1.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" +checksum = "0955b8137a1df6f1a2e9a37d8a6656291ff0297c1a97c24e0d8425fe2312f79a" dependencies = [ "once_cell", "valuable", diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index c752799c4c..a54cdb520d 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -430,8 +430,6 @@ pub struct StatusResponse { #[derive(Serialize, Deserialize, Debug)] #[serde(deny_unknown_fields)] pub struct TenantLocationConfigRequest { - #[serde(skip_serializing_if = "Option::is_none")] - pub tenant_id: Option, #[serde(flatten)] pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it } diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 012cb1a662..bc66c5c6e1 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -291,10 +291,7 @@ impl Client { flush_ms: Option, lazy: bool, ) -> 
Result<()> { - let req_body = TenantLocationConfigRequest { - tenant_id: None, - config, - }; + let req_body = TenantLocationConfigRequest { config }; let mut path = reqwest::Url::parse(&format!( "{}/v1/tenant/{}/location_config", diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index d89f949688..c425f3e628 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -782,9 +782,6 @@ components: required: - mode properties: - tenant_id: - type: string - description: Not used, scheduled for removal. mode: type: string enum: ["AttachedSingle", "AttachedMulti", "AttachedStale", "Secondary", "Detached"] diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index b899b0dac8..231ffd898e 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -293,7 +293,6 @@ class PageserverHttpClient(requests.Session, MetricsGetter): lazy: Optional[bool] = None, ): body = location_conf.copy() - body["tenant_id"] = str(tenant_id) params = {} if flush_ms is not None: From 5f099dc7603d0b41418ad9b5e7267e377f24534c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 2 May 2024 20:19:00 +0200 Subject: [PATCH 0676/1571] Use streaming downloads for Azure as well (#7579) The main challenge was in the second commit, as `DownloadStream` requires the inner to be Sync but the stream returned by the Azure SDK wasn't Sync. This left us with three options: * Change the Azure SDK to return Sync streams. This was abandoned after we realized that we couldn't just make `TokenCredential`'s returned future Sync: it uses the `async_trait` macro and as the `TokenCredential` trait is used in dyn form, one can't use Rust's new "async fn in Trait" feature. * Change `DownloadStream` to not require `Sync`. This was abandoned after it turned into a safekeeper refactoring project. * Put the stream into a `Mutex` and make it obtain a lock on every poll. This adds some performance overhead but locks that actually don't do anything should be comparatively cheap. We went with the third option in the end as the change still represents an improvement. 
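The third option shows up in `azure_blob.rs` below as `sync_wrapper::SyncStream`. Roughly, the wrapper gives us the following (a minimal sketch, assuming the `sync_wrapper` crate with its `futures` feature; the helper name is illustrative):

```rust
// SyncStream wraps the inner stream so the result is Sync even when the inner
// stream is not, at (near) zero runtime cost; this is what lets the Azure body
// stream be chained into DownloadStream, which requires Sync.
fn into_sync_stream<S>(inner: S) -> impl futures::Stream<Item = S::Item> + Send + Sync
where
    S: futures::Stream + Send,
{
    sync_wrapper::SyncStream::new(inner)
}
```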
Follow up of #5446 , fixes #5563 --- Cargo.lock | 5 ++ libs/remote_storage/Cargo.toml | 1 + libs/remote_storage/src/azure_blob.rs | 66 ++++++++++++++++----------- workspace_hack/Cargo.toml | 1 + 4 files changed, 46 insertions(+), 27 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1db41cd755..438b68493b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4714,6 +4714,7 @@ dependencies = [ "scopeguard", "serde", "serde_json", + "sync_wrapper", "test-context", "tokio", "tokio-stream", @@ -5956,6 +5957,9 @@ name = "sync_wrapper" version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" +dependencies = [ + "futures-core", +] [[package]] name = "synstructure" @@ -7505,6 +7509,7 @@ dependencies = [ "subtle", "syn 1.0.109", "syn 2.0.52", + "sync_wrapper", "time", "time-macros", "tokio", diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 4a53f485ca..78da01c9a0 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -38,6 +38,7 @@ azure_storage_blobs.workspace = true futures-util.workspace = true http-types.workspace = true itertools.workspace = true +sync_wrapper = { workspace = true, features = ["futures"] } [dev-dependencies] camino-tempfile.workspace = true diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 5fff3e25c9..24c1248304 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -3,6 +3,7 @@ use std::borrow::Cow; use std::collections::HashMap; use std::env; +use std::io; use std::num::NonZeroU32; use std::pin::Pin; use std::str::FromStr; @@ -20,6 +21,7 @@ use azure_storage_blobs::blob::CopyStatus; use azure_storage_blobs::prelude::ClientBuilder; use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient}; use bytes::Bytes; +use futures::future::Either; use futures::stream::Stream; use futures_util::StreamExt; use futures_util::TryStreamExt; @@ -128,12 +130,12 @@ impl AzureBlobStorage { let kind = RequestKind::Get; let _permit = self.permit(kind, cancel).await?; + let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone()); + let cancel_or_timeout_ = crate::support::cancel_or_timeout(self.timeout, cancel.clone()); let mut etag = None; let mut last_modified = None; let mut metadata = HashMap::new(); - // TODO give proper streaming response instead of buffering into RAM - // https://github.com/neondatabase/neon/issues/5563 let download = async { let response = builder @@ -152,39 +154,46 @@ impl AzureBlobStorage { Err(_elapsed) => Err(DownloadError::Timeout), }); - let mut response = std::pin::pin!(response); + let mut response = Box::pin(response); - let mut bufs = Vec::new(); - while let Some(part) = response.next().await { - let part = part?; - if etag.is_none() { - etag = Some(part.blob.properties.etag); - } - if last_modified.is_none() { - last_modified = Some(part.blob.properties.last_modified.into()); - } - if let Some(blob_meta) = part.blob.metadata { - metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned()))); - } - let data = part - .data - .collect() - .await - .map_err(|e| DownloadError::Other(e.into()))?; - bufs.push(data); - } - - if bufs.is_empty() { + let Some(part) = response.next().await else { return Err(DownloadError::Other(anyhow::anyhow!( - "Azure GET response contained no buffers" + "Azure GET response contained no response body" ))); + }; + let part = 
part?; + if etag.is_none() { + etag = Some(part.blob.properties.etag); } + if last_modified.is_none() { + last_modified = Some(part.blob.properties.last_modified.into()); + } + if let Some(blob_meta) = part.blob.metadata { + metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned()))); + } + // unwrap safety: if these were None, bufs would be empty and we would have returned an error already let etag = etag.unwrap(); let last_modified = last_modified.unwrap(); + let tail_stream = response + .map(|part| match part { + Ok(part) => Either::Left(part.data.map(|r| r.map_err(io::Error::other))), + Err(e) => { + Either::Right(futures::stream::once(async { Err(io::Error::other(e)) })) + } + }) + .flatten(); + let stream = part + .data + .map(|r| r.map_err(io::Error::other)) + .chain(sync_wrapper::SyncStream::new(tail_stream)); + //.chain(SyncStream::from_pin(Box::pin(tail_stream))); + + let download_stream = crate::support::DownloadStream::new(cancel_or_timeout_, stream); + Ok(Download { - download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))), + download_stream: Box::pin(download_stream), etag, last_modified, metadata: Some(StorageMetadata(metadata)), @@ -193,7 +202,10 @@ impl AzureBlobStorage { tokio::select! { bufs = download => bufs, - _ = cancel.cancelled() => Err(DownloadError::Cancelled), + cancel_or_timeout = cancel_or_timeout => match cancel_or_timeout { + TimeoutOrCancel::Timeout => Err(DownloadError::Timeout), + TimeoutOrCancel::Cancel => Err(DownloadError::Cancelled), + }, } } diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index a225984688..b2da33e44a 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -66,6 +66,7 @@ serde_json = { version = "1", features = ["raw_value"] } sha2 = { version = "0.10", features = ["asm"] } smallvec = { version = "1", default-features = false, features = ["const_new", "write"] } subtle = { version = "2" } +sync_wrapper = { version = "0.1", default-features = false, features = ["futures"] } time = { version = "0.3", features = ["local-offset", "macros", "serde-well-known"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] } tokio-rustls = { version = "0.24" } From 240efb82f918166a4b596c698f701f14a76d18f8 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Fri, 3 May 2024 10:00:29 +0200 Subject: [PATCH 0677/1571] Proxy reconnect pubsub before expiration (#7562) ## Problem Proxy reconnects to redis only after it's already unavailable. ## Summary of changes Reconnects every 6h. 
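Concretely, the pubsub listener is now rotated on a timer with a small overlap; a condensed sketch of the pattern used in `notifications.rs` below (the helper itself is illustrative):

```rust
// Start a fresh listener every 6h minus 1 minute and cancel each listener 6h
// after it started, so consecutive listeners overlap by about a minute and no
// invalidation message is lost during the hand-over.
use std::time::Duration;
use tokio_util::sync::CancellationToken;

async fn rotate_listeners<F, Fut>(start_listener: F)
where
    F: Fn(CancellationToken) -> Fut,
    Fut: std::future::Future<Output = ()> + Send + 'static,
{
    let mut interval = tokio::time::interval(Duration::from_secs(6 * 60 * 60 - 60));
    loop {
        interval.tick().await;
        let cancel = CancellationToken::new();
        tokio::spawn(start_listener(cancel.clone()));
        tokio::spawn(async move {
            tokio::time::sleep(Duration::from_secs(6 * 60 * 60)).await;
            cancel.cancel();
        });
    }
}
```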
--- proxy/src/cache/project_info.rs | 42 ++++++++++----- proxy/src/redis/notifications.rs | 93 ++++++++++++++++++++++---------- 2 files changed, 95 insertions(+), 40 deletions(-) diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index d8a1d261ce..10cc4ceee1 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -5,9 +5,11 @@ use std::{ time::Duration, }; +use async_trait::async_trait; use dashmap::DashMap; use rand::{thread_rng, Rng}; use smol_str::SmolStr; +use tokio::sync::Mutex; use tokio::time::Instant; use tracing::{debug, info}; @@ -21,11 +23,12 @@ use crate::{ use super::{Cache, Cached}; +#[async_trait] pub trait ProjectInfoCache { fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt); fn invalidate_role_secret_for_project(&self, project_id: ProjectIdInt, role_name: RoleNameInt); - fn enable_ttl(&self); - fn disable_ttl(&self); + async fn decrement_active_listeners(&self); + async fn increment_active_listeners(&self); } struct Entry { @@ -116,8 +119,10 @@ pub struct ProjectInfoCacheImpl { start_time: Instant, ttl_disabled_since_us: AtomicU64, + active_listeners_lock: Mutex, } +#[async_trait] impl ProjectInfoCache for ProjectInfoCacheImpl { fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt) { info!("invalidating allowed ips for project `{}`", project_id); @@ -148,15 +153,27 @@ impl ProjectInfoCache for ProjectInfoCacheImpl { } } } - fn enable_ttl(&self) { - self.ttl_disabled_since_us - .store(u64::MAX, std::sync::atomic::Ordering::Relaxed); + async fn decrement_active_listeners(&self) { + let mut listeners_guard = self.active_listeners_lock.lock().await; + if *listeners_guard == 0 { + tracing::error!("active_listeners count is already 0, something is broken"); + return; + } + *listeners_guard -= 1; + if *listeners_guard == 0 { + self.ttl_disabled_since_us + .store(u64::MAX, std::sync::atomic::Ordering::SeqCst); + } } - fn disable_ttl(&self) { - let new_ttl = (self.start_time.elapsed() + self.config.ttl).as_micros() as u64; - self.ttl_disabled_since_us - .store(new_ttl, std::sync::atomic::Ordering::Relaxed); + async fn increment_active_listeners(&self) { + let mut listeners_guard = self.active_listeners_lock.lock().await; + *listeners_guard += 1; + if *listeners_guard == 1 { + let new_ttl = (self.start_time.elapsed() + self.config.ttl).as_micros() as u64; + self.ttl_disabled_since_us + .store(new_ttl, std::sync::atomic::Ordering::SeqCst); + } } } @@ -168,6 +185,7 @@ impl ProjectInfoCacheImpl { config, ttl_disabled_since_us: AtomicU64::new(u64::MAX), start_time: Instant::now(), + active_listeners_lock: Mutex::new(0), } } @@ -432,7 +450,7 @@ mod tests { ttl: Duration::from_secs(1), gc_interval: Duration::from_secs(600), })); - cache.clone().disable_ttl(); + cache.clone().increment_active_listeners().await; tokio::time::advance(Duration::from_secs(2)).await; let project_id: ProjectId = "project".into(); @@ -489,7 +507,7 @@ mod tests { } #[tokio::test] - async fn test_disable_ttl_invalidate_added_before() { + async fn test_increment_active_listeners_invalidate_added_before() { tokio::time::pause(); let cache = Arc::new(ProjectInfoCacheImpl::new(ProjectInfoCacheOptions { size: 2, @@ -514,7 +532,7 @@ mod tests { (&user1).into(), secret1.clone(), ); - cache.clone().disable_ttl(); + cache.clone().increment_active_listeners().await; tokio::time::advance(Duration::from_millis(100)).await; cache.insert_role_secret( (&project_id).into(), diff --git a/proxy/src/redis/notifications.rs 
b/proxy/src/redis/notifications.rs index ba4dfb755e..87d723d17e 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -4,6 +4,7 @@ use futures::StreamExt; use pq_proto::CancelKeyData; use redis::aio::PubSub; use serde::{Deserialize, Serialize}; +use tokio_util::sync::CancellationToken; use uuid::Uuid; use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; @@ -77,6 +78,16 @@ struct MessageHandler { region_id: String, } +impl Clone for MessageHandler { + fn clone(&self) -> Self { + Self { + cache: self.cache.clone(), + cancellation_handler: self.cancellation_handler.clone(), + region_id: self.region_id.clone(), + } + } +} + impl MessageHandler { pub fn new( cache: Arc, @@ -89,11 +100,11 @@ impl MessageHandler { region_id, } } - pub fn disable_ttl(&self) { - self.cache.disable_ttl(); + pub async fn increment_active_listeners(&self) { + self.cache.increment_active_listeners().await; } - pub fn enable_ttl(&self) { - self.cache.enable_ttl(); + pub async fn decrement_active_listeners(&self) { + self.cache.decrement_active_listeners().await; } #[tracing::instrument(skip(self, msg), fields(session_id = tracing::field::Empty))] async fn handle_message(&self, msg: redis::Msg) -> anyhow::Result<()> { @@ -182,37 +193,24 @@ fn invalidate_cache(cache: Arc, msg: Notification) { } } -/// Handle console's invalidation messages. -#[tracing::instrument(name = "console_notifications", skip_all)] -pub async fn task_main( +async fn handle_messages( + handler: MessageHandler, redis: ConnectionWithCredentialsProvider, - cache: Arc, - cancel_map: CancelMap, - region_id: String, -) -> anyhow::Result -where - C: ProjectInfoCache + Send + Sync + 'static, -{ - cache.enable_ttl(); - let handler = MessageHandler::new( - cache, - Arc::new(CancellationHandler::<()>::new( - cancel_map, - crate::metrics::CancellationSource::FromRedis, - )), - region_id, - ); - + cancellation_token: CancellationToken, +) -> anyhow::Result<()> { loop { + if cancellation_token.is_cancelled() { + return Ok(()); + } let mut conn = match try_connect(&redis).await { Ok(conn) => { - handler.disable_ttl(); + handler.increment_active_listeners().await; conn } Err(e) => { tracing::error!( - "failed to connect to redis: {e}, will try to reconnect in {RECONNECT_TIMEOUT:#?}" - ); + "failed to connect to redis: {e}, will try to reconnect in {RECONNECT_TIMEOUT:#?}" + ); tokio::time::sleep(RECONNECT_TIMEOUT).await; continue; } @@ -226,8 +224,47 @@ where break; } } + if cancellation_token.is_cancelled() { + handler.decrement_active_listeners().await; + return Ok(()); + } } - handler.enable_ttl(); + handler.decrement_active_listeners().await; + } +} + +/// Handle console's invalidation messages. +#[tracing::instrument(name = "redis_notifications", skip_all)] +pub async fn task_main( + redis: ConnectionWithCredentialsProvider, + cache: Arc, + cancel_map: CancelMap, + region_id: String, +) -> anyhow::Result +where + C: ProjectInfoCache + Send + Sync + 'static, +{ + let cancellation_handler = Arc::new(CancellationHandler::<()>::new( + cancel_map, + crate::metrics::CancellationSource::FromRedis, + )); + let handler = MessageHandler::new(cache, cancellation_handler, region_id); + // 6h - 1m. + // There will be 1 minute overlap between two tasks. But at least we can be sure that no message is lost. 
+ let mut interval = tokio::time::interval(std::time::Duration::from_secs(6 * 60 * 60 - 60)); + loop { + let cancellation_token = CancellationToken::new(); + interval.tick().await; + + tokio::spawn(handle_messages( + handler.clone(), + redis.clone(), + cancellation_token.clone(), + )); + tokio::spawn(async move { + tokio::time::sleep(std::time::Duration::from_secs(6 * 60 * 60)).await; // 6h. + cancellation_token.cancel(); + }); } } From 00423152c6eeafb731eddc11453ea683dab6196f Mon Sep 17 00:00:00 2001 From: Jure Bajic Date: Fri, 3 May 2024 10:38:19 +0200 Subject: [PATCH 0678/1571] Store operation identifier in `IdLockMap` on exclusive lock (#7397) ## Problem Issues around operation and tenant locks would have been hard to debug since there was little observability around them. ## Summary of changes - As suggested in the issue, a wrapper was added around `OwnedRwLockWriteGuard` called `IdentifierLock` that removes the operation currently holding the exclusive lock when it's dropped. - The value in `IdLockMap` was extended to hold a pair of locks and operations that can be accessed and locked independently. - When requesting an exclusive lock besides returning the lock on that resource, an operation is changed if the lock is acquired. Closes https://github.com/neondatabase/neon/issues/7108 --- Cargo.lock | 2 + storage_controller/Cargo.toml | 2 + storage_controller/src/id_lock_map.rs | 153 ++++++++++++++++-- storage_controller/src/service.rs | 123 ++++++++++---- test_runner/fixtures/neon_fixtures.py | 100 +++++++----- .../regress/test_storage_controller.py | 48 ++++++ 6 files changed, 348 insertions(+), 80 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 438b68493b..8438dad41b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5856,6 +5856,8 @@ dependencies = [ "routerify", "serde", "serde_json", + "strum", + "strum_macros", "thiserror", "tokio", "tokio-util", diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index 789420f2b0..194619a496 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -40,6 +40,8 @@ tokio.workspace = true tokio-util.workspace = true tracing.workspace = true measured.workspace = true +strum.workspace = true +strum_macros.workspace = true diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] } diesel_migrations = { version = "2.1.0" } diff --git a/storage_controller/src/id_lock_map.rs b/storage_controller/src/id_lock_map.rs index b03700b50c..dff793289f 100644 --- a/storage_controller/src/id_lock_map.rs +++ b/storage_controller/src/id_lock_map.rs @@ -1,25 +1,64 @@ +use std::fmt::Display; +use std::time::Instant; use std::{collections::HashMap, sync::Arc}; +use std::time::Duration; + +use crate::service::RECONCILE_TIMEOUT; + +const LOCK_TIMEOUT_ALERT_THRESHOLD: Duration = RECONCILE_TIMEOUT; + +/// A wrapper around `OwnedRwLockWriteGuard` that when dropped changes the +/// current holding operation in lock. +pub struct WrappedWriteGuard { + guard: tokio::sync::OwnedRwLockWriteGuard>, + start: Instant, +} + +impl WrappedWriteGuard { + pub fn new(guard: tokio::sync::OwnedRwLockWriteGuard>) -> Self { + Self { + guard, + start: Instant::now(), + } + } +} + +impl Drop for WrappedWriteGuard { + fn drop(&mut self) { + let duration = self.start.elapsed(); + if duration > LOCK_TIMEOUT_ALERT_THRESHOLD { + tracing::warn!( + "Lock on {} was held for {:?}", + self.guard.as_ref().unwrap(), + duration + ); + } + *self.guard = None; + } +} + /// A map of locks covering some arbitrary identifiers. 
Useful if you have a collection of objects but don't /// want to embed a lock in each one, or if your locking granularity is different to your object granularity. /// For example, used in the storage controller where the objects are tenant shards, but sometimes locking /// is needed at a tenant-wide granularity. -pub(crate) struct IdLockMap +pub(crate) struct IdLockMap where T: Eq + PartialEq + std::hash::Hash, { /// A synchronous lock for getting/setting the async locks that our callers will wait on. - entities: std::sync::Mutex>>>, + entities: std::sync::Mutex>>>>, } -impl IdLockMap +impl IdLockMap where T: Eq + PartialEq + std::hash::Hash, + I: Display, { pub(crate) fn shared( &self, key: T, - ) -> impl std::future::Future> { + ) -> impl std::future::Future>> { let mut locked = self.entities.lock().unwrap(); let entry = locked.entry(key).or_default(); entry.clone().read_owned() @@ -28,21 +67,26 @@ where pub(crate) fn exclusive( &self, key: T, - ) -> impl std::future::Future> { + operation: I, + ) -> impl std::future::Future> { let mut locked = self.entities.lock().unwrap(); - let entry = locked.entry(key).or_default(); - entry.clone().write_owned() + let entry = locked.entry(key).or_default().clone(); + async move { + let mut guard = WrappedWriteGuard::new(entry.clone().write_owned().await); + *guard.guard = Some(operation); + guard + } } /// Rather than building a lock guard that re-takes the [`Self::entities`] lock, we just do /// periodic housekeeping to avoid the map growing indefinitely pub(crate) fn housekeeping(&self) { let mut locked = self.entities.lock().unwrap(); - locked.retain(|_k, lock| lock.try_write().is_err()) + locked.retain(|_k, entry| entry.try_write().is_err()) } } -impl Default for IdLockMap +impl Default for IdLockMap where T: Eq + PartialEq + std::hash::Hash, { @@ -52,3 +96,94 @@ where } } } + +pub async fn trace_exclusive_lock< + T: Clone + Display + Eq + PartialEq + std::hash::Hash, + I: Display + Clone, +>( + op_locks: &IdLockMap, + key: T, + operation: I, +) -> WrappedWriteGuard { + let start = Instant::now(); + let guard = op_locks.exclusive(key.clone(), operation.clone()).await; + + let duration = start.elapsed(); + if duration > LOCK_TIMEOUT_ALERT_THRESHOLD { + tracing::warn!( + "Operation {} on key {} has waited {:?} for exclusive lock", + operation, + key, + duration + ); + } + + guard +} + +pub async fn trace_shared_lock< + T: Clone + Display + Eq + PartialEq + std::hash::Hash, + I: Display, +>( + op_locks: &IdLockMap, + key: T, + operation: I, +) -> tokio::sync::OwnedRwLockReadGuard> { + let start = Instant::now(); + let guard = op_locks.shared(key.clone()).await; + + let duration = start.elapsed(); + if duration > LOCK_TIMEOUT_ALERT_THRESHOLD { + tracing::warn!( + "Operation {} on key {} has waited {:?} for shared lock", + operation, + key, + duration + ); + } + + guard +} + +#[cfg(test)] +mod tests { + use super::IdLockMap; + + #[derive(Clone, Debug, strum_macros::Display, PartialEq)] + enum Operations { + Op1, + Op2, + } + + #[tokio::test] + async fn multiple_shared_locks() { + let id_lock_map: IdLockMap = IdLockMap::default(); + + let shared_lock_1 = id_lock_map.shared(1).await; + let shared_lock_2 = id_lock_map.shared(1).await; + + assert!(shared_lock_1.is_none()); + assert!(shared_lock_2.is_none()); + } + + #[tokio::test] + async fn exclusive_locks() { + let id_lock_map = IdLockMap::default(); + let resource_id = 1; + + { + let _ex_lock = id_lock_map.exclusive(resource_id, Operations::Op1).await; + assert_eq!(_ex_lock.guard.clone().unwrap(), 
Operations::Op1); + + let _ex_lock_2 = tokio::time::timeout( + tokio::time::Duration::from_millis(1), + id_lock_map.exclusive(resource_id, Operations::Op2), + ) + .await; + assert!(_ex_lock_2.is_err()); + } + + let shared_lock_1 = id_lock_map.shared(resource_id).await; + assert!(shared_lock_1.is_none()); + } +} diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index f26122e646..eaff87d1ce 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -9,7 +9,7 @@ use std::{ use crate::{ compute_hook::NotifyError, - id_lock_map::IdLockMap, + id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, WrappedWriteGuard}, persistence::{AbortShardSplitStatus, TenantFilter}, reconciler::{ReconcileError, ReconcileUnits}, scheduler::{ScheduleContext, ScheduleMode}, @@ -33,6 +33,7 @@ use pageserver_api::{ models::{SecondaryProgress, TenantConfigRequest}, }; use reqwest::StatusCode; +use tracing::instrument; use crate::pageserver_client::PageserverClient; use pageserver_api::{ @@ -50,11 +51,11 @@ use pageserver_api::{ }, }; use pageserver_client::mgmt_api; -use tokio::sync::{mpsc::error::TrySendError, OwnedRwLockWriteGuard}; +use tokio::sync::mpsc::error::TrySendError; use tokio_util::sync::CancellationToken; -use tracing::instrument; use utils::{ completion::Barrier, + failpoint_support, generation::Generation, http::error::ApiError, id::{NodeId, TenantId, TimelineId}, @@ -79,7 +80,7 @@ const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5); // For operations that might be slow, like migrating a tenant with // some data in it. -const RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); +pub const RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); // If we receive a call using Secondary mode initially, it will omit generation. We will initialize // tenant shards into this generation, and as long as it remains in this generation, we will accept @@ -96,6 +97,26 @@ pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); /// (``) pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(300); +#[derive(Clone, strum_macros::Display)] +enum TenantOperations { + Create, + LocationConfig, + ConfigSet, + TimeTravelRemoteStorage, + Delete, + UpdatePolicy, + ShardSplit, + SecondaryDownload, + TimelineCreate, + TimelineDelete, +} + +#[derive(Clone, strum_macros::Display)] +enum NodeOperations { + Register, + Configure, +} + pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128; // Depth of the channel used to enqueue shards for reconciliation when they can't do it immediately. @@ -231,11 +252,11 @@ pub struct Service { // Locking on a tenant granularity (covers all shards in the tenant): // - Take exclusively for rare operations that mutate the tenant's persistent state (e.g. create/delete/split) // - Take in shared mode for operations that need the set of shards to stay the same to complete reliably (e.g. timeline CRUD) - tenant_op_locks: IdLockMap, + tenant_op_locks: IdLockMap, // Locking for node-mutating operations: take exclusively for operations that modify the node's persistent state, or // that transition it to/from Active. 
- node_op_locks: IdLockMap, + node_op_locks: IdLockMap, // Limit how many Reconcilers we will spawn concurrently reconciler_concurrency: Arc, @@ -307,7 +328,7 @@ struct TenantShardSplitAbort { new_shard_count: ShardCount, new_stripe_size: Option, /// Until this abort op is complete, no other operations may be done on the tenant - _tenant_lock: tokio::sync::OwnedRwLockWriteGuard<()>, + _tenant_lock: WrappedWriteGuard, } #[derive(thiserror::Error, Debug)] @@ -1340,7 +1361,7 @@ impl Service { async fn node_activate_reconcile( &self, mut node: Node, - _lock: &OwnedRwLockWriteGuard<()>, + _lock: &WrappedWriteGuard, ) -> Result<(), ApiError> { // This Node is a mutable local copy: we will set it active so that we can use its // API client to reconcile with the node. The Node in [`Self::nodes`] will get updated @@ -1586,11 +1607,12 @@ impl Service { let tenant_id = create_req.new_tenant_id.tenant_id; // Exclude any concurrent attempts to create/access the same tenant ID - let _tenant_lock = self - .tenant_op_locks - .exclusive(create_req.new_tenant_id.tenant_id) - .await; - + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + create_req.new_tenant_id.tenant_id, + TenantOperations::Create, + ) + .await; let (response, waiters) = self.do_tenant_create(create_req).await?; if let Err(e) = self.await_waiters(waiters, RECONCILE_TIMEOUT).await { @@ -1929,10 +1951,12 @@ impl Service { req: TenantLocationConfigRequest, ) -> Result { // We require an exclusive lock, because we are updating both persistent and in-memory state - let _tenant_lock = self - .tenant_op_locks - .exclusive(tenant_shard_id.tenant_id) - .await; + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + tenant_shard_id.tenant_id, + TenantOperations::LocationConfig, + ) + .await; if !tenant_shard_id.is_unsharded() { return Err(ApiError::BadRequest(anyhow::anyhow!( @@ -2050,7 +2074,12 @@ impl Service { pub(crate) async fn tenant_config_set(&self, req: TenantConfigRequest) -> Result<(), ApiError> { // We require an exclusive lock, because we are updating persistent and in-memory state - let _tenant_lock = self.tenant_op_locks.exclusive(req.tenant_id).await; + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + req.tenant_id, + TenantOperations::ConfigSet, + ) + .await; let tenant_id = req.tenant_id; let config = req.config; @@ -2139,7 +2168,12 @@ impl Service { timestamp: Cow<'_, str>, done_if_after: Cow<'_, str>, ) -> Result<(), ApiError> { - let _tenant_lock = self.tenant_op_locks.exclusive(tenant_id).await; + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::TimeTravelRemoteStorage, + ) + .await; let node = { let locked = self.inner.read().unwrap(); @@ -2230,7 +2264,12 @@ impl Service { tenant_id: TenantId, wait: Option, ) -> Result<(StatusCode, SecondaryProgress), ApiError> { - let _tenant_lock = self.tenant_op_locks.shared(tenant_id).await; + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::SecondaryDownload, + ) + .await; // Acquire lock and yield the collection of shard-node tuples which we will send requests onward to let targets = { @@ -2324,7 +2363,8 @@ impl Service { } pub(crate) async fn tenant_delete(&self, tenant_id: TenantId) -> Result { - let _tenant_lock = self.tenant_op_locks.exclusive(tenant_id).await; + let _tenant_lock = + trace_exclusive_lock(&self.tenant_op_locks, tenant_id, TenantOperations::Delete).await; self.ensure_attached_wait(tenant_id).await?; @@ -2424,7 +2464,14 @@ 
impl Service { req: TenantPolicyRequest, ) -> Result<(), ApiError> { // We require an exclusive lock, because we are updating persistent and in-memory state - let _tenant_lock = self.tenant_op_locks.exclusive(tenant_id).await; + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::UpdatePolicy, + ) + .await; + + failpoint_support::sleep_millis_async!("tenant-update-policy-exclusive-lock"); let TenantPolicyRequest { placement, @@ -2478,7 +2525,12 @@ impl Service { create_req.new_timeline_id, ); - let _tenant_lock = self.tenant_op_locks.shared(tenant_id).await; + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::TimelineCreate, + ) + .await; self.ensure_attached_wait(tenant_id).await?; @@ -2593,7 +2645,12 @@ impl Service { timeline_id: TimelineId, ) -> Result { tracing::info!("Deleting timeline {}/{}", tenant_id, timeline_id,); - let _tenant_lock = self.tenant_op_locks.shared(tenant_id).await; + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::TimelineDelete, + ) + .await; self.ensure_attached_wait(tenant_id).await?; @@ -3132,7 +3189,12 @@ impl Service { ) -> Result { // TODO: return 503 if we get stuck waiting for this lock // (issue https://github.com/neondatabase/neon/issues/7108) - let _tenant_lock = self.tenant_op_locks.exclusive(tenant_id).await; + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::ShardSplit, + ) + .await; let new_shard_count = ShardCount::new(split_req.new_shard_count); let new_stripe_size = split_req.new_stripe_size; @@ -3893,9 +3955,13 @@ impl Service { &self, register_req: NodeRegisterRequest, ) -> Result<(), ApiError> { - let _node_lock = self.node_op_locks.exclusive(register_req.node_id).await; + let _node_lock = trace_exclusive_lock( + &self.node_op_locks, + register_req.node_id, + NodeOperations::Register, + ) + .await; - // Pre-check for an already-existing node { let locked = self.inner.read().unwrap(); if let Some(node) = locked.nodes.get(®ister_req.node_id) { @@ -3982,7 +4048,8 @@ impl Service { availability: Option, scheduling: Option, ) -> Result<(), ApiError> { - let _node_lock = self.node_op_locks.exclusive(node_id).await; + let _node_lock = + trace_exclusive_lock(&self.node_op_locks, node_id, NodeOperations::Configure).await; if let Some(scheduling) = scheduling { // Scheduling is a persistent part of Node: we must write updates to the database before diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index fbd1e22aa9..19aa4cc886 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1959,6 +1959,55 @@ class Pagectl(AbstractNeonCli): return IndexPartDump.from_json(parsed) +class LogUtils: + """ + A mixin class which provides utilities for inspecting the logs of a service. 
+ """ + + def __init__(self, logfile: Path) -> None: + self.logfile = logfile + + def assert_log_contains( + self, pattern: str, offset: None | LogCursor = None + ) -> Tuple[str, LogCursor]: + """Convenient for use inside wait_until()""" + + res = self.log_contains(pattern, offset=offset) + assert res is not None + return res + + def log_contains( + self, pattern: str, offset: None | LogCursor = None + ) -> Optional[Tuple[str, LogCursor]]: + """Check that the log contains a line that matches the given regex""" + logfile = self.logfile + if not logfile.exists(): + log.warning(f"Skipping log check: {logfile} does not exist") + return None + + contains_re = re.compile(pattern) + + # XXX: Our rust logging machinery buffers the messages, so if you + # call this function immediately after it's been logged, there is + # no guarantee it is already present in the log file. This hasn't + # been a problem in practice, our python tests are not fast enough + # to hit that race condition. + skip_until_line_no = 0 if offset is None else offset._line_no + cur_line_no = 0 + with logfile.open("r") as f: + for line in f: + if cur_line_no < skip_until_line_no: + cur_line_no += 1 + continue + elif contains_re.search(line): + # found it! + cur_line_no += 1 + return (line, LogCursor(cur_line_no)) + else: + cur_line_no += 1 + return None + + class StorageControllerApiException(Exception): def __init__(self, message, status_code: int): super().__init__(message) @@ -1966,12 +2015,13 @@ class StorageControllerApiException(Exception): self.status_code = status_code -class NeonStorageController(MetricsGetter): +class NeonStorageController(MetricsGetter, LogUtils): def __init__(self, env: NeonEnv, auth_enabled: bool): self.env = env self.running = False self.auth_enabled = auth_enabled self.allowed_errors: list[str] = DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS + self.logfile = self.workdir / "storage_controller.log" def start(self): assert not self.running @@ -2295,6 +2345,10 @@ class NeonStorageController(MetricsGetter): log.info(f"Got failpoints request response code {res.status_code}") res.raise_for_status() + @property + def workdir(self) -> Path: + return self.env.repo_dir + def __enter__(self) -> "NeonStorageController": return self @@ -2312,7 +2366,7 @@ class LogCursor: _line_no: int -class NeonPageserver(PgProtocol): +class NeonPageserver(PgProtocol, LogUtils): """ An object representing a running pageserver. """ @@ -2329,7 +2383,7 @@ class NeonPageserver(PgProtocol): self.service_port = port self.config_override = config_override self.version = env.get_binary_version("pageserver") - + self.logfile = self.workdir / "pageserver.log" # After a test finishes, we will scrape the log to see if there are any # unexpected error messages. 
If your test expects an error, add it to # 'allowed_errors' in the test with something like: @@ -2469,46 +2523,6 @@ class NeonPageserver(PgProtocol): value = self.http_client().get_metric_value(metric) assert value == 0, f"Nonzero {metric} == {value}" - def assert_log_contains( - self, pattern: str, offset: None | LogCursor = None - ) -> Tuple[str, LogCursor]: - """Convenient for use inside wait_until()""" - - res = self.log_contains(pattern, offset=offset) - assert res is not None - return res - - def log_contains( - self, pattern: str, offset: None | LogCursor = None - ) -> Optional[Tuple[str, LogCursor]]: - """Check that the pageserver log contains a line that matches the given regex""" - logfile = self.workdir / "pageserver.log" - if not logfile.exists(): - log.warning(f"Skipping log check: {logfile} does not exist") - return None - - contains_re = re.compile(pattern) - - # XXX: Our rust logging machinery buffers the messages, so if you - # call this function immediately after it's been logged, there is - # no guarantee it is already present in the log file. This hasn't - # been a problem in practice, our python tests are not fast enough - # to hit that race condition. - skip_until_line_no = 0 if offset is None else offset._line_no - cur_line_no = 0 - with logfile.open("r") as f: - for line in f: - if cur_line_no < skip_until_line_no: - cur_line_no += 1 - continue - elif contains_re.search(line): - # found it! - cur_line_no += 1 - return (line, LogCursor(cur_line_no)) - else: - cur_line_no += 1 - return None - def tenant_attach( self, tenant_id: TenantId, diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index bc1f8776b3..63accebc7c 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -1,4 +1,5 @@ import json +import threading import time from collections import defaultdict from datetime import datetime, timezone @@ -1259,6 +1260,53 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder): env.storage_controller.consistency_check() +def test_lock_time_tracing(neon_env_builder: NeonEnvBuilder): + """ + Check that when lock on resource (tenants, nodes) is held for too long it is + traced in logs. 
+ """ + env = neon_env_builder.init_start() + tenant_id = env.initial_tenant + env.storage_controller.allowed_errors.extend( + [ + ".*Lock on.*", + ".*Scheduling is disabled by policy.*", + f".*Operation TimelineCreate on key {tenant_id} has waited.*", + ] + ) + + # Apply failpoint + env.storage_controller.configure_failpoints( + ("tenant-update-policy-exclusive-lock", "return(31000)") + ) + + # This will hold the exclusive for enough time to cause an warning + def update_tenent_policy(): + env.storage_controller.tenant_policy_update( + tenant_id=tenant_id, + body={ + "scheduling": "Stop", + }, + ) + + thread_update_tenant_policy = threading.Thread(target=update_tenent_policy) + thread_update_tenant_policy.start() + + # Make sure the update policy thread has started + time.sleep(1) + # This will not be able to access and will log a warning + timeline_id = TimelineId.generate() + env.storage_controller.pageserver_api().timeline_create( + pg_version=PgVersion.NOT_SET, tenant_id=tenant_id, new_timeline_id=timeline_id + ) + thread_update_tenant_policy.join(timeout=10) + + env.storage_controller.assert_log_contains("Lock on UpdatePolicy was held for") + env.storage_controller.assert_log_contains( + f"Operation TimelineCreate on key {tenant_id} has waited" + ) + + @pytest.mark.parametrize("remote_storage", [RemoteStorageKind.LOCAL_FS, s3_storage()]) @pytest.mark.parametrize("shard_count", [None, 4]) def test_tenant_import(neon_env_builder: NeonEnvBuilder, shard_count, remote_storage): From 3582a95c8767fc39f037eed36e0fe3e1052443f2 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Fri, 3 May 2024 04:55:48 -0400 Subject: [PATCH 0679/1571] fix(pageserver): compile warning of download_object.ctx on macos (#7596) fix macOS compile warning introduced in https://github.com/neondatabase/neon/commit/45ec8688ea27cbad9789aac934a23069cbe95595 Signed-off-by: Alex Chi Z --- pageserver/src/tenant/remote_timeline_client/download.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 345a12aa86..b038f264f5 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -136,7 +136,7 @@ async fn download_object<'a>( src_path: &RemotePath, dst_path: &Utf8PathBuf, cancel: &CancellationToken, - ctx: &RequestContext, + #[cfg_attr(target_os = "macos", allow(unused_variables))] ctx: &RequestContext, ) -> Result { let res = match crate::virtual_file::io_engine::get() { crate::virtual_file::io_engine::IoEngine::NotSet => panic!("unset"), From 60f570c70da0fec651b5fd5de0d551b60d5f53b6 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 3 May 2024 13:11:51 +0300 Subject: [PATCH 0680/1571] refactor(update_gc_info): split GcInfo to compose out of GcCutoffs (#7584) Split `GcInfo` and replace `Timeline::update_gc_info` with a method that simply finds gc cutoffs `Timeline::find_gc_cutoffs` to be combined as `Timeline::gc_info` at the caller. This change will be followed up with a change that finds the GC cutoff values before taking the `Tenant::gc_cs` lock. 
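Schematically, the call site in `tenant.rs` now looks like this (lifted from the diff below, with the surrounding loop elided):

```rust
// The timeline only computes the cutoff LSNs; the caller combines them with
// the branch points into GcInfo itself.
let cutoffs = timeline.find_gc_cutoffs(cutoff, pitr, cancel, ctx).await?;
*timeline.gc_info.write().unwrap() = GcInfo {
    retain_lsns: branchpoints,
    cutoffs,
};
```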
Cc: #7560 --- pageserver/src/metrics.rs | 12 ++-- pageserver/src/tenant.rs | 87 ++++++++++++---------------- pageserver/src/tenant/size.rs | 8 ++- pageserver/src/tenant/timeline.rs | 96 +++++++++++++++---------------- 4 files changed, 95 insertions(+), 108 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index d3c8c423e4..d8019b08e2 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -51,8 +51,8 @@ pub(crate) enum StorageTimeOperation { #[strum(serialize = "gc")] Gc, - #[strum(serialize = "update gc info")] - UpdateGcInfo, + #[strum(serialize = "find gc cutoffs")] + FindGcCutoffs, #[strum(serialize = "create tenant")] CreateTenant, @@ -1989,7 +1989,7 @@ pub(crate) struct TimelineMetrics { pub imitate_logical_size_histo: StorageTimeMetrics, pub load_layer_map_histo: StorageTimeMetrics, pub garbage_collect_histo: StorageTimeMetrics, - pub update_gc_info_histo: StorageTimeMetrics, + pub find_gc_cutoffs_histo: StorageTimeMetrics, pub last_record_gauge: IntGauge, resident_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size @@ -2050,8 +2050,8 @@ impl TimelineMetrics { &shard_id, &timeline_id, ); - let update_gc_info_histo = StorageTimeMetrics::new( - StorageTimeOperation::UpdateGcInfo, + let find_gc_cutoffs_histo = StorageTimeMetrics::new( + StorageTimeOperation::FindGcCutoffs, &tenant_id, &shard_id, &timeline_id, @@ -2098,7 +2098,7 @@ impl TimelineMetrics { logical_size_histo, imitate_logical_size_histo, garbage_collect_histo, - update_gc_info_histo, + find_gc_cutoffs_histo, load_layer_map_histo, last_record_gauge, resident_physical_size_gauge, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 05ceff2b59..a6cd1471ff 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -62,6 +62,7 @@ use self::timeline::uninit::TimelineCreateGuard; use self::timeline::uninit::TimelineExclusionError; use self::timeline::uninit::UninitializedTimeline; use self::timeline::EvictionTaskTenantState; +use self::timeline::GcInfo; use self::timeline::TimelineResources; use self::timeline::WaitLsnError; use crate::config::PageServerConf; @@ -86,7 +87,6 @@ use crate::tenant::remote_timeline_client::INITDB_PATH; use crate::tenant::storage_layer::DeltaLayer; use crate::tenant::storage_layer::ImageLayer; use crate::InitializationOrder; -use std::cmp::min; use std::collections::hash_map::Entry; use std::collections::BTreeSet; use std::collections::HashMap; @@ -2886,9 +2886,12 @@ impl Tenant { )) .map(|&x| x.1) .collect(); - timeline - .update_gc_info(branchpoints, cutoff, pitr, cancel, ctx) - .await?; + let cutoffs = timeline.find_gc_cutoffs(cutoff, pitr, cancel, ctx).await?; + + *timeline.gc_info.write().unwrap() = GcInfo { + retain_lsns: branchpoints, + cutoffs, + }; gc_timelines.push(timeline); } @@ -2977,7 +2980,7 @@ impl Tenant { // and then the planned GC cutoff { let gc_info = src_timeline.gc_info.read().unwrap(); - let cutoff = min(gc_info.pitr_cutoff, gc_info.horizon_cutoff); + let cutoff = gc_info.min_cutoff(); if start_lsn < cutoff { return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!( "invalid branch start lsn: less than planned GC cutoff {cutoff}" @@ -4513,18 +4516,20 @@ mod tests { } async fn bulk_insert_compact_gc( - timeline: Arc, + tenant: &Tenant, + timeline: &Arc, ctx: &RequestContext, lsn: Lsn, repeat: usize, key_count: usize, ) -> anyhow::Result<()> { let compact = true; - bulk_insert_maybe_compact_gc(timeline, ctx, lsn, repeat, key_count, compact).await + 
bulk_insert_maybe_compact_gc(tenant, timeline, ctx, lsn, repeat, key_count, compact).await } async fn bulk_insert_maybe_compact_gc( - timeline: Arc, + tenant: &Tenant, + timeline: &Arc, ctx: &RequestContext, mut lsn: Lsn, repeat: usize, @@ -4537,6 +4542,8 @@ mod tests { // Enforce that key range is monotonously increasing let mut keyspace = KeySpaceAccum::new(); + let cancel = CancellationToken::new(); + for _ in 0..repeat { for _ in 0..key_count { test_key.field6 = blknum; @@ -4558,24 +4565,19 @@ mod tests { blknum += 1; } - let cutoff = timeline.get_last_record_lsn(); - - timeline - .update_gc_info( - Vec::new(), - cutoff, - Duration::ZERO, - &CancellationToken::new(), - ctx, - ) - .await?; timeline.freeze_and_flush().await?; if compact { - timeline - .compact(&CancellationToken::new(), EnumSet::empty(), ctx) - .await?; + // this requires timeline to be &Arc + timeline.compact(&cancel, EnumSet::empty(), ctx).await?; } - timeline.gc().await?; + + // this doesn't really need to use the timeline_id target, but it is closer to what it + // originally was. + let res = tenant + .gc_iteration(Some(timeline.timeline_id), 0, Duration::ZERO, &cancel, ctx) + .await?; + + assert_eq!(res.layers_removed, 0, "this never removes anything"); } Ok(()) @@ -4594,7 +4596,7 @@ mod tests { .await?; let lsn = Lsn(0x10); - bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?; + bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?; Ok(()) } @@ -4625,7 +4627,7 @@ mod tests { .await?; let lsn = Lsn(0x10); - bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?; + bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?; let guard = tline.layers.read().await; guard.layer_map().dump(true, &ctx).await?; @@ -5079,6 +5081,7 @@ mod tests { .await?; const NUM_KEYS: usize = 1000; + let cancel = CancellationToken::new(); let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); @@ -5138,18 +5141,10 @@ mod tests { } // Perform a cycle of flush, and GC - let cutoff = tline.get_last_record_lsn(); - tline - .update_gc_info( - Vec::new(), - cutoff, - Duration::ZERO, - &CancellationToken::new(), - &ctx, - ) - .await?; tline.freeze_and_flush().await?; - tline.gc().await?; + tenant + .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) + .await?; } Ok(()) @@ -5170,6 +5165,8 @@ mod tests { let mut keyspace = KeySpaceAccum::new(); + let cancel = CancellationToken::new(); + // Track when each page was last modified. Used to assert that // a read sees the latest page version. 
let mut updated = [Lsn(0); NUM_KEYS]; @@ -5233,21 +5230,11 @@ mod tests { } // Perform a cycle of flush, compact, and GC - let cutoff = tline.get_last_record_lsn(); - tline - .update_gc_info( - Vec::new(), - cutoff, - Duration::ZERO, - &CancellationToken::new(), - &ctx, - ) - .await?; tline.freeze_and_flush().await?; - tline - .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) + tline.compact(&cancel, EnumSet::empty(), &ctx).await?; + tenant + .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) .await?; - tline.gc().await?; } Ok(()) @@ -5452,7 +5439,7 @@ mod tests { let lsn = Lsn(0x10); let compact = false; - bulk_insert_maybe_compact_gc(tline.clone(), &ctx, lsn, 50, 10000, compact).await?; + bulk_insert_maybe_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000, compact).await?; let test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let read_lsn = Lsn(u64::MAX - 1); diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index f521dfa55d..974c1091fd 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -192,7 +192,9 @@ pub(super) async fn gather_inputs( // than a space bound (horizon cutoff). This means that if someone drops a database and waits for their // PITR interval, they will see synthetic size decrease, even if we are still storing data inside // horizon_cutoff. - let mut next_gc_cutoff = gc_info.pitr_cutoff; + let pitr_cutoff = gc_info.cutoffs.pitr; + let horizon_cutoff = gc_info.cutoffs.horizon; + let mut next_gc_cutoff = pitr_cutoff; // If the caller provided a shorter retention period, use that instead of the GC cutoff. let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period { @@ -297,8 +299,8 @@ pub(super) async fn gather_inputs( last_record: last_record_lsn, // this is not used above, because it might not have updated recently enough latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(), - horizon_cutoff: gc_info.horizon_cutoff, - pitr_cutoff: gc_info.pitr_cutoff, + horizon_cutoff, + pitr_cutoff, next_gc_cutoff, retention_param_cutoff, }); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 22bfa53445..7aeb3a6a59 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -325,7 +325,7 @@ pub struct Timeline { // List of child timelines and their branch points. This is needed to avoid // garbage collecting data that is still needed by the child timelines. - pub gc_info: std::sync::RwLock, + pub(crate) gc_info: std::sync::RwLock, // It may change across major versions so for simplicity // keep it after running initdb for a timeline. @@ -409,33 +409,59 @@ pub struct WalReceiverInfo { pub last_received_msg_ts: u128, } -/// /// Information about how much history needs to be retained, needed by /// Garbage Collection. -/// -pub struct GcInfo { +#[derive(Default)] +pub(crate) struct GcInfo { /// Specific LSNs that are needed. /// /// Currently, this includes all points where child branches have /// been forked off from. In the future, could also include /// explicit user-defined snapshot points. - pub retain_lsns: Vec, + pub(crate) retain_lsns: Vec, - /// In addition to 'retain_lsns', keep everything newer than this - /// point. + /// The cutoff coordinates, which are combined by selecting the minimum. 
+ pub(crate) cutoffs: GcCutoffs, +} + +impl GcInfo { + pub(crate) fn min_cutoff(&self) -> Lsn { + self.cutoffs.select_min() + } +} + +/// The `GcInfo` component describing which Lsns need to be retained. +#[derive(Debug)] +pub(crate) struct GcCutoffs { + /// Keep everything newer than this point. /// /// This is calculated by subtracting 'gc_horizon' setting from /// last-record LSN /// /// FIXME: is this inclusive or exclusive? - pub horizon_cutoff: Lsn, + pub(crate) horizon: Lsn, /// In addition to 'retain_lsns' and 'horizon_cutoff', keep everything newer than this /// point. /// /// This is calculated by finding a number such that a record is needed for PITR /// if only if its LSN is larger than 'pitr_cutoff'. - pub pitr_cutoff: Lsn, + pub(crate) pitr: Lsn, +} + +impl Default for GcCutoffs { + fn default() -> Self { + Self { + horizon: Lsn::INVALID, + pitr: Lsn::INVALID, + } + } +} + +impl GcCutoffs { + fn select_min(&self) -> Lsn { + std::cmp::min(self.horizon, self.pitr) + } } /// An error happened in a get() operation. @@ -1155,7 +1181,7 @@ impl Timeline { " - keyspace={:?} lsn={}"), seq_err, keyspace, lsn) }, (Ok(_), Err(GetVectoredError::GetReadyAncestorError(GetReadyAncestorError::AncestorLsnTimeout(_)))) => { - // Sequential get runs after vectored get, so it is possible for the later + // Sequential get runs after vectored get, so it is possible for the later // to time out while waiting for its ancestor's Lsn to become ready and for the // former to succeed (it essentially has a doubled wait time). }, @@ -2097,11 +2123,7 @@ impl Timeline { write_lock: tokio::sync::Mutex::new(None), - gc_info: std::sync::RwLock::new(GcInfo { - retain_lsns: Vec::new(), - horizon_cutoff: Lsn(0), - pitr_cutoff: Lsn(0), - }), + gc_info: std::sync::RwLock::new(GcInfo::default()), latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()), initdb_lsn: metadata.initdb_lsn(), @@ -4383,7 +4405,7 @@ impl Timeline { Ok(()) } - /// Update information about which layer files need to be retained on + /// Find the Lsns above which layer files need to be retained on /// garbage collection. This is separate from actually performing the GC, /// and is updated more frequently, so that compaction can remove obsolete /// page versions more aggressively. @@ -4391,17 +4413,6 @@ impl Timeline { /// TODO: that's wishful thinking, compaction doesn't actually do that /// currently. /// - /// The caller specifies how much history is needed with the 3 arguments: - /// - /// retain_lsns: keep a version of each page at these LSNs - /// cutoff_horizon: also keep everything newer than this LSN - /// pitr: the time duration required to keep data for PITR - /// - /// The 'retain_lsns' list is currently used to prevent removing files that - /// are needed by child timelines. In the future, the user might be able to - /// name additional points in time to retain. The caller is responsible for - /// collecting that information. - /// /// The 'cutoff_horizon' point is used to retain recent versions that might still be /// needed by read-only nodes. (As of this writing, the caller just passes /// the latest LSN subtracted by a constant, and doesn't do anything smart @@ -4409,26 +4420,17 @@ impl Timeline { /// /// The 'pitr' duration is used to calculate a 'pitr_cutoff', which can be used to determine /// whether a record is needed for PITR. - /// - /// NOTE: This function holds a short-lived lock to protect the 'gc_info' - /// field, so that the three values passed as argument are stored - /// atomically. 
But the caller is responsible for ensuring that no new - /// branches are created that would need to be included in 'retain_lsns', - /// for example. The caller should hold `Tenant::gc_cs` lock to ensure - /// that. - /// #[instrument(skip_all, fields(timeline_id=%self.timeline_id))] - pub(super) async fn update_gc_info( + pub(super) async fn find_gc_cutoffs( &self, - retain_lsns: Vec, cutoff_horizon: Lsn, pitr: Duration, cancel: &CancellationToken, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> anyhow::Result { let _timer = self .metrics - .update_gc_info_histo + .find_gc_cutoffs_histo .start_timer() .record_on_drop(); @@ -4481,14 +4483,10 @@ impl Timeline { self.get_last_record_lsn() }; - // Grab the lock and update the values - *self.gc_info.write().unwrap() = GcInfo { - retain_lsns, - horizon_cutoff: cutoff_horizon, - pitr_cutoff, - }; - - Ok(()) + Ok(GcCutoffs { + horizon: cutoff_horizon, + pitr: pitr_cutoff, + }) } /// Garbage collect layer files on a timeline that are no longer needed. @@ -4517,8 +4515,8 @@ impl Timeline { let (horizon_cutoff, pitr_cutoff, retain_lsns) = { let gc_info = self.gc_info.read().unwrap(); - let horizon_cutoff = min(gc_info.horizon_cutoff, self.get_disk_consistent_lsn()); - let pitr_cutoff = gc_info.pitr_cutoff; + let horizon_cutoff = min(gc_info.cutoffs.horizon, self.get_disk_consistent_lsn()); + let pitr_cutoff = gc_info.cutoffs.pitr; let retain_lsns = gc_info.retain_lsns.clone(); (horizon_cutoff, pitr_cutoff, retain_lsns) }; From d76963691f556566bfe08581b7cc32cdca5ee800 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 3 May 2024 13:23:11 +0200 Subject: [PATCH 0681/1571] Increase Azure parallelism limit to 100 (#7597) After #5563 has been addressed we can now set the Azure strorage parallelism limit to 100 like it is for S3. Part of #5567 --- libs/remote_storage/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 32bc71c513..708662f20f 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -55,11 +55,11 @@ pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel}; /// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests /// pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100; -/// We set this a little bit low as we currently buffer the entire file into RAM +/// Set this limit analogously to the S3 limit /// /// Here, a limit of max 20k concurrent connections was noted. /// -pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 30; +pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 100; /// No limits on the client side, which currenltly means 1000 for AWS S3. /// pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option = None; From 37b1930b2f6cb072087cdc011d12a91342a4afc9 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 3 May 2024 12:40:09 +0100 Subject: [PATCH 0682/1571] tests: relax test download remote layers api (#7604) ## Problem This test triggers layer download failures on demand. It is possible to modify the failpoint during a `Timeline::get_vectored` right between the vectored read and it's validation read. This means that one of the reads can fail while the other one succeeds and vice versa. ## Summary of changes These errors are expected, so allow them to happen. 
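For illustration, the interleaving that produces these messages is roughly the following sketch (the failpoint name here is a placeholder, not the one the test actually uses; `client` is the pageserver HTTP client from the test):

```python
# Placeholder failpoint name -- illustrative only.
client.configure_failpoints(("layer-download-failure", "return"))  # downloads now fail
# pageserver: the vectored get needs an evicted layer, so its download fails
client.configure_failpoints(("layer-download-failure", "off"))     # downloads succeed again
# pageserver: the sequential validation get for the same request re-downloads and succeeds
# => "Vectored get failed ... but sequential get did not" is logged (or the mirror image),
#    which is why the test now whitelists both messages via allowed_errors.
```
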
--- test_runner/regress/test_ondemand_download.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index ba0d53704b..6c2556f6a2 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -333,6 +333,17 @@ def test_download_remote_layers_api( } ) + # This test triggers layer download failures on demand. It is possible to modify the failpoint + # during a `Timeline::get_vectored` right between the vectored read and it's validation read. + # This means that one of the reads can fail while the other one succeeds and vice versa. + # TODO(vlad): Remove this block once the vectored read path validation goes away. + env.pageserver.allowed_errors.extend( + [ + ".*initial_size_calculation.*Vectored get failed with downloading evicted layer file failed, but sequential get did not.*" + ".*initial_size_calculation.*Sequential get failed with downloading evicted layer file failed, but vectored get did not.*" + ] + ) + endpoint = env.endpoints.create_start("main") client = env.pageserver.http_client() From b7385bb016a3264a5110e6309fff9fd218e95a97 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 3 May 2024 12:52:43 +0100 Subject: [PATCH 0683/1571] storage_controller: fix non-timeline passthrough GETs (#7602) ## Problem We were matching on `/tenant/:tenant_id` and `/tenant/:tenant_id/timeline*`, but not non-timeline tenant sub-paths. There aren't many: this was only noticeable when using the synthetic_size endpoint by hand. ## Summary of changes - Change the wildcard from `/tenant/:tenant_id/timeline*` to `/tenant/:tenant_id/*` - Add test lines that exercise this --- storage_controller/src/http.rs | 11 ++++++----- test_runner/regress/test_storage_controller.py | 4 ++++ 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index f9a79afb0d..604ad6fbaa 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -912,7 +912,7 @@ pub fn make_router( RequestName("v1_tenant_timeline"), ) }) - // Tenant detail GET passthrough to shard zero + // Tenant detail GET passthrough to shard zero: .get("/v1/tenant/:tenant_id", |r| { tenant_service_handler( r, @@ -920,13 +920,14 @@ pub fn make_router( RequestName("v1_tenant_passthrough"), ) }) - // Timeline GET passthrough to shard zero. Note that the `*` in the URL is a wildcard: any future - // timeline GET APIs will be implicitly included. - .get("/v1/tenant/:tenant_id/timeline*", |r| { + // The `*` in the URL is a wildcard: any tenant/timeline GET APIs on the pageserver + // are implicitly exposed here. This must be last in the list to avoid + // taking precedence over other GET methods we might implement by hand. 
+ .get("/v1/tenant/:tenant_id/*", |r| { tenant_service_handler( r, handle_tenant_timeline_passthrough, - RequestName("v1_tenant_timeline_passthrough"), + RequestName("v1_tenant_passthrough"), ) }) } diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 63accebc7c..fdcb4cf9a4 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -230,6 +230,10 @@ def test_storage_controller_passthrough( } assert status["state"]["slug"] == "Active" + (synthetic_size, size_inputs) = client.tenant_size_and_modelinputs(env.initial_tenant) + assert synthetic_size > 0 + assert "segments" in size_inputs + env.storage_controller.consistency_check() From ed9a114bde38b971f49dd12b53163587477fdcc4 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 3 May 2024 14:57:26 +0300 Subject: [PATCH 0684/1571] fix: find gc cutoff points without holding Tenant::gc_cs (#7585) The current implementation of finding timeline gc cutoff Lsn(s) is done while holding `Tenant::gc_cs`. In recent incidents long create branch times were caused by holding the `Tenant::gc_cs` over extremely long `Timeline::find_lsn_by_timestamp`. The fix is to find the GC cutoff values before taking the `Tenant::gc_cs` lock. This change is safe to do because the GC cutoff values and the branch points have no dependencies on each other. In the case of `Timeline::find_gc_cutoff` taking a long time with this change, we should no longer see `Tenant::gc_cs` interfering with branch creation. Additionally, the `Tenant::refresh_gc_info` is now tolerant of timeline deletions (or any other failures to find the pitr_cutoff). This helps with the synthetic size calculation being constantly completed instead of having a break for a timely timeline deletion. Fixes: #7560 Fixes: #7587 --- pageserver/src/tenant.rs | 75 +++++++++++++++++++++---- pageserver/src/tenant/size.rs | 5 +- pageserver/src/tenant/timeline.rs | 2 + test_runner/regress/test_branching.py | 24 ++++++++ test_runner/regress/test_tenant_size.py | 67 +++++++++++++++++++++- 5 files changed, 157 insertions(+), 16 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index a6cd1471ff..8fa484e7b2 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -62,9 +62,9 @@ use self::timeline::uninit::TimelineCreateGuard; use self::timeline::uninit::TimelineExclusionError; use self::timeline::uninit::UninitializedTimeline; use self::timeline::EvictionTaskTenantState; -use self::timeline::GcInfo; use self::timeline::TimelineResources; use self::timeline::WaitLsnError; +use self::timeline::{GcCutoffs, GcInfo}; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::deletion_queue::DeletionQueueClient; @@ -2812,7 +2812,48 @@ impl Tenant { cancel: &CancellationToken, ctx: &RequestContext, ) -> anyhow::Result>> { - // grab mutex to prevent new timelines from being created here. + // before taking the gc_cs lock, do the heavier weight finding of gc_cutoff points for + // currently visible timelines. 
+ let timelines = self + .timelines + .lock() + .unwrap() + .values() + .filter(|tl| match target_timeline_id.as_ref() { + Some(target) => &tl.timeline_id == target, + None => true, + }) + .cloned() + .collect::>(); + + let mut gc_cutoffs: HashMap = + HashMap::with_capacity(timelines.len()); + + for timeline in timelines.iter() { + let cutoff = timeline + .get_last_record_lsn() + .checked_sub(horizon) + .unwrap_or(Lsn(0)); + + let res = timeline.find_gc_cutoffs(cutoff, pitr, cancel, ctx).await; + + match res { + Ok(cutoffs) => { + let old = gc_cutoffs.insert(timeline.timeline_id, cutoffs); + assert!(old.is_none()); + } + Err(e) => { + tracing::warn!(timeline_id = %timeline.timeline_id, "ignoring failure to find gc cutoffs: {e:#}"); + } + } + } + + if !self.is_active() { + anyhow::bail!("shutting down"); + } + + // grab mutex to prevent new timelines from being created here; avoid doing long operations + // because that will stall branch creation. let gc_cs = self.gc_cs.lock().await; // Scan all timelines. For each timeline, remember the timeline ID and @@ -2874,11 +2915,6 @@ impl Tenant { } } - let cutoff = timeline - .get_last_record_lsn() - .checked_sub(horizon) - .unwrap_or(Lsn(0)); - let branchpoints: Vec = all_branchpoints .range(( Included((timeline_id, Lsn(0))), @@ -2886,12 +2922,27 @@ impl Tenant { )) .map(|&x| x.1) .collect(); - let cutoffs = timeline.find_gc_cutoffs(cutoff, pitr, cancel, ctx).await?; - *timeline.gc_info.write().unwrap() = GcInfo { - retain_lsns: branchpoints, - cutoffs, - }; + { + let mut target = timeline.gc_info.write().unwrap(); + + match gc_cutoffs.remove(&timeline_id) { + Some(cutoffs) => { + *target = GcInfo { + retain_lsns: branchpoints, + cutoffs, + }; + } + None => { + // reasons for this being unavailable: + // - this timeline was created while we were finding cutoffs + // - lsn for timestamp search fails for this timeline repeatedly + // + // in both cases, refreshing the branchpoints is correct. + target.retain_lsns = branchpoints; + } + }; + } gc_timelines.push(timeline); } diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 974c1091fd..64fff5536c 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -118,9 +118,6 @@ pub(super) async fn gather_inputs( ctx: &RequestContext, ) -> anyhow::Result { // refresh is needed to update gc related pitr_cutoff and horizon_cutoff - // - // FIXME: if a single timeline is deleted while refresh gc info is ongoing, we will fail the - // whole computation. It does not make sense from the billing perspective. tenant .refresh_gc_info(cancel, ctx) .await @@ -221,6 +218,8 @@ pub(super) async fn gather_inputs( .map(|lsn| (lsn, LsnKind::BranchPoint)) .collect::>(); + drop(gc_info); + // Add branch points we collected earlier, just in case there were any that were // not present in retain_lsns. We will remove any duplicates below later. if let Some(this_branchpoints) = branchpoints.get(&timeline_id) { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 7aeb3a6a59..19228bc1f1 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4434,6 +4434,8 @@ impl Timeline { .start_timer() .record_on_drop(); + pausable_failpoint!("Timeline::find_gc_cutoffs-pausable"); + // First, calculate pitr_cutoff_timestamp and then convert it to LSN. 
// // Some unit tests depend on garbage-collection working even when diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index 5b69649007..9fe9f77fea 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -1,6 +1,7 @@ import random import threading import time +from concurrent.futures import ThreadPoolExecutor from typing import List import pytest @@ -405,6 +406,29 @@ def test_duplicate_creation(neon_env_builder: NeonEnvBuilder): assert len(ps_http.timeline_list(tenant_id=env.initial_tenant)) == 1 +def test_branching_while_stuck_find_gc_cutoffs(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + + client = env.pageserver.http_client() + + failpoint = "Timeline::find_gc_cutoffs-pausable" + + client.configure_failpoints((failpoint, "pause")) + + with ThreadPoolExecutor(max_workers=1) as exec: + completion = exec.submit(client.timeline_gc, env.initial_tenant, env.initial_timeline, None) + + wait_until_paused(env, failpoint) + + env.neon_cli.create_branch( + tenant_id=env.initial_tenant, ancestor_branch_name="main", new_branch_name="branch" + ) + + client.configure_failpoints((failpoint, "off")) + + completion.result() + + def wait_until_paused(env: NeonEnv, failpoint: str): found = False msg = f"at failpoint {failpoint}" diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index a588f6ab53..53da548524 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -1,4 +1,5 @@ import os +from concurrent.futures import ThreadPoolExecutor from pathlib import Path from typing import List, Tuple @@ -11,13 +12,15 @@ from fixtures.neon_fixtures import ( wait_for_last_flush_lsn, wait_for_wal_insert_lsn, ) -from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient from fixtures.pageserver.utils import ( + tenant_delete_wait_completed, timeline_delete_wait_completed, wait_until_tenant_active, ) from fixtures.pg_version import PgVersion from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.utils import wait_until def test_empty_tenant_size(neon_env_builder: NeonEnvBuilder): @@ -616,6 +619,68 @@ def test_get_tenant_size_with_multiple_branches( size_debug_file.write(size_debug) +def test_synthetic_size_while_deleting(neon_env_builder: NeonEnvBuilder): + """ + Makes sure synthetic size can still be calculated even if one of the + timelines is deleted or the tenant is deleted. 
+ """ + + env = neon_env_builder.init_start() + failpoint = "Timeline::find_gc_cutoffs-pausable" + client = env.pageserver.http_client() + + orig_size = client.tenant_size(env.initial_tenant) + + branch_id = env.neon_cli.create_branch( + tenant_id=env.initial_tenant, ancestor_branch_name="main", new_branch_name="branch" + ) + client.configure_failpoints((failpoint, "pause")) + + with ThreadPoolExecutor(max_workers=1) as exec: + completion = exec.submit(client.tenant_size, env.initial_tenant) + _, last_offset = wait_until( + 10, 1.0, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") + ) + + timeline_delete_wait_completed(client, env.initial_tenant, branch_id) + + client.configure_failpoints((failpoint, "off")) + size = completion.result() + + assert_size_approx_equal(orig_size, size) + + branch_id = env.neon_cli.create_branch( + tenant_id=env.initial_tenant, ancestor_branch_name="main", new_branch_name="branch2" + ) + client.configure_failpoints((failpoint, "pause")) + + with ThreadPoolExecutor(max_workers=1) as exec: + completion = exec.submit(client.tenant_size, env.initial_tenant) + wait_until( + 10, + 1.0, + lambda: env.pageserver.assert_log_contains( + f"at failpoint {failpoint}", offset=last_offset + ), + ) + + tenant_delete_wait_completed(client, env.initial_tenant, 10) + + client.configure_failpoints((failpoint, "off")) + + with pytest.raises( + PageserverApiException, match="Failed to refresh gc_info before gathering inputs" + ): + completion.result() + + # this happens on both cases + env.pageserver.allowed_errors.append( + ".*ignoring failure to find gc cutoffs: timeline shutting down.*" + ) + # this happens only in the case of deletion (http response logging) + env.pageserver.allowed_errors.append(".*Failed to refresh gc_info before gathering inputs.*") + + # Helper for tests that compare timeline_inputs # We don't want to compare the exact values, because they can be unstable # and cause flaky tests. So replace the values with useful invariants. From 8b4dd5dc277164dbb175319c39ee7b64ed9f9f91 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 3 May 2024 13:31:25 +0100 Subject: [PATCH 0685/1571] pageserver: jitter secondary periods (#7544) ## Problem After some time the load from heatmap uploads gets rather spiky. They're unintentionally synchronising. Chart (does this make a _boing_ sound in anyone else's head?): ![image](https://github.com/neondatabase/neon/assets/944640/18829fc8-c5b7-4739-9a9b-491b5d6fcade) ## Summary of changes - Add a helper `period_jitter` and apply a 5% jitter from downloader and heatmap_uploader when updating the next runtime at the end of an interation. - Refactor existing places that we pick a startup interval into `period_warmup`, so that the intent is obvious. 
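As a standalone illustration of the first bullet, the sketch below mirrors the `period_jitter` helper added in `scheduler.rs` further down; the `main` function and the 60-second period are example values, not part of the change:

```rust
use rand::Rng;
use std::time::Duration;

/// Same shape as the `period_jitter` helper added below: uniform in
/// (100-pct)%..(100+pct)% of `d`, i.e. a 5% jitter spans roughly a 10% window.
fn period_jitter(d: Duration, pct: u32) -> Duration {
    if d == Duration::ZERO {
        d
    } else {
        rand::thread_rng().gen_range((d * (100 - pct)) / 100..(d * (100 + pct)) / 100)
    }
}

fn main() {
    // With a 60s upload/download period, re-arm times land anywhere in ~57s..63s,
    // so tenants that happened to start in lockstep drift apart over a few cycles
    // instead of producing the synchronized spikes shown in the chart above.
    let period = Duration::from_secs(60);
    for _ in 0..3 {
        println!("next run in {:?}", period_jitter(period, 5));
    }
}
```
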
--- pageserver/src/tenant/secondary/downloader.rs | 16 +++++++------- .../src/tenant/secondary/heatmap_uploader.rs | 22 +++++++++---------- pageserver/src/tenant/secondary/scheduler.rs | 21 ++++++++++++++++++ 3 files changed, 39 insertions(+), 20 deletions(-) diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 8a987b5ade..fb8907b5a8 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -31,7 +31,10 @@ use crate::{ use super::{ heatmap::HeatMapLayer, - scheduler::{self, Completion, JobGenerator, SchedulingResult, TenantBackgroundJobs}, + scheduler::{ + self, period_jitter, period_warmup, Completion, JobGenerator, SchedulingResult, + TenantBackgroundJobs, + }, SecondaryTenant, }; @@ -45,7 +48,6 @@ use chrono::format::{DelayedFormat, StrftimeItems}; use futures::Future; use pageserver_api::models::SecondaryProgress; use pageserver_api::shard::TenantShardId; -use rand::Rng; use remote_storage::{DownloadError, Etag, GenericRemoteStorage}; use tokio_util::sync::CancellationToken; @@ -274,7 +276,7 @@ impl JobGenerator SchedulingResult { @@ -305,11 +307,9 @@ impl JobGenerator let state = self .tenants .entry(*tenant.get_tenant_shard_id()) - .or_insert_with(|| { - let jittered_period = rand::thread_rng().gen_range(Duration::ZERO..period); - - UploaderTenantState { - tenant: Arc::downgrade(&tenant), - last_upload: None, - next_upload: Some(now.checked_add(jittered_period).unwrap_or(now)), - last_digest: None, - } + .or_insert_with(|| UploaderTenantState { + tenant: Arc::downgrade(&tenant), + last_upload: None, + next_upload: Some(now.checked_add(period_warmup(period)).unwrap_or(now)), + last_digest: None, }); // Decline to do the upload if insufficient time has passed @@ -274,7 +272,7 @@ impl JobGenerator let next_upload = tenant .get_heatmap_period() - .and_then(|period| now.checked_add(period)); + .and_then(|period| now.checked_add(period_jitter(period, 5))); WriteComplete { tenant_shard_id: *tenant.get_tenant_shard_id(), diff --git a/pageserver/src/tenant/secondary/scheduler.rs b/pageserver/src/tenant/secondary/scheduler.rs index 3bd7be782e..3d042f4513 100644 --- a/pageserver/src/tenant/secondary/scheduler.rs +++ b/pageserver/src/tenant/secondary/scheduler.rs @@ -1,4 +1,5 @@ use futures::Future; +use rand::Rng; use std::{ collections::HashMap, marker::PhantomData, @@ -19,6 +20,26 @@ use super::{CommandRequest, CommandResponse}; const MAX_SCHEDULING_INTERVAL: Duration = Duration::from_secs(10); const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_secs(1); +/// Jitter a Duration by an integer percentage. Returned values are uniform +/// in the range 100-pct..100+pct (i.e. a 5% jitter is 5% either way: a ~10% range) +pub(super) fn period_jitter(d: Duration, pct: u32) -> Duration { + if d == Duration::ZERO { + d + } else { + rand::thread_rng().gen_range((d * (100 - pct)) / 100..(d * (100 + pct)) / 100) + } +} + +/// When a periodic task first starts, it should wait for some time in the range 0..period, so +/// that starting many such tasks at the same time spreads them across the time range. +pub(super) fn period_warmup(period: Duration) -> Duration { + if period == Duration::ZERO { + period + } else { + rand::thread_rng().gen_range(Duration::ZERO..period) + } +} + /// Scheduling helper for background work across many tenants. 
/// /// Systems that need to run background work across many tenants may use this type From 426598cf76d5cc77471b000b9d9880df5059cfa3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 3 May 2024 15:59:28 +0200 Subject: [PATCH 0686/1571] Update rust to 1.78.0 (#7598) We keep the practice of keeping the compiler up to date, pointing to the latest release. This is done by many other projects in the Rust ecosystem as well. Release notes: https://blog.rust-lang.org/2024/05/02/Rust-1.78.0.html Prior update was in #7198 --- Dockerfile.build-tools | 4 +-- compute_tools/src/spec.rs | 2 +- control_plane/src/local_env.rs | 5 ++- libs/pageserver_api/src/shard.rs | 6 ++-- pageserver/src/pgdatadir_mapping.rs | 33 ++++++++++--------- pageserver/src/tenant/layer_map.rs | 12 +++---- .../walreceiver/connection_manager.rs | 4 +-- .../virtual_file/owned_buffers_io/write.rs | 7 ++-- rust-toolchain.toml | 2 +- 9 files changed, 39 insertions(+), 36 deletions(-) diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index a082f15c34..19739cc1f8 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -65,7 +65,7 @@ RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/ && mv s5cmd /usr/local/bin/s5cmd # LLVM -ENV LLVM_VERSION=17 +ENV LLVM_VERSION=18 RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ && echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \ && apt update \ @@ -141,7 +141,7 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.77.0 +ENV RUSTC_VERSION=1.78.0 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \ diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 269177ee16..3a6e18b638 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -490,7 +490,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { "rename_db" => { let new_name = op.new_name.as_ref().unwrap(); - if existing_dbs.get(&op.name).is_some() { + if existing_dbs.contains_key(&op.name) { let query: String = format!( "ALTER DATABASE {} RENAME TO {}", op.name.pg_quote(), diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 59b587389c..6437d04ec8 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -382,7 +382,10 @@ impl LocalEnv { // Find neon binaries. if env.neon_distrib_dir == Path::new("") { - env.neon_distrib_dir = env::current_exe()?.parent().unwrap().to_owned(); + env::current_exe()? + .parent() + .unwrap() + .clone_into(&mut env.neon_distrib_dir); } if env.pageservers.is_empty() { diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index d769b2fd2f..ff6d3d91b6 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -97,7 +97,7 @@ impl ShardCount { /// The internal value of a ShardCount may be zero, which means "1 shard, but use /// legacy format for TenantShardId that excludes the shard suffix", also known - /// as `TenantShardId::unsharded`. + /// as [`TenantShardId::unsharded`]. /// /// This method returns the actual number of shards, i.e. if our internal value is /// zero, we return 1 (unsharded tenants have 1 shard). 
@@ -116,7 +116,9 @@ impl ShardCount { self.0 } - /// + /// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but + /// uses the legacy format for `TenantShardId`. See also the documentation for + /// [`Self::count`]. pub fn is_unsharded(&self) -> bool { self.0 == 0 } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 015191b875..12314c5961 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -279,7 +279,7 @@ impl Timeline { match RelDirectory::des(&buf).context("deserialization failure") { Ok(dir) => { - let exists = dir.rels.get(&(tag.relnode, tag.forknum)).is_some(); + let exists = dir.rels.contains(&(tag.relnode, tag.forknum)); Ok(exists) } Err(e) => Err(PageReconstructError::from(e)), @@ -379,7 +379,7 @@ impl Timeline { match SlruSegmentDirectory::des(&buf).context("deserialization failure") { Ok(dir) => { - let exists = dir.segments.get(&segno).is_some(); + let exists = dir.segments.contains(&segno); Ok(exists) } Err(e) => Err(PageReconstructError::from(e)), @@ -1143,21 +1143,22 @@ impl<'a> DatadirModification<'a> { let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?) .context("deserialize db")?; let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); - let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() { - // Didn't exist. Update dbdir - dbdir.dbdirs.insert((rel.spcnode, rel.dbnode), false); - let buf = DbDirectory::ser(&dbdir).context("serialize db")?; - self.pending_directory_entries - .push((DirectoryKind::Db, dbdir.dbdirs.len())); - self.put(DBDIR_KEY, Value::Image(buf.into())); + let mut rel_dir = + if let hash_map::Entry::Vacant(e) = dbdir.dbdirs.entry((rel.spcnode, rel.dbnode)) { + // Didn't exist. Update dbdir + e.insert(false); + let buf = DbDirectory::ser(&dbdir).context("serialize db")?; + self.pending_directory_entries + .push((DirectoryKind::Db, dbdir.dbdirs.len())); + self.put(DBDIR_KEY, Value::Image(buf.into())); - // and create the RelDirectory - RelDirectory::default() - } else { - // reldir already exists, fetch it - RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?) - .context("deserialize db")? - }; + // and create the RelDirectory + RelDirectory::default() + } else { + // reldir already exists, fetch it + RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?) + .context("deserialize db")? 
+ }; // Add the new relation to the rel directory entry, and write it back if !rel_dir.rels.insert((rel.relnode, rel.forknum)) { diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 3c4de8fe4d..2724a5cc07 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -588,7 +588,7 @@ impl LayerMap { let kr = Key::from_i128(current_key)..Key::from_i128(change_key); coverage.push((kr, current_val.take())); current_key = change_key; - current_val = change_val.clone(); + current_val.clone_from(&change_val); } // Add the final interval @@ -672,12 +672,12 @@ impl LayerMap { // Loop through the delta coverage and recurse on each part for (change_key, change_val) in version.delta_coverage.range(start..end) { // If there's a relevant delta in this part, add 1 and recurse down - if let Some(val) = current_val { + if let Some(val) = ¤t_val { if val.get_lsn_range().end > lsn.start { let kr = Key::from_i128(current_key)..Key::from_i128(change_key); let lr = lsn.start..val.get_lsn_range().start; if !kr.is_empty() { - let base_count = Self::is_reimage_worthy(&val, key) as usize; + let base_count = Self::is_reimage_worthy(val, key) as usize; let new_limit = limit.map(|l| l - base_count); let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit); max_stacked_deltas = std::cmp::max( @@ -689,17 +689,17 @@ impl LayerMap { } current_key = change_key; - current_val = change_val.clone(); + current_val.clone_from(&change_val); } // Consider the last part - if let Some(val) = current_val { + if let Some(val) = ¤t_val { if val.get_lsn_range().end > lsn.start { let kr = Key::from_i128(current_key)..Key::from_i128(end); let lr = lsn.start..val.get_lsn_range().start; if !kr.is_empty() { - let base_count = Self::is_reimage_worthy(&val, key) as usize; + let base_count = Self::is_reimage_worthy(val, key) as usize; let new_limit = limit.map(|l| l - base_count); let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit); max_stacked_deltas = std::cmp::max( diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 7ef063c4e5..991e4ac045 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -1535,7 +1535,7 @@ mod tests { let harness = TenantHarness::create("switch_to_same_availability_zone")?; let mut state = dummy_state(&harness).await; - state.conf.availability_zone = test_az.clone(); + state.conf.availability_zone.clone_from(&test_az); let current_lsn = Lsn(100_000).align(); let now = Utc::now().naive_utc(); @@ -1568,7 +1568,7 @@ mod tests { // We have another safekeeper with the same commit_lsn, and it have the same availability zone as // the current pageserver. 
let mut same_az_sk = dummy_broker_sk_timeline(current_lsn.0, "same_az", now); - same_az_sk.timeline.availability_zone = test_az.clone(); + same_az_sk.timeline.availability_zone.clone_from(&test_az); state.wal_stream_candidates = HashMap::from([ ( diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index ac5169508f..885a9221c5 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -76,14 +76,11 @@ where } #[cfg_attr(target_os = "macos", allow(dead_code))] - pub async fn write_buffered( + pub async fn write_buffered( &mut self, chunk: Slice, ctx: &RequestContext, - ) -> std::io::Result<(usize, S)> - where - S: IoBuf + Send, - { + ) -> std::io::Result<(usize, S)> { let chunk_len = chunk.len(); // avoid memcpy for the middle of the chunk if chunk.len() >= self.buf().cap() { diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 50a5a4185b..214de0a77d 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.77.0" +channel = "1.78.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. # https://rust-lang.github.io/rustup/concepts/profiles.html From ce0ddd749c5945f0660ec0f9327c8aacc77f4666 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 3 May 2024 16:05:00 +0200 Subject: [PATCH 0687/1571] test_runner: remove unused `NeonPageserver.config_override` field (#7605) refs https://github.com/neondatabase/neon/issues/7555 --- test_runner/fixtures/neon_fixtures.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 19aa4cc886..90884ad7f8 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1104,7 +1104,6 @@ class NeonEnv: self, ps_id, port=pageserver_port, - config_override=self.pageserver_config_override, ) ) cfg["pageservers"].append(ps_cfg) @@ -2373,15 +2372,12 @@ class NeonPageserver(PgProtocol, LogUtils): TEMP_FILE_SUFFIX = "___temp" - def __init__( - self, env: NeonEnv, id: int, port: PageserverPort, config_override: Optional[str] = None - ): + def __init__(self, env: NeonEnv, id: int, port: PageserverPort): super().__init__(host="localhost", port=port.pg, user="cloud_admin") self.env = env self.id = id self.running = False self.service_port = port - self.config_override = config_override self.version = env.get_binary_version("pageserver") self.logfile = self.workdir / "pageserver.log" # After a test finishes, we will scrape the log to see if there are any From b5a6e68e686128652b491aa3fb6cfcfdc0a611ad Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 3 May 2024 15:28:23 +0100 Subject: [PATCH 0688/1571] storage controller: check warmth of secondary before doing proactive migration (#7583) ## Problem The logic in Service::optimize_all would sometimes choose to migrate a tenant to a secondary location that was only recently created, resulting in Reconciler::live_migrate hitting its 5 minute timeout warming up the location, and proceeding to attach a tenant to a location that doesn't have a warm enough local set of layer files for good performance. Closes: #7532 ## Summary of changes - Add a pageserver API for checking download progress of a secondary location - During `optimize_all`, connect to pageservers of candidate optimization secondary locations, and check they are warm. 
- During shard split, do heatmap uploads and start secondary downloads, so that the new shards' secondary locations start downloading ASAP, rather than waiting minutes for background downloads to kick in. I have intentionally not implemented this by continuously reading the status of locations, to avoid dealing with the scale challenge of efficiently polling & updating 10k-100k locations status. If we implement that in the future, then this code can be simplified to act based on latest state of a location rather than fetching it inline during optimize_all. --- pageserver/client/src/mgmt_api.rs | 28 ++ pageserver/src/http/routes.rs | 24 ++ storage_controller/src/pageserver_client.rs | 21 ++ storage_controller/src/service.rs | 352 ++++++++++++++++++-- storage_controller/src/tenant_shard.rs | 87 +++-- test_runner/regress/test_sharding.py | 22 +- 6 files changed, 471 insertions(+), 63 deletions(-) diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index bc66c5c6e1..6df8b2170d 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -284,6 +284,34 @@ impl Client { Ok((status, progress)) } + pub async fn tenant_secondary_status( + &self, + tenant_shard_id: TenantShardId, + ) -> Result { + let path = reqwest::Url::parse(&format!( + "{}/v1/tenant/{}/secondary/status", + self.mgmt_api_endpoint, tenant_shard_id + )) + .expect("Cannot build URL"); + + self.request(Method::GET, path, ()) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + + pub async fn tenant_heatmap_upload(&self, tenant_id: TenantShardId) -> Result<()> { + let path = reqwest::Url::parse(&format!( + "{}/v1/tenant/{}/heatmap_upload", + self.mgmt_api_endpoint, tenant_id + )) + .expect("Cannot build URL"); + + self.request(Method::POST, path, ()).await?; + Ok(()) + } + pub async fn location_config( &self, tenant_shard_id: TenantShardId, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index ae1e7aac78..cf526940f4 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2160,6 +2160,27 @@ async fn secondary_download_handler( json_response(status, progress) } +async fn secondary_status_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let state = get_state(&request); + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + + let Some(secondary_tenant) = state + .tenant_manager + .get_secondary_tenant_shard(tenant_shard_id) + else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Shard {} not found", tenant_shard_id).into(), + )); + }; + + let progress = secondary_tenant.progress.lock().unwrap().clone(); + + json_response(StatusCode::OK, progress) +} + async fn handler_404(_: Request) -> Result, ApiError> { json_response( StatusCode::NOT_FOUND, @@ -2521,6 +2542,9 @@ pub fn make_router( .put("/v1/deletion_queue/flush", |r| { api_handler(r, deletion_queue_flush) }) + .get("/v1/tenant/:tenant_shard_id/secondary/status", |r| { + api_handler(r, secondary_status_handler) + }) .post("/v1/tenant/:tenant_shard_id/secondary/download", |r| { api_handler(r, secondary_download_handler) }) diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index 0cea205599..25b6b67e12 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -114,6 +114,27 @@ impl PageserverClient { ) } + pub(crate) async fn tenant_secondary_status( + &self, + tenant_shard_id: 
TenantShardId, + ) -> Result { + measured_request!( + "tenant_secondary_status", + crate::metrics::Method::Get, + &self.node_id_label, + self.inner.tenant_secondary_status(tenant_shard_id).await + ) + } + + pub(crate) async fn tenant_heatmap_upload(&self, tenant_id: TenantShardId) -> Result<()> { + measured_request!( + "tenant_heatmap_upload", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner.tenant_heatmap_upload(tenant_id).await + ) + } + pub(crate) async fn location_config( &self, tenant_shard_id: TenantShardId, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index eaff87d1ce..d3a53066c9 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -13,7 +13,9 @@ use crate::{ persistence::{AbortShardSplitStatus, TenantFilter}, reconciler::{ReconcileError, ReconcileUnits}, scheduler::{ScheduleContext, ScheduleMode}, - tenant_shard::ReconcileNeeded, + tenant_shard::{ + MigrateAttachment, ReconcileNeeded, ScheduleOptimization, ScheduleOptimizationAction, + }, }; use anyhow::Context; use control_plane::storage_controller::{ @@ -709,7 +711,7 @@ impl Service { let reconciles_spawned = self.reconcile_all(); if reconciles_spawned == 0 { // Run optimizer only when we didn't find any other work to do - self.optimize_all(); + self.optimize_all().await; } } _ = self.cancel.cancelled() => return @@ -2639,6 +2641,45 @@ impl Service { Ok(results) } + /// Concurrently invoke a pageserver API call on many shards at once + pub(crate) async fn tenant_for_shards_api( + &self, + locations: Vec<(TenantShardId, Node)>, + op: O, + warn_threshold: u32, + max_retries: u32, + timeout: Duration, + cancel: &CancellationToken, + ) -> Vec> + where + O: Fn(TenantShardId, PageserverClient) -> F + Copy, + F: std::future::Future>, + { + let mut futs = FuturesUnordered::new(); + let mut results = Vec::with_capacity(locations.len()); + + for (tenant_shard_id, node) in locations { + futs.push(async move { + node.with_client_retries( + |client| op(tenant_shard_id, client), + &self.config.jwt_token, + warn_threshold, + max_retries, + timeout, + cancel, + ) + .await + }); + } + + while let Some(r) = futs.next().await { + let r = r.unwrap_or(Err(mgmt_api::Error::Cancelled)); + results.push(r); + } + + results + } + pub(crate) async fn tenant_timeline_delete( &self, tenant_id: TenantId, @@ -3088,11 +3129,14 @@ impl Service { ) -> ( TenantShardSplitResponse, Vec<(TenantShardId, NodeId, ShardStripeSize)>, + Vec, ) { let mut response = TenantShardSplitResponse { new_shards: Vec::new(), }; let mut child_locations = Vec::new(); + let mut waiters = Vec::new(); + { let mut locked = self.inner.write().unwrap(); @@ -3171,14 +3215,112 @@ impl Service { tracing::warn!("Failed to schedule child shard {child}: {e}"); } // In the background, attach secondary locations for the new shards - self.maybe_reconcile_shard(&mut child_state, nodes); + if let Some(waiter) = self.maybe_reconcile_shard(&mut child_state, nodes) { + waiters.push(waiter); + } tenants.insert(child, child_state); response.new_shards.push(child); } } + (response, child_locations, waiters) + } + } - (response, child_locations) + async fn tenant_shard_split_start_secondaries( + &self, + tenant_id: TenantId, + waiters: Vec, + ) { + // Wait for initial reconcile of child shards, this creates the secondary locations + if let Err(e) = self.await_waiters(waiters, RECONCILE_TIMEOUT).await { + // This is not a failure to split: it's some issue reconciling the new child shards, perhaps + // their 
secondaries couldn't be attached. + tracing::warn!("Failed to reconcile after split: {e}"); + return; + } + + // Take the state lock to discover the attached & secondary intents for all shards + let (attached, secondary) = { + let locked = self.inner.read().unwrap(); + let mut attached = Vec::new(); + let mut secondary = Vec::new(); + + for (tenant_shard_id, shard) in + locked.tenants.range(TenantShardId::tenant_range(tenant_id)) + { + let Some(node_id) = shard.intent.get_attached() else { + // Unexpected. Race with a PlacementPolicy change? + tracing::warn!( + "No attached node on {tenant_shard_id} immediately after shard split!" + ); + continue; + }; + + let Some(secondary_node_id) = shard.intent.get_secondary().first() else { + // No secondary location. Nothing for us to do. + continue; + }; + + let attached_node = locked + .nodes + .get(node_id) + .expect("Pageservers may not be deleted while referenced"); + + let secondary_node = locked + .nodes + .get(secondary_node_id) + .expect("Pageservers may not be deleted while referenced"); + + attached.push((*tenant_shard_id, attached_node.clone())); + secondary.push((*tenant_shard_id, secondary_node.clone())); + } + (attached, secondary) + }; + + if secondary.is_empty() { + // No secondary locations; nothing for us to do + return; + } + + for result in self + .tenant_for_shards_api( + attached, + |tenant_shard_id, client| async move { + client.tenant_heatmap_upload(tenant_shard_id).await + }, + 1, + 1, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await + { + if let Err(e) = result { + tracing::warn!("Error calling heatmap upload after shard split: {e}"); + return; + } + } + + for result in self + .tenant_for_shards_api( + secondary, + |tenant_shard_id, client| async move { + client + .tenant_secondary_download(tenant_shard_id, Some(Duration::ZERO)) + .await + }, + 1, + 1, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await + { + if let Err(e) = result { + tracing::warn!("Error calling secondary download after shard split: {e}"); + return; + } } } @@ -3212,8 +3354,8 @@ impl Service { .do_tenant_shard_split(tenant_id, shard_split_params) .await; - match r { - Ok(r) => Ok(r), + let (response, waiters) = match r { + Ok(r) => r, Err(e) => { // Split might be part-done, we must do work to abort it. tracing::warn!("Enqueuing background abort of split on {tenant_id}"); @@ -3226,9 +3368,17 @@ impl Service { }) // Ignore error sending: that just means we're shutting down: aborts are ephemeral so it's fine to drop it. .ok(); - Err(e) + return Err(e); } - } + }; + + // The split is now complete. As an optimization, we will trigger all the child shards to upload + // a heatmap immediately, and all their secondary locations to start downloading: this avoids waiting + // for the background heatmap/download interval before secondaries get warm enough to migrate shards + // in [`Self::optimize_all`] + self.tenant_shard_split_start_secondaries(tenant_id, waiters) + .await; + Ok(response) } fn prepare_tenant_shard_split( @@ -3378,7 +3528,7 @@ impl Service { &self, tenant_id: TenantId, params: ShardSplitParams, - ) -> Result { + ) -> Result<(TenantShardSplitResponse, Vec), ApiError> { // FIXME: we have dropped self.inner lock, and not yet written anything to the database: another // request could occur here, deleting or mutating the tenant. 
begin_shard_split checks that the // parent shards exist as expected, but it would be neater to do the above pre-checks within the @@ -3580,7 +3730,7 @@ impl Service { )); // Replace all the shards we just split with their children: this phase is infallible. - let (response, child_locations) = + let (response, child_locations, waiters) = self.tenant_shard_split_commit_inmem(tenant_id, new_shard_count, new_stripe_size); // Send compute notifications for all the new shards @@ -3607,7 +3757,7 @@ impl Service { } } - Ok(response) + Ok((response, waiters)) } pub(crate) async fn tenant_shard_migrate( @@ -4373,25 +4523,68 @@ impl Service { /// To put it more briefly: whereas the scheduler respects soft constraints in a ScheduleContext at /// the time of scheduling, this function looks for cases where a better-scoring location is available /// according to those same soft constraints. - fn optimize_all(&self) -> usize { - let mut locked = self.inner.write().unwrap(); - let (nodes, tenants, scheduler) = locked.parts_mut(); - let pageservers = nodes.clone(); - - let mut schedule_context = ScheduleContext::default(); - - let mut reconciles_spawned = 0; - - let mut tenant_shards: Vec<&TenantShard> = Vec::new(); - + async fn optimize_all(&self) -> usize { // Limit on how many shards' optmizations each call to this function will execute. Combined // with the frequency of background calls, this acts as an implicit rate limit that runs a small // trickle of optimizations in the background, rather than executing a large number in parallel // when a change occurs. - const MAX_OPTIMIZATIONS_PER_PASS: usize = 2; + const MAX_OPTIMIZATIONS_EXEC_PER_PASS: usize = 2; + + // Synchronous prepare: scan shards for possible scheduling optimizations + let candidate_work = self.optimize_all_plan(); + let candidate_work_len = candidate_work.len(); + + // Asynchronous validate: I/O to pageservers to make sure shards are in a good state to apply validation + let validated_work = self.optimize_all_validate(candidate_work).await; + + let was_work_filtered = validated_work.len() != candidate_work_len; + + // Synchronous apply: update the shards' intent states according to validated optimisations + let mut reconciles_spawned = 0; + let mut optimizations_applied = 0; + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + for (tenant_shard_id, optimization) in validated_work { + let Some(shard) = tenants.get_mut(&tenant_shard_id) else { + // Shard was dropped between planning and execution; + continue; + }; + if shard.apply_optimization(scheduler, optimization) { + optimizations_applied += 1; + if self.maybe_reconcile_shard(shard, nodes).is_some() { + reconciles_spawned += 1; + } + } + + if optimizations_applied >= MAX_OPTIMIZATIONS_EXEC_PER_PASS { + break; + } + } + + if was_work_filtered { + // If we filtered any work out during validation, ensure we return a nonzero value to indicate + // to callers that the system is not in a truly quiet state, it's going to do some work as soon + // as these validations start passing. 
+ reconciles_spawned = std::cmp::max(reconciles_spawned, 1); + } + + reconciles_spawned + } + + fn optimize_all_plan(&self) -> Vec<(TenantShardId, ScheduleOptimization)> { + let mut schedule_context = ScheduleContext::default(); + + let mut tenant_shards: Vec<&TenantShard> = Vec::new(); + + // How many candidate optimizations we will generate, before evaluating them for readniess: setting + // this higher than the execution limit gives us a chance to execute some work even if the first + // few optimizations we find are not ready. + const MAX_OPTIMIZATIONS_PLAN_PER_PASS: usize = 8; let mut work = Vec::new(); + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); for (tenant_shard_id, shard) in tenants.iter() { if tenant_shard_id.is_shard_zero() { // Reset accumulators on the first shard in a tenant @@ -4400,7 +4593,7 @@ impl Service { tenant_shards.clear(); } - if work.len() >= MAX_OPTIMIZATIONS_PER_PASS { + if work.len() >= MAX_OPTIMIZATIONS_PLAN_PER_PASS { break; } @@ -4472,18 +4665,105 @@ impl Service { } } - for (tenant_shard_id, optimization) in work { - let shard = tenants - .get_mut(&tenant_shard_id) - .expect("We held lock from place we got this ID"); - shard.apply_optimization(scheduler, optimization); + work + } - if self.maybe_reconcile_shard(shard, &pageservers).is_some() { - reconciles_spawned += 1; + async fn optimize_all_validate( + &self, + candidate_work: Vec<(TenantShardId, ScheduleOptimization)>, + ) -> Vec<(TenantShardId, ScheduleOptimization)> { + // Take a clone of the node map to use outside the lock in async validation phase + let validation_nodes = { self.inner.read().unwrap().nodes.clone() }; + + let mut want_secondary_status = Vec::new(); + + // Validate our plans: this is an async phase where we may do I/O to pageservers to + // check that the state of locations is acceptable to run the optimization, such as + // checking that a secondary location is sufficiently warmed-up to cleanly cut over + // in a live migration. + let mut validated_work = Vec::new(); + for (tenant_shard_id, optimization) in candidate_work { + match optimization.action { + ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: _, + new_attached_node_id, + }) => { + match validation_nodes.get(&new_attached_node_id) { + None => { + // Node was dropped between planning and validation + } + Some(node) => { + if !node.is_available() { + tracing::info!("Skipping optimization migration of {tenant_shard_id} to {new_attached_node_id} because node unavailable"); + } else { + // Accumulate optimizations that require fetching secondary status, so that we can execute these + // remote API requests concurrently. + want_secondary_status.push(( + tenant_shard_id, + node.clone(), + optimization, + )); + } + } + } + } + ScheduleOptimizationAction::ReplaceSecondary(_) => { + // No extra checks needed to replace a secondary: this does not interrupt client access + validated_work.push((tenant_shard_id, optimization)) + } + }; + } + + // Call into pageserver API to find out if the destination secondary location is warm enough for a reasonably smooth migration: we + // do this so that we avoid spawning a Reconciler that would have to wait minutes/hours for a destination to warm up: that reconciler + // would hold a precious reconcile semaphore unit the whole time it was waiting for the destination to warm up. 
+ let results = self + .tenant_for_shards_api( + want_secondary_status + .iter() + .map(|i| (i.0, i.1.clone())) + .collect(), + |tenant_shard_id, client| async move { + client.tenant_secondary_status(tenant_shard_id).await + }, + 1, + 1, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await; + + for ((tenant_shard_id, node, optimization), secondary_status) in + want_secondary_status.into_iter().zip(results.into_iter()) + { + match secondary_status { + Err(e) => { + tracing::info!("Skipping migration of {tenant_shard_id} to {node}, error querying secondary: {e}"); + } + Ok(progress) => { + // We require secondary locations to have less than 10GiB of downloads pending before we will use + // them in an optimization + const DOWNLOAD_FRESHNESS_THRESHOLD: u64 = 10 * 1024 * 1024 * 1024; + + if progress.bytes_total == 0 + || progress.bytes_total < DOWNLOAD_FRESHNESS_THRESHOLD + && progress.bytes_downloaded != progress.bytes_total + || progress.bytes_total - progress.bytes_downloaded + > DOWNLOAD_FRESHNESS_THRESHOLD + { + tracing::info!("Skipping migration of {tenant_shard_id} to {node} because secondary isn't ready: {progress:?}"); + } else { + // Location looks ready: proceed + tracing::info!( + "{tenant_shard_id} secondary on {node} is warm enough for migration: {progress:?}" + ); + validated_work.push((tenant_shard_id, optimization)) + } + } } } - reconciles_spawned + validated_work } /// Useful for tests: run whatever work a background [`Self::reconcile_all`] would have done, but @@ -4491,10 +4771,12 @@ impl Service { /// put the system into a quiescent state where future background reconciliations won't do anything. pub(crate) async fn reconcile_all_now(&self) -> Result { let reconciles_spawned = self.reconcile_all(); - if reconciles_spawned == 0 { + let reconciles_spawned = if reconciles_spawned == 0 { // Only optimize when we are otherwise idle - self.optimize_all(); - } + self.optimize_all().await + } else { + reconciles_spawned + }; let waiters = { let mut waiters = Vec::new(); diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index ffbf2c4b7a..dda17f9887 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -325,18 +325,28 @@ pub(crate) struct ReplaceSecondary { #[derive(Eq, PartialEq, Debug)] pub(crate) struct MigrateAttachment { - old_attached_node_id: NodeId, - new_attached_node_id: NodeId, + pub(crate) old_attached_node_id: NodeId, + pub(crate) new_attached_node_id: NodeId, } #[derive(Eq, PartialEq, Debug)] -pub(crate) enum ScheduleOptimization { +pub(crate) enum ScheduleOptimizationAction { // Replace one of our secondary locations with a different node ReplaceSecondary(ReplaceSecondary), // Migrate attachment to an existing secondary location MigrateAttachment(MigrateAttachment), } +#[derive(Eq, PartialEq, Debug)] +pub(crate) struct ScheduleOptimization { + // What was the reconcile sequence when we generated this optimization? The optimization + // should only be applied if the shard's sequence is still at this value, in case other changes + // happened between planning the optimization and applying it. + sequence: Sequence, + + pub(crate) action: ScheduleOptimizationAction, +} + impl ReconcilerWaiter { pub(crate) async fn wait_timeout(&self, timeout: Duration) -> Result<(), ReconcileWaitError> { tokio::select! 
{ @@ -675,10 +685,13 @@ impl TenantShard { "Identified optimization: migrate attachment {attached}->{preferred_node} (secondaries {:?})", self.intent.get_secondary() ); - return Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment { - old_attached_node_id: attached, - new_attached_node_id: *preferred_node, - })); + return Some(ScheduleOptimization { + sequence: self.sequence, + action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: attached, + new_attached_node_id: *preferred_node, + }), + }); } } else { tracing::debug!( @@ -736,28 +749,37 @@ impl TenantShard { "Identified optimization: replace secondary {secondary}->{candidate_node} (current secondaries {:?})", self.intent.get_secondary() ); - return Some(ScheduleOptimization::ReplaceSecondary(ReplaceSecondary { - old_node_id: *secondary, - new_node_id: candidate_node, - })); + return Some(ScheduleOptimization { + sequence: self.sequence, + action: ScheduleOptimizationAction::ReplaceSecondary(ReplaceSecondary { + old_node_id: *secondary, + new_node_id: candidate_node, + }), + }); } } None } + /// Return true if the optimization was really applied: it will not be applied if the optimization's + /// sequence is behind this tenant shard's pub(crate) fn apply_optimization( &mut self, scheduler: &mut Scheduler, optimization: ScheduleOptimization, - ) { + ) -> bool { + if optimization.sequence != self.sequence { + return false; + } + metrics::METRICS_REGISTRY .metrics_group .storage_controller_schedule_optimization .inc(); - match optimization { - ScheduleOptimization::MigrateAttachment(MigrateAttachment { + match optimization.action { + ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { old_attached_node_id, new_attached_node_id, }) => { @@ -765,7 +787,7 @@ impl TenantShard { self.intent .promote_attached(scheduler, new_attached_node_id); } - ScheduleOptimization::ReplaceSecondary(ReplaceSecondary { + ScheduleOptimizationAction::ReplaceSecondary(ReplaceSecondary { old_node_id, new_node_id, }) => { @@ -773,6 +795,8 @@ impl TenantShard { self.intent.push_secondary(scheduler, new_node_id); } } + + true } /// Query whether the tenant's observed state for attached node matches its intent state, and if so, @@ -1428,10 +1452,13 @@ pub(crate) mod tests { // would be no other shards from the same tenant, and request to do so. 
assert_eq!( optimization_a, - Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment { - old_attached_node_id: NodeId(1), - new_attached_node_id: NodeId(2) - })) + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: NodeId(1), + new_attached_node_id: NodeId(2) + }) + }) ); // Note that these optimizing two shards in the same tenant with the same ScheduleContext is @@ -1442,10 +1469,13 @@ pub(crate) mod tests { let optimization_b = shard_b.optimize_attachment(&nodes, &schedule_context); assert_eq!( optimization_b, - Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment { - old_attached_node_id: NodeId(1), - new_attached_node_id: NodeId(3) - })) + Some(ScheduleOptimization { + sequence: shard_b.sequence, + action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: NodeId(1), + new_attached_node_id: NodeId(3) + }) + }) ); // Applying these optimizations should result in the end state proposed @@ -1489,10 +1519,13 @@ pub(crate) mod tests { // same tenant should generate an optimization to move one away assert_eq!( optimization_a, - Some(ScheduleOptimization::ReplaceSecondary(ReplaceSecondary { - old_node_id: NodeId(3), - new_node_id: NodeId(4) - })) + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::ReplaceSecondary(ReplaceSecondary { + old_node_id: NodeId(3), + new_node_id: NodeId(4) + }) + }) ); shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap()); diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index bae5945338..258377f8a2 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -287,6 +287,11 @@ def test_sharding_split_smoke( == shard_count ) + # Make secondary downloads slow: this exercises the storage controller logic for not migrating an attachment + # during post-split optimization until the secondary is ready + for ps in env.pageservers: + ps.http_client().configure_failpoints([("secondary-layer-download-sleep", "return(1000)")]) + env.storage_controller.tenant_shard_split(tenant_id, shard_count=split_shard_count) post_split_pageserver_ids = [loc["node_id"] for loc in env.storage_controller.locate(tenant_id)] @@ -300,7 +305,7 @@ def test_sharding_split_smoke( # Enough background reconciliations should result in the shards being properly distributed. # Run this before the workload, because its LSN-waiting code presumes stable locations. 
- env.storage_controller.reconcile_until_idle() + env.storage_controller.reconcile_until_idle(timeout_secs=60) workload.validate() @@ -342,6 +347,10 @@ def test_sharding_split_smoke( assert cancelled_reconciles is not None and int(cancelled_reconciles) == 0 assert errored_reconciles is not None and int(errored_reconciles) == 0 + # We should see that the migration of shards after the split waited for secondaries to warm up + # before happening + assert env.storage_controller.log_contains(".*Skipping.*because secondary isn't ready.*") + env.storage_controller.consistency_check() def get_node_shard_counts(env: NeonEnv, tenant_ids): @@ -1071,6 +1080,17 @@ def test_sharding_split_failures( finish_split() assert_split_done() + if isinstance(failure, StorageControllerFailpoint) and "post-complete" in failure.failpoint: + # On a post-complete failure, the controller will recover the post-split state + # after restart, but it will have missed the optimization part of the split function + # where secondary downloads are kicked off. This means that reconcile_until_idle + # will take a very long time if we wait for all optimizations to complete, because + # those optimizations will wait for secondary downloads. + # + # Avoid that by configuring the tenant into Essential scheduling mode, so that it will + # skip optimizations when we're exercising this particular failpoint. + env.storage_controller.tenant_policy_update(tenant_id, {"scheduling": "Essential"}) + # Having completed the split, pump the background reconciles to ensure that # the scheduler reaches an idle state env.storage_controller.reconcile_until_idle(timeout_secs=30) From a3fe12b6d898205bddae4f06947841e14c98ff8e Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Fri, 3 May 2024 10:43:30 -0400 Subject: [PATCH 0689/1571] feat(pageserver): add scan interface (#7468) This pull request adds the scan interface. Scan operates on a sparse keyspace and retrieves all the key-value pairs from the keyspaces. Currently, scan only supports the metadata keyspace, and by default do not retrieve anything from the ancestor branch. This should be fixed in the future if we need to have some keyspaces that inherits from the parent. The scan interface reuses the vectored get code path by disabling the missing key errors. This pull request also changes the behavior of vectored get on aux file v1/v2 key/keyspace: if the key is not found, it is simply not included in the result, instead of throwing a missing key error. TODOs in future pull requests: limit memory consumption, ensure the search stops when all keys are covered by the image layer, remove `#[allow(dead_code)]` once the code path is used in basebackups / aux files, remove unnecessary fine-grained keyspace tracking in vectored get (or have another code path for scan) to improve performance. --------- Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/key.rs | 7 +- pageserver/src/metrics.rs | 70 ++++++++++++++ pageserver/src/tenant.rs | 124 +++++++++++++++++++++++-- pageserver/src/tenant/timeline.rs | 148 ++++++++++++++++++++---------- 4 files changed, 290 insertions(+), 59 deletions(-) diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index ea6115853e..2511de00d5 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -80,7 +80,7 @@ impl Key { } /// Get the range of metadata keys. 
- pub fn metadata_key_range() -> Range { + pub const fn metadata_key_range() -> Range { Key { field1: METADATA_KEY_BEGIN_PREFIX, field2: 0, @@ -572,14 +572,17 @@ pub const AUX_FILES_KEY: Key = Key { // Reverse mappings for a few Keys. // These are needed by WAL redo manager. +/// Non inherited range for vectored get. pub const NON_INHERITED_RANGE: Range = AUX_FILES_KEY..AUX_FILES_KEY.next(); +/// Sparse keyspace range for vectored get. Missing key error will be ignored for this range. +pub const NON_INHERITED_SPARSE_RANGE: Range = Key::metadata_key_range(); // AUX_FILES currently stores only data for logical replication (slots etc), and // we don't preserve these on a branch because safekeepers can't follow timeline // switch (and generally it likely should be optional), so ignore these. #[inline(always)] pub fn is_inherited_key(key: Key) -> bool { - !NON_INHERITED_RANGE.contains(&key) + !NON_INHERITED_RANGE.contains(&key) && !NON_INHERITED_SPARSE_RANGE.contains(&key) } #[inline(always)] diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index d8019b08e2..903bad34cc 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -194,6 +194,11 @@ pub(crate) struct GetVectoredLatency { map: EnumMap>, } +#[allow(dead_code)] +pub(crate) struct ScanLatency { + map: EnumMap>, +} + impl GetVectoredLatency { // Only these task types perform vectored gets. Filter all other tasks out to reduce total // cardinality of the metric. @@ -204,6 +209,48 @@ impl GetVectoredLatency { } } +impl ScanLatency { + // Only these task types perform vectored gets. Filter all other tasks out to reduce total + // cardinality of the metric. + const TRACKED_TASK_KINDS: [TaskKind; 1] = [TaskKind::PageRequestHandler]; + + pub(crate) fn for_task_kind(&self, task_kind: TaskKind) -> Option<&Histogram> { + self.map[task_kind].as_ref() + } +} + +pub(crate) struct ScanLatencyOngoingRecording<'a> { + parent: &'a Histogram, + start: std::time::Instant, +} + +impl<'a> ScanLatencyOngoingRecording<'a> { + pub(crate) fn start_recording(parent: &'a Histogram) -> ScanLatencyOngoingRecording<'a> { + let start = Instant::now(); + ScanLatencyOngoingRecording { parent, start } + } + + pub(crate) fn observe(self, throttled: Option) { + let elapsed = self.start.elapsed(); + let ex_throttled = if let Some(throttled) = throttled { + elapsed.checked_sub(throttled) + } else { + Some(elapsed) + }; + if let Some(ex_throttled) = ex_throttled { + self.parent.observe(ex_throttled.as_secs_f64()); + } else { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); + let mut rate_limit = LOGGED.lock().unwrap(); + rate_limit.call(|| { + warn!("error deducting time spent throttled; this message is logged at a global rate limit"); + }); + } + } +} + pub(crate) static GET_VECTORED_LATENCY: Lazy = Lazy::new(|| { let inner = register_histogram_vec!( "pageserver_get_vectored_seconds", @@ -227,6 +274,29 @@ pub(crate) static GET_VECTORED_LATENCY: Lazy = Lazy::new(|| } }); +pub(crate) static SCAN_LATENCY: Lazy = Lazy::new(|| { + let inner = register_histogram_vec!( + "pageserver_scan_seconds", + "Time spent in scan, excluding time spent in timeline_get_throttle.", + &["task_kind"], + CRITICAL_OP_BUCKETS.into(), + ) + .expect("failed to define a metric"); + + ScanLatency { + map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| { + let task_kind = ::from_usize(task_kind_idx); + + if ScanLatency::TRACKED_TASK_KINDS.contains(&task_kind) { + let task_kind = 
task_kind.into(); + Some(inner.with_label_values(&[task_kind])) + } else { + None + } + })), + } +}); + pub(crate) struct PageCacheMetricsForTaskKind { pub read_accesses_materialized_page: IntCounter, pub read_accesses_immutable: IntCounter, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 8fa484e7b2..c39c21c6dd 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3925,7 +3925,7 @@ mod tests { use crate::DEFAULT_PG_VERSION; use bytes::BytesMut; use hex_literal::hex; - use pageserver_api::key::NON_INHERITED_RANGE; + use pageserver_api::key::{AUX_KEY_PREFIX, NON_INHERITED_RANGE}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::CompactionAlgorithm; use rand::{thread_rng, Rng}; @@ -4791,15 +4791,7 @@ mod tests { .await; let images = vectored_res?; - let mut key = NON_INHERITED_RANGE.start; - while key < NON_INHERITED_RANGE.end { - assert!(matches!( - images[&key], - Err(PageReconstructError::MissingKey(_)) - )); - key = key.next(); - } - + assert!(images.is_empty()); Ok(()) } @@ -5500,4 +5492,116 @@ mod tests { Ok(()) } + + #[tokio::test] + async fn test_metadata_scan() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_metadata_scan")?; + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; + + const NUM_KEYS: usize = 1000; + const STEP: usize = 100; // random update + scan base_key + idx * STEP + + let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + base_key.field1 = AUX_KEY_PREFIX; + let mut test_key = base_key; + + // Track when each page was last modified. Used to assert that + // a read sees the latest page version. + let mut updated = [Lsn(0); NUM_KEYS]; + + let mut lsn = Lsn(0x10); + #[allow(clippy::needless_range_loop)] + for blknum in 0..NUM_KEYS { + lsn = Lsn(lsn.0 + 0x10); + test_key.field6 = (blknum * STEP) as u32; + let mut writer = tline.writer().await; + writer + .put( + test_key, + lsn, + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &ctx, + ) + .await?; + writer.finish_write(lsn); + updated[blknum] = lsn; + drop(writer); + } + + let keyspace = KeySpace::single(base_key..base_key.add((NUM_KEYS * STEP) as u32)); + + for _ in 0..10 { + // Read all the blocks + for (blknum, last_lsn) in updated.iter().enumerate() { + test_key.field6 = (blknum * STEP) as u32; + assert_eq!( + tline.get(test_key, lsn, &ctx).await?, + test_img(&format!("{} at {}", blknum, last_lsn)) + ); + } + + let mut cnt = 0; + for (key, value) in tline + .get_vectored_impl( + keyspace.clone(), + lsn, + ValuesReconstructState::default(), + &ctx, + ) + .await? 
+ { + let blknum = key.field6 as usize; + let value = value?; + assert!(blknum % STEP == 0); + let blknum = blknum / STEP; + assert_eq!( + value, + test_img(&format!("{} at {}", blknum, updated[blknum])) + ); + cnt += 1; + } + + assert_eq!(cnt, NUM_KEYS); + + for _ in 0..NUM_KEYS { + lsn = Lsn(lsn.0 + 0x10); + let blknum = thread_rng().gen_range(0..NUM_KEYS); + test_key.field6 = (blknum * STEP) as u32; + let mut writer = tline.writer().await; + writer + .put( + test_key, + lsn, + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + updated[blknum] = lsn; + } + + // Perform a cycle of flush, compact, and GC + let cutoff = tline.get_last_record_lsn(); + tline + .update_gc_info( + Vec::new(), + cutoff, + Duration::ZERO, + &CancellationToken::new(), + &ctx, + ) + .await?; + tline.freeze_and_flush().await?; + tline + .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) + .await?; + tline.gc().await?; + } + + Ok(()) + } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 19228bc1f1..c7a5598cec 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -16,7 +16,10 @@ use enumset::EnumSet; use fail::fail_point; use once_cell::sync::Lazy; use pageserver_api::{ - key::{AUX_FILES_KEY, NON_INHERITED_RANGE}, + key::{ + AUX_FILES_KEY, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE, + NON_INHERITED_SPARSE_RANGE, + }, keyspace::{KeySpaceAccum, SparseKeyPartitioning}, models::{ CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, @@ -55,7 +58,6 @@ use std::{ ops::ControlFlow, }; -use crate::tenant::timeline::logical_size::CurrentLogicalSize; use crate::tenant::{ layer_map::{LayerMap, SearchResult}, metadata::TimelineMetadata, @@ -77,6 +79,9 @@ use crate::{ use crate::{ disk_usage_eviction_task::EvictionCandidate, tenant::storage_layer::delta_layer::DeltaEntry, }; +use crate::{ + metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize, +}; use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind}; use crate::{ pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind}, @@ -885,16 +890,15 @@ impl Timeline { value } } - None => { - error!( - "Expected {}, but singular vectored get returned nothing", - key - ); - Err(PageReconstructError::Other(anyhow!( - "Singular vectored get did not return a value for {}", - key - ))) - } + None => Err(PageReconstructError::MissingKey(MissingKeyError { + key, + shard: self.shard_identity.get_shard_number(&key), + cont_lsn: Lsn(0), + request_lsn: lsn, + ancestor_lsn: None, + traversal_path: Vec::new(), + backtrace: None, + })), } } } @@ -1044,6 +1048,70 @@ impl Timeline { res } + /// Scan the keyspace and return all existing key-values in the keyspace. This currently uses vectored + /// get underlying. Normal vectored get would throw an error when a key in the keyspace is not found + /// during the search, but for the scan interface, it returns all existing key-value pairs, and does + /// not expect each single key in the key space will be found. The semantics is closer to the RocksDB + /// scan iterator interface. We could optimize this interface later to avoid some checks in the vectored + /// get path to maintain and split the probing and to-be-probe keyspace. We also need to ensure that + /// the scan operation will not cause OOM in the future. 
+ #[allow(dead_code)] + pub(crate) async fn scan( + &self, + keyspace: KeySpace, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result>, GetVectoredError> { + if !lsn.is_valid() { + return Err(GetVectoredError::InvalidLsn(lsn)); + } + + trace!( + "key-value scan request for {:?}@{} from task kind {:?}", + keyspace, + lsn, + ctx.task_kind() + ); + + // We should generalize this into Keyspace::contains in the future. + for range in &keyspace.ranges { + if range.start.field1 < METADATA_KEY_BEGIN_PREFIX + || range.end.field1 >= METADATA_KEY_END_PREFIX + { + return Err(GetVectoredError::Other(anyhow::anyhow!( + "only metadata keyspace can be scanned" + ))); + } + } + + let start = crate::metrics::SCAN_LATENCY + .for_task_kind(ctx.task_kind()) + .map(ScanLatencyOngoingRecording::start_recording); + + // start counting after throttle so that throttle time + // is always less than observation time + let throttled = self + .timeline_get_throttle + // assume scan = 1 quota for now until we find a better way to process this + .throttle(ctx, 1) + .await; + + let vectored_res = self + .get_vectored_impl( + keyspace.clone(), + lsn, + ValuesReconstructState::default(), + ctx, + ) + .await; + + if let Some(recording) = start { + recording.observe(throttled); + } + + vectored_res + } + /// Not subject to [`Self::timeline_get_throttle`]. pub(super) async fn get_vectored_sequential_impl( &self, @@ -1052,6 +1120,7 @@ impl Timeline { ctx: &RequestContext, ) -> Result>, GetVectoredError> { let mut values = BTreeMap::new(); + for range in keyspace.ranges { let mut key = range.start; while key != range.end { @@ -1064,12 +1133,16 @@ impl Timeline { Err(Cancelled | AncestorStopping(_)) => { return Err(GetVectoredError::Cancelled) } - // we only capture stuck_at_lsn=false now until we figure out https://github.com/neondatabase/neon/issues/7380 - Err(MissingKey(err)) if !NON_INHERITED_RANGE.contains(&key) => { - // The vectored read path handles non inherited keys specially. - // If such a a key cannot be reconstructed from the current timeline, - // the vectored read path returns a key level error as opposed to a top - // level error. + Err(MissingKey(_)) + if NON_INHERITED_RANGE.contains(&key) + || NON_INHERITED_SPARSE_RANGE.contains(&key) => + { + // Ignore missing key error for aux key range. TODO: currently, we assume non_inherited_range == aux_key_range. + // When we add more types of keys into the page server, we should revisit this part of code and throw errors + // accordingly. + key = key.next(); + } + Err(MissingKey(err)) => { return Err(GetVectoredError::MissingKey(err)); } Err(Other(err)) @@ -1157,6 +1230,11 @@ impl Timeline { lsn: Lsn, ctx: &RequestContext, ) { + if keyspace.overlaps(&Key::metadata_key_range()) { + // skip validation for metadata key range + return; + } + let sequential_res = self .get_vectored_sequential_impl(keyspace.clone(), lsn, ctx) .await; @@ -3209,36 +3287,12 @@ impl Timeline { // Do not descend into the ancestor timeline for aux files. // We don't return a blanket [`GetVectoredError::MissingKey`] to avoid // stalling compaction. 
- // TODO(chi): this will need to be updated for aux files v2 storage - if keyspace.overlaps(&NON_INHERITED_RANGE) { - let removed = keyspace.remove_overlapping_with(&KeySpace { - ranges: vec![NON_INHERITED_RANGE], - }); - - for range in removed.ranges { - let mut key = range.start; - while key < range.end { - reconstruct_state.on_key_error( - key, - PageReconstructError::MissingKey(MissingKeyError { - key, - shard: self.shard_identity.get_shard_number(&key), - cont_lsn, - request_lsn, - ancestor_lsn: None, - traversal_path: Vec::default(), - backtrace: if cfg!(test) { - Some(std::backtrace::Backtrace::force_capture()) - } else { - None - }, - }), - ); - key = key.next(); - } - } - } + keyspace.remove_overlapping_with(&KeySpace { + ranges: vec![NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE], + }); + // Keyspace is fully retrieved, no ancestor timeline, or metadata scan (where we do not look + // into ancestor timelines). TODO: is there any other metadata which we want to inherit? if keyspace.total_raw_size() == 0 || timeline.ancestor_timeline.is_none() { break; } From 9b65946566fc4e4b095cacab56f1843e0679eda0 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 3 May 2024 16:45:24 +0100 Subject: [PATCH 0690/1571] proxy: add connect compute concurrency lock (#7607) ## Problem Too many connect_compute attempts can overwhelm postgres, getting the connections stuck. ## Summary of changes Limit number of connection attempts that can happen at a given time. --- proxy/src/bin/proxy.rs | 46 +++++++++++++++++++++--------- proxy/src/compute.rs | 11 +++++++ proxy/src/config.rs | 18 ++++++++---- proxy/src/console/provider.rs | 13 ++++----- proxy/src/console/provider/neon.rs | 8 +++--- proxy/src/lib.rs | 3 ++ proxy/src/metrics.rs | 9 ++++++ proxy/src/proxy.rs | 5 +++- proxy/src/proxy/connect_compute.rs | 8 +++++- proxy/src/serverless/backend.rs | 26 +++++++++++++++-- 10 files changed, 112 insertions(+), 35 deletions(-) diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 39f6bc8b6d..0956aae6c0 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -118,8 +118,11 @@ struct ProxyCliArgs { #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] wake_compute_cache: String, /// lock for `wake_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). - #[clap(long, default_value = config::WakeComputeLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)] + #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)] wake_compute_lock: String, + /// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). 
+ #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)] + connect_compute_lock: String, /// Allow self-signed certificates for compute nodes (for testing) #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] allow_self_signed_compute: bool, @@ -529,24 +532,21 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { endpoint_cache_config, ))); - let config::WakeComputeLockOptions { + let config::ConcurrencyLockOptions { shards, permits, epoch, timeout, } = args.wake_compute_lock.parse()?; info!(permits, shards, ?epoch, "Using NodeLocks (wake_compute)"); - let locks = Box::leak(Box::new( - console::locks::ApiLocks::new( - "wake_compute_lock", - permits, - shards, - timeout, - epoch, - &Metrics::get().wake_compute_lock, - ) - .unwrap(), - )); + let locks = Box::leak(Box::new(console::locks::ApiLocks::new( + "wake_compute_lock", + permits, + shards, + timeout, + epoch, + &Metrics::get().wake_compute_lock, + )?)); tokio::spawn(locks.garbage_collect_worker()); let url = args.auth_endpoint.parse()?; @@ -572,6 +572,23 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { auth::BackendType::Link(MaybeOwned::Owned(url), ()) } }; + + let config::ConcurrencyLockOptions { + shards, + permits, + epoch, + timeout, + } = args.connect_compute_lock.parse()?; + info!(permits, shards, ?epoch, "Using NodeLocks (connect_compute)"); + let connect_compute_locks = console::locks::ApiLocks::new( + "connect_compute_lock", + permits, + shards, + timeout, + epoch, + &Metrics::get().proxy.connect_compute_lock, + )?; + let http_config = HttpConfig { request_timeout: args.sql_over_http.sql_over_http_timeout, pool_options: GlobalConnPoolOptions { @@ -607,11 +624,14 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { region: args.region.clone(), aws_region: args.aws_region.clone(), wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?, + connect_compute_locks, connect_to_compute_retry_config: config::RetryConfig::parse( &args.connect_to_compute_retry, )?, })); + tokio::spawn(config.connect_compute_locks.garbage_collect_worker()); + Ok(config) } diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 44d85c2952..23266ac4ef 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -6,6 +6,7 @@ use crate::{ error::{ReportableError, UserFacingError}, metrics::{Metrics, NumDbConnectionsGuard}, proxy::neon_option, + Host, }; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; @@ -101,6 +102,16 @@ impl ConnCfg { } } + pub fn get_host(&self) -> Result { + match self.0.get_hosts() { + [tokio_postgres::config::Host::Tcp(s)] => Ok(s.into()), + // we should not have multiple address or unix addresses. + _ => Err(WakeComputeError::BadComputeAddress( + "invalid compute address".into(), + )), + } + } + /// Apply startup message params to the connection config. pub fn set_startup_params(&mut self, params: &StartupMessageParams) { // Only set `user` if it's not present in the config. 
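
The new lock is configured with the same option-string format as the existing wake_compute lock (see the config.rs hunk that follows): `shards` is the number of lock-map shards, `permits` caps concurrent connection attempts per lock key (the compute host, for this lock), and, roughly, `epoch` governs periodic garbage collection of the lock map while `timeout` bounds how long a caller waits for a permit. A rough sketch of parsing the default value, assuming the `ConcurrencyLockOptions` type introduced below is reachable as `proxy::config::ConcurrencyLockOptions`:

    use proxy::config::ConcurrencyLockOptions;

    fn parse_default_connect_compute_lock() -> anyhow::Result<ConcurrencyLockOptions> {
        // Mirrors DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK from the config.rs hunk below.
        let opts: ConcurrencyLockOptions =
            "shards=64,permits=50,epoch=10m,timeout=500ms".parse()?;
        assert_eq!(opts.shards, 64);
        assert_eq!(opts.permits, 50);
        Ok(opts)
    }
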
diff --git a/proxy/src/config.rs b/proxy/src/config.rs index a32ab8c43c..0c8e284d0b 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,7 +1,9 @@ use crate::{ auth::{self, backend::AuthRateLimiter}, + console::locks::ApiLocks, rate_limiter::RateBucketInfo, serverless::GlobalConnPoolOptions, + Host, }; use anyhow::{bail, ensure, Context, Ok}; use itertools::Itertools; @@ -34,6 +36,7 @@ pub struct ProxyConfig { pub handshake_timeout: Duration, pub aws_region: String, pub wake_compute_retry_config: RetryConfig, + pub connect_compute_locks: ApiLocks, pub connect_to_compute_retry_config: RetryConfig, } @@ -573,7 +576,7 @@ impl RetryConfig { } /// Helper for cmdline cache options parsing. -pub struct WakeComputeLockOptions { +pub struct ConcurrencyLockOptions { /// The number of shards the lock map should have pub shards: usize, /// The number of allowed concurrent requests for each endpoitn @@ -584,9 +587,12 @@ pub struct WakeComputeLockOptions { pub timeout: Duration, } -impl WakeComputeLockOptions { +impl ConcurrencyLockOptions { /// Default options for [`crate::console::provider::ApiLocks`]. pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "permits=0"; + /// Default options for [`crate::console::provider::ApiLocks`]. + pub const DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK: &'static str = + "shards=64,permits=50,epoch=10m,timeout=500ms"; // pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "shards=32,permits=4,epoch=10m,timeout=1s"; @@ -636,7 +642,7 @@ impl WakeComputeLockOptions { } } -impl FromStr for WakeComputeLockOptions { +impl FromStr for ConcurrencyLockOptions { type Err = anyhow::Error; fn from_str(options: &str) -> Result { @@ -672,7 +678,7 @@ mod tests { #[test] fn test_parse_lock_options() -> anyhow::Result<()> { - let WakeComputeLockOptions { + let ConcurrencyLockOptions { epoch, permits, shards, @@ -683,7 +689,7 @@ mod tests { assert_eq!(shards, 32); assert_eq!(permits, 4); - let WakeComputeLockOptions { + let ConcurrencyLockOptions { epoch, permits, shards, @@ -694,7 +700,7 @@ mod tests { assert_eq!(shards, 16); assert_eq!(permits, 8); - let WakeComputeLockOptions { + let ConcurrencyLockOptions { epoch, permits, shards, diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index aa1800a9da..dfda29e0b1 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -17,7 +17,7 @@ use crate::{ scram, EndpointCacheKey, }; use dashmap::DashMap; -use std::{sync::Arc, time::Duration}; +use std::{hash::Hash, sync::Arc, time::Duration}; use tokio::sync::{OwnedSemaphorePermit, Semaphore}; use tokio::time::Instant; use tracing::info; @@ -447,16 +447,16 @@ impl ApiCaches { } /// Various caches for [`console`](super). 
-pub struct ApiLocks { +pub struct ApiLocks { name: &'static str, - node_locks: DashMap>, + node_locks: DashMap>, permits: usize, timeout: Duration, epoch: std::time::Duration, metrics: &'static ApiLockMetrics, } -impl ApiLocks { +impl ApiLocks { pub fn new( name: &'static str, permits: usize, @@ -475,10 +475,7 @@ impl ApiLocks { }) } - pub async fn get_wake_compute_permit( - &self, - key: &EndpointCacheKey, - ) -> Result { + pub async fn get_permit(&self, key: &K) -> Result { if self.permits == 0 { return Ok(WakeComputePermit { permit: None }); } diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 58b2a1570c..ec66641d01 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -13,7 +13,7 @@ use crate::{ http, metrics::{CacheOutcome, Metrics}, rate_limiter::EndpointRateLimiter, - scram, Normalize, + scram, EndpointCacheKey, Normalize, }; use crate::{cache::Cached, context::RequestMonitoring}; use futures::TryFutureExt; @@ -25,7 +25,7 @@ use tracing::{error, info, info_span, warn, Instrument}; pub struct Api { endpoint: http::Endpoint, pub caches: &'static ApiCaches, - pub locks: &'static ApiLocks, + pub locks: &'static ApiLocks, pub endpoint_rate_limiter: Arc, jwt: String, } @@ -35,7 +35,7 @@ impl Api { pub fn new( endpoint: http::Endpoint, caches: &'static ApiCaches, - locks: &'static ApiLocks, + locks: &'static ApiLocks, endpoint_rate_limiter: Arc, ) -> Self { let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") { @@ -289,7 +289,7 @@ impl super::Api for Api { return Err(WakeComputeError::TooManyConnections); } - let permit = self.locks.get_wake_compute_permit(&key).await?; + let permit = self.locks.get_permit(&key).await?; // after getting back a permit - it's possible the cache was filled // double check diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index 3f6d985fe8..35c1616481 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -159,6 +159,9 @@ smol_str_wrapper!(EndpointCacheKey); smol_str_wrapper!(DbName); +// postgres hostname, will likely be a port:ip addr +smol_str_wrapper!(Host); + // Endpoints are a bit tricky. Rare they might be branches or projects. impl EndpointId { pub fn is_endpoint(&self) -> bool { diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 4a54857012..1590316925 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -126,6 +126,9 @@ pub struct ProxyMetrics { /// Number of events consumed from redis (per event type). 
pub redis_events_count: CounterVec>, + + #[metric(namespace = "connect_compute_lock")] + pub connect_compute_lock: ApiLockMetrics, } #[derive(MetricGroup)] @@ -149,6 +152,12 @@ impl Default for ProxyMetrics { } } +impl Default for ApiLockMetrics { + fn default() -> Self { + Self::new() + } +} + #[derive(FixedCardinalityLabel, Copy, Clone)] #[label(singleton = "direction")] pub enum HttpDirection { diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 33d73eb675..e4e095d77d 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -301,7 +301,10 @@ pub async fn handle_client( let mut node = connect_to_compute( ctx, - &TcpMechanism { params: ¶ms }, + &TcpMechanism { + params: ¶ms, + locks: &config.connect_compute_locks, + }, &user_info, mode.allow_self_signed_compute(config), config.wake_compute_retry_config, diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index da6223209f..c8528d0296 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -2,7 +2,7 @@ use crate::{ auth::backend::ComputeCredentialKeys, compute::{self, PostgresConnection}, config::RetryConfig, - console::{self, errors::WakeComputeError, CachedNodeInfo, NodeInfo}, + console::{self, errors::WakeComputeError, locks::ApiLocks, CachedNodeInfo, NodeInfo}, context::RequestMonitoring, error::ReportableError, metrics::{ConnectOutcome, ConnectionFailureKind, Metrics, RetriesMetricGroup, RetryType}, @@ -10,6 +10,7 @@ use crate::{ retry::{retry_after, ShouldRetry}, wake_compute::wake_compute, }, + Host, }; use async_trait::async_trait; use pq_proto::StartupMessageParams; @@ -64,6 +65,9 @@ pub trait ComputeConnectBackend { pub struct TcpMechanism<'a> { /// KV-dictionary with PostgreSQL connection params. pub params: &'a StartupMessageParams, + + /// connect_to_compute concurrency lock + pub locks: &'static ApiLocks, } #[async_trait] @@ -79,6 +83,8 @@ impl ConnectMechanism for TcpMechanism<'_> { node_info: &console::CachedNodeInfo, timeout: time::Duration, ) -> Result { + let host = node_info.config.get_host()?; + let _permit = self.locks.get_permit(&host).await?; node_info.connect(ctx, timeout).await } diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index c89ebc3251..963913a260 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -9,11 +9,13 @@ use crate::{ config::{AuthenticationConfig, ProxyConfig}, console::{ errors::{GetAuthInfoError, WakeComputeError}, + locks::ApiLocks, CachedNodeInfo, }, context::RequestMonitoring, error::{ErrorKind, ReportableError, UserFacingError}, - proxy::connect_compute::ConnectMechanism, + proxy::{connect_compute::ConnectMechanism, retry::ShouldRetry}, + Host, }; use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool}; @@ -105,6 +107,7 @@ impl PoolingBackend { conn_id, conn_info, pool: self.pool.clone(), + locks: &self.config.connect_compute_locks, }, &backend, false, // do not allow self signed compute for http flow @@ -154,16 +157,31 @@ impl UserFacingError for HttpConnError { } } +impl ShouldRetry for HttpConnError { + fn could_retry(&self) -> bool { + match self { + HttpConnError::ConnectionError(e) => e.could_retry(), + HttpConnError::ConnectionClosedAbruptly(_) => false, + HttpConnError::GetAuthInfo(_) => false, + HttpConnError::AuthError(_) => false, + HttpConnError::WakeCompute(_) => false, + } + } +} + struct TokioMechanism { pool: Arc>, conn_info: ConnInfo, conn_id: uuid::Uuid, + + /// connect_to_compute concurrency lock + locks: &'static 
ApiLocks, } #[async_trait] impl ConnectMechanism for TokioMechanism { type Connection = Client; - type ConnectError = tokio_postgres::Error; + type ConnectError = HttpConnError; type Error = HttpConnError; async fn connect_once( @@ -172,6 +190,9 @@ impl ConnectMechanism for TokioMechanism { node_info: &CachedNodeInfo, timeout: Duration, ) -> Result { + let host = node_info.config.get_host()?; + let permit = self.locks.get_permit(&host).await?; + let mut config = (*node_info.config).clone(); let config = config .user(&self.conn_info.user_info.user) @@ -182,6 +203,7 @@ impl ConnectMechanism for TokioMechanism { let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); let (client, connection) = config.connect(tokio_postgres::NoTls).await?; drop(pause); + drop(permit); tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id())); Ok(poll_client( From ef03b38e5282140a5b7003c7f5010e1707631f31 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Fri, 3 May 2024 12:01:33 -0400 Subject: [PATCH 0691/1571] fix(pageserver): remove update_gc_info calls in tests (#7608) introduced by https://github.com/neondatabase/neon/pull/7468 conflicting with https://github.com/neondatabase/neon/pull/7584 Signed-off-by: Alex Chi Z --- pageserver/src/tenant.rs | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index c39c21c6dd..fdc49ae295 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -5504,6 +5504,8 @@ mod tests { const NUM_KEYS: usize = 1000; const STEP: usize = 100; // random update + scan base_key + idx * STEP + let cancel = CancellationToken::new(); + let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); base_key.field1 = AUX_KEY_PREFIX; let mut test_key = base_key; @@ -5585,21 +5587,11 @@ mod tests { } // Perform a cycle of flush, compact, and GC - let cutoff = tline.get_last_record_lsn(); - tline - .update_gc_info( - Vec::new(), - cutoff, - Duration::ZERO, - &CancellationToken::new(), - &ctx, - ) - .await?; tline.freeze_and_flush().await?; - tline - .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) + tline.compact(&cancel, EnumSet::empty(), &ctx).await?; + tenant + .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) .await?; - tline.gc().await?; } Ok(()) From 1e7cd6ac9f3568ffe9db952cb89f8036330d27b5 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 3 May 2024 19:15:38 +0200 Subject: [PATCH 0692/1571] refactor: move `NodeMetadata` to `pageserver_api`; use it from `neon_local` (#7606) This is the first step towards representing all of Pageserver configuration as clean `serde::Serialize`able Rust structs in `pageserver_api`. The `neon_local` code will then use those structs instead of the crude `toml_edit` / string concatenation that it does today. 
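
For illustration, a rough sketch of using the shared type to produce the same on-disk metadata shape that deployment tools and `neon_local` already write (ports here are just the library defaults, 64000 and 9898; `other` captures any extra fields a deployment tool may add):

    use std::collections::HashMap;
    use pageserver_api::config::NodeMetadata;

    fn example_metadata_json() -> serde_json::Result<Vec<u8>> {
        let md = NodeMetadata {
            postgres_host: "localhost".to_string(),
            postgres_port: 64000,
            http_host: "localhost".to_string(),
            http_port: 9898,
            other: HashMap::new(),
        };
        // The serde renames keep the existing JSON shape:
        // {"host":"localhost","port":64000,"http_host":"localhost","http_port":9898}
        serde_json::to_vec(&md)
    }
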
refs https://github.com/neondatabase/neon/issues/7555 --------- Co-authored-by: Alex Chi Z --- control_plane/src/bin/neon_local.rs | 8 +++---- control_plane/src/pageserver.rs | 13 ++++++----- libs/pageserver_api/src/config.rs | 31 +++++++++++++++++++++++++ libs/pageserver_api/src/config/tests.rs | 22 ++++++++++++++++++ libs/pageserver_api/src/lib.rs | 6 +---- pageserver/src/config.rs | 24 ++----------------- pageserver/src/control_plane_client.rs | 6 ++--- 7 files changed, 69 insertions(+), 41 deletions(-) create mode 100644 libs/pageserver_api/src/config.rs create mode 100644 libs/pageserver_api/src/config/tests.rs diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index bdd64c8687..14b83c1252 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -14,15 +14,15 @@ use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR}; use control_plane::safekeeper::SafekeeperNode; use control_plane::storage_controller::StorageController; use control_plane::{broker, local_env}; +use pageserver_api::config::{ + DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT, + DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT, +}; use pageserver_api::controller_api::PlacementPolicy; use pageserver_api::models::{ ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo, }; use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId}; -use pageserver_api::{ - DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT, - DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT, -}; use postgres_backend::AuthType; use postgres_connection::parse_host_port; use safekeeper_api::{ diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 52accc5890..1a64391306 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -248,12 +248,13 @@ impl PageServerNode { // situation: the metadata is written by some other script. std::fs::write( metadata_path, - serde_json::to_vec(&serde_json::json!({ - "host": "localhost", - "port": self.pg_connection_config.port(), - "http_host": "localhost", - "http_port": http_port, - })) + serde_json::to_vec(&pageserver_api::config::NodeMetadata { + postgres_host: "localhost".to_string(), + postgres_port: self.pg_connection_config.port(), + http_host: "localhost".to_string(), + http_port, + other: HashMap::new(), + }) .unwrap(), ) .expect("Failed to write metadata file"); diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs new file mode 100644 index 0000000000..d996a62349 --- /dev/null +++ b/libs/pageserver_api/src/config.rs @@ -0,0 +1,31 @@ +use std::collections::HashMap; + +use const_format::formatcp; + +#[cfg(test)] +mod tests; + +pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000; +pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); +pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898; +pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); + +// Certain metadata (e.g. externally-addressable name, AZ) is delivered +// as a separate structure. This information is not neeed by the pageserver +// itself, it is only used for registering the pageserver with the control +// plane and/or storage controller. 
+// +#[derive(PartialEq, Eq, Debug, serde::Serialize, serde::Deserialize)] +pub struct NodeMetadata { + #[serde(rename = "host")] + pub postgres_host: String, + #[serde(rename = "port")] + pub postgres_port: u16, + pub http_host: String, + pub http_port: u16, + + // Deployment tools may write fields to the metadata file beyond what we + // use in this type: this type intentionally only names fields that require. + #[serde(flatten)] + pub other: HashMap, +} diff --git a/libs/pageserver_api/src/config/tests.rs b/libs/pageserver_api/src/config/tests.rs new file mode 100644 index 0000000000..edeefc156e --- /dev/null +++ b/libs/pageserver_api/src/config/tests.rs @@ -0,0 +1,22 @@ +use super::*; + +#[test] +fn test_node_metadata_v1_backward_compatibilty() { + let v1 = serde_json::to_vec(&serde_json::json!({ + "host": "localhost", + "port": 23, + "http_host": "localhost", + "http_port": 42, + })); + + assert_eq!( + serde_json::from_slice::(&v1.unwrap()).unwrap(), + NodeMetadata { + postgres_host: "localhost".to_string(), + postgres_port: 23, + http_host: "localhost".to_string(), + http_port: 42, + other: HashMap::new(), + } + ) +} diff --git a/libs/pageserver_api/src/lib.rs b/libs/pageserver_api/src/lib.rs index 1b948d60c3..532185a366 100644 --- a/libs/pageserver_api/src/lib.rs +++ b/libs/pageserver_api/src/lib.rs @@ -1,6 +1,5 @@ #![deny(unsafe_code)] #![deny(clippy::undocumented_unsafe_blocks)] -use const_format::formatcp; pub mod controller_api; pub mod key; @@ -11,7 +10,4 @@ pub mod shard; /// Public API types pub mod upcall_api; -pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000; -pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); -pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898; -pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); +pub mod config; diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 96fff1f0c0..258eed0b12 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -9,7 +9,7 @@ use pageserver_api::shard::TenantShardId; use remote_storage::{RemotePath, RemoteStorageConfig}; use serde; use serde::de::IntoDeserializer; -use std::{collections::HashMap, env}; +use std::env; use storage_broker::Uri; use utils::crashsafe::path_with_suffix_extension; use utils::id::ConnectionId; @@ -51,7 +51,7 @@ pub mod defaults { use crate::tenant::config::defaults::*; use const_format::formatcp; - pub use pageserver_api::{ + pub use pageserver_api::config::{ DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR, DEFAULT_PG_LISTEN_PORT, }; @@ -335,26 +335,6 @@ impl BuilderValue { } } -// Certain metadata (e.g. externally-addressable name, AZ) is delivered -// as a separate structure. This information is not neeed by the pageserver -// itself, it is only used for registering the pageserver with the control -// plane and/or storage controller. -// -#[derive(serde::Deserialize)] -pub(crate) struct NodeMetadata { - #[serde(rename = "host")] - pub(crate) postgres_host: String, - #[serde(rename = "port")] - pub(crate) postgres_port: u16, - pub(crate) http_host: String, - pub(crate) http_port: u16, - - // Deployment tools may write fields to the metadata file beyond what we - // use in this type: this type intentionally only names fields that require. 
- #[serde(flatten)] - pub(crate) other: HashMap, -} - // needed to simplify config construction #[derive(Default)] struct PageServerConfigBuilder { diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs index db0032891e..26e7cc7ef8 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/control_plane_client.rs @@ -14,10 +14,8 @@ use tokio_util::sync::CancellationToken; use url::Url; use utils::{backoff, failpoint_support, generation::Generation, id::NodeId}; -use crate::{ - config::{NodeMetadata, PageServerConf}, - virtual_file::on_fatal_io_error, -}; +use crate::{config::PageServerConf, virtual_file::on_fatal_io_error}; +use pageserver_api::config::NodeMetadata; /// The Pageserver's client for using the control plane API: this is a small subset /// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md) From 64f0613edf47a6975f5d6394e5056cf2eaf7e484 Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Fri, 3 May 2024 12:57:45 -0700 Subject: [PATCH 0693/1571] compute_ctl: Add support for swap resizing (#7434) Part of neondatabase/cloud#12047. Resolves #7239. In short, this PR: 1. Adds `ComputeSpec.swap_size_bytes: Option` 2. Adds a flag to compute_ctl: `--resize-swap-on-bind` 3. Implements running `/neonvm/bin/resize-swap` with the value from the compute spec before starting postgres, if both the value in the spec *AND* the flag are specified. 4. Adds `sudo` to the final image 5. Adds a file in `/etc/sudoers.d` to allow `compute_ctl` to resize swap Various bits of reasoning about design decisions in the added comments. In short: We have both a compute spec field and a flag to make rollout easier to implement. The flag will most likely be removed as part of cleanups for neondatabase/cloud#12047. --- compute_tools/src/bin/compute_ctl.rs | 86 +++++++++++++++++++++------- compute_tools/src/lib.rs | 1 + compute_tools/src/swap.rs | 36 ++++++++++++ control_plane/src/endpoint.rs | 1 + libs/compute_api/src/spec.rs | 17 ++++++ vm-image-spec.yaml | 22 +++++++ 6 files changed, 143 insertions(+), 20 deletions(-) create mode 100644 compute_tools/src/swap.rs diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 117919786e..471d46d4f2 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -47,7 +47,7 @@ use chrono::Utc; use clap::Arg; use signal_hook::consts::{SIGQUIT, SIGTERM}; use signal_hook::{consts::SIGINT, iterator::Signals}; -use tracing::{error, info}; +use tracing::{error, info, warn}; use url::Url; use compute_api::responses::ComputeStatus; @@ -62,6 +62,7 @@ use compute_tools::logger::*; use compute_tools::monitor::launch_monitor; use compute_tools::params::*; use compute_tools::spec::*; +use compute_tools::swap::resize_swap; // this is an arbitrary build tag. 
Fine as a default / for testing purposes // in-case of not-set environment var @@ -110,6 +111,7 @@ fn main() -> Result<()> { .expect("Postgres connection string is required"); let spec_json = matches.get_one::("spec"); let spec_path = matches.get_one::("spec-path"); + let resize_swap_on_bind = matches.get_flag("resize-swap-on-bind"); // Extract OpenTelemetry context for the startup actions from the // TRACEPARENT and TRACESTATE env variables, and attach it to the current @@ -275,33 +277,72 @@ fn main() -> Result<()> { "running compute with features: {:?}", state.pspec.as_ref().unwrap().spec.features ); + // before we release the mutex, fetch the swap size (if any) for later. + let swap_size_bytes = state.pspec.as_ref().unwrap().spec.swap_size_bytes; drop(state); // Launch remaining service threads let _monitor_handle = launch_monitor(&compute); let _configurator_handle = launch_configurator(&compute); - // Start Postgres + let mut prestartup_failed = false; let mut delay_exit = false; - let mut exit_code = None; - let pg = match compute.start_compute(extension_server_port) { - Ok(pg) => Some(pg), - Err(err) => { - error!("could not start the compute node: {:#}", err); - let mut state = compute.state.lock().unwrap(); - state.error = Some(format!("{:?}", err)); - state.status = ComputeStatus::Failed; - // Notify others that Postgres failed to start. In case of configuring the - // empty compute, it's likely that API handler is still waiting for compute - // state change. With this we will notify it that compute is in Failed state, - // so control plane will know about it earlier and record proper error instead - // of timeout. - compute.state_changed.notify_all(); - drop(state); // unlock - delay_exit = true; - None + + // Resize swap to the desired size if the compute spec says so + if let (Some(size_bytes), true) = (swap_size_bytes, resize_swap_on_bind) { + // To avoid 'swapoff' hitting postgres startup, we need to run resize-swap to completion + // *before* starting postgres. + // + // In theory, we could do this asynchronously if SkipSwapon was enabled for VMs, but this + // carries a risk of introducing hard-to-debug issues - e.g. if postgres sometimes gets + // OOM-killed during startup because swap wasn't available yet. + match resize_swap(size_bytes) { + Ok(()) => { + let size_gib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display. + info!(%size_bytes, %size_gib, "resized swap"); + } + Err(err) => { + let err = err.context("failed to resize swap"); + error!("{err:#}"); + + // Mark compute startup as failed; don't try to start postgres, and report this + // error to the control plane when it next asks. + prestartup_failed = true; + let mut state = compute.state.lock().unwrap(); + state.error = Some(format!("{err:?}")); + state.status = ComputeStatus::Failed; + compute.state_changed.notify_all(); + delay_exit = true; + } } - }; + } + + // Start Postgres + let mut pg = None; + let mut exit_code = None; + + if !prestartup_failed { + pg = match compute.start_compute(extension_server_port) { + Ok(pg) => Some(pg), + Err(err) => { + error!("could not start the compute node: {:#}", err); + let mut state = compute.state.lock().unwrap(); + state.error = Some(format!("{:?}", err)); + state.status = ComputeStatus::Failed; + // Notify others that Postgres failed to start. In case of configuring the + // empty compute, it's likely that API handler is still waiting for compute + // state change. 
With this we will notify it that compute is in Failed state, + // so control plane will know about it earlier and record proper error instead + // of timeout. + compute.state_changed.notify_all(); + drop(state); // unlock + delay_exit = true; + None + } + }; + } else { + warn!("skipping postgres startup because pre-startup step failed"); + } // Start the vm-monitor if directed to. The vm-monitor only runs on linux // because it requires cgroups. @@ -526,6 +567,11 @@ fn cli() -> clap::Command { ) .value_name("FILECACHE_CONNSTR"), ) + .arg( + Arg::new("resize-swap-on-bind") + .long("resize-swap-on-bind") + .action(clap::ArgAction::SetTrue), + ) } /// When compute_ctl is killed, send also termination signal to sync-safekeepers diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index 4e01ffd954..eac808385c 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -14,4 +14,5 @@ pub mod monitor; pub mod params; pub mod pg_helpers; pub mod spec; +pub mod swap; pub mod sync_sk; diff --git a/compute_tools/src/swap.rs b/compute_tools/src/swap.rs new file mode 100644 index 0000000000..c22b6bc14e --- /dev/null +++ b/compute_tools/src/swap.rs @@ -0,0 +1,36 @@ +use anyhow::{anyhow, Context}; +use tracing::warn; + +pub const RESIZE_SWAP_BIN: &str = "/neonvm/bin/resize-swap"; + +pub fn resize_swap(size_bytes: u64) -> anyhow::Result<()> { + // run `/neonvm/bin/resize-swap --once {size_bytes}` + // + // Passing '--once' causes resize-swap to delete itself after successful completion, which + // means that if compute_ctl restarts later, we won't end up calling 'swapoff' while + // postgres is running. + // + // NOTE: resize-swap is not very clever. If present, --once MUST be the first arg. + let child_result = std::process::Command::new("/usr/bin/sudo") + .arg(RESIZE_SWAP_BIN) + .arg("--once") + .arg(size_bytes.to_string()) + .spawn(); + + if matches!(&child_result, Err(e) if e.kind() == std::io::ErrorKind::NotFound) { + warn!("ignoring \"not found\" error from resize-swap to avoid swapoff while compute is running"); + return Ok(()); + } + + child_result + .context("spawn() failed") + .and_then(|mut child| child.wait().context("wait() failed")) + .and_then(|status| match status.success() { + true => Ok(()), + false => Err(anyhow!("process exited with {status}")), + }) + // wrap any prior error with the overall context that we couldn't run the command + .with_context(|| { + format!("could not run `/usr/bin/sudo {RESIZE_SWAP_BIN} --once {size_bytes}`") + }) +} diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 03f7db99fb..20371e1cb8 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -554,6 +554,7 @@ impl Endpoint { format_version: 1.0, operation_uuid: None, features: self.features.clone(), + swap_size_bytes: None, cluster: Cluster { cluster_id: None, // project ID: not used name: None, // project name: not used diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 71ae66c45c..1c4ee2089f 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -33,6 +33,23 @@ pub struct ComputeSpec { #[serde(default)] pub features: Vec, + /// If compute_ctl was passed `--resize-swap-on-bind`, a value of `Some(_)` instructs + /// compute_ctl to `/neonvm/bin/resize-swap` with the given size, when the spec is first + /// received. 
+ /// + /// Both this field and `--resize-swap-on-bind` are required, so that the control plane's + /// spec generation doesn't need to be aware of the actual compute it's running on, while + /// guaranteeing gradual rollout of swap. Otherwise, without `--resize-swap-on-bind`, we could + /// end up trying to resize swap in VMs without it -- or end up *not* resizing swap, thus + /// giving every VM much more swap than it should have (32GiB). + /// + /// Eventually we may remove `--resize-swap-on-bind` and exclusively use `swap_size_bytes` for + /// enabling the swap resizing behavior once rollout is complete. + /// + /// See neondatabase/cloud#12047 for more. + #[serde(default)] + pub swap_size_bytes: Option, + /// Expected cluster state at the end of transition process. pub cluster: Cluster, pub delta_operations: Option>, diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 3ccdf5cc64..41ca16f16b 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -5,6 +5,12 @@ commands: user: root sysvInitAction: sysinit shell: 'cgconfigparser -l /etc/cgconfig.conf -s 1664' + # restrict permissions on /neonvm/bin/resize-swap, because we grant access to compute_ctl for + # running it as root. + - name: chmod-resize-swap + user: root + sysvInitAction: sysinit + shell: 'chmod 711 /neonvm/bin/resize-swap' - name: pgbouncer user: postgres sysvInitAction: respawn @@ -24,6 +30,11 @@ commands: shutdownHook: | su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10' files: + - filename: compute_ctl-resize-swap + content: | + # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap + # as root without requiring entering a password (NOPASSWD), regardless of hostname (ALL) + postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap - filename: pgbouncer.ini content: | [databases] @@ -353,6 +364,17 @@ merge: | && echo 'root - nofile 1048576' >>/etc/security/limits.conf \ ) + # Allow postgres user (compute_ctl) to run swap resizer. + # Need to install sudo in order to allow this. + # + # Also, remove the 'read' permission from group/other on /neonvm/bin/resize-swap, just to be safe. + RUN set -e \ + && apt update \ + && apt install --no-install-recommends -y \ + sudo \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + COPY compute_ctl-resize-swap /etc/sudoers.d/compute_ctl-resize-swap + COPY cgconfig.conf /etc/cgconfig.conf COPY pgbouncer.ini /etc/pgbouncer.ini COPY sql_exporter.yml /etc/sql_exporter.yml From 4deb8dc52ec26ab59a4d0b7ff548ef389e6717f9 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 4 May 2024 08:44:18 +0300 Subject: [PATCH 0694/1571] compute_ctl: Be more precise in how startup time is calculated (#7601) - On a non-pooled start, do not reset the 'start_time' after launching the HTTP service. In a non-pooled start, it's fair to include that in the total startup time. - When setting wait_for_spec_ms and resetting start_time, call Utc::now() only once. It's a waste of cycles to call it twice, but also, it ensures the time between setting wait_for_spec_ms and resetting start_time is included in one or the other time period. These differences should be insignificant in practice, in the microsecond range, but IMHO it seems more logical and readable this way too. Also fix and clarify some of the surrounding comments. 
(This caught my eye while reviewing PR #7577) --- compute_tools/src/bin/compute_ctl.rs | 29 ++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 471d46d4f2..67c5250376 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -228,14 +228,14 @@ fn main() -> Result<()> { // If this is a pooled VM, prewarm before starting HTTP server and becoming // available for binding. Prewarming helps Postgres start quicker later, - // because QEMU will already have it's memory allocated from the host, and + // because QEMU will already have its memory allocated from the host, and // the necessary binaries will already be cached. if !spec_set { compute.prewarm_postgres()?; } - // Launch http service first, so we were able to serve control-plane - // requests, while configuration is still in progress. + // Launch http service first, so that we can serve control-plane requests + // while configuration is still in progress. let _http_handle = launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread"); @@ -255,21 +255,22 @@ fn main() -> Result<()> { break; } } + + // Record for how long we slept waiting for the spec. + let now = Utc::now(); + state.metrics.wait_for_spec_ms = now + .signed_duration_since(state.start_time) + .to_std() + .unwrap() + .as_millis() as u64; + + // Reset start time, so that the total startup time that is calculated later will + // not include the time that we waited for the spec. + state.start_time = now; } // We got all we need, update the state. let mut state = compute.state.lock().unwrap(); - - // Record for how long we slept waiting for the spec. - state.metrics.wait_for_spec_ms = Utc::now() - .signed_duration_since(state.start_time) - .to_std() - .unwrap() - .as_millis() as u64; - // Reset start time to the actual start of the configuration, so that - // total startup time was properly measured at the end. - state.start_time = Utc::now(); - state.status = ComputeStatus::Init; compute.state_changed.notify_all(); From 5da3e2113ad309e50b784a96d08b437e201cde06 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 2 May 2024 17:59:41 +0300 Subject: [PATCH 0695/1571] Allow bad state (not active) pageserver error/warns in walcraft test. The top reason for it being flaky. --- test_runner/regress/test_crafted_wal_end.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test_runner/regress/test_crafted_wal_end.py b/test_runner/regress/test_crafted_wal_end.py index 01ecc2b95f..30f8d81890 100644 --- a/test_runner/regress/test_crafted_wal_end.py +++ b/test_runner/regress/test_crafted_wal_end.py @@ -19,6 +19,12 @@ from fixtures.neon_fixtures import NeonEnvBuilder, WalCraft def test_crafted_wal_end(neon_env_builder: NeonEnvBuilder, wal_type: str): env = neon_env_builder.init_start() env.neon_cli.create_branch("test_crafted_wal_end") + env.pageserver.allowed_errors.extend( + [ + # seems like pageserver stop triggers these + ".*initial size calculation failed.*Bad state (not active).*", + ] + ) endpoint = env.endpoints.create("test_crafted_wal_end") wal_craft = WalCraft(env) From ce4d3da3ae2d83f8a4dc632112c95580a2a25fbe Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 1 May 2024 18:22:34 +0300 Subject: [PATCH 0696/1571] Properly initialize first WAL segment on safekeepers. 
Previously its segment header and page header of first record weren't initialized because compute streams data only since first record LSN. Also, fix a bug in the existing code for initialization: xlp_rem_len must not include page header. These changes make first segment pg_waldump'able. --- libs/postgres_ffi/src/xlog_utils.rs | 39 ++++++++++++------- safekeeper/src/safekeeper.rs | 16 ++++++++ safekeeper/src/wal_storage.rs | 24 ++++++++++++ .../tests/walproposer_sim/safekeeper_disk.rs | 4 ++ 4 files changed, 70 insertions(+), 13 deletions(-) diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 4a66a0ab1d..0bbb91afc2 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -331,7 +331,10 @@ impl CheckPoint { /// Returns 'true' if the XID was updated. pub fn update_next_xid(&mut self, xid: u32) -> bool { // nextXid should be greater than any XID in WAL, so increment provided XID and check for wraparround. - let mut new_xid = std::cmp::max(xid.wrapping_add(1), pg_constants::FIRST_NORMAL_TRANSACTION_ID); + let mut new_xid = std::cmp::max( + xid.wrapping_add(1), + pg_constants::FIRST_NORMAL_TRANSACTION_ID, + ); // To reduce number of metadata checkpoints, we forward align XID on XID_CHECKPOINT_INTERVAL. // XID_CHECKPOINT_INTERVAL should not be larger than BLCKSZ*CLOG_XACTS_PER_BYTE new_xid = @@ -367,8 +370,16 @@ pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result 0 { + assert!(seg_off >= XLOG_SIZE_OF_XLOG_LONG_PHD); + // xlp_rem_len doesn't include page header, hence the subtraction. + ( + seg_off - XLOG_SIZE_OF_XLOG_LONG_PHD, + pg_constants::XLP_FIRST_IS_CONTRECORD, + ) } else { (0, 0) }; @@ -397,20 +408,22 @@ pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result 0 { + assert!(page_off >= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64); + ( + (page_off - XLOG_SIZE_OF_XLOG_SHORT_PHD as u64) as u32, + pg_constants::XLP_FIRST_IS_CONTRECORD, + ) + } else { + (0, 0) + }; let header = XLogPageHeaderData { xlp_magic: XLOG_PAGE_MAGIC as u16, - xlp_info: if page_off >= pg_constants::SIZE_OF_PAGE_HEADER as u64 { - pg_constants::XLP_FIRST_IS_CONTRECORD - } else { - 0 - }, + xlp_info, xlp_tli: PG_TLI, xlp_pageaddr: lsn.page_lsn().0, - xlp_rem_len: if page_off >= pg_constants::SIZE_OF_PAGE_HEADER as u64 { - page_off as u32 - } else { - 0u32 - }, + xlp_rem_len, ..Default::default() // Put 0 in padding fields. }; let hdr_bytes = header.encode()?; diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index f2ee0403eb..e671d4f36a 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -725,6 +725,18 @@ where self.state.inmem.commit_lsn ); + // Before first WAL write initialize its segment. It makes first segment + // pg_waldump'able because stream from compute doesn't include its + // segment and page headers. + // + // If we fail before first WAL write flush this action would be + // repeated, that's ok because it is idempotent. 
+ if self.wal_store.flush_lsn() == Lsn::INVALID { + self.wal_store + .initialize_first_segment(msg.start_streaming_at) + .await?; + } + // TODO: cross check divergence point, check if msg.start_streaming_at corresponds to // intersection of our history and history from msg @@ -1007,6 +1019,10 @@ mod tests { self.lsn } + async fn initialize_first_segment(&mut self, _init_lsn: Lsn) -> Result<()> { + Ok(()) + } + async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { self.lsn = startpos + buf.len() as u64; Ok(()) diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 147f318b9f..6bc8c7c3f9 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -38,6 +38,12 @@ pub trait Storage { /// LSN of last durably stored WAL record. fn flush_lsn(&self) -> Lsn; + /// Initialize segment by creating proper long header at the beginning of + /// the segment and short header at the page of given LSN. This is only used + /// for timeline initialization because compute will stream data only since + /// init_lsn. Other segment headers are included in compute stream. + async fn initialize_first_segment(&mut self, init_lsn: Lsn) -> Result<()>; + /// Write piece of WAL from buf to disk, but not necessarily sync it. async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()>; @@ -78,6 +84,8 @@ pub struct PhysicalStorage { /// Size of WAL segment in bytes. wal_seg_size: usize, + pg_version: u32, + system_id: u64, /// Written to disk, but possibly still in the cache and not fully persisted. /// Also can be ahead of record_lsn, if happen to be in the middle of a WAL record. @@ -169,6 +177,8 @@ impl PhysicalStorage { timeline_dir, conf: conf.clone(), wal_seg_size, + pg_version: state.server.pg_version, + system_id: state.server.system_id, write_lsn, write_record_lsn: write_lsn, flush_record_lsn: flush_lsn, @@ -324,6 +334,20 @@ impl Storage for PhysicalStorage { self.flush_record_lsn } + async fn initialize_first_segment(&mut self, init_lsn: Lsn) -> Result<()> { + let segno = init_lsn.segment_number(self.wal_seg_size); + let (mut file, _) = self.open_or_create(segno).await?; + let major_pg_version = self.pg_version / 10000; + let wal_seg = + postgres_ffi::generate_wal_segment(segno, self.system_id, major_pg_version, init_lsn)?; + file.seek(SeekFrom::Start(0)).await?; + file.write_all(&wal_seg).await?; + file.flush().await?; + info!("initialized segno {} at lsn {}", segno, init_lsn); + // note: file is *not* fsynced + Ok(()) + } + /// Write WAL to disk. async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { // Disallow any non-sequential writes, which can result in gaps or overwrites. diff --git a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs index 35bca325aa..c2db9de78a 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs @@ -182,6 +182,10 @@ impl wal_storage::Storage for DiskWALStorage { self.flush_record_lsn } + async fn initialize_first_segment(&mut self, _init_lsn: Lsn) -> Result<()> { + Ok(()) + } + /// Write piece of WAL from buf to disk, but not necessarily sync it. async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { if self.write_lsn != startpos { From 0353a72a00887173f802ba044d169a4c278ea8f8 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 2 May 2024 17:43:31 +0300 Subject: [PATCH 0697/1571] pg_waldump segment on safekeeper in test_pg_waldump. 
To test it as well. --- test_runner/regress/test_pg_waldump.py | 33 +++++++++++++++++++------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/test_runner/regress/test_pg_waldump.py b/test_runner/regress/test_pg_waldump.py index 1973aa5952..8e80efd9ba 100644 --- a/test_runner/regress/test_pg_waldump.py +++ b/test_runner/regress/test_pg_waldump.py @@ -1,13 +1,28 @@ import os +import shutil from fixtures.neon_fixtures import NeonEnv, PgBin from fixtures.utils import subprocess_capture +def check_wal_segment(pg_waldump_path: str, segment_path: str, test_output_dir): + # use special --ignore option to ignore the validation checks in pg_waldump + # this is necessary, because neon WAL files contain gap at the beginning + output_path, _, _ = subprocess_capture( + test_output_dir, [pg_waldump_path, "--ignore", segment_path] + ) + + with open(f"{output_path}.stdout", "r") as f: + stdout = f.read() + assert "ABORT" in stdout + assert "COMMIT" in stdout + + # Simple test to check that pg_waldump works with neon WAL files def test_pg_waldump(neon_simple_env: NeonEnv, test_output_dir, pg_bin: PgBin): env = neon_simple_env - env.neon_cli.create_branch("test_pg_waldump", "empty") + tenant_id = env.initial_tenant + timeline_id = env.neon_cli.create_branch("test_pg_waldump", "empty") endpoint = env.endpoints.create_start("test_pg_waldump") cur = endpoint.connect().cursor() @@ -35,12 +50,12 @@ def test_pg_waldump(neon_simple_env: NeonEnv, test_output_dir, pg_bin: PgBin): assert endpoint.pgdata_dir wal_path = os.path.join(endpoint.pgdata_dir, "pg_wal/000000010000000000000001") pg_waldump_path = os.path.join(pg_bin.pg_bin_path, "pg_waldump") + # check segment on compute + check_wal_segment(pg_waldump_path, wal_path, test_output_dir) - # use special --ignore option to ignore the validation checks in pg_waldump - # this is necessary, because neon WAL files contain gap at the beginning - output_path, _, _ = subprocess_capture(test_output_dir, [pg_waldump_path, "--ignore", wal_path]) - - with open(f"{output_path}.stdout", "r") as f: - stdout = f.read() - assert "ABORT" in stdout - assert "COMMIT" in stdout + # Check file on safekeepers as well. pg_waldump is strict about file naming, so remove .partial suffix. + sk = env.safekeepers[0] + sk_tli_dir = sk.timeline_dir(tenant_id, timeline_id) + non_partial_path = os.path.join(sk_tli_dir, "000000010000000000000001") + shutil.copyfile(os.path.join(sk_tli_dir, "000000010000000000000001.partial"), non_partial_path) + check_wal_segment(pg_waldump_path, non_partial_path, test_output_dir) From e6da7e29ed0a550ec59686bea7e656e16b2f13e7 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 2 May 2024 11:51:24 +0300 Subject: [PATCH 0698/1571] Add option allowing running multiple endpoints on the same branch. This is used by safekeeper tests. 
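For illustration, a hedged usage sketch built on the `endpoint_create_start` helper that this patch extends in `test_wal_acceptor_async.py`; the test name, branch choice, and final queries are hypothetical and not part of this patch (the module's existing `fixtures.neon_fixtures` imports are assumed):

```python
def test_two_primaries_on_one_branch(neon_env_builder: NeonEnvBuilder):
    # Hypothetical sketch: start two primary endpoints on the same branch.
    # Without --allow-multiple, neon_local's conflict check would reject the second.
    env = neon_env_builder.init_start()
    first = endpoint_create_start(
        env, branch="main", pgdir_name="allow_multiple_1", allow_multiple=True
    )
    second = endpoint_create_start(
        env, branch="main", pgdir_name="allow_multiple_2", allow_multiple=True
    )
    # Both computes can now talk to the same timeline's safekeepers, which is the
    # contended situation the safekeeper tests want to provoke.
    first.safe_psql("SELECT 1")
    second.safe_psql("SELECT 1")
```
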
--- control_plane/src/bin/neon_local.rs | 28 +++++++++++++++---- test_runner/fixtures/neon_fixtures.py | 22 +++++++++++++-- .../regress/test_wal_acceptor_async.py | 20 ++++++++++--- 3 files changed, 58 insertions(+), 12 deletions(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 14b83c1252..e01d5c9799 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -837,6 +837,8 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re .copied() .unwrap_or(false); + let allow_multiple = sub_args.get_flag("allow-multiple"); + let mode = match (lsn, hot_standby) { (Some(lsn), false) => ComputeMode::Static(lsn), (None, true) => ComputeMode::Replica, @@ -854,7 +856,9 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re _ => {} } - cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?; + if !allow_multiple { + cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?; + } cplane.new_endpoint( &endpoint_id, @@ -883,6 +887,8 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re let remote_ext_config = sub_args.get_one::("remote-ext-config"); + let allow_multiple = sub_args.get_flag("allow-multiple"); + // If --safekeepers argument is given, use only the listed safekeeper nodes. let safekeepers = if let Some(safekeepers_str) = sub_args.get_one::("safekeepers") { @@ -908,11 +914,13 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re .cloned() .unwrap_or_default(); - cplane.check_conflicting_endpoints( - endpoint.mode, - endpoint.tenant_id, - endpoint.timeline_id, - )?; + if !allow_multiple { + cplane.check_conflicting_endpoints( + endpoint.mode, + endpoint.tenant_id, + endpoint.timeline_id, + )?; + } let (pageservers, stripe_size) = if let Some(pageserver_id) = pageserver_id { let conf = env.get_pageserver_conf(pageserver_id).unwrap(); @@ -1444,6 +1452,12 @@ fn cli() -> Command { .help("If set, will create test user `user` and `neondb` database. Requires `update-catalog = true`") .required(false); + let allow_multiple = Arg::new("allow-multiple") + .help("Allow multiple primary endpoints running on the same branch. 
Shouldn't be used normally, but useful for tests.") + .long("allow-multiple") + .action(ArgAction::SetTrue) + .required(false); + Command::new("Neon CLI") .arg_required_else_help(true) .version(GIT_VERSION) @@ -1601,6 +1615,7 @@ fn cli() -> Command { .arg(pg_version_arg.clone()) .arg(hot_standby_arg.clone()) .arg(update_catalog) + .arg(allow_multiple.clone()) ) .subcommand(Command::new("start") .about("Start postgres.\n If the endpoint doesn't exist yet, it is created.") @@ -1609,6 +1624,7 @@ fn cli() -> Command { .arg(safekeepers_arg) .arg(remote_ext_config_args) .arg(create_test_user) + .arg(allow_multiple.clone()) ) .subcommand(Command::new("reconfigure") .about("Reconfigure the endpoint") diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 90884ad7f8..240b6ee199 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1801,6 +1801,7 @@ class NeonCli(AbstractNeonCli): hot_standby: bool = False, lsn: Optional[Lsn] = None, pageserver_id: Optional[int] = None, + allow_multiple=False, ) -> "subprocess.CompletedProcess[str]": args = [ "endpoint", @@ -1824,6 +1825,8 @@ class NeonCli(AbstractNeonCli): args.extend(["--hot-standby", "true"]) if pageserver_id is not None: args.extend(["--pageserver-id", str(pageserver_id)]) + if allow_multiple: + args.extend(["--allow-multiple"]) res = self.raw_cli(args) res.check_returncode() @@ -1835,6 +1838,7 @@ class NeonCli(AbstractNeonCli): safekeepers: Optional[List[int]] = None, remote_ext_config: Optional[str] = None, pageserver_id: Optional[int] = None, + allow_multiple=False, ) -> "subprocess.CompletedProcess[str]": args = [ "endpoint", @@ -1849,6 +1853,8 @@ class NeonCli(AbstractNeonCli): args.append(endpoint_id) if pageserver_id is not None: args.extend(["--pageserver-id", str(pageserver_id)]) + if allow_multiple: + args.extend(["--allow-multiple"]) res = self.raw_cli(args) res.check_returncode() @@ -3299,6 +3305,7 @@ class Endpoint(PgProtocol): lsn: Optional[Lsn] = None, config_lines: Optional[List[str]] = None, pageserver_id: Optional[int] = None, + allow_multiple: bool = False, ) -> "Endpoint": """ Create a new Postgres endpoint. @@ -3321,6 +3328,7 @@ class Endpoint(PgProtocol): pg_port=self.pg_port, http_port=self.http_port, pageserver_id=pageserver_id, + allow_multiple=allow_multiple, ) path = Path("endpoints") / self.endpoint_id / "pgdata" self.pgdata_dir = os.path.join(self.env.repo_dir, path) @@ -3337,7 +3345,10 @@ class Endpoint(PgProtocol): return self def start( - self, remote_ext_config: Optional[str] = None, pageserver_id: Optional[int] = None + self, + remote_ext_config: Optional[str] = None, + pageserver_id: Optional[int] = None, + allow_multiple: bool = False, ) -> "Endpoint": """ Start the Postgres instance. @@ -3353,6 +3364,7 @@ class Endpoint(PgProtocol): safekeepers=self.active_safekeepers, remote_ext_config=remote_ext_config, pageserver_id=pageserver_id, + allow_multiple=allow_multiple, ) self.running = True @@ -3482,6 +3494,7 @@ class Endpoint(PgProtocol): config_lines: Optional[List[str]] = None, remote_ext_config: Optional[str] = None, pageserver_id: Optional[int] = None, + allow_multiple=False, ) -> "Endpoint": """ Create an endpoint, apply config, and start Postgres. 
@@ -3497,7 +3510,12 @@ class Endpoint(PgProtocol): hot_standby=hot_standby, lsn=lsn, pageserver_id=pageserver_id, - ).start(remote_ext_config=remote_ext_config, pageserver_id=pageserver_id) + allow_multiple=allow_multiple, + ).start( + remote_ext_config=remote_ext_config, + pageserver_id=pageserver_id, + allow_multiple=allow_multiple, + ) log.info(f"Postgres startup took {time.time() - started_at} seconds") diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index 5902eb3217..dce5616ac6 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -254,7 +254,9 @@ def test_restarts_frequent_checkpoints(neon_env_builder: NeonEnvBuilder): ) -def endpoint_create_start(env: NeonEnv, branch: str, pgdir_name: Optional[str]): +def endpoint_create_start( + env: NeonEnv, branch: str, pgdir_name: Optional[str], allow_multiple: bool = False +): endpoint = Endpoint( env, tenant_id=env.initial_tenant, @@ -268,14 +270,23 @@ def endpoint_create_start(env: NeonEnv, branch: str, pgdir_name: Optional[str]): # embed current time in endpoint ID endpoint_id = pgdir_name or f"ep-{time.time()}" return endpoint.create_start( - branch_name=branch, endpoint_id=endpoint_id, config_lines=["log_statement=all"] + branch_name=branch, + endpoint_id=endpoint_id, + config_lines=["log_statement=all"], + allow_multiple=allow_multiple, ) async def exec_compute_query( - env: NeonEnv, branch: str, query: str, pgdir_name: Optional[str] = None + env: NeonEnv, + branch: str, + query: str, + pgdir_name: Optional[str] = None, + allow_multiple: bool = False, ): - with endpoint_create_start(env, branch=branch, pgdir_name=pgdir_name) as endpoint: + with endpoint_create_start( + env, branch=branch, pgdir_name=pgdir_name, allow_multiple=allow_multiple + ) as endpoint: before_conn = time.time() conn = await endpoint.connect_async() res = await conn.fetch(query) @@ -347,6 +358,7 @@ class BackgroundCompute(object): self.branch, f"INSERT INTO query_log(index, verify_key) VALUES ({self.index}, {verify_key}) RETURNING verify_key", pgdir_name=f"bgcompute{self.index}_key{verify_key}", + allow_multiple=True, ) log.info(f"result: {res}") if len(res) != 1: From 0115fe6cb2822f628f0b6f49270a82268a55f3a4 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 6 May 2024 14:37:55 +0300 Subject: [PATCH 0699/1571] Make 'neon.protocol_version = 2' the default (#7616) Once all the computes in production have restarted, we can remove protocol version 1 altogether. See issue #6211. 
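Until protocol version 1 is actually removed, a compute can still be pinned back to it for comparison or bisection. A minimal hedged sketch using the `config_lines` fixture parameter; the test itself is hypothetical and not part of this patch (the GUC still accepts the range 1..2, and being `PGC_SU_BACKEND` it has to be set via configuration rather than with `SET` at runtime):

```python
def test_pageserver_protocol_v1_still_works(neon_simple_env: NeonEnv):
    # Hypothetical sketch: override the new default (2) back to protocol version 1.
    env = neon_simple_env
    endpoint = env.endpoints.create_start(
        "main", config_lines=["neon.protocol_version = 1"]
    )
    with endpoint.connect().cursor() as cur:
        # Any read forces the compute to talk to the pageserver, now over the
        # old protocol version.
        cur.execute("SELECT count(*) FROM pg_class")
        assert cur.fetchone()[0] > 0
```
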
--- pgxn/neon/libpagestore.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index b7b1e7ccbf..f5ce2caff3 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -49,7 +49,7 @@ char *neon_auth_token; int readahead_buffer_size = 128; int flush_every_n_requests = 8; -int neon_protocol_version = 1; +int neon_protocol_version = 2; static int n_reconnect_attempts = 0; static int max_reconnect_attempts = 60; @@ -860,7 +860,7 @@ pg_init_libpagestore(void) "Version of compute<->page server protocol", NULL, &neon_protocol_version, - 1, /* default to old protocol for now */ + 2, /* use protocol version 2 */ 1, /* min */ 2, /* max */ PGC_SU_BACKEND, From 3764dd2e84db2e2bcf2df065df25304d4dddcaf6 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 6 May 2024 14:07:07 +0100 Subject: [PATCH 0700/1571] pageserver: call maybe_freeze_ephemeral_layer from a dedicated task (#7594) ## Problem In testing of the earlier fix for OOMs under heavy write load (https://github.com/neondatabase/neon/pull/7218), we saw that the limit on ephemeral layer size wasn't being reliably enforced. That was diagnosed as being due to overwhelmed compaction loops: most tenants were waiting on the semaphore for background tasks, and thereby not running the function that proactively rolls layers frequently enough. Related: https://github.com/neondatabase/neon/issues/6939 ## Summary of changes - Create a new per-tenant background loop for "ingest housekeeping", which invokes maybe_freeze_ephemeral_layer() without taking the background task semaphore. - Downgrade to DEBUG a log line in maybe_freeze_ephemeral_layer that had been INFO, but turns out to be pretty common in the field. There's some discussion on the issue (https://github.com/neondatabase/neon/issues/6939#issuecomment-2083554275) about alternatives for calling this maybe_freeze_epemeral_layer periodically without it getting stuck behind compaction. A whole task just for this feels like kind of a big hammer, but we may in future find that there are other pieces of lightweight housekeeping that we want to do here too. Why is it okay to call maybe_freeze_ephemeral_layer outside of the background tasks semaphore? - this is the same work we would do anyway if we receive writes from the safekeeper, just done a bit sooner. - The period of the new task is generously jittered (+/- 5%), so when the ephemeral layer size tips over the threshold, we shouldn't see an excessively aggressive thundering herd of layer freezes (and only layers larger than the mean layer size will be frozen) - All that said, this is an imperfect approach that relies on having a generous amount of RAM to dip into when we need to freeze somewhat urgently. It would be nice in future to also block compaction/GC when we recognize resource stress and need to do other work (like layer freezing) to reduce memory footprint. --- pageserver/src/task_mgr.rs | 3 ++ pageserver/src/tenant.rs | 28 ++++++++++ pageserver/src/tenant/tasks.rs | 85 ++++++++++++++++++++++++++++++- pageserver/src/tenant/timeline.rs | 23 +++------ 4 files changed, 122 insertions(+), 17 deletions(-) diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 0c245580ee..01a8974494 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -319,6 +319,9 @@ pub enum TaskKind { // Eviction. One per timeline. 
Eviction, + // Ingest housekeeping (flushing ephemeral layers on time threshold or disk pressure) + IngestHousekeeping, + /// See [`crate::disk_usage_eviction_task`]. DiskUsageEviction, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index fdc49ae295..2d7a2e0f9d 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1676,6 +1676,34 @@ impl Tenant { Ok(()) } + // Call through to all timelines to freeze ephemeral layers if needed. Usually + // this happens during ingest: this background housekeeping is for freezing layers + // that are open but haven't been written to for some time. + async fn ingest_housekeeping(&self) { + // Scan through the hashmap and collect a list of all the timelines, + // while holding the lock. Then drop the lock and actually perform the + // compactions. We don't want to block everything else while the + // compaction runs. + let timelines = { + self.timelines + .lock() + .unwrap() + .values() + .filter_map(|timeline| { + if timeline.is_active() { + Some(timeline.clone()) + } else { + None + } + }) + .collect::>() + }; + + for timeline in &timelines { + timeline.maybe_freeze_ephemeral_layer().await; + } + } + pub fn current_state(&self) -> TenantState { self.state.borrow().clone() } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 41b77c1f4a..f153719f98 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -2,6 +2,7 @@ //! such as compaction and GC use std::ops::ControlFlow; +use std::str::FromStr; use std::sync::Arc; use std::time::{Duration, Instant}; @@ -9,9 +10,11 @@ use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics::TENANT_TASK_EVENTS; use crate::task_mgr; use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; +use crate::tenant::config::defaults::DEFAULT_COMPACTION_PERIOD; use crate::tenant::throttle::Stats; use crate::tenant::timeline::CompactionError; use crate::tenant::{Tenant, TenantState}; +use rand::Rng; use tokio_util::sync::CancellationToken; use tracing::*; use utils::{backoff, completion}; @@ -44,6 +47,7 @@ pub(crate) enum BackgroundLoopKind { Compaction, Gc, Eviction, + IngestHouseKeeping, ConsumptionMetricsCollectMetrics, ConsumptionMetricsSyntheticSizeWorker, InitialLogicalSizeCalculation, @@ -132,6 +136,30 @@ pub fn start_background_loops( } }, ); + + task_mgr::spawn( + BACKGROUND_RUNTIME.handle(), + TaskKind::IngestHousekeeping, + Some(tenant_shard_id), + None, + &format!("ingest housekeeping for tenant {tenant_shard_id}"), + false, + { + let tenant = Arc::clone(tenant); + let background_jobs_can_start = background_jobs_can_start.cloned(); + async move { + let cancel = task_mgr::shutdown_token(); + tokio::select! { + _ = cancel.cancelled() => { return Ok(()) }, + _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {} + }; + ingest_housekeeping_loop(tenant, cancel) + .instrument(info_span!("ingest_housekeeping_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug())) + .await; + Ok(()) + } + }, + ); } /// @@ -379,6 +407,61 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); } +async fn ingest_housekeeping_loop(tenant: Arc, cancel: CancellationToken) { + TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); + async { + loop { + tokio::select! 
{ + _ = cancel.cancelled() => { + return; + }, + tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result { + ControlFlow::Break(()) => return, + ControlFlow::Continue(()) => (), + }, + } + + // We run ingest housekeeping with the same frequency as compaction: it is not worth + // having a distinct setting. But we don't run it in the same task, because compaction + // blocks on acquiring the background job semaphore. + let period = tenant.get_compaction_period(); + + // If compaction period is set to zero (to disable it), then we will use a reasonable default + let period = if period == Duration::ZERO { + humantime::Duration::from_str(DEFAULT_COMPACTION_PERIOD) + .unwrap() + .into() + } else { + period + }; + + // Jitter the period by +/- 5% + let period = + rand::thread_rng().gen_range((period * (95)) / 100..(period * (105)) / 100); + + // Always sleep first: we do not need to do ingest housekeeping early in the lifetime of + // a tenant, since it won't have started writing any ephemeral files yet. + if tokio::time::timeout(period, cancel.cancelled()) + .await + .is_ok() + { + break; + } + + let started_at = Instant::now(); + tenant.ingest_housekeeping().await; + + warn_when_period_overrun( + started_at.elapsed(), + period, + BackgroundLoopKind::IngestHouseKeeping, + ); + } + } + .await; + TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); +} + async fn wait_for_active_tenant(tenant: &Arc) -> ControlFlow<()> { // if the tenant has a proper status already, no need to wait for anything if tenant.current_state() == TenantState::Active { @@ -420,8 +503,6 @@ pub(crate) async fn random_init_delay( period: Duration, cancel: &CancellationToken, ) -> Result<(), Cancelled> { - use rand::Rng; - if period == Duration::ZERO { return Ok(()); } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index c7a5598cec..3748036e4f 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1498,11 +1498,11 @@ impl Timeline { self.flush_frozen_layers_and_wait(to_lsn).await } - /// If there is no writer, and conditions for rolling the latest layer are met, then freeze it. - /// - /// This is for use in background housekeeping, to provide guarantees of layers closing eventually - /// even if there are no ongoing writes to drive that. - async fn maybe_freeze_ephemeral_layer(&self) { + // Check if an open ephemeral layer should be closed: this provides + // background enforcement of checkpoint interval if there is no active WAL receiver, to avoid keeping + // an ephemeral layer open forever when idle. It also freezes layers if the global limit on + // ephemeral layer bytes has been breached. + pub(super) async fn maybe_freeze_ephemeral_layer(&self) { let Ok(_write_guard) = self.write_lock.try_lock() else { // If the write lock is held, there is an active wal receiver: rolling open layers // is their responsibility while they hold this lock. @@ -1529,13 +1529,11 @@ impl Timeline { // we are a sharded tenant and have skipped some WAL let last_freeze_ts = *self.last_freeze_ts.read().unwrap(); if last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() { - // This should be somewhat rare, so we log it at INFO level. 
- // - // We checked for checkpoint timeout so that a shard without any - // data ingested (yet) doesn't write a remote index as soon as it + // Only do this if have been layer-less longer than get_checkpoint_timeout, so that a shard + // without any data ingested (yet) doesn't write a remote index as soon as it // sees its LSN advance: we only do this if we've been layer-less // for some time. - tracing::info!( + tracing::debug!( "Advancing disk_consistent_lsn past WAL ingest gap {} -> {}", disk_consistent_lsn, last_record_lsn @@ -1625,11 +1623,6 @@ impl Timeline { (guard, permit) }; - // Prior to compaction, check if an open ephemeral layer should be closed: this provides - // background enforcement of checkpoint interval if there is no active WAL receiver, to avoid keeping - // an ephemeral layer open forever when idle. - self.maybe_freeze_ephemeral_layer().await; - // this wait probably never needs any "long time spent" logging, because we already nag if // compaction task goes over it's period (20s) which is quite often in production. let (_guard, _permit) = tokio::select! { From 67a2215163a0d93eb444c1993d4c4824592d1f12 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 6 May 2024 14:07:15 +0100 Subject: [PATCH 0701/1571] pageserver: label tenant_slots metric by slot type (#7603) ## Problem The current `tenant_slots` metric becomes less useful once we have lots of secondaries, because we can't tell how many tenants are really attached (without doing a sum() on some other metric). ## Summary of changes - Add a `mode` label to this metric - Update the metric with `slot_added` and `slot_removed` helpers that are called at all the places we mutate the tenants map. - Add a debug assertion at shutdown that checks the metrics add up to the right number, as a cheap way of validating that we're calling the metric hooks in all the right places. --- pageserver/src/metrics.rs | 82 +++++++++++++++---- pageserver/src/tenant/delete.rs | 17 +++- pageserver/src/tenant/mgr.rs | 36 ++++++-- .../regress/test_pageserver_restart.py | 10 ++- test_runner/regress/test_s3_restore.py | 8 +- test_runner/regress/test_tenant_delete.py | 17 ++-- 6 files changed, 131 insertions(+), 39 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 903bad34cc..40712e4895 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1512,29 +1512,80 @@ static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy = Lazy }); pub(crate) struct TenantManagerMetrics { - pub(crate) tenant_slots: UIntGauge, + tenant_slots_attached: UIntGauge, + tenant_slots_secondary: UIntGauge, + tenant_slots_inprogress: UIntGauge, pub(crate) tenant_slot_writes: IntCounter, pub(crate) unexpected_errors: IntCounter, } +impl TenantManagerMetrics { + /// Helpers for tracking slots. Note that these do not track the lifetime of TenantSlot objects + /// exactly: they track the lifetime of the slots _in the tenant map_. 
+ pub(crate) fn slot_inserted(&self, slot: &TenantSlot) { + match slot { + TenantSlot::Attached(_) => { + self.tenant_slots_attached.inc(); + } + TenantSlot::Secondary(_) => { + self.tenant_slots_secondary.inc(); + } + TenantSlot::InProgress(_) => { + self.tenant_slots_inprogress.inc(); + } + } + } + + pub(crate) fn slot_removed(&self, slot: &TenantSlot) { + match slot { + TenantSlot::Attached(_) => { + self.tenant_slots_attached.dec(); + } + TenantSlot::Secondary(_) => { + self.tenant_slots_secondary.dec(); + } + TenantSlot::InProgress(_) => { + self.tenant_slots_inprogress.dec(); + } + } + } + + #[cfg(all(debug_assertions, not(test)))] + pub(crate) fn slots_total(&self) -> u64 { + self.tenant_slots_attached.get() + + self.tenant_slots_secondary.get() + + self.tenant_slots_inprogress.get() + } +} + pub(crate) static TENANT_MANAGER: Lazy = Lazy::new(|| { - TenantManagerMetrics { - tenant_slots: register_uint_gauge!( + let tenant_slots = register_uint_gauge_vec!( "pageserver_tenant_manager_slots", "How many slots currently exist, including all attached, secondary and in-progress operations", + &["mode"] ) - .expect("failed to define a metric"), - tenant_slot_writes: register_int_counter!( - "pageserver_tenant_manager_slot_writes", - "Writes to a tenant slot, including all of create/attach/detach/delete" - ) - .expect("failed to define a metric"), - unexpected_errors: register_int_counter!( - "pageserver_tenant_manager_unexpected_errors_total", - "Number of unexpected conditions encountered: nonzero value indicates a non-fatal bug." - ) - .expect("failed to define a metric"), -} + .expect("failed to define a metric"); + TenantManagerMetrics { + tenant_slots_attached: tenant_slots + .get_metric_with_label_values(&["attached"]) + .unwrap(), + tenant_slots_secondary: tenant_slots + .get_metric_with_label_values(&["secondary"]) + .unwrap(), + tenant_slots_inprogress: tenant_slots + .get_metric_with_label_values(&["inprogress"]) + .unwrap(), + tenant_slot_writes: register_int_counter!( + "pageserver_tenant_manager_slot_writes", + "Writes to a tenant slot, including all of create/attach/detach/delete" + ) + .expect("failed to define a metric"), + unexpected_errors: register_int_counter!( + "pageserver_tenant_manager_unexpected_errors_total", + "Number of unexpected conditions encountered: nonzero value indicates a non-fatal bug." + ) + .expect("failed to define a metric"), + } }); pub(crate) struct DeletionQueueMetrics { @@ -2275,6 +2326,7 @@ use std::time::{Duration, Instant}; use crate::context::{PageContentKind, RequestContext}; use crate::task_mgr::TaskKind; +use crate::tenant::mgr::TenantSlot; /// Maintain a per timeline gauge in addition to the global gauge. struct PerTimelineRemotePhysicalSizeGauge { diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index 33d0f677e5..2e5259bfe2 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -585,9 +585,20 @@ impl DeleteTenantFlow { // FIXME: we should not be modifying this from outside of mgr.rs. 
// This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080) - crate::metrics::TENANT_MANAGER - .tenant_slots - .set(locked.len() as u64); + + // Update stats + match &removed { + TenantsMapRemoveResult::Occupied(slot) => { + crate::metrics::TENANT_MANAGER.slot_removed(slot); + } + TenantsMapRemoveResult::InProgress(barrier) => { + crate::metrics::TENANT_MANAGER + .slot_removed(&TenantSlot::InProgress(barrier.clone())); + } + TenantsMapRemoveResult::Vacant => { + // Nothing changed in map, no metric update + } + } match removed { TenantsMapRemoveResult::Occupied(TenantSlot::Attached(tenant)) => { diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 006d501daa..22173c6b5a 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -246,6 +246,7 @@ impl TenantsMap { } } + #[cfg(all(debug_assertions, not(test)))] pub(crate) fn len(&self) -> usize { match self { TenantsMap::Initializing => 0, @@ -746,6 +747,7 @@ pub async fn init_tenant_mgr( } }; + METRICS.slot_inserted(&slot); tenants.insert(tenant_shard_id, slot); } @@ -753,7 +755,7 @@ pub async fn init_tenant_mgr( let mut tenants_map = TENANTS.write().unwrap(); assert!(matches!(&*tenants_map, &TenantsMap::Initializing)); - METRICS.tenant_slots.set(tenants.len() as u64); + *tenants_map = TenantsMap::Open(tenants); Ok(TenantManager { @@ -824,6 +826,14 @@ fn tenant_spawn( async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { let mut join_set = JoinSet::new(); + #[cfg(all(debug_assertions, not(test)))] + { + // Check that our metrics properly tracked the size of the tenants map. This is a convenient location to check, + // as it happens implicitly at the end of tests etc. + let m = tenants.read().unwrap(); + debug_assert_eq!(METRICS.slots_total(), m.len() as u64); + } + // Atomically, 1. create the shutdown tasks and 2. prevent creation of new tenants. 
let (total_in_progress, total_attached) = { let mut m = tenants.write().unwrap(); @@ -2428,10 +2438,13 @@ impl SlotGuard { TenantsMap::Open(m) => m, }; + METRICS.slot_inserted(&new_value); + let replaced = m.insert(self.tenant_shard_id, new_value); self.upserted = true; - - METRICS.tenant_slots.set(m.len() as u64); + if let Some(replaced) = replaced.as_ref() { + METRICS.slot_removed(replaced); + } replaced }; @@ -2541,9 +2554,13 @@ impl Drop for SlotGuard { } if self.old_value_is_shutdown() { + METRICS.slot_removed(entry.get()); entry.remove(); } else { - entry.insert(self.old_value.take().unwrap()); + let inserting = self.old_value.take().unwrap(); + METRICS.slot_inserted(&inserting); + let replaced = entry.insert(inserting); + METRICS.slot_removed(&replaced); } } Entry::Vacant(_) => { @@ -2554,8 +2571,6 @@ impl Drop for SlotGuard { ); } } - - METRICS.tenant_slots.set(m.len() as u64); } } @@ -2635,7 +2650,9 @@ fn tenant_map_acquire_slot_impl( } _ => { let (completion, barrier) = utils::completion::channel(); - v.insert(TenantSlot::InProgress(barrier)); + let inserting = TenantSlot::InProgress(barrier); + METRICS.slot_inserted(&inserting); + v.insert(inserting); tracing::debug!("Vacant, inserted InProgress"); Ok(SlotGuard::new(*tenant_shard_id, None, completion)) } @@ -2671,7 +2688,10 @@ fn tenant_map_acquire_slot_impl( _ => { // Happy case: the slot was not in any state that violated our mode let (completion, barrier) = utils::completion::channel(); - let old_value = o.insert(TenantSlot::InProgress(barrier)); + let in_progress = TenantSlot::InProgress(barrier); + METRICS.slot_inserted(&in_progress); + let old_value = o.insert(in_progress); + METRICS.slot_removed(&old_value); tracing::debug!("Occupied, replaced with InProgress"); Ok(SlotGuard::new( *tenant_shard_id, diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index 753898f747..759e845927 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -20,7 +20,10 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): endpoint = env.endpoints.create_start("main") pageserver_http = env.pageserver.http_client() - assert pageserver_http.get_metric_value("pageserver_tenant_manager_slots") == 1 + assert ( + pageserver_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) + == 1 + ) pg_conn = endpoint.connect() cur = pg_conn.cursor() @@ -55,7 +58,10 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): env.pageserver.start() # We reloaded our tenant - assert pageserver_http.get_metric_value("pageserver_tenant_manager_slots") == 1 + assert ( + pageserver_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) + == 1 + ) cur.execute("SELECT count(*) FROM foo") assert cur.fetchone() == (100000,) diff --git a/test_runner/regress/test_s3_restore.py b/test_runner/regress/test_s3_restore.py index 611bd1c2a2..9227836862 100644 --- a/test_runner/regress/test_s3_restore.py +++ b/test_runner/regress/test_s3_restore.py @@ -47,7 +47,7 @@ def test_tenant_s3_restore( tenant_id = env.initial_tenant # Default tenant and the one we created - assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1 + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 # create two timelines one being the parent of another, both with non-trivial data parent = None @@ -72,13 +72,13 @@ def test_tenant_s3_restore( time.sleep(4) assert ( - 
ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1 + ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 ), "tenant removed before we deletion was issued" iterations = poll_for_remote_storage_iterations(remote_storage_kind) tenant_delete_wait_completed(ps_http, tenant_id, iterations) ps_http.deletion_queue_flush(execute=True) assert ( - ps_http.get_metric_value("pageserver_tenant_manager_slots") == 0 + ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 0 ), "tenant removed before we deletion was issued" env.storage_controller.attach_hook_drop(tenant_id) @@ -116,4 +116,4 @@ def test_tenant_s3_restore( # There might be some activity that advances the lsn so we can't use a strict equality check assert last_flush_lsn >= expected_last_flush_lsn, "last_flush_lsn too old" - assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1 + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index c115c0375b..363c3c88ec 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -64,7 +64,7 @@ def test_tenant_delete_smoke( ) # Default tenant and the one we created - assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 2 + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 2 # create two timelines one being the parent of another parent = None @@ -90,9 +90,9 @@ def test_tenant_delete_smoke( iterations = poll_for_remote_storage_iterations(remote_storage_kind) - assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 2 + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 2 tenant_delete_wait_completed(ps_http, tenant_id, iterations) - assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1 + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 tenant_path = env.pageserver.tenant_dir(tenant_id) assert not tenant_path.exists() @@ -108,7 +108,7 @@ def test_tenant_delete_smoke( ) # Deletion updates the tenant count: the one default tenant remains - assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1 + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 class Check(enum.Enum): @@ -532,7 +532,9 @@ def test_tenant_delete_concurrent( # The TenantSlot is still present while the original request is hung before # final removal - assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1 + assert ( + ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 + ) # Permit the original request to run to success ps_http.configure_failpoints((BEFORE_REMOVE_FAILPOINT, "off")) @@ -556,7 +558,8 @@ def test_tenant_delete_concurrent( ) # Zero tenants remain (we deleted the default tenant) - assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 0 + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 0 + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "inprogress"}) == 0 def test_tenant_delete_races_timeline_creation( @@ -673,7 +676,7 @@ def test_tenant_delete_races_timeline_creation( ) # Zero tenants remain (we deleted the default tenant) - assert 
ps_http.get_metric_value("pageserver_tenant_manager_slots") == 0 + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 0 def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder): From 69337be5c2d8547d4109e4e0ff7b07fcd4d36229 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 6 May 2024 09:14:42 -0500 Subject: [PATCH 0702/1571] Fix grammar in provider.rs error message s/temporary/temporarily --------- Co-authored-by: Barry Grenon --- proxy/src/console/provider.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index dfda29e0b1..a05cf248f6 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -76,7 +76,7 @@ pub mod errors { } http::StatusCode::LOCKED | http::StatusCode::UNPROCESSABLE_ENTITY => { // Status 423: project might be in maintenance mode (or bad state), or quotas exceeded. - format!("{REQUEST_FAILED}: endpoint is temporary unavailable. check your quotas and/or contact our support") + format!("{REQUEST_FAILED}: endpoint is temporarily unavailable. Check your quotas and/or contact our support.") } _ => REQUEST_FAILED.to_owned(), }, From df1def70183f0deb416e68b427e933724c950f9e Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 6 May 2024 16:40:44 +0200 Subject: [PATCH 0703/1571] refactor(pageserver): remove --update-init flag (#7612) We don't actually use it. refs https://github.com/neondatabase/neon/issues/7555 --- control_plane/src/pageserver.rs | 13 +--- pageserver/src/bin/pageserver.rs | 73 ++++++++++------------ test_runner/regress/test_pageserver_api.py | 7 +-- 3 files changed, 36 insertions(+), 57 deletions(-) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 1a64391306..c0a366e3b9 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -198,7 +198,7 @@ impl PageServerNode { } pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> { - self.start_node(config_overrides, false).await + self.start_node(config_overrides).await } fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> { @@ -262,11 +262,7 @@ impl PageServerNode { Ok(()) } - async fn start_node( - &self, - config_overrides: &[&str], - update_config: bool, - ) -> anyhow::Result<()> { + async fn start_node(&self, config_overrides: &[&str]) -> anyhow::Result<()> { // TODO: using a thread here because start_process() is not async but we need to call check_status() let datadir = self.repo_path(); print!( @@ -283,10 +279,7 @@ impl PageServerNode { self.conf.id, datadir, ) })?; - let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str); - if update_config { - args.push(Cow::Borrowed("--update-config")); - } + let args = self.pageserver_basic_args(config_overrides, datadir_path_str); background_process::start_process( "pageserver", &datadir, diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 1345223a43..e9433de05b 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -3,6 +3,7 @@ //! Main entry point for the Page Server executable. 
use std::env::{var, VarError}; +use std::io::Read; use std::sync::Arc; use std::time::Duration; use std::{env, ops::ControlFlow, str::FromStr}; @@ -151,37 +152,34 @@ fn initialize_config( workdir: &Utf8Path, ) -> anyhow::Result> { let init = arg_matches.get_flag("init"); - let update_config = init || arg_matches.get_flag("update-config"); - let (mut toml, config_file_exists) = if cfg_file_path.is_file() { - if init { - anyhow::bail!( - "Config file '{cfg_file_path}' already exists, cannot init it, use --update-config to update it", - ); + let file_contents: Option = match std::fs::File::open(cfg_file_path) { + Ok(mut f) => { + if init { + anyhow::bail!("config file already exists: {cfg_file_path}"); + } + let md = f.metadata().context("stat config file")?; + if md.is_file() { + let mut s = String::new(); + f.read_to_string(&mut s).context("read config file")?; + Some(s.parse().context("parse config file toml")?) + } else { + anyhow::bail!("directory entry exists but is not a file: {cfg_file_path}"); + } + } + Err(e) if e.kind() == std::io::ErrorKind::NotFound => None, + Err(e) => { + anyhow::bail!("open pageserver config: {e}: {cfg_file_path}"); } - // Supplement the CLI arguments with the config file - let cfg_file_contents = std::fs::read_to_string(cfg_file_path) - .with_context(|| format!("Failed to read pageserver config at '{cfg_file_path}'"))?; - ( - cfg_file_contents - .parse::() - .with_context(|| { - format!("Failed to parse '{cfg_file_path}' as pageserver config") - })?, - true, - ) - } else if cfg_file_path.exists() { - anyhow::bail!("Config file '{cfg_file_path}' exists but is not a regular file"); - } else { - // We're initializing the tenant, so there's no config file yet - ( - DEFAULT_CONFIG_FILE - .parse::() - .context("could not parse built-in config file")?, - false, - ) }; + let mut effective_config = file_contents.unwrap_or_else(|| { + DEFAULT_CONFIG_FILE + .parse() + .expect("unit tests ensure this works") + }); + + // Patch with overrides from the command line if let Some(values) = arg_matches.get_many::("config-override") { for option_line in values { let doc = toml_edit::Document::from_str(option_line).with_context(|| { @@ -189,22 +187,21 @@ fn initialize_config( })?; for (key, item) in doc.iter() { - if config_file_exists && update_config && key == "id" && toml.contains_key(key) { - anyhow::bail!("Pageserver config file exists at '{cfg_file_path}' and has node id already, it cannot be overridden"); - } - toml.insert(key, item.clone()); + effective_config.insert(key, item.clone()); } } } - debug!("Resulting toml: {toml}"); - let conf = PageServerConf::parse_and_validate(&toml, workdir) + debug!("Resulting toml: {effective_config}"); + + // Construct the runtime representation + let conf = PageServerConf::parse_and_validate(&effective_config, workdir) .context("Failed to parse pageserver configuration")?; - if update_config { + if init { info!("Writing pageserver config to '{cfg_file_path}'"); - std::fs::write(cfg_file_path, toml.to_string()) + std::fs::write(cfg_file_path, effective_config.to_string()) .with_context(|| format!("Failed to write pageserver config to '{cfg_file_path}'"))?; info!("Config successfully written to '{cfg_file_path}'") } @@ -764,12 +761,6 @@ fn cli() -> Command { .help("Additional configuration overrides of the ones from the toml config file (or new ones to add there). 
\ Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"), ) - .arg( - Arg::new("update-config") - .long("update-config") - .action(ArgAction::SetTrue) - .help("Update the config file when started"), - ) .arg( Arg::new("enabled-features") .long("enabled-features") diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index 81aed704bb..be351db429 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -12,7 +12,6 @@ from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import wait_until -# test that we cannot override node id after init def test_pageserver_init_node_id( neon_simple_env: NeonEnv, neon_binpath: Path, pg_distrib_dir: Path ): @@ -49,11 +48,7 @@ def test_pageserver_init_node_id( bad_reinit = run_pageserver(good_init_cmd) assert bad_reinit.returncode == 1, "pageserver refuses to init if already exists" - assert "already exists, cannot init it" in bad_reinit.stderr - - bad_update = run_pageserver(["--update-config", "-c", "id = 3"]) - assert bad_update.returncode == 1, "pageserver should not allow updating node id" - assert "has node id already, it cannot be overridden" in bad_update.stderr + assert "config file already exists" in bad_reinit.stderr def check_client(env: NeonEnv, client: PageserverHttpClient): From a96e15cb6b1dfd15bd6d2bc5dbe1b353cb811efe Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 6 May 2024 18:52:51 +0300 Subject: [PATCH 0704/1571] test: less flaky test_synthetic_size_while_deleting (#7622) #7585 introduced test case for deletions while synthetic size is being calculated. The test has a race against deletion, but we only accept one outcome. Fix it to accept 404 as well, as we cannot control from outside which outcome happens. Evidence: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-7456/8970595458/index.html#/testresult/32a5b2f8c4094bdb --- test_runner/regress/test_tenant_size.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 53da548524..e73eae91f0 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -668,9 +668,9 @@ def test_synthetic_size_while_deleting(neon_env_builder: NeonEnvBuilder): client.configure_failpoints((failpoint, "off")) - with pytest.raises( - PageserverApiException, match="Failed to refresh gc_info before gathering inputs" - ): + # accept both, because the deletion might still complete before + matcher = "(Failed to refresh gc_info before gathering inputs|NotFound: tenant)" + with pytest.raises(PageserverApiException, match=matcher): completion.result() # this happens on both cases From f3af5f4660b519ff4fa1530e3b82b0fd96988b49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 6 May 2024 18:41:51 +0200 Subject: [PATCH 0705/1571] Fix test_ts_of_lsn_api flakiness (#7599) Changes parameters to fix the flakiness of `test_ts_of_lsn_api`. Already now, the amount of flakiness of the test is pretty low. With this, it's even lower. 
cc #5768 --- test_runner/fixtures/pageserver/allowed_errors.py | 2 ++ test_runner/regress/test_lsn_mapping.py | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index 8b895dcd92..e560844944 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -89,6 +89,8 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( # During teardown, we stop the storage controller before the pageservers, so pageservers # can experience connection errors doing background deletion queue work. ".*WARN deletion backend: calling control plane generation validation API failed.*Connection refused.*", + # Can happen when the test shuts down the storage controller while it is calling the utilization API + ".*WARN.*path=/v1/utilization .*request was dropped before completing", ) diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index 5c99ca6733..225622868d 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -119,11 +119,11 @@ def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder): cur = endpoint_main.connect().cursor() # Create table, and insert rows, each in a separate transaction - # Disable synchronous_commit to make this initialization go faster. + # Enable synchronous commit as we are timing sensitive # # Each row contains current insert LSN and the current timestamp, when # the row was inserted. - cur.execute("SET synchronous_commit=off") + cur.execute("SET synchronous_commit=on") cur.execute("CREATE TABLE foo (x integer)") tbl = [] for i in range(1000): @@ -132,7 +132,7 @@ def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder): after_timestamp = query_scalar(cur, "SELECT clock_timestamp()").replace(tzinfo=timezone.utc) after_lsn = query_scalar(cur, "SELECT pg_current_wal_lsn()") tbl.append([i, after_timestamp, after_lsn]) - time.sleep(0.005) + time.sleep(0.02) # Execute one more transaction with synchronous_commit enabled, to flush # all the previous transactions From 7dd58e1449ae13644e8234d0e120c1a125ff7f3f Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Mon, 6 May 2024 10:54:07 -0700 Subject: [PATCH 0706/1571] On-demand WAL download for walsender (#6872) ## Problem There's allegedly a bug where if we connect a subscriber before WAL is downloaded from the safekeeper, it creates an error. ## Summary of changes Adds support for pausing safekeepers from sending WAL to computes, and then creates a compute and attaches a subscriber while it's in this paused state. 
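The pause is implemented with a new `sk-pause-send` failpoint on the safekeeper send path. A minimal sketch of how the new test drives it (the wrapper function here is only for illustration; the failpoint name and fixture calls are the ones used by `test_lr_with_slow_safekeeper` below):

```python
# Illustration of the pause/resume step in the new test: toggle the
# sk-pause-send failpoint on every safekeeper via its HTTP API.
def set_safekeeper_wal_send_paused(env, paused: bool):
    action = "return" if paused else "off"
    for sk in env.safekeepers:
        # The failpoint skips connections whose appname is "pageserver",
        # so the pageserver keeps receiving WAL while computes and
        # logical subscribers are starved.
        sk.http_client().configure_failpoints([("sk-pause-send", action)])
```

While the failpoint is active, a freshly started compute can still serve reads (its WAL reaches the pageserver as usual), but a logical walsender that needs older WAL has to wait for the on-demand download added in this patch.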
Fails to reproduce the issue, but probably a good test to have --------- Co-authored-by: Arseny Sher --- libs/walproposer/src/api_bindings.rs | 35 ++++ libs/walproposer/src/walproposer.rs | 24 ++- pgxn/neon/Makefile | 3 +- pgxn/neon/neon.c | 3 +- pgxn/neon/neon_walreader.c | 77 +++++--- pgxn/neon/neon_walreader.h | 9 +- pgxn/neon/walproposer.c | 75 +++++--- pgxn/neon/walproposer.h | 33 ++-- pgxn/neon/walproposer_pg.c | 175 ++++++------------ pgxn/neon/walsender_hooks.c | 172 +++++++++++++++++ pgxn/neon/walsender_hooks.h | 7 + safekeeper/src/send_wal.rs | 105 ++++++----- .../tests/walproposer_sim/walproposer_api.rs | 29 +-- .../regress/test_logical_replication.py | 76 ++++++++ 14 files changed, 573 insertions(+), 250 deletions(-) create mode 100644 pgxn/neon/walsender_hooks.c create mode 100644 pgxn/neon/walsender_hooks.h diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs index 906302e46e..bbc3663402 100644 --- a/libs/walproposer/src/api_bindings.rs +++ b/libs/walproposer/src/api_bindings.rs @@ -50,6 +50,14 @@ extern "C" fn get_flush_rec_ptr(wp: *mut WalProposer) -> XLogRecPtr { } } +extern "C" fn update_donor(wp: *mut WalProposer, donor: *mut Safekeeper, donor_lsn: XLogRecPtr) { + unsafe { + let callback_data = (*(*wp).config).callback_data; + let api = callback_data as *mut Box; + (*api).update_donor(&mut (*donor), donor_lsn) + } +} + extern "C" fn get_current_timestamp(wp: *mut WalProposer) -> TimestampTz { unsafe { let callback_data = (*(*wp).config).callback_data; @@ -391,6 +399,7 @@ pub(crate) fn create_api() -> walproposer_api { get_shmem_state: Some(get_shmem_state), start_streaming: Some(start_streaming), get_flush_rec_ptr: Some(get_flush_rec_ptr), + update_donor: Some(update_donor), get_current_timestamp: Some(get_current_timestamp), conn_error_message: Some(conn_error_message), conn_status: Some(conn_status), @@ -421,6 +430,32 @@ pub(crate) fn create_api() -> walproposer_api { } } +pub fn empty_shmem() -> crate::bindings::WalproposerShmemState { + let empty_feedback = crate::bindings::PageserverFeedback { + present: false, + currentClusterSize: 0, + last_received_lsn: 0, + disk_consistent_lsn: 0, + remote_consistent_lsn: 0, + replytime: 0, + shard_number: 0, + }; + + crate::bindings::WalproposerShmemState { + propEpochStartLsn: crate::bindings::pg_atomic_uint64 { value: 0 }, + donor_name: [0; 64], + donor_conninfo: [0; 1024], + donor_lsn: 0, + mutex: 0, + mineLastElectedTerm: crate::bindings::pg_atomic_uint64 { value: 0 }, + backpressureThrottlingTime: crate::bindings::pg_atomic_uint64 { value: 0 }, + currentClusterSize: crate::bindings::pg_atomic_uint64 { value: 0 }, + shard_ps_feedback: [empty_feedback; 128], + num_shards: 0, + min_ps_feedback: empty_feedback, + } +} + impl std::fmt::Display for Level { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { write!(f, "{:?}", self) diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs index 14cc3e05a2..fb815607a7 100644 --- a/libs/walproposer/src/walproposer.rs +++ b/libs/walproposer/src/walproposer.rs @@ -1,8 +1,5 @@ use std::ffi::CString; -use postgres_ffi::WAL_SEGMENT_SIZE; -use utils::{id::TenantTimelineId, lsn::Lsn}; - use crate::{ api_bindings::{create_api, take_vec_u8, Level}, bindings::{ @@ -10,6 +7,8 @@ use crate::{ WalProposerCreate, WalProposerFree, WalProposerPoll, WalProposerStart, }, }; +use postgres_ffi::WAL_SEGMENT_SIZE; +use utils::{id::TenantTimelineId, lsn::Lsn}; /// Rust high-level wrapper for C walproposer API. 
Many methods are not required /// for simple cases, hence todo!() in default implementations. @@ -28,6 +27,10 @@ pub trait ApiImpl { todo!() } + fn update_donor(&self, _donor: &mut Safekeeper, _donor_lsn: u64) { + todo!() + } + fn get_current_timestamp(&self) -> i64 { todo!() } @@ -274,6 +277,7 @@ mod tests { sync::{atomic::AtomicUsize, mpsc::sync_channel}, }; + use std::cell::UnsafeCell; use utils::id::TenantTimelineId; use crate::{api_bindings::Level, bindings::NeonWALReadResult, walproposer::Wrapper}; @@ -297,6 +301,8 @@ mod tests { replies_ptr: AtomicUsize, // channel to send LSN to the main thread sync_channel: std::sync::mpsc::SyncSender, + // Shmem state, used for storing donor info + shmem: UnsafeCell, } impl MockImpl { @@ -327,11 +333,22 @@ mod tests { } impl ApiImpl for MockImpl { + fn get_shmem_state(&self) -> *mut crate::bindings::WalproposerShmemState { + self.shmem.get() + } + fn get_current_timestamp(&self) -> i64 { println!("get_current_timestamp"); 0 } + fn update_donor(&self, donor: &mut crate::bindings::Safekeeper, donor_lsn: u64) { + let mut shmem = unsafe { *self.get_shmem_state() }; + shmem.propEpochStartLsn.value = donor_lsn; + shmem.donor_conninfo = donor.conninfo; + shmem.donor_lsn = donor_lsn; + } + fn conn_status( &self, _: &mut crate::bindings::Safekeeper, @@ -507,6 +524,7 @@ mod tests { ], replies_ptr: AtomicUsize::new(0), sync_channel: sender, + shmem: UnsafeCell::new(crate::api_bindings::empty_shmem()), }); let config = crate::walproposer::Config { ttid, diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index 0bcb9545a6..cd316dbb91 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -14,7 +14,8 @@ OBJS = \ relsize_cache.o \ walproposer.o \ walproposer_pg.o \ - control_plane_connector.o + control_plane_connector.o \ + walsender_hooks.o PG_CPPFLAGS = -I$(libpq_srcdir) SHLIB_LINK_INTERNAL = $(libpq) diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 8d236144b5..b69a3819c9 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -34,6 +34,7 @@ #include "walproposer.h" #include "pagestore_client.h" #include "control_plane_connector.h" +#include "walsender_hooks.h" PG_MODULE_MAGIC; void _PG_init(void); @@ -265,7 +266,6 @@ LogicalSlotsMonitorMain(Datum main_arg) } } - void _PG_init(void) { @@ -279,6 +279,7 @@ _PG_init(void) pg_init_libpagestore(); pg_init_walproposer(); + WalSender_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; InitLogicalReplicationMonitor(); diff --git a/pgxn/neon/neon_walreader.c b/pgxn/neon/neon_walreader.c index f7ec9e5bfa..e43f4d9d96 100644 --- a/pgxn/neon/neon_walreader.c +++ b/pgxn/neon/neon_walreader.c @@ -36,10 +36,7 @@ static NeonWALReadResult NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli); static NeonWALReadResult NeonWALReaderReadMsg(NeonWALReader *state); -static void NeonWALReaderResetRemote(NeonWALReader *state); static bool NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli); -static bool neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo, TimeLineID *tli_p); -static void neon_wal_segment_close(NeonWALReader *state); static bool is_wal_segment_exists(XLogSegNo segno, int segsize, TimeLineID tli); @@ -82,8 +79,9 @@ struct NeonWALReader XLogRecPtr req_lsn; Size req_len; Size req_progress; - WalProposer *wp; /* we learn donor through walproposer */ + char donor_conninfo[MAXCONNINFO]; char donor_name[64]; /* saved donor safekeeper name for logging */ + XLogRecPtr donor_lsn; /* state of connection to 
safekeeper */ NeonWALReaderRemoteState rem_state; WalProposerConn *wp_conn; @@ -107,7 +105,7 @@ struct NeonWALReader /* palloc and initialize NeonWALReader */ NeonWALReader * -NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalProposer *wp, char *log_prefix) +NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_prefix) { NeonWALReader *reader; @@ -123,8 +121,6 @@ NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalPropose reader->seg.ws_tli = 0; reader->segcxt.ws_segsize = wal_segment_size; - reader->wp = wp; - reader->rem_state = RS_NONE; if (log_prefix) @@ -204,21 +200,16 @@ NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size cou { if (state->rem_state == RS_NONE) { - XLogRecPtr donor_lsn; - - /* no connection yet; start one */ - Safekeeper *donor = GetDonor(state->wp, &donor_lsn); - - if (donor == NULL) + if (!NeonWALReaderUpdateDonor(state)) { snprintf(state->err_msg, sizeof(state->err_msg), "failed to establish remote connection to fetch WAL: no donor available"); return NEON_WALREAD_ERROR; + } - snprintf(state->donor_name, sizeof(state->donor_name), "%s:%s", donor->host, donor->port); - nwr_log(LOG, "establishing connection to %s, flush_lsn %X/%X to fetch WAL", - state->donor_name, LSN_FORMAT_ARGS(donor_lsn)); - state->wp_conn = libpqwp_connect_start(donor->conninfo); + /* no connection yet; start one */ + nwr_log(LOG, "establishing connection to %s, lsn=%X/%X to fetch WAL", state->donor_name, LSN_FORMAT_ARGS(state->donor_lsn)); + state->wp_conn = libpqwp_connect_start(state->donor_conninfo); if (PQstatus(state->wp_conn->pg_conn) == CONNECTION_BAD) { snprintf(state->err_msg, sizeof(state->err_msg), @@ -251,10 +242,22 @@ NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size cou { /* connection successfully established */ char start_repl_query[128]; + term_t term = pg_atomic_read_u64(&GetWalpropShmemState()->mineLastElectedTerm); + /* + * Set elected walproposer's term to pull only data from + * its history. Note: for logical walsender it means we + * might stream WAL not yet committed by safekeepers. It + * would be cleaner to fix this. + * + * mineLastElectedTerm shouldn't be 0 at this point + * because we checked above that donor exists and it + * appears only after successfull election. + */ + Assert(term > 0); snprintf(start_repl_query, sizeof(start_repl_query), "START_REPLICATION PHYSICAL %X/%X (term='" UINT64_FORMAT "')", - LSN_FORMAT_ARGS(startptr), state->wp->propTerm); + LSN_FORMAT_ARGS(startptr), term); nwr_log(LOG, "connection to %s to fetch WAL succeeded, running %s", state->donor_name, start_repl_query); if (!libpqwp_send_query(state->wp_conn, start_repl_query)) @@ -404,6 +407,10 @@ NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size cou state->req_lsn = InvalidXLogRecPtr; state->req_len = 0; state->req_progress = 0; + + /* Update the current segment info. 
*/ + state->seg.ws_tli = tli; + return NEON_WALREAD_SUCCESS; } } @@ -526,7 +533,7 @@ err: } /* reset remote connection and request in progress */ -static void +void NeonWALReaderResetRemote(NeonWALReader *state) { state->req_lsn = InvalidXLogRecPtr; @@ -691,13 +698,25 @@ NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size coun return true; } +XLogRecPtr +NeonWALReaderGetRemLsn(NeonWALReader *state) +{ + return state->rem_lsn; +} + +const WALOpenSegment * +NeonWALReaderGetSegment(NeonWALReader *state) +{ + return &state->seg; +} + /* * Copy of vanilla wal_segment_open, but returns false in case of error instead * of ERROR, with errno set. * * XLogReaderRoutine->segment_open callback for local pg_wal files */ -static bool +bool neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo, TimeLineID *tli_p) { @@ -724,7 +743,7 @@ is_wal_segment_exists(XLogSegNo segno, int segsize, TimeLineID tli) } /* copy of vanilla wal_segment_close with NeonWALReader */ -static void +void neon_wal_segment_close(NeonWALReader *state) { if (state->seg.ws_file >= 0) @@ -740,3 +759,19 @@ NeonWALReaderErrMsg(NeonWALReader *state) { return state->err_msg; } + +/* + * Returns true if there is a donor, and false otherwise + */ +bool +NeonWALReaderUpdateDonor(NeonWALReader *state) +{ + WalproposerShmemState *wps = GetWalpropShmemState(); + + SpinLockAcquire(&wps->mutex); + memcpy(state->donor_name, wps->donor_name, sizeof(state->donor_name)); + memcpy(state->donor_conninfo, wps->donor_conninfo, sizeof(state->donor_conninfo)); + state->donor_lsn = wps->donor_lsn; + SpinLockRelease(&wps->mutex); + return state->donor_name[0] != '\0'; +} diff --git a/pgxn/neon/neon_walreader.h b/pgxn/neon/neon_walreader.h index 6be9f149aa..3e41825069 100644 --- a/pgxn/neon/neon_walreader.h +++ b/pgxn/neon/neon_walreader.h @@ -19,12 +19,19 @@ typedef enum NEON_WALREAD_ERROR, } NeonWALReadResult; -extern NeonWALReader *NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalProposer *wp, char *log_prefix); +extern NeonWALReader *NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_prefix); extern void NeonWALReaderFree(NeonWALReader *state); +extern void NeonWALReaderResetRemote(NeonWALReader *state); extern NeonWALReadResult NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli); extern pgsocket NeonWALReaderSocket(NeonWALReader *state); extern uint32 NeonWALReaderEvents(NeonWALReader *state); extern bool NeonWALReaderIsRemConnEstablished(NeonWALReader *state); extern char *NeonWALReaderErrMsg(NeonWALReader *state); +extern XLogRecPtr NeonWALReaderGetRemLsn(NeonWALReader *state); +extern const WALOpenSegment *NeonWALReaderGetSegment(NeonWALReader *state); +extern bool neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo, TimeLineID *tli_p); +extern void neon_wal_segment_close(NeonWALReader *state); +extern bool NeonWALReaderUpdateDonor(NeonWALReader *state); + #endif /* __NEON_WALREADER_H__ */ diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index d7987954d4..dbc67a24f5 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -80,7 +80,7 @@ static int CompareLsn(const void *a, const void *b); static char *FormatSafekeeperState(Safekeeper *sk); static void AssertEventsOkForState(uint32 events, Safekeeper *sk); static char *FormatEvents(WalProposer *wp, uint32 events); - +static void UpdateDonorShmem(WalProposer *wp); WalProposer * WalProposerCreate(WalProposerConfig *config, walproposer_api 
api) @@ -922,7 +922,8 @@ static void DetermineEpochStartLsn(WalProposer *wp) { TermHistory *dth; - int n_ready = 0; + int n_ready = 0; + WalproposerShmemState *walprop_shared; wp->propEpochStartLsn = InvalidXLogRecPtr; wp->donorEpoch = 0; @@ -964,16 +965,18 @@ DetermineEpochStartLsn(WalProposer *wp) if (n_ready < wp->quorum) { /* - * This is a rare case that can be triggered if safekeeper has voted and disconnected. - * In this case, its state will not be SS_IDLE and its vote cannot be used, because - * we clean up `voteResponse` in `ShutdownConnection`. + * This is a rare case that can be triggered if safekeeper has voted + * and disconnected. In this case, its state will not be SS_IDLE and + * its vote cannot be used, because we clean up `voteResponse` in + * `ShutdownConnection`. */ wp_log(FATAL, "missing majority of votes, collected %d, expected %d, got %d", wp->n_votes, wp->quorum, n_ready); } /* - * If propEpochStartLsn is 0, it means flushLsn is 0 everywhere, we are bootstrapping - * and nothing was committed yet. Start streaming then from the basebackup LSN. + * If propEpochStartLsn is 0, it means flushLsn is 0 everywhere, we are + * bootstrapping and nothing was committed yet. Start streaming then from + * the basebackup LSN. */ if (wp->propEpochStartLsn == InvalidXLogRecPtr && !wp->config->syncSafekeepers) { @@ -984,11 +987,12 @@ DetermineEpochStartLsn(WalProposer *wp) } wp_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn)); } + pg_atomic_write_u64(&wp->api.get_shmem_state(wp)->propEpochStartLsn, wp->propEpochStartLsn); /* - * Safekeepers are setting truncateLsn after timelineStartLsn is known, so it - * should never be zero at this point, if we know timelineStartLsn. - * + * Safekeepers are setting truncateLsn after timelineStartLsn is known, so + * it should never be zero at this point, if we know timelineStartLsn. + * * timelineStartLsn can be zero only on the first syncSafekeepers run. */ Assert((wp->truncateLsn != InvalidXLogRecPtr) || @@ -1022,10 +1026,9 @@ DetermineEpochStartLsn(WalProposer *wp) * since which we are going to write according to the consensus. If not, * we must bail out, as clog and other non rel data is inconsistent. */ + walprop_shared = wp->api.get_shmem_state(wp); if (!wp->config->syncSafekeepers) { - WalproposerShmemState *walprop_shared = wp->api.get_shmem_state(wp); - /* * Basebackup LSN always points to the beginning of the record (not * the page), as StartupXLOG most probably wants it this way. @@ -1040,7 +1043,7 @@ DetermineEpochStartLsn(WalProposer *wp) * compute (who could generate WAL) is ok. */ if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term == - walprop_shared->mineLastElectedTerm))) + pg_atomic_read_u64(&walprop_shared->mineLastElectedTerm)))) { /* * Panic to restart PG as we need to retake basebackup. 
@@ -1054,8 +1057,8 @@ DetermineEpochStartLsn(WalProposer *wp) LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn(wp))); } } - walprop_shared->mineLastElectedTerm = wp->propTerm; } + pg_atomic_write_u64(&walprop_shared->mineLastElectedTerm, wp->propTerm); } /* @@ -1105,9 +1108,13 @@ SendProposerElected(Safekeeper *sk) { /* safekeeper is empty or no common point, start from the beginning */ sk->startStreamingAt = wp->propTermHistory.entries[0].lsn; - wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, timelineStartLsn=%X/%X, termHistory.n_entries=%u" , - sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), LSN_FORMAT_ARGS(wp->timelineStartLsn), wp->propTermHistory.n_entries); - /* wp->timelineStartLsn == InvalidXLogRecPtr can be only when timeline is created manually (test_s3_wal_replay) */ + wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, timelineStartLsn=%X/%X, termHistory.n_entries=%u", + sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), LSN_FORMAT_ARGS(wp->timelineStartLsn), wp->propTermHistory.n_entries); + + /* + * wp->timelineStartLsn == InvalidXLogRecPtr can be only when timeline + * is created manually (test_s3_wal_replay) + */ Assert(sk->startStreamingAt == wp->timelineStartLsn || wp->timelineStartLsn == InvalidXLogRecPtr); } else @@ -1177,6 +1184,12 @@ StartStreaming(Safekeeper *sk) sk->active_state = SS_ACTIVE_SEND; sk->streamingAt = sk->startStreamingAt; + /* + * Donors can only be in SS_ACTIVE state, so we potentially update the + * donor when we switch one to SS_ACTIVE. + */ + UpdateDonorShmem(sk->wp); + /* event set will be updated inside SendMessageToNode */ SendMessageToNode(sk); } @@ -1568,17 +1581,17 @@ GetAcknowledgedByQuorumWALPosition(WalProposer *wp) * none if it doesn't exist. donor_lsn is set to end position of the donor to * the best of our knowledge. */ -Safekeeper * -GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn) +static void +UpdateDonorShmem(WalProposer *wp) { Safekeeper *donor = NULL; int i; - *donor_lsn = InvalidXLogRecPtr; + XLogRecPtr donor_lsn = InvalidXLogRecPtr; if (wp->n_votes < wp->quorum) { - wp_log(WARNING, "GetDonor called before elections are won"); - return NULL; + wp_log(WARNING, "UpdateDonorShmem called before elections are won"); + return; } /* @@ -1589,7 +1602,7 @@ GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn) if (wp->safekeeper[wp->donor].state >= SS_IDLE) { donor = &wp->safekeeper[wp->donor]; - *donor_lsn = wp->propEpochStartLsn; + donor_lsn = wp->propEpochStartLsn; } /* @@ -1601,13 +1614,19 @@ GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn) { Safekeeper *sk = &wp->safekeeper[i]; - if (sk->state == SS_ACTIVE && sk->appendResponse.flushLsn > *donor_lsn) + if (sk->state == SS_ACTIVE && sk->appendResponse.flushLsn > donor_lsn) { donor = sk; - *donor_lsn = sk->appendResponse.flushLsn; + donor_lsn = sk->appendResponse.flushLsn; } } - return donor; + + if (donor == NULL) + { + wp_log(WARNING, "UpdateDonorShmem didn't find a suitable donor, skipping"); + return; + } + wp->api.update_donor(wp, donor, donor_lsn); } /* @@ -1617,7 +1636,7 @@ static void HandleSafekeeperResponse(WalProposer *wp, Safekeeper *sk) { XLogRecPtr candidateTruncateLsn; - XLogRecPtr newCommitLsn; + XLogRecPtr newCommitLsn; newCommitLsn = GetAcknowledgedByQuorumWALPosition(wp); if (newCommitLsn > wp->commitLsn) @@ -1627,7 +1646,7 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *sk) BroadcastAppendRequest(wp); } - /* + /* * Unlock syncrep waiters, update ps_feedback, CheckGracefulShutdown(). 
* The last one will terminate the process if the shutdown is requested * and WAL is committed by the quorum. BroadcastAppendRequest() should be diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 69a557fdf2..41daeb87b9 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -284,14 +284,19 @@ typedef struct PageserverFeedback typedef struct WalproposerShmemState { + pg_atomic_uint64 propEpochStartLsn; + char donor_name[64]; + char donor_conninfo[MAXCONNINFO]; + XLogRecPtr donor_lsn; + slock_t mutex; - term_t mineLastElectedTerm; + pg_atomic_uint64 mineLastElectedTerm; pg_atomic_uint64 backpressureThrottlingTime; pg_atomic_uint64 currentClusterSize; /* last feedback from each shard */ PageserverFeedback shard_ps_feedback[MAX_SHARDS]; - int num_shards; + int num_shards; /* aggregated feedback with min LSNs across shards */ PageserverFeedback min_ps_feedback; @@ -465,6 +470,9 @@ typedef struct walproposer_api /* Get pointer to the latest available WAL. */ XLogRecPtr (*get_flush_rec_ptr) (WalProposer *wp); + /* Update current donor info in WalProposer Shmem */ + void (*update_donor) (WalProposer *wp, Safekeeper *donor, XLogRecPtr donor_lsn); + /* Get current time. */ TimestampTz (*get_current_timestamp) (WalProposer *wp); @@ -497,7 +505,7 @@ typedef struct walproposer_api * * On success, the data is placed in *buf. It is valid until the next call * to this function. - * + * * Returns PG_ASYNC_READ_FAIL on closed connection. */ PGAsyncReadResult (*conn_async_read) (Safekeeper *sk, char **buf, int *amount); @@ -545,13 +553,14 @@ typedef struct walproposer_api * Returns 0 if timeout is reached, 1 if some event happened. Updates * events mask to indicate events and sets sk to the safekeeper which has * an event. - * + * * On timeout, events is set to WL_NO_EVENTS. On socket event, events is * set to WL_SOCKET_READABLE and/or WL_SOCKET_WRITEABLE. When socket is * closed, events is set to WL_SOCKET_READABLE. - * - * WL_SOCKET_WRITEABLE is usually set only when we need to flush the buffer. - * It can be returned only if caller asked for this event in the last *_event_set call. + * + * WL_SOCKET_WRITEABLE is usually set only when we need to flush the + * buffer. It can be returned only if caller asked for this event in the + * last *_event_set call. */ int (*wait_event_set) (WalProposer *wp, long timeout, Safekeeper **sk, uint32 *events); @@ -571,9 +580,9 @@ typedef struct walproposer_api void (*finish_sync_safekeepers) (WalProposer *wp, XLogRecPtr lsn); /* - * Called after every AppendResponse from the safekeeper. Used to propagate - * backpressure feedback and to confirm WAL persistence (has been commited - * on the quorum of safekeepers). + * Called after every AppendResponse from the safekeeper. Used to + * propagate backpressure feedback and to confirm WAL persistence (has + * been commited on the quorum of safekeepers). */ void (*process_safekeeper_feedback) (WalProposer *wp, Safekeeper *sk); @@ -716,12 +725,14 @@ extern void WalProposerBroadcast(WalProposer *wp, XLogRecPtr startpos, XLogRecPt extern void WalProposerPoll(WalProposer *wp); extern void WalProposerFree(WalProposer *wp); +extern WalproposerShmemState *GetWalpropShmemState(); + /* * WaitEventSet API doesn't allow to remove socket, so walproposer_pg uses it to * recreate set from scratch, hence the export. 
*/ extern void SafekeeperStateDesiredEvents(Safekeeper *sk, uint32 *sk_events, uint32 *nwr_events); -extern Safekeeper *GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn); +extern TimeLineID walprop_pg_get_timeline_id(void); #define WPEVENT 1337 /* special log level for walproposer internal diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 7debb6325e..e5ef93b456 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -85,7 +85,6 @@ static void walprop_pg_init_standalone_sync_safekeepers(void); static void walprop_pg_init_walsender(void); static void walprop_pg_init_bgworker(void); static TimestampTz walprop_pg_get_current_timestamp(WalProposer *wp); -static TimeLineID walprop_pg_get_timeline_id(void); static void walprop_pg_load_libpqwalreceiver(void); static process_interrupts_callback_t PrevProcessInterruptsCallback; @@ -94,6 +93,8 @@ static shmem_startup_hook_type prev_shmem_startup_hook_type; static shmem_request_hook_type prev_shmem_request_hook = NULL; static void walproposer_shmem_request(void); #endif +static void WalproposerShmemInit_SyncSafekeeper(void); + static void StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd); static void WalSndLoop(WalProposer *wp); @@ -136,6 +137,7 @@ WalProposerSync(int argc, char *argv[]) WalProposer *wp; init_walprop_config(true); + WalproposerShmemInit_SyncSafekeeper(); walprop_pg_init_standalone_sync_safekeepers(); walprop_pg_load_libpqwalreceiver(); @@ -281,6 +283,8 @@ WalproposerShmemInit(void) { memset(walprop_shared, 0, WalproposerShmemSize()); SpinLockInit(&walprop_shared->mutex); + pg_atomic_init_u64(&walprop_shared->propEpochStartLsn, 0); + pg_atomic_init_u64(&walprop_shared->mineLastElectedTerm, 0); pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0); pg_atomic_init_u64(&walprop_shared->currentClusterSize, 0); } @@ -289,6 +293,17 @@ WalproposerShmemInit(void) return found; } +static void +WalproposerShmemInit_SyncSafekeeper(void) +{ + walprop_shared = palloc(WalproposerShmemSize()); + memset(walprop_shared, 0, WalproposerShmemSize()); + SpinLockInit(&walprop_shared->mutex); + pg_atomic_init_u64(&walprop_shared->propEpochStartLsn, 0); + pg_atomic_init_u64(&walprop_shared->mineLastElectedTerm, 0); + pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0); +} + #define BACK_PRESSURE_DELAY 10000L // 0.01 sec static bool @@ -399,6 +414,13 @@ nwp_shmem_startup_hook(void) WalproposerShmemInit(); } +WalproposerShmemState * +GetWalpropShmemState() +{ + Assert(walprop_shared != NULL); + return walprop_shared; +} + static WalproposerShmemState * walprop_pg_get_shmem_state(WalProposer *wp) { @@ -431,14 +453,15 @@ record_pageserver_feedback(PageserverFeedback *ps_feedback) for (int i = 0; i < walprop_shared->num_shards; i++) { PageserverFeedback *feedback = &walprop_shared->shard_ps_feedback[i]; + if (feedback->present) { if (min_feedback.last_received_lsn == InvalidXLogRecPtr || feedback->last_received_lsn < min_feedback.last_received_lsn) min_feedback.last_received_lsn = feedback->last_received_lsn; - + if (min_feedback.disk_consistent_lsn == InvalidXLogRecPtr || feedback->disk_consistent_lsn < min_feedback.disk_consistent_lsn) min_feedback.disk_consistent_lsn = feedback->disk_consistent_lsn; - + if (min_feedback.remote_consistent_lsn == InvalidXLogRecPtr || feedback->remote_consistent_lsn < min_feedback.remote_consistent_lsn) min_feedback.remote_consistent_lsn = feedback->remote_consistent_lsn; } @@ -551,6 +574,7 @@ static void walprop_sigusr2(SIGNAL_ARGS) { int 
save_errno = errno; + got_SIGUSR2 = true; SetLatch(MyLatch); errno = save_errno; @@ -598,7 +622,7 @@ walprop_pg_get_current_timestamp(WalProposer *wp) return GetCurrentTimestamp(); } -static TimeLineID +TimeLineID walprop_pg_get_timeline_id(void) { #if PG_VERSION_NUM >= 150000 @@ -617,6 +641,20 @@ walprop_pg_load_libpqwalreceiver(void) wpg_log(ERROR, "libpqwalreceiver didn't initialize correctly"); } +static void +walprop_pg_update_donor(WalProposer *wp, Safekeeper *donor, XLogRecPtr donor_lsn) +{ + WalproposerShmemState *wps = wp->api.get_shmem_state(wp); + char donor_name[64]; + + pg_snprintf(donor_name, sizeof(donor_name), "%s:%s", donor->host, donor->port); + SpinLockAcquire(&wps->mutex); + memcpy(wps->donor_name, donor_name, sizeof(donor_name)); + memcpy(wps->donor_conninfo, donor->conninfo, sizeof(donor->conninfo)); + wps->donor_lsn = donor_lsn; + SpinLockRelease(&wps->mutex); +} + /* Helper function */ static bool ensure_nonblocking_status(WalProposerConn *conn, bool is_nonblocking) @@ -717,7 +755,6 @@ walprop_connect_start(Safekeeper *sk) { Assert(sk->conn == NULL); sk->conn = libpqwp_connect_start(sk->conninfo); - } static WalProposerConnectPollStatusType @@ -1091,7 +1128,7 @@ static void StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd) { XLogRecPtr FlushPtr; - __attribute__((unused)) TimeLineID currTLI; + __attribute__((unused)) TimeLineID currTLI; #if PG_VERSION_NUM < 150000 if (ThisTimeLineID == 0) @@ -1295,116 +1332,13 @@ XLogBroadcastWalProposer(WalProposer *wp) } } -/* Download WAL before basebackup for logical walsenders from sk, if needed */ +/* + Used to download WAL before basebackup for logical walsenders from sk, no longer + needed because walsender always uses neon_walreader. + */ static bool WalProposerRecovery(WalProposer *wp, Safekeeper *sk) { - char *err; - WalReceiverConn *wrconn; - WalRcvStreamOptions options; - char conninfo[MAXCONNINFO]; - TimeLineID timeline; - XLogRecPtr startpos; - XLogRecPtr endpos; - - startpos = GetLogRepRestartLSN(wp); - if (startpos == InvalidXLogRecPtr) - return true; /* recovery not needed */ - endpos = wp->propEpochStartLsn; - - timeline = wp->greetRequest.timeline; - - if (!neon_auth_token) - { - memcpy(conninfo, sk->conninfo, MAXCONNINFO); - } - else - { - int written = 0; - - written = snprintf((char *) conninfo, MAXCONNINFO, "password=%s %s", neon_auth_token, sk->conninfo); - if (written > MAXCONNINFO || written < 0) - wpg_log(FATAL, "could not append password to the safekeeper connection string"); - } - -#if PG_MAJORVERSION_NUM < 16 - wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err); -#else - wrconn = walrcv_connect(conninfo, false, false, "wal_proposer_recovery", &err); -#endif - - if (!wrconn) - { - ereport(WARNING, - (errmsg("could not connect to WAL acceptor %s:%s: %s", - sk->host, sk->port, - err))); - return false; - } - wpg_log(LOG, - "start recovery for logical replication from %s:%s starting from %X/%08X till %X/%08X timeline " - "%d", - sk->host, sk->port, (uint32) (startpos >> 32), - (uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline); - - options.logical = false; - options.startpoint = startpos; - options.slotname = NULL; - options.proto.physical.startpointTLI = timeline; - - if (walrcv_startstreaming(wrconn, &options)) - { - XLogRecPtr rec_start_lsn; - XLogRecPtr rec_end_lsn = 0; - int len; - char *buf; - pgsocket wait_fd = PGINVALID_SOCKET; - - while ((len = walrcv_receive(wrconn, &buf, &wait_fd)) >= 0) - { - if (len == 0) - { - (void) WaitLatchOrSocket( 
- MyLatch, WL_EXIT_ON_PM_DEATH | WL_SOCKET_READABLE, wait_fd, - -1, WAIT_EVENT_WAL_RECEIVER_MAIN); - } - else - { - Assert(buf[0] == 'w' || buf[0] == 'k'); - if (buf[0] == 'k') - continue; /* keepalive */ - memcpy(&rec_start_lsn, &buf[XLOG_HDR_START_POS], - sizeof rec_start_lsn); - rec_start_lsn = pg_ntoh64(rec_start_lsn); - rec_end_lsn = rec_start_lsn + len - XLOG_HDR_SIZE; - - /* write WAL to disk */ - XLogWalPropWrite(sk->wp, &buf[XLOG_HDR_SIZE], len - XLOG_HDR_SIZE, rec_start_lsn); - - ereport(DEBUG1, - (errmsg("Recover message %X/%X length %d", - LSN_FORMAT_ARGS(rec_start_lsn), len))); - if (rec_end_lsn >= endpos) - break; - } - } - ereport(LOG, - (errmsg("end of replication stream at %X/%X: %m", - LSN_FORMAT_ARGS(rec_end_lsn)))); - walrcv_disconnect(wrconn); - - /* failed to receive all WAL till endpos */ - if (rec_end_lsn < endpos) - return false; - } - else - { - ereport(LOG, - (errmsg("primary server contains no more WAL on requested timeline %u LSN %X/%08X", - timeline, (uint32) (startpos >> 32), (uint32) startpos))); - return false; - } - return true; } @@ -1545,7 +1479,7 @@ walprop_pg_wal_reader_allocate(Safekeeper *sk) snprintf(log_prefix, sizeof(log_prefix), WP_LOG_PREFIX "sk %s:%s nwr: ", sk->host, sk->port); Assert(!sk->xlogreader); - sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propEpochStartLsn, sk->wp, log_prefix); + sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propEpochStartLsn, log_prefix); if (sk->xlogreader == NULL) wpg_log(FATAL, "failed to allocate xlog reader"); } @@ -1960,8 +1894,8 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp) static void walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk) { - HotStandbyFeedback hsFeedback; - bool needToAdvanceSlot = false; + HotStandbyFeedback hsFeedback; + bool needToAdvanceSlot = false; if (wp->config->syncSafekeepers) return; @@ -2095,22 +2029,25 @@ GetLogRepRestartLSN(WalProposer *wp) return lrRestartLsn; } -void SetNeonCurrentClusterSize(uint64 size) +void +SetNeonCurrentClusterSize(uint64 size) { pg_atomic_write_u64(&walprop_shared->currentClusterSize, size); } -uint64 GetNeonCurrentClusterSize(void) +uint64 +GetNeonCurrentClusterSize(void) { return pg_atomic_read_u64(&walprop_shared->currentClusterSize); } -uint64 GetNeonCurrentClusterSize(void); +uint64 GetNeonCurrentClusterSize(void); static const walproposer_api walprop_pg = { .get_shmem_state = walprop_pg_get_shmem_state, .start_streaming = walprop_pg_start_streaming, .get_flush_rec_ptr = walprop_pg_get_flush_rec_ptr, + .update_donor = walprop_pg_update_donor, .get_current_timestamp = walprop_pg_get_current_timestamp, .conn_error_message = walprop_error_message, .conn_status = walprop_status, diff --git a/pgxn/neon/walsender_hooks.c b/pgxn/neon/walsender_hooks.c new file mode 100644 index 0000000000..93dce9de84 --- /dev/null +++ b/pgxn/neon/walsender_hooks.c @@ -0,0 +1,172 @@ +/*------------------------------------------------------------------------- + * + * walsender_hooks.c + * + * Implements XLogReaderRoutine in terms of NeonWALReader. Allows for + * fetching WAL from safekeepers, which normal xlogreader can't do. 
+ * + *------------------------------------------------------------------------- + */ +#include "walsender_hooks.h" +#include "postgres.h" +#include "fmgr.h" +#include "access/xlogdefs.h" +#include "replication/walsender.h" +#include "access/xlog.h" +#include "access/xlog_internal.h" +#include "access/xlogreader.h" +#include "miscadmin.h" +#include "utils/wait_event.h" +#include "utils/guc.h" +#include "postmaster/interrupt.h" + +#include "neon_walreader.h" +#include "walproposer.h" + +static NeonWALReader *wal_reader = NULL; +extern XLogRecPtr WalSndWaitForWal(XLogRecPtr loc); +extern bool GetDonorShmem(XLogRecPtr *donor_lsn); + +static XLogRecPtr +NeonWALReadWaitForWAL(XLogRecPtr loc) +{ + while (!NeonWALReaderUpdateDonor(wal_reader)) + { + pg_usleep(1000); + CHECK_FOR_INTERRUPTS(); + } + + return WalSndWaitForWal(loc); +} + +static int +NeonWALPageRead( + XLogReaderState *xlogreader, + XLogRecPtr targetPagePtr, + int reqLen, + XLogRecPtr targetRecPtr, + char *readBuf) +{ + XLogRecPtr rem_lsn; + + /* Wait for flush pointer to advance past our request */ + XLogRecPtr flushptr = NeonWALReadWaitForWAL(targetPagePtr + reqLen); + int count; + + if (flushptr < targetPagePtr + reqLen) + return -1; + + /* Read at most XLOG_BLCKSZ bytes */ + if (targetPagePtr + XLOG_BLCKSZ <= flushptr) + count = XLOG_BLCKSZ; + else + count = flushptr - targetPagePtr; + + /* + * Sometimes walsender requests non-monotonic sequences of WAL. If that's + * the case, we have to reset streaming from remote at the correct + * position. For example, walsender may try to verify the segment header + * when trying to read in the middle of it. + */ + rem_lsn = NeonWALReaderGetRemLsn(wal_reader); + if (rem_lsn != InvalidXLogRecPtr && targetPagePtr != rem_lsn) + { + NeonWALReaderResetRemote(wal_reader); + } + + for (;;) + { + NeonWALReadResult res = NeonWALRead( + wal_reader, + readBuf, + targetPagePtr, + count, + walprop_pg_get_timeline_id()); + + if (res == NEON_WALREAD_SUCCESS) + { + /* + * Setting ws_tli is required by the XLogReaderRoutine, it is used + * for segment name generation in error reports. + * + * ReadPageInternal updates ws_segno after calling cb on its own + * and XLogReaderRoutine description doesn't require it, but + * WALRead sets, let's follow it. + */ + xlogreader->seg.ws_tli = NeonWALReaderGetSegment(wal_reader)->ws_tli; + xlogreader->seg.ws_segno = NeonWALReaderGetSegment(wal_reader)->ws_segno; + + /* + * ws_file doesn't exist in case of remote read, and isn't used by + * xlogreader except by WALRead on which we don't rely anyway. 
+ */ + return count; + } + if (res == NEON_WALREAD_ERROR) + { + elog(ERROR, "[walsender] Failed to read WAL (req_lsn=%X/%X, len=%d): %s", + LSN_FORMAT_ARGS(targetPagePtr), + reqLen, + NeonWALReaderErrMsg(wal_reader)); + return -1; + } + + /* + * Res is WOULDBLOCK, so we wait on the socket, recreating event set + * if necessary + */ + { + + pgsocket sock = NeonWALReaderSocket(wal_reader); + uint32_t reader_events = NeonWALReaderEvents(wal_reader); + long timeout_ms = 1000; + + ResetLatch(MyLatch); + CHECK_FOR_INTERRUPTS(); + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + } + + WaitLatchOrSocket( + MyLatch, + WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | reader_events, + sock, + timeout_ms, + WAIT_EVENT_WAL_SENDER_MAIN); + } + } +} + +static void +NeonWALReadSegmentOpen(XLogReaderState *xlogreader, XLogSegNo nextSegNo, TimeLineID *tli_p) +{ + neon_wal_segment_open(wal_reader, nextSegNo, tli_p); + xlogreader->seg.ws_file = NeonWALReaderGetSegment(wal_reader)->ws_file; +} + +static void +NeonWALReadSegmentClose(XLogReaderState *xlogreader) +{ + neon_wal_segment_close(wal_reader); + xlogreader->seg.ws_file = NeonWALReaderGetSegment(wal_reader)->ws_file; +} + +void +NeonOnDemandXLogReaderRoutines(XLogReaderRoutine *xlr) +{ + if (!wal_reader) + { + XLogRecPtr epochStartLsn = pg_atomic_read_u64(&GetWalpropShmemState()->propEpochStartLsn); + + if (epochStartLsn == 0) + { + elog(ERROR, "Unable to start walsender when propEpochStartLsn is 0!"); + } + wal_reader = NeonWALReaderAllocate(wal_segment_size, epochStartLsn, "[walsender] "); + } + xlr->page_read = NeonWALPageRead; + xlr->segment_open = NeonWALReadSegmentOpen; + xlr->segment_close = NeonWALReadSegmentClose; +} diff --git a/pgxn/neon/walsender_hooks.h b/pgxn/neon/walsender_hooks.h new file mode 100644 index 0000000000..2e3ce180f9 --- /dev/null +++ b/pgxn/neon/walsender_hooks.h @@ -0,0 +1,7 @@ +#ifndef __WALSENDER_HOOKS_H__ +#define __WALSENDER_HOOKS_H__ + +struct XLogReaderRoutine; +void NeonOnDemandXLogReaderRoutines(struct XLogReaderRoutine *xlr); + +#endif diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 7da5fd00b0..59a8c595ab 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -506,6 +506,8 @@ struct WalSender<'a, IO> { send_buf: [u8; MAX_SEND_SIZE], } +const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1); + impl WalSender<'_, IO> { /// Send WAL until /// - an error occurs @@ -584,14 +586,22 @@ impl WalSender<'_, IO> { async fn wait_wal(&mut self) -> Result<(), CopyStreamHandlerEnd> { loop { self.end_pos = self.end_watch.get(); - if self.end_pos > self.start_pos { - // We have something to send. + let have_something_to_send = (|| { + fail::fail_point!( + "sk-pause-send", + self.appname.as_deref() != Some("pageserver"), + |_| { false } + ); + self.end_pos > self.start_pos + })(); + + if have_something_to_send { trace!("got end_pos {:?}, streaming", self.end_pos); return Ok(()); } // Wait for WAL to appear, now self.end_pos == self.start_pos. - if let Some(lsn) = wait_for_lsn(&mut self.end_watch, self.term, self.start_pos).await? { + if let Some(lsn) = self.wait_for_lsn().await? { self.end_pos = lsn; trace!("got end_pos {:?}, streaming", self.end_pos); return Ok(()); @@ -628,6 +638,54 @@ impl WalSender<'_, IO> { .await?; } } + + /// Wait until we have available WAL > start_pos or timeout expires. 
Returns + /// - Ok(Some(end_pos)) if needed lsn is successfully observed; + /// - Ok(None) if timeout expired; + /// - Err in case of error -- only if 1) term changed while fetching in recovery + /// mode 2) watch channel closed, which must never happen. + async fn wait_for_lsn(&mut self) -> anyhow::Result> { + let fp = (|| { + fail::fail_point!( + "sk-pause-send", + self.appname.as_deref() != Some("pageserver"), + |_| { true } + ); + false + })(); + if fp { + tokio::time::sleep(POLL_STATE_TIMEOUT).await; + return Ok(None); + } + + let res = timeout(POLL_STATE_TIMEOUT, async move { + loop { + let end_pos = self.end_watch.get(); + if end_pos > self.start_pos { + return Ok(end_pos); + } + if let EndWatch::Flush(rx) = &self.end_watch { + let curr_term = rx.borrow().term; + if let Some(client_term) = self.term { + if curr_term != client_term { + bail!("term changed: requested {}, now {}", client_term, curr_term); + } + } + } + self.end_watch.changed().await?; + } + }) + .await; + + match res { + // success + Ok(Ok(commit_lsn)) => Ok(Some(commit_lsn)), + // error inside closure + Ok(Err(err)) => Err(err), + // timeout + Err(_) => Ok(None), + } + } } /// A half driving receiving replies. @@ -685,47 +743,6 @@ impl ReplyReader { } } -const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1); - -/// Wait until we have available WAL > start_pos or timeout expires. Returns -/// - Ok(Some(end_pos)) if needed lsn is successfully observed; -/// - Ok(None) if timeout expired; -/// - Err in case of error -- only if 1) term changed while fetching in recovery -/// mode 2) watch channel closed, which must never happen. -async fn wait_for_lsn( - rx: &mut EndWatch, - client_term: Option, - start_pos: Lsn, -) -> anyhow::Result> { - let res = timeout(POLL_STATE_TIMEOUT, async move { - loop { - let end_pos = rx.get(); - if end_pos > start_pos { - return Ok(end_pos); - } - if let EndWatch::Flush(rx) = rx { - let curr_term = rx.borrow().term; - if let Some(client_term) = client_term { - if curr_term != client_term { - bail!("term changed: requested {}, now {}", client_term, curr_term); - } - } - } - rx.changed().await?; - } - }) - .await; - - match res { - // success - Ok(Ok(commit_lsn)) => Ok(Some(commit_lsn)), - // error inside closure - Ok(Err(err)) => Err(err), - // timeout - Err(_) => Ok(None), - } -} - #[cfg(test)] mod tests { use utils::id::{TenantId, TimelineId}; diff --git a/safekeeper/tests/walproposer_sim/walproposer_api.rs b/safekeeper/tests/walproposer_sim/walproposer_api.rs index c49495a4f3..5578c94cf6 100644 --- a/safekeeper/tests/walproposer_sim/walproposer_api.rs +++ b/safekeeper/tests/walproposer_sim/walproposer_api.rs @@ -17,8 +17,7 @@ use utils::lsn::Lsn; use walproposer::{ api_bindings::Level, bindings::{ - pg_atomic_uint64, NeonWALReadResult, PageserverFeedback, SafekeeperStateDesiredEvents, - WL_SOCKET_READABLE, WL_SOCKET_WRITEABLE, + NeonWALReadResult, SafekeeperStateDesiredEvents, WL_SOCKET_READABLE, WL_SOCKET_WRITEABLE, }, walproposer::{ApiImpl, Config}, }; @@ -224,31 +223,13 @@ impl SimulationApi { }) .collect::>(); - let empty_feedback = PageserverFeedback { - present: false, - currentClusterSize: 0, - last_received_lsn: 0, - disk_consistent_lsn: 0, - remote_consistent_lsn: 0, - replytime: 0, - shard_number: 0, - }; - Self { os: args.os, safekeepers: RefCell::new(sk_conns), disk: args.disk, redo_start_lsn: args.redo_start_lsn, last_logged_commit_lsn: 0, - shmem: UnsafeCell::new(walproposer::bindings::WalproposerShmemState { - mutex: 0, - mineLastElectedTerm: 0, - 
backpressureThrottlingTime: pg_atomic_uint64 { value: 0 }, - currentClusterSize: pg_atomic_uint64 { value: 0 }, - shard_ps_feedback: [empty_feedback; 128], - num_shards: 0, - min_ps_feedback: empty_feedback, - }), + shmem: UnsafeCell::new(walproposer::api_bindings::empty_shmem()), config: args.config, event_set: RefCell::new(None), } @@ -274,6 +255,12 @@ impl ApiImpl for SimulationApi { self.os.now() as i64 * 1000 } + fn update_donor(&self, donor: &mut walproposer::bindings::Safekeeper, donor_lsn: u64) { + let mut shmem = unsafe { *self.get_shmem_state() }; + shmem.propEpochStartLsn.value = donor_lsn; + shmem.donor_conninfo = donor.conninfo; + } + fn conn_status( &self, _: &mut walproposer::bindings::Safekeeper, diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index 1bac528397..9b2abe608c 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -7,6 +7,7 @@ import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, + NeonEnvBuilder, logical_replication_sync, wait_for_last_flush_lsn, ) @@ -203,6 +204,81 @@ def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg): wait_until(number_of_iterations=10, interval=2, func=partial(slot_removed, endpoint)) +# Tests that walsender correctly blocks until WAL is downloaded from safekeepers +def test_lr_with_slow_safekeeper(neon_env_builder: NeonEnvBuilder, vanilla_pg): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + env.neon_cli.create_branch("init") + endpoint = env.endpoints.create_start("init") + + with endpoint.connect().cursor() as cur: + cur.execute("create table wal_generator (id serial primary key, data text)") + cur.execute( + """ +INSERT INTO wal_generator (data) +SELECT repeat('A', 1024) -- Generates a kilobyte of data per row +FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of data +""" + ) + cur.execute("create table t(a int)") + cur.execute("create publication pub for table t") + cur.execute("insert into t values (1)") + + vanilla_pg.start() + vanilla_pg.safe_psql("create table t(a int)") + connstr = endpoint.connstr().replace("'", "''") + vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub") + logical_replication_sync(vanilla_pg, endpoint) + vanilla_pg.stop() + + # Pause the safekeepers so that they can't send WAL (except to pageserver) + for sk in env.safekeepers: + sk_http = sk.http_client() + sk_http.configure_failpoints([("sk-pause-send", "return")]) + + # Insert a 2 + with endpoint.connect().cursor() as cur: + cur.execute("insert into t values (2)") + + endpoint.stop_and_destroy() + + # This new endpoint should contain [1, 2], but it can't access WAL from safekeeper + endpoint = env.endpoints.create_start("init") + with endpoint.connect().cursor() as cur: + cur.execute("select * from t") + res = [r[0] for r in cur.fetchall()] + assert res == [1, 2] + + # Reconnect subscriber + vanilla_pg.start() + connstr = endpoint.connstr().replace("'", "''") + vanilla_pg.safe_psql(f"alter subscription sub1 connection '{connstr}'") + + time.sleep(5) + # Make sure the 2 isn't replicated + assert [r[0] for r in vanilla_pg.safe_psql("select * from t")] == [1] + + # Re-enable WAL download + for sk in env.safekeepers: + sk_http = sk.http_client() + sk_http.configure_failpoints([("sk-pause-send", "off")]) + + logical_replication_sync(vanilla_pg, endpoint) + assert [r[0] for r in 
vanilla_pg.safe_psql("select * from t")] == [1, 2] + + # Check that local reads also work + with endpoint.connect().cursor() as cur: + cur.execute("insert into t values (3)") + logical_replication_sync(vanilla_pg, endpoint) + assert [r[0] for r in vanilla_pg.safe_psql("select * from t")] == [1, 2, 3] + + log_path = vanilla_pg.pgdatadir / "pg.log" + with open(log_path, "r") as log_file: + logs = log_file.read() + assert "could not receive data from WAL stream" not in logs + + # Test compute start at LSN page of which starts with contrecord # https://github.com/neondatabase/neon/issues/5749 def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg): From f1b654b77d5aa0f0898df9e372856ff3d2550d90 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Mon, 6 May 2024 21:03:25 +0200 Subject: [PATCH 0707/1571] proxy: reduce number of concurrent connections (#7620) ## Problem Usually, the connection itself is quite fast (bellow 10ms for p999: https://neonprod.grafana.net/goto/aOyn8vYIg?orgId=1). It doesn't make a lot of sense to wait for a lot of time for the lock, if it takes a lot of time to acquire it, probably, something goes wrong. We also spawn a lot of retries, but they are not super helpful (0 means that it was connected successfully, 1, most probably, that it was re-request of the compute node address https://neonprod.grafana.net/goto/J_8VQvLIR?orgId=1). Let's try to keep a small number of retries. --- proxy/src/config.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 0c8e284d0b..e090407756 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -536,9 +536,9 @@ pub struct RetryConfig { impl RetryConfig { /// Default options for RetryConfig. - /// Total delay for 8 retries with 100ms base delay and 1.6 backoff factor is about 7s. + /// Total delay for 5 retries with 200ms base delay and 2 backoff factor is about 6s. pub const CONNECT_TO_COMPUTE_DEFAULT_VALUES: &'static str = - "num_retries=8,base_retry_wait_duration=100ms,retry_wait_exponent_base=1.6"; + "num_retries=5,base_retry_wait_duration=200ms,retry_wait_exponent_base=2"; /// Total delay for 8 retries with 100ms base delay and 1.6 backoff factor is about 7s. /// Cplane has timeout of 60s on each request. 8m7s in total. pub const WAKE_COMPUTE_DEFAULT_VALUES: &'static str = @@ -592,7 +592,7 @@ impl ConcurrencyLockOptions { pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "permits=0"; /// Default options for [`crate::console::provider::ApiLocks`]. 
pub const DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK: &'static str = - "shards=64,permits=50,epoch=10m,timeout=500ms"; + "shards=64,permits=10,epoch=10m,timeout=10ms"; // pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "shards=32,permits=4,epoch=10m,timeout=1s"; From ac7dc821034abe15a287a79f4ab773d02d617977 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 6 May 2024 22:31:26 +0200 Subject: [PATCH 0708/1571] use less `neon_local --pageserver-config-override` / `pageserver -c` (#7613) --- control_plane/src/pageserver.rs | 38 ++++++------- pageserver/src/bin/pageserver.rs | 1 + test_runner/README.md | 3 - test_runner/fixtures/neon_fixtures.py | 64 ++++++++-------------- test_runner/regress/test_remote_storage.py | 3 - 5 files changed, 39 insertions(+), 70 deletions(-) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index c0a366e3b9..45be14ef95 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -4,7 +4,6 @@ //! //! .neon/ //! -use std::borrow::Cow; use std::collections::HashMap; use std::io; @@ -219,11 +218,18 @@ impl PageServerNode { let datadir_path_str = datadir.to_str().with_context(|| { format!("Cannot start pageserver node {node_id} in path that has no string representation: {datadir:?}") })?; - let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str); - args.push(Cow::Borrowed("--init")); + // `pageserver --init` merges the `--config-override`s into a built-in default config, + // then writes out the merged product to `pageserver.toml`. + // TODO: just write the full `pageserver.toml` and get rid of `--config-override`. + let mut args = vec!["--init", "--workdir", datadir_path_str]; + let overrides = self.neon_local_overrides(config_overrides); + for piece in &overrides { + args.push("--config-override"); + args.push(piece); + } let init_output = Command::new(self.env.pageserver_bin()) - .args(args.iter().map(Cow::as_ref)) + .args(args) .envs(self.pageserver_env_variables()?) .output() .with_context(|| format!("Failed to run pageserver init for node {node_id}"))?; @@ -279,12 +285,16 @@ impl PageServerNode { self.conf.id, datadir, ) })?; - let args = self.pageserver_basic_args(config_overrides, datadir_path_str); + let mut args = vec!["-D", datadir_path_str]; + for config_override in config_overrides { + args.push("--config-override"); + args.push(*config_override); + } background_process::start_process( "pageserver", &datadir, &self.env.pageserver_bin(), - args.iter().map(Cow::as_ref), + args, self.pageserver_env_variables()?, background_process::InitialPidFile::Expect(self.pid_file()), || async { @@ -301,22 +311,6 @@ impl PageServerNode { Ok(()) } - fn pageserver_basic_args<'a>( - &self, - config_overrides: &'a [&'a str], - datadir_path_str: &'a str, - ) -> Vec> { - let mut args = vec![Cow::Borrowed("-D"), Cow::Borrowed(datadir_path_str)]; - - let overrides = self.neon_local_overrides(config_overrides); - for config_override in overrides { - args.push(Cow::Borrowed("-c")); - args.push(Cow::Owned(config_override)); - } - - args - } - fn pageserver_env_variables(&self) -> anyhow::Result> { // FIXME: why is this tied to pageserver's auth type? 
Whether or not the safekeeper // needs a token, and how to generate that token, seems independent to whether diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index e9433de05b..eb4b8bb8bb 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -755,6 +755,7 @@ fn cli() -> Command { // See `settings.md` for more details on the extra configuration patameters pageserver can process .arg( Arg::new("config-override") + .long("config-override") .short('c') .num_args(1) .action(ArgAction::Append) diff --git a/test_runner/README.md b/test_runner/README.md index 96e74659ce..051897744a 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -76,13 +76,10 @@ you can use `--pg-version` argument. `TEST_OUTPUT`: Set the directory where test state and test output files should go. `TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests. -`NEON_PAGESERVER_OVERRIDES`: add a `;`-separated set of configs that will be passed as `RUST_LOG`: logging configuration to pass into Neon CLI Useful parameters and commands: -`--pageserver-config-override=${value}` `-c` values to pass into pageserver through neon_local cli - `--preserve-database-files` to preserve pageserver (layer) and safekeer (segment) timeline files on disk after running a test suite. Such files might be large, so removed by default; but might be useful for debugging or creation of svg images with layer file contents. diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 240b6ee199..1552e7e48a 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -450,6 +450,7 @@ class NeonEnvBuilder: test_output_dir: Path, test_overlay_dir: Optional[Path] = None, pageserver_remote_storage: Optional[RemoteStorage] = None, + # toml that will be decomposed into `--config-override` flags during `pageserver --init` pageserver_config_override: Optional[str] = None, num_safekeepers: int = 1, num_pageservers: int = 1, @@ -1021,7 +1022,6 @@ class NeonEnv: self.neon_local_binpath = config.neon_binpath self.pg_distrib_dir = config.pg_distrib_dir self.endpoint_counter = 0 - self.pageserver_config_override = config.pageserver_config_override self.storage_controller_config = config.storage_controller_config # generate initial tenant ID here instead of letting 'neon init' generate it, @@ -1131,7 +1131,11 @@ class NeonEnv: cfg["safekeepers"].append(sk_cfg) log.info(f"Config: {cfg}") - self.neon_cli.init(cfg, force=config.config_init_force) + self.neon_cli.init( + cfg, + force=config.config_init_force, + pageserver_config_override=config.pageserver_config_override, + ) def start(self): # Storage controller starts first, so that pageserver /re-attach calls don't @@ -1703,6 +1707,7 @@ class NeonCli(AbstractNeonCli): self, config: Dict[str, Any], force: Optional[str] = None, + pageserver_config_override: Optional[str] = None, ) -> "subprocess.CompletedProcess[str]": with tempfile.NamedTemporaryFile(mode="w+") as tmp: tmp.write(toml.dumps(config)) @@ -1713,17 +1718,24 @@ class NeonCli(AbstractNeonCli): if force is not None: cmd.extend(["--force", force]) - storage = self.env.pageserver_remote_storage + remote_storage = self.env.pageserver_remote_storage - append_pageserver_param_overrides( - params_to_update=cmd, - remote_storage=storage, - pageserver_config_override=self.env.pageserver_config_override, - ) + if remote_storage is not None: + remote_storage_toml_table = 
remote_storage_to_toml_inline_table(remote_storage) + + cmd.append( + f"--pageserver-config-override=remote_storage={remote_storage_toml_table}" + ) + + if pageserver_config_override is not None: + cmd += [ + f"--pageserver-config-override={o.strip()}" + for o in pageserver_config_override.split(";") + ] s3_env_vars = None - if isinstance(storage, S3Storage): - s3_env_vars = storage.access_env_vars() + if isinstance(remote_storage, S3Storage): + s3_env_vars = remote_storage.access_env_vars() res = self.raw_cli(cmd, extra_env_vars=s3_env_vars) res.check_returncode() return res @@ -1746,11 +1758,6 @@ class NeonCli(AbstractNeonCli): ) -> "subprocess.CompletedProcess[str]": start_args = ["pageserver", "start", f"--id={id}", *overrides] storage = self.env.pageserver_remote_storage - append_pageserver_param_overrides( - params_to_update=start_args, - remote_storage=storage, - pageserver_config_override=self.env.pageserver_config_override, - ) if isinstance(storage, S3Storage): s3_env_vars = storage.access_env_vars() @@ -2591,33 +2598,6 @@ class NeonPageserver(PgProtocol, LogUtils): ) -def append_pageserver_param_overrides( - params_to_update: List[str], - remote_storage: Optional[RemoteStorage], - pageserver_config_override: Optional[str] = None, -): - if remote_storage is not None: - remote_storage_toml_table = remote_storage_to_toml_inline_table(remote_storage) - - params_to_update.append( - f"--pageserver-config-override=remote_storage={remote_storage_toml_table}" - ) - else: - params_to_update.append('--pageserver-config-override=remote_storage=""') - - env_overrides = os.getenv("NEON_PAGESERVER_OVERRIDES") - if env_overrides is not None: - params_to_update += [ - f"--pageserver-config-override={o.strip()}" for o in env_overrides.split(";") - ] - - if pageserver_config_override is not None: - params_to_update += [ - f"--pageserver-config-override={o.strip()}" - for o in pageserver_config_override.split(";") - ] - - class PgBin: """A helper class for executing postgres binaries""" diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 47200a856e..ad4b4a42f1 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -1,6 +1,3 @@ -# It's possible to run any regular test with the local fs remote storage via -# env NEON_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ...... - import os import queue import shutil From af849a1f6127c72e671ac0bcf76f17977db474bb Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 7 May 2024 11:15:58 +0100 Subject: [PATCH 0709/1571] pageserver: post-shard-split layer trimming (1/2) (#7572) ## Problem After a shard split of a large existing tenant, child tenants can end up with oversized historic layers indefinitely, if those layers are prevented from being GC'd by branchpoints. This PR is followed by https://github.com/neondatabase/neon/pull/7531 Related issue: https://github.com/neondatabase/neon/issues/7504 ## Summary of changes - Add a new compaction phase `compact_shard_ancestors`, which identifies layers that are no longer needed after a shard split. - Add a Timeline->LayerMap code path called `rewrite_layers` , which is currently only used to drop layers, but will later be used to rewrite them as well in https://github.com/neondatabase/neon/pull/7531 - Add a new test that compacts after a split, and checks that something is deleted. 
Note that this doesn't have much impact on a tenant's resident size (since unused layers would end up evicted anyway), but it: - Makes index_part.json much smaller - Makes the system easier to reason about: avoid having tenants which are like "my physical size is 4TiB but don't worry I'll never actually download it", instead have tenants report the real physical size of what they might download. Why do we remove these layers in compaction rather than during the split? Because we have existing split tenants that need cleaning up. We can add it to the split operation in future as an optimization. --- libs/pageserver_api/src/keyspace.rs | 2 +- pageserver/src/tenant/timeline.rs | 18 +++ pageserver/src/tenant/timeline/compaction.rs | 147 +++++++++++++++++- .../src/tenant/timeline/layer_manager.rs | 18 +++ test_runner/regress/test_sharding.py | 61 ++++++++ 5 files changed, 243 insertions(+), 3 deletions(-) diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index a9ad3aca18..c0c4710a00 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -240,7 +240,7 @@ impl<'a> ShardedRange<'a> { /// pages that would not actually be stored on this node. /// /// Don't use this function in code that works with physical entities like layer files. - fn raw_size(range: &Range) -> u32 { + pub fn raw_size(range: &Range) -> u32 { if is_contiguous_range(range) { contiguous_range_len(range) } else { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 3748036e4f..2a39f05106 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4434,6 +4434,24 @@ impl Timeline { Ok(()) } + async fn rewrite_layers( + self: &Arc, + replace_layers: Vec<(Layer, ResidentLayer)>, + drop_layers: Vec, + ) -> anyhow::Result<()> { + let mut guard = self.layers.write().await; + + guard.rewrite_layers(&replace_layers, &drop_layers, &self.metrics); + + let upload_layers: Vec<_> = replace_layers.into_iter().map(|r| r.1).collect(); + + if let Some(remote_client) = self.remote_client.as_ref() { + remote_client.schedule_compaction_update(&drop_layers, &upload_layers)?; + } + + Ok(()) + } + /// Schedules the uploads of the given image layers fn upload_new_image_layers( self: &Arc, diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 1088101a13..e83878b8fb 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -15,7 +15,8 @@ use anyhow::{anyhow, Context}; use enumset::EnumSet; use fail::fail_point; use itertools::Itertools; -use pageserver_api::shard::{ShardIdentity, TenantShardId}; +use pageserver_api::keyspace::ShardedRange; +use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId}; use tokio_util::sync::CancellationToken; use tracing::{debug, info, info_span, trace, warn, Instrument}; use utils::id::TimelineId; @@ -93,7 +94,7 @@ impl Timeline { // Define partitioning schema if needed // FIXME: the match should only cover repartitioning, not the next steps - match self + let partition_count = match self .repartition( self.get_last_record_lsn(), self.get_compaction_target_size(), @@ -146,6 +147,7 @@ impl Timeline { assert!(sparse_layers.is_empty()); self.upload_new_image_layers(dense_layers)?; + dense_partitioning.parts.len() } Err(err) => { // no partitioning? 
This is normal, if the timeline was just created @@ -157,9 +159,150 @@ impl Timeline { if !self.cancel.is_cancelled() { tracing::error!("could not compact, repartitioning keyspace failed: {err:?}"); } + 1 } }; + if self.shard_identity.count >= ShardCount::new(2) { + // Limit the number of layer rewrites to the number of partitions: this means its + // runtime should be comparable to a full round of image layer creations, rather than + // being potentially much longer. + let rewrite_max = partition_count; + + self.compact_shard_ancestors(rewrite_max, ctx).await?; + } + + Ok(()) + } + + /// Check for layers that are eligible to be rewritten: + /// - Shard splitting: After a shard split, ancestor layers beyond pitr_interval, so that + /// we don't indefinitely retain keys in this shard that aren't needed. + /// - For future use: layers beyond pitr_interval that are in formats we would + /// rather not maintain compatibility with indefinitely. + /// + /// Note: this phase may read and write many gigabytes of data: use rewrite_max to bound + /// how much work it will try to do in each compaction pass. + async fn compact_shard_ancestors( + self: &Arc, + rewrite_max: usize, + _ctx: &RequestContext, + ) -> anyhow::Result<()> { + let mut drop_layers = Vec::new(); + let layers_to_rewrite: Vec = Vec::new(); + + // We will use the PITR cutoff as a condition for rewriting layers. + let pitr_cutoff = self.gc_info.read().unwrap().cutoffs.pitr; + + let layers = self.layers.read().await; + for layer_desc in layers.layer_map().iter_historic_layers() { + let layer = layers.get_from_desc(&layer_desc); + if layer.metadata().shard.shard_count == self.shard_identity.count { + // This layer does not belong to a historic ancestor, no need to re-image it. + continue; + } + + // This layer was created on an ancestor shard: check if it contains any data for this shard. + let sharded_range = ShardedRange::new(layer_desc.get_key_range(), &self.shard_identity); + let layer_local_page_count = sharded_range.page_count(); + let layer_raw_page_count = ShardedRange::raw_size(&layer_desc.get_key_range()); + if layer_local_page_count == 0 { + // This ancestral layer only covers keys that belong to other shards. + // We include the full metadata in the log: if we had some critical bug that caused + // us to incorrectly drop layers, this would simplify manually debugging + reinstating those layers. + info!(%layer, old_metadata=?layer.metadata(), + "dropping layer after shard split, contains no keys for this shard.", + ); + + if cfg!(debug_assertions) { + // Expensive, exhaustive check of keys in this layer: this guards against ShardedRange's calculations being + // wrong.
If ShardedRange claims the local page count is zero, then no keys in this layer + // should be !is_key_disposable() + let range = layer_desc.get_key_range(); + let mut key = range.start; + while key < range.end { + debug_assert!(self.shard_identity.is_key_disposable(&key)); + key = key.next(); + } + } + + drop_layers.push(layer); + continue; + } else if layer_local_page_count != u32::MAX + && layer_local_page_count == layer_raw_page_count + { + debug!(%layer, + "layer is entirely shard local ({} keys), no need to filter it", + layer_local_page_count + ); + continue; + } + + // Don't bother re-writing a layer unless it will at least halve its size + if layer_local_page_count != u32::MAX + && layer_local_page_count > layer_raw_page_count / 2 + { + debug!(%layer, + "layer is already mostly local ({}/{}), not rewriting", + layer_local_page_count, + layer_raw_page_count + ); + } + + // Don't bother re-writing a layer if it is within the PITR window: it will age-out eventually + // without incurring the I/O cost of a rewrite. + if layer_desc.get_lsn_range().end >= pitr_cutoff { + debug!(%layer, "Skipping rewrite of layer still in PITR window ({} >= {})", + layer_desc.get_lsn_range().end, pitr_cutoff); + continue; + } + + if layer_desc.is_delta() { + // We do not yet implement rewrite of delta layers + debug!(%layer, "Skipping rewrite of delta layer"); + continue; + } + + // Only rewrite layers if they would have different remote paths: either they belong to this + // shard but an old generation, or they belonged to another shard. This also implicitly + // guarantees that the layer is persistent in remote storage (as only remote persistent + // layers are carried across shard splits, any local-only layer would be in the current generation) + if layer.metadata().generation == self.generation + && layer.metadata().shard.shard_count == self.shard_identity.count + { + debug!(%layer, "Skipping rewrite, is not from old generation"); + continue; + } + + if layers_to_rewrite.len() >= rewrite_max { + tracing::info!(%layer, "Will rewrite layer on a future compaction, already rewrote {}", + layers_to_rewrite.len() + ); + continue; + } + + // Fall through: all our conditions for doing a rewrite passed. + // TODO: implement rewriting + tracing::debug!(%layer, "Would rewrite layer"); + } + + // Drop the layers read lock: we will acquire it for write in [`Self::rewrite_layers`] + drop(layers); + + // TODO: collect layers to rewrite + let replace_layers = Vec::new(); + + // Update the LayerMap so that readers will use the new layers, and enqueue it for writing to remote storage + self.rewrite_layers(replace_layers, drop_layers).await?; + + if let Some(remote_client) = self.remote_client.as_ref() { + // We wait for all uploads to complete before finishing this compaction stage. This is not + // necessary for correctness, but it simplifies testing, and avoids proceeding with another + // Timeline's compaction while this timeline's uploads may be generating lots of disk I/O + // load. + remote_client.wait_completion().await?; + } + Ok(()) } diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 64edcc5e40..8e8d64e0c6 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -205,6 +205,24 @@ impl LayerManager { updates.flush(); } + /// Called when compaction is completed. 
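// ---------------------------------------------------------------------------
// Editor's aside (not part of the patch): the per-layer decision made by
// `compact_shard_ancestors` above, reduced to a pure function over plain inputs
// so the control flow is easier to follow. It elides the "mostly local" logging
// branch and the `rewrite_max` cap; names and types here are illustrative.
#[derive(Debug, PartialEq)]
enum AncestorLayerAction {
    Drop,             // no keys for this shard: remove it from the layer map
    Keep,             // leave the layer alone
    RewriteCandidate, // would be rewritten (rewriting itself is still a TODO in this patch)
}

fn decide(
    is_ancestral: bool,       // layer written by an ancestor shard (different ShardCount)
    local_pages: u32,         // pages in the layer that belong to this shard (u32::MAX = unknown/huge)
    raw_pages: u32,           // total pages covered by the layer's key range
    within_pitr: bool,        // layer's end LSN is >= the PITR cutoff
    is_delta: bool,           // delta layer rewrite is not implemented yet
    same_shard_and_gen: bool, // remote path would not change, so no rewrite is needed
) -> AncestorLayerAction {
    use AncestorLayerAction::*;
    if !is_ancestral {
        return Keep;
    }
    if local_pages == 0 {
        return Drop;
    }
    if local_pages != u32::MAX && local_pages == raw_pages {
        return Keep; // already entirely shard-local
    }
    if within_pitr || is_delta || same_shard_and_gen {
        return Keep;
    }
    RewriteCandidate
}

fn main() {
    // An ancestor layer with no keys for this shard gets dropped.
    assert_eq!(decide(true, 0, 1000, false, false, false), AncestorLayerAction::Drop);
    // A layer still inside the PITR window is left to age out instead.
    assert_eq!(decide(true, 400, 1000, true, false, false), AncestorLayerAction::Keep);
}
// ---------------------------------------------------------------------------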
+ pub(crate) fn rewrite_layers( + &mut self, + rewrite_layers: &[(Layer, ResidentLayer)], + drop_layers: &[Layer], + _metrics: &TimelineMetrics, + ) { + let mut updates = self.layer_map.batch_update(); + + // TODO: implement rewrites (currently this code path only used for drops) + assert!(rewrite_layers.is_empty()); + + for l in drop_layers { + Self::delete_historic_layer(l, &mut updates, &mut self.layer_fmgr); + } + updates.flush(); + } + /// Called when garbage collect has selected the layers to be removed. pub(crate) fn finish_gc_timeline(&mut self, gc_layers: &[Layer]) { let mut updates = self.layer_map.batch_update(); diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 258377f8a2..d33803250f 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -177,6 +177,67 @@ def test_sharding_split_unsharded( env.storage_controller.consistency_check() +def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder): + """ + Test that after a split, we clean up parent layer data in the child shards via compaction. + """ + TENANT_CONF = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": f"{128 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{128 * 1024}", + # no PITR horizon, we specify the horizon when we request on-demand GC + "pitr_interval": "3600s", + # disable background compaction and GC. We invoke it manually when we want it to happen. + "gc_period": "0s", + "compaction_period": "0s", + # create image layers eagerly, so that GC can remove some layers + "image_creation_threshold": "1", + "image_layer_creation_check_threshold": "0", + } + + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + # Check that we created with an unsharded TenantShardId: this is the default, + # but check it in case we change the default in future + assert env.storage_controller.inspect(TenantShardId(tenant_id, 0, 0)) is not None + + workload = Workload(env, tenant_id, timeline_id, branch_name="main") + workload.init() + workload.write_rows(256) + workload.validate() + workload.stop() + + # Split one shard into two + shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=2) + + # Check we got the shard IDs we expected + assert env.storage_controller.inspect(TenantShardId(tenant_id, 0, 2)) is not None + assert env.storage_controller.inspect(TenantShardId(tenant_id, 1, 2)) is not None + + workload.validate() + workload.stop() + + env.storage_controller.consistency_check() + + # Cleanup part 1: while layers are still in PITR window, we should only drop layers that are fully redundant + for shard in shards: + ps = env.get_tenant_pageserver(shard) + + # Invoke compaction: this should drop any layers that don't overlap with the shard's key stripes + detail_before = ps.http_client().timeline_detail(shard, timeline_id) + ps.http_client().timeline_compact(shard, timeline_id) + detail_after = ps.http_client().timeline_detail(shard, timeline_id) + + # Physical size should shrink because some layers have been dropped + assert detail_after["current_physical_size"] < detail_before["current_physical_size"] + + # Compaction shouldn't make anything unreadable + workload.validate() + + def test_sharding_split_smoke( neon_env_builder: NeonEnvBuilder, ): From 3c9b484c4dbd52e286f17f3c6a5c6691990aa983 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: 
Tue, 7 May 2024 13:47:57 +0300 Subject: [PATCH 0710/1571] feat: Timeline detach ancestor (#7456) ## Problem Timelines cannot be deleted if they have children. In many production cases, a branch or a timeline has been created off the main branch for various reasons to the effect of having now a "new main" branch. This feature will make it possible to detach a timeline from its ancestor by inheriting all of the data before the branchpoint to the detached timeline and by also reparenting all of the ancestor's earlier branches to the detached timeline. ## Summary of changes - Earlier added copy_lsn_prefix functionality is used - RemoteTimelineClient learns to adopt layers by copying them from another timeline - LayerManager adds support for adding adopted layers - `timeline::Timeline::{prepare_to_detach,complete_detaching}_from_ancestor` and `timeline::detach_ancestor` are added - HTTP PUT handler Cc: #6994 Co-authored-by: Christian Schwarz --- libs/pageserver_api/src/models.rs | 1 + .../src/models/detach_ancestor.rs | 6 + pageserver/src/http/routes.rs | 73 +++ pageserver/src/repository.rs | 3 - pageserver/src/task_mgr.rs | 2 + pageserver/src/tenant.rs | 4 + pageserver/src/tenant/metadata.rs | 18 + pageserver/src/tenant/mgr.rs | 96 +++ .../src/tenant/remote_timeline_client.rs | 140 ++++- .../tenant/remote_timeline_client/upload.rs | 24 + .../src/tenant/storage_layer/delta_layer.rs | 22 +- pageserver/src/tenant/storage_layer/layer.rs | 16 +- pageserver/src/tenant/timeline.rs | 54 +- pageserver/src/tenant/timeline/delete.rs | 4 + .../src/tenant/timeline/detach_ancestor.rs | 550 ++++++++++++++++++ test_runner/fixtures/pageserver/http.py | 17 + .../regress/test_timeline_detach_ancestor.py | 410 +++++++++++++ 17 files changed, 1411 insertions(+), 29 deletions(-) create mode 100644 libs/pageserver_api/src/models/detach_ancestor.rs create mode 100644 pageserver/src/tenant/timeline/detach_ancestor.rs create mode 100644 test_runner/regress/test_timeline_detach_ancestor.py diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index a54cdb520d..37d968cebd 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -1,3 +1,4 @@ +pub mod detach_ancestor; pub mod partitioning; pub mod utilization; diff --git a/libs/pageserver_api/src/models/detach_ancestor.rs b/libs/pageserver_api/src/models/detach_ancestor.rs new file mode 100644 index 0000000000..fc1f10e734 --- /dev/null +++ b/libs/pageserver_api/src/models/detach_ancestor.rs @@ -0,0 +1,6 @@ +use utils::id::TimelineId; + +#[derive(Default, serde::Serialize)] +pub struct AncestorDetached { + pub reparented_timelines: Vec, +} diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index cf526940f4..ea4c7f1e3b 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1827,6 +1827,75 @@ async fn timeline_download_remote_layers_handler_get( json_response(StatusCode::OK, info) } +async fn timeline_detach_ancestor_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + use crate::tenant::timeline::detach_ancestor::Options; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + + let span = tracing::info_span!("detach_ancestor", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id); + + async move { + let mut options 
= Options::default(); + + let rewrite_concurrency = + parse_query_param::<_, std::num::NonZeroUsize>(&request, "rewrite_concurrency")?; + let copy_concurrency = + parse_query_param::<_, std::num::NonZeroUsize>(&request, "copy_concurrency")?; + + [ + (&mut options.rewrite_concurrency, rewrite_concurrency), + (&mut options.copy_concurrency, copy_concurrency), + ] + .into_iter() + .filter_map(|(target, val)| val.map(|val| (target, val))) + .for_each(|(target, val)| *target = val); + + let state = get_state(&request); + + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + + let ctx = RequestContext::new(TaskKind::DetachAncestor, DownloadBehavior::Download); + let ctx = &ctx; + + let timeline = tenant + .get_timeline(timeline_id, true) + .map_err(|e| ApiError::NotFound(e.into()))?; + + let (_guard, prepared) = timeline + .prepare_to_detach_from_ancestor(&tenant, options, ctx) + .await + .map_err(|e| ApiError::InternalServerError(e.into()))?; + + let res = state + .tenant_manager + .complete_detaching_timeline_ancestor(tenant_shard_id, timeline_id, prepared, ctx) + .await; + + match res { + Ok(reparented_timelines) => { + let resp = pageserver_api::models::detach_ancestor::AncestorDetached { + reparented_timelines, + }; + + json_response(StatusCode::OK, resp) + } + Err(e) => Err(ApiError::InternalServerError( + e.context("timeline detach completion"), + )), + } + } + .instrument(span) + .await +} + async fn deletion_queue_flush( r: Request, cancel: CancellationToken, @@ -2515,6 +2584,10 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_remote_layers", |r| api_handler(r, timeline_download_remote_layers_handler_get), ) + .put( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/detach_ancestor", + |r| api_handler(r, timeline_detach_ancestor_handler), + ) .delete("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| { api_handler(r, timeline_delete_handler) }) diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 0a9ac50aad..7b30c3ecf7 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -33,7 +33,6 @@ impl Value { } } -#[cfg(test)] #[derive(Debug, PartialEq)] pub(crate) enum InvalidInput { TooShortValue, @@ -42,10 +41,8 @@ pub(crate) enum InvalidInput { /// We could have a ValueRef where everything is `serde(borrow)`. Before implementing that, lets /// use this type for querying if a slice looks some particular way. -#[cfg(test)] pub(crate) struct ValueBytes; -#[cfg(test)] impl ValueBytes { pub(crate) fn will_init(raw: &[u8]) -> Result { if raw.len() < 12 { diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 01a8974494..5f46ce3d69 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -370,6 +370,8 @@ pub enum TaskKind { #[cfg(test)] UnitTest, + + DetachAncestor, } #[derive(Default)] diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 2d7a2e0f9d..1d483af278 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -322,6 +322,9 @@ pub struct Tenant { /// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance. pub(crate) timeline_get_throttle: Arc>, + + /// An ongoing timeline detach must be checked during attempts to GC or compact a timeline. 
+ ongoing_timeline_detach: std::sync::Mutex>, } impl std::fmt::Debug for Tenant { @@ -2557,6 +2560,7 @@ impl Tenant { &crate::metrics::tenant_throttling::TIMELINE_GET, )), tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)), + ongoing_timeline_detach: std::sync::Mutex::default(), } } diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 39da713479..8ba0775120 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -207,6 +207,24 @@ impl TimelineMetadata { self.body.ancestor_lsn } + /// When reparenting, the `ancestor_lsn` does not change. + pub fn reparent(&mut self, timeline: &TimelineId) { + assert!(self.body.ancestor_timeline.is_some()); + // no assertion for redoing this: it's fine, we may have to repeat this multiple times over + self.body.ancestor_timeline = Some(*timeline); + } + + pub fn detach_from_ancestor(&mut self, timeline: &TimelineId, ancestor_lsn: &Lsn) { + if let Some(ancestor) = self.body.ancestor_timeline { + assert_eq!(ancestor, *timeline); + } + if self.body.ancestor_lsn != Lsn(0) { + assert_eq!(self.body.ancestor_lsn, *ancestor_lsn); + } + self.body.ancestor_timeline = None; + self.body.ancestor_lsn = Lsn(0); + } + pub fn latest_gc_cutoff_lsn(&self) -> Lsn { self.body.latest_gc_cutoff_lsn } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 22173c6b5a..6be66e99ad 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -56,6 +56,7 @@ use utils::id::{TenantId, TimelineId}; use super::delete::DeleteTenantError; use super::secondary::SecondaryTenant; +use super::timeline::detach_ancestor::PreparedTimelineDetach; use super::TenantSharedResources; /// For a tenant that appears in TenantsMap, it may either be @@ -2007,6 +2008,101 @@ impl TenantManager { }) .collect()) } + + /// Completes an earlier prepared timeline detach ancestor. 
+ pub(crate) async fn complete_detaching_timeline_ancestor( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + prepared: PreparedTimelineDetach, + ctx: &RequestContext, + ) -> Result, anyhow::Error> { + struct RevertOnDropSlot(Option); + + impl Drop for RevertOnDropSlot { + fn drop(&mut self) { + if let Some(taken) = self.0.take() { + taken.revert(); + } + } + } + + impl RevertOnDropSlot { + fn into_inner(mut self) -> SlotGuard { + self.0.take().unwrap() + } + } + + impl std::ops::Deref for RevertOnDropSlot { + type Target = SlotGuard; + + fn deref(&self) -> &Self::Target { + self.0.as_ref().unwrap() + } + } + + let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?; + let slot_guard = RevertOnDropSlot(Some(slot_guard)); + + let tenant = { + let Some(old_slot) = slot_guard.get_old_value() else { + anyhow::bail!( + "Tenant not found when trying to complete detaching timeline ancestor" + ); + }; + + let Some(tenant) = old_slot.get_attached() else { + anyhow::bail!("Tenant is not in attached state"); + }; + + if !tenant.is_active() { + anyhow::bail!("Tenant is not active"); + } + + tenant.clone() + }; + + let timeline = tenant.get_timeline(timeline_id, true)?; + + let reparented = timeline + .complete_detaching_timeline_ancestor(&tenant, prepared, ctx) + .await?; + + let mut slot_guard = slot_guard.into_inner(); + + let (_guard, progress) = utils::completion::channel(); + match tenant.shutdown(progress, ShutdownMode::Hard).await { + Ok(()) => { + slot_guard.drop_old_value()?; + } + Err(_barrier) => { + slot_guard.revert(); + // this really should not happen, at all, unless shutdown was already going? + anyhow::bail!("Cannot restart Tenant, already shutting down"); + } + } + + let tenant_path = self.conf.tenant_path(&tenant_shard_id); + let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?; + + let shard_identity = config.shard; + let tenant = tenant_spawn( + self.conf, + tenant_shard_id, + &tenant_path, + self.resources.clone(), + AttachedTenantConf::try_from(config)?, + shard_identity, + None, + self.tenants, + SpawnMode::Eager, + ctx, + )?; + + slot_guard.upsert(TenantSlot::Attached(tenant))?; + + Ok(reparented) + } } #[derive(Debug, thiserror::Error)] diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index a54e93c96b..49dbac2f13 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -645,9 +645,61 @@ impl RemoteTimelineClient { self.launch_queued_tasks(upload_queue); } + pub(crate) async fn schedule_reparenting_and_wait( + self: &Arc, + new_parent: &TimelineId, + ) -> anyhow::Result<()> { + // FIXME: because of how Timeline::schedule_uploads works when called from layer flushing + // and reads the in-memory part we cannot do the detaching like this + let receiver = { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + + upload_queue.latest_metadata.reparent(new_parent); + + self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone()); + + self.schedule_barrier0(upload_queue) + }; + + Self::wait_completion0(receiver).await + } + + /// Schedules uploading a new version of `index_part.json` with the given layers added, + /// detaching from ancestor and waits for it to complete. /// - /// Launch an upload operation in the background. - /// + /// This is used with `Timeline::detach_ancestor` functionality. 
+ pub(crate) async fn schedule_adding_existing_layers_to_index_detach_and_wait( + self: &Arc, + layers: &[Layer], + adopted: (TimelineId, Lsn), + ) -> anyhow::Result<()> { + let barrier = { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + + upload_queue + .latest_metadata + .detach_from_ancestor(&adopted.0, &adopted.1); + + for layer in layers { + upload_queue + .latest_files + .insert(layer.layer_desc().filename(), layer.metadata()); + } + + self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone()); + + let barrier = self.schedule_barrier0(upload_queue); + self.launch_queued_tasks(upload_queue); + barrier + }; + + Self::wait_completion0(barrier).await + } + + /// Launch an upload operation in the background; the file is added to be included in next + /// `index_part.json` upload. pub(crate) fn schedule_layer_file_upload( self: &Arc, layer: ResidentLayer, @@ -673,9 +725,11 @@ impl RemoteTimelineClient { upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1; info!( - "scheduled layer file upload {layer} gen={:?} shard={:?}", - metadata.generation, metadata.shard + gen=?metadata.generation, + shard=?metadata.shard, + "scheduled layer file upload {layer}", ); + let op = UploadOp::UploadLayer(layer, metadata); self.metric_begin(&op); upload_queue.queued_operations.push_back(op); @@ -882,12 +936,18 @@ impl RemoteTimelineClient { /// Wait for all previously scheduled uploads/deletions to complete pub(crate) async fn wait_completion(self: &Arc) -> anyhow::Result<()> { - let mut receiver = { + let receiver = { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; self.schedule_barrier0(upload_queue) }; + Self::wait_completion0(receiver).await + } + + async fn wait_completion0( + mut receiver: tokio::sync::watch::Receiver<()>, + ) -> anyhow::Result<()> { if receiver.changed().await.is_err() { anyhow::bail!("wait_completion aborted because upload queue was stopped"); } @@ -1085,6 +1145,72 @@ impl RemoteTimelineClient { Ok(()) } + /// Uploads the given layer **without** adding it to be part of a future `index_part.json` upload. + /// + /// This is not normally needed. + pub(crate) async fn upload_layer_file( + self: &Arc, + uploaded: &ResidentLayer, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { + backoff::retry( + || async { + let m = uploaded.metadata(); + upload::upload_timeline_layer( + self.conf, + &self.storage_impl, + uploaded.local_path(), + &uploaded.metadata(), + m.generation, + cancel, + ) + .await + }, + TimeoutOrCancel::caused_by_cancel, + FAILED_UPLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + "upload a layer without adding it to latest files", + cancel, + ) + .await + .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) + .and_then(|x| x) + .context("upload a layer without adding it to latest files") + } + + /// Copies the `adopted` remote existing layer to the remote path of `adopted_as`. The layer is + /// not added to be part of a future `index_part.json` upload. 
+ pub(crate) async fn copy_timeline_layer( + self: &Arc, + adopted: &Layer, + adopted_as: &Layer, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { + backoff::retry( + || async { + upload::copy_timeline_layer( + self.conf, + &self.storage_impl, + adopted.local_path(), + &adopted.metadata(), + adopted_as.local_path(), + &adopted_as.metadata(), + cancel, + ) + .await + }, + TimeoutOrCancel::caused_by_cancel, + FAILED_UPLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + "copy timeline layer", + cancel, + ) + .await + .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) + .and_then(|x| x) + .context("remote copy timeline layer") + } + async fn flush_deletion_queue(&self) -> Result<(), DeletionQueueError> { match tokio::time::timeout( DELETION_QUEUE_FLUSH_TIMEOUT, @@ -1256,7 +1382,7 @@ impl RemoteTimelineClient { while let Some(next_op) = upload_queue.queued_operations.front() { // Can we run this task now? let can_run_now = match next_op { - UploadOp::UploadLayer(_, _) => { + UploadOp::UploadLayer(..) => { // Can always be scheduled. true } @@ -1822,7 +1948,7 @@ pub fn parse_remote_index_path(path: RemotePath) -> Option { /// That path includes in itself both tenant and timeline ids, allowing to have a unique remote storage path. /// /// Errors if the path provided does not start from pageserver's workdir. -pub fn remote_path( +pub(crate) fn remote_path( conf: &PageServerConf, local_path: &Utf8Path, generation: Generation, diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index 0227331953..a988369b6a 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -120,6 +120,30 @@ pub(super) async fn upload_timeline_layer<'a>( .with_context(|| format!("upload layer from local path '{source_path}'")) } +pub(super) async fn copy_timeline_layer( + conf: &'static PageServerConf, + storage: &GenericRemoteStorage, + source_path: &Utf8Path, + source_metadata: &LayerFileMetadata, + target_path: &Utf8Path, + target_metadata: &LayerFileMetadata, + cancel: &CancellationToken, +) -> anyhow::Result<()> { + fail_point!("before-copy-layer", |_| { + bail!("failpoint before-copy-layer") + }); + + pausable_failpoint!("before-copy-layer-pausable"); + + let source_path = remote_path(conf, source_path, source_metadata.generation)?; + let target_path = remote_path(conf, target_path, target_metadata.generation)?; + + storage + .copy_object(&source_path, &target_path, cancel) + .await + .with_context(|| format!("copy layer {source_path} to {target_path}")) +} + /// Uploads the given `initdb` data to the remote storage. pub(crate) async fn upload_initdb_dir( storage: &GenericRemoteStorage, diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index b5538dff3a..6fd96b0e2f 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -1139,15 +1139,15 @@ impl DeltaLayerInner { Ok(all_keys) } - /// Using the given writer, write out a truncated version, where LSNs higher than the - /// truncate_at are missing. - #[cfg(test)] + /// Using the given writer, write out a version which has the earlier Lsns than `until`. + /// + /// Return the amount of key value records pushed to the writer. 
pub(super) async fn copy_prefix( &self, writer: &mut DeltaLayerWriter, - truncate_at: Lsn, + until: Lsn, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> anyhow::Result { use crate::tenant::vectored_blob_io::{ BlobMeta, VectoredReadBuilder, VectoredReadExtended, }; @@ -1211,6 +1211,8 @@ impl DeltaLayerInner { // FIXME: buffering of DeltaLayerWriter let mut per_blob_copy = Vec::new(); + let mut records = 0; + while let Some(item) = stream.try_next().await? { tracing::debug!(?item, "popped"); let offset = item @@ -1229,7 +1231,7 @@ impl DeltaLayerInner { prev = Option::from(item); - let actionable = actionable.filter(|x| x.0.lsn < truncate_at); + let actionable = actionable.filter(|x| x.0.lsn < until); let builder = if let Some((meta, offsets)) = actionable { // extend or create a new builder @@ -1297,7 +1299,7 @@ impl DeltaLayerInner { let will_init = crate::repository::ValueBytes::will_init(data) .inspect_err(|_e| { #[cfg(feature = "testing")] - tracing::error!(data=?utils::Hex(data), err=?_e, "failed to parse will_init out of serialized value"); + tracing::error!(data=?utils::Hex(data), err=?_e, %key, %lsn, "failed to parse will_init out of serialized value"); }) .unwrap_or(false); @@ -1314,7 +1316,10 @@ impl DeltaLayerInner { ) .await; per_blob_copy = tmp; + res?; + + records += 1; } buffer = Some(res.buf); @@ -1326,7 +1331,7 @@ impl DeltaLayerInner { "with the sentinel above loop should had handled all" ); - Ok(()) + Ok(records) } pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> { @@ -1399,7 +1404,6 @@ impl DeltaLayerInner { Ok(()) } - #[cfg(test)] fn stream_index_forwards<'a, R>( &'a self, reader: &'a DiskBtreeReader, diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index ebc0cbf9a4..27faa507ca 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1797,25 +1797,23 @@ impl ResidentLayer { } } - /// FIXME: truncate is bad name because we are not truncating anything, but copying the - /// filtered parts. - #[cfg(test)] - pub(super) async fn copy_delta_prefix( + /// Returns the amount of keys and values written to the writer. + pub(crate) async fn copy_delta_prefix( &self, writer: &mut super::delta_layer::DeltaLayerWriter, - truncate_at: Lsn, + until: Lsn, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> anyhow::Result { use LayerKind::*; let owner = &self.owner.0; match self.downloaded.get(owner, ctx).await? 
{ Delta(ref d) => d - .copy_prefix(writer, truncate_at, ctx) + .copy_prefix(writer, until, ctx) .await - .with_context(|| format!("truncate {self}")), - Image(_) => anyhow::bail!(format!("cannot truncate image layer {self}")), + .with_context(|| format!("copy_delta_prefix until {until} of {self}")), + Image(_) => anyhow::bail!(format!("cannot copy_lsn_prefix of image layer {self}")), } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 2a39f05106..add6e3806e 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1,5 +1,6 @@ mod compaction; pub mod delete; +pub(crate) mod detach_ancestor; mod eviction_task; mod init; pub mod layer_manager; @@ -1494,6 +1495,12 @@ impl Timeline { /// Flush to disk all data that was written with the put_* functions #[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))] pub(crate) async fn freeze_and_flush(&self) -> anyhow::Result<()> { + self.freeze_and_flush0().await + } + + // This exists to provide a non-span creating version of `freeze_and_flush` we can call without + // polluting the span hierarchy. + pub(crate) async fn freeze_and_flush0(&self) -> anyhow::Result<()> { let to_lsn = self.freeze_inmem_layer(false).await; self.flush_frozen_layers_and_wait(to_lsn).await } @@ -3510,7 +3517,7 @@ impl Timeline { Ok(ancestor) } - fn get_ancestor_timeline(&self) -> anyhow::Result> { + pub(crate) fn get_ancestor_timeline(&self) -> anyhow::Result> { let ancestor = self.ancestor_timeline.as_ref().with_context(|| { format!( "Ancestor is missing. Timeline id: {} Ancestor id {:?}", @@ -4326,6 +4333,49 @@ impl Timeline { _ = self.cancel.cancelled() => {} ) } + + /// Detach this timeline from its ancestor by copying all of ancestors layers as this + /// Timelines layers up to the ancestor_lsn. + /// + /// Requires a timeline that: + /// - has an ancestor to detach from + /// - the ancestor does not have an ancestor -- follows from the original RFC limitations, not + /// a technical requirement + /// - has prev_lsn in remote storage (temporary restriction) + /// + /// After the operation has been started, it cannot be canceled. Upon restart it needs to be + /// polled again until completion. + /// + /// During the operation all timelines sharing the data with this timeline will be reparented + /// from our ancestor to be branches of this timeline. + pub(crate) async fn prepare_to_detach_from_ancestor( + self: &Arc, + tenant: &crate::tenant::Tenant, + options: detach_ancestor::Options, + ctx: &RequestContext, + ) -> Result< + ( + completion::Completion, + detach_ancestor::PreparedTimelineDetach, + ), + detach_ancestor::Error, + > { + detach_ancestor::prepare(self, tenant, options, ctx).await + } + + /// Completes the ancestor detach. This method is to be called while holding the + /// TenantManager's tenant slot, so during this method we cannot be deleted nor can any + /// timeline be deleted. After this method returns successfully, tenant must be reloaded. + /// + /// Pageserver receiving a SIGKILL during this operation is not supported (yet). + pub(crate) async fn complete_detaching_timeline_ancestor( + self: &Arc, + tenant: &crate::tenant::Tenant, + prepared: detach_ancestor::PreparedTimelineDetach, + ctx: &RequestContext, + ) -> Result, anyhow::Error> { + detach_ancestor::complete(self, tenant, prepared, ctx).await + } } /// Top-level failure to compact. 
@@ -4610,6 +4660,8 @@ impl Timeline { retain_lsns: Vec, new_gc_cutoff: Lsn, ) -> anyhow::Result { + // FIXME: if there is an ongoing detach_from_ancestor, we should just skip gc + let now = SystemTime::now(); let mut result: GcResult = GcResult::default(); diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index af10c1c84b..d8701be170 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -422,6 +422,10 @@ impl DeleteTimelineFlow { pub(crate) fn is_finished(&self) -> bool { matches!(self, Self::Finished) } + + pub(crate) fn is_not_started(&self) -> bool { + matches!(self, Self::NotStarted) + } } struct DeletionGuard(OwnedMutexGuard); diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs new file mode 100644 index 0000000000..5c2b25da56 --- /dev/null +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -0,0 +1,550 @@ +use std::sync::Arc; + +use super::{layer_manager::LayerManager, Timeline}; +use crate::{ + context::{DownloadBehavior, RequestContext}, + task_mgr::TaskKind, + tenant::{ + storage_layer::{AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer}, + Tenant, + }, + virtual_file::{MaybeFatalIo, VirtualFile}, +}; +use tokio_util::sync::CancellationToken; +use tracing::Instrument; +use utils::{completion, generation::Generation, id::TimelineId, lsn::Lsn}; + +#[derive(Debug, thiserror::Error)] +pub(crate) enum Error { + #[error("no ancestors")] + NoAncestor, + #[error("too many ancestors")] + TooManyAncestors, + #[error("shutting down, please retry later")] + ShuttingDown, + #[error("detached timeline must receive writes before the operation")] + DetachedTimelineNeedsWrites, + #[error("flushing failed")] + FlushAncestor(#[source] anyhow::Error), + #[error("layer download failed")] + RewrittenDeltaDownloadFailed(#[source] anyhow::Error), + #[error("copying LSN prefix locally failed")] + CopyDeltaPrefix(#[source] anyhow::Error), + #[error("upload rewritten layer")] + UploadRewritten(#[source] anyhow::Error), + + #[error("ancestor is already being detached by: {}", .0)] + OtherTimelineDetachOngoing(TimelineId), + + #[error("remote copying layer failed")] + CopyFailed(#[source] anyhow::Error), + + #[error("unexpected error")] + Unexpected(#[source] anyhow::Error), +} + +pub(crate) struct PreparedTimelineDetach { + layers: Vec, +} + +/// TODO: this should be part of PageserverConf because we cannot easily modify cplane arguments. 
+#[derive(Debug)] +pub(crate) struct Options { + pub(crate) rewrite_concurrency: std::num::NonZeroUsize, + pub(crate) copy_concurrency: std::num::NonZeroUsize, +} + +impl Default for Options { + fn default() -> Self { + Self { + rewrite_concurrency: std::num::NonZeroUsize::new(2).unwrap(), + copy_concurrency: std::num::NonZeroUsize::new(10).unwrap(), + } + } +} + +/// See [`Timeline::prepare_to_detach_from_ancestor`] +pub(super) async fn prepare( + detached: &Arc, + tenant: &Tenant, + options: Options, + ctx: &RequestContext, +) -> Result<(completion::Completion, PreparedTimelineDetach), Error> { + use Error::*; + + if detached.remote_client.as_ref().is_none() { + unimplemented!("no new code for running without remote storage"); + } + + let Some((ancestor, ancestor_lsn)) = detached + .ancestor_timeline + .as_ref() + .map(|tl| (tl.clone(), detached.ancestor_lsn)) + else { + return Err(NoAncestor); + }; + + if !ancestor_lsn.is_valid() { + return Err(NoAncestor); + } + + if ancestor.ancestor_timeline.is_some() { + // non-technical requirement; we could flatten N ancestors just as easily but we chose + // not to + return Err(TooManyAncestors); + } + + if detached.get_prev_record_lsn() == Lsn::INVALID + || detached.disk_consistent_lsn.load() == ancestor_lsn + { + // this is to avoid a problem that after detaching we would be unable to start up the + // compute because of "PREV_LSN: invalid". + return Err(DetachedTimelineNeedsWrites); + } + + // before we acquire the gate, we must mark the ancestor as having a detach operation + // ongoing which will block other concurrent detach operations so we don't get to ackward + // situations where there would be two branches trying to reparent earlier branches. + let (guard, barrier) = completion::channel(); + + { + let mut guard = tenant.ongoing_timeline_detach.lock().unwrap(); + if let Some((tl, other)) = guard.as_ref() { + if !other.is_ready() { + return Err(OtherTimelineDetachOngoing(*tl)); + } + } + *guard = Some((detached.timeline_id, barrier)); + } + + let _gate_entered = detached.gate.enter().map_err(|_| ShuttingDown)?; + + if ancestor_lsn >= ancestor.get_disk_consistent_lsn() { + let span = + tracing::info_span!("freeze_and_flush", ancestor_timeline_id=%ancestor.timeline_id); + async { + let started_at = std::time::Instant::now(); + let freeze_and_flush = ancestor.freeze_and_flush0(); + let mut freeze_and_flush = std::pin::pin!(freeze_and_flush); + + let res = + tokio::time::timeout(std::time::Duration::from_secs(1), &mut freeze_and_flush) + .await; + + let res = match res { + Ok(res) => res, + Err(_elapsed) => { + tracing::info!("freezing and flushing ancestor is still ongoing"); + freeze_and_flush.await + } + }; + + res.map_err(FlushAncestor)?; + + // we do not need to wait for uploads to complete but we do need `struct Layer`, + // copying delta prefix is unsupported currently for `InMemoryLayer`. + tracing::info!( + elapsed_ms = started_at.elapsed().as_millis(), + "froze and flushed the ancestor" + ); + Ok(()) + } + .instrument(span) + .await?; + } + + let end_lsn = ancestor_lsn + 1; + + let (filtered_layers, straddling_branchpoint, rest_of_historic) = { + // we do not need to start from our layers, because they can only be layers that come + // *after* ancestor_lsn + let layers = tokio::select! 
{ + guard = ancestor.layers.read() => guard, + _ = detached.cancel.cancelled() => { + return Err(ShuttingDown); + } + _ = ancestor.cancel.cancelled() => { + return Err(ShuttingDown); + } + }; + + // between retries, these can change if compaction or gc ran in between. this will mean + // we have to redo work. + partition_work(ancestor_lsn, &layers) + }; + + // TODO: layers are already sorted by something: use that to determine how much of remote + // copies are already done. + tracing::info!(filtered=%filtered_layers, to_rewrite = straddling_branchpoint.len(), historic=%rest_of_historic.len(), "collected layers"); + + // TODO: copying and lsn prefix copying could be done at the same time with a single fsync after + let mut new_layers: Vec = + Vec::with_capacity(straddling_branchpoint.len() + rest_of_historic.len()); + + { + tracing::debug!(to_rewrite = %straddling_branchpoint.len(), "copying prefix of delta layers"); + + let mut tasks = tokio::task::JoinSet::new(); + + let mut wrote_any = false; + + let limiter = Arc::new(tokio::sync::Semaphore::new( + options.rewrite_concurrency.get(), + )); + + for layer in straddling_branchpoint { + let limiter = limiter.clone(); + let timeline = detached.clone(); + let ctx = ctx.detached_child(TaskKind::DetachAncestor, DownloadBehavior::Download); + + tasks.spawn(async move { + let _permit = limiter.acquire().await; + let copied = + upload_rewritten_layer(end_lsn, &layer, &timeline, &timeline.cancel, &ctx) + .await?; + Ok(copied) + }); + } + + while let Some(res) = tasks.join_next().await { + match res { + Ok(Ok(Some(copied))) => { + wrote_any = true; + tracing::info!(layer=%copied, "rewrote and uploaded"); + new_layers.push(copied); + } + Ok(Ok(None)) => {} + Ok(Err(e)) => return Err(e), + Err(je) => return Err(Unexpected(je.into())), + } + } + + // FIXME: the fsync should be mandatory, after both rewrites and copies + if wrote_any { + let timeline_dir = VirtualFile::open( + &detached + .conf + .timeline_path(&detached.tenant_shard_id, &detached.timeline_id), + ) + .await + .fatal_err("VirtualFile::open for timeline dir fsync"); + timeline_dir + .sync_all() + .await + .fatal_err("VirtualFile::sync_all timeline dir"); + } + } + + let mut tasks = tokio::task::JoinSet::new(); + let limiter = Arc::new(tokio::sync::Semaphore::new(options.copy_concurrency.get())); + + for adopted in rest_of_historic { + let limiter = limiter.clone(); + let timeline = detached.clone(); + + tasks.spawn( + async move { + let _permit = limiter.acquire().await; + let owned = + remote_copy(&adopted, &timeline, timeline.generation, &timeline.cancel).await?; + tracing::info!(layer=%owned, "remote copied"); + Ok(owned) + } + .in_current_span(), + ); + } + + while let Some(res) = tasks.join_next().await { + match res { + Ok(Ok(owned)) => { + new_layers.push(owned); + } + Ok(Err(failed)) => { + return Err(failed); + } + Err(je) => return Err(Unexpected(je.into())), + } + } + + // TODO: fsync directory again if we hardlinked something + + let prepared = PreparedTimelineDetach { layers: new_layers }; + + Ok((guard, prepared)) +} + +fn partition_work( + ancestor_lsn: Lsn, + source_layermap: &LayerManager, +) -> (usize, Vec, Vec) { + let mut straddling_branchpoint = vec![]; + let mut rest_of_historic = vec![]; + + let mut later_by_lsn = 0; + + for desc in source_layermap.layer_map().iter_historic_layers() { + // off by one chances here: + // - start is inclusive + // - end is exclusive + if desc.lsn_range.start > ancestor_lsn { + later_by_lsn += 1; + continue; + } + + let target = if 
desc.lsn_range.start <= ancestor_lsn + && desc.lsn_range.end > ancestor_lsn + && desc.is_delta + { + // TODO: image layer at Lsn optimization + &mut straddling_branchpoint + } else { + &mut rest_of_historic + }; + + target.push(source_layermap.get_from_desc(&desc)); + } + + (later_by_lsn, straddling_branchpoint, rest_of_historic) +} + +async fn upload_rewritten_layer( + end_lsn: Lsn, + layer: &Layer, + target: &Arc, + cancel: &CancellationToken, + ctx: &RequestContext, +) -> Result, Error> { + use Error::UploadRewritten; + let copied = copy_lsn_prefix(end_lsn, layer, target, ctx).await?; + + let Some(copied) = copied else { + return Ok(None); + }; + + // FIXME: better shuttingdown error + target + .remote_client + .as_ref() + .unwrap() + .upload_layer_file(&copied, cancel) + .await + .map_err(UploadRewritten)?; + + Ok(Some(copied.into())) +} + +async fn copy_lsn_prefix( + end_lsn: Lsn, + layer: &Layer, + target_timeline: &Arc, + ctx: &RequestContext, +) -> Result, Error> { + use Error::{CopyDeltaPrefix, RewrittenDeltaDownloadFailed}; + + tracing::debug!(%layer, %end_lsn, "copying lsn prefix"); + + let mut writer = DeltaLayerWriter::new( + target_timeline.conf, + target_timeline.timeline_id, + target_timeline.tenant_shard_id, + layer.layer_desc().key_range.start, + layer.layer_desc().lsn_range.start..end_lsn, + ) + .await + .map_err(CopyDeltaPrefix)?; + + let resident = layer + .download_and_keep_resident() + .await + // likely shutdown + .map_err(RewrittenDeltaDownloadFailed)?; + + let records = resident + .copy_delta_prefix(&mut writer, end_lsn, ctx) + .await + .map_err(CopyDeltaPrefix)?; + + drop(resident); + + tracing::debug!(%layer, records, "copied records"); + + if records == 0 { + drop(writer); + // TODO: we might want to store an empty marker in remote storage for this + // layer so that we will not needlessly walk `layer` on repeated attempts. + Ok(None) + } else { + // reuse the key instead of adding more holes between layers by using the real + // highest key in the layer. + let reused_highest_key = layer.layer_desc().key_range.end; + let copied = writer + .finish(reused_highest_key, target_timeline, ctx) + .await + .map_err(CopyDeltaPrefix)?; + + tracing::debug!(%layer, %copied, "new layer produced"); + + Ok(Some(copied)) + } +} + +/// Creates a new Layer instance for the adopted layer, and ensures it is found from the remote +/// storage on successful return without the adopted layer being added to `index_part.json`. +async fn remote_copy( + adopted: &Layer, + adoptee: &Arc, + generation: Generation, + cancel: &CancellationToken, +) -> Result { + use Error::CopyFailed; + + // depending if Layer::keep_resident we could hardlink + + let mut metadata = adopted.metadata(); + debug_assert!(metadata.generation <= generation); + metadata.generation = generation; + + let owned = crate::tenant::storage_layer::Layer::for_evicted( + adoptee.conf, + adoptee, + adopted.layer_desc().filename(), + metadata, + ); + + // FIXME: better shuttingdown error + adoptee + .remote_client + .as_ref() + .unwrap() + .copy_timeline_layer(adopted, &owned, cancel) + .await + .map(move |()| owned) + .map_err(CopyFailed) +} + +/// See [`Timeline::complete_detaching_timeline_ancestor`]. 
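// ---------------------------------------------------------------------------
// Editor's aside (not part of the patch): the bounded-concurrency pattern that
// `prepare` uses for both `rewrite_concurrency` and `copy_concurrency`, shown
// standalone. A shared Semaphore caps how many of the JoinSet's tasks run at
// once; the workload below is a placeholder, not pageserver code.
use std::sync::Arc;
use tokio::{sync::Semaphore, task::JoinSet};

#[tokio::main]
async fn main() {
    let limiter = Arc::new(Semaphore::new(2)); // e.g. copy_concurrency = 2
    let mut tasks = JoinSet::new();

    for layer_no in 0..10 {
        let limiter = limiter.clone();
        tasks.spawn(async move {
            // Holding the permit for the whole task body bounds how many copies run in parallel.
            let _permit = limiter.acquire().await.expect("semaphore is never closed");
            tokio::time::sleep(std::time::Duration::from_millis(10)).await; // stand-in for a remote copy
            layer_no
        });
    }

    while let Some(res) = tasks.join_next().await {
        match res {
            Ok(layer_no) => println!("copied layer {layer_no}"),
            Err(join_err) => eprintln!("task panicked or was cancelled: {join_err}"),
        }
    }
}
// ---------------------------------------------------------------------------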
+pub(super) async fn complete( + detached: &Arc, + tenant: &Tenant, + prepared: PreparedTimelineDetach, + _ctx: &RequestContext, +) -> Result, anyhow::Error> { + let rtc = detached + .remote_client + .as_ref() + .expect("has to have a remote timeline client for timeline ancestor detach"); + + let PreparedTimelineDetach { layers } = prepared; + + let ancestor = detached + .get_ancestor_timeline() + .expect("must still have a ancestor"); + let ancestor_lsn = detached.get_ancestor_lsn(); + + // publish the prepared layers before we reparent any of the timelines, so that on restart + // reparented timelines find layers. also do the actual detaching. + // + // if we crash after this operation, we will at least come up having detached a timeline, but + // we cannot go back and reparent the timelines which would had been reparented in normal + // execution. + // + // this is not perfect, but it avoids us a retry happening after a compaction or gc on restart + // which could give us a completely wrong layer combination. + rtc.schedule_adding_existing_layers_to_index_detach_and_wait( + &layers, + (ancestor.timeline_id, ancestor_lsn), + ) + .await?; + + let mut tasks = tokio::task::JoinSet::new(); + + // because we are now keeping the slot in progress, it is unlikely that there will be any + // timeline deletions during this time. if we raced one, then we'll just ignore it. + tenant + .timelines + .lock() + .unwrap() + .values() + .filter_map(|tl| { + if Arc::ptr_eq(tl, detached) { + return None; + } + + if !tl.is_active() { + return None; + } + + let tl_ancestor = tl.ancestor_timeline.as_ref()?; + let is_same = Arc::ptr_eq(&ancestor, tl_ancestor); + let is_earlier = tl.get_ancestor_lsn() <= ancestor_lsn; + + let is_deleting = tl + .delete_progress + .try_lock() + .map(|flow| !flow.is_not_started()) + .unwrap_or(true); + + if is_same && is_earlier && !is_deleting { + Some(tl.clone()) + } else { + None + } + }) + .for_each(|timeline| { + // important in this scope: we are holding the Tenant::timelines lock + let span = tracing::info_span!("reparent", reparented=%timeline.timeline_id); + let new_parent = detached.timeline_id; + + tasks.spawn( + async move { + let res = timeline + .remote_client + .as_ref() + .expect("reparented has to have remote client because detached has one") + .schedule_reparenting_and_wait(&new_parent) + .await; + + match res { + Ok(()) => Some(timeline), + Err(e) => { + // with the use of tenant slot, we no longer expect these. + tracing::warn!("reparenting failed: {e:#}"); + None + } + } + } + .instrument(span), + ); + }); + + let reparenting_candidates = tasks.len(); + let mut reparented = Vec::with_capacity(tasks.len()); + + while let Some(res) = tasks.join_next().await { + match res { + Ok(Some(timeline)) => { + tracing::info!(reparented=%timeline.timeline_id, "reparenting done"); + reparented.push(timeline.timeline_id); + } + Ok(None) => { + // lets just ignore this for now. one or all reparented timelines could had + // started deletion, and that is fine. + } + Err(je) if je.is_cancelled() => unreachable!("not used"), + Err(je) if je.is_panic() => { + // ignore; it's better to continue with a single reparenting failing (or even + // all of them) in order to get to the goal state. + // + // these timelines will never be reparentable, but they can be always detached as + // separate tree roots. 
+ } + Err(je) => tracing::error!("unexpected join error: {je:?}"), + } + } + + if reparenting_candidates != reparented.len() { + tracing::info!("failed to reparent some candidates"); + } + + Ok(reparented) +} diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 231ffd898e..b06972056c 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -819,6 +819,23 @@ class PageserverHttpClient(requests.Session, MetricsGetter): continue self.download_layer(tenant_id, timeline_id, layer.layer_file_name) + def detach_ancestor( + self, + tenant_id: Union[TenantId, TenantShardId], + timeline_id: TimelineId, + batch_size: int | None = None, + ) -> Set[TimelineId]: + params = {} + if batch_size is not None: + params["batch_size"] = batch_size + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/detach_ancestor", + params=params, + ) + self.verbose_error(res) + json = res.json() + return set(map(TimelineId, json["reparented_timelines"])) + def evict_layer( self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str ): diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py new file mode 100644 index 0000000000..bc983c36ee --- /dev/null +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -0,0 +1,410 @@ +import enum +from concurrent.futures import ThreadPoolExecutor +from queue import Empty, Queue +from threading import Barrier +from typing import List + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + wait_for_last_flush_lsn, +) +from fixtures.pageserver.http import HistoricLayerInfo +from fixtures.pageserver.utils import wait_timeline_detail_404 +from fixtures.types import Lsn, TimelineId + + +def by_end_lsn(info: HistoricLayerInfo) -> Lsn: + assert info.lsn_end is not None + return Lsn(info.lsn_end) + + +def layer_name(info: HistoricLayerInfo) -> str: + return info.layer_file_name + + +@enum.unique +class Branchpoint(str, enum.Enum): + """ + Have branches at these Lsns possibly relative to L0 layer boundary. + """ + + EARLIER = "earlier" + AT_L0 = "at" + AFTER_L0 = "after" + LAST_RECORD_LSN = "head" + + def __str__(self) -> str: + return self.value + + @staticmethod + def all() -> List["Branchpoint"]: + return [ + Branchpoint.EARLIER, + Branchpoint.AT_L0, + Branchpoint.AFTER_L0, + Branchpoint.LAST_RECORD_LSN, + ] + + +@pytest.mark.parametrize("branchpoint", Branchpoint.all()) +@pytest.mark.parametrize("restart_after", [True, False]) +def test_ancestor_detach_branched_from( + neon_env_builder: NeonEnvBuilder, branchpoint: Branchpoint, restart_after: bool +): + """ + Creates a branch relative to L0 lsn boundary according to Branchpoint. Later the timeline is detached. 
+ """ + # TODO: parametrize; currently unimplemented over at pageserver + write_to_branch_first = True + + env = neon_env_builder.init_start() + + env.pageserver.allowed_errors.extend( + [ + ".*initial size calculation failed: downloading failed, possibly for shutdown" + ".*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ] + ) + + client = env.pageserver.http_client() + + with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as ep: + ep.safe_psql("CREATE TABLE foo (i BIGINT);") + + after_first_tx = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + + ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(0, 8191) g(i);") + + # create a single layer for us to remote copy + wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + client.timeline_checkpoint(env.initial_tenant, env.initial_timeline) + + ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(8192, 16383) g(i);") + wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + + deltas = client.layer_map_info(env.initial_tenant, env.initial_timeline).delta_layers() + # there is also the in-mem layer, but ignore it for now + assert len(deltas) == 2, "expecting there to be two deltas: initdb and checkpointed" + later_delta = max(deltas, key=by_end_lsn) + assert later_delta.lsn_end is not None + + # -1 as the lsn_end is exclusive. + last_lsn = Lsn(later_delta.lsn_end).lsn_int - 1 + + if branchpoint == Branchpoint.EARLIER: + branch_at = after_first_tx + rows = 0 + truncated_layers = 1 + elif branchpoint == Branchpoint.AT_L0: + branch_at = Lsn(last_lsn) + rows = 8192 + truncated_layers = 0 + elif branchpoint == Branchpoint.AFTER_L0: + branch_at = Lsn(last_lsn + 8) + rows = 8192 + # as there is no 8 byte walrecord, nothing should get copied from the straddling layer + truncated_layers = 0 + else: + # this case also covers the implicit flush of ancestor as the inmemory hasn't been flushed yet + assert branchpoint == Branchpoint.LAST_RECORD_LSN + branch_at = None + rows = 16384 + truncated_layers = 0 + + name = "new main" + + timeline_id = env.neon_cli.create_branch( + name, "main", env.initial_tenant, ancestor_start_lsn=branch_at + ) + + recorded = Lsn(client.timeline_detail(env.initial_tenant, timeline_id)["ancestor_lsn"]) + if branch_at is None: + # fix it up if we need it later (currently unused) + branch_at = recorded + else: + assert branch_at == recorded, "the test should not use unaligned lsns" + + if write_to_branch_first: + with env.endpoints.create_start(name, tenant_id=env.initial_tenant) as ep: + assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows + # make sure the ep is writable + # with BEFORE_L0, AFTER_L0 there will be a gap in Lsns caused by accurate end_lsn on straddling layers + ep.safe_psql("CREATE TABLE audit AS SELECT 1 as starts;") + wait_for_last_flush_lsn(env, ep, env.initial_tenant, timeline_id) + + # branch must have a flush for "PREV_LSN: none" + client.timeline_checkpoint(env.initial_tenant, timeline_id) + branch_layers = set( + map(layer_name, client.layer_map_info(env.initial_tenant, timeline_id).historic_layers) + ) + else: + branch_layers = set() + + all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id) + assert all_reparented == set() + + if restart_after: + env.pageserver.stop() + env.pageserver.start() + + with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as ep: + assert ep.safe_psql("SELECT 
count(*) FROM foo;")[0][0] == 16384 + + with env.endpoints.create_start(name, tenant_id=env.initial_tenant) as ep: + assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows + + old_main_info = client.layer_map_info(env.initial_tenant, env.initial_timeline) + old_main = set(map(layer_name, old_main_info.historic_layers)) + + new_main_info = client.layer_map_info(env.initial_tenant, timeline_id) + new_main = set(map(layer_name, new_main_info.historic_layers)) + + new_main_copied_or_truncated = new_main - branch_layers + new_main_truncated = new_main_copied_or_truncated - old_main + + assert len(new_main_truncated) == truncated_layers + # could additionally check that the symmetric difference has layers starting at the same lsn + # but if nothing was copied, then there is no nice rule. + # there could be a hole in LSNs between copied from the "old main" and the first branch layer. + + client.timeline_delete(env.initial_tenant, env.initial_timeline) + wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline, 10, 1.0) + + +@pytest.mark.parametrize("restart_after", [True, False]) +def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder, restart_after: bool): + """ + The case from RFC: + + +-> another branch with same ancestor_lsn as new main + | + old main -------|---------X---------> + | | | + | | +-> after + | | + | +-> new main + | + +-> reparented + + Ends up as: + + old main ---------------------------> + | + +-> after + + +-> another branch with same ancestor_lsn as new main + | + new main -------|---------|-> + | + +-> reparented + + We confirm the end result by being able to delete "old main" after deleting "after". + """ + + # TODO: support not yet implemented for these + write_to_branch_first = True + + env = neon_env_builder.init_start() + + env.pageserver.allowed_errors.extend( + [ + ".*initial size calculation failed: downloading failed, possibly for shutdown", + # after restart this is likely to happen if there is other load on the runner + ".*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ] + ) + + client = env.pageserver.http_client() + + with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as ep: + ep.safe_psql("CREATE TABLE foo (i BIGINT);") + ep.safe_psql("CREATE TABLE audit AS SELECT 1 as starts;") + + branchpoint_pipe = wait_for_last_flush_lsn( + env, ep, env.initial_tenant, env.initial_timeline + ) + + ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(0, 8191) g(i);") + + branchpoint_x = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + client.timeline_checkpoint(env.initial_tenant, env.initial_timeline) + + ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(8192, 16383) g(i);") + wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + + # as this only gets reparented, we don't need to write to it like new main + reparented = env.neon_cli.create_branch( + "reparented", "main", env.initial_tenant, ancestor_start_lsn=branchpoint_pipe + ) + + same_branchpoint = env.neon_cli.create_branch( + "same_branchpoint", "main", env.initial_tenant, ancestor_start_lsn=branchpoint_x + ) + + timeline_id = env.neon_cli.create_branch( + "new main", "main", env.initial_tenant, ancestor_start_lsn=branchpoint_x + ) + + after = env.neon_cli.create_branch("after", "main", env.initial_tenant, ancestor_start_lsn=None) + + if write_to_branch_first: + with env.endpoints.create_start("new main", 
tenant_id=env.initial_tenant) as ep: + assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == 8192 + with ep.cursor() as cur: + cur.execute("UPDATE audit SET starts = starts + 1") + assert cur.rowcount == 1 + wait_for_last_flush_lsn(env, ep, env.initial_tenant, timeline_id) + + client.timeline_checkpoint(env.initial_tenant, timeline_id) + + all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id) + assert all_reparented == {reparented, same_branchpoint} + + if restart_after: + env.pageserver.stop() + env.pageserver.start() + + env.pageserver.quiesce_tenants() + + # checking the ancestor after is much faster than waiting for the endpoint not start + expected_result = [ + ("main", env.initial_timeline, None, 16384, 1), + ("after", after, env.initial_timeline, 16384, 1), + ("new main", timeline_id, None, 8192, 2), + ("same_branchpoint", same_branchpoint, timeline_id, 8192, 1), + ("reparented", reparented, timeline_id, 0, 1), + ] + + for _, timeline_id, expected_ancestor, _, _ in expected_result: + details = client.timeline_detail(env.initial_tenant, timeline_id) + ancestor_timeline_id = details["ancestor_timeline_id"] + if expected_ancestor is None: + assert ancestor_timeline_id is None + else: + assert TimelineId(ancestor_timeline_id) == expected_ancestor + + for name, _, _, rows, starts in expected_result: + with env.endpoints.create_start(name, tenant_id=env.initial_tenant) as ep: + assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows + assert ep.safe_psql(f"SELECT count(*) FROM audit WHERE starts = {starts}")[0][0] == 1 + + # delete the timelines to confirm detach actually worked + client.timeline_delete(env.initial_tenant, after) + wait_timeline_detail_404(client, env.initial_tenant, after, 10, 1.0) + + client.timeline_delete(env.initial_tenant, env.initial_timeline) + wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline, 10, 1.0) + + +@pytest.mark.parametrize("restart_after", [True, False]) +def test_detached_receives_flushes_while_being_detached( + neon_env_builder: NeonEnvBuilder, restart_after: bool +): + """ + Makes sure that the timeline is able to receive writes through-out the detach process. + """ + write_to_branch_first = True + + env = neon_env_builder.init_start() + + client = env.pageserver.http_client() + + # row counts have been manually verified to cause reconnections and getpage + # requests when restart_after=False with pg16 + def insert_rows(n: int, ep) -> int: + ep.safe_psql( + f"INSERT INTO foo SELECT i::bigint, 'more info!! 
this is a long string' || i FROM generate_series(0, {n - 1}) g(i);" + ) + return n + + with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as ep: + ep.safe_psql("CREATE EXTENSION neon_test_utils;") + ep.safe_psql("CREATE TABLE foo (i BIGINT, aux TEXT NOT NULL);") + + rows = insert_rows(256, ep) + + branchpoint = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + + timeline_id = env.neon_cli.create_branch( + "new main", "main", tenant_id=env.initial_tenant, ancestor_start_lsn=branchpoint + ) + + log.info("starting the new main endpoint") + ep = env.endpoints.create_start("new main", tenant_id=env.initial_tenant) + assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows + + if write_to_branch_first: + rows += insert_rows(256, ep) + wait_for_last_flush_lsn(env, ep, env.initial_tenant, timeline_id) + client.timeline_checkpoint(env.initial_tenant, timeline_id) + log.info("completed {write_to_branch_first=}") + + def small_txs(ep, queue: Queue[str], barrier): + extra_rows = 0 + + with ep.connect() as conn: + while True: + try: + queue.get_nowait() + break + except Empty: + pass + + if barrier is not None: + barrier.wait() + barrier = None + + cursor = conn.cursor() + cursor.execute( + "INSERT INTO foo(i, aux) VALUES (1, 'more info!! this is a long string' || 1);" + ) + extra_rows += 1 + return extra_rows + + with ThreadPoolExecutor(max_workers=1) as exec: + queue: Queue[str] = Queue() + barrier = Barrier(2) + + completion = exec.submit(small_txs, ep, queue, barrier) + barrier.wait() + + reparented = client.detach_ancestor(env.initial_tenant, timeline_id) + assert len(reparented) == 0 + + if restart_after: + # ep and row production is kept alive on purpose + env.pageserver.stop() + env.pageserver.start() + + env.pageserver.quiesce_tenants() + + queue.put("done") + extra_rows = completion.result() + assert extra_rows > 0, "some rows should had been written" + rows += extra_rows + + assert client.timeline_detail(env.initial_tenant, timeline_id)["ancestor_timeline_id"] is None + + assert ep.safe_psql("SELECT clear_buffer_cache();") + assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows + assert ep.safe_psql("SELECT SUM(LENGTH(aux)) FROM foo")[0][0] != 0 + ep.stop() + + # finally restart the endpoint and make sure we still have the same answer + with env.endpoints.create_start("new main", tenant_id=env.initial_tenant) as ep: + assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows + + env.pageserver.allowed_errors.append( + "initial size calculation failed: downloading failed, possibly for shutdown" + ) + + +# TODO: +# - after starting the operation, tenant is deleted +# - after starting the operation, pageserver is shutdown, restarted +# - after starting the operation, bottom-most timeline is deleted, pageserver is restarted, gc is inhibited +# - deletion of reparented while reparenting should fail once, then succeed (?) +# - branch near existing L1 boundary, image layers? +# - investigate: why are layers started at uneven lsn? not just after branching, but in general. 
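
For orientation (not part of the patch above): the core flow these tests exercise can be condensed to roughly the sketch below, using only the client methods added in this change; the helper name is made up for illustration.

# Illustrative sketch only: condenses the detach flow used by the tests above.
def detach_and_verify(env, client, branch_timeline_id):
    # Ask the pageserver to detach the branch from its ancestor; the returned set
    # contains the timelines that were reparented onto the now-detached branch.
    reparented = client.detach_ancestor(env.initial_tenant, branch_timeline_id)

    # Afterwards the branch must no longer report an ancestor.
    detail = client.timeline_detail(env.initial_tenant, branch_timeline_id)
    assert detail["ancestor_timeline_id"] is None

    return reparented
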
From 6e4e578841ce9ec09a8b8e255a511163407901bd Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 7 May 2024 13:12:53 +0100 Subject: [PATCH 0711/1571] build(deps): bump werkzeug from 3.0.1 to 3.0.3 (#7625) --- poetry.lock | 8 ++++---- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 6ed64d28fc..1e4acf5a44 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2612,13 +2612,13 @@ files = [ [[package]] name = "werkzeug" -version = "3.0.1" +version = "3.0.3" description = "The comprehensive WSGI web application library." optional = false python-versions = ">=3.8" files = [ - {file = "werkzeug-3.0.1-py3-none-any.whl", hash = "sha256:90a285dc0e42ad56b34e696398b8122ee4c681833fb35b8334a095d82c56da10"}, - {file = "werkzeug-3.0.1.tar.gz", hash = "sha256:507e811ecea72b18a404947aded4b3390e1db8f826b494d76550ef45bb3b1dcc"}, + {file = "werkzeug-3.0.3-py3-none-any.whl", hash = "sha256:fc9645dc43e03e4d630d23143a04a7f947a9a3b5727cd535fdfe155a17cc48c8"}, + {file = "werkzeug-3.0.3.tar.gz", hash = "sha256:097e5bfda9f0aba8da6b8545146def481d06aa7d3266e7448e2cccf67dd8bd18"}, ] [package.dependencies] @@ -2900,4 +2900,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "b3452b50901123fd5f2c385ce8a0c1c492296393b8a7926a322b6df0ea3ac572" +content-hash = "6440e52af2ac8dfd3709d8bd1644fe92006a14b024bc5476ed7c359516729673" diff --git a/pyproject.toml b/pyproject.toml index aadcf26818..9fef8f14a7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,7 @@ backoff = "^2.2.1" pytest-lazy-fixture = "^0.6.3" prometheus-client = "^0.14.1" pytest-timeout = "^2.1.0" -Werkzeug = "^3.0.1" +Werkzeug = "^3.0.3" pytest-order = "^1.1.0" allure-pytest = "^2.13.2" pytest-asyncio = "^0.21.0" From 5a3d8e75edd5f684726b662638c833f02b1423e6 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 7 May 2024 12:53:52 +0000 Subject: [PATCH 0712/1571] build(deps): bump jinja2 from 3.1.3 to 3.1.4 (#7626) --- poetry.lock | 8 ++++---- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 1e4acf5a44..6d7d6e5719 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1243,13 +1243,13 @@ files = [ [[package]] name = "jinja2" -version = "3.1.3" +version = "3.1.4" description = "A very fast and expressive template engine." 
optional = false python-versions = ">=3.7" files = [ - {file = "Jinja2-3.1.3-py3-none-any.whl", hash = "sha256:7d6d50dd97d52cbc355597bd845fabfbac3f551e1f99619e39a35ce8c370b5fa"}, - {file = "Jinja2-3.1.3.tar.gz", hash = "sha256:ac8bd6544d4bb2c9792bf3a159e80bba8fda7f07e81bc3aed565432d5925ba90"}, + {file = "jinja2-3.1.4-py3-none-any.whl", hash = "sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d"}, + {file = "jinja2-3.1.4.tar.gz", hash = "sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369"}, ] [package.dependencies] @@ -2900,4 +2900,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "6440e52af2ac8dfd3709d8bd1644fe92006a14b024bc5476ed7c359516729673" +content-hash = "496d6d9f722983bda4d1265370bc8ba75560da74ab5d6b68c94a03290815e1eb" diff --git a/pyproject.toml b/pyproject.toml index 9fef8f14a7..4ec8efc2ff 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,7 @@ requests = "^2.31.0" pytest-xdist = "^3.3.1" asyncpg = "^0.29.0" aiopg = "^1.4.0" -Jinja2 = "^3.1.3" +Jinja2 = "^3.1.4" types-requests = "^2.31.0.0" types-psycopg2 = "^2.9.21.10" boto3 = "^1.34.11" From 51376ef3c8436180c8d693e0f4e8c97df9354c35 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 7 May 2024 16:18:17 +0100 Subject: [PATCH 0713/1571] Add Postgres commit sha to Postgres version (#4603) ## Problem Ref https://neondb.slack.com/archives/C036U0GRMRB/p1688122168477729 ## Summary of changes - Add sha from postgres repo into postgres version string (via `--with-extra-version`) - Add a test that Postgres version matches the expected one - Remove build-time hard check and allow only related tests to fail --- .github/workflows/build_and_test.yml | 21 ------------ Makefile | 11 +++--- test_runner/fixtures/neon_fixtures.py | 2 +- test_runner/regress/test_postgres_version.py | 35 ++++++++++++++++++++ vendor/revisions.json | 6 ++-- 5 files changed, 46 insertions(+), 29 deletions(-) create mode 100644 test_runner/regress/test_postgres_version.py diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 606564f209..eada65505f 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -236,27 +236,6 @@ jobs: submodules: true fetch-depth: 1 - - name: Check Postgres submodules revision - shell: bash -euo pipefail {0} - run: | - # This is a temporary solution to ensure that the Postgres submodules revision is correct (i.e. the updated intentionally). - # Eventually it will be replaced by a regression test https://github.com/neondatabase/neon/pull/4603 - - FAILED=false - for postgres in postgres-v14 postgres-v15 postgres-v16; do - expected=$(cat vendor/revisions.json | jq --raw-output '."'"${postgres}"'"') - actual=$(git rev-parse "HEAD:vendor/${postgres}") - if [ "${expected}" != "${actual}" ]; then - echo >&2 "Expected ${postgres} rev to be at '${expected}', but it is at '${actual}'" - FAILED=true - fi - done - - if [ "${FAILED}" = "true" ]; then - echo >&2 "Please update vendor/revisions.json if these changes are intentional" - exit 1 - fi - - name: Set pg 14 revision for caching id: pg_v14_rev run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT diff --git a/Makefile b/Makefile index 5e2b3c4367..dcbfdbcbc1 100644 --- a/Makefile +++ b/Makefile @@ -81,11 +81,14 @@ $(POSTGRES_INSTALL_DIR)/build/%/config.status: echo "'git submodule update --init --recursive --depth 2 --progress .' 
in project root.\n"; \ exit 1; } mkdir -p $(POSTGRES_INSTALL_DIR)/build/$* - (cd $(POSTGRES_INSTALL_DIR)/build/$* && \ - env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure \ + + VERSION=$*; \ + EXTRA_VERSION=$$(cd $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION && git rev-parse HEAD); \ + (cd $(POSTGRES_INSTALL_DIR)/build/$$VERSION && \ + env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION/configure \ CFLAGS='$(PG_CFLAGS)' \ - $(PG_CONFIGURE_OPTS) \ - --prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$* > configure.log) + $(PG_CONFIGURE_OPTS) --with-extra-version=" ($$EXTRA_VERSION)" \ + --prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$$VERSION > configure.log) # nicer alias to run 'configure' # Note: I've been unable to use templates for this part of our configuration. diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 1552e7e48a..db74577b40 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -982,7 +982,7 @@ class NeonEnv: Some notable functions and fields in NeonEnv: - postgres - A factory object for creating postgres compute nodes. + endpoints - A factory object for creating postgres compute nodes. pageservers - An array containing objects representing the pageservers diff --git a/test_runner/regress/test_postgres_version.py b/test_runner/regress/test_postgres_version.py new file mode 100644 index 0000000000..03e8c7c0df --- /dev/null +++ b/test_runner/regress/test_postgres_version.py @@ -0,0 +1,35 @@ +import json +import re +from pathlib import Path + +from fixtures.neon_fixtures import PgBin +from fixtures.pg_version import PgVersion + + +def test_postgres_version(base_dir: Path, pg_bin: PgBin, pg_version: PgVersion): + """Test that Postgres version matches the one we expect""" + + with (base_dir / "vendor" / "revisions.json").open() as f: + expected_revisions = json.load(f) + + output_prefix = pg_bin.run_capture(["postgres", "--version"], with_command_header=False) + stdout = Path(f"{output_prefix}.stdout") + assert stdout.exists(), "postgres --version didn't print anything to stdout" + + with stdout.open() as f: + output = f.read().strip() + + # `postgres --version` prints something like "postgres (PostgreSQL) 15.6 (85d809c124a898847a97d66a211f7d5ef4f8e0cb)". 
+ pattern = r"postgres \(PostgreSQL\) (?P\d+\.\d+) \((?P[0-9a-f]{40})\)" + match = re.search(pattern, output, re.IGNORECASE) + assert match is not None, f"Can't parse {output} with {pattern}" + + version = match.group("version") + commit = match.group("commit") + + assert ( + pg_version.v_prefixed in expected_revisions + ), f"Version `{pg_version.v_prefixed}` doesn't exist in `vendor/revisions.json`, please update it if these changes are intentional" + + msg = f"Unexpected Postgres {pg_version} version: `{output}`, please update `vendor/revisions.json` if these changes are intentional" + assert [version, commit] == expected_revisions[pg_version.v_prefixed], msg diff --git a/vendor/revisions.json b/vendor/revisions.json index a353fde8fd..c5b55762fa 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "postgres-v16": "8ef3c33aa01631e17cb24a122776349fcc777b46", - "postgres-v15": "f0d6b0ef7581bd78011832e23d8420a7d2c8a83a", - "postgres-v14": "d6f7e2c604bfc7cbc4c46bcea0a8e800f4bc778a" + "v16": ["16.2", "8ef3c33aa01631e17cb24a122776349fcc777b46"], + "v15": ["15.6", "f0d6b0ef7581bd78011832e23d8420a7d2c8a83a"], + "v14": ["14.11", "d6f7e2c604bfc7cbc4c46bcea0a8e800f4bc778a"] } From 2dbd1c1ed5cd0458933e8ffd40a9c0a5f4d610b8 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 7 May 2024 16:29:40 +0100 Subject: [PATCH 0714/1571] build(deps): bump flask-cors from 3.0.10 to 4.0.1 (#7633) --- poetry.lock | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index 6d7d6e5719..e437f5de74 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1001,18 +1001,17 @@ dotenv = ["python-dotenv"] [[package]] name = "flask-cors" -version = "3.0.10" +version = "4.0.1" description = "A Flask extension adding a decorator for CORS support" optional = false python-versions = "*" files = [ - {file = "Flask-Cors-3.0.10.tar.gz", hash = "sha256:b60839393f3b84a0f3746f6cdca56c1ad7426aa738b70d6c61375857823181de"}, - {file = "Flask_Cors-3.0.10-py2.py3-none-any.whl", hash = "sha256:74efc975af1194fc7891ff5cd85b0f7478be4f7f59fe158102e91abb72bb4438"}, + {file = "Flask_Cors-4.0.1-py2.py3-none-any.whl", hash = "sha256:f2a704e4458665580c074b714c4627dd5a306b333deb9074d0b1794dfa2fb677"}, + {file = "flask_cors-4.0.1.tar.gz", hash = "sha256:eeb69b342142fdbf4766ad99357a7f3876a2ceb77689dc10ff912aac06c389e4"}, ] [package.dependencies] Flask = ">=0.9" -Six = "*" [[package]] name = "frozenlist" From ea531d448eb65c4f58abb9ef7d8cd461952f7c5f Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 7 May 2024 17:43:04 +0200 Subject: [PATCH 0715/1571] fix(test suite): forward compat test is not using latest neon_local (#7637) The `test_forward_compatibility` test runs the old production binaries, but is supposed to always run the latest neon_local binary. I think commit 6acbee23 broke that by accident because in that commit, `from_repo_dir` is introduced and runs an `init_start()` before the `test_forward_compatibility` gets a chance to patch up the neon_local_binpath. 
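
To illustrate the resulting setup, a minimal sketch (illustrative only; it mirrors the test_compatibility.py change in the diff below, using the fixture names from test_forward_compatibility):

# Sketch: previous release's production binaries, current neon_local.
def test_forward_compatibility_sketch(
    neon_env_builder,
    compatibility_neon_bin,
    compatibility_postgres_distrib_dir,
    compatibility_snapshot_dir,
):
    # Old pageserver/safekeeper/postgres binaries from the compatibility snapshot...
    neon_env_builder.neon_binpath = compatibility_neon_bin
    neon_env_builder.pg_distrib_dir = compatibility_postgres_distrib_dir
    # ...while neon_local_binpath keeps its default, i.e. the current build, so the
    # test framework drives everything through the latest neon_local CLI.
    env = neon_env_builder.from_repo_dir(compatibility_snapshot_dir / "repo")
    neon_env_builder.start()
    return env
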
--- test_runner/fixtures/neon_fixtures.py | 14 +++++--------- test_runner/regress/test_compatibility.py | 15 ++++++++------- 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index db74577b40..1e4de9a888 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -488,6 +488,7 @@ class NeonEnvBuilder: self.env: Optional[NeonEnv] = None self.keep_remote_storage_contents: bool = True self.neon_binpath = neon_binpath + self.neon_local_binpath = neon_binpath self.pg_distrib_dir = pg_distrib_dir self.pg_version = pg_version self.preserve_database_files = preserve_database_files @@ -632,17 +633,11 @@ class NeonEnvBuilder: def from_repo_dir( self, repo_dir: Path, - neon_binpath: Optional[Path] = None, - pg_distrib_dir: Optional[Path] = None, ) -> NeonEnv: """ A simple method to import data into the current NeonEnvBuilder from a snapshot of a repo dir. """ - # Setting custom `neon_binpath` and `pg_distrib_dir` is useful for compatibility tests - self.neon_binpath = neon_binpath or self.neon_binpath - self.pg_distrib_dir = pg_distrib_dir or self.pg_distrib_dir - # Get the initial tenant and timeline from the snapshot config snapshot_config_toml = repo_dir / "config" with snapshot_config_toml.open("r") as f: @@ -1017,9 +1012,10 @@ class NeonEnv: self.pg_version = config.pg_version # Binary path for pageserver, safekeeper, etc self.neon_binpath = config.neon_binpath - # Binary path for neon_local test-specific binaries: may be overridden - # after construction for compat testing - self.neon_local_binpath = config.neon_binpath + # Binary path for neon_local test-specific binaries + self.neon_local_binpath = config.neon_local_binpath + if self.neon_local_binpath is None: + self.neon_local_binpath = self.neon_binpath self.pg_distrib_dir = config.pg_distrib_dir self.endpoint_counter = 0 self.storage_controller_config = config.storage_controller_config diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index e1ccb3e0c6..787c114fc1 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -233,17 +233,18 @@ def test_forward_compatibility( neon_env_builder.pageserver_validate_vectored_get = None neon_env_builder.num_safekeepers = 3 - neon_local_binpath = neon_env_builder.neon_binpath + + # Use previous version's production binaries (pageserver, safekeeper, pg_distrib_dir, etc.). + # But always use the current version's neon_local binary. + # This is because we want to test the compatibility of the data format, not the compatibility of the neon_local CLI. + neon_env_builder.neon_binpath = compatibility_neon_bin + neon_env_builder.pg_distrib_dir = compatibility_postgres_distrib_dir + neon_env_builder.neon_local_binpath = neon_env_builder.neon_local_binpath + env = neon_env_builder.from_repo_dir( compatibility_snapshot_dir / "repo", - neon_binpath=compatibility_neon_bin, - pg_distrib_dir=compatibility_postgres_distrib_dir, ) - # Use current neon_local even though we're using old binaries for - # everything else: our test code is written for latest CLI args. 
- env.neon_local_binpath = neon_local_binpath - neon_env_builder.start() check_neon_works( From d041f9a8872771a94075215605f624e861e081a8 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 7 May 2024 19:22:29 +0300 Subject: [PATCH 0716/1571] refactor(rtc): remove excess cloning (#7635) RemoteTimelineClient has a lot of mandatory cloning. By using a single way of creating IndexPart out of UploadQueueInitialized we can simplify things and also avoid cloning the latest files for each `index_part.json` upload (the contents will still be cloned). --- .../src/tenant/remote_timeline_client.rs | 38 +++++------------- .../tenant/remote_timeline_client/index.rs | 40 ++++++++++--------- pageserver/src/tenant/timeline.rs | 2 +- 3 files changed, 32 insertions(+), 48 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 49dbac2f13..255449c049 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -570,7 +570,7 @@ impl RemoteTimelineClient { // ahead of what's _actually_ on the remote during index upload. upload_queue.latest_metadata = metadata.clone(); - self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone()); + self.schedule_index_upload(upload_queue); Ok(()) } @@ -591,7 +591,7 @@ impl RemoteTimelineClient { upload_queue.latest_metadata.apply(update); - self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone()); + self.schedule_index_upload(upload_queue); Ok(()) } @@ -611,18 +611,14 @@ impl RemoteTimelineClient { let upload_queue = guard.initialized_mut()?; if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 { - self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone()); + self.schedule_index_upload(upload_queue); } Ok(()) } /// Launch an index-file upload operation in the background (internal function) - fn schedule_index_upload( - self: &Arc, - upload_queue: &mut UploadQueueInitialized, - metadata: TimelineMetadata, - ) { + fn schedule_index_upload(self: &Arc, upload_queue: &mut UploadQueueInitialized) { let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn(); info!( @@ -631,11 +627,7 @@ impl RemoteTimelineClient { upload_queue.latest_files_changes_since_metadata_upload_scheduled, ); - let index_part = IndexPart::new( - upload_queue.latest_files.clone(), - disk_consistent_lsn, - metadata, - ); + let index_part = IndexPart::from(&*upload_queue); let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn); self.metric_begin(&op); upload_queue.queued_operations.push_back(op); @@ -657,7 +649,7 @@ impl RemoteTimelineClient { upload_queue.latest_metadata.reparent(new_parent); - self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone()); + self.schedule_index_upload(upload_queue); self.schedule_barrier0(upload_queue) }; @@ -688,7 +680,7 @@ impl RemoteTimelineClient { .insert(layer.layer_desc().filename(), layer.metadata()); } - self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone()); + self.schedule_index_upload(upload_queue); let barrier = self.schedule_barrier0(upload_queue); self.launch_queued_tasks(upload_queue); @@ -792,10 +784,6 @@ impl RemoteTimelineClient { where I: IntoIterator, { - // Deleting layers doesn't affect the values stored in TimelineMetadata, - // so we don't need update it. Just serialize it. 
- let metadata = upload_queue.latest_metadata.clone(); - // Decorate our list of names with each name's metadata, dropping // names that are unexpectedly missing from our metadata. This metadata // is later used when physically deleting layers, to construct key paths. @@ -834,7 +822,7 @@ impl RemoteTimelineClient { // index_part update, because that needs to be uploaded before we can actually delete the // files. if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 { - self.schedule_index_upload(upload_queue, metadata); + self.schedule_index_upload(upload_queue); } with_metadata @@ -1063,8 +1051,7 @@ impl RemoteTimelineClient { let deleted_at = Utc::now().naive_utc(); stopped.deleted_at = SetDeletedFlagProgress::InProgress(deleted_at); - let mut index_part = IndexPart::try_from(&stopped.upload_queue_for_deletion) - .context("IndexPart serialize")?; + let mut index_part = IndexPart::from(&stopped.upload_queue_for_deletion); index_part.deleted_at = Some(deleted_at); index_part }; @@ -2378,12 +2365,7 @@ mod tests { async fn inject_index_part(test_state: &TestSetup, generation: Generation) -> IndexPart { // An empty IndexPart, just sufficient to ensure deserialization will succeed - let example_metadata = TimelineMetadata::example(); - let example_index_part = IndexPart::new( - HashMap::new(), - example_metadata.disk_consistent_lsn(), - example_metadata, - ); + let example_index_part = IndexPart::example(); let index_part_bytes = serde_json::to_vec(&example_index_part).unwrap(); diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 0abfdeef02..7e0619945f 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -6,7 +6,6 @@ use std::collections::HashMap; use chrono::NaiveDateTime; use serde::{Deserialize, Serialize}; -use utils::bin_ser::SerializeError; use crate::tenant::metadata::TimelineMetadata; use crate::tenant::storage_layer::LayerFileName; @@ -104,15 +103,14 @@ impl IndexPart { pub const FILE_NAME: &'static str = "index_part.json"; - pub fn new( - layers_and_metadata: HashMap, + fn new( + layers_and_metadata: &HashMap, disk_consistent_lsn: Lsn, metadata: TimelineMetadata, ) -> Self { - // Transform LayerFileMetadata into IndexLayerMetadata let layer_metadata = layers_and_metadata - .into_iter() - .map(|(k, v)| (k, IndexLayerMetadata::from(v))) + .iter() + .map(|(k, v)| (k.to_owned(), IndexLayerMetadata::from(v))) .collect(); Self { @@ -141,20 +139,24 @@ impl IndexPart { pub fn to_s3_bytes(&self) -> serde_json::Result> { serde_json::to_vec(self) } + + #[cfg(test)] + pub(crate) fn example() -> Self { + let example_metadata = TimelineMetadata::example(); + Self::new( + &HashMap::new(), + example_metadata.disk_consistent_lsn(), + example_metadata, + ) + } } -impl TryFrom<&UploadQueueInitialized> for IndexPart { - type Error = SerializeError; +impl From<&UploadQueueInitialized> for IndexPart { + fn from(uq: &UploadQueueInitialized) -> Self { + let disk_consistent_lsn = uq.latest_metadata.disk_consistent_lsn(); + let metadata = uq.latest_metadata.clone(); - fn try_from(upload_queue: &UploadQueueInitialized) -> Result { - let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn(); - let metadata = upload_queue.latest_metadata.clone(); - - Ok(Self::new( - upload_queue.latest_files.clone(), - disk_consistent_lsn, - metadata, - )) + Self::new(&uq.latest_files, disk_consistent_lsn, metadata) } } @@ -172,8 +174,8 @@ 
pub struct IndexLayerMetadata {
     pub shard: ShardIndex,
 }
 
-impl From<LayerFileMetadata> for IndexLayerMetadata {
-    fn from(other: LayerFileMetadata) -> Self {
+impl From<&LayerFileMetadata> for IndexLayerMetadata {
+    fn from(other: &LayerFileMetadata) -> Self {
         IndexLayerMetadata {
             file_size: other.file_size,
             generation: other.generation,
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index add6e3806e..1c417262b0 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -3015,7 +3015,7 @@ impl Timeline {
 
                 HeatMapLayer::new(
                     layer.layer_desc().filename(),
-                    layer.metadata().into(),
+                    (&layer.metadata()).into(),
                     last_activity_ts,
                 )
             });

From 308227fa5114f6c81af165861740ed176bd389ee Mon Sep 17 00:00:00 2001
From: Christian Schwarz
Date: Tue, 7 May 2024 18:29:59 +0200
Subject: [PATCH 0717/1571] remove `neon_local --pageserver-config-override` (#7614)

Preceding PR https://github.com/neondatabase/neon/pull/7613 reduced the usage
of `--pageserver-config-override`.

This PR builds on top of that work and fully removes the
`neon_local --pageserver-config-override`.

Tests that need a non-default `pageserver.toml` control it using two options:

1. Specify `NeonEnvBuilder.pageserver_config_override` before
   `NeonEnvBuilder.init_start()`. This uses a new
   `neon_local init --pageserver-config` flag.
2. After `init_start()`: `env.pageserver.stop()` +
   `NeonPageserver.edit_config_toml()` + `env.pageserver.start()`

A few test cases were using
`env.pageserver.start(overrides=("--pageserver-config-override...",))`.
I changed them to use one of the options above.

Future Work
-----------

The `neon_local init --pageserver-config` flag still uses
`pageserver --config-override` under the hood. In the future, neon_local
should just write the `pageserver.toml` directly.

The `NeonEnvBuilder.pageserver_config_override` field should be renamed to
`pageserver_initial_config`. Let's save this churn for a separate refactor
commit.
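
A rough usage sketch of option 2 (illustrative only; it relies on the NeonPageserver.edit_config_toml() helper added in the diff below, and the config key shown is just an example taken from the tests touched by this patch):

# Option 2, sketched: restart the pageserver with an edited pageserver.toml.
def restart_pageserver_with_patched_config(env):
    env.pageserver.stop()

    def edit(config):
        # Example tweak; real tests set whatever keys they need.
        config["background_task_maximum_delay"] = "0s"

    env.pageserver.edit_config_toml(edit)
    env.pageserver.start()
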
--- Cargo.lock | 1 + control_plane/Cargo.toml | 1 + control_plane/src/bin/neon_local.rs | 58 +++++------- control_plane/src/pageserver.rs | 25 ++--- test_runner/fixtures/neon_fixtures.py | 94 +++++++++++++------ test_runner/fixtures/remote_storage.py | 22 ++++- .../performance/test_branch_creation.py | 10 +- .../regress/test_disk_usage_eviction.py | 48 +++++----- .../regress/test_pageserver_generations.py | 17 +++- .../regress/test_storage_controller.py | 7 +- 10 files changed, 163 insertions(+), 120 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8438dad41b..b0c7aec6ae 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1348,6 +1348,7 @@ dependencies = [ "tokio-postgres", "tokio-util", "toml", + "toml_edit", "tracing", "url", "utils", diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 2ce041068e..e62f3b8a47 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -28,6 +28,7 @@ serde_with.workspace = true tar.workspace = true thiserror.workspace = true toml.workspace = true +toml_edit.workspace = true tokio.workspace = true tokio-postgres.workspace = true tokio-util.workspace = true diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index e01d5c9799..3f09042d9d 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -133,7 +133,7 @@ fn main() -> Result<()> { let subcommand_result = match sub_name { "tenant" => rt.block_on(handle_tenant(sub_args, &mut env)), "timeline" => rt.block_on(handle_timeline(sub_args, &mut env)), - "start" => rt.block_on(handle_start_all(sub_args, &env)), + "start" => rt.block_on(handle_start_all(&env)), "stop" => rt.block_on(handle_stop_all(sub_args, &env)), "pageserver" => rt.block_on(handle_pageserver(sub_args, &env)), "storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)), @@ -358,6 +358,13 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result { default_conf(*num_pageservers) }; + let pageserver_config: toml_edit::Document = + if let Some(path) = init_match.get_one::("pageserver-config") { + std::fs::read_to_string(path)?.parse()? + } else { + toml_edit::Document::new() + }; + let pg_version = init_match .get_one::("pg-version") .copied() @@ -375,7 +382,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result { // Initialize pageserver, create initial tenant and timeline. for ps_conf in &env.pageservers { PageServerNode::from_env(&env, ps_conf) - .initialize(&pageserver_config_overrides(init_match)) + .initialize(&pageserver_config) .unwrap_or_else(|e| { eprintln!("pageserver init failed: {e:?}"); exit(1); @@ -397,15 +404,6 @@ fn get_default_pageserver(env: &local_env::LocalEnv) -> PageServerNode { PageServerNode::from_env(env, ps_conf) } -fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> { - init_match - .get_many::("pageserver-config-override") - .into_iter() - .flatten() - .map(String::as_str) - .collect() -} - async fn handle_tenant( tenant_match: &ArgMatches, env: &mut local_env::LocalEnv, @@ -1076,10 +1074,7 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result Result<()> { match sub_match.subcommand() { Some(("start", subcommand_args)) => { - if let Err(e) = get_pageserver(env, subcommand_args)? 
- .start(&pageserver_config_overrides(subcommand_args)) - .await - { + if let Err(e) = get_pageserver(env, subcommand_args)?.start().await { eprintln!("pageserver start failed: {e}"); exit(1); } @@ -1105,10 +1100,7 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> exit(1); } - if let Err(e) = pageserver - .start(&pageserver_config_overrides(subcommand_args)) - .await - { + if let Err(e) = pageserver.start().await { eprintln!("pageserver start failed: {e}"); exit(1); } @@ -1235,7 +1227,7 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Ok(()) } -async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> { +async fn handle_start_all(env: &local_env::LocalEnv) -> anyhow::Result<()> { // Endpoints are not started automatically broker::start_broker_process(env).await?; @@ -1252,10 +1244,7 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> for ps_conf in &env.pageservers { let pageserver = PageServerNode::from_env(env, ps_conf); - if let Err(e) = pageserver - .start(&pageserver_config_overrides(sub_match)) - .await - { + if let Err(e) = pageserver.start().await { eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e); try_stop_all(env, true).await; exit(1); @@ -1396,13 +1385,6 @@ fn cli() -> Command { .required(false) .value_name("stop-mode"); - let pageserver_config_args = Arg::new("pageserver-config-override") - .long("pageserver-config-override") - .num_args(1) - .action(ArgAction::Append) - .help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more") - .required(false); - let remote_ext_config_args = Arg::new("remote-ext-config") .long("remote-ext-config") .num_args(1) @@ -1464,14 +1446,21 @@ fn cli() -> Command { .subcommand( Command::new("init") .about("Initialize a new Neon repository, preparing configs for services to start with") - .arg(pageserver_config_args.clone()) .arg(num_pageservers_arg.clone()) .arg( Arg::new("config") .long("config") .required(false) .value_parser(value_parser!(PathBuf)) - .value_name("config"), + .value_name("config") + ) + .arg( + Arg::new("pageserver-config") + .long("pageserver-config") + .required(false) + .value_parser(value_parser!(PathBuf)) + .value_name("pageserver-config") + .help("Merge the provided pageserver config into the one generated by neon_local."), ) .arg(pg_version_arg.clone()) .arg(force_arg) @@ -1553,7 +1542,6 @@ fn cli() -> Command { .subcommand(Command::new("status")) .subcommand(Command::new("start") .about("Start local pageserver") - .arg(pageserver_config_args.clone()) ) .subcommand(Command::new("stop") .about("Stop local pageserver") @@ -1561,7 +1549,6 @@ fn cli() -> Command { ) .subcommand(Command::new("restart") .about("Restart local pageserver") - .arg(pageserver_config_args.clone()) ) ) .subcommand( @@ -1676,7 +1663,6 @@ fn cli() -> Command { .subcommand( Command::new("start") .about("Start page server and safekeepers") - .arg(pageserver_config_args) ) .subcommand( Command::new("stop") diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 45be14ef95..fbe0d419ae 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -76,7 +76,7 @@ impl PageServerNode { /// Merge overrides provided by the user on the command line with our default overides derived from neon_local configuration. /// /// These all end up on the command line of the `pageserver` binary. 
- fn neon_local_overrides(&self, cli_overrides: &[&str]) -> Vec { + fn neon_local_overrides(&self, cli_overrides: &toml_edit::Document) -> Vec { // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc. let pg_distrib_dir_param = format!( "pg_distrib_dir='{}'", @@ -156,10 +156,7 @@ impl PageServerNode { } } - if !cli_overrides - .iter() - .any(|c| c.starts_with("remote_storage")) - { + if !cli_overrides.contains_key("remote_storage") { overrides.push(format!( "remote_storage={{local_path='../{PAGESERVER_REMOTE_STORAGE_DIR}'}}" )); @@ -172,13 +169,13 @@ impl PageServerNode { } // Apply the user-provided overrides - overrides.extend(cli_overrides.iter().map(|&c| c.to_owned())); + overrides.push(cli_overrides.to_string()); overrides } /// Initializes a pageserver node by creating its config with the overrides provided. - pub fn initialize(&self, config_overrides: &[&str]) -> anyhow::Result<()> { + pub fn initialize(&self, config_overrides: &toml_edit::Document) -> anyhow::Result<()> { // First, run `pageserver --init` and wait for it to write a config into FS and exit. self.pageserver_init(config_overrides) .with_context(|| format!("Failed to run init for pageserver node {}", self.conf.id)) @@ -196,11 +193,11 @@ impl PageServerNode { .expect("non-Unicode path") } - pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> { - self.start_node(config_overrides).await + pub async fn start(&self) -> anyhow::Result<()> { + self.start_node().await } - fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> { + fn pageserver_init(&self, config_overrides: &toml_edit::Document) -> anyhow::Result<()> { let datadir = self.repo_path(); let node_id = self.conf.id; println!( @@ -268,7 +265,7 @@ impl PageServerNode { Ok(()) } - async fn start_node(&self, config_overrides: &[&str]) -> anyhow::Result<()> { + async fn start_node(&self) -> anyhow::Result<()> { // TODO: using a thread here because start_process() is not async but we need to call check_status() let datadir = self.repo_path(); print!( @@ -285,11 +282,7 @@ impl PageServerNode { self.conf.id, datadir, ) })?; - let mut args = vec!["-D", datadir_path_str]; - for config_override in config_overrides { - args.push("--config-override"); - args.push(*config_override); - } + let args = vec!["-D", datadir_path_str]; background_process::start_process( "pageserver", &datadir, diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 1e4de9a888..82f17fe20d 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -14,7 +14,7 @@ import textwrap import threading import time import uuid -from contextlib import closing, contextmanager +from contextlib import ExitStack, closing, contextmanager from dataclasses import dataclass from datetime import datetime from enum import Enum @@ -68,7 +68,7 @@ from fixtures.remote_storage import ( RemoteStorageUser, S3Storage, default_remote_storage, - remote_storage_to_toml_inline_table, + remote_storage_to_toml_dict, ) from fixtures.safekeeper.http import SafekeeperHttpClient from fixtures.safekeeper.utils import are_walreceivers_absent @@ -1705,36 +1705,44 @@ class NeonCli(AbstractNeonCli): force: Optional[str] = None, pageserver_config_override: Optional[str] = None, ) -> "subprocess.CompletedProcess[str]": - with tempfile.NamedTemporaryFile(mode="w+") as tmp: - tmp.write(toml.dumps(config)) - tmp.flush() + remote_storage = self.env.pageserver_remote_storage - cmd = ["init", 
f"--config={tmp.name}", "--pg-version", self.env.pg_version] + ps_config = {} + if remote_storage is not None: + ps_config["remote_storage"] = remote_storage_to_toml_dict(remote_storage) + + if pageserver_config_override is not None: + for o in pageserver_config_override.split(";"): + override = toml.loads(o) + for key, value in override.items(): + ps_config[key] = value + + with ExitStack() as stack: + ps_config_file = stack.enter_context(tempfile.NamedTemporaryFile(mode="w+")) + ps_config_file.write(toml.dumps(ps_config)) + ps_config_file.flush() + + neon_local_config = stack.enter_context(tempfile.NamedTemporaryFile(mode="w+")) + neon_local_config.write(toml.dumps(config)) + neon_local_config.flush() + + cmd = [ + "init", + f"--config={neon_local_config.name}", + "--pg-version", + self.env.pg_version, + f"--pageserver-config={ps_config_file.name}", + ] if force is not None: cmd.extend(["--force", force]) - remote_storage = self.env.pageserver_remote_storage - - if remote_storage is not None: - remote_storage_toml_table = remote_storage_to_toml_inline_table(remote_storage) - - cmd.append( - f"--pageserver-config-override=remote_storage={remote_storage_toml_table}" - ) - - if pageserver_config_override is not None: - cmd += [ - f"--pageserver-config-override={o.strip()}" - for o in pageserver_config_override.split(";") - ] - s3_env_vars = None if isinstance(remote_storage, S3Storage): s3_env_vars = remote_storage.access_env_vars() res = self.raw_cli(cmd, extra_env_vars=s3_env_vars) res.check_returncode() - return res + return res def storage_controller_start(self): cmd = ["storage_controller", "start"] @@ -1749,10 +1757,9 @@ class NeonCli(AbstractNeonCli): def pageserver_start( self, id: int, - overrides: Tuple[str, ...] = (), extra_env_vars: Optional[Dict[str, str]] = None, ) -> "subprocess.CompletedProcess[str]": - start_args = ["pageserver", "start", f"--id={id}", *overrides] + start_args = ["pageserver", "start", f"--id={id}"] storage = self.env.pageserver_remote_storage if isinstance(storage, S3Storage): @@ -2417,9 +2424,42 @@ class NeonPageserver(PgProtocol, LogUtils): return self.workdir / "tenants" return self.workdir / "tenants" / str(tenant_shard_id) + @property + def config_toml_path(self) -> Path: + return self.workdir / "pageserver.toml" + + def edit_config_toml(self, edit_fn: Callable[[Dict[str, Any]], None]): + """ + Edit the pageserver's config toml file in place. + """ + path = self.config_toml_path + with open(path, "r") as f: + config = toml.load(f) + edit_fn(config) + with open(path, "w") as f: + toml.dump(config, f) + + def patch_config_toml_nonrecursive(self, patch: Dict[str, Any]) -> Dict[str, Any]: + """ + Non-recursively merge the given `patch` dict into the existing config toml, using `dict.update()`. + Returns the replaced values. + If there was no previous value, the key is mapped to None. + This allows to restore the original value by calling this method with the returned dict. + """ + replacements = {} + + def doit(config: Dict[str, Any]): + while len(patch) > 0: + key, new = patch.popitem() + old = config.get(key, None) + config[key] = new + replacements[key] = old + + self.edit_config_toml(doit) + return replacements + def start( self, - overrides: Tuple[str, ...] 
= (), extra_env_vars: Optional[Dict[str, str]] = None, ) -> "NeonPageserver": """ @@ -2429,9 +2469,7 @@ class NeonPageserver(PgProtocol, LogUtils): """ assert self.running is False - self.env.neon_cli.pageserver_start( - self.id, overrides=overrides, extra_env_vars=extra_env_vars - ) + self.env.neon_cli.pageserver_start(self.id, extra_env_vars=extra_env_vars) self.running = True return self diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index 83f9f26837..925e1b450f 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -141,11 +141,13 @@ class LocalFsStorage: with self.heatmap_path(tenant_id).open("r") as f: return json.load(f) - def to_toml_inline_table(self) -> str: - rv = { + def to_toml_dict(self) -> Dict[str, Any]: + return { "local_path": str(self.root), } - return toml.TomlEncoder().dump_inline_table(rv) + + def to_toml_inline_table(self) -> str: + return toml.TomlEncoder().dump_inline_table(self.to_toml_dict()) def cleanup(self): # no cleanup is done here, because there's NeonEnvBuilder.cleanup_local_storage which will remove everything, including localfs files @@ -194,7 +196,7 @@ class S3Storage: } ) - def to_toml_inline_table(self) -> str: + def to_toml_dict(self) -> Dict[str, Any]: rv = { "bucket_name": self.bucket_name, "bucket_region": self.bucket_region, @@ -206,7 +208,10 @@ class S3Storage: if self.endpoint is not None: rv["endpoint"] = self.endpoint - return toml.TomlEncoder().dump_inline_table(rv) + return rv + + def to_toml_inline_table(self) -> str: + return toml.TomlEncoder().dump_inline_table(self.to_toml_dict()) def do_cleanup(self): if not self.cleanup: @@ -414,6 +419,13 @@ def default_remote_storage() -> RemoteStorageKind: return RemoteStorageKind.LOCAL_FS +def remote_storage_to_toml_dict(remote_storage: RemoteStorage) -> Dict[str, Any]: + if not isinstance(remote_storage, (LocalFsStorage, S3Storage)): + raise Exception("invalid remote storage type") + + return remote_storage.to_toml_dict() + + # serialize as toml inline table def remote_storage_to_toml_inline_table(remote_storage: RemoteStorage) -> str: if not isinstance(remote_storage, (LocalFsStorage, S3Storage)): diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index 54905759bd..7687b8417f 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -140,10 +140,14 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: # start without gc so we can time compaction with less noise; use shorter # period for compaction so it starts earlier + def patch_default_tenant_config(config): + tenant_config = config.get("tenant_config", {}) + tenant_config["compaction_period"] = "3s" + tenant_config["gc_period"] = "0s" + config["tenant_config"] = tenant_config + + env.pageserver.edit_config_toml(patch_default_tenant_config) env.pageserver.start( - overrides=( - "--pageserver-config-override=tenant_config={ compaction_period = '3s', gc_period = '0s' }", - ), # this does print more than we want, but the number should be comparable between runs extra_env_vars={ "RUST_LOG": f"[compaction_loop{{tenant_id={env.initial_tenant}}}]=debug,info" diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index b83545216d..5e9efa7cce 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ 
-5,7 +5,6 @@ from dataclasses import dataclass from typing import Any, Dict, Iterable, Tuple import pytest -import toml from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, @@ -45,17 +44,16 @@ def test_min_resident_size_override_handling( ps_http.set_tenant_config(tenant_id, {}) assert_config(tenant_id, None, default_tenant_conf_value) - env.pageserver.stop() if config_level_override is not None: - env.pageserver.start( - overrides=( - "--pageserver-config-override=tenant_config={ min_resident_size_override = " - + str(config_level_override) - + " }", - ) - ) - else: - env.pageserver.start() + + def set_min_resident_size(config): + tenant_config = config.get("tenant_config", {}) + tenant_config["min_resident_size_override"] = config_level_override + config["tenant_config"] = tenant_config + + env.pageserver.edit_config_toml(set_min_resident_size) + env.pageserver.stop() + env.pageserver.start() tenant_id, _ = env.neon_cli.create_tenant() assert_overrides(tenant_id, config_level_override) @@ -164,34 +162,32 @@ class EvictionEnv: usage eviction task is unknown; it might need to run one more iteration before assertions can be made. """ - disk_usage_config = { - "period": period, - "max_usage_pct": max_usage_pct, - "min_avail_bytes": min_avail_bytes, - "mock_statvfs": mock_behavior, - "eviction_order": eviction_order.config(), - } - - enc = toml.TomlEncoder() # these can sometimes happen during startup before any tenants have been # loaded, so nothing can be evicted, we just wait for next iteration which # is able to evict. pageserver.allowed_errors.append(".*WARN.* disk usage still high.*") - pageserver.start( - overrides=( - "--pageserver-config-override=disk_usage_based_eviction=" - + enc.dump_inline_table(disk_usage_config).replace("\n", " "), + pageserver.patch_config_toml_nonrecursive( + { + "disk_usage_based_eviction": { + "period": period, + "max_usage_pct": max_usage_pct, + "min_avail_bytes": min_avail_bytes, + "mock_statvfs": mock_behavior, + "eviction_order": eviction_order.config(), + }, # Disk usage based eviction runs as a background task. # But pageserver startup delays launch of background tasks for some time, to prioritize initial logical size calculations during startup. # But, initial logical size calculation may not be triggered if safekeepers don't publish new broker messages. # But, we only have a 10-second-timeout in this test. # So, disable the delay for this test. 
- "--pageserver-config-override=background_task_maximum_delay='0s'", - ), + "background_task_maximum_delay": "0s", + } ) + pageserver.start() + # we now do initial logical size calculation on startup, which on debug builds can fight with disk usage based eviction for tenant_id, timeline_id in self.timelines: tenant_ps = self.neon_env.get_tenant_pageserver(tenant_id) diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 67f68a62af..f957bea156 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -220,7 +220,12 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): # We will start a pageserver with no control_plane_api set, so it won't be able to self-register env.storage_controller.node_register(env.pageserver) - env.pageserver.start(overrides=('--pageserver-config-override=control_plane_api=""',)) + replaced_config = env.pageserver.patch_config_toml_nonrecursive( + { + "control_plane_api": "", + } + ) + env.pageserver.start() env.storage_controller.node_configure(env.pageserver.id, {"availability": "Active"}) env.neon_cli.create_tenant( @@ -251,8 +256,8 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): assert parse_generation_suffix(key) is None env.pageserver.stop() - # Starting without the override that disabled control_plane_api + env.pageserver.patch_config_toml_nonrecursive(replaced_config) env.pageserver.start() generate_uploads_and_deletions(env, pageserver=env.pageserver, init=False) @@ -525,9 +530,12 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): # incident, but it might be unavoidable: if so, we want to be able to start up # and serve clients. env.pageserver.stop() # Non-immediate: implicitly checking that shutdown doesn't hang waiting for CP - env.pageserver.start( - overrides=("--pageserver-config-override=control_plane_emergency_mode=true",), + replaced = env.pageserver.patch_config_toml_nonrecursive( + { + "control_plane_emergency_mode": True, + } ) + env.pageserver.start() # The pageserver should provide service to clients generate_uploads_and_deletions(env, init=False, pageserver=env.pageserver) @@ -549,6 +557,7 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): # The pageserver should work fine when subsequently restarted in non-emergency mode env.pageserver.stop() # Non-immediate: implicitly checking that shutdown doesn't hang waiting for CP + env.pageserver.patch_config_toml_nonrecursive(replaced) env.pageserver.start() generate_uploads_and_deletions(env, init=False, pageserver=env.pageserver) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index fdcb4cf9a4..bdd356388f 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -290,9 +290,12 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up # This is the pageserver where we'll initially create the tenant. Run it in emergency # mode so that it doesn't talk to storage controller, and do not register it. 
env.pageservers[0].allowed_errors.append(".*Emergency mode!.*") - env.pageservers[0].start( - overrides=("--pageserver-config-override=control_plane_emergency_mode=true",), + env.pageservers[0].patch_config_toml_nonrecursive( + { + "control_plane_emergency_mode": True, + } ) + env.pageservers[0].start() origin_ps = env.pageservers[0] # These are the pageservers managed by the sharding service, where the tenant From 017c34b7736119f250c68c8f2aecfdee2866dc5f Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Tue, 7 May 2024 12:30:18 -0400 Subject: [PATCH 0718/1571] feat(pageserver): generate basebackup from aux file v2 storage (#7517) This pull request adds the new basebackup read path + aux file write path. In the regression test, all logical replication tests are run with matrix aux_file_v2=false/true. Also fixed the vectored get code path to correctly return missing key error when being called from the unified sequential get code path. --------- Signed-off-by: Alex Chi Z --- control_plane/src/pageserver.rs | 19 +- libs/pageserver_api/src/models.rs | 27 +- pageserver/src/aux_file.rs | 96 +++++++ pageserver/src/pgdatadir_mapping.rs | 241 ++++++++++++------ pageserver/src/tenant.rs | 2 +- pageserver/src/tenant/config.rs | 17 +- pageserver/src/tenant/timeline.rs | 24 +- test_runner/fixtures/neon_fixtures.py | 21 ++ test_runner/fixtures/parametrize.py | 6 + test_runner/fixtures/utils.py | 14 + .../regress/test_attach_tenant_config.py | 2 +- .../regress/test_logical_replication.py | 26 ++ 12 files changed, 391 insertions(+), 104 deletions(-) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index fbe0d419ae..2179859023 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -17,7 +17,8 @@ use anyhow::{bail, Context}; use camino::Utf8PathBuf; use futures::SinkExt; use pageserver_api::models::{ - self, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, TimelineInfo, + self, AuxFilePolicy, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, + TimelineInfo, }; use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api; @@ -429,11 +430,11 @@ impl PageServerNode { .map(serde_json::from_str) .transpose() .context("parse `timeline_get_throttle` from json")?, - switch_to_aux_file_v2: settings - .remove("switch_to_aux_file_v2") - .map(|x| x.parse::()) + switch_aux_file_policy: settings + .remove("switch_aux_file_policy") + .map(|x| x.parse::()) .transpose() - .context("Failed to parse 'switch_to_aux_file_v2' as bool")?, + .context("Failed to parse 'switch_aux_file_policy'")?, }; if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") @@ -552,11 +553,11 @@ impl PageServerNode { .map(serde_json::from_str) .transpose() .context("parse `timeline_get_throttle` from json")?, - switch_to_aux_file_v2: settings - .remove("switch_to_aux_file_v2") - .map(|x| x.parse::()) + switch_aux_file_policy: settings + .remove("switch_aux_file_policy") + .map(|x| x.parse::()) .transpose() - .context("Failed to parse 'switch_to_aux_file_v2' as bool")?, + .context("Failed to parse 'switch_aux_file_policy'")?, } }; diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 37d968cebd..1df5820fb9 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -9,6 +9,7 @@ use std::{ collections::HashMap, io::{BufRead, Read}, num::{NonZeroU64, NonZeroUsize}, + str::FromStr, time::{Duration, SystemTime}, }; @@ -304,7 +305,31 @@ pub struct TenantConfig { pub 
lazy_slru_download: Option, pub timeline_get_throttle: Option, pub image_layer_creation_check_threshold: Option, - pub switch_to_aux_file_v2: Option, + pub switch_aux_file_policy: Option, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub enum AuxFilePolicy { + V1, + V2, + CrossValidation, +} + +impl FromStr for AuxFilePolicy { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + let s = s.to_lowercase(); + if s == "v1" { + Ok(Self::V1) + } else if s == "v2" { + Ok(Self::V2) + } else if s == "crossvalidation" || s == "cross_validation" { + Ok(Self::CrossValidation) + } else { + anyhow::bail!("cannot parse {} to aux file policy", s) + } + } } #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] diff --git a/pageserver/src/aux_file.rs b/pageserver/src/aux_file.rs index a343acaf7a..a26ed84a0d 100644 --- a/pageserver/src/aux_file.rs +++ b/pageserver/src/aux_file.rs @@ -1,3 +1,4 @@ +use bytes::{Buf, BufMut, Bytes}; use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE}; use tracing::warn; @@ -61,6 +62,84 @@ pub fn encode_aux_file_key(path: &str) -> Key { } } +const AUX_FILE_ENCODING_VERSION: u8 = 0x01; + +pub fn decode_file_value(val: &[u8]) -> anyhow::Result> { + let mut ptr = val; + if ptr.is_empty() { + // empty value = no files + return Ok(Vec::new()); + } + assert_eq!( + ptr.get_u8(), + AUX_FILE_ENCODING_VERSION, + "unsupported aux file value" + ); + let mut files = vec![]; + while ptr.has_remaining() { + let key_len = ptr.get_u32() as usize; + let key = &ptr[..key_len]; + ptr.advance(key_len); + let val_len = ptr.get_u32() as usize; + let content = &ptr[..val_len]; + ptr.advance(val_len); + + let path = std::str::from_utf8(key)?; + files.push((path, content)); + } + Ok(files) +} + +/// Decode an aux file key-value pair into a list of files. The returned `Bytes` contains reference +/// to the original value slice. Be cautious about memory consumption. 
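The byte layout implemented by the `encode_file_value`/`decode_file_value` helpers added in pageserver/src/aux_file.rs is compact enough to restate as a Python model (illustrative only, mirroring the Rust: one version byte, then big-endian u32 length-prefixed path/content pairs; an empty file list encodes to an empty value):

```python
import struct

AUX_FILE_ENCODING_VERSION = 0x01

def encode_file_value(files):
    if not files:
        return b""                       # no files = empty value
    out = bytearray([AUX_FILE_ENCODING_VERSION])
    for path, content in files:
        p = path.encode()
        out += struct.pack(">I", len(p)) + p                # u32 BE path length, path bytes
        out += struct.pack(">I", len(content)) + content    # u32 BE content length, content
    return bytes(out)

def decode_file_value(val):
    if not val:
        return []
    assert val[0] == AUX_FILE_ENCODING_VERSION, "unsupported aux file value"
    files, pos = [], 1
    while pos < len(val):
        (key_len,) = struct.unpack_from(">I", val, pos); pos += 4
        path = val[pos:pos + key_len].decode(); pos += key_len
        (val_len,) = struct.unpack_from(">I", val, pos); pos += 4
        files.append((path, val[pos:pos + val_len])); pos += val_len
    return files

files = [("pg_logical/1.file", b"1111"), ("pg_logical/2.file", b"2222")]
assert decode_file_value(encode_file_value(files)) == files
assert encode_file_value([]) == b""
```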
+pub fn decode_file_value_bytes(val: &Bytes) -> anyhow::Result> { + let mut ptr = val.clone(); + if ptr.is_empty() { + // empty value = no files + return Ok(Vec::new()); + } + assert_eq!( + ptr.get_u8(), + AUX_FILE_ENCODING_VERSION, + "unsupported aux file value" + ); + let mut files = vec![]; + while ptr.has_remaining() { + let key_len = ptr.get_u32() as usize; + let key = ptr.slice(..key_len); + ptr.advance(key_len); + let val_len = ptr.get_u32() as usize; + let content = ptr.slice(..val_len); + ptr.advance(val_len); + + let path = std::str::from_utf8(&key)?.to_string(); + files.push((path, content)); + } + Ok(files) +} + +pub fn encode_file_value(files: &[(&str, &[u8])]) -> anyhow::Result> { + if files.is_empty() { + // no files = empty value + return Ok(Vec::new()); + } + let mut encoded = vec![]; + encoded.put_u8(AUX_FILE_ENCODING_VERSION); + for (path, content) in files { + if path.len() > u32::MAX as usize { + anyhow::bail!("{} exceeds path size limit", path); + } + encoded.put_u32(path.len() as u32); + encoded.put_slice(path.as_bytes()); + if content.len() > u32::MAX as usize { + anyhow::bail!("{} exceeds content size limit", path); + } + encoded.put_u32(content.len() as u32); + encoded.put_slice(content); + } + Ok(encoded) +} + #[cfg(test)] mod tests { use super::*; @@ -109,4 +188,21 @@ mod tests { encode_aux_file_key("other_file_not_supported").to_string() ); } + + #[test] + fn test_value_encoding() { + let files = vec![ + ("pg_logical/1.file", "1111".as_bytes()), + ("pg_logical/2.file", "2222".as_bytes()), + ]; + assert_eq!( + files, + decode_file_value(&encode_file_value(&files).unwrap()).unwrap() + ); + let files = vec![]; + assert_eq!( + files, + decode_file_value(&encode_file_value(&files).unwrap()).unwrap() + ); + } } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 12314c5961..a4215ee107 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -10,9 +10,9 @@ use super::tenant::{PageReconstructError, Timeline}; use crate::context::RequestContext; use crate::keyspace::{KeySpace, KeySpaceAccum}; use crate::metrics::WAL_INGEST; -use crate::repository::*; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; use crate::walrecord::NeonWalRecord; +use crate::{aux_file, repository::*}; use anyhow::{ensure, Context}; use bytes::{Buf, Bytes, BytesMut}; use enum_map::Enum; @@ -24,6 +24,7 @@ use pageserver_api::key::{ AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY, }; use pageserver_api::keyspace::SparseKeySpace; +use pageserver_api::models::AuxFilePolicy; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::BLCKSZ; @@ -670,7 +671,7 @@ impl Timeline { self.get(CHECKPOINT_KEY, lsn, ctx).await } - pub(crate) async fn list_aux_files( + async fn list_aux_files_v1( &self, lsn: Lsn, ctx: &RequestContext, @@ -688,6 +689,63 @@ impl Timeline { } } + async fn list_aux_files_v2( + &self, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result, PageReconstructError> { + let kv = self + .scan(KeySpace::single(Key::metadata_aux_key_range()), lsn, ctx) + .await + .context("scan")?; + let mut result = HashMap::new(); + for (_, v) in kv { + let v = v.context("get value")?; + let v = aux_file::decode_file_value_bytes(&v).context("value decode")?; + for (fname, content) in v { + result.insert(fname, content); + } + } + Ok(result) + } + + pub(crate) async fn list_aux_files( + 
&self, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result, PageReconstructError> { + match self.get_switch_aux_file_policy() { + AuxFilePolicy::V1 => self.list_aux_files_v1(lsn, ctx).await, + AuxFilePolicy::V2 => self.list_aux_files_v2(lsn, ctx).await, + AuxFilePolicy::CrossValidation => { + let v1_result = self.list_aux_files_v1(lsn, ctx).await; + let v2_result = self.list_aux_files_v2(lsn, ctx).await; + match (v1_result, v2_result) { + (Ok(v1), Ok(v2)) => { + if v1 != v2 { + tracing::error!( + "unmatched aux file v1 v2 result:\nv1 {v1:?}\nv2 {v2:?}" + ); + return Err(PageReconstructError::Other(anyhow::anyhow!( + "unmatched aux file v1 v2 result" + ))); + } + Ok(v1) + } + (Ok(_), Err(v2)) => { + tracing::error!("aux file v1 returns Ok while aux file v2 returns an err"); + Err(v2) + } + (Err(v1), Ok(_)) => { + tracing::error!("aux file v2 returns Ok while aux file v1 returns an err"); + Err(v1) + } + (Err(_), Err(v2)) => Err(v2), + } + } + } + } + /// Does the same as get_current_logical_size but counted on demand. /// Used to initialize the logical size tracking on startup. /// @@ -1389,6 +1447,9 @@ impl<'a> DatadirModification<'a> { } pub fn init_aux_dir(&mut self) -> anyhow::Result<()> { + if let AuxFilePolicy::V2 = self.tline.get_switch_aux_file_policy() { + return Ok(()); + } let buf = AuxFilesDirectory::ser(&AuxFilesDirectory { files: HashMap::new(), })?; @@ -1404,89 +1465,121 @@ impl<'a> DatadirModification<'a> { content: &[u8], ctx: &RequestContext, ) -> anyhow::Result<()> { - let file_path = path.to_string(); - let content = if content.is_empty() { - None - } else { - Some(Bytes::copy_from_slice(content)) - }; - - let n_files; - let mut aux_files = self.tline.aux_files.lock().await; - if let Some(mut dir) = aux_files.dir.take() { - // We already updated aux files in `self`: emit a delta and update our latest value. - dir.upsert(file_path.clone(), content.clone()); - n_files = dir.files.len(); - if aux_files.n_deltas == MAX_AUX_FILE_DELTAS { - self.put( - AUX_FILES_KEY, - Value::Image(Bytes::from( - AuxFilesDirectory::ser(&dir).context("serialize")?, - )), - ); - aux_files.n_deltas = 0; + let policy = self.tline.get_switch_aux_file_policy(); + if let AuxFilePolicy::V2 | AuxFilePolicy::CrossValidation = policy { + let key = aux_file::encode_aux_file_key(path); + // retrieve the key from the engine + let old_val = match self.get(key, ctx).await { + Ok(val) => Some(val), + Err(PageReconstructError::MissingKey(_)) => None, + Err(e) => return Err(e.into()), + }; + let files = if let Some(ref old_val) = old_val { + aux_file::decode_file_value(old_val)? } else { - self.put( - AUX_FILES_KEY, - Value::WalRecord(NeonWalRecord::AuxFile { file_path, content }), - ); - aux_files.n_deltas += 1; - } - aux_files.dir = Some(dir); - } else { - // Check if the AUX_FILES_KEY is initialized - match self.get(AUX_FILES_KEY, ctx).await { - Ok(dir_bytes) => { - let mut dir = AuxFilesDirectory::des(&dir_bytes)?; - // Key is already set, we may append a delta - self.put( - AUX_FILES_KEY, - Value::WalRecord(NeonWalRecord::AuxFile { - file_path: file_path.clone(), - content: content.clone(), - }), - ); - dir.upsert(file_path, content); - n_files = dir.files.len(); - aux_files.dir = Some(dir); - } - Err( - e @ (PageReconstructError::AncestorStopping(_) - | PageReconstructError::Cancelled - | PageReconstructError::AncestorLsnTimeout(_)), - ) => { - // Important that we do not interpret a shutdown error as "not found" and thereby - // reset the map. 
- return Err(e.into()); - } - // Note: we added missing key error variant in https://github.com/neondatabase/neon/pull/7393 but - // the original code assumes all other errors are missing keys. Therefore, we keep the code path - // the same for now, though in theory, we should only match the `MissingKey` variant. - Err( - PageReconstructError::Other(_) - | PageReconstructError::WalRedo(_) - | PageReconstructError::MissingKey { .. }, - ) => { - // Key is missing, we must insert an image as the basis for subsequent deltas. + Vec::new() + }; + let new_files = if content.is_empty() { + files + .into_iter() + .filter(|(p, _)| &path != p) + .collect::>() + } else { + files + .into_iter() + .filter(|(p, _)| &path != p) + .chain(std::iter::once((path, content))) + .collect::>() + }; + let new_val = aux_file::encode_file_value(&new_files)?; + self.put(key, Value::Image(new_val.into())); + } - let mut dir = AuxFilesDirectory { - files: HashMap::new(), - }; - dir.upsert(file_path, content); + if let AuxFilePolicy::V1 | AuxFilePolicy::CrossValidation = policy { + let file_path = path.to_string(); + let content = if content.is_empty() { + None + } else { + Some(Bytes::copy_from_slice(content)) + }; + + let n_files; + let mut aux_files = self.tline.aux_files.lock().await; + if let Some(mut dir) = aux_files.dir.take() { + // We already updated aux files in `self`: emit a delta and update our latest value. + dir.upsert(file_path.clone(), content.clone()); + n_files = dir.files.len(); + if aux_files.n_deltas == MAX_AUX_FILE_DELTAS { self.put( AUX_FILES_KEY, Value::Image(Bytes::from( AuxFilesDirectory::ser(&dir).context("serialize")?, )), ); - n_files = 1; - aux_files.dir = Some(dir); + aux_files.n_deltas = 0; + } else { + self.put( + AUX_FILES_KEY, + Value::WalRecord(NeonWalRecord::AuxFile { file_path, content }), + ); + aux_files.n_deltas += 1; + } + aux_files.dir = Some(dir); + } else { + // Check if the AUX_FILES_KEY is initialized + match self.get(AUX_FILES_KEY, ctx).await { + Ok(dir_bytes) => { + let mut dir = AuxFilesDirectory::des(&dir_bytes)?; + // Key is already set, we may append a delta + self.put( + AUX_FILES_KEY, + Value::WalRecord(NeonWalRecord::AuxFile { + file_path: file_path.clone(), + content: content.clone(), + }), + ); + dir.upsert(file_path, content); + n_files = dir.files.len(); + aux_files.dir = Some(dir); + } + Err( + e @ (PageReconstructError::AncestorStopping(_) + | PageReconstructError::Cancelled + | PageReconstructError::AncestorLsnTimeout(_)), + ) => { + // Important that we do not interpret a shutdown error as "not found" and thereby + // reset the map. + return Err(e.into()); + } + // Note: we added missing key error variant in https://github.com/neondatabase/neon/pull/7393 but + // the original code assumes all other errors are missing keys. Therefore, we keep the code path + // the same for now, though in theory, we should only match the `MissingKey` variant. + Err( + PageReconstructError::Other(_) + | PageReconstructError::WalRedo(_) + | PageReconstructError::MissingKey { .. }, + ) => { + // Key is missing, we must insert an image as the basis for subsequent deltas. 
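On the v2 write path above, each aux-file key stores an encoded list of (path, content) pairs, and `put_file` performs a read-modify-write: drop any existing entry for the path, then append the new content unless it is empty (empty content means deletion). A compact Python model of that upsert, illustrative rather than taken from the patch:

```python
from typing import List, Tuple

def upsert_aux_entry(
    files: List[Tuple[str, bytes]], path: str, content: bytes
) -> List[Tuple[str, bytes]]:
    # keep every entry except the one being rewritten ...
    kept = [(p, c) for (p, c) in files if p != path]
    # ... and re-add it only if the new content is non-empty
    if content:
        kept.append((path, content))
    return kept

assert upsert_aux_entry([("a", b"1")], "a", b"2") == [("a", b"2")]
assert upsert_aux_entry([("a", b"1"), ("b", b"2")], "a", b"") == [("b", b"2")]
```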
+ + let mut dir = AuxFilesDirectory { + files: HashMap::new(), + }; + dir.upsert(file_path, content); + self.put( + AUX_FILES_KEY, + Value::Image(Bytes::from( + AuxFilesDirectory::ser(&dir).context("serialize")?, + )), + ); + n_files = 1; + aux_files.dir = Some(dir); + } } } - } - self.pending_directory_entries - .push((DirectoryKind::AuxFiles, n_files)); + self.pending_directory_entries + .push((DirectoryKind::AuxFiles, n_files)); + } Ok(()) } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 1d483af278..010e56a899 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3758,7 +3758,7 @@ pub(crate) mod harness { image_layer_creation_check_threshold: Some( tenant_conf.image_layer_creation_check_threshold, ), - switch_to_aux_file_v2: Some(tenant_conf.switch_to_aux_file_v2), + switch_aux_file_policy: Some(tenant_conf.switch_aux_file_policy), } } } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 9975c9edbc..a743ce3c16 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -9,6 +9,7 @@ //! may lead to a data loss. //! use anyhow::bail; +use pageserver_api::models::AuxFilePolicy; use pageserver_api::models::CompactionAlgorithm; use pageserver_api::models::EvictionPolicy; use pageserver_api::models::{self, ThrottleConfig}; @@ -370,9 +371,9 @@ pub struct TenantConf { // Expresed in multiples of checkpoint distance. pub image_layer_creation_check_threshold: u8, - /// Switch to aux file v2. Switching this flag requires the user has not written any aux file into + /// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into /// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions. 
- pub switch_to_aux_file_v2: bool, + pub switch_aux_file_policy: AuxFilePolicy, } /// Same as TenantConf, but this struct preserves the information about @@ -471,7 +472,7 @@ pub struct TenantConfOpt { #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] - pub switch_to_aux_file_v2: Option, + pub switch_aux_file_policy: Option, } impl TenantConfOpt { @@ -529,9 +530,9 @@ impl TenantConfOpt { image_layer_creation_check_threshold: self .image_layer_creation_check_threshold .unwrap_or(global_conf.image_layer_creation_check_threshold), - switch_to_aux_file_v2: self - .switch_to_aux_file_v2 - .unwrap_or(global_conf.switch_to_aux_file_v2), + switch_aux_file_policy: self + .switch_aux_file_policy + .unwrap_or(global_conf.switch_aux_file_policy), } } } @@ -573,7 +574,7 @@ impl Default for TenantConf { lazy_slru_download: false, timeline_get_throttle: crate::tenant::throttle::Config::disabled(), image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD, - switch_to_aux_file_v2: false, + switch_aux_file_policy: AuxFilePolicy::V1, } } } @@ -648,7 +649,7 @@ impl From for models::TenantConfig { lazy_slru_download: value.lazy_slru_download, timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from), image_layer_creation_check_threshold: value.image_layer_creation_check_threshold, - switch_to_aux_file_v2: value.switch_to_aux_file_v2, + switch_aux_file_policy: value.switch_aux_file_policy, } } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 1c417262b0..7213ff8f75 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -23,8 +23,9 @@ use pageserver_api::{ }, keyspace::{KeySpaceAccum, SparseKeyPartitioning}, models::{ - CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, - EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, TimelineState, + AuxFilePolicy, CompactionAlgorithm, DownloadRemoteLayersTaskInfo, + DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, + TimelineState, }, reltag::BlockNumber, shard::{ShardIdentity, ShardNumber, TenantShardId}, @@ -863,9 +864,13 @@ impl Timeline { // Initialise the reconstruct state for the key with the cache // entry returned above. let mut reconstruct_state = ValuesReconstructState::new(); - let mut key_state = VectoredValueReconstructState::default(); - key_state.img = cached_page_img; - reconstruct_state.keys.insert(key, Ok(key_state)); + + // Only add the cached image to the reconstruct state when it exists. + if cached_page_img.is_some() { + let mut key_state = VectoredValueReconstructState::default(); + key_state.img = cached_page_img; + reconstruct_state.keys.insert(key, Ok(key_state)); + } let vectored_res = self .get_vectored_impl(keyspace.clone(), lsn, reconstruct_state, ctx) @@ -1077,7 +1082,7 @@ impl Timeline { // We should generalize this into Keyspace::contains in the future. 
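The one-character change from `>=` to `>` in the scan bound check above matters because the keyspace range end is exclusive: a range whose exclusive end equals `METADATA_KEY_END_PREFIX` still lies entirely inside the metadata keyspace and must be accepted. A small sketch of the corrected predicate; the prefix values here are made up for illustration and are not the real constants:

```python
METADATA_KEY_BEGIN_PREFIX = 0x60   # illustrative value
METADATA_KEY_END_PREFIX = 0x70     # illustrative value

def range_is_within_metadata_keyspace(start_field1: int, end_field1: int) -> bool:
    # A half-open [start, end) range is inside [BEGIN, END) iff start >= BEGIN
    # and end <= END; rejecting end == END (the old `>=` check) wrongly refused
    # scans that cover the whole metadata prefix.
    return start_field1 >= METADATA_KEY_BEGIN_PREFIX and end_field1 <= METADATA_KEY_END_PREFIX

assert range_is_within_metadata_keyspace(0x62, METADATA_KEY_END_PREFIX)
```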
for range in &keyspace.ranges { if range.start.field1 < METADATA_KEY_BEGIN_PREFIX - || range.end.field1 >= METADATA_KEY_END_PREFIX + || range.end.field1 > METADATA_KEY_END_PREFIX { return Err(GetVectoredError::Other(anyhow::anyhow!( "only metadata keyspace can be scanned" @@ -1991,13 +1996,12 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10; // Private functions impl Timeline { - #[allow(dead_code)] - pub(crate) fn get_switch_to_aux_file_v2(&self) -> bool { + pub(crate) fn get_switch_aux_file_policy(&self) -> AuxFilePolicy { let tenant_conf = self.tenant_conf.load(); tenant_conf .tenant_conf - .switch_to_aux_file_v2 - .unwrap_or(self.conf.default_tenant_conf.switch_to_aux_file_v2) + .switch_aux_file_policy + .unwrap_or(self.conf.default_tenant_conf.switch_aux_file_policy) } pub(crate) fn get_lazy_slru_download(&self) -> bool { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 82f17fe20d..fc66822eb9 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -82,6 +82,7 @@ from fixtures.utils import ( subprocess_capture, wait_until, ) +from fixtures.utils import AuxFileStore as AuxFileStore # reexport """ This file contains pytest fixtures. A fixture is a test resource that can be @@ -465,6 +466,7 @@ class NeonEnvBuilder: initial_tenant: Optional[TenantId] = None, initial_timeline: Optional[TimelineId] = None, pageserver_virtual_file_io_engine: Optional[str] = None, + pageserver_aux_file_policy: Optional[AuxFileStore] = None, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override @@ -520,6 +522,8 @@ class NeonEnvBuilder: self.pageserver_validate_vectored_get = bool(validate) log.debug(f'Overriding pageserver validate_vectored_get config to "{validate}"') + self.pageserver_aux_file_policy = pageserver_aux_file_policy + assert test_name.startswith( "test_" ), "Unexpectedly instantiated from outside a test function" @@ -565,6 +569,7 @@ class NeonEnvBuilder: timeline_id=env.initial_timeline, shard_count=initial_tenant_shard_count, shard_stripe_size=initial_tenant_shard_stripe_size, + aux_file_v2=self.pageserver_aux_file_policy, ) assert env.initial_tenant == initial_tenant assert env.initial_timeline == initial_timeline @@ -1047,6 +1052,7 @@ class NeonEnv: ) self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine + self.pageserver_aux_file_policy = config.pageserver_aux_file_policy # Create a config file corresponding to the options cfg: Dict[str, Any] = { @@ -1283,6 +1289,7 @@ def _shared_simple_env( pg_distrib_dir: Path, pg_version: PgVersion, pageserver_virtual_file_io_engine: str, + pageserver_aux_file_policy: Optional[AuxFileStore], ) -> Iterator[NeonEnv]: """ # Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES @@ -1313,6 +1320,7 @@ def _shared_simple_env( test_name=request.node.name, test_output_dir=test_output_dir, pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine, + pageserver_aux_file_policy=pageserver_aux_file_policy, ) as builder: env = builder.init_start() @@ -1352,6 +1360,7 @@ def neon_env_builder( test_overlay_dir: Path, top_output_dir: Path, pageserver_virtual_file_io_engine: str, + pageserver_aux_file_policy: Optional[AuxFileStore] = None, ) -> Iterator[NeonEnvBuilder]: """ Fixture to create a Neon environment for test. 
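The `pageserver_aux_file_policy` plumbing added to these fixtures is driven per test: a test overrides the autouse fixture (defined a bit further down in parametrize.py) via `pytest.mark.parametrize`, and `NeonEnvBuilder` forwards the value into tenant creation, which is exactly what the logical replication tests later in this patch do. A minimal sketch of such a test, assuming the test-runner fixtures are importable and using a hypothetical test name:

```python
import pytest
from fixtures.neon_fixtures import NeonEnv
from fixtures.utils import AuxFileStore

# Overriding the autouse fixture selects the aux-file policy for the tenant
# created by neon_simple_env; the default (None) leaves the policy unset.
@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.V2])
def test_something_with_aux_v2(neon_simple_env: NeonEnv):
    env = neon_simple_env
    assert env.pageserver_aux_file_policy == AuxFileStore.V2
```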
@@ -1385,6 +1394,7 @@ def neon_env_builder( test_name=request.node.name, test_output_dir=test_output_dir, test_overlay_dir=test_overlay_dir, + pageserver_aux_file_policy=pageserver_aux_file_policy, ) as builder: yield builder @@ -1544,6 +1554,7 @@ class NeonCli(AbstractNeonCli): shard_stripe_size: Optional[int] = None, placement_policy: Optional[str] = None, set_default: bool = False, + aux_file_v2: Optional[AuxFileStore] = None, ) -> Tuple[TenantId, TimelineId]: """ Creates a new tenant, returns its id and its initial timeline's id. @@ -1567,6 +1578,16 @@ class NeonCli(AbstractNeonCli): product(["-c"], (f"{key}:{value}" for key, value in conf.items())) ) ) + + if aux_file_v2 is AuxFileStore.V2: + args.extend(["-c", "switch_aux_file_policy:v2"]) + + if aux_file_v2 is AuxFileStore.V1: + args.extend(["-c", "switch_aux_file_policy:v1"]) + + if aux_file_v2 is AuxFileStore.CrossValidation: + args.extend(["-c", "switch_aux_file_policy:cross_validation"]) + if set_default: args.append("--set-default") diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py index c8ab550ad7..77523a542b 100644 --- a/test_runner/fixtures/parametrize.py +++ b/test_runner/fixtures/parametrize.py @@ -5,6 +5,7 @@ import pytest from _pytest.python import Metafunc from fixtures.pg_version import PgVersion +from fixtures.utils import AuxFileStore """ Dynamically parametrize tests by different parameters @@ -31,6 +32,11 @@ def pageserver_virtual_file_io_engine() -> Optional[str]: return os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE") +@pytest.fixture(scope="function", autouse=True) +def pageserver_aux_file_policy() -> Optional[AuxFileStore]: + return None + + def pytest_generate_tests(metafunc: Metafunc): if (bt := os.getenv("BUILD_TYPE")) is None: build_types = ["debug", "release"] diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 9365d65fc9..6470621900 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -1,4 +1,5 @@ import contextlib +import enum import json import os import re @@ -484,3 +485,16 @@ def assert_no_errors(log_file, service, allowed_errors): log.info(f"not allowed {service} error: {error.strip()}") assert not errors, f"Log errors on {service}: {errors[0]}" + + +@enum.unique +class AuxFileStore(str, enum.Enum): + V1 = "V1" + V2 = "V2" + CrossValidation = "CrossValidation" + + def __repr__(self) -> str: + return f"'aux-{self.value}'" + + def __str__(self) -> str: + return f"'aux-{self.value}'" diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 59461cc095..693add422f 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -190,7 +190,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "trace_read_requests": True, "walreceiver_connect_timeout": "13m", "image_layer_creation_check_threshold": 1, - "switch_to_aux_file_v2": True, + "switch_aux_file_policy": "CrossValidation", } ps_http = env.pageserver.http_client() diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index 9b2abe608c..57d3447cae 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -6,6 +6,7 @@ from string import ascii_lowercase import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( + AuxFileStore, NeonEnv, NeonEnvBuilder, logical_replication_sync, @@ -19,6 +20,19 @@ def 
random_string(n: int): return "".join([choice(ascii_lowercase) for _ in range(n)]) +@pytest.mark.parametrize( + "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.V2, AuxFileStore.CrossValidation] +) +def test_aux_file_v2_flag(neon_simple_env: NeonEnv, pageserver_aux_file_policy: AuxFileStore): + env = neon_simple_env + with env.pageserver.http_client() as client: + tenant_config = client.tenant_config(env.initial_tenant).effective_config + assert pageserver_aux_file_policy == tenant_config["switch_aux_file_policy"] + + +@pytest.mark.parametrize( + "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation] +) def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env @@ -160,6 +174,9 @@ COMMIT; # Test that neon.logical_replication_max_snap_files works +@pytest.mark.parametrize( + "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation] +) def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg): def slot_removed(ep): assert ( @@ -281,6 +298,9 @@ FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of # Test compute start at LSN page of which starts with contrecord # https://github.com/neondatabase/neon/issues/5749 +@pytest.mark.parametrize( + "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation] +) def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env @@ -371,6 +391,9 @@ def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg): # logical replication bug as such, but without logical replication, # records passed ot the WAL redo process are never large enough to hit # the bug. +@pytest.mark.parametrize( + "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation] +) def test_large_records(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env @@ -442,6 +465,9 @@ def test_slots_and_branching(neon_simple_env: NeonEnv): ws_cur.execute("select pg_create_logical_replication_slot('my_slot', 'pgoutput')") +@pytest.mark.parametrize( + "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation] +) def test_replication_shutdown(neon_simple_env: NeonEnv): # Ensure Postgres can exit without stuck when a replication job is active + neon extension installed env = neon_simple_env From 0af66a60030282775217ad9df4f306f5c0d7bb8b Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 7 May 2024 18:03:12 +0100 Subject: [PATCH 0719/1571] pageserver: include generation number in local layer paths (#7609) ## Problem In https://github.com/neondatabase/neon/pull/7531, we would like to be able to rewrite layers safely. One option is to make `Layer` able to rewrite files in place safely (e.g. by blocking evictions/deletions for an old Layer while a new one is created), but that's relatively fragile. It's more robust in general if we simply never overwrite the same local file: we can do that by putting the generation number in the filename. ## Summary of changes - Add `local_layer_path` (counterpart to `remote_layer_path`) and convert all locations that manually constructed a local layer path by joining LayerFileName to timeline path - In the layer upload path, construct remote paths with `remote_layer_path` rather than trying to build them out of a local path. - During startup, carry the full path to layer files through `init::reconcile`, and pass it into `Layer::for_resident` - Add a test to make sure we handle upgrades properly. 
- Comment out the generation part of `local_layer_path`, since we need to maintain forward compatibility for one release. A tiny followup PR will enable it afterwards. We could make this a bit simpler if we bulk renamed existing layers on startup instead of carrying literal paths through init, but that is operationally risky on existing servers with millions of layer files. We can always do a renaming change in future if it becomes annoying, but for the moment it's kind of nice to have a structure that enables us to change local path names again in future quite easily. We should rename `LayerFileName` to `LayerName` or somesuch, to make it more obvious that it's not a literal filename: this was already a bit confusing where that type is used in remote paths. That will be a followup, to avoid polluting this PR's diff. --- pageserver/src/disk_usage_eviction_task.rs | 7 +- pageserver/src/http/routes.rs | 10 +- pageserver/src/metrics.rs | 2 + .../src/tenant/remote_timeline_client.rs | 111 +++++++++++------- .../tenant/remote_timeline_client/download.rs | 9 +- .../tenant/remote_timeline_client/upload.rs | 52 +++----- pageserver/src/tenant/secondary.rs | 16 ++- pageserver/src/tenant/secondary/downloader.rs | 49 +++++--- .../src/tenant/storage_layer/filename.rs | 71 ++++++++++- pageserver/src/tenant/storage_layer/layer.rs | 58 ++++++++- pageserver/src/tenant/timeline.rs | 48 +++++--- pageserver/src/tenant/timeline/init.rs | 38 ++++-- test_runner/fixtures/neon_fixtures.py | 33 +++++- test_runner/fixtures/pageserver/types.py | 59 ++++------ test_runner/regress/test_duplicate_layers.py | 24 ++-- test_runner/regress/test_layer_eviction.py | 53 +++++---- .../regress/test_pageserver_generations.py | 48 ++++++++ .../regress/test_pageserver_secondary.py | 64 ++++------ test_runner/regress/test_remote_storage.py | 16 +-- .../test_tenants_with_remote_storage.py | 8 +- .../regress/test_timeline_detach_ancestor.py | 2 +- 21 files changed, 507 insertions(+), 271 deletions(-) diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 6248424cee..400930245b 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -540,7 +540,12 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( js.spawn(async move { layer .secondary_tenant - .evict_layer(tenant_manager.get_conf(), layer.timeline_id, layer.name) + .evict_layer( + tenant_manager.get_conf(), + layer.timeline_id, + layer.name, + layer.metadata, + ) .await; Ok(file_size) }); diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index ea4c7f1e3b..83b7b8a45e 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -63,6 +63,7 @@ use crate::tenant::remote_timeline_client::list_remote_timelines; use crate::tenant::secondary::SecondaryController; use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; +use crate::tenant::storage_layer::LayerFileName; use crate::tenant::timeline::CompactFlags; use crate::tenant::timeline::Timeline; use crate::tenant::SpawnMode; @@ -1228,13 +1229,15 @@ async fn layer_download_handler( let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let layer_file_name = get_request_param(&request, "layer_file_name")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let layer_name = LayerFileName::from_str(layer_file_name) + .map_err(|s| ApiError::BadRequest(anyhow::anyhow!(s)))?; let state = get_state(&request); let 
timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; let downloaded = timeline - .download_layer(layer_file_name) + .download_layer(&layer_name) .await .map_err(ApiError::InternalServerError)?; @@ -1258,11 +1261,14 @@ async fn evict_timeline_layer_handler( let layer_file_name = get_request_param(&request, "layer_file_name")?; let state = get_state(&request); + let layer_name = LayerFileName::from_str(layer_file_name) + .map_err(|s| ApiError::BadRequest(anyhow::anyhow!(s)))?; + let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; let evicted = timeline - .evict_layer(layer_file_name) + .evict_layer(&layer_name) .await .map_err(ApiError::InternalServerError)?; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 40712e4895..256f2f334c 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -2929,6 +2929,8 @@ pub fn preinitialize_metrics() { &WALRECEIVER_CANDIDATES_REMOVED, &tokio_epoll_uring::THREAD_LOCAL_LAUNCH_FAILURES, &tokio_epoll_uring::THREAD_LOCAL_LAUNCH_SUCCESSES, + &REMOTE_ONDEMAND_DOWNLOADED_LAYERS, + &REMOTE_ONDEMAND_DOWNLOADED_BYTES, ] .into_iter() .for_each(|c| { diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 255449c049..356a0dc51c 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1140,15 +1140,21 @@ impl RemoteTimelineClient { uploaded: &ResidentLayer, cancel: &CancellationToken, ) -> anyhow::Result<()> { + let remote_path = remote_layer_path( + &self.tenant_shard_id.tenant_id, + &self.timeline_id, + self.tenant_shard_id.to_index(), + &uploaded.layer_desc().filename(), + uploaded.metadata().generation, + ); + backoff::retry( || async { - let m = uploaded.metadata(); upload::upload_timeline_layer( - self.conf, &self.storage_impl, uploaded.local_path(), - &uploaded.metadata(), - m.generation, + &remote_path, + uploaded.metadata().file_size(), cancel, ) .await @@ -1173,15 +1179,30 @@ impl RemoteTimelineClient { adopted_as: &Layer, cancel: &CancellationToken, ) -> anyhow::Result<()> { + let source_remote_path = remote_layer_path( + &self.tenant_shard_id.tenant_id, + &adopted + .get_timeline_id() + .expect("Source timeline should be alive"), + self.tenant_shard_id.to_index(), + &adopted.layer_desc().filename(), + adopted.metadata().generation, + ); + + let target_remote_path = remote_layer_path( + &self.tenant_shard_id.tenant_id, + &self.timeline_id, + self.tenant_shard_id.to_index(), + &adopted_as.layer_desc().filename(), + adopted_as.metadata().generation, + ); + backoff::retry( || async { upload::copy_timeline_layer( - self.conf, &self.storage_impl, - adopted.local_path(), - &adopted.metadata(), - adopted_as.local_path(), - &adopted_as.metadata(), + &source_remote_path, + &target_remote_path, cancel, ) .await @@ -1496,13 +1517,25 @@ impl RemoteTimelineClient { let upload_result: anyhow::Result<()> = match &task.op { UploadOp::UploadLayer(ref layer, ref layer_metadata) => { - let path = layer.local_path(); + let local_path = layer.local_path(); + + // We should only be uploading layers created by this `Tenant`'s lifetime, so + // the metadata in the upload should always match our current generation. 
+ assert_eq!(layer_metadata.generation, self.generation); + + let remote_path = remote_layer_path( + &self.tenant_shard_id.tenant_id, + &self.timeline_id, + layer_metadata.shard, + &layer.layer_desc().filename(), + layer_metadata.generation, + ); + upload::upload_timeline_layer( - self.conf, &self.storage_impl, - path, - layer_metadata, - self.generation, + local_path, + &remote_path, + layer_metadata.file_size(), &self.cancel, ) .measure_remote_op( @@ -1931,29 +1964,6 @@ pub fn parse_remote_index_path(path: RemotePath) -> Option { } } -/// Files on the remote storage are stored with paths, relative to the workdir. -/// That path includes in itself both tenant and timeline ids, allowing to have a unique remote storage path. -/// -/// Errors if the path provided does not start from pageserver's workdir. -pub(crate) fn remote_path( - conf: &PageServerConf, - local_path: &Utf8Path, - generation: Generation, -) -> anyhow::Result { - let stripped = local_path - .strip_prefix(&conf.workdir) - .context("Failed to strip workdir prefix")?; - - let suffixed = format!("{0}{1}", stripped, generation.get_suffix()); - - RemotePath::new(Utf8Path::new(&suffixed)).with_context(|| { - format!( - "to resolve remote part of path {:?} for base {:?}", - local_path, conf.workdir - ) - }) -} - #[cfg(test)] mod tests { use super::*; @@ -1961,6 +1971,7 @@ mod tests { context::RequestContext, tenant::{ harness::{TenantHarness, TIMELINE_ID}, + storage_layer::layer::local_layer_path, Tenant, Timeline, }, DEFAULT_PG_VERSION, @@ -2143,11 +2154,20 @@ mod tests { ] .into_iter() .map(|(name, contents): (LayerFileName, Vec)| { - std::fs::write(timeline_path.join(name.file_name()), &contents).unwrap(); + + let local_path = local_layer_path( + harness.conf, + &timeline.tenant_shard_id, + &timeline.timeline_id, + &name, + &generation, + ); + std::fs::write(&local_path, &contents).unwrap(); Layer::for_resident( harness.conf, &timeline, + local_path, name, LayerFileMetadata::new(contents.len() as u64, generation, shard), ) @@ -2284,19 +2304,22 @@ mod tests { .. 
} = TestSetup::new("metrics").await.unwrap(); let client = timeline.remote_client.as_ref().unwrap(); - let timeline_path = harness.timeline_path(&TIMELINE_ID); let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); + let local_path = local_layer_path( + harness.conf, + &timeline.tenant_shard_id, + &timeline.timeline_id, + &layer_file_name_1, + &harness.generation, + ); let content_1 = dummy_contents("foo"); - std::fs::write( - timeline_path.join(layer_file_name_1.file_name()), - &content_1, - ) - .unwrap(); + std::fs::write(&local_path, &content_1).unwrap(); let layer_file_1 = Layer::for_resident( harness.conf, &timeline, + local_path, layer_file_name_1.clone(), LayerFileMetadata::new(content_1.len() as u64, harness.generation, harness.shard), ); diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index b038f264f5..c86b22d481 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -21,6 +21,7 @@ use crate::config::PageServerConf; use crate::context::RequestContext; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; +use crate::tenant::storage_layer::layer::local_layer_path; use crate::tenant::storage_layer::LayerFileName; use crate::tenant::Generation; use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}; @@ -55,7 +56,13 @@ pub async fn download_layer_file<'a>( debug_assert_current_span_has_tenant_and_timeline_id(); let timeline_path = conf.timeline_path(&tenant_shard_id, &timeline_id); - let local_path = timeline_path.join(layer_file_name.file_name()); + let local_path = local_layer_path( + conf, + &tenant_shard_id, + &timeline_id, + layer_file_name, + &layer_metadata.generation, + ); let remote_path = remote_layer_path( &tenant_shard_id.tenant_id, diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index a988369b6a..caa843316f 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -12,18 +12,13 @@ use tokio_util::sync::CancellationToken; use utils::backoff; use super::Generation; -use crate::{ - config::PageServerConf, - tenant::remote_timeline_client::{ - index::IndexPart, remote_index_path, remote_initdb_archive_path, - remote_initdb_preserved_archive_path, remote_path, - }, +use crate::tenant::remote_timeline_client::{ + index::IndexPart, remote_index_path, remote_initdb_archive_path, + remote_initdb_preserved_archive_path, }; -use remote_storage::{GenericRemoteStorage, TimeTravelError}; +use remote_storage::{GenericRemoteStorage, RemotePath, TimeTravelError}; use utils::id::{TenantId, TimelineId}; -use super::index::LayerFileMetadata; - use tracing::info; /// Serializes and uploads the given index part data to the remote storage. @@ -65,11 +60,10 @@ pub(crate) async fn upload_index_part<'a>( /// /// On an error, bumps the retries count and reschedules the entire task. 
pub(super) async fn upload_timeline_layer<'a>( - conf: &'static PageServerConf, storage: &'a GenericRemoteStorage, - source_path: &'a Utf8Path, - known_metadata: &'a LayerFileMetadata, - generation: Generation, + local_path: &'a Utf8Path, + remote_path: &'a RemotePath, + metadata_size: u64, cancel: &CancellationToken, ) -> anyhow::Result<()> { fail_point!("before-upload-layer", |_| { @@ -78,8 +72,7 @@ pub(super) async fn upload_timeline_layer<'a>( pausable_failpoint!("before-upload-layer-pausable"); - let storage_path = remote_path(conf, source_path, generation)?; - let source_file_res = fs::File::open(&source_path).await; + let source_file_res = fs::File::open(&local_path).await; let source_file = match source_file_res { Ok(source_file) => source_file, Err(e) if e.kind() == ErrorKind::NotFound => { @@ -90,43 +83,37 @@ pub(super) async fn upload_timeline_layer<'a>( // it has been written to disk yet. // // This is tested against `test_compaction_delete_before_upload` - info!(path = %source_path, "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more."); + info!(path = %local_path, "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more."); return Ok(()); } - Err(e) => { - Err(e).with_context(|| format!("open a source file for layer {source_path:?}"))? - } + Err(e) => Err(e).with_context(|| format!("open a source file for layer {local_path:?}"))?, }; let fs_size = source_file .metadata() .await - .with_context(|| format!("get the source file metadata for layer {source_path:?}"))? + .with_context(|| format!("get the source file metadata for layer {local_path:?}"))? .len(); - let metadata_size = known_metadata.file_size(); if metadata_size != fs_size { - bail!("File {source_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}"); + bail!("File {local_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}"); } let fs_size = usize::try_from(fs_size) - .with_context(|| format!("convert {source_path:?} size {fs_size} usize"))?; + .with_context(|| format!("convert {local_path:?} size {fs_size} usize"))?; let reader = tokio_util::io::ReaderStream::with_capacity(source_file, super::BUFFER_SIZE); storage - .upload(reader, fs_size, &storage_path, None, cancel) + .upload(reader, fs_size, remote_path, None, cancel) .await - .with_context(|| format!("upload layer from local path '{source_path}'")) + .with_context(|| format!("upload layer from local path '{local_path}'")) } pub(super) async fn copy_timeline_layer( - conf: &'static PageServerConf, storage: &GenericRemoteStorage, - source_path: &Utf8Path, - source_metadata: &LayerFileMetadata, - target_path: &Utf8Path, - target_metadata: &LayerFileMetadata, + source_path: &RemotePath, + target_path: &RemotePath, cancel: &CancellationToken, ) -> anyhow::Result<()> { fail_point!("before-copy-layer", |_| { @@ -135,11 +122,8 @@ pub(super) async fn copy_timeline_layer( pausable_failpoint!("before-copy-layer-pausable"); - let source_path = remote_path(conf, source_path, source_metadata.generation)?; - let target_path = remote_path(conf, target_path, target_metadata.generation)?; - storage - .copy_object(&source_path, &target_path, cancel) + .copy_object(source_path, target_path, cancel) .await .with_context(|| format!("copy layer {source_path} to {target_path}")) } diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index 5c46df268a..0bb25f0ace 100644 --- 
a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -21,8 +21,9 @@ use self::{ use super::{ config::{SecondaryLocationConfig, TenantConfOpt}, mgr::TenantManager, + remote_timeline_client::LayerFileMetadata, span::debug_assert_current_span_has_tenant_id, - storage_layer::LayerFileName, + storage_layer::{layer::local_layer_path, LayerFileName}, }; use pageserver_api::{ @@ -182,6 +183,7 @@ impl SecondaryTenant { conf: &PageServerConf, timeline_id: TimelineId, name: LayerFileName, + metadata: LayerFileMetadata, ) { debug_assert_current_span_has_tenant_id(); @@ -195,9 +197,13 @@ impl SecondaryTenant { let now = SystemTime::now(); - let path = conf - .timeline_path(&self.tenant_shard_id, &timeline_id) - .join(name.file_name()); + let local_path = local_layer_path( + conf, + &self.tenant_shard_id, + &timeline_id, + &name, + &metadata.generation, + ); let this = self.clone(); @@ -208,7 +214,7 @@ impl SecondaryTenant { // it, the secondary downloader could have seen an updated heatmap that // resulted in a layer being deleted. // Other local I/O errors are process-fatal: these should never happen. - let deleted = std::fs::remove_file(path); + let deleted = std::fs::remove_file(local_path); let not_found = deleted .as_ref() diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index fb8907b5a8..092630e74d 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -22,7 +22,7 @@ use crate::{ FAILED_REMOTE_OP_RETRIES, }, span::debug_assert_current_span_has_tenant_id, - storage_layer::LayerFileName, + storage_layer::{layer::local_layer_path, LayerFileName}, tasks::{warn_when_period_overrun, BackgroundLoopKind}, }, virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}, @@ -621,12 +621,12 @@ impl<'a> TenantDownloader<'a> { let layers_in_heatmap = heatmap_timeline .layers .iter() - .map(|l| &l.name) + .map(|l| (&l.name, l.metadata.generation)) .collect::>(); let layers_on_disk = timeline_state .on_disk_layers .iter() - .map(|l| l.0) + .map(|l| (l.0, l.1.metadata.generation)) .collect::>(); let mut layer_count = layers_on_disk.len(); @@ -637,16 +637,24 @@ impl<'a> TenantDownloader<'a> { .sum(); // Remove on-disk layers that are no longer present in heatmap - for layer in layers_on_disk.difference(&layers_in_heatmap) { + for (layer_file_name, generation) in layers_on_disk.difference(&layers_in_heatmap) { layer_count -= 1; layer_byte_count -= timeline_state .on_disk_layers - .get(layer) + .get(layer_file_name) .unwrap() .metadata .file_size(); - delete_layers.push((*timeline_id, (*layer).clone())); + let local_path = local_layer_path( + self.conf, + self.secondary_state.get_tenant_shard_id(), + timeline_id, + layer_file_name, + generation, + ); + + delete_layers.push((*timeline_id, (*layer_file_name).clone(), local_path)); } progress.bytes_downloaded += layer_byte_count; @@ -661,11 +669,7 @@ impl<'a> TenantDownloader<'a> { } // Execute accumulated deletions - for (timeline_id, layer_name) in delete_layers { - let timeline_path = self - .conf - .timeline_path(self.secondary_state.get_tenant_shard_id(), &timeline_id); - let local_path = timeline_path.join(layer_name.to_string()); + for (timeline_id, layer_name, local_path) in delete_layers { tracing::info!(timeline_id=%timeline_id, "Removing secondary local layer {layer_name} because it's absent in heatmap",); tokio::fs::remove_file(&local_path) @@ -754,9 +758,6 @@ impl<'a> TenantDownloader<'a> { ) -> Result<(), 
UpdateError> { debug_assert_current_span_has_tenant_and_timeline_id(); let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); - let timeline_path = self - .conf - .timeline_path(tenant_shard_id, &timeline.timeline_id); // Accumulate updates to the state let mut touched = Vec::new(); @@ -806,10 +807,14 @@ impl<'a> TenantDownloader<'a> { if cfg!(debug_assertions) { // Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think // are already present on disk are really there. - let local_path = self - .conf - .timeline_path(tenant_shard_id, &timeline.timeline_id) - .join(layer.name.file_name()); + let local_path = local_layer_path( + self.conf, + tenant_shard_id, + &timeline.timeline_id, + &layer.name, + &layer.metadata.generation, + ); + match tokio::fs::metadata(&local_path).await { Ok(meta) => { tracing::debug!( @@ -903,7 +908,13 @@ impl<'a> TenantDownloader<'a> { }; if downloaded_bytes != layer.metadata.file_size { - let local_path = timeline_path.join(layer.name.to_string()); + let local_path = local_layer_path( + self.conf, + tenant_shard_id, + &timeline.timeline_id, + &layer.name, + &layer.metadata.generation, + ); tracing::warn!( "Downloaded layer {} with unexpected size {} != {}. Removing download.", diff --git a/pageserver/src/tenant/storage_layer/filename.rs b/pageserver/src/tenant/storage_layer/filename.rs index a98be0842b..fff66a9d07 100644 --- a/pageserver/src/tenant/storage_layer/filename.rs +++ b/pageserver/src/tenant/storage_layer/filename.rs @@ -2,11 +2,13 @@ //! Helper functions for dealing with filenames of the image and delta layer files. //! use crate::repository::Key; +use std::borrow::Cow; use std::cmp::Ordering; use std::fmt; use std::ops::Range; use std::str::FromStr; +use regex::Regex; use utils::lsn::Lsn; use super::PersistentLayerDesc; @@ -74,10 +76,19 @@ impl DeltaFileName { let key_end_str = key_parts.next()?; let lsn_start_str = lsn_parts.next()?; let lsn_end_str = lsn_parts.next()?; + if parts.next().is_some() || key_parts.next().is_some() || key_parts.next().is_some() { return None; } + if key_start_str.len() != 36 + || key_end_str.len() != 36 + || lsn_start_str.len() != 16 + || lsn_end_str.len() != 16 + { + return None; + } + let key_start = Key::from_hex(key_start_str).ok()?; let key_end = Key::from_hex(key_end_str).ok()?; @@ -182,6 +193,10 @@ impl ImageFileName { return None; } + if key_start_str.len() != 36 || key_end_str.len() != 36 || lsn_str.len() != 16 { + return None; + } + let key_start = Key::from_hex(key_start_str).ok()?; let key_end = Key::from_hex(key_end_str).ok()?; @@ -259,9 +274,22 @@ impl From for LayerFileName { impl FromStr for LayerFileName { type Err = String; + /// Conversion from either a physical layer filename, or the string-ization of + /// Self. When loading a physical layer filename, we drop any extra information + /// not needed to build Self. 
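Since `LayerFileName::from_str` now tolerates an optional trailing generation suffix (a dash followed by eight lowercase hex digits), anything outside the pageserver that interprets on-disk layer file names, for example test fixtures listing the timeline directory, needs the same rule. A Python sketch mirroring the regex used below (illustrative only):

```python
import re

# A local layer file name may carry a trailing "-<8 hex digits>" generation
# suffix (and will, once the commented-out path code is enabled); strip it
# before interpreting the layer name itself.
GEN_SUFFIX_RE = re.compile(r"^(?P<base>.+)-(?P<gen>[0-9a-f]{8})$")

def strip_generation_suffix(file_name: str) -> str:
    m = GEN_SUFFIX_RE.match(file_name)
    return m.group("base") if m else file_name

with_gen = (
    "000000000000000000000000000000000000-000000067F00000001000004DF0000000006"
    "__00000000014FED58-00000001"
)
without_gen = with_gen.removesuffix("-00000001")
assert strip_generation_suffix(with_gen) == without_gen
assert strip_generation_suffix(without_gen) == without_gen   # no suffix: unchanged
```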
fn from_str(value: &str) -> Result { - let delta = DeltaFileName::parse_str(value); - let image = ImageFileName::parse_str(value); + let gen_suffix_regex = Regex::new("^(?.+)-(?[0-9a-f]{8})$").unwrap(); + let file_name: Cow = match gen_suffix_regex.captures(value) { + Some(captures) => captures + .name("base") + .expect("Non-optional group") + .as_str() + .into(), + None => value.into(), + }; + + let delta = DeltaFileName::parse_str(&file_name); + let image = ImageFileName::parse_str(&file_name); let ok = match (delta, image) { (None, None) => { return Err(format!( @@ -315,3 +343,42 @@ impl<'de> serde::de::Visitor<'de> for LayerFileNameVisitor { v.parse().map_err(|e| E::custom(e)) } } + +#[cfg(test)] +mod test { + use super::*; + #[test] + fn image_layer_parse() -> anyhow::Result<()> { + let expected = LayerFileName::Image(ImageFileName { + key_range: Key::from_i128(0) + ..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(), + lsn: Lsn::from_hex("00000000014FED58").unwrap(), + }); + let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-00000001").map_err(|s| anyhow::anyhow!(s))?; + assert_eq!(parsed, expected,); + + // Omitting generation suffix is valid + let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").map_err(|s| anyhow::anyhow!(s))?; + assert_eq!(parsed, expected,); + + Ok(()) + } + + #[test] + fn delta_layer_parse() -> anyhow::Result<()> { + let expected = LayerFileName::Delta(DeltaFileName { + key_range: Key::from_i128(0) + ..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(), + lsn_range: Lsn::from_hex("00000000014FED58").unwrap() + ..Lsn::from_hex("000000000154C481").unwrap(), + }); + let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-00000001").map_err(|s| anyhow::anyhow!(s))?; + assert_eq!(parsed, expected); + + // Omitting generation suffix is valid + let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").map_err(|s| anyhow::anyhow!(s))?; + assert_eq!(parsed, expected); + + Ok(()) + } +} diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 27faa507ca..b5e69db7f4 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -4,12 +4,13 @@ use pageserver_api::keyspace::KeySpace; use pageserver_api::models::{ HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus, }; -use pageserver_api::shard::ShardIndex; +use pageserver_api::shard::{ShardIndex, TenantShardId}; use std::ops::Range; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use std::sync::{Arc, Weak}; use std::time::{Duration, SystemTime}; use tracing::Instrument; +use utils::id::TimelineId; use utils::lsn::Lsn; use utils::sync::heavier_once_cell; @@ -123,6 +124,25 @@ impl PartialEq for Layer { } } +pub(crate) fn local_layer_path( + conf: &PageServerConf, + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, + layer_file_name: &LayerFileName, + _generation: &Generation, +) -> Utf8PathBuf { + let timeline_path = conf.timeline_path(tenant_shard_id, timeline_id); + + timeline_path.join(layer_file_name.file_name()) + + // TOOD: include generation in the name in now+1 releases. 
+ // timeline_path.join(format!( + // "{}{}", + // layer_file_name.file_name(), + // generation.get_suffix() + // )) +} + impl Layer { /// Creates a layer value for a file we know to not be resident. pub(crate) fn for_evicted( @@ -131,6 +151,14 @@ impl Layer { file_name: LayerFileName, metadata: LayerFileMetadata, ) -> Self { + let local_path = local_layer_path( + conf, + &timeline.tenant_shard_id, + &timeline.timeline_id, + &file_name, + &metadata.generation, + ); + let desc = PersistentLayerDesc::from_filename( timeline.tenant_shard_id, timeline.timeline_id, @@ -143,6 +171,7 @@ impl Layer { let owner = Layer(Arc::new(LayerInner::new( conf, timeline, + local_path, access_stats, desc, None, @@ -159,6 +188,7 @@ impl Layer { pub(crate) fn for_resident( conf: &'static PageServerConf, timeline: &Arc, + local_path: Utf8PathBuf, file_name: LayerFileName, metadata: LayerFileMetadata, ) -> ResidentLayer { @@ -184,6 +214,7 @@ impl Layer { LayerInner::new( conf, timeline, + local_path, access_stats, desc, Some(inner), @@ -225,9 +256,19 @@ impl Layer { LayerResidenceStatus::Resident, LayerResidenceEventReason::LayerCreate, ); + + let local_path = local_layer_path( + conf, + &timeline.tenant_shard_id, + &timeline.timeline_id, + &desc.filename(), + &timeline.generation, + ); + LayerInner::new( conf, timeline, + local_path, access_stats, desc, Some(inner), @@ -410,6 +451,13 @@ impl Layer { self.0.metadata() } + pub(crate) fn get_timeline_id(&self) -> Option { + self.0 + .timeline + .upgrade() + .map(|timeline| timeline.timeline_id) + } + /// Traditional debug dumping facility #[allow(unused)] pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> anyhow::Result<()> { @@ -709,19 +757,17 @@ impl Drop for LayerInner { } impl LayerInner { + #[allow(clippy::too_many_arguments)] fn new( conf: &'static PageServerConf, timeline: &Arc, + local_path: Utf8PathBuf, access_stats: LayerAccessStats, desc: PersistentLayerDesc, downloaded: Option>, generation: Generation, shard: ShardIndex, ) -> Self { - let path = conf - .timeline_path(&timeline.tenant_shard_id, &timeline.timeline_id) - .join(desc.filename().to_string()); - let (inner, version, init_status) = if let Some(inner) = downloaded { let version = inner.version; let resident = ResidentOrWantedEvicted::Resident(inner); @@ -737,7 +783,7 @@ impl LayerInner { LayerInner { conf, debug_str: { format!("timelines/{}/{}", timeline.timeline_id, desc.filename()).into() }, - path, + path: local_path, desc, timeline: Arc::downgrade(timeline), have_remote_client: timeline.remote_client.is_some(), diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 7213ff8f75..d6d012c70c 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -60,6 +60,7 @@ use std::{ ops::ControlFlow, }; +use crate::tenant::storage_layer::layer::local_layer_path; use crate::tenant::{ layer_map::{LayerMap, SearchResult}, metadata::TimelineMetadata, @@ -1904,7 +1905,7 @@ impl Timeline { #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))] pub(crate) async fn download_layer( &self, - layer_file_name: &str, + layer_file_name: &LayerFileName, ) -> anyhow::Result> { let Some(layer) = self.find_layer(layer_file_name).await else { return Ok(None); @@ -1922,7 +1923,10 @@ impl Timeline { /// Evict just one layer. /// /// Returns `Ok(None)` in the case where the layer could not be found by its `layer_file_name`. 
- pub(crate) async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result> { + pub(crate) async fn evict_layer( + &self, + layer_file_name: &LayerFileName, + ) -> anyhow::Result> { let _gate = self .gate .enter() @@ -2413,8 +2417,8 @@ impl Timeline { for discovered in discovered { let (name, kind) = match discovered { - Discovered::Layer(file_name, file_size) => { - discovered_layers.push((file_name, file_size)); + Discovered::Layer(layer_file_name, local_path, file_size) => { + discovered_layers.push((layer_file_name, local_path, file_size)); continue; } Discovered::Metadata => { @@ -2459,7 +2463,7 @@ impl Timeline { let mut needs_cleanup = Vec::new(); let mut total_physical_size = 0; - for (name, decision) in decided { + for (name, local_path, decision) in decided { let decision = match decision { Ok(UseRemote { local, remote }) => { // Remote is authoritative, but we may still choose to retain @@ -2469,26 +2473,23 @@ impl Timeline { // the correct generation. UseLocal(remote) } else { - path.push(name.file_name()); - init::cleanup_local_file_for_remote(&path, &local, &remote)?; - path.pop(); + let local_path = local_path.as_ref().expect("Locally found layer must have path"); + init::cleanup_local_file_for_remote(local_path, &local, &remote)?; UseRemote { local, remote } } } Ok(decision) => decision, Err(DismissedLayer::Future { local }) => { if local.is_some() { - path.push(name.file_name()); - init::cleanup_future_layer(&path, &name, disk_consistent_lsn)?; - path.pop(); + let local_path = local_path.expect("Locally found layer must have path"); + init::cleanup_future_layer(&local_path, &name, disk_consistent_lsn)?; } needs_cleanup.push(name); continue; } Err(DismissedLayer::LocalOnly(local)) => { - path.push(name.file_name()); - init::cleanup_local_only_file(&path, &name, &local)?; - path.pop(); + let local_path = local_path.expect("Locally found layer must have path"); + init::cleanup_local_only_file(&local_path, &name, &local)?; // this file never existed remotely, we will have to do rework continue; } @@ -2504,7 +2505,18 @@ impl Timeline { let layer = match decision { UseLocal(m) => { total_physical_size += m.file_size(); - Layer::for_resident(conf, &this, name, m).drop_eviction_guard() + + let local_path = local_path.unwrap_or_else(|| { + local_layer_path( + conf, + &this.tenant_shard_id, + &this.timeline_id, + &name, + &m.generation, + ) + }); + + Layer::for_resident(conf, &this, local_path, name, m).drop_eviction_guard() } Evicted(remote) | UseRemote { remote, .. 
} => { Layer::for_evicted(conf, &this, name, remote) @@ -2985,11 +2997,11 @@ impl Timeline { } } - async fn find_layer(&self, layer_file_name: &str) -> Option { + async fn find_layer(&self, layer_name: &LayerFileName) -> Option { let guard = self.layers.read().await; for historic_layer in guard.layer_map().iter_historic_layers() { - let historic_layer_name = historic_layer.filename().file_name(); - if layer_file_name == historic_layer_name { + let historic_layer_name = historic_layer.filename(); + if layer_name == &historic_layer_name { return Some(guard.get_from_desc(&historic_layer)); } } diff --git a/pageserver/src/tenant/timeline/init.rs b/pageserver/src/tenant/timeline/init.rs index 916ebfc6d9..9c33981807 100644 --- a/pageserver/src/tenant/timeline/init.rs +++ b/pageserver/src/tenant/timeline/init.rs @@ -12,7 +12,7 @@ use crate::{ METADATA_FILE_NAME, }; use anyhow::Context; -use camino::Utf8Path; +use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::shard::ShardIndex; use std::{collections::HashMap, str::FromStr}; use utils::lsn::Lsn; @@ -20,7 +20,7 @@ use utils::lsn::Lsn; /// Identified files in the timeline directory. pub(super) enum Discovered { /// The only one we care about - Layer(LayerFileName, u64), + Layer(LayerFileName, Utf8PathBuf, u64), /// Old ephmeral files from previous launches, should be removed Ephemeral(String), /// Old temporary timeline files, unsure what these really are, should be removed @@ -46,7 +46,7 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result { let file_size = direntry.metadata()?.len(); - Discovered::Layer(file_name, file_size) + Discovered::Layer(file_name, direntry.path().to_owned(), file_size) } Err(_) => { if file_name == METADATA_FILE_NAME { @@ -104,26 +104,38 @@ pub(super) enum DismissedLayer { /// Merges local discoveries and remote [`IndexPart`] to a collection of decisions. pub(super) fn reconcile( - discovered: Vec<(LayerFileName, u64)>, + discovered: Vec<(LayerFileName, Utf8PathBuf, u64)>, index_part: Option<&IndexPart>, disk_consistent_lsn: Lsn, generation: Generation, shard: ShardIndex, -) -> Vec<(LayerFileName, Result)> { +) -> Vec<( + LayerFileName, + Option, + Result, +)> { use Decision::*; - // name => (local, remote) - type Collected = HashMap, Option)>; + // name => (local_path, local_metadata, remote_metadata) + type Collected = HashMap< + LayerFileName, + ( + Option, + Option, + Option, + ), + >; let mut discovered = discovered .into_iter() - .map(|(name, file_size)| { + .map(|(layer_name, local_path, file_size)| { ( - name, + layer_name, // The generation and shard here will be corrected to match IndexPart in the merge below, unless // it is not in IndexPart, in which case using our current generation makes sense // because it will be uploaded in this generation. 
( + Some(local_path), Some(LayerFileMetadata::new(file_size, generation, shard)), None, ), @@ -140,15 +152,15 @@ pub(super) fn reconcile( .map(|(name, metadata)| (name, LayerFileMetadata::from(metadata))) .for_each(|(name, metadata)| { if let Some(existing) = discovered.get_mut(name) { - existing.1 = Some(metadata); + existing.2 = Some(metadata); } else { - discovered.insert(name.to_owned(), (None, Some(metadata))); + discovered.insert(name.to_owned(), (None, None, Some(metadata))); } }); discovered .into_iter() - .map(|(name, (local, remote))| { + .map(|(name, (local_path, local, remote))| { let decision = if name.is_in_future(disk_consistent_lsn) { Err(DismissedLayer::Future { local }) } else { @@ -165,7 +177,7 @@ pub(super) fn reconcile( } }; - (name, decision) + (name, local_path, decision) }) .collect::>() } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index fc66822eb9..30cec4c726 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -54,7 +54,7 @@ from fixtures.pageserver.allowed_errors import ( DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS, ) from fixtures.pageserver.http import PageserverHttpClient -from fixtures.pageserver.types import IndexPartDump +from fixtures.pageserver.types import IndexPartDump, LayerFileName, parse_layer_file_name from fixtures.pageserver.utils import ( wait_for_last_record_lsn, wait_for_upload, @@ -2652,6 +2652,37 @@ class NeonPageserver(PgProtocol, LogUtils): tenant_id, generation=self.env.storage_controller.attach_hook_issue(tenant_id, self.id) ) + def list_layers(self, tenant_id: TenantId, timeline_id: TimelineId) -> list[Path]: + """ + Inspect local storage on a pageserver to discover which layer files are present. + + :return: list of relative paths to layers, from the timeline root. + """ + timeline_path = self.timeline_dir(tenant_id, timeline_id) + + def relative(p: Path) -> Path: + return p.relative_to(timeline_path) + + return sorted( + list( + map( + relative, + filter( + lambda path: path.name != "metadata" + and "ephemeral" not in path.name + and "temp" not in path.name, + timeline_path.glob("*"), + ), + ) + ) + ) + + def layer_exists( + self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: LayerFileName + ) -> bool: + layers = self.list_layers(tenant_id, timeline_id) + return layer_name in [parse_layer_file_name(p.name) for p in layers] + class PgBin: """A helper class for executing postgres binaries""" diff --git a/test_runner/fixtures/pageserver/types.py b/test_runner/fixtures/pageserver/types.py index 72fa30a2f2..fd018cb778 100644 --- a/test_runner/fixtures/pageserver/types.py +++ b/test_runner/fixtures/pageserver/types.py @@ -1,3 +1,4 @@ +import re from dataclasses import dataclass from typing import Any, Dict, Tuple, Union @@ -47,46 +48,36 @@ class InvalidFileName(Exception): pass +IMAGE_LAYER_FILE_NAME = re.compile("^([A-F0-9]{36})-([A-F0-9]{36})__([A-F0-9]{16})(-[a-f0-9]{8})?$") + + def parse_image_layer(f_name: str) -> Tuple[int, int, int]: """Parse an image layer file name. 
Return key start, key end, and snapshot lsn""" - parts = f_name.split("__") - if len(parts) != 2: - raise InvalidFileName(f"expecting two parts separated by '__', got: {parts}") - key_parts = parts[0].split("-") - if len(key_parts) != 2: - raise InvalidFileName( - f"expecting two key parts separated by '--' in parts[0], got: {key_parts}" - ) - try: - return int(key_parts[0], 16), int(key_parts[1], 16), int(parts[1], 16) - except ValueError as e: - raise InvalidFileName(f"conversion error: {f_name}") from e + + match = IMAGE_LAYER_FILE_NAME.match(f_name) + if match is None: + raise InvalidFileName(f"'{f_name}' is not an image layer filename") + + return int(match.group(1), 16), int(match.group(2), 16), int(match.group(3), 16) + + +DELTA_LAYER_FILE_NAME = re.compile( + "^([A-F0-9]{36})-([A-F0-9]{36})__([A-F0-9]{16})-([A-F0-9]{16})(-[a-f0-9]{8})?$" +) def parse_delta_layer(f_name: str) -> Tuple[int, int, int, int]: """Parse a delta layer file name. Return key start, key end, lsn start, and lsn end""" - parts = f_name.split("__") - if len(parts) != 2: - raise InvalidFileName(f"expecting two parts separated by '__', got: {parts}") - key_parts = parts[0].split("-") - if len(key_parts) != 2: - raise InvalidFileName( - f"expecting two key parts separated by '--' in parts[0], got: {key_parts}" - ) - lsn_parts = parts[1].split("-") - if len(lsn_parts) != 2: - raise InvalidFileName( - f"expecting two lsn parts separated by '--' in parts[1], got: {lsn_parts}" - ) - try: - return ( - int(key_parts[0], 16), - int(key_parts[1], 16), - int(lsn_parts[0], 16), - int(lsn_parts[1], 16), - ) - except ValueError as e: - raise InvalidFileName(f"conversion error: {f_name}") from e + match = DELTA_LAYER_FILE_NAME.match(f_name) + if match is None: + raise InvalidFileName(f"'{f_name}' is not an delta layer filename") + + return ( + int(match.group(1), 16), + int(match.group(2), 16), + int(match.group(3), 16), + int(match.group(4), 16), + ) def parse_layer_file_name(file_name: str) -> LayerFileName: diff --git a/test_runner/regress/test_duplicate_layers.py b/test_runner/regress/test_duplicate_layers.py index cb4fa43be7..7471338ce5 100644 --- a/test_runner/regress/test_duplicate_layers.py +++ b/test_runner/regress/test_duplicate_layers.py @@ -2,6 +2,7 @@ import time import pytest from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn +from fixtures.pageserver.types import parse_layer_file_name from fixtures.pageserver.utils import ( wait_for_last_record_lsn, wait_for_upload_queue_empty, @@ -86,14 +87,7 @@ def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) # path = env.remote_storage.timeline_path(tenant_id, timeline_id) l1_found = None - for path in env.pageserver.timeline_dir(tenant_id, timeline_id).iterdir(): - if path.name == "metadata" or path.name.startswith("ephemeral-"): - continue - - if len(path.suffixes) > 0: - # temp files - continue - + for path in env.pageserver.list_layers(tenant_id, timeline_id): [key_range, lsn_range] = path.name.split("__", maxsplit=1) if "-" not in lsn_range: @@ -108,19 +102,21 @@ def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) if l1_found is not None: raise RuntimeError(f"found multiple L1: {l1_found.name} and {path.name}") - l1_found = path + l1_found = parse_layer_file_name(path.name) assert l1_found is not None, "failed to find L1 locally" uploaded = env.pageserver_remote_storage.remote_layer_path( - tenant_id, timeline_id, l1_found.name + tenant_id, timeline_id, l1_found.to_str() ) assert 
not uploaded.exists(), "to-be-overwritten should not yet be uploaded" env.pageserver.start() wait_until_tenant_active(pageserver_http, tenant_id) - assert not l1_found.exists(), "partial compaction result should had been removed during startup" + assert not env.pageserver.layer_exists( + tenant_id, timeline_id, l1_found + ), "partial compaction result should had been removed during startup" # wait for us to catch up again wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, lsn) @@ -130,18 +126,18 @@ def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) # give time for log flush time.sleep(1) - message = f".*duplicated L1 layer layer={l1_found.name}" + message = f".*duplicated L1 layer layer={l1_found}" found_msg = env.pageserver.log_contains(message) # resident or evicted, it should not be overwritten, however it should had been non-existing at startup assert ( found_msg is None ), "layer should had been removed during startup, did it live on as evicted?" - assert l1_found.exists(), "the L1 reappears" + assert env.pageserver.layer_exists(tenant_id, timeline_id, l1_found), "the L1 reappears" wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id) uploaded = env.pageserver_remote_storage.remote_layer_path( - tenant_id, timeline_id, l1_found.name + tenant_id, timeline_id, l1_found.to_str() ) assert uploaded.exists(), "the L1 is uploaded" diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py index fefb30bbdd..5c967fd72e 100644 --- a/test_runner/regress/test_layer_eviction.py +++ b/test_runner/regress/test_layer_eviction.py @@ -7,6 +7,7 @@ from fixtures.neon_fixtures import ( flush_ep_to_pageserver, wait_for_last_flush_lsn, ) +from fixtures.pageserver.types import parse_layer_file_name from fixtures.pageserver.utils import wait_for_upload from fixtures.remote_storage import RemoteStorageKind @@ -57,9 +58,9 @@ def test_basic_eviction( for sk in env.safekeepers: sk.stop() - timeline_path = env.pageserver.timeline_dir(tenant_id, timeline_id) - initial_local_layers = sorted( - list(filter(lambda path: path.name != "metadata", timeline_path.glob("*"))) + initial_local_layers = dict( + (parse_layer_file_name(path.name), path) + for path in env.pageserver.list_layers(tenant_id, timeline_id) ) assert ( len(initial_local_layers) > 1 @@ -73,6 +74,7 @@ def test_basic_eviction( assert len(initial_local_layers) == len( initial_layer_map_info.historic_layers ), "Should have the same layers in memory and on disk" + for returned_layer in initial_layer_map_info.historic_layers: assert ( returned_layer.kind == "Delta" @@ -81,27 +83,29 @@ def test_basic_eviction( not returned_layer.remote ), f"All created layers should be present locally, but got {returned_layer}" - local_layers = list( - filter(lambda layer: layer.name == returned_layer.layer_file_name, initial_local_layers) + returned_layer_name = parse_layer_file_name(returned_layer.layer_file_name) + assert ( + returned_layer_name in initial_local_layers + ), f"Did not find returned layer {returned_layer_name} in local layers {list(initial_local_layers.keys())}" + + local_layer_path = ( + env.pageserver.timeline_dir(tenant_id, timeline_id) + / initial_local_layers[returned_layer_name] ) assert ( - len(local_layers) == 1 - ), f"Did not find returned layer {returned_layer} in local layers {initial_local_layers}" - local_layer = local_layers[0] - assert ( - returned_layer.layer_file_size == local_layer.stat().st_size - ), f"Returned layer {returned_layer} has a 
different file size than local layer {local_layer}" + returned_layer.layer_file_size == local_layer_path.stat().st_size + ), f"Returned layer {returned_layer} has a different file size than local layer {local_layer_path}" # Detach all layers, ensre they are not in the local FS, but are still dumped as part of the layer map - for local_layer in initial_local_layers: + for local_layer_name, local_layer_path in initial_local_layers.items(): client.evict_layer( - tenant_id=tenant_id, timeline_id=timeline_id, layer_name=local_layer.name + tenant_id=tenant_id, timeline_id=timeline_id, layer_name=local_layer_path.name ) - assert not any( - new_local_layer.name == local_layer.name for new_local_layer in timeline_path.glob("*") - ), f"Did not expect to find {local_layer} layer after evicting" + assert not env.pageserver.layer_exists( + tenant_id, timeline_id, local_layer_name + ), f"Did not expect to find {local_layer_name} layer after evicting" - empty_layers = list(filter(lambda path: path.name != "metadata", timeline_path.glob("*"))) + empty_layers = env.pageserver.list_layers(tenant_id, timeline_id) assert not empty_layers, f"After evicting all layers, timeline {tenant_id}/{timeline_id} should have no layers locally, but got: {empty_layers}" evicted_layer_map_info = client.layer_map_info(tenant_id=tenant_id, timeline_id=timeline_id) @@ -118,15 +122,15 @@ def test_basic_eviction( assert ( returned_layer.remote ), f"All layers should be evicted and not present locally, but got {returned_layer}" - assert any( - local_layer.name == returned_layer.layer_file_name - for local_layer in initial_local_layers + returned_layer_name = parse_layer_file_name(returned_layer.layer_file_name) + assert ( + returned_layer_name in initial_local_layers ), f"Did not find returned layer {returned_layer} in local layers {initial_local_layers}" # redownload all evicted layers and ensure the initial state is restored - for local_layer in initial_local_layers: + for local_layer_name, _local_layer_path in initial_local_layers.items(): client.download_layer( - tenant_id=tenant_id, timeline_id=timeline_id, layer_name=local_layer.name + tenant_id=tenant_id, timeline_id=timeline_id, layer_name=local_layer_name.to_str() ) client.timeline_download_remote_layers( tenant_id, @@ -137,8 +141,9 @@ def test_basic_eviction( at_least_one_download=False, ) - redownloaded_layers = sorted( - list(filter(lambda path: path.name != "metadata", timeline_path.glob("*"))) + redownloaded_layers = dict( + (parse_layer_file_name(path.name), path) + for path in env.pageserver.list_layers(tenant_id, timeline_id) ) assert ( redownloaded_layers == initial_local_layers diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index f957bea156..adcf7de8d4 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -10,6 +10,7 @@ of the pageserver are: """ import enum +import os import re import time from typing import Optional @@ -700,3 +701,50 @@ def test_multi_attach( # All data we wrote while multi-attached remains readable workload.validate(pageservers[2].id) + + +@pytest.mark.skip(reason="To be enabled after release with new local path style") +def test_upgrade_generationless_local_file_paths( + neon_env_builder: NeonEnvBuilder, +): + """ + Test pageserver behavior when startup up with local layer paths without + generation numbers: it should accept these layer files, and avoid doing + a delete/download cycle on them. 
+ """ + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(1000) + + env.pageserver.stop() + + # Rename the local paths to legacy format, to simulate what + # we would see when upgrading + timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id) + files_renamed = 0 + for filename in os.listdir(timeline_dir): + path = os.path.join(timeline_dir, filename) + log.info(f"Found file {path}") + if path.endswith("-00000001"): + new_path = path[:-9] + os.rename(path, new_path) + log.info(f"Renamed {path} -> {new_path}") + files_renamed += 1 + + assert files_renamed > 0 + + env.pageserver.start() + + workload.validate() + + # Assert that there were no on-demand downloads + assert ( + env.pageserver.http_client().get_metric_value( + "pageserver_remote_ondemand_downloaded_layers_total" + ) + == 0 + ) diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 8f194e5dda..c40bb962f2 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -2,12 +2,12 @@ import json import os import random import time -from pathlib import Path from typing import Any, Dict, Optional import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, S3Scrubber +from fixtures.pageserver.types import parse_layer_file_name from fixtures.pageserver.utils import ( assert_prefix_empty, poll_for_remote_storage_iterations, @@ -51,9 +51,13 @@ def evict_random_layers( if "ephemeral" in layer.name or "temp_download" in layer.name: continue + layer_name = parse_layer_file_name(layer.name) + if rng.choice([True, False]): - log.info(f"Evicting layer {tenant_id}/{timeline_id} {layer.name}") - client.evict_layer(tenant_id=tenant_id, timeline_id=timeline_id, layer_name=layer.name) + log.info(f"Evicting layer {tenant_id}/{timeline_id} {layer_name.to_str()}") + client.evict_layer( + tenant_id=tenant_id, timeline_id=timeline_id, layer_name=layer_name.to_str() + ) @pytest.mark.parametrize("seed", [1, 2, 3]) @@ -402,32 +406,6 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder): validate_heatmap(heatmap_second) -def list_layers(pageserver, tenant_id: TenantId, timeline_id: TimelineId) -> list[Path]: - """ - Inspect local storage on a pageserver to discover which layer files are present. - - :return: list of relative paths to layers, from the timeline root. 
- """ - timeline_path = pageserver.timeline_dir(tenant_id, timeline_id) - - def relative(p: Path) -> Path: - return p.relative_to(timeline_path) - - return sorted( - list( - map( - relative, - filter( - lambda path: path.name != "metadata" - and "ephemeral" not in path.name - and "temp" not in path.name, - timeline_path.glob("*"), - ), - ) - ) - ) - - def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): """ Test the overall data flow in secondary mode: @@ -482,8 +460,8 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ps_secondary.http_client().tenant_secondary_download(tenant_id) - assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( - ps_secondary, tenant_id, timeline_id + assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers( + tenant_id, timeline_id ) # Make changes on attached pageserver, check secondary downloads them @@ -500,8 +478,8 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ps_secondary.http_client().tenant_secondary_download(tenant_id) try: - assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( - ps_secondary, tenant_id, timeline_id + assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers( + tenant_id, timeline_id ) except: # Do a full listing of the secondary location on errors, to help debug of @@ -523,8 +501,8 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): # ================================================================== try: log.info("Evicting a layer...") - layer_to_evict = list_layers(ps_attached, tenant_id, timeline_id)[0] - some_other_layer = list_layers(ps_attached, tenant_id, timeline_id)[1] + layer_to_evict = ps_attached.list_layers(tenant_id, timeline_id)[0] + some_other_layer = ps_attached.list_layers(tenant_id, timeline_id)[1] log.info(f"Victim layer: {layer_to_evict.name}") ps_attached.http_client().evict_layer( tenant_id, timeline_id, layer_name=layer_to_evict.name @@ -537,13 +515,13 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): layer["name"] for layer in heatmap_after_eviction["timelines"][0]["layers"] ) assert layer_to_evict.name not in heatmap_layers - assert some_other_layer.name in heatmap_layers + assert parse_layer_file_name(some_other_layer.name).to_str() in heatmap_layers ps_secondary.http_client().tenant_secondary_download(tenant_id) - assert layer_to_evict not in list_layers(ps_attached, tenant_id, timeline_id) - assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( - ps_secondary, tenant_id, timeline_id + assert layer_to_evict not in ps_attached.list_layers(tenant_id, timeline_id) + assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers( + tenant_id, timeline_id ) except: # On assertion failures, log some details to help with debugging @@ -630,7 +608,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): for timeline_id in timelines: log.info(f"Checking for secondary timeline {timeline_id} on node {ps_secondary.id}") # One or more layers should be present for all timelines - assert list_layers(ps_secondary, tenant_id, timeline_id) + assert ps_secondary.list_layers(tenant_id, timeline_id) # Delete the second timeline: this should be reflected later on the secondary env.storage_controller.pageserver_api().timeline_delete(tenant_id, timelines[1]) @@ -645,10 +623,10 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): ps_secondary = next(p for p in env.pageservers if p != 
ps_attached) # This one was not deleted - assert list_layers(ps_secondary, tenant_id, timelines[0]) + assert ps_secondary.list_layers(tenant_id, timelines[0]) # This one was deleted - assert not list_layers(ps_secondary, tenant_id, timelines[1]) + assert not ps_secondary.list_layers(tenant_id, timelines[1]) t_end = time.time() @@ -708,7 +686,7 @@ def test_slow_secondary_downloads(neon_env_builder: NeonEnvBuilder, via_controll ps_attached.http_client().timeline_checkpoint(tenant_id, timeline_id) # Expect lots of layers - assert len(list_layers(ps_attached, tenant_id, timeline_id)) > 10 + assert len(ps_attached.list_layers(tenant_id, timeline_id)) > 10 # Simulate large data by making layer downloads artifically slow for ps in env.pageservers: diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index ad4b4a42f1..70c025c225 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -12,6 +12,7 @@ from fixtures.neon_fixtures import ( wait_for_last_flush_lsn, ) from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient +from fixtures.pageserver.types import parse_layer_file_name from fixtures.pageserver.utils import ( timeline_delete_wait_completed, wait_for_last_record_lsn, @@ -829,8 +830,9 @@ def test_compaction_waits_for_upload( assert len(upload_stuck_layers) > 0 for name in upload_stuck_layers: - path = env.pageserver.timeline_dir(tenant_id, timeline_id) / name - assert path.exists(), "while uploads are stuck the layers should be present on disk" + assert env.pageserver.layer_exists( + tenant_id, timeline_id, parse_layer_file_name(name) + ), "while uploads are stuck the layers should be present on disk" # now this will do the L0 => L1 compaction and want to remove # upload_stuck_layers and the original initdb L0 @@ -838,8 +840,9 @@ def test_compaction_waits_for_upload( # as uploads are paused, the upload_stuck_layers should still be with us for name in upload_stuck_layers: - path = env.pageserver.timeline_dir(tenant_id, timeline_id) / name - assert path.exists(), "uploads are stuck still over compaction" + assert env.pageserver.layer_exists( + tenant_id, timeline_id, parse_layer_file_name(name) + ), "uploads are stuck still over compaction" compacted_layers = client.layer_map_info(tenant_id, timeline_id).historic_by_name() overlap = compacted_layers.intersection(upload_stuck_layers) @@ -873,9 +876,8 @@ def test_compaction_waits_for_upload( wait_until(10, 1, until_layer_deletes_completed) for name in upload_stuck_layers: - path = env.pageserver.timeline_dir(tenant_id, timeline_id) / name - assert ( - not path.exists() + assert not env.pageserver.layer_exists( + tenant_id, timeline_id, parse_layer_file_name(name) ), "l0 should now be removed because of L0 => L1 compaction and completed uploads" # We should not have hit the error handling path in uploads where a uploaded file is gone diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index d16978d02a..a1e96928bf 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -18,6 +18,7 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, last_flush_lsn_upload, ) +from fixtures.pageserver.types import parse_layer_file_name from fixtures.pageserver.utils import ( assert_tenant_state, wait_for_last_record_lsn, @@ -246,7 +247,10 @@ def 
test_tenant_redownloads_truncated_file_on_startup( # ensure the same size is found from the index_part.json index_part = env.pageserver_remote_storage.index_content(tenant_id, timeline_id) - assert index_part["layer_metadata"][path.name]["file_size"] == expected_size + assert ( + index_part["layer_metadata"][parse_layer_file_name(path.name).to_str()]["file_size"] + == expected_size + ) ## Start the pageserver. It will notice that the file size doesn't match, and ## rename away the local file. It will be re-downloaded when it's needed. @@ -276,7 +280,7 @@ def test_tenant_redownloads_truncated_file_on_startup( # the remote side of local_layer_truncated remote_layer_path = env.pageserver_remote_storage.remote_layer_path( - tenant_id, timeline_id, path.name + tenant_id, timeline_id, parse_layer_file_name(path.name).to_str() ) # if the upload ever was ongoing, this check would be racy, but at least one diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index bc983c36ee..5abb3e28e4 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -63,7 +63,7 @@ def test_ancestor_detach_branched_from( env.pageserver.allowed_errors.extend( [ - ".*initial size calculation failed: downloading failed, possibly for shutdown" + ".*initial size calculation failed: downloading failed, possibly for shutdown", ".*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", ] ) From 0c99e5ec6d3cd0296a668ece1961c7ce4674b695 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 7 May 2024 18:15:06 +0100 Subject: [PATCH 0720/1571] proxy: cull http connections (#7632) ## Problem Some HTTP client connections can stay open for quite a long time. ## Summary of changes When there are too many HTTP client connections, pick a random connection and gracefully cancel it. 
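In rough shape, the culling idea looks like the sketch below. This is a simplified, single-shard illustration only — the patch itself adds a sharded `CancelSet` keyed by connection UUID with a grace period for young connections — and the `ConnRegistry`/`maybe_cull` names are made up for the example, not part of the change:

```rust
// Sketch of the culling idea, not the code added by this PR.
use std::collections::HashMap;
use std::sync::Mutex;

use rand::seq::IteratorRandom;
use tokio_util::sync::CancellationToken;
use uuid::Uuid;

#[derive(Default)]
struct ConnRegistry {
    tokens: Mutex<HashMap<Uuid, CancellationToken>>,
}

impl ConnRegistry {
    /// Register a new connection; the token is a child of `parent`, so a
    /// process-wide shutdown still cancels it.
    fn insert(&self, id: Uuid, parent: &CancellationToken) -> CancellationToken {
        let token = parent.child_token();
        self.tokens.lock().unwrap().insert(id, token.clone());
        token
    }

    /// Deregister a connection once its handler finishes.
    fn remove(&self, id: &Uuid) {
        self.tokens.lock().unwrap().remove(id);
    }

    /// If more than `threshold` connections are open, cancel one at random.
    /// The connection handler observes the cancellation and calls
    /// `graceful_shutdown()` on its HTTP connection.
    fn maybe_cull(&self, threshold: usize) {
        let mut tokens = self.tokens.lock().unwrap();
        if tokens.len() <= threshold {
            return;
        }
        let victim = tokens.keys().copied().choose(&mut rand::thread_rng());
        if let Some(id) = victim {
            if let Some(token) = tokens.remove(&id) {
                token.cancel();
            }
        }
    }
}
```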
--- Cargo.lock | 1 + Cargo.toml | 1 + libs/metrics/src/lib.rs | 9 +++ proxy/Cargo.toml | 1 + proxy/src/bin/proxy.rs | 9 +++ proxy/src/config.rs | 4 +- proxy/src/serverless.rs | 47 ++++++++---- proxy/src/serverless/cancel_set.rs | 102 ++++++++++++++++++++++++++ proxy/src/serverless/conn_pool.rs | 4 +- proxy/src/serverless/sql_over_http.rs | 4 +- 10 files changed, 164 insertions(+), 18 deletions(-) create mode 100644 proxy/src/serverless/cancel_set.rs diff --git a/Cargo.lock b/Cargo.lock index b0c7aec6ae..9bff5e1eff 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4372,6 +4372,7 @@ dependencies = [ "hyper 1.2.0", "hyper-tungstenite", "hyper-util", + "indexmap 2.0.1", "ipnet", "itertools", "lasso", diff --git a/Cargo.toml b/Cargo.toml index a6d406dc2f..1ddadd2f3c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -99,6 +99,7 @@ humantime = "2.1" humantime-serde = "1.1.1" hyper = "0.14" hyper-tungstenite = "0.13.0" +indexmap = "2" inotify = "0.10.2" ipnet = "2.9.0" itertools = "0.10" diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 8e0dbe6ce4..141d8a6d01 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -480,6 +480,15 @@ impl CounterPairVec { let id = self.vec.with_labels(labels); self.vec.remove_metric(id) } + + pub fn sample(&self, labels: ::Group<'_>) -> u64 { + let id = self.vec.with_labels(labels); + let metric = self.vec.get_metric(id); + + let inc = metric.inc.count.load(std::sync::atomic::Ordering::Relaxed); + let dec = metric.dec.count.load(std::sync::atomic::Ordering::Relaxed); + inc.saturating_sub(dec) + } } impl ::measured::metric::group::MetricGroup for CounterPairVec diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 0e8d03906b..3002006aed 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -40,6 +40,7 @@ hyper.workspace = true hyper1 = { package = "hyper", version = "1.2", features = ["server"] } hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] } http-body-util = { version = "0.1" } +indexmap.workspace = true ipnet.workspace = true itertools.workspace = true lasso = { workspace = true, features = ["multi-threaded"] } diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 0956aae6c0..5399f13edd 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -27,6 +27,7 @@ use proxy::redis::cancellation_publisher::RedisPublisherClient; use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; use proxy::redis::elasticache; use proxy::redis::notifications; +use proxy::serverless::cancel_set::CancelSet; use proxy::serverless::GlobalConnPoolOptions; use proxy::usage_metrics; @@ -243,6 +244,12 @@ struct SqlOverHttpArgs { /// increase memory used by the pool #[clap(long, default_value_t = 128)] sql_over_http_pool_shards: usize, + + #[clap(long, default_value_t = 10000)] + sql_over_http_client_conn_threshold: u64, + + #[clap(long, default_value_t = 64)] + sql_over_http_cancel_set_shards: usize, } #[tokio::main] @@ -599,6 +606,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { opt_in: args.sql_over_http.sql_over_http_pool_opt_in, max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns, }, + cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards), + client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold, }; let authentication_config = AuthenticationConfig { scram_protocol_timeout: args.scram_protocol_timeout, diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 
e090407756..b7ab2c00f9 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -2,7 +2,7 @@ use crate::{ auth::{self, backend::AuthRateLimiter}, console::locks::ApiLocks, rate_limiter::RateBucketInfo, - serverless::GlobalConnPoolOptions, + serverless::{cancel_set::CancelSet, GlobalConnPoolOptions}, Host, }; use anyhow::{bail, ensure, Context, Ok}; @@ -56,6 +56,8 @@ pub struct TlsConfig { pub struct HttpConfig { pub request_timeout: tokio::time::Duration, pub pool_options: GlobalConnPoolOptions, + pub cancel_set: CancelSet, + pub client_conn_threshold: u64, } pub struct AuthenticationConfig { diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index 1a0d1f7b0e..cbff51f207 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -3,6 +3,7 @@ //! Handles both SQL over HTTP and SQL over Websockets. mod backend; +pub mod cancel_set; mod conn_pool; mod http_util; mod json; @@ -109,20 +110,37 @@ pub async fn task_main( let conn_id = uuid::Uuid::new_v4(); let http_conn_span = tracing::info_span!("http_conn", ?conn_id); - connections.spawn( - connection_handler( - config, - backend.clone(), - connections.clone(), - cancellation_handler.clone(), - cancellation_token.clone(), - server.clone(), - tls_acceptor.clone(), - conn, - peer_addr, - ) - .instrument(http_conn_span), - ); + let n_connections = Metrics::get() + .proxy + .client_connections + .sample(crate::metrics::Protocol::Http); + tracing::trace!(?n_connections, threshold = ?config.http_config.client_conn_threshold, "check"); + if n_connections > config.http_config.client_conn_threshold { + tracing::trace!("attempting to cancel a random connection"); + if let Some(token) = config.http_config.cancel_set.take() { + tracing::debug!("cancelling a random connection"); + token.cancel() + } + } + + let conn_token = cancellation_token.child_token(); + let conn = connection_handler( + config, + backend.clone(), + connections.clone(), + cancellation_handler.clone(), + conn_token.clone(), + server.clone(), + tls_acceptor.clone(), + conn, + peer_addr, + ) + .instrument(http_conn_span); + + connections.spawn(async move { + let _cancel_guard = config.http_config.cancel_set.insert(conn_id, conn_token); + conn.await + }); } connections.wait().await; @@ -243,6 +261,7 @@ async fn connection_handler( // On cancellation, trigger the HTTP connection handler to shut down. let res = match select(pin!(cancellation_token.cancelled()), pin!(conn)).await { Either::Left((_cancelled, mut conn)) => { + tracing::debug!(%peer_addr, "cancelling connection"); conn.as_mut().graceful_shutdown(); conn.await } diff --git a/proxy/src/serverless/cancel_set.rs b/proxy/src/serverless/cancel_set.rs new file mode 100644 index 0000000000..390df7f4f7 --- /dev/null +++ b/proxy/src/serverless/cancel_set.rs @@ -0,0 +1,102 @@ +//! 
A set for cancelling random http connections + +use std::{ + hash::{BuildHasher, BuildHasherDefault}, + num::NonZeroUsize, + time::Duration, +}; + +use indexmap::IndexMap; +use parking_lot::Mutex; +use rand::{thread_rng, Rng}; +use rustc_hash::FxHasher; +use tokio::time::Instant; +use tokio_util::sync::CancellationToken; +use uuid::Uuid; + +type Hasher = BuildHasherDefault; + +pub struct CancelSet { + shards: Box<[Mutex]>, + // keyed by random uuid, fxhasher is fine + hasher: Hasher, +} + +pub struct CancelShard { + tokens: IndexMap, +} + +impl CancelSet { + pub fn new(shards: usize) -> Self { + CancelSet { + shards: (0..shards) + .map(|_| { + Mutex::new(CancelShard { + tokens: IndexMap::with_hasher(Hasher::default()), + }) + }) + .collect(), + hasher: Hasher::default(), + } + } + + pub fn take(&self) -> Option { + for _ in 0..4 { + if let Some(token) = self.take_raw(thread_rng().gen()) { + return Some(token); + } + tracing::trace!("failed to get cancel token"); + } + None + } + + pub fn take_raw(&self, rng: usize) -> Option { + NonZeroUsize::new(self.shards.len()) + .and_then(|len| self.shards[rng % len].lock().take(rng / len)) + } + + pub fn insert(&self, id: uuid::Uuid, token: CancellationToken) -> CancelGuard<'_> { + let shard = NonZeroUsize::new(self.shards.len()).map(|len| { + let hash = self.hasher.hash_one(id) as usize; + let shard = &self.shards[hash % len]; + shard.lock().insert(id, token); + shard + }); + CancelGuard { shard, id } + } +} + +impl CancelShard { + fn take(&mut self, rng: usize) -> Option { + NonZeroUsize::new(self.tokens.len()).and_then(|len| { + // 10 second grace period so we don't cancel new connections + if self.tokens.get_index(rng % len)?.1 .0.elapsed() < Duration::from_secs(10) { + return None; + } + + let (_key, (_insert, token)) = self.tokens.swap_remove_index(rng % len)?; + Some(token) + }) + } + + fn remove(&mut self, id: uuid::Uuid) { + self.tokens.swap_remove(&id); + } + + fn insert(&mut self, id: uuid::Uuid, token: CancellationToken) { + self.tokens.insert(id, (Instant::now(), token)); + } +} + +pub struct CancelGuard<'a> { + shard: Option<&'a Mutex>, + id: Uuid, +} + +impl Drop for CancelGuard<'_> { + fn drop(&mut self) { + if let Some(shard) = self.shard { + shard.lock().remove(self.id); + } + } +} diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 798e488509..5fa253acf8 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -716,7 +716,7 @@ impl Drop for Client { mod tests { use std::{mem, sync::atomic::AtomicBool}; - use crate::{BranchId, EndpointId, ProjectId}; + use crate::{serverless::cancel_set::CancelSet, BranchId, EndpointId, ProjectId}; use super::*; @@ -767,6 +767,8 @@ mod tests { max_total_conns: 3, }, request_timeout: Duration::from_secs(1), + cancel_set: CancelSet::new(0), + client_conn_threshold: u64::MAX, })); let pool = GlobalConnPool::new(config); let conn_info = ConnInfo { diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index e856053a7e..5376bddfd3 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -424,8 +424,8 @@ pub enum SqlOverHttpCancel { impl ReportableError for SqlOverHttpCancel { fn get_error_kind(&self) -> ErrorKind { match self { - SqlOverHttpCancel::Postgres => ErrorKind::RateLimit, - SqlOverHttpCancel::Connect => ErrorKind::ServiceRateLimit, + SqlOverHttpCancel::Postgres => ErrorKind::ClientDisconnect, + SqlOverHttpCancel::Connect => 
ErrorKind::ClientDisconnect, } } } From b158a5eda00f7812c67e524157407b30b795f44c Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Wed, 1 May 2024 11:25:31 -0700 Subject: [PATCH 0721/1571] compute_ctl: Non-functional prep changes to reduce diff (#7577) A couple lines moved further down in main(), and one case of using Option<&str> instead of Option<&String>. --- compute_tools/src/bin/compute_ctl.rs | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 67c5250376..9760cfb2a3 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -84,8 +84,11 @@ fn main() -> Result<()> { info!("build_tag: {build_tag}"); let matches = cli().get_matches(); - let pgbin_default = String::from("postgres"); - let pgbin = matches.get_one::("pgbin").unwrap_or(&pgbin_default); + let pgbin_default = "postgres"; + let pgbin = matches + .get_one::("pgbin") + .map(|s| s.as_str()) + .unwrap_or(pgbin_default); let ext_remote_storage = matches .get_one::("remote-ext-config") @@ -239,8 +242,6 @@ fn main() -> Result<()> { let _http_handle = launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread"); - let extension_server_port: u16 = http_port; - if !spec_set { // No spec provided, hang waiting for it. info!("no compute spec provided, waiting"); @@ -318,10 +319,10 @@ fn main() -> Result<()> { } } + let extension_server_port: u16 = http_port; + // Start Postgres let mut pg = None; - let mut exit_code = None; - if !prestartup_failed { pg = match compute.start_compute(extension_server_port) { Ok(pg) => Some(pg), @@ -391,6 +392,7 @@ fn main() -> Result<()> { // Wait for the child Postgres process forever. In this state Ctrl+C will // propagate to Postgres and it will be shut down as well. + let mut exit_code = None; if let Some((mut pg, logs_handle)) = pg { // Startup is finished, exit the startup tracing span drop(startup_context_guard); From d709bcba81d8544a725e07b7e9cf598644ef9989 Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Wed, 1 May 2024 12:01:18 -0700 Subject: [PATCH 0722/1571] compute_ctl: Break up main() into discrete phases (#7577) This commit is intentionally designed to have as small a diff as possible. To that end, the basic idea is that each distinct "chunk" of the previous main() has been wrapped in its own function, with the return values from each function being passed directly into the next. The structure of main() is now visible from its contents, which have a handful of smaller functions. There's a lot of other work that can / should(?) be done beyond this, but I figure that's more opinionated, and this should be a solid start. 
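For illustration, the overall shape of the pattern is sketched below. The names here are invented for the example; the real phase functions and result structs (`process_cli`, `wait_spec`, `start_postgres`, and friends) are in the diff that follows. Each phase returns a plain result struct, and the next phase destructures that struct in its parameter list, so `main()` reads as a linear pipeline:

```rust
// Illustrative shape of the refactor; names are made up, not compute_ctl's.
use anyhow::Result;

struct SetupResult {
    config: String,
}

struct RunResult {
    config: String,
    handle: u32,
}

fn setup_phase() -> Result<SetupResult> {
    Ok(SetupResult { config: "parsed args and spec".to_string() })
}

// Destructuring the previous phase's struct in the parameter list keeps the
// data flow between phases explicit, mirroring the style used in the diff.
fn run_phase(SetupResult { config }: SetupResult) -> Result<RunResult> {
    Ok(RunResult { config, handle: 42 })
}

fn teardown_phase(RunResult { config, handle }: RunResult) -> Result<()> {
    println!("waited on {handle}, started from {config}");
    Ok(())
}

fn main() -> Result<()> {
    // The structure of main() is now visible from its contents:
    let setup = setup_phase()?;
    let run = run_phase(setup)?;
    teardown_phase(run)
}
```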
Co-authored-by: Heikki Linnakangas --- compute_tools/src/bin/compute_ctl.rs | 180 ++++++++++++++++++++++++++- 1 file changed, 176 insertions(+), 4 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 9760cfb2a3..cc6c12e493 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -51,6 +51,7 @@ use tracing::{error, info, warn}; use url::Url; use compute_api::responses::ComputeStatus; +use compute_api::spec::ComputeSpec; use compute_tools::compute::{ forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID, @@ -69,6 +70,30 @@ use compute_tools::swap::resize_swap; const BUILD_TAG_DEFAULT: &str = "latest"; fn main() -> Result<()> { + let (build_tag, clap_args) = init()?; + + let cli_args = process_cli(&clap_args)?; + + // Enter startup tracing context + let startup_context_guard = startup_context_from_env(); + + let cli_spec = try_spec_from_cli(&clap_args, &cli_args)?; + + let wait_spec_result = wait_spec(build_tag, cli_args, cli_spec)?; + + let (pg_handle, start_pg_result) = start_postgres(&clap_args, wait_spec_result)?; + + // PostgreSQL is now running, if startup was successful. Wait until it exits. + let wait_pg_result = wait_postgres(pg_handle, startup_context_guard)?; + + let delay_exit = cleanup_after_postgres_exit(start_pg_result)?; + + maybe_delay_exit(delay_exit); + + deinit_and_exit(wait_pg_result); +} + +fn init() -> Result<(String, clap::ArgMatches)> { init_tracing_and_logging(DEFAULT_LOG_LEVEL)?; let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?; @@ -83,7 +108,10 @@ fn main() -> Result<()> { .to_string(); info!("build_tag: {build_tag}"); - let matches = cli().get_matches(); + Ok((build_tag, cli().get_matches())) +} + +fn process_cli(matches: &clap::ArgMatches) -> Result { let pgbin_default = "postgres"; let pgbin = matches .get_one::("pgbin") @@ -116,6 +144,30 @@ fn main() -> Result<()> { let spec_path = matches.get_one::("spec-path"); let resize_swap_on_bind = matches.get_flag("resize-swap-on-bind"); + Ok(ProcessCliResult { + connstr, + pgdata, + pgbin, + ext_remote_storage, + http_port, + spec_json, + spec_path, + resize_swap_on_bind, + }) +} + +struct ProcessCliResult<'clap> { + connstr: &'clap str, + pgdata: &'clap str, + pgbin: &'clap str, + ext_remote_storage: Option<&'clap str>, + http_port: u16, + spec_json: Option<&'clap String>, + spec_path: Option<&'clap String>, + resize_swap_on_bind: bool, +} + +fn startup_context_from_env() -> Option { // Extract OpenTelemetry context for the startup actions from the // TRACEPARENT and TRACESTATE env variables, and attach it to the current // tracing context. @@ -152,7 +204,7 @@ fn main() -> Result<()> { if let Ok(val) = std::env::var("TRACESTATE") { startup_tracing_carrier.insert("tracestate".to_string(), val); } - let startup_context_guard = if !startup_tracing_carrier.is_empty() { + if !startup_tracing_carrier.is_empty() { use opentelemetry::propagation::TextMapPropagator; use opentelemetry::sdk::propagation::TraceContextPropagator; let guard = TraceContextPropagator::new() @@ -162,8 +214,17 @@ fn main() -> Result<()> { Some(guard) } else { None - }; + } +} +fn try_spec_from_cli( + matches: &clap::ArgMatches, + ProcessCliResult { + spec_json, + spec_path, + .. 
+ }: &ProcessCliResult, +) -> Result { let compute_id = matches.get_one::("compute-id"); let control_plane_uri = matches.get_one::("control-plane-uri"); @@ -204,6 +265,34 @@ fn main() -> Result<()> { } }; + Ok(CliSpecParams { + spec, + live_config_allowed, + }) +} + +struct CliSpecParams { + /// If a spec was provided via CLI or file, the [`ComputeSpec`] + spec: Option, + live_config_allowed: bool, +} + +fn wait_spec( + build_tag: String, + ProcessCliResult { + connstr, + pgdata, + pgbin, + ext_remote_storage, + resize_swap_on_bind, + http_port, + .. + }: ProcessCliResult, + CliSpecParams { + spec, + live_config_allowed, + }: CliSpecParams, +) -> Result { let mut new_state = ComputeState::new(); let spec_set; @@ -270,6 +359,28 @@ fn main() -> Result<()> { state.start_time = now; } + Ok(WaitSpecResult { + compute, + http_port, + resize_swap_on_bind, + }) +} + +struct WaitSpecResult { + compute: Arc, + // passed through from ProcessCliResult + http_port: u16, + resize_swap_on_bind: bool, +} + +fn start_postgres( + matches: &clap::ArgMatches, + WaitSpecResult { + compute, + http_port, + resize_swap_on_bind, + }: WaitSpecResult, +) -> Result<(Option, StartPostgresResult)> { // We got all we need, update the state. let mut state = compute.state.lock().unwrap(); state.status = ComputeStatus::Init; @@ -377,7 +488,7 @@ fn main() -> Result<()> { // This token is used internally by the monitor to clean up all threads let token = CancellationToken::new(); - let vm_monitor = &rt.as_ref().map(|rt| { + let vm_monitor = rt.as_ref().map(|rt| { rt.spawn(vm_monitor::start( Box::leak(Box::new(vm_monitor::Args { cgroup: cgroup.cloned(), @@ -390,11 +501,47 @@ fn main() -> Result<()> { } } + Ok(( + pg, + StartPostgresResult { + delay_exit, + compute, + #[cfg(target_os = "linux")] + rt, + #[cfg(target_os = "linux")] + token, + #[cfg(target_os = "linux")] + vm_monitor, + }, + )) +} + +type PostgresHandle = (std::process::Child, std::thread::JoinHandle<()>); + +struct StartPostgresResult { + delay_exit: bool, + // passed through from WaitSpecResult + compute: Arc, + + #[cfg(target_os = "linux")] + rt: Option, + #[cfg(target_os = "linux")] + token: tokio_util::sync::CancellationToken, + #[cfg(target_os = "linux")] + vm_monitor: Option>>, +} + +fn wait_postgres( + pg: Option, + startup_context_guard: Option, +) -> Result { // Wait for the child Postgres process forever. In this state Ctrl+C will // propagate to Postgres and it will be shut down as well. let mut exit_code = None; if let Some((mut pg, logs_handle)) = pg { // Startup is finished, exit the startup tracing span + // TODO: Probably easier to drop startup_context_guard outside this function. It's here + // right now because keeping it here reduced the size of the diff. drop(startup_context_guard); let ecode = pg @@ -411,6 +558,25 @@ fn main() -> Result<()> { exit_code = ecode.code() } + Ok(WaitPostgresResult { exit_code }) +} + +struct WaitPostgresResult { + exit_code: Option, +} + +fn cleanup_after_postgres_exit( + StartPostgresResult { + mut delay_exit, + compute, + #[cfg(target_os = "linux")] + vm_monitor, + #[cfg(target_os = "linux")] + token, + #[cfg(target_os = "linux")] + rt, + }: StartPostgresResult, +) -> Result { // Terminate the vm_monitor so it releases the file watcher on // /sys/fs/cgroup/neon-postgres. // Note: the vm-monitor only runs on linux because it requires cgroups. 
@@ -452,13 +618,19 @@ fn main() -> Result<()> { error!("error while checking for core dumps: {err:?}"); } + Ok(delay_exit) +} + +fn maybe_delay_exit(delay_exit: bool) { // If launch failed, keep serving HTTP requests for a while, so the cloud // control plane can get the actual error. if delay_exit { info!("giving control plane 30s to collect the error before shutdown"); thread::sleep(Duration::from_secs(30)); } +} +fn deinit_and_exit(WaitPostgresResult { exit_code }: WaitPostgresResult) -> ! { // Shutdown trace pipeline gracefully, so that it has a chance to send any // pending traces before we exit. Shutting down OTEL tracing provider may // hang for quite some time, see, for example: From 26b14832049dec5eebf722390bdf842243106fbc Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Fri, 3 May 2024 17:35:01 -0700 Subject: [PATCH 0723/1571] compute_ctl: Lift drop(startup_context_guard) into main() (#7577) Part of applying the changes from #7600. This piece *technically* can change the semantics because now the context guard is held before process_cli, but... the difference is likely quite small. Co-authored-by: Heikki Linnakangas --- compute_tools/src/bin/compute_ctl.rs | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index cc6c12e493..8fa7ed547b 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -72,19 +72,23 @@ const BUILD_TAG_DEFAULT: &str = "latest"; fn main() -> Result<()> { let (build_tag, clap_args) = init()?; - let cli_args = process_cli(&clap_args)?; + let (pg_handle, start_pg_result) = { + // Enter startup tracing context + let _startup_context_guard = startup_context_from_env(); - // Enter startup tracing context - let startup_context_guard = startup_context_from_env(); + let cli_args = process_cli(&clap_args)?; - let cli_spec = try_spec_from_cli(&clap_args, &cli_args)?; + let cli_spec = try_spec_from_cli(&clap_args, &cli_args)?; - let wait_spec_result = wait_spec(build_tag, cli_args, cli_spec)?; + let wait_spec_result = wait_spec(build_tag, cli_args, cli_spec)?; - let (pg_handle, start_pg_result) = start_postgres(&clap_args, wait_spec_result)?; + start_postgres(&clap_args, wait_spec_result)? + + // Startup is finished, exit the startup tracing span + }; // PostgreSQL is now running, if startup was successful. Wait until it exits. - let wait_pg_result = wait_postgres(pg_handle, startup_context_guard)?; + let wait_pg_result = wait_postgres(pg_handle)?; let delay_exit = cleanup_after_postgres_exit(start_pg_result)?; @@ -531,19 +535,11 @@ struct StartPostgresResult { vm_monitor: Option>>, } -fn wait_postgres( - pg: Option, - startup_context_guard: Option, -) -> Result { +fn wait_postgres(pg: Option) -> Result { // Wait for the child Postgres process forever. In this state Ctrl+C will // propagate to Postgres and it will be shut down as well. let mut exit_code = None; if let Some((mut pg, logs_handle)) = pg { - // Startup is finished, exit the startup tracing span - // TODO: Probably easier to drop startup_context_guard outside this function. It's here - // right now because keeping it here reduced the size of the diff. 
- drop(startup_context_guard); - let ecode = pg .wait() .expect("failed to start waiting on Postgres process"); From b827e7b3306663326b7d4f8c472576e65a5ce934 Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Tue, 7 May 2024 15:35:23 -0700 Subject: [PATCH 0724/1571] compute_ctl: Fix unused variable on non-Linux (#7646) Introduced by refactorings from #7577. See an example check-macos-build failure here: https://github.com/neondatabase/neon/actions/runs/8992211409/job/24701531264 --- compute_tools/src/bin/compute_ctl.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 8fa7ed547b..9295f091d5 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -378,7 +378,8 @@ struct WaitSpecResult { } fn start_postgres( - matches: &clap::ArgMatches, + // need to allow unused because `matches` is only used if target_os = "linux" + #[allow(unused_variables)] matches: &clap::ArgMatches, WaitSpecResult { compute, http_port, From 586e77bb24b331e0a61dae654fedcd62d77b2221 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 8 May 2024 08:50:34 +0100 Subject: [PATCH 0725/1571] tests: common log allow list for ancestor detach tests (#7645) These log lines were repeated, and `test_detached_receives_flushes_while_being_detached` had an incomplete definition. Example failure: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-7531/8989511410/index.html#suites/a1c2be32556270764423c495fad75d47/992897d3a3369210 --- .../regress/test_timeline_detach_ancestor.py | 25 +++++++------------ 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 5abb3e28e4..b8a88ca6df 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -48,6 +48,12 @@ class Branchpoint(str, enum.Enum): ] +SHUTDOWN_ALLOWED_ERRORS = [ + ".*initial size calculation failed: downloading failed, possibly for shutdown", + ".*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", +] + + @pytest.mark.parametrize("branchpoint", Branchpoint.all()) @pytest.mark.parametrize("restart_after", [True, False]) def test_ancestor_detach_branched_from( @@ -61,12 +67,7 @@ def test_ancestor_detach_branched_from( env = neon_env_builder.init_start() - env.pageserver.allowed_errors.extend( - [ - ".*initial size calculation failed: downloading failed, possibly for shutdown", - ".*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", - ] - ) + env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) client = env.pageserver.http_client() @@ -208,13 +209,7 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder, res env = neon_env_builder.init_start() - env.pageserver.allowed_errors.extend( - [ - ".*initial size calculation failed: downloading failed, possibly for shutdown", - # after restart this is likely to happen if there is other load on the runner - ".*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", - ] - ) + env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) client = env.pageserver.http_client() @@ -396,9 +391,7 @@ def test_detached_receives_flushes_while_being_detached( with env.endpoints.create_start("new main", tenant_id=env.initial_tenant) as ep: 
assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows - env.pageserver.allowed_errors.append( - "initial size calculation failed: downloading failed, possibly for shutdown" - ) + env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) # TODO: From 02d42861e452b9a691b67dc0705ca4c73435f391 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 8 May 2024 11:03:29 +0200 Subject: [PATCH 0726/1571] `neon_local init`: write `pageserver.toml` directly; no `pageserver --init --config-override` (#7638) This does to `neon_local` what https://github.com/neondatabase/aws/pull/1322 does to our production deployment. After both are merged, there are no users of `pageserver --init` / `pageserver --config-override` left, and we can remove those flags eventually. --- control_plane/src/bin/neon_local.rs | 2 +- control_plane/src/local_env.rs | 4 ++ control_plane/src/pageserver.rs | 73 ++++++++++++++--------------- 3 files changed, 39 insertions(+), 40 deletions(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 3f09042d9d..179a756135 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -382,7 +382,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result { // Initialize pageserver, create initial tenant and timeline. for ps_conf in &env.pageservers { PageServerNode::from_env(&env, ps_conf) - .initialize(&pageserver_config) + .initialize(pageserver_config.clone()) .unwrap_or_else(|e| { eprintln!("pageserver init failed: {e:?}"); exit(1); diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 6437d04ec8..7abbbce95a 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -562,6 +562,10 @@ impl LocalEnv { fs::create_dir_all(SafekeeperNode::datadir_path_by_id(self, safekeeper.id))?; } + for ps in &self.pageservers { + fs::create_dir(self.pageserver_data_dir(ps.id))?; + } + self.persist_config(base_path) } diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 2179859023..6046c93bad 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -10,7 +10,7 @@ use std::io; use std::io::Write; use std::num::NonZeroU64; use std::path::PathBuf; -use std::process::Command; +use std::str::FromStr; use std::time::Duration; use anyhow::{bail, Context}; @@ -74,10 +74,12 @@ impl PageServerNode { } } - /// Merge overrides provided by the user on the command line with our default overides derived from neon_local configuration. - /// - /// These all end up on the command line of the `pageserver` binary. - fn neon_local_overrides(&self, cli_overrides: &toml_edit::Document) -> Vec { + fn pageserver_init_make_toml( + &self, + cli_overrides: toml_edit::Document, + ) -> anyhow::Result { + // TODO: this is a legacy code, it should be refactored to use toml_edit directly. + // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc. let pg_distrib_dir_param = format!( "pg_distrib_dir='{}'", @@ -172,12 +174,21 @@ impl PageServerNode { // Apply the user-provided overrides overrides.push(cli_overrides.to_string()); - overrides + // Turn `overrides` into a toml document. + // TODO: above code is legacy code, it should be refactored to use toml_edit directly. 
+ let mut config_toml = toml_edit::Document::new(); + for fragment_str in overrides { + let fragment = toml_edit::Document::from_str(&fragment_str) + .expect("all fragments in `overrides` are valid toml documents, this function controls that"); + for (key, item) in fragment.iter() { + config_toml.insert(key, item.clone()); + } + } + Ok(config_toml) } /// Initializes a pageserver node by creating its config with the overrides provided. - pub fn initialize(&self, config_overrides: &toml_edit::Document) -> anyhow::Result<()> { - // First, run `pageserver --init` and wait for it to write a config into FS and exit. + pub fn initialize(&self, config_overrides: toml_edit::Document) -> anyhow::Result<()> { self.pageserver_init(config_overrides) .with_context(|| format!("Failed to run init for pageserver node {}", self.conf.id)) } @@ -198,7 +209,7 @@ impl PageServerNode { self.start_node().await } - fn pageserver_init(&self, config_overrides: &toml_edit::Document) -> anyhow::Result<()> { + fn pageserver_init(&self, cli_overrides: toml_edit::Document) -> anyhow::Result<()> { let datadir = self.repo_path(); let node_id = self.conf.id; println!( @@ -209,36 +220,20 @@ impl PageServerNode { ); io::stdout().flush()?; - if !datadir.exists() { - std::fs::create_dir(&datadir)?; - } - - let datadir_path_str = datadir.to_str().with_context(|| { - format!("Cannot start pageserver node {node_id} in path that has no string representation: {datadir:?}") - })?; - - // `pageserver --init` merges the `--config-override`s into a built-in default config, - // then writes out the merged product to `pageserver.toml`. - // TODO: just write the full `pageserver.toml` and get rid of `--config-override`. - let mut args = vec!["--init", "--workdir", datadir_path_str]; - let overrides = self.neon_local_overrides(config_overrides); - for piece in &overrides { - args.push("--config-override"); - args.push(piece); - } - let init_output = Command::new(self.env.pageserver_bin()) - .args(args) - .envs(self.pageserver_env_variables()?) - .output() - .with_context(|| format!("Failed to run pageserver init for node {node_id}"))?; - - anyhow::ensure!( - init_output.status.success(), - "Pageserver init for node {} did not finish successfully, stdout: {}, stderr: {}", - node_id, - String::from_utf8_lossy(&init_output.stdout), - String::from_utf8_lossy(&init_output.stderr), - ); + let config = self + .pageserver_init_make_toml(cli_overrides) + .context("make pageserver toml")?; + let config_file_path = datadir.join("pageserver.toml"); + let mut config_file = std::fs::OpenOptions::new() + .create_new(true) + .write(true) + .open(&config_file_path) + .with_context(|| format!("open pageserver toml for write: {config_file_path:?}"))?; + config_file + .write_all(config.to_string().as_bytes()) + .context("write pageserver toml")?; + drop(config_file); + // TODO: invoke a TBD config-check command to validate that pageserver will start with the written config // Write metadata file, used by pageserver on startup to register itself with // the storage controller From e3a2631df9850d9eb3682b3a1765f93644425678 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 8 May 2024 11:33:41 +0100 Subject: [PATCH 0727/1571] proxy: do not invalidate cache for permit errors (#7652) ## Problem If a permit cannot be acquired to connect to compute, the cache is invalidated. This had the observed affect of sending more traffic to ProxyWakeCompute on cplane. ## Summary of changes Make sure that permit acquire failures are marked as "should not invalidate cache". 
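To make the intent concrete, here is a minimal, self-contained sketch of the rule this change enforces. The types are simplified stand-ins (the real `ApiLockError` and `should_retry_database_address` are in the diff below): a permit-acquisition failure never actually talked to the compute node, so it must neither invalidate the cached compute address nor count as evidence that the address is stale.

```rust
// Sketch only, not the proxy's real code: permit errors keep the cache intact.

#[derive(Debug)]
#[allow(dead_code)]
enum ApiLockError {
    AcquireError, // semaphore was closed
    TimeoutError, // permit not acquired in time
}

#[derive(Debug)]
enum ConnectionError {
    /// We reached the compute address and the connection failed.
    CouldNotConnect,
    /// We never reached compute: the connect permit could not be acquired.
    TooManyConnectionAttempts(ApiLockError),
}

impl ConnectionError {
    /// Should we drop the cached compute address and call wake_compute again?
    fn should_retry_database_address(&self) -> bool {
        match self {
            // The cached entry may be stale; re-resolving is justified.
            ConnectionError::CouldNotConnect => true,
            // The cached entry was never checked for validity, so keep it.
            ConnectionError::TooManyConnectionAttempts(_) => false,
        }
    }
}

fn main() {
    let permit_err = ConnectionError::TooManyConnectionAttempts(ApiLockError::TimeoutError);
    assert!(!permit_err.should_retry_database_address());
    assert!(ConnectionError::CouldNotConnect.should_retry_database_address());
    println!("permit failures do not invalidate the cached compute address");
}
```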
--- proxy/src/compute.rs | 9 ++++++- proxy/src/console/provider.rs | 43 +++++++++++++++++++++------------ proxy/src/proxy/retry.rs | 2 ++ proxy/src/proxy/wake_compute.rs | 2 +- proxy/src/serverless/backend.rs | 16 ++++++++++++ 5 files changed, 54 insertions(+), 18 deletions(-) diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 23266ac4ef..4433b3c1c2 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,7 +1,7 @@ use crate::{ auth::parse_endpoint_param, cancellation::CancelClosure, - console::{errors::WakeComputeError, messages::MetricsAuxInfo}, + console::{errors::WakeComputeError, messages::MetricsAuxInfo, provider::ApiLockError}, context::RequestMonitoring, error::{ReportableError, UserFacingError}, metrics::{Metrics, NumDbConnectionsGuard}, @@ -34,6 +34,9 @@ pub enum ConnectionError { #[error("{COULD_NOT_CONNECT}: {0}")] WakeComputeError(#[from] WakeComputeError), + + #[error("error acquiring resource permit: {0}")] + TooManyConnectionAttempts(#[from] ApiLockError), } impl UserFacingError for ConnectionError { @@ -57,6 +60,9 @@ impl UserFacingError for ConnectionError { None => err.to_string(), }, WakeComputeError(err) => err.to_string_client(), + TooManyConnectionAttempts(_) => { + "Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned() + } _ => COULD_NOT_CONNECT.to_owned(), } } @@ -72,6 +78,7 @@ impl ReportableError for ConnectionError { ConnectionError::CouldNotConnect(_) => crate::error::ErrorKind::Compute, ConnectionError::TlsError(_) => crate::error::ErrorKind::Compute, ConnectionError::WakeComputeError(e) => e.get_error_kind(), + ConnectionError::TooManyConnectionAttempts(e) => e.get_error_kind(), } } } diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index a05cf248f6..3b996cdbd1 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -12,6 +12,7 @@ use crate::{ compute, config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}, context::RequestMonitoring, + error::ReportableError, intern::ProjectIdInt, metrics::ApiLockMetrics, scram, EndpointCacheKey, @@ -30,6 +31,8 @@ pub mod errors { }; use thiserror::Error; + use super::ApiLockError; + /// A go-to error message which doesn't leak any detail. const REQUEST_FAILED: &str = "Console request failed"; @@ -211,8 +214,8 @@ pub mod errors { #[error("Too many connections attempts")] TooManyConnections, - #[error("Timeout waiting to acquire wake compute lock")] - TimeoutError, + #[error("error acquiring resource permit: {0}")] + TooManyConnectionAttempts(#[from] ApiLockError), } // This allows more useful interactions than `#[from]`. @@ -222,17 +225,6 @@ pub mod errors { } } - impl From for WakeComputeError { - fn from(_: tokio::sync::AcquireError) -> Self { - WakeComputeError::TimeoutError - } - } - impl From for WakeComputeError { - fn from(_: tokio::time::error::Elapsed) -> Self { - WakeComputeError::TimeoutError - } - } - impl UserFacingError for WakeComputeError { fn to_string_client(&self) -> String { use WakeComputeError::*; @@ -245,7 +237,9 @@ pub mod errors { TooManyConnections => self.to_string(), - TimeoutError => "timeout while acquiring the compute resource lock".to_owned(), + TooManyConnectionAttempts(_) => { + "Failed to acquire permit to connect to the database. 
Too many database connection attempts are currently ongoing.".to_owned() + } } } } @@ -256,7 +250,7 @@ pub mod errors { WakeComputeError::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane, WakeComputeError::ApiError(e) => e.get_error_kind(), WakeComputeError::TooManyConnections => crate::error::ErrorKind::RateLimit, - WakeComputeError::TimeoutError => crate::error::ErrorKind::ServiceRateLimit, + WakeComputeError::TooManyConnectionAttempts(e) => e.get_error_kind(), } } } @@ -456,6 +450,23 @@ pub struct ApiLocks { metrics: &'static ApiLockMetrics, } +#[derive(Debug, thiserror::Error)] +pub enum ApiLockError { + #[error("lock was closed")] + AcquireError(#[from] tokio::sync::AcquireError), + #[error("permit could not be acquired")] + TimeoutError(#[from] tokio::time::error::Elapsed), +} + +impl ReportableError for ApiLockError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + ApiLockError::AcquireError(_) => crate::error::ErrorKind::Service, + ApiLockError::TimeoutError(_) => crate::error::ErrorKind::RateLimit, + } + } +} + impl ApiLocks { pub fn new( name: &'static str, @@ -475,7 +486,7 @@ impl ApiLocks { }) } - pub async fn get_permit(&self, key: &K) -> Result { + pub async fn get_permit(&self, key: &K) -> Result { if self.permits == 0 { return Ok(WakeComputePermit { permit: None }); } diff --git a/proxy/src/proxy/retry.rs b/proxy/src/proxy/retry.rs index 36a05ba190..8dec1f1137 100644 --- a/proxy/src/proxy/retry.rs +++ b/proxy/src/proxy/retry.rs @@ -86,6 +86,8 @@ impl ShouldRetry for compute::ConnectionError { match self { compute::ConnectionError::Postgres(err) => err.should_retry_database_address(), compute::ConnectionError::CouldNotConnect(err) => err.should_retry_database_address(), + // the cache entry was not checked for validity + compute::ConnectionError::TooManyConnectionAttempts(_) => false, _ => true, } } diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index 3d9e94dd72..94b03e1ccc 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -119,7 +119,7 @@ fn report_error(e: &WakeComputeError, retry: bool) { WakeupFailureKind::ApiConsoleOtherError } WakeComputeError::TooManyConnections => WakeupFailureKind::ApiConsoleLocked, - WakeComputeError::TimeoutError => WakeupFailureKind::TimeoutError, + WakeComputeError::TooManyConnectionAttempts(_) => WakeupFailureKind::TimeoutError, }; Metrics::get() .proxy diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 963913a260..ce58f575e2 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -10,6 +10,7 @@ use crate::{ console::{ errors::{GetAuthInfoError, WakeComputeError}, locks::ApiLocks, + provider::ApiLockError, CachedNodeInfo, }, context::RequestMonitoring, @@ -131,6 +132,8 @@ pub enum HttpConnError { AuthError(#[from] AuthError), #[error("wake_compute returned error")] WakeCompute(#[from] WakeComputeError), + #[error("error acquiring resource permit: {0}")] + TooManyConnectionAttempts(#[from] ApiLockError), } impl ReportableError for HttpConnError { @@ -141,6 +144,7 @@ impl ReportableError for HttpConnError { HttpConnError::GetAuthInfo(a) => a.get_error_kind(), HttpConnError::AuthError(a) => a.get_error_kind(), HttpConnError::WakeCompute(w) => w.get_error_kind(), + HttpConnError::TooManyConnectionAttempts(w) => w.get_error_kind(), } } } @@ -153,6 +157,9 @@ impl UserFacingError for HttpConnError { HttpConnError::GetAuthInfo(c) => c.to_string_client(), HttpConnError::AuthError(c) => 
c.to_string_client(), HttpConnError::WakeCompute(c) => c.to_string_client(), + HttpConnError::TooManyConnectionAttempts(_) => { + "Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned() + } } } } @@ -165,6 +172,15 @@ impl ShouldRetry for HttpConnError { HttpConnError::GetAuthInfo(_) => false, HttpConnError::AuthError(_) => false, HttpConnError::WakeCompute(_) => false, + HttpConnError::TooManyConnectionAttempts(_) => false, + } + } + fn should_retry_database_address(&self) -> bool { + match self { + HttpConnError::ConnectionError(e) => e.should_retry_database_address(), + // we never checked cache validity + HttpConnError::TooManyConnectionAttempts(_) => false, + _ => true, } } } From b6d547cf92394cb3f8f73b23a769a3f4c241eec3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 8 May 2024 13:22:27 +0200 Subject: [PATCH 0728/1571] Tiered compaction: add order asserts after delta key k-merge (#7648) Adds ordering asserts to the output of the delta key iterator `MergeDeltaKeys` that implements a k-merge. Part of #7296 : the asserts added by this PR get hit in the reproducers of #7296 as well, but they are earlier in the pipeline. --- pageserver/compaction/src/helpers.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs index 1b80373ba7..eb0e5ee82a 100644 --- a/pageserver/compaction/src/helpers.rs +++ b/pageserver/compaction/src/helpers.rs @@ -9,6 +9,7 @@ use pageserver_api::shard::ShardIdentity; use pin_project_lite::pin_project; use std::collections::BinaryHeap; use std::collections::VecDeque; +use std::fmt::Display; use std::future::Future; use std::ops::{DerefMut, Range}; use std::pin::Pin; @@ -214,7 +215,7 @@ pub struct KeySize { pub fn accum_key_values<'a, I, K, D, E>(input: I) -> impl Stream, E>> where - K: Eq, + K: Eq + PartialOrd + Display + Copy, I: Stream>, D: CompactionDeltaEntry<'a, K>, { @@ -229,12 +230,15 @@ where num_values: 1, size: first.size(), }; + let mut last_key = accum.key; while let Some(this) = input.next().await { let this = this?; if this.key() == accum.key { accum.size += this.size(); accum.num_values += 1; } else { + assert!(last_key <= accum.key, "last_key={last_key} <= accum.key={}", accum.key); + last_key = accum.key; yield accum; accum = KeySize { key: this.key(), @@ -243,6 +247,7 @@ where }; } } + assert!(last_key <= accum.key, "last_key={last_key} <= accum.key={}", accum.key); yield accum; } } From 870786bd8214480d8ce4aa56706cd8606b07ef15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 8 May 2024 13:22:55 +0200 Subject: [PATCH 0729/1571] Improve tiered compaction tests (#7643) Improves the tiered compaction tests: * Adds a new test that is a simpler version of the ignored `test_many_updates_for_single_key` test. * Reduces the amount of data that `test_many_updates_for_single_key` processes to make it execute more quickly. * Adds logging support. 
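The logging support is the usual run-once test initializer, so every test can call it without double-initializing the subscriber. A minimal std-only sketch of that pattern (the actual helper uses `once_cell::sync::OnceCell` and `utils::logging`, as shown in the diff below):

```rust
// Sketch of an idempotent test-logging initializer using std's OnceLock.
use std::sync::OnceLock;

static LOG_INIT: OnceLock<()> = OnceLock::new();

fn setup_logging() {
    LOG_INIT.get_or_init(|| {
        // In the real tests this body is logging::init(...); here we only
        // mark that the one-time initialization ran.
        eprintln!("logging initialized once for the whole test binary");
    });
}

fn main() {
    // Calling it repeatedly (as each test does) runs the body only once.
    setup_logging();
    setup_logging();
    setup_logging();
}
```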
--- pageserver/compaction/tests/tests.rs | 45 ++++++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 3 deletions(-) diff --git a/pageserver/compaction/tests/tests.rs b/pageserver/compaction/tests/tests.rs index 1cea2a20e1..7aa20e6863 100644 --- a/pageserver/compaction/tests/tests.rs +++ b/pageserver/compaction/tests/tests.rs @@ -1,5 +1,20 @@ +use once_cell::sync::OnceCell; use pageserver_compaction::interface::CompactionLayer; use pageserver_compaction::simulator::MockTimeline; +use utils::logging; + +static LOG_HANDLE: OnceCell<()> = OnceCell::new(); + +pub(crate) fn setup_logging() { + LOG_HANDLE.get_or_init(|| { + logging::init( + logging::LogFormat::Test, + logging::TracingErrorLayerEnablement::EnableWithRustLogFilter, + logging::Output::Stdout, + ) + .expect("Failed to init test logging") + }); +} /// Test the extreme case that there are so many updates for a single key that /// even if we produce an extremely narrow delta layer, spanning just that one @@ -11,13 +26,14 @@ use pageserver_compaction::simulator::MockTimeline; #[ignore] #[tokio::test] async fn test_many_updates_for_single_key() { + setup_logging(); let mut executor = MockTimeline::new(); - executor.target_file_size = 10_000_000; // 10 MB + executor.target_file_size = 1_000_000; // 1 MB - // Ingest 100 MB of updates to a single key. + // Ingest 10 MB of updates to a single key. for _ in 1..1000 { executor.ingest_uniform(100, 10, &(0..100_000)).unwrap(); - executor.ingest_uniform(10_000, 10, &(0..1)).unwrap(); + executor.ingest_uniform(1000, 10, &(0..1)).unwrap(); executor.compact().await.unwrap(); } @@ -33,3 +49,26 @@ async fn test_many_updates_for_single_key() { } } } + +#[tokio::test] +async fn test_simple_updates() { + setup_logging(); + let mut executor = MockTimeline::new(); + executor.target_file_size = 500_000; // 500 KB + + // Ingest some traffic. + for _ in 1..400 { + executor.ingest_uniform(100, 500, &(0..100_000)).unwrap(); + } + + for l in executor.live_layers.iter() { + println!("layer {}: {}", l.short_id(), l.file_size()); + } + + println!("Running compaction..."); + executor.compact().await.unwrap(); + + for l in executor.live_layers.iter() { + println!("layer {}: {}", l.short_id(), l.file_size()); + } +} From a4a4d78993781e7aa723c1df6b833435c2fb2e8c Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 8 May 2024 12:26:56 +0100 Subject: [PATCH 0730/1571] build(deps): bump moto from 4.1.2 to 5.0.6 (#7653) ## Problem The main point of this PR is to get rid of `python-jose` and `ecdsa` packages as transitive dependencies through `moto`. They have a bunch of open vulnerabilities[1][2][3] (which don't affect us directly), but it's nice not to have them at all. 
- [1] https://github.com/advisories/GHSA-wj6h-64fc-37mp - [2] https://github.com/advisories/GHSA-6c5p-j8vq-pqhj - [3] https://github.com/advisories/GHSA-cjwg-qfpm-7377 ## Summary of changes - Update `moto` from 4.1.2 to 5.0.6 - Update code to accommodate breaking changes in `moto_server` --- poetry.lock | 585 +++++++++++++++++++------ pyproject.toml | 2 +- test_runner/fixtures/remote_storage.py | 2 +- 3 files changed, 443 insertions(+), 146 deletions(-) diff --git a/poetry.lock b/poetry.lock index e437f5de74..ef9f572b17 100644 --- a/poetry.lock +++ b/poetry.lock @@ -158,6 +158,28 @@ files = [ attrs = ">=16.0.0" pluggy = ">=0.4.0" +[[package]] +name = "annotated-types" +version = "0.6.0" +description = "Reusable constraint types to use with typing.Annotated" +optional = false +python-versions = ">=3.8" +files = [ + {file = "annotated_types-0.6.0-py3-none-any.whl", hash = "sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43"}, + {file = "annotated_types-0.6.0.tar.gz", hash = "sha256:563339e807e53ffd9c267e99fc6d9ea23eb8443c08f112651963e24e22f84a5d"}, +] + +[[package]] +name = "antlr4-python3-runtime" +version = "4.13.1" +description = "ANTLR 4.13.1 runtime for Python 3" +optional = false +python-versions = "*" +files = [ + {file = "antlr4-python3-runtime-4.13.1.tar.gz", hash = "sha256:3cd282f5ea7cfb841537fe01f143350fdb1c0b1ce7981443a2fa8513fddb6d1a"}, + {file = "antlr4_python3_runtime-4.13.1-py3-none-any.whl", hash = "sha256:78ec57aad12c97ac039ca27403ad61cb98aaec8a3f9bb8144f889aa0fa28b943"}, +] + [[package]] name = "anyio" version = "4.3.0" @@ -267,22 +289,23 @@ tests-no-zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy" [[package]] name = "aws-sam-translator" -version = "1.48.0" +version = "1.88.0" description = "AWS SAM Translator is a library that transform SAM templates into AWS CloudFormation templates" optional = false -python-versions = ">=3.7, <=4.0, !=4.0" +python-versions = "!=4.0,<=4.0,>=3.8" files = [ - {file = "aws-sam-translator-1.48.0.tar.gz", hash = "sha256:7171037323dfa30f8f73e9bccb9210e4c384a585e087219a9518a5204f0a2c44"}, - {file = "aws_sam_translator-1.48.0-py2-none-any.whl", hash = "sha256:be18dfa3dfe7ab291d281667c5f73ac62dbe6bfe86df7d122e4258b906b736f0"}, - {file = "aws_sam_translator-1.48.0-py3-none-any.whl", hash = "sha256:ca4f8f9910d7713aeaba59346775bfb3198f6acb47c6704572f9bd3fc0fb5bf0"}, + {file = "aws_sam_translator-1.88.0-py3-none-any.whl", hash = "sha256:aa93d498d8de3fb3d485c316155b1628144b823bbc176099a20de06df666fcac"}, + {file = "aws_sam_translator-1.88.0.tar.gz", hash = "sha256:e77c65f3488566122277accd44a0f1ec018e37403e0d5fe25120d96e537e91a7"}, ] [package.dependencies] boto3 = ">=1.19.5,<2.dev0" -jsonschema = ">=3.2,<4.0" +jsonschema = ">=3.2,<5" +pydantic = ">=1.8,<3" +typing-extensions = ">=4.4" [package.extras] -dev = ["black (==20.8b1)", "boto3 (>=1.23,<2)", "click (>=7.1,<8.0)", "coverage (>=5.3,<6.0)", "dateparser (>=0.7,<1.0)", "docopt (>=0.6.2,<0.7.0)", "flake8 (>=3.8.4,<3.9.0)", "parameterized (>=0.7.4,<0.8.0)", "pylint (>=2.9.0,<2.10.0)", "pytest (>=6.2.5,<6.3.0)", "pytest-cov (>=2.10.1,<2.11.0)", "pytest-env (>=0.6.2,<0.7.0)", "pytest-xdist (>=2.5,<3.0)", "pyyaml (>=5.4,<6.0)", "requests (>=2.24.0,<2.25.0)", "tenacity (>=7.0.0,<7.1.0)", "tox (>=3.24,<4.0)"] +dev = ["black (==24.3.0)", "boto3 (>=1.23,<2)", "boto3-stubs[appconfig,serverlessrepo] (>=1.19.5,<2.dev0)", "coverage (>=5.3,<8)", "dateparser (>=1.1,<2.0)", "mypy (>=1.3.0,<1.4.0)", "parameterized (>=0.7,<1.0)", "pytest (>=6.2,<8)", "pytest-cov 
(>=2.10,<5)", "pytest-env (>=0.6,<1)", "pytest-rerunfailures (>=9.1,<12)", "pytest-xdist (>=2.5,<4)", "pyyaml (>=6.0,<7.0)", "requests (>=2.28,<3.0)", "ruamel.yaml (==0.17.21)", "ruff (>=0.1.0,<0.2.0)", "tenacity (>=8.0,<9.0)", "types-PyYAML (>=6.0,<7.0)", "types-jsonschema (>=3.2,<4.0)"] [[package]] name = "aws-xray-sdk" @@ -798,24 +821,26 @@ pycparser = "*" [[package]] name = "cfn-lint" -version = "0.61.3" +version = "0.87.1" description = "Checks CloudFormation templates for practices and behaviour that could potentially be improved" optional = false -python-versions = ">=3.6, <=4.0, !=4.0" +python-versions = "!=4.0,<=4.0,>=3.8" files = [ - {file = "cfn-lint-0.61.3.tar.gz", hash = "sha256:3806e010d77901f5e935496df690c10e39676434a738fce1a1161cf9c7bd36a2"}, - {file = "cfn_lint-0.61.3-py3-none-any.whl", hash = "sha256:8e9522fad0c7c98b31ecbdd4724f8d8a5787457cc0f71e62ae0d11104d6e52ab"}, + {file = "cfn_lint-0.87.1-py3-none-any.whl", hash = "sha256:d450f450635fc223b6f66880ccac52a5fd1a52966fa1705f1ba52b88dfed3071"}, + {file = "cfn_lint-0.87.1.tar.gz", hash = "sha256:b3ce9d3e5e0eadcea5d584c8ccaa00bf2a990a36a64d7ffd8683bc60b7e4f06f"}, ] [package.dependencies] -aws-sam-translator = ">=1.47.0" +aws-sam-translator = ">=1.87.0" jschema-to-python = ">=1.2.3,<1.3.0" jsonpatch = "*" -jsonschema = ">=3.0,<4.0" +jsonschema = ">=3.0,<5" junit-xml = ">=1.9,<2.0" -networkx = ">=2.4,<3.0" +networkx = ">=2.4,<4" pyyaml = ">5.4" +regex = ">=2021.7.1" sarif-om = ">=1.0.4,<1.1.0" +sympy = ">=1.0.0" [[package]] name = "charset-normalizer" @@ -931,24 +956,6 @@ websocket-client = ">=0.32.0" ssh = ["paramiko (>=2.4.2)"] tls = ["cryptography (>=1.3.4)", "idna (>=2.0.0)", "pyOpenSSL (>=17.5.0)"] -[[package]] -name = "ecdsa" -version = "0.18.0" -description = "ECDSA cryptographic signature library (pure python)" -optional = false -python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" -files = [ - {file = "ecdsa-0.18.0-py2.py3-none-any.whl", hash = "sha256:80600258e7ed2f16b9aa1d7c295bd70194109ad5a30fdee0eaeefef1d4c559dd"}, - {file = "ecdsa-0.18.0.tar.gz", hash = "sha256:190348041559e21b22a1d65cee485282ca11a6f81d503fddb84d5017e9ed1e49"}, -] - -[package.dependencies] -six = ">=1.9.0" - -[package.extras] -gmpy = ["gmpy"] -gmpy2 = ["gmpy2"] - [[package]] name = "exceptiongroup" version = "1.1.1" @@ -1268,6 +1275,23 @@ files = [ {file = "jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"}, ] +[[package]] +name = "joserfc" +version = "0.9.0" +description = "The ultimate Python library for JOSE RFCs, including JWS, JWE, JWK, JWA, JWT" +optional = false +python-versions = ">=3.8" +files = [ + {file = "joserfc-0.9.0-py3-none-any.whl", hash = "sha256:4026bdbe2c196cd40574e916fa1e28874d99649412edaab0e373dec3077153fb"}, + {file = "joserfc-0.9.0.tar.gz", hash = "sha256:eebca7f587b1761ce43a98ffd5327f2b600b9aa5bb0a77b947687f503ad43bc0"}, +] + +[package.dependencies] +cryptography = "*" + +[package.extras] +drafts = ["pycryptodome"] + [[package]] name = "jschema-to-python" version = "1.2.3" @@ -1309,6 +1333,20 @@ files = [ [package.dependencies] jsonpointer = ">=1.9" +[[package]] +name = "jsonpath-ng" +version = "1.6.1" +description = "A final implementation of JSONPath for Python that aims to be standard compliant, including arithmetic and binary comparison operators and providing clear AST for metaprogramming." 
+optional = false +python-versions = "*" +files = [ + {file = "jsonpath-ng-1.6.1.tar.gz", hash = "sha256:086c37ba4917304850bd837aeab806670224d3f038fe2833ff593a672ef0a5fa"}, + {file = "jsonpath_ng-1.6.1-py3-none-any.whl", hash = "sha256:8f22cd8273d7772eea9aaa84d922e0841aa36fdb8a2c6b7f6c3791a16a9bc0be"}, +] + +[package.dependencies] +ply = "*" + [[package]] name = "jsonpickle" version = "2.2.0" @@ -1338,24 +1376,39 @@ files = [ [[package]] name = "jsonschema" -version = "3.2.0" +version = "4.17.3" description = "An implementation of JSON Schema validation for Python" optional = false -python-versions = "*" +python-versions = ">=3.7" files = [ - {file = "jsonschema-3.2.0-py2.py3-none-any.whl", hash = "sha256:4e5b3cf8216f577bee9ce139cbe72eca3ea4f292ec60928ff24758ce626cd163"}, - {file = "jsonschema-3.2.0.tar.gz", hash = "sha256:c8a85b28d377cc7737e46e2d9f2b4f44ee3c0e1deac6bf46ddefc7187d30797a"}, + {file = "jsonschema-4.17.3-py3-none-any.whl", hash = "sha256:a870ad254da1a8ca84b6a2905cac29d265f805acc57af304784962a2aa6508f6"}, + {file = "jsonschema-4.17.3.tar.gz", hash = "sha256:0f864437ab8b6076ba6707453ef8f98a6a0d512a80e93f8abdb676f737ecb60d"}, ] [package.dependencies] attrs = ">=17.4.0" -pyrsistent = ">=0.14.0" -setuptools = "*" -six = ">=1.11.0" +pyrsistent = ">=0.14.0,<0.17.0 || >0.17.0,<0.17.1 || >0.17.1,<0.17.2 || >0.17.2" [package.extras] -format = ["idna", "jsonpointer (>1.13)", "rfc3987", "strict-rfc3339", "webcolors"] -format-nongpl = ["idna", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "webcolors"] +format = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3987", "uri-template", "webcolors (>=1.11)"] +format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "uri-template", "webcolors (>=1.11)"] + +[[package]] +name = "jsonschema-spec" +version = "0.1.6" +description = "JSONSchema Spec with object-oriented paths" +optional = false +python-versions = ">=3.7.0,<4.0.0" +files = [ + {file = "jsonschema_spec-0.1.6-py3-none-any.whl", hash = "sha256:f2206d18c89d1824c1f775ba14ed039743b41a9167bd2c5bdb774b66b3ca0bbf"}, + {file = "jsonschema_spec-0.1.6.tar.gz", hash = "sha256:90215863b56e212086641956b20127ccbf6d8a3a38343dad01d6a74d19482f76"}, +] + +[package.dependencies] +jsonschema = ">=4.0.0,<4.18.0" +pathable = ">=0.4.1,<0.5.0" +PyYAML = ">=5.1" +requests = ">=2.31.0,<3.0.0" [[package]] name = "junit-xml" @@ -1371,6 +1424,52 @@ files = [ [package.dependencies] six = "*" +[[package]] +name = "lazy-object-proxy" +version = "1.10.0" +description = "A fast and thorough lazy object proxy." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "lazy-object-proxy-1.10.0.tar.gz", hash = "sha256:78247b6d45f43a52ef35c25b5581459e85117225408a4128a3daf8bf9648ac69"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:855e068b0358ab916454464a884779c7ffa312b8925c6f7401e952dcf3b89977"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ab7004cf2e59f7c2e4345604a3e6ea0d92ac44e1c2375527d56492014e690c3"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dc0d2fc424e54c70c4bc06787e4072c4f3b1aa2f897dfdc34ce1013cf3ceef05"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e2adb09778797da09d2b5ebdbceebf7dd32e2c96f79da9052b2e87b6ea495895"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b1f711e2c6dcd4edd372cf5dec5c5a30d23bba06ee012093267b3376c079ec83"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-win32.whl", hash = "sha256:76a095cfe6045c7d0ca77db9934e8f7b71b14645f0094ffcd842349ada5c5fb9"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:b4f87d4ed9064b2628da63830986c3d2dca7501e6018347798313fcf028e2fd4"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:fec03caabbc6b59ea4a638bee5fce7117be8e99a4103d9d5ad77f15d6f81020c"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:02c83f957782cbbe8136bee26416686a6ae998c7b6191711a04da776dc9e47d4"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:009e6bb1f1935a62889ddc8541514b6a9e1fcf302667dcb049a0be5c8f613e56"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:75fc59fc450050b1b3c203c35020bc41bd2695ed692a392924c6ce180c6f1dc9"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:782e2c9b2aab1708ffb07d4bf377d12901d7a1d99e5e410d648d892f8967ab1f"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-win32.whl", hash = "sha256:edb45bb8278574710e68a6b021599a10ce730d156e5b254941754a9cc0b17d03"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:e271058822765ad5e3bca7f05f2ace0de58a3f4e62045a8c90a0dfd2f8ad8cc6"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:e98c8af98d5707dcdecc9ab0863c0ea6e88545d42ca7c3feffb6b4d1e370c7ba"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:952c81d415b9b80ea261d2372d2a4a2332a3890c2b83e0535f263ddfe43f0d43"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80b39d3a151309efc8cc48675918891b865bdf742a8616a337cb0090791a0de9"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e221060b701e2aa2ea991542900dd13907a5c90fa80e199dbf5a03359019e7a3"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:92f09ff65ecff3108e56526f9e2481b8116c0b9e1425325e13245abfd79bdb1b"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-win32.whl", hash = "sha256:3ad54b9ddbe20ae9f7c1b29e52f123120772b06dbb18ec6be9101369d63a4074"}, + {file = 
"lazy_object_proxy-1.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:127a789c75151db6af398b8972178afe6bda7d6f68730c057fbbc2e96b08d282"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9e4ed0518a14dd26092614412936920ad081a424bdcb54cc13349a8e2c6d106a"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5ad9e6ed739285919aa9661a5bbed0aaf410aa60231373c5579c6b4801bd883c"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2fc0a92c02fa1ca1e84fc60fa258458e5bf89d90a1ddaeb8ed9cc3147f417255"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:0aefc7591920bbd360d57ea03c995cebc204b424524a5bd78406f6e1b8b2a5d8"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:5faf03a7d8942bb4476e3b62fd0f4cf94eaf4618e304a19865abf89a35c0bbee"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-win32.whl", hash = "sha256:e333e2324307a7b5d86adfa835bb500ee70bfcd1447384a822e96495796b0ca4"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-win_amd64.whl", hash = "sha256:cb73507defd385b7705c599a94474b1d5222a508e502553ef94114a143ec6696"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:366c32fe5355ef5fc8a232c5436f4cc66e9d3e8967c01fb2e6302fd6627e3d94"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2297f08f08a2bb0d32a4265e98a006643cd7233fb7983032bd61ac7a02956b3b"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:18dd842b49456aaa9a7cf535b04ca4571a302ff72ed8740d06b5adcd41fe0757"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:217138197c170a2a74ca0e05bddcd5f1796c735c37d0eee33e43259b192aa424"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:9a3a87cf1e133e5b1994144c12ca4aa3d9698517fe1e2ca82977781b16955658"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-win32.whl", hash = "sha256:30b339b2a743c5288405aa79a69e706a06e02958eab31859f7f3c04980853b70"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-win_amd64.whl", hash = "sha256:a899b10e17743683b293a729d3a11f2f399e8a90c73b089e29f5d0fe3509f0dd"}, + {file = "lazy_object_proxy-1.10.0-pp310.pp311.pp312.pp38.pp39-none-any.whl", hash = "sha256:80fa48bd89c8f2f456fc0765c11c23bf5af827febacd2f523ca5bc1893fcc09d"}, +] + [[package]] name = "markupsafe" version = "2.1.1" @@ -1422,64 +1521,80 @@ files = [ [[package]] name = "moto" -version = "4.1.2" +version = "5.0.6" description = "" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "moto-4.1.2-py2.py3-none-any.whl", hash = "sha256:1b361ece638c74a657325378a259276f368aafce2f8be84f8143e69fa93ce8ec"}, - {file = "moto-4.1.2.tar.gz", hash = "sha256:63431733d2a02c7bd652ad71ec1da442a0e0d580cbac5eeb50d440a2ce066eac"}, + {file = "moto-5.0.6-py2.py3-none-any.whl", hash = "sha256:ca1e22831a741733b581ff2ef4d6ae2e1c6db1eab97af1b78b86ca2c6e88c609"}, + {file = "moto-5.0.6.tar.gz", hash = "sha256:ad8b23f2b555ad694da8b2432a42b6d96beaaf67a4e7d932196a72193a2eee2c"}, ] [package.dependencies] +antlr4-python3-runtime = {version = "*", optional = true, markers = "extra == \"server\""} aws-xray-sdk = {version = ">=0.93,<0.96 || >0.96", optional = true, markers = "extra == \"server\""} 
boto3 = ">=1.9.201" -botocore = ">=1.12.201" +botocore = ">=1.14.0" cfn-lint = {version = ">=0.40.0", optional = true, markers = "extra == \"server\""} cryptography = ">=3.3.1" -docker = {version = ">=2.5.1", optional = true, markers = "extra == \"server\""} -ecdsa = {version = "!=0.15", optional = true, markers = "extra == \"server\""} +docker = {version = ">=3.0.0", optional = true, markers = "extra == \"server\""} flask = {version = "<2.2.0 || >2.2.0,<2.2.1 || >2.2.1", optional = true, markers = "extra == \"server\""} flask-cors = {version = "*", optional = true, markers = "extra == \"server\""} graphql-core = {version = "*", optional = true, markers = "extra == \"server\""} Jinja2 = ">=2.10.1" +joserfc = {version = ">=0.9.0", optional = true, markers = "extra == \"server\""} jsondiff = {version = ">=1.1.2", optional = true, markers = "extra == \"server\""} -openapi-spec-validator = {version = ">=0.2.8", optional = true, markers = "extra == \"server\""} +jsonpath-ng = {version = "*", optional = true, markers = "extra == \"server\""} +openapi-spec-validator = {version = ">=0.5.0", optional = true, markers = "extra == \"server\""} +py-partiql-parser = {version = "0.5.4", optional = true, markers = "extra == \"server\""} pyparsing = {version = ">=3.0.7", optional = true, markers = "extra == \"server\""} python-dateutil = ">=2.1,<3.0.0" -python-jose = {version = ">=3.1.0,<4.0.0", extras = ["cryptography"], optional = true, markers = "extra == \"server\""} PyYAML = {version = ">=5.1", optional = true, markers = "extra == \"server\""} requests = ">=2.5" -responses = ">=0.13.0" +responses = ">=0.15.0" setuptools = {version = "*", optional = true, markers = "extra == \"server\""} -sshpubkeys = {version = ">=3.1.0", optional = true, markers = "extra == \"server\""} werkzeug = ">=0.5,<2.2.0 || >2.2.0,<2.2.1 || >2.2.1" xmltodict = "*" [package.extras] -all = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] -apigateway = ["PyYAML (>=5.1)", "ecdsa (!=0.15)", "openapi-spec-validator (>=0.2.8)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"] -apigatewayv2 = ["PyYAML (>=5.1)"] +all = ["PyYAML (>=5.1)", "antlr4-python3-runtime", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "jsonpath-ng", "multipart", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)", "setuptools"] +apigateway = ["PyYAML (>=5.1)", "joserfc (>=0.9.0)", "openapi-spec-validator (>=0.5.0)"] +apigatewayv2 = ["PyYAML (>=5.1)", "openapi-spec-validator (>=0.5.0)"] appsync = ["graphql-core"] -awslambda = ["docker (>=2.5.1)"] -batch = ["docker (>=2.5.1)"] -cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] -cognitoidp = ["ecdsa (!=0.15)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"] -ds = ["sshpubkeys (>=3.1.0)"] -dynamodb = ["docker (>=2.5.1)"] -dynamodbstreams = ["docker (>=2.5.1)"] -ebs = ["sshpubkeys (>=3.1.0)"] -ec2 = ["sshpubkeys (>=3.1.0)"] -efs = ["sshpubkeys (>=3.1.0)"] -eks = ["sshpubkeys (>=3.1.0)"] +awslambda = 
["docker (>=3.0.0)"] +batch = ["docker (>=3.0.0)"] +cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)", "setuptools"] +cognitoidp = ["joserfc (>=0.9.0)"] +dynamodb = ["docker (>=3.0.0)", "py-partiql-parser (==0.5.4)"] +dynamodbstreams = ["docker (>=3.0.0)", "py-partiql-parser (==0.5.4)"] glue = ["pyparsing (>=3.0.7)"] iotdata = ["jsondiff (>=1.1.2)"] -route53resolver = ["sshpubkeys (>=3.1.0)"] -s3 = ["PyYAML (>=5.1)"] -server = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "flask (!=2.2.0,!=2.2.1)", "flask-cors", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] +proxy = ["PyYAML (>=5.1)", "antlr4-python3-runtime", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "jsonpath-ng", "multipart", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)", "setuptools"] +resourcegroupstaggingapi = ["PyYAML (>=5.1)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)"] +s3 = ["PyYAML (>=5.1)", "py-partiql-parser (==0.5.4)"] +s3crc32c = ["PyYAML (>=5.1)", "crc32c", "py-partiql-parser (==0.5.4)"] +server = ["PyYAML (>=5.1)", "antlr4-python3-runtime", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "flask (!=2.2.0,!=2.2.1)", "flask-cors", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "jsonpath-ng", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)", "setuptools"] ssm = ["PyYAML (>=5.1)"] +stepfunctions = ["antlr4-python3-runtime", "jsonpath-ng"] xray = ["aws-xray-sdk (>=0.93,!=0.96)", "setuptools"] +[[package]] +name = "mpmath" +version = "1.3.0" +description = "Python library for arbitrary-precision floating-point arithmetic" +optional = false +python-versions = "*" +files = [ + {file = "mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c"}, + {file = "mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f"}, +] + +[package.extras] +develop = ["codecov", "pycodestyle", "pytest (>=4.6)", "pytest-cov", "wheel"] +docs = ["sphinx"] +gmpy = ["gmpy2 (>=2.1.0a4)"] +tests = ["pytest (>=4.6)"] + [[package]] name = "multidict" version = "6.0.4" @@ -1654,42 +1769,38 @@ test = ["codecov (>=2.1)", "pytest (>=7.1)", "pytest-cov (>=3.0)"] [[package]] name = "openapi-schema-validator" -version = "0.2.3" +version = "0.4.4" description = "OpenAPI schema validation for Python" optional = false python-versions = ">=3.7.0,<4.0.0" files = [ - {file = "openapi-schema-validator-0.2.3.tar.gz", hash = "sha256:2c64907728c3ef78e23711c8840a423f0b241588c9ed929855e4b2d1bb0cf5f2"}, - {file = "openapi_schema_validator-0.2.3-py3-none-any.whl", hash = "sha256:9bae709212a19222892cabcc60cafd903cbf4b220223f48583afa3c0e3cc6fc4"}, + {file = "openapi_schema_validator-0.4.4-py3-none-any.whl", hash = "sha256:79f37f38ef9fd5206b924ed7a6f382cea7b649b3b56383c47f1906082b7b9015"}, + {file = 
"openapi_schema_validator-0.4.4.tar.gz", hash = "sha256:c573e2be2c783abae56c5a1486ab716ca96e09d1c3eab56020d1dc680aa57bf8"}, ] [package.dependencies] -jsonschema = ">=3.0.0,<5.0.0" +jsonschema = ">=4.0.0,<4.18.0" +rfc3339-validator = "*" [package.extras] -isodate = ["isodate"] -rfc3339-validator = ["rfc3339-validator"] -strict-rfc3339 = ["strict-rfc3339"] +docs = ["sphinx (>=5.3.0,<6.0.0)", "sphinx-immaterial (>=0.11.0,<0.12.0)"] [[package]] name = "openapi-spec-validator" -version = "0.4.0" -description = "OpenAPI 2.0 (aka Swagger) and OpenAPI 3.0 spec validator" +version = "0.5.7" +description = "OpenAPI 2.0 (aka Swagger) and OpenAPI 3 spec validator" optional = false python-versions = ">=3.7.0,<4.0.0" files = [ - {file = "openapi-spec-validator-0.4.0.tar.gz", hash = "sha256:97f258850afc97b048f7c2653855e0f88fa66ac103c2be5077c7960aca2ad49a"}, - {file = "openapi_spec_validator-0.4.0-py3-none-any.whl", hash = "sha256:06900ac4d546a1df3642a779da0055be58869c598e3042a2fef067cfd99d04d0"}, + {file = "openapi_spec_validator-0.5.7-py3-none-any.whl", hash = "sha256:8712d2879db7692974ef89c47a3ebfc79436442921ec3a826ac0ce80cde8c549"}, + {file = "openapi_spec_validator-0.5.7.tar.gz", hash = "sha256:6c2d42180045a80fd6314de848b94310bdb0fa4949f4b099578b69f79d9fa5ac"}, ] [package.dependencies] -jsonschema = ">=3.2.0,<5.0.0" -openapi-schema-validator = ">=0.2.0,<0.3.0" -PyYAML = ">=5.1" -setuptools = "*" - -[package.extras] -requests = ["requests"] +jsonschema = ">=4.0.0,<4.18.0" +jsonschema-spec = ">=0.1.1,<0.2.0" +lazy-object-proxy = ">=1.7.1,<2.0.0" +openapi-schema-validator = ">=0.4.2,<0.5.0" [[package]] name = "packaging" @@ -1702,6 +1813,17 @@ files = [ {file = "packaging-23.0.tar.gz", hash = "sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97"}, ] +[[package]] +name = "pathable" +version = "0.4.3" +description = "Object-oriented paths" +optional = false +python-versions = ">=3.7.0,<4.0.0" +files = [ + {file = "pathable-0.4.3-py3-none-any.whl", hash = "sha256:cdd7b1f9d7d5c8b8d3315dbf5a86b2596053ae845f056f57d97c0eefff84da14"}, + {file = "pathable-0.4.3.tar.gz", hash = "sha256:5c869d315be50776cc8a993f3af43e0c60dc01506b399643f919034ebf4cdcab"}, +] + [[package]] name = "pbr" version = "5.9.0" @@ -1728,6 +1850,17 @@ files = [ dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] +[[package]] +name = "ply" +version = "3.11" +description = "Python Lex & Yacc" +optional = false +python-versions = "*" +files = [ + {file = "ply-3.11-py2.py3-none-any.whl", hash = "sha256:096f9b8350b65ebd2fd1346b12452efe5b9607f7482813ffca50c22722a807ce"}, + {file = "ply-3.11.tar.gz", hash = "sha256:00c7c1aaa88358b9c765b6d3000c6eec0ba42abca5351b095321aef446081da3"}, +] + [[package]] name = "prometheus-client" version = "0.14.1" @@ -1840,16 +1973,19 @@ files = [ ] [[package]] -name = "pyasn1" -version = "0.4.8" -description = "ASN.1 types and codecs" +name = "py-partiql-parser" +version = "0.5.4" +description = "Pure Python PartiQL Parser" optional = false python-versions = "*" files = [ - {file = "pyasn1-0.4.8-py2.py3-none-any.whl", hash = "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d"}, - {file = "pyasn1-0.4.8.tar.gz", hash = "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba"}, + {file = "py_partiql_parser-0.5.4-py2.py3-none-any.whl", hash = "sha256:3dc4295a47da9587681a96b35c6e151886fdbd0a4acbe0d97c4c68e5f689d315"}, + {file = "py_partiql_parser-0.5.4.tar.gz", hash = "sha256:72e043919538fa63edae72fb59afc7e3fd93adbde656718a7d2b4666f23dd114"}, ] 
+[package.extras] +dev = ["black (==22.6.0)", "flake8", "mypy", "pytest"] + [[package]] name = "pycparser" version = "2.21" @@ -1861,6 +1997,116 @@ files = [ {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, ] +[[package]] +name = "pydantic" +version = "2.7.1" +description = "Data validation using Python type hints" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pydantic-2.7.1-py3-none-any.whl", hash = "sha256:e029badca45266732a9a79898a15ae2e8b14840b1eabbb25844be28f0b33f3d5"}, + {file = "pydantic-2.7.1.tar.gz", hash = "sha256:e9dbb5eada8abe4d9ae5f46b9939aead650cd2b68f249bb3a8139dbe125803cc"}, +] + +[package.dependencies] +annotated-types = ">=0.4.0" +pydantic-core = "2.18.2" +typing-extensions = ">=4.6.1" + +[package.extras] +email = ["email-validator (>=2.0.0)"] + +[[package]] +name = "pydantic-core" +version = "2.18.2" +description = "Core functionality for Pydantic validation and serialization" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pydantic_core-2.18.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:9e08e867b306f525802df7cd16c44ff5ebbe747ff0ca6cf3fde7f36c05a59a81"}, + {file = "pydantic_core-2.18.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f0a21cbaa69900cbe1a2e7cad2aa74ac3cf21b10c3efb0fa0b80305274c0e8a2"}, + {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0680b1f1f11fda801397de52c36ce38ef1c1dc841a0927a94f226dea29c3ae3d"}, + {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:95b9d5e72481d3780ba3442eac863eae92ae43a5f3adb5b4d0a1de89d42bb250"}, + {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4fcf5cd9c4b655ad666ca332b9a081112cd7a58a8b5a6ca7a3104bc950f2038"}, + {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b5155ff768083cb1d62f3e143b49a8a3432e6789a3abee8acd005c3c7af1c74"}, + {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:553ef617b6836fc7e4df130bb851e32fe357ce36336d897fd6646d6058d980af"}, + {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b89ed9eb7d616ef5714e5590e6cf7f23b02d0d539767d33561e3675d6f9e3857"}, + {file = "pydantic_core-2.18.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:75f7e9488238e920ab6204399ded280dc4c307d034f3924cd7f90a38b1829563"}, + {file = "pydantic_core-2.18.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ef26c9e94a8c04a1b2924149a9cb081836913818e55681722d7f29af88fe7b38"}, + {file = "pydantic_core-2.18.2-cp310-none-win32.whl", hash = "sha256:182245ff6b0039e82b6bb585ed55a64d7c81c560715d1bad0cbad6dfa07b4027"}, + {file = "pydantic_core-2.18.2-cp310-none-win_amd64.whl", hash = "sha256:e23ec367a948b6d812301afc1b13f8094ab7b2c280af66ef450efc357d2ae543"}, + {file = "pydantic_core-2.18.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:219da3f096d50a157f33645a1cf31c0ad1fe829a92181dd1311022f986e5fbe3"}, + {file = "pydantic_core-2.18.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:cc1cfd88a64e012b74e94cd00bbe0f9c6df57049c97f02bb07d39e9c852e19a4"}, + {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:05b7133a6e6aeb8df37d6f413f7705a37ab4031597f64ab56384c94d98fa0e90"}, + {file = 
"pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:224c421235f6102e8737032483f43c1a8cfb1d2f45740c44166219599358c2cd"}, + {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b14d82cdb934e99dda6d9d60dc84a24379820176cc4a0d123f88df319ae9c150"}, + {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2728b01246a3bba6de144f9e3115b532ee44bd6cf39795194fb75491824a1413"}, + {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:470b94480bb5ee929f5acba6995251ada5e059a5ef3e0dfc63cca287283ebfa6"}, + {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:997abc4df705d1295a42f95b4eec4950a37ad8ae46d913caeee117b6b198811c"}, + {file = "pydantic_core-2.18.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:75250dbc5290e3f1a0f4618db35e51a165186f9034eff158f3d490b3fed9f8a0"}, + {file = "pydantic_core-2.18.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:4456f2dca97c425231d7315737d45239b2b51a50dc2b6f0c2bb181fce6207664"}, + {file = "pydantic_core-2.18.2-cp311-none-win32.whl", hash = "sha256:269322dcc3d8bdb69f054681edff86276b2ff972447863cf34c8b860f5188e2e"}, + {file = "pydantic_core-2.18.2-cp311-none-win_amd64.whl", hash = "sha256:800d60565aec896f25bc3cfa56d2277d52d5182af08162f7954f938c06dc4ee3"}, + {file = "pydantic_core-2.18.2-cp311-none-win_arm64.whl", hash = "sha256:1404c69d6a676245199767ba4f633cce5f4ad4181f9d0ccb0577e1f66cf4c46d"}, + {file = "pydantic_core-2.18.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:fb2bd7be70c0fe4dfd32c951bc813d9fe6ebcbfdd15a07527796c8204bd36242"}, + {file = "pydantic_core-2.18.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6132dd3bd52838acddca05a72aafb6eab6536aa145e923bb50f45e78b7251043"}, + {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7d904828195733c183d20a54230c0df0eb46ec746ea1a666730787353e87182"}, + {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c9bd70772c720142be1020eac55f8143a34ec9f82d75a8e7a07852023e46617f"}, + {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2b8ed04b3582771764538f7ee7001b02e1170223cf9b75dff0bc698fadb00cf3"}, + {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e6dac87ddb34aaec85f873d737e9d06a3555a1cc1a8e0c44b7f8d5daeb89d86f"}, + {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ca4ae5a27ad7a4ee5170aebce1574b375de390bc01284f87b18d43a3984df72"}, + {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:886eec03591b7cf058467a70a87733b35f44707bd86cf64a615584fd72488b7c"}, + {file = "pydantic_core-2.18.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ca7b0c1f1c983e064caa85f3792dd2fe3526b3505378874afa84baf662e12241"}, + {file = "pydantic_core-2.18.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4b4356d3538c3649337df4074e81b85f0616b79731fe22dd11b99499b2ebbdf3"}, + {file = "pydantic_core-2.18.2-cp312-none-win32.whl", hash = "sha256:8b172601454f2d7701121bbec3425dd71efcb787a027edf49724c9cefc14c038"}, + {file = "pydantic_core-2.18.2-cp312-none-win_amd64.whl", hash = "sha256:b1bd7e47b1558ea872bd16c8502c414f9e90dcf12f1395129d7bb42a09a95438"}, + {file = 
"pydantic_core-2.18.2-cp312-none-win_arm64.whl", hash = "sha256:98758d627ff397e752bc339272c14c98199c613f922d4a384ddc07526c86a2ec"}, + {file = "pydantic_core-2.18.2-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:9fdad8e35f278b2c3eb77cbdc5c0a49dada440657bf738d6905ce106dc1de439"}, + {file = "pydantic_core-2.18.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:1d90c3265ae107f91a4f279f4d6f6f1d4907ac76c6868b27dc7fb33688cfb347"}, + {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:390193c770399861d8df9670fb0d1874f330c79caaca4642332df7c682bf6b91"}, + {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:82d5d4d78e4448683cb467897fe24e2b74bb7b973a541ea1dcfec1d3cbce39fb"}, + {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4774f3184d2ef3e14e8693194f661dea5a4d6ca4e3dc8e39786d33a94865cefd"}, + {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d4d938ec0adf5167cb335acb25a4ee69a8107e4984f8fbd2e897021d9e4ca21b"}, + {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e0e8b1be28239fc64a88a8189d1df7fad8be8c1ae47fcc33e43d4be15f99cc70"}, + {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:868649da93e5a3d5eacc2b5b3b9235c98ccdbfd443832f31e075f54419e1b96b"}, + {file = "pydantic_core-2.18.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:78363590ef93d5d226ba21a90a03ea89a20738ee5b7da83d771d283fd8a56761"}, + {file = "pydantic_core-2.18.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:852e966fbd035a6468fc0a3496589b45e2208ec7ca95c26470a54daed82a0788"}, + {file = "pydantic_core-2.18.2-cp38-none-win32.whl", hash = "sha256:6a46e22a707e7ad4484ac9ee9f290f9d501df45954184e23fc29408dfad61350"}, + {file = "pydantic_core-2.18.2-cp38-none-win_amd64.whl", hash = "sha256:d91cb5ea8b11607cc757675051f61b3d93f15eca3cefb3e6c704a5d6e8440f4e"}, + {file = "pydantic_core-2.18.2-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:ae0a8a797a5e56c053610fa7be147993fe50960fa43609ff2a9552b0e07013e8"}, + {file = "pydantic_core-2.18.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:042473b6280246b1dbf530559246f6842b56119c2926d1e52b631bdc46075f2a"}, + {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a388a77e629b9ec814c1b1e6b3b595fe521d2cdc625fcca26fbc2d44c816804"}, + {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e25add29b8f3b233ae90ccef2d902d0ae0432eb0d45370fe315d1a5cf231004b"}, + {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f459a5ce8434614dfd39bbebf1041952ae01da6bed9855008cb33b875cb024c0"}, + {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:eff2de745698eb46eeb51193a9f41d67d834d50e424aef27df2fcdee1b153845"}, + {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a8309f67285bdfe65c372ea3722b7a5642680f3dba538566340a9d36e920b5f0"}, + {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f93a8a2e3938ff656a7c1bc57193b1319960ac015b6e87d76c76bf14fe0244b4"}, + {file = "pydantic_core-2.18.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:22057013c8c1e272eb8d0eebc796701167d8377441ec894a8fed1af64a0bf399"}, + {file = 
"pydantic_core-2.18.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:cfeecd1ac6cc1fb2692c3d5110781c965aabd4ec5d32799773ca7b1456ac636b"}, + {file = "pydantic_core-2.18.2-cp39-none-win32.whl", hash = "sha256:0d69b4c2f6bb3e130dba60d34c0845ba31b69babdd3f78f7c0c8fae5021a253e"}, + {file = "pydantic_core-2.18.2-cp39-none-win_amd64.whl", hash = "sha256:d9319e499827271b09b4e411905b24a426b8fb69464dfa1696258f53a3334641"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a1874c6dd4113308bd0eb568418e6114b252afe44319ead2b4081e9b9521fe75"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:ccdd111c03bfd3666bd2472b674c6899550e09e9f298954cfc896ab92b5b0e6d"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e18609ceaa6eed63753037fc06ebb16041d17d28199ae5aba0052c51449650a9"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e5c584d357c4e2baf0ff7baf44f4994be121e16a2c88918a5817331fc7599d7"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:43f0f463cf89ace478de71a318b1b4f05ebc456a9b9300d027b4b57c1a2064fb"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:e1b395e58b10b73b07b7cf740d728dd4ff9365ac46c18751bf8b3d8cca8f625a"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:0098300eebb1c837271d3d1a2cd2911e7c11b396eac9661655ee524a7f10587b"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:36789b70d613fbac0a25bb07ab3d9dba4d2e38af609c020cf4d888d165ee0bf3"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:3f9a801e7c8f1ef8718da265bba008fa121243dfe37c1cea17840b0944dfd72c"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:3a6515ebc6e69d85502b4951d89131ca4e036078ea35533bb76327f8424531ce"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20aca1e2298c56ececfd8ed159ae4dde2df0781988c97ef77d5c16ff4bd5b400"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:223ee893d77a310a0391dca6df00f70bbc2f36a71a895cecd9a0e762dc37b349"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2334ce8c673ee93a1d6a65bd90327588387ba073c17e61bf19b4fd97d688d63c"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:cbca948f2d14b09d20268cda7b0367723d79063f26c4ffc523af9042cad95592"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:b3ef08e20ec49e02d5c6717a91bb5af9b20f1805583cb0adfe9ba2c6b505b5ae"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:c6fdc8627910eed0c01aed6a390a252fe3ea6d472ee70fdde56273f198938374"}, + {file = "pydantic_core-2.18.2.tar.gz", hash = "sha256:2e29d20810dfc3043ee13ac7d9e25105799817683348823f305ab3f349b9386e"}, +] + +[package.dependencies] +typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" + [[package]] name = "pyjwt" version = "2.4.0" @@ -2115,28 +2361,6 @@ files = [ [package.dependencies] six = ">=1.5" -[[package]] -name = "python-jose" -version = "3.3.0" -description = "JOSE implementation in Python" -optional = false -python-versions = "*" -files = [ - {file = 
"python-jose-3.3.0.tar.gz", hash = "sha256:55779b5e6ad599c6336191246e95eb2293a9ddebd555f796a65f838f07e5d78a"}, - {file = "python_jose-3.3.0-py2.py3-none-any.whl", hash = "sha256:9b1376b023f8b298536eedd47ae1089bcdb848f1535ab30555cd92002d78923a"}, -] - -[package.dependencies] -cryptography = {version = ">=3.4.0", optional = true, markers = "extra == \"cryptography\""} -ecdsa = "!=0.15" -pyasn1 = "*" -rsa = "*" - -[package.extras] -cryptography = ["cryptography (>=3.4.0)"] -pycrypto = ["pyasn1", "pycrypto (>=2.6.0,<2.7.0)"] -pycryptodome = ["pyasn1", "pycryptodome (>=3.3.1,<4.0.0)"] - [[package]] name = "pywin32" version = "301" @@ -2181,7 +2405,6 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -2216,6 +2439,94 @@ files = [ {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, ] +[[package]] +name = "regex" +version = "2024.4.28" +description = "Alternative regular expression module, to replace re." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "regex-2024.4.28-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:cd196d056b40af073d95a2879678585f0b74ad35190fac04ca67954c582c6b61"}, + {file = "regex-2024.4.28-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8bb381f777351bd534462f63e1c6afb10a7caa9fa2a421ae22c26e796fe31b1f"}, + {file = "regex-2024.4.28-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:47af45b6153522733aa6e92543938e97a70ce0900649ba626cf5aad290b737b6"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:99d6a550425cc51c656331af0e2b1651e90eaaa23fb4acde577cf15068e2e20f"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bf29304a8011feb58913c382902fde3395957a47645bf848eea695839aa101b7"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:92da587eee39a52c91aebea8b850e4e4f095fe5928d415cb7ed656b3460ae79a"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6277d426e2f31bdbacb377d17a7475e32b2d7d1f02faaecc48d8e370c6a3ff31"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:28e1f28d07220c0f3da0e8fcd5a115bbb53f8b55cecf9bec0c946eb9a059a94c"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:aaa179975a64790c1f2701ac562b5eeb733946eeb036b5bcca05c8d928a62f10"}, + {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:6f435946b7bf7a1b438b4e6b149b947c837cb23c704e780c19ba3e6855dbbdd3"}, + {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:19d6c11bf35a6ad077eb23852827f91c804eeb71ecb85db4ee1386825b9dc4db"}, + {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:fdae0120cddc839eb8e3c15faa8ad541cc6d906d3eb24d82fb041cfe2807bc1e"}, + {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:e672cf9caaf669053121f1766d659a8813bd547edef6e009205378faf45c67b8"}, + {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f57515750d07e14743db55d59759893fdb21d2668f39e549a7d6cad5d70f9fea"}, + {file = "regex-2024.4.28-cp310-cp310-win32.whl", hash = "sha256:a1409c4eccb6981c7baabc8888d3550df518add6e06fe74fa1d9312c1838652d"}, + {file = "regex-2024.4.28-cp310-cp310-win_amd64.whl", hash = "sha256:1f687a28640f763f23f8a9801fe9e1b37338bb1ca5d564ddd41619458f1f22d1"}, + {file = "regex-2024.4.28-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:84077821c85f222362b72fdc44f7a3a13587a013a45cf14534df1cbbdc9a6796"}, + {file = "regex-2024.4.28-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b45d4503de8f4f3dc02f1d28a9b039e5504a02cc18906cfe744c11def942e9eb"}, + {file = "regex-2024.4.28-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:457c2cd5a646dd4ed536c92b535d73548fb8e216ebee602aa9f48e068fc393f3"}, + {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b51739ddfd013c6f657b55a508de8b9ea78b56d22b236052c3a85a675102dc6"}, + {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:459226445c7d7454981c4c0ce0ad1a72e1e751c3e417f305722bbcee6697e06a"}, + {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:670fa596984b08a4a769491cbdf22350431970d0112e03d7e4eeaecaafcd0fec"}, + {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fe00f4fe11c8a521b173e6324d862ee7ee3412bf7107570c9b564fe1119b56fb"}, + {file = "regex-2024.4.28-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:36f392dc7763fe7924575475736bddf9ab9f7a66b920932d0ea50c2ded2f5636"}, + {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:23a412b7b1a7063f81a742463f38821097b6a37ce1e5b89dd8e871d14dbfd86b"}, + {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:f1d6e4b7b2ae3a6a9df53efbf199e4bfcff0959dbdb5fd9ced34d4407348e39a"}, + {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:499334ad139557de97cbc4347ee921c0e2b5e9c0f009859e74f3f77918339257"}, + {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:0940038bec2fe9e26b203d636c44d31dd8766abc1fe66262da6484bd82461ccf"}, + {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:66372c2a01782c5fe8e04bff4a2a0121a9897e19223d9eab30c54c50b2ebeb7f"}, + {file = "regex-2024.4.28-cp311-cp311-win32.whl", hash = "sha256:c77d10ec3c1cf328b2f501ca32583625987ea0f23a0c2a49b37a39ee5c4c4630"}, + {file = "regex-2024.4.28-cp311-cp311-win_amd64.whl", hash = "sha256:fc0916c4295c64d6890a46e02d4482bb5ccf33bf1a824c0eaa9e83b148291f90"}, + {file = "regex-2024.4.28-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:08a1749f04fee2811c7617fdd46d2e46d09106fa8f475c884b65c01326eb15c5"}, + {file = "regex-2024.4.28-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b8eb28995771c087a73338f695a08c9abfdf723d185e57b97f6175c5051ff1ae"}, + {file = "regex-2024.4.28-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:dd7ef715ccb8040954d44cfeff17e6b8e9f79c8019daae2fd30a8806ef5435c0"}, + {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb0315a2b26fde4005a7c401707c5352df274460f2f85b209cf6024271373013"}, + {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f2fc053228a6bd3a17a9b0a3f15c3ab3cf95727b00557e92e1cfe094b88cc662"}, + {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7fe9739a686dc44733d52d6e4f7b9c77b285e49edf8570754b322bca6b85b4cc"}, + {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a74fcf77d979364f9b69fcf8200849ca29a374973dc193a7317698aa37d8b01c"}, + {file = "regex-2024.4.28-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:965fd0cf4694d76f6564896b422724ec7b959ef927a7cb187fc6b3f4e4f59833"}, + {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:2fef0b38c34ae675fcbb1b5db760d40c3fc3612cfa186e9e50df5782cac02bcd"}, + {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bc365ce25f6c7c5ed70e4bc674f9137f52b7dd6a125037f9132a7be52b8a252f"}, + {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:ac69b394764bb857429b031d29d9604842bc4cbfd964d764b1af1868eeebc4f0"}, + {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:144a1fc54765f5c5c36d6d4b073299832aa1ec6a746a6452c3ee7b46b3d3b11d"}, + {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2630ca4e152c221072fd4a56d4622b5ada876f668ecd24d5ab62544ae6793ed6"}, + {file = 
"regex-2024.4.28-cp312-cp312-win32.whl", hash = "sha256:7f3502f03b4da52bbe8ba962621daa846f38489cae5c4a7b5d738f15f6443d17"}, + {file = "regex-2024.4.28-cp312-cp312-win_amd64.whl", hash = "sha256:0dd3f69098511e71880fb00f5815db9ed0ef62c05775395968299cb400aeab82"}, + {file = "regex-2024.4.28-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:374f690e1dd0dbdcddea4a5c9bdd97632cf656c69113f7cd6a361f2a67221cb6"}, + {file = "regex-2024.4.28-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:25f87ae6b96374db20f180eab083aafe419b194e96e4f282c40191e71980c666"}, + {file = "regex-2024.4.28-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5dbc1bcc7413eebe5f18196e22804a3be1bfdfc7e2afd415e12c068624d48247"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f85151ec5a232335f1be022b09fbbe459042ea1951d8a48fef251223fc67eee1"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:57ba112e5530530fd175ed550373eb263db4ca98b5f00694d73b18b9a02e7185"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:224803b74aab56aa7be313f92a8d9911dcade37e5f167db62a738d0c85fdac4b"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a54a047b607fd2d2d52a05e6ad294602f1e0dec2291152b745870afc47c1397"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a2a512d623f1f2d01d881513af9fc6a7c46e5cfffb7dc50c38ce959f9246c94"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:c06bf3f38f0707592898428636cbb75d0a846651b053a1cf748763e3063a6925"}, + {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:1031a5e7b048ee371ab3653aad3030ecfad6ee9ecdc85f0242c57751a05b0ac4"}, + {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:d7a353ebfa7154c871a35caca7bfd8f9e18666829a1dc187115b80e35a29393e"}, + {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:7e76b9cfbf5ced1aca15a0e5b6f229344d9b3123439ffce552b11faab0114a02"}, + {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:5ce479ecc068bc2a74cb98dd8dba99e070d1b2f4a8371a7dfe631f85db70fe6e"}, + {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:7d77b6f63f806578c604dca209280e4c54f0fa9a8128bb8d2cc5fb6f99da4150"}, + {file = "regex-2024.4.28-cp38-cp38-win32.whl", hash = "sha256:d84308f097d7a513359757c69707ad339da799e53b7393819ec2ea36bc4beb58"}, + {file = "regex-2024.4.28-cp38-cp38-win_amd64.whl", hash = "sha256:2cc1b87bba1dd1a898e664a31012725e48af826bf3971e786c53e32e02adae6c"}, + {file = "regex-2024.4.28-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7413167c507a768eafb5424413c5b2f515c606be5bb4ef8c5dee43925aa5718b"}, + {file = "regex-2024.4.28-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:108e2dcf0b53a7c4ab8986842a8edcb8ab2e59919a74ff51c296772e8e74d0ae"}, + {file = "regex-2024.4.28-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f1c5742c31ba7d72f2dedf7968998730664b45e38827637e0f04a2ac7de2f5f1"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ecc6148228c9ae25ce403eade13a0961de1cb016bdb35c6eafd8e7b87ad028b1"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:b7d893c8cf0e2429b823ef1a1d360a25950ed11f0e2a9df2b5198821832e1947"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4290035b169578ffbbfa50d904d26bec16a94526071ebec3dadbebf67a26b25e"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:44a22ae1cfd82e4ffa2066eb3390777dc79468f866f0625261a93e44cdf6482b"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fd24fd140b69f0b0bcc9165c397e9b2e89ecbeda83303abf2a072609f60239e2"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:39fb166d2196413bead229cd64a2ffd6ec78ebab83fff7d2701103cf9f4dfd26"}, + {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9301cc6db4d83d2c0719f7fcda37229691745168bf6ae849bea2e85fc769175d"}, + {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:7c3d389e8d76a49923683123730c33e9553063d9041658f23897f0b396b2386f"}, + {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:99ef6289b62042500d581170d06e17f5353b111a15aa6b25b05b91c6886df8fc"}, + {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:b91d529b47798c016d4b4c1d06cc826ac40d196da54f0de3c519f5a297c5076a"}, + {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:43548ad74ea50456e1c68d3c67fff3de64c6edb85bcd511d1136f9b5376fc9d1"}, + {file = "regex-2024.4.28-cp39-cp39-win32.whl", hash = "sha256:05d9b6578a22db7dedb4df81451f360395828b04f4513980b6bd7a1412c679cc"}, + {file = "regex-2024.4.28-cp39-cp39-win_amd64.whl", hash = "sha256:3986217ec830c2109875be740531feb8ddafe0dfa49767cdcd072ed7e8927962"}, + {file = "regex-2024.4.28.tar.gz", hash = "sha256:83ab366777ea45d58f72593adf35d36ca911ea8bd838483c1823b883a121b0e4"}, +] + [[package]] name = "requests" version = "2.31.0" @@ -2256,18 +2567,18 @@ urllib3 = ">=1.25.10" tests = ["coverage (>=6.0.0)", "flake8", "mypy", "pytest (>=7.0.0)", "pytest-asyncio", "pytest-cov", "pytest-localserver", "types-mock", "types-requests"] [[package]] -name = "rsa" -version = "4.9" -description = "Pure-Python RSA implementation" +name = "rfc3339-validator" +version = "0.1.4" +description = "A pure python RFC3339 validator" optional = false -python-versions = ">=3.6,<4" +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" files = [ - {file = "rsa-4.9-py3-none-any.whl", hash = "sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7"}, - {file = "rsa-4.9.tar.gz", hash = "sha256:e38464a49c6c85d7f1351b0126661487a7e0a14a50f1675ec50eb34d4f20ef21"}, + {file = "rfc3339_validator-0.1.4-py2.py3-none-any.whl", hash = "sha256:24f6ec1eda14ef823da9e36ec7113124b39c04d50a4d3d3a3c2859577e7791fa"}, + {file = "rfc3339_validator-0.1.4.tar.gz", hash = "sha256:138a2abdf93304ad60530167e51d2dfb9549521a836871b88d7f4695d0022f6b"}, ] [package.dependencies] -pyasn1 = ">=0.1.3" +six = "*" [[package]] name = "ruff" @@ -2366,22 +2677,18 @@ files = [ ] [[package]] -name = "sshpubkeys" -version = "3.3.1" -description = "SSH public key parser" +name = "sympy" +version = "1.12" +description = "Computer algebra system (CAS) in Python" optional = false -python-versions = ">=3" +python-versions = ">=3.8" files = [ - {file = "sshpubkeys-3.3.1-py2.py3-none-any.whl", hash = "sha256:946f76b8fe86704b0e7c56a00d80294e39bc2305999844f079a217885060b1ac"}, - {file = 
"sshpubkeys-3.3.1.tar.gz", hash = "sha256:3020ed4f8c846849299370fbe98ff4157b0ccc1accec105e07cfa9ae4bb55064"}, + {file = "sympy-1.12-py3-none-any.whl", hash = "sha256:c3588cd4295d0c0f603d0f2ae780587e64e2efeedb3521e46b9bb1d08d184fa5"}, + {file = "sympy-1.12.tar.gz", hash = "sha256:ebf595c8dac3e0fdc4152c51878b498396ec7f30e7a914d6071e674d49420fb8"}, ] [package.dependencies] -cryptography = ">=2.1.4" -ecdsa = ">=0.13" - -[package.extras] -dev = ["twine", "wheel", "yapf"] +mpmath = ">=0.19" [[package]] name = "toml" @@ -2652,16 +2959,6 @@ files = [ {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"}, {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"}, {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"}, - {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"}, - {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"}, - {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"}, - {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"}, @@ -2899,4 +3196,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "496d6d9f722983bda4d1265370bc8ba75560da74ab5d6b68c94a03290815e1eb" +content-hash = "dcde14c58a32bda5f123319a069352c458b3719f3c62977991eebb9803a46a9e" diff --git a/pyproject.toml b/pyproject.toml index 4ec8efc2ff..ac7f9b061c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,7 @@ types-requests = "^2.31.0.0" types-psycopg2 = "^2.9.21.10" boto3 = "^1.34.11" boto3-stubs = {extras = ["s3"], version = "^1.26.16"} -moto = {extras = ["server"], version = "^4.1.2"} +moto = {extras = 
["server"], version = "^5.0.6"}
 backoff = "^2.2.1"
 pytest-lazy-fixture = "^0.6.3"
 prometheus-client = "^0.14.1"
diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py
index 925e1b450f..132d2450a7 100644
--- a/test_runner/fixtures/remote_storage.py
+++ b/test_runner/fixtures/remote_storage.py
@@ -50,7 +50,7 @@ class MockS3Server:
         # XXX: do not use `shell=True` or add `exec ` to the command here otherwise.
         # We use `self.subprocess.kill()` to shut down the server, which would not "just" work in Linux
         # if a process is started from the shell process.
-        self.subprocess = subprocess.Popen(["poetry", "run", "moto_server", "s3", f"-p{port}"])
+        self.subprocess = subprocess.Popen(["poetry", "run", "moto_server", f"-p{port}"])
         error = None
         try:
             return_code = self.subprocess.poll()

From 8728d5a5fd261a483560fcc53ece7cc51ec82600 Mon Sep 17 00:00:00 2001
From: Christian Schwarz
Date: Wed, 8 May 2024 16:32:21 +0200
Subject: [PATCH 0731/1571] neon_local: use `pageserver.toml` as source of truth for `struct PageServerConf` (#7642)

Before this PR, `neon_local` would store a copy of a subset of the
initial `pageserver.toml` in its `.neon/config`, e.g., `listen_pg_addr`.
That copy is represented as `struct PageServerConf`.

This copy was used to inform e.g., `neon_local endpoint` and other
commands that depend on Pageserver about which port to connect to.

The problem with that scheme is that the duplicated information in
`.neon/config` can get stale if `pageserver.toml` is changed.

This PR fixes that by eliminating the duplication: `struct PageServerConf`
is now populated from the individual `pageserver.toml`s.

The `[[pageservers]]` TOML table in `.neon/config` is obsolete.
As of this PR, `neon_local` will fail to start and print an error
informing users about this change.

Code-level changes:

- Remove the `--pg-version` flag; it was only used for some checks
  during `neon_local init`.
- Remove the warn-but-continue behavior for when auth key creation
  fails but auth keys are not required. It's just complexity that is
  unjustified for a tool like `neon_local`.
- Introduce a type-system-level distinction between the runtime state
  and the two (!) TOML formats that are almost the same but not quite:
  - runtime state: `struct PageServerConf`, now without `serde` derives
  - toml format 1: the state in `.neon/config` => `struct OnDiskState`
  - toml format 2: the file passed to `neon_local init --config TMPFILE`,
    which, unlike `struct OnDiskState`, allows specifying `pageservers`
- Remove `[[pageservers]]` from the `struct OnDiskState` and load the
  data from the individual `pageserver.toml`s instead.
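For illustration, the new discovery mechanism boils down to the sketch
below. It is a simplified stand-in for the `LocalEnv::load_config`
changes in this patch, not the actual code: the struct name, the reduced
field set, and the error handling are made up for brevity, while the
`pageserver_<id>` directory naming and the `pageserver.toml` field names
follow the diff.

    // Sketch: scan the repo dir (`.neon/` by default) for `pageserver_<id>`
    // directories and read the small subset of each `pageserver.toml` that
    // neon_local needs. Unknown TOML keys are ignored by serde by default,
    // so a full pageserver config still parses into this subset.
    use std::path::Path;

    #[derive(Debug, serde::Deserialize)]
    struct PageserverTomlSubset {
        id: u64,
        listen_pg_addr: String,
        listen_http_addr: String,
    }

    fn discover_pageservers(repo_dir: &Path) -> anyhow::Result<Vec<PageserverTomlSubset>> {
        let mut pageservers = Vec::new();
        for entry in std::fs::read_dir(repo_dir)? {
            let entry = entry?;
            let name = entry.file_name().to_string_lossy().into_owned();
            // Only directories named `pageserver_<id>` hold a pageserver workdir.
            if !name.starts_with("pageserver_") || !entry.file_type()?.is_dir() {
                continue;
            }
            let toml_path = entry.path().join("pageserver.toml");
            let contents = std::fs::read_to_string(&toml_path)?;
            pageservers.push(toml::from_str(&contents)?);
        }
        Ok(pageservers)
    }

Because `pageserver.toml` is the single source of truth, `.neon/config`
can no longer go stale; the cost is one extra file read per pageserver
directory whenever `neon_local` loads its environment.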
--- control_plane/src/bin/neon_local.rs | 151 +++---- control_plane/src/local_env.rs | 468 +++++++++++++-------- control_plane/src/pageserver.rs | 81 +--- test_runner/fixtures/neon_fixtures.py | 54 +-- test_runner/regress/test_pageserver_api.py | 35 +- 5 files changed, 412 insertions(+), 377 deletions(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 179a756135..18e395e2b5 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -9,8 +9,11 @@ use anyhow::{anyhow, bail, Context, Result}; use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum}; use compute_api::spec::ComputeMode; use control_plane::endpoint::ComputeControlPlane; -use control_plane::local_env::{InitForceMode, LocalEnv}; -use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR}; +use control_plane::local_env::{ + InitForceMode, LocalEnv, NeonBroker, NeonLocalInitConf, NeonLocalInitPageserverConf, + SafekeeperConf, +}; +use control_plane::pageserver::PageServerNode; use control_plane::safekeeper::SafekeeperNode; use control_plane::storage_controller::StorageController; use control_plane::{broker, local_env}; @@ -52,44 +55,6 @@ const DEFAULT_PG_VERSION: &str = "15"; const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/"; -fn default_conf(num_pageservers: u16) -> String { - let mut template = format!( - r#" -# Default built-in configuration, defined in main.rs -control_plane_api = '{DEFAULT_PAGESERVER_CONTROL_PLANE_API}' - -[broker] -listen_addr = '{DEFAULT_BROKER_ADDR}' - -[[safekeepers]] -id = {DEFAULT_SAFEKEEPER_ID} -pg_port = {DEFAULT_SAFEKEEPER_PG_PORT} -http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT} - -"#, - ); - - for i in 0..num_pageservers { - let pageserver_id = NodeId(DEFAULT_PAGESERVER_ID.0 + i as u64); - let pg_port = DEFAULT_PAGESERVER_PG_PORT + i; - let http_port = DEFAULT_PAGESERVER_HTTP_PORT + i; - - template += &format!( - r#" -[[pageservers]] -id = {pageserver_id} -listen_pg_addr = '127.0.0.1:{pg_port}' -listen_http_addr = '127.0.0.1:{http_port}' -pg_auth_type = '{trust_auth}' -http_auth_type = '{trust_auth}' -"#, - trust_auth = AuthType::Trust, - ) - } - - template -} - /// /// Timelines tree element used as a value in the HashMap. /// @@ -152,7 +117,7 @@ fn main() -> Result<()> { }; match subcommand_result { - Ok(Some(updated_env)) => updated_env.persist_config(&updated_env.base_data_dir)?, + Ok(Some(updated_env)) => updated_env.persist_config()?, Ok(None) => (), Err(e) => { eprintln!("command failed: {e:?}"); @@ -341,55 +306,65 @@ fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result anyhow::Result { - let num_pageservers = init_match - .get_one::("num-pageservers") - .expect("num-pageservers arg has a default"); - // Create config file - let toml_file: String = if let Some(config_path) = init_match.get_one::("config") { + let num_pageservers = init_match.get_one::("num-pageservers"); + + let force = init_match.get_one("force").expect("we set a default value"); + + // Create the in-memory `LocalEnv` that we'd normally load from disk in `load_config`. + let init_conf: NeonLocalInitConf = if let Some(config_path) = + init_match.get_one::("config") + { + // User (likely the Python test suite) provided a description of the environment. 
+ if num_pageservers.is_some() { + bail!("Cannot specify both --num-pageservers and --config, use key `pageservers` in the --config file instead"); + } // load and parse the file - std::fs::read_to_string(config_path).with_context(|| { + let contents = std::fs::read_to_string(config_path).with_context(|| { format!( "Could not read configuration file '{}'", config_path.display() ) - })? + })?; + toml_edit::de::from_str(&contents)? } else { - // Built-in default config - default_conf(*num_pageservers) + // User (likely interactive) did not provide a description of the environment, give them the default + NeonLocalInitConf { + control_plane_api: Some(Some(DEFAULT_PAGESERVER_CONTROL_PLANE_API.parse().unwrap())), + broker: NeonBroker { + listen_addr: DEFAULT_BROKER_ADDR.parse().unwrap(), + }, + safekeepers: vec![SafekeeperConf { + id: DEFAULT_SAFEKEEPER_ID, + pg_port: DEFAULT_SAFEKEEPER_PG_PORT, + http_port: DEFAULT_SAFEKEEPER_HTTP_PORT, + ..Default::default() + }], + pageservers: (0..num_pageservers.copied().unwrap_or(1)) + .map(|i| { + let pageserver_id = NodeId(DEFAULT_PAGESERVER_ID.0 + i as u64); + let pg_port = DEFAULT_PAGESERVER_PG_PORT + i; + let http_port = DEFAULT_PAGESERVER_HTTP_PORT + i; + NeonLocalInitPageserverConf { + id: pageserver_id, + listen_pg_addr: format!("127.0.0.1:{pg_port}"), + listen_http_addr: format!("127.0.0.1:{http_port}"), + pg_auth_type: AuthType::Trust, + http_auth_type: AuthType::Trust, + other: Default::default(), + } + }) + .collect(), + pg_distrib_dir: None, + neon_distrib_dir: None, + default_tenant_id: TenantId::from_array(std::array::from_fn(|_| 0)), + storage_controller: None, + control_plane_compute_hook_api: None, + } }; - let pageserver_config: toml_edit::Document = - if let Some(path) = init_match.get_one::("pageserver-config") { - std::fs::read_to_string(path)?.parse()? - } else { - toml_edit::Document::new() - }; - - let pg_version = init_match - .get_one::("pg-version") - .copied() - .context("Failed to parse postgres version from the argument string")?; - - let mut env = - LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?; - let force = init_match.get_one("force").expect("we set a default value"); - env.init(pg_version, force) - .context("Failed to initialize neon repository")?; - - // Create remote storage location for default LocalFs remote storage - std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?; - - // Initialize pageserver, create initial tenant and timeline. - for ps_conf in &env.pageservers { - PageServerNode::from_env(&env, ps_conf) - .initialize(pageserver_config.clone()) - .unwrap_or_else(|e| { - eprintln!("pageserver init failed: {e:?}"); - exit(1); - }); - } - - Ok(env) + LocalEnv::init(init_conf, force) + .context("materialize initial neon_local environment on disk")?; + Ok(LocalEnv::load_config().expect("freshly written config should be loadable")) } /// The default pageserver is the one where CLI tenant/timeline operations are sent by default. 
@@ -1418,9 +1393,7 @@ fn cli() -> Command { let num_pageservers_arg = Arg::new("num-pageservers") .value_parser(value_parser!(u16)) .long("num-pageservers") - .help("How many pageservers to create (default 1)") - .required(false) - .default_value("1"); + .help("How many pageservers to create (default 1)"); let update_catalog = Arg::new("update-catalog") .value_parser(value_parser!(bool)) @@ -1454,14 +1427,6 @@ fn cli() -> Command { .value_parser(value_parser!(PathBuf)) .value_name("config") ) - .arg( - Arg::new("pageserver-config") - .long("pageserver-config") - .required(false) - .value_parser(value_parser!(PathBuf)) - .value_name("pageserver-config") - .help("Merge the provided pageserver config into the one generated by neon_local."), - ) .arg(pg_version_arg.clone()) .arg(force_arg) ) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 7abbbce95a..d13884198e 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -3,7 +3,7 @@ //! Now it also provides init method which acts like a stub for proper installation //! script which will use local paths. -use anyhow::{bail, ensure, Context}; +use anyhow::{bail, Context}; use clap::ValueEnum; use postgres_backend::AuthType; @@ -23,6 +23,8 @@ use utils::{ id::{NodeId, TenantId, TenantTimelineId, TimelineId}, }; +use crate::pageserver::PageServerNode; +use crate::pageserver::PAGESERVER_REMOTE_STORAGE_DIR; use crate::safekeeper::SafekeeperNode; pub const DEFAULT_PG_VERSION: u32 = 15; @@ -34,7 +36,7 @@ pub const DEFAULT_PG_VERSION: u32 = 15; // to 'neon_local init --config=' option. See control_plane/simple.conf for // an example. // -#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] +#[derive(PartialEq, Eq, Clone, Debug)] pub struct LocalEnv { // Base directory for all the nodes (the pageserver, safekeepers and // compute endpoints). @@ -42,59 +44,99 @@ pub struct LocalEnv { // This is not stored in the config file. Rather, this is the path where the // config file itself is. It is read from the NEON_REPO_DIR env variable or // '.neon' if not given. - #[serde(skip)] pub base_data_dir: PathBuf, // Path to postgres distribution. It's expected that "bin", "include", // "lib", "share" from postgres distribution are there. If at some point // in time we will be able to run against vanilla postgres we may split that // to four separate paths and match OS-specific installation layout. - #[serde(default)] pub pg_distrib_dir: PathBuf, // Path to pageserver binary. - #[serde(default)] pub neon_distrib_dir: PathBuf, // Default tenant ID to use with the 'neon_local' command line utility, when // --tenant_id is not explicitly specified. - #[serde(default)] pub default_tenant_id: Option, // used to issue tokens during e.g pg start - #[serde(default)] pub private_key_path: PathBuf, pub broker: NeonBroker, // Configuration for the storage controller (1 per neon_local environment) - #[serde(default)] pub storage_controller: NeonStorageControllerConf, /// This Vec must always contain at least one pageserver + /// Populdated by [`Self::load_config`] from the individual `pageserver.toml`s. + /// NB: not used anymore except for informing users that they need to change their `.neon/config`. pub pageservers: Vec, - #[serde(default)] pub safekeepers: Vec, // Control plane upcall API for pageserver: if None, we will not run storage_controller If set, this will // be propagated into each pageserver's configuration. 
- #[serde(default)] pub control_plane_api: Option, // Control plane upcall API for storage controller. If set, this will be propagated into the // storage controller's configuration. - #[serde(default)] pub control_plane_compute_hook_api: Option, /// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user. - #[serde(default)] // A `HashMap>` would be more appropriate here, // but deserialization into a generic toml object as `toml::Value::try_from` fails with an error. // https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table". + pub branch_name_mappings: HashMap>, +} + +/// On-disk state stored in `.neon/config`. +#[derive(PartialEq, Eq, Clone, Debug, Default, Serialize, Deserialize)] +#[serde(default, deny_unknown_fields)] +pub struct OnDiskConfig { + pub pg_distrib_dir: PathBuf, + pub neon_distrib_dir: PathBuf, + pub default_tenant_id: Option, + pub private_key_path: PathBuf, + pub broker: NeonBroker, + pub storage_controller: NeonStorageControllerConf, + #[serde( + skip_serializing, + deserialize_with = "fail_if_pageservers_field_specified" + )] + pub pageservers: Vec, + pub safekeepers: Vec, + pub control_plane_api: Option, + pub control_plane_compute_hook_api: Option, branch_name_mappings: HashMap>, } +fn fail_if_pageservers_field_specified<'de, D>(_: D) -> Result, D::Error> +where + D: serde::Deserializer<'de>, +{ + Err(serde::de::Error::custom( + "The 'pageservers' field is no longer used; pageserver.toml is now authoritative; \ + Please remove the `pageservers` from your .neon/config.", + )) +} + +/// The description of the neon_local env to be initialized by `neon_local init --config`. +#[derive(Clone, Debug, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct NeonLocalInitConf { + // TODO: do we need this? Seems unused + pub pg_distrib_dir: Option, + // TODO: do we need this? Seems unused + pub neon_distrib_dir: Option, + pub default_tenant_id: TenantId, + pub broker: NeonBroker, + pub storage_controller: Option, + pub pageservers: Vec, + pub safekeepers: Vec, + pub control_plane_api: Option>, + pub control_plane_compute_hook_api: Option>, +} + /// Broker config for cluster internal communication. #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[serde(default)] @@ -141,24 +183,18 @@ impl NeonBroker { } } +// neon_local needs to know this subset of pageserver configuration. +// For legacy reasons, this information is duplicated from `pageserver.toml` into `.neon/config`. +// It can get stale if `pageserver.toml` is changed. 
+// TODO(christian): don't store this at all in `.neon/config`, always load it from `pageserver.toml` #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[serde(default, deny_unknown_fields)] pub struct PageServerConf { - // node id pub id: NodeId, - - // Pageserver connection settings pub listen_pg_addr: String, pub listen_http_addr: String, - - // auth type used for the PG and HTTP ports pub pg_auth_type: AuthType, pub http_auth_type: AuthType, - - pub(crate) virtual_file_io_engine: Option, - pub(crate) get_vectored_impl: Option, - pub(crate) get_impl: Option, - pub(crate) validate_vectored_get: Option, } impl Default for PageServerConf { @@ -169,10 +205,40 @@ impl Default for PageServerConf { listen_http_addr: String::new(), pg_auth_type: AuthType::Trust, http_auth_type: AuthType::Trust, - virtual_file_io_engine: None, - get_vectored_impl: None, - get_impl: None, - validate_vectored_get: None, + } + } +} + +/// The toml that can be passed to `neon_local init --config`. +/// This is a subset of the `pageserver.toml` configuration. +// TODO(christian): use pageserver_api::config::ConfigToml (PR #7656) +#[derive(Clone, Debug, serde::Deserialize, serde::Serialize)] +pub struct NeonLocalInitPageserverConf { + pub id: NodeId, + pub listen_pg_addr: String, + pub listen_http_addr: String, + pub pg_auth_type: AuthType, + pub http_auth_type: AuthType, + #[serde(flatten)] + pub other: HashMap, +} + +impl From<&NeonLocalInitPageserverConf> for PageServerConf { + fn from(conf: &NeonLocalInitPageserverConf) -> Self { + let NeonLocalInitPageserverConf { + id, + listen_pg_addr, + listen_http_addr, + pg_auth_type, + http_auth_type, + other: _, + } = conf; + Self { + id: *id, + listen_pg_addr: listen_pg_addr.clone(), + listen_http_addr: listen_http_addr.clone(), + pg_auth_type: *pg_auth_type, + http_auth_type: *http_auth_type, } } } @@ -360,44 +426,7 @@ impl LocalEnv { .collect() } - /// Create a LocalEnv from a config file. - /// - /// Unlike 'load_config', this function fills in any defaults that are missing - /// from the config file. - pub fn parse_config(toml: &str) -> anyhow::Result { - let mut env: LocalEnv = toml::from_str(toml)?; - - // Find postgres binaries. - // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install". - // Note that later in the code we assume, that distrib dirs follow the same pattern - // for all postgres versions. - if env.pg_distrib_dir == Path::new("") { - if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") { - env.pg_distrib_dir = postgres_bin.into(); - } else { - let cwd = env::current_dir()?; - env.pg_distrib_dir = cwd.join("pg_install") - } - } - - // Find neon binaries. - if env.neon_distrib_dir == Path::new("") { - env::current_exe()? - .parent() - .unwrap() - .clone_into(&mut env.neon_distrib_dir); - } - - if env.pageservers.is_empty() { - anyhow::bail!("Configuration must contain at least one pageserver"); - } - - env.base_data_dir = base_path(); - - Ok(env) - } - - /// Locate and load config + /// Construct `Self` from on-disk state. 
pub fn load_config() -> anyhow::Result { let repopath = base_path(); @@ -411,38 +440,129 @@ impl LocalEnv { // TODO: check that it looks like a neon repository // load and parse file - let config = fs::read_to_string(repopath.join("config"))?; - let mut env: LocalEnv = toml::from_str(config.as_str())?; + let config_file_contents = fs::read_to_string(repopath.join("config"))?; + let on_disk_config: OnDiskConfig = toml::from_str(config_file_contents.as_str())?; + let mut env = { + let OnDiskConfig { + pg_distrib_dir, + neon_distrib_dir, + default_tenant_id, + private_key_path, + broker, + storage_controller, + pageservers, + safekeepers, + control_plane_api, + control_plane_compute_hook_api, + branch_name_mappings, + } = on_disk_config; + LocalEnv { + base_data_dir: repopath.clone(), + pg_distrib_dir, + neon_distrib_dir, + default_tenant_id, + private_key_path, + broker, + storage_controller, + pageservers, + safekeepers, + control_plane_api, + control_plane_compute_hook_api, + branch_name_mappings, + } + }; - env.base_data_dir = repopath; + // The source of truth for pageserver configuration is the pageserver.toml. + assert!( + env.pageservers.is_empty(), + "we ensure this during deserialization" + ); + env.pageservers = { + let iter = std::fs::read_dir(&repopath).context("open dir")?; + let mut pageservers = Vec::new(); + for res in iter { + let dentry = res?; + const PREFIX: &str = "pageserver_"; + let dentry_name = dentry + .file_name() + .into_string() + .ok() + .with_context(|| format!("non-utf8 dentry: {:?}", dentry.path())) + .unwrap(); + if !dentry_name.starts_with(PREFIX) { + continue; + } + if !dentry.file_type().context("determine file type")?.is_dir() { + anyhow::bail!("expected a directory, got {:?}", dentry.path()); + } + let id = dentry_name[PREFIX.len()..] + .parse::() + .with_context(|| format!("parse id from {:?}", dentry.path()))?; + // TODO(christian): use pageserver_api::config::ConfigToml (PR #7656) + #[derive(serde::Serialize, serde::Deserialize)] + // (allow unknown fields, unlike PageServerConf) + struct PageserverConfigTomlSubset { + id: NodeId, + listen_pg_addr: String, + listen_http_addr: String, + pg_auth_type: AuthType, + http_auth_type: AuthType, + } + let config_toml_path = dentry.path().join("pageserver.toml"); + let config_toml: PageserverConfigTomlSubset = toml_edit::de::from_str( + &std::fs::read_to_string(&config_toml_path) + .with_context(|| format!("read {:?}", config_toml_path))?, + ) + .context("parse pageserver.toml")?; + let PageserverConfigTomlSubset { + id: config_toml_id, + listen_pg_addr, + listen_http_addr, + pg_auth_type, + http_auth_type, + } = config_toml; + let conf = PageServerConf { + id: { + anyhow::ensure!( + config_toml_id == id, + "id mismatch: config_toml.id={config_toml_id} id={id}", + ); + id + }, + listen_pg_addr, + listen_http_addr, + pg_auth_type, + http_auth_type, + }; + pageservers.push(conf); + } + pageservers + }; Ok(env) } - pub fn persist_config(&self, base_path: &Path) -> anyhow::Result<()> { - // Currently, the user first passes a config file with 'neon_local init --config=' - // We read that in, in `create_config`, and fill any missing defaults. Then it's saved - // to .neon/config. TODO: We lose any formatting and comments along the way, which is - // a bit sad. - let mut conf_content = r#"# This file describes a local deployment of the page server -# and safekeeeper node. It is read by the 'neon_local' command-line -# utility. -"# - .to_string(); - - // Convert the LocalEnv to a toml file. 
- // - // This could be as simple as this: - // - // conf_content += &toml::to_string_pretty(env)?; - // - // But it results in a "values must be emitted before tables". I'm not sure - // why, AFAICS the table, i.e. 'safekeepers: Vec' is last. - // Maybe rust reorders the fields to squeeze avoid padding or something? - // In any case, converting to toml::Value first, and serializing that, works. - // See https://github.com/alexcrichton/toml-rs/issues/142 - conf_content += &toml::to_string_pretty(&toml::Value::try_from(self)?)?; + pub fn persist_config(&self) -> anyhow::Result<()> { + Self::persist_config_impl( + &self.base_data_dir, + &OnDiskConfig { + pg_distrib_dir: self.pg_distrib_dir.clone(), + neon_distrib_dir: self.neon_distrib_dir.clone(), + default_tenant_id: self.default_tenant_id, + private_key_path: self.private_key_path.clone(), + broker: self.broker.clone(), + storage_controller: self.storage_controller.clone(), + pageservers: vec![], // it's skip_serializing anyway + safekeepers: self.safekeepers.clone(), + control_plane_api: self.control_plane_api.clone(), + control_plane_compute_hook_api: self.control_plane_compute_hook_api.clone(), + branch_name_mappings: self.branch_name_mappings.clone(), + }, + ) + } + pub fn persist_config_impl(base_path: &Path, config: &OnDiskConfig) -> anyhow::Result<()> { + let conf_content = &toml::to_string_pretty(config)?; let target_config_path = base_path.join("config"); fs::write(&target_config_path, conf_content).with_context(|| { format!( @@ -467,17 +587,13 @@ impl LocalEnv { } } - // - // Initialize a new Neon repository - // - pub fn init(&mut self, pg_version: u32, force: &InitForceMode) -> anyhow::Result<()> { - // check if config already exists - let base_path = &self.base_data_dir; - ensure!( - base_path != Path::new(""), - "repository base path is missing" - ); + /// Materialize the [`NeonLocalInitConf`] to disk. Called during [`neon_local init`]. + pub fn init(conf: NeonLocalInitConf, force: &InitForceMode) -> anyhow::Result<()> { + let base_path = base_path(); + assert_ne!(base_path, Path::new("")); + let base_path = &base_path; + // create base_path dir if base_path.exists() { match force { InitForceMode::MustNotExist => { @@ -509,74 +625,96 @@ impl LocalEnv { } } } - - if !self.pg_bin_dir(pg_version)?.join("postgres").exists() { - bail!( - "Can't find postgres binary at {}", - self.pg_bin_dir(pg_version)?.display() - ); - } - for binary in ["pageserver", "safekeeper"] { - if !self.neon_distrib_dir.join(binary).exists() { - bail!( - "Can't find binary '{binary}' in neon distrib dir '{}'", - self.neon_distrib_dir.display() - ); - } - } - if !base_path.exists() { fs::create_dir(base_path)?; } + let NeonLocalInitConf { + pg_distrib_dir, + neon_distrib_dir, + default_tenant_id, + broker, + storage_controller, + pageservers, + safekeepers, + control_plane_api, + control_plane_compute_hook_api, + } = conf; + + // Find postgres binaries. + // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install". + // Note that later in the code we assume, that distrib dirs follow the same pattern + // for all postgres versions. + let pg_distrib_dir = pg_distrib_dir.unwrap_or_else(|| { + if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") { + postgres_bin.into() + } else { + let cwd = env::current_dir().unwrap(); + cwd.join("pg_install") + } + }); + + // Find neon binaries. + let neon_distrib_dir = neon_distrib_dir + .unwrap_or_else(|| env::current_exe().unwrap().parent().unwrap().to_owned()); + // Generate keypair for JWT. 
// // The keypair is only needed if authentication is enabled in any of the // components. For convenience, we generate the keypair even if authentication // is not enabled, so that you can easily enable it after the initialization - // step. However, if the key generation fails, we treat it as non-fatal if - // authentication was not enabled. - if self.private_key_path == PathBuf::new() { - match generate_auth_keys( - base_path.join("auth_private_key.pem").as_path(), - base_path.join("auth_public_key.pem").as_path(), - ) { - Ok(()) => { - self.private_key_path = PathBuf::from("auth_private_key.pem"); - } - Err(e) => { - if !self.auth_keys_needed() { - eprintln!("Could not generate keypair for JWT authentication: {e}"); - eprintln!("Continuing anyway because authentication was not enabled"); - self.private_key_path = PathBuf::from("auth_private_key.pem"); - } else { - return Err(e); - } - } - } + // step. + generate_auth_keys( + base_path.join("auth_private_key.pem").as_path(), + base_path.join("auth_public_key.pem").as_path(), + ) + .context("generate auth keys")?; + let private_key_path = PathBuf::from("auth_private_key.pem"); + + // create the runtime type because the remaining initialization code below needs + // a LocalEnv instance op operation + // TODO: refactor to avoid this, LocalEnv should only be constructed from on-disk state + let env = LocalEnv { + base_data_dir: base_path.clone(), + pg_distrib_dir, + neon_distrib_dir, + default_tenant_id: Some(default_tenant_id), + private_key_path, + broker, + storage_controller: storage_controller.unwrap_or_default(), + pageservers: pageservers.iter().map(Into::into).collect(), + safekeepers, + control_plane_api: control_plane_api.unwrap_or_default(), + control_plane_compute_hook_api: control_plane_compute_hook_api.unwrap_or_default(), + branch_name_mappings: Default::default(), + }; + + // create endpoints dir + fs::create_dir_all(env.endpoints_path())?; + + // create safekeeper dirs + for safekeeper in &env.safekeepers { + fs::create_dir_all(SafekeeperNode::datadir_path_by_id(&env, safekeeper.id))?; } - fs::create_dir_all(self.endpoints_path())?; - - for safekeeper in &self.safekeepers { - fs::create_dir_all(SafekeeperNode::datadir_path_by_id(self, safekeeper.id))?; + // initialize pageserver state + for (i, ps) in pageservers.into_iter().enumerate() { + let runtime_ps = &env.pageservers[i]; + assert_eq!(&PageServerConf::from(&ps), runtime_ps); + fs::create_dir(env.pageserver_data_dir(ps.id))?; + PageServerNode::from_env(&env, runtime_ps) + .initialize(ps) + .context("pageserver init failed")?; } - for ps in &self.pageservers { - fs::create_dir(self.pageserver_data_dir(ps.id))?; - } + // setup remote remote location for default LocalFs remote storage + std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?; - self.persist_config(base_path) - } - - fn auth_keys_needed(&self) -> bool { - self.pageservers.iter().any(|ps| { - ps.pg_auth_type == AuthType::NeonJWT || ps.http_auth_type == AuthType::NeonJWT - }) || self.safekeepers.iter().any(|sk| sk.auth_enabled) + env.persist_config() } } -fn base_path() -> PathBuf { +pub fn base_path() -> PathBuf { match std::env::var_os("NEON_REPO_DIR") { Some(val) => PathBuf::from(val), None => PathBuf::from(".neon"), @@ -619,31 +757,3 @@ fn generate_auth_keys(private_key_path: &Path, public_key_path: &Path) -> anyhow } Ok(()) } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn simple_conf_parsing() { - let simple_conf_toml = include_str!("../simple.conf"); - let 
simple_conf_parse_result = LocalEnv::parse_config(simple_conf_toml); - assert!( - simple_conf_parse_result.is_ok(), - "failed to parse simple config {simple_conf_toml}, reason: {simple_conf_parse_result:?}" - ); - - let string_to_replace = "listen_addr = '127.0.0.1:50051'"; - let spoiled_url_str = "listen_addr = '!@$XOXO%^&'"; - let spoiled_url_toml = simple_conf_toml.replace(string_to_replace, spoiled_url_str); - assert!( - spoiled_url_toml.contains(spoiled_url_str), - "Failed to replace string {string_to_replace} in the toml file {simple_conf_toml}" - ); - let spoiled_url_parse_result = LocalEnv::parse_config(&spoiled_url_toml); - assert!( - spoiled_url_parse_result.is_err(), - "expected toml with invalid Url {spoiled_url_toml} to fail the parsing, but got {spoiled_url_parse_result:?}" - ); - } -} diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 6046c93bad..5a84763697 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -30,7 +30,7 @@ use utils::{ lsn::Lsn, }; -use crate::local_env::PageServerConf; +use crate::local_env::{NeonLocalInitPageserverConf, PageServerConf}; use crate::{background_process, local_env::LocalEnv}; /// Directory within .neon which will be used by default for LocalFs remote storage. @@ -76,9 +76,11 @@ impl PageServerNode { fn pageserver_init_make_toml( &self, - cli_overrides: toml_edit::Document, + conf: NeonLocalInitPageserverConf, ) -> anyhow::Result { - // TODO: this is a legacy code, it should be refactored to use toml_edit directly. + assert_eq!(&PageServerConf::from(&conf), &self.conf, "during neon_local init, we derive the runtime state of ps conf (self.conf) from the --config flag fully"); + + // TODO(christian): instead of what we do here, create a pageserver_api::config::ConfigToml (PR #7656) // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc. 
let pg_distrib_dir_param = format!( @@ -86,61 +88,9 @@ impl PageServerNode { self.env.pg_distrib_dir_raw().display() ); - let PageServerConf { - id, - listen_pg_addr, - listen_http_addr, - pg_auth_type, - http_auth_type, - virtual_file_io_engine, - get_vectored_impl, - get_impl, - validate_vectored_get, - } = &self.conf; - - let id = format!("id={}", id); - - let http_auth_type_param = format!("http_auth_type='{}'", http_auth_type); - let listen_http_addr_param = format!("listen_http_addr='{}'", listen_http_addr); - - let pg_auth_type_param = format!("pg_auth_type='{}'", pg_auth_type); - let listen_pg_addr_param = format!("listen_pg_addr='{}'", listen_pg_addr); - let virtual_file_io_engine = if let Some(virtual_file_io_engine) = virtual_file_io_engine { - format!("virtual_file_io_engine='{virtual_file_io_engine}'") - } else { - String::new() - }; - let get_vectored_impl = if let Some(get_vectored_impl) = get_vectored_impl { - format!("get_vectored_impl='{get_vectored_impl}'") - } else { - String::new() - }; - let get_impl = if let Some(get_impl) = get_impl { - format!("get_impl='{get_impl}'") - } else { - String::new() - }; - let validate_vectored_get = if let Some(validate_vectored_get) = validate_vectored_get { - format!("validate_vectored_get={validate_vectored_get}") - } else { - String::new() - }; - let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url()); - let mut overrides = vec![ - id, - pg_distrib_dir_param, - http_auth_type_param, - pg_auth_type_param, - listen_http_addr_param, - listen_pg_addr_param, - broker_endpoint_param, - virtual_file_io_engine, - get_vectored_impl, - get_impl, - validate_vectored_get, - ]; + let mut overrides = vec![pg_distrib_dir_param, broker_endpoint_param]; if let Some(control_plane_api) = &self.env.control_plane_api { overrides.push(format!( @@ -150,7 +100,7 @@ impl PageServerNode { // Storage controller uses the same auth as pageserver: if JWT is enabled // for us, we will also need it to talk to them. - if matches!(http_auth_type, AuthType::NeonJWT) { + if matches!(conf.http_auth_type, AuthType::NeonJWT) { let jwt_token = self .env .generate_auth_token(&Claims::new(None, Scope::GenerationsApi)) @@ -159,20 +109,23 @@ impl PageServerNode { } } - if !cli_overrides.contains_key("remote_storage") { + if !conf.other.contains_key("remote_storage") { overrides.push(format!( "remote_storage={{local_path='../{PAGESERVER_REMOTE_STORAGE_DIR}'}}" )); } - if *http_auth_type != AuthType::Trust || *pg_auth_type != AuthType::Trust { + if conf.http_auth_type != AuthType::Trust || conf.pg_auth_type != AuthType::Trust { // Keys are generated in the toplevel repo dir, pageservers' workdirs // are one level below that, so refer to keys with ../ overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned()); } // Apply the user-provided overrides - overrides.push(cli_overrides.to_string()); + overrides.push( + toml_edit::ser::to_string_pretty(&conf) + .expect("we deserialized this from toml earlier"), + ); // Turn `overrides` into a toml document. // TODO: above code is legacy code, it should be refactored to use toml_edit directly. @@ -188,8 +141,8 @@ impl PageServerNode { } /// Initializes a pageserver node by creating its config with the overrides provided. 
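Editor's note: the rewrite above drops the field-by-field string formatting and instead serializes the whole init config once with `toml_edit`, appending the result to a small set of fixed override snippets. A rough sketch of that shape, with a hypothetical cut-down `InitConf` standing in for `NeonLocalInitPageserverConf` and a made-up broker endpoint; in the real code, extra user-provided keys (`conf.other`, e.g. `remote_storage`) ride along through the same serialization:

```rust
use serde::Serialize;

// Hypothetical stand-in for NeonLocalInitPageserverConf.
#[derive(Serialize)]
struct InitConf {
    id: u64,
    listen_pg_addr: String,
    listen_http_addr: String,
}

fn build_pageserver_toml(conf: &InitConf) -> anyhow::Result<String> {
    // Fixed overrides that neon_local always injects, kept as raw TOML
    // snippets just like the surrounding code does.
    let mut overrides = vec!["broker_endpoint='http://127.0.0.1:50051'".to_string()];
    // The user-facing config is serialized in one go instead of being
    // re-formatted field by field.
    overrides.push(toml_edit::ser::to_string_pretty(conf)?);
    Ok(overrides.join("\n"))
}
```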
- pub fn initialize(&self, config_overrides: toml_edit::Document) -> anyhow::Result<()> { - self.pageserver_init(config_overrides) + pub fn initialize(&self, conf: NeonLocalInitPageserverConf) -> anyhow::Result<()> { + self.pageserver_init(conf) .with_context(|| format!("Failed to run init for pageserver node {}", self.conf.id)) } @@ -209,7 +162,7 @@ impl PageServerNode { self.start_node().await } - fn pageserver_init(&self, cli_overrides: toml_edit::Document) -> anyhow::Result<()> { + fn pageserver_init(&self, conf: NeonLocalInitPageserverConf) -> anyhow::Result<()> { let datadir = self.repo_path(); let node_id = self.conf.id; println!( @@ -221,7 +174,7 @@ impl PageServerNode { io::stdout().flush()?; let config = self - .pageserver_init_make_toml(cli_overrides) + .pageserver_init_make_toml(conf) .context("make pageserver toml")?; let config_file_path = datadir.join("pageserver.toml"); let mut config_file = std::fs::OpenOptions::new() diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 30cec4c726..f618c508bc 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -14,7 +14,7 @@ import textwrap import threading import time import uuid -from contextlib import ExitStack, closing, contextmanager +from contextlib import closing, contextmanager from dataclasses import dataclass from datetime import datetime from enum import Enum @@ -1054,14 +1054,14 @@ class NeonEnv: self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine self.pageserver_aux_file_policy = config.pageserver_aux_file_policy - # Create a config file corresponding to the options + # Create the neon_local's `NeonLocalInitConf` cfg: Dict[str, Any] = { "default_tenant_id": str(self.initial_tenant), "broker": { "listen_addr": self.broker.listen_addr(), }, - "pageservers": [], "safekeepers": [], + "pageservers": [], } if self.control_plane_api is not None: @@ -1100,6 +1100,17 @@ class NeonEnv: if config.pageserver_validate_vectored_get is not None: ps_cfg["validate_vectored_get"] = config.pageserver_validate_vectored_get + if self.pageserver_remote_storage is not None: + ps_cfg["remote_storage"] = remote_storage_to_toml_dict( + self.pageserver_remote_storage + ) + + if config.pageserver_config_override is not None: + for o in config.pageserver_config_override.split(";"): + override = toml.loads(o) + for key, value in override.items(): + ps_cfg[key] = value + # Create a corresponding NeonPageserver object self.pageservers.append( NeonPageserver( @@ -1136,7 +1147,6 @@ class NeonEnv: self.neon_cli.init( cfg, force=config.config_init_force, - pageserver_config_override=config.pageserver_config_override, ) def start(self): @@ -1722,46 +1732,22 @@ class NeonCli(AbstractNeonCli): def init( self, - config: Dict[str, Any], + init_config: Dict[str, Any], force: Optional[str] = None, - pageserver_config_override: Optional[str] = None, ) -> "subprocess.CompletedProcess[str]": - remote_storage = self.env.pageserver_remote_storage - - ps_config = {} - if remote_storage is not None: - ps_config["remote_storage"] = remote_storage_to_toml_dict(remote_storage) - - if pageserver_config_override is not None: - for o in pageserver_config_override.split(";"): - override = toml.loads(o) - for key, value in override.items(): - ps_config[key] = value - - with ExitStack() as stack: - ps_config_file = stack.enter_context(tempfile.NamedTemporaryFile(mode="w+")) - ps_config_file.write(toml.dumps(ps_config)) - ps_config_file.flush() - - neon_local_config = 
stack.enter_context(tempfile.NamedTemporaryFile(mode="w+")) - neon_local_config.write(toml.dumps(config)) - neon_local_config.flush() + with tempfile.NamedTemporaryFile(mode="w+") as init_config_tmpfile: + init_config_tmpfile.write(toml.dumps(init_config)) + init_config_tmpfile.flush() cmd = [ "init", - f"--config={neon_local_config.name}", - "--pg-version", - self.env.pg_version, - f"--pageserver-config={ps_config_file.name}", + f"--config={init_config_tmpfile.name}", ] if force is not None: cmd.extend(["--force", force]) - s3_env_vars = None - if isinstance(remote_storage, S3Storage): - s3_env_vars = remote_storage.access_env_vars() - res = self.raw_cli(cmd, extra_env_vars=s3_env_vars) + res = self.raw_cli(cmd) res.check_returncode() return res diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index be351db429..bd7e4f118f 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -2,6 +2,7 @@ import subprocess from pathlib import Path from typing import Optional +import toml from fixtures.neon_fixtures import ( DEFAULT_BRANCH_NAME, NeonEnv, @@ -12,9 +13,11 @@ from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import wait_until -def test_pageserver_init_node_id( - neon_simple_env: NeonEnv, neon_binpath: Path, pg_distrib_dir: Path -): +def test_pageserver_init_node_id(neon_simple_env: NeonEnv, neon_binpath: Path): + """ + NB: The neon_local doesn't use `--init` mode anymore, but our production + deployment still does => https://github.com/neondatabase/aws/pull/1322 + """ workdir = neon_simple_env.pageserver.workdir pageserver_config = workdir / "pageserver.toml" pageserver_bin = neon_binpath / "pageserver" @@ -28,18 +31,36 @@ def test_pageserver_init_node_id( stderr=subprocess.PIPE, ) - # remove initial config and stop existing pageserver - pageserver_config.unlink() neon_simple_env.pageserver.stop() - bad_init = run_pageserver(["--init", "-c", f'pg_distrib_dir="{pg_distrib_dir}"']) + with open(neon_simple_env.pageserver.config_toml_path, "r") as f: + ps_config = toml.load(f) + + required_config_keys = [ + "pg_distrib_dir", + "listen_pg_addr", + "listen_http_addr", + "pg_auth_type", + "http_auth_type", + ] + required_config_overrides = [ + f"--config-override={toml.dumps({k: ps_config[k]})}" for k in required_config_keys + ] + + pageserver_config.unlink() + + bad_init = run_pageserver(["--init", *required_config_overrides]) assert ( bad_init.returncode == 1 ), "pageserver should not be able to init new config without the node id" assert 'missing config value "id"' in bad_init.stderr assert not pageserver_config.exists(), "config file should not be created after init error" - good_init_cmd = ["--init", "-c", "id = 12345", "-c", f'pg_distrib_dir="{pg_distrib_dir}"'] + good_init_cmd = [ + "--init", + f"--config-override=id={ps_config['id']}", + *required_config_overrides, + ] completed_init = run_pageserver(good_init_cmd) assert ( completed_init.returncode == 0 From 0457980728d93e6c3a4fc25b6f5b6052bdff1457 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Wed, 8 May 2024 16:22:13 +0100 Subject: [PATCH 0732/1571] Fix flaky test_gc_of_remote_layers (#7647) Fixes flaky test `test_gc_of_remote_layers`, which was failing because of the `Nothing to GC` pageserver log. I looked into the fails, it seems that backround `gc_loop` sometimes started GC for initial tenant, which wasn't configured to disable GC. The fix is to not create initial tenant with enabled gc at all. 
Fixes #7538 --- test_runner/regress/test_layer_eviction.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py index 5c967fd72e..b178baea11 100644 --- a/test_runner/regress/test_layer_eviction.py +++ b/test_runner/regress/test_layer_eviction.py @@ -159,7 +159,9 @@ def test_basic_eviction( def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder): neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - env = neon_env_builder.init_start() + # don't create initial tenant, we'll create it manually with custom config + env = neon_env_builder.init_configs() + env.start() tenant_config = { "pitr_interval": "1s", # set to non-zero, so GC actually does something From 21e1a496a3f706097578de396a9107813c541001 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Wed, 8 May 2024 08:49:57 -0700 Subject: [PATCH 0733/1571] Expose LSN and replication delay as metrics (#7610) ## Problem We currently have no way to see what the current LSN of a compute its, and in case of read replicas, we don't know what the difference in LSNs is. ## Summary of changes Adds these metrics --- vm-image-spec.yaml | 44 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 41ca16f16b..56538630ac 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -244,6 +244,49 @@ files: values: [approximate_working_set_size] query: | select neon.approximate_working_set_size(false) as approximate_working_set_size; + + - metric_name: current_lsn + type: gauge + help: 'Current LSN of the database' + key_labels: + values: [lsn] + query: | + select + case + when pg_catalog.pg_is_in_recovery() + then pg_last_wal_replay_lsn() + else pg_current_wal_lsn() + end as lsn; + + - metric_name: replication_delay_bytes + type: gauge + help: 'Bytes between received and replayed LSN' + key_labels: + values: [replication_delay_bytes] + query: | + SELECT pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn()) AS replication_delay_bytes; + + - metric_name: replication_delay_seconds + type: gauge + help: 'Time since last LSN was replayed' + key_labels: + values: [replication_delay_seconds] + query: | + SELECT + CASE + WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0 + ELSE GREATEST (0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())) + END AS replication_delay_seconds; + + - metric_name: checkpoint_stats + type: gauge + help: 'Number of requested and scheduled checkpoints' + key_labels: + values: + - checkpoints_req + - checkpoints_timed + query: | + SELECT checkpoints_req, checkpoints_timed FROM pg_stat_bgwriter; - filename: neon_collector_autoscaling.yml content: | collector_name: neon_collector_autoscaling @@ -295,7 +338,6 @@ files: values: [approximate_working_set_size] query: | select neon.approximate_working_set_size(false) as approximate_working_set_size; - build: | # Build cgroup-tools # From 1173ee6a7e1168e671a6847eb94807b45c703490 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 8 May 2024 11:53:54 -0400 Subject: [PATCH 0734/1571] chore(neon_test_utils): restrict installation to superuser (#7624) The test utils should only be used during tests. Users should not be able to create this extension on their own. 
Signed-off-by: Alex Chi Z --- pgxn/neon_test_utils/neon_test_utils.control | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pgxn/neon_test_utils/neon_test_utils.control b/pgxn/neon_test_utils/neon_test_utils.control index 5f6d640835..8c5b9b5dfe 100644 --- a/pgxn/neon_test_utils/neon_test_utils.control +++ b/pgxn/neon_test_utils/neon_test_utils.control @@ -3,4 +3,5 @@ comment = 'helpers for neon testing and debugging' default_version = '1.1' module_pathname = '$libdir/neon_test_utils' relocatable = true -trusted = true +trusted = false +superuser = true From ca154d9cd843dcc10d234266be0effff091e71e7 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 8 May 2024 17:50:21 +0100 Subject: [PATCH 0735/1571] pageserver: local layer path followups (#7640) - Rename "filename" types which no longer map directly to a filename (LayerFileName -> LayerName) - Add a -v1- part to local layer paths to smooth the path to future updates (we anticipate a -v2- that uses checksums later) - Rename methods that refer to the string-ized version of a LayerName to no longer be called "filename" - Refactor reconcile() function to use a LocalLayerFileMetadata type that includes the local path, rather than carrying local path separately in a tuple and unwrap()'ing it later. --- pageserver/benches/bench_layer_map.rs | 4 +- pageserver/ctl/src/index_part.rs | 4 +- pageserver/src/deletion_queue.rs | 16 +-- pageserver/src/deletion_queue/list_writer.rs | 4 +- pageserver/src/disk_usage_eviction_task.rs | 8 +- pageserver/src/http/routes.rs | 6 +- .../src/tenant/remote_timeline_client.rs | 64 +++++----- .../tenant/remote_timeline_client/download.rs | 4 +- .../tenant/remote_timeline_client/index.rs | 6 +- pageserver/src/tenant/secondary.rs | 4 +- pageserver/src/tenant/secondary/downloader.rs | 12 +- pageserver/src/tenant/secondary/heatmap.rs | 8 +- pageserver/src/tenant/storage_layer.rs | 20 +-- .../src/tenant/storage_layer/delta_layer.rs | 14 ++- .../src/tenant/storage_layer/image_layer.rs | 21 ++-- pageserver/src/tenant/storage_layer/layer.rs | 39 +++--- .../src/tenant/storage_layer/layer_desc.rs | 28 ++--- .../{filename.rs => layer_name.rs} | 115 +++++++++--------- pageserver/src/tenant/timeline.rs | 71 +++++------ .../src/tenant/timeline/detach_ancestor.rs | 2 +- pageserver/src/tenant/timeline/init.rs | 93 ++++++++------ .../src/tenant/timeline/layer_manager.rs | 2 +- pageserver/src/tenant/upload_queue.rs | 8 +- s3_scrubber/src/checks.rs | 23 ++-- s3_scrubber/src/tenant_snapshot.rs | 18 +-- test_runner/fixtures/neon_fixtures.py | 4 +- test_runner/fixtures/pageserver/types.py | 29 +++-- .../regress/test_layers_from_future.py | 8 +- .../regress/test_pageserver_generations.py | 4 +- 29 files changed, 324 insertions(+), 315 deletions(-) rename pageserver/src/tenant/storage_layer/{filename.rs => layer_name.rs} (72%) diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index 5d05af0c00..1d02aa7709 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -1,7 +1,7 @@ use pageserver::keyspace::{KeyPartitioning, KeySpace}; use pageserver::repository::Key; use pageserver::tenant::layer_map::LayerMap; -use pageserver::tenant::storage_layer::LayerFileName; +use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::storage_layer::PersistentLayerDesc; use pageserver_api::shard::TenantShardId; use rand::prelude::{SeedableRng, SliceRandom, StdRng}; @@ -28,7 +28,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap { let 
mut updates = layer_map.batch_update(); for fname in filenames { let fname = fname.unwrap(); - let fname = LayerFileName::from_str(&fname).unwrap(); + let fname = LayerName::from_str(&fname).unwrap(); let layer = PersistentLayerDesc::from(fname); let lsn_range = layer.get_lsn_range(); diff --git a/pageserver/ctl/src/index_part.rs b/pageserver/ctl/src/index_part.rs index 20e5572914..0d010eb009 100644 --- a/pageserver/ctl/src/index_part.rs +++ b/pageserver/ctl/src/index_part.rs @@ -3,7 +3,7 @@ use std::collections::HashMap; use anyhow::Context; use camino::Utf8PathBuf; use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata; -use pageserver::tenant::storage_layer::LayerFileName; +use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::{metadata::TimelineMetadata, IndexPart}; use utils::lsn::Lsn; @@ -19,7 +19,7 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> { let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?; #[derive(serde::Serialize)] struct Output<'a> { - layer_metadata: &'a HashMap, + layer_metadata: &'a HashMap, disk_consistent_lsn: Lsn, timeline_metadata: &'a TimelineMetadata, } diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index e3c11cb299..c937309d83 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -38,7 +38,7 @@ use deleter::DeleterMessage; use list_writer::ListWriterQueueMessage; use validator::ValidatorQueueMessage; -use crate::{config::PageServerConf, tenant::storage_layer::LayerFileName}; +use crate::{config::PageServerConf, tenant::storage_layer::LayerName}; // TODO: configurable for how long to wait before executing deletions @@ -479,7 +479,7 @@ impl DeletionQueueClient { tenant_shard_id: TenantShardId, timeline_id: TimelineId, current_generation: Generation, - layers: Vec<(LayerFileName, LayerFileMetadata)>, + layers: Vec<(LayerName, LayerFileMetadata)>, ) -> Result<(), DeletionQueueError> { if current_generation.is_none() { debug!("Enqueuing deletions in legacy mode, skipping queue"); @@ -511,7 +511,7 @@ impl DeletionQueueClient { tenant_shard_id: TenantShardId, timeline_id: TimelineId, current_generation: Generation, - layers: Vec<(LayerFileName, LayerFileMetadata)>, + layers: Vec<(LayerName, LayerFileMetadata)>, ) -> Result<(), DeletionQueueError> { metrics::DELETION_QUEUE .keys_submitted @@ -734,20 +734,20 @@ mod test { use crate::{ control_plane_client::RetryForeverError, repository::Key, - tenant::{harness::TenantHarness, storage_layer::DeltaFileName}, + tenant::{harness::TenantHarness, storage_layer::DeltaLayerName}, }; use super::*; pub const TIMELINE_ID: TimelineId = TimelineId::from_array(hex!("11223344556677881122334455667788")); - pub const EXAMPLE_LAYER_NAME: LayerFileName = LayerFileName::Delta(DeltaFileName { + pub const EXAMPLE_LAYER_NAME: LayerName = LayerName::Delta(DeltaLayerName { key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF), lsn_range: Lsn(0x00000000016B59D8)..Lsn(0x00000000016B5A51), }); // When you need a second layer in a test. 
- pub const EXAMPLE_LAYER_NAME_ALT: LayerFileName = LayerFileName::Delta(DeltaFileName { + pub const EXAMPLE_LAYER_NAME_ALT: LayerName = LayerName::Delta(DeltaLayerName { key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF), lsn_range: Lsn(0x00000000016B5A51)..Lsn(0x00000000016B5A61), }); @@ -797,7 +797,7 @@ mod test { /// Returns remote layer file name, suitable for use in assert_remote_files fn write_remote_layer( &self, - file_name: LayerFileName, + file_name: LayerName, gen: Generation, ) -> anyhow::Result { let tenant_shard_id = self.harness.tenant_shard_id; @@ -952,7 +952,7 @@ mod test { let client = ctx.deletion_queue.new_client(); client.recover(HashMap::new())?; - let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); + let layer_file_name_1: LayerName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); let tenant_shard_id = ctx.harness.tenant_shard_id; let content: Vec = "victim1 contents".into(); diff --git a/pageserver/src/deletion_queue/list_writer.rs b/pageserver/src/deletion_queue/list_writer.rs index 3a3d600ac2..ae3b2c9180 100644 --- a/pageserver/src/deletion_queue/list_writer.rs +++ b/pageserver/src/deletion_queue/list_writer.rs @@ -34,7 +34,7 @@ use crate::deletion_queue::TEMP_SUFFIX; use crate::metrics; use crate::tenant::remote_timeline_client::remote_layer_path; use crate::tenant::remote_timeline_client::LayerFileMetadata; -use crate::tenant::storage_layer::LayerFileName; +use crate::tenant::storage_layer::LayerName; use crate::virtual_file::on_fatal_io_error; use crate::virtual_file::MaybeFatalIo; @@ -59,7 +59,7 @@ pub(super) struct DeletionOp { // `layers` and `objects` are both just lists of objects. `layers` is used if you do not // have a config object handy to project it to a remote key, and need the consuming worker // to do it for you. 
- pub(super) layers: Vec<(LayerFileName, LayerFileMetadata)>, + pub(super) layers: Vec<(LayerName, LayerFileMetadata)>, pub(super) objects: Vec, /// The _current_ generation of the Tenant shard attachment in which we are enqueuing diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 400930245b..ebeb8bbb20 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -64,7 +64,7 @@ use crate::{ mgr::TenantManager, remote_timeline_client::LayerFileMetadata, secondary::SecondaryTenant, - storage_layer::{AsLayerDesc, EvictionError, Layer, LayerFileName}, + storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName}, }, }; @@ -604,7 +604,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( pub(crate) struct EvictionSecondaryLayer { pub(crate) secondary_tenant: Arc, pub(crate) timeline_id: TimelineId, - pub(crate) name: LayerFileName, + pub(crate) name: LayerName, pub(crate) metadata: LayerFileMetadata, } @@ -637,9 +637,9 @@ impl EvictionLayer { } } - pub(crate) fn get_name(&self) -> LayerFileName { + pub(crate) fn get_name(&self) -> LayerName { match self { - Self::Attached(l) => l.layer_desc().filename(), + Self::Attached(l) => l.layer_desc().layer_name(), Self::Secondary(sl) => sl.name.clone(), } } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 83b7b8a45e..a8ca642dc5 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -63,7 +63,7 @@ use crate::tenant::remote_timeline_client::list_remote_timelines; use crate::tenant::secondary::SecondaryController; use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; -use crate::tenant::storage_layer::LayerFileName; +use crate::tenant::storage_layer::LayerName; use crate::tenant::timeline::CompactFlags; use crate::tenant::timeline::Timeline; use crate::tenant::SpawnMode; @@ -1229,7 +1229,7 @@ async fn layer_download_handler( let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let layer_file_name = get_request_param(&request, "layer_file_name")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; - let layer_name = LayerFileName::from_str(layer_file_name) + let layer_name = LayerName::from_str(layer_file_name) .map_err(|s| ApiError::BadRequest(anyhow::anyhow!(s)))?; let state = get_state(&request); @@ -1261,7 +1261,7 @@ async fn evict_timeline_layer_handler( let layer_file_name = get_request_param(&request, "layer_file_name")?; let state = get_state(&request); - let layer_name = LayerFileName::from_str(layer_file_name) + let layer_name = LayerName::from_str(layer_file_name) .map_err(|s| ApiError::BadRequest(anyhow::anyhow!(s)))?; let timeline = diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 356a0dc51c..bbe4e16378 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -240,7 +240,7 @@ use utils::id::{TenantId, TimelineId}; use self::index::IndexPart; use super::metadata::MetadataUpdate; -use super::storage_layer::{Layer, LayerFileName, ResidentLayer}; +use super::storage_layer::{Layer, LayerName, ResidentLayer}; use super::upload_queue::SetDeletedFlagProgress; use super::Generation; @@ -503,7 +503,7 @@ impl RemoteTimelineClient { /// On success, returns the size of the downloaded file. 
pub async fn download_layer_file( &self, - layer_file_name: &LayerFileName, + layer_file_name: &LayerName, layer_metadata: &LayerFileMetadata, cancel: &CancellationToken, ctx: &RequestContext, @@ -677,7 +677,7 @@ impl RemoteTimelineClient { for layer in layers { upload_queue .latest_files - .insert(layer.layer_desc().filename(), layer.metadata()); + .insert(layer.layer_desc().layer_name(), layer.metadata()); } self.schedule_index_upload(upload_queue); @@ -713,7 +713,7 @@ impl RemoteTimelineClient { upload_queue .latest_files - .insert(layer.layer_desc().filename(), metadata.clone()); + .insert(layer.layer_desc().layer_name(), metadata.clone()); upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1; info!( @@ -737,7 +737,7 @@ impl RemoteTimelineClient { /// successfully. pub fn schedule_layer_file_deletion( self: &Arc, - names: &[LayerFileName], + names: &[LayerName], ) -> anyhow::Result<()> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; @@ -765,7 +765,7 @@ impl RemoteTimelineClient { // the layer files as "dangling". this is fine, at worst case we create work for the // scrubber. - let names = gc_layers.iter().map(|x| x.layer_desc().filename()); + let names = gc_layers.iter().map(|x| x.layer_desc().layer_name()); self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names); @@ -780,9 +780,9 @@ impl RemoteTimelineClient { self: &Arc, upload_queue: &mut UploadQueueInitialized, names: I, - ) -> Vec<(LayerFileName, LayerFileMetadata)> + ) -> Vec<(LayerName, LayerFileMetadata)> where - I: IntoIterator, + I: IntoIterator, { // Decorate our list of names with each name's metadata, dropping // names that are unexpectedly missing from our metadata. This metadata @@ -832,7 +832,7 @@ impl RemoteTimelineClient { /// `index_part.json` with [`Self::schedule_gc_update`] or [`Self::schedule_compaction_update`]. pub(crate) fn schedule_deletion_of_unlinked( self: &Arc, - layers: Vec<(LayerFileName, LayerFileMetadata)>, + layers: Vec<(LayerName, LayerFileMetadata)>, ) -> anyhow::Result<()> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; @@ -845,7 +845,7 @@ impl RemoteTimelineClient { fn schedule_deletion_of_unlinked0( self: &Arc, upload_queue: &mut UploadQueueInitialized, - mut with_metadata: Vec<(LayerFileName, LayerFileMetadata)>, + mut with_metadata: Vec<(LayerName, LayerFileMetadata)>, ) { // Filter out any layers which were not created by this tenant shard. 
These are // layers that originate from some ancestor shard after a split, and may still @@ -914,7 +914,7 @@ impl RemoteTimelineClient { self.schedule_layer_file_upload0(upload_queue, layer.clone()); } - let names = compacted_from.iter().map(|x| x.layer_desc().filename()); + let names = compacted_from.iter().map(|x| x.layer_desc().layer_name()); self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names); self.launch_queued_tasks(upload_queue); @@ -1144,7 +1144,7 @@ impl RemoteTimelineClient { &self.tenant_shard_id.tenant_id, &self.timeline_id, self.tenant_shard_id.to_index(), - &uploaded.layer_desc().filename(), + &uploaded.layer_desc().layer_name(), uploaded.metadata().generation, ); @@ -1185,7 +1185,7 @@ impl RemoteTimelineClient { .get_timeline_id() .expect("Source timeline should be alive"), self.tenant_shard_id.to_index(), - &adopted.layer_desc().filename(), + &adopted.layer_desc().layer_name(), adopted.metadata().generation, ); @@ -1193,7 +1193,7 @@ impl RemoteTimelineClient { &self.tenant_shard_id.tenant_id, &self.timeline_id, self.tenant_shard_id.to_index(), - &adopted_as.layer_desc().filename(), + &adopted_as.layer_desc().layer_name(), adopted_as.metadata().generation, ); @@ -1527,7 +1527,7 @@ impl RemoteTimelineClient { &self.tenant_shard_id.tenant_id, &self.timeline_id, layer_metadata.shard, - &layer.layer_desc().filename(), + &layer.layer_desc().layer_name(), layer_metadata.generation, ); @@ -1896,14 +1896,14 @@ pub fn remote_layer_path( tenant_id: &TenantId, timeline_id: &TimelineId, shard: ShardIndex, - layer_file_name: &LayerFileName, + layer_file_name: &LayerName, generation: Generation, ) -> RemotePath { // Generation-aware key format let path = format!( "tenants/{tenant_id}{0}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{1}{2}", shard.get_suffix(), - layer_file_name.file_name(), + layer_file_name, generation.get_suffix() ); @@ -2000,8 +2000,8 @@ mod tests { TimelineMetadata::from_bytes(&metadata.to_bytes().unwrap()).unwrap() } - fn assert_file_list(a: &HashSet, b: &[&str]) { - let mut avec: Vec = a.iter().map(|x| x.file_name()).collect(); + fn assert_file_list(a: &HashSet, b: &[&str]) { + let mut avec: Vec = a.iter().map(|x| x.to_string()).collect(); avec.sort(); let mut bvec = b.to_vec(); @@ -2127,7 +2127,7 @@ mod tests { .layer_metadata .keys() .map(|f| f.to_owned()) - .collect::>(); + .collect::>(); let initial_layer = { assert!(initial_layers.len() == 1); initial_layers.into_iter().next().unwrap() @@ -2153,7 +2153,7 @@ mod tests { ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59DA-00000000016B5A53".parse().unwrap(), dummy_contents("baz")) ] .into_iter() - .map(|(name, contents): (LayerFileName, Vec)| { + .map(|(name, contents): (LayerName, Vec)| { let local_path = local_layer_path( harness.conf, @@ -2234,9 +2234,9 @@ mod tests { .map(|f| f.to_owned()) .collect(), &[ - &initial_layer.file_name(), - &layers[0].layer_desc().filename().file_name(), - &layers[1].layer_desc().filename().file_name(), + &initial_layer.to_string(), + &layers[0].layer_desc().layer_name().to_string(), + &layers[1].layer_desc().layer_name().to_string(), ], ); assert_eq!(index_part.metadata, metadata); @@ -2250,7 +2250,7 @@ mod tests { // keep using schedule_layer_file_deletion because we don't have a way to wait for the // spawn_blocking started by the drop. 
client - .schedule_layer_file_deletion(&[layers[0].layer_desc().filename()]) + .schedule_layer_file_deletion(&[layers[0].layer_desc().layer_name()]) .unwrap(); { let mut guard = client.upload_queue.lock().unwrap(); @@ -2268,9 +2268,9 @@ mod tests { } assert_remote_files( &[ - &initial_layer.file_name(), - &layers[0].layer_desc().filename().file_name(), - &layers[1].layer_desc().filename().file_name(), + &initial_layer.to_string(), + &layers[0].layer_desc().layer_name().to_string(), + &layers[1].layer_desc().layer_name().to_string(), "index_part.json", ], &remote_timeline_dir, @@ -2283,9 +2283,9 @@ mod tests { assert_remote_files( &[ - &initial_layer.file_name(), - &layers[1].layer_desc().filename().file_name(), - &layers[2].layer_desc().filename().file_name(), + &initial_layer.to_string(), + &layers[1].layer_desc().layer_name().to_string(), + &layers[2].layer_desc().layer_name().to_string(), "index_part.json", ], &remote_timeline_dir, @@ -2305,7 +2305,7 @@ mod tests { } = TestSetup::new("metrics").await.unwrap(); let client = timeline.remote_client.as_ref().unwrap(); - let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); + let layer_file_name_1: LayerName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); let local_path = local_layer_path( harness.conf, &timeline.tenant_shard_id, diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index c86b22d481..b464437422 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -22,7 +22,7 @@ use crate::context::RequestContext; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; use crate::tenant::storage_layer::layer::local_layer_path; -use crate::tenant::storage_layer::LayerFileName; +use crate::tenant::storage_layer::LayerName; use crate::tenant::Generation; use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}; use crate::TEMP_FILE_SUFFIX; @@ -48,7 +48,7 @@ pub async fn download_layer_file<'a>( storage: &'a GenericRemoteStorage, tenant_shard_id: TenantShardId, timeline_id: TimelineId, - layer_file_name: &'a LayerFileName, + layer_file_name: &'a LayerName, layer_metadata: &'a LayerFileMetadata, cancel: &CancellationToken, ctx: &RequestContext, diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 7e0619945f..3e05905afa 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -8,7 +8,7 @@ use chrono::NaiveDateTime; use serde::{Deserialize, Serialize}; use crate::tenant::metadata::TimelineMetadata; -use crate::tenant::storage_layer::LayerFileName; +use crate::tenant::storage_layer::LayerName; use crate::tenant::upload_queue::UploadQueueInitialized; use crate::tenant::Generation; use pageserver_api::shard::ShardIndex; @@ -75,7 +75,7 @@ pub struct IndexPart { /// /// Older versions of `IndexPart` will not have this property or have only a part of metadata /// that latest version stores. - pub layer_metadata: HashMap, + pub layer_metadata: HashMap, // 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata. 
// It's duplicated for convenience when reading the serialized structure, but is @@ -104,7 +104,7 @@ impl IndexPart { pub const FILE_NAME: &'static str = "index_part.json"; fn new( - layers_and_metadata: &HashMap, + layers_and_metadata: &HashMap, disk_consistent_lsn: Lsn, metadata: TimelineMetadata, ) -> Self { diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index 0bb25f0ace..7075044baf 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -23,7 +23,7 @@ use super::{ mgr::TenantManager, remote_timeline_client::LayerFileMetadata, span::debug_assert_current_span_has_tenant_id, - storage_layer::{layer::local_layer_path, LayerFileName}, + storage_layer::{layer::local_layer_path, LayerName}, }; use pageserver_api::{ @@ -182,7 +182,7 @@ impl SecondaryTenant { self: &Arc, conf: &PageServerConf, timeline_id: TimelineId, - name: LayerFileName, + name: LayerName, metadata: LayerFileMetadata, ) { debug_assert_current_span_has_tenant_id(); diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 092630e74d..2a8f83be95 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -22,7 +22,7 @@ use crate::{ FAILED_REMOTE_OP_RETRIES, }, span::debug_assert_current_span_has_tenant_id, - storage_layer::{layer::local_layer_path, LayerFileName}, + storage_layer::{layer::local_layer_path, LayerName}, tasks::{warn_when_period_overrun, BackgroundLoopKind}, }, virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}, @@ -111,7 +111,7 @@ impl OnDiskState { _conf: &'static PageServerConf, _tenant_shard_id: &TenantShardId, _imeline_id: &TimelineId, - _ame: LayerFileName, + _ame: LayerName, metadata: LayerFileMetadata, access_time: SystemTime, ) -> Self { @@ -124,10 +124,10 @@ impl OnDiskState { #[derive(Debug, Clone, Default)] pub(super) struct SecondaryDetailTimeline { - pub(super) on_disk_layers: HashMap, + pub(super) on_disk_layers: HashMap, /// We remember when layers were evicted, to prevent re-downloading them. - pub(super) evicted_at: HashMap, + pub(super) evicted_at: HashMap, } /// This state is written by the secondary downloader, it is opaque @@ -997,7 +997,7 @@ async fn init_timeline_state( // As we iterate through layers found on disk, we will look up their metadata from this map. // Layers not present in metadata will be discarded. 
- let heatmap_metadata: HashMap<&LayerFileName, &HeatMapLayer> = + let heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> = heatmap.layers.iter().map(|l| (&l.name, l)).collect(); while let Some(dentry) = dir @@ -1034,7 +1034,7 @@ async fn init_timeline_state( continue; } - match LayerFileName::from_str(file_name) { + match LayerName::from_str(file_name) { Ok(name) => { let remote_meta = heatmap_metadata.get(&name); match remote_meta { diff --git a/pageserver/src/tenant/secondary/heatmap.rs b/pageserver/src/tenant/secondary/heatmap.rs index 73cdf6c6d4..ca91ec24c6 100644 --- a/pageserver/src/tenant/secondary/heatmap.rs +++ b/pageserver/src/tenant/secondary/heatmap.rs @@ -1,8 +1,6 @@ use std::time::SystemTime; -use crate::tenant::{ - remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerFileName, -}; +use crate::tenant::{remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerName}; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr, TimestampSeconds}; @@ -31,7 +29,7 @@ pub(crate) struct HeatMapTimeline { #[serde_as] #[derive(Serialize, Deserialize)] pub(crate) struct HeatMapLayer { - pub(super) name: LayerFileName, + pub(super) name: LayerName, pub(super) metadata: IndexLayerMetadata, #[serde_as(as = "TimestampSeconds")] @@ -42,7 +40,7 @@ pub(crate) struct HeatMapLayer { impl HeatMapLayer { pub(crate) fn new( - name: LayerFileName, + name: LayerName, metadata: IndexLayerMetadata, access_time: SystemTime, ) -> Self { diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 4f1b56ef9f..94a5e9ec47 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -1,11 +1,11 @@ //! Common traits and structs for layers pub mod delta_layer; -mod filename; pub mod image_layer; pub(crate) mod inmemory_layer; pub(crate) mod layer; mod layer_desc; +mod layer_name; use crate::context::{AccessStatsBehavior, RequestContext}; use crate::repository::Value; @@ -34,10 +34,10 @@ use utils::rate_limit::RateLimit; use utils::{id::TimelineId, lsn::Lsn}; pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef}; -pub use filename::{DeltaFileName, ImageFileName, LayerFileName}; pub use image_layer::{ImageLayer, ImageLayerWriter}; pub use inmemory_layer::InMemoryLayer; pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey}; +pub use layer_name::{DeltaLayerName, ImageLayerName, LayerName}; pub(crate) use layer::{EvictionError, Layer, ResidentLayer}; @@ -646,8 +646,8 @@ pub mod tests { use super::*; - impl From for PersistentLayerDesc { - fn from(value: DeltaFileName) -> Self { + impl From for PersistentLayerDesc { + fn from(value: DeltaLayerName) -> Self { PersistentLayerDesc::new_delta( TenantShardId::from([0; 18]), TimelineId::from_array([0; 16]), @@ -658,8 +658,8 @@ pub mod tests { } } - impl From for PersistentLayerDesc { - fn from(value: ImageFileName) -> Self { + impl From for PersistentLayerDesc { + fn from(value: ImageLayerName) -> Self { PersistentLayerDesc::new_img( TenantShardId::from([0; 18]), TimelineId::from_array([0; 16]), @@ -670,11 +670,11 @@ pub mod tests { } } - impl From for PersistentLayerDesc { - fn from(value: LayerFileName) -> Self { + impl From for PersistentLayerDesc { + fn from(value: LayerName) -> Self { match value { - LayerFileName::Delta(d) => Self::from(d), - LayerFileName::Image(i) => Self::from(i), + LayerName::Delta(d) => Self::from(d), + LayerName::Image(i) => Self::from(i), } } } diff --git 
a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 6fd96b0e2f..c38c9bb656 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -57,6 +57,7 @@ use std::fs::File; use std::io::SeekFrom; use std::ops::Range; use std::os::unix::fs::FileExt; +use std::str::FromStr; use std::sync::Arc; use tokio::sync::OnceCell; use tracing::*; @@ -68,7 +69,8 @@ use utils::{ }; use super::{ - AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer, ValuesReconstructState, + AsLayerDesc, LayerAccessStats, LayerName, PersistentLayerDesc, ResidentLayer, + ValuesReconstructState, }; /// @@ -309,13 +311,13 @@ impl DeltaLayer { .and_then(|res| res)?; // not production code - let actual_filename = path.file_name().unwrap().to_owned(); - let expected_filename = self.layer_desc().filename().file_name(); + let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap(); + let expected_layer_name = self.layer_desc().layer_name(); - if actual_filename != expected_filename { + if actual_layer_name != expected_layer_name { println!("warning: filename does not match what is expected from in-file summary"); - println!("actual: {:?}", actual_filename); - println!("expected: {:?}", expected_filename); + println!("actual: {:?}", actual_layer_name.to_string()); + println!("expected: {:?}", expected_layer_name.to_string()); } Ok(Arc::new(loaded)) diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 1477a1fc33..c9874873e4 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -54,6 +54,7 @@ use std::fs::File; use std::io::SeekFrom; use std::ops::Range; use std::os::unix::prelude::FileExt; +use std::str::FromStr; use std::sync::Arc; use tokio::sync::OnceCell; use tokio_stream::StreamExt; @@ -65,8 +66,10 @@ use utils::{ lsn::Lsn, }; -use super::filename::ImageFileName; -use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer, ValuesReconstructState}; +use super::layer_name::ImageLayerName; +use super::{ + AsLayerDesc, Layer, LayerName, PersistentLayerDesc, ResidentLayer, ValuesReconstructState, +}; /// /// Header stored in the beginning of the file @@ -231,7 +234,7 @@ impl ImageLayer { conf: &PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, - fname: &ImageFileName, + fname: &ImageLayerName, ) -> Utf8PathBuf { let rand_string: String = rand::thread_rng() .sample_iter(&Alphanumeric) @@ -267,13 +270,13 @@ impl ImageLayer { .and_then(|res| res)?; // not production code - let actual_filename = path.file_name().unwrap().to_owned(); - let expected_filename = self.layer_desc().filename().file_name(); + let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap(); + let expected_layer_name = self.layer_desc().layer_name(); - if actual_filename != expected_filename { + if actual_layer_name != expected_layer_name { println!("warning: filename does not match what is expected from in-file summary"); - println!("actual: {:?}", actual_filename); - println!("expected: {:?}", expected_filename); + println!("actual: {:?}", actual_layer_name.to_string()); + println!("expected: {:?}", expected_layer_name.to_string()); } Ok(loaded) @@ -635,7 +638,7 @@ impl ImageLayerWriterInner { conf, timeline_id, tenant_shard_id, - &ImageFileName { + &ImageLayerName { key_range: key_range.clone(), lsn, }, diff --git 
a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index b5e69db7f4..b5b0260327 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -25,7 +25,7 @@ use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline}; use super::delta_layer::{self, DeltaEntry}; use super::image_layer; use super::{ - AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerFileName, PersistentLayerDesc, + AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerName, PersistentLayerDesc, ValueReconstructResult, ValueReconstructState, ValuesReconstructState, }; @@ -128,19 +128,20 @@ pub(crate) fn local_layer_path( conf: &PageServerConf, tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, - layer_file_name: &LayerFileName, + layer_file_name: &LayerName, _generation: &Generation, ) -> Utf8PathBuf { let timeline_path = conf.timeline_path(tenant_shard_id, timeline_id); - timeline_path.join(layer_file_name.file_name()) + timeline_path.join(layer_file_name.to_string()) - // TOOD: include generation in the name in now+1 releases. - // timeline_path.join(format!( - // "{}{}", - // layer_file_name.file_name(), - // generation.get_suffix() - // )) + // TODO: switch to enabling new-style layer paths after next release + // if generation.is_none() { + // // Without a generation, we may only use legacy path style + // timeline_path.join(layer_file_name.to_string()) + // } else { + // timeline_path.join(format!("{}-v1{}", layer_file_name, generation.get_suffix())) + // } } impl Layer { @@ -148,7 +149,7 @@ impl Layer { pub(crate) fn for_evicted( conf: &'static PageServerConf, timeline: &Arc, - file_name: LayerFileName, + file_name: LayerName, metadata: LayerFileMetadata, ) -> Self { let local_path = local_layer_path( @@ -189,7 +190,7 @@ impl Layer { conf: &'static PageServerConf, timeline: &Arc, local_path: Utf8PathBuf, - file_name: LayerFileName, + file_name: LayerName, metadata: LayerFileMetadata, ) -> ResidentLayer { let desc = PersistentLayerDesc::from_filename( @@ -261,7 +262,7 @@ impl Layer { conf, &timeline.tenant_shard_id, &timeline.timeline_id, - &desc.filename(), + &desc.layer_name(), &timeline.generation, ); @@ -689,7 +690,7 @@ impl Drop for LayerInner { let span = tracing::info_span!(parent: None, "layer_delete", tenant_id = %self.layer_desc().tenant_shard_id.tenant_id, shard_id=%self.layer_desc().tenant_shard_id.shard_slug(), timeline_id = %self.layer_desc().timeline_id); let path = std::mem::take(&mut self.path); - let file_name = self.layer_desc().filename(); + let file_name = self.layer_desc().layer_name(); let file_size = self.layer_desc().file_size; let timeline = self.timeline.clone(); let meta = self.metadata(); @@ -782,7 +783,9 @@ impl LayerInner { LayerInner { conf, - debug_str: { format!("timelines/{}/{}", timeline.timeline_id, desc.filename()).into() }, + debug_str: { + format!("timelines/{}/{}", timeline.timeline_id, desc.layer_name()).into() + }, path: local_path, desc, timeline: Arc::downgrade(timeline), @@ -1120,7 +1123,7 @@ impl LayerInner { let result = client .download_layer_file( - &self.desc.filename(), + &self.desc.layer_name(), &self.metadata(), &timeline.cancel, ctx, @@ -1257,7 +1260,7 @@ impl LayerInner { } fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo { - let layer_file_name = self.desc.filename().file_name(); + let layer_name = self.desc.layer_name().to_string(); let resident = self .inner @@ -1271,7 +1274,7 @@ impl LayerInner { let lsn_range = 
&self.desc.lsn_range; HistoricLayerInfo::Delta { - layer_file_name, + layer_file_name: layer_name, layer_file_size: self.desc.file_size, lsn_start: lsn_range.start, lsn_end: lsn_range.end, @@ -1282,7 +1285,7 @@ impl LayerInner { let lsn = self.desc.image_layer_lsn(); HistoricLayerInfo::Image { - layer_file_name, + layer_file_name: layer_name, layer_file_size: self.desc.file_size, lsn_start: lsn, remote: !resident, diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs index c375923e81..a89b66e4a1 100644 --- a/pageserver/src/tenant/storage_layer/layer_desc.rs +++ b/pageserver/src/tenant/storage_layer/layer_desc.rs @@ -5,7 +5,7 @@ use utils::{id::TimelineId, lsn::Lsn}; use crate::repository::Key; -use super::{DeltaFileName, ImageFileName, LayerFileName}; +use super::{DeltaLayerName, ImageLayerName, LayerName}; use serde::{Deserialize, Serialize}; @@ -51,7 +51,7 @@ impl PersistentLayerDesc { } pub fn short_id(&self) -> impl Display { - self.filename() + self.layer_name() } #[cfg(test)] @@ -103,14 +103,14 @@ impl PersistentLayerDesc { pub fn from_filename( tenant_shard_id: TenantShardId, timeline_id: TimelineId, - filename: LayerFileName, + filename: LayerName, file_size: u64, ) -> Self { match filename { - LayerFileName::Image(i) => { + LayerName::Image(i) => { Self::new_img(tenant_shard_id, timeline_id, i.key_range, i.lsn, file_size) } - LayerFileName::Delta(d) => Self::new_delta( + LayerName::Delta(d) => Self::new_delta( tenant_shard_id, timeline_id, d.key_range, @@ -132,34 +132,34 @@ impl PersistentLayerDesc { lsn..(lsn + 1) } - /// Get a delta file name for this layer. + /// Get a delta layer name for this layer. /// /// Panic: if this is not a delta layer. - pub fn delta_file_name(&self) -> DeltaFileName { + pub fn delta_layer_name(&self) -> DeltaLayerName { assert!(self.is_delta); - DeltaFileName { + DeltaLayerName { key_range: self.key_range.clone(), lsn_range: self.lsn_range.clone(), } } - /// Get a delta file name for this layer. + /// Get a image layer name for this layer. 
/// /// Panic: if this is not an image layer, or the lsn range is invalid - pub fn image_file_name(&self) -> ImageFileName { + pub fn image_layer_name(&self) -> ImageLayerName { assert!(!self.is_delta); assert!(self.lsn_range.start + 1 == self.lsn_range.end); - ImageFileName { + ImageLayerName { key_range: self.key_range.clone(), lsn: self.lsn_range.start, } } - pub fn filename(&self) -> LayerFileName { + pub fn layer_name(&self) -> LayerName { if self.is_delta { - self.delta_file_name().into() + self.delta_layer_name().into() } else { - self.image_file_name().into() + self.image_layer_name().into() } } diff --git a/pageserver/src/tenant/storage_layer/filename.rs b/pageserver/src/tenant/storage_layer/layer_name.rs similarity index 72% rename from pageserver/src/tenant/storage_layer/filename.rs rename to pageserver/src/tenant/storage_layer/layer_name.rs index fff66a9d07..c733404693 100644 --- a/pageserver/src/tenant/storage_layer/filename.rs +++ b/pageserver/src/tenant/storage_layer/layer_name.rs @@ -15,29 +15,29 @@ use super::PersistentLayerDesc; // Note: Timeline::load_layer_map() relies on this sort order #[derive(PartialEq, Eq, Clone, Hash)] -pub struct DeltaFileName { +pub struct DeltaLayerName { pub key_range: Range, pub lsn_range: Range, } -impl std::fmt::Debug for DeltaFileName { +impl std::fmt::Debug for DeltaLayerName { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { use super::RangeDisplayDebug; - f.debug_struct("DeltaFileName") + f.debug_struct("DeltaLayerName") .field("key_range", &RangeDisplayDebug(&self.key_range)) .field("lsn_range", &self.lsn_range) .finish() } } -impl PartialOrd for DeltaFileName { +impl PartialOrd for DeltaLayerName { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } -impl Ord for DeltaFileName { +impl Ord for DeltaLayerName { fn cmp(&self, other: &Self) -> Ordering { let mut cmp = self.key_range.start.cmp(&other.key_range.start); if cmp != Ordering::Equal { @@ -57,16 +57,14 @@ impl Ord for DeltaFileName { } } -/// Represents the filename of a DeltaLayer +/// Represents the region of the LSN-Key space covered by a DeltaLayer /// /// ```text /// -__- /// ``` -impl DeltaFileName { - /// - /// Parse a string as a delta file name. Returns None if the filename does not - /// match the expected pattern. - /// +impl DeltaLayerName { + /// Parse the part of a delta layer's file name that represents the LayerName. Returns None + /// if the filename does not match the expected pattern. pub fn parse_str(fname: &str) -> Option { let mut parts = fname.split("__"); let mut key_parts = parts.next()?.split('-'); @@ -105,14 +103,14 @@ impl DeltaFileName { // or panic? 
} - Some(DeltaFileName { + Some(DeltaLayerName { key_range: key_start..key_end, lsn_range: start_lsn..end_lsn, }) } } -impl fmt::Display for DeltaFileName { +impl fmt::Display for DeltaLayerName { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( f, @@ -126,29 +124,29 @@ impl fmt::Display for DeltaFileName { } #[derive(PartialEq, Eq, Clone, Hash)] -pub struct ImageFileName { +pub struct ImageLayerName { pub key_range: Range, pub lsn: Lsn, } -impl std::fmt::Debug for ImageFileName { +impl std::fmt::Debug for ImageLayerName { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { use super::RangeDisplayDebug; - f.debug_struct("ImageFileName") + f.debug_struct("ImageLayerName") .field("key_range", &RangeDisplayDebug(&self.key_range)) .field("lsn", &self.lsn) .finish() } } -impl PartialOrd for ImageFileName { +impl PartialOrd for ImageLayerName { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } -impl Ord for ImageFileName { +impl Ord for ImageLayerName { fn cmp(&self, other: &Self) -> Ordering { let mut cmp = self.key_range.start.cmp(&other.key_range.start); if cmp != Ordering::Equal { @@ -164,7 +162,7 @@ impl Ord for ImageFileName { } } -impl ImageFileName { +impl ImageLayerName { pub fn lsn_as_range(&self) -> Range { // Saves from having to copypaste this all over PersistentLayerDesc::image_layer_lsn_range(self.lsn) @@ -172,16 +170,14 @@ impl ImageFileName { } /// -/// Represents the filename of an ImageLayer +/// Represents the part of the Key-LSN space covered by an ImageLayer /// /// ```text /// -__ /// ``` -impl ImageFileName { - /// - /// Parse a string as an image file name. Returns None if the filename does not - /// match the expected pattern. - /// +impl ImageLayerName { + /// Parse a string as then LayerName part of an image layer file name. Returns None if the + /// filename does not match the expected pattern. pub fn parse_str(fname: &str) -> Option { let mut parts = fname.split("__"); let mut key_parts = parts.next()?.split('-'); @@ -202,14 +198,14 @@ impl ImageFileName { let lsn = Lsn::from_hex(lsn_str).ok()?; - Some(ImageFileName { + Some(ImageLayerName { key_range: key_start..key_end, lsn, }) } } -impl fmt::Display for ImageFileName { +impl fmt::Display for ImageLayerName { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( f, @@ -220,21 +216,24 @@ impl fmt::Display for ImageFileName { ) } } + +/// LayerName is the logical identity of a layer within a LayerMap at a moment in time. The +/// LayerName is not a unique filename, as the same LayerName may have multiple physical incarnations +/// over time (e.g. across shard splits or compression). The physical filenames of layers in local +/// storage and object names in remote storage consist of the LayerName plus some extra qualifiers +/// that uniquely identify the physical incarnation of a layer (see [crate::tenant::remote_timeline_client::remote_layer_path]) +/// and [`crate::tenant::storage_layer::layer::local_layer_path`]) #[derive(Debug, PartialEq, Eq, Hash, Clone)] -pub enum LayerFileName { - Image(ImageFileName), - Delta(DeltaFileName), +pub enum LayerName { + Image(ImageLayerName), + Delta(DeltaLayerName), } -impl LayerFileName { - pub fn file_name(&self) -> String { - self.to_string() - } - +impl LayerName { /// Determines if this layer file is considered to be in future meaning we will discard these /// layers during timeline initialization from the given disk_consistent_lsn. 
pub(crate) fn is_in_future(&self, disk_consistent_lsn: Lsn) -> bool { - use LayerFileName::*; + use LayerName::*; match self { Image(file_name) if file_name.lsn > disk_consistent_lsn => true, Delta(file_name) if file_name.lsn_range.end > disk_consistent_lsn + 1 => true, @@ -243,7 +242,7 @@ impl LayerFileName { } pub(crate) fn kind(&self) -> &'static str { - use LayerFileName::*; + use LayerName::*; match self { Delta(_) => "delta", Image(_) => "image", @@ -251,7 +250,7 @@ impl LayerFileName { } } -impl fmt::Display for LayerFileName { +impl fmt::Display for LayerName { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { Self::Image(fname) => write!(f, "{fname}"), @@ -260,25 +259,25 @@ impl fmt::Display for LayerFileName { } } -impl From for LayerFileName { - fn from(fname: ImageFileName) -> Self { +impl From for LayerName { + fn from(fname: ImageLayerName) -> Self { Self::Image(fname) } } -impl From for LayerFileName { - fn from(fname: DeltaFileName) -> Self { +impl From for LayerName { + fn from(fname: DeltaLayerName) -> Self { Self::Delta(fname) } } -impl FromStr for LayerFileName { +impl FromStr for LayerName { type Err = String; /// Conversion from either a physical layer filename, or the string-ization of /// Self. When loading a physical layer filename, we drop any extra information /// not needed to build Self. fn from_str(value: &str) -> Result { - let gen_suffix_regex = Regex::new("^(?.+)-(?[0-9a-f]{8})$").unwrap(); + let gen_suffix_regex = Regex::new("^(?.+)(?-v1-[0-9a-f]{8})$").unwrap(); let file_name: Cow = match gen_suffix_regex.captures(value) { Some(captures) => captures .name("base") @@ -288,8 +287,8 @@ impl FromStr for LayerFileName { None => value.into(), }; - let delta = DeltaFileName::parse_str(&file_name); - let image = ImageFileName::parse_str(&file_name); + let delta = DeltaLayerName::parse_str(&file_name); + let image = ImageLayerName::parse_str(&file_name); let ok = match (delta, image) { (None, None) => { return Err(format!( @@ -304,7 +303,7 @@ impl FromStr for LayerFileName { } } -impl serde::Serialize for LayerFileName { +impl serde::Serialize for LayerName { fn serialize(&self, serializer: S) -> Result where S: serde::Serializer, @@ -316,19 +315,19 @@ impl serde::Serialize for LayerFileName { } } -impl<'de> serde::Deserialize<'de> for LayerFileName { +impl<'de> serde::Deserialize<'de> for LayerName { fn deserialize(deserializer: D) -> Result where D: serde::Deserializer<'de>, { - deserializer.deserialize_string(LayerFileNameVisitor) + deserializer.deserialize_string(LayerNameVisitor) } } -struct LayerFileNameVisitor; +struct LayerNameVisitor; -impl<'de> serde::de::Visitor<'de> for LayerFileNameVisitor { - type Value = LayerFileName; +impl<'de> serde::de::Visitor<'de> for LayerNameVisitor { + type Value = LayerName; fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { write!( @@ -349,16 +348,16 @@ mod test { use super::*; #[test] fn image_layer_parse() -> anyhow::Result<()> { - let expected = LayerFileName::Image(ImageFileName { + let expected = LayerName::Image(ImageLayerName { key_range: Key::from_i128(0) ..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(), lsn: Lsn::from_hex("00000000014FED58").unwrap(), }); - let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-00000001").map_err(|s| anyhow::anyhow!(s))?; + let parsed = 
LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-v1-00000001").map_err(|s| anyhow::anyhow!(s))?; assert_eq!(parsed, expected,); // Omitting generation suffix is valid - let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").map_err(|s| anyhow::anyhow!(s))?; + let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").map_err(|s| anyhow::anyhow!(s))?; assert_eq!(parsed, expected,); Ok(()) @@ -366,17 +365,17 @@ mod test { #[test] fn delta_layer_parse() -> anyhow::Result<()> { - let expected = LayerFileName::Delta(DeltaFileName { + let expected = LayerName::Delta(DeltaLayerName { key_range: Key::from_i128(0) ..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(), lsn_range: Lsn::from_hex("00000000014FED58").unwrap() ..Lsn::from_hex("000000000154C481").unwrap(), }); - let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-00000001").map_err(|s| anyhow::anyhow!(s))?; + let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-v1-00000001").map_err(|s| anyhow::anyhow!(s))?; assert_eq!(parsed, expected); // Omitting generation suffix is valid - let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").map_err(|s| anyhow::anyhow!(s))?; + let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").map_err(|s| anyhow::anyhow!(s))?; assert_eq!(parsed, expected); Ok(()) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d6d012c70c..7edb922069 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -60,7 +60,7 @@ use std::{ ops::ControlFlow, }; -use crate::tenant::storage_layer::layer::local_layer_path; +use crate::tenant::timeline::init::LocalLayerFileMetadata; use crate::tenant::{ layer_map::{LayerMap, SearchResult}, metadata::TimelineMetadata, @@ -75,7 +75,7 @@ use crate::{ disk_usage_eviction_task::finite_f32, tenant::storage_layer::{ AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer, - LayerAccessStatsReset, LayerFileName, ResidentLayer, ValueReconstructResult, + LayerAccessStatsReset, LayerName, ResidentLayer, ValueReconstructResult, ValueReconstructState, ValuesReconstructState, }, }; @@ -1905,7 +1905,7 @@ impl Timeline { #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))] pub(crate) async fn download_layer( &self, - layer_file_name: &LayerFileName, + layer_file_name: &LayerName, ) -> anyhow::Result> { let Some(layer) = self.find_layer(layer_file_name).await else { return Ok(None); @@ -1925,7 +1925,7 @@ impl Timeline { /// Returns `Ok(None)` in the case where the layer could not be found by its `layer_file_name`. 
pub(crate) async fn evict_layer( &self, - layer_file_name: &LayerFileName, + layer_file_name: &LayerName, ) -> anyhow::Result> { let _gate = self .gate @@ -2387,13 +2387,13 @@ impl Timeline { index_part: Option, ) -> anyhow::Result<()> { use init::{Decision::*, Discovered, DismissedLayer}; - use LayerFileName::*; + use LayerName::*; let mut guard = self.layers.write().await; let timer = self.metrics.load_layer_map_histo.start_timer(); - // Scan timeline directory and create ImageFileName and DeltaFilename + // Scan timeline directory and create ImageLayerName and DeltaFilename // structs representing all files on disk let timeline_path = self .conf @@ -2463,33 +2463,35 @@ impl Timeline { let mut needs_cleanup = Vec::new(); let mut total_physical_size = 0; - for (name, local_path, decision) in decided { + for (name, decision) in decided { let decision = match decision { Ok(UseRemote { local, remote }) => { // Remote is authoritative, but we may still choose to retain // the local file if the contents appear to match - if local.file_size() == remote.file_size() { + if local.metadata.file_size() == remote.file_size() { // Use the local file, but take the remote metadata so that we pick up // the correct generation. - UseLocal(remote) + UseLocal( + LocalLayerFileMetadata { + metadata: remote, + local_path: local.local_path + } + ) } else { - let local_path = local_path.as_ref().expect("Locally found layer must have path"); - init::cleanup_local_file_for_remote(local_path, &local, &remote)?; + init::cleanup_local_file_for_remote(&local, &remote)?; UseRemote { local, remote } } } Ok(decision) => decision, Err(DismissedLayer::Future { local }) => { - if local.is_some() { - let local_path = local_path.expect("Locally found layer must have path"); - init::cleanup_future_layer(&local_path, &name, disk_consistent_lsn)?; + if let Some(local) = local { + init::cleanup_future_layer(&local.local_path, &name, disk_consistent_lsn)?; } needs_cleanup.push(name); continue; } Err(DismissedLayer::LocalOnly(local)) => { - let local_path = local_path.expect("Locally found layer must have path"); - init::cleanup_local_only_file(&local_path, &name, &local)?; + init::cleanup_local_only_file(&name, &local)?; // this file never existed remotely, we will have to do rework continue; } @@ -2503,20 +2505,9 @@ impl Timeline { tracing::debug!(layer=%name, ?decision, "applied"); let layer = match decision { - UseLocal(m) => { - total_physical_size += m.file_size(); - - let local_path = local_path.unwrap_or_else(|| { - local_layer_path( - conf, - &this.tenant_shard_id, - &this.timeline_id, - &name, - &m.generation, - ) - }); - - Layer::for_resident(conf, &this, local_path, name, m).drop_eviction_guard() + UseLocal(local) => { + total_physical_size += local.metadata.file_size(); + Layer::for_resident(conf, &this, local.local_path, name, local.metadata).drop_eviction_guard() } Evicted(remote) | UseRemote { remote, .. 
} => { Layer::for_evicted(conf, &this, name, remote) @@ -2997,10 +2988,10 @@ impl Timeline { } } - async fn find_layer(&self, layer_name: &LayerFileName) -> Option { + async fn find_layer(&self, layer_name: &LayerName) -> Option { let guard = self.layers.read().await; for historic_layer in guard.layer_map().iter_historic_layers() { - let historic_layer_name = historic_layer.filename(); + let historic_layer_name = historic_layer.layer_name(); if layer_name == &historic_layer_name { return Some(guard.get_from_desc(&historic_layer)); } @@ -3030,7 +3021,7 @@ impl Timeline { let last_activity_ts = layer.access_stats().latest_activity_or_now(); HeatMapLayer::new( - layer.layer_desc().filename(), + layer.layer_desc().layer_name(), (&layer.metadata()).into(), last_activity_ts, ) @@ -3177,7 +3168,7 @@ impl Timeline { if let Some(open_layer) = &layers.open_layer { let start_lsn = open_layer.get_lsn_range().start; if cont_lsn > start_lsn { - //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.filename().display()); + //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.layer_name().display()); // Get all the data needed to reconstruct the page version from this layer. // But if we have an older cached page image, no need to go past that. let lsn_floor = max(cached_lsn + 1, start_lsn); @@ -3206,7 +3197,7 @@ impl Timeline { for frozen_layer in layers.frozen_layers.iter().rev() { let start_lsn = frozen_layer.get_lsn_range().start; if cont_lsn > start_lsn { - //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display()); + //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.layer_name().display()); let lsn_floor = max(cached_lsn + 1, start_lsn); let frozen_layer = frozen_layer.clone(); @@ -4731,7 +4722,7 @@ impl Timeline { if l.get_lsn_range().end > horizon_cutoff { debug!( "keeping {} because it's newer than horizon_cutoff {}", - l.filename(), + l.layer_name(), horizon_cutoff, ); result.layers_needed_by_cutoff += 1; @@ -4742,7 +4733,7 @@ impl Timeline { if l.get_lsn_range().end > pitr_cutoff { debug!( "keeping {} because it's newer than pitr_cutoff {}", - l.filename(), + l.layer_name(), pitr_cutoff, ); result.layers_needed_by_pitr += 1; @@ -4761,7 +4752,7 @@ impl Timeline { if &l.get_lsn_range().start <= retain_lsn { debug!( "keeping {} because it's still might be referenced by child branch forked at {} is_dropped: xx is_incremental: {}", - l.filename(), + l.layer_name(), retain_lsn, l.is_incremental(), ); @@ -4792,7 +4783,7 @@ impl Timeline { if !layers .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff)) { - debug!("keeping {} because it is the latest layer", l.filename()); + debug!("keeping {} because it is the latest layer", l.layer_name()); result.layers_not_updated += 1; continue 'outer; } @@ -4800,7 +4791,7 @@ impl Timeline { // We didn't find any reason to keep this file, so remove it. 
debug!( "garbage collecting {} is_dropped: xx is_incremental: {}", - l.filename(), + l.layer_name(), l.is_incremental(), ); layers_to_remove.push(l); diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 5c2b25da56..2641bf3d13 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -407,7 +407,7 @@ async fn remote_copy( let owned = crate::tenant::storage_layer::Layer::for_evicted( adoptee.conf, adoptee, - adopted.layer_desc().filename(), + adopted.layer_desc().layer_name(), metadata, ); diff --git a/pageserver/src/tenant/timeline/init.rs b/pageserver/src/tenant/timeline/init.rs index 9c33981807..66aa765015 100644 --- a/pageserver/src/tenant/timeline/init.rs +++ b/pageserver/src/tenant/timeline/init.rs @@ -6,7 +6,7 @@ use crate::{ self, index::{IndexPart, LayerFileMetadata}, }, - storage_layer::LayerFileName, + storage_layer::LayerName, Generation, }, METADATA_FILE_NAME, @@ -20,7 +20,7 @@ use utils::lsn::Lsn; /// Identified files in the timeline directory. pub(super) enum Discovered { /// The only one we care about - Layer(LayerFileName, Utf8PathBuf, u64), + Layer(LayerName, Utf8PathBuf, u64), /// Old ephmeral files from previous launches, should be removed Ephemeral(String), /// Old temporary timeline files, unsure what these really are, should be removed @@ -43,7 +43,7 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result { let file_size = direntry.metadata()?.len(); Discovered::Layer(file_name, direntry.path().to_owned(), file_size) @@ -72,6 +72,28 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result Self { + Self { + local_path, + metadata: LayerFileMetadata::new(file_size, generation, shard), + } + } +} + /// Decision on what to do with a layer file after considering its local and remote metadata. #[derive(Clone, Debug)] pub(super) enum Decision { @@ -80,11 +102,11 @@ pub(super) enum Decision { /// The layer is present locally, but local metadata does not match remote; we must /// delete it and treat it as evicted. UseRemote { - local: LayerFileMetadata, + local: LocalLayerFileMetadata, remote: LayerFileMetadata, }, /// The layer is present locally, and metadata matches. - UseLocal(LayerFileMetadata), + UseLocal(LocalLayerFileMetadata), } /// A layer needs to be left out of the layer map. @@ -92,39 +114,29 @@ pub(super) enum Decision { pub(super) enum DismissedLayer { /// The related layer is is in future compared to disk_consistent_lsn, it must not be loaded. Future { - /// The local metadata. `None` if the layer is only known through [`IndexPart`]. - local: Option, + /// `None` if the layer is only known through [`IndexPart`]. + local: Option, }, /// The layer only exists locally. /// /// In order to make crash safe updates to layer map, we must dismiss layers which are only /// found locally or not yet included in the remote `index_part.json`. - LocalOnly(LayerFileMetadata), + LocalOnly(LocalLayerFileMetadata), } /// Merges local discoveries and remote [`IndexPart`] to a collection of decisions. 
pub(super) fn reconcile( - discovered: Vec<(LayerFileName, Utf8PathBuf, u64)>, + discovered: Vec<(LayerName, Utf8PathBuf, u64)>, index_part: Option<&IndexPart>, disk_consistent_lsn: Lsn, generation: Generation, shard: ShardIndex, -) -> Vec<( - LayerFileName, - Option, - Result, -)> { +) -> Vec<(LayerName, Result)> { use Decision::*; - // name => (local_path, local_metadata, remote_metadata) - type Collected = HashMap< - LayerFileName, - ( - Option, - Option, - Option, - ), - >; + // name => (local_metadata, remote_metadata) + type Collected = + HashMap, Option)>; let mut discovered = discovered .into_iter() @@ -135,8 +147,9 @@ pub(super) fn reconcile( // it is not in IndexPart, in which case using our current generation makes sense // because it will be uploaded in this generation. ( - Some(local_path), - Some(LayerFileMetadata::new(file_size, generation, shard)), + Some(LocalLayerFileMetadata::new( + local_path, file_size, generation, shard, + )), None, ), ) @@ -152,20 +165,20 @@ pub(super) fn reconcile( .map(|(name, metadata)| (name, LayerFileMetadata::from(metadata))) .for_each(|(name, metadata)| { if let Some(existing) = discovered.get_mut(name) { - existing.2 = Some(metadata); + existing.1 = Some(metadata); } else { - discovered.insert(name.to_owned(), (None, None, Some(metadata))); + discovered.insert(name.to_owned(), (None, Some(metadata))); } }); discovered .into_iter() - .map(|(name, (local_path, local, remote))| { + .map(|(name, (local, remote))| { let decision = if name.is_in_future(disk_consistent_lsn) { Err(DismissedLayer::Future { local }) } else { match (local, remote) { - (Some(local), Some(remote)) if local != remote => { + (Some(local), Some(remote)) if local.metadata != remote => { Ok(UseRemote { local, remote }) } (Some(x), Some(_)) => Ok(UseLocal(x)), @@ -177,7 +190,7 @@ pub(super) fn reconcile( } }; - (name, local_path, decision) + (name, decision) }) .collect::>() } @@ -189,12 +202,12 @@ pub(super) fn cleanup(path: &Utf8Path, kind: &str) -> anyhow::Result<()> { } pub(super) fn cleanup_local_file_for_remote( - path: &Utf8Path, - local: &LayerFileMetadata, + local: &LocalLayerFileMetadata, remote: &LayerFileMetadata, ) -> anyhow::Result<()> { - let local_size = local.file_size(); + let local_size = local.metadata.file_size(); let remote_size = remote.file_size(); + let path = &local.local_path; let file_name = path.file_name().expect("must be file path"); tracing::warn!("removing local file {file_name:?} because it has unexpected length {local_size}; length in remote index is {remote_size}"); @@ -211,7 +224,7 @@ pub(super) fn cleanup_local_file_for_remote( pub(super) fn cleanup_future_layer( path: &Utf8Path, - name: &LayerFileName, + name: &LayerName, disk_consistent_lsn: Lsn, ) -> anyhow::Result<()> { // future image layers are allowed to be produced always for not yet flushed to disk @@ -223,12 +236,14 @@ pub(super) fn cleanup_future_layer( } pub(super) fn cleanup_local_only_file( - path: &Utf8Path, - name: &LayerFileName, - local: &LayerFileMetadata, + name: &LayerName, + local: &LocalLayerFileMetadata, ) -> anyhow::Result<()> { let kind = name.kind(); - tracing::info!("found local-only {kind} layer {name}, metadata {local:?}"); - std::fs::remove_file(path)?; + tracing::info!( + "found local-only {kind} layer {name}, metadata {:?}", + local.metadata + ); + std::fs::remove_file(&local.local_path)?; Ok(()) } diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 8e8d64e0c6..a72eb1b3bf 100644 --- 
a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -294,7 +294,7 @@ impl LayerFileManager { // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor. self.0 .get(&desc.key()) - .with_context(|| format!("get layer from desc: {}", desc.filename())) + .with_context(|| format!("get layer from desc: {}", desc.layer_name())) .expect("not found") .clone() } diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index 0bf4d1e599..7797117e0f 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -1,4 +1,4 @@ -use super::storage_layer::LayerFileName; +use super::storage_layer::LayerName; use super::storage_layer::ResidentLayer; use crate::tenant::metadata::TimelineMetadata; use crate::tenant::remote_timeline_client::index::IndexPart; @@ -45,7 +45,7 @@ pub(crate) struct UploadQueueInitialized { /// All layer files stored in the remote storage, taking into account all /// in-progress and queued operations - pub(crate) latest_files: HashMap, + pub(crate) latest_files: HashMap, /// How many file uploads or deletions been scheduled, since the /// last (scheduling of) metadata index upload? @@ -89,7 +89,7 @@ pub(crate) struct UploadQueueInitialized { /// Putting this behind a testing feature to catch problems in tests, but assuming we could have a /// bug causing leaks, then it's better to not leave this enabled for production builds. #[cfg(feature = "testing")] - pub(crate) dangling_files: HashMap, + pub(crate) dangling_files: HashMap, /// Set to true when we have inserted the `UploadOp::Shutdown` into the `inprogress_tasks`. pub(crate) shutting_down: bool, @@ -281,7 +281,7 @@ pub(crate) struct UploadTask { /// for timeline deletion, which skips this queue and goes directly to DeletionQueue. #[derive(Debug)] pub(crate) struct Delete { - pub(crate) layers: Vec<(LayerFileName, LayerFileMetadata)>, + pub(crate) layers: Vec<(LayerName, LayerFileMetadata)>, } #[derive(Debug)] diff --git a/s3_scrubber/src/checks.rs b/s3_scrubber/src/checks.rs index 7c0f699958..68133fc0a9 100644 --- a/s3_scrubber/src/checks.rs +++ b/s3_scrubber/src/checks.rs @@ -13,7 +13,7 @@ use crate::metadata_stream::stream_listing; use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId}; use futures_util::StreamExt; use pageserver::tenant::remote_timeline_client::parse_remote_index_path; -use pageserver::tenant::storage_layer::LayerFileName; +use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::IndexPart; use remote_storage::RemotePath; @@ -110,7 +110,7 @@ pub(crate) fn branch_cleanup_and_check_errors( for (layer, metadata) in index_part.layer_metadata { if metadata.file_size == 0 { result.errors.push(format!( - "index_part.json contains a layer {} that has 0 size in its layer metadata", layer.file_name(), + "index_part.json contains a layer {} that has 0 size in its layer metadata", layer, )) } @@ -121,7 +121,7 @@ pub(crate) fn branch_cleanup_and_check_errors( // layer we think is missing. result.errors.push(format!( "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage", - layer.file_name(), + layer, metadata.generation.get_suffix(), metadata.shard )) @@ -170,8 +170,7 @@ pub(crate) struct LayerRef { /// the tenant to query whether an object exists. 
#[derive(Default)] pub(crate) struct TenantObjectListing { - shard_timelines: - HashMap<(ShardIndex, TimelineId), HashMap<(LayerFileName, Generation), LayerRef>>, + shard_timelines: HashMap<(ShardIndex, TimelineId), HashMap<(LayerName, Generation), LayerRef>>, } impl TenantObjectListing { @@ -180,7 +179,7 @@ impl TenantObjectListing { pub(crate) fn push( &mut self, ttid: TenantShardTimelineId, - layers: HashSet<(LayerFileName, Generation)>, + layers: HashSet<(LayerName, Generation)>, ) { let shard_index = ShardIndex::new( ttid.tenant_shard_id.shard_number, @@ -208,7 +207,7 @@ impl TenantObjectListing { pub(crate) fn check_ref( &mut self, timeline_id: TimelineId, - layer_file: &LayerFileName, + layer_file: &LayerName, metadata: &IndexLayerMetadata, ) -> bool { let Some(shard_tl) = self.shard_timelines.get_mut(&(metadata.shard, timeline_id)) else { @@ -224,7 +223,7 @@ impl TenantObjectListing { true } - pub(crate) fn get_orphans(&self) -> Vec<(ShardIndex, TimelineId, LayerFileName, Generation)> { + pub(crate) fn get_orphans(&self) -> Vec<(ShardIndex, TimelineId, LayerName, Generation)> { let mut result = Vec::new(); for ((shard_index, timeline_id), layers) in &self.shard_timelines { for ((layer_file, generation), layer_ref) in layers { @@ -249,23 +248,23 @@ pub(crate) enum BlobDataParseResult { Parsed { index_part: IndexPart, index_part_generation: Generation, - s3_layers: HashSet<(LayerFileName, Generation)>, + s3_layers: HashSet<(LayerName, Generation)>, }, /// The remains of a deleted Timeline (i.e. an initdb archive only) Relic, Incorrect(Vec), } -fn parse_layer_object_name(name: &str) -> Result<(LayerFileName, Generation), String> { +fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generation), String> { match name.rsplit_once('-') { // FIXME: this is gross, just use a regex? Some((layer_filename, gen)) if gen.len() == 8 => { - let layer = layer_filename.parse::()?; + let layer = layer_filename.parse::()?; let gen = Generation::parse_suffix(gen).ok_or("Malformed generation suffix".to_string())?; Ok((layer, gen)) } - _ => Ok((name.parse::()?, Generation::none())), + _ => Ok((name.parse::()?, Generation::none())), } } diff --git a/s3_scrubber/src/tenant_snapshot.rs b/s3_scrubber/src/tenant_snapshot.rs index 4eccad381b..2c93a8490a 100644 --- a/s3_scrubber/src/tenant_snapshot.rs +++ b/s3_scrubber/src/tenant_snapshot.rs @@ -12,7 +12,7 @@ use aws_sdk_s3::Client; use camino::Utf8PathBuf; use futures::{StreamExt, TryStreamExt}; use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata; -use pageserver::tenant::storage_layer::LayerFileName; +use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::IndexPart; use pageserver_api::shard::TenantShardId; use utils::generation::Generation; @@ -48,16 +48,16 @@ impl SnapshotDownloader { async fn download_layer( &self, ttid: TenantShardTimelineId, - layer_name: LayerFileName, + layer_name: LayerName, layer_metadata: IndexLayerMetadata, - ) -> anyhow::Result<(LayerFileName, IndexLayerMetadata)> { + ) -> anyhow::Result<(LayerName, IndexLayerMetadata)> { // Note this is local as in a local copy of S3 data, not local as in the pageserver's local format. 
They use // different layer names (remote-style has the generation suffix) let local_path = self.output_path.join(format!( "{}/timelines/{}/{}{}", ttid.tenant_shard_id, ttid.timeline_id, - layer_name.file_name(), + layer_name, layer_metadata.generation.get_suffix() )); @@ -76,7 +76,7 @@ impl SnapshotDownloader { let remote_layer_path = format!( "{}{}{}", timeline_root.prefix_in_bucket, - layer_name.file_name(), + layer_name, layer_metadata.generation.get_suffix() ); @@ -110,7 +110,7 @@ impl SnapshotDownloader { async fn download_layers( &self, ttid: TenantShardTimelineId, - layers: Vec<(LayerFileName, IndexLayerMetadata)>, + layers: Vec<(LayerName, IndexLayerMetadata)>, ) -> anyhow::Result<()> { let layer_count = layers.len(); tracing::info!("Downloading {} layers for timeline {ttid}...", layer_count); @@ -138,7 +138,7 @@ impl SnapshotDownloader { tracing::info!( "[{download_count}/{layer_count}] OK: {} bytes {ttid} {}", layer_metadata.file_size, - layer_name.file_name() + layer_name ); } Err(e) => { @@ -163,7 +163,7 @@ impl SnapshotDownloader { index_part_generation: Generation, ancestor_layers: &mut HashMap< TenantShardTimelineId, - HashMap, + HashMap, >, ) -> anyhow::Result<()> { let index_bytes = serde_json::to_string(&index_part).unwrap(); @@ -234,7 +234,7 @@ impl SnapshotDownloader { // happen if this tenant has been split at some point) let mut ancestor_layers: HashMap< TenantShardTimelineId, - HashMap, + HashMap, > = Default::default(); for shard in shards.into_iter().filter(|s| s.shard_count == shard_count) { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index f618c508bc..390b94c2ea 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -54,7 +54,7 @@ from fixtures.pageserver.allowed_errors import ( DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS, ) from fixtures.pageserver.http import PageserverHttpClient -from fixtures.pageserver.types import IndexPartDump, LayerFileName, parse_layer_file_name +from fixtures.pageserver.types import IndexPartDump, LayerName, parse_layer_file_name from fixtures.pageserver.utils import ( wait_for_last_record_lsn, wait_for_upload, @@ -2664,7 +2664,7 @@ class NeonPageserver(PgProtocol, LogUtils): ) def layer_exists( - self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: LayerFileName + self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: LayerName ) -> bool: layers = self.list_layers(tenant_id, timeline_id) return layer_name in [parse_layer_file_name(p.name) for p in layers] diff --git a/test_runner/fixtures/pageserver/types.py b/test_runner/fixtures/pageserver/types.py index fd018cb778..1fb618f445 100644 --- a/test_runner/fixtures/pageserver/types.py +++ b/test_runner/fixtures/pageserver/types.py @@ -12,7 +12,7 @@ class IndexLayerMetadata: @dataclass(frozen=True) -class ImageLayerFileName: +class ImageLayerName: lsn: Lsn key_start: Key key_end: Key @@ -26,7 +26,7 @@ class ImageLayerFileName: @dataclass(frozen=True) -class DeltaLayerFileName: +class DeltaLayerName: lsn_start: Lsn lsn_end: Lsn key_start: Key @@ -41,14 +41,16 @@ class DeltaLayerFileName: return ret -LayerFileName = Union[ImageLayerFileName, DeltaLayerFileName] +LayerName = Union[ImageLayerName, DeltaLayerName] class InvalidFileName(Exception): pass -IMAGE_LAYER_FILE_NAME = re.compile("^([A-F0-9]{36})-([A-F0-9]{36})__([A-F0-9]{16})(-[a-f0-9]{8})?$") +IMAGE_LAYER_FILE_NAME = re.compile( + "^([A-F0-9]{36})-([A-F0-9]{36})__([A-F0-9]{16})(-v1-[a-f0-9]{8})?$" +) def 
parse_image_layer(f_name: str) -> Tuple[int, int, int]: @@ -62,7 +64,7 @@ def parse_image_layer(f_name: str) -> Tuple[int, int, int]: DELTA_LAYER_FILE_NAME = re.compile( - "^([A-F0-9]{36})-([A-F0-9]{36})__([A-F0-9]{16})-([A-F0-9]{16})(-[a-f0-9]{8})?$" + "^([A-F0-9]{36})-([A-F0-9]{36})__([A-F0-9]{16})-([A-F0-9]{16})(-v1-[a-f0-9]{8})?$" ) @@ -80,16 +82,16 @@ def parse_delta_layer(f_name: str) -> Tuple[int, int, int, int]: ) -def parse_layer_file_name(file_name: str) -> LayerFileName: +def parse_layer_file_name(file_name: str) -> LayerName: try: key_start, key_end, lsn = parse_image_layer(file_name) - return ImageLayerFileName(lsn=Lsn(lsn), key_start=Key(key_start), key_end=Key(key_end)) + return ImageLayerName(lsn=Lsn(lsn), key_start=Key(key_start), key_end=Key(key_end)) except InvalidFileName: pass try: key_start, key_end, lsn_start, lsn_end = parse_delta_layer(file_name) - return DeltaLayerFileName( + return DeltaLayerName( lsn_start=Lsn(lsn_start), lsn_end=Lsn(lsn_end), key_start=Key(key_start), @@ -101,18 +103,15 @@ def parse_layer_file_name(file_name: str) -> LayerFileName: raise InvalidFileName("neither image nor delta layer") -def is_future_layer(layer_file_name: LayerFileName, disk_consistent_lsn: Lsn): +def is_future_layer(layer_file_name: LayerName, disk_consistent_lsn: Lsn): """ Determines if this layer file is considered to be in future meaning we will discard these layers during timeline initialization from the given disk_consistent_lsn. """ - if ( - isinstance(layer_file_name, ImageLayerFileName) - and layer_file_name.lsn > disk_consistent_lsn - ): + if isinstance(layer_file_name, ImageLayerName) and layer_file_name.lsn > disk_consistent_lsn: return True elif ( - isinstance(layer_file_name, DeltaLayerFileName) + isinstance(layer_file_name, DeltaLayerName) and layer_file_name.lsn_end > disk_consistent_lsn + 1 ): return True @@ -122,7 +121,7 @@ def is_future_layer(layer_file_name: LayerFileName, disk_consistent_lsn: Lsn): @dataclass class IndexPartDump: - layer_metadata: Dict[LayerFileName, IndexLayerMetadata] + layer_metadata: Dict[LayerName, IndexLayerMetadata] disk_consistent_lsn: Lsn @classmethod diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index f311a8bf2c..cc34fd83e9 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -3,8 +3,8 @@ import time from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver from fixtures.pageserver.types import ( - DeltaLayerFileName, - ImageLayerFileName, + DeltaLayerName, + ImageLayerName, is_future_layer, ) from fixtures.pageserver.utils import ( @@ -81,7 +81,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): current = get_index_part() assert len(set(current.layer_metadata.keys())) == 1 layer_file_name = list(current.layer_metadata.keys())[0] - assert isinstance(layer_file_name, DeltaLayerFileName) + assert isinstance(layer_file_name, DeltaLayerName) assert layer_file_name.is_l0(), f"{layer_file_name}" log.info("force image layer creation in the future by writing some data into in-memory layer") @@ -146,7 +146,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): future_layers = get_future_layers() assert len(future_layers) == 1 future_layer = future_layers[0] - assert isinstance(future_layer, ImageLayerFileName) + assert isinstance(future_layer, ImageLayerName) assert future_layer.lsn == last_record_lsn log.info( f"got layer from the future: 
lsn={future_layer.lsn} disk_consistent_lsn={ip.disk_consistent_lsn} last_record_lsn={last_record_lsn}" diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index adcf7de8d4..58eaf404d3 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -729,8 +729,8 @@ def test_upgrade_generationless_local_file_paths( for filename in os.listdir(timeline_dir): path = os.path.join(timeline_dir, filename) log.info(f"Found file {path}") - if path.endswith("-00000001"): - new_path = path[:-9] + if path.endswith("-v1-00000001"): + new_path = path[:-12] os.rename(path, new_path) log.info(f"Renamed {path} -> {new_path}") files_renamed += 1 From b06eec41fa5971899fd15ed4b643889863c616c7 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 8 May 2024 20:58:35 +0300 Subject: [PATCH 0736/1571] Ignore page header when comparing VM pages in test_vm_bits.py (#7499) ## Problem See #6714, #6967 ## Summary of changes Completely ignore page header when comparing VM pages. ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist Co-authored-by: Konstantin Knizhnik --- test_runner/regress/test_vm_bits.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py index 06f2a8befd..b549db1af6 100644 --- a/test_runner/regress/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -168,15 +168,16 @@ def test_vm_bit_clear_on_heap_lock(neon_env_builder: NeonEnvBuilder): # The VM page in shared buffer cache, and the same page as reconstructed # by the pageserver, should be equal. # - # Ignore the LSN on the page though (first 8 bytes). If the dirty - # VM page is flushed from the cache for some reason, it gets WAL-logged, - # which changes the LSN on the page. + # Ignore page header (24 bytes) of visibility map. + # If the dirty VM page is flushed from the cache for some reason, + # it gets WAL-logged, which changes the LSN on the page. + # Also in neon SMGR we can replace empty heap page with zero (uninitialized) heap page. cur.execute("select get_raw_page( 'vmtest_lock', 'vm', 0 )") - vm_page_in_cache = (cur.fetchall()[0][0])[8:100].hex() + vm_page_in_cache = (cur.fetchall()[0][0])[24:100].hex() cur.execute( "select get_raw_page_at_lsn( 'vmtest_lock', 'vm', 0, pg_current_wal_insert_lsn(), NULL )" ) - vm_page_at_pageserver = (cur.fetchall()[0][0])[8:100].hex() + vm_page_at_pageserver = (cur.fetchall()[0][0])[24:100].hex() assert vm_page_at_pageserver == vm_page_in_cache From d5399b729b3ecd3d9d38d8e61d3511fc4bf321b5 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 8 May 2024 19:29:16 +0100 Subject: [PATCH 0737/1571] pageserver: fix division by zero in layer counting metric (#7662) For aux file keys (v1 or v2) the vectored read path does not return an error when they're missing. Instead they are omitted from the resulting btree (this is a requirement, not a bug). Skip updating the metric in these cases to avoid infinite results. 
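The failure mode described here is easy to see in isolation: in Rust, floating-point division by zero does not panic but produces infinity (or NaN for 0/0), and that value would then be recorded into the metrics histogram. A standalone sketch, not part of the patch and using illustrative values only:

```rust
// Illustration only: what happens when the per-key average is computed
// against an empty result set.
fn main() {
    let layers_visited: u32 = 7;
    let results_len: usize = 0; // e.g. every requested key was an absent aux-file key

    let avg = layers_visited as f64 / results_len as f64;
    assert!(avg.is_infinite()); // 7.0 / 0.0 == +inf

    let nan = 0u32 as f64 / 0usize as f64;
    assert!(nan.is_nan()); // 0.0 / 0.0 == NaN

    // The fix in the diff below simply skips the histogram observation when
    // the result set is empty, so neither value is ever observed.
}
```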
--- pageserver/src/tenant/timeline.rs | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 7edb922069..5983529a44 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1220,11 +1220,17 @@ impl Timeline { } reconstruct_timer.stop_and_record(); - // Note that this is an approximation. Tracking the exact number of layers visited - // per key requires virtually unbounded memory usage and is inefficient - // (i.e. segment tree tracking each range queried from a layer) - crate::metrics::VEC_READ_NUM_LAYERS_VISITED - .observe(layers_visited as f64 / results.len() as f64); + // For aux file keys (v1 or v2) the vectored read path does not return an error + // when they're missing. Instead they are omitted from the resulting btree + // (this is a requirement, not a bug). Skip updating the metric in these cases + // to avoid infinite results. + if !results.is_empty() { + // Note that this is an approximation. Tracking the exact number of layers visited + // per key requires virtually unbounded memory usage and is inefficient + // (i.e. segment tree tracking each range queried from a layer) + crate::metrics::VEC_READ_NUM_LAYERS_VISITED + .observe(layers_visited as f64 / results.len() as f64); + } Ok(results) } From ab10523cc1d59cd65d88181645b149d4adc23c5e Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 9 May 2024 10:58:38 +0200 Subject: [PATCH 0738/1571] remote_storage: AWS_PROFILE with endpoint overrides in ~/.aws/config (updates AWS SDKs) (#7664) Before this PR, using the AWS SDK profile feature for running against minio didn't work because * our SDK versions were too old and didn't include https://github.com/awslabs/aws-sdk-rust/issues/1060 and * we didn't massage the s3 client config builder correctly. This PR * udpates all the AWS SDKs we use to, respectively, the latest version I could find on crates.io (Is there a better process?) * changes the way remote_storage constructs the S3 client, and * documents how to run the test suite against real S3 & local minio. Regarding the changes to `remote_storage`: if one reads the SDK docs, it is clear that the recommended way is to use `aws_config::from_env`, then customize. What we were doing instead is to use the `aws_sdk_s3` builder directly. To get the `local-minio` in the added docs working, I needed to update both the SDKs and make the changes to the `remote_storage`. See the commit history in this PR for details. 
Refs: * byproduct: https://github.com/smithy-lang/smithy-rs/pull/3633 * follow-up on deprecation: https://github.com/neondatabase/neon/issues/7665 * follow-up for scrubber S3 setup: https://github.com/neondatabase/neon/issues/7667 --- Cargo.lock | 88 +++++++++------ Cargo.toml | 14 +-- libs/remote_storage/src/s3_bucket.rs | 92 +++++++++------ s3_scrubber/src/lib.rs | 5 +- test_runner/README.md | 160 +++++++++++++++++++++++++++ workspace_hack/Cargo.toml | 2 +- 6 files changed, 285 insertions(+), 76 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9bff5e1eff..6ce7180d67 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -25,9 +25,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "ahash" -version = "0.8.9" +version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d713b3834d76b85304d4d525563c1276e2e30dc97cc67bfb4585a4a29fc2c89f" +checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" dependencies = [ "cfg-if", "const-random", @@ -284,9 +284,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "aws-config" -version = "1.1.4" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b30c39ebe61f75d1b3785362b1586b41991873c9ab3e317a9181c246fb71d82" +checksum = "baaa0be6ee7d90b775ae6ccb6d2ba182b91219ec2001f92338773a094246af1d" dependencies = [ "aws-credential-types", "aws-runtime", @@ -309,14 +309,15 @@ dependencies = [ "time", "tokio", "tracing", + "url", "zeroize", ] [[package]] name = "aws-credential-types" -version = "1.1.8" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa8587ae17c8e967e4b05a62d495be2fb7701bec52a97f7acfe8a29f938384c8" +checksum = "e16838e6c9e12125face1c1eff1343c75e3ff540de98ff7ebd61874a89bcfeb9" dependencies = [ "aws-smithy-async", "aws-smithy-runtime-api", @@ -326,9 +327,9 @@ dependencies = [ [[package]] name = "aws-runtime" -version = "1.1.8" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b13dc54b4b49f8288532334bba8f87386a40571c47c37b1304979b556dc613c8" +checksum = "785da4a15e7b166b505fd577e4560c7a7cd8fbdf842eb1336cbcbf8944ce56f1" dependencies = [ "aws-credential-types", "aws-sigv4", @@ -373,10 +374,11 @@ dependencies = [ [[package]] name = "aws-sdk-s3" -version = "1.14.0" +version = "1.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "951f7730f51a2155c711c85c79f337fbc02a577fa99d2a0a8059acfce5392113" +checksum = "7bc5ce518d4b8d16e0408de7bdf1b3097cec61a7daa979750a208f8d9934386d" dependencies = [ + "ahash", "aws-credential-types", "aws-runtime", "aws-sigv4", @@ -391,20 +393,25 @@ dependencies = [ "aws-smithy-xml", "aws-types", "bytes", + "fastrand 2.0.0", + "hex", + "hmac", "http 0.2.9", "http-body 0.4.5", + "lru", "once_cell", "percent-encoding", "regex-lite", + "sha2", "tracing", "url", ] [[package]] name = "aws-sdk-sso" -version = "1.12.0" +version = "1.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f486420a66caad72635bc2ce0ff6581646e0d32df02aa39dc983bfe794955a5b" +checksum = "ca3d6c4cba4e009391b72b0fcf12aff04ea3c9c3aa2ecaafa330326a8bd7e601" dependencies = [ "aws-credential-types", "aws-runtime", @@ -424,9 +431,9 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.12.0" +version = "1.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"39ddccf01d82fce9b4a15c8ae8608211ee7db8ed13a70b514bbfe41df3d24841" +checksum = "73400dc239d14f63d932f4ca7b55af5e9ef1f857f7d70655249ccc287adb2570" dependencies = [ "aws-credential-types", "aws-runtime", @@ -446,9 +453,9 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.12.0" +version = "1.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a591f8c7e6a621a501b2b5d2e88e1697fcb6274264523a6ad4d5959889a41ce" +checksum = "10f8858308af76fba3e5ffcf1bb56af5471574d2bdfaf0159470c25bc2f760e5" dependencies = [ "aws-credential-types", "aws-runtime", @@ -469,9 +476,9 @@ dependencies = [ [[package]] name = "aws-sigv4" -version = "1.2.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11d6f29688a4be9895c0ba8bef861ad0c0dac5c15e9618b9b7a6c233990fc263" +checksum = "58b56f1cbe6fd4d0c2573df72868f20ab1c125ca9c9dbce17927a463433a2e57" dependencies = [ "aws-credential-types", "aws-smithy-eventstream", @@ -498,9 +505,9 @@ dependencies = [ [[package]] name = "aws-smithy-async" -version = "1.1.8" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d26ea8fa03025b2face2b3038a63525a10891e3d8829901d502e5384a0d8cd46" +checksum = "62220bc6e97f946ddd51b5f1361f78996e704677afc518a4ff66b7a72ea1378c" dependencies = [ "futures-util", "pin-project-lite", @@ -509,9 +516,9 @@ dependencies = [ [[package]] name = "aws-smithy-checksums" -version = "0.60.4" +version = "0.60.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be2acd1b9c6ae5859999250ed5a62423aedc5cf69045b844432de15fa2f31f2b" +checksum = "83fa43bc04a6b2441968faeab56e68da3812f978a670a5db32accbdcafddd12f" dependencies = [ "aws-smithy-http", "aws-smithy-types", @@ -541,9 +548,9 @@ dependencies = [ [[package]] name = "aws-smithy-http" -version = "0.60.7" +version = "0.60.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f10fa66956f01540051b0aa7ad54574640f748f9839e843442d99b970d3aff9" +checksum = "4a7de001a1b9a25601016d8057ea16e31a45fdca3751304c8edf4ad72e706c08" dependencies = [ "aws-smithy-eventstream", "aws-smithy-runtime-api", @@ -581,9 +588,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.1.8" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec81002d883e5a7fd2bb063d6fb51c4999eb55d404f4fff3dd878bf4733b9f01" +checksum = "c9ac79e9f3a4d576f3cd4a470a0275b138d9e7b11b1cd514a6858ae0a79dd5bb" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -594,6 +601,7 @@ dependencies = [ "h2 0.3.26", "http 0.2.9", "http-body 0.4.5", + "http-body 1.0.0", "hyper 0.14.26", "hyper-rustls 0.24.0", "once_cell", @@ -606,9 +614,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime-api" -version = "1.2.0" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9acb931e0adaf5132de878f1398d83f8677f90ba70f01f65ff87f6d7244be1c5" +checksum = "04ec42c2f5c0e7796a2848dde4d9f3bf8ce12ccbb3d5aa40c52fa0cdd61a1c47" dependencies = [ "aws-smithy-async", "aws-smithy-types", @@ -623,16 +631,19 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.1.8" +version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abe14dceea1e70101d38fbf2a99e6a34159477c0fb95e68e05c66bd7ae4c3729" +checksum = "baf98d97bba6ddaba180f1b1147e202d8fe04940403a95a3f826c790f931bbd1" dependencies = [ "base64-simd", "bytes", "bytes-utils", "futures-core", "http 0.2.9", 
+ "http 1.1.0", "http-body 0.4.5", + "http-body 1.0.0", + "http-body-util", "itoa", "num-integer", "pin-project-lite", @@ -646,18 +657,18 @@ dependencies = [ [[package]] name = "aws-smithy-xml" -version = "0.60.7" +version = "0.60.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "872c68cf019c0e4afc5de7753c4f7288ce4b71663212771bf5e4542eb9346ca9" +checksum = "d123fbc2a4adc3c301652ba8e149bf4bc1d1725affb9784eb20c953ace06bf55" dependencies = [ "xmlparser", ] [[package]] name = "aws-types" -version = "1.1.8" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0dbf2f3da841a8930f159163175cf6a3d16ddde517c1b0fba7aa776822800f40" +checksum = "5a43b56df2c529fe44cb4d92bd64d0479883fb9608ff62daede4df5405381814" dependencies = [ "aws-credential-types", "aws-smithy-async", @@ -2935,6 +2946,15 @@ version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" +[[package]] +name = "lru" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3262e75e648fce39813cb56ac41f3c3e3f65217ebf3844d818d1f9398cfb0dc" +dependencies = [ + "hashbrown 0.14.0", +] + [[package]] name = "match_cfg" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 1ddadd2f3c..17f30a1327 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -52,14 +52,14 @@ azure_storage_blobs = "0.19" flate2 = "1.0.26" async-stream = "0.3" async-trait = "0.1" -aws-config = { version = "1.1.4", default-features = false, features=["rustls"] } -aws-sdk-s3 = "1.14" +aws-config = { version = "1.3", default-features = false, features=["rustls"] } +aws-sdk-s3 = "1.26" aws-sdk-iam = "1.15.0" -aws-smithy-async = { version = "1.1.4", default-features = false, features=["rt-tokio"] } -aws-smithy-types = "1.1.4" -aws-credential-types = "1.1.4" -aws-sigv4 = { version = "1.2.0", features = ["sign-http"] } -aws-types = "1.1.7" +aws-smithy-async = { version = "1.2.1", default-features = false, features=["rt-tokio"] } +aws-smithy-types = "1.1.9" +aws-credential-types = "1.2.0" +aws-sigv4 = { version = "1.2.1", features = ["sign-http"] } +aws-types = "1.2.0" axum = { version = "0.6.20", features = ["ws"] } base64 = "0.13.0" bincode = "1.3" diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index c0b89cee2a..c3d6c75e20 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -27,7 +27,7 @@ use aws_config::{ }; use aws_credential_types::provider::SharedCredentialsProvider; use aws_sdk_s3::{ - config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep}, + config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep}, error::SdkError, operation::get_object::GetObjectError, types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass}, @@ -75,13 +75,13 @@ struct GetObjectRequest { } impl S3Bucket { /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided. 
- pub fn new(aws_config: &S3Config, timeout: Duration) -> anyhow::Result { + pub fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result { tracing::debug!( "Creating s3 remote storage for S3 bucket {}", - aws_config.bucket_name + remote_storage_config.bucket_name ); - let region = Some(Region::new(aws_config.bucket_region.clone())); + let region = Some(Region::new(remote_storage_config.bucket_region.clone())); let provider_conf = ProviderConfig::without_region().with_region(region.clone()); @@ -113,6 +113,38 @@ impl S3Bucket { // AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off let sleep_impl: Arc = Arc::new(TokioSleep::new()); + let sdk_config_loader: aws_config::ConfigLoader = aws_config::defaults( + #[allow(deprecated)] /* TODO: https://github.com/neondatabase/neon/issues/7665 */ + BehaviorVersion::v2023_11_09(), + ) + .region(region) + .identity_cache(IdentityCache::lazy().build()) + .credentials_provider(SharedCredentialsProvider::new(credentials_provider)) + .sleep_impl(SharedAsyncSleep::from(sleep_impl)); + + let sdk_config: aws_config::SdkConfig = std::thread::scope(|s| { + s.spawn(|| { + // TODO: make this function async. + tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap() + .block_on(sdk_config_loader.load()) + }) + .join() + .unwrap() + }); + + let mut s3_config_builder = aws_sdk_s3::config::Builder::from(&sdk_config); + + // Technically, the `remote_storage_config.endpoint` field only applies to S3 interactions. + // (In case we ever re-use the `sdk_config` for more than just the S3 client in the future) + if let Some(custom_endpoint) = remote_storage_config.endpoint.clone() { + s3_config_builder = s3_config_builder + .endpoint_url(custom_endpoint) + .force_path_style(true); + } + // We do our own retries (see [`backoff::retry`]). However, for the AWS SDK to enable rate limiting in response to throttling // responses (e.g. 429 on too many ListObjectsv2 requests), we must provide a retry config. We set it to use at most one // attempt, and enable 'Adaptive' mode, which causes rate limiting to be enabled. @@ -120,42 +152,36 @@ impl S3Bucket { retry_config .set_max_attempts(Some(1)) .set_mode(Some(RetryMode::Adaptive)); + s3_config_builder = s3_config_builder.retry_config(retry_config.build()); - let mut config_builder = Builder::default() - .behavior_version(BehaviorVersion::v2023_11_09()) - .region(region) - .identity_cache(IdentityCache::lazy().build()) - .credentials_provider(SharedCredentialsProvider::new(credentials_provider)) - .retry_config(retry_config.build()) - .sleep_impl(SharedAsyncSleep::from(sleep_impl)); + let s3_config = s3_config_builder.build(); + let client = aws_sdk_s3::Client::from_conf(s3_config); - if let Some(custom_endpoint) = aws_config.endpoint.clone() { - config_builder = config_builder - .endpoint_url(custom_endpoint) - .force_path_style(true); - } + let prefix_in_bucket = remote_storage_config + .prefix_in_bucket + .as_deref() + .map(|prefix| { + let mut prefix = prefix; + while prefix.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { + prefix = &prefix[1..] + } - let client = Client::from_conf(config_builder.build()); + let mut prefix = prefix.to_string(); + while prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { + prefix.pop(); + } + prefix + }); - let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| { - let mut prefix = prefix; - while prefix.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { - prefix = &prefix[1..] 
- } - - let mut prefix = prefix.to_string(); - while prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { - prefix.pop(); - } - prefix - }); Ok(Self { client, - bucket_name: aws_config.bucket_name.clone(), - max_keys_per_list_response: aws_config.max_keys_per_list_response, + bucket_name: remote_storage_config.bucket_name.clone(), + max_keys_per_list_response: remote_storage_config.max_keys_per_list_response, prefix_in_bucket, - concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()), - upload_storage_class: aws_config.upload_storage_class.clone(), + concurrency_limiter: ConcurrencyLimiter::new( + remote_storage_config.concurrency_limit.get(), + ), + upload_storage_class: remote_storage_config.upload_storage_class.clone(), timeout, }) } diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs index e976e66748..7966fb6a88 100644 --- a/s3_scrubber/src/lib.rs +++ b/s3_scrubber/src/lib.rs @@ -312,7 +312,10 @@ pub fn init_s3_client(account_id: Option, bucket_region: Region) -> Clie let sleep_impl: Arc = Arc::new(TokioSleep::new()); let mut builder = Config::builder() - .behavior_version(BehaviorVersion::v2023_11_09()) + .behavior_version( + #[allow(deprecated)] /* TODO: https://github.com/neondatabase/neon/issues/7665 */ + BehaviorVersion::v2023_11_09(), + ) .region(bucket_region) .retry_config(RetryConfig::adaptive().with_max_attempts(3)) .sleep_impl(SharedAsyncSleep::from(sleep_impl)) diff --git a/test_runner/README.md b/test_runner/README.md index 051897744a..fd68cfff79 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -92,6 +92,166 @@ Exit after the first test failure: `./scripts/pytest -x ...` (there are many more pytest options; run `pytest -h` to see them.) +#### Running Python tests against real S3 or S3-compatible services + +Neon's `libs/remote_storage` supports multiple implementations of remote storage. +At the time of writing, that is +```rust +pub enum RemoteStorageKind { + /// Storage based on local file system. + /// Specify a root folder to place all stored files into. + LocalFs(Utf8PathBuf), + /// AWS S3 based storage, storing all files in the S3 bucket + /// specified by the config + AwsS3(S3Config), + /// Azure Blob based storage, storing all files in the container + /// specified by the config + AzureContainer(AzureConfig), +} +``` + +The test suite has a Python enum with equal name but different meaning: + +```python +@enum.unique +class RemoteStorageKind(str, enum.Enum): + LOCAL_FS = "local_fs" + MOCK_S3 = "mock_s3" + REAL_S3 = "real_s3" +``` + +* `LOCAL_FS` => `LocalFs` +* `MOCK_S3`: starts [`moto`](https://github.com/getmoto/moto)'s S3 implementation, then configures Pageserver with `AwsS3` +* `REAL_S3` => configure `AwsS3` as detailed below + +When a test in the test suite needs an `AwsS3`, it is supposed to call `remote_storage.s3_storage()`. +That function checks env var `ENABLE_REAL_S3_REMOTE_STORAGE`: +* If it is not set, use `MOCK_S3` +* If it is set, use `REAL_S3`. + +For `REAL_S3`, the test suite creates the dict/toml representation of the `RemoteStorageKind::AwsS3` based on env vars: + +```rust +pub struct S3Config { + // test suite env var: REMOTE_STORAGE_S3_BUCKET + pub bucket_name: String, + // test suite env var: REMOTE_STORAGE_S3_REGION + pub bucket_region: String, + // test suite determines this + pub prefix_in_bucket: Option, + // no env var exists; test suite sets it for MOCK_S3, because that's how moto works + pub endpoint: Option, + ... 
+} +``` + +*Credentials* are not part of the config, but discovered by the AWS SDK. +See the `libs/remote_storage` Rust code. +We're documenting two mechanism here: + +The test suite supports two mechanisms (`remote_storage.py`): + +**Credential mechanism 1**: env vars `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`. +Populate the env vars with AWS access keys that you created in IAM. +Our CI uses this mechanism. +However, it is _not_ recommended for interactive use by developers ([learn more](https://docs.aws.amazon.com/sdkref/latest/guide/access-users.html#credentials-long-term)). +Instead, use profiles (next section). + +**Credential mechanism 2**: env var `AWS_PROFILE`. +This uses the AWS SDK's (and CLI's) profile mechanism. +Learn more about it [in the official docs](https://docs.aws.amazon.com/sdkref/latest/guide/file-format.html). +After configuring a profile (e.g. via the aws CLI), set the env var to its name. + +In conclusion, the full command line is: + +```bash +# with long-term AWS access keys +ENABLE_REAL_S3_REMOTE_STORAGE=true \ +REMOTE_STORAGE_S3_BUCKET=mybucket \ +REMOTE_STORAGE_S3_REGION=eu-central-1 \ +AWS_ACCESS_KEY_ID=... \ +AWS_SECRET_ACCESS_KEY=... \ +./scripts/pytest +``` + +```bash +# with AWS PROFILE +ENABLE_REAL_S3_REMOTE_STORAGE=true \ +REMOTE_STORAGE_S3_BUCKET=mybucket \ +REMOTE_STORAGE_S3_REGION=eu-central-1 \ +AWS_PROFILE=... \ +./scripts/pytest +``` + +If you're using SSO, make sure to `aws sso login --profile $AWS_PROFILE` first. + +##### Minio + +If you want to run test without the cloud setup, we recommend [minio](https://min.io/docs/minio/linux/index.html). + +```bash +# Start in Terminal 1 +mkdir /tmp/minio_data +minio server /tmp/minio_data --console-address 127.0.0.1:9001 --address 127.0.0.1:9000 +``` + +In another terminal, create an `aws` CLI profile for it: + +```ini +# append to ~/.aws/config +[profile local-minio] +services = local-minio-services +[services local-minio-services] +s3 = + endpoint_url=http://127.0.0.1:9000/ +``` + + +Now configure the credentials (this is going to write `~/.aws/credentials` for you). +It's an interactive prompt. + +```bash +# Terminal 2 +$ aws --profile local-minio configure +AWS Access Key ID [None]: minioadmin +AWS Secret Access Key [None]: minioadmin +Default region name [None]: +Default output format [None]: +``` + +Now create a bucket `testbucket` using the CLI. + +```bash +# (don't forget to have AWS_PROFILE env var set; or use --profile) +aws --profile local-minio s3 mb s3://mybucket +``` + +(If it doesn't work, make sure you update your AWS CLI to a recent version. + The [service-specific endpoint feature](https://docs.aws.amazon.com/sdkref/latest/guide/feature-ss-endpoints.html) + that we're using is quite new.) + +```bash +# with AWS PROFILE +ENABLE_REAL_S3_REMOTE_STORAGE=true \ +REMOTE_STORAGE_S3_BUCKET=mybucket \ +REMOTE_STORAGE_S3_REGION=doesntmatterforminio \ +AWS_PROFILE=local-minio \ +./scripts/pytest +``` + +NB: you can avoid the `--profile` by setting the `AWS_PROFILE` variable. +Just like the AWS SDKs, the `aws` CLI is sensible to it. + +#### Running Rust tests against real S3 or S3-compatible services + +We have some Rust tests that only run against real S3, e.g., [here](https://github.com/neondatabase/neon/blob/c18d3340b5e3c978a81c3db8b6f1e83cd9087e8a/libs/remote_storage/tests/test_real_s3.rs#L392-L397). + +They use the same env vars as the Python test suite (see previous section) +but interpret them on their own. +However, at this time, the interpretation is identical. 
+ +So, above instructions apply to the Rust test as well. + ### Writing a test Every test needs a Neon Environment, or NeonEnv to operate in. A Neon Environment diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index b2da33e44a..b605757f64 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -19,7 +19,7 @@ aws-runtime = { version = "1", default-features = false, features = ["event-stre aws-sigv4 = { version = "1", features = ["http0-compat", "sign-eventstream", "sigv4a"] } aws-smithy-async = { version = "1", default-features = false, features = ["rt-tokio"] } aws-smithy-http = { version = "0.60", default-features = false, features = ["event-stream"] } -aws-smithy-types = { version = "1", default-features = false, features = ["byte-stream-poll-next", "http-body-0-4-x", "rt-tokio", "test-util"] } +aws-smithy-types = { version = "1", default-features = false, features = ["byte-stream-poll-next", "http-body-0-4-x", "http-body-1-x", "rt-tokio", "test-util"] } axum = { version = "0.6", features = ["ws"] } base64 = { version = "0.21", features = ["alloc"] } base64ct = { version = "1", default-features = false, features = ["std"] } From 39c712f2ca216a1d1556d4c0f8a846919418e661 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 9 May 2024 10:07:59 +0100 Subject: [PATCH 0739/1571] tests: adjust log allow list since reqwest upgrade (#7666) ## Problem Various performance test cases were destabilized by the recent upgrade of `reqwest`, because it changes an error string. Examples: - https://neon-github-public-dev.s3.amazonaws.com/reports/main/9005532594/index.html#testresult/3f984e471a9029a5/ - https://neon-github-public-dev.s3.amazonaws.com/reports/main/9005532594/index.html#testresult/8bd0f095fe0402b7/ The performance tests suffer from this more than most tests, because they churn enough data that the pageserver is still trying to contact the storage controller while it is shut down at the end of tests. ## Summary of changes s/Connection refused/error sending request/ --- test_runner/fixtures/pageserver/allowed_errors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index e560844944..58a76d7586 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -88,7 +88,7 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( ".*Flushed oversized open layer with size.*", # During teardown, we stop the storage controller before the pageservers, so pageservers # can experience connection errors doing background deletion queue work. - ".*WARN deletion backend: calling control plane generation validation API failed.*Connection refused.*", + ".*WARN deletion backend: calling control plane generation validation API failed.*error sending request.*", # Can happen when the test shuts down the storage controller while it is calling the utilization API ".*WARN.*path=/v1/utilization .*request was dropped before completing", ) From 107f53529409533fec5e1ca39abf9acde8161862 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 9 May 2024 12:33:09 +0100 Subject: [PATCH 0740/1571] storage controller: fix handing of tenants with no timelines during scheduling optimization (#7673) ## Problem Storage controller was using a zero layer count in SecondaryProgress as a proxy for "not initialized". However, in tenants with zero timelines (a legitimate state), the layer count remains zero forever. 
This caused https://github.com/neondatabase/neon/pull/7583 to destabilize the storage controller scale test, which creates lots of tenants, some of which don't get any timelines. ## Summary of changes - Use a None mtime instead of zero layer count to determine if a SecondaryProgress should be ignored. - Adjust the test to use a shorter heatmap upload period to let it proceed faster while waiting for scheduling optimizations to complete. --- storage_controller/src/service.rs | 2 +- test_runner/performance/test_storage_controller_scale.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index d3a53066c9..ae7e8d3d7d 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -4745,7 +4745,7 @@ impl Service { // them in an optimization const DOWNLOAD_FRESHNESS_THRESHOLD: u64 = 10 * 1024 * 1024 * 1024; - if progress.bytes_total == 0 + if progress.heatmap_mtime.is_none() || progress.bytes_total < DOWNLOAD_FRESHNESS_THRESHOLD && progress.bytes_downloaded != progress.bytes_total || progress.bytes_total - progress.bytes_downloaded diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index 17dc96dabe..632d465c3f 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -102,6 +102,9 @@ def test_storage_controller_many_tenants( tenant_id, shard_count, stripe_size, + # Upload heatmaps fast, so that secondary downloads happen promptly, enabling + # the controller's optimization migrations to proceed promptly. + tenant_config={"heatmap_period": "10s"}, placement_policy={"Attached": 1}, ) futs.append(f) From 41fb838799ca2b0e3c20c440d49151b7153d9ff8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 9 May 2024 16:01:16 +0200 Subject: [PATCH 0741/1571] Fix tiered compaction k-merge bug and use in-memory alternative (#7661) This PR does two things: First, it fixes a bug with tiered compaction's k-merge implementation. It ignored the lsn of a key during ordering, so multiple updates of the same key could be read in arbitrary order, say from different layers. For example there is layers `[(a, 2),(b, 3)]` and `[(a, 1),(c, 2)]` in the heap, they might return `(a,2)` and `(a,1)`. Ultimately, this change wasn't enough to fix the ordering issues in #7296, in other words there is likely still bugs in the k-merge. So as the second thing, we switch away from the k-merge to an in-memory based one, similar to #4839, but leave the code around to be improved and maybe switched to later on. 
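As a self-contained illustration of the required ordering (plain tuples stand in for the
real compaction entry types; the actual change is in the diff below):

```rust
fn main() {
    // (key, lsn, value) entries as they might arrive from two different delta layers.
    let mut entries = vec![("a", 2u64, "v@2"), ("b", 3, "w@3"), ("a", 1, "v@1"), ("c", 2, "x@2")];

    // Sorting by key alone leaves the two "a" updates in whatever order the layers
    // produced them, here ("a", 2) before ("a", 1). Sorting by the (key, lsn) pair
    // guarantees that updates of the same key are read oldest-first.
    entries.sort_by_key(|&(key, lsn, _)| (key, lsn));

    assert_eq!(entries[0], ("a", 1, "v@1"));
    assert_eq!(entries[1], ("a", 2, "v@2"));
}
```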
Part of #7296 --- pageserver/compaction/src/compact_tiered.rs | 9 +++++-- pageserver/compaction/src/helpers.rs | 30 ++++++++++++++++++--- 2 files changed, 34 insertions(+), 5 deletions(-) diff --git a/pageserver/compaction/src/compact_tiered.rs b/pageserver/compaction/src/compact_tiered.rs index 137b93055a..12882c9d59 100644 --- a/pageserver/compaction/src/compact_tiered.rs +++ b/pageserver/compaction/src/compact_tiered.rs @@ -24,7 +24,9 @@ use tracing::{debug, info}; use std::collections::{HashSet, VecDeque}; use std::ops::Range; -use crate::helpers::{accum_key_values, keyspace_total_size, merge_delta_keys, overlaps_with}; +use crate::helpers::{ + accum_key_values, keyspace_total_size, merge_delta_keys_buffered, overlaps_with, +}; use crate::interface::*; use utils::lsn::Lsn; @@ -535,7 +537,10 @@ where } } // Open stream - let key_value_stream = std::pin::pin!(merge_delta_keys::(deltas.as_slice(), ctx)); + let key_value_stream = + std::pin::pin!(merge_delta_keys_buffered::(deltas.as_slice(), ctx) + .await? + .map(Result::<_, anyhow::Error>::Ok)); let mut new_jobs = Vec::new(); // Slide a window through the keyspace diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs index eb0e5ee82a..06454ee1d0 100644 --- a/pageserver/compaction/src/helpers.rs +++ b/pageserver/compaction/src/helpers.rs @@ -14,6 +14,7 @@ use std::future::Future; use std::ops::{DerefMut, Range}; use std::pin::Pin; use std::task::{ready, Poll}; +use utils::lsn::Lsn; pub fn keyspace_total_size( keyspace: &CompactionKeySpace, @@ -109,17 +110,40 @@ pub fn merge_delta_keys<'a, E: CompactionJobExecutor>( } } +pub async fn merge_delta_keys_buffered<'a, E: CompactionJobExecutor + 'a>( + layers: &'a [E::DeltaLayer], + ctx: &'a E::RequestContext, +) -> anyhow::Result>::DeltaEntry<'a>>> +{ + let mut keys = Vec::new(); + for l in layers { + // Boxing and casting to LoadFuture is required to obtain the right Sync bound. + // If we do l.load_keys(ctx).await? directly, there is a compilation error. 
+ let load_future: LoadFuture<'a, _> = Box::pin(l.load_keys(ctx)); + keys.extend(load_future.await?.into_iter()); + } + keys.sort_by_key(|k| (k.key(), k.lsn())); + let stream = futures::stream::iter(keys.into_iter()); + Ok(stream) +} + enum LazyLoadLayer<'a, E: CompactionJobExecutor> { Loaded(VecDeque<>::DeltaEntry<'a>>), Unloaded(&'a E::DeltaLayer), } impl<'a, E: CompactionJobExecutor> LazyLoadLayer<'a, E> { - fn key(&self) -> E::Key { + fn min_key(&self) -> E::Key { match self { Self::Loaded(entries) => entries.front().unwrap().key(), Self::Unloaded(dl) => dl.key_range().start, } } + fn min_lsn(&self) -> Lsn { + match self { + Self::Loaded(entries) => entries.front().unwrap().lsn(), + Self::Unloaded(dl) => dl.lsn_range().start, + } + } } impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> { fn partial_cmp(&self, other: &Self) -> Option { @@ -129,12 +153,12 @@ impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> { impl<'a, E: CompactionJobExecutor> Ord for LazyLoadLayer<'a, E> { fn cmp(&self, other: &Self) -> std::cmp::Ordering { // reverse order so that we get a min-heap - other.key().cmp(&self.key()) + (other.min_key(), other.min_lsn()).cmp(&(self.min_key(), self.min_lsn())) } } impl<'a, E: CompactionJobExecutor> PartialEq for LazyLoadLayer<'a, E> { fn eq(&self, other: &Self) -> bool { - self.key().eq(&other.key()) + self.cmp(other) == std::cmp::Ordering::Equal } } impl<'a, E: CompactionJobExecutor> Eq for LazyLoadLayer<'a, E> {} From 2682e0254ffb82f2e1eef0ec875346742b6e8b4e Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Thu, 9 May 2024 11:15:19 -0400 Subject: [PATCH 0742/1571] Revert "chore(neon_test_utils): restrict installation to superuser" (#7679) This reverts commit 1173ee6a7e1168e671a6847eb94807b45c703490. ## Problem It breaks autoscaling tests --- pgxn/neon_test_utils/neon_test_utils.control | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pgxn/neon_test_utils/neon_test_utils.control b/pgxn/neon_test_utils/neon_test_utils.control index 8c5b9b5dfe..5f6d640835 100644 --- a/pgxn/neon_test_utils/neon_test_utils.control +++ b/pgxn/neon_test_utils/neon_test_utils.control @@ -3,5 +3,4 @@ comment = 'helpers for neon testing and debugging' default_version = '1.1' module_pathname = '$libdir/neon_test_utils' relocatable = true -trusted = false -superuser = true +trusted = true From 5ea117cddfe3bc58c500f0eff8352af796b58268 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 9 May 2024 17:55:57 +0000 Subject: [PATCH 0743/1571] build(deps): bump Npgsql from 8.0.2 to 8.0.3 in /test_runner/pg_clients/csharp/npgsql (#7680) --- test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj b/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj index 50243e3ea7..edf2a01337 100644 --- a/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj +++ b/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj @@ -8,7 +8,7 @@ - + From b9fd8dcf13e13b804047fc21089d2ecb509a1548 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Thu, 9 May 2024 15:52:56 -0400 Subject: [PATCH 0744/1571] fix(test): update the config for neon_binpath in from_repo_dir (#7684) ## Problem https://github.com/neondatabase/neon/pull/7637 breaks forward compat test. On commit ea531d448eb65c4f58abb9ef7d8cd461952f7c5f. 
https://neon-github-public-dev.s3.amazonaws.com/reports/main/8988324349/index.html ``` test_create_snapshot 2024-05-07T16:03:11.331883Z INFO version: git-env:ea531d448eb65c4f58abb9ef7d8cd461952f7c5f failpoints: true, features: ["testing"] launch_timestamp: 2024-05-07 16:03:11.316131763 UTC build_tag: build_tag-env:5159 test_forward_compatibility 2024-05-07T16:07:02.310769Z INFO version: git-env:ea531d448eb65c4f58abb9ef7d8cd461952f7c5f failpoints: true, features: ["testing"] launch_timestamp: 2024-05-07 16:07:02.294676183 UTC build_tag: build_tag-env:5159 ``` The forward compatibility test is actually using the same tag as the current build. The commit before that, https://neon-github-public-dev.s3.amazonaws.com/reports/main/8988126011/index.html ``` test_create_snapshot 2024-05-07T15:47:21.900796Z INFO version: git-env:2dbd1c1ed5cd0458933e8ffd40a9c0a5f4d610b8 failpoints: true, features: ["testing"] launch_timestamp: 2024-05-07 15:47:21.882784185 UTC build_tag: build_tag-env:5158 test_forward_compatibility 2024-05-07T15:50:48.828733Z INFO version: git-env:c4d7d5982553d2cf66634d1fbf85d95ef44a6524 failpoints: true, features: ["testing"] launch_timestamp: 2024-05-07 15:50:48.816635176 UTC build_tag: build_tag-env:release-5434 ``` This pull request patches the bin path so that the new neon_local will use the old binary. --------- Signed-off-by: Alex Chi Z --- test_runner/fixtures/neon_fixtures.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 390b94c2ea..da379693a0 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -701,6 +701,11 @@ class NeonEnvBuilder: config["default_tenant_id"] = snapshot_config["default_tenant_id"] config["branch_name_mappings"] = snapshot_config["branch_name_mappings"] + # Update the config with new neon + postgres path in case of compat test + # FIXME: overriding pg_distrib_dir cause storage controller fail to start + # config["pg_distrib_dir"] = str(self.pg_distrib_dir) + config["neon_distrib_dir"] = str(self.neon_binpath) + with (self.repo_dir / "config").open("w") as f: toml.dump(config, f) From be1a88e574379ef29005e5a8760105509046584a Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Fri, 10 May 2024 12:17:00 +0200 Subject: [PATCH 0745/1571] Proxy added per ep rate limiter (#7636) ## Problem There is no global per-ep rate limiter in proxy. ## Summary of changes * Return global per-ep rate limiter back. * Rename weak compute rate limiter (the cli flags were not used anywhere, so it's safe to rename). 
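To make the per-endpoint limit concrete, here is a deliberately simplified, self-contained
sketch of the idea: each endpoint gets its own set of fixed windows and a request must fit
in all of them. This is not the proxy's actual `EndpointRateLimiter`/`BucketRateLimiter`
(see `rate_limiter/limiter.rs` in the diff); the types below are made up and only the
default numbers are taken from this patch.

```rust
use std::collections::HashMap;
use std::time::{Duration, Instant};

/// One fixed window: at most `max` units per `interval`.
struct Window {
    interval: Duration,
    max: u32,
    started: Instant,
    used: u32,
}

/// Every endpoint gets its own windows (e.g. 1s / 60s / 600s).
struct PerEndpointLimiter {
    spec: Vec<(Duration, u32)>,
    state: HashMap<String, Vec<Window>>,
}

impl PerEndpointLimiter {
    fn new(spec: Vec<(Duration, u32)>) -> Self {
        Self { spec, state: HashMap::new() }
    }

    fn check(&mut self, endpoint: &str, n: u32) -> bool {
        let spec = &self.spec;
        let windows = self.state.entry(endpoint.to_owned()).or_insert_with(|| {
            spec.iter()
                .map(|&(interval, max)| Window { interval, max, started: Instant::now(), used: 0 })
                .collect()
        });

        let now = Instant::now();
        for w in windows.iter_mut() {
            if now.duration_since(w.started) >= w.interval {
                // The window has elapsed: start a fresh one.
                w.started = now;
                w.used = 0;
            }
        }
        if windows.iter().any(|w| w.used + n > w.max) {
            return false; // callers map this to a "too many connections" error
        }
        for w in windows.iter_mut() {
            w.used += n;
        }
        true
    }
}

fn main() {
    // Numbers loosely modeled on the per-endpoint defaults added in this patch.
    let mut limiter = PerEndpointLimiter::new(vec![
        (Duration::from_secs(1), 500),
        (Duration::from_secs(60), 300),
        (Duration::from_secs(600), 200),
    ]);
    assert!(limiter.check("ep-example-123456", 1));
    assert!(!limiter.check("ep-example-123456", 1_000));
}
```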
--- proxy/src/auth/backend.rs | 73 ++++++++++++++++++++++++------ proxy/src/bin/proxy.rs | 26 ++++++++--- proxy/src/console/provider/neon.rs | 8 ++-- proxy/src/proxy.rs | 7 ++- proxy/src/rate_limiter/limiter.rs | 17 ++++--- proxy/src/serverless.rs | 19 ++++++-- proxy/src/serverless/backend.rs | 8 ++++ proxy/src/serverless/websocket.rs | 3 ++ 8 files changed, 126 insertions(+), 35 deletions(-) diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 3795e3b608..6a906b299b 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -13,7 +13,7 @@ use tokio_postgres::config::AuthKeys; use tracing::{info, warn}; use crate::auth::credentials::check_peer_addr_is_in_list; -use crate::auth::validate_password_and_exchange; +use crate::auth::{validate_password_and_exchange, AuthError}; use crate::cache::Cached; use crate::console::errors::GetAuthInfoError; use crate::console::provider::{CachedRoleSecret, ConsoleBackend}; @@ -23,7 +23,7 @@ use crate::intern::EndpointIdInt; use crate::metrics::Metrics; use crate::proxy::connect_compute::ComputeConnectBackend; use crate::proxy::NeonOptions; -use crate::rate_limiter::{BucketRateLimiter, RateBucketInfo}; +use crate::rate_limiter::{BucketRateLimiter, EndpointRateLimiter, RateBucketInfo}; use crate::stream::Stream; use crate::{ auth::{self, ComputeUserInfoMaybeEndpoint}, @@ -280,6 +280,7 @@ async fn auth_quirks( client: &mut stream::PqStream>, allow_cleartext: bool, config: &'static AuthenticationConfig, + endpoint_rate_limiter: Arc, ) -> auth::Result { // If there's no project so far, that entails that client doesn't // support SNI or other means of passing the endpoint (project) name. @@ -305,6 +306,10 @@ async fn auth_quirks( if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) { return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr)); } + + if !endpoint_rate_limiter.check(info.endpoint.clone().into(), 1) { + return Err(AuthError::too_many_connections()); + } let cached_secret = match maybe_secret { Some(secret) => secret, None => api.get_role_secret(ctx, &info).await?, @@ -417,6 +422,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { client: &mut stream::PqStream>, allow_cleartext: bool, config: &'static AuthenticationConfig, + endpoint_rate_limiter: Arc, ) -> auth::Result> { use BackendType::*; @@ -428,8 +434,16 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { "performing authentication using the console" ); - let credentials = - auth_quirks(ctx, &*api, user_info, client, allow_cleartext, config).await?; + let credentials = auth_quirks( + ctx, + &*api, + user_info, + client, + allow_cleartext, + config, + endpoint_rate_limiter, + ) + .await?; BackendType::Console(api, credentials) } // NOTE: this auth backend doesn't use client credentials. 
@@ -539,7 +553,7 @@ mod tests { }, context::RequestMonitoring, proxy::NeonOptions, - rate_limiter::RateBucketInfo, + rate_limiter::{EndpointRateLimiter, RateBucketInfo}, scram::ServerSecret, stream::{PqStream, Stream}, }; @@ -699,10 +713,20 @@ mod tests { _ => panic!("wrong message"), } }); + let endpoint_rate_limiter = + Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET)); - let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, false, &CONFIG) - .await - .unwrap(); + let _creds = auth_quirks( + &mut ctx, + &api, + user_info, + &mut stream, + false, + &CONFIG, + endpoint_rate_limiter, + ) + .await + .unwrap(); handle.await.unwrap(); } @@ -739,10 +763,20 @@ mod tests { frontend::password_message(b"my-secret-password", &mut write).unwrap(); client.write_all(&write).await.unwrap(); }); + let endpoint_rate_limiter = + Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET)); - let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, &CONFIG) - .await - .unwrap(); + let _creds = auth_quirks( + &mut ctx, + &api, + user_info, + &mut stream, + true, + &CONFIG, + endpoint_rate_limiter, + ) + .await + .unwrap(); handle.await.unwrap(); } @@ -780,9 +814,20 @@ mod tests { client.write_all(&write).await.unwrap(); }); - let creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, &CONFIG) - .await - .unwrap(); + let endpoint_rate_limiter = + Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET)); + + let creds = auth_quirks( + &mut ctx, + &api, + user_info, + &mut stream, + true, + &CONFIG, + endpoint_rate_limiter, + ) + .await + .unwrap(); assert_eq!(creds.info.endpoint, "my-endpoint"); diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 5399f13edd..be7d961b8c 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -144,6 +144,9 @@ struct ProxyCliArgs { /// Can be given multiple times for different bucket sizes. #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)] endpoint_rps_limit: Vec, + /// Wake compute rate limiter max number of requests per second. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] + wake_compute_limit: Vec, /// Whether the auth rate limiter actually takes effect (for testing) #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] auth_rate_limit_enabled: bool, @@ -154,7 +157,7 @@ struct ProxyCliArgs { #[clap(long, default_value_t = 64)] auth_rate_limit_ip_subnet: u8, /// Redis rate limiter max number of requests per second. - #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)] + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] redis_rps_limit: Vec, /// cache for `allowed_ips` (use `size=0` to disable) #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] @@ -365,6 +368,10 @@ async fn main() -> anyhow::Result<()> { proxy::metrics::CancellationSource::FromClient, )); + let mut endpoint_rps_limit = args.endpoint_rps_limit.clone(); + RateBucketInfo::validate(&mut endpoint_rps_limit)?; + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(endpoint_rps_limit)); + // client facing tasks. 
these will exit on error or on cancellation // cancellation returns Ok(()) let mut client_tasks = JoinSet::new(); @@ -373,6 +380,7 @@ async fn main() -> anyhow::Result<()> { proxy_listener, cancellation_token.clone(), cancellation_handler.clone(), + endpoint_rate_limiter.clone(), )); // TODO: rename the argument to something like serverless. @@ -387,6 +395,7 @@ async fn main() -> anyhow::Result<()> { serverless_listener, cancellation_token.clone(), cancellation_handler.clone(), + endpoint_rate_limiter.clone(), )); } @@ -559,11 +568,16 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { let url = args.auth_endpoint.parse()?; let endpoint = http::Endpoint::new(url, http::new_client()); - let mut endpoint_rps_limit = args.endpoint_rps_limit.clone(); - RateBucketInfo::validate(&mut endpoint_rps_limit)?; - let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(endpoint_rps_limit)); - let api = - console::provider::neon::Api::new(endpoint, caches, locks, endpoint_rate_limiter); + let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); + RateBucketInfo::validate(&mut wake_compute_rps_limit)?; + let wake_compute_endpoint_rate_limiter = + Arc::new(EndpointRateLimiter::new(wake_compute_rps_limit)); + let api = console::provider::neon::Api::new( + endpoint, + caches, + locks, + wake_compute_endpoint_rate_limiter, + ); let api = console::provider::ConsoleBackend::Console(api); auth::BackendType::Console(MaybeOwned::Owned(api), ()) } diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index ec66641d01..7728d2cafa 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -26,7 +26,7 @@ pub struct Api { endpoint: http::Endpoint, pub caches: &'static ApiCaches, pub locks: &'static ApiLocks, - pub endpoint_rate_limiter: Arc, + pub wake_compute_endpoint_rate_limiter: Arc, jwt: String, } @@ -36,7 +36,7 @@ impl Api { endpoint: http::Endpoint, caches: &'static ApiCaches, locks: &'static ApiLocks, - endpoint_rate_limiter: Arc, + wake_compute_endpoint_rate_limiter: Arc, ) -> Self { let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") { Ok(v) => v, @@ -46,7 +46,7 @@ impl Api { endpoint, caches, locks, - endpoint_rate_limiter, + wake_compute_endpoint_rate_limiter, jwt, } } @@ -283,7 +283,7 @@ impl super::Api for Api { // check rate limit if !self - .endpoint_rate_limiter + .wake_compute_endpoint_rate_limiter .check(user_info.endpoint.normalize().into(), 1) { return Err(WakeComputeError::TooManyConnections); diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index e4e095d77d..5824b70df9 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -19,6 +19,7 @@ use crate::{ metrics::{Metrics, NumClientConnectionsGuard}, protocol2::read_proxy_protocol, proxy::handshake::{handshake, HandshakeData}, + rate_limiter::EndpointRateLimiter, stream::{PqStream, Stream}, EndpointCacheKey, }; @@ -61,6 +62,7 @@ pub async fn task_main( listener: tokio::net::TcpListener, cancellation_token: CancellationToken, cancellation_handler: Arc, + endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { scopeguard::defer! 
{ info!("proxy has shut down"); @@ -86,6 +88,7 @@ pub async fn task_main( let cancellation_handler = Arc::clone(&cancellation_handler); tracing::info!(protocol = "tcp", %session_id, "accepted new TCP connection"); + let endpoint_rate_limiter2 = endpoint_rate_limiter.clone(); connections.spawn(async move { let (socket, peer_addr) = match read_proxy_protocol(socket).await{ @@ -123,6 +126,7 @@ pub async fn task_main( cancellation_handler, socket, ClientMode::Tcp, + endpoint_rate_limiter2, conn_gauge, ) .instrument(span.clone()) @@ -234,6 +238,7 @@ pub async fn handle_client( cancellation_handler: Arc, stream: S, mode: ClientMode, + endpoint_rate_limiter: Arc, conn_gauge: NumClientConnectionsGuard<'static>, ) -> Result>, ClientRequestError> { info!( @@ -243,7 +248,6 @@ pub async fn handle_client( let metrics = &Metrics::get().proxy; let proto = ctx.protocol; - // let _client_gauge = metrics.client_connections.guard(proto); let _request_gauge = metrics.connection_requests.guard(proto); let tls = config.tls_config.as_ref(); @@ -286,6 +290,7 @@ pub async fn handle_client( &mut stream, mode.allow_cleartext(), &config.authentication_config, + endpoint_rate_limiter, ) .await { diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index 5ba2c36436..b8c9490696 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -128,12 +128,18 @@ impl std::str::FromStr for RateBucketInfo { } impl RateBucketInfo { - pub const DEFAULT_ENDPOINT_SET: [Self; 3] = [ + pub const DEFAULT_SET: [Self; 3] = [ Self::new(300, Duration::from_secs(1)), Self::new(200, Duration::from_secs(60)), Self::new(100, Duration::from_secs(600)), ]; + pub const DEFAULT_ENDPOINT_SET: [Self; 3] = [ + Self::new(500, Duration::from_secs(1)), + Self::new(300, Duration::from_secs(60)), + Self::new(200, Duration::from_secs(600)), + ]; + pub fn validate(info: &mut [Self]) -> anyhow::Result<()> { info.sort_unstable_by_key(|info| info.interval); let invalid = info @@ -266,7 +272,7 @@ mod tests { #[test] fn default_rate_buckets() { - let mut defaults = RateBucketInfo::DEFAULT_ENDPOINT_SET; + let mut defaults = RateBucketInfo::DEFAULT_SET; RateBucketInfo::validate(&mut defaults[..]).unwrap(); } @@ -333,11 +339,8 @@ mod tests { let rand = rand::rngs::StdRng::from_seed([1; 32]); let hasher = BuildHasherDefault::::default(); - let limiter = BucketRateLimiter::new_with_rand_and_hasher( - &RateBucketInfo::DEFAULT_ENDPOINT_SET, - rand, - hasher, - ); + let limiter = + BucketRateLimiter::new_with_rand_and_hasher(&RateBucketInfo::DEFAULT_SET, rand, hasher); for i in 0..1_000_000 { limiter.check(i, 1); } diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index cbff51f207..f634ab4e98 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -36,6 +36,7 @@ use crate::context::RequestMonitoring; use crate::metrics::Metrics; use crate::protocol2::read_proxy_protocol; use crate::proxy::run_until_cancelled; +use crate::rate_limiter::EndpointRateLimiter; use crate::serverless::backend::PoolingBackend; use crate::serverless::http_util::{api_error_into_response, json_response}; @@ -54,6 +55,7 @@ pub async fn task_main( ws_listener: TcpListener, cancellation_token: CancellationToken, cancellation_handler: Arc, + endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { scopeguard::defer! 
{ info!("websocket server has shut down"); @@ -82,6 +84,7 @@ pub async fn task_main( let backend = Arc::new(PoolingBackend { pool: Arc::clone(&conn_pool), config, + endpoint_rate_limiter: Arc::clone(&endpoint_rate_limiter), }); let tls_config = match config.tls_config.as_ref() { @@ -129,6 +132,7 @@ pub async fn task_main( backend.clone(), connections.clone(), cancellation_handler.clone(), + endpoint_rate_limiter.clone(), conn_token.clone(), server.clone(), tls_acceptor.clone(), @@ -162,6 +166,7 @@ async fn connection_handler( backend: Arc, connections: TaskTracker, cancellation_handler: Arc, + endpoint_rate_limiter: Arc, cancellation_token: CancellationToken, server: Builder, tls_acceptor: TlsAcceptor, @@ -245,6 +250,7 @@ async fn connection_handler( session_id, peer_addr, http_request_token, + endpoint_rate_limiter.clone(), ) .in_current_span() .map_ok_or_else(api_error_into_response, |r| r), @@ -285,6 +291,7 @@ async fn request_handler( peer_addr: IpAddr, // used to cancel in-flight HTTP requests. not used to cancel websockets http_cancellation_token: CancellationToken, + endpoint_rate_limiter: Arc, ) -> Result>, ApiError> { let host = request .headers() @@ -310,9 +317,15 @@ async fn request_handler( ws_connections.spawn( async move { - if let Err(e) = - websocket::serve_websocket(config, ctx, websocket, cancellation_handler, host) - .await + if let Err(e) = websocket::serve_websocket( + config, + ctx, + websocket, + cancellation_handler, + endpoint_rate_limiter, + host, + ) + .await { error!("error in websocket connection: {e:#}"); } diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index ce58f575e2..6b79c12316 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -16,6 +16,7 @@ use crate::{ context::RequestMonitoring, error::{ErrorKind, ReportableError, UserFacingError}, proxy::{connect_compute::ConnectMechanism, retry::ShouldRetry}, + rate_limiter::EndpointRateLimiter, Host, }; @@ -24,6 +25,7 @@ use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool}; pub struct PoolingBackend { pub pool: Arc>, pub config: &'static ProxyConfig, + pub endpoint_rate_limiter: Arc, } impl PoolingBackend { @@ -39,6 +41,12 @@ impl PoolingBackend { if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) { return Err(AuthError::ip_address_not_allowed(ctx.peer_addr)); } + if !self + .endpoint_rate_limiter + .check(conn_info.user_info.endpoint.clone().into(), 1) + { + return Err(AuthError::too_many_connections()); + } let cached_secret = match maybe_secret { Some(secret) => secret, None => backend.get_role_secret(ctx).await?, diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index b6cd85af73..649bec2c7c 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -5,6 +5,7 @@ use crate::{ error::{io_error, ReportableError}, metrics::Metrics, proxy::{handle_client, ClientMode}, + rate_limiter::EndpointRateLimiter, }; use bytes::{Buf, Bytes}; use futures::{Sink, Stream}; @@ -134,6 +135,7 @@ pub async fn serve_websocket( mut ctx: RequestMonitoring, websocket: HyperWebsocket, cancellation_handler: Arc, + endpoint_rate_limiter: Arc, hostname: Option, ) -> anyhow::Result<()> { let websocket = websocket.await?; @@ -148,6 +150,7 @@ pub async fn serve_websocket( cancellation_handler, WebSocketRw::new(websocket), ClientMode::Websockets { hostname }, + endpoint_rate_limiter, conn_gauge, ) .await; From 13d9589c35d444b444b6ed9ef4d8d7144ad232d0 Mon Sep 17 00:00:00 2001 From: John Spray 
Date: Fri, 10 May 2024 12:01:39 +0100 Subject: [PATCH 0746/1571] pageserver: don't call get_vectored with empty keyspace (#7686) ## Problem This caused a variation of the stats bug fixed by https://github.com/neondatabase/neon/pull/7662. That PR also fixed this case, but we still shouldn't make redundant get calls. ## Summary of changes - Only call get in the create image layers loop at the end of a range if some keys have been accumulated --- pageserver/src/tenant/timeline.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 5983529a44..60b3873b71 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4230,7 +4230,7 @@ impl Timeline { // Maybe flush `key_rest_accum` if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS - || last_key_in_range + || (last_key_in_range && key_request_accum.raw_size() > 0) { let results = self .get_vectored(key_request_accum.consume_keyspace(), lsn, ctx) From 873b2220808e0bc059edb631186ed19b0238394d Mon Sep 17 00:00:00 2001 From: Andrey Taranik Date: Fri, 10 May 2024 15:04:23 +0400 Subject: [PATCH 0747/1571] use own arm64 gha runners (#7373) ## Problem Move from aws based arm64 runners to bare-metal based ## Summary of changes Changes in GitHub action workflows where `runs-on: arm64` used. More parallelism added, build time for `neon with extra platform builds` workflow reduced from 45m to 25m --- .config/nextest.toml | 2 +- .github/actionlint.yml | 4 +-- .github/workflows/build-build-tools-image.yml | 2 +- .github/workflows/neon_extra_builds.yml | 33 ++++++++++++------- 4 files changed, 24 insertions(+), 17 deletions(-) diff --git a/.config/nextest.toml b/.config/nextest.toml index a9398e4ab0..affdc16f31 100644 --- a/.config/nextest.toml +++ b/.config/nextest.toml @@ -1,2 +1,2 @@ [profile.default] -slow-timeout = { period = "20s", terminate-after = 3 } +slow-timeout = { period = "60s", terminate-after = 3 } diff --git a/.github/actionlint.yml b/.github/actionlint.yml index cb36e2eee6..942861ecd8 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -1,11 +1,9 @@ self-hosted-runner: labels: - arm64 - - dev - gen3 - large - # Remove `macos-14` from the list after https://github.com/rhysd/actionlint/pull/392 is merged. 
- - macos-14 + - large-arm64 - small - us-east-2 config-variables: diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index c527cef1ac..bdf00bcaae 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -39,7 +39,7 @@ jobs: matrix: arch: [ x64, arm64 ] - runs-on: ${{ fromJson(format('["self-hosted", "dev", "{0}"]', matrix.arch)) }} + runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} env: IMAGE_TAG: ${{ inputs.image-tag }} diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 5a2f9d6645..fdb03963fb 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -136,7 +136,7 @@ jobs: check-linux-arm-build: needs: [ check-permissions, build-build-tools-image ] timeout-minutes: 90 - runs-on: [ self-hosted, dev, arm64 ] + runs-on: [ self-hosted, large-arm64 ] env: # Use release build only, to have less debug info around @@ -232,20 +232,20 @@ jobs: - name: Run cargo build run: | - mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests + mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests -j$(nproc) - name: Run cargo test env: NEXTEST_RETRIES: 3 run: | - cargo nextest run $CARGO_FEATURES + cargo nextest run $CARGO_FEATURES -j$(nproc) # Run separate tests for real S3 export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests export REMOTE_STORAGE_S3_REGION=eu-central-1 # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - cargo nextest run --package remote_storage --test test_real_s3 + cargo nextest run --package remote_storage --test test_real_s3 -j$(nproc) # Run separate tests for real Azure Blob Storage # XXX: replace region with `eu-central-1`-like region @@ -255,12 +255,12 @@ jobs: export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}" export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}" # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - cargo nextest run --package remote_storage --test test_real_azure + cargo nextest run --package remote_storage --test test_real_azure -j$(nproc) check-codestyle-rust-arm: needs: [ check-permissions, build-build-tools-image ] timeout-minutes: 90 - runs-on: [ self-hosted, dev, arm64 ] + runs-on: [ self-hosted, large-arm64 ] container: image: ${{ needs.build-build-tools-image.outputs.image }} @@ -269,6 +269,11 @@ jobs: password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init + strategy: + fail-fast: false + matrix: + build_type: [ debug, release ] + steps: - name: Fix git ownership run: | @@ -305,31 +310,35 @@ jobs: exit 1 fi echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV + - name: Run cargo clippy (debug) + if: matrix.build_type == 'debug' run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS - name: Run cargo clippy (release) + if: matrix.build_type == 'release' run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS - name: Check documentation generation - run: cargo doc --workspace --no-deps --document-private-items + if: matrix.build_type == 'release' + run: cargo doc --workspace --no-deps --document-private-items -j$(nproc) env: RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links" # Use `${{ !cancelled() }}` to run quck tests after the longer 
clippy run - name: Check formatting - if: ${{ !cancelled() }} + if: ${{ !cancelled() && matrix.build_type == 'release' }} run: cargo fmt --all -- --check # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci - name: Check rust dependencies - if: ${{ !cancelled() }} + if: ${{ !cancelled() && matrix.build_type == 'release' }} run: | cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack # https://github.com/EmbarkStudios/cargo-deny - name: Check rust licenses/bans/advisories/sources - if: ${{ !cancelled() }} + if: ${{ !cancelled() && matrix.build_type == 'release' }} run: cargo deny check gather-rust-build-stats: @@ -338,7 +347,7 @@ jobs: contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') || contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || github.ref_name == 'main' - runs-on: [ self-hosted, gen3, large ] + runs-on: [ self-hosted, large ] container: image: ${{ needs.build-build-tools-image.outputs.image }} credentials: @@ -369,7 +378,7 @@ jobs: run: make walproposer-lib -j$(nproc) - name: Produce the build stats - run: cargo build --all --release --timings + run: cargo build --all --release --timings -j$(nproc) - name: Upload the build stats id: upload-stats From 0b02043ba4e8d477b77a1f01bef9809c1f433ab4 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Fri, 10 May 2024 13:32:42 +0100 Subject: [PATCH 0748/1571] Fix permissions for safekeeper failpoints (#7669) We didn't check permission in `"/v1/failpoints"` endpoint, it means that everyone with per-tenant token could modify the failpoints. This commit fixes that. --- safekeeper/src/http/routes.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 9ce26e6c5d..30d0081a47 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -519,6 +519,7 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder .get("/v1/status", |r| request_span(r, status_handler)) .put("/v1/failpoints", |r| { request_span(r, move |r| async { + check_permission(&r, None)?; let cancel = CancellationToken::new(); failpoints_handler(r, cancel).await }) From 86905c132205b5cbffb199fb002bc9c0484d4f43 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 10 May 2024 17:15:11 +0300 Subject: [PATCH 0749/1571] openapi: resolve the synthetic_size duplication (#7651) We had accidentally left two endpoints for `tenant`: `/synthetic_size` and `/size`. Size had the more extensive description but has returned 404 since renaming. Remove the `/size` in favor of the working one and describe the `text/html` output. 
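For reference, a rough way to exercise the remaining endpoint (host, port and tenant id
are placeholders, and it is assumed here that the JSON vs. SVG form is chosen from the
request's `Accept` header):

```bash
# JSON report; `size` is null when inputs_only=true is passed
curl -s "http://127.0.0.1:9898/v1/tenant/<tenant_id>/synthetic_size" | jq .size

# Assumed: asking for text/html returns the SVG rendering of the tenant's timelines
curl -s -H "Accept: text/html" \
  "http://127.0.0.1:9898/v1/tenant/<tenant_id>/synthetic_size" > synthetic_size.svg
```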
--- pageserver/src/http/openapi_spec.yml | 38 +++++----------------------- 1 file changed, 6 insertions(+), 32 deletions(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index c425f3e628..36c74ed140 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -420,25 +420,6 @@ paths: description: Tenant scheduled to load successfully /v1/tenant/{tenant_id}/synthetic_size: - parameters: - - name: tenant_id - in: path - required: true - schema: - type: string - get: - description: | - Calculate tenant's synthetic size - responses: - "200": - description: Tenant's synthetic size - content: - application/json: - schema: - $ref: "#/components/schemas/SyntheticSizeResponse" - - # This route has no handler. TODO: remove? - /v1/tenant/{tenant_id}/size: parameters: - name: tenant_id in: path @@ -468,19 +449,9 @@ paths: content: application/json: schema: - type: object - required: - - id - - size - properties: - id: - type: string - format: hex - size: - type: integer - nullable: true - description: | - Size metric in bytes or null if inputs_only=true was given. + $ref: "#/components/schemas/SyntheticSizeResponse" + text/html: + description: SVG representation of the tenant and it's timelines. "401": description: Unauthorized Error content: @@ -929,6 +900,9 @@ components: format: hex size: type: integer + nullable: true + description: | + Size metric in bytes or null if inputs_only=true was given. segment_sizes: type: array items: From d7f34bc3399c31f8e4c773cb5ae6f919e5d02d64 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 10 May 2024 17:41:34 +0300 Subject: [PATCH 0750/1571] draw_timeline_dir: draw branch points and gc cutoff lines (#7657) in addition to layer names, expand the input vocabulary to recognize lines in the form of: ${kind}:${lsn} where: - kind in `gc_cutoff` or `branch` - lsn is accepted in Lsn display format (x/y) or hex (as used in layer names) gc_cutoff and branch have different colors. --- pageserver/ctl/src/draw_timeline_dir.rs | 89 +++++++++++++++++++++++-- 1 file changed, 82 insertions(+), 7 deletions(-) diff --git a/pageserver/ctl/src/draw_timeline_dir.rs b/pageserver/ctl/src/draw_timeline_dir.rs index 9a556cb3d4..d8082f8ab4 100644 --- a/pageserver/ctl/src/draw_timeline_dir.rs +++ b/pageserver/ctl/src/draw_timeline_dir.rs @@ -28,6 +28,8 @@ //! # From an `index_part.json` in S3 //! (jq -r '.layer_metadata | keys[]' | cargo run -p pagectl draw-timeline ) < index_part.json-00000016 > out.svg //! +//! # enrich with lines for gc_cutoff and a child branch point +//! cat <(jq -r '.historic_layers[] | .layer_file_name' < layers.json) <(echo -e 'gc_cutoff:0000001CE3FE32C9\nbranch:0000001DE3FE32C9') | cargo run --bin pagectl draw-timeline >| out.svg //! ``` //! //! ## Viewing @@ -48,7 +50,7 @@ //! ``` //! 
-use anyhow::Result; +use anyhow::{Context, Result}; use pageserver::repository::Key; use pageserver::METADATA_FILE_NAME; use std::cmp::Ordering; @@ -90,6 +92,33 @@ fn parse_filename(name: &str) -> (Range, Range) { (keys, lsns) } +#[derive(Clone, Copy)] +enum LineKind { + GcCutoff, + Branch, +} + +impl From for Fill { + fn from(value: LineKind) -> Self { + match value { + LineKind::GcCutoff => Fill::Color(rgb(255, 0, 0)), + LineKind::Branch => Fill::Color(rgb(0, 255, 0)), + } + } +} + +impl FromStr for LineKind { + type Err = anyhow::Error; + + fn from_str(s: &str) -> std::prelude::v1::Result { + Ok(match s { + "gc_cutoff" => LineKind::GcCutoff, + "branch" => LineKind::Branch, + _ => anyhow::bail!("unsupported linekind: {s}"), + }) + } +} + pub fn main() -> Result<()> { // Parse layer filenames from stdin struct Layer { @@ -99,8 +128,29 @@ pub fn main() -> Result<()> { } let mut files: Vec = vec![]; let stdin = io::stdin(); - for line in stdin.lock().lines() { + + let mut lines: Vec<(Lsn, LineKind)> = vec![]; + + for (lineno, line) in stdin.lock().lines().enumerate() { + let lineno = lineno + 1; + let line = line.unwrap(); + if let Some((kind, lsn)) = line.split_once(':') { + let (kind, lsn) = LineKind::from_str(kind) + .context("parse kind") + .and_then(|kind| { + if lsn.contains('/') { + Lsn::from_str(lsn) + } else { + Lsn::from_hex(lsn) + } + .map(|lsn| (kind, lsn)) + .context("parse lsn") + }) + .with_context(|| format!("parse {line:?} on {lineno}"))?; + lines.push((lsn, kind)); + continue; + } let line = PathBuf::from_str(&line).unwrap(); let filename = line.file_name().unwrap(); let filename = filename.to_str().unwrap(); @@ -117,8 +167,9 @@ pub fn main() -> Result<()> { } // Collect all coordinates - let mut keys: Vec = vec![]; - let mut lsns: Vec = vec![]; + let mut keys: Vec = Vec::with_capacity(files.len()); + let mut lsns: Vec = Vec::with_capacity(files.len() + lines.len()); + for Layer { key_range: keyr, lsn_range: lsnr, @@ -131,6 +182,8 @@ pub fn main() -> Result<()> { lsns.push(lsnr.end); } + lsns.extend(lines.iter().map(|(lsn, _)| *lsn)); + // Analyze let key_map = build_coordinate_compression_map(keys); let lsn_map = build_coordinate_compression_map(lsns); @@ -144,10 +197,13 @@ pub fn main() -> Result<()> { println!( "{}", BeginSvg { - w: key_map.len() as f32, + w: (key_map.len() + 10) as f32, h: stretch * lsn_map.len() as f32 } ); + + let xmargin = 0.05; // Height-dependent margin to disambiguate overlapping deltas + for Layer { filename, key_range: keyr, @@ -169,7 +225,6 @@ pub fn main() -> Result<()> { let mut lsn_diff = (lsn_end - lsn_start) as f32; let mut fill = Fill::None; let mut ymargin = 0.05 * lsn_diff; // Height-dependent margin to disambiguate overlapping deltas - let xmargin = 0.05; // Height-dependent margin to disambiguate overlapping deltas let mut lsn_offset = 0.0; // Fill in and thicken rectangle if it's an @@ -189,7 +244,7 @@ pub fn main() -> Result<()> { println!( " {}", rectangle( - key_start as f32 + stretch * xmargin, + 5.0 + key_start as f32 + stretch * xmargin, stretch * (lsn_max as f32 - (lsn_end as f32 - ymargin - lsn_offset)), key_diff as f32 - stretch * 2.0 * xmargin, stretch * (lsn_diff - 2.0 * ymargin) @@ -200,6 +255,26 @@ pub fn main() -> Result<()> { .comment(filename) ); } + + for (lsn, kind) in lines { + let lsn_start = *lsn_map.get(&lsn).unwrap(); + let lsn_end = lsn_start; + let stretch = 2.0; + let lsn_diff = 0.3; + let lsn_offset = -lsn_diff / 2.0; + let ymargin = 0.05; + println!( + "{}", + rectangle( + 0.0f32 + stretch * xmargin, + 
stretch * (lsn_map.len() as f32 - (lsn_end as f32 - ymargin - lsn_offset)), + (key_map.len() + 10) as f32, + stretch * (lsn_diff - 2.0 * ymargin) + ) + .fill(kind) + ); + } + println!("{}", EndSvg); eprintln!("num_images: {}", num_images); From 6206f76419416c6c936c97df5e660d28333ee835 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 10 May 2024 17:46:50 +0300 Subject: [PATCH 0751/1571] build: run doctests (#7697) While switching to use nextest with the repository in f28bdb6, we had not noticed that it doesn't yet support running doctests. Run the doc tests before other tests. --- .github/workflows/build_and_test.yml | 3 +++ libs/utils/src/poison.rs | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index eada65505f..21e7a56670 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -341,6 +341,9 @@ jobs: env: NEXTEST_RETRIES: 3 run: | + #nextest does not yet support running doctests + cargo test --doc $CARGO_FLAGS $CARGO_FEATURES + for io_engine in std-fs tokio-epoll-uring ; do NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES done diff --git a/libs/utils/src/poison.rs b/libs/utils/src/poison.rs index 0bf5664f47..27378c69fc 100644 --- a/libs/utils/src/poison.rs +++ b/libs/utils/src/poison.rs @@ -3,7 +3,7 @@ //! # Example //! //! ``` -//! # tokio_test::block_on(async { +//! # tokio::runtime::Builder::new_current_thread().enable_all().build().unwrap().block_on(async { //! use utils::poison::Poison; //! use std::time::Duration; //! From d7c68dc981db2d73cb5ff617472266b29bbc2ace Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 10 May 2024 17:50:47 +0200 Subject: [PATCH 0752/1571] Tiered compaction: fix early exit check in main loop (#7702) The old test based on the immutable `target_file_size` that was a parameter to the function. It makes no sense to go further once `current_level_target_height` has reached `u64::MAX`, as lsn's are u64 typed. In practice, we should only run into this if there is a bug, as the practical lsn range usually ends much earlier. Testing on `target_file_size` makes less sense, it basically implements an invocation mode that turns off the looping and only runs one iteration of it. @hlinnaka agrees that `current_level_target_height` is better here. 
Part of #7554 --- pageserver/compaction/src/compact_tiered.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pageserver/compaction/src/compact_tiered.rs b/pageserver/compaction/src/compact_tiered.rs index 12882c9d59..20e9cf2196 100644 --- a/pageserver/compaction/src/compact_tiered.rs +++ b/pageserver/compaction/src/compact_tiered.rs @@ -106,7 +106,13 @@ pub async fn compact_tiered( ctx, ) .await?; - if target_file_size == u64::MAX { + if current_level_target_height == u64::MAX { + // our target height includes all possible lsns + info!( + level = current_level_no, + depth = depth, + "compaction loop reached max current_level_target_height" + ); break; } current_level_no += 1; From 95098c3216929506b66ce244a2420d07ab65e8dc Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Fri, 10 May 2024 17:20:14 +0100 Subject: [PATCH 0753/1571] Fix checkpoint metric (#7701) Split checkpoint_stats into two separate metrics: checkpoints_req and checkpoints_timed Fixes commit https://github.com/neondatabase/neon/commit/21e1a496a3f706097578de396a9107813c541001 --------- Co-authored-by: Peter Bendel --- vm-image-spec.yaml | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 56538630ac..e9d983eba3 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -278,15 +278,21 @@ files: ELSE GREATEST (0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())) END AS replication_delay_seconds; - - metric_name: checkpoint_stats + - metric_name: checkpoints_req type: gauge - help: 'Number of requested and scheduled checkpoints' + help: 'Number of requested checkpoints' key_labels: - values: - - checkpoints_req - - checkpoints_timed + values: [checkpoints_req] query: | - SELECT checkpoints_req, checkpoints_timed FROM pg_stat_bgwriter; + SELECT checkpoints_req FROM pg_stat_bgwriter; + + - metric_name: checkpoints_timed + type: gauge + help: 'Number of scheduled checkpoints' + key_labels: + values: [checkpoints_timed] + query: | + SELECT checkpoints_timed FROM pg_stat_bgwriter; - filename: neon_collector_autoscaling.yml content: | collector_name: neon_collector_autoscaling From 6351313ae96ab6d0e3e2b27ed2d86eed3dd004c9 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 10 May 2024 22:30:05 +0300 Subject: [PATCH 0754/1571] feat: allow detaching from ancestor for timelines without writes (#7639) The first implementation #7456 did not include `index_part.json` changes in an attempt to keep amount of changes down. Tracks the historic reparentings and earlier detach in `index_part.json`. 
- `index_part.json` receives a new field `lineage: Lineage` - `Lineage` is queried through RemoteTimelineClient during basebackup, creating `PREV LSN: none` for the invalid prev record lsn just as it would had been created for a newly created timeline - as `struct IndexPart` grew, it is now boxed in places Cc: #6994 --- pageserver/src/basebackup.rs | 2 +- pageserver/src/tenant/metadata.rs | 6 +- .../src/tenant/remote_timeline_client.rs | 28 +++- .../tenant/remote_timeline_client/index.rs | 140 +++++++++++++++++- pageserver/src/tenant/timeline.rs | 13 +- .../src/tenant/timeline/detach_ancestor.rs | 10 -- pageserver/src/tenant/upload_queue.rs | 8 +- s3_scrubber/src/checks.rs | 4 +- s3_scrubber/src/tenant_snapshot.rs | 2 +- .../regress/test_timeline_detach_ancestor.py | 83 +++++------ 10 files changed, 225 insertions(+), 71 deletions(-) diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 58b18dae7d..dca1510810 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -601,7 +601,7 @@ where // add zenith.signal file let mut zenith_signal = String::new(); if self.prev_record_lsn == Lsn(0) { - if self.lsn == self.timeline.get_ancestor_lsn() { + if self.timeline.is_ancestor_lsn(self.lsn) { write!(zenith_signal, "PREV LSN: none") .map_err(|e| BasebackupError::Server(e.into()))?; } else { diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 8ba0775120..fc71ea7642 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -214,12 +214,12 @@ impl TimelineMetadata { self.body.ancestor_timeline = Some(*timeline); } - pub fn detach_from_ancestor(&mut self, timeline: &TimelineId, ancestor_lsn: &Lsn) { + pub fn detach_from_ancestor(&mut self, branchpoint: &(TimelineId, Lsn)) { if let Some(ancestor) = self.body.ancestor_timeline { - assert_eq!(ancestor, *timeline); + assert_eq!(ancestor, branchpoint.0); } if self.body.ancestor_lsn != Lsn(0) { - assert_eq!(self.body.ancestor_lsn, *ancestor_lsn); + assert_eq!(self.body.ancestor_lsn, branchpoint.1); } self.body.ancestor_timeline = None; self.body.ancestor_lsn = Lsn(0); diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index bbe4e16378..9103760388 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -437,6 +437,19 @@ impl RemoteTimelineClient { } } + /// Returns true if this timeline was previously detached at this Lsn and the remote timeline + /// client is currently initialized. + pub(crate) fn is_previous_ancestor_lsn(&self, lsn: Lsn) -> bool { + // technically this is a dirty read, but given how timeline detach ancestor is implemented + // via tenant restart, the lineage has always been uploaded. 
+ self.upload_queue + .lock() + .unwrap() + .initialized_mut() + .map(|uq| uq.latest_lineage.is_previous_ancestor_lsn(lsn)) + .unwrap_or(false) + } + fn update_remote_physical_size_gauge(&self, current_remote_index_part: Option<&IndexPart>) { let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part { current_remote_index_part @@ -628,7 +641,7 @@ impl RemoteTimelineClient { ); let index_part = IndexPart::from(&*upload_queue); - let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn); + let op = UploadOp::UploadMetadata(Box::new(index_part), disk_consistent_lsn); self.metric_begin(&op); upload_queue.queued_operations.push_back(op); upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0; @@ -647,7 +660,14 @@ impl RemoteTimelineClient { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; + let Some(prev) = upload_queue.latest_metadata.ancestor_timeline() else { + return Err(anyhow::anyhow!( + "cannot reparent without a current ancestor" + )); + }; + upload_queue.latest_metadata.reparent(new_parent); + upload_queue.latest_lineage.record_previous_ancestor(&prev); self.schedule_index_upload(upload_queue); @@ -670,9 +690,8 @@ impl RemoteTimelineClient { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; - upload_queue - .latest_metadata - .detach_from_ancestor(&adopted.0, &adopted.1); + upload_queue.latest_metadata.detach_from_ancestor(&adopted); + upload_queue.latest_lineage.record_detaching(&adopted); for layer in layers { upload_queue @@ -1811,6 +1830,7 @@ impl RemoteTimelineClient { latest_files: initialized.latest_files.clone(), latest_files_changes_since_metadata_upload_scheduled: 0, latest_metadata: initialized.latest_metadata.clone(), + latest_lineage: initialized.latest_lineage.clone(), projected_remote_consistent_lsn: None, visible_remote_consistent_lsn: initialized .visible_remote_consistent_lsn diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 3e05905afa..b114d6aa10 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -6,6 +6,7 @@ use std::collections::HashMap; use chrono::NaiveDateTime; use serde::{Deserialize, Serialize}; +use utils::id::TimelineId; use crate::tenant::metadata::TimelineMetadata; use crate::tenant::storage_layer::LayerName; @@ -84,6 +85,9 @@ pub struct IndexPart { #[serde(rename = "metadata_bytes")] pub metadata: TimelineMetadata, + + #[serde(default)] + pub(crate) lineage: Lineage, } impl IndexPart { @@ -96,10 +100,11 @@ impl IndexPart { /// - 3: no longer deserialize `timeline_layers` (serialized format is the same, but timeline_layers /// is always generated from the keys of `layer_metadata`) /// - 4: timeline_layers is fully removed. - const LATEST_VERSION: usize = 4; + /// - 5: lineage was added + const LATEST_VERSION: usize = 5; // Versions we may see when reading from a bucket. 
- pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4]; + pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5]; pub const FILE_NAME: &'static str = "index_part.json"; @@ -107,6 +112,7 @@ impl IndexPart { layers_and_metadata: &HashMap, disk_consistent_lsn: Lsn, metadata: TimelineMetadata, + lineage: Lineage, ) -> Self { let layer_metadata = layers_and_metadata .iter() @@ -119,6 +125,7 @@ impl IndexPart { disk_consistent_lsn, metadata, deleted_at: None, + lineage, } } @@ -147,6 +154,7 @@ impl IndexPart { &HashMap::new(), example_metadata.disk_consistent_lsn(), example_metadata, + Default::default(), ) } } @@ -155,8 +163,9 @@ impl From<&UploadQueueInitialized> for IndexPart { fn from(uq: &UploadQueueInitialized) -> Self { let disk_consistent_lsn = uq.latest_metadata.disk_consistent_lsn(); let metadata = uq.latest_metadata.clone(); + let lineage = uq.latest_lineage.clone(); - Self::new(&uq.latest_files, disk_consistent_lsn, metadata) + Self::new(&uq.latest_files, disk_consistent_lsn, metadata, lineage) } } @@ -184,8 +193,76 @@ impl From<&LayerFileMetadata> for IndexLayerMetadata { } } +/// Limited history of earlier ancestors. +/// +/// A timeline can have more than 1 earlier ancestor, in the rare case that it was repeatedly +/// reparented by having an later timeline be detached from it's ancestor. +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)] +pub(crate) struct Lineage { + /// Has the `reparenting_history` been truncated to [`Lineage::REMEMBER_AT_MOST`]. + #[serde(skip_serializing_if = "is_false", default)] + reparenting_history_truncated: bool, + + /// Earlier ancestors, truncated when [`Self::reparenting_history_truncated`] + /// + /// These are stored in case we want to support WAL based DR on the timeline. There can be many + /// of these and at most one [`Self::original_ancestor`]. There cannot be more reparentings + /// after [`Self::original_ancestor`] has been set. + #[serde(skip_serializing_if = "Vec::is_empty", default)] + reparenting_history: Vec, + + /// The ancestor from which this timeline has been detached from and when. + /// + /// If you are adding support for detaching from a hierarchy, consider changing the ancestry + /// into a `Vec<(TimelineId, Lsn)>` to be a path instead. + #[serde(skip_serializing_if = "Option::is_none", default)] + original_ancestor: Option<(TimelineId, Lsn, NaiveDateTime)>, +} + +fn is_false(b: &bool) -> bool { + !b +} + +impl Lineage { + const REMEMBER_AT_MOST: usize = 100; + + pub(crate) fn record_previous_ancestor(&mut self, old_ancestor: &TimelineId) { + if self.reparenting_history.last() == Some(old_ancestor) { + // do not re-record it + return; + } + + let drop_oldest = self.reparenting_history.len() + 1 >= Self::REMEMBER_AT_MOST; + + self.reparenting_history_truncated |= drop_oldest; + if drop_oldest { + self.reparenting_history.remove(0); + } + self.reparenting_history.push(*old_ancestor); + } + + pub(crate) fn record_detaching(&mut self, branchpoint: &(TimelineId, Lsn)) { + assert!(self.original_ancestor.is_none()); + + self.original_ancestor = + Some((branchpoint.0, branchpoint.1, chrono::Utc::now().naive_utc())); + } + + /// The queried lsn is most likely the basebackup lsn, and this answers question "is it allowed + /// to start a read/write primary at this lsn". + /// + /// Returns true if the Lsn was previously a branch point. 
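+    /// This is an exact equality check against the recorded branch point; any other LSN yields false.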
+ pub(crate) fn is_previous_ancestor_lsn(&self, lsn: Lsn) -> bool { + self.original_ancestor + .as_ref() + .is_some_and(|(_, ancestor_lsn, _)| lsn == *ancestor_lsn) + } +} + #[cfg(test)] mod tests { + use std::str::FromStr; + use super::*; #[test] @@ -221,6 +298,7 @@ mod tests { disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: None, + lineage: Lineage::default(), }; let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); @@ -261,6 +339,7 @@ mod tests { disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: None, + lineage: Lineage::default(), }; let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); @@ -302,7 +381,8 @@ mod tests { disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: 
TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: Some(chrono::NaiveDateTime::parse_from_str( - "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()) + "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()), + lineage: Lineage::default(), }; let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); @@ -347,6 +427,7 @@ mod tests { ]) .unwrap(), deleted_at: None, + lineage: Lineage::default(), }; let empty_layers_parsed = IndexPart::from_s3_bytes(empty_layers_json.as_bytes()).unwrap(); @@ -385,11 +466,58 @@ mod tests { ]), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), - deleted_at: Some(chrono::NaiveDateTime::parse_from_str( - "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()), + deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), + lineage: Lineage::default(), }; let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); assert_eq!(part, expected); } + + #[test] + fn v5_indexpart_is_parsed() { + let example = r#"{ + "version":5, + "layer_metadata":{ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499":{"file_size":23289856,"generation":1}, + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619":{"file_size":1015808,"generation":1}}, + "disk_consistent_lsn":"0/15A7618", + 
"metadata_bytes":[226,88,25,241,0,46,0,4,0,0,0,0,1,90,118,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,78,244,32,0,0,0,0,1,78,244,32,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0], + "lineage":{ + "original_ancestor":["e2bfd8c633d713d279e6fcd2bcc15b6d","0/15A7618","2024-05-07T18:52:36.322426563"], + "reparenting_history":["e1bfd8c633d713d279e6fcd2bcc15b6d"] + } + }"#; + + let expected = IndexPart { + version: 5, + layer_metadata: HashMap::from([ + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499".parse().unwrap(), IndexLayerMetadata { + file_size: 23289856, + generation: Generation::new(1), + shard: ShardIndex::unsharded(), + }), + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619".parse().unwrap(), IndexLayerMetadata { + file_size: 1015808, + generation: Generation::new(1), + shard: ShardIndex::unsharded(), + }) + ]), + disk_consistent_lsn: Lsn::from_str("0/15A7618").unwrap(), + metadata: TimelineMetadata::from_bytes(&[226,88,25,241,0,46,0,4,0,0,0,0,1,90,118,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,78,244,32,0,0,0,0,1,78,244,32,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), + deleted_at: None, + lineage: Lineage { + reparenting_history_truncated: false, + reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()], + original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))), + }, + }; + + let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + assert_eq!(part, expected); + } + + fn parse_naive_datetime(s: &str) -> NaiveDateTime { + 
chrono::NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S.%f").unwrap() + } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 60b3873b71..505dc8c30d 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3037,6 +3037,18 @@ impl Timeline { Some(HeatMapTimeline::new(self.timeline_id, layers)) } + + /// Returns true if the given lsn is or was an ancestor branchpoint. + pub(crate) fn is_ancestor_lsn(&self, lsn: Lsn) -> bool { + // upon timeline detach, we set the ancestor_lsn to Lsn::INVALID and the store the original + // branchpoint in the value in IndexPart::lineage + self.ancestor_lsn == lsn + || (self.ancestor_lsn == Lsn::INVALID + && self + .remote_client + .as_ref() + .is_some_and(|rtc| rtc.is_previous_ancestor_lsn(lsn))) + } } type TraversalId = Arc; @@ -4354,7 +4366,6 @@ impl Timeline { /// - has an ancestor to detach from /// - the ancestor does not have an ancestor -- follows from the original RFC limitations, not /// a technical requirement - /// - has prev_lsn in remote storage (temporary restriction) /// /// After the operation has been started, it cannot be canceled. Upon restart it needs to be /// polled again until completion. diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 2641bf3d13..69b82344a6 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -22,8 +22,6 @@ pub(crate) enum Error { TooManyAncestors, #[error("shutting down, please retry later")] ShuttingDown, - #[error("detached timeline must receive writes before the operation")] - DetachedTimelineNeedsWrites, #[error("flushing failed")] FlushAncestor(#[source] anyhow::Error), #[error("layer download failed")] @@ -94,14 +92,6 @@ pub(super) async fn prepare( return Err(TooManyAncestors); } - if detached.get_prev_record_lsn() == Lsn::INVALID - || detached.disk_consistent_lsn.load() == ancestor_lsn - { - // this is to avoid a problem that after detaching we would be unable to start up the - // compute because of "PREV_LSN: invalid". - return Err(DetachedTimelineNeedsWrites); - } - // before we acquire the gate, we must mark the ancestor as having a detach operation // ongoing which will block other concurrent detach operations so we don't get to ackward // situations where there would be two branches trying to reparent earlier branches. diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index 7797117e0f..a2f761fa94 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -3,6 +3,7 @@ use super::storage_layer::ResidentLayer; use crate::tenant::metadata::TimelineMetadata; use crate::tenant::remote_timeline_client::index::IndexPart; use crate::tenant::remote_timeline_client::index::LayerFileMetadata; +use crate::tenant::remote_timeline_client::index::Lineage; use std::collections::{HashMap, VecDeque}; use std::fmt::Debug; @@ -56,6 +57,9 @@ pub(crate) struct UploadQueueInitialized { /// DANGER: do not return to outside world, e.g., safekeepers. pub(crate) latest_metadata: TimelineMetadata, + /// Part of the flattened "next" `index_part.json`. + pub(crate) latest_lineage: Lineage, + /// `disk_consistent_lsn` from the last metadata file that was successfully /// uploaded. `Lsn(0)` if nothing was uploaded yet. /// Unlike `latest_files` or `latest_metadata`, this value is never ahead. 
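The upload queue now carries a `latest_lineage` alongside `latest_metadata`, and the reparenting history inside it is deliberately bounded so the index stays small even after many reparent cycles. A simplified, self-contained sketch of that bounded-history bookkeeping (illustrative types and a cleaned-up cap check, not the exact `Lineage::record_previous_ancestor` code):

    /// Keep at most `CAP` previous ancestors, remember that older entries were
    /// dropped, and avoid recording the same ancestor twice in a row.
    #[derive(Default)]
    struct ReparentingHistory {
        truncated: bool,
        entries: Vec<u128>, // stand-in for TimelineId
    }

    impl ReparentingHistory {
        const CAP: usize = 100;

        fn record(&mut self, old_ancestor: u128) {
            if self.entries.last() == Some(&old_ancestor) {
                return; // idempotent for repeated calls with the same ancestor
            }
            if self.entries.len() == Self::CAP {
                self.entries.remove(0);
                self.truncated = true;
            }
            self.entries.push(old_ancestor);
        }
    }

    fn main() {
        let mut h = ReparentingHistory::default();
        for i in 0..150u128 {
            h.record(i);
        }
        assert_eq!(h.entries.len(), ReparentingHistory::CAP);
        assert!(h.truncated);
    }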
@@ -171,6 +175,7 @@ impl UploadQueue { latest_files: HashMap::new(), latest_files_changes_since_metadata_upload_scheduled: 0, latest_metadata: metadata.clone(), + latest_lineage: Lineage::default(), projected_remote_consistent_lsn: None, visible_remote_consistent_lsn: Arc::new(AtomicLsn::new(0)), // what follows are boring default initializations @@ -218,6 +223,7 @@ impl UploadQueue { latest_files: files, latest_files_changes_since_metadata_upload_scheduled: 0, latest_metadata: index_part.metadata.clone(), + latest_lineage: index_part.lineage.clone(), projected_remote_consistent_lsn: Some(index_part.metadata.disk_consistent_lsn()), visible_remote_consistent_lsn: Arc::new( index_part.metadata.disk_consistent_lsn().into(), @@ -290,7 +296,7 @@ pub(crate) enum UploadOp { UploadLayer(ResidentLayer, LayerFileMetadata), /// Upload the metadata file - UploadMetadata(IndexPart, Lsn), + UploadMetadata(Box, Lsn), /// Delete layer files Delete(Delete), diff --git a/s3_scrubber/src/checks.rs b/s3_scrubber/src/checks.rs index 68133fc0a9..dd64a0a98f 100644 --- a/s3_scrubber/src/checks.rs +++ b/s3_scrubber/src/checks.rs @@ -246,7 +246,7 @@ pub(crate) struct S3TimelineBlobData { #[derive(Debug)] pub(crate) enum BlobDataParseResult { Parsed { - index_part: IndexPart, + index_part: Box, index_part_generation: Generation, s3_layers: HashSet<(LayerName, Generation)>, }, @@ -368,7 +368,7 @@ pub(crate) async fn list_timeline_blobs( Ok(index_part) => { return Ok(S3TimelineBlobData { blob_data: BlobDataParseResult::Parsed { - index_part, + index_part: Box::new(index_part), index_part_generation, s3_layers, }, diff --git a/s3_scrubber/src/tenant_snapshot.rs b/s3_scrubber/src/tenant_snapshot.rs index 2c93a8490a..a24a1e92ae 100644 --- a/s3_scrubber/src/tenant_snapshot.rs +++ b/s3_scrubber/src/tenant_snapshot.rs @@ -159,7 +159,7 @@ impl SnapshotDownloader { async fn download_timeline( &self, ttid: TenantShardTimelineId, - index_part: IndexPart, + index_part: Box, index_part_generation: Generation, ancestor_layers: &mut HashMap< TenantShardTimelineId, diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index b8a88ca6df..075f0a6bbc 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -1,3 +1,4 @@ +import datetime import enum from concurrent.futures import ThreadPoolExecutor from queue import Empty, Queue @@ -12,6 +13,7 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.http import HistoricLayerInfo from fixtures.pageserver.utils import wait_timeline_detail_404 +from fixtures.remote_storage import LocalFsStorage from fixtures.types import Lsn, TimelineId @@ -56,15 +58,16 @@ SHUTDOWN_ALLOWED_ERRORS = [ @pytest.mark.parametrize("branchpoint", Branchpoint.all()) @pytest.mark.parametrize("restart_after", [True, False]) +@pytest.mark.parametrize("write_to_branch_first", [True, False]) def test_ancestor_detach_branched_from( - neon_env_builder: NeonEnvBuilder, branchpoint: Branchpoint, restart_after: bool + neon_env_builder: NeonEnvBuilder, + branchpoint: Branchpoint, + restart_after: bool, + write_to_branch_first: bool, ): """ Creates a branch relative to L0 lsn boundary according to Branchpoint. Later the timeline is detached. 
""" - # TODO: parametrize; currently unimplemented over at pageserver - write_to_branch_first = True - env = neon_env_builder.init_start() env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) @@ -174,8 +177,7 @@ def test_ancestor_detach_branched_from( wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline, 10, 1.0) -@pytest.mark.parametrize("restart_after", [True, False]) -def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder, restart_after: bool): +def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder): """ The case from RFC: @@ -204,9 +206,6 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder, res We confirm the end result by being able to delete "old main" after deleting "after". """ - # TODO: support not yet implemented for these - write_to_branch_first = True - env = neon_env_builder.init_start() env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) @@ -244,42 +243,57 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder, res after = env.neon_cli.create_branch("after", "main", env.initial_tenant, ancestor_start_lsn=None) - if write_to_branch_first: - with env.endpoints.create_start("new main", tenant_id=env.initial_tenant) as ep: - assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == 8192 - with ep.cursor() as cur: - cur.execute("UPDATE audit SET starts = starts + 1") - assert cur.rowcount == 1 - wait_for_last_flush_lsn(env, ep, env.initial_tenant, timeline_id) - - client.timeline_checkpoint(env.initial_tenant, timeline_id) - all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id) assert all_reparented == {reparented, same_branchpoint} - if restart_after: - env.pageserver.stop() - env.pageserver.start() - env.pageserver.quiesce_tenants() # checking the ancestor after is much faster than waiting for the endpoint not start expected_result = [ ("main", env.initial_timeline, None, 16384, 1), ("after", after, env.initial_timeline, 16384, 1), - ("new main", timeline_id, None, 8192, 2), + ("new main", timeline_id, None, 8192, 1), ("same_branchpoint", same_branchpoint, timeline_id, 8192, 1), ("reparented", reparented, timeline_id, 0, 1), ] - for _, timeline_id, expected_ancestor, _, _ in expected_result: - details = client.timeline_detail(env.initial_tenant, timeline_id) + assert isinstance(env.pageserver_remote_storage, LocalFsStorage) + + for _, queried_timeline, expected_ancestor, _, _ in expected_result: + details = client.timeline_detail(env.initial_tenant, queried_timeline) ancestor_timeline_id = details["ancestor_timeline_id"] if expected_ancestor is None: assert ancestor_timeline_id is None else: assert TimelineId(ancestor_timeline_id) == expected_ancestor + index_part = env.pageserver_remote_storage.index_content( + env.initial_tenant, queried_timeline + ) + lineage = index_part["lineage"] + assert lineage is not None + + assert lineage.get("reparenting_history_overflown", "false") == "false" + + if queried_timeline == timeline_id: + original_ancestor = lineage["original_ancestor"] + assert original_ancestor is not None + assert original_ancestor[0] == str(env.initial_timeline) + assert original_ancestor[1] == str(branchpoint_x) + + # this does not contain Z in the end, so fromisoformat accepts it + # it is to be in line with the deletion timestamp.. well, almost. 
+ when = original_ancestor[2][:26] + when_ts = datetime.datetime.fromisoformat(when) + assert when_ts < datetime.datetime.now() + assert len(lineage.get("reparenting_history", [])) == 0 + elif expected_ancestor == timeline_id: + assert len(lineage.get("original_ancestor", [])) == 0 + assert lineage["reparenting_history"] == [str(env.initial_timeline)] + else: + assert len(lineage.get("original_ancestor", [])) == 0 + assert len(lineage.get("reparenting_history", [])) == 0 + for name, _, _, rows, starts in expected_result: with env.endpoints.create_start(name, tenant_id=env.initial_tenant) as ep: assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows @@ -293,14 +307,10 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder, res wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline, 10, 1.0) -@pytest.mark.parametrize("restart_after", [True, False]) -def test_detached_receives_flushes_while_being_detached( - neon_env_builder: NeonEnvBuilder, restart_after: bool -): +def test_detached_receives_flushes_while_being_detached(neon_env_builder: NeonEnvBuilder): """ Makes sure that the timeline is able to receive writes through-out the detach process. """ - write_to_branch_first = True env = neon_env_builder.init_start() @@ -330,12 +340,6 @@ def test_detached_receives_flushes_while_being_detached( ep = env.endpoints.create_start("new main", tenant_id=env.initial_tenant) assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows - if write_to_branch_first: - rows += insert_rows(256, ep) - wait_for_last_flush_lsn(env, ep, env.initial_tenant, timeline_id) - client.timeline_checkpoint(env.initial_tenant, timeline_id) - log.info("completed {write_to_branch_first=}") - def small_txs(ep, queue: Queue[str], barrier): extra_rows = 0 @@ -368,11 +372,6 @@ def test_detached_receives_flushes_while_being_detached( reparented = client.detach_ancestor(env.initial_tenant, timeline_id) assert len(reparented) == 0 - if restart_after: - # ep and row production is kept alive on purpose - env.pageserver.stop() - env.pageserver.start() - env.pageserver.quiesce_tenants() queue.put("done") From 4270e86eb282367fedebe87a5a363619b466eece Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 13 May 2024 10:58:03 +0300 Subject: [PATCH 0755/1571] test(ancestor detach): verify with fullbackup (#7706) In timeline detach ancestor tests there is no way to really be sure that there were no subtle off-by one bugs. One such bug is demoed and reverted. Add verifying fullbackup is equal before and after detaching ancestor. Fullbackup is expected to be equal apart from `zenith.signal`, which is known to be good because endpoint can be started without the detached branch receiving writes. 
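The equality check described here is implemented below as a Python `tar_cmp` helper; conceptually it hashes every regular member of both tarballs, skips `zenith.signal`, and compares the resulting maps. A rough equivalent sketch in Rust, purely for illustration (assumes the `tar`, `sha2` and `anyhow` crates; the file names are placeholders, not part of the patch):

    use std::collections::BTreeMap;
    use std::fs::File;
    use std::io::Read;

    use sha2::{Digest, Sha256};

    /// Hash every regular file in a tar archive, skipping the given names.
    /// Returns name -> sha256 so two archives can be compared for equality.
    fn hash_tar(path: &str, skip: &[&str]) -> anyhow::Result<BTreeMap<String, Vec<u8>>> {
        let mut out = BTreeMap::new();
        let mut archive = tar::Archive::new(File::open(path)?);
        for entry in archive.entries()? {
            let mut entry = entry?;
            if !entry.header().entry_type().is_file() {
                continue;
            }
            let name = entry.path()?.to_string_lossy().into_owned();
            if skip.iter().any(|s| name.ends_with(s)) {
                continue;
            }
            let mut hasher = Sha256::new();
            let mut buf = [0u8; 64 * 1024];
            loop {
                let n = entry.read(&mut buf)?;
                if n == 0 {
                    break;
                }
                hasher.update(&buf[..n]);
            }
            out.insert(name, hasher.finalize().to_vec());
        }
        Ok(out)
    }

    fn main() -> anyhow::Result<()> {
        let skip = ["zenith.signal"]; // differs by construction, see commit message
        assert_eq!(
            hash_tar("fullbackup-before.tar", &skip)?,
            hash_tar("fullbackup-after.tar", &skip)?
        );
        Ok(())
    }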
--- .../regress/test_timeline_detach_ancestor.py | 89 ++++++++++++++++++- 1 file changed, 88 insertions(+), 1 deletion(-) diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 075f0a6bbc..214e10c32e 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -1,14 +1,19 @@ import datetime import enum +import tarfile +import time from concurrent.futures import ThreadPoolExecutor +from hashlib import sha256 +from pathlib import Path from queue import Empty, Queue from threading import Barrier -from typing import List +from typing import IO, List, Set, Tuple, Union import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, + PgBin, wait_for_last_flush_lsn, ) from fixtures.pageserver.http import HistoricLayerInfo @@ -60,7 +65,10 @@ SHUTDOWN_ALLOWED_ERRORS = [ @pytest.mark.parametrize("restart_after", [True, False]) @pytest.mark.parametrize("write_to_branch_first", [True, False]) def test_ancestor_detach_branched_from( + test_output_dir, + pg_distrib_dir, neon_env_builder: NeonEnvBuilder, + pg_bin: PgBin, branchpoint: Branchpoint, restart_after: bool, write_to_branch_first: bool, @@ -70,6 +78,7 @@ def test_ancestor_detach_branched_from( """ env = neon_env_builder.init_start() + psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")} env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) client = env.pageserver.http_client() @@ -146,6 +155,20 @@ def test_ancestor_detach_branched_from( else: branch_layers = set() + # run fullbackup to make sure there are no off by one errors + # take this on the parent + fullbackup_before = test_output_dir / "fullbackup-before.tar" + cmd = [ + "psql", + "--no-psqlrc", + env.pageserver.connstr(), + "-c", + f"fullbackup {env.initial_tenant} {env.initial_timeline} {branch_at}", + "-o", + str(fullbackup_before), + ] + pg_bin.run_capture(cmd, env=psql_env) + all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id) assert all_reparented == set() @@ -173,9 +196,73 @@ def test_ancestor_detach_branched_from( # but if nothing was copied, then there is no nice rule. # there could be a hole in LSNs between copied from the "old main" and the first branch layer. + # take this on the detached, at same lsn + fullbackup_after = test_output_dir / "fullbackup-after.tar" + cmd = [ + "psql", + "--no-psqlrc", + env.pageserver.connstr(), + "-c", + f"fullbackup {env.initial_tenant} {timeline_id} {branch_at}", + "-o", + str(fullbackup_after), + ] + pg_bin.run_capture(cmd, env=psql_env) + client.timeline_delete(env.initial_tenant, env.initial_timeline) wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline, 10, 1.0) + # because we do the fullbackup from ancestor at the branch_lsn, the zenith.signal is always different + # as there is always "PREV_LSN: invalid" for "before" + skip_files = {"zenith.signal"} + + tar_cmp(fullbackup_before, fullbackup_after, skip_files) + + +def tar_cmp(left: Path, right: Path, skip_files: Set[str]): + """ + This is essentially: + + lines=$(comm -3 \ + <(mkdir left && cd left && tar xf "$left" && find . -type f -print0 | xargs sha256sum | sort -k2) \ + <(mkdir right && cd right && tar xf "$right" && find . -type f -print0 | xargs sha256sum | sort -k2) \ + | wc -l) + [ "$lines" = "0" ] + + But in a more mac friendly fashion. 
+ """ + started_at = time.time() + + def hash_extracted(reader: Union[IO[bytes], None]) -> bytes: + assert reader is not None + digest = sha256(usedforsecurity=False) + while True: + buf = reader.read(64 * 1024) + if not buf: + break + digest.update(buf) + return digest.digest() + + def build_hash_list(p: Path) -> List[Tuple[str, bytes]]: + with tarfile.open(p) as f: + matching_files = (info for info in f if info.isreg() and info.name not in skip_files) + ret = list( + map(lambda info: (info.name, hash_extracted(f.extractfile(info))), matching_files) + ) + ret.sort(key=lambda t: t[0]) + return ret + + left_list, right_list = map(build_hash_list, [left, right]) + + try: + assert len(left_list) == len(right_list) + + for left_tuple, right_tuple in zip(left_list, right_list): + assert left_tuple == right_tuple + finally: + elapsed = time.time() - started_at + log.info(f"tar_cmp completed in {elapsed}s") + def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder): """ From 216fc5ba7beb48ac7ae45aa007f72f1a3c0f62e0 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 13 May 2024 11:56:07 +0300 Subject: [PATCH 0756/1571] test: fix confusing limit and logging (#7589) The test has been flaky since 2024-04-11 for unknown reason, and the logging was off. Fix the logging and raise the limit a bit. The problematic ratio reproduces with pg14 and added sleep (not included) but not on pg15. The new ratio abs diff limit works for all inspected examples. Cc: #7536 --- test_runner/regress/test_disk_usage_eviction.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 5e9efa7cce..1d73e9cb18 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -623,15 +623,16 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder): ratio = count_now / original_count abs_diff = abs(ratio - expected_ratio) assert original_count > count_now - log.info( - f"tenant {tenant_id} layer count {original_count} -> {count_now}, ratio: {ratio}, expecting {abs_diff} < 0.1" - ) + expectation = 0.06 + log.info( + f"tenant {tenant_id} layer count {original_count} -> {count_now}, ratio: {ratio}, expecting {abs_diff} < {expectation}" + ) # in this test case both relative_spare and relative_equal produce # the same outcomes; this must be a quantization effect of similar # sizes (-s4 and -s6) and small (5MB) layer size. # for pg15 and pg16 the absdiff is < 0.01, for pg14 it is closer to 0.02 - assert abs_diff < 0.05 + assert abs_diff < expectation @pytest.mark.parametrize( From 1a1d52787579a617028c66f8a9f41b46641a1035 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 13 May 2024 12:21:49 +0300 Subject: [PATCH 0757/1571] test: allow vectored get validation failure during shutdown (#7716) Per [evidence] the timeline ancestor detach tests can panic while shutting down on vectored get validation. Allow the error because tenant is restarted twice in the test. 
[evidence]: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-7708/9058185709/index.html#suites/a1c2be32556270764423c495fad75d47/d444f7e5c0a18ce9 --- test_runner/regress/test_timeline_detach_ancestor.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 214e10c32e..1e961a4b2f 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -58,6 +58,8 @@ class Branchpoint(str, enum.Enum): SHUTDOWN_ALLOWED_ERRORS = [ ".*initial size calculation failed: downloading failed, possibly for shutdown", ".*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ".*logical_size_calculation_task:panic.*: Sequential get failed with Bad state \\(not active\\).*", + ".*Task 'initial size calculation' .* panicked.*", ] From b58a615197374da349525da843b04849c47d610f Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 13 May 2024 11:22:10 +0200 Subject: [PATCH 0758/1571] chore(pageserver): plumb through RequestContext to VirtualFile read methods (#7720) This PR introduces no functional changes. The `open()` path will be done separately. refs https://github.com/neondatabase/neon/issues/6107 refs https://github.com/neondatabase/neon/issues/7386 --- pageserver/src/tenant/block_io.rs | 7 +- .../src/tenant/ephemeral_file/page_caching.rs | 2 +- .../src/tenant/storage_layer/delta_layer.rs | 9 +- .../src/tenant/storage_layer/image_layer.rs | 5 +- pageserver/src/tenant/vectored_blob_io.rs | 4 +- pageserver/src/virtual_file.rs | 98 +++++++++++++------ 6 files changed, 82 insertions(+), 43 deletions(-) diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index 37c84be342..92928116c1 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -102,7 +102,7 @@ impl<'a> BlockReaderRef<'a> { #[cfg(test)] TestDisk(r) => r.read_blk(blknum), #[cfg(test)] - VirtualFile(r) => r.read_blk(blknum).await, + VirtualFile(r) => r.read_blk(blknum, ctx).await, } } } @@ -177,10 +177,11 @@ impl<'a> FileBlockReader<'a> { &self, buf: PageWriteGuard<'static>, blkno: u32, + ctx: &RequestContext, ) -> Result, std::io::Error> { assert!(buf.len() == PAGE_SZ); self.file - .read_exact_at_page(buf, blkno as u64 * PAGE_SZ as u64) + .read_exact_at_page(buf, blkno as u64 * PAGE_SZ as u64, ctx) .await } /// Read a block. 
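The hunks in this patch only thread a `&RequestContext` through the read methods; the buffer-filling behaviour itself is unchanged. For reference, the "read exactly this many bytes at an offset" loop that such helpers implement looks roughly like the following synchronous, Unix-only sketch (std only, not the actual `VirtualFile` code):

    use std::fs::File;
    use std::io::{Error, ErrorKind};
    use std::os::unix::fs::FileExt;

    /// Keep issuing positional reads until the buffer is full, failing on early EOF.
    fn read_exact_at_sketch(file: &File, mut offset: u64, buf: &mut [u8]) -> std::io::Result<()> {
        let mut filled = 0;
        while filled < buf.len() {
            match file.read_at(&mut buf[filled..], offset)? {
                0 => return Err(Error::new(ErrorKind::UnexpectedEof, "short read")),
                n => {
                    filled += n;
                    offset += n as u64;
                }
            }
        }
        Ok(())
    }

    fn main() -> std::io::Result<()> {
        std::fs::write("demo.bin", b"hello virtual file")?;
        let f = File::open("demo.bin")?;
        let mut buf = [0u8; 5];
        read_exact_at_sketch(&f, 6, &mut buf)?;
        assert_eq!(&buf, b"virtu");
        Ok(())
    }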
@@ -206,7 +207,7 @@ impl<'a> FileBlockReader<'a> { ReadBufResult::Found(guard) => Ok(guard.into()), ReadBufResult::NotFound(write_guard) => { // Read the page from disk into the buffer - let write_guard = self.fill_buffer(write_guard, blknum).await?; + let write_guard = self.fill_buffer(write_guard, blknum, ctx).await?; Ok(write_guard.mark_valid().into()) } } diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs index 42def8858e..276ac87064 100644 --- a/pageserver/src/tenant/ephemeral_file/page_caching.rs +++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs @@ -78,7 +78,7 @@ impl RW { page_cache::ReadBufResult::NotFound(write_guard) => { let write_guard = writer .file - .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64) + .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64, ctx) .await?; let read_guard = write_guard.mark_valid(); return Ok(BlockLease::PageReadGuard(read_guard)); diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index c38c9bb656..4f30cf2e84 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -908,7 +908,7 @@ impl DeltaLayerInner { .await .map_err(GetVectoredError::Other)?; - self.do_reads_and_update_state(reads, reconstruct_state) + self.do_reads_and_update_state(reads, reconstruct_state, ctx) .await; reconstruct_state.on_lsn_advanced(&keyspace, self.lsn_range.start); @@ -1012,6 +1012,7 @@ impl DeltaLayerInner { &self, reads: Vec, reconstruct_state: &mut ValuesReconstructState, + ctx: &RequestContext, ) { let vectored_blob_reader = VectoredBlobReader::new(&self.file); let mut ignore_key_with_err = None; @@ -1029,7 +1030,7 @@ impl DeltaLayerInner { // track when a key is done. 
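The vectored read path coalesces many small blob reads into one large positional read and then hands each caller a sub-slice of the shared buffer. A minimal sketch of that slicing step (illustrative types, not the real `VectoredBlobReader`):

    /// One coalesced read covers several blobs; each blob is then handed out as a
    /// sub-slice of the shared buffer. `start`/`end` are offsets within the read.
    struct BlobMeta {
        key: &'static str,
        start: usize,
        end: usize,
    }

    fn slice_blobs<'a>(buf: &'a [u8], blobs: &[BlobMeta]) -> Vec<(&'static str, &'a [u8])> {
        blobs
            .iter()
            .map(|b| (b.key, &buf[b.start..b.end]))
            .collect()
    }

    fn main() {
        // Pretend this buffer came back from a single read_exact_at_n-style call.
        let buf = b"firstsecond";
        let blobs = [
            BlobMeta { key: "a", start: 0, end: 5 },
            BlobMeta { key: "b", start: 5, end: 11 },
        ];
        let sliced = slice_blobs(buf, &blobs);
        assert_eq!(sliced[0].1, b"first");
        assert_eq!(sliced[1].1, b"second");
    }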
for read in reads.into_iter().rev() { let res = vectored_blob_reader - .read_blobs(&read, buf.take().expect("Should have a buffer")) + .read_blobs(&read, buf.take().expect("Should have a buffer"), ctx) .await; let blobs_buf = match res { @@ -1274,7 +1275,7 @@ impl DeltaLayerInner { buf.clear(); buf.reserve(read.size()); - let res = reader.read_blobs(&read, buf).await?; + let res = reader.read_blobs(&read, buf, ctx).await?; for blob in res.blobs { let key = blob.meta.key; @@ -1848,7 +1849,7 @@ mod test { for read in vectored_reads { let blobs_buf = vectored_blob_reader - .read_blobs(&read, buf.take().expect("Should have a buffer")) + .read_blobs(&read, buf.take().expect("Should have a buffer"), &ctx) .await?; for meta in blobs_buf.blobs.iter() { let value = &blobs_buf.buf[meta.start..meta.end]; diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index c9874873e4..72d1f36cab 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -474,7 +474,7 @@ impl ImageLayerInner { .await .map_err(GetVectoredError::Other)?; - self.do_reads_and_update_state(reads, reconstruct_state) + self.do_reads_and_update_state(reads, reconstruct_state, ctx) .await; Ok(()) @@ -537,6 +537,7 @@ impl ImageLayerInner { &self, reads: Vec, reconstruct_state: &mut ValuesReconstructState, + ctx: &RequestContext, ) { let max_vectored_read_bytes = self .max_vectored_read_bytes @@ -565,7 +566,7 @@ impl ImageLayerInner { } let buf = BytesMut::with_capacity(buf_size); - let res = vectored_blob_reader.read_blobs(&read, buf).await; + let res = vectored_blob_reader.read_blobs(&read, buf, ctx).await; match res { Ok(blobs_buf) => { diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 91934d5e0e..6e825760e3 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -23,6 +23,7 @@ use pageserver_api::key::Key; use utils::lsn::Lsn; use utils::vec_map::VecMap; +use crate::context::RequestContext; use crate::virtual_file::VirtualFile; #[derive(Copy, Clone, Debug, PartialEq, Eq)] @@ -285,6 +286,7 @@ impl<'a> VectoredBlobReader<'a> { &self, read: &VectoredRead, buf: BytesMut, + ctx: &RequestContext, ) -> Result { assert!(read.size() > 0); assert!( @@ -295,7 +297,7 @@ impl<'a> VectoredBlobReader<'a> { ); let buf = self .file - .read_exact_at_n(buf, read.start, read.size()) + .read_exact_at_n(buf, read.start, read.size(), ctx) .await?; let blobs_at = read.blobs_at.as_slice(); diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index a17488a286..8dee73891b 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -576,21 +576,34 @@ impl VirtualFile { Ok(self.pos) } - pub async fn read_exact_at(&self, buf: B, offset: u64) -> Result + pub async fn read_exact_at( + &self, + buf: B, + offset: u64, + ctx: &RequestContext, + ) -> Result where B: IoBufMut + Send, { - let (buf, res) = - read_exact_at_impl(buf, offset, None, |buf, offset| self.read_at(buf, offset)).await; + let (buf, res) = read_exact_at_impl(buf, offset, None, |buf, offset| { + self.read_at(buf, offset, ctx) + }) + .await; res.map(|()| buf) } - pub async fn read_exact_at_n(&self, buf: B, offset: u64, count: usize) -> Result + pub async fn read_exact_at_n( + &self, + buf: B, + offset: u64, + count: usize, + ctx: &RequestContext, + ) -> Result where B: IoBufMut + Send, { let (buf, res) = read_exact_at_impl(buf, 
offset, Some(count), |buf, offset| { - self.read_at(buf, offset) + self.read_at(buf, offset, ctx) }) .await; res.map(|()| buf) @@ -601,12 +614,13 @@ impl VirtualFile { &self, page: PageWriteGuard<'static>, offset: u64, + ctx: &RequestContext, ) -> Result, Error> { let buf = PageWriteGuardBuf { page, init_up_to: 0, }; - let res = self.read_exact_at(buf, offset).await; + let res = self.read_exact_at(buf, offset, ctx).await; res.map(|PageWriteGuardBuf { page, .. }| page) .map_err(|e| Error::new(ErrorKind::Other, e)) } @@ -699,7 +713,12 @@ impl VirtualFile { (buf, Ok(n)) } - pub(crate) async fn read_at(&self, buf: B, offset: u64) -> (B, Result) + pub(crate) async fn read_at( + &self, + buf: B, + offset: u64, + _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */ + ) -> (B, Result) where B: tokio_epoll_uring::BoundedBufMut + Send, { @@ -1020,20 +1039,21 @@ impl VirtualFile { pub(crate) async fn read_blk( &self, blknum: u32, + ctx: &RequestContext, ) -> Result, std::io::Error> { use crate::page_cache::PAGE_SZ; let buf = vec![0; PAGE_SZ]; let buf = self - .read_exact_at(buf, blknum as u64 * (PAGE_SZ as u64)) + .read_exact_at(buf, blknum as u64 * (PAGE_SZ as u64), ctx) .await?; Ok(crate::tenant::block_io::BlockLease::Vec(buf)) } - async fn read_to_end(&mut self, buf: &mut Vec) -> Result<(), Error> { + async fn read_to_end(&mut self, buf: &mut Vec, ctx: &RequestContext) -> Result<(), Error> { let mut tmp = vec![0; 128]; loop { let res; - (tmp, res) = self.read_at(tmp, self.pos).await; + (tmp, res) = self.read_at(tmp, self.pos, ctx).await; match res { Ok(0) => return Ok(()), Ok(n) => { @@ -1176,9 +1196,14 @@ mod tests { } impl MaybeVirtualFile { - async fn read_exact_at(&self, mut buf: Vec, offset: u64) -> Result, Error> { + async fn read_exact_at( + &self, + mut buf: Vec, + offset: u64, + ctx: &RequestContext, + ) -> Result, Error> { match self { - MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(buf, offset).await, + MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(buf, offset, ctx).await, MaybeVirtualFile::File(file) => file.read_exact_at(&mut buf, offset).map(|()| buf), } } @@ -1230,13 +1255,13 @@ mod tests { // Helper function to slurp contents of a file, starting at the current position, // into a string - async fn read_string(&mut self) -> Result { + async fn read_string(&mut self, ctx: &RequestContext) -> Result { use std::io::Read; let mut buf = String::new(); match self { MaybeVirtualFile::VirtualFile(file) => { let mut buf = Vec::new(); - file.read_to_end(&mut buf).await?; + file.read_to_end(&mut buf, ctx).await?; return Ok(String::from_utf8(buf).unwrap()); } MaybeVirtualFile::File(file) => { @@ -1247,9 +1272,14 @@ mod tests { } // Helper function to slurp a portion of a file into a string - async fn read_string_at(&mut self, pos: u64, len: usize) -> Result { + async fn read_string_at( + &mut self, + pos: u64, + len: usize, + ctx: &RequestContext, + ) -> Result { let buf = vec![0; len]; - let buf = self.read_exact_at(buf, pos).await?; + let buf = self.read_exact_at(buf, pos, ctx).await?; Ok(String::from_utf8(buf).unwrap()) } } @@ -1303,7 +1333,7 @@ mod tests { file_a.write_all(b"foobar".to_vec(), &ctx).await?; // cannot read from a file opened in write-only mode - let _ = file_a.read_string().await.unwrap_err(); + let _ = file_a.read_string(&ctx).await.unwrap_err(); // Close the file and re-open for reading let mut file_a = openfunc(path_a, OpenOptions::new().read(true).to_owned()).await?; @@ -1312,24 +1342,24 @@ mod tests 
{ let _ = file_a.write_all(b"bar".to_vec(), &ctx).await.unwrap_err(); // Try simple read - assert_eq!("foobar", file_a.read_string().await?); + assert_eq!("foobar", file_a.read_string(&ctx).await?); // It's positioned at the EOF now. - assert_eq!("", file_a.read_string().await?); + assert_eq!("", file_a.read_string(&ctx).await?); // Test seeks. assert_eq!(file_a.seek(SeekFrom::Start(1)).await?, 1); - assert_eq!("oobar", file_a.read_string().await?); + assert_eq!("oobar", file_a.read_string(&ctx).await?); assert_eq!(file_a.seek(SeekFrom::End(-2)).await?, 4); - assert_eq!("ar", file_a.read_string().await?); + assert_eq!("ar", file_a.read_string(&ctx).await?); assert_eq!(file_a.seek(SeekFrom::Start(1)).await?, 1); assert_eq!(file_a.seek(SeekFrom::Current(2)).await?, 3); - assert_eq!("bar", file_a.read_string().await?); + assert_eq!("bar", file_a.read_string(&ctx).await?); assert_eq!(file_a.seek(SeekFrom::Current(-5)).await?, 1); - assert_eq!("oobar", file_a.read_string().await?); + assert_eq!("oobar", file_a.read_string(&ctx).await?); // Test erroneous seeks to before byte 0 file_a.seek(SeekFrom::End(-7)).await.unwrap_err(); @@ -1337,7 +1367,7 @@ mod tests { file_a.seek(SeekFrom::Current(-2)).await.unwrap_err(); // the erroneous seek should have left the position unchanged - assert_eq!("oobar", file_a.read_string().await?); + assert_eq!("oobar", file_a.read_string(&ctx).await?); // Create another test file, and try FileExt functions on it. let path_b = testdir.join("file_b"); @@ -1354,7 +1384,7 @@ mod tests { file_b.write_all_at(b"BAR".to_vec(), 3, &ctx).await?; file_b.write_all_at(b"FOO".to_vec(), 0, &ctx).await?; - assert_eq!(file_b.read_string_at(2, 3).await?, "OBA"); + assert_eq!(file_b.read_string_at(2, 3, &ctx).await?, "OBA"); // Open a lot of files, enough to cause some evictions. (Or to be precise, // open the same file many times. The effect is the same.) @@ -1366,7 +1396,7 @@ mod tests { for _ in 0..100 { let mut vfile = openfunc(path_b.clone(), OpenOptions::new().read(true).to_owned()).await?; - assert_eq!("FOOBAR", vfile.read_string().await?); + assert_eq!("FOOBAR", vfile.read_string(&ctx).await?); vfiles.push(vfile); } @@ -1375,13 +1405,13 @@ mod tests { // The underlying file descriptor for 'file_a' should be closed now. Try to read // from it again. We left the file positioned at offset 1 above. - assert_eq!("oobar", file_a.read_string().await?); + assert_eq!("oobar", file_a.read_string(&ctx).await?); // Check that all the other FDs still work too. Use them in random order for // good measure. 
vfiles.as_mut_slice().shuffle(&mut thread_rng()); for vfile in vfiles.iter_mut() { - assert_eq!("OOBAR", vfile.read_string_at(1, 5).await?); + assert_eq!("OOBAR", vfile.read_string_at(1, 5, &ctx).await?); } Ok(()) @@ -1397,6 +1427,7 @@ mod tests { const THREADS: usize = 100; const SAMPLE: [u8; SIZE] = [0xADu8; SIZE]; + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); let testdir = crate::config::PageServerConf::test_repo_dir("vfile_concurrency"); std::fs::create_dir_all(&testdir)?; @@ -1425,12 +1456,13 @@ mod tests { let mut hdls = Vec::new(); for _threadno in 0..THREADS { let files = files.clone(); + let ctx = ctx.detached_child(TaskKind::UnitTest, DownloadBehavior::Error); let hdl = rt.spawn(async move { let mut buf = vec![0u8; SIZE]; let mut rng = rand::rngs::OsRng; for _ in 1..1000 { let f = &files[rng.gen_range(0..files.len())]; - buf = f.read_exact_at(buf, 0).await.unwrap(); + buf = f.read_exact_at(buf, 0, &ctx).await.unwrap(); assert!(buf == SAMPLE); } }); @@ -1446,6 +1478,7 @@ mod tests { #[tokio::test] async fn test_atomic_overwrite_basic() { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); let testdir = crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_basic"); std::fs::create_dir_all(&testdir).unwrap(); @@ -1456,7 +1489,7 @@ mod tests { .await .unwrap(); let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap()); - let post = file.read_string().await.unwrap(); + let post = file.read_string(&ctx).await.unwrap(); assert_eq!(post, "foo"); assert!(!tmp_path.exists()); drop(file); @@ -1465,7 +1498,7 @@ mod tests { .await .unwrap(); let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap()); - let post = file.read_string().await.unwrap(); + let post = file.read_string(&ctx).await.unwrap(); assert_eq!(post, "bar"); assert!(!tmp_path.exists()); drop(file); @@ -1473,6 +1506,7 @@ mod tests { #[tokio::test] async fn test_atomic_overwrite_preexisting_tmp() { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); let testdir = crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_preexisting_tmp"); std::fs::create_dir_all(&testdir).unwrap(); @@ -1488,7 +1522,7 @@ mod tests { .unwrap(); let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap()); - let post = file.read_string().await.unwrap(); + let post = file.read_string(&ctx).await.unwrap(); assert_eq!(post, "foo"); assert!(!tmp_path.exists()); drop(file); From f50ff1456091d8e5ac8db2d01f4b058ce36d6569 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 13 May 2024 13:05:46 +0100 Subject: [PATCH 0759/1571] pageserver: refuse to run without remote storage (#7722) ## Problem Since https://github.com/neondatabase/neon/pull/6769, the pageserver is intentionally not usable without remote storage: it's purpose is to act as a cache to an object store, rather than as a source of truth in its own right. ## Summary of changes - Make remote storage configuration mandatory: the pageserver will refuse to start if it is not provided. 
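The design choice here is to validate the optional remote storage section once at startup and fail fast, so follow-up changes can pass a plain client around instead of an `Option`. A generic sketch of that validate-once pattern (hypothetical stand-in types, assuming the `anyhow` crate; not the actual pageserver config code):

    // Hypothetical stand-ins for the real config/client types.
    struct RemoteStorageConfig;
    struct GenericRemoteStorage;

    impl GenericRemoteStorage {
        fn from_config(_cfg: &RemoteStorageConfig) -> anyhow::Result<Self> {
            Ok(GenericRemoteStorage)
        }
    }

    /// Fail fast at startup if the required section is missing, so everything
    /// downstream can take the client by value instead of `Option<_>`.
    fn create_remote_storage_client(
        cfg: Option<&RemoteStorageConfig>,
    ) -> anyhow::Result<GenericRemoteStorage> {
        let cfg = cfg.ok_or_else(|| {
            anyhow::anyhow!("no remote storage configured, this is a deprecated configuration")
        })?;
        GenericRemoteStorage::from_config(cfg)
    }

    fn main() {
        assert!(create_remote_storage_client(None).is_err());
        assert!(create_remote_storage_client(Some(&RemoteStorageConfig)).is_ok());
    }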
This is a precursor that will make it safe to subsequently remove all the internal Option<>s --- pageserver/src/bin/pageserver.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index eb4b8bb8bb..49f8a41b37 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -383,7 +383,7 @@ fn start_pageserver( let shutdown_pageserver = tokio_util::sync::CancellationToken::new(); // Set up remote storage client - let remote_storage = create_remote_storage_client(conf)?; + let remote_storage = Some(create_remote_storage_client(conf)?); // Set up deletion queue let (deletion_queue, deletion_workers) = DeletionQueue::new( @@ -708,12 +708,11 @@ fn start_pageserver( fn create_remote_storage_client( conf: &'static PageServerConf, -) -> anyhow::Result> { +) -> anyhow::Result { let config = if let Some(config) = &conf.remote_storage_config { config } else { - tracing::warn!("no remote storage configured, this is a deprecated configuration"); - return Ok(None); + anyhow::bail!("no remote storage configured, this is a deprecated configuration"); }; // Create the client @@ -733,7 +732,7 @@ fn create_remote_storage_client( GenericRemoteStorage::unreliable_wrapper(remote_storage, conf.test_remote_failures); } - Ok(Some(remote_storage)) + Ok(remote_storage) } fn cli() -> Command { From d9dcbffac37ccd3331ec9adcd12fd20ce0ea31aa Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 13 May 2024 15:16:23 +0300 Subject: [PATCH 0760/1571] python: allow using allowed_errors.py (#7719) See #7718. Fix it by renaming all `types.py` to `common_types.py`. Additionally, add an advert for using `allowed_errors.py` to test any added regex. --- scripts/check_allowed_errors.sh | 18 ++++++++++++++++++ test_runner/fixtures/benchmark_fixture.py | 2 +- .../fixtures/{types.py => common_types.py} | 0 test_runner/fixtures/compute_reconfigure.py | 2 +- test_runner/fixtures/neon_fixtures.py | 4 ++-- .../fixtures/pageserver/allowed_errors.py | 5 +++-- .../pageserver/{types.py => common_types.py} | 2 +- test_runner/fixtures/pageserver/http.py | 2 +- .../fixtures/pageserver/many_tenants.py | 2 +- .../fixtures/pageserver/remote_storage.py | 4 ++-- test_runner/fixtures/pageserver/utils.py | 2 +- test_runner/fixtures/remote_storage.py | 2 +- test_runner/fixtures/safekeeper/http.py | 2 +- test_runner/fixtures/safekeeper/utils.py | 2 +- test_runner/fixtures/utils.py | 7 ++++--- test_runner/fixtures/workload.py | 2 +- test_runner/performance/pageserver/util.py | 2 +- .../performance/test_branch_creation.py | 2 +- test_runner/performance/test_bulk_insert.py | 2 +- .../test_storage_controller_scale.py | 2 +- .../performance/test_wal_backpressure.py | 2 +- test_runner/regress/test_ancestor_branch.py | 2 +- .../regress/test_attach_tenant_config.py | 2 +- test_runner/regress/test_auth.py | 2 +- test_runner/regress/test_branch_and_gc.py | 2 +- test_runner/regress/test_branch_behind.py | 2 +- test_runner/regress/test_branching.py | 2 +- test_runner/regress/test_broken_timeline.py | 2 +- test_runner/regress/test_compatibility.py | 2 +- .../regress/test_disk_usage_eviction.py | 2 +- test_runner/regress/test_duplicate_layers.py | 2 +- test_runner/regress/test_fullbackup.py | 2 +- test_runner/regress/test_gc_aggressive.py | 2 +- test_runner/regress/test_import.py | 2 +- test_runner/regress/test_layer_eviction.py | 2 +- test_runner/regress/test_layers_from_future.py | 4 ++-- .../regress/test_logical_replication.py | 2 +- 
test_runner/regress/test_lsn_mapping.py | 2 +- test_runner/regress/test_neon_cli.py | 2 +- test_runner/regress/test_next_xid.py | 2 +- test_runner/regress/test_old_request_lsn.py | 2 +- test_runner/regress/test_ondemand_download.py | 2 +- .../regress/test_ondemand_slru_download.py | 2 +- test_runner/regress/test_pageserver_api.py | 2 +- .../regress/test_pageserver_generations.py | 2 +- .../test_pageserver_getpage_throttle.py | 2 +- .../regress/test_pageserver_layer_rolling.py | 2 +- .../test_pageserver_metric_collection.py | 2 +- .../regress/test_pageserver_secondary.py | 4 ++-- test_runner/regress/test_pitr_gc.py | 2 +- test_runner/regress/test_read_trace.py | 2 +- test_runner/regress/test_readonly_node.py | 2 +- test_runner/regress/test_remote_storage.py | 4 ++-- test_runner/regress/test_s3_restore.py | 2 +- test_runner/regress/test_s3_scrubber.py | 2 +- test_runner/regress/test_sharding.py | 2 +- test_runner/regress/test_storage_controller.py | 2 +- test_runner/regress/test_tenant_conf.py | 2 +- test_runner/regress/test_tenant_delete.py | 2 +- test_runner/regress/test_tenant_detach.py | 2 +- test_runner/regress/test_tenant_relocation.py | 2 +- test_runner/regress/test_tenant_size.py | 2 +- test_runner/regress/test_tenant_tasks.py | 2 +- test_runner/regress/test_tenants.py | 2 +- .../test_tenants_with_remote_storage.py | 4 ++-- test_runner/regress/test_timeline_delete.py | 2 +- .../regress/test_timeline_detach_ancestor.py | 2 +- test_runner/regress/test_timeline_size.py | 2 +- test_runner/regress/test_wal_acceptor.py | 2 +- test_runner/regress/test_wal_acceptor_async.py | 2 +- test_runner/regress/test_wal_receiver.py | 2 +- test_runner/regress/test_wal_restore.py | 2 +- .../test_walredo_not_left_behind_on_detach.py | 2 +- 73 files changed, 100 insertions(+), 80 deletions(-) create mode 100755 scripts/check_allowed_errors.sh rename test_runner/fixtures/{types.py => common_types.py} (100%) rename test_runner/fixtures/pageserver/{types.py => common_types.py} (98%) diff --git a/scripts/check_allowed_errors.sh b/scripts/check_allowed_errors.sh new file mode 100755 index 0000000000..87e52c1e64 --- /dev/null +++ b/scripts/check_allowed_errors.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +set -eu + +HELPER_DIR="$(dirname "${BASH_SOURCE[0]}")" +SCRIPT="test_runner/fixtures/pageserver/allowed_errors.py" + +# first run to understand all of the errors: +# +# example: ./scripts/check_allowed_errors.sh -i - < pageserver.log +# example: ./scripts/check_allowed_errors.sh -i pageserver.log +# +# then edit the test local allowed_errors to the +# test_runner/fixtures/pageserver/allowed_errors.py, then re-run to make sure +# they are handled. +# +# finally revert any local changes to allowed_errors.py. +poetry run python3 "$HELPER_DIR/../$SCRIPT" $* diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index c32748f6f0..038f557cc8 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -19,9 +19,9 @@ from _pytest.config.argparsing import Parser from _pytest.fixtures import FixtureRequest from _pytest.terminal import TerminalReporter +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonPageserver -from fixtures.types import TenantId, TimelineId """ This file contains fixtures for micro-benchmarks. 
diff --git a/test_runner/fixtures/types.py b/test_runner/fixtures/common_types.py similarity index 100% rename from test_runner/fixtures/types.py rename to test_runner/fixtures/common_types.py diff --git a/test_runner/fixtures/compute_reconfigure.py b/test_runner/fixtures/compute_reconfigure.py index a883d94f73..66fc35b6aa 100644 --- a/test_runner/fixtures/compute_reconfigure.py +++ b/test_runner/fixtures/compute_reconfigure.py @@ -5,8 +5,8 @@ import pytest from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response +from fixtures.common_types import TenantId from fixtures.log_helper import log -from fixtures.types import TenantId class ComputeReconfigure: diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index da379693a0..0c2b70202e 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -47,14 +47,15 @@ from urllib3.util.retry import Retry from fixtures import overlayfs from fixtures.broker import NeonBroker +from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.log_helper import log from fixtures.metrics import Metrics, MetricsGetter, parse_metrics from fixtures.pageserver.allowed_errors import ( DEFAULT_PAGESERVER_ALLOWED_ERRORS, DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS, ) +from fixtures.pageserver.common_types import IndexPartDump, LayerName, parse_layer_file_name from fixtures.pageserver.http import PageserverHttpClient -from fixtures.pageserver.types import IndexPartDump, LayerName, parse_layer_file_name from fixtures.pageserver.utils import ( wait_for_last_record_lsn, wait_for_upload, @@ -72,7 +73,6 @@ from fixtures.remote_storage import ( ) from fixtures.safekeeper.http import SafekeeperHttpClient from fixtures.safekeeper.utils import are_walreceivers_absent -from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.utils import ( ATTACHMENT_NAME_REGEX, allure_add_grafana_links, diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index 58a76d7586..91cd67d107 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -131,9 +131,10 @@ if __name__ == "__main__": "-i", "--input", type=argparse.FileType("r"), - default=sys.stdin, - help="Pageserver logs file. Reads from stdin if no file is provided.", + help="Pageserver logs file. 
Use '-' for stdin.", + required=True, ) + args = parser.parse_args() errors = _check_allowed_errors(args.input) diff --git a/test_runner/fixtures/pageserver/types.py b/test_runner/fixtures/pageserver/common_types.py similarity index 98% rename from test_runner/fixtures/pageserver/types.py rename to test_runner/fixtures/pageserver/common_types.py index 1fb618f445..a6c327a8a0 100644 --- a/test_runner/fixtures/pageserver/types.py +++ b/test_runner/fixtures/pageserver/common_types.py @@ -2,7 +2,7 @@ import re from dataclasses import dataclass from typing import Any, Dict, Tuple, Union -from fixtures.types import KEY_MAX, KEY_MIN, Key, Lsn +from fixtures.common_types import KEY_MAX, KEY_MIN, Key, Lsn @dataclass diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index b06972056c..0b2963d89c 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -11,10 +11,10 @@ import requests from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry +from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.log_helper import log from fixtures.metrics import Metrics, MetricsGetter, parse_metrics from fixtures.pg_version import PgVersion -from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.utils import Fn diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py index f47a3ea043..def80a1c3e 100644 --- a/test_runner/fixtures/pageserver/many_tenants.py +++ b/test_runner/fixtures/pageserver/many_tenants.py @@ -3,6 +3,7 @@ import time from typing import Any, Callable, Dict, Tuple import fixtures.pageserver.remote_storage +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, @@ -12,7 +13,6 @@ from fixtures.pageserver.utils import ( wait_until_tenant_state, ) from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind -from fixtures.types import TenantId, TimelineId def single_timeline( diff --git a/test_runner/fixtures/pageserver/remote_storage.py b/test_runner/fixtures/pageserver/remote_storage.py index e6cd9b4614..0c3612716a 100644 --- a/test_runner/fixtures/pageserver/remote_storage.py +++ b/test_runner/fixtures/pageserver/remote_storage.py @@ -6,13 +6,13 @@ import threading from pathlib import Path from typing import Any, List, Tuple +from fixtures.common_types import TenantId, TimelineId from fixtures.neon_fixtures import NeonEnv, Pagectl -from fixtures.pageserver.types import ( +from fixtures.pageserver.common_types import ( InvalidFileName, parse_layer_file_name, ) from fixtures.remote_storage import LocalFsStorage -from fixtures.types import TenantId, TimelineId def duplicate_one_tenant(env: NeonEnv, template_tenant: TenantId, new_tenant: TenantId): diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 4b0dd7a815..91435e8a1f 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -8,10 +8,10 @@ from mypy_boto3_s3.type_defs import ( ObjectTypeDef, ) +from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.log_helper import log from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient from fixtures.remote_storage import RemoteStorage, RemoteStorageKind, S3Storage -from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId from 
fixtures.utils import wait_until diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index 132d2450a7..ee18c53b52 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -12,8 +12,8 @@ import boto3 import toml from mypy_boto3_s3 import S3Client +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log -from fixtures.types import TenantId, TimelineId TIMELINE_INDEX_PART_FILE_NAME = "index_part.json" TENANT_HEATMAP_FILE_NAME = "heatmap-v1.json" diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index b9c1986818..82148d0556 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -6,8 +6,8 @@ from typing import Any, Dict, List, Optional, Tuple, Union import pytest import requests +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log -from fixtures.types import Lsn, TenantId, TimelineId # Walreceiver as returned by sk's timeline status endpoint. diff --git a/test_runner/fixtures/safekeeper/utils.py b/test_runner/fixtures/safekeeper/utils.py index 2818a493d6..0e4b5d7883 100644 --- a/test_runner/fixtures/safekeeper/utils.py +++ b/test_runner/fixtures/safekeeper/utils.py @@ -1,6 +1,6 @@ +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.safekeeper.http import SafekeeperHttpClient -from fixtures.types import TenantId, TimelineId def are_walreceivers_absent( diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 6470621900..16dc9e8cfb 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -25,14 +25,14 @@ import zstandard from psycopg2.extensions import cursor from fixtures.log_helper import log -from fixtures.pageserver.types import ( +from fixtures.pageserver.common_types import ( parse_delta_layer, parse_image_layer, ) if TYPE_CHECKING: from fixtures.neon_fixtures import PgBin -from fixtures.types import TimelineId +from fixtures.common_types import TimelineId Fn = TypeVar("Fn", bound=Callable[..., Any]) @@ -452,6 +452,7 @@ def humantime_to_ms(humantime: str) -> float: def scan_log_for_errors(input: Iterable[str], allowed_errors: List[str]) -> List[Tuple[int, str]]: + # FIXME: this duplicates test_runner/fixtures/pageserver/allowed_errors.py error_or_warn = re.compile(r"\s(ERROR|WARN)") errors = [] for lineno, line in enumerate(input, start=1): @@ -484,7 +485,7 @@ def assert_no_errors(log_file, service, allowed_errors): for _lineno, error in errors: log.info(f"not allowed {service} error: {error.strip()}") - assert not errors, f"Log errors on {service}: {errors[0]}" + assert not errors, f"First log error on {service}: {errors[0]}\nHint: use scripts/check_allowed_errors.sh to test any new allowed_error you add" @enum.unique diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py index c44628ce06..dfd9caba3e 100644 --- a/test_runner/fixtures/workload.py +++ b/test_runner/fixtures/workload.py @@ -1,6 +1,7 @@ import threading from typing import Any, Optional +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, @@ -10,7 +11,6 @@ from fixtures.neon_fixtures import ( wait_for_last_flush_lsn, ) from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload -from fixtures.types import TenantId, TimelineId # neon_local doesn't 
handle creating/modifying endpoints concurrently, so we use a mutex # to ensure we don't do that: this enables running lots of Workloads in parallel safely. diff --git a/test_runner/performance/pageserver/util.py b/test_runner/performance/pageserver/util.py index 009d62c9ba..f31cd9a9f8 100644 --- a/test_runner/performance/pageserver/util.py +++ b/test_runner/performance/pageserver/util.py @@ -5,13 +5,13 @@ Utilities used by all code in this sub-directory from typing import Any, Callable, Dict, Tuple import fixtures.pageserver.many_tenants as many_tenants +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, ) from fixtures.pageserver.utils import wait_until_all_tenants_state -from fixtures.types import TenantId, TimelineId def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int): diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index 7687b8417f..b3866f1813 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -9,11 +9,11 @@ from typing import List import pytest from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.common_types import Lsn from fixtures.compare_fixtures import NeonCompare from fixtures.log_helper import log from fixtures.neon_fixtures import NeonPageserver from fixtures.pageserver.utils import wait_for_last_record_lsn -from fixtures.types import Lsn from fixtures.utils import wait_until from prometheus_client.samples import Sample diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py index 1df3f2f5f1..3f56da7c1d 100644 --- a/test_runner/performance/test_bulk_insert.py +++ b/test_runner/performance/test_bulk_insert.py @@ -2,10 +2,10 @@ from contextlib import closing import pytest from fixtures.benchmark_fixture import MetricReport +from fixtures.common_types import Lsn from fixtures.compare_fixtures import NeonCompare, PgCompare from fixtures.pageserver.utils import wait_tenant_status_404 from fixtures.pg_version import PgVersion -from fixtures.types import Lsn # diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index 632d465c3f..cb013ae8c3 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -3,6 +3,7 @@ import random import time import pytest +from fixtures.common_types import TenantId, TenantShardId, TimelineId from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.log_helper import log from fixtures.neon_fixtures import ( @@ -10,7 +11,6 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.http import PageserverHttpClient from fixtures.pg_version import PgVersion -from fixtures.types import TenantId, TenantShardId, TimelineId @pytest.mark.timeout(3600) # super long running test: should go down as we optimize diff --git a/test_runner/performance/test_wal_backpressure.py b/test_runner/performance/test_wal_backpressure.py index 7eb244d378..513ebc74c3 100644 --- a/test_runner/performance/test_wal_backpressure.py +++ b/test_runner/performance/test_wal_backpressure.py @@ -6,10 +6,10 @@ from typing import Any, Callable, List import pytest from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.common_types import Lsn from 
fixtures.compare_fixtures import NeonCompare, PgCompare, VanillaCompare from fixtures.log_helper import log from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnvBuilder, PgBin -from fixtures.types import Lsn from performance.test_perf_pgbench import get_durations_matrix, get_scales_matrix diff --git a/test_runner/regress/test_ancestor_branch.py b/test_runner/regress/test_ancestor_branch.py index d16d2d6a24..7e40081aa2 100644 --- a/test_runner/regress/test_ancestor_branch.py +++ b/test_runner/regress/test_ancestor_branch.py @@ -1,6 +1,6 @@ +from fixtures.common_types import TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.types import TimelineId from fixtures.utils import query_scalar diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 693add422f..2ec375271c 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -2,13 +2,13 @@ from dataclasses import dataclass from typing import Generator, Optional import pytest +from fixtures.common_types import TenantId from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, ) from fixtures.pageserver.http import PageserverApiException, TenantConfig from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind -from fixtures.types import TenantId from fixtures.utils import wait_until diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index bb622c0d59..035ab2796f 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -4,13 +4,13 @@ from pathlib import Path import psycopg2 import pytest +from fixtures.common_types import TenantId, TimelineId from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, PgProtocol, ) from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient -from fixtures.types import TenantId, TimelineId def assert_client_authorized(env: NeonEnv, http_client: PageserverHttpClient): diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py index ddd02238ea..eb503ddbfa 100644 --- a/test_runner/regress/test_branch_and_gc.py +++ b/test_runner/regress/test_branch_and_gc.py @@ -2,10 +2,10 @@ import threading import time import pytest +from fixtures.common_types import Lsn, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv from fixtures.pageserver.http import TimelineCreate406 -from fixtures.types import Lsn, TimelineId from fixtures.utils import query_scalar diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py index b79cad979f..ac2fc79be4 100644 --- a/test_runner/regress/test_branch_behind.py +++ b/test_runner/regress/test_branch_behind.py @@ -1,8 +1,8 @@ import pytest +from fixtures.common_types import Lsn, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.pageserver.http import TimelineCreate406 -from fixtures.types import Lsn, TimelineId from fixtures.utils import print_gc_result, query_scalar diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index 9fe9f77fea..03d6946c15 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -5,6 +5,7 @@ from concurrent.futures import ThreadPoolExecutor from typing import List import pytest +from fixtures.common_types import Lsn, TimelineId 
from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, @@ -14,7 +15,6 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import wait_until_tenant_active -from fixtures.types import Lsn, TimelineId from fixtures.utils import query_scalar from performance.test_perf_pgbench import get_scales_matrix from requests import RequestException diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 1279c1bf81..7d4e101189 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -3,6 +3,7 @@ import os from typing import List, Tuple import pytest +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, @@ -11,7 +12,6 @@ from fixtures.neon_fixtures import ( wait_for_last_flush_lsn, ) from fixtures.pg_version import PgVersion -from fixtures.types import TenantId, TimelineId # Test restarting page server, while safekeeper and compute node keep diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 787c114fc1..ef35bf4696 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -7,6 +7,7 @@ from typing import List, Optional import pytest import toml +from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, @@ -21,7 +22,6 @@ from fixtures.pageserver.utils import ( ) from fixtures.pg_version import PgVersion from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import Lsn # # A test suite that help to prevent unintentionally breaking backward or forward compatibility between Neon releases. 
diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 1d73e9cb18..7ae2352c06 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -5,6 +5,7 @@ from dataclasses import dataclass from typing import Any, Dict, Iterable, Tuple import pytest +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, @@ -16,7 +17,6 @@ from fixtures.neon_fixtures import ( from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import wait_for_upload_queue_empty from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import wait_until GLOBAL_LRU_LOG_LINE = "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy" diff --git a/test_runner/regress/test_duplicate_layers.py b/test_runner/regress/test_duplicate_layers.py index 7471338ce5..0ebb99c712 100644 --- a/test_runner/regress/test_duplicate_layers.py +++ b/test_runner/regress/test_duplicate_layers.py @@ -2,7 +2,7 @@ import time import pytest from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn -from fixtures.pageserver.types import parse_layer_file_name +from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.utils import ( wait_for_last_record_lsn, wait_for_upload_queue_empty, diff --git a/test_runner/regress/test_fullbackup.py b/test_runner/regress/test_fullbackup.py index d5f898492b..e1e4f700d4 100644 --- a/test_runner/regress/test_fullbackup.py +++ b/test_runner/regress/test_fullbackup.py @@ -1,6 +1,7 @@ import os from pathlib import Path +from fixtures.common_types import Lsn, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, @@ -8,7 +9,6 @@ from fixtures.neon_fixtures import ( VanillaPostgres, ) from fixtures.port_distributor import PortDistributor -from fixtures.types import Lsn, TimelineId from fixtures.utils import query_scalar, subprocess_capture num_rows = 1000 diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py index c5070ee815..e5067bba8b 100644 --- a/test_runner/regress/test_gc_aggressive.py +++ b/test_runner/regress/test_gc_aggressive.py @@ -2,6 +2,7 @@ import asyncio import concurrent.futures import random +from fixtures.common_types import TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, @@ -10,7 +11,6 @@ from fixtures.neon_fixtures import ( wait_for_last_flush_lsn, ) from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import TimelineId # Test configuration # diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index 132427ba2d..1f1c8cc582 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -7,6 +7,7 @@ from contextlib import closing from pathlib import Path import pytest +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, @@ -20,7 +21,6 @@ from fixtures.pageserver.utils import ( wait_for_upload, ) from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import subprocess_capture diff --git 
a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py index b178baea11..0ef4f6d95b 100644 --- a/test_runner/regress/test_layer_eviction.py +++ b/test_runner/regress/test_layer_eviction.py @@ -7,7 +7,7 @@ from fixtures.neon_fixtures import ( flush_ep_to_pageserver, wait_for_last_flush_lsn, ) -from fixtures.pageserver.types import parse_layer_file_name +from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.utils import wait_for_upload from fixtures.remote_storage import RemoteStorageKind diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index cc34fd83e9..18e5111786 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -1,8 +1,9 @@ import time +from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver -from fixtures.pageserver.types import ( +from fixtures.pageserver.common_types import ( DeltaLayerName, ImageLayerName, is_future_layer, @@ -13,7 +14,6 @@ from fixtures.pageserver.utils import ( wait_until_tenant_active, ) from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind -from fixtures.types import Lsn from fixtures.utils import query_scalar, wait_until diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index 57d3447cae..a657d5a035 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -4,6 +4,7 @@ from random import choice from string import ascii_lowercase import pytest +from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import ( AuxFileStore, @@ -12,7 +13,6 @@ from fixtures.neon_fixtures import ( logical_replication_sync, wait_for_last_flush_lsn, ) -from fixtures.types import Lsn from fixtures.utils import query_scalar, wait_until diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index 225622868d..83d52d4c4c 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -2,10 +2,10 @@ import re import time from datetime import datetime, timedelta, timezone +from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_flush_lsn from fixtures.pageserver.http import PageserverApiException -from fixtures.types import Lsn from fixtures.utils import query_scalar diff --git a/test_runner/regress/test_neon_cli.py b/test_runner/regress/test_neon_cli.py index cb69f0ef39..ba170cfb4c 100644 --- a/test_runner/regress/test_neon_cli.py +++ b/test_runner/regress/test_neon_cli.py @@ -5,6 +5,7 @@ from typing import cast import pytest import requests +from fixtures.common_types import TenantId, TimelineId from fixtures.neon_fixtures import ( DEFAULT_BRANCH_NAME, NeonEnv, @@ -13,7 +14,6 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.http import PageserverHttpClient from fixtures.pg_version import PgVersion, skip_on_postgres -from fixtures.types import TenantId, TimelineId def helper_compare_timeline_list( diff --git a/test_runner/regress/test_next_xid.py b/test_runner/regress/test_next_xid.py index e880445c4d..45c0e3e409 100644 --- a/test_runner/regress/test_next_xid.py +++ b/test_runner/regress/test_next_xid.py @@ -3,13 +3,13 @@ import os import time from 
pathlib import Path +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_wal_insert_lsn from fixtures.pageserver.utils import ( wait_for_last_record_lsn, ) from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import query_scalar diff --git a/test_runner/regress/test_old_request_lsn.py b/test_runner/regress/test_old_request_lsn.py index 391305c58a..43b0bb56f0 100644 --- a/test_runner/regress/test_old_request_lsn.py +++ b/test_runner/regress/test_old_request_lsn.py @@ -1,6 +1,6 @@ +from fixtures.common_types import TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.types import TimelineId from fixtures.utils import print_gc_result, query_scalar diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 6c2556f6a2..b51754c9e0 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -5,6 +5,7 @@ import time from collections import defaultdict from typing import Any, DefaultDict, Dict, Tuple +from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, @@ -21,7 +22,6 @@ from fixtures.pageserver.utils import ( wait_until_tenant_active, ) from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import Lsn from fixtures.utils import query_scalar, wait_until diff --git a/test_runner/regress/test_ondemand_slru_download.py b/test_runner/regress/test_ondemand_slru_download.py index 0b36b32552..4af7dcdfc3 100644 --- a/test_runner/regress/test_ondemand_slru_download.py +++ b/test_runner/regress/test_ondemand_slru_download.py @@ -1,9 +1,9 @@ from typing import Optional import pytest +from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, tenant_get_shards -from fixtures.types import Lsn from fixtures.utils import query_scalar diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index bd7e4f118f..80a1d72f4a 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -3,13 +3,13 @@ from pathlib import Path from typing import Optional import toml +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.neon_fixtures import ( DEFAULT_BRANCH_NAME, NeonEnv, NeonEnvBuilder, ) from fixtures.pageserver.http import PageserverHttpClient -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import wait_until diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 58eaf404d3..a38bcd45da 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -16,6 +16,7 @@ import time from typing import Optional import pytest +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, @@ -37,7 +38,6 @@ from fixtures.pageserver.utils import ( from fixtures.remote_storage import ( RemoteStorageKind, ) -from fixtures.types import TenantId, TimelineId from fixtures.utils import print_gc_result, wait_until from fixtures.workload import Workload diff --git 
a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py index 42cc28efee..111285b40c 100644 --- a/test_runner/regress/test_pageserver_getpage_throttle.py +++ b/test_runner/regress/test_pageserver_getpage_throttle.py @@ -2,10 +2,10 @@ import json import uuid from anyio import Path +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, PgBin from fixtures.pg_version import PgVersion -from fixtures.types import TenantId, TimelineId from fixtures.utils import wait_until diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py index c5dc0f2919..aab0536f5a 100644 --- a/test_runner/regress/test_pageserver_layer_rolling.py +++ b/test_runner/regress/test_pageserver_layer_rolling.py @@ -5,6 +5,7 @@ from typing import Optional, Tuple import psutil import pytest +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, @@ -13,7 +14,6 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import wait_until TIMELINE_COUNT = 10 diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py index c34ef46d07..b0465f2a96 100644 --- a/test_runner/regress/test_pageserver_metric_collection.py +++ b/test_runner/regress/test_pageserver_metric_collection.py @@ -7,6 +7,7 @@ from pathlib import Path from queue import SimpleQueue from typing import Any, Dict, Set +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, @@ -17,7 +18,6 @@ from fixtures.remote_storage import ( RemoteStorageKind, remote_storage_to_toml_inline_table, ) -from fixtures.types import TenantId, TimelineId from pytest_httpserver import HTTPServer from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index c40bb962f2..fdc09a063d 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -5,9 +5,10 @@ import time from typing import Any, Dict, Optional import pytest +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, S3Scrubber -from fixtures.pageserver.types import parse_layer_file_name +from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.utils import ( assert_prefix_empty, poll_for_remote_storage_iterations, @@ -15,7 +16,6 @@ from fixtures.pageserver.utils import ( wait_for_upload_queue_empty, ) from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage -from fixtures.types import TenantId, TimelineId from fixtures.utils import wait_until from fixtures.workload import Workload diff --git a/test_runner/regress/test_pitr_gc.py b/test_runner/regress/test_pitr_gc.py index 539ef3eda7..6434f431a4 100644 --- a/test_runner/regress/test_pitr_gc.py +++ b/test_runner/regress/test_pitr_gc.py @@ -1,6 +1,6 @@ +from fixtures.common_types import 
TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.types import TimelineId from fixtures.utils import print_gc_result, query_scalar diff --git a/test_runner/regress/test_read_trace.py b/test_runner/regress/test_read_trace.py index e6b3ccd7ec..cc5853b727 100644 --- a/test_runner/regress/test_read_trace.py +++ b/test_runner/regress/test_read_trace.py @@ -1,8 +1,8 @@ from contextlib import closing +from fixtures.common_types import Lsn from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.pageserver.utils import wait_for_last_record_lsn -from fixtures.types import Lsn from fixtures.utils import query_scalar diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index b7c8f36107..ba8b91e84d 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -1,8 +1,8 @@ import pytest +from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv from fixtures.pageserver.utils import wait_for_last_record_lsn -from fixtures.types import Lsn from fixtures.utils import query_scalar diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 70c025c225..7f79bf5d5c 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -6,13 +6,14 @@ import time from typing import Dict, List, Optional, Tuple import pytest +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, wait_for_last_flush_lsn, ) +from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient -from fixtures.pageserver.types import parse_layer_file_name from fixtures.pageserver.utils import ( timeline_delete_wait_completed, wait_for_last_record_lsn, @@ -25,7 +26,6 @@ from fixtures.remote_storage import ( RemoteStorageKind, available_remote_storages, ) -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import ( assert_eq, assert_ge, diff --git a/test_runner/regress/test_s3_restore.py b/test_runner/regress/test_s3_restore.py index 9227836862..7fdabaaec7 100644 --- a/test_runner/regress/test_s3_restore.py +++ b/test_runner/regress/test_s3_restore.py @@ -1,6 +1,7 @@ import time from datetime import datetime, timezone +from fixtures.common_types import Lsn from fixtures.neon_fixtures import ( NeonEnvBuilder, PgBin, @@ -14,7 +15,6 @@ from fixtures.pageserver.utils import ( wait_for_upload, ) from fixtures.remote_storage import RemoteStorageKind, s3_storage -from fixtures.types import Lsn from fixtures.utils import run_pg_bench_small diff --git a/test_runner/regress/test_s3_scrubber.py b/test_runner/regress/test_s3_scrubber.py index 018c1637d0..8981000c24 100644 --- a/test_runner/regress/test_s3_scrubber.py +++ b/test_runner/regress/test_s3_scrubber.py @@ -3,12 +3,12 @@ import shutil from typing import Optional import pytest +from fixtures.common_types import TenantShardId from fixtures.neon_fixtures import ( NeonEnvBuilder, S3Scrubber, ) from fixtures.remote_storage import S3Storage, s3_storage -from fixtures.types import TenantShardId from fixtures.workload import Workload diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index d33803250f..87544af598 100644 --- a/test_runner/regress/test_sharding.py +++ 
b/test_runner/regress/test_sharding.py @@ -5,6 +5,7 @@ from typing import Dict, List, Optional, Union import pytest import requests +from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.log_helper import log from fixtures.neon_fixtures import ( @@ -18,7 +19,6 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.utils import assert_prefix_empty, assert_prefix_not_empty from fixtures.remote_storage import s3_storage -from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.utils import wait_until from fixtures.workload import Workload from pytest_httpserver import HTTPServer diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index bdd356388f..4a501e60ed 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -6,6 +6,7 @@ from datetime import datetime, timezone from typing import Any, Dict, List, Union import pytest +from fixtures.common_types import TenantId, TenantShardId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, @@ -25,7 +26,6 @@ from fixtures.pageserver.utils import ( ) from fixtures.pg_version import PgVersion from fixtures.remote_storage import RemoteStorageKind, s3_storage -from fixtures.types import TenantId, TenantShardId, TimelineId from fixtures.utils import run_pg_bench_small, subprocess_capture, wait_until from fixtures.workload import Workload from mypy_boto3_s3.type_defs import ( diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index fc099297e1..a345464208 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -2,13 +2,13 @@ import json from contextlib import closing import psycopg2.extras +from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, ) from fixtures.pageserver.utils import assert_tenant_state, wait_for_upload from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind -from fixtures.types import Lsn from fixtures.utils import wait_until diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 363c3c88ec..3fc44de6fa 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -5,6 +5,7 @@ import shutil from threading import Thread import pytest +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, @@ -26,7 +27,6 @@ from fixtures.pageserver.utils import ( wait_until_tenant_state, ) from fixtures.remote_storage import RemoteStorageKind, available_s3_storages, s3_storage -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import run_pg_bench_small, wait_until from requests.exceptions import ReadTimeout diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 0ba0108651..12a4730e69 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -7,6 +7,7 @@ from typing import List, Optional import asyncpg import pytest +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, @@ -22,7 +23,6 @@ from 
fixtures.pageserver.utils import ( from fixtures.remote_storage import ( RemoteStorageKind, ) -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import query_scalar, wait_until from prometheus_client.samples import Sample diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index 68d9d9a660..be289e03d6 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -7,6 +7,7 @@ from pathlib import Path from typing import Any, Dict, Optional, Tuple import pytest +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import Endpoint, NeonEnvBuilder, NeonPageserver from fixtures.pageserver.http import PageserverHttpClient @@ -20,7 +21,6 @@ from fixtures.remote_storage import ( LocalFsStorage, RemoteStorageKind, ) -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import ( query_scalar, wait_until, diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index e73eae91f0..7894f6933d 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -4,6 +4,7 @@ from pathlib import Path from typing import List, Tuple import pytest +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, @@ -19,7 +20,6 @@ from fixtures.pageserver.utils import ( wait_until_tenant_active, ) from fixtures.pg_version import PgVersion -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import wait_until diff --git a/test_runner/regress/test_tenant_tasks.py b/test_runner/regress/test_tenant_tasks.py index 75e5c2c91c..d08ad3cd2e 100644 --- a/test_runner/regress/test_tenant_tasks.py +++ b/test_runner/regress/test_tenant_tasks.py @@ -1,3 +1,4 @@ +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.pageserver.utils import ( @@ -5,7 +6,6 @@ from fixtures.pageserver.utils import ( timeline_delete_wait_completed, wait_until_tenant_active, ) -from fixtures.types import TenantId, TimelineId from fixtures.utils import wait_until diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 2832304dcc..93e9ad3673 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -9,6 +9,7 @@ from typing import List import pytest import requests +from fixtures.common_types import Lsn, TenantId from fixtures.log_helper import log from fixtures.metrics import ( PAGESERVER_GLOBAL_METRICS, @@ -24,7 +25,6 @@ from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import timeline_delete_wait_completed, wait_until_tenant_active from fixtures.pg_version import PgVersion from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import Lsn, TenantId from fixtures.utils import wait_until from prometheus_client.samples import Sample diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index a1e96928bf..168876b711 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -11,6 +11,7 @@ import os from pathlib import Path from typing import List, Tuple +from fixtures.common_types import Lsn, 
TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, @@ -18,7 +19,7 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, last_flush_lsn_upload, ) -from fixtures.pageserver.types import parse_layer_file_name +from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.utils import ( assert_tenant_state, wait_for_last_record_lsn, @@ -28,7 +29,6 @@ from fixtures.remote_storage import ( LocalFsStorage, RemoteStorageKind, ) -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import query_scalar, wait_until diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 0eb1327c9e..da37f469b3 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -6,6 +6,7 @@ import threading import pytest import requests +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, @@ -31,7 +32,6 @@ from fixtures.remote_storage import ( RemoteStorageKind, s3_storage, ) -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import query_scalar, run_pg_bench_small, wait_until from urllib3.util.retry import Retry diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 1e961a4b2f..8406de8bc1 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -10,6 +10,7 @@ from threading import Barrier from typing import IO, List, Set, Tuple, Union import pytest +from fixtures.common_types import Lsn, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, @@ -19,7 +20,6 @@ from fixtures.neon_fixtures import ( from fixtures.pageserver.http import HistoricLayerInfo from fixtures.pageserver.utils import wait_timeline_detail_404 from fixtures.remote_storage import LocalFsStorage -from fixtures.types import Lsn, TimelineId def by_end_lsn(info: HistoricLayerInfo) -> Lsn: diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 628c484fbd..18063bf104 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -10,6 +10,7 @@ from typing import Optional import psycopg2.errors import psycopg2.extras import pytest +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, @@ -31,7 +32,6 @@ from fixtures.pageserver.utils import ( from fixtures.pg_version import PgVersion from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import TenantId, TimelineId from fixtures.utils import get_timeline_dir_size, wait_until diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 967d133e18..ea66eeff63 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -18,6 +18,7 @@ import psycopg2.errors import psycopg2.extras import pytest from fixtures.broker import NeonBroker +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.metrics import parse_metrics from fixtures.neon_fixtures import ( @@ -47,7 +48,6 @@ from fixtures.remote_storage import ( ) from fixtures.safekeeper.http 
import SafekeeperHttpClient from fixtures.safekeeper.utils import are_walreceivers_absent -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import get_dir_size, query_scalar, start_in_background diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index dce5616ac6..b5d86de574 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -8,10 +8,10 @@ from typing import List, Optional import asyncpg import pytest import toml +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import getLogger from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder, Safekeeper from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import Lsn, TenantId, TimelineId log = getLogger("root.safekeeper_async") diff --git a/test_runner/regress/test_wal_receiver.py b/test_runner/regress/test_wal_receiver.py index 7ac6e6332c..d9265dcbcd 100644 --- a/test_runner/regress/test_wal_receiver.py +++ b/test_runner/regress/test_wal_receiver.py @@ -1,8 +1,8 @@ import time +from fixtures.common_types import Lsn, TenantId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder -from fixtures.types import Lsn, TenantId # Checks that pageserver's walreceiver state is printed in the logs during WAL wait timeout. diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py index 083a259d85..01a1d5cf55 100644 --- a/test_runner/regress/test_wal_restore.py +++ b/test_runner/regress/test_wal_restore.py @@ -6,6 +6,7 @@ from typing import List import pytest import zstandard +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, @@ -19,7 +20,6 @@ from fixtures.pageserver.utils import ( ) from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import LocalFsStorage, S3Storage, s3_storage -from fixtures.types import Lsn, TenantId, TimelineId from mypy_boto3_s3.type_defs import ( ObjectTypeDef, ) diff --git a/test_runner/regress/test_walredo_not_left_behind_on_detach.py b/test_runner/regress/test_walredo_not_left_behind_on_detach.py index 13159efbe8..ad37807dba 100644 --- a/test_runner/regress/test_walredo_not_left_behind_on_detach.py +++ b/test_runner/regress/test_walredo_not_left_behind_on_detach.py @@ -2,10 +2,10 @@ import time import psutil import pytest +from fixtures.common_types import TenantId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.pageserver.http import PageserverApiException -from fixtures.types import TenantId def assert_child_processes(pageserver_pid, wal_redo_present=False, defunct_present=False): From 5a0da93c530e99a374a9b2cabc1e2034a68773f1 Mon Sep 17 00:00:00 2001 From: Jure Bajic Date: Mon, 13 May 2024 14:18:14 +0200 Subject: [PATCH 0761/1571] Fix `test_lock_time_tracing` flakiness (#7712) ## Problem Closes [test_lock_time_tracing](https://github.com/neondatabase/neon/issues/7691) ## Summary of changes Taking a look at the execution of the same test in logs, it can be concluded that the time we are holding the lock is sometimes not enough(must be above 30s) to cause the second log to be shown by the thread that is creating a timeline. 
In the [successful execution](https://neon-github-public-dev.s3.amazonaws.com/reports/pr-7663/9021247520/index.html#testresult/a21bce8c702b37f0) it can be seen that the log `Operation TimelineCreate on key 5e088fc2dd14945020d0fa6d9efd1e36 has waited 30.000887709s for shared lock` was on the edge of being logged, if it was below 30s it would not be shown. ``` 2024-05-09T18:02:32.552093Z WARN request{method=PUT path=/control/v1/tenant/5e088fc2dd14945020d0fa6d9efd1e36/policy request_id=af7e4a04-d181-4acb-952f-9597c8eba5a8}: Lock on UpdatePolicy was held for 31.001892592s 2024-05-09T18:02:32.552109Z INFO request{method=PUT path=/control/v1/tenant/5e088fc2dd14945020d0fa6d9efd1e36/policy request_id=af7e4a04-d181-4acb-952f-9597c8eba5a8}: Request handled, status: 200 OK 2024-05-09T18:02:32.552271Z WARN request{method=POST path=/v1/tenant/5e088fc2dd14945020d0fa6d9efd1e36/timeline request_id=d3af756e-dbb3-476b-89bd-3594f19bbb67}: Operation TimelineCreate on key 5e088fc2dd14945020d0fa6d9efd1e36 has waited 30.000887709s for shared lock ``` In the [failed execution](https://neon-github-public-dev.s3.amazonaws.com/reports/pr-7663/9022743601/index.html#/testresult/deb90136aeae4fce): ``` 2024-05-09T20:14:33.526311Z INFO request{method=POST path=/v1/tenant/68194ffadb61ca11adcbb11cbeb4ec6e/timeline request_id=1daa8c31-522d-4805-9114-68cdcffb9823}: Creating timeline 68194ffadb61ca11adcbb11cbeb4ec6e/f72185990ed13f0b0533383f81d877af 2024-05-09T20:14:36.441165Z INFO Heartbeat round complete for 1 nodes, 0 offline 2024-05-09T20:14:41.441657Z INFO Heartbeat round complete for 1 nodes, 0 offline 2024-05-09T20:14:41.535227Z INFO request{method=POST path=/upcall/v1/validate request_id=94a7be88-474e-4163-92f8-57b401473add}: Handling request 2024-05-09T20:14:41.535269Z INFO request{method=POST path=/upcall/v1/validate request_id=94a7be88-474e-4163-92f8-57b401473add}: handle_validate: 68194ffadb61ca11adcbb11cbeb4ec6e(gen 1): valid=true (latest Some(00000001)) 2024-05-09T20:14:41.535284Z INFO request{method=POST path=/upcall/v1/validate request_id=94a7be88-474e-4163-92f8-57b401473add}: Request handled, status: 200 OK 2024-05-09T20:14:46.441854Z INFO Heartbeat round complete for 1 nodes, 0 offline 2024-05-09T20:14:51.441151Z INFO Heartbeat round complete for 1 nodes, 0 offline 2024-05-09T20:14:56.441199Z INFO Heartbeat round complete for 1 nodes, 0 offline 2024-05-09T20:15:01.440971Z INFO Heartbeat round complete for 1 nodes, 0 offline 2024-05-09T20:15:03.516320Z INFO request{method=PUT path=/control/v1/tenant/68194ffadb61ca11adcbb11cbeb4ec6e/policy request_id=0edfdb5b-2b05-486b-9879-d83f234d2f0d}: failpoint "tenant-update-policy-exclusive-lock": sleep done 2024-05-09T20:15:03.518474Z INFO request{method=PUT path=/control/v1/tenant/68194ffadb61ca11adcbb11cbeb4ec6e/policy request_id=0edfdb5b-2b05-486b-9879-d83f234d2f0d}: Updated scheduling policy to Stop tenant_id=68194ffadb61ca11adcbb11cbeb4ec6e shard_id=0000 2024-05-09T20:15:03.518512Z WARN request{method=PUT path=/control/v1/tenant/68194ffadb61ca11adcbb11cbeb4ec6e/policy request_id=0edfdb5b-2b05-486b-9879-d83f234d2f0d}: Scheduling is disabled by policy Stop tenant_id=68194ffadb61ca11adcbb11cbeb4ec6e shard_id=0000 2024-05-09T20:15:03.518540Z WARN request{method=PUT path=/control/v1/tenant/68194ffadb61ca11adcbb11cbeb4ec6e/policy request_id=0edfdb5b-2b05-486b-9879-d83f234d2f0d}: Lock on UpdatePolicy was held for 31.003712703s 2024-05-09T20:15:03.518570Z INFO request{method=PUT path=/control/v1/tenant/68194ffadb61ca11adcbb11cbeb4ec6e/policy 
request_id=0edfdb5b-2b05-486b-9879-d83f234d2f0d}: Request handled, status: 200 OK 2024-05-09T20:15:03.518804Z WARN request{method=POST path=/v1/tenant/68194ffadb61ca11adcbb11cbeb4ec6e/timeline request_id=1daa8c31-522d-4805-9114-68cdcffb9823}: Scheduling is disabled by policy Stop tenant_id=68194ffadb61ca11adcbb11cbeb4ec6e shard_id=0000 2024-05-09T20:15:03.518815Z INFO request{method=POST path=/v1/tenant/68194ffadb61ca11adcbb11cbeb4ec6e/timeline request_id=1daa8c31-522d-4805-9114-68cdcffb9823}: Creating timeline on shard 68194ffadb61ca11adcbb11cbeb4ec6e/f72185990ed13f0b0533383f81d877af, attached to node 1 (localhost) ``` we can see that the difference between starting to create timeline `2024-05-09T20:14:33.526311Z` and creating timeline `2024-05-09T20:15:03.518815Z` is not above 30s and will not cause any logs to appear. The proposed solution is to prolong how long we will pause to ensure that the thread that creates the timeline waits above 30s. --- test_runner/regress/test_storage_controller.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 4a501e60ed..3a9a522f3f 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -1284,7 +1284,7 @@ def test_lock_time_tracing(neon_env_builder: NeonEnvBuilder): # Apply failpoint env.storage_controller.configure_failpoints( - ("tenant-update-policy-exclusive-lock", "return(31000)") + ("tenant-update-policy-exclusive-lock", "return(35000)") ) # This will hold the exclusive for enough time to cause an warning @@ -1306,7 +1306,7 @@ def test_lock_time_tracing(neon_env_builder: NeonEnvBuilder): env.storage_controller.pageserver_api().timeline_create( pg_version=PgVersion.NOT_SET, tenant_id=tenant_id, new_timeline_id=timeline_id ) - thread_update_tenant_policy.join(timeout=10) + thread_update_tenant_policy.join() env.storage_controller.assert_log_contains("Lock on UpdatePolicy was held for") env.storage_controller.assert_log_contains( From bbe730d7caec380edb389487fb1d6db4c9b12cc8 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Mon, 13 May 2024 13:41:14 +0100 Subject: [PATCH 0762/1571] Revert protocol version upgrade (#7727) ## Problem "John pointed out that the switch to protocol version 2 made test_gc_aggressive test flaky: https://github.com/neondatabase/neon/issues/7692. I tracked it down, and that is indeed an issue. Conditions for hitting the issue: The problem occurs in the primary GC horizon is set to a very low value, e.g. 0. If the primary is actively writing WAL, and GC runs in the pageserver at the same time that the primary sends a GetPage request, it's possible that the GC advances the GC horizon past the GetPage request's LSN. I'm working on a fix here: https://github.com/neondatabase/neon/pull/7708." - Heikki ## Summary of changes Use protocol version 1 as default. 
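For readers unfamiliar with why the protocol switch made `test_gc_aggressive` flaky: the sketch below is only an illustration of the failure mode Heikki describes, with made-up type and function names rather than the pageserver's actual validation code. The point is that once GC has advanced the cutoff past the LSN carried by an in-flight GetPage request, the pageserver rejects the request even though the compute did nothing wrong.

```rust
// Illustrative sketch only -- not the actual pageserver code. With a GC horizon
// of ~0, GC can advance the cutoff LSN after the compute has already chosen the
// LSN for a GetPage request, so the request is rejected when it arrives.
struct TimelineState {
    gc_cutoff_lsn: u64,
}

fn validate_getpage_lsn(state: &TimelineState, request_lsn: u64) -> Result<(), String> {
    if request_lsn < state.gc_cutoff_lsn {
        // If GC ran between the compute picking `request_lsn` and the pageserver
        // handling the request, we land here even though the page is still needed.
        return Err(format!(
            "requested LSN {} is below the GC cutoff {}",
            request_lsn, state.gc_cutoff_lsn
        ));
    }
    Ok(())
}

fn main() {
    let state = TimelineState { gc_cutoff_lsn: 200 };
    // A request prepared before GC advanced the cutoff past its LSN now fails.
    assert!(validate_getpage_lsn(&state, 150).is_err());
}
```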
--- pgxn/neon/libpagestore.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index f5ce2caff3..b7b1e7ccbf 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -49,7 +49,7 @@ char *neon_auth_token; int readahead_buffer_size = 128; int flush_every_n_requests = 8; -int neon_protocol_version = 2; +int neon_protocol_version = 1; static int n_reconnect_attempts = 0; static int max_reconnect_attempts = 60; @@ -860,7 +860,7 @@ pg_init_libpagestore(void) "Version of compute<->page server protocol", NULL, &neon_protocol_version, - 2, /* use protocol version 2 */ + 1, /* default to old protocol for now */ 1, /* min */ 2, /* max */ PGC_SU_BACKEND, From 6ff74295b5b21d54192d20d114d78621d8d53ba0 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 13 May 2024 14:52:06 +0200 Subject: [PATCH 0763/1571] chore(pageserver): plumb through RequestContext to VirtualFile open methods (#7725) This PR introduces no functional changes. The `open()` path will be done separately. refs https://github.com/neondatabase/neon/issues/6107 refs https://github.com/neondatabase/neon/issues/7386 Co-authored-by: Joonas Koivunen --- pageserver/ctl/src/layer_map_analyzer.rs | 2 +- pageserver/ctl/src/layers.rs | 2 +- pageserver/src/pgdatadir_mapping.rs | 2 +- pageserver/src/tenant/blob_io.rs | 4 +- pageserver/src/tenant/ephemeral_file.rs | 4 +- .../tenant/remote_timeline_client/download.rs | 21 ++-- .../src/tenant/storage_layer/delta_layer.rs | 10 +- .../src/tenant/storage_layer/image_layer.rs | 8 +- .../tenant/storage_layer/inmemory_layer.rs | 4 +- pageserver/src/tenant/timeline.rs | 35 ++++-- pageserver/src/tenant/timeline/compaction.rs | 4 + .../src/tenant/timeline/detach_ancestor.rs | 2 + .../src/tenant/timeline/layer_manager.rs | 4 +- pageserver/src/virtual_file.rs | 102 +++++++++++++----- 14 files changed, 143 insertions(+), 61 deletions(-) diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs index c4c282f33d..b4bb239f44 100644 --- a/pageserver/ctl/src/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -100,7 +100,7 @@ pub(crate) fn parse_filename(name: &str) -> Option { // Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH" async fn get_holes(path: &Utf8Path, max_holes: usize, ctx: &RequestContext) -> Result> { - let file = VirtualFile::open(path).await?; + let file = VirtualFile::open(path, ctx).await?; let file_id = page_cache::next_file_id(); let block_reader = FileBlockReader::new(&file, file_id); let summary_blk = block_reader.read_blk(0, ctx).await?; diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index be8f91675d..3611b0baab 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -61,7 +61,7 @@ async fn read_delta_file(path: impl AsRef, ctx: &RequestContext) -> Result let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path"); virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); page_cache::init(100); - let file = VirtualFile::open(path).await?; + let file = VirtualFile::open(path, ctx).await?; let file_id = page_cache::next_file_id(); let block_reader = FileBlockReader::new(&file, file_id); let summary_blk = block_reader.read_blk(0, ctx).await?; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index a4215ee107..ffcab5f140 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs 
@@ -1671,7 +1671,7 @@ impl<'a> DatadirModification<'a> { } if !self.pending_deletions.is_empty() { - writer.delete_batch(&self.pending_deletions).await?; + writer.delete_batch(&self.pending_deletions, ctx).await?; self.pending_deletions.clear(); } diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 1dc451f5c9..24b4e4f3ea 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -299,7 +299,7 @@ mod tests { // Write part (in block to drop the file) let mut offsets = Vec::new(); { - let file = VirtualFile::create(pathbuf.as_path()).await?; + let file = VirtualFile::create(pathbuf.as_path(), &ctx).await?; let mut wtr = BlobWriter::::new(file, 0); for blob in blobs.iter() { let (_, res) = wtr.write_blob(blob.clone(), &ctx).await; @@ -314,7 +314,7 @@ mod tests { wtr.flush_buffer(&ctx).await?; } - let file = VirtualFile::open(pathbuf.as_path()).await?; + let file = VirtualFile::open(pathbuf.as_path(), &ctx).await?; let rdr = BlockReaderRef::VirtualFile(&file); let rdr = BlockCursor::new(rdr); for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() { diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 8b815a1885..79cc7bf153 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -28,6 +28,7 @@ impl EphemeralFile { conf: &PageServerConf, tenant_shard_id: TenantShardId, timeline_id: TimelineId, + ctx: &RequestContext, ) -> Result { static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1); let filename_disambiguator = @@ -45,6 +46,7 @@ impl EphemeralFile { .read(true) .write(true) .create(true), + ctx, ) .await?; @@ -153,7 +155,7 @@ mod tests { async fn test_ephemeral_blobs() -> Result<(), io::Error> { let (conf, tenant_id, timeline_id, ctx) = harness("ephemeral_blobs")?; - let mut file = EphemeralFile::create(conf, tenant_id, timeline_id).await?; + let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &ctx).await?; let pos_foo = file.write_blob(b"foo", &ctx).await?; assert_eq!( diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index b464437422..f3c9e64533 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -112,14 +112,17 @@ pub async fn download_layer_file<'a>( // We use fatal_err() below because the after the rename above, // the in-memory state of the filesystem already has the layer file in its final place, // and subsequent pageserver code could think it's durable while it really isn't. 
- let work = async move { - let timeline_dir = VirtualFile::open(&timeline_path) - .await - .fatal_err("VirtualFile::open for timeline dir fsync"); - timeline_dir - .sync_all() - .await - .fatal_err("VirtualFile::sync_all timeline dir"); + let work = { + let ctx = ctx.detached_child(ctx.task_kind(), ctx.download_behavior()); + async move { + let timeline_dir = VirtualFile::open(&timeline_path, &ctx) + .await + .fatal_err("VirtualFile::open for timeline dir fsync"); + timeline_dir + .sync_all() + .await + .fatal_err("VirtualFile::sync_all timeline dir"); + } }; crate::virtual_file::io_engine::get() .spawn_blocking_and_block_on_if_std(work) @@ -196,7 +199,7 @@ async fn download_object<'a>( use crate::virtual_file::owned_buffers_io::{self, util::size_tracking_writer}; use bytes::BytesMut; async { - let destination_file = VirtualFile::create(dst_path) + let destination_file = VirtualFile::create(dst_path, ctx) .await .with_context(|| format!("create a destination file for layer '{dst_path}'")) .map_err(DownloadError::Other)?; diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 4f30cf2e84..1b3802840f 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -394,6 +394,7 @@ impl DeltaLayerWriterInner { tenant_shard_id: TenantShardId, key_start: Key, lsn_range: Range, + ctx: &RequestContext, ) -> anyhow::Result { // Create the file initially with a temporary filename. We don't know // the end key yet, so we cannot form the final filename yet. We will @@ -404,7 +405,7 @@ impl DeltaLayerWriterInner { let path = DeltaLayer::temp_path_for(conf, &tenant_shard_id, &timeline_id, key_start, &lsn_range); - let mut file = VirtualFile::create(&path).await?; + let mut file = VirtualFile::create(&path, ctx).await?; // make room for the header block file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?; let blob_writer = BlobWriter::new(file, PAGE_SZ as u64); @@ -586,6 +587,7 @@ impl DeltaLayerWriter { tenant_shard_id: TenantShardId, key_start: Key, lsn_range: Range, + ctx: &RequestContext, ) -> anyhow::Result { Ok(Self { inner: Some( @@ -595,6 +597,7 @@ impl DeltaLayerWriter { tenant_shard_id, key_start, lsn_range, + ctx, ) .await?, ), @@ -701,6 +704,7 @@ impl DeltaLayer { let mut file = VirtualFile::open_with_options( path, virtual_file::OpenOptions::new().read(true).write(true), + ctx, ) .await .with_context(|| format!("Failed to open file '{}'", path))?; @@ -734,7 +738,7 @@ impl DeltaLayerInner { max_vectored_read_bytes: Option, ctx: &RequestContext, ) -> Result, anyhow::Error> { - let file = match VirtualFile::open(path).await { + let file = match VirtualFile::open(path, ctx).await { Ok(file) => file, Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))), }; @@ -1792,6 +1796,7 @@ mod test { harness.tenant_shard_id, entries_meta.key_range.start, entries_meta.lsn_range.clone(), + &ctx, ) .await?; @@ -1979,6 +1984,7 @@ mod test { tenant.tenant_shard_id, Key::MIN, Lsn(0x11)..truncate_at, + ctx, ) .await .unwrap(); diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 72d1f36cab..6ea452b993 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -343,6 +343,7 @@ impl ImageLayer { let mut file = VirtualFile::open_with_options( path, virtual_file::OpenOptions::new().read(true).write(true), + ctx, ) .await .with_context(|| 
format!("Failed to open file '{}'", path))?; @@ -377,7 +378,7 @@ impl ImageLayerInner { max_vectored_read_bytes: Option, ctx: &RequestContext, ) -> Result, anyhow::Error> { - let file = match VirtualFile::open(path).await { + let file = match VirtualFile::open(path, ctx).await { Ok(file) => file, Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))), }; @@ -632,6 +633,7 @@ impl ImageLayerWriterInner { tenant_shard_id: TenantShardId, key_range: &Range, lsn: Lsn, + ctx: &RequestContext, ) -> anyhow::Result { // Create the file initially with a temporary filename. // We'll atomically rename it to the final name when we're done. @@ -651,6 +653,7 @@ impl ImageLayerWriterInner { virtual_file::OpenOptions::new() .write(true) .create_new(true), + ctx, ) .await? }; @@ -805,10 +808,11 @@ impl ImageLayerWriter { tenant_shard_id: TenantShardId, key_range: &Range, lsn: Lsn, + ctx: &RequestContext, ) -> anyhow::Result { Ok(Self { inner: Some( - ImageLayerWriterInner::new(conf, timeline_id, tenant_shard_id, key_range, lsn) + ImageLayerWriterInner::new(conf, timeline_id, tenant_shard_id, key_range, lsn, ctx) .await?, ), }) diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 4dacbec2f3..9553f83026 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -473,10 +473,11 @@ impl InMemoryLayer { timeline_id: TimelineId, tenant_shard_id: TenantShardId, start_lsn: Lsn, + ctx: &RequestContext, ) -> Result { trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"); - let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?; + let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id, ctx).await?; let key = InMemoryLayerFileId(file.page_cache_file_id()); Ok(InMemoryLayer { @@ -642,6 +643,7 @@ impl InMemoryLayer { self.tenant_shard_id, Key::MIN, self.start_lsn..end_lsn, + ctx, ) .await?; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 505dc8c30d..d2fcd6c4a5 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3560,7 +3560,11 @@ impl Timeline { /// /// Get a handle to the latest layer for appending. /// - async fn get_layer_for_write(&self, lsn: Lsn) -> anyhow::Result> { + async fn get_layer_for_write( + &self, + lsn: Lsn, + ctx: &RequestContext, + ) -> anyhow::Result> { let mut guard = self.layers.write().await; let layer = guard .get_layer_for_write( @@ -3569,6 +3573,7 @@ impl Timeline { self.conf, self.timeline_id, self.tenant_shard_id, + ctx, ) .await?; Ok(layer) @@ -3833,8 +3838,8 @@ impl Timeline { ); self.create_delta_layer( &frozen_layer, - ctx, Some(metadata_keyspace.0.ranges[0].clone()), + ctx, ) .await? } else { @@ -3863,7 +3868,7 @@ impl Timeline { // Normal case, write out a L0 delta layer file. // `create_delta_layer` will not modify the layer map. // We will remove frozen layer and add delta layer in one atomic operation later. - let Some(layer) = self.create_delta_layer(&frozen_layer, ctx, None).await? else { + let Some(layer) = self.create_delta_layer(&frozen_layer, None, ctx).await? 
else { panic!("delta layer cannot be empty if no filter is applied"); }; ( @@ -3992,8 +3997,8 @@ impl Timeline { async fn create_delta_layer( self: &Arc, frozen_layer: &Arc, - ctx: &RequestContext, key_range: Option>, + ctx: &RequestContext, ) -> anyhow::Result> { let self_clone = Arc::clone(self); let frozen_layer = Arc::clone(frozen_layer); @@ -4016,6 +4021,7 @@ impl Timeline { &self_clone .conf .timeline_path(&self_clone.tenant_shard_id, &self_clone.timeline_id), + &ctx, ) .await .fatal_err("VirtualFile::open for timeline dir fsync"); @@ -4209,6 +4215,7 @@ impl Timeline { self.tenant_shard_id, &img_range, lsn, + ctx, ) .await?; @@ -4313,6 +4320,7 @@ impl Timeline { &self .conf .timeline_path(&self.tenant_shard_id, &self.timeline_id), + ctx, ) .await .fatal_err("VirtualFile::open for timeline dir fsync"); @@ -5214,7 +5222,7 @@ impl<'a> TimelineWriter<'a> { let buf_size: u64 = buf.len().try_into().expect("oversized value buf"); let action = self.get_open_layer_action(lsn, buf_size); - let layer = self.handle_open_layer_action(lsn, action).await?; + let layer = self.handle_open_layer_action(lsn, action, ctx).await?; let res = layer.put_value(key, lsn, &buf, ctx).await; if res.is_ok() { @@ -5237,14 +5245,15 @@ impl<'a> TimelineWriter<'a> { &mut self, at: Lsn, action: OpenLayerAction, + ctx: &RequestContext, ) -> anyhow::Result<&Arc> { match action { OpenLayerAction::Roll => { let freeze_at = self.write_guard.as_ref().unwrap().max_lsn.unwrap(); self.roll_layer(freeze_at).await?; - self.open_layer(at).await?; + self.open_layer(at, ctx).await?; } - OpenLayerAction::Open => self.open_layer(at).await?, + OpenLayerAction::Open => self.open_layer(at, ctx).await?, OpenLayerAction::None => { assert!(self.write_guard.is_some()); } @@ -5253,8 +5262,8 @@ impl<'a> TimelineWriter<'a> { Ok(&self.write_guard.as_ref().unwrap().open_layer) } - async fn open_layer(&mut self, at: Lsn) -> anyhow::Result<()> { - let layer = self.tl.get_layer_for_write(at).await?; + async fn open_layer(&mut self, at: Lsn, ctx: &RequestContext) -> anyhow::Result<()> { + let layer = self.tl.get_layer_for_write(at, ctx).await?; let initial_size = layer.size().await?; let last_freeze_at = self.last_freeze_at.load(); @@ -5331,10 +5340,14 @@ impl<'a> TimelineWriter<'a> { Ok(()) } - pub(crate) async fn delete_batch(&mut self, batch: &[(Range, Lsn)]) -> anyhow::Result<()> { + pub(crate) async fn delete_batch( + &mut self, + batch: &[(Range, Lsn)], + ctx: &RequestContext, + ) -> anyhow::Result<()> { if let Some((_, lsn)) = batch.first() { let action = self.get_open_layer_action(*lsn, 0); - let layer = self.handle_open_layer_action(*lsn, action).await?; + let layer = self.handle_open_layer_action(*lsn, action, ctx).await?; layer.put_tombstones(batch).await?; } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index e83878b8fb..4226bf431e 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -700,6 +700,7 @@ impl Timeline { debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end); lsn_range.clone() }, + ctx, ) .await?, ); @@ -755,6 +756,7 @@ impl Timeline { &self .conf .timeline_path(&self.tenant_shard_id, &self.timeline_id), + ctx, ) .await .fatal_err("VirtualFile::open for timeline dir fsync"); @@ -1093,6 +1095,7 @@ impl CompactionJobExecutor for TimelineAdaptor { self.timeline.tenant_shard_id, key_range.start, lsn_range.clone(), + ctx, ) .await?; @@ -1167,6 +1170,7 @@ impl TimelineAdaptor { 
self.timeline.tenant_shard_id, key_range, lsn, + ctx, ) .await?; diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 69b82344a6..9471ba860f 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -215,6 +215,7 @@ pub(super) async fn prepare( &detached .conf .timeline_path(&detached.tenant_shard_id, &detached.timeline_id), + ctx, ) .await .fatal_err("VirtualFile::open for timeline dir fsync"); @@ -339,6 +340,7 @@ async fn copy_lsn_prefix( target_timeline.tenant_shard_id, layer.layer_desc().key_range.start, layer.layer_desc().lsn_range.start..end_lsn, + ctx, ) .await .map_err(CopyDeltaPrefix)?; diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index a72eb1b3bf..248420e632 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -9,6 +9,7 @@ use utils::{ use crate::{ config::PageServerConf, + context::RequestContext, metrics::TimelineMetrics, tenant::{ layer_map::{BatchedUpdates, LayerMap}, @@ -69,6 +70,7 @@ impl LayerManager { conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, + ctx: &RequestContext, ) -> Result> { ensure!(lsn.is_aligned()); @@ -105,7 +107,7 @@ impl LayerManager { ); let new_layer = - InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn).await?; + InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn, ctx).await?; let layer = Arc::new(new_layer); self.layer_map.open_layer = Some(layer.clone()); diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 8dee73891b..b68f3a0e89 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -344,16 +344,23 @@ macro_rules! with_file { impl VirtualFile { /// Open a file in read-only mode. Like File::open. - pub async fn open(path: &Utf8Path) -> Result { - Self::open_with_options(path, OpenOptions::new().read(true)).await + pub async fn open( + path: &Utf8Path, + ctx: &RequestContext, + ) -> Result { + Self::open_with_options(path, OpenOptions::new().read(true), ctx).await } /// Create a new file for writing. If the file exists, it will be truncated. /// Like File::create. - pub async fn create(path: &Utf8Path) -> Result { + pub async fn create( + path: &Utf8Path, + ctx: &RequestContext, + ) -> Result { Self::open_with_options( path, OpenOptions::new().write(true).create(true).truncate(true), + ctx, ) .await } @@ -366,6 +373,7 @@ impl VirtualFile { pub async fn open_with_options( path: &Utf8Path, open_options: &OpenOptions, + _ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */ ) -> Result { let path_str = path.to_string(); let parts = path_str.split('/').collect::>(); @@ -1179,7 +1187,6 @@ mod tests { use rand::seq::SliceRandom; use rand::thread_rng; use rand::Rng; - use std::future::Future; use std::io::Write; use std::os::unix::fs::FileExt; use std::sync::Arc; @@ -1293,41 +1300,69 @@ mod tests { // results with VirtualFiles as with native Files. (Except that with // native files, you will run out of file descriptors if the ulimit // is low enough.) 
- test_files("virtual_files", |path, open_options| async move { - let vf = VirtualFile::open_with_options(&path, &open_options).await?; - Ok(MaybeVirtualFile::VirtualFile(vf)) - }) - .await + struct A; + + impl Adapter for A { + async fn open( + path: Utf8PathBuf, + opts: OpenOptions, + ctx: &RequestContext, + ) -> Result { + let vf = VirtualFile::open_with_options(&path, &opts, ctx).await?; + Ok(MaybeVirtualFile::VirtualFile(vf)) + } + } + test_files::("virtual_files").await } #[tokio::test] async fn test_physical_files() -> anyhow::Result<()> { - test_files("physical_files", |path, open_options| async move { - Ok(MaybeVirtualFile::File({ - let owned_fd = open_options.open(path.as_std_path()).await?; - File::from(owned_fd) - })) - }) - .await + struct B; + + impl Adapter for B { + async fn open( + path: Utf8PathBuf, + opts: OpenOptions, + _ctx: &RequestContext, + ) -> Result { + Ok(MaybeVirtualFile::File({ + let owned_fd = opts.open(path.as_std_path()).await?; + File::from(owned_fd) + })) + } + } + + test_files::("physical_files").await } - async fn test_files(testname: &str, openfunc: OF) -> anyhow::Result<()> + /// This is essentially a closure which returns a MaybeVirtualFile, but because rust edition + /// 2024 is not yet out with new lifetime capture or outlives rules, this is a async function + /// in trait which benefits from the new lifetime capture rules already. + trait Adapter { + async fn open( + path: Utf8PathBuf, + opts: OpenOptions, + ctx: &RequestContext, + ) -> Result; + } + + async fn test_files(testname: &str) -> anyhow::Result<()> where - OF: Fn(Utf8PathBuf, OpenOptions) -> FT, - FT: Future>, + A: Adapter, { let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); let testdir = crate::config::PageServerConf::test_repo_dir(testname); std::fs::create_dir_all(&testdir)?; let path_a = testdir.join("file_a"); - let mut file_a = openfunc( + let mut file_a = A::open( path_a.clone(), OpenOptions::new() .write(true) .create(true) .truncate(true) .to_owned(), + &ctx, ) .await?; file_a.write_all(b"foobar".to_vec(), &ctx).await?; @@ -1336,7 +1371,7 @@ mod tests { let _ = file_a.read_string(&ctx).await.unwrap_err(); // Close the file and re-open for reading - let mut file_a = openfunc(path_a, OpenOptions::new().read(true).to_owned()).await?; + let mut file_a = A::open(path_a, OpenOptions::new().read(true).to_owned(), &ctx).await?; // cannot write to a file opened in read-only mode let _ = file_a.write_all(b"bar".to_vec(), &ctx).await.unwrap_err(); @@ -1371,7 +1406,7 @@ mod tests { // Create another test file, and try FileExt functions on it. let path_b = testdir.join("file_b"); - let mut file_b = openfunc( + let mut file_b = A::open( path_b.clone(), OpenOptions::new() .read(true) @@ -1379,6 +1414,7 @@ mod tests { .create(true) .truncate(true) .to_owned(), + &ctx, ) .await?; file_b.write_all_at(b"BAR".to_vec(), 3, &ctx).await?; @@ -1394,8 +1430,12 @@ mod tests { let mut vfiles = Vec::new(); for _ in 0..100 { - let mut vfile = - openfunc(path_b.clone(), OpenOptions::new().read(true).to_owned()).await?; + let mut vfile = A::open( + path_b.clone(), + OpenOptions::new().read(true).to_owned(), + &ctx, + ) + .await?; assert_eq!("FOOBAR", vfile.read_string(&ctx).await?); vfiles.push(vfile); } @@ -1441,8 +1481,12 @@ mod tests { // Open the file many times. 
let mut files = Vec::new(); for _ in 0..VIRTUAL_FILES { - let f = VirtualFile::open_with_options(&test_file_path, OpenOptions::new().read(true)) - .await?; + let f = VirtualFile::open_with_options( + &test_file_path, + OpenOptions::new().read(true), + &ctx, + ) + .await?; files.push(f); } let files = Arc::new(files); @@ -1488,7 +1532,7 @@ mod tests { VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec()) .await .unwrap(); - let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap()); + let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap()); let post = file.read_string(&ctx).await.unwrap(); assert_eq!(post, "foo"); assert!(!tmp_path.exists()); @@ -1497,7 +1541,7 @@ mod tests { VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"bar".to_vec()) .await .unwrap(); - let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap()); + let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap()); let post = file.read_string(&ctx).await.unwrap(); assert_eq!(post, "bar"); assert!(!tmp_path.exists()); @@ -1521,7 +1565,7 @@ mod tests { .await .unwrap(); - let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap()); + let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap()); let post = file.read_string(&ctx).await.unwrap(); assert_eq!(post, "foo"); assert!(!tmp_path.exists()); From 55ba885f6be431b2b694bca4d95d7a40f4bda98a Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 13 May 2024 14:16:03 +0100 Subject: [PATCH 0764/1571] CI(report-benchmarks-failures): report benchmarks failures to slack (#7678) ## Problem `benchmarks` job that we run on the main doesn't block anything, so it's easy to miss its failure. 
Ref https://github.com/neondatabase/cloud/issues/13087 ## Summary of changes - Add `report-benchmarks-failures` job that reports failures of the `benchmarks` job to a Slack channel --- .github/workflows/build_and_test.yml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 21e7a56670..f417cecd58 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -546,9 +546,27 @@ jobs: # XXX: no coverage data handling here, since benchmarks are run on release builds, # while coverage is currently collected for the debug ones + report-benchmarks-failures: + needs: [ benchmarks, create-test-report ] + if: github.ref_name == 'main' && failure() + runs-on: ubuntu-latest + + steps: + - uses: slackapi/slack-github-action@v1 + with: + channel-id: C060CNA47S9 # on-call-staging-storage-stream + slack-message: | + Benchmarks failed on main: ${{ github.event.head_commit.url }} + + Allure report: ${{ needs.create-test-report.outputs.report-url }} + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + create-test-report: needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-build-tools-image ] if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }} + outputs: + report-url: ${{ steps.create-allure-report.outputs.report-url }} runs-on: [ self-hosted, gen3, small ] container: From 4d8a10af1cc41563cc4542beb327a2d75fb1bad8 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 13 May 2024 17:49:50 +0300 Subject: [PATCH 0765/1571] fix: do not create metrics contention from background task permit (#7730) The background task loop permit metrics perform two `with_label_values` lookups very often. Change the codepath to cache the counters on first access into a `Lazy` with `enum_map::EnumMap`. This alone is not expected to fix the metric collection failures under load, but it doesn't hurt. Cc: #7161 --- pageserver/src/tenant/tasks.rs | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index f153719f98..ba2b8afd03 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -41,7 +41,7 @@ static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy &'static str { - let s: &'static str = self.into(); - s + self.into() } } +static PERMIT_GAUGES: once_cell::sync::Lazy< + enum_map::EnumMap, +> = once_cell::sync::Lazy::new(|| { + enum_map::EnumMap::from_array(std::array::from_fn(|i| { + let kind = ::from_usize(i); + crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE.with_label_values(&[kind.into()]) + })) +}); + /// Cancellation safe. pub(crate) async fn concurrent_background_tasks_rate_limit_permit( loop_kind: BackgroundLoopKind, _ctx: &RequestContext, ) -> tokio::sync::SemaphorePermit<'static> { - let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE - .with_label_values(&[loop_kind.as_static_str()]) - .guard(); + let _guard = PERMIT_GAUGES[loop_kind].guard(); pausable_failpoint!( "initial-size-calculation-permit-pause", From 7f517640011a5dc2c811c66c2767ae58e27d0c79 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Mon, 13 May 2024 11:33:41 -0400 Subject: [PATCH 0766/1571] feat(pageserver): add metrics for aux file size (#7623) ref https://github.com/neondatabase/neon/issues/7443 ## Summary of changes This pull request adds a size estimator for aux files.
Each timeline stores a cached `isize` for the estimated total size of aux files. It gets reset on basebackup, and gets updated for each aux file modification. TODO: print a warning when it exceeds the size. The size metrics is not accurate. Race between `on_basebackup` and other functions could create a negative basebackup size, but the chance is rare. Anyways, this does not impose any extra I/Os to the storage as everything is computed in-memory. The aux files are only stored on shard 0. As basebackups are only generated on shard 0, only shard 0 will report this metrics. --------- Signed-off-by: Alex Chi Z --- pageserver/src/aux_file.rs | 52 +++++++++++++++++++++++++++++ pageserver/src/metrics.rs | 15 +++++++++ pageserver/src/pgdatadir_mapping.rs | 52 +++++++++++++++++++++-------- pageserver/src/tenant/timeline.rs | 32 ++++++++++++------ test_runner/fixtures/metrics.py | 1 + 5 files changed, 128 insertions(+), 24 deletions(-) diff --git a/pageserver/src/aux_file.rs b/pageserver/src/aux_file.rs index a26ed84a0d..e6d950487d 100644 --- a/pageserver/src/aux_file.rs +++ b/pageserver/src/aux_file.rs @@ -1,3 +1,6 @@ +use std::sync::Arc; + +use ::metrics::IntGauge; use bytes::{Buf, BufMut, Bytes}; use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE}; use tracing::warn; @@ -140,6 +143,55 @@ pub fn encode_file_value(files: &[(&str, &[u8])]) -> anyhow::Result> { Ok(encoded) } +/// An estimation of the size of aux files. +pub struct AuxFileSizeEstimator { + aux_file_size_gauge: IntGauge, + size: Arc>>, +} + +impl AuxFileSizeEstimator { + pub fn new(aux_file_size_gauge: IntGauge) -> Self { + Self { + aux_file_size_gauge, + size: Arc::new(std::sync::Mutex::new(None)), + } + } + + pub fn on_base_backup(&self, new_size: usize) { + let mut guard = self.size.lock().unwrap(); + *guard = Some(new_size as isize); + self.report(new_size as isize); + } + + pub fn on_add(&self, file_size: usize) { + let mut guard = self.size.lock().unwrap(); + if let Some(size) = &mut *guard { + *size += file_size as isize; + self.report(*size); + } + } + + pub fn on_remove(&self, file_size: usize) { + let mut guard = self.size.lock().unwrap(); + if let Some(size) = &mut *guard { + *size -= file_size as isize; + self.report(*size); + } + } + + pub fn on_update(&self, old_size: usize, new_size: usize) { + let mut guard = self.size.lock().unwrap(); + if let Some(size) = &mut *guard { + *size += new_size as isize - old_size as isize; + self.report(*size); + } + } + + pub fn report(&self, size: isize) { + self.aux_file_size_gauge.set(size as i64); + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 256f2f334c..b27bfb43b0 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -585,6 +585,15 @@ static CURRENT_LOGICAL_SIZE: Lazy = Lazy::new(|| { .expect("failed to define current logical size metric") }); +static AUX_FILE_SIZE: Lazy = Lazy::new(|| { + register_int_gauge_vec!( + "pageserver_aux_file_estimated_size", + "The size of all aux files for a timeline in aux file v2 store.", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + pub(crate) mod initial_logical_size { use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec}; use once_cell::sync::Lazy; @@ -2115,6 +2124,7 @@ pub(crate) struct TimelineMetrics { resident_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size pub current_logical_size_gauge: UIntGauge, + pub 
aux_file_size_gauge: IntGauge, pub directory_entries_count_gauge: Lazy UIntGauge>>, pub evictions: IntCounter, pub evictions_with_low_residence_duration: std::sync::RwLock, @@ -2187,6 +2197,9 @@ impl TimelineMetrics { let current_logical_size_gauge = CURRENT_LOGICAL_SIZE .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); + let aux_file_size_gauge = AUX_FILE_SIZE + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); // TODO use impl Trait syntax here once we have ability to use it: https://github.com/rust-lang/rust/issues/63065 let directory_entries_count_gauge_closure = { let tenant_shard_id = *tenant_shard_id; @@ -2224,6 +2237,7 @@ impl TimelineMetrics { last_record_gauge, resident_physical_size_gauge, current_logical_size_gauge, + aux_file_size_gauge, directory_entries_count_gauge, evictions, evictions_with_low_residence_duration: std::sync::RwLock::new( @@ -2264,6 +2278,7 @@ impl TimelineMetrics { let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]); } let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); self.evictions_with_low_residence_duration .write() diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index ffcab5f140..1092d64d33 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -699,13 +699,17 @@ impl Timeline { .await .context("scan")?; let mut result = HashMap::new(); + let mut sz = 0; for (_, v) in kv { let v = v.context("get value")?; let v = aux_file::decode_file_value_bytes(&v).context("value decode")?; for (fname, content) in v { + sz += fname.len(); + sz += content.len(); result.insert(fname, content); } } + self.aux_file_size_estimator.on_base_backup(sz); Ok(result) } @@ -1474,23 +1478,45 @@ impl<'a> DatadirModification<'a> { Err(PageReconstructError::MissingKey(_)) => None, Err(e) => return Err(e.into()), }; - let files = if let Some(ref old_val) = old_val { + let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val { aux_file::decode_file_value(old_val)? } else { Vec::new() }; - let new_files = if content.is_empty() { - files - .into_iter() - .filter(|(p, _)| &path != p) - .collect::>() - } else { - files - .into_iter() - .filter(|(p, _)| &path != p) - .chain(std::iter::once((path, content))) - .collect::>() - }; + let mut other_files = Vec::with_capacity(files.len()); + let mut modifying_file = None; + for file @ (p, content) in files { + if path == p { + assert!( + modifying_file.is_none(), + "duplicated entries found for {}", + path + ); + modifying_file = Some(content); + } else { + other_files.push(file); + } + } + let mut new_files = other_files; + match (modifying_file, content.is_empty()) { + (Some(old_content), false) => { + self.tline + .aux_file_size_estimator + .on_update(old_content.len(), content.len()); + new_files.push((path, content)); + } + (Some(old_content), true) => { + self.tline + .aux_file_size_estimator + .on_remove(old_content.len()); + // not adding the file key to the final `new_files` vec. 
+ } + (None, false) => { + self.tline.aux_file_size_estimator.on_add(content.len()); + new_files.push((path, content)); + } + (None, true) => anyhow::bail!("removing non-existing aux file: {}", path), + } let new_val = aux_file::encode_file_value(&new_files)?; self.put(key, Value::Image(new_val.into())); } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d2fcd6c4a5..01f354b9e8 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -61,9 +61,12 @@ use std::{ }; use crate::tenant::timeline::init::LocalLayerFileMetadata; -use crate::tenant::{ - layer_map::{LayerMap, SearchResult}, - metadata::TimelineMetadata, +use crate::{ + aux_file::AuxFileSizeEstimator, + tenant::{ + layer_map::{LayerMap, SearchResult}, + metadata::TimelineMetadata, + }, }; use crate::{ context::{DownloadBehavior, RequestContext}, @@ -409,6 +412,8 @@ pub struct Timeline { /// Keep aux directory cache to avoid it's reconstruction on each update pub(crate) aux_files: tokio::sync::Mutex, + + pub(crate) aux_file_size_estimator: AuxFileSizeEstimator, } pub struct WalReceiverInfo { @@ -2161,6 +2166,16 @@ impl Timeline { }; Arc::new_cyclic(|myself| { + let metrics = TimelineMetrics::new( + &tenant_shard_id, + &timeline_id, + crate::metrics::EvictionsWithLowResidenceDurationBuilder::new( + "mtime", + evictions_low_residence_duration_metric_threshold, + ), + ); + let aux_file_metrics = metrics.aux_file_size_gauge.clone(); + let mut result = Timeline { conf, tenant_conf, @@ -2192,14 +2207,7 @@ impl Timeline { ancestor_timeline: ancestor, ancestor_lsn: metadata.ancestor_lsn(), - metrics: TimelineMetrics::new( - &tenant_shard_id, - &timeline_id, - crate::metrics::EvictionsWithLowResidenceDurationBuilder::new( - "mtime", - evictions_low_residence_duration_metric_threshold, - ), - ), + metrics, query_metrics: crate::metrics::SmgrQueryTimePerTimeline::new( &tenant_shard_id, @@ -2263,6 +2271,8 @@ impl Timeline { dir: None, n_deltas: 0, }), + + aux_file_size_estimator: AuxFileSizeEstimator::new(aux_file_metrics), }; result.repartition_threshold = result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE; diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 7d34e12ca3..8fa67e75c9 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -149,6 +149,7 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( "pageserver_storage_operations_seconds_sum_total", "pageserver_evictions_total", "pageserver_evictions_with_low_residence_duration_total", + "pageserver_aux_file_estimated_size", *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, # "pageserver_directory_entries_count", -- only used if above a certain threshold # "pageserver_broken_tenants_count" -- used only for broken From be0c73f8e7543ca4de8c89b816eb70f792f2685a Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 13 May 2024 17:59:59 +0100 Subject: [PATCH 0767/1571] pageserver: improve API for invoking GC (#7655) ## Problem In https://github.com/neondatabase/neon/pull/7531, I had a test flaky because the GC API endpoint fails if the tenant happens not to be active yet. ## Summary of changes While adding that wait for the tenant to be active, I noticed that this endpoint is kind of strange (spawns a TaskManager task) and has a comment `// TODO: spawning is redundant now, need to hold the gate`, so this PR cleans it up to just run the GC inline while holding a gate. 
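For illustration, here is a minimal, self-contained sketch of that pattern; `ToyGate`, its semaphore-based implementation, and the `immediate_gc` stub below are stand-ins invented for this example, not the pageserver's gate type or the code in this PR. The point it demonstrates: the work runs inline while a guard is held, `close()` can only finish once every outstanding guard has been dropped, and later `enter()` calls fail.

```rust
use std::sync::Arc;
use tokio::sync::{OwnedSemaphorePermit, Semaphore, TryAcquireError};

const GATE_PERMITS: u32 = 1024;

/// Toy gate: a guard is a semaphore permit; closing claims (and never returns) every
/// permit, which can only happen once all outstanding guards have been dropped.
struct ToyGate(Arc<Semaphore>);

impl ToyGate {
    fn new() -> Self {
        ToyGate(Arc::new(Semaphore::new(GATE_PERMITS as usize)))
    }

    /// Fails once the gate has been closed.
    fn enter(&self) -> Result<OwnedSemaphorePermit, TryAcquireError> {
        Arc::clone(&self.0).try_acquire_owned()
    }

    /// Waits until no guards are outstanding, then prevents new ones.
    async fn close(&self) {
        let all = self
            .0
            .acquire_many(GATE_PERMITS)
            .await
            .expect("semaphore is never explicitly closed");
        all.forget();
    }
}

/// Shape of the new handler: do the work inline while the guard is held, instead of
/// spawning a task; a concurrent `close()` waits for this guard to be dropped.
async fn immediate_gc(gate: &ToyGate) -> Result<&'static str, &'static str> {
    let _gate_guard = gate.enter().map_err(|_| "shutting down")?;
    Ok("gc result")
}

#[tokio::main]
async fn main() {
    let gate = ToyGate::new();
    assert_eq!(immediate_gc(&gate).await, Ok("gc result"));
    gate.close().await;
    assert_eq!(immediate_gc(&gate).await, Err("shutting down"));
}
```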
The GC code is updated to avoid assuming it runs inside a task manager task. Avoiding checking the task_mgr cancellation token is safe, because our timeline shutdown always cancels Timeline::cancel. --- pageserver/src/http/routes.rs | 7 +- pageserver/src/tenant.rs | 2 +- pageserver/src/tenant/mgr.rs | 107 +++++++++++++----------------- pageserver/src/tenant/timeline.rs | 2 - 4 files changed, 49 insertions(+), 69 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index a8ca642dc5..2370561756 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1715,12 +1715,7 @@ async fn timeline_gc_handler( let gc_req: TimelineGcRequest = json_request(&mut request).await?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let wait_task_done = mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx)?; - let gc_result = wait_task_done - .await - .context("wait for gc task") - .map_err(ApiError::InternalServerError)? - .map_err(ApiError::InternalServerError)?; + let gc_result = mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx).await?; json_response(StatusCode::OK, gc_result) } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 010e56a899..80d354d79e 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2800,7 +2800,7 @@ impl Tenant { // See comments in [`Tenant::branch_timeline`] for more information about why branch // creation task can run concurrently with timeline's GC iteration. for timeline in gc_timelines { - if task_mgr::is_shutdown_requested() || cancel.is_cancelled() { + if cancel.is_cancelled() { // We were requested to shut down. Stop and return with the progress we // made. break; diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 6be66e99ad..7a3e36bf02 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -2880,86 +2880,73 @@ use { utils::http::error::ApiError, }; -pub(crate) fn immediate_gc( +#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id))] +pub(crate) async fn immediate_gc( tenant_shard_id: TenantShardId, timeline_id: TimelineId, gc_req: TimelineGcRequest, cancel: CancellationToken, ctx: &RequestContext, -) -> Result>, ApiError> { - let guard = TENANTS.read().unwrap(); - - let tenant = guard - .get(&tenant_shard_id) - .cloned() - .with_context(|| format!("tenant {tenant_shard_id}")) - .map_err(|e| ApiError::NotFound(e.into()))?; +) -> Result { + let tenant = { + let guard = TENANTS.read().unwrap(); + guard + .get(&tenant_shard_id) + .cloned() + .with_context(|| format!("tenant {tenant_shard_id}")) + .map_err(|e| ApiError::NotFound(e.into()))? 
+ }; let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon()); // Use tenant's pitr setting let pitr = tenant.get_pitr_interval(); + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + // Run in task_mgr to avoid race with tenant_detach operation - let ctx = ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download); - let (task_done, wait_task_done) = tokio::sync::oneshot::channel(); - let span = info_span!("manual_gc", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id); + let ctx: RequestContext = + ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download); - // TODO: spawning is redundant now, need to hold the gate - task_mgr::spawn( - &tokio::runtime::Handle::current(), - TaskKind::GarbageCollector, - Some(tenant_shard_id), - Some(timeline_id), - &format!("timeline_gc_handler garbage collection run for tenant {tenant_shard_id} timeline {timeline_id}"), - false, - async move { - fail::fail_point!("immediate_gc_task_pre"); + let _gate_guard = tenant.gate.enter().map_err(|_| ApiError::ShuttingDown)?; - #[allow(unused_mut)] - let mut result = tenant - .gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx) - .await; - // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it - // better once the types support it. + fail::fail_point!("immediate_gc_task_pre"); - #[cfg(feature = "testing")] - { - // we need to synchronize with drop completion for python tests without polling for - // log messages - if let Ok(result) = result.as_mut() { - let mut js = tokio::task::JoinSet::new(); - for layer in std::mem::take(&mut result.doomed_layers) { - js.spawn(layer.wait_drop()); - } - tracing::info!(total = js.len(), "starting to wait for the gc'd layers to be dropped"); - while let Some(res) = js.join_next().await { - res.expect("wait_drop should not panic"); - } - } + #[allow(unused_mut)] + let mut result = tenant + .gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx) + .await; + // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it + // better once the types support it. 
- let timeline = tenant.get_timeline(timeline_id, false).ok(); - let rtc = timeline.as_ref().and_then(|x| x.remote_client.as_ref()); - - if let Some(rtc) = rtc { - // layer drops schedule actions on remote timeline client to actually do the - // deletions; don't care about the shutdown error, just exit fast - drop(rtc.wait_completion().await); - } + #[cfg(feature = "testing")] + { + // we need to synchronize with drop completion for python tests without polling for + // log messages + if let Ok(result) = result.as_mut() { + let mut js = tokio::task::JoinSet::new(); + for layer in std::mem::take(&mut result.doomed_layers) { + js.spawn(layer.wait_drop()); } - - match task_done.send(result) { - Ok(_) => (), - Err(result) => error!("failed to send gc result: {result:?}"), + tracing::info!( + total = js.len(), + "starting to wait for the gc'd layers to be dropped" + ); + while let Some(res) = js.join_next().await { + res.expect("wait_drop should not panic"); } - Ok(()) } - .instrument(span) - ); - // drop the guard until after we've spawned the task so that timeline shutdown will wait for the task - drop(guard); + let timeline = tenant.get_timeline(timeline_id, false).ok(); + let rtc = timeline.as_ref().and_then(|x| x.remote_client.as_ref()); - Ok(wait_task_done) + if let Some(rtc) = rtc { + // layer drops schedule actions on remote timeline client to actually do the + // deletions; don't care about the shutdown error, just exit fast + drop(rtc.wait_completion().await); + } + } + + result.map_err(ApiError::InternalServerError) } #[cfg(test)] diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 01f354b9e8..9ee24a4ff0 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4656,11 +4656,9 @@ impl Timeline { pub(super) async fn gc(&self) -> anyhow::Result { // this is most likely the background tasks, but it might be the spawned task from // immediate_gc - let cancel = crate::task_mgr::shutdown_token(); let _g = tokio::select! { guard = self.gc_lock.lock() => guard, _ = self.cancel.cancelled() => return Ok(GcResult::default()), - _ = cancel.cancelled() => return Ok(GcResult::default()), }; let timer = self.metrics.garbage_collect_histo.start_timer(); From 1412e9b3e827435bab6de0eeea0a3f9d721273c0 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Mon, 13 May 2024 18:24:12 +0100 Subject: [PATCH 0768/1571] pagectl: fix diagrams generation for paths containing generations (#7739) ## Problem When layer paths include generations, the lsn parsing does not work and `pagectl` errors out. ## Summary of changes If the last "word" of the layer path contains 8 characters, discard it for the purpose of lsn parsing. 
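To make that concrete, here is a standalone sketch of the parsing rule; the `parse_lsn_range` helper and the layer file name in `main` are made up for illustration and are not taken from the `pagectl` sources. The part after `__` is split on `-`, a trailing 8-character component (the generation suffix) is dropped, and the remaining fields are read as the LSN range.

```rust
/// Illustrative only: mirrors the parsing rule described above, not the actual code.
fn parse_lsn_range(name: &str) -> (String, String) {
    let split: Vec<&str> = name.split("__").collect();
    let mut lsns: Vec<&str> = split[1].split('-').collect();

    // A trailing 8-character component is a generation suffix, not an LSN: drop it.
    if lsns.last().expect("non-empty").len() == 8 {
        lsns.pop();
    }
    // A single remaining LSN (image layer) is treated as an empty range.
    if lsns.len() == 1 {
        lsns.push(lsns[0]);
    }
    (lsns[0].to_string(), lsns[1].to_string())
}

fn main() {
    // Hypothetical delta layer file name with a "-00000001" generation suffix appended.
    let name = concat!(
        "000000000000000000000000000000000000-030000000000000000000000000000000003",
        "__0000000001696070-00000000016960E8-00000001"
    );
    assert_eq!(
        parse_lsn_range(name),
        ("0000000001696070".to_string(), "00000000016960E8".to_string())
    );
}
```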
--- pageserver/ctl/src/draw_timeline_dir.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pageserver/ctl/src/draw_timeline_dir.rs b/pageserver/ctl/src/draw_timeline_dir.rs index d8082f8ab4..4dff8af1fc 100644 --- a/pageserver/ctl/src/draw_timeline_dir.rs +++ b/pageserver/ctl/src/draw_timeline_dir.rs @@ -83,6 +83,11 @@ fn parse_filename(name: &str) -> (Range, Range) { let split: Vec<&str> = name.split("__").collect(); let keys: Vec<&str> = split[0].split('-').collect(); let mut lsns: Vec<&str> = split[1].split('-').collect(); + + if lsns.last().expect("should").len() == 8 { + lsns.pop(); + } + if lsns.len() == 1 { lsns.push(lsns[0]); } From 972470b1745146ad28c833385d34f8d413e34d1d Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 13 May 2024 18:38:30 +0100 Subject: [PATCH 0769/1571] pageserver: use adaptive concurrency in secondary layer downloads (#7675) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem Secondary downloads are a low priority task, and intentionally do not try to max out download speeds. This is almost always fine when they are used through the life of a tenant shard as a continuous "trickle" of background downloads. However, there are sometimes circumstances where we would like to populate a secondary location as fast as we can, within the constraint that we don't want to impact the activity of attached tenants: - During node removal, where we will need to create replacements for secondary locations on the node being removed - After a shard split, we need new secondary locations for the new shards to populate before the shards can be migrated to their final location. ## Summary of changes - Add an activity() function to the remote storage interface, enabling callers to query how busy the remote storage backend is - In the secondary download code, use a very modest amount of concurrency, driven by the remote storage's state: we only use concurrency if the remote storage semaphore is 75% free, and scale the amount of concurrency used within that range. This is not a super clever form of prioritization, but it should accomplish the key goals: - Enable secondary downloads to happen faster when the system is idle - Make secondary downloads a much lower priority than attached tenants when the remote storage is busy. 
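The clamping rule boils down to a few lines; the following is a condensed, standalone sketch of it (the actual implementation, its constants, and its unit tests are in the `downloader.rs` changes further down in this patch):

```rust
const MAX_LAYER_CONCURRENCY: usize = 16;
const MIN_LAYER_CONCURRENCY: usize = 1;

/// Map remote-storage read capacity to a download concurrency: at or below 75% free
/// permits, stay at the minimum; above it, interpolate linearly up to the maximum.
fn layer_concurrency(read_available: usize, read_total: usize) -> usize {
    let clamp_at = (read_total * 3) / 4;
    if read_available > clamp_at {
        (MAX_LAYER_CONCURRENCY * (read_available - clamp_at)) / (read_total - clamp_at)
    } else {
        MIN_LAYER_CONCURRENCY
    }
}

fn main() {
    assert_eq!(layer_concurrency(16, 16), 16); // idle storage: full concurrency
    assert_eq!(layer_concurrency(14, 16), 8); // midpoint of the interpolation range
    assert_eq!(layer_concurrency(12, 16), 1); // at the 75% threshold: minimum
    assert_eq!(layer_concurrency(0, 16), 1); // saturated storage: minimum
}
```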
--------- Co-authored-by: Arpad Müller --- libs/remote_storage/src/azure_blob.rs | 5 + libs/remote_storage/src/lib.rs | 34 +++ libs/remote_storage/src/local_fs.rs | 14 +- libs/remote_storage/src/s3_bucket.rs | 8 +- libs/remote_storage/src/simulate_failures.rs | 6 +- pageserver/src/tenant/secondary/downloader.rs | 238 +++++++++++++----- 6 files changed, 238 insertions(+), 67 deletions(-) diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 24c1248304..220d4ef115 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -29,6 +29,7 @@ use http_types::{StatusCode, Url}; use tokio_util::sync::CancellationToken; use tracing::debug; +use crate::RemoteStorageActivity; use crate::{ error::Cancelled, s3_bucket::RequestKind, AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata, @@ -525,6 +526,10 @@ impl RemoteStorage for AzureBlobStorage { // https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview Err(TimeTravelError::Unimplemented) } + + fn activity(&self) -> RemoteStorageActivity { + self.concurrency_limiter.activity() + } } pin_project_lite::pin_project! { diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 708662f20f..f024021507 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -263,6 +263,17 @@ pub trait RemoteStorage: Send + Sync + 'static { done_if_after: SystemTime, cancel: &CancellationToken, ) -> Result<(), TimeTravelError>; + + /// Query how busy we currently are: may be used by callers which wish to politely + /// back off if there are already a lot of operations underway. + fn activity(&self) -> RemoteStorageActivity; +} + +pub struct RemoteStorageActivity { + pub read_available: usize, + pub read_total: usize, + pub write_available: usize, + pub write_total: usize, } /// DownloadStream is sensitive to the timeout and cancellation used with the original @@ -444,6 +455,15 @@ impl GenericRemoteStorage> { } } } + + pub fn activity(&self) -> RemoteStorageActivity { + match self { + Self::LocalFs(s) => s.activity(), + Self::AwsS3(s) => s.activity(), + Self::AzureBlob(s) => s.activity(), + Self::Unreliable(s) => s.activity(), + } + } } impl GenericRemoteStorage { @@ -774,6 +794,9 @@ struct ConcurrencyLimiter { // The helps to ensure we don't exceed the thresholds. 
write: Arc, read: Arc, + + write_total: usize, + read_total: usize, } impl ConcurrencyLimiter { @@ -802,10 +825,21 @@ impl ConcurrencyLimiter { Arc::clone(self.for_kind(kind)).acquire_owned().await } + fn activity(&self) -> RemoteStorageActivity { + RemoteStorageActivity { + read_available: self.read.available_permits(), + read_total: self.read_total, + write_available: self.write.available_permits(), + write_total: self.write_total, + } + } + fn new(limit: usize) -> ConcurrencyLimiter { Self { read: Arc::new(Semaphore::new(limit)), write: Arc::new(Semaphore::new(limit)), + read_total: limit, + write_total: limit, } } } diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 1f7bcfc982..f12f6590a3 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -23,8 +23,8 @@ use tokio_util::{io::ReaderStream, sync::CancellationToken}; use utils::crashsafe::path_with_suffix_extension; use crate::{ - Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel, - REMOTE_STORAGE_PREFIX_SEPARATOR, + Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorageActivity, + TimeTravelError, TimeoutOrCancel, REMOTE_STORAGE_PREFIX_SEPARATOR, }; use super::{RemoteStorage, StorageMetadata}; @@ -605,6 +605,16 @@ impl RemoteStorage for LocalFs { ) -> Result<(), TimeTravelError> { Err(TimeTravelError::Unimplemented) } + + fn activity(&self) -> RemoteStorageActivity { + // LocalFS has no concurrency limiting: give callers the impression that plenty of units are available + RemoteStorageActivity { + read_available: 16, + read_total: 16, + write_available: 16, + write_total: 16, + } + } } fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf { diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index c3d6c75e20..0f6772b274 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -47,8 +47,8 @@ use utils::backoff; use super::StorageMetadata; use crate::{ error::Cancelled, support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError, - Listing, ListingMode, RemotePath, RemoteStorage, S3Config, TimeTravelError, TimeoutOrCancel, - MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR, + Listing, ListingMode, RemotePath, RemoteStorage, RemoteStorageActivity, S3Config, + TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR, }; pub(super) mod metrics; @@ -975,6 +975,10 @@ impl RemoteStorage for S3Bucket { } Ok(()) } + + fn activity(&self) -> RemoteStorageActivity { + self.concurrency_limiter.activity() + } } /// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`]. 
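For context, the `activity()` snapshot added above is just a read of the limiters' free permit counts. Below is a minimal standalone illustration of the idea, using a `Limiter` type invented for this example rather than the `remote_storage` crate's `ConcurrencyLimiter`:

```rust
use std::sync::Arc;
use tokio::sync::Semaphore;

/// Trimmed-down stand-in: only the read side, reporting (available, total) permits.
struct Limiter {
    read: Arc<Semaphore>,
    read_total: usize,
}

impl Limiter {
    fn new(limit: usize) -> Self {
        Self {
            read: Arc::new(Semaphore::new(limit)),
            read_total: limit,
        }
    }

    /// Non-blocking snapshot of current capacity; does not consume any permits.
    fn activity(&self) -> (usize, usize) {
        (self.read.available_permits(), self.read_total)
    }
}

#[tokio::main]
async fn main() {
    let limiter = Limiter::new(16);
    // Four reads in flight: the snapshot reflects the remaining capacity.
    let _in_flight = limiter.read.acquire_many(4).await.unwrap();
    assert_eq!(limiter.activity(), (12, 16));
}
```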
diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index c467a2d196..66522e04ca 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -12,7 +12,7 @@ use tokio_util::sync::CancellationToken; use crate::{ Download, DownloadError, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorage, - StorageMetadata, TimeTravelError, + RemoteStorageActivity, StorageMetadata, TimeTravelError, }; pub struct UnreliableWrapper { @@ -213,4 +213,8 @@ impl RemoteStorage for UnreliableWrapper { .time_travel_recover(prefix, timestamp, done_if_after, cancel) .await } + + fn activity(&self) -> RemoteStorageActivity { + self.inner.activity() + } } diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 2a8f83be95..c28e041fa2 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -45,10 +45,10 @@ use crate::tenant::{ use camino::Utf8PathBuf; use chrono::format::{DelayedFormat, StrftimeItems}; -use futures::Future; +use futures::{Future, StreamExt}; use pageserver_api::models::SecondaryProgress; use pageserver_api::shard::TenantShardId; -use remote_storage::{DownloadError, Etag, GenericRemoteStorage}; +use remote_storage::{DownloadError, Etag, GenericRemoteStorage, RemoteStorageActivity}; use tokio_util::sync::CancellationToken; use tracing::{info_span, instrument, warn, Instrument}; @@ -71,6 +71,12 @@ use super::{ /// `` const DOWNLOAD_FRESHEN_INTERVAL: Duration = Duration::from_millis(60000); +/// Range of concurrency we may use when downloading layers within a timeline. This is independent +/// for each tenant we're downloading: the concurrency of _tenants_ is defined separately in +/// `PageServerConf::secondary_download_concurrency` +const MAX_LAYER_CONCURRENCY: usize = 16; +const MIN_LAYER_CONCURRENCY: usize = 1; + pub(super) async fn downloader_task( tenant_manager: Arc, remote_storage: GenericRemoteStorage, @@ -79,14 +85,15 @@ pub(super) async fn downloader_task( cancel: CancellationToken, root_ctx: RequestContext, ) { - let concurrency = tenant_manager.get_conf().secondary_download_concurrency; + // How many tenants' secondary download operations we will run concurrently + let tenant_concurrency = tenant_manager.get_conf().secondary_download_concurrency; let generator = SecondaryDownloader { tenant_manager, remote_storage, root_ctx, }; - let mut scheduler = Scheduler::new(generator, concurrency); + let mut scheduler = Scheduler::new(generator, tenant_concurrency); scheduler .run(command_queue, background_jobs_can_start, cancel) @@ -792,6 +799,8 @@ impl<'a> TenantDownloader<'a> { tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len()); + let mut download_futs = Vec::new(); + // Download heatmap layers that are not present on local disk, or update their // access time if they are already present. 
for layer in timeline.layers { @@ -874,67 +883,33 @@ impl<'a> TenantDownloader<'a> { } } - // Failpoint for simulating slow remote storage - failpoint_support::sleep_millis_async!( - "secondary-layer-download-sleep", - &self.secondary_state.cancel - ); - - // Note: no backoff::retry wrapper here because download_layer_file does its own retries internally - let downloaded_bytes = match download_layer_file( - self.conf, - self.remote_storage, - *tenant_shard_id, - timeline.timeline_id, - &layer.name, - &LayerFileMetadata::from(&layer.metadata), - &self.secondary_state.cancel, + download_futs.push(self.download_layer( + tenant_shard_id, + &timeline.timeline_id, + layer, ctx, - ) - .await - { - Ok(bytes) => bytes, - Err(DownloadError::NotFound) => { - // A heatmap might be out of date and refer to a layer that doesn't exist any more. - // This is harmless: continue to download the next layer. It is expected during compaction - // GC. - tracing::debug!( - "Skipped downloading missing layer {}, raced with compaction/gc?", - layer.name - ); - continue; + )); + } + + // Break up layer downloads into chunks, so that for each chunk we can re-check how much + // concurrency to use based on activity level of remote storage. + while !download_futs.is_empty() { + let chunk = + download_futs.split_off(download_futs.len().saturating_sub(MAX_LAYER_CONCURRENCY)); + + let concurrency = Self::layer_concurrency(self.remote_storage.activity()); + + let mut result_stream = futures::stream::iter(chunk).buffered(concurrency); + let mut result_stream = std::pin::pin!(result_stream); + while let Some(result) = result_stream.next().await { + match result { + Err(e) => return Err(e), + Ok(None) => { + // No error, but we didn't download the layer. Don't mark it touched + } + Ok(Some(layer)) => touched.push(layer), } - Err(e) => return Err(e.into()), - }; - - if downloaded_bytes != layer.metadata.file_size { - let local_path = local_layer_path( - self.conf, - tenant_shard_id, - &timeline.timeline_id, - &layer.name, - &layer.metadata.generation, - ); - - tracing::warn!( - "Downloaded layer {} with unexpected size {} != {}. Removing download.", - layer.name, - downloaded_bytes, - layer.metadata.file_size - ); - - tokio::fs::remove_file(&local_path) - .await - .or_else(fs_ext::ignore_not_found)?; - } else { - tracing::info!("Downloaded layer {}, size {}", layer.name, downloaded_bytes); - let mut progress = self.secondary_state.progress.lock().unwrap(); - progress.bytes_downloaded += downloaded_bytes; - progress.layers_downloaded += 1; } - - SECONDARY_MODE.download_layer.inc(); - touched.push(layer) } // Write updates to state to record layers we just downloaded or touched. 
@@ -966,6 +941,90 @@ impl<'a> TenantDownloader<'a> { Ok(()) } + + async fn download_layer( + &self, + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, + layer: HeatMapLayer, + ctx: &RequestContext, + ) -> Result, UpdateError> { + // Failpoint for simulating slow remote storage + failpoint_support::sleep_millis_async!( + "secondary-layer-download-sleep", + &self.secondary_state.cancel + ); + + // Note: no backoff::retry wrapper here because download_layer_file does its own retries internally + let downloaded_bytes = match download_layer_file( + self.conf, + self.remote_storage, + *tenant_shard_id, + *timeline_id, + &layer.name, + &LayerFileMetadata::from(&layer.metadata), + &self.secondary_state.cancel, + ctx, + ) + .await + { + Ok(bytes) => bytes, + Err(DownloadError::NotFound) => { + // A heatmap might be out of date and refer to a layer that doesn't exist any more. + // This is harmless: continue to download the next layer. It is expected during compaction + // GC. + tracing::debug!( + "Skipped downloading missing layer {}, raced with compaction/gc?", + layer.name + ); + return Ok(None); + } + Err(e) => return Err(e.into()), + }; + + if downloaded_bytes != layer.metadata.file_size { + let local_path = local_layer_path( + self.conf, + tenant_shard_id, + timeline_id, + &layer.name, + &layer.metadata.generation, + ); + + tracing::warn!( + "Downloaded layer {} with unexpected size {} != {}. Removing download.", + layer.name, + downloaded_bytes, + layer.metadata.file_size + ); + + tokio::fs::remove_file(&local_path) + .await + .or_else(fs_ext::ignore_not_found)?; + } else { + tracing::info!("Downloaded layer {}, size {}", layer.name, downloaded_bytes); + let mut progress = self.secondary_state.progress.lock().unwrap(); + progress.bytes_downloaded += downloaded_bytes; + progress.layers_downloaded += 1; + } + + SECONDARY_MODE.download_layer.inc(); + + Ok(Some(layer)) + } + + /// Calculate the currently allowed parallelism of layer download tasks, based on activity level of the remote storage + fn layer_concurrency(activity: RemoteStorageActivity) -> usize { + // When less than 75% of units are available, use minimum concurrency. Else, do a linear mapping + // of our concurrency range to the units available within the remaining 25%. 
+ let clamp_at = (activity.read_total * 3) / 4; + if activity.read_available > clamp_at { + (MAX_LAYER_CONCURRENCY * (activity.read_available - clamp_at)) + / (activity.read_total - clamp_at) + } else { + MIN_LAYER_CONCURRENCY + } + } } /// Scan local storage and build up Layer objects based on the metadata in a HeatMapTimeline @@ -1092,3 +1151,58 @@ async fn init_timeline_state( detail } + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn layer_concurrency() { + // Totally idle + assert_eq!( + TenantDownloader::layer_concurrency(RemoteStorageActivity { + read_available: 16, + read_total: 16, + write_available: 16, + write_total: 16 + }), + MAX_LAYER_CONCURRENCY + ); + + // Totally busy + assert_eq!( + TenantDownloader::layer_concurrency(RemoteStorageActivity { + read_available: 0, + read_total: 16, + + write_available: 16, + write_total: 16 + }), + MIN_LAYER_CONCURRENCY + ); + + // Edge of the range at which we interpolate + assert_eq!( + TenantDownloader::layer_concurrency(RemoteStorageActivity { + read_available: 12, + read_total: 16, + + write_available: 16, + write_total: 16 + }), + MIN_LAYER_CONCURRENCY + ); + + // Midpoint of the range in which we interpolate + assert_eq!( + TenantDownloader::layer_concurrency(RemoteStorageActivity { + read_available: 14, + read_total: 16, + + write_available: 16, + write_total: 16 + }), + MAX_LAYER_CONCURRENCY / 2 + ); + } +} From 9ffb8523597d846b6afce7eb855b820b79efcbd6 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Mon, 13 May 2024 17:14:08 -0400 Subject: [PATCH 0770/1571] fix(test): ensure compatibility test uses the correct compute node (#7741) Use the old compute node for compat tests. --------- Signed-off-by: Alex Chi Z --- test_runner/fixtures/neon_fixtures.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 0c2b70202e..8432655370 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -702,8 +702,7 @@ class NeonEnvBuilder: config["branch_name_mappings"] = snapshot_config["branch_name_mappings"] # Update the config with new neon + postgres path in case of compat test - # FIXME: overriding pg_distrib_dir cause storage controller fail to start - # config["pg_distrib_dir"] = str(self.pg_distrib_dir) + config["pg_distrib_dir"] = str(self.pg_distrib_dir) config["neon_distrib_dir"] = str(self.neon_binpath) with (self.repo_dir / "config").open("w") as f: From 3a6fa768286fced1787b2320d4878e1b961b5a0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 14 May 2024 01:13:25 +0200 Subject: [PATCH 0771/1571] Tiered compaction: cut deltas along lsn as well if needed (#7671) In general, tiered compaction is splitting delta layers along the key dimension, but this can only continue until a single key is reached: if the changes from a single key don't fit into one layer file, we used to create layer files of unbounded sizes. This patch implements the method listed as TODO/FIXME in the source code. It does the following things: * Make `accum_key_values` take the target size and if one key's modifications exceed it, make it fill `partition_lsns`, a vector of lsns to use for partitioning. * Have `retile_deltas` use that `partition_lsns` to create delta layers separated by lsn. * Adjust the `test_many_updates_for_single_key` to allow layer files below 0.5 the target size. 
This situation can create arbitarily small layer files: The amount of data is arbitrary that sits between having just cut a new delta, and then stumbling upon the key that needs to be split along lsn. This data will end up in a dedicated layer and it can be arbitrarily small. * Ignore single-key delta layers for depth calculation: in theory we might have only single-key delta layers in a tier, and this might confuse depth calculation as well, but this should be unlikely. Fixes #7243 Part of #7554 --------- Co-authored-by: Heikki Linnakangas --- pageserver/compaction/src/compact_tiered.rs | 107 ++++++++++++++----- pageserver/compaction/src/helpers.rs | 23 +++- pageserver/compaction/src/identify_levels.rs | 6 ++ pageserver/compaction/tests/tests.rs | 8 +- 4 files changed, 109 insertions(+), 35 deletions(-) diff --git a/pageserver/compaction/src/compact_tiered.rs b/pageserver/compaction/src/compact_tiered.rs index 20e9cf2196..a8f184af24 100644 --- a/pageserver/compaction/src/compact_tiered.rs +++ b/pageserver/compaction/src/compact_tiered.rs @@ -530,8 +530,6 @@ where // If we have accumulated only a narrow band of keyspace, create an // image layer. Otherwise write a delta layer. - // FIXME: deal with the case of lots of values for same key - // FIXME: we are ignoring images here. Did we already divide the work // so that we won't encounter them here? @@ -550,39 +548,94 @@ where let mut new_jobs = Vec::new(); // Slide a window through the keyspace - let mut key_accum = std::pin::pin!(accum_key_values(key_value_stream)); + let mut key_accum = + std::pin::pin!(accum_key_values(key_value_stream, self.target_file_size)); let mut all_in_window: bool = false; let mut window = Window::new(); + + // Helper function to create a job for a new delta layer with given key-lsn + // rectangle. + let create_delta_job = |key_range, lsn_range: &Range, new_jobs: &mut Vec<_>| { + // The inputs for the job are all the input layers of the original job that + // overlap with the rectangle. + let batch_layers: Vec = job + .input_layers + .iter() + .filter(|layer_id| { + overlaps_with(self.layers[layer_id.0].layer.key_range(), &key_range) + }) + .cloned() + .collect(); + assert!(!batch_layers.is_empty()); + new_jobs.push(CompactionJob { + key_range, + lsn_range: lsn_range.clone(), + strategy: CompactionStrategy::CreateDelta, + input_layers: batch_layers, + completed: false, + }); + }; + loop { - if all_in_window && window.elems.is_empty() { + if all_in_window && window.is_empty() { // All done! break; } + + // If we now have enough keyspace for next delta layer in the window, create a + // new delta layer if let Some(key_range) = window.choose_next_delta(self.target_file_size, !all_in_window) { - let batch_layers: Vec = job - .input_layers - .iter() - .filter(|layer_id| { - overlaps_with(self.layers[layer_id.0].layer.key_range(), &key_range) - }) - .cloned() - .collect(); - assert!(!batch_layers.is_empty()); - new_jobs.push(CompactionJob { - key_range, - lsn_range: job.lsn_range.clone(), - strategy: CompactionStrategy::CreateDelta, - input_layers: batch_layers, - completed: false, - }); - } else { - assert!(!all_in_window); - if let Some(next_key) = key_accum.next().await.transpose()? { - window.feed(next_key.key, next_key.size); - } else { + create_delta_job(key_range, &job.lsn_range, &mut new_jobs); + continue; + } + assert!(!all_in_window); + + // Process next key in the key space + match key_accum.next().await.transpose()? 
{ + None => { all_in_window = true; } + Some(next_key) if next_key.partition_lsns.is_empty() => { + // Normal case: extend the window by the key + window.feed(next_key.key, next_key.size); + } + Some(next_key) => { + // A key with too large size impact for a single delta layer. This + // case occurs if you make a huge number of updates for a single key. + // + // Drain the window with has_more = false to make a clean cut before + // the key, and then make dedicated delta layers for the single key. + // + // We cannot cluster the key with the others, because we don't want + // layer files to overlap with each other in the lsn,key space (no + // overlaps for the rectangles). + let key = next_key.key; + debug!("key {key} with size impact larger than the layer size"); + while !window.is_empty() { + let has_more = false; + let key_range = window.choose_next_delta(self.target_file_size, has_more) + .expect("with has_more==false, choose_next_delta always returns something for a non-empty Window"); + create_delta_job(key_range, &job.lsn_range, &mut new_jobs); + } + + // Not really required: but here for future resilience: + // We make a "gap" here, so any structure the window holds should + // probably be reset. + window = Window::new(); + + let mut prior_lsn = job.lsn_range.start; + let mut lsn_ranges = Vec::new(); + for (lsn, _size) in next_key.partition_lsns.iter() { + lsn_ranges.push(prior_lsn..*lsn); + prior_lsn = *lsn; + } + lsn_ranges.push(prior_lsn..job.lsn_range.end); + for lsn_range in lsn_ranges { + let key_range = key..key.next(); + create_delta_job(key_range, &lsn_range, &mut new_jobs); + } + } } } @@ -803,6 +856,10 @@ where self.elems.front().unwrap().accum_size - self.splitoff_size } + fn is_empty(&self) -> bool { + self.elems.is_empty() + } + fn commit_upto(&mut self, mut upto: usize) { while upto > 1 { let popped = self.elems.pop_front().unwrap(); diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs index 06454ee1d0..2c922b0a49 100644 --- a/pageserver/compaction/src/helpers.rs +++ b/pageserver/compaction/src/helpers.rs @@ -235,9 +235,14 @@ pub struct KeySize { pub key: K, pub num_values: u64, pub size: u64, + /// The lsns to partition at (if empty then no per-lsn partitioning) + pub partition_lsns: Vec<(Lsn, u64)>, } -pub fn accum_key_values<'a, I, K, D, E>(input: I) -> impl Stream, E>> +pub fn accum_key_values<'a, I, K, D, E>( + input: I, + target_size: u64, +) -> impl Stream, E>> where K: Eq + PartialOrd + Display + Copy, I: Stream>, @@ -249,25 +254,35 @@ where if let Some(first) = input.next().await { let first = first?; + let mut part_size = first.size(); let mut accum: KeySize = KeySize { key: first.key(), num_values: 1, - size: first.size(), + size: part_size, + partition_lsns: Vec::new(), }; let mut last_key = accum.key; while let Some(this) = input.next().await { let this = this?; if this.key() == accum.key { - accum.size += this.size(); + let add_size = this.size(); + if part_size + add_size > target_size { + accum.partition_lsns.push((this.lsn(), part_size)); + part_size = 0; + } + part_size += add_size; + accum.size += add_size; accum.num_values += 1; } else { assert!(last_key <= accum.key, "last_key={last_key} <= accum.key={}", accum.key); last_key = accum.key; yield accum; + part_size = this.size(); accum = KeySize { key: this.key(), num_values: 1, - size: this.size(), + size: part_size, + partition_lsns: Vec::new(), }; } } diff --git a/pageserver/compaction/src/identify_levels.rs b/pageserver/compaction/src/identify_levels.rs index 
98dd46925c..1853afffdd 100644 --- a/pageserver/compaction/src/identify_levels.rs +++ b/pageserver/compaction/src/identify_levels.rs @@ -184,6 +184,12 @@ impl Level { } let mut events: Vec> = Vec::new(); for (idx, l) in self.layers.iter().enumerate() { + let key_range = l.key_range(); + if key_range.end == key_range.start.next() && l.is_delta() { + // Ignore single-key delta layers as they can be stacked on top of each other + // as that is the only way to cut further. + continue; + } events.push(Event { key: l.key_range().start, layer_idx: idx, diff --git a/pageserver/compaction/tests/tests.rs b/pageserver/compaction/tests/tests.rs index 7aa20e6863..bd8b54a286 100644 --- a/pageserver/compaction/tests/tests.rs +++ b/pageserver/compaction/tests/tests.rs @@ -20,10 +20,6 @@ pub(crate) fn setup_logging() { /// even if we produce an extremely narrow delta layer, spanning just that one /// key, we still too many records to fit in the target file size. We need to /// split in the LSN dimension too in that case. -/// -/// TODO: The code to avoid this problem has not been implemented yet! So the -/// assertion currently fails, but we need to make it not fail. -#[ignore] #[tokio::test] async fn test_many_updates_for_single_key() { setup_logging(); @@ -43,9 +39,9 @@ async fn test_many_updates_for_single_key() { } for l in executor.live_layers.iter() { assert!(l.file_size() < executor.target_file_size * 2); - // sanity check that none of the delta layers are stupidly small either + // Sanity check that none of the delta layers are empty either. if l.is_delta() { - assert!(l.file_size() > executor.target_file_size / 2); + assert!(l.file_size() > 0); } } } From ba20752b7678179f5db0248ce98f56d225684c77 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 13 May 2024 13:17:27 +0300 Subject: [PATCH 0772/1571] Refactor the request LSNs to a separate struct (#7708) We had a lot of code that passed around the two LSNs that are associated with each GetPage request. Introduce a new struct to encapsulate them. I'm about to add a third LSN to the struct in the next commit, this is a mechanical refactoring in preparation for that. --- pgxn/neon/pagestore_client.h | 24 ++++- pgxn/neon/pagestore_smgr.c | 179 ++++++++++++++------------------ pgxn/neon_test_utils/neontest.c | 24 ++--- 3 files changed, 111 insertions(+), 116 deletions(-) diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 7709ab9d42..1334a04f9a 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -237,18 +237,38 @@ extern void neon_zeroextend(SMgrRelation reln, ForkNumber forknum, extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); +/* + * LSN values associated with each request to the pageserver + */ +typedef struct +{ + /* + * 'request_lsn' is the main value that determines which page version to + * fetch. + */ + XLogRecPtr request_lsn; + + /* + * A hint to the pageserver that the requested page hasn't been modified + * between this LSN and 'request_lsn'. That allows the pageserver to + * return the page faster, without waiting for 'request_lsn' to arrive in + * the pageserver, as long as 'not_modified_since' has arrived. 
+ */ + XLogRecPtr not_modified_since; +} neon_request_lsns; + #if PG_MAJORVERSION_NUM < 16 extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer); extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer); + neon_request_lsns request_lsns, char *buffer); extern void neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); #else extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void *buffer); extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer); + neon_request_lsns request_lsns, void *buffer); extern void neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync); #endif diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 44ecdbd9aa..8fcbbfe54d 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -168,8 +168,7 @@ typedef enum PrefetchStatus typedef struct PrefetchRequest { BufferTag buftag; /* must be first entry in the struct */ - XLogRecPtr request_lsn; - XLogRecPtr not_modified_since; + neon_request_lsns request_lsns; NeonResponse *response; /* may be null */ PrefetchStatus status; shardno_t shard_no; @@ -271,16 +270,15 @@ static PrefetchState *MyPState; static bool compact_prefetch_buffers(void); static void consume_prefetch_responses(void); -static uint64 prefetch_register_buffer(BufferTag tag, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since); +static uint64 prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns); static bool prefetch_read(PrefetchRequest *slot); -static void prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since); +static void prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns); static bool prefetch_wait_for(uint64 ring_index); static void prefetch_cleanup_trailing_unused(void); static inline void prefetch_set_unused(uint64 ring_index); -static void neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, - XLogRecPtr *request_lsn, XLogRecPtr *not_modified_since); -static bool neon_prefetch_response_usable(XLogRecPtr request_lsn, XLogRecPtr not_modified_since, +static neon_request_lsns neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno); +static bool neon_prefetch_response_usable(neon_request_lsns request_lsns, PrefetchRequest *slot); static bool @@ -338,8 +336,7 @@ compact_prefetch_buffers(void) target_slot->shard_no = source_slot->shard_no; target_slot->status = source_slot->status; target_slot->response = source_slot->response; - target_slot->request_lsn = source_slot->request_lsn; - target_slot->not_modified_since = source_slot->not_modified_since; + target_slot->request_lsns = source_slot->request_lsns; target_slot->my_ring_index = empty_ring_index; prfh_delete(MyPState->prf_hash, source_slot); @@ -358,8 +355,9 @@ compact_prefetch_buffers(void) }; source_slot->response = NULL; source_slot->my_ring_index = 0; - source_slot->request_lsn = InvalidXLogRecPtr; - source_slot->not_modified_since = InvalidXLogRecPtr; + source_slot->request_lsns = (neon_request_lsns) { + InvalidXLogRecPtr, InvalidXLogRecPtr + }; /* update bookkeeping */ n_moved++; @@ 
-689,7 +687,7 @@ prefetch_set_unused(uint64 ring_index) * prefetch_wait_for(). */ static void -prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since) +prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns) { bool found; NeonGetPageRequest request = { @@ -700,23 +698,14 @@ prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRe .blkno = slot->buftag.blockNum, }; - Assert(((force_request_lsn != NULL) == (force_not_modified_since != NULL))); - - if (force_request_lsn) - { - request.req.lsn = *force_request_lsn; - request.req.not_modified_since = *force_not_modified_since; - } + if (force_request_lsns) + slot->request_lsns = *force_request_lsns; else - { - neon_get_request_lsn(BufTagGetNRelFileInfo(slot->buftag), - slot->buftag.forkNum, - slot->buftag.blockNum, - &request.req.lsn, - &request.req.not_modified_since); - } - slot->request_lsn = request.req.lsn; - slot->not_modified_since = request.req.not_modified_since; + slot->request_lsns = neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag), + slot->buftag.forkNum, + slot->buftag.blockNum); + request.req.lsn = slot->request_lsns.request_lsn; + request.req.not_modified_since = slot->request_lsns.not_modified_since; Assert(slot->response == NULL); Assert(slot->my_ring_index == MyPState->ring_unused); @@ -742,25 +731,22 @@ prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRe * * Register that we may want the contents of BufferTag in the near future. * - * If force_request_lsn and force_not_modified_since are not NULL, those - * values are sent to the pageserver. If they are NULL, we utilize the - * lastWrittenLsn -infrastructure to fill them in. + * If force_request_lsns is not NULL, those values are sent to the + * pageserver. If NULL, we utilize the lastWrittenLsn -infrastructure + * to calculate the LSNs to send. * * NOTE: this function may indirectly update MyPState->pfs_hash; which * invalidates any active pointers into the hash table. */ static uint64 -prefetch_register_buffer(BufferTag tag, XLogRecPtr *force_request_lsn, - XLogRecPtr *force_not_modified_since) +prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns) { uint64 ring_index; PrefetchRequest req; PrefetchRequest *slot; PrfHashEntry *entry; - Assert(((force_request_lsn != NULL) == (force_not_modified_since != NULL))); - /* use an intermediate PrefetchRequest struct to ensure correct alignment */ req.buftag = tag; Retry: @@ -781,10 +767,9 @@ Retry: * If the caller specified a request LSN to use, only accept prefetch * responses that satisfy that request. 
*/ - if (force_request_lsn) + if (force_request_lsns) { - if (!neon_prefetch_response_usable(*force_request_lsn, - *force_not_modified_since, slot)) + if (!neon_prefetch_response_usable(*force_request_lsns, slot)) { /* Wait for the old request to finish and discard it */ if (!prefetch_wait_for(ring_index)) @@ -886,7 +871,7 @@ Retry: slot->shard_no = get_shard_number(&tag); slot->my_ring_index = ring_index; - prefetch_do_request(slot, force_request_lsn, force_not_modified_since); + prefetch_do_request(slot, force_request_lsns); Assert(slot->status == PRFS_REQUESTED); Assert(MyPState->ring_last <= ring_index && ring_index < MyPState->ring_unused); @@ -1529,11 +1514,11 @@ nm_adjust_lsn(XLogRecPtr lsn) /* * Return LSN for requesting pages and number of blocks from page server */ -static void -neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, - XLogRecPtr *request_lsn, XLogRecPtr *not_modified_since) +static neon_request_lsns +neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno) { XLogRecPtr last_written_lsn; + neon_request_lsns result; last_written_lsn = GetLastWrittenLSN(rinfo, forknum, blkno); last_written_lsn = nm_adjust_lsn(last_written_lsn); @@ -1542,12 +1527,12 @@ neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, if (RecoveryInProgress()) { /* Request the page at the last replayed LSN. */ - *request_lsn = GetXLogReplayRecPtr(NULL); - *not_modified_since = last_written_lsn; - Assert(last_written_lsn <= *request_lsn); + result.request_lsn = GetXLogReplayRecPtr(NULL); + result.not_modified_since = last_written_lsn; + Assert(last_written_lsn <= result.request_lsn); - neon_log(DEBUG1, "neon_get_request_lsn request lsn %X/%X, not_modified_since %X/%X", - LSN_FORMAT_ARGS(*request_lsn), LSN_FORMAT_ARGS(*not_modified_since)); + neon_log(DEBUG1, "neon_get_request_lsns request lsn %X/%X, not_modified_since %X/%X", + LSN_FORMAT_ARGS(result.request_lsn), LSN_FORMAT_ARGS(result.not_modified_since)); } else { @@ -1559,7 +1544,7 @@ neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, * must still in the buffer cache, so our request cannot concern * those. */ - neon_log(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ", + neon_log(DEBUG1, "neon_get_request_lsns GetLastWrittenLSN lsn %X/%X", LSN_FORMAT_ARGS(last_written_lsn)); /* @@ -1592,9 +1577,11 @@ neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, * flush, it should still be in the buffer cache, and we wouldn't be * requesting it. */ - *request_lsn = flushlsn; - *not_modified_since = last_written_lsn; + result.request_lsn = flushlsn; + result.not_modified_since = last_written_lsn; } + + return result; } /* @@ -1604,12 +1591,12 @@ neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, * satisfy a page read now. 
*/ static bool -neon_prefetch_response_usable(XLogRecPtr request_lsn, XLogRecPtr not_modified_since, +neon_prefetch_response_usable(neon_request_lsns request_lsns, PrefetchRequest *slot) { /* sanity check the LSN's on the old and the new request */ - Assert(request_lsn >= not_modified_since); - Assert(slot->request_lsn >= slot->not_modified_since); + Assert(request_lsns.request_lsn >= request_lsns.not_modified_since); + Assert(slot->request_lsns.request_lsn >= slot->request_lsns.not_modified_since); Assert(slot->status != PRFS_UNUSED); /* @@ -1627,14 +1614,15 @@ neon_prefetch_response_usable(XLogRecPtr request_lsn, XLogRecPtr not_modified_si * calculate LSNs "out of order" with each other, but the prefetch queue * is backend-private at the moment.) */ - if (request_lsn < slot->request_lsn || not_modified_since < slot->not_modified_since) + if (request_lsns.request_lsn < slot->request_lsns.request_lsn || + request_lsns.not_modified_since < slot->request_lsns.not_modified_since) { ereport(LOG, (errcode(ERRCODE_IO_ERROR), errmsg(NEON_TAG "request with unexpected LSN after prefetch"), errdetail("Request %X/%X not_modified_since %X/%X, prefetch %X/%X not_modified_since %X/%X)", - LSN_FORMAT_ARGS(request_lsn), LSN_FORMAT_ARGS(not_modified_since), - LSN_FORMAT_ARGS(slot->request_lsn), LSN_FORMAT_ARGS(slot->not_modified_since)))); + LSN_FORMAT_ARGS(request_lsns.request_lsn), LSN_FORMAT_ARGS(request_lsns.not_modified_since), + LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since)))); return false; } @@ -1675,9 +1663,9 @@ neon_prefetch_response_usable(XLogRecPtr request_lsn, XLogRecPtr not_modified_si */ /* this follows from the checks above */ - Assert(request_lsn >= slot->not_modified_since); + Assert(request_lsns.request_lsn >= slot->request_lsns.not_modified_since); - return not_modified_since <= slot->request_lsn; + return request_lsns.not_modified_since <= slot->request_lsns.request_lsn; } /* @@ -1689,8 +1677,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) bool exists; NeonResponse *resp; BlockNumber n_blocks; - XLogRecPtr request_lsn; - XLogRecPtr not_modified_since; + neon_request_lsns request_lsns; switch (reln->smgr_relpersistence) { @@ -1745,15 +1732,15 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) return false; } - neon_get_request_lsn(InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO, - &request_lsn, ¬_modified_since); + request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO); { NeonExistsRequest request = { .req.tag = T_NeonExistsRequest, - .req.lsn = request_lsn, - .req.not_modified_since = not_modified_since, + .req.lsn = request_lsns.request_lsn, + .req.not_modified_since = request_lsns.not_modified_since, .rinfo = InfoFromSMgrRel(reln), - .forknum = forkNum}; + .forknum = forkNum + }; resp = page_server_request(&request); } @@ -1770,7 +1757,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) errmsg(NEON_TAG "could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X", RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum, - (uint32) (request_lsn >> 32), (uint32) request_lsn), + LSN_FORMAT_ARGS(request_lsns.request_lsn)), errdetail("page server returned error: %s", ((NeonErrorResponse *) resp)->message))); break; @@ -2135,7 +2122,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln)); - ring_index = prefetch_register_buffer(tag, NULL, NULL); + ring_index = 
prefetch_register_buffer(tag, NULL); Assert(ring_index < MyPState->ring_unused && MyPState->ring_last <= ring_index); @@ -2188,10 +2175,10 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum, void #if PG_MAJORVERSION_NUM < 16 neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer) + neon_request_lsns request_lsns, char *buffer) #else neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer) + neon_request_lsns request_lsns, void *buffer) #endif { NeonResponse *resp; @@ -2223,7 +2210,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, * value of the LwLsn cache when the entry is not found. */ if (RecoveryInProgress() && !(MyBackendType == B_STARTUP)) - XLogWaitForReplayOf(request_lsn); + XLogWaitForReplayOf(request_lsns.request_lsn); /* * Try to find prefetched page in the list of received pages. @@ -2234,7 +2221,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, if (entry != NULL) { slot = entry->slot; - if (neon_prefetch_response_usable(request_lsn, not_modified_since, slot)) + if (neon_prefetch_response_usable(request_lsns, slot)) { ring_index = slot->my_ring_index; pgBufferUsage.prefetch.hits += 1; @@ -2268,8 +2255,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, { pgBufferUsage.prefetch.misses += 1; - ring_index = prefetch_register_buffer(buftag, &request_lsn, - ¬_modified_since); + ring_index = prefetch_register_buffer(buftag, &request_lsns); slot = GetPrfSlot(ring_index); } else @@ -2310,7 +2296,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, slot->shard_no, blkno, RelFileInfoFmt(rinfo), forkNum, - (uint32) (request_lsn >> 32), (uint32) request_lsn), + LSN_FORMAT_ARGS(request_lsns.request_lsn)), errdetail("page server returned error: %s", ((NeonErrorResponse *) resp)->message))); break; @@ -2333,8 +2319,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer) #endif { - XLogRecPtr request_lsn; - XLogRecPtr not_modified_since; + neon_request_lsns request_lsns; switch (reln->smgr_relpersistence) { @@ -2359,9 +2344,8 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer return; } - neon_get_request_lsn(InfoFromSMgrRel(reln), forkNum, blkno, - &request_lsn, ¬_modified_since); - neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsn, not_modified_since, buffer); + request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno); + neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer); #ifdef DEBUG_COMPARE_LOCAL if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) @@ -2530,8 +2514,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) { NeonResponse *resp; BlockNumber n_blocks; - XLogRecPtr request_lsn; - XLogRecPtr not_modified_since; + neon_request_lsns request_lsns; switch (reln->smgr_relpersistence) { @@ -2558,13 +2541,12 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) return n_blocks; } - neon_get_request_lsn(InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO, - &request_lsn, ¬_modified_since); + request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO); { NeonNblocksRequest request = { .req.tag = T_NeonNblocksRequest, - .req.lsn = 
request_lsn, - .req.not_modified_since = not_modified_since, + .req.lsn = request_lsns.request_lsn, + .req.not_modified_since = request_lsns.not_modified_since, .rinfo = InfoFromSMgrRel(reln), .forknum = forknum, }; @@ -2584,7 +2566,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) errmsg(NEON_TAG "could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X", RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum, - (uint32) (request_lsn >> 32), (uint32) request_lsn), + LSN_FORMAT_ARGS(request_lsns.request_lsn)), errdetail("page server returned error: %s", ((NeonErrorResponse *) resp)->message))); break; @@ -2595,10 +2577,10 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks); neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks", - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forknum, - (uint32) (request_lsn >> 32), (uint32) request_lsn, - n_blocks); + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum, + LSN_FORMAT_ARGS(request_lsns.request_lsn), + n_blocks); pfree(resp); return n_blocks; @@ -2612,17 +2594,15 @@ neon_dbsize(Oid dbNode) { NeonResponse *resp; int64 db_size; - XLogRecPtr request_lsn, - not_modified_since; + neon_request_lsns request_lsns; NRelFileInfo dummy_node = {0}; - neon_get_request_lsn(dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO, - &request_lsn, ¬_modified_since); + request_lsns = neon_get_request_lsns(dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO); { NeonDbSizeRequest request = { .req.tag = T_NeonDbSizeRequest, - .req.lsn = request_lsn, - .req.not_modified_since = not_modified_since, + .req.lsn = request_lsns.request_lsn, + .req.not_modified_since = request_lsns.not_modified_since, .dbNode = dbNode, }; @@ -2639,8 +2619,7 @@ neon_dbsize(Oid dbNode) ereport(ERROR, (errcode(ERRCODE_IO_ERROR), errmsg(NEON_TAG "could not read db size of db %u from page server at lsn %X/%08X", - dbNode, - (uint32) (request_lsn >> 32), (uint32) request_lsn), + dbNode, LSN_FORMAT_ARGS(request_lsns.request_lsn)), errdetail("page server returned error: %s", ((NeonErrorResponse *) resp)->message))); break; @@ -2650,9 +2629,7 @@ neon_dbsize(Oid dbNode) } neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes", - dbNode, - (uint32) (request_lsn >> 32), (uint32) request_lsn, - db_size); + dbNode, LSN_FORMAT_ARGS(request_lsns.request_lsn), db_size); pfree(resp); return db_size; diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c index 677006923d..9f63b58a86 100644 --- a/pgxn/neon_test_utils/neontest.c +++ b/pgxn/neon_test_utils/neontest.c @@ -48,10 +48,10 @@ PG_FUNCTION_INFO_V1(neon_xlogflush); */ #if PG_MAJORVERSION_NUM < 16 typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer); + neon_request_lsns request_lsns, char *buffer); #else typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer); + neon_request_lsns request_lsns, void *buffer); #endif static neon_read_at_lsn_type neon_read_at_lsn_ptr; @@ -298,9 +298,7 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS) text *relname; text *forkname; uint32 blkno; - - XLogRecPtr request_lsn; - XLogRecPtr not_modified_since; + neon_request_lsns request_lsns; if (PG_NARGS() != 5) elog(ERROR, "unexpected number of arguments in SQL function signature"); @@ -312,8 +310,8 
@@ get_raw_page_at_lsn(PG_FUNCTION_ARGS) forkname = PG_GETARG_TEXT_PP(1); blkno = PG_GETARG_UINT32(2); - request_lsn = PG_ARGISNULL(3) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(3); - not_modified_since = PG_ARGISNULL(4) ? request_lsn : PG_GETARG_LSN(4); + request_lsns.request_lsn = PG_ARGISNULL(3) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(3); + request_lsns.not_modified_since = PG_ARGISNULL(4) ? request_lsns.request_lsn : PG_GETARG_LSN(4); if (!superuser()) ereport(ERROR, @@ -367,7 +365,8 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS) SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); raw_page_data = VARDATA(raw_page); - neon_read_at_lsn(InfoFromRelation(rel), forknum, blkno, request_lsn, not_modified_since, raw_page_data); + neon_read_at_lsn(InfoFromRelation(rel), forknum, blkno, request_lsns, + raw_page_data); relation_close(rel, AccessShareLock); @@ -413,19 +412,18 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) ForkNumber forknum = PG_GETARG_UINT32(3); uint32 blkno = PG_GETARG_UINT32(4); - XLogRecPtr request_lsn; - XLogRecPtr not_modified_since; + neon_request_lsns request_lsns; /* Initialize buffer to copy to */ bytea *raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ); - request_lsn = PG_ARGISNULL(5) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(5); - not_modified_since = PG_ARGISNULL(6) ? request_lsn : PG_GETARG_LSN(6); + request_lsns.request_lsn = PG_ARGISNULL(5) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(5); + request_lsns.not_modified_since = PG_ARGISNULL(6) ? request_lsns.request_lsn : PG_GETARG_LSN(6); SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); raw_page_data = VARDATA(raw_page); - neon_read_at_lsn(rinfo, forknum, blkno, request_lsn, not_modified_since, raw_page_data); + neon_read_at_lsn(rinfo, forknum, blkno, request_lsns, raw_page_data); PG_RETURN_BYTEA_P(raw_page); } } From 22afaea6e1c595282633b210ac02990f98378458 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 13 May 2024 13:17:30 +0300 Subject: [PATCH 0773/1571] Always use Lsn::MAX as the request LSN in the primary (#7708) The new protocol version supports sending two LSNs to the pageserver: request LSN and a "not_modified_since" hint. A primary always wants to read the latest version of each page, so having two values was not strictly necessary, and the old protocol worked fine with just the "not_modified_since" LSN and a flag to request the latest page version. Nevertheless, it seemed like a good idea to set the request LSN to the current insert/flush LSN, because that's logically the page version that the primary wants to read. However, that made the test_gc_aggressive test case flaky. When the primary requests a page with the last inserted or flushed LSN, it's possible that by the time that the pageserver processes the request, more WAL has been generated by other processes in the compute and already digested by the pageserver. Furthermore, if the PITR horizon in the pageserver is set to 0, and GC runs during that window, it's possible that the GC horizon has advances past the request LSN, before the pageserver processes the request. It is still correct to send the latest page version in that case, because the compute either has the page locked so the it cannot have been modified in the primary, or if it's a prefetch request, and we will validate the LSNs when the prefetch response is processed and discard it if the page has been modified. But the pageserver doesn't know that and rightly complains. To fix, modify the compute so that the primary always uses Lsn::MAX in the requests. 
This reverts the primary's behavior to how the protocol version 1 worked. In protocol version 1, there was only one LSN, the "not_modified_since" hint, and a flag was set to read the latest page version, whatever that might be. Requests from computes that are still using protocol version 1 were already mapped to Lsn::MAX in the pageserver, now we do the same with protocol version 2 for primary's requests. (I'm a bit sad about losing the information in the pageserver, what the last LSN was at the time that the request wa made. We never had it with protocol version 1, but I wanted to make it available for debugging purposes.) Add another field, 'effective_request_lsn', to track what the flush LSN was when the request was made. It's not sent to the pageserver, Lsn::MAX is now used as the request LSN, but it's still needed internally in the compute to track the validity of prefetch requests. Fixes issue https://github.com/neondatabase/neon/issues/7692 --- pgxn/neon/pagestore_client.h | 12 +++++ pgxn/neon/pagestore_smgr.c | 95 +++++++++++++++++++++++---------- pgxn/neon_test_utils/neontest.c | 14 +++++ 3 files changed, 92 insertions(+), 29 deletions(-) diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 1334a04f9a..8951e6607b 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -255,6 +255,18 @@ typedef struct * the pageserver, as long as 'not_modified_since' has arrived. */ XLogRecPtr not_modified_since; + + /* + * 'effective_request_lsn' is not included in the request that's sent to + * the pageserver, but is used to keep track of the latest LSN of when the + * request was made. In a standby server, this is always the same as the + * 'request_lsn', but in the primary we use UINT64_MAX as the + * 'request_lsn' to request the latest page version, so we need this + * separate field to remember that latest LSN was when the request was + * made. It's needed to manage prefetch request, to verify if the response + * to a prefetched request is still valid. + */ + XLogRecPtr effective_request_lsn; } neon_request_lsns; #if PG_MAJORVERSION_NUM < 16 diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 8fcbbfe54d..e3b841f526 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -356,7 +356,7 @@ compact_prefetch_buffers(void) source_slot->response = NULL; source_slot->my_ring_index = 0; source_slot->request_lsns = (neon_request_lsns) { - InvalidXLogRecPtr, InvalidXLogRecPtr + InvalidXLogRecPtr, InvalidXLogRecPtr, InvalidXLogRecPtr }; /* update bookkeeping */ @@ -1529,6 +1529,7 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno) /* Request the page at the last replayed LSN. */ result.request_lsn = GetXLogReplayRecPtr(NULL); result.not_modified_since = last_written_lsn; + result.effective_request_lsn = result.request_lsn; Assert(last_written_lsn <= result.request_lsn); neon_log(DEBUG1, "neon_get_request_lsns request lsn %X/%X, not_modified_since %X/%X", @@ -1570,15 +1571,30 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno) } /* - * Request the latest version of the page. The most up-to-date request - * LSN we could use would be the current insert LSN, but to avoid the - * overhead of looking it up, use 'flushlsn' instead. This relies on - * the assumption that if the page was modified since the last WAL - * flush, it should still be in the buffer cache, and we wouldn't be - * requesting it. + * Request the very latest version of the page. 
In principle we + * want to read the page at the current insert LSN, and we could + * use that value in the request. However, there's a corner case + * with pageserver's garbage collection. If the GC horizon is + * set to a very small value, it's possible that by the time + * that the pageserver processes our request, the GC horizon has + * already moved past the LSN we calculate here. Standby servers + * always have that problem as the can always lag behind the + * primary, but for the primary we can avoid it by always + * requesting the latest page, by setting request LSN to + * UINT64_MAX. + * + * Remember the current LSN, however, so that we can later + * correctly determine if the response to the request is still + * valid. The most up-to-date LSN we could use for that purpose + * would be the current insert LSN, but to avoid the overhead of + * looking it up, use 'flushlsn' instead. This relies on the + * assumption that if the page was modified since the last WAL + * flush, it should still be in the buffer cache, and we + * wouldn't be requesting it. */ - result.request_lsn = flushlsn; + result.request_lsn = UINT64_MAX; result.not_modified_since = last_written_lsn; + result.effective_request_lsn = flushlsn; } return result; @@ -1596,7 +1612,11 @@ neon_prefetch_response_usable(neon_request_lsns request_lsns, { /* sanity check the LSN's on the old and the new request */ Assert(request_lsns.request_lsn >= request_lsns.not_modified_since); + Assert(request_lsns.effective_request_lsn >= request_lsns.not_modified_since); + Assert(request_lsns.effective_request_lsn <= request_lsns.request_lsn); Assert(slot->request_lsns.request_lsn >= slot->request_lsns.not_modified_since); + Assert(slot->request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since); + Assert(slot->request_lsns.effective_request_lsn <= slot->request_lsns.request_lsn); Assert(slot->status != PRFS_UNUSED); /* @@ -1614,27 +1634,40 @@ neon_prefetch_response_usable(neon_request_lsns request_lsns, * calculate LSNs "out of order" with each other, but the prefetch queue * is backend-private at the moment.) */ - if (request_lsns.request_lsn < slot->request_lsns.request_lsn || + if (request_lsns.effective_request_lsn < slot->request_lsns.effective_request_lsn || request_lsns.not_modified_since < slot->request_lsns.not_modified_since) { ereport(LOG, (errcode(ERRCODE_IO_ERROR), errmsg(NEON_TAG "request with unexpected LSN after prefetch"), errdetail("Request %X/%X not_modified_since %X/%X, prefetch %X/%X not_modified_since %X/%X)", - LSN_FORMAT_ARGS(request_lsns.request_lsn), LSN_FORMAT_ARGS(request_lsns.not_modified_since), - LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since)))); + LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), + LSN_FORMAT_ARGS(request_lsns.not_modified_since), + LSN_FORMAT_ARGS(slot->request_lsns.effective_request_lsn), + LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since)))); return false; } /*--- - * Each request to the pageserver carries two LSN values: - * `not_modified_since` and `request_lsn`. The (not_modified_since, - * request_lsn] range of each request is effectively a claim that the page - * has not been modified between those LSNs. If the range of the old - * request in the queue overlaps with the new request, we know that the - * page hasn't been modified in the union of the ranges. We can use the - * response to old request to satisfy the new request in that case. 
For - * example: + * Each request to the pageserver has three LSN values associated with it: + * `not_modified_since`, `request_lsn`, and 'effective_request_lsn'. + * `not_modified_since` and `request_lsn` are sent to the pageserver, but + * in the primary node, we always use UINT64_MAX as the `request_lsn`, so + * we remember `effective_request_lsn` separately. In a primary, + * `effective_request_lsn` is the last flush WAL position when the request + * was sent to the pageserver. That's logically the LSN that we are + * requesting the page at, but we send UINT64_MAX to the pageserver so + * that if the GC horizon advances past that position, we still get a + * valid response instead of an error. + * + * To determine whether a response to a GetPage request issued earlier is + * still valid to satisfy a new page read, we look at the + * (not_modified_since, effective_request_lsn] range of the request. It is + * effectively a claim that the page has not been modified between those + * LSNs. If the range of the old request in the queue overlaps with the + * new request, we know that the page hasn't been modified in the union of + * the ranges. We can use the response to old request to satisfy the new + * request in that case. For example: * * 100 500 * Old request: +--------+ @@ -1663,9 +1696,9 @@ neon_prefetch_response_usable(neon_request_lsns request_lsns, */ /* this follows from the checks above */ - Assert(request_lsns.request_lsn >= slot->request_lsns.not_modified_since); + Assert(request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since); - return request_lsns.not_modified_since <= slot->request_lsns.request_lsn; + return request_lsns.not_modified_since <= slot->request_lsns.effective_request_lsn; } /* @@ -1757,7 +1790,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) errmsg(NEON_TAG "could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X", RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum, - LSN_FORMAT_ARGS(request_lsns.request_lsn)), + LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), errdetail("page server returned error: %s", ((NeonErrorResponse *) resp)->message))); break; @@ -2296,7 +2329,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, slot->shard_no, blkno, RelFileInfoFmt(rinfo), forkNum, - LSN_FORMAT_ARGS(request_lsns.request_lsn)), + LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), errdetail("page server returned error: %s", ((NeonErrorResponse *) resp)->message))); break; @@ -2566,7 +2599,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) errmsg(NEON_TAG "could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X", RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum, - LSN_FORMAT_ARGS(request_lsns.request_lsn)), + LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), errdetail("page server returned error: %s", ((NeonErrorResponse *) resp)->message))); break; @@ -2579,7 +2612,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks", RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum, - LSN_FORMAT_ARGS(request_lsns.request_lsn), + LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), n_blocks); pfree(resp); @@ -2619,7 +2652,7 @@ neon_dbsize(Oid dbNode) ereport(ERROR, (errcode(ERRCODE_IO_ERROR), errmsg(NEON_TAG "could not read db size of db %u from page server at lsn %X/%08X", - dbNode, LSN_FORMAT_ARGS(request_lsns.request_lsn)), + dbNode, 
LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), errdetail("page server returned error: %s", ((NeonErrorResponse *) resp)->message))); break; @@ -2629,7 +2662,7 @@ neon_dbsize(Oid dbNode) } neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes", - dbNode, LSN_FORMAT_ARGS(request_lsns.request_lsn), db_size); + dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), db_size); pfree(resp); return db_size; @@ -2874,6 +2907,10 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf XLogRecPtr request_lsn, not_modified_since; + /* + * Compute a request LSN to use, similar to neon_get_request_lsns() but the + * logic is a bit simpler. + */ if (RecoveryInProgress()) { request_lsn = GetXLogReplayRecPtr(NULL); @@ -2885,10 +2922,10 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf */ request_lsn = GetRedoStartLsn(); } + request_lsn = nm_adjust_lsn(request_lsn); } else - request_lsn = GetXLogInsertRecPtr(); - request_lsn = nm_adjust_lsn(request_lsn); + request_lsn = UINT64_MAX; /* * GetRedoStartLsn() returns LSN of basebackup. We know that the SLRU diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c index 9f63b58a86..47f245fbf1 100644 --- a/pgxn/neon_test_utils/neontest.c +++ b/pgxn/neon_test_utils/neontest.c @@ -312,6 +312,13 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS) request_lsns.request_lsn = PG_ARGISNULL(3) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(3); request_lsns.not_modified_since = PG_ARGISNULL(4) ? request_lsns.request_lsn : PG_GETARG_LSN(4); + /* + * For the time being, use the same LSN for request and + * effective request LSN. If any test needed to use UINT64_MAX + * as the request LSN, we'd need to add effective_request_lsn + * as a new argument. + */ + request_lsns.effective_request_lsn = request_lsns.request_lsn; if (!superuser()) ereport(ERROR, @@ -419,6 +426,13 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) request_lsns.request_lsn = PG_ARGISNULL(5) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(5); request_lsns.not_modified_since = PG_ARGISNULL(6) ? request_lsns.request_lsn : PG_GETARG_LSN(6); + /* + * For the time being, use the same LSN for request + * and effective request LSN. If any test needed to + * use UINT64_MAX as the request LSN, we'd need to add + * effective_request_lsn as a new argument. + */ + request_lsns.effective_request_lsn = request_lsns.request_lsn; SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); raw_page_data = VARDATA(raw_page); From cd0e34493887c91485184f3fa5515ea94438aeda Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 14 May 2024 09:31:26 +0100 Subject: [PATCH 0774/1571] pageserver: do fewer heatmap uploads for tiny tenants (#7731) ## Problem Currently we do a large number of heatmap uploads for tiny tenants. "tiny" in this context is defined as being less than a single layer in size. These uploads are triggered by atime changes rather than changes in the set of layers. Uploading heatmaps for atime changes on small tenants isn't useful, because even without bumping these atimes, disk usage eviction still avoids evicting the largest resident layer of a tenant, which in practice keeps tiny/empty tenants mostly resident irrespective of atimes. ## Summary of changes - For tenants smaller than one checkpoint interval, only upload heatmap if the set of layers has changed, not if only the atimes have changed. 
- Include the heatmap period in the uploaded heatmap, as a precursor to implementing https://github.com/neondatabase/neon/issues/6200 (auto-adjusting download intervals to match upload intervals) --- pageserver/src/tenant/secondary/heatmap.rs | 25 ++++++ .../src/tenant/secondary/heatmap_uploader.rs | 82 +++++++++++++------ 2 files changed, 81 insertions(+), 26 deletions(-) diff --git a/pageserver/src/tenant/secondary/heatmap.rs b/pageserver/src/tenant/secondary/heatmap.rs index ca91ec24c6..2da4a3b9d5 100644 --- a/pageserver/src/tenant/secondary/heatmap.rs +++ b/pageserver/src/tenant/secondary/heatmap.rs @@ -15,6 +15,14 @@ pub(super) struct HeatMapTenant { pub(super) generation: Generation, pub(super) timelines: Vec, + + /// Uploaders provide their own upload period in the heatmap, as a hint to downloaders + /// of how frequently it is worthwhile to check for updates. + /// + /// This is optional for backward compat, and because we sometimes might upload + /// a heatmap explicitly via API for a tenant that has no periodic upload configured. + #[serde(default)] + pub(super) upload_period_ms: Option, } #[serde_as] @@ -81,4 +89,21 @@ impl HeatMapTenant { stats } + + pub(crate) fn strip_atimes(self) -> Self { + Self { + timelines: self + .timelines + .into_iter() + .map(|mut tl| { + for layer in &mut tl.layers { + layer.access_time = SystemTime::UNIX_EPOCH; + } + tl + }) + .collect(), + generation: self.generation, + upload_period_ms: self.upload_period_ms, + } + } } diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs index 352409f5fc..fddced3ead 100644 --- a/pageserver/src/tenant/secondary/heatmap_uploader.rs +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -80,7 +80,7 @@ impl RunningJob for WriteInProgress { struct UploadPending { tenant: Arc, - last_digest: Option, + last_upload: Option, target_time: Option, period: Option, } @@ -94,7 +94,7 @@ impl scheduler::PendingJob for UploadPending { struct WriteComplete { tenant_shard_id: TenantShardId, completed_at: Instant, - digest: Option, + uploaded: Option, next_upload: Option, } @@ -115,10 +115,7 @@ struct UploaderTenantState { tenant: Weak, /// Digest of the serialized heatmap that we last successfully uploaded - /// - /// md5 is generally a bad hash. We use it because it's convenient for interop with AWS S3's ETag, - /// which is also an md5sum. 
- last_digest: Option, + last_upload_state: Option, /// When the last upload attempt completed (may have been successful or failed) last_upload: Option, @@ -187,7 +184,7 @@ impl JobGenerator tenant: Arc::downgrade(&tenant), last_upload: None, next_upload: Some(now.checked_add(period_warmup(period)).unwrap_or(now)), - last_digest: None, + last_upload_state: None, }); // Decline to do the upload if insufficient time has passed @@ -195,10 +192,10 @@ impl JobGenerator return; } - let last_digest = state.last_digest; + let last_upload = state.last_upload_state.clone(); result.jobs.push(UploadPending { tenant, - last_digest, + last_upload, target_time: state.next_upload, period: Some(period), }); @@ -218,7 +215,7 @@ impl JobGenerator ) { let UploadPending { tenant, - last_digest, + last_upload, target_time, period, } = job; @@ -231,16 +228,16 @@ impl JobGenerator let _completion = completion; let started_at = Instant::now(); - let digest = match upload_tenant_heatmap(remote_storage, &tenant, last_digest).await { - Ok(UploadHeatmapOutcome::Uploaded(digest)) => { + let uploaded = match upload_tenant_heatmap(remote_storage, &tenant, last_upload.clone()).await { + Ok(UploadHeatmapOutcome::Uploaded(uploaded)) => { let duration = Instant::now().duration_since(started_at); SECONDARY_MODE .upload_heatmap_duration .observe(duration.as_secs_f64()); SECONDARY_MODE.upload_heatmap.inc(); - Some(digest) + Some(uploaded) } - Ok(UploadHeatmapOutcome::NoChange | UploadHeatmapOutcome::Skipped) => last_digest, + Ok(UploadHeatmapOutcome::NoChange | UploadHeatmapOutcome::Skipped) => last_upload, Err(UploadHeatmapError::Upload(e)) => { tracing::warn!( "Failed to upload heatmap for tenant {}: {e:#}", @@ -251,11 +248,11 @@ impl JobGenerator .upload_heatmap_duration .observe(duration.as_secs_f64()); SECONDARY_MODE.upload_heatmap_errors.inc(); - last_digest + last_upload } Err(UploadHeatmapError::Cancelled) => { tracing::info!("Cancelled heatmap upload, shutting down"); - last_digest + last_upload } }; @@ -277,7 +274,7 @@ impl JobGenerator WriteComplete { tenant_shard_id: *tenant.get_tenant_shard_id(), completed_at: now, - digest, + uploaded, next_upload, } }.instrument(info_span!(parent: None, "heatmap_upload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())))) @@ -299,7 +296,7 @@ impl JobGenerator Ok(UploadPending { // Ignore our state for last digest: this forces an upload even if nothing has changed - last_digest: None, + last_upload: None, tenant, target_time: None, period: None, @@ -312,7 +309,7 @@ impl JobGenerator let WriteComplete { tenant_shard_id, completed_at, - digest, + uploaded, next_upload, } = completion; use std::collections::hash_map::Entry; @@ -322,7 +319,7 @@ impl JobGenerator } Entry::Occupied(mut entry) => { entry.get_mut().last_upload = Some(completed_at); - entry.get_mut().last_digest = digest; + entry.get_mut().last_upload_state = uploaded; entry.get_mut().next_upload = next_upload } } @@ -331,7 +328,7 @@ impl JobGenerator enum UploadHeatmapOutcome { /// We successfully wrote to remote storage, with this digest. - Uploaded(md5::Digest), + Uploaded(LastUploadState), /// We did not upload because the heatmap digest was unchanged since the last upload NoChange, /// We skipped the upload for some reason, such as tenant/timeline not ready @@ -347,12 +344,25 @@ enum UploadHeatmapError { Upload(#[from] anyhow::Error), } +/// Digests describing the heatmap we most recently uploaded successfully. +/// +/// md5 is generally a bad hash. 
We use it because it's convenient for interop with AWS S3's ETag, +/// which is also an md5sum. +#[derive(Clone)] +struct LastUploadState { + // Digest of json-encoded HeatMapTenant + uploaded_digest: md5::Digest, + + // Digest without atimes set. + layers_only_digest: md5::Digest, +} + /// The inner upload operation. This will skip if `last_digest` is Some and matches the digest /// of the object we would have uploaded. async fn upload_tenant_heatmap( remote_storage: GenericRemoteStorage, tenant: &Arc, - last_digest: Option, + last_upload: Option, ) -> Result { debug_assert_current_span_has_tenant_id(); @@ -368,6 +378,7 @@ async fn upload_tenant_heatmap( let mut heatmap = HeatMapTenant { timelines: Vec::new(), generation, + upload_period_ms: tenant.get_heatmap_period().map(|p| p.as_millis()), }; let timelines = tenant.timelines.lock().unwrap().clone(); @@ -396,15 +407,31 @@ async fn upload_tenant_heatmap( // Serialize the heatmap let bytes = serde_json::to_vec(&heatmap).map_err(|e| anyhow::anyhow!(e))?; - let bytes = bytes::Bytes::from(bytes); - let size = bytes.len(); // Drop out early if nothing changed since our last upload let digest = md5::compute(&bytes); - if Some(digest) == last_digest { + if Some(&digest) == last_upload.as_ref().map(|d| &d.uploaded_digest) { return Ok(UploadHeatmapOutcome::NoChange); } + // Calculate a digest that omits atimes, so that we can distinguish actual changes in + // layers from changes only in atimes. + let heatmap_size_bytes = heatmap.get_stats().bytes; + let layers_only_bytes = + serde_json::to_vec(&heatmap.strip_atimes()).map_err(|e| anyhow::anyhow!(e))?; + let layers_only_digest = md5::compute(&layers_only_bytes); + if heatmap_size_bytes < tenant.get_checkpoint_distance() { + // For small tenants, skip upload if only atimes changed. This avoids doing frequent + // uploads from long-idle tenants whose atimes are just incremented by periodic + // size calculations. + if Some(&layers_only_digest) == last_upload.as_ref().map(|d| &d.layers_only_digest) { + return Ok(UploadHeatmapOutcome::NoChange); + } + } + + let bytes = bytes::Bytes::from(bytes); + let size = bytes.len(); + let path = remote_heatmap_path(tenant.get_tenant_shard_id()); let cancel = &tenant.cancel; @@ -436,5 +463,8 @@ async fn upload_tenant_heatmap( tracing::info!("Successfully uploaded {size} byte heatmap to {path}"); - Ok(UploadHeatmapOutcome::Uploaded(digest)) + Ok(UploadHeatmapOutcome::Uploaded(LastUploadState { + uploaded_digest: digest, + layers_only_digest, + })) } From df0f1e359b1ff01474ffaac26cec730697d1d643 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 14 May 2024 09:37:48 +0100 Subject: [PATCH 0775/1571] pageserver: switch on new-style local layer paths (#7660) We recently added support for local layer paths that contain a generation number: - https://github.com/neondatabase/neon/pull/7609 - https://github.com/neondatabase/neon/pull/7640 Now that we've cut a [release](https://github.com/neondatabase/neon/pull/7735) that includes those changes, we can proceed to enable writing the new format without breaking forward compatibility. 
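For illustration, the effect on local layer paths is roughly the following; the layer file name and the generation value are placeholders, and the suffix rendering assumes `Generation::get_suffix()` produces the usual 8-hex-digit form:

```
# Legacy style, still used when no generation is known:
<timelines_dir>/<timeline_id>/<layer_file_name>

# New style written after this change ("{layer_file_name}-v1{generation_suffix}"):
<timelines_dir>/<timeline_id>/<layer_file_name>-v1-00000001
```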
--- pageserver/src/tenant/storage_layer/layer.rs | 17 +++++++---------- .../regress/test_pageserver_generations.py | 1 - 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index b5b0260327..b6f7702247 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -129,19 +129,16 @@ pub(crate) fn local_layer_path( tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, layer_file_name: &LayerName, - _generation: &Generation, + generation: &Generation, ) -> Utf8PathBuf { let timeline_path = conf.timeline_path(tenant_shard_id, timeline_id); - timeline_path.join(layer_file_name.to_string()) - - // TODO: switch to enabling new-style layer paths after next release - // if generation.is_none() { - // // Without a generation, we may only use legacy path style - // timeline_path.join(layer_file_name.to_string()) - // } else { - // timeline_path.join(format!("{}-v1{}", layer_file_name, generation.get_suffix())) - // } + if generation.is_none() { + // Without a generation, we may only use legacy path style + timeline_path.join(layer_file_name.to_string()) + } else { + timeline_path.join(format!("{}-v1{}", layer_file_name, generation.get_suffix())) + } } impl Layer { diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index a38bcd45da..4fdc5852f5 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -703,7 +703,6 @@ def test_multi_attach( workload.validate(pageservers[2].id) -@pytest.mark.skip(reason="To be enabled after release with new local path style") def test_upgrade_generationless_local_file_paths( neon_env_builder: NeonEnvBuilder, ): From b6ee91835b5eaf1d210805d0e4e07d01214445dd Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 14 May 2024 11:39:59 +0100 Subject: [PATCH 0776/1571] CI(report-benchmarks-failures): fix condition (#7745) ## Problem `report-benchmarks-failures` job is triggered for any failure in the CI pipeline, but we need it to be triggered only for failed `benchmarks` job ## Summary of changes - replace `failure()` with `needs.benchmarks.result == 'failure'` in the condition --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index f417cecd58..14d19f7ae3 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -548,7 +548,7 @@ jobs: report-benchmarks-failures: needs: [ benchmarks, create-test-report ] - if: github.ref_name == 'main' && failure() + if: github.ref_name == 'main' && needs.benchmarks.result == 'failure' runs-on: ubuntu-latest steps: From 30d15ad4032e543be37fc0378345d9b6568b20cc Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Tue, 14 May 2024 10:36:48 -0400 Subject: [PATCH 0777/1571] chore(test): add version check for forward compat test (#7685) A test for https://github.com/neondatabase/neon/pull/7684. This pull request checks if the pageserver version we specified is the one actually running by comparing the git hash in forward compatibility tests. 
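For reference, the version banner that the check below parses has roughly this shape; the hash and the flag values here are placeholders:

```
Neon page server git-env:0123456789abcdef0123456789abcdef01234567 failpoints: true, features: []
```

The test takes the `git-env:` hash from the previous binary's reported version string and asserts that the same hash appears in the pageserver log only after that binary has started.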
--------- Signed-off-by: Alex Chi Z --- test_runner/regress/test_compatibility.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index ef35bf4696..65649e0c0a 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -1,4 +1,5 @@ import os +import re import shutil import subprocess import tempfile @@ -245,14 +246,34 @@ def test_forward_compatibility( compatibility_snapshot_dir / "repo", ) + # not using env.pageserver.version because it was initialized before + prev_pageserver_version_str = env.get_binary_version("pageserver") + prev_pageserver_version_match = re.search( + "Neon page server git-env:(.*) failpoints: (.*), features: (.*)", + prev_pageserver_version_str, + ) + if prev_pageserver_version_match is not None: + prev_pageserver_version = prev_pageserver_version_match.group(1) + else: + raise AssertionError( + "cannot find git hash in the version string: " + prev_pageserver_version_str + ) + + # does not include logs from previous runs + assert not env.pageserver.log_contains("git-env:" + prev_pageserver_version) + neon_env_builder.start() + # ensure the specified pageserver is running + assert env.pageserver.log_contains("git-env:" + prev_pageserver_version) + check_neon_works( env, test_output_dir=test_output_dir, sql_dump_path=compatibility_snapshot_dir / "dump.sql", repo_dir=env.repo_dir, ) + except Exception: if breaking_changes_allowed: pytest.xfail( From 82960b2175211c0f666b91b5258c5e2253a245c7 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 14 May 2024 16:39:17 +0100 Subject: [PATCH 0778/1571] pageserver: skip waiting for logical size on shard >0 (#7744) ## Problem Shards with number >0 could hang waiting for `await_initial_logical_size`, as we don't calculate logical size on these shards. This causes them to hold onto semaphore units and starve other tenants out from proceeding with warmup activation. That doesn't hurt availability (we still have on-demand activation), but it does mean that some background tasks like consumption metrics would omit some tenants. ## Summary of changes - Skip waiting for logical size calculation on shards >0 - Upgrade unexpected code paths to use debug_assert!(), which acts as an implicit regression test for this issue, and make the info() one into a warn() --- .../src/tenant/remote_timeline_client.rs | 5 +++++ pageserver/src/tenant/timeline.rs | 19 ++++++++++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 9103760388..630ade5c13 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1127,6 +1127,11 @@ impl RemoteTimelineClient { Ok(()) } + pub(crate) fn is_deleting(&self) -> bool { + let mut locked = self.upload_queue.lock().unwrap(); + locked.stopped_mut().is_ok() + } + pub(crate) async fn preserve_initdb_archive( self: &Arc, tenant_id: &TenantId, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 9ee24a4ff0..ca34b4fadc 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2631,6 +2631,7 @@ impl Timeline { // Don't make noise. 
} else { warn!("unexpected: cancel_wait_for_background_loop_concurrency_limit_semaphore not set, priority-boosting of logical size calculation will not work"); + debug_assert!(false); } } }; @@ -4355,6 +4356,21 @@ impl Timeline { /// this Timeline is shut down. Calling this function will cause the initial /// logical size calculation to skip waiting for the background jobs barrier. pub(crate) async fn await_initial_logical_size(self: Arc) { + if !self.shard_identity.is_shard_zero() { + // We don't populate logical size on shard >0: skip waiting for it. + return; + } + + if self + .remote_client + .as_ref() + .map(|c| c.is_deleting()) + .unwrap_or(false) + { + // The timeline was created in a deletion-resume state, we don't expect logical size to be populated + return; + } + if let Some(await_bg_cancel) = self .current_logical_size .cancel_wait_for_background_loop_concurrency_limit_semaphore @@ -4366,9 +4382,10 @@ impl Timeline { // the logical size cancellation to skip the concurrency limit semaphore. // TODO: this is an unexpected case. We should restructure so that it // can't happen. - tracing::info!( + tracing::warn!( "await_initial_logical_size: can't get semaphore cancel token, skipping" ); + debug_assert!(false); } tokio::select!( From e67fcf9563faa1ebf0653a378a1999f446a3c1a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 14 May 2024 17:49:19 +0200 Subject: [PATCH 0779/1571] Update mold to 2.31 (#7757) The [2.31.0 release](https://github.com/rui314/mold/releases/tag/v2.31.0) of mold includes a 10% speed improvement for binaries with a lot of debug info. As we have such, it might be useful to update mold to the latest release. The jump is from 2.4.0 to 2.31.0, but it's not been many releases in between as the version number was raised by the mold maintainers to 2.30.0 after 2.4.1 [to avoid confusion for some tools](https://github.com/rui314/mold/releases/tag/v2.30.0). --- Dockerfile.build-tools | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index 19739cc1f8..460b8c996d 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -87,7 +87,7 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "aws && rm awscliv2.zip # Mold: A Modern Linker -ENV MOLD_VERSION v2.4.0 +ENV MOLD_VERSION v2.31.0 RUN set -e \ && git clone https://github.com/rui314/mold.git \ && mkdir mold/build \ From 4eedb3b6f188408cd74397f81e02fc4c416eb5f5 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 14 May 2024 18:03:08 +0200 Subject: [PATCH 0780/1571] test suite: allow overriding default compaction algorithm via env var (#7747) This PR allows setting the `PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM` env var to override the `tenant_config.compaction_algorithm` field in the initial `pageserver.toml` for all tests. I tested manually that this works by halting a test using pdb and inspecting the `effective_config` in the tenant status managment API. If the env var is set, the tests are parametrized by the `kind` tag field, allowing to do a matrix build in CI and let Allure summarize everything in a nice report. If the env var is not set, the tests are not parametrized. So, merging this PR doesn't cause problems for flaky test detection. In fact, it doesn't cause any runtime change if the env var is not set. There are some tests in the test suite that set used to override the entire tenant_config using `NeonEnvBuilder.pageserver_config_override`. 
Since config overrides are merged non-recursively, such overrides that don't specify `kind = ` cause a fallback to pageserver's built-in `DEFAULT_COMPACTION_ALGORITHM`. Such cases can be found using ``` ["']tenant_config\s*[='"] ``` We'll deal with these tests in a future PR. closes https://github.com/neondatabase/neon/issues/7555 --- scripts/flaky_tests.py | 25 ++++++++++++++++++++++--- test_runner/fixtures/neon_fixtures.py | 18 ++++++++++++++++++ test_runner/fixtures/parametrize.py | 27 ++++++++++++++++++++++++++- 3 files changed, 66 insertions(+), 4 deletions(-) diff --git a/scripts/flaky_tests.py b/scripts/flaky_tests.py index 878840fcee..919a9278a9 100755 --- a/scripts/flaky_tests.py +++ b/scripts/flaky_tests.py @@ -5,10 +5,11 @@ import json import logging import os from collections import defaultdict -from typing import DefaultDict, Dict +from typing import Any, DefaultDict, Dict, Optional import psycopg2 import psycopg2.extras +import toml FLAKY_TESTS_QUERY = """ SELECT @@ -58,6 +59,24 @@ def main(args: argparse.Namespace): else: pageserver_virtual_file_io_engine_parameter = "" + # re-use existing records of flaky tests from before parametrization by compaction_algorithm + def get_pageserver_default_tenant_config_compaction_algorithm() -> Optional[Dict[str, Any]]: + """Duplicated from parametrize.py""" + toml_table = os.getenv("PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM") + if toml_table is None: + return None + v = toml.loads(toml_table) + assert isinstance(v, dict) + return v + + pageserver_default_tenant_config_compaction_algorithm_parameter = "" + if ( + explicit_default := get_pageserver_default_tenant_config_compaction_algorithm() + ) is not None: + pageserver_default_tenant_config_compaction_algorithm_parameter = ( + f"-{explicit_default['kind']}" + ) + for row in rows: # We don't want to automatically rerun tests in a performance suite if row["parent_suite"] != "test_runner.regress": @@ -66,10 +85,10 @@ def main(args: argparse.Namespace): if row["name"].endswith("]"): parametrized_test = row["name"].replace( "[", - f"[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}-", + f"[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}{pageserver_default_tenant_config_compaction_algorithm_parameter}-", ) else: - parametrized_test = f"{row['name']}[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}]" + parametrized_test = f"{row['name']}[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}{pageserver_default_tenant_config_compaction_algorithm_parameter}]" res[row["parent_suite"]][row["suite"]][parametrized_test] = True diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 8432655370..62a4b974a3 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -467,6 +467,7 @@ class NeonEnvBuilder: initial_timeline: Optional[TimelineId] = None, pageserver_virtual_file_io_engine: Optional[str] = None, pageserver_aux_file_policy: Optional[AuxFileStore] = None, + pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]] = None, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override @@ -507,6 +508,14 @@ class NeonEnvBuilder: self.pageserver_virtual_file_io_engine: Optional[str] = pageserver_virtual_file_io_engine + self.pageserver_default_tenant_config_compaction_algorithm: Optional[ + Dict[str, Any] + ] = pageserver_default_tenant_config_compaction_algorithm + if 
self.pageserver_default_tenant_config_compaction_algorithm is not None: + log.debug( + f"Overriding pageserver default compaction algorithm to {self.pageserver_default_tenant_config_compaction_algorithm}" + ) + self.pageserver_get_vectored_impl: Optional[str] = None if os.getenv("PAGESERVER_GET_VECTORED_IMPL", "") == "vectored": self.pageserver_get_vectored_impl = "vectored" @@ -1103,6 +1112,11 @@ class NeonEnv: ps_cfg["get_impl"] = config.pageserver_get_impl if config.pageserver_validate_vectored_get is not None: ps_cfg["validate_vectored_get"] = config.pageserver_validate_vectored_get + if config.pageserver_default_tenant_config_compaction_algorithm is not None: + tenant_config = ps_cfg.setdefault("tenant_config", {}) + tenant_config[ + "compaction_algorithm" + ] = config.pageserver_default_tenant_config_compaction_algorithm if self.pageserver_remote_storage is not None: ps_cfg["remote_storage"] = remote_storage_to_toml_dict( @@ -1304,6 +1318,7 @@ def _shared_simple_env( pg_version: PgVersion, pageserver_virtual_file_io_engine: str, pageserver_aux_file_policy: Optional[AuxFileStore], + pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]], ) -> Iterator[NeonEnv]: """ # Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES @@ -1335,6 +1350,7 @@ def _shared_simple_env( test_output_dir=test_output_dir, pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine, pageserver_aux_file_policy=pageserver_aux_file_policy, + pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm, ) as builder: env = builder.init_start() @@ -1375,6 +1391,7 @@ def neon_env_builder( top_output_dir: Path, pageserver_virtual_file_io_engine: str, pageserver_aux_file_policy: Optional[AuxFileStore] = None, + pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]] = None, ) -> Iterator[NeonEnvBuilder]: """ Fixture to create a Neon environment for test. 
@@ -1409,6 +1426,7 @@ def neon_env_builder( test_output_dir=test_output_dir, test_overlay_dir=test_overlay_dir, pageserver_aux_file_policy=pageserver_aux_file_policy, + pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm, ) as builder: yield builder diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py index 77523a542b..0227285822 100644 --- a/test_runner/fixtures/parametrize.py +++ b/test_runner/fixtures/parametrize.py @@ -1,7 +1,8 @@ import os -from typing import Optional +from typing import Any, Dict, Optional import pytest +import toml from _pytest.python import Metafunc from fixtures.pg_version import PgVersion @@ -37,6 +38,20 @@ def pageserver_aux_file_policy() -> Optional[AuxFileStore]: return None +def get_pageserver_default_tenant_config_compaction_algorithm() -> Optional[Dict[str, Any]]: + toml_table = os.getenv("PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM") + if toml_table is None: + return None + v = toml.loads(toml_table) + assert isinstance(v, dict) + return v + + +@pytest.fixture(scope="function", autouse=True) +def pageserver_default_tenant_config_compaction_algorithm() -> Optional[Dict[str, Any]]: + return get_pageserver_default_tenant_config_compaction_algorithm() + + def pytest_generate_tests(metafunc: Metafunc): if (bt := os.getenv("BUILD_TYPE")) is None: build_types = ["debug", "release"] @@ -60,6 +75,16 @@ def pytest_generate_tests(metafunc: Metafunc): ): metafunc.parametrize("pageserver_virtual_file_io_engine", [io_engine]) + # Same hack for pageserver_default_tenant_config_compaction_algorithm + if ( + explicit_default := get_pageserver_default_tenant_config_compaction_algorithm() + ) is not None: + metafunc.parametrize( + "pageserver_default_tenant_config_compaction_algorithm", + [explicit_default], + ids=[explicit_default["kind"]], + ) + # For performance tests, parametrize also by platform if ( "test_runner/performance" in metafunc.definition._nodeid From 1a2a3cb446be40b7a1453876f0ea8d128c3435cd Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 10 May 2024 16:23:36 +0300 Subject: [PATCH 0781/1571] Add restart_lsn metric for logical slots. --- vm-image-spec.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index e9d983eba3..fa7cd014bf 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -293,6 +293,16 @@ files: values: [checkpoints_timed] query: | SELECT checkpoints_timed FROM pg_stat_bgwriter; + + # Number of slots is limited by max_replication_slots, so collecting position for all of them shouldn't be bad. + - metric_name: logical_slot_restart_lsn + type: gauge + help: 'restart_lsn of logical slots' + key_labels: + - slot_name + values: [restart_lsn] + query: | + select slot_name, restart_lsn from pg_replication_slots where slot_type = 'logical'; - filename: neon_collector_autoscaling.yml content: | collector_name: neon_collector_autoscaling From 438bacc32eb5d6888d0aec623006e6046b59299e Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 15 May 2024 12:29:12 +0100 Subject: [PATCH 0782/1571] CI(neon-extra-builds): Use small-arm64 runners instead of large-arm64 (#7740) ## Problem There are not enough arm runners and jobs in `neon-extra-builds` workflow take about the same amount of time on a small-arm runner as on large-arm. 
## Summary of changes - Switch `neon-extra-builds` workflow from `large-arm64` to `small-arm64` runners --- .github/actionlint.yml | 1 + .github/workflows/neon_extra_builds.yml | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 942861ecd8..37983798b7 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -5,6 +5,7 @@ self-hosted-runner: - large - large-arm64 - small + - small-arm64 - us-east-2 config-variables: - REMOTE_STORAGE_AZURE_CONTAINER diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index fdb03963fb..7d2187e59c 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -136,7 +136,7 @@ jobs: check-linux-arm-build: needs: [ check-permissions, build-build-tools-image ] timeout-minutes: 90 - runs-on: [ self-hosted, large-arm64 ] + runs-on: [ self-hosted, small-arm64 ] env: # Use release build only, to have less debug info around @@ -260,7 +260,7 @@ jobs: check-codestyle-rust-arm: needs: [ check-permissions, build-build-tools-image ] timeout-minutes: 90 - runs-on: [ self-hosted, large-arm64 ] + runs-on: [ self-hosted, small-arm64 ] container: image: ${{ needs.build-build-tools-image.outputs.image }} From f342b87f306f5a05cf91d8191833c2df9f6d4acd Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 15 May 2024 13:05:24 +0100 Subject: [PATCH 0783/1571] pageserver: remove Option<> around remote storage, clean up metadata file refs (#7752) ## Problem This is historical baggage from when the pageserver could be run with local disk only: we had a bunch of places where we had to treat remote storage as optional. Closes: https://github.com/neondatabase/neon/issues/6890 ## Changes - Remove Option<> around remote storage (in https://github.com/neondatabase/neon/pull/7722 we made remote storage clearly mandatory) - Remove code for deleting old metadata files: they're all gone now. - Remove other references to metadata files when loading directories, as none exist. I checked last 14 days of logs for "found legacy metadata", there are no instances. 
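To show the shape of the simplification, here is a minimal sketch with stand-in types (not the real `Timeline` or `RemoteTimelineClient`): once remote storage is mandatory, the `Option<>` unwrapping and the "Remote storage is mandatory" bail-out branches at call sites collapse into direct calls, which is the pattern repeated throughout the pageserver diff below.

```
// Illustrative stand-ins only.
struct RemoteClient;
impl RemoteClient {
    fn schedule_upload(&self) {
        println!("upload scheduled");
    }
}

// Before: remote storage was optional, so every caller branched on None.
struct TimelineBefore {
    remote_client: Option<RemoteClient>,
}
impl TimelineBefore {
    fn flush(&self) {
        if let Some(client) = &self.remote_client {
            client.schedule_upload();
        }
        // else: silently skip, or bail with "Remote storage is mandatory"
    }
}

// After: the client is always present, so callers just use it.
struct TimelineAfter {
    remote_client: RemoteClient,
}
impl TimelineAfter {
    fn flush(&self) {
        self.remote_client.schedule_upload();
    }
}

fn main() {
    TimelineBefore { remote_client: Some(RemoteClient) }.flush();
    TimelineAfter { remote_client: RemoteClient }.flush();
}
```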
--- pageserver/ctl/src/draw_timeline_dir.rs | 5 - pageserver/src/bin/pageserver.rs | 43 ++-- pageserver/src/deletion_queue.rs | 23 +- pageserver/src/http/routes.rs | 44 +--- pageserver/src/lib.rs | 10 +- pageserver/src/tenant.rs | 177 ++++++--------- pageserver/src/tenant/delete.rs | 55 ++--- pageserver/src/tenant/mgr.rs | 84 ++----- .../src/tenant/remote_timeline_client.rs | 4 +- pageserver/src/tenant/secondary/downloader.rs | 8 +- pageserver/src/tenant/storage_layer/layer.rs | 56 ++--- .../src/tenant/storage_layer/layer/tests.rs | 10 +- pageserver/src/tenant/timeline.rs | 210 +++++++----------- pageserver/src/tenant/timeline/compaction.rs | 12 +- pageserver/src/tenant/timeline/delete.rs | 40 ++-- .../src/tenant/timeline/detach_ancestor.rs | 27 +-- .../src/tenant/timeline/eviction_task.rs | 7 +- pageserver/src/tenant/timeline/init.rs | 7 +- test_runner/regress/test_broken_timeline.py | 20 +- 19 files changed, 285 insertions(+), 557 deletions(-) diff --git a/pageserver/ctl/src/draw_timeline_dir.rs b/pageserver/ctl/src/draw_timeline_dir.rs index 4dff8af1fc..389519c65a 100644 --- a/pageserver/ctl/src/draw_timeline_dir.rs +++ b/pageserver/ctl/src/draw_timeline_dir.rs @@ -52,7 +52,6 @@ use anyhow::{Context, Result}; use pageserver::repository::Key; -use pageserver::METADATA_FILE_NAME; use std::cmp::Ordering; use std::io::{self, BufRead}; use std::path::PathBuf; @@ -159,10 +158,6 @@ pub fn main() -> Result<()> { let line = PathBuf::from_str(&line).unwrap(); let filename = line.file_name().unwrap(); let filename = filename.to_str().unwrap(); - if filename == METADATA_FILE_NAME { - // Don't try and parse "metadata" like a key-lsn range - continue; - } let (key_range, lsn_range) = parse_filename(filename); files.push(Layer { filename: filename.to_owned(), diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 49f8a41b37..c0099aa704 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -383,7 +383,7 @@ fn start_pageserver( let shutdown_pageserver = tokio_util::sync::CancellationToken::new(); // Set up remote storage client - let remote_storage = Some(create_remote_storage_client(conf)?); + let remote_storage = create_remote_storage_client(conf)?; // Set up deletion queue let (deletion_queue, deletion_workers) = DeletionQueue::new( @@ -516,16 +516,12 @@ fn start_pageserver( } }); - let secondary_controller = if let Some(remote_storage) = &remote_storage { - secondary::spawn_tasks( - tenant_manager.clone(), - remote_storage.clone(), - background_jobs_barrier.clone(), - shutdown_pageserver.clone(), - ) - } else { - secondary::null_controller() - }; + let secondary_controller = secondary::spawn_tasks( + tenant_manager.clone(), + remote_storage.clone(), + background_jobs_barrier.clone(), + shutdown_pageserver.clone(), + ); // shared state between the disk-usage backed eviction background task and the http endpoint // that allows triggering disk-usage based eviction manually. note that the http endpoint @@ -533,15 +529,13 @@ fn start_pageserver( // been configured. 
let disk_usage_eviction_state: Arc = Arc::default(); - if let Some(remote_storage) = &remote_storage { - launch_disk_usage_global_eviction_task( - conf, - remote_storage.clone(), - disk_usage_eviction_state.clone(), - tenant_manager.clone(), - background_jobs_barrier.clone(), - )?; - } + launch_disk_usage_global_eviction_task( + conf, + remote_storage.clone(), + disk_usage_eviction_state.clone(), + tenant_manager.clone(), + background_jobs_barrier.clone(), + )?; // Start up the service to handle HTTP mgmt API request. We created the // listener earlier already. @@ -693,14 +687,7 @@ fn start_pageserver( // Right now that tree doesn't reach very far, and `task_mgr` is used instead. // The plan is to change that over time. shutdown_pageserver.take(); - let bg_remote_storage = remote_storage.clone(); - let bg_deletion_queue = deletion_queue.clone(); - pageserver::shutdown_pageserver( - &tenant_manager, - bg_remote_storage.map(|_| bg_deletion_queue), - 0, - ) - .await; + pageserver::shutdown_pageserver(&tenant_manager, deletion_queue.clone(), 0).await; unreachable!() }) } diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index c937309d83..8790a9b0a8 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -632,7 +632,7 @@ impl DeletionQueue { /// /// If remote_storage is None, then the returned workers will also be None. pub fn new( - remote_storage: Option, + remote_storage: GenericRemoteStorage, control_plane_client: Option, conf: &'static PageServerConf, ) -> (Self, Option>) @@ -658,23 +658,6 @@ impl DeletionQueue { // longer to flush after Tenants have all been torn down. let cancel = CancellationToken::new(); - let remote_storage = match remote_storage { - None => { - return ( - Self { - client: DeletionQueueClient { - tx, - executor_tx, - lsn_table: lsn_table.clone(), - }, - cancel, - }, - None, - ) - } - Some(r) => r, - }; - ( Self { client: DeletionQueueClient { @@ -765,7 +748,7 @@ mod test { /// Simulate a pageserver restart by destroying and recreating the deletion queue async fn restart(&mut self) { let (deletion_queue, workers) = DeletionQueue::new( - Some(self.storage.clone()), + self.storage.clone(), Some(self.mock_control_plane.clone()), self.harness.conf, ); @@ -875,7 +858,7 @@ mod test { let mock_control_plane = MockControlPlane::new(); let (deletion_queue, worker) = DeletionQueue::new( - Some(storage.clone()), + storage.clone(), Some(mock_control_plane.clone()), harness.conf, ); diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 2370561756..0a98d32f02 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -104,7 +104,7 @@ pub struct State { tenant_manager: Arc, auth: Option>, allowlist_routes: Vec, - remote_storage: Option, + remote_storage: GenericRemoteStorage, broker_client: storage_broker::BrokerClientChannel, disk_usage_eviction_state: Arc, deletion_queue_client: DeletionQueueClient, @@ -118,7 +118,7 @@ impl State { conf: &'static PageServerConf, tenant_manager: Arc, auth: Option>, - remote_storage: Option, + remote_storage: GenericRemoteStorage, broker_client: storage_broker::BrokerClientChannel, disk_usage_eviction_state: Arc, deletion_queue_client: DeletionQueueClient, @@ -813,12 +813,6 @@ async fn tenant_attach_handler( let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?; - if state.remote_storage.is_none() { - return Err(ApiError::BadRequest(anyhow!( - "attach_tenant is not possible because pageserver was 
configured without remote storage" - ))); - } - let tenant_shard_id = TenantShardId::unsharded(tenant_id); let shard_params = ShardParameters::default(); let location_conf = LocationConf::attached_single(tenant_conf, generation, &shard_params); @@ -1643,12 +1637,6 @@ async fn tenant_time_travel_remote_storage_handler( ))); } - let Some(storage) = state.remote_storage.as_ref() else { - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "remote storage not configured, cannot run time travel" - ))); - }; - if timestamp > done_if_after { return Err(ApiError::BadRequest(anyhow!( "The done_if_after timestamp comes before the timestamp to recover to" @@ -1658,7 +1646,7 @@ async fn tenant_time_travel_remote_storage_handler( tracing::info!("Issuing time travel request internally. timestamp={timestamp_raw}, done_if_after={done_if_after_raw}"); remote_timeline_client::upload::time_travel_recover_tenant( - storage, + &state.remote_storage, &tenant_shard_id, timestamp, done_if_after, @@ -1903,11 +1891,6 @@ async fn deletion_queue_flush( ) -> Result, ApiError> { let state = get_state(&r); - if state.remote_storage.is_none() { - // Nothing to do if remote storage is disabled. - return json_response(StatusCode::OK, ()); - } - let execute = parse_query_param(&r, "execute")?.unwrap_or(false); let flush = async { @@ -2072,18 +2055,11 @@ async fn disk_usage_eviction_run( }; let state = get_state(&r); - - let Some(storage) = state.remote_storage.as_ref() else { - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "remote storage not configured, cannot run eviction iteration" - ))); - }; - let eviction_state = state.disk_usage_eviction_state.clone(); let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl( &eviction_state, - storage, + &state.remote_storage, usage, &state.tenant_manager, config.eviction_order, @@ -2120,29 +2096,23 @@ async fn tenant_scan_remote_handler( let state = get_state(&request); let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; - let Some(remote_storage) = state.remote_storage.as_ref() else { - return Err(ApiError::BadRequest(anyhow::anyhow!( - "Remote storage not configured" - ))); - }; - let mut response = TenantScanRemoteStorageResponse::default(); let (shards, _other_keys) = - list_remote_tenant_shards(remote_storage, tenant_id, cancel.clone()) + list_remote_tenant_shards(&state.remote_storage, tenant_id, cancel.clone()) .await .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; for tenant_shard_id in shards { let (timeline_ids, _other_keys) = - list_remote_timelines(remote_storage, tenant_shard_id, cancel.clone()) + list_remote_timelines(&state.remote_storage, tenant_shard_id, cancel.clone()) .await .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; let mut generation = Generation::none(); for timeline_id in timeline_ids { match download_index_part( - remote_storage, + &state.remote_storage, &tenant_shard_id, &timeline_id, Generation::MAX, diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 930700e50c..c69fb8c83b 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -57,7 +57,7 @@ pub use crate::metrics::preinitialize_metrics; #[tracing::instrument(skip_all, fields(%exit_code))] pub async fn shutdown_pageserver( tenant_manager: &TenantManager, - deletion_queue: Option, + mut deletion_queue: DeletionQueue, exit_code: i32, ) { use std::time::Duration; @@ -89,9 +89,7 @@ pub async fn shutdown_pageserver( .await; // Best effort to persist any outstanding deletions, to 
avoid leaking objects - if let Some(mut deletion_queue) = deletion_queue { - deletion_queue.shutdown(Duration::from_secs(5)).await; - } + deletion_queue.shutdown(Duration::from_secs(5)).await; // Shut down the HTTP endpoint last, so that you can still check the server's // status while it's shutting down. @@ -114,10 +112,6 @@ pub async fn shutdown_pageserver( std::process::exit(exit_code); } -/// The name of the metadata file pageserver creates per timeline. -/// Full path: `tenants//timelines//metadata`. -pub const METADATA_FILE_NAME: &str = "metadata"; - /// Per-tenant configuration file. /// Full path: `tenants//config`. pub(crate) const TENANT_CONFIG_NAME: &str = "config"; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 80d354d79e..026cbc107c 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -190,7 +190,7 @@ pub const TENANT_DELETED_MARKER_FILE_NAME: &str = "deleted"; #[derive(Clone)] pub struct TenantSharedResources { pub broker_client: storage_broker::BrokerClientChannel, - pub remote_storage: Option, + pub remote_storage: GenericRemoteStorage, pub deletion_queue_client: DeletionQueueClient, } @@ -292,7 +292,7 @@ pub struct Tenant { walredo_mgr: Option>, // provides access to timeline data sitting in the remote storage - pub(crate) remote_storage: Option, + pub(crate) remote_storage: GenericRemoteStorage, // Access to global deletion queue for when this tenant wants to schedule a deletion deletion_queue_client: DeletionQueueClient, @@ -551,21 +551,22 @@ impl Tenant { ); if let Some(index_part) = index_part.as_ref() { - timeline - .remote_client - .as_ref() - .unwrap() - .init_upload_queue(index_part)?; - } else if self.remote_storage.is_some() { + timeline.remote_client.init_upload_queue(index_part)?; + } else { // No data on the remote storage, but we have local metadata file. We can end up // here with timeline_create being interrupted before finishing index part upload. // By doing what we do here, the index part upload is retried. // If control plane retries timeline creation in the meantime, the mgmt API handler // for timeline creation will coalesce on the upload we queue here. + // FIXME: this branch should be dead code as we no longer write local metadata. - let rtc = timeline.remote_client.as_ref().unwrap(); - rtc.init_upload_queue_for_empty_remote(&metadata)?; - rtc.schedule_index_upload_for_full_metadata_update(&metadata)?; + + timeline + .remote_client + .init_upload_queue_for_empty_remote(&metadata)?; + timeline + .remote_client + .schedule_index_upload_for_full_metadata_update(&metadata)?; } timeline @@ -777,14 +778,14 @@ impl Tenant { AttachType::Normal }; - let preload = match (&mode, &remote_storage) { - (SpawnMode::Create, _) => { + let preload = match &mode { + SpawnMode::Create => { None }, - (SpawnMode::Eager | SpawnMode::Lazy, Some(remote_storage)) => { + SpawnMode::Eager | SpawnMode::Lazy => { let _preload_timer = TENANT.preload.start_timer(); let res = tenant_clone - .preload(remote_storage, task_mgr::shutdown_token()) + .preload(&remote_storage, task_mgr::shutdown_token()) .await; match res { Ok(p) => Some(p), @@ -794,10 +795,7 @@ impl Tenant { } } } - (_, None) => { - let _preload_timer = TENANT.preload.start_timer(); - None - } + }; // Remote preload is complete. 
@@ -1021,7 +1019,7 @@ impl Tenant { index_part, remote_metadata, TimelineResources { - remote_client: Some(remote_client), + remote_client, deletion_queue_client: self.deletion_queue_client.clone(), timeline_get_throttle: self.timeline_get_throttle.clone(), }, @@ -1047,7 +1045,7 @@ impl Tenant { Arc::clone(self), timeline_id, &index_part.metadata, - Some(remote_timeline_client), + remote_timeline_client, self.deletion_queue_client.clone(), ) .instrument(tracing::info_span!("timeline_delete", %timeline_id)) @@ -1139,9 +1137,7 @@ impl Tenant { let mut size = 0; for timeline in self.list_timelines() { - if let Some(remote_client) = &timeline.remote_client { - size += remote_client.get_remote_physical_size(); - } + size += timeline.remote_client.get_remote_physical_size(); } size @@ -1191,6 +1187,7 @@ impl Tenant { pub fn create_broken_tenant( conf: &'static PageServerConf, tenant_shard_id: TenantShardId, + remote_storage: GenericRemoteStorage, reason: String, ) -> Arc { Arc::new(Tenant::new( @@ -1205,7 +1202,7 @@ impl Tenant { ShardIdentity::broken(tenant_shard_id.shard_number, tenant_shard_id.shard_count), None, tenant_shard_id, - None, + remote_storage, DeletionQueueClient::broken(), )) } @@ -1398,13 +1395,7 @@ impl Tenant { tline.freeze_and_flush().await.context("freeze_and_flush")?; // Make sure the freeze_and_flush reaches remote storage. - tline - .remote_client - .as_ref() - .unwrap() - .wait_completion() - .await - .unwrap(); + tline.remote_client.wait_completion().await.unwrap(); let tl = uninit_tl.finish_creation()?; // The non-test code would call tl.activate() here. @@ -1470,20 +1461,19 @@ impl Tenant { return Err(CreateTimelineError::Conflict); } - if let Some(remote_client) = existing.remote_client.as_ref() { - // Wait for uploads to complete, so that when we return Ok, the timeline - // is known to be durable on remote storage. Just like we do at the end of - // this function, after we have created the timeline ourselves. - // - // We only really care that the initial version of `index_part.json` has - // been uploaded. That's enough to remember that the timeline - // exists. However, there is no function to wait specifically for that so - // we just wait for all in-progress uploads to finish. - remote_client - .wait_completion() - .await - .context("wait for timeline uploads to complete")?; - } + // Wait for uploads to complete, so that when we return Ok, the timeline + // is known to be durable on remote storage. Just like we do at the end of + // this function, after we have created the timeline ourselves. + // + // We only really care that the initial version of `index_part.json` has + // been uploaded. That's enough to remember that the timeline + // exists. However, there is no function to wait specifically for that so + // we just wait for all in-progress uploads to finish. + existing + .remote_client + .wait_completion() + .await + .context("wait for timeline uploads to complete")?; return Ok(existing); } @@ -1559,14 +1549,14 @@ impl Tenant { // the timeline is visible in [`Self::timelines`], but it is _not_ durable yet. We must // not send a success to the caller until it is. The same applies to handling retries, // see the handling of [`TimelineExclusionError::AlreadyExists`] above. 
- if let Some(remote_client) = loaded_timeline.remote_client.as_ref() { - let kind = ancestor_timeline_id - .map(|_| "branched") - .unwrap_or("bootstrapped"); - remote_client.wait_completion().await.with_context(|| { - format!("wait for {} timeline initial uploads to complete", kind) - })?; - } + let kind = ancestor_timeline_id + .map(|_| "branched") + .unwrap_or("bootstrapped"); + loaded_timeline + .remote_client + .wait_completion() + .await + .with_context(|| format!("wait for {} timeline initial uploads to complete", kind))?; loaded_timeline.activate(self.clone(), broker_client, None, ctx); @@ -2161,32 +2151,26 @@ impl Tenant { ) -> anyhow::Result<()> { let timelines = self.timelines.lock().unwrap().clone(); for timeline in timelines.values() { - let Some(tl_client) = &timeline.remote_client else { - anyhow::bail!("Remote storage is mandatory"); - }; - - let Some(remote_storage) = &self.remote_storage else { - anyhow::bail!("Remote storage is mandatory"); - }; - // We do not block timeline creation/deletion during splits inside the pageserver: it is up to higher levels // to ensure that they do not start a split if currently in the process of doing these. // Upload an index from the parent: this is partly to provide freshness for the // child tenants that will copy it, and partly for general ease-of-debugging: there will // always be a parent shard index in the same generation as we wrote the child shard index. - tl_client.schedule_index_upload_for_file_changes()?; - tl_client.wait_completion().await?; + timeline + .remote_client + .schedule_index_upload_for_file_changes()?; + timeline.remote_client.wait_completion().await?; // Shut down the timeline's remote client: this means that the indices we write // for child shards will not be invalidated by the parent shard deleting layers. - tl_client.shutdown().await; + timeline.remote_client.shutdown().await; // Download methods can still be used after shutdown, as they don't flow through the remote client's // queue. In principal the RemoteTimelineClient could provide this without downloading it, but this // operation is rare, so it's simpler to just download it (and robustly guarantees that the index // we use here really is the remotely persistent one). - let result = tl_client + let result = timeline.remote_client .download_index_file(&self.cancel) .instrument(info_span!("download_index_file", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id)) .await?; @@ -2199,7 +2183,7 @@ impl Tenant { for child_shard in child_shards { upload_index_part( - remote_storage, + &self.remote_storage, child_shard, &timeline.timeline_id, self.generation, @@ -2475,7 +2459,7 @@ impl Tenant { shard_identity: ShardIdentity, walredo_mgr: Option>, tenant_shard_id: TenantShardId, - remote_storage: Option, + remote_storage: GenericRemoteStorage, deletion_queue_client: DeletionQueueClient, ) -> Tenant { let (state, mut rx) = watch::channel(state); @@ -3119,11 +3103,10 @@ impl Tenant { // We still need to upload its metadata eagerly: if other nodes `attach` the tenant and miss this timeline, their GC // could get incorrect information and remove more layers, than needed. 
// See also https://github.com/neondatabase/neon/issues/3865 - if let Some(remote_client) = new_timeline.remote_client.as_ref() { - remote_client - .schedule_index_upload_for_full_metadata_update(&metadata) - .context("branch initial metadata upload")?; - } + new_timeline + .remote_client + .schedule_index_upload_for_full_metadata_update(&metadata) + .context("branch initial metadata upload")?; Ok(new_timeline) } @@ -3155,11 +3138,6 @@ impl Tenant { pgdata_path: &Utf8PathBuf, timeline_id: &TimelineId, ) -> anyhow::Result<()> { - let Some(storage) = &self.remote_storage else { - // No remote storage? No upload. - return Ok(()); - }; - let temp_path = timelines_path.join(format!( "{INITDB_PATH}.upload-{timeline_id}.{TEMP_FILE_SUFFIX}" )); @@ -3183,7 +3161,7 @@ impl Tenant { backoff::retry( || async { self::remote_timeline_client::upload_initdb_dir( - storage, + &self.remote_storage, &self.tenant_shard_id.tenant_id, timeline_id, pgdata_zstd.try_clone().await?, @@ -3240,9 +3218,6 @@ impl Tenant { } } if let Some(existing_initdb_timeline_id) = load_existing_initdb { - let Some(storage) = &self.remote_storage else { - bail!("no storage configured but load_existing_initdb set to {existing_initdb_timeline_id}"); - }; if existing_initdb_timeline_id != timeline_id { let source_path = &remote_initdb_archive_path( &self.tenant_shard_id.tenant_id, @@ -3252,7 +3227,7 @@ impl Tenant { &remote_initdb_archive_path(&self.tenant_shard_id.tenant_id, &timeline_id); // if this fails, it will get retried by retried control plane requests - storage + self.remote_storage .copy_object(source_path, dest_path, &self.cancel) .await .context("copy initdb tar")?; @@ -3260,7 +3235,7 @@ impl Tenant { let (initdb_tar_zst_path, initdb_tar_zst) = self::remote_timeline_client::download_initdb_tar_zst( self.conf, - storage, + &self.remote_storage, &self.tenant_shard_id, &existing_initdb_timeline_id, &self.cancel, @@ -3355,20 +3330,14 @@ impl Tenant { /// Call this before constructing a timeline, to build its required structures fn build_timeline_resources(&self, timeline_id: TimelineId) -> TimelineResources { - let remote_client = if let Some(remote_storage) = self.remote_storage.as_ref() { - let remote_client = RemoteTimelineClient::new( - remote_storage.clone(), - self.deletion_queue_client.clone(), - self.conf, - self.tenant_shard_id, - timeline_id, - self.generation, - ); - Some(remote_client) - } else { - None - }; - + let remote_client = RemoteTimelineClient::new( + self.remote_storage.clone(), + self.deletion_queue_client.clone(), + self.conf, + self.tenant_shard_id, + timeline_id, + self.generation, + ); TimelineResources { remote_client, deletion_queue_client: self.deletion_queue_client.clone(), @@ -3392,9 +3361,9 @@ impl Tenant { let tenant_shard_id = self.tenant_shard_id; let resources = self.build_timeline_resources(new_timeline_id); - if let Some(remote_client) = &resources.remote_client { - remote_client.init_upload_queue_for_empty_remote(new_metadata)?; - } + resources + .remote_client + .init_upload_queue_for_empty_remote(new_metadata)?; let timeline_struct = self .create_timeline_struct( @@ -3562,9 +3531,7 @@ impl Tenant { tracing::info!(timeline_id=%timeline.timeline_id, "Flushing..."); timeline.freeze_and_flush().await?; tracing::info!(timeline_id=%timeline.timeline_id, "Waiting for uploads..."); - if let Some(client) = &timeline.remote_client { - client.wait_completion().await?; - } + timeline.remote_client.wait_completion().await?; Ok(()) } @@ -3878,7 +3845,7 @@ pub(crate) mod harness { 
ShardIdentity::unsharded(), Some(walredo_mgr), self.tenant_shard_id, - Some(self.remote_storage.clone()), + self.remote_storage.clone(), self.deletion_queue.new_client(), )); diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index 2e5259bfe2..3173a33dad 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -181,25 +181,23 @@ async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), Del async fn remove_tenant_remote_delete_mark( conf: &PageServerConf, - remote_storage: Option<&GenericRemoteStorage>, + remote_storage: &GenericRemoteStorage, tenant_shard_id: &TenantShardId, cancel: &CancellationToken, ) -> Result<(), DeleteTenantError> { - if let Some(remote_storage) = remote_storage { - let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?; - backoff::retry( - || async { remote_storage.delete(&path, cancel).await }, - TimeoutOrCancel::caused_by_cancel, - FAILED_UPLOAD_WARN_THRESHOLD, - FAILED_REMOTE_OP_RETRIES, - "remove_tenant_remote_delete_mark", - cancel, - ) - .await - .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) - .and_then(|x| x) - .context("remove_tenant_remote_delete_mark")?; - } + let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?; + backoff::retry( + || async { remote_storage.delete(&path, cancel).await }, + TimeoutOrCancel::caused_by_cancel, + FAILED_UPLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + "remove_tenant_remote_delete_mark", + cancel, + ) + .await + .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) + .and_then(|x| x) + .context("remove_tenant_remote_delete_mark")?; Ok(()) } @@ -297,7 +295,7 @@ impl DeleteTenantFlow { #[instrument(skip_all)] pub(crate) async fn run( conf: &'static PageServerConf, - remote_storage: Option, + remote_storage: GenericRemoteStorage, tenants: &'static std::sync::RwLock, tenant: Arc, cancel: &CancellationToken, @@ -308,9 +306,7 @@ impl DeleteTenantFlow { let mut guard = Self::prepare(&tenant).await?; - if let Err(e) = - Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant, cancel).await - { + if let Err(e) = Self::run_inner(&mut guard, conf, &remote_storage, &tenant, cancel).await { tenant.set_broken(format!("{e:#}")).await; return Err(e); } @@ -327,7 +323,7 @@ impl DeleteTenantFlow { async fn run_inner( guard: &mut OwnedMutexGuard, conf: &'static PageServerConf, - remote_storage: Option<&GenericRemoteStorage>, + remote_storage: &GenericRemoteStorage, tenant: &Tenant, cancel: &CancellationToken, ) -> Result<(), DeleteTenantError> { @@ -339,14 +335,9 @@ impl DeleteTenantFlow { ))? }); - // IDEA: implement detach as delete without remote storage. Then they would use the same lock (deletion_progress) so wont contend. - // Though sounds scary, different mark name? - // Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state. - if let Some(remote_storage) = &remote_storage { - create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel) - .await - .context("remote_mark")? 
- } + create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel) + .await + .context("remote_mark")?; fail::fail_point!("tenant-delete-before-create-local-mark", |_| { Err(anyhow::anyhow!( @@ -483,7 +474,7 @@ impl DeleteTenantFlow { fn schedule_background( guard: OwnedMutexGuard, conf: &'static PageServerConf, - remote_storage: Option, + remote_storage: GenericRemoteStorage, tenants: &'static std::sync::RwLock, tenant: Arc, ) { @@ -512,7 +503,7 @@ impl DeleteTenantFlow { async fn background( mut guard: OwnedMutexGuard, conf: &PageServerConf, - remote_storage: Option, + remote_storage: GenericRemoteStorage, tenants: &'static std::sync::RwLock, tenant: &Arc, ) -> Result<(), DeleteTenantError> { @@ -551,7 +542,7 @@ impl DeleteTenantFlow { remove_tenant_remote_delete_mark( conf, - remote_storage.as_ref(), + &remote_storage, &tenant.tenant_shard_id, &task_mgr::shutdown_token(), ) diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 7a3e36bf02..5abda7b64e 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -47,7 +47,7 @@ use crate::tenant::span::debug_assert_current_span_has_tenant_id; use crate::tenant::storage_layer::inmemory_layer; use crate::tenant::timeline::ShutdownMode; use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState}; -use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TEMP_FILE_SUFFIX}; +use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX}; use utils::crashsafe::path_with_suffix_extension; use utils::fs_ext::PathExt; @@ -391,22 +391,17 @@ async fn init_load_generations( // deletion list entries may still be valid. We provide that by pushing a recovery operation into // the queue. Sequential processing of te queue ensures that recovery is done before any new tenant deletions // are processed, even though we don't block on recovery completing here. - // - // Must only do this if remote storage is enabled, otherwise deletion queue - // is not running and channel push will fail. - if resources.remote_storage.is_some() { - let attached_tenants = generations - .iter() - .flat_map(|(id, start_mode)| { - match start_mode { - TenantStartupMode::Attached((_mode, generation)) => Some(generation), - TenantStartupMode::Secondary => None, - } - .map(|gen| (*id, *gen)) - }) - .collect(); - resources.deletion_queue_client.recover(attached_tenants)?; - } + let attached_tenants = generations + .iter() + .flat_map(|(id, start_mode)| { + match start_mode { + TenantStartupMode::Attached((_mode, generation)) => Some(generation), + TenantStartupMode::Secondary => None, + } + .map(|gen| (*id, *gen)) + }) + .collect(); + resources.deletion_queue_client.recover(attached_tenants)?; Ok(Some(generations)) } @@ -460,53 +455,6 @@ fn load_tenant_config( } }; - // Clean up legacy `metadata` files. - // Doing it here because every single tenant directory is visited here. - // In any later code, there's different treatment of tenant dirs - // ... depending on whether the tenant is in re-attach response or not - // ... 
epending on whether the tenant is ignored or not - assert_eq!( - &conf.tenant_path(&tenant_shard_id), - &tenant_dir_path, - "later use of conf....path() methods would be dubious" - ); - let timelines: Vec = match conf.timelines_path(&tenant_shard_id).read_dir_utf8() { - Ok(iter) => { - let mut timelines = Vec::new(); - for res in iter { - let p = res?; - let Some(timeline_id) = p.file_name().parse::().ok() else { - // skip any entries that aren't TimelineId, such as - // - *.___temp dirs - // - unfinished initdb uploads (test_non_uploaded_root_timeline_is_deleted_after_restart) - continue; - }; - timelines.push(timeline_id); - } - timelines - } - Err(e) if e.kind() == std::io::ErrorKind::NotFound => vec![], - Err(e) => return Err(anyhow::anyhow!(e)), - }; - for timeline_id in timelines { - let timeline_path = &conf.timeline_path(&tenant_shard_id, &timeline_id); - let metadata_path = timeline_path.join(METADATA_FILE_NAME); - match std::fs::remove_file(&metadata_path) { - Ok(()) => { - crashsafe::fsync(timeline_path) - .context("fsync timeline dir after removing legacy metadata file")?; - info!("removed legacy metadata file at {metadata_path}"); - } - Err(e) if e.kind() == std::io::ErrorKind::NotFound => { - // something removed the file earlier, or it was never there - // We don't care, this software version doesn't write it again, so, we're good. - } - Err(e) => { - anyhow::bail!("remove legacy metadata file: {e}: {metadata_path}"); - } - } - } - let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME); if tenant_ignore_mark_file.exists() { info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant"); @@ -611,6 +559,7 @@ pub async fn init_tenant_mgr( TenantSlot::Attached(Tenant::create_broken_tenant( conf, tenant_shard_id, + resources.remote_storage.clone(), format!("{}", e), )), ); @@ -803,6 +752,7 @@ fn tenant_spawn( "Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}" ); + let remote_storage = resources.remote_storage.clone(); let tenant = match Tenant::spawn( conf, tenant_shard_id, @@ -817,7 +767,7 @@ fn tenant_spawn( Ok(tenant) => tenant, Err(e) => { error!("Failed to spawn tenant {tenant_shard_id}, reason: {e:#}"); - Tenant::create_broken_tenant(conf, tenant_shard_id, format!("{e:#}")) + Tenant::create_broken_tenant(conf, tenant_shard_id, remote_storage, format!("{e:#}")) } }; @@ -2276,7 +2226,7 @@ pub(crate) async fn load_tenant( tenant_id: TenantId, generation: Generation, broker_client: storage_broker::BrokerClientChannel, - remote_storage: Option, + remote_storage: GenericRemoteStorage, deletion_queue_client: DeletionQueueClient, ctx: &RequestContext, ) -> Result<(), TenantMapInsertError> { @@ -2937,7 +2887,7 @@ pub(crate) async fn immediate_gc( } let timeline = tenant.get_timeline(timeline_id, false).ok(); - let rtc = timeline.as_ref().and_then(|x| x.remote_client.as_ref()); + let rtc = timeline.as_ref().map(|x| &x.remote_client); if let Some(rtc) = rtc { // layer drops schedule actions on remote timeline client to actually do the diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 630ade5c13..c5462dac43 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -2137,7 +2137,7 @@ mod tests { tenant_ctx: _tenant_ctx, } = test_setup; - let client = timeline.remote_client.as_ref().unwrap(); + let client = &timeline.remote_client; // Download back the index.json, and check that the list of files is 
correct let initial_index_part = match client @@ -2328,7 +2328,7 @@ mod tests { timeline, .. } = TestSetup::new("metrics").await.unwrap(); - let client = timeline.remote_client.as_ref().unwrap(); + let client = &timeline.remote_client; let layer_file_name_1: LayerName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); let local_path = local_layer_path( diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index c28e041fa2..46a3d7e81f 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -26,7 +26,7 @@ use crate::{ tasks::{warn_when_period_overrun, BackgroundLoopKind}, }, virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}, - METADATA_FILE_NAME, TEMP_FILE_SUFFIX, + TEMP_FILE_SUFFIX, }; use super::{ @@ -1074,11 +1074,7 @@ async fn init_timeline_state( .fatal_err(&format!("Read metadata on {}", file_path)); let file_name = file_path.file_name().expect("created it from the dentry"); - if file_name == METADATA_FILE_NAME { - // Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant. - warn!(path=?dentry.path(), "found legacy metadata file, these should have been removed in load_tenant_config"); - continue; - } else if crate::is_temporary(&file_path) + if crate::is_temporary(&file_path) || is_temp_download_file(&file_path) || is_ephemeral_file(file_name) { diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index b6f7702247..e8c712c4c6 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -585,9 +585,6 @@ struct LayerInner { /// [`Timeline::gate`] at the same time. timeline: Weak, - /// Cached knowledge of [`Timeline::remote_client`] being `Some`. - have_remote_client: bool, - access_stats: LayerAccessStats, /// This custom OnceCell is backed by std mutex, but only held for short time periods. @@ -732,23 +729,23 @@ impl Drop for LayerInner { if removed { timeline.metrics.resident_physical_size_sub(file_size); } - if let Some(remote_client) = timeline.remote_client.as_ref() { - let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, meta)]); + let res = timeline + .remote_client + .schedule_deletion_of_unlinked(vec![(file_name, meta)]); - if let Err(e) = res { - // test_timeline_deletion_with_files_stuck_in_upload_queue is good at - // demonstrating this deadlock (without spawn_blocking): stop will drop - // queued items, which will have ResidentLayer's, and those drops would try - // to re-entrantly lock the RemoteTimelineClient inner state. - if !timeline.is_active() { - tracing::info!("scheduling deletion on drop failed: {e:#}"); - } else { - tracing::warn!("scheduling deletion on drop failed: {e:#}"); - } - LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed); + if let Err(e) = res { + // test_timeline_deletion_with_files_stuck_in_upload_queue is good at + // demonstrating this deadlock (without spawn_blocking): stop will drop + // queued items, which will have ResidentLayer's, and those drops would try + // to re-entrantly lock the RemoteTimelineClient inner state. 
+ if !timeline.is_active() { + tracing::info!("scheduling deletion on drop failed: {e:#}"); } else { - LAYER_IMPL_METRICS.inc_completed_deletes(); + tracing::warn!("scheduling deletion on drop failed: {e:#}"); } + LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed); + } else { + LAYER_IMPL_METRICS.inc_completed_deletes(); } }); } @@ -786,7 +783,6 @@ impl LayerInner { path: local_path, desc, timeline: Arc::downgrade(timeline), - have_remote_client: timeline.remote_client.is_some(), access_stats, wanted_deleted: AtomicBool::new(false), inner, @@ -815,8 +811,6 @@ impl LayerInner { /// in a new attempt to evict OR join the previously started attempt. #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, ret, err(level = tracing::Level::DEBUG), fields(layer=%self))] pub(crate) async fn evict_and_wait(&self, timeout: Duration) -> Result<(), EvictionError> { - assert!(self.have_remote_client); - let mut rx = self.status.as_ref().unwrap().subscribe(); { @@ -973,10 +967,6 @@ impl LayerInner { return Err(DownloadError::NotFile(ft)); } - if timeline.remote_client.as_ref().is_none() { - return Err(DownloadError::NoRemoteStorage); - } - if let Some(ctx) = ctx { self.check_expected_download(ctx)?; } @@ -1113,12 +1103,8 @@ impl LayerInner { permit: heavier_once_cell::InitPermit, ctx: &RequestContext, ) -> anyhow::Result> { - let client = timeline + let result = timeline .remote_client - .as_ref() - .expect("checked before download_init_and_wait"); - - let result = client .download_layer_file( &self.desc.layer_name(), &self.metadata(), @@ -1293,20 +1279,10 @@ impl LayerInner { /// `DownloadedLayer` is being dropped, so it calls this method. fn on_downloaded_layer_drop(self: Arc, only_version: usize) { - let can_evict = self.have_remote_client; - // we cannot know without inspecting LayerInner::inner if we should evict or not, even // though here it is very likely let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, version=%only_version); - if !can_evict { - // it would be nice to assert this case out, but we are in drop - span.in_scope(|| { - tracing::error!("bug in struct Layer: ResidentOrWantedEvicted has been downgraded while we have no remote storage"); - }); - return; - } - // NOTE: this scope *must* never call `self.inner.get` because evict_and_wait might // drop while the `self.inner` is being locked, leading to a deadlock. 
@@ -1578,8 +1554,6 @@ pub(crate) enum EvictionError { pub(crate) enum DownloadError { #[error("timeline has already shutdown")] TimelineShutdown, - #[error("no remote storage configured")] - NoRemoteStorage, #[error("context denies downloading")] ContextAndConfigReallyDeniesDownloads, #[error("downloading is really required but not allowed by this method")] diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index 52f62faa8d..fa9142d5e9 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -145,7 +145,7 @@ async fn smoke_test() { .await .expect("the local layer file still exists"); - let rtc = timeline.remote_client.as_ref().unwrap(); + let rtc = &timeline.remote_client; { let layers = &[layer]; @@ -761,13 +761,7 @@ async fn eviction_cancellation_on_drop() { timeline.freeze_and_flush().await.unwrap(); // wait for the upload to complete so our Arc::strong_count assertion holds - timeline - .remote_client - .as_ref() - .unwrap() - .wait_completion() - .await - .unwrap(); + timeline.remote_client.wait_completion().await.unwrap(); let (evicted_layer, not_evicted) = { let mut layers = { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index ca34b4fadc..df9bc9b35b 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -200,7 +200,7 @@ fn drop_wlock(rlock: tokio::sync::RwLockWriteGuard<'_, T>) { /// The outward-facing resources required to build a Timeline pub struct TimelineResources { - pub remote_client: Option, + pub remote_client: RemoteTimelineClient, pub deletion_queue_client: DeletionQueueClient, pub timeline_get_throttle: Arc< crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>, @@ -272,7 +272,7 @@ pub struct Timeline { /// Remote storage client. /// See [`remote_timeline_client`](super::remote_timeline_client) module comment for details. - pub remote_client: Option>, + pub remote_client: Arc, // What page versions do we hold in the repository? If we get a // request > last_record_lsn, we need to wait until we receive all @@ -1375,22 +1375,14 @@ impl Timeline { /// not validated with control plane yet. /// See [`Self::get_remote_consistent_lsn_visible`]. pub(crate) fn get_remote_consistent_lsn_projected(&self) -> Option { - if let Some(remote_client) = &self.remote_client { - remote_client.remote_consistent_lsn_projected() - } else { - None - } + self.remote_client.remote_consistent_lsn_projected() } /// remote_consistent_lsn which the tenant is guaranteed not to go backward from, /// i.e. a value of remote_consistent_lsn_projected which has undergone /// generation validation in the deletion queue. pub(crate) fn get_remote_consistent_lsn_visible(&self) -> Option { - if let Some(remote_client) = &self.remote_client { - remote_client.remote_consistent_lsn_visible() - } else { - None - } + self.remote_client.remote_consistent_lsn_visible() } /// The sum of the file size of all historic layers in the layer map. @@ -1760,16 +1752,14 @@ impl Timeline { match self.freeze_and_flush().await { Ok(_) => { // drain the upload queue - if let Some(client) = self.remote_client.as_ref() { - // if we did not wait for completion here, it might be our shutdown process - // didn't wait for remote uploads to complete at all, as new tasks can forever - // be spawned. 
- // - // what is problematic is the shutting down of RemoteTimelineClient, because - // obviously it does not make sense to stop while we wait for it, but what - // about corner cases like s3 suddenly hanging up? - client.shutdown().await; - } + // if we did not wait for completion here, it might be our shutdown process + // didn't wait for remote uploads to complete at all, as new tasks can forever + // be spawned. + // + // what is problematic is the shutting down of RemoteTimelineClient, because + // obviously it does not make sense to stop while we wait for it, but what + // about corner cases like s3 suddenly hanging up? + self.remote_client.shutdown().await; } Err(e) => { // Non-fatal. Shutdown is infallible. Failures to flush just mean that @@ -1785,18 +1775,16 @@ impl Timeline { // Transition the remote_client into a state where it's only useful for timeline deletion. // (The deletion use case is why we can't just hook up remote_client to Self::cancel).) - if let Some(remote_client) = self.remote_client.as_ref() { - remote_client.stop(); - // As documented in remote_client.stop()'s doc comment, it's our responsibility - // to shut down the upload queue tasks. - // TODO: fix that, task management should be encapsulated inside remote_client. - task_mgr::shutdown_tasks( - Some(TaskKind::RemoteUploadTask), - Some(self.tenant_shard_id), - Some(self.timeline_id), - ) - .await; - } + self.remote_client.stop(); + // As documented in remote_client.stop()'s doc comment, it's our responsibility + // to shut down the upload queue tasks. + // TODO: fix that, task management should be encapsulated inside remote_client. + task_mgr::shutdown_tasks( + Some(TaskKind::RemoteUploadTask), + Some(self.tenant_shard_id), + Some(self.timeline_id), + ) + .await; // TODO: work toward making this a no-op. See this funciton's doc comment for more context. tracing::debug!("Waiting for tasks..."); @@ -1922,10 +1910,6 @@ impl Timeline { return Ok(None); }; - if self.remote_client.is_none() { - return Ok(Some(false)); - } - layer.download().await?; Ok(Some(true)) @@ -2190,7 +2174,7 @@ impl Timeline { walredo_mgr, walreceiver: Mutex::new(None), - remote_client: resources.remote_client.map(Arc::new), + remote_client: Arc::new(resources.remote_client), // initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'. last_record_lsn: SeqWait::new(RecordLsn { @@ -2437,10 +2421,6 @@ impl Timeline { discovered_layers.push((layer_file_name, local_path, file_size)); continue; } - Discovered::Metadata => { - warn!("found legacy metadata file, these should have been removed in load_tenant_config"); - continue; - } Discovered::IgnoredBackup => { continue; } @@ -2487,12 +2467,10 @@ impl Timeline { if local.metadata.file_size() == remote.file_size() { // Use the local file, but take the remote metadata so that we pick up // the correct generation. 
- UseLocal( - LocalLayerFileMetadata { - metadata: remote, - local_path: local.local_path - } - ) + UseLocal(LocalLayerFileMetadata { + metadata: remote, + local_path: local.local_path, + }) } else { init::cleanup_local_file_for_remote(&local, &remote)?; UseRemote { local, remote } @@ -2501,7 +2479,11 @@ impl Timeline { Ok(decision) => decision, Err(DismissedLayer::Future { local }) => { if let Some(local) = local { - init::cleanup_future_layer(&local.local_path, &name, disk_consistent_lsn)?; + init::cleanup_future_layer( + &local.local_path, + &name, + disk_consistent_lsn, + )?; } needs_cleanup.push(name); continue; @@ -2523,7 +2505,8 @@ impl Timeline { let layer = match decision { UseLocal(local) => { total_physical_size += local.metadata.file_size(); - Layer::for_resident(conf, &this, local.local_path, name, local.metadata).drop_eviction_guard() + Layer::for_resident(conf, &this, local.local_path, name, local.metadata) + .drop_eviction_guard() } Evicted(remote) | UseRemote { remote, .. } => { Layer::for_evicted(conf, &this, name, remote) @@ -2543,36 +2526,36 @@ impl Timeline { guard.initialize_local_layers(loaded_layers, disk_consistent_lsn + 1); - if let Some(rtc) = self.remote_client.as_ref() { - rtc.schedule_layer_file_deletion(&needs_cleanup)?; - rtc.schedule_index_upload_for_file_changes()?; - // This barrier orders above DELETEs before any later operations. - // This is critical because code executing after the barrier might - // create again objects with the same key that we just scheduled for deletion. - // For example, if we just scheduled deletion of an image layer "from the future", - // later compaction might run again and re-create the same image layer. - // "from the future" here means an image layer whose LSN is > IndexPart::disk_consistent_lsn. - // "same" here means same key range and LSN. - // - // Without a barrier between above DELETEs and the re-creation's PUTs, - // the upload queue may execute the PUT first, then the DELETE. - // In our example, we will end up with an IndexPart referencing a non-existent object. - // - // 1. a future image layer is created and uploaded - // 2. ps restart - // 3. the future layer from (1) is deleted during load layer map - // 4. image layer is re-created and uploaded - // 5. deletion queue would like to delete (1) but actually deletes (4) - // 6. delete by name works as expected, but it now deletes the wrong (later) version - // - // See https://github.com/neondatabase/neon/issues/5878 - // - // NB: generation numbers naturally protect against this because they disambiguate - // (1) and (4) - rtc.schedule_barrier()?; - // Tenant::create_timeline will wait for these uploads to happen before returning, or - // on retry. - } + self.remote_client + .schedule_layer_file_deletion(&needs_cleanup)?; + self.remote_client + .schedule_index_upload_for_file_changes()?; + // This barrier orders above DELETEs before any later operations. + // This is critical because code executing after the barrier might + // create again objects with the same key that we just scheduled for deletion. + // For example, if we just scheduled deletion of an image layer "from the future", + // later compaction might run again and re-create the same image layer. + // "from the future" here means an image layer whose LSN is > IndexPart::disk_consistent_lsn. + // "same" here means same key range and LSN. + // + // Without a barrier between above DELETEs and the re-creation's PUTs, + // the upload queue may execute the PUT first, then the DELETE. 
+ // In our example, we will end up with an IndexPart referencing a non-existent object. + // + // 1. a future image layer is created and uploaded + // 2. ps restart + // 3. the future layer from (1) is deleted during load layer map + // 4. image layer is re-created and uploaded + // 5. deletion queue would like to delete (1) but actually deletes (4) + // 6. delete by name works as expected, but it now deletes the wrong (later) version + // + // See https://github.com/neondatabase/neon/issues/5878 + // + // NB: generation numbers naturally protect against this because they disambiguate + // (1) and (4) + self.remote_client.schedule_barrier()?; + // Tenant::create_timeline will wait for these uploads to happen before returning, or + // on retry. info!( "loaded layer map with {} layers at {}, total physical size: {}", @@ -3025,9 +3008,6 @@ impl Timeline { /// should treat this as a cue to simply skip doing any heatmap uploading /// for this timeline. pub(crate) async fn generate_heatmap(&self) -> Option { - // no point in heatmaps without remote client - let _remote_client = self.remote_client.as_ref()?; - if !self.is_active() { return None; } @@ -3055,10 +3035,7 @@ impl Timeline { // branchpoint in the value in IndexPart::lineage self.ancestor_lsn == lsn || (self.ancestor_lsn == Lsn::INVALID - && self - .remote_client - .as_ref() - .is_some_and(|rtc| rtc.is_previous_ancestor_lsn(lsn))) + && self.remote_client.is_previous_ancestor_lsn(lsn)) } } @@ -3978,29 +3955,23 @@ impl Timeline { x.unwrap() )); - if let Some(remote_client) = &self.remote_client { - for layer in layers_to_upload { - remote_client.schedule_layer_file_upload(layer)?; - } - remote_client.schedule_index_upload_for_metadata_update(&update)?; + for layer in layers_to_upload { + self.remote_client.schedule_layer_file_upload(layer)?; } + self.remote_client + .schedule_index_upload_for_metadata_update(&update)?; Ok(()) } pub(crate) async fn preserve_initdb_archive(&self) -> anyhow::Result<()> { - if let Some(remote_client) = &self.remote_client { - remote_client - .preserve_initdb_archive( - &self.tenant_shard_id.tenant_id, - &self.timeline_id, - &self.cancel, - ) - .await?; - } else { - bail!("No remote storage configured, but was asked to backup the initdb archive for {} / {}", self.tenant_shard_id.tenant_id, self.timeline_id); - } - Ok(()) + self.remote_client + .preserve_initdb_archive( + &self.tenant_shard_id.tenant_id, + &self.timeline_id, + &self.cancel, + ) + .await } // Write out the given frozen in-memory layer as a new L0 delta file. 
This L0 file will not be tracked @@ -4361,12 +4332,7 @@ impl Timeline { return; } - if self - .remote_client - .as_ref() - .map(|c| c.is_deleting()) - .unwrap_or(false) - { + if self.remote_client.is_deleting() { // The timeline was created in a deletion-resume state, we don't expect logical size to be populated return; } @@ -4534,9 +4500,8 @@ impl Timeline { // deletion will happen later, the layer file manager calls garbage_collect_on_drop guard.finish_compact_l0(&remove_layers, &insert_layers, &self.metrics); - if let Some(remote_client) = self.remote_client.as_ref() { - remote_client.schedule_compaction_update(&remove_layers, new_deltas)?; - } + self.remote_client + .schedule_compaction_update(&remove_layers, new_deltas)?; drop_wlock(guard); @@ -4554,9 +4519,8 @@ impl Timeline { let upload_layers: Vec<_> = replace_layers.into_iter().map(|r| r.1).collect(); - if let Some(remote_client) = self.remote_client.as_ref() { - remote_client.schedule_compaction_update(&drop_layers, &upload_layers)?; - } + self.remote_client + .schedule_compaction_update(&drop_layers, &upload_layers)?; Ok(()) } @@ -4566,16 +4530,14 @@ impl Timeline { self: &Arc, new_images: impl IntoIterator, ) -> anyhow::Result<()> { - let Some(remote_client) = &self.remote_client else { - return Ok(()); - }; for layer in new_images { - remote_client.schedule_layer_file_upload(layer)?; + self.remote_client.schedule_layer_file_upload(layer)?; } // should any new image layer been created, not uploading index_part will // result in a mismatch between remote_physical_size and layermap calculated // size, which will fail some tests, but should not be an issue otherwise. - remote_client.schedule_index_upload_for_file_changes()?; + self.remote_client + .schedule_index_upload_for_file_changes()?; Ok(()) } @@ -4861,9 +4823,7 @@ impl Timeline { result.layers_removed = gc_layers.len() as u64; - if let Some(remote_client) = self.remote_client.as_ref() { - remote_client.schedule_gc_update(&gc_layers)?; - } + self.remote_client.schedule_gc_update(&gc_layers)?; guard.finish_gc_timeline(&gc_layers); diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 4226bf431e..ed48b4c9cb 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -295,13 +295,11 @@ impl Timeline { // Update the LayerMap so that readers will use the new layers, and enqueue it for writing to remote storage self.rewrite_layers(replace_layers, drop_layers).await?; - if let Some(remote_client) = self.remote_client.as_ref() { - // We wait for all uploads to complete before finishing this compaction stage. This is not - // necessary for correctness, but it simplifies testing, and avoids proceeding with another - // Timeline's compaction while this timeline's uploads may be generating lots of disk I/O - // load. - remote_client.wait_completion().await?; - } + // We wait for all uploads to complete before finishing this compaction stage. This is not + // necessary for correctness, but it simplifies testing, and avoids proceeding with another + // Timeline's compaction while this timeline's uploads may be generating lots of disk I/O + // load. 
+ self.remote_client.wait_completion().await?; Ok(()) } diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index d8701be170..901f5149b3 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -26,19 +26,21 @@ use super::{Timeline, TimelineResources}; /// during attach or pageserver restart. /// See comment in persist_index_part_with_deleted_flag. async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTimelineError> { - if let Some(remote_client) = timeline.remote_client.as_ref() { - match remote_client.persist_index_part_with_deleted_flag().await { - // If we (now, or already) marked it successfully as deleted, we can proceed - Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (), - // Bail out otherwise - // - // AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents - // two tasks from performing the deletion at the same time. The first task - // that starts deletion should run it to completion. - Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_)) - | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => { - return Err(DeleteTimelineError::Other(anyhow::anyhow!(e))); - } + match timeline + .remote_client + .persist_index_part_with_deleted_flag() + .await + { + // If we (now, or already) marked it successfully as deleted, we can proceed + Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (), + // Bail out otherwise + // + // AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents + // two tasks from performing the deletion at the same time. The first task + // that starts deletion should run it to completion. + Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_)) + | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => { + return Err(DeleteTimelineError::Other(anyhow::anyhow!(e))); } } Ok(()) @@ -117,11 +119,11 @@ pub(super) async fn delete_local_timeline_directory( /// Removes remote layers and an index file after them. async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<()> { - if let Some(remote_client) = &timeline.remote_client { - remote_client.delete_all().await.context("delete_all")? - }; - - Ok(()) + timeline + .remote_client + .delete_all() + .await + .context("delete_all") } // This function removs remaining traces of a timeline on disk. @@ -260,7 +262,7 @@ impl DeleteTimelineFlow { tenant: Arc, timeline_id: TimelineId, local_metadata: &TimelineMetadata, - remote_client: Option, + remote_client: RemoteTimelineClient, deletion_queue_client: DeletionQueueClient, ) -> anyhow::Result<()> { // Note: here we even skip populating layer map. Timeline is essentially uninitialized. 
diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 9471ba860f..7f59758c87 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -70,10 +70,6 @@ pub(super) async fn prepare( ) -> Result<(completion::Completion, PreparedTimelineDetach), Error> { use Error::*; - if detached.remote_client.as_ref().is_none() { - unimplemented!("no new code for running without remote storage"); - } - let Some((ancestor, ancestor_lsn)) = detached .ancestor_timeline .as_ref() @@ -315,8 +311,6 @@ async fn upload_rewritten_layer( // FIXME: better shuttingdown error target .remote_client - .as_ref() - .unwrap() .upload_layer_file(&copied, cancel) .await .map_err(UploadRewritten)?; @@ -406,8 +400,6 @@ async fn remote_copy( // FIXME: better shuttingdown error adoptee .remote_client - .as_ref() - .unwrap() .copy_timeline_layer(adopted, &owned, cancel) .await .map(move |()| owned) @@ -421,11 +413,6 @@ pub(super) async fn complete( prepared: PreparedTimelineDetach, _ctx: &RequestContext, ) -> Result, anyhow::Error> { - let rtc = detached - .remote_client - .as_ref() - .expect("has to have a remote timeline client for timeline ancestor detach"); - let PreparedTimelineDetach { layers } = prepared; let ancestor = detached @@ -442,11 +429,13 @@ pub(super) async fn complete( // // this is not perfect, but it avoids us a retry happening after a compaction or gc on restart // which could give us a completely wrong layer combination. - rtc.schedule_adding_existing_layers_to_index_detach_and_wait( - &layers, - (ancestor.timeline_id, ancestor_lsn), - ) - .await?; + detached + .remote_client + .schedule_adding_existing_layers_to_index_detach_and_wait( + &layers, + (ancestor.timeline_id, ancestor_lsn), + ) + .await?; let mut tasks = tokio::task::JoinSet::new(); @@ -491,8 +480,6 @@ pub(super) async fn complete( async move { let res = timeline .remote_client - .as_ref() - .expect("reparented has to have remote client because detached has one") .schedule_reparenting_and_wait(&new_parent) .await; diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 3567761b9a..8a8c38d0ce 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -23,7 +23,7 @@ use std::{ use pageserver_api::models::{EvictionPolicy, EvictionPolicyLayerAccessThreshold}; use tokio::time::Instant; use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, info_span, instrument, warn, Instrument}; +use tracing::{debug, info, info_span, instrument, warn, Instrument}; use crate::{ context::{DownloadBehavior, RequestContext}, @@ -211,11 +211,6 @@ impl Timeline { // So, we just need to deal with this. 
- if self.remote_client.is_none() { - error!("no remote storage configured, cannot evict layers"); - return ControlFlow::Continue(()); - } - let mut js = tokio::task::JoinSet::new(); { let guard = self.layers.read().await; diff --git a/pageserver/src/tenant/timeline/init.rs b/pageserver/src/tenant/timeline/init.rs index 66aa765015..feadc79e5e 100644 --- a/pageserver/src/tenant/timeline/init.rs +++ b/pageserver/src/tenant/timeline/init.rs @@ -9,7 +9,6 @@ use crate::{ storage_layer::LayerName, Generation, }, - METADATA_FILE_NAME, }; use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; @@ -27,8 +26,6 @@ pub(super) enum Discovered { Temporary(String), /// Temporary on-demand download files, should be removed TemporaryDownload(String), - /// "metadata" file we persist locally and include in `index_part.json` - Metadata, /// Backup file from previously future layers IgnoredBackup, /// Unrecognized, warn about these @@ -49,9 +46,7 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result { - if file_name == METADATA_FILE_NAME { - Discovered::Metadata - } else if file_name.ends_with(".old") { + if file_name.ends_with(".old") { // ignore these Discovered::IgnoredBackup } else if remote_timeline_client::is_temp_download_file(direntry.path()) { diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 7d4e101189..61afd820ca 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -56,14 +56,8 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): (tenant0, timeline0, pg0) = tenant_timelines[0] log.info(f"Timeline {tenant0}/{timeline0} is left intact") - (tenant1, timeline1, pg1) = tenant_timelines[1] - metadata_path = f"{env.pageserver.workdir}/tenants/{tenant1}/timelines/{timeline1}/metadata" - with open(metadata_path, "w") as f: - f.write("overwritten with garbage!") - log.info(f"Timeline {tenant1}/{timeline1} got its metadata spoiled") - - (tenant2, timeline2, pg2) = tenant_timelines[2] - timeline_path = f"{env.pageserver.workdir}/tenants/{tenant2}/timelines/{timeline2}/" + (tenant1, timeline1, pg1) = tenant_timelines[2] + timeline_path = f"{env.pageserver.workdir}/tenants/{tenant1}/timelines/{timeline1}/" for filename in os.listdir(timeline_path): if filename.startswith("00000"): # Looks like a layer file. Corrupt it @@ -72,7 +66,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): with open(p, "wb") as f: f.truncate(0) f.truncate(size) - log.info(f"Timeline {tenant2}/{timeline2} got its local layer files spoiled") + log.info(f"Timeline {tenant1}/{timeline1} got its local layer files spoiled") env.pageserver.start() @@ -80,19 +74,15 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): pg0.start() assert pg0.safe_psql("SELECT COUNT(*) FROM t")[0][0] == 100 - # Tenant with corrupt local metadata works: remote storage is authoritative for metadata - pg1.start() - assert pg1.safe_psql("SELECT COUNT(*) FROM t")[0][0] == 100 - # Second timeline will fail during basebackup, because the local layer file is corrupt. # It will fail when we try to read (and reconstruct) a page from it, ergo the error message. 
# (We don't check layer file contents on startup, when loading the timeline) # # This will change when we implement checksums for layers with pytest.raises(Exception, match=f"{reconstruct_function_name} for layer ") as err: - pg2.start() + pg1.start() log.info( - f"As expected, compute startup failed for timeline {tenant2}/{timeline2} with corrupt layers: {err}" + f"As expected, compute startup failed for timeline {tenant1}/{timeline1} with corrupt layers: {err}" ) From bc78b0e9cc95ea033797b13d5bb36e61d338a070 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 15 May 2024 14:18:02 +0200 Subject: [PATCH 0784/1571] chore(deps): use upstream svg_fmt after they merged our PR (#7764) They have merged our PR https://github.com/nical/rust_debug/pull/4 but they haven't released a new crate version yet. refs https://github.com/neondatabase/neon/issues/7763 --- Cargo.lock | 2 +- Cargo.toml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6ce7180d67..961101b151 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5952,7 +5952,7 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" [[package]] name = "svg_fmt" version = "0.4.2" -source = "git+https://github.com/neondatabase/fork--nical--rust_debug?branch=neon#c1820b28664b5df68de7f043fccf2ed5d67b6ae8" +source = "git+https://github.com/nical/rust_debug?rev=28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4#28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4" [[package]] name = "syn" diff --git a/Cargo.toml b/Cargo.toml index 17f30a1327..3ccdabee18 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -158,8 +158,8 @@ socket2 = "0.5" strum = "0.24" strum_macros = "0.24" "subtle" = "2.5.0" -# https://github.com/nical/rust_debug/pull/4 -svg_fmt = { git = "https://github.com/neondatabase/fork--nical--rust_debug", branch = "neon" } +# Our PR https://github.com/nical/rust_debug/pull/4 has been merged but no new version released yet +svg_fmt = { git = "https://github.com/nical/rust_debug", rev = "28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4" } sync_wrapper = "0.1.2" tar = "0.4" task-local-extensions = "0.1.4" From c3dd646ab3a48e1063c7efc6d080e21fdfb48fa7 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 15 May 2024 15:04:52 +0200 Subject: [PATCH 0785/1571] chore!: always use async walredo, warn if sync is configured (#7754) refs https://github.com/neondatabase/neon/issues/7753 This PR is step (1) of removing sync walredo from Pageserver. Changes: * Remove the sync impl * If sync is configured, warn! and use async instead * Remove the metric that exposes `kind` * Remove the tenant status API that exposes `kind` Future Work ----------- After we've released this change to prod and are sure we won't roll back, we will 1. update the prod Ansible to remove the config flag from the prod pageserver.toml. 2. remove the remaining `kind` code in pageserver These two changes need no release inbetween. See https://github.com/neondatabase/neon/issues/7753 for details. 
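
For illustration, the launch path now boils down to roughly the following
sketch (abridged from the diff below; error handling and the doc comment
omitted):

    // pageserver/src/walredo/process.rs (sketch)
    pub(crate) fn launch(
        conf: &'static PageServerConf,
        tenant_shard_id: TenantShardId,
        pg_version: u32,
    ) -> anyhow::Result<Self> {
        if conf.walredo_process_kind != Kind::Async {
            // The setting is now a no-op: we only warn and fall through to async.
            warn!(
                configured = %conf.walredo_process_kind,
                "the walredo_process_kind setting has been turned into a no-op, using async implementation"
            );
        }
        Ok(Self(process_impl::process_async::WalRedoProcess::launch(
            conf,
            tenant_shard_id,
            pg_version,
        )?))
    }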
--- libs/pageserver_api/src/models.rs | 3 - pageserver/benches/bench_walredo.rs | 139 ++---- pageserver/src/bin/pageserver.rs | 1 - pageserver/src/config.rs | 2 +- pageserver/src/metrics.rs | 23 - pageserver/src/walredo.rs | 5 +- pageserver/src/walredo/process.rs | 57 +-- .../process/process_impl/process_std.rs | 405 ------------------ test_runner/regress/test_pageserver_config.py | 35 -- 9 files changed, 67 insertions(+), 603 deletions(-) delete mode 100644 pageserver/src/walredo/process/process_impl/process_std.rs delete mode 100644 test_runner/regress/test_pageserver_config.py diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 1df5820fb9..d78d2bcbea 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -776,9 +776,6 @@ pub struct TimelineGcRequest { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct WalRedoManagerProcessStatus { pub pid: u32, - /// The strum-generated `into::<&'static str>()` for `pageserver::walredo::ProcessKind`. - /// `ProcessKind` are a transitory thing, so, they have no enum representation in `pageserver_api`. - pub kind: Cow<'static, str>, } #[derive(Debug, Clone, Serialize, Deserialize)] diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index 5b871c5d5e..5aab10e5d9 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -30,47 +30,27 @@ //! 2024-04-15 on i3en.3xlarge //! //! ```text -//! async-short/1 time: [24.584 µs 24.737 µs 24.922 µs] -//! async-short/2 time: [33.479 µs 33.660 µs 33.888 µs] -//! async-short/4 time: [42.713 µs 43.046 µs 43.440 µs] -//! async-short/8 time: [71.814 µs 72.478 µs 73.240 µs] -//! async-short/16 time: [132.73 µs 134.45 µs 136.22 µs] -//! async-short/32 time: [258.31 µs 260.73 µs 263.27 µs] -//! async-short/64 time: [511.61 µs 514.44 µs 517.51 µs] -//! async-short/128 time: [992.64 µs 998.23 µs 1.0042 ms] -//! async-medium/1 time: [110.11 µs 110.50 µs 110.96 µs] -//! async-medium/2 time: [153.06 µs 153.85 µs 154.99 µs] -//! async-medium/4 time: [317.51 µs 319.92 µs 322.85 µs] -//! async-medium/8 time: [638.30 µs 644.68 µs 652.12 µs] -//! async-medium/16 time: [1.2651 ms 1.2773 ms 1.2914 ms] -//! async-medium/32 time: [2.5117 ms 2.5410 ms 2.5720 ms] -//! async-medium/64 time: [4.8088 ms 4.8555 ms 4.9047 ms] -//! async-medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms] -//! sync-short/1 time: [25.503 µs 25.626 µs 25.771 µs] -//! sync-short/2 time: [30.850 µs 31.013 µs 31.208 µs] -//! sync-short/4 time: [45.543 µs 45.856 µs 46.193 µs] -//! sync-short/8 time: [84.114 µs 84.639 µs 85.220 µs] -//! sync-short/16 time: [185.22 µs 186.15 µs 187.13 µs] -//! sync-short/32 time: [377.43 µs 378.87 µs 380.46 µs] -//! sync-short/64 time: [756.49 µs 759.04 µs 761.70 µs] -//! sync-short/128 time: [1.4825 ms 1.4874 ms 1.4923 ms] -//! sync-medium/1 time: [105.66 µs 106.01 µs 106.43 µs] -//! sync-medium/2 time: [153.10 µs 153.84 µs 154.72 µs] -//! sync-medium/4 time: [327.13 µs 329.44 µs 332.27 µs] -//! sync-medium/8 time: [654.26 µs 658.73 µs 663.63 µs] -//! sync-medium/16 time: [1.2682 ms 1.2748 ms 1.2816 ms] -//! sync-medium/32 time: [2.4456 ms 2.4595 ms 2.4731 ms] -//! sync-medium/64 time: [4.6523 ms 4.6890 ms 4.7256 ms] -//! sync-medium/128 time: [8.7215 ms 8.8323 ms 8.9344 ms] +//! short/1 time: [24.584 µs 24.737 µs 24.922 µs] +//! short/2 time: [33.479 µs 33.660 µs 33.888 µs] +//! short/4 time: [42.713 µs 43.046 µs 43.440 µs] +//! short/8 time: [71.814 µs 72.478 µs 73.240 µs] +//! 
short/16 time: [132.73 µs 134.45 µs 136.22 µs] +//! short/32 time: [258.31 µs 260.73 µs 263.27 µs] +//! short/64 time: [511.61 µs 514.44 µs 517.51 µs] +//! short/128 time: [992.64 µs 998.23 µs 1.0042 ms] +//! medium/1 time: [110.11 µs 110.50 µs 110.96 µs] +//! medium/2 time: [153.06 µs 153.85 µs 154.99 µs] +//! medium/4 time: [317.51 µs 319.92 µs 322.85 µs] +//! medium/8 time: [638.30 µs 644.68 µs 652.12 µs] +//! medium/16 time: [1.2651 ms 1.2773 ms 1.2914 ms] +//! medium/32 time: [2.5117 ms 2.5410 ms 2.5720 ms] +//! medium/64 time: [4.8088 ms 4.8555 ms 4.9047 ms] +//! medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms] //! ``` use bytes::{Buf, Bytes}; use criterion::{BenchmarkId, Criterion}; -use pageserver::{ - config::PageServerConf, - walrecord::NeonWalRecord, - walredo::{PostgresRedoManager, ProcessKind}, -}; +use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager}; use pageserver_api::{key::Key, shard::TenantShardId}; use std::{ sync::Arc, @@ -80,39 +60,32 @@ use tokio::{sync::Barrier, task::JoinSet}; use utils::{id::TenantId, lsn::Lsn}; fn bench(c: &mut Criterion) { - for process_kind in &[ProcessKind::Async, ProcessKind::Sync] { - { - let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; - for nclients in nclients { - let mut group = c.benchmark_group(format!("{process_kind}-short")); - group.bench_with_input( - BenchmarkId::from_parameter(nclients), - &nclients, - |b, nclients| { - let redo_work = Arc::new(Request::short_input()); - b.iter_custom(|iters| { - bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients) - }); - }, - ); - } + { + let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; + for nclients in nclients { + let mut group = c.benchmark_group("short"); + group.bench_with_input( + BenchmarkId::from_parameter(nclients), + &nclients, + |b, nclients| { + let redo_work = Arc::new(Request::short_input()); + b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients)); + }, + ); } - - { - let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; - for nclients in nclients { - let mut group = c.benchmark_group(format!("{process_kind}-medium")); - group.bench_with_input( - BenchmarkId::from_parameter(nclients), - &nclients, - |b, nclients| { - let redo_work = Arc::new(Request::medium_input()); - b.iter_custom(|iters| { - bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients) - }); - }, - ); - } + } + { + let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; + for nclients in nclients { + let mut group = c.benchmark_group("medium"); + group.bench_with_input( + BenchmarkId::from_parameter(nclients), + &nclients, + |b, nclients| { + let redo_work = Arc::new(Request::medium_input()); + b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients)); + }, + ); } } } @@ -120,16 +93,10 @@ criterion::criterion_group!(benches, bench); criterion::criterion_main!(benches); // Returns the sum of each client's wall-clock time spent executing their share of the n_redos. 
-fn bench_impl( - process_kind: ProcessKind, - redo_work: Arc, - n_redos: u64, - nclients: u64, -) -> Duration { +fn bench_impl(redo_work: Arc, n_redos: u64, nclients: u64) -> Duration { let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap(); - let mut conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf()); - conf.walredo_process_kind = process_kind; + let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf()); let conf = Box::leak(Box::new(conf)); let tenant_shard_id = TenantShardId::unsharded(TenantId::generate()); @@ -158,27 +125,13 @@ fn bench_impl( }); } - let elapsed = rt.block_on(async move { + rt.block_on(async move { let mut total_wallclock_time = Duration::ZERO; while let Some(res) = tasks.join_next().await { total_wallclock_time += res.unwrap(); } total_wallclock_time - }); - - // consistency check to ensure process kind setting worked - if nredos_per_client > 0 { - assert_eq!( - manager - .status() - .process - .map(|p| p.kind) - .expect("the benchmark work causes a walredo process to be spawned"), - std::borrow::Cow::Borrowed(process_kind.into()) - ); - } - - elapsed + }) } async fn client( diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index c0099aa704..a04195e12b 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -284,7 +284,6 @@ fn start_pageserver( )) .unwrap(); pageserver::preinitialize_metrics(); - pageserver::metrics::wal_redo::set_process_kind_metric(conf.walredo_process_kind); // If any failpoints were set from FAILPOINTS environment variable, // print them to the log for debugging purposes diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 258eed0b12..b0afb6414b 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -99,7 +99,7 @@ pub mod defaults { pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0; - pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "sync"; + pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "async"; /// /// Default built-in configuration file. diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index b27bfb43b0..ffcd08b4b3 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1999,29 +1999,6 @@ impl Default for WalRedoProcessCounters { pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy = Lazy::new(WalRedoProcessCounters::default); -#[cfg(not(test))] -pub mod wal_redo { - use super::*; - - static PROCESS_KIND: Lazy> = Lazy::new(|| { - std::sync::Mutex::new( - register_uint_gauge_vec!( - "pageserver_wal_redo_process_kind", - "The configured process kind for walredo", - &["kind"], - ) - .unwrap(), - ) - }); - - pub fn set_process_kind_metric(kind: crate::walredo::ProcessKind) { - // use guard to avoid races around the next two steps - let guard = PROCESS_KIND.lock().unwrap(); - guard.reset(); - guard.with_label_values(&[&format!("{kind}")]).set(1); - } -} - /// Similar to `prometheus::HistogramTimer` but does not record on drop. 
pub(crate) struct StorageTimeMetricsTimer { metrics: StorageTimeMetrics, diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 9776d4ce88..3decea0c6d 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -153,10 +153,7 @@ impl PostgresRedoManager { process: self .redo_process .get() - .map(|p| WalRedoManagerProcessStatus { - pid: p.id(), - kind: std::borrow::Cow::Borrowed(p.kind().into()), - }), + .map(|p| WalRedoManagerProcessStatus { pid: p.id() }), } } } diff --git a/pageserver/src/walredo/process.rs b/pageserver/src/walredo/process.rs index ad6b4e5fe9..02c9c04bf1 100644 --- a/pageserver/src/walredo/process.rs +++ b/pageserver/src/walredo/process.rs @@ -1,7 +1,10 @@ +/// Layer of indirection previously used to support multiple implementations. +/// Subject to removal: use std::time::Duration; use bytes::Bytes; use pageserver_api::{reltag::RelTag, shard::TenantShardId}; +use tracing::warn; use utils::lsn::Lsn; use crate::{config::PageServerConf, walrecord::NeonWalRecord}; @@ -12,7 +15,6 @@ mod protocol; mod process_impl { pub(super) mod process_async; - pub(super) mod process_std; } #[derive( @@ -34,10 +36,7 @@ pub enum Kind { Async, } -pub(crate) enum Process { - Sync(process_impl::process_std::WalRedoProcess), - Async(process_impl::process_async::WalRedoProcess), -} +pub(crate) struct Process(process_impl::process_async::WalRedoProcess); impl Process { #[inline(always)] @@ -46,18 +45,17 @@ impl Process { tenant_shard_id: TenantShardId, pg_version: u32, ) -> anyhow::Result { - Ok(match conf.walredo_process_kind { - Kind::Sync => Self::Sync(process_impl::process_std::WalRedoProcess::launch( - conf, - tenant_shard_id, - pg_version, - )?), - Kind::Async => Self::Async(process_impl::process_async::WalRedoProcess::launch( - conf, - tenant_shard_id, - pg_version, - )?), - }) + if conf.walredo_process_kind != Kind::Async { + warn!( + configured = %conf.walredo_process_kind, + "the walredo_process_kind setting has been turned into a no-op, using async implementation" + ); + } + Ok(Self(process_impl::process_async::WalRedoProcess::launch( + conf, + tenant_shard_id, + pg_version, + )?)) } #[inline(always)] @@ -69,29 +67,12 @@ impl Process { records: &[(Lsn, NeonWalRecord)], wal_redo_timeout: Duration, ) -> anyhow::Result { - match self { - Process::Sync(p) => { - p.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout) - .await - } - Process::Async(p) => { - p.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout) - .await - } - } + self.0 + .apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout) + .await } pub(crate) fn id(&self) -> u32 { - match self { - Process::Sync(p) => p.id(), - Process::Async(p) => p.id(), - } - } - - pub(crate) fn kind(&self) -> Kind { - match self { - Process::Sync(_) => Kind::Sync, - Process::Async(_) => Kind::Async, - } + self.0.id() } } diff --git a/pageserver/src/walredo/process/process_impl/process_std.rs b/pageserver/src/walredo/process/process_impl/process_std.rs deleted file mode 100644 index e7a6c263c9..0000000000 --- a/pageserver/src/walredo/process/process_impl/process_std.rs +++ /dev/null @@ -1,405 +0,0 @@ -use self::no_leak_child::NoLeakChild; -use crate::{ - config::PageServerConf, - metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER}, - walrecord::NeonWalRecord, - walredo::process::{no_leak_child, protocol}, -}; -use anyhow::Context; -use bytes::Bytes; -use nix::poll::{PollFd, PollFlags}; -use pageserver_api::{reltag::RelTag, shard::TenantShardId}; 
-use postgres_ffi::BLCKSZ; -use std::os::fd::AsRawFd; -#[cfg(feature = "testing")] -use std::sync::atomic::AtomicUsize; -use std::{ - collections::VecDeque, - io::{Read, Write}, - process::{ChildStdin, ChildStdout, Command, Stdio}, - sync::{Mutex, MutexGuard}, - time::Duration, -}; -use tracing::{debug, error, instrument, Instrument}; -use utils::{lsn::Lsn, nonblock::set_nonblock}; - -pub struct WalRedoProcess { - #[allow(dead_code)] - conf: &'static PageServerConf, - tenant_shard_id: TenantShardId, - // Some() on construction, only becomes None on Drop. - child: Option, - stdout: Mutex, - stdin: Mutex, - /// Counter to separate same sized walredo inputs failing at the same millisecond. - #[cfg(feature = "testing")] - dump_sequence: AtomicUsize, -} - -struct ProcessInput { - stdin: ChildStdin, - n_requests: usize, -} - -struct ProcessOutput { - stdout: ChildStdout, - pending_responses: VecDeque>, - n_processed_responses: usize, -} - -impl WalRedoProcess { - // - // Start postgres binary in special WAL redo mode. - // - #[instrument(skip_all,fields(pg_version=pg_version))] - pub(crate) fn launch( - conf: &'static PageServerConf, - tenant_shard_id: TenantShardId, - pg_version: u32, - ) -> anyhow::Result { - crate::span::debug_assert_current_span_has_tenant_id(); - - let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible. - let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?; - - use no_leak_child::NoLeakChildCommandExt; - // Start postgres itself - let child = Command::new(pg_bin_dir_path.join("postgres")) - // the first arg must be --wal-redo so the child process enters into walredo mode - .arg("--wal-redo") - // the child doesn't process this arg, but, having it in the argv helps indentify the - // walredo process for a particular tenant when debugging a pagserver - .args(["--tenant-shard-id", &format!("{tenant_shard_id}")]) - .stdin(Stdio::piped()) - .stderr(Stdio::piped()) - .stdout(Stdio::piped()) - .env_clear() - .env("LD_LIBRARY_PATH", &pg_lib_dir_path) - .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) - // NB: The redo process is not trusted after we sent it the first - // walredo work. Before that, it is trusted. Specifically, we trust - // it to - // 1. close all file descriptors except stdin, stdout, stderr because - // pageserver might not be 100% diligent in setting FD_CLOEXEC on all - // the files it opens, and - // 2. to use seccomp to sandbox itself before processing the first - // walredo request. - .spawn_no_leak_child(tenant_shard_id) - .context("spawn process")?; - WAL_REDO_PROCESS_COUNTERS.started.inc(); - let mut child = scopeguard::guard(child, |child| { - error!("killing wal-redo-postgres process due to a problem during launch"); - child.kill_and_wait(WalRedoKillCause::Startup); - }); - - let stdin = child.stdin.take().unwrap(); - let stdout = child.stdout.take().unwrap(); - let stderr = child.stderr.take().unwrap(); - let stderr = tokio::process::ChildStderr::from_std(stderr) - .context("convert to tokio::ChildStderr")?; - macro_rules! 
set_nonblock_or_log_err { - ($file:ident) => {{ - let res = set_nonblock($file.as_raw_fd()); - if let Err(e) = &res { - error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed"); - } - res - }}; - } - set_nonblock_or_log_err!(stdin)?; - set_nonblock_or_log_err!(stdout)?; - - // all fallible operations post-spawn are complete, so get rid of the guard - let child = scopeguard::ScopeGuard::into_inner(child); - - tokio::spawn( - async move { - scopeguard::defer! { - debug!("wal-redo-postgres stderr_logger_task finished"); - crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc(); - } - debug!("wal-redo-postgres stderr_logger_task started"); - crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc(); - - use tokio::io::AsyncBufReadExt; - let mut stderr_lines = tokio::io::BufReader::new(stderr); - let mut buf = Vec::new(); - let res = loop { - buf.clear(); - // TODO we don't trust the process to cap its stderr length. - // Currently it can do unbounded Vec allocation. - match stderr_lines.read_until(b'\n', &mut buf).await { - Ok(0) => break Ok(()), // eof - Ok(num_bytes) => { - let output = String::from_utf8_lossy(&buf[..num_bytes]); - error!(%output, "received output"); - } - Err(e) => { - break Err(e); - } - } - }; - match res { - Ok(()) => (), - Err(e) => { - error!(error=?e, "failed to read from walredo stderr"); - } - } - }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version)) - ); - - Ok(Self { - conf, - tenant_shard_id, - child: Some(child), - stdin: Mutex::new(ProcessInput { - stdin, - n_requests: 0, - }), - stdout: Mutex::new(ProcessOutput { - stdout, - pending_responses: VecDeque::new(), - n_processed_responses: 0, - }), - #[cfg(feature = "testing")] - dump_sequence: AtomicUsize::default(), - }) - } - - pub(crate) fn id(&self) -> u32 { - self.child - .as_ref() - .expect("must not call this during Drop") - .id() - } - - // Apply given WAL records ('records') over an old page image. Returns - // new page image. - // - #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))] - pub(crate) async fn apply_wal_records( - &self, - rel: RelTag, - blknum: u32, - base_img: &Option, - records: &[(Lsn, NeonWalRecord)], - wal_redo_timeout: Duration, - ) -> anyhow::Result { - let tag = protocol::BufferTag { rel, blknum }; - let input = self.stdin.lock().unwrap(); - - // Serialize all the messages to send the WAL redo process first. - // - // This could be problematic if there are millions of records to replay, - // but in practice the number of records is usually so small that it doesn't - // matter, and it's better to keep this code simple. - // - // Most requests start with a before-image with BLCKSZ bytes, followed by - // by some other WAL records. Start with a buffer that can hold that - // comfortably. 
- let mut writebuf: Vec = Vec::with_capacity((BLCKSZ as usize) * 3); - protocol::build_begin_redo_for_block_msg(tag, &mut writebuf); - if let Some(img) = base_img { - protocol::build_push_page_msg(tag, img, &mut writebuf); - } - for (lsn, rec) in records.iter() { - if let NeonWalRecord::Postgres { - will_init: _, - rec: postgres_rec, - } = rec - { - protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf); - } else { - anyhow::bail!("tried to pass neon wal record to postgres WAL redo"); - } - } - protocol::build_get_page_msg(tag, &mut writebuf); - WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64); - - let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout); - - if res.is_err() { - // not all of these can be caused by this particular input, however these are so rare - // in tests so capture all. - self.record_and_log(&writebuf); - } - - res - } - - fn apply_wal_records0( - &self, - writebuf: &[u8], - input: MutexGuard, - wal_redo_timeout: Duration, - ) -> anyhow::Result { - let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small. - let mut nwrite = 0usize; - - while nwrite < writebuf.len() { - let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)]; - let n = loop { - match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) { - Err(nix::errno::Errno::EINTR) => continue, - res => break res, - } - }?; - - if n == 0 { - anyhow::bail!("WAL redo timed out"); - } - - // If 'stdin' is writeable, do write. - let in_revents = stdin_pollfds[0].revents().unwrap(); - if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() { - nwrite += proc.stdin.write(&writebuf[nwrite..])?; - } - if in_revents.contains(PollFlags::POLLHUP) { - // We still have more data to write, but the process closed the pipe. - anyhow::bail!("WAL redo process closed its stdin unexpectedly"); - } - } - let request_no = proc.n_requests; - proc.n_requests += 1; - drop(proc); - - // To improve walredo performance we separate sending requests and receiving - // responses. Them are protected by different mutexes (output and input). - // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process - // then there is not warranty that T1 will first granted output mutex lock. - // To address this issue we maintain number of sent requests, number of processed - // responses and ring buffer with pending responses. After sending response - // (under input mutex), threads remembers request number. Then it releases - // input mutex, locks output mutex and fetch in ring buffer all responses until - // its stored request number. The it takes correspondent element from - // pending responses ring buffer and truncate all empty elements from the front, - // advancing processed responses number. - - let mut output = self.stdout.lock().unwrap(); - let n_processed_responses = output.n_processed_responses; - while n_processed_responses + output.pending_responses.len() <= request_no { - // We expect the WAL redo process to respond with an 8k page image. We read it - // into this buffer. - let mut resultbuf = vec![0; BLCKSZ.into()]; - let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far - while nresult < BLCKSZ.into() { - let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)]; - // We do two things simultaneously: reading response from stdout - // and forward any logging information that the child writes to its stderr to the page server's log. 
- let n = loop { - match nix::poll::poll( - &mut stdout_pollfds[..], - wal_redo_timeout.as_millis() as i32, - ) { - Err(nix::errno::Errno::EINTR) => continue, - res => break res, - } - }?; - - if n == 0 { - anyhow::bail!("WAL redo timed out"); - } - - // If we have some data in stdout, read it to the result buffer. - let out_revents = stdout_pollfds[0].revents().unwrap(); - if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() { - nresult += output.stdout.read(&mut resultbuf[nresult..])?; - } - if out_revents.contains(PollFlags::POLLHUP) { - anyhow::bail!("WAL redo process closed its stdout unexpectedly"); - } - } - output - .pending_responses - .push_back(Some(Bytes::from(resultbuf))); - } - // Replace our request's response with None in `pending_responses`. - // Then make space in the ring buffer by clearing out any seqence of contiguous - // `None`'s from the front of `pending_responses`. - // NB: We can't pop_front() because other requests' responses because another - // requester might have grabbed the output mutex before us: - // T1: grab input mutex - // T1: send request_no 23 - // T1: release input mutex - // T2: grab input mutex - // T2: send request_no 24 - // T2: release input mutex - // T2: grab output mutex - // T2: n_processed_responses + output.pending_responses.len() <= request_no - // 23 0 24 - // T2: enters poll loop that reads stdout - // T2: put response for 23 into pending_responses - // T2: put response for 24 into pending_resposnes - // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back - // T2: takes its response_24 - // pending_responses now looks like this: Front Some(response_23) None Back - // T2: does the while loop below - // pending_responses now looks like this: Front Some(response_23) None Back - // T2: releases output mutex - // T1: grabs output mutex - // T1: n_processed_responses + output.pending_responses.len() > request_no - // 23 2 23 - // T1: skips poll loop that reads stdout - // T1: takes its response_23 - // pending_responses now looks like this: Front None None Back - // T2: does the while loop below - // pending_responses now looks like this: Front Back - // n_processed_responses now has value 25 - let res = output.pending_responses[request_no - n_processed_responses] - .take() - .expect("we own this request_no, nobody else is supposed to take it"); - while let Some(front) = output.pending_responses.front() { - if front.is_none() { - output.pending_responses.pop_front(); - output.n_processed_responses += 1; - } else { - break; - } - } - Ok(res) - } - - #[cfg(feature = "testing")] - fn record_and_log(&self, writebuf: &[u8]) { - use std::sync::atomic::Ordering; - - let millis = std::time::SystemTime::now() - .duration_since(std::time::SystemTime::UNIX_EPOCH) - .unwrap() - .as_millis(); - - let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed); - - // these files will be collected to an allure report - let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len()); - - let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename); - - let res = std::fs::OpenOptions::new() - .write(true) - .create_new(true) - .read(true) - .open(path) - .and_then(|mut f| f.write_all(writebuf)); - - // trip up allowed_errors - if let Err(e) = res { - tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}"); - } else { - tracing::error!(filename, "erroring walredo input saved"); - } - } - - #[cfg(not(feature = "testing"))] - 
fn record_and_log(&self, _: &[u8]) {} -} - -impl Drop for WalRedoProcess { - fn drop(&mut self) { - self.child - .take() - .expect("we only do this once") - .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop); - // no way to wait for stderr_logger_task from Drop because that is async only - } -} diff --git a/test_runner/regress/test_pageserver_config.py b/test_runner/regress/test_pageserver_config.py deleted file mode 100644 index c04348b488..0000000000 --- a/test_runner/regress/test_pageserver_config.py +++ /dev/null @@ -1,35 +0,0 @@ -import pytest -from fixtures.neon_fixtures import ( - NeonEnvBuilder, - last_flush_lsn_upload, -) - - -@pytest.mark.parametrize("kind", ["sync", "async"]) -def test_walredo_process_kind_config(neon_env_builder: NeonEnvBuilder, kind: str): - neon_env_builder.pageserver_config_override = f"walredo_process_kind = '{kind}'" - # ensure it starts - env = neon_env_builder.init_start() - # ensure the metric is set - ps_http = env.pageserver.http_client() - metrics = ps_http.get_metrics() - samples = metrics.query_all("pageserver_wal_redo_process_kind") - assert [(s.labels, s.value) for s in samples] == [({"kind": kind}, 1)] - # ensure default tenant's config kind matches - # => write some data to force-spawn walredo - ep = env.endpoints.create_start("main") - with ep.connect() as conn: - with conn.cursor() as cur: - cur.execute("create table foo(bar text)") - cur.execute("insert into foo select from generate_series(1, 100)") - last_flush_lsn_upload(env, ep, env.initial_tenant, env.initial_timeline) - ep.stop() - ep.start() - with ep.connect() as conn: - with conn.cursor() as cur: - cur.execute("select count(*) from foo") - [(count,)] = cur.fetchall() - assert count == 100 - - status = ps_http.tenant_status(env.initial_tenant) - assert status["walredo"]["process"]["kind"] == kind From 1075386d778f67ad32200c3e7d7279479a7eb84f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 15 May 2024 15:32:47 +0200 Subject: [PATCH 0786/1571] Add test_uploads_and_deletions test (#7758) Adds a test that is a reproducer for many tiered compaction bugs, both ones that have since been fixed as well as still unfxied ones: * (now fixed) #7296 * #7707 * #7759 * Likely also #7244 but I haven't tried that. The key ordering bug can be reproduced by switching to `merge_delta_keys` instead of `merge_delta_keys_buffered`, so reverting a big part of #7661, although it only sometimes reproduces (30-50% of cases). 
part of https://github.com/neondatabase/neon/issues/7554 --- test_runner/fixtures/neon_fixtures.py | 78 +++++++++++++++++ test_runner/regress/test_compaction.py | 62 +++++++++++++- .../regress/test_pageserver_generations.py | 83 +------------------ 3 files changed, 141 insertions(+), 82 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 62a4b974a3..405ef19bfc 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -59,6 +59,7 @@ from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import ( wait_for_last_record_lsn, wait_for_upload, + wait_for_upload_queue_empty, ) from fixtures.pg_version import PgVersion from fixtures.port_distributor import PortDistributor @@ -79,6 +80,7 @@ from fixtures.utils import ( allure_attach_from_dir, assert_no_errors, get_self_dir, + print_gc_result, subprocess_capture, wait_until, ) @@ -4419,3 +4421,79 @@ def parse_project_git_version_output(s: str) -> str: return commit raise ValueError(f"unable to parse --version output: '{s}'") + + +def generate_uploads_and_deletions( + env: NeonEnv, + *, + init: bool = True, + tenant_id: Optional[TenantId] = None, + timeline_id: Optional[TimelineId] = None, + data: Optional[str] = None, + pageserver: NeonPageserver, +): + """ + Using the environment's default tenant + timeline, generate a load pattern + that results in some uploads and some deletions to remote storage. + """ + + if tenant_id is None: + tenant_id = env.initial_tenant + assert tenant_id is not None + + if timeline_id is None: + timeline_id = env.initial_timeline + assert timeline_id is not None + + ps_http = pageserver.http_client() + + with env.endpoints.create_start( + "main", tenant_id=tenant_id, pageserver_id=pageserver.id + ) as endpoint: + if init: + endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)") + last_flush_lsn_upload( + env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id + ) + + def churn(data): + endpoint.safe_psql_many( + [ + f""" + INSERT INTO foo (id, val) + SELECT g, '{data}' + FROM generate_series(1, 200) g + ON CONFLICT (id) DO UPDATE + SET val = EXCLUDED.val + """, + # to ensure that GC can actually remove some layers + "VACUUM foo", + ] + ) + assert tenant_id is not None + assert timeline_id is not None + # We are waiting for uploads as well as local flush, in order to avoid leaving the system + # in a state where there are "future layers" in remote storage that will generate deletions + # after a restart. + last_flush_lsn_upload( + env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id + ) + + # Compaction should generate some GC-elegible layers + for i in range(0, 2): + churn(f"{i if data is None else data}") + + gc_result = ps_http.timeline_gc(tenant_id, timeline_id, 0) + print_gc_result(gc_result) + assert gc_result["layers_removed"] > 0 + + # Stop endpoint and flush all data to pageserver, then checkpoint it: this + # ensures that the pageserver is in a fully idle state: there will be no more + # background ingest, no more uploads pending, and therefore no non-determinism + # in subsequent actions like pageserver restarts. 
+ final_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id, pageserver.id) + ps_http.timeline_checkpoint(tenant_id, timeline_id) + # Finish uploads + wait_for_upload(ps_http, tenant_id, timeline_id, final_lsn) + # Finish all remote writes (including deletions) + wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id) diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 43a3323462..93a16620a3 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -1,10 +1,12 @@ +import enum import json import os from typing import Optional import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder, generate_uploads_and_deletions +from fixtures.pageserver.http import PageserverApiException from fixtures.workload import Workload AGGRESIVE_COMPACTION_TENANT_CONF = { @@ -190,3 +192,61 @@ def test_sharding_compaction( # Assert that everything is still readable workload.validate() + + +class CompactionAlgorithm(str, enum.Enum): + LEGACY = "Legacy" + TIERED = "Tiered" + + +@pytest.mark.parametrize( + "compaction_algorithm", [CompactionAlgorithm.LEGACY, CompactionAlgorithm.TIERED] +) +def test_uploads_and_deletions( + neon_env_builder: NeonEnvBuilder, + compaction_algorithm: CompactionAlgorithm, +): + """ + :param compaction_algorithm: the compaction algorithm to use. + """ + + tenant_conf = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": f"{128 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{128 * 1024}", + # no PITR horizon, we specify the horizon when we request on-demand GC + "pitr_interval": "0s", + # disable background compaction and GC. We invoke it manually when we want it to happen. + "gc_period": "0s", + "compaction_period": "0s", + # create image layers eagerly, so that GC can remove some layers + "image_creation_threshold": "1", + "image_layer_creation_check_threshold": "0", + "compaction_algorithm": json.dumps({"kind": compaction_algorithm.value}), + } + env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf) + + # TODO remove these allowed errors + # https://github.com/neondatabase/neon/issues/7707 + # https://github.com/neondatabase/neon/issues/7759 + allowed_errors = [ + ".*duplicated L1 layer.*", + ".*delta layer created with.*duplicate values.*", + ".*assertion failed: self.lsn_range.start <= lsn.*", + ".*HTTP request handler task panicked: task.*panicked.*", + ] + if compaction_algorithm == CompactionAlgorithm.TIERED: + env.pageserver.allowed_errors.extend(allowed_errors) + + try: + generate_uploads_and_deletions(env, pageserver=env.pageserver) + except PageserverApiException as e: + log.info(f"Obtained PageserverApiException: {e}") + + # The errors occur flakily and no error is ensured to occur, + # however at least one of them occurs. 
+ if compaction_algorithm == CompactionAlgorithm.TIERED: + found_allowed_error = any(env.pageserver.log_contains(e) for e in allowed_errors) + if not found_allowed_error: + raise Exception("None of the allowed_errors occured in the log") diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 4fdc5852f5..9b97254410 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -21,11 +21,9 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, - NeonPageserver, PgBin, S3Scrubber, - flush_ep_to_pageserver, - last_flush_lsn_upload, + generate_uploads_and_deletions, ) from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import ( @@ -33,12 +31,11 @@ from fixtures.pageserver.utils import ( list_prefix, wait_for_last_record_lsn, wait_for_upload, - wait_for_upload_queue_empty, ) from fixtures.remote_storage import ( RemoteStorageKind, ) -from fixtures.utils import print_gc_result, wait_until +from fixtures.utils import wait_until from fixtures.workload import Workload # A tenant configuration that is convenient for generating uploads and deletions @@ -59,82 +56,6 @@ TENANT_CONF = { } -def generate_uploads_and_deletions( - env: NeonEnv, - *, - init: bool = True, - tenant_id: Optional[TenantId] = None, - timeline_id: Optional[TimelineId] = None, - data: Optional[str] = None, - pageserver: NeonPageserver, -): - """ - Using the environment's default tenant + timeline, generate a load pattern - that results in some uploads and some deletions to remote storage. - """ - - if tenant_id is None: - tenant_id = env.initial_tenant - assert tenant_id is not None - - if timeline_id is None: - timeline_id = env.initial_timeline - assert timeline_id is not None - - ps_http = pageserver.http_client() - - with env.endpoints.create_start( - "main", tenant_id=tenant_id, pageserver_id=pageserver.id - ) as endpoint: - if init: - endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)") - last_flush_lsn_upload( - env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id - ) - - def churn(data): - endpoint.safe_psql_many( - [ - f""" - INSERT INTO foo (id, val) - SELECT g, '{data}' - FROM generate_series(1, 200) g - ON CONFLICT (id) DO UPDATE - SET val = EXCLUDED.val - """, - # to ensure that GC can actually remove some layers - "VACUUM foo", - ] - ) - assert tenant_id is not None - assert timeline_id is not None - # We are waiting for uploads as well as local flush, in order to avoid leaving the system - # in a state where there are "future layers" in remote storage that will generate deletions - # after a restart. - last_flush_lsn_upload( - env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id - ) - - # Compaction should generate some GC-elegible layers - for i in range(0, 2): - churn(f"{i if data is None else data}") - - gc_result = ps_http.timeline_gc(tenant_id, timeline_id, 0) - print_gc_result(gc_result) - assert gc_result["layers_removed"] > 0 - - # Stop endpoint and flush all data to pageserver, then checkpoint it: this - # ensures that the pageserver is in a fully idle state: there will be no more - # background ingest, no more uploads pending, and therefore no non-determinism - # in subsequent actions like pageserver restarts. 
- final_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id, pageserver.id) - ps_http.timeline_checkpoint(tenant_id, timeline_id) - # Finish uploads - wait_for_upload(ps_http, tenant_id, timeline_id, final_lsn) - # Finish all remote writes (including deletions) - wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id) - - def read_all( env: NeonEnv, tenant_id: Optional[TenantId] = None, timeline_id: Optional[TimelineId] = None ): From 3ef6e2121178a8c4e7f498aff9a9e7cb9376fd1c Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 15 May 2024 18:17:55 +0200 Subject: [PATCH 0787/1571] fixup #7747: actually use the fixture for neon_env_builder (#7767) The `= None` makes it not use the fixture. This slipped due to last-minute changes. --- test_runner/fixtures/neon_fixtures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 405ef19bfc..d3aadbe612 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1392,8 +1392,8 @@ def neon_env_builder( test_overlay_dir: Path, top_output_dir: Path, pageserver_virtual_file_io_engine: str, + pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]], pageserver_aux_file_policy: Optional[AuxFileStore] = None, - pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]] = None, ) -> Iterator[NeonEnvBuilder]: """ Fixture to create a Neon environment for test. From affc18f912b67a31b05a05a05a9cff468a74d75f Mon Sep 17 00:00:00 2001 From: Jure Bajic Date: Wed, 15 May 2024 18:41:12 +0200 Subject: [PATCH 0788/1571] Add performance regress `test_ondemand_download_churn.py` (#7242) Add performance regress test for on-demand download throughput. Closes https://github.com/neondatabase/neon/issues/7146 Co-authored-by: Christian Schwarz Co-authored-by: Alexander Bayandin --- libs/pageserver_api/src/models.rs | 10 + .../src/cmd/ondemand_download_churn.rs | 103 ++++++++--- .../pagebench/test_ondemand_download_churn.py | 175 ++++++++++++++++++ 3 files changed, 267 insertions(+), 21 deletions(-) create mode 100644 test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index d78d2bcbea..7cf54bf32a 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -745,6 +745,16 @@ impl HistoricLayerInfo { }; *field = value; } + pub fn layer_file_size(&self) -> u64 { + match self { + HistoricLayerInfo::Delta { + layer_file_size, .. + } => *layer_file_size, + HistoricLayerInfo::Image { + layer_file_size, .. 
+ } => *layer_file_size, + } + } } #[derive(Debug, Serialize, Deserialize)] diff --git a/pageserver/pagebench/src/cmd/ondemand_download_churn.rs b/pageserver/pagebench/src/cmd/ondemand_download_churn.rs index 197e782dca..1bb71b9353 100644 --- a/pageserver/pagebench/src/cmd/ondemand_download_churn.rs +++ b/pageserver/pagebench/src/cmd/ondemand_download_churn.rs @@ -2,9 +2,11 @@ use pageserver_api::{models::HistoricLayerInfo, shard::TenantShardId}; use pageserver_client::mgmt_api; use rand::seq::SliceRandom; +use tokio_util::sync::CancellationToken; use tracing::{debug, info}; use utils::id::{TenantTimelineId, TimelineId}; +use std::{f64, sync::Arc}; use tokio::{ sync::{mpsc, OwnedSemaphorePermit}, task::JoinSet, @@ -12,10 +14,7 @@ use tokio::{ use std::{ num::NonZeroUsize, - sync::{ - atomic::{AtomicU64, Ordering}, - Arc, - }, + sync::atomic::{AtomicU64, Ordering}, time::{Duration, Instant}, }; @@ -51,19 +50,31 @@ pub(crate) fn main(args: Args) -> anyhow::Result<()> { Ok(()) } +#[derive(serde::Serialize)] +struct Output { + downloads_count: u64, + downloads_bytes: u64, + evictions_count: u64, + timeline_restarts: u64, + #[serde(with = "humantime_serde")] + runtime: Duration, +} + #[derive(Debug, Default)] struct LiveStats { - evictions: AtomicU64, - downloads: AtomicU64, + evictions_count: AtomicU64, + downloads_count: AtomicU64, + downloads_bytes: AtomicU64, timeline_restarts: AtomicU64, } impl LiveStats { fn eviction_done(&self) { - self.evictions.fetch_add(1, Ordering::Relaxed); + self.evictions_count.fetch_add(1, Ordering::Relaxed); } - fn download_done(&self) { - self.downloads.fetch_add(1, Ordering::Relaxed); + fn download_done(&self, size: u64) { + self.downloads_count.fetch_add(1, Ordering::Relaxed); + self.downloads_bytes.fetch_add(size, Ordering::Relaxed); } fn timeline_restart_done(&self) { self.timeline_restarts.fetch_add(1, Ordering::Relaxed); @@ -92,28 +103,49 @@ async fn main_impl(args: Args) -> anyhow::Result<()> { ) .await?; + let token = CancellationToken::new(); let mut tasks = JoinSet::new(); - let live_stats = Arc::new(LiveStats::default()); + let periodic_stats = Arc::new(LiveStats::default()); + let total_stats = Arc::new(LiveStats::default()); + + let start = Instant::now(); tasks.spawn({ - let live_stats = Arc::clone(&live_stats); + let periodic_stats = Arc::clone(&periodic_stats); + let total_stats = Arc::clone(&total_stats); + let cloned_token = token.clone(); async move { let mut last_at = Instant::now(); loop { + if cloned_token.is_cancelled() { + return; + } tokio::time::sleep_until((last_at + Duration::from_secs(1)).into()).await; let now = Instant::now(); let delta: Duration = now - last_at; last_at = now; let LiveStats { - evictions, - downloads, + evictions_count, + downloads_count, + downloads_bytes, timeline_restarts, - } = &*live_stats; - let evictions = evictions.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64(); - let downloads = downloads.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64(); + } = &*periodic_stats; + let evictions_count = evictions_count.swap(0, Ordering::Relaxed); + let downloads_count = downloads_count.swap(0, Ordering::Relaxed); + let downloads_bytes = downloads_bytes.swap(0, Ordering::Relaxed); let timeline_restarts = timeline_restarts.swap(0, Ordering::Relaxed); - info!("evictions={evictions:.2}/s downloads={downloads:.2}/s timeline_restarts={timeline_restarts}"); + + total_stats.evictions_count.fetch_add(evictions_count, Ordering::Relaxed); + total_stats.downloads_count.fetch_add(downloads_count, Ordering::Relaxed); + 
total_stats.downloads_bytes.fetch_add(downloads_bytes, Ordering::Relaxed); + total_stats.timeline_restarts.fetch_add(timeline_restarts, Ordering::Relaxed); + + let evictions_per_s = evictions_count as f64 / delta.as_secs_f64(); + let downloads_per_s = downloads_count as f64 / delta.as_secs_f64(); + let downloads_mibs_per_s = downloads_bytes as f64 / delta.as_secs_f64() / ((1 << 20) as f64); + + info!("evictions={evictions_per_s:.2}/s downloads={downloads_per_s:.2}/s download_bytes={downloads_mibs_per_s:.2}MiB/s timeline_restarts={timeline_restarts}"); } } }); @@ -124,14 +156,42 @@ async fn main_impl(args: Args) -> anyhow::Result<()> { args, Arc::clone(&mgmt_api_client), tl, - Arc::clone(&live_stats), + Arc::clone(&periodic_stats), + token.clone(), )); } } + if let Some(runtime) = args.runtime { + tokio::spawn(async move { + tokio::time::sleep(runtime.into()).await; + token.cancel(); + }); + } while let Some(res) = tasks.join_next().await { res.unwrap(); } + let end = Instant::now(); + let duration: Duration = end - start; + + let output = { + let LiveStats { + evictions_count, + downloads_count, + downloads_bytes, + timeline_restarts, + } = &*total_stats; + Output { + downloads_count: downloads_count.load(Ordering::Relaxed), + downloads_bytes: downloads_bytes.load(Ordering::Relaxed), + evictions_count: evictions_count.load(Ordering::Relaxed), + timeline_restarts: timeline_restarts.load(Ordering::Relaxed), + runtime: duration, + } + }; + let output = serde_json::to_string_pretty(&output).unwrap(); + println!("{output}"); + Ok(()) } @@ -140,6 +200,7 @@ async fn timeline_actor( mgmt_api_client: Arc, timeline: TenantTimelineId, live_stats: Arc, + token: CancellationToken, ) { // TODO: support sharding let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id); @@ -149,7 +210,7 @@ async fn timeline_actor( layers: Vec>, concurrency: Arc, } - loop { + while !token.is_cancelled() { debug!("restarting timeline"); let layer_map_info = mgmt_api_client .layer_map_info(tenant_shard_id, timeline.timeline_id) @@ -185,7 +246,7 @@ async fn timeline_actor( live_stats.timeline_restart_done(); - loop { + while !token.is_cancelled() { assert!(!timeline.joinset.is_empty()); if let Some(res) = timeline.joinset.try_join_next() { debug!(?res, "a layer actor exited, should not happen"); @@ -255,7 +316,7 @@ async fn layer_actor( .layer_ondemand_download(tenant_shard_id, timeline_id, layer.layer_file_name()) .await .unwrap(); - live_stats.download_done(); + live_stats.download_done(layer.layer_file_size()); did_it } }; diff --git a/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py b/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py new file mode 100644 index 0000000000..644c1f559b --- /dev/null +++ b/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py @@ -0,0 +1,175 @@ +import json +from pathlib import Path +from typing import Any, Dict, Tuple + +import pytest +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin, wait_for_last_flush_lsn +from fixtures.pageserver.utils import wait_for_upload_queue_empty +from fixtures.remote_storage import s3_storage +from fixtures.utils import humantime_to_ms + + +@pytest.mark.parametrize("duration", [30]) +@pytest.mark.parametrize("io_engine", ["tokio-epoll-uring", "std-fs"]) +@pytest.mark.parametrize("concurrency_per_target", [1, 10, 100]) +@pytest.mark.timeout(1000) +def 
test_download_churn( + neon_env_builder: NeonEnvBuilder, + zenbenchmark: NeonBenchmarker, + pg_bin: PgBin, + io_engine: str, + concurrency_per_target: int, + duration: int, +): + def record(metric, **kwargs): + zenbenchmark.record(metric_name=f"pageserver_ondemand_download_churn.{metric}", **kwargs) + + params: Dict[str, Tuple[Any, Dict[str, Any]]] = {} + + # params from fixtures + params.update( + { + # we don't capture `duration`, but instead use the `runtime` output field from pagebench + } + ) + + # configure cache sizes like in prod + page_cache_size = 16384 + max_file_descriptors = 500000 + neon_env_builder.pageserver_config_override = ( + f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}" + ) + params.update( + { + "pageserver_config_override.page_cache_size": ( + page_cache_size * 8192, + {"unit": "byte"}, + ), + "pageserver_config_override.max_file_descriptors": (max_file_descriptors, {"unit": ""}), + } + ) + + for param, (value, kwargs) in params.items(): + record(param, metric_value=value, report=MetricReport.TEST_PARAM, **kwargs) + + # Setup env + env = setup_env(neon_env_builder, pg_bin) + env.pageserver.allowed_errors.append( + f".*path=/v1/tenant/{env.initial_tenant}/timeline.* request was dropped before completing" + ) + + run_benchmark(env, pg_bin, record, io_engine, concurrency_per_target, duration) + + +def setup_env(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): + remote_storage_kind = s3_storage() + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + + # We configure tenant conf such that SQL query below produces a lot of layers. + # We don't care what's in the layers really, we just care that layers are created. + bytes_per_layer = 10 * (1024**2) + env = neon_env_builder.init_start( + initial_tenant_conf={ + "pitr_interval": "1000d", # let's not make it get in the way + "gc_period": "0s", # disable periodic gc to avoid noise + "compaction_period": "0s", # disable L0=>L1 compaction + "checkpoint_timeout": "10years", # rely solely on checkpoint_distance + "checkpoint_distance": bytes_per_layer, # 10M instead of 256M to create more smaller layers + "image_creation_threshold": 100000, # don't create image layers ever + } + ) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + client = env.pageserver.http_client() + + with env.endpoints.create_start("main", tenant_id=tenant_id) as ep: + ep.safe_psql("CREATE TABLE data (random_text text)") + bytes_per_row = 512 # make big enough so WAL record size doesn't dominate + desired_layers = 300 + desired_bytes = bytes_per_layer * desired_layers + nrows = desired_bytes / bytes_per_row + ep.safe_psql( + f"INSERT INTO data SELECT lpad(i::text, {bytes_per_row}, '0') FROM generate_series(1, {int(nrows)}) as i", + options="-c statement_timeout=0", + ) + wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id) + # TODO: this is a bit imprecise, there could be frozen layers being written out that we don't observe here + wait_for_upload_queue_empty(client, tenant_id, timeline_id) + + return env + + +def run_benchmark( + env: NeonEnv, + pg_bin: PgBin, + record, + io_engine: str, + concurrency_per_target: int, + duration_secs: int, +): + ps_http = env.pageserver.http_client() + cmd = [ + str(env.neon_binpath / "pagebench"), + "ondemand-download-churn", + "--mgmt-api-endpoint", + ps_http.base_url, + "--runtime", + f"{duration_secs}s", + "--set-io-engine", + f"{io_engine}", + "--concurrency-per-target", + f"{concurrency_per_target}", + # don't specify the targets 
explicitly, let pagebench auto-discover them + ] + + log.info(f"command: {' '.join(cmd)}") + basepath = pg_bin.run_capture(cmd, with_command_header=False) + results_path = Path(basepath + ".stdout") + log.info(f"Benchmark results at: {results_path}") + + with open(results_path, "r") as f: + results = json.load(f) + log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}") + + metric = "downloads_count" + record( + metric, + metric_value=results[metric], + unit="", + report=MetricReport.HIGHER_IS_BETTER, + ) + + metric = "downloads_bytes" + record( + metric, + metric_value=results[metric], + unit="byte", + report=MetricReport.HIGHER_IS_BETTER, + ) + + metric = "evictions_count" + record( + metric, + metric_value=results[metric], + unit="", + report=MetricReport.HIGHER_IS_BETTER, + ) + + metric = "timeline_restarts" + record( + metric, + metric_value=results[metric], + unit="", + report=MetricReport.LOWER_IS_BETTER, + ) + + metric = "runtime" + record( + metric, + metric_value=humantime_to_ms(results[metric]) / 1000, + unit="s", + report=MetricReport.TEST_PARAM, + ) From 4b97683338bc21e13686fe4311946b36462729c1 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 15 May 2024 13:17:57 -0400 Subject: [PATCH 0789/1571] feat(pageserver): use fnv hash for aux file encoding (#7742) FNV hash is simple, portable, and stable. This pull request vendors the FNV hash implementation from servo and modified it to use the u128 variant. replaces https://github.com/neondatabase/neon/pull/7644 ref https://github.com/neondatabase/neon/issues/7462 --------- Signed-off-by: Alex Chi Z --- pageserver/src/aux_file.rs | 65 ++++++++++++++++++++++++++------------ 1 file changed, 45 insertions(+), 20 deletions(-) diff --git a/pageserver/src/aux_file.rs b/pageserver/src/aux_file.rs index e6d950487d..38e1875db1 100644 --- a/pageserver/src/aux_file.rs +++ b/pageserver/src/aux_file.rs @@ -5,14 +5,35 @@ use bytes::{Buf, BufMut, Bytes}; use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE}; use tracing::warn; -/// Create a metadata key from a hash, encoded as [AUX_KEY_PREFIX, 2B directory prefix, first 13B of 128b xxhash]. +// BEGIN Copyright (c) 2017 Servo Contributors + +/// Const version of FNV hash. +#[inline] +#[must_use] +pub const fn fnv_hash(bytes: &[u8]) -> u128 { + const INITIAL_STATE: u128 = 0x6c62272e07bb014262b821756295c58d; + const PRIME: u128 = 0x0000000001000000000000000000013B; + + let mut hash = INITIAL_STATE; + let mut i = 0; + while i < bytes.len() { + hash ^= bytes[i] as u128; + hash = hash.wrapping_mul(PRIME); + i += 1; + } + hash +} + +// END Copyright (c) 2017 Servo Contributors + +/// Create a metadata key from a hash, encoded as [AUX_KEY_PREFIX, 2B directory prefix, least significant 13B of FNV hash]. fn aux_hash_to_metadata_key(dir_level1: u8, dir_level2: u8, data: &[u8]) -> Key { - let mut key = [0; METADATA_KEY_SIZE]; - let hash = twox_hash::xxh3::hash128(data).to_be_bytes(); + let mut key: [u8; 16] = [0; METADATA_KEY_SIZE]; + let hash = fnv_hash(data).to_be_bytes(); key[0] = AUX_KEY_PREFIX; key[1] = dir_level1; key[2] = dir_level2; - key[3..16].copy_from_slice(&hash[0..13]); + key[3..16].copy_from_slice(&hash[3..16]); Key::from_metadata_key_fixed_size(&key) } @@ -200,15 +221,19 @@ mod tests { fn test_hash_portable() { // AUX file encoding requires the hash to be portable across all platforms. This test case checks // if the algorithm produces the same hash across different environments. 
+ assert_eq!( - 305317690835051308206966631765527126151, - twox_hash::xxh3::hash128("test1".as_bytes()) + 265160408618497461376862998434862070044, + super::fnv_hash("test1".as_bytes()) ); assert_eq!( - 85104974691013376326742244813280798847, - twox_hash::xxh3::hash128("test/test2".as_bytes()) + 295486155126299629456360817749600553988, + super::fnv_hash("test/test2".as_bytes()) + ); + assert_eq!( + 144066263297769815596495629667062367629, + super::fnv_hash("".as_bytes()) ); - assert_eq!(0, twox_hash::xxh3::hash128("".as_bytes())); } #[test] @@ -216,28 +241,28 @@ mod tests { // To correct retrieve AUX files, the generated keys for the same file must be the same for all versions // of the page server. assert_eq!( - "6200000101E5B20C5F8DD5AA3289D6D9EAFA", - encode_aux_file_key("pg_logical/mappings/test1").to_string() + "62000001017F8B83D94F7081693471ABF91C", + encode_aux_file_key("pg_logical/mappings/test1").to_string(), ); assert_eq!( - "620000010239AAC544893139B26F501B97E6", - encode_aux_file_key("pg_logical/snapshots/test2").to_string() + "62000001027F8E83D94F7081693471ABFCCD", + encode_aux_file_key("pg_logical/snapshots/test2").to_string(), ); assert_eq!( - "620000010300000000000000000000000000", - encode_aux_file_key("pg_logical/replorigin_checkpoint").to_string() + "62000001032E07BB014262B821756295C58D", + encode_aux_file_key("pg_logical/replorigin_checkpoint").to_string(), ); assert_eq!( - "62000001FF8635AF2134B7266EC5B4189FD6", - encode_aux_file_key("pg_logical/unsupported").to_string() + "62000001FF4F38E1C74754E7D03C1A660178", + encode_aux_file_key("pg_logical/unsupported").to_string(), ); assert_eq!( - "6200000201772D0E5D71DE14DA86142A1619", + "62000002017F8D83D94F7081693471ABFB92", encode_aux_file_key("pg_replslot/test3").to_string() ); assert_eq!( - "620000FFFF1866EBEB53B807B26A2416F317", - encode_aux_file_key("other_file_not_supported").to_string() + "620000FFFF2B6ECC8AEF93F643DC44F15E03", + encode_aux_file_key("other_file_not_supported").to_string(), ); } From c6d5ff944db91f498e46fa24eb0d667abdf94dba Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 15 May 2024 14:29:12 -0400 Subject: [PATCH 0790/1571] fix(test): ensure fixtures are correctly used for pageserver_aux_file_policy (#7769) Signed-off-by: Alex Chi Z --- test_runner/fixtures/neon_fixtures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index d3aadbe612..a6fd4792dd 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1393,7 +1393,7 @@ def neon_env_builder( top_output_dir: Path, pageserver_virtual_file_io_engine: str, pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]], - pageserver_aux_file_policy: Optional[AuxFileStore] = None, + pageserver_aux_file_policy: Optional[AuxFileStore], ) -> Iterator[NeonEnvBuilder]: """ Fixture to create a Neon environment for test. From 03c603970748844cbc188f1e0dc6179fa1a1e83d Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 16 May 2024 09:26:34 +0100 Subject: [PATCH 0791/1571] pageserver: refine tenant_id->shard lookup (#7762) ## Problem This is tech debt from when shard splitting was implemented, to handle more nicely the edge case of a client reconnect at the moment of the split. During shard splits, there were edge cases where we could incorrectly return NotFound to a getpage@lsn request, prompting an unwanted reconnect/backoff from the client. 
It is already the case that parent shards during splits are marked InProgress before child shards are created, so `resolve_attached_shard` will not match on them, thereby implicitly preferring child shards (good). However, we were not doing any elegant handling of InProgress in general: `get_active_tenant_with_timeout` was previously mostly dead code: it was inspecting the slot found by `resolve_attached_shard` and maybe waiting for InProgress, but that path is never taken because since ef7c9c2ccc1a385f74455f45b54faa5b101065e6 the resolve function only ever returns attached slots. Closes: https://github.com/neondatabase/neon/issues/7044 ## Summary of changes - Change return value of `resolve_attached_shard` to distinguish between true NotFound case, and the case where we skipped slots that were InProgress. - Rework `get_active_tenant_with_timeout` to loop over calling resolve_attached_shard, waiting if it sees an InProgress result. The resulting behavior during a shard split is: - If we look up a shard early in split when parent is InProgress but children aren't created yet, we'll wait for the parent to be shut down. This corresponds to the part of the split where we wait for LSNs to catch up: so a small delay to the request, but a clean enough handling. - If we look up a shard while child shards are already present, we will match on those shards rather than the parent, as intended. --- pageserver/src/bin/pageserver.rs | 25 ++-- pageserver/src/page_service.rs | 125 +++++++++++------ pageserver/src/tenant/mgr.rs | 222 +++++++++++-------------------- 3 files changed, 176 insertions(+), 196 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index a04195e12b..ba5b2608bd 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -647,17 +647,20 @@ fn start_pageserver( None, "libpq endpoint listener", true, - async move { - page_service::libpq_listener_main( - conf, - broker_client, - pg_auth, - pageserver_listener, - conf.pg_auth_type, - libpq_ctx, - task_mgr::shutdown_token(), - ) - .await + { + let tenant_manager = tenant_manager.clone(); + async move { + page_service::libpq_listener_main( + tenant_manager, + broker_client, + pg_auth, + pageserver_listener, + conf.pg_auth_type, + libpq_ctx, + task_mgr::shutdown_token(), + ) + .await + } }, ); } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index f6b251283c..35aba044b2 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -32,6 +32,7 @@ use std::str; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; +use std::time::Instant; use tokio::io::AsyncWriteExt; use tokio::io::{AsyncRead, AsyncWrite}; use tokio_util::io::StreamReader; @@ -49,7 +50,6 @@ use utils::{ use crate::auth::check_permission; use crate::basebackup; use crate::basebackup::BasebackupError; -use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::import_datadir::import_wal_from_tar; use crate::metrics; @@ -59,13 +59,15 @@ use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; use crate::task_mgr; use crate::task_mgr::TaskKind; -use crate::tenant::mgr; -use crate::tenant::mgr::get_active_tenant_with_timeout; use crate::tenant::mgr::GetActiveTenantError; +use crate::tenant::mgr::GetTenantError; +use crate::tenant::mgr::ShardResolveResult; use crate::tenant::mgr::ShardSelector; +use 
crate::tenant::mgr::TenantManager; use crate::tenant::timeline::WaitLsnError; use crate::tenant::GetTimelineError; use crate::tenant::PageReconstructError; +use crate::tenant::Tenant; use crate::tenant::Timeline; use crate::trace::Tracer; use pageserver_api::key::rel_block_to_key; @@ -135,7 +137,7 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<() /// Listens for connections, and launches a new handler task for each. /// pub async fn libpq_listener_main( - conf: &'static PageServerConf, + tenant_manager: Arc, broker_client: storage_broker::BrokerClientChannel, auth: Option>, listener: TcpListener, @@ -180,7 +182,7 @@ pub async fn libpq_listener_main( "serving compute connection task", false, page_service_conn_main( - conf, + tenant_manager.clone(), broker_client.clone(), local_auth, socket, @@ -203,7 +205,7 @@ pub async fn libpq_listener_main( #[instrument(skip_all, fields(peer_addr))] async fn page_service_conn_main( - conf: &'static PageServerConf, + tenant_manager: Arc, broker_client: storage_broker::BrokerClientChannel, auth: Option>, socket: tokio::net::TcpStream, @@ -260,7 +262,8 @@ async fn page_service_conn_main( // and create a child per-query context when it invokes process_query. // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler // and create the per-query context in process_query ourselves. - let mut conn_handler = PageServerHandler::new(conf, broker_client, auth, connection_ctx); + let mut conn_handler = + PageServerHandler::new(tenant_manager, broker_client, auth, connection_ctx); let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; match pgbackend @@ -291,11 +294,12 @@ struct HandlerTimeline { } struct PageServerHandler { - _conf: &'static PageServerConf, broker_client: storage_broker::BrokerClientChannel, auth: Option>, claims: Option, + tenant_manager: Arc, + /// The context created for the lifetime of the connection /// services by this PageServerHandler. 
/// For each query received over the connection, @@ -381,13 +385,13 @@ impl From for QueryError { impl PageServerHandler { pub fn new( - conf: &'static PageServerConf, + tenant_manager: Arc, broker_client: storage_broker::BrokerClientChannel, auth: Option>, connection_ctx: RequestContext, ) -> Self { PageServerHandler { - _conf: conf, + tenant_manager, broker_client, auth, claims: None, @@ -552,13 +556,9 @@ impl PageServerHandler { { debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); - let tenant = mgr::get_active_tenant_with_timeout( - tenant_id, - ShardSelector::First, - ACTIVE_TENANT_TIMEOUT, - &task_mgr::shutdown_token(), - ) - .await?; + let tenant = self + .get_active_tenant_with_timeout(tenant_id, ShardSelector::First, ACTIVE_TENANT_TIMEOUT) + .await?; // Make request tracer if needed let mut tracer = if tenant.get_trace_read_requests() { @@ -726,13 +726,9 @@ impl PageServerHandler { // Create empty timeline info!("creating new timeline"); - let tenant = get_active_tenant_with_timeout( - tenant_id, - ShardSelector::Zero, - ACTIVE_TENANT_TIMEOUT, - &task_mgr::shutdown_token(), - ) - .await?; + let tenant = self + .get_active_tenant_with_timeout(tenant_id, ShardSelector::Zero, ACTIVE_TENANT_TIMEOUT) + .await?; let timeline = tenant .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx) .await?; @@ -1370,18 +1366,69 @@ impl PageServerHandler { timeline_id: TimelineId, selector: ShardSelector, ) -> Result, GetActiveTimelineError> { - let tenant = get_active_tenant_with_timeout( - tenant_id, - selector, - ACTIVE_TENANT_TIMEOUT, - &task_mgr::shutdown_token(), - ) - .await - .map_err(GetActiveTimelineError::Tenant)?; + let tenant = self + .get_active_tenant_with_timeout(tenant_id, selector, ACTIVE_TENANT_TIMEOUT) + .await + .map_err(GetActiveTimelineError::Tenant)?; let timeline = tenant.get_timeline(timeline_id, true)?; set_tracing_field_shard_id(&timeline); Ok(timeline) } + + /// Get a shard's [`Tenant`] in its active state, if present. If we don't find the shard and some + /// slots for this tenant are `InProgress` then we will wait. + /// If we find the [`Tenant`] and it's not yet in state [`TenantState::Active`], we will wait. + /// + /// `timeout` is used as a total timeout for the whole wait operation. + async fn get_active_tenant_with_timeout( + &self, + tenant_id: TenantId, + shard_selector: ShardSelector, + timeout: Duration, + ) -> Result, GetActiveTenantError> { + let wait_start = Instant::now(); + let deadline = wait_start + timeout; + + // Resolve TenantId to TenantShardId. This is usually a quick one-shot thing, the loop is + // for handling the rare case that the slot we're accessing is InProgress. + let tenant_shard = loop { + let resolved = self + .tenant_manager + .resolve_attached_shard(&tenant_id, shard_selector); + match resolved { + ShardResolveResult::Found(tenant_shard) => break tenant_shard, + ShardResolveResult::NotFound => { + return Err(GetActiveTenantError::NotFound(GetTenantError::NotFound( + tenant_id, + ))); + } + ShardResolveResult::InProgress(barrier) => { + // We can't authoritatively answer right now: wait for InProgress state + // to end, then try again + tokio::select! 
{ + _ = self.await_connection_cancelled() => { + return Err(GetActiveTenantError::Cancelled) + }, + _ = barrier.wait() => { + // The barrier completed: proceed around the loop to try looking up again + }, + _ = tokio::time::sleep(deadline.duration_since(Instant::now())) => { + return Err(GetActiveTenantError::WaitForActiveTimeout { + latest_state: None, + wait_time: timeout, + }); + } + } + } + }; + }; + + tracing::debug!("Waiting for tenant to enter active state..."); + tenant_shard + .wait_to_become_active(deadline.duration_since(Instant::now())) + .await?; + Ok(tenant_shard) + } } #[async_trait::async_trait] @@ -1771,13 +1818,13 @@ where self.check_permission(Some(tenant_id))?; - let tenant = get_active_tenant_with_timeout( - tenant_id, - ShardSelector::Zero, - ACTIVE_TENANT_TIMEOUT, - &task_mgr::shutdown_token(), - ) - .await?; + let tenant = self + .get_active_tenant_with_timeout( + tenant_id, + ShardSelector::Zero, + ACTIVE_TENANT_TIMEOUT, + ) + .await?; pgb.write_message_noflush(&BeMessage::RowDescription(&[ RowDescriptor::int8_col(b"checkpoint_distance"), RowDescriptor::int8_col(b"checkpoint_timeout"), diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 5abda7b64e..1d8e2cf6d3 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -16,10 +16,9 @@ use std::cmp::Ordering; use std::collections::{BTreeMap, HashMap}; use std::ops::Deref; use std::sync::Arc; -use std::time::{Duration, Instant}; +use std::time::Duration; use sysinfo::SystemExt; use tokio::fs; -use utils::timeout::{timeout_cancellable, TimeoutCancellableError}; use anyhow::Context; use once_cell::sync::Lazy; @@ -119,6 +118,7 @@ pub(crate) enum TenantsMapRemoveResult { /// When resolving a TenantId to a shard, we may be looking for the 0th /// shard, or we might be looking for whichever shard holds a particular page. +#[derive(Copy, Clone)] pub(crate) enum ShardSelector { /// Only return the 0th shard, if it is present. If a non-0th shard is present, /// ignore it. @@ -169,6 +169,14 @@ impl TenantStartupMode { } } +/// Result type for looking up a TenantId to a specific shard +pub(crate) enum ShardResolveResult { + NotFound, + Found(Arc), + // Wait for this barrrier, then query again + InProgress(utils::completion::Barrier), +} + impl TenantsMap { /// Convenience function for typical usage, where we want to get a `Tenant` object, for /// working with attached tenants. If the TenantId is in the map but in Secondary state, @@ -182,51 +190,6 @@ impl TenantsMap { } } - /// A page service client sends a TenantId, and to look up the correct Tenant we must - /// resolve this to a fully qualified TenantShardId. - fn resolve_attached_shard( - &self, - tenant_id: &TenantId, - selector: ShardSelector, - ) -> Option { - let mut want_shard = None; - match self { - TenantsMap::Initializing => None, - TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => { - for slot in m.range(TenantShardId::tenant_range(*tenant_id)) { - // Ignore all slots that don't contain an attached tenant - let tenant = match &slot.1 { - TenantSlot::Attached(t) => t, - _ => continue, - }; - - match selector { - ShardSelector::First => return Some(*slot.0), - ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => { - return Some(*slot.0) - } - ShardSelector::Page(key) => { - // First slot we see for this tenant, calculate the expected shard number - // for the key: we will use this for checking if this and subsequent - // slots contain the key, rather than recalculating the hash each time. 
- if want_shard.is_none() { - want_shard = Some(tenant.shard_identity.get_shard_number(&key)); - } - - if Some(tenant.shard_identity.number) == want_shard { - return Some(*slot.0); - } - } - _ => continue, - } - } - - // Fall through: we didn't find an acceptable shard - None - } - } - } - /// Only for use from DeleteTenantFlow. This method directly removes a TenantSlot from the map. /// /// The normal way to remove a tenant is using a SlotGuard, which will gracefully remove the guarded @@ -2053,6 +2016,72 @@ impl TenantManager { Ok(reparented) } + + /// A page service client sends a TenantId, and to look up the correct Tenant we must + /// resolve this to a fully qualified TenantShardId. + /// + /// During shard splits: we shall see parent shards in InProgress state and skip them, and + /// instead match on child shards which should appear in Attached state. Very early in a shard + /// split, or in other cases where a shard is InProgress, we will return our own InProgress result + /// to instruct the caller to wait for that to finish before querying again. + pub(crate) fn resolve_attached_shard( + &self, + tenant_id: &TenantId, + selector: ShardSelector, + ) -> ShardResolveResult { + let tenants = self.tenants.read().unwrap(); + let mut want_shard = None; + let mut any_in_progress = None; + + match &*tenants { + TenantsMap::Initializing => ShardResolveResult::NotFound, + TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => { + for slot in m.range(TenantShardId::tenant_range(*tenant_id)) { + // Ignore all slots that don't contain an attached tenant + let tenant = match &slot.1 { + TenantSlot::Attached(t) => t, + TenantSlot::InProgress(barrier) => { + // We might still find a usable shard, but in case we don't, remember that + // we saw at least one InProgress slot, so that we can distinguish this case + // from a simple NotFound in our return value. + any_in_progress = Some(barrier.clone()); + continue; + } + _ => continue, + }; + + match selector { + ShardSelector::First => return ShardResolveResult::Found(tenant.clone()), + ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => { + return ShardResolveResult::Found(tenant.clone()) + } + ShardSelector::Page(key) => { + // First slot we see for this tenant, calculate the expected shard number + // for the key: we will use this for checking if this and subsequent + // slots contain the key, rather than recalculating the hash each time. + if want_shard.is_none() { + want_shard = Some(tenant.shard_identity.get_shard_number(&key)); + } + + if Some(tenant.shard_identity.number) == want_shard { + return ShardResolveResult::Found(tenant.clone()); + } + } + _ => continue, + } + } + + // Fall through: we didn't find a slot that was in Attached state & matched our selector. If + // we found one or more InProgress slot, indicate to caller that they should retry later. Otherwise + // this requested shard simply isn't found. + if let Some(barrier) = any_in_progress { + ShardResolveResult::InProgress(barrier) + } else { + ShardResolveResult::NotFound + } + } + } + } } #[derive(Debug, thiserror::Error)] @@ -2101,105 +2130,6 @@ pub(crate) enum GetActiveTenantError { Broken(String), } -/// Get a [`Tenant`] in its active state. If the tenant_id is currently in [`TenantSlot::InProgress`] -/// state, then wait for up to `timeout`. If the [`Tenant`] is not currently in [`TenantState::Active`], -/// then wait for up to `timeout` (minus however long we waited for the slot). 
-pub(crate) async fn get_active_tenant_with_timeout( - tenant_id: TenantId, - shard_selector: ShardSelector, - timeout: Duration, - cancel: &CancellationToken, -) -> Result, GetActiveTenantError> { - enum WaitFor { - Barrier(utils::completion::Barrier), - Tenant(Arc), - } - - let wait_start = Instant::now(); - let deadline = wait_start + timeout; - - let (wait_for, tenant_shard_id) = { - let locked = TENANTS.read().unwrap(); - - // Resolve TenantId to TenantShardId - let tenant_shard_id = locked - .resolve_attached_shard(&tenant_id, shard_selector) - .ok_or(GetActiveTenantError::NotFound(GetTenantError::NotFound( - tenant_id, - )))?; - - let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read) - .map_err(GetTenantError::MapState)?; - match peek_slot { - Some(TenantSlot::Attached(tenant)) => { - match tenant.current_state() { - TenantState::Active => { - // Fast path: we don't need to do any async waiting. - return Ok(tenant.clone()); - } - _ => { - tenant.activate_now(); - (WaitFor::Tenant(tenant.clone()), tenant_shard_id) - } - } - } - Some(TenantSlot::Secondary(_)) => { - return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive( - tenant_shard_id, - ))) - } - Some(TenantSlot::InProgress(barrier)) => { - (WaitFor::Barrier(barrier.clone()), tenant_shard_id) - } - None => { - return Err(GetActiveTenantError::NotFound(GetTenantError::NotFound( - tenant_id, - ))) - } - } - }; - - let tenant = match wait_for { - WaitFor::Barrier(barrier) => { - tracing::debug!("Waiting for tenant InProgress state to pass..."); - timeout_cancellable( - deadline.duration_since(Instant::now()), - cancel, - barrier.wait(), - ) - .await - .map_err(|e| match e { - TimeoutCancellableError::Timeout => GetActiveTenantError::WaitForActiveTimeout { - latest_state: None, - wait_time: wait_start.elapsed(), - }, - TimeoutCancellableError::Cancelled => GetActiveTenantError::Cancelled, - })?; - { - let locked = TENANTS.read().unwrap(); - let peek_slot = - tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read) - .map_err(GetTenantError::MapState)?; - match peek_slot { - Some(TenantSlot::Attached(tenant)) => tenant.clone(), - _ => { - return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive( - tenant_shard_id, - ))) - } - } - } - } - WaitFor::Tenant(tenant) => tenant, - }; - - tracing::debug!("Waiting for tenant to enter active state..."); - tenant - .wait_to_become_active(deadline.duration_since(Instant::now())) - .await?; - Ok(tenant) -} - #[derive(Debug, thiserror::Error)] pub(crate) enum DeleteTimelineError { #[error("Tenant {0}")] From 923cf91aa4c1986c47fba0158fff0ac9bf8225ce Mon Sep 17 00:00:00 2001 From: Andrew Rudenko Date: Thu, 16 May 2024 12:04:16 +0200 Subject: [PATCH 0792/1571] compute_ctl: catalog API endpoints (#7575) ## Problem There are two cloud's features that require extra compute endpoints. 1. We are running pg_dump to get DB schemas. Currently, we are using a special service for this. But it would be great to execute pg_dump in an isolated environment. And we already have such an environment, it's our compute! And likely enough pg_dump already exists there too! (see https://github.com/neondatabase/cloud/issues/11644#issuecomment-2084617832) 2. We need to have a way to get databases and roles from compute after time travel (see https://github.com/neondatabase/cloud/issues/12109) ## Summary of changes It adds two API endpoints to compute_ctl HTTP API that target both of the aforementioned cases. 
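As a rough usage illustration (a sketch, not part of this patch): the endpoint paths and the `database` query parameter are the ones added here, while the base URL/port and the use of the `requests` library are assumptions about the local setup.

```python
# Sketch of exercising the new compute_ctl HTTP endpoints. The base URL is an
# assumption: point it at the compute's external HTTP API address.
import requests

base = "http://localhost:3080"

# List databases and roles from the compute's catalog.
resp = requests.get(f"{base}/dbs_and_roles")
resp.raise_for_status()
objects = resp.json()
print([db["name"] for db in objects["databases"]])

# Stream a schema-only pg_dump of one database; returns 404 if it does not exist.
schema = requests.get(f"{base}/database_schema", params={"database": "postgres"})
schema.raise_for_status()
print(schema.text[:200])
```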
--------- Co-authored-by: Tristan Partin --- Cargo.lock | 2 + compute_tools/Cargo.toml | 2 + compute_tools/src/catalog.rs | 116 ++++++++++++++++++++ compute_tools/src/http/api.rs | 47 ++++++++ compute_tools/src/http/openapi_spec.yaml | 112 +++++++++++++++++++ compute_tools/src/lib.rs | 1 + libs/compute_api/src/responses.rs | 8 +- test_runner/fixtures/endpoint/__init__.py | 0 test_runner/fixtures/endpoint/http.py | 23 ++++ test_runner/fixtures/neon_fixtures.py | 8 ++ test_runner/regress/test_compute_catalog.py | 34 ++++++ 11 files changed, 352 insertions(+), 1 deletion(-) create mode 100644 compute_tools/src/catalog.rs create mode 100644 test_runner/fixtures/endpoint/__init__.py create mode 100644 test_runner/fixtures/endpoint/http.py create mode 100644 test_runner/regress/test_compute_catalog.py diff --git a/Cargo.lock b/Cargo.lock index 961101b151..b1f53404ea 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1233,8 +1233,10 @@ dependencies = [ "serde_json", "signal-hook", "tar", + "thiserror", "tokio", "tokio-postgres", + "tokio-stream", "tokio-util", "toml_edit", "tracing", diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 759a117ee9..8f96530a9d 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -27,10 +27,12 @@ reqwest = { workspace = true, features = ["json"] } tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tokio-postgres.workspace = true tokio-util.workspace = true +tokio-stream.workspace = true tracing.workspace = true tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true tracing-utils.workspace = true +thiserror.workspace = true url.workspace = true compute_api.workspace = true diff --git a/compute_tools/src/catalog.rs b/compute_tools/src/catalog.rs new file mode 100644 index 0000000000..4fefa831e0 --- /dev/null +++ b/compute_tools/src/catalog.rs @@ -0,0 +1,116 @@ +use compute_api::{ + responses::CatalogObjects, + spec::{Database, Role}, +}; +use futures::Stream; +use postgres::{Client, NoTls}; +use std::{path::Path, process::Stdio, result::Result, sync::Arc}; +use tokio::{ + io::{AsyncBufReadExt, BufReader}, + process::Command, + task, +}; +use tokio_stream::{self as stream, StreamExt}; +use tokio_util::codec::{BytesCodec, FramedRead}; +use tracing::warn; + +use crate::{ + compute::ComputeNode, + pg_helpers::{get_existing_dbs, get_existing_roles}, +}; + +pub async fn get_dbs_and_roles(compute: &Arc) -> anyhow::Result { + let connstr = compute.connstr.clone(); + task::spawn_blocking(move || { + let mut client = Client::connect(connstr.as_str(), NoTls)?; + let roles: Vec; + { + let mut xact = client.transaction()?; + roles = get_existing_roles(&mut xact)?; + } + let databases: Vec = get_existing_dbs(&mut client)?.values().cloned().collect(); + + Ok(CatalogObjects { roles, databases }) + }) + .await? +} + +#[derive(Debug, thiserror::Error)] +pub enum SchemaDumpError { + #[error("Database does not exist.")] + DatabaseDoesNotExist, + #[error("Failed to execute pg_dump.")] + IO(#[from] std::io::Error), +} + +// It uses the pg_dump utility to dump the schema of the specified database. +// The output is streamed back to the caller and supposed to be streamed via HTTP. +// +// Before return the result with the output, it checks that pg_dump produced any output. +// If not, it tries to parse the stderr output to determine if the database does not exist +// and special error is returned. +// +// To make sure that the process is killed when the caller drops the stream, we use tokio kill_on_drop feature. 
+pub async fn get_database_schema( + compute: &Arc, + dbname: &str, +) -> Result>, SchemaDumpError> { + let pgbin = &compute.pgbin; + let basepath = Path::new(pgbin).parent().unwrap(); + let pgdump = basepath.join("pg_dump"); + let mut connstr = compute.connstr.clone(); + connstr.set_path(dbname); + let mut cmd = Command::new(pgdump) + .arg("--schema-only") + .arg(connstr.as_str()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .kill_on_drop(true) + .spawn()?; + + let stdout = cmd.stdout.take().ok_or_else(|| { + std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stdout.") + })?; + + let stderr = cmd.stderr.take().ok_or_else(|| { + std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stderr.") + })?; + + let mut stdout_reader = FramedRead::new(stdout, BytesCodec::new()); + let stderr_reader = BufReader::new(stderr); + + let first_chunk = match stdout_reader.next().await { + Some(Ok(bytes)) if !bytes.is_empty() => bytes, + Some(Err(e)) => { + return Err(SchemaDumpError::IO(e)); + } + _ => { + let mut lines = stderr_reader.lines(); + if let Some(line) = lines.next_line().await? { + if line.contains(&format!("FATAL: database \"{}\" does not exist", dbname)) { + return Err(SchemaDumpError::DatabaseDoesNotExist); + } + warn!("pg_dump stderr: {}", line) + } + tokio::spawn(async move { + while let Ok(Some(line)) = lines.next_line().await { + warn!("pg_dump stderr: {}", line) + } + }); + + return Err(SchemaDumpError::IO(std::io::Error::new( + std::io::ErrorKind::Other, + "failed to start pg_dump", + ))); + } + }; + let initial_stream = stream::once(Ok(first_chunk.freeze())); + // Consume stderr and log warnings + tokio::spawn(async move { + let mut lines = stderr_reader.lines(); + while let Ok(Some(line)) = lines.next_line().await { + warn!("pg_dump stderr: {}", line) + } + }); + Ok(initial_stream.chain(stdout_reader.map(|res| res.map(|b| b.freeze())))) +} diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index 128783b477..0286429cf2 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -5,17 +5,21 @@ use std::net::SocketAddr; use std::sync::Arc; use std::thread; +use crate::catalog::SchemaDumpError; +use crate::catalog::{get_database_schema, get_dbs_and_roles}; use crate::compute::forward_termination_signal; use crate::compute::{ComputeNode, ComputeState, ParsedSpec}; use compute_api::requests::ConfigurationRequest; use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError}; use anyhow::Result; +use hyper::header::CONTENT_TYPE; use hyper::service::{make_service_fn, service_fn}; use hyper::{Body, Method, Request, Response, Server, StatusCode}; use tokio::task; use tracing::{error, info, warn}; use tracing_utils::http::OtelName; +use utils::http::request::must_get_query_param; fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse { ComputeStatusResponse { @@ -133,6 +137,34 @@ async fn routes(req: Request, compute: &Arc) -> Response { + info!("serving /dbs_and_roles GET request",); + match get_dbs_and_roles(compute).await { + Ok(res) => render_json(Body::from(serde_json::to_string(&res).unwrap())), + Err(_) => { + render_json_error("can't get dbs and roles", StatusCode::INTERNAL_SERVER_ERROR) + } + } + } + + (&Method::GET, "/database_schema") => { + let database = match must_get_query_param(&req, "database") { + Err(e) => return e.into_response(), + Ok(database) => database, + }; + info!("serving /database_schema GET request with database: {database}",); + match 
get_database_schema(compute, &database).await { + Ok(res) => render_plain(Body::wrap_stream(res)), + Err(SchemaDumpError::DatabaseDoesNotExist) => { + render_json_error("database does not exist", StatusCode::NOT_FOUND) + } + Err(e) => { + error!("can't get schema dump: {}", e); + render_json_error("can't get schema dump", StatusCode::INTERNAL_SERVER_ERROR) + } + } + } + // download extension files from remote extension storage on demand (&Method::POST, route) if route.starts_with("/extension_server/") => { info!("serving {:?} POST request", route); @@ -303,10 +335,25 @@ fn render_json_error(e: &str, status: StatusCode) -> Response { }; Response::builder() .status(status) + .header(CONTENT_TYPE, "application/json") .body(Body::from(serde_json::to_string(&error).unwrap())) .unwrap() } +fn render_json(body: Body) -> Response { + Response::builder() + .header(CONTENT_TYPE, "application/json") + .body(body) + .unwrap() +} + +fn render_plain(body: Body) -> Response { + Response::builder() + .header(CONTENT_TYPE, "text/plain") + .body(body) + .unwrap() +} + async fn handle_terminate_request(compute: &Arc) -> Result<(), (String, StatusCode)> { { let mut state = compute.state.lock().unwrap(); diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index d2ec54299f..b0ddaeae2b 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -68,6 +68,51 @@ paths: schema: $ref: "#/components/schemas/Info" + /dbs_and_roles: + get: + tags: + - Info + summary: Get databases and roles in the catalog. + description: "" + operationId: getDbsAndRoles + responses: + 200: + description: Compute schema objects + content: + application/json: + schema: + $ref: "#/components/schemas/DbsAndRoles" + + /database_schema: + get: + tags: + - Info + summary: Get schema dump + parameters: + - name: database + in: query + description: Database name to dump. + required: true + schema: + type: string + example: "postgres" + description: Get schema dump in SQL format. + operationId: getDatabaseSchema + responses: + 200: + description: Schema dump + content: + text/plain: + schema: + type: string + description: Schema dump in SQL format. + 404: + description: Non existing database. 
+ content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" + /check_writability: post: tags: @@ -229,6 +274,73 @@ components: num_cpus: type: integer + DbsAndRoles: + type: object + description: Databases and Roles + required: + - roles + - databases + properties: + roles: + type: array + items: + $ref: "#/components/schemas/Role" + databases: + type: array + items: + $ref: "#/components/schemas/Database" + + Database: + type: object + description: Database + required: + - name + - owner + - restrict_conn + - invalid + properties: + name: + type: string + owner: + type: string + options: + type: array + items: + $ref: "#/components/schemas/GenericOption" + restrict_conn: + type: boolean + invalid: + type: boolean + + Role: + type: object + description: Role + required: + - name + properties: + name: + type: string + encrypted_password: + type: string + options: + type: array + items: + $ref: "#/components/schemas/GenericOption" + + GenericOption: + type: object + description: Schema Generic option + required: + - name + - vartype + properties: + name: + type: string + value: + type: string + vartype: + type: string + ComputeState: type: object required: diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index eac808385c..18c228ba54 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -8,6 +8,7 @@ pub mod configurator; pub mod http; #[macro_use] pub mod logger; +pub mod catalog; pub mod compute; pub mod extension_server; pub mod monitor; diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index fd0c90d447..d05d625b0a 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -3,7 +3,7 @@ use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize, Serializer}; -use crate::spec::ComputeSpec; +use crate::spec::{ComputeSpec, Database, Role}; #[derive(Serialize, Debug, Deserialize)] pub struct GenericAPIError { @@ -113,6 +113,12 @@ pub struct ComputeMetrics { pub total_ext_download_size: u64, } +#[derive(Clone, Debug, Default, Serialize)] +pub struct CatalogObjects { + pub roles: Vec, + pub databases: Vec, +} + /// Response of the `/computes/{compute_id}/spec` control-plane API. /// This is not actually a compute API response, so consider moving /// to a different place. 
diff --git a/test_runner/fixtures/endpoint/__init__.py b/test_runner/fixtures/endpoint/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py new file mode 100644 index 0000000000..42f0539c19 --- /dev/null +++ b/test_runner/fixtures/endpoint/http.py @@ -0,0 +1,23 @@ +import requests +from requests.adapters import HTTPAdapter + + +class EndpointHttpClient(requests.Session): + def __init__( + self, + port: int, + ): + super().__init__() + self.port = port + + self.mount("http://", HTTPAdapter()) + + def dbs_and_roles(self): + res = self.get(f"http://localhost:{self.port}/dbs_and_roles") + res.raise_for_status() + return res.json() + + def database_schema(self, database: str): + res = self.get(f"http://localhost:{self.port}/database_schema?database={database}") + res.raise_for_status() + return res.text diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index a6fd4792dd..b4761f103b 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -48,6 +48,7 @@ from urllib3.util.retry import Retry from fixtures import overlayfs from fixtures.broker import NeonBroker from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId +from fixtures.endpoint.http import EndpointHttpClient from fixtures.log_helper import log from fixtures.metrics import Metrics, MetricsGetter, parse_metrics from fixtures.pageserver.allowed_errors import ( @@ -3373,6 +3374,13 @@ class Endpoint(PgProtocol): self.active_safekeepers: List[int] = list(map(lambda sk: sk.id, env.safekeepers)) # path to conf is /endpoints//pgdata/postgresql.conf + def http_client( + self, auth_token: Optional[str] = None, retries: Optional[Retry] = None + ) -> EndpointHttpClient: + return EndpointHttpClient( + port=self.http_port, + ) + def create( self, branch_name: str, diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py new file mode 100644 index 0000000000..dd36190fcd --- /dev/null +++ b/test_runner/regress/test_compute_catalog.py @@ -0,0 +1,34 @@ +import requests +from fixtures.neon_fixtures import NeonEnv + + +def test_compute_catalog(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("test_config", "empty") + + endpoint = env.endpoints.create_start("test_config", config_lines=["log_min_messages=debug1"]) + client = endpoint.http_client() + + objects = client.dbs_and_roles() + + # Assert that 'cloud_admin' role exists in the 'roles' list + assert any( + role["name"] == "cloud_admin" for role in objects["roles"] + ), "The 'cloud_admin' role is missing" + + # Assert that 'postgres' database exists in the 'databases' list + assert any( + db["name"] == "postgres" for db in objects["databases"] + ), "The 'postgres' database is missing" + + ddl = client.database_schema(database="postgres") + + assert "-- PostgreSQL database dump" in ddl + + try: + client.database_schema(database="nonexistentdb") + raise AssertionError("Expected HTTPError was not raised") + except requests.exceptions.HTTPError as e: + assert ( + e.response.status_code == 404 + ), f"Expected 404 status code, but got {e.response.status_code}" From 790c05d67543a2f183193eff66de4e90dbe2e7f9 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 16 May 2024 12:05:50 +0100 Subject: [PATCH 0793/1571] proxy: swap tungstenite for a simpler impl (#7353) ## Problem I wanted to do a deep dive of the tungstenite codebase. 
tokio-tungstenite is incredibly convoluted... In my searching I found [fastwebsockets by deno](https://github.com/denoland/fastwebsockets), but it wasn't quite sufficient. This also removes the default 16MB/64MB frame/message size limitation. framed-websockets solves this by inserting continuation frames for partially received messages, so the whole message does not need to be entirely read into memory. ## Summary of changes I took the fastwebsockets code as a starting off point and rewrote it to be simpler, server-only, and be poll-based to support our Read/Write wrappers. I have replaced our tungstenite code with my framed-websockets fork. --- Cargo.lock | 109 ++++++++----------- Cargo.toml | 7 +- proxy/Cargo.toml | 4 +- proxy/src/serverless.rs | 9 +- proxy/src/serverless/websocket.rs | 93 ++++++++-------- test_runner/regress/test_proxy_websockets.py | 9 +- 6 files changed, 107 insertions(+), 124 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b1f53404ea..e1edd53fea 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -708,7 +708,7 @@ dependencies = [ "sha1", "sync_wrapper", "tokio", - "tokio-tungstenite 0.20.0", + "tokio-tungstenite", "tower", "tower-layer", "tower-service", @@ -979,6 +979,12 @@ version = "3.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1" +[[package]] +name = "bytemuck" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78834c15cb5d5efe3452d58b1e8ba890dd62d21907f867f383358198e56ebca5" + [[package]] name = "byteorder" version = "1.4.3" @@ -1598,7 +1604,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6943ae99c34386c84a470c499d3414f66502a41340aa895406e0d2e4a207b91d" dependencies = [ "cfg-if", - "hashbrown 0.14.0", + "hashbrown 0.14.5", "lock_api", "once_cell", "parking_lot_core 0.9.8", @@ -1999,6 +2005,27 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "framed-websockets" +version = "0.1.0" +source = "git+https://github.com/neondatabase/framed-websockets#34eff3d6f8cfccbc5f35e4f65314ff7328621127" +dependencies = [ + "base64 0.21.1", + "bytemuck", + "bytes", + "futures-core", + "futures-sink", + "http-body-util", + "hyper 1.2.0", + "hyper-util", + "pin-project", + "rand 0.8.5", + "sha1", + "thiserror", + "tokio", + "tokio-util", +] + [[package]] name = "fs2" version = "0.4.3" @@ -2277,9 +2304,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.14.0" +version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" dependencies = [ "ahash", "allocator-api2", @@ -2287,11 +2314,11 @@ dependencies = [ [[package]] name = "hashlink" -version = "0.8.4" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8094feaf31ff591f651a2664fb9cfd92bba7a60ce3197265e9482ebe753c8f7" +checksum = "6ba4ff7128dee98c7dc9794b6a411377e1404dba1c97deb8d1a55297bd25d8af" dependencies = [ - "hashbrown 0.14.0", + "hashbrown 0.14.5", ] [[package]] @@ -2600,21 +2627,6 @@ dependencies = [ "tokio-native-tls", ] -[[package]] -name = "hyper-tungstenite" -version = "0.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a343d17fe7885302ed7252767dc7bb83609a874b6ff581142241ec4b73957ad" -dependencies = [ - "http-body-util", - "hyper 1.2.0", - "hyper-util", - 
"pin-project-lite", - "tokio", - "tokio-tungstenite 0.21.0", - "tungstenite 0.21.0", -] - [[package]] name = "hyper-util" version = "0.1.3" @@ -2692,7 +2704,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ad227c3af19d4914570ad36d30409928b75967c298feb9ea1969db3a610bb14e" dependencies = [ "equivalent", - "hashbrown 0.14.0", + "hashbrown 0.14.5", ] [[package]] @@ -2954,7 +2966,7 @@ version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3262e75e648fce39813cb56ac41f3c3e3f65217ebf3844d818d1f9398cfb0dc" dependencies = [ - "hashbrown 0.14.0", + "hashbrown 0.14.5", ] [[package]] @@ -3007,7 +3019,7 @@ checksum = "652bc741286361c06de8cb4d89b21a6437f120c508c51713663589eeb9928ac5" dependencies = [ "bytes", "crossbeam-utils", - "hashbrown 0.14.0", + "hashbrown 0.14.5", "itoa", "lasso", "measured-derive", @@ -3569,7 +3581,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49203cdcae0030493bad186b28da2fa25645fa276a51b6fec8010d281e02ef79" dependencies = [ "dlv-list", - "hashbrown 0.14.0", + "hashbrown 0.14.5", ] [[package]] @@ -3896,7 +3908,7 @@ dependencies = [ "ahash", "bytes", "chrono", - "hashbrown 0.14.0", + "hashbrown 0.14.5", "num", "num-bigint", "paste", @@ -4380,9 +4392,10 @@ dependencies = [ "dashmap", "env_logger", "fallible-iterator", + "framed-websockets", "futures", "git-version", - "hashbrown 0.13.2", + "hashbrown 0.14.5", "hashlink", "hex", "hmac", @@ -4392,7 +4405,6 @@ dependencies = [ "humantime", "hyper 0.14.26", "hyper 1.2.0", - "hyper-tungstenite", "hyper-util", "indexmap 2.0.1", "ipnet", @@ -4437,7 +4449,6 @@ dependencies = [ "smol_str", "socket2 0.5.5", "subtle", - "sync_wrapper", "task-local-extensions", "thiserror", "tikv-jemalloc-ctl", @@ -4446,6 +4457,7 @@ dependencies = [ "tokio-postgres", "tokio-postgres-rustls", "tokio-rustls 0.25.0", + "tokio-tungstenite", "tokio-util", "tower-service", "tracing", @@ -6382,19 +6394,7 @@ dependencies = [ "futures-util", "log", "tokio", - "tungstenite 0.20.1", -] - -[[package]] -name = "tokio-tungstenite" -version = "0.21.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c83b561d025642014097b66e6c1bb422783339e0909e4429cde4749d1990bc38" -dependencies = [ - "futures-util", - "log", - "tokio", - "tungstenite 0.21.0", + "tungstenite", ] [[package]] @@ -6408,7 +6408,7 @@ dependencies = [ "futures-io", "futures-sink", "futures-util", - "hashbrown 0.14.0", + "hashbrown 0.14.5", "pin-project-lite", "tokio", "tracing", @@ -6690,25 +6690,6 @@ dependencies = [ "utf-8", ] -[[package]] -name = "tungstenite" -version = "0.21.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ef1a641ea34f399a848dea702823bbecfb4c486f911735368f1f137cb8257e1" -dependencies = [ - "byteorder", - "bytes", - "data-encoding", - "http 1.1.0", - "httparse", - "log", - "rand 0.8.5", - "sha1", - "thiserror", - "url", - "utf-8", -] - [[package]] name = "twox-hash" version = "1.6.3" @@ -7504,7 +7485,7 @@ dependencies = [ "futures-sink", "futures-util", "getrandom 0.2.11", - "hashbrown 0.14.0", + "hashbrown 0.14.5", "hex", "hmac", "hyper 0.14.26", diff --git a/Cargo.toml b/Cargo.toml index 3ccdabee18..b59a5dcd6d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -81,13 +81,14 @@ enum-map = "2.4.2" enumset = "1.0.12" fail = "0.5.0" fallible-iterator = "0.2" +framed-websockets = { version = "0.1.0", git = "https://github.com/neondatabase/framed-websockets" } fs2 = "0.4.3" futures = "0.3" futures-core = "0.3" futures-util = "0.3" 
git-version = "0.3" -hashbrown = "0.13" -hashlink = "0.8.4" +hashbrown = "0.14" +hashlink = "0.9.1" hdrhistogram = "7.5.2" hex = "0.4" hex-literal = "0.4" @@ -98,7 +99,7 @@ http-types = { version = "2", default-features = false } humantime = "2.1" humantime-serde = "1.1.1" hyper = "0.14" -hyper-tungstenite = "0.13.0" +tokio-tungstenite = "0.20.0" indexmap = "2" inotify = "0.10.2" ipnet = "2.9.0" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 3002006aed..5f9b0aa75b 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -26,6 +26,7 @@ clap.workspace = true consumption_metrics.workspace = true dashmap.workspace = true env_logger.workspace = true +framed-websockets.workspace = true futures.workspace = true git-version.workspace = true hashbrown.workspace = true @@ -35,7 +36,6 @@ hmac.workspace = true hostname.workspace = true http.workspace = true humantime.workspace = true -hyper-tungstenite.workspace = true hyper.workspace = true hyper1 = { package = "hyper", version = "1.2", features = ["server"] } hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] } @@ -76,7 +76,6 @@ smol_str.workspace = true smallvec.workspace = true socket2.workspace = true subtle.workspace = true -sync_wrapper.workspace = true task-local-extensions.workspace = true thiserror.workspace = true tikv-jemallocator.workspace = true @@ -106,6 +105,7 @@ workspace_hack.workspace = true [dev-dependencies] camino-tempfile.workspace = true fallible-iterator.workspace = true +tokio-tungstenite.workspace = true rcgen.workspace = true rstest.workspace = true tokio-postgres-rustls.workspace = true diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index f634ab4e98..24ee749e6e 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -102,7 +102,7 @@ pub async fn task_main( let connections = tokio_util::task::task_tracker::TaskTracker::new(); connections.close(); // allows `connections.wait to complete` - let server = Builder::new(hyper_util::rt::TokioExecutor::new()); + let server = Builder::new(TokioExecutor::new()); while let Some(res) = run_until_cancelled(ws_listener.accept(), &cancellation_token).await { let (conn, peer_addr) = res.context("could not accept TCP stream")?; @@ -255,7 +255,6 @@ async fn connection_handler( .in_current_span() .map_ok_or_else(api_error_into_response, |r| r), ); - async move { let res = handler.await; cancel_request.disarm(); @@ -301,7 +300,7 @@ async fn request_handler( .map(|s| s.to_string()); // Check if the request is a websocket upgrade request. - if hyper_tungstenite::is_upgrade_request(&request) { + if framed_websockets::upgrade::is_upgrade_request(&request) { let ctx = RequestMonitoring::new( session_id, peer_addr, @@ -312,7 +311,7 @@ async fn request_handler( let span = ctx.span.clone(); info!(parent: &span, "performing websocket upgrade"); - let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None) + let (response, websocket) = framed_websockets::upgrade::upgrade(&mut request) .map_err(|e| ApiError::BadRequest(e.into()))?; ws_connections.spawn( @@ -334,7 +333,7 @@ async fn request_handler( ); // Return the response so the spawned future can continue. 
- Ok(response) + Ok(response.map(|_: http_body_util::Empty| Full::new(Bytes::new()))) } else if request.uri().path() == "/sql" && *request.method() == Method::POST { let ctx = RequestMonitoring::new( session_id, diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index 649bec2c7c..61d6d60dbe 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -7,10 +7,11 @@ use crate::{ proxy::{handle_client, ClientMode}, rate_limiter::EndpointRateLimiter, }; -use bytes::{Buf, Bytes}; +use bytes::{Buf, BufMut, Bytes, BytesMut}; +use framed_websockets::{Frame, OpCode, WebSocketServer}; use futures::{Sink, Stream}; -use hyper::upgrade::Upgraded; -use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream}; +use hyper1::upgrade::OnUpgrade; +use hyper_util::rt::TokioIo; use pin_project_lite::pin_project; use std::{ @@ -21,25 +22,23 @@ use std::{ use tokio::io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf}; use tracing::warn; -// TODO: use `std::sync::Exclusive` once it's stabilized. -// Tracking issue: https://github.com/rust-lang/rust/issues/98407. -use sync_wrapper::SyncWrapper; - pin_project! { /// This is a wrapper around a [`WebSocketStream`] that /// implements [`AsyncRead`] and [`AsyncWrite`]. - pub struct WebSocketRw { + pub struct WebSocketRw { #[pin] - stream: SyncWrapper>, - bytes: Bytes, + stream: WebSocketServer, + recv: Bytes, + send: BytesMut, } } impl WebSocketRw { - pub fn new(stream: WebSocketStream) -> Self { + pub fn new(stream: WebSocketServer) -> Self { Self { - stream: stream.into(), - bytes: Bytes::new(), + stream, + recv: Bytes::new(), + send: BytesMut::new(), } } } @@ -50,22 +49,24 @@ impl AsyncWrite for WebSocketRw { cx: &mut Context<'_>, buf: &[u8], ) -> Poll> { - let mut stream = self.project().stream.get_pin_mut(); + let this = self.project(); + let mut stream = this.stream; + this.send.put(buf); ready!(stream.as_mut().poll_ready(cx).map_err(io_error))?; - match stream.as_mut().start_send(Message::Binary(buf.into())) { + match stream.as_mut().start_send(Frame::binary(this.send.split())) { Ok(()) => Poll::Ready(Ok(buf.len())), Err(e) => Poll::Ready(Err(io_error(e))), } } fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - let stream = self.project().stream.get_pin_mut(); + let stream = self.project().stream; stream.poll_flush(cx).map_err(io_error) } fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - let stream = self.project().stream.get_pin_mut(); + let stream = self.project().stream; stream.poll_close(cx).map_err(io_error) } } @@ -76,13 +77,10 @@ impl AsyncRead for WebSocketRw { cx: &mut Context<'_>, buf: &mut ReadBuf<'_>, ) -> Poll> { - if buf.remaining() > 0 { - let bytes = ready!(self.as_mut().poll_fill_buf(cx))?; - let len = std::cmp::min(bytes.len(), buf.remaining()); - buf.put_slice(&bytes[..len]); - self.consume(len); - } - + let bytes = ready!(self.as_mut().poll_fill_buf(cx))?; + let len = std::cmp::min(bytes.len(), buf.remaining()); + buf.put_slice(&bytes[..len]); + self.consume(len); Poll::Ready(Ok(())) } } @@ -94,31 +92,27 @@ impl AsyncBufRead for WebSocketRw { let mut this = self.project(); loop { - if !this.bytes.chunk().is_empty() { - let chunk = (*this.bytes).chunk(); + if !this.recv.chunk().is_empty() { + let chunk = (*this.recv).chunk(); return Poll::Ready(Ok(chunk)); } - let res = ready!(this.stream.as_mut().get_pin_mut().poll_next(cx)); + let res = ready!(this.stream.as_mut().poll_next(cx)); match 
res.transpose().map_err(io_error)? { - Some(message) => match message { - Message::Ping(_) => {} - Message::Pong(_) => {} - Message::Text(text) => { + Some(message) => match message.opcode { + OpCode::Ping => {} + OpCode::Pong => {} + OpCode::Text => { // We expect to see only binary messages. let error = "unexpected text message in the websocket"; - warn!(length = text.len(), error); + warn!(length = message.payload.len(), error); return Poll::Ready(Err(io_error(error))); } - Message::Frame(_) => { - // This case is impossible according to Frame's doc. - panic!("unexpected raw frame in the websocket"); + OpCode::Binary | OpCode::Continuation => { + debug_assert!(this.recv.is_empty()); + *this.recv = message.payload.freeze(); } - Message::Binary(chunk) => { - assert!(this.bytes.is_empty()); - *this.bytes = Bytes::from(chunk); - } - Message::Close(_) => return EOF, + OpCode::Close => return EOF, }, None => return EOF, } @@ -126,19 +120,21 @@ impl AsyncBufRead for WebSocketRw { } fn consume(self: Pin<&mut Self>, amount: usize) { - self.project().bytes.advance(amount); + self.project().recv.advance(amount); } } pub async fn serve_websocket( config: &'static ProxyConfig, mut ctx: RequestMonitoring, - websocket: HyperWebsocket, + websocket: OnUpgrade, cancellation_handler: Arc, endpoint_rate_limiter: Arc, hostname: Option, ) -> anyhow::Result<()> { let websocket = websocket.await?; + let websocket = WebSocketServer::after_handshake(TokioIo::new(websocket)); + let conn_gauge = Metrics::get() .proxy .client_connections @@ -177,15 +173,16 @@ pub async fn serve_websocket( mod tests { use std::pin::pin; + use framed_websockets::WebSocketServer; use futures::{SinkExt, StreamExt}; - use hyper_tungstenite::{ - tungstenite::{protocol::Role, Message}, - WebSocketStream, - }; use tokio::{ io::{duplex, AsyncReadExt, AsyncWriteExt}, task::JoinSet, }; + use tokio_tungstenite::{ + tungstenite::{protocol::Role, Message}, + WebSocketStream, + }; use super::WebSocketRw; @@ -210,9 +207,7 @@ mod tests { }); js.spawn(async move { - let mut rw = pin!(WebSocketRw::new( - WebSocketStream::from_raw_socket(stream2, Role::Server, None).await - )); + let mut rw = pin!(WebSocketRw::new(WebSocketServer::after_handshake(stream2))); let mut buf = vec![0; 1024]; let n = rw.read(&mut buf).await.unwrap(); diff --git a/test_runner/regress/test_proxy_websockets.py b/test_runner/regress/test_proxy_websockets.py index 6d1cb9765a..6211446a40 100644 --- a/test_runner/regress/test_proxy_websockets.py +++ b/test_runner/regress/test_proxy_websockets.py @@ -135,7 +135,14 @@ async def test_websockets_pipelined(static_proxy: NeonProxy): query_message = "SELECT 1".encode("utf-8") + b"\0" length2 = (4 + len(query_message)).to_bytes(4, byteorder="big") await websocket.send( - [length0, startup_message, b"p", length1, auth_message, b"Q", length2, query_message] + length0 + + startup_message + + b"p" + + length1 + + auth_message + + b"Q" + + length2 + + query_message ) startup_response = await websocket.recv() From ec069dc45ec6c0ef9b500dc0f433a0415b7e26db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 16 May 2024 16:48:49 +0200 Subject: [PATCH 0794/1571] tiered compaction: introduce PAGE_SZ constant and use it (#7785) pointed out by @problame : we use the literal 8192 instead of a properly defined constant. replace the literal by a PAGE_SZ constant. 
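
As a quick sanity check of the unit conversions in the diff below, here is a back-of-envelope sketch (illustrative only, not code from this patch) that assumes the 8 KiB page size `PAGE_SZ` encodes:

```
PAGE_SZ = 8192  # 8 KiB pages, matching the new constant in helpers.rs

def logical_size_mb_to_key_range_len(logical_size_mb: int) -> int:
    # mirrors `(cmd.logical_size * 1024 * 1024) / PAGE_SZ` in the simulator CLI
    return (logical_size_mb * 1024 * 1024) // PAGE_SZ

def keyspace_pages_to_bytes(num_pages: int) -> int:
    # mirrors `keyspace_total_size(...) * PAGE_SZ` in compact_tiered.rs
    return num_pages * PAGE_SZ

assert logical_size_mb_to_key_range_len(1) == 128   # 1 MiB of logical size is 128 pages
assert keyspace_pages_to_bytes(128) == 1024 * 1024  # and back to bytes
```
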
--- pageserver/compaction/src/bin/compaction-simulator.rs | 3 ++- pageserver/compaction/src/compact_tiered.rs | 6 +++--- pageserver/compaction/src/helpers.rs | 2 ++ pageserver/compaction/src/simulator.rs | 3 ++- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/pageserver/compaction/src/bin/compaction-simulator.rs b/pageserver/compaction/src/bin/compaction-simulator.rs index 1fd69407d3..c308694ae1 100644 --- a/pageserver/compaction/src/bin/compaction-simulator.rs +++ b/pageserver/compaction/src/bin/compaction-simulator.rs @@ -1,4 +1,5 @@ use clap::{Parser, Subcommand}; +use pageserver_compaction::helpers::PAGE_SZ; use pageserver_compaction::simulator::MockTimeline; use rand::Rng; use std::io::Write; @@ -51,7 +52,7 @@ async fn simulate(cmd: &SimulateCmd, results_path: &Path) -> anyhow::Result<()> let mut executor = MockTimeline::new(); // Convert the logical size in MB into a key range. - let key_range = 0..((cmd.logical_size * 1024 * 1024) / 8192); + let key_range = 0..((cmd.logical_size * 1024 * 1024) / PAGE_SZ); //let key_range = u64::MIN..u64::MAX; println!( "starting simulation with key range {:016X}-{:016X}", diff --git a/pageserver/compaction/src/compact_tiered.rs b/pageserver/compaction/src/compact_tiered.rs index a8f184af24..33c9948f45 100644 --- a/pageserver/compaction/src/compact_tiered.rs +++ b/pageserver/compaction/src/compact_tiered.rs @@ -25,7 +25,7 @@ use std::collections::{HashSet, VecDeque}; use std::ops::Range; use crate::helpers::{ - accum_key_values, keyspace_total_size, merge_delta_keys_buffered, overlaps_with, + accum_key_values, keyspace_total_size, merge_delta_keys_buffered, overlaps_with, PAGE_SZ, }; use crate::interface::*; use utils::lsn::Lsn; @@ -379,7 +379,7 @@ where .get_keyspace(&job.key_range, job.lsn_range.end, ctx) .await?, &self.shard_identity, - ) * 8192; + ) * PAGE_SZ; let wal_size = job .input_layers @@ -441,7 +441,7 @@ where let mut window = KeyspaceWindow::new( E::Key::MIN..E::Key::MAX, keyspace, - self.target_file_size / 8192, + self.target_file_size / PAGE_SZ, ); while let Some(key_range) = window.choose_next_image(&self.shard_identity) { new_jobs.push(CompactionJob:: { diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs index 2c922b0a49..8ed1d16082 100644 --- a/pageserver/compaction/src/helpers.rs +++ b/pageserver/compaction/src/helpers.rs @@ -16,6 +16,8 @@ use std::pin::Pin; use std::task::{ready, Poll}; use utils::lsn::Lsn; +pub const PAGE_SZ: u64 = 8192; + pub fn keyspace_total_size( keyspace: &CompactionKeySpace, shard_identity: &ShardIdentity, diff --git a/pageserver/compaction/src/simulator.rs b/pageserver/compaction/src/simulator.rs index 3543df64fa..a7c8bd5c1f 100644 --- a/pageserver/compaction/src/simulator.rs +++ b/pageserver/compaction/src/simulator.rs @@ -14,6 +14,7 @@ use std::ops::Range; use std::sync::Arc; use std::sync::Mutex; +use crate::helpers::PAGE_SZ; use crate::helpers::{merge_delta_keys, overlaps_with}; use crate::interface; @@ -509,7 +510,7 @@ impl interface::CompactionJobExecutor for MockTimeline { let new_layer = Arc::new(MockImageLayer { key_range: key_range.clone(), lsn_range: lsn..lsn, - file_size: accum_size * 8192, + file_size: accum_size * PAGE_SZ, deleted: Mutex::new(false), }); info!( From 4c5afb7b1000768b4ec6dd3db362c1159a189aaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 16 May 2024 19:35:13 +0200 Subject: [PATCH 0795/1571] Remove SSO_ACCOUNT_ID from scrubber docs and BucketConfig (#7774) As of #6202 we support `AWS_PROFILE` as well, 
which is more convenient. Change the docs to using it instead of `SSO_ACCOUNT_ID`. Also, remove `SSO_ACCOUNT_ID` from BucketConfig as it is confusing to the code's reader: it's not the "main" way of setting up authentication for the scrubber any more. It is a breaking change for the on-disk format as we persist `sso_account_id` to disk, but it was quite inconsistent with the other methods which are not persistet. Also, I don't think we want to support the case where one version writes the json and another version reads it. Related: #7667 --- s3_scrubber/README.md | 12 +++++++----- s3_scrubber/src/lib.rs | 24 ++++-------------------- 2 files changed, 11 insertions(+), 25 deletions(-) diff --git a/s3_scrubber/README.md b/s3_scrubber/README.md index c1deab8852..8a96542ada 100644 --- a/s3_scrubber/README.md +++ b/s3_scrubber/README.md @@ -9,11 +9,13 @@ and `safekeeper`, and does housekeeping such as cleaning up objects for tenants #### S3 -Do `aws sso login --profile dev` to get the SSO access to the bucket to clean, get the SSO_ACCOUNT_ID for your profile (`cat ~/.aws/config` may help). +Do `aws sso login --profile dev` to get the SSO access to the bucket to clean. +Also, set the following environment variables: -- `SSO_ACCOUNT_ID`: Credentials id to use for accessing S3 buckets +- `AWS_PROFILE`: Profile name to use for accessing S3 buckets (e.g. `dev`) - `REGION`: A region where the bucket is located at. - `BUCKET`: Bucket name +- `BUCKET_PREFIX` (optional): Prefix inside the bucket #### Console API @@ -43,7 +45,7 @@ processing by the `purge-garbage` subcommand. Example: -`env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- find-garbage --node-kind=pageserver --depth=tenant --output-path=eu-west-1-garbage.json` +`env AWS_PROFILE=dev REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- find-garbage --node-kind=pageserver --depth=tenant --output-path=eu-west-1-garbage.json` #### `purge-garbage` @@ -59,7 +61,7 @@ to pass them on the command line Example: -`env SSO_ACCOUNT_ID=123456 cargo run --release -- purge-garbage --node-kind=pageserver --depth=tenant --input-path=eu-west-1-garbage.json` +`env AWS_PROFILE=dev cargo run --release -- purge-garbage --node-kind=pageserver --depth=tenant --input-path=eu-west-1-garbage.json` Add the `--delete` argument before `purge-garbage` to enable deletion. This is intentionally not provided inline in the example above to avoid accidents. Without the `--delete` flag @@ -72,7 +74,7 @@ Errors are logged to stderr and summary to stdout. 
For pageserver: ``` -env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata --node-kind pageserver +env AWS_PROFILE=dev REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata --node-kind pageserver Timelines: 31106 With errors: 3 diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs index 7966fb6a88..e0f99ecd9c 100644 --- a/s3_scrubber/src/lib.rs +++ b/s3_scrubber/src/lib.rs @@ -200,30 +200,15 @@ impl RootTarget { } #[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] pub struct BucketConfig { pub region: String, pub bucket: String, pub prefix_in_bucket: Option, - - /// Use SSO if this is set, else rely on AWS_* environment vars - pub sso_account_id: Option, -} - -impl Display for BucketConfig { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "{}/{}/{}", - self.sso_account_id.as_deref().unwrap_or(""), - self.region, - self.bucket - ) - } } impl BucketConfig { pub fn from_env() -> anyhow::Result { - let sso_account_id = env::var("SSO_ACCOUNT_ID").ok(); let region = env::var("REGION").context("'REGION' param retrieval")?; let bucket = env::var("BUCKET").context("'BUCKET' param retrieval")?; let prefix_in_bucket = env::var("BUCKET_PREFIX").ok(); @@ -232,7 +217,6 @@ impl BucketConfig { region, bucket, prefix_in_bucket, - sso_account_id, }) } } @@ -276,7 +260,7 @@ pub fn init_logging(file_name: &str) -> WorkerGuard { guard } -pub fn init_s3_client(account_id: Option, bucket_region: Region) -> Client { +pub fn init_s3_client(bucket_region: Region) -> Client { let credentials_provider = { // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" let chain = CredentialsProviderChain::first_try( @@ -290,7 +274,7 @@ pub fn init_s3_client(account_id: Option, bucket_region: Region) -> Clie ); // Use SSO if we were given an account ID - match account_id { + match std::env::var("SSO_ACCOUNT_ID").ok() { Some(sso_account) => chain.or_else( "sso", SsoCredentialsProvider::builder() @@ -334,7 +318,7 @@ fn init_remote( ) -> anyhow::Result<(Arc, RootTarget)> { let bucket_region = Region::new(bucket_config.region); let delimiter = "/".to_string(); - let s3_client = Arc::new(init_s3_client(bucket_config.sso_account_id, bucket_region)); + let s3_client = Arc::new(init_s3_client(bucket_region)); let s3_root = match node_kind { NodeKind::Pageserver => RootTarget::Pageserver(S3Target { From 4b8809b280b04c92d0c9e2cfb21cbc230de7995b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 16 May 2024 22:25:19 +0200 Subject: [PATCH 0796/1571] Tiered compaction: improvements to the windows (#7787) Tiered compaction employs two sliding windows over the keyspace: `KeyspaceWindow` for the image layer generation and `Window` for the delta layer generation. Do some fixes to both windows: * The distinction between the two windows is not very clear. Do the absolute minimum to mention where they are used in the rustdoc description of the struct. Maybe we should rename them (say `WindowForImage` and `WindowForDelta`) or merge them into one window implementation. * Require the keys to strictly increase. The `accum_key_values` already combines the key, so there is no logic needed in `Window::feed` for the same key repeating. 
This is a follow-up to address the request in https://github.com/neondatabase/neon/pull/7671#pullrequestreview-2051995541 * In `choose_next_delta`, we claimed in the comment to use 1.25 as the factor but it was 1.66 instead. Fix this discrepancy by using `*5/4` as the two operations. --- pageserver/compaction/src/compact_tiered.rs | 24 +++++++++++---------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/pageserver/compaction/src/compact_tiered.rs b/pageserver/compaction/src/compact_tiered.rs index 33c9948f45..20f88868f9 100644 --- a/pageserver/compaction/src/compact_tiered.rs +++ b/pageserver/compaction/src/compact_tiered.rs @@ -663,8 +663,8 @@ where } } -// Sliding window through keyspace and values -// This is used by over_with_images to decide on good split points +/// Sliding window through keyspace and values for image layer +/// This is used by [`LevelCompactionState::cover_with_images`] to decide on good split points struct KeyspaceWindow { head: KeyspaceWindowHead, @@ -804,9 +804,9 @@ struct WindowElement { accum_size: u64, } -// Sliding window through keyspace and values -// -// This is used to decide what layer to write next, from the beginning of the window. +/// Sliding window through keyspace and values for delta layer tiling +/// +/// This is used to decide which delta layer to write next. struct Window { elems: VecDeque>, @@ -830,11 +830,13 @@ where fn feed(&mut self, key: K, size: u64) { let last_size; if let Some(last) = self.elems.back_mut() { - assert!(last.last_key <= key); - if key == last.last_key { - last.accum_size += size; - return; - } + // We require the keys to be strictly increasing for the window. + // Keys should already have been deduplicated by `accum_key_values` + assert!( + last.last_key < key, + "last_key(={}) >= key(={key})", + last.last_key + ); last_size = last.accum_size; } else { last_size = 0; @@ -922,7 +924,7 @@ where // If we're willing to stretch it up to 1.25 target size, could we // gobble up the rest of the work? This avoids creating very small // "tail" layers at the end of the keyspace - if !has_more && self.remain_size() < target_size * 5 / 3 { + if !has_more && self.remain_size() < target_size * 5 / 4 { self.commit_upto(self.elems.len()); } else { let delta_split_at = self.find_size_split(target_size); From 6d951e69d636dc1fbfda2bc0282547fcae19ec81 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 17 May 2024 12:24:02 +0200 Subject: [PATCH 0797/1571] test_suite: patch, don't replace, the `tenant_config` field, where appropriate (#7771) Before this PR, the changed tests would overwrite the entire `tenant_config` because `pageserver_config_override` is merged non-recursively into the `ps_cfg`. This meant they would override the `PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM`, impacting our matrix build for `compaction_algorithm=Tiered|Legacy` in https://github.com/neondatabase/neon/pull/7748. I found the tests fixed in this PR using the `NEON_PAGESERVER_PANIC_ON_UNSPECIFIED_COMPACTION_ALGORITHM` env var that I added in #7748. Therefore, I think this is an exhaustive fix. This is better than just searching the code base for `tenant_config`, which is what I had sketched in #7747. 
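
To make the failure mode concrete, the sketch below (illustrative only; the default values are invented) shows why patching `tenant_config` through a callable, the pattern the diffs below switch to, preserves matrix-provided defaults while a wholesale override does not:

```
# Illustrative only: why patching beats replacing for the nested `tenant_config` table.
# `defaults` stands in for whatever the fixture has already put into tenant_config
# (for example a compaction_algorithm chosen by the test matrix); the values are invented.
defaults = {"tenant_config": {"compaction_algorithm": {"kind": "Tiered"}}}

# Old style: the string override is merged non-recursively, so the whole table is replaced.
replaced = dict(defaults)
replaced["tenant_config"] = {"pitr_interval": "0 sec"}  # the default algorithm is lost

# New style: a callable patches the existing table in place and keeps the other keys.
def customize(ps_cfg):
    ps_cfg.setdefault("tenant_config", {})["pitr_interval"] = "0 sec"

patched = {"tenant_config": dict(defaults["tenant_config"])}
customize(patched)

assert "compaction_algorithm" not in replaced["tenant_config"]
assert patched["tenant_config"]["compaction_algorithm"] == {"kind": "Tiered"}
```
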
refs #7749 --- test_runner/fixtures/neon_fixtures.py | 14 +++++++---- test_runner/regress/test_branch_behind.py | 3 +-- test_runner/regress/test_gc_aggressive.py | 9 +++---- test_runner/regress/test_old_request_lsn.py | 3 +-- test_runner/regress/test_pageserver_api.py | 2 ++ test_runner/regress/test_pitr_gc.py | 6 ++--- test_runner/regress/test_recovery.py | 8 +++--- test_runner/regress/test_tenant_conf.py | 27 +++++++++++++-------- test_runner/regress/test_tenant_size.py | 11 ++++++--- test_runner/regress/test_timeline_size.py | 20 +++++++++------ test_runner/regress/test_wal_receiver.py | 13 +++++++--- 11 files changed, 70 insertions(+), 46 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b4761f103b..23f30804b4 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -455,7 +455,7 @@ class NeonEnvBuilder: test_overlay_dir: Optional[Path] = None, pageserver_remote_storage: Optional[RemoteStorage] = None, # toml that will be decomposed into `--config-override` flags during `pageserver --init` - pageserver_config_override: Optional[str] = None, + pageserver_config_override: Optional[str | Callable[[Dict[str, Any]], None]] = None, num_safekeepers: int = 1, num_pageservers: int = 1, # Use non-standard SK ids to check for various parsing bugs @@ -1127,10 +1127,14 @@ class NeonEnv: ) if config.pageserver_config_override is not None: - for o in config.pageserver_config_override.split(";"): - override = toml.loads(o) - for key, value in override.items(): - ps_cfg[key] = value + if callable(config.pageserver_config_override): + config.pageserver_config_override(ps_cfg) + else: + assert isinstance(config.pageserver_config_override, str) + for o in config.pageserver_config_override.split(";"): + override = toml.loads(o) + for key, value in override.items(): + ps_cfg[key] = value # Create a corresponding NeonPageserver object self.pageservers.append( diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py index ac2fc79be4..0a5336f5a2 100644 --- a/test_runner/regress/test_branch_behind.py +++ b/test_runner/regress/test_branch_behind.py @@ -11,8 +11,7 @@ from fixtures.utils import print_gc_result, query_scalar # def test_branch_behind(neon_env_builder: NeonEnvBuilder): # Disable pitr, because here we want to test branch creation after GC - neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" - env = neon_env_builder.init_start() + env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"}) error_regexes = [ ".*invalid branch start lsn.*", diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py index e5067bba8b..44133f2350 100644 --- a/test_runner/regress/test_gc_aggressive.py +++ b/test_runner/regress/test_gc_aggressive.py @@ -67,8 +67,7 @@ async def update_and_gc(env: NeonEnv, endpoint: Endpoint, timeline: TimelineId): # def test_gc_aggressive(neon_env_builder: NeonEnvBuilder): # Disable pitr, because here we want to test branch creation after GC - neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" - env = neon_env_builder.init_start() + env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"}) timeline = env.neon_cli.create_branch("test_gc_aggressive", "main") endpoint = env.endpoints.create_start("test_gc_aggressive") @@ -94,13 +93,11 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder): # 
def test_gc_index_upload(neon_env_builder: NeonEnvBuilder): - # Disable time-based pitr, we will use LSN-based thresholds in the manual GC calls - neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" num_index_uploads = 0 neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - - env = neon_env_builder.init_start() + # Disable time-based pitr, we will use LSN-based thresholds in the manual GC calls + env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"}) tenant_id = env.initial_tenant timeline_id = env.neon_cli.create_branch("test_gc_index_upload", "main") endpoint = env.endpoints.create_start("test_gc_index_upload") diff --git a/test_runner/regress/test_old_request_lsn.py b/test_runner/regress/test_old_request_lsn.py index 43b0bb56f0..f1dd3fb67d 100644 --- a/test_runner/regress/test_old_request_lsn.py +++ b/test_runner/regress/test_old_request_lsn.py @@ -16,8 +16,7 @@ from fixtures.utils import print_gc_result, query_scalar # def test_old_request_lsn(neon_env_builder: NeonEnvBuilder): # Disable pitr, because here we want to test branch creation after GC - neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" - env = neon_env_builder.init_start() + env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"}) env.neon_cli.create_branch("test_old_request_lsn", "main") endpoint = env.endpoints.create_start("test_old_request_lsn") diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index 80a1d72f4a..abbea59113 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -42,6 +42,8 @@ def test_pageserver_init_node_id(neon_simple_env: NeonEnv, neon_binpath: Path): "listen_http_addr", "pg_auth_type", "http_auth_type", + # TODO: only needed for NEON_PAGESERVER_PANIC_ON_UNSPECIFIED_COMPACTION_ALGORITHM in https://github.com/neondatabase/neon/pull/7748 + # "tenant_config", ] required_config_overrides = [ f"--config-override={toml.dumps({k: ps_config[k]})}" for k in required_config_keys diff --git a/test_runner/regress/test_pitr_gc.py b/test_runner/regress/test_pitr_gc.py index 6434f431a4..7e676b5515 100644 --- a/test_runner/regress/test_pitr_gc.py +++ b/test_runner/regress/test_pitr_gc.py @@ -10,11 +10,9 @@ from fixtures.utils import print_gc_result, query_scalar # def test_pitr_gc(neon_env_builder: NeonEnvBuilder): # Set pitr interval such that we need to keep the data - neon_env_builder.pageserver_config_override = ( - "tenant_config={pitr_interval = '1 day', gc_horizon = 0}" + env = neon_env_builder.init_start( + initial_tenant_conf={"pitr_interval": "1 day", "gc_horizon": "0"} ) - - env = neon_env_builder.init_start() endpoint_main = env.endpoints.create_start("main") main_pg_conn = endpoint_main.connect() diff --git a/test_runner/regress/test_recovery.py b/test_runner/regress/test_recovery.py index ab5c8be256..e21f9bb6f6 100644 --- a/test_runner/regress/test_recovery.py +++ b/test_runner/regress/test_recovery.py @@ -10,9 +10,11 @@ from fixtures.neon_fixtures import NeonEnvBuilder # def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder): # Override default checkpointer settings to run it more often - neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance = 1048576}" - - env = neon_env_builder.init_start() + env = neon_env_builder.init_start( + initial_tenant_conf={ + "checkpoint_distance": "1048576", + } + ) 
env.pageserver.is_testing_enabled_or_skip() # We expect the pageserver to exit, which will cause storage storage controller diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index a345464208..2cbb036c0d 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -1,5 +1,6 @@ import json from contextlib import closing +from typing import Any, Dict import psycopg2.extras from fixtures.common_types import Lsn @@ -14,16 +15,22 @@ from fixtures.utils import wait_until def test_tenant_config(neon_env_builder: NeonEnvBuilder): """Test per tenant configuration""" - # set some non-default global config - neon_env_builder.pageserver_config_override = """ -page_cache_size=444; -wait_lsn_timeout='111 s'; -[tenant_config] -checkpoint_distance = 10000 -compaction_target_size = 1048576 -evictions_low_residence_duration_metric_threshold = "2 days" -eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold = "23 hours" } -""" + + def set_some_nondefault_global_config(ps_cfg: Dict[str, Any]): + ps_cfg["page_cache_size"] = 444 + ps_cfg["wait_lsn_timeout"] = "111 s" + + tenant_config = ps_cfg.setdefault("tenant_config", {}) + tenant_config["checkpoint_distance"] = 10000 + tenant_config["compaction_target_size"] = 1048576 + tenant_config["evictions_low_residence_duration_metric_threshold"] = "2 days" + tenant_config["eviction_policy"] = { + "kind": "LayerAccessThreshold", + "period": "20s", + "threshold": "23 hours", + } + + neon_env_builder.pageserver_config_override = set_some_nondefault_global_config env = neon_env_builder.init_start() # we configure eviction but no remote storage, there might be error lines diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 7894f6933d..d3a228dbeb 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -502,9 +502,14 @@ def test_get_tenant_size_with_multiple_branches( gc_horizon = 128 * 1024 - neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='0s', gc_period='0s', pitr_interval='0sec', gc_horizon={gc_horizon}}}" - - env = neon_env_builder.init_start() + env = neon_env_builder.init_start( + initial_tenant_conf={ + "compaction_period": "0s", + "gc_period": "0s", + "pitr_interval": "0sec", + "gc_horizon": gc_horizon, + } + ) # FIXME: we have a race condition between GC and delete timeline. GC might fail with this # error. 
Similar to https://github.com/neondatabase/neon/issues/2671 diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 18063bf104..a6d06df3b6 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -415,11 +415,12 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder # Disable background compaction as we don't want it to happen after `get_physical_size` request # and before checking the expected size on disk, which makes the assertion failed - neon_env_builder.pageserver_config_override = ( - "tenant_config={checkpoint_distance=100000, compaction_period='10m'}" + env = neon_env_builder.init_start( + initial_tenant_conf={ + "checkpoint_distance": "100000", + "compaction_period": "10m", + } ) - - env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_compaction") @@ -462,9 +463,14 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): # Disable background compaction and GC as we don't want it to happen after `get_physical_size` request # and before checking the expected size on disk, which makes the assertion failed - neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance=100000, compaction_period='0s', gc_period='0s', pitr_interval='1s'}" - - env = neon_env_builder.init_start() + env = neon_env_builder.init_start( + initial_tenant_conf={ + "checkpoint_distance": "100000", + "compaction_period": "0s", + "gc_period": "0s", + "pitr_interval": "1s", + } + ) pageserver_http = env.pageserver.http_client() new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_gc") diff --git a/test_runner/regress/test_wal_receiver.py b/test_runner/regress/test_wal_receiver.py index d9265dcbcd..6582b34218 100644 --- a/test_runner/regress/test_wal_receiver.py +++ b/test_runner/regress/test_wal_receiver.py @@ -1,4 +1,5 @@ import time +from typing import Any, Dict from fixtures.common_types import Lsn, TenantId from fixtures.log_helper import log @@ -42,10 +43,14 @@ def test_pageserver_lsn_wait_error_start(neon_env_builder: NeonEnvBuilder): # Kills one of the safekeepers and ensures that only the active ones are printed in the state. def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuilder): # Trigger WAL wait timeout faster - neon_env_builder.pageserver_config_override = """ - wait_lsn_timeout = "1s" - tenant_config={walreceiver_connect_timeout = "2s", lagging_wal_timeout = "2s"} - """ + def customize_pageserver_toml(ps_cfg: Dict[str, Any]): + ps_cfg["wait_lsn_timeout"] = "1s" + tenant_config = ps_cfg.setdefault("tenant_config", {}) + tenant_config["walreceiver_connect_timeout"] = "2s" + tenant_config["lagging_wal_timeout"] = "2s" + + neon_env_builder.pageserver_config_override = customize_pageserver_toml + # Have notable SK ids to ensure we check logs for their presence, not some other random numbers neon_env_builder.safekeepers_id_start = 12345 neon_env_builder.num_safekeepers = 3 From c1390bfc3bbd3a7f00df334a39220ca312fc888e Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 17 May 2024 13:25:01 +0300 Subject: [PATCH 0798/1571] chore: update defaults for timeline_detach_ancestor (#7779) by having 100 copy operations in flight twe climb up to 2500 requests per min or 41/s. This is still probably less than is allowed, but fast enough for our purposes. 
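
The throughput figure can be sanity-checked with a small back-of-envelope calculation (illustrative only; the per-copy latency it implies is an inference, not a measured value):

```
# Back-of-envelope check of the numbers in the commit message (illustrative only).
copies_in_flight = 100
requests_per_minute = 2500

requests_per_second = requests_per_minute / 60                         # ~41.7/s, the "41/s" above
implied_mean_copy_latency_s = copies_in_flight / requests_per_second   # ~2.4 s per copy (inferred)
print(round(requests_per_second, 1), round(implied_mean_copy_latency_s, 1))
```
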
--- pageserver/src/tenant/timeline/detach_ancestor.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 7f59758c87..4d8e570181 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -56,7 +56,7 @@ impl Default for Options { fn default() -> Self { Self { rewrite_concurrency: std::num::NonZeroUsize::new(2).unwrap(), - copy_concurrency: std::num::NonZeroUsize::new(10).unwrap(), + copy_concurrency: std::num::NonZeroUsize::new(100).unwrap(), } } } From a8e6d259cb49d1bf156dfc2215b92c04d1e8a08f Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 17 May 2024 13:24:03 +0100 Subject: [PATCH 0799/1571] pageserver: fixes for layer path changes (#7786) ## Problem - When a layer with legacy local path format is evicted and then re-downloaded, a panic happened because the path downloaded by remote storage didn't match the path stored in Layer. - While investigating, I also realized that secondary locations would have a similar issue with evictions. Closes: #7783 ## Summary of changes - Make remote timeline client take local paths as an input: it should not have its own ideas about local paths, instead it just uses the layer path that the Layer has. - Make secondary state store an explicit local path, populated on scan of local disk at startup. This provides the same behavior as for Layer, that our local_layer_path is a _default_, but the layer path can actually be anything (e.g. an old style one). - Add tests for both cases. --- pageserver/src/disk_usage_eviction_task.rs | 8 +- .../src/tenant/remote_timeline_client.rs | 2 + .../tenant/remote_timeline_client/download.rs | 11 +-- pageserver/src/tenant/secondary.rs | 54 +++--------- pageserver/src/tenant/secondary/downloader.rs | 47 ++++++++--- pageserver/src/tenant/storage_layer/layer.rs | 1 + .../regress/test_pageserver_generations.py | 84 +++++++++++++++---- 7 files changed, 119 insertions(+), 88 deletions(-) diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index ebeb8bbb20..7f25e49570 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -535,17 +535,11 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( } EvictionLayer::Secondary(layer) => { let file_size = layer.metadata.file_size(); - let tenant_manager = tenant_manager.clone(); js.spawn(async move { layer .secondary_tenant - .evict_layer( - tenant_manager.get_conf(), - layer.timeline_id, - layer.name, - layer.metadata, - ) + .evict_layer(layer.timeline_id, layer.name) .await; Ok(file_size) }); diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index c5462dac43..07d6af696c 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -518,6 +518,7 @@ impl RemoteTimelineClient { &self, layer_file_name: &LayerName, layer_metadata: &LayerFileMetadata, + local_path: &Utf8Path, cancel: &CancellationToken, ctx: &RequestContext, ) -> anyhow::Result { @@ -536,6 +537,7 @@ impl RemoteTimelineClient { self.timeline_id, layer_file_name, layer_metadata, + local_path, cancel, ctx, ) diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index f3c9e64533..70c5cae05e 100644 --- 
a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -21,7 +21,6 @@ use crate::config::PageServerConf; use crate::context::RequestContext; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; -use crate::tenant::storage_layer::layer::local_layer_path; use crate::tenant::storage_layer::LayerName; use crate::tenant::Generation; use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}; @@ -50,19 +49,13 @@ pub async fn download_layer_file<'a>( timeline_id: TimelineId, layer_file_name: &'a LayerName, layer_metadata: &'a LayerFileMetadata, + local_path: &Utf8Path, cancel: &CancellationToken, ctx: &RequestContext, ) -> Result { debug_assert_current_span_has_tenant_and_timeline_id(); let timeline_path = conf.timeline_path(&tenant_shard_id, &timeline_id); - let local_path = local_layer_path( - conf, - &tenant_shard_id, - &timeline_id, - layer_file_name, - &layer_metadata.generation, - ); let remote_path = remote_layer_path( &tenant_shard_id.tenant_id, @@ -82,7 +75,7 @@ pub async fn download_layer_file<'a>( // For more context about durable_rename check this email from postgres mailing list: // https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com // If pageserver crashes the temp file will be deleted on startup and re-downloaded. - let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION); + let temp_file_path = path_with_suffix_extension(local_path, TEMP_DOWNLOAD_EXTENSION); let bytes_amount = download_retry( || async { download_object(storage, &remote_path, &temp_file_path, cancel, ctx).await }, diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index 7075044baf..252b6eb11b 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -6,11 +6,9 @@ mod scheduler; use std::{sync::Arc, time::SystemTime}; use crate::{ - config::PageServerConf, context::RequestContext, disk_usage_eviction_task::DiskUsageEvictionInfo, task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, - virtual_file::MaybeFatalIo, }; use self::{ @@ -21,9 +19,8 @@ use self::{ use super::{ config::{SecondaryLocationConfig, TenantConfOpt}, mgr::TenantManager, - remote_timeline_client::LayerFileMetadata, span::debug_assert_current_span_has_tenant_id, - storage_layer::{layer::local_layer_path, LayerName}, + storage_layer::LayerName, }; use pageserver_api::{ @@ -178,13 +175,7 @@ impl SecondaryTenant { /// Cancellation safe, but on cancellation the eviction will go through #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline_id, name=%name))] - pub(crate) async fn evict_layer( - self: &Arc, - conf: &PageServerConf, - timeline_id: TimelineId, - name: LayerName, - metadata: LayerFileMetadata, - ) { + pub(crate) async fn evict_layer(self: &Arc, timeline_id: TimelineId, name: LayerName) { debug_assert_current_span_has_tenant_id(); let guard = match self.gate.enter() { @@ -197,41 +188,11 @@ impl SecondaryTenant { let now = SystemTime::now(); - let local_path = local_layer_path( - conf, - &self.tenant_shard_id, - &timeline_id, - &name, - &metadata.generation, - ); - let this = self.clone(); // spawn it to be cancellation safe tokio::task::spawn_blocking(move || { let _guard = guard; - // We tolerate ENOENT, because between planning eviction and executing - // it, the 
secondary downloader could have seen an updated heatmap that - // resulted in a layer being deleted. - // Other local I/O errors are process-fatal: these should never happen. - let deleted = std::fs::remove_file(local_path); - - let not_found = deleted - .as_ref() - .is_err_and(|x| x.kind() == std::io::ErrorKind::NotFound); - - let deleted = if not_found { - false - } else { - deleted - .map(|()| true) - .fatal_err("Deleting layer during eviction") - }; - - if !deleted { - // skip updating accounting and putting perhaps later timestamp - return; - } // Update the timeline's state. This does not have to be synchronized with // the download process, because: @@ -250,8 +211,15 @@ impl SecondaryTenant { // of the cache. let mut detail = this.detail.lock().unwrap(); if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) { - timeline_detail.on_disk_layers.remove(&name); - timeline_detail.evicted_at.insert(name, now); + let removed = timeline_detail.on_disk_layers.remove(&name); + + // We might race with removal of the same layer during downloads, if it was removed + // from the heatmap. If we see that the OnDiskState is gone, then no need to + // do a physical deletion or store in evicted_at. + if let Some(removed) = removed { + removed.remove_blocking(); + timeline_detail.evicted_at.insert(name, now); + } } }) .await diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 46a3d7e81f..8f27220771 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -111,6 +111,7 @@ struct SecondaryDownloader { pub(super) struct OnDiskState { metadata: LayerFileMetadata, access_time: SystemTime, + local_path: Utf8PathBuf, } impl OnDiskState { @@ -121,12 +122,26 @@ impl OnDiskState { _ame: LayerName, metadata: LayerFileMetadata, access_time: SystemTime, + local_path: Utf8PathBuf, ) -> Self { Self { metadata, access_time, + local_path, } } + + // This is infallible, because all errors are either acceptable (ENOENT), or totally + // unexpected (fatal). + pub(super) fn remove_blocking(&self) { + // We tolerate ENOENT, because between planning eviction and executing + // it, the secondary downloader could have seen an updated heatmap that + // resulted in a layer being deleted. + // Other local I/O errors are process-fatal: these should never happen. + std::fs::remove_file(&self.local_path) + .or_else(fs_ext::ignore_not_found) + .fatal_err("Deleting secondary layer") + } } #[derive(Debug, Clone, Default)] @@ -816,20 +831,12 @@ impl<'a> TenantDownloader<'a> { if cfg!(debug_assertions) { // Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think // are already present on disk are really there. 
- let local_path = local_layer_path( - self.conf, - tenant_shard_id, - &timeline.timeline_id, - &layer.name, - &layer.metadata.generation, - ); - - match tokio::fs::metadata(&local_path).await { + match tokio::fs::metadata(&on_disk.local_path).await { Ok(meta) => { tracing::debug!( "Layer {} present at {}, size {}", layer.name, - local_path, + on_disk.local_path, meta.len(), ); } @@ -837,7 +844,7 @@ impl<'a> TenantDownloader<'a> { tracing::warn!( "Layer {} not found at {} ({})", layer.name, - local_path, + on_disk.local_path, e ); debug_assert!(false); @@ -926,6 +933,13 @@ impl<'a> TenantDownloader<'a> { v.get_mut().access_time = t.access_time; } Entry::Vacant(e) => { + let local_path = local_layer_path( + self.conf, + tenant_shard_id, + &timeline.timeline_id, + &t.name, + &t.metadata.generation, + ); e.insert(OnDiskState::new( self.conf, tenant_shard_id, @@ -933,6 +947,7 @@ impl<'a> TenantDownloader<'a> { t.name, LayerFileMetadata::from(&t.metadata), t.access_time, + local_path, )); } } @@ -955,6 +970,14 @@ impl<'a> TenantDownloader<'a> { &self.secondary_state.cancel ); + let local_path = local_layer_path( + self.conf, + tenant_shard_id, + timeline_id, + &layer.name, + &layer.metadata.generation, + ); + // Note: no backoff::retry wrapper here because download_layer_file does its own retries internally let downloaded_bytes = match download_layer_file( self.conf, @@ -963,6 +986,7 @@ impl<'a> TenantDownloader<'a> { *timeline_id, &layer.name, &LayerFileMetadata::from(&layer.metadata), + &local_path, &self.secondary_state.cancel, ctx, ) @@ -1116,6 +1140,7 @@ async fn init_timeline_state( name, LayerFileMetadata::from(&remote_meta.metadata), remote_meta.access_time, + file_path, ), ); } diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index e8c712c4c6..97b349f635 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1108,6 +1108,7 @@ impl LayerInner { .download_layer_file( &self.desc.layer_name(), &self.metadata(), + &self.path, &timeline.cancel, ctx, ) diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 9b97254410..0235cf6d20 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -25,6 +25,7 @@ from fixtures.neon_fixtures import ( S3Scrubber, generate_uploads_and_deletions, ) +from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import ( assert_tenant_state, @@ -632,39 +633,86 @@ def test_upgrade_generationless_local_file_paths( generation numbers: it should accept these layer files, and avoid doing a delete/download cycle on them. 
""" - env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) - tenant_id = env.initial_tenant - timeline_id = env.initial_timeline + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant( + tenant_id, timeline_id, conf=TENANT_CONF, placement_policy='{"Attached":1}' + ) workload = Workload(env, tenant_id, timeline_id) workload.init() workload.write_rows(1000) - env.pageserver.stop() + attached_pageserver = env.get_tenant_pageserver(tenant_id) + secondary_pageserver = list([ps for ps in env.pageservers if ps.id != attached_pageserver.id])[ + 0 + ] + + attached_pageserver.http_client().tenant_heatmap_upload(tenant_id) + secondary_pageserver.http_client().tenant_secondary_download(tenant_id) # Rename the local paths to legacy format, to simulate what - # we would see when upgrading - timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id) - files_renamed = 0 - for filename in os.listdir(timeline_dir): - path = os.path.join(timeline_dir, filename) - log.info(f"Found file {path}") - if path.endswith("-v1-00000001"): - new_path = path[:-12] - os.rename(path, new_path) - log.info(f"Renamed {path} -> {new_path}") - files_renamed += 1 + # we would see when upgrading. Do this on both attached and secondary locations, as we will + # test the behavior of both. + for pageserver in env.pageservers: + pageserver.stop() + timeline_dir = pageserver.timeline_dir(tenant_id, timeline_id) + files_renamed = 0 + for filename in os.listdir(timeline_dir): + path = os.path.join(timeline_dir, filename) + log.info(f"Found file {path}") + if path.endswith("-v1-00000001"): + new_path = path[:-12] + os.rename(path, new_path) + log.info(f"Renamed {path} -> {new_path}") + files_renamed += 1 - assert files_renamed > 0 + assert files_renamed > 0 - env.pageserver.start() + pageserver.start() workload.validate() # Assert that there were no on-demand downloads assert ( - env.pageserver.http_client().get_metric_value( + attached_pageserver.http_client().get_metric_value( "pageserver_remote_ondemand_downloaded_layers_total" ) == 0 ) + + # Do a secondary download and ensure there were no layer downloads + secondary_pageserver.http_client().tenant_secondary_download(tenant_id) + assert ( + secondary_pageserver.http_client().get_metric_value( + "pageserver_secondary_download_layer_total" + ) + == 0 + ) + + # Check that when we evict and promote one of the legacy-named layers, everything works as + # expected + local_layers = list( + ( + parse_layer_file_name(path.name), + os.path.join(attached_pageserver.timeline_dir(tenant_id, timeline_id), path), + ) + for path in attached_pageserver.list_layers(tenant_id, timeline_id) + ) + (victim_layer_name, victim_path) = local_layers[0] + assert os.path.exists(victim_path) + + attached_pageserver.http_client().evict_layer( + tenant_id, timeline_id, victim_layer_name.to_str() + ) + assert not os.path.exists(victim_path) + + attached_pageserver.http_client().download_layer( + tenant_id, timeline_id, victim_layer_name.to_str() + ) + # We should download into the same local path we started with + assert os.path.exists(victim_path) From af99c959ef460326b35716239c09b3a572c43b4c Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 17 May 2024 16:44:33 +0100 Subject: [PATCH 0800/1571] storage controller: use SERIALIZABLE isolation level (#7792) ## Problem The storage controller generally assumes that things like updating generation numbers 
are atomic: it should use a strict isolation level. ## Summary of changes - Wrap all database operations in a SERIALIZABLE transaction. - Retry serialization failures, as these do not indicate problems and are normal when plenty of concurrent work is happening. Using this isolation level for all reads is overkill, but much simpler than reasoning about it on a per-operation basis, and does not hurt performance. Tested this with a modified version of storage_controller_many_tenants test with 128k shards, to check that our performance is still fine: it is. --- storage_controller/src/persistence.rs | 230 ++++++++++++++------------ 1 file changed, 126 insertions(+), 104 deletions(-) diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index dca37166ba..67c05296d5 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -173,7 +173,7 @@ impl Persistence { /// Wraps `with_conn` in order to collect latency and error metrics async fn with_measured_conn(&self, op: DatabaseOperation, func: F) -> DatabaseResult where - F: FnOnce(&mut PgConnection) -> DatabaseResult + Send + 'static, + F: Fn(&mut PgConnection) -> DatabaseResult + Send + 'static, R: Send + 'static, { let latency = &METRICS_REGISTRY @@ -199,13 +199,48 @@ impl Persistence { /// Call the provided function in a tokio blocking thread, with a Diesel database connection. async fn with_conn(&self, func: F) -> DatabaseResult where - F: FnOnce(&mut PgConnection) -> DatabaseResult + Send + 'static, + F: Fn(&mut PgConnection) -> DatabaseResult + Send + 'static, R: Send + 'static, { + // A generous allowance for how many times we may retry serializable transactions + // before giving up. This is not expected to be hit: it is a defensive measure in case we + // somehow engineer a situation where duelling transactions might otherwise live-lock. + const MAX_RETRIES: usize = 128; + let mut conn = self.connection_pool.get()?; - tokio::task::spawn_blocking(move || -> DatabaseResult { func(&mut conn) }) - .await - .expect("Task panic") + tokio::task::spawn_blocking(move || -> DatabaseResult { + let mut retry_count = 0; + loop { + match conn.build_transaction().serializable().run(|c| func(c)) { + Ok(r) => break Ok(r), + Err( + err @ DatabaseError::Query(diesel::result::Error::DatabaseError( + diesel::result::DatabaseErrorKind::SerializationFailure, + _, + )), + ) => { + retry_count += 1; + if retry_count > MAX_RETRIES { + tracing::error!( + "Exceeded max retries on SerializationFailure errors: {err:?}" + ); + break Err(err); + } else { + // Retry on serialization errors: these are expected, because even though our + // transactions don't fight for the same rows, they will occasionally collide + // on index pages (e.g. 
increment_generation for unrelated shards can collide) + tracing::debug!( + "Retrying transaction on serialization failure {err:?}" + ); + continue; + } + } + Err(e) => break Err(e), + } + } + }) + .await + .expect("Task panic") } /// When a node is first registered, persist it before using it for anything @@ -358,14 +393,11 @@ impl Persistence { self.with_measured_conn( DatabaseOperation::InsertTenantShards, move |conn| -> DatabaseResult<()> { - conn.transaction(|conn| -> QueryResult<()> { - for tenant in &shards { - diesel::insert_into(tenant_shards) - .values(tenant) - .execute(conn)?; - } - Ok(()) - })?; + for tenant in &shards { + diesel::insert_into(tenant_shards) + .values(tenant) + .execute(conn)?; + } Ok(()) }, ) @@ -533,8 +565,11 @@ impl Persistence { let update = ShardUpdate { generation: input_generation.map(|g| g.into().unwrap() as i32), placement_policy: input_placement_policy + .as_ref() .map(|p| serde_json::to_string(&p).unwrap()), - config: input_config.map(|c| serde_json::to_string(&c).unwrap()), + config: input_config + .as_ref() + .map(|c| serde_json::to_string(&c).unwrap()), scheduling_policy: input_scheduling_policy .map(|p| serde_json::to_string(&p).unwrap()), }; @@ -581,55 +616,51 @@ impl Persistence { ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| -> DatabaseResult<()> { - conn.transaction(|conn| -> DatabaseResult<()> { - // Mark parent shards as splitting + // Mark parent shards as splitting - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(split_tenant_id.to_string())) - .filter(shard_count.eq(old_shard_count.literal() as i32)) - .set((splitting.eq(1),)) - .execute(conn)?; - if u8::try_from(updated) - .map_err(|_| DatabaseError::Logical( - format!("Overflow existing shard count {} while splitting", updated)) - )? != old_shard_count.count() { - // Perhaps a deletion or another split raced with this attempt to split, mutating - // the parent shards that we intend to split. In this case the split request should fail. - return Err(DatabaseError::Logical( - format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {})", old_shard_count.count()) - )); + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .filter(shard_count.eq(old_shard_count.literal() as i32)) + .set((splitting.eq(1),)) + .execute(conn)?; + if u8::try_from(updated) + .map_err(|_| DatabaseError::Logical( + format!("Overflow existing shard count {} while splitting", updated)) + )? != old_shard_count.count() { + // Perhaps a deletion or another split raced with this attempt to split, mutating + // the parent shards that we intend to split. In this case the split request should fail. 
+ return Err(DatabaseError::Logical( + format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {})", old_shard_count.count()) + )); + } + + // FIXME: spurious clone to sidestep closure move rules + let parent_to_children = parent_to_children.clone(); + + // Insert child shards + for (parent_shard_id, children) in parent_to_children { + let mut parent = crate::schema::tenant_shards::table + .filter(tenant_id.eq(parent_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(parent_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(parent_shard_id.shard_count.literal() as i32)) + .load::(conn)?; + let parent = if parent.len() != 1 { + return Err(DatabaseError::Logical(format!( + "Parent shard {parent_shard_id} not found" + ))); + } else { + parent.pop().unwrap() + }; + for mut shard in children { + // Carry the parent's generation into the child + shard.generation = parent.generation; + + debug_assert!(shard.splitting == SplitState::Splitting); + diesel::insert_into(tenant_shards) + .values(shard) + .execute(conn)?; } - - // FIXME: spurious clone to sidestep closure move rules - let parent_to_children = parent_to_children.clone(); - - // Insert child shards - for (parent_shard_id, children) in parent_to_children { - let mut parent = crate::schema::tenant_shards::table - .filter(tenant_id.eq(parent_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(parent_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(parent_shard_id.shard_count.literal() as i32)) - .load::(conn)?; - let parent = if parent.len() != 1 { - return Err(DatabaseError::Logical(format!( - "Parent shard {parent_shard_id} not found" - ))); - } else { - parent.pop().unwrap() - }; - for mut shard in children { - // Carry the parent's generation into the child - shard.generation = parent.generation; - - debug_assert!(shard.splitting == SplitState::Splitting); - diesel::insert_into(tenant_shards) - .values(shard) - .execute(conn)?; - } - } - - Ok(()) - })?; + } Ok(()) }) @@ -647,22 +678,18 @@ impl Persistence { self.with_measured_conn( DatabaseOperation::CompleteShardSplit, move |conn| -> DatabaseResult<()> { - conn.transaction(|conn| -> QueryResult<()> { - // Drop parent shards - diesel::delete(tenant_shards) - .filter(tenant_id.eq(split_tenant_id.to_string())) - .filter(shard_count.eq(old_shard_count.literal() as i32)) - .execute(conn)?; + // Drop parent shards + diesel::delete(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .filter(shard_count.eq(old_shard_count.literal() as i32)) + .execute(conn)?; - // Clear sharding flag - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(split_tenant_id.to_string())) - .set((splitting.eq(0),)) - .execute(conn)?; - debug_assert!(updated > 0); - - Ok(()) - })?; + // Clear sharding flag + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .set((splitting.eq(0),)) + .execute(conn)?; + debug_assert!(updated > 0); Ok(()) }, @@ -681,39 +708,34 @@ impl Persistence { self.with_measured_conn( DatabaseOperation::AbortShardSplit, move |conn| -> DatabaseResult { - let aborted = - conn.transaction(|conn| -> DatabaseResult { - // Clear the splitting state on parent shards - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(split_tenant_id.to_string())) - .filter(shard_count.ne(new_shard_count.literal() as i32)) - .set((splitting.eq(0),)) - .execute(conn)?; + // Clear the splitting state on parent shards + let updated = 
diesel::update(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .filter(shard_count.ne(new_shard_count.literal() as i32)) + .set((splitting.eq(0),)) + .execute(conn)?; - // Parent shards are already gone: we cannot abort. - if updated == 0 { - return Ok(AbortShardSplitStatus::Complete); - } + // Parent shards are already gone: we cannot abort. + if updated == 0 { + return Ok(AbortShardSplitStatus::Complete); + } - // Sanity check: if parent shards were present, their cardinality should - // be less than the number of child shards. - if updated >= new_shard_count.count() as usize { - return Err(DatabaseError::Logical(format!( - "Unexpected parent shard count {updated} while aborting split to \ + // Sanity check: if parent shards were present, their cardinality should + // be less than the number of child shards. + if updated >= new_shard_count.count() as usize { + return Err(DatabaseError::Logical(format!( + "Unexpected parent shard count {updated} while aborting split to \ count {new_shard_count:?} on tenant {split_tenant_id}" - ))); - } + ))); + } - // Erase child shards - diesel::delete(tenant_shards) - .filter(tenant_id.eq(split_tenant_id.to_string())) - .filter(shard_count.eq(new_shard_count.literal() as i32)) - .execute(conn)?; + // Erase child shards + diesel::delete(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .filter(shard_count.eq(new_shard_count.literal() as i32)) + .execute(conn)?; - Ok(AbortShardSplitStatus::Aborted) - })?; - - Ok(aborted) + Ok(AbortShardSplitStatus::Aborted) }, ) .await From c84656a53e92ca4628ffa8061e34102263576f43 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 17 May 2024 17:01:24 +0100 Subject: [PATCH 0801/1571] pageserver: implement auto-splitting (#7681) ## Problem Currently tenants are only split into multiple shards if a human being calls the API to do it. Issue: #7388 ## Summary of changes - Add a pageserver API for returning the top tenants by size - Add a step to the controller's background loop where if there is no reconciliation or optimization to be done, it looks for things to split. - Add a test that runs pgbench on many tenants concurrently, and checks that splitting happens as expected as tenants grow, without interrupting the client I/O. This PR is quite basic: there is a tasklist in https://github.com/neondatabase/neon/issues/7388 for further work. This PR is meant to be safe (off by default), and sufficient to enable our staging environment to run lots of sharded tenants without a human having to set them up. 
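As a rough, self-contained sketch of the selection rule (illustrative types only, not the real `TopTenantShardItem`/`ShardCount` models; the actual logic lives in `Service::autosplit_tenants` in the diff below, which gathers per-shard sizes from each pageserver's new `/v1/top_tenants` endpoint and additionally orders the final pick by resident size):

```rust
/// Simplified stand-in for the per-shard size report returned by the new
/// pageserver top-tenants API (hypothetical type, for illustration only).
#[derive(Debug)]
struct Candidate {
    tenant: String,
    shard_count: u8,
    max_logical_size: u64,
}

/// Pick at most one tenant to split per background pass: the largest tenant
/// that exceeds `split_threshold` and is not already at `max_shards`.
fn pick_split_candidate(
    candidates: Vec<Candidate>,
    split_threshold: u64,
    max_shards: u8,
) -> Option<Candidate> {
    candidates
        .into_iter()
        .filter(|c| c.shard_count < max_shards && c.max_logical_size > split_threshold)
        .max_by_key(|c| c.max_logical_size)
}

fn main() {
    let report = vec![
        Candidate { tenant: "a".into(), shard_count: 1, max_logical_size: 800 << 20 },
        Candidate { tenant: "b".into(), shard_count: 8, max_logical_size: 4 << 30 },
        Candidate { tenant: "c".into(), shard_count: 1, max_logical_size: 100 << 20 },
    ];
    // With a 500 MB threshold and a shard cap of 8, only tenant "a" qualifies:
    // "b" is already fully split and "c" is below the threshold.
    assert_eq!(
        pick_split_candidate(report, 500 << 20, 8).map(|c| c.tenant),
        Some("a".to_string())
    );
}
```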
--- control_plane/src/local_env.rs | 4 + control_plane/src/storage_controller.rs | 4 + libs/pageserver_api/src/models.rs | 49 +++ libs/pageserver_api/src/shard.rs | 2 +- pageserver/client/src/mgmt_api.rs | 12 + pageserver/src/http/routes.rs | 98 ++++++ pageserver/src/metrics.rs | 64 ++-- pageserver/src/tenant.rs | 26 ++ .../src/tenant/remote_timeline_client.rs | 6 +- storage_controller/src/main.rs | 5 + storage_controller/src/pageserver_client.rs | 14 +- storage_controller/src/service.rs | 114 ++++++- test_runner/fixtures/pageserver/http.py | 15 + .../performance/test_sharding_autosplit.py | 280 ++++++++++++++++++ test_runner/regress/test_sharding.py | 42 +++ 15 files changed, 689 insertions(+), 46 deletions(-) create mode 100644 test_runner/performance/test_sharding_autosplit.py diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index d13884198e..0edcf1be4e 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -152,6 +152,9 @@ pub struct NeonStorageControllerConf { /// Heartbeat timeout before marking a node offline #[serde(with = "humantime_serde")] pub max_unavailable: Duration, + + /// Threshold for auto-splitting a tenant into shards + pub split_threshold: Option, } impl NeonStorageControllerConf { @@ -164,6 +167,7 @@ impl Default for NeonStorageControllerConf { fn default() -> Self { Self { max_unavailable: Self::DEFAULT_MAX_UNAVAILABLE_INTERVAL, + split_threshold: None, } } } diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index f1c43f4036..96e8276f4d 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -305,6 +305,10 @@ impl StorageController { )); } + if let Some(split_threshold) = self.config.split_threshold.as_ref() { + args.push(format!("--split-threshold={split_threshold}")) + } + background_process::start_process( COMMAND, &self.env.base_data_dir, diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 7cf54bf32a..d52fb5e93d 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -824,6 +824,55 @@ pub struct TenantScanRemoteStorageResponse { pub shards: Vec, } +#[derive(Serialize, Deserialize, Debug, Clone)] +#[serde(rename_all = "snake_case")] +pub enum TenantSorting { + ResidentSize, + MaxLogicalSize, +} + +impl Default for TenantSorting { + fn default() -> Self { + Self::ResidentSize + } +} + +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct TopTenantShardsRequest { + // How would you like to sort the tenants? + pub order_by: TenantSorting, + + // How many results? + pub limit: usize, + + // Omit tenants with more than this many shards (e.g. 
if this is the max number of shards + // that the caller would ever split to) + pub where_shards_lt: Option, + + // Omit tenants where the ordering metric is less than this (this is an optimization to + // let us quickly exclude numerous tiny shards) + pub where_gt: Option, +} + +#[derive(Serialize, Deserialize, Debug, PartialEq, Eq)] +pub struct TopTenantShardItem { + pub id: TenantShardId, + + /// Total size of layers on local disk for all timelines in this tenant + pub resident_size: u64, + + /// Total size of layers in remote storage for all timelines in this tenant + pub physical_size: u64, + + /// The largest logical size of a timeline within this tenant + pub max_logical_size: u64, +} + +#[derive(Serialize, Deserialize, Debug, Default)] +pub struct TopTenantShardsResponse { + pub shards: Vec, +} + pub mod virtual_file { #[derive( Copy, diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index ff6d3d91b6..43d9b2e48c 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -125,7 +125,7 @@ impl ShardCount { /// `v` may be zero, or the number of shards in the tenant. `v` is what /// [`Self::literal`] would return. - pub fn new(val: u8) -> Self { + pub const fn new(val: u8) -> Self { Self(val) } } diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 6df8b2170d..5904713da9 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -486,6 +486,18 @@ impl Client { .map_err(Error::ReceiveBody) } + pub async fn top_tenant_shards( + &self, + request: TopTenantShardsRequest, + ) -> Result { + let uri = format!("{}/v1/top_tenants", self.mgmt_api_endpoint); + self.request(Method::POST, uri, request) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + pub async fn layer_map_info( &self, tenant_shard_id: TenantShardId, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 0a98d32f02..b8d5c67ce0 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1,6 +1,8 @@ //! //! Management HTTP API //! 
+use std::cmp::Reverse; +use std::collections::BinaryHeap; use std::collections::HashMap; use std::str::FromStr; use std::sync::Arc; @@ -24,7 +26,11 @@ use pageserver_api::models::TenantScanRemoteStorageShard; use pageserver_api::models::TenantShardLocation; use pageserver_api::models::TenantShardSplitRequest; use pageserver_api::models::TenantShardSplitResponse; +use pageserver_api::models::TenantSorting; use pageserver_api::models::TenantState; +use pageserver_api::models::TopTenantShardItem; +use pageserver_api::models::TopTenantShardsRequest; +use pageserver_api::models::TopTenantShardsResponse; use pageserver_api::models::{ DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest, TenantLoadRequest, TenantLocationConfigRequest, @@ -2323,6 +2329,97 @@ async fn get_utilization( .map_err(ApiError::InternalServerError) } +/// Report on the largest tenants on this pageserver, for the storage controller to identify +/// candidates for splitting +async fn post_top_tenants( + mut r: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + check_permission(&r, None)?; + let request: TopTenantShardsRequest = json_request(&mut r).await?; + let state = get_state(&r); + + fn get_size_metric(sizes: &TopTenantShardItem, order_by: &TenantSorting) -> u64 { + match order_by { + TenantSorting::ResidentSize => sizes.resident_size, + TenantSorting::MaxLogicalSize => sizes.max_logical_size, + } + } + + #[derive(Eq, PartialEq)] + struct HeapItem { + metric: u64, + sizes: TopTenantShardItem, + } + + impl PartialOrd for HeapItem { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } + } + + /// Heap items have reverse ordering on their metric: this enables using BinaryHeap, which + /// supports popping the greatest item but not the smallest. + impl Ord for HeapItem { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + Reverse(self.metric).cmp(&Reverse(other.metric)) + } + } + + let mut top_n: BinaryHeap = BinaryHeap::with_capacity(request.limit); + + // FIXME: this is a lot of clones to take this tenant list + for (tenant_shard_id, tenant_slot) in state.tenant_manager.list() { + if let Some(shards_lt) = request.where_shards_lt { + // Ignore tenants which already have >= this many shards + if tenant_shard_id.shard_count >= shards_lt { + continue; + } + } + + let sizes = match tenant_slot { + TenantSlot::Attached(tenant) => tenant.get_sizes(), + TenantSlot::Secondary(_) | TenantSlot::InProgress(_) => { + continue; + } + }; + let metric = get_size_metric(&sizes, &request.order_by); + + if let Some(gt) = request.where_gt { + // Ignore tenants whose metric is <= the lower size threshold, to do less sorting work + if metric <= gt { + continue; + } + }; + + match top_n.peek() { + None => { + // Top N list is empty: candidate becomes first member + top_n.push(HeapItem { metric, sizes }); + } + Some(i) if i.metric > metric && top_n.len() < request.limit => { + // Lowest item in list is greater than our candidate, but we aren't at limit yet: push to end + top_n.push(HeapItem { metric, sizes }); + } + Some(i) if i.metric > metric => { + // List is at limit and lowest value is greater than our candidate, drop it. + } + Some(_) => top_n.push(HeapItem { metric, sizes }), + } + + while top_n.len() > request.limit { + top_n.pop(); + } + } + + json_response( + StatusCode::OK, + TopTenantShardsResponse { + shards: top_n.into_iter().map(|i| i.sizes).collect(), + }, + ) +} + /// Common functionality of all the HTTP API handlers. 
/// /// - Adds a tracing span to each request (by `request_span`) @@ -2609,5 +2706,6 @@ pub fn make_router( ) .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler)) .get("/v1/utilization", |r| api_handler(r, get_utilization)) + .post("/v1/top_tenants", |r| api_handler(r, post_top_tenants)) .any(handler_404)) } diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index ffcd08b4b3..5315f0b936 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -2098,7 +2098,7 @@ pub(crate) struct TimelineMetrics { pub garbage_collect_histo: StorageTimeMetrics, pub find_gc_cutoffs_histo: StorageTimeMetrics, pub last_record_gauge: IntGauge, - resident_physical_size_gauge: UIntGauge, + pub resident_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size pub current_logical_size_gauge: UIntGauge, pub aux_file_size_gauge: IntGauge, @@ -2312,6 +2312,7 @@ use pin_project_lite::pin_project; use std::collections::HashMap; use std::num::NonZeroUsize; use std::pin::Pin; +use std::sync::atomic::AtomicU64; use std::sync::{Arc, Mutex}; use std::task::{Context, Poll}; use std::time::{Duration, Instant}; @@ -2321,35 +2322,35 @@ use crate::task_mgr::TaskKind; use crate::tenant::mgr::TenantSlot; /// Maintain a per timeline gauge in addition to the global gauge. -struct PerTimelineRemotePhysicalSizeGauge { - last_set: u64, +pub(crate) struct PerTimelineRemotePhysicalSizeGauge { + last_set: AtomicU64, gauge: UIntGauge, } impl PerTimelineRemotePhysicalSizeGauge { fn new(per_timeline_gauge: UIntGauge) -> Self { Self { - last_set: per_timeline_gauge.get(), + last_set: AtomicU64::new(0), gauge: per_timeline_gauge, } } - fn set(&mut self, sz: u64) { + pub(crate) fn set(&self, sz: u64) { self.gauge.set(sz); - if sz < self.last_set { - REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set - sz); + let prev = self.last_set.swap(sz, std::sync::atomic::Ordering::Relaxed); + if sz < prev { + REMOTE_PHYSICAL_SIZE_GLOBAL.sub(prev - sz); } else { - REMOTE_PHYSICAL_SIZE_GLOBAL.add(sz - self.last_set); + REMOTE_PHYSICAL_SIZE_GLOBAL.add(sz - prev); }; - self.last_set = sz; } - fn get(&self) -> u64 { + pub(crate) fn get(&self) -> u64 { self.gauge.get() } } impl Drop for PerTimelineRemotePhysicalSizeGauge { fn drop(&mut self) { - REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set); + REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set.load(std::sync::atomic::Ordering::Relaxed)); } } @@ -2357,7 +2358,7 @@ pub(crate) struct RemoteTimelineClientMetrics { tenant_id: String, shard_id: String, timeline_id: String, - remote_physical_size_gauge: Mutex>, + pub(crate) remote_physical_size_gauge: PerTimelineRemotePhysicalSizeGauge, calls: Mutex>, bytes_started_counter: Mutex>, bytes_finished_counter: Mutex>, @@ -2365,38 +2366,27 @@ pub(crate) struct RemoteTimelineClientMetrics { impl RemoteTimelineClientMetrics { pub fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self { + let tenant_id_str = tenant_shard_id.tenant_id.to_string(); + let shard_id_str = format!("{}", tenant_shard_id.shard_slug()); + let timeline_id_str = timeline_id.to_string(); + + let remote_physical_size_gauge = PerTimelineRemotePhysicalSizeGauge::new( + REMOTE_PHYSICAL_SIZE + .get_metric_with_label_values(&[&tenant_id_str, &shard_id_str, &timeline_id_str]) + .unwrap(), + ); + RemoteTimelineClientMetrics { - tenant_id: tenant_shard_id.tenant_id.to_string(), - shard_id: format!("{}", tenant_shard_id.shard_slug()), - timeline_id: timeline_id.to_string(), + tenant_id: tenant_id_str, + shard_id: shard_id_str, + 
timeline_id: timeline_id_str, calls: Mutex::new(HashMap::default()), bytes_started_counter: Mutex::new(HashMap::default()), bytes_finished_counter: Mutex::new(HashMap::default()), - remote_physical_size_gauge: Mutex::new(None), + remote_physical_size_gauge, } } - pub(crate) fn remote_physical_size_set(&self, sz: u64) { - let mut guard = self.remote_physical_size_gauge.lock().unwrap(); - let gauge = guard.get_or_insert_with(|| { - PerTimelineRemotePhysicalSizeGauge::new( - REMOTE_PHYSICAL_SIZE - .get_metric_with_label_values(&[ - &self.tenant_id, - &self.shard_id, - &self.timeline_id, - ]) - .unwrap(), - ) - }); - gauge.set(sz); - } - - pub(crate) fn remote_physical_size_get(&self) -> u64 { - let guard = self.remote_physical_size_gauge.lock().unwrap(); - guard.as_ref().map(|gauge| gauge.get()).unwrap_or(0) - } - pub fn remote_operation_time( &self, file_kind: &RemoteOpFileKind, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 026cbc107c..54b63f7042 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -21,6 +21,7 @@ use futures::FutureExt; use futures::StreamExt; use pageserver_api::models; use pageserver_api::models::TimelineState; +use pageserver_api::models::TopTenantShardItem; use pageserver_api::models::WalRedoManagerStatus; use pageserver_api::shard::ShardIdentity; use pageserver_api::shard::ShardStripeSize; @@ -2196,6 +2197,31 @@ impl Tenant { Ok(()) } + + pub(crate) fn get_sizes(&self) -> TopTenantShardItem { + let mut result = TopTenantShardItem { + id: self.tenant_shard_id, + resident_size: 0, + physical_size: 0, + max_logical_size: 0, + }; + + for timeline in self.timelines.lock().unwrap().values() { + result.resident_size += timeline.metrics.resident_physical_size_gauge.get(); + + result.physical_size += timeline + .remote_client + .metrics + .remote_physical_size_gauge + .get(); + result.max_logical_size = std::cmp::max( + result.max_logical_size, + timeline.metrics.current_logical_size_gauge.get(), + ); + } + + result + } } /// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id), diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 07d6af696c..3a1113cf01 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -317,7 +317,7 @@ pub struct RemoteTimelineClient { upload_queue: Mutex, - metrics: Arc, + pub(crate) metrics: Arc, storage_impl: GenericRemoteStorage, @@ -461,11 +461,11 @@ impl RemoteTimelineClient { } else { 0 }; - self.metrics.remote_physical_size_set(size); + self.metrics.remote_physical_size_gauge.set(size); } pub fn get_remote_physical_size(&self) -> u64 { - self.metrics.remote_physical_size_get() + self.metrics.remote_physical_size_gauge.get() } // diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index f1454af533..ce8f8d0cdd 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -66,6 +66,10 @@ struct Cli { #[arg(long)] max_unavailable_interval: Option, + /// Size threshold for automatically splitting shards (disabled by default) + #[arg(long)] + split_threshold: Option, + /// Maximum number of reconcilers that may run in parallel #[arg(long)] reconciler_concurrency: Option, @@ -255,6 +259,7 @@ async fn async_main() -> anyhow::Result<()> { reconciler_concurrency: args .reconciler_concurrency .unwrap_or(RECONCILER_CONCURRENCY_DEFAULT), + split_threshold: args.split_threshold, }; // After loading secrets & config, but before 
starting anything else, apply database migrations diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index 25b6b67e12..769aba80ca 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -2,7 +2,7 @@ use pageserver_api::{ models::{ LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress, TenantScanRemoteStorageResponse, TenantShardSplitRequest, TenantShardSplitResponse, - TimelineCreateRequest, TimelineInfo, + TimelineCreateRequest, TimelineInfo, TopTenantShardsRequest, TopTenantShardsResponse, }, shard::TenantShardId, }; @@ -234,4 +234,16 @@ impl PageserverClient { self.inner.get_utilization().await ) } + + pub(crate) async fn top_tenant_shards( + &self, + request: TopTenantShardsRequest, + ) -> Result { + measured_request!( + "top_tenants", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner.top_tenant_shards(request).await + ) + } } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index ae7e8d3d7d..f914f4e0bb 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -32,10 +32,10 @@ use pageserver_api::{ TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse, UtilizationScore, }, - models::{SecondaryProgress, TenantConfigRequest}, + models::{SecondaryProgress, TenantConfigRequest, TopTenantShardsRequest}, }; use reqwest::StatusCode; -use tracing::instrument; +use tracing::{instrument, Instrument}; use crate::pageserver_client::PageserverClient; use pageserver_api::{ @@ -222,6 +222,10 @@ pub struct Config { /// How many Reconcilers may be spawned concurrently pub reconciler_concurrency: usize, + + /// How large must a shard grow in bytes before we split it? + /// None disables auto-splitting. + pub split_threshold: Option, } impl From for ApiError { @@ -699,7 +703,7 @@ impl Service { /// e.g. a tenant create/attach/migrate must eventually be retried: this task is responsible /// for those retries. #[instrument(skip_all)] - async fn background_reconcile(&self) { + async fn background_reconcile(self: &Arc) { self.startup_complete.clone().wait().await; const BACKGROUND_RECONCILE_PERIOD: Duration = Duration::from_secs(20); @@ -711,7 +715,11 @@ impl Service { let reconciles_spawned = self.reconcile_all(); if reconciles_spawned == 0 { // Run optimizer only when we didn't find any other work to do - self.optimize_all().await; + let optimizations = self.optimize_all().await; + if optimizations == 0 { + // Run new splits only when no optimizations are pending + self.autosplit_tenants().await; + } } } _ = self.cancel.cancelled() => return @@ -4766,6 +4774,104 @@ impl Service { validated_work } + /// Look for shards which are oversized and in need of splitting + async fn autosplit_tenants(self: &Arc) { + let Some(split_threshold) = self.config.split_threshold else { + // Auto-splitting is disabled + return; + }; + + let nodes = self.inner.read().unwrap().nodes.clone(); + + const SPLIT_TO_MAX: ShardCount = ShardCount::new(8); + + let mut top_n = Vec::new(); + + // Call into each node to look for big tenants + let top_n_request = TopTenantShardsRequest { + // We currently split based on logical size, for simplicity: logical size is a signal of + // the user's intent to run a large database, whereas physical/resident size can be symptoms + // of compaction issues. Eventually we should switch to using resident size to bound the + // disk space impact of one shard. 
+ order_by: models::TenantSorting::MaxLogicalSize, + limit: 10, + where_shards_lt: Some(SPLIT_TO_MAX), + where_gt: Some(split_threshold), + }; + for node in nodes.values() { + let request_ref = &top_n_request; + match node + .with_client_retries( + |client| async move { + let request = request_ref.clone(); + client.top_tenant_shards(request.clone()).await + }, + &self.config.jwt_token, + 3, + 3, + Duration::from_secs(5), + &self.cancel, + ) + .await + { + Some(Ok(node_top_n)) => { + top_n.extend(node_top_n.shards.into_iter()); + } + Some(Err(mgmt_api::Error::Cancelled)) => { + continue; + } + Some(Err(e)) => { + tracing::warn!("Failed to fetch top N tenants from {node}: {e}"); + continue; + } + None => { + // Node is shutting down + continue; + } + }; + } + + // Pick the biggest tenant to split first + top_n.sort_by_key(|i| i.resident_size); + let Some(split_candidate) = top_n.into_iter().next() else { + tracing::debug!("No split-elegible shards found"); + return; + }; + + // We spawn a task to run this, so it's exactly like some external API client requesting it. We don't + // want to block the background reconcile loop on this. + tracing::info!("Auto-splitting tenant for size threshold {split_threshold}: current size {split_candidate:?}"); + + let this = self.clone(); + tokio::spawn( + async move { + match this + .tenant_shard_split( + split_candidate.id.tenant_id, + TenantShardSplitRequest { + // Always split to the max number of shards: this avoids stepping through + // intervening shard counts and encountering the overrhead of a split+cleanup + // each time as a tenant grows, and is not too expensive because our max shard + // count is relatively low anyway. + // This policy will be adjusted in future once we support higher shard count. + new_shard_count: SPLIT_TO_MAX.literal(), + new_stripe_size: Some(ShardParameters::DEFAULT_STRIPE_SIZE), + }, + ) + .await + { + Ok(_) => { + tracing::info!("Successful auto-split"); + } + Err(e) => { + tracing::error!("Auto-split failed: {e}"); + } + } + } + .instrument(tracing::info_span!("auto_split", tenant_id=%split_candidate.id.tenant_id)), + ); + } + /// Useful for tests: run whatever work a background [`Self::reconcile_all`] would have done, but /// also wait for any generated Reconcilers to complete. Calling this until it returns zero should /// put the system into a quiescent state where future background reconciliations won't do anything. 
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 0b2963d89c..4d563a532b 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -890,3 +890,18 @@ class PageserverHttpClient(requests.Session, MetricsGetter): assert current_logical_size == non_incremental assert isinstance(current_logical_size, int) return current_logical_size + + def top_tenants( + self, order_by: str, limit: int, where_shards_lt: int, where_gt: int + ) -> dict[Any, Any]: + res = self.post( + f"http://localhost:{self.port}/v1/top_tenants", + json={ + "order_by": order_by, + "limit": limit, + "where_shards_lt": where_shards_lt, + "where_gt": where_gt, + }, + ) + self.verbose_error(res) + return res.json() # type: ignore diff --git a/test_runner/performance/test_sharding_autosplit.py b/test_runner/performance/test_sharding_autosplit.py new file mode 100644 index 0000000000..9cd83f0959 --- /dev/null +++ b/test_runner/performance/test_sharding_autosplit.py @@ -0,0 +1,280 @@ +import concurrent.futures +import re +from pathlib import Path + +import pytest +from fixtures.common_types import TenantId, TimelineId +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + PgBin, + tenant_get_shards, +) + + +@pytest.mark.timeout(600) +def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): + """ + Check that sharding, including auto-splitting, "just works" under pgbench workloads. + + This is not a benchmark, but it lives in the same place as benchmarks in order to be run + on a dedicated node that can sustain some significant throughput. + + Other tests validate the details of shard splitting, error cases etc. This test is + the sanity check that it all really works as expected with realistic amounts of data + and under load. + + Success conditions: + - Tenants auto-split when their capacity grows + - Client workloads are not interrupted while that happens + """ + + neon_env_builder.num_pageservers = 8 + neon_env_builder.storage_controller_config = { + # Split tenants at 500MB: it's up to the storage controller how it interprets this (logical + # sizes, physical sizes, etc). We will write this much data logically, therefore other sizes + # will reliably be greater. + "split_threshold": 1024 * 1024 * 500 + } + + tenant_conf = { + # We want layer rewrites to happen as soon as possible (this is the most stressful + # case for the system), so set PITR interval to something tiny. + "pitr_interval": "5s", + # Scaled down thresholds. We will run at ~1GB scale but would like to emulate + # the behavior of a system running at ~100GB scale. 
+ "checkpoint_distance": f"{1024 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{1024 * 1024}", + "image_creation_threshold": "2", + "image_layer_creation_check_threshold": "0", + } + + env = neon_env_builder.init_start() + + for ps in env.pageservers: + ps.allowed_errors.extend( + [ + # We shut down pageservers while they might have some compaction work going on + ".*Compaction failed.*shutting down.*" + ] + ) + + env.storage_controller.allowed_errors.extend( + [ + # The neon_local functionality for updating computes is flaky for unknown reasons + ".*Local notification hook failed.*", + ".*Marking shard.*for notification retry.*", + ".*Failed to notify compute.*", + ] + ) + + # Total tenants + tenant_count = 4 + + # Transaction rate: we set this rather than running at full-speed because we + # might run on a slow node that doesn't cope well with many full-speed pgbenches running concurrently. + transaction_rate = 100 + + class TenantState: + def __init__(self, timeline_id, endpoint): + self.timeline_id = timeline_id + self.endpoint = endpoint + + # Create tenants + tenants = {} + for tenant_id in set(TenantId.generate() for _i in range(0, tenant_count)): + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant(tenant_id, timeline_id, conf=tenant_conf) + endpoint = env.endpoints.create("main", tenant_id=tenant_id) + tenants[tenant_id] = TenantState(timeline_id, endpoint) + endpoint.start() + + def run_pgbench_init(endpoint): + pg_bin.run_capture( + [ + "pgbench", + "-s50", + "-i", + f"postgres://cloud_admin@localhost:{endpoint.pg_port}/postgres", + ] + ) + + def check_pgbench_output(out_path: str): + """ + When we run pgbench, we want not just an absence of errors, but also continuous evidence + of I/O progressing: our shard splitting and migration should not interrrupt the benchmark. + """ + matched_lines = 0 + stderr = Path(f"{out_path}.stderr").read_text() + + low_watermark = None + + # Apply this as a threshold for what we consider an unacceptable interruption to I/O + min_tps = transaction_rate // 10 + + for line in stderr.split("\n"): + match = re.match(r"progress: ([0-9\.]+) s, ([0-9\.]+) tps, .* ([0-9]+) failed", line) + if match is None: + # Fall back to older-version pgbench output (omits failure count) + match = re.match(r"progress: ([0-9\.]+) s, ([0-9\.]+) tps, .*", line) + if match is None: + continue + else: + (_time, tps) = match.groups() + tps = float(tps) + failed = 0 + else: + (_time, tps, failed) = match.groups() # type: ignore + tps = float(tps) + failed = int(failed) + + matched_lines += 1 + + if failed > 0: + raise RuntimeError( + f"pgbench on tenant {endpoint.tenant_id} run at {out_path} has failed > 0" + ) + + if low_watermark is None or low_watermark > tps: + low_watermark = tps + + # Temporarily disabled: have seen some 0 tps regions on Hetzner runners, but not + # at the same time as a shard split. 
+ # if tps < min_tps: + # raise RuntimeError( + # f"pgbench on tenant {endpoint.tenant_id} run at {out_path} has tps < {min_tps}" + # ) + + log.info(f"Checked {matched_lines} progress lines, lowest TPS was {min_tps}") + + if matched_lines == 0: + raise RuntimeError(f"pgbench output at {out_path} contained no progress lines") + + def run_pgbench_main(endpoint): + out_path = pg_bin.run_capture( + [ + "pgbench", + "-s50", + "-T", + "180", + "-R", + f"{transaction_rate}", + "-P", + "1", + f"postgres://cloud_admin@localhost:{endpoint.pg_port}/postgres", + ] + ) + + check_pgbench_output(out_path) + + def run_pgbench_read(endpoint): + out_path = pg_bin.run_capture( + [ + "pgbench", + "-s50", + "-T", + "30", + "-R", + f"{transaction_rate}", + "-S", + "-P", + "1", + f"postgres://cloud_admin@localhost:{endpoint.pg_port}/postgres", + ] + ) + + check_pgbench_output(out_path) + + with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count) as pgbench_threads: + pgbench_futs = [] + for tenant_state in tenants.values(): + fut = pgbench_threads.submit(run_pgbench_init, tenant_state.endpoint) + pgbench_futs.append(fut) + + log.info("Waiting for pgbench inits") + for fut in pgbench_futs: + fut.result() + + pgbench_futs = [] + for tenant_state in tenants.values(): + fut = pgbench_threads.submit(run_pgbench_main, tenant_state.endpoint) + pgbench_futs.append(fut) + + log.info("Waiting for pgbench read/write pass") + for fut in pgbench_futs: + fut.result() + + def assert_all_split(): + for tenant_id in tenants.keys(): + shards = tenant_get_shards(env, tenant_id) + assert len(shards) == 8 + + # This is not a wait_until, because we wanted the splits to happen _while_ pgbench is running: otherwise + # this test is not properly doing its job of validating that splits work nicely under load. + assert_all_split() + + env.storage_controller.assert_log_contains(".*Successful auto-split.*") + + # Log timeline sizes, useful for debug, and implicitly validates that the shards + # are available in the places the controller thinks they should be. + for tenant_id, tenant_state in tenants.items(): + (shard_zero_id, shard_zero_ps) = tenant_get_shards(env, tenant_id)[0] + timeline_info = shard_zero_ps.http_client().timeline_detail( + shard_zero_id, tenant_state.timeline_id + ) + log.info(f"{shard_zero_id} timeline: {timeline_info}") + + # Run compaction for all tenants, restart endpoint so that on subsequent reads we will + # definitely hit pageserver for reads. 
This compaction passis expected to drop unwanted + # layers but not do any rewrites (we're still in the same generation) + for tenant_id, tenant_state in tenants.items(): + tenant_state.endpoint.stop() + for shard_id, shard_ps in tenant_get_shards(env, tenant_id): + shard_ps.http_client().timeline_gc(shard_id, tenant_state.timeline_id, gc_horizon=None) + shard_ps.http_client().timeline_compact(shard_id, tenant_state.timeline_id) + tenant_state.endpoint.start() + + with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count) as pgbench_threads: + pgbench_futs = [] + for tenant_state in tenants.values(): + fut = pgbench_threads.submit(run_pgbench_read, tenant_state.endpoint) + pgbench_futs.append(fut) + + log.info("Waiting for pgbench read pass") + for fut in pgbench_futs: + fut.result() + + env.storage_controller.consistency_check() + + # Restart the storage controller + env.storage_controller.stop() + env.storage_controller.start() + + env.storage_controller.consistency_check() + + # Restart all pageservers + for ps in env.pageservers: + ps.stop() + ps.start() + + # Freshen gc_info in Timeline, so that when compaction runs in the background in the + # subsequent pgbench period, the last_gc_cutoff is updated and enables the conditions for a rewrite to pass. + for tenant_id, tenant_state in tenants.items(): + for shard_id, shard_ps in tenant_get_shards(env, tenant_id): + shard_ps.http_client().timeline_gc(shard_id, tenant_state.timeline_id, gc_horizon=None) + + # One last check data remains readable after everything has restarted + with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count) as pgbench_threads: + pgbench_futs = [] + for tenant_state in tenants.values(): + fut = pgbench_threads.submit(run_pgbench_read, tenant_state.endpoint) + pgbench_futs.append(fut) + + log.info("Waiting for pgbench read pass") + for fut in pgbench_futs: + fut.result() + + # Assert that some rewrites happened + # TODO: uncomment this after https://github.com/neondatabase/neon/pull/7531 is merged + # assert any(ps.log_contains(".*Rewriting layer after shard split.*") for ps in env.pageservers) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 87544af598..1bfeec6f4b 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -1326,3 +1326,45 @@ def test_sharding_unlogged_relation(neon_env_builder: NeonEnvBuilder): # Ensure that post-endpoint-restart modifications are ingested happily by pageserver wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id) + + +def test_top_tenants(neon_env_builder: NeonEnvBuilder): + """ + The top_tenants API is used in shard auto-splitting to find candidates. 
+ """ + + env = neon_env_builder.init_configs() + neon_env_builder.start() + + tenants = [] + n_tenants = 8 + for i in range(0, n_tenants): + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant(tenant_id, timeline_id) + + # Write a different amount of data to each tenant + w = Workload(env, tenant_id, timeline_id) + w.init() + w.write_rows(i * 1000) + w.stop() + + logical_size = env.pageserver.http_client().timeline_detail(tenant_id, timeline_id)[ + "current_logical_size" + ] + tenants.append((tenant_id, timeline_id, logical_size)) + + log.info(f"Created {tenant_id}/{timeline_id} with size {logical_size}") + + # Ask for 1 largest tenant + top_1 = env.pageserver.http_client().top_tenants("max_logical_size", 1, 8, 0) + assert len(top_1["shards"]) == 1 + assert top_1["shards"][0]["id"] == str(tenants[-1][0]) + assert top_1["shards"][0]["max_logical_size"] == tenants[-1][2] + + # Apply a lower bound limit + top = env.pageserver.http_client().top_tenants( + "max_logical_size", 100, 8, where_gt=tenants[3][2] + ) + assert len(top["shards"]) == n_tenants - 4 + assert set(i["id"] for i in top["shards"]) == set(str(i[0]) for i in tenants[4:]) From aaf60819fa479e37a4b477b20e1fbcee2d5a046f Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Fri, 17 May 2024 15:22:49 -0400 Subject: [PATCH 0802/1571] feat(pageserver): persist aux file policy in index part (#7668) Part of https://github.com/neondatabase/neon/issues/7462 ## Summary of changes Tenant config is not persisted unless it's attached on the storage controller. In this pull request, we persist the aux file policy flag in the `index_part.json`. Admins can set `switch_aux_file_policy` in the storage controller or using the page server API. Upon the first aux file gets written, the write path will compare the aux file policy target with the current policy. If it is switch-able, we will do the switch. Otherwise, the original policy will be used. The test cases show what the admins can do / cannot do. The `last_aux_file_policy` is stored in `IndexPart`. Updates to the persisted policy are done via `schedule_index_upload_for_aux_file_policy_update`. On the write path, the writer will update the field. --------- Signed-off-by: Alex Chi Z Co-authored-by: Joonas Koivunen --- libs/pageserver_api/src/models.rs | 134 ++++++++ pageserver/ctl/src/main.rs | 1 + pageserver/src/http/routes.rs | 2 + pageserver/src/pgdatadir_mapping.rs | 33 +- pageserver/src/tenant.rs | 296 ++++++++++++++++++ pageserver/src/tenant/config.rs | 4 +- .../src/tenant/remote_timeline_client.rs | 13 + .../tenant/remote_timeline_client/index.rs | 87 ++++- pageserver/src/tenant/timeline.rs | 9 +- pageserver/src/tenant/timeline/delete.rs | 2 + pageserver/src/tenant/upload_queue.rs | 6 + test_runner/regress/test_aux_files.py | 72 +++++ 12 files changed, 648 insertions(+), 11 deletions(-) create mode 100644 test_runner/regress/test_aux_files.py diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index d52fb5e93d..80ca696313 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -10,6 +10,7 @@ use std::{ io::{BufRead, Read}, num::{NonZeroU64, NonZeroUsize}, str::FromStr, + sync::atomic::AtomicUsize, time::{Duration, SystemTime}, }; @@ -308,13 +309,88 @@ pub struct TenantConfig { pub switch_aux_file_policy: Option, } +/// The policy for the aux file storage. It can be switched through `switch_aux_file_policy` +/// tenant config. 
When the first aux file written, the policy will be persisted in the +/// `index_part.json` file and has a limited migration path. +/// +/// Currently, we only allow the following migration path: +/// +/// Unset -> V1 +/// -> V2 +/// -> CrossValidation -> V2 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] pub enum AuxFilePolicy { + /// V1 aux file policy: store everything in AUX_FILE_KEY V1, + /// V2 aux file policy: store in the AUX_FILE keyspace V2, + /// Cross validation runs both formats on the write path and does validation + /// on the read path. CrossValidation, } +impl AuxFilePolicy { + pub fn is_valid_migration_path(from: Option, to: Self) -> bool { + matches!( + (from, to), + (None, _) | (Some(AuxFilePolicy::CrossValidation), AuxFilePolicy::V2) + ) + } + + /// If a tenant writes aux files without setting `switch_aux_policy`, this value will be used. + pub fn default_tenant_config() -> Self { + Self::V1 + } +} + +/// The aux file policy memory flag. Users can store `Option` into this atomic flag. 0 == unspecified. +pub struct AtomicAuxFilePolicy(AtomicUsize); + +impl AtomicAuxFilePolicy { + pub fn new(policy: Option) -> Self { + Self(AtomicUsize::new( + policy.map(AuxFilePolicy::to_usize).unwrap_or_default(), + )) + } + + pub fn load(&self) -> Option { + match self.0.load(std::sync::atomic::Ordering::Acquire) { + 0 => None, + other => Some(AuxFilePolicy::from_usize(other)), + } + } + + pub fn store(&self, policy: Option) { + self.0.store( + policy.map(AuxFilePolicy::to_usize).unwrap_or_default(), + std::sync::atomic::Ordering::Release, + ); + } +} + +impl AuxFilePolicy { + pub fn to_usize(self) -> usize { + match self { + Self::V1 => 1, + Self::CrossValidation => 2, + Self::V2 => 3, + } + } + + pub fn try_from_usize(this: usize) -> Option { + match this { + 1 => Some(Self::V1), + 2 => Some(Self::CrossValidation), + 3 => Some(Self::V2), + _ => None, + } + } + + pub fn from_usize(this: usize) -> Self { + Self::try_from_usize(this).unwrap() + } +} + impl FromStr for AuxFilePolicy { type Err = anyhow::Error; @@ -604,6 +680,9 @@ pub struct TimelineInfo { pub state: TimelineState, pub walreceiver_status: String, + + /// The last aux file policy being used on this timeline + pub last_aux_file_policy: Option, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -1505,4 +1584,59 @@ mod tests { assert_eq!(actual, expected, "example on {line}"); } } + + #[test] + fn test_aux_file_migration_path() { + assert!(AuxFilePolicy::is_valid_migration_path( + None, + AuxFilePolicy::V1 + )); + assert!(AuxFilePolicy::is_valid_migration_path( + None, + AuxFilePolicy::V2 + )); + assert!(AuxFilePolicy::is_valid_migration_path( + None, + AuxFilePolicy::CrossValidation + )); + // Self-migration is not a valid migration path, and the caller should handle it by itself. 
+ assert!(!AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::V1), + AuxFilePolicy::V1 + )); + assert!(!AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::V2), + AuxFilePolicy::V2 + )); + assert!(!AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::CrossValidation), + AuxFilePolicy::CrossValidation + )); + // Migrations not allowed + assert!(!AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::CrossValidation), + AuxFilePolicy::V1 + )); + assert!(!AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::V1), + AuxFilePolicy::V2 + )); + assert!(!AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::V2), + AuxFilePolicy::V1 + )); + assert!(!AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::V2), + AuxFilePolicy::CrossValidation + )); + assert!(!AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::V1), + AuxFilePolicy::CrossValidation + )); + // Migrations allowed + assert!(AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::CrossValidation), + AuxFilePolicy::V2 + )); + } } diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index 1fb75584fc..e92c352dab 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -219,6 +219,7 @@ fn handle_metadata( let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?; println!("Current metadata:\n{meta:?}"); let mut update_meta = false; + // TODO: simplify this part if let Some(disk_consistent_lsn) = disk_consistent_lsn { meta = TimelineMetadata::new( *disk_consistent_lsn, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index b8d5c67ce0..7efd48afc7 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -439,6 +439,8 @@ async fn build_timeline_info_common( state, walreceiver_status, + + last_aux_file_policy: timeline.last_aux_file_policy.load(), }; Ok(info) } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 1092d64d33..402f075365 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -35,7 +35,7 @@ use std::ops::ControlFlow; use std::ops::Range; use strum::IntoEnumIterator; use tokio_util::sync::CancellationToken; -use tracing::{debug, trace, warn}; +use tracing::{debug, info, trace, warn}; use utils::bin_ser::DeserializeError; use utils::vec_map::{VecMap, VecMapOrdering}; use utils::{bin_ser::BeSer, lsn::Lsn}; @@ -718,10 +718,11 @@ impl Timeline { lsn: Lsn, ctx: &RequestContext, ) -> Result, PageReconstructError> { - match self.get_switch_aux_file_policy() { - AuxFilePolicy::V1 => self.list_aux_files_v1(lsn, ctx).await, - AuxFilePolicy::V2 => self.list_aux_files_v2(lsn, ctx).await, - AuxFilePolicy::CrossValidation => { + let current_policy = self.last_aux_file_policy.load(); + match current_policy { + Some(AuxFilePolicy::V1) | None => self.list_aux_files_v1(lsn, ctx).await, + Some(AuxFilePolicy::V2) => self.list_aux_files_v2(lsn, ctx).await, + Some(AuxFilePolicy::CrossValidation) => { let v1_result = self.list_aux_files_v1(lsn, ctx).await; let v2_result = self.list_aux_files_v2(lsn, ctx).await; match (v1_result, v2_result) { @@ -1469,7 +1470,27 @@ impl<'a> DatadirModification<'a> { content: &[u8], ctx: &RequestContext, ) -> anyhow::Result<()> { - let policy = self.tline.get_switch_aux_file_policy(); + let switch_policy = self.tline.get_switch_aux_file_policy(); + + let policy = { + let current_policy = self.tline.last_aux_file_policy.load(); + // Allowed switch path: + // * no aux files -> 
v1/v2/cross-validation + // * cross-validation->v2 + if AuxFilePolicy::is_valid_migration_path(current_policy, switch_policy) { + self.tline.last_aux_file_policy.store(Some(switch_policy)); + self.tline + .remote_client + .schedule_index_upload_for_aux_file_policy_update(Some(switch_policy))?; + info!(current=?current_policy, next=?switch_policy, "switching aux file policy"); + switch_policy + } else { + // This branch handles non-valid migration path, and the case that switch_policy == current_policy. + // And actually, because the migration path always allow unspecified -> *, this unwrap_or will never be hit. + current_policy.unwrap_or(AuxFilePolicy::default_tenant_config()) + } + }; + if let AuxFilePolicy::V2 | AuxFilePolicy::CrossValidation = policy { let key = aux_file::encode_aux_file_key(path); // retrieve the key from the engine diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 54b63f7042..d42b9082b7 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -20,6 +20,7 @@ use futures::stream::FuturesUnordered; use futures::FutureExt; use futures::StreamExt; use pageserver_api::models; +use pageserver_api::models::AuxFilePolicy; use pageserver_api::models::TimelineState; use pageserver_api::models::TopTenantShardItem; use pageserver_api::models::WalRedoManagerStatus; @@ -529,6 +530,7 @@ impl Tenant { index_part: Option, metadata: TimelineMetadata, ancestor: Option>, + last_aux_file_policy: Option, _ctx: &RequestContext, ) -> anyhow::Result<()> { let tenant_id = self.tenant_shard_id; @@ -539,6 +541,10 @@ impl Tenant { ancestor.clone(), resources, CreateTimelineCause::Load, + // This could be derived from ancestor branch + index part. Though the only caller of `timeline_init_and_sync` is `load_remote_timeline`, + // there will potentially be other caller of this function in the future, and we don't know whether `index_part` or `ancestor` takes precedence. + // Therefore, we pass this field explicitly for now, and remove it once we fully migrate to aux file v2. + last_aux_file_policy, )?; let disk_consistent_lsn = timeline.get_disk_consistent_lsn(); anyhow::ensure!( @@ -553,6 +559,10 @@ impl Tenant { if let Some(index_part) = index_part.as_ref() { timeline.remote_client.init_upload_queue(index_part)?; + + timeline + .last_aux_file_policy + .store(index_part.last_aux_file_policy()); } else { // No data on the remote storage, but we have local metadata file. We can end up // here with timeline_create being interrupted before finishing index part upload. 
@@ -1173,12 +1183,15 @@ impl Tenant { None }; + let last_aux_file_policy = index_part.last_aux_file_policy(); + self.timeline_init_and_sync( timeline_id, resources, Some(index_part), remote_metadata, ancestor, + last_aux_file_policy, ctx, ) .await @@ -1358,6 +1371,7 @@ impl Tenant { create_guard, initdb_lsn, None, + None, ) .await } @@ -2441,6 +2455,7 @@ impl Tenant { ancestor: Option>, resources: TimelineResources, cause: CreateTimelineCause, + last_aux_file_policy: Option, ) -> anyhow::Result> { let state = match cause { CreateTimelineCause::Load => { @@ -2469,6 +2484,7 @@ impl Tenant { resources, pg_version, state, + last_aux_file_policy, self.cancel.child_token(), ); @@ -3119,6 +3135,7 @@ impl Tenant { timeline_create_guard, start_lsn + 1, Some(Arc::clone(src_timeline)), + src_timeline.last_aux_file_policy.load(), ) .await?; @@ -3312,6 +3329,7 @@ impl Tenant { timeline_create_guard, pgdata_lsn, None, + None, ) .await?; @@ -3383,6 +3401,7 @@ impl Tenant { create_guard: TimelineCreateGuard<'a>, start_lsn: Lsn, ancestor: Option>, + last_aux_file_policy: Option, ) -> anyhow::Result { let tenant_shard_id = self.tenant_shard_id; @@ -3398,6 +3417,7 @@ impl Tenant { ancestor, resources, CreateTimelineCause::Load, + last_aux_file_policy, ) .context("Failed to create timeline data structure")?; @@ -5621,4 +5641,280 @@ mod tests { Ok(()) } + + #[tokio::test] + async fn test_branch_copies_dirty_aux_file_flag() { + let harness = TenantHarness::create("test_branch_copies_dirty_aux_file_flag").unwrap(); + + // the default aux file policy to switch is v1 if not set by the admins + assert_eq!( + harness.tenant_conf.switch_aux_file_policy, + AuxFilePolicy::V1 + ); + let (tenant, ctx) = harness.load().await; + + let mut lsn = Lsn(0x08); + + let tline: Arc = tenant + .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + // no aux file is written at this point, so the persistent flag should be unset + assert_eq!(tline.last_aux_file_policy.load(), None); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test1", b"first", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + // there is no tenant manager to pass the configuration through, so lets mimic it + tenant.set_new_location_config( + AttachedTenantConf::try_from(LocationConf::attached_single( + TenantConfOpt { + switch_aux_file_policy: Some(AuxFilePolicy::V2), + ..Default::default() + }, + tenant.generation, + &pageserver_api::models::ShardParameters::default(), + )) + .unwrap(), + ); + + assert_eq!( + tline.get_switch_aux_file_policy(), + AuxFilePolicy::V2, + "wanted state has been updated" + ); + assert_eq!( + tline.last_aux_file_policy.load(), + Some(AuxFilePolicy::V1), + "aux file is written with switch_aux_file_policy unset (which is v1), so we should keep v1" + ); + + // we can read everything from the storage + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!( + files.get("pg_logical/mappings/test1"), + Some(&bytes::Bytes::from_static(b"first")) + ); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test2", b"second", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + assert_eq!( + tline.last_aux_file_policy.load(), + Some(AuxFilePolicy::V1), + "keep v1 storage format when new files are written" + ); + + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!( + 
files.get("pg_logical/mappings/test2"), + Some(&bytes::Bytes::from_static(b"second")) + ); + + let child = tenant + .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx) + .await + .unwrap(); + + // child copies the last flag even if that is not on remote storage yet + assert_eq!(child.get_switch_aux_file_policy(), AuxFilePolicy::V2); + assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V1)); + + let files = child.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!(files.get("pg_logical/mappings/test1"), None); + assert_eq!(files.get("pg_logical/mappings/test2"), None); + + // even if we crash here without flushing parent timeline with it's new + // last_aux_file_policy we are safe, because child was never meant to access ancestor's + // files. the ancestor can even switch back to V1 because of a migration safely. + } + + #[tokio::test] + async fn aux_file_policy_switch() { + let mut harness = TenantHarness::create("aux_file_policy_switch").unwrap(); + harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::CrossValidation; // set to cross-validation mode + let (tenant, ctx) = harness.load().await; + + let mut lsn = Lsn(0x08); + + let tline: Arc = tenant + .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + assert_eq!( + tline.last_aux_file_policy.load(), + None, + "no aux file is written so it should be unset" + ); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test1", b"first", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + // there is no tenant manager to pass the configuration through, so lets mimic it + tenant.set_new_location_config( + AttachedTenantConf::try_from(LocationConf::attached_single( + TenantConfOpt { + switch_aux_file_policy: Some(AuxFilePolicy::V2), + ..Default::default() + }, + tenant.generation, + &pageserver_api::models::ShardParameters::default(), + )) + .unwrap(), + ); + + assert_eq!( + tline.get_switch_aux_file_policy(), + AuxFilePolicy::V2, + "wanted state has been updated" + ); + assert_eq!( + tline.last_aux_file_policy.load(), + Some(AuxFilePolicy::CrossValidation), + "dirty index_part.json reflected state is yet to be updated" + ); + + // we can still read the auxfile v1 before we ingest anything new + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!( + files.get("pg_logical/mappings/test1"), + Some(&bytes::Bytes::from_static(b"first")) + ); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test2", b"second", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + assert_eq!( + tline.last_aux_file_policy.load(), + Some(AuxFilePolicy::V2), + "ingesting a file should apply the wanted switch state when applicable" + ); + + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!( + files.get("pg_logical/mappings/test1"), + Some(&bytes::Bytes::from_static(b"first")), + "cross validation writes to both v1 and v2 so this should be available in v2" + ); + assert_eq!( + files.get("pg_logical/mappings/test2"), + Some(&bytes::Bytes::from_static(b"second")) + ); + + // mimic again by trying to flip it from V2 to V1 (not switched to while ingesting a file) + tenant.set_new_location_config( + AttachedTenantConf::try_from(LocationConf::attached_single( + TenantConfOpt { + switch_aux_file_policy: Some(AuxFilePolicy::V1), + ..Default::default() + }, + 
tenant.generation, + &pageserver_api::models::ShardParameters::default(), + )) + .unwrap(), + ); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test2", b"third", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + assert_eq!( + tline.get_switch_aux_file_policy(), + AuxFilePolicy::V1, + "wanted state has been updated again, even if invalid request" + ); + + assert_eq!( + tline.last_aux_file_policy.load(), + Some(AuxFilePolicy::V2), + "ingesting a file should apply the wanted switch state when applicable" + ); + + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!( + files.get("pg_logical/mappings/test1"), + Some(&bytes::Bytes::from_static(b"first")) + ); + assert_eq!( + files.get("pg_logical/mappings/test2"), + Some(&bytes::Bytes::from_static(b"third")) + ); + + // mimic again by trying to flip it from from V1 to V2 (not switched to while ingesting a file) + tenant.set_new_location_config( + AttachedTenantConf::try_from(LocationConf::attached_single( + TenantConfOpt { + switch_aux_file_policy: Some(AuxFilePolicy::V2), + ..Default::default() + }, + tenant.generation, + &pageserver_api::models::ShardParameters::default(), + )) + .unwrap(), + ); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test3", b"last", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + assert_eq!(tline.get_switch_aux_file_policy(), AuxFilePolicy::V2); + + assert_eq!(tline.last_aux_file_policy.load(), Some(AuxFilePolicy::V2)); + + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!( + files.get("pg_logical/mappings/test1"), + Some(&bytes::Bytes::from_static(b"first")) + ); + assert_eq!( + files.get("pg_logical/mappings/test2"), + Some(&bytes::Bytes::from_static(b"third")) + ); + assert_eq!( + files.get("pg_logical/mappings/test3"), + Some(&bytes::Bytes::from_static(b"last")) + ); + } } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index a743ce3c16..a695363cdc 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -373,6 +373,8 @@ pub struct TenantConf { /// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into /// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions. + /// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux + /// file is written. 
pub switch_aux_file_policy: AuxFilePolicy, } @@ -574,7 +576,7 @@ impl Default for TenantConf { lazy_slru_download: false, timeline_get_throttle: crate::tenant::throttle::Config::disabled(), image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD, - switch_aux_file_policy: AuxFilePolicy::V1, + switch_aux_file_policy: AuxFilePolicy::default_tenant_config(), } } } diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 3a1113cf01..d3adae6841 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -189,6 +189,7 @@ use camino::Utf8Path; use chrono::{NaiveDateTime, Utc}; pub(crate) use download::download_initdb_tar_zst; +use pageserver_api::models::AuxFilePolicy; use pageserver_api::shard::{ShardIndex, TenantShardId}; use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; @@ -611,6 +612,17 @@ impl RemoteTimelineClient { Ok(()) } + /// Launch an index-file upload operation in the background, with only aux_file_policy flag updated. + pub(crate) fn schedule_index_upload_for_aux_file_policy_update( + self: &Arc, + last_aux_file_policy: Option, + ) -> anyhow::Result<()> { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + upload_queue.last_aux_file_policy = last_aux_file_policy; + self.schedule_index_upload(upload_queue); + Ok(()) + } /// /// Launch an index-file upload operation in the background, if necessary. /// @@ -1851,6 +1863,7 @@ impl RemoteTimelineClient { dangling_files: HashMap::default(), shutting_down: false, shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)), + last_aux_file_policy: initialized.last_aux_file_policy, }; let upload_queue = std::mem::replace( diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index b114d6aa10..032dda7ff3 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -5,6 +5,7 @@ use std::collections::HashMap; use chrono::NaiveDateTime; +use pageserver_api::models::AuxFilePolicy; use serde::{Deserialize, Serialize}; use utils::id::TimelineId; @@ -88,6 +89,16 @@ pub struct IndexPart { #[serde(default)] pub(crate) lineage: Lineage, + + /// Describes the kind of aux files stored in the timeline. + /// + /// The value is modified during file ingestion when the latest wanted value communicated via tenant config is applied if it is acceptable. + /// A V1 setting after V2 files have been committed is not accepted. + /// + /// None means no aux files have been written to the storage before the point + /// when this flag is introduced. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub(crate) last_aux_file_policy: Option, } impl IndexPart { @@ -101,10 +112,11 @@ impl IndexPart { /// is always generated from the keys of `layer_metadata`) /// - 4: timeline_layers is fully removed. /// - 5: lineage was added - const LATEST_VERSION: usize = 5; + /// - 6: last_aux_file_policy is added. + const LATEST_VERSION: usize = 6; // Versions we may see when reading from a bucket. 
- pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5]; + pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6]; pub const FILE_NAME: &'static str = "index_part.json"; @@ -113,6 +125,7 @@ impl IndexPart { disk_consistent_lsn: Lsn, metadata: TimelineMetadata, lineage: Lineage, + last_aux_file_policy: Option, ) -> Self { let layer_metadata = layers_and_metadata .iter() @@ -126,6 +139,7 @@ impl IndexPart { metadata, deleted_at: None, lineage, + last_aux_file_policy, } } @@ -155,8 +169,13 @@ impl IndexPart { example_metadata.disk_consistent_lsn(), example_metadata, Default::default(), + Some(AuxFilePolicy::V1), ) } + + pub(crate) fn last_aux_file_policy(&self) -> Option { + self.last_aux_file_policy + } } impl From<&UploadQueueInitialized> for IndexPart { @@ -165,7 +184,13 @@ impl From<&UploadQueueInitialized> for IndexPart { let metadata = uq.latest_metadata.clone(); let lineage = uq.latest_lineage.clone(); - Self::new(&uq.latest_files, disk_consistent_lsn, metadata, lineage) + Self::new( + &uq.latest_files, + disk_consistent_lsn, + metadata, + lineage, + uq.last_aux_file_policy, + ) } } @@ -299,6 +324,7 @@ mod tests { metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: None, lineage: Lineage::default(), + last_aux_file_policy: None, }; let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); @@ -340,6 +366,7 @@ mod tests { metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), 
deleted_at: None, lineage: Lineage::default(), + last_aux_file_policy: None, }; let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); @@ -383,6 +410,7 @@ mod tests { deleted_at: Some(chrono::NaiveDateTime::parse_from_str( "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()), lineage: Lineage::default(), + last_aux_file_policy: None, }; let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); @@ -428,6 +456,7 @@ mod tests { .unwrap(), deleted_at: None, lineage: Lineage::default(), + last_aux_file_policy: None, }; let empty_layers_parsed = IndexPart::from_s3_bytes(empty_layers_json.as_bytes()).unwrap(); @@ -468,6 +497,7 @@ mod tests { metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), lineage: Lineage::default(), + last_aux_file_policy: None, }; let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); @@ -511,6 +541,57 @@ mod tests { reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()], original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))), }, + last_aux_file_policy: None, + }; + + let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + assert_eq!(part, expected); + } + + #[test] + fn v6_indexpart_is_parsed() { + let example = r#"{ + "version":6, + "layer_metadata":{ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } + }, + "disk_consistent_lsn":"0/16960E8", + 
"metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0], + "deleted_at": "2023-07-31T09:00:00.123", + "lineage":{ + "original_ancestor":["e2bfd8c633d713d279e6fcd2bcc15b6d","0/15A7618","2024-05-07T18:52:36.322426563"], + "reparenting_history":["e1bfd8c633d713d279e6fcd2bcc15b6d"] + }, + "last_aux_file_policy": "V2" + }"#; + + let expected = IndexPart { + version: 6, + layer_metadata: HashMap::from([ + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata { + file_size: 25600000, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }), + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata { + // serde_json should always parse this but this might be a double with jq for + // example. 
+ file_size: 9007199254741001, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }) + ]), + disk_consistent_lsn: "0/16960E8".parse::().unwrap(), + metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), + deleted_at: Some(chrono::NaiveDateTime::parse_from_str( + "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()), + lineage: Lineage { + reparenting_history_truncated: false, + reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()], + original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))), + }, + last_aux_file_policy: Some(AuxFilePolicy::V2), }; let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index df9bc9b35b..1fb1928079 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -23,7 +23,7 @@ use pageserver_api::{ }, keyspace::{KeySpaceAccum, SparseKeyPartitioning}, models::{ - AuxFilePolicy, CompactionAlgorithm, DownloadRemoteLayersTaskInfo, + AtomicAuxFilePolicy, AuxFilePolicy, CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, TimelineState, }, @@ -413,7 +413,11 @@ pub struct Timeline { /// Keep aux directory cache to avoid it's reconstruction on each update pub(crate) aux_files: tokio::sync::Mutex, + /// Size estimator for aux file v2 pub(crate) aux_file_size_estimator: AuxFileSizeEstimator, + + /// Indicate whether aux file v2 storage is enabled. 
+ pub(crate) last_aux_file_policy: AtomicAuxFilePolicy, } pub struct WalReceiverInfo { @@ -2133,6 +2137,7 @@ impl Timeline { resources: TimelineResources, pg_version: u32, state: TimelineState, + aux_file_policy: Option, cancel: CancellationToken, ) -> Arc { let disk_consistent_lsn = metadata.disk_consistent_lsn(); @@ -2257,6 +2262,8 @@ impl Timeline { }), aux_file_size_estimator: AuxFileSizeEstimator::new(aux_file_metrics), + + last_aux_file_policy: AtomicAuxFilePolicy::new(aux_file_policy), }; result.repartition_threshold = result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE; diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 901f5149b3..b5dfc86e77 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -280,6 +280,8 @@ impl DeleteTimelineFlow { // Important. We dont pass ancestor above because it can be missing. // Thus we need to skip the validation here. CreateTimelineCause::Delete, + // Aux file policy is not needed for deletion, assuming deletion does not read aux keyspace + None, ) .context("create_timeline_struct")?; diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index a2f761fa94..c0cc8f3124 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -8,6 +8,7 @@ use std::collections::{HashMap, VecDeque}; use std::fmt::Debug; use chrono::NaiveDateTime; +use pageserver_api::models::AuxFilePolicy; use std::sync::Arc; use tracing::info; use utils::lsn::AtomicLsn; @@ -60,6 +61,9 @@ pub(crate) struct UploadQueueInitialized { /// Part of the flattened "next" `index_part.json`. pub(crate) latest_lineage: Lineage, + /// The last aux file policy used on this timeline. + pub(crate) last_aux_file_policy: Option, + /// `disk_consistent_lsn` from the last metadata file that was successfully /// uploaded. `Lsn(0)` if nothing was uploaded yet. /// Unlike `latest_files` or `latest_metadata`, this value is never ahead. 
@@ -189,6 +193,7 @@ impl UploadQueue { dangling_files: HashMap::new(), shutting_down: false, shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)), + last_aux_file_policy: Default::default(), }; *self = UploadQueue::Initialized(state); @@ -239,6 +244,7 @@ impl UploadQueue { dangling_files: HashMap::new(), shutting_down: false, shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)), + last_aux_file_policy: index_part.last_aux_file_policy(), }; *self = UploadQueue::Initialized(state); diff --git a/test_runner/regress/test_aux_files.py b/test_runner/regress/test_aux_files.py new file mode 100644 index 0000000000..be9c41a867 --- /dev/null +++ b/test_runner/regress/test_aux_files.py @@ -0,0 +1,72 @@ +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + logical_replication_sync, +) + + +def test_aux_v2_config_switch(neon_env_builder: NeonEnvBuilder, vanilla_pg): + env = neon_env_builder.init_start() + endpoint = env.endpoints.create_start("main") + client = env.pageserver.http_client() + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + tenant_config = client.tenant_config(tenant_id).effective_config + tenant_config["switch_aux_file_policy"] = "V2" + client.set_tenant_config(tenant_id, tenant_config) + # aux file v2 is enabled on the write path, so for now, it should be unset (or null) + assert ( + client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)["last_aux_file_policy"] + is None + ) + + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + cur.execute("create table t(pk integer primary key, payload integer)") + cur.execute( + "CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int, text varchar(120));" + ) + cur.execute("create publication pub1 for table t, replication_example") + + # now start subscriber, aux files will be created at this point. TODO: find better ways of testing aux files (i.e., neon_test_utils) + # instead of going through the full logical replication process. 
+ vanilla_pg.start() + vanilla_pg.safe_psql("create table t(pk integer primary key, payload integer)") + vanilla_pg.safe_psql( + "CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int, text varchar(120), testcolumn1 int, testcolumn2 int, testcolumn3 int);" + ) + connstr = endpoint.connstr().replace("'", "''") + log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}") + vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") + + # Wait logical replication channel to be established + logical_replication_sync(vanilla_pg, endpoint) + vanilla_pg.stop() + endpoint.stop() + + with env.pageserver.http_client() as client: + # aux file v2 flag should be enabled at this point + assert client.timeline_detail(tenant_id, timeline_id)["last_aux_file_policy"] == "V2" + with env.pageserver.http_client() as client: + tenant_config = client.tenant_config(tenant_id).effective_config + tenant_config["switch_aux_file_policy"] = "V1" + client.set_tenant_config(tenant_id, tenant_config) + # the flag should still be enabled + assert ( + client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)[ + "last_aux_file_policy" + ] + == "V2" + ) + env.pageserver.restart() + with env.pageserver.http_client() as client: + # aux file v2 flag should be persisted + assert ( + client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)[ + "last_aux_file_policy" + ] + == "V2" + ) From e1a9669d05374ea27685a1cf527676fe01df7722 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Fri, 17 May 2024 16:04:02 -0400 Subject: [PATCH 0803/1571] feat(pagebench): add aux file bench (#7746) part of https://github.com/neondatabase/neon/issues/7462 ## Summary of changes This pull request adds two APIs to the pageserver management API: list_aux_files and ingest_aux_files. The aux file pagebench is intended to be used on an empty timeline because the data do not go through the safekeeper. LSNs are advanced by 8 for each ingestion, to avoid invariant checks inside the pageserver. For now, I only care about space amplification / read amplification, so the bench is designed in a very simple way: ingest 10000 files, and I will manually dump the layer map to analyze. 
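
For reference, the flow the bench drives through the new management API client methods boils down to the following (a simplified sketch with an assumed wrapper function, not the exact pagebench code):

```rust
// Simplified sketch: exercise the two new endpoints via the management API client
// methods added in this PR. The wrapper function and batch size are illustrative
// assumptions; pagebench itself ingests 100 batches of 100 files each.
use std::collections::HashMap;

use pageserver_api::shard::TenantShardId;
use utils::id::TimelineId;
use utils::lsn::Lsn;

async fn ingest_and_list(
    client: &pageserver_client::mgmt_api::Client,
    tenant_shard_id: TenantShardId,
    timeline_id: TimelineId,
) -> anyhow::Result<()> {
    // One call = one DatadirModification on the pageserver; the handler advances
    // the last-record LSN by 8 per ingestion to avoid invariant checks.
    let files: HashMap<String, String> = (0..100)
        .map(|id| {
            (
                format!("pg_logical/mappings/{:03}.{:03}", 0, id),
                format!("{:08}", id),
            )
        })
        .collect();
    client
        .ingest_aux_files(tenant_shard_id, timeline_id, files)
        .await?;

    // List everything back just below Lsn::MAX, as the bench does.
    let listed = client
        .list_aux_files(tenant_shard_id, timeline_id, Lsn(Lsn::MAX.0 - 1))
        .await?;
    println!("{} aux files found", listed.len());
    Ok(())
}
```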
--------- Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/models.rs | 10 +++ pageserver/client/src/mgmt_api.rs | 57 +++++++++++++ pageserver/pagebench/src/cmd/aux_files.rs | 98 +++++++++++++++++++++++ pageserver/pagebench/src/main.rs | 3 + pageserver/src/http/routes.rs | 75 +++++++++++++++++ 5 files changed, 243 insertions(+) create mode 100644 pageserver/pagebench/src/cmd/aux_files.rs diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 80ca696313..451ee1a13c 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -841,6 +841,16 @@ pub struct DownloadRemoteLayersTaskSpawnRequest { pub max_concurrent_downloads: NonZeroUsize, } +#[derive(Debug, Serialize, Deserialize)] +pub struct IngestAuxFilesRequest { + pub aux_files: HashMap, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct ListAuxFilesRequest { + pub lsn: Lsn, +} + #[derive(Debug, Serialize, Deserialize, Clone)] pub struct DownloadRemoteLayersTaskInfo { pub task_id: String, diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 5904713da9..69b86d9c46 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -1,8 +1,12 @@ +use std::collections::HashMap; + +use bytes::Bytes; use pageserver_api::{models::*, shard::TenantShardId}; use reqwest::{IntoUrl, Method, StatusCode}; use utils::{ http::error::HttpErrorBody, id::{TenantId, TimelineId}, + lsn::Lsn, }; pub mod util; @@ -561,4 +565,57 @@ impl Client { }), } } + + pub async fn ingest_aux_files( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + aux_files: HashMap, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}/ingest_aux_files", + self.mgmt_api_endpoint, tenant_shard_id, timeline_id + ); + let resp = self + .request_noerror(Method::POST, &uri, IngestAuxFilesRequest { aux_files }) + .await?; + match resp.status() { + StatusCode::OK => Ok(true), + status => Err(match resp.json::().await { + Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg), + Err(_) => { + Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri)) + } + }), + } + } + + pub async fn list_aux_files( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + lsn: Lsn, + ) -> Result> { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}/list_aux_files", + self.mgmt_api_endpoint, tenant_shard_id, timeline_id + ); + let resp = self + .request_noerror(Method::POST, &uri, ListAuxFilesRequest { lsn }) + .await?; + match resp.status() { + StatusCode::OK => { + let resp: HashMap = resp.json().await.map_err(|e| { + Error::ApiError(StatusCode::INTERNAL_SERVER_ERROR, format!("{e}")) + })?; + Ok(resp) + } + status => Err(match resp.json::().await { + Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg), + Err(_) => { + Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri)) + } + }), + } + } } diff --git a/pageserver/pagebench/src/cmd/aux_files.rs b/pageserver/pagebench/src/cmd/aux_files.rs new file mode 100644 index 0000000000..eb5b242a5f --- /dev/null +++ b/pageserver/pagebench/src/cmd/aux_files.rs @@ -0,0 +1,98 @@ +use pageserver_api::models::{AuxFilePolicy, TenantConfig, TenantConfigRequest}; +use pageserver_api::shard::TenantShardId; +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; + +use std::collections::HashMap; +use std::sync::Arc; + +/// Ingest aux files into the pageserver. 
+#[derive(clap::Parser)] +pub(crate) struct Args { + #[clap(long, default_value = "http://localhost:9898")] + mgmt_api_endpoint: String, + #[clap(long, default_value = "postgres://postgres@localhost:64000")] + page_service_connstring: String, + #[clap(long)] + pageserver_jwt: Option, + + targets: Option>, +} + +pub(crate) fn main(args: Args) -> anyhow::Result<()> { + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .unwrap(); + + let main_task = rt.spawn(main_impl(args)); + rt.block_on(main_task).unwrap() +} + +async fn main_impl(args: Args) -> anyhow::Result<()> { + let args: &'static Args = Box::leak(Box::new(args)); + + let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( + args.mgmt_api_endpoint.clone(), + args.pageserver_jwt.as_deref(), + )); + + // discover targets + let timelines: Vec = crate::util::cli::targets::discover( + &mgmt_api_client, + crate::util::cli::targets::Spec { + limit_to_first_n_targets: None, + targets: { + if let Some(targets) = &args.targets { + if targets.len() != 1 { + anyhow::bail!("must specify exactly one target"); + } + Some(targets.clone()) + } else { + None + } + }, + }, + ) + .await?; + + let timeline = timelines[0]; + let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id); + let timeline_id = timeline.timeline_id; + + println!("operating on timeline {}", timeline); + + mgmt_api_client + .tenant_config(&TenantConfigRequest { + tenant_id: timeline.tenant_id, + config: TenantConfig { + switch_aux_file_policy: Some(AuxFilePolicy::V2), + ..Default::default() + }, + }) + .await?; + + for batch in 0..100 { + let items = (0..100) + .map(|id| { + ( + format!("pg_logical/mappings/{:03}.{:03}", batch, id), + format!("{:08}", id), + ) + }) + .collect::>(); + let file_cnt = items.len(); + mgmt_api_client + .ingest_aux_files(tenant_shard_id, timeline_id, items) + .await?; + println!("ingested {file_cnt} files"); + } + + let files = mgmt_api_client + .list_aux_files(tenant_shard_id, timeline_id, Lsn(Lsn::MAX.0 - 1)) + .await?; + + println!("{} files found", files.len()); + + anyhow::Ok(()) +} diff --git a/pageserver/pagebench/src/main.rs b/pageserver/pagebench/src/main.rs index 743102d853..5527557450 100644 --- a/pageserver/pagebench/src/main.rs +++ b/pageserver/pagebench/src/main.rs @@ -14,6 +14,7 @@ mod util { /// The pagebench CLI sub-commands, dispatched in [`main`] below. 
mod cmd { + pub(super) mod aux_files; pub(super) mod basebackup; pub(super) mod getpage_latest_lsn; pub(super) mod ondemand_download_churn; @@ -27,6 +28,7 @@ enum Args { GetPageLatestLsn(cmd::getpage_latest_lsn::Args), TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args), OndemandDownloadChurn(cmd::ondemand_download_churn::Args), + AuxFiles(cmd::aux_files::Args), } fn main() { @@ -46,6 +48,7 @@ fn main() { cmd::trigger_initial_size_calculation::main(args) } Args::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args), + Args::AuxFiles(args) => cmd::aux_files::main(args), } .unwrap() } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 7efd48afc7..0eab6510ca 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -16,6 +16,8 @@ use hyper::header; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use metrics::launch_timestamp::LaunchTimestamp; +use pageserver_api::models::IngestAuxFilesRequest; +use pageserver_api::models::ListAuxFilesRequest; use pageserver_api::models::LocationConfig; use pageserver_api::models::LocationConfigListResponse; use pageserver_api::models::ShardParameters; @@ -2331,6 +2333,71 @@ async fn get_utilization( .map_err(ApiError::InternalServerError) } +async fn list_aux_files( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let body: ListAuxFilesRequest = json_request(&mut request).await?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let state = get_state(&request); + + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + + let process = || async move { + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + let files = timeline.list_aux_files(body.lsn, &ctx).await?; + Ok::<_, anyhow::Error>(files) + }; + + match process().await { + Ok(st) => json_response(StatusCode::OK, st), + Err(err) => json_response( + StatusCode::INTERNAL_SERVER_ERROR, + ApiError::InternalServerError(err).to_string(), + ), + } +} + +async fn ingest_aux_files( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let body: IngestAuxFilesRequest = json_request(&mut request).await?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let state = get_state(&request); + + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + + let process = || async move { + let mut modification = timeline.begin_modification(Lsn( + timeline.get_last_record_lsn().0 + 8 + ) /* advance LSN by 8 */); + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + for (fname, content) in body.aux_files { + modification + .put_file(&fname, content.as_bytes(), &ctx) + .await?; + } + modification.commit(&ctx).await?; + Ok::<_, anyhow::Error>(()) + }; + + match process().await { + Ok(st) => json_response(StatusCode::OK, st), + Err(err) => Err(ApiError::InternalServerError(err)), + } +} + /// Report on the largest tenants on this pageserver, for the storage controller to identify /// candidates for splitting async 
fn post_top_tenants( @@ -2708,6 +2775,14 @@ pub fn make_router( ) .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler)) .get("/v1/utilization", |r| api_handler(r, get_utilization)) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/ingest_aux_files", + |r| testing_api_handler("ingest_aux_files", r, ingest_aux_files), + ) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/list_aux_files", + |r| testing_api_handler("list_aux_files", r, list_aux_files), + ) .post("/v1/top_tenants", |r| api_handler(r, post_top_tenants)) .any(handler_404)) } From 5caee4ca54ea16905ccc3e7f60b3221f33a74b91 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sun, 19 May 2024 20:49:49 +0300 Subject: [PATCH 0804/1571] Fix calculation in test The comment says that this checks if there's enough space on the page for logical message *and* an XLOG_SWITCH. So the sizes of the logical message and the XLOG_SWITCH record should be added together, not subtracted. I saw a panic in the test that led me to investigate and notice this (https://neon-github-public-dev.s3.amazonaws.com/reports/pr-7803/9142396223/index.html): RuntimeError: Run ['/tmp/neon/bin/wal_craft', 'in-existing', 'last_wal_record_xlog_switch_ends_on_page_boundary', "host=localhost port=16165 user=cloud_admin dbname=postgres options='-cstatement_timeout=120s '"] failed: stdout: stderr: thread 'main' panicked at libs/postgres_ffi/wal_craft/src/lib.rs:370:27: attempt to subtract with overflow stack backtrace: 0: rust_begin_unwind at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6/library/std/src/panicking.rs:645:5 1: core::panicking::panic_fmt at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6/library/core/src/panicking.rs:72:14 2: core::panicking::panic at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6/library/core/src/panicking.rs:145:5 3: ::craft:: at libs/postgres_ffi/wal_craft/src/lib.rs:370:27 4: wal_craft::main::{closure#0} at libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs:21:17 5: wal_craft::main at libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs:66:47 6: core::result::Result<(), anyhow::Error> as core::ops::function::FnOnce<()>>::call_once at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6/library/core/src/ops/function.rs:250:5 note: Some details are omitted, run with `RUST_BACKTRACE=full` for a verbose backtrace. --- libs/postgres_ffi/wal_craft/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index 262068cbda..b6769629a8 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -359,7 +359,7 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary { // Is there enough space on the page for another logical message and an // XLOG_SWITCH? If not, start over. let page_remain = XLOG_BLCKSZ as u64 - u64::from(after_lsn) % XLOG_BLCKSZ as u64; - if page_remain < base_size - XLOG_SIZE_OF_XLOG_RECORD as u64 { + if page_remain < base_size + XLOG_SIZE_OF_XLOG_RECORD as u64 { continue; } From a5ecca976ec3abf97c8c3db4f78c230b4553b509 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Sun, 19 May 2024 20:45:53 +0100 Subject: [PATCH 0805/1571] proxy: bump parquet (#7782) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary of changes Updates the parquet lib. 
one change left that we need is in an open PR against upstream, hopefully we can remove the git dependency by 52.0.0 https://github.com/apache/arrow-rs/pull/5773 I'm not sure why the parquet files got a little bit bigger. I tested them and they still open fine. 🤷 side effect of the update, chrono updated and added yet another deprecation warning (hence why the safekeepers change) --- Cargo.lock | 29 ++++++++++---- Cargo.toml | 8 ++-- proxy/src/context/parquet.rs | 73 +++++++++++++++++++----------------- safekeeper/src/broker.rs | 2 +- workspace_hack/Cargo.toml | 4 +- 5 files changed, 67 insertions(+), 49 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e1edd53fea..e6060c82f5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1072,9 +1072,9 @@ dependencies = [ [[package]] name = "chrono" -version = "0.4.31" +version = "0.4.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38" +checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401" dependencies = [ "android-tzdata", "iana-time-zone", @@ -1082,7 +1082,7 @@ dependencies = [ "num-traits", "serde", "wasm-bindgen", - "windows-targets 0.48.0", + "windows-targets 0.52.4", ] [[package]] @@ -1109,7 +1109,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "defaa24ecc093c77630e6c15e17c51f5e187bf35ee514f4e2d67baaa96dae22b" dependencies = [ "ciborium-io", - "half", + "half 1.8.2", ] [[package]] @@ -2278,6 +2278,17 @@ version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" +[[package]] +name = "half" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" +dependencies = [ + "cfg-if", + "crunchy", + "num-traits", +] + [[package]] name = "hash32" version = "0.3.1" @@ -3902,12 +3913,13 @@ dependencies = [ [[package]] name = "parquet" -version = "49.0.0" -source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9" +version = "51.0.0" +source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829" dependencies = [ "ahash", "bytes", "chrono", + "half 2.4.1", "hashbrown 0.14.5", "num", "num-bigint", @@ -3916,12 +3928,13 @@ dependencies = [ "thrift", "twox-hash", "zstd", + "zstd-sys", ] [[package]] name = "parquet_derive" -version = "49.0.0" -source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9" +version = "51.0.0" +source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829" dependencies = [ "parquet", "proc-macro2", diff --git a/Cargo.toml b/Cargo.toml index b59a5dcd6d..2a7dea447e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -122,8 +122,8 @@ opentelemetry = "0.20.0" opentelemetry-otlp = { version = "0.13.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } opentelemetry-semantic-conventions = "0.12.0" parking_lot = "0.12" -parquet = { version = "49.0.0", default-features = false, features = ["zstd"] } -parquet_derive = "49.0.0" +parquet = { version = "51.0.0", default-features = false, features = ["zstd"] } +parquet_derive = "51.0.0" pbkdf2 = { version = "0.12.1", features = ["simple", "std"] } pin-project-lite = "0.2" procfs = "0.14" @@ -244,8 +244,8 @@ tonic-build 
= "0.9" tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } # bug fixes for UUID -parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" } -parquet_derive = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" } +parquet = { git = "https://github.com/apache/arrow-rs", branch = "master" } +parquet_derive = { git = "https://github.com/apache/arrow-rs", branch = "master" } ################# Binary contents sections diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 8104fe6087..392821c430 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -307,7 +307,7 @@ where } async fn upload_parquet( - w: SerializedFileWriter>, + mut w: SerializedFileWriter>, len: i64, storage: &GenericRemoteStorage, ) -> anyhow::Result> { @@ -319,11 +319,15 @@ async fn upload_parquet( // I don't know how compute intensive this is, although it probably isn't much... better be safe than sorry. // finish method only available on the fork: https://github.com/apache/arrow-rs/issues/5253 - let (writer, metadata) = tokio::task::spawn_blocking(move || w.finish()) + let (mut buffer, metadata) = + tokio::task::spawn_blocking(move || -> parquet::errors::Result<_> { + let metadata = w.finish()?; + let buffer = std::mem::take(w.inner_mut().get_mut()); + Ok((buffer, metadata)) + }) .await .unwrap()?; - let mut buffer = writer.into_inner(); let data = buffer.split().freeze(); let compression = len as f64 / len_uncompressed as f64; @@ -474,10 +478,11 @@ mod tests { RequestData { session_id: uuid::Builder::from_random_bytes(rng.gen()).into_uuid(), peer_addr: Ipv4Addr::from(rng.gen::<[u8; 4]>()).to_string(), - timestamp: chrono::NaiveDateTime::from_timestamp_millis( + timestamp: chrono::DateTime::from_timestamp_millis( rng.gen_range(1703862754..1803862754), ) - .unwrap(), + .unwrap() + .naive_utc(), application_name: Some("test".to_owned()), username: Some(hex::encode(rng.gen::<[u8; 4]>())), endpoint_id: Some(hex::encode(rng.gen::<[u8; 16]>())), @@ -560,15 +565,15 @@ mod tests { assert_eq!( file_stats, [ - (1315008, 3, 6000), - (1315001, 3, 6000), - (1315061, 3, 6000), - (1315018, 3, 6000), - (1315148, 3, 6000), - (1314990, 3, 6000), - (1314782, 3, 6000), - (1315018, 3, 6000), - (438575, 1, 2000) + (1315314, 3, 6000), + (1315307, 3, 6000), + (1315367, 3, 6000), + (1315324, 3, 6000), + (1315454, 3, 6000), + (1315296, 3, 6000), + (1315088, 3, 6000), + (1315324, 3, 6000), + (438713, 1, 2000) ] ); @@ -598,11 +603,11 @@ mod tests { assert_eq!( file_stats, [ - (1221738, 5, 10000), - (1227888, 5, 10000), - (1229682, 5, 10000), - (1229044, 5, 10000), - (1220322, 5, 10000) + (1222212, 5, 10000), + (1228362, 5, 10000), + (1230156, 5, 10000), + (1229518, 5, 10000), + (1220796, 5, 10000) ] ); @@ -634,11 +639,11 @@ mod tests { assert_eq!( file_stats, [ - (1207385, 5, 10000), - (1207116, 5, 10000), - (1207409, 5, 10000), - (1207397, 5, 10000), - (1207652, 5, 10000) + (1207859, 5, 10000), + (1207590, 5, 10000), + (1207883, 5, 10000), + (1207871, 5, 10000), + (1208126, 5, 10000) ] ); @@ -663,15 +668,15 @@ mod tests { assert_eq!( file_stats, [ - (1315008, 3, 6000), - (1315001, 3, 6000), - (1315061, 3, 6000), - (1315018, 3, 6000), - (1315148, 3, 6000), - (1314990, 3, 6000), - (1314782, 3, 6000), - (1315018, 3, 6000), - (438575, 1, 2000) + (1315314, 3, 6000), + (1315307, 3, 6000), + (1315367, 3, 6000), + (1315324, 3, 6000), + (1315454, 3, 6000), + (1315296, 3, 6000), + (1315088, 3, 6000), + (1315324, 3, 
6000), + (438713, 1, 2000) ] ); @@ -708,7 +713,7 @@ mod tests { // files are smaller than the size threshold, but they took too long to fill so were flushed early assert_eq!( file_stats, - [(659240, 2, 3001), (658954, 2, 3000), (658750, 2, 2999)] + [(659462, 2, 3001), (659176, 2, 3000), (658972, 2, 2999)] ); tmpdir.close().unwrap(); diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index 98f58d3e49..ea16ce450f 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -319,7 +319,7 @@ async fn task_stats(stats: Arc) { let now = BrokerStats::now_millis(); if now > last_pulled && now - last_pulled > warn_duration.as_millis() as u64 { - let ts = chrono::NaiveDateTime::from_timestamp_millis(last_pulled as i64).expect("invalid timestamp"); + let ts = chrono::DateTime::from_timestamp_millis(last_pulled as i64).expect("invalid timestamp"); info!("no broker updates for some time, last update: {:?}", ts); } } diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index b605757f64..7582562450 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -51,7 +51,7 @@ num-bigint = { version = "0.4" } num-integer = { version = "0.1", features = ["i128"] } num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } -parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs", default-features = false, features = ["zstd"] } +parquet = { git = "https://github.com/apache/arrow-rs", branch = "master", default-features = false, features = ["zstd"] } prost = { version = "0.11" } rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } @@ -102,7 +102,7 @@ num-bigint = { version = "0.4" } num-integer = { version = "0.1", features = ["i128"] } num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } -parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs", default-features = false, features = ["zstd"] } +parquet = { git = "https://github.com/apache/arrow-rs", branch = "master", default-features = false, features = ["zstd"] } prost = { version = "0.11" } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } From 291fcb9e4f7086237c409580b1fd3accf46d5ba3 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 20 May 2024 09:25:25 +0100 Subject: [PATCH 0806/1571] pageserver: use the heatmap upload interval to set the secondary download interval (#7793) ## Problem The heatmap upload period is configurable, but secondary mode downloads were using a fixed download period. Closes: #6200 ## Summary of changes - Use the upload period in the heatmap to adjust the download period. In practice, this will reduce the frequency of downloads from its current 60 second period to what heatmaps use, which is 5-10m depending on environment. This is an improvement rather than being optimal: we could be smarter about periods, and schedule downloads to occur around the time we expect the next upload, rather than just using the same period, but that's something we can address in future if it comes up. 
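
Conceptually, the period selection reduces to a small helper like this (a simplified standalone sketch with assumed names; in the actual change the period is carried in a `DownloadSummary` recorded after each successful heatmap download):

```rust
// Sketch only: prefer the upload period advertised in the downloaded heatmap,
// fall back to the old fixed default when the uploader did not populate it.
use std::time::{Duration, Instant};

const DEFAULT_DOWNLOAD_INTERVAL: Duration = Duration::from_millis(60_000);

fn download_period(heatmap_upload_period_ms: Option<u64>) -> Duration {
    heatmap_upload_period_ms
        .map(Duration::from_millis)
        .unwrap_or(DEFAULT_DOWNLOAD_INTERVAL)
}

fn next_download_at(last_download_at: Instant, heatmap_upload_period_ms: Option<u64>) -> Instant {
    // Poll on roughly the same cadence as uploads; smarter scheduling around the
    // expected next upload time is left for later, as noted above.
    last_download_at + download_period(heatmap_upload_period_ms)
}
```

In other words, a secondary that has never downloaded a heatmap keeps the old 60-second default; after its first successful download it adopts the uploader's cadence (5-10 minutes depending on environment).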
--- pageserver/src/tenant/secondary/downloader.rs | 84 ++++++++++--------- .../regress/test_pageserver_secondary.py | 18 ++-- 2 files changed, 58 insertions(+), 44 deletions(-) diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 8f27220771..609e1431cf 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -62,14 +62,10 @@ use super::{ CommandRequest, DownloadCommand, }; -/// For each tenant, how long must have passed since the last download_tenant call before -/// calling it again. This is approximately the time by which local data is allowed -/// to fall behind remote data. -/// -/// TODO: this should just be a default, and the actual period should be controlled -/// via the heatmap itself -/// `` -const DOWNLOAD_FRESHEN_INTERVAL: Duration = Duration::from_millis(60000); +/// For each tenant, default period for how long must have passed since the last download_tenant call before +/// calling it again. This default is replaced with the value of [`HeatMapTenant::upload_period_ms`] after first +/// download, if the uploader populated it. +const DEFAULT_DOWNLOAD_INTERVAL: Duration = Duration::from_millis(60000); /// Range of concurrency we may use when downloading layers within a timeline. This is independent /// for each tenant we're downloading: the concurrency of _tenants_ is defined separately in @@ -152,14 +148,22 @@ pub(super) struct SecondaryDetailTimeline { pub(super) evicted_at: HashMap, } +// Aspects of a heatmap that we remember after downloading it +#[derive(Clone, Debug)] +struct DownloadSummary { + etag: Etag, + #[allow(unused)] + mtime: SystemTime, + upload_period: Duration, +} + /// This state is written by the secondary downloader, it is opaque /// to TenantManager #[derive(Debug)] pub(super) struct SecondaryDetail { pub(super) config: SecondaryLocationConfig, - last_download: Option, - last_etag: Option, + last_download: Option, next_download: Option, pub(super) timelines: HashMap, } @@ -189,7 +193,6 @@ impl SecondaryDetail { Self { config, last_download: None, - last_etag: None, next_download: None, timelines: HashMap::new(), } @@ -243,9 +246,8 @@ impl SecondaryDetail { struct PendingDownload { secondary_state: Arc, - last_download: Option, + last_download: Option, target_time: Option, - period: Option, } impl scheduler::PendingJob for PendingDownload { @@ -295,10 +297,17 @@ impl JobGenerator SchedulingResult { @@ -331,11 +340,11 @@ impl JobGenerator next_download { @@ -343,7 +352,6 @@ impl JobGenerator TenantDownloader<'a> { let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); // We will use the etag from last successful download to make the download conditional on changes - let last_etag = self + let last_download = self .secondary_state .detail .lock() .unwrap() - .last_etag + .last_download .clone(); // Download the tenant's heatmap @@ -539,7 +540,7 @@ impl<'a> TenantDownloader<'a> { etag: heatmap_etag, bytes: heatmap_bytes, } = match tokio::select!( - bytes = self.download_heatmap(last_etag.as_ref()) => {bytes?}, + bytes = self.download_heatmap(last_download.as_ref().map(|d| &d.etag)) => {bytes?}, _ = self.secondary_state.cancel.cancelled() => return Ok(()) ) { HeatMapDownload::Unmodified => { @@ -599,7 +600,14 @@ impl<'a> TenantDownloader<'a> { // Only update last_etag after a full successful download: this way will not skip // the next download, even if the heatmap's actual etag is unchanged. 
- self.secondary_state.detail.lock().unwrap().last_etag = Some(heatmap_etag); + self.secondary_state.detail.lock().unwrap().last_download = Some(DownloadSummary { + etag: heatmap_etag, + mtime: heatmap_mtime, + upload_period: heatmap + .upload_period_ms + .map(|ms| Duration::from_millis(ms as u64)) + .unwrap_or(DEFAULT_DOWNLOAD_INTERVAL), + }); Ok(()) } diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index fdc09a063d..127340a1e7 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -575,7 +575,10 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): tenant_timelines = {} # This mirrors a constant in `downloader.rs` - freshen_interval_secs = 60 + default_download_period_secs = 60 + + # The upload period, which will also be the download once the secondary has seen its first heatmap + upload_period_secs = 20 for _i in range(0, tenant_count): tenant_id = TenantId.generate() @@ -587,7 +590,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): placement_policy='{"Attached":1}', # Run with a low heatmap period so that we can avoid having to do synthetic API calls # to trigger the upload promptly. - conf={"heatmap_period": "1s"}, + conf={"heatmap_period": f"{upload_period_secs}s"}, ) env.neon_cli.create_timeline("main2", tenant_id, timeline_b) @@ -597,7 +600,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): # Wait long enough that the background downloads should happen; we expect all the inital layers # of all the initial timelines to show up on the secondary location of each tenant. - time.sleep(freshen_interval_secs * 1.5) + time.sleep(default_download_period_secs * 1.5) for tenant_id, timelines in tenant_timelines.items(): attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"] @@ -613,8 +616,8 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): # Delete the second timeline: this should be reflected later on the secondary env.storage_controller.pageserver_api().timeline_delete(tenant_id, timelines[1]) - # Wait long enough for the secondary locations to see the deletion - time.sleep(freshen_interval_secs * 1.5) + # Wait long enough for the secondary locations to see the deletion: 2x period plus a grace factor + time.sleep(upload_period_secs * 2.5) for tenant_id, timelines in tenant_timelines.items(): attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"] @@ -626,6 +629,9 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): assert ps_secondary.list_layers(tenant_id, timelines[0]) # This one was deleted + log.info( + f"Checking for secondary timeline deletion {tenant_id}/{timeline_id} on node {ps_secondary.id}" + ) assert not ps_secondary.list_layers(tenant_id, timelines[1]) t_end = time.time() @@ -640,7 +646,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): download_rate = (total_heatmap_downloads / tenant_count) / (t_end - t_start) - expect_download_rate = 1.0 / freshen_interval_secs + expect_download_rate = 1.0 / upload_period_secs log.info(f"Download rate: {download_rate * 60}/min vs expected {expect_download_rate * 60}/min") assert download_rate < expect_download_rate * 2 From a7b84cca5aef3dacc90895f39f0e0ad2b6c950cc Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Mon, 20 May 2024 12:07:25 +0200 Subject: [PATCH 0807/1571] Upgrade of pgvector to 0.7.0 (#7726) Upgrade 
pgvector to 0.7.0. This PR is based on Heikki's PR #6753 and just uses pgvector 0.7.0 instead of 0.6.0 I have now done all planned manual tests. The pull request is ready to be reviewed and merged and can be deployed in production together / after swap enablement. See (https://github.com/neondatabase/autoscaling/issues/800) Fixes https://github.com/neondatabase/neon/issues/6516 Fixes https://github.com/neondatabase/neon/issues/7780 ## Documentation input for usage recommendations ### maintenance_work_mem In Neon `maintenance_work_mem` is very small by default (depends on configured RAM for your compute but can be as low as 64 MB). To optimize pgvector index build time you may have to bump it up according to your working set size (size of tuples for vector index creation). You can do so in the current session using `SET maintenance_work_mem='10 GB';` The target value you choose should fit into the memory of your compute size and not exceed 50-60% of available RAM. The value above has been successfully used on a 7CU endpoint. ### max_parallel_maintenance_workers max_parallel_maintenance_workers is also small by default (2). For efficient parallel pgvector index creation you have to bump it up with `SET max_parallel_maintenance_workers = 7` to make use of all the CPUs available, assuming you have configured your endpoint to use 7CU. ## ID input for changelog pgvector extension in Neon has been upgraded from version 0.5.1 to version 0.7.0. Please see https://github.com/pgvector/pgvector/ for documentation of new capabilities in pgvector version 0.7.0 If you have existing databases with pgvector 0.5.1 already installed there is a slight difference in behavior in the following corner cases even if you don't run `ALTER EXTENSION UPDATE`: ### L2 distance from NULL::vector For the following script, comparing the NULL::vector to non-null vectors the resulting output changes: ```sql SET enable_seqscan = off; CREATE TABLE t (val vector(3)); INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); CREATE INDEX ON t USING hnsw (val vector_l2_ops); INSERT INTO t (val) VALUES ('[1,2,4]'); SELECT * FROM t ORDER BY val <-> (SELECT NULL::vector); ``` and now the output is ``` val --------- [1,1,1] [1,2,4] [1,2,3] [0,0,0] (4 rows) ``` For the following script ```sql SET enable_seqscan = off; CREATE TABLE t (val vector(3)); INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); CREATE INDEX ON t USING ivfflat (val vector_l2_ops) WITH (lists = 1); INSERT INTO t (val) VALUES ('[1,2,4]'); SELECT * FROM t ORDER BY val <-> (SELECT NULL::vector); ``` the output now is ``` val --------- [0,0,0] [1,2,3] [1,1,1] [1,2,4] (4 rows) ``` ### changed error messages If you provide invalid literals for datatype vector you may get improved/changed error messages, for example: ```sql neondb=> SELECT '[4e38,1]'::vector; ERROR: "4e38" is out of range for type vector LINE 1: SELECT '[4e38,1]'::vector; ^ ``` --------- Co-authored-by: Heikki Linnakangas --- .dockerignore | 1 + Dockerfile.compute-node | 7 +++- patches/pgvector.patch | 78 ++++++++++++++++++++++++++++++++++++++ pgxn/neon/pagestore_smgr.c | 19 +++++++++- 4 files changed, 101 insertions(+), 4 deletions(-) create mode 100644 patches/pgvector.patch diff --git a/.dockerignore b/.dockerignore index f7a6232ba1..1258532db8 100644 --- a/.dockerignore +++ b/.dockerignore @@ -17,6 +17,7 @@ !libs/ !neon_local/ !pageserver/ +!patches/ !pgxn/ !proxy/ !s3_scrubber/ diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 
bd4534ce1d..5bf3246f34 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -241,9 +241,12 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz - FROM build-deps AS vector-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.1.tar.gz -O pgvector.tar.gz && \ - echo "cc7a8e034a96e30a819911ac79d32f6bc47bdd1aa2de4d7d4904e26b83209dc8 pgvector.tar.gz" | sha256sum --check && \ +COPY patches/pgvector.patch /pgvector.patch + +RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.0.tar.gz -O pgvector.tar.gz && \ + echo "1b5503a35c265408b6eb282621c5e1e75f7801afc04eecb950796cfee2e3d1d8 pgvector.tar.gz" | sha256sum --check && \ mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \ + patch -p1 < /pgvector.patch && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control diff --git a/patches/pgvector.patch b/patches/pgvector.patch new file mode 100644 index 0000000000..84ac6644c5 --- /dev/null +++ b/patches/pgvector.patch @@ -0,0 +1,78 @@ +From 0b0194a57bd0f3598bd57dbedd0df3932330169d Mon Sep 17 00:00:00 2001 +From: Heikki Linnakangas +Date: Fri, 2 Feb 2024 22:26:45 +0200 +Subject: [PATCH 1/1] Make v0.6.0 work with Neon + +Now that the WAL-logging happens as a separate step at the end of the +build, we need a few neon-specific hints to make it work. +--- + src/hnswbuild.c | 36 ++++++++++++++++++++++++++++++++++++ + 1 file changed, 36 insertions(+) + +diff --git a/src/hnswbuild.c b/src/hnswbuild.c +index 680789b..ec54dea 100644 +--- a/src/hnswbuild.c ++++ b/src/hnswbuild.c +@@ -840,9 +840,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc) + + hnswarea = shm_toc_lookup(toc, PARALLEL_KEY_HNSW_AREA, false); + ++#ifdef NEON_SMGR ++ smgr_start_unlogged_build(RelationGetSmgr(indexRel)); ++#endif ++ + /* Perform inserts */ + HnswParallelScanAndInsert(heapRel, indexRel, hnswshared, hnswarea, false); + ++#ifdef NEON_SMGR ++ smgr_finish_unlogged_build_phase_1(RelationGetSmgr(indexRel)); ++#endif ++ + /* Close relations within worker */ + index_close(indexRel, indexLockmode); + table_close(heapRel, heapLockmode); +@@ -1089,13 +1097,41 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, + SeedRandom(42); + #endif + ++#ifdef NEON_SMGR ++ smgr_start_unlogged_build(RelationGetSmgr(index)); ++#endif ++ + InitBuildState(buildstate, heap, index, indexInfo, forkNum); + + BuildGraph(buildstate, forkNum); + ++#ifdef NEON_SMGR ++ smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index)); ++#endif ++ + if (RelationNeedsWAL(index)) ++ { + log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocks(index), true); + ++#ifdef NEON_SMGR ++ { ++#if PG_VERSION_NUM >= 160000 ++ RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator; ++#else ++ RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node; ++#endif ++ ++ SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator, ++ MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index)); ++ SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM); ++ } ++#endif ++ } ++ ++#ifdef NEON_SMGR ++ smgr_end_unlogged_build(RelationGetSmgr(index)); ++#endif ++ + FreeBuildState(buildstate); + } + +-- +2.39.2 + diff --git a/pgxn/neon/pagestore_smgr.c 
b/pgxn/neon/pagestore_smgr.c index e3b841f526..249ad313b0 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -45,6 +45,7 @@ */ #include "postgres.h" +#include "access/parallel.h" #include "access/xact.h" #include "access/xlog.h" #include "access/xlogdefs.h" @@ -2822,10 +2823,14 @@ neon_start_unlogged_build(SMgrRelation reln) reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED; /* + * Create the local file. In a parallel build, the leader is expected to + * call this first and do it. + * * FIXME: should we pass isRedo true to create the tablespace dir if it * doesn't exist? Is it needed? */ - mdcreate(reln, MAIN_FORKNUM, false); + if (!IsParallelWorker()) + mdcreate(reln, MAIN_FORKNUM, false); } /* @@ -2849,7 +2854,17 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln) Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1); Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED); - unlogged_build_phase = UNLOGGED_BUILD_PHASE_2; + /* + * In a parallel build, (only) the leader process performs the 2nd + * phase. + */ + if (IsParallelWorker()) + { + unlogged_build_rel = NULL; + unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; + } + else + unlogged_build_phase = UNLOGGED_BUILD_PHASE_2; } /* From e3f51abadf835784471034c583605c6b41154f2c Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 13 May 2024 13:18:18 +0300 Subject: [PATCH 0808/1571] safekeeper: close connection when COPY stream ends. We can't gracefully exit COPY mode (and don't need that), so close connection to prevent further attempts to use it. --- libs/postgres_backend/src/lib.rs | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 260018ad89..6c41b7f347 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -820,10 +820,11 @@ impl PostgresBackend { Ok(ProcessMsgResult::Continue) } - /// Log as info/error result of handling COPY stream and send back - /// ErrorResponse if that makes sense. Shutdown the stream if we got - /// Terminate. TODO: transition into waiting for Sync msg if we initiate the - /// close. + /// - Log as info/error result of handling COPY stream and send back + /// ErrorResponse if that makes sense. + /// - Shutdown the stream if we got Terminate. + /// - Then close the connection because we don't handle exiting from COPY + /// stream normally. pub async fn handle_copy_stream_end(&mut self, end: CopyStreamHandlerEnd) { use CopyStreamHandlerEnd::*; @@ -849,10 +850,6 @@ impl PostgresBackend { } } - if let Terminate = &end { - self.state = ProtoState::Closed; - } - let err_to_send_and_errcode = match &end { ServerInitiated(_) => Some((end.to_string(), SQLSTATE_SUCCESSFUL_COMPLETION)), Other(_) => Some((format!("{end:#}"), SQLSTATE_INTERNAL_ERROR)), @@ -882,6 +879,12 @@ impl PostgresBackend { error!("failed to send ErrorResponse: {}", ee); } } + + // Proper COPY stream finishing to continue using the connection is not + // implemented at the server side (we don't need it so far). To prevent + // further usages of the connection, close it. + self.framed.shutdown().await.ok(); + self.state = ProtoState::Closed; } } From de8dfee4bda97deb8a2f02e4e4cc0c7641f56d09 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 13 May 2024 13:19:12 +0300 Subject: [PATCH 0809/1571] safekeeper: log LSNs on walreceiver/walsender exit. Useful for observability. 
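A minimal sketch of the tracing pattern this relies on (the span fields match those used in the diff below; the wrapper function and plain `u64` LSNs are stand-ins for illustration):

```rust
use tracing::{info_span, Instrument};

/// Await the COPY-stream termination handler inside a span carrying the
/// timeline's current LSNs, so the exit log line includes them.
async fn log_copy_stream_end_with_lsns<F>(handle_end: F, flush_lsn: u64, commit_lsn: u64)
where
    F: std::future::Future<Output = ()>,
{
    handle_end
        .instrument(info_span!("", flush_lsn = %flush_lsn, commit_lsn = %commit_lsn))
        .await;
}
```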
--- safekeeper/src/receive_wal.rs | 29 ++++++++++++++++++++++------- safekeeper/src/send_wal.rs | 11 +++++++---- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 015b53bb2e..0356def7df 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -183,9 +183,19 @@ impl SafekeeperPostgresHandler { &mut self, pgb: &mut PostgresBackend, ) -> Result<(), QueryError> { - if let Err(end) = self.handle_start_wal_push_guts(pgb).await { + let mut tli: Option> = None; + if let Err(end) = self.handle_start_wal_push_guts(pgb, &mut tli).await { // Log the result and probably send it to the client, closing the stream. - pgb.handle_copy_stream_end(end).await; + let handle_end_fut = pgb.handle_copy_stream_end(end); + // If we managed to create the timeline, augment logging with current LSNs etc. + if let Some(tli) = tli { + let info = tli.get_safekeeper_info(&self.conf).await; + handle_end_fut + .instrument(info_span!("", term=%info.term, last_log_term=%info.last_log_term, flush_lsn=%Lsn(info.flush_lsn), commit_lsn=%Lsn(info.commit_lsn))) + .await; + } else { + handle_end_fut.await; + } } Ok(()) } @@ -193,6 +203,7 @@ impl SafekeeperPostgresHandler { pub async fn handle_start_wal_push_guts( &mut self, pgb: &mut PostgresBackend, + tli: &mut Option>, ) -> Result<(), CopyStreamHandlerEnd> { // Notify the libpq client that it's allowed to send `CopyData` messages pgb.write_message(&BeMessage::CopyBothResponse).await?; @@ -222,13 +233,17 @@ impl SafekeeperPostgresHandler { // Read first message and create timeline if needed. let res = network_reader.read_first_message().await; - let res = if let Ok((tli, next_msg)) = res { + let network_res = if let Ok((timeline, next_msg)) = res { let pageserver_feedback_rx: tokio::sync::broadcast::Receiver = - tli.get_walreceivers().pageserver_feedback_tx.subscribe(); + timeline + .get_walreceivers() + .pageserver_feedback_tx + .subscribe(); + *tli = Some(timeline.clone()); tokio::select! { // todo: add read|write .context to these errors - r = network_reader.run(msg_tx, msg_rx, reply_tx, tli.clone(), next_msg) => r, + r = network_reader.run(msg_tx, msg_rx, reply_tx, timeline.clone(), next_msg) => r, r = network_write(pgb, reply_rx, pageserver_feedback_rx) => r, } } else { @@ -244,13 +259,13 @@ impl SafekeeperPostgresHandler { match acceptor_handle { None => { // failed even before spawning; read_network should have error - Err(res.expect_err("no error with WalAcceptor not spawn")) + Err(network_res.expect_err("no error with WalAcceptor not spawn")) } Some(handle) => { let wal_acceptor_res = handle.await; // If there was any network error, return it. - res?; + network_res?; // Otherwise, WalAcceptor thread must have errored. match wal_acceptor_res { diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 59a8c595ab..ecaae9cfe7 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -340,12 +340,16 @@ impl SafekeeperPostgresHandler { start_pos: Lsn, term: Option, ) -> Result<(), QueryError> { + let tli = GlobalTimelines::get(self.ttid).map_err(|e| QueryError::Other(e.into()))?; if let Err(end) = self - .handle_start_replication_guts(pgb, start_pos, term) + .handle_start_replication_guts(pgb, start_pos, term, tli.clone()) .await { + let info = tli.get_safekeeper_info(&self.conf).await; // Log the result and probably send it to the client, closing the stream. 
- pgb.handle_copy_stream_end(end).await; + pgb.handle_copy_stream_end(end) + .instrument(info_span!("", term=%info.term, last_log_term=%info.last_log_term, flush_lsn=%Lsn(info.flush_lsn), commit_lsn=%Lsn(info.flush_lsn))) + .await; } Ok(()) } @@ -355,10 +359,9 @@ impl SafekeeperPostgresHandler { pgb: &mut PostgresBackend, start_pos: Lsn, term: Option, + tli: Arc, ) -> Result<(), CopyStreamHandlerEnd> { let appname = self.appname.clone(); - let tli = - GlobalTimelines::get(self.ttid).map_err(|e| CopyStreamHandlerEnd::Other(e.into()))?; // Use a guard object to remove our entry from the timeline when we are done. let ws_guard = Arc::new(tli.get_walsenders().register( From 7701ca45dd2215ecca8b8c3de50926ae9b520ffd Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Mon, 20 May 2024 12:08:45 -0400 Subject: [PATCH 0810/1571] feat(pageserver): generate image layers for sparse keyspace (#7567) Part of https://github.com/neondatabase/neon/issues/7462 Sparse keyspace does not generate image layers for now. This pull request adds support for generating image layers for sparse keyspace. ## Summary of changes * Use the scan interface to generate compaction data for sparse keyspace. * Track num of delta layers reads during scan. * Read-trigger compaction: when a scan on the keyspace touches too many delta files, generate an image layer. There are one hard-coded threshold for now: max delta layers we want to touch for a scan. * L0 compaction does not need to compute holes for metadata keyspace. Know issue: the scan interface currently reads past the image layer, which causes `delta_layer_accessed` keeps increasing even if image layers are generated. The pull request to fix that will be separate, and orthogonal to this one. --------- Signed-off-by: Alex Chi Z --- pageserver/src/pgdatadir_mapping.rs | 6 +- pageserver/src/tenant.rs | 105 +++++- pageserver/src/tenant/storage_layer.rs | 19 +- pageserver/src/tenant/timeline.rs | 339 +++++++++++++------ pageserver/src/tenant/timeline/compaction.rs | 35 +- 5 files changed, 363 insertions(+), 141 deletions(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 402f075365..b4fc4a08ee 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -40,7 +40,11 @@ use utils::bin_ser::DeserializeError; use utils::vec_map::{VecMap, VecMapOrdering}; use utils::{bin_ser::BeSer, lsn::Lsn}; -const MAX_AUX_FILE_DELTAS: usize = 1024; +/// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached. +pub const MAX_AUX_FILE_DELTAS: usize = 1024; + +/// Max number of aux-file-related delta layers. The compaction will create a new image layer once this threshold is reached. 
+pub const MAX_AUX_FILE_V2_DELTAS: usize = 64; #[derive(Debug)] pub enum LsnForTimestamp { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index d42b9082b7..e598e9d2e3 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -4777,7 +4777,12 @@ mod tests { info!("Doing vectored read on {:?}", read); let vectored_res = tline - .get_vectored_impl(read.clone(), reads_lsn, ValuesReconstructState::new(), &ctx) + .get_vectored_impl( + read.clone(), + reads_lsn, + &mut ValuesReconstructState::new(), + &ctx, + ) .await; tline .validate_get_vectored_impl(&vectored_res, read, reads_lsn, &ctx) @@ -4826,7 +4831,7 @@ mod tests { .get_vectored_impl( aux_keyspace.clone(), read_lsn, - ValuesReconstructState::new(), + &mut ValuesReconstructState::new(), &ctx, ) .await; @@ -4971,7 +4976,7 @@ mod tests { .get_vectored_impl( read.clone(), current_lsn, - ValuesReconstructState::new(), + &mut ValuesReconstructState::new(), &ctx, ) .await?; @@ -5106,7 +5111,7 @@ mod tests { ranges: vec![child_gap_at_key..child_gap_at_key.next()], }, query_lsn, - ValuesReconstructState::new(), + &mut ValuesReconstructState::new(), &ctx, ) .await; @@ -5547,7 +5552,7 @@ mod tests { .await?; const NUM_KEYS: usize = 1000; - const STEP: usize = 100; // random update + scan base_key + idx * STEP + const STEP: usize = 10000; // random update + scan base_key + idx * STEP let cancel = CancellationToken::new(); @@ -5580,7 +5585,7 @@ mod tests { let keyspace = KeySpace::single(base_key..base_key.add((NUM_KEYS * STEP) as u32)); - for _ in 0..10 { + for iter in 0..=10 { // Read all the blocks for (blknum, last_lsn) in updated.iter().enumerate() { test_key.field6 = (blknum * STEP) as u32; @@ -5595,7 +5600,7 @@ mod tests { .get_vectored_impl( keyspace.clone(), lsn, - ValuesReconstructState::default(), + &mut ValuesReconstructState::default(), &ctx, ) .await? 
@@ -5631,17 +5636,91 @@ mod tests { updated[blknum] = lsn; } - // Perform a cycle of flush, compact, and GC - tline.freeze_and_flush().await?; - tline.compact(&cancel, EnumSet::empty(), &ctx).await?; - tenant - .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) - .await?; + // Perform two cycles of flush, compact, and GC + for round in 0..2 { + tline.freeze_and_flush().await?; + tline + .compact( + &cancel, + if iter % 5 == 0 && round == 0 { + let mut flags = EnumSet::new(); + flags.insert(CompactFlags::ForceImageLayerCreation); + flags.insert(CompactFlags::ForceRepartition); + flags + } else { + EnumSet::empty() + }, + &ctx, + ) + .await?; + tenant + .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) + .await?; + } } Ok(()) } + #[tokio::test] + async fn test_metadata_compaction_trigger() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_metadata_compaction_trigger")?; + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; + + let cancel = CancellationToken::new(); + + let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + base_key.field1 = AUX_KEY_PREFIX; + let test_key = base_key; + let mut lsn = Lsn(0x10); + + for _ in 0..20 { + lsn = Lsn(lsn.0 + 0x10); + let mut writer = tline.writer().await; + writer + .put( + test_key, + lsn, + &Value::Image(test_img(&format!("{} at {}", 0, lsn))), + &ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + tline.freeze_and_flush().await?; // force create a delta layer + } + + let before_num_l0_delta_files = tline + .layers + .read() + .await + .layer_map() + .get_level0_deltas()? + .len(); + + tline.compact(&cancel, EnumSet::empty(), &ctx).await?; + + let after_num_l0_delta_files = tline + .layers + .read() + .await + .layer_map() + .get_level0_deltas()? + .len(); + + assert!(after_num_l0_delta_files < before_num_l0_delta_files, "after_num_l0_delta_files={after_num_l0_delta_files}, before_num_l0_delta_files={before_num_l0_delta_files}"); + + assert_eq!( + tline.get(test_key, lsn, &ctx).await?, + test_img(&format!("{} at {}", 0, lsn)) + ); + + Ok(()) + } + #[tokio::test] async fn test_branch_copies_dirty_aux_file_flag() { let harness = TenantHarness::create("test_branch_copies_dirty_aux_file_flag").unwrap(); diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 94a5e9ec47..4c8a518551 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -113,12 +113,17 @@ impl From for ValueReconstructState { } } -/// Bag of data accumulated during a vectored get +/// Bag of data accumulated during a vectored get. pub(crate) struct ValuesReconstructState { + /// The keys will be removed after `get_vectored` completes. The caller outside `Timeline` + /// should not expect to get anything from this hashmap. pub(crate) keys: HashMap>, keys_done: KeySpaceRandomAccum, + + // Statistics that are still accessible as a caller of `get_vectored_impl`. 
layers_visited: u32, + delta_layers_visited: u32, } impl ValuesReconstructState { @@ -127,6 +132,7 @@ impl ValuesReconstructState { keys: HashMap::new(), keys_done: KeySpaceRandomAccum::new(), layers_visited: 0, + delta_layers_visited: 0, } } @@ -140,8 +146,17 @@ impl ValuesReconstructState { } } - pub(crate) fn on_layer_visited(&mut self) { + pub(crate) fn on_layer_visited(&mut self, layer: &ReadableLayer) { self.layers_visited += 1; + if let ReadableLayer::PersistentLayer(layer) = layer { + if layer.layer_desc().is_delta() { + self.delta_layers_visited += 1; + } + } + } + + pub(crate) fn get_delta_layers_visited(&self) -> u32 { + self.delta_layers_visited } pub(crate) fn get_layers_visited(&self) -> u32 { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 1fb1928079..e6b58b7166 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -18,8 +18,8 @@ use fail::fail_point; use once_cell::sync::Lazy; use pageserver_api::{ key::{ - AUX_FILES_KEY, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE, - NON_INHERITED_SPARSE_RANGE, + AUX_FILES_KEY, KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, + NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE, }, keyspace::{KeySpaceAccum, SparseKeyPartitioning}, models::{ @@ -60,7 +60,6 @@ use std::{ ops::ControlFlow, }; -use crate::tenant::timeline::init::LocalLayerFileMetadata; use crate::{ aux_file::AuxFileSizeEstimator, tenant::{ @@ -89,6 +88,9 @@ use crate::{ metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize, }; use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind}; +use crate::{ + pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::timeline::init::LocalLayerFileMetadata, +}; use crate::{ pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind}, virtual_file::{MaybeFatalIo, VirtualFile}, @@ -782,6 +784,11 @@ pub(crate) enum ShutdownMode { Hard, } +struct ImageLayerCreationOutcome { + image: Option, + next_start_key: Key, +} + /// Public interface functions impl Timeline { /// Get the LSN where this branch was created @@ -883,7 +890,7 @@ impl Timeline { } let vectored_res = self - .get_vectored_impl(keyspace.clone(), lsn, reconstruct_state, ctx) + .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx) .await; if self.conf.validate_vectored_get { @@ -1028,7 +1035,12 @@ impl Timeline { } GetVectoredImpl::Vectored => { let vectored_res = self - .get_vectored_impl(keyspace.clone(), lsn, ValuesReconstructState::new(), ctx) + .get_vectored_impl( + keyspace.clone(), + lsn, + &mut ValuesReconstructState::new(), + ctx, + ) .await; if self.conf.validate_vectored_get { @@ -1116,7 +1128,7 @@ impl Timeline { .get_vectored_impl( keyspace.clone(), lsn, - ValuesReconstructState::default(), + &mut ValuesReconstructState::default(), ctx, ) .await; @@ -1193,7 +1205,7 @@ impl Timeline { &self, keyspace: KeySpace, lsn: Lsn, - mut reconstruct_state: ValuesReconstructState, + reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result>, GetVectoredError> { let get_kind = if keyspace.total_raw_size() == 1 { @@ -1205,7 +1217,7 @@ impl Timeline { let get_data_timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME .for_get_kind(get_kind) .start_timer(); - self.get_vectored_reconstruct_data(keyspace, lsn, &mut reconstruct_state, ctx) + self.get_vectored_reconstruct_data(keyspace, lsn, reconstruct_state, ctx) .await?; get_data_timer.stop_and_record(); @@ -1214,7 +1226,8 @@ impl Timeline 
{ .start_timer(); let mut results: BTreeMap> = BTreeMap::new(); let layers_visited = reconstruct_state.get_layers_visited(); - for (key, res) in reconstruct_state.keys { + + for (key, res) in std::mem::take(&mut reconstruct_state.keys) { match res { Err(err) => { results.insert(key, Err(err)); @@ -3448,7 +3461,7 @@ impl Timeline { unmapped_keyspace = keyspace_to_read; cont_lsn = next_cont_lsn; - reconstruct_state.on_layer_visited(); + reconstruct_state.on_layer_visited(&layer_to_read); } else { break; } @@ -4134,6 +4147,176 @@ impl Timeline { false } + /// Create image layers for Postgres data. Assumes the caller passes a partition that is not too large, + /// so that at most one image layer will be produced from this function. + async fn create_image_layer_for_rel_blocks( + self: &Arc, + partition: &KeySpace, + mut image_layer_writer: ImageLayerWriter, + lsn: Lsn, + ctx: &RequestContext, + img_range: Range, + start: Key, + ) -> Result { + let mut wrote_keys = false; + + let mut key_request_accum = KeySpaceAccum::new(); + for range in &partition.ranges { + let mut key = range.start; + while key < range.end { + // Decide whether to retain this key: usually we do, but sharded tenants may + // need to drop keys that don't belong to them. If we retain the key, add it + // to `key_request_accum` for later issuing a vectored get + if self.shard_identity.is_key_disposable(&key) { + debug!( + "Dropping key {} during compaction (it belongs on shard {:?})", + key, + self.shard_identity.get_shard_number(&key) + ); + } else { + key_request_accum.add_key(key); + } + + let last_key_in_range = key.next() == range.end; + key = key.next(); + + // Maybe flush `key_rest_accum` + if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS + || (last_key_in_range && key_request_accum.raw_size() > 0) + { + let results = self + .get_vectored(key_request_accum.consume_keyspace(), lsn, ctx) + .await?; + + for (img_key, img) in results { + let img = match img { + Ok(img) => img, + Err(err) => { + // If we fail to reconstruct a VM or FSM page, we can zero the + // page without losing any actual user data. That seems better + // than failing repeatedly and getting stuck. + // + // We had a bug at one point, where we truncated the FSM and VM + // in the pageserver, but the Postgres didn't know about that + // and continued to generate incremental WAL records for pages + // that didn't exist in the pageserver. Trying to replay those + // WAL records failed to find the previous image of the page. + // This special case allows us to recover from that situation. + // See https://github.com/neondatabase/neon/issues/2601. + // + // Unfortunately we cannot do this for the main fork, or for + // any metadata keys, keys, as that would lead to actual data + // loss. + if is_rel_fsm_block_key(img_key) || is_rel_vm_block_key(img_key) { + warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}"); + ZERO_PAGE.clone() + } else { + return Err(CreateImageLayersError::PageReconstructError(err)); + } + } + }; + + // Write all the keys we just read into our new image layer. + image_layer_writer.put_image(img_key, img, ctx).await?; + wrote_keys = true; + } + } + } + } + + if wrote_keys { + // Normal path: we have written some data into the new image layer for this + // partition, so flush it to disk. 
+ let image_layer = image_layer_writer.finish(self, ctx).await?; + Ok(ImageLayerCreationOutcome { + image: Some(image_layer), + next_start_key: img_range.end, + }) + } else { + // Special case: the image layer may be empty if this is a sharded tenant and the + // partition does not cover any keys owned by this shard. In this case, to ensure + // we don't leave gaps between image layers, leave `start` where it is, so that the next + // layer we write will cover the key range that we just scanned. + tracing::debug!("no data in range {}-{}", img_range.start, img_range.end); + Ok(ImageLayerCreationOutcome { + image: None, + next_start_key: start, + }) + } + } + + /// Create an image layer for metadata keys. This function produces one image layer for all metadata + /// keys for now. Because metadata keys cannot exceed basebackup size limit, the image layer for it + /// would not be too large to fit in a single image layer. + #[allow(clippy::too_many_arguments)] + async fn create_image_layer_for_metadata_keys( + self: &Arc, + partition: &KeySpace, + mut image_layer_writer: ImageLayerWriter, + lsn: Lsn, + ctx: &RequestContext, + img_range: Range, + mode: ImageLayerCreationMode, + ) -> Result { + assert!(!matches!(mode, ImageLayerCreationMode::Initial)); + + // Metadata keys image layer creation. + let mut reconstruct_state = ValuesReconstructState::default(); + let data = self + .get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx) + .await?; + let (data, total_kb_retrieved, total_key_retrieved) = { + let mut new_data = BTreeMap::new(); + let mut total_kb_retrieved = 0; + let mut total_key_retrieved = 0; + for (k, v) in data { + let v = v.map_err(CreateImageLayersError::PageReconstructError)?; + total_kb_retrieved += KEY_SIZE + v.len(); + total_key_retrieved += 1; + new_data.insert(k, v); + } + (new_data, total_kb_retrieved / 1024, total_key_retrieved) + }; + let delta_file_accessed = reconstruct_state.get_delta_layers_visited(); + + let trigger_generation = delta_file_accessed as usize >= MAX_AUX_FILE_V2_DELTAS; + info!( + "generate image layers for metadata keys: trigger_generation={trigger_generation}, \ + delta_file_accessed={delta_file_accessed}, total_kb_retrieved={total_kb_retrieved}, \ + total_key_retrieved={total_key_retrieved}" + ); + if !trigger_generation && mode == ImageLayerCreationMode::Try { + return Ok(ImageLayerCreationOutcome { + image: None, + next_start_key: img_range.end, + }); + } + let has_keys = !data.is_empty(); + for (k, v) in data { + // Even if the value is empty (deleted), we do not delete it for now until we can ensure vectored get + // considers this situation properly. + // if v.is_empty() { + // continue; + // } + + // No need to handle sharding b/c metadata keys are always on the 0-th shard. + + // TODO: split image layers to avoid too large layer files. Too large image files are not handled + // on the normal data path either. 
+ image_layer_writer.put_image(k, v, ctx).await?; + } + Ok(ImageLayerCreationOutcome { + image: if has_keys { + let image_layer = image_layer_writer.finish(self, ctx).await?; + Some(image_layer) + } else { + tracing::debug!("no data in range {}-{}", img_range.start, img_range.end); + None + }, + next_start_key: img_range.end, + }) + } + #[tracing::instrument(skip_all, fields(%lsn, %mode))] async fn create_image_layers( self: &Arc, @@ -4175,19 +4358,17 @@ impl Timeline { for partition in partitioning.parts.iter() { let img_range = start..partition.ranges.last().unwrap().end; - - if partition.overlaps(&Key::metadata_key_range()) { - // TODO(chi): The next patch will correctly create image layers for metadata keys, and it would be a - // rather big change. Keep this patch small for now. - match mode { - ImageLayerCreationMode::Force | ImageLayerCreationMode::Try => { - // skip image layer creation anyways for metadata keys. - start = img_range.end; - continue; - } - ImageLayerCreationMode::Initial => { - return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers"))); - } + let compact_metadata = partition.overlaps(&Key::metadata_key_range()); + if compact_metadata { + for range in &partition.ranges { + assert!( + range.start.field1 >= METADATA_KEY_BEGIN_PREFIX + && range.end.field1 <= METADATA_KEY_END_PREFIX, + "metadata keys must be partitioned separately" + ); + } + if mode == ImageLayerCreationMode::Initial { + return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers"))); } } else if let ImageLayerCreationMode::Try = mode { // check_for_image_layers = false -> skip @@ -4198,7 +4379,7 @@ impl Timeline { } } - let mut image_layer_writer = ImageLayerWriter::new( + let image_layer_writer = ImageLayerWriter::new( self.conf, self.timeline_id, self.tenant_shard_id, @@ -4214,87 +4395,39 @@ impl Timeline { ))) }); - let mut wrote_keys = false; + if !compact_metadata { + let ImageLayerCreationOutcome { + image, + next_start_key, + } = self + .create_image_layer_for_rel_blocks( + partition, + image_layer_writer, + lsn, + ctx, + img_range, + start, + ) + .await?; - let mut key_request_accum = KeySpaceAccum::new(); - for range in &partition.ranges { - let mut key = range.start; - while key < range.end { - // Decide whether to retain this key: usually we do, but sharded tenants may - // need to drop keys that don't belong to them. If we retain the key, add it - // to `key_request_accum` for later issuing a vectored get - if self.shard_identity.is_key_disposable(&key) { - debug!( - "Dropping key {} during compaction (it belongs on shard {:?})", - key, - self.shard_identity.get_shard_number(&key) - ); - } else { - key_request_accum.add_key(key); - } - - let last_key_in_range = key.next() == range.end; - key = key.next(); - - // Maybe flush `key_rest_accum` - if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS - || (last_key_in_range && key_request_accum.raw_size() > 0) - { - let results = self - .get_vectored(key_request_accum.consume_keyspace(), lsn, ctx) - .await?; - - for (img_key, img) in results { - let img = match img { - Ok(img) => img, - Err(err) => { - // If we fail to reconstruct a VM or FSM page, we can zero the - // page without losing any actual user data. That seems better - // than failing repeatedly and getting stuck. 
- // - // We had a bug at one point, where we truncated the FSM and VM - // in the pageserver, but the Postgres didn't know about that - // and continued to generate incremental WAL records for pages - // that didn't exist in the pageserver. Trying to replay those - // WAL records failed to find the previous image of the page. - // This special case allows us to recover from that situation. - // See https://github.com/neondatabase/neon/issues/2601. - // - // Unfortunately we cannot do this for the main fork, or for - // any metadata keys, keys, as that would lead to actual data - // loss. - if is_rel_fsm_block_key(img_key) || is_rel_vm_block_key(img_key) - { - warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}"); - ZERO_PAGE.clone() - } else { - return Err(CreateImageLayersError::PageReconstructError( - err, - )); - } - } - }; - - // Write all the keys we just read into our new image layer. - image_layer_writer.put_image(img_key, img, ctx).await?; - wrote_keys = true; - } - } - } - } - - if wrote_keys { - // Normal path: we have written some data into the new image layer for this - // partition, so flush it to disk. - start = img_range.end; - let image_layer = image_layer_writer.finish(self, ctx).await?; - image_layers.push(image_layer); + start = next_start_key; + image_layers.extend(image); } else { - // Special case: the image layer may be empty if this is a sharded tenant and the - // partition does not cover any keys owned by this shard. In this case, to ensure - // we don't leave gaps between image layers, leave `start` where it is, so that the next - // layer we write will cover the key range that we just scanned. - tracing::debug!("no data in range {}-{}", img_range.start, img_range.end); + let ImageLayerCreationOutcome { + image, + next_start_key, + } = self + .create_image_layer_for_metadata_keys( + partition, + image_layer_writer, + lsn, + ctx, + img_range, + mode, + ) + .await?; + start = next_start_key; + image_layers.extend(image); } } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index ed48b4c9cb..2eff469591 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -116,9 +116,13 @@ impl Timeline { // 3. Create new image layers for partitions that have been modified // "enough". - let dense_layers = self + let mut partitioning = dense_partitioning; + partitioning + .parts + .extend(sparse_partitioning.into_dense().parts); + let image_layers = self .create_image_layers( - &dense_partitioning, + &partitioning, lsn, if flags.contains(CompactFlags::ForceImageLayerCreation) { ImageLayerCreationMode::Force @@ -130,24 +134,8 @@ impl Timeline { .await .map_err(anyhow::Error::from)?; - // For now, nothing will be produced... - let sparse_layers = self - .create_image_layers( - &sparse_partitioning.clone().into_dense(), - lsn, - if flags.contains(CompactFlags::ForceImageLayerCreation) { - ImageLayerCreationMode::Force - } else { - ImageLayerCreationMode::Try - }, - &image_ctx, - ) - .await - .map_err(anyhow::Error::from)?; - assert!(sparse_layers.is_empty()); - - self.upload_new_image_layers(dense_layers)?; - dense_partitioning.parts.len() + self.upload_new_image_layers(image_layers)?; + partitioning.parts.len() } Err(err) => { // no partitioning? This is normal, if the timeline was just created @@ -499,8 +487,11 @@ impl Timeline { for &DeltaEntry { key: next_key, .. 
} in all_keys.iter() { if let Some(prev_key) = prev { - // just first fast filter - if next_key.to_i128() - prev_key.to_i128() >= min_hole_range { + // just first fast filter, do not create hole entries for metadata keys. The last hole in the + // compaction is the gap between data key and metadata keys. + if next_key.to_i128() - prev_key.to_i128() >= min_hole_range + && !Key::is_metadata_key(&prev_key) + { let key_range = prev_key..next_key; // Measuring hole by just subtraction of i128 representation of key range boundaries // has not so much sense, because largest holes will corresponds field1/field2 changes. From 2d7091871f82b6e9d598fdaed8cc28bdc27b9a16 Mon Sep 17 00:00:00 2001 From: Andy Hattemer Date: Mon, 20 May 2024 12:15:43 -0400 Subject: [PATCH 0811/1571] Update banner image in Readme (#7801) Update the readme banner with updated branding. --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 00a90f4483..ea0a289502 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,6 @@ -[![Neon](https://user-images.githubusercontent.com/13738772/236813940-dcfdcb5b-69d3-449b-a686-013febe834d4.png)](https://neon.tech) +[![Neon](https://github.com/neondatabase/neon/assets/11527560/f15a17f0-836e-40c5-b35d-030606a6b660)](https://neon.tech) + + # Neon From 6810d2aa53b7b7646013d2f236d155a4f1b4721d Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Mon, 20 May 2024 14:24:18 -0400 Subject: [PATCH 0812/1571] feat(pageserver): do not read past image layers for vectored get (#7773) ## Problem Part of https://github.com/neondatabase/neon/issues/7462 On metadata keyspace, vectored get will not stop if a key is not found, and will read past the image layer. However, the semantics is different from single get, because if a key does not exist in the image layer, it means that the key does not exist in the past, or have been deleted. This pull request fixed it by recording image layer coverage during the vectored get process and stop when the full keyspace is covered by an image layer. A corresponding test case is added to ensure generating image layer reduces the number of delta layers. This optimization (or bug fix) also applies to rel block keyspaces. If a key is missing, we can know it's missing once the first image layer is reached. Page server will not attempt to read lower layers, which potentially incurs layer downloads + evictions. --------- Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/keyspace.rs | 2 +- pageserver/src/tenant.rs | 372 +++++++++++++++++- pageserver/src/tenant/storage_layer.rs | 26 +- .../src/tenant/storage_layer/image_layer.rs | 4 + pageserver/src/tenant/timeline.rs | 79 +++- 5 files changed, 461 insertions(+), 22 deletions(-) diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index c0c4710a00..12c6dc3a6d 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -307,7 +307,7 @@ impl KeySpace { } /// Merge another keyspace into the current one. - /// Note: the keyspaces must not ovelap (enforced via assertions) + /// Note: the keyspaces must not overlap (enforced via assertions). To merge overlapping key ranges, use `KeySpaceRandomAccum`. 
pub fn merge(&mut self, other: &KeySpace) { let all_ranges = self .ranges diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index e598e9d2e3..1a66f2c919 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3968,7 +3968,7 @@ mod tests { use crate::tenant::harness::*; use crate::tenant::timeline::CompactFlags; use crate::DEFAULT_PG_VERSION; - use bytes::BytesMut; + use bytes::{Bytes, BytesMut}; use hex_literal::hex; use pageserver_api::key::{AUX_KEY_PREFIX, NON_INHERITED_RANGE}; use pageserver_api::keyspace::KeySpace; @@ -5996,4 +5996,374 @@ mod tests { Some(&bytes::Bytes::from_static(b"last")) ); } + + #[tokio::test] + async fn test_metadata_image_creation() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_metadata_image_creation")?; + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; + + const NUM_KEYS: usize = 1000; + const STEP: usize = 10000; // random update + scan base_key + idx * STEP + + let cancel = CancellationToken::new(); + + let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + base_key.field1 = AUX_KEY_PREFIX; + let mut test_key = base_key; + let mut lsn = Lsn(0x10); + + async fn scan_with_statistics( + tline: &Timeline, + keyspace: &KeySpace, + lsn: Lsn, + ctx: &RequestContext, + ) -> anyhow::Result<(BTreeMap>, usize)> { + let mut reconstruct_state = ValuesReconstructState::default(); + let res = tline + .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx) + .await?; + Ok((res, reconstruct_state.get_delta_layers_visited() as usize)) + } + + #[allow(clippy::needless_range_loop)] + for blknum in 0..NUM_KEYS { + lsn = Lsn(lsn.0 + 0x10); + test_key.field6 = (blknum * STEP) as u32; + let mut writer = tline.writer().await; + writer + .put( + test_key, + lsn, + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + } + + let keyspace = KeySpace::single(base_key..base_key.add((NUM_KEYS * STEP) as u32)); + + for iter in 1..=10 { + for _ in 0..NUM_KEYS { + lsn = Lsn(lsn.0 + 0x10); + let blknum = thread_rng().gen_range(0..NUM_KEYS); + test_key.field6 = (blknum * STEP) as u32; + let mut writer = tline.writer().await; + writer + .put( + test_key, + lsn, + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + } + + tline.freeze_and_flush().await?; + + if iter % 5 == 0 { + let (_, before_delta_file_accessed) = + scan_with_statistics(&tline, &keyspace, lsn, &ctx).await?; + tline + .compact( + &cancel, + { + let mut flags = EnumSet::new(); + flags.insert(CompactFlags::ForceImageLayerCreation); + flags.insert(CompactFlags::ForceRepartition); + flags + }, + &ctx, + ) + .await?; + let (_, after_delta_file_accessed) = + scan_with_statistics(&tline, &keyspace, lsn, &ctx).await?; + assert!(after_delta_file_accessed < before_delta_file_accessed, "after_delta_file_accessed={after_delta_file_accessed}, before_delta_file_accessed={before_delta_file_accessed}"); + // Given that we already produced an image layer, there should be no delta layer needed for the scan, but still setting a low threshold there for unforeseen circumstances. 
+ assert!( + after_delta_file_accessed <= 2, + "after_delta_file_accessed={after_delta_file_accessed}" + ); + } + } + + Ok(()) + } + + #[tokio::test] + async fn test_vectored_missing_data_key_reads() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_vectored_missing_data_key_reads")?; + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; + + let cancel = CancellationToken::new(); + + let base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + let base_key_child = Key::from_hex("000000000033333333444444445500000001").unwrap(); + let base_key_nonexist = Key::from_hex("000000000033333333444444445500000002").unwrap(); + + let mut lsn = Lsn(0x20); + + { + let mut writer = tline.writer().await; + writer + .put(base_key, lsn, &Value::Image(test_img("data key 1")), &ctx) + .await?; + writer.finish_write(lsn); + drop(writer); + + tline.freeze_and_flush().await?; // this will create a image layer + } + + let child = tenant + .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx) + .await + .unwrap(); + + lsn.0 += 0x10; + + { + let mut writer = child.writer().await; + writer + .put( + base_key_child, + lsn, + &Value::Image(test_img("data key 2")), + &ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + + child.freeze_and_flush().await?; // this will create a delta + + { + // update the partitioning to include the test key space, otherwise they + // will be dropped by image layer creation + let mut guard = child.partitioning.lock().await; + let ((partitioning, _), partition_lsn) = &mut *guard; + partitioning + .parts + .push(KeySpace::single(base_key..base_key_nonexist)); // exclude the nonexist key + *partition_lsn = lsn; + } + + child + .compact( + &cancel, + { + let mut set = EnumSet::empty(); + set.insert(CompactFlags::ForceImageLayerCreation); + set + }, + &ctx, + ) + .await?; // force create an image layer for the keys, TODO: check if the image layer is created + } + + async fn get_vectored_impl_wrapper( + tline: &Arc, + key: Key, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result, GetVectoredError> { + let mut reconstruct_state = ValuesReconstructState::new(); + let mut res = tline + .get_vectored_impl( + KeySpace::single(key..key.next()), + lsn, + &mut reconstruct_state, + ctx, + ) + .await?; + Ok(res.pop_last().map(|(k, v)| { + assert_eq!(k, key); + v.unwrap() + })) + } + + // test vectored get on parent timeline + assert_eq!( + get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?, + Some(test_img("data key 1")) + ); + assert!(get_vectored_impl_wrapper(&tline, base_key_child, lsn, &ctx) + .await + .unwrap_err() + .is_missing_key_error()); + assert!( + get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx) + .await + .unwrap_err() + .is_missing_key_error() + ); + + // test vectored get on child timeline + assert_eq!( + get_vectored_impl_wrapper(&child, base_key, lsn, &ctx).await?, + Some(test_img("data key 1")) + ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_key_child, lsn, &ctx).await?, + Some(test_img("data key 2")) + ); + assert!( + get_vectored_impl_wrapper(&child, base_key_nonexist, lsn, &ctx) + .await + .unwrap_err() + .is_missing_key_error() + ); + + Ok(()) + } + + #[tokio::test] + async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads")?; + let (tenant, ctx) = harness.load().await; + let tline = tenant + 
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; + + let cancel = CancellationToken::new(); + + let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + let mut base_key_child = Key::from_hex("000000000033333333444444445500000001").unwrap(); + let mut base_key_nonexist = Key::from_hex("000000000033333333444444445500000002").unwrap(); + base_key.field1 = AUX_KEY_PREFIX; + base_key_child.field1 = AUX_KEY_PREFIX; + base_key_nonexist.field1 = AUX_KEY_PREFIX; + + let mut lsn = Lsn(0x20); + + { + let mut writer = tline.writer().await; + writer + .put( + base_key, + lsn, + &Value::Image(test_img("metadata key 1")), + &ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + + tline.freeze_and_flush().await?; // this will create an image layer + + tline + .compact( + &cancel, + { + let mut set = EnumSet::empty(); + set.insert(CompactFlags::ForceImageLayerCreation); + set.insert(CompactFlags::ForceRepartition); + set + }, + &ctx, + ) + .await?; // force create an image layer for metadata keys + tenant + .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) + .await?; + } + + let child = tenant + .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx) + .await + .unwrap(); + + lsn.0 += 0x10; + + { + let mut writer = child.writer().await; + writer + .put( + base_key_child, + lsn, + &Value::Image(test_img("metadata key 2")), + &ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + + child.freeze_and_flush().await?; + + child + .compact( + &cancel, + { + let mut set = EnumSet::empty(); + set.insert(CompactFlags::ForceImageLayerCreation); + set.insert(CompactFlags::ForceRepartition); + set + }, + &ctx, + ) + .await?; // force create an image layer for metadata keys + tenant + .gc_iteration(Some(child.timeline_id), 0, Duration::ZERO, &cancel, &ctx) + .await?; + } + + async fn get_vectored_impl_wrapper( + tline: &Arc, + key: Key, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result, GetVectoredError> { + let mut reconstruct_state = ValuesReconstructState::new(); + let mut res = tline + .get_vectored_impl( + KeySpace::single(key..key.next()), + lsn, + &mut reconstruct_state, + ctx, + ) + .await?; + Ok(res.pop_last().map(|(k, v)| { + assert_eq!(k, key); + v.unwrap() + })) + } + + // test vectored get on parent timeline + assert_eq!( + get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?, + Some(test_img("metadata key 1")) + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, base_key_child, lsn, &ctx).await?, + None + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx).await?, + None + ); + + // test vectored get on child timeline + assert_eq!( + get_vectored_impl_wrapper(&child, base_key, lsn, &ctx).await?, + None + ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_key_child, lsn, &ctx).await?, + Some(test_img("metadata key 2")) + ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_key_nonexist, lsn, &ctx).await?, + None + ); + + Ok(()) + } } diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 4c8a518551..9ccf20c0d4 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -113,14 +113,17 @@ impl From for ValueReconstructState { } } -/// Bag of data accumulated during a vectored get. +/// Bag of data accumulated during a vectored get.. pub(crate) struct ValuesReconstructState { /// The keys will be removed after `get_vectored` completes. 
The caller outside `Timeline` /// should not expect to get anything from this hashmap. pub(crate) keys: HashMap>, - + /// The keys which are already retrieved keys_done: KeySpaceRandomAccum, + /// The keys covered by the image layers + keys_with_image_coverage: Option>, + // Statistics that are still accessible as a caller of `get_vectored_impl`. layers_visited: u32, delta_layers_visited: u32, @@ -131,6 +134,7 @@ impl ValuesReconstructState { Self { keys: HashMap::new(), keys_done: KeySpaceRandomAccum::new(), + keys_with_image_coverage: None, layers_visited: 0, delta_layers_visited: 0, } @@ -186,6 +190,16 @@ impl ValuesReconstructState { } } + /// On hitting image layer, we can mark all keys in this range as done, because + /// if the image layer does not contain a key, it is deleted/never added. + pub(crate) fn on_image_layer_visited(&mut self, key_range: &Range) { + let prev_val = self.keys_with_image_coverage.replace(key_range.clone()); + assert_eq!( + prev_val, None, + "should consume the keyspace before the next iteration" + ); + } + /// Update the state collected for a given key. /// Returns true if this was the last value needed for the key and false otherwise. /// @@ -248,8 +262,12 @@ impl ValuesReconstructState { /// Returns the key space describing the keys that have /// been marked as completed since the last call to this function. - pub(crate) fn consume_done_keys(&mut self) -> KeySpace { - self.keys_done.consume_keyspace() + /// Returns individual keys done, and the image layer coverage. + pub(crate) fn consume_done_keys(&mut self) -> (KeySpace, Option>) { + ( + self.keys_done.consume_keyspace(), + self.keys_with_image_coverage.take(), + ) } } diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 6ea452b993..becd1e7a6d 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -158,6 +158,7 @@ pub struct ImageLayerInner { index_start_blk: u32, index_root_blk: u32, + key_range: Range, lsn: Lsn, file: VirtualFile, @@ -419,6 +420,7 @@ impl ImageLayerInner { file, file_id, max_vectored_read_bytes, + key_range: actual_summary.key_range, })) } @@ -478,6 +480,8 @@ impl ImageLayerInner { self.do_reads_and_update_state(reads, reconstruct_state, ctx) .await; + reconstruct_state.on_image_layer_visited(&self.key_range); + Ok(()) } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index e6b58b7166..7f2a41d90c 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -21,7 +21,7 @@ use pageserver_api::{ AUX_FILES_KEY, KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE, }, - keyspace::{KeySpaceAccum, SparseKeyPartitioning}, + keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning}, models::{ AtomicAuxFilePolicy, AuxFilePolicy, CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, @@ -348,8 +348,8 @@ pub struct Timeline { // though let's keep them both for better error visibility. pub initdb_lsn: Lsn, - /// When did we last calculate the partitioning? - partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>, + /// When did we last calculate the partitioning? Make it pub to test cases. 
+ pub(super) partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>, /// Configuration: how often should the partitioning be recalculated. repartition_threshold: u64, @@ -483,6 +483,11 @@ impl GcCutoffs { } } +pub(crate) struct TimelineVisitOutcome { + completed_keyspace: KeySpace, + image_covered_keyspace: KeySpace, +} + /// An error happened in a get() operation. #[derive(thiserror::Error, Debug)] pub(crate) enum PageReconstructError { @@ -507,6 +512,13 @@ pub(crate) enum PageReconstructError { MissingKey(MissingKeyError), } +impl GetVectoredError { + #[cfg(test)] + pub(crate) fn is_missing_key_error(&self) -> bool { + matches!(self, Self::MissingKey(_)) + } +} + #[derive(Debug)] pub struct MissingKeyError { key: Key, @@ -3300,12 +3312,15 @@ impl Timeline { let mut cont_lsn = Lsn(request_lsn.0 + 1); - loop { + let missing_keyspace = loop { if self.cancel.is_cancelled() { return Err(GetVectoredError::Cancelled); } - let completed = Self::get_vectored_reconstruct_data_timeline( + let TimelineVisitOutcome { + completed_keyspace: completed, + image_covered_keyspace, + } = Self::get_vectored_reconstruct_data_timeline( timeline, keyspace.clone(), cont_lsn, @@ -3324,12 +3339,31 @@ impl Timeline { ranges: vec![NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE], }); - // Keyspace is fully retrieved, no ancestor timeline, or metadata scan (where we do not look - // into ancestor timelines). TODO: is there any other metadata which we want to inherit? - if keyspace.total_raw_size() == 0 || timeline.ancestor_timeline.is_none() { - break; + // Keyspace is fully retrieved + if keyspace.is_empty() { + break None; } + // Not fully retrieved but no ancestor timeline. + if timeline.ancestor_timeline.is_none() { + break Some(keyspace); + } + + // Now we see if there are keys covered by the image layer but does not exist in the + // image layer, which means that the key does not exist. + + // The block below will stop the vectored search if any of the keys encountered an image layer + // which did not contain a snapshot for said key. Since we have already removed all completed + // keys from `keyspace`, we expect there to be no overlap between it and the image covered key + // space. If that's not the case, we had at least one key encounter a gap in the image layer + // and stop the search as a result of that. + let removed = keyspace.remove_overlapping_with(&image_covered_keyspace); + if !removed.is_empty() { + break Some(removed); + } + // If we reached this point, `remove_overlapping_with` should not have made any change to the + // keyspace. + // Take the min to avoid reconstructing a page with data newer than request Lsn. 
cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1)); timeline_owned = timeline @@ -3337,14 +3371,14 @@ impl Timeline { .await .map_err(GetVectoredError::GetReadyAncestorError)?; timeline = &*timeline_owned; - } + }; - if keyspace.total_raw_size() != 0 { + if let Some(missing_keyspace) = missing_keyspace { return Err(GetVectoredError::MissingKey(MissingKeyError { - key: keyspace.start().unwrap(), /* better if we can store the full keyspace */ + key: missing_keyspace.start().unwrap(), /* better if we can store the full keyspace */ shard: self .shard_identity - .get_shard_number(&keyspace.start().unwrap()), + .get_shard_number(&missing_keyspace.start().unwrap()), cont_lsn, request_lsn, ancestor_lsn: Some(timeline.ancestor_lsn), @@ -3369,6 +3403,9 @@ impl Timeline { /// /// At each iteration pop the top of the fringe (the layer with the highest Lsn) /// and get all the required reconstruct data from the layer in one go. + /// + /// Returns the completed keyspace and the keyspaces with image coverage. The caller + /// decides how to deal with these two keyspaces. async fn get_vectored_reconstruct_data_timeline( timeline: &Timeline, keyspace: KeySpace, @@ -3376,20 +3413,27 @@ impl Timeline { reconstruct_state: &mut ValuesReconstructState, cancel: &CancellationToken, ctx: &RequestContext, - ) -> Result { + ) -> Result { let mut unmapped_keyspace = keyspace.clone(); let mut fringe = LayerFringe::new(); let mut completed_keyspace = KeySpace::default(); + let mut image_covered_keyspace = KeySpaceRandomAccum::new(); loop { if cancel.is_cancelled() { return Err(GetVectoredError::Cancelled); } - let keys_done_last_step = reconstruct_state.consume_done_keys(); + let (keys_done_last_step, keys_with_image_coverage) = + reconstruct_state.consume_done_keys(); unmapped_keyspace.remove_overlapping_with(&keys_done_last_step); completed_keyspace.merge(&keys_done_last_step); + if let Some(keys_with_image_coverage) = keys_with_image_coverage { + unmapped_keyspace + .remove_overlapping_with(&KeySpace::single(keys_with_image_coverage.clone())); + image_covered_keyspace.add_range(keys_with_image_coverage); + } // Do not descent any further if the last layer we visited // completed all keys in the keyspace it inspected. This is not @@ -3467,7 +3511,10 @@ impl Timeline { } } - Ok(completed_keyspace) + Ok(TimelineVisitOutcome { + completed_keyspace, + image_covered_keyspace: image_covered_keyspace.consume_keyspace(), + }) } /// # Cancel-safety From 6f3e043a76dd47a18180eec627ad5f5bbade6186 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Mon, 20 May 2024 17:00:47 -0700 Subject: [PATCH 0813/1571] Add some more replication slot metrics (#7761) ## Problem We want to add alerts for when people's replication slots break, and also metrics for retained WAL so that we can make warn customers when their storage gets bloated. ## Summary of changes Adds the metrics. 
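(Editor's note, not part of this patch: the figures these collectors expose can be sanity-checked by hand against pg_replication_slots. The sketch below is only a rough equivalent of the collector definitions that follow, and the DSN is a placeholder.)

    import psycopg2

    # Placeholder DSN; point it at the compute whose slots should be inspected.
    DSN = "postgresql://localhost:5432/postgres"

    # Rough equivalent of the retained_wal and wal_is_lost collectors.
    QUERY = """
        SELECT slot_name,
               pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal,
               (wal_status = 'lost')::int AS wal_is_lost
        FROM pg_replication_slots
        WHERE restart_lsn IS NOT NULL
    """

    with psycopg2.connect(DSN) as conn, conn.cursor() as cur:
        cur.execute(QUERY)
        for slot_name, retained_wal, wal_is_lost in cur.fetchall():
            print(f"{slot_name}: retained_wal={retained_wal} bytes, wal_is_lost={wal_is_lost}")
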
Addresses https://github.com/neondatabase/neon/issues/7593 --- vm-image-spec.yaml | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index fa7cd014bf..0f9d56e466 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -254,8 +254,8 @@ files: select case when pg_catalog.pg_is_in_recovery() - then pg_last_wal_replay_lsn() - else pg_current_wal_lsn() + then (pg_last_wal_replay_lsn() - '0/0')::FLOAT8 + else (pg_current_wal_lsn() - '0/0')::FLOAT8 end as lsn; - metric_name: replication_delay_bytes @@ -294,6 +294,9 @@ files: query: | SELECT checkpoints_timed FROM pg_stat_bgwriter; + # In all the below metrics, we cast LSNs to floats because Prometheus only supports floats. + # It's probably fine because float64 can store integers from -2^53 to +2^53 exactly. + # Number of slots is limited by max_replication_slots, so collecting position for all of them shouldn't be bad. - metric_name: logical_slot_restart_lsn type: gauge @@ -302,7 +305,32 @@ files: - slot_name values: [restart_lsn] query: | - select slot_name, restart_lsn from pg_replication_slots where slot_type = 'logical'; + select slot_name, (restart_lsn - '0/0')::FLOAT8 from pg_replication_slots where slot_type = 'logical'; + + - metric_name: retained_wal + type: gauge + help: 'Retained WAL in inactive replication slots' + key_labels: + - slot_name + values: [retained_wal] + query: | + SELECT slot_name, pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal + FROM pg_replication_slots + WHERE active = false; + + - metric_name: wal_is_lost + type: gauge + help: 'Whether or not the replication slot\'s wal_status is lost' + key_labels: + - slot_name + values: [wal_status_is_lost] + query: | + SELECT slot_name, + CASE + WHEN wal_status = 'lost' THEN 1 + ELSE 0 + END AS wal_status_is_lost + FROM pg_replication_slots; - filename: neon_collector_autoscaling.yml content: | collector_name: neon_collector_autoscaling From baeb58432f77809f68eb648481e81ed5af15a8b0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 21 May 2024 10:48:17 +0000 Subject: [PATCH 0814/1571] build(deps): bump requests from 2.31.0 to 2.32.0 (#7816) --- poetry.lock | 21 ++++++++++++++++----- pyproject.toml | 2 +- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/poetry.lock b/poetry.lock index ef9f572b17..25c0c7398d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2405,6 +2405,7 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, 
@@ -2529,13 +2530,13 @@ files = [ [[package]] name = "requests" -version = "2.31.0" +version = "2.32.0" description = "Python HTTP for Humans." optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"}, - {file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"}, + {file = "requests-2.32.0-py3-none-any.whl", hash = "sha256:f2c3881dddb70d056c5bd7600a4fae312b2a300e39be6a118d30b90bd27262b5"}, + {file = "requests-2.32.0.tar.gz", hash = "sha256:fa5490319474c82ef1d2c9bc459d3652e3ae4ef4c4ebdd18a21145a47ca4b6b8"}, ] [package.dependencies] @@ -2959,6 +2960,16 @@ files = [ {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"}, {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"}, {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"}, + {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"}, + {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"}, + {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"}, + {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"}, @@ -3196,4 +3207,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "dcde14c58a32bda5f123319a069352c458b3719f3c62977991eebb9803a46a9e" +content-hash = "16ebd6a46768be7f67dbdb4ee5903b167d94edc9965f29252f038c67e9e907b0" diff --git a/pyproject.toml b/pyproject.toml index ac7f9b061c..131d1121f7 100644 --- a/pyproject.toml +++ 
b/pyproject.toml @@ -10,7 +10,7 @@ pytest = "^7.4.4" psycopg2-binary = "^2.9.6" typing-extensions = "^4.6.1" PyJWT = {version = "^2.1.0", extras = ["crypto"]} -requests = "^2.31.0" +requests = "^2.32.0" pytest-xdist = "^3.3.1" asyncpg = "^0.29.0" aiopg = "^1.4.0" From 4ce6e2d2fc83ff8664eef2f80912cade71240669 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 21 May 2024 13:46:04 +0100 Subject: [PATCH 0815/1571] pageserver: fix secondary progress stats when layers are 404 (#7814) ## Problem Noticed this issue in staging. When a tenant is under somewhat heavy timeline creation/deletion thrashing, it becomes quite common for secondary downloads to encounter 404s downloading layers. This is tolerated by design, because heatmaps are not guaranteed to be up to date with what layers/timelines actually exist. However, we were not updating the SecondaryProgress structure in this case, so after such a download pass, we would leave a SecondaryProgress state with lower "downloaded" stats than "total" stats. This causes the storage controller to consider this secondary location ineligible for optimization actions such as those we do after shard splits. This issue has relatively low impact because a typical tenant will eventually upload a heatmap where we do download all the layers and thereby enable the controller to progress with migrations -- the heavy thrashing of timeline creation/deletion is an artifact of our nightly stress tests. ## Summary of changes - In the layer 404 case, subtract the skipped layer's stats from the totals, so that at the end of this download pass we should still end up in a complete state. - When updating `last_downloaded`, do a sanity check that our progress is complete. In debug builds, assert out if this is not the case. In prod builds, correct the stats and log a warning. --- pageserver/src/tenant/secondary/downloader.rs | 92 +++++++++++++------ 1 file changed, 63 insertions(+), 29 deletions(-) diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 609e1431cf..870475eb57 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -569,6 +569,39 @@ impl<'a> TenantDownloader<'a> { heatmap.timelines.len() ); + // Get or initialize the local disk state for the timelines we will update + let mut timeline_states = HashMap::new(); + for timeline in &heatmap.timelines { + let timeline_state = self + .secondary_state + .detail + .lock() + .unwrap() + .timelines + .get(&timeline.timeline_id) + .cloned(); + + let timeline_state = match timeline_state { + Some(t) => t, + None => { + // We have no existing state: need to scan local disk for layers first. + let timeline_state = + init_timeline_state(self.conf, tenant_shard_id, timeline).await; + + // Re-acquire detail lock now that we're done with async load from local FS + self.secondary_state + .detail + .lock() + .unwrap() + .timelines + .insert(timeline.timeline_id, timeline_state.clone()); + timeline_state + } + }; + + timeline_states.insert(timeline.timeline_id, timeline_state); + } + + // Clean up any local layers that aren't in the heatmap. We do this first for all timelines, on the general // principle that deletions should be done before writes wherever possible, and so that we can use this // phase to initialize our SecondaryProgress.
@@ -579,6 +612,10 @@ impl<'a> TenantDownloader<'a> { // Download the layers in the heatmap for timeline in heatmap.timelines { + let timeline_state = timeline_states + .remove(&timeline.timeline_id) + .expect("Just populated above"); + if self.secondary_state.cancel.is_cancelled() { tracing::debug!( "Cancelled before downloading timeline {}", @@ -588,7 +625,7 @@ impl<'a> TenantDownloader<'a> { } let timeline_id = timeline.timeline_id; - self.download_timeline(timeline, ctx) + self.download_timeline(timeline, timeline_state, ctx) .instrument(tracing::info_span!( "secondary_download_timeline", tenant_id=%tenant_shard_id.tenant_id, @@ -609,6 +646,22 @@ impl<'a> TenantDownloader<'a> { .unwrap_or(DEFAULT_DOWNLOAD_INTERVAL), }); + // Robustness: we should have updated progress properly, but in case we didn't, make sure + // we don't leave the tenant in a state where we claim to have successfully downloaded + // everything, but our progress is incomplete. The invariant here should be that if + // we have set `last_download` to this heatmap's etag, then the next time we see that + // etag we can safely do no work (i.e. we must be complete). + let mut progress = self.secondary_state.progress.lock().unwrap(); + debug_assert!(progress.layers_downloaded == progress.layers_total); + debug_assert!(progress.bytes_downloaded == progress.bytes_total); + if progress.layers_downloaded != progress.layers_total + || progress.bytes_downloaded != progress.bytes_total + { + tracing::warn!("Correcting drift in progress stats ({progress:?})"); + progress.layers_downloaded = progress.layers_total; + progress.bytes_downloaded = progress.bytes_total; + } + Ok(()) } @@ -784,6 +837,7 @@ impl<'a> TenantDownloader<'a> { async fn download_timeline( &self, timeline: HeatMapTimeline, + timeline_state: SecondaryDetailTimeline, ctx: &RequestContext, ) -> Result<(), UpdateError> { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -792,34 +846,6 @@ impl<'a> TenantDownloader<'a> { // Accumulate updates to the state let mut touched = Vec::new(); - // Clone a view of what layers already exist on disk - let timeline_state = self - .secondary_state - .detail - .lock() - .unwrap() - .timelines - .get(&timeline.timeline_id) - .cloned(); - - let timeline_state = match timeline_state { - Some(t) => t, - None => { - // We have no existing state: need to scan local disk for layers first. - let timeline_state = - init_timeline_state(self.conf, tenant_shard_id, &timeline).await; - - // Re-acquire detail lock now that we're done with async load from local FS - self.secondary_state - .detail - .lock() - .unwrap() - .timelines - .insert(timeline.timeline_id, timeline_state.clone()); - timeline_state - } - }; - tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len()); let mut download_futs = Vec::new(); @@ -1009,6 +1035,14 @@ impl<'a> TenantDownloader<'a> { "Skipped downloading missing layer {}, raced with compaction/gc?", layer.name ); + + // If the layer is 404, adjust the progress statistics to reflect that we will not download it. 
+ let mut progress = self.secondary_state.progress.lock().unwrap(); + progress.layers_total = progress.layers_total.saturating_sub(1); + progress.bytes_total = progress + .bytes_total + .saturating_sub(layer.metadata.file_size); + return Ok(None); } Err(e) => return Err(e.into()), From 478cc37a70c4357e7817e6c8834264be4d3e3bfd Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 21 May 2024 14:32:29 +0300 Subject: [PATCH 0816/1571] Propagate standby apply LSN to pageserver to hold off GC. To avoid pageserver gc'ing data needed by standby, propagate standby apply LSN through standby -> safekeeper -> broker -> pageserver flow and hold off GC for it. Iteration of GC resets the value to remove the horizon when standby goes away -- pushes are assumed to happen at least once between gc iterations. As a safety guard max allowed lag compared to normal GC horizon is hardcoded as 10GB. Add test for the feature. Co-authored-by: Konstantin Knizhnik --- libs/safekeeper_api/src/models.rs | 3 + pageserver/src/tenant/timeline.rs | 28 +++++- .../walreceiver/connection_manager.rs | 13 +++ safekeeper/src/broker.rs | 1 + safekeeper/src/http/routes.rs | 1 + safekeeper/src/send_wal.rs | 85 +++++++++++++++---- safekeeper/src/timeline.rs | 7 +- storage_broker/benches/rps.rs | 1 + storage_broker/proto/broker.proto | 3 + storage_broker/src/bin/storage_broker.rs | 1 + test_runner/fixtures/neon_fixtures.py | 11 +++ test_runner/regress/test_hot_standby.py | 47 +++++++--- 12 files changed, 169 insertions(+), 32 deletions(-) diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index ce5a1e411e..2fbc333075 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -50,6 +50,9 @@ pub struct SkTimelineInfo { pub safekeeper_connstr: Option, #[serde(default)] pub http_connstr: Option, + // Minimum of all active RO replicas flush LSN + #[serde(default = "lsn_invalid")] + pub standby_horizon: Lsn, } #[derive(Debug, Clone, Deserialize, Serialize)] diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 7f2a41d90c..2c43c26359 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -269,6 +269,8 @@ pub struct Timeline { // Atomic would be more appropriate here. last_freeze_ts: RwLock, + pub(crate) standby_horizon: AtomicLsn, + // WAL redo manager. `None` only for broken tenants. walredo_mgr: Option>, @@ -2279,6 +2281,8 @@ impl Timeline { compaction_lock: tokio::sync::Mutex::default(), gc_lock: tokio::sync::Mutex::default(), + standby_horizon: AtomicLsn::new(0), + timeline_get_throttle: resources.timeline_get_throttle, aux_files: tokio::sync::Mutex::new(AuxFilesState { @@ -4844,7 +4848,29 @@ impl Timeline { (horizon_cutoff, pitr_cutoff, retain_lsns) }; - let new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff); + let mut new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff); + let standby_horizon = self.standby_horizon.load(); + // Hold GC for the standby, but as a safety guard do it only within some + // reasonable lag. 
+ if standby_horizon != Lsn::INVALID { + if let Some(standby_lag) = new_gc_cutoff.checked_sub(standby_horizon) { + const MAX_ALLOWED_STANDBY_LAG: u64 = 10u64 << 30; // 10 GB + if standby_lag.0 < MAX_ALLOWED_STANDBY_LAG { + new_gc_cutoff = Lsn::min(standby_horizon, new_gc_cutoff); + trace!("holding off GC for standby apply LSN {}", standby_horizon); + } else { + warn!( + "standby is lagging for more than {}MB, not holding gc for it", + MAX_ALLOWED_STANDBY_LAG / 1024 / 1024 + ) + } + } + } + + // Reset standby horizon to ignore it if it is not updated till next GC. + // It is an easy way to unset it when standby disappears without adding + // more conf options. + self.standby_horizon.store(Lsn::INVALID); let res = self .gc_timeline(horizon_cutoff, pitr_cutoff, retain_lsns, new_gc_cutoff) diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 991e4ac045..a3c7adae44 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -705,6 +705,7 @@ impl ConnectionManagerState { commit_lsn: info.commit_lsn, safekeeper_connstr: info.safekeeper_connstr, availability_zone: info.availability_zone, + standby_horizon: info.standby_horizon, } } MessageType::SafekeeperDiscoveryResponse => { @@ -725,6 +726,17 @@ impl ConnectionManagerState { WALRECEIVER_BROKER_UPDATES.inc(); + trace!( + "safekeeper info update: standby_horizon(cutoff)={}", + timeline_update.standby_horizon + ); + if timeline_update.standby_horizon != 0 { + // ignore reports from safekeepers not connected to replicas + self.timeline + .standby_horizon + .store(Lsn(timeline_update.standby_horizon)); + } + let new_safekeeper_id = NodeId(timeline_update.safekeeper_id); let old_entry = self.wal_stream_candidates.insert( new_safekeeper_id, @@ -1094,6 +1106,7 @@ mod tests { commit_lsn, safekeeper_connstr: safekeeper_connstr.to_owned(), availability_zone: None, + standby_horizon: 0, }, latest_update, } diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index ea16ce450f..46a51438ea 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -186,6 +186,7 @@ async fn discover_loop(conf: SafeKeeperConf, stats: Arc) -> Result< commit_lsn: sk_info.commit_lsn, safekeeper_connstr: sk_info.safekeeper_connstr, availability_zone: sk_info.availability_zone, + standby_horizon: 0, }; // note this is a blocking call diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 30d0081a47..808bb1e490 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -350,6 +350,7 @@ async fn record_safekeeper_info(mut request: Request) -> Result Self { + StandbyFeedback { + reply: StandbyReply::empty(), + hs_feedback: HotStandbyFeedback::empty(), + } + } } /// WalSenders registry. Timeline holds it (wrapped in Arc). @@ -162,8 +171,8 @@ impl WalSenders { } /// Get aggregated hot standby feedback (we send it to compute). - pub fn get_hotstandby(self: &Arc) -> HotStandbyFeedback { - self.mutex.lock().agg_hs_feedback + pub fn get_hotstandby(self: &Arc) -> StandbyFeedback { + self.mutex.lock().agg_standby_feedback } /// Record new pageserver feedback, update aggregated values. 
@@ -184,6 +193,10 @@ impl WalSenders { fn record_standby_reply(self: &Arc, id: WalSenderId, reply: &StandbyReply) { let mut shared = self.mutex.lock(); let slot = shared.get_slot_mut(id); + debug!( + "Record standby reply: ts={} apply_lsn={}", + reply.reply_ts, reply.apply_lsn + ); match &mut slot.feedback { ReplicationFeedback::Standby(sf) => sf.reply = *reply, ReplicationFeedback::Pageserver(_) => { @@ -208,7 +221,7 @@ impl WalSenders { }) } } - shared.update_hs_feedback(); + shared.update_reply_feedback(); } /// Get remote_consistent_lsn reported by the pageserver. Returns None if @@ -226,13 +239,13 @@ impl WalSenders { fn unregister(self: &Arc, id: WalSenderId) { let mut shared = self.mutex.lock(); shared.slots[id] = None; - shared.update_hs_feedback(); + shared.update_reply_feedback(); } } struct WalSendersShared { // aggregated over all walsenders value - agg_hs_feedback: HotStandbyFeedback, + agg_standby_feedback: StandbyFeedback, // last feedback ever received from any pageserver, empty if none last_ps_feedback: PageserverFeedback, // total counter of pageserver feedbacks received @@ -243,7 +256,7 @@ struct WalSendersShared { impl WalSendersShared { fn new() -> Self { WalSendersShared { - agg_hs_feedback: HotStandbyFeedback::empty(), + agg_standby_feedback: StandbyFeedback::empty(), last_ps_feedback: PageserverFeedback::empty(), ps_feedback_counter: 0, slots: Vec::new(), @@ -260,10 +273,11 @@ impl WalSendersShared { self.slots[id].as_mut().expect("walsender doesn't exist") } - /// Update aggregated hot standy feedback. We just take min of valid xmins + /// Update aggregated hot standy and normal reply feedbacks. We just take min of valid xmins /// and ts. - fn update_hs_feedback(&mut self) { + fn update_reply_feedback(&mut self) { let mut agg = HotStandbyFeedback::empty(); + let mut reply_agg = StandbyReply::empty(); for ws_state in self.slots.iter().flatten() { if let ReplicationFeedback::Standby(standby_feedback) = ws_state.feedback { let hs_feedback = standby_feedback.hs_feedback; @@ -276,7 +290,7 @@ impl WalSendersShared { } else { agg.xmin = hs_feedback.xmin; } - agg.ts = min(agg.ts, hs_feedback.ts); + agg.ts = max(agg.ts, hs_feedback.ts); } if hs_feedback.catalog_xmin != INVALID_FULL_TRANSACTION_ID { if agg.catalog_xmin != INVALID_FULL_TRANSACTION_ID { @@ -284,11 +298,43 @@ impl WalSendersShared { } else { agg.catalog_xmin = hs_feedback.catalog_xmin; } - agg.ts = min(agg.ts, hs_feedback.ts); + agg.ts = max(agg.ts, hs_feedback.ts); + } + let reply = standby_feedback.reply; + if reply.write_lsn != Lsn::INVALID { + if reply_agg.write_lsn != Lsn::INVALID { + reply_agg.write_lsn = Lsn::min(reply_agg.write_lsn, reply.write_lsn); + } else { + reply_agg.write_lsn = reply.write_lsn; + } + } + if reply.flush_lsn != Lsn::INVALID { + if reply_agg.flush_lsn != Lsn::INVALID { + reply_agg.flush_lsn = Lsn::min(reply_agg.flush_lsn, reply.flush_lsn); + } else { + reply_agg.flush_lsn = reply.flush_lsn; + } + } + if reply.apply_lsn != Lsn::INVALID { + if reply_agg.apply_lsn != Lsn::INVALID { + reply_agg.apply_lsn = Lsn::min(reply_agg.apply_lsn, reply.apply_lsn); + } else { + reply_agg.apply_lsn = reply.apply_lsn; + } + } + if reply.reply_ts != 0 { + if reply_agg.reply_ts != 0 { + reply_agg.reply_ts = TimestampTz::min(reply_agg.reply_ts, reply.reply_ts); + } else { + reply_agg.reply_ts = reply.reply_ts; + } } } } - self.agg_hs_feedback = agg; + self.agg_standby_feedback = StandbyFeedback { + reply: reply_agg, + hs_feedback: agg, + }; } } @@ -793,8 +839,11 @@ mod tests { fn 
test_hs_feedback_no_valid() { let mut wss = WalSendersShared::new(); push_feedback(&mut wss, hs_feedback(1, INVALID_FULL_TRANSACTION_ID)); - wss.update_hs_feedback(); - assert_eq!(wss.agg_hs_feedback.xmin, INVALID_FULL_TRANSACTION_ID); + wss.update_reply_feedback(); + assert_eq!( + wss.agg_standby_feedback.hs_feedback.xmin, + INVALID_FULL_TRANSACTION_ID + ); } #[test] @@ -803,7 +852,7 @@ mod tests { push_feedback(&mut wss, hs_feedback(1, INVALID_FULL_TRANSACTION_ID)); push_feedback(&mut wss, hs_feedback(1, 42)); push_feedback(&mut wss, hs_feedback(1, 64)); - wss.update_hs_feedback(); - assert_eq!(wss.agg_hs_feedback.xmin, 42); + wss.update_reply_feedback(); + assert_eq!(wss.agg_standby_feedback.hs_feedback.xmin, 42); } } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 64f764f191..e97247dc7c 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -248,6 +248,7 @@ impl SharedState { &self, ttid: &TenantTimelineId, conf: &SafeKeeperConf, + standby_apply_lsn: Lsn, ) -> SafekeeperTimelineInfo { SafekeeperTimelineInfo { safekeeper_id: conf.my_id.0, @@ -270,6 +271,7 @@ impl SharedState { backup_lsn: self.sk.state.inmem.backup_lsn.0, local_start_lsn: self.sk.state.local_start_lsn.0, availability_zone: conf.availability_zone.clone(), + standby_horizon: standby_apply_lsn.0, } } @@ -663,7 +665,7 @@ impl Timeline { // if this is AppendResponse, fill in proper hot standby feedback. if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg { - resp.hs_feedback = self.walsenders.get_hotstandby(); + resp.hs_feedback = self.walsenders.get_hotstandby().hs_feedback; } commit_lsn = shared_state.sk.state.inmem.commit_lsn; @@ -716,7 +718,8 @@ impl Timeline { /// Get safekeeper info for broadcasting to broker and other peers. pub async fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SafekeeperTimelineInfo { let shared_state = self.write_shared_state().await; - shared_state.get_safekeeper_info(&self.ttid, conf) + let standby_apply_lsn = self.walsenders.get_hotstandby().reply.apply_lsn; + shared_state.get_safekeeper_info(&self.ttid, conf, standby_apply_lsn) } /// Update timeline state with peer safekeeper data. diff --git a/storage_broker/benches/rps.rs b/storage_broker/benches/rps.rs index d66cbefa45..1a6fb7fedf 100644 --- a/storage_broker/benches/rps.rs +++ b/storage_broker/benches/rps.rs @@ -147,6 +147,7 @@ async fn publish(client: Option, n_keys: u64) { http_connstr: "zenith-1-sk-1.local:7677".to_owned(), local_start_lsn: 0, availability_zone: None, + standby_horizon: 0, }; counter += 1; yield info; diff --git a/storage_broker/proto/broker.proto b/storage_broker/proto/broker.proto index 7d1b63d23f..a420fd9c66 100644 --- a/storage_broker/proto/broker.proto +++ b/storage_broker/proto/broker.proto @@ -42,6 +42,7 @@ message SafekeeperTimelineInfo { uint64 remote_consistent_lsn = 7; uint64 peer_horizon_lsn = 8; uint64 local_start_lsn = 9; + uint64 standby_horizon = 14; // A connection string to use for WAL receiving. string safekeeper_connstr = 10; // HTTP endpoint connection string @@ -105,4 +106,6 @@ message SafekeeperDiscoveryResponse { string safekeeper_connstr = 4; // Availability zone of a safekeeper. 
optional string availability_zone = 5; + // Replica apply LSN + uint64 standby_horizon = 6; } diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 8c88b61abc..0a4af543ab 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -736,6 +736,7 @@ mod tests { http_connstr: "neon-1-sk-1.local:7677".to_owned(), local_start_lsn: 0, availability_zone: None, + standby_horizon: 0, }) } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 23f30804b4..41377e2db2 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4288,6 +4288,17 @@ def wait_replica_caughtup(primary: Endpoint, secondary: Endpoint): time.sleep(1) +def log_replica_lag(primary: Endpoint, secondary: Endpoint): + last_replay_lsn = Lsn( + secondary.safe_psql_scalar("SELECT pg_last_wal_replay_lsn()", log_query=False) + ) + primary_lsn = Lsn( + primary.safe_psql_scalar("SELECT pg_current_wal_flush_lsn()", log_query=False) + ) + lag = primary_lsn - last_replay_lsn + log.info(f"primary_lsn={primary_lsn}, replay_lsn={last_replay_lsn}, lag={lag}") + + def wait_for_last_flush_lsn( env: NeonEnv, endpoint: Endpoint, diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index 179cc273ec..31f436cb4c 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -1,9 +1,20 @@ import os import re +import threading import time +from functools import partial +import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, tenant_get_shards, wait_replica_caughtup +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + PgBin, + log_replica_lag, + tenant_get_shards, + wait_replica_caughtup, +) +from fixtures.utils import wait_until # Check for corrupted WAL messages which might otherwise go unnoticed if @@ -104,19 +115,28 @@ def test_2_replicas_start(neon_simple_env: NeonEnv): wait_replica_caughtup(primary, secondary2) -# We had an issue that a standby server made GetPage requests with an -# old LSN, based on the last-written LSN cache, to avoid waits in the -# pageserver. However, requesting a page with a very old LSN, such -# that the GC horizon has already advanced past it, results in an -# error from the pageserver: -# "Bad request: tried to request a page version that was garbage collected" +# Test two different scenarios related to gc of data needed by hot standby. # -# To avoid that, the compute<-> pageserver protocol was updated so -# that that the standby now sends two LSNs, the old last-written LSN -# and the current replay LSN. +# When pause_apply is False, standby is mostly caught up with the primary. +# However, in compute <-> pageserver protocol version 1 only one LSN had been +# sent to the pageserver in page request, and to avoid waits in the pageserver +# it was last-written LSN cache value. If page hasn't been updated for a long +# time that resulted in an error from the pageserver: "Bad request: tried to +# request a page version that was garbage collected". For primary this wasn't a +# problem because pageserver always bumped LSN to the newest one; for standy +# that would be incorrect since we might get page fresher then apply LSN. 
Hence, +# in protocol version v2 two LSNs were introduced: main request_lsn (apply LSN +# in case of standby) and not_modified_since which could be used as an +# optimization to avoid waiting. # # https://github.com/neondatabase/neon/issues/6211 -def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder): +# +# When pause_apply is True we model standby lagging behind primary (e.g. due to +# high max_standby_streaming_delay). To prevent pageserver from removing data +# still needed by the standby apply LSN is propagated in standby -> safekeepers +# -> broker -> pageserver flow so that pageserver could hold off gc for it. +@pytest.mark.parametrize("pause_apply", [False, True]) +def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool): tenant_conf = { # set PITR interval to be small, so we can do GC "pitr_interval": "0 s", @@ -160,6 +180,9 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder): # so we still remember the LSNs of the pages. s_cur.execute("SELECT clear_buffer_cache()") + if pause_apply: + s_cur.execute("SELECT pg_wal_replay_pause()") + # Do other stuff on the primary, to advance the WAL p_cur.execute("CREATE TABLE test2 AS SELECT generate_series(1, 1000000) AS g") @@ -176,6 +199,8 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder): # generates use old not_modified_since LSNs, older than # the GC cutoff, but new request LSNs. (In protocol # version 1 there was only one LSN, and this failed.) + log_replica_lag(primary, secondary) s_cur.execute("SELECT COUNT(*) FROM test") + log_replica_lag(primary, secondary) res = s_cur.fetchone() assert res[0] == 10000 From f54c3b96e08f78e56d1f82f8305994e9cc8f867f Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 21 May 2024 14:42:25 +0300 Subject: [PATCH 0817/1571] Fix bugs in hot standby feedback propagation and add test for it. 
Co-authored-by: Konstantin Knizhnik --- pgxn/neon/walproposer_pg.c | 19 +++--- safekeeper/src/send_wal.rs | 9 ++- test_runner/regress/test_hot_standby.py | 88 +++++++++++++++++++++++++ 3 files changed, 104 insertions(+), 12 deletions(-) diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index e5ef93b456..492a46fd54 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -1852,34 +1852,30 @@ static void CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp) { hs->ts = 0; - hs->xmin.value = ~0; /* largest unsigned value */ - hs->catalog_xmin.value = ~0; /* largest unsigned value */ + hs->xmin = InvalidFullTransactionId; + hs->catalog_xmin = InvalidFullTransactionId; for (int i = 0; i < wp->n_safekeepers; i++) { - if (wp->safekeeper[i].appendResponse.hs.ts != 0) + + if (wp->safekeeper[i].state == SS_ACTIVE) { HotStandbyFeedback *skhs = &wp->safekeeper[i].appendResponse.hs; if (FullTransactionIdIsNormal(skhs->xmin) - && FullTransactionIdPrecedes(skhs->xmin, hs->xmin)) + && (!FullTransactionIdIsValid(hs->xmin) || FullTransactionIdPrecedes(skhs->xmin, hs->xmin))) { hs->xmin = skhs->xmin; hs->ts = skhs->ts; } if (FullTransactionIdIsNormal(skhs->catalog_xmin) - && FullTransactionIdPrecedes(skhs->catalog_xmin, hs->xmin)) + && (!FullTransactionIdIsValid(hs->catalog_xmin) || FullTransactionIdPrecedes(skhs->catalog_xmin, hs->catalog_xmin))) { hs->catalog_xmin = skhs->catalog_xmin; hs->ts = skhs->ts; } } } - - if (hs->xmin.value == ~0) - hs->xmin = InvalidFullTransactionId; - if (hs->catalog_xmin.value == ~0) - hs->catalog_xmin = InvalidFullTransactionId; } /* @@ -1946,9 +1942,10 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk) } CombineHotStanbyFeedbacks(&hsFeedback, wp); - if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &agg_hs_feedback, sizeof hsFeedback) != 0) + if (memcmp(&hsFeedback, &agg_hs_feedback, sizeof hsFeedback) != 0) { agg_hs_feedback = hsFeedback; + elog(DEBUG2, "ProcessStandbyHSFeedback(xmin=%d, catalog_xmin=%d", XidFromFullTransactionId(hsFeedback.xmin), XidFromFullTransactionId(hsFeedback.catalog_xmin)); ProcessStandbyHSFeedback(hsFeedback.ts, XidFromFullTransactionId(hsFeedback.xmin), EpochFromFullTransactionId(hsFeedback.xmin), diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 4edd09a318..5a9745e1c9 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -756,8 +756,15 @@ impl ReplyReader { match msg.first().cloned() { Some(HOT_STANDBY_FEEDBACK_TAG_BYTE) => { // Note: deserializing is on m[1..] because we skip the tag byte. - let hs_feedback = HotStandbyFeedback::des(&msg[1..]) + let mut hs_feedback = HotStandbyFeedback::des(&msg[1..]) .context("failed to deserialize HotStandbyFeedback")?; + // TODO: xmin/catalog_xmin are serialized by walreceiver.c in this way: + // pq_sendint32(&reply_message, xmin); + // pq_sendint32(&reply_message, xmin_epoch); + // So it is two big endian 32-bit words in low endian order! 
+ hs_feedback.xmin = (hs_feedback.xmin >> 32) | (hs_feedback.xmin << 32); + hs_feedback.catalog_xmin = + (hs_feedback.catalog_xmin >> 32) | (hs_feedback.catalog_xmin << 32); self.ws_guard .walsenders .record_hs_feedback(self.ws_guard.id, &hs_feedback); diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index 31f436cb4c..244d482c18 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -204,3 +204,91 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool): log_replica_lag(primary, secondary) res = s_cur.fetchone() assert res[0] == 10000 + + +def run_pgbench(connstr: str, pg_bin: PgBin): + log.info(f"Start a pgbench workload on pg {connstr}") + # s10 is about 150MB of data. In debug mode init takes about 15s on SSD. + pg_bin.run_capture(["pgbench", "-i", "-s10", connstr]) + log.info("pgbench init done") + pg_bin.run_capture(["pgbench", "-T60", connstr]) + + +# assert that pgbench_accounts and its index are created. +def pgbench_accounts_initialized(ep): + ep.safe_psql_scalar("select 'pgbench_accounts_pkey'::regclass") + + +# Test that hot_standby_feedback works in neon (it is forwarded through +# safekeepers). That is, ensure queries on standby don't fail during load on +# primary under the following conditions: +# - pgbench bombards primary with updates. +# - On the secondary we run long select of the updated table. +# - Set small max_standby_streaming_delay: hs feedback should prevent conflicts +# so apply doesn't need to wait. +# - Do agressive vacuum on primary which still shouldn't create conflicts. +# Actually this appears to be redundant due to microvacuum existence. +# +# Without hs feedback enabled we'd see 'User query might have needed to see row +# versions that must be removed.' errors. +def test_hot_standby_feedback(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): + env = neon_env_builder.init_start() + agressive_vacuum_conf = [ + "log_autovacuum_min_duration = 0", + "autovacuum_naptime = 10s", + "autovacuum_vacuum_threshold = 25", + "autovacuum_vacuum_scale_factor = 0.1", + "autovacuum_vacuum_cost_delay = -1", + ] + with env.endpoints.create_start( + branch_name="main", endpoint_id="primary", config_lines=agressive_vacuum_conf + ) as primary: + # It would be great to have more strict max_standby_streaming_delay=0s here, but then sometimes it fails with + # 'User was holding shared buffer pin for too long.'. + with env.endpoints.new_replica_start( + origin=primary, + endpoint_id="secondary", + config_lines=[ + "max_standby_streaming_delay=2s", + "neon.protocol_version=2", + "hot_standby_feedback=true", + ], + ) as secondary: + log.info( + f"primary connstr is {primary.connstr()}, secondary connstr {secondary.connstr()}" + ) + t = threading.Thread(target=run_pgbench, args=(primary.connstr(), pg_bin)) + t.start() + # Wait until pgbench_accounts is created + filled on replica *and* + # index is created. Otherwise index creation would conflict with + # read queries and hs feedback won't save us. + wait_until(60, 1.0, partial(pgbench_accounts_initialized, secondary)) + + # Test should fail if hs feedback is disabled anyway, but cross + # check that walproposer sets some xmin. 
+ def xmin_is_not_null(): + slot_xmin = primary.safe_psql_scalar( + "select xmin from pg_replication_slots where slot_name = 'wal_proposer_slot'", + log_query=False, + ) + log.info(f"xmin is {slot_xmin}") + assert int(slot_xmin) > 0 + + wait_until(10, 1.0, xmin_is_not_null) + for _ in range(1, 5): + # in debug mode takes about 5-7s + balance = secondary.safe_psql_scalar("select sum(abalance) from pgbench_accounts") + log.info(f"balance={balance}") + log_replica_lag(primary, secondary) + t.join() + + # check xmin is reset when standby is gone + def xmin_is_null(): + slot_xmin = primary.safe_psql_scalar( + "select xmin from pg_replication_slots where slot_name = 'wal_proposer_slot'", + log_query=False, + ) + log.info(f"xmin is {slot_xmin}") + assert slot_xmin is None + + wait_until(10, 1.0, xmin_is_null) From f2771a99b7cc286a936ecc27949d305d1fe7103c Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 20 May 2024 14:28:36 +0300 Subject: [PATCH 0818/1571] Add metric for pageserver standby horizon. Co-authored-by: Konstantin Knizhnik --- pageserver/src/metrics.rs | 15 +++++++++++++++ pageserver/src/tenant/timeline.rs | 3 +++ .../timeline/walreceiver/connection_manager.rs | 4 ++++ test_runner/fixtures/metrics.py | 1 + 4 files changed, 23 insertions(+) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 5315f0b936..27e25d8e32 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -525,6 +525,15 @@ static LAST_RECORD_LSN: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static STANDBY_HORIZON: Lazy = Lazy::new(|| { + register_int_gauge_vec!( + "pageserver_standby_horizon", + "Standby apply LSN for which GC is hold off, by timeline.", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + static RESIDENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_resident_physical_size", @@ -2098,6 +2107,7 @@ pub(crate) struct TimelineMetrics { pub garbage_collect_histo: StorageTimeMetrics, pub find_gc_cutoffs_histo: StorageTimeMetrics, pub last_record_gauge: IntGauge, + pub standby_horizon_gauge: IntGauge, pub resident_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size pub current_logical_size_gauge: UIntGauge, @@ -2167,6 +2177,9 @@ impl TimelineMetrics { let last_record_gauge = LAST_RECORD_LSN .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); + let standby_horizon_gauge = STANDBY_HORIZON + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); @@ -2212,6 +2225,7 @@ impl TimelineMetrics { find_gc_cutoffs_histo, load_layer_map_histo, last_record_gauge, + standby_horizon_gauge, resident_physical_size_gauge, current_logical_size_gauge, aux_file_size_gauge, @@ -2246,6 +2260,7 @@ impl TimelineMetrics { let timeline_id = &self.timeline_id; let shard_id = &self.shard_id; let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]); { RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get()); let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 2c43c26359..b0e5275b5f 100644 --- a/pageserver/src/tenant/timeline.rs +++ 
b/pageserver/src/tenant/timeline.rs @@ -4871,6 +4871,9 @@ impl Timeline { // It is an easy way to unset it when standby disappears without adding // more conf options. self.standby_horizon.store(Lsn::INVALID); + self.metrics + .standby_horizon_gauge + .set(Lsn::INVALID.0 as i64); let res = self .gc_timeline(horizon_cutoff, pitr_cutoff, retain_lsns, new_gc_cutoff) diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index a3c7adae44..1d2ffec08f 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -735,6 +735,10 @@ impl ConnectionManagerState { self.timeline .standby_horizon .store(Lsn(timeline_update.standby_horizon)); + self.timeline + .metrics + .standby_horizon_gauge + .set(timeline_update.standby_horizon as i64); } let new_safekeeper_id = NodeId(timeline_update.safekeeper_id); diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 8fa67e75c9..8b8075f8c1 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -142,6 +142,7 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( "pageserver_resident_physical_size", "pageserver_io_operations_bytes_total", "pageserver_last_record_lsn", + "pageserver_standby_horizon", "pageserver_smgr_query_seconds_bucket", "pageserver_smgr_query_seconds_count", "pageserver_smgr_query_seconds_sum", From d43dcceef9bac464cd2b80bd8f40cfae496c7f62 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 20 May 2024 18:41:39 +0300 Subject: [PATCH 0819/1571] Minimize hot standby feedback xmins to next_xid. Hot standby feedback xmins can be greater than next_xid due to sparse update of nextXid on the pageserver (to do fewer writes it advances next xid by 1024). ProcessStandbyHSFeedback ignores such xids from the future; to fix, minimize received xmin to next_xid. Co-authored-by: Konstantin Knizhnik --- pgxn/neon/walproposer_pg.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 492a46fd54..316e23a72e 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -1944,13 +1944,26 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk) CombineHotStanbyFeedbacks(&hsFeedback, wp); if (memcmp(&hsFeedback, &agg_hs_feedback, sizeof hsFeedback) != 0) { + FullTransactionId xmin = hsFeedback.xmin; + FullTransactionId catalog_xmin = hsFeedback.catalog_xmin; + FullTransactionId next_xid = ReadNextFullTransactionId(); + /* + * Page server is updating nextXid in checkpoint every 1024 transactions, + * so feedback xmin can actually be larger than nextXid and + * function TransactionIdInRecentPast returns false in this case, + * preventing update of slot's xmin.
+ */ + if (FullTransactionIdPrecedes(next_xid, xmin)) + xmin = next_xid; + if (FullTransactionIdPrecedes(next_xid, catalog_xmin)) + catalog_xmin = next_xid; agg_hs_feedback = hsFeedback; elog(DEBUG2, "ProcessStandbyHSFeedback(xmin=%d, catalog_xmin=%d", XidFromFullTransactionId(hsFeedback.xmin), XidFromFullTransactionId(hsFeedback.catalog_xmin)); ProcessStandbyHSFeedback(hsFeedback.ts, - XidFromFullTransactionId(hsFeedback.xmin), - EpochFromFullTransactionId(hsFeedback.xmin), - XidFromFullTransactionId(hsFeedback.catalog_xmin), - EpochFromFullTransactionId(hsFeedback.catalog_xmin)); + XidFromFullTransactionId(xmin), + EpochFromFullTransactionId(xmin), + XidFromFullTransactionId(catalog_xmin), + EpochFromFullTransactionId(catalog_xmin)); } CheckGracefulShutdown(wp); From d9d471e3c43c9e47b0ffc1ce9a796bd15aacf36c Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 14 May 2024 13:49:04 -0500 Subject: [PATCH 0820/1571] Add some Python typing in a few test files --- test_runner/fixtures/neon_fixtures.py | 7 ++++++- test_runner/regress/test_pg_regress.py | 22 +++++++++++++++------- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 41377e2db2..5c865baa54 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2721,7 +2721,12 @@ class PgBin: env.update(env_add) return env - def run(self, command: List[str], env: Optional[Env] = None, cwd: Optional[str] = None): + def run( + self, + command: List[str], + env: Optional[Env] = None, + cwd: Optional[Union[str, Path]] = None, + ): """ Run one of the postgres binaries. diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 2b1b7fff34..c00a8ff6b7 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -1,8 +1,10 @@ # # This file runs pg_regress-based tests. # +from __future__ import annotations + from pathlib import Path -from typing import Optional +from typing import TYPE_CHECKING import pytest from fixtures.neon_fixtures import ( @@ -11,6 +13,12 @@ from fixtures.neon_fixtures import ( ) from fixtures.remote_storage import s3_storage +if TYPE_CHECKING: + from typing import Optional + + from fixtures.neon_fixtures import PgBin + from pytest import CaptureFixture + # Run the main PostgreSQL regression tests, in src/test/regress. # @@ -19,8 +27,8 @@ def test_pg_regress( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, build_type: str, - pg_bin, - capsys, + pg_bin: PgBin, + capsys: CaptureFixture[str], base_dir: Path, pg_distrib_dir: Path, shard_count: Optional[int], @@ -86,8 +94,8 @@ def test_pg_regress( def test_isolation( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, - pg_bin, - capsys, + pg_bin: PgBin, + capsys: CaptureFixture[str], base_dir: Path, pg_distrib_dir: Path, shard_count: Optional[int], @@ -142,8 +150,8 @@ def test_isolation( def test_sql_regress( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, - pg_bin, - capsys, + pg_bin: PgBin, + capsys: CaptureFixture[str], base_dir: Path, pg_distrib_dir: Path, shard_count: Optional[int], From e8b8ebfa1da3f528b5ebfb0a5aff7b946a8eabc1 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Fri, 17 May 2024 13:39:33 -0500 Subject: [PATCH 0821/1571] Allow check_restored_datadir_content to ignore certain files Some files may have known differences that we are okay with. 
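(Editor's note, not part of this patch: for illustration, a caller could use the new parameter roughly as below. The relation path is a made-up placeholder; the fixture and helper names are the ones shown in the diffs here.)

    from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content

    def test_datadir_with_known_diff(neon_simple_env: NeonEnv, test_output_dir):
        env = neon_simple_env
        endpoint = env.endpoints.create_start("main")
        endpoint.safe_psql("CREATE UNLOGGED TABLE iut (id int)")
        # Unlogged relations are shipped as init forks in the basebackup, so their
        # main-fork files are expected to differ; exclude them from the comparison.
        check_restored_datadir_content(
            test_output_dir, env, endpoint, ignored_files=["base/16385/16386"]
        )
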
--- test_runner/fixtures/neon_fixtures.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 5c865baa54..7a660b64ed 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4150,7 +4150,12 @@ def list_files_to_compare(pgdata_dir: Path) -> List[str]: # pg is the existing and running compute node, that we want to compare with a basebackup -def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint: Endpoint): +def check_restored_datadir_content( + test_output_dir: Path, + env: NeonEnv, + endpoint: Endpoint, + ignored_files: Optional[list[str]] = None, +): pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version) # Get the timeline ID. We need it for the 'basebackup' command @@ -4203,6 +4208,10 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint if not f.startswith("pg_xact") and not f.startswith("pg_multixact") ] + if ignored_files: + pgdata_files = [f for f in pgdata_files if f not in ignored_files] + restored_files = [f for f in restored_files if f not in ignored_files] + # check that file sets are equal assert pgdata_files == restored_files From 9a4b896636465d4d24e5b11c6a7bdb6aed83f23f Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Fri, 17 May 2024 13:50:36 -0500 Subject: [PATCH 0822/1571] Use a constant for database name in test_pg_regress --- test_runner/regress/test_pg_regress.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index c00a8ff6b7..302f94064c 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -33,6 +33,8 @@ def test_pg_regress( pg_distrib_dir: Path, shard_count: Optional[int], ): + DBNAME = "regression" + """ :param shard_count: if None, create an unsharded tenant. Otherwise create a tenant with this many shards. @@ -50,7 +52,7 @@ def test_pg_regress( # Connect to postgres and create a database called "regression". endpoint = env.endpoints.create_start("main") - endpoint.safe_psql("CREATE DATABASE regression") + endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") # Create some local directories for pg_regress to run in. runpath = test_output_dir / "regress" From 8030b8e4c5f50a2320888433a39e550bfa1ec22e Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Fri, 17 May 2024 13:56:02 -0500 Subject: [PATCH 0823/1571] Fix test_pg_regress for unlogged relations Previously we worked around file comparison issues by dropping unlogged relations in the pg_regress tests, but this would lead to an unnecessary diff when compared to upstream in our Postgres fork. Instead, we can precompute the files that we know will be different, and ignore them. 
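(Editor's note, not part of this patch: the precomputation boils down to one catalog query. A standalone sketch of the same idea, mirroring the query the test below runs through safe_psql; the connection string is a placeholder.)

    import psycopg2

    DSN = "postgresql://localhost:5432/regression"  # placeholder

    with psycopg2.connect(DSN) as conn, conn.cursor() as cur:
        cur.execute(
            """
            SELECT relkind,
                   pg_relation_filepath(pg_filenode_relation(reltablespace, relfilenode))
            FROM pg_class
            WHERE relpersistence = 'u'
            """
        )
        # Every unlogged relation's main-fork path is a known, acceptable difference.
        ignored_files = [path for _relkind, path in cur.fetchall()]
        print(ignored_files)
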
--- test_runner/regress/test_pg_regress.py | 65 +++++++++++++++++++++++++- 1 file changed, 63 insertions(+), 2 deletions(-) diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 302f94064c..885a94a557 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -4,13 +4,14 @@ from __future__ import annotations from pathlib import Path -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, cast import pytest from fixtures.neon_fixtures import ( NeonEnvBuilder, check_restored_datadir_content, ) +from fixtures.pg_version import PgVersion from fixtures.remote_storage import s3_storage if TYPE_CHECKING: @@ -87,7 +88,67 @@ def test_pg_regress( with capsys.disabled(): pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath) - check_restored_datadir_content(test_output_dir, env, endpoint) + ignored_files: Optional[list[str]] = None + + # Neon handles unlogged relations in a special manner. During a + # basebackup, we ship the init fork as the main fork. This presents a + # problem in that the endpoint's data directory and the basebackup will + # have differences and will fail the eventual file comparison. + # + # Unlogged tables were introduced in version 9.1. ALTER TABLE grew + # support for setting the persistence of a table in 9.5. The reason that + # this doesn't affect versions < 15 (but probably would between 9.1 and + # 9.5) is that all the regression tests that deal with unlogged tables + # up until that point dropped the unlogged tables or set them to logged + # at some point during the test. + # + # In version 15, Postgres grew support for unlogged sequences, and with + # that came a few more regression tests. These tests did not all drop + # the unlogged tables/sequences prior to finishing. + # + # But unlogged sequences came with a bug in that, sequences didn't + # inherit the persistence of their "parent" tables if they had one. This + # was fixed and backported to 15, thus exacerbating our problem a bit. + # + # So what we can do is just ignore file differences between the data + # directory and basebackup for unlogged relations. + results = cast( + "list[tuple[str, str]]", + endpoint.safe_psql( + """ + SELECT + relkind, + pg_relation_filepath( + pg_filenode_relation(reltablespace, relfilenode) + ) AS unlogged_relation_paths + FROM pg_class + WHERE relpersistence = 'u' + """, + dbname=DBNAME, + ), + ) + + unlogged_relation_files: list[str] = [] + for r in results: + unlogged_relation_files.append(r[1]) + # This is related to the following Postgres commit: + # + # commit ccadf73163ca88bdaa74b8223d4dde05d17f550b + # Author: Heikki Linnakangas + # Date: 2023-08-23 09:21:31 -0500 + # + # Use the buffer cache when initializing an unlogged index. + # + # This patch was backpatched to 16. Without it, the LSN in the + # page header would be 0/0 in the data directory, which wouldn't + # match the LSN generated during the basebackup, thus creating + # a difference. + if env.pg_version <= PgVersion.V15 and r[0] == "i": + unlogged_relation_files.append(f"{r[1]}_init") + + ignored_files = unlogged_relation_files + + check_restored_datadir_content(test_output_dir, env, endpoint, ignored_files=ignored_files) # Run the PostgreSQL "isolation" tests, in src/test/isolation. 
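(Editor's note, not code from any patch here: the version-dependent init-fork handling in the hunk above reduces to a small pure function. Paths below are invented for the example.)

    def files_to_ignore(unlogged_rels, pg_major: int):
        """Collect datadir paths that legitimately differ from the basebackup."""
        ignored = []
        for relkind, path in unlogged_rels:
            ignored.append(path)
            # On v15 and older, an unlogged index's "_init" fork carries a 0/0 page
            # LSN in the local datadir, so it has to be ignored as well.
            if pg_major <= 15 and relkind == "i":
                ignored.append(f"{path}_init")
        return ignored

    # A table ('r') and its index ('i') on a v15 compute:
    assert files_to_ignore([("r", "base/5/16384"), ("i", "base/5/16385")], 15) == [
        "base/5/16384",
        "base/5/16385",
        "base/5/16385_init",
    ]
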
From 781352bd8e4dca5676168fe67521b178fa91a0ec Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 20 May 2024 10:01:58 -0500 Subject: [PATCH 0824/1571] Upgrade Postgres v14 to 14.12 --- vendor/postgres-v14 | 2 +- vendor/revisions.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index d6f7e2c604..21ec61d539 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit d6f7e2c604bfc7cbc4c46bcea0a8e800f4bc778a +Subproject commit 21ec61d539d22a81fe811c2d79e26436820bc3f4 diff --git a/vendor/revisions.json b/vendor/revisions.json index c5b55762fa..ec82786109 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { "v16": ["16.2", "8ef3c33aa01631e17cb24a122776349fcc777b46"], "v15": ["15.6", "f0d6b0ef7581bd78011832e23d8420a7d2c8a83a"], - "v14": ["14.11", "d6f7e2c604bfc7cbc4c46bcea0a8e800f4bc778a"] + "v14": ["14.12", "21ec61d539d22a81fe811c2d79e26436820bc3f4"] } From 9d081851ec846523d3893f8e66e8fb5112c1e7ca Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 20 May 2024 10:03:09 -0500 Subject: [PATCH 0825/1571] Upgrade Postgres v15 to 15.7 --- vendor/postgres-v15 | 2 +- vendor/revisions.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index f0d6b0ef75..e2dbd63345 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit f0d6b0ef7581bd78011832e23d8420a7d2c8a83a +Subproject commit e2dbd63345c584de75173c27951f111249ae0016 diff --git a/vendor/revisions.json b/vendor/revisions.json index ec82786109..d022e6f455 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { "v16": ["16.2", "8ef3c33aa01631e17cb24a122776349fcc777b46"], - "v15": ["15.6", "f0d6b0ef7581bd78011832e23d8420a7d2c8a83a"], + "v15": ["15.7", "e2dbd63345c584de75173c27951f111249ae0016"], "v14": ["14.12", "21ec61d539d22a81fe811c2d79e26436820bc3f4"] } From e3415706b7b23a30f21051a8063bc257a79e3953 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 20 May 2024 10:04:04 -0500 Subject: [PATCH 0826/1571] Upgrade Postgres v16 to 16.3 --- libs/walproposer/src/walproposer.rs | 4 ++-- vendor/postgres-v16 | 2 +- vendor/revisions.json | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs index fb815607a7..f7b72b205f 100644 --- a/libs/walproposer/src/walproposer.rs +++ b/libs/walproposer/src/walproposer.rs @@ -496,9 +496,9 @@ mod tests { // TODO: When updating Postgres versions, this test will cause // problems. Postgres version in message needs updating. 
// - // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160002, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 }) + // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160003, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 }) vec![ - 103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110, 147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147, 188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1, diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 8ef3c33aa0..c271017c6c 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 8ef3c33aa01631e17cb24a122776349fcc777b46 +Subproject commit c271017c6c4846be59948766baec2ba4ace5dc9c diff --git a/vendor/revisions.json b/vendor/revisions.json index d022e6f455..a3af9331fe 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "v16": ["16.2", "8ef3c33aa01631e17cb24a122776349fcc777b46"], + "v16": ["16.3", "c271017c6c4846be59948766baec2ba4ace5dc9c"], "v15": ["15.7", "e2dbd63345c584de75173c27951f111249ae0016"], "v14": ["14.12", "21ec61d539d22a81fe811c2d79e26436820bc3f4"] } From 1988ad8db702a24afc19ce4332e1199531441a8c Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Fri, 17 May 2024 13:42:51 -0500 Subject: [PATCH 0827/1571] Extend test_unlogged to include a sequence Unlogged sequences were added in v15, so let's just test to make sure they work on Neon. --- test_runner/regress/test_unlogged.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/test_runner/regress/test_unlogged.py b/test_runner/regress/test_unlogged.py index 708bf0dfeb..137d28b9fa 100644 --- a/test_runner/regress/test_unlogged.py +++ b/test_runner/regress/test_unlogged.py @@ -1,4 +1,5 @@ from fixtures.neon_fixtures import NeonEnv, fork_at_current_lsn +from fixtures.pg_version import PgVersion # @@ -17,7 +18,8 @@ def test_unlogged(neon_simple_env: NeonEnv): cur.execute("CREATE UNLOGGED TABLE iut (id int);") # create index to test unlogged index relation as well cur.execute("CREATE UNIQUE INDEX iut_idx ON iut (id);") - cur.execute("INSERT INTO iut values (42);") + cur.execute("ALTER TABLE iut ADD COLUMN seq int GENERATED ALWAYS AS IDENTITY;") + cur.execute("INSERT INTO iut (id) values (42);") # create another compute to fetch inital empty contents from pageserver fork_at_current_lsn(env, endpoint, "test_unlogged_basebackup", "test_unlogged") @@ -26,7 +28,15 @@ def test_unlogged(neon_simple_env: NeonEnv): conn2 = endpoint2.connect() cur2 = conn2.cursor() # after restart table should be empty but valid - cur2.execute("PREPARE iut_plan (int) AS INSERT INTO iut VALUES ($1)") + cur2.execute("PREPARE iut_plan (int) AS INSERT INTO iut (id) VALUES ($1)") cur2.execute("EXECUTE iut_plan (43);") cur2.execute("SELECT * FROM iut") - assert cur2.fetchall() == [(43,)] + results = cur2.fetchall() + # Unlogged sequences were introduced in v15. 
On <= v14, the sequence created + # for the GENERATED ALWAYS AS IDENTITY column is logged, and hence it keeps + # the old value (2) on restart. While on v15 and above, it's unlogged, so it + # gets reset to 1. + if env.pg_version <= PgVersion.V14: + assert results == [(43, 2)] + else: + assert results == [(43, 1)] From 353afe4fe7dc2c5a131aa01d06d10d0d229fd84a Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 21 May 2024 16:13:54 +0100 Subject: [PATCH 0828/1571] neon_local: run controller's postgres with fsync=off (#7817) ## Problem In `test_storage_controller_many_tenants` we [occasionally](https://neon-github-public-dev.s3.amazonaws.com/reports/main/9155810417/index.html#/testresult/8fbdf57a0e859c2d) see it hit the retry limit on serializable transactions. That's likely due to a combination of relative slow fsync on the hetzner nodes running the test, and the way the test does lots of parallel timeline creations, putting high load on the drive. Running the storage controller's db with fsync=off may help here. ## Summary of changes - Set `fsync=off` in the postgres config for the database used by the storage controller in tests --- control_plane/src/storage_controller.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 96e8276f4d..b6b7ea7762 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -243,9 +243,13 @@ impl StorageController { anyhow::bail!("initdb failed with status {status}"); } + // Write a minimal config file: + // - Specify the port, since this is chosen dynamically + // - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing + // the storage controller we don't want a slow local disk to interfere with that. tokio::fs::write( &pg_data_path.join("postgresql.conf"), - format!("port = {}", self.postgres_port), + format!("port = {}\nfsync=off\n", self.postgres_port), ) .await?; }; From a8a88ba7bcf99505b1552858db0f5415974b8f8a Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 21 May 2024 20:08:43 +0300 Subject: [PATCH 0829/1571] test(detach_ancestor): ensure L0 compaction in history is ok (#7813) detaching a timeline from its ancestor can leave the resulting timeline with more L0 layers than the compaction threshold. most of the time, the detached timeline has made progress, and next time the L0 -> L1 compaction happens near the original branch point and not near the last_record_lsn. add a test to ensure that inheriting the historical L0s does not change fullbackup. additionally: - add `wait_until_completed` to test-only timeline checkpoint and compact HTTP endpoints. with `?wait_until_completed=true` the endpoints will wait until the remote client has completed uploads. 
- for delta layers, describe L0-ness with the `/layer` endpoint Cc: #6994 --- libs/pageserver_api/src/models.rs | 2 + pageserver/src/http/routes.rs | 11 ++ pageserver/src/tenant/storage_layer/layer.rs | 1 + .../src/tenant/storage_layer/layer_name.rs | 16 +- test_runner/fixtures/pageserver/http.py | 20 ++- test_runner/regress/test_compaction.py | 3 +- test_runner/regress/test_layer_eviction.py | 4 +- test_runner/regress/test_ondemand_download.py | 1 - .../regress/test_pageserver_layer_rolling.py | 2 +- test_runner/regress/test_sharding.py | 1 - .../regress/test_timeline_detach_ancestor.py | 157 ++++++++++++++++++ test_runner/regress/test_timeline_size.py | 2 +- 12 files changed, 200 insertions(+), 20 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 451ee1a13c..05a444d738 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -789,6 +789,8 @@ pub enum HistoricLayerInfo { lsn_end: Lsn, remote: bool, access_stats: LayerAccessStats, + + l0: bool, }, Image { layer_file_name: String, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 0eab6510ca..ec3b1141f3 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1736,6 +1736,8 @@ async fn timeline_compact_handler( if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? { flags |= CompactFlags::ForceImageLayerCreation; } + let wait_until_uploaded = + parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false); async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); @@ -1744,6 +1746,9 @@ async fn timeline_compact_handler( .compact(&cancel, flags, &ctx) .await .map_err(|e| ApiError::InternalServerError(e.into()))?; + if wait_until_uploaded { + timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?; + } json_response(StatusCode::OK, ()) } .instrument(info_span!("manual_compaction", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) @@ -1768,6 +1773,8 @@ async fn timeline_checkpoint_handler( if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? 
{ flags |= CompactFlags::ForceImageLayerCreation; } + let wait_until_uploaded = + parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false); async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); @@ -1781,6 +1788,10 @@ async fn timeline_checkpoint_handler( .await .map_err(|e| ApiError::InternalServerError(e.into()))?; + if wait_until_uploaded { + timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?; + } + json_response(StatusCode::OK, ()) } .instrument(info_span!("manual_checkpoint", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 97b349f635..45d61ce048 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1264,6 +1264,7 @@ impl LayerInner { lsn_end: lsn_range.end, remote: !resident, access_stats, + l0: crate::tenant::layer_map::LayerMap::is_l0(self.layer_desc()), } } else { let lsn = self.desc.image_layer_lsn(); diff --git a/pageserver/src/tenant/storage_layer/layer_name.rs b/pageserver/src/tenant/storage_layer/layer_name.rs index c733404693..da26e1eeb7 100644 --- a/pageserver/src/tenant/storage_layer/layer_name.rs +++ b/pageserver/src/tenant/storage_layer/layer_name.rs @@ -347,37 +347,33 @@ impl<'de> serde::de::Visitor<'de> for LayerNameVisitor { mod test { use super::*; #[test] - fn image_layer_parse() -> anyhow::Result<()> { + fn image_layer_parse() { let expected = LayerName::Image(ImageLayerName { key_range: Key::from_i128(0) ..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(), lsn: Lsn::from_hex("00000000014FED58").unwrap(), }); - let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-v1-00000001").map_err(|s| anyhow::anyhow!(s))?; + let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-v1-00000001").unwrap(); assert_eq!(parsed, expected,); // Omitting generation suffix is valid - let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").map_err(|s| anyhow::anyhow!(s))?; + let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").unwrap(); assert_eq!(parsed, expected,); - - Ok(()) } #[test] - fn delta_layer_parse() -> anyhow::Result<()> { + fn delta_layer_parse() { let expected = LayerName::Delta(DeltaLayerName { key_range: Key::from_i128(0) ..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(), lsn_range: Lsn::from_hex("00000000014FED58").unwrap() ..Lsn::from_hex("000000000154C481").unwrap(), }); - let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-v1-00000001").map_err(|s| anyhow::anyhow!(s))?; + let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-v1-00000001").unwrap(); assert_eq!(parsed, expected); // Omitting generation suffix is valid - let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").map_err(|s| anyhow::anyhow!(s))?; + let parsed = 
LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").unwrap(); assert_eq!(parsed, expected); - - Ok(()) } } diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 4d563a532b..f1f96f6d5f 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -56,20 +56,30 @@ class InMemoryLayerInfo: class HistoricLayerInfo: kind: str layer_file_name: str - layer_file_size: Optional[int] + layer_file_size: int lsn_start: str lsn_end: Optional[str] remote: bool + # None for image layers, true if pageserver thinks this is an L0 delta layer + l0: Optional[bool] @classmethod def from_json(cls, d: Dict[str, Any]) -> HistoricLayerInfo: + # instead of parsing the key range lets keep the definition of "L0" in pageserver + l0_ness = d.get("l0") + assert l0_ness is None or isinstance(l0_ness, bool) + + size = d["layer_file_size"] + assert isinstance(size, int) + return HistoricLayerInfo( kind=d["kind"], layer_file_name=d["layer_file_name"], - layer_file_size=d.get("layer_file_size"), + layer_file_size=size, lsn_start=d["lsn_start"], lsn_end=d.get("lsn_end"), remote=d["remote"], + l0=l0_ness, ) @@ -583,6 +593,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): timeline_id: TimelineId, force_repartition=False, force_image_layer_creation=False, + wait_until_uploaded=False, ): self.is_testing_enabled_or_skip() query = {} @@ -590,6 +601,8 @@ class PageserverHttpClient(requests.Session, MetricsGetter): query["force_repartition"] = "true" if force_image_layer_creation: query["force_image_layer_creation"] = "true" + if wait_until_uploaded: + query["wait_until_uploaded"] = "true" log.info(f"Requesting compact: tenant {tenant_id}, timeline {timeline_id}") res = self.put( @@ -656,6 +669,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): timeline_id: TimelineId, force_repartition=False, force_image_layer_creation=False, + wait_until_uploaded=False, ): self.is_testing_enabled_or_skip() query = {} @@ -663,6 +677,8 @@ class PageserverHttpClient(requests.Session, MetricsGetter): query["force_repartition"] = "true" if force_image_layer_creation: query["force_image_layer_creation"] = "true" + if wait_until_uploaded: + query["wait_until_uploaded"] = "true" log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}") res = self.put( diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 93a16620a3..6a3515e1bd 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -165,7 +165,6 @@ def test_sharding_compaction( image_layer_sizes[layer.layer_file_name] = layer.layer_file_size # Pageserver should assert rather than emit an empty layer file, but double check here - assert layer.layer_file_size is not None assert layer.layer_file_size > 0 shard_has_image_layers.append(len(image_layer_sizes) > 1) @@ -178,7 +177,7 @@ def test_sharding_compaction( # # We only do this check with tiny stripes, because large stripes may not give all shards enough # data to have statistically significant image layers - avg_size = sum(v for v in image_layer_sizes.values()) / len(image_layer_sizes) # type: ignore + avg_size = sum(v for v in image_layer_sizes.values()) / len(image_layer_sizes) log.info(f"Shard {shard_id} average image layer size: {avg_size}") assert avg_size > compaction_target_size / 2 diff --git a/test_runner/regress/test_layer_eviction.py 
b/test_runner/regress/test_layer_eviction.py index 0ef4f6d95b..193149ea03 100644 --- a/test_runner/regress/test_layer_eviction.py +++ b/test_runner/regress/test_layer_eviction.py @@ -272,14 +272,14 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder): resident_physical_size_metric == 0 ), "ensure that resident_physical_size metric is zero" assert resident_physical_size_metric == sum( - layer.layer_file_size or 0 for layer in info.historic_layers if not layer.remote + layer.layer_file_size for layer in info.historic_layers if not layer.remote ), "ensure that resident_physical_size metric corresponds to layer map dump" remote_physical_size_metric = ps_http.get_timeline_metric( tenant_id, timeline_id, "pageserver_remote_physical_size" ) assert remote_physical_size_metric == sum( - layer.layer_file_size or 0 for layer in info.historic_layers if layer.remote + layer.layer_file_size for layer in info.historic_layers if layer.remote ), "ensure that remote_physical_size metric corresponds to layer map dump" log.info("before runnning GC, ensure that remote_physical size is zero") diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index b51754c9e0..b137fb3a5c 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -540,7 +540,6 @@ def test_compaction_downloads_on_demand_without_image_creation(neon_env_builder: for layer in layers.historic_layers: log.info(f"pre-compact: {layer}") - assert layer.layer_file_size is not None, "we must know layer file sizes" layer_sizes += layer.layer_file_size pageserver_http.evict_layer(tenant_id, timeline_id, layer.layer_file_name) diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py index aab0536f5a..66b6185aaa 100644 --- a/test_runner/regress/test_pageserver_layer_rolling.py +++ b/test_runner/regress/test_pageserver_layer_rolling.py @@ -287,7 +287,7 @@ def test_total_size_limit(neon_env_builder: NeonEnvBuilder): total_historic_bytes += sum( layer.layer_file_size for layer in layer_map.historic_layers - if layer.layer_file_size is not None and Lsn(layer.lsn_start) > initdb_lsn + if Lsn(layer.lsn_start) > initdb_lsn ) total_ephemeral_layers += len(layer_map.in_memory_layers) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 1bfeec6f4b..bbb1ad0c6d 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -632,7 +632,6 @@ def test_sharding_ingest_layer_sizes( historic_layers = sorted(layer_map.historic_layers, key=lambda layer: layer.lsn_start) for layer in historic_layers: - assert layer.layer_file_size is not None if layer.layer_file_size < expect_layer_size // 2: classification = "Small" small_layer_count += 1 diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 8406de8bc1..3e435caeee 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -482,6 +482,163 @@ def test_detached_receives_flushes_while_being_detached(neon_env_builder: NeonEn env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) +def test_compaction_induced_by_detaches_in_history( + neon_env_builder: NeonEnvBuilder, test_output_dir, pg_distrib_dir, pg_bin: PgBin +): + """ + Assuming the tree of timelines: + + root + |- child1 + |- ... 
+ |- wanted_detached_child + + Each detach can add N more L0 per level, this is actually unbounded because + compaction can be arbitrarily delayed (or detach happen right before one + starts). If "wanted_detached_child" has already made progress and compacted + L1s, we want to make sure "compaction in the history" does not leave the + timeline broken. + """ + + psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")} + + env = neon_env_builder.init_start( + initial_tenant_conf={ + # we want to create layers manually so we don't branch on arbitrary + # Lsn, but we also do not want to compact L0 -> L1. + "compaction_threshold": "99999", + "compaction_period": "0s", + # shouldn't matter, but just in case + "gc_period": "0s", + } + ) + env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + client = env.pageserver.http_client() + + def delta_layers(timeline_id: TimelineId): + # shorthand for more readable formatting + return client.layer_map_info(env.initial_tenant, timeline_id).delta_layers() + + with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as ep: + ep.safe_psql("create table integers (i bigint not null);") + ep.safe_psql("insert into integers (i) values (42)") + branch_lsn = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + + client.timeline_checkpoint(env.initial_tenant, env.initial_timeline) + + assert len(delta_layers(env.initial_timeline)) == 2 + + more_good_numbers = range(0, 3) + + branches: List[Tuple[str, TimelineId]] = [("main", env.initial_timeline)] + + for num in more_good_numbers: + branch_name = f"br-{len(branches)}" + branch_timeline_id = env.neon_cli.create_branch( + branch_name, + ancestor_branch_name=branches[-1][0], + tenant_id=env.initial_tenant, + ancestor_start_lsn=branch_lsn, + ) + branches.append((branch_name, branch_timeline_id)) + + with env.endpoints.create_start(branches[-1][0], tenant_id=env.initial_tenant) as ep: + ep.safe_psql( + f"insert into integers (i) select i from generate_series({num}, {num + 100}) as s(i)" + ) + branch_lsn = wait_for_last_flush_lsn(env, ep, env.initial_tenant, branch_timeline_id) + client.timeline_checkpoint(env.initial_tenant, branch_timeline_id) + + assert len(delta_layers(branch_timeline_id)) == 1 + + # now fill in the final, most growing timeline + + branch_name, branch_timeline_id = branches[-1] + with env.endpoints.create_start(branch_name, tenant_id=env.initial_tenant) as ep: + ep.safe_psql("insert into integers (i) select i from generate_series(50, 500) s(i)") + + last_suffix = None + for suffix in range(0, 4): + ep.safe_psql(f"create table other_table_{suffix} as select * from integers") + wait_for_last_flush_lsn(env, ep, env.initial_tenant, branch_timeline_id) + client.timeline_checkpoint(env.initial_tenant, branch_timeline_id) + last_suffix = suffix + + assert last_suffix is not None + + assert len(delta_layers(branch_timeline_id)) == 5 + + client.patch_tenant_config_client_side( + env.initial_tenant, {"compaction_threshold": 5}, None + ) + + client.timeline_compact(env.initial_tenant, branch_timeline_id) + + # one more layer + ep.safe_psql(f"create table other_table_{last_suffix + 1} as select * from integers") + wait_for_last_flush_lsn(env, ep, env.initial_tenant, branch_timeline_id) + + # we need to wait here, because the detaches will do implicit tenant restart, + # and we could get unexpected layer counts + client.timeline_checkpoint(env.initial_tenant, branch_timeline_id, wait_until_uploaded=True) + + assert len([filter(lambda x: x.l0, 
delta_layers(branch_timeline_id))]) == 1 + + skip_main = branches[1:] + branch_lsn = client.timeline_detail(env.initial_tenant, branch_timeline_id)["ancestor_lsn"] + + # take the fullbackup before and after inheriting the new L0s + fullbackup_before = test_output_dir / "fullbackup-before.tar" + cmd = [ + "psql", + "--no-psqlrc", + env.pageserver.connstr(), + "-c", + f"fullbackup {env.initial_tenant} {branch_timeline_id} {branch_lsn}", + "-o", + str(fullbackup_before), + ] + pg_bin.run_capture(cmd, env=psql_env) + + for _, timeline_id in skip_main: + reparented = client.detach_ancestor(env.initial_tenant, timeline_id) + assert reparented == set(), "we have no earlier branches at any level" + + post_detach_l0s = list(filter(lambda x: x.l0, delta_layers(branch_timeline_id))) + assert len(post_detach_l0s) == 5, "should had inherited 4 L0s, have 5 in total" + + # checkpoint does compaction, which in turn decides to run, because + # there is now in total threshold number L0s even if they are not + # adjacent in Lsn space: + # + # inherited flushed during this checkpoint + # \\\\ / + # 1234X5---> lsn + # | + # l1 layers from "fill in the final, most growing timeline" + # + # branch_lsn is between 4 and first X. + client.timeline_checkpoint(env.initial_tenant, branch_timeline_id) + + post_compact_l0s = list(filter(lambda x: x.l0, delta_layers(branch_timeline_id))) + assert len(post_compact_l0s) == 1, "only the consecutive inherited L0s should be compacted" + + fullbackup_after = test_output_dir / "fullbackup_after.tar" + cmd = [ + "psql", + "--no-psqlrc", + env.pageserver.connstr(), + "-c", + f"fullbackup {env.initial_tenant} {branch_timeline_id} {branch_lsn}", + "-o", + str(fullbackup_after), + ] + pg_bin.run_capture(cmd, env=psql_env) + + # we don't need to skip any files, because zenith.signal will be identical + tar_cmp(fullbackup_before, fullbackup_after, set()) + + # TODO: # - after starting the operation, tenant is deleted # - after starting the operation, pageserver is shutdown, restarted diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index a6d06df3b6..db5297870e 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -656,7 +656,7 @@ def get_physical_size_values( client = env.pageserver.http_client() res.layer_map_file_size_sum = sum( - layer.layer_file_size or 0 + layer.layer_file_size for layer in client.layer_map_info(tenant_id, timeline_id).historic_layers ) From e3f6a07ca300549181042752c5943c71424e476a Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Tue, 21 May 2024 13:33:29 -0400 Subject: [PATCH 0830/1571] chore(pageserver): remove metrics for in-memory ingestion (#7823) The metrics was added in https://github.com/neondatabase/neon/pull/7515/ to observe if https://github.com/neondatabase/neon/pull/7467 introduces any perf regressions. The change was deployed on 5/7 and no changes are observed in the metrics. So it's safe to remove the metrics now. 
Signed-off-by: Alex Chi Z --- pageserver/src/metrics.rs | 7 ------- pageserver/src/pgdatadir_mapping.rs | 5 ----- 2 files changed, 12 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 27e25d8e32..4f2c75d308 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1867,7 +1867,6 @@ pub(crate) struct WalIngestMetrics { pub(crate) records_received: IntCounter, pub(crate) records_committed: IntCounter, pub(crate) records_filtered: IntCounter, - pub(crate) time_spent_on_ingest: Histogram, } pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| WalIngestMetrics { @@ -1891,12 +1890,6 @@ pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| WalIngestMet "Number of WAL records filtered out due to sharding" ) .expect("failed to define a metric"), - time_spent_on_ingest: register_histogram!( - "pageserver_wal_ingest_put_value_seconds", - "Actual time spent on ingesting a record", - redo_histogram_time_buckets!(), - ) - .expect("failed to define a metric"), }); pub(crate) static WAL_REDO_TIME: Lazy = Lazy::new(|| { diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index b4fc4a08ee..f9d8c1020d 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -9,7 +9,6 @@ use super::tenant::{PageReconstructError, Timeline}; use crate::context::RequestContext; use crate::keyspace::{KeySpace, KeySpaceAccum}; -use crate::metrics::WAL_INGEST; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; use crate::walrecord::NeonWalRecord; use crate::{aux_file, repository::*}; @@ -1702,8 +1701,6 @@ impl<'a> DatadirModification<'a> { pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> { let mut writer = self.tline.writer().await; - let timer = WAL_INGEST.time_spent_on_ingest.start_timer(); - let pending_nblocks = self.pending_nblocks; self.pending_nblocks = 0; @@ -1743,8 +1740,6 @@ impl<'a> DatadirModification<'a> { writer.update_directory_entries_count(kind, count as u64); } - timer.observe_duration(); - Ok(()) } From 679e031cf641d45d95705c61e121aa40781cc86f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 21 May 2024 23:31:20 +0200 Subject: [PATCH 0831/1571] Add dummy lsn lease http and page service APIs (#7815) We want to introduce a concept of temporary and expiring LSN leases. This adds both a http API as well as one for the page service to obtain temporary LSN leases. This adds a dummy implementation to unblock integration work of this API. A functional implementation of the lease feature is deferred to a later step. Fixes #7808 Co-authored-by: Joonas Koivunen --- libs/pageserver_api/src/models.rs | 16 ++++++ libs/pageserver_api/src/shard.rs | 8 +++ pageserver/src/http/openapi_spec.yml | 40 +++++++++++++++ pageserver/src/http/routes.rs | 30 ++++++++++++ pageserver/src/page_service.rs | 73 ++++++++++++++++++++++++++++ pageserver/src/tenant/mgr.rs | 9 +++- pageserver/src/tenant/timeline.rs | 16 +++++- 7 files changed, 190 insertions(+), 2 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 05a444d738..55e9c48421 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -162,6 +162,22 @@ impl std::fmt::Debug for TenantState { } } +/// A temporary lease to a specific lsn inside a timeline. +/// Access to the lsn is guaranteed by the pageserver until the expiration indicated by `valid_until`. 
+#[serde_as] +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct LsnLease { + #[serde_as(as = "SystemTimeAsRfc3339Millis")] + pub valid_until: SystemTime, +} + +serde_with::serde_conv!( + SystemTimeAsRfc3339Millis, + SystemTime, + |time: &SystemTime| humantime::format_rfc3339_millis(*time).to_string(), + |value: String| -> Result<_, humantime::TimestampError> { humantime::parse_rfc3339(&value) } +); + /// The only [`TenantState`] variants we could be `TenantState::Activating` from. #[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub enum ActivatingFrom { diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 43d9b2e48c..1c05a01926 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -559,6 +559,14 @@ impl ShardIdentity { } } + /// Obtains the shard number and count combined into a `ShardIndex`. + pub fn shard_index(&self) -> ShardIndex { + ShardIndex { + shard_count: self.count, + shard_number: self.number, + } + } + pub fn shard_slug(&self) -> String { if self.count > ShardCount(0) { format!("-{:02x}{:02x}", self.number.0, self.count.0) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 36c74ed140..107bcd4a22 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -257,6 +257,37 @@ paths: schema: $ref: "#/components/schemas/LsnByTimestampResponse" + /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/lsn_lease: + parameters: + - name: tenant_shard_id + in: path + required: true + schema: + type: string + - name: timeline_id + in: path + required: true + schema: + type: string + format: hex + post: + description: Obtain lease for the given LSN + parameters: + - name: lsn + in: query + required: true + schema: + type: string + format: hex + description: A LSN to obtain the lease for + responses: + "200": + description: OK + content: + application/json: + schema: + $ref: "#/components/schemas/LsnLease" + /v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc: parameters: - name: tenant_id @@ -980,6 +1011,15 @@ components: type: string enum: [past, present, future, nodata] + LsnLease: + type: object + required: + - valid_until + properties: + valid_until: + type: string + format: date-time + PageserverUtilization: type: object required: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index ec3b1141f3..c75e4ca5a9 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1701,6 +1701,32 @@ async fn handle_tenant_break( json_response(StatusCode::OK, ()) } +// Obtains an lsn lease on the given timeline. +async fn lsn_lease_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let lsn: Lsn = parse_query_param(&request, "lsn")? 
+ .ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'lsn' query parameter")))?; + + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + + let state = get_state(&request); + + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + let result = timeline + .make_lsn_lease(lsn, &ctx) + .map_err(|e| ApiError::InternalServerError(e.context("lsn lease http handler")))?; + + json_response(StatusCode::OK, result) +} + // Run GC immediately on given timeline. async fn timeline_gc_handler( mut request: Request, @@ -2712,6 +2738,10 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_timestamp_of_lsn", |r| api_handler(r, get_timestamp_of_lsn_handler), ) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/lsn_lease", + |r| api_handler(r, lsn_lease_handler), + ) .put( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/do_gc", |r| api_handler(r, timeline_gc_handler), diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 35aba044b2..c066f56c17 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -19,6 +19,7 @@ use pageserver_api::models::{ }; use pageserver_api::shard::ShardIndex; use pageserver_api::shard::ShardNumber; +use pageserver_api::shard::TenantShardId; use postgres_backend::{is_expected_io_error, AuthType, PostgresBackend, QueryError}; use pq_proto::framed::ConnectionError; use pq_proto::FeStartupPacket; @@ -33,6 +34,7 @@ use std::str::FromStr; use std::sync::Arc; use std::time::Duration; use std::time::Instant; +use std::time::SystemTime; use tokio::io::AsyncWriteExt; use tokio::io::{AsyncRead, AsyncWrite}; use tokio_util::io::StreamReader; @@ -905,6 +907,39 @@ impl PageServerHandler { } } + #[instrument(skip_all, fields(shard_id, %lsn))] + async fn handle_make_lsn_lease( + &self, + pgb: &mut PostgresBackend, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result<(), QueryError> + where + IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, + { + let shard_selector = ShardSelector::Known(tenant_shard_id.to_index()); + let timeline = self + .get_active_tenant_timeline(tenant_shard_id.tenant_id, timeline_id, shard_selector) + .await?; + let lease = timeline.make_lsn_lease(lsn, ctx)?; + let valid_until = lease + .valid_until + .duration_since(SystemTime::UNIX_EPOCH) + .map_err(|e| QueryError::Other(e.into()))?; + + pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col( + b"valid_until", + )]))? + .write_message_noflush(&BeMessage::DataRow(&[Some( + &valid_until.as_millis().to_be_bytes(), + )]))? 
+ .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + + Ok(()) + } + #[instrument(skip_all, fields(shard_id))] async fn handle_get_rel_exists_request( &mut self, @@ -1802,6 +1837,44 @@ where // important because psycopg2 executes "SET datestyle TO 'ISO'" // on connect pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + } else if query_string.starts_with("lease lsn ") { + let (_, params_raw) = query_string.split_at("lease lsn ".len()); + let params = params_raw.split_whitespace().collect::>(); + if params.len() != 3 { + return Err(QueryError::Other(anyhow::anyhow!( + "invalid param number {} for lease lsn command", + params.len() + ))); + } + + let tenant_shard_id = TenantShardId::from_str(params[0]) + .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; + let timeline_id = TimelineId::from_str(params[1]) + .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; + + tracing::Span::current() + .record("tenant_id", field::display(tenant_shard_id)) + .record("timeline_id", field::display(timeline_id)); + + self.check_permission(Some(tenant_shard_id.tenant_id))?; + + // The caller is responsible for providing correct lsn. + let lsn = Lsn::from_str(params[2]) + .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?; + + match self + .handle_make_lsn_lease(pgb, tenant_shard_id, timeline_id, lsn, &ctx) + .await + { + Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?, + Err(e) => { + error!("error obtaining lsn lease for {lsn}: {e:?}"); + pgb.write_message_noflush(&BeMessage::ErrorResponse( + &e.to_string(), + Some(e.pg_error_code()), + ))? + } + }; } else if query_string.starts_with("show ") { // show let (_, params_raw) = query_string.split_at("show ".len()); diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 1d8e2cf6d3..89fdf31849 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -7,7 +7,7 @@ use itertools::Itertools; use pageserver_api::key::Key; use pageserver_api::models::LocationConfigMode; use pageserver_api::shard::{ - ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId, + ShardCount, ShardIdentity, ShardIndex, ShardNumber, ShardStripeSize, TenantShardId, }; use pageserver_api::upcall_api::ReAttachResponseTenant; use rand::{distributions::Alphanumeric, Rng}; @@ -127,6 +127,8 @@ pub(crate) enum ShardSelector { First, /// Pick the shard that holds this key Page(Key), + /// The shard ID is known: pick the given shard + Known(ShardIndex), } /// A convenience for use with the re_attach ControlPlaneClient function: rather @@ -2067,6 +2069,11 @@ impl TenantManager { return ShardResolveResult::Found(tenant.clone()); } } + ShardSelector::Known(shard) + if tenant.shard_identity.shard_index() == shard => + { + return ShardResolveResult::Found(tenant.clone()); + } _ => continue, } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b0e5275b5f..1f8ee9ffc4 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -25,7 +25,7 @@ use pageserver_api::{ models::{ AtomicAuxFilePolicy, AuxFilePolicy, CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, - TimelineState, + LsnLease, TimelineState, }, reltag::BlockNumber, shard::{ShardIdentity, ShardNumber, TenantShardId}, @@ -1532,6 +1532,20 @@ impl Timeline { Ok(()) } + /// Obtains a temporary lease blocking 
garbage collection for the given LSN + pub(crate) fn make_lsn_lease( + &self, + _lsn: Lsn, + _ctx: &RequestContext, + ) -> anyhow::Result { + const LEASE_LENGTH: Duration = Duration::from_secs(5 * 60); + let lease = LsnLease { + valid_until: SystemTime::now() + LEASE_LENGTH, + }; + // TODO: dummy implementation + Ok(lease) + } + /// Flush to disk all data that was written with the put_* functions #[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))] pub(crate) async fn freeze_and_flush(&self) -> anyhow::Result<()> { From 00d66e8012c2d4b0df706fb5e89ab8351e4eb402 Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Tue, 21 May 2024 16:52:48 -0700 Subject: [PATCH 0832/1571] compute_ctl: Fix handling of missing /neonvm/bin/resize-swap (#7832) The logic added in the original PR (#7434) only worked before sudo was used, because 'sudo foo' will only fail with NotFound if 'sudo' doesn't exist; if 'foo' doesn't exist, then sudo will fail with a normal error exit. This means that compute_ctl may fail to restart if it exits after successfully enabling swap. --- compute_tools/src/swap.rs | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/compute_tools/src/swap.rs b/compute_tools/src/swap.rs index c22b6bc14e..024c5b338e 100644 --- a/compute_tools/src/swap.rs +++ b/compute_tools/src/swap.rs @@ -1,3 +1,5 @@ +use std::path::Path; + use anyhow::{anyhow, Context}; use tracing::warn; @@ -17,17 +19,24 @@ pub fn resize_swap(size_bytes: u64) -> anyhow::Result<()> { .arg(size_bytes.to_string()) .spawn(); - if matches!(&child_result, Err(e) if e.kind() == std::io::ErrorKind::NotFound) { - warn!("ignoring \"not found\" error from resize-swap to avoid swapoff while compute is running"); - return Ok(()); - } - child_result .context("spawn() failed") .and_then(|mut child| child.wait().context("wait() failed")) .and_then(|status| match status.success() { true => Ok(()), - false => Err(anyhow!("process exited with {status}")), + false => { + // The command failed. Maybe it was because the resize-swap file doesn't exist? + // The --once flag causes it to delete itself on success so we don't disable swap + // while postgres is running; maybe this is fine. + match Path::new(RESIZE_SWAP_BIN).try_exists() { + Err(_) | Ok(true) => Err(anyhow!("process exited with {status}")), + // The path doesn't exist; we're actually ok + Ok(false) => { + warn!("ignoring \"not found\" error from resize-swap to avoid swapoff while compute is running"); + Ok(()) + }, + } + } }) // wrap any prior error with the overall context that we couldn't run the command .with_context(|| { From bd5cb9e86b2f8623b2528df9fa59daa21759e183 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Wed, 22 May 2024 09:34:39 +0100 Subject: [PATCH 0833/1571] Implement timeline_manager for safekeeper background tasks (#7768) In safekeepers we have several background tasks. Previously `WAL backup` task was spawned by another task called `wal_backup_launcher`. That task received notifications via `wal_backup_launcher_rx` and decided to spawn or kill existing backup task associated with the timeline. This was inconvenient because each code segment that touched shared state was responsible for pushing notification into `wal_backup_launcher_tx` channel. This was error prone because it's easy to miss and could lead to deadlock in some cases, if notification pushing was done in the wrong order. 
We also had a similar issue with `is_active` timeline flag. That flag was calculated based on the state and code modifying the state had to call function to update the flag. We had a few bugs related to that, when we forgot to update `is_active` flag in some places where it could change. To fix these issues, this PR adds a new `timeline_manager` background task associated with each timeline. This task is responsible for managing all background tasks, including `is_active` flag which is used for pushing broker messages. It is subscribed for updates in timeline state in a loop and decides to spawn/kill background tasks when needed. There is a new structure called `TimelinesSet`. It stores a set of `Arc` and allows to copy the set to iterate without holding the mutex. This is what replaced `is_active` flag for the broker. Now broker push task holds a reference to the `TimelinesSet` with active timelines and use it instead of iterating over all timelines and filtering by `is_active` flag. Also added some metrics for manager iterations and active backup tasks. Ideally manager should be doing not too many iterations and we should not have a lot of backup tasks spawned at the same time. Fixes #7751 --------- Co-authored-by: Arseny Sher --- safekeeper/src/bin/safekeeper.rs | 17 +- safekeeper/src/broker.rs | 11 +- safekeeper/src/lib.rs | 2 + safekeeper/src/metrics.rs | 35 ++- safekeeper/src/receive_wal.rs | 57 ++-- safekeeper/src/remove_wal.rs | 19 +- safekeeper/src/timeline.rs | 404 +++++++++++-------------- safekeeper/src/timeline_manager.rs | 153 ++++++++++ safekeeper/src/timelines_global_map.rs | 60 ++-- safekeeper/src/timelines_set.rs | 90 ++++++ safekeeper/src/wal_backup.rs | 201 +++++------- 11 files changed, 587 insertions(+), 462 deletions(-) create mode 100644 safekeeper/src/timeline_manager.rs create mode 100644 safekeeper/src/timelines_set.rs diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 09c565ce71..aee3898ac7 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -20,7 +20,6 @@ use std::str::FromStr; use std::sync::Arc; use std::time::Duration; use storage_broker::Uri; -use tokio::sync::mpsc; use tracing::*; use utils::pid_file; @@ -30,13 +29,13 @@ use safekeeper::defaults::{ DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, }; +use safekeeper::remove_wal; use safekeeper::wal_service; use safekeeper::GlobalTimelines; use safekeeper::SafeKeeperConf; use safekeeper::{broker, WAL_SERVICE_RUNTIME}; use safekeeper::{control_file, BROKER_RUNTIME}; use safekeeper::{http, WAL_REMOVER_RUNTIME}; -use safekeeper::{remove_wal, WAL_BACKUP_RUNTIME}; use safekeeper::{wal_backup, HTTP_RUNTIME}; use storage_broker::DEFAULT_ENDPOINT; use utils::auth::{JwtAuth, Scope, SwappableJwtAuth}; @@ -377,8 +376,6 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { let timeline_collector = safekeeper::metrics::TimelineCollector::new(); metrics::register_internal(Box::new(timeline_collector))?; - let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100); - wal_backup::init_remote_storage(&conf); // Keep handles to main tasks to die if any of them disappears. 
@@ -391,19 +388,9 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { let current_thread_rt = conf .current_thread_runtime .then(|| Handle::try_current().expect("no runtime in main")); - let conf_ = conf.clone(); - let wal_backup_handle = current_thread_rt - .as_ref() - .unwrap_or_else(|| WAL_BACKUP_RUNTIME.handle()) - .spawn(wal_backup::wal_backup_launcher_task_main( - conf_, - wal_backup_launcher_rx, - )) - .map(|res| ("WAL backup launcher".to_owned(), res)); - tasks_handles.push(Box::pin(wal_backup_handle)); // Load all timelines from disk to memory. - GlobalTimelines::init(conf.clone(), wal_backup_launcher_tx).await?; + GlobalTimelines::init(conf.clone()).await?; let conf_ = conf.clone(); // Run everything in current thread rt, if asked. diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index 46a51438ea..7cc2142291 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -46,6 +46,8 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { return Ok(()); } + let active_timelines_set = GlobalTimelines::get_global_broker_active_set(); + let mut client = storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?; let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC); @@ -57,15 +59,9 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { // sensitive and there is no risk of deadlock as we don't await while // lock is held. let now = Instant::now(); - let all_tlis = GlobalTimelines::get_all(); + let all_tlis = active_timelines_set.get_all(); let mut n_pushed_tlis = 0; for tli in &all_tlis { - // filtering alternative futures::stream::iter(all_tlis) - // .filter(|tli| {let tli = tli.clone(); async move { tli.is_active().await}}).collect::>().await; - // doesn't look better, and I'm not sure how to do that without collect. - if !tli.is_active().await { - continue; - } let sk_info = tli.get_safekeeper_info(&conf).await; yield sk_info; BROKER_PUSHED_UPDATES.inc(); @@ -90,6 +86,7 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { } /// Subscribe and fetch all the interesting data from the broker. 
+#[instrument(name = "broker pull", skip_all)] async fn pull_loop(conf: SafeKeeperConf, stats: Arc) -> Result<()> { let mut client = storage_broker::connect(conf.broker_endpoint, conf.broker_keepalive_interval)?; diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 543714a54e..8d8d2cf23e 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -31,6 +31,8 @@ pub mod safekeeper; pub mod send_wal; pub mod state; pub mod timeline; +pub mod timeline_manager; +pub mod timelines_set; pub mod wal_backup; pub mod wal_backup_partial; pub mod wal_service; diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 28ae042bb3..1e965393e3 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -11,8 +11,9 @@ use futures::Future; use metrics::{ core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts}, proto::MetricFamily, - register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, Gauge, - IntCounter, IntCounterPairVec, IntCounterVec, IntGaugeVec, + register_int_counter, register_int_counter_pair, register_int_counter_pair_vec, + register_int_counter_vec, Gauge, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec, + IntGaugeVec, }; use once_cell::sync::Lazy; @@ -162,6 +163,29 @@ pub static PARTIAL_BACKUP_UPLOADED_BYTES: Lazy = Lazy::new(|| { ) .expect("Failed to register safekeeper_partial_backup_uploaded_bytes_total counter") }); +pub static MANAGER_ITERATIONS_TOTAL: Lazy = Lazy::new(|| { + register_int_counter!( + "safekeeper_manager_iterations_total", + "Number of iterations of the timeline manager task" + ) + .expect("Failed to register safekeeper_manager_iterations_total counter") +}); +pub static MANAGER_ACTIVE_CHANGES: Lazy = Lazy::new(|| { + register_int_counter!( + "safekeeper_manager_active_changes_total", + "Number of timeline active status changes in the timeline manager task" + ) + .expect("Failed to register safekeeper_manager_active_changes_total counter") +}); +pub static WAL_BACKUP_TASKS: Lazy = Lazy::new(|| { + register_int_counter_pair!( + "safekeeper_wal_backup_tasks_started_total", + "Number of active WAL backup tasks", + "safekeeper_wal_backup_tasks_finished_total", + "Number of finished WAL backup tasks", + ) + .expect("Failed to register safekeeper_wal_backup_tasks_finished_total counter") +}); pub const LABEL_UNKNOWN: &str = "unknown"; @@ -614,8 +638,7 @@ impl Collector for TimelineCollector { self.written_wal_seconds.reset(); self.flushed_wal_seconds.reset(); - let timelines = GlobalTimelines::get_all(); - let timelines_count = timelines.len(); + let timelines_count = GlobalTimelines::get_all().len(); let mut active_timelines_count = 0; // Prometheus Collector is sync, and data is stored under async lock. 
To @@ -746,9 +769,9 @@ impl Collector for TimelineCollector { async fn collect_timeline_metrics() -> Vec { let mut res = vec![]; - let timelines = GlobalTimelines::get_all(); + let active_timelines = GlobalTimelines::get_global_broker_active_set().get_all(); - for tli in timelines { + for tli in active_timelines { if let Some(info) = tli.info_for_metrics().await { res.push(info); } diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 0356def7df..03cfa882c4 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -45,6 +45,9 @@ const DEFAULT_FEEDBACK_CAPACITY: usize = 8; pub struct WalReceivers { mutex: Mutex, pageserver_feedback_tx: tokio::sync::broadcast::Sender, + + num_computes_tx: tokio::sync::watch::Sender, + num_computes_rx: tokio::sync::watch::Receiver, } /// Id under which walreceiver is registered in shmem. @@ -55,16 +58,21 @@ impl WalReceivers { let (pageserver_feedback_tx, _) = tokio::sync::broadcast::channel(DEFAULT_FEEDBACK_CAPACITY); + let (num_computes_tx, num_computes_rx) = tokio::sync::watch::channel(0usize); + Arc::new(WalReceivers { mutex: Mutex::new(WalReceiversShared { slots: Vec::new() }), pageserver_feedback_tx, + num_computes_tx, + num_computes_rx, }) } /// Register new walreceiver. Returned guard provides access to the slot and /// automatically deregisters in Drop. pub fn register(self: &Arc, conn_id: Option) -> WalReceiverGuard { - let slots = &mut self.mutex.lock().slots; + let mut shared = self.mutex.lock(); + let slots = &mut shared.slots; let walreceiver = WalReceiverState { conn_id, status: WalReceiverStatus::Voting, @@ -78,6 +86,9 @@ impl WalReceivers { slots.push(Some(walreceiver)); pos }; + + self.update_num(&shared); + WalReceiverGuard { id: pos, walreceivers: self.clone(), @@ -99,7 +110,18 @@ impl WalReceivers { /// Get number of walreceivers (compute connections). pub fn get_num(self: &Arc) -> usize { - self.mutex.lock().slots.iter().flatten().count() + self.mutex.lock().get_num() + } + + /// Get channel for number of walreceivers. + pub fn get_num_rx(self: &Arc) -> tokio::sync::watch::Receiver { + self.num_computes_rx.clone() + } + + /// Should get called after every update of slots. + fn update_num(self: &Arc, shared: &MutexGuard) { + let num = shared.get_num(); + self.num_computes_tx.send_replace(num); } /// Get state of all walreceivers. @@ -123,6 +145,7 @@ impl WalReceivers { fn unregister(self: &Arc, id: WalReceiverId) { let mut shared = self.mutex.lock(); shared.slots[id] = None; + self.update_num(&shared); } /// Broadcast pageserver feedback to connected walproposers. @@ -137,6 +160,13 @@ struct WalReceiversShared { slots: Vec>, } +impl WalReceiversShared { + /// Get number of walreceivers (compute connections). + fn get_num(&self) -> usize { + self.slots.iter().flatten().count() + } +} + #[derive(Debug, Clone, Serialize, Deserialize)] pub struct WalReceiverState { /// None means it is recovery initiated by us (this safekeeper). @@ -456,14 +486,7 @@ impl WalAcceptor { /// The main loop. Returns Ok(()) if either msg_rx or reply_tx got closed; /// it must mean that network thread terminated. async fn run(&mut self) -> anyhow::Result<()> { - // Register the connection and defer unregister. - // Order of the next two lines is important: we want first to remove our entry and then - // update status which depends on registered connections. 
- let _compute_conn_guard = ComputeConnectionGuard { - timeline: Arc::clone(&self.tli), - }; let walreceiver_guard = self.tli.get_walreceivers().register(self.conn_id); - self.tli.update_status_notify().await?; // After this timestamp we will stop processing AppendRequests and send a response // to the walproposer. walproposer sends at least one AppendRequest per second, @@ -529,19 +552,3 @@ impl WalAcceptor { } } } - -/// Calls update_status_notify in drop to update timeline status. -struct ComputeConnectionGuard { - timeline: Arc, -} - -impl Drop for ComputeConnectionGuard { - fn drop(&mut self) { - let tli = self.timeline.clone(); - tokio::spawn(async move { - if let Err(e) = tli.update_status_notify().await { - error!("failed to update timeline status: {}", e); - } - }); - } -} diff --git a/safekeeper/src/remove_wal.rs b/safekeeper/src/remove_wal.rs index 9dce06a886..98ce671182 100644 --- a/safekeeper/src/remove_wal.rs +++ b/safekeeper/src/remove_wal.rs @@ -7,29 +7,18 @@ use tracing::*; use crate::{GlobalTimelines, SafeKeeperConf}; -const ALLOW_INACTIVE_TIMELINES: bool = true; - -pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> { +pub async fn task_main(_conf: SafeKeeperConf) -> anyhow::Result<()> { let wal_removal_interval = Duration::from_millis(5000); loop { let now = tokio::time::Instant::now(); - let mut active_timelines = 0; - let tlis = GlobalTimelines::get_all(); for tli in &tlis { - let is_active = tli.is_active().await; - if is_active { - active_timelines += 1; - } - if !ALLOW_INACTIVE_TIMELINES && !is_active { - continue; - } let ttid = tli.ttid; async { if let Err(e) = tli.maybe_persist_control_file().await { warn!("failed to persist control file: {e}"); } - if let Err(e) = tli.remove_old_wal(conf.wal_backup_enabled).await { + if let Err(e) = tli.remove_old_wal().await { error!("failed to remove WAL: {}", e); } } @@ -42,8 +31,8 @@ pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> { if elapsed > wal_removal_interval { info!( - "WAL removal is too long, processed {} active timelines ({} total) in {:?}", - active_timelines, total_timelines, elapsed + "WAL removal is too long, processed {} timelines in {:?}", + total_timelines, elapsed ); } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index e97247dc7c..da2e3f4538 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -8,13 +8,12 @@ use serde::{Deserialize, Serialize}; use tokio::fs; use std::cmp::max; +use std::ops::{Deref, DerefMut}; +use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; use std::time::Duration; -use tokio::sync::{Mutex, MutexGuard}; -use tokio::{ - sync::{mpsc::Sender, watch}, - time::Instant, -}; +use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; +use tokio::{sync::watch, time::Instant}; use tracing::*; use utils::http::error::ApiError; use utils::{ @@ -33,12 +32,13 @@ use crate::safekeeper::{ }; use crate::send_wal::WalSenders; use crate::state::{TimelineMemState, TimelinePersistentState}; +use crate::timelines_set::TimelinesSet; use crate::wal_backup::{self}; use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION}; use crate::metrics::FullTimelineInfo; use crate::wal_storage::Storage as wal_storage_iface; -use crate::{debug_dump, wal_backup_partial, wal_storage}; +use crate::{debug_dump, timeline_manager, wal_backup_partial, wal_storage}; use crate::{GlobalTimelines, SafeKeeperConf}; /// Things safekeeper should know about timeline state on peers. 
@@ -51,8 +51,7 @@ pub struct PeerInfo { /// LSN of the last record. pub flush_lsn: Lsn, pub commit_lsn: Lsn, - /// Since which LSN safekeeper has WAL. TODO: remove this once we fill new - /// sk since backup_lsn. + /// Since which LSN safekeeper has WAL. pub local_start_lsn: Lsn, /// When info was received. Serde annotations are not very useful but make /// the code compile -- we don't rely on this field externally. @@ -97,25 +96,72 @@ impl PeersInfo { } } +pub type ReadGuardSharedState<'a> = RwLockReadGuard<'a, SharedState>; + +/// WriteGuardSharedState is a wrapper around `RwLockWriteGuard` that +/// automatically updates `watch::Sender` channels with state on drop. +pub struct WriteGuardSharedState<'a> { + tli: Arc, + guard: RwLockWriteGuard<'a, SharedState>, +} + +impl<'a> WriteGuardSharedState<'a> { + fn new(tli: Arc, guard: RwLockWriteGuard<'a, SharedState>) -> Self { + WriteGuardSharedState { tli, guard } + } +} + +impl<'a> Deref for WriteGuardSharedState<'a> { + type Target = SharedState; + + fn deref(&self) -> &Self::Target { + &self.guard + } +} + +impl<'a> DerefMut for WriteGuardSharedState<'a> { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.guard + } +} + +impl<'a> Drop for WriteGuardSharedState<'a> { + fn drop(&mut self) { + let term_flush_lsn = TermLsn::from((self.guard.sk.get_term(), self.guard.sk.flush_lsn())); + let commit_lsn = self.guard.sk.state.inmem.commit_lsn; + + let _ = self.tli.term_flush_lsn_watch_tx.send_if_modified(|old| { + if *old != term_flush_lsn { + *old = term_flush_lsn; + true + } else { + false + } + }); + + let _ = self.tli.commit_lsn_watch_tx.send_if_modified(|old| { + if *old != commit_lsn { + *old = commit_lsn; + true + } else { + false + } + }); + + // send notification about shared state update + self.tli.shared_state_version_tx.send_modify(|old| { + *old += 1; + }); + } +} + /// Shared state associated with database instance pub struct SharedState { /// Safekeeper object - sk: SafeKeeper, + pub(crate) sk: SafeKeeper, /// In memory list containing state of peers sent in latest messages from them. - peers_info: PeersInfo, - /// True when WAL backup launcher oversees the timeline, making sure WAL is - /// offloaded, allows to bother launcher less. - wal_backup_active: bool, - /// True whenever there is at least some pending activity on timeline: live - /// compute connection, pageserver is not caughtup (it must have latest WAL - /// for new compute start) or WAL backuping is not finished. Practically it - /// means safekeepers broadcast info to peers about the timeline, old WAL is - /// trimmed. - /// - /// TODO: it might be better to remove tli completely from GlobalTimelines - /// when tli is inactive instead of having this flag. - active: bool, - last_removed_segno: XLogSegNo, + pub(crate) peers_info: PeersInfo, + pub(crate) last_removed_segno: XLogSegNo, } impl SharedState { @@ -152,8 +198,6 @@ impl SharedState { Ok(Self { sk, peers_info: PeersInfo(vec![]), - wal_backup_active: false, - active: false, last_removed_segno: 0, }) } @@ -171,75 +215,10 @@ impl SharedState { Ok(Self { sk: SafeKeeper::new(control_store, wal_store, conf.my_id)?, peers_info: PeersInfo(vec![]), - wal_backup_active: false, - active: false, last_removed_segno: 0, }) } - fn is_active(&self, num_computes: usize) -> bool { - self.is_wal_backup_required(num_computes) - // FIXME: add tracking of relevant pageservers and check them here individually, - // otherwise migration won't work (we suspend too early). 
- || self.sk.state.inmem.remote_consistent_lsn < self.sk.state.inmem.commit_lsn - } - - /// Mark timeline active/inactive and return whether s3 offloading requires - /// start/stop action. If timeline is deactivated, control file is persisted - /// as maintenance task does that only for active timelines. - async fn update_status(&mut self, num_computes: usize, ttid: TenantTimelineId) -> bool { - let is_active = self.is_active(num_computes); - if self.active != is_active { - info!( - "timeline {} active={} now, remote_consistent_lsn={}, commit_lsn={}", - ttid, - is_active, - self.sk.state.inmem.remote_consistent_lsn, - self.sk.state.inmem.commit_lsn - ); - if !is_active { - if let Err(e) = self.sk.state.flush().await { - warn!("control file save in update_status failed: {:?}", e); - } - } - } - self.active = is_active; - self.is_wal_backup_action_pending(num_computes) - } - - /// Should we run s3 offloading in current state? - fn is_wal_backup_required(&self, num_computes: usize) -> bool { - let seg_size = self.get_wal_seg_size(); - num_computes > 0 || - // Currently only the whole segment is offloaded, so compare segment numbers. - (self.sk.state.inmem.commit_lsn.segment_number(seg_size) > - self.sk.state.inmem.backup_lsn.segment_number(seg_size)) - } - - /// Is current state of s3 offloading is not what it ought to be? - fn is_wal_backup_action_pending(&self, num_computes: usize) -> bool { - let res = self.wal_backup_active != self.is_wal_backup_required(num_computes); - if res { - let action_pending = if self.is_wal_backup_required(num_computes) { - "start" - } else { - "stop" - }; - trace!( - "timeline {} s3 offloading action {} pending: num_computes={}, commit_lsn={}, backup_lsn={}", - self.sk.state.timeline_id, action_pending, num_computes, self.sk.state.inmem.commit_lsn, self.sk.state.inmem.backup_lsn - ); - } - res - } - - /// Returns whether s3 offloading is required and sets current status as - /// matching. - fn wal_backup_attend(&mut self, num_computes: usize) -> bool { - self.wal_backup_active = self.is_wal_backup_required(num_computes); - self.wal_backup_active - } - fn get_wal_seg_size(&self) -> usize { self.sk.state.server.wal_seg_size as usize } @@ -278,7 +257,7 @@ impl SharedState { /// Get our latest view of alive peers status on the timeline. /// We pass our own info through the broker as well, so when we don't have connection /// to the broker returned vec is empty. - fn get_peers(&self, heartbeat_timeout: Duration) -> Vec { + pub(crate) fn get_peers(&self, heartbeat_timeout: Duration) -> Vec { let now = Instant::now(); self.peers_info .0 @@ -294,18 +273,13 @@ impl SharedState { /// offloading. /// While it is safe to use inmem values for determining horizon, /// we use persistent to make possible normal states less surprising. 
- fn get_horizon_segno( - &self, - wal_backup_enabled: bool, - extra_horizon_lsn: Option, - ) -> XLogSegNo { + fn get_horizon_segno(&self, extra_horizon_lsn: Option) -> XLogSegNo { let state = &self.sk.state; use std::cmp::min; let mut horizon_lsn = min(state.remote_consistent_lsn, state.peer_horizon_lsn); - if wal_backup_enabled { - horizon_lsn = min(horizon_lsn, state.backup_lsn); - } + // we don't want to remove WAL that is not yet offloaded to s3 + horizon_lsn = min(horizon_lsn, state.backup_lsn); if let Some(extra_horizon_lsn) = extra_horizon_lsn { horizon_lsn = min(horizon_lsn, extra_horizon_lsn); } @@ -346,11 +320,6 @@ impl From for ApiError { pub struct Timeline { pub ttid: TenantTimelineId, - /// Sending here asks for wal backup launcher attention (start/stop - /// offloading). Sending ttid instead of concrete command allows to do - /// sending without timeline lock. - pub wal_backup_launcher_tx: Sender, - /// Used to broadcast commit_lsn updates to all background jobs. commit_lsn_watch_tx: watch::Sender, commit_lsn_watch_rx: watch::Receiver, @@ -362,10 +331,14 @@ pub struct Timeline { term_flush_lsn_watch_tx: watch::Sender, term_flush_lsn_watch_rx: watch::Receiver, + /// Broadcasts shared state updates. + shared_state_version_tx: watch::Sender, + shared_state_version_rx: watch::Receiver, + /// Safekeeper and other state, that should remain consistent and /// synchronized with the disk. This is tokio mutex as we write WAL to disk /// while holding it, ensuring that consensus checks are in order. - mutex: Mutex, + mutex: RwLock, walsenders: Arc, walreceivers: Arc, @@ -384,15 +357,15 @@ pub struct Timeline { /// with different speed. // TODO: add `Arc` here instead of adding each field separately. walsenders_keep_horizon: bool, + + // timeline_manager controlled state + pub(crate) broker_active: AtomicBool, + pub(crate) wal_backup_active: AtomicBool, } impl Timeline { /// Load existing timeline from disk. 
- pub fn load_timeline( - conf: &SafeKeeperConf, - ttid: TenantTimelineId, - wal_backup_launcher_tx: Sender, - ) -> Result { + pub fn load_timeline(conf: &SafeKeeperConf, ttid: TenantTimelineId) -> Result { let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered(); let shared_state = SharedState::restore(conf, &ttid)?; @@ -402,23 +375,27 @@ impl Timeline { shared_state.sk.get_term(), shared_state.sk.flush_lsn(), ))); + let (shared_state_version_tx, shared_state_version_rx) = watch::channel(0); let (cancellation_tx, cancellation_rx) = watch::channel(false); let walreceivers = WalReceivers::new(); Ok(Timeline { ttid, - wal_backup_launcher_tx, commit_lsn_watch_tx, commit_lsn_watch_rx, term_flush_lsn_watch_tx, term_flush_lsn_watch_rx, - mutex: Mutex::new(shared_state), + shared_state_version_tx, + shared_state_version_rx, + mutex: RwLock::new(shared_state), walsenders: WalSenders::new(walreceivers.clone()), walreceivers, cancellation_rx, cancellation_tx, timeline_dir: conf.timeline_dir(&ttid), walsenders_keep_horizon: conf.walsenders_keep_horizon, + broker_active: AtomicBool::new(false), + wal_backup_active: AtomicBool::new(false), }) } @@ -426,7 +403,6 @@ impl Timeline { pub fn create_empty( conf: &SafeKeeperConf, ttid: TenantTimelineId, - wal_backup_launcher_tx: Sender, server_info: ServerInfo, commit_lsn: Lsn, local_start_lsn: Lsn, @@ -434,25 +410,30 @@ impl Timeline { let (commit_lsn_watch_tx, commit_lsn_watch_rx) = watch::channel(Lsn::INVALID); let (term_flush_lsn_watch_tx, term_flush_lsn_watch_rx) = watch::channel(TermLsn::from((INVALID_TERM, Lsn::INVALID))); + let (shared_state_version_tx, shared_state_version_rx) = watch::channel(0); let (cancellation_tx, cancellation_rx) = watch::channel(false); + let state = TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn); let walreceivers = WalReceivers::new(); Ok(Timeline { ttid, - wal_backup_launcher_tx, commit_lsn_watch_tx, commit_lsn_watch_rx, term_flush_lsn_watch_tx, term_flush_lsn_watch_rx, - mutex: Mutex::new(SharedState::create_new(conf, &ttid, state)?), + shared_state_version_tx, + shared_state_version_rx, + mutex: RwLock::new(SharedState::create_new(conf, &ttid, state)?), walsenders: WalSenders::new(walreceivers.clone()), walreceivers, cancellation_rx, cancellation_tx, timeline_dir: conf.timeline_dir(&ttid), walsenders_keep_horizon: conf.walsenders_keep_horizon, + broker_active: AtomicBool::new(false), + wal_backup_active: AtomicBool::new(false), }) } @@ -463,8 +444,9 @@ impl Timeline { /// and state on disk should remain unchanged. pub async fn init_new( self: &Arc, - shared_state: &mut MutexGuard<'_, SharedState>, + shared_state: &mut WriteGuardSharedState<'_>, conf: &SafeKeeperConf, + broker_active_set: Arc, ) -> Result<()> { match fs::metadata(&self.timeline_dir).await { Ok(_) => { @@ -495,16 +477,29 @@ impl Timeline { return Err(e); } - self.bootstrap(conf); + self.bootstrap(conf, broker_active_set); Ok(()) } - /// Bootstrap new or existing timeline starting background stasks. - pub fn bootstrap(self: &Arc, conf: &SafeKeeperConf) { + /// Bootstrap new or existing timeline starting background tasks. + pub fn bootstrap( + self: &Arc, + conf: &SafeKeeperConf, + broker_active_set: Arc, + ) { + // Start manager task which will monitor timeline state and update + // background tasks. + tokio::spawn(timeline_manager::main_task( + self.clone(), + conf.clone(), + broker_active_set, + )); + // Start recovery task which always runs on the timeline. 
if conf.peer_recovery_enabled { tokio::spawn(recovery_main(self.clone(), conf.clone())); } + // TODO: migrate to timeline_manager if conf.is_wal_backup_enabled() && conf.partial_backup_enabled { tokio::spawn(wal_backup_partial::main_task(self.clone(), conf.clone())); } @@ -517,10 +512,9 @@ impl Timeline { /// deletion API endpoint is retriable. pub async fn delete( &self, - shared_state: &mut MutexGuard<'_, SharedState>, + shared_state: &mut WriteGuardSharedState<'_>, only_local: bool, - ) -> Result<(bool, bool)> { - let was_active = shared_state.active; + ) -> Result { self.cancel(shared_state); // TODO: It's better to wait for s3 offloader termination before @@ -534,18 +528,12 @@ impl Timeline { wal_backup::delete_timeline(&self.ttid).await?; } let dir_existed = delete_dir(&self.timeline_dir).await?; - Ok((dir_existed, was_active)) + Ok(dir_existed) } /// Cancel timeline to prevent further usage. Background tasks will stop /// eventually after receiving cancellation signal. - /// - /// Note that we can't notify backup launcher here while holding - /// shared_state lock, as this is a potential deadlock: caller is - /// responsible for that. Generally we should probably make WAL backup tasks - /// to shut down on their own, checking once in a while whether it is the - /// time. - fn cancel(&self, shared_state: &mut MutexGuard<'_, SharedState>) { + fn cancel(&self, shared_state: &mut WriteGuardSharedState<'_>) { info!("timeline {} is cancelled", self.ttid); let _ = self.cancellation_tx.send(true); // Close associated FDs. Nobody will be able to touch timeline data once @@ -569,30 +557,12 @@ impl Timeline { } /// Take a writing mutual exclusive lock on timeline shared_state. - pub async fn write_shared_state(&self) -> MutexGuard { - self.mutex.lock().await + pub async fn write_shared_state<'a>(self: &'a Arc) -> WriteGuardSharedState<'a> { + WriteGuardSharedState::new(self.clone(), self.mutex.write().await) } - async fn update_status(&self, shared_state: &mut SharedState) -> bool { - shared_state - .update_status(self.walreceivers.get_num(), self.ttid) - .await - } - - /// Update timeline status and kick wal backup launcher to stop/start offloading if needed. - pub async fn update_status_notify(&self) -> Result<()> { - if self.is_cancelled() { - bail!(TimelineError::Cancelled(self.ttid)); - } - let is_wal_backup_action_pending: bool = { - let mut shared_state = self.write_shared_state().await; - self.update_status(&mut shared_state).await - }; - if is_wal_backup_action_pending { - // Can fail only if channel to a static thread got closed, which is not normal at all. - self.wal_backup_launcher_tx.send(self.ttid).await?; - } - Ok(()) + pub async fn read_shared_state(&self) -> ReadGuardSharedState { + self.mutex.read().await } /// Returns true if walsender should stop sending WAL to pageserver. We @@ -604,7 +574,7 @@ impl Timeline { if self.is_cancelled() { return true; } - let shared_state = self.write_shared_state().await; + let shared_state = self.read_shared_state().await; if self.walreceivers.get_num() == 0 { return shared_state.sk.state.inmem.commit_lsn == Lsn(0) || // no data at all yet reported_remote_consistent_lsn >= shared_state.sk.state.inmem.commit_lsn; @@ -612,9 +582,9 @@ impl Timeline { false } - /// Ensure taht current term is t, erroring otherwise, and lock the state. - pub async fn acquire_term(&self, t: Term) -> Result> { - let ss = self.write_shared_state().await; + /// Ensure that current term is t, erroring otherwise, and lock the state. 
+ pub async fn acquire_term(&self, t: Term) -> Result { + let ss = self.read_shared_state().await; if ss.sk.state.acceptor_state.term != t { bail!( "failed to acquire term {}, current term {}", @@ -625,18 +595,6 @@ impl Timeline { Ok(ss) } - /// Returns whether s3 offloading is required and sets current status as - /// matching it. - pub async fn wal_backup_attend(&self) -> bool { - if self.is_cancelled() { - return false; - } - - self.write_shared_state() - .await - .wal_backup_attend(self.walreceivers.get_num()) - } - /// Returns commit_lsn watch channel. pub fn get_commit_lsn_watch_rx(&self) -> watch::Receiver { self.commit_lsn_watch_rx.clone() @@ -647,9 +605,14 @@ impl Timeline { self.term_flush_lsn_watch_rx.clone() } + /// Returns watch channel for SharedState update version. + pub fn get_state_version_rx(&self) -> watch::Receiver { + self.shared_state_version_rx.clone() + } + /// Pass arrived message to the safekeeper. pub async fn process_msg( - &self, + self: &Arc, msg: &ProposerAcceptorMessage, ) -> Result> { if self.is_cancelled() { @@ -657,8 +620,6 @@ impl Timeline { } let mut rmsg: Option; - let commit_lsn: Lsn; - let term_flush_lsn: TermLsn; { let mut shared_state = self.write_shared_state().await; rmsg = shared_state.sk.process_msg(msg).await?; @@ -667,43 +628,28 @@ impl Timeline { if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg { resp.hs_feedback = self.walsenders.get_hotstandby().hs_feedback; } - - commit_lsn = shared_state.sk.state.inmem.commit_lsn; - term_flush_lsn = - TermLsn::from((shared_state.sk.get_term(), shared_state.sk.flush_lsn())); } - self.term_flush_lsn_watch_tx.send(term_flush_lsn)?; - self.commit_lsn_watch_tx.send(commit_lsn)?; Ok(rmsg) } /// Returns wal_seg_size. pub async fn get_wal_seg_size(&self) -> usize { - self.write_shared_state().await.get_wal_seg_size() - } - - /// Returns true only if the timeline is loaded and active. - pub async fn is_active(&self) -> bool { - if self.is_cancelled() { - return false; - } - - self.write_shared_state().await.active + self.read_shared_state().await.get_wal_seg_size() } /// Returns state of the timeline. pub async fn get_state(&self) -> (TimelineMemState, TimelinePersistentState) { - let state = self.write_shared_state().await; + let state = self.read_shared_state().await; (state.sk.state.inmem.clone(), state.sk.state.clone()) } /// Returns latest backup_lsn. pub async fn get_wal_backup_lsn(&self) -> Lsn { - self.write_shared_state().await.sk.state.inmem.backup_lsn + self.read_shared_state().await.sk.state.inmem.backup_lsn } /// Sets backup_lsn to the given value. - pub async fn set_wal_backup_lsn(&self, backup_lsn: Lsn) -> Result<()> { + pub async fn set_wal_backup_lsn(self: &Arc, backup_lsn: Lsn) -> Result<()> { if self.is_cancelled() { bail!(TimelineError::Cancelled(self.ttid)); } @@ -717,40 +663,34 @@ impl Timeline { /// Get safekeeper info for broadcasting to broker and other peers. pub async fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SafekeeperTimelineInfo { - let shared_state = self.write_shared_state().await; let standby_apply_lsn = self.walsenders.get_hotstandby().reply.apply_lsn; + let shared_state = self.read_shared_state().await; shared_state.get_safekeeper_info(&self.ttid, conf, standby_apply_lsn) } /// Update timeline state with peer safekeeper data. 
- pub async fn record_safekeeper_info(&self, sk_info: SafekeeperTimelineInfo) -> Result<()> { - let is_wal_backup_action_pending: bool; - let commit_lsn: Lsn; + pub async fn record_safekeeper_info( + self: &Arc, + sk_info: SafekeeperTimelineInfo, + ) -> Result<()> { { let mut shared_state = self.write_shared_state().await; shared_state.sk.record_safekeeper_info(&sk_info).await?; let peer_info = PeerInfo::from_sk_info(&sk_info, Instant::now()); shared_state.peers_info.upsert(&peer_info); - is_wal_backup_action_pending = self.update_status(&mut shared_state).await; - commit_lsn = shared_state.sk.state.inmem.commit_lsn; - } - self.commit_lsn_watch_tx.send(commit_lsn)?; - // Wake up wal backup launcher, if it is time to stop the offloading. - if is_wal_backup_action_pending { - self.wal_backup_launcher_tx.send(self.ttid).await?; } Ok(()) } /// Update in memory remote consistent lsn. - pub async fn update_remote_consistent_lsn(&self, candidate: Lsn) { + pub async fn update_remote_consistent_lsn(self: &Arc, candidate: Lsn) { let mut shared_state = self.write_shared_state().await; shared_state.sk.state.inmem.remote_consistent_lsn = max(shared_state.sk.state.inmem.remote_consistent_lsn, candidate); } pub async fn get_peers(&self, conf: &SafeKeeperConf) -> Vec { - let shared_state = self.write_shared_state().await; + let shared_state = self.read_shared_state().await; shared_state.get_peers(conf.heartbeat_timeout) } @@ -772,7 +712,7 @@ impl Timeline { /// depending on assembled quorum (e.g. classic picture 8 from Raft paper). /// Thus we don't try to predict it here. pub async fn recovery_needed(&self, heartbeat_timeout: Duration) -> RecoveryNeededInfo { - let ss = self.write_shared_state().await; + let ss = self.read_shared_state().await; let term = ss.sk.state.acceptor_state.term; let last_log_term = ss.sk.get_epoch(); let flush_lsn = ss.sk.flush_lsn(); @@ -843,12 +783,12 @@ impl Timeline { /// Returns flush_lsn. pub async fn get_flush_lsn(&self) -> Lsn { - self.write_shared_state().await.sk.wal_store.flush_lsn() + self.read_shared_state().await.sk.wal_store.flush_lsn() } /// Delete WAL segments from disk that are no longer needed. This is determined /// based on pageserver's remote_consistent_lsn and local backup_lsn/peer_lsn. - pub async fn remove_old_wal(&self, wal_backup_enabled: bool) -> Result<()> { + pub async fn remove_old_wal(self: &Arc) -> Result<()> { if self.is_cancelled() { bail!(TimelineError::Cancelled(self.ttid)); } @@ -864,9 +804,8 @@ impl Timeline { let horizon_segno: XLogSegNo; let remover = { - let shared_state = self.write_shared_state().await; - horizon_segno = - shared_state.get_horizon_segno(wal_backup_enabled, replication_horizon_lsn); + let shared_state = self.read_shared_state().await; + horizon_segno = shared_state.get_horizon_segno(replication_horizon_lsn); if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno { return Ok(()); // nothing to do } @@ -888,7 +827,7 @@ impl Timeline { /// passed after the last save. This helps to keep remote_consistent_lsn up /// to date so that storage nodes restart doesn't cause many pageserver -> /// safekeeper reconnections. - pub async fn maybe_persist_control_file(&self) -> Result<()> { + pub async fn maybe_persist_control_file(self: &Arc) -> Result<()> { self.write_shared_state() .await .sk @@ -896,38 +835,33 @@ impl Timeline { .await } - /// Gather timeline data for metrics. If the timeline is not active, returns - /// None, we do not collect these. + /// Gather timeline data for metrics. 
pub async fn info_for_metrics(&self) -> Option { if self.is_cancelled() { return None; } let (ps_feedback_count, last_ps_feedback) = self.walsenders.get_ps_feedback_stats(); - let state = self.write_shared_state().await; - if state.active { - Some(FullTimelineInfo { - ttid: self.ttid, - ps_feedback_count, - last_ps_feedback, - wal_backup_active: state.wal_backup_active, - timeline_is_active: state.active, - num_computes: self.walreceivers.get_num() as u32, - last_removed_segno: state.last_removed_segno, - epoch_start_lsn: state.sk.epoch_start_lsn, - mem_state: state.sk.state.inmem.clone(), - persisted_state: state.sk.state.clone(), - flush_lsn: state.sk.wal_store.flush_lsn(), - wal_storage: state.sk.wal_store.get_metrics(), - }) - } else { - None - } + let state = self.read_shared_state().await; + Some(FullTimelineInfo { + ttid: self.ttid, + ps_feedback_count, + last_ps_feedback, + wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed), + timeline_is_active: self.broker_active.load(Ordering::Relaxed), + num_computes: self.walreceivers.get_num() as u32, + last_removed_segno: state.last_removed_segno, + epoch_start_lsn: state.sk.epoch_start_lsn, + mem_state: state.sk.state.inmem.clone(), + persisted_state: state.sk.state.clone(), + flush_lsn: state.sk.wal_store.flush_lsn(), + wal_storage: state.sk.wal_store.get_metrics(), + }) } /// Returns in-memory timeline state to build a full debug dump. pub async fn memory_dump(&self) -> debug_dump::Memory { - let state = self.write_shared_state().await; + let state = self.read_shared_state().await; let (write_lsn, write_record_lsn, flush_lsn, file_open) = state.sk.wal_store.internal_state(); @@ -936,8 +870,8 @@ impl Timeline { is_cancelled: self.is_cancelled(), peers_info_len: state.peers_info.0.len(), walsenders: self.walsenders.get_all(), - wal_backup_active: state.wal_backup_active, - active: state.active, + wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed), + active: self.broker_active.load(Ordering::Relaxed), num_computes: self.walreceivers.get_num() as u32, last_removed_segno: state.last_removed_segno, epoch_start_lsn: state.sk.epoch_start_lsn, @@ -951,7 +885,7 @@ impl Timeline { /// Apply a function to the control file state and persist it. pub async fn map_control_file( - &self, + self: &Arc, f: impl FnOnce(&mut TimelinePersistentState) -> Result, ) -> Result { let mut state = self.write_shared_state().await; diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs new file mode 100644 index 0000000000..52ad915065 --- /dev/null +++ b/safekeeper/src/timeline_manager.rs @@ -0,0 +1,153 @@ +//! The timeline manager task is responsible for managing the timeline's background tasks. +//! It is spawned alongside each timeline and exits when the timeline is deleted. +//! It watches for changes in the timeline state and decides when to spawn or kill background tasks. +//! It also can manage some reactive state, like should the timeline be active for broker pushes or not. 
+ +use std::{sync::Arc, time::Duration}; + +use tracing::{info, instrument, warn}; +use utils::lsn::Lsn; + +use crate::{ + metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL}, + timeline::{PeerInfo, ReadGuardSharedState, Timeline}, + timelines_set::TimelinesSet, + wal_backup::{self, WalBackupTaskHandle}, + SafeKeeperConf, +}; + +pub struct StateSnapshot { + pub commit_lsn: Lsn, + pub backup_lsn: Lsn, + pub remote_consistent_lsn: Lsn, + pub peers: Vec, +} + +impl StateSnapshot { + /// Create a new snapshot of the timeline state. + fn new(read_guard: ReadGuardSharedState, heartbeat_timeout: Duration) -> Self { + Self { + commit_lsn: read_guard.sk.state.inmem.commit_lsn, + backup_lsn: read_guard.sk.state.inmem.backup_lsn, + remote_consistent_lsn: read_guard.sk.state.inmem.remote_consistent_lsn, + peers: read_guard.get_peers(heartbeat_timeout), + } + } +} + +/// Control how often the manager task should wake up to check updates. +/// There is no need to check for updates more often than this. +const REFRESH_INTERVAL: Duration = Duration::from_millis(300); + +/// This task gets spawned alongside each timeline and is responsible for managing the timeline's +/// background tasks. +#[instrument(name = "manager", skip_all, fields(ttid = %tli.ttid))] +pub async fn main_task( + tli: Arc, + conf: SafeKeeperConf, + broker_active_set: Arc, +) { + let mut cancellation_rx = match tli.get_cancellation_rx() { + Ok(rx) => rx, + Err(_) => { + info!("timeline canceled during task start"); + return; + } + }; + + scopeguard::defer! { + if tli.is_cancelled() { + info!("manager task finished"); + } else { + warn!("manager task finished prematurely"); + } + }; + + // sets whether timeline is active for broker pushes or not + let mut tli_broker_active = broker_active_set.guard(tli.clone()); + + let ttid = tli.ttid; + let wal_seg_size = tli.get_wal_seg_size().await; + let heartbeat_timeout = conf.heartbeat_timeout; + + let mut state_version_rx = tli.get_state_version_rx(); + + let walreceivers = tli.get_walreceivers(); + let mut num_computes_rx = walreceivers.get_num_rx(); + + // list of background tasks + let mut backup_task: Option = None; + + let last_state = 'outer: loop { + MANAGER_ITERATIONS_TOTAL.inc(); + + let state_snapshot = StateSnapshot::new(tli.read_shared_state().await, heartbeat_timeout); + let num_computes = *num_computes_rx.borrow(); + + let is_wal_backup_required = + wal_backup::is_wal_backup_required(wal_seg_size, num_computes, &state_snapshot); + + if conf.is_wal_backup_enabled() { + wal_backup::update_task( + &conf, + ttid, + is_wal_backup_required, + &state_snapshot, + &mut backup_task, + ) + .await; + } + + let is_active = is_wal_backup_required + || num_computes > 0 + || state_snapshot.remote_consistent_lsn < state_snapshot.commit_lsn; + + // update the broker timeline set + if tli_broker_active.set(is_active) { + // write log if state has changed + info!( + "timeline active={} now, remote_consistent_lsn={}, commit_lsn={}", + is_active, state_snapshot.remote_consistent_lsn, state_snapshot.commit_lsn, + ); + + MANAGER_ACTIVE_CHANGES.inc(); + + if !is_active { + // TODO: maybe use tokio::spawn? + if let Err(e) = tli.maybe_persist_control_file().await { + warn!("control file save in update_status failed: {:?}", e); + } + } + } + + // update the state in Arc + tli.wal_backup_active + .store(backup_task.is_some(), std::sync::atomic::Ordering::Relaxed); + tli.broker_active + .store(is_active, std::sync::atomic::Ordering::Relaxed); + + // wait until something changes. 
tx channels are stored under Arc, so they will not be + // dropped until the manager task is finished. + tokio::select! { + _ = cancellation_rx.changed() => { + // timeline was deleted + break 'outer state_snapshot; + } + _ = async { + // don't wake up on every state change, but at most every REFRESH_INTERVAL + tokio::time::sleep(REFRESH_INTERVAL).await; + let _ = state_version_rx.changed().await; + } => { + // state was updated + } + _ = num_computes_rx.changed() => { + // number of connected computes was updated + } + } + }; + + // shutdown background tasks + if conf.is_wal_backup_enabled() { + wal_backup::update_task(&conf, ttid, false, &last_state, &mut backup_task).await; + } +} diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index 079e706ff8..8d37bd6371 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -4,6 +4,7 @@ use crate::safekeeper::ServerInfo; use crate::timeline::{Timeline, TimelineError}; +use crate::timelines_set::TimelinesSet; use crate::SafeKeeperConf; use anyhow::{bail, Context, Result}; use camino::Utf8PathBuf; @@ -11,16 +12,16 @@ use once_cell::sync::Lazy; use serde::Serialize; use std::collections::HashMap; use std::str::FromStr; +use std::sync::atomic::Ordering; use std::sync::{Arc, Mutex}; -use tokio::sync::mpsc::Sender; use tracing::*; use utils::id::{TenantId, TenantTimelineId, TimelineId}; use utils::lsn::Lsn; struct GlobalTimelinesState { timelines: HashMap>, - wal_backup_launcher_tx: Option>, conf: Option, + broker_active_set: Arc, load_lock: Arc>, } @@ -36,11 +37,8 @@ impl GlobalTimelinesState { } /// Get dependencies for a timeline constructor. - fn get_dependencies(&self) -> (SafeKeeperConf, Sender) { - ( - self.get_conf().clone(), - self.wal_backup_launcher_tx.as_ref().unwrap().clone(), - ) + fn get_dependencies(&self) -> (SafeKeeperConf, Arc) { + (self.get_conf().clone(), self.broker_active_set.clone()) } /// Insert timeline into the map. Returns error if timeline with the same id already exists. @@ -65,8 +63,8 @@ impl GlobalTimelinesState { static TIMELINES_STATE: Lazy> = Lazy::new(|| { Mutex::new(GlobalTimelinesState { timelines: HashMap::new(), - wal_backup_launcher_tx: None, conf: None, + broker_active_set: Arc::new(TimelinesSet::default()), load_lock: Arc::new(tokio::sync::Mutex::new(TimelineLoadLock)), }) }); @@ -76,16 +74,11 @@ pub struct GlobalTimelines; impl GlobalTimelines { /// Inject dependencies needed for the timeline constructors and load all timelines to memory. - pub async fn init( - conf: SafeKeeperConf, - wal_backup_launcher_tx: Sender, - ) -> Result<()> { + pub async fn init(conf: SafeKeeperConf) -> Result<()> { // clippy isn't smart enough to understand that drop(state) releases the // lock, so use explicit block let tenants_dir = { let mut state = TIMELINES_STATE.lock().unwrap(); - assert!(state.wal_backup_launcher_tx.is_none()); - state.wal_backup_launcher_tx = Some(wal_backup_launcher_tx); state.conf = Some(conf); // Iterate through all directories and load tenants for all directories @@ -129,12 +122,9 @@ impl GlobalTimelines { /// this function is called during init when nothing else is running, so /// this is fine. 
async fn load_tenant_timelines(tenant_id: TenantId) -> Result<()> { - let (conf, wal_backup_launcher_tx) = { + let (conf, broker_active_set) = { let state = TIMELINES_STATE.lock().unwrap(); - ( - state.get_conf().clone(), - state.wal_backup_launcher_tx.as_ref().unwrap().clone(), - ) + state.get_dependencies() }; let timelines_dir = conf.tenant_dir(&tenant_id); @@ -147,7 +137,7 @@ impl GlobalTimelines { TimelineId::from_str(timeline_dir_entry.file_name().to_str().unwrap_or("")) { let ttid = TenantTimelineId::new(tenant_id, timeline_id); - match Timeline::load_timeline(&conf, ttid, wal_backup_launcher_tx.clone()) { + match Timeline::load_timeline(&conf, ttid) { Ok(timeline) => { let tli = Arc::new(timeline); TIMELINES_STATE @@ -155,8 +145,7 @@ impl GlobalTimelines { .unwrap() .timelines .insert(ttid, tli.clone()); - tli.bootstrap(&conf); - tli.update_status_notify().await.unwrap(); + tli.bootstrap(&conf, broker_active_set.clone()); } // If we can't load a timeline, it's most likely because of a corrupted // directory. We will log an error and won't allow to delete/recreate @@ -189,9 +178,9 @@ impl GlobalTimelines { _guard: &tokio::sync::MutexGuard<'a, TimelineLoadLock>, ttid: TenantTimelineId, ) -> Result> { - let (conf, wal_backup_launcher_tx) = TIMELINES_STATE.lock().unwrap().get_dependencies(); + let (conf, broker_active_set) = TIMELINES_STATE.lock().unwrap().get_dependencies(); - match Timeline::load_timeline(&conf, ttid, wal_backup_launcher_tx) { + match Timeline::load_timeline(&conf, ttid) { Ok(timeline) => { let tli = Arc::new(timeline); @@ -202,7 +191,7 @@ impl GlobalTimelines { .timelines .insert(ttid, tli.clone()); - tli.bootstrap(&conf); + tli.bootstrap(&conf, broker_active_set); Ok(tli) } @@ -221,6 +210,10 @@ impl GlobalTimelines { TIMELINES_STATE.lock().unwrap().get_conf().clone() } + pub fn get_global_broker_active_set() -> Arc { + TIMELINES_STATE.lock().unwrap().broker_active_set.clone() + } + /// Create a new timeline with the given id. If the timeline already exists, returns /// an existing timeline. pub async fn create( @@ -229,7 +222,7 @@ impl GlobalTimelines { commit_lsn: Lsn, local_start_lsn: Lsn, ) -> Result> { - let (conf, wal_backup_launcher_tx) = { + let (conf, broker_active_set) = { let state = TIMELINES_STATE.lock().unwrap(); if let Ok(timeline) = state.get(&ttid) { // Timeline already exists, return it. @@ -243,7 +236,6 @@ impl GlobalTimelines { let timeline = Arc::new(Timeline::create_empty( &conf, ttid, - wal_backup_launcher_tx, server_info, commit_lsn, local_start_lsn, @@ -264,7 +256,10 @@ impl GlobalTimelines { // Write the new timeline to the disk and start background workers. // Bootstrap is transactional, so if it fails, the timeline will be deleted, // and the state on disk should remain unchanged. - if let Err(e) = timeline.init_new(&mut shared_state, &conf).await { + if let Err(e) = timeline + .init_new(&mut shared_state, &conf, broker_active_set) + .await + { // Note: the most likely reason for init failure is that the timeline // directory already exists on disk. This happens when timeline is corrupted // and wasn't loaded from disk on startup because of that. We want to preserve @@ -281,8 +276,6 @@ impl GlobalTimelines { // We are done with bootstrap, release the lock, return the timeline. 
// {} block forces release before .await } - timeline.update_status_notify().await?; - timeline.wal_backup_launcher_tx.send(timeline.ttid).await?; Ok(timeline) } @@ -335,12 +328,13 @@ impl GlobalTimelines { let tli_res = TIMELINES_STATE.lock().unwrap().get(ttid); match tli_res { Ok(timeline) => { + let was_active = timeline.broker_active.load(Ordering::Relaxed); + // Take a lock and finish the deletion holding this mutex. let mut shared_state = timeline.write_shared_state().await; info!("deleting timeline {}, only_local={}", ttid, only_local); - let (dir_existed, was_active) = - timeline.delete(&mut shared_state, only_local).await?; + let dir_existed = timeline.delete(&mut shared_state, only_local).await?; // Remove timeline from the map. // FIXME: re-enable it once we fix the issue with recreation of deleted timelines @@ -349,7 +343,7 @@ impl GlobalTimelines { Ok(TimelineDeleteForceResult { dir_existed, - was_active, + was_active, // TODO: we probably should remove this field }) } Err(_) => { diff --git a/safekeeper/src/timelines_set.rs b/safekeeper/src/timelines_set.rs new file mode 100644 index 0000000000..ea8e23bb72 --- /dev/null +++ b/safekeeper/src/timelines_set.rs @@ -0,0 +1,90 @@ +use std::{collections::HashMap, sync::Arc}; + +use utils::id::TenantTimelineId; + +use crate::timeline::Timeline; + +/// Set of timelines, supports operations: +/// - add timeline +/// - remove timeline +/// - clone the set +/// +/// Usually used for keeping subset of timelines. For example active timelines that require broker push. +pub struct TimelinesSet { + timelines: std::sync::Mutex>>, +} + +impl Default for TimelinesSet { + fn default() -> Self { + Self { + timelines: std::sync::Mutex::new(HashMap::new()), + } + } +} + +impl TimelinesSet { + pub fn insert(&self, tli: Arc) { + self.timelines.lock().unwrap().insert(tli.ttid, tli); + } + + pub fn delete(&self, ttid: &TenantTimelineId) { + self.timelines.lock().unwrap().remove(ttid); + } + + /// If present is true, adds timeline to the set, otherwise removes it. + pub fn set_present(&self, tli: Arc, present: bool) { + if present { + self.insert(tli); + } else { + self.delete(&tli.ttid); + } + } + + pub fn is_present(&self, ttid: &TenantTimelineId) -> bool { + self.timelines.lock().unwrap().contains_key(ttid) + } + + /// Returns all timelines in the set. + pub fn get_all(&self) -> Vec> { + self.timelines.lock().unwrap().values().cloned().collect() + } + + /// Returns a timeline guard for easy presence control. + pub fn guard(self: &Arc, tli: Arc) -> TimelineSetGuard { + let is_present = self.is_present(&tli.ttid); + TimelineSetGuard { + timelines_set: self.clone(), + tli, + is_present, + } + } +} + +/// Guard is used to add or remove timeline from the set. +/// If the timeline present in set, it will be removed from it on drop. +/// Note: do not use more than one guard for the same timeline, it caches the presence state. +/// It is designed to be used in the manager task only. +pub struct TimelineSetGuard { + timelines_set: Arc, + tli: Arc, + is_present: bool, +} + +impl TimelineSetGuard { + /// Returns true if the state was changed. 
+ pub fn set(&mut self, present: bool) -> bool { + if present == self.is_present { + return false; + } + self.is_present = present; + self.timelines_set.set_present(self.tli.clone(), present); + true + } +} + +impl Drop for TimelineSetGuard { + fn drop(&mut self) { + // remove timeline from the map on drop + self.timelines_set.delete(&self.tli.ttid); + } +} diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index e496f07114..84680557f9 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -9,7 +9,7 @@ use utils::backoff; use utils::id::NodeId; use std::cmp::min; -use std::collections::{HashMap, HashSet}; +use std::collections::HashSet; use std::num::NonZeroU32; use std::pin::Pin; use std::sync::Arc; @@ -29,9 +29,10 @@ use tracing::*; use utils::{id::TenantTimelineId, lsn::Lsn}; -use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS}; +use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS, WAL_BACKUP_TASKS}; use crate::timeline::{PeerInfo, Timeline}; -use crate::{GlobalTimelines, SafeKeeperConf}; +use crate::timeline_manager::StateSnapshot; +use crate::{GlobalTimelines, SafeKeeperConf, WAL_BACKUP_RUNTIME}; use once_cell::sync::OnceCell; @@ -41,35 +42,84 @@ const UPLOAD_FAILURE_RETRY_MAX_MS: u64 = 5000; /// Default buffer size when interfacing with [`tokio::fs::File`]. const BUFFER_SIZE: usize = 32 * 1024; -/// Check whether wal backup is required for timeline. If yes, mark that launcher is -/// aware of current status and return the timeline. -async fn is_wal_backup_required(ttid: TenantTimelineId) -> Option> { - match GlobalTimelines::get(ttid).ok() { - Some(tli) => { - tli.wal_backup_attend().await; - Some(tli) - } - None => None, - } -} - -struct WalBackupTaskHandle { +pub struct WalBackupTaskHandle { shutdown_tx: Sender<()>, handle: JoinHandle<()>, } -struct WalBackupTimelineEntry { - timeline: Arc, - handle: Option, +/// Do we have anything to upload to S3, i.e. should safekeepers run backup activity? +pub fn is_wal_backup_required( + wal_seg_size: usize, + num_computes: usize, + state: &StateSnapshot, +) -> bool { + num_computes > 0 || + // Currently only the whole segment is offloaded, so compare segment numbers. + (state.commit_lsn.segment_number(wal_seg_size) > state.backup_lsn.segment_number(wal_seg_size)) } -async fn shut_down_task(ttid: TenantTimelineId, entry: &mut WalBackupTimelineEntry) { - if let Some(wb_handle) = entry.handle.take() { +/// Based on peer information determine which safekeeper should offload; if it +/// is me, run (per timeline) task, if not yet. OTOH, if it is not me and task +/// is running, kill it. 
+pub async fn update_task( + conf: &SafeKeeperConf, + ttid: TenantTimelineId, + need_backup: bool, + state: &StateSnapshot, + entry: &mut Option, +) { + let (offloader, election_dbg_str) = + determine_offloader(&state.peers, state.backup_lsn, ttid, conf); + let elected_me = Some(conf.my_id) == offloader; + + let should_task_run = need_backup && elected_me; + + // start or stop the task + if should_task_run != (entry.is_some()) { + if should_task_run { + info!("elected for backup: {}", election_dbg_str); + + let (shutdown_tx, shutdown_rx) = mpsc::channel(1); + let timeline_dir = conf.timeline_dir(&ttid); + + let async_task = backup_task_main( + ttid, + timeline_dir, + conf.workdir.clone(), + conf.backup_parallel_jobs, + shutdown_rx, + ); + + let handle = if conf.current_thread_runtime { + tokio::spawn(async_task) + } else { + WAL_BACKUP_RUNTIME.spawn(async_task) + }; + + *entry = Some(WalBackupTaskHandle { + shutdown_tx, + handle, + }); + } else { + if !need_backup { + // don't need backup at all + info!("stepping down from backup, need_backup={}", need_backup); + } else { + // someone else has been elected + info!("stepping down from backup: {}", election_dbg_str); + } + shut_down_task(entry).await; + } + } +} + +async fn shut_down_task(entry: &mut Option) { + if let Some(wb_handle) = entry.take() { // Tell the task to shutdown. Error means task exited earlier, that's ok. let _ = wb_handle.shutdown_tx.send(()).await; // Await the task itself. TODO: restart panicked tasks earlier. if let Err(e) = wb_handle.handle.await { - warn!("WAL backup task for {} panicked: {}", ttid, e); + warn!("WAL backup task panicked: {}", e); } } } @@ -126,49 +176,6 @@ fn determine_offloader( } } -/// Based on peer information determine which safekeeper should offload; if it -/// is me, run (per timeline) task, if not yet. OTOH, if it is not me and task -/// is running, kill it. -async fn update_task( - conf: &SafeKeeperConf, - ttid: TenantTimelineId, - entry: &mut WalBackupTimelineEntry, -) { - let alive_peers = entry.timeline.get_peers(conf).await; - let wal_backup_lsn = entry.timeline.get_wal_backup_lsn().await; - let (offloader, election_dbg_str) = - determine_offloader(&alive_peers, wal_backup_lsn, ttid, conf); - let elected_me = Some(conf.my_id) == offloader; - - if elected_me != (entry.handle.is_some()) { - if elected_me { - info!("elected for backup: {}", election_dbg_str); - - let (shutdown_tx, shutdown_rx) = mpsc::channel(1); - let timeline_dir = conf.timeline_dir(&ttid); - - let handle = tokio::spawn( - backup_task_main( - ttid, - timeline_dir, - conf.workdir.clone(), - conf.backup_parallel_jobs, - shutdown_rx, - ) - .in_current_span(), - ); - - entry.handle = Some(WalBackupTaskHandle { - shutdown_tx, - handle, - }); - } else { - info!("stepping down from backup: {}", election_dbg_str); - shut_down_task(ttid, entry).await; - } - } -} - static REMOTE_STORAGE: OnceCell> = OnceCell::new(); // Storage must be configured and initialized when this is called. @@ -190,67 +197,6 @@ pub fn init_remote_storage(conf: &SafeKeeperConf) { }); } -const CHECK_TASKS_INTERVAL_MSEC: u64 = 1000; - -/// Sits on wal_backup_launcher_rx and starts/stops per timeline wal backup -/// tasks. Having this in separate task simplifies locking, allows to reap -/// panics and separate elections from offloading itself. 
-pub async fn wal_backup_launcher_task_main( - conf: SafeKeeperConf, - mut wal_backup_launcher_rx: Receiver, -) -> anyhow::Result<()> { - info!( - "WAL backup launcher started, remote config {:?}", - conf.remote_storage - ); - - // Presence in this map means launcher is aware s3 offloading is needed for - // the timeline, but task is started only if it makes sense for to offload - // from this safekeeper. - let mut tasks: HashMap = HashMap::new(); - - let mut ticker = tokio::time::interval(Duration::from_millis(CHECK_TASKS_INTERVAL_MSEC)); - loop { - tokio::select! { - ttid = wal_backup_launcher_rx.recv() => { - // channel is never expected to get closed - let ttid = ttid.unwrap(); - if !conf.is_wal_backup_enabled() { - continue; /* just drain the channel and do nothing */ - } - async { - let timeline = is_wal_backup_required(ttid).await; - // do we need to do anything at all? - if timeline.is_some() != tasks.contains_key(&ttid) { - if let Some(timeline) = timeline { - // need to start the task - let entry = tasks.entry(ttid).or_insert(WalBackupTimelineEntry { - timeline, - handle: None, - }); - update_task(&conf, ttid, entry).await; - } else { - // need to stop the task - info!("stopping WAL backup task"); - let mut entry = tasks.remove(&ttid).unwrap(); - shut_down_task(ttid, &mut entry).await; - } - } - }.instrument(info_span!("WAL backup", ttid = %ttid)).await; - } - // For each timeline needing offloading, check if this safekeeper - // should do the job and start/stop the task accordingly. - _ = ticker.tick() => { - for (ttid, entry) in tasks.iter_mut() { - update_task(&conf, *ttid, entry) - .instrument(info_span!("WAL backup", ttid = %ttid)) - .await; - } - } - } - } -} - struct WalBackupTask { timeline: Arc, timeline_dir: Utf8PathBuf, @@ -261,6 +207,7 @@ struct WalBackupTask { } /// Offload single timeline. +#[instrument(name = "WAL backup", skip_all, fields(ttid = %ttid))] async fn backup_task_main( ttid: TenantTimelineId, timeline_dir: Utf8PathBuf, @@ -268,6 +215,8 @@ async fn backup_task_main( parallel_jobs: usize, mut shutdown_rx: Receiver<()>, ) { + let _guard = WAL_BACKUP_TASKS.guard(); + info!("started"); let res = GlobalTimelines::get(ttid); if let Err(e) = res { From 664f92dc6e6eac25380272fe4e466b4ddc4954bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 22 May 2024 12:43:03 +0200 Subject: [PATCH 0834/1571] Refactor PageServerHandler::process_query parsing (#7835) In the process_query function in page_service.rs there was some redundant duplication. Remove it and create a vector of whitespace separated parts at the start and then use `slice::strip_prefix`. Only use `starts_with` in the places with multiple whitespace separated parameters: here we want to preserve grep/rg ability. 
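As a rough, standalone sketch of the pattern this change adopts (hypothetical command set and simplified error handling; not the actual pageserver handler), the parsing now works roughly like this:

    // Split the query string once up front, then match commands against the
    // leading tokens. `<[T]>::strip_prefix` returns the remaining parameters
    // only when the prefix matches, so each branch gets just its own params.
    fn parse_query(query_string: &str) -> Result<String, String> {
        let parts = query_string.split_whitespace().collect::<Vec<_>>();
        if let Some(params) = parts.strip_prefix(&["pagestream_v2"]) {
            if params.len() != 2 {
                return Err("invalid param number for pagestream command".to_string());
            }
            Ok(format!("pagestream_v2 tenant={} timeline={}", params[0], params[1]))
        } else if let Some(params) = parts.strip_prefix(&["show"]) {
            Ok(format!("show {}", params.join(" ")))
        } else {
            Err(format!("unrecognized command: {query_string}"))
        }
    }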
Followup of #7815, requested in https://github.com/neondatabase/neon/pull/7815#pullrequestreview-2068835674 --- pageserver/src/page_service.rs | 70 ++++++++++++---------------------- 1 file changed, 25 insertions(+), 45 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index c066f56c17..d250864fd6 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1521,9 +1521,8 @@ where let ctx = self.connection_ctx.attached_child(); debug!("process query {query_string:?}"); - if query_string.starts_with("pagestream_v2 ") { - let (_, params_raw) = query_string.split_at("pagestream_v2 ".len()); - let params = params_raw.split(' ').collect::>(); + let parts = query_string.split_whitespace().collect::>(); + if let Some(params) = parts.strip_prefix(&["pagestream_v2"]) { if params.len() != 2 { return Err(QueryError::Other(anyhow::anyhow!( "invalid param number for pagestream command" @@ -1548,9 +1547,7 @@ where ctx, ) .await?; - } else if query_string.starts_with("pagestream ") { - let (_, params_raw) = query_string.split_at("pagestream ".len()); - let params = params_raw.split(' ').collect::>(); + } else if let Some(params) = parts.strip_prefix(&["pagestream"]) { if params.len() != 2 { return Err(QueryError::Other(anyhow::anyhow!( "invalid param number for pagestream command" @@ -1575,10 +1572,7 @@ where ctx, ) .await?; - } else if query_string.starts_with("basebackup ") { - let (_, params_raw) = query_string.split_at("basebackup ".len()); - let params = params_raw.split_whitespace().collect::>(); - + } else if let Some(params) = parts.strip_prefix(&["basebackup"]) { if params.len() < 2 { return Err(QueryError::Other(anyhow::anyhow!( "invalid param number for basebackup command" @@ -1596,26 +1590,23 @@ where self.check_permission(Some(tenant_id))?; - let lsn = if params.len() >= 3 { + let lsn = if let Some(lsn_str) = params.get(2) { Some( - Lsn::from_str(params[2]) - .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?, + Lsn::from_str(lsn_str) + .with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?, ) } else { None }; - let gzip = if params.len() >= 4 { - if params[3] == "--gzip" { - true - } else { + let gzip = match params.get(3) { + Some(&"--gzip") => true, + None => false, + Some(third_param) => { return Err(QueryError::Other(anyhow::anyhow!( - "Parameter in position 3 unknown {}", - params[3], - ))); + "Parameter in position 3 unknown {third_param}", + ))) } - } else { - false }; let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx); @@ -1639,10 +1630,7 @@ where res?; } // return pair of prev_lsn and last_lsn - else if query_string.starts_with("get_last_record_rlsn ") { - let (_, params_raw) = query_string.split_at("get_last_record_rlsn ".len()); - let params = params_raw.split_whitespace().collect::>(); - + else if let Some(params) = parts.strip_prefix(&["get_last_record_rlsn"]) { if params.len() != 2 { return Err(QueryError::Other(anyhow::anyhow!( "invalid param number for get_last_record_rlsn command" @@ -1684,10 +1672,7 @@ where .await?; } // same as basebackup, but result includes relational data as well - else if query_string.starts_with("fullbackup ") { - let (_, params_raw) = query_string.split_at("fullbackup ".len()); - let params = params_raw.split_whitespace().collect::>(); - + else if let Some(params) = parts.strip_prefix(&["fullbackup"]) { if params.len() < 2 { return Err(QueryError::Other(anyhow::anyhow!( "invalid param number for fullbackup command" @@ -1704,18 
+1689,18 @@ where .record("timeline_id", field::display(timeline_id)); // The caller is responsible for providing correct lsn and prev_lsn. - let lsn = if params.len() > 2 { + let lsn = if let Some(lsn_str) = params.get(2) { Some( - Lsn::from_str(params[2]) - .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?, + Lsn::from_str(lsn_str) + .with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?, ) } else { None }; - let prev_lsn = if params.len() > 3 { + let prev_lsn = if let Some(prev_lsn_str) = params.get(3) { Some( - Lsn::from_str(params[3]) - .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?, + Lsn::from_str(prev_lsn_str) + .with_context(|| format!("Failed to parse Lsn from {prev_lsn_str}"))?, ) } else { None @@ -1748,8 +1733,7 @@ where // 2. Run: // cat my_backup/base.tar | psql -h $PAGESERVER \ // -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION" - let (_, params_raw) = query_string.split_at("import basebackup ".len()); - let params = params_raw.split_whitespace().collect::>(); + let params = &parts[2..]; if params.len() != 5 { return Err(QueryError::Other(anyhow::anyhow!( "invalid param number for import basebackup command" @@ -1798,8 +1782,7 @@ where // // Files are scheduled to be persisted to remote storage, and the // caller should poll the http api to check when that is done. - let (_, params_raw) = query_string.split_at("import wal ".len()); - let params = params_raw.split_whitespace().collect::>(); + let params = &parts[2..]; if params.len() != 4 { return Err(QueryError::Other(anyhow::anyhow!( "invalid param number for import wal command" @@ -1838,8 +1821,7 @@ where // on connect pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("lease lsn ") { - let (_, params_raw) = query_string.split_at("lease lsn ".len()); - let params = params_raw.split_whitespace().collect::>(); + let params = &parts[2..]; if params.len() != 3 { return Err(QueryError::Other(anyhow::anyhow!( "invalid param number {} for lease lsn command", @@ -1875,10 +1857,8 @@ where ))? } }; - } else if query_string.starts_with("show ") { + } else if let Some(params) = parts.strip_prefix(&["show"]) { // show - let (_, params_raw) = query_string.split_at("show ".len()); - let params = params_raw.split(' ').collect::>(); if params.len() != 1 { return Err(QueryError::Other(anyhow::anyhow!( "invalid param number for config command" From b43f6daa488de05691775a8908920d274fb53369 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 2 May 2024 15:53:30 +0300 Subject: [PATCH 0835/1571] One more iteration on making walcraft test more robust. Some WAL might be inserted on the page boundary before XLOG_SWITCH lands there, repeat construction in this case. 
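As a hedged, standalone sketch of the boundary condition the test retries on (the constant values below are the typical ones and are assumptions for illustration, not taken from this diff):

    // The XLOG_SWITCH record is considered to end "on a page boundary" when the
    // end LSN's offset into its WAL page equals the short page header size,
    // i.e. the record filled the previous page exactly and only the next
    // page's header precedes the end position.
    const XLOG_BLCKSZ: usize = 8192; // typical WAL block size
    const XLOG_SIZE_OF_XLOG_SHORT_PHD: usize = 24; // typical short page header size

    fn ends_on_page_boundary(xlog_switch_record_end: u64) -> bool {
        xlog_switch_record_end as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD
    }

If some other WAL record sneaks in right before pg_switch_wal(), the switch record ends at a different offset; rather than failing the test, the crafting loop below logs a warning and repeats the construction until the condition holds.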
--- libs/postgres_ffi/wal_craft/src/lib.rs | 46 ++++++++++++-------------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index b6769629a8..6052f04d11 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -373,31 +373,29 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary { "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))", &[&(repeats as i32)], )?; - break; - } - info!( - "current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}", - client.pg_current_wal_insert_lsn()?, - XLOG_SIZE_OF_XLOG_RECORD - ); + info!( + "current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}", + client.pg_current_wal_insert_lsn()?, + XLOG_SIZE_OF_XLOG_RECORD + ); - // Emit the XLOG_SWITCH - let before_xlog_switch = client.pg_current_wal_insert_lsn()?; - let xlog_switch_record_end: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0); - let next_segment = PgLsn::from(0x0200_0000); - ensure!( - xlog_switch_record_end < next_segment, - "XLOG_SWITCH record ended on or after the expected segment boundary: {} > {}", - xlog_switch_record_end, - next_segment - ); - ensure!( - u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD, - "XLOG_SWITCH message ended not on page boundary: {}, offset = {}", - xlog_switch_record_end, - u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ - ); - Ok(vec![before_xlog_switch, xlog_switch_record_end]) + // Emit the XLOG_SWITCH + let before_xlog_switch = client.pg_current_wal_insert_lsn()?; + let xlog_switch_record_end: PgLsn = + client.query_one("SELECT pg_switch_wal()", &[])?.get(0); + + if u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ + != XLOG_SIZE_OF_XLOG_SHORT_PHD + { + warn!( + "XLOG_SWITCH message ended not on page boundary: {}, offset = {}, repeating", + xlog_switch_record_end, + u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ + ); + continue; + } + return Ok(vec![before_xlog_switch, xlog_switch_record_end]); + } } } From ef96c82c9f189633629e6bfc051a78fbab12c9a6 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 21 May 2024 19:33:03 +0300 Subject: [PATCH 0836/1571] Fix zenith_test_evict mode and clear_buffer_cache() function Using InvalidateBuffer is wrong, because if the page is concurrently dirtied, it will throw away the dirty page without calling smgwrite(). In Neon, that means that the last-written LSN update for the page is missed. In v16, use the new InvalidateVictimBuffer() function that does what we need. In v15 and v14, backport the InvalidateVictimBuffer() function. 
Fixes issue https://github.com/neondatabase/neon/issues/7802 --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 21ec61d539..0d30e28f74 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 21ec61d539d22a81fe811c2d79e26436820bc3f4 +Subproject commit 0d30e28f74f49fe6a27a6bd45dcfeb1060656b8f diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index e2dbd63345..74fb144890 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit e2dbd63345c584de75173c27951f111249ae0016 +Subproject commit 74fb144890c4f955db1ef50ee1eeb9d8a6c2f69d diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index c271017c6c..3c2b9d576c 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit c271017c6c4846be59948766baec2ba4ace5dc9c +Subproject commit 3c2b9d576c580e0b5b7108001f959b8c5b42e0a2 diff --git a/vendor/revisions.json b/vendor/revisions.json index a3af9331fe..2f16f334c5 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "v16": ["16.3", "c271017c6c4846be59948766baec2ba4ace5dc9c"], - "v15": ["15.7", "e2dbd63345c584de75173c27951f111249ae0016"], - "v14": ["14.12", "21ec61d539d22a81fe811c2d79e26436820bc3f4"] + "v16": ["16.3", "3c2b9d576c580e0b5b7108001f959b8c5b42e0a2"], + "v15": ["15.7", "74fb144890c4f955db1ef50ee1eeb9d8a6c2f69d"], + "v14": ["14.12", "0d30e28f74f49fe6a27a6bd45dcfeb1060656b8f"] } From df9ab1b5e3962da4d98c7b1cd6a7fa20f4ff3902 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 22 May 2024 15:43:21 +0300 Subject: [PATCH 0837/1571] refactor(test): duplication with fullbackup, tar content hashing (#7828) "taking a fullbackup" is an ugly multi-liner copypasted in multiple places, most recently with timeline ancestor detach tests. move it under `PgBin` which is not a great place, but better than yet another utility function. Additionally: - cleanup `psql_env` repetition (PgBin already configures that) - move the backup tar comparison as a yet another free utility function - use backup tar comparison in `test_import.py` where a size check was done previously - cleanup extra timeline creation from test Cc: #7715 --- test_runner/fixtures/neon_fixtures.py | 22 ++++ test_runner/fixtures/utils.py | 50 ++++++++ test_runner/regress/test_fullbackup.py | 23 ++-- test_runner/regress/test_import.py | 25 +--- test_runner/regress/test_next_xid.py | 11 +- .../regress/test_timeline_detach_ancestor.py | 120 +++--------------- 6 files changed, 106 insertions(+), 145 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 7a660b64ed..b02054a702 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2788,6 +2788,28 @@ class PgBin: log.info(f"last checkpoint at {checkpoint_lsn}") return Lsn(checkpoint_lsn) + def take_fullbackup( + self, + pageserver: NeonPageserver, + tenant: TenantId, + timeline: TimelineId, + lsn: Lsn, + output: Path, + ): + """ + Request fullbackup from pageserver, store it at 'output'. 
+ """ + cmd = [ + "psql", + "--no-psqlrc", + pageserver.connstr(), + "-c", + f"fullbackup {tenant} {timeline} {lsn}", + "-o", + str(output), + ] + self.run_capture(cmd) + @pytest.fixture(scope="function") def pg_bin(test_output_dir: Path, pg_distrib_dir: Path, pg_version: PgVersion) -> PgBin: diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 16dc9e8cfb..70263245e7 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -4,10 +4,13 @@ import json import os import re import subprocess +import tarfile import threading import time +from hashlib import sha256 from pathlib import Path from typing import ( + IO, TYPE_CHECKING, Any, Callable, @@ -15,8 +18,10 @@ from typing import ( Iterable, List, Optional, + Set, Tuple, TypeVar, + Union, ) from urllib.parse import urlencode @@ -499,3 +504,48 @@ class AuxFileStore(str, enum.Enum): def __str__(self) -> str: return f"'aux-{self.value}'" + + +def assert_pageserver_backups_equal(left: Path, right: Path, skip_files: Set[str]): + """ + This is essentially: + + lines=$(comm -3 \ + <(mkdir left && cd left && tar xf "$left" && find . -type f -print0 | xargs sha256sum | sort -k2) \ + <(mkdir right && cd right && tar xf "$right" && find . -type f -print0 | xargs sha256sum | sort -k2) \ + | wc -l) + [ "$lines" = "0" ] + + But in a more mac friendly fashion. + """ + started_at = time.time() + + def hash_extracted(reader: Union[IO[bytes], None]) -> bytes: + assert reader is not None + digest = sha256(usedforsecurity=False) + while True: + buf = reader.read(64 * 1024) + if not buf: + break + digest.update(buf) + return digest.digest() + + def build_hash_list(p: Path) -> List[Tuple[str, bytes]]: + with tarfile.open(p) as f: + matching_files = (info for info in f if info.isreg() and info.name not in skip_files) + ret = list( + map(lambda info: (info.name, hash_extracted(f.extractfile(info))), matching_files) + ) + ret.sort(key=lambda t: t[0]) + return ret + + left_list, right_list = map(build_hash_list, [left, right]) + + try: + assert len(left_list) == len(right_list) + + for left_tuple, right_tuple in zip(left_list, right_list): + assert left_tuple == right_tuple + finally: + elapsed = time.time() - started_at + log.info(f"assert_pageserver_backups_equal completed in {elapsed}s") diff --git a/test_runner/regress/test_fullbackup.py b/test_runner/regress/test_fullbackup.py index e1e4f700d4..e6d51a77a6 100644 --- a/test_runner/regress/test_fullbackup.py +++ b/test_runner/regress/test_fullbackup.py @@ -1,7 +1,7 @@ import os from pathlib import Path -from fixtures.common_types import Lsn, TimelineId +from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, @@ -19,17 +19,16 @@ def test_fullbackup( neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, port_distributor: PortDistributor, - pg_distrib_dir: Path, test_output_dir: Path, ): env = neon_env_builder.init_start() - env.neon_cli.create_branch("test_fullbackup") - endpoint_main = env.endpoints.create_start("test_fullbackup") + # endpoint needs to be alive until the fullbackup so that we have + # prev_record_lsn for the vanilla_pg to start in read-write mode + # for some reason this does not happen if endpoint is shutdown. 
+ endpoint_main = env.endpoints.create_start("main") with endpoint_main.cursor() as cur: - timeline = TimelineId(query_scalar(cur, "SHOW neon.timeline_id")) - # data loading may take a while, so increase statement timeout cur.execute("SET statement_timeout='300s'") cur.execute( @@ -41,17 +40,13 @@ def test_fullbackup( lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_insert_lsn()")) log.info(f"start_backup_lsn = {lsn}") - # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq. - # PgBin sets it automatically, but here we need to pipe psql output to the tar command. - psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")} - # Get and unpack fullbackup from pageserver restored_dir_path = env.repo_dir / "restored_datadir" os.mkdir(restored_dir_path, 0o750) - query = f"fullbackup {env.initial_tenant} {timeline} {lsn}" tar_output_file = test_output_dir / "fullbackup.tar" - cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query, "-o", str(tar_output_file)] - pg_bin.run_capture(cmd, env=psql_env) + pg_bin.take_fullbackup( + env.pageserver, env.initial_tenant, env.initial_timeline, lsn, tar_output_file + ) subprocess_capture( env.repo_dir, ["tar", "-xf", str(tar_output_file), "-C", str(restored_dir_path)] ) @@ -61,7 +56,7 @@ def test_fullbackup( # use resetwal to overwrite it pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, "pg_resetwal") cmd = [pg_resetwal_path, "-D", str(restored_dir_path)] - pg_bin.run_capture(cmd, env=psql_env) + pg_bin.run_capture(cmd) # Restore from the backup and find the data we inserted port = port_distributor.get_port() diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index 1f1c8cc582..62229ebfe7 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -21,7 +21,7 @@ from fixtures.pageserver.utils import ( wait_for_upload, ) from fixtures.remote_storage import RemoteStorageKind -from fixtures.utils import subprocess_capture +from fixtures.utils import assert_pageserver_backups_equal, subprocess_capture def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_builder): @@ -248,15 +248,9 @@ def _import( path to the backup archive file""" log.info(f"start_backup_lsn = {lsn}") - # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq. - # PgBin sets it automatically, but here we need to pipe psql output to the tar command. 
- psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")} - # Get a fullbackup from pageserver - query = f"fullbackup { env.initial_tenant} {timeline} {lsn}" tar_output_file = test_output_dir / "fullbackup.tar" - cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query, "-o", str(tar_output_file)] - pg_bin.run_capture(cmd, env=psql_env) + pg_bin.take_fullbackup(env.pageserver, env.initial_tenant, timeline, lsn, tar_output_file) # Stop the first pageserver instance, erase all its data env.endpoints.stop_all() @@ -305,22 +299,11 @@ def _import( assert endpoint.safe_psql("select count(*) from tbl") == [(expected_num_rows,)] # Take another fullbackup - query = f"fullbackup { tenant} {timeline} {lsn}" new_tar_output_file = test_output_dir / "fullbackup-new.tar" - cmd = [ - "psql", - "--no-psqlrc", - env.pageserver.connstr(), - "-c", - query, - "-o", - str(new_tar_output_file), - ] - pg_bin.run_capture(cmd, env=psql_env) + pg_bin.take_fullbackup(env.pageserver, tenant, timeline, lsn, new_tar_output_file) # Check it's the same as the first fullbackup - # TODO pageserver should be checking checksum - assert os.path.getsize(tar_output_file) == os.path.getsize(new_tar_output_file) + assert_pageserver_backups_equal(tar_output_file, new_tar_output_file, set()) # Check that gc works pageserver_http = env.pageserver.http_client() diff --git a/test_runner/regress/test_next_xid.py b/test_runner/regress/test_next_xid.py index 45c0e3e409..98fb06a0d6 100644 --- a/test_runner/regress/test_next_xid.py +++ b/test_runner/regress/test_next_xid.py @@ -5,7 +5,7 @@ from pathlib import Path from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_wal_insert_lsn +from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_wal_insert_lsn from fixtures.pageserver.utils import ( wait_for_last_record_lsn, ) @@ -71,22 +71,17 @@ def test_next_xid(neon_env_builder: NeonEnvBuilder): def test_import_at_2bil( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, - pg_distrib_dir: Path, - pg_bin, + pg_bin: PgBin, vanilla_pg, ): neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() ps_http = env.pageserver.http_client() - # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq. - # PgBin sets it automatically, but here we need to pipe psql output to the tar command. - psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")} - # Reset the vanilla Postgres instance to somewhat before 2 billion transactions. 
pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, "pg_resetwal") cmd = [pg_resetwal_path, "--next-transaction-id=2129920000", "-D", str(vanilla_pg.pgdatadir)] - pg_bin.run_capture(cmd, env=psql_env) + pg_bin.run_capture(cmd) vanilla_pg.start() vanilla_pg.safe_psql("create user cloud_admin with password 'postgres' superuser") diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 3e435caeee..1563c161e0 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -1,25 +1,18 @@ import datetime import enum -import tarfile -import time from concurrent.futures import ThreadPoolExecutor -from hashlib import sha256 -from pathlib import Path from queue import Empty, Queue from threading import Barrier -from typing import IO, List, Set, Tuple, Union +from typing import List, Tuple import pytest from fixtures.common_types import Lsn, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import ( - NeonEnvBuilder, - PgBin, - wait_for_last_flush_lsn, -) +from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn from fixtures.pageserver.http import HistoricLayerInfo from fixtures.pageserver.utils import wait_timeline_detail_404 from fixtures.remote_storage import LocalFsStorage +from fixtures.utils import assert_pageserver_backups_equal def by_end_lsn(info: HistoricLayerInfo) -> Lsn: @@ -68,7 +61,6 @@ SHUTDOWN_ALLOWED_ERRORS = [ @pytest.mark.parametrize("write_to_branch_first", [True, False]) def test_ancestor_detach_branched_from( test_output_dir, - pg_distrib_dir, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, branchpoint: Branchpoint, @@ -80,7 +72,6 @@ def test_ancestor_detach_branched_from( """ env = neon_env_builder.init_start() - psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")} env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) client = env.pageserver.http_client() @@ -160,16 +151,9 @@ def test_ancestor_detach_branched_from( # run fullbackup to make sure there are no off by one errors # take this on the parent fullbackup_before = test_output_dir / "fullbackup-before.tar" - cmd = [ - "psql", - "--no-psqlrc", - env.pageserver.connstr(), - "-c", - f"fullbackup {env.initial_tenant} {env.initial_timeline} {branch_at}", - "-o", - str(fullbackup_before), - ] - pg_bin.run_capture(cmd, env=psql_env) + pg_bin.take_fullbackup( + env.pageserver, env.initial_tenant, env.initial_timeline, branch_at, fullbackup_before + ) all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id) assert all_reparented == set() @@ -200,16 +184,9 @@ def test_ancestor_detach_branched_from( # take this on the detached, at same lsn fullbackup_after = test_output_dir / "fullbackup-after.tar" - cmd = [ - "psql", - "--no-psqlrc", - env.pageserver.connstr(), - "-c", - f"fullbackup {env.initial_tenant} {timeline_id} {branch_at}", - "-o", - str(fullbackup_after), - ] - pg_bin.run_capture(cmd, env=psql_env) + pg_bin.take_fullbackup( + env.pageserver, env.initial_tenant, timeline_id, branch_at, fullbackup_after + ) client.timeline_delete(env.initial_tenant, env.initial_timeline) wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline, 10, 1.0) @@ -218,52 +195,7 @@ def test_ancestor_detach_branched_from( # as there is always "PREV_LSN: invalid" for "before" skip_files = {"zenith.signal"} - tar_cmp(fullbackup_before, fullbackup_after, skip_files) - - -def tar_cmp(left: Path, right: Path, skip_files: 
Set[str]): - """ - This is essentially: - - lines=$(comm -3 \ - <(mkdir left && cd left && tar xf "$left" && find . -type f -print0 | xargs sha256sum | sort -k2) \ - <(mkdir right && cd right && tar xf "$right" && find . -type f -print0 | xargs sha256sum | sort -k2) \ - | wc -l) - [ "$lines" = "0" ] - - But in a more mac friendly fashion. - """ - started_at = time.time() - - def hash_extracted(reader: Union[IO[bytes], None]) -> bytes: - assert reader is not None - digest = sha256(usedforsecurity=False) - while True: - buf = reader.read(64 * 1024) - if not buf: - break - digest.update(buf) - return digest.digest() - - def build_hash_list(p: Path) -> List[Tuple[str, bytes]]: - with tarfile.open(p) as f: - matching_files = (info for info in f if info.isreg() and info.name not in skip_files) - ret = list( - map(lambda info: (info.name, hash_extracted(f.extractfile(info))), matching_files) - ) - ret.sort(key=lambda t: t[0]) - return ret - - left_list, right_list = map(build_hash_list, [left, right]) - - try: - assert len(left_list) == len(right_list) - - for left_tuple, right_tuple in zip(left_list, right_list): - assert left_tuple == right_tuple - finally: - elapsed = time.time() - started_at - log.info(f"tar_cmp completed in {elapsed}s") + assert_pageserver_backups_equal(fullbackup_before, fullbackup_after, skip_files) def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder): @@ -483,7 +415,7 @@ def test_detached_receives_flushes_while_being_detached(neon_env_builder: NeonEn def test_compaction_induced_by_detaches_in_history( - neon_env_builder: NeonEnvBuilder, test_output_dir, pg_distrib_dir, pg_bin: PgBin + neon_env_builder: NeonEnvBuilder, test_output_dir, pg_bin: PgBin ): """ Assuming the tree of timelines: @@ -500,8 +432,6 @@ def test_compaction_induced_by_detaches_in_history( timeline broken. 
""" - psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")} - env = neon_env_builder.init_start( initial_tenant_conf={ # we want to create layers manually so we don't branch on arbitrary @@ -589,16 +519,9 @@ def test_compaction_induced_by_detaches_in_history( # take the fullbackup before and after inheriting the new L0s fullbackup_before = test_output_dir / "fullbackup-before.tar" - cmd = [ - "psql", - "--no-psqlrc", - env.pageserver.connstr(), - "-c", - f"fullbackup {env.initial_tenant} {branch_timeline_id} {branch_lsn}", - "-o", - str(fullbackup_before), - ] - pg_bin.run_capture(cmd, env=psql_env) + pg_bin.take_fullbackup( + env.pageserver, env.initial_tenant, branch_timeline_id, branch_lsn, fullbackup_before + ) for _, timeline_id in skip_main: reparented = client.detach_ancestor(env.initial_tenant, timeline_id) @@ -624,19 +547,12 @@ def test_compaction_induced_by_detaches_in_history( assert len(post_compact_l0s) == 1, "only the consecutive inherited L0s should be compacted" fullbackup_after = test_output_dir / "fullbackup_after.tar" - cmd = [ - "psql", - "--no-psqlrc", - env.pageserver.connstr(), - "-c", - f"fullbackup {env.initial_tenant} {branch_timeline_id} {branch_lsn}", - "-o", - str(fullbackup_after), - ] - pg_bin.run_capture(cmd, env=psql_env) + pg_bin.take_fullbackup( + env.pageserver, env.initial_tenant, branch_timeline_id, branch_lsn, fullbackup_after + ) # we don't need to skip any files, because zenith.signal will be identical - tar_cmp(fullbackup_before, fullbackup_after, set()) + assert_pageserver_backups_equal(fullbackup_before, fullbackup_after, set()) # TODO: From d1d55bbd9fbdca34d7f96c391f4a0368fb8c6583 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 22 May 2024 14:43:10 +0100 Subject: [PATCH 0838/1571] CI(report-benchmarks-failures): fix condition (#7820) ## Problem `report-benchmarks-failures` got skipped if a dependent job fails. ## Summary of changes - Fix the if-condition by adding `&& failures()` to it; it'll make the job run if the dependent job fails. 
--- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 14d19f7ae3..d8ad6e26d0 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -548,7 +548,7 @@ jobs: report-benchmarks-failures: needs: [ benchmarks, create-test-report ] - if: github.ref_name == 'main' && needs.benchmarks.result == 'failure' + if: github.ref_name == 'main' && failure() && needs.benchmarks.result == 'failure' runs-on: ubuntu-latest steps: From ce44dfe3532b22689464cbeddbf392d05049b9c4 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 22 May 2024 16:55:34 +0300 Subject: [PATCH 0839/1571] openapi: document timeline ancestor detach (#7650) The openapi description with the error descriptions: - 200 is used for "detached or has been detached previously" - 400 is used for "cannot be detached right now" -- it's an odd thing, but good enough - 404 is used for tenant or timeline not found - 409 is used for "can never be detached" (root timeline) - 500 is used for transient errors (basically ill-defined shutdown errors) - 503 is used for busy (other tenant ancestor detach underway, pageserver shutdown) Cc: #6994 --- pageserver/src/http/openapi_spec.yml | 87 +++++++++++++++++++ pageserver/src/http/routes.rs | 27 +++--- .../src/tenant/timeline/detach_ancestor.rs | 30 ++++++- .../regress/test_timeline_detach_ancestor.py | 36 +++++++- 4 files changed, 161 insertions(+), 19 deletions(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 107bcd4a22..e5eafc51f4 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -612,6 +612,80 @@ paths: schema: $ref: "#/components/schemas/Error" + /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor: + parameters: + - name: tenant_shard_id + in: path + required: true + schema: + type: string + - name: timeline_id + in: path + ŕequired: true + schema: + type: string + + put: + description: | + Detach a timeline from its ancestor and reparent all ancestors timelines with lower `ancestor_lsn`. + Current implementation might not be retryable across failure cases, but will be enhanced in future. + Detaching should be expected to be expensive operation. Timeouts should be retried. + responses: + "200": + description: | + The timeline has been detached from it's ancestor (now or earlier), and at least the returned timelines have been reparented. + If any timelines were deleted after reparenting, they might not be on this list. + content: + application/json: + schema: + $ref: "#/components/schemas/AncestorDetached" + + "400": + description: | + Number of early checks meaning the timeline cannot be detached now: + - the ancestor of timeline has an ancestor: not supported, see RFC + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + + "404": + description: Tenant or timeline not found. + content: + application/json: + schema: + $ref: "#/components/schemas/NotFoundError" + + "409": + description: | + The timeline can never be detached: + - timeline has no ancestor, implying that the timeline has never had an ancestor + content: + application/json: + schema: + $ref: "#/components/schemas/ConflictError" + + "500": + description: | + Transient error, for example, pageserver shutdown happened while + processing the request but we were unable to distinguish that. Must + be retried. 
+ content: + application/json: + schema: + $ref: "#/components/schemas/Error" + + "503": + description: | + Temporarily unavailable, please retry. Possible reasons: + - another timeline detach for the same tenant is underway, please retry later + - detected shutdown error + content: + application/json: + schema: + $ref: "#/components/schemas/ServiceUnavailableError" + + /v1/tenant/: get: description: Get tenants list @@ -1077,6 +1151,19 @@ components: format: int64 description: How many bytes of layer content were in the latest layer heatmap + AncestorDetached: + type: object + required: + - reparented_timelines + properties: + reparented_timelines: + type: array + description: Set of reparented timeline ids + properties: + type: string + format: hex + description: TimelineId + Error: type: object diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index c75e4ca5a9..34b9806a26 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -74,6 +74,7 @@ use crate::tenant::storage_layer::LayerAccessStatsReset; use crate::tenant::storage_layer::LayerName; use crate::tenant::timeline::CompactFlags; use crate::tenant::timeline::Timeline; +use crate::tenant::GetTimelineError; use crate::tenant::SpawnMode; use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError}; use crate::{config::PageServerConf, tenant::mgr}; @@ -279,6 +280,13 @@ impl From for ApiError { } } +impl From for ApiError { + fn from(gte: GetTimelineError) -> Self { + // Rationale: tenant is activated only after eligble timelines activate + ApiError::NotFound(gte.into()) + } +} + impl From for ApiError { fn from(e: GetActiveTenantError) -> ApiError { match e { @@ -643,9 +651,7 @@ async fn timeline_preserve_initdb_handler( .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; - let timeline = tenant - .get_timeline(timeline_id, false) - .map_err(|e| ApiError::NotFound(e.into()))?; + let timeline = tenant.get_timeline(timeline_id, false)?; timeline .preserve_initdb_archive() @@ -687,9 +693,7 @@ async fn timeline_detail_handler( tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; - let timeline = tenant - .get_timeline(timeline_id, false) - .map_err(|e| ApiError::NotFound(e.into()))?; + let timeline = tenant.get_timeline(timeline_id, false)?; let timeline_info = build_timeline_info( &timeline, @@ -1901,14 +1905,11 @@ async fn timeline_detach_ancestor_handler( let ctx = RequestContext::new(TaskKind::DetachAncestor, DownloadBehavior::Download); let ctx = &ctx; - let timeline = tenant - .get_timeline(timeline_id, true) - .map_err(|e| ApiError::NotFound(e.into()))?; + let timeline = tenant.get_timeline(timeline_id, true)?; let (_guard, prepared) = timeline .prepare_to_detach_from_ancestor(&tenant, options, ctx) - .await - .map_err(|e| ApiError::InternalServerError(e.into()))?; + .await?; let res = state .tenant_manager @@ -2042,9 +2043,7 @@ async fn active_timeline_of_active_tenant( tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; - tenant - .get_timeline(timeline_id, true) - .map_err(|e| ApiError::NotFound(e.into())) + Ok(tenant.get_timeline(timeline_id, true)?) 
} async fn always_panic_handler( diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 4d8e570181..e6ddabe5b5 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -12,7 +12,7 @@ use crate::{ }; use tokio_util::sync::CancellationToken; use tracing::Instrument; -use utils::{completion, generation::Generation, id::TimelineId, lsn::Lsn}; +use utils::{completion, generation::Generation, http::error::ApiError, id::TimelineId, lsn::Lsn}; #[derive(Debug, thiserror::Error)] pub(crate) enum Error { @@ -41,6 +41,27 @@ pub(crate) enum Error { Unexpected(#[source] anyhow::Error), } +impl From for ApiError { + fn from(value: Error) -> Self { + match value { + e @ Error::NoAncestor => ApiError::Conflict(e.to_string()), + // TODO: ApiError converts the anyhow using debug formatting ... just stop using ApiError? + e @ Error::TooManyAncestors => ApiError::BadRequest(anyhow::anyhow!("{}", e)), + Error::ShuttingDown => ApiError::ShuttingDown, + Error::OtherTimelineDetachOngoing(_) => { + ApiError::ResourceUnavailable("other timeline detach is already ongoing".into()) + } + // All of these contain shutdown errors, in fact, it's the most common + e @ Error::FlushAncestor(_) + | e @ Error::RewrittenDeltaDownloadFailed(_) + | e @ Error::CopyDeltaPrefix(_) + | e @ Error::UploadRewritten(_) + | e @ Error::CopyFailed(_) + | e @ Error::Unexpected(_) => ApiError::InternalServerError(e.into()), + } + } +} + pub(crate) struct PreparedTimelineDetach { layers: Vec, } @@ -75,6 +96,11 @@ pub(super) async fn prepare( .as_ref() .map(|tl| (tl.clone(), detached.ancestor_lsn)) else { + // TODO: check if we have already been detached; for this we need to read the stored data + // on remote client, for that we need a follow-up which makes uploads cheaper and maintains + // a projection of the commited data. 
+ // + // the error is wrong per openapi return Err(NoAncestor); }; @@ -84,7 +110,7 @@ pub(super) async fn prepare( if ancestor.ancestor_timeline.is_some() { // non-technical requirement; we could flatten N ancestors just as easily but we chose - // not to + // not to, at least initially return Err(TooManyAncestors); } diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 1563c161e0..f0b2f7d733 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -8,9 +8,13 @@ from typing import List, Tuple import pytest from fixtures.common_types import Lsn, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn -from fixtures.pageserver.http import HistoricLayerInfo -from fixtures.pageserver.utils import wait_timeline_detail_404 +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + PgBin, + wait_for_last_flush_lsn, +) +from fixtures.pageserver.http import HistoricLayerInfo, PageserverApiException +from fixtures.pageserver.utils import wait_tenant_status_404, wait_timeline_detail_404 from fixtures.remote_storage import LocalFsStorage from fixtures.utils import assert_pageserver_backups_equal @@ -555,6 +559,32 @@ def test_compaction_induced_by_detaches_in_history( assert_pageserver_backups_equal(fullbackup_before, fullbackup_after, set()) +def test_timeline_ancestor_errors(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + client = env.pageserver.http_client() + + with pytest.raises(PageserverApiException, match=".* no ancestors") as info: + client.detach_ancestor(env.initial_tenant, env.initial_timeline) + assert info.value.status_code == 409 + + first_branch = env.neon_cli.create_branch("first_branch") + second_branch = env.neon_cli.create_branch("second_branch", ancestor_branch_name="first_branch") + + # funnily enough this does not have a prefix + with pytest.raises(PageserverApiException, match="too many ancestors") as info: + client.detach_ancestor(env.initial_tenant, second_branch) + assert info.value.status_code == 400 + + client.tenant_delete(env.initial_tenant) + wait_tenant_status_404(client, env.initial_tenant, 10, 1) + + with pytest.raises(PageserverApiException) as e: + client.detach_ancestor(env.initial_tenant, first_branch) + assert e.value.status_code == 404 + + # TODO: # - after starting the operation, tenant is deleted # - after starting the operation, pageserver is shutdown, restarted From 8901ce9c99f068464e9ee15673f25569ff298f17 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 21 May 2024 18:26:47 -0500 Subject: [PATCH 0840/1571] Fix typos in action definitions --- .github/actions/neon-branch-create/action.yml | 6 +++--- .github/actions/neon-branch-delete/action.yml | 8 ++++---- .github/actions/neon-project-create/action.yml | 12 ++++++------ .github/actions/neon-project-delete/action.yml | 6 +++--- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/.github/actions/neon-branch-create/action.yml b/.github/actions/neon-branch-create/action.yml index dea3fc2357..9f752d5a89 100644 --- a/.github/actions/neon-branch-create/action.yml +++ b/.github/actions/neon-branch-create/action.yml @@ -3,13 +3,13 @@ description: 'Create Branch using API' inputs: api_key: - desctiption: 'Neon API key' + description: 'Neon API key' required: true project_id: - desctiption: 
'ID of the Project to create Branch in' + description: 'ID of the Project to create Branch in' required: true api_host: - desctiption: 'Neon API host' + description: 'Neon API host' default: console-stage.neon.build outputs: dsn: diff --git a/.github/actions/neon-branch-delete/action.yml b/.github/actions/neon-branch-delete/action.yml index 8acba7ad00..58141a4a3f 100644 --- a/.github/actions/neon-branch-delete/action.yml +++ b/.github/actions/neon-branch-delete/action.yml @@ -3,16 +3,16 @@ description: 'Delete Branch using API' inputs: api_key: - desctiption: 'Neon API key' + description: 'Neon API key' required: true project_id: - desctiption: 'ID of the Project which should be deleted' + description: 'ID of the Project which should be deleted' required: true branch_id: - desctiption: 'ID of the branch to delete' + description: 'ID of the branch to delete' required: true api_host: - desctiption: 'Neon API host' + description: 'Neon API host' default: console-stage.neon.build runs: diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index 7f0e599b97..4039a58b9e 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -3,22 +3,22 @@ description: 'Create Neon Project using API' inputs: api_key: - desctiption: 'Neon API key' + description: 'Neon API key' required: true region_id: - desctiption: 'Region ID, if not set the project will be created in the default region' + description: 'Region ID, if not set the project will be created in the default region' default: aws-us-east-2 postgres_version: - desctiption: 'Postgres version; default is 15' + description: 'Postgres version; default is 15' default: 15 api_host: - desctiption: 'Neon API host' + description: 'Neon API host' default: console-stage.neon.build provisioner: - desctiption: 'k8s-pod or k8s-neonvm' + description: 'k8s-pod or k8s-neonvm' default: 'k8s-pod' compute_units: - desctiption: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal' + description: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal' default: '[1, 1]' outputs: diff --git a/.github/actions/neon-project-delete/action.yml b/.github/actions/neon-project-delete/action.yml index b8ec6cac70..35e165fd61 100644 --- a/.github/actions/neon-project-delete/action.yml +++ b/.github/actions/neon-project-delete/action.yml @@ -3,13 +3,13 @@ description: 'Delete Neon Project using API' inputs: api_key: - desctiption: 'Neon API key' + description: 'Neon API key' required: true project_id: - desctiption: 'ID of the Project to delete' + description: 'ID of the Project to delete' required: true api_host: - desctiption: 'Neon API host' + description: 'Neon API host' default: console-stage.neon.build runs: From 900f39111507bc058bd541b66deddd66d6473443 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 21 May 2024 18:27:30 -0500 Subject: [PATCH 0841/1571] Make postgres_version action input default to a string This is "required" by GitHub Actions, though they must do some coersion on their side. 
--- .github/actions/neon-project-create/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index 4039a58b9e..16759ad038 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -10,7 +10,7 @@ inputs: default: aws-us-east-2 postgres_version: description: 'Postgres version; default is 15' - default: 15 + default: '15' api_host: description: 'Neon API host' default: console-stage.neon.build From 325f3784f9fe46b5152000ba50ae4a01c9fd68a4 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 22 May 2024 16:02:20 +0100 Subject: [PATCH 0842/1571] CI(promote-images): simplify & fix the job (#7826) ## Problem Currently, `latest` tag is added to the images in several cases: ``` github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' ``` This leads to a race; the `latest` tag jumps back and forth depending on the branch that has built images. ## Summary of changes - Do not push `latest` images to prod ECR (we don't use it) - Use `docker buildx imagetools` instead of `crane` for tagging images - Unify `vm-compute-node-image` job with others and use dockerhub as a first source for images (sync images with ECR) - Tag images with `latest` only for commits in `main` --- .github/workflows/build_and_test.yml | 117 ++++++++++++--------------- 1 file changed, 52 insertions(+), 65 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index d8ad6e26d0..5056025457 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -883,22 +883,39 @@ jobs: curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder chmod +x vm-builder + # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings + # The default value is ~/.docker + - name: Set custom docker config directory + run: | + mkdir -p .docker-custom + echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV + + - uses: docker/login-action@v3 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + # Note: we need a separate pull step here because otherwise vm-builder will try to pull, and # it won't have the proper authentication (written at v0.6.0) - name: Pulling compute-node image run: | - docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + docker pull neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} - name: Build vm image run: | ./vm-builder \ -spec=vm-image-spec.yaml \ - -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \ - -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + -src=neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \ + -dst=neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} - name: Pushing vm-compute-node image run: | - docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + docker push neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} + + - name: Remove custom docker config 
directory + if: always() + run: | + rm -rf .docker-custom test-images: needs: [ check-permissions, tag, neon-image, compute-node-image ] @@ -946,78 +963,48 @@ jobs: promote-images: needs: [ check-permissions, tag, test-images, vm-compute-node-image ] - runs-on: [ self-hosted, gen3, small ] - container: golang:1.19-bullseye - # Don't add if-condition here. - # The job should always be run because we have dependant other jobs that shouldn't be skipped + runs-on: ubuntu-latest + + env: + VERSIONS: v14 v15 v16 steps: - - name: Install Crane & ECR helper - run: | - go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0 - go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0 + - uses: docker/login-action@v3 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - name: Configure ECR login - run: | - mkdir /github/home/.docker/ - echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json + - uses: docker/login-action@v3 + with: + registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com + username: ${{ secrets.AWS_ACCESS_KEY_DEV }} + password: ${{ secrets.AWS_SECRET_KEY_DEV }} - - name: Copy vm-compute-node images to Docker Hub + - name: Copy vm-compute-node images to ECR run: | - crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14 - crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15 - crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} vm-compute-node-v16 + for version in ${VERSIONS}; do + docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} \ + neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} + done - name: Add latest tag to images - if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' + if: github.ref_name == 'main' run: | - crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest - crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest - crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest - crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest - crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest - crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest - crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} latest - crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest + for repo in neondatabase 369495373322.dkr.ecr.eu-central-1.amazonaws.com; do + docker buildx imagetools create -t $repo/neon:latest \ + $repo/neon:${{ needs.tag.outputs.build-tag }} - - name: Push images to production ECR - if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy' - run: | - crane copy 
369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:latest - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:latest + docker buildx imagetools create -t $repo/compute-tools:latest \ + $repo/compute-tools:${{ needs.tag.outputs.build-tag }} - - name: Configure Docker Hub login - run: | - # ECR Credential Helper & Docker Hub don't work together in config, hence reset - echo "" > /github/home/.docker/config.json - crane auth login -u ${{ secrets.NEON_DOCKERHUB_USERNAME }} -p ${{ secrets.NEON_DOCKERHUB_PASSWORD }} index.docker.io + for version in ${VERSIONS}; do + docker buildx imagetools create -t $repo/compute-node-${version}:latest \ + $repo/compute-node-${version}:${{ needs.tag.outputs.build-tag }} - - name: Push vm-compute-node to Docker Hub - run: | - crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} - crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} - crane push vm-compute-node-v16 neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} - - - name: Push latest tags to Docker Hub - if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy' - run: | - crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/compute-node-v16:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest - - - name: Cleanup ECR folder - run: rm -rf ~/.ecr + docker buildx imagetools create -t $repo/vm-compute-node-${version}:latest \ + $repo/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} + done + done trigger-custom-extensions-build-and-wait: needs: [ check-permissions, tag ] From a7f31f1a59aec1cdd66df71ffaf3b7ad176c4966 Mon Sep 17 00:00:00 2001 From: 
Alexander Bayandin Date: Wed, 22 May 2024 16:06:05 +0100 Subject: [PATCH 0843/1571] CI: build multi-arch images (#7696) ## Problem We don't build our docker images for ARM arch, and that makes it harder to run images on ARM (on MacBooks with Apple Silicon, for example). ## Summary of changes - Build `neondatabase/neon` for ARM and create a multi-arch image - Build `neondatabase/compute-node-vXX` for ARM and create a multi-arch image - Run `test-images` job on ARM as well --- .github/workflows/build_and_test.yml | 123 +++++++++++++++++----- docker-compose/compute_wrapper/Dockerfile | 2 +- docker-compose/docker_compose_test.sh | 2 - 3 files changed, 97 insertions(+), 30 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 5056025457..2ab1417d6d 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -723,9 +723,13 @@ jobs: uses: ./.github/workflows/trigger-e2e-tests.yml secrets: inherit - neon-image: + neon-image-arch: needs: [ check-permissions, build-build-tools-image, tag ] - runs-on: [ self-hosted, gen3, large ] + strategy: + matrix: + arch: [ x64, arm64 ] + + runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} steps: - name: Checkout @@ -747,12 +751,6 @@ jobs: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - uses: docker/login-action@v3 - with: - registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com - username: ${{ secrets.AWS_ACCESS_KEY_DEV }} - password: ${{ secrets.AWS_SECRET_KEY_DEV }} - - uses: docker/build-push-action@v5 with: context: . @@ -764,25 +762,52 @@ jobs: push: true pull: true file: Dockerfile - cache-from: type=registry,ref=neondatabase/neon:cache - cache-to: type=registry,ref=neondatabase/neon:cache,mode=max + cache-from: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }} + cache-to: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }},mode=max tags: | - 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} - neondatabase/neon:${{needs.tag.outputs.build-tag}} + neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} - name: Remove custom docker config directory if: always() run: | rm -rf .docker-custom - compute-node-image: - needs: [ check-permissions, build-build-tools-image, tag ] - runs-on: [ self-hosted, gen3, large ] + neon-image: + needs: [ neon-image-arch, tag ] + runs-on: ubuntu-latest + steps: + - uses: docker/login-action@v3 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + + - name: Create multi-arch image + run: | + docker buildx imagetools create -t neondatabase/neon:${{ needs.tag.outputs.build-tag }} \ + neondatabase/neon:${{ needs.tag.outputs.build-tag }}-x64 \ + neondatabase/neon:${{ needs.tag.outputs.build-tag }}-arm64 + + - uses: docker/login-action@v3 + with: + registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com + username: ${{ secrets.AWS_ACCESS_KEY_DEV }} + password: ${{ secrets.AWS_SECRET_KEY_DEV }} + + - name: Push multi-arch image to ECR + run: | + docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{ needs.tag.outputs.build-tag }} \ + neondatabase/neon:${{ needs.tag.outputs.build-tag }} + + compute-node-image-arch: + needs: [ check-permissions, build-build-tools-image, tag ] strategy: fail-fast: false matrix: version: [ v14, v15, v16 ] + arch: [ x64, arm64 ] + 
+ runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} steps: - name: Checkout @@ -829,15 +854,14 @@ jobs: push: true pull: true file: Dockerfile.compute-node - cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache - cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache,mode=max + cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }} + cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max tags: | - 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} - neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} - name: Build compute-tools image # compute-tools are Postgres independent, so build it only once - if: ${{ matrix.version == 'v16' }} + if: matrix.version == 'v16' uses: docker/build-push-action@v5 with: target: compute-tools-image @@ -851,14 +875,57 @@ jobs: pull: true file: Dockerfile.compute-node tags: | - 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} - neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} + neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} - name: Remove custom docker config directory if: always() run: | rm -rf .docker-custom + compute-node-image: + needs: [ compute-node-image-arch, tag ] + runs-on: ubuntu-latest + + strategy: + matrix: + version: [ v14, v15, v16 ] + + steps: + - uses: docker/login-action@v3 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + + - name: Create multi-arch compute-node image + run: | + docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \ + neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \ + neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64 + + - name: Create multi-arch compute-tools image + if: matrix.version == 'v16' + run: | + docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \ + neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-x64 \ + neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-arm64 + + - uses: docker/login-action@v3 + with: + registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com + username: ${{ secrets.AWS_ACCESS_KEY_DEV }} + password: ${{ secrets.AWS_SECRET_KEY_DEV }} + + - name: Push multi-arch compute-node-${{ matrix.version }} image to ECR + run: | + docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \ + neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} + + - name: Push multi-arch compute-tools image to ECR + if: matrix.version == 'v16' + run: | + docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \ + neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} + vm-compute-node-image: needs: [ check-permissions, tag, compute-node-image ] runs-on: [ self-hosted, gen3, large ] @@ -866,9 +933,6 @@ jobs: fail-fast: false matrix: 
version: [ v14, v15, v16 ] - defaults: - run: - shell: sh -eu {0} env: VM_BUILDER_VERSION: v0.28.1 @@ -919,7 +983,12 @@ jobs: test-images: needs: [ check-permissions, tag, neon-image, compute-node-image ] - runs-on: [ self-hosted, gen3, small ] + strategy: + fail-fast: false + matrix: + arch: [ x64, arm64 ] + + runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} steps: - name: Checkout @@ -937,7 +1006,7 @@ jobs: - name: Verify image versions shell: bash # ensure no set -e for better error messages run: | - pageserver_version=$(docker run --rm 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} "/bin/sh" "-c" "/usr/local/bin/pageserver --version") + pageserver_version=$(docker run --rm neondatabase/neon:${{ needs.tag.outputs.build-tag }} "/bin/sh" "-c" "/usr/local/bin/pageserver --version") echo "Pageserver version string: $pageserver_version" diff --git a/docker-compose/compute_wrapper/Dockerfile b/docker-compose/compute_wrapper/Dockerfile index f1b1986072..974dcd7f03 100644 --- a/docker-compose/compute_wrapper/Dockerfile +++ b/docker-compose/compute_wrapper/Dockerfile @@ -1,4 +1,4 @@ -ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com +ARG REPOSITORY=neondatabase ARG COMPUTE_IMAGE=compute-node-v14 ARG TAG=latest diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index e18b0f9176..062fc6fc92 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -8,8 +8,6 @@ # Their defaults point at DockerHub `neondatabase/neon:latest` image.`, # to verify custom image builds (e.g pre-published ones). -# XXX: Current does not work on M1 macs due to x86_64 Docker images compiled only, and no seccomp support in M1 Docker emulation layer. - set -eux -o pipefail SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" From e015b2bf3eb96c7a5d8b9bf0253ae13e569f5747 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 22 May 2024 16:10:58 +0100 Subject: [PATCH 0844/1571] safekeeper: use CancellationToken instead of watch channel (#7836) ## Problem Safekeeper Timeline uses a channel for cancellation, but we have a dedicated type for that. ## Summary of changes - Use CancellationToken in Timeline --- safekeeper/src/recovery.rs | 10 ++------- safekeeper/src/timeline.rs | 31 +++++++--------------------- safekeeper/src/timeline_manager.rs | 10 +-------- safekeeper/src/wal_backup_partial.rs | 14 +++---------- 4 files changed, 13 insertions(+), 52 deletions(-) diff --git a/safekeeper/src/recovery.rs b/safekeeper/src/recovery.rs index e8fa6c55f4..dfa1892c40 100644 --- a/safekeeper/src/recovery.rs +++ b/safekeeper/src/recovery.rs @@ -37,17 +37,11 @@ use crate::{ #[instrument(name = "recovery task", skip_all, fields(ttid = %tli.ttid))] pub async fn recovery_main(tli: Arc, conf: SafeKeeperConf) { info!("started"); - let mut cancellation_rx = match tli.get_cancellation_rx() { - Ok(rx) => rx, - Err(_) => { - info!("timeline canceled during task start"); - return; - } - }; + let cancel = tli.cancel.clone(); select! 
{ _ = recovery_main_loop(tli, conf) => { unreachable!() } - _ = cancellation_rx.changed() => { + _ = cancel.cancelled() => { info!("stopped"); } } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index da2e3f4538..89c157d514 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -6,6 +6,7 @@ use camino::Utf8PathBuf; use postgres_ffi::XLogSegNo; use serde::{Deserialize, Serialize}; use tokio::fs; +use tokio_util::sync::CancellationToken; use std::cmp::max; use std::ops::{Deref, DerefMut}; @@ -342,12 +343,8 @@ pub struct Timeline { walsenders: Arc, walreceivers: Arc, - /// Cancellation channel. Delete/cancel will send `true` here as a cancellation signal. - cancellation_tx: watch::Sender, - - /// Timeline should not be used after cancellation. Background tasks should - /// monitor this channel and stop eventually after receiving `true` from this channel. - cancellation_rx: watch::Receiver, + /// Delete/cancel will trigger this, background tasks should drop out as soon as it fires + pub(crate) cancel: CancellationToken, /// Directory where timeline state is stored. pub timeline_dir: Utf8PathBuf, @@ -376,7 +373,6 @@ impl Timeline { shared_state.sk.flush_lsn(), ))); let (shared_state_version_tx, shared_state_version_rx) = watch::channel(0); - let (cancellation_tx, cancellation_rx) = watch::channel(false); let walreceivers = WalReceivers::new(); Ok(Timeline { @@ -390,8 +386,7 @@ impl Timeline { mutex: RwLock::new(shared_state), walsenders: WalSenders::new(walreceivers.clone()), walreceivers, - cancellation_rx, - cancellation_tx, + cancel: CancellationToken::default(), timeline_dir: conf.timeline_dir(&ttid), walsenders_keep_horizon: conf.walsenders_keep_horizon, broker_active: AtomicBool::new(false), @@ -411,7 +406,6 @@ impl Timeline { let (term_flush_lsn_watch_tx, term_flush_lsn_watch_rx) = watch::channel(TermLsn::from((INVALID_TERM, Lsn::INVALID))); let (shared_state_version_tx, shared_state_version_rx) = watch::channel(0); - let (cancellation_tx, cancellation_rx) = watch::channel(false); let state = TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn); @@ -428,8 +422,7 @@ impl Timeline { mutex: RwLock::new(SharedState::create_new(conf, &ttid, state)?), walsenders: WalSenders::new(walreceivers.clone()), walreceivers, - cancellation_rx, - cancellation_tx, + cancel: CancellationToken::default(), timeline_dir: conf.timeline_dir(&ttid), walsenders_keep_horizon: conf.walsenders_keep_horizon, broker_active: AtomicBool::new(false), @@ -535,7 +528,7 @@ impl Timeline { /// eventually after receiving cancellation signal. fn cancel(&self, shared_state: &mut WriteGuardSharedState<'_>) { info!("timeline {} is cancelled", self.ttid); - let _ = self.cancellation_tx.send(true); + self.cancel.cancel(); // Close associated FDs. Nobody will be able to touch timeline data once // it is cancelled, so WAL storage won't be opened again. shared_state.sk.wal_store.close(); @@ -543,17 +536,7 @@ impl Timeline { /// Returns if timeline is cancelled. pub fn is_cancelled(&self) -> bool { - *self.cancellation_rx.borrow() - } - - /// Returns watch channel which gets value when timeline is cancelled. It is - /// guaranteed to have not cancelled value observed (errors otherwise). - pub fn get_cancellation_rx(&self) -> Result> { - let rx = self.cancellation_rx.clone(); - if *rx.borrow() { - bail!(TimelineError::Cancelled(self.ttid)); - } - Ok(rx) + self.cancel.is_cancelled() } /// Take a writing mutual exclusive lock on timeline shared_state. 
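
The pattern the safekeeper tasks switch to above is the stock tokio-util one; a minimal, self-contained sketch of it (illustrative only, not safekeeper code; assumes `tokio` and `tokio-util` as dependencies):

```rust
use std::time::Duration;
use tokio_util::sync::CancellationToken;

async fn background_loop(cancel: CancellationToken) {
    loop {
        tokio::select! {
            // resolves once cancel() has been called anywhere
            _ = cancel.cancelled() => {
                println!("stopped");
                return;
            }
            _ = tokio::time::sleep(Duration::from_secs(1)) => {
                println!("tick");
            }
        }
    }
}

#[tokio::main]
async fn main() {
    let cancel = CancellationToken::new();
    let task = tokio::spawn(background_loop(cancel.clone()));
    // Cancelling the token replaces sending `true` on a watch channel.
    cancel.cancel();
    task.await.unwrap();
}
```
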
diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index 52ad915065..e74ba37ad8 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -47,14 +47,6 @@ pub async fn main_task( conf: SafeKeeperConf, broker_active_set: Arc, ) { - let mut cancellation_rx = match tli.get_cancellation_rx() { - Ok(rx) => rx, - Err(_) => { - info!("timeline canceled during task start"); - return; - } - }; - scopeguard::defer! { if tli.is_cancelled() { info!("manager task finished"); @@ -129,7 +121,7 @@ pub async fn main_task( // wait until something changes. tx channels are stored under Arc, so they will not be // dropped until the manager task is finished. tokio::select! { - _ = cancellation_rx.changed() => { + _ = tli.cancel.cancelled() => { // timeline was deleted break 'outer state_snapshot; } diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index 200096ac5c..29e944bff3 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -277,14 +277,6 @@ pub async fn main_task(tli: Arc, conf: SafeKeeperConf) { debug!("started"); let await_duration = conf.partial_backup_timeout; - let mut cancellation_rx = match tli.get_cancellation_rx() { - Ok(rx) => rx, - Err(_) => { - info!("timeline canceled during task start"); - return; - } - }; - // sleep for random time to avoid thundering herd { let randf64 = rand::thread_rng().gen_range(0.0..1.0); @@ -327,7 +319,7 @@ pub async fn main_task(tli: Arc, conf: SafeKeeperConf) { && flush_lsn_rx.borrow().term == seg.term { tokio::select! { - _ = cancellation_rx.changed() => { + _ = backup.tli.cancel.cancelled() => { info!("timeline canceled"); return; } @@ -340,7 +332,7 @@ pub async fn main_task(tli: Arc, conf: SafeKeeperConf) { // if we don't have any data and zero LSNs, wait for something while flush_lsn_rx.borrow().lsn == Lsn(0) { tokio::select! { - _ = cancellation_rx.changed() => { + _ = backup.tli.cancel.cancelled() => { info!("timeline canceled"); return; } @@ -357,7 +349,7 @@ pub async fn main_task(tli: Arc, conf: SafeKeeperConf) { // waiting until timeout expires OR segno changes 'inner: loop { tokio::select! { - _ = cancellation_rx.changed() => { + _ = backup.tli.cancel.cancelled() => { info!("timeline canceled"); return; } From 62aac6c8add432da1ab2d206f19323e808622e8d Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 22 May 2024 18:13:45 +0300 Subject: [PATCH 0845/1571] fix(Layer): carry gate until eviction is complete (#7838) the gate was accidentially being dropped before the final blocking phase, possibly explaining the resident physical size global problems during deletions. it could had caused more harm as well, but the path is not actively being tested because cplane no longer puts locationconfigs with higher generation number during normal operation which prompted the last wave of fixes. Cc: #7341. 
--- pageserver/src/tenant/storage_layer/layer.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 45d61ce048..8c64621710 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -12,7 +12,7 @@ use std::time::{Duration, SystemTime}; use tracing::Instrument; use utils::id::TimelineId; use utils::lsn::Lsn; -use utils::sync::heavier_once_cell; +use utils::sync::{gate, heavier_once_cell}; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; @@ -1333,7 +1333,7 @@ impl LayerInner { is_good_to_continue(&rx.borrow_and_update())?; - let Ok(_gate) = timeline.gate.enter() else { + let Ok(gate) = timeline.gate.enter() else { return Err(EvictionCancelled::TimelineGone); }; @@ -1421,7 +1421,7 @@ impl LayerInner { Self::spawn_blocking(move || { let _span = span.entered(); - let res = self.evict_blocking(&timeline, &permit); + let res = self.evict_blocking(&timeline, &gate, &permit); let waiters = self.inner.initializer_count(); @@ -1447,6 +1447,7 @@ impl LayerInner { fn evict_blocking( &self, timeline: &Timeline, + _gate: &gate::GateGuard, _permit: &heavier_once_cell::InitPermit, ) -> Result<(), EvictionCancelled> { // now accesses to `self.inner.get_or_init*` wait on the semaphore or the `_permit` From 3404e76a51311a595e28beb7d0962488cafaf143 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 22 May 2024 14:36:13 +0300 Subject: [PATCH 0846/1571] Fix confusion between 1-based Buffer and 0-based index (#7825) The code was working correctly, but was incorrectly using Buffer for a 0-based index into the BufferDesc array. --- pgxn/neon/pagestore_smgr.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 249ad313b0..4361c74905 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -3216,7 +3216,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) BufferTag tag; uint32 hash; LWLock *partitionLock; - Buffer buffer; + int buf_id; bool no_redo_needed; if (old_redo_read_buffer_filter && old_redo_read_buffer_filter(record, block_id)) @@ -3254,9 +3254,9 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) else { /* Try to find the relevant buffer */ - buffer = BufTableLookup(&tag, hash); + buf_id = BufTableLookup(&tag, hash); - no_redo_needed = buffer < 0; + no_redo_needed = buf_id < 0; } /* In both cases st lwlsn past this WAL record */ SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno); From 921756402673e9e2fa764fbacf33457f05bc3d59 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 22 May 2024 14:57:09 +0300 Subject: [PATCH 0847/1571] Fix issues with determining request LSN in read replica (#7795) Don't set last-written LSN of a page when the record is replayed, only when the page is evicted from cache. For comparison, we don't update the last-written LSN on every page modification on the primary either, only when the page is evicted. Do update the last-written LSN when the page update is skipped in WAL redo, however. In neon_get_request_lsns(), don't be surprised if the last-written LSN is equal to the record being replayed. Use the LSN of the record being replayed as the request LSN in that case. Add a long comment explaining how that can happen. In neon_wallog_page, update last-written LSN also when Shutdown has been requested. 
We might still fetch and evict pages for a while, after shutdown has been requested, so we better continue to do that correctly. Enable the check that we don't evict a page with zero LSN also in standby, but make it a LOG message instead of PANIC Fixes issue https://github.com/neondatabase/neon/issues/7791 --- pgxn/neon/pagestore_smgr.c | 142 +++++++++++++++++++++--- test_runner/regress/test_hot_standby.py | 60 ++++++++++ 2 files changed, 185 insertions(+), 17 deletions(-) diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 4361c74905..41546eae85 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -1349,6 +1349,10 @@ PageIsEmptyHeapPage(char *buffer) return memcmp(buffer, empty_page.data, BLCKSZ) == 0; } +/* + * A page is being evicted from the shared buffer cache. Update the + * last-written LSN of the page, and WAL-log it if needed. + */ static void #if PG_MAJORVERSION_NUM < 16 neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force) @@ -1357,12 +1361,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co #endif { XLogRecPtr lsn = PageGetLSN((Page) buffer); - - if (ShutdownRequestPending) - return; - /* Don't log any pages if we're not allowed to do so. */ - if (!XLogInsertAllowed()) - return; + bool log_page; /* * Whenever a VM or FSM page is evicted, WAL-log it. FSM and (some) VM @@ -1371,9 +1370,21 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co * correctness, the non-logged updates are not critical. But we want to * have a reasonably up-to-date VM and FSM in the page server. */ - if ((force || forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM) && !RecoveryInProgress()) + log_page = false; + if (force) + { + Assert(XLogInsertAllowed()); + log_page = true; + } + else if (XLogInsertAllowed() && + !ShutdownRequestPending && + (forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM)) + { + log_page = true; + } + + if (log_page) { - /* FSM is never WAL-logged and we don't care. */ XLogRecPtr recptr; recptr = log_newpage_copy(&InfoFromSMgrRel(reln), forknum, blocknum, @@ -1386,7 +1397,8 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum, LSN_FORMAT_ARGS(lsn)))); } - else if (lsn == InvalidXLogRecPtr) + + if (lsn == InvalidXLogRecPtr) { /* * When PostgreSQL extends a relation, it calls smgrextend() with an @@ -1422,19 +1434,31 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum))); } - else + else if (forknum != FSM_FORKNUM && forknum != VISIBILITYMAP_FORKNUM) { - ereport(PANIC, + /* + * Its a bad sign if there is a page with zero LSN in the buffer + * cache in a standby, too. However, PANICing seems like a cure + * worse than the disease, as the damage has likely already been + * done in the primary. So in a standby, make this an assertion, + * and in a release build just LOG the error and soldier on. We + * update the last-written LSN of the page with a conservative + * value in that case, which is the last replayed LSN. + */ + ereport(RecoveryInProgress() ? 
LOG : PANIC, (errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is evicted with zero LSN", blocknum, RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum))); + Assert(false); + + lsn = GetXLogReplayRecPtr(NULL); /* in standby mode, soldier on */ } } else { ereport(SmgrTrace, - (errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X", + (errmsg(NEON_TAG "Evicting page %u of relation %u/%u/%u.%u with lsn=%X/%X", blocknum, RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum, LSN_FORMAT_ARGS(lsn)))); @@ -1527,8 +1551,92 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno) if (RecoveryInProgress()) { - /* Request the page at the last replayed LSN. */ - result.request_lsn = GetXLogReplayRecPtr(NULL); + /*--- + * In broad strokes, a replica always requests the page at the current + * replay LSN. But looking closer, what exactly is the replay LSN? Is + * it the last replayed record, or the record being replayed? And does + * the startup process performing the replay need to do something + * differently than backends running queries? Let's take a closer look + * at the different scenarios: + * + * 1. Startup process reads a page, last_written_lsn is old. + * + * Read the old version of the page. We will apply the WAL record on + * it to bring it up-to-date. + * + * We could read the new version, with the changes from this WAL + * record already applied, to offload the work of replaying the record + * to the pageserver. The pageserver might not have received the WAL + * record yet, though, so a read of the old page version and applying + * the record ourselves is likely faster. Also, the redo function + * might be surprised if the changes have already applied. That's + * normal during crash recovery, but not in hot standby. + * + * 2. Startup process reads a page, last_written_lsn == record we're + * replaying. + * + * Can this happen? There are a few theoretical cases when it might: + * + * A) The redo function reads the same page twice. We had already read + * and applied the changes once, and now we're reading it for the + * second time. That would be a rather silly thing for a redo + * function to do, and I'm not aware of any that would do it. + * + * B) The redo function modifies multiple pages, and it already + * applied the changes to one of the pages, released the lock on + * it, and is now reading a second page. Furthermore, the first + * page was already evicted from the buffer cache, and also from + * the last-written LSN cache, so that the per-relation or global + * last-written LSN was already updated. All the WAL redo functions + * hold the locks on pages that they modify, until all the changes + * have been modified (?), which would make that impossible. + * However, we skip the locking, if the page isn't currently in the + * page cache (see neon_redo_read_buffer_filter below). + * + * Even if the one of the above cases were possible in theory, they + * would also require the pages being modified by the redo function to + * be immediately evicted from the page cache. + * + * So this probably does not happen in practice. But if it does, we + * request the new version, including the changes from the record + * being replayed. That seems like the correct behavior in any case. + * + * 3. Backend process reads a page with old last-written LSN + * + * Nothing special here. Read the old version. + * + * 4. 
Backend process reads a page with last_written_lsn == record being replayed + * + * This can happen, if the redo function has started to run, and saw + * that the page isn't present in the page cache (see + * neon_redo_read_buffer_filter below). Normally, in a normal + * Postgres server, the redo function would hold a lock on the page, + * so we would get blocked waiting the redo function to release the + * lock. To emulate that, wait for the WAL replay of the record to + * finish. + */ + /* Request the page at the end of the last fully replayed LSN. */ + XLogRecPtr replay_lsn = GetXLogReplayRecPtr(NULL); + + if (last_written_lsn > replay_lsn) + { + /* GetCurrentReplayRecPtr was introduced in v15 */ +#if PG_VERSION_NUM >= 150000 + Assert(last_written_lsn == GetCurrentReplayRecPtr(NULL)); +#endif + + /* + * Cases 2 and 4. If this is a backend (case 4), the + * neon_read_at_lsn() call later will wait for the WAL record to be + * fully replayed. + */ + result.request_lsn = last_written_lsn; + } + else + { + /* cases 1 and 3 */ + result.request_lsn = replay_lsn; + } result.not_modified_since = last_written_lsn; result.effective_request_lsn = result.request_lsn; Assert(last_written_lsn <= result.request_lsn); @@ -3258,16 +3366,16 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) no_redo_needed = buf_id < 0; } - /* In both cases st lwlsn past this WAL record */ - SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno); /* * we don't have the buffer in memory, update lwLsn past this record, also * evict page from file cache */ if (no_redo_needed) + { + SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno); lfc_evict(rinfo, forknum, blkno); - + } LWLockRelease(partitionLock); diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index 244d482c18..cf7a1c56ee 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -1,3 +1,4 @@ +import asyncio import os import re import threading @@ -292,3 +293,62 @@ def test_hot_standby_feedback(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): assert slot_xmin is None wait_until(10, 1.0, xmin_is_null) + + +# Test race condition between WAL replay and backends performing queries +# https://github.com/neondatabase/neon/issues/7791 +def test_replica_query_race(neon_simple_env: NeonEnv): + env = neon_simple_env + + primary_ep = env.endpoints.create_start( + branch_name="main", + endpoint_id="primary", + ) + + with primary_ep.connect() as p_con: + with p_con.cursor() as p_cur: + p_cur.execute("CREATE EXTENSION neon_test_utils") + p_cur.execute("CREATE TABLE test AS SELECT 0 AS counter") + + standby_ep = env.endpoints.new_replica_start(origin=primary_ep, endpoint_id="standby") + time.sleep(1) + + # In primary, run a lot of UPDATEs on a single page + finished = False + writecounter = 1 + + async def primary_workload(): + nonlocal writecounter, finished + conn = await primary_ep.connect_async() + while writecounter < 10000: + writecounter += 1 + await conn.execute(f"UPDATE test SET counter = {writecounter}") + finished = True + + # In standby, at the same time, run queries on it. And repeatedly drop caches + async def standby_workload(): + nonlocal writecounter, finished + conn = await standby_ep.connect_async() + reads = 0 + while not finished: + readcounter = await conn.fetchval("SELECT counter FROM test") + + # Check that the replica is keeping up with the primary. 
In local + # testing, the lag between primary and standby is much smaller, in + # the ballpark of 2-3 counter values. But be generous in case there's + # some hiccup. + # assert(writecounter - readcounter < 1000) + assert readcounter <= writecounter + if reads % 100 == 0: + log.info(f"read {reads}: counter {readcounter}, last update {writecounter}") + reads += 1 + + await conn.execute("SELECT clear_buffer_cache()") + + async def both(): + await asyncio.gather( + primary_workload(), + standby_workload(), + ) + + asyncio.run(both()) From 37f81289c2d1b27f88316980fb6b307e7e46d409 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 22 May 2024 18:24:52 +0300 Subject: [PATCH 0848/1571] Make 'neon.protocol_version = 2' the default, take two (#7819) Once all the computes in production have restarted, we can remove protocol version 1 altogether. See issue #6211. This was done earlier already in commit 0115fe6cb2, but reverted before it was released to production in commit bbe730d7ca because of issue https://github.com/neondatabase/neon/issues/7692. That issue was fixed in commit 22afaea6e1, so we are ready to change the default again. --- pgxn/neon/libpagestore.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index b7b1e7ccbf..f5ce2caff3 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -49,7 +49,7 @@ char *neon_auth_token; int readahead_buffer_size = 128; int flush_every_n_requests = 8; -int neon_protocol_version = 1; +int neon_protocol_version = 2; static int n_reconnect_attempts = 0; static int max_reconnect_attempts = 60; @@ -860,7 +860,7 @@ pg_init_libpagestore(void) "Version of compute<->page server protocol", NULL, &neon_protocol_version, - 1, /* default to old protocol for now */ + 2, /* use protocol version 2 */ 1, /* min */ 2, /* max */ PGC_SU_BACKEND, From 64577cfddcdac89d649e1ba6db3a6f44e14e2eee Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 22 May 2024 12:41:13 -0400 Subject: [PATCH 0849/1571] feat(pageserver): auto-detect previous aux file policy (#7841) ## Problem If an existing user already has some aux v1 files, we don't want to switch them to the global tenant-level config. Part of #7462 --------- Signed-off-by: Alex Chi Z --- pageserver/src/pgdatadir_mapping.rs | 29 +++++++++++++ pageserver/src/tenant.rs | 67 ++++++++++++++++++++++++++++- 2 files changed, 95 insertions(+), 1 deletion(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index f9d8c1020d..7dea687c46 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1480,6 +1480,29 @@ impl<'a> DatadirModification<'a> { // Allowed switch path: // * no aux files -> v1/v2/cross-validation // * cross-validation->v2 + + let current_policy = if current_policy.is_none() { + // This path will only be hit once per tenant: we will decide the final policy in this code block. + // The next call to `put_file` will always have `last_aux_file_policy != None`. 
+ let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn); + let aux_files_key_v1 = self.tline.list_aux_files_v1(lsn, ctx).await?; + if aux_files_key_v1.is_empty() { + None + } else { + self.tline + .last_aux_file_policy + .store(Some(AuxFilePolicy::V1)); + self.tline + .remote_client + .schedule_index_upload_for_aux_file_policy_update(Some( + AuxFilePolicy::V1, + ))?; + Some(AuxFilePolicy::V1) + } + } else { + current_policy + }; + if AuxFilePolicy::is_valid_migration_path(current_policy, switch_policy) { self.tline.last_aux_file_policy.store(Some(switch_policy)); self.tline @@ -1775,6 +1798,12 @@ impl<'a> DatadirModification<'a> { self.tline.get(key, lsn, ctx).await } + /// Only used during unit tests, force putting a key into the modification. + #[cfg(test)] + pub(crate) fn put_for_test(&mut self, key: Key, val: Value) { + self.put(key, val); + } + fn put(&mut self, key: Key, val: Value) { let values = self.pending_updates.entry(key).or_default(); // Replace the previous value if it exists at the same lsn diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 1a66f2c919..caf26e0a0b 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3964,18 +3964,20 @@ mod tests { use super::*; use crate::keyspace::KeySpaceAccum; + use crate::pgdatadir_mapping::AuxFilesDirectory; use crate::repository::{Key, Value}; use crate::tenant::harness::*; use crate::tenant::timeline::CompactFlags; use crate::DEFAULT_PG_VERSION; use bytes::{Bytes, BytesMut}; use hex_literal::hex; - use pageserver_api::key::{AUX_KEY_PREFIX, NON_INHERITED_RANGE}; + use pageserver_api::key::{AUX_FILES_KEY, AUX_KEY_PREFIX, NON_INHERITED_RANGE}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::CompactionAlgorithm; use rand::{thread_rng, Rng}; use tests::storage_layer::ValuesReconstructState; use tests::timeline::{GetVectoredError, ShutdownMode}; + use utils::bin_ser::BeSer; static TEST_KEY: Lazy = Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001"))); @@ -5997,6 +5999,69 @@ mod tests { ); } + #[tokio::test] + async fn aux_file_policy_auto_detect() { + let mut harness = TenantHarness::create("aux_file_policy_auto_detect").unwrap(); + harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V2; // set to cross-validation mode + let (tenant, ctx) = harness.load().await; + + let mut lsn = Lsn(0x08); + + let tline: Arc = tenant + .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + assert_eq!( + tline.last_aux_file_policy.load(), + None, + "no aux file is written so it should be unset" + ); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + let buf = AuxFilesDirectory::ser(&AuxFilesDirectory { + files: vec![( + "test_file".to_string(), + Bytes::copy_from_slice(b"test_file"), + )] + .into_iter() + .collect(), + }) + .unwrap(); + modification.put_for_test(AUX_FILES_KEY, Value::Image(Bytes::from(buf))); + modification.commit(&ctx).await.unwrap(); + } + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test1", b"first", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + assert_eq!( + tline.last_aux_file_policy.load(), + Some(AuxFilePolicy::V1), + "keep using v1 because there are aux files writting with v1" + ); + + // we can still read the auxfile v1 + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!( + files.get("pg_logical/mappings/test1"), + 
Some(&bytes::Bytes::from_static(b"first")) + ); + assert_eq!( + files.get("test_file"), + Some(&bytes::Bytes::from_static(b"test_file")) + ); + } + #[tokio::test] async fn test_metadata_image_creation() -> anyhow::Result<()> { let harness = TenantHarness::create("test_metadata_image_creation")?; From 9cfe08e3d9f1181a163322705ff41cbcfb11db3b Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 22 May 2024 18:05:43 +0100 Subject: [PATCH 0850/1571] proxy password threadpool (#7806) ## Problem Despite making password hashing async, it can still take time away from the network code. ## Summary of changes Introduce a custom threadpool, inspired by rayon. Features: ### Fairness Each task is tagged with it's endpoint ID. The more times we have seen the endpoint, the more likely we are to skip the task if it comes up in the queue. This is using a min-count-sketch estimator for the number of times we have seen the endpoint, resetting it every 1000+ steps. Since tasks are immediately rescheduled if they do not complete, the worker could get stuck in a "always work available loop". To combat this, we check the global queue every 61 steps to ensure all tasks quickly get a worker assigned to them. ### Balanced Using crossbeam_deque, like rayon does, we have workstealing out of the box. I've tested it a fair amount and it seems to balance the workload accordingly --- Cargo.lock | 20 +- Cargo.toml | 2 + proxy/Cargo.toml | 4 +- proxy/src/auth/backend.rs | 10 +- proxy/src/auth/backend/hacks.rs | 11 +- proxy/src/auth/flow.rs | 24 ++- proxy/src/bin/proxy.rs | 8 + proxy/src/config.rs | 2 + proxy/src/metrics.rs | 89 +++++++-- proxy/src/scram.rs | 18 +- proxy/src/scram/countmin.rs | 173 +++++++++++++++++ proxy/src/scram/exchange.rs | 49 ++--- proxy/src/scram/pbkdf2.rs | 89 +++++++++ proxy/src/scram/threadpool.rs | 321 ++++++++++++++++++++++++++++++++ proxy/src/serverless/backend.rs | 11 +- workspace_hack/Cargo.toml | 2 + 16 files changed, 759 insertions(+), 74 deletions(-) create mode 100644 proxy/src/scram/countmin.rs create mode 100644 proxy/src/scram/pbkdf2.rs create mode 100644 proxy/src/scram/threadpool.rs diff --git a/Cargo.lock b/Cargo.lock index e6060c82f5..d8f9021eb8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1471,26 +1471,21 @@ dependencies = [ [[package]] name = "crossbeam-deque" -version = "0.8.3" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" +checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" dependencies = [ - "cfg-if", "crossbeam-epoch", "crossbeam-utils", ] [[package]] name = "crossbeam-epoch" -version = "0.9.14" +version = "0.9.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46bd5f3f85273295a9d14aedfb86f6aadbff6d8f5295c4a9edb08e819dcf5695" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" dependencies = [ - "autocfg", - "cfg-if", "crossbeam-utils", - "memoffset 0.8.0", - "scopeguard", ] [[package]] @@ -3961,9 +3956,9 @@ checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" [[package]] name = "pbkdf2" -version = "0.12.1" +version = "0.12.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0ca0b5a68607598bf3bad68f32227a8164f6254833f84eafaac409cd6746c31" +checksum = "f8ed6a7761f76e3b9f92dfb0a60a6a6477c61024b775147ff0973a02653abaf2" dependencies = [ "digest", "hmac", @@ -4386,6 +4381,7 @@ dependencies = [ name = "proxy" version = "0.1.0" 
dependencies = [ + "ahash", "anyhow", "async-compression", "async-trait", @@ -4402,6 +4398,7 @@ dependencies = [ "chrono", "clap", "consumption_metrics", + "crossbeam-deque", "dashmap", "env_logger", "fallible-iterator", @@ -7473,6 +7470,7 @@ dependencies = [ name = "workspace_hack" version = "0.1.0" dependencies = [ + "ahash", "anyhow", "aws-config", "aws-runtime", diff --git a/Cargo.toml b/Cargo.toml index 2a7dea447e..0887c039f8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,6 +41,7 @@ license = "Apache-2.0" ## All dependency versions, used in the project [workspace.dependencies] +ahash = "0.8" anyhow = { version = "1.0", features = ["backtrace"] } arc-swap = "1.6" async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] } @@ -74,6 +75,7 @@ clap = { version = "4.0", features = ["derive"] } comfy-table = "6.1" const_format = "0.2" crc32c = "0.6" +crossbeam-deque = "0.8.5" crossbeam-utils = "0.8.5" dashmap = { version = "5.5.0", features = ["raw-api"] } either = "1.8" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 5f9b0aa75b..7da0763bc1 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -9,6 +9,7 @@ default = [] testing = [] [dependencies] +ahash.workspace = true anyhow.workspace = true async-compression.workspace = true async-trait.workspace = true @@ -24,6 +25,7 @@ camino.workspace = true chrono.workspace = true clap.workspace = true consumption_metrics.workspace = true +crossbeam-deque.workspace = true dashmap.workspace = true env_logger.workspace = true framed-websockets.workspace = true @@ -52,7 +54,6 @@ opentelemetry.workspace = true parking_lot.workspace = true parquet.workspace = true parquet_derive.workspace = true -pbkdf2 = { workspace = true, features = ["simple", "std"] } pin-project-lite.workspace = true postgres_backend.workspace = true pq_proto.workspace = true @@ -106,6 +107,7 @@ workspace_hack.workspace = true camino-tempfile.workspace = true fallible-iterator.workspace = true tokio-tungstenite.workspace = true +pbkdf2 = { workspace = true, features = ["simple", "std"] } rcgen.workspace = true rstest.workspace = true tokio-postgres-rustls.workspace = true diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 6a906b299b..3555eba543 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -365,7 +365,10 @@ async fn authenticate_with_secret( config: &'static AuthenticationConfig, ) -> auth::Result { if let Some(password) = unauthenticated_password { - let auth_outcome = validate_password_and_exchange(&password, secret).await?; + let ep = EndpointIdInt::from(&info.endpoint); + + let auth_outcome = + validate_password_and_exchange(&config.thread_pool, ep, &password, secret).await?; let keys = match auth_outcome { crate::sasl::Outcome::Success(key) => key, crate::sasl::Outcome::Failure(reason) => { @@ -386,7 +389,7 @@ async fn authenticate_with_secret( // Currently, we use it for websocket connections (latency). if allow_cleartext { ctx.set_auth_method(crate::context::AuthMethod::Cleartext); - return hacks::authenticate_cleartext(ctx, info, client, secret).await; + return hacks::authenticate_cleartext(ctx, info, client, secret, config).await; } // Finally, proceed with the main auth flow (SCRAM-based). 
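Editor's note: the call sites above now hand the endpoint and the shared thread pool to validate_password_and_exchange. For the control flow a caller sees, here is a rough, generic sketch: the CPU-heavy hashing runs off the async executor and the async side only awaits a oneshot receiver, which is also the shape of the pool's spawn_job in this patch. The helper names (offload_hash, hash_password_blocking) and the fake hash are illustrative stand-ins, not the proxy's actual pool, whose fairness and work stealing appear later in threadpool.rs:

    use tokio::sync::oneshot;

    // Stand-in for the real PBKDF2 work; just mixes bytes deterministically.
    fn hash_password_blocking(password: &[u8]) -> [u8; 32] {
        let mut out = [0u8; 32];
        for (i, b) in password.iter().enumerate() {
            out[i % 32] = out[i % 32].wrapping_add(*b);
        }
        out
    }

    // Queue the job somewhere off the async executor and hand back a receiver.
    fn offload_hash(password: Vec<u8>) -> oneshot::Receiver<[u8; 32]> {
        let (tx, rx) = oneshot::channel();
        std::thread::spawn(move || {
            let _ = tx.send(hash_password_blocking(&password));
        });
        rx
    }

    #[tokio::main]
    async fn main() {
        let key = offload_hash(b"Ne0n_!5_50_C007".to_vec())
            .await
            .expect("job should not be cancelled");
        println!("derived a {}-byte key", key.len());
    }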
@@ -554,7 +557,7 @@ mod tests { context::RequestMonitoring, proxy::NeonOptions, rate_limiter::{EndpointRateLimiter, RateBucketInfo}, - scram::ServerSecret, + scram::{threadpool::ThreadPool, ServerSecret}, stream::{PqStream, Stream}, }; @@ -596,6 +599,7 @@ mod tests { } static CONFIG: Lazy = Lazy::new(|| AuthenticationConfig { + thread_pool: ThreadPool::new(1), scram_protocol_timeout: std::time::Duration::from_secs(5), rate_limiter_enabled: true, rate_limiter: AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET), diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs index f7241be4a9..6b0f5e1726 100644 --- a/proxy/src/auth/backend/hacks.rs +++ b/proxy/src/auth/backend/hacks.rs @@ -3,8 +3,10 @@ use super::{ }; use crate::{ auth::{self, AuthFlow}, + config::AuthenticationConfig, console::AuthSecret, context::RequestMonitoring, + intern::EndpointIdInt, sasl, stream::{self, Stream}, }; @@ -20,6 +22,7 @@ pub async fn authenticate_cleartext( info: ComputeUserInfo, client: &mut stream::PqStream>, secret: AuthSecret, + config: &'static AuthenticationConfig, ) -> auth::Result { warn!("cleartext auth flow override is enabled, proceeding"); ctx.set_auth_method(crate::context::AuthMethod::Cleartext); @@ -27,8 +30,14 @@ pub async fn authenticate_cleartext( // pause the timer while we communicate with the client let paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client); + let ep = EndpointIdInt::from(&info.endpoint); + let auth_flow = AuthFlow::new(client) - .begin(auth::CleartextPassword(secret)) + .begin(auth::CleartextPassword { + secret, + endpoint: ep, + pool: config.thread_pool.clone(), + }) .await?; drop(paused); // cleartext auth is only allowed to the ws/http protocol. diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 45bbad8cb2..59d1ac17f4 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -5,12 +5,14 @@ use crate::{ config::TlsServerEndPoint, console::AuthSecret, context::RequestMonitoring, - sasl, scram, + intern::EndpointIdInt, + sasl, + scram::{self, threadpool::ThreadPool}, stream::{PqStream, Stream}, }; use postgres_protocol::authentication::sasl::{SCRAM_SHA_256, SCRAM_SHA_256_PLUS}; use pq_proto::{BeAuthenticationSaslMessage, BeMessage, BeMessage as Be}; -use std::io; +use std::{io, sync::Arc}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; @@ -53,7 +55,11 @@ impl AuthMethod for PasswordHack { /// Use clear-text password auth called `password` in docs /// -pub struct CleartextPassword(pub AuthSecret); +pub struct CleartextPassword { + pub pool: Arc, + pub endpoint: EndpointIdInt, + pub secret: AuthSecret, +} impl AuthMethod for CleartextPassword { #[inline(always)] @@ -126,7 +132,13 @@ impl AuthFlow<'_, S, CleartextPassword> { .strip_suffix(&[0]) .ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?; - let outcome = validate_password_and_exchange(password, self.state.0).await?; + let outcome = validate_password_and_exchange( + &self.state.pool, + self.state.endpoint, + password, + self.state.secret, + ) + .await?; if let sasl::Outcome::Success(_) = &outcome { self.stream.write_message_noflush(&Be::AuthenticationOk)?; @@ -181,6 +193,8 @@ impl AuthFlow<'_, S, Scram<'_>> { } pub(crate) async fn validate_password_and_exchange( + pool: &ThreadPool, + endpoint: EndpointIdInt, password: &[u8], secret: AuthSecret, ) -> super::Result> { @@ -194,7 +208,7 @@ pub(crate) async fn validate_password_and_exchange( } // perform scram authentication as both client and server to validate the keys 
AuthSecret::Scram(scram_secret) => { - let outcome = crate::scram::exchange(&scram_secret, password).await?; + let outcome = crate::scram::exchange(pool, endpoint, &scram_secret, password).await?; let client_key = match outcome { sasl::Outcome::Success(client_key) => client_key, diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index be7d961b8c..30f2e6f4b7 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -27,6 +27,7 @@ use proxy::redis::cancellation_publisher::RedisPublisherClient; use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; use proxy::redis::elasticache; use proxy::redis::notifications; +use proxy::scram::threadpool::ThreadPool; use proxy::serverless::cancel_set::CancelSet; use proxy::serverless::GlobalConnPoolOptions; use proxy::usage_metrics; @@ -132,6 +133,9 @@ struct ProxyCliArgs { /// timeout for scram authentication protocol #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] scram_protocol_timeout: tokio::time::Duration, + /// size of the threadpool for password hashing + #[clap(long, default_value_t = 4)] + scram_thread_pool_size: u8, /// Require that all incoming requests have a Proxy Protocol V2 packet **and** have an IP address associated. #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] require_client_ip: bool, @@ -489,6 +493,9 @@ async fn main() -> anyhow::Result<()> { /// ProxyConfig is created at proxy startup, and lives forever. fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { + let thread_pool = ThreadPool::new(args.scram_thread_pool_size); + Metrics::install(thread_pool.metrics.clone()); + let tls_config = match (&args.tls_key, &args.tls_cert) { (Some(key_path), Some(cert_path)) => Some(config::configure_tls( key_path, @@ -624,6 +631,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold, }; let authentication_config = AuthenticationConfig { + thread_pool, scram_protocol_timeout: args.scram_protocol_timeout, rate_limiter_enabled: args.auth_rate_limit_enabled, rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()), diff --git a/proxy/src/config.rs b/proxy/src/config.rs index b7ab2c00f9..5a0c251ce2 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -2,6 +2,7 @@ use crate::{ auth::{self, backend::AuthRateLimiter}, console::locks::ApiLocks, rate_limiter::RateBucketInfo, + scram::threadpool::ThreadPool, serverless::{cancel_set::CancelSet, GlobalConnPoolOptions}, Host, }; @@ -61,6 +62,7 @@ pub struct HttpConfig { } pub struct AuthenticationConfig { + pub thread_pool: Arc, pub scram_protocol_timeout: tokio::time::Duration, pub rate_limiter_enabled: bool, pub rate_limiter: AuthRateLimiter, diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 1590316925..e2a75a8720 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -1,11 +1,11 @@ -use std::sync::OnceLock; +use std::sync::{Arc, OnceLock}; use lasso::ThreadedRodeo; use measured::{ - label::StaticLabelSet, + label::{FixedCardinalitySet, LabelName, LabelSet, LabelValue, StaticLabelSet}, metric::{histogram::Thresholds, name::MetricName}, - Counter, CounterVec, FixedCardinalityLabel, Gauge, Histogram, HistogramVec, LabelGroup, - MetricGroup, + Counter, CounterVec, FixedCardinalityLabel, Gauge, GaugeVec, Histogram, HistogramVec, + LabelGroup, MetricGroup, }; use 
metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec}; @@ -14,26 +14,36 @@ use tokio::time::{self, Instant}; use crate::console::messages::ColdStartInfo; #[derive(MetricGroup)] +#[metric(new(thread_pool: Arc))] pub struct Metrics { #[metric(namespace = "proxy")] + #[metric(init = ProxyMetrics::new(thread_pool))] pub proxy: ProxyMetrics, #[metric(namespace = "wake_compute_lock")] pub wake_compute_lock: ApiLockMetrics, } +static SELF: OnceLock = OnceLock::new(); impl Metrics { + pub fn install(thread_pool: Arc) { + SELF.set(Metrics::new(thread_pool)) + .ok() + .expect("proxy metrics must not be installed more than once"); + } + pub fn get() -> &'static Self { - static SELF: OnceLock = OnceLock::new(); - SELF.get_or_init(|| Metrics { - proxy: ProxyMetrics::default(), - wake_compute_lock: ApiLockMetrics::new(), - }) + #[cfg(test)] + return SELF.get_or_init(|| Metrics::new(Arc::new(ThreadPoolMetrics::new(0)))); + + #[cfg(not(test))] + SELF.get() + .expect("proxy metrics must be installed by the main() function") } } #[derive(MetricGroup)] -#[metric(new())] +#[metric(new(thread_pool: Arc))] pub struct ProxyMetrics { #[metric(flatten)] pub db_connections: CounterPairVec, @@ -129,6 +139,10 @@ pub struct ProxyMetrics { #[metric(namespace = "connect_compute_lock")] pub connect_compute_lock: ApiLockMetrics, + + #[metric(namespace = "scram_pool")] + #[metric(init = thread_pool)] + pub scram_pool: Arc, } #[derive(MetricGroup)] @@ -146,12 +160,6 @@ pub struct ApiLockMetrics { pub semaphore_acquire_seconds: Histogram<16>, } -impl Default for ProxyMetrics { - fn default() -> Self { - Self::new() - } -} - impl Default for ApiLockMetrics { fn default() -> Self { Self::new() @@ -553,3 +561,52 @@ pub enum RedisEventsCount { PasswordUpdate, AllowedIpsUpdate, } + +pub struct ThreadPoolWorkers(usize); +pub struct ThreadPoolWorkerId(pub usize); + +impl LabelValue for ThreadPoolWorkerId { + fn visit(&self, v: V) -> V::Output { + v.write_int(self.0 as i64) + } +} + +impl LabelGroup for ThreadPoolWorkerId { + fn visit_values(&self, v: &mut impl measured::label::LabelGroupVisitor) { + v.write_value(LabelName::from_str("worker"), self); + } +} + +impl LabelSet for ThreadPoolWorkers { + type Value<'a> = ThreadPoolWorkerId; + + fn dynamic_cardinality(&self) -> Option { + Some(self.0) + } + + fn encode(&self, value: Self::Value<'_>) -> Option { + (value.0 < self.0).then_some(value.0) + } + + fn decode(&self, value: usize) -> Self::Value<'_> { + ThreadPoolWorkerId(value) + } +} + +impl FixedCardinalitySet for ThreadPoolWorkers { + fn cardinality(&self) -> usize { + self.0 + } +} + +#[derive(MetricGroup)] +#[metric(new(workers: usize))] +pub struct ThreadPoolMetrics { + pub injector_queue_depth: Gauge, + #[metric(init = GaugeVec::with_label_set(ThreadPoolWorkers(workers)))] + pub worker_queue_depth: GaugeVec, + #[metric(init = CounterVec::with_label_set(ThreadPoolWorkers(workers)))] + pub worker_task_turns_total: CounterVec, + #[metric(init = CounterVec::with_label_set(ThreadPoolWorkers(workers)))] + pub worker_task_skips_total: CounterVec, +} diff --git a/proxy/src/scram.rs b/proxy/src/scram.rs index ed80675f8a..862facb4e5 100644 --- a/proxy/src/scram.rs +++ b/proxy/src/scram.rs @@ -6,11 +6,14 @@ //! * //! 
* +mod countmin; mod exchange; mod key; mod messages; +mod pbkdf2; mod secret; mod signature; +pub mod threadpool; pub use exchange::{exchange, Exchange}; pub use key::ScramKey; @@ -56,9 +59,13 @@ fn sha256<'a>(parts: impl IntoIterator) -> [u8; 32] { #[cfg(test)] mod tests { - use crate::sasl::{Mechanism, Step}; + use crate::{ + intern::EndpointIdInt, + sasl::{Mechanism, Step}, + EndpointId, + }; - use super::{Exchange, ServerSecret}; + use super::{threadpool::ThreadPool, Exchange, ServerSecret}; #[test] fn snapshot() { @@ -112,8 +119,13 @@ mod tests { } async fn run_round_trip_test(server_password: &str, client_password: &str) { + let pool = ThreadPool::new(1); + + let ep = EndpointId::from("foo"); + let ep = EndpointIdInt::from(ep); + let scram_secret = ServerSecret::build(server_password).await.unwrap(); - let outcome = super::exchange(&scram_secret, client_password.as_bytes()) + let outcome = super::exchange(&pool, ep, &scram_secret, client_password.as_bytes()) .await .unwrap(); diff --git a/proxy/src/scram/countmin.rs b/proxy/src/scram/countmin.rs new file mode 100644 index 0000000000..f2b794e5fe --- /dev/null +++ b/proxy/src/scram/countmin.rs @@ -0,0 +1,173 @@ +use std::hash::Hash; + +/// estimator of hash jobs per second. +/// +pub struct CountMinSketch { + // one for each depth + hashers: Vec, + width: usize, + depth: usize, + // buckets, width*depth + buckets: Vec, +} + +impl CountMinSketch { + /// Given parameters (ε, δ), + /// set width = ceil(e/ε) + /// set depth = ceil(ln(1/δ)) + /// + /// guarantees: + /// actual <= estimate + /// estimate <= actual + ε * N with probability 1 - δ + /// where N is the cardinality of the stream + pub fn with_params(epsilon: f64, delta: f64) -> Self { + CountMinSketch::new( + (std::f64::consts::E / epsilon).ceil() as usize, + (1.0_f64 / delta).ln().ceil() as usize, + ) + } + + fn new(width: usize, depth: usize) -> Self { + Self { + #[cfg(test)] + hashers: (0..depth) + .map(|i| { + // digits of pi for good randomness + ahash::RandomState::with_seeds( + 314159265358979323, + 84626433832795028, + 84197169399375105, + 82097494459230781 + i as u64, + ) + }) + .collect(), + #[cfg(not(test))] + hashers: (0..depth).map(|_| ahash::RandomState::new()).collect(), + width, + depth, + buckets: vec![0; width * depth], + } + } + + pub fn inc_and_return(&mut self, t: &T, x: u32) -> u32 { + let mut min = u32::MAX; + for row in 0..self.depth { + let col = (self.hashers[row].hash_one(t) as usize) % self.width; + + let row = &mut self.buckets[row * self.width..][..self.width]; + row[col] = row[col].saturating_add(x); + min = std::cmp::min(min, row[col]); + } + min + } + + pub fn reset(&mut self) { + self.buckets.clear(); + self.buckets.resize(self.width * self.depth, 0); + } +} + +#[cfg(test)] +mod tests { + use rand::{rngs::StdRng, seq::SliceRandom, Rng, SeedableRng}; + + use super::CountMinSketch; + + fn eval_precision(n: usize, p: f64, q: f64) -> usize { + // fixed value of phi for consistent test + let mut rng = StdRng::seed_from_u64(16180339887498948482); + + #[allow(non_snake_case)] + let mut N = 0; + + let mut ids = vec![]; + + for _ in 0..n { + // number of insert operations + let n = rng.gen_range(1..100); + // number to insert at once + let m = rng.gen_range(1..4096); + + let id = uuid::Builder::from_random_bytes(rng.gen()).into_uuid(); + ids.push((id, n, m)); + + // N = sum(actual) + N += n * m; + } + + // q% of counts will be within p of the actual value + let mut sketch = CountMinSketch::with_params(p / N as f64, 1.0 - q); + + 
dbg!(sketch.buckets.len()); + + // insert a bunch of entries in a random order + let mut ids2 = ids.clone(); + while !ids2.is_empty() { + ids2.shuffle(&mut rng); + + let mut i = 0; + while i < ids2.len() { + sketch.inc_and_return(&ids2[i].0, ids2[i].1); + ids2[i].2 -= 1; + if ids2[i].2 == 0 { + ids2.remove(i); + } else { + i += 1; + } + } + } + + let mut within_p = 0; + for (id, n, m) in ids { + let actual = n * m; + let estimate = sketch.inc_and_return(&id, 0); + + // This estimate has the guarantee that actual <= estimate + assert!(actual <= estimate); + + // This estimate has the guarantee that estimate <= actual + εN with probability 1 - δ. + // ε = p / N, δ = 1 - q; + // therefore, estimate <= actual + p with probability q. + if estimate as f64 <= actual as f64 + p { + within_p += 1; + } + } + within_p + } + + #[test] + fn precision() { + assert_eq!(eval_precision(100, 100.0, 0.99), 100); + assert_eq!(eval_precision(1000, 100.0, 0.99), 1000); + assert_eq!(eval_precision(100, 4096.0, 0.99), 100); + assert_eq!(eval_precision(1000, 4096.0, 0.99), 1000); + + // seems to be more precise than the literature indicates? + // probably numbers are too small to truly represent the probabilities. + assert_eq!(eval_precision(100, 4096.0, 0.90), 100); + assert_eq!(eval_precision(1000, 4096.0, 0.90), 1000); + assert_eq!(eval_precision(100, 4096.0, 0.1), 98); + assert_eq!(eval_precision(1000, 4096.0, 0.1), 991); + } + + // returns memory usage in bytes, and the time complexity per insert. + fn eval_cost(p: f64, q: f64) -> (usize, usize) { + #[allow(non_snake_case)] + // N = sum(actual) + // Let's assume 1021 samples, all of 4096 + let N = 1021 * 4096; + let sketch = CountMinSketch::with_params(p / N as f64, 1.0 - q); + + let memory = std::mem::size_of::() * sketch.buckets.len(); + let time = sketch.depth; + (memory, time) + } + + #[test] + fn memory_usage() { + assert_eq!(eval_cost(100.0, 0.99), (2273580, 5)); + assert_eq!(eval_cost(4096.0, 0.99), (55520, 5)); + assert_eq!(eval_cost(4096.0, 0.90), (33312, 3)); + assert_eq!(eval_cost(4096.0, 0.1), (11104, 1)); + } +} diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index 89dd33e59f..d0adbc780e 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -4,15 +4,17 @@ use std::convert::Infallible; use hmac::{Hmac, Mac}; use sha2::Sha256; -use tokio::task::yield_now; use super::messages::{ ClientFinalMessage, ClientFirstMessage, OwnedServerFirstMessage, SCRAM_RAW_NONCE_LEN, }; +use super::pbkdf2::Pbkdf2; use super::secret::ServerSecret; use super::signature::SignatureBuilder; +use super::threadpool::ThreadPool; use super::ScramKey; use crate::config; +use crate::intern::EndpointIdInt; use crate::sasl::{self, ChannelBinding, Error as SaslError}; /// The only channel binding mode we currently support. 
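Editor's note: to make the (epsilon, delta) sizing of CountMinSketch::with_params above concrete: width = ceil(e/epsilon) and depth = ceil(ln(1/delta)) give the guarantee actual <= estimate <= actual + epsilon*N with probability 1 - delta, where N is the total count inserted. Below is a compact, self-contained variant using std's RandomState hashers; it is illustrative only — the proxy's version pins hash seeds under test and keeps depth around for its reset logic:

    use std::collections::hash_map::RandomState;
    use std::hash::{BuildHasher, Hash};

    struct CountMin {
        hashers: Vec<RandomState>, // one hasher per row
        width: usize,
        buckets: Vec<u32>,         // width * depth cells
    }

    impl CountMin {
        fn with_params(epsilon: f64, delta: f64) -> Self {
            let width = (std::f64::consts::E / epsilon).ceil() as usize;
            let depth = (1.0 / delta).ln().ceil() as usize; // delta = 0.01 -> 5 rows
            Self {
                hashers: (0..depth).map(|_| RandomState::new()).collect(),
                width,
                buckets: vec![0; width * depth],
            }
        }

        // Add `by` to one cell per row and return the minimum cell, which is
        // the sketch's (over-)estimate of the item's total count.
        fn inc_and_return<T: Hash>(&mut self, item: &T, by: u32) -> u32 {
            let mut min = u32::MAX;
            for (row, hasher) in self.hashers.iter().enumerate() {
                let col = (hasher.hash_one(item) as usize) % self.width;
                let cell = &mut self.buckets[row * self.width + col];
                *cell = cell.saturating_add(by);
                min = min.min(*cell);
            }
            min
        }
    }

    fn main() {
        // e.g. epsilon = 4096/N with N = 1021 * 4096, delta = 0.01
        let mut sketch = CountMin::with_params(4096.0 / (1021.0 * 4096.0), 0.01);
        let rate = sketch.inc_and_return(&"endpoint-a", 4096);
        println!("estimated cost so far: {rate}");
    }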
@@ -74,37 +76,18 @@ impl<'a> Exchange<'a> { } } -// copied from -async fn pbkdf2(str: &[u8], salt: &[u8], iterations: u32) -> [u8; 32] { - let hmac = Hmac::::new_from_slice(str).expect("HMAC is able to accept all key sizes"); - let mut prev = hmac - .clone() - .chain_update(salt) - .chain_update(1u32.to_be_bytes()) - .finalize() - .into_bytes(); - - let mut hi = prev; - - for i in 1..iterations { - prev = hmac.clone().chain_update(prev).finalize().into_bytes(); - - for (hi, prev) in hi.iter_mut().zip(prev) { - *hi ^= prev; - } - // yield every ~250us - // hopefully reduces tail latencies - if i % 1024 == 0 { - yield_now().await - } - } - - hi.into() -} - // copied from -async fn derive_client_key(password: &[u8], salt: &[u8], iterations: u32) -> ScramKey { - let salted_password = pbkdf2(password, salt, iterations).await; +async fn derive_client_key( + pool: &ThreadPool, + endpoint: EndpointIdInt, + password: &[u8], + salt: &[u8], + iterations: u32, +) -> ScramKey { + let salted_password = pool + .spawn_job(endpoint, Pbkdf2::start(password, salt, iterations)) + .await + .expect("job should not be cancelled"); let make_key = |name| { let key = Hmac::::new_from_slice(&salted_password) @@ -119,11 +102,13 @@ async fn derive_client_key(password: &[u8], salt: &[u8], iterations: u32) -> Scr } pub async fn exchange( + pool: &ThreadPool, + endpoint: EndpointIdInt, secret: &ServerSecret, password: &[u8], ) -> sasl::Result> { let salt = base64::decode(&secret.salt_base64)?; - let client_key = derive_client_key(password, &salt, secret.iterations).await; + let client_key = derive_client_key(pool, endpoint, password, &salt, secret.iterations).await; if secret.is_password_invalid(&client_key).into() { Ok(sasl::Outcome::Failure("password doesn't match")) diff --git a/proxy/src/scram/pbkdf2.rs b/proxy/src/scram/pbkdf2.rs new file mode 100644 index 0000000000..a803ba7e1b --- /dev/null +++ b/proxy/src/scram/pbkdf2.rs @@ -0,0 +1,89 @@ +use hmac::{ + digest::{consts::U32, generic_array::GenericArray}, + Hmac, Mac, +}; +use sha2::Sha256; + +pub struct Pbkdf2 { + hmac: Hmac, + prev: GenericArray, + hi: GenericArray, + iterations: u32, +} + +// inspired from +impl Pbkdf2 { + pub fn start(str: &[u8], salt: &[u8], iterations: u32) -> Self { + let hmac = + Hmac::::new_from_slice(str).expect("HMAC is able to accept all key sizes"); + + let prev = hmac + .clone() + .chain_update(salt) + .chain_update(1u32.to_be_bytes()) + .finalize() + .into_bytes(); + + Self { + hmac, + // one consumed for the hash above + iterations: iterations - 1, + hi: prev, + prev, + } + } + + pub fn cost(&self) -> u32 { + (self.iterations).clamp(0, 4096) + } + + pub fn turn(&mut self) -> std::task::Poll<[u8; 32]> { + let Self { + hmac, + prev, + hi, + iterations, + } = self; + + // only do 4096 iterations per turn before sharing the thread for fairness + let n = (*iterations).clamp(0, 4096); + for _ in 0..n { + *prev = hmac.clone().chain_update(*prev).finalize().into_bytes(); + + for (hi, prev) in hi.iter_mut().zip(*prev) { + *hi ^= prev; + } + } + + *iterations -= n; + if *iterations == 0 { + std::task::Poll::Ready((*hi).into()) + } else { + std::task::Poll::Pending + } + } +} + +#[cfg(test)] +mod tests { + use super::Pbkdf2; + use pbkdf2::pbkdf2_hmac_array; + use sha2::Sha256; + + #[test] + fn works() { + let salt = b"sodium chloride"; + let pass = b"Ne0n_!5_50_C007"; + + let mut job = Pbkdf2::start(pass, salt, 600000); + let hash = loop { + let std::task::Poll::Ready(hash) = job.turn() else { + continue; + }; + break hash; + }; + + let 
expected = pbkdf2_hmac_array::(pass, salt, 600000); + assert_eq!(hash, expected) + } +} diff --git a/proxy/src/scram/threadpool.rs b/proxy/src/scram/threadpool.rs new file mode 100644 index 0000000000..7701b869a3 --- /dev/null +++ b/proxy/src/scram/threadpool.rs @@ -0,0 +1,321 @@ +//! Custom threadpool implementation for password hashing. +//! +//! Requirements: +//! 1. Fairness per endpoint. +//! 2. Yield support for high iteration counts. + +use std::sync::{ + atomic::{AtomicU64, Ordering}, + Arc, +}; + +use crossbeam_deque::{Injector, Stealer, Worker}; +use itertools::Itertools; +use parking_lot::{Condvar, Mutex}; +use rand::Rng; +use rand::{rngs::SmallRng, SeedableRng}; +use tokio::sync::oneshot; + +use crate::{ + intern::EndpointIdInt, + metrics::{ThreadPoolMetrics, ThreadPoolWorkerId}, + scram::countmin::CountMinSketch, +}; + +use super::pbkdf2::Pbkdf2; + +pub struct ThreadPool { + queue: Injector, + stealers: Vec>, + parkers: Vec<(Condvar, Mutex)>, + /// bitpacked representation. + /// lower 8 bits = number of sleeping threads + /// next 8 bits = number of idle threads (searching for work) + counters: AtomicU64, + + pub metrics: Arc, +} + +#[derive(PartialEq)] +enum ThreadState { + Parked, + Active, +} + +impl ThreadPool { + pub fn new(n_workers: u8) -> Arc { + let workers = (0..n_workers).map(|_| Worker::new_fifo()).collect_vec(); + let stealers = workers.iter().map(|w| w.stealer()).collect_vec(); + + let parkers = (0..n_workers) + .map(|_| (Condvar::new(), Mutex::new(ThreadState::Active))) + .collect_vec(); + + let pool = Arc::new(Self { + queue: Injector::new(), + stealers, + parkers, + // threads start searching for work + counters: AtomicU64::new((n_workers as u64) << 8), + metrics: Arc::new(ThreadPoolMetrics::new(n_workers as usize)), + }); + + for (i, worker) in workers.into_iter().enumerate() { + let pool = Arc::clone(&pool); + std::thread::spawn(move || thread_rt(pool, worker, i)); + } + + pool + } + + pub fn spawn_job( + &self, + endpoint: EndpointIdInt, + pbkdf2: Pbkdf2, + ) -> oneshot::Receiver<[u8; 32]> { + let (tx, rx) = oneshot::channel(); + + let queue_was_empty = self.queue.is_empty(); + + self.metrics.injector_queue_depth.inc(); + self.queue.push(JobSpec { + response: tx, + pbkdf2, + endpoint, + }); + + // inspired from + let counts = self.counters.load(Ordering::SeqCst); + let num_awake_but_idle = (counts >> 8) & 0xff; + let num_sleepers = counts & 0xff; + + // If the queue is non-empty, then we always wake up a worker + // -- clearly the existing idle jobs aren't enough. Otherwise, + // check to see if we have enough idle workers. + if !queue_was_empty || num_awake_but_idle == 0 { + let num_to_wake = Ord::min(1, num_sleepers); + self.wake_any_threads(num_to_wake); + } + + rx + } + + #[cold] + fn wake_any_threads(&self, mut num_to_wake: u64) { + if num_to_wake > 0 { + for i in 0..self.parkers.len() { + if self.wake_specific_thread(i) { + num_to_wake -= 1; + if num_to_wake == 0 { + return; + } + } + } + } + } + + fn wake_specific_thread(&self, index: usize) -> bool { + let (condvar, lock) = &self.parkers[index]; + + let mut state = lock.lock(); + if *state == ThreadState::Parked { + condvar.notify_one(); + + // When the thread went to sleep, it will have incremented + // this value. When we wake it, its our job to decrement + // it. We could have the thread do it, but that would + // introduce a delay between when the thread was + // *notified* and when this counter was decremented. 
That + // might mislead people with new work into thinking that + // there are sleeping threads that they should try to + // wake, when in fact there is nothing left for them to + // do. + self.counters.fetch_sub(1, Ordering::SeqCst); + *state = ThreadState::Active; + + true + } else { + false + } + } + + fn steal(&self, rng: &mut impl Rng, skip: usize, worker: &Worker) -> Option { + // announce thread as idle + self.counters.fetch_add(256, Ordering::SeqCst); + + // try steal from the global queue + loop { + match self.queue.steal_batch_and_pop(worker) { + crossbeam_deque::Steal::Success(job) => { + self.metrics + .injector_queue_depth + .set(self.queue.len() as i64); + // no longer idle + self.counters.fetch_sub(256, Ordering::SeqCst); + return Some(job); + } + crossbeam_deque::Steal::Retry => continue, + crossbeam_deque::Steal::Empty => break, + } + } + + // try steal from our neighbours + loop { + let mut retry = false; + let start = rng.gen_range(0..self.stealers.len()); + let job = (start..self.stealers.len()) + .chain(0..start) + .filter(|i| *i != skip) + .find_map( + |victim| match self.stealers[victim].steal_batch_and_pop(worker) { + crossbeam_deque::Steal::Success(job) => Some(job), + crossbeam_deque::Steal::Empty => None, + crossbeam_deque::Steal::Retry => { + retry = true; + None + } + }, + ); + if job.is_some() { + // no longer idle + self.counters.fetch_sub(256, Ordering::SeqCst); + return job; + } + if !retry { + return None; + } + } + } +} + +fn thread_rt(pool: Arc, worker: Worker, index: usize) { + /// interval when we should steal from the global queue + /// so that tail latencies are managed appropriately + const STEAL_INTERVAL: usize = 61; + + /// How often to reset the sketch values + const SKETCH_RESET_INTERVAL: usize = 1021; + + let mut rng = SmallRng::from_entropy(); + + // used to determine whether we should temporarily skip tasks for fairness. + // 99% of estimates will overcount by no more than 4096 samples + let mut sketch = CountMinSketch::with_params(1.0 / (SKETCH_RESET_INTERVAL as f64), 0.01); + + let (condvar, lock) = &pool.parkers[index]; + + 'wait: loop { + // wait for notification of work + { + let mut lock = lock.lock(); + + // queue is empty + pool.metrics + .worker_queue_depth + .set(ThreadPoolWorkerId(index), 0); + + // subtract 1 from idle count, add 1 to sleeping count. + pool.counters.fetch_sub(255, Ordering::SeqCst); + + *lock = ThreadState::Parked; + condvar.wait(&mut lock); + } + + for i in 0.. { + let mut job = match worker + .pop() + .or_else(|| pool.steal(&mut rng, index, &worker)) + { + Some(job) => job, + None => continue 'wait, + }; + + pool.metrics + .worker_queue_depth + .set(ThreadPoolWorkerId(index), worker.len() as i64); + + // receiver is closed, cancel the task + if !job.response.is_closed() { + let rate = sketch.inc_and_return(&job.endpoint, job.pbkdf2.cost()); + + const P: f64 = 2000.0; + // probability decreases as rate increases. + // lower probability, higher chance of being skipped + // + // estimates (rate in terms of 4096 rounds): + // rate = 0 => probability = 100% + // rate = 10 => probability = 71.3% + // rate = 50 => probability = 62.1% + // rate = 500 => probability = 52.3% + // rate = 1021 => probability = 49.8% + // + // My expectation is that the pool queue will only begin backing up at ~1000rps + // in which case the SKETCH_RESET_INTERVAL represents 1 second. Thus, the rates above + // are in requests per second. 
+ let probability = P.ln() / (P + rate as f64).ln(); + if pool.queue.len() > 32 || rng.gen_bool(probability) { + pool.metrics + .worker_task_turns_total + .inc(ThreadPoolWorkerId(index)); + + match job.pbkdf2.turn() { + std::task::Poll::Ready(result) => { + let _ = job.response.send(result); + } + std::task::Poll::Pending => worker.push(job), + } + } else { + pool.metrics + .worker_task_skips_total + .inc(ThreadPoolWorkerId(index)); + + // skip for now + worker.push(job) + } + } + + // if we get stuck with a few long lived jobs in the queue + // it's better to try and steal from the queue too for fairness + if i % STEAL_INTERVAL == 0 { + let _ = pool.queue.steal_batch(&worker); + } + + if i % SKETCH_RESET_INTERVAL == 0 { + sketch.reset(); + } + } + } +} + +struct JobSpec { + response: oneshot::Sender<[u8; 32]>, + pbkdf2: Pbkdf2, + endpoint: EndpointIdInt, +} + +#[cfg(test)] +mod tests { + use crate::EndpointId; + + use super::*; + + #[tokio::test] + async fn hash_is_correct() { + let pool = ThreadPool::new(1); + + let ep = EndpointId::from("foo"); + let ep = EndpointIdInt::from(ep); + + let salt = [0x55; 32]; + let actual = pool + .spawn_job(ep, Pbkdf2::start(b"password", &salt, 4096)) + .await + .unwrap(); + + let expected = [ + 10, 114, 73, 188, 140, 222, 196, 156, 214, 184, 79, 157, 119, 242, 16, 31, 53, 242, + 178, 43, 95, 8, 225, 182, 122, 40, 219, 21, 89, 147, 64, 140, + ]; + assert_eq!(actual, expected) + } +} diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 6b79c12316..52fc7b556a 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -15,6 +15,7 @@ use crate::{ }, context::RequestMonitoring, error::{ErrorKind, ReportableError, UserFacingError}, + intern::EndpointIdInt, proxy::{connect_compute::ConnectMechanism, retry::ShouldRetry}, rate_limiter::EndpointRateLimiter, Host, @@ -66,8 +67,14 @@ impl PoolingBackend { return Err(AuthError::auth_failed(&*user_info.user)); } }; - let auth_outcome = - crate::auth::validate_password_and_exchange(&conn_info.password, secret).await?; + let ep = EndpointIdInt::from(&conn_info.user_info.endpoint); + let auth_outcome = crate::auth::validate_password_and_exchange( + &config.thread_pool, + ep, + &conn_info.password, + secret, + ) + .await?; let res = match auth_outcome { crate::sasl::Outcome::Success(key) => { info!("user successfully authenticated"); diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 7582562450..f364a6c2e0 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -13,6 +13,7 @@ publish = false ### BEGIN HAKARI SECTION [dependencies] +ahash = { version = "0.8" } anyhow = { version = "1", features = ["backtrace"] } aws-config = { version = "1", default-features = false, features = ["rustls", "sso"] } aws-runtime = { version = "1", default-features = false, features = ["event-stream", "http-02x", "sigv4a"] } @@ -85,6 +86,7 @@ zstd-safe = { version = "7", default-features = false, features = ["arrays", "le zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] } [build-dependencies] +ahash = { version = "0.8" } anyhow = { version = "1", features = ["backtrace"] } bytes = { version = "1", features = ["serde"] } cc = { version = "1", default-features = false, features = ["parallel"] } From ddd8ebd2536f90872da2678aecbd94a3e1b3f547 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 22 May 2024 13:06:00 -0400 Subject: [PATCH 0851/1571] chore(pageserver): use kebab case for aux file flag 
(#7840) part of https://github.com/neondatabase/neon/issues/7462 --------- Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/models.rs | 45 +++++++++++-------- test_runner/fixtures/neon_fixtures.py | 2 +- test_runner/fixtures/utils.py | 6 +-- .../regress/test_attach_tenant_config.py | 2 +- test_runner/regress/test_aux_files.py | 12 +++-- 5 files changed, 39 insertions(+), 28 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 55e9c48421..6b0403c8ab 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -9,7 +9,6 @@ use std::{ collections::HashMap, io::{BufRead, Read}, num::{NonZeroU64, NonZeroUsize}, - str::FromStr, sync::atomic::AtomicUsize, time::{Duration, SystemTime}, }; @@ -334,14 +333,28 @@ pub struct TenantConfig { /// Unset -> V1 /// -> V2 /// -> CrossValidation -> V2 -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[derive( + Eq, + PartialEq, + Debug, + Copy, + Clone, + strum_macros::EnumString, + strum_macros::Display, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, +)] +#[strum(serialize_all = "kebab-case")] pub enum AuxFilePolicy { /// V1 aux file policy: store everything in AUX_FILE_KEY + #[strum(ascii_case_insensitive)] V1, /// V2 aux file policy: store in the AUX_FILE keyspace + #[strum(ascii_case_insensitive)] V2, /// Cross validation runs both formats on the write path and does validation /// on the read path. + #[strum(ascii_case_insensitive)] CrossValidation, } @@ -407,23 +420,6 @@ impl AuxFilePolicy { } } -impl FromStr for AuxFilePolicy { - type Err = anyhow::Error; - - fn from_str(s: &str) -> Result { - let s = s.to_lowercase(); - if s == "v1" { - Ok(Self::V1) - } else if s == "v2" { - Ok(Self::V2) - } else if s == "crossvalidation" || s == "cross_validation" { - Ok(Self::CrossValidation) - } else { - anyhow::bail!("cannot parse {} to aux file policy", s) - } - } -} - #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] #[serde(tag = "kind")] pub enum EvictionPolicy { @@ -1405,6 +1401,7 @@ impl PagestreamBeMessage { #[cfg(test)] mod tests { use serde_json::json; + use std::str::FromStr; use super::*; @@ -1667,4 +1664,14 @@ mod tests { AuxFilePolicy::V2 )); } + + #[test] + fn test_aux_parse() { + assert_eq!(AuxFilePolicy::from_str("V2").unwrap(), AuxFilePolicy::V2); + assert_eq!(AuxFilePolicy::from_str("v2").unwrap(), AuxFilePolicy::V2); + assert_eq!( + AuxFilePolicy::from_str("cross-validation").unwrap(), + AuxFilePolicy::CrossValidation + ); + } } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b02054a702..796ae7217b 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1625,7 +1625,7 @@ class NeonCli(AbstractNeonCli): args.extend(["-c", "switch_aux_file_policy:v1"]) if aux_file_v2 is AuxFileStore.CrossValidation: - args.extend(["-c", "switch_aux_file_policy:cross_validation"]) + args.extend(["-c", "switch_aux_file_policy:cross-validation"]) if set_default: args.append("--set-default") diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 70263245e7..22bb43c580 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -495,9 +495,9 @@ def assert_no_errors(log_file, service, allowed_errors): @enum.unique class AuxFileStore(str, enum.Enum): - V1 = "V1" - V2 = "V2" - CrossValidation = "CrossValidation" + V1 = "v1" + V2 = "v2" + CrossValidation = "cross-validation" def __repr__(self) -> 
str: return f"'aux-{self.value}'" diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 2ec375271c..531679c7af 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -190,7 +190,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "trace_read_requests": True, "walreceiver_connect_timeout": "13m", "image_layer_creation_check_threshold": 1, - "switch_aux_file_policy": "CrossValidation", + "switch_aux_file_policy": "cross-validation", } ps_http = env.pageserver.http_client() diff --git a/test_runner/regress/test_aux_files.py b/test_runner/regress/test_aux_files.py index be9c41a867..5328aef156 100644 --- a/test_runner/regress/test_aux_files.py +++ b/test_runner/regress/test_aux_files.py @@ -1,5 +1,6 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( + AuxFileStore, NeonEnvBuilder, logical_replication_sync, ) @@ -14,7 +15,7 @@ def test_aux_v2_config_switch(neon_env_builder: NeonEnvBuilder, vanilla_pg): timeline_id = env.initial_timeline tenant_config = client.tenant_config(tenant_id).effective_config - tenant_config["switch_aux_file_policy"] = "V2" + tenant_config["switch_aux_file_policy"] = AuxFileStore.V2 client.set_tenant_config(tenant_id, tenant_config) # aux file v2 is enabled on the write path, so for now, it should be unset (or null) assert ( @@ -49,7 +50,10 @@ def test_aux_v2_config_switch(neon_env_builder: NeonEnvBuilder, vanilla_pg): with env.pageserver.http_client() as client: # aux file v2 flag should be enabled at this point - assert client.timeline_detail(tenant_id, timeline_id)["last_aux_file_policy"] == "V2" + assert ( + client.timeline_detail(tenant_id, timeline_id)["last_aux_file_policy"] + == AuxFileStore.V2 + ) with env.pageserver.http_client() as client: tenant_config = client.tenant_config(tenant_id).effective_config tenant_config["switch_aux_file_policy"] = "V1" @@ -59,7 +63,7 @@ def test_aux_v2_config_switch(neon_env_builder: NeonEnvBuilder, vanilla_pg): client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)[ "last_aux_file_policy" ] - == "V2" + == AuxFileStore.V2 ) env.pageserver.restart() with env.pageserver.http_client() as client: @@ -68,5 +72,5 @@ def test_aux_v2_config_switch(neon_env_builder: NeonEnvBuilder, vanilla_pg): client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)[ "last_aux_file_policy" ] - == "V2" + == AuxFileStore.V2 ) From 014f822a789efd2a64fbcf79de5acd2f07018952 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 22 May 2024 19:17:47 +0100 Subject: [PATCH 0852/1571] tests: refine test_secondary_background_downloads (#7829) ## Problem This test relied on some sleeps, and was failing ~5% of the time. ## Summary of changes Use log-watching rather than straight waits, and make timeouts more generous for the CI environment. 
--- .../regress/test_pageserver_secondary.py | 51 +++++++++++++++++-- 1 file changed, 46 insertions(+), 5 deletions(-) diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 127340a1e7..25a3f8521c 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -578,7 +578,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): default_download_period_secs = 60 # The upload period, which will also be the download once the secondary has seen its first heatmap - upload_period_secs = 20 + upload_period_secs = 30 for _i in range(0, tenant_count): tenant_id = TenantId.generate() @@ -596,11 +596,26 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): tenant_timelines[tenant_id] = [timeline_a, timeline_b] + def await_log(pageserver, deadline, expression): + """ + Wrapper around assert_log_contains that waits with a deadline rather than timeout + """ + now = time.time() + if now > deadline: + raise RuntimeError(f"Timed out waiting for {expression}") + else: + timeout = int(deadline - now) + 1 + try: + wait_until(timeout, 1, lambda: pageserver.assert_log_contains(expression)) # type: ignore + except: + log.error(f"Timed out waiting for '{expression}'") + raise + t_start = time.time() # Wait long enough that the background downloads should happen; we expect all the inital layers # of all the initial timelines to show up on the secondary location of each tenant. - time.sleep(default_download_period_secs * 1.5) + initial_download_deadline = time.time() + default_download_period_secs * 3 for tenant_id, timelines in tenant_timelines.items(): attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"] @@ -608,8 +623,24 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): # We only have two: the other one must be secondary ps_secondary = next(p for p in env.pageservers if p != ps_attached) + now = time.time() + if now > initial_download_deadline: + raise RuntimeError("Timed out waiting for initial secondary download") + else: + for timeline_id in timelines: + log.info( + f"Waiting for downloads of timeline {timeline_id} on secondary pageserver {ps_secondary.id}" + ) + await_log( + ps_secondary, + initial_download_deadline, + f".*{timeline_id}.*Wrote timeline_detail.*", + ) + for timeline_id in timelines: - log.info(f"Checking for secondary timeline {timeline_id} on node {ps_secondary.id}") + log.info( + f"Checking for secondary timeline downloads {timeline_id} on node {ps_secondary.id}" + ) # One or more layers should be present for all timelines assert ps_secondary.list_layers(tenant_id, timeline_id) @@ -617,7 +648,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): env.storage_controller.pageserver_api().timeline_delete(tenant_id, timelines[1]) # Wait long enough for the secondary locations to see the deletion: 2x period plus a grace factor - time.sleep(upload_period_secs * 2.5) + deletion_deadline = time.time() + upload_period_secs * 3 for tenant_id, timelines in tenant_timelines.items(): attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"] @@ -625,6 +656,16 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): # We only have two: the other one must be secondary ps_secondary = next(p for p in env.pageservers if p != ps_attached) + expect_del_timeline = timelines[1] + log.info( + f"Waiting for deletion of timeline 
{expect_del_timeline} on secondary pageserver {ps_secondary.id}" + ) + await_log( + ps_secondary, + deletion_deadline, + f".*Timeline no longer in heatmap.*{expect_del_timeline}.*", + ) + # This one was not deleted assert ps_secondary.list_layers(tenant_id, timelines[0]) @@ -632,7 +673,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): log.info( f"Checking for secondary timeline deletion {tenant_id}/{timeline_id} on node {ps_secondary.id}" ) - assert not ps_secondary.list_layers(tenant_id, timelines[1]) + assert not ps_secondary.list_layers(tenant_id, expect_del_timeline) t_end = time.time() From f98fdd20e390822a890f596a7932ca81b47e00ce Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 22 May 2024 19:38:22 +0100 Subject: [PATCH 0853/1571] tests: add a couple of allow lists for shutdown cases (#7844) ## Problem Failures on some of our uglier shutdown log messages: https://neon-github-public-dev.s3.amazonaws.com/reports/main/9192662995/index.html#suites/07874de07c4a1c9effe0d92da7755ebf/51b365408678c66f/ ## Summary of changes - Allow-list these errors. --- test_runner/fixtures/pageserver/allowed_errors.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index 91cd67d107..fa6e4eaafd 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -70,6 +70,7 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( # this is expected given our collaborative shutdown approach for the UploadQueue ".*Compaction failed.*, retrying in .*: Other\\(queue is in state Stopped.*", ".*Compaction failed.*, retrying in .*: ShuttingDown", + ".*Compaction failed.*, retrying in .*: timeline shutting down.*", # Pageserver timeline deletion should be polled until it gets 404, so ignore it globally ".*Error processing HTTP request: NotFound: Timeline .* was not found", ".*took more than expected to complete.*", @@ -91,6 +92,10 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( ".*WARN deletion backend: calling control plane generation validation API failed.*error sending request.*", # Can happen when the test shuts down the storage controller while it is calling the utilization API ".*WARN.*path=/v1/utilization .*request was dropped before completing", + # Can happen during shutdown + ".*scheduling deletion on drop failed: queue is in state Stopped.*", + # Can happen during shutdown + ".*ignoring failure to find gc cutoffs: timeline shutting down.*", ) From 4a278cce7ce5b7f32360e85fd41219df95cc9a86 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 22 May 2024 15:05:26 -0400 Subject: [PATCH 0854/1571] chore(pageserver): add force aux file policy switch handler (#7842) For existing users, we want to allow doing a force switch for their aux file policy. 
Part of #7462 --------- Signed-off-by: Alex Chi Z --- pageserver/src/http/routes.rs | 78 ++++++++++++++++++----------- pageserver/src/pgdatadir_mapping.rs | 14 +----- pageserver/src/tenant.rs | 61 ++++++++++++++++++++++ pageserver/src/tenant/timeline.rs | 8 +++ 4 files changed, 119 insertions(+), 42 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 34b9806a26..7b55e88096 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -16,6 +16,7 @@ use hyper::header; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use metrics::launch_timestamp::LaunchTimestamp; +use pageserver_api::models::AuxFilePolicy; use pageserver_api::models::IngestAuxFilesRequest; use pageserver_api::models::ListAuxFilesRequest; use pageserver_api::models::LocationConfig; @@ -2307,6 +2308,31 @@ async fn post_tracing_event_handler( json_response(StatusCode::OK, ()) } +async fn force_aux_policy_switch_handler( + mut r: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + check_permission(&r, None)?; + let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&r, "timeline_id")?; + let policy: AuxFilePolicy = json_request(&mut r).await?; + + let state = get_state(&r); + + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + timeline + .do_switch_aux_policy(policy) + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, ()) +} + async fn put_io_engine_handler( mut r: Request, _cancel: CancellationToken, @@ -2384,19 +2410,9 @@ async fn list_aux_files( active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; - let process = || async move { - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let files = timeline.list_aux_files(body.lsn, &ctx).await?; - Ok::<_, anyhow::Error>(files) - }; - - match process().await { - Ok(st) => json_response(StatusCode::OK, st), - Err(err) => json_response( - StatusCode::INTERNAL_SERVER_ERROR, - ApiError::InternalServerError(err).to_string(), - ), - } + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + let files = timeline.list_aux_files(body.lsn, &ctx).await?; + json_response(StatusCode::OK, files) } async fn ingest_aux_files( @@ -2414,24 +2430,22 @@ async fn ingest_aux_files( active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; - let process = || async move { - let mut modification = timeline.begin_modification(Lsn( - timeline.get_last_record_lsn().0 + 8 - ) /* advance LSN by 8 */); - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - for (fname, content) in body.aux_files { - modification - .put_file(&fname, content.as_bytes(), &ctx) - .await?; - } - modification.commit(&ctx).await?; - Ok::<_, anyhow::Error>(()) - }; - - match process().await { - Ok(st) => json_response(StatusCode::OK, st), - Err(err) => Err(ApiError::InternalServerError(err)), + let mut modification = timeline.begin_modification( + Lsn(timeline.get_last_record_lsn().0 + 8), /* advance LSN by 8 */ + ); + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + for (fname, content) in body.aux_files { + modification + 
.put_file(&fname, content.as_bytes(), &ctx) + .await + .map_err(ApiError::InternalServerError)?; } + modification + .commit(&ctx) + .await + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, ()) } /// Report on the largest tenants on this pageserver, for the storage controller to identify @@ -2814,6 +2828,10 @@ pub fn make_router( |r| api_handler(r, timeline_collect_keyspace), ) .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler)) + .put( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch", + |r| api_handler(r, force_aux_policy_switch_handler), + ) .get("/v1/utilization", |r| api_handler(r, get_utilization)) .post( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/ingest_aux_files", diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 7dea687c46..afba34c6d1 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1489,14 +1489,7 @@ impl<'a> DatadirModification<'a> { if aux_files_key_v1.is_empty() { None } else { - self.tline - .last_aux_file_policy - .store(Some(AuxFilePolicy::V1)); - self.tline - .remote_client - .schedule_index_upload_for_aux_file_policy_update(Some( - AuxFilePolicy::V1, - ))?; + self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?; Some(AuxFilePolicy::V1) } } else { @@ -1504,10 +1497,7 @@ impl<'a> DatadirModification<'a> { }; if AuxFilePolicy::is_valid_migration_path(current_policy, switch_policy) { - self.tline.last_aux_file_policy.store(Some(switch_policy)); - self.tline - .remote_client - .schedule_index_upload_for_aux_file_policy_update(Some(switch_policy))?; + self.tline.do_switch_aux_policy(switch_policy)?; info!(current=?current_policy, next=?switch_policy, "switching aux file policy"); switch_policy } else { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index caf26e0a0b..2bb199b228 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -5999,6 +5999,67 @@ mod tests { ); } + #[tokio::test] + async fn aux_file_policy_force_switch() { + let mut harness = TenantHarness::create("aux_file_policy_force_switch").unwrap(); + harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V1; + let (tenant, ctx) = harness.load().await; + + let mut lsn = Lsn(0x08); + + let tline: Arc = tenant + .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + assert_eq!( + tline.last_aux_file_policy.load(), + None, + "no aux file is written so it should be unset" + ); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test1", b"first", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + tline.do_switch_aux_policy(AuxFilePolicy::V2).unwrap(); + + assert_eq!( + tline.last_aux_file_policy.load(), + Some(AuxFilePolicy::V2), + "dirty index_part.json reflected state is yet to be updated" + ); + + // lose all data from v1 + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!(files.get("pg_logical/mappings/test1"), None); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test2", b"second", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + // read data ingested in v2 + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!( + files.get("pg_logical/mappings/test2"), + Some(&bytes::Bytes::from_static(b"second")) + ); + // lose all data from v1 + 
assert_eq!(files.get("pg_logical/mappings/test1"), None); + } + #[tokio::test] async fn aux_file_policy_auto_detect() { let mut harness = TenantHarness::create("aux_file_policy_auto_detect").unwrap(); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 1f8ee9ffc4..63d03b3c68 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4606,6 +4606,14 @@ impl Timeline { ) -> Result, anyhow::Error> { detach_ancestor::complete(self, tenant, prepared, ctx).await } + + /// Switch aux file policy and schedule upload to the index part. + pub(crate) fn do_switch_aux_policy(&self, policy: AuxFilePolicy) -> anyhow::Result<()> { + self.last_aux_file_policy.store(Some(policy)); + self.remote_client + .schedule_index_upload_for_aux_file_policy_update(Some(policy))?; + Ok(()) + } } /// Top-level failure to compact. From ff560a1113046ff29acf2723c0183db7c2d128f1 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 22 May 2024 17:28:47 -0400 Subject: [PATCH 0855/1571] chore(pageserver): use kebab case for compaction algorithms (#7845) Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/models.rs | 21 ++++++++++++++++--- pageserver/src/tenant.rs | 10 ++++++--- pageserver/src/tenant/config.rs | 13 ++++++++---- pageserver/src/tenant/timeline.rs | 14 +++++++------ .../regress/test_attach_tenant_config.py | 2 +- test_runner/regress/test_compaction.py | 4 ++-- 6 files changed, 45 insertions(+), 19 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 6b0403c8ab..9311dab33c 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -305,7 +305,7 @@ pub struct TenantConfig { pub compaction_period: Option, pub compaction_threshold: Option, // defer parsing compaction_algorithm, like eviction_policy - pub compaction_algorithm: Option, + pub compaction_algorithm: Option, pub gc_horizon: Option, pub gc_period: Option, pub image_creation_threshold: Option, @@ -438,13 +438,28 @@ impl EvictionPolicy { } } -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] -#[serde(tag = "kind")] +#[derive( + Eq, + PartialEq, + Debug, + Copy, + Clone, + strum_macros::EnumString, + strum_macros::Display, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, +)] +#[strum(serialize_all = "kebab-case")] pub enum CompactionAlgorithm { Legacy, Tiered, } +#[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)] +pub struct CompactionAlgorithmSettings { + pub kind: CompactionAlgorithm, +} + #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] pub struct EvictionPolicyLayerAccessThreshold { #[serde(with = "humantime_serde")] diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 2bb199b228..540eb10ed2 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3973,7 +3973,7 @@ mod tests { use hex_literal::hex; use pageserver_api::key::{AUX_FILES_KEY, AUX_KEY_PREFIX, NON_INHERITED_RANGE}; use pageserver_api::keyspace::KeySpace; - use pageserver_api::models::CompactionAlgorithm; + use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings}; use rand::{thread_rng, Rng}; use tests::storage_layer::ValuesReconstructState; use tests::timeline::{GetVectoredError, ShutdownMode}; @@ -5169,7 +5169,9 @@ mod tests { compaction_algorithm: CompactionAlgorithm, ) -> anyhow::Result<()> { let mut harness = TenantHarness::create(name)?; - harness.tenant_conf.compaction_algorithm = compaction_algorithm; + 
harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings { + kind: compaction_algorithm, + }; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) @@ -5526,7 +5528,9 @@ mod tests { compaction_algorithm: CompactionAlgorithm, ) -> anyhow::Result<()> { let mut harness = TenantHarness::create(name)?; - harness.tenant_conf.compaction_algorithm = compaction_algorithm; + harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings { + kind: compaction_algorithm, + }; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index a695363cdc..342d705954 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -11,6 +11,7 @@ use anyhow::bail; use pageserver_api::models::AuxFilePolicy; use pageserver_api::models::CompactionAlgorithm; +use pageserver_api::models::CompactionAlgorithmSettings; use pageserver_api::models::EvictionPolicy; use pageserver_api::models::{self, ThrottleConfig}; use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}; @@ -320,7 +321,7 @@ pub struct TenantConf { pub compaction_period: Duration, // Level0 delta layer threshold for compaction. pub compaction_threshold: usize, - pub compaction_algorithm: CompactionAlgorithm, + pub compaction_algorithm: CompactionAlgorithmSettings, // Determines how much history is retained, to allow // branching and read replicas at an older point in time. // The unit is #of bytes of WAL. @@ -406,7 +407,7 @@ pub struct TenantConfOpt { #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] - pub compaction_algorithm: Option, + pub compaction_algorithm: Option, #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] @@ -497,7 +498,9 @@ impl TenantConfOpt { .unwrap_or(global_conf.compaction_threshold), compaction_algorithm: self .compaction_algorithm - .unwrap_or(global_conf.compaction_algorithm), + .as_ref() + .unwrap_or(&global_conf.compaction_algorithm) + .clone(), gc_horizon: self.gc_horizon.unwrap_or(global_conf.gc_horizon), gc_period: self.gc_period.unwrap_or(global_conf.gc_period), image_creation_threshold: self @@ -550,7 +553,9 @@ impl Default for TenantConf { compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD) .expect("cannot parse default compaction period"), compaction_threshold: DEFAULT_COMPACTION_THRESHOLD, - compaction_algorithm: DEFAULT_COMPACTION_ALGORITHM, + compaction_algorithm: CompactionAlgorithmSettings { + kind: DEFAULT_COMPACTION_ALGORITHM, + }, gc_horizon: DEFAULT_GC_HORIZON, gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD) .expect("cannot parse default gc period"), diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 63d03b3c68..881e7f8f3c 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -23,9 +23,9 @@ use pageserver_api::{ }, keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning}, models::{ - AtomicAuxFilePolicy, AuxFilePolicy, CompactionAlgorithm, DownloadRemoteLayersTaskInfo, - DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, - LsnLease, TimelineState, + AtomicAuxFilePolicy, AuxFilePolicy, CompactionAlgorithm, CompactionAlgorithmSettings, + DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, + 
InMemoryLayerInfo, LayerMapInfo, LsnLease, TimelineState, }, reltag::BlockNumber, shard::{ShardIdentity, ShardNumber, TenantShardId}, @@ -1700,7 +1700,7 @@ impl Timeline { return Ok(()); } - match self.get_compaction_algorithm() { + match self.get_compaction_algorithm_settings().kind { CompactionAlgorithm::Tiered => self.compact_tiered(cancel, ctx).await, CompactionAlgorithm::Legacy => self.compact_legacy(cancel, flags, ctx).await, } @@ -2096,12 +2096,14 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold) } - fn get_compaction_algorithm(&self) -> CompactionAlgorithm { + fn get_compaction_algorithm_settings(&self) -> CompactionAlgorithmSettings { let tenant_conf = &self.tenant_conf.load(); tenant_conf .tenant_conf .compaction_algorithm - .unwrap_or(self.conf.default_tenant_conf.compaction_algorithm) + .as_ref() + .unwrap_or(&self.conf.default_tenant_conf.compaction_algorithm) + .clone() } fn get_eviction_policy(&self) -> EvictionPolicy { diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 531679c7af..8c60b454d8 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -162,7 +162,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "checkpoint_distance": 10000, "checkpoint_timeout": "13m", "compaction_algorithm": { - "kind": "Tiered", + "kind": "tiered", }, "eviction_policy": { "kind": "LayerAccessThreshold", diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 6a3515e1bd..4850a5c688 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -194,8 +194,8 @@ def test_sharding_compaction( class CompactionAlgorithm(str, enum.Enum): - LEGACY = "Legacy" - TIERED = "Tiered" + LEGACY = "legacy" + TIERED = "tiered" @pytest.mark.parametrize( From eb0c026aac95bf8aad1c115aabb5697e86594eac Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Thu, 23 May 2024 00:48:59 +0300 Subject: [PATCH 0856/1571] Bump vm-builder v0.28.1 -> v0.29.3 (#7849) One change: runner: allow coredump collection (#931) --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 2ab1417d6d..f8c011a0a5 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -934,7 +934,7 @@ jobs: matrix: version: [ v14, v15, v16 ] env: - VM_BUILDER_VERSION: v0.28.1 + VM_BUILDER_VERSION: v0.29.3 steps: - name: Checkout From a43a1ad1df11fc547fa16874f9776fc2e1bf4c9b Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 23 May 2024 09:13:55 +0100 Subject: [PATCH 0857/1571] pageserver: fix API-driven secondary downloads possibly colliding with background downloads (#7848) ## Problem We've seen some strange behaviors when doing lots of migrations involving secondary locations. One of these was where a tenant was apparently stuck in the `Scheduler::running` list, but didn't appear to be making any progress. Another was a shutdown hang (https://github.com/neondatabase/cloud/issues/13576). 
## Summary of changes - Fix one issue (probably not the only one) where a tenant in the `pending` list could proceed to `spawn` even if the same tenant already had a running task via `handle_command` (this could have resulted in a weird value of SecondaryProgress) - Add various extra logging: - log before as well as after layer downloads so that it would be obvious if we were stuck in remote storage code (we shouldn't be, it has built in timeouts) - log the number of running + pending jobs from the scheduler every time it wakes up to do a scheduling iteration (~10s) -- this is quite chatty, but not compared with the volume of logs on a busy pageserver. It should give us confidence that the scheduler loop is still alive, and visibility of how many tasks the scheduler thinks are running. --- pageserver/src/tenant/secondary/downloader.rs | 7 ++++++- .../src/tenant/secondary/heatmap_uploader.rs | 2 +- pageserver/src/tenant/secondary/scheduler.rs | 20 ++++++++++++++++--- 3 files changed, 24 insertions(+), 5 deletions(-) diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 870475eb57..de30c4dcb6 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -93,7 +93,7 @@ pub(super) async fn downloader_task( scheduler .run(command_queue, background_jobs_can_start, cancel) - .instrument(info_span!("secondary_downloads")) + .instrument(info_span!("secondary_download_scheduler")) .await } @@ -1013,6 +1013,11 @@ impl<'a> TenantDownloader<'a> { ); // Note: no backoff::retry wrapper here because download_layer_file does its own retries internally + tracing::info!( + "Starting download of layer {}, size {}", + layer.name, + layer.metadata.file_size + ); let downloaded_bytes = match download_layer_file( self.conf, self.remote_storage, diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs index fddced3ead..9c7a9c4234 100644 --- a/pageserver/src/tenant/secondary/heatmap_uploader.rs +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -53,7 +53,7 @@ pub(super) async fn heatmap_uploader_task( scheduler .run(command_queue, background_jobs_can_start, cancel) - .instrument(info_span!("heatmap_uploader")) + .instrument(info_span!("heatmap_upload_scheduler")) .await } diff --git a/pageserver/src/tenant/secondary/scheduler.rs b/pageserver/src/tenant/secondary/scheduler.rs index 3d042f4513..0ec1c7872a 100644 --- a/pageserver/src/tenant/secondary/scheduler.rs +++ b/pageserver/src/tenant/secondary/scheduler.rs @@ -179,6 +179,13 @@ where // Schedule some work, if concurrency limit permits it self.spawn_pending(); + // This message is printed every scheduling iteration as proof of liveness when looking at logs + tracing::info!( + "Status: {} tasks running, {} pending", + self.running.len(), + self.pending.len() + ); + // Between scheduling iterations, we will: // - Drain any complete tasks and spawn pending tasks // - Handle incoming administrative commands @@ -258,7 +265,11 @@ where self.tasks.spawn(fut); - self.running.insert(tenant_shard_id, in_progress); + let replaced = self.running.insert(tenant_shard_id, in_progress); + debug_assert!(replaced.is_none()); + if replaced.is_some() { + tracing::warn!(%tenant_shard_id, "Unexpectedly spawned a task when one was already running") + } } /// For all pending tenants that are elegible for execution, spawn their task. 
@@ -268,7 +279,9 @@ where while !self.pending.is_empty() && self.running.len() < self.concurrency { // unwrap: loop condition includes !is_empty() let pending = self.pending.pop_front().unwrap(); - self.do_spawn(pending); + if !self.running.contains_key(pending.get_tenant_shard_id()) { + self.do_spawn(pending); + } } } @@ -321,7 +334,8 @@ where let tenant_shard_id = job.get_tenant_shard_id(); let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) { - tracing::info!("Command already running, waiting for it"); + tracing::info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), + "Command already running, waiting for it"); barrier } else { let running = self.spawn_now(job); From 58e31fe0980060f7345f30c617fa30aea0c2d11e Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 23 May 2024 11:25:38 +0300 Subject: [PATCH 0858/1571] test_attach_tenant_config: add allowed error (#7839) [evidence] of quite rare flaky. the detach can cause this with the right timing. [evidence]: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-7650/9191613501/index.html#suites/7745dadbd815ab87f5798aa881796f47/2190222925001078 --- test_runner/regress/test_attach_tenant_config.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 8c60b454d8..1d193b8999 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -17,9 +17,13 @@ def positive_env(neon_env_builder: NeonEnvBuilder) -> NeonEnv: neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() - # eviction might be the first one after an attach to access the layers - env.pageserver.allowed_errors.append( - ".*unexpectedly on-demand downloading remote layer .* for task kind Eviction" + env.pageserver.allowed_errors.extend( + [ + # eviction might be the first one after an attach to access the layers + ".*unexpectedly on-demand downloading remote layer .* for task kind Eviction", + # detach can happen before we get to validate the generation number + ".*deletion backend: Dropped remote consistent LSN updates for tenant.*", + ] ) assert isinstance(env.pageserver_remote_storage, LocalFsStorage) return env From 8f3c316bae85b14b351a4cbd0bbe3c13c5ca9770 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 23 May 2024 09:45:24 +0100 Subject: [PATCH 0859/1571] Skip unnecessary shared state updates in safekeepers (#7851) I looked at the metrics from https://github.com/neondatabase/neon/pull/7768 on staging and it seems that manager does too many iterations. This is probably caused by background job `remove_wal.rs` which iterates over all timelines and tries to remove WAL and persist control file. This causes shared state updates and wakes up the manager. The fix is to skip notifying about the updates if nothing was updated. --- safekeeper/src/safekeeper.rs | 6 +++--- safekeeper/src/timeline.rs | 32 +++++++++++++++++++++----------- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index e671d4f36a..4b1481a397 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -827,10 +827,10 @@ where /// Persist control file if there is something to save and enough time /// passed after the last save. 
- pub async fn maybe_persist_inmem_control_file(&mut self) -> Result<()> { + pub async fn maybe_persist_inmem_control_file(&mut self) -> Result { const CF_SAVE_INTERVAL: Duration = Duration::from_secs(300); if self.state.pers.last_persist_at().elapsed() < CF_SAVE_INTERVAL { - return Ok(()); + return Ok(false); } let need_persist = self.state.inmem.commit_lsn > self.state.commit_lsn || self.state.inmem.backup_lsn > self.state.backup_lsn @@ -840,7 +840,7 @@ where self.state.flush().await?; trace!("saved control file: {CF_SAVE_INTERVAL:?} passed"); } - Ok(()) + Ok(need_persist) } /// Handle request to append WAL. diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 89c157d514..0cc6153373 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -104,11 +104,16 @@ pub type ReadGuardSharedState<'a> = RwLockReadGuard<'a, SharedState>; pub struct WriteGuardSharedState<'a> { tli: Arc, guard: RwLockWriteGuard<'a, SharedState>, + skip_update: bool, } impl<'a> WriteGuardSharedState<'a> { fn new(tli: Arc, guard: RwLockWriteGuard<'a, SharedState>) -> Self { - WriteGuardSharedState { tli, guard } + WriteGuardSharedState { + tli, + guard, + skip_update: false, + } } } @@ -149,10 +154,12 @@ impl<'a> Drop for WriteGuardSharedState<'a> { } }); - // send notification about shared state update - self.tli.shared_state_version_tx.send_modify(|old| { - *old += 1; - }); + if !self.skip_update { + // send notification about shared state update + self.tli.shared_state_version_tx.send_modify(|old| { + *old += 1; + }); + } } } @@ -802,7 +809,11 @@ impl Timeline { // update last_removed_segno let mut shared_state = self.write_shared_state().await; - shared_state.last_removed_segno = horizon_segno; + if shared_state.last_removed_segno != horizon_segno { + shared_state.last_removed_segno = horizon_segno; + } else { + shared_state.skip_update = true; + } Ok(()) } @@ -811,11 +822,10 @@ impl Timeline { /// to date so that storage nodes restart doesn't cause many pageserver -> /// safekeeper reconnections. pub async fn maybe_persist_control_file(self: &Arc) -> Result<()> { - self.write_shared_state() - .await - .sk - .maybe_persist_inmem_control_file() - .await + let mut guard = self.write_shared_state().await; + let changed = guard.sk.maybe_persist_inmem_control_file().await?; + guard.skip_update = !changed; + Ok(()) } /// Gather timeline data for metrics. From cd6d811213158195dca2327571f7a3cfcc571061 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Thu, 23 May 2024 11:41:29 +0200 Subject: [PATCH 0860/1571] [proxy] Do not fail after parquet upload error (#7858) ## Problem If the parquet upload was unsuccessful, it will panic. ## Summary of changes Write error in logs instead. 
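A minimal sketch of the pattern (not the exact code in this patch, which keeps the
`backoff::retry` wrapper around the remote-storage upload): capture the error and
record it with `tracing::warn!` instead of propagating it with `?`.

```rust
use anyhow::Result;

// Hypothetical helper, for illustration only: `do_upload` stands in for the
// real remote-storage upload. A failure is logged, not returned, so the
// parquet writer task keeps running.
async fn upload_best_effort(id: u64, do_upload: impl std::future::Future<Output = Result<()>>) {
    if let Err(err) = do_upload.await {
        tracing::warn!(%id, %err, "failed to upload request data");
    }
}
```
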
--- proxy/src/context/parquet.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 392821c430..a213a32ca4 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -355,7 +355,7 @@ async fn upload_parquet( "{year:04}/{month:02}/{day:02}/{hour:02}/requests_{id}.parquet" ))?; let cancel = CancellationToken::new(); - backoff::retry( + let maybe_err = backoff::retry( || async { let stream = futures::stream::once(futures::future::ready(Ok(data.clone()))); storage @@ -372,7 +372,12 @@ async fn upload_parquet( .await .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) .and_then(|x| x) - .context("request_data_upload")?; + .context("request_data_upload") + .err(); + + if let Some(err) = maybe_err { + tracing::warn!(%id, %err, "failed to upload request data"); + } Ok(buffer.writer()) } From 545f7e8cd7fcca09134f5c1eb47c8ff323dfad22 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 23 May 2024 10:50:21 +0100 Subject: [PATCH 0861/1571] tests: fix an allow list entry (#7856) https://github.com/neondatabase/neon/pull/7844 typo'd one of the expressions: https://neon-github-public-dev.s3.amazonaws.com/reports/main/9196993886/index.html#suites/07874de07c4a1c9effe0d92da7755ebf/e420fbfdb193bf80/ --- test_runner/fixtures/pageserver/allowed_errors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index fa6e4eaafd..ad8bbe2021 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -70,7 +70,7 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( # this is expected given our collaborative shutdown approach for the UploadQueue ".*Compaction failed.*, retrying in .*: Other\\(queue is in state Stopped.*", ".*Compaction failed.*, retrying in .*: ShuttingDown", - ".*Compaction failed.*, retrying in .*: timeline shutting down.*", + ".*Compaction failed.*, retrying in .*: Other\\(timeline shutting down.*", # Pageserver timeline deletion should be polled until it gets 404, so ignore it globally ".*Error processing HTTP request: NotFound: Timeline .* was not found", ".*took more than expected to complete.*", From 95a49f00752c8e22f5f912b26f76409e2b515804 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Thu, 23 May 2024 12:08:06 +0200 Subject: [PATCH 0862/1571] remove march=native from pgvector Makefile's OPTFLAGS (#7854) ## Problem By default, pgvector compiles with `-march=native` on some platforms for best performance. However, this can lead to `Illegal instruction` errors if trying to run the compiled extension on a different machine. I had this problem when trying to run the Neon compute docker image on MacOS with Apple Silicon with Rosetta. see https://github.com/pgvector/pgvector/blob/ff9b22977e3ef19866d23a54332c8717f258e8db/README.md?plain=1#L1021 ## Summary of changes Pass OPTFLAGS="" to make. --- Dockerfile.compute-node | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 5bf3246f34..87fb218245 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -243,12 +243,15 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY patches/pgvector.patch /pgvector.patch +# By default, pgvector Makefile uses `-march=native`. We don't want that, +# because we build the images on different machines than where we run them. 
+# Pass OPTFLAGS="" to remove it. RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.0.tar.gz -O pgvector.tar.gz && \ echo "1b5503a35c265408b6eb282621c5e1e75f7801afc04eecb950796cfee2e3d1d8 pgvector.tar.gz" | sha256sum --check && \ mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \ patch -p1 < /pgvector.patch && \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control ######################################################################################### From 49d7f9b5a4cab00797bd8c85bf116478d9c1dee0 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 23 May 2024 14:44:08 +0300 Subject: [PATCH 0863/1571] test_import_from_pageserver_small: try to make less flaky (#7843) With #7828 and proper fullbackup testing the test became flaky ([evidence]). - produce better assertion messages in `assert_pageserver_backups_equal` - use read only endpoint to confirm the row count [evidence]: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-7839/9192447962/index.html#suites/89cfa994d71769e01e3fc4f475a1f3fa/49009214d0f8b8ce --- test_runner/fixtures/utils.py | 25 ++++++++++++++++++------- test_runner/regress/test_import.py | 9 +++------ 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 22bb43c580..89e116df28 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -541,11 +541,22 @@ def assert_pageserver_backups_equal(left: Path, right: Path, skip_files: Set[str left_list, right_list = map(build_hash_list, [left, right]) - try: - assert len(left_list) == len(right_list) + assert len(left_list) == len( + right_list + ), f"unexpected number of files on tar files, {len(left_list)} != {len(right_list)}" - for left_tuple, right_tuple in zip(left_list, right_list): - assert left_tuple == right_tuple - finally: - elapsed = time.time() - started_at - log.info(f"assert_pageserver_backups_equal completed in {elapsed}s") + mismatching = set() + + for left_tuple, right_tuple in zip(left_list, right_list): + left_path, left_hash = left_tuple + right_path, right_hash = right_tuple + assert ( + left_path == right_path + ), f"file count matched, expected these to be same paths: {left_path}, {right_path}" + if left_hash != right_hash: + mismatching.add(left_path) + + assert len(mismatching) == 0, f"files with hash mismatch: {mismatching}" + + elapsed = time.time() - started_at + log.info(f"assert_pageserver_backups_equal completed in {elapsed}s") diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index 62229ebfe7..ac27a4cf36 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -163,7 +163,7 @@ def test_import_from_pageserver_small( num_rows = 3000 lsn = _generate_data(num_rows, endpoint) - _import(num_rows, lsn, env, pg_bin, timeline, env.pg_distrib_dir, test_output_dir) + _import(num_rows, lsn, env, pg_bin, timeline, test_output_dir) @pytest.mark.timeout(1800) @@ -193,9 +193,7 @@ def test_import_from_pageserver_multisegment( log.info(f"timeline logical size = {logical_size / 
(1024 ** 2)}MB") assert logical_size > 1024**3 # = 1GB - tar_output_file = _import( - num_rows, lsn, env, pg_bin, timeline, env.pg_distrib_dir, test_output_dir - ) + tar_output_file = _import(num_rows, lsn, env, pg_bin, timeline, test_output_dir) # Check if the backup data contains multiple segment files cnt_seg_files = 0 @@ -235,7 +233,6 @@ def _import( env: NeonEnv, pg_bin: PgBin, timeline: TimelineId, - pg_distrib_dir: Path, test_output_dir: Path, ) -> Path: """Test importing backup data to the pageserver. @@ -295,7 +292,7 @@ def _import( wait_for_upload(client, tenant, timeline, lsn) # Check it worked - endpoint = env.endpoints.create_start(endpoint_id, tenant_id=tenant) + endpoint = env.endpoints.create_start(endpoint_id, tenant_id=tenant, lsn=lsn) assert endpoint.safe_psql("select count(*) from tbl") == [(expected_num_rows,)] # Take another fullbackup From d5d15eb6eb184b589483882148ab177a73f8db64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 23 May 2024 14:28:05 +0200 Subject: [PATCH 0864/1571] Warn if a blob in an image is larger than 256 MiB (#7852) We'd like to get some bits reserved in the length field of image layers for future usage (compression). This PR bases on the assumption that we don't have any blobs that require more than 28 bits (3 bytes + 4 bits) to store the length, but as a preparation, before erroring, we want to first emit warnings as if the assumption is wrong, such warnings are less disruptive than errors. A metric would be even less disruptive (log messages are more slow, if we have a LOT of such large blobs then it would take a lot of time to print them). At the same time, likely such 256 MiB blobs will occupy an entire layer file, as they are larger than our target size. For layer files we already log something, so there shouldn't be a large increase in overhead. Part of #5431 --- pageserver/src/tenant/blob_io.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 24b4e4f3ea..2be8816cef 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -238,10 +238,13 @@ impl BlobWriter { io_buf, Err(Error::new( ErrorKind::Other, - format!("blob too large ({} bytes)", len), + format!("blob too large ({len} bytes)"), )), ); } + if len > 0x0fff_ffff { + tracing::warn!("writing blob above future limit ({len} bytes)"); + } let mut len_buf = (len as u32).to_be_bytes(); len_buf[0] |= 0x80; io_buf.extend_from_slice(&len_buf[..]); From e28e46f20be1f1bd298bbba6c5a131f435acde52 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Thu, 23 May 2024 09:45:29 -0400 Subject: [PATCH 0865/1571] fix(pageserver): make wal connstr a connstr (#7846) The list timeline API gives something like `"wal_source_connstr":"PgConnectionConfig { host: Domain(\"safekeeper-5.us-east-2.aws.neon.build\"), port: 6500, password: Some(REDACTED-STRING) }"`, which is weird. This pull request makes it somehow like a connection string. This field is not used at least in the neon database, so I assume no one is reading or parsing it. 
Signed-off-by: Alex Chi Z --- libs/postgres_connection/src/lib.rs | 7 +++++++ pageserver/src/http/routes.rs | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/libs/postgres_connection/src/lib.rs b/libs/postgres_connection/src/lib.rs index ccf9108895..9f57f3d507 100644 --- a/libs/postgres_connection/src/lib.rs +++ b/libs/postgres_connection/src/lib.rs @@ -178,6 +178,13 @@ impl PgConnectionConfig { } } +impl fmt::Display for PgConnectionConfig { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + // The password is intentionally hidden and not part of this display string. + write!(f, "postgresql://{}:{}", self.host, self.port) + } +} + impl fmt::Debug for PgConnectionConfig { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { // We want `password: Some(REDACTED-STRING)`, not `password: Some("REDACTED-STRING")` diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 7b55e88096..8a061f3ae1 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -395,7 +395,7 @@ async fn build_timeline_info_common( let guard = timeline.last_received_wal.lock().unwrap(); if let Some(info) = guard.as_ref() { ( - Some(format!("{:?}", info.wal_source_connconf)), // Password is hidden, but it's for statistics only. + Some(format!("{}", info.wal_source_connconf)), // Password is hidden, but it's for statistics only. Some(info.last_received_msg_lsn), Some(info.last_received_msg_ts), ) From 75a52ac7fd7996f5faeb3c68bd15a9811b0d84f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 23 May 2024 17:10:24 +0200 Subject: [PATCH 0866/1571] Use Timeline::create_image_layer_for_rel_blocks in tiered compaction (#7850) Reduces duplication between tiered and legacy compaction by using the `Timeline::create_image_layer_for_rel_blocks` function. This way, we also use vectored get in tiered compaction, so the change has two benefits in one. fixes #7659 --------- Co-authored-by: Alex Chi Z. 
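For intuition, a toy sketch of the two read paths (hypothetical, heavily simplified
signatures; the real calls go through `Timeline::get` / `Timeline::get_vectored`
behind `create_image_layer_for_rel_blocks`):

```rust
// Toy model of the two read paths; not the real pageserver API surface.
trait PageReads {
    // Resolve one key at a time: each call re-plans its own layer traversal.
    fn get(&self, key: u64) -> Vec<u8>;
    // Resolve a batch of keys in one planned pass over the layers.
    fn get_vectored(&self, keys: &[u64]) -> Vec<(u64, Vec<u8>)>;
}

// Roughly what the old tiered-compaction image loop did.
fn build_image_per_key(reads: &impl PageReads, keys: &[u64]) -> Vec<(u64, Vec<u8>)> {
    keys.iter().map(|&k| (k, reads.get(k))).collect()
}

// Roughly what the shared helper does for both compaction strategies.
fn build_image_vectored(reads: &impl PageReads, keys: &[u64]) -> Vec<(u64, Vec<u8>)> {
    reads.get_vectored(keys)
}
```
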
--- pageserver/src/tenant/timeline/compaction.rs | 75 +++++++++----------- 1 file changed, 32 insertions(+), 43 deletions(-) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 2eff469591..db8adfc16c 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -9,7 +9,10 @@ use std::ops::{Deref, Range}; use std::sync::Arc; use super::layer_manager::LayerManager; -use super::{CompactFlags, DurationRecorder, ImageLayerCreationMode, RecordedDuration, Timeline}; +use super::{ + CompactFlags, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode, + RecordedDuration, Timeline, +}; use anyhow::{anyhow, Context}; use enumset::EnumSet; @@ -22,14 +25,13 @@ use tracing::{debug, info, info_span, trace, warn, Instrument}; use utils::id::TimelineId; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; +use crate::page_cache; use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc}; -use crate::tenant::timeline::{drop_rlock, is_rel_fsm_block_key, is_rel_vm_block_key, Hole}; +use crate::tenant::timeline::{drop_rlock, Hole, ImageLayerCreationOutcome}; use crate::tenant::timeline::{DeltaLayerWriter, ImageLayerWriter}; use crate::tenant::timeline::{Layer, ResidentLayer}; use crate::tenant::DeltaLayer; -use crate::tenant::PageReconstructError; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; -use crate::{page_cache, ZERO_PAGE}; use crate::keyspace::KeySpace; use crate::repository::Key; @@ -1150,10 +1152,10 @@ impl TimelineAdaptor { lsn: Lsn, key_range: &Range, ctx: &RequestContext, - ) -> Result<(), PageReconstructError> { + ) -> Result<(), CreateImageLayersError> { let timer = self.timeline.metrics.create_images_time_histo.start_timer(); - let mut image_layer_writer = ImageLayerWriter::new( + let image_layer_writer = ImageLayerWriter::new( self.timeline.conf, self.timeline.timeline_id, self.timeline.tenant_shard_id, @@ -1164,47 +1166,34 @@ impl TimelineAdaptor { .await?; fail_point!("image-layer-writer-fail-before-finish", |_| { - Err(PageReconstructError::Other(anyhow::anyhow!( + Err(CreateImageLayersError::Other(anyhow::anyhow!( "failpoint image-layer-writer-fail-before-finish" ))) }); - let keyspace_ranges = self.get_keyspace(key_range, lsn, ctx).await?; - for range in &keyspace_ranges { - let mut key = range.start; - while key < range.end { - let img = match self.timeline.get(key, lsn, ctx).await { - Ok(img) => img, - Err(err) => { - // If we fail to reconstruct a VM or FSM page, we can zero the - // page without losing any actual user data. That seems better - // than failing repeatedly and getting stuck. - // - // We had a bug at one point, where we truncated the FSM and VM - // in the pageserver, but the Postgres didn't know about that - // and continued to generate incremental WAL records for pages - // that didn't exist in the pageserver. Trying to replay those - // WAL records failed to find the previous image of the page. - // This special case allows us to recover from that situation. - // See https://github.com/neondatabase/neon/issues/2601. - // - // Unfortunately we cannot do this for the main fork, or for - // any metadata keys, keys, as that would lead to actual data - // loss. 
- if is_rel_fsm_block_key(key) || is_rel_vm_block_key(key) { - warn!("could not reconstruct FSM or VM key {key}, filling with zeros: {err:?}"); - ZERO_PAGE.clone() - } else { - return Err(err); - } - } - }; - image_layer_writer.put_image(key, img, ctx).await?; - key = key.next(); - } - } - let image_layer = image_layer_writer.finish(&self.timeline, ctx).await?; - self.new_images.push(image_layer); + let keyspace = KeySpace { + ranges: self.get_keyspace(key_range, lsn, ctx).await?, + }; + // TODO set proper (stateful) start. The create_image_layer_for_rel_blocks function mostly + let start = Key::MIN; + let ImageLayerCreationOutcome { + image, + next_start_key: _, + } = self + .timeline + .create_image_layer_for_rel_blocks( + &keyspace, + image_layer_writer, + lsn, + ctx, + key_range.clone(), + start, + ) + .await?; + + if let Some(image_layer) = image { + self.new_images.push(image_layer); + } timer.stop_and_record(); From 6b3164269cc3a6c577428071986447aadffc0008 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Thu, 23 May 2024 11:30:43 -0400 Subject: [PATCH 0867/1571] chore(pageserver): reduce logging related to image layers (#7864) * Reduce the logging level for create image layers of metadata keys. (question: is it possible to adjust logging levels at runtime?) * Do a info logging of image layers only after the layer is created. Now there are a lot of cases where we create the image layer writer but then discarding that image layer because it does not contain any key. Therefore, I changed the new image layer logging to trace, and create image layer logging to info. Signed-off-by: Alex Chi Z --- pageserver/src/tenant/storage_layer/image_layer.rs | 4 ++-- pageserver/src/tenant/timeline.rs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index becd1e7a6d..67b489ce0d 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -650,7 +650,7 @@ impl ImageLayerWriterInner { lsn, }, ); - info!("new image layer {path}"); + trace!("creating image layer {}", path); let mut file = { VirtualFile::open_with_options( &path, @@ -770,7 +770,7 @@ impl ImageLayerWriterInner { // FIXME: why not carry the virtualfile here, it supports renaming? let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?; - trace!("created image layer {}", layer.local_path()); + info!("created image layer {}", layer.local_path()); Ok(layer) } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 881e7f8f3c..262e1896ce 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4347,7 +4347,7 @@ impl Timeline { let delta_file_accessed = reconstruct_state.get_delta_layers_visited(); let trigger_generation = delta_file_accessed as usize >= MAX_AUX_FILE_V2_DELTAS; - info!( + debug!( "generate image layers for metadata keys: trigger_generation={trigger_generation}, \ delta_file_accessed={delta_file_accessed}, total_kb_retrieved={total_kb_retrieved}, \ total_key_retrieved={total_key_retrieved}" From 7cf726e36e3a9dab2516a861b914eda3d2cac2c4 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 23 May 2024 23:24:31 +0300 Subject: [PATCH 0868/1571] refactor(rtc): remove the duplicate IndexLayerMetadata (#7860) Once upon a time, we used to have duplicated types for runtime IndexPart and whatever we stored. 
Because of the serde fixes in #5335 we have no need for duplicated IndexPart type anymore, but the `IndexLayerMetadata` stayed. - remove the type - remove LayerFileMetadata::file_size() in favor of direct field access Split off from #7833. Cc: #3072. --- pageserver/compaction/src/simulator.rs | 4 +- pageserver/ctl/src/index_part.rs | 4 +- pageserver/src/disk_usage_eviction_task.rs | 4 +- .../src/tenant/remote_timeline_client.rs | 6 +- .../tenant/remote_timeline_client/download.rs | 2 +- .../tenant/remote_timeline_client/index.rs | 92 ++++++------------- pageserver/src/tenant/secondary/downloader.rs | 14 ++- pageserver/src/tenant/secondary/heatmap.rs | 6 +- pageserver/src/tenant/storage_layer/layer.rs | 6 +- pageserver/src/tenant/timeline.rs | 8 +- pageserver/src/tenant/timeline/init.rs | 6 +- pageserver/src/tenant/upload_queue.rs | 9 +- s3_scrubber/src/checks.rs | 4 +- s3_scrubber/src/tenant_snapshot.rs | 15 ++- 14 files changed, 65 insertions(+), 115 deletions(-) diff --git a/pageserver/compaction/src/simulator.rs b/pageserver/compaction/src/simulator.rs index a7c8bd5c1f..776c537d03 100644 --- a/pageserver/compaction/src/simulator.rs +++ b/pageserver/compaction/src/simulator.rs @@ -380,8 +380,8 @@ impl interface::CompactionLayer for MockLayer { } fn file_size(&self) -> u64 { match self { - MockLayer::Delta(this) => this.file_size(), - MockLayer::Image(this) => this.file_size(), + MockLayer::Delta(this) => this.file_size, + MockLayer::Image(this) => this.file_size, } } fn short_id(&self) -> String { diff --git a/pageserver/ctl/src/index_part.rs b/pageserver/ctl/src/index_part.rs index 0d010eb009..2998b5c732 100644 --- a/pageserver/ctl/src/index_part.rs +++ b/pageserver/ctl/src/index_part.rs @@ -2,7 +2,7 @@ use std::collections::HashMap; use anyhow::Context; use camino::Utf8PathBuf; -use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata; +use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::{metadata::TimelineMetadata, IndexPart}; use utils::lsn::Lsn; @@ -19,7 +19,7 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> { let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?; #[derive(serde::Serialize)] struct Output<'a> { - layer_metadata: &'a HashMap, + layer_metadata: &'a HashMap, disk_consistent_lsn: Lsn, timeline_metadata: &'a TimelineMetadata, } diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 7f25e49570..90bd4294bb 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -534,7 +534,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( }); } EvictionLayer::Secondary(layer) => { - let file_size = layer.metadata.file_size(); + let file_size = layer.metadata.file_size; js.spawn(async move { layer @@ -641,7 +641,7 @@ impl EvictionLayer { pub(crate) fn get_file_size(&self) -> u64 { match self { Self::Attached(l) => l.layer_desc().file_size, - Self::Secondary(sl) => sl.metadata.file_size(), + Self::Secondary(sl) => sl.metadata.file_size, } } } diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index d3adae6841..23904b9da4 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1192,7 +1192,7 @@ impl RemoteTimelineClient { &self.storage_impl, uploaded.local_path(), &remote_path, - 
uploaded.metadata().file_size(), + uploaded.metadata().file_size, cancel, ) .await @@ -1573,7 +1573,7 @@ impl RemoteTimelineClient { &self.storage_impl, local_path, &remote_path, - layer_metadata.file_size(), + layer_metadata.file_size, &self.cancel, ) .measure_remote_op( @@ -1768,7 +1768,7 @@ impl RemoteTimelineClient { UploadOp::UploadLayer(_, m) => ( RemoteOpFileKind::Layer, RemoteOpKind::Upload, - RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size()), + RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size), ), UploadOp::UploadMetadata(_, _) => ( RemoteOpFileKind::Index, diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 70c5cae05e..bd75f980e8 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -84,7 +84,7 @@ pub async fn download_layer_file<'a>( ) .await?; - let expected = layer_metadata.file_size(); + let expected = layer_metadata.file_size; if expected != bytes_amount { return Err(DownloadError::Other(anyhow!( "According to layer file metadata should have downloaded {expected} bytes but downloaded {bytes_amount} bytes into file {temp_file_path:?}", diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 032dda7ff3..f5d939c747 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -17,46 +17,6 @@ use pageserver_api::shard::ShardIndex; use utils::lsn::Lsn; -/// Metadata gathered for each of the layer files. -/// -/// Fields have to be `Option`s because remote [`IndexPart`]'s can be from different version, which -/// might have less or more metadata depending if upgrading or rolling back an upgrade. -#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] -//#[cfg_attr(test, derive(Default))] -pub struct LayerFileMetadata { - file_size: u64, - - pub(crate) generation: Generation, - - pub(crate) shard: ShardIndex, -} - -impl From<&'_ IndexLayerMetadata> for LayerFileMetadata { - fn from(other: &IndexLayerMetadata) -> Self { - LayerFileMetadata { - file_size: other.file_size, - generation: other.generation, - shard: other.shard, - } - } -} - -impl LayerFileMetadata { - pub fn new(file_size: u64, generation: Generation, shard: ShardIndex) -> Self { - LayerFileMetadata { - file_size, - generation, - shard, - } - } - - pub fn file_size(&self) -> u64 { - self.file_size - } -} - -// TODO seems like another part of the remote storage file format -// compatibility issue, see https://github.com/neondatabase/neon/issues/3072 /// In-memory representation of an `index_part.json` file /// /// Contains the data about all files in the timeline, present remotely and its metadata. @@ -77,7 +37,7 @@ pub struct IndexPart { /// /// Older versions of `IndexPart` will not have this property or have only a part of metadata /// that latest version stores. - pub layer_metadata: HashMap, + pub layer_metadata: HashMap, // 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata. 
// It's duplicated for convenience when reading the serialized structure, but is @@ -127,10 +87,7 @@ impl IndexPart { lineage: Lineage, last_aux_file_policy: Option, ) -> Self { - let layer_metadata = layers_and_metadata - .iter() - .map(|(k, v)| (k.to_owned(), IndexLayerMetadata::from(v))) - .collect(); + let layer_metadata = layers_and_metadata.clone(); Self { version: Self::LATEST_VERSION, @@ -194,9 +151,12 @@ impl From<&UploadQueueInitialized> for IndexPart { } } -/// Serialized form of [`LayerFileMetadata`]. -#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] -pub struct IndexLayerMetadata { +/// Metadata gathered for each of the layer files. +/// +/// Fields have to be `Option`s because remote [`IndexPart`]'s can be from different version, which +/// might have less or more metadata depending if upgrading or rolling back an upgrade. +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)] +pub struct LayerFileMetadata { pub file_size: u64, #[serde(default = "Generation::none")] @@ -208,12 +168,12 @@ pub struct IndexLayerMetadata { pub shard: ShardIndex, } -impl From<&LayerFileMetadata> for IndexLayerMetadata { - fn from(other: &LayerFileMetadata) -> Self { - IndexLayerMetadata { - file_size: other.file_size, - generation: other.generation, - shard: other.shard, +impl LayerFileMetadata { + pub fn new(file_size: u64, generation: Generation, shard: ShardIndex) -> Self { + LayerFileMetadata { + file_size, + generation, + shard, } } } @@ -307,12 +267,12 @@ mod tests { // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead? version: 1, layer_metadata: HashMap::from([ - ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata { + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { file_size: 25600000, generation: Generation::none(), shard: ShardIndex::unsharded() }), - ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata { + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { // serde_json should always parse this but this might be a double with jq for // example. file_size: 9007199254741001, @@ -349,12 +309,12 @@ mod tests { // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead? version: 1, layer_metadata: HashMap::from([ - ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata { + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { file_size: 25600000, generation: Generation::none(), shard: ShardIndex::unsharded() }), - ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata { + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { // serde_json should always parse this but this might be a double with jq for // example. 
file_size: 9007199254741001, @@ -392,12 +352,12 @@ mod tests { // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead? version: 2, layer_metadata: HashMap::from([ - ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata { + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { file_size: 25600000, generation: Generation::none(), shard: ShardIndex::unsharded() }), - ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata { + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { // serde_json should always parse this but this might be a double with jq for // example. file_size: 9007199254741001, @@ -480,12 +440,12 @@ mod tests { let expected = IndexPart { version: 4, layer_metadata: HashMap::from([ - ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata { + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { file_size: 25600000, generation: Generation::none(), shard: ShardIndex::unsharded() }), - ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata { + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { // serde_json should always parse this but this might be a double with jq for // example. 
file_size: 9007199254741001, @@ -522,12 +482,12 @@ mod tests { let expected = IndexPart { version: 5, layer_metadata: HashMap::from([ - ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499".parse().unwrap(), IndexLayerMetadata { + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499".parse().unwrap(), LayerFileMetadata { file_size: 23289856, generation: Generation::new(1), shard: ShardIndex::unsharded(), }), - ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619".parse().unwrap(), IndexLayerMetadata { + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619".parse().unwrap(), LayerFileMetadata { file_size: 1015808, generation: Generation::new(1), shard: ShardIndex::unsharded(), @@ -569,12 +529,12 @@ mod tests { let expected = IndexPart { version: 6, layer_metadata: HashMap::from([ - ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata { + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { file_size: 25600000, generation: Generation::none(), shard: ShardIndex::unsharded() }), - ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata { + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { // serde_json should always parse this but this might be a double with jq for // example. file_size: 9007199254741001, diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index de30c4dcb6..789f1a0fa9 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -716,7 +716,7 @@ impl<'a> TenantDownloader<'a> { let mut layer_byte_count: u64 = timeline_state .on_disk_layers .values() - .map(|l| l.metadata.file_size()) + .map(|l| l.metadata.file_size) .sum(); // Remove on-disk layers that are no longer present in heatmap @@ -727,7 +727,7 @@ impl<'a> TenantDownloader<'a> { .get(layer_file_name) .unwrap() .metadata - .file_size(); + .file_size; let local_path = local_layer_path( self.conf, @@ -886,9 +886,7 @@ impl<'a> TenantDownloader<'a> { } } - if on_disk.metadata != LayerFileMetadata::from(&layer.metadata) - || on_disk.access_time != layer.access_time - { + if on_disk.metadata != layer.metadata || on_disk.access_time != layer.access_time { // We already have this layer on disk. Update its access time. 
tracing::debug!( "Access time updated for layer {}: {} -> {}", @@ -979,7 +977,7 @@ impl<'a> TenantDownloader<'a> { tenant_shard_id, &timeline.timeline_id, t.name, - LayerFileMetadata::from(&t.metadata), + t.metadata.clone(), t.access_time, local_path, )); @@ -1024,7 +1022,7 @@ impl<'a> TenantDownloader<'a> { *tenant_shard_id, *timeline_id, &layer.name, - &LayerFileMetadata::from(&layer.metadata), + &layer.metadata, &local_path, &self.secondary_state.cancel, ctx, @@ -1185,7 +1183,7 @@ async fn init_timeline_state( tenant_shard_id, &heatmap.timeline_id, name, - LayerFileMetadata::from(&remote_meta.metadata), + remote_meta.metadata.clone(), remote_meta.access_time, file_path, ), diff --git a/pageserver/src/tenant/secondary/heatmap.rs b/pageserver/src/tenant/secondary/heatmap.rs index 2da4a3b9d5..166483ba5d 100644 --- a/pageserver/src/tenant/secondary/heatmap.rs +++ b/pageserver/src/tenant/secondary/heatmap.rs @@ -1,6 +1,6 @@ use std::time::SystemTime; -use crate::tenant::{remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerName}; +use crate::tenant::{remote_timeline_client::index::LayerFileMetadata, storage_layer::LayerName}; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr, TimestampSeconds}; @@ -38,7 +38,7 @@ pub(crate) struct HeatMapTimeline { #[derive(Serialize, Deserialize)] pub(crate) struct HeatMapLayer { pub(super) name: LayerName, - pub(super) metadata: IndexLayerMetadata, + pub(super) metadata: LayerFileMetadata, #[serde_as(as = "TimestampSeconds")] pub(super) access_time: SystemTime, @@ -49,7 +49,7 @@ pub(crate) struct HeatMapLayer { impl HeatMapLayer { pub(crate) fn new( name: LayerName, - metadata: IndexLayerMetadata, + metadata: LayerFileMetadata, access_time: SystemTime, ) -> Self { Self { diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 8c64621710..b2f3bdb552 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -161,7 +161,7 @@ impl Layer { timeline.tenant_shard_id, timeline.timeline_id, file_name, - metadata.file_size(), + metadata.file_size, ); let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted); @@ -194,7 +194,7 @@ impl Layer { timeline.tenant_shard_id, timeline.timeline_id, file_name, - metadata.file_size(), + metadata.file_size, ); let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident); @@ -227,7 +227,7 @@ impl Layer { timeline .metrics - .resident_physical_size_add(metadata.file_size()); + .resident_physical_size_add(metadata.file_size); ResidentLayer { downloaded, owner } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 262e1896ce..342fc4fc59 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1424,7 +1424,7 @@ impl Timeline { let layer_map = guard.layer_map(); let mut size = 0; for l in layer_map.iter_historic_layers() { - size += l.file_size(); + size += l.file_size; } size } @@ -2516,7 +2516,7 @@ impl Timeline { Ok(UseRemote { local, remote }) => { // Remote is authoritative, but we may still choose to retain // the local file if the contents appear to match - if local.metadata.file_size() == remote.file_size() { + if local.metadata.file_size == remote.file_size { // Use the local file, but take the remote metadata so that we pick up // the correct generation. 
UseLocal(LocalLayerFileMetadata { @@ -2556,7 +2556,7 @@ impl Timeline { let layer = match decision { UseLocal(local) => { - total_physical_size += local.metadata.file_size(); + total_physical_size += local.metadata.file_size; Layer::for_resident(conf, &this, local.local_path, name, local.metadata) .drop_eviction_guard() } @@ -3071,7 +3071,7 @@ impl Timeline { HeatMapLayer::new( layer.layer_desc().layer_name(), - (&layer.metadata()).into(), + layer.metadata(), last_activity_ts, ) }); diff --git a/pageserver/src/tenant/timeline/init.rs b/pageserver/src/tenant/timeline/init.rs index feadc79e5e..0cbaf39555 100644 --- a/pageserver/src/tenant/timeline/init.rs +++ b/pageserver/src/tenant/timeline/init.rs @@ -157,7 +157,7 @@ pub(super) fn reconcile( .map(|ip| ip.layer_metadata.iter()) .into_iter() .flatten() - .map(|(name, metadata)| (name, LayerFileMetadata::from(metadata))) + .map(|(name, metadata)| (name, metadata.clone())) .for_each(|(name, metadata)| { if let Some(existing) = discovered.get_mut(name) { existing.1 = Some(metadata); @@ -200,8 +200,8 @@ pub(super) fn cleanup_local_file_for_remote( local: &LocalLayerFileMetadata, remote: &LayerFileMetadata, ) -> anyhow::Result<()> { - let local_size = local.metadata.file_size(); - let remote_size = remote.file_size(); + let local_size = local.metadata.file_size; + let remote_size = remote.file_size; let path = &local.local_path; let file_name = path.file_name().expect("must be file path"); diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index c0cc8f3124..02f87303d1 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -213,10 +213,7 @@ impl UploadQueue { let mut files = HashMap::with_capacity(index_part.layer_metadata.len()); for (layer_name, layer_metadata) in &index_part.layer_metadata { - files.insert( - layer_name.to_owned(), - LayerFileMetadata::from(layer_metadata), - ); + files.insert(layer_name.to_owned(), layer_metadata.clone()); } info!( @@ -322,9 +319,7 @@ impl std::fmt::Display for UploadOp { write!( f, "UploadLayer({}, size={:?}, gen={:?})", - layer, - metadata.file_size(), - metadata.generation + layer, metadata.file_size, metadata.generation ) } UploadOp::UploadMetadata(_, lsn) => { diff --git a/s3_scrubber/src/checks.rs b/s3_scrubber/src/checks.rs index dd64a0a98f..134afa53da 100644 --- a/s3_scrubber/src/checks.rs +++ b/s3_scrubber/src/checks.rs @@ -2,7 +2,7 @@ use std::collections::{HashMap, HashSet}; use anyhow::Context; use aws_sdk_s3::{types::ObjectIdentifier, Client}; -use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata; +use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use pageserver_api::shard::ShardIndex; use tracing::{error, info, warn}; use utils::generation::Generation; @@ -208,7 +208,7 @@ impl TenantObjectListing { &mut self, timeline_id: TimelineId, layer_file: &LayerName, - metadata: &IndexLayerMetadata, + metadata: &LayerFileMetadata, ) -> bool { let Some(shard_tl) = self.shard_timelines.get_mut(&(metadata.shard, timeline_id)) else { return false; diff --git a/s3_scrubber/src/tenant_snapshot.rs b/s3_scrubber/src/tenant_snapshot.rs index a24a1e92ae..450b337235 100644 --- a/s3_scrubber/src/tenant_snapshot.rs +++ b/s3_scrubber/src/tenant_snapshot.rs @@ -11,7 +11,7 @@ use async_stream::stream; use aws_sdk_s3::Client; use camino::Utf8PathBuf; use futures::{StreamExt, TryStreamExt}; -use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata; +use 
pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::IndexPart; use pageserver_api::shard::TenantShardId; @@ -49,8 +49,8 @@ impl SnapshotDownloader { &self, ttid: TenantShardTimelineId, layer_name: LayerName, - layer_metadata: IndexLayerMetadata, - ) -> anyhow::Result<(LayerName, IndexLayerMetadata)> { + layer_metadata: LayerFileMetadata, + ) -> anyhow::Result<(LayerName, LayerFileMetadata)> { // Note this is local as in a local copy of S3 data, not local as in the pageserver's local format. They use // different layer names (remote-style has the generation suffix) let local_path = self.output_path.join(format!( @@ -110,7 +110,7 @@ impl SnapshotDownloader { async fn download_layers( &self, ttid: TenantShardTimelineId, - layers: Vec<(LayerName, IndexLayerMetadata)>, + layers: Vec<(LayerName, LayerFileMetadata)>, ) -> anyhow::Result<()> { let layer_count = layers.len(); tracing::info!("Downloading {} layers for timeline {ttid}...", layer_count); @@ -161,10 +161,7 @@ impl SnapshotDownloader { ttid: TenantShardTimelineId, index_part: Box, index_part_generation: Generation, - ancestor_layers: &mut HashMap< - TenantShardTimelineId, - HashMap, - >, + ancestor_layers: &mut HashMap>, ) -> anyhow::Result<()> { let index_bytes = serde_json::to_string(&index_part).unwrap(); @@ -234,7 +231,7 @@ impl SnapshotDownloader { // happen if this tenant has been split at some point) let mut ancestor_layers: HashMap< TenantShardTimelineId, - HashMap, + HashMap, > = Default::default(); for shard in shards.into_iter().filter(|s| s.shard_count == shard_count) { From ea2e830707bcd5cf8b7fca25ae598d2937994c9e Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Thu, 23 May 2024 13:35:59 -0700 Subject: [PATCH 0869/1571] Remove apostrophe (#7868) ## Problem ## Summary of changes ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --- vm-image-spec.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 0f9d56e466..484a86fc21 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -320,7 +320,7 @@ files: - metric_name: wal_is_lost type: gauge - help: 'Whether or not the replication slot\'s wal_status is lost' + help: 'Whether or not the replication slot wal_status is lost' key_labels: - slot_name values: [wal_status_is_lost] From 0e4f1826805d040a23c25c54f3993a942755dbc2 Mon Sep 17 00:00:00 2001 From: MMeent Date: Thu, 23 May 2024 23:26:42 +0200 Subject: [PATCH 0870/1571] Rework PageStream connection state handling: (#7611) * Make PS connection startup use async APIs This allows for improved query cancellation when we start connections * Make PS connections have per-shard connection retry state. Previously they shared global backoff state, which is bad for quickly getting all connections started and/or back online. * Make sure we clean up most connection state on failed connections. Previously, we could technically leak some resources that we'd otherwise clean up. Now, the resources are correctly cleaned up. 
* pagestore_smgr.c now PANICs on unexpected response message types. Unexpected responses are likely a symptom of having a desynchronized view of the connection state. As a desynchronized connection state can cause corruption, we PANIC, as we don't know what data may have been written to buffers: the only solution is to fail fast & hope we didn't write wrong data. * Catch errors in sync pagestream request handling. Previously, if a query was cancelled after a message was sent to the pageserver, but before the data was received, the backend could forget that it sent the synchronous request, and let others deal with the repercussions. This could then lead to incorrect responses, or errors such as "unexpected response from page server with tag 0x68" --- pageserver/src/page_service.rs | 11 + pgxn/neon/libpagestore.c | 529 ++++++++++++------ pgxn/neon/pagestore_smgr.c | 91 ++- .../regress/test_pg_query_cancellation.py | 282 ++++++++++ 4 files changed, 729 insertions(+), 184 deletions(-) create mode 100644 test_runner/regress/test_pg_query_cancellation.py diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index d250864fd6..e9651165b1 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -260,6 +260,8 @@ async fn page_service_conn_main( socket.set_timeout(Some(std::time::Duration::from_millis(socket_timeout_ms))); let socket = std::pin::pin!(socket); + fail::fail_point!("ps::connection-start::pre-login"); + // XXX: pgbackend.run() should take the connection_ctx, // and create a child per-query context when it invokes process_query. // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler @@ -603,6 +605,7 @@ impl PageServerHandler { }; trace!("query: {copy_data_bytes:?}"); + fail::fail_point!("ps::handle-pagerequest-message"); // Trace request if needed if let Some(t) = tracer.as_mut() { @@ -617,6 +620,7 @@ impl PageServerHandler { let (response, span) = match neon_fe_msg { PagestreamFeMessage::Exists(req) => { + fail::fail_point!("ps::handle-pagerequest-message::exists"); let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn); ( self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx) @@ -626,6 +630,7 @@ impl PageServerHandler { ) } PagestreamFeMessage::Nblocks(req) => { + fail::fail_point!("ps::handle-pagerequest-message::nblocks"); let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn); ( self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx) @@ -635,6 +640,7 @@ impl PageServerHandler { ) } PagestreamFeMessage::GetPage(req) => { + fail::fail_point!("ps::handle-pagerequest-message::getpage"); // shard_id is filled in by the handler let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.request_lsn); ( @@ -645,6 +651,7 @@ impl PageServerHandler { ) } PagestreamFeMessage::DbSize(req) => { + fail::fail_point!("ps::handle-pagerequest-message::dbsize"); let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn); ( self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx) @@ -654,6 +661,7 @@ impl PageServerHandler { ) } PagestreamFeMessage::GetSlruSegment(req) => { + fail::fail_point!("ps::handle-pagerequest-message::slrusegment"); let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn); ( 
self.handle_get_slru_segment_request(tenant_id, timeline_id, &req, &ctx) @@ -1505,6 +1513,7 @@ where _pgb: &mut PostgresBackend, _sm: &FeStartupPacket, ) -> Result<(), QueryError> { + fail::fail_point!("ps::connection-start::startup-packet"); Ok(()) } @@ -1519,6 +1528,8 @@ where Err(QueryError::SimulatedConnectionError) }); + fail::fail_point!("ps::connection-start::process-query"); + let ctx = self.connection_ctx.attached_child(); debug!("process query {query_string:?}"); let parts = query_string.split_whitespace().collect::>(); diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index f5ce2caff3..a9c8d59c3a 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -51,7 +51,6 @@ int flush_every_n_requests = 8; int neon_protocol_version = 2; -static int n_reconnect_attempts = 0; static int max_reconnect_attempts = 60; static int stripe_size; @@ -95,18 +94,44 @@ static shmem_startup_hook_type prev_shmem_startup_hook; static PagestoreShmemState *pagestore_shared; static uint64 pagestore_local_counter = 0; +typedef enum PSConnectionState { + PS_Disconnected, /* no connection yet */ + PS_Connecting_Startup, /* connection starting up */ + PS_Connecting_PageStream, /* negotiating pagestream */ + PS_Connected, /* connected, pagestream established */ +} PSConnectionState; + /* This backend's per-shard connections */ typedef struct { - PGconn *conn; + TimestampTz last_connect_time; /* read-only debug value */ + TimestampTz last_reconnect_time; + uint32 delay_us; + int n_reconnect_attempts; /*--- - * WaitEventSet containing: - * - WL_SOCKET_READABLE on 'conn' - * - WL_LATCH_SET on MyLatch, and - * - WL_EXIT_ON_PM_DEATH. + * Pageserver connection state, i.e. + * disconnected: conn == NULL, wes == NULL; + * conn_startup: connection initiated, waiting for connection establishing + * conn_ps: PageStream query sent, waiting for confirmation + * connected: PageStream established */ - WaitEventSet *wes; + PSConnectionState state; + PGconn *conn; + /*--- + * WaitEventSet containing: + * - WL_SOCKET_READABLE on 'conn' + * - WL_LATCH_SET on MyLatch, and + * - WL_EXIT_ON_PM_DEATH. + */ + WaitEventSet *wes_read; + /*--- + * WaitEventSet containing: + * - WL_SOCKET_WRITABLE on 'conn' + * - WL_LATCH_SET on MyLatch, and + * - WL_EXIT_ON_PM_DEATH. + */ + WaitEventSet *wes_write; } PageServer; static PageServer page_servers[MAX_SHARDS]; @@ -303,119 +328,269 @@ get_shard_number(BufferTag *tag) return hash % n_shards; } +static inline void +CLEANUP_AND_DISCONNECT(PageServer *shard) +{ + if (shard->wes_read) + { + FreeWaitEventSet(shard->wes_read); + shard->wes_read = NULL; + } + if (shard->wes_write) + { + FreeWaitEventSet(shard->wes_write); + shard->wes_write = NULL; + } + if (shard->conn) + { + PQfinish(shard->conn); + shard->conn = NULL; + } + + shard->state = PS_Disconnected; +} + +/* + * Connect to a pageserver, or continue to try to connect if we're yet to + * complete the connection (e.g. due to receiving an earlier cancellation + * during connection start). + * Returns true if successfully connected; false if the connection failed. + * + * Throws errors in unrecoverable situations, or when this backend's query + * is canceled. 
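+ *
+ * Callers are expected to keep retrying until this returns true (see
+ * pageserver_send); the per-shard state machine (PS_Disconnected ->
+ * PS_Connecting_Startup -> PS_Connecting_PageStream -> PS_Connected)
+ * resumes from wherever an earlier, interrupted attempt left off.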
+ */ static bool pageserver_connect(shardno_t shard_no, int elevel) { - char *query; - int ret; - const char *keywords[3]; - const char *values[3]; - int n; - PGconn *conn; - WaitEventSet *wes; + PageServer *shard = &page_servers[shard_no]; char connstr[MAX_PAGESERVER_CONNSTRING_SIZE]; - static TimestampTz last_connect_time = 0; - static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC; - TimestampTz now; - uint64_t us_since_last_connect; - bool broke_from_loop = false; - - Assert(page_servers[shard_no].conn == NULL); - /* * Get the connection string for this shard. If the shard map has been * updated since we last looked, this will also disconnect any existing * pageserver connections as a side effect. + * Note that connstr is used both during connection start, and when we + * log the successful connection. */ load_shard_map(shard_no, connstr, NULL); - now = GetCurrentTimestamp(); - us_since_last_connect = now - last_connect_time; - if (us_since_last_connect < MAX_RECONNECT_INTERVAL_USEC) + switch (shard->state) { - pg_usleep(delay_us); - delay_us *= 2; - } - else + case PS_Disconnected: { - delay_us = MIN_RECONNECT_INTERVAL_USEC; - } + const char *keywords[3]; + const char *values[3]; + int n_pgsql_params; + TimestampTz now; + int64 us_since_last_attempt; - /* - * Connect using the connection string we got from the - * neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment - * variable was set, use that as the password. - * - * The connection options are parsed in the order they're given, so when - * we set the password before the connection string, the connection string - * can override the password from the env variable. Seems useful, although - * we don't currently use that capability anywhere. - */ - n = 0; - if (neon_auth_token) - { - keywords[n] = "password"; - values[n] = neon_auth_token; - n++; + /* Make sure we start with a clean slate */ + CLEANUP_AND_DISCONNECT(shard); + + neon_shard_log(shard_no, DEBUG5, "Connection state: Disconnected"); + + now = GetCurrentTimestamp(); + us_since_last_attempt = (int64) (now - shard->last_reconnect_time); + shard->last_reconnect_time = now; + + /* + * If we did other tasks between reconnect attempts, then we won't + * need to wait as long as a full delay. + */ + if (us_since_last_attempt < shard->delay_us) + { + pg_usleep(shard->delay_us - us_since_last_attempt); + } + + /* update the delay metric */ + shard->delay_us = Min(shard->delay_us * 2, MAX_RECONNECT_INTERVAL_USEC); + + /* + * Connect using the connection string we got from the + * neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment + * variable was set, use that as the password. + * + * The connection options are parsed in the order they're given, so when + * we set the password before the connection string, the connection string + * can override the password from the env variable. Seems useful, although + * we don't currently use that capability anywhere. 
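+ *
+ * Note that the whole connection string is passed as the "dbname"
+ * keyword: PQconnectStartParams() is called with expand_dbname=1, so
+ * libpq expands it into the individual connection options.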
+ */ + keywords[0] = "dbname"; + values[0] = connstr; + n_pgsql_params = 1; + + if (neon_auth_token) + { + keywords[1] = "password"; + values[1] = neon_auth_token; + n_pgsql_params++; + } + + keywords[n_pgsql_params] = NULL; + values[n_pgsql_params] = NULL; + + shard->conn = PQconnectStartParams(keywords, values, 1); + if (!shard->conn) + { + neon_shard_log(shard_no, elevel, "Failed to connect to pageserver: out of memory"); + return false; + } + + shard->wes_read = CreateWaitEventSet(TopMemoryContext, 3); + AddWaitEventToSet(shard->wes_read, WL_LATCH_SET, PGINVALID_SOCKET, + MyLatch, NULL); + AddWaitEventToSet(shard->wes_read, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, + NULL, NULL); + AddWaitEventToSet(shard->wes_read, WL_SOCKET_READABLE, PQsocket(shard->conn), NULL, NULL); + + shard->wes_write = CreateWaitEventSet(TopMemoryContext, 3); + AddWaitEventToSet(shard->wes_write, WL_LATCH_SET, PGINVALID_SOCKET, + MyLatch, NULL); + AddWaitEventToSet(shard->wes_write, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, + NULL, NULL); + AddWaitEventToSet(shard->wes_write, WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE, + PQsocket(shard->conn), + NULL, NULL); + + shard->state = PS_Connecting_Startup; + /* fallthrough */ } - keywords[n] = "dbname"; - values[n] = connstr; - n++; - keywords[n] = NULL; - values[n] = NULL; - n++; - conn = PQconnectdbParams(keywords, values, 1); - last_connect_time = GetCurrentTimestamp(); - - if (PQstatus(conn) == CONNECTION_BAD) + case PS_Connecting_Startup: { - char *msg = pchomp(PQerrorMessage(conn)); + char *pagestream_query; + int ps_send_query_ret; + bool connected = false; - PQfinish(conn); + neon_shard_log(shard_no, DEBUG5, "Connection state: Connecting_Startup"); - ereport(elevel, - (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), - errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no), - errdetail_internal("%s", msg))); - pfree(msg); - return false; - } - switch (neon_protocol_version) - { + do + { + WaitEvent event; + int poll_result = PQconnectPoll(shard->conn); + + switch (poll_result) + { + default: /* unknown/unused states are handled as a failed connection */ + case PGRES_POLLING_FAILED: + { + char *pqerr = PQerrorMessage(shard->conn); + char *msg = NULL; + neon_shard_log(shard_no, DEBUG5, "POLLING_FAILED"); + + if (pqerr) + msg = pchomp(pqerr); + + CLEANUP_AND_DISCONNECT(shard); + + if (msg) + { + neon_shard_log(shard_no, elevel, + "could not connect to pageserver: %s", + msg); + pfree(msg); + } + else + neon_shard_log(shard_no, elevel, + "could not connect to pageserver"); + + return false; + } + case PGRES_POLLING_READING: + /* Sleep until there's something to do */ + (void) WaitEventSetWait(shard->wes_read, -1L, &event, 1, + PG_WAIT_EXTENSION); + ResetLatch(MyLatch); + + /* query cancellation, backend shutdown */ + CHECK_FOR_INTERRUPTS(); + + /* PQconnectPoll() handles the socket polling state updates */ + + break; + case PGRES_POLLING_WRITING: + /* Sleep until there's something to do */ + (void) WaitEventSetWait(shard->wes_write, -1L, &event, 1, + PG_WAIT_EXTENSION); + ResetLatch(MyLatch); + + /* query cancellation, backend shutdown */ + CHECK_FOR_INTERRUPTS(); + + /* PQconnectPoll() handles the socket polling state updates */ + + break; + case PGRES_POLLING_OK: + neon_shard_log(shard_no, DEBUG5, "POLLING_OK"); + connected = true; + break; + } + } + while (!connected); + + /* No more polling needed; connection succeeded */ + shard->last_connect_time = GetCurrentTimestamp(); + + switch (neon_protocol_version) + { case 2: - 
query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline); + pagestream_query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline); break; case 1: - query = psprintf("pagestream %s %s", neon_tenant, neon_timeline); + pagestream_query = psprintf("pagestream %s %s", neon_tenant, neon_timeline); break; default: elog(ERROR, "unexpected neon_protocol_version %d", neon_protocol_version); - } - ret = PQsendQuery(conn, query); - pfree(query); - if (ret != 1) - { - PQfinish(conn); - neon_shard_log(shard_no, elevel, "could not send pagestream command to pageserver"); - return false; - } + } - wes = CreateWaitEventSet(TopMemoryContext, 3); - AddWaitEventToSet(wes, WL_LATCH_SET, PGINVALID_SOCKET, - MyLatch, NULL); - AddWaitEventToSet(wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, - NULL, NULL); - AddWaitEventToSet(wes, WL_SOCKET_READABLE, PQsocket(conn), NULL, NULL); + if (PQstatus(shard->conn) == CONNECTION_BAD) + { + char *msg = pchomp(PQerrorMessage(shard->conn)); - PG_TRY(); + CLEANUP_AND_DISCONNECT(shard); + + ereport(elevel, + (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), + errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no), + errdetail_internal("%s", msg))); + pfree(msg); + return false; + } + + ps_send_query_ret = PQsendQuery(shard->conn, pagestream_query); + pfree(pagestream_query); + if (ps_send_query_ret != 1) + { + CLEANUP_AND_DISCONNECT(shard); + + neon_shard_log(shard_no, elevel, "could not send pagestream command to pageserver"); + return false; + } + + shard->state = PS_Connecting_PageStream; + /* fallthrough */ + } + case PS_Connecting_PageStream: { - while (PQisBusy(conn)) + neon_shard_log(shard_no, DEBUG5, "Connection state: Connecting_PageStream"); + + if (PQstatus(shard->conn) == CONNECTION_BAD) + { + char *msg = pchomp(PQerrorMessage(shard->conn)); + CLEANUP_AND_DISCONNECT(shard); + ereport(elevel, + (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), + errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no), + errdetail_internal("%s", msg))); + pfree(msg); + return false; + } + + while (PQisBusy(shard->conn)) { WaitEvent event; /* Sleep until there's something to do */ - (void) WaitEventSetWait(wes, -1L, &event, 1, PG_WAIT_EXTENSION); + (void) WaitEventSetWait(shard->wes_read, -1L, &event, 1, PG_WAIT_EXTENSION); ResetLatch(MyLatch); CHECK_FOR_INTERRUPTS(); @@ -423,40 +598,37 @@ pageserver_connect(shardno_t shard_no, int elevel) /* Data available in socket? */ if (event.events & WL_SOCKET_READABLE) { - if (!PQconsumeInput(conn)) + if (!PQconsumeInput(shard->conn)) { - char *msg = pchomp(PQerrorMessage(conn)); - - PQfinish(conn); - FreeWaitEventSet(wes); + char *msg = pchomp(PQerrorMessage(shard->conn)); + CLEANUP_AND_DISCONNECT(shard); neon_shard_log(shard_no, elevel, "could not complete handshake with pageserver: %s", msg); - /* Returning from inside PG_TRY is bad, so we break/return later */ - broke_from_loop = true; - break; + pfree(msg); + return false; } } } - } - PG_CATCH(); - { - PQfinish(conn); - FreeWaitEventSet(wes); - PG_RE_THROW(); - } - PG_END_TRY(); - if (broke_from_loop) - { - return false; + shard->state = PS_Connected; + /* fallthrough */ } + case PS_Connected: + /* + * We successfully connected. Future connections to this PageServer + * will do fast retries again, with exponential backoff. 
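+ *
+ * Resetting delay_us restores the fast retry path: after a later
+ * disconnect, the first reconnect attempt waits only
+ * MIN_RECONNECT_INTERVAL_USEC, and the delay then doubles per failed
+ * attempt, capped at MAX_RECONNECT_INTERVAL_USEC (see the
+ * PS_Disconnected case above).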
+ */ + shard->delay_us = MIN_RECONNECT_INTERVAL_USEC; - neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s' with protocol version %d", connstr, neon_protocol_version); - page_servers[shard_no].conn = conn; - page_servers[shard_no].wes = wes; - - return true; + neon_shard_log(shard_no, DEBUG5, "Connection state: Connected"); + neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s' with protocol version %d", connstr, neon_protocol_version); + return true; + default: + neon_shard_log(shard_no, ERROR, "libpagestore: invalid connection state %d", shard->state); + } + /* This shouldn't be hit */ + Assert(false); } /* @@ -476,7 +648,7 @@ retry: WaitEvent event; /* Sleep until there's something to do */ - (void) WaitEventSetWait(page_servers[shard_no].wes, -1L, &event, 1, PG_WAIT_EXTENSION); + (void) WaitEventSetWait(page_servers[shard_no].wes_read, -1L, &event, 1, PG_WAIT_EXTENSION); ResetLatch(MyLatch); CHECK_FOR_INTERRUPTS(); @@ -502,7 +674,8 @@ retry: /* * Reset prefetch and drop connection to the shard. - * It also drops connection to all other shards involved in prefetch. + * It also drops connection to all other shards involved in prefetch, through + * prefetch_on_ps_disconnect(). */ static void pageserver_disconnect(shardno_t shard_no) @@ -512,9 +685,6 @@ pageserver_disconnect(shardno_t shard_no) * whole prefetch queue, even for other pageservers. It should not * cause big problems, because connection loss is supposed to be a * rare event. - * - * Prefetch state should be reset even if page_servers[shard_no].conn == NULL, - * because prefetch request may be registered before connection is established. */ prefetch_on_ps_disconnect(); @@ -527,37 +697,36 @@ pageserver_disconnect(shardno_t shard_no) static void pageserver_disconnect_shard(shardno_t shard_no) { + PageServer *shard = &page_servers[shard_no]; /* * If anything goes wrong while we were sending a request, it's not clear * what state the connection is in. For example, if we sent the request * but didn't receive a response yet, we might receive the response some * time later after we have already sent a new unrelated request. Close * the connection to avoid getting confused. + * Similarly, even when we're in PS_DISCONNECTED, we may have junk to + * clean up: It is possible that we encountered an error allocating any + * of the wait event sets or the psql connection, or failed when we tried + * to attach wait events to the WaitEventSets. 
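+ *
+ * CLEANUP_AND_DISCONNECT() tolerates all of those partial states: it
+ * frees whichever of conn, wes_read and wes_write were actually
+ * allocated and leaves the shard in PS_Disconnected.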
*/ - if (page_servers[shard_no].conn) - { - neon_shard_log(shard_no, LOG, "dropping connection to page server due to error"); - PQfinish(page_servers[shard_no].conn); - page_servers[shard_no].conn = NULL; - } - if (page_servers[shard_no].wes != NULL) - { - FreeWaitEventSet(page_servers[shard_no].wes); - page_servers[shard_no].wes = NULL; - } + CLEANUP_AND_DISCONNECT(shard); + + shard->state = PS_Disconnected; } static bool pageserver_send(shardno_t shard_no, NeonRequest *request) { StringInfoData req_buff; - PGconn *pageserver_conn = page_servers[shard_no].conn; + PageServer *shard = &page_servers[shard_no]; + PGconn *pageserver_conn; /* If the connection was lost for some reason, reconnect */ - if (pageserver_conn && PQstatus(pageserver_conn) == CONNECTION_BAD) + if (shard->state == PS_Connected && PQstatus(shard->conn) == CONNECTION_BAD) { neon_shard_log(shard_no, LOG, "pageserver_send disconnect bad connection"); pageserver_disconnect(shard_no); + pageserver_conn = NULL; } req_buff = nm_pack_request(request); @@ -571,17 +740,19 @@ pageserver_send(shardno_t shard_no, NeonRequest *request) * https://github.com/neondatabase/neon/issues/1138 So try to reestablish * connection in case of failure. */ - if (!page_servers[shard_no].conn) + if (shard->state != PS_Connected) { - while (!pageserver_connect(shard_no, n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR)) + while (!pageserver_connect(shard_no, shard->n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR)) { HandleMainLoopInterrupts(); - n_reconnect_attempts += 1; + shard->n_reconnect_attempts += 1; } - n_reconnect_attempts = 0; + shard->n_reconnect_attempts = 0; + } else { + Assert(shard->conn != NULL); } - pageserver_conn = page_servers[shard_no].conn; + pageserver_conn = shard->conn; /* * Send request. @@ -590,13 +761,17 @@ pageserver_send(shardno_t shard_no, NeonRequest *request) * should use async mode and check for interrupts while waiting. In * practice, our requests are small enough to always fit in the output and * TCP buffer. + * + * Note that this also will fail when the connection is in the + * PGRES_POLLING_WRITING state. It's kinda dirty to disconnect at this + * point, but on the grand scheme of things it's only a small issue. 
*/ if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0) { char *msg = pchomp(PQerrorMessage(pageserver_conn)); pageserver_disconnect(shard_no); - neon_shard_log(shard_no, LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg); + neon_shard_log(shard_no, LOG, "pageserver_send disconnected: failed to send page request (try to reconnect): %s", msg); pfree(msg); pfree(req_buff.data); return false; @@ -611,6 +786,7 @@ pageserver_send(shardno_t shard_no, NeonRequest *request) neon_shard_log(shard_no, PageStoreTrace, "sent request: %s", msg); pfree(msg); } + return true; } @@ -619,58 +795,68 @@ pageserver_receive(shardno_t shard_no) { StringInfoData resp_buff; NeonResponse *resp; - PGconn *pageserver_conn = page_servers[shard_no].conn; + PageServer *shard = &page_servers[shard_no]; + PGconn *pageserver_conn = shard->conn; + /* read response */ + int rc; - if (!pageserver_conn) - return NULL; - - PG_TRY(); + if (shard->state != PS_Connected) { - /* read response */ - int rc; + neon_shard_log(shard_no, LOG, + "pageserver_receive: returning NULL for non-connected pageserver connection: 0x%02x", + shard->state); + return NULL; + } - rc = call_PQgetCopyData(shard_no, &resp_buff.data); - if (rc >= 0) + Assert(pageserver_conn); + + rc = call_PQgetCopyData(shard_no, &resp_buff.data); + if (rc >= 0) + { + /* call_PQgetCopyData handles rc == 0 */ + Assert(rc > 0); + + PG_TRY(); { resp_buff.len = rc; resp_buff.cursor = 0; resp = nm_unpack_response(&resp_buff); PQfreemem(resp_buff.data); - - if (message_level_is_interesting(PageStoreTrace)) - { - char *msg = nm_to_string((NeonMessage *) resp); - - neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg); - pfree(msg); - } } - else if (rc == -1) + PG_CATCH(); { - neon_shard_log(shard_no, LOG, "pageserver_receive disconnect because call_PQgetCopyData returns -1: %s", pchomp(PQerrorMessage(pageserver_conn))); + neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due malformatted response"); pageserver_disconnect(shard_no); - resp = NULL; + PG_RE_THROW(); } - else if (rc == -2) - { - char *msg = pchomp(PQerrorMessage(pageserver_conn)); + PG_END_TRY(); - pageserver_disconnect(shard_no); - neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg); - } - else + if (message_level_is_interesting(PageStoreTrace)) { - pageserver_disconnect(shard_no); - neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because unexpected PQgetCopyData return value: %d", rc); + char *msg = nm_to_string((NeonMessage *) resp); + + neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg); + pfree(msg); } } - PG_CATCH(); + else if (rc == -1) { - neon_shard_log(shard_no, LOG, "pageserver_receive disconnect due to caught exception"); + neon_shard_log(shard_no, LOG, "pageserver_receive disconnect: psql end of copy data: %s", pchomp(PQerrorMessage(pageserver_conn))); pageserver_disconnect(shard_no); - PG_RE_THROW(); + resp = NULL; + } + else if (rc == -2) + { + char *msg = pchomp(PQerrorMessage(pageserver_conn)); + + pageserver_disconnect(shard_no); + neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: could not read COPY data: %s", msg); + } + else + { + pageserver_disconnect(shard_no); + neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: unexpected PQgetCopyData return value: %d", rc); } - PG_END_TRY(); return (NeonResponse *) resp; } @@ -681,7 +867,7 @@ pageserver_flush(shardno_t shard_no) { PGconn *pageserver_conn = 
page_servers[shard_no].conn; - if (!pageserver_conn) + if (page_servers[shard_no].state != PS_Connected) { neon_shard_log(shard_no, WARNING, "Tried to flush while disconnected"); } @@ -697,6 +883,7 @@ pageserver_flush(shardno_t shard_no) return false; } } + return true; } @@ -891,5 +1078,7 @@ pg_init_libpagestore(void) dbsize_hook = neon_dbsize; } + memset(page_servers, 0, sizeof(page_servers)); + lfc_init(); } diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 41546eae85..ac505fe6fb 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -94,6 +94,10 @@ static char *hexdump_page(char *page); const int SmgrTrace = DEBUG5; +#define NEON_PANIC_CONNECTION_STATE(shard_no, elvl, message, ...) \ + neon_shard_log(shard_no, elvl, "Broken connection state: " message, \ + ##__VA_ARGS__) + page_server_api *page_server; /* unlogged relation build states */ @@ -526,6 +530,8 @@ prefetch_flush_requests(void) * * NOTE: this function may indirectly update MyPState->pfs_hash; which * invalidates any active pointers into the hash table. + * NOTE: callers should make sure they can handle query cancellations in this + * function's call path. */ static bool prefetch_wait_for(uint64 ring_index) @@ -561,6 +567,8 @@ prefetch_wait_for(uint64 ring_index) * * NOTE: this function may indirectly update MyPState->pfs_hash; which * invalidates any active pointers into the hash table. + * + * NOTE: this does IO, and can get canceled out-of-line. */ static bool prefetch_read(PrefetchRequest *slot) @@ -572,6 +580,14 @@ prefetch_read(PrefetchRequest *slot) Assert(slot->response == NULL); Assert(slot->my_ring_index == MyPState->ring_receive); + if (slot->status != PRFS_REQUESTED || + slot->response != NULL || + slot->my_ring_index != MyPState->ring_receive) + neon_shard_log(slot->shard_no, ERROR, + "Incorrect prefetch read: status=%d response=%llx my=%llu receive=%llu", + slot->status, (size_t) (void *) slot->response, + slot->my_ring_index, MyPState->ring_receive); + old = MemoryContextSwitchTo(MyPState->errctx); response = (NeonResponse *) page_server->receive(slot->shard_no); MemoryContextSwitchTo(old); @@ -589,6 +605,11 @@ prefetch_read(PrefetchRequest *slot) } else { + neon_shard_log(slot->shard_no, WARNING, + "No response from reading prefetch entry %llu: %u/%u/%u.%u block %u. 
This can be caused by a concurrent disconnect", + slot->my_ring_index, + RelFileInfoFmt(BufTagGetNRelFileInfo(slot->buftag)), + slot->buftag.forkNum, slot->buftag.blockNum); return false; } } @@ -603,6 +624,7 @@ void prefetch_on_ps_disconnect(void) { MyPState->ring_flush = MyPState->ring_unused; + while (MyPState->ring_receive < MyPState->ring_unused) { PrefetchRequest *slot; @@ -625,6 +647,7 @@ prefetch_on_ps_disconnect(void) slot->status = PRFS_TAG_REMAINS; MyPState->n_requests_inflight -= 1; MyPState->ring_receive += 1; + prefetch_set_unused(ring_index); } } @@ -691,6 +714,8 @@ static void prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns) { bool found; + uint64 mySlotNo = slot->my_ring_index; + NeonGetPageRequest request = { .req.tag = T_NeonGetPageRequest, /* lsn and not_modified_since are filled in below */ @@ -699,6 +724,8 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns .blkno = slot->buftag.blockNum, }; + Assert(mySlotNo == MyPState->ring_unused); + if (force_request_lsns) slot->request_lsns = *force_request_lsns; else @@ -711,7 +738,11 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns Assert(slot->response == NULL); Assert(slot->my_ring_index == MyPState->ring_unused); - while (!page_server->send(slot->shard_no, (NeonRequest *) &request)); + while (!page_server->send(slot->shard_no, (NeonRequest *) &request)) + { + Assert(mySlotNo == MyPState->ring_unused); + /* loop */ + } /* update prefetch state */ MyPState->n_requests_inflight += 1; @@ -722,7 +753,6 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns /* update slot state */ slot->status = PRFS_REQUESTED; - prfh_insert(MyPState->prf_hash, slot, &found); Assert(!found); } @@ -894,6 +924,10 @@ Retry: return ring_index; } +/* + * Note: this function can get canceled and use a long jump to the next catch + * context. Take care. + */ static NeonResponse * page_server_request(void const *req) { @@ -925,19 +959,38 @@ page_server_request(void const *req) * Current sharding model assumes that all metadata is present only at shard 0. * We still need to call get_shard_no() to check if shard map is up-to-date. */ - if (((NeonRequest *) req)->tag != T_NeonGetPageRequest || ((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM) + if (((NeonRequest *) req)->tag != T_NeonGetPageRequest || + ((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM) { shard_no = 0; } do { - while (!page_server->send(shard_no, (NeonRequest *) req) || !page_server->flush(shard_no)); - consume_prefetch_responses(); - resp = page_server->receive(shard_no); - } while (resp == NULL); - return resp; + PG_TRY(); + { + while (!page_server->send(shard_no, (NeonRequest *) req) + || !page_server->flush(shard_no)) + { + /* do nothing */ + } + consume_prefetch_responses(); + resp = page_server->receive(shard_no); + } + PG_CATCH(); + { + /* + * Cancellation in this code needs to be handled better at some + * point, but this currently seems fine for now. 
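+ *
+ * Dropping the connection here keeps the request/response stream in
+ * sync: if the query was canceled after the request was sent but
+ * before the response arrived, reusing the connection could pair
+ * that stale response with a later, unrelated request.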
+ */ + page_server->disconnect(shard_no); + PG_RE_THROW(); + } + PG_END_TRY(); + } while (resp == NULL); + + return resp; } @@ -1905,7 +1958,9 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) break; default: - neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_exists", resp->tag); + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Expected Exists (0x%02x) or Error (0x%02x) response to ExistsRequest, but got 0x%02x", + T_NeonExistsResponse, T_NeonErrorResponse, resp->tag); } pfree(resp); return exists; @@ -2357,7 +2412,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, /* * Try to find prefetched page in the list of received pages. */ - Retry: +Retry: entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag); if (entry != NULL) @@ -2443,7 +2498,9 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, ((NeonErrorResponse *) resp)->message))); break; default: - neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_read_at_lsn", resp->tag); + NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC, + "Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x", + T_NeonGetPageResponse, T_NeonErrorResponse, resp->tag); } /* buffer was used, clean up for later reuse */ @@ -2714,7 +2771,9 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) break; default: - neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_nblocks", resp->tag); + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Expected Nblocks (0x%02x) or Error (0x%02x) response to NblocksRequest, but got 0x%02x", + T_NeonNblocksResponse, T_NeonErrorResponse, resp->tag); } update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks); @@ -2767,7 +2826,9 @@ neon_dbsize(Oid dbNode) break; default: - neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_dbsize", resp->tag); + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Expected DbSize (0x%02x) or Error (0x%02x) response to DbSizeRequest, but got 0x%02x", + T_NeonDbSizeResponse, T_NeonErrorResponse, resp->tag); } neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes", @@ -3106,7 +3167,9 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf break; default: - neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_read_slru_segment", resp->tag); + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Expected GetSlruSegment (0x%02x) or Error (0x%02x) response to GetSlruSegmentRequest, but got 0x%02x", + T_NeonGetSlruSegmentResponse, T_NeonErrorResponse, resp->tag); } pfree(resp); diff --git a/test_runner/regress/test_pg_query_cancellation.py b/test_runner/regress/test_pg_query_cancellation.py new file mode 100644 index 0000000000..bad2e5865e --- /dev/null +++ b/test_runner/regress/test_pg_query_cancellation.py @@ -0,0 +1,282 @@ +from contextlib import closing +from typing import Set + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonPageserver +from fixtures.pageserver.http import PageserverHttpClient +from psycopg2.errors import QueryCanceled + +CRITICAL_PG_PS_WAIT_FAILPOINTS: Set[str] = { + "ps::connection-start::pre-login", + "ps::connection-start::startup-packet", + "ps::connection-start::process-query", + "ps::handle-pagerequest-message::exists", + "ps::handle-pagerequest-message::nblocks", + "ps::handle-pagerequest-message::getpage", + "ps::handle-pagerequest-message::dbsize", + # We don't yet 
have a good way to on-demand guarantee the download of an + # SLRU segment, so that's disabled for now. + # "ps::handle-pagerequest-message::slrusegment", +} + +PG_PS_START_FAILPOINTS = { + "ps::connection-start::pre-login", + "ps::connection-start::startup-packet", + "ps::connection-start::process-query", +} +SMGR_EXISTS = "ps::handle-pagerequest-message::exists" +SMGR_NBLOCKS = "ps::handle-pagerequest-message::nblocks" +SMGR_GETPAGE = "ps::handle-pagerequest-message::getpage" +SMGR_DBSIZE = "ps::handle-pagerequest-message::dbsize" + +""" +Test that we can handle connection delays and cancellations at various +unfortunate connection startup and request states. +""" + + +def test_cancellations(neon_simple_env: NeonEnv): + env = neon_simple_env + ps = env.pageserver + ps_http = ps.http_client() + ps_http.is_testing_enabled_or_skip() + + env.neon_cli.create_branch("test_config", "empty") + + # We don't want to have any racy behaviour with autovacuum IOs + ep = env.endpoints.create_start( + "test_config", + config_lines=[ + "autovacuum = off", + "shared_buffers = 128MB", + ], + ) + + with closing(ep.connect()) as conn: + with conn.cursor() as cur: + cur.execute( + """ + CREATE TABLE test1 AS + SELECT id, sha256(id::text::bytea) payload + FROM generate_series(1, 1024::bigint) p(id); + """ + ) + cur.execute( + """ + CREATE TABLE test2 AS + SELECT id, sha256(id::text::bytea) payload + FROM generate_series(1025, 2048::bigint) p(id); + """ + ) + cur.execute( + """ + VACUUM (ANALYZE, FREEZE) test1, test2; + """ + ) + cur.execute( + """ + CREATE EXTENSION pg_buffercache; + """ + ) + cur.execute( + """ + CREATE EXTENSION pg_prewarm; + """ + ) + + # data preparation is now complete, with 2 disjoint tables that aren't + # preloaded into any caches. + + ep.stop() + + for failpoint in CRITICAL_PG_PS_WAIT_FAILPOINTS: + connect_works_correctly(failpoint, ep, ps, ps_http) + + +ENABLED_FAILPOINTS: Set[str] = set() + + +def connect_works_correctly( + failpoint: str, ep: Endpoint, ps: NeonPageserver, ps_http: PageserverHttpClient +): + log.debug("Starting work on %s", failpoint) + # All queries we use should finish (incl. IO) within 500ms, + # including all their IO. + # This allows us to use `SET statement_timeout` to let the query + # timeout system cancel queries, rather than us having to go + # through the most annoying effort of manual query cancellation + # in psycopg2. 
+ options = "-cstatement_timeout=500ms -ceffective_io_concurrency=1" + + ep.start() + + def fp_enable(): + global ENABLED_FAILPOINTS + ps_http.configure_failpoints( + [ + (failpoint, "pause"), + ] + ) + ENABLED_FAILPOINTS = ENABLED_FAILPOINTS | {failpoint} + log.info( + 'Enabled failpoint "%s", current_active=%s', failpoint, ENABLED_FAILPOINTS, stacklevel=2 + ) + + def fp_disable(): + global ENABLED_FAILPOINTS + ps_http.configure_failpoints( + [ + (failpoint, "off"), + ] + ) + ENABLED_FAILPOINTS = ENABLED_FAILPOINTS - {failpoint} + log.info( + 'Disabled failpoint "%s", current_active=%s', + failpoint, + ENABLED_FAILPOINTS, + stacklevel=2, + ) + + def check_buffers(cur): + cur.execute( + """ + SELECT n.nspname AS nspname + , c.relname AS relname + , count(*) AS count + FROM pg_buffercache b + JOIN pg_class c + ON b.relfilenode = pg_relation_filenode(c.oid) AND + b.reldatabase = (SELECT oid FROM pg_database WHERE datname = current_database()) + JOIN pg_namespace n ON n.oid = c.relnamespace + WHERE c.oid IN ('test1'::regclass::oid, 'test2'::regclass::oid) + GROUP BY n.nspname, c.relname + ORDER BY 3 DESC + LIMIT 10 + """ + ) + return cur.fetchone() + + def exec_may_cancel(query, cursor, result, cancels): + if cancels: + with pytest.raises(QueryCanceled): + cursor.execute(query) + assert cursor.fetchone() == result + else: + cursor.execute(query) + assert cursor.fetchone() == result + + fp_disable() + + # Warm caches required for new connections, so that they can run without + # requiring catalog reads. + with closing(ep.connect()) as conn: + with conn.cursor() as cur: + cur.execute( + """ + SELECT 1; + """ + ) + assert cur.fetchone() == (1,) + + assert check_buffers(cur) is None + # Ensure all caches required for connection start are correctly + # filled, so that we don't have any "accidents" in this test run + # caused by changes in connection startup plans that require + # requests to the PageServer. + cur.execute( + """ + select array_agg(distinct (pg_prewarm(c.oid::regclass, 'buffer') >= 0)) + from pg_class c + where c.oid < 16384 AND c.relkind IN ('i', 'r'); + """ + ) + assert cur.fetchone() == ([True],) + + # Enable failpoint + fp_enable() + + with closing(ep.connect(options=options, autocommit=True)) as conn: + with conn.cursor() as cur: + cur.execute("SHOW statement_timeout;") + assert cur.fetchone() == ("500ms",) + assert check_buffers(cur) is None + exec_may_cancel( + """ + SELECT min(id) FROM test1; + """, + cur, + (1,), + failpoint in (CRITICAL_PG_PS_WAIT_FAILPOINTS - {SMGR_EXISTS, SMGR_DBSIZE}), + ) + + fp_disable() + + with closing(ep.connect(options=options, autocommit=True)) as conn: + with conn.cursor() as cur: + # Do a select on the data, putting some buffers into the prefetch + # queue. 
+ cur.execute( + """ + SELECT count(id) FROM (select * from test1 LIMIT 256) a; + """ + ) + assert cur.fetchone() == (256,) + + ps.stop() + ps.start() + fp_enable() + + exec_may_cancel( + """ + SELECT COUNT(id) FROM test1; + """, + cur, + (1024,), + failpoint + in (CRITICAL_PG_PS_WAIT_FAILPOINTS - {SMGR_EXISTS, SMGR_NBLOCKS, SMGR_DBSIZE}), + ) + + with closing(ep.connect(options=options, autocommit=True)) as conn: + with conn.cursor() as cur: + exec_may_cancel( + """ + SELECT COUNT(id) FROM test2; + """, + cur, + (1024,), + failpoint in (CRITICAL_PG_PS_WAIT_FAILPOINTS - {SMGR_EXISTS, SMGR_DBSIZE}), + ) + + fp_disable() + fp_enable() + + exec_may_cancel( + """ + SELECT 0 < pg_database_size(CURRENT_DATABASE()); + """, + cur, + (True,), + failpoint + in (CRITICAL_PG_PS_WAIT_FAILPOINTS - {SMGR_EXISTS, SMGR_GETPAGE, SMGR_NBLOCKS}), + ) + + fp_disable() + + cur.execute( + """ + SELECT count(id), count(distinct payload), min(id), max(id), sum(id) FROM test2; + """ + ) + + assert cur.fetchone() == (1024, 1024, 1025, 2048, 1573376) + + cur.execute( + """ + SELECT count(id), count(distinct payload), min(id), max(id), sum(id) FROM test1; + """ + ) + + assert cur.fetchone() == (1024, 1024, 1, 1024, 524800) + + ep.stop() From c1f4028fc0e76b0945d7eaaed5a460412ea7980e Mon Sep 17 00:00:00 2001 From: Roman Zaynetdinov Date: Fri, 24 May 2024 11:05:20 +0300 Subject: [PATCH 0871/1571] Export db size metrics for 10 user databases (#7857) ## Problem One database is too limiting. We have agreed to raise this limit to 10. ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --- vm-image-spec.yaml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 484a86fc21..73a24c42d6 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -194,7 +194,7 @@ files: - metric_name: pg_stats_userdb type: gauge - help: 'Stats for the oldest non-system db' + help: 'Stats for several oldest non-system dbs' key_labels: - datname value_label: kind @@ -205,9 +205,8 @@ files: - inserted - updated - deleted - # We export stats for only one non-system database. Without this limit + # We export stats for 10 non-system database. Without this limit # it is too easy to abuse the system by creating lots of databases. - # We can try lifting this limit in the future after we understand the needs better. query: | select pg_database_size(datname) as db_size, deadlocks, tup_inserted as inserted, tup_updated as updated, tup_deleted as deleted, @@ -218,7 +217,7 @@ files: from pg_database where datname <> 'postgres' and not datistemplate order by oid - limit 1 + limit 10 ); - metric_name: max_cluster_size From 3860bc9c6c74ca4f84ff2f12d6c3521b3df99df2 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 24 May 2024 09:33:19 +0100 Subject: [PATCH 0872/1571] pageserver: post-shard-split layer rewrites (2/2) (#7531) ## Problem - After a shard split of a large existing tenant, child tenants can end up with oversized historic layers indefinitely, if those layers are prevented from being GC'd by branchpoints. 
This PR follows https://github.com/neondatabase/neon/pull/7531, and adds rewriting of layers that contain a mixture of needed & un-needed contents, in addition to dropping un-needed layers. Closes: https://github.com/neondatabase/neon/issues/7504 ## Summary of changes - Add methods to ImageLayer for reading back existing layers - Extend `compact_shard_ancestors` to rewrite layer files that contain a mixture of keys that we want and keys we do not, if unwanted keys are the majority of those in the file. - Amend initialization code to handle multiple layers with the same LayerName properly - Get rid of of renaming bad layer files to `.old` since that's now expected on restarts during rewrites. --- .../src/tenant/storage_layer/image_layer.rs | 200 +++++++++++++++++- pageserver/src/tenant/storage_layer/layer.rs | 32 ++- pageserver/src/tenant/timeline.rs | 84 +++----- pageserver/src/tenant/timeline/compaction.rs | 101 +++++++-- pageserver/src/tenant/timeline/init.rs | 183 ++++++++-------- .../src/tenant/timeline/layer_manager.rs | 27 ++- test_runner/fixtures/neon_fixtures.py | 4 +- test_runner/regress/test_sharding.py | 104 ++++++++- 8 files changed, 545 insertions(+), 190 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 67b489ce0d..8394b33f19 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -47,7 +47,7 @@ use hex; use itertools::Itertools; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::LayerAccessKind; -use pageserver_api::shard::TenantShardId; +use pageserver_api::shard::{ShardIdentity, TenantShardId}; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; use std::fs::File; @@ -473,7 +473,7 @@ impl ImageLayerInner { ctx: &RequestContext, ) -> Result<(), GetVectoredError> { let reads = self - .plan_reads(keyspace, ctx) + .plan_reads(keyspace, None, ctx) .await .map_err(GetVectoredError::Other)?; @@ -485,9 +485,15 @@ impl ImageLayerInner { Ok(()) } + /// Traverse the layer's index to build read operations on the overlap of the input keyspace + /// and the keys in this layer. + /// + /// If shard_identity is provided, it will be used to filter keys down to those stored on + /// this shard. async fn plan_reads( &self, keyspace: KeySpace, + shard_identity: Option<&ShardIdentity>, ctx: &RequestContext, ) -> anyhow::Result> { let mut planner = VectoredReadPlanner::new( @@ -507,7 +513,6 @@ impl ImageLayerInner { for range in keyspace.ranges.iter() { let mut range_end_handled = false; - let mut search_key: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; range.start.write_to_byte_slice(&mut search_key); @@ -520,12 +525,22 @@ impl ImageLayerInner { let key = Key::from_slice(&raw_key[..KEY_SIZE]); assert!(key >= range.start); + let flag = if let Some(shard_identity) = shard_identity { + if shard_identity.is_key_disposable(&key) { + BlobFlag::Ignore + } else { + BlobFlag::None + } + } else { + BlobFlag::None + }; + if key >= range.end { planner.handle_range_end(offset); range_end_handled = true; break; } else { - planner.handle(key, self.lsn, offset, BlobFlag::None); + planner.handle(key, self.lsn, offset, flag); } } @@ -538,6 +553,50 @@ impl ImageLayerInner { Ok(planner.finish()) } + /// Given a key range, select the parts of that range that should be retained by the ShardIdentity, + /// then execute vectored GET operations, passing the results of all read keys into the writer. 
+ pub(super) async fn filter( + &self, + shard_identity: &ShardIdentity, + writer: &mut ImageLayerWriter, + ctx: &RequestContext, + ) -> anyhow::Result { + // Fragment the range into the regions owned by this ShardIdentity + let plan = self + .plan_reads( + KeySpace { + // If asked for the total key space, plan_reads will give us all the keys in the layer + ranges: vec![Key::MIN..Key::MAX], + }, + Some(shard_identity), + ctx, + ) + .await?; + + let vectored_blob_reader = VectoredBlobReader::new(&self.file); + let mut key_count = 0; + for read in plan.into_iter() { + let buf_size = read.size(); + + let buf = BytesMut::with_capacity(buf_size); + let blobs_buf = vectored_blob_reader.read_blobs(&read, buf, ctx).await?; + + let frozen_buf = blobs_buf.buf.freeze(); + + for meta in blobs_buf.blobs.iter() { + let img_buf = frozen_buf.slice(meta.start..meta.end); + + key_count += 1; + writer + .put_image(meta.meta.key, img_buf, ctx) + .await + .context(format!("Storing key {}", meta.meta.key))?; + } + } + + Ok(key_count) + } + async fn do_reads_and_update_state( &self, reads: Vec, @@ -855,3 +914,136 @@ impl Drop for ImageLayerWriter { } } } + +#[cfg(test)] +mod test { + use bytes::Bytes; + use pageserver_api::{ + key::Key, + shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}, + }; + use utils::{id::TimelineId, lsn::Lsn}; + + use crate::{tenant::harness::TenantHarness, DEFAULT_PG_VERSION}; + + use super::ImageLayerWriter; + + #[tokio::test] + async fn image_layer_rewrite() { + let harness = TenantHarness::create("test_image_layer_rewrite").unwrap(); + let (tenant, ctx) = harness.load().await; + + // The LSN at which we will create an image layer to filter + let lsn = Lsn(0xdeadbeef0000); + + let timeline_id = TimelineId::generate(); + let timeline = tenant + .create_test_timeline(timeline_id, lsn, DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + // This key range contains several 0x8000 page stripes, only one of which belongs to shard zero + let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap(); + let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap(); + let range = input_start..input_end; + + // Build an image layer to filter + let resident = { + let mut writer = ImageLayerWriter::new( + harness.conf, + timeline_id, + harness.tenant_shard_id, + &range, + lsn, + &ctx, + ) + .await + .unwrap(); + + let foo_img = Bytes::from_static(&[1, 2, 3, 4]); + let mut key = range.start; + while key < range.end { + writer.put_image(key, foo_img.clone(), &ctx).await.unwrap(); + + key = key.next(); + } + writer.finish(&timeline, &ctx).await.unwrap() + }; + let original_size = resident.metadata().file_size; + + // Filter for various shards: this exercises cases like values at start of key range, end of key + // range, middle of key range. 
+ for shard_number in 0..4 { + let mut filtered_writer = ImageLayerWriter::new( + harness.conf, + timeline_id, + harness.tenant_shard_id, + &range, + lsn, + &ctx, + ) + .await + .unwrap(); + + // TenantHarness gave us an unsharded tenant, but we'll use a sharded ShardIdentity + // to exercise filter() + let shard_identity = ShardIdentity::new( + ShardNumber(shard_number), + ShardCount::new(4), + ShardStripeSize(0x8000), + ) + .unwrap(); + + let wrote_keys = resident + .filter(&shard_identity, &mut filtered_writer, &ctx) + .await + .unwrap(); + let replacement = if wrote_keys > 0 { + Some(filtered_writer.finish(&timeline, &ctx).await.unwrap()) + } else { + None + }; + + // This exact size and those below will need updating as/when the layer encoding changes, but + // should be deterministic for a given version of the format, as we used no randomness generating the input. + assert_eq!(original_size, 1597440); + + match shard_number { + 0 => { + // We should have written out just one stripe for our shard identity + assert_eq!(wrote_keys, 0x8000); + let replacement = replacement.unwrap(); + + // We should have dropped some of the data + assert!(replacement.metadata().file_size < original_size); + assert!(replacement.metadata().file_size > 0); + + // Assert that we dropped ~3/4 of the data. + assert_eq!(replacement.metadata().file_size, 417792); + } + 1 => { + // Shard 1 has no keys in our input range + assert_eq!(wrote_keys, 0x0); + assert!(replacement.is_none()); + } + 2 => { + // Shard 2 has one stripes in the input range + assert_eq!(wrote_keys, 0x8000); + let replacement = replacement.unwrap(); + assert!(replacement.metadata().file_size < original_size); + assert!(replacement.metadata().file_size > 0); + assert_eq!(replacement.metadata().file_size, 417792); + } + 3 => { + // Shard 3 has two stripes in the input range + assert_eq!(wrote_keys, 0x10000); + let replacement = replacement.unwrap(); + assert!(replacement.metadata().file_size < original_size); + assert!(replacement.metadata().file_size > 0); + assert_eq!(replacement.metadata().file_size, 811008); + } + _ => unreachable!(), + } + } + } +} diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index b2f3bdb552..3ac799c69a 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -4,7 +4,7 @@ use pageserver_api::keyspace::KeySpace; use pageserver_api::models::{ HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus, }; -use pageserver_api::shard::{ShardIndex, TenantShardId}; +use pageserver_api::shard::{ShardIdentity, ShardIndex, TenantShardId}; use std::ops::Range; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use std::sync::{Arc, Weak}; @@ -23,10 +23,10 @@ use crate::tenant::timeline::GetVectoredError; use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline}; use super::delta_layer::{self, DeltaEntry}; -use super::image_layer; +use super::image_layer::{self}; use super::{ - AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerName, PersistentLayerDesc, - ValueReconstructResult, ValueReconstructState, ValuesReconstructState, + AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName, + PersistentLayerDesc, ValueReconstructResult, ValueReconstructState, ValuesReconstructState, }; use utils::generation::Generation; @@ -1802,16 +1802,15 @@ impl ResidentLayer { use LayerKind::*; let owner = &self.owner.0; - match self.downloaded.get(owner, 
ctx).await? { Delta(ref d) => { + // this is valid because the DownloadedLayer::kind is a OnceCell, not a + // Mutex, so we cannot go and deinitialize the value with OnceCell::take + // while it's being held. owner .access_stats .record_access(LayerAccessKind::KeyIter, ctx); - // this is valid because the DownloadedLayer::kind is a OnceCell, not a - // Mutex, so we cannot go and deinitialize the value with OnceCell::take - // while it's being held. delta_layer::DeltaLayerInner::load_keys(d, ctx) .await .with_context(|| format!("Layer index is corrupted for {self}")) @@ -1820,6 +1819,23 @@ impl ResidentLayer { } } + /// Read all they keys in this layer which match the ShardIdentity, and write them all to + /// the provided writer. Return the number of keys written. + #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(layer=%self))] + pub(crate) async fn filter<'a>( + &'a self, + shard_identity: &ShardIdentity, + writer: &mut ImageLayerWriter, + ctx: &RequestContext, + ) -> anyhow::Result { + use LayerKind::*; + + match self.downloaded.get(&self.owner.0, ctx).await? { + Delta(_) => anyhow::bail!(format!("cannot filter() on a delta layer {self}")), + Image(i) => i.filter(shard_identity, writer, ctx).await, + } + } + /// Returns the amount of keys and values written to the writer. pub(crate) async fn copy_delta_prefix( &self, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 342fc4fc59..1bdbddd95f 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -41,6 +41,7 @@ use tokio_util::sync::CancellationToken; use tracing::*; use utils::{ bin_ser::BeSer, + fs_ext, sync::gate::{Gate, GateGuard}, vec_map::VecMap, }; @@ -60,6 +61,7 @@ use std::{ ops::ControlFlow, }; +use crate::pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS; use crate::{ aux_file::AuxFileSizeEstimator, tenant::{ @@ -88,9 +90,6 @@ use crate::{ metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize, }; use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind}; -use crate::{ - pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::timeline::init::LocalLayerFileMetadata, -}; use crate::{ pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind}, virtual_file::{MaybeFatalIo, VirtualFile}, @@ -2454,8 +2453,6 @@ impl Timeline { let span = tracing::Span::current(); // Copy to move into the task we're about to spawn - let generation = self.generation; - let shard = self.get_shard_index(); let this = self.myself.upgrade().expect("&self method holds the arc"); let (loaded_layers, needs_cleanup, total_physical_size) = tokio::task::spawn_blocking({ @@ -2469,11 +2466,14 @@ impl Timeline { for discovered in discovered { let (name, kind) = match discovered { - Discovered::Layer(layer_file_name, local_path, file_size) => { - discovered_layers.push((layer_file_name, local_path, file_size)); + Discovered::Layer(layer_file_name, local_metadata) => { + discovered_layers.push((layer_file_name, local_metadata)); continue; } - Discovered::IgnoredBackup => { + Discovered::IgnoredBackup(path) => { + std::fs::remove_file(path) + .or_else(fs_ext::ignore_not_found) + .fatal_err("Removing .old file"); continue; } Discovered::Unknown(file_name) => { @@ -2499,13 +2499,8 @@ impl Timeline { ); } - let decided = init::reconcile( - discovered_layers, - index_part.as_ref(), - disk_consistent_lsn, - generation, - shard, - ); + let decided = + init::reconcile(discovered_layers, index_part.as_ref(), disk_consistent_lsn); let mut 
loaded_layers = Vec::new(); let mut needs_cleanup = Vec::new(); @@ -2513,21 +2508,6 @@ impl Timeline { for (name, decision) in decided { let decision = match decision { - Ok(UseRemote { local, remote }) => { - // Remote is authoritative, but we may still choose to retain - // the local file if the contents appear to match - if local.metadata.file_size == remote.file_size { - // Use the local file, but take the remote metadata so that we pick up - // the correct generation. - UseLocal(LocalLayerFileMetadata { - metadata: remote, - local_path: local.local_path, - }) - } else { - init::cleanup_local_file_for_remote(&local, &remote)?; - UseRemote { local, remote } - } - } Ok(decision) => decision, Err(DismissedLayer::Future { local }) => { if let Some(local) = local { @@ -2545,6 +2525,11 @@ impl Timeline { // this file never existed remotely, we will have to do rework continue; } + Err(DismissedLayer::BadMetadata(local)) => { + init::cleanup_local_file_for_remote(&local)?; + // this file never existed remotely, we will have to do rework + continue; + } }; match &name { @@ -2555,14 +2540,12 @@ impl Timeline { tracing::debug!(layer=%name, ?decision, "applied"); let layer = match decision { - UseLocal(local) => { - total_physical_size += local.metadata.file_size; - Layer::for_resident(conf, &this, local.local_path, name, local.metadata) + Resident { local, remote } => { + total_physical_size += local.file_size; + Layer::for_resident(conf, &this, local.local_path, name, remote) .drop_eviction_guard() } - Evicted(remote) | UseRemote { remote, .. } => { - Layer::for_evicted(conf, &this, name, remote) - } + Evicted(remote) => Layer::for_evicted(conf, &this, name, remote), }; loaded_layers.push(layer); @@ -4725,11 +4708,16 @@ impl Timeline { async fn rewrite_layers( self: &Arc, - replace_layers: Vec<(Layer, ResidentLayer)>, - drop_layers: Vec, + mut replace_layers: Vec<(Layer, ResidentLayer)>, + mut drop_layers: Vec, ) -> anyhow::Result<()> { let mut guard = self.layers.write().await; + // Trim our lists in case our caller (compaction) raced with someone else (GC) removing layers: we want + // to avoid double-removing, and avoid rewriting something that was removed. + replace_layers.retain(|(l, _)| guard.contains(l)); + drop_layers.retain(|l| guard.contains(l)); + guard.rewrite_layers(&replace_layers, &drop_layers, &self.metrics); let upload_layers: Vec<_> = replace_layers.into_iter().map(|r| r.1).collect(); @@ -5604,26 +5592,6 @@ fn is_send() { _assert_send::>(); } -/// Add a suffix to a layer file's name: .{num}.old -/// Uses the first available num (starts at 0) -fn rename_to_backup(path: &Utf8Path) -> anyhow::Result<()> { - let filename = path - .file_name() - .ok_or_else(|| anyhow!("Path {path} don't have a file name"))?; - let mut new_path = path.to_owned(); - - for i in 0u32.. 
{ - new_path.set_file_name(format!("{filename}.{i}.old")); - if !new_path.exists() { - std::fs::rename(path, &new_path) - .with_context(|| format!("rename {path:?} to {new_path:?}"))?; - return Ok(()); - } - } - - bail!("couldn't find an unused backup number for {:?}", path) -} - #[cfg(test)] mod tests { use utils::{id::TimelineId, lsn::Lsn}; diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index db8adfc16c..07a12f535a 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -176,13 +176,24 @@ impl Timeline { async fn compact_shard_ancestors( self: &Arc, rewrite_max: usize, - _ctx: &RequestContext, + ctx: &RequestContext, ) -> anyhow::Result<()> { let mut drop_layers = Vec::new(); - let layers_to_rewrite: Vec = Vec::new(); + let mut layers_to_rewrite: Vec = Vec::new(); - // We will use the PITR cutoff as a condition for rewriting layers. - let pitr_cutoff = self.gc_info.read().unwrap().cutoffs.pitr; + // We will use the Lsn cutoff of the last GC as a threshold for rewriting layers: if a + // layer is behind this Lsn, it indicates that the layer is being retained beyond the + // pitr_interval, for example because a branchpoint references it. + // + // Holding this read guard also blocks [`Self::gc_timeline`] from entering while we + // are rewriting layers. + let latest_gc_cutoff = self.get_latest_gc_cutoff_lsn(); + + tracing::info!( + "latest_gc_cutoff: {}, pitr cutoff {}", + *latest_gc_cutoff, + self.gc_info.read().unwrap().cutoffs.pitr + ); let layers = self.layers.read().await; for layer_desc in layers.layer_map().iter_historic_layers() { @@ -241,9 +252,9 @@ impl Timeline { // Don't bother re-writing a layer if it is within the PITR window: it will age-out eventually // without incurring the I/O cost of a rewrite. - if layer_desc.get_lsn_range().end >= pitr_cutoff { - debug!(%layer, "Skipping rewrite of layer still in PITR window ({} >= {})", - layer_desc.get_lsn_range().end, pitr_cutoff); + if layer_desc.get_lsn_range().end >= *latest_gc_cutoff { + debug!(%layer, "Skipping rewrite of layer still in GC window ({} >= {})", + layer_desc.get_lsn_range().end, *latest_gc_cutoff); continue; } @@ -253,13 +264,10 @@ impl Timeline { continue; } - // Only rewrite layers if they would have different remote paths: either they belong to this - // shard but an old generation, or they belonged to another shard. This also implicitly - // guarantees that the layer is persistent in remote storage (as only remote persistent - // layers are carried across shard splits, any local-only layer would be in the current generation) - if layer.metadata().generation == self.generation - && layer.metadata().shard.shard_count == self.shard_identity.count - { + // Only rewrite layers if their generations differ. This guarantees: + // - that local rewrite is safe, as local layer paths will differ between existing layer and rewritten one + // - that the layer is persistent in remote storage, as we only see old-generation'd layer via loading from remote storage + if layer.metadata().generation == self.generation { debug!(%layer, "Skipping rewrite, is not from old generation"); continue; } @@ -272,18 +280,69 @@ impl Timeline { } // Fall through: all our conditions for doing a rewrite passed. 
- // TODO: implement rewriting - tracing::debug!(%layer, "Would rewrite layer"); + layers_to_rewrite.push(layer); } - // Drop the layers read lock: we will acquire it for write in [`Self::rewrite_layers`] + // Drop read lock on layer map before we start doing time-consuming I/O drop(layers); - // TODO: collect layers to rewrite - let replace_layers = Vec::new(); + let mut replace_image_layers = Vec::new(); + + for layer in layers_to_rewrite { + tracing::info!(layer=%layer, "Rewriting layer after shard split..."); + let mut image_layer_writer = ImageLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + &layer.layer_desc().key_range, + layer.layer_desc().image_layer_lsn(), + ctx, + ) + .await?; + + // Safety of layer rewrites: + // - We are writing to a different local file path than we are reading from, so the old Layer + // cannot interfere with the new one. + // - In the page cache, contents for a particular VirtualFile are stored with a file_id that + // is different for two layers with the same name (in `ImageLayerInner::new` we always + // acquire a fresh id from [`crate::page_cache::next_file_id`]. So readers do not risk + // reading the index from one layer file, and then data blocks from the rewritten layer file. + // - Any readers that have a reference to the old layer will keep it alive until they are done + // with it. If they are trying to promote from remote storage, that will fail, but this is the same + // as for compaction generally: compaction is allowed to delete layers that readers might be trying to use. + // - We do not run concurrently with other kinds of compaction, so the only layer map writes we race with are: + // - GC, which at worst witnesses us "undelete" a layer that they just deleted. + // - ingestion, which only inserts layers, therefore cannot collide with us. + let resident = layer.download_and_keep_resident().await?; + + let keys_written = resident + .filter(&self.shard_identity, &mut image_layer_writer, ctx) + .await?; + + if keys_written > 0 { + let new_layer = image_layer_writer.finish(self, ctx).await?; + tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes", + layer.metadata().file_size, + new_layer.metadata().file_size); + + replace_image_layers.push((layer, new_layer)); + } else { + // Drop the old layer. Usually for this case we would already have noticed that + // the layer has no data for us with the ShardedRange check above, but + drop_layers.push(layer); + } + } + + // At this point, we have replaced local layer files with their rewritten form, but not yet uploaded + // metadata to reflect that. If we restart here, the replaced layer files will look invalid (size mismatch + // to remote index) and be removed. This is inefficient but safe. + fail::fail_point!("compact-shard-ancestors-localonly"); // Update the LayerMap so that readers will use the new layers, and enqueue it for writing to remote storage - self.rewrite_layers(replace_layers, drop_layers).await?; + self.rewrite_layers(replace_image_layers, drop_layers) + .await?; + + fail::fail_point!("compact-shard-ancestors-enqueued"); // We wait for all uploads to complete before finishing this compaction stage. This is not // necessary for correctness, but it simplifies testing, and avoids proceeding with another @@ -291,6 +350,8 @@ impl Timeline { // load. 
self.remote_client.wait_completion().await?; + fail::fail_point!("compact-shard-ancestors-persistent"); + Ok(()) } diff --git a/pageserver/src/tenant/timeline/init.rs b/pageserver/src/tenant/timeline/init.rs index 0cbaf39555..5bc67c7133 100644 --- a/pageserver/src/tenant/timeline/init.rs +++ b/pageserver/src/tenant/timeline/init.rs @@ -7,19 +7,20 @@ use crate::{ index::{IndexPart, LayerFileMetadata}, }, storage_layer::LayerName, - Generation, }, }; use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; -use pageserver_api::shard::ShardIndex; -use std::{collections::HashMap, str::FromStr}; +use std::{ + collections::{hash_map, HashMap}, + str::FromStr, +}; use utils::lsn::Lsn; /// Identified files in the timeline directory. pub(super) enum Discovered { /// The only one we care about - Layer(LayerName, Utf8PathBuf, u64), + Layer(LayerName, LocalLayerFileMetadata), /// Old ephmeral files from previous launches, should be removed Ephemeral(String), /// Old temporary timeline files, unsure what these really are, should be removed @@ -27,7 +28,7 @@ pub(super) enum Discovered { /// Temporary on-demand download files, should be removed TemporaryDownload(String), /// Backup file from previously future layers - IgnoredBackup, + IgnoredBackup(Utf8PathBuf), /// Unrecognized, warn about these Unknown(String), } @@ -43,12 +44,15 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result { let file_size = direntry.metadata()?.len(); - Discovered::Layer(file_name, direntry.path().to_owned(), file_size) + Discovered::Layer( + file_name, + LocalLayerFileMetadata::new(direntry.path().to_owned(), file_size), + ) } Err(_) => { if file_name.ends_with(".old") { // ignore these - Discovered::IgnoredBackup + Discovered::IgnoredBackup(direntry.path().to_owned()) } else if remote_timeline_client::is_temp_download_file(direntry.path()) { Discovered::TemporaryDownload(file_name) } else if is_ephemeral_file(&file_name) { @@ -71,37 +75,32 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result Self { + pub fn new(local_path: Utf8PathBuf, file_size: u64) -> Self { Self { local_path, - metadata: LayerFileMetadata::new(file_size, generation, shard), + file_size, } } } -/// Decision on what to do with a layer file after considering its local and remote metadata. +/// For a layer that is present in remote metadata, this type describes how to handle +/// it during startup: it is either Resident (and we have some metadata about a local file), +/// or it is Evicted (and we only have remote metadata). #[derive(Clone, Debug)] pub(super) enum Decision { /// The layer is not present locally. Evicted(LayerFileMetadata), - /// The layer is present locally, but local metadata does not match remote; we must - /// delete it and treat it as evicted. - UseRemote { + /// The layer is present locally, and metadata matches: we may hook up this layer to the + /// existing file in local storage. + Resident { local: LocalLayerFileMetadata, remote: LayerFileMetadata, }, - /// The layer is present locally, and metadata matches. - UseLocal(LocalLayerFileMetadata), } /// A layer needs to be left out of the layer map. @@ -117,77 +116,81 @@ pub(super) enum DismissedLayer { /// In order to make crash safe updates to layer map, we must dismiss layers which are only /// found locally or not yet included in the remote `index_part.json`. LocalOnly(LocalLayerFileMetadata), + + /// The layer exists in remote storage but the local layer's metadata (e.g. 
file size) + /// does not match it + BadMetadata(LocalLayerFileMetadata), } /// Merges local discoveries and remote [`IndexPart`] to a collection of decisions. pub(super) fn reconcile( - discovered: Vec<(LayerName, Utf8PathBuf, u64)>, + local_layers: Vec<(LayerName, LocalLayerFileMetadata)>, index_part: Option<&IndexPart>, disk_consistent_lsn: Lsn, - generation: Generation, - shard: ShardIndex, ) -> Vec<(LayerName, Result)> { - use Decision::*; + let Some(index_part) = index_part else { + // If we have no remote metadata, no local layer files are considered valid to load + return local_layers + .into_iter() + .map(|(layer_name, local_metadata)| { + (layer_name, Err(DismissedLayer::LocalOnly(local_metadata))) + }) + .collect(); + }; - // name => (local_metadata, remote_metadata) - type Collected = - HashMap, Option)>; + let mut result = Vec::new(); - let mut discovered = discovered - .into_iter() - .map(|(layer_name, local_path, file_size)| { - ( - layer_name, - // The generation and shard here will be corrected to match IndexPart in the merge below, unless - // it is not in IndexPart, in which case using our current generation makes sense - // because it will be uploaded in this generation. - ( - Some(LocalLayerFileMetadata::new( - local_path, file_size, generation, shard, - )), - None, - ), - ) - }) - .collect::(); + let mut remote_layers = HashMap::new(); - // merge any index_part information, when available + // Construct Decisions for layers that are found locally, if they're in remote metadata. Otherwise + // construct DismissedLayers to get rid of them. + for (layer_name, local_metadata) in local_layers { + let Some(remote_metadata) = index_part.layer_metadata.get(&layer_name) else { + result.push((layer_name, Err(DismissedLayer::LocalOnly(local_metadata)))); + continue; + }; + + if remote_metadata.file_size != local_metadata.file_size { + result.push((layer_name, Err(DismissedLayer::BadMetadata(local_metadata)))); + continue; + } + + remote_layers.insert( + layer_name, + Decision::Resident { + local: local_metadata, + remote: remote_metadata.clone(), + }, + ); + } + + // Construct Decision for layers that were not found locally index_part - .as_ref() - .map(|ip| ip.layer_metadata.iter()) - .into_iter() - .flatten() - .map(|(name, metadata)| (name, metadata.clone())) + .layer_metadata + .iter() .for_each(|(name, metadata)| { - if let Some(existing) = discovered.get_mut(name) { - existing.1 = Some(metadata); - } else { - discovered.insert(name.to_owned(), (None, Some(metadata))); + if let hash_map::Entry::Vacant(entry) = remote_layers.entry(name.clone()) { + entry.insert(Decision::Evicted(metadata.clone())); } }); - discovered - .into_iter() - .map(|(name, (local, remote))| { - let decision = if name.is_in_future(disk_consistent_lsn) { - Err(DismissedLayer::Future { local }) - } else { - match (local, remote) { - (Some(local), Some(remote)) if local.metadata != remote => { - Ok(UseRemote { local, remote }) - } - (Some(x), Some(_)) => Ok(UseLocal(x)), - (None, Some(x)) => Ok(Evicted(x)), - (Some(x), None) => Err(DismissedLayer::LocalOnly(x)), - (None, None) => { - unreachable!("there must not be any non-local non-remote files") - } - } - }; + // For layers that were found in authoritative remote metadata, apply a final check that they are within + // the disk_consistent_lsn. 
+ result.extend(remote_layers.into_iter().map(|(name, decision)| { + if name.is_in_future(disk_consistent_lsn) { + match decision { + Decision::Evicted(_remote) => (name, Err(DismissedLayer::Future { local: None })), + Decision::Resident { + local, + remote: _remote, + } => (name, Err(DismissedLayer::Future { local: Some(local) })), + } + } else { + (name, Ok(decision)) + } + })); - (name, decision) - }) - .collect::>() + result } pub(super) fn cleanup(path: &Utf8Path, kind: &str) -> anyhow::Result<()> { @@ -196,25 +199,15 @@ pub(super) fn cleanup(path: &Utf8Path, kind: &str) -> anyhow::Result<()> { std::fs::remove_file(path).with_context(|| format!("failed to remove {kind} at {path}")) } -pub(super) fn cleanup_local_file_for_remote( - local: &LocalLayerFileMetadata, - remote: &LayerFileMetadata, -) -> anyhow::Result<()> { - let local_size = local.metadata.file_size; - let remote_size = remote.file_size; +pub(super) fn cleanup_local_file_for_remote(local: &LocalLayerFileMetadata) -> anyhow::Result<()> { + let local_size = local.file_size; let path = &local.local_path; - let file_name = path.file_name().expect("must be file path"); - tracing::warn!("removing local file {file_name:?} because it has unexpected length {local_size}; length in remote index is {remote_size}"); - if let Err(err) = crate::tenant::timeline::rename_to_backup(path) { - assert!( - path.exists(), - "we would leave the local_layer without a file if this does not hold: {path}", - ); - Err(err) - } else { - Ok(()) - } + tracing::warn!( + "removing local file {file_name:?} because it has unexpected length {local_size};" + ); + + std::fs::remove_file(path).with_context(|| format!("failed to remove layer at {path}")) } pub(super) fn cleanup_future_layer( @@ -236,8 +229,8 @@ pub(super) fn cleanup_local_only_file( ) -> anyhow::Result<()> { let kind = name.kind(); tracing::info!( - "found local-only {kind} layer {name}, metadata {:?}", - local.metadata + "found local-only {kind} layer {name} size {}", + local.file_size ); std::fs::remove_file(&local.local_path)?; Ok(()) diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 248420e632..884b71df75 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -212,13 +212,34 @@ impl LayerManager { &mut self, rewrite_layers: &[(Layer, ResidentLayer)], drop_layers: &[Layer], - _metrics: &TimelineMetrics, + metrics: &TimelineMetrics, ) { let mut updates = self.layer_map.batch_update(); + for (old_layer, new_layer) in rewrite_layers { + debug_assert_eq!( + old_layer.layer_desc().key_range, + new_layer.layer_desc().key_range + ); + debug_assert_eq!( + old_layer.layer_desc().lsn_range, + new_layer.layer_desc().lsn_range + ); - // TODO: implement rewrites (currently this code path only used for drops) - assert!(rewrite_layers.is_empty()); + // Safety: we may never rewrite the same file in-place. Callers are responsible + // for ensuring that they only rewrite layers after something changes the path, + // such as an increment in the generation number. 
+ assert_ne!(old_layer.local_path(), new_layer.local_path()); + Self::delete_historic_layer(old_layer, &mut updates, &mut self.layer_fmgr); + + Self::insert_historic_layer( + new_layer.as_ref().clone(), + &mut updates, + &mut self.layer_fmgr, + ); + + metrics.record_new_file_metrics(new_layer.layer_desc().file_size); + } for l in drop_layers { Self::delete_historic_layer(l, &mut updates, &mut self.layer_fmgr); } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 796ae7217b..36aa18f1f9 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2667,7 +2667,9 @@ class NeonPageserver(PgProtocol, LogUtils): tenant_id, generation=self.env.storage_controller.attach_hook_issue(tenant_id, self.id) ) - def list_layers(self, tenant_id: TenantId, timeline_id: TimelineId) -> list[Path]: + def list_layers( + self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId + ) -> list[Path]: """ Inspect local storage on a pageserver to discover which layer files are present. diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index bbb1ad0c6d..545ba05b17 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -177,7 +177,16 @@ def test_sharding_split_unsharded( env.storage_controller.consistency_check() -def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize( + "failpoint", + [ + None, + "compact-shard-ancestors-localonly", + "compact-shard-ancestors-enqueued", + "compact-shard-ancestors-persistent", + ], +) +def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder, failpoint: Optional[str]): """ Test that after a split, we clean up parent layer data in the child shards via compaction. """ @@ -196,6 +205,11 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder): "image_layer_creation_check_threshold": "0", } + neon_env_builder.storage_controller_config = { + # Default neon_local uses a small timeout: use a longer one to tolerate longer pageserver restarts. + "max_unavailable": "300s" + } + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) tenant_id = env.initial_tenant timeline_id = env.initial_timeline @@ -213,6 +227,10 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder): # Split one shard into two shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=2) + # Let all shards move into their stable locations, so that during subsequent steps we + # don't have reconciles in progress (simpler to reason about what messages we expect in logs) + env.storage_controller.reconcile_until_idle() + # Check we got the shard IDs we expected assert env.storage_controller.inspect(TenantShardId(tenant_id, 0, 2)) is not None assert env.storage_controller.inspect(TenantShardId(tenant_id, 1, 2)) is not None @@ -237,6 +255,90 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder): # Compaction shouldn't make anything unreadable workload.validate() + # Force a generation increase: layer rewrites are a long-term thing and only happen after + # the generation has increased. 
+ env.pageserver.stop() + env.pageserver.start() + + # Cleanup part 2: once layers are outside the PITR window, they will be rewritten if they are partially redundant + env.storage_controller.pageserver_api().set_tenant_config(tenant_id, {"pitr_interval": "0s"}) + env.storage_controller.reconcile_until_idle() + + for shard in shards: + ps = env.get_tenant_pageserver(shard) + + # Apply failpoints for the layer-rewriting phase: this is the area of code that has sensitive behavior + # across restarts, as we will have local layer files that temporarily disagree with the remote metadata + # for the same local layer file name. + if failpoint is not None: + ps.http_client().configure_failpoints((failpoint, "exit")) + + # Do a GC to update gc_info (compaction uses this to decide whether a layer is to be rewritten) + # Set gc_horizon=0 to let PITR horizon control GC cutoff exclusively. + ps.http_client().timeline_gc(shard, timeline_id, gc_horizon=0) + + # We will compare stats before + after compaction + detail_before = ps.http_client().timeline_detail(shard, timeline_id) + + # Invoke compaction: this should rewrite layers that are behind the pitr horizon + try: + ps.http_client().timeline_compact(shard, timeline_id) + except requests.ConnectionError as e: + if failpoint is None: + raise e + else: + log.info(f"Compaction failed (failpoint={failpoint}): {e}") + + if failpoint in ( + "compact-shard-ancestors-localonly", + "compact-shard-ancestors-enqueued", + ): + # If we left local files that don't match remote metadata, we expect warnings on next startup + env.pageserver.allowed_errors.append( + ".*removing local file .+ because it has unexpected length.*" + ) + + # Post-failpoint: we check that the pageserver comes back online happily. + env.pageserver.running = False + env.pageserver.start() + else: + assert failpoint is None # We shouldn't reach success path if a failpoint was set + + detail_after = ps.http_client().timeline_detail(shard, timeline_id) + + # Physical size should shrink because layers are smaller + assert detail_after["current_physical_size"] < detail_before["current_physical_size"] + + # Validate size statistics + for shard in shards: + ps = env.get_tenant_pageserver(shard) + timeline_info = ps.http_client().timeline_detail(shard, timeline_id) + reported_size = timeline_info["current_physical_size"] + layer_paths = ps.list_layers(shard, timeline_id) + measured_size = 0 + for p in layer_paths: + abs_path = ps.timeline_dir(shard, timeline_id) / p + measured_size += os.stat(abs_path).st_size + + log.info( + f"shard {shard} reported size {reported_size}, measured size {measured_size} ({len(layer_paths)} layers)" + ) + + if failpoint in ( + "compact-shard-ancestors-localonly", + "compact-shard-ancestors-enqueued", + ): + # If we injected a failure between local rewrite and remote upload, then after + # restart we may end up with neither version of the file on local disk (the new file + # is cleaned up because it doesn't matchc remote metadata). So local size isn't + # necessarily going to match remote physical size. 
+ continue + + assert measured_size == reported_size + + # Compaction shouldn't make anything unreadable + workload.validate() + def test_sharding_split_smoke( neon_env_builder: NeonEnvBuilder, From 1455f5a2612a95d1c8fe3f68311cb42fc4785523 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 24 May 2024 09:45:34 +0100 Subject: [PATCH 0873/1571] pageserver: revert concurrent secondary downloads, make DownloadStream always yield Err after cancel (#7866) ## Problem Ongoing hunt for secondary location shutdown hang issues. ## Summary of changes - Revert the functional changes from #7675 - Tweak a log in secondary downloads to make it more apparent when we drop out on cancellation - Modify DownloadStream's behavior to always return an Err after it has been cancelled. This _should_ not impact anything, but it makes the behavior simpler to reason about (e.g. even if the poll function somehow got called again, it could never end up in an un-cancellable state) Related #https://github.com/neondatabase/cloud/issues/13576 --- libs/remote_storage/src/azure_blob.rs | 5 - libs/remote_storage/src/lib.rs | 34 ----- libs/remote_storage/src/local_fs.rs | 14 +- libs/remote_storage/src/s3_bucket.rs | 8 +- libs/remote_storage/src/simulate_failures.rs | 6 +- pageserver/src/tenant/secondary/downloader.rs | 120 ++---------------- 6 files changed, 18 insertions(+), 169 deletions(-) diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 220d4ef115..24c1248304 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -29,7 +29,6 @@ use http_types::{StatusCode, Url}; use tokio_util::sync::CancellationToken; use tracing::debug; -use crate::RemoteStorageActivity; use crate::{ error::Cancelled, s3_bucket::RequestKind, AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata, @@ -526,10 +525,6 @@ impl RemoteStorage for AzureBlobStorage { // https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview Err(TimeTravelError::Unimplemented) } - - fn activity(&self) -> RemoteStorageActivity { - self.concurrency_limiter.activity() - } } pin_project_lite::pin_project! { diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index f024021507..708662f20f 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -263,17 +263,6 @@ pub trait RemoteStorage: Send + Sync + 'static { done_if_after: SystemTime, cancel: &CancellationToken, ) -> Result<(), TimeTravelError>; - - /// Query how busy we currently are: may be used by callers which wish to politely - /// back off if there are already a lot of operations underway. - fn activity(&self) -> RemoteStorageActivity; -} - -pub struct RemoteStorageActivity { - pub read_available: usize, - pub read_total: usize, - pub write_available: usize, - pub write_total: usize, } /// DownloadStream is sensitive to the timeout and cancellation used with the original @@ -455,15 +444,6 @@ impl GenericRemoteStorage> { } } } - - pub fn activity(&self) -> RemoteStorageActivity { - match self { - Self::LocalFs(s) => s.activity(), - Self::AwsS3(s) => s.activity(), - Self::AzureBlob(s) => s.activity(), - Self::Unreliable(s) => s.activity(), - } - } } impl GenericRemoteStorage { @@ -794,9 +774,6 @@ struct ConcurrencyLimiter { // The helps to ensure we don't exceed the thresholds. 
write: Arc, read: Arc, - - write_total: usize, - read_total: usize, } impl ConcurrencyLimiter { @@ -825,21 +802,10 @@ impl ConcurrencyLimiter { Arc::clone(self.for_kind(kind)).acquire_owned().await } - fn activity(&self) -> RemoteStorageActivity { - RemoteStorageActivity { - read_available: self.read.available_permits(), - read_total: self.read_total, - write_available: self.write.available_permits(), - write_total: self.write_total, - } - } - fn new(limit: usize) -> ConcurrencyLimiter { Self { read: Arc::new(Semaphore::new(limit)), write: Arc::new(Semaphore::new(limit)), - read_total: limit, - write_total: limit, } } } diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index f12f6590a3..1f7bcfc982 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -23,8 +23,8 @@ use tokio_util::{io::ReaderStream, sync::CancellationToken}; use utils::crashsafe::path_with_suffix_extension; use crate::{ - Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorageActivity, - TimeTravelError, TimeoutOrCancel, REMOTE_STORAGE_PREFIX_SEPARATOR, + Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel, + REMOTE_STORAGE_PREFIX_SEPARATOR, }; use super::{RemoteStorage, StorageMetadata}; @@ -605,16 +605,6 @@ impl RemoteStorage for LocalFs { ) -> Result<(), TimeTravelError> { Err(TimeTravelError::Unimplemented) } - - fn activity(&self) -> RemoteStorageActivity { - // LocalFS has no concurrency limiting: give callers the impression that plenty of units are available - RemoteStorageActivity { - read_available: 16, - read_total: 16, - write_available: 16, - write_total: 16, - } - } } fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf { diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 0f6772b274..c3d6c75e20 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -47,8 +47,8 @@ use utils::backoff; use super::StorageMetadata; use crate::{ error::Cancelled, support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError, - Listing, ListingMode, RemotePath, RemoteStorage, RemoteStorageActivity, S3Config, - TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR, + Listing, ListingMode, RemotePath, RemoteStorage, S3Config, TimeTravelError, TimeoutOrCancel, + MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR, }; pub(super) mod metrics; @@ -975,10 +975,6 @@ impl RemoteStorage for S3Bucket { } Ok(()) } - - fn activity(&self) -> RemoteStorageActivity { - self.concurrency_limiter.activity() - } } /// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`]. 
diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index 66522e04ca..c467a2d196 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -12,7 +12,7 @@ use tokio_util::sync::CancellationToken; use crate::{ Download, DownloadError, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorage, - RemoteStorageActivity, StorageMetadata, TimeTravelError, + StorageMetadata, TimeTravelError, }; pub struct UnreliableWrapper { @@ -213,8 +213,4 @@ impl RemoteStorage for UnreliableWrapper { .time_travel_recover(prefix, timestamp, done_if_after, cancel) .await } - - fn activity(&self) -> RemoteStorageActivity { - self.inner.activity() - } } diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 789f1a0fa9..0ec1bd649b 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -45,10 +45,10 @@ use crate::tenant::{ use camino::Utf8PathBuf; use chrono::format::{DelayedFormat, StrftimeItems}; -use futures::{Future, StreamExt}; +use futures::Future; use pageserver_api::models::SecondaryProgress; use pageserver_api::shard::TenantShardId; -use remote_storage::{DownloadError, Etag, GenericRemoteStorage, RemoteStorageActivity}; +use remote_storage::{DownloadError, Etag, GenericRemoteStorage}; use tokio_util::sync::CancellationToken; use tracing::{info_span, instrument, warn, Instrument}; @@ -67,12 +67,6 @@ use super::{ /// download, if the uploader populated it. const DEFAULT_DOWNLOAD_INTERVAL: Duration = Duration::from_millis(60000); -/// Range of concurrency we may use when downloading layers within a timeline. This is independent -/// for each tenant we're downloading: the concurrency of _tenants_ is defined separately in -/// `PageServerConf::secondary_download_concurrency` -const MAX_LAYER_CONCURRENCY: usize = 16; -const MIN_LAYER_CONCURRENCY: usize = 1; - pub(super) async fn downloader_task( tenant_manager: Arc, remote_storage: GenericRemoteStorage, @@ -81,15 +75,14 @@ pub(super) async fn downloader_task( cancel: CancellationToken, root_ctx: RequestContext, ) { - // How many tenants' secondary download operations we will run concurrently - let tenant_concurrency = tenant_manager.get_conf().secondary_download_concurrency; + let concurrency = tenant_manager.get_conf().secondary_download_concurrency; let generator = SecondaryDownloader { tenant_manager, remote_storage, root_ctx, }; - let mut scheduler = Scheduler::new(generator, tenant_concurrency); + let mut scheduler = Scheduler::new(generator, concurrency); scheduler .run(command_queue, background_jobs_can_start, cancel) @@ -414,7 +407,7 @@ impl JobGenerator { - tracing::debug!("Shut down while downloading"); + tracing::info!("Shut down while downloading"); }, Err(UpdateError::Deserialize(e)) => { tracing::error!("Corrupt content while downloading tenant: {e}"); @@ -848,8 +841,6 @@ impl<'a> TenantDownloader<'a> { tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len()); - let mut download_futs = Vec::new(); - // Download heatmap layers that are not present on local disk, or update their // access time if they are already present. 
for layer in timeline.layers { @@ -922,31 +913,14 @@ impl<'a> TenantDownloader<'a> { } } - download_futs.push(self.download_layer( - tenant_shard_id, - &timeline.timeline_id, - layer, - ctx, - )); - } - - // Break up layer downloads into chunks, so that for each chunk we can re-check how much - // concurrency to use based on activity level of remote storage. - while !download_futs.is_empty() { - let chunk = - download_futs.split_off(download_futs.len().saturating_sub(MAX_LAYER_CONCURRENCY)); - - let concurrency = Self::layer_concurrency(self.remote_storage.activity()); - - let mut result_stream = futures::stream::iter(chunk).buffered(concurrency); - let mut result_stream = std::pin::pin!(result_stream); - while let Some(result) = result_stream.next().await { - match result { - Err(e) => return Err(e), - Ok(None) => { - // No error, but we didn't download the layer. Don't mark it touched - } - Ok(Some(layer)) => touched.push(layer), + match self + .download_layer(tenant_shard_id, &timeline.timeline_id, layer, ctx) + .await? + { + Some(layer) => touched.push(layer), + None => { + // Not an error but we didn't download it: remote layer is missing. Don't add it to the list of + // things to consider touched. } } } @@ -1081,19 +1055,6 @@ impl<'a> TenantDownloader<'a> { Ok(Some(layer)) } - - /// Calculate the currently allowed parallelism of layer download tasks, based on activity level of the remote storage - fn layer_concurrency(activity: RemoteStorageActivity) -> usize { - // When less than 75% of units are available, use minimum concurrency. Else, do a linear mapping - // of our concurrency range to the units available within the remaining 25%. - let clamp_at = (activity.read_total * 3) / 4; - if activity.read_available > clamp_at { - (MAX_LAYER_CONCURRENCY * (activity.read_available - clamp_at)) - / (activity.read_total - clamp_at) - } else { - MIN_LAYER_CONCURRENCY - } - } } /// Scan local storage and build up Layer objects based on the metadata in a HeatMapTimeline @@ -1217,58 +1178,3 @@ async fn init_timeline_state( detail } - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn layer_concurrency() { - // Totally idle - assert_eq!( - TenantDownloader::layer_concurrency(RemoteStorageActivity { - read_available: 16, - read_total: 16, - write_available: 16, - write_total: 16 - }), - MAX_LAYER_CONCURRENCY - ); - - // Totally busy - assert_eq!( - TenantDownloader::layer_concurrency(RemoteStorageActivity { - read_available: 0, - read_total: 16, - - write_available: 16, - write_total: 16 - }), - MIN_LAYER_CONCURRENCY - ); - - // Edge of the range at which we interpolate - assert_eq!( - TenantDownloader::layer_concurrency(RemoteStorageActivity { - read_available: 12, - read_total: 16, - - write_available: 16, - write_total: 16 - }), - MIN_LAYER_CONCURRENCY - ); - - // Midpoint of the range in which we interpolate - assert_eq!( - TenantDownloader::layer_concurrency(RemoteStorageActivity { - read_available: 14, - read_total: 16, - - write_available: 16, - write_total: 16 - }), - MAX_LAYER_CONCURRENCY / 2 - ); - } -} From a3f5b836772d54464e18302beb132f4c19b8adf8 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 24 May 2024 16:07:58 +0300 Subject: [PATCH 0874/1571] chore: lower gate guard drop logging threshold to 100ms (#7862) We have some 1001ms cases, which do not yield gate guard context. 
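The change below is a one-line tweak to `nag_after`; for context, a minimal sketch of the nag-on-slow-close pattern it tunes (free-standing names and `tokio`/`tracing` dependencies assumed here, not the real `Gate::close` internals):

```rust
use std::time::Duration;

// Sketch only: wait for a close future, but log if it takes longer than the
// nag threshold, then keep waiting. Lowering the threshold from 1s to 100ms
// makes the ~1s closes report which guards are still held.
async fn close_with_nag(do_close: impl std::future::Future<Output = ()>) {
    let started_at = std::time::Instant::now();
    let mut do_close = std::pin::pin!(do_close);
    let nag_after = Duration::from_millis(100);

    if tokio::time::timeout(nag_after, &mut do_close).await.is_err() {
        tracing::info!(?nag_after, "gate closing is slow, still waiting");
        do_close.await;
        tracing::info!(elapsed_ms = started_at.elapsed().as_millis() as u64, "gate closed");
    }
}
```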
--- libs/utils/src/sync/gate.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libs/utils/src/sync/gate.rs b/libs/utils/src/sync/gate.rs index c34176af57..156b99a010 100644 --- a/libs/utils/src/sync/gate.rs +++ b/libs/utils/src/sync/gate.rs @@ -135,7 +135,8 @@ impl Gate { let started_at = std::time::Instant::now(); let mut do_close = std::pin::pin!(self.do_close()); - let nag_after = Duration::from_secs(1); + // with 1s we rarely saw anything, let's try if we get more gate closing reasons with 100ms + let nag_after = Duration::from_millis(100); let Err(_timeout) = tokio::time::timeout(nag_after, &mut do_close).await else { return; From 71a7fd983e13d71508350c750b32081e0bb7832d Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 24 May 2024 14:11:51 +0100 Subject: [PATCH 0875/1571] CI(release): tune Storage & Compute release PR title (#7870) ## Problem A title for automatic proxy release PRs is `Proxy release`, and for storage & compute, it's just `Release` ## Summary of changes - Amend PR title for Storage & Compute releases to "Storage & Compute release" --- .github/workflows/release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index b2c9a19588..fe24f6330e 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -53,7 +53,7 @@ jobs: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} run: | cat << EOF > body.md - ## Release ${RELEASE_DATE} + ## Storage & Compute release ${RELEASE_DATE} **Please merge this Pull Request using 'Create a merge commit' button** EOF From 43f9a16e4608789a2c190aa5c32303a0d0182f30 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 24 May 2024 17:56:12 +0100 Subject: [PATCH 0876/1571] proxy: fix websocket buffering (#7878) ## Problem Seems the websocket buffering was broken for large query responses only ## Summary of changes Move buffering until after the underlying stream is ready. Tested locally confirms this fixes the bug. 
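The likely failure mode: `poll_write` appended `buf` to the send buffer before checking `poll_ready`. When the underlying sink returned `Pending` — which only happens under backpressure, i.e. large responses — the caller retries `poll_write` with the same `buf`, and the bytes get appended a second time. A minimal sketch of the corrected ordering, with simplified stand-in types rather than the real `WebSocketRw` (assumes the `bytes` and `futures` crates):

```rust
use std::io;
use std::pin::Pin;
use std::task::{ready, Context, Poll};

use bytes::{BufMut, BytesMut};
use futures::Sink;

// Sketch only: an AsyncWrite-style poll_write over a binary-frame sink.
struct WsWrite<S> {
    stream: S,
    send: BytesMut,
}

impl<S> WsWrite<S>
where
    S: Sink<Vec<u8>, Error = io::Error> + Unpin,
{
    fn poll_write(&mut self, cx: &mut Context<'_>, buf: &[u8]) -> Poll<io::Result<usize>> {
        // Wait for the sink first: if it is not ready, return Pending *before*
        // touching `send`, so the retried call with the same `buf` cannot
        // append the bytes twice.
        ready!(Pin::new(&mut self.stream).poll_ready(cx))?;

        // Only a ready sink gets the bytes staged and handed off as one frame.
        self.send.put(buf);
        let frame = self.send.split();
        Pin::new(&mut self.stream).start_send(frame.to_vec())?;
        Poll::Ready(Ok(buf.len()))
    }
}
```

The actual fix in `websocket.rs` below is the same reordering inside the pin-projected `poll_write`.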
Also fixes the pg-sni-router missing metrics bug --- proxy/src/bin/pg_sni_router.rs | 3 +++ proxy/src/serverless/websocket.rs | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index fb16b76567..e1674049a6 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -9,6 +9,7 @@ use futures::future::Either; use itertools::Itertools; use proxy::config::TlsServerEndPoint; use proxy::context::RequestMonitoring; +use proxy::metrics::{Metrics, ThreadPoolMetrics}; use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled}; use rustls::pki_types::PrivateKeyDer; use tokio::net::TcpListener; @@ -65,6 +66,8 @@ async fn main() -> anyhow::Result<()> { let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); + Metrics::install(Arc::new(ThreadPoolMetrics::new(0))); + let args = cli().get_matches(); let destination: String = args.get_one::("dest").unwrap().parse()?; diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index 61d6d60dbe..7d3153a3c1 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -51,9 +51,10 @@ impl AsyncWrite for WebSocketRw { ) -> Poll> { let this = self.project(); let mut stream = this.stream; - this.send.put(buf); ready!(stream.as_mut().poll_ready(cx).map_err(io_error))?; + + this.send.put(buf); match stream.as_mut().start_send(Frame::binary(this.send.split())) { Ok(()) => Poll::Ready(Ok(buf.len())), Err(e) => Poll::Ready(Err(io_error(e))), From 3797566c36c767155648ddde6c21ece4a24827cd Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 20 May 2024 16:21:57 +0300 Subject: [PATCH 0877/1571] safekeeper: test pull_timeline with WAL gc. Do pull_timeline while WAL is being removed. To this end - extract pausable_failpoint to utils, sprinkle pull_timeline with it - add 'checkpoint' sk http endpoint to force WAL removal. After fixing checking for pull file status code test fails so far which is expected. --- libs/utils/src/failpoint_support.rs | 27 +++++++ pageserver/src/tenant.rs | 27 +------ pageserver/src/tenant/delete.rs | 2 +- .../src/tenant/remote_timeline_client.rs | 1 + .../tenant/remote_timeline_client/upload.rs | 2 +- pageserver/src/tenant/tasks.rs | 2 +- pageserver/src/tenant/timeline.rs | 2 +- pageserver/src/tenant/timeline/delete.rs | 2 +- safekeeper/src/http/routes.rs | 24 ++++++ safekeeper/src/pull_timeline.rs | 10 +++ safekeeper/src/remove_wal.rs | 2 +- safekeeper/src/safekeeper.rs | 4 +- safekeeper/src/timeline.rs | 4 +- safekeeper/src/timeline_manager.rs | 2 +- test_runner/fixtures/common_types.py | 5 ++ test_runner/fixtures/neon_fixtures.py | 67 +++++++++++++++- test_runner/fixtures/safekeeper/http.py | 7 ++ test_runner/fixtures/utils.py | 22 ++++++ test_runner/regress/test_wal_acceptor.py | 79 ++++++++++++++++--- 19 files changed, 241 insertions(+), 50 deletions(-) diff --git a/libs/utils/src/failpoint_support.rs b/libs/utils/src/failpoint_support.rs index 8704b72921..870684b399 100644 --- a/libs/utils/src/failpoint_support.rs +++ b/libs/utils/src/failpoint_support.rs @@ -9,6 +9,33 @@ use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; use tracing::*; +/// Declare a failpoint that can use the `pause` failpoint action. +/// We don't want to block the executor thread, hence, spawn_blocking + await. +#[macro_export] +macro_rules! 
pausable_failpoint { + ($name:literal) => { + if cfg!(feature = "testing") { + tokio::task::spawn_blocking({ + let current = tracing::Span::current(); + move || { + let _entered = current.entered(); + tracing::info!("at failpoint {}", $name); + fail::fail_point!($name); + } + }) + .await + .expect("spawn_blocking"); + } + }; + ($name:literal, $cond:expr) => { + if cfg!(feature = "testing") { + if $cond { + pausable_failpoint!($name) + } + } + }; +} + /// use with fail::cfg("$name", "return(2000)") /// /// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 540eb10ed2..e6bfd57a44 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -42,6 +42,7 @@ use utils::completion; use utils::crashsafe::path_with_suffix_extension; use utils::failpoint_support; use utils::fs_ext; +use utils::pausable_failpoint; use utils::sync::gate::Gate; use utils::sync::gate::GateGuard; use utils::timeout::timeout_cancellable; @@ -122,32 +123,6 @@ use utils::{ lsn::{Lsn, RecordLsn}, }; -/// Declare a failpoint that can use the `pause` failpoint action. -/// We don't want to block the executor thread, hence, spawn_blocking + await. -macro_rules! pausable_failpoint { - ($name:literal) => { - if cfg!(feature = "testing") { - tokio::task::spawn_blocking({ - let current = tracing::Span::current(); - move || { - let _entered = current.entered(); - tracing::info!("at failpoint {}", $name); - fail::fail_point!($name); - } - }) - .await - .expect("spawn_blocking"); - } - }; - ($name:literal, $cond:expr) => { - if cfg!(feature = "testing") { - if $cond { - pausable_failpoint!($name) - } - } - }; -} - pub mod blob_io; pub mod block_io; pub mod vectored_blob_io; diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index 3173a33dad..7c6640eaac 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -8,7 +8,7 @@ use tokio::sync::OwnedMutexGuard; use tokio_util::sync::CancellationToken; use tracing::{error, instrument, Instrument}; -use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId}; +use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId, pausable_failpoint}; use crate::{ config::PageServerConf, diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 23904b9da4..73438a790f 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -197,6 +197,7 @@ pub(crate) use upload::upload_initdb_dir; use utils::backoff::{ self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, }; +use utils::pausable_failpoint; use std::collections::{HashMap, VecDeque}; use std::sync::atomic::{AtomicU32, Ordering}; diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index caa843316f..e8e824f415 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -9,7 +9,7 @@ use std::time::SystemTime; use tokio::fs::{self, File}; use tokio::io::AsyncSeekExt; use tokio_util::sync::CancellationToken; -use utils::backoff; +use utils::{backoff, pausable_failpoint}; use super::Generation; use crate::tenant::remote_timeline_client::{ diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index ba2b8afd03..bf2d8a47b4 100644 --- a/pageserver/src/tenant/tasks.rs +++ 
b/pageserver/src/tenant/tasks.rs @@ -17,7 +17,7 @@ use crate::tenant::{Tenant, TenantState}; use rand::Rng; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::{backoff, completion}; +use utils::{backoff, completion, pausable_failpoint}; static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy = once_cell::sync::Lazy::new(|| { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 1bdbddd95f..d4f6e25843 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -41,7 +41,7 @@ use tokio_util::sync::CancellationToken; use tracing::*; use utils::{ bin_ser::BeSer, - fs_ext, + fs_ext, pausable_failpoint, sync::gate::{Gate, GateGuard}, vec_map::VecMap, }; diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index b5dfc86e77..5ca8544d49 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -7,7 +7,7 @@ use anyhow::Context; use pageserver_api::{models::TimelineState, shard::TenantShardId}; use tokio::sync::OwnedMutexGuard; use tracing::{error, info, instrument, Instrument}; -use utils::{crashsafe, fs_ext, id::TimelineId}; +use utils::{crashsafe, fs_ext, id::TimelineId, pausable_failpoint}; use crate::{ config::PageServerConf, diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 808bb1e490..4aacd3421d 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -287,6 +287,26 @@ async fn timeline_files_handler(request: Request) -> Result .map_err(|e| ApiError::InternalServerError(e.into())) } +/// Force persist control file and remove old WAL. +async fn timeline_checkpoint_handler(request: Request) -> Result, ApiError> { + check_permission(&request, None)?; + + let ttid = TenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "timeline_id")?, + ); + + let tli = GlobalTimelines::get(ttid)?; + tli.maybe_persist_control_file(true) + .await + .map_err(ApiError::InternalServerError)?; + tli.remove_old_wal() + .await + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, ()) +} + /// Deactivates the timeline and removes its data directory. 
async fn timeline_delete_handler(mut request: Request) -> Result, ApiError> { let ttid = TenantTimelineId::new( @@ -553,6 +573,10 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder "/v1/tenant/:tenant_id/timeline/:timeline_id/control_file", |r| request_span(r, patch_control_file_handler), ) + .post( + "/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint", + |r| request_span(r, timeline_checkpoint_handler), + ) // for tests .post("/v1/record_safekeeper_info/:tenant_id/:timeline_id", |r| { request_span(r, record_safekeeper_info) diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index 93b51f32c0..f7cc40f58a 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -11,6 +11,7 @@ use tracing::info; use utils::{ id::{TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, + pausable_failpoint, }; use crate::{ @@ -162,6 +163,8 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result filenames.remove(control_file_index); filenames.insert(0, "safekeeper.control".to_string()); + pausable_failpoint!("sk-pull-timeline-after-list-pausable"); + info!( "downloading {} files from safekeeper {}", filenames.len(), @@ -183,6 +186,13 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result let mut file = tokio::fs::File::create(&file_path).await?; let mut response = client.get(&http_url).send().await?; + if response.status() != reqwest::StatusCode::OK { + bail!( + "pulling file {} failed: status is {}", + filename, + response.status() + ); + } while let Some(chunk) = response.chunk().await? { file.write_all(&chunk).await?; file.flush().await?; diff --git a/safekeeper/src/remove_wal.rs b/safekeeper/src/remove_wal.rs index 98ce671182..3400eee9b7 100644 --- a/safekeeper/src/remove_wal.rs +++ b/safekeeper/src/remove_wal.rs @@ -15,7 +15,7 @@ pub async fn task_main(_conf: SafeKeeperConf) -> anyhow::Result<()> { for tli in &tlis { let ttid = tli.ttid; async { - if let Err(e) = tli.maybe_persist_control_file().await { + if let Err(e) = tli.maybe_persist_control_file(false).await { warn!("failed to persist control file: {e}"); } if let Err(e) = tli.remove_old_wal().await { diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 4b1481a397..2a620f5fef 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -827,9 +827,9 @@ where /// Persist control file if there is something to save and enough time /// passed after the last save. - pub async fn maybe_persist_inmem_control_file(&mut self) -> Result { + pub async fn maybe_persist_inmem_control_file(&mut self, force: bool) -> Result { const CF_SAVE_INTERVAL: Duration = Duration::from_secs(300); - if self.state.pers.last_persist_at().elapsed() < CF_SAVE_INTERVAL { + if !force && self.state.pers.last_persist_at().elapsed() < CF_SAVE_INTERVAL { return Ok(false); } let need_persist = self.state.inmem.commit_lsn > self.state.commit_lsn diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 0cc6153373..f30c503382 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -821,9 +821,9 @@ impl Timeline { /// passed after the last save. This helps to keep remote_consistent_lsn up /// to date so that storage nodes restart doesn't cause many pageserver -> /// safekeeper reconnections. 
- pub async fn maybe_persist_control_file(self: &Arc) -> Result<()> { + pub async fn maybe_persist_control_file(self: &Arc, force: bool) -> Result<()> { let mut guard = self.write_shared_state().await; - let changed = guard.sk.maybe_persist_inmem_control_file().await?; + let changed = guard.sk.maybe_persist_inmem_control_file(force).await?; guard.skip_update = !changed; Ok(()) } diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index e74ba37ad8..ed544352f9 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -106,7 +106,7 @@ pub async fn main_task( if !is_active { // TODO: maybe use tokio::spawn? - if let Err(e) = tli.maybe_persist_control_file().await { + if let Err(e) = tli.maybe_persist_control_file(false).await { warn!("control file save in update_status failed: {:?}", e); } } diff --git a/test_runner/fixtures/common_types.py b/test_runner/fixtures/common_types.py index b5458b5c26..e9be765669 100644 --- a/test_runner/fixtures/common_types.py +++ b/test_runner/fixtures/common_types.py @@ -5,6 +5,8 @@ from typing import Any, Type, TypeVar, Union T = TypeVar("T", bound="Id") +DEFAULT_WAL_SEG_SIZE = 16 * 1024 * 1024 + @total_ordering class Lsn: @@ -67,6 +69,9 @@ class Lsn: def as_int(self) -> int: return self.lsn_int + def segment_lsn(self, seg_sz: int = DEFAULT_WAL_SEG_SIZE) -> "Lsn": + return Lsn(self.lsn_int - (self.lsn_int % seg_sz)) + @dataclass(frozen=True) class Key: diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 36aa18f1f9..c9d0acb967 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3771,7 +3771,7 @@ class SafekeeperPort: @dataclass -class Safekeeper: +class Safekeeper(LogUtils): """An object representing a running safekeeper daemon.""" env: NeonEnv @@ -3779,6 +3779,13 @@ class Safekeeper: id: int running: bool = False + def __init__(self, env: NeonEnv, port: SafekeeperPort, id: int, running: bool = False): + self.env = env + self.port = port + self.id = id + self.running = running + self.logfile = Path(self.data_dir) / f"safekeeper-{id}.log" + def start(self, extra_opts: Optional[List[str]] = None) -> "Safekeeper": assert self.running is False self.env.neon_cli.safekeeper_start(self.id, extra_opts=extra_opts) @@ -3839,11 +3846,38 @@ class Safekeeper: port=self.port.http, auth_token=auth_token, is_testing_enabled=is_testing_enabled ) + def get_timeline_start_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn: + timeline_status = self.http_client().timeline_status(tenant_id, timeline_id) + timeline_start_lsn = timeline_status.timeline_start_lsn + log.info(f"sk {self.id} timeline start LSN: {timeline_start_lsn}") + return timeline_start_lsn + + def get_flush_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn: + timeline_status = self.http_client().timeline_status(tenant_id, timeline_id) + flush_lsn = timeline_status.flush_lsn + log.info(f"sk {self.id} flush LSN: {flush_lsn}") + return flush_lsn + + def pull_timeline( + self, srcs: list[Safekeeper], tenant_id: TenantId, timeline_id: TimelineId + ) -> Dict[str, Any]: + """ + pull_timeline from srcs to self. 
+ """ + src_https = [f"http://localhost:{sk.port.http}" for sk in srcs] + res = self.http_client().pull_timeline( + {"tenant_id": str(tenant_id), "timeline_id": str(timeline_id), "http_hosts": src_https} + ) + src_ids = [sk.id for sk in srcs] + log.info(f"finished pulling timeline from {src_ids} to {self.id}") + return res + + @property def data_dir(self) -> str: return os.path.join(self.env.repo_dir, "safekeepers", f"sk{self.id}") def timeline_dir(self, tenant_id, timeline_id) -> str: - return os.path.join(self.data_dir(), str(tenant_id), str(timeline_id)) + return os.path.join(self.data_dir, str(tenant_id), str(timeline_id)) def list_segments(self, tenant_id, timeline_id) -> List[str]: """ @@ -3856,6 +3890,35 @@ class Safekeeper: segments.sort() return segments + def checkpoint_up_to(self, tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn): + """ + Assuming pageserver(s) uploaded to s3 up to `lsn`, + 1) wait for remote_consistent_lsn and wal_backup_lsn on safekeeper to reach it. + 2) checkpoint timeline on safekeeper, which should remove WAL before this LSN. + """ + cli = self.http_client() + + def are_lsns_advanced(): + stat = cli.timeline_status(tenant_id, timeline_id) + log.info( + f"waiting for remote_consistent_lsn and backup_lsn on sk {self.id} to reach {lsn}, currently remote_consistent_lsn={stat.remote_consistent_lsn}, backup_lsn={stat.backup_lsn}" + ) + assert stat.remote_consistent_lsn >= lsn and stat.backup_lsn >= lsn.segment_lsn() + + # xxx: max wait is long because we might be waiting for reconnection from + # pageserver to this safekeeper + wait_until(30, 1, are_lsns_advanced) + cli.checkpoint(tenant_id, timeline_id) + + def wait_until_paused(self, failpoint: str): + msg = f"at failpoint {failpoint}" + + def paused(): + log.info(f"waiting for hitting failpoint {failpoint}") + self.assert_log_contains(msg) + + wait_until(20, 0.5, paused) + class S3Scrubber: def __init__(self, env: NeonEnvBuilder, log_dir: Optional[Path] = None): diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index 82148d0556..a5480f557f 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -177,6 +177,13 @@ class SafekeeperHttpClient(requests.Session): ) res.raise_for_status() + def checkpoint(self, tenant_id: TenantId, timeline_id: TimelineId): + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint", + json={}, + ) + res.raise_for_status() + # only_local doesn't remove segments in the remote storage. def timeline_delete( self, tenant_id: TenantId, timeline_id: TimelineId, only_local: bool = False diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 89e116df28..c05cb3e744 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -560,3 +560,25 @@ def assert_pageserver_backups_equal(left: Path, right: Path, skip_files: Set[str elapsed = time.time() - started_at log.info(f"assert_pageserver_backups_equal completed in {elapsed}s") + + +class PropagatingThread(threading.Thread): + _target: Any + _args: Any + _kwargs: Any + """ + Simple Thread wrapper with join() propagating the possible exception in the thread. 
+ """ + + def run(self): + self.exc = None + try: + self.ret = self._target(*self._args, **self._kwargs) + except BaseException as e: + self.exc = e + + def join(self, timeout=None): + super(PropagatingThread, self).join(timeout) + if self.exc: + raise self.exc + return self.ret diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index ea66eeff63..0c37711f7a 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -23,7 +23,6 @@ from fixtures.log_helper import log from fixtures.metrics import parse_metrics from fixtures.neon_fixtures import ( Endpoint, - NeonEnv, NeonEnvBuilder, NeonPageserver, PgBin, @@ -48,7 +47,7 @@ from fixtures.remote_storage import ( ) from fixtures.safekeeper.http import SafekeeperHttpClient from fixtures.safekeeper.utils import are_walreceivers_absent -from fixtures.utils import get_dir_size, query_scalar, start_in_background +from fixtures.utils import PropagatingThread, get_dir_size, query_scalar, start_in_background def wait_lsn_force_checkpoint( @@ -360,7 +359,7 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): # We will wait for first segment removal. Make sure they exist for starter. first_segments = [ - os.path.join(sk.data_dir(), str(tenant_id), str(timeline_id), "000000010000000000000001") + os.path.join(sk.data_dir, str(tenant_id), str(timeline_id), "000000010000000000000001") for sk in env.safekeepers ] assert all(os.path.exists(p) for p in first_segments) @@ -445,7 +444,7 @@ def is_flush_lsn_caught_up(sk: Safekeeper, tenant_id: TenantId, timeline_id: Tim def is_wal_trimmed(sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, target_size_mb): http_cli = sk.http_client() tli_status = http_cli.timeline_status(tenant_id, timeline_id) - sk_wal_size = get_dir_size(os.path.join(sk.data_dir(), str(tenant_id), str(timeline_id))) + sk_wal_size = get_dir_size(os.path.join(sk.data_dir, str(tenant_id), str(timeline_id))) sk_wal_size_mb = sk_wal_size / 1024 / 1024 log.info(f"Safekeeper id={sk.id} wal_size={sk_wal_size_mb:.2f}MB status={tli_status}") return sk_wal_size_mb <= target_size_mb @@ -591,10 +590,10 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder): # save the last (partial) file to put it back after recreation; others will be fetched from s3 sk = env.safekeepers[0] - tli_dir = Path(sk.data_dir()) / str(tenant_id) / str(timeline_id) + tli_dir = Path(sk.data_dir) / str(tenant_id) / str(timeline_id) f_partial = Path([f for f in os.listdir(tli_dir) if f.endswith(".partial")][0]) f_partial_path = tli_dir / f_partial - f_partial_saved = Path(sk.data_dir()) / f_partial.name + f_partial_saved = Path(sk.data_dir) / f_partial.name f_partial_path.rename(f_partial_saved) pg_version = sk.http_client().timeline_status(tenant_id, timeline_id).pg_version @@ -616,7 +615,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder): cli = sk.http_client() cli.timeline_create(tenant_id, timeline_id, pg_version, last_lsn) f_partial_path = ( - Path(sk.data_dir()) / str(tenant_id) / str(timeline_id) / f_partial_saved.name + Path(sk.data_dir) / str(tenant_id) / str(timeline_id) / f_partial_saved.name ) shutil.copy(f_partial_saved, f_partial_path) @@ -1631,7 +1630,7 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): with conn.cursor() as cur: cur.execute("CREATE TABLE t(key int primary key)") sk = env.safekeepers[0] - sk_data_dir = Path(sk.data_dir()) + sk_data_dir = Path(sk.data_dir) if not auth_enabled: sk_http = 
sk.http_client() sk_http_other = sk_http @@ -1724,9 +1723,6 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): def test_pull_timeline(neon_env_builder: NeonEnvBuilder): - def safekeepers_guc(env: NeonEnv, sk_names: List[int]) -> str: - return ",".join([f"localhost:{sk.port.pg}" for sk in env.safekeepers if sk.id in sk_names]) - def execute_payload(endpoint: Endpoint): with closing(endpoint.connect()) as conn: with conn.cursor() as cur: @@ -1812,6 +1808,67 @@ def test_pull_timeline(neon_env_builder: NeonEnvBuilder): show_statuses(env.safekeepers, tenant_id, timeline_id) +# Test pull_timeline while concurrently gc'ing WAL on safekeeper: +# 1) Start pull_timeline, listing files to fetch. +# 2) Write segment, do gc. +# 3) Finish pull_timeline. +# 4) Do some write, verify integrity with timeline_digest. +# Expected to fail while holding off WAL gc plus fetching commit_lsn WAL +# segment is not implemented. +@pytest.mark.xfail +def test_pull_timeline_gc(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage()) + env = neon_env_builder.init_start() + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + (src_sk, dst_sk) = (env.safekeepers[0], env.safekeepers[2]) + + log.info("use only first 2 safekeepers, 3rd will be seeded") + endpoint = env.endpoints.create("main") + endpoint.active_safekeepers = [1, 2] + endpoint.start() + endpoint.safe_psql("create table t(key int, value text)") + endpoint.safe_psql("insert into t select generate_series(1, 1000), 'pear'") + + src_flush_lsn = src_sk.get_flush_lsn(tenant_id, timeline_id) + log.info(f"flush_lsn on src before pull_timeline: {src_flush_lsn}") + + dst_http = dst_sk.http_client() + # run pull_timeline which will halt before downloading files + dst_http.configure_failpoints(("sk-pull-timeline-after-list-pausable", "pause")) + pt_handle = PropagatingThread( + target=dst_sk.pull_timeline, args=([src_sk], tenant_id, timeline_id) + ) + pt_handle.start() + dst_sk.wait_until_paused("sk-pull-timeline-after-list-pausable") + + # ensure segment exists + endpoint.safe_psql("insert into t select generate_series(1, 180000), 'papaya'") + lsn = last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) + assert lsn > Lsn("0/2000000") + # Checkpoint timeline beyond lsn. + src_sk.checkpoint_up_to(tenant_id, timeline_id, lsn) + first_segment_p = os.path.join( + src_sk.timeline_dir(tenant_id, timeline_id), "000000010000000000000001" + ) + log.info(f"first segment exist={os.path.exists(first_segment_p)}") + + dst_http.configure_failpoints(("sk-pull-timeline-after-list-pausable", "off")) + pt_handle.join() + + timeline_start_lsn = src_sk.get_timeline_start_lsn(tenant_id, timeline_id) + dst_flush_lsn = dst_sk.get_flush_lsn(tenant_id, timeline_id) + log.info(f"flush_lsn on dst after pull_timeline: {dst_flush_lsn}") + assert dst_flush_lsn >= src_flush_lsn + digests = [ + sk.http_client().timeline_digest(tenant_id, timeline_id, timeline_start_lsn, dst_flush_lsn) + for sk in [src_sk, dst_sk] + ] + assert digests[0] == digests[1], f"digest on src is {digests[0]} but on dst is {digests[1]}" + + # In this test we check for excessive START_REPLICATION and START_WAL_PUSH queries # when compute is active, but there are no writes to the timeline. 
In that case # pageserver should maintain a single connection to safekeeper and don't attempt From b2d34a82b909c102d81cd301c870ebe9b11aec86 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 24 May 2024 22:00:08 +0300 Subject: [PATCH 0878/1571] Make python Safekeeper datadir Path instead of str. --- test_runner/fixtures/neon_fixtures.py | 8 ++++---- test_runner/fixtures/utils.py | 2 +- test_runner/regress/test_wal_acceptor.py | 14 ++++++-------- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index c9d0acb967..b8ef63faa9 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3873,11 +3873,11 @@ class Safekeeper(LogUtils): return res @property - def data_dir(self) -> str: - return os.path.join(self.env.repo_dir, "safekeepers", f"sk{self.id}") + def data_dir(self) -> Path: + return self.env.repo_dir / "safekeepers" / f"sk{self.id}" - def timeline_dir(self, tenant_id, timeline_id) -> str: - return os.path.join(self.data_dir, str(tenant_id), str(timeline_id)) + def timeline_dir(self, tenant_id, timeline_id) -> Path: + return self.data_dir / str(tenant_id) / str(timeline_id) def list_segments(self, tenant_id, timeline_id) -> List[str]: """ diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index c05cb3e744..b55329e054 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -196,7 +196,7 @@ def query_scalar(cur: cursor, query: str) -> Any: # Traverse directory to get total size. -def get_dir_size(path: str) -> int: +def get_dir_size(path: Path) -> int: """Return size in bytes.""" totalbytes = 0 for root, _dirs, files in os.walk(path): diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 0c37711f7a..cff13e74ee 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -359,7 +359,7 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): # We will wait for first segment removal. Make sure they exist for starter. 
first_segments = [ - os.path.join(sk.data_dir, str(tenant_id), str(timeline_id), "000000010000000000000001") + sk.timeline_dir(tenant_id, timeline_id) / "000000010000000000000001" for sk in env.safekeepers ] assert all(os.path.exists(p) for p in first_segments) @@ -444,7 +444,7 @@ def is_flush_lsn_caught_up(sk: Safekeeper, tenant_id: TenantId, timeline_id: Tim def is_wal_trimmed(sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, target_size_mb): http_cli = sk.http_client() tli_status = http_cli.timeline_status(tenant_id, timeline_id) - sk_wal_size = get_dir_size(os.path.join(sk.data_dir, str(tenant_id), str(timeline_id))) + sk_wal_size = get_dir_size(sk.timeline_dir(tenant_id, timeline_id)) sk_wal_size_mb = sk_wal_size / 1024 / 1024 log.info(f"Safekeeper id={sk.id} wal_size={sk_wal_size_mb:.2f}MB status={tli_status}") return sk_wal_size_mb <= target_size_mb @@ -1131,8 +1131,8 @@ def cmp_sk_wal(sks: List[Safekeeper], tenant_id: TenantId, timeline_id: Timeline ) for f in mismatch: - f1 = os.path.join(sk0.timeline_dir(tenant_id, timeline_id), f) - f2 = os.path.join(sk.timeline_dir(tenant_id, timeline_id), f) + f1 = sk0.timeline_dir(tenant_id, timeline_id) / f + f2 = sk.timeline_dir(tenant_id, timeline_id) / f stdout_filename = f"{f2}.filediff" with open(stdout_filename, "w") as stdout_f: @@ -1630,7 +1630,7 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): with conn.cursor() as cur: cur.execute("CREATE TABLE t(key int primary key)") sk = env.safekeepers[0] - sk_data_dir = Path(sk.data_dir) + sk_data_dir = sk.data_dir if not auth_enabled: sk_http = sk.http_client() sk_http_other = sk_http @@ -1850,9 +1850,7 @@ def test_pull_timeline_gc(neon_env_builder: NeonEnvBuilder): assert lsn > Lsn("0/2000000") # Checkpoint timeline beyond lsn. src_sk.checkpoint_up_to(tenant_id, timeline_id, lsn) - first_segment_p = os.path.join( - src_sk.timeline_dir(tenant_id, timeline_id), "000000010000000000000001" - ) + first_segment_p = src_sk.timeline_dir(tenant_id, timeline_id) / "000000010000000000000001" log.info(f"first segment exist={os.path.exists(first_segment_p)}") dst_http.configure_failpoints(("sk-pull-timeline-after-list-pausable", "off")) From d61e9241035c3d03eb90a5722b0d38769f92e18a Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 27 May 2024 15:57:57 +0300 Subject: [PATCH 0879/1571] Fix connect to PS on MacOS/X (#7885) ## Problem After [0e4f1826805d040a23c25c54f3993a942755dbc2] which introduce async connect Neon is not able to connect to page server. ## Summary of changes Perform sync commit at MacOS/X ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik --- pgxn/neon/libpagestore.c | 89 +++++++++++++++++++------------------- pgxn/neon/pagestore_smgr.c | 10 ++--- 2 files changed, 50 insertions(+), 49 deletions(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index a9c8d59c3a..5eae2d8204 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -125,13 +125,6 @@ typedef struct * - WL_EXIT_ON_PM_DEATH. 
*/ WaitEventSet *wes_read; - /*--- - * WaitEventSet containing: - * - WL_SOCKET_WRITABLE on 'conn' - * - WL_LATCH_SET on MyLatch, and - * - WL_EXIT_ON_PM_DEATH. - */ - WaitEventSet *wes_write; } PageServer; static PageServer page_servers[MAX_SHARDS]; @@ -336,11 +329,6 @@ CLEANUP_AND_DISCONNECT(PageServer *shard) FreeWaitEventSet(shard->wes_read); shard->wes_read = NULL; } - if (shard->wes_write) - { - FreeWaitEventSet(shard->wes_write); - shard->wes_write = NULL; - } if (shard->conn) { PQfinish(shard->conn); @@ -436,22 +424,6 @@ pageserver_connect(shardno_t shard_no, int elevel) return false; } - shard->wes_read = CreateWaitEventSet(TopMemoryContext, 3); - AddWaitEventToSet(shard->wes_read, WL_LATCH_SET, PGINVALID_SOCKET, - MyLatch, NULL); - AddWaitEventToSet(shard->wes_read, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, - NULL, NULL); - AddWaitEventToSet(shard->wes_read, WL_SOCKET_READABLE, PQsocket(shard->conn), NULL, NULL); - - shard->wes_write = CreateWaitEventSet(TopMemoryContext, 3); - AddWaitEventToSet(shard->wes_write, WL_LATCH_SET, PGINVALID_SOCKET, - MyLatch, NULL); - AddWaitEventToSet(shard->wes_write, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, - NULL, NULL); - AddWaitEventToSet(shard->wes_write, WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE, - PQsocket(shard->conn), - NULL, NULL); - shard->state = PS_Connecting_Startup; /* fallthrough */ } @@ -460,13 +432,12 @@ pageserver_connect(shardno_t shard_no, int elevel) char *pagestream_query; int ps_send_query_ret; bool connected = false; - + int poll_result = PGRES_POLLING_WRITING; neon_shard_log(shard_no, DEBUG5, "Connection state: Connecting_Startup"); do { WaitEvent event; - int poll_result = PQconnectPoll(shard->conn); switch (poll_result) { @@ -497,25 +468,45 @@ pageserver_connect(shardno_t shard_no, int elevel) } case PGRES_POLLING_READING: /* Sleep until there's something to do */ - (void) WaitEventSetWait(shard->wes_read, -1L, &event, 1, - PG_WAIT_EXTENSION); - ResetLatch(MyLatch); - - /* query cancellation, backend shutdown */ - CHECK_FOR_INTERRUPTS(); - + while (true) + { + int rc = WaitLatchOrSocket(MyLatch, + WL_EXIT_ON_PM_DEATH | WL_LATCH_SET | WL_SOCKET_READABLE, + PQsocket(shard->conn), + 0, + PG_WAIT_EXTENSION); + elog(DEBUG5, "PGRES_POLLING_READING=>%d", rc); + if (rc & WL_LATCH_SET) + { + ResetLatch(MyLatch); + /* query cancellation, backend shutdown */ + CHECK_FOR_INTERRUPTS(); + } + if (rc & WL_SOCKET_READABLE) + break; + } /* PQconnectPoll() handles the socket polling state updates */ break; case PGRES_POLLING_WRITING: /* Sleep until there's something to do */ - (void) WaitEventSetWait(shard->wes_write, -1L, &event, 1, - PG_WAIT_EXTENSION); - ResetLatch(MyLatch); - - /* query cancellation, backend shutdown */ - CHECK_FOR_INTERRUPTS(); - + while (true) + { + int rc = WaitLatchOrSocket(MyLatch, + WL_EXIT_ON_PM_DEATH | WL_LATCH_SET | WL_SOCKET_WRITEABLE, + PQsocket(shard->conn), + 0, + PG_WAIT_EXTENSION); + elog(DEBUG5, "PGRES_POLLING_WRITING=>%d", rc); + if (rc & WL_LATCH_SET) + { + ResetLatch(MyLatch); + /* query cancellation, backend shutdown */ + CHECK_FOR_INTERRUPTS(); + } + if (rc & WL_SOCKET_WRITEABLE) + break; + } /* PQconnectPoll() handles the socket polling state updates */ break; @@ -524,12 +515,22 @@ pageserver_connect(shardno_t shard_no, int elevel) connected = true; break; } + poll_result = PQconnectPoll(shard->conn); + elog(DEBUG5, "PQconnectPoll=>%d", poll_result); } while (!connected); /* No more polling needed; connection succeeded */ shard->last_connect_time = GetCurrentTimestamp(); + shard->wes_read = 
CreateWaitEventSet(TopMemoryContext, 3); + AddWaitEventToSet(shard->wes_read, WL_LATCH_SET, PGINVALID_SOCKET, + MyLatch, NULL); + AddWaitEventToSet(shard->wes_read, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, + NULL, NULL); + AddWaitEventToSet(shard->wes_read, WL_SOCKET_READABLE, PQsocket(shard->conn), NULL, NULL); + + switch (neon_protocol_version) { case 2: diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index ac505fe6fb..0e4d210be8 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -584,9 +584,9 @@ prefetch_read(PrefetchRequest *slot) slot->response != NULL || slot->my_ring_index != MyPState->ring_receive) neon_shard_log(slot->shard_no, ERROR, - "Incorrect prefetch read: status=%d response=%llx my=%llu receive=%llu", - slot->status, (size_t) (void *) slot->response, - slot->my_ring_index, MyPState->ring_receive); + "Incorrect prefetch read: status=%d response=%p my=%lu receive=%lu", + slot->status, slot->response, + (long)slot->my_ring_index, (long)MyPState->ring_receive); old = MemoryContextSwitchTo(MyPState->errctx); response = (NeonResponse *) page_server->receive(slot->shard_no); @@ -606,8 +606,8 @@ prefetch_read(PrefetchRequest *slot) else { neon_shard_log(slot->shard_no, WARNING, - "No response from reading prefetch entry %llu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect", - slot->my_ring_index, + "No response from reading prefetch entry %lu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect", + (long)slot->my_ring_index, RelFileInfoFmt(BufTagGetNRelFileInfo(slot->buftag)), slot->buftag.forkNum, slot->buftag.blockNum); return false; From 4a0ce9512b5eb26b636006cda2488411d07bfc03 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 27 May 2024 17:35:46 +0300 Subject: [PATCH 0880/1571] Add safekeeper test truncating WAL. We do it as a part of more complicated tests like test_compute_restarts, but let's have a simple test as well. --- .../regress/test_wal_acceptor_async.py | 58 +++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index b5d86de574..715d22eed8 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -531,6 +531,64 @@ def test_recovery_uncommitted(neon_env_builder: NeonEnvBuilder): asyncio.run(run_recovery_uncommitted(env)) +async def run_wal_truncation(env: NeonEnv): + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + (sk1, sk2, sk3) = env.safekeepers + + ep = env.endpoints.create_start("main") + ep.safe_psql("create table t (key int, value text)") + ep.safe_psql("insert into t select generate_series(1, 100), 'payload'") + + # insert with only one sk3 up to create tail of flushed but not committed WAL on it + sk1.stop() + sk2.stop() + conn = await ep.connect_async() + # query should hang, so execute in separate task + bg_query = asyncio.create_task( + conn.execute("insert into t select generate_series(1, 180000), 'Papaya'") + ) + sleep_sec = 2 + await asyncio.sleep(sleep_sec) + # it must still be not finished + assert not bg_query.done() + # note: destoy will kill compute_ctl, preventing it waiting for hanging sync-safekeepers. 
+ ep.stop_and_destroy() + + # stop sk3 as well + sk3.stop() + + # now start sk1 and sk2 and make them commit something + sk1.start() + sk2.start() + ep = env.endpoints.create_start( + "main", + ) + ep.safe_psql("insert into t select generate_series(1, 200), 'payload'") + + # start sk3 and wait for it to catch up + sk3.start() + flush_lsn = Lsn(ep.safe_psql_scalar("SELECT pg_current_wal_flush_lsn()")) + await wait_for_lsn(sk3, tenant_id, timeline_id, flush_lsn) + + timeline_start_lsn = sk1.get_timeline_start_lsn(tenant_id, timeline_id) + digests = [ + sk.http_client().timeline_digest(tenant_id, timeline_id, timeline_start_lsn, flush_lsn) + for sk in [sk1, sk2] + ] + assert digests[0] == digests[1], f"digest on sk1 is {digests[0]} but on sk3 is {digests[1]}" + + +# Simple deterministic test creating tail of WAL on safekeeper which is +# truncated when majority without this sk elects walproposer starting earlier. +def test_wal_truncation(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + asyncio.run(run_wal_truncation(env)) + + async def run_segment_init_failure(env: NeonEnv): env.neon_cli.create_branch("test_segment_init_failure") ep = env.endpoints.create_start("test_segment_init_failure") From fabeff822fac24b7ba45214e907295003874252a Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Tue, 28 May 2024 13:05:33 +0200 Subject: [PATCH 0881/1571] Performance test for pgvector HNSW index build and queries (#7873) ## Problem We want to regularly verify the performance of pgvector HNSW parallel index builds and parallel similarity search using HNSW indexes. The first release that considerably improved the index-build parallelism was pgvector 0.7.0 and we want to make sure that we do not regress by our neon compute VM settings (swap, memory over commit, pg conf etc.) ## Summary of changes Prepare a Neon project with 1 million openAI vector embeddings (vector size 1536). Run HNSW indexing operations in the regression test for the various distance metrics. Run similarity queries using pgbench with 100 concurrent clients. I have also added the relevant metrics to the grafana dashboards pgbench and olape --------- Co-authored-by: Alexander Bayandin --- .github/workflows/benchmarking.yml | 100 +++++++++++++++++- pyproject.toml | 1 + .../performance/pgvector/HNSW_build.sql | 47 ++++++++ .../performance/pgvector/IVFFLAT_build.sql | 52 +++++++++ test_runner/performance/pgvector/README.md | 38 +++++++ test_runner/performance/pgvector/loaddata.py | 72 +++++++++++++ ...ch_custom_script_pgvector_hsnw_queries.sql | 10 ++ .../pgvector/pgbench_hnsw_queries.sql | 13 +++ test_runner/performance/test_perf_olap.py | 34 ++++++ test_runner/performance/test_perf_pgbench.py | 31 ++++++ 10 files changed, 395 insertions(+), 3 deletions(-) create mode 100644 test_runner/performance/pgvector/HNSW_build.sql create mode 100644 test_runner/performance/pgvector/IVFFLAT_build.sql create mode 100644 test_runner/performance/pgvector/README.md create mode 100644 test_runner/performance/pgvector/loaddata.py create mode 100644 test_runner/performance/pgvector/pgbench_custom_script_pgvector_hsnw_queries.sql create mode 100644 test_runner/performance/pgvector/pgbench_hnsw_queries.sql diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 1eaf05cd54..d5a375d704 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -38,6 +38,11 @@ on: description: 'AWS-RDS and AWS-AURORA normally only run on Saturday. 
Set this to true to run them on every workflow_dispatch' required: false default: false + run_only_pgvector_tests: + type: boolean + description: 'Run pgvector tests but no other tests. If not set, all tests including pgvector tests will be run' + required: false + default: false defaults: run: @@ -50,6 +55,7 @@ concurrency: jobs: bench: + if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} env: TEST_PG_BENCH_DURATIONS_MATRIX: "300" TEST_PG_BENCH_SCALES_MATRIX: "10,100" @@ -120,6 +126,7 @@ jobs: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} generate-matrices: + if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} # Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday) # # Available platforms: @@ -197,6 +204,7 @@ jobs: echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT pgbench-compare: + if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} needs: [ generate-matrices ] strategy: @@ -343,6 +351,92 @@ jobs: env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + pgbench-pgvector: + env: + TEST_PG_BENCH_DURATIONS_MATRIX: "15m" + TEST_PG_BENCH_SCALES_MATRIX: "1" + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install + DEFAULT_PG_VERSION: 16 + TEST_OUTPUT: /tmp/test_output + BUILD_TYPE: remote + SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} + PLATFORM: "neon-captest-pgvector" + + runs-on: [ self-hosted, us-east-2, x64 ] + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned + options: --init + + steps: + - uses: actions/checkout@v4 + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-release-artifact + path: /tmp/neon/ + prefix: latest + + - name: Add Postgres binaries to PATH + run: | + ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version + echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH + + - name: Set up Connection String + id: set-up-connstr + run: | + CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }} + + echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT + + QUERIES=("SELECT version()") + QUERIES+=("SHOW neon.tenant_id") + QUERIES+=("SHOW neon.timeline_id") + + for q in "${QUERIES[@]}"; do + psql ${CONNSTR} -c "${q}" + done + + - name: Benchmark pgvector hnsw indexing + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance/test_perf_olap.py + run_in_parallel: false + save_perf_report: ${{ env.SAVE_PERF_REPORT }} + extra_params: -m remote_cluster --timeout 21600 -k test_pgvector_indexing + env: + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} + + - name: Benchmark pgvector hnsw queries + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance + run_in_parallel: false + save_perf_report: ${{ env.SAVE_PERF_REPORT }} + extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_pgvector + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR 
}}" + + - name: Create Allure report + if: ${{ !cancelled() }} + uses: ./.github/actions/allure-report-generate + + - name: Post to a Slack channel + if: ${{ github.event.schedule && failure() }} + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C033QLM5P7D" # dev-staging-stream + slack-message: "Periodic perf testing neon-captest-pgvector: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + + clickbench-compare: # ClichBench DB for rds-aurora and rds-Postgres deployed to the same clusters # we use for performance testing in pgbench-compare. @@ -351,7 +445,7 @@ jobs: # # *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows # *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB - if: ${{ !cancelled() }} + if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} needs: [ generate-matrices, pgbench-compare ] strategy: @@ -455,7 +549,7 @@ jobs: # We might change it after https://github.com/neondatabase/neon/issues/2900. # # *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB) - if: ${{ !cancelled() }} + if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} needs: [ generate-matrices, clickbench-compare ] strategy: @@ -557,7 +651,7 @@ jobs: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} user-examples-compare: - if: ${{ !cancelled() }} + if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} needs: [ generate-matrices, tpch-compare ] strategy: diff --git a/pyproject.toml b/pyproject.toml index 131d1121f7..c7f1a07512 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,6 +54,7 @@ build-backend = "poetry.core.masonry.api" exclude = [ "^vendor/", "^target/", + "test_runner/performance/pgvector/loaddata.py", ] check_untyped_defs = true # Help mypy find imports when running against list of individual files. 
diff --git a/test_runner/performance/pgvector/HNSW_build.sql b/test_runner/performance/pgvector/HNSW_build.sql new file mode 100644 index 0000000000..9e6918b755 --- /dev/null +++ b/test_runner/performance/pgvector/HNSW_build.sql @@ -0,0 +1,47 @@ + +\set ECHO queries +\timing + +-- prepare test table +DROP TABLE IF EXISTS hnsw_test_table; +CREATE TABLE hnsw_test_table AS TABLE documents WITH NO DATA; +INSERT INTO hnsw_test_table SELECT * FROM documents; +CREATE INDEX ON hnsw_test_table (_id); -- needed later for random tuple queries +-- tune index build params +SET max_parallel_maintenance_workers = 7; +SET maintenance_work_mem = '8GB'; +-- create HNSW index for the supported distance metrics +CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_cosine_ops); +CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_ip_ops); +CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_l1_ops); +CREATE INDEX ON hnsw_test_table USING hnsw ((binary_quantize(embeddings)::bit(1536)) bit_hamming_ops); +CREATE INDEX ON hnsw_test_table USING hnsw ((binary_quantize(embeddings)::bit(1536)) bit_jaccard_ops); +-- note: in a second psql session we can monitor the progress of the index build phases using +-- the following query: +-- SELECT phase, round(100.0 * blocks_done / nullif(blocks_total, 0), 1) AS "%" FROM pg_stat_progress_create_index; + +-- show all indexes built on the table +SELECT + idx.relname AS index_name, + tbl.relname AS table_name, + am.amname AS access_method, + a.attname AS column_name, + opc.opcname AS operator_class +FROM + pg_index i +JOIN + pg_class idx ON idx.oid = i.indexrelid +JOIN + pg_class tbl ON tbl.oid = i.indrelid +JOIN + pg_am am ON am.oid = idx.relam +JOIN + pg_attribute a ON a.attrelid = tbl.oid AND a.attnum = ANY(i.indkey) +JOIN + pg_opclass opc ON opc.oid = i.indclass[0] +WHERE + tbl.relname = 'hnsw_test_table' + AND a.attname = 'embeddings'; + +-- show table sizes +\dt+ diff --git a/test_runner/performance/pgvector/IVFFLAT_build.sql b/test_runner/performance/pgvector/IVFFLAT_build.sql new file mode 100644 index 0000000000..338980831a --- /dev/null +++ b/test_runner/performance/pgvector/IVFFLAT_build.sql @@ -0,0 +1,52 @@ + +\set ECHO queries +\timing + +-- prepare test table +DROP TABLE IF EXISTS ivfflat_test_table; +CREATE TABLE ivfflat_test_table AS TABLE documents WITH NO DATA; +INSERT INTO ivfflat_test_table SELECT * FROM documents; +CREATE INDEX ON ivfflat_test_table (_id); -- needed later for random tuple queries +-- tune index build params +SET max_parallel_maintenance_workers = 7; +SET maintenance_work_mem = '8GB'; +-- create ivfflat index for the supported distance metrics +-- the formulat for lists is # rows / 1000 or sqrt(# rows) if # rows > 1 million +-- we have 1 million embeddings of vector size 1536 in column embeddings of table documents +-- so we use 1000 lists +CREATE INDEX ON ivfflat_test_table USING ivfflat (embeddings vector_l2_ops) WITH (lists = 1000); +CREATE INDEX ON ivfflat_test_table USING ivfflat (embeddings vector_ip_ops) WITH (lists = 1000); +CREATE INDEX ON ivfflat_test_table USING ivfflat (embeddings vector_cosine_ops) WITH (lists = 1000); +CREATE INDEX ON ivfflat_test_table USING ivfflat (embeddings::halfvec(1536) halfvec_l2_ops) WITH (lists = 1000); +CREATE INDEX ON ivfflat_test_table + USING ivfflat ((binary_quantize(embeddings)::bit(1536)) bit_hamming_ops) WITH (lists = 1000); + +\d ivfflat_test_table + + +-- show all indexes built on the table +SELECT + idx.relname AS index_name, + tbl.relname AS table_name, + 
am.amname AS access_method, + a.attname AS column_name, + opc.opcname AS operator_class +FROM + pg_index i +JOIN + pg_class idx ON idx.oid = i.indexrelid +JOIN + pg_class tbl ON tbl.oid = i.indrelid +JOIN + pg_am am ON am.oid = idx.relam +JOIN + pg_attribute a ON a.attrelid = tbl.oid AND a.attnum = ANY(i.indkey) +JOIN + pg_opclass opc ON opc.oid = i.indclass[0] +WHERE + tbl.relname = 'ivfflat_test_table' + AND a.attname = 'embeddings'; +-- show table sizes +\dt+ + + diff --git a/test_runner/performance/pgvector/README.md b/test_runner/performance/pgvector/README.md new file mode 100644 index 0000000000..c55db12e74 --- /dev/null +++ b/test_runner/performance/pgvector/README.md @@ -0,0 +1,38 @@ +--- +dataset_info: + features: + - name: _id + dtype: string + - name: title + dtype: string + - name: text + dtype: string + - name: text-embedding-3-large-1536-embedding + sequence: float64 + splits: + - name: train + num_bytes: 12679725776 + num_examples: 1000000 + download_size: 9551862565 + dataset_size: 12679725776 +configs: +- config_name: default + data_files: + - split: train + path: data/train-* +license: mit +task_categories: +- feature-extraction +language: +- en +size_categories: +- 1M ") + + +def main(conn_str, directory_path): + # Connection to PostgreSQL + with psycopg2.connect(conn_str) as conn: + with conn.cursor() as cursor: + # Run SQL statements + cursor.execute("CREATE EXTENSION IF NOT EXISTS vector;") + register_vector(conn) + cursor.execute("DROP TABLE IF EXISTS documents;") + cursor.execute( + """ + CREATE TABLE documents ( + _id TEXT PRIMARY KEY, + title TEXT, + text TEXT, + embeddings vector(1536) -- text-embedding-3-large-1536-embedding (OpenAI) + ); + """ + ) + conn.commit() + + # List and sort Parquet files + parquet_files = sorted(Path(directory_path).glob("*.parquet")) + + for file in parquet_files: + print(f"Loading {file} into PostgreSQL") + df = pd.read_parquet(file) + + print(df.head()) + + data_list = [ + ( + row["_id"], + row["title"], + row["text"], + np.array(row["text-embedding-3-large-1536-embedding"]), + ) + for index, row in df.iterrows() + ] + # Use execute_values to perform batch insertion + execute_values( + cursor, + "INSERT INTO documents (_id, title, text, embeddings) VALUES %s", + data_list, + ) + # Commit after we insert all embeddings + conn.commit() + + print(f"Loaded {file} into PostgreSQL") + + +if __name__ == "__main__": + if len(sys.argv) != 3: + print_usage() + sys.exit(1) + + conn_str = sys.argv[1] + directory_path = sys.argv[2] + main(conn_str, directory_path) diff --git a/test_runner/performance/pgvector/pgbench_custom_script_pgvector_hsnw_queries.sql b/test_runner/performance/pgvector/pgbench_custom_script_pgvector_hsnw_queries.sql new file mode 100644 index 0000000000..886ae9645b --- /dev/null +++ b/test_runner/performance/pgvector/pgbench_custom_script_pgvector_hsnw_queries.sql @@ -0,0 +1,10 @@ +with x (x) as ( + select "embeddings" as x + from hnsw_test_table + TABLESAMPLE SYSTEM (1) + LIMIT 1 +) +SELECT title, "embeddings" <=> (select x from x) as distance +FROM hnsw_test_table +ORDER BY 2 +LIMIT 30; diff --git a/test_runner/performance/pgvector/pgbench_hnsw_queries.sql b/test_runner/performance/pgvector/pgbench_hnsw_queries.sql new file mode 100644 index 0000000000..5034063c1b --- /dev/null +++ b/test_runner/performance/pgvector/pgbench_hnsw_queries.sql @@ -0,0 +1,13 @@ +-- run with pooled connection +-- pgbench -T 300 -c 100 -j20 -f pgbench_hnsw_queries.sql 
-postgresql://neondb_owner:@ep-floral-thunder-w1gzhaxi-pooler.eu-west-1.aws.neon.build/neondb?sslmode=require" + +with x (x) as ( + select "embeddings" as x + from hnsw_test_table + TABLESAMPLE SYSTEM (1) + LIMIT 1 +) +SELECT title, "embeddings" <=> (select x from x) as distance +FROM hnsw_test_table +ORDER BY 2 +LIMIT 30; diff --git a/test_runner/performance/test_perf_olap.py b/test_runner/performance/test_perf_olap.py index 8a9509ea44..2367676e67 100644 --- a/test_runner/performance/test_perf_olap.py +++ b/test_runner/performance/test_perf_olap.py @@ -100,6 +100,25 @@ QUERIES: Tuple[LabelledQuery, ...] = ( ) # fmt: on +# A list of pgvector HNSW index builds to run. +# Please do not alter the label for the query, as it is used to identify it. +# +# Disable auto formatting for the list of queries so that it's easier to read +# fmt: off +PGVECTOR_QUERIES: Tuple[LabelledQuery, ...] = ( + LabelledQuery("PGV0", r"DROP TABLE IF EXISTS hnsw_test_table;"), + LabelledQuery("PGV1", r"CREATE TABLE hnsw_test_table AS TABLE documents WITH NO DATA;"), + LabelledQuery("PGV2", r"INSERT INTO hnsw_test_table SELECT * FROM documents;"), + LabelledQuery("PGV3", r"CREATE INDEX ON hnsw_test_table (_id);"), + LabelledQuery("PGV4", r"CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_cosine_ops);"), + LabelledQuery("PGV5", r"CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_ip_ops);"), + LabelledQuery("PGV6", r"CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_l1_ops);"), + LabelledQuery("PGV7", r"CREATE INDEX ON hnsw_test_table USING hnsw ((binary_quantize(embeddings)::bit(1536)) bit_hamming_ops);"), + LabelledQuery("PGV8", r"CREATE INDEX ON hnsw_test_table USING hnsw ((binary_quantize(embeddings)::bit(1536)) bit_jaccard_ops);"), +) +# fmt: on + + EXPLAIN_STRING: str = "EXPLAIN (ANALYZE, VERBOSE, BUFFERS, COSTS, SETTINGS, FORMAT JSON)" @@ -245,3 +264,18 @@ def test_clickbench_collect_pg_stat_statements(remote_compare: RemoteCompare): log.info("Collecting pg_stat_statements") query = LabelledQuery("Q_COLLECT_PG_STAT_STATEMENTS", r"SELECT * from pg_stat_statements;") run_psql(remote_compare, query, times=1, explain=False) + + +@pytest.mark.parametrize("query", PGVECTOR_QUERIES) +@pytest.mark.remote_cluster +def test_pgvector_indexing(query: LabelledQuery, remote_compare: RemoteCompare): + """ + An pgvector test that tests HNSW index build performance and parallelism. + + The DB prepared manually in advance. 
+ See + - test_runner/performance/pgvector/README.md + - test_runner/performance/pgvector/loaddata.py + - test_runner/performance/pgvector/HNSW_build.sql + """ + run_psql(remote_compare, query, times=1, explain=False) diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py index 2b8760dff2..d756d6eeca 100644 --- a/test_runner/performance/test_perf_pgbench.py +++ b/test_runner/performance/test_perf_pgbench.py @@ -17,6 +17,7 @@ class PgBenchLoadType(enum.Enum): INIT = "init" SIMPLE_UPDATE = "simple-update" SELECT_ONLY = "select-only" + PGVECTOR_HNSW = "pgvector-hnsw" def utc_now_timestamp() -> int: @@ -132,6 +133,26 @@ def run_test_pgbench(env: PgCompare, scale: int, duration: int, workload_type: P password=password, ) + if workload_type == PgBenchLoadType.PGVECTOR_HNSW: + # Run simple-update workload + run_pgbench( + env, + "pgvector-hnsw", + [ + "pgbench", + "-f", + "test_runner/performance/pgvector/pgbench_custom_script_pgvector_hsnw_queries.sql", + "-c100", + "-j20", + f"-T{duration}", + "-P2", + "--protocol=prepared", + "--progress-timestamp", + connstr, + ], + password=password, + ) + env.report_size() @@ -201,3 +222,13 @@ def test_pgbench_remote_simple_update(remote_compare: PgCompare, scale: int, dur @pytest.mark.remote_cluster def test_pgbench_remote_select_only(remote_compare: PgCompare, scale: int, duration: int): run_test_pgbench(remote_compare, scale, duration, PgBenchLoadType.SELECT_ONLY) + + +# The following test runs on an existing database that has pgvector extension installed +# and a table with 1 million embedding vectors loaded and indexed with HNSW. +# +# Run this pgbench tests against an existing remote Postgres cluster with the necessary setup. +@pytest.mark.parametrize("duration", get_durations_matrix()) +@pytest.mark.remote_cluster +def test_pgbench_remote_pgvector(remote_compare: PgCompare, duration: int): + run_test_pgbench(remote_compare, 1, duration, PgBenchLoadType.PGVECTOR_HNSW) From f9f69a2ee7fb11b9d713bd3f2c50c5be516253c9 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Tue, 28 May 2024 16:21:09 +0200 Subject: [PATCH 0882/1571] clarify how to load the dbpedia vector embeddings into a postgres database (#7894) ## Problem Improve the readme for the data load step in the pgvector performance test. --- test_runner/performance/pgvector/README.md | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/test_runner/performance/pgvector/README.md b/test_runner/performance/pgvector/README.md index c55db12e74..83495d270a 100644 --- a/test_runner/performance/pgvector/README.md +++ b/test_runner/performance/pgvector/README.md @@ -1,3 +1,20 @@ +# Source of the dataset for pgvector tests + +This readme was copied from https://huggingface.co/datasets/Qdrant/dbpedia-entities-openai3-text-embedding-3-large-1536-1M + +## Download the parquet files + +```bash +brew install git-lfs +git-lfs clone https://huggingface.co/datasets/Qdrant/dbpedia-entities-openai3-text-embedding-3-large-1536-1M +``` + +## Load into postgres: + +see loaddata.py in this directory + +## Rest of dataset card as on huggingface + --- dataset_info: features: @@ -35,4 +52,4 @@ size_categories: - Created: February 2024. 
- Text used for Embedding: title (string) + text (string) - Embedding Model: OpenAI text-embedding-3-large -- This dataset was generated from the first 1M entries of https://huggingface.co/datasets/BeIR/dbpedia-entity, extracted by @KShivendu_ [here](https://huggingface.co/datasets/KShivendu/dbpedia-entities-openai-1M) \ No newline at end of file +- This dataset was generated from the first 1M entries of https://huggingface.co/datasets/BeIR/dbpedia-entity, extracted by @KShivendu_ \ No newline at end of file From 352b08d0be56c73ba9017a82cd6496aea7ba5758 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 28 May 2024 16:06:47 +0100 Subject: [PATCH 0883/1571] pageserver: fix a warning on secondary mode downloads after evictions (#7877) ## Problem In 4ce6e2d2fc we added a warning when progress stats don't look right at the end of a secondary download pass. This `Correcting drift in progress stats` warning fired in staging on a pageserver that had been doing some disk usage eviction. The impact is low because in the same place we log the warning, we also fix up the progress values. ## Summary of changes - When we skip downloading a layer because it was recently evicted, update the progress stats to ensure they still reach a clean complete state at the end of a download pass. - Also add a log for evicting secondary location layers, for symmetry with attached locations, so that we can clearly see when eviction has happened for a particular tenant's layers when investigating issues. This is a point fix -- the code would also benefit from being refactored so that there is some "download result" type with a Skip variant, to ensure that we are updating the progress stats uniformly for those cases. --- pageserver/src/tenant/secondary.rs | 1 + pageserver/src/tenant/secondary/downloader.rs | 18 +++++++++++------- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index 252b6eb11b..af6840f525 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -187,6 +187,7 @@ impl SecondaryTenant { }; let now = SystemTime::now(); + tracing::info!("Evicting secondary layer"); let this = self.clone(); diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 0ec1bd649b..5c915d6b53 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -909,6 +909,7 @@ impl<'a> TenantDownloader<'a> { strftime(&layer.access_time), strftime(evicted_at) ); + self.skip_layer(layer); continue; } } @@ -963,6 +964,15 @@ impl<'a> TenantDownloader<'a> { Ok(()) } + /// Call this during timeline download if a layer will _not_ be downloaded, to update progress statistics + fn skip_layer(&self, layer: HeatMapLayer) { + let mut progress = self.secondary_state.progress.lock().unwrap(); + progress.layers_total = progress.layers_total.saturating_sub(1); + progress.bytes_total = progress + .bytes_total + .saturating_sub(layer.metadata.file_size); + } + async fn download_layer( &self, tenant_shard_id: &TenantShardId, @@ -1012,13 +1022,7 @@ impl<'a> TenantDownloader<'a> { "Skipped downloading missing layer {}, raced with compaction/gc?", layer.name ); - - // If the layer is 404, adjust the progress statistics to reflect that we will not download it. 
- let mut progress = self.secondary_state.progress.lock().unwrap(); - progress.layers_total = progress.layers_total.saturating_sub(1); - progress.bytes_total = progress - .bytes_total - .saturating_sub(layer.metadata.file_size); + self.skip_layer(layer); return Ok(None); } From 14df69d0e38c2ab3e1b8f1bef4a6981c842fd913 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 28 May 2024 17:40:52 +0200 Subject: [PATCH 0884/1571] Drop postgres-native-tls in favour of tokio-postgres-rustls (#7883) Get rid of postgres-native-tls and openssl in favour of rustls in our dependency tree. Do further steps to completely remove native-tls and openssl. Among other advantages, this allows us to do static musl builds more easily: #7889 --- Cargo.lock | 154 ++------------------ Cargo.toml | 15 +- deny.toml | 7 + proxy/Cargo.toml | 5 +- proxy/src/compute.rs | 84 +++++++++-- s3_scrubber/Cargo.toml | 6 +- s3_scrubber/src/scan_safekeeper_metadata.rs | 20 ++- workspace_hack/Cargo.toml | 4 +- 8 files changed, 124 insertions(+), 171 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d8f9021eb8..b1a307dd19 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -776,7 +776,6 @@ dependencies = [ "pin-project", "serde", "time", - "tz-rs", "url", "uuid", ] @@ -1291,12 +1290,6 @@ dependencies = [ "tiny-keccak", ] -[[package]] -name = "const_fn" -version = "0.4.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fbdcdcb6d86f71c5e97409ad45898af11cbc995b4ee8112d59095a28d376c935" - [[package]] name = "const_format" version = "0.2.30" @@ -1976,21 +1969,6 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" -[[package]] -name = "foreign-types" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" -dependencies = [ - "foreign-types-shared", -] - -[[package]] -name = "foreign-types-shared" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" - [[package]] name = "form_urlencoded" version = "1.1.0" @@ -2620,19 +2598,6 @@ dependencies = [ "tokio-io-timeout", ] -[[package]] -name = "hyper-tls" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905" -dependencies = [ - "bytes", - "hyper 0.14.26", - "native-tls", - "tokio", - "tokio-native-tls", -] - [[package]] name = "hyper-util" version = "0.1.3" @@ -3168,24 +3133,6 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" -[[package]] -name = "native-tls" -version = "0.2.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e" -dependencies = [ - "lazy_static", - "libc", - "log", - "openssl", - "openssl-probe", - "openssl-sys", - "schannel", - "security-framework", - "security-framework-sys", - "tempfile", -] - [[package]] name = "nix" version = "0.25.1" @@ -3356,15 +3303,6 @@ dependencies = [ "libc", ] -[[package]] -name = "num_threads" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2819ce041d2ee131036f4fc9d6ae7ae125a3a40e97ba64d04fe799ad9dabbb44" 
-dependencies = [ - "libc", -] - [[package]] name = "oauth2" version = "4.4.2" @@ -3414,50 +3352,12 @@ version = "11.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" -[[package]] -name = "openssl" -version = "0.10.60" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79a4c6c3a2b158f7f8f2a2fc5a969fa3a068df6fc9dbb4a43845436e3af7c800" -dependencies = [ - "bitflags 2.4.1", - "cfg-if", - "foreign-types", - "libc", - "once_cell", - "openssl-macros", - "openssl-sys", -] - -[[package]] -name = "openssl-macros" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.52", -] - [[package]] name = "openssl-probe" version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" -[[package]] -name = "openssl-sys" -version = "0.9.96" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3812c071ba60da8b5677cc12bcb1d42989a65553772897a7e0355545a819838f" -dependencies = [ - "cc", - "libc", - "pkg-config", - "vcpkg", -] - [[package]] name = "opentelemetry" version = "0.20.0" @@ -4105,17 +4005,6 @@ dependencies = [ "tokio-postgres", ] -[[package]] -name = "postgres-native-tls" -version = "0.5.0" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" -dependencies = [ - "native-tls", - "tokio", - "tokio-native-tls", - "tokio-postgres", -] - [[package]] name = "postgres-protocol" version = "0.6.4" @@ -4423,7 +4312,6 @@ dependencies = [ "md5", "measured", "metrics", - "native-tls", "once_cell", "opentelemetry", "parking_lot 0.12.1", @@ -4431,7 +4319,6 @@ dependencies = [ "parquet_derive", "pbkdf2", "pin-project-lite", - "postgres-native-tls", "postgres-protocol", "postgres_backend", "pq_proto", @@ -4450,6 +4337,7 @@ dependencies = [ "rstest", "rustc-hash", "rustls 0.22.4", + "rustls-native-certs 0.7.0", "rustls-pemfile 2.1.1", "scopeguard", "serde", @@ -4479,7 +4367,6 @@ dependencies = [ "utils", "uuid", "walkdir", - "webpki-roots 0.25.2", "workspace_hack", "x509-parser", ] @@ -4786,20 +4673,21 @@ dependencies = [ "http 0.2.9", "http-body 0.4.5", "hyper 0.14.26", - "hyper-tls", + "hyper-rustls 0.24.0", "ipnet", "js-sys", "log", "mime", - "native-tls", "once_cell", "percent-encoding", "pin-project-lite", + "rustls 0.21.11", + "rustls-pemfile 1.0.2", "serde", "serde_json", "serde_urlencoded", "tokio", - "tokio-native-tls", + "tokio-rustls 0.24.0", "tokio-util", "tower-service", "url", @@ -4807,6 +4695,7 @@ dependencies = [ "wasm-bindgen-futures", "wasm-streams 0.3.0", "web-sys", + "webpki-roots 0.25.2", "winreg 0.50.0", ] @@ -5232,20 +5121,22 @@ dependencies = [ "hex", "histogram", "itertools", - "native-tls", + "once_cell", "pageserver", "pageserver_api", - "postgres-native-tls", "postgres_ffi", "rand 0.8.5", "remote_storage", "reqwest 0.12.4", + "rustls 0.22.4", + "rustls-native-certs 0.7.0", "serde", "serde_json", "serde_with", "thiserror", "tokio", "tokio-postgres", + "tokio-postgres-rustls", "tokio-rustls 0.25.0", "tokio-stream", "tokio-util", @@ -6189,8 +6080,6 @@ checksum = "8f3403384eaacbca9923fa06940178ac13e4edb725486d70e8e15881d0c836cc" dependencies = [ "itoa", "js-sys", - "libc", - "num_threads", "serde", "time-core", "time-macros", 
@@ -6300,16 +6189,6 @@ dependencies = [ "syn 2.0.52", ] -[[package]] -name = "tokio-native-tls" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" -dependencies = [ - "native-tls", - "tokio", -] - [[package]] name = "tokio-postgres" version = "0.7.7" @@ -6716,15 +6595,6 @@ version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" -[[package]] -name = "tz-rs" -version = "0.6.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33851b15c848fad2cf4b105c6bb66eb9512b6f6c44a4b13f57c53c73c707e2b4" -dependencies = [ - "const_fn", -] - [[package]] name = "uname" version = "0.1.1" @@ -7629,9 +7499,9 @@ dependencies = [ [[package]] name = "zeroize" -version = "1.6.0" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9" +checksum = "525b4ec142c6b68a2d10f01f7bbf6755599ca3f81ea53b8431b7dd348f5fdb2d" dependencies = [ "zeroize_derive", ] diff --git a/Cargo.toml b/Cargo.toml index 0887c039f8..58715db32b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -46,10 +46,10 @@ anyhow = { version = "1.0", features = ["backtrace"] } arc-swap = "1.6" async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] } atomic-take = "1.1.0" -azure_core = "0.19" -azure_identity = "0.19" -azure_storage = "0.19" -azure_storage_blobs = "0.19" +azure_core = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls", "hmac_rust"] } +azure_identity = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] } +azure_storage = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] } +azure_storage_blobs = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] } flate2 = "1.0.26" async-stream = "0.3" async-trait = "0.1" @@ -114,7 +114,6 @@ md5 = "0.7.0" measured = { version = "0.0.21", features=["lasso"] } measured-process = { version = "0.0.21" } memoffset = "0.8" -native-tls = "0.2" nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] } notify = "6.0.0" num_cpus = "1.15" @@ -191,7 +190,7 @@ url = "2.2" urlencoding = "2.1" uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] } walkdir = "2.3.2" -webpki-roots = "0.25" +rustls-native-certs = "0.7" x509-parser = "0.15" ## TODO replace this with tracing @@ -200,7 +199,6 @@ log = "0.4" ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } -postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } @@ -241,8 +239,7 @@ tonic-build = "0.9" [patch.crates-io] -# This is only needed for proxy's tests. -# TODO: we should probably fork `tokio-postgres-rustls` instead. +# Needed to get `tokio-postgres-rustls` to depend on our fork. 
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } # bug fixes for UUID diff --git a/deny.toml b/deny.toml index 22e39a2ca3..469609c496 100644 --- a/deny.toml +++ b/deny.toml @@ -99,6 +99,13 @@ name = "async-executor" [[bans.deny]] name = "smol" +[[bans.deny]] +# We want to use rustls instead of the platform's native tls implementation. +name = "native-tls" + +[[bans.deny]] +name = "openssl" + # This section is considered when running `cargo deny check sources`. # More documentation about the 'sources' section can be found here: # https://embarkstudios.github.io/cargo-deny/checks/sources/cfg.html diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 7da0763bc1..0b892e3277 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -82,6 +82,7 @@ thiserror.workspace = true tikv-jemallocator.workspace = true tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] } tokio-postgres.workspace = true +tokio-postgres-rustls.workspace = true tokio-rustls.workspace = true tokio-util.workspace = true tokio = { workspace = true, features = ["signal"] } @@ -94,10 +95,8 @@ url.workspace = true urlencoding.workspace = true utils.workspace = true uuid.workspace = true -webpki-roots.workspace = true +rustls-native-certs.workspace = true x509-parser.workspace = true -native-tls.workspace = true -postgres-native-tls.workspace = true postgres-protocol.workspace = true redis.workspace = true diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 4433b3c1c2..feb09d5638 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -10,11 +10,14 @@ use crate::{ }; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; +use once_cell::sync::OnceCell; use pq_proto::StartupMessageParams; -use std::{io, net::SocketAddr, time::Duration}; +use rustls::{client::danger::ServerCertVerifier, pki_types::InvalidDnsNameError}; +use std::{io, net::SocketAddr, sync::Arc, time::Duration}; use thiserror::Error; use tokio::net::TcpStream; use tokio_postgres::tls::MakeTlsConnect; +use tokio_postgres_rustls::MakeRustlsConnect; use tracing::{error, info, warn}; const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node"; @@ -30,7 +33,7 @@ pub enum ConnectionError { CouldNotConnect(#[from] io::Error), #[error("{COULD_NOT_CONNECT}: {0}")] - TlsError(#[from] native_tls::Error), + TlsError(#[from] InvalidDnsNameError), #[error("{COULD_NOT_CONNECT}: {0}")] WakeComputeError(#[from] WakeComputeError), @@ -257,7 +260,7 @@ pub struct PostgresConnection { /// Socket connected to a compute node. pub stream: tokio_postgres::maybe_tls_stream::MaybeTlsStream< tokio::net::TcpStream, - postgres_native_tls::TlsStream, + tokio_postgres_rustls::RustlsStream, >, /// PostgreSQL connection parameters. 
pub params: std::collections::HashMap, @@ -282,12 +285,23 @@ impl ConnCfg { let (socket_addr, stream, host) = self.connect_raw(timeout).await?; drop(pause); - let tls_connector = native_tls::TlsConnector::builder() - .danger_accept_invalid_certs(allow_self_signed_compute) - .build() - .unwrap(); - let mut mk_tls = postgres_native_tls::MakeTlsConnector::new(tls_connector); - let tls = MakeTlsConnect::::make_tls_connect(&mut mk_tls, host)?; + let client_config = if allow_self_signed_compute { + // Allow all certificates for creating the connection + let verifier = Arc::new(AcceptEverythingVerifier) as Arc; + rustls::ClientConfig::builder() + .dangerous() + .with_custom_certificate_verifier(verifier) + } else { + let root_store = TLS_ROOTS.get_or_try_init(load_certs)?.clone(); + rustls::ClientConfig::builder().with_root_certificates(root_store) + }; + let client_config = client_config.with_no_client_auth(); + + let mut mk_tls = tokio_postgres_rustls::MakeRustlsConnect::new(client_config); + let tls = >::make_tls_connect( + &mut mk_tls, + host, + )?; // connect_raw() will not use TLS if sslmode is "disable" let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); @@ -340,6 +354,58 @@ fn filtered_options(params: &StartupMessageParams) -> Option { Some(options) } +fn load_certs() -> Result, io::Error> { + let der_certs = rustls_native_certs::load_native_certs()?; + let mut store = rustls::RootCertStore::empty(); + store.add_parsable_certificates(der_certs); + Ok(Arc::new(store)) +} +static TLS_ROOTS: OnceCell> = OnceCell::new(); + +#[derive(Debug)] +struct AcceptEverythingVerifier; +impl ServerCertVerifier for AcceptEverythingVerifier { + fn supported_verify_schemes(&self) -> Vec { + use rustls::SignatureScheme::*; + // The schemes for which `SignatureScheme::supported_in_tls13` returns true. 
+ vec![ + ECDSA_NISTP521_SHA512, + ECDSA_NISTP384_SHA384, + ECDSA_NISTP256_SHA256, + RSA_PSS_SHA512, + RSA_PSS_SHA384, + RSA_PSS_SHA256, + ED25519, + ] + } + fn verify_server_cert( + &self, + _end_entity: &rustls::pki_types::CertificateDer<'_>, + _intermediates: &[rustls::pki_types::CertificateDer<'_>], + _server_name: &rustls::pki_types::ServerName<'_>, + _ocsp_response: &[u8], + _now: rustls::pki_types::UnixTime, + ) -> Result { + Ok(rustls::client::danger::ServerCertVerified::assertion()) + } + fn verify_tls12_signature( + &self, + _message: &[u8], + _cert: &rustls::pki_types::CertificateDer<'_>, + _dss: &rustls::DigitallySignedStruct, + ) -> Result { + Ok(rustls::client::danger::HandshakeSignatureValid::assertion()) + } + fn verify_tls13_signature( + &self, + _message: &[u8], + _cert: &rustls::pki_types::CertificateDer<'_>, + _dss: &rustls::DigitallySignedStruct, + ) -> Result { + Ok(rustls::client::danger::HandshakeSignatureValid::assertion()) + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/s3_scrubber/Cargo.toml b/s3_scrubber/Cargo.toml index dd5d453a2b..e56bd43fb8 100644 --- a/s3_scrubber/Cargo.toml +++ b/s3_scrubber/Cargo.toml @@ -22,8 +22,7 @@ serde_with.workspace = true workspace_hack.workspace = true utils.workspace = true async-stream.workspace = true -native-tls.workspace = true -postgres-native-tls.workspace = true +tokio-postgres-rustls.workspace = true postgres_ffi.workspace = true tokio-stream.workspace = true tokio-postgres.workspace = true @@ -31,6 +30,9 @@ tokio-util = { workspace = true } futures-util.workspace = true itertools.workspace = true camino.workspace = true +rustls.workspace = true +rustls-native-certs.workspace = true +once_cell.workspace = true tokio = { workspace = true, features = ["macros", "rt-multi-thread"] } chrono = { workspace = true, default-features = false, features = ["clock", "serde"] } diff --git a/s3_scrubber/src/scan_safekeeper_metadata.rs b/s3_scrubber/src/scan_safekeeper_metadata.rs index 73dd49ceb5..24051b03de 100644 --- a/s3_scrubber/src/scan_safekeeper_metadata.rs +++ b/s3_scrubber/src/scan_safekeeper_metadata.rs @@ -1,7 +1,8 @@ -use std::{collections::HashSet, str::FromStr}; +use std::{collections::HashSet, str::FromStr, sync::Arc}; use aws_sdk_s3::Client; use futures::stream::{StreamExt, TryStreamExt}; +use once_cell::sync::OnceCell; use pageserver_api::shard::TenantShardId; use postgres_ffi::{XLogFileName, PG_TLI}; use serde::Serialize; @@ -70,9 +71,12 @@ pub async fn scan_safekeeper_metadata( "checking bucket {}, region {}, dump_db_table {}", bucket_config.bucket, bucket_config.region, dump_db_table ); - // Use the native TLS implementation (Neon requires TLS) - let tls_connector = - postgres_native_tls::MakeTlsConnector::new(native_tls::TlsConnector::new().unwrap()); + // Use rustls (Neon requires TLS) + let root_store = TLS_ROOTS.get_or_try_init(load_certs)?.clone(); + let client_config = rustls::ClientConfig::builder() + .with_root_certificates(root_store) + .with_no_client_auth(); + let tls_connector = tokio_postgres_rustls::MakeRustlsConnect::new(client_config); let (client, connection) = tokio_postgres::connect(&dump_db_connstr, tls_connector).await?; // The connection object performs the actual communication with the database, // so spawn it off to run on its own. 
@@ -234,3 +238,11 @@ async fn check_timeline( is_deleted: false, }) } + +fn load_certs() -> Result, std::io::Error> { + let der_certs = rustls_native_certs::load_native_certs()?; + let mut store = rustls::RootCertStore::empty(); + store.add_parsable_certificates(der_certs); + Ok(Arc::new(store)) +} +static TLS_ROOTS: OnceCell> = OnceCell::new(); diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index f364a6c2e0..df16c71789 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -59,7 +59,7 @@ regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } regex-syntax = { version = "0.8" } reqwest-5ef9efb8ec2df382 = { package = "reqwest", version = "0.12", default-features = false, features = ["blocking", "json", "rustls-tls", "stream"] } -reqwest-a6292c17cd707f01 = { package = "reqwest", version = "0.11", default-features = false, features = ["blocking", "default-tls", "stream"] } +reqwest-a6292c17cd707f01 = { package = "reqwest", version = "0.11", default-features = false, features = ["blocking", "rustls-tls", "stream"] } rustls = { version = "0.21", features = ["dangerous_configuration"] } scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } @@ -68,7 +68,7 @@ sha2 = { version = "0.10", features = ["asm"] } smallvec = { version = "1", default-features = false, features = ["const_new", "write"] } subtle = { version = "2" } sync_wrapper = { version = "0.1", default-features = false, features = ["futures"] } -time = { version = "0.3", features = ["local-offset", "macros", "serde-well-known"] } +time = { version = "0.3", features = ["macros", "serde-well-known"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] } tokio-rustls = { version = "0.24" } tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] } From c8cebecabf75149211866cd8e8f07ec061ccc2a5 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 29 May 2024 11:17:05 +0100 Subject: [PATCH 0885/1571] proxy: reintroduce dynamic limiter for compute lock (#7737) ## Problem Computes that are healthy can manage many connection attempts at a time. Unhealthy computes cannot. We initially handled this with a fixed concurrency limit, but it seems this inhibits pgbench. 
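An adaptive limit in the AIMD (additive-increase / multiplicative-decrease) style sidesteps that trade-off: the limit grows while the compute keeps up and shrinks sharply when it does not. The sketch below is illustrative only; `AimdSketch`, its fields, and the constants in `main` are hypothetical stand-ins for the `LimitAlgorithm` implementation this PR adds under `proxy/src/rate_limiter/limit_algorithm/`.

```rust
// Rough sketch of the AIMD rule: grow the limit additively while the compute
// is healthy and the current limit is well utilised, shrink it
// multiplicatively on an overload signal.
#[derive(Clone, Copy)]
struct AimdSketch {
    min: usize,
    max: usize,
    inc: usize,       // additive increase step
    dec: f32,         // multiplicative decrease factor
    utilisation: f32, // only increase above this in-flight/limit ratio
}

impl AimdSketch {
    fn update(&self, limit: usize, in_flight: usize, overloaded: bool) -> usize {
        if overloaded {
            // Floor so even small limits keep shrinking.
            ((limit as f32 * self.dec).floor() as usize).clamp(self.min, self.max)
        } else if in_flight as f32 / limit as f32 > self.utilisation {
            (limit + self.inc).clamp(self.min, self.max)
        } else {
            limit
        }
    }
}

fn main() {
    let aimd = AimdSketch { min: 1, max: 1500, inc: 10, dec: 0.5, utilisation: 0.8 };
    let mut limit = 100;
    limit = aimd.update(limit, 90, false); // busy and healthy -> 110
    limit = aimd.update(limit, 50, false); // healthy but idle -> stays 110
    limit = aimd.update(limit, 90, true);  // overload signal  -> 55
    println!("final limit = {limit}");
}
```

In the changes below, the plain `permits=N` lock options keep the fixed algorithm, while the JSON form of the lock options can select AIMD with explicit `min`/`max`/`inc`/`dec`/`utilisation` parameters.
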
## Summary of changes Support AIMD for connect_to_compute lock to allow varying the concurrency limit based on compute health --- Cargo.lock | 1 + proxy/Cargo.toml | 1 + proxy/src/bin/proxy.rs | 17 +- proxy/src/config.rs | 67 ++++- proxy/src/console/provider.rs | 44 +-- proxy/src/console/provider/neon.rs | 2 +- proxy/src/proxy/connect_compute.rs | 4 +- proxy/src/rate_limiter.rs | 4 + proxy/src/rate_limiter/limit_algorithm.rs | 275 ++++++++++++++++++ .../src/rate_limiter/limit_algorithm/aimd.rs | 184 ++++++++++++ proxy/src/serverless/backend.rs | 4 +- 11 files changed, 563 insertions(+), 40 deletions(-) create mode 100644 proxy/src/rate_limiter/limit_algorithm.rs create mode 100644 proxy/src/rate_limiter/limit_algorithm/aimd.rs diff --git a/Cargo.lock b/Cargo.lock index b1a307dd19..794486e2e1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4302,6 +4302,7 @@ dependencies = [ "http 1.1.0", "http-body-util", "humantime", + "humantime-serde", "hyper 0.14.26", "hyper 1.2.0", "hyper-util", diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 0b892e3277..288f7769fe 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -38,6 +38,7 @@ hmac.workspace = true hostname.workspace = true http.workspace = true humantime.workspace = true +humantime-serde.workspace = true hyper.workspace = true hyper1 = { package = "hyper", version = "1.2", features = ["server"] } hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] } diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 30f2e6f4b7..dffebf5580 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -557,14 +557,14 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { let config::ConcurrencyLockOptions { shards, - permits, + limiter, epoch, timeout, } = args.wake_compute_lock.parse()?; - info!(permits, shards, ?epoch, "Using NodeLocks (wake_compute)"); + info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)"); let locks = Box::leak(Box::new(console::locks::ApiLocks::new( "wake_compute_lock", - permits, + limiter, shards, timeout, epoch, @@ -603,14 +603,19 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { let config::ConcurrencyLockOptions { shards, - permits, + limiter, epoch, timeout, } = args.connect_compute_lock.parse()?; - info!(permits, shards, ?epoch, "Using NodeLocks (connect_compute)"); + info!( + ?limiter, + shards, + ?epoch, + "Using NodeLocks (connect_compute)" + ); let connect_compute_locks = console::locks::ApiLocks::new( "connect_compute_lock", - permits, + limiter, shards, timeout, epoch, diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 5a0c251ce2..f4707a33aa 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,7 +1,7 @@ use crate::{ auth::{self, backend::AuthRateLimiter}, console::locks::ApiLocks, - rate_limiter::RateBucketInfo, + rate_limiter::{RateBucketInfo, RateLimitAlgorithm, RateLimiterConfig}, scram::threadpool::ThreadPool, serverless::{cancel_set::CancelSet, GlobalConnPoolOptions}, Host, @@ -580,14 +580,18 @@ impl RetryConfig { } /// Helper for cmdline cache options parsing. 
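+///
+/// Accepts either comma-separated `key=value` pairs
+/// (e.g. `shards=64,permits=100,epoch=10m,timeout=10ms`) or a JSON object;
+/// see `test_parse_json_lock_options` below for the JSON form.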
+#[derive(serde::Deserialize)] pub struct ConcurrencyLockOptions { /// The number of shards the lock map should have pub shards: usize, /// The number of allowed concurrent requests for each endpoitn - pub permits: usize, + #[serde(flatten)] + pub limiter: RateLimiterConfig, /// Garbage collection epoch + #[serde(deserialize_with = "humantime_serde::deserialize")] pub epoch: Duration, /// Lock timeout + #[serde(deserialize_with = "humantime_serde::deserialize")] pub timeout: Duration, } @@ -596,13 +600,18 @@ impl ConcurrencyLockOptions { pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "permits=0"; /// Default options for [`crate::console::provider::ApiLocks`]. pub const DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK: &'static str = - "shards=64,permits=10,epoch=10m,timeout=10ms"; + "shards=64,permits=100,epoch=10m,timeout=10ms"; // pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "shards=32,permits=4,epoch=10m,timeout=1s"; /// Parse lock options passed via cmdline. /// Example: [`Self::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK`]. fn parse(options: &str) -> anyhow::Result { + let options = options.trim(); + if options.starts_with('{') && options.ends_with('}') { + return Ok(serde_json::from_str(options)?); + } + let mut shards = None; let mut permits = None; let mut epoch = None; @@ -629,9 +638,13 @@ impl ConcurrencyLockOptions { shards = Some(2); } + let permits = permits.context("missing `permits`")?; let out = Self { shards: shards.context("missing `shards`")?, - permits: permits.context("missing `permits`")?, + limiter: RateLimiterConfig { + algorithm: RateLimitAlgorithm::Fixed, + initial_limit: permits, + }, epoch: epoch.context("missing `epoch`")?, timeout: timeout.context("missing `timeout`")?, }; @@ -657,6 +670,8 @@ impl FromStr for ConcurrencyLockOptions { #[cfg(test)] mod tests { + use crate::rate_limiter::Aimd; + use super::*; #[test] @@ -684,36 +699,68 @@ mod tests { fn test_parse_lock_options() -> anyhow::Result<()> { let ConcurrencyLockOptions { epoch, - permits, + limiter, shards, timeout, } = "shards=32,permits=4,epoch=10m,timeout=1s".parse()?; assert_eq!(epoch, Duration::from_secs(10 * 60)); assert_eq!(timeout, Duration::from_secs(1)); assert_eq!(shards, 32); - assert_eq!(permits, 4); + assert_eq!(limiter.initial_limit, 4); + assert_eq!(limiter.algorithm, RateLimitAlgorithm::Fixed); let ConcurrencyLockOptions { epoch, - permits, + limiter, shards, timeout, } = "epoch=60s,shards=16,timeout=100ms,permits=8".parse()?; assert_eq!(epoch, Duration::from_secs(60)); assert_eq!(timeout, Duration::from_millis(100)); assert_eq!(shards, 16); - assert_eq!(permits, 8); + assert_eq!(limiter.initial_limit, 8); + assert_eq!(limiter.algorithm, RateLimitAlgorithm::Fixed); let ConcurrencyLockOptions { epoch, - permits, + limiter, shards, timeout, } = "permits=0".parse()?; assert_eq!(epoch, Duration::ZERO); assert_eq!(timeout, Duration::ZERO); assert_eq!(shards, 2); - assert_eq!(permits, 0); + assert_eq!(limiter.initial_limit, 0); + assert_eq!(limiter.algorithm, RateLimitAlgorithm::Fixed); + + Ok(()) + } + + #[test] + fn test_parse_json_lock_options() -> anyhow::Result<()> { + let ConcurrencyLockOptions { + epoch, + limiter, + shards, + timeout, + } = r#"{"shards":32,"initial_limit":44,"aimd":{"min":5,"max":500,"inc":10,"dec":0.9,"utilisation":0.8},"epoch":"10m","timeout":"1s"}"# + .parse()?; + assert_eq!(epoch, Duration::from_secs(10 * 60)); + assert_eq!(timeout, Duration::from_secs(1)); + assert_eq!(shards, 32); + assert_eq!(limiter.initial_limit, 44); + assert_eq!( + limiter.algorithm, 
+ RateLimitAlgorithm::Aimd { + conf: Aimd { + min: 5, + max: 500, + dec: 0.9, + inc: 10, + utilisation: 0.8 + } + }, + ); Ok(()) } diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 3b996cdbd1..4d074f98a5 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -15,11 +15,11 @@ use crate::{ error::ReportableError, intern::ProjectIdInt, metrics::ApiLockMetrics, + rate_limiter::{DynamicLimiter, Outcome, RateLimiterConfig, Token}, scram, EndpointCacheKey, }; use dashmap::DashMap; use std::{hash::Hash, sync::Arc, time::Duration}; -use tokio::sync::{OwnedSemaphorePermit, Semaphore}; use tokio::time::Instant; use tracing::info; @@ -443,8 +443,8 @@ impl ApiCaches { /// Various caches for [`console`](super). pub struct ApiLocks { name: &'static str, - node_locks: DashMap>, - permits: usize, + node_locks: DashMap>, + config: RateLimiterConfig, timeout: Duration, epoch: std::time::Duration, metrics: &'static ApiLockMetrics, @@ -452,8 +452,6 @@ pub struct ApiLocks { #[derive(Debug, thiserror::Error)] pub enum ApiLockError { - #[error("lock was closed")] - AcquireError(#[from] tokio::sync::AcquireError), #[error("permit could not be acquired")] TimeoutError(#[from] tokio::time::error::Elapsed), } @@ -461,7 +459,6 @@ pub enum ApiLockError { impl ReportableError for ApiLockError { fn get_error_kind(&self) -> crate::error::ErrorKind { match self { - ApiLockError::AcquireError(_) => crate::error::ErrorKind::Service, ApiLockError::TimeoutError(_) => crate::error::ErrorKind::RateLimit, } } @@ -470,7 +467,7 @@ impl ReportableError for ApiLockError { impl ApiLocks { pub fn new( name: &'static str, - permits: usize, + config: RateLimiterConfig, shards: usize, timeout: Duration, epoch: std::time::Duration, @@ -479,7 +476,7 @@ impl ApiLocks { Ok(Self { name, node_locks: DashMap::with_shard_amount(shards), - permits, + config, timeout, epoch, metrics, @@ -487,8 +484,10 @@ impl ApiLocks { } pub async fn get_permit(&self, key: &K) -> Result { - if self.permits == 0 { - return Ok(WakeComputePermit { permit: None }); + if self.config.initial_limit == 0 { + return Ok(WakeComputePermit { + permit: Token::disabled(), + }); } let now = Instant::now(); let semaphore = { @@ -500,24 +499,22 @@ impl ApiLocks { .entry(key.clone()) .or_insert_with(|| { self.metrics.semaphores_registered.inc(); - Arc::new(Semaphore::new(self.permits)) + DynamicLimiter::new(self.config) }) .clone() } }; - let permit = tokio::time::timeout_at(now + self.timeout, semaphore.acquire_owned()).await; + let permit = semaphore.acquire_deadline(now + self.timeout).await; self.metrics .semaphore_acquire_seconds .observe(now.elapsed().as_secs_f64()); - Ok(WakeComputePermit { - permit: Some(permit??), - }) + Ok(WakeComputePermit { permit: permit? 
}) } pub async fn garbage_collect_worker(&self) { - if self.permits == 0 { + if self.config.initial_limit == 0 { return; } let mut interval = @@ -547,12 +544,21 @@ impl ApiLocks { } pub struct WakeComputePermit { - // None if the lock is disabled - permit: Option, + permit: Token, } impl WakeComputePermit { pub fn should_check_cache(&self) -> bool { - self.permit.is_some() + !self.permit.is_disabled() + } + pub fn release(self, outcome: Outcome) { + self.permit.release(outcome) + } + pub fn release_result(self, res: Result) -> Result { + match res { + Ok(_) => self.release(Outcome::Success), + Err(_) => self.release(Outcome::Overload), + } + res } } diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 7728d2cafa..5d691e5f15 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -301,7 +301,7 @@ impl super::Api for Api { } } - let mut node = self.do_wake_compute(ctx, user_info).await?; + let mut node = permit.release_result(self.do_wake_compute(ctx, user_info).await)?; ctx.set_project(node.aux.clone()); let cold_start_info = node.aux.cold_start_info; info!("woken up a compute node"); diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index c8528d0296..409d45b39a 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -84,8 +84,8 @@ impl ConnectMechanism for TcpMechanism<'_> { timeout: time::Duration, ) -> Result { let host = node_info.config.get_host()?; - let _permit = self.locks.get_permit(&host).await?; - node_info.connect(ctx, timeout).await + let permit = self.locks.get_permit(&host).await?; + permit.release_result(node_info.connect(ctx, timeout).await) } fn update_connect_config(&self, config: &mut compute::ConnCfg) { diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs index c542267547..be9072dd8c 100644 --- a/proxy/src/rate_limiter.rs +++ b/proxy/src/rate_limiter.rs @@ -1,2 +1,6 @@ +mod limit_algorithm; mod limiter; +pub use limit_algorithm::{ + aimd::Aimd, DynamicLimiter, Outcome, RateLimitAlgorithm, RateLimiterConfig, Token, +}; pub use limiter::{BucketRateLimiter, EndpointRateLimiter, GlobalRateLimiter, RateBucketInfo}; diff --git a/proxy/src/rate_limiter/limit_algorithm.rs b/proxy/src/rate_limiter/limit_algorithm.rs new file mode 100644 index 0000000000..072fdb80b0 --- /dev/null +++ b/proxy/src/rate_limiter/limit_algorithm.rs @@ -0,0 +1,275 @@ +//! Algorithms for controlling concurrency limits. +use parking_lot::Mutex; +use std::{pin::pin, sync::Arc, time::Duration}; +use tokio::{ + sync::Notify, + time::{error::Elapsed, timeout_at, Instant}, +}; + +use self::aimd::Aimd; + +pub mod aimd; + +/// Whether a job succeeded or failed as a result of congestion/overload. +/// +/// Errors not considered to be caused by overload should be ignored. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Outcome { + /// The job succeeded, or failed in a way unrelated to overload. + Success, + /// The job failed because of overload, e.g. it timed out or an explicit backpressure signal + /// was observed. + Overload, +} + +/// An algorithm for controlling a concurrency limit. +pub trait LimitAlgorithm: Send + Sync + 'static { + /// Update the concurrency limit in response to a new job completion. + fn update(&self, old_limit: usize, sample: Sample) -> usize; +} + +/// The result of a job (or jobs), including the [`Outcome`] (loss) and latency (delay). 
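+///
+/// A `Sample` is recorded when a job finishes and its token is released, and is
+/// handed to [`LimitAlgorithm::update`] so the algorithm can adjust the limit.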
+#[derive(Debug, Clone, PartialEq, Eq, Copy)] +pub struct Sample { + pub(crate) latency: Duration, + /// Jobs in flight when the sample was taken. + pub(crate) in_flight: usize, + pub(crate) outcome: Outcome, +} + +#[derive(Clone, Copy, Debug, Default, serde::Deserialize, PartialEq)] +#[serde(rename_all = "snake_case")] +pub enum RateLimitAlgorithm { + #[default] + Fixed, + Aimd { + #[serde(flatten)] + conf: Aimd, + }, +} + +pub struct Fixed; + +impl LimitAlgorithm for Fixed { + fn update(&self, old_limit: usize, _sample: Sample) -> usize { + old_limit + } +} + +#[derive(Clone, Copy, Debug, serde::Deserialize, PartialEq)] +pub struct RateLimiterConfig { + #[serde(flatten)] + pub algorithm: RateLimitAlgorithm, + pub initial_limit: usize, +} + +impl RateLimiterConfig { + pub fn create_rate_limit_algorithm(self) -> Box { + match self.algorithm { + RateLimitAlgorithm::Fixed => Box::new(Fixed), + RateLimitAlgorithm::Aimd { conf } => Box::new(conf), + } + } +} + +pub struct LimiterInner { + alg: Box, + available: usize, + limit: usize, + in_flight: usize, +} + +impl LimiterInner { + fn update(&mut self, latency: Duration, outcome: Option) { + if let Some(outcome) = outcome { + let sample = Sample { + latency, + in_flight: self.in_flight, + outcome, + }; + self.limit = self.alg.update(self.limit, sample); + } + } + + fn take(&mut self, ready: &Notify) -> Option<()> { + if self.available > 1 { + self.available -= 1; + self.in_flight += 1; + + // tell the next in the queue that there is a permit ready + if self.available > 1 { + ready.notify_one(); + } + Some(()) + } else { + None + } + } +} + +/// Limits the number of concurrent jobs. +/// +/// Concurrency is limited through the use of [`Token`]s. Acquire a token to run a job, and release the +/// token once the job is finished. +/// +/// The limit will be automatically adjusted based on observed latency (delay) and/or failures +/// caused by overload (loss). +pub struct DynamicLimiter { + config: RateLimiterConfig, + inner: Mutex, + // to notify when a token is available + ready: Notify, +} + +/// A concurrency token, required to run a job. +/// +/// Release the token back to the [`DynamicLimiter`] after the job is complete. +pub struct Token { + start: Instant, + limiter: Option>, +} + +/// A snapshot of the state of the [`DynamicLimiter`]. +/// +/// Not guaranteed to be consistent under high concurrency. +#[derive(Debug, Clone, Copy)] +pub struct LimiterState { + limit: usize, + in_flight: usize, +} + +impl DynamicLimiter { + /// Create a limiter with a given limit control algorithm. + pub fn new(config: RateLimiterConfig) -> Arc { + let ready = Notify::new(); + ready.notify_one(); + + Arc::new(Self { + inner: Mutex::new(LimiterInner { + alg: config.create_rate_limit_algorithm(), + available: config.initial_limit, + limit: config.initial_limit, + in_flight: 0, + }), + ready, + config, + }) + } + + /// Try to acquire a concurrency [Token], waiting for `duration` if there are none available. + /// + /// Returns `None` if there are none available after `duration`. + pub async fn acquire_timeout(self: &Arc, duration: Duration) -> Result { + self.acquire_deadline(Instant::now() + duration).await + } + + /// Try to acquire a concurrency [Token], waiting until `deadline` if there are none available. + /// + /// Returns `None` if there are none available after `deadline`. + pub async fn acquire_deadline(self: &Arc, deadline: Instant) -> Result { + if self.config.initial_limit == 0 { + // If the rate limiter is disabled, we can always acquire a token. 
+ Ok(Token::disabled()) + } else { + let mut notified = pin!(self.ready.notified()); + let mut ready = notified.as_mut().enable(); + loop { + let mut limit = None; + if ready { + let mut inner = self.inner.lock(); + if inner.take(&self.ready).is_some() { + break Ok(Token::new(self.clone())); + } + limit = Some(inner.limit); + } + match timeout_at(deadline, notified.as_mut()).await { + Ok(()) => ready = true, + Err(e) => { + let limit = limit.unwrap_or_else(|| self.inner.lock().limit); + tracing::info!(limit, "could not acquire token in time"); + break Err(e); + } + } + } + } + } + + /// Return the concurrency [Token], along with the outcome of the job. + /// + /// The [Outcome] of the job, and the time taken to perform it, may be used + /// to update the concurrency limit. + /// + /// Set the outcome to `None` to ignore the job. + fn release_inner(&self, start: Instant, outcome: Option) { + tracing::info!("outcome is {:?}", outcome); + if self.config.initial_limit == 0 { + return; + } + + let mut inner = self.inner.lock(); + + inner.update(start.elapsed(), outcome); + if inner.in_flight < inner.limit { + inner.available = inner.limit - inner.in_flight; + // At least 1 permit is now available + self.ready.notify_one(); + } + + inner.in_flight -= 1; + } + + /// The current state of the limiter. + pub fn state(&self) -> LimiterState { + let inner = self.inner.lock(); + LimiterState { + limit: inner.limit, + in_flight: inner.in_flight, + } + } +} + +impl Token { + fn new(limiter: Arc) -> Self { + Self { + start: Instant::now(), + limiter: Some(limiter), + } + } + pub fn disabled() -> Self { + Self { + start: Instant::now(), + limiter: None, + } + } + + pub fn is_disabled(&self) -> bool { + self.limiter.is_none() + } + + pub fn release(mut self, outcome: Outcome) { + self.release_mut(Some(outcome)) + } + + pub fn release_mut(&mut self, outcome: Option) { + if let Some(limiter) = self.limiter.take() { + limiter.release_inner(self.start, outcome); + } + } +} + +impl Drop for Token { + fn drop(&mut self) { + self.release_mut(None) + } +} + +impl LimiterState { + /// The current concurrency limit. + pub fn limit(&self) -> usize { + self.limit + } + /// The number of jobs in flight. + pub fn in_flight(&self) -> usize { + self.in_flight + } +} diff --git a/proxy/src/rate_limiter/limit_algorithm/aimd.rs b/proxy/src/rate_limiter/limit_algorithm/aimd.rs new file mode 100644 index 0000000000..370d4be802 --- /dev/null +++ b/proxy/src/rate_limiter/limit_algorithm/aimd.rs @@ -0,0 +1,184 @@ +use std::usize; + +use super::{LimitAlgorithm, Outcome, Sample}; + +/// Loss-based congestion avoidance. +/// +/// Additive-increase, multiplicative decrease. +/// +/// Adds available currency when: +/// 1. no load-based errors are observed, and +/// 2. the utilisation of the current limit is high. +/// +/// Reduces available concurrency by a factor when load-based errors are detected. +#[derive(Clone, Copy, Debug, serde::Deserialize, PartialEq)] +pub struct Aimd { + /// Minimum limit for AIMD algorithm. + pub min: usize, + /// Maximum limit for AIMD algorithm. + pub max: usize, + /// Decrease AIMD decrease by value in case of error. + pub dec: f32, + /// Increase AIMD increase by value in case of success. + pub inc: usize, + /// A threshold below which the limit won't be increased. 
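+    /// Utilisation is measured as jobs in flight divided by the current limit.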
+ pub utilisation: f32, +} + +impl LimitAlgorithm for Aimd { + fn update(&self, old_limit: usize, sample: Sample) -> usize { + use Outcome::*; + match sample.outcome { + Success => { + let utilisation = sample.in_flight as f32 / old_limit as f32; + + if utilisation > self.utilisation { + let limit = old_limit + self.inc; + let increased_limit = limit.clamp(self.min, self.max); + if increased_limit > old_limit { + tracing::info!(increased_limit, "limit increased"); + } + + increased_limit + } else { + old_limit + } + } + Overload => { + let limit = old_limit as f32 * self.dec; + + // Floor instead of round, so the limit reduces even with small numbers. + // E.g. round(2 * 0.9) = 2, but floor(2 * 0.9) = 1 + let limit = limit.floor() as usize; + + limit.clamp(self.min, self.max) + } + } + } +} + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use crate::rate_limiter::limit_algorithm::{ + DynamicLimiter, RateLimitAlgorithm, RateLimiterConfig, + }; + + use super::*; + + #[tokio::test(start_paused = true)] + async fn should_decrease_limit_on_overload() { + let config = RateLimiterConfig { + initial_limit: 10, + algorithm: RateLimitAlgorithm::Aimd { + conf: Aimd { + min: 1, + max: 1500, + inc: 10, + dec: 0.5, + utilisation: 0.8, + }, + }, + }; + + let limiter = DynamicLimiter::new(config); + + let token = limiter + .acquire_timeout(Duration::from_millis(1)) + .await + .unwrap(); + token.release(Outcome::Overload); + + assert_eq!(limiter.state().limit(), 5, "overload: decrease"); + } + + #[tokio::test(start_paused = true)] + async fn should_increase_limit_on_success_when_using_gt_util_threshold() { + let config = RateLimiterConfig { + initial_limit: 4, + algorithm: RateLimitAlgorithm::Aimd { + conf: Aimd { + min: 1, + max: 1500, + inc: 1, + dec: 0.5, + utilisation: 0.5, + }, + }, + }; + + let limiter = DynamicLimiter::new(config); + + let token = limiter + .acquire_timeout(Duration::from_millis(1)) + .await + .unwrap(); + let _token = limiter + .acquire_timeout(Duration::from_millis(1)) + .await + .unwrap(); + let _token = limiter + .acquire_timeout(Duration::from_millis(1)) + .await + .unwrap(); + + token.release(Outcome::Success); + assert_eq!(limiter.state().limit(), 5, "success: increase"); + } + + #[tokio::test(start_paused = true)] + async fn should_not_change_limit_on_success_when_using_lt_util_threshold() { + let config = RateLimiterConfig { + initial_limit: 4, + algorithm: RateLimitAlgorithm::Aimd { + conf: Aimd { + min: 1, + max: 1500, + inc: 10, + dec: 0.5, + utilisation: 0.5, + }, + }, + }; + + let limiter = DynamicLimiter::new(config); + + let token = limiter + .acquire_timeout(Duration::from_millis(1)) + .await + .unwrap(); + + token.release(Outcome::Success); + assert_eq!( + limiter.state().limit(), + 4, + "success: ignore when < half limit" + ); + } + + #[tokio::test(start_paused = true)] + async fn should_not_change_limit_when_no_outcome() { + let config = RateLimiterConfig { + initial_limit: 10, + algorithm: RateLimitAlgorithm::Aimd { + conf: Aimd { + min: 1, + max: 1500, + inc: 10, + dec: 0.5, + utilisation: 0.5, + }, + }, + }; + + let limiter = DynamicLimiter::new(config); + + let token = limiter + .acquire_timeout(Duration::from_millis(1)) + .await + .unwrap(); + drop(token); + assert_eq!(limiter.state().limit(), 10, "ignore"); + } +} diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 52fc7b556a..a40c66a80d 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -232,9 +232,9 @@ impl ConnectMechanism for 
TokioMechanism { .connect_timeout(timeout); let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); - let (client, connection) = config.connect(tokio_postgres::NoTls).await?; + let res = config.connect(tokio_postgres::NoTls).await; drop(pause); - drop(permit); + let (client, connection) = permit.release_result(res)?; tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id())); Ok(poll_client( From 7ac11d39421330b64b8dfa72b83439d51c05da0b Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 29 May 2024 22:18:09 +0300 Subject: [PATCH 0886/1571] Do not produce error if gin page is not restored in redo (#7876) ## Problem See https://github.com/neondatabase/cloud/issues/10845 ## Summary of changes Do not report error if GIN page is not restored ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik --- test_runner/regress/test_gin_redo.py | 22 ++++++++++++++++++++++ vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 6 +++--- 5 files changed, 28 insertions(+), 6 deletions(-) create mode 100644 test_runner/regress/test_gin_redo.py diff --git a/test_runner/regress/test_gin_redo.py b/test_runner/regress/test_gin_redo.py new file mode 100644 index 0000000000..9205882239 --- /dev/null +++ b/test_runner/regress/test_gin_redo.py @@ -0,0 +1,22 @@ +import time + +from fixtures.neon_fixtures import NeonEnv, wait_replica_caughtup + + +# +# Test that redo of XLOG_GIN_VACUUM_PAGE doesn't produce error +# +def test_gin_redo(neon_simple_env: NeonEnv): + env = neon_simple_env + + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + time.sleep(1) + secondary = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") + con = primary.connect() + cur = con.cursor() + cur.execute("create table gin_test_tbl(id integer, i int4[])") + cur.execute("create index gin_test_idx on gin_test_tbl using gin (i)") + cur.execute("insert into gin_test_tbl select g,array[3, 1, g] from generate_series(1, 10000) g") + cur.execute("delete from gin_test_tbl where id % 2 = 0") + cur.execute("vacuum gin_test_tbl") + wait_replica_caughtup(primary, secondary) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 0d30e28f74..17e0f5ff4e 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 0d30e28f74f49fe6a27a6bd45dcfeb1060656b8f +Subproject commit 17e0f5ff4e1905691aa40e1e08f9b79b14c99652 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 74fb144890..c2c3d40534 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 74fb144890c4f955db1ef50ee1eeb9d8a6c2f69d +Subproject commit c2c3d40534db97d83dd7e185d1971e707fa2f445 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 3c2b9d576c..b228f20372 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 3c2b9d576c580e0b5b7108001f959b8c5b42e0a2 +Subproject commit b228f20372ebcabfd7946647cb7adbd38bacb14a diff --git a/vendor/revisions.json b/vendor/revisions.json 
index 2f16f334c5..5bf4e289ef 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "v16": ["16.3", "3c2b9d576c580e0b5b7108001f959b8c5b42e0a2"], - "v15": ["15.7", "74fb144890c4f955db1ef50ee1eeb9d8a6c2f69d"], - "v14": ["14.12", "0d30e28f74f49fe6a27a6bd45dcfeb1060656b8f"] + "v16": ["16.3", "b228f20372ebcabfd7946647cb7adbd38bacb14a"], + "v15": ["15.7", "c2c3d40534db97d83dd7e185d1971e707fa2f445"], + "v14": ["14.12", "17e0f5ff4e1905691aa40e1e08f9b79b14c99652"] } From b0a954bde237f424381a76af5ffd046a9b0e85a7 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Thu, 30 May 2024 08:25:10 +0200 Subject: [PATCH 0887/1571] CI: switch ubuntu-latest with ubuntu-22.04 (#7256) (#7901) ## Problem We use ubuntu-latest as a default OS for running jobs. It can cause problems due to instability, so we should use the LTS version of Ubuntu. ## Summary of changes The image ubuntu-latest was changed with ubuntu-22.04 in workflows. ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --- .github/workflows/actionlint.yml | 14 +++++++++++++- .github/workflows/approved-for-ci-run.yml | 6 +++--- .github/workflows/benchmarking.yml | 2 +- .github/workflows/build-build-tools-image.yml | 2 +- .github/workflows/build_and_test.yml | 12 ++++++------ .github/workflows/check-build-tools-image.yml | 2 +- .github/workflows/check-permissions.yml | 2 +- .github/workflows/cleanup-caches-by-a-branch.yml | 2 +- .github/workflows/pg_clients.yml | 2 +- .github/workflows/pin-build-tools-image.yml | 2 +- .github/workflows/release-notify.yml | 2 +- .github/workflows/release.yml | 4 ++-- .github/workflows/trigger-e2e-tests.yml | 6 +++--- 13 files changed, 35 insertions(+), 23 deletions(-) diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml index f2736614bf..078c7f88c4 100644 --- a/.github/workflows/actionlint.yml +++ b/.github/workflows/actionlint.yml @@ -24,7 +24,7 @@ jobs: actionlint: needs: [ check-permissions ] - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v4 - uses: reviewdog/action-actionlint@v1 @@ -36,3 +36,15 @@ jobs: fail_on_error: true filter_mode: nofilter level: error + - run: | + PAT='^\s*runs-on:.*-latest' + if grep -ERq $PAT .github/workflows + then + grep -ERl $PAT .github/workflows |\ + while read -r f + do + l=$(grep -nE $PAT .github/workflows/release.yml | awk -F: '{print $1}' | head -1) + echo "::error file=$f,line=$l::Please, do not use ubuntu-latest images to run on, use LTS instead." 
+ done + exit 1 + fi diff --git a/.github/workflows/approved-for-ci-run.yml b/.github/workflows/approved-for-ci-run.yml index ab616d17e2..b14b66a439 100644 --- a/.github/workflows/approved-for-ci-run.yml +++ b/.github/workflows/approved-for-ci-run.yml @@ -44,7 +44,7 @@ jobs: contains(fromJSON('["opened", "synchronize", "reopened", "closed"]'), github.event.action) && contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run') - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run" @@ -60,7 +60,7 @@ jobs: github.event.action == 'labeled' && contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run') - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run" @@ -109,7 +109,7 @@ jobs: github.event.action == 'closed' && github.event.pull_request.head.repo.full_name != github.repository - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Close PR and delete `ci-run/pr-${{ env.PR_NUMBER }}` branch diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index d5a375d704..57d24063bf 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -137,7 +137,7 @@ jobs: # - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage env: RUN_AWS_RDS_AND_AURORA: ${{ github.event.inputs.run_AWS_RDS_AND_AURORA || 'false' }} - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 outputs: pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }} olap-compare-matrix: ${{ steps.olap-compare-matrix.outputs.matrix }} diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index bdf00bcaae..9aacb09d10 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -88,7 +88,7 @@ jobs: merge-images: needs: [ build-image ] - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 env: IMAGE_TAG: ${{ inputs.image-tag }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index f8c011a0a5..b9caf76060 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -35,7 +35,7 @@ jobs: cancel-previous-e2e-tests: needs: [ check-permissions ] if: github.event_name == 'pull_request' - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Cancel previous e2e-tests runs for this PR @@ -549,7 +549,7 @@ jobs: report-benchmarks-failures: needs: [ benchmarks, create-test-report ] if: github.ref_name == 'main' && failure() && needs.benchmarks.result == 'failure' - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: slackapi/slack-github-action@v1 @@ -774,7 +774,7 @@ jobs: neon-image: needs: [ neon-image-arch, tag ] - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: docker/login-action@v3 @@ -884,7 +884,7 @@ jobs: compute-node-image: needs: [ compute-node-image-arch, tag ] - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 strategy: matrix: @@ -1032,7 +1032,7 @@ jobs: promote-images: needs: [ check-permissions, tag, test-images, vm-compute-node-image ] - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 env: VERSIONS: v14 v15 v16 @@ -1077,7 +1077,7 @@ jobs: trigger-custom-extensions-build-and-wait: needs: [ check-permissions, tag ] - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Set PR's status to pending 
and request a remote CI test run: | diff --git a/.github/workflows/check-build-tools-image.yml b/.github/workflows/check-build-tools-image.yml index a1e22cf93f..97116940a0 100644 --- a/.github/workflows/check-build-tools-image.yml +++ b/.github/workflows/check-build-tools-image.yml @@ -19,7 +19,7 @@ permissions: {} jobs: check-image: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 outputs: tag: ${{ steps.get-build-tools-tag.outputs.image-tag }} found: ${{ steps.check-image.outputs.found }} diff --git a/.github/workflows/check-permissions.yml b/.github/workflows/check-permissions.yml index c3357c6cf8..9c42794797 100644 --- a/.github/workflows/check-permissions.yml +++ b/.github/workflows/check-permissions.yml @@ -16,7 +16,7 @@ permissions: {} jobs: check-permissions: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Disallow CI runs on PRs from forks if: | diff --git a/.github/workflows/cleanup-caches-by-a-branch.yml b/.github/workflows/cleanup-caches-by-a-branch.yml index d8c225dedb..0c074e36dc 100644 --- a/.github/workflows/cleanup-caches-by-a-branch.yml +++ b/.github/workflows/cleanup-caches-by-a-branch.yml @@ -9,7 +9,7 @@ on: jobs: cleanup: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Cleanup run: | diff --git a/.github/workflows/pg_clients.yml b/.github/workflows/pg_clients.yml index 50e3227a74..fef3aec754 100644 --- a/.github/workflows/pg_clients.yml +++ b/.github/workflows/pg_clients.yml @@ -20,7 +20,7 @@ concurrency: jobs: test-postgres-client-libs: # TODO: switch to gen2 runner, requires docker - runs-on: [ ubuntu-latest ] + runs-on: ubuntu-22.04 env: DEFAULT_PG_VERSION: 14 diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml index d495a158e8..024594532f 100644 --- a/.github/workflows/pin-build-tools-image.yml +++ b/.github/workflows/pin-build-tools-image.yml @@ -26,7 +26,7 @@ permissions: {} jobs: tag-image: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 env: FROM_TAG: ${{ inputs.from-tag }} diff --git a/.github/workflows/release-notify.yml b/.github/workflows/release-notify.yml index ba396dba74..8bd10e993c 100644 --- a/.github/workflows/release-notify.yml +++ b/.github/workflows/release-notify.yml @@ -19,7 +19,7 @@ on: jobs: notify: - runs-on: [ ubuntu-latest ] + runs-on: ubuntu-22.04 steps: - uses: neondatabase/dev-actions/release-pr-notify@main diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index fe24f6330e..90a3aaaf2d 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -26,7 +26,7 @@ defaults: jobs: create-storage-release-branch: if: ${{ github.event.schedule == '0 6 * * MON' || format('{0}', inputs.create-storage-release-branch) == 'true' }} - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 permissions: contents: write # for `git push` @@ -65,7 +65,7 @@ jobs: create-proxy-release-branch: if: ${{ github.event.schedule == '0 6 * * THU' || format('{0}', inputs.create-proxy-release-branch) == 'true' }} - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 permissions: contents: write # for `git push` diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml index 7111ee37fa..77928a343e 100644 --- a/.github/workflows/trigger-e2e-tests.yml +++ b/.github/workflows/trigger-e2e-tests.yml @@ -19,7 +19,7 @@ env: jobs: cancel-previous-e2e-tests: if: github.event_name == 'pull_request' - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Cancel previous e2e-tests runs for this PR @@ -31,7 +31,7 @@ jobs: 
--field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}" tag: - runs-on: [ ubuntu-latest ] + runs-on: ubuntu-22.04 outputs: build-tag: ${{ steps.build-tag.outputs.tag }} @@ -62,7 +62,7 @@ jobs: trigger-e2e-tests: needs: [ tag ] - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 env: TAG: ${{ needs.tag.outputs.build-tag }} steps: From 238fa47bee911d23730dc1e8e91defb0bf57dda9 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 30 May 2024 11:09:27 +0100 Subject: [PATCH 0888/1571] proxy fix wake compute rate limit (#7902) ## Problem We were rate limiting wake_compute in the wrong place ## Summary of changes Move wake_compute rate limit to after the permit is acquired. Also makes a slight refactor on normalize, as it caught my eye --- proxy/src/auth/backend.rs | 2 +- proxy/src/console/provider/neon.rs | 19 ++++++++++--------- proxy/src/lib.rs | 21 ++++++++++++--------- 3 files changed, 23 insertions(+), 19 deletions(-) diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 3555eba543..f757a15fbb 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -35,7 +35,7 @@ use crate::{ }, stream, url, }; -use crate::{scram, EndpointCacheKey, EndpointId, Normalize, RoleName}; +use crate::{scram, EndpointCacheKey, EndpointId, RoleName}; /// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality pub enum MaybeOwned<'a, T> { diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 5d691e5f15..d72229b029 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -13,7 +13,7 @@ use crate::{ http, metrics::{CacheOutcome, Metrics}, rate_limiter::EndpointRateLimiter, - scram, EndpointCacheKey, Normalize, + scram, EndpointCacheKey, }; use crate::{cache::Cached, context::RequestMonitoring}; use futures::TryFutureExt; @@ -281,14 +281,6 @@ impl super::Api for Api { return Ok(cached); } - // check rate limit - if !self - .wake_compute_endpoint_rate_limiter - .check(user_info.endpoint.normalize().into(), 1) - { - return Err(WakeComputeError::TooManyConnections); - } - let permit = self.locks.get_permit(&key).await?; // after getting back a permit - it's possible the cache was filled @@ -301,6 +293,15 @@ impl super::Api for Api { } } + // check rate limit + if !self + .wake_compute_endpoint_rate_limiter + .check(user_info.endpoint.normalize_intern(), 1) + { + info!(key = &*key, "found cached compute node info"); + return Err(WakeComputeError::TooManyConnections); + } + let mut node = permit.release_result(self.do_wake_compute(ctx, user_info).await)?; ctx.set_project(node.aux.clone()); let cold_start_info = node.aux.cold_start_info; diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index 35c1616481..ea92eaaa55 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -3,6 +3,7 @@ use std::convert::Infallible; use anyhow::{bail, Context}; +use intern::{EndpointIdInt, EndpointIdTag, InternId}; use tokio::task::JoinError; use tokio_util::sync::CancellationToken; use tracing::warn; @@ -129,20 +130,22 @@ macro_rules! 
smol_str_wrapper { const POOLER_SUFFIX: &str = "-pooler"; -pub trait Normalize { - fn normalize(&self) -> Self; -} - -impl + From> Normalize for S { +impl EndpointId { fn normalize(&self) -> Self { - if self.as_ref().ends_with(POOLER_SUFFIX) { - let mut s = self.as_ref().to_string(); - s.truncate(s.len() - POOLER_SUFFIX.len()); - s.into() + if let Some(stripped) = self.as_ref().strip_suffix(POOLER_SUFFIX) { + stripped.into() } else { self.clone() } } + + fn normalize_intern(&self) -> EndpointIdInt { + if let Some(stripped) = self.as_ref().strip_suffix(POOLER_SUFFIX) { + EndpointIdTag::get_interner().get_or_intern(stripped) + } else { + self.into() + } + } } // 90% of role name strings are 20 characters or less. From fddd11dd1a4d86623d1683ecc7f9f679a5132f89 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 30 May 2024 11:10:27 +0100 Subject: [PATCH 0889/1571] proxy: upload postgres connection options as json in the parquet upload (#7903) ## Problem https://github.com/neondatabase/cloud/issues/9943 ## Summary of changes Captures the postgres options, converts them to json, uploads them in parquet. --- libs/pq_proto/src/lib.rs | 7 ++- proxy/src/auth/backend/link.rs | 1 + proxy/src/auth/credentials.rs | 8 --- proxy/src/context.rs | 21 ++++++- proxy/src/context/parquet.rs | 83 +++++++++++++++++---------- proxy/src/proxy.rs | 2 + proxy/src/serverless/sql_over_http.rs | 13 +++-- 7 files changed, 89 insertions(+), 46 deletions(-) diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index 522b65f5d1..f8e578c6f2 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -50,12 +50,17 @@ pub enum FeStartupPacket { }, } -#[derive(Debug)] +#[derive(Debug, Clone, Default)] pub struct StartupMessageParams { params: HashMap, } impl StartupMessageParams { + /// Set parameter's value by its name. + pub fn insert(&mut self, name: &str, value: &str) { + self.params.insert(name.to_owned(), value.to_owned()); + } + /// Get parameter's value by its name. pub fn get(&self, name: &str) -> Option<&str> { self.params.get(name).map(|s| s.as_str()) diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index 415a4b7d85..5932e1337c 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -100,6 +100,7 @@ pub(super) async fn authenticate( .dbname(&db_info.dbname) .user(&db_info.user); + ctx.set_dbname(db_info.dbname.into()); ctx.set_user(db_info.user.into()); ctx.set_project(db_info.aux.clone()); info!("woken up a compute node"); diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 783a1a5a21..d06f5614f1 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -11,7 +11,6 @@ use crate::{ }; use itertools::Itertools; use pq_proto::StartupMessageParams; -use smol_str::SmolStr; use std::{collections::HashSet, net::IpAddr, str::FromStr}; use thiserror::Error; use tracing::{info, warn}; @@ -96,13 +95,6 @@ impl ComputeUserInfoMaybeEndpoint { let get_param = |key| params.get(key).ok_or(MissingKey(key)); let user: RoleName = get_param("user")?.into(); - // record the values if we have them - ctx.set_application(params.get("application_name").map(SmolStr::from)); - ctx.set_user(user.clone()); - if let Some(dbname) = params.get("database") { - ctx.set_dbname(dbname.into()); - } - // Project name might be passed via PG's command-line options. 
let endpoint_option = params .options_raw() diff --git a/proxy/src/context.rs b/proxy/src/context.rs index dfd3ef108e..ff79ba8275 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -2,6 +2,7 @@ use chrono::Utc; use once_cell::sync::OnceCell; +use pq_proto::StartupMessageParams; use smol_str::SmolStr; use std::net::IpAddr; use tokio::sync::mpsc; @@ -46,6 +47,7 @@ pub struct RequestMonitoring { pub(crate) auth_method: Option, success: bool, pub(crate) cold_start_info: ColdStartInfo, + pg_options: Option, // extra // This sender is here to keep the request monitoring channel open while requests are taking place. @@ -102,6 +104,7 @@ impl RequestMonitoring { success: false, rejected: None, cold_start_info: ColdStartInfo::Unknown, + pg_options: None, sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()), disconnect_sender: LOG_CHAN_DISCONNECT.get().and_then(|tx| tx.upgrade()), @@ -132,6 +135,18 @@ impl RequestMonitoring { self.latency_timer.cold_start_info(info); } + pub fn set_db_options(&mut self, options: StartupMessageParams) { + self.set_application(options.get("application_name").map(SmolStr::from)); + if let Some(user) = options.get("user") { + self.set_user(user.into()); + } + if let Some(dbname) = options.get("database") { + self.set_dbname(dbname.into()); + } + + self.pg_options = Some(options); + } + pub fn set_project(&mut self, x: MetricsAuxInfo) { if self.endpoint_id.is_none() { self.set_endpoint_id(x.endpoint_id.as_str().into()) @@ -155,8 +170,10 @@ impl RequestMonitoring { } } - pub fn set_application(&mut self, app: Option) { - self.application = app.or_else(|| self.application.clone()); + fn set_application(&mut self, app: Option) { + if let Some(app) = app { + self.application = Some(app); + } } pub fn set_dbname(&mut self, dbname: DbName) { diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index a213a32ca4..1355b7e1d8 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -13,7 +13,9 @@ use parquet::{ }, record::RecordWriter, }; +use pq_proto::StartupMessageParams; use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; +use serde::ser::SerializeMap; use tokio::{sync::mpsc, time}; use tokio_util::sync::CancellationToken; use tracing::{debug, info, Span}; @@ -87,6 +89,7 @@ pub struct RequestData { database: Option, project: Option, branch: Option, + pg_options: Option, auth_method: Option<&'static str>, error: Option<&'static str>, /// Success is counted if we form a HTTP response with sql rows inside @@ -101,6 +104,23 @@ pub struct RequestData { disconnect_timestamp: Option, } +struct Options<'a> { + options: &'a StartupMessageParams, +} + +impl<'a> serde::Serialize for Options<'a> { + fn serialize(&self, s: S) -> Result + where + S: serde::Serializer, + { + let mut state = s.serialize_map(None)?; + for (k, v) in self.options.iter() { + state.serialize_entry(k, v)?; + } + state.end() + } +} + impl From<&RequestMonitoring> for RequestData { fn from(value: &RequestMonitoring) -> Self { Self { @@ -113,6 +133,10 @@ impl From<&RequestMonitoring> for RequestData { database: value.dbname.as_deref().map(String::from), project: value.project.as_deref().map(String::from), branch: value.branch.as_deref().map(String::from), + pg_options: value + .pg_options + .as_ref() + .and_then(|options| serde_json::to_string(&Options { options }).ok()), auth_method: value.auth_method.as_ref().map(|x| match x { super::AuthMethod::Web => "web", super::AuthMethod::ScramSha256 => "scram_sha_256", @@ -494,6 +518,7 @@ mod tests { 
database: Some(hex::encode(rng.gen::<[u8; 16]>())), project: Some(hex::encode(rng.gen::<[u8; 16]>())), branch: Some(hex::encode(rng.gen::<[u8; 16]>())), + pg_options: None, auth_method: None, protocol: ["tcp", "ws", "http"][rng.gen_range(0..3)], region: "us-east-1", @@ -570,15 +595,15 @@ mod tests { assert_eq!( file_stats, [ - (1315314, 3, 6000), - (1315307, 3, 6000), - (1315367, 3, 6000), - (1315324, 3, 6000), - (1315454, 3, 6000), - (1315296, 3, 6000), - (1315088, 3, 6000), - (1315324, 3, 6000), - (438713, 1, 2000) + (1315874, 3, 6000), + (1315867, 3, 6000), + (1315927, 3, 6000), + (1315884, 3, 6000), + (1316014, 3, 6000), + (1315856, 3, 6000), + (1315648, 3, 6000), + (1315884, 3, 6000), + (438913, 1, 2000) ] ); @@ -608,11 +633,11 @@ mod tests { assert_eq!( file_stats, [ - (1222212, 5, 10000), - (1228362, 5, 10000), - (1230156, 5, 10000), - (1229518, 5, 10000), - (1220796, 5, 10000) + (1223214, 5, 10000), + (1229364, 5, 10000), + (1231158, 5, 10000), + (1230520, 5, 10000), + (1221798, 5, 10000) ] ); @@ -644,11 +669,11 @@ mod tests { assert_eq!( file_stats, [ - (1207859, 5, 10000), - (1207590, 5, 10000), - (1207883, 5, 10000), - (1207871, 5, 10000), - (1208126, 5, 10000) + (1208861, 5, 10000), + (1208592, 5, 10000), + (1208885, 5, 10000), + (1208873, 5, 10000), + (1209128, 5, 10000) ] ); @@ -673,15 +698,15 @@ mod tests { assert_eq!( file_stats, [ - (1315314, 3, 6000), - (1315307, 3, 6000), - (1315367, 3, 6000), - (1315324, 3, 6000), - (1315454, 3, 6000), - (1315296, 3, 6000), - (1315088, 3, 6000), - (1315324, 3, 6000), - (438713, 1, 2000) + (1315874, 3, 6000), + (1315867, 3, 6000), + (1315927, 3, 6000), + (1315884, 3, 6000), + (1316014, 3, 6000), + (1315856, 3, 6000), + (1315648, 3, 6000), + (1315884, 3, 6000), + (438913, 1, 2000) ] ); @@ -718,7 +743,7 @@ mod tests { // files are smaller than the size threshold, but they took too long to fill so were flushed early assert_eq!( file_stats, - [(659462, 2, 3001), (659176, 2, 3000), (658972, 2, 2999)] + [(659836, 2, 3001), (659550, 2, 3000), (659346, 2, 2999)] ); tmpdir.close().unwrap(); diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 5824b70df9..95b46ae002 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -267,6 +267,8 @@ pub async fn handle_client( }; drop(pause); + ctx.set_db_options(params.clone()); + let hostname = mode.hostname(stream.get_ref()); let common_names = tls.map(|tls| &tls.common_names); diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 5376bddfd3..9a7cdc8577 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -17,6 +17,7 @@ use hyper1::http::HeaderValue; use hyper1::Response; use hyper1::StatusCode; use hyper1::{HeaderMap, Request}; +use pq_proto::StartupMessageParams; use serde_json::json; use serde_json::Value; use tokio::time; @@ -192,13 +193,13 @@ fn get_conn_info( let mut options = Option::None; + let mut params = StartupMessageParams::default(); + params.insert("user", &username); + params.insert("database", &dbname); for (key, value) in pairs { - match &*key { - "options" => { - options = Some(NeonOptions::parse_options_raw(&value)); - } - "application_name" => ctx.set_application(Some(value.into())), - _ => {} + params.insert(&key, &value); + if key == "options" { + options = Some(NeonOptions::parse_options_raw(&value)); } } From 9a081c230f6b1d2bff7fea1e201631a7e7ee4328 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 30 May 2024 12:02:38 +0100 Subject: [PATCH 0890/1571] proxy: lazily parse startup 
pg params (#7905) ## Problem proxy params being a `HashMap` when it contains just ``` application_name: psql database: neondb user: neondb_owner ``` is quite wasteful allocation wise. ## Summary of changes Keep the params in the wire protocol form, eg: ``` application_name\0psql\0database\0neondb\0user\0neondb_owner\0 ``` Using a linear search for the map is fast enough at small sizes, which is the normal case. --- Cargo.lock | 1 + libs/pq_proto/Cargo.toml | 1 + libs/pq_proto/src/lib.rs | 80 +++++++++++++++------------ proxy/src/serverless/sql_over_http.rs | 4 +- 4 files changed, 48 insertions(+), 38 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 794486e2e1..44edbabaf6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4113,6 +4113,7 @@ version = "0.1.0" dependencies = [ "byteorder", "bytes", + "itertools", "pin-project-lite", "postgres-protocol", "rand 0.8.5", diff --git a/libs/pq_proto/Cargo.toml b/libs/pq_proto/Cargo.toml index 6eeb3bafef..8afabe670e 100644 --- a/libs/pq_proto/Cargo.toml +++ b/libs/pq_proto/Cargo.toml @@ -7,6 +7,7 @@ license.workspace = true [dependencies] bytes.workspace = true byteorder.workspace = true +itertools.workspace = true pin-project-lite.workspace = true postgres-protocol.workspace = true rand.workspace = true diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index f8e578c6f2..cee3742017 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -7,8 +7,9 @@ pub mod framed; use byteorder::{BigEndian, ReadBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; +use itertools::Itertools; use serde::{Deserialize, Serialize}; -use std::{borrow::Cow, collections::HashMap, fmt, io, str}; +use std::{borrow::Cow, fmt, io, str}; // re-export for use in utils pageserver_feedback.rs pub use postgres_protocol::PG_EPOCH; @@ -50,20 +51,37 @@ pub enum FeStartupPacket { }, } +#[derive(Debug, Clone, Default)] +pub struct StartupMessageParamsBuilder { + params: BytesMut, +} + +impl StartupMessageParamsBuilder { + /// Set parameter's value by its name. + /// name and value must not contain a \0 byte + pub fn insert(&mut self, name: &str, value: &str) { + self.params.put(name.as_bytes()); + self.params.put(&b"\0"[..]); + self.params.put(value.as_bytes()); + self.params.put(&b"\0"[..]); + } + + pub fn freeze(self) -> StartupMessageParams { + StartupMessageParams { + params: self.params.freeze(), + } + } +} + #[derive(Debug, Clone, Default)] pub struct StartupMessageParams { - params: HashMap, + params: Bytes, } impl StartupMessageParams { - /// Set parameter's value by its name. - pub fn insert(&mut self, name: &str, value: &str) { - self.params.insert(name.to_owned(), value.to_owned()); - } - /// Get parameter's value by its name. pub fn get(&self, name: &str) -> Option<&str> { - self.params.get(name).map(|s| s.as_str()) + self.iter().find_map(|(k, v)| (k == name).then_some(v)) } /// Split command-line options according to PostgreSQL's logic, @@ -117,15 +135,19 @@ impl StartupMessageParams { /// Iterate through key-value pairs in an arbitrary order. pub fn iter(&self) -> impl Iterator { - self.params.iter().map(|(k, v)| (k.as_str(), v.as_str())) + let params = + std::str::from_utf8(&self.params).expect("should be validated as utf8 already"); + params.split_terminator('\0').tuples() } // This function is mostly useful in tests. 
#[doc(hidden)] pub fn new<'a, const N: usize>(pairs: [(&'a str, &'a str); N]) -> Self { - Self { - params: pairs.map(|(k, v)| (k.to_owned(), v.to_owned())).into(), + let mut b = StartupMessageParamsBuilder::default(); + for (k, v) in pairs { + b.insert(k, v) } + b.freeze() } } @@ -350,35 +372,21 @@ impl FeStartupPacket { (major_version, minor_version) => { // StartupMessage - // Parse pairs of null-terminated strings (key, value). - // See `postgres: ProcessStartupPacket, build_startup_packet`. - let mut tokens = str::from_utf8(&msg) - .map_err(|_e| { - ProtocolError::BadMessage("StartupMessage params: invalid utf-8".to_owned()) - })? - .strip_suffix('\0') // drop packet's own null - .ok_or_else(|| { - ProtocolError::Protocol( - "StartupMessage params: missing null terminator".to_string(), - ) - })? - .split_terminator('\0'); - - let mut params = HashMap::new(); - while let Some(name) = tokens.next() { - let value = tokens.next().ok_or_else(|| { - ProtocolError::Protocol( - "StartupMessage params: key without value".to_string(), - ) - })?; - - params.insert(name.to_owned(), value.to_owned()); - } + let s = str::from_utf8(&msg).map_err(|_e| { + ProtocolError::BadMessage("StartupMessage params: invalid utf-8".to_owned()) + })?; + let s = s.strip_suffix('\0').ok_or_else(|| { + ProtocolError::Protocol( + "StartupMessage params: missing null terminator".to_string(), + ) + })?; FeStartupPacket::StartupMessage { major_version, minor_version, - params: StartupMessageParams { params }, + params: StartupMessageParams { + params: msg.slice_ref(s.as_bytes()), + }, } } }; diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 9a7cdc8577..9d6a475aeb 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -17,7 +17,7 @@ use hyper1::http::HeaderValue; use hyper1::Response; use hyper1::StatusCode; use hyper1::{HeaderMap, Request}; -use pq_proto::StartupMessageParams; +use pq_proto::StartupMessageParamsBuilder; use serde_json::json; use serde_json::Value; use tokio::time; @@ -193,7 +193,7 @@ fn get_conn_info( let mut options = Option::None; - let mut params = StartupMessageParams::default(); + let mut params = StartupMessageParamsBuilder::default(); params.insert("user", &username); params.insert("database", &dbname); for (key, value) in pairs { From 167394a0735abb422e0f2b544af4b160edbd2f34 Mon Sep 17 00:00:00 2001 From: YukiSeino <36467282+SeinoYuki@users.noreply.github.com> Date: Thu, 30 May 2024 22:58:20 +0900 Subject: [PATCH 0891/1571] refacter : VirtualFile::open uses AsRef (#7908) ## Problem #7371 ## Summary of changes * The VirtualFile::open, open_with_options, and create methods use AsRef, similar to the standard library's std::fs APIs. --- pageserver/src/virtual_file.rs | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index b68f3a0e89..04d9386fab 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -344,21 +344,21 @@ macro_rules! with_file { impl VirtualFile { /// Open a file in read-only mode. Like File::open. - pub async fn open( - path: &Utf8Path, + pub async fn open>( + path: P, ctx: &RequestContext, ) -> Result { - Self::open_with_options(path, OpenOptions::new().read(true), ctx).await + Self::open_with_options(path.as_ref(), OpenOptions::new().read(true), ctx).await } /// Create a new file for writing. If the file exists, it will be truncated. /// Like File::create. 
- pub async fn create( - path: &Utf8Path, + pub async fn create>( + path: P, ctx: &RequestContext, ) -> Result { Self::open_with_options( - path, + path.as_ref(), OpenOptions::new().write(true).create(true).truncate(true), ctx, ) @@ -370,12 +370,13 @@ impl VirtualFile { /// Note: If any custom flags were set in 'open_options' through OpenOptionsExt, /// they will be applied also when the file is subsequently re-opened, not only /// on the first time. Make sure that's sane! - pub async fn open_with_options( - path: &Utf8Path, + pub async fn open_with_options>( + path: P, open_options: &OpenOptions, _ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */ ) -> Result { - let path_str = path.to_string(); + let path_ref = path.as_ref(); + let path_str = path_ref.to_string(); let parts = path_str.split('/').collect::>(); let (tenant_id, shard_id, timeline_id) = if parts.len() > 5 && parts[parts.len() - 5] == TENANTS_SEGMENT_NAME { @@ -401,7 +402,7 @@ impl VirtualFile { // where our caller doesn't get to use the returned VirtualFile before its // slot gets re-used by someone else. let file = observe_duration!(StorageIoOperation::Open, { - open_options.open(path.as_std_path()).await? + open_options.open(path_ref.as_std_path()).await? }); // Strip all options other than read and write. @@ -417,7 +418,7 @@ impl VirtualFile { let vfile = VirtualFile { handle: RwLock::new(handle), pos: 0, - path: path.to_path_buf(), + path: path_ref.to_path_buf(), open_options: reopen_options, tenant_id, shard_id, From 1eca8b8a6b56e82445d9d8354e7acfee97c80603 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Thu, 30 May 2024 10:03:17 -0400 Subject: [PATCH 0892/1571] fix(pageserver): ensure to_i128 works for metadata keys (#7895) field2 of metadata keys can be 0xFFFF because of the mapping. Allow 0xFFFF for `to_i128`. An alternative is to encode 0xFFFF as 0xFFFFFFFF (which is allowed in the original `to_i128`). But checking the places where field2 is referenced, the rest part of the system does not seem to depend on this assertion. Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/key.rs | 37 +++++++++++----------------------- 1 file changed, 12 insertions(+), 25 deletions(-) diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 2511de00d5..b00d48498c 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -1,6 +1,5 @@ use anyhow::{bail, Result}; use byteorder::{ByteOrder, BE}; -use bytes::BufMut; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::{Oid, TransactionId}; use serde::{Deserialize, Serialize}; @@ -53,14 +52,8 @@ impl Key { /// Encode a metadata key to a storage key. pub fn from_metadata_key_fixed_size(key: &[u8; METADATA_KEY_SIZE]) -> Self { assert!(is_metadata_key_slice(key), "key not in metadata key range"); - Key { - field1: key[0], - field2: u16::from_be_bytes(key[1..3].try_into().unwrap()) as u32, - field3: u32::from_be_bytes(key[3..7].try_into().unwrap()), - field4: u32::from_be_bytes(key[7..11].try_into().unwrap()), - field5: key[11], - field6: u32::from_be_bytes(key[12..16].try_into().unwrap()), - } + // Metadata key space ends at 0x7F so it's fine to directly convert it to i128. + Self::from_i128(i128::from_be_bytes(*key)) } /// Encode a metadata key to a storage key. 
@@ -68,17 +61,6 @@ impl Key { Self::from_metadata_key_fixed_size(key.try_into().expect("expect 16 byte metadata key")) } - /// Extract a metadata key to a writer. The result should always be 16 bytes. - pub fn extract_metadata_key_to_writer(&self, mut writer: impl BufMut) { - writer.put_u8(self.field1); - assert!(self.field2 <= 0xFFFF); - writer.put_u16(self.field2 as u16); - writer.put_u32(self.field3); - writer.put_u32(self.field4); - writer.put_u8(self.field5); - writer.put_u32(self.field6); - } - /// Get the range of metadata keys. pub const fn metadata_key_range() -> Range { Key { @@ -121,7 +103,7 @@ impl Key { /// As long as Neon does not support tablespace (because of lack of access to local file system), /// we can assume that only some predefined namespace OIDs are used which can fit in u16 pub fn to_i128(&self) -> i128 { - assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222); + assert!(self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222); (((self.field1 & 0x7F) as i128) << 120) | (((self.field2 & 0xFFFF) as i128) << 104) | ((self.field3 as i128) << 72) @@ -175,7 +157,7 @@ impl Key { } /// Convert a 18B slice to a key. This function should not be used for metadata keys because field2 is handled differently. - /// Use [`Key::from_metadata_key`] instead. + /// Use [`Key::from_i128`] instead if you want to handle 16B keys (i.e., metadata keys). pub fn from_slice(b: &[u8]) -> Self { Key { field1: b[0], @@ -188,7 +170,7 @@ impl Key { } /// Convert a key to a 18B slice. This function should not be used for metadata keys because field2 is handled differently. - /// Use [`Key::extract_metadata_key_to_writer`] instead. + /// Use [`Key::to_i128`] instead if you want to get a 16B key (i.e., metadata keys). pub fn write_to_byte_slice(&self, buf: &mut [u8]) { buf[0] = self.field1; BE::write_u32(&mut buf[1..5], self.field2); @@ -687,10 +669,15 @@ mod tests { let mut metadata_key = vec![AUX_KEY_PREFIX]; metadata_key.extend_from_slice(&[0xFF; 15]); let encoded_key = Key::from_metadata_key(&metadata_key); - let mut output_key = Vec::new(); - encoded_key.extract_metadata_key_to_writer(&mut output_key); + let output_key = encoded_key.to_i128().to_be_bytes(); assert_eq!(metadata_key, output_key); assert!(encoded_key.is_metadata_key()); assert!(is_metadata_key_slice(&metadata_key)); } + + #[test] + fn test_possible_largest_key() { + Key::from_i128(0x7FFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF); + // TODO: put this key into the system and see if anything breaks. + } } From 33395dcf4ef137b39d3ba8022e9e4d07e7de9ed4 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Thu, 30 May 2024 10:31:57 -0400 Subject: [PATCH 0893/1571] perf(pageserver): postpone vectored get fringe keyspace construction (#7904) Perf shows a significant amount of time is spent on `Keyspace::merge`. This pull request postpones merging keyspace until retrieving the layer, which contributes to a 30x improvement in aux keyspace basebackup time. 
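The win comes from making the fringe's bookkeeping cheap: as the `LayerFringe` diff below shows, each incoming `KeySpace` is now simply pushed onto a `Vec<KeySpace>`, and the ranges are coalesced only once, when the entry is popped, via `KeySpaceRandomAccum`. A rough standalone sketch of that pattern (plain `u64` ranges and a hand-rolled merge, not the actual pageserver types) looks like this:

```rust
// Illustrative only: accumulate range sets cheaply, coalesce once on consumption.
use std::ops::Range;

#[derive(Default)]
struct LazyKeySpace {
    // Each entry is one keyspace handed to us by the read path.
    parts: Vec<Vec<Range<u64>>>,
}

impl LazyKeySpace {
    // O(1): just remember the keyspace instead of merging it immediately.
    fn add(&mut self, ranges: Vec<Range<u64>>) {
        self.parts.push(ranges);
    }

    // Sort and coalesce everything once, at the point the layer is actually read.
    fn consume(self) -> Vec<Range<u64>> {
        let mut all: Vec<Range<u64>> = self.parts.into_iter().flatten().collect();
        all.sort_by_key(|r| r.start);
        let mut merged: Vec<Range<u64>> = Vec::new();
        for r in all {
            match merged.last_mut() {
                // Overlapping or touching ranges are folded into the previous one.
                Some(last) if r.start <= last.end => last.end = last.end.max(r.end),
                _ => merged.push(r),
            }
        }
        merged
    }
}

fn main() {
    let mut ks = LazyKeySpace::default();
    ks.add(vec![0..10, 20..30]);
    ks.add(vec![5..25]);
    assert_eq!(ks.consume(), vec![0..30]);
}
```

Deferring the merge turns repeated eager merges into a single coalescing pass per popped layer; the timings below, from the aux-file `pagebench` run touched in this PR, show the end-to-end effect: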
``` --- old 10000 files found in 0.580569459s --- new 10000 files found in 0.02995075s ``` Signed-off-by: Alex Chi Z --- pageserver/pagebench/src/cmd/aux_files.rs | 17 ++++++++++++----- pageserver/src/tenant/storage_layer.rs | 17 +++++++++++++---- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/pageserver/pagebench/src/cmd/aux_files.rs b/pageserver/pagebench/src/cmd/aux_files.rs index eb5b242a5f..bce3285606 100644 --- a/pageserver/pagebench/src/cmd/aux_files.rs +++ b/pageserver/pagebench/src/cmd/aux_files.rs @@ -5,6 +5,7 @@ use utils::lsn::Lsn; use std::collections::HashMap; use std::sync::Arc; +use std::time::Instant; /// Ingest aux files into the pageserver. #[derive(clap::Parser)] @@ -88,11 +89,17 @@ async fn main_impl(args: Args) -> anyhow::Result<()> { println!("ingested {file_cnt} files"); } - let files = mgmt_api_client - .list_aux_files(tenant_shard_id, timeline_id, Lsn(Lsn::MAX.0 - 1)) - .await?; - - println!("{} files found", files.len()); + for _ in 0..100 { + let start = Instant::now(); + let files = mgmt_api_client + .list_aux_files(tenant_shard_id, timeline_id, Lsn(Lsn::MAX.0 - 1)) + .await?; + println!( + "{} files found in {}s", + files.len(), + start.elapsed().as_secs_f64() + ); + } anyhow::Ok(()) } diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 9ccf20c0d4..0b3f841ccf 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -318,7 +318,7 @@ pub(crate) struct LayerFringe { #[derive(Debug)] struct LayerKeyspace { layer: ReadableLayer, - target_keyspace: KeySpace, + target_keyspace: Vec, } impl LayerFringe { @@ -336,6 +336,7 @@ impl LayerFringe { }; let removed = self.layers.remove_entry(&read_desc.layer_id); + match removed { Some(( _, @@ -343,7 +344,15 @@ impl LayerFringe { layer, target_keyspace, }, - )) => Some((layer, target_keyspace, read_desc.lsn_range)), + )) => { + let mut keyspace = KeySpaceRandomAccum::new(); + for ks in target_keyspace { + for part in ks.ranges { + keyspace.add_range(part); + } + } + Some((layer, keyspace.consume_keyspace(), read_desc.lsn_range)) + } None => unreachable!("fringe internals are always consistent"), } } @@ -358,7 +367,7 @@ impl LayerFringe { let entry = self.layers.entry(layer_id.clone()); match entry { Entry::Occupied(mut entry) => { - entry.get_mut().target_keyspace.merge(&keyspace); + entry.get_mut().target_keyspace.push(keyspace); } Entry::Vacant(entry) => { self.planned_reads_by_lsn.push(ReadDesc { @@ -367,7 +376,7 @@ impl LayerFringe { }); entry.insert(LayerKeyspace { layer, - target_keyspace: keyspace, + target_keyspace: vec![keyspace], }); } } From f20a9e760fc7371c84c550adcb3fe6c553610c96 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Thu, 30 May 2024 10:45:34 -0400 Subject: [PATCH 0894/1571] chore(pageserver): warn on delete non-existing file (#7847) Consider the following sequence of migration: ``` 1. user starts compute 2. force migrate to v2 3. user continues to write data ``` At the time of (3), the compute node is not aware that the page server does not contain replication states any more, and might continue to ingest neon-file records into the safekeeper. This will leave the pageserver store a partial replication state and cause some errors. For example, the compute could issue a deletion of some aux files in v1, but this file does not exist in v2. Therefore, we should ignore all these errors until everyone is migrated to v2. 
Also note that if we see this warning in prod, it is likely because we did not fully suspend users' compute when flipping the v1/v2 flag. Signed-off-by: Alex Chi Z --- pageserver/src/pgdatadir_mapping.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index afba34c6d1..4480c7df6e 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1552,7 +1552,7 @@ impl<'a> DatadirModification<'a> { self.tline.aux_file_size_estimator.on_add(content.len()); new_files.push((path, content)); } - (None, true) => anyhow::bail!("removing non-existing aux file: {}", path), + (None, true) => warn!("removing non-existing aux file: {}", path), } let new_val = aux_file::encode_file_value(&new_files)?; self.put(key, Value::Image(new_val.into())); From c18b1c06460f1a55d947cdad267da4f71278655c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 30 May 2024 17:45:48 +0200 Subject: [PATCH 0895/1571] Update tokio-epoll-uring for linux-raw-sys (#7918) Updates the `tokio-epoll-uring` dependency. There is [only one change](https://github.com/neondatabase/tokio-epoll-uring/compare/342ddd197a060a8354e8f11f4d12994419fff939...08ccfa94ff5507727bf4d8d006666b5b192e04c6), the adoption of linux-raw-sys for `statx` instead of using libc. Part of #7889. --- Cargo.lock | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 44edbabaf6..96ba5c8ec3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2915,6 +2915,12 @@ version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" +[[package]] +name = "linux-raw-sys" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0b5399f6804fbab912acbd8878ed3532d506b7c951b8f9f164ef90fef39e3f4" + [[package]] name = "lock_api" version = "0.4.10" @@ -6157,7 +6163,7 @@ dependencies = [ [[package]] name = "tokio-epoll-uring" version = "0.1.0" -source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939" +source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#08ccfa94ff5507727bf4d8d006666b5b192e04c6" dependencies = [ "futures", "nix 0.26.4", @@ -6669,11 +6675,12 @@ dependencies = [ [[package]] name = "uring-common" version = "0.1.0" -source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939" +source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#08ccfa94ff5507727bf4d8d006666b5b192e04c6" dependencies = [ "bytes", "io-uring", "libc", + "linux-raw-sys 0.6.4", ] [[package]] From 98dadf854383f7d96cd6b30c87ba49b1b5a48d1e Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 31 May 2024 09:18:58 +0100 Subject: [PATCH 0896/1571] pageserver: quieten some shutdown logs around logical size and flush (#7907) ## Problem Looking at several noisy shutdown logs: - In https://github.com/neondatabase/neon/issues/7861 we're hitting a log error with `InternalServerError(timeline shutting down\n'` on the checkpoint API handler. 
- In the field, we see initial_logical_size_calculation errors on shutdown, via DownloadError - In the field, we see errors logged from layer download code (independent of the error propagated) during shutdown Closes: https://github.com/neondatabase/neon/issues/7861 ## Summary of changes The theme of these changes is to avoid propagating anyhow::Errors for cases that aren't really unexpected error cases that we might want a stacktrace for, and avoid "Other" error variants unless we really do have unexpected error cases to propagate. - On the flush_frozen_layers path, use the `FlushLayerError` type throughout, rather than munging it into an anyhow::Error. Give FlushLayerError an explicit from_anyhow helper that checks for timeline cancellation, and uses it to give a Cancelled error instead of an Other error when the timeline is shutting down. - In logical size calculation, remove BackgroundCalculationError (this type was just a Cancelled variant and an Other variant), and instead use CalculateLogicalSizeError throughout. This can express a PageReconstructError, and has a From impl that translates cancel-like page reconstruct errors to Cancelled. - Replace CalculateLogicalSizeError's Other(anyhow::Error) variant case with a Decode(DeserializeError) variant, as this was the only kind of error we actually used in the Other case. - During layer download, drop out early if the timeline is shutting down, so that we don't do an `error!()` log of the shutdown error in this case. --- pageserver/src/http/routes.rs | 16 ++- pageserver/src/page_service.rs | 6 +- pageserver/src/pgdatadir_mapping.rs | 18 +++- pageserver/src/tenant.rs | 2 +- pageserver/src/tenant/storage_layer/layer.rs | 10 +- pageserver/src/tenant/timeline.rs | 100 ++++++++++-------- .../src/tenant/timeline/detach_ancestor.rs | 4 +- test_runner/regress/test_ondemand_download.py | 2 +- 8 files changed, 102 insertions(+), 56 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 8a061f3ae1..913d45d63c 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -74,6 +74,7 @@ use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; use crate::tenant::storage_layer::LayerName; use crate::tenant::timeline::CompactFlags; +use crate::tenant::timeline::CompactionError; use crate::tenant::timeline::Timeline; use crate::tenant::GetTimelineError; use crate::tenant::SpawnMode; @@ -1813,11 +1814,22 @@ async fn timeline_checkpoint_handler( timeline .freeze_and_flush() .await - .map_err(ApiError::InternalServerError)?; + .map_err(|e| { + match e { + tenant::timeline::FlushLayerError::Cancelled => ApiError::ShuttingDown, + other => ApiError::InternalServerError(other.into()), + + } + })?; timeline .compact(&cancel, flags, &ctx) .await - .map_err(|e| ApiError::InternalServerError(e.into()))?; + .map_err(|e| + match e { + CompactionError::ShuttingDown => ApiError::ShuttingDown, + CompactionError::Other(e) => ApiError::InternalServerError(e) + } + )?; if wait_until_uploaded { timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?; diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index e9651165b1..35150b210e 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -66,6 +66,7 @@ use crate::tenant::mgr::GetTenantError; use crate::tenant::mgr::ShardResolveResult; use crate::tenant::mgr::ShardSelector; use crate::tenant::mgr::TenantManager; +use 
crate::tenant::timeline::FlushLayerError; use crate::tenant::timeline::WaitLsnError; use crate::tenant::GetTimelineError; use crate::tenant::PageReconstructError; @@ -830,7 +831,10 @@ impl PageServerHandler { // We only want to persist the data, and it doesn't matter if it's in the // shape of deltas or images. info!("flushing layers"); - timeline.freeze_and_flush().await?; + timeline.freeze_and_flush().await.map_err(|e| match e { + FlushLayerError::Cancelled => QueryError::Shutdown, + other => QueryError::Other(other.into()), + })?; info!("done"); Ok(()) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 4480c7df6e..0fc846e5f3 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -78,11 +78,19 @@ pub enum LsnForTimestamp { } #[derive(Debug, thiserror::Error)] -pub enum CalculateLogicalSizeError { +pub(crate) enum CalculateLogicalSizeError { #[error("cancelled")] Cancelled, + + /// Something went wrong while reading the metadata we use to calculate logical size + /// Note that cancellation variants of `PageReconstructError` are transformed to [`Self::Cancelled`] + /// in the `From` implementation for this variant. #[error(transparent)] - Other(#[from] anyhow::Error), + PageRead(PageReconstructError), + + /// Something went wrong deserializing metadata that we read to calculate logical size + #[error("decode error: {0}")] + Decode(#[from] DeserializeError), } #[derive(Debug, thiserror::Error)] @@ -110,7 +118,7 @@ impl From for CalculateLogicalSizeError { PageReconstructError::AncestorStopping(_) | PageReconstructError::Cancelled => { Self::Cancelled } - _ => Self::Other(pre.into()), + _ => Self::PageRead(pre), } } } @@ -763,7 +771,7 @@ impl Timeline { /// # Cancel-Safety /// /// This method is cancellation-safe. 
- pub async fn get_current_logical_size_non_incremental( + pub(crate) async fn get_current_logical_size_non_incremental( &self, lsn: Lsn, ctx: &RequestContext, @@ -772,7 +780,7 @@ impl Timeline { // Fetch list of database dirs and iterate them let buf = self.get(DBDIR_KEY, lsn, ctx).await?; - let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?; + let dbdir = DbDirectory::des(&buf)?; let mut total_size: u64 = 0; for (spcnode, dbnode) in dbdir.dbdirs.keys() { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index e6bfd57a44..311338554c 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -4154,7 +4154,7 @@ mod tests { .await?; writer.finish_write(lsn); } - tline.freeze_and_flush().await + tline.freeze_and_flush().await.map_err(|e| e.into()) } #[tokio::test] diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 3ac799c69a..1ec13882da 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -366,7 +366,10 @@ impl Layer { .0 .get_or_maybe_download(true, Some(ctx)) .await - .map_err(|err| GetVectoredError::Other(anyhow::anyhow!(err)))?; + .map_err(|err| match err { + DownloadError::DownloadCancelled => GetVectoredError::Cancelled, + other => GetVectoredError::Other(anyhow::anyhow!(other)), + })?; self.0 .access_stats @@ -1158,6 +1161,11 @@ impl LayerInner { let consecutive_failures = 1 + self.consecutive_failures.fetch_add(1, Ordering::Relaxed); + if timeline.cancel.is_cancelled() { + // If we're shutting down, drop out before logging the error + return Err(e); + } + tracing::error!(consecutive_failures, "layer file download failed: {e:#}"); let backoff = utils::backoff::exponential_backoff_duration_seconds( diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d4f6e25843..d07e2352fa 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -138,7 +138,7 @@ use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer}; #[derive(Debug, PartialEq, Eq, Clone, Copy)] -pub(super) enum FlushLoopState { +pub(crate) enum FlushLoopState { NotStarted, Running { #[cfg(test)] @@ -577,7 +577,7 @@ impl PageReconstructError { } #[derive(thiserror::Error, Debug)] -enum CreateImageLayersError { +pub(crate) enum CreateImageLayersError { #[error("timeline shutting down")] Cancelled, @@ -591,17 +591,35 @@ enum CreateImageLayersError { Other(#[from] anyhow::Error), } -#[derive(thiserror::Error, Debug)] -enum FlushLayerError { +#[derive(thiserror::Error, Debug, Clone)] +pub(crate) enum FlushLayerError { /// Timeline cancellation token was cancelled #[error("timeline shutting down")] Cancelled, + /// We tried to flush a layer while the Timeline is in an unexpected state + #[error("cannot flush frozen layers when flush_loop is not running, state is {0:?}")] + NotRunning(FlushLoopState), + + // Arc<> the following non-clonable error types: we must be Clone-able because the flush error is propagated from the flush + // loop via a watch channel, where we can only borrow it. 
#[error(transparent)] - CreateImageLayersError(CreateImageLayersError), + CreateImageLayersError(Arc), #[error(transparent)] - Other(#[from] anyhow::Error), + Other(#[from] Arc), +} + +impl FlushLayerError { + // When crossing from generic anyhow errors to this error type, we explicitly check + // for timeline cancellation to avoid logging inoffensive shutdown errors as warn/err. + fn from_anyhow(timeline: &Timeline, err: anyhow::Error) -> Self { + if timeline.cancel.is_cancelled() { + Self::Cancelled + } else { + Self::Other(Arc::new(err)) + } + } } #[derive(thiserror::Error, Debug)] @@ -696,7 +714,7 @@ impl From for FlushLayerError { fn from(e: CreateImageLayersError) -> Self { match e { CreateImageLayersError::Cancelled => FlushLayerError::Cancelled, - any => FlushLayerError::CreateImageLayersError(any), + any => FlushLayerError::CreateImageLayersError(Arc::new(any)), } } } @@ -1547,13 +1565,13 @@ impl Timeline { /// Flush to disk all data that was written with the put_* functions #[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))] - pub(crate) async fn freeze_and_flush(&self) -> anyhow::Result<()> { + pub(crate) async fn freeze_and_flush(&self) -> Result<(), FlushLayerError> { self.freeze_and_flush0().await } // This exists to provide a non-span creating version of `freeze_and_flush` we can call without // polluting the span hierarchy. - pub(crate) async fn freeze_and_flush0(&self) -> anyhow::Result<()> { + pub(crate) async fn freeze_and_flush0(&self) -> Result<(), FlushLayerError> { let to_lsn = self.freeze_inmem_layer(false).await; self.flush_frozen_layers_and_wait(to_lsn).await } @@ -2735,11 +2753,6 @@ impl Timeline { self.current_logical_size.initialized.add_permits(1); } - enum BackgroundCalculationError { - Cancelled, - Other(anyhow::Error), - } - let try_once = |attempt: usize| { let background_ctx = &background_ctx; let self_ref = &self; @@ -2757,10 +2770,10 @@ impl Timeline { (Some(permit), StartCircumstances::AfterBackgroundTasksRateLimit) } _ = self_ref.cancel.cancelled() => { - return Err(BackgroundCalculationError::Cancelled); + return Err(CalculateLogicalSizeError::Cancelled); } _ = cancel.cancelled() => { - return Err(BackgroundCalculationError::Cancelled); + return Err(CalculateLogicalSizeError::Cancelled); }, () = skip_concurrency_limiter.cancelled() => { // Some action that is part of a end user interaction requested logical size @@ -2787,18 +2800,7 @@ impl Timeline { .await { Ok(calculated_size) => Ok((calculated_size, metrics_guard)), - Err(CalculateLogicalSizeError::Cancelled) => { - Err(BackgroundCalculationError::Cancelled) - } - Err(CalculateLogicalSizeError::Other(err)) => { - if let Some(PageReconstructError::AncestorStopping(_)) = - err.root_cause().downcast_ref() - { - Err(BackgroundCalculationError::Cancelled) - } else { - Err(BackgroundCalculationError::Other(err)) - } - } + Err(e) => Err(e), } } }; @@ -2810,8 +2812,11 @@ impl Timeline { match try_once(attempt).await { Ok(res) => return ControlFlow::Continue(res), - Err(BackgroundCalculationError::Cancelled) => return ControlFlow::Break(()), - Err(BackgroundCalculationError::Other(e)) => { + Err(CalculateLogicalSizeError::Cancelled) => return ControlFlow::Break(()), + Err( + e @ (CalculateLogicalSizeError::Decode(_) + | CalculateLogicalSizeError::PageRead(_)), + ) => { warn!(attempt, "initial size calculation failed: {e:?}"); // exponential back-off doesn't make sense at these long intervals; // use fixed 
retry interval with generous jitter instead @@ -3717,7 +3722,9 @@ impl Timeline { return; } err @ Err( - FlushLayerError::Other(_) | FlushLayerError::CreateImageLayersError(_), + FlushLayerError::NotRunning(_) + | FlushLayerError::Other(_) + | FlushLayerError::CreateImageLayersError(_), ) => { error!("could not flush frozen layer: {err:?}"); break err.map(|_| ()); @@ -3763,7 +3770,10 @@ impl Timeline { /// `last_record_lsn` may be higher than the highest LSN of a frozen layer: if this is the case, /// it means no data will be written between the top of the highest frozen layer and to_lsn, /// e.g. because this tenant shard has ingested up to to_lsn and not written any data locally for that part of the WAL. - async fn flush_frozen_layers_and_wait(&self, last_record_lsn: Lsn) -> anyhow::Result<()> { + async fn flush_frozen_layers_and_wait( + &self, + last_record_lsn: Lsn, + ) -> Result<(), FlushLayerError> { let mut rx = self.layer_flush_done_tx.subscribe(); // Increment the flush cycle counter and wake up the flush task. @@ -3774,7 +3784,7 @@ impl Timeline { let flush_loop_state = { *self.flush_loop_state.lock().unwrap() }; if !matches!(flush_loop_state, FlushLoopState::Running { .. }) { - anyhow::bail!("cannot flush frozen layers when flush_loop is not running, state is {flush_loop_state:?}") + return Err(FlushLayerError::NotRunning(flush_loop_state)); } self.layer_flush_start_tx.send_modify(|(counter, lsn)| { @@ -3787,14 +3797,11 @@ impl Timeline { { let (last_result_counter, last_result) = &*rx.borrow(); if *last_result_counter >= my_flush_request { - if let Err(_err) = last_result { + if let Err(err) = last_result { // We already logged the original error in // flush_loop. We cannot propagate it to the caller // here, because it might not be Cloneable - anyhow::bail!( - "Could not flush frozen layer. Request id: {}", - my_flush_request - ); + return Err(err.clone()); } else { return Ok(()); } @@ -3803,7 +3810,7 @@ impl Timeline { trace!("waiting for flush to complete"); tokio::select! { rx_e = rx.changed() => { - rx_e?; + rx_e.map_err(|_| FlushLayerError::NotRunning(*self.flush_loop_state.lock().unwrap()))?; }, // Cancellation safety: we are not leaving an I/O in-flight for the flush, we're just ignoring // the notification from [`flush_loop`] that it completed. @@ -3875,7 +3882,8 @@ impl Timeline { EnumSet::empty(), ctx, ) - .await?; + .await + .map_err(|e| FlushLayerError::from_anyhow(self, e))?; if self.cancel.is_cancelled() { return Err(FlushLayerError::Cancelled); @@ -3899,7 +3907,8 @@ impl Timeline { Some(metadata_keyspace.0.ranges[0].clone()), ctx, ) - .await? + .await + .map_err(|e| FlushLayerError::from_anyhow(self, e))? } else { None }; @@ -3926,7 +3935,11 @@ impl Timeline { // Normal case, write out a L0 delta layer file. // `create_delta_layer` will not modify the layer map. // We will remove frozen layer and add delta layer in one atomic operation later. - let Some(layer) = self.create_delta_layer(&frozen_layer, None, ctx).await? else { + let Some(layer) = self + .create_delta_layer(&frozen_layer, None, ctx) + .await + .map_err(|e| FlushLayerError::from_anyhow(self, e))? 
+ else { panic!("delta layer cannot be empty if no filter is applied"); }; ( @@ -3959,7 +3972,8 @@ impl Timeline { if self.set_disk_consistent_lsn(disk_consistent_lsn) { // Schedule remote uploads that will reflect our new disk_consistent_lsn - self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?; + self.schedule_uploads(disk_consistent_lsn, layers_to_upload) + .map_err(|e| FlushLayerError::from_anyhow(self, e))?; } // release lock on 'layers' }; diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index e6ddabe5b5..4fc89330ba 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -1,6 +1,6 @@ use std::sync::Arc; -use super::{layer_manager::LayerManager, Timeline}; +use super::{layer_manager::LayerManager, FlushLayerError, Timeline}; use crate::{ context::{DownloadBehavior, RequestContext}, task_mgr::TaskKind, @@ -23,7 +23,7 @@ pub(crate) enum Error { #[error("shutting down, please retry later")] ShuttingDown, #[error("flushing failed")] - FlushAncestor(#[source] anyhow::Error), + FlushAncestor(#[source] FlushLayerError), #[error("layer download failed")] RewrittenDeltaDownloadFailed(#[source] anyhow::Error), #[error("copying LSN prefix locally failed")] diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index b137fb3a5c..6fe23846c7 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -402,7 +402,7 @@ def test_download_remote_layers_api( env.pageserver.allowed_errors.extend( [ ".*download failed: downloading evicted layer file failed.*", - f".*initial_size_calculation.*{tenant_id}.*{timeline_id}.*initial size calculation failed: downloading evicted layer file failed", + f".*initial_size_calculation.*{tenant_id}.*{timeline_id}.*initial size calculation failed.*downloading evicted layer file failed", ] ) From e6db8069b0a5476504e723ff1b5bbe079afbac0b Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 28 May 2024 21:28:44 +0300 Subject: [PATCH 0897/1571] neon_walreader: check after local read that the segment still exists. Otherwise read might receive zeros/garbage if the file is recycled (renamed) for as a future segment. --- pgxn/neon/neon_walreader.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/pgxn/neon/neon_walreader.c b/pgxn/neon/neon_walreader.c index e43f4d9d96..60eb8e1fc9 100644 --- a/pgxn/neon/neon_walreader.c +++ b/pgxn/neon/neon_walreader.c @@ -184,8 +184,8 @@ NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, Ti } else if (state->wre_errno == ENOENT) { - nwr_log(LOG, "local read failed as segment at %X/%X doesn't exist, attempting remote", - LSN_FORMAT_ARGS(startptr)); + nwr_log(LOG, "local read at %X/%X len %zu failed as segment file doesn't exist, attempting remote", + LSN_FORMAT_ARGS(startptr), count); return NeonWALReadRemote(state, buf, startptr, count, tli); } else @@ -614,6 +614,7 @@ NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size coun uint32 startoff; int segbytes; int readbytes; + XLogSegNo lastRemovedSegNo; startoff = XLogSegmentOffset(recptr, state->segcxt.ws_segsize); @@ -689,6 +690,23 @@ NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size coun return false; } + /* + * Recheck that the segment hasn't been removed while we were reading + * it. 
+ */ + lastRemovedSegNo = XLogGetLastRemovedSegno(); + if (state->seg.ws_segno <= lastRemovedSegNo) + { + char fname[MAXFNAMELEN]; + + state->wre_errno = ENOENT; + + XLogFileName(fname, tli, state->seg.ws_segno, state->segcxt.ws_segsize); + snprintf(state->err_msg, sizeof(state->err_msg), "WAL segment %s has been removed during the read, lastRemovedSegNo " UINT64_FORMAT, + fname, lastRemovedSegNo); + return false; + } + /* Update state for read */ recptr += readbytes; nbytes -= readbytes; From af40bf3c2ee11f23ef10f8e2d99edc3024137a3b Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 28 May 2024 13:34:04 +0300 Subject: [PATCH 0898/1571] Fix term/epoch confusion in python tests. Call epoch last_log_term and add separate term field. --- test_runner/fixtures/safekeeper/http.py | 6 ++++-- test_runner/regress/test_wal_acceptor.py | 10 +++++----- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index a5480f557f..11e6fef28f 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -19,7 +19,8 @@ class Walreceiver: @dataclass class SafekeeperTimelineStatus: - acceptor_epoch: int + term: int + last_log_term: int pg_version: int # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2 flush_lsn: Lsn commit_lsn: Lsn @@ -156,7 +157,8 @@ class SafekeeperHttpClient(requests.Session): resj = res.json() walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]] return SafekeeperTimelineStatus( - acceptor_epoch=resj["acceptor_state"]["epoch"], + term=resj["acceptor_state"]["term"], + last_log_term=resj["acceptor_state"]["epoch"], pg_version=resj["pg_info"]["pg_version"], flush_lsn=Lsn(resj["flush_lsn"]), commit_lsn=Lsn(resj["commit_lsn"]), diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index cff13e74ee..7dca6c3ec2 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -841,7 +841,7 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): # fetch something sensible from status tli_status = wa_http_cli.timeline_status(tenant_id, timeline_id) - epoch = tli_status.acceptor_epoch + term = tli_status.term timeline_start_lsn = tli_status.timeline_start_lsn if auth_enabled: @@ -862,8 +862,8 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): endpoint.safe_psql("insert into t values(10)") tli_status = wa_http_cli.timeline_status(tenant_id, timeline_id) - epoch_after_reboot = tli_status.acceptor_epoch - assert epoch_after_reboot > epoch + term_after_reboot = tli_status.term + assert term_after_reboot > term # and timeline_start_lsn stays the same assert tli_status.timeline_start_lsn == timeline_start_lsn @@ -1104,11 +1104,11 @@ def cmp_sk_wal(sks: List[Safekeeper], tenant_id: TenantId, timeline_id: Timeline # First check that term / flush_lsn are the same: it is easier to # report/understand if WALs are different due to that. 
statuses = [sk_http_cli.timeline_status(tenant_id, timeline_id) for sk_http_cli in sk_http_clis] - term_flush_lsns = [(s.acceptor_epoch, s.flush_lsn) for s in statuses] + term_flush_lsns = [(s.last_log_term, s.flush_lsn) for s in statuses] for tfl, sk in zip(term_flush_lsns[1:], sks[1:]): assert ( term_flush_lsns[0] == tfl - ), f"(term, flush_lsn) are not equal on sks {sks[0].id} and {sk.id}: {term_flush_lsns[0]} != {tfl}" + ), f"(last_log_term, flush_lsn) are not equal on sks {sks[0].id} and {sk.id}: {term_flush_lsns[0]} != {tfl}" # check that WALs are identic. segs = [sk.list_segments(tenant_id, timeline_id) for sk in sks] From 1fcc2b37eba2e3be3f248c50713a6cda02a99453 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 28 May 2024 13:43:28 +0300 Subject: [PATCH 0899/1571] Add test checking term change during pull_timeline. --- test_runner/regress/test_wal_acceptor.py | 60 ++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 7dca6c3ec2..dce30f5388 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -17,6 +17,7 @@ import psycopg2 import psycopg2.errors import psycopg2.extras import pytest +import requests from fixtures.broker import NeonBroker from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log @@ -1867,6 +1868,65 @@ def test_pull_timeline_gc(neon_env_builder: NeonEnvBuilder): assert digests[0] == digests[1], f"digest on src is {digests[0]} but on dst is {digests[1]}" +# Test pull_timeline while concurrently changing term on the donor: +# 1) Start pull_timeline, listing files to fetch. +# 2) Change term on the donor +# 3) Finish pull_timeline. +# +# Currently (until proper membership change procedure), we want to pull_timeline +# to fetch the log up to . This is unsafe if term +# changes during the procedure (unless timeline is locked all the time but we +# don't want that): recepient might end up with mix of WAL from different +# histories. Thus the schedule above is expected to fail. Later we'd allow +# pull_timeline to only initialize timeline to any valid state (up to +# commit_lsn), holding switch to fully new configuration until it recovers +# enough, so it won't be affected by term change anymore. +# +# Expected to fail while term check is not implemented. 
+@pytest.mark.xfail +def test_pull_timeline_term_change(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage()) + env = neon_env_builder.init_start() + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + (src_sk, dst_sk) = (env.safekeepers[0], env.safekeepers[2]) + + log.info("use only first 2 safekeepers, 3rd will be seeded") + ep = env.endpoints.create("main") + ep.active_safekeepers = [1, 2] + ep.start() + ep.safe_psql("create table t(key int, value text)") + ep.safe_psql("insert into t select generate_series(1, 1000), 'pear'") + + dst_http = dst_sk.http_client() + # run pull_timeline which will halt before downloading files + dst_http.configure_failpoints(("sk-pull-timeline-after-list-pausable", "pause")) + pt_handle = PropagatingThread( + target=dst_sk.pull_timeline, args=([src_sk], tenant_id, timeline_id) + ) + pt_handle.start() + dst_sk.wait_until_paused("sk-pull-timeline-after-list-pausable") + + src_http = src_sk.http_client() + term_before = src_http.timeline_status(tenant_id, timeline_id).term + + # restart compute to bump term + ep.stop() + ep = env.endpoints.create("main") + ep.active_safekeepers = [1, 2] + ep.start() + ep.safe_psql("insert into t select generate_series(1, 100), 'pear'") + + term_after = src_http.timeline_status(tenant_id, timeline_id).term + assert term_after > term_before, f"term_after={term_after}, term_before={term_before}" + + dst_http.configure_failpoints(("sk-pull-timeline-after-list-pausable", "off")) + with pytest.raises(requests.exceptions.HTTPError): + pt_handle.join() + + # In this test we check for excessive START_REPLICATION and START_WAL_PUSH queries # when compute is active, but there are no writes to the timeline. In that case # pageserver should maintain a single connection to safekeeper and don't attempt From 7ec70b5eff731f6072e4804050d49f0b951ef872 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 28 May 2024 12:47:04 +0300 Subject: [PATCH 0900/1571] safekeeper: rename epoch to last_log_term. epoch is a historical and potentially confusing name. It semantically means lastLogTerm from the raft paper, so let's use it. This commit changes only internal namings, not public interface (http). 
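(Editorial aside, not part of the patch: the renamed quantity can be illustrated with a minimal standalone Rust sketch. Term, Lsn and the history type below are simplified stand-ins for the safekeeper's real types; the point is only that last_log_term is the term of the highest record actually written, taken from the term history capped at flush_lsn, whereas the current term may already be higher.)

type Term = u64;
type Lsn = u64;

/// (term, start_lsn) pairs ordered by start_lsn; simplified stand-in for the
/// safekeeper's term history.
struct TermHistory(Vec<(Term, Lsn)>);

impl TermHistory {
    /// last_log_term is the term of the last history entry whose start LSN is
    /// at or below flush_lsn, i.e. the term of the highest entry in the log.
    fn last_log_term(&self, flush_lsn: Lsn) -> Term {
        self.0
            .iter()
            .take_while(|(_, start)| *start <= flush_lsn)
            .last()
            .map(|(term, _)| *term)
            .unwrap_or(0) // no WAL written yet
    }
}

fn main() {
    // Terms 1 and 2 begin at LSN 0 and LSN 100; with WAL flushed only up to
    // LSN 80 the last_log_term is still 1, even if the current term is 2.
    let history = TermHistory(vec![(1, 0), (2, 100)]);
    assert_eq!(history.last_log_term(80), 1);
    assert_eq!(history.last_log_term(150), 2);
}

(This is also why the AppendResponse comment in the patch stresses which term history a reported flush_lsn comes from: the same LSN can belong to different terms on diverged safekeepers.)
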
--- safekeeper/src/http/routes.rs | 8 +++--- safekeeper/src/json_ctrl.rs | 2 +- safekeeper/src/recovery.rs | 2 +- safekeeper/src/safekeeper.rs | 53 ++++++++++++++++++----------------- safekeeper/src/timeline.rs | 8 +++--- 5 files changed, 38 insertions(+), 35 deletions(-) diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 4aacd3421d..593e102e35 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -85,11 +85,11 @@ impl From for TermLsn { } } -/// Augment AcceptorState with epoch for convenience +/// Augment AcceptorState with last_log_term for convenience #[derive(Debug, Serialize, Deserialize)] pub struct AcceptorStateStatus { pub term: Term, - pub epoch: Term, + pub epoch: Term, // aka last_log_term pub term_history: Vec, } @@ -130,7 +130,7 @@ async fn timeline_status_handler(request: Request) -> Result) -> Result { let ar_hdr = AppendRequestHeader { term: donor.term, - epoch_start_lsn: Lsn::INVALID, // unused + term_start_lsn: Lsn::INVALID, // unused begin_lsn: Lsn(xlog_data.wal_start()), end_lsn: Lsn(xlog_data.wal_start()) + xlog_data.data().len() as u64, commit_lsn: Lsn::INVALID, // do not attempt to advance, peer communication anyway does it diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 2a620f5fef..4686c9aa8e 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -188,8 +188,8 @@ pub struct AcceptorState { } impl AcceptorState { - /// acceptor's epoch is the term of the highest entry in the log - pub fn get_epoch(&self, flush_lsn: Lsn) -> Term { + /// acceptor's last_log_term is the term of the highest entry in the log + pub fn get_last_log_term(&self, flush_lsn: Lsn) -> Term { let th = self.term_history.up_to(flush_lsn); match th.0.last() { Some(e) => e.term, @@ -305,9 +305,9 @@ pub struct AppendRequest { pub struct AppendRequestHeader { // safekeeper's current term; if it is higher than proposer's, the compute is out of date. pub term: Term, - // TODO: remove this field, it in unused -- LSN of term switch can be taken - // from ProposerElected (as well as from term history). - pub epoch_start_lsn: Lsn, + // TODO: remove this field from the protocol, it in unused -- LSN of term + // switch can be taken from ProposerElected (as well as from term history). + pub term_start_lsn: Lsn, /// start position of message in WAL pub begin_lsn: Lsn, /// end position of message in WAL @@ -326,9 +326,10 @@ pub struct AppendResponse { // Current term of the safekeeper; if it is higher than proposer's, the // compute is out of date. pub term: Term, - // NOTE: this is physical end of wal on safekeeper; currently it doesn't - // make much sense without taking epoch into account, as history can be - // diverged. + // Flushed end of wal on safekeeper; one should be always mindful from what + // term history this value comes, either checking history directly or + // observing term being set to one for which WAL truncation is known to have + // happened. pub flush_lsn: Lsn, // We report back our awareness about which WAL is committed, as this is // a criterion for walproposer --sync mode exit @@ -482,8 +483,8 @@ impl AcceptorProposerMessage { /// - messages from broker peers pub struct SafeKeeper { /// LSN since the proposer safekeeper currently talking to appends WAL; - /// determines epoch switch point. - pub epoch_start_lsn: Lsn, + /// determines last_log_term switch point. 
+ pub term_start_lsn: Lsn, pub state: TimelineState, // persistent state storage pub wal_store: WAL, @@ -511,7 +512,7 @@ where } Ok(SafeKeeper { - epoch_start_lsn: Lsn(0), + term_start_lsn: Lsn(0), state: TimelineState::new(state), wal_store, node_id, @@ -531,8 +532,10 @@ where self.state.acceptor_state.term } - pub fn get_epoch(&self) -> Term { - self.state.acceptor_state.get_epoch(self.flush_lsn()) + pub fn get_last_log_term(&self) -> Term { + self.state + .acceptor_state + .get_last_log_term(self.flush_lsn()) } /// wal_store wrapper avoiding commit_lsn <= flush_lsn violation when we don't have WAL yet. @@ -713,7 +716,7 @@ where // proceed, but to prevent commit_lsn surprisingly going down we should // either refuse the session (simpler) or skip the part we already have // from the stream (can be implemented). - if msg.term == self.get_epoch() && self.flush_lsn() > msg.start_streaming_at { + if msg.term == self.get_last_log_term() && self.flush_lsn() > msg.start_streaming_at { bail!("refusing ProposerElected which is going to overwrite correct WAL: term={}, flush_lsn={}, start_streaming_at={}; restarting the handshake should help", msg.term, self.flush_lsn(), msg.start_streaming_at) } @@ -788,7 +791,7 @@ where // Cache LSN where term starts to immediately fsync control file with // commit_lsn once we reach it -- sync-safekeepers finishes when // persisted commit_lsn on majority of safekeepers aligns. - self.epoch_start_lsn = match msg.term_history.0.last() { + self.term_start_lsn = match msg.term_history.0.last() { None => bail!("proposer elected with empty term history"), Some(term_lsn_start) => term_lsn_start.lsn, }; @@ -814,11 +817,11 @@ where self.state.inmem.commit_lsn = commit_lsn; - // If new commit_lsn reached epoch switch, force sync of control + // If new commit_lsn reached term switch, force sync of control // file: walproposer in sync mode is very interested when this // happens. Note: this is for sync-safekeepers mode only, as - // otherwise commit_lsn might jump over epoch_start_lsn. - if commit_lsn >= self.epoch_start_lsn && self.state.commit_lsn < self.epoch_start_lsn { + // otherwise commit_lsn might jump over term_start_lsn. + if commit_lsn >= self.term_start_lsn && self.state.commit_lsn < self.term_start_lsn { self.state.flush().await?; } @@ -933,7 +936,7 @@ where // Note: the check is too restrictive, generally we can update local // commit_lsn if our history matches (is part of) history of advanced // commit_lsn provider. - if sk_info.last_log_term == self.get_epoch() { + if sk_info.last_log_term == self.get_last_log_term() { self.update_commit_lsn(Lsn(sk_info.commit_lsn)).await?; } } @@ -1079,7 +1082,7 @@ mod tests { } #[tokio::test] - async fn test_epoch_switch() { + async fn test_last_log_term_switch() { let storage = InMemoryState { persisted_state: test_sk_state(), }; @@ -1089,7 +1092,7 @@ mod tests { let mut ar_hdr = AppendRequestHeader { term: 1, - epoch_start_lsn: Lsn(3), + term_start_lsn: Lsn(3), begin_lsn: Lsn(1), end_lsn: Lsn(2), commit_lsn: Lsn(0), @@ -1114,14 +1117,14 @@ mod tests { .await .unwrap(); - // check that AppendRequest before epochStartLsn doesn't switch epoch + // check that AppendRequest before term_start_lsn doesn't switch last_log_term. 
let resp = sk .process_msg(&ProposerAcceptorMessage::AppendRequest(append_request)) .await; assert!(resp.is_ok()); - assert_eq!(sk.get_epoch(), 0); + assert_eq!(sk.get_last_log_term(), 0); - // but record at epochStartLsn does the switch + // but record at term_start_lsn does the switch ar_hdr.begin_lsn = Lsn(2); ar_hdr.end_lsn = Lsn(3); append_request = AppendRequest { @@ -1133,7 +1136,7 @@ mod tests { .await; assert!(resp.is_ok()); sk.wal_store.truncate_wal(Lsn(3)).await.unwrap(); // imitate the complete record at 3 %) - assert_eq!(sk.get_epoch(), 1); + assert_eq!(sk.get_last_log_term(), 1); } #[test] diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index f30c503382..aa9ccfc21e 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -244,7 +244,7 @@ impl SharedState { timeline_id: ttid.timeline_id.as_ref().to_owned(), }), term: self.sk.state.acceptor_state.term, - last_log_term: self.sk.get_epoch(), + last_log_term: self.sk.get_last_log_term(), flush_lsn: self.sk.flush_lsn().0, // note: this value is not flushed to control file yet and can be lost commit_lsn: self.sk.state.inmem.commit_lsn.0, @@ -704,7 +704,7 @@ impl Timeline { pub async fn recovery_needed(&self, heartbeat_timeout: Duration) -> RecoveryNeededInfo { let ss = self.read_shared_state().await; let term = ss.sk.state.acceptor_state.term; - let last_log_term = ss.sk.get_epoch(); + let last_log_term = ss.sk.get_last_log_term(); let flush_lsn = ss.sk.flush_lsn(); // note that peers contain myself, but that's ok -- we are interested only in peers which are strictly ahead of us. let mut peers = ss.get_peers(heartbeat_timeout); @@ -844,7 +844,7 @@ impl Timeline { timeline_is_active: self.broker_active.load(Ordering::Relaxed), num_computes: self.walreceivers.get_num() as u32, last_removed_segno: state.last_removed_segno, - epoch_start_lsn: state.sk.epoch_start_lsn, + epoch_start_lsn: state.sk.term_start_lsn, mem_state: state.sk.state.inmem.clone(), persisted_state: state.sk.state.clone(), flush_lsn: state.sk.wal_store.flush_lsn(), @@ -867,7 +867,7 @@ impl Timeline { active: self.broker_active.load(Ordering::Relaxed), num_computes: self.walreceivers.get_num() as u32, last_removed_segno: state.last_removed_segno, - epoch_start_lsn: state.sk.epoch_start_lsn, + epoch_start_lsn: state.sk.term_start_lsn, mem_state: state.sk.state.inmem.clone(), write_lsn, write_record_lsn, From 5a394fde566eea238e317604a3e9b414bed04d1b Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 31 May 2024 13:31:42 +0100 Subject: [PATCH 0901/1571] pageserver: avoid spurious "bad state" logs/errors during shutdown (#7912) ## Problem - Initial size calculations tend to fail with `Bad state (not active)` Closes: https://github.com/neondatabase/neon/issues/7911 ## Summary of changes - In `wait_lsn`, return WaitLsnError::Cancelled rather than BadState when the state is Stopping - Replace PageReconstructError's `Other` variant with a specific `BadState` variant - Avoid returning anyhow::Error from get_ready_ancestor_timeline -- this was only used for the case where there was no ancestor. All callers of this function had implicitly checked that the ancestor timeline exists before calling it, so they can pass in the ancestor instead of handling an error. 
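(Editorial aside, not part of the patch: the state check described in the summary above can be sketched roughly as follows. TimelineState and WaitLsnError here are simplified stand-ins for the pageserver types, and the cancellation token is reduced to a bool; this is an illustration of the mapping, not the actual implementation.)

#[derive(Debug, Clone, Copy, PartialEq)]
enum TimelineState { Loading, Active, Stopping, Broken }

#[derive(Debug)]
enum WaitLsnError {
    // Shutting down is expected during restarts, so it maps to a quiet variant.
    Shutdown,
    // Any other non-active state is surfaced together with the state itself.
    BadState(TimelineState),
}

fn check_wait_lsn_state(state: TimelineState, cancelled: bool) -> Result<(), WaitLsnError> {
    if cancelled || matches!(state, TimelineState::Stopping) {
        Err(WaitLsnError::Shutdown)
    } else if !matches!(state, TimelineState::Active) {
        Err(WaitLsnError::BadState(state))
    } else {
        Ok(())
    }
}

fn main() {
    // A Stopping timeline no longer produces a "bad state" error during shutdown.
    assert!(matches!(
        check_wait_lsn_state(TimelineState::Stopping, false),
        Err(WaitLsnError::Shutdown)
    ));
    // A genuinely broken timeline still reports which state it was in.
    assert!(matches!(
        check_wait_lsn_state(TimelineState::Broken, false),
        Err(WaitLsnError::BadState(TimelineState::Broken))
    ));
}

(With this shape, the Shutdown case can keep being treated as a benign reconnect/exit, while BadState carries enough context for messages like the "Bad state on timeline ...: Broken" assertion updated in the tests below.)
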
--- pageserver/src/page_service.rs | 4 +- pageserver/src/tenant.rs | 9 ++-- pageserver/src/tenant/timeline.rs | 89 +++++++++++++++---------------- 3 files changed, 51 insertions(+), 51 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 35150b210e..ae389826d5 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -373,7 +373,7 @@ impl From for PageStreamError { match value { e @ WaitLsnError::Timeout(_) => Self::LsnTimeout(e), WaitLsnError::Shutdown => Self::Shutdown, - WaitLsnError::BadState => Self::Reconnect("Timeline is not active".into()), + e @ WaitLsnError::BadState { .. } => Self::Reconnect(format!("{e}").into()), } } } @@ -383,7 +383,7 @@ impl From for QueryError { match value { e @ WaitLsnError::Timeout(_) => Self::Other(anyhow::Error::new(e)), WaitLsnError::Shutdown => Self::Shutdown, - WaitLsnError::BadState => Self::Reconnect, + WaitLsnError::BadState { .. } => Self::Reconnect, } } } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 311338554c..0a9637884f 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1507,7 +1507,7 @@ impl Tenant { .wait_lsn(*lsn, timeline::WaitLsnWaiter::Tenant, ctx) .await .map_err(|e| match e { - e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => { + e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState { .. }) => { CreateTimelineError::AncestorLsn(anyhow::anyhow!(e)) } WaitLsnError::Shutdown => CreateTimelineError::ShuttingDown, @@ -4308,9 +4308,10 @@ mod tests { // This needs to traverse to the parent, and fails. let err = newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await.unwrap_err(); - assert!(err - .to_string() - .contains("will not become active. Current state: Broken")); + assert!(err.to_string().starts_with(&format!( + "Bad state on timeline {}: Broken", + tline.timeline_id + ))); Ok(()) } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d07e2352fa..b498876465 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -496,7 +496,7 @@ pub(crate) enum PageReconstructError { Other(#[from] anyhow::Error), #[error("Ancestor LSN wait error: {0}")] - AncestorLsnTimeout(#[from] WaitLsnError), + AncestorLsnTimeout(WaitLsnError), #[error("timeline shutting down")] Cancelled, @@ -651,11 +651,14 @@ pub(crate) enum GetReadyAncestorError { #[error("Ancestor LSN wait error: {0}")] AncestorLsnTimeout(#[from] WaitLsnError), + #[error("Bad state on timeline {timeline_id}: {state:?}")] + BadState { + timeline_id: TimelineId, + state: TimelineState, + }, + #[error("Cancelled")] Cancelled, - - #[error(transparent)] - Other(#[from] anyhow::Error), } #[derive(Clone, Copy)] @@ -690,8 +693,8 @@ pub(crate) enum WaitLsnError { Shutdown, // Called on an timeline not in active state or shutting down - #[error("Bad state (not active)")] - BadState, + #[error("Bad timeline state: {0:?}")] + BadState(TimelineState), // Timeout expired while waiting for LSN to catch up with goal. #[error("{0}")] @@ -756,8 +759,8 @@ impl From for PageReconstructError { match e { AncestorStopping(tid) => PageReconstructError::AncestorStopping(tid), AncestorLsnTimeout(wait_err) => PageReconstructError::AncestorLsnTimeout(wait_err), + bad_state @ BadState { .. 
} => PageReconstructError::Other(anyhow::anyhow!(bad_state)), Cancelled => PageReconstructError::Cancelled, - Other(other) => PageReconstructError::Other(other), } } } @@ -1466,10 +1469,11 @@ impl Timeline { who_is_waiting: WaitLsnWaiter<'_>, ctx: &RequestContext, /* Prepare for use by cancellation */ ) -> Result<(), WaitLsnError> { - if self.cancel.is_cancelled() { + let state = self.current_state(); + if self.cancel.is_cancelled() || matches!(state, TimelineState::Stopping) { return Err(WaitLsnError::Shutdown); - } else if !self.is_active() { - return Err(WaitLsnError::BadState); + } else if !matches!(state, TimelineState::Active) { + return Err(WaitLsnError::BadState(state)); } if cfg!(debug_assertions) { @@ -3193,17 +3197,21 @@ impl Timeline { } // Recurse into ancestor if needed - if is_inherited_key(key) && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn { - trace!( - "going into ancestor {}, cont_lsn is {}", - timeline.ancestor_lsn, - cont_lsn - ); + if let Some(ancestor_timeline) = timeline.ancestor_timeline.as_ref() { + if is_inherited_key(key) && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn { + trace!( + "going into ancestor {}, cont_lsn is {}", + timeline.ancestor_lsn, + cont_lsn + ); - timeline_owned = timeline.get_ready_ancestor_timeline(ctx).await?; - timeline = &*timeline_owned; - prev_lsn = None; - continue 'outer; + timeline_owned = timeline + .get_ready_ancestor_timeline(ancestor_timeline, ctx) + .await?; + timeline = &*timeline_owned; + prev_lsn = None; + continue 'outer; + } } let guard = timeline.layers.read().await; @@ -3352,10 +3360,10 @@ impl Timeline { break None; } - // Not fully retrieved but no ancestor timeline. - if timeline.ancestor_timeline.is_none() { + let Some(ancestor_timeline) = timeline.ancestor_timeline.as_ref() else { + // Not fully retrieved but no ancestor timeline. break Some(keyspace); - } + }; // Now we see if there are keys covered by the image layer but does not exist in the // image layer, which means that the key does not exist. @@ -3375,7 +3383,7 @@ impl Timeline { // Take the min to avoid reconstructing a page with data newer than request Lsn. cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1)); timeline_owned = timeline - .get_ready_ancestor_timeline(ctx) + .get_ready_ancestor_timeline(ancestor_timeline, ctx) .await .map_err(GetVectoredError::GetReadyAncestorError)?; timeline = &*timeline_owned; @@ -3547,13 +3555,9 @@ impl Timeline { async fn get_ready_ancestor_timeline( &self, + ancestor: &Arc, ctx: &RequestContext, ) -> Result, GetReadyAncestorError> { - let ancestor = match self.get_ancestor_timeline() { - Ok(timeline) => timeline, - Err(e) => return Err(GetReadyAncestorError::from(e)), - }; - // It's possible that the ancestor timeline isn't active yet, or // is active but hasn't yet caught up to the branch point. Wait // for it. @@ -3586,11 +3590,10 @@ impl Timeline { )); } Err(state) => { - return Err(GetReadyAncestorError::Other(anyhow::anyhow!( - "Timeline {} will not become active. 
Current state: {:?}", - ancestor.timeline_id, - &state, - ))); + return Err(GetReadyAncestorError::BadState { + timeline_id: ancestor.timeline_id, + state, + }); } } ancestor @@ -3599,21 +3602,17 @@ impl Timeline { .map_err(|e| match e { e @ WaitLsnError::Timeout(_) => GetReadyAncestorError::AncestorLsnTimeout(e), WaitLsnError::Shutdown => GetReadyAncestorError::Cancelled, - e @ WaitLsnError::BadState => GetReadyAncestorError::Other(anyhow::anyhow!(e)), + WaitLsnError::BadState(state) => GetReadyAncestorError::BadState { + timeline_id: ancestor.timeline_id, + state, + }, })?; - Ok(ancestor) + Ok(ancestor.clone()) } - pub(crate) fn get_ancestor_timeline(&self) -> anyhow::Result> { - let ancestor = self.ancestor_timeline.as_ref().with_context(|| { - format!( - "Ancestor is missing. Timeline id: {} Ancestor id {:?}", - self.timeline_id, - self.get_ancestor_timeline_id(), - ) - })?; - Ok(Arc::clone(ancestor)) + pub(crate) fn get_ancestor_timeline(&self) -> Option> { + self.ancestor_timeline.clone() } pub(crate) fn get_shard_identity(&self) -> &ShardIdentity { From 16b2e74037dfcacec2ceeef3bc597e75b435cc1b Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Fri, 31 May 2024 14:19:45 +0100 Subject: [PATCH 0902/1571] Add FullAccessTimeline guard in safekeepers (#7887) This is a preparation for https://github.com/neondatabase/neon/issues/6337. The idea is to add FullAccessTimeline, which will act as a guard for tasks requiring access to WAL files. Eviction will be blocked on these tasks and WAL won't be deleted from disk until there is at least one active FullAccessTimeline. To get FullAccessTimeline, tasks call `tli.full_access_guard().await?`. After eviction is implemented, this function will be responsible for downloading missing WAL file and waiting until the download finishes. This commit also contains other small refactorings: - Separate `get_tenant_dir` and `get_timeline_dir` functions for building a local path. This is useful for looking at usages and finding tasks requiring access to local filesystem. 
- `timeline_manager` is now responsible for spawning all background tasks - WAL removal task is now spawned instantly after horizon is updated --- libs/remote_storage/src/lib.rs | 4 +- pageserver/src/deletion_queue.rs | 2 +- safekeeper/src/bin/safekeeper.rs | 11 +- safekeeper/src/control_file.rs | 37 ++- safekeeper/src/copy_timeline.rs | 26 +- safekeeper/src/debug_dump.rs | 38 +-- safekeeper/src/http/routes.rs | 22 +- safekeeper/src/json_ctrl.rs | 20 +- safekeeper/src/lib.rs | 14 +- safekeeper/src/pull_timeline.rs | 6 +- safekeeper/src/receive_wal.rs | 19 +- safekeeper/src/recovery.rs | 100 ++++++- safekeeper/src/remove_wal.rs | 56 ++-- safekeeper/src/safekeeper.rs | 19 -- safekeeper/src/send_wal.rs | 21 +- safekeeper/src/timeline.rs | 379 +++++++++---------------- safekeeper/src/timeline_manager.rs | 331 +++++++++++++++++---- safekeeper/src/timelines_global_map.rs | 21 +- safekeeper/src/wal_backup.rs | 100 +++---- safekeeper/src/wal_backup_partial.rs | 27 +- safekeeper/src/wal_storage.rs | 21 +- test_runner/fixtures/common_types.py | 12 + test_runner/fixtures/neon_fixtures.py | 16 ++ 23 files changed, 726 insertions(+), 576 deletions(-) diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 708662f20f..cb3df0985d 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -121,8 +121,8 @@ impl RemotePath { self.0.file_name() } - pub fn join(&self, segment: &Utf8Path) -> Self { - Self(self.0.join(segment)) + pub fn join(&self, path: impl AsRef) -> Self { + Self(self.0.join(path)) } pub fn get_path(&self) -> &Utf8PathBuf { diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 8790a9b0a8..3960fc1b99 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -311,7 +311,7 @@ impl DeletionList { result.extend( timeline_layers .into_iter() - .map(|l| timeline_remote_path.join(&Utf8PathBuf::from(l))), + .map(|l| timeline_remote_path.join(Utf8PathBuf::from(l))), ); } } diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index aee3898ac7..7476654426 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -29,13 +29,12 @@ use safekeeper::defaults::{ DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, }; -use safekeeper::remove_wal; +use safekeeper::http; use safekeeper::wal_service; use safekeeper::GlobalTimelines; use safekeeper::SafeKeeperConf; use safekeeper::{broker, WAL_SERVICE_RUNTIME}; use safekeeper::{control_file, BROKER_RUNTIME}; -use safekeeper::{http, WAL_REMOVER_RUNTIME}; use safekeeper::{wal_backup, HTTP_RUNTIME}; use storage_broker::DEFAULT_ENDPOINT; use utils::auth::{JwtAuth, Scope, SwappableJwtAuth}; @@ -441,14 +440,6 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { .map(|res| ("broker main".to_owned(), res)); tasks_handles.push(Box::pin(broker_task_handle)); - let conf_ = conf.clone(); - let wal_remover_handle = current_thread_rt - .as_ref() - .unwrap_or_else(|| WAL_REMOVER_RUNTIME.handle()) - .spawn(remove_wal::task_main(conf_)) - .map(|res| ("WAL remover".to_owned(), res)); - tasks_handles.push(Box::pin(wal_remover_handle)); - set_build_info_metric(GIT_VERSION, BUILD_TAG); // TODO: update tokio-stream, convert to real async Stream with diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index fe9f2e6899..e9bb5202da 100644 --- a/safekeeper/src/control_file.rs +++ 
b/safekeeper/src/control_file.rs @@ -2,7 +2,7 @@ use anyhow::{bail, ensure, Context, Result}; use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; -use camino::Utf8PathBuf; +use camino::{Utf8Path, Utf8PathBuf}; use tokio::fs::File; use tokio::io::AsyncWriteExt; use utils::crashsafe::durable_rename; @@ -12,9 +12,9 @@ use std::ops::Deref; use std::path::Path; use std::time::Instant; -use crate::control_file_upgrade::upgrade_control_file; use crate::metrics::PERSIST_CONTROL_FILE_SECONDS; use crate::state::TimelinePersistentState; +use crate::{control_file_upgrade::upgrade_control_file, timeline::get_timeline_dir}; use utils::{bin_ser::LeSer, id::TenantTimelineId}; use crate::SafeKeeperConf; @@ -43,7 +43,7 @@ pub trait Storage: Deref { pub struct FileStorage { // save timeline dir to avoid reconstructing it every time timeline_dir: Utf8PathBuf, - conf: SafeKeeperConf, + no_sync: bool, /// Last state persisted to disk. state: TimelinePersistentState, @@ -54,13 +54,12 @@ pub struct FileStorage { impl FileStorage { /// Initialize storage by loading state from disk. pub fn restore_new(ttid: &TenantTimelineId, conf: &SafeKeeperConf) -> Result { - let timeline_dir = conf.timeline_dir(ttid); - - let state = Self::load_control_file_conf(conf, ttid)?; + let timeline_dir = get_timeline_dir(conf, ttid); + let state = Self::load_control_file_from_dir(&timeline_dir)?; Ok(FileStorage { timeline_dir, - conf: conf.clone(), + no_sync: conf.no_sync, state, last_persist_at: Instant::now(), }) @@ -74,7 +73,7 @@ impl FileStorage { ) -> Result { let store = FileStorage { timeline_dir, - conf: conf.clone(), + no_sync: conf.no_sync, state, last_persist_at: Instant::now(), }; @@ -102,12 +101,9 @@ impl FileStorage { upgrade_control_file(buf, version) } - /// Load control file for given ttid at path specified by conf. - pub fn load_control_file_conf( - conf: &SafeKeeperConf, - ttid: &TenantTimelineId, - ) -> Result { - let path = conf.timeline_dir(ttid).join(CONTROL_FILE_NAME); + /// Load control file from given directory. 
+ pub fn load_control_file_from_dir(timeline_dir: &Utf8Path) -> Result { + let path = timeline_dir.join(CONTROL_FILE_NAME); Self::load_control_file(path) } @@ -203,7 +199,7 @@ impl Storage for FileStorage { })?; let control_path = self.timeline_dir.join(CONTROL_FILE_NAME); - durable_rename(&control_partial_path, &control_path, !self.conf.no_sync).await?; + durable_rename(&control_partial_path, &control_path, !self.no_sync).await?; // update internal state self.state = s.clone(); @@ -233,12 +229,13 @@ mod test { conf: &SafeKeeperConf, ttid: &TenantTimelineId, ) -> Result<(FileStorage, TimelinePersistentState)> { - fs::create_dir_all(conf.timeline_dir(ttid)) + let timeline_dir = get_timeline_dir(conf, ttid); + fs::create_dir_all(&timeline_dir) .await .expect("failed to create timeline dir"); Ok(( FileStorage::restore_new(ttid, conf)?, - FileStorage::load_control_file_conf(conf, ttid)?, + FileStorage::load_control_file_from_dir(&timeline_dir)?, )) } @@ -246,11 +243,11 @@ mod test { conf: &SafeKeeperConf, ttid: &TenantTimelineId, ) -> Result<(FileStorage, TimelinePersistentState)> { - fs::create_dir_all(conf.timeline_dir(ttid)) + let timeline_dir = get_timeline_dir(conf, ttid); + fs::create_dir_all(&timeline_dir) .await .expect("failed to create timeline dir"); let state = TimelinePersistentState::empty(); - let timeline_dir = conf.timeline_dir(ttid); let storage = FileStorage::create_new(timeline_dir, conf, state.clone())?; Ok((storage, state)) } @@ -291,7 +288,7 @@ mod test { .await .expect("failed to persist state"); } - let control_path = conf.timeline_dir(&ttid).join(CONTROL_FILE_NAME); + let control_path = get_timeline_dir(&conf, &ttid).join(CONTROL_FILE_NAME); let mut data = fs::read(&control_path).await.unwrap(); data[0] += 1; // change the first byte of the file to fail checksum validation fs::write(&control_path, &data) diff --git a/safekeeper/src/copy_timeline.rs b/safekeeper/src/copy_timeline.rs index 3023d4e2cb..51cf4db6b5 100644 --- a/safekeeper/src/copy_timeline.rs +++ b/safekeeper/src/copy_timeline.rs @@ -15,10 +15,10 @@ use crate::{ control_file::{FileStorage, Storage}, pull_timeline::{create_temp_timeline_dir, load_temp_timeline, validate_temp_timeline}, state::TimelinePersistentState, - timeline::{Timeline, TimelineError}, + timeline::{FullAccessTimeline, Timeline, TimelineError}, wal_backup::copy_s3_segments, wal_storage::{wal_file_paths, WalReader}, - GlobalTimelines, SafeKeeperConf, + GlobalTimelines, }; // we don't want to have more than 10 segments on disk after copy, because they take space @@ -46,12 +46,14 @@ pub async fn handle_request(request: Request) -> Result<()> { } } + let source_tli = request.source.full_access_guard().await?; + let conf = &GlobalTimelines::get_global_config(); let ttid = request.destination_ttid; let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?; - let (mem_state, state) = request.source.get_state().await; + let (mem_state, state) = source_tli.get_state().await; let start_lsn = state.timeline_start_lsn; if start_lsn == Lsn::INVALID { bail!("timeline is not initialized"); @@ -60,7 +62,7 @@ pub async fn handle_request(request: Request) -> Result<()> { { let commit_lsn = mem_state.commit_lsn; - let flush_lsn = request.source.get_flush_lsn().await; + let flush_lsn = source_tli.get_flush_lsn().await; info!( "collected info about source timeline: start_lsn={}, backup_lsn={}, commit_lsn={}, flush_lsn={}", @@ -127,10 +129,8 @@ pub async fn handle_request(request: Request) -> Result<()> { .await?; copy_disk_segments( - conf, - 
&state, + &source_tli, wal_seg_size, - &request.source.ttid, new_backup_lsn, request.until_lsn, &tli_dir_path, @@ -159,21 +159,13 @@ pub async fn handle_request(request: Request) -> Result<()> { } async fn copy_disk_segments( - conf: &SafeKeeperConf, - persisted_state: &TimelinePersistentState, + tli: &FullAccessTimeline, wal_seg_size: usize, - source_ttid: &TenantTimelineId, start_lsn: Lsn, end_lsn: Lsn, tli_dir_path: &Utf8PathBuf, ) -> Result<()> { - let mut wal_reader = WalReader::new( - conf.workdir.clone(), - conf.timeline_dir(source_ttid), - persisted_state, - start_lsn, - true, - )?; + let mut wal_reader = tli.get_walreader(start_lsn).await?; let mut buf = [0u8; MAX_SEND_SIZE]; diff --git a/safekeeper/src/debug_dump.rs b/safekeeper/src/debug_dump.rs index b50f2e1158..062ff4b3db 100644 --- a/safekeeper/src/debug_dump.rs +++ b/safekeeper/src/debug_dump.rs @@ -10,6 +10,7 @@ use std::sync::Arc; use anyhow::bail; use anyhow::Result; use camino::Utf8Path; +use camino::Utf8PathBuf; use chrono::{DateTime, Utc}; use postgres_ffi::XLogSegNo; use postgres_ffi::MAX_SEND_SIZE; @@ -26,7 +27,8 @@ use crate::safekeeper::TermHistory; use crate::send_wal::WalSenderState; use crate::state::TimelineMemState; use crate::state::TimelinePersistentState; -use crate::wal_storage::WalReader; +use crate::timeline::get_timeline_dir; +use crate::timeline::FullAccessTimeline; use crate::GlobalTimelines; use crate::SafeKeeperConf; @@ -68,6 +70,7 @@ pub struct Response { pub struct TimelineDumpSer { pub tli: Arc, pub args: Args, + pub timeline_dir: Utf8PathBuf, pub runtime: Arc, } @@ -85,14 +88,20 @@ impl Serialize for TimelineDumpSer { where S: serde::Serializer, { - let dump = self - .runtime - .block_on(build_from_tli_dump(self.tli.clone(), self.args.clone())); + let dump = self.runtime.block_on(build_from_tli_dump( + &self.tli, + &self.args, + &self.timeline_dir, + )); dump.serialize(serializer) } } -async fn build_from_tli_dump(timeline: Arc, args: Args) -> Timeline { +async fn build_from_tli_dump( + timeline: &Arc, + args: &Args, + timeline_dir: &Utf8Path, +) -> Timeline { let control_file = if args.dump_control_file { let mut state = timeline.get_state().await.1; if !args.dump_term_history { @@ -112,7 +121,8 @@ async fn build_from_tli_dump(timeline: Arc, args: Arg let disk_content = if args.dump_disk_content { // build_disk_content can fail, but we don't want to fail the whole // request because of that. - build_disk_content(&timeline.timeline_dir).ok() + // Note: timeline can be in offloaded state, this is not a problem. 
+ build_disk_content(timeline_dir).ok() } else { None }; @@ -186,6 +196,7 @@ pub struct FileInfo { pub async fn build(args: Args) -> Result { let start_time = Utc::now(); let timelines_count = GlobalTimelines::timelines_count(); + let config = GlobalTimelines::get_global_config(); let ptrs_snapshot = if args.tenant_id.is_some() && args.timeline_id.is_some() { // If both tenant_id and timeline_id are specified, we can just get the @@ -223,12 +234,11 @@ pub async fn build(args: Args) -> Result { timelines.push(TimelineDumpSer { tli, args: args.clone(), + timeline_dir: get_timeline_dir(&config, &ttid), runtime: runtime.clone(), }); } - let config = GlobalTimelines::get_global_config(); - Ok(Response { start_time, finish_time: Utc::now(), @@ -316,27 +326,19 @@ pub struct TimelineDigest { } pub async fn calculate_digest( - tli: &Arc, + tli: &FullAccessTimeline, request: TimelineDigestRequest, ) -> Result { if request.from_lsn > request.until_lsn { bail!("from_lsn is greater than until_lsn"); } - let conf = GlobalTimelines::get_global_config(); let (_, persisted_state) = tli.get_state().await; - if persisted_state.timeline_start_lsn > request.from_lsn { bail!("requested LSN is before the start of the timeline"); } - let mut wal_reader = WalReader::new( - conf.workdir.clone(), - tli.timeline_dir.clone(), - &persisted_state, - request.from_lsn, - true, - )?; + let mut wal_reader = tli.get_walreader(request.from_lsn).await?; let mut hasher = Sha256::new(); let mut buf = [0u8; MAX_SEND_SIZE]; diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 593e102e35..1e29b21fac 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -249,6 +249,10 @@ async fn timeline_digest_handler(request: Request) -> Result) -> Result let filename: String = parse_request_param(&request, "filename")?; let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?; + let tli = tli + .full_access_guard() + .await + .map_err(ApiError::InternalServerError)?; - let filepath = tli.timeline_dir.join(filename); + let filepath = tli.get_timeline_dir().join(filename); let mut file = File::open(&filepath) .await .map_err(|e| ApiError::InternalServerError(e.into()))?; @@ -287,7 +295,7 @@ async fn timeline_files_handler(request: Request) -> Result .map_err(|e| ApiError::InternalServerError(e.into())) } -/// Force persist control file and remove old WAL. +/// Force persist control file. async fn timeline_checkpoint_handler(request: Request) -> Result, ApiError> { check_permission(&request, None)?; @@ -297,13 +305,13 @@ async fn timeline_checkpoint_handler(request: Request) -> Result( async fn prepare_safekeeper( ttid: TenantTimelineId, pg_version: u32, -) -> anyhow::Result> { - GlobalTimelines::create( +) -> anyhow::Result { + let tli = GlobalTimelines::create( ttid, ServerInfo { pg_version, @@ -115,10 +113,16 @@ async fn prepare_safekeeper( Lsn::INVALID, Lsn::INVALID, ) - .await + .await?; + + tli.full_access_guard().await } -async fn send_proposer_elected(tli: &Arc, term: Term, lsn: Lsn) -> anyhow::Result<()> { +async fn send_proposer_elected( + tli: &FullAccessTimeline, + term: Term, + lsn: Lsn, +) -> anyhow::Result<()> { // add new term to existing history let history = tli.get_state().await.1.acceptor_state.term_history; let history = history.up_to(lsn.checked_sub(1u64).unwrap()); @@ -147,7 +151,7 @@ pub struct InsertedWAL { /// Extend local WAL with new LogicalMessage record. To do that, /// create AppendRequest with new WAL and pass it to safekeeper. 
pub async fn append_logical_message( - tli: &Arc, + tli: &FullAccessTimeline, msg: &AppendLogicalMessage, ) -> anyhow::Result { let wal_data = encode_logical_message(&msg.lm_prefix, &msg.lm_message); diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 8d8d2cf23e..1a56ff736c 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -7,10 +7,7 @@ use tokio::runtime::Runtime; use std::time::Duration; use storage_broker::Uri; -use utils::{ - auth::SwappableJwtAuth, - id::{NodeId, TenantId, TenantTimelineId}, -}; +use utils::{auth::SwappableJwtAuth, id::NodeId}; mod auth; pub mod broker; @@ -89,15 +86,6 @@ pub struct SafeKeeperConf { } impl SafeKeeperConf { - pub fn tenant_dir(&self, tenant_id: &TenantId) -> Utf8PathBuf { - self.workdir.join(tenant_id.to_string()) - } - - pub fn timeline_dir(&self, ttid: &TenantTimelineId) -> Utf8PathBuf { - self.tenant_dir(&ttid.tenant_id) - .join(ttid.timeline_id.to_string()) - } - pub fn is_wal_backup_enabled(&self) -> bool { self.remote_storage.is_some() && self.wal_backup_enabled } diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index f7cc40f58a..7b41c98cb8 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -17,7 +17,7 @@ use utils::{ use crate::{ control_file, debug_dump, http::routes::TimelineStatus, - timeline::{Timeline, TimelineError}, + timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError}, wal_storage::{self, Storage}, GlobalTimelines, SafeKeeperConf, }; @@ -283,13 +283,13 @@ pub async fn load_temp_timeline( } // Move timeline dir to the correct location - let timeline_path = conf.timeline_dir(&ttid); + let timeline_path = get_timeline_dir(conf, &ttid); info!( "moving timeline {} from {} to {}", ttid, tmp_path, timeline_path ); - tokio::fs::create_dir_all(conf.tenant_dir(&ttid.tenant_id)).await?; + tokio::fs::create_dir_all(get_tenant_dir(conf, &ttid.tenant_id)).await?; tokio::fs::rename(tmp_path, &timeline_path).await?; let tli = GlobalTimelines::load_timeline(&guard, ttid) diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 03cfa882c4..7943a2fd86 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -6,7 +6,7 @@ use crate::handler::SafekeeperPostgresHandler; use crate::safekeeper::AcceptorProposerMessage; use crate::safekeeper::ProposerAcceptorMessage; use crate::safekeeper::ServerInfo; -use crate::timeline::Timeline; +use crate::timeline::FullAccessTimeline; use crate::wal_service::ConnectionId; use crate::GlobalTimelines; use anyhow::{anyhow, Context}; @@ -213,7 +213,7 @@ impl SafekeeperPostgresHandler { &mut self, pgb: &mut PostgresBackend, ) -> Result<(), QueryError> { - let mut tli: Option> = None; + let mut tli: Option = None; if let Err(end) = self.handle_start_wal_push_guts(pgb, &mut tli).await { // Log the result and probably send it to the client, closing the stream. 
let handle_end_fut = pgb.handle_copy_stream_end(end); @@ -233,7 +233,7 @@ impl SafekeeperPostgresHandler { pub async fn handle_start_wal_push_guts( &mut self, pgb: &mut PostgresBackend, - tli: &mut Option>, + tli: &mut Option, ) -> Result<(), CopyStreamHandlerEnd> { // Notify the libpq client that it's allowed to send `CopyData` messages pgb.write_message(&BeMessage::CopyBothResponse).await?; @@ -323,7 +323,7 @@ struct NetworkReader<'a, IO> { impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> { async fn read_first_message( &mut self, - ) -> Result<(Arc, ProposerAcceptorMessage), CopyStreamHandlerEnd> { + ) -> Result<(FullAccessTimeline, ProposerAcceptorMessage), CopyStreamHandlerEnd> { // Receive information about server to create timeline, if not yet. let next_msg = read_message(self.pgb_reader).await?; let tli = match next_msg { @@ -337,7 +337,10 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> { system_id: greeting.system_id, wal_seg_size: greeting.wal_seg_size, }; - GlobalTimelines::create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID).await? + let tli = + GlobalTimelines::create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID) + .await?; + tli.full_access_guard().await? } _ => { return Err(CopyStreamHandlerEnd::Other(anyhow::anyhow!( @@ -353,7 +356,7 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> { msg_tx: Sender, msg_rx: Receiver, reply_tx: Sender, - tli: Arc, + tli: FullAccessTimeline, next_msg: ProposerAcceptorMessage, ) -> Result<(), CopyStreamHandlerEnd> { *self.acceptor_handle = Some(WalAcceptor::spawn( @@ -448,7 +451,7 @@ const KEEPALIVE_INTERVAL: Duration = Duration::from_secs(1); /// replies to reply_tx; reading from socket and writing to disk in parallel is /// beneficial for performance, this struct provides writing to disk part. pub struct WalAcceptor { - tli: Arc, + tli: FullAccessTimeline, msg_rx: Receiver, reply_tx: Sender, conn_id: Option, @@ -461,7 +464,7 @@ impl WalAcceptor { /// /// conn_id None means WalAcceptor is used by recovery initiated at this safekeeper. pub fn spawn( - tli: Arc, + tli: FullAccessTimeline, msg_rx: Receiver, reply_tx: Sender, conn_id: Option, diff --git a/safekeeper/src/recovery.rs b/safekeeper/src/recovery.rs index 568a512c4a..80a630b1e1 100644 --- a/safekeeper/src/recovery.rs +++ b/safekeeper/src/recovery.rs @@ -2,7 +2,7 @@ //! provide it, i.e. safekeeper lags too much. use std::time::SystemTime; -use std::{fmt, pin::pin, sync::Arc}; +use std::{fmt, pin::pin}; use anyhow::{bail, Context}; use futures::StreamExt; @@ -21,6 +21,7 @@ use utils::{id::NodeId, lsn::Lsn, postgres_client::wal_stream_connection_config} use crate::receive_wal::{WalAcceptor, REPLY_QUEUE_SIZE}; use crate::safekeeper::{AppendRequest, AppendRequestHeader}; +use crate::timeline::FullAccessTimeline; use crate::{ http::routes::TimelineStatus, receive_wal::MSG_QUEUE_SIZE, @@ -28,14 +29,14 @@ use crate::{ AcceptorProposerMessage, ProposerAcceptorMessage, ProposerElected, Term, TermHistory, TermLsn, VoteRequest, }, - timeline::{PeerInfo, Timeline}, + timeline::PeerInfo, SafeKeeperConf, }; /// Entrypoint for per timeline task which always runs, checking whether /// recovery for this safekeeper is needed and starting it if so. 
#[instrument(name = "recovery task", skip_all, fields(ttid = %tli.ttid))] -pub async fn recovery_main(tli: Arc, conf: SafeKeeperConf) { +pub async fn recovery_main(tli: FullAccessTimeline, conf: SafeKeeperConf) { info!("started"); let cancel = tli.cancel.clone(); @@ -47,6 +48,87 @@ pub async fn recovery_main(tli: Arc, conf: SafeKeeperConf) { } } +/// Should we start fetching WAL from a peer safekeeper, and if yes, from +/// which? Answer is yes, i.e. .donors is not empty if 1) there is something +/// to fetch, and we can do that without running elections; 2) there is no +/// actively streaming compute, as we don't want to compete with it. +/// +/// If donor(s) are choosen, theirs last_log_term is guaranteed to be equal +/// to its last_log_term so we are sure such a leader ever had been elected. +/// +/// All possible donors are returned so that we could keep connection to the +/// current one if it is good even if it slightly lags behind. +/// +/// Note that term conditions above might be not met, but safekeepers are +/// still not aligned on last flush_lsn. Generally in this case until +/// elections are run it is not possible to say which safekeeper should +/// recover from which one -- history which would be committed is different +/// depending on assembled quorum (e.g. classic picture 8 from Raft paper). +/// Thus we don't try to predict it here. +async fn recovery_needed( + tli: &FullAccessTimeline, + heartbeat_timeout: Duration, +) -> RecoveryNeededInfo { + let ss = tli.read_shared_state().await; + let term = ss.sk.state.acceptor_state.term; + let last_log_term = ss.sk.get_last_log_term(); + let flush_lsn = ss.sk.flush_lsn(); + // note that peers contain myself, but that's ok -- we are interested only in peers which are strictly ahead of us. + let mut peers = ss.get_peers(heartbeat_timeout); + // Sort by pairs. + peers.sort_by(|p1, p2| { + let tl1 = TermLsn { + term: p1.last_log_term, + lsn: p1.flush_lsn, + }; + let tl2 = TermLsn { + term: p2.last_log_term, + lsn: p2.flush_lsn, + }; + tl2.cmp(&tl1) // desc + }); + let num_streaming_computes = tli.get_walreceivers().get_num_streaming(); + let donors = if num_streaming_computes > 0 { + vec![] // If there is a streaming compute, don't try to recover to not intervene. + } else { + peers + .iter() + .filter_map(|candidate| { + // Are we interested in this candidate? + let candidate_tl = TermLsn { + term: candidate.last_log_term, + lsn: candidate.flush_lsn, + }; + let my_tl = TermLsn { + term: last_log_term, + lsn: flush_lsn, + }; + if my_tl < candidate_tl { + // Yes, we are interested. Can we pull from it without + // (re)running elections? It is possible if 1) his term + // is equal to his last_log_term so we could act on + // behalf of leader of this term (we must be sure he was + // ever elected) and 2) our term is not higher, or we'll refuse data. + if candidate.term == candidate.last_log_term && candidate.term >= term { + Some(Donor::from(candidate)) + } else { + None + } + } else { + None + } + }) + .collect() + }; + RecoveryNeededInfo { + term, + last_log_term, + flush_lsn, + peers, + num_streaming_computes, + donors, + } +} /// Result of Timeline::recovery_needed, contains donor(s) if recovery needed and /// fields to explain the choice. #[derive(Debug)] @@ -113,10 +195,10 @@ impl From<&PeerInfo> for Donor { const CHECK_INTERVAL_MS: u64 = 2000; /// Check regularly whether we need to start recovery. 
-async fn recovery_main_loop(tli: Arc, conf: SafeKeeperConf) { +async fn recovery_main_loop(tli: FullAccessTimeline, conf: SafeKeeperConf) { let check_duration = Duration::from_millis(CHECK_INTERVAL_MS); loop { - let recovery_needed_info = tli.recovery_needed(conf.heartbeat_timeout).await; + let recovery_needed_info = recovery_needed(&tli, conf.heartbeat_timeout).await; match recovery_needed_info.donors.first() { Some(donor) => { info!( @@ -146,7 +228,7 @@ async fn recovery_main_loop(tli: Arc, conf: SafeKeeperConf) { /// Recover from the specified donor. Returns message explaining normal finish /// reason or error. async fn recover( - tli: Arc, + tli: FullAccessTimeline, donor: &Donor, conf: &SafeKeeperConf, ) -> anyhow::Result { @@ -232,7 +314,7 @@ async fn recover( // Pull WAL from donor, assuming handshake is already done. async fn recovery_stream( - tli: Arc, + tli: FullAccessTimeline, donor: &Donor, start_streaming_at: Lsn, conf: &SafeKeeperConf, @@ -316,7 +398,7 @@ async fn network_io( physical_stream: ReplicationStream, msg_tx: Sender, donor: Donor, - tli: Arc, + tli: FullAccessTimeline, conf: SafeKeeperConf, ) -> anyhow::Result> { let mut physical_stream = pin!(physical_stream); @@ -365,7 +447,7 @@ async fn network_io( } ReplicationMessage::PrimaryKeepAlive(_) => { // keepalive means nothing is being streamed for a while. Check whether we need to stop. - let recovery_needed_info = tli.recovery_needed(conf.heartbeat_timeout).await; + let recovery_needed_info = recovery_needed(&tli, conf.heartbeat_timeout).await; // do current donors still contain one we currently connected to? if !recovery_needed_info .donors diff --git a/safekeeper/src/remove_wal.rs b/safekeeper/src/remove_wal.rs index 3400eee9b7..b661e48cb5 100644 --- a/safekeeper/src/remove_wal.rs +++ b/safekeeper/src/remove_wal.rs @@ -1,41 +1,25 @@ -//! Thread removing old WAL. +use utils::lsn::Lsn; -use std::time::Duration; +use crate::timeline_manager::StateSnapshot; -use tokio::time::sleep; -use tracing::*; +/// Get oldest LSN we still need to keep. We hold WAL till it is consumed +/// by all of 1) pageserver (remote_consistent_lsn) 2) peers 3) s3 +/// offloading. +/// While it is safe to use inmem values for determining horizon, +/// we use persistent to make possible normal states less surprising. +/// All segments covering LSNs before horizon_lsn can be removed. 
+pub fn calc_horizon_lsn(state: &StateSnapshot, extra_horizon_lsn: Option) -> Lsn { + use std::cmp::min; -use crate::{GlobalTimelines, SafeKeeperConf}; - -pub async fn task_main(_conf: SafeKeeperConf) -> anyhow::Result<()> { - let wal_removal_interval = Duration::from_millis(5000); - loop { - let now = tokio::time::Instant::now(); - let tlis = GlobalTimelines::get_all(); - for tli in &tlis { - let ttid = tli.ttid; - async { - if let Err(e) = tli.maybe_persist_control_file(false).await { - warn!("failed to persist control file: {e}"); - } - if let Err(e) = tli.remove_old_wal().await { - error!("failed to remove WAL: {}", e); - } - } - .instrument(info_span!("WAL removal", ttid = %ttid)) - .await; - } - - let elapsed = now.elapsed(); - let total_timelines = tlis.len(); - - if elapsed > wal_removal_interval { - info!( - "WAL removal is too long, processed {} timelines in {:?}", - total_timelines, elapsed - ); - } - - sleep(wal_removal_interval).await; + let mut horizon_lsn = min( + state.cfile_remote_consistent_lsn, + state.cfile_peer_horizon_lsn, + ); + // we don't want to remove WAL that is not yet offloaded to s3 + horizon_lsn = min(horizon_lsn, state.cfile_backup_lsn); + if let Some(extra_horizon_lsn) = extra_horizon_lsn { + horizon_lsn = min(horizon_lsn, extra_horizon_lsn); } + + horizon_lsn } diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 4686c9aa8e..563dbbe315 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -10,7 +10,6 @@ use std::cmp::max; use std::cmp::min; use std::fmt; use std::io::Read; -use std::time::Duration; use storage_broker::proto::SafekeeperTimelineInfo; use tracing::*; @@ -828,24 +827,6 @@ where Ok(()) } - /// Persist control file if there is something to save and enough time - /// passed after the last save. - pub async fn maybe_persist_inmem_control_file(&mut self, force: bool) -> Result { - const CF_SAVE_INTERVAL: Duration = Duration::from_secs(300); - if !force && self.state.pers.last_persist_at().elapsed() < CF_SAVE_INTERVAL { - return Ok(false); - } - let need_persist = self.state.inmem.commit_lsn > self.state.commit_lsn - || self.state.inmem.backup_lsn > self.state.backup_lsn - || self.state.inmem.peer_horizon_lsn > self.state.peer_horizon_lsn - || self.state.inmem.remote_consistent_lsn > self.state.remote_consistent_lsn; - if need_persist { - self.state.flush().await?; - trace!("saved control file: {CF_SAVE_INTERVAL:?} passed"); - } - Ok(need_persist) - } - /// Handle request to append WAL. 
#[allow(clippy::comparison_chain)] async fn handle_append_request( diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 5a9745e1c9..df75893838 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -5,7 +5,7 @@ use crate::handler::SafekeeperPostgresHandler; use crate::metrics::RECEIVED_PS_FEEDBACKS; use crate::receive_wal::WalReceivers; use crate::safekeeper::{Term, TermLsn}; -use crate::timeline::Timeline; +use crate::timeline::FullAccessTimeline; use crate::wal_service::ConnectionId; use crate::wal_storage::WalReader; use crate::GlobalTimelines; @@ -387,8 +387,10 @@ impl SafekeeperPostgresHandler { term: Option, ) -> Result<(), QueryError> { let tli = GlobalTimelines::get(self.ttid).map_err(|e| QueryError::Other(e.into()))?; + let full_access = tli.full_access_guard().await?; + if let Err(end) = self - .handle_start_replication_guts(pgb, start_pos, term, tli.clone()) + .handle_start_replication_guts(pgb, start_pos, term, full_access) .await { let info = tli.get_safekeeper_info(&self.conf).await; @@ -405,7 +407,7 @@ impl SafekeeperPostgresHandler { pgb: &mut PostgresBackend, start_pos: Lsn, term: Option, - tli: Arc, + tli: FullAccessTimeline, ) -> Result<(), CopyStreamHandlerEnd> { let appname = self.appname.clone(); @@ -448,14 +450,7 @@ impl SafekeeperPostgresHandler { // switch to copy pgb.write_message(&BeMessage::CopyBothResponse).await?; - let (_, persisted_state) = tli.get_state().await; - let wal_reader = WalReader::new( - self.conf.workdir.clone(), - self.conf.timeline_dir(&tli.ttid), - &persisted_state, - start_pos, - self.conf.is_wal_backup_enabled(), - )?; + let wal_reader = tli.get_walreader(start_pos).await?; // Split to concurrently receive and send data; replies are generally // not synchronized with sends, so this avoids deadlocks. @@ -532,7 +527,7 @@ impl EndWatch { /// A half driving sending WAL. struct WalSender<'a, IO> { pgb: &'a mut PostgresBackend, - tli: Arc, + tli: FullAccessTimeline, appname: Option, // Position since which we are sending next chunk. 
start_pos: Lsn, @@ -741,7 +736,7 @@ impl WalSender<'_, IO> { struct ReplyReader { reader: PostgresBackendReader, ws_guard: Arc, - tli: Arc, + tli: FullAccessTimeline, } impl ReplyReader { diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index aa9ccfc21e..148a7e90bd 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -3,14 +3,14 @@ use anyhow::{anyhow, bail, Result}; use camino::Utf8PathBuf; -use postgres_ffi::XLogSegNo; use serde::{Deserialize, Serialize}; use tokio::fs; use tokio_util::sync::CancellationToken; +use utils::id::TenantId; use std::cmp::max; use std::ops::{Deref, DerefMut}; -use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; use std::sync::Arc; use std::time::Duration; use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; @@ -26,7 +26,6 @@ use storage_broker::proto::SafekeeperTimelineInfo; use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; use crate::receive_wal::WalReceivers; -use crate::recovery::{recovery_main, Donor, RecoveryNeededInfo}; use crate::safekeeper::{ AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, ServerInfo, Term, TermLsn, INVALID_TERM, @@ -38,8 +37,8 @@ use crate::wal_backup::{self}; use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION}; use crate::metrics::FullTimelineInfo; -use crate::wal_storage::Storage as wal_storage_iface; -use crate::{debug_dump, timeline_manager, wal_backup_partial, wal_storage}; +use crate::wal_storage::{Storage as wal_storage_iface, WalReader}; +use crate::{debug_dump, timeline_manager, wal_storage}; use crate::{GlobalTimelines, SafeKeeperConf}; /// Things safekeeper should know about timeline state on peers. @@ -169,7 +168,6 @@ pub struct SharedState { pub(crate) sk: SafeKeeper, /// In memory list containing state of peers sent in latest messages from them. pub(crate) peers_info: PeersInfo, - pub(crate) last_removed_segno: XLogSegNo, } impl SharedState { @@ -197,33 +195,33 @@ impl SharedState { // We don't want to write anything to disk, because we may have existing timeline there. // These functions should not change anything on disk. - let timeline_dir = conf.timeline_dir(ttid); - let control_store = control_file::FileStorage::create_new(timeline_dir, conf, state)?; + let timeline_dir = get_timeline_dir(conf, ttid); + let control_store = + control_file::FileStorage::create_new(timeline_dir.clone(), conf, state)?; let wal_store = - wal_storage::PhysicalStorage::new(ttid, conf.timeline_dir(ttid), conf, &control_store)?; + wal_storage::PhysicalStorage::new(ttid, timeline_dir, conf, &control_store)?; let sk = SafeKeeper::new(control_store, wal_store, conf.my_id)?; Ok(Self { sk, peers_info: PeersInfo(vec![]), - last_removed_segno: 0, }) } /// Restore SharedState from control file. If file doesn't exist, bails out. 
fn restore(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Result { + let timeline_dir = get_timeline_dir(conf, ttid); let control_store = control_file::FileStorage::restore_new(ttid, conf)?; if control_store.server.wal_seg_size == 0 { bail!(TimelineError::UninitializedWalSegSize(*ttid)); } let wal_store = - wal_storage::PhysicalStorage::new(ttid, conf.timeline_dir(ttid), conf, &control_store)?; + wal_storage::PhysicalStorage::new(ttid, timeline_dir, conf, &control_store)?; Ok(Self { sk: SafeKeeper::new(control_store, wal_store, conf.my_id)?, peers_info: PeersInfo(vec![]), - last_removed_segno: 0, }) } @@ -275,24 +273,6 @@ impl SharedState { .cloned() .collect() } - - /// Get oldest segno we still need to keep. We hold WAL till it is consumed - /// by all of 1) pageserver (remote_consistent_lsn) 2) peers 3) s3 - /// offloading. - /// While it is safe to use inmem values for determining horizon, - /// we use persistent to make possible normal states less surprising. - fn get_horizon_segno(&self, extra_horizon_lsn: Option) -> XLogSegNo { - let state = &self.sk.state; - - use std::cmp::min; - let mut horizon_lsn = min(state.remote_consistent_lsn, state.peer_horizon_lsn); - // we don't want to remove WAL that is not yet offloaded to s3 - horizon_lsn = min(horizon_lsn, state.backup_lsn); - if let Some(extra_horizon_lsn) = extra_horizon_lsn { - horizon_lsn = min(horizon_lsn, extra_horizon_lsn); - } - horizon_lsn.segment_number(state.server.wal_seg_size as usize) - } } #[derive(Debug, thiserror::Error)] @@ -349,22 +329,15 @@ pub struct Timeline { mutex: RwLock, walsenders: Arc, walreceivers: Arc, + timeline_dir: Utf8PathBuf, /// Delete/cancel will trigger this, background tasks should drop out as soon as it fires pub(crate) cancel: CancellationToken, - /// Directory where timeline state is stored. - pub timeline_dir: Utf8PathBuf, - - /// Should we keep WAL on disk for active replication connections. - /// Especially useful for sharding, when different shards process WAL - /// with different speed. - // TODO: add `Arc` here instead of adding each field separately. - walsenders_keep_horizon: bool, - // timeline_manager controlled state pub(crate) broker_active: AtomicBool, pub(crate) wal_backup_active: AtomicBool, + pub(crate) last_removed_segno: AtomicU64, } impl Timeline { @@ -394,10 +367,10 @@ impl Timeline { walsenders: WalSenders::new(walreceivers.clone()), walreceivers, cancel: CancellationToken::default(), - timeline_dir: conf.timeline_dir(&ttid), - walsenders_keep_horizon: conf.walsenders_keep_horizon, + timeline_dir: get_timeline_dir(conf, &ttid), broker_active: AtomicBool::new(false), wal_backup_active: AtomicBool::new(false), + last_removed_segno: AtomicU64::new(0), }) } @@ -430,10 +403,10 @@ impl Timeline { walsenders: WalSenders::new(walreceivers.clone()), walreceivers, cancel: CancellationToken::default(), - timeline_dir: conf.timeline_dir(&ttid), - walsenders_keep_horizon: conf.walsenders_keep_horizon, + timeline_dir: get_timeline_dir(conf, &ttid), broker_active: AtomicBool::new(false), wal_backup_active: AtomicBool::new(false), + last_removed_segno: AtomicU64::new(0), }) } @@ -494,15 +467,6 @@ impl Timeline { conf.clone(), broker_active_set, )); - - // Start recovery task which always runs on the timeline. 
- if conf.peer_recovery_enabled { - tokio::spawn(recovery_main(self.clone(), conf.clone())); - } - // TODO: migrate to timeline_manager - if conf.is_wal_backup_enabled() && conf.partial_backup_enabled { - tokio::spawn(wal_backup_partial::main_task(self.clone(), conf.clone())); - } } /// Delete timeline from disk completely, by removing timeline directory. @@ -555,36 +519,6 @@ impl Timeline { self.mutex.read().await } - /// Returns true if walsender should stop sending WAL to pageserver. We - /// terminate it if remote_consistent_lsn reached commit_lsn and there is no - /// computes. While there might be nothing to stream already, we learn about - /// remote_consistent_lsn update through replication feedback, and we want - /// to stop pushing to the broker if pageserver is fully caughtup. - pub async fn should_walsender_stop(&self, reported_remote_consistent_lsn: Lsn) -> bool { - if self.is_cancelled() { - return true; - } - let shared_state = self.read_shared_state().await; - if self.walreceivers.get_num() == 0 { - return shared_state.sk.state.inmem.commit_lsn == Lsn(0) || // no data at all yet - reported_remote_consistent_lsn >= shared_state.sk.state.inmem.commit_lsn; - } - false - } - - /// Ensure that current term is t, erroring otherwise, and lock the state. - pub async fn acquire_term(&self, t: Term) -> Result { - let ss = self.read_shared_state().await; - if ss.sk.state.acceptor_state.term != t { - bail!( - "failed to acquire term {}, current term {}", - t, - ss.sk.state.acceptor_state.term - ); - } - Ok(ss) - } - /// Returns commit_lsn watch channel. pub fn get_commit_lsn_watch_rx(&self) -> watch::Receiver { self.commit_lsn_watch_rx.clone() @@ -600,28 +534,6 @@ impl Timeline { self.shared_state_version_rx.clone() } - /// Pass arrived message to the safekeeper. - pub async fn process_msg( - self: &Arc, - msg: &ProposerAcceptorMessage, - ) -> Result> { - if self.is_cancelled() { - bail!(TimelineError::Cancelled(self.ttid)); - } - - let mut rmsg: Option; - { - let mut shared_state = self.write_shared_state().await; - rmsg = shared_state.sk.process_msg(msg).await?; - - // if this is AppendResponse, fill in proper hot standby feedback. - if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg { - resp.hs_feedback = self.walsenders.get_hotstandby().hs_feedback; - } - } - Ok(rmsg) - } - /// Returns wal_seg_size. pub async fn get_wal_seg_size(&self) -> usize { self.read_shared_state().await.get_wal_seg_size() @@ -672,97 +584,11 @@ impl Timeline { Ok(()) } - /// Update in memory remote consistent lsn. - pub async fn update_remote_consistent_lsn(self: &Arc, candidate: Lsn) { - let mut shared_state = self.write_shared_state().await; - shared_state.sk.state.inmem.remote_consistent_lsn = - max(shared_state.sk.state.inmem.remote_consistent_lsn, candidate); - } - pub async fn get_peers(&self, conf: &SafeKeeperConf) -> Vec { let shared_state = self.read_shared_state().await; shared_state.get_peers(conf.heartbeat_timeout) } - /// Should we start fetching WAL from a peer safekeeper, and if yes, from - /// which? Answer is yes, i.e. .donors is not empty if 1) there is something - /// to fetch, and we can do that without running elections; 2) there is no - /// actively streaming compute, as we don't want to compete with it. - /// - /// If donor(s) are choosen, theirs last_log_term is guaranteed to be equal - /// to its last_log_term so we are sure such a leader ever had been elected. 
- /// - /// All possible donors are returned so that we could keep connection to the - /// current one if it is good even if it slightly lags behind. - /// - /// Note that term conditions above might be not met, but safekeepers are - /// still not aligned on last flush_lsn. Generally in this case until - /// elections are run it is not possible to say which safekeeper should - /// recover from which one -- history which would be committed is different - /// depending on assembled quorum (e.g. classic picture 8 from Raft paper). - /// Thus we don't try to predict it here. - pub async fn recovery_needed(&self, heartbeat_timeout: Duration) -> RecoveryNeededInfo { - let ss = self.read_shared_state().await; - let term = ss.sk.state.acceptor_state.term; - let last_log_term = ss.sk.get_last_log_term(); - let flush_lsn = ss.sk.flush_lsn(); - // note that peers contain myself, but that's ok -- we are interested only in peers which are strictly ahead of us. - let mut peers = ss.get_peers(heartbeat_timeout); - // Sort by pairs. - peers.sort_by(|p1, p2| { - let tl1 = TermLsn { - term: p1.last_log_term, - lsn: p1.flush_lsn, - }; - let tl2 = TermLsn { - term: p2.last_log_term, - lsn: p2.flush_lsn, - }; - tl2.cmp(&tl1) // desc - }); - let num_streaming_computes = self.walreceivers.get_num_streaming(); - let donors = if num_streaming_computes > 0 { - vec![] // If there is a streaming compute, don't try to recover to not intervene. - } else { - peers - .iter() - .filter_map(|candidate| { - // Are we interested in this candidate? - let candidate_tl = TermLsn { - term: candidate.last_log_term, - lsn: candidate.flush_lsn, - }; - let my_tl = TermLsn { - term: last_log_term, - lsn: flush_lsn, - }; - if my_tl < candidate_tl { - // Yes, we are interested. Can we pull from it without - // (re)running elections? It is possible if 1) his term - // is equal to his last_log_term so we could act on - // behalf of leader of this term (we must be sure he was - // ever elected) and 2) our term is not higher, or we'll refuse data. - if candidate.term == candidate.last_log_term && candidate.term >= term { - Some(Donor::from(candidate)) - } else { - None - } - } else { - None - } - }) - .collect() - }; - RecoveryNeededInfo { - term, - last_log_term, - flush_lsn, - peers, - num_streaming_computes, - donors, - } - } - pub fn get_walsenders(&self) -> &Arc { &self.walsenders } @@ -776,58 +602,6 @@ impl Timeline { self.read_shared_state().await.sk.wal_store.flush_lsn() } - /// Delete WAL segments from disk that are no longer needed. This is determined - /// based on pageserver's remote_consistent_lsn and local backup_lsn/peer_lsn. - pub async fn remove_old_wal(self: &Arc) -> Result<()> { - if self.is_cancelled() { - bail!(TimelineError::Cancelled(self.ttid)); - } - - // If enabled, we use LSN of the most lagging walsender as a WAL removal horizon. - // This allows to get better read speed for pageservers that are lagging behind, - // at the cost of keeping more WAL on disk. 
- let replication_horizon_lsn = if self.walsenders_keep_horizon { - self.walsenders.laggard_lsn() - } else { - None - }; - - let horizon_segno: XLogSegNo; - let remover = { - let shared_state = self.read_shared_state().await; - horizon_segno = shared_state.get_horizon_segno(replication_horizon_lsn); - if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno { - return Ok(()); // nothing to do - } - - // release the lock before removing - shared_state.sk.wal_store.remove_up_to(horizon_segno - 1) - }; - - // delete old WAL files - remover.await?; - - // update last_removed_segno - let mut shared_state = self.write_shared_state().await; - if shared_state.last_removed_segno != horizon_segno { - shared_state.last_removed_segno = horizon_segno; - } else { - shared_state.skip_update = true; - } - Ok(()) - } - - /// Persist control file if there is something to save and enough time - /// passed after the last save. This helps to keep remote_consistent_lsn up - /// to date so that storage nodes restart doesn't cause many pageserver -> - /// safekeeper reconnections. - pub async fn maybe_persist_control_file(self: &Arc, force: bool) -> Result<()> { - let mut guard = self.write_shared_state().await; - let changed = guard.sk.maybe_persist_inmem_control_file(force).await?; - guard.skip_update = !changed; - Ok(()) - } - /// Gather timeline data for metrics. pub async fn info_for_metrics(&self) -> Option { if self.is_cancelled() { @@ -843,7 +617,7 @@ impl Timeline { wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed), timeline_is_active: self.broker_active.load(Ordering::Relaxed), num_computes: self.walreceivers.get_num() as u32, - last_removed_segno: state.last_removed_segno, + last_removed_segno: self.last_removed_segno.load(Ordering::Relaxed), epoch_start_lsn: state.sk.term_start_lsn, mem_state: state.sk.state.inmem.clone(), persisted_state: state.sk.state.clone(), @@ -866,7 +640,7 @@ impl Timeline { wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed), active: self.broker_active.load(Ordering::Relaxed), num_computes: self.walreceivers.get_num() as u32, - last_removed_segno: state.last_removed_segno, + last_removed_segno: self.last_removed_segno.load(Ordering::Relaxed), epoch_start_lsn: state.sk.term_start_lsn, mem_state: state.sk.state.inmem.clone(), write_lsn, @@ -889,6 +663,110 @@ impl Timeline { state.sk.state.finish_change(&persistent_state).await?; Ok(res) } + + /// Get the timeline guard for reading/writing WAL files. + /// TODO: if WAL files are not present on disk (evicted), they will be + /// downloaded from S3. Also there will logic for preventing eviction + /// while someone is holding FullAccessTimeline guard. + pub async fn full_access_guard(self: &Arc) -> Result { + if self.is_cancelled() { + bail!(TimelineError::Cancelled(self.ttid)); + } + Ok(FullAccessTimeline { tli: self.clone() }) + } +} + +/// This is a guard that allows to read/write disk timeline state. +/// All tasks that are using the disk should use this guard. +#[derive(Clone)] +pub struct FullAccessTimeline { + pub tli: Arc, +} + +impl Deref for FullAccessTimeline { + type Target = Arc; + + fn deref(&self) -> &Self::Target { + &self.tli + } +} + +impl FullAccessTimeline { + /// Returns true if walsender should stop sending WAL to pageserver. We + /// terminate it if remote_consistent_lsn reached commit_lsn and there is no + /// computes. 
While there might be nothing to stream already, we learn about + /// remote_consistent_lsn update through replication feedback, and we want + /// to stop pushing to the broker if pageserver is fully caughtup. + pub async fn should_walsender_stop(&self, reported_remote_consistent_lsn: Lsn) -> bool { + if self.is_cancelled() { + return true; + } + let shared_state = self.read_shared_state().await; + if self.walreceivers.get_num() == 0 { + return shared_state.sk.state.inmem.commit_lsn == Lsn(0) || // no data at all yet + reported_remote_consistent_lsn >= shared_state.sk.state.inmem.commit_lsn; + } + false + } + + /// Ensure that current term is t, erroring otherwise, and lock the state. + pub async fn acquire_term(&self, t: Term) -> Result { + let ss = self.read_shared_state().await; + if ss.sk.state.acceptor_state.term != t { + bail!( + "failed to acquire term {}, current term {}", + t, + ss.sk.state.acceptor_state.term + ); + } + Ok(ss) + } + + /// Pass arrived message to the safekeeper. + pub async fn process_msg( + &self, + msg: &ProposerAcceptorMessage, + ) -> Result> { + if self.is_cancelled() { + bail!(TimelineError::Cancelled(self.ttid)); + } + + let mut rmsg: Option; + { + let mut shared_state = self.write_shared_state().await; + rmsg = shared_state.sk.process_msg(msg).await?; + + // if this is AppendResponse, fill in proper hot standby feedback. + if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg { + resp.hs_feedback = self.walsenders.get_hotstandby().hs_feedback; + } + } + Ok(rmsg) + } + + pub async fn get_walreader(&self, start_lsn: Lsn) -> Result { + let (_, persisted_state) = self.get_state().await; + let enable_remote_read = GlobalTimelines::get_global_config().is_wal_backup_enabled(); + + WalReader::new( + &self.ttid, + self.timeline_dir.clone(), + &persisted_state, + start_lsn, + enable_remote_read, + ) + } + + pub fn get_timeline_dir(&self) -> Utf8PathBuf { + self.timeline_dir.clone() + } + + /// Update in memory remote consistent lsn. + pub async fn update_remote_consistent_lsn(&self, candidate: Lsn) { + let mut shared_state = self.write_shared_state().await; + shared_state.sk.state.inmem.remote_consistent_lsn = + max(shared_state.sk.state.inmem.remote_consistent_lsn, candidate); + } } /// Deletes directory and it's contents. Returns false if directory does not exist. @@ -899,3 +777,16 @@ async fn delete_dir(path: &Utf8PathBuf) -> Result { Err(e) => Err(e.into()), } } + +/// Get a path to the tenant directory. If you just need to get a timeline directory, +/// use FullAccessTimeline::get_timeline_dir instead. +pub(crate) fn get_tenant_dir(conf: &SafeKeeperConf, tenant_id: &TenantId) -> Utf8PathBuf { + conf.workdir.join(tenant_id.to_string()) +} + +/// Get a path to the timeline directory. If you need to read WAL files from disk, +/// use FullAccessTimeline::get_timeline_dir instead. This function does not check +/// timeline eviction status and WAL files might not be present on disk. +pub(crate) fn get_timeline_dir(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Utf8PathBuf { + get_tenant_dir(conf, &ttid.tenant_id).join(ttid.timeline_id.to_string()) +} diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index ed544352f9..84862207d5 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -3,23 +3,42 @@ //! It watches for changes in the timeline state and decides when to spawn or kill background tasks. //! 
It also can manage some reactive state, like should the timeline be active for broker pushes or not. -use std::{sync::Arc, time::Duration}; +use std::{ + sync::Arc, + time::{Duration, Instant}, +}; +use postgres_ffi::XLogSegNo; +use tokio::task::{JoinError, JoinHandle}; use tracing::{info, instrument, warn}; use utils::lsn::Lsn; use crate::{ + control_file::Storage, metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL}, + recovery::recovery_main, + remove_wal::calc_horizon_lsn, + send_wal::WalSenders, timeline::{PeerInfo, ReadGuardSharedState, Timeline}, - timelines_set::TimelinesSet, + timelines_set::{TimelineSetGuard, TimelinesSet}, wal_backup::{self, WalBackupTaskHandle}, - SafeKeeperConf, + wal_backup_partial, SafeKeeperConf, }; pub struct StateSnapshot { + // inmem values pub commit_lsn: Lsn, pub backup_lsn: Lsn, pub remote_consistent_lsn: Lsn, + + // persistent control file values + pub cfile_peer_horizon_lsn: Lsn, + pub cfile_remote_consistent_lsn: Lsn, + pub cfile_backup_lsn: Lsn, + + // misc + pub cfile_last_persist_at: Instant, + pub inmem_flush_pending: bool, pub peers: Vec, } @@ -30,17 +49,34 @@ impl StateSnapshot { commit_lsn: read_guard.sk.state.inmem.commit_lsn, backup_lsn: read_guard.sk.state.inmem.backup_lsn, remote_consistent_lsn: read_guard.sk.state.inmem.remote_consistent_lsn, + cfile_peer_horizon_lsn: read_guard.sk.state.peer_horizon_lsn, + cfile_remote_consistent_lsn: read_guard.sk.state.remote_consistent_lsn, + cfile_backup_lsn: read_guard.sk.state.backup_lsn, + cfile_last_persist_at: read_guard.sk.state.pers.last_persist_at(), + inmem_flush_pending: Self::has_unflushed_inmem_state(&read_guard), peers: read_guard.get_peers(heartbeat_timeout), } } + + fn has_unflushed_inmem_state(read_guard: &ReadGuardSharedState) -> bool { + let state = &read_guard.sk.state; + state.inmem.commit_lsn > state.commit_lsn + || state.inmem.backup_lsn > state.backup_lsn + || state.inmem.peer_horizon_lsn > state.peer_horizon_lsn + || state.inmem.remote_consistent_lsn > state.remote_consistent_lsn + } } /// Control how often the manager task should wake up to check updates. /// There is no need to check for updates more often than this. const REFRESH_INTERVAL: Duration = Duration::from_millis(300); +/// How often to save the control file if the is no other activity. +const CF_SAVE_INTERVAL: Duration = Duration::from_secs(300); + /// This task gets spawned alongside each timeline and is responsible for managing the timeline's /// background tasks. +/// Be careful, this task is not respawned on panic, so it should not panic. 
#[instrument(name = "manager", skip_all, fields(ttid = %tli.ttid))] pub async fn main_task( tli: Arc, @@ -55,20 +91,50 @@ pub async fn main_task( } }; - // sets whether timeline is active for broker pushes or not - let mut tli_broker_active = broker_active_set.guard(tli.clone()); - - let ttid = tli.ttid; + // configuration & dependencies let wal_seg_size = tli.get_wal_seg_size().await; let heartbeat_timeout = conf.heartbeat_timeout; - - let mut state_version_rx = tli.get_state_version_rx(); - + let walsenders = tli.get_walsenders(); let walreceivers = tli.get_walreceivers(); + + // current state + let mut state_version_rx = tli.get_state_version_rx(); let mut num_computes_rx = walreceivers.get_num_rx(); + let mut tli_broker_active = broker_active_set.guard(tli.clone()); + let mut last_removed_segno = 0 as XLogSegNo; // list of background tasks let mut backup_task: Option = None; + let mut recovery_task: Option> = None; + let mut partial_backup_task: Option> = None; + let mut wal_removal_task: Option>> = None; + + // Start recovery task which always runs on the timeline. + if conf.peer_recovery_enabled { + match tli.full_access_guard().await { + Ok(tli) => { + recovery_task = Some(tokio::spawn(recovery_main(tli, conf.clone()))); + } + Err(e) => { + warn!("failed to start recovery task: {:?}", e); + } + } + } + + // Start partial backup task which always runs on the timeline. + if conf.is_wal_backup_enabled() && conf.partial_backup_enabled { + match tli.full_access_guard().await { + Ok(tli) => { + partial_backup_task = Some(tokio::spawn(wal_backup_partial::main_task( + tli, + conf.clone(), + ))); + } + Err(e) => { + warn!("failed to start partial backup task: {:?}", e); + } + } + } let last_state = 'outer: loop { MANAGER_ITERATIONS_TOTAL.inc(); @@ -76,47 +142,36 @@ pub async fn main_task( let state_snapshot = StateSnapshot::new(tli.read_shared_state().await, heartbeat_timeout); let num_computes = *num_computes_rx.borrow(); - let is_wal_backup_required = - wal_backup::is_wal_backup_required(wal_seg_size, num_computes, &state_snapshot); + let is_wal_backup_required = update_backup( + &conf, + &tli, + wal_seg_size, + num_computes, + &state_snapshot, + &mut backup_task, + ) + .await; - if conf.is_wal_backup_enabled() { - wal_backup::update_task( - &conf, - ttid, - is_wal_backup_required, - &state_snapshot, - &mut backup_task, - ) - .await; - } + let _is_active = update_is_active( + is_wal_backup_required, + num_computes, + &state_snapshot, + &mut tli_broker_active, + &tli, + ); - let is_active = is_wal_backup_required - || num_computes > 0 - || state_snapshot.remote_consistent_lsn < state_snapshot.commit_lsn; + let next_cfile_save = update_control_file_save(&state_snapshot, &tli).await; - // update the broker timeline set - if tli_broker_active.set(is_active) { - // write log if state has changed - info!( - "timeline active={} now, remote_consistent_lsn={}, commit_lsn={}", - is_active, state_snapshot.remote_consistent_lsn, state_snapshot.commit_lsn, - ); - - MANAGER_ACTIVE_CHANGES.inc(); - - if !is_active { - // TODO: maybe use tokio::spawn? 
- if let Err(e) = tli.maybe_persist_control_file(false).await { - warn!("control file save in update_status failed: {:?}", e); - } - } - } - - // update the state in Arc - tli.wal_backup_active - .store(backup_task.is_some(), std::sync::atomic::Ordering::Relaxed); - tli.broker_active - .store(is_active, std::sync::atomic::Ordering::Relaxed); + update_wal_removal( + &conf, + walsenders, + &tli, + wal_seg_size, + &state_snapshot, + last_removed_segno, + &mut wal_removal_task, + ) + .await; // wait until something changes. tx channels are stored under Arc, so they will not be // dropped until the manager task is finished. @@ -135,11 +190,189 @@ pub async fn main_task( _ = num_computes_rx.changed() => { // number of connected computes was updated } + _ = async { + if let Some(timeout) = next_cfile_save { + tokio::time::sleep_until(timeout).await + } else { + futures::future::pending().await + } + } => { + // it's time to save the control file + } + res = async { + if let Some(task) = &mut wal_removal_task { + task.await + } else { + futures::future::pending().await + } + } => { + // WAL removal task finished + wal_removal_task = None; + update_wal_removal_end(res, &tli, &mut last_removed_segno); + } } }; // shutdown background tasks if conf.is_wal_backup_enabled() { - wal_backup::update_task(&conf, ttid, false, &last_state, &mut backup_task).await; + wal_backup::update_task(&conf, &tli, false, &last_state, &mut backup_task).await; + } + + if let Some(recovery_task) = recovery_task { + if let Err(e) = recovery_task.await { + warn!("recovery task failed: {:?}", e); + } + } + + if let Some(partial_backup_task) = partial_backup_task { + if let Err(e) = partial_backup_task.await { + warn!("partial backup task failed: {:?}", e); + } + } + + if let Some(wal_removal_task) = wal_removal_task { + let res = wal_removal_task.await; + update_wal_removal_end(res, &tli, &mut last_removed_segno); } } + +/// Spawns/kills backup task and returns true if backup is required. +async fn update_backup( + conf: &SafeKeeperConf, + tli: &Arc, + wal_seg_size: usize, + num_computes: usize, + state: &StateSnapshot, + backup_task: &mut Option, +) -> bool { + let is_wal_backup_required = + wal_backup::is_wal_backup_required(wal_seg_size, num_computes, state); + + if conf.is_wal_backup_enabled() { + wal_backup::update_task(conf, tli, is_wal_backup_required, state, backup_task).await; + } + + // update the state in Arc + tli.wal_backup_active + .store(backup_task.is_some(), std::sync::atomic::Ordering::Relaxed); + is_wal_backup_required +} + +/// Update is_active flag and returns its value. +fn update_is_active( + is_wal_backup_required: bool, + num_computes: usize, + state: &StateSnapshot, + tli_broker_active: &mut TimelineSetGuard, + tli: &Arc, +) -> bool { + let is_active = is_wal_backup_required + || num_computes > 0 + || state.remote_consistent_lsn < state.commit_lsn; + + // update the broker timeline set + if tli_broker_active.set(is_active) { + // write log if state has changed + info!( + "timeline active={} now, remote_consistent_lsn={}, commit_lsn={}", + is_active, state.remote_consistent_lsn, state.commit_lsn, + ); + + MANAGER_ACTIVE_CHANGES.inc(); + } + + // update the state in Arc + tli.broker_active + .store(is_active, std::sync::atomic::Ordering::Relaxed); + is_active +} + +/// Save control file if needed. Returns Instant if we should persist the control file in the future. 
+async fn update_control_file_save( + state: &StateSnapshot, + tli: &Arc, +) -> Option { + if !state.inmem_flush_pending { + return None; + } + + if state.cfile_last_persist_at.elapsed() > CF_SAVE_INTERVAL { + let mut write_guard = tli.write_shared_state().await; + // this can be done in the background because it blocks manager task, but flush() should + // be fast enough not to be a problem now + if let Err(e) = write_guard.sk.state.flush().await { + warn!("failed to save control file: {:?}", e); + } + + None + } else { + // we should wait until next CF_SAVE_INTERVAL + Some((state.cfile_last_persist_at + CF_SAVE_INTERVAL).into()) + } +} + +/// Spawns WAL removal task if needed. +async fn update_wal_removal( + conf: &SafeKeeperConf, + walsenders: &Arc, + tli: &Arc, + wal_seg_size: usize, + state: &StateSnapshot, + last_removed_segno: u64, + wal_removal_task: &mut Option>>, +) { + if wal_removal_task.is_some() { + // WAL removal is already in progress + return; + } + + // If enabled, we use LSN of the most lagging walsender as a WAL removal horizon. + // This allows to get better read speed for pageservers that are lagging behind, + // at the cost of keeping more WAL on disk. + let replication_horizon_lsn = if conf.walsenders_keep_horizon { + walsenders.laggard_lsn() + } else { + None + }; + + let removal_horizon_lsn = calc_horizon_lsn(state, replication_horizon_lsn); + let removal_horizon_segno = removal_horizon_lsn + .segment_number(wal_seg_size) + .saturating_sub(1); + + if removal_horizon_segno > last_removed_segno { + // we need to remove WAL + let remover = crate::wal_storage::Storage::remove_up_to( + &tli.read_shared_state().await.sk.wal_store, + removal_horizon_segno, + ); + *wal_removal_task = Some(tokio::spawn(async move { + remover.await?; + Ok(removal_horizon_segno) + })); + } +} + +/// Update the state after WAL removal task finished. +fn update_wal_removal_end( + res: Result, JoinError>, + tli: &Arc, + last_removed_segno: &mut u64, +) { + let new_last_removed_segno = match res { + Ok(Ok(segno)) => segno, + Err(e) => { + warn!("WAL removal task failed: {:?}", e); + return; + } + Ok(Err(e)) => { + warn!("WAL removal task failed: {:?}", e); + return; + } + }; + + *last_removed_segno = new_last_removed_segno; + // update the state in Arc + tli.last_removed_segno + .store(new_last_removed_segno, std::sync::atomic::Ordering::Relaxed); +} diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index 8d37bd6371..45e08ede3c 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -3,7 +3,7 @@ //! all from the disk on startup and keeping them in memory. use crate::safekeeper::ServerInfo; -use crate::timeline::{Timeline, TimelineError}; +use crate::timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError}; use crate::timelines_set::TimelinesSet; use crate::SafeKeeperConf; use anyhow::{bail, Context, Result}; @@ -127,7 +127,7 @@ impl GlobalTimelines { state.get_dependencies() }; - let timelines_dir = conf.tenant_dir(&tenant_id); + let timelines_dir = get_tenant_dir(&conf, &tenant_id); for timelines_dir_entry in std::fs::read_dir(&timelines_dir) .with_context(|| format!("failed to list timelines dir {}", timelines_dir))? { @@ -348,11 +348,7 @@ impl GlobalTimelines { } Err(_) => { // Timeline is not memory, but it may still exist on disk in broken state. 
- let dir_path = TIMELINES_STATE - .lock() - .unwrap() - .get_conf() - .timeline_dir(ttid); + let dir_path = get_timeline_dir(TIMELINES_STATE.lock().unwrap().get_conf(), ttid); let dir_existed = delete_dir(dir_path)?; Ok(TimelineDeleteForceResult { @@ -401,13 +397,10 @@ impl GlobalTimelines { // Note that we could concurrently create new timelines while we were deleting them, // so the directory may be not empty. In this case timelines will have bad state // and timeline background jobs can panic. - delete_dir( - TIMELINES_STATE - .lock() - .unwrap() - .get_conf() - .tenant_dir(tenant_id), - )?; + delete_dir(get_tenant_dir( + TIMELINES_STATE.lock().unwrap().get_conf(), + tenant_id, + ))?; // FIXME: we temporarily disabled removing timelines from the map, see `delete_force` // let tlis_after_delete = Self::get_all_for_tenant(*tenant_id); diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 84680557f9..58591aecfa 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -30,9 +30,9 @@ use tracing::*; use utils::{id::TenantTimelineId, lsn::Lsn}; use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS, WAL_BACKUP_TASKS}; -use crate::timeline::{PeerInfo, Timeline}; +use crate::timeline::{FullAccessTimeline, PeerInfo, Timeline}; use crate::timeline_manager::StateSnapshot; -use crate::{GlobalTimelines, SafeKeeperConf, WAL_BACKUP_RUNTIME}; +use crate::{SafeKeeperConf, WAL_BACKUP_RUNTIME}; use once_cell::sync::OnceCell; @@ -63,13 +63,13 @@ pub fn is_wal_backup_required( /// is running, kill it. pub async fn update_task( conf: &SafeKeeperConf, - ttid: TenantTimelineId, + tli: &Arc, need_backup: bool, state: &StateSnapshot, entry: &mut Option, ) { let (offloader, election_dbg_str) = - determine_offloader(&state.peers, state.backup_lsn, ttid, conf); + determine_offloader(&state.peers, state.backup_lsn, tli.ttid, conf); let elected_me = Some(conf.my_id) == offloader; let should_task_run = need_backup && elected_me; @@ -80,15 +80,8 @@ pub async fn update_task( info!("elected for backup: {}", election_dbg_str); let (shutdown_tx, shutdown_rx) = mpsc::channel(1); - let timeline_dir = conf.timeline_dir(&ttid); - let async_task = backup_task_main( - ttid, - timeline_dir, - conf.workdir.clone(), - conf.backup_parallel_jobs, - shutdown_rx, - ); + let async_task = backup_task_main(tli.clone(), conf.backup_parallel_jobs, shutdown_rx); let handle = if conf.current_thread_runtime { tokio::spawn(async_task) @@ -198,39 +191,32 @@ pub fn init_remote_storage(conf: &SafeKeeperConf) { } struct WalBackupTask { - timeline: Arc, + timeline: FullAccessTimeline, timeline_dir: Utf8PathBuf, - workspace_dir: Utf8PathBuf, wal_seg_size: usize, parallel_jobs: usize, commit_lsn_watch_rx: watch::Receiver, } /// Offload single timeline. 
-#[instrument(name = "WAL backup", skip_all, fields(ttid = %ttid))] -async fn backup_task_main( - ttid: TenantTimelineId, - timeline_dir: Utf8PathBuf, - workspace_dir: Utf8PathBuf, - parallel_jobs: usize, - mut shutdown_rx: Receiver<()>, -) { +#[instrument(name = "WAL backup", skip_all, fields(ttid = %tli.ttid))] +async fn backup_task_main(tli: Arc, parallel_jobs: usize, mut shutdown_rx: Receiver<()>) { let _guard = WAL_BACKUP_TASKS.guard(); + let tli = match tli.full_access_guard().await { + Ok(tli) => tli, + Err(e) => { + error!("backup error: {}", e); + return; + } + }; info!("started"); - let res = GlobalTimelines::get(ttid); - if let Err(e) = res { - error!("backup error: {}", e); - return; - } - let tli = res.unwrap(); let mut wb = WalBackupTask { wal_seg_size: tli.get_wal_seg_size().await, commit_lsn_watch_rx: tli.get_commit_lsn_watch_rx(), + timeline_dir: tli.get_timeline_dir(), timeline: tli, - timeline_dir, - workspace_dir, parallel_jobs, }; @@ -297,7 +283,6 @@ impl WalBackupTask { commit_lsn, self.wal_seg_size, &self.timeline_dir, - &self.workspace_dir, self.parallel_jobs, ) .await @@ -319,18 +304,18 @@ impl WalBackupTask { } async fn backup_lsn_range( - timeline: &Arc, + timeline: &FullAccessTimeline, backup_lsn: &mut Lsn, end_lsn: Lsn, wal_seg_size: usize, timeline_dir: &Utf8Path, - workspace_dir: &Utf8Path, parallel_jobs: usize, ) -> Result<()> { if parallel_jobs < 1 { anyhow::bail!("parallel_jobs must be >= 1"); } + let remote_timeline_path = remote_timeline_path(&timeline.ttid)?; let start_lsn = *backup_lsn; let segments = get_segments(start_lsn, end_lsn, wal_seg_size); @@ -343,7 +328,11 @@ async fn backup_lsn_range( loop { let added_task = match iter.next() { Some(s) => { - uploads.push_back(backup_single_segment(s, timeline_dir, workspace_dir)); + uploads.push_back(backup_single_segment( + s, + timeline_dir, + &remote_timeline_path, + )); true } None => false, @@ -381,18 +370,10 @@ async fn backup_lsn_range( async fn backup_single_segment( seg: &Segment, timeline_dir: &Utf8Path, - workspace_dir: &Utf8Path, + remote_timeline_path: &RemotePath, ) -> Result { let segment_file_path = seg.file_path(timeline_dir)?; - let remote_segment_path = segment_file_path - .strip_prefix(workspace_dir) - .context("Failed to strip workspace dir prefix") - .and_then(RemotePath::new) - .with_context(|| { - format!( - "Failed to resolve remote part of path {segment_file_path:?} for base {workspace_dir:?}", - ) - })?; + let remote_segment_path = seg.remote_path(remote_timeline_path); let res = backup_object(&segment_file_path, &remote_segment_path, seg.size()).await; if res.is_ok() { @@ -430,6 +411,10 @@ impl Segment { Ok(timeline_dir.join(self.object_name())) } + pub fn remote_path(self, remote_timeline_path: &RemotePath) -> RemotePath { + remote_timeline_path.join(self.object_name()) + } + pub fn size(self) -> usize { (u64::from(self.end_lsn) - u64::from(self.start_lsn)) as usize } @@ -530,8 +515,7 @@ pub async fn read_object( /// when called. pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> { let storage = get_configured_remote_storage(); - let ttid_path = Utf8Path::new(&ttid.tenant_id.to_string()).join(ttid.timeline_id.to_string()); - let remote_path = RemotePath::new(&ttid_path)?; + let remote_path = remote_timeline_path(ttid)?; // see DEFAULT_MAX_KEYS_PER_LIST_RESPONSE // const Option unwrap is not stable, otherwise it would be const. 
@@ -613,15 +597,17 @@ pub async fn copy_s3_segments( .as_ref() .unwrap(); - let relative_dst_path = - Utf8Path::new(&dst_ttid.tenant_id.to_string()).join(dst_ttid.timeline_id.to_string()); - - let remote_path = RemotePath::new(&relative_dst_path)?; + let remote_dst_path = remote_timeline_path(dst_ttid)?; let cancel = CancellationToken::new(); let files = storage - .list(Some(&remote_path), ListingMode::NoDelimiter, None, &cancel) + .list( + Some(&remote_dst_path), + ListingMode::NoDelimiter, + None, + &cancel, + ) .await? .keys; @@ -635,9 +621,6 @@ pub async fn copy_s3_segments( uploaded_segments ); - let relative_src_path = - Utf8Path::new(&src_ttid.tenant_id.to_string()).join(src_ttid.timeline_id.to_string()); - for segno in from_segment..to_segment { if segno % SEGMENTS_PROGRESS_REPORT_INTERVAL == 0 { info!("copied all segments from {} until {}", from_segment, segno); @@ -649,8 +632,8 @@ pub async fn copy_s3_segments( } debug!("copying segment {}", segment_name); - let from = RemotePath::new(&relative_src_path.join(&segment_name))?; - let to = RemotePath::new(&relative_dst_path.join(&segment_name))?; + let from = remote_timeline_path(src_ttid)?.join(&segment_name); + let to = remote_dst_path.join(&segment_name); storage.copy_object(&from, &to, &cancel).await?; } @@ -661,3 +644,8 @@ pub async fn copy_s3_segments( ); Ok(()) } + +/// Get S3 (remote_storage) prefix path used for timeline files. +pub fn remote_timeline_path(ttid: &TenantTimelineId) -> Result { + RemotePath::new(&Utf8Path::new(&ttid.tenant_id.to_string()).join(ttid.timeline_id.to_string())) +} diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index 29e944bff3..a320be3bad 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -18,8 +18,6 @@ //! This way control file stores information about all potentially existing //! remote partial segments and can clean them up after uploading a newer version. -use std::sync::Arc; - use camino::Utf8PathBuf; use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; use rand::Rng; @@ -32,8 +30,9 @@ use utils::lsn::Lsn; use crate::{ metrics::{PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS}, safekeeper::Term, - timeline::Timeline, - wal_backup, SafeKeeperConf, + timeline::FullAccessTimeline, + wal_backup::{self, remote_timeline_path}, + SafeKeeperConf, }; #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] @@ -83,10 +82,10 @@ impl State { struct PartialBackup { wal_seg_size: usize, - tli: Arc, + tli: FullAccessTimeline, conf: SafeKeeperConf, local_prefix: Utf8PathBuf, - remote_prefix: Utf8PathBuf, + remote_timeline_path: RemotePath, state: State, } @@ -153,7 +152,7 @@ impl PartialBackup { let backup_bytes = flush_lsn.segment_offset(self.wal_seg_size); let local_path = self.local_prefix.join(self.local_segment_name(segno)); - let remote_path = RemotePath::new(self.remote_prefix.join(&prepared.name).as_ref())?; + let remote_path = self.remote_timeline_path.join(&prepared.name); // Upload first `backup_bytes` bytes of the segment to the remote storage. 
wal_backup::backup_partial_segment(&local_path, &remote_path, backup_bytes).await?; @@ -253,7 +252,7 @@ impl PartialBackup { info!("deleting objects: {:?}", segments_to_delete); let mut objects_to_delete = vec![]; for seg in segments_to_delete.iter() { - let remote_path = RemotePath::new(self.remote_prefix.join(seg).as_ref())?; + let remote_path = self.remote_timeline_path.join(seg); objects_to_delete.push(remote_path); } @@ -273,7 +272,7 @@ impl PartialBackup { } #[instrument(name = "Partial backup", skip_all, fields(ttid = %tli.ttid))] -pub async fn main_task(tli: Arc, conf: SafeKeeperConf) { +pub async fn main_task(tli: FullAccessTimeline, conf: SafeKeeperConf) { debug!("started"); let await_duration = conf.partial_backup_timeout; @@ -289,11 +288,11 @@ pub async fn main_task(tli: Arc, conf: SafeKeeperConf) { let mut flush_lsn_rx = tli.get_term_flush_lsn_watch_rx(); let wal_seg_size = tli.get_wal_seg_size().await; - let local_prefix = tli.timeline_dir.clone(); - let remote_prefix = match tli.timeline_dir.strip_prefix(&conf.workdir) { - Ok(path) => path.to_owned(), + let local_prefix = tli.get_timeline_dir(); + let remote_timeline_path = match remote_timeline_path(&tli.ttid) { + Ok(path) => path, Err(e) => { - error!("failed to strip workspace dir prefix: {:?}", e); + error!("failed to create remote path: {:?}", e); return; } }; @@ -304,7 +303,7 @@ pub async fn main_task(tli: Arc, conf: SafeKeeperConf) { state: persistent_state.partial_backup, conf, local_prefix, - remote_prefix, + remote_timeline_path, }; debug!("state: {:?}", backup.state); diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 6bc8c7c3f9..45e27e1951 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -25,7 +25,7 @@ use utils::crashsafe::durable_rename; use crate::metrics::{time_io_closure, WalStorageMetrics, REMOVED_WAL_SEGMENTS}; use crate::state::TimelinePersistentState; -use crate::wal_backup::read_object; +use crate::wal_backup::{read_object, remote_timeline_path}; use crate::SafeKeeperConf; use postgres_ffi::waldecoder::WalStreamDecoder; use postgres_ffi::XLogFileName; @@ -536,7 +536,7 @@ async fn remove_segments_from_disk( } pub struct WalReader { - workdir: Utf8PathBuf, + remote_path: RemotePath, timeline_dir: Utf8PathBuf, wal_seg_size: usize, pos: Lsn, @@ -558,7 +558,7 @@ pub struct WalReader { impl WalReader { pub fn new( - workdir: Utf8PathBuf, + ttid: &TenantTimelineId, timeline_dir: Utf8PathBuf, state: &TimelinePersistentState, start_pos: Lsn, @@ -586,7 +586,7 @@ impl WalReader { } Ok(Self { - workdir, + remote_path: remote_timeline_path(ttid)?, timeline_dir, wal_seg_size: state.server.wal_seg_size as usize, pos: start_pos, @@ -684,7 +684,7 @@ impl WalReader { let xlogoff = self.pos.segment_offset(self.wal_seg_size); let segno = self.pos.segment_number(self.wal_seg_size); let wal_file_name = XLogFileName(PG_TLI, segno, self.wal_seg_size); - let wal_file_path = self.timeline_dir.join(wal_file_name); + let wal_file_path = self.timeline_dir.join(&wal_file_name); // Try to open local file, if we may have WAL locally if self.pos >= self.local_start_lsn { @@ -712,16 +712,7 @@ impl WalReader { // Try to open remote file, if remote reads are enabled if self.enable_remote_read { - let remote_wal_file_path = wal_file_path - .strip_prefix(&self.workdir) - .context("Failed to strip workdir prefix") - .and_then(RemotePath::new) - .with_context(|| { - format!( - "Failed to resolve remote part of path {:?} for base {:?}", - wal_file_path, self.workdir, - ) - })?; + 
let remote_wal_file_path = self.remote_path.join(&wal_file_name); return read_object(&remote_wal_file_path, xlogoff as u64).await; } diff --git a/test_runner/fixtures/common_types.py b/test_runner/fixtures/common_types.py index e9be765669..147264762c 100644 --- a/test_runner/fixtures/common_types.py +++ b/test_runner/fixtures/common_types.py @@ -72,6 +72,18 @@ class Lsn: def segment_lsn(self, seg_sz: int = DEFAULT_WAL_SEG_SIZE) -> "Lsn": return Lsn(self.lsn_int - (self.lsn_int % seg_sz)) + def segno(self, seg_sz: int = DEFAULT_WAL_SEG_SIZE) -> int: + return self.lsn_int // seg_sz + + def segment_name(self, seg_sz: int = DEFAULT_WAL_SEG_SIZE) -> str: + segno = self.segno(seg_sz) + # The filename format is 00000001XXXXXXXX000000YY, where XXXXXXXXYY is segno in hex. + # XXXXXXXX is the higher 8 hex digits of segno + high_bits = segno >> 8 + # YY is the lower 2 hex digits of segno + low_bits = segno & 0xFF + return f"00000001{high_bits:08X}000000{low_bits:02X}" + @dataclass(frozen=True) class Key: diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b8ef63faa9..0004745bf0 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -973,6 +973,9 @@ class NeonEnvBuilder: for pageserver in self.env.pageservers: pageserver.assert_no_errors() + for safekeeper in self.env.safekeepers: + safekeeper.assert_no_errors() + self.env.storage_controller.assert_no_errors() try: @@ -3813,6 +3816,9 @@ class Safekeeper(LogUtils): self.running = False return self + def assert_no_errors(self): + assert not self.log_contains("manager task finished prematurely") + def append_logical_message( self, tenant_id: TenantId, timeline_id: TimelineId, request: Dict[str, Any] ) -> Dict[str, Any]: @@ -3898,6 +3904,15 @@ class Safekeeper(LogUtils): """ cli = self.http_client() + target_segment_file = lsn.segment_name() + + def are_segments_removed(): + segments = self.list_segments(tenant_id, timeline_id) + log.info( + f"waiting for all segments before {target_segment_file} to be removed from sk {self.id}, current segments: {segments}" + ) + assert all(target_segment_file <= s for s in segments) + def are_lsns_advanced(): stat = cli.timeline_status(tenant_id, timeline_id) log.info( @@ -3909,6 +3924,7 @@ class Safekeeper(LogUtils): # pageserver to this safekeeper wait_until(30, 1, are_lsns_advanced) cli.checkpoint(tenant_id, timeline_id) + wait_until(30, 1, are_segments_removed) def wait_until_paused(self, failpoint: str): msg = f"at failpoint {failpoint}" From 87afbf6b24313cbfa28809ec7ada2e72911263be Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Fri, 31 May 2024 12:00:40 -0400 Subject: [PATCH 0903/1571] test(pageserver): add test interface to create artificial layers (#7899) This pull request adds necessary interfaces to deterministically create scenarios we want to test. Simplify some test cases to use this interface to make it stable + reproducible. Compaction test will be able to use this interface. Also the upcoming delete tombstone tests will use this interface to make test reproducible. 
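For illustration, a minimal sketch of how a test might describe a layer stack with these helpers. The values, key, and layer contents below are made up for the example (the real usage is in the updated vectored-read tests in this patch); it assumes the existing test-module helpers (`test_img`, `TIMELINE_ID`, `DEFAULT_PG_VERSION`) and a loaded `tenant`/`ctx`:

```rust
// Sketch only: illustrative LSNs/keys, not taken from a real test.
let key = Key::from_hex("000000000033333333444444445500000000").unwrap();
let tline = tenant
    .create_test_timeline_with_layers(
        TIMELINE_ID,
        Lsn(0x10),          // initdb_lsn
        DEFAULT_PG_VERSION,
        &ctx,
        // one delta layer holding a single value at LSN 0x20
        vec![vec![(key, Lsn(0x20), Value::Image(test_img("v1")))]],
        // one image layer materialized at LSN 0x30
        vec![(Lsn(0x30), vec![(key, test_img("v2"))])],
        Lsn(0x30),          // end_lsn: last_record_lsn is advanced here first
    )
    .await?;
```

The helper advances the LSN before inserting layers, so the delta/image LSNs must not exceed `end_lsn` and must not precede `initdb_lsn`.
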
## Summary of changes * `force_create_image_layer` * `force_create_delta_layer` * `force_advance_lsn` * `create_test_timeline_with_states` * `branch_timeline_test_with_states` --------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant.rs | 261 ++++++++---------- pageserver/src/tenant/timeline.rs | 96 +++++++ .../src/tenant/timeline/layer_manager.rs | 7 + 3 files changed, 223 insertions(+), 141 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 0a9637884f..cfa683beb8 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1393,6 +1393,36 @@ impl Tenant { Ok(tl) } + /// Helper for unit tests to create a timeline with some pre-loaded states. + #[cfg(test)] + #[allow(clippy::too_many_arguments)] + pub async fn create_test_timeline_with_layers( + &self, + new_timeline_id: TimelineId, + initdb_lsn: Lsn, + pg_version: u32, + ctx: &RequestContext, + delta_layer_desc: Vec>, + image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>, + end_lsn: Lsn, + ) -> anyhow::Result> { + let tline = self + .create_test_timeline(new_timeline_id, initdb_lsn, pg_version, ctx) + .await?; + tline.force_advance_lsn(end_lsn); + for deltas in delta_layer_desc { + tline + .force_create_delta_layer(deltas, Some(initdb_lsn), ctx) + .await?; + } + for (lsn, images) in image_layer_desc { + tline + .force_create_image_layer(lsn, images, Some(initdb_lsn), ctx) + .await?; + } + Ok(tline) + } + /// Create a new timeline. /// /// Returns the new timeline ID and reference to its Timeline object. @@ -2992,17 +3022,53 @@ impl Tenant { &self, src_timeline: &Arc, dst_id: TimelineId, - start_lsn: Option, + ancestor_lsn: Option, ctx: &RequestContext, ) -> Result, CreateTimelineError> { let create_guard = self.create_timeline_create_guard(dst_id).unwrap(); let tl = self - .branch_timeline_impl(src_timeline, dst_id, start_lsn, create_guard, ctx) + .branch_timeline_impl(src_timeline, dst_id, ancestor_lsn, create_guard, ctx) .await?; tl.set_state(TimelineState::Active); Ok(tl) } + /// Helper for unit tests to branch a timeline with some pre-loaded states. + #[cfg(test)] + #[allow(clippy::too_many_arguments)] + pub async fn branch_timeline_test_with_layers( + &self, + src_timeline: &Arc, + dst_id: TimelineId, + ancestor_lsn: Option, + ctx: &RequestContext, + delta_layer_desc: Vec>, + image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>, + end_lsn: Lsn, + ) -> anyhow::Result> { + let tline = self + .branch_timeline_test(src_timeline, dst_id, ancestor_lsn, ctx) + .await?; + let ancestor_lsn = if let Some(ancestor_lsn) = ancestor_lsn { + ancestor_lsn + } else { + tline.get_last_record_lsn() + }; + assert!(end_lsn >= ancestor_lsn); + tline.force_advance_lsn(end_lsn); + for deltas in delta_layer_desc { + tline + .force_create_delta_layer(deltas, Some(ancestor_lsn), ctx) + .await?; + } + for (lsn, images) in image_layer_desc { + tline + .force_create_image_layer(lsn, images, Some(ancestor_lsn), ctx) + .await?; + } + Ok(tline) + } + /// Branch an existing timeline. /// /// The caller is responsible for activating the returned timeline. 
@@ -6206,75 +6272,36 @@ mod tests { async fn test_vectored_missing_data_key_reads() -> anyhow::Result<()> { let harness = TenantHarness::create("test_vectored_missing_data_key_reads")?; let (tenant, ctx) = harness.load().await; - let tline = tenant - .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) - .await?; - - let cancel = CancellationToken::new(); let base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); let base_key_child = Key::from_hex("000000000033333333444444445500000001").unwrap(); let base_key_nonexist = Key::from_hex("000000000033333333444444445500000002").unwrap(); - let mut lsn = Lsn(0x20); - - { - let mut writer = tline.writer().await; - writer - .put(base_key, lsn, &Value::Image(test_img("data key 1")), &ctx) - .await?; - writer.finish_write(lsn); - drop(writer); - - tline.freeze_and_flush().await?; // this will create a image layer - } + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + Vec::new(), // delta layers + vec![(Lsn(0x20), vec![(base_key, test_img("data key 1"))])], // image layers + Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN + ) + .await?; let child = tenant - .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx) + .branch_timeline_test_with_layers( + &tline, + NEW_TIMELINE_ID, + Some(Lsn(0x20)), + &ctx, + Vec::new(), // delta layers + vec![(Lsn(0x30), vec![(base_key_child, test_img("data key 2"))])], // image layers + Lsn(0x30), + ) .await .unwrap(); - lsn.0 += 0x10; - - { - let mut writer = child.writer().await; - writer - .put( - base_key_child, - lsn, - &Value::Image(test_img("data key 2")), - &ctx, - ) - .await?; - writer.finish_write(lsn); - drop(writer); - - child.freeze_and_flush().await?; // this will create a delta - - { - // update the partitioning to include the test key space, otherwise they - // will be dropped by image layer creation - let mut guard = child.partitioning.lock().await; - let ((partitioning, _), partition_lsn) = &mut *guard; - partitioning - .parts - .push(KeySpace::single(base_key..base_key_nonexist)); // exclude the nonexist key - *partition_lsn = lsn; - } - - child - .compact( - &cancel, - { - let mut set = EnumSet::empty(); - set.insert(CompactFlags::ForceImageLayerCreation); - set - }, - &ctx, - ) - .await?; // force create an image layer for the keys, TODO: check if the image layer is created - } - async fn get_vectored_impl_wrapper( tline: &Arc, key: Key, @@ -6296,6 +6323,8 @@ mod tests { })) } + let lsn = Lsn(0x30); + // test vectored get on parent timeline assert_eq!( get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?, @@ -6333,94 +6362,42 @@ mod tests { #[tokio::test] async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads")?; + let harness = TenantHarness::create("test_vectored_missing_data_key_reads")?; let (tenant, ctx) = harness.load().await; + + let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap(); + let base_key_child = Key::from_hex("620000000033333333444444445500000001").unwrap(); + let base_key_nonexist = Key::from_hex("620000000033333333444444445500000002").unwrap(); + assert_eq!(base_key.field1, AUX_KEY_PREFIX); // in case someone accidentally changed the prefix... 
+ let tline = tenant - .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + Vec::new(), // delta layers + vec![(Lsn(0x20), vec![(base_key, test_img("metadata key 1"))])], // image layers + Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN + ) .await?; - let cancel = CancellationToken::new(); - - let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); - let mut base_key_child = Key::from_hex("000000000033333333444444445500000001").unwrap(); - let mut base_key_nonexist = Key::from_hex("000000000033333333444444445500000002").unwrap(); - base_key.field1 = AUX_KEY_PREFIX; - base_key_child.field1 = AUX_KEY_PREFIX; - base_key_nonexist.field1 = AUX_KEY_PREFIX; - - let mut lsn = Lsn(0x20); - - { - let mut writer = tline.writer().await; - writer - .put( - base_key, - lsn, - &Value::Image(test_img("metadata key 1")), - &ctx, - ) - .await?; - writer.finish_write(lsn); - drop(writer); - - tline.freeze_and_flush().await?; // this will create an image layer - - tline - .compact( - &cancel, - { - let mut set = EnumSet::empty(); - set.insert(CompactFlags::ForceImageLayerCreation); - set.insert(CompactFlags::ForceRepartition); - set - }, - &ctx, - ) - .await?; // force create an image layer for metadata keys - tenant - .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) - .await?; - } - let child = tenant - .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx) + .branch_timeline_test_with_layers( + &tline, + NEW_TIMELINE_ID, + Some(Lsn(0x20)), + &ctx, + Vec::new(), // delta layers + vec![( + Lsn(0x30), + vec![(base_key_child, test_img("metadata key 2"))], + )], // image layers + Lsn(0x30), + ) .await .unwrap(); - lsn.0 += 0x10; - - { - let mut writer = child.writer().await; - writer - .put( - base_key_child, - lsn, - &Value::Image(test_img("metadata key 2")), - &ctx, - ) - .await?; - writer.finish_write(lsn); - drop(writer); - - child.freeze_and_flush().await?; - - child - .compact( - &cancel, - { - let mut set = EnumSet::empty(); - set.insert(CompactFlags::ForceImageLayerCreation); - set.insert(CompactFlags::ForceRepartition); - set - }, - &ctx, - ) - .await?; // force create an image layer for metadata keys - tenant - .gc_iteration(Some(child.timeline_id), 0, Duration::ZERO, &cancel, &ctx) - .await?; - } - async fn get_vectored_impl_wrapper( tline: &Arc, key: Key, @@ -6442,6 +6419,8 @@ mod tests { })) } + let lsn = Lsn(0x30); + // test vectored get on parent timeline assert_eq!( get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b498876465..8033edaa12 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -5371,6 +5371,102 @@ impl Timeline { shard_count: self.tenant_shard_id.shard_count, } } + + #[cfg(test)] + pub(super) fn force_advance_lsn(self: &Arc, new_lsn: Lsn) { + self.last_record_lsn.advance(new_lsn); + } + + /// Force create an image layer and place it into the layer map. + /// + /// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`] + /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are placed into the layer map in one run. 
+ #[cfg(test)] + pub(super) async fn force_create_image_layer( + self: &Arc, + lsn: Lsn, + mut images: Vec<(Key, Bytes)>, + check_start_lsn: Option, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + let last_record_lsn = self.get_last_record_lsn(); + assert!( + lsn <= last_record_lsn, + "advance last record lsn before inserting a layer, lsn={lsn}, last_record_lsn={last_record_lsn}" + ); + if let Some(check_start_lsn) = check_start_lsn { + assert!(lsn >= check_start_lsn); + } + images.sort_unstable_by(|(ka, _), (kb, _)| ka.cmp(kb)); + let min_key = *images.first().map(|(k, _)| k).unwrap(); + let max_key = images.last().map(|(k, _)| k).unwrap().next(); + let mut image_layer_writer = ImageLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + &(min_key..max_key), + lsn, + ctx, + ) + .await?; + for (key, img) in images { + image_layer_writer.put_image(key, img, ctx).await?; + } + let image_layer = image_layer_writer.finish(self, ctx).await?; + + { + let mut guard = self.layers.write().await; + guard.force_insert_layer(image_layer); + } + + Ok(()) + } + + /// Force create a delta layer and place it into the layer map. + /// + /// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`] + /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are placed into the layer map in one run. + #[cfg(test)] + pub(super) async fn force_create_delta_layer( + self: &Arc, + mut deltas: Vec<(Key, Lsn, Value)>, + check_start_lsn: Option, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + let last_record_lsn = self.get_last_record_lsn(); + deltas.sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb))); + let min_key = *deltas.first().map(|(k, _, _)| k).unwrap(); + let max_key = deltas.last().map(|(k, _, _)| k).unwrap().next(); + let min_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap(); + let max_lsn = Lsn(deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap().0 + 1); + assert!( + max_lsn <= last_record_lsn, + "advance last record lsn before inserting a layer, max_lsn={max_lsn}, last_record_lsn={last_record_lsn}" + ); + if let Some(check_start_lsn) = check_start_lsn { + assert!(min_lsn >= check_start_lsn); + } + let mut delta_layer_writer = DeltaLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + min_key, + min_lsn..max_lsn, + ctx, + ) + .await?; + for (key, lsn, val) in deltas { + delta_layer_writer.put_value(key, lsn, val, ctx).await?; + } + let delta_layer = delta_layer_writer.finish(max_key, self, ctx).await?; + + { + let mut guard = self.layers.write().await; + guard.force_insert_layer(delta_layer); + } + + Ok(()) + } } type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId); diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 884b71df75..b78c98a506 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -255,6 +255,13 @@ impl LayerManager { updates.flush() } + #[cfg(test)] + pub(crate) fn force_insert_layer(&mut self, layer: ResidentLayer) { + let mut updates = self.layer_map.batch_update(); + Self::insert_historic_layer(layer.as_ref().clone(), &mut updates, &mut self.layer_fmgr); + updates.flush() + } + /// Helper function to insert a layer into the layer map and file manager. 
fn insert_historic_layer( layer: Layer, From 9fda85b4862bccf7e57c1f2fadfd03f1c3c7288b Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 31 May 2024 17:02:10 +0100 Subject: [PATCH 0904/1571] pageserver: remove AncestorStopping error variants (#7916) ## Problem In all cases, AncestorStopping is equivalent to Cancelled. This became more obvious in https://github.com/neondatabase/neon/pull/7912#discussion_r1620582309 when updating these error types. ## Summary of changes - Remove AncestorStopping, always use Cancelled instead --- pageserver/src/consumption_metrics.rs | 2 +- pageserver/src/http/routes.rs | 3 --- pageserver/src/pgdatadir_mapping.rs | 7 ++----- pageserver/src/tenant/timeline.rs | 19 ++++--------------- .../fixtures/pageserver/allowed_errors.py | 2 +- 5 files changed, 8 insertions(+), 25 deletions(-) diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index 62bbde42f4..540d0d2e8c 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -358,7 +358,7 @@ async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &Re // mean the synthetic size worker should terminate. let shutting_down = matches!( e.downcast_ref::(), - Some(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_)) + Some(PageReconstructError::Cancelled) ); if !shutting_down { diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 913d45d63c..bd6fa028ac 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -184,9 +184,6 @@ impl From for ApiError { PageReconstructError::Cancelled => { ApiError::InternalServerError(anyhow::anyhow!("request was cancelled")) } - PageReconstructError::AncestorStopping(_) => { - ApiError::ResourceUnavailable(format!("{pre}").into()) - } PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()), PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre), } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 0fc846e5f3..c78c358855 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -115,9 +115,7 @@ impl From for CollectKeySpaceError { impl From for CalculateLogicalSizeError { fn from(pre: PageReconstructError) -> Self { match pre { - PageReconstructError::AncestorStopping(_) | PageReconstructError::Cancelled => { - Self::Cancelled - } + PageReconstructError::Cancelled => Self::Cancelled, _ => Self::PageRead(pre), } } @@ -1614,8 +1612,7 @@ impl<'a> DatadirModification<'a> { aux_files.dir = Some(dir); } Err( - e @ (PageReconstructError::AncestorStopping(_) - | PageReconstructError::Cancelled + e @ (PageReconstructError::Cancelled | PageReconstructError::AncestorLsnTimeout(_)), ) => { // Important that we do not interpret a shutdown error as "not found" and thereby diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 8033edaa12..4a9d981ad8 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -501,10 +501,6 @@ pub(crate) enum PageReconstructError { #[error("timeline shutting down")] Cancelled, - /// The ancestor of this is being stopped - #[error("ancestor timeline {0} is being stopped")] - AncestorStopping(TimelineId), - /// An error happened replaying WAL records #[error(transparent)] WalRedo(anyhow::Error), @@ -569,7 +565,7 @@ impl PageReconstructError { match self { Other(_) => false, AncestorLsnTimeout(_) => false, - 
Cancelled | AncestorStopping(_) => true, + Cancelled => true, WalRedo(_) => false, MissingKey { .. } => false, } @@ -645,9 +641,6 @@ pub(crate) enum GetVectoredError { #[derive(thiserror::Error, Debug)] pub(crate) enum GetReadyAncestorError { - #[error("ancestor timeline {0} is being stopped")] - AncestorStopping(TimelineId), - #[error("Ancestor LSN wait error: {0}")] AncestorLsnTimeout(#[from] WaitLsnError), @@ -757,7 +750,6 @@ impl From for PageReconstructError { fn from(e: GetReadyAncestorError) -> Self { use GetReadyAncestorError::*; match e { - AncestorStopping(tid) => PageReconstructError::AncestorStopping(tid), AncestorLsnTimeout(wait_err) => PageReconstructError::AncestorLsnTimeout(wait_err), bad_state @ BadState { .. } => PageReconstructError::Other(anyhow::anyhow!(bad_state)), Cancelled => PageReconstructError::Cancelled, @@ -1192,9 +1184,7 @@ impl Timeline { use PageReconstructError::*; match block { - Err(Cancelled | AncestorStopping(_)) => { - return Err(GetVectoredError::Cancelled) - } + Err(Cancelled) => return Err(GetVectoredError::Cancelled), Err(MissingKey(_)) if NON_INHERITED_RANGE.contains(&key) || NON_INHERITED_SPARSE_RANGE.contains(&key) => @@ -3585,9 +3575,8 @@ impl Timeline { match ancestor.wait_to_become_active(ctx).await { Ok(()) => {} Err(TimelineState::Stopping) => { - return Err(GetReadyAncestorError::AncestorStopping( - ancestor.timeline_id, - )); + // If an ancestor is stopping, it means the tenant is stopping: handle this the same as if this timeline was stopping. + return Err(GetReadyAncestorError::Cancelled); } Err(state) => { return Err(GetReadyAncestorError::BadState { diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index ad8bbe2021..ef412cade7 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -66,7 +66,7 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( ".*query handler for 'pagestream.*failed: Timeline .* is not active", # timeline delete in progress ".*task iteration took longer than the configured period.*", # these can happen anytime we do compactions from background task and shutdown pageserver - r".*ERROR.*ancestor timeline \S+ is being stopped", + ".*could not compact.*cancelled.*", # this is expected given our collaborative shutdown approach for the UploadQueue ".*Compaction failed.*, retrying in .*: Other\\(queue is in state Stopped.*", ".*Compaction failed.*, retrying in .*: ShuttingDown", From ef83f31e77abf7cf55387635eb3e8ad2191d97a1 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 31 May 2024 21:19:41 +0300 Subject: [PATCH 0905/1571] pagectl: key command for dumping what we know about the key (#7890) What we know about the key via added `pagectl key $key` command: - debug formatting - shard placement when `--shard-count` is specified - different boolean queries in `key.rs` - aux files v2 Example: ``` $ cargo run -qp pagectl -- key 000000063F00004005000060270000100E2C parsed from hex: 000000063F00004005000060270000100E2C: Key { field1: 0, field2: 1599, field3: 16389, field4: 24615, field5: 0, field6: 1052204 } rel_block: true rel_vm_block: false rel_fsm_block: false slru_block: false inherited: true rel_size: false slru_segment_size: false recognized kind: None ``` --- Cargo.lock | 1 + libs/pageserver_api/src/key.rs | 36 ++- libs/pageserver_api/src/reltag.rs | 53 +++- libs/pageserver_api/src/shard.rs | 25 ++ libs/utils/src/hex.rs | 19 +- pageserver/ctl/Cargo.toml | 1 + pageserver/ctl/src/key.rs | 477 
++++++++++++++++++++++++++++++ pageserver/ctl/src/main.rs | 4 + 8 files changed, 608 insertions(+), 8 deletions(-) create mode 100644 pageserver/ctl/src/key.rs diff --git a/Cargo.lock b/Cargo.lock index 96ba5c8ec3..6a60104472 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3570,6 +3570,7 @@ dependencies = [ "serde", "serde_json", "svg_fmt", + "thiserror", "tokio", "tokio-util", "toml_edit", diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index b00d48498c..e52d4ef986 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -381,10 +381,15 @@ pub fn rel_size_to_key(rel: RelTag) -> Key { field3: rel.dbnode, field4: rel.relnode, field5: rel.forknum, - field6: 0xffffffff, + field6: 0xffff_ffff, } } +#[inline(always)] +pub fn is_rel_size_key(key: &Key) -> bool { + key.field1 == 0 && key.field6 == u32::MAX +} + #[inline(always)] pub fn rel_key_range(rel: RelTag) -> Range { Key { @@ -422,6 +427,25 @@ pub fn slru_dir_to_key(kind: SlruKind) -> Key { } } +#[inline(always)] +pub fn slru_dir_kind(key: &Key) -> Option> { + if key.field1 == 0x01 + && key.field3 == 0 + && key.field4 == 0 + && key.field5 == 0 + && key.field6 == 0 + { + match key.field2 { + 0 => Some(Ok(SlruKind::Clog)), + 1 => Some(Ok(SlruKind::MultiXactMembers)), + 2 => Some(Ok(SlruKind::MultiXactOffsets)), + x => Some(Err(x)), + } + } else { + None + } +} + #[inline(always)] pub fn slru_block_to_key(kind: SlruKind, segno: u32, blknum: BlockNumber) -> Key { Key { @@ -450,10 +474,18 @@ pub fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key { field3: 1, field4: segno, field5: 0, - field6: 0xffffffff, + field6: 0xffff_ffff, } } +pub fn is_slru_segment_size_key(key: &Key) -> bool { + key.field1 == 0x01 + && key.field2 < 0x03 + && key.field3 == 0x01 + && key.field5 == 0 + && key.field6 == u32::MAX +} + #[inline(always)] pub fn slru_segment_key_range(kind: SlruKind, segno: u32) -> Range { let field2 = match kind { diff --git a/libs/pageserver_api/src/reltag.rs b/libs/pageserver_api/src/reltag.rs index 38693ab847..010a9c2932 100644 --- a/libs/pageserver_api/src/reltag.rs +++ b/libs/pageserver_api/src/reltag.rs @@ -3,7 +3,7 @@ use std::cmp::Ordering; use std::fmt; use postgres_ffi::pg_constants::GLOBALTABLESPACE_OID; -use postgres_ffi::relfile_utils::forknumber_to_name; +use postgres_ffi::relfile_utils::{forkname_to_number, forknumber_to_name, MAIN_FORKNUM}; use postgres_ffi::Oid; /// @@ -68,6 +68,57 @@ impl fmt::Display for RelTag { } } +#[derive(Debug, thiserror::Error)] +pub enum ParseRelTagError { + #[error("invalid forknum")] + InvalidForknum(#[source] std::num::ParseIntError), + #[error("missing triplet member {}", .0)] + MissingTripletMember(usize), + #[error("invalid triplet member {}", .0)] + InvalidTripletMember(usize, #[source] std::num::ParseIntError), +} + +impl std::str::FromStr for RelTag { + type Err = ParseRelTagError; + + fn from_str(s: &str) -> Result { + use ParseRelTagError::*; + + // FIXME: in postgres logs this separator is dot + // Example: + // could not read block 2 in rel 1663/208101/2620.1 from page server at lsn 0/2431E6F0 + // with a regex we could get this more painlessly + let (triplet, forknum) = match s.split_once('_').or_else(|| s.split_once('.')) { + Some((t, f)) => { + let forknum = forkname_to_number(Some(f)); + let forknum = if let Ok(f) = forknum { + f + } else { + f.parse::().map_err(InvalidForknum)? 
+ }; + + (t, Some(forknum)) + } + None => (s, None), + }; + + let mut split = triplet + .splitn(3, '/') + .enumerate() + .map(|(i, s)| s.parse::().map_err(|e| InvalidTripletMember(i, e))); + let spcnode = split.next().ok_or(MissingTripletMember(0))??; + let dbnode = split.next().ok_or(MissingTripletMember(1))??; + let relnode = split.next().ok_or(MissingTripletMember(2))??; + + Ok(RelTag { + spcnode, + forknum: forknum.unwrap_or(MAIN_FORKNUM), + dbnode, + relnode, + }) + } +} + impl RelTag { pub fn to_segfile_name(&self, segno: u32) -> String { let mut name = if self.spcnode == GLOBALTABLESPACE_OID { diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 1c05a01926..8ace426f88 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -428,6 +428,12 @@ impl<'de> Deserialize<'de> for TenantShardId { #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] pub struct ShardStripeSize(pub u32); +impl Default for ShardStripeSize { + fn default() -> Self { + DEFAULT_STRIPE_SIZE + } +} + /// Layout version: for future upgrades where we might change how the key->shard mapping works #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] pub struct ShardLayout(u8); @@ -713,6 +719,25 @@ fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Ke ShardNumber((hash % count.0 as u32) as u8) } +/// For debugging, while not exposing the internals. +#[derive(Debug)] +#[allow(unused)] // used by debug formatting by pagectl +struct KeyShardingInfo { + shard0: bool, + shard_number: ShardNumber, +} + +pub fn describe( + key: &Key, + shard_count: ShardCount, + stripe_size: ShardStripeSize, +) -> impl std::fmt::Debug { + KeyShardingInfo { + shard0: key_is_shard0(key), + shard_number: key_to_shard_number(shard_count, stripe_size, key), + } +} + #[cfg(test)] mod tests { use utils::Hex; diff --git a/libs/utils/src/hex.rs b/libs/utils/src/hex.rs index fc0bb7e4a2..382f805a96 100644 --- a/libs/utils/src/hex.rs +++ b/libs/utils/src/hex.rs @@ -19,13 +19,13 @@ /// // right: [0x68; 1] /// # fn serialize_something() -> Vec { "hello world".as_bytes().to_vec() } /// ``` -#[derive(PartialEq)] -pub struct Hex<'a>(pub &'a [u8]); +pub struct Hex(pub S); -impl std::fmt::Debug for Hex<'_> { +impl> std::fmt::Debug for Hex { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "[")?; - for (i, c) in self.0.chunks(16).enumerate() { + let chunks = self.0.as_ref().chunks(16); + for (i, c) in chunks.enumerate() { if i > 0 && !c.is_empty() { writeln!(f, ", ")?; } @@ -36,6 +36,15 @@ impl std::fmt::Debug for Hex<'_> { write!(f, "0x{b:02x}")?; } } - write!(f, "; {}]", self.0.len()) + write!(f, "; {}]", self.0.as_ref().len()) + } +} + +impl, L: AsRef<[u8]>> PartialEq> for Hex { + fn eq(&self, other: &Hex) -> bool { + let left = self.0.as_ref(); + let right = other.0.as_ref(); + + left == right } } diff --git a/pageserver/ctl/Cargo.toml b/pageserver/ctl/Cargo.toml index 843f5dd862..be5626040b 100644 --- a/pageserver/ctl/Cargo.toml +++ b/pageserver/ctl/Cargo.toml @@ -17,6 +17,7 @@ pageserver = { path = ".." 
} pageserver_api.workspace = true remote_storage = { path = "../../libs/remote_storage" } postgres_ffi.workspace = true +thiserror.workspace = true tokio.workspace = true tokio-util.workspace = true toml_edit.workspace = true diff --git a/pageserver/ctl/src/key.rs b/pageserver/ctl/src/key.rs new file mode 100644 index 0000000000..28448811f8 --- /dev/null +++ b/pageserver/ctl/src/key.rs @@ -0,0 +1,477 @@ +use anyhow::Context; +use clap::Parser; +use pageserver_api::{ + key::Key, + reltag::{BlockNumber, RelTag, SlruKind}, + shard::{ShardCount, ShardStripeSize}, +}; +use std::str::FromStr; + +#[derive(Parser)] +pub(super) struct DescribeKeyCommand { + /// Key material in one of the forms: hex, span attributes captured from log, reltag blocknum + input: Vec, + + /// The number of shards to calculate what Keys placement would be. + #[arg(long)] + shard_count: Option, + + /// The sharding stripe size. + /// + /// The default is hardcoded. It makes no sense to provide this without providing + /// `--shard-count`. + #[arg(long, requires = "shard_count")] + stripe_size: Option, +} + +/// Sharded shard count without unsharded count, which the actual ShardCount supports. +#[derive(Clone, Copy)] +pub(super) struct CustomShardCount(std::num::NonZeroU8); + +#[derive(Debug, thiserror::Error)] +pub(super) enum InvalidShardCount { + #[error(transparent)] + ParsingFailed(#[from] std::num::ParseIntError), + #[error("too few shards")] + TooFewShards, +} + +impl FromStr for CustomShardCount { + type Err = InvalidShardCount; + + fn from_str(s: &str) -> Result { + let inner: std::num::NonZeroU8 = s.parse()?; + if inner.get() < 2 { + Err(InvalidShardCount::TooFewShards) + } else { + Ok(CustomShardCount(inner)) + } + } +} + +impl From for ShardCount { + fn from(value: CustomShardCount) -> Self { + ShardCount::new(value.0.get()) + } +} + +impl DescribeKeyCommand { + pub(super) fn execute(self) { + let DescribeKeyCommand { + input, + shard_count, + stripe_size, + } = self; + + let material = KeyMaterial::try_from(input.as_slice()).unwrap(); + let kind = material.kind(); + let key = Key::from(material); + + println!("parsed from {kind}: {key}:"); + println!(); + println!("{key:?}"); + + macro_rules! kind_query { + ($name:ident) => {{ + let s: &'static str = stringify!($name); + let s = s.strip_prefix("is_").unwrap_or(s); + let s = s.strip_suffix("_key").unwrap_or(s); + + #[allow(clippy::needless_borrow)] + (s, pageserver_api::key::$name(key)) + }}; + } + + // the current characterization is a mess of these boolean queries and separate + // "recognization". I think it accurately represents how strictly we model the Key + // right now, but could of course be made less confusing. 
+ + let queries = [ + ("rel_block", pageserver_api::key::is_rel_block_key(&key)), + kind_query!(is_rel_vm_block_key), + kind_query!(is_rel_fsm_block_key), + kind_query!(is_slru_block_key), + kind_query!(is_inherited_key), + ("rel_size", pageserver_api::key::is_rel_size_key(&key)), + ( + "slru_segment_size", + pageserver_api::key::is_slru_segment_size_key(&key), + ), + ]; + + let recognized_kind = "recognized kind"; + let metadata_key = "metadata key"; + let shard_placement = "shard placement"; + + let longest = queries + .iter() + .map(|t| t.0) + .chain([recognized_kind, metadata_key, shard_placement]) + .map(|s| s.len()) + .max() + .unwrap(); + + let colon = 1; + let padding = 1; + + for (name, is) in queries { + let width = longest - name.len() + colon + padding; + println!("{}{:width$}{}", name, ":", is); + } + + let width = longest - recognized_kind.len() + colon + padding; + println!( + "{}{:width$}{:?}", + recognized_kind, + ":", + RecognizedKeyKind::new(key), + ); + + if let Some(shard_count) = shard_count { + // seeing the sharding placement might be confusing, so leave it out unless shard + // count was given. + + let stripe_size = stripe_size.map(ShardStripeSize).unwrap_or_default(); + println!( + "# placement with shard_count: {} and stripe_size: {}:", + shard_count.0, stripe_size.0 + ); + let width = longest - shard_placement.len() + colon + padding; + println!( + "{}{:width$}{:?}", + shard_placement, + ":", + pageserver_api::shard::describe(&key, shard_count.into(), stripe_size) + ); + } + } +} + +/// Hand-wavy "inputs we accept" for a key. +#[derive(Debug)] +pub(super) enum KeyMaterial { + Hex(Key), + String(SpanAttributesFromLogs), + Split(RelTag, BlockNumber), +} + +impl KeyMaterial { + fn kind(&self) -> &'static str { + match self { + KeyMaterial::Hex(_) => "hex", + KeyMaterial::String(_) | KeyMaterial::Split(_, _) => "split", + } + } +} + +impl From for Key { + fn from(value: KeyMaterial) -> Self { + match value { + KeyMaterial::Hex(key) => key, + KeyMaterial::String(SpanAttributesFromLogs(rt, blocknum)) + | KeyMaterial::Split(rt, blocknum) => { + pageserver_api::key::rel_block_to_key(rt, blocknum) + } + } + } +} + +impl> TryFrom<&[S]> for KeyMaterial { + type Error = anyhow::Error; + + fn try_from(value: &[S]) -> Result { + match value { + [] => anyhow::bail!( + "need 1..N positional arguments describing the key, try hex or a log line" + ), + [one] => { + let one = one.as_ref(); + + let key = Key::from_hex(one).map(KeyMaterial::Hex); + + let attrs = SpanAttributesFromLogs::from_str(one).map(KeyMaterial::String); + + match (key, attrs) { + (Ok(key), _) => Ok(key), + (_, Ok(s)) => Ok(s), + (Err(e1), Err(e2)) => anyhow::bail!( + "failed to parse {one:?} as hex or span attributes:\n- {e1:#}\n- {e2:#}" + ), + } + } + more => { + // assume going left to right one of these is a reltag and then we find a blocknum + // this works, because we don't have plain numbers at least right after reltag in + // logs. for some definition of "works". 
+ + let Some((reltag_at, reltag)) = more + .iter() + .map(AsRef::as_ref) + .enumerate() + .find_map(|(i, s)| { + s.split_once("rel=") + .map(|(_garbage, actual)| actual) + .unwrap_or(s) + .parse::() + .ok() + .map(|rt| (i, rt)) + }) + else { + anyhow::bail!("found no RelTag in arguments"); + }; + + let Some(blocknum) = more + .iter() + .map(AsRef::as_ref) + .skip(reltag_at) + .find_map(|s| { + s.split_once("blkno=") + .map(|(_garbage, actual)| actual) + .unwrap_or(s) + .parse::() + .ok() + }) + else { + anyhow::bail!("found no blocknum in arguments"); + }; + + Ok(KeyMaterial::Split(reltag, blocknum)) + } + } + } +} + +#[derive(Debug)] +pub(super) struct SpanAttributesFromLogs(RelTag, BlockNumber); + +impl std::str::FromStr for SpanAttributesFromLogs { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + // accept the span separator but do not require or fail if either is missing + // "whatever{rel=1663/16389/24615 blkno=1052204 req_lsn=FFFFFFFF/FFFFFFFF}" + let (_, reltag) = s + .split_once("rel=") + .ok_or_else(|| anyhow::anyhow!("cannot find 'rel='"))?; + let reltag = reltag.split_whitespace().next().unwrap(); + + let (_, blocknum) = s + .split_once("blkno=") + .ok_or_else(|| anyhow::anyhow!("cannot find 'blkno='"))?; + let blocknum = blocknum.split_whitespace().next().unwrap(); + + let reltag = reltag + .parse() + .with_context(|| format!("parse reltag from {reltag:?}"))?; + let blocknum = blocknum + .parse() + .with_context(|| format!("parse blocknum from {blocknum:?}"))?; + + Ok(Self(reltag, blocknum)) + } +} + +#[derive(Debug)] +#[allow(dead_code)] // debug print is used +enum RecognizedKeyKind { + DbDir, + ControlFile, + Checkpoint, + AuxFilesV1, + SlruDir(Result), + RelMap(RelTagish<2>), + RelDir(RelTagish<2>), + AuxFileV2(Result>), +} + +#[derive(Debug, PartialEq)] +#[allow(unused)] +enum AuxFileV2 { + Recognized(&'static str, utils::Hex<[u8; 13]>), + OtherWithPrefix(&'static str, utils::Hex<[u8; 13]>), + Other(utils::Hex<[u8; 13]>), +} + +impl RecognizedKeyKind { + fn new(key: Key) -> Option { + use RecognizedKeyKind::{ + AuxFilesV1, Checkpoint, ControlFile, DbDir, RelDir, RelMap, SlruDir, + }; + + let slru_dir_kind = pageserver_api::key::slru_dir_kind(&key); + + Some(match key { + pageserver_api::key::DBDIR_KEY => DbDir, + pageserver_api::key::CONTROLFILE_KEY => ControlFile, + pageserver_api::key::CHECKPOINT_KEY => Checkpoint, + pageserver_api::key::AUX_FILES_KEY => AuxFilesV1, + _ if slru_dir_kind.is_some() => SlruDir(slru_dir_kind.unwrap()), + _ if key.field1 == 0 && key.field4 == 0 && key.field5 == 0 && key.field6 == 0 => { + RelMap([key.field2, key.field3].into()) + } + _ if key.field1 == 0 && key.field4 == 0 && key.field5 == 0 && key.field6 == 1 => { + RelDir([key.field2, key.field3].into()) + } + _ if key.is_metadata_key() => RecognizedKeyKind::AuxFileV2( + AuxFileV2::new(key).ok_or_else(|| utils::Hex(key.to_i128().to_be_bytes())), + ), + _ => return None, + }) + } +} + +impl AuxFileV2 { + fn new(key: Key) -> Option { + const EMPTY_HASH: [u8; 13] = { + let mut out = [0u8; 13]; + let hash = pageserver::aux_file::fnv_hash(b"").to_be_bytes(); + let mut i = 3; + while i < 16 { + out[i - 3] = hash[i]; + i += 1; + } + out + }; + + let bytes = key.to_i128().to_be_bytes(); + let hash = utils::Hex(<[u8; 13]>::try_from(&bytes[3..]).unwrap()); + + assert_eq!(EMPTY_HASH.len(), hash.0.len()); + + // TODO: we could probably find the preimages for the hashes + + Some(match (bytes[1], bytes[2]) { + (1, 1) => AuxFileV2::Recognized("pg_logical/mappings/", hash), + (1, 2) => 
AuxFileV2::Recognized("pg_logical/snapshots/", hash), + (1, 3) if hash.0 == EMPTY_HASH => { + AuxFileV2::Recognized("pg_logical/replorigin_checkpoint", hash) + } + (2, 1) => AuxFileV2::Recognized("pg_replslot/", hash), + (1, 0xff) => AuxFileV2::OtherWithPrefix("pg_logical/", hash), + (0xff, 0xff) => AuxFileV2::Other(hash), + _ => return None, + }) + } +} + +/// Prefix of RelTag, currently only known use cases are the two item versions. +/// +/// Renders like a reltag with `/`, nothing else. +struct RelTagish([u32; N]); + +impl From<[u32; N]> for RelTagish { + fn from(val: [u32; N]) -> Self { + RelTagish(val) + } +} + +impl std::fmt::Debug for RelTagish { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + use std::fmt::Write as _; + let mut first = true; + self.0.iter().try_for_each(|x| { + if !first { + f.write_char('/')?; + } + first = false; + write!(f, "{}", x) + }) + } +} + +#[cfg(test)] +mod tests { + use pageserver::aux_file::encode_aux_file_key; + + use super::*; + + #[test] + fn hex_is_key_material() { + let m = KeyMaterial::try_from(&["000000067F0000400200DF927900FFFFFFFF"][..]).unwrap(); + assert!(matches!(m, KeyMaterial::Hex(_)), "{m:?}"); + } + + #[test] + fn single_positional_spanalike_is_key_material() { + // why is this needed? if you are checking many, then copypaste starts to appeal + let strings = [ + (line!(), "2024-05-15T15:33:49.873906Z ERROR page_service_conn_main{peer_addr=A:B}:process_query{tenant_id=C timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm blkno=2 req_lsn=0/238D98C8}: error reading relation or page version: Read error: could not find data for key 000000067F00032CE5000000000000000001 (shard ShardNumber(0)) at LSN 0/1D0A16C1, request LSN 0/238D98C8, ancestor 0/0"), + (line!(), "rel=1663/208101/2620_fsm blkno=2"), + (line!(), "rel=1663/208101/2620.1 blkno=2"), + ]; + + let mut first: Option = None; + + for (line, example) in strings { + let m = KeyMaterial::try_from(&[example][..]) + .unwrap_or_else(|e| panic!("failed to parse example from line {line}: {e:?}")); + let key = Key::from(m); + if let Some(first) = first { + assert_eq!(first, key); + } else { + first = Some(key); + } + } + + // not supporting this is rather accidential, but I think the input parsing is lenient + // enough already + KeyMaterial::try_from(&["1663/208101/2620_fsm 2"][..]).unwrap_err(); + } + + #[test] + fn multiple_spanlike_args() { + let strings = [ + (line!(), &["process_query{tenant_id=C", "timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm", "blkno=2", "req_lsn=0/238D98C8}"][..]), + (line!(), &["rel=1663/208101/2620_fsm", "blkno=2"][..]), + (line!(), &["1663/208101/2620_fsm", "2"][..]), + ]; + + let mut first: Option = None; + + for (line, example) in strings { + let m = KeyMaterial::try_from(example) + .unwrap_or_else(|e| panic!("failed to parse example from line {line}: {e:?}")); + let key = Key::from(m); + if let Some(first) = first { + assert_eq!(first, key); + } else { + first = Some(key); + } + } + } + #[test] + fn recognized_auxfiles() { + use AuxFileV2::*; + + let empty = [ + 0x2e, 0x07, 0xbb, 0x01, 0x42, 0x62, 0xb8, 0x21, 0x75, 0x62, 0x95, 0xc5, 0x8d, + ]; + let foobar = [ + 0x62, 0x79, 0x3c, 0x64, 0xbf, 0x6f, 0x0d, 0x35, 0x97, 0xba, 0x44, 0x6f, 0x18, + ]; + + #[rustfmt::skip] + let examples = [ + (line!(), "pg_logical/mappings/foobar", Recognized("pg_logical/mappings/", utils::Hex(foobar))), + (line!(), "pg_logical/snapshots/foobar", 
Recognized("pg_logical/snapshots/", utils::Hex(foobar))), + (line!(), "pg_logical/replorigin_checkpoint", Recognized("pg_logical/replorigin_checkpoint", utils::Hex(empty))), + (line!(), "pg_logical/foobar", OtherWithPrefix("pg_logical/", utils::Hex(foobar))), + (line!(), "pg_replslot/foobar", Recognized("pg_replslot/", utils::Hex(foobar))), + (line!(), "foobar", Other(utils::Hex(foobar))), + ]; + + for (line, path, expected) in examples { + let key = encode_aux_file_key(path); + let recognized = + AuxFileV2::new(key).unwrap_or_else(|| panic!("line {line} example failed")); + + assert_eq!(recognized, expected); + } + + assert_eq!( + AuxFileV2::new(Key::from_hex("600000102000000000000000000000000000").unwrap()), + None, + "example key has one too few 0 after 6 before 1" + ); + } +} diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index e92c352dab..50c3ac4c61 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -6,6 +6,7 @@ mod draw_timeline_dir; mod index_part; +mod key; mod layer_map_analyzer; mod layers; @@ -61,6 +62,8 @@ enum Commands { AnalyzeLayerMap(AnalyzeLayerMapCmd), #[command(subcommand)] Layer(LayerCmd), + /// Debug print a hex key found from logs + Key(key::DescribeKeyCommand), } /// Read and update pageserver metadata file @@ -183,6 +186,7 @@ async fn main() -> anyhow::Result<()> { .time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel) .await?; } + Commands::Key(dkc) => dkc.execute(), }; Ok(()) } From 7e60563910936cf6643edb686a8163b0b03c7108 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 31 May 2024 22:20:06 +0100 Subject: [PATCH 0906/1571] pageserver: add GcError type (#7917) ## Problem - Because GC exposes all errors as an anyhow::Error, we have intermittent issues with spurious log errors during shutdown, e.g. in this failure of a performance test https://neon-github-public-dev.s3.amazonaws.com/reports/main/9300804302/index.html#suites/07874de07c4a1c9effe0d92da7755ebf/214a2154f6f0217a/ ``` Gc failed 1 times, retrying in 2s: shutting down ``` GC really doesn't do a lot of complicated IO: it doesn't benefit from the backtrace capabilities of anyhow::Error, and can be expressed more robustly as an enum. ## Summary of changes - Add GcError type and use it instead of anyhow::Error in GC functions - In `gc_iteration_internal`, return GcError::Cancelled on shutdown rather than Ok(()) (we only used Ok before because we didn't have a clear cancellation error variant to use). - In `gc_iteration_internal`, skip past timelines that are shutting down, to avoid having to go through another GC iteration if we happen to see a deleting timeline during a GC run. - In `refresh_gc_info_internal`, avoid an error case where a timeline might not be found after being looked up, by carrying an Arc instead of a TimelineId between the first loop and second loop in the function. - In HTTP request handler, handle Cancelled variants as 503 instead of turning all GC errors into 500s. 
--- pageserver/src/tenant.rs | 112 +++++++++++++--------- pageserver/src/tenant/mgr.rs | 10 +- pageserver/src/tenant/tasks.rs | 33 ++++--- pageserver/src/tenant/timeline.rs | 47 ++++++--- test_runner/regress/test_tenant_detach.py | 4 +- 5 files changed, 129 insertions(+), 77 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index cfa683beb8..eff9c742c1 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -487,6 +487,33 @@ enum CreateTimelineCause { Delete, } +#[derive(thiserror::Error, Debug)] +pub(crate) enum GcError { + // The tenant is shutting down + #[error("tenant shutting down")] + TenantCancelled, + + // The tenant is shutting down + #[error("timeline shutting down")] + TimelineCancelled, + + // The tenant is in a state inelegible to run GC + #[error("not active")] + NotActive, + + // A requested GC cutoff LSN was invalid, for example it tried to move backwards + #[error("not active")] + BadLsn { why: String }, + + // A remote storage error while scheduling updates after compaction + #[error(transparent)] + Remote(anyhow::Error), + + // If GC was invoked for a particular timeline, this error means it didn't exist + #[error("timeline not found")] + TimelineNotFound, +} + impl Tenant { /// Yet another helper for timeline initialization. /// @@ -1605,24 +1632,23 @@ impl Tenant { /// GC cutoff point is determined conservatively by either `horizon` and `pitr`, whichever /// requires more history to be retained. // - pub async fn gc_iteration( + pub(crate) async fn gc_iteration( &self, target_timeline_id: Option, horizon: u64, pitr: Duration, cancel: &CancellationToken, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { // Don't start doing work during shutdown if let TenantState::Stopping { .. } = self.current_state() { return Ok(GcResult::default()); } // there is a global allowed_error for this - anyhow::ensure!( - self.is_active(), - "Cannot run GC iteration on inactive tenant" - ); + if !self.is_active() { + return Err(GcError::NotActive); + } { let conf = self.tenant_conf.load(); @@ -2790,28 +2816,13 @@ impl Tenant { pitr: Duration, cancel: &CancellationToken, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { let mut totals: GcResult = Default::default(); let now = Instant::now(); - let gc_timelines = match self + let gc_timelines = self .refresh_gc_info_internal(target_timeline_id, horizon, pitr, cancel, ctx) - .await - { - Ok(result) => result, - Err(e) => { - if let Some(PageReconstructError::Cancelled) = - e.downcast_ref::() - { - // Handle cancellation - totals.elapsed = now.elapsed(); - return Ok(totals); - } else { - // Propagate other errors - return Err(e); - } - } - }; + .await?; failpoint_support::sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines"); @@ -2836,7 +2847,19 @@ impl Tenant { // made. break; } - let result = timeline.gc().await?; + let result = match timeline.gc().await { + Err(GcError::TimelineCancelled) => { + if target_timeline_id.is_some() { + // If we were targetting this specific timeline, surface cancellation to caller + return Err(GcError::TimelineCancelled); + } else { + // A timeline may be shutting down independently of the tenant's lifecycle: we should + // skip past this and proceed to try GC on other timelines. + continue; + } + } + r => r?, + }; totals += result; } @@ -2849,11 +2872,11 @@ impl Tenant { /// [`Tenant::get_gc_horizon`]. /// /// This is usually executed as part of periodic gc, but can now be triggered more often. 
- pub async fn refresh_gc_info( + pub(crate) async fn refresh_gc_info( &self, cancel: &CancellationToken, ctx: &RequestContext, - ) -> anyhow::Result>> { + ) -> Result>, GcError> { // since this method can now be called at different rates than the configured gc loop, it // might be that these configuration values get applied faster than what it was previously, // since these were only read from the gc task. @@ -2874,7 +2897,7 @@ impl Tenant { pitr: Duration, cancel: &CancellationToken, ctx: &RequestContext, - ) -> anyhow::Result>> { + ) -> Result>, GcError> { // before taking the gc_cs lock, do the heavier weight finding of gc_cutoff points for // currently visible timelines. let timelines = self @@ -2911,8 +2934,8 @@ impl Tenant { } } - if !self.is_active() { - anyhow::bail!("shutting down"); + if !self.is_active() || self.cancel.is_cancelled() { + return Err(GcError::TenantCancelled); } // grab mutex to prevent new timelines from being created here; avoid doing long operations @@ -2921,19 +2944,19 @@ impl Tenant { // Scan all timelines. For each timeline, remember the timeline ID and // the branch point where it was created. - let (all_branchpoints, timeline_ids): (BTreeSet<(TimelineId, Lsn)>, _) = { + let (all_branchpoints, timelines): (BTreeSet<(TimelineId, Lsn)>, _) = { let timelines = self.timelines.lock().unwrap(); let mut all_branchpoints = BTreeSet::new(); - let timeline_ids = { + let timelines = { if let Some(target_timeline_id) = target_timeline_id.as_ref() { if timelines.get(target_timeline_id).is_none() { - bail!("gc target timeline does not exist") + return Err(GcError::TimelineNotFound); } }; timelines .iter() - .map(|(timeline_id, timeline_entry)| { + .map(|(_timeline_id, timeline_entry)| { if let Some(ancestor_timeline_id) = &timeline_entry.get_ancestor_timeline_id() { @@ -2955,33 +2978,28 @@ impl Tenant { } } - *timeline_id + timeline_entry.clone() }) .collect::>() }; - (all_branchpoints, timeline_ids) + (all_branchpoints, timelines) }; // Ok, we now know all the branch points. // Update the GC information for each timeline. - let mut gc_timelines = Vec::with_capacity(timeline_ids.len()); - for timeline_id in timeline_ids { - // Timeline is known to be local and loaded. 
- let timeline = self - .get_timeline(timeline_id, false) - .with_context(|| format!("Timeline {timeline_id} was not found"))?; - + let mut gc_timelines = Vec::with_capacity(timelines.len()); + for timeline in timelines { // If target_timeline is specified, ignore all other timelines if let Some(target_timeline_id) = target_timeline_id { - if timeline_id != target_timeline_id { + if timeline.timeline_id != target_timeline_id { continue; } } let branchpoints: Vec = all_branchpoints .range(( - Included((timeline_id, Lsn(0))), - Included((timeline_id, Lsn(u64::MAX))), + Included((timeline.timeline_id, Lsn(0))), + Included((timeline.timeline_id, Lsn(u64::MAX))), )) .map(|&x| x.1) .collect(); @@ -2989,7 +3007,7 @@ impl Tenant { { let mut target = timeline.gc_info.write().unwrap(); - match gc_cutoffs.remove(&timeline_id) { + match gc_cutoffs.remove(&timeline.timeline_id) { Some(cutoffs) => { *target = GcInfo { retain_lsns: branchpoints, diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 89fdf31849..0bb1d750aa 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -45,7 +45,7 @@ use crate::tenant::delete::DeleteTenantFlow; use crate::tenant::span::debug_assert_current_span_has_tenant_id; use crate::tenant::storage_layer::inmemory_layer; use crate::tenant::timeline::ShutdownMode; -use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState}; +use crate::tenant::{AttachedTenantConf, GcError, SpawnMode, Tenant, TenantState}; use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX}; use utils::crashsafe::path_with_suffix_extension; @@ -2833,7 +2833,13 @@ pub(crate) async fn immediate_gc( } } - result.map_err(ApiError::InternalServerError) + result.map_err(|e| match e { + GcError::TenantCancelled | GcError::TimelineCancelled => ApiError::ShuttingDown, + GcError::TimelineNotFound => { + ApiError::NotFound(anyhow::anyhow!("Timeline not found").into()) + } + other => ApiError::InternalServerError(anyhow::anyhow!(other)), + }) } #[cfg(test)] diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index bf2d8a47b4..a6dfa84f35 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -380,21 +380,28 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { let res = tenant .gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &cancel, &ctx) .await; - if let Err(e) = res { - let wait_duration = backoff::exponential_backoff_duration_seconds( - error_run_count + 1, - 1.0, - MAX_BACKOFF_SECS, - ); - error_run_count += 1; - let wait_duration = Duration::from_secs_f64(wait_duration); - error!( + match res { + Ok(_) => { + error_run_count = 0; + period + } + Err(crate::tenant::GcError::TenantCancelled) => { + return; + } + Err(e) => { + let wait_duration = backoff::exponential_backoff_duration_seconds( + error_run_count + 1, + 1.0, + MAX_BACKOFF_SECS, + ); + error_run_count += 1; + let wait_duration = Duration::from_secs_f64(wait_duration); + + error!( "Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}", ); - wait_duration - } else { - error_run_count = 0; - period + wait_duration + } } }; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 4a9d981ad8..9bf429972d 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -131,11 +131,14 @@ use self::layer_manager::LayerManager; use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; -use 
super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline}; use super::{config::TenantConf, storage_layer::VectoredValueReconstructState}; use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer}; +use super::{ + secondary::heatmap::{HeatMapLayer, HeatMapTimeline}, + GcError, +}; #[derive(Debug, PartialEq, Eq, Clone, Copy)] pub(crate) enum FlushLoopState { @@ -4837,7 +4840,7 @@ impl Timeline { /// Currently, we don't make any attempt at removing unneeded page versions /// within a layer file. We can only remove the whole file if it's fully /// obsolete. - pub(super) async fn gc(&self) -> anyhow::Result { + pub(super) async fn gc(&self) -> Result { // this is most likely the background tasks, but it might be the spawned task from // immediate_gc let _g = tokio::select! { @@ -4850,7 +4853,7 @@ impl Timeline { // Is the timeline being deleted? if self.is_stopping() { - anyhow::bail!("timeline is Stopping"); + return Err(GcError::TimelineCancelled); } let (horizon_cutoff, pitr_cutoff, retain_lsns) = { @@ -4908,7 +4911,7 @@ impl Timeline { pitr_cutoff: Lsn, retain_lsns: Vec, new_gc_cutoff: Lsn, - ) -> anyhow::Result { + ) -> Result { // FIXME: if there is an ongoing detach_from_ancestor, we should just skip gc let now = SystemTime::now(); @@ -4930,12 +4933,15 @@ impl Timeline { // The GC cutoff should only ever move forwards. let waitlist = { let write_guard = self.latest_gc_cutoff_lsn.lock_for_write(); - ensure!( - *write_guard <= new_gc_cutoff, - "Cannot move GC cutoff LSN backwards (was {}, new {})", - *write_guard, - new_gc_cutoff - ); + if *write_guard > new_gc_cutoff { + return Err(GcError::BadLsn { + why: format!( + "Cannot move GC cutoff LSN backwards (was {}, new {})", + *write_guard, new_gc_cutoff + ), + }); + } + write_guard.store_and_unlock(new_gc_cutoff) }; waitlist.wait().await; @@ -5044,7 +5050,14 @@ impl Timeline { // This unconditionally schedules also an index_part.json update, even though, we will // be doing one a bit later with the unlinked gc'd layers. 
let disk_consistent_lsn = self.disk_consistent_lsn.load(); - self.schedule_uploads(disk_consistent_lsn, None)?; + self.schedule_uploads(disk_consistent_lsn, None) + .map_err(|e| { + if self.cancel.is_cancelled() { + GcError::TimelineCancelled + } else { + GcError::Remote(e) + } + })?; let gc_layers = layers_to_remove .iter() @@ -5053,7 +5066,15 @@ impl Timeline { result.layers_removed = gc_layers.len() as u64; - self.remote_client.schedule_gc_update(&gc_layers)?; + self.remote_client + .schedule_gc_update(&gc_layers) + .map_err(|e| { + if self.cancel.is_cancelled() { + GcError::TimelineCancelled + } else { + GcError::Remote(e) + } + })?; guard.finish_gc_timeline(&gc_layers); @@ -5068,7 +5089,7 @@ impl Timeline { result.layers_removed, new_gc_cutoff ); - result.elapsed = now.elapsed()?; + result.elapsed = now.elapsed().unwrap_or(Duration::ZERO); Ok(result) } diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 12a4730e69..871351b2d5 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -302,7 +302,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): # gc should not try to even start on a timeline that doesn't exist with pytest.raises( - expected_exception=PageserverApiException, match="gc target timeline does not exist" + expected_exception=PageserverApiException, match="NotFound: Timeline not found" ): bogus_timeline_id = TimelineId.generate() pageserver_http.timeline_gc(tenant_id, bogus_timeline_id, 0) @@ -310,7 +310,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): env.pageserver.allowed_errors.extend( [ # the error will be printed to the log too - ".*gc target timeline does not exist.*", + ".*NotFound: Timeline not found.*", # Timelines get stopped during detach, ignore the gc calls that error, witnessing that ".*InternalServerError\\(timeline is Stopping.*", ] From e98bc4fd2ba3cb4a3fa6d00f98406fb0fdb916a8 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Sat, 1 Jun 2024 00:18:56 +0100 Subject: [PATCH 0907/1571] Run gc on too many partial backup segments (#7700) The general partial backup idea is that each safekeeper keeps only one partial segment in remote storage at a time. Sometimes this is not true, for example if we uploaded object to S3 but got an error when tried to remove the previous upload. In this case we still keep a list of all potentially uploaded objects in safekeeper state. This commit prints a warning to logs if there is too many objects in safekeeper state. This is not expected and we should try to fix this state, we can do this by running gc. I haven't seen this being an issue anywhere, but printing a warning is something that I wanted to do and forgot in initial PR. 
--- safekeeper/src/wal_backup_partial.rs | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index a320be3bad..6c0f35095b 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -24,7 +24,7 @@ use rand::Rng; use remote_storage::RemotePath; use serde::{Deserialize, Serialize}; -use tracing::{debug, error, info, instrument}; +use tracing::{debug, error, info, instrument, warn}; use utils::lsn::Lsn; use crate::{ @@ -308,7 +308,23 @@ pub async fn main_task(tli: FullAccessTimeline, conf: SafeKeeperConf) { debug!("state: {:?}", backup.state); + // The general idea is that each safekeeper keeps only one partial segment + // both in remote storage and in local state. If this is not true, something + // went wrong. + const MAX_SIMULTANEOUS_SEGMENTS: usize = 10; + 'outer: loop { + if backup.state.segments.len() > MAX_SIMULTANEOUS_SEGMENTS { + warn!( + "too many segments in control_file state, running gc: {}", + backup.state.segments.len() + ); + + backup.gc().await.unwrap_or_else(|e| { + error!("failed to run gc: {:#}", e); + }); + } + // wait until we have something to upload let uploaded_segment = backup.state.uploaded_segment(); if let Some(seg) = &uploaded_segment { From a345cf3fc695282823a7cc2a8711213adb64ec3c Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Sat, 1 Jun 2024 12:23:59 +0100 Subject: [PATCH 0908/1571] Fix span for WAL removal task (#7930) During refactoring in https://github.com/neondatabase/neon/pull/7887 I forgot to add "WAL removal" span with ttid. This commit fixes it. --- safekeeper/src/timeline_manager.rs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index 84862207d5..7174d843fc 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -10,7 +10,7 @@ use std::{ use postgres_ffi::XLogSegNo; use tokio::task::{JoinError, JoinHandle}; -use tracing::{info, instrument, warn}; +use tracing::{info, info_span, instrument, warn, Instrument}; use utils::lsn::Lsn; use crate::{ @@ -346,10 +346,13 @@ async fn update_wal_removal( &tli.read_shared_state().await.sk.wal_store, removal_horizon_segno, ); - *wal_removal_task = Some(tokio::spawn(async move { - remover.await?; - Ok(removal_horizon_segno) - })); + *wal_removal_task = Some(tokio::spawn( + async move { + remover.await?; + Ok(removal_horizon_segno) + } + .instrument(info_span!("WAL removal", ttid=%tli.ttid)), + )); } } From db477c0b8c59081207c1ba7fc6e599b741a0b717 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Sun, 2 Jun 2024 16:10:56 +0200 Subject: [PATCH 0909/1571] Add metrics for Azure blob storage (#7933) In issue #5590 it was proposed to implement metrics for Azure blob storage. This PR implements them except for the part that performs the rename, which is left for a followup. 
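
The timing pattern used throughout the Azure client (mirroring the existing S3 code that the shared `metrics.rs` is lifted from) is sketched below. The names here (`Outcome`, `record`, `start_measuring`) are simplified placeholders for the real metrics types; the point is the cancellation safety: the `scopeguard` closure records a cancelled attempt if the request future is dropped, and is defused with `ScopeGuard::into_inner` once the request completes so the real outcome and elapsed time are recorded instead.

```rust
use scopeguard::ScopeGuard;
use std::time::Instant;

#[derive(Debug, Clone, Copy)]
enum Outcome {
    Ok,
    Err,
    Cancelled,
}

// Placeholder for the real histogram observation in BUCKET_METRICS.
fn record(outcome: Outcome, started_at: Instant) {
    println!("{outcome:?} after {:?}", started_at.elapsed());
}

fn start_measuring() -> ScopeGuard<Instant, impl FnOnce(Instant), scopeguard::OnSuccess> {
    // Runs on drop, i.e. when the request future is cancelled, unless defused below.
    scopeguard::guard_on_success(Instant::now(), |started_at| {
        record(Outcome::Cancelled, started_at)
    })
}

async fn timed_request() -> anyhow::Result<()> {
    let started_at = start_measuring();

    let res = do_request().await;

    // Completed (successfully or not): defuse the guard and record the real outcome.
    let started_at = ScopeGuard::into_inner(started_at);
    let outcome = if res.is_ok() { Outcome::Ok } else { Outcome::Err };
    record(outcome, started_at);
    res
}

async fn do_request() -> anyhow::Result<()> {
    // Stand-in for the actual Azure SDK call.
    Ok(())
}
```

Note that the early `return`s inside the `tokio::select!` arms (timeout or explicit cancellation) leave the guard armed, so those attempts land in the cancelled bucket of the metric rather than being counted as successes or failures.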
Closes #5590 --- libs/remote_storage/src/azure_blob.rs | 85 ++++++++++++++----- libs/remote_storage/src/lib.rs | 1 + .../src/{s3_bucket => }/metrics.rs | 52 +++++++++--- libs/remote_storage/src/s3_bucket.rs | 65 +++++--------- 4 files changed, 125 insertions(+), 78 deletions(-) rename libs/remote_storage/src/{s3_bucket => }/metrics.rs (76%) diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 24c1248304..aca22c6b3e 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -26,13 +26,14 @@ use futures::stream::Stream; use futures_util::StreamExt; use futures_util::TryStreamExt; use http_types::{StatusCode, Url}; +use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; use tracing::debug; +use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind}; use crate::{ - error::Cancelled, s3_bucket::RequestKind, AzureConfig, ConcurrencyLimiter, Download, - DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata, - TimeTravelError, TimeoutOrCancel, + error::Cancelled, AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing, + ListingMode, RemotePath, RemoteStorage, StorageMetadata, TimeTravelError, TimeoutOrCancel, }; pub struct AzureBlobStorage { @@ -137,6 +138,8 @@ impl AzureBlobStorage { let mut last_modified = None; let mut metadata = HashMap::new(); + let started_at = start_measuring_requests(kind); + let download = async { let response = builder // convert to concrete Pageable @@ -200,13 +203,22 @@ impl AzureBlobStorage { }) }; - tokio::select! { + let download = tokio::select! { bufs = download => bufs, cancel_or_timeout = cancel_or_timeout => match cancel_or_timeout { - TimeoutOrCancel::Timeout => Err(DownloadError::Timeout), - TimeoutOrCancel::Cancel => Err(DownloadError::Cancelled), + TimeoutOrCancel::Timeout => return Err(DownloadError::Timeout), + TimeoutOrCancel::Cancel => return Err(DownloadError::Cancelled), }, - } + }; + let started_at = ScopeGuard::into_inner(started_at); + let outcome = match &download { + Ok(_) => AttemptOutcome::Ok, + Err(_) => AttemptOutcome::Err, + }; + crate::metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, outcome, started_at); + download } async fn permit( @@ -340,7 +352,10 @@ impl RemoteStorage for AzureBlobStorage { metadata: Option, cancel: &CancellationToken, ) -> anyhow::Result<()> { - let _permit = self.permit(RequestKind::Put, cancel).await?; + let kind = RequestKind::Put; + let _permit = self.permit(kind, cancel).await?; + + let started_at = start_measuring_requests(kind); let op = async { let blob_client = self.client.blob_client(self.relative_path_to_name(to)); @@ -364,14 +379,25 @@ impl RemoteStorage for AzureBlobStorage { match fut.await { Ok(Ok(_response)) => Ok(()), Ok(Err(azure)) => Err(azure.into()), - Err(_timeout) => Err(TimeoutOrCancel::Cancel.into()), + Err(_timeout) => Err(TimeoutOrCancel::Timeout.into()), } }; - tokio::select! { + let res = tokio::select! 
{ res = op => res, - _ = cancel.cancelled() => Err(TimeoutOrCancel::Cancel.into()), - } + _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()), + }; + + let outcome = match res { + Ok(_) => AttemptOutcome::Ok, + Err(_) => AttemptOutcome::Err, + }; + let started_at = ScopeGuard::into_inner(started_at); + crate::metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, outcome, started_at); + + res } async fn download( @@ -417,12 +443,13 @@ impl RemoteStorage for AzureBlobStorage { paths: &'a [RemotePath], cancel: &CancellationToken, ) -> anyhow::Result<()> { - let _permit = self.permit(RequestKind::Delete, cancel).await?; + let kind = RequestKind::Delete; + let _permit = self.permit(kind, cancel).await?; + let started_at = start_measuring_requests(kind); let op = async { - // TODO batch requests are also not supported by the SDK + // TODO batch requests are not supported by the SDK // https://github.com/Azure/azure-sdk-for-rust/issues/1068 - // https://github.com/Azure/azure-sdk-for-rust/issues/1249 for path in paths { let blob_client = self.client.blob_client(self.relative_path_to_name(path)); @@ -447,10 +474,16 @@ impl RemoteStorage for AzureBlobStorage { Ok(()) }; - tokio::select! { + let res = tokio::select! { res = op => res, - _ = cancel.cancelled() => Err(TimeoutOrCancel::Cancel.into()), - } + _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()), + }; + + let started_at = ScopeGuard::into_inner(started_at); + crate::metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, &res, started_at); + res } async fn copy( @@ -459,7 +492,9 @@ impl RemoteStorage for AzureBlobStorage { to: &RemotePath, cancel: &CancellationToken, ) -> anyhow::Result<()> { - let _permit = self.permit(RequestKind::Copy, cancel).await?; + let kind = RequestKind::Copy; + let _permit = self.permit(kind, cancel).await?; + let started_at = start_measuring_requests(kind); let timeout = tokio::time::sleep(self.timeout); @@ -503,15 +538,21 @@ impl RemoteStorage for AzureBlobStorage { } }; - tokio::select! { + let res = tokio::select! 
{ res = op => res, - _ = cancel.cancelled() => Err(anyhow::Error::new(TimeoutOrCancel::Cancel)), + _ = cancel.cancelled() => return Err(anyhow::Error::new(TimeoutOrCancel::Cancel)), _ = timeout => { let e = anyhow::Error::new(TimeoutOrCancel::Timeout); let e = e.context(format!("Timeout, last status: {copy_status:?}")); Err(e) }, - } + }; + + let started_at = ScopeGuard::into_inner(started_at); + crate::metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, &res, started_at); + res } async fn time_travel_recover( diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index cb3df0985d..8c984abed2 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -12,6 +12,7 @@ mod azure_blob; mod error; mod local_fs; +mod metrics; mod s3_bucket; mod simulate_failures; mod support; diff --git a/libs/remote_storage/src/s3_bucket/metrics.rs b/libs/remote_storage/src/metrics.rs similarity index 76% rename from libs/remote_storage/src/s3_bucket/metrics.rs rename to libs/remote_storage/src/metrics.rs index beca755920..bbb51590f3 100644 --- a/libs/remote_storage/src/s3_bucket/metrics.rs +++ b/libs/remote_storage/src/metrics.rs @@ -15,6 +15,7 @@ pub(crate) enum RequestKind { TimeTravel = 5, } +use scopeguard::ScopeGuard; use RequestKind::*; impl RequestKind { @@ -33,10 +34,10 @@ impl RequestKind { } } -pub(super) struct RequestTyped([C; 6]); +pub(crate) struct RequestTyped([C; 6]); impl RequestTyped { - pub(super) fn get(&self, kind: RequestKind) -> &C { + pub(crate) fn get(&self, kind: RequestKind) -> &C { &self.0[kind.as_index()] } @@ -58,19 +59,19 @@ impl RequestTyped { } impl RequestTyped { - pub(super) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) { + pub(crate) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) { self.get(kind).observe(started_at.elapsed().as_secs_f64()) } } -pub(super) struct PassFailCancelledRequestTyped { +pub(crate) struct PassFailCancelledRequestTyped { success: RequestTyped, fail: RequestTyped, cancelled: RequestTyped, } #[derive(Debug, Clone, Copy)] -pub(super) enum AttemptOutcome { +pub(crate) enum AttemptOutcome { Ok, Err, Cancelled, @@ -86,7 +87,7 @@ impl From<&Result> for AttemptOutcome { } impl AttemptOutcome { - pub(super) fn as_str(&self) -> &'static str { + pub(crate) fn as_str(&self) -> &'static str { match self { AttemptOutcome::Ok => "ok", AttemptOutcome::Err => "err", @@ -96,7 +97,7 @@ impl AttemptOutcome { } impl PassFailCancelledRequestTyped { - pub(super) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C { + pub(crate) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C { let target = match outcome { AttemptOutcome::Ok => &self.success, AttemptOutcome::Err => &self.fail, @@ -119,7 +120,7 @@ impl PassFailCancelledRequestTyped { } impl PassFailCancelledRequestTyped { - pub(super) fn observe_elapsed( + pub(crate) fn observe_elapsed( &self, kind: RequestKind, outcome: impl Into, @@ -130,19 +131,44 @@ impl PassFailCancelledRequestTyped { } } -pub(super) struct BucketMetrics { +/// On drop (cancellation) count towards [`BucketMetrics::cancelled_waits`]. +pub(crate) fn start_counting_cancelled_wait( + kind: RequestKind, +) -> ScopeGuard { + scopeguard::guard_on_success(std::time::Instant::now(), move |_| { + crate::metrics::BUCKET_METRICS + .cancelled_waits + .get(kind) + .inc() + }) +} + +/// On drop (cancellation) add time to [`BucketMetrics::req_seconds`]. 
+pub(crate) fn start_measuring_requests( + kind: RequestKind, +) -> ScopeGuard { + scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| { + crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed( + kind, + AttemptOutcome::Cancelled, + started_at, + ) + }) +} + +pub(crate) struct BucketMetrics { /// Full request duration until successful completion, error or cancellation. - pub(super) req_seconds: PassFailCancelledRequestTyped, + pub(crate) req_seconds: PassFailCancelledRequestTyped, /// Total amount of seconds waited on queue. - pub(super) wait_seconds: RequestTyped, + pub(crate) wait_seconds: RequestTyped, /// Track how many semaphore awaits were cancelled per request type. /// /// This is in case cancellations are happening more than expected. - pub(super) cancelled_waits: RequestTyped, + pub(crate) cancelled_waits: RequestTyped, /// Total amount of deleted objects in batches or single requests. - pub(super) deleted_objects_total: IntCounter, + pub(crate) deleted_objects_total: IntCounter, } impl Default for BucketMetrics { diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index c3d6c75e20..76cf3eac80 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -46,15 +46,16 @@ use utils::backoff; use super::StorageMetadata; use crate::{ - error::Cancelled, support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError, - Listing, ListingMode, RemotePath, RemoteStorage, S3Config, TimeTravelError, TimeoutOrCancel, - MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR, + error::Cancelled, + metrics::{start_counting_cancelled_wait, start_measuring_requests}, + support::PermitCarrying, + ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, + S3Config, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE, + REMOTE_STORAGE_PREFIX_SEPARATOR, }; -pub(super) mod metrics; - -use self::metrics::AttemptOutcome; -pub(super) use self::metrics::RequestKind; +use crate::metrics::AttemptOutcome; +pub(super) use crate::metrics::RequestKind; /// AWS S3 storage. pub struct S3Bucket { @@ -227,7 +228,7 @@ impl S3Bucket { }; let started_at = ScopeGuard::into_inner(started_at); - metrics::BUCKET_METRICS + crate::metrics::BUCKET_METRICS .wait_seconds .observe_elapsed(kind, started_at); @@ -248,7 +249,7 @@ impl S3Bucket { }; let started_at = ScopeGuard::into_inner(started_at); - metrics::BUCKET_METRICS + crate::metrics::BUCKET_METRICS .wait_seconds .observe_elapsed(kind, started_at); Ok(permit) @@ -287,7 +288,7 @@ impl S3Bucket { // Count this in the AttemptOutcome::Ok bucket, because 404 is not // an error: we expect to sometimes fetch an object and find it missing, // e.g. when probing for timeline indices. 
- metrics::BUCKET_METRICS.req_seconds.observe_elapsed( + crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed( kind, AttemptOutcome::Ok, started_at, @@ -295,7 +296,7 @@ impl S3Bucket { return Err(DownloadError::NotFound); } Err(e) => { - metrics::BUCKET_METRICS.req_seconds.observe_elapsed( + crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed( kind, AttemptOutcome::Err, started_at, @@ -371,12 +372,12 @@ impl S3Bucket { }; let started_at = ScopeGuard::into_inner(started_at); - metrics::BUCKET_METRICS + crate::metrics::BUCKET_METRICS .req_seconds .observe_elapsed(kind, &resp, started_at); let resp = resp.context("request deletion")?; - metrics::BUCKET_METRICS + crate::metrics::BUCKET_METRICS .deleted_objects_total .inc_by(chunk.len() as u64); @@ -435,14 +436,14 @@ pin_project_lite::pin_project! { /// Times and tracks the outcome of the request. struct TimedDownload { started_at: std::time::Instant, - outcome: metrics::AttemptOutcome, + outcome: AttemptOutcome, #[pin] inner: S } impl PinnedDrop for TimedDownload { fn drop(mut this: Pin<&mut Self>) { - metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at); + crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at); } } } @@ -451,7 +452,7 @@ impl TimedDownload { fn new(started_at: std::time::Instant, inner: S) -> Self { TimedDownload { started_at, - outcome: metrics::AttemptOutcome::Cancelled, + outcome: AttemptOutcome::Cancelled, inner, } } @@ -468,8 +469,8 @@ impl>> Stream for TimedDownload { let res = ready!(this.inner.poll_next(cx)); match &res { Some(Ok(_)) => {} - Some(Err(_)) => *this.outcome = metrics::AttemptOutcome::Err, - None => *this.outcome = metrics::AttemptOutcome::Ok, + Some(Err(_)) => *this.outcome = AttemptOutcome::Err, + None => *this.outcome = AttemptOutcome::Ok, } Poll::Ready(res) @@ -543,7 +544,7 @@ impl RemoteStorage for S3Bucket { let started_at = ScopeGuard::into_inner(started_at); - metrics::BUCKET_METRICS + crate::metrics::BUCKET_METRICS .req_seconds .observe_elapsed(kind, &response, started_at); @@ -625,7 +626,7 @@ impl RemoteStorage for S3Bucket { if let Ok(inner) = &res { // do not incl. timeouts as errors in metrics but cancellations let started_at = ScopeGuard::into_inner(started_at); - metrics::BUCKET_METRICS + crate::metrics::BUCKET_METRICS .req_seconds .observe_elapsed(kind, inner, started_at); } @@ -673,7 +674,7 @@ impl RemoteStorage for S3Bucket { }; let started_at = ScopeGuard::into_inner(started_at); - metrics::BUCKET_METRICS + crate::metrics::BUCKET_METRICS .req_seconds .observe_elapsed(kind, &res, started_at); @@ -977,28 +978,6 @@ impl RemoteStorage for S3Bucket { } } -/// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`]. -fn start_counting_cancelled_wait( - kind: RequestKind, -) -> ScopeGuard { - scopeguard::guard_on_success(std::time::Instant::now(), move |_| { - metrics::BUCKET_METRICS.cancelled_waits.get(kind).inc() - }) -} - -/// On drop (cancellation) add time to [`metrics::BucketMetrics::req_seconds`]. 
-fn start_measuring_requests( - kind: RequestKind, -) -> ScopeGuard { - scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| { - metrics::BUCKET_METRICS.req_seconds.observe_elapsed( - kind, - AttemptOutcome::Cancelled, - started_at, - ) - }) -} - // Save RAM and only store the needed data instead of the entire ObjectVersion/DeleteMarkerEntry struct VerOrDelete { kind: VerOrDeleteKind, From 34f450c05ae92d3bbc840310838ee70f57000b38 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 3 Jun 2024 16:37:11 +0300 Subject: [PATCH 0910/1571] test: allow no vectored gets happening (#7939) when running the regress tests locally without any environment variables we use on CI, `test_pageserver_compaction_smoke` fails with division by zero. fix it temporarily by allowing no vectored read happening. to be cleaned when vectored get validation gets removed and the default value can be changed. Cc: https://github.com/neondatabase/neon/issues/7381 --- test_runner/regress/test_compaction.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 4850a5c688..9772e2d106 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -85,7 +85,13 @@ page_cache_size=10 vectored_sum = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_sum") vectored_count = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_count") - vectored_average = vectored_sum.value / vectored_count.value + if vectored_count.value > 0: + assert vectored_sum.value > 0 + vectored_average = vectored_sum.value / vectored_count.value + else: + # special case: running local tests with default legacy configuration + assert vectored_sum.value == 0 + vectored_average = 0 log.info(f"{non_vectored_average=} {vectored_average=}") From c1f55c1525e64a3260635fd63e1abb0efcfd2147 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Mon, 3 Jun 2024 09:56:36 -0400 Subject: [PATCH 0911/1571] feat(pageserver): collect aux file tombstones (#7900) close https://github.com/neondatabase/neon/issues/7800 This is a small change to enable the tombstone -> exclude from image layer path. Most of the pull request is unit tests. 
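To make the behaviour concrete: in the metadata keyspace an empty value acts as a tombstone, and image layer creation now skips such entries instead of copying them into the new image. A minimal, self-contained sketch of that filtering rule (illustrative only; the real loop in `Timeline` works on reconstructed values and also tracks whether any image was written at all):

```rust
use std::collections::BTreeMap;

/// Keep only live metadata entries; an empty value is a tombstone and is dropped.
fn drop_tombstones(data: BTreeMap<Vec<u8>, Vec<u8>>) -> BTreeMap<Vec<u8>, Vec<u8>> {
    data.into_iter().filter(|(_k, v)| !v.is_empty()).collect()
}

fn main() {
    let mut data = BTreeMap::new();
    data.insert(b"key1".to_vec(), Vec::new());             // deleted: tombstone
    data.insert(b"key2".to_vec(), b"metadata 2".to_vec()); // live value
    let image = drop_tombstones(data);
    assert_eq!(image.len(), 1); // only the live key ends up in the image layer
}
```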
--------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant.rs | 204 ++++++++++++++++++++++++++++++ pageserver/src/tenant/timeline.rs | 101 +++++++++++---- 2 files changed, 278 insertions(+), 27 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index eff9c742c1..7ca829535b 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -6469,4 +6469,208 @@ mod tests { Ok(()) } + + async fn get_vectored_impl_wrapper( + tline: &Arc, + key: Key, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result, GetVectoredError> { + let mut reconstruct_state = ValuesReconstructState::new(); + let mut res = tline + .get_vectored_impl( + KeySpace::single(key..key.next()), + lsn, + &mut reconstruct_state, + ctx, + ) + .await?; + Ok(res.pop_last().map(|(k, v)| { + assert_eq!(k, key); + v.unwrap() + })) + } + + #[tokio::test] + async fn test_metadata_tombstone_reads() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_metadata_tombstone_reads")?; + let (tenant, ctx) = harness.load().await; + let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap(); + let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap(); + let key2 = Key::from_hex("620000000033333333444444445500000002").unwrap(); + let key3 = Key::from_hex("620000000033333333444444445500000003").unwrap(); + + // We emulate the situation that the compaction algorithm creates an image layer that removes the tombstones + // Lsn 0x30 key0, key3, no key1+key2 + // Lsn 0x20 key1+key2 tomestones + // Lsn 0x10 key1 in image, key2 in delta + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + // delta layers + vec![ + vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], + vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], + vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], + ], + // image layers + vec![ + (Lsn(0x10), vec![(key1, test_img("metadata key 1"))]), + ( + Lsn(0x30), + vec![ + (key0, test_img("metadata key 0")), + (key3, test_img("metadata key 3")), + ], + ), + ], + Lsn(0x30), + ) + .await?; + + let lsn = Lsn(0x30); + let old_lsn = Lsn(0x20); + + assert_eq!( + get_vectored_impl_wrapper(&tline, key0, lsn, &ctx).await?, + Some(test_img("metadata key 0")) + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, key1, lsn, &ctx).await?, + None, + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, key2, lsn, &ctx).await?, + None, + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, key1, old_lsn, &ctx).await?, + Some(Bytes::new()), + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, key2, old_lsn, &ctx).await?, + Some(Bytes::new()), + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, key3, lsn, &ctx).await?, + Some(test_img("metadata key 3")) + ); + + Ok(()) + } + + #[tokio::test] + async fn test_metadata_tombstone_image_creation() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_metadata_tombstone_image_creation")?; + let (tenant, ctx) = harness.load().await; + + let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap(); + let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap(); + let key2 = Key::from_hex("620000000033333333444444445500000002").unwrap(); + let key3 = Key::from_hex("620000000033333333444444445500000003").unwrap(); + + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + // delta layers + vec![ + vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 
2")))], + vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], + vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], + vec![ + (key0, Lsn(0x30), Value::Image(test_img("metadata key 0"))), + (key3, Lsn(0x30), Value::Image(test_img("metadata key 3"))), + ], + ], + // image layers + vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])], + Lsn(0x30), + ) + .await?; + + let cancel = CancellationToken::new(); + + tline + .compact( + &cancel, + { + let mut flags = EnumSet::new(); + flags.insert(CompactFlags::ForceImageLayerCreation); + flags.insert(CompactFlags::ForceRepartition); + flags + }, + &ctx, + ) + .await?; + + // Image layers are created at last_record_lsn + let images = tline + .inspect_image_layers(Lsn(0x30), &ctx) + .await? + .into_iter() + .filter(|(k, _)| k.is_metadata_key()) + .collect::>(); + assert_eq!(images.len(), 2); // the image layer should only contain two existing keys, tombstones should be removed. + + Ok(()) + } + + #[tokio::test] + async fn test_metadata_tombstone_empty_image_creation() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_metadata_tombstone_image_creation")?; + let (tenant, ctx) = harness.load().await; + + let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap(); + let key2 = Key::from_hex("620000000033333333444444445500000002").unwrap(); + + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + // delta layers + vec![ + vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], + vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], + vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], + ], + // image layers + vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])], + Lsn(0x30), + ) + .await?; + + let cancel = CancellationToken::new(); + + tline + .compact( + &cancel, + { + let mut flags = EnumSet::new(); + flags.insert(CompactFlags::ForceImageLayerCreation); + flags.insert(CompactFlags::ForceRepartition); + flags + }, + &ctx, + ) + .await?; + + // Image layers are created at last_record_lsn + let images = tline + .inspect_image_layers(Lsn(0x30), &ctx) + .await? 
+ .into_iter() + .filter(|(k, _)| k.is_metadata_key()) + .collect::>(); + assert_eq!(images.len(), 0); // the image layer should not contain tombstones, or it is not created + + Ok(()) + } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 9bf429972d..fb1f55f5e3 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4312,6 +4312,7 @@ impl Timeline { ctx: &RequestContext, img_range: Range, mode: ImageLayerCreationMode, + start: Key, ) -> Result { assert!(!matches!(mode, ImageLayerCreationMode::Initial)); @@ -4320,39 +4321,43 @@ impl Timeline { let data = self .get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx) .await?; - let (data, total_kb_retrieved, total_key_retrieved) = { + let (data, total_kb_retrieved, total_keys_retrieved) = { let mut new_data = BTreeMap::new(); let mut total_kb_retrieved = 0; - let mut total_key_retrieved = 0; + let mut total_keys_retrieved = 0; for (k, v) in data { let v = v.map_err(CreateImageLayersError::PageReconstructError)?; total_kb_retrieved += KEY_SIZE + v.len(); - total_key_retrieved += 1; + total_keys_retrieved += 1; new_data.insert(k, v); } - (new_data, total_kb_retrieved / 1024, total_key_retrieved) + (new_data, total_kb_retrieved / 1024, total_keys_retrieved) }; - let delta_file_accessed = reconstruct_state.get_delta_layers_visited(); + let delta_files_accessed = reconstruct_state.get_delta_layers_visited(); - let trigger_generation = delta_file_accessed as usize >= MAX_AUX_FILE_V2_DELTAS; + let trigger_generation = delta_files_accessed as usize >= MAX_AUX_FILE_V2_DELTAS; debug!( - "generate image layers for metadata keys: trigger_generation={trigger_generation}, \ - delta_file_accessed={delta_file_accessed}, total_kb_retrieved={total_kb_retrieved}, \ - total_key_retrieved={total_key_retrieved}" + trigger_generation, + delta_files_accessed, + total_kb_retrieved, + total_keys_retrieved, + "generate metadata images" ); + if !trigger_generation && mode == ImageLayerCreationMode::Try { return Ok(ImageLayerCreationOutcome { image: None, next_start_key: img_range.end, }); } - let has_keys = !data.is_empty(); + let mut wrote_any_image = false; for (k, v) in data { - // Even if the value is empty (deleted), we do not delete it for now until we can ensure vectored get - // considers this situation properly. - // if v.is_empty() { - // continue; - // } + if v.is_empty() { + // the key has been deleted, it does not need an image + // in metadata keyspace, an empty image == tombstone + continue; + } + wrote_any_image = true; // No need to handle sharding b/c metadata keys are always on the 0-th shard. @@ -4360,16 +4365,26 @@ impl Timeline { // on the normal data path either. image_layer_writer.put_image(k, v, ctx).await?; } - Ok(ImageLayerCreationOutcome { - image: if has_keys { - let image_layer = image_layer_writer.finish(self, ctx).await?; - Some(image_layer) - } else { - tracing::debug!("no data in range {}-{}", img_range.start, img_range.end); - None - }, - next_start_key: img_range.end, - }) + + if wrote_any_image { + // Normal path: we have written some data into the new image layer for this + // partition, so flush it to disk. + let image_layer = image_layer_writer.finish(self, ctx).await?; + Ok(ImageLayerCreationOutcome { + image: Some(image_layer), + next_start_key: img_range.end, + }) + } else { + // Special case: the image layer may be empty if this is a sharded tenant and the + // partition does not cover any keys owned by this shard. 
In this case, to ensure + // we don't leave gaps between image layers, leave `start` where it is, so that the next + // layer we write will cover the key range that we just scanned. + tracing::debug!("no data in range {}-{}", img_range.start, img_range.end); + Ok(ImageLayerCreationOutcome { + image: None, + next_start_key: start, + }) + } } #[tracing::instrument(skip_all, fields(%lsn, %mode))] @@ -4479,6 +4494,7 @@ impl Timeline { ctx, img_range, mode, + start, ) .await?; start = next_start_key; @@ -5448,11 +5464,12 @@ impl Timeline { let min_key = *deltas.first().map(|(k, _, _)| k).unwrap(); let max_key = deltas.last().map(|(k, _, _)| k).unwrap().next(); let min_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap(); - let max_lsn = Lsn(deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap().0 + 1); + let max_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap(); assert!( max_lsn <= last_record_lsn, "advance last record lsn before inserting a layer, max_lsn={max_lsn}, last_record_lsn={last_record_lsn}" ); + let end_lsn = Lsn(max_lsn.0 + 1); if let Some(check_start_lsn) = check_start_lsn { assert!(min_lsn >= check_start_lsn); } @@ -5461,7 +5478,7 @@ impl Timeline { self.timeline_id, self.tenant_shard_id, min_key, - min_lsn..max_lsn, + min_lsn..end_lsn, ctx, ) .await?; @@ -5477,6 +5494,36 @@ impl Timeline { Ok(()) } + + /// Return all keys at the LSN in the image layers + #[cfg(test)] + pub(crate) async fn inspect_image_layers( + self: &Arc, + lsn: Lsn, + ctx: &RequestContext, + ) -> anyhow::Result> { + let mut all_data = Vec::new(); + let guard = self.layers.read().await; + for layer in guard.layer_map().iter_historic_layers() { + if !layer.is_delta() && layer.image_layer_lsn() == lsn { + let layer = guard.get_from_desc(&layer); + let mut reconstruct_data = ValuesReconstructState::default(); + layer + .get_values_reconstruct_data( + KeySpace::single(Key::MIN..Key::MAX), + lsn..Lsn(lsn.0 + 1), + &mut reconstruct_data, + ctx, + ) + .await?; + for (k, v) in reconstruct_data.keys { + all_data.push((k, v?.img.unwrap().1)); + } + } + } + all_data.sort(); + Ok(all_data) + } } type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId); From acf0a11feafb579c3c5f892ddb74ec9477571ceb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 3 Jun 2024 16:18:07 +0200 Subject: [PATCH 0912/1571] Move keyspace utils to inherent impls (#7929) The keyspace utils like `is_rel_size_key` or `is_rel_fsm_block_key` and many others are free functions and have to be either imported separately or specified with the full path starting in `pageserver_api::key::`. This is less convenient than if these functions were just inherent impls. 
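As a call-site illustration of the difference (a sketch that assumes a crate already depending on `pageserver_api` and `anyhow`; it only uses helpers that this patch adds or replaces):

```rust
use pageserver_api::key::Key;

fn describe(key: Key) -> anyhow::Result<()> {
    // Before: free functions, e.g. pageserver_api::key::is_rel_block_key(&key)
    //         and pageserver_api::key::key_to_rel_block(key)?.
    // After: the same checks are inherent methods on `Key`:
    if key.is_rel_block_key() {
        // Guaranteed to be Ok(..) when is_rel_block_key() returned true.
        let (rel, blknum) = key.to_rel_block()?;
        println!("relation block {blknum} of {rel}");
    } else if key.is_rel_size_key() {
        println!("relation size key");
    }
    Ok(())
}
```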
Follow-up of #7890 Fixes #6438 --- libs/pageserver_api/src/key.rs | 147 ++++++++++-------- libs/pageserver_api/src/shard.rs | 7 +- pageserver/ctl/src/key.rs | 24 ++- .../pagebench/src/cmd/getpage_latest_lsn.rs | 11 +- pageserver/src/basebackup.rs | 4 +- pageserver/src/pgdatadir_mapping.rs | 10 +- pageserver/src/tenant/timeline.rs | 5 +- pageserver/src/walredo.rs | 3 +- pageserver/src/walredo/apply_neon.rs | 12 +- 9 files changed, 113 insertions(+), 110 deletions(-) diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index e52d4ef986..27fab5e7a0 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -385,9 +385,11 @@ pub fn rel_size_to_key(rel: RelTag) -> Key { } } -#[inline(always)] -pub fn is_rel_size_key(key: &Key) -> bool { - key.field1 == 0 && key.field6 == u32::MAX +impl Key { + #[inline(always)] + pub fn is_rel_size_key(&self) -> bool { + self.field1 == 0 && self.field6 == u32::MAX + } } #[inline(always)] @@ -478,12 +480,14 @@ pub fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key { } } -pub fn is_slru_segment_size_key(key: &Key) -> bool { - key.field1 == 0x01 - && key.field2 < 0x03 - && key.field3 == 0x01 - && key.field5 == 0 - && key.field6 == u32::MAX +impl Key { + pub fn is_slru_segment_size_key(&self) -> bool { + self.field1 == 0x01 + && self.field2 < 0x03 + && self.field3 == 0x01 + && self.field5 == 0 + && self.field6 == u32::MAX + } } #[inline(always)] @@ -591,73 +595,78 @@ pub const NON_INHERITED_RANGE: Range = AUX_FILES_KEY..AUX_FILES_KEY.next(); /// Sparse keyspace range for vectored get. Missing key error will be ignored for this range. pub const NON_INHERITED_SPARSE_RANGE: Range = Key::metadata_key_range(); -// AUX_FILES currently stores only data for logical replication (slots etc), and -// we don't preserve these on a branch because safekeepers can't follow timeline -// switch (and generally it likely should be optional), so ignore these. -#[inline(always)] -pub fn is_inherited_key(key: Key) -> bool { - !NON_INHERITED_RANGE.contains(&key) && !NON_INHERITED_SPARSE_RANGE.contains(&key) -} +impl Key { + // AUX_FILES currently stores only data for logical replication (slots etc), and + // we don't preserve these on a branch because safekeepers can't follow timeline + // switch (and generally it likely should be optional), so ignore these. 
+ #[inline(always)] + pub fn is_inherited_key(self) -> bool { + !NON_INHERITED_RANGE.contains(&self) && !NON_INHERITED_SPARSE_RANGE.contains(&self) + } -#[inline(always)] -pub fn is_rel_fsm_block_key(key: Key) -> bool { - key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff -} + #[inline(always)] + pub fn is_rel_fsm_block_key(self) -> bool { + self.field1 == 0x00 + && self.field4 != 0 + && self.field5 == FSM_FORKNUM + && self.field6 != 0xffffffff + } -#[inline(always)] -pub fn is_rel_vm_block_key(key: Key) -> bool { - key.field1 == 0x00 - && key.field4 != 0 - && key.field5 == VISIBILITYMAP_FORKNUM - && key.field6 != 0xffffffff -} + #[inline(always)] + pub fn is_rel_vm_block_key(self) -> bool { + self.field1 == 0x00 + && self.field4 != 0 + && self.field5 == VISIBILITYMAP_FORKNUM + && self.field6 != 0xffffffff + } -#[inline(always)] -pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber)> { - Ok(match key.field1 { - 0x01 => { - let kind = match key.field2 { - 0x00 => SlruKind::Clog, - 0x01 => SlruKind::MultiXactMembers, - 0x02 => SlruKind::MultiXactOffsets, - _ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2), - }; - let segno = key.field4; - let blknum = key.field6; + #[inline(always)] + pub fn to_slru_block(self) -> anyhow::Result<(SlruKind, u32, BlockNumber)> { + Ok(match self.field1 { + 0x01 => { + let kind = match self.field2 { + 0x00 => SlruKind::Clog, + 0x01 => SlruKind::MultiXactMembers, + 0x02 => SlruKind::MultiXactOffsets, + _ => anyhow::bail!("unrecognized slru kind 0x{:02x}", self.field2), + }; + let segno = self.field4; + let blknum = self.field6; - (kind, segno, blknum) - } - _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1), - }) -} + (kind, segno, blknum) + } + _ => anyhow::bail!("unexpected value kind 0x{:02x}", self.field1), + }) + } -#[inline(always)] -pub fn is_slru_block_key(key: Key) -> bool { - key.field1 == 0x01 // SLRU-related - && key.field3 == 0x00000001 // but not SlruDir - && key.field6 != 0xffffffff // and not SlruSegSize -} + #[inline(always)] + pub fn is_slru_block_key(self) -> bool { + self.field1 == 0x01 // SLRU-related + && self.field3 == 0x00000001 // but not SlruDir + && self.field6 != 0xffffffff // and not SlruSegSize + } -#[inline(always)] -pub fn is_rel_block_key(key: &Key) -> bool { - key.field1 == 0x00 && key.field4 != 0 && key.field6 != 0xffffffff -} + #[inline(always)] + pub fn is_rel_block_key(&self) -> bool { + self.field1 == 0x00 && self.field4 != 0 && self.field6 != 0xffffffff + } -/// Guaranteed to return `Ok()` if [[is_rel_block_key]] returns `true` for `key`. -#[inline(always)] -pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> { - Ok(match key.field1 { - 0x00 => ( - RelTag { - spcnode: key.field2, - dbnode: key.field3, - relnode: key.field4, - forknum: key.field5, - }, - key.field6, - ), - _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1), - }) + /// Guaranteed to return `Ok()` if [`Self::is_rel_block_key`] returns `true` for `key`. 
+ #[inline(always)] + pub fn to_rel_block(self) -> anyhow::Result<(RelTag, BlockNumber)> { + Ok(match self.field1 { + 0x00 => ( + RelTag { + spcnode: self.field2, + dbnode: self.field3, + relnode: self.field4, + forknum: self.field5, + }, + self.field6, + ), + _ => anyhow::bail!("unexpected value kind 0x{:02x}", self.field1), + }) + } } impl std::str::FromStr for Key { diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 8ace426f88..8c5a4e6168 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -1,9 +1,6 @@ use std::{ops::RangeInclusive, str::FromStr}; -use crate::{ - key::{is_rel_block_key, Key}, - models::ShardParameters, -}; +use crate::{key::Key, models::ShardParameters}; use hex::FromHex; use postgres_ffi::relfile_utils::INIT_FORKNUM; use serde::{Deserialize, Serialize}; @@ -672,7 +669,7 @@ fn key_is_shard0(key: &Key) -> bool { // because they must be included in basebackups. let is_initfork = key.field5 == INIT_FORKNUM; - !is_rel_block_key(key) || is_initfork + !key.is_rel_block_key() || is_initfork } /// Provide the same result as the function in postgres `hashfn.h` with the same name diff --git a/pageserver/ctl/src/key.rs b/pageserver/ctl/src/key.rs index 28448811f8..af4b5a21ab 100644 --- a/pageserver/ctl/src/key.rs +++ b/pageserver/ctl/src/key.rs @@ -72,13 +72,14 @@ impl DescribeKeyCommand { println!("{key:?}"); macro_rules! kind_query { + ([$($name:ident),*$(,)?]) => {{[$(kind_query!($name)),*]}}; ($name:ident) => {{ let s: &'static str = stringify!($name); let s = s.strip_prefix("is_").unwrap_or(s); let s = s.strip_suffix("_key").unwrap_or(s); #[allow(clippy::needless_borrow)] - (s, pageserver_api::key::$name(key)) + (s, key.$name()) }}; } @@ -86,18 +87,15 @@ impl DescribeKeyCommand { // "recognization". I think it accurately represents how strictly we model the Key // right now, but could of course be made less confusing. 
- let queries = [ - ("rel_block", pageserver_api::key::is_rel_block_key(&key)), - kind_query!(is_rel_vm_block_key), - kind_query!(is_rel_fsm_block_key), - kind_query!(is_slru_block_key), - kind_query!(is_inherited_key), - ("rel_size", pageserver_api::key::is_rel_size_key(&key)), - ( - "slru_segment_size", - pageserver_api::key::is_slru_segment_size_key(&key), - ), - ]; + let queries = kind_query!([ + is_rel_block_key, + is_rel_vm_block_key, + is_rel_fsm_block_key, + is_slru_block_key, + is_inherited_key, + is_rel_size_key, + is_slru_segment_size_key, + ]); let recognized_kind = "recognized kind"; let metadata_key = "metadata key"; diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index 5043a207fc..4992f37465 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -1,6 +1,6 @@ use anyhow::Context; use camino::Utf8PathBuf; -use pageserver_api::key::{is_rel_block_key, key_to_rel_block, Key}; +use pageserver_api::key::Key; use pageserver_api::keyspace::KeySpaceAccum; use pageserver_api::models::PagestreamGetPageRequest; @@ -187,7 +187,7 @@ async fn main_impl( for r in partitioning.keys.ranges.iter() { let mut i = r.start; while i != r.end { - if is_rel_block_key(&i) { + if i.is_rel_block_key() { filtered.add_key(i); } i = i.next(); @@ -308,9 +308,10 @@ async fn main_impl( let r = &ranges[weights.sample(&mut rng)]; let key: i128 = rng.gen_range(r.start..r.end); let key = Key::from_i128(key); - assert!(is_rel_block_key(&key)); - let (rel_tag, block_no) = - key_to_rel_block(key).expect("we filter non-rel-block keys out above"); + assert!(key.is_rel_block_key()); + let (rel_tag, block_no) = key + .to_rel_block() + .expect("we filter non-rel-block keys out above"); PagestreamGetPageRequest { request_lsn: if rng.gen_bool(args.req_latest_probability) { Lsn::MAX diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index dca1510810..31518f5632 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -13,7 +13,7 @@ use anyhow::{anyhow, Context}; use bytes::{BufMut, Bytes, BytesMut}; use fail::fail_point; -use pageserver_api::key::{key_to_slru_block, Key}; +use pageserver_api::key::Key; use postgres_ffi::pg_constants; use std::fmt::Write as FmtWrite; use std::time::SystemTime; @@ -170,7 +170,7 @@ where } async fn add_block(&mut self, key: &Key, block: Bytes) -> Result<(), BasebackupError> { - let (kind, segno, _) = key_to_slru_block(*key)?; + let (kind, segno, _) = key.to_slru_block()?; match kind { SlruKind::Clog => { diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index c78c358855..764c528a9e 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -17,10 +17,10 @@ use bytes::{Buf, Bytes, BytesMut}; use enum_map::Enum; use itertools::Itertools; use pageserver_api::key::{ - dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key, - rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key, - slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range, - AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY, + dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key, + relmap_file_key, slru_block_to_key, slru_dir_to_key, slru_segment_key_range, + slru_segment_size_to_key, twophase_file_key, twophase_key_range, AUX_FILES_KEY, 
CHECKPOINT_KEY, + CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY, }; use pageserver_api::keyspace::SparseKeySpace; use pageserver_api::models::AuxFilePolicy; @@ -1684,7 +1684,7 @@ impl<'a> DatadirModification<'a> { let mut retained_pending_updates = HashMap::<_, Vec<_>>::new(); for (key, values) in self.pending_updates.drain() { for (lsn, value) in values { - if is_rel_block_key(&key) || is_slru_block_key(key) { + if key.is_rel_block_key() || key.is_slru_block_key() { // This bails out on first error without modifying pending_updates. // That's Ok, cf this function's doc comment. writer.put(key, lsn, &value, ctx).await?; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index fb1f55f5e3..5402c776e3 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -102,7 +102,6 @@ use crate::metrics::{ }; use crate::pgdatadir_mapping::CalculateLogicalSizeError; use crate::tenant::config::TenantConfOpt; -use pageserver_api::key::{is_inherited_key, is_rel_fsm_block_key, is_rel_vm_block_key}; use pageserver_api::reltag::RelTag; use pageserver_api::shard::ShardIndex; @@ -3191,7 +3190,7 @@ impl Timeline { // Recurse into ancestor if needed if let Some(ancestor_timeline) = timeline.ancestor_timeline.as_ref() { - if is_inherited_key(key) && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn { + if key.is_inherited_key() && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn { trace!( "going into ancestor {}, cont_lsn is {}", timeline.ancestor_lsn, @@ -4262,7 +4261,7 @@ impl Timeline { // Unfortunately we cannot do this for the main fork, or for // any metadata keys, keys, as that would lead to actual data // loss. - if is_rel_fsm_block_key(img_key) || is_rel_vm_block_key(img_key) { + if img_key.is_rel_fsm_block_key() || img_key.is_rel_vm_block_key() { warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}"); ZERO_PAGE.clone() } else { diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 3decea0c6d..1d72a97688 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -34,7 +34,6 @@ use crate::repository::Key; use crate::walrecord::NeonWalRecord; use anyhow::Context; use bytes::{Bytes, BytesMut}; -use pageserver_api::key::key_to_rel_block; use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus}; use pageserver_api::shard::TenantShardId; use std::sync::Arc; @@ -208,7 +207,7 @@ impl PostgresRedoManager { ) -> anyhow::Result { *(self.last_redo_at.lock().unwrap()) = Some(Instant::now()); - let (rel, blknum) = key_to_rel_block(key).context("invalid record")?; + let (rel, blknum) = key.to_rel_block().context("invalid record")?; const MAX_RETRY_ATTEMPTS: u32 = 1; let mut n_attempts = 0u32; loop { diff --git a/pageserver/src/walredo/apply_neon.rs b/pageserver/src/walredo/apply_neon.rs index 247704e2a5..695894a924 100644 --- a/pageserver/src/walredo/apply_neon.rs +++ b/pageserver/src/walredo/apply_neon.rs @@ -3,7 +3,7 @@ use crate::walrecord::NeonWalRecord; use anyhow::Context; use byteorder::{ByteOrder, LittleEndian}; use bytes::{BufMut, BytesMut}; -use pageserver_api::key::{key_to_rel_block, key_to_slru_block, Key}; +use pageserver_api::key::Key; use pageserver_api::reltag::SlruKind; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM; @@ -48,7 +48,7 @@ pub(crate) fn apply_in_neon( flags, } => { // sanity check that this is modifying the correct relation - let (rel, blknum) = key_to_rel_block(key).context("invalid record")?; + let (rel, blknum) = 
key.to_rel_block().context("invalid record")?; assert!( rel.forknum == VISIBILITYMAP_FORKNUM, "ClearVisibilityMapFlags record on unexpected rel {}", @@ -85,7 +85,7 @@ pub(crate) fn apply_in_neon( // Non-relational WAL records are handled here, with custom code that has the // same effects as the corresponding Postgres WAL redo function. NeonWalRecord::ClogSetCommitted { xids, timestamp } => { - let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?; + let (slru_kind, segno, blknum) = key.to_slru_block().context("invalid record")?; assert_eq!( slru_kind, SlruKind::Clog, @@ -130,7 +130,7 @@ pub(crate) fn apply_in_neon( } } NeonWalRecord::ClogSetAborted { xids } => { - let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?; + let (slru_kind, segno, blknum) = key.to_slru_block().context("invalid record")?; assert_eq!( slru_kind, SlruKind::Clog, @@ -160,7 +160,7 @@ pub(crate) fn apply_in_neon( } } NeonWalRecord::MultixactOffsetCreate { mid, moff } => { - let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?; + let (slru_kind, segno, blknum) = key.to_slru_block().context("invalid record")?; assert_eq!( slru_kind, SlruKind::MultiXactOffsets, @@ -192,7 +192,7 @@ pub(crate) fn apply_in_neon( LittleEndian::write_u32(&mut page[offset..offset + 4], *moff); } NeonWalRecord::MultixactMembersCreate { moff, members } => { - let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?; + let (slru_kind, segno, blknum) = key.to_slru_block().context("invalid record")?; assert_eq!( slru_kind, SlruKind::MultiXactMembers, From 69d18d642996f80dbe351f0b8456da75a5861710 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 3 Jun 2024 17:16:23 +0100 Subject: [PATCH 0913/1571] s3_scrubber: add `pageserver-physical-gc` (#7925) ## Problem Currently, we leave `index_part.json` objects from old generations behind each time a pageserver restarts or a tenant is migrated. This doesn't break anything, but it's annoying when a tenant has been around for a long time and starts to accumulate 10s-100s of these. Partially implements: #7043 ## Summary of changes - Add a new `pageserver-physical-gc` command to `s3_scrubber` The name is a bit of a mouthful, but I think it makes sense: - GC is the accurate term for what we are doing here: removing data that takes up storage but can never be accessed. - "physical" is a necessary distinction from the "normal" GC that we do online in the pageserver, which operates at a higher level in terms of LSNs+layers, whereas this type of GC is purely about S3 objects. - "pageserver" makes clear that this command deals exclusively with pageserver data, not safekeeper. 
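For orientation, here is a sketch of driving the new entry point from Rust; the CLI in `main.rs` below wires this same call through clap. The `run_dry_run` wrapper, the one-week threshold, and running it under a tokio runtime are assumptions of this example, not part of the patch:

```rust
use std::time::Duration;

use s3_scrubber::pageserver_physical_gc::{pageserver_physical_gc, GcMode};
use s3_scrubber::BucketConfig;

async fn run_dry_run(bucket_config: BucketConfig) -> anyhow::Result<()> {
    let summary = pageserver_physical_gc(
        bucket_config,
        Vec::new(),                         // empty list = walk every tenant in the bucket
        Duration::from_secs(7 * 24 * 3600), // leave indices younger than a week untouched
        GcMode::DryRun,                     // report what would be deleted, delete nothing
    )
    .await?;
    println!("{}", serde_json::to_string(&summary)?);
    Ok(())
}
```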
--- Cargo.lock | 1 + s3_scrubber/Cargo.toml | 1 + s3_scrubber/src/checks.rs | 64 ++--- s3_scrubber/src/lib.rs | 3 +- s3_scrubber/src/main.rs | 25 +- s3_scrubber/src/pageserver_physical_gc.rs | 239 ++++++++++++++++++ test_runner/fixtures/neon_fixtures.py | 24 ++ .../regress/test_pageserver_secondary.py | 16 +- test_runner/regress/test_s3_scrubber.py | 51 +++- 9 files changed, 387 insertions(+), 37 deletions(-) create mode 100644 s3_scrubber/src/pageserver_physical_gc.rs diff --git a/Cargo.lock b/Cargo.lock index 6a60104472..84d919b817 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5129,6 +5129,7 @@ dependencies = [ "futures-util", "hex", "histogram", + "humantime", "itertools", "once_cell", "pageserver", diff --git a/s3_scrubber/Cargo.toml b/s3_scrubber/Cargo.toml index e56bd43fb8..48b50ca21c 100644 --- a/s3_scrubber/Cargo.toml +++ b/s3_scrubber/Cargo.toml @@ -11,6 +11,7 @@ either.workspace = true tokio-rustls.workspace = true anyhow.workspace = true hex.workspace = true +humantime.workspace = true thiserror.workspace = true rand.workspace = true bytes.workspace = true diff --git a/s3_scrubber/src/checks.rs b/s3_scrubber/src/checks.rs index 134afa53da..2c14fef0af 100644 --- a/s3_scrubber/src/checks.rs +++ b/s3_scrubber/src/checks.rs @@ -1,7 +1,7 @@ use std::collections::{HashMap, HashSet}; use anyhow::Context; -use aws_sdk_s3::{types::ObjectIdentifier, Client}; +use aws_sdk_s3::Client; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use pageserver_api::shard::ShardIndex; use tracing::{error, info, warn}; @@ -70,7 +70,7 @@ pub(crate) fn branch_cleanup_and_check_errors( match s3_data { Some(s3_data) => { - result.garbage_keys.extend(s3_data.keys_to_remove); + result.garbage_keys.extend(s3_data.unknown_keys); match s3_data.blob_data { BlobDataParseResult::Parsed { @@ -240,7 +240,12 @@ impl TenantObjectListing { #[derive(Debug)] pub(crate) struct S3TimelineBlobData { pub(crate) blob_data: BlobDataParseResult, - pub(crate) keys_to_remove: Vec, + + // Index objects that were not used when loading `blob_data`, e.g. those from old generations + pub(crate) unused_index_keys: Vec, + + // Objects whose keys were not recognized at all, i.e. 
not layer files, not indices + pub(crate) unknown_keys: Vec, } #[derive(Debug)] @@ -276,12 +281,12 @@ pub(crate) async fn list_timeline_blobs( let mut s3_layers = HashSet::new(); let mut errors = Vec::new(); - let mut keys_to_remove = Vec::new(); + let mut unknown_keys = Vec::new(); let mut timeline_dir_target = s3_root.timeline_root(&id); timeline_dir_target.delimiter = String::new(); - let mut index_parts: Vec = Vec::new(); + let mut index_part_keys: Vec = Vec::new(); let mut initdb_archive: bool = false; let mut stream = std::pin::pin!(stream_listing(s3_client, &timeline_dir_target)); @@ -292,16 +297,16 @@ pub(crate) async fn list_timeline_blobs( let blob_name = key.strip_prefix(&timeline_dir_target.prefix_in_bucket); match blob_name { Some(name) if name.starts_with("index_part.json") => { - tracing::info!("Index key {key}"); - index_parts.push(obj) + tracing::debug!("Index key {key}"); + index_part_keys.push(key.to_owned()) } Some("initdb.tar.zst") => { - tracing::info!("initdb archive {key}"); + tracing::debug!("initdb archive {key}"); initdb_archive = true; } Some(maybe_layer_name) => match parse_layer_object_name(maybe_layer_name) { Ok((new_layer, gen)) => { - tracing::info!("Parsed layer key: {} {:?}", new_layer, gen); + tracing::debug!("Parsed layer key: {} {:?}", new_layer, gen); s3_layers.insert((new_layer, gen)); } Err(e) => { @@ -309,37 +314,37 @@ pub(crate) async fn list_timeline_blobs( errors.push( format!("S3 list response got an object with key {key} that is not a layer name: {e}"), ); - keys_to_remove.push(key.to_string()); + unknown_keys.push(key.to_string()); } }, None => { - tracing::info!("Peculiar key {}", key); + tracing::warn!("Unknown key {}", key); errors.push(format!("S3 list response got an object with odd key {key}")); - keys_to_remove.push(key.to_string()); + unknown_keys.push(key.to_string()); } } } - if index_parts.is_empty() && s3_layers.is_empty() && initdb_archive { - tracing::info!( + if index_part_keys.is_empty() && s3_layers.is_empty() && initdb_archive { + tracing::debug!( "Timeline is empty apart from initdb archive: expected post-deletion state." ); return Ok(S3TimelineBlobData { blob_data: BlobDataParseResult::Relic, - keys_to_remove: Vec::new(), + unused_index_keys: index_part_keys, + unknown_keys: Vec::new(), }); } // Choose the index_part with the highest generation - let (index_part_object, index_part_generation) = match index_parts + let (index_part_object, index_part_generation) = match index_part_keys .iter() - .filter_map(|k| { - let key = k.key(); + .filter_map(|key| { // Stripping the index key to the last part, because RemotePath doesn't // like absolute paths, and depending on prefix_in_bucket it's possible // for the keys we read back to start with a slash. 
let basename = key.rsplit_once('/').unwrap().1; - parse_remote_index_path(RemotePath::from_string(basename).unwrap()).map(|g| (k, g)) + parse_remote_index_path(RemotePath::from_string(basename).unwrap()).map(|g| (key, g)) }) .max_by_key(|i| i.1) .map(|(k, g)| (k.clone(), g)) @@ -347,15 +352,18 @@ pub(crate) async fn list_timeline_blobs( Some((key, gen)) => (Some(key), gen), None => { // Legacy/missing case: one or zero index parts, which did not have a generation - (index_parts.pop(), Generation::none()) + (index_part_keys.pop(), Generation::none()) } }; - if index_part_object.is_none() { - errors.push("S3 list response got no index_part.json file".to_string()); + match index_part_object.as_ref() { + Some(selected) => index_part_keys.retain(|k| k != selected), + None => { + errors.push("S3 list response got no index_part.json file".to_string()); + } } - if let Some(index_part_object_key) = index_part_object.as_ref().map(|object| object.key()) { + if let Some(index_part_object_key) = index_part_object.as_ref() { let index_part_bytes = download_object_with_retries( s3_client, &timeline_dir_target.bucket_name, @@ -372,17 +380,14 @@ pub(crate) async fn list_timeline_blobs( index_part_generation, s3_layers, }, - keys_to_remove, + unused_index_keys: index_part_keys, + unknown_keys, }) } Err(index_parse_error) => errors.push(format!( "index_part.json body parsing error: {index_parse_error}" )), } - } else { - errors.push(format!( - "Index part object {index_part_object:?} has no key" - )); } if errors.is_empty() { @@ -393,6 +398,7 @@ pub(crate) async fn list_timeline_blobs( Ok(S3TimelineBlobData { blob_data: BlobDataParseResult::Incorrect(errors), - keys_to_remove, + unused_index_keys: index_part_keys, + unknown_keys, }) } diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs index e0f99ecd9c..64273432fc 100644 --- a/s3_scrubber/src/lib.rs +++ b/s3_scrubber/src/lib.rs @@ -4,6 +4,7 @@ pub mod checks; pub mod cloud_admin_api; pub mod garbage; pub mod metadata_stream; +pub mod pageserver_physical_gc; pub mod scan_pageserver_metadata; pub mod scan_safekeeper_metadata; pub mod tenant_snapshot; @@ -396,7 +397,7 @@ async fn download_object_with_retries( .await { Ok(bytes_read) => { - tracing::info!("Downloaded {bytes_read} bytes for object object with key {key}"); + tracing::debug!("Downloaded {bytes_read} bytes for object {key}"); return Ok(body_buf); } Err(e) => { diff --git a/s3_scrubber/src/main.rs b/s3_scrubber/src/main.rs index e49c280b99..ade8ef7d7a 100644 --- a/s3_scrubber/src/main.rs +++ b/s3_scrubber/src/main.rs @@ -2,11 +2,13 @@ use anyhow::bail; use camino::Utf8PathBuf; use pageserver_api::shard::TenantShardId; use s3_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; +use s3_scrubber::pageserver_physical_gc::GcMode; use s3_scrubber::scan_pageserver_metadata::scan_metadata; use s3_scrubber::tenant_snapshot::SnapshotDownloader; use s3_scrubber::{ - init_logging, scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig, - NodeKind, TraversingDepth, + init_logging, pageserver_physical_gc::pageserver_physical_gc, + scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig, NodeKind, + TraversingDepth, }; use clap::{Parser, Subcommand}; @@ -62,6 +64,14 @@ enum Command { #[arg(short, long)] output_path: Utf8PathBuf, }, + PageserverPhysicalGc { + #[arg(long = "tenant-id", num_args = 0..)] + tenant_ids: Vec, + #[arg(long = "min-age")] + min_age: humantime::Duration, + #[arg(short, long, default_value_t = GcMode::IndicesOnly)] + mode: 
GcMode, + }, } #[tokio::main] @@ -75,6 +85,7 @@ async fn main() -> anyhow::Result<()> { Command::FindGarbage { .. } => "find-garbage", Command::PurgeGarbage { .. } => "purge-garbage", Command::TenantSnapshot { .. } => "tenant-snapshot", + Command::PageserverPhysicalGc { .. } => "pageserver-physical-gc", }; let _guard = init_logging(&format!( "{}_{}_{}_{}.log", @@ -178,5 +189,15 @@ async fn main() -> anyhow::Result<()> { SnapshotDownloader::new(bucket_config, tenant_id, output_path, concurrency)?; downloader.download().await } + Command::PageserverPhysicalGc { + tenant_ids, + min_age, + mode, + } => { + let summary = + pageserver_physical_gc(bucket_config, tenant_ids, min_age.into(), mode).await?; + println!("{}", serde_json::to_string(&summary).unwrap()); + Ok(()) + } } } diff --git a/s3_scrubber/src/pageserver_physical_gc.rs b/s3_scrubber/src/pageserver_physical_gc.rs new file mode 100644 index 0000000000..0146433128 --- /dev/null +++ b/s3_scrubber/src/pageserver_physical_gc.rs @@ -0,0 +1,239 @@ +use std::time::{Duration, UNIX_EPOCH}; + +use crate::checks::{list_timeline_blobs, BlobDataParseResult}; +use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; +use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId}; +use aws_sdk_s3::Client; +use futures_util::{StreamExt, TryStreamExt}; +use pageserver::tenant::remote_timeline_client::parse_remote_index_path; +use pageserver::tenant::IndexPart; +use pageserver_api::shard::TenantShardId; +use remote_storage::RemotePath; +use serde::Serialize; +use tracing::{info_span, Instrument}; +use utils::generation::Generation; + +#[derive(Serialize, Default)] +pub struct GcSummary { + indices_deleted: usize, + remote_storage_errors: usize, +} + +#[derive(clap::ValueEnum, Debug, Clone, Copy)] +pub enum GcMode { + // Delete nothing + DryRun, + + // Enable only removing old-generation indices + IndicesOnly, + // Enable all forms of GC + // TODO: this will be used when shard split ancestor layer deletion is added + // All, +} + +impl std::fmt::Display for GcMode { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + GcMode::DryRun => write!(f, "dry-run"), + GcMode::IndicesOnly => write!(f, "indices-only"), + } + } +} + +async fn maybe_delete_index( + s3_client: &Client, + bucket_config: &BucketConfig, + min_age: &Duration, + latest_gen: Generation, + key: &str, + mode: GcMode, + summary: &mut GcSummary, +) { + // Validation: we will only delete things that parse cleanly + let basename = key.rsplit_once('/').unwrap().1; + let candidate_generation = + match parse_remote_index_path(RemotePath::from_string(basename).unwrap()) { + Some(g) => g, + None => { + if basename == IndexPart::FILE_NAME { + // A legacy pre-generation index + Generation::none() + } else { + // A strange key: we will not delete this because we don't understand it. + tracing::warn!("Bad index key"); + return; + } + } + }; + + // Validation: we will only delete indices more than one generation old, to avoid interfering + // in typical migrations, even if they are very long running. + if candidate_generation >= latest_gen { + // This shouldn't happen: when we loaded metadata, it should have selected the latest + // generation already, and only populated [`S3TimelineBlobData::unused_index_keys`] + // with older generations. + tracing::warn!("Deletion candidate is >= latest generation, this is a bug!"); + return; + } else if candidate_generation.next() == latest_gen { + // Skip deleting the latest-1th generation's index. 
+ return; + } + + // Validation: we will only delete indices after one week, so that during incidents we will have + // easy access to recent indices. + let age: Duration = match s3_client + .head_object() + .bucket(&bucket_config.bucket) + .key(key) + .send() + .await + { + Ok(response) => match response.last_modified { + None => { + tracing::warn!("Missing last_modified"); + summary.remote_storage_errors += 1; + return; + } + Some(last_modified) => { + let last_modified = + UNIX_EPOCH + Duration::from_secs_f64(last_modified.as_secs_f64()); + match last_modified.elapsed() { + Ok(e) => e, + Err(_) => { + tracing::warn!("Bad last_modified time: {last_modified:?}"); + return; + } + } + } + }, + Err(e) => { + tracing::warn!("Failed to HEAD {key}: {e}"); + summary.remote_storage_errors += 1; + return; + } + }; + if &age < min_age { + tracing::info!( + "Skipping young object {} < {}", + age.as_secs_f64(), + min_age.as_secs_f64() + ); + return; + } + + if matches!(mode, GcMode::DryRun) { + tracing::info!("Dry run: would delete this key"); + return; + } + + // All validations passed: erase the object + match s3_client + .delete_object() + .bucket(&bucket_config.bucket) + .key(key) + .send() + .await + { + Ok(_) => { + tracing::info!("Successfully deleted index"); + summary.indices_deleted += 1; + } + Err(e) => { + tracing::warn!("Failed to delete index: {e}"); + summary.remote_storage_errors += 1; + } + } +} + +/// Physical garbage collection: removing unused S3 objects. This is distinct from the garbage collection +/// done inside the pageserver, which operates at a higher level (keys, layers). This type of garbage collection +/// is about removing: +/// - Objects that were uploaded but never referenced in the remote index (e.g. because of a shutdown between +/// uploading a layer and uploading an index) +/// - Index objects from historic generations +/// +/// This type of GC is not necessary for correctness: rather it serves to reduce wasted storage capacity, and +/// make sure that object listings don't get slowed down by large numbers of garbage objects. +pub async fn pageserver_physical_gc( + bucket_config: BucketConfig, + tenant_ids: Vec, + min_age: Duration, + mode: GcMode, +) -> anyhow::Result { + let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver)?; + + let tenants = if tenant_ids.is_empty() { + futures::future::Either::Left(stream_tenants(&s3_client, &target)) + } else { + futures::future::Either::Right(futures::stream::iter(tenant_ids.into_iter().map(Ok))) + }; + + // How many tenants to process in parallel. We need to be mindful of pageservers + // accessing the same per tenant prefixes, so use a lower setting than pageservers. 
+ const CONCURRENCY: usize = 32; + + // Generate a stream of TenantTimelineId + let timelines = tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, t)); + let timelines = timelines.try_buffered(CONCURRENCY); + let timelines = timelines.try_flatten(); + + // Generate a stream of S3TimelineBlobData + async fn gc_timeline( + s3_client: &Client, + bucket_config: &BucketConfig, + min_age: &Duration, + target: &RootTarget, + mode: GcMode, + ttid: TenantShardTimelineId, + ) -> anyhow::Result { + let mut summary = GcSummary::default(); + let data = list_timeline_blobs(s3_client, ttid, target).await?; + + let (latest_gen, candidates) = match &data.blob_data { + BlobDataParseResult::Parsed { + index_part: _index_part, + index_part_generation, + s3_layers: _s3_layers, + } => (*index_part_generation, data.unused_index_keys), + BlobDataParseResult::Relic => { + // Post-deletion tenant location: don't try and GC it. + return Ok(summary); + } + BlobDataParseResult::Incorrect(reasons) => { + // Our primary purpose isn't to report on bad data, but log this rather than skipping silently + tracing::warn!("Skipping timeline {ttid}, bad metadata: {reasons:?}"); + return Ok(summary); + } + }; + + for key in candidates { + maybe_delete_index( + s3_client, + bucket_config, + min_age, + latest_gen, + &key, + mode, + &mut summary, + ) + .instrument(info_span!("maybe_delete_index", %ttid, ?latest_gen, key)) + .await; + } + + Ok(summary) + } + let timelines = timelines + .map_ok(|ttid| gc_timeline(&s3_client, &bucket_config, &min_age, &target, mode, ttid)); + let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY)); + + let mut summary = GcSummary::default(); + + while let Some(i) = timelines.next().await { + let tl_summary = i?; + + summary.indices_deleted += tl_summary.indices_deleted; + summary.remote_storage_errors += tl_summary.remote_storage_errors; + } + + Ok(summary) +} diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 0004745bf0..a25b8bfca1 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3998,6 +3998,30 @@ class S3Scrubber: ) log.info(f"tenant-snapshot output: {stdout}") + def pageserver_physical_gc( + self, min_age_secs: int, tenant_ids: Optional[list[TenantId]] = None + ): + args = ["pageserver-physical-gc", "--min-age", f"{min_age_secs}s"] + + if tenant_ids is None: + tenant_ids = [] + + for tenant_id in tenant_ids: + args.extend(["--tenant-id", str(tenant_id)]) + + stdout = self.scrubber_cli( + args, + timeout=30, + ) + try: + return json.loads(stdout) + except: + log.error( + "Failed to decode JSON output from `pageserver-physical_gc`. 
Dumping stdout:" + ) + log.error(stdout) + raise + def _get_test_dir(request: FixtureRequest, top_output_dir: Path, prefix: str) -> Path: """Compute the path to a working directory for an individual test.""" diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 25a3f8521c..9b9bdb2b08 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -15,7 +15,7 @@ from fixtures.pageserver.utils import ( tenant_delete_wait_completed, wait_for_upload_queue_empty, ) -from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage +from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage, s3_storage from fixtures.utils import wait_until from fixtures.workload import Workload @@ -73,7 +73,7 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): """ neon_env_builder.num_pageservers = 3 neon_env_builder.enable_pageserver_remote_storage( - remote_storage_kind=RemoteStorageKind.MOCK_S3, + remote_storage_kind=s3_storage(), ) env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) @@ -215,6 +215,13 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): ) workload.validate(pageserver.id) + # Having done a bunch of attach/detach cycles, we will have generated some index garbage: check + # that the scrubber sees it and cleans it up. We do this before the final attach+validate pass, + # to also validate that the scrubber isn't breaking anything. + gc_summary = S3Scrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=1) + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] > 0 + # Attach all pageservers for ps in env.pageservers: location_conf = {"mode": "AttachedMulti", "secondary_conf": None, "tenant_conf": {}} @@ -227,10 +234,11 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): # Detach all pageservers for ps in env.pageservers: location_conf = {"mode": "Detached", "secondary_conf": None, "tenant_conf": {}} + assert ps.list_layers(tenant_id, timeline_id) != [] ps.tenant_location_configure(tenant_id, location_conf) - # Confirm that all local disk state was removed on detach - # TODO + # Confirm that all local disk state was removed on detach + assert ps.list_layers(tenant_id, timeline_id) == [] def test_live_migration(neon_env_builder: NeonEnvBuilder): diff --git a/test_runner/regress/test_s3_scrubber.py b/test_runner/regress/test_s3_scrubber.py index 8981000c24..6baba190f3 100644 --- a/test_runner/regress/test_s3_scrubber.py +++ b/test_runner/regress/test_s3_scrubber.py @@ -3,7 +3,7 @@ import shutil from typing import Optional import pytest -from fixtures.common_types import TenantShardId +from fixtures.common_types import TenantId, TenantShardId, TimelineId from fixtures.neon_fixtures import ( NeonEnvBuilder, S3Scrubber, @@ -109,3 +109,52 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: # Check we can read everything workload.validate() + + +@pytest.mark.parametrize("shard_count", [None, 4]) +def test_scrubber_physical_gc(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]): + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.num_pageservers = 2 + + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant(tenant_id, timeline_id, 
shard_count=shard_count) + + workload = Workload(env, tenant_id, timeline_id) + workload.init() + + # We will end up with an index per shard, per cycle, plus one for the initial startup + n_cycles = 4 + expect_indices_per_shard = n_cycles + 1 + shard_count = 1 if shard_count is None else shard_count + + # For each cycle, detach and attach the tenant to bump the generation, and do some writes to generate uploads + for _i in range(0, n_cycles): + env.storage_controller.tenant_policy_update(tenant_id, {"placement": "Detached"}) + env.storage_controller.reconcile_until_idle() + + env.storage_controller.tenant_policy_update(tenant_id, {"placement": {"Attached": 0}}) + env.storage_controller.reconcile_until_idle() + + # This write includes remote upload, will generate an index in this generation + workload.write_rows(1) + + # With a high min_age, the scrubber should decline to delete anything + gc_summary = S3Scrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=3600) + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == 0 + + # If targeting a different tenant, the scrubber shouldn't do anything + gc_summary = S3Scrubber(neon_env_builder).pageserver_physical_gc( + min_age_secs=1, tenant_ids=[TenantId.generate()] + ) + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == 0 + + # With a low min_age, the scrubber should go ahead and clean up all but the latest 2 generations + gc_summary = S3Scrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=1) + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == (expect_indices_per_shard - 2) * shard_count From 7006caf3a1480567e911169b4f9488ac2a81d699 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 3 Jun 2024 19:37:33 +0300 Subject: [PATCH 0914/1571] Store logical replication origin in KV storage (#7099) Store logical replication origin in KV storage ## Problem See #6977 ## Summary of changes * Extract origin_lsn from commit WAl record * Add ReplOrigin key to KV storage and store origin_lsn * In basebackup replace snapshot origin_lsn with last committed origin_lsn at basebackup LSN ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Signed-off-by: Alex Chi Z Co-authored-by: Konstantin Knizhnik Co-authored-by: Alex Chi Z --- libs/pageserver_api/src/key.rs | 35 ++++++++++++ libs/postgres_ffi/build.rs | 1 + libs/postgres_ffi/src/lib.rs | 1 + libs/postgres_ffi/src/pg_constants.rs | 10 +++- pageserver/src/basebackup.rs | 33 +++++++++++ pageserver/src/pgdatadir_mapping.rs | 47 +++++++++++++-- pageserver/src/tenant/timeline.rs | 17 +++--- pageserver/src/walingest.rs | 20 +++++++ pageserver/src/walrecord.rs | 46 ++++++++++++++- test_runner/regress/test_compaction.py | 6 +- .../regress/test_subscriber_restart.py | 57 +++++++++++++++++++ 11 files changed, 255 insertions(+), 18 deletions(-) create mode 100644 test_runner/regress/test_subscriber_restart.py diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 27fab5e7a0..997c1cc43a 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -1,6 +1,7 @@ use anyhow::{bail, Result}; use byteorder::{ByteOrder, BE}; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; +use postgres_ffi::RepOriginId; use postgres_ffi::{Oid, TransactionId}; use serde::{Deserialize, Serialize}; use std::{fmt, ops::Range}; @@ -38,6 +39,9 @@ pub const RELATION_SIZE_PREFIX: u8 = 0x61; /// The key prefix of AUX file keys. pub const AUX_KEY_PREFIX: u8 = 0x62; +/// The key prefix of ReplOrigin keys. +pub const REPL_ORIGIN_KEY_PREFIX: u8 = 0x63; + /// Check if the key falls in the range of metadata keys. pub const fn is_metadata_key_slice(key: &[u8]) -> bool { key[0] >= METADATA_KEY_BEGIN_PREFIX && key[0] < METADATA_KEY_END_PREFIX @@ -587,6 +591,37 @@ pub const AUX_FILES_KEY: Key = Key { field6: 2, }; +#[inline(always)] +pub fn repl_origin_key(origin_id: RepOriginId) -> Key { + Key { + field1: REPL_ORIGIN_KEY_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: origin_id as u32, + } +} + +/// Get the range of replorigin keys. +pub fn repl_origin_key_range() -> Range { + Key { + field1: REPL_ORIGIN_KEY_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + }..Key { + field1: REPL_ORIGIN_KEY_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0x10000, + } +} + // Reverse mappings for a few Keys. // These are needed by WAL redo manager. diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs index 8e6761d6d3..370d9e9a6f 100644 --- a/libs/postgres_ffi/build.rs +++ b/libs/postgres_ffi/build.rs @@ -126,6 +126,7 @@ fn main() -> anyhow::Result<()> { .allowlist_type("PageHeaderData") .allowlist_type("DBState") .allowlist_type("RelMapFile") + .allowlist_type("RepOriginId") // Because structs are used for serialization, tell bindgen to emit // explicit padding fields. 
.explicit_padding(true) diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 0d6986778a..729f57f829 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -110,6 +110,7 @@ pub mod pg_constants; pub mod relfile_utils; // Export some widely used datatypes that are unlikely to change across Postgres versions +pub use v14::bindings::RepOriginId; pub use v14::bindings::{uint32, uint64, Oid}; pub use v14::bindings::{BlockNumber, OffsetNumber}; pub use v14::bindings::{MultiXactId, TransactionId}; diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs index 2701ddf5e0..54b032d138 100644 --- a/libs/postgres_ffi/src/pg_constants.rs +++ b/libs/postgres_ffi/src/pg_constants.rs @@ -102,7 +102,7 @@ pub const XACT_XINFO_HAS_SUBXACTS: u32 = 1u32 << 1; pub const XACT_XINFO_HAS_RELFILENODES: u32 = 1u32 << 2; pub const XACT_XINFO_HAS_INVALS: u32 = 1u32 << 3; pub const XACT_XINFO_HAS_TWOPHASE: u32 = 1u32 << 4; -// pub const XACT_XINFO_HAS_ORIGIN: u32 = 1u32 << 5; +pub const XACT_XINFO_HAS_ORIGIN: u32 = 1u32 << 5; // pub const XACT_XINFO_HAS_AE_LOCKS: u32 = 1u32 << 6; // pub const XACT_XINFO_HAS_GID: u32 = 1u32 << 7; @@ -167,6 +167,7 @@ pub const RM_RELMAP_ID: u8 = 7; pub const RM_STANDBY_ID: u8 = 8; pub const RM_HEAP2_ID: u8 = 9; pub const RM_HEAP_ID: u8 = 10; +pub const RM_REPLORIGIN_ID: u8 = 19; pub const RM_LOGICALMSG_ID: u8 = 21; // from neon_rmgr.h @@ -223,6 +224,10 @@ pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10; pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001; pub const XLP_LONG_HEADER: u16 = 0x0002; +/* From xlog.h */ +pub const XLOG_REPLORIGIN_SET: u8 = 0x00; +pub const XLOG_REPLORIGIN_DROP: u8 = 0x10; + /* From replication/slot.h */ pub const REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN: usize = 4*4 /* offset of `slotdata` in ReplicationSlotOnDisk */ + 64 /* NameData */ + 4*4; @@ -237,6 +242,9 @@ pub const SLOTS_PER_FSM_PAGE: u32 = FSM_LEAF_NODES_PER_PAGE as u32; pub const VM_HEAPBLOCKS_PER_PAGE: u32 = (BLCKSZ as usize - SIZEOF_PAGE_HEADER_DATA) as u32 * (8 / 2); // MAPSIZE * (BITS_PER_BYTE / BITS_PER_HEAPBLOCK) +/* From origin.c */ +pub const REPLICATION_STATE_MAGIC: u32 = 0x1257DADE; + // List of subdirectories inside pgdata. // Copied from src/bin/initdb/initdb.c pub const PGDATA_SUBDIRS: [&str; 22] = [ diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 31518f5632..0f057a4368 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -362,6 +362,13 @@ where )); info!("Replication slot {} restart LSN={}", path, restart_lsn); min_restart_lsn = Lsn::min(min_restart_lsn, restart_lsn); + } else if path == "pg_logical/replorigin_checkpoint" { + // replorigin_checkoint is written only on compute shutdown, so it contains + // deteriorated values. So we generate our own version of this file for the particular LSN + // based on information about replorigins extracted from transaction commit records. + // In future we will not generate AUX record for "pg_logical/replorigin_checkpoint" at all, + // but now we should handle (skip) it for backward compatibility. 
+ continue; } let header = new_tar_header(&path, content.len() as u64)?; self.ar @@ -390,6 +397,32 @@ where { self.add_twophase_file(xid).await?; } + let repl_origins = self + .timeline + .get_replorigins(self.lsn, self.ctx) + .await + .map_err(|e| BasebackupError::Server(e.into()))?; + let n_origins = repl_origins.len(); + if n_origins != 0 { + // + // Construct "pg_logical/replorigin_checkpoint" file based on information about replication origins + // extracted from transaction commit record. We are using this file to pass information about replication + // origins to compute to allow logical replication to restart from proper point. + // + let mut content = Vec::with_capacity(n_origins * 16 + 8); + content.extend_from_slice(&pg_constants::REPLICATION_STATE_MAGIC.to_le_bytes()); + for (origin_id, origin_lsn) in repl_origins { + content.extend_from_slice(&origin_id.to_le_bytes()); + content.extend_from_slice(&[0u8; 6]); // align to 8 bytes + content.extend_from_slice(&origin_lsn.0.to_le_bytes()); + } + let crc32 = crc32c::crc32c(&content); + content.extend_from_slice(&crc32.to_le_bytes()); + let header = new_tar_header("pg_logical/replorigin_checkpoint", content.len() as u64)?; + self.ar.append(&header, &*content).await.context( + "could not add pg_logical/replorigin_checkpoint file to basebackup tarball", + )?; + } fail_point!("basebackup-before-control-file", |_| { Err(BasebackupError::Server(anyhow!( diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 764c528a9e..5eaf80bdaf 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -18,16 +18,16 @@ use enum_map::Enum; use itertools::Itertools; use pageserver_api::key::{ dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key, - relmap_file_key, slru_block_to_key, slru_dir_to_key, slru_segment_key_range, - slru_segment_size_to_key, twophase_file_key, twophase_key_range, AUX_FILES_KEY, CHECKPOINT_KEY, - CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY, + relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key, + slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range, + AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY, }; use pageserver_api::keyspace::SparseKeySpace; use pageserver_api::models::AuxFilePolicy; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::BLCKSZ; -use postgres_ffi::{Oid, TimestampTz, TransactionId}; +use postgres_ffi::{Oid, RepOriginId, TimestampTz, TransactionId}; use serde::{Deserialize, Serialize}; use std::collections::{hash_map, HashMap, HashSet}; use std::ops::ControlFlow; @@ -760,6 +760,27 @@ impl Timeline { } } + pub(crate) async fn get_replorigins( + &self, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result, PageReconstructError> { + let kv = self + .scan(KeySpace::single(repl_origin_key_range()), lsn, ctx) + .await + .context("scan")?; + let mut result = HashMap::new(); + for (k, v) in kv { + let v = v.context("get value")?; + let origin_id = k.field6 as RepOriginId; + let origin_lsn = Lsn::des(&v).unwrap(); + if origin_lsn != Lsn::INVALID { + result.insert(origin_id, origin_lsn); + } + } + Ok(result) + } + /// Does the same as get_current_logical_size but counted on demand. /// Used to initialize the logical size tracking on startup. 
/// @@ -885,7 +906,9 @@ impl Timeline { Ok(( result.to_keyspace(), /* AUX sparse key space */ - SparseKeySpace(KeySpace::single(Key::metadata_aux_key_range())), + SparseKeySpace(KeySpace { + ranges: vec![repl_origin_key_range(), Key::metadata_aux_key_range()], + }), )) } @@ -1154,6 +1177,20 @@ impl<'a> DatadirModification<'a> { Ok(()) } + pub async fn set_replorigin( + &mut self, + origin_id: RepOriginId, + origin_lsn: Lsn, + ) -> anyhow::Result<()> { + let key = repl_origin_key(origin_id); + self.put(key, Value::Image(origin_lsn.ser().unwrap().into())); + Ok(()) + } + + pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> anyhow::Result<()> { + self.set_replorigin(origin_id, Lsn::INVALID).await + } + pub fn put_control_file(&mut self, img: Bytes) -> anyhow::Result<()> { self.put(CONTROLFILE_KEY, Value::Image(img)); Ok(()) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 5402c776e3..35e6d1f92f 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3879,22 +3879,25 @@ impl Timeline { return Err(FlushLayerError::Cancelled); } + // FIXME(auxfilesv2): support multiple metadata key partitions might need initdb support as well? + // This code path will not be hit during regression tests. After #7099 we have a single partition + // with two key ranges. If someone wants to fix initdb optimization in the future, this might need + // to be fixed. + // For metadata, always create delta layers. let delta_layer = if !metadata_partition.parts.is_empty() { assert_eq!( metadata_partition.parts.len(), 1, - "currently sparse keyspace should only contain a single aux file keyspace" + "currently sparse keyspace should only contain a single metadata keyspace" ); let metadata_keyspace = &metadata_partition.parts[0]; - assert_eq!( - metadata_keyspace.0.ranges.len(), - 1, - "aux file keyspace should be a single range" - ); self.create_delta_layer( &frozen_layer, - Some(metadata_keyspace.0.ranges[0].clone()), + Some( + metadata_keyspace.0.ranges.first().unwrap().start + ..metadata_keyspace.0.ranges.last().unwrap().end, + ), ctx, ) .await diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 79f075b877..4f26f2f6d1 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -234,6 +234,7 @@ impl WalIngest { modification, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT, + decoded.origin_id, ctx, ) .await?; @@ -246,6 +247,7 @@ impl WalIngest { modification, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT_PREPARED, + decoded.origin_id, ctx, ) .await?; @@ -375,6 +377,18 @@ impl WalIngest { self.checkpoint.oldestActiveXid = xlrec.oldest_running_xid; } } + pg_constants::RM_REPLORIGIN_ID => { + let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + if info == pg_constants::XLOG_REPLORIGIN_SET { + let xlrec = crate::walrecord::XlReploriginSet::decode(&mut buf); + modification + .set_replorigin(xlrec.node_id, xlrec.remote_lsn) + .await? + } else if info == pg_constants::XLOG_REPLORIGIN_DROP { + let xlrec = crate::walrecord::XlReploriginDrop::decode(&mut buf); + modification.drop_replorigin(xlrec.node_id).await? 
+ } + } _x => { // TODO: should probably log & fail here instead of blindly // doing something without understanding the protocol @@ -1178,6 +1192,7 @@ impl WalIngest { modification: &mut DatadirModification<'_>, parsed: &XlXactParsedRecord, is_commit: bool, + origin_id: u16, ctx: &RequestContext, ) -> anyhow::Result<()> { // Record update of CLOG pages @@ -1243,6 +1258,11 @@ impl WalIngest { } } } + if origin_id != 0 { + modification + .set_replorigin(origin_id, parsed.origin_lsn) + .await?; + } Ok(()) } diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index 02f6f49694..205f8dee4d 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -9,10 +9,10 @@ use postgres_ffi::pg_constants; use postgres_ffi::BLCKSZ; use postgres_ffi::{BlockNumber, TimestampTz}; use postgres_ffi::{MultiXactId, MultiXactOffset, MultiXactStatus, Oid, TransactionId}; -use postgres_ffi::{XLogRecord, XLOG_SIZE_OF_XLOG_RECORD}; +use postgres_ffi::{RepOriginId, XLogRecord, XLOG_SIZE_OF_XLOG_RECORD}; use serde::{Deserialize, Serialize}; use tracing::*; -use utils::bin_ser::DeserializeError; +use utils::{bin_ser::DeserializeError, lsn::Lsn}; /// Each update to a page is represented by a NeonWalRecord. It can be a wrapper /// around a PostgreSQL WAL record, or a custom neon-specific "record". @@ -116,6 +116,7 @@ pub struct DecodedWALRecord { pub blocks: Vec, pub main_data_offset: usize, + pub origin_id: u16, } #[repr(C)] @@ -573,6 +574,7 @@ pub struct XlXactParsedRecord { pub subxacts: Vec, pub xnodes: Vec, + pub origin_lsn: Lsn, } impl XlXactParsedRecord { @@ -651,6 +653,11 @@ impl XlXactParsedRecord { debug!("XLOG_XACT_COMMIT-XACT_XINFO_HAS_TWOPHASE xid {}", xid); } + let origin_lsn = if xinfo & pg_constants::XACT_XINFO_HAS_ORIGIN != 0 { + Lsn(buf.get_u64_le()) + } else { + Lsn::INVALID + }; XlXactParsedRecord { xid, info, @@ -660,6 +667,7 @@ impl XlXactParsedRecord { ts_id, subxacts, xnodes, + origin_lsn, } } } @@ -810,6 +818,36 @@ impl XlRunningXacts { } } +#[repr(C)] +#[derive(Debug)] +pub struct XlReploriginDrop { + pub node_id: RepOriginId, +} + +impl XlReploriginDrop { + pub fn decode(buf: &mut Bytes) -> XlReploriginDrop { + XlReploriginDrop { + node_id: buf.get_u16_le(), + } + } +} + +#[repr(C)] +#[derive(Debug)] +pub struct XlReploriginSet { + pub remote_lsn: Lsn, + pub node_id: RepOriginId, +} + +impl XlReploriginSet { + pub fn decode(buf: &mut Bytes) -> XlReploriginSet { + XlReploriginSet { + remote_lsn: Lsn(buf.get_u64_le()), + node_id: buf.get_u16_le(), + } + } +} + /// Main routine to decode a WAL record and figure out which blocks are modified // // See xlogrecord.h for details @@ -844,6 +882,7 @@ pub fn decode_wal_record( let mut rnode_dbnode: u32 = 0; let mut rnode_relnode: u32 = 0; let mut got_rnode = false; + let mut origin_id: u16 = 0; let mut buf = record.clone(); @@ -891,7 +930,7 @@ pub fn decode_wal_record( pg_constants::XLR_BLOCK_ID_ORIGIN => { // RepOriginId is uint16 - buf.advance(2); + origin_id = buf.get_u16_le(); } pg_constants::XLR_BLOCK_ID_TOPLEVEL_XID => { @@ -1088,6 +1127,7 @@ pub fn decode_wal_record( decoded.xl_info = xlogrec.xl_info; decoded.xl_rmid = xlogrec.xl_rmid; decoded.record = record; + decoded.origin_id = origin_id; decoded.main_data_offset = main_data_offset; Ok(()) diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 9772e2d106..b2e4d35cb8 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -81,8 +81,10 @@ page_cache_size=10 non_vectored_sum 
= metrics.query_one("pageserver_layers_visited_per_read_global_sum") non_vectored_count = metrics.query_one("pageserver_layers_visited_per_read_global_count") - non_vectored_average = non_vectored_sum.value / non_vectored_count.value - + if non_vectored_count.value != 0: + non_vectored_average = non_vectored_sum.value / non_vectored_count.value + else: + non_vectored_average = 0 vectored_sum = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_sum") vectored_count = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_count") if vectored_count.value > 0: diff --git a/test_runner/regress/test_subscriber_restart.py b/test_runner/regress/test_subscriber_restart.py new file mode 100644 index 0000000000..d7f3962620 --- /dev/null +++ b/test_runner/regress/test_subscriber_restart.py @@ -0,0 +1,57 @@ +import threading +import time + +from fixtures.neon_fixtures import NeonEnv +from fixtures.utils import wait_until + + +# This test checks of logical replication subscriber is able to correctly restart replication without receiving duplicates. +# It requires tracking information about replication origins at page server side +def test_subscriber_restart(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("publisher") + pub = env.endpoints.create("publisher") + pub.start() + + env.neon_cli.create_branch("subscriber") + sub = env.endpoints.create("subscriber") + sub.start() + + n_records = 100000 + n_restarts = 100 + + def check_that_changes_propagated(): + scur.execute("SELECT count(*) FROM t") + res = scur.fetchall() + assert res[0][0] == n_records + + def insert_data(pub): + with pub.cursor() as pcur: + for i in range(0, n_records): + pcur.execute("INSERT into t values (%s,random()*100000)", (i,)) + + with pub.cursor() as pcur: + with sub.cursor() as scur: + pcur.execute("CREATE TABLE t (pk integer primary key, sk integer)") + pcur.execute("CREATE PUBLICATION pub FOR TABLE t") + scur.execute("CREATE TABLE t (pk integer primary key, sk integer)") + # scur.execute("CREATE INDEX on t(sk)") # slowdown applying WAL at replica + pub_conn = f"host=localhost port={pub.pg_port} dbname=postgres user=cloud_admin" + query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_conn}' PUBLICATION pub" + scur.execute(query) + time.sleep(2) # let initial table sync complete + + thread = threading.Thread(target=insert_data, args=(pub,), daemon=True) + thread.start() + + for _ in range(n_restarts): + # restart subscriber + # time.sleep(2) + sub.stop("immediate") + sub.start() + + thread.join() + pcur.execute(f"INSERT into t values ({n_records}, 0)") + n_records += 1 + with sub.cursor() as scur: + wait_until(10, 0.5, check_that_changes_propagated) From 69026a9a364bfe5d944c8443fea5f3cdc2d1f7e2 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 3 Jun 2024 19:13:01 +0100 Subject: [PATCH 0915/1571] storcon_cli: add 'drop' and eviction interval utilities (#7938) The storage controller has 'drop' APIs for tenants and nodes, for use in situations where something weird has happened: - node-drop is useful until we implement proper node decom, or if we have a partially provisioned node that somehow gets registered with the storage controller but is then dead. - tenant-drop is useful if we accidentally add a tenant that shouldn't be there at all, or if we want to make the controller forget about a tenant without deleting its data. For example, if one uses the tenant-warmup command with a bad tenant ID and needs to clean that up. 
The drop commands require an `--unsafe` parameter, to reduce the chance that someone incorrectly assumes these are the normal/clean ways to delete things. This PR also adds a convenience command for setting the time based eviction parameters on a tenant. This is useful when onboarding an existing tenant that has high resident size due to storage amplification in compaction: setting a lower time based eviction threshold brings down the resident size ahead of doing a shard split. --- Cargo.lock | 1 + control_plane/storcon_cli/Cargo.toml | 1 + control_plane/storcon_cli/src/main.rs | 67 ++++++++++++++++++++++++++- 3 files changed, 67 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 84d919b817..dbbf330cf9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5820,6 +5820,7 @@ dependencies = [ "anyhow", "clap", "comfy-table", + "humantime", "hyper 0.14.26", "pageserver_api", "pageserver_client", diff --git a/control_plane/storcon_cli/Cargo.toml b/control_plane/storcon_cli/Cargo.toml index 61eb7fa4e4..ed3462961f 100644 --- a/control_plane/storcon_cli/Cargo.toml +++ b/control_plane/storcon_cli/Cargo.toml @@ -9,6 +9,7 @@ license.workspace = true anyhow.workspace = true clap.workspace = true comfy-table.workspace = true +humantime.workspace = true hyper.workspace = true pageserver_api.workspace = true pageserver_client.workspace = true diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index c19bc96cdb..05c4acdf90 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -7,8 +7,9 @@ use pageserver_api::{ TenantDescribeResponse, TenantPolicyRequest, }, models::{ - LocationConfigSecondary, ShardParameters, TenantConfig, TenantConfigRequest, - TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse, + EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary, + ShardParameters, TenantConfig, TenantConfigRequest, TenantCreateRequest, + TenantShardSplitRequest, TenantShardSplitResponse, }, shard::{ShardStripeSize, TenantShardId}, }; @@ -125,6 +126,28 @@ enum Command { #[arg(long)] tenant_id: TenantId, }, + /// Uncleanly drop a tenant from the storage controller: this doesn't delete anything from pageservers. Appropriate + /// if you e.g. used `tenant-warmup` by mistake on a tenant ID that doesn't really exist, or is in some other region. + TenantDrop { + #[arg(long)] + tenant_id: TenantId, + #[arg(long)] + unclean: bool, + }, + NodeDrop { + #[arg(long)] + node_id: NodeId, + #[arg(long)] + unclean: bool, + }, + TenantSetTimeBasedEviction { + #[arg(long)] + tenant_id: TenantId, + #[arg(long)] + period: humantime::Duration, + #[arg(long)] + threshold: humantime::Duration, + }, } #[derive(Parser)] @@ -674,6 +697,46 @@ async fn main() -> anyhow::Result<()> { } } } + Command::TenantDrop { tenant_id, unclean } => { + if !unclean { + anyhow::bail!("This command is not a tenant deletion, and uncleanly drops all controller state for the tenant. If you know what you're doing, add `--unclean` to proceed.") + } + storcon_client + .dispatch::<(), ()>( + Method::POST, + format!("debug/v1/tenant/{tenant_id}/drop"), + None, + ) + .await?; + } + Command::NodeDrop { node_id, unclean } => { + if !unclean { + anyhow::bail!("This command is not a clean node decommission, and uncleanly drops all controller state for the node, without checking if any tenants still refer to it. 
If you know what you're doing, add `--unclean` to proceed.") + } + storcon_client + .dispatch::<(), ()>(Method::POST, format!("debug/v1/node/{node_id}/drop"), None) + .await?; + } + Command::TenantSetTimeBasedEviction { + tenant_id, + period, + threshold, + } => { + vps_client + .tenant_config(&TenantConfigRequest { + tenant_id, + config: TenantConfig { + eviction_policy: Some(EvictionPolicy::LayerAccessThreshold( + EvictionPolicyLayerAccessThreshold { + period: period.into(), + threshold: threshold.into(), + }, + )), + ..Default::default() + }, + }) + .await?; + } } Ok(()) From 11bb265de1aff794d0945c7e9c888d87f7d13824 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 3 Jun 2024 21:10:13 +0100 Subject: [PATCH 0916/1571] pageserver: don't squash all image layer generation errors into anyhow::Error (#7943) ## Problem CreateImageLayersError and CompactionError had proper From implementations, but compact_legacy was explicitly squashing all image layer errors into an anyhow::Error anyway. This led to errors like: ``` Error processing HTTP request: InternalServerError(timeline shutting down Stack backtrace: 0: <>::from as core::ops::function::FnOnce<(pageserver::tenant::timeline::CreateImageLayersError,)>>::call_once at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6/library/core/src/ops/function.rs:250:5 1: , pageserver::tenant::timeline::CreateImageLayersError>>::map_err::>::from> at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6/library/core/src/result.rs:829:27 2: ::compact_legacy::{closure#0} at pageserver/src/tenant/timeline/compaction.rs:125:36 3: ::compact::{closure#0} at pageserver/src/tenant/timeline.rs:1719:84 4: pageserver::http::routes::timeline_checkpoint_handler::{closure#0}::{closure#0} ``` Closes: https://github.com/neondatabase/neon/issues/7861 --- pageserver/src/tenant/timeline/compaction.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 07a12f535a..15c77d0316 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -133,8 +133,7 @@ impl Timeline { }, &image_ctx, ) - .await - .map_err(anyhow::Error::from)?; + .await?; self.upload_new_image_layers(image_layers)?; partitioning.parts.len() From 00032c9d9fff0dab5c69e612166c10e5245b43a4 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Tue, 4 Jun 2024 06:07:54 +0200 Subject: [PATCH 0917/1571] [proxy] Fix dynamic rate limiter (#7950) ## Problem There was a bug in dynamic rate limiter, which exhausted CPU in proxy and proxy wasn't able to accept any connections. ## Summary of changes 1. `if self.available > 1` -> `if self.available >= 1` 2. remove `timeout_at` to use just timeout 3. remove potential infinite loops which can exhaust CPUs. 
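For illustration only, a minimal standalone sketch of why change 1 matters (this is not the proxy's actual `LimiterInner`, which also tracks in-flight requests under a mutex and notifies waiting acquirers; all names here are hypothetical): with the old `> 1` check, the last available permit is never handed out, so a limiter configured with `initial_limit: 1` can never grant a token.

```rust
// Hypothetical, simplified permit accounting; names do not match the real code.
struct Permits {
    available: usize,
    in_flight: usize,
}

impl Permits {
    /// Pre-fix condition: refuses to grant the final available permit.
    fn take_old(&mut self) -> Option<()> {
        if self.available > 1 {
            self.available -= 1;
            self.in_flight += 1;
            Some(())
        } else {
            None
        }
    }

    /// Post-fix condition: a single remaining permit can still be acquired.
    fn take_fixed(&mut self) -> Option<()> {
        if self.available >= 1 {
            self.available -= 1;
            self.in_flight += 1;
            Some(())
        } else {
            None
        }
    }
}

fn main() {
    let mut p = Permits { available: 1, in_flight: 0 };
    assert!(p.take_old().is_none()); // a limit of 1 is unusable before the fix
    assert!(p.take_fixed().is_some()); // and usable after it
    assert_eq!((p.available, p.in_flight), (0, 1));
}
```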
--- proxy/src/console/provider.rs | 4 +- proxy/src/rate_limiter/limit_algorithm.rs | 38 +++----- .../src/rate_limiter/limit_algorithm/aimd.rs | 88 ++++++++++++++++++- 3 files changed, 102 insertions(+), 28 deletions(-) diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 4d074f98a5..634ec9042c 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -452,7 +452,7 @@ pub struct ApiLocks { #[derive(Debug, thiserror::Error)] pub enum ApiLockError { - #[error("permit could not be acquired")] + #[error("timeout acquiring resource permit")] TimeoutError(#[from] tokio::time::error::Elapsed), } @@ -504,7 +504,7 @@ impl ApiLocks { .clone() } }; - let permit = semaphore.acquire_deadline(now + self.timeout).await; + let permit = semaphore.acquire_timeout(self.timeout).await; self.metrics .semaphore_acquire_seconds diff --git a/proxy/src/rate_limiter/limit_algorithm.rs b/proxy/src/rate_limiter/limit_algorithm.rs index 072fdb80b0..3842ce269e 100644 --- a/proxy/src/rate_limiter/limit_algorithm.rs +++ b/proxy/src/rate_limiter/limit_algorithm.rs @@ -3,7 +3,7 @@ use parking_lot::Mutex; use std::{pin::pin, sync::Arc, time::Duration}; use tokio::{ sync::Notify, - time::{error::Elapsed, timeout_at, Instant}, + time::{error::Elapsed, Instant}, }; use self::aimd::Aimd; @@ -80,7 +80,7 @@ pub struct LimiterInner { } impl LimiterInner { - fn update(&mut self, latency: Duration, outcome: Option) { + fn update_limit(&mut self, latency: Duration, outcome: Option) { if let Some(outcome) = outcome { let sample = Sample { latency, @@ -92,12 +92,12 @@ impl LimiterInner { } fn take(&mut self, ready: &Notify) -> Option<()> { - if self.available > 1 { + if self.available >= 1 { self.available -= 1; self.in_flight += 1; // tell the next in the queue that there is a permit ready - if self.available > 1 { + if self.available >= 1 { ready.notify_one(); } Some(()) @@ -157,16 +157,12 @@ impl DynamicLimiter { } /// Try to acquire a concurrency [Token], waiting for `duration` if there are none available. - /// - /// Returns `None` if there are none available after `duration`. pub async fn acquire_timeout(self: &Arc, duration: Duration) -> Result { - self.acquire_deadline(Instant::now() + duration).await + tokio::time::timeout(duration, self.acquire()).await? } - /// Try to acquire a concurrency [Token], waiting until `deadline` if there are none available. - /// - /// Returns `None` if there are none available after `deadline`. - pub async fn acquire_deadline(self: &Arc, deadline: Instant) -> Result { + /// Try to acquire a concurrency [Token]. + async fn acquire(self: &Arc) -> Result { if self.config.initial_limit == 0 { // If the rate limiter is disabled, we can always acquire a token. 
Ok(Token::disabled()) @@ -174,22 +170,16 @@ impl DynamicLimiter { let mut notified = pin!(self.ready.notified()); let mut ready = notified.as_mut().enable(); loop { - let mut limit = None; if ready { let mut inner = self.inner.lock(); if inner.take(&self.ready).is_some() { break Ok(Token::new(self.clone())); - } - limit = Some(inner.limit); - } - match timeout_at(deadline, notified.as_mut()).await { - Ok(()) => ready = true, - Err(e) => { - let limit = limit.unwrap_or_else(|| self.inner.lock().limit); - tracing::info!(limit, "could not acquire token in time"); - break Err(e); + } else { + notified.set(self.ready.notified()); } } + notified.as_mut().await; + ready = true; } } } @@ -208,14 +198,14 @@ impl DynamicLimiter { let mut inner = self.inner.lock(); - inner.update(start.elapsed(), outcome); + inner.update_limit(start.elapsed(), outcome); + + inner.in_flight -= 1; if inner.in_flight < inner.limit { inner.available = inner.limit - inner.in_flight; // At least 1 permit is now available self.ready.notify_one(); } - - inner.in_flight -= 1; } /// The current state of the limiter. diff --git a/proxy/src/rate_limiter/limit_algorithm/aimd.rs b/proxy/src/rate_limiter/limit_algorithm/aimd.rs index 370d4be802..ccc9c42420 100644 --- a/proxy/src/rate_limiter/limit_algorithm/aimd.rs +++ b/proxy/src/rate_limiter/limit_algorithm/aimd.rs @@ -51,7 +51,9 @@ impl LimitAlgorithm for Aimd { // E.g. round(2 * 0.9) = 2, but floor(2 * 0.9) = 1 let limit = limit.floor() as usize; - limit.clamp(self.min, self.max) + let limit = limit.clamp(self.min, self.max); + tracing::info!(limit, "limit decreased"); + limit } } } @@ -67,6 +69,53 @@ mod tests { use super::*; + #[tokio::test(start_paused = true)] + async fn increase_decrease() { + let config = RateLimiterConfig { + initial_limit: 1, + algorithm: RateLimitAlgorithm::Aimd { + conf: Aimd { + min: 1, + max: 2, + inc: 10, + dec: 0.5, + utilisation: 0.8, + }, + }, + }; + + let limiter = DynamicLimiter::new(config); + + let token = limiter + .acquire_timeout(Duration::from_millis(1)) + .await + .unwrap(); + token.release(Outcome::Success); + + assert_eq!(limiter.state().limit(), 2); + + let token = limiter + .acquire_timeout(Duration::from_millis(1)) + .await + .unwrap(); + token.release(Outcome::Success); + assert_eq!(limiter.state().limit(), 2); + + let token = limiter + .acquire_timeout(Duration::from_millis(1)) + .await + .unwrap(); + token.release(Outcome::Overload); + assert_eq!(limiter.state().limit(), 1); + + let token = limiter + .acquire_timeout(Duration::from_millis(1)) + .await + .unwrap(); + token.release(Outcome::Overload); + assert_eq!(limiter.state().limit(), 1); + } + #[tokio::test(start_paused = true)] async fn should_decrease_limit_on_overload() { let config = RateLimiterConfig { @@ -85,7 +134,7 @@ mod tests { let limiter = DynamicLimiter::new(config); let token = limiter - .acquire_timeout(Duration::from_millis(1)) + .acquire_timeout(Duration::from_millis(100)) .await .unwrap(); token.release(Outcome::Overload); @@ -93,6 +142,41 @@ mod tests { assert_eq!(limiter.state().limit(), 5, "overload: decrease"); } + #[tokio::test(start_paused = true)] + async fn acquire_timeout_times_out() { + let config = RateLimiterConfig { + initial_limit: 1, + algorithm: RateLimitAlgorithm::Aimd { + conf: Aimd { + min: 1, + max: 2, + inc: 10, + dec: 0.5, + utilisation: 0.8, + }, + }, + }; + + let limiter = DynamicLimiter::new(config); + + let token = limiter + .acquire_timeout(Duration::from_millis(1)) + .await + .unwrap(); + let now = tokio::time::Instant::now(); + 
limiter + .acquire_timeout(Duration::from_secs(1)) + .await + .err() + .unwrap(); + + assert!(now.elapsed() >= Duration::from_secs(1)); + + token.release(Outcome::Success); + + assert_eq!(limiter.state().limit(), 2); + } + #[tokio::test(start_paused = true)] async fn should_increase_limit_on_success_when_using_gt_util_threshold() { let config = RateLimiterConfig { From 387a36874c9bd145982c5ee3c5a3d47af16344d7 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 4 Jun 2024 11:56:03 +0300 Subject: [PATCH 0918/1571] Set page LSN when reconstructing VM in page server (#7935) ## Problem Page LSN is not set while VM update. May be reason of test_vm_bits flukyness. Buit more serious issues can be also caused by wrong LSN. Related: https://github.com/neondatabase/neon/pull/7935 ## Summary of changes - In `apply_in_neon`, set the LSN bytes when applying records of type `ClearVisibilityMapFlags` --- pageserver/src/tenant.rs | 4 ++-- pageserver/src/walredo.rs | 4 ++-- pageserver/src/walredo/apply_neon.rs | 6 +++++- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 7ca829535b..19a0f59b2a 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3992,8 +3992,8 @@ pub(crate) mod harness { let base_img = base_img.expect("Neon WAL redo requires base image").1; let mut page = BytesMut::new(); page.extend_from_slice(&base_img); - for (_record_lsn, record) in records { - apply_neon::apply_in_neon(&record, key, &mut page)?; + for (record_lsn, record) in records { + apply_neon::apply_in_neon(&record, record_lsn, key, &mut page)?; } Ok(page.freeze()) } else { diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 1d72a97688..d660b68a34 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -361,10 +361,10 @@ impl PostgresRedoManager { &self, key: Key, page: &mut BytesMut, - _record_lsn: Lsn, + record_lsn: Lsn, record: &NeonWalRecord, ) -> anyhow::Result<()> { - apply_neon::apply_in_neon(record, key, page)?; + apply_neon::apply_in_neon(record, record_lsn, key, page)?; Ok(()) } diff --git a/pageserver/src/walredo/apply_neon.rs b/pageserver/src/walredo/apply_neon.rs index 695894a924..24e8d8b01c 100644 --- a/pageserver/src/walredo/apply_neon.rs +++ b/pageserver/src/walredo/apply_neon.rs @@ -14,6 +14,7 @@ use postgres_ffi::v14::nonrelfile_utils::{ use postgres_ffi::BLCKSZ; use tracing::*; use utils::bin_ser::BeSer; +use utils::lsn::Lsn; /// Can this request be served by neon redo functions /// or we need to pass it to wal-redo postgres process? 
@@ -32,6 +33,7 @@ pub(crate) fn can_apply_in_neon(rec: &NeonWalRecord) -> bool { pub(crate) fn apply_in_neon( record: &NeonWalRecord, + lsn: Lsn, key: Key, page: &mut BytesMut, ) -> Result<(), anyhow::Error> { @@ -67,6 +69,7 @@ pub(crate) fn apply_in_neon( let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..]; map[map_byte as usize] &= !(flags << map_offset); + postgres_ffi::page_set_lsn(page, lsn); } // Repeat for 'old_heap_blkno', if any @@ -80,6 +83,7 @@ pub(crate) fn apply_in_neon( let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..]; map[map_byte as usize] &= !(flags << map_offset); + postgres_ffi::page_set_lsn(page, lsn); } } // Non-relational WAL records are handled here, with custom code that has the @@ -285,7 +289,7 @@ mod test { let mut page = BytesMut::from_iter(base_image); for record in deltas { - apply_in_neon(&record, file_path, &mut page)?; + apply_in_neon(&record, Lsn(8), file_path, &mut page)?; } let reconstructed = AuxFilesDirectory::des(&page)?; From 0acb604fa3793a0ae99238c026bf3c40391ae461 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 4 Jun 2024 14:19:36 +0300 Subject: [PATCH 0919/1571] test: no missed wakeups, cancellation and timeout flow to downloads (#7863) I suspected a wakeup could be lost with `remote_storage::support::DownloadStream` if the cancellation and inner stream wakeups happen simultaneously. The next poll would only return the cancellation error without setting the wakeup. There is no lost wakeup because the single future for getting the cancellation error is consumed when the value is ready, and a new future is created for the *next* value. The new future is always polled. Similarly, if only the `Stream::poll_next` is being used after a `Some(_)` value has been yielded, it makes no sense to have an expectation of a wakeup for the *(N+1)th* stream value already set because when a value is wanted, `Stream::poll_next` will be called. A test is added to show that the above is true. Additionally, there was a question of these cancellations and timeouts flowing to attached or secondary tenant downloads. A test is added to show that this, in fact, happens. Lastly, a warning message is logged when a download stream is polled after a timeout or cancellation error (currently unexpected) so we can rule it out while troubleshooting. --- libs/remote_storage/src/support.rs | 50 ++++- .../tenant/remote_timeline_client/download.rs | 5 + pageserver/src/tenant/secondary/downloader.rs | 7 +- pageserver/src/tenant/secondary/scheduler.rs | 7 +- test_runner/fixtures/remote_storage.py | 5 + test_runner/regress/test_ondemand_download.py | 201 +++++++++++++++++- 6 files changed, 263 insertions(+), 12 deletions(-) diff --git a/libs/remote_storage/src/support.rs b/libs/remote_storage/src/support.rs index d146b5445b..1ed9ed9305 100644 --- a/libs/remote_storage/src/support.rs +++ b/libs/remote_storage/src/support.rs @@ -78,6 +78,10 @@ where let e = Err(std::io::Error::from(e)); return Poll::Ready(Some(e)); } + } else { + // this would be perfectly valid behaviour for doing a graceful completion on the + // download for example, but not one we expect to do right now. + tracing::warn!("continuing polling after having cancelled or timeouted"); } this.inner.poll_next(cx) @@ -89,13 +93,22 @@ where } /// Fires only on the first cancel or timeout, not on both. -pub(crate) async fn cancel_or_timeout( +pub(crate) fn cancel_or_timeout( timeout: Duration, cancel: CancellationToken, -) -> TimeoutOrCancel { - tokio::select! 
{ - _ = tokio::time::sleep(timeout) => TimeoutOrCancel::Timeout, - _ = cancel.cancelled() => TimeoutOrCancel::Cancel, +) -> impl std::future::Future + 'static { + // futures are lazy, they don't do anything before being polled. + // + // "precalculate" the wanted deadline before returning the future, so that we can use pause + // failpoint to trigger a timeout in test. + let deadline = tokio::time::Instant::now() + timeout; + async move { + tokio::select! { + _ = tokio::time::sleep_until(deadline) => TimeoutOrCancel::Timeout, + _ = cancel.cancelled() => { + TimeoutOrCancel::Cancel + }, + } } } @@ -172,4 +185,31 @@ mod tests { _ = tokio::time::sleep(Duration::from_secs(121)) => {}, } } + + #[tokio::test] + async fn notified_but_pollable_after() { + let inner = futures::stream::once(futures::future::ready(Ok(bytes::Bytes::from_static( + b"hello world", + )))); + let timeout = Duration::from_secs(120); + let cancel = CancellationToken::new(); + + cancel.cancel(); + let stream = DownloadStream::new(cancel_or_timeout(timeout, cancel.clone()), inner); + let mut stream = std::pin::pin!(stream); + + let next = stream.next().await; + let ioe = next.unwrap().unwrap_err(); + assert!( + matches!( + ioe.get_ref().unwrap().downcast_ref::(), + Some(&DownloadError::Cancelled) + ), + "{ioe:?}" + ); + + let next = stream.next().await; + let bytes = next.unwrap().unwrap(); + assert_eq!(&b"hello world"[..], bytes); + } } diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index bd75f980e8..d0385e4aee 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -28,6 +28,7 @@ use crate::TEMP_FILE_SUFFIX; use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode, RemotePath}; use utils::crashsafe::path_with_suffix_extension; use utils::id::{TenantId, TimelineId}; +use utils::pausable_failpoint; use super::index::{IndexPart, LayerFileMetadata}; use super::{ @@ -152,6 +153,8 @@ async fn download_object<'a>( let download = storage.download(src_path, cancel).await?; + pausable_failpoint!("before-downloading-layer-stream-pausable"); + let mut buf_writer = tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file); @@ -199,6 +202,8 @@ async fn download_object<'a>( let mut download = storage.download(src_path, cancel).await?; + pausable_failpoint!("before-downloading-layer-stream-pausable"); + // TODO: use vectored write (writev) once supported by tokio-epoll-uring. // There's chunks_vectored() on the stream. let (bytes_amount, destination_file) = async { diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 5c915d6b53..62803c7838 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -1000,7 +1000,7 @@ impl<'a> TenantDownloader<'a> { layer.name, layer.metadata.file_size ); - let downloaded_bytes = match download_layer_file( + let downloaded_bytes = download_layer_file( self.conf, self.remote_storage, *tenant_shard_id, @@ -1011,8 +1011,9 @@ impl<'a> TenantDownloader<'a> { &self.secondary_state.cancel, ctx, ) - .await - { + .await; + + let downloaded_bytes = match downloaded_bytes { Ok(bytes) => bytes, Err(DownloadError::NotFound) => { // A heatmap might be out of date and refer to a layer that doesn't exist any more. 
diff --git a/pageserver/src/tenant/secondary/scheduler.rs b/pageserver/src/tenant/secondary/scheduler.rs index 0ec1c7872a..28cf2125df 100644 --- a/pageserver/src/tenant/secondary/scheduler.rs +++ b/pageserver/src/tenant/secondary/scheduler.rs @@ -334,8 +334,11 @@ where let tenant_shard_id = job.get_tenant_shard_id(); let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) { - tracing::info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), - "Command already running, waiting for it"); + tracing::info!( + tenant_id=%tenant_shard_id.tenant_id, + shard_id=%tenant_shard_id.shard_slug(), + "Command already running, waiting for it" + ); barrier } else { let running = self.spawn_now(job); diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index ee18c53b52..6f6526d3fc 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -171,6 +171,8 @@ class S3Storage: """Is this MOCK_S3 (false) or REAL_S3 (true)""" real: bool endpoint: Optional[str] = None + """formatting deserialized with humantime crate, for example "1s".""" + custom_timeout: Optional[str] = None def access_env_vars(self) -> Dict[str, str]: if self.aws_profile is not None: @@ -208,6 +210,9 @@ class S3Storage: if self.endpoint is not None: rv["endpoint"] = self.endpoint + if self.custom_timeout is not None: + rv["timeout"] = self.custom_timeout + return rv def to_toml_inline_table(self) -> str: diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 6fe23846c7..4a25dfd874 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -3,8 +3,10 @@ import time from collections import defaultdict +from concurrent.futures import ThreadPoolExecutor from typing import Any, DefaultDict, Dict, Tuple +import pytest from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import ( @@ -13,7 +15,7 @@ from fixtures.neon_fixtures import ( last_flush_lsn_upload, wait_for_last_flush_lsn, ) -from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient from fixtures.pageserver.utils import ( assert_tenant_state, wait_for_last_record_lsn, @@ -21,7 +23,7 @@ from fixtures.pageserver.utils import ( wait_for_upload_queue_empty, wait_until_tenant_active, ) -from fixtures.remote_storage import RemoteStorageKind +from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage from fixtures.utils import query_scalar, wait_until @@ -656,5 +658,200 @@ def test_compaction_downloads_on_demand_with_image_creation(neon_env_builder: Ne assert dict(kinds_after) == {"Delta": 4, "Image": 1} +def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBuilder): + """ + Demonstrates that tenant shutdown will cancel on-demand download and secondary doing warmup. 
+ """ + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + + # turn off background tasks so that they don't interfere with the downloads + env = neon_env_builder.init_start( + initial_tenant_conf={ + "gc_period": "0s", + "compaction_period": "0s", + } + ) + client = env.pageserver.http_client() + failpoint = "before-downloading-layer-stream-pausable" + client.configure_failpoints((failpoint, "pause")) + + env.pageserver.allowed_errors.extend( + [ + ".*downloading failed, possibly for shutdown.*", + ] + ) + + info = client.layer_map_info(env.initial_tenant, env.initial_timeline) + assert len(info.delta_layers()) == 1 + + layer = info.delta_layers()[0] + + client.tenant_heatmap_upload(env.initial_tenant) + + # evict the initdb layer so we can download it + client.evict_layer(env.initial_tenant, env.initial_timeline, layer.layer_file_name) + + with ThreadPoolExecutor(max_workers=2) as exec: + download = exec.submit( + client.download_layer, + env.initial_tenant, + env.initial_timeline, + layer.layer_file_name, + ) + + _, offset = wait_until( + 20, 0.5, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") + ) + + location_conf = {"mode": "Detached", "tenant_conf": {}} + # assume detach removes the layers + detach = exec.submit(client.tenant_location_conf, env.initial_tenant, location_conf) + + _, offset = wait_until( + 20, + 0.5, + lambda: env.pageserver.assert_log_contains( + "closing is taking longer than expected", offset + ), + ) + + client.configure_failpoints((failpoint, "off")) + + with pytest.raises( + PageserverApiException, match="downloading failed, possibly for shutdown" + ): + download.result() + + env.pageserver.assert_log_contains(".*downloading failed, possibly for shutdown.*") + + detach.result() + + client.configure_failpoints((failpoint, "pause")) + + _, offset = wait_until( + 20, + 0.5, + lambda: env.pageserver.assert_log_contains(f"cfg failpoint: {failpoint} pause", offset), + ) + + location_conf = { + "mode": "Secondary", + "secondary_conf": {"warm": True}, + "tenant_conf": {}, + } + + client.tenant_location_conf(env.initial_tenant, location_conf) + + warmup = exec.submit(client.tenant_secondary_download, env.initial_tenant, wait_ms=30000) + + _, offset = wait_until( + 20, + 0.5, + lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}", offset), + ) + + client.configure_failpoints((failpoint, "off")) + location_conf = {"mode": "Detached", "tenant_conf": {}} + client.tenant_location_conf(env.initial_tenant, location_conf) + + client.configure_failpoints((failpoint, "off")) + + # here we have nothing in the log, but we see that the warmup and conf location update worked + warmup.result() + + +def test_layer_download_timeouted(neon_env_builder: NeonEnvBuilder): + """ + Pause using a pausable_failpoint longer than the client timeout to simulate the timeout happening. 
+ """ + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + assert isinstance(neon_env_builder.pageserver_remote_storage, S3Storage) + neon_env_builder.pageserver_remote_storage.custom_timeout = "1s" + + # turn off background tasks so that they don't interfere with the downloads + env = neon_env_builder.init_start( + initial_tenant_conf={ + "gc_period": "0s", + "compaction_period": "0s", + } + ) + client = env.pageserver.http_client() + failpoint = "before-downloading-layer-stream-pausable" + client.configure_failpoints((failpoint, "pause")) + + info = client.layer_map_info(env.initial_tenant, env.initial_timeline) + assert len(info.delta_layers()) == 1 + + layer = info.delta_layers()[0] + + client.tenant_heatmap_upload(env.initial_tenant) + + # evict so we can download it + client.evict_layer(env.initial_tenant, env.initial_timeline, layer.layer_file_name) + + with ThreadPoolExecutor(max_workers=2) as exec: + download = exec.submit( + client.download_layer, + env.initial_tenant, + env.initial_timeline, + layer.layer_file_name, + ) + + _, offset = wait_until( + 20, 0.5, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") + ) + # ensure enough time while paused to trip the timeout + time.sleep(2) + + client.configure_failpoints((failpoint, "off")) + download.result() + + _, offset = env.pageserver.assert_log_contains( + ".*failed, will retry \\(attempt 0\\): timeout.*" + ) + _, offset = env.pageserver.assert_log_contains(".*succeeded after [0-9]+ retries.*", offset) + + client.evict_layer(env.initial_tenant, env.initial_timeline, layer.layer_file_name) + + client.configure_failpoints((failpoint, "pause")) + + # capture the next offset for a new synchronization with the failpoint + _, offset = wait_until( + 20, + 0.5, + lambda: env.pageserver.assert_log_contains(f"cfg failpoint: {failpoint} pause", offset), + ) + + location_conf = { + "mode": "Secondary", + "secondary_conf": {"warm": True}, + "tenant_conf": {}, + } + + client.tenant_location_conf( + env.initial_tenant, + location_conf, + ) + + started = time.time() + + warmup = exec.submit(client.tenant_secondary_download, env.initial_tenant, wait_ms=30000) + # ensure enough time while paused to trip the timeout + time.sleep(2) + + client.configure_failpoints((failpoint, "off")) + + warmup.result() + + elapsed = time.time() - started + + _, offset = env.pageserver.assert_log_contains( + ".*failed, will retry \\(attempt 0\\): timeout.*", offset + ) + _, offset = env.pageserver.assert_log_contains(".*succeeded after [0-9]+ retries.*", offset) + + assert elapsed < 30, "too long passed: {elapsed=}" + + def stringify(conf: Dict[str, Any]) -> Dict[str, str]: return dict(map(lambda x: (x[0], str(x[1])), conf.items())) From 9d4c113f9ba9aad94e01999d0e4e0c4c366960cf Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 4 Jun 2024 14:42:57 +0300 Subject: [PATCH 0920/1571] build(Dockerfile.compute-node): do not log tar contents (#7953) in build logs we get a lot of lines for building the compute node images because of verbose tar unpack. we know the sha256 so we don't need to log the contents. my hope is that this will allow us more reliably use the github live updating log view. 
--- Dockerfile.compute-node | 70 ++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 87fb218245..db3734047e 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -89,7 +89,7 @@ RUN apt update && \ # SFCGAL > 1.3 requires CGAL > 5.2, Bullseye's libcgal-dev is 5.2 RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \ echo "4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 SFCGAL.tar.gz" | sha256sum --check && \ - mkdir sfcgal-src && cd sfcgal-src && tar xvzf ../SFCGAL.tar.gz --strip-components=1 -C . && \ + mkdir sfcgal-src && cd sfcgal-src && tar xzf ../SFCGAL.tar.gz --strip-components=1 -C . && \ cmake -DCMAKE_BUILD_TYPE=Release . && make -j $(getconf _NPROCESSORS_ONLN) && \ DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \ make clean && cp -R /sfcgal/* / @@ -98,7 +98,7 @@ ENV PATH "/usr/local/pgsql/bin:$PATH" RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \ echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \ - mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \ + mkdir postgis-src && cd postgis-src && tar xzf ../postgis.tar.gz --strip-components=1 -C . && \ find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ ./autogen.sh && \ ./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \ @@ -124,7 +124,7 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postg RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \ echo "cac297c07d34460887c4f3b522b35c470138760fe358e351ad1db4edb6ee306e pgrouting.tar.gz" | sha256sum --check && \ - mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \ + mkdir pgrouting-src && cd pgrouting-src && tar xzf ../pgrouting.tar.gz --strip-components=1 -C . && \ mkdir build && cd build && \ cmake -DCMAKE_BUILD_TYPE=Release .. && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -149,7 +149,7 @@ RUN apt update && \ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \ echo "7096c3290928561f0d4901b7a52794295dc47f6303102fae3f8e42dd575ad97d plv8.tar.gz" | sha256sum --check && \ - mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \ + mkdir plv8-src && cd plv8-src && tar xzf ../plv8.tar.gz --strip-components=1 -C . && \ # generate and copy upgrade scripts mkdir -p upgrade && ./generate_upgrade.sh 3.1.10 && \ cp upgrade/* /usr/local/pgsql/share/extension/ && \ @@ -194,7 +194,7 @@ RUN case "$(uname -m)" in \ RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \ echo "ec99f1f5974846bde64f4513cf8d2ea1b8d172d2218ab41803bf6a63532272bc h3.tar.gz" | sha256sum --check && \ - mkdir h3-src && cd h3-src && tar xvzf ../h3.tar.gz --strip-components=1 -C . && \ + mkdir h3-src && cd h3-src && tar xzf ../h3.tar.gz --strip-components=1 -C . && \ mkdir build && cd build && \ cmake .. 
-DCMAKE_BUILD_TYPE=Release && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -204,7 +204,7 @@ RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \ echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \ - mkdir h3-pg-src && cd h3-pg-src && tar xvzf ../h3-pg.tar.gz --strip-components=1 -C . && \ + mkdir h3-pg-src && cd h3-pg-src && tar xzf ../h3-pg.tar.gz --strip-components=1 -C . && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -222,7 +222,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -O postgresql-unit.tar.gz && \ echo "411d05beeb97e5a4abf17572bfcfbb5a68d98d1018918feff995f6ee3bb03e79 postgresql-unit.tar.gz" | sha256sum --check && \ - mkdir postgresql-unit-src && cd postgresql-unit-src && tar xvzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \ + mkdir postgresql-unit-src && cd postgresql-unit-src && tar xzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ # unit extension's "create extension" script relies on absolute install path to fill some reference tables. @@ -243,12 +243,12 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY patches/pgvector.patch /pgvector.patch -# By default, pgvector Makefile uses `-march=native`. We don't want that, +# By default, pgvector Makefile uses `-march=native`. We don't want that, # because we build the images on different machines than where we run them. # Pass OPTFLAGS="" to remove it. RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.0.tar.gz -O pgvector.tar.gz && \ echo "1b5503a35c265408b6eb282621c5e1e75f7801afc04eecb950796cfee2e3d1d8 pgvector.tar.gz" | sha256sum --check && \ - mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \ + mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \ patch -p1 < /pgvector.patch && \ make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -266,7 +266,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # 9742dab1b2f297ad3811120db7b21451bca2d3c9 made on 13/11/2021 RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b21451bca2d3c9.tar.gz -O pgjwt.tar.gz && \ echo "cfdefb15007286f67d3d45510f04a6a7a495004be5b3aecb12cda667e774203f pgjwt.tar.gz" | sha256sum --check && \ - mkdir pgjwt-src && cd pgjwt-src && tar xvzf ../pgjwt.tar.gz --strip-components=1 -C . && \ + mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control @@ -281,7 +281,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \ echo "0821011743083226fc9b813c1f2ef5897a91901b57b6bea85a78e466187c6819 hypopg.tar.gz" | sha256sum --check && \ - mkdir hypopg-src && cd hypopg-src && tar xvzf ../hypopg.tar.gz --strip-components=1 -C . && \ + mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control @@ -297,7 +297,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \ echo "74576b992d9277c92196dd8d816baa2cc2d8046fe102f3dcd7f3c3febed6822a pg_hashids.tar.gz" | sha256sum --check && \ - mkdir pg_hashids-src && cd pg_hashids-src && tar xvzf ../pg_hashids.tar.gz --strip-components=1 -C . && \ + mkdir pg_hashids-src && cd pg_hashids-src && tar xzf ../pg_hashids.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_hashids.control @@ -313,7 +313,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \ echo "6ab370532c965568df6210bd844ac6ba649f53055e48243525b0b7e5c4d69a7d rum.tar.gz" | sha256sum --check && \ - mkdir rum-src && cd rum-src && tar xvzf ../rum.tar.gz --strip-components=1 -C . && \ + mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control @@ -329,7 +329,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \ echo "9c7c3de67ea41638e14f06da5da57bac6f5bd03fea05c165a0ec862205a5c052 pgtap.tar.gz" | sha256sum --check && \ - mkdir pgtap-src && cd pgtap-src && tar xvzf ../pgtap.tar.gz --strip-components=1 -C . && \ + mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgtap.control @@ -345,7 +345,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \ echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \ - mkdir ip4r-src && cd ip4r-src && tar xvzf ../ip4r.tar.gz --strip-components=1 -C . 
&& \ + mkdir ip4r-src && cd ip4r-src && tar xzf ../ip4r.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/ip4r.control @@ -361,7 +361,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \ echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \ - mkdir prefix-src && cd prefix-src && tar xvzf ../prefix.tar.gz --strip-components=1 -C . && \ + mkdir prefix-src && cd prefix-src && tar xzf ../prefix.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/prefix.control @@ -377,7 +377,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \ echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \ - mkdir hll-src && cd hll-src && tar xvzf ../hll.tar.gz --strip-components=1 -C . && \ + mkdir hll-src && cd hll-src && tar xzf ../hll.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/hll.control @@ -393,7 +393,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \ echo "6631ec3e7fb3769eaaf56e3dfedb829aa761abf163d13dba354b4c218508e1c0 plpgsql_check.tar.gz" | sha256sum --check && \ - mkdir plpgsql_check-src && cd plpgsql_check-src && tar xvzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \ + mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plpgsql_check.control @@ -424,7 +424,7 @@ RUN case "${PG_VERSION}" in \ apt-get install -y cmake && \ wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \ echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \ - mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \ + mkdir timescaledb-src && cd timescaledb-src && tar xzf ../timescaledb.tar.gz --strip-components=1 -C . 
&& \ ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \ cd build && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -462,7 +462,7 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://github.com/ossc-db/pg_hint_plan/archive/refs/tags/REL${PG_HINT_PLAN_VERSION}.tar.gz -O pg_hint_plan.tar.gz && \ echo "${PG_HINT_PLAN_CHECKSUM} pg_hint_plan.tar.gz" | sha256sum --check && \ - mkdir pg_hint_plan-src && cd pg_hint_plan-src && tar xvzf ../pg_hint_plan.tar.gz --strip-components=1 -C . && \ + mkdir pg_hint_plan-src && cd pg_hint_plan-src && tar xzf ../pg_hint_plan.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make install -j $(getconf _NPROCESSORS_ONLN) && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_hint_plan.control @@ -481,7 +481,7 @@ RUN apt-get update && \ apt-get install -y git libgtk2.0-dev libpq-dev libpam-dev libxslt-dev libkrb5-dev cmake && \ wget https://github.com/ketteq-neon/postgres-exts/archive/e0bd1a9d9313d7120c1b9c7bb15c48c0dede4c4e.tar.gz -O kq_imcx.tar.gz && \ echo "dc93a97ff32d152d32737ba7e196d9687041cda15e58ab31344c2f2de8855336 kq_imcx.tar.gz" | sha256sum --check && \ - mkdir kq_imcx-src && cd kq_imcx-src && tar xvzf ../kq_imcx.tar.gz --strip-components=1 -C . && \ + mkdir kq_imcx-src && cd kq_imcx-src && tar xzf ../kq_imcx.tar.gz --strip-components=1 -C . && \ find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ mkdir build && cd build && \ cmake -DCMAKE_BUILD_TYPE=Release .. && \ @@ -505,7 +505,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH "/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \ echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \ - mkdir pg_cron-src && cd pg_cron-src && tar xvzf ../pg_cron.tar.gz --strip-components=1 -C . && \ + mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_cron.control @@ -531,7 +531,7 @@ RUN apt-get update && \ ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \ echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \ - mkdir rdkit-src && cd rdkit-src && tar xvzf ../rdkit.tar.gz --strip-components=1 -C . && \ + mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . && \ cmake \ -D RDK_BUILD_CAIRO_SUPPORT=OFF \ -D RDK_BUILD_INCHI_SUPPORT=ON \ @@ -571,7 +571,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH "/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \ echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \ - mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xvzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \ + mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_uuidv7.control @@ -588,7 +588,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH "/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \ echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \ - mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xvzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \ + mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control @@ -605,7 +605,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH "/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \ echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \ - mkdir pg_semver-src && cd pg_semver-src && tar xvzf ../pg_semver.tar.gz --strip-components=1 -C . && \ + mkdir pg_semver-src && cd pg_semver-src && tar xzf ../pg_semver.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/semver.control @@ -631,7 +631,7 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/${PG_EMBEDDING_VERSION}.tar.gz -O pg_embedding.tar.gz && \ echo "${PG_EMBEDDING_CHECKSUM} pg_embedding.tar.gz" | sha256sum --check && \ - mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \ + mkdir pg_embedding-src && cd pg_embedding-src && tar xzf ../pg_embedding.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install @@ -647,7 +647,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH "/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \ echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \ - mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . && \ + mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \ find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \ @@ -696,7 +696,7 @@ ARG PG_VERSION RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.2.0.tar.gz -O pg_jsonschema.tar.gz && \ echo "9118fc508a6e231e7a39acaa6f066fcd79af17a5db757b47d2eefbe14f7794f0 pg_jsonschema.tar.gz" | sha256sum --check && \ - mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xvzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \ + mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . 
&& \ sed -i 's/pgrx = "0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control @@ -713,7 +713,7 @@ ARG PG_VERSION RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.4.0.tar.gz -O pg_graphql.tar.gz && \ echo "bd8dc7230282b3efa9ae5baf053a54151ed0e66881c7c53750e2d0c765776edc pg_graphql.tar.gz" | sha256sum --check && \ - mkdir pg_graphql-src && cd pg_graphql-src && tar xvzf ../pg_graphql.tar.gz --strip-components=1 -C . && \ + mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . && \ sed -i 's/pgrx = "=0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ # it's needed to enable extension because it uses untrusted C language @@ -733,7 +733,7 @@ ARG PG_VERSION # 26806147b17b60763039c6a6878884c41a262318 made on 26/09/2023 RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \ echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \ - mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xvzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \ + mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \ cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control @@ -749,7 +749,7 @@ ARG PG_VERSION RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.3.tar.gz -O pgx_ulid.tar.gz && \ echo "ee5db82945d2d9f2d15597a80cf32de9dca67b897f605beb830561705f12683c pgx_ulid.tar.gz" | sha256sum --check && \ - mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \ + mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \ echo "******************* Apply a patch for Postgres 16 support; delete in the next release ******************" && \ wget https://github.com/pksunkara/pgx_ulid/commit/f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \ patch -p1 < f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \ @@ -771,7 +771,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH "/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \ echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \ - mkdir wal2json-src && cd wal2json-src && tar xvzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \ + mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install @@ -787,7 +787,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH "/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \ echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \ - mkdir pg_ivm-src && cd pg_ivm-src && tar xvzf ../pg_ivm.tar.gz --strip-components=1 -C . && \ + mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_ivm.control @@ -804,7 +804,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH "/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \ echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \ - mkdir pg_partman-src && cd pg_partman-src && tar xvzf ../pg_partman.tar.gz --strip-components=1 -C . && \ + mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_partman.control From 0112097e1321d16c4ff51dc4e69818aed395ec32 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 4 Jun 2024 17:27:08 +0300 Subject: [PATCH 0921/1571] feat(rtc): maintain dirty and uploaded IndexPart (#7833) RemoteTimelineClient maintains a copy of "next IndexPart" as a number of fields which are like an IndexPart but this is not immediately obvious. Instead of multiple fields, maintain a `dirty` ("next IndexPart") and `clean` ("uploaded IndexPart") fields. Additional cleanup: - rename `IndexPart::disk_consistent_lsn` accessor `duplicated_disk_consistent_lsn` - no one except scrubber should be looking at it, even scrubber is a stretch - remove usage elsewhere (pagectl used by tests, metadata scan endpoint) - serialize index part *before* the index upload operation - avoid upload operation being retried because of serialization error - serialization error is fatal anyway for timeline -- it can only make transient local progress after that, at least the error is bubbled up now - gather exploded IndexPart fields into single actual `UploadQueueInitialized::dirty` of which the uploaded snapshot is serialized - implement the long wished monotonicity check with the `clean` IndexPart with an assertion which is not expected to fire Continued work from #7860 towards next step of #6994. 
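To make the new bookkeeping easier to follow before reading the diff, here is a minimal, self-contained sketch of the `dirty`/`clean` pair and the monotonicity check described above. Everything in it is a simplified stand-in (a bare LSN number instead of the real `IndexPart`, a plain `u64` task id, no async); the authoritative fields and signatures are the ones in the diff below.

```rust
// Simplified stand-ins, not the real pageserver types.
#[derive(Clone)]
struct IndexPart {
    disk_consistent_lsn: u64, // stand-in for utils::lsn::Lsn
}

struct UploadQueue {
    /// The next index_part.json to be uploaded ("desired" remote state);
    /// every schedule_* call mutates this.
    dirty: IndexPart,
    /// The last successfully uploaded index_part.json, paired with the id of
    /// the upload task that persisted it.
    clean: (IndexPart, Option<u64>),
}

impl UploadQueue {
    /// Scheduling an index upload snapshots `dirty` into the upload operation.
    fn schedule_index_upload(&self) -> IndexPart {
        self.dirty.clone()
    }

    /// On completion, `clean` advances, guarded so that a completion from an
    /// older task can never overwrite a newer clean value.
    fn on_index_upload_completed(&mut self, task_id: u64, uploaded: IndexPart) {
        let monotone = self.clean.1.map_or(true, |last| last < task_id);
        assert!(monotone, "index upload completions must advance monotonically");
        self.clean = (uploaded, Some(task_id));
    }
}

fn main() {
    let initial = IndexPart { disk_consistent_lsn: 0 };
    let mut queue = UploadQueue {
        dirty: initial.clone(),
        clean: (initial, None),
    };

    queue.dirty.disk_consistent_lsn = 42; // e.g. a metadata update was scheduled
    let snapshot = queue.schedule_index_upload();
    queue.on_index_upload_completed(1, snapshot);

    assert_eq!(queue.clean.0.disk_consistent_lsn, 42);
    println!("uploaded up to LSN {}", queue.clean.0.disk_consistent_lsn);
}
```

In the real queue the second element of `clean` is also what `get_last_remote_consistent_lsn_projected` keys off: it returns `None` while that task id is still unset.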
--- pageserver/ctl/src/index_part.rs | 2 +- pageserver/src/http/routes.rs | 2 +- .../src/tenant/remote_timeline_client.rs | 150 ++++++++++-------- .../tenant/remote_timeline_client/index.rs | 55 ++----- .../tenant/remote_timeline_client/upload.rs | 19 +-- pageserver/src/tenant/upload_queue.rs | 77 ++++----- s3_scrubber/src/checks.rs | 4 +- 7 files changed, 144 insertions(+), 165 deletions(-) diff --git a/pageserver/ctl/src/index_part.rs b/pageserver/ctl/src/index_part.rs index 2998b5c732..a33cae6769 100644 --- a/pageserver/ctl/src/index_part.rs +++ b/pageserver/ctl/src/index_part.rs @@ -26,7 +26,7 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> { let output = Output { layer_metadata: &des.layer_metadata, - disk_consistent_lsn: des.get_disk_consistent_lsn(), + disk_consistent_lsn: des.metadata.disk_consistent_lsn(), timeline_metadata: &des.metadata, }; diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index bd6fa028ac..6b6a131c88 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2182,7 +2182,7 @@ async fn tenant_scan_remote_handler( { Ok((index_part, index_generation)) => { tracing::info!("Found timeline {tenant_shard_id}/{timeline_id} metadata (gen {index_generation:?}, {} layers, {} consistent LSN)", - index_part.layer_metadata.len(), index_part.get_disk_consistent_lsn()); + index_part.layer_metadata.len(), index_part.metadata.disk_consistent_lsn()); generation = std::cmp::max(generation, index_generation); } Err(DownloadError::NotFound) => { diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 73438a790f..e33e4b84aa 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -91,8 +91,7 @@ //! //! The *actual* remote state lags behind the *desired* remote state while //! there are in-flight operations. -//! We keep track of the desired remote state in -//! [`UploadQueueInitialized::latest_files`] and [`UploadQueueInitialized::latest_metadata`]. +//! We keep track of the desired remote state in [`UploadQueueInitialized::dirty`]. //! It is initialized based on the [`IndexPart`] that was passed during init //! and updated with every `schedule_*` function call. //! All this is necessary necessary to compute the future [`IndexPart`]s @@ -115,8 +114,7 @@ //! //! # Completion //! -//! Once an operation has completed, we update -//! [`UploadQueueInitialized::projected_remote_consistent_lsn`] immediately, +//! Once an operation has completed, we update [`UploadQueueInitialized::clean`] immediately, //! and submit a request through the DeletionQueue to update //! [`UploadQueueInitialized::visible_remote_consistent_lsn`] after it has //! validated that our generation is not stale. It is this visible value @@ -416,6 +414,7 @@ impl RemoteTimelineClient { Ok(()) } + /// Returns `None` if nothing is yet uplodaded, `Some(disk_consistent_lsn)` otherwise. pub fn remote_consistent_lsn_projected(&self) -> Option { match &mut *self.upload_queue.lock().unwrap() { UploadQueue::Uninitialized => None, @@ -442,13 +441,11 @@ impl RemoteTimelineClient { /// Returns true if this timeline was previously detached at this Lsn and the remote timeline /// client is currently initialized. pub(crate) fn is_previous_ancestor_lsn(&self, lsn: Lsn) -> bool { - // technically this is a dirty read, but given how timeline detach ancestor is implemented - // via tenant restart, the lineage has always been uploaded. 
self.upload_queue .lock() .unwrap() .initialized_mut() - .map(|uq| uq.latest_lineage.is_previous_ancestor_lsn(lsn)) + .map(|uq| uq.clean.0.lineage.is_previous_ancestor_lsn(lsn)) .unwrap_or(false) } @@ -457,7 +454,6 @@ impl RemoteTimelineClient { current_remote_index_part .layer_metadata .values() - // If we don't have the file size for the layer, don't account for it in the metric. .map(|ilmd| ilmd.file_size) .sum() } else { @@ -585,9 +581,9 @@ impl RemoteTimelineClient { // As documented in the struct definition, it's ok for latest_metadata to be // ahead of what's _actually_ on the remote during index upload. - upload_queue.latest_metadata = metadata.clone(); + upload_queue.dirty.metadata = metadata.clone(); - self.schedule_index_upload(upload_queue); + self.schedule_index_upload(upload_queue)?; Ok(()) } @@ -606,9 +602,9 @@ impl RemoteTimelineClient { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; - upload_queue.latest_metadata.apply(update); + upload_queue.dirty.metadata.apply(update); - self.schedule_index_upload(upload_queue); + self.schedule_index_upload(upload_queue)?; Ok(()) } @@ -620,8 +616,8 @@ impl RemoteTimelineClient { ) -> anyhow::Result<()> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; - upload_queue.last_aux_file_policy = last_aux_file_policy; - self.schedule_index_upload(upload_queue); + upload_queue.dirty.last_aux_file_policy = last_aux_file_policy; + self.schedule_index_upload(upload_queue)?; Ok(()) } /// @@ -639,30 +635,44 @@ impl RemoteTimelineClient { let upload_queue = guard.initialized_mut()?; if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 { - self.schedule_index_upload(upload_queue); + self.schedule_index_upload(upload_queue)?; } Ok(()) } /// Launch an index-file upload operation in the background (internal function) - fn schedule_index_upload(self: &Arc, upload_queue: &mut UploadQueueInitialized) { - let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn(); + fn schedule_index_upload( + self: &Arc, + upload_queue: &mut UploadQueueInitialized, + ) -> anyhow::Result<()> { + let disk_consistent_lsn = upload_queue.dirty.metadata.disk_consistent_lsn(); + // fix up the duplicated field + upload_queue.dirty.disk_consistent_lsn = disk_consistent_lsn; + + // make sure it serializes before doing it in perform_upload_task so that it doesn't + // look like a retryable error + let void = std::io::sink(); + serde_json::to_writer(void, &upload_queue.dirty).context("serialize index_part.json")?; + + let index_part = &upload_queue.dirty; info!( "scheduling metadata upload up to consistent LSN {disk_consistent_lsn} with {} files ({} changed)", - upload_queue.latest_files.len(), + index_part.layer_metadata.len(), upload_queue.latest_files_changes_since_metadata_upload_scheduled, ); - let index_part = IndexPart::from(&*upload_queue); - let op = UploadOp::UploadMetadata(Box::new(index_part), disk_consistent_lsn); + let op = UploadOp::UploadMetadata { + uploaded: Box::new(index_part.clone()), + }; self.metric_begin(&op); upload_queue.queued_operations.push_back(op); upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0; // Launch the task immediately, if possible self.launch_queued_tasks(upload_queue); + Ok(()) } pub(crate) async fn schedule_reparenting_and_wait( @@ -675,16 +685,16 @@ impl RemoteTimelineClient { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; - let Some(prev) = 
upload_queue.latest_metadata.ancestor_timeline() else { + let Some(prev) = upload_queue.dirty.metadata.ancestor_timeline() else { return Err(anyhow::anyhow!( "cannot reparent without a current ancestor" )); }; - upload_queue.latest_metadata.reparent(new_parent); - upload_queue.latest_lineage.record_previous_ancestor(&prev); + upload_queue.dirty.metadata.reparent(new_parent); + upload_queue.dirty.lineage.record_previous_ancestor(&prev); - self.schedule_index_upload(upload_queue); + self.schedule_index_upload(upload_queue)?; self.schedule_barrier0(upload_queue) }; @@ -705,16 +715,17 @@ impl RemoteTimelineClient { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; - upload_queue.latest_metadata.detach_from_ancestor(&adopted); - upload_queue.latest_lineage.record_detaching(&adopted); + upload_queue.dirty.metadata.detach_from_ancestor(&adopted); + upload_queue.dirty.lineage.record_detaching(&adopted); for layer in layers { upload_queue - .latest_files + .dirty + .layer_metadata .insert(layer.layer_desc().layer_name(), layer.metadata()); } - self.schedule_index_upload(upload_queue); + self.schedule_index_upload(upload_queue)?; let barrier = self.schedule_barrier0(upload_queue); self.launch_queued_tasks(upload_queue); @@ -746,7 +757,8 @@ impl RemoteTimelineClient { let metadata = layer.metadata(); upload_queue - .latest_files + .dirty + .layer_metadata .insert(layer.layer_desc().layer_name(), metadata.clone()); upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1; @@ -776,8 +788,8 @@ impl RemoteTimelineClient { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; - let with_metadata = - self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned()); + let with_metadata = self + .schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned())?; self.schedule_deletion_of_unlinked0(upload_queue, with_metadata); @@ -801,7 +813,7 @@ impl RemoteTimelineClient { let names = gc_layers.iter().map(|x| x.layer_desc().layer_name()); - self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names); + self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names)?; self.launch_queued_tasks(upload_queue); @@ -814,7 +826,7 @@ impl RemoteTimelineClient { self: &Arc, upload_queue: &mut UploadQueueInitialized, names: I, - ) -> Vec<(LayerName, LayerFileMetadata)> + ) -> anyhow::Result> where I: IntoIterator, { @@ -824,7 +836,7 @@ impl RemoteTimelineClient { let with_metadata: Vec<_> = names .into_iter() .filter_map(|name| { - let meta = upload_queue.latest_files.remove(&name); + let meta = upload_queue.dirty.layer_metadata.remove(&name); if let Some(meta) = meta { upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1; @@ -856,10 +868,10 @@ impl RemoteTimelineClient { // index_part update, because that needs to be uploaded before we can actually delete the // files. 
if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 { - self.schedule_index_upload(upload_queue); + self.schedule_index_upload(upload_queue)?; } - with_metadata + Ok(with_metadata) } /// Schedules deletion for layer files which have previously been unlinked from the @@ -950,7 +962,7 @@ impl RemoteTimelineClient { let names = compacted_from.iter().map(|x| x.layer_desc().layer_name()); - self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names); + self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names)?; self.launch_queued_tasks(upload_queue); Ok(()) @@ -1085,7 +1097,7 @@ impl RemoteTimelineClient { let deleted_at = Utc::now().naive_utc(); stopped.deleted_at = SetDeletedFlagProgress::InProgress(deleted_at); - let mut index_part = IndexPart::from(&stopped.upload_queue_for_deletion); + let mut index_part = stopped.upload_queue_for_deletion.dirty.clone(); index_part.deleted_at = Some(deleted_at); index_part }; @@ -1296,7 +1308,8 @@ impl RemoteTimelineClient { stopped .upload_queue_for_deletion - .latest_files + .dirty + .layer_metadata .drain() .map(|(file_name, meta)| { remote_layer_path( @@ -1433,7 +1446,7 @@ impl RemoteTimelineClient { // Can always be scheduled. true } - UploadOp::UploadMetadata(_, _) => { + UploadOp::UploadMetadata { .. } => { // These can only be performed after all the preceding operations // have finished. upload_queue.inprogress_tasks.is_empty() @@ -1475,7 +1488,7 @@ impl RemoteTimelineClient { UploadOp::UploadLayer(_, _) => { upload_queue.num_inprogress_layer_uploads += 1; } - UploadOp::UploadMetadata(_, _) => { + UploadOp::UploadMetadata { .. } => { upload_queue.num_inprogress_metadata_uploads += 1; } UploadOp::Delete(_) => { @@ -1584,22 +1597,13 @@ impl RemoteTimelineClient { ) .await } - UploadOp::UploadMetadata(ref index_part, _lsn) => { - let mention_having_future_layers = if cfg!(feature = "testing") { - index_part - .layer_metadata - .keys() - .any(|x| x.is_in_future(*_lsn)) - } else { - false - }; - + UploadOp::UploadMetadata { ref uploaded } => { let res = upload::upload_index_part( &self.storage_impl, &self.tenant_shard_id, &self.timeline_id, self.generation, - index_part, + uploaded, &self.cancel, ) .measure_remote_op( @@ -1609,10 +1613,21 @@ impl RemoteTimelineClient { ) .await; if res.is_ok() { - self.update_remote_physical_size_gauge(Some(index_part)); + self.update_remote_physical_size_gauge(Some(uploaded)); + let mention_having_future_layers = if cfg!(feature = "testing") { + uploaded + .layer_metadata + .keys() + .any(|x| x.is_in_future(uploaded.metadata.disk_consistent_lsn())) + } else { + false + }; if mention_having_future_layers { // find rationale near crate::tenant::timeline::init::cleanup_future_layer - tracing::info!(disk_consistent_lsn=%_lsn, "uploaded an index_part.json with future layers -- this is ok! if shutdown now, expect future layer cleanup"); + tracing::info!( + disk_consistent_lsn = %uploaded.metadata.disk_consistent_lsn(), + "uploaded an index_part.json with future layers -- this is ok! if shutdown now, expect future layer cleanup" + ); } } res @@ -1713,11 +1728,23 @@ impl RemoteTimelineClient { upload_queue.num_inprogress_layer_uploads -= 1; None } - UploadOp::UploadMetadata(_, lsn) => { + UploadOp::UploadMetadata { ref uploaded } => { upload_queue.num_inprogress_metadata_uploads -= 1; - // XXX monotonicity check? - upload_queue.projected_remote_consistent_lsn = Some(lsn); + // the task id is reused as a monotonicity check for storing the "clean" + // IndexPart. 
+ let last_updater = upload_queue.clean.1; + let is_later = last_updater.is_some_and(|task_id| task_id < task.task_id); + let monotone = is_later || last_updater.is_none(); + + assert!(monotone, "no two index uploads should be completing at the same time, prev={last_updater:?}, task.task_id={}", task.task_id); + + // not taking ownership is wasteful + upload_queue.clean.0.clone_from(uploaded); + upload_queue.clean.1 = Some(task.task_id); + + let lsn = upload_queue.clean.0.metadata.disk_consistent_lsn(); + if self.generation.is_none() { // Legacy mode: skip validating generation upload_queue.visible_remote_consistent_lsn.store(lsn); @@ -1771,7 +1798,7 @@ impl RemoteTimelineClient { RemoteOpKind::Upload, RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size), ), - UploadOp::UploadMetadata(_, _) => ( + UploadOp::UploadMetadata { .. } => ( RemoteOpFileKind::Index, RemoteOpKind::Upload, DontTrackSize { @@ -1847,11 +1874,9 @@ impl RemoteTimelineClient { // Deletion is not really perf sensitive so there shouldnt be any problems with cloning a fraction of it. let upload_queue_for_deletion = UploadQueueInitialized { task_counter: 0, - latest_files: initialized.latest_files.clone(), + dirty: initialized.dirty.clone(), + clean: initialized.clean.clone(), latest_files_changes_since_metadata_upload_scheduled: 0, - latest_metadata: initialized.latest_metadata.clone(), - latest_lineage: initialized.latest_lineage.clone(), - projected_remote_consistent_lsn: None, visible_remote_consistent_lsn: initialized .visible_remote_consistent_lsn .clone(), @@ -1864,7 +1889,6 @@ impl RemoteTimelineClient { dangling_files: HashMap::default(), shutting_down: false, shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)), - last_aux_file_policy: initialized.last_aux_file_policy, }; let upload_queue = std::mem::replace( diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index f5d939c747..6494261312 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -11,7 +11,6 @@ use utils::id::TimelineId; use crate::tenant::metadata::TimelineMetadata; use crate::tenant::storage_layer::LayerName; -use crate::tenant::upload_queue::UploadQueueInitialized; use crate::tenant::Generation; use pageserver_api::shard::ShardIndex; @@ -42,7 +41,7 @@ pub struct IndexPart { // 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata. // It's duplicated for convenience when reading the serialized structure, but is // private because internally we would read from metadata instead. 
- disk_consistent_lsn: Lsn, + pub(super) disk_consistent_lsn: Lsn, #[serde(rename = "metadata_bytes")] pub metadata: TimelineMetadata, @@ -80,23 +79,15 @@ impl IndexPart { pub const FILE_NAME: &'static str = "index_part.json"; - fn new( - layers_and_metadata: &HashMap, - disk_consistent_lsn: Lsn, - metadata: TimelineMetadata, - lineage: Lineage, - last_aux_file_policy: Option, - ) -> Self { - let layer_metadata = layers_and_metadata.clone(); - - Self { + pub(crate) fn empty(metadata: TimelineMetadata) -> Self { + IndexPart { version: Self::LATEST_VERSION, - layer_metadata, - disk_consistent_lsn, + layer_metadata: Default::default(), + disk_consistent_lsn: metadata.disk_consistent_lsn(), metadata, deleted_at: None, - lineage, - last_aux_file_policy, + lineage: Default::default(), + last_aux_file_policy: None, } } @@ -106,7 +97,7 @@ impl IndexPart { /// If you want this under normal operations, read it from self.metadata: /// this method is just for the scrubber to use when validating an index. - pub fn get_disk_consistent_lsn(&self) -> Lsn { + pub fn duplicated_disk_consistent_lsn(&self) -> Lsn { self.disk_consistent_lsn } @@ -120,14 +111,7 @@ impl IndexPart { #[cfg(test)] pub(crate) fn example() -> Self { - let example_metadata = TimelineMetadata::example(); - Self::new( - &HashMap::new(), - example_metadata.disk_consistent_lsn(), - example_metadata, - Default::default(), - Some(AuxFilePolicy::V1), - ) + Self::empty(TimelineMetadata::example()) } pub(crate) fn last_aux_file_policy(&self) -> Option { @@ -135,22 +119,6 @@ impl IndexPart { } } -impl From<&UploadQueueInitialized> for IndexPart { - fn from(uq: &UploadQueueInitialized) -> Self { - let disk_consistent_lsn = uq.latest_metadata.disk_consistent_lsn(); - let metadata = uq.latest_metadata.clone(); - let lineage = uq.latest_lineage.clone(); - - Self::new( - &uq.latest_files, - disk_consistent_lsn, - metadata, - lineage, - uq.last_aux_file_policy, - ) - } -} - /// Metadata gathered for each of the layer files. /// /// Fields have to be `Option`s because remote [`IndexPart`]'s can be from different version, which @@ -236,11 +204,10 @@ impl Lineage { /// The queried lsn is most likely the basebackup lsn, and this answers question "is it allowed /// to start a read/write primary at this lsn". /// - /// Returns true if the Lsn was previously a branch point. + /// Returns true if the Lsn was previously our branch point. pub(crate) fn is_previous_ancestor_lsn(&self, lsn: Lsn) -> bool { self.original_ancestor - .as_ref() - .is_some_and(|(_, ancestor_lsn, _)| lsn == *ancestor_lsn) + .is_some_and(|(_, ancestor_lsn, _)| ancestor_lsn == lsn) } } diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index e8e824f415..c4dd184610 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -1,6 +1,7 @@ //! 
Helper functions to upload files to remote storage with a RemoteStorage use anyhow::{bail, Context}; +use bytes::Bytes; use camino::Utf8Path; use fail::fail_point; use pageserver_api::shard::TenantShardId; @@ -11,10 +12,10 @@ use tokio::io::AsyncSeekExt; use tokio_util::sync::CancellationToken; use utils::{backoff, pausable_failpoint}; +use super::index::IndexPart; use super::Generation; use crate::tenant::remote_timeline_client::{ - index::IndexPart, remote_index_path, remote_initdb_archive_path, - remote_initdb_preserved_archive_path, + remote_index_path, remote_initdb_archive_path, remote_initdb_preserved_archive_path, }; use remote_storage::{GenericRemoteStorage, RemotePath, TimeTravelError}; use utils::id::{TenantId, TimelineId}; @@ -27,7 +28,7 @@ pub(crate) async fn upload_index_part<'a>( tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, generation: Generation, - index_part: &'a IndexPart, + index_part: &IndexPart, cancel: &CancellationToken, ) -> anyhow::Result<()> { tracing::trace!("uploading new index part"); @@ -37,16 +38,16 @@ pub(crate) async fn upload_index_part<'a>( }); pausable_failpoint!("before-upload-index-pausable"); - let index_part_bytes = index_part - .to_s3_bytes() - .context("serialize index part file into bytes")?; - let index_part_size = index_part_bytes.len(); - let index_part_bytes = bytes::Bytes::from(index_part_bytes); + // FIXME: this error comes too late + let serialized = index_part.to_s3_bytes()?; + let serialized = Bytes::from(serialized); + + let index_part_size = serialized.len(); let remote_path = remote_index_path(tenant_shard_id, timeline_id, generation); storage .upload_storage_object( - futures::stream::once(futures::future::ready(Ok(index_part_bytes))), + futures::stream::once(futures::future::ready(Ok(serialized))), index_part_size, &remote_path, cancel, diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index 02f87303d1..50c977a950 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -3,12 +3,10 @@ use super::storage_layer::ResidentLayer; use crate::tenant::metadata::TimelineMetadata; use crate::tenant::remote_timeline_client::index::IndexPart; use crate::tenant::remote_timeline_client::index::LayerFileMetadata; -use crate::tenant::remote_timeline_client::index::Lineage; use std::collections::{HashMap, VecDeque}; use std::fmt::Debug; use chrono::NaiveDateTime; -use pageserver_api::models::AuxFilePolicy; use std::sync::Arc; use tracing::info; use utils::lsn::AtomicLsn; @@ -45,34 +43,25 @@ pub(crate) struct UploadQueueInitialized { /// Counter to assign task IDs pub(crate) task_counter: u64, - /// All layer files stored in the remote storage, taking into account all - /// in-progress and queued operations - pub(crate) latest_files: HashMap, + /// The next uploaded index_part.json; assumed to be dirty. + /// + /// Should not be read, directly except for layer file updates. Instead you should add a + /// projected field. + pub(crate) dirty: IndexPart, + + /// The latest remote persisted IndexPart. + /// + /// Each completed metadata upload will update this. The second item is the task_id which last + /// updated the value, used to ensure we never store an older value over a newer one. + pub(crate) clean: (IndexPart, Option), /// How many file uploads or deletions been scheduled, since the /// last (scheduling of) metadata index upload? 
pub(crate) latest_files_changes_since_metadata_upload_scheduled: u64, - /// Metadata stored in the remote storage, taking into account all - /// in-progress and queued operations. - /// DANGER: do not return to outside world, e.g., safekeepers. - pub(crate) latest_metadata: TimelineMetadata, - - /// Part of the flattened "next" `index_part.json`. - pub(crate) latest_lineage: Lineage, - - /// The last aux file policy used on this timeline. - pub(crate) last_aux_file_policy: Option, - - /// `disk_consistent_lsn` from the last metadata file that was successfully - /// uploaded. `Lsn(0)` if nothing was uploaded yet. - /// Unlike `latest_files` or `latest_metadata`, this value is never ahead. - /// Safekeeper can rely on it to make decisions for WAL storage. - /// - /// visible_remote_consistent_lsn is only updated after our generation has been validated with + /// The Lsn is only updated after our generation has been validated with /// the control plane (unlesss a timeline's generation is None, in which case /// we skip validation) - pub(crate) projected_remote_consistent_lsn: Option, pub(crate) visible_remote_consistent_lsn: Arc, // Breakdown of different kinds of tasks currently in-progress @@ -118,7 +107,8 @@ impl UploadQueueInitialized { } pub(super) fn get_last_remote_consistent_lsn_projected(&self) -> Option { - self.projected_remote_consistent_lsn + let lsn = self.clean.0.metadata.disk_consistent_lsn(); + self.clean.1.map(|_| lsn) } } @@ -174,13 +164,12 @@ impl UploadQueue { info!("initializing upload queue for empty remote"); + let index_part = IndexPart::empty(metadata.clone()); + let state = UploadQueueInitialized { - // As described in the doc comment, it's ok for `latest_files` and `latest_metadata` to be ahead. - latest_files: HashMap::new(), + dirty: index_part.clone(), + clean: (index_part, None), latest_files_changes_since_metadata_upload_scheduled: 0, - latest_metadata: metadata.clone(), - latest_lineage: Lineage::default(), - projected_remote_consistent_lsn: None, visible_remote_consistent_lsn: Arc::new(AtomicLsn::new(0)), // what follows are boring default initializations task_counter: 0, @@ -193,7 +182,6 @@ impl UploadQueue { dangling_files: HashMap::new(), shutting_down: false, shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)), - last_aux_file_policy: Default::default(), }; *self = UploadQueue::Initialized(state); @@ -211,22 +199,15 @@ impl UploadQueue { } } - let mut files = HashMap::with_capacity(index_part.layer_metadata.len()); - for (layer_name, layer_metadata) in &index_part.layer_metadata { - files.insert(layer_name.to_owned(), layer_metadata.clone()); - } - info!( "initializing upload queue with remote index_part.disk_consistent_lsn: {}", index_part.metadata.disk_consistent_lsn() ); let state = UploadQueueInitialized { - latest_files: files, + dirty: index_part.clone(), + clean: (index_part.clone(), None), latest_files_changes_since_metadata_upload_scheduled: 0, - latest_metadata: index_part.metadata.clone(), - latest_lineage: index_part.lineage.clone(), - projected_remote_consistent_lsn: Some(index_part.metadata.disk_consistent_lsn()), visible_remote_consistent_lsn: Arc::new( index_part.metadata.disk_consistent_lsn().into(), ), @@ -241,7 +222,6 @@ impl UploadQueue { dangling_files: HashMap::new(), shutting_down: false, shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)), - last_aux_file_policy: index_part.last_aux_file_policy(), }; *self = UploadQueue::Initialized(state); @@ -298,13 +278,16 @@ pub(crate) enum UploadOp { /// Upload a layer file 
UploadLayer(ResidentLayer, LayerFileMetadata), - /// Upload the metadata file - UploadMetadata(Box, Lsn), + /// Upload a index_part.json file + UploadMetadata { + /// The next [`UploadQueueInitialized::clean`] after this upload succeeds. + uploaded: Box, + }, /// Delete layer files Delete(Delete), - /// Barrier. When the barrier operation is reached, + /// Barrier. When the barrier operation is reached, the channel is closed. Barrier(tokio::sync::watch::Sender<()>), /// Shutdown; upon encountering this operation no new operations will be spawned, otherwise @@ -322,8 +305,12 @@ impl std::fmt::Display for UploadOp { layer, metadata.file_size, metadata.generation ) } - UploadOp::UploadMetadata(_, lsn) => { - write!(f, "UploadMetadata(lsn: {})", lsn) + UploadOp::UploadMetadata { uploaded, .. } => { + write!( + f, + "UploadMetadata(lsn: {})", + uploaded.metadata.disk_consistent_lsn() + ) } UploadOp::Delete(delete) => { write!(f, "Delete({} layers)", delete.layers.len()) diff --git a/s3_scrubber/src/checks.rs b/s3_scrubber/src/checks.rs index 2c14fef0af..44fb53696c 100644 --- a/s3_scrubber/src/checks.rs +++ b/s3_scrubber/src/checks.rs @@ -93,12 +93,12 @@ pub(crate) fn branch_cleanup_and_check_errors( } if index_part.metadata.disk_consistent_lsn() - != index_part.get_disk_consistent_lsn() + != index_part.duplicated_disk_consistent_lsn() { result.errors.push(format!( "Mismatching disk_consistent_lsn in TimelineMetadata ({}) and in the index_part ({})", index_part.metadata.disk_consistent_lsn(), - index_part.get_disk_consistent_lsn(), + index_part.duplicated_disk_consistent_lsn(), )) } From fd22fc5b7d29214e8544c65062494c2ad03d744d Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 4 Jun 2024 16:16:50 +0100 Subject: [PATCH 0922/1571] pageserver: include heatmap in tenant deletion (#7928) ## Problem This was an oversight when adding heatmaps: because they are at the top level of the tenant, they aren't included in the catch-all list & delete that happens for timeline paths. This doesn't break anything, but it leaves behind a few kilobytes of garbage in the S3 bucket after a tenant is deleted, generating work for the scrubber. ## Summary of changes - During deletion, explicitly remove the heatmap file - In test_tenant_delete_smoke, upload a heatmap so that the test would fail its "remote storage empty after delete" check if we didn't delete it. 
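As a rough illustration of the control flow the summary above describes (the heatmap lives at the tenant top level rather than under a timeline prefix, so it has to be deleted explicitly, and a failure there is logged but does not abort tenant deletion), here is a simplified, self-contained sketch. The path layout and the retry stand-in are placeholders; the actual code in the diff below uses the pageserver's `remote_heatmap_path` helper and `backoff::retry` with a cancellation token.

```rust
// Illustrative stand-ins only; not the pageserver/remote_storage API.
#[derive(Debug)]
struct RemotePath(String);

// Hypothetical layout for this sketch; the real path is computed by the
// pageserver's remote_heatmap_path() helper.
fn remote_heatmap_path(tenant_shard_id: &str) -> RemotePath {
    RemotePath(format!("tenants/{tenant_shard_id}/heatmap-v1.json"))
}

// Stand-in for backoff::retry(|| storage.delete(&path, &cancel), ...).
fn delete_with_retries(_path: &RemotePath) -> Result<(), String> {
    Err("simulated failure after retries".to_string())
}

fn delete_tenant_remote_objects(tenant_shard_id: &str) {
    // Timeline-scoped objects are handled by the existing per-timeline
    // deletion; the heatmap is tenant-scoped, so remove it explicitly.
    let heatmap_path = remote_heatmap_path(tenant_shard_id);
    if let Err(e) = delete_with_retries(&heatmap_path) {
        // Best effort, mirroring the patch: warn and keep going so a stray
        // heatmap cannot block tenant deletion.
        eprintln!("Failed to delete heatmap at {heatmap_path:?}: {e}");
    }
    // ... continue with timeline and tenant-prefix cleanup ...
}

fn main() {
    delete_tenant_remote_objects("example-tenant-shard-0001");
}
```

The test change in the same diff closes the loop: uploading a heatmap before deletion lets the existing "remote storage empty after delete" assertion catch any regression here.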
--- pageserver/src/tenant/delete.rs | 20 ++++++++++++++++++++ test_runner/regress/test_tenant_delete.py | 3 +++ 2 files changed, 23 insertions(+) diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index 7c6640eaac..8b36aa15e5 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -16,6 +16,7 @@ use crate::{ task_mgr::{self, TaskKind}, tenant::{ mgr::{TenantSlot, TenantsMapRemoveResult}, + remote_timeline_client::remote_heatmap_path, timeline::ShutdownMode, }, }; @@ -531,6 +532,25 @@ impl DeleteTenantFlow { } } + // Remove top-level tenant objects that don't belong to a timeline, such as heatmap + let heatmap_path = remote_heatmap_path(&tenant.tenant_shard_id()); + if let Some(Err(e)) = backoff::retry( + || async { + remote_storage + .delete(&heatmap_path, &task_mgr::shutdown_token()) + .await + }, + TimeoutOrCancel::caused_by_cancel, + FAILED_UPLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + "remove_remote_tenant_heatmap", + &task_mgr::shutdown_token(), + ) + .await + { + tracing::warn!("Failed to delete heatmap at {heatmap_path}: {e}"); + } + let timelines_path = conf.timelines_path(&tenant.tenant_shard_id); // May not exist if we fail in cleanup_remaining_fs_traces after removing it if timelines_path.exists() { diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 3fc44de6fa..e120aa1a7c 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -88,6 +88,9 @@ def test_tenant_delete_smoke( parent = timeline + # Upload a heatmap so that we exercise deletion of that too + ps_http.tenant_heatmap_upload(tenant_id) + iterations = poll_for_remote_storage_iterations(remote_storage_kind) assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 2 From 17116f2ea941d6d5e821e81dd5f17c88edc968f3 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 4 Jun 2024 18:16:23 +0200 Subject: [PATCH 0923/1571] fix(pageserver): abort on duplicate layers, before doing damage (#7799) fixes https://github.com/neondatabase/neon/issues/7790 (duplicating most of the issue description here for posterity) # Background From the time before always-authoritative `index_part.json`, we had to handle duplicate layers. See the RFC for an illustration of how duplicate layers could happen: https://github.com/neondatabase/neon/blob/a8e6d259cb49d1bf156dfc2215b92c04d1e8a08f/docs/rfcs/027-crash-consistent-layer-map-through-index-part.md?plain=1#L41-L50 As of #5198 , we should not be exposed to that problem anymore. # Problem 1 We still have 1. [code in Pageserver](https://github.com/neondatabase/neon/blob/82960b2175211c0f666b91b5258c5e2253a245c7/pageserver/src/tenant/timeline.rs#L4502-L4521) than handles duplicate layers 2. [tests in the test suite](https://github.com/neondatabase/neon/blob/d9dcbffac37ccd3331ec9adcd12fd20ce0ea31aa/test_runner/regress/test_duplicate_layers.py#L15) that demonstrates the problem using a failpoint However, the test in the test suite doesn't use the failpoint to induce a crash that could legitimately happen in production. What is does instead is to return early with an `Ok()`, so that the code in Pageserver that handles duplicate layers (item 1) actually gets exercised. That "return early" would be a bug in the routine if it happened in production. So, the tests in the test suite are tests for their own sake, but don't serve to actually regress-test any production behavior. 
# Problem 2 Further, if production code _did_ (it nowawdays doesn't!) create a duplicate layer, the code in Pageserver that handles the condition (item 1 above) is too little and too late: * the code handles it by discarding the newer `struct Layer`; that's good. * however, on disk, we have already overwritten the old with the new layer file * the fact that we do it atomically doesn't matter because ... * if the new layer file is not bit-identical, then we have a cache coherency problem * PS PageCache block cache: caches old bit battern * blob_io offsets stored in variables, based on pre-overwrite bit pattern / offsets * => reading based on these offsets from the new file might yield different data than before # Solution - Remove the test suite code pertaining to Problem 1 - Move & rename test suite code that actually tests RFC-27 crash-consistent layer map. - Remove the Pageserver code that handles duplicate layers too late (Problem 1) - Use `RENAME_NOREPLACE` to prevent over-rename the file during `.finish()`, bail with an error if it happens (Problem 2) - This bailing prevents the caller from even trying to insert into the layer map, as they don't even get a `struct Layer` at hand. - Add `abort`s in the place where we have the layer map lock and check for duplicates (Problem 2) - Note again, we can't reach there because we bail from `.finish()` much earlier in the code. - Share the logic to clean up after failed `.finish()` between image layers and delta layers (drive-by cleanup) - This exposed that test `image_layer_rewrite` was overwriting layer files in place. Fix the test. # Future Work This PR adds a new failure scenario that was previously "papered over" by the overwriting of layers: 1. Start a compaction that will produce 3 layers: A, B, C 2. Layer A is `finish()`ed successfully. 3. Layer B fails mid-way at some `put_value()`. 4. Compaction bails out, sleeps 20s. 5. Some disk space gets freed in the meantime. 6. Compaction wakes from sleep, another iteration starts, it attempts to write Layer A again. But the `.finish()` **fails because A already exists on disk**. The failure in step 5 is new with this PR, and it **causes the compaction to get stuck**. Before, it would silently overwrite the file and "successfully" complete the second iteration. The mitigation for this is to `/reset` the tenant. --- libs/utils/src/fs_ext.rs | 3 + libs/utils/src/fs_ext/rename_noreplace.rs | 109 ++++++++++++++++++ pageserver/src/tenant.rs | 35 ++++-- .../src/tenant/storage_layer/delta_layer.rs | 35 +++--- .../src/tenant/storage_layer/image_layer.rs | 92 ++++++++++++--- pageserver/src/tenant/storage_layer/layer.rs | 7 +- pageserver/src/tenant/timeline/compaction.rs | 42 ------- test_runner/regress/test_compaction.py | 2 +- ...y => test_pageserver_crash_consistency.py} | 50 ++------ .../regress/test_pageserver_restart.py | 5 - .../regress/test_pageserver_secondary.py | 4 - 11 files changed, 252 insertions(+), 132 deletions(-) create mode 100644 libs/utils/src/fs_ext/rename_noreplace.rs rename test_runner/regress/{test_duplicate_layers.py => test_pageserver_crash_consistency.py} (66%) diff --git a/libs/utils/src/fs_ext.rs b/libs/utils/src/fs_ext.rs index 90ba348a02..8e53d2c79b 100644 --- a/libs/utils/src/fs_ext.rs +++ b/libs/utils/src/fs_ext.rs @@ -3,6 +3,9 @@ use std::{fs, io, path::Path}; use anyhow::Context; +mod rename_noreplace; +pub use rename_noreplace::rename_noreplace; + pub trait PathExt { /// Returns an error if `self` is not a directory. 
fn is_empty_dir(&self) -> io::Result; diff --git a/libs/utils/src/fs_ext/rename_noreplace.rs b/libs/utils/src/fs_ext/rename_noreplace.rs new file mode 100644 index 0000000000..897e30d7f1 --- /dev/null +++ b/libs/utils/src/fs_ext/rename_noreplace.rs @@ -0,0 +1,109 @@ +use nix::NixPath; + +/// Rename a file without replacing an existing file. +/// +/// This is a wrapper around platform-specific APIs. +pub fn rename_noreplace( + src: &P1, + dst: &P2, +) -> nix::Result<()> { + { + #[cfg(target_os = "linux")] + { + nix::fcntl::renameat2( + None, + src, + None, + dst, + nix::fcntl::RenameFlags::RENAME_NOREPLACE, + ) + } + #[cfg(target_os = "macos")] + { + let res = src.with_nix_path(|src| { + dst.with_nix_path(|dst| + // SAFETY: `src` and `dst` are valid C strings as per the NixPath trait and they outlive the call to renamex_np. + unsafe { + nix::libc::renamex_np(src.as_ptr(), dst.as_ptr(), nix::libc::RENAME_EXCL) + }) + })??; + nix::errno::Errno::result(res).map(drop) + } + #[cfg(not(any(target_os = "linux", target_os = "macos")))] + { + std::compile_error!("OS does not support no-replace renames"); + } + } +} + +#[cfg(test)] +mod test { + use std::{fs, path::PathBuf}; + + use super::*; + + fn testdir() -> camino_tempfile::Utf8TempDir { + match crate::env::var("NEON_UTILS_RENAME_NOREPLACE_TESTDIR") { + Some(path) => { + let path: camino::Utf8PathBuf = path; + camino_tempfile::tempdir_in(path).unwrap() + } + None => camino_tempfile::tempdir().unwrap(), + } + } + + #[test] + fn test_absolute_paths() { + let testdir = testdir(); + println!("testdir: {}", testdir.path()); + + let src = testdir.path().join("src"); + let dst = testdir.path().join("dst"); + + fs::write(&src, b"").unwrap(); + fs::write(&dst, b"").unwrap(); + + let src = src.canonicalize().unwrap(); + assert!(src.is_absolute()); + let dst = dst.canonicalize().unwrap(); + assert!(dst.is_absolute()); + + let result = rename_noreplace(&src, &dst); + assert_eq!(result.unwrap_err(), nix::Error::EEXIST); + } + + #[test] + fn test_relative_paths() { + let testdir = testdir(); + println!("testdir: {}", testdir.path()); + + // this is fine because we run in nextest => process per test + std::env::set_current_dir(testdir.path()).unwrap(); + + let src = PathBuf::from("src"); + let dst = PathBuf::from("dst"); + + fs::write(&src, b"").unwrap(); + fs::write(&dst, b"").unwrap(); + + let result = rename_noreplace(&src, &dst); + assert_eq!(result.unwrap_err(), nix::Error::EEXIST); + } + + #[test] + fn test_works_when_not_exists() { + let testdir = testdir(); + println!("testdir: {}", testdir.path()); + + let src = testdir.path().join("src"); + let dst = testdir.path().join("dst"); + + fs::write(&src, b"content").unwrap(); + + rename_noreplace(src.as_std_path(), dst.as_std_path()).unwrap(); + assert_eq!( + "content", + String::from_utf8(std::fs::read(&dst).unwrap()).unwrap() + ); + } +} diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 19a0f59b2a..60cd5c9695 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3865,6 +3865,9 @@ pub(crate) mod harness { pub fn create_custom( test_name: &'static str, tenant_conf: TenantConf, + tenant_id: TenantId, + shard_identity: ShardIdentity, + generation: Generation, ) -> anyhow::Result { setup_logging(); @@ -3877,8 +3880,12 @@ pub(crate) mod harness { // OK in a test. 
let conf: &'static PageServerConf = Box::leak(Box::new(conf)); - let tenant_id = TenantId::generate(); - let tenant_shard_id = TenantShardId::unsharded(tenant_id); + let shard = shard_identity.shard_index(); + let tenant_shard_id = TenantShardId { + tenant_id, + shard_number: shard.shard_number, + shard_count: shard.shard_count, + }; fs::create_dir_all(conf.tenant_path(&tenant_shard_id))?; fs::create_dir_all(conf.timelines_path(&tenant_shard_id))?; @@ -3896,8 +3903,8 @@ pub(crate) mod harness { conf, tenant_conf, tenant_shard_id, - generation: Generation::new(0xdeadbeef), - shard: ShardIndex::unsharded(), + generation, + shard, remote_storage, remote_fs_dir, deletion_queue, @@ -3912,8 +3919,15 @@ pub(crate) mod harness { compaction_period: Duration::ZERO, ..TenantConf::default() }; - - Self::create_custom(test_name, tenant_conf) + let tenant_id = TenantId::generate(); + let shard = ShardIdentity::unsharded(); + Self::create_custom( + test_name, + tenant_conf, + tenant_id, + shard, + Generation::new(0xdeadbeef), + ) } pub fn span(&self) -> tracing::Span { @@ -4037,6 +4051,7 @@ mod tests { use tests::storage_layer::ValuesReconstructState; use tests::timeline::{GetVectoredError, ShutdownMode}; use utils::bin_ser::BeSer; + use utils::id::TenantId; static TEST_KEY: Lazy = Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001"))); @@ -4936,7 +4951,13 @@ mod tests { ..TenantConf::default() }; - let harness = TenantHarness::create_custom("test_get_vectored_key_gap", tenant_conf)?; + let harness = TenantHarness::create_custom( + "test_get_vectored_key_gap", + tenant_conf, + TenantId::generate(), + ShardIdentity::unsharded(), + Generation::new(0xdeadbeef), + )?; let (tenant, ctx) = harness.load().await; let mut current_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 1b3802840f..999e2e8679 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -478,6 +478,23 @@ impl DeltaLayerWriterInner { key_end: Key, timeline: &Arc, ctx: &RequestContext, + ) -> anyhow::Result { + let temp_path = self.path.clone(); + let result = self.finish0(key_end, timeline, ctx).await; + if result.is_err() { + tracing::info!(%temp_path, "cleaning up temporary file after error during writing"); + if let Err(e) = std::fs::remove_file(&temp_path) { + tracing::warn!(error=%e, %temp_path, "error cleaning up temporary layer file after error during writing"); + } + } + result + } + + async fn finish0( + self, + key_end: Key, + timeline: &Arc, + ctx: &RequestContext, ) -> anyhow::Result { let index_start_blk = ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; @@ -651,19 +668,11 @@ impl DeltaLayerWriter { timeline: &Arc, ctx: &RequestContext, ) -> anyhow::Result { - let inner = self.inner.take().unwrap(); - let temp_path = inner.path.clone(); - let result = inner.finish(key_end, timeline, ctx).await; - // The delta layer files can sometimes be really large. Clean them up. 
- if result.is_err() { - tracing::warn!( - "Cleaning up temporary delta file {temp_path} after error during writing" - ); - if let Err(e) = std::fs::remove_file(&temp_path) { - tracing::warn!("Error cleaning up temporary delta layer file {temp_path}: {e:?}") - } - } - result + self.inner + .take() + .unwrap() + .finish(key_end, timeline, ctx) + .await } } diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 8394b33f19..285618b146 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -917,26 +917,57 @@ impl Drop for ImageLayerWriter { #[cfg(test)] mod test { + use std::time::Duration; + use bytes::Bytes; use pageserver_api::{ key::Key, shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}, }; - use utils::{id::TimelineId, lsn::Lsn}; + use utils::{ + generation::Generation, + id::{TenantId, TimelineId}, + lsn::Lsn, + }; - use crate::{tenant::harness::TenantHarness, DEFAULT_PG_VERSION}; + use crate::{ + tenant::{config::TenantConf, harness::TenantHarness}, + DEFAULT_PG_VERSION, + }; use super::ImageLayerWriter; #[tokio::test] async fn image_layer_rewrite() { - let harness = TenantHarness::create("test_image_layer_rewrite").unwrap(); - let (tenant, ctx) = harness.load().await; - + let tenant_conf = TenantConf { + gc_period: Duration::ZERO, + compaction_period: Duration::ZERO, + ..TenantConf::default() + }; + let tenant_id = TenantId::generate(); + let mut gen = Generation::new(0xdead0001); + let mut get_next_gen = || { + let ret = gen; + gen = gen.next(); + ret + }; // The LSN at which we will create an image layer to filter let lsn = Lsn(0xdeadbeef0000); - let timeline_id = TimelineId::generate(); + + // + // Create an unsharded parent with a layer. + // + + let harness = TenantHarness::create_custom( + "test_image_layer_rewrite--parent", + tenant_conf.clone(), + tenant_id, + ShardIdentity::unsharded(), + get_next_gen(), + ) + .unwrap(); + let (tenant, ctx) = harness.load().await; let timeline = tenant .create_test_timeline(timeline_id, lsn, DEFAULT_PG_VERSION, &ctx) .await @@ -971,9 +1002,47 @@ mod test { }; let original_size = resident.metadata().file_size; + // + // Create child shards and do the rewrite, exercising filter(). + // TODO: abstraction in TenantHarness for splits. + // + // Filter for various shards: this exercises cases like values at start of key range, end of key // range, middle of key range. - for shard_number in 0..4 { + let shard_count = ShardCount::new(4); + for shard_number in 0..shard_count.count() { + // + // mimic the shard split + // + let shard_identity = ShardIdentity::new( + ShardNumber(shard_number), + shard_count, + ShardStripeSize(0x8000), + ) + .unwrap(); + let harness = TenantHarness::create_custom( + Box::leak(Box::new(format!( + "test_image_layer_rewrite--child{}", + shard_identity.shard_slug() + ))), + tenant_conf.clone(), + tenant_id, + shard_identity, + // NB: in reality, the shards would each fork off their own gen number sequence from the parent. + // But here, all we care about is that the gen number is unique. 
+ get_next_gen(), + ) + .unwrap(); + let (tenant, ctx) = harness.load().await; + let timeline = tenant + .create_test_timeline(timeline_id, lsn, DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + // + // use filter() and make assertions + // + let mut filtered_writer = ImageLayerWriter::new( harness.conf, timeline_id, @@ -985,15 +1054,6 @@ mod test { .await .unwrap(); - // TenantHarness gave us an unsharded tenant, but we'll use a sharded ShardIdentity - // to exercise filter() - let shard_identity = ShardIdentity::new( - ShardNumber(shard_number), - ShardCount::new(4), - ShardStripeSize(0x8000), - ) - .unwrap(); - let wrote_keys = resident .filter(&shard_identity, &mut filtered_writer, &ctx) .await diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 1ec13882da..18f9ba4ef8 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -277,9 +277,10 @@ impl Layer { let downloaded = resident.expect("just initialized"); - // if the rename works, the path is as expected - // TODO: sync system call - std::fs::rename(temp_path, owner.local_path()) + // We never want to overwrite an existing file, so we use `RENAME_NOREPLACE`. + // TODO: this leaves the temp file in place if the rename fails, risking us running + // out of space. Should we clean it up here or does the calling context deal with this? + utils::fs_ext::rename_noreplace(temp_path.as_std_path(), owner.local_path().as_std_path()) .with_context(|| format!("rename temporary file as correct path for {owner}"))?; Ok(ResidentLayer { downloaded, owner }) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 15c77d0316..d8de6aee7c 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -421,48 +421,6 @@ impl Timeline { return Ok(CompactLevel0Phase1Result::default()); } - // This failpoint is used together with `test_duplicate_layers` integration test. - // It returns the compaction result exactly the same layers as input to compaction. - // We want to ensure that this will not cause any problem when updating the layer map - // after the compaction is finished. - // - // Currently, there are two rare edge cases that will cause duplicated layers being - // inserted. - // 1. The compaction job is inturrupted / did not finish successfully. Assume we have file 1, 2, 3, 4, which - // is compacted to 5, but the page server is shut down, next time we start page server we will get a layer - // map containing 1, 2, 3, 4, and 5, whereas 5 has the same content as 4. If we trigger L0 compation at this - // point again, it is likely that we will get a file 6 which has the same content and the key range as 5, - // and this causes an overwrite. This is acceptable because the content is the same, and we should do a - // layer replace instead of the normal remove / upload process. - // 2. The input workload pattern creates exactly n files that are sorted, non-overlapping and is of target file - // size length. Compaction will likely create the same set of n files afterwards. - // - // This failpoint is a superset of both of the cases. 
- if cfg!(feature = "testing") { - let active = (|| { - ::fail::fail_point!("compact-level0-phase1-return-same", |_| true); - false - })(); - - if active { - let mut new_layers = Vec::with_capacity(level0_deltas.len()); - for delta in &level0_deltas { - // we are just faking these layers as being produced again for this failpoint - new_layers.push( - delta - .download_and_keep_resident() - .await - .context("download layer for failpoint")?, - ); - } - tracing::info!("compact-level0-phase1-return-same"); // so that we can check if we hit the failpoint - return Ok(CompactLevel0Phase1Result { - new_layers, - deltas_to_compact: level0_deltas, - }); - } - } - // Gather the files to compact in this iteration. // // Start with the oldest Level 0 delta file, and collect any other diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index b2e4d35cb8..49dcb9b86a 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -238,7 +238,7 @@ def test_uploads_and_deletions( # https://github.com/neondatabase/neon/issues/7707 # https://github.com/neondatabase/neon/issues/7759 allowed_errors = [ - ".*duplicated L1 layer.*", + ".*/checkpoint.*rename temporary file as correct path for.*", # EEXIST ".*delta layer created with.*duplicate values.*", ".*assertion failed: self.lsn_range.start <= lsn.*", ".*HTTP request handler task panicked: task.*panicked.*", diff --git a/test_runner/regress/test_duplicate_layers.py b/test_runner/regress/test_pageserver_crash_consistency.py similarity index 66% rename from test_runner/regress/test_duplicate_layers.py rename to test_runner/regress/test_pageserver_crash_consistency.py index 0ebb99c712..3831d2f917 100644 --- a/test_runner/regress/test_duplicate_layers.py +++ b/test_runner/regress/test_pageserver_crash_consistency.py @@ -12,42 +12,14 @@ from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind from requests.exceptions import ConnectionError -def test_duplicate_layers(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): - env = neon_env_builder.init_start() - pageserver_http = env.pageserver.http_client() - - # use a failpoint to return all L0s as L1s - message = ".*duplicated L1 layer layer=.*" - env.pageserver.allowed_errors.append(message) - - # Use aggressive compaction and checkpoint settings - tenant_id, _ = env.neon_cli.create_tenant( - conf={ - "checkpoint_distance": f"{1024 ** 2}", - "compaction_target_size": f"{1024 ** 2}", - "compaction_period": "5 s", - "compaction_threshold": "3", - } - ) - - pageserver_http.configure_failpoints(("compact-level0-phase1-return-same", "return")) - - endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) - connstr = endpoint.connstr(options="-csynchronous_commit=off") - pg_bin.run_capture(["pgbench", "-i", "-s1", connstr]) - - time.sleep(10) # let compaction to be performed - env.pageserver.assert_log_contains("compact-level0-phase1-return-same") - - -def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): +def test_local_only_layers_after_crash(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): """ - Test sets fail point at the end of first compaction phase: after - flushing new L1 layer but before deletion of L0 layers. + Test case for docs/rfcs/027-crash-consistent-layer-map-through-index-part.md. - The L1 used to be overwritten, but with crash-consistency via remote - index_part.json, we end up deleting the not yet uploaded L1 layer on - startup. 
+ Simulate crash after compaction has written layers to disk + but before they have been uploaded/linked into remote index_part.json. + + Startup handles this situation by deleting the not yet uploaded L1 layer files. """ neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) @@ -126,13 +98,6 @@ def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) # give time for log flush time.sleep(1) - message = f".*duplicated L1 layer layer={l1_found}" - found_msg = env.pageserver.log_contains(message) - # resident or evicted, it should not be overwritten, however it should had been non-existing at startup - assert ( - found_msg is None - ), "layer should had been removed during startup, did it live on as evicted?" - assert env.pageserver.layer_exists(tenant_id, timeline_id, l1_found), "the L1 reappears" wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id) @@ -141,3 +106,6 @@ def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) tenant_id, timeline_id, l1_found.to_str() ) assert uploaded.exists(), "the L1 is uploaded" + + +# TODO: same test for L0s produced by ingest. diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index 759e845927..4ce53df214 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -163,11 +163,6 @@ def test_pageserver_chaos( env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) - # these can happen, if we shutdown at a good time. to be fixed as part of #5172. - message = ".*duplicated L1 layer layer=.*" - for ps in env.pageservers: - ps.allowed_errors.append(message) - # Use a tiny checkpoint distance, to create a lot of layers quickly. # That allows us to stress the compaction and layer flushing logic more. tenant, _ = env.neon_cli.create_tenant( diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 9b9bdb2b08..5bfa9cce8c 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -100,10 +100,6 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): ] ) - # these can happen, if we shutdown at a good time. to be fixed as part of #5172. - message = ".*duplicated L1 layer layer=.*" - ps.allowed_errors.append(message) - workload = Workload(env, tenant_id, timeline_id) workload.init(env.pageservers[0].id) workload.write_rows(256, env.pageservers[0].id) From 3d6e389aa2d04baf6adc60584951811df77da8c7 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 4 Jun 2024 19:36:22 +0300 Subject: [PATCH 0924/1571] feat: support changing IndexPart::metadata_bytes to json in future release (#7693) ## Problem Currently we serialize the `TimelineMetadata` into bytes to put it into `index_part.json`. This `Vec` (hopefully `[u8; 512]`) representation was chosen because of problems serializing TimelineId and Lsn between different serializers (bincode, json). After #5335, the serialization of those types became serialization format aware or format agnostic. We've removed the pageserver local `metadata` file writing in #6769. ## Summary of changes Allow switching from the current serialization format to plain JSON for the legacy TimelineMetadata format in the future by adding a competitive serialization method to the current one (`crate::tenant::metadata::modern_serde`), which accepts both old bytes and new plain JSON. 
The benefits of this are that dumping the index_part.json with pretty printing no longer produces more than 500 lines of output, but after enabling it produces lines only proportional to the layer count, like: ```json { "version": ???, "layer_metadata": { ... }, "disk_consistent_lsn": "0/15FD5D8", "legacy_metadata": { "disk_consistent_lsn": "0/15FD5D8", "prev_record_lsn": "0/15FD5A0", "ancestor_timeline": null, "ancestor_lsn": "0/0", "latest_gc_cutoff_lsn": "0/149FD18", "initdb_lsn": "0/149FD18", "pg_version": 15 } } ``` In the future, I propose we completely stop using this legacy metadata type and wasting time trying to come up with another version numbering scheme in addition to the informative-only one already found in `index_part.json`, and go ahead with storing metadata or feature flags on the `index_part.json` itself. #7699 is the "one release after" changes which starts to produce metadata in the index_part.json as json. --- pageserver/src/tenant/metadata.rs | 158 +++++++++++++++++- .../tenant/remote_timeline_client/index.rs | 6 +- 2 files changed, 159 insertions(+), 5 deletions(-) diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index fc71ea7642..c00672895a 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -267,7 +267,7 @@ impl<'de> Deserialize<'de> for TimelineMetadata { D: serde::Deserializer<'de>, { let bytes = Vec::::deserialize(deserializer)?; - Self::from_bytes(bytes.as_slice()).map_err(|e| D::Error::custom(format!("{e}"))) + Self::from_bytes(bytes.as_slice()).map_err(D::Error::custom) } } @@ -276,13 +276,163 @@ impl Serialize for TimelineMetadata { where S: Serializer, { - let bytes = self - .to_bytes() - .map_err(|e| serde::ser::Error::custom(format!("{e}")))?; + let bytes = self.to_bytes().map_err(serde::ser::Error::custom)?; bytes.serialize(serializer) } } +pub(crate) mod modern_serde { + use crate::tenant::metadata::METADATA_FORMAT_VERSION; + + use super::{ + TimelineMetadata, TimelineMetadataBodyV2, TimelineMetadataHeader, METADATA_HDR_SIZE, + }; + use serde::{Deserialize, Serialize}; + + pub(crate) fn deserialize<'de, D>(deserializer: D) -> Result + where + D: serde::de::Deserializer<'de>, + { + // for legacy reasons versions 1-5 had TimelineMetadata serialized as a Vec field with + // BeSer. + struct Visitor; + + impl<'d> serde::de::Visitor<'d> for Visitor { + type Value = TimelineMetadata; + + fn expecting(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + f.write_str("BeSer bytes or json structure") + } + + fn visit_seq(self, seq: A) -> Result + where + A: serde::de::SeqAccess<'d>, + { + use serde::de::Error; + let de = serde::de::value::SeqAccessDeserializer::new(seq); + Vec::::deserialize(de) + .map(|v| TimelineMetadata::from_bytes(&v).map_err(A::Error::custom))? 
+ } + + fn visit_map(self, map: A) -> Result + where + A: serde::de::MapAccess<'d>, + { + use serde::de::Error; + + let de = serde::de::value::MapAccessDeserializer::new(map); + let body = TimelineMetadataBodyV2::deserialize(de)?; + + // jump through hoops to calculate the crc32 so that TimelineMetadata::ne works + // across serialization versions + let mut sink = Crc32Sink::default(); + ::ser_into(&body, &mut sink) + .map_err(|e| A::Error::custom(Crc32CalculationFailed(e)))?; + + let size = METADATA_HDR_SIZE + sink.count; + + Ok(TimelineMetadata { + hdr: TimelineMetadataHeader { + checksum: sink.crc, + size: size as u16, + format_version: METADATA_FORMAT_VERSION, + }, + body, + }) + } + } + + deserializer.deserialize_any(Visitor) + } + + #[derive(Default)] + struct Crc32Sink { + crc: u32, + count: usize, + } + + impl std::io::Write for Crc32Sink { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + self.crc = crc32c::crc32c_append(self.crc, buf); + self.count += buf.len(); + Ok(buf.len()) + } + + fn flush(&mut self) -> std::io::Result<()> { + Ok(()) + } + } + + #[derive(thiserror::Error)] + #[error("re-serializing for crc32 failed")] + struct Crc32CalculationFailed(#[source] E); + + // this should be true for one release, after that we can change it to false + // remember to check the IndexPart::metadata field TODO comment as well + const LEGACY_BINCODED_BYTES: bool = true; + + #[derive(serde::Serialize)] + #[serde(transparent)] + struct LegacyPaddedBytes<'a>(&'a TimelineMetadata); + + struct JustTheBodyV2<'a>(&'a TimelineMetadata); + + impl serde::Serialize for JustTheBodyV2<'_> { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + // header is not needed, upon reading we've upgraded all v1 to v2 + self.0.body.serialize(serializer) + } + } + + pub(crate) fn serialize( + metadata: &TimelineMetadata, + serializer: S, + ) -> Result + where + S: serde::Serializer, + { + // we cannot use TimelineMetadata::serialize for now because it'll do + // TimelineMetadata::to_bytes + if LEGACY_BINCODED_BYTES { + LegacyPaddedBytes(metadata).serialize(serializer) + } else { + JustTheBodyV2(metadata).serialize(serializer) + } + } + + #[test] + fn deserializes_bytes_as_well_as_equivalent_body_v2() { + #[derive(serde::Deserialize, serde::Serialize)] + struct Wrapper(#[serde(deserialize_with = "deserialize")] TimelineMetadata); + + let too_many_bytes = "[216,111,252,208,0,54,0,4,0,0,0,0,1,73,253,144,1,0,0,0,0,1,73,253,24,0,0,0,0,0,0,0,0,0,0,0,0,0,1,73,253,24,0,0,0,0,1,73,253,24,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]"; + + let wrapper_from_bytes = serde_json::from_str::(too_many_bytes).unwrap(); + + let 
serialized = serde_json::to_value(JustTheBodyV2(&wrapper_from_bytes.0)).unwrap(); + + assert_eq!( + serialized, + serde_json::json! {{ + "disk_consistent_lsn": "0/149FD90", + "prev_record_lsn": "0/149FD18", + "ancestor_timeline": null, + "ancestor_lsn": "0/0", + "latest_gc_cutoff_lsn": "0/149FD18", + "initdb_lsn": "0/149FD18", + "pg_version": 15 + }} + ); + + let wrapper_from_json = serde_json::value::from_value::(serialized).unwrap(); + + assert_eq!(wrapper_from_bytes.0, wrapper_from_json.0); + } +} + /// Parts of the metadata which are regularly modified. pub(crate) struct MetadataUpdate { disk_consistent_lsn: Lsn, diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 6494261312..7d2e9b9a91 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -43,7 +43,11 @@ pub struct IndexPart { // private because internally we would read from metadata instead. pub(super) disk_consistent_lsn: Lsn, - #[serde(rename = "metadata_bytes")] + // TODO: later make this "rename" to "alias", rename field as "legacy_metadata" + #[serde( + rename = "metadata_bytes", + with = "crate::tenant::metadata::modern_serde" + )] pub metadata: TimelineMetadata, #[serde(default)] From 1a8d53ab9d8e3a3e0c4ea147ea4992e91a4ef9b6 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Tue, 4 Jun 2024 13:47:48 -0400 Subject: [PATCH 0925/1571] feat(pageserver): compute aux file size on initial logical size calculation (#7958) close https://github.com/neondatabase/neon/issues/7822 close https://github.com/neondatabase/neon/issues/7443 Aux file metrics is computed incrementally. If the size is not initialized, the metrics will never show up. This pull request adds the functionality to compute the aux file size on initial logical size calculation. 
Signed-off-by: Alex Chi Z --- pageserver/src/aux_file.rs | 3 ++- pageserver/src/pgdatadir_mapping.rs | 14 +++++++++++++- pageserver/src/tenant/timeline.rs | 16 ++++++++++------ 3 files changed, 25 insertions(+), 8 deletions(-) diff --git a/pageserver/src/aux_file.rs b/pageserver/src/aux_file.rs index 38e1875db1..5e527b7d61 100644 --- a/pageserver/src/aux_file.rs +++ b/pageserver/src/aux_file.rs @@ -178,7 +178,8 @@ impl AuxFileSizeEstimator { } } - pub fn on_base_backup(&self, new_size: usize) { + /// When generating base backup or doing initial logical size calculation + pub fn on_initial(&self, new_size: usize) { let mut guard = self.size.lock().unwrap(); *guard = Some(new_size as isize); self.report(new_size as isize); diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 5eaf80bdaf..0bff4be150 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -718,10 +718,22 @@ impl Timeline { result.insert(fname, content); } } - self.aux_file_size_estimator.on_base_backup(sz); + self.aux_file_size_estimator.on_initial(sz); Ok(result) } + pub(crate) async fn trigger_aux_file_size_computation( + &self, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result<(), PageReconstructError> { + let current_policy = self.last_aux_file_policy.load(); + if let Some(AuxFilePolicy::V2) | Some(AuxFilePolicy::CrossValidation) = current_policy { + self.list_aux_files_v2(lsn, ctx).await?; + } + Ok(()) + } + pub(crate) async fn list_aux_files( &self, lsn: Lsn, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 35e6d1f92f..4c46c4e635 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2787,17 +2787,21 @@ impl Timeline { crate::metrics::initial_logical_size::START_CALCULATION.retry(circumstances) }; - match self_ref + let calculated_size = self_ref .logical_size_calculation_task( initial_part_end, LogicalSizeCalculationCause::Initial, background_ctx, ) - .await - { - Ok(calculated_size) => Ok((calculated_size, metrics_guard)), - Err(e) => Err(e), - } + .await?; + + self_ref + .trigger_aux_file_size_computation(initial_part_end, background_ctx) + .await?; + + // TODO: add aux file size to logical size + + Ok((calculated_size, metrics_guard)) } }; From 85ef6b16459bc344756f25ef62dd90591231242d Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Wed, 5 Jun 2024 10:32:03 +0200 Subject: [PATCH 0926/1571] upgrade pgvector from 0.7.0 to 0.7.1 (#7954) ## Problem ## Summary of changes performance improvements in pgvector 0.7.1 for hnsw index builds, see https://github.com/pgvector/pgvector/issues/570 --- Dockerfile.compute-node | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index db3734047e..90b8868b43 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -246,8 +246,8 @@ COPY patches/pgvector.patch /pgvector.patch # By default, pgvector Makefile uses `-march=native`. We don't want that, # because we build the images on different machines than where we run them. # Pass OPTFLAGS="" to remove it. 
-RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.0.tar.gz -O pgvector.tar.gz && \ - echo "1b5503a35c265408b6eb282621c5e1e75f7801afc04eecb950796cfee2e3d1d8 pgvector.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.1.tar.gz -O pgvector.tar.gz && \ + echo "fe6c8cb4e0cd1a8cb60f5badf9e1701e0fcabcfc260931c26d01e155c4dd21d1 pgvector.tar.gz" | sha256sum --check && \ mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \ patch -p1 < /pgvector.patch && \ make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ From 83ab14e27119ffdfef6ce0f5cd883b847de8c24a Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 5 Jun 2024 14:21:10 +0200 Subject: [PATCH 0927/1571] chore!: remove walredo_process_kind config option & kind type (#7756) refs https://github.com/neondatabase/neon/issues/7753 Preceding PR https://github.com/neondatabase/neon/pull/7754 laid out the plan, this one wraps it up. --- pageserver/src/config.rs | 21 - pageserver/src/walredo.rs | 48 ++- pageserver/src/walredo/process.rs | 397 +++++++++++++++--- .../process/process_impl/process_async.rs | 374 ----------------- 4 files changed, 374 insertions(+), 466 deletions(-) delete mode 100644 pageserver/src/walredo/process/process_impl/process_async.rs diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index b0afb6414b..b4a0d1ac02 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -99,8 +99,6 @@ pub mod defaults { pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0; - pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "async"; - /// /// Default built-in configuration file. /// @@ -146,8 +144,6 @@ pub mod defaults { #validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}' -#walredo_process_kind = '{DEFAULT_WALREDO_PROCESS_KIND}' - [tenant_config] #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} @@ -300,8 +296,6 @@ pub struct PageServerConf { /// /// Setting this to zero disables limits on total ephemeral layer size. pub ephemeral_bytes_per_memory_kb: usize, - - pub walredo_process_kind: crate::walredo::ProcessKind, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -407,8 +401,6 @@ struct PageServerConfigBuilder { validate_vectored_get: BuilderValue, ephemeral_bytes_per_memory_kb: BuilderValue, - - walredo_process_kind: BuilderValue, } impl PageServerConfigBuilder { @@ -497,8 +489,6 @@ impl PageServerConfigBuilder { )), validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET), ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), - - walredo_process_kind: Set(DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap()), } } } @@ -686,10 +676,6 @@ impl PageServerConfigBuilder { self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value); } - pub fn get_walredo_process_kind(&mut self, value: crate::walredo::ProcessKind) { - self.walredo_process_kind = BuilderValue::Set(value); - } - pub fn build(self) -> anyhow::Result { let default = Self::default_values(); @@ -747,7 +733,6 @@ impl PageServerConfigBuilder { max_vectored_read_bytes, validate_vectored_get, ephemeral_bytes_per_memory_kb, - walredo_process_kind, } CUSTOM LOGIC { @@ -1044,9 +1029,6 @@ impl PageServerConf { "ephemeral_bytes_per_memory_kb" => { builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? 
as usize) } - "walredo_process_kind" => { - builder.get_walredo_process_kind(parse_toml_from_str("walredo_process_kind", item)?) - } _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -1130,7 +1112,6 @@ impl PageServerConf { ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, - walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(), } } } @@ -1370,7 +1351,6 @@ background_task_maximum_delay = '334 s' ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, - walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(), }, "Correct defaults should be used when no config values are provided" ); @@ -1444,7 +1424,6 @@ background_task_maximum_delay = '334 s' ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, - walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(), }, "Should be able to parse all basic config values correctly" ); diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index d660b68a34..d562540bde 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -20,7 +20,6 @@ /// Process lifecycle and abstracction for the IPC protocol. mod process; -pub use process::Kind as ProcessKind; /// Code to apply [`NeonWalRecord`]s. pub(crate) mod apply_neon; @@ -54,7 +53,7 @@ pub struct PostgresRedoManager { tenant_shard_id: TenantShardId, conf: &'static PageServerConf, last_redo_at: std::sync::Mutex>, - /// The current [`process::Process`] that is used by new redo requests. + /// The current [`process::WalRedoProcess`] that is used by new redo requests. /// We use [`heavier_once_cell`] for coalescing the spawning, but the redo /// requests don't use the [`heavier_once_cell::Guard`] to keep ahold of the /// their process object; we use [`Arc::clone`] for that. @@ -66,7 +65,7 @@ pub struct PostgresRedoManager { /// still be using the old redo process. But, those other tasks will most likely /// encounter an error as well, and errors are an unexpected condition anyway. /// So, probably we could get rid of the `Arc` in the future. 
- redo_process: heavier_once_cell::OnceCell>, + redo_process: heavier_once_cell::OnceCell>, } /// @@ -211,26 +210,31 @@ impl PostgresRedoManager { const MAX_RETRY_ATTEMPTS: u32 = 1; let mut n_attempts = 0u32; loop { - let proc: Arc = match self.redo_process.get_or_init_detached().await { - Ok(guard) => Arc::clone(&guard), - Err(permit) => { - // don't hold poison_guard, the launch code can bail - let start = Instant::now(); - let proc = Arc::new( - process::Process::launch(self.conf, self.tenant_shard_id, pg_version) + let proc: Arc = + match self.redo_process.get_or_init_detached().await { + Ok(guard) => Arc::clone(&guard), + Err(permit) => { + // don't hold poison_guard, the launch code can bail + let start = Instant::now(); + let proc = Arc::new( + process::WalRedoProcess::launch( + self.conf, + self.tenant_shard_id, + pg_version, + ) .context("launch walredo process")?, - ); - let duration = start.elapsed(); - WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64()); - info!( - duration_ms = duration.as_millis(), - pid = proc.id(), - "launched walredo process" - ); - self.redo_process.set(Arc::clone(&proc), permit); - proc - } - }; + ); + let duration = start.elapsed(); + WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64()); + info!( + duration_ms = duration.as_millis(), + pid = proc.id(), + "launched walredo process" + ); + self.redo_process.set(Arc::clone(&proc), permit); + proc + } + }; let started_at = std::time::Instant::now(); diff --git a/pageserver/src/walredo/process.rs b/pageserver/src/walredo/process.rs index 02c9c04bf1..5b0af334ee 100644 --- a/pageserver/src/walredo/process.rs +++ b/pageserver/src/walredo/process.rs @@ -1,64 +1,184 @@ -/// Layer of indirection previously used to support multiple implementations. -/// Subject to removal: -use std::time::Duration; - -use bytes::Bytes; -use pageserver_api::{reltag::RelTag, shard::TenantShardId}; -use tracing::warn; -use utils::lsn::Lsn; - -use crate::{config::PageServerConf, walrecord::NeonWalRecord}; - mod no_leak_child; /// The IPC protocol that pageserver and walredo process speak over their shared pipe. mod protocol; -mod process_impl { - pub(super) mod process_async; +use self::no_leak_child::NoLeakChild; +use crate::{ + config::PageServerConf, + metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER}, + walrecord::NeonWalRecord, +}; +use anyhow::Context; +use bytes::Bytes; +use pageserver_api::{reltag::RelTag, shard::TenantShardId}; +use postgres_ffi::BLCKSZ; +#[cfg(feature = "testing")] +use std::sync::atomic::AtomicUsize; +use std::{ + collections::VecDeque, + process::{Command, Stdio}, + time::Duration, +}; +use tokio::io::{AsyncReadExt, AsyncWriteExt}; +use tracing::{debug, error, instrument, Instrument}; +use utils::{lsn::Lsn, poison::Poison}; + +pub struct WalRedoProcess { + #[allow(dead_code)] + conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, + // Some() on construction, only becomes None on Drop. + child: Option, + stdout: tokio::sync::Mutex>, + stdin: tokio::sync::Mutex>, + /// Counter to separate same sized walredo inputs failing at the same millisecond. 
+ #[cfg(feature = "testing")] + dump_sequence: AtomicUsize, } -#[derive( - Clone, - Copy, - Debug, - PartialEq, - Eq, - strum_macros::EnumString, - strum_macros::Display, - strum_macros::IntoStaticStr, - serde_with::DeserializeFromStr, - serde_with::SerializeDisplay, -)] -#[strum(serialize_all = "kebab-case")] -#[repr(u8)] -pub enum Kind { - Sync, - Async, +struct ProcessInput { + stdin: tokio::process::ChildStdin, + n_requests: usize, } -pub(crate) struct Process(process_impl::process_async::WalRedoProcess); +struct ProcessOutput { + stdout: tokio::process::ChildStdout, + pending_responses: VecDeque>, + n_processed_responses: usize, +} -impl Process { - #[inline(always)] - pub fn launch( +impl WalRedoProcess { + // + // Start postgres binary in special WAL redo mode. + // + #[instrument(skip_all,fields(pg_version=pg_version))] + pub(crate) fn launch( conf: &'static PageServerConf, tenant_shard_id: TenantShardId, pg_version: u32, ) -> anyhow::Result { - if conf.walredo_process_kind != Kind::Async { - warn!( - configured = %conf.walredo_process_kind, - "the walredo_process_kind setting has been turned into a no-op, using async implementation" - ); - } - Ok(Self(process_impl::process_async::WalRedoProcess::launch( + crate::span::debug_assert_current_span_has_tenant_id(); + + let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible. + let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?; + + use no_leak_child::NoLeakChildCommandExt; + // Start postgres itself + let child = Command::new(pg_bin_dir_path.join("postgres")) + // the first arg must be --wal-redo so the child process enters into walredo mode + .arg("--wal-redo") + // the child doesn't process this arg, but, having it in the argv helps indentify the + // walredo process for a particular tenant when debugging a pagserver + .args(["--tenant-shard-id", &format!("{tenant_shard_id}")]) + .stdin(Stdio::piped()) + .stderr(Stdio::piped()) + .stdout(Stdio::piped()) + .env_clear() + .env("LD_LIBRARY_PATH", &pg_lib_dir_path) + .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) + // NB: The redo process is not trusted after we sent it the first + // walredo work. Before that, it is trusted. Specifically, we trust + // it to + // 1. close all file descriptors except stdin, stdout, stderr because + // pageserver might not be 100% diligent in setting FD_CLOEXEC on all + // the files it opens, and + // 2. to use seccomp to sandbox itself before processing the first + // walredo request. + .spawn_no_leak_child(tenant_shard_id) + .context("spawn process")?; + WAL_REDO_PROCESS_COUNTERS.started.inc(); + let mut child = scopeguard::guard(child, |child| { + error!("killing wal-redo-postgres process due to a problem during launch"); + child.kill_and_wait(WalRedoKillCause::Startup); + }); + + let stdin = child.stdin.take().unwrap(); + let stdout = child.stdout.take().unwrap(); + let stderr = child.stderr.take().unwrap(); + let stderr = tokio::process::ChildStderr::from_std(stderr) + .context("convert to tokio::ChildStderr")?; + let stdin = + tokio::process::ChildStdin::from_std(stdin).context("convert to tokio::ChildStdin")?; + let stdout = tokio::process::ChildStdout::from_std(stdout) + .context("convert to tokio::ChildStdout")?; + + // all fallible operations post-spawn are complete, so get rid of the guard + let child = scopeguard::ScopeGuard::into_inner(child); + + tokio::spawn( + async move { + scopeguard::defer! 
{ + debug!("wal-redo-postgres stderr_logger_task finished"); + crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc(); + } + debug!("wal-redo-postgres stderr_logger_task started"); + crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc(); + + use tokio::io::AsyncBufReadExt; + let mut stderr_lines = tokio::io::BufReader::new(stderr); + let mut buf = Vec::new(); + let res = loop { + buf.clear(); + // TODO we don't trust the process to cap its stderr length. + // Currently it can do unbounded Vec allocation. + match stderr_lines.read_until(b'\n', &mut buf).await { + Ok(0) => break Ok(()), // eof + Ok(num_bytes) => { + let output = String::from_utf8_lossy(&buf[..num_bytes]); + error!(%output, "received output"); + } + Err(e) => { + break Err(e); + } + } + }; + match res { + Ok(()) => (), + Err(e) => { + error!(error=?e, "failed to read from walredo stderr"); + } + } + }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version)) + ); + + Ok(Self { conf, tenant_shard_id, - pg_version, - )?)) + child: Some(child), + stdin: tokio::sync::Mutex::new(Poison::new( + "stdin", + ProcessInput { + stdin, + n_requests: 0, + }, + )), + stdout: tokio::sync::Mutex::new(Poison::new( + "stdout", + ProcessOutput { + stdout, + pending_responses: VecDeque::new(), + n_processed_responses: 0, + }, + )), + #[cfg(feature = "testing")] + dump_sequence: AtomicUsize::default(), + }) } - #[inline(always)] + pub(crate) fn id(&self) -> u32 { + self.child + .as_ref() + .expect("must not call this during Drop") + .id() + } + + /// Apply given WAL records ('records') over an old page image. Returns + /// new page image. + /// + /// # Cancel-Safety + /// + /// Cancellation safe. + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))] pub(crate) async fn apply_wal_records( &self, rel: RelTag, @@ -67,12 +187,191 @@ impl Process { records: &[(Lsn, NeonWalRecord)], wal_redo_timeout: Duration, ) -> anyhow::Result { - self.0 - .apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout) - .await + let tag = protocol::BufferTag { rel, blknum }; + + // Serialize all the messages to send the WAL redo process first. + // + // This could be problematic if there are millions of records to replay, + // but in practice the number of records is usually so small that it doesn't + // matter, and it's better to keep this code simple. + // + // Most requests start with a before-image with BLCKSZ bytes, followed by + // by some other WAL records. Start with a buffer that can hold that + // comfortably. 
+ let mut writebuf: Vec = Vec::with_capacity((BLCKSZ as usize) * 3); + protocol::build_begin_redo_for_block_msg(tag, &mut writebuf); + if let Some(img) = base_img { + protocol::build_push_page_msg(tag, img, &mut writebuf); + } + for (lsn, rec) in records.iter() { + if let NeonWalRecord::Postgres { + will_init: _, + rec: postgres_rec, + } = rec + { + protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf); + } else { + anyhow::bail!("tried to pass neon wal record to postgres WAL redo"); + } + } + protocol::build_get_page_msg(tag, &mut writebuf); + WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64); + + let Ok(res) = + tokio::time::timeout(wal_redo_timeout, self.apply_wal_records0(&writebuf)).await + else { + anyhow::bail!("WAL redo timed out"); + }; + + if res.is_err() { + // not all of these can be caused by this particular input, however these are so rare + // in tests so capture all. + self.record_and_log(&writebuf); + } + + res } - pub(crate) fn id(&self) -> u32 { - self.0.id() + /// # Cancel-Safety + /// + /// When not polled to completion (e.g. because in `tokio::select!` another + /// branch becomes ready before this future), concurrent and subsequent + /// calls may fail due to [`utils::poison::Poison::check_and_arm`] calls. + /// Dispose of this process instance and create a new one. + async fn apply_wal_records0(&self, writebuf: &[u8]) -> anyhow::Result { + let request_no = { + let mut lock_guard = self.stdin.lock().await; + let mut poison_guard = lock_guard.check_and_arm()?; + let input = poison_guard.data_mut(); + input + .stdin + .write_all(writebuf) + .await + .context("write to walredo stdin")?; + let request_no = input.n_requests; + input.n_requests += 1; + poison_guard.disarm(); + request_no + }; + + // To improve walredo performance we separate sending requests and receiving + // responses. Them are protected by different mutexes (output and input). + // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process + // then there is not warranty that T1 will first granted output mutex lock. + // To address this issue we maintain number of sent requests, number of processed + // responses and ring buffer with pending responses. After sending response + // (under input mutex), threads remembers request number. Then it releases + // input mutex, locks output mutex and fetch in ring buffer all responses until + // its stored request number. The it takes correspondent element from + // pending responses ring buffer and truncate all empty elements from the front, + // advancing processed responses number. + + let mut lock_guard = self.stdout.lock().await; + let mut poison_guard = lock_guard.check_and_arm()?; + let output = poison_guard.data_mut(); + let n_processed_responses = output.n_processed_responses; + while n_processed_responses + output.pending_responses.len() <= request_no { + // We expect the WAL redo process to respond with an 8k page image. We read it + // into this buffer. + let mut resultbuf = vec![0; BLCKSZ.into()]; + output + .stdout + .read_exact(&mut resultbuf) + .await + .context("read walredo stdout")?; + output + .pending_responses + .push_back(Some(Bytes::from(resultbuf))); + } + // Replace our request's response with None in `pending_responses`. + // Then make space in the ring buffer by clearing out any seqence of contiguous + // `None`'s from the front of `pending_responses`. 
+ // NB: We can't pop_front() because other requests' responses because another + // requester might have grabbed the output mutex before us: + // T1: grab input mutex + // T1: send request_no 23 + // T1: release input mutex + // T2: grab input mutex + // T2: send request_no 24 + // T2: release input mutex + // T2: grab output mutex + // T2: n_processed_responses + output.pending_responses.len() <= request_no + // 23 0 24 + // T2: enters poll loop that reads stdout + // T2: put response for 23 into pending_responses + // T2: put response for 24 into pending_resposnes + // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back + // T2: takes its response_24 + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: does the while loop below + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: releases output mutex + // T1: grabs output mutex + // T1: n_processed_responses + output.pending_responses.len() > request_no + // 23 2 23 + // T1: skips poll loop that reads stdout + // T1: takes its response_23 + // pending_responses now looks like this: Front None None Back + // T2: does the while loop below + // pending_responses now looks like this: Front Back + // n_processed_responses now has value 25 + let res = output.pending_responses[request_no - n_processed_responses] + .take() + .expect("we own this request_no, nobody else is supposed to take it"); + while let Some(front) = output.pending_responses.front() { + if front.is_none() { + output.pending_responses.pop_front(); + output.n_processed_responses += 1; + } else { + break; + } + } + poison_guard.disarm(); + Ok(res) + } + + #[cfg(feature = "testing")] + fn record_and_log(&self, writebuf: &[u8]) { + use std::sync::atomic::Ordering; + + let millis = std::time::SystemTime::now() + .duration_since(std::time::SystemTime::UNIX_EPOCH) + .unwrap() + .as_millis(); + + let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed); + + // these files will be collected to an allure report + let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len()); + + let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename); + + use std::io::Write; + let res = std::fs::OpenOptions::new() + .write(true) + .create_new(true) + .read(true) + .open(path) + .and_then(|mut f| f.write_all(writebuf)); + + // trip up allowed_errors + if let Err(e) = res { + tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}"); + } else { + tracing::error!(filename, "erroring walredo input saved"); + } + } + + #[cfg(not(feature = "testing"))] + fn record_and_log(&self, _: &[u8]) {} +} + +impl Drop for WalRedoProcess { + fn drop(&mut self) { + self.child + .take() + .expect("we only do this once") + .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop); + // no way to wait for stderr_logger_task from Drop because that is async only } } diff --git a/pageserver/src/walredo/process/process_impl/process_async.rs b/pageserver/src/walredo/process/process_impl/process_async.rs deleted file mode 100644 index 262858b033..0000000000 --- a/pageserver/src/walredo/process/process_impl/process_async.rs +++ /dev/null @@ -1,374 +0,0 @@ -use self::no_leak_child::NoLeakChild; -use crate::{ - config::PageServerConf, - metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER}, - walrecord::NeonWalRecord, - walredo::process::{no_leak_child, protocol}, -}; -use anyhow::Context; -use bytes::Bytes; -use 
pageserver_api::{reltag::RelTag, shard::TenantShardId}; -use postgres_ffi::BLCKSZ; -#[cfg(feature = "testing")] -use std::sync::atomic::AtomicUsize; -use std::{ - collections::VecDeque, - process::{Command, Stdio}, - time::Duration, -}; -use tokio::io::{AsyncReadExt, AsyncWriteExt}; -use tracing::{debug, error, instrument, Instrument}; -use utils::{lsn::Lsn, poison::Poison}; - -pub struct WalRedoProcess { - #[allow(dead_code)] - conf: &'static PageServerConf, - tenant_shard_id: TenantShardId, - // Some() on construction, only becomes None on Drop. - child: Option, - stdout: tokio::sync::Mutex>, - stdin: tokio::sync::Mutex>, - /// Counter to separate same sized walredo inputs failing at the same millisecond. - #[cfg(feature = "testing")] - dump_sequence: AtomicUsize, -} - -struct ProcessInput { - stdin: tokio::process::ChildStdin, - n_requests: usize, -} - -struct ProcessOutput { - stdout: tokio::process::ChildStdout, - pending_responses: VecDeque>, - n_processed_responses: usize, -} - -impl WalRedoProcess { - // - // Start postgres binary in special WAL redo mode. - // - #[instrument(skip_all,fields(pg_version=pg_version))] - pub(crate) fn launch( - conf: &'static PageServerConf, - tenant_shard_id: TenantShardId, - pg_version: u32, - ) -> anyhow::Result { - crate::span::debug_assert_current_span_has_tenant_id(); - - let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible. - let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?; - - use no_leak_child::NoLeakChildCommandExt; - // Start postgres itself - let child = Command::new(pg_bin_dir_path.join("postgres")) - // the first arg must be --wal-redo so the child process enters into walredo mode - .arg("--wal-redo") - // the child doesn't process this arg, but, having it in the argv helps indentify the - // walredo process for a particular tenant when debugging a pagserver - .args(["--tenant-shard-id", &format!("{tenant_shard_id}")]) - .stdin(Stdio::piped()) - .stderr(Stdio::piped()) - .stdout(Stdio::piped()) - .env_clear() - .env("LD_LIBRARY_PATH", &pg_lib_dir_path) - .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) - // NB: The redo process is not trusted after we sent it the first - // walredo work. Before that, it is trusted. Specifically, we trust - // it to - // 1. close all file descriptors except stdin, stdout, stderr because - // pageserver might not be 100% diligent in setting FD_CLOEXEC on all - // the files it opens, and - // 2. to use seccomp to sandbox itself before processing the first - // walredo request. - .spawn_no_leak_child(tenant_shard_id) - .context("spawn process")?; - WAL_REDO_PROCESS_COUNTERS.started.inc(); - let mut child = scopeguard::guard(child, |child| { - error!("killing wal-redo-postgres process due to a problem during launch"); - child.kill_and_wait(WalRedoKillCause::Startup); - }); - - let stdin = child.stdin.take().unwrap(); - let stdout = child.stdout.take().unwrap(); - let stderr = child.stderr.take().unwrap(); - let stderr = tokio::process::ChildStderr::from_std(stderr) - .context("convert to tokio::ChildStderr")?; - let stdin = - tokio::process::ChildStdin::from_std(stdin).context("convert to tokio::ChildStdin")?; - let stdout = tokio::process::ChildStdout::from_std(stdout) - .context("convert to tokio::ChildStdout")?; - - // all fallible operations post-spawn are complete, so get rid of the guard - let child = scopeguard::ScopeGuard::into_inner(child); - - tokio::spawn( - async move { - scopeguard::defer! 
{ - debug!("wal-redo-postgres stderr_logger_task finished"); - crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc(); - } - debug!("wal-redo-postgres stderr_logger_task started"); - crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc(); - - use tokio::io::AsyncBufReadExt; - let mut stderr_lines = tokio::io::BufReader::new(stderr); - let mut buf = Vec::new(); - let res = loop { - buf.clear(); - // TODO we don't trust the process to cap its stderr length. - // Currently it can do unbounded Vec allocation. - match stderr_lines.read_until(b'\n', &mut buf).await { - Ok(0) => break Ok(()), // eof - Ok(num_bytes) => { - let output = String::from_utf8_lossy(&buf[..num_bytes]); - error!(%output, "received output"); - } - Err(e) => { - break Err(e); - } - } - }; - match res { - Ok(()) => (), - Err(e) => { - error!(error=?e, "failed to read from walredo stderr"); - } - } - }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version)) - ); - - Ok(Self { - conf, - tenant_shard_id, - child: Some(child), - stdin: tokio::sync::Mutex::new(Poison::new( - "stdin", - ProcessInput { - stdin, - n_requests: 0, - }, - )), - stdout: tokio::sync::Mutex::new(Poison::new( - "stdout", - ProcessOutput { - stdout, - pending_responses: VecDeque::new(), - n_processed_responses: 0, - }, - )), - #[cfg(feature = "testing")] - dump_sequence: AtomicUsize::default(), - }) - } - - pub(crate) fn id(&self) -> u32 { - self.child - .as_ref() - .expect("must not call this during Drop") - .id() - } - - /// Apply given WAL records ('records') over an old page image. Returns - /// new page image. - /// - /// # Cancel-Safety - /// - /// Cancellation safe. - #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))] - pub(crate) async fn apply_wal_records( - &self, - rel: RelTag, - blknum: u32, - base_img: &Option, - records: &[(Lsn, NeonWalRecord)], - wal_redo_timeout: Duration, - ) -> anyhow::Result { - let tag = protocol::BufferTag { rel, blknum }; - - // Serialize all the messages to send the WAL redo process first. - // - // This could be problematic if there are millions of records to replay, - // but in practice the number of records is usually so small that it doesn't - // matter, and it's better to keep this code simple. - // - // Most requests start with a before-image with BLCKSZ bytes, followed by - // by some other WAL records. Start with a buffer that can hold that - // comfortably. 
- let mut writebuf: Vec = Vec::with_capacity((BLCKSZ as usize) * 3); - protocol::build_begin_redo_for_block_msg(tag, &mut writebuf); - if let Some(img) = base_img { - protocol::build_push_page_msg(tag, img, &mut writebuf); - } - for (lsn, rec) in records.iter() { - if let NeonWalRecord::Postgres { - will_init: _, - rec: postgres_rec, - } = rec - { - protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf); - } else { - anyhow::bail!("tried to pass neon wal record to postgres WAL redo"); - } - } - protocol::build_get_page_msg(tag, &mut writebuf); - WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64); - - let Ok(res) = - tokio::time::timeout(wal_redo_timeout, self.apply_wal_records0(&writebuf)).await - else { - anyhow::bail!("WAL redo timed out"); - }; - - if res.is_err() { - // not all of these can be caused by this particular input, however these are so rare - // in tests so capture all. - self.record_and_log(&writebuf); - } - - res - } - - /// # Cancel-Safety - /// - /// When not polled to completion (e.g. because in `tokio::select!` another - /// branch becomes ready before this future), concurrent and subsequent - /// calls may fail due to [`utils::poison::Poison::check_and_arm`] calls. - /// Dispose of this process instance and create a new one. - async fn apply_wal_records0(&self, writebuf: &[u8]) -> anyhow::Result { - let request_no = { - let mut lock_guard = self.stdin.lock().await; - let mut poison_guard = lock_guard.check_and_arm()?; - let input = poison_guard.data_mut(); - input - .stdin - .write_all(writebuf) - .await - .context("write to walredo stdin")?; - let request_no = input.n_requests; - input.n_requests += 1; - poison_guard.disarm(); - request_no - }; - - // To improve walredo performance we separate sending requests and receiving - // responses. Them are protected by different mutexes (output and input). - // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process - // then there is not warranty that T1 will first granted output mutex lock. - // To address this issue we maintain number of sent requests, number of processed - // responses and ring buffer with pending responses. After sending response - // (under input mutex), threads remembers request number. Then it releases - // input mutex, locks output mutex and fetch in ring buffer all responses until - // its stored request number. The it takes correspondent element from - // pending responses ring buffer and truncate all empty elements from the front, - // advancing processed responses number. - - let mut lock_guard = self.stdout.lock().await; - let mut poison_guard = lock_guard.check_and_arm()?; - let output = poison_guard.data_mut(); - let n_processed_responses = output.n_processed_responses; - while n_processed_responses + output.pending_responses.len() <= request_no { - // We expect the WAL redo process to respond with an 8k page image. We read it - // into this buffer. - let mut resultbuf = vec![0; BLCKSZ.into()]; - output - .stdout - .read_exact(&mut resultbuf) - .await - .context("read walredo stdout")?; - output - .pending_responses - .push_back(Some(Bytes::from(resultbuf))); - } - // Replace our request's response with None in `pending_responses`. - // Then make space in the ring buffer by clearing out any seqence of contiguous - // `None`'s from the front of `pending_responses`. 
- // NB: We can't pop_front() because other requests' responses because another - // requester might have grabbed the output mutex before us: - // T1: grab input mutex - // T1: send request_no 23 - // T1: release input mutex - // T2: grab input mutex - // T2: send request_no 24 - // T2: release input mutex - // T2: grab output mutex - // T2: n_processed_responses + output.pending_responses.len() <= request_no - // 23 0 24 - // T2: enters poll loop that reads stdout - // T2: put response for 23 into pending_responses - // T2: put response for 24 into pending_resposnes - // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back - // T2: takes its response_24 - // pending_responses now looks like this: Front Some(response_23) None Back - // T2: does the while loop below - // pending_responses now looks like this: Front Some(response_23) None Back - // T2: releases output mutex - // T1: grabs output mutex - // T1: n_processed_responses + output.pending_responses.len() > request_no - // 23 2 23 - // T1: skips poll loop that reads stdout - // T1: takes its response_23 - // pending_responses now looks like this: Front None None Back - // T2: does the while loop below - // pending_responses now looks like this: Front Back - // n_processed_responses now has value 25 - let res = output.pending_responses[request_no - n_processed_responses] - .take() - .expect("we own this request_no, nobody else is supposed to take it"); - while let Some(front) = output.pending_responses.front() { - if front.is_none() { - output.pending_responses.pop_front(); - output.n_processed_responses += 1; - } else { - break; - } - } - poison_guard.disarm(); - Ok(res) - } - - #[cfg(feature = "testing")] - fn record_and_log(&self, writebuf: &[u8]) { - use std::sync::atomic::Ordering; - - let millis = std::time::SystemTime::now() - .duration_since(std::time::SystemTime::UNIX_EPOCH) - .unwrap() - .as_millis(); - - let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed); - - // these files will be collected to an allure report - let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len()); - - let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename); - - use std::io::Write; - let res = std::fs::OpenOptions::new() - .write(true) - .create_new(true) - .read(true) - .open(path) - .and_then(|mut f| f.write_all(writebuf)); - - // trip up allowed_errors - if let Err(e) = res { - tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}"); - } else { - tracing::error!(filename, "erroring walredo input saved"); - } - } - - #[cfg(not(feature = "testing"))] - fn record_and_log(&self, _: &[u8]) {} -} - -impl Drop for WalRedoProcess { - fn drop(&mut self) { - self.child - .take() - .expect("we only do this once") - .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop); - // no way to wait for stderr_logger_task from Drop because that is async only - } -} From 91dd99038e8e85a29d028fe59e9abd1cc978e415 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 5 Jun 2024 21:22:54 +0100 Subject: [PATCH 0928/1571] pageserver/controller: enable tenant deletion without attachment (#7957) ## Problem As described in #7952, the controller's attempt to reconcile a tenant before finally deleting it can get hung up waiting for the compute notification hook to accept updates. 
The fact that we try and reconcile a tenant at all during deletion is part of a more general design issue (#5080), where deletion was implemented as an operation on attached tenant, requiring the tenant to be attached in order to delete it, which is not in principle necessary. Closes: #7952 ## Summary of changes - In the pageserver deletion API, only do the traditional deletion path if the tenant is attached. If it's secondary, then tear down the secondary location, and then do a remote delete. If it's not attached at all, just do the remote delete. - In the storage controller, instead of ensuring a tenant is attached before deletion, do a best-effort detach of the tenant, and then call into some arbitrary pageserver to issue a deletion of remote content. The pageserver retains its existing delete behavior when invoked on attached locations. We can remove this later when all users of the API are updated to either do a detach-before-delete. This will enable removing the "weird" code paths during startup that sometimes load a tenant and then immediately delete it, and removing the deletion markers on tenants. --- pageserver/src/http/openapi_spec.yml | 4 +- pageserver/src/http/routes.rs | 11 +- pageserver/src/tenant/mgr.rs | 86 +++++++++++-- storage_controller/src/http.rs | 109 +++++++++-------- storage_controller/src/service.rs | 113 ++++++++++-------- .../regress/test_storage_controller.py | 83 +++++++++++++ test_runner/regress/test_tenant_delete.py | 23 +++- 7 files changed, 312 insertions(+), 117 deletions(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index e5eafc51f4..71b486a4d3 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -81,8 +81,10 @@ paths: Attempts to delete specified tenant. 500, 503 and 409 errors should be retried until 404 is retrieved. 404 means that deletion successfully finished" responses: + "200": + description: Tenant was successfully deleted, or was already not found. "404": - description: Tenant not found. This is the success path. + description: Tenant not found. This is a success result, equivalent to 200. content: application/json: schema: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 6b6a131c88..7fa6c35ad6 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1073,7 +1073,7 @@ async fn tenant_delete_handler( let state = get_state(&request); - state + let status = state .tenant_manager .delete_tenant(tenant_shard_id, ACTIVE_TENANT_TIMEOUT) .instrument(info_span!("tenant_delete_handler", @@ -1082,7 +1082,14 @@ async fn tenant_delete_handler( )) .await?; - json_response(StatusCode::ACCEPTED, ()) + // Callers use 404 as success for deletions, for historical reasons. + if status == StatusCode::NOT_FOUND { + return Err(ApiError::NotFound( + anyhow::anyhow!("Deletion complete").into(), + )); + } + + json_response(status, ()) } /// HTTP endpoint to query the current tenant_size of a tenant. 
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 0bb1d750aa..4520bb9295 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -3,6 +3,7 @@ use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf}; use futures::StreamExt; +use hyper::StatusCode; use itertools::Itertools; use pageserver_api::key::Key; use pageserver_api::models::LocationConfigMode; @@ -54,6 +55,7 @@ use utils::generation::Generation; use utils::id::{TenantId, TimelineId}; use super::delete::DeleteTenantError; +use super::remote_timeline_client::remote_tenant_path; use super::secondary::SecondaryTenant; use super::timeline::detach_ancestor::PreparedTimelineDetach; use super::TenantSharedResources; @@ -1369,7 +1371,7 @@ impl TenantManager { &self, tenant_shard_id: TenantShardId, activation_timeout: Duration, - ) -> Result<(), DeleteTenantError> { + ) -> Result { super::span::debug_assert_current_span_has_tenant_id(); // We acquire a SlotGuard during this function to protect against concurrent // changes while the ::prepare phase of DeleteTenantFlow executes, but then @@ -1382,18 +1384,79 @@ impl TenantManager { // // See https://github.com/neondatabase/neon/issues/5080 - let slot_guard = - tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?; + // Tenant deletion can happen two ways: + // - Legacy: called on an attached location. The attached Tenant object stays alive in Stopping + // state until deletion is complete. + // - New: called on a pageserver without an attached location. We proceed with deletion from + // remote storage. + // + // See https://github.com/neondatabase/neon/issues/5080 for more context on this transition. - // unwrap is safe because we used MustExist mode when acquiring - let tenant = match slot_guard.get_old_value().as_ref().unwrap() { - TenantSlot::Attached(tenant) => tenant.clone(), - _ => { - // Express "not attached" as equivalent to "not found" - return Err(DeleteTenantError::NotAttached); + let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?; + match &slot_guard.old_value { + Some(TenantSlot::Attached(tenant)) => { + // Legacy deletion flow: the tenant remains attached, goes to Stopping state, and + // deletion will be resumed across restarts. 
+ let tenant = tenant.clone(); + return self + .delete_tenant_attached(slot_guard, tenant, activation_timeout) + .await; } + Some(TenantSlot::Secondary(secondary_tenant)) => { + secondary_tenant.shutdown().await; + let local_tenant_directory = self.conf.tenant_path(&tenant_shard_id); + let tmp_dir = safe_rename_tenant_dir(&local_tenant_directory) + .await + .with_context(|| { + format!("local tenant directory {local_tenant_directory:?} rename") + })?; + spawn_background_purge(tmp_dir); + } + Some(TenantSlot::InProgress(_)) => unreachable!(), + None => {} }; + // Fall through: local state for this tenant is no longer present, proceed with remote delete + let remote_path = remote_tenant_path(&tenant_shard_id); + let keys = match self + .resources + .remote_storage + .list( + Some(&remote_path), + remote_storage::ListingMode::NoDelimiter, + None, + &self.cancel, + ) + .await + { + Ok(listing) => listing.keys, + Err(remote_storage::DownloadError::Cancelled) => { + return Err(DeleteTenantError::Cancelled) + } + Err(remote_storage::DownloadError::NotFound) => return Ok(StatusCode::NOT_FOUND), + Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))), + }; + + if keys.is_empty() { + tracing::info!("Remote storage already deleted"); + } else { + tracing::info!("Deleting {} keys from remote storage", keys.len()); + self.resources + .remote_storage + .delete_objects(&keys, &self.cancel) + .await?; + } + + // Callers use 404 as success for deletions, for historical reasons. + Ok(StatusCode::NOT_FOUND) + } + + async fn delete_tenant_attached( + &self, + slot_guard: SlotGuard, + tenant: Arc, + activation_timeout: Duration, + ) -> Result { match tenant.current_state() { TenantState::Broken { .. } | TenantState::Stopping { .. } => { // If deletion is already in progress, return success (the semantics of this @@ -1403,7 +1466,7 @@ impl TenantManager { // The `delete_progress` lock is held: deletion is already happening // in the bacckground slot_guard.revert(); - return Ok(()); + return Ok(StatusCode::ACCEPTED); } } _ => { @@ -1436,7 +1499,8 @@ impl TenantManager { // The Tenant goes back into the map in Stopping state, it will eventually be removed by DeleteTenantFLow slot_guard.revert(); - result + let () = result?; + Ok(StatusCode::ACCEPTED) } #[instrument(skip_all, fields(tenant_id=%tenant.get_tenant_shard_id().tenant_id, shard_id=%tenant.get_tenant_shard_id().shard_slug(), new_shard_count=%new_shard_count.literal()))] diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 604ad6fbaa..bbb6d2cb32 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -142,52 +142,6 @@ async fn handle_tenant_create( ) } -// For tenant and timeline deletions, which both implement an "initially return 202, then 404 once -// we're done" semantic, we wrap with a retry loop to expose a simpler API upstream. This avoids -// needing to track a "deleting" state for tenants. -async fn deletion_wrapper(service: Arc, f: F) -> Result, ApiError> -where - R: std::future::Future> + Send + 'static, - F: Fn(Arc) -> R + Send + Sync + 'static, -{ - let started_at = Instant::now(); - // To keep deletion reasonably snappy for small tenants, initially check after 1 second if deletion - // completed. - let mut retry_period = Duration::from_secs(1); - // On subsequent retries, wait longer. 
- let max_retry_period = Duration::from_secs(5); - // Enable callers with a 30 second request timeout to reliably get a response - let max_wait = Duration::from_secs(25); - - loop { - let status = f(service.clone()).await?; - match status { - StatusCode::ACCEPTED => { - tracing::info!("Deletion accepted, waiting to try again..."); - tokio::time::sleep(retry_period).await; - retry_period = max_retry_period; - } - StatusCode::NOT_FOUND => { - tracing::info!("Deletion complete"); - return json_response(StatusCode::OK, ()); - } - _ => { - tracing::warn!("Unexpected status {status}"); - return json_response(status, ()); - } - } - - let now = Instant::now(); - if now + retry_period > started_at + max_wait { - tracing::info!("Deletion timed out waiting for 404"); - // REQUEST_TIMEOUT would be more appropriate, but CONFLICT is already part of - // the pageserver's swagger definition for this endpoint, and has the same desired - // effect of causing the control plane to retry later. - return json_response(StatusCode::CONFLICT, ()); - } - } -} - async fn handle_tenant_location_config( service: Arc, mut req: Request, @@ -283,13 +237,17 @@ async fn handle_tenant_delete( let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; - deletion_wrapper(service, move |service| async move { - service - .tenant_delete(tenant_id) - .await - .and_then(map_reqwest_hyper_status) - }) - .await + let status_code = service + .tenant_delete(tenant_id) + .await + .and_then(map_reqwest_hyper_status)?; + + if status_code == StatusCode::NOT_FOUND { + // The pageserver uses 404 for successful deletion, but we use 200 + json_response(StatusCode::OK, ()) + } else { + json_response(status_code, ()) + } } async fn handle_tenant_timeline_create( @@ -317,6 +275,51 @@ async fn handle_tenant_timeline_delete( let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + // For timeline deletions, which both implement an "initially return 202, then 404 once + // we're done" semantic, we wrap with a retry loop to expose a simpler API upstream. + async fn deletion_wrapper(service: Arc, f: F) -> Result, ApiError> + where + R: std::future::Future> + Send + 'static, + F: Fn(Arc) -> R + Send + Sync + 'static, + { + let started_at = Instant::now(); + // To keep deletion reasonably snappy for small tenants, initially check after 1 second if deletion + // completed. + let mut retry_period = Duration::from_secs(1); + // On subsequent retries, wait longer. + let max_retry_period = Duration::from_secs(5); + // Enable callers with a 30 second request timeout to reliably get a response + let max_wait = Duration::from_secs(25); + + loop { + let status = f(service.clone()).await?; + match status { + StatusCode::ACCEPTED => { + tracing::info!("Deletion accepted, waiting to try again..."); + tokio::time::sleep(retry_period).await; + retry_period = max_retry_period; + } + StatusCode::NOT_FOUND => { + tracing::info!("Deletion complete"); + return json_response(StatusCode::OK, ()); + } + _ => { + tracing::warn!("Unexpected status {status}"); + return json_response(status, ()); + } + } + + let now = Instant::now(); + if now + retry_period > started_at + max_wait { + tracing::info!("Deletion timed out waiting for 404"); + // REQUEST_TIMEOUT would be more appropriate, but CONFLICT is already part of + // the pageserver's swagger definition for this endpoint, and has the same desired + // effect of causing the control plane to retry later. 
+ return json_response(StatusCode::CONFLICT, ()); + } + } + } + deletion_wrapper(service, move |service| async move { service .tenant_timeline_delete(tenant_id, timeline_id) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index f914f4e0bb..756dc10a2a 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -2376,61 +2376,80 @@ impl Service { let _tenant_lock = trace_exclusive_lock(&self.tenant_op_locks, tenant_id, TenantOperations::Delete).await; - self.ensure_attached_wait(tenant_id).await?; - - // TODO: refactor into helper - let targets = { - let locked = self.inner.read().unwrap(); - let mut targets = Vec::new(); - + // Detach all shards + let (detach_waiters, shard_ids, node) = { + let mut shard_ids = Vec::new(); + let mut detach_waiters = Vec::new(); + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); for (tenant_shard_id, shard) in - locked.tenants.range(TenantShardId::tenant_range(tenant_id)) + tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { - let node_id = shard.intent.get_attached().ok_or_else(|| { - ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled")) - })?; - let node = locked - .nodes - .get(&node_id) - .expect("Pageservers may not be deleted while referenced"); + shard_ids.push(*tenant_shard_id); - targets.push((*tenant_shard_id, node.clone())); + // Update the tenant's intent to remove all attachments + shard.policy = PlacementPolicy::Detached; + shard + .schedule(scheduler, &mut ScheduleContext::default()) + .expect("De-scheduling is infallible"); + debug_assert!(shard.intent.get_attached().is_none()); + debug_assert!(shard.intent.get_secondary().is_empty()); + + if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) { + detach_waiters.push(waiter); + } } - targets + + // Pick an arbitrary node to use for remote deletions (does not have to be where the tenant + // was attached, just has to be able to see the S3 content) + let node_id = scheduler.schedule_shard(&[], &ScheduleContext::default())?; + let node = nodes + .get(&node_id) + .expect("Pageservers may not be deleted while lock is active"); + (detach_waiters, shard_ids, node.clone()) }; - // Phase 1: delete on the pageservers - let mut any_pending = false; - for (tenant_shard_id, node) in targets { - let client = PageserverClient::new( - node.get_id(), - node.base_url(), - self.config.jwt_token.as_deref(), - ); - // TODO: this, like many other places, requires proper retry handling for 503, timeout: those should not - // surface immediately as an error to our caller. - let status = client.tenant_delete(tenant_shard_id).await.map_err(|e| { - ApiError::InternalServerError(anyhow::anyhow!( - "Error deleting shard {tenant_shard_id} on node {node}: {e}", - )) - })?; - tracing::info!( - "Shard {tenant_shard_id} on node {node}, delete returned {}", - status - ); - if status == StatusCode::ACCEPTED { - any_pending = true; - } + if let Err(e) = self.await_waiters(detach_waiters, RECONCILE_TIMEOUT).await { + // Failing to detach shouldn't hold up deletion, e.g. if a node is offline we should be able + // to use some other node to run the remote deletion. + tracing::warn!("Failed to detach some locations: {e}"); } - if any_pending { - // Caller should call us again later. When we eventually see 404s from - // all the shards, we may proceed to delete our records of the tenant. 
- tracing::info!( - "Tenant {} has some shards pending deletion, returning 202", - tenant_id - ); - return Ok(StatusCode::ACCEPTED); + let locations = shard_ids + .into_iter() + .map(|s| (s, node.clone())) + .collect::>(); + let results = self.tenant_for_shards_api( + locations, + |tenant_shard_id, client| async move { client.tenant_delete(tenant_shard_id).await }, + 1, + 3, + RECONCILE_TIMEOUT, + &self.cancel, + ) + .await; + for result in results { + match result { + Ok(StatusCode::ACCEPTED) => { + // This could happen if we failed detach above, and hit a pageserver where the tenant + // is still attached: it will accept the deletion in the background + tracing::warn!( + "Unexpectedly still attached on {}, client should retry", + node + ); + return Ok(StatusCode::ACCEPTED); + } + Ok(_) => {} + Err(mgmt_api::Error::Cancelled) => { + return Err(ApiError::ShuttingDown); + } + Err(e) => { + // This is unexpected: remote deletion should be infallible, unless the object store + // at large is unavailable. + tracing::error!("Error deleting via node {}: {e}", node); + return Err(ApiError::InternalServerError(anyhow::anyhow!(e))); + } + } } // Fall through: deletion of the tenant on pageservers is complete, we may proceed to drop diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 3a9a522f3f..2031feaa83 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -7,6 +7,7 @@ from typing import Any, Dict, List, Union import pytest from fixtures.common_types import TenantId, TenantShardId, TimelineId +from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, @@ -18,6 +19,8 @@ from fixtures.neon_fixtures import ( from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import ( MANY_SMALL_LAYERS_TENANT_CONFIG, + assert_prefix_empty, + assert_prefix_not_empty, enable_remote_storage_versioning, list_prefix, remote_storage_delete_key, @@ -839,6 +842,86 @@ def test_storage_controller_tenant_conf(neon_env_builder: NeonEnvBuilder): env.storage_controller.consistency_check() +def test_storage_controller_tenant_deletion( + neon_env_builder: NeonEnvBuilder, + compute_reconfigure_listener: ComputeReconfigure, +): + """ + Validate that: + - Deleting a tenant deletes all its shards + - Deletion does not require the compute notification hook to be responsive + - Deleting a tenant also removes all secondary locations + """ + neon_env_builder.num_pageservers = 4 + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.control_plane_compute_hook_api = ( + compute_reconfigure_listener.control_plane_compute_hook_api + ) + + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant( + tenant_id, timeline_id, shard_count=2, placement_policy='{"Attached":1}' + ) + + # Ensure all the locations are configured, including secondaries + env.storage_controller.reconcile_until_idle() + + shard_ids = [ + TenantShardId.parse(shard["shard_id"]) for shard in env.storage_controller.locate(tenant_id) + ] + + # Assert attachments all have local content + for shard_id in shard_ids: + pageserver = env.get_tenant_pageserver(shard_id) + assert pageserver.tenant_dir(shard_id).exists() + + # Assert all shards have some content in remote storage + for shard_id in shard_ids: + 
assert_prefix_not_empty( + neon_env_builder.pageserver_remote_storage, + prefix="/".join( + ( + "tenants", + str(shard_id), + ) + ), + ) + + # Break the compute hook: we are checking that deletion does not depend on the compute hook being available + def break_hook(): + raise RuntimeError("Unexpected call to compute hook") + + compute_reconfigure_listener.register_on_notify(break_hook) + + # No retry loop: deletion should complete in one shot without polling for 202 responses, because + # it cleanly detaches all the shards first, and then deletes them in remote storage + env.storage_controller.pageserver_api().tenant_delete(tenant_id) + + # Assert no pageservers have any local content + for pageserver in env.pageservers: + for shard_id in shard_ids: + assert not pageserver.tenant_dir(shard_id).exists() + + for shard_id in shard_ids: + assert_prefix_empty( + neon_env_builder.pageserver_remote_storage, + prefix="/".join( + ( + "tenants", + str(shard_id), + ) + ), + ) + + # Assert the tenant is not visible in storage controller API + with pytest.raises(StorageControllerApiException): + env.storage_controller.tenant_describe(tenant_id) + + class Failure: pageserver_id: int diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index e120aa1a7c..fa7cead1bd 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -54,9 +54,26 @@ def test_tenant_delete_smoke( # first try to delete non existing tenant tenant_id = TenantId.generate() - env.pageserver.allowed_errors.append(f".*NotFound: tenant {tenant_id}.*") - with pytest.raises(PageserverApiException, match=f"NotFound: tenant {tenant_id}"): - ps_http.tenant_delete(tenant_id=tenant_id) + env.pageserver.allowed_errors.append(".*NotFound.*") + env.pageserver.allowed_errors.append(".*simulated failure.*") + + # Check that deleting a non-existent tenant gives the expected result: this is a loop because we + # may need to retry on some remote storage errors injected by the test harness + while True: + try: + ps_http.tenant_delete(tenant_id=tenant_id) + except PageserverApiException as e: + if e.status_code == 500: + # This test uses failure injection, which can produce 500s as the pageserver expects + # the object store to always be available, and the ListObjects during deletion is generally + # an infallible operation + assert "simulated failure of remote operation" in e.message + elif e.status_code == 404: + # This is our expected result: trying to erase a non-existent tenant gives us 404 + assert "NotFound" in e.message + break + else: + raise env.neon_cli.create_tenant( tenant_id=tenant_id, From 0a65333fff72a6781b005432737261b54a6756be Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 6 Jun 2024 15:10:16 +0200 Subject: [PATCH 0929/1571] chore(walredo): avoid duplicate tenant_id and shard_slug fields (#7977) spotted during reviews of async walredo work in #6628 --- pageserver/src/walredo/process.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pageserver/src/walredo/process.rs b/pageserver/src/walredo/process.rs index 5b0af334ee..9140d4f6aa 100644 --- a/pageserver/src/walredo/process.rs +++ b/pageserver/src/walredo/process.rs @@ -6,6 +6,7 @@ use self::no_leak_child::NoLeakChild; use crate::{ config::PageServerConf, metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER}, + span::debug_assert_current_span_has_tenant_id, walrecord::NeonWalRecord, }; use anyhow::Context; @@ -26,6 +27,7 @@ use 
utils::{lsn::Lsn, poison::Poison}; pub struct WalRedoProcess { #[allow(dead_code)] conf: &'static PageServerConf, + #[cfg(feature = "testing")] tenant_shard_id: TenantShardId, // Some() on construction, only becomes None on Drop. child: Option, @@ -143,6 +145,7 @@ impl WalRedoProcess { Ok(Self { conf, + #[cfg(feature = "testing")] tenant_shard_id, child: Some(child), stdin: tokio::sync::Mutex::new(Poison::new( @@ -178,7 +181,7 @@ impl WalRedoProcess { /// # Cancel-Safety /// /// Cancellation safe. - #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))] + #[instrument(skip_all, fields(pid=%self.id()))] pub(crate) async fn apply_wal_records( &self, rel: RelTag, @@ -187,6 +190,8 @@ impl WalRedoProcess { records: &[(Lsn, NeonWalRecord)], wal_redo_timeout: Duration, ) -> anyhow::Result { + debug_assert_current_span_has_tenant_id(); + let tag = protocol::BufferTag { rel, blknum }; // Serialize all the messages to send the WAL redo process first. From 630cfbe4206b1e3f5439cde96b048f345dce3266 Mon Sep 17 00:00:00 2001 From: Yuchen Liang <70461588+yliang412@users.noreply.github.com> Date: Thu, 6 Jun 2024 10:00:14 -0400 Subject: [PATCH 0930/1571] refactor(pageserver): designated api error type for cancelled request (#7949) Closes #7406. ## Problem When a `get_lsn_by_timestamp` request is cancelled, an anyhow error is exposed to handle that case, which verbosely logs the error. However, we don't benefit from having the full backtrace provided by anyhow in this case. ## Summary of changes This PR introduces a new `ApiError` type to handle errors caused by cancelled request more robustly. - A new enum variant `ApiError::Cancelled` - Currently the cancelled request is mapped to status code 500. - Need to handle this error in proxy's `http_util` as well. - Added a failpoint test to simulate cancelled `get_lsn_by_timestamp` request. 
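
For illustration, a minimal self-contained sketch of how the pieces listed above fit together (type names and the search loop are simplified stand-ins, not the real pageserver code; the actual definitions are in the `libs/utils/src/http/error.rs`, `pageserver/src/http/routes.rs`, and `pageserver/src/pgdatadir_mapping.rs` hunks below):

```rust
// Hedged sketch: stand-ins for the real pageserver/utils types.
use tokio_util::sync::CancellationToken;

#[derive(Debug)]
pub enum PageReconstructError {
    Cancelled,
}

#[derive(Debug)]
pub enum ApiError {
    Cancelled,
}

impl From<PageReconstructError> for ApiError {
    fn from(e: PageReconstructError) -> Self {
        match e {
            // Before this change the cancelled case became
            // ApiError::InternalServerError(anyhow::anyhow!("request was cancelled")),
            // which carried a full anyhow backtrace into the logs. It now maps
            // to a dedicated variant that is logged at info level and still
            // returned to the client as a plain 500.
            PageReconstructError::Cancelled => ApiError::Cancelled,
        }
    }
}

/// Simplified stand-in for the timestamp->LSN search: a long-running loop that
/// checks the request's cancellation token on every iteration and bails out
/// promptly when the HTTP request is dropped.
pub async fn find_lsn_for_timestamp(
    cancel: &CancellationToken,
) -> Result<u64, PageReconstructError> {
    let mut lsn = 0u64;
    for candidate in 0..1024u64 {
        if cancel.is_cancelled() {
            return Err(PageReconstructError::Cancelled);
        }
        lsn = candidate; // placeholder for the real binary-search probe
    }
    Ok(lsn)
}
```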
Signed-off-by: Yuchen Liang --- libs/utils/src/http/error.rs | 8 ++++ pageserver/src/http/routes.rs | 4 +- pageserver/src/pgdatadir_mapping.rs | 4 ++ proxy/src/serverless/http_util.rs | 4 ++ test_runner/fixtures/pageserver/http.py | 2 + test_runner/regress/test_lsn_mapping.py | 51 ++++++++++++++++++++++++- 6 files changed, 69 insertions(+), 4 deletions(-) diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs index d55823b0b7..3d863a6518 100644 --- a/libs/utils/src/http/error.rs +++ b/libs/utils/src/http/error.rs @@ -34,6 +34,9 @@ pub enum ApiError { #[error("Timeout")] Timeout(Cow<'static, str>), + #[error("Request cancelled")] + Cancelled, + #[error(transparent)] InternalServerError(anyhow::Error), } @@ -74,6 +77,10 @@ impl ApiError { err.to_string(), StatusCode::REQUEST_TIMEOUT, ), + ApiError::Cancelled => HttpErrorBody::response_from_msg_and_status( + self.to_string(), + StatusCode::INTERNAL_SERVER_ERROR, + ), ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status( err.to_string(), StatusCode::INTERNAL_SERVER_ERROR, @@ -133,6 +140,7 @@ pub fn api_error_handler(api_error: ApiError) -> Response { ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"), ApiError::ShuttingDown => info!("Shut down while processing HTTP request"), ApiError::Timeout(_) => info!("Timeout while processing HTTP request: {api_error:#}"), + ApiError::Cancelled => info!("Request cancelled while processing HTTP request"), _ => info!("Error processing HTTP request: {api_error:#}"), } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 7fa6c35ad6..19bc88fbc7 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -181,9 +181,7 @@ impl From for ApiError { PageReconstructError::MissingKey(e) => { ApiError::InternalServerError(anyhow::anyhow!("{e}")) } - PageReconstructError::Cancelled => { - ApiError::InternalServerError(anyhow::anyhow!("request was cancelled")) - } + PageReconstructError::Cancelled => ApiError::Cancelled, PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()), PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre), } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 0bff4be150..336d1c3fb8 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -36,6 +36,7 @@ use strum::IntoEnumIterator; use tokio_util::sync::CancellationToken; use tracing::{debug, info, trace, warn}; use utils::bin_ser::DeserializeError; +use utils::pausable_failpoint; use utils::vec_map::{VecMap, VecMapOrdering}; use utils::{bin_ser::BeSer, lsn::Lsn}; @@ -409,6 +410,8 @@ impl Timeline { cancel: &CancellationToken, ctx: &RequestContext, ) -> Result { + pausable_failpoint!("find-lsn-for-timestamp-pausable"); + let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn(); // We use this method to figure out the branching LSN for the new branch, but the // GC cutoff could be before the branching point and we cannot create a new branch @@ -424,6 +427,7 @@ impl Timeline { let mut found_smaller = false; let mut found_larger = false; + while low < high { if cancel.is_cancelled() { return Err(PageReconstructError::Cancelled); diff --git a/proxy/src/serverless/http_util.rs b/proxy/src/serverless/http_util.rs index ab9127b13e..701ab58f63 100644 --- a/proxy/src/serverless/http_util.rs +++ b/proxy/src/serverless/http_util.rs @@ -45,6 +45,10 @@ pub fn api_error_into_response(this: 
ApiError) -> Response> { err.to_string(), StatusCode::REQUEST_TIMEOUT, ), + ApiError::Cancelled => HttpErrorBody::response_from_msg_and_status( + this.to_string(), + StatusCode::INTERNAL_SERVER_ERROR, + ), ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status( err.to_string(), StatusCode::INTERNAL_SERVER_ERROR, diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index f1f96f6d5f..08bf66058a 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -630,12 +630,14 @@ class PageserverHttpClient(requests.Session, MetricsGetter): tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, timestamp: datetime, + **kwargs, ): log.info( f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}" ) res = self.get( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp.isoformat()}Z", + **kwargs, ) self.verbose_error(res) res_json = res.json() diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index 83d52d4c4c..263730a823 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -1,12 +1,15 @@ import re import time +from concurrent.futures import ThreadPoolExecutor from datetime import datetime, timedelta, timezone +import pytest from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_flush_lsn from fixtures.pageserver.http import PageserverApiException -from fixtures.utils import query_scalar +from fixtures.utils import query_scalar, wait_until +from requests.exceptions import ReadTimeout # @@ -108,6 +111,52 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): assert Lsn(result["lsn"]) >= last_flush_lsn +def test_get_lsn_by_timestamp_cancelled(neon_env_builder: NeonEnvBuilder): + """ + Test if cancelled pageserver get_lsn_by_timestamp request is correctly handled. + Added as an effort to improve error handling and avoid full anyhow backtrace. 
+ """ + + env = neon_env_builder.init_start() + env.pageserver.allowed_errors.extend( + [ + ".*request was dropped before completing.*", + ".*Cancelled request finished with an error: Cancelled", + ] + ) + + client = env.pageserver.http_client() + failpoint = "find-lsn-for-timestamp-pausable" + client.configure_failpoints((failpoint, "pause")) + + with ThreadPoolExecutor(max_workers=1) as exec: + # Request get_lsn_by_timestamp, hit the pausable failpoint + failing = exec.submit( + client.timeline_get_lsn_by_timestamp, + env.initial_tenant, + env.initial_timeline, + datetime.now(), + timeout=2, + ) + + _, offset = wait_until( + 20, 0.5, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") + ) + + with pytest.raises(ReadTimeout): + failing.result() + + client.configure_failpoints((failpoint, "off")) + + _, offset = wait_until( + 20, + 0.5, + lambda: env.pageserver.assert_log_contains( + "Cancelled request finished with an error: Cancelled$", offset + ), + ) + + # Test pageserver get_timestamp_of_lsn API def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder): key_not_found_error = r".*could not find data for key.*" From a8be07785ebf388ba51a2084e6add16f1f269056 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 6 Jun 2024 17:20:54 +0300 Subject: [PATCH 0931/1571] fix: do TimelineMetrics::shutdown only once (#7983) Related to #7341 tenant deletion will end up shutting down timelines twice, once before actually starting and the second time when per timeline deletion is requested. Shutting down TimelineMetrics causes underflows. Add an atomic boolean and only do the shutdown once. --- pageserver/src/metrics.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 4f2c75d308..e8a1e063c5 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -2108,6 +2108,7 @@ pub(crate) struct TimelineMetrics { pub directory_entries_count_gauge: Lazy UIntGauge>>, pub evictions: IntCounter, pub evictions_with_low_residence_duration: std::sync::RwLock, + shutdown: std::sync::atomic::AtomicBool, } impl TimelineMetrics { @@ -2227,6 +2228,7 @@ impl TimelineMetrics { evictions_with_low_residence_duration: std::sync::RwLock::new( evictions_with_low_residence_duration, ), + shutdown: std::sync::atomic::AtomicBool::default(), } } @@ -2249,6 +2251,17 @@ impl TimelineMetrics { } pub(crate) fn shutdown(&self) { + let was_shutdown = self + .shutdown + .swap(true, std::sync::atomic::Ordering::Relaxed); + + if was_shutdown { + // this happens on tenant deletion because tenant first shuts down timelines, then + // invokes timeline deletion which first shuts down the timeline again. + // TODO: this can be removed once https://github.com/neondatabase/neon/issues/5080 + return; + } + let tenant_id = &self.tenant_id; let timeline_id = &self.timeline_id; let shard_id = &self.shard_id; From 75bca9bb19b9943db7358c06db5523337eb9e239 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 6 Jun 2024 16:21:27 +0200 Subject: [PATCH 0932/1571] Perform retries on azure bulk deletion (#7964) This adds retries to the bulk deletion, because if there is a certain chance n that a request fails, the chance that at least one of the requests in a chain of requests fails increases exponentially. We've had similar issues with the S3 DR tests, which in the end yielded in adding retries at the remote_storage level. 
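
To make the compounding effect concrete (illustrative numbers, not measured failure rates): if each individual delete request fails independently with probability p, then a bulk deletion of N objects with no per-request retries fails with probability 1 - (1 - p)^N. For p = 1% that is already about 63% at N = 100 and about 99.996% at N = 1000, whereas retrying each object up to k additional times drives its individual failure probability down to roughly p^(k+1), i.e. around 10^-12 for k = 5.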
Retries at the top level are not sufficient when one remote_storage "operation" is multiple network requests in a trench coat, especially when there is no notion of saving the progress: even if prior deletions had been successful, we'd still need to get a 404 in order to continue the loop and get to the point where we failed in the last iteration. Maybe we'll fail again but before we've even reached it. Retries at the bottom level avoid this issue because they have the notion of progress and also when one network operation fails, only that operation is retried. First part of #7931. --- libs/remote_storage/src/azure_blob.rs | 70 ++++++++++++++++++++------- 1 file changed, 52 insertions(+), 18 deletions(-) diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index aca22c6b3e..2aa05a9d30 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -3,6 +3,7 @@ use std::borrow::Cow; use std::collections::HashMap; use std::env; +use std::fmt::Display; use std::io; use std::num::NonZeroU32; use std::pin::Pin; @@ -29,6 +30,7 @@ use http_types::{StatusCode, Url}; use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; use tracing::debug; +use utils::backoff; use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind}; use crate::{ @@ -451,26 +453,58 @@ impl RemoteStorage for AzureBlobStorage { // TODO batch requests are not supported by the SDK // https://github.com/Azure/azure-sdk-for-rust/issues/1068 for path in paths { - let blob_client = self.client.blob_client(self.relative_path_to_name(path)); - - let request = blob_client.delete().into_future(); - - let res = tokio::time::timeout(self.timeout, request).await; - - match res { - Ok(Ok(_response)) => continue, - Ok(Err(e)) => { - if let Some(http_err) = e.as_http_error() { - if http_err.status() == StatusCode::NotFound { - continue; - } - } - return Err(e.into()); - } - Err(_elapsed) => return Err(TimeoutOrCancel::Timeout.into()), + #[derive(Debug)] + enum AzureOrTimeout { + AzureError(azure_core::Error), + Timeout, + Cancel, } - } + impl Display for AzureOrTimeout { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{self:?}") + } + } + let warn_threshold = 3; + let max_retries = 5; + backoff::retry( + || async { + let blob_client = self.client.blob_client(self.relative_path_to_name(path)); + let request = blob_client.delete().into_future(); + + let res = tokio::time::timeout(self.timeout, request).await; + + match res { + Ok(Ok(_v)) => Ok(()), + Ok(Err(azure_err)) => { + if let Some(http_err) = azure_err.as_http_error() { + if http_err.status() == StatusCode::NotFound { + return Ok(()); + } + } + Err(AzureOrTimeout::AzureError(azure_err)) + } + Err(_elapsed) => Err(AzureOrTimeout::Timeout), + } + }, + |err| match err { + AzureOrTimeout::AzureError(_) | AzureOrTimeout::Timeout => false, + AzureOrTimeout::Cancel => true, + }, + warn_threshold, + max_retries, + "deleting remote object", + cancel, + ) + .await + .ok_or_else(|| AzureOrTimeout::Cancel) + .and_then(|x| x) + .map_err(|e| match e { + AzureOrTimeout::AzureError(err) => anyhow::Error::from(err), + AzureOrTimeout::Timeout => TimeoutOrCancel::Timeout.into(), + AzureOrTimeout::Cancel => TimeoutOrCancel::Cancel.into(), + })?; + } Ok(()) }; From 014509987dbc714f3a80459c5c30fd70a0a1f517 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Thu, 6 Jun 2024 10:40:58 -0400 Subject: [PATCH 0933/1571] fix(pageserver): more flexible layer size test (#7945) M-series 
macOS has different alignments/size for some fields (which I did not investigate in detail) and therefore this test cannot pass on macOS. Fixed by using `<=` for the comparison so that we do not test for an exact match. observed by @yliang412 Signed-off-by: Alex Chi Z --- pageserver/src/tenant/storage_layer/layer/tests.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index fa9142d5e9..3a7aca7a6c 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -815,6 +815,7 @@ async fn eviction_cancellation_on_drop() { /// A test case to remind you the cost of these structures. You can bump the size limit /// below if it is really necessary to add more fields to the structures. #[test] +#[cfg(target_arch = "x86_64")] fn layer_size() { assert_eq!(std::mem::size_of::(), 2040); assert_eq!(std::mem::size_of::(), 104); From 5d05013857fd8a4b868b796ff9514dd773bca2bb Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Thu, 6 Jun 2024 11:34:44 -0400 Subject: [PATCH 0934/1571] fix(pageserver): skip metadata compaction is LSN is not accumulated enough (#7962) close https://github.com/neondatabase/neon/issues/7937 Only trigger metadata image layer creation if enough delta layers are accumulated. Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 4c46c4e635..59480ba141 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4446,6 +4446,12 @@ impl Timeline { if mode == ImageLayerCreationMode::Initial { return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers"))); } + if mode == ImageLayerCreationMode::Try && !check_for_image_layers { + // Skip compaction if there are not enough updates. Metadata compaction will do a scan and + // might mess up with evictions. + start = img_range.end; + continue; + } } else if let ImageLayerCreationMode::Try = mode { // check_for_image_layers = false -> skip // check_for_image_layers = true -> check time_for_new_image_layer -> skip/generate From d46d19456d4a089dad16552996d211ccd818dab1 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 6 Jun 2024 20:18:39 +0300 Subject: [PATCH 0935/1571] raise the warning for oversized L0 to 2*target (#7985) currently we warn even by going over a single byte. even that will be hit much more rarely once #7927 lands, but get this in earlier. rationale for 2*checkpoint_distance: anything smaller is not really worth a warn. we have an global allowed_error for this warning, which still cannot be removed nor can it be removed with #7927 because of many tests with very small `checkpoint_distance`. 
--- pageserver/src/tenant/timeline.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 59480ba141..32cf7be0f7 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -5678,7 +5678,7 @@ impl<'a> TimelineWriter<'a> { self.tl.flush_frozen_layers(); let current_size = self.write_guard.as_ref().unwrap().current_size; - if current_size > self.get_checkpoint_distance() { + if current_size >= self.get_checkpoint_distance() * 2 { warn!("Flushed oversized open layer with size {}", current_size) } From e4e444f59fc1ba2c5241acaabd143d2d3cabfb31 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 6 Jun 2024 18:54:44 +0100 Subject: [PATCH 0936/1571] Remove random sleep in partial backup (#7982) We had a random sleep in the beginning of partial backup task, which was needed for the first partial backup deploy. It helped with gradual upload of segments without causing network overload. Now partial backup is deployed everywhere, so we don't need this random sleep anymore. We also had an issue related to this, in which manager task was not shut down for a long time. The cause of the issue is this random sleep that didn't take timeline cancellation into account, meanwhile manager task waited for partial backup to complete. Fixes https://github.com/neondatabase/neon/issues/7967 --- safekeeper/src/timeline_manager.rs | 3 +++ safekeeper/src/wal_backup_partial.rs | 8 -------- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index 7174d843fc..087b988c69 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -213,6 +213,9 @@ pub async fn main_task( } }; + // remove timeline from the broker active set sooner, before waiting for background tasks + tli_broker_active.set(false); + // shutdown background tasks if conf.is_wal_backup_enabled() { wal_backup::update_task(&conf, &tli, false, &last_state, &mut backup_task).await; diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index 6c0f35095b..ed5ddb71f5 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -20,7 +20,6 @@ use camino::Utf8PathBuf; use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; -use rand::Rng; use remote_storage::RemotePath; use serde::{Deserialize, Serialize}; @@ -276,13 +275,6 @@ pub async fn main_task(tli: FullAccessTimeline, conf: SafeKeeperConf) { debug!("started"); let await_duration = conf.partial_backup_timeout; - // sleep for random time to avoid thundering herd - { - let randf64 = rand::thread_rng().gen_range(0.0..1.0); - let sleep_duration = await_duration.mul_f64(randf64); - tokio::time::sleep(sleep_duration).await; - } - let (_, persistent_state) = tli.get_state().await; let mut commit_lsn_rx = tli.get_commit_lsn_watch_rx(); let mut flush_lsn_rx = tli.get_term_flush_lsn_watch_rx(); From 66c6b270f1a2a0bca51349db35e876ce7e135de4 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 6 Jun 2024 20:11:38 +0100 Subject: [PATCH 0937/1571] Downgrade No response from reading prefetch entry WARNING to LOG --- pgxn/neon/pagestore_smgr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 0e4d210be8..6305f2ec92 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -605,7 +605,7 @@ prefetch_read(PrefetchRequest 
*slot) } else { - neon_shard_log(slot->shard_no, WARNING, + neon_shard_log(slot->shard_no, LOG, "No response from reading prefetch entry %lu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect", (long)slot->my_ring_index, RelFileInfoFmt(BufTagGetNRelFileInfo(slot->buftag)), From 8ee191c2714ca404f51fb13f3c86c7d50e8daa27 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 7 Jun 2024 10:18:05 +0300 Subject: [PATCH 0938/1571] test_local_only_layers_after_crash: various fixes (#7986) In #7927 I needed to fix this test case, but the fixes should be possible to land irrespective of the layer ingestion code change. The most important fix is the behavior if an image layer is found: the assertion message formatting raises a runtime error, which obscures the fact that we found an image layer. --- .../test_pageserver_crash_consistency.py | 29 +++++++++---------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/test_runner/regress/test_pageserver_crash_consistency.py b/test_runner/regress/test_pageserver_crash_consistency.py index 3831d2f917..2d6b50490e 100644 --- a/test_runner/regress/test_pageserver_crash_consistency.py +++ b/test_runner/regress/test_pageserver_crash_consistency.py @@ -1,11 +1,8 @@ -import time - import pytest from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn -from fixtures.pageserver.common_types import parse_layer_file_name +from fixtures.pageserver.common_types import ImageLayerName, parse_layer_file_name from fixtures.pageserver.utils import ( wait_for_last_record_lsn, - wait_for_upload_queue_empty, wait_until_tenant_active, ) from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind @@ -25,10 +22,9 @@ def test_local_only_layers_after_crash(neon_env_builder: NeonEnvBuilder, pg_bin: env = neon_env_builder.init_start( initial_tenant_conf={ - "checkpoint_distance": f"{1024 ** 2}", - "compaction_target_size": f"{1024 ** 2}", + "checkpoint_distance": f"{10 * 1024**2}", "compaction_period": "0 s", - "compaction_threshold": "3", + "compaction_threshold": "999999", } ) pageserver_http = env.pageserver.http_client() @@ -42,13 +38,13 @@ def test_local_only_layers_after_crash(neon_env_builder: NeonEnvBuilder, pg_bin: pg_bin.run_capture(["pgbench", "-i", "-s1", connstr]) lsn = wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) - endpoint.stop() # make sure we receive no new wal after this, so that we'll write over the same L1 file. 
endpoint.stop() for sk in env.safekeepers: sk.stop() + pageserver_http.patch_tenant_config_client_side(tenant_id, {"compaction_threshold": 3}) # hit the exit failpoint with pytest.raises(ConnectionError, match="Remote end closed connection without response"): pageserver_http.timeline_checkpoint(tenant_id, timeline_id) @@ -72,9 +68,15 @@ def test_local_only_layers_after_crash(neon_env_builder: NeonEnvBuilder, pg_bin: # L0 continue + candidate = parse_layer_file_name(path.name) + + if isinstance(candidate, ImageLayerName): + continue + if l1_found is not None: - raise RuntimeError(f"found multiple L1: {l1_found.name} and {path.name}") - l1_found = parse_layer_file_name(path.name) + raise RuntimeError(f"found multiple L1: {l1_found.to_str()} and {path.name}") + + l1_found = candidate assert l1_found is not None, "failed to find L1 locally" @@ -93,15 +95,10 @@ def test_local_only_layers_after_crash(neon_env_builder: NeonEnvBuilder, pg_bin: # wait for us to catch up again wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, lsn) - pageserver_http.timeline_compact(tenant_id, timeline_id) - - # give time for log flush - time.sleep(1) + pageserver_http.timeline_compact(tenant_id, timeline_id, wait_until_uploaded=True) assert env.pageserver.layer_exists(tenant_id, timeline_id, l1_found), "the L1 reappears" - wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id) - uploaded = env.pageserver_remote_storage.remote_layer_path( tenant_id, timeline_id, l1_found.to_str() ) From 2078dc827b0495703f9ddcb8ea944df3a4fe9c41 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Fri, 7 Jun 2024 10:04:59 +0200 Subject: [PATCH 0939/1571] CI: copy run-* labels from external contributors' PRs (#7915) ## Problem We don't carry run-* labels from external contributors' PRs to ci-run/pr-* PRs. This is not really convenient. Need to sync labels in approved-for-ci-run workflow. ## Summary of changes Added the procedure of transition of labels from the original PR ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Alexander Bayandin --- .github/workflows/approved-for-ci-run.yml | 53 ++++++++++++++++++++--- 1 file changed, 48 insertions(+), 5 deletions(-) diff --git a/.github/workflows/approved-for-ci-run.yml b/.github/workflows/approved-for-ci-run.yml index b14b66a439..0a0898d30c 100644 --- a/.github/workflows/approved-for-ci-run.yml +++ b/.github/workflows/approved-for-ci-run.yml @@ -69,15 +69,41 @@ jobs: with: ref: main token: ${{ secrets.CI_ACCESS_TOKEN }} + + - name: Look for existing PR + id: get-pr + env: + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} + run: | + ALREADY_CREATED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${BRANCH} --base main --json number --jq '.[].number')" + echo "ALREADY_CREATED=${ALREADY_CREATED}" >> ${GITHUB_OUTPUT} + + - name: Get changed labels + id: get-labels + if: steps.get-pr.outputs.ALREADY_CREATED != '' + env: + ALREADY_CREATED: ${{ steps.get-pr.outputs.ALREADY_CREATED }} + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} + run: | + LABELS_TO_REMOVE=$(comm -23 <(gh pr --repo ${GITHUB_REPOSITORY} view ${ALREADY_CREATED} --json labels --jq '.labels.[].name'| ( grep -E '^run' || true ) | sort) \ + <(gh pr --repo ${GITHUB_REPOSITORY} view ${PR_NUMBER} --json labels --jq '.labels.[].name' | ( grep -E '^run' || true ) | sort ) |\ + ( grep -v run-e2e-tests-in-draft || true ) | paste -sd , -) + LABELS_TO_ADD=$(comm -13 <(gh pr --repo ${GITHUB_REPOSITORY} view ${ALREADY_CREATED} --json labels --jq '.labels.[].name'| ( grep -E '^run' || true ) |sort) \ + <(gh pr --repo ${GITHUB_REPOSITORY} view ${PR_NUMBER} --json labels --jq '.labels.[].name' | ( grep -E '^run' || true ) | sort ) |\ + paste -sd , -) + echo "LABELS_TO_ADD=${LABELS_TO_ADD}" >> ${GITHUB_OUTPUT} + echo "LABELS_TO_REMOVE=${LABELS_TO_REMOVE}" >> ${GITHUB_OUTPUT} - run: gh pr checkout "${PR_NUMBER}" - run: git checkout -b "${BRANCH}" - run: git push --force origin "${BRANCH}" + if: steps.get-pr.outputs.ALREADY_CREATED == '' - name: Create a Pull Request for CI run (if required) - env: + if: steps.get-pr.outputs.ALREADY_CREATED == '' + env: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} run: | cat << EOF > body.md @@ -88,16 +114,33 @@ jobs: Feel free to review/comment/discuss the original PR #${PR_NUMBER}. 
EOF - ALREADY_CREATED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${BRANCH} --base main --json number --jq '.[].number')" - if [ -z "${ALREADY_CREATED}" ]; then - gh pr --repo "${GITHUB_REPOSITORY}" create --title "CI run for PR #${PR_NUMBER}" \ + LABELS=$( (gh pr --repo "${GITHUB_REPOSITORY}" view ${PR_NUMBER} --json labels --jq '.labels.[].name'; echo run-e2e-tests-in-draft )| \ + grep -E '^run' | paste -sd , -) + gh pr --repo "${GITHUB_REPOSITORY}" create --title "CI run for PR #${PR_NUMBER}" \ --body-file "body.md" \ --head "${BRANCH}" \ --base "main" \ - --label "run-e2e-tests-in-draft" \ + --label ${LABELS} \ --draft + - name: Modify the existing pull request (if required) + if: steps.get-pr.outputs.ALREADY_CREATED != '' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + LABELS_TO_ADD: ${{ steps.get-labels.outputs.LABELS_TO_ADD }} + LABELS_TO_REMOVE: ${{ steps.get-labels.outputs.LABELS_TO_REMOVE }} + ALREADY_CREATED: ${{ steps.get-pr.outputs.ALREADY_CREATED }} + run: | + ADD_CMD= + REMOVE_CMD= + [ -z "${LABELS_TO_ADD}" ] || ADD_CMD="--add-label ${LABELS_TO_ADD}" + [ -z "${LABELS_TO_REMOVE}" ] || REMOVE_CMD="--remove-label ${LABELS_TO_REMOVE}" + if [ -n "${ADD_CMD}" ] || [ -n "${REMOVE_CMD}" ]; then + gh pr --repo "${GITHUB_REPOSITORY}" edit ${ALREADY_CREATED} ${ADD_CMD} ${REMOVE_CMD} fi + - run: git push --force origin "${BRANCH}" + if: steps.get-pr.outputs.ALREADY_CREATED != '' + cleanup: # Close PRs and delete branchs if the original PR is closed. From 26c68f91f3ca2c0d5f06004bb0fa3b7f8cb98bcd Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Thu, 6 Jun 2024 16:43:29 -0500 Subject: [PATCH 0940/1571] Move SQL migrations out of line It makes them much easier to reason about, and allows other SQL tooling to operate on them like language servers, formatters, etc. I also brought back the removed migrations such that we can more easily understand what they were. I included a "-- SKIP" comment describing why those migrations are now skipped. We no longer skip migrations by checking if it is empty, but instead check to see if the migration starts with "-- SKIP". 
--- .../0000-neon_superuser_bypass_rls.sql | 1 + .../src/migrations/0001-alter_roles.sql | 18 ++++++ ..._create_subscription_to_neon_superuser.sql | 6 ++ ...003-grant_pg_monitor_to_neon_superuser.sql | 1 + ...-grant_all_on_tables_to_neon_superuser.sql | 4 ++ ...ant_all_on_sequences_to_neon_superuser.sql | 4 ++ ...es_to_neon_superuser_with_grant_option.sql | 3 + ...es_to_neon_superuser_with_grant_option.sql | 3 + ...plication_for_previously_allowed_roles.sql | 13 ++++ compute_tools/src/spec.rs | 60 +++++++------------ 10 files changed, 73 insertions(+), 40 deletions(-) create mode 100644 compute_tools/src/migrations/0000-neon_superuser_bypass_rls.sql create mode 100644 compute_tools/src/migrations/0001-alter_roles.sql create mode 100644 compute_tools/src/migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql create mode 100644 compute_tools/src/migrations/0003-grant_pg_monitor_to_neon_superuser.sql create mode 100644 compute_tools/src/migrations/0004-grant_all_on_tables_to_neon_superuser.sql create mode 100644 compute_tools/src/migrations/0005-grant_all_on_sequences_to_neon_superuser.sql create mode 100644 compute_tools/src/migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql create mode 100644 compute_tools/src/migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql create mode 100644 compute_tools/src/migrations/0008-revoke_replication_for_previously_allowed_roles.sql diff --git a/compute_tools/src/migrations/0000-neon_superuser_bypass_rls.sql b/compute_tools/src/migrations/0000-neon_superuser_bypass_rls.sql new file mode 100644 index 0000000000..73b36a37f6 --- /dev/null +++ b/compute_tools/src/migrations/0000-neon_superuser_bypass_rls.sql @@ -0,0 +1 @@ +ALTER ROLE neon_superuser BYPASSRLS; diff --git a/compute_tools/src/migrations/0001-alter_roles.sql b/compute_tools/src/migrations/0001-alter_roles.sql new file mode 100644 index 0000000000..6cb49f873f --- /dev/null +++ b/compute_tools/src/migrations/0001-alter_roles.sql @@ -0,0 +1,18 @@ +DO $$ +DECLARE + role_name text; +BEGIN + FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, 'neon_superuser', 'member') + LOOP + RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', quote_ident(role_name); + EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT'; + END LOOP; + + FOR role_name IN SELECT rolname FROM pg_roles + WHERE + NOT pg_has_role(rolname, 'neon_superuser', 'member') AND NOT starts_with(rolname, 'pg_') + LOOP + RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', quote_ident(role_name); + EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS'; + END LOOP; +END $$; diff --git a/compute_tools/src/migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql b/compute_tools/src/migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql new file mode 100644 index 0000000000..37f0ce211f --- /dev/null +++ b/compute_tools/src/migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql @@ -0,0 +1,6 @@ +DO $$ +BEGIN + IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN + EXECUTE 'GRANT pg_create_subscription TO neon_superuser'; + END IF; +END $$; diff --git a/compute_tools/src/migrations/0003-grant_pg_monitor_to_neon_superuser.sql b/compute_tools/src/migrations/0003-grant_pg_monitor_to_neon_superuser.sql new file mode 100644 index 0000000000..11afd3b635 --- /dev/null +++ b/compute_tools/src/migrations/0003-grant_pg_monitor_to_neon_superuser.sql @@ -0,0 +1 @@ +GRANT pg_monitor TO neon_superuser 
WITH ADMIN OPTION; diff --git a/compute_tools/src/migrations/0004-grant_all_on_tables_to_neon_superuser.sql b/compute_tools/src/migrations/0004-grant_all_on_tables_to_neon_superuser.sql new file mode 100644 index 0000000000..8abe052494 --- /dev/null +++ b/compute_tools/src/migrations/0004-grant_all_on_tables_to_neon_superuser.sql @@ -0,0 +1,4 @@ +-- SKIP: Deemed insufficient for allowing relations created by extensions to be +-- interacted with by neon_superuser without permission issues. + +ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser; diff --git a/compute_tools/src/migrations/0005-grant_all_on_sequences_to_neon_superuser.sql b/compute_tools/src/migrations/0005-grant_all_on_sequences_to_neon_superuser.sql new file mode 100644 index 0000000000..5bcb026e0c --- /dev/null +++ b/compute_tools/src/migrations/0005-grant_all_on_sequences_to_neon_superuser.sql @@ -0,0 +1,4 @@ +-- SKIP: Deemed insufficient for allowing relations created by extensions to be +-- interacted with by neon_superuser without permission issues. + +ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser; diff --git a/compute_tools/src/migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql b/compute_tools/src/migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql new file mode 100644 index 0000000000..ce7c96753e --- /dev/null +++ b/compute_tools/src/migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql @@ -0,0 +1,3 @@ +-- SKIP: Moved inline to the handle_grants() functions. + +ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION; diff --git a/compute_tools/src/migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql b/compute_tools/src/migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql new file mode 100644 index 0000000000..72baf920cd --- /dev/null +++ b/compute_tools/src/migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql @@ -0,0 +1,3 @@ +-- SKIP: Moved inline to the handle_grants() functions. + +ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION; diff --git a/compute_tools/src/migrations/0008-revoke_replication_for_previously_allowed_roles.sql b/compute_tools/src/migrations/0008-revoke_replication_for_previously_allowed_roles.sql new file mode 100644 index 0000000000..47129d65b8 --- /dev/null +++ b/compute_tools/src/migrations/0008-revoke_replication_for_previously_allowed_roles.sql @@ -0,0 +1,13 @@ +-- SKIP: The original goal of this migration was to prevent creating +-- subscriptions, but this migration was insufficient. + +DO $$ +DECLARE + role_name TEXT; +BEGIN + FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE + LOOP + RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name); + EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION'; + END LOOP; +END $$; diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 3a6e18b638..143f6c1e5f 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -774,44 +774,21 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> { // !BE SURE TO ONLY ADD MIGRATIONS TO THE END OF THIS ARRAY. IF YOU DO NOT, VERY VERY BAD THINGS MAY HAPPEN! // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + // Add new migrations in numerical order. 
let migrations = [ - "ALTER ROLE neon_superuser BYPASSRLS", - r#" -DO $$ -DECLARE - role_name text; -BEGIN - FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, 'neon_superuser', 'member') - LOOP - RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', quote_ident(role_name); - EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT'; - END LOOP; - - FOR role_name IN SELECT rolname FROM pg_roles - WHERE - NOT pg_has_role(rolname, 'neon_superuser', 'member') AND NOT starts_with(rolname, 'pg_') - LOOP - RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', quote_ident(role_name); - EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS'; - END LOOP; -END $$; -"#, - r#" -DO $$ -BEGIN - IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN - EXECUTE 'GRANT pg_create_subscription TO neon_superuser'; - END IF; -END -$$;"#, - "GRANT pg_monitor TO neon_superuser WITH ADMIN OPTION", - // Don't remove: these are some SQLs that we originally applied in migrations but turned out to execute somewhere else. - "", - "", - "", - "", - "", - // Add new migrations below. + include_str!("./migrations/0000-neon_superuser_bypass_rls.sql"), + include_str!("./migrations/0001-alter_roles.sql"), + include_str!("./migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql"), + include_str!("./migrations/0003-grant_pg_monitor_to_neon_superuser.sql"), + include_str!("./migrations/0004-grant_all_on_tables_to_neon_superuser.sql"), + include_str!("./migrations/0005-grant_all_on_sequences_to_neon_superuser.sql"), + include_str!( + "./migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql" + ), + include_str!( + "./migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql" + ), + include_str!("./migrations/0008-revoke_replication_for_previously_allowed_roles.sql"), ]; let mut func = || { @@ -847,10 +824,13 @@ $$;"#, while current_migration < migrations.len() { let migration = &migrations[current_migration]; - if migration.is_empty() { - info!("Skip migration id={}", current_migration); + if migration.starts_with("-- SKIP") { + info!("Skipping migration id={}", current_migration); } else { - info!("Running migration:\n{}\n", migration); + info!( + "Running migration id={}:\n{}\n", + current_migration, migration + ); client.simple_query(migration).with_context(|| { format!("handle_migrations current_migration={}", current_migration) })?; From 3b647cd55d7254945718227f7849a67813192fc6 Mon Sep 17 00:00:00 2001 From: Rahul Patil Date: Fri, 7 Jun 2024 19:28:10 +0200 Subject: [PATCH 0941/1571] Include openssl and ICU statically linked (#7956) ## Problem Due to the upcoming End of Life (EOL) for Debian 11, we need to upgrade the base OS for Pageservers from Debian 11 to Debian 12 for security reasons. 
When deploying a new Pageserver on Debian 12 with the same binary built on Debian 11, we encountered the following errors: ``` could not execute operation: pageserver error, status: 500, msg: Command failed with status ExitStatus(unix_wait_status(32512)): /usr/local/neon/v16/bin/initdb: error while loading shared libraries: libicuuc.so.67: cannot open shared object file: No such file or directory ``` and ``` could not execute operation: pageserver error, status: 500, msg: Command failed with status ExitStatus(unix_wait_status(32512)): /usr/local/neon/v14/bin/initdb: error while loading shared libraries: libssl.so.1.1: cannot open shared object file: No such file or directory ``` These issues occur when creating new projects. ## Summary of changes - To address these issues, we configured PostgreSQL build to use statically linked OpenSSL and ICU libraries. - This resolves the missing shared library errors when running the binaries on Debian 12. Closes: https://github.com/neondatabase/cloud/issues/12648 ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [x] Do not forget to reformat commit message to not include the above checklist --- .github/workflows/build_and_test.yml | 6 +++--- Dockerfile | 2 -- Dockerfile.build-tools | 32 ++++++++++++++++++++++++++++ Makefile | 15 ++++++++++++- 4 files changed, 49 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index b9caf76060..1fc0fbb0b6 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -299,21 +299,21 @@ jobs: uses: actions/cache@v4 with: path: pg_install/v14 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}-${{ hashFiles('Dockerfile.build-tools') }} - name: Cache postgres v15 build id: cache_pg_15 uses: actions/cache@v4 with: path: pg_install/v15 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}-${{ hashFiles('Dockerfile.build-tools') }} - name: Cache postgres v16 build id: cache_pg_16 uses: actions/cache@v4 with: path: pg_install/v16 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}-${{ hashFiles('Dockerfile.build-tools') }} - name: Build postgres v14 if: steps.cache_pg_14.outputs.cache-hit != 'true' diff --git a/Dockerfile b/Dockerfile index 5f82df3e18..b4900d4a94 100644 --- a/Dockerfile +++ b/Dockerfile @@ -69,8 +69,6 @@ RUN set -e \ && apt install -y \ libreadline-dev \ libseccomp-dev \ - libicu67 \ - openssl \ ca-certificates \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \ && useradd -d /data neon \ diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index 460b8c996d..91194eda1a 100644 --- 
a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -112,6 +112,35 @@ RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JS && make install \ && rm -rf ../lcov.tar.gz +# Compile and install the static OpenSSL library +ENV OPENSSL_VERSION=3.2.2 +ENV OPENSSL_PREFIX=/usr/local/openssl +RUN wget -O /tmp/openssl-${OPENSSL_VERSION}.tar.gz https://www.openssl.org/source/openssl-${OPENSSL_VERSION}.tar.gz && \ + echo "197149c18d9e9f292c43f0400acaba12e5f52cacfe050f3d199277ea738ec2e7 /tmp/openssl-${OPENSSL_VERSION}.tar.gz" | sha256sum --check && \ + cd /tmp && \ + tar xzvf /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \ + rm /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \ + cd /tmp/openssl-${OPENSSL_VERSION} && \ + ./config --prefix=${OPENSSL_PREFIX} -static --static no-shared -fPIC && \ + make ${MAKE_ARGS} && \ + make install && \ + cd /tmp && \ + rm -rf /tmp/openssl-${OPENSSL_VERSION} + +# Set the ICU version +ENV ICU_VERSION=72.1 +ENV ICU_PREFIX=/usr/local/icu + +# Download and build static ICU +RUN wget https://github.com/unicode-org/icu/releases/download/release-${ICU_VERSION//./-}/icu4c-${ICU_VERSION//./_}-src.tgz && \ + tar -xzf icu4c-${ICU_VERSION//./_}-src.tgz && \ + cd icu/source && \ + ./configure --prefix=${ICU_PREFIX} --enable-static --enable-shared=no CXXFLAGS="-fPIC" CFLAGS="-fPIC" && \ + make && \ + make install && \ + cd ../.. && \ + rm -rf icu icu4c-${ICU_VERSION//./_}-src.tgz + # Switch to nonroot user USER nonroot:nonroot WORKDIR /home/nonroot @@ -170,3 +199,6 @@ RUN whoami \ && rustup --version --verbose \ && rustc --version --verbose \ && clang --version + +# Set following flag to check in Makefile if its running in Docker +RUN touch /home/nonroot/.docker_build diff --git a/Makefile b/Makefile index dcbfdbcbc1..b5f426344e 100644 --- a/Makefile +++ b/Makefile @@ -3,6 +3,9 @@ ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Where to install Postgres, default is ./pg_install, maybe useful for package managers POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install/ +OPENSSL_PREFIX_DIR := /usr/local/openssl +ICU_PREFIX_DIR := /usr/local/icu + # # We differentiate between release / debug build types using the BUILD_TYPE # environment variable. 
@@ -20,6 +23,16 @@ else $(error Bad build type '$(BUILD_TYPE)', see Makefile for options) endif +ifeq ($(shell test -e /home/nonroot/.docker_build && echo -n yes),yes) + # Exclude static build openssl, icu for local build (MacOS, Linux) + # Only keep for build type release and debug + PG_CFLAGS += -I$(OPENSSL_PREFIX_DIR)/include + PG_CONFIGURE_OPTS += --with-icu + PG_CONFIGURE_OPTS += ICU_CFLAGS='-I/$(ICU_PREFIX_DIR)/include -DU_STATIC_IMPLEMENTATION' + PG_CONFIGURE_OPTS += ICU_LIBS='-L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -licui18n -licuuc -licudata -lstdc++ -Wl,-Bdynamic -lm' + PG_CONFIGURE_OPTS += LDFLAGS='-L$(OPENSSL_PREFIX_DIR)/lib -L$(OPENSSL_PREFIX_DIR)/lib64 -L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -Wl,-Bstatic -lssl -lcrypto -Wl,-Bdynamic -lrt -lm -ldl -lpthread' +endif + UNAME_S := $(shell uname -s) ifeq ($(UNAME_S),Linux) # Seccomp BPF is only available for Linux @@ -28,7 +41,7 @@ else ifeq ($(UNAME_S),Darwin) ifndef DISABLE_HOMEBREW # macOS with brew-installed openssl requires explicit paths # It can be configured with OPENSSL_PREFIX variable - OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3) + OPENSSL_PREFIX := $(shell brew --prefix openssl@3) PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig # macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure From 3e63d0f9e0532d0e49668bf41b2534ffcee0ca2a Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Mon, 10 Jun 2024 04:42:13 -0400 Subject: [PATCH 0942/1571] test(pageserver): quantify compaction outcome (#7867) A simple API to collect some statistics after compaction to easily understand the result. The tool reads the layer map and analyzes it range by range instead of doing single-key operations, which is more efficient than running a benchmark to collect the result. It currently computes two key metrics: * Latest data access efficiency, which finds how many delta layers / image layers the system needs to iterate through before returning any key in a key range. * (Approximate) PiTR efficiency, as in https://github.com/neondatabase/neon/issues/7770, which is simply the number of delta files in the range. The reasoning is that, assuming no image layer is created, PiTR efficiency is simply the cost of collecting records from the delta layers plus the replay time. Number of delta files (or in the future, estimated size of reads) is a simple yet efficient way of estimating how much effort the page server needs to reconstruct a page.
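As a usage sketch (not part of the patch): the new testing-only endpoint can also be queried directly over the pageserver HTTP API. The address, tenant id and timeline id below are placeholders, and the response shape follows the `RangeAnalysis` struct added in this patch; the `perf_info` helper added to `PageserverHttpClient` performs the same POST from within the test fixtures.

```python
import requests

# Placeholder values for illustration only: adjust the pageserver HTTP address,
# tenant id and timeline id to your environment. The route is registered as a
# testing API handler, so the pageserver must have testing APIs enabled.
PAGESERVER_HTTP = "http://localhost:9898"
TENANT_ID = "<tenant id>"
TIMELINE_ID = "<timeline id>"

res = requests.post(
    f"{PAGESERVER_HTTP}/v1/tenant/{TENANT_ID}/timeline/{TIMELINE_ID}/perf_info"
)
res.raise_for_status()

# Each entry describes one key range: whether it is covered by an image layer,
# how many delta layers sit above that image, and the total number of deltas.
worst = max(res.json(), key=lambda r: r["num_of_deltas_above_image"], default=None)
if worst is not None:
    print(
        f"worst range {worst['start']}..{worst['end']}: "
        f"{worst['num_of_deltas_above_image']} deltas above image, "
        f"{worst['total_num_of_deltas']} deltas in total"
    )
```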
Signed-off-by: Alex Chi Z --- pageserver/src/http/routes.rs | 23 +++++ pageserver/src/tenant/timeline.rs | 1 + pageserver/src/tenant/timeline/analysis.rs | 90 +++++++++++++++++++ .../src/tenant/timeline/layer_manager.rs | 5 ++ test_runner/fixtures/pageserver/http.py | 15 ++++ test_runner/performance/test_gc_feedback.py | 17 ++++ 6 files changed, 151 insertions(+) create mode 100644 pageserver/src/tenant/timeline/analysis.rs diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 19bc88fbc7..12d02c52fe 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2429,6 +2429,25 @@ async fn list_aux_files( json_response(StatusCode::OK, files) } +async fn perf_info( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let state = get_state(&request); + + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + + let result = timeline.perf_info().await; + + json_response(StatusCode::OK, result) +} + async fn ingest_aux_files( mut request: Request, _cancel: CancellationToken, @@ -2856,5 +2875,9 @@ pub fn make_router( |r| testing_api_handler("list_aux_files", r, list_aux_files), ) .post("/v1/top_tenants", |r| api_handler(r, post_top_tenants)) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/perf_info", + |r| testing_api_handler("perf_info", r, perf_info), + ) .any(handler_404)) } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 32cf7be0f7..388d5b9d54 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1,3 +1,4 @@ +pub(crate) mod analysis; mod compaction; pub mod delete; pub(crate) mod detach_ancestor; diff --git a/pageserver/src/tenant/timeline/analysis.rs b/pageserver/src/tenant/timeline/analysis.rs new file mode 100644 index 0000000000..cd61418f3d --- /dev/null +++ b/pageserver/src/tenant/timeline/analysis.rs @@ -0,0 +1,90 @@ +use std::{collections::BTreeSet, ops::Range}; + +use utils::lsn::Lsn; + +use super::Timeline; + +#[derive(serde::Serialize)] +pub(crate) struct RangeAnalysis { + start: String, + end: String, + has_image: bool, + num_of_deltas_above_image: usize, + total_num_of_deltas: usize, +} + +impl Timeline { + pub(crate) async fn perf_info(&self) -> Vec { + // First, collect all split points of the layers. + let mut split_points = BTreeSet::new(); + let mut delta_ranges = Vec::new(); + let mut image_ranges = Vec::new(); + + let all_layer_files = { + let guard = self.layers.read().await; + guard.all_persistent_layers() + }; + let lsn = self.get_last_record_lsn(); + + for key in all_layer_files { + split_points.insert(key.key_range.start); + split_points.insert(key.key_range.end); + if key.is_delta { + delta_ranges.push((key.key_range.clone(), key.lsn_range.clone())); + } else { + image_ranges.push((key.key_range.clone(), key.lsn_range.start)); + } + } + + // For each split range, compute the estimated read amplification. + let split_points = split_points.into_iter().collect::>(); + + let mut result = Vec::new(); + + for i in 0..(split_points.len() - 1) { + let start = split_points[i]; + let end = split_points[i + 1]; + // Find the latest image layer that contains the information. 
+ let mut maybe_image_layers = image_ranges + .iter() + // We insert split points for all image layers, and therefore a `contains` check for the start point should be enough. + .filter(|(key_range, img_lsn)| key_range.contains(&start) && img_lsn <= &lsn) + .cloned() + .collect::>(); + maybe_image_layers.sort_by(|a, b| a.1.cmp(&b.1)); + let image_layer = maybe_image_layers.last().cloned(); + let lsn_filter_start = image_layer + .as_ref() + .map(|(_, lsn)| *lsn) + .unwrap_or(Lsn::INVALID); + + fn overlaps_with(lsn_range_a: &Range, lsn_range_b: &Range) -> bool { + !(lsn_range_a.end <= lsn_range_b.start || lsn_range_a.start >= lsn_range_b.end) + } + + let maybe_delta_layers = delta_ranges + .iter() + .filter(|(key_range, lsn_range)| { + key_range.contains(&start) && overlaps_with(&(lsn_filter_start..lsn), lsn_range) + }) + .cloned() + .collect::>(); + + let pitr_delta_layers = delta_ranges + .iter() + .filter(|(key_range, _)| key_range.contains(&start)) + .cloned() + .collect::>(); + + result.push(RangeAnalysis { + start: start.to_string(), + end: end.to_string(), + has_image: image_layer.is_some(), + num_of_deltas_above_image: maybe_delta_layers.len(), + total_num_of_deltas: pitr_delta_layers.len(), + }); + } + + result + } +} diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index b78c98a506..0e82dedecb 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -1,4 +1,5 @@ use anyhow::{bail, ensure, Context, Result}; +use itertools::Itertools; use pageserver_api::shard::TenantShardId; use std::{collections::HashMap, sync::Arc}; use tracing::trace; @@ -308,6 +309,10 @@ impl LayerManager { pub(crate) fn contains(&self, layer: &Layer) -> bool { self.layer_fmgr.contains(layer) } + + pub(crate) fn all_persistent_layers(&self) -> Vec { + self.layer_fmgr.0.keys().cloned().collect_vec() + } } pub(crate) struct LayerFileManager(HashMap); diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 08bf66058a..d5441bd694 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -923,3 +923,18 @@ class PageserverHttpClient(requests.Session, MetricsGetter): ) self.verbose_error(res) return res.json() # type: ignore + + def perf_info( + self, + tenant_id: Union[TenantId, TenantShardId], + timeline_id: TimelineId, + ): + self.is_testing_enabled_or_skip() + + log.info(f"Requesting perf info: tenant {tenant_id}, timeline {timeline_id}") + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/perf_info", + ) + log.info(f"Got perf info response code: {res.status_code}") + self.verbose_error(res) + return res.json() diff --git a/test_runner/performance/test_gc_feedback.py b/test_runner/performance/test_gc_feedback.py index be56203b26..9a03994b29 100644 --- a/test_runner/performance/test_gc_feedback.py +++ b/test_runner/performance/test_gc_feedback.py @@ -75,12 +75,29 @@ def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma physical_size = client.timeline_detail(tenant_id, timeline_id)["current_physical_size"] log.info(f"Physical storage size {physical_size}") + max_num_of_deltas_above_image = 0 + max_total_num_of_deltas = 0 + for key_range in client.perf_info(tenant_id, timeline_id): + max_total_num_of_deltas = max(max_total_num_of_deltas, key_range["total_num_of_deltas"]) + max_num_of_deltas_above_image = max( + 
max_num_of_deltas_above_image, key_range["num_of_deltas_above_image"] + ) + MB = 1024 * 1024 zenbenchmark.record("logical_size", logical_size // MB, "Mb", MetricReport.LOWER_IS_BETTER) zenbenchmark.record("physical_size", physical_size // MB, "Mb", MetricReport.LOWER_IS_BETTER) zenbenchmark.record( "physical/logical ratio", physical_size / logical_size, "", MetricReport.LOWER_IS_BETTER ) + zenbenchmark.record( + "max_total_num_of_deltas", max_total_num_of_deltas, "", MetricReport.LOWER_IS_BETTER + ) + zenbenchmark.record( + "max_num_of_deltas_above_image", + max_num_of_deltas_above_image, + "", + MetricReport.LOWER_IS_BETTER, + ) layer_map_path = env.repo_dir / "layer-map.json" log.info(f"Writing layer map to {layer_map_path}") From ae5badd375e284ed6098503c3e4ead09995b902f Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 10 Jun 2024 13:20:20 +0200 Subject: [PATCH 0943/1571] Revert "Include openssl and ICU statically linked" (#8003) Reverts neondatabase/neon#7956 Rationale: compute incompatibilties Slack thread: https://neondb.slack.com/archives/C033RQ5SPDH/p1718011276665839?thread_ts=1718008160.431869&cid=C033RQ5SPDH Relevant quotes from @hlinnaka > If we go through with the current release candidate, but the compute is pinned, people who create new projects will get that warning, which is silly. To them, it looks like the ICU version was downgraded, because initdb was run with newer version. > We should upgrade the ICU version eventually. And when we do that, users with old projects that use ICU will start to see that warning. I think that's acceptable, as long as we do homework, notify users, and communicate that properly. > When do that, we should to try to upgrade the storage and compute versions at roughly the same time. --- .github/workflows/build_and_test.yml | 6 +++--- Dockerfile | 2 ++ Dockerfile.build-tools | 32 ---------------------------- Makefile | 15 +------------ 4 files changed, 6 insertions(+), 49 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 1fc0fbb0b6..b9caf76060 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -299,21 +299,21 @@ jobs: uses: actions/cache@v4 with: path: pg_install/v14 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}-${{ hashFiles('Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Cache postgres v15 build id: cache_pg_15 uses: actions/cache@v4 with: path: pg_install/v15 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}-${{ hashFiles('Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Cache postgres v16 build id: cache_pg_16 uses: actions/cache@v4 with: path: pg_install/v16 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}-${{ hashFiles('Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Build postgres v14 if: steps.cache_pg_14.outputs.cache-hit != 'true' diff --git a/Dockerfile b/Dockerfile index b4900d4a94..5f82df3e18 100644 --- a/Dockerfile +++ b/Dockerfile @@ -69,6 +69,8 @@ RUN set -e \ && apt 
install -y \ libreadline-dev \ libseccomp-dev \ + libicu67 \ + openssl \ ca-certificates \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \ && useradd -d /data neon \ diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index 91194eda1a..460b8c996d 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -112,35 +112,6 @@ RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JS && make install \ && rm -rf ../lcov.tar.gz -# Compile and install the static OpenSSL library -ENV OPENSSL_VERSION=3.2.2 -ENV OPENSSL_PREFIX=/usr/local/openssl -RUN wget -O /tmp/openssl-${OPENSSL_VERSION}.tar.gz https://www.openssl.org/source/openssl-${OPENSSL_VERSION}.tar.gz && \ - echo "197149c18d9e9f292c43f0400acaba12e5f52cacfe050f3d199277ea738ec2e7 /tmp/openssl-${OPENSSL_VERSION}.tar.gz" | sha256sum --check && \ - cd /tmp && \ - tar xzvf /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \ - rm /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \ - cd /tmp/openssl-${OPENSSL_VERSION} && \ - ./config --prefix=${OPENSSL_PREFIX} -static --static no-shared -fPIC && \ - make ${MAKE_ARGS} && \ - make install && \ - cd /tmp && \ - rm -rf /tmp/openssl-${OPENSSL_VERSION} - -# Set the ICU version -ENV ICU_VERSION=72.1 -ENV ICU_PREFIX=/usr/local/icu - -# Download and build static ICU -RUN wget https://github.com/unicode-org/icu/releases/download/release-${ICU_VERSION//./-}/icu4c-${ICU_VERSION//./_}-src.tgz && \ - tar -xzf icu4c-${ICU_VERSION//./_}-src.tgz && \ - cd icu/source && \ - ./configure --prefix=${ICU_PREFIX} --enable-static --enable-shared=no CXXFLAGS="-fPIC" CFLAGS="-fPIC" && \ - make && \ - make install && \ - cd ../.. && \ - rm -rf icu icu4c-${ICU_VERSION//./_}-src.tgz - # Switch to nonroot user USER nonroot:nonroot WORKDIR /home/nonroot @@ -199,6 +170,3 @@ RUN whoami \ && rustup --version --verbose \ && rustc --version --verbose \ && clang --version - -# Set following flag to check in Makefile if its running in Docker -RUN touch /home/nonroot/.docker_build diff --git a/Makefile b/Makefile index b5f426344e..dcbfdbcbc1 100644 --- a/Makefile +++ b/Makefile @@ -3,9 +3,6 @@ ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Where to install Postgres, default is ./pg_install, maybe useful for package managers POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install/ -OPENSSL_PREFIX_DIR := /usr/local/openssl -ICU_PREFIX_DIR := /usr/local/icu - # # We differentiate between release / debug build types using the BUILD_TYPE # environment variable. 
@@ -23,16 +20,6 @@ else $(error Bad build type '$(BUILD_TYPE)', see Makefile for options) endif -ifeq ($(shell test -e /home/nonroot/.docker_build && echo -n yes),yes) - # Exclude static build openssl, icu for local build (MacOS, Linux) - # Only keep for build type release and debug - PG_CFLAGS += -I$(OPENSSL_PREFIX_DIR)/include - PG_CONFIGURE_OPTS += --with-icu - PG_CONFIGURE_OPTS += ICU_CFLAGS='-I/$(ICU_PREFIX_DIR)/include -DU_STATIC_IMPLEMENTATION' - PG_CONFIGURE_OPTS += ICU_LIBS='-L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -licui18n -licuuc -licudata -lstdc++ -Wl,-Bdynamic -lm' - PG_CONFIGURE_OPTS += LDFLAGS='-L$(OPENSSL_PREFIX_DIR)/lib -L$(OPENSSL_PREFIX_DIR)/lib64 -L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -Wl,-Bstatic -lssl -lcrypto -Wl,-Bdynamic -lrt -lm -ldl -lpthread' -endif - UNAME_S := $(shell uname -s) ifeq ($(UNAME_S),Linux) # Seccomp BPF is only available for Linux @@ -41,7 +28,7 @@ else ifeq ($(UNAME_S),Darwin) ifndef DISABLE_HOMEBREW # macOS with brew-installed openssl requires explicit paths # It can be configured with OPENSSL_PREFIX variable - OPENSSL_PREFIX := $(shell brew --prefix openssl@3) + OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3) PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig # macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure From 5a7e285c2c98d0ae15c6e2d7059881bf52a23027 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 10 Jun 2024 15:52:49 +0300 Subject: [PATCH 0944/1571] Simplify scanning compute logs in tests (#7997) Implement LogUtils in the Endpoint fixture class, so that the "log_contains" function can be used on compute logs too. Per discussion at: https://github.com/neondatabase/neon/pull/7288#discussion_r1623633803 --- test_runner/fixtures/neon_fixtures.py | 3 ++- test_runner/regress/test_hot_standby.py | 21 +++++---------------- test_runner/regress/test_migrations.py | 10 ++-------- 3 files changed, 9 insertions(+), 25 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index a25b8bfca1..6fdad2188c 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3386,7 +3386,7 @@ def static_proxy( yield proxy -class Endpoint(PgProtocol): +class Endpoint(PgProtocol, LogUtils): """An object representing a Postgres compute endpoint managed by the control plane.""" def __init__( @@ -3452,6 +3452,7 @@ class Endpoint(PgProtocol): ) path = Path("endpoints") / self.endpoint_id / "pgdata" self.pgdata_dir = os.path.join(self.env.repo_dir, path) + self.logfile = self.endpoint_path() / "compute.log" config_lines = config_lines or [] diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index cf7a1c56ee..1d1b2fb485 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -1,6 +1,5 @@ import asyncio import os -import re import threading import time from functools import partial @@ -18,20 +17,6 @@ from fixtures.neon_fixtures import ( from fixtures.utils import wait_until -# Check for corrupted WAL messages which might otherwise go unnoticed if -# reconnection fixes this. 
-def scan_standby_log_for_errors(secondary): - log_path = secondary.endpoint_path() / "compute.log" - with log_path.open("r") as f: - markers = re.compile( - r"incorrect resource manager data|record with incorrect|invalid magic number|unexpected pageaddr" - ) - for line in f: - if markers.search(line): - log.info(f"bad error in standby log: {line}") - raise AssertionError() - - def test_hot_standby(neon_simple_env: NeonEnv): env = neon_simple_env @@ -91,7 +76,11 @@ def test_hot_standby(neon_simple_env: NeonEnv): assert response is not None assert response == responses[query] - scan_standby_log_for_errors(secondary) + # Check for corrupted WAL messages which might otherwise go unnoticed if + # reconnection fixes this. + assert not secondary.log_contains( + "incorrect resource manager data|record with incorrect|invalid magic number|unexpected pageaddr" + ) # clean up if slow_down_send: diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py index 526ae14b87..5637f160cf 100644 --- a/test_runner/regress/test_migrations.py +++ b/test_runner/regress/test_migrations.py @@ -8,8 +8,6 @@ def test_migrations(neon_simple_env: NeonEnv): env.neon_cli.create_branch("test_migrations", "empty") endpoint = env.endpoints.create("test_migrations") - log_path = endpoint.endpoint_path() / "compute.log" - endpoint.respec(skip_pg_catalog_updates=False) endpoint.start() @@ -22,9 +20,7 @@ def test_migrations(neon_simple_env: NeonEnv): migration_id = cur.fetchall() assert migration_id[0][0] == num_migrations - with open(log_path, "r") as log_file: - logs = log_file.read() - assert f"INFO handle_migrations: Ran {num_migrations} migrations" in logs + endpoint.assert_log_contains(f"INFO handle_migrations: Ran {num_migrations} migrations") endpoint.stop() endpoint.start() @@ -36,6 +32,4 @@ def test_migrations(neon_simple_env: NeonEnv): migration_id = cur.fetchall() assert migration_id[0][0] == num_migrations - with open(log_path, "r") as log_file: - logs = log_file.read() - assert "INFO handle_migrations: Ran 0 migrations" in logs + endpoint.assert_log_contains("INFO handle_migrations: Ran 0 migrations") From b52e31c1a42e186d578975cce632bf244c5f2957 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 10 Jun 2024 16:50:17 +0300 Subject: [PATCH 0945/1571] fix: allow layer flushes more often (#7927) As seen with the pgvector 0.7.0 index builds, we can receive large batches of images, leading to very large L0 layers in the range of 1GB. These large layers are produced because we are only able to roll the layer after we have witnessed two different Lsns in a single `DataDirModification::commit`. As the single Lsn batches of images can span over multiple `DataDirModification` lifespans, we will rarely get to write two different Lsns in a single `put_batch` currently. The solution is to remember the TimelineWriterState instead of eagerly forgetting it until we really open the next layer or someone else flushes (while holding the write_guard). Additional changes are test fixes to avoid "initdb image layer optimization" or ignoring initdb layers for assertion. 
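To make the rolling mechanism concrete, here is a deliberately simplified Python model of the decision described above. It is illustrative only, not the Rust code in the patch: it assumes a size-only threshold and ignores the time- and LSN-distance-based roll conditions.

```python
# Simplified model: rolling is only allowed at an LSN boundary, so if the
# writer state (prev_lsn, current_size) is forgotten after every commit, a
# long run of single-LSN image batches never observes a boundary and the open
# layer keeps growing far past checkpoint_distance.
CHECKPOINT_DISTANCE = 32 * 1024**2  # example value, not a Neon default


class WriterState:
    def __init__(self) -> None:
        self.prev_lsn = None
        self.current_size = 0


def put_batch(state: WriterState, lsn: int, batch_size: int) -> bool:
    """Apply one batch of writes; return True if the open layer was rolled first."""
    at_lsn_boundary = state.prev_lsn is not None and state.prev_lsn != lsn
    projected = state.current_size + batch_size
    rolled = at_lsn_boundary and projected >= CHECKPOINT_DISTANCE
    if rolled:
        state.current_size = 0  # freeze + flush the old layer, open a fresh one
    state.current_size += batch_size
    state.prev_lsn = lsn
    return rolled
```

In this model, the old behavior corresponds to constructing a fresh `WriterState` for every batch: `prev_lsn` is always `None` when the size check runs, so nothing ever rolls, matching the oversized-L0 symptom; keeping one `WriterState` across batches lets the roll fire on the first batch that starts a new LSN.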
Cc: #7197 because small `checkpoint_distance` will now trigger the "initdb image layer optimization" --- .../tenant/storage_layer/inmemory_layer.rs | 2 +- pageserver/src/tenant/timeline.rs | 40 +++-- test_runner/fixtures/pageserver/utils.py | 2 +- test_runner/fixtures/utils.py | 17 ++ .../regress/test_disk_usage_eviction.py | 22 +-- .../regress/test_ingestion_layer_size.py | 151 ++++++++++++++++++ .../regress/test_pageserver_secondary.py | 1 + test_runner/regress/test_s3_restore.py | 18 ++- test_runner/regress/test_sharding.py | 29 +++- 9 files changed, 245 insertions(+), 37 deletions(-) create mode 100644 test_runner/regress/test_ingestion_layer_size.py diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 9553f83026..1ecc56ce99 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -52,7 +52,7 @@ pub struct InMemoryLayer { /// Frozen layers have an exclusive end LSN. /// Writes are only allowed when this is `None`. - end_lsn: OnceLock, + pub(crate) end_lsn: OnceLock, /// Used for traversal path. Cached representation of the in-memory layer before frozen. local_path_str: Arc, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 388d5b9d54..58bdd84906 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -322,6 +322,8 @@ pub struct Timeline { /// Locked automatically by [`TimelineWriter`] and checkpointer. /// Must always be acquired before the layer map/individual layer lock /// to avoid deadlock. + /// + /// The state is cleared upon freezing. write_lock: tokio::sync::Mutex>, /// Used to avoid multiple `flush_loop` tasks running @@ -1578,7 +1580,7 @@ impl Timeline { // an ephemeral layer open forever when idle. It also freezes layers if the global limit on // ephemeral layer bytes has been breached. pub(super) async fn maybe_freeze_ephemeral_layer(&self) { - let Ok(_write_guard) = self.write_lock.try_lock() else { + let Ok(mut write_guard) = self.write_lock.try_lock() else { // If the write lock is held, there is an active wal receiver: rolling open layers // is their responsibility while they hold this lock. return; @@ -1672,6 +1674,7 @@ impl Timeline { .await; } } + write_guard.take(); self.flush_frozen_layers(); } } @@ -2036,11 +2039,11 @@ impl Timeline { true } else if distance > 0 && opened_at.elapsed() >= self.get_checkpoint_timeout() { info!( - "Will roll layer at {} with layer size {} due to time since first write to the layer ({:?})", - projected_lsn, - layer_size, - opened_at.elapsed() - ); + "Will roll layer at {} with layer size {} due to time since first write to the layer ({:?})", + projected_lsn, + layer_size, + opened_at.elapsed() + ); true } else { @@ -3653,7 +3656,10 @@ impl Timeline { let _write_guard = if write_lock_held { None } else { - Some(self.write_lock.lock().await) + let mut g = self.write_lock.lock().await; + // remove the reference to an open layer + g.take(); + Some(g) }; let to_lsn = self.get_last_record_lsn(); @@ -5541,6 +5547,9 @@ impl Timeline { type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId); +/// Tracking writes ingestion does to a particular in-memory layer. +/// +/// Cleared upon freezing a layer. 
struct TimelineWriterState { open_layer: Arc, current_size: u64, @@ -5581,12 +5590,6 @@ impl Deref for TimelineWriter<'_> { } } -impl Drop for TimelineWriter<'_> { - fn drop(&mut self) { - self.write_guard.take(); - } -} - #[derive(PartialEq)] enum OpenLayerAction { Roll, @@ -5692,6 +5695,17 @@ impl<'a> TimelineWriter<'a> { return OpenLayerAction::Open; }; + if state.cached_last_freeze_at < self.tl.last_freeze_at.load() { + // TODO(#7993): branch is needed before refactoring the many places of freezing for the + // possibility `state` having a "dangling" reference to an already frozen in-memory + // layer. + assert!( + state.open_layer.end_lsn.get().is_some(), + "our open_layer must be outdated" + ); + return OpenLayerAction::Open; + } + if state.prev_lsn == Some(lsn) { // Rolling mid LSN is not supported by downstream code. // Hence, only roll at LSN boundaries. diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 91435e8a1f..72384c138b 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -313,7 +313,7 @@ def assert_prefix_empty( # https://neon-github-public-dev.s3.amazonaws.com/reports/pr-5322/6207777020/index.html#suites/3556ed71f2d69272a7014df6dcb02317/53b5c368b5a68865 # this seems like a mock_s3 issue log.warning( - f"contrading ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}, assuming this means KeyCount=0" + f"contradicting ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}, assuming this means KeyCount=0" ) keys = 0 elif keys != 0 and len(objects) == 0: diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index b55329e054..0989dc1893 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -582,3 +582,20 @@ class PropagatingThread(threading.Thread): if self.exc: raise self.exc return self.ret + + +def human_bytes(amt: float) -> str: + """ + Render a bytes amount into nice IEC bytes string. 
+ """ + + suffixes = ["", "Ki", "Mi", "Gi"] + + last = suffixes[-1] + + for name in suffixes: + if amt < 1024 or name == last: + return f"{int(round(amt))} {name}B" + amt = amt / 1024 + + raise RuntimeError("unreachable") diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 7ae2352c06..7722828c79 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -17,7 +17,7 @@ from fixtures.neon_fixtures import ( from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import wait_for_upload_queue_empty from fixtures.remote_storage import RemoteStorageKind -from fixtures.utils import wait_until +from fixtures.utils import human_bytes, wait_until GLOBAL_LRU_LOG_LINE = "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy" @@ -218,19 +218,6 @@ def count_layers_per_tenant( return dict(ret) -def human_bytes(amt: float) -> str: - suffixes = ["", "Ki", "Mi", "Gi"] - - last = suffixes[-1] - - for name in suffixes: - if amt < 1024 or name == last: - return f"{int(round(amt))} {name}B" - amt = amt / 1024 - - raise RuntimeError("unreachable") - - def _eviction_env( request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, num_pageservers: int ) -> EvictionEnv: @@ -294,7 +281,7 @@ def pgbench_init_tenant( "gc_period": "0s", "compaction_period": "0s", "checkpoint_distance": f"{layer_size}", - "image_creation_threshold": "100", + "image_creation_threshold": "999999", "compaction_target_size": f"{layer_size}", } ) @@ -668,11 +655,10 @@ def test_fast_growing_tenant(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, or finish_tenant_creation(env, tenant_id, timeline_id, min_expected_layers) tenant_layers = count_layers_per_tenant(env.pageserver, map(lambda x: x[0], timelines)) - (total_on_disk, _, _) = poor_mans_du(env, map(lambda x: x[0], timelines), env.pageserver, False) + (total_on_disk, _, _) = poor_mans_du(env, map(lambda x: x[0], timelines), env.pageserver, True) - # cut 10 percent response = env.pageserver.http_client().disk_usage_eviction_run( - {"evict_bytes": total_on_disk // 10, "eviction_order": order.config()} + {"evict_bytes": total_on_disk // 5, "eviction_order": order.config()} ) log.info(f"{response}") diff --git a/test_runner/regress/test_ingestion_layer_size.py b/test_runner/regress/test_ingestion_layer_size.py new file mode 100644 index 0000000000..44c77b3410 --- /dev/null +++ b/test_runner/regress/test_ingestion_layer_size.py @@ -0,0 +1,151 @@ +from dataclasses import dataclass +from typing import Iterable, List, Union + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_flush_lsn +from fixtures.pageserver.http import HistoricLayerInfo, LayerMapInfo +from fixtures.utils import human_bytes + + +def test_ingesting_large_batches_of_images(neon_env_builder: NeonEnvBuilder, build_type: str): + """ + Build a non-small GIN index which includes similarly batched up images in WAL stream as does pgvector + to show that we no longer create oversized layers. 
+ """ + + if build_type == "debug": + pytest.skip("debug run is unnecessarily slow") + + minimum_initdb_size = 20 * 1024**2 + checkpoint_distance = 32 * 1024**2 + minimum_good_layer_size = checkpoint_distance * 0.9 + minimum_too_large_layer_size = 2 * checkpoint_distance + + # index size: 99MiB + rows = 2_500_000 + + # bucket lower limits + buckets = [0, minimum_initdb_size, minimum_good_layer_size, minimum_too_large_layer_size] + + assert ( + minimum_initdb_size < minimum_good_layer_size + ), "keep checkpoint_distance higher than the initdb size (find it by experimenting)" + + env = neon_env_builder.init_start( + initial_tenant_conf={ + "checkpoint_distance": f"{checkpoint_distance}", + "compaction_target_size": f"{checkpoint_distance}", + # this test is primarly interested in L0 sizes but we'll compact after ingestion to ensure sizes are good even then + "compaction_period": "0s", + "gc_period": "0s", + "compaction_threshold": "255", + "image_creation_threshold": "99999", + } + ) + + # build a larger than 3*checkpoint_distance sized gin index. + # gin index building exhibits the same behaviour as the pgvector with the two phase build + with env.endpoints.create_start("main") as ep, ep.cursor() as cur: + cur.execute( + f"create table int_array_test as select array_agg(g) as int_array from generate_series(1, {rows}) g group by g / 10;" + ) + cur.execute( + "create index int_array_test_gin_index on int_array_test using gin (int_array);" + ) + cur.execute("select pg_table_size('int_array_test_gin_index')") + size = cur.fetchone() + assert size is not None + assert isinstance(size[0], int) + log.info(f"gin index size: {human_bytes(size[0])}") + assert ( + size[0] > checkpoint_distance * 3 + ), f"gin index is not large enough: {human_bytes(size[0])}" + wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + + ps_http = env.pageserver.http_client() + ps_http.timeline_checkpoint(env.initial_tenant, env.initial_timeline) + + infos = ps_http.layer_map_info(env.initial_tenant, env.initial_timeline) + assert len(infos.in_memory_layers) == 0, "should had flushed open layers" + post_ingest = histogram_historic_layers(infos, buckets) + + # describe first, assert later for easier debugging + log.info("non-cumulative layer size distribution after ingestion:") + print_layer_size_histogram(post_ingest) + + # since all we have are L0s, we should be getting nice L1s and images out of them now + ps_http.patch_tenant_config_client_side( + env.initial_tenant, + { + "compaction_threshold": 1, + "image_creation_threshold": 1, + }, + ) + + ps_http.timeline_compact(env.initial_tenant, env.initial_timeline, True, True) + + infos = ps_http.layer_map_info(env.initial_tenant, env.initial_timeline) + assert len(infos.in_memory_layers) == 0, "no new inmem layers expected" + post_compact = histogram_historic_layers(infos, buckets) + + log.info("non-cumulative layer size distribution after compaction:") + print_layer_size_histogram(post_compact) + + assert ( + post_ingest.counts[3] == 0 + ), f"there should be no layers larger than 2*checkpoint_distance ({human_bytes(2*checkpoint_distance)})" + assert post_ingest.counts[1] == 1, "expect one smaller layer for initdb" + assert ( + post_ingest.counts[0] <= 1 + ), "expect at most one tiny layer from shutting down the endpoint" + + # just make sure we don't have trouble splitting the layers apart + assert post_compact.counts[3] == 0 + + +@dataclass +class Histogram: + buckets: List[Union[int, float]] + counts: List[int] + sums: List[int] + + +def 
histogram_historic_layers( + infos: LayerMapInfo, minimum_sizes: List[Union[int, float]] +) -> Histogram: + def log_layer(layer: HistoricLayerInfo) -> HistoricLayerInfo: + log.info( + f"{layer.layer_file_name} {human_bytes(layer.layer_file_size)} ({layer.layer_file_size} bytes)" + ) + return layer + + layers = map(log_layer, infos.historic_layers) + sizes = (x.layer_file_size for x in layers) + return histogram(sizes, minimum_sizes) + + +def histogram(sizes: Iterable[int], minimum_sizes: List[Union[int, float]]) -> Histogram: + assert all(minimum_sizes[i] < minimum_sizes[i + 1] for i in range(len(minimum_sizes) - 1)) + buckets = list(enumerate(minimum_sizes)) + counts = [0 for _ in buckets] + sums = [0 for _ in buckets] + + for size in sizes: + found = False + for index, min_size in reversed(buckets): + if size >= min_size: + counts[index] += 1 + sums[index] += size + found = True + break + assert found + + return Histogram(minimum_sizes, counts, sums) + + +def print_layer_size_histogram(h: Histogram): + for index, min_size in enumerate(h.buckets): + log.info( + f">= {human_bytes(min_size)}: {h.counts[index]} layers total {human_bytes(h.sums[index])}" + ) diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 5bfa9cce8c..757ea60882 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -563,6 +563,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ) ), ) + workload.stop() def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): diff --git a/test_runner/regress/test_s3_restore.py b/test_runner/regress/test_s3_restore.py index 7fdabaaec7..6383d24c57 100644 --- a/test_runner/regress/test_s3_restore.py +++ b/test_runner/regress/test_s3_restore.py @@ -2,6 +2,7 @@ import time from datetime import datetime, timezone from fixtures.common_types import Lsn +from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, PgBin, @@ -32,7 +33,12 @@ def test_tenant_s3_restore( assert remote_storage, "remote storage not configured" enable_remote_storage_versioning(remote_storage) - env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) + # change it back after initdb, recovery doesn't work if the two + # index_part.json uploads happen at same second or too close to each other. 
+ initial_tenant_conf = MANY_SMALL_LAYERS_TENANT_CONFIG + del initial_tenant_conf["checkpoint_distance"] + + env = neon_env_builder.init_start(initial_tenant_conf) env.pageserver.allowed_errors.extend( [ # The deletion queue will complain when it encounters simulated S3 errors @@ -43,14 +49,16 @@ def test_tenant_s3_restore( ) ps_http = env.pageserver.http_client() - tenant_id = env.initial_tenant + # now lets create the small layers + ps_http.set_tenant_config(tenant_id, MANY_SMALL_LAYERS_TENANT_CONFIG) + # Default tenant and the one we created assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 # create two timelines one being the parent of another, both with non-trivial data - parent = None + parent = "main" last_flush_lsns = [] for timeline in ["first", "second"]: @@ -64,6 +72,7 @@ def test_tenant_s3_restore( last_flush_lsns.append(last_flush_lsn) ps_http.timeline_checkpoint(tenant_id, timeline_id) wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn) + log.info(f"{timeline} timeline {timeline_id} {last_flush_lsn=}") parent = timeline # These sleeps are important because they fend off differences in clocks between us and S3 @@ -108,6 +117,9 @@ def test_tenant_s3_restore( ps_http.tenant_attach(tenant_id, generation=generation) env.pageserver.quiesce_tenants() + for tline in ps_http.timeline_list(env.initial_tenant): + log.info(f"timeline detail: {tline}") + for i, timeline in enumerate(["first", "second"]): with env.endpoints.create_start(timeline, tenant_id=tenant_id) as endpoint: endpoint.safe_psql(f"SELECT * FROM created_{timeline};") diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 545ba05b17..1996e99557 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -697,6 +697,9 @@ def test_sharding_ingest_layer_sizes( # small checkpointing and compaction targets to ensure we generate many upload operations "checkpoint_distance": f"{expect_layer_size}", "compaction_target_size": f"{expect_layer_size}", + # aim to reduce flakyness, we are not doing explicit checkpointing + "compaction_period": "0s", + "gc_period": "0s", } shard_count = 4 neon_env_builder.num_pageservers = shard_count @@ -712,6 +715,23 @@ def test_sharding_ingest_layer_sizes( tenant_id = env.initial_tenant timeline_id = env.initial_timeline + # ignore the initdb layer(s) for the purposes of the size comparison as a initdb image layer optimization + # will produce a lot more smaller layers. 
+ initial_layers_per_shard = {} + log.info("initdb distribution (not asserted on):") + for shard in env.storage_controller.locate(tenant_id): + pageserver = env.get_pageserver(shard["node_id"]) + shard_id = shard["shard_id"] + layers = ( + env.get_pageserver(shard["node_id"]).http_client().layer_map_info(shard_id, timeline_id) + ) + for layer in layers.historic_layers: + log.info( + f"layer[{pageserver.id}]: {layer.layer_file_name} (size {layer.layer_file_size})" + ) + + initial_layers_per_shard[shard_id] = set(layers.historic_layers) + workload = Workload(env, tenant_id, timeline_id) workload.init() workload.write_rows(4096, upload=False) @@ -733,7 +753,13 @@ def test_sharding_ingest_layer_sizes( historic_layers = sorted(layer_map.historic_layers, key=lambda layer: layer.lsn_start) + initial_layers = initial_layers_per_shard[shard_id] + for layer in historic_layers: + if layer in initial_layers: + # ignore the initdb image layers for the size histogram + continue + if layer.layer_file_size < expect_layer_size // 2: classification = "Small" small_layer_count += 1 @@ -763,7 +789,8 @@ def test_sharding_ingest_layer_sizes( pass else: # General case: - assert float(small_layer_count) / float(ok_layer_count) < 0.25 + # old limit was 0.25 but pg14 is right at the limit with 7/28 + assert float(small_layer_count) / float(ok_layer_count) < 0.3 # Each shard may emit up to one huge layer, because initdb ingest doesn't respect checkpoint_distance. assert huge_layer_count <= shard_count From a8ca7a1a1d88c6cff476eaf55c9f38c46dbfc645 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Mon, 10 Jun 2024 12:08:16 -0400 Subject: [PATCH 0946/1571] docs: highlight neon env comes with an initial timeline (#7995) Quite a few existing test cases create their own timelines instead of using the default one. This pull request highlights that and hopefully people can write simpler tests in the future. Signed-off-by: Alex Chi Z Co-authored-by: Yuchen Liang <70461588+yliang412@users.noreply.github.com> --- test_runner/README.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/test_runner/README.md b/test_runner/README.md index fd68cfff79..7d95634ea8 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -285,6 +285,21 @@ def test_foobar(neon_env_builder: NeonEnvBuilder): ... ``` +The env includes a default tenant and timeline. Therefore, you do not need to create your own +tenant/timeline for testing. + +```python +def test_foobar2(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() # Start the environment + with env.endpoints.create_start("main") as endpoint: + # Start the compute endpoint + client = env.pageserver.http_client() # Get the pageserver client + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id) +``` + For more information about pytest fixtures, see https://docs.pytest.org/en/stable/fixture.html At the end of a test, all the nodes in the environment are automatically stopped, so you From e46692788e9d3e2b010b156c034af0c95a13a2a8 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 10 Jun 2024 19:34:34 +0300 Subject: [PATCH 0947/1571] refactor: Timeline layer flushing (#7993) The new features have deteriorated layer flushing, most recently with #7927. 
Changes: - inline `Timeline::freeze_inmem_layer` to the only caller - carry the TimelineWriterState guard to the actual point of freezing the layer - this allows us to `#[cfg(feature = "testing")]` the assertion added in #7927 - remove duplicate `flush_frozen_layer` in favor of splitting the `flush_frozen_layers_and_wait` - this requires starting the flush loop earlier for `checkpoint_distance < initdb size` tests --- pageserver/src/tenant.rs | 12 +- pageserver/src/tenant/timeline.rs | 140 ++++++++++-------- .../src/tenant/timeline/layer_manager.rs | 30 +++- 3 files changed, 106 insertions(+), 76 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 60cd5c9695..2e3ce45c2b 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3395,6 +3395,12 @@ impl Tenant { let tenant_shard_id = raw_timeline.owning_tenant.tenant_shard_id; let unfinished_timeline = raw_timeline.raw_timeline()?; + // Flush the new layer files to disk, before we make the timeline as available to + // the outside world. + // + // Flush loop needs to be spawned in order to be able to flush. + unfinished_timeline.maybe_spawn_flush_loop(); + import_datadir::import_timeline_from_postgres_datadir( unfinished_timeline, &pgdata_path, @@ -3406,12 +3412,6 @@ impl Tenant { format!("Failed to import pgdatadir for timeline {tenant_shard_id}/{timeline_id}") })?; - // Flush the new layer files to disk, before we make the timeline as available to - // the outside world. - // - // Flush loop needs to be spawned in order to be able to flush. - unfinished_timeline.maybe_spawn_flush_loop(); - fail::fail_point!("before-checkpoint-new-timeline", |_| { anyhow::bail!("failpoint before-checkpoint-new-timeline"); }); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 58bdd84906..6da0f9d91c 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1571,7 +1571,15 @@ impl Timeline { // This exists to provide a non-span creating version of `freeze_and_flush` we can call without // polluting the span hierarchy. pub(crate) async fn freeze_and_flush0(&self) -> Result<(), FlushLayerError> { - let to_lsn = self.freeze_inmem_layer(false).await; + let to_lsn = { + // Freeze the current open in-memory layer. It will be written to disk on next + // iteration. + let mut g = self.write_lock.lock().await; + + let to_lsn = self.get_last_record_lsn(); + self.freeze_inmem_layer_at(to_lsn, &mut g).await; + to_lsn + }; self.flush_frozen_layers_and_wait(to_lsn).await } @@ -1657,25 +1665,35 @@ impl Timeline { self.last_freeze_at.load(), open_layer.get_opened_at(), ) { - match open_layer.info() { + let at_lsn = match open_layer.info() { InMemoryLayerInfo::Frozen { lsn_start, lsn_end } => { // We may reach this point if the layer was already frozen by not yet flushed: flushing // happens asynchronously in the background. tracing::debug!( "Not freezing open layer, it's already frozen ({lsn_start}..{lsn_end})" ); + None } InMemoryLayerInfo::Open { .. 
} => { // Upgrade to a write lock and freeze the layer drop(layers_guard); let mut layers_guard = self.layers.write().await; - layers_guard - .try_freeze_in_memory_layer(current_lsn, &self.last_freeze_at) + let froze = layers_guard + .try_freeze_in_memory_layer( + current_lsn, + &self.last_freeze_at, + &mut write_guard, + ) .await; + Some(current_lsn).filter(|_| froze) + } + }; + if let Some(lsn) = at_lsn { + let res: Result = self.flush_frozen_layers(lsn); + if let Err(e) = res { + tracing::info!("failed to flush frozen layer after background freeze: {e:#}"); } } - write_guard.take(); - self.flush_frozen_layers(); } } @@ -2384,7 +2402,7 @@ impl Timeline { let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error); self_clone.flush_loop(layer_flush_start_rx, &background_ctx).await; let mut flush_loop_state = self_clone.flush_loop_state.lock().unwrap(); - assert!(matches!(*flush_loop_state, FlushLoopState::Running{ ..})); + assert!(matches!(*flush_loop_state, FlushLoopState::Running{..})); *flush_loop_state = FlushLoopState::Exited; Ok(()) } @@ -3647,31 +3665,21 @@ impl Timeline { self.last_record_lsn.advance(new_lsn); } - /// Whether there was a layer to freeze or not, return the value of get_last_record_lsn - /// before we attempted the freeze: this guarantees that ingested data is frozen up to this lsn (inclusive). - async fn freeze_inmem_layer(&self, write_lock_held: bool) -> Lsn { - // Freeze the current open in-memory layer. It will be written to disk on next - // iteration. - - let _write_guard = if write_lock_held { - None - } else { - let mut g = self.write_lock.lock().await; - // remove the reference to an open layer - g.take(); - Some(g) + async fn freeze_inmem_layer_at( + &self, + at: Lsn, + write_lock: &mut tokio::sync::MutexGuard<'_, Option>, + ) { + let frozen = { + let mut guard = self.layers.write().await; + guard + .try_freeze_in_memory_layer(at, &self.last_freeze_at, write_lock) + .await }; - - let to_lsn = self.get_last_record_lsn(); - self.freeze_inmem_layer_at(to_lsn).await; - to_lsn - } - - async fn freeze_inmem_layer_at(&self, at: Lsn) { - let mut guard = self.layers.write().await; - guard - .try_freeze_in_memory_layer(at, &self.last_freeze_at) - .await; + if frozen { + let now = Instant::now(); + *(self.last_freeze_ts.write().unwrap()) = now; + } } /// Layer flusher task's main loop. @@ -3765,18 +3773,14 @@ impl Timeline { } } - /// Request the flush loop to write out all frozen layers up to `to_lsn` as Delta L0 files to disk. - /// The caller is responsible for the freezing, e.g., [`Self::freeze_inmem_layer`]. + /// Request the flush loop to write out all frozen layers up to `at_lsn` as Delta L0 files to disk. + /// The caller is responsible for the freezing, e.g., [`Self::freeze_inmem_layer_at`]. /// - /// `last_record_lsn` may be higher than the highest LSN of a frozen layer: if this is the case, - /// it means no data will be written between the top of the highest frozen layer and to_lsn, - /// e.g. because this tenant shard has ingested up to to_lsn and not written any data locally for that part of the WAL. - async fn flush_frozen_layers_and_wait( - &self, - last_record_lsn: Lsn, - ) -> Result<(), FlushLayerError> { - let mut rx = self.layer_flush_done_tx.subscribe(); - + /// `at_lsn` may be higher than the highest LSN of a frozen layer: if this is the + /// case, it means no data will be written between the top of the highest frozen layer and + /// to_lsn, e.g. 
because this tenant shard has ingested up to to_lsn and not written any data + /// locally for that part of the WAL. + fn flush_frozen_layers(&self, at_lsn: Lsn) -> Result { // Increment the flush cycle counter and wake up the flush task. // Remember the new value, so that when we listen for the flush // to finish, we know when the flush that we initiated has @@ -3791,13 +3795,18 @@ impl Timeline { self.layer_flush_start_tx.send_modify(|(counter, lsn)| { my_flush_request = *counter + 1; *counter = my_flush_request; - *lsn = std::cmp::max(last_record_lsn, *lsn); + *lsn = std::cmp::max(at_lsn, *lsn); }); + Ok(my_flush_request) + } + + async fn wait_flush_completion(&self, request: u64) -> Result<(), FlushLayerError> { + let mut rx = self.layer_flush_done_tx.subscribe(); loop { { let (last_result_counter, last_result) = &*rx.borrow(); - if *last_result_counter >= my_flush_request { + if *last_result_counter >= request { if let Err(err) = last_result { // We already logged the original error in // flush_loop. We cannot propagate it to the caller @@ -3824,12 +3833,9 @@ impl Timeline { } } - fn flush_frozen_layers(&self) { - self.layer_flush_start_tx.send_modify(|(counter, lsn)| { - *counter += 1; - - *lsn = std::cmp::max(*lsn, Lsn(self.last_freeze_at.load().0 - 1)); - }); + async fn flush_frozen_layers_and_wait(&self, at_lsn: Lsn) -> Result<(), FlushLayerError> { + let token = self.flush_frozen_layers(at_lsn)?; + self.wait_flush_completion(token).await } /// Flush one frozen in-memory layer to disk, as a new delta layer. @@ -5672,16 +5678,15 @@ impl<'a> TimelineWriter<'a> { } async fn roll_layer(&mut self, freeze_at: Lsn) -> anyhow::Result<()> { - assert!(self.write_guard.is_some()); - - self.tl.freeze_inmem_layer_at(freeze_at).await; - - let now = Instant::now(); - *(self.last_freeze_ts.write().unwrap()) = now; - - self.tl.flush_frozen_layers(); - let current_size = self.write_guard.as_ref().unwrap().current_size; + + // self.write_guard will be taken by the freezing + self.tl + .freeze_inmem_layer_at(freeze_at, &mut self.write_guard) + .await; + + self.tl.flush_frozen_layers(freeze_at)?; + if current_size >= self.get_checkpoint_distance() * 2 { warn!("Flushed oversized open layer with size {}", current_size) } @@ -5695,20 +5700,27 @@ impl<'a> TimelineWriter<'a> { return OpenLayerAction::Open; }; + #[cfg(feature = "testing")] if state.cached_last_freeze_at < self.tl.last_freeze_at.load() { - // TODO(#7993): branch is needed before refactoring the many places of freezing for the - // possibility `state` having a "dangling" reference to an already frozen in-memory - // layer. + // this check and assertion are not really needed because + // LayerManager::try_freeze_in_memory_layer will always clear out the + // TimelineWriterState if something is frozen. however, we can advance last_freeze_at when there + // is no TimelineWriterState. assert!( state.open_layer.end_lsn.get().is_some(), "our open_layer must be outdated" ); - return OpenLayerAction::Open; + + // this would be a memory leak waiting to happen because the in-memory layer always has + // an index + panic!("BUG: TimelineWriterState held on to frozen in-memory layer."); } if state.prev_lsn == Some(lsn) { - // Rolling mid LSN is not supported by downstream code. + // Rolling mid LSN is not supported by [downstream code]. // Hence, only roll at LSN boundaries. 
+ // + // [downstream code]: https://github.com/neondatabase/neon/pull/7993#discussion_r1633345422 return OpenLayerAction::None; } diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 0e82dedecb..21e64d562a 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -21,6 +21,8 @@ use crate::{ }, }; +use super::TimelineWriterState; + /// Provides semantic APIs to manipulate the layer map. #[derive(Default)] pub(crate) struct LayerManager { @@ -120,18 +122,20 @@ impl LayerManager { Ok(layer) } - /// Called from `freeze_inmem_layer`, returns true if successfully frozen. - pub(crate) async fn try_freeze_in_memory_layer( + /// Tries to freeze an open layer and also manages clearing the TimelineWriterState. + /// + /// Returns true if anything was frozen. + pub(super) async fn try_freeze_in_memory_layer( &mut self, lsn: Lsn, last_freeze_at: &AtomicLsn, - ) { + write_lock: &mut tokio::sync::MutexGuard<'_, Option>, + ) -> bool { let Lsn(last_record_lsn) = lsn; let end_lsn = Lsn(last_record_lsn + 1); - if let Some(open_layer) = &self.layer_map.open_layer { + let froze = if let Some(open_layer) = &self.layer_map.open_layer { let open_layer_rc = Arc::clone(open_layer); - // Does this layer need freezing? open_layer.freeze(end_lsn).await; // The layer is no longer open, update the layer map to reflect this. @@ -139,11 +143,25 @@ impl LayerManager { self.layer_map.frozen_layers.push_back(open_layer_rc); self.layer_map.open_layer = None; self.layer_map.next_open_layer_at = Some(end_lsn); - } + + true + } else { + false + }; // Even if there was no layer to freeze, advance last_freeze_at to last_record_lsn+1: this // accounts for regions in the LSN range where we might have ingested no data due to sharding. last_freeze_at.store(end_lsn); + + // the writer state must no longer have a reference to the frozen layer + let taken = write_lock.take(); + assert_eq!( + froze, + taken.is_some(), + "should only had frozen a layer when TimelineWriterState existed" + ); + + froze } /// Add image layers to the layer map, called from `create_image_layers`. From e27ce3861914e89ad43aafb36e5fb96f13cd2bc2 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Tue, 11 Jun 2024 13:07:51 +0200 Subject: [PATCH 0948/1571] Add testing for extensions (#7818) ## Problem We need automated tests of extensions shipped with Neon to detect possible problems. ## Summary of changes A new image neon-test-extensions is added. Workflow changes to test the shipped extensions are added as well. Currently, the regression tests, shipped with extensions are in use. Some extensions, i.e. rum, timescaledb, rdkit, postgis, pgx_ulid, pgtap, pg_tiktoken, pg_jsonschema, pg_graphql, kq_imcx, wal2json_2_5 are excluded due to problems or absence of internal tests. 
--------- Co-authored-by: Alexander Bayandin Co-authored-by: Heikki Linnakangas --- .dockerignore | 1 + .github/workflows/build_and_test.yml | 29 ++- Dockerfile.compute-node | 63 +++++ docker-compose/compute_wrapper/Dockerfile | 7 +- .../var/db/postgres/specs/spec.json | 12 +- docker-compose/docker-compose.yml | 12 +- docker-compose/docker_compose_test.sh | 83 +++++-- docker-compose/run-tests.sh | 15 ++ patches/pg_anon.patch | 223 ++++++++++++++++++ patches/pg_cron.patch | 19 ++ patches/pg_hintplan.patch | 39 +++ 11 files changed, 478 insertions(+), 25 deletions(-) create mode 100644 docker-compose/run-tests.sh create mode 100644 patches/pg_anon.patch create mode 100644 patches/pg_cron.patch create mode 100644 patches/pg_hintplan.patch diff --git a/.dockerignore b/.dockerignore index 1258532db8..eead727994 100644 --- a/.dockerignore +++ b/.dockerignore @@ -8,6 +8,7 @@ !scripts/combine_control_files.py !scripts/ninstall.sh !vm-cgconfig.conf +!docker-compose/run-tests.sh # Directories !.cargo/ diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index b9caf76060..79a0a77638 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -858,6 +858,26 @@ jobs: cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max tags: | neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} + + - name: Build neon extensions test image + if: matrix.version == 'v16' + uses: docker/build-push-action@v5 + with: + context: . + build-args: | + GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} + PG_VERSION=${{ matrix.version }} + BUILD_TAG=${{ needs.tag.outputs.build-tag }} + TAG=${{ needs.build-build-tools-image.outputs.image-tag }} + provenance: false + push: true + pull: true + file: Dockerfile.compute-node + target: neon-pg-ext-test + cache-from: type=registry,ref=neondatabase/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }} + cache-to: type=registry,ref=neondatabase/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max + tags: | + neondatabase/neon-test-extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}-${{ matrix.arch }} - name: Build compute-tools image # compute-tools are Postgres independent, so build it only once @@ -902,6 +922,13 @@ jobs: neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \ neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64 + - name: Create multi-arch neon-test-extensions image + if: matrix.version == 'v16' + run: | + docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \ + neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \ + neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64 + - name: Create multi-arch compute-tools image if: matrix.version == 'v16' run: | @@ -1020,7 +1047,7 @@ jobs: exit 1 fi - - name: Verify docker-compose example + - name: Verify docker-compose example and test extensions timeout-minutes: 20 run: env TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 90b8868b43..a86fdd0bc3 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -928,6 +928,69 @@ RUN rm -r /usr/local/pgsql/include # 
if they were to be used by other libraries. RUN rm /usr/local/pgsql/lib/lib*.a + +######################################################################################### +# +# Layer neon-pg-ext-test +# +######################################################################################### + +FROM neon-pg-ext-build AS neon-pg-ext-test +ARG PG_VERSION +RUN mkdir /ext-src + +#COPY --from=postgis-build /postgis.tar.gz /ext-src/ +#COPY --from=postgis-build /sfcgal/* /usr +COPY --from=plv8-build /plv8.tar.gz /ext-src/ +COPY --from=h3-pg-build /h3-pg.tar.gz /ext-src/ +COPY --from=unit-pg-build /postgresql-unit.tar.gz /ext-src/ +COPY --from=vector-pg-build /pgvector.tar.gz /ext-src/ +COPY --from=vector-pg-build /pgvector.patch /ext-src/ +COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src +#COPY --from=pg-jsonschema-pg-build /home/nonroot/pg_jsonschema.tar.gz /ext-src +#COPY --from=pg-graphql-pg-build /home/nonroot/pg_graphql.tar.gz /ext-src +#COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src +COPY --from=hypopg-pg-build /hypopg.tar.gz /ext-src +COPY --from=pg-hashids-pg-build /pg_hashids.tar.gz /ext-src +#COPY --from=rum-pg-build /rum.tar.gz /ext-src +#COPY --from=pgtap-pg-build /pgtap.tar.gz /ext-src +COPY --from=ip4r-pg-build /ip4r.tar.gz /ext-src +COPY --from=prefix-pg-build /prefix.tar.gz /ext-src +COPY --from=hll-pg-build /hll.tar.gz /ext-src +COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src +#COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src +COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src +COPY patches/pg_hintplan.patch /ext-src +#COPY --from=kq-imcx-pg-build /kq_imcx.tar.gz /ext-src +COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src +COPY patches/pg_cron.patch /ext-src +#COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src +COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src +COPY --from=pg-uuidv7-pg-build /pg_uuidv7.tar.gz /ext-src +COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src +COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src +#COPY --from=pg-embedding-pg-build /home/nonroot/pg_embedding-src/ /ext-src +#COPY --from=wal2json-pg-build /wal2json_2_5.tar.gz /ext-src +COPY --from=pg-anon-pg-build /pg_anon.tar.gz /ext-src +COPY patches/pg_anon.patch /ext-src +COPY --from=pg-ivm-build /pg_ivm.tar.gz /ext-src +COPY --from=pg-partman-build /pg_partman.tar.gz /ext-src +RUN cd /ext-src/ && for f in *.tar.gz; \ + do echo $f; dname=$(echo $f | sed 's/\.tar.*//')-src; \ + rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \ + || exit 1; rm -f $f; done +RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch +# cmake is required for the h3 test +RUN apt-get update && apt-get install -y cmake +RUN patch -p1 < /ext-src/pg_hintplan.patch +COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh +RUN patch -p1 /dev/null && pwd )" -COMPOSE_FILE=$SCRIPT_DIR/docker-compose.yml - +COMPOSE_FILE='docker-compose.yml' +cd $(dirname $0) +docker compose -f $COMPOSE_FILE COMPUTE_CONTAINER_NAME=docker-compose-compute-1 -SQL="CREATE TABLE t(key int primary key, value text); insert into t values(1,1); select * from t;" -PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -c '$SQL' postgres" +TEST_CONTAINER_NAME=docker-compose-neon-test-extensions-1 +PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -d postgres" +: ${http_proxy:=} +: ${https_proxy:=} +export http_proxy https_proxy cleanup() { echo "show container information" @@ -25,34 +31,71 @@ cleanup() { docker compose 
-f $COMPOSE_FILE down } -echo "clean up containers if exists" -cleanup - for pg_version in 14 15 16; do - echo "start containers (pg_version=$pg_version)." - PG_VERSION=$pg_version docker compose -f $COMPOSE_FILE up --build -d + echo "clean up containers if exists" + cleanup + PG_TEST_VERSION=$(($pg_version < 16 ? 16 : $pg_version)) + PG_VERSION=$pg_version PG_TEST_VERSION=$PG_TEST_VERSION docker compose -f $COMPOSE_FILE up --build -d echo "wait until the compute is ready. timeout after 60s. " cnt=0 - while sleep 1; do + while sleep 3; do # check timeout - cnt=`expr $cnt + 1` + cnt=`expr $cnt + 3` if [ $cnt -gt 60 ]; then echo "timeout before the compute is ready." cleanup exit 1 fi - - # check if the compute is ready - set +o pipefail - result=`docker compose -f $COMPOSE_FILE logs "compute_is_ready" | grep "accepting connections" | wc -l` - set -o pipefail - if [ $result -eq 1 ]; then + if docker compose -f $COMPOSE_FILE logs "compute_is_ready" | grep -q "accepting connections"; then echo "OK. The compute is ready to connect." echo "execute simple queries." docker exec $COMPUTE_CONTAINER_NAME /bin/bash -c "psql $PSQL_OPTION" - cleanup break fi done + + if [ $pg_version -ge 16 ] + then + echo Enabling trust connection + docker exec $COMPUTE_CONTAINER_NAME bash -c "sed -i '\$d' /var/db/postgres/compute/pg_hba.conf && echo -e 'host\t all\t all\t all\t trust' >> /var/db/postgres/compute/pg_hba.conf && psql $PSQL_OPTION -c 'select pg_reload_conf()' " + echo Adding postgres role + docker exec $COMPUTE_CONTAINER_NAME psql $PSQL_OPTION -c "CREATE ROLE postgres SUPERUSER LOGIN" + # This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail + # It cannot be moved to Dockerfile now because the database directory is created after the start of the container + echo Adding dummy config + docker exec $COMPUTE_CONTAINER_NAME touch /var/db/postgres/compute/compute_ctl_temp_override.conf + # This block is required for the pg_anon extension test. + # The test assumes that it is running on the same host with the postgres engine. 
+ # In our case it's not true, that's why we are copying files to the compute node + TMPDIR=$(mktemp -d) + docker cp $TEST_CONTAINER_NAME:/ext-src/pg_anon-src/data $TMPDIR/data + echo -e '1\t too \t many \t tabs' > $TMPDIR/data/bad.csv + docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/tmp/tmp_anon_alternate_data + rm -rf $TMPDIR + TMPDIR=$(mktemp -d) + # The following block does the same for the pg_hintplan test + docker cp $TEST_CONTAINER_NAME:/ext-src/pg_hint_plan-src/data $TMPDIR/data + docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/ + rm -rf $TMPDIR + # We are running tests now + if docker exec -e SKIP=rum-src,timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \ + $TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt + then + cleanup + else + FAILED=$(tail -1 testout.txt) + for d in $FAILED + do + mkdir $d + docker cp $TEST_CONTAINER_NAME:/ext-src/$d/regression.diffs $d || true + docker cp $TEST_CONTAINER_NAME:/ext-src/$d/regression.out $d || true + cat $d/regression.out $d/regression.diffs || true + done + rm -rf $FAILED + cleanup + exit 1 + fi + fi + cleanup done diff --git a/docker-compose/run-tests.sh b/docker-compose/run-tests.sh new file mode 100644 index 0000000000..c05fc159aa --- /dev/null +++ b/docker-compose/run-tests.sh @@ -0,0 +1,15 @@ +#!/bin/bash +set -x + +cd /ext-src +FAILED= +LIST=$((echo ${SKIP} | sed 's/,/\n/g'; ls -d *-src) | sort | uniq -u) +for d in ${LIST} +do + [ -d ${d} ] || continue + psql -c "select 1" >/dev/null || break + make -C ${d} installcheck || FAILED="${d} ${FAILED}" +done +[ -z "${FAILED}" ] && exit 0 +echo ${FAILED} +exit 1 \ No newline at end of file diff --git a/patches/pg_anon.patch b/patches/pg_anon.patch new file mode 100644 index 0000000000..15dfd3c5a0 --- /dev/null +++ b/patches/pg_anon.patch @@ -0,0 +1,223 @@ +commit 7dd414ee75f2875cffb1d6ba474df1f135a6fc6f +Author: Alexey Masterov +Date: Fri May 31 06:34:26 2024 +0000 + + These alternative expected files were added to consider the neon features + +diff --git a/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out b/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out +new file mode 100644 +index 0000000..2539cfd +--- /dev/null ++++ b/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out +@@ -0,0 +1,101 @@ ++BEGIN; ++CREATE EXTENSION anon CASCADE; ++NOTICE: installing required extension "pgcrypto" ++SELECT anon.init(); ++ init ++------ ++ t ++(1 row) ++ ++CREATE ROLE mallory_the_masked_user; ++SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS 'MASKED'; ++CREATE TABLE t1(i INT); ++ALTER TABLE t1 ADD COLUMN t TEXT; ++SECURITY LABEL FOR anon ON COLUMN t1.t ++IS 'MASKED WITH VALUE NULL'; ++INSERT INTO t1 VALUES (1,'test'); ++-- ++-- We're checking the owner's permissions ++-- ++-- see ++-- https://postgresql-anonymizer.readthedocs.io/en/latest/SECURITY/#permissions ++-- ++SET ROLE mallory_the_masked_user; ++SELECT anon.pseudo_first_name(0) IS NOT NULL; ++ ?column? 
++---------- ++ t ++(1 row) ++ ++-- SHOULD FAIL ++DO $$ ++BEGIN ++ PERFORM anon.init(); ++ EXCEPTION WHEN insufficient_privilege ++ THEN RAISE NOTICE 'insufficient_privilege'; ++END$$; ++NOTICE: insufficient_privilege ++-- SHOULD FAIL ++DO $$ ++BEGIN ++ PERFORM anon.anonymize_table('t1'); ++ EXCEPTION WHEN insufficient_privilege ++ THEN RAISE NOTICE 'insufficient_privilege'; ++END$$; ++NOTICE: insufficient_privilege ++-- SHOULD FAIL ++SAVEPOINT fail_start_engine; ++SELECT anon.start_dynamic_masking(); ++ERROR: Only supersusers can start the dynamic masking engine. ++CONTEXT: PL/pgSQL function anon.start_dynamic_masking(boolean) line 18 at RAISE ++ROLLBACK TO fail_start_engine; ++RESET ROLE; ++SELECT anon.start_dynamic_masking(); ++ start_dynamic_masking ++----------------------- ++ t ++(1 row) ++ ++SET ROLE mallory_the_masked_user; ++SELECT * FROM mask.t1; ++ i | t ++---+--- ++ 1 | ++(1 row) ++ ++-- SHOULD FAIL ++DO $$ ++BEGIN ++ SELECT * FROM public.t1; ++ EXCEPTION WHEN insufficient_privilege ++ THEN RAISE NOTICE 'insufficient_privilege'; ++END$$; ++NOTICE: insufficient_privilege ++-- SHOULD FAIL ++SAVEPOINT fail_stop_engine; ++SELECT anon.stop_dynamic_masking(); ++ERROR: Only supersusers can stop the dynamic masking engine. ++CONTEXT: PL/pgSQL function anon.stop_dynamic_masking() line 18 at RAISE ++ROLLBACK TO fail_stop_engine; ++RESET ROLE; ++SELECT anon.stop_dynamic_masking(); ++NOTICE: The previous priviledges of 'mallory_the_masked_user' are not restored. You need to grant them manually. ++ stop_dynamic_masking ++---------------------- ++ t ++(1 row) ++ ++SET ROLE mallory_the_masked_user; ++SELECT COUNT(*)=1 FROM anon.pg_masking_rules; ++ ?column? ++---------- ++ t ++(1 row) ++ ++-- SHOULD FAIL ++SAVEPOINT fail_seclabel_on_role; ++SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS NULL; ++ERROR: permission denied ++DETAIL: The current user must have the CREATEROLE attribute. ++ROLLBACK TO fail_seclabel_on_role; ++ROLLBACK; +diff --git a/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out b/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out +new file mode 100644 +index 0000000..8b090fe +--- /dev/null ++++ b/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out +@@ -0,0 +1,104 @@ ++BEGIN; ++CREATE EXTENSION anon CASCADE; ++NOTICE: installing required extension "pgcrypto" ++SELECT anon.init(); ++ init ++------ ++ t ++(1 row) ++ ++CREATE ROLE oscar_the_owner; ++ALTER DATABASE :DBNAME OWNER TO oscar_the_owner; ++CREATE ROLE mallory_the_masked_user; ++SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS 'MASKED'; ++-- ++-- We're checking the owner's permissions ++-- ++-- see ++-- https://postgresql-anonymizer.readthedocs.io/en/latest/SECURITY/#permissions ++-- ++SET ROLE oscar_the_owner; ++SELECT anon.pseudo_first_name(0) IS NOT NULL; ++ ?column? 
++---------- ++ t ++(1 row) ++ ++-- SHOULD FAIL ++DO $$ ++BEGIN ++ PERFORM anon.init(); ++ EXCEPTION WHEN insufficient_privilege ++ THEN RAISE NOTICE 'insufficient_privilege'; ++END$$; ++NOTICE: insufficient_privilege ++CREATE TABLE t1(i INT); ++ALTER TABLE t1 ADD COLUMN t TEXT; ++SECURITY LABEL FOR anon ON COLUMN t1.t ++IS 'MASKED WITH VALUE NULL'; ++INSERT INTO t1 VALUES (1,'test'); ++SELECT anon.anonymize_table('t1'); ++ anonymize_table ++----------------- ++ t ++(1 row) ++ ++SELECT * FROM t1; ++ i | t ++---+--- ++ 1 | ++(1 row) ++ ++UPDATE t1 SET t='test' WHERE i=1; ++-- SHOULD FAIL ++SAVEPOINT fail_start_engine; ++SELECT anon.start_dynamic_masking(); ++ start_dynamic_masking ++----------------------- ++ t ++(1 row) ++ ++ROLLBACK TO fail_start_engine; ++RESET ROLE; ++SELECT anon.start_dynamic_masking(); ++ start_dynamic_masking ++----------------------- ++ t ++(1 row) ++ ++SET ROLE oscar_the_owner; ++SELECT * FROM t1; ++ i | t ++---+------ ++ 1 | test ++(1 row) ++ ++--SELECT * FROM mask.t1; ++-- SHOULD FAIL ++SAVEPOINT fail_stop_engine; ++SELECT anon.stop_dynamic_masking(); ++ERROR: permission denied for schema mask ++CONTEXT: SQL statement "DROP VIEW mask.t1;" ++PL/pgSQL function anon.mask_drop_view(oid) line 3 at EXECUTE ++SQL statement "SELECT anon.mask_drop_view(oid) ++ FROM pg_catalog.pg_class ++ WHERE relnamespace=quote_ident(pg_catalog.current_setting('anon.sourceschema'))::REGNAMESPACE ++ AND relkind IN ('r','p','f')" ++PL/pgSQL function anon.stop_dynamic_masking() line 22 at PERFORM ++ROLLBACK TO fail_stop_engine; ++RESET ROLE; ++SELECT anon.stop_dynamic_masking(); ++NOTICE: The previous priviledges of 'mallory_the_masked_user' are not restored. You need to grant them manually. ++ stop_dynamic_masking ++---------------------- ++ t ++(1 row) ++ ++SET ROLE oscar_the_owner; ++-- SHOULD FAIL ++SAVEPOINT fail_seclabel_on_role; ++SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS NULL; ++ERROR: permission denied ++DETAIL: The current user must have the CREATEROLE attribute. 
++ROLLBACK TO fail_seclabel_on_role; ++ROLLBACK; diff --git a/patches/pg_cron.patch b/patches/pg_cron.patch new file mode 100644 index 0000000000..c2b648c20c --- /dev/null +++ b/patches/pg_cron.patch @@ -0,0 +1,19 @@ +commit b3ea51ee158f113f2f82d0b97c12c54343c9a695 (HEAD -> master) +Author: Alexey Masterov +Date: Fri Jun 7 19:23:42 2024 +0000 + + Disable REGRESS_OPTIONS causing initdb + +diff --git a/ext-src/pg_cron-src/Makefile b/ext-src/pg_cron-src/Makefile +index 053314c..fbd5fb5 100644 +--- a/ext-src/pg_cron-src/Makefile ++++ b/ext-src/pg_cron-src/Makefile +@@ -5,7 +5,7 @@ EXTENSION = pg_cron + DATA_built = $(EXTENSION)--1.0.sql + DATA = $(wildcard $(EXTENSION)--*--*.sql) + +-REGRESS_OPTS =--temp-config=./pg_cron.conf --temp-instance=./tmp_check ++#REGRESS_OPTS =--temp-config=./pg_cron.conf --temp-instance=./tmp_check + REGRESS = pg_cron-test + + # compilation configuration diff --git a/patches/pg_hintplan.patch b/patches/pg_hintplan.patch new file mode 100644 index 0000000000..61a5ecbb90 --- /dev/null +++ b/patches/pg_hintplan.patch @@ -0,0 +1,39 @@ +commit f7925d4d1406c0f0229e3c691c94b69e381899b1 (HEAD -> master) +Author: Alexey Masterov +Date: Thu Jun 6 08:02:42 2024 +0000 + + Patch expected files to consider Neon's log messages + +diff --git a/ext-src/pg_hint_plan-src/expected/ut-A.out b/ext-src/pg_hint_plan-src/expected/ut-A.out +index da723b8..f8d0102 100644 +--- a/ext-src/pg_hint_plan-src/expected/ut-A.out ++++ b/ext-src/pg_hint_plan-src/expected/ut-A.out +@@ -9,13 +9,16 @@ SET search_path TO public; + ---- + -- No.A-1-1-3 + CREATE EXTENSION pg_hint_plan; ++LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan + -- No.A-1-2-3 + DROP EXTENSION pg_hint_plan; + -- No.A-1-1-4 + CREATE SCHEMA other_schema; + CREATE EXTENSION pg_hint_plan SCHEMA other_schema; ++LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan + ERROR: extension "pg_hint_plan" must be installed in schema "hint_plan" + CREATE EXTENSION pg_hint_plan; ++LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan + DROP SCHEMA other_schema; + ---- + ---- No. A-5-1 comment pattern +diff --git a/ext-src/pg_hint_plan-src/expected/ut-fdw.out b/ext-src/pg_hint_plan-src/expected/ut-fdw.out +index d372459..6282afe 100644 +--- a/ext-src/pg_hint_plan-src/expected/ut-fdw.out ++++ b/ext-src/pg_hint_plan-src/expected/ut-fdw.out +@@ -7,6 +7,7 @@ SET pg_hint_plan.debug_print TO on; + SET client_min_messages TO LOG; + SET pg_hint_plan.enable_hint TO on; + CREATE EXTENSION file_fdw; ++LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/file_fdw + CREATE SERVER file_server FOREIGN DATA WRAPPER file_fdw; + CREATE USER MAPPING FOR PUBLIC SERVER file_server; + CREATE FOREIGN TABLE ft1 (id int, val int) SERVER file_server OPTIONS (format 'csv', filename :'filename'); From 7515d0f368e14dfb82520c8a493a49e5671e479e Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 11 Jun 2024 15:38:54 +0300 Subject: [PATCH 0949/1571] fix: stop storing TimelineMetadata in index_part.json as bytes (#7699) We've stored metadata as bytes within the `index_part.json` for long fixed reasons. #7693 added support for reading out normal json serialization of the `TimelineMetadata`. Change the serialization to only write `TimelineMetadata` as json for going forward, keeping the backward compatibility to reading the metadata as bytes. 
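For illustration, this is the json form of the timeline metadata as exercised by the v7 test fixture added in this patch (the field is accepted under either name, `metadata` or `metadata_bytes`; `layer_metadata` and `deleted_at` are elided for brevity). A minimal, standalone sketch:

```rust
fn main() {
    // v7-style index_part.json carrying TimelineMetadata as a plain json object;
    // values are copied from the test fixture added in this patch, not invented.
    // The legacy bincoded byte-array form is still accepted when reading.
    let example = r#"{
        "version": 7,
        "disk_consistent_lsn": "0/16960E8",
        "metadata": {
            "disk_consistent_lsn": "0/16960E8",
            "prev_record_lsn": "0/1696070",
            "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e",
            "ancestor_lsn": "0/0",
            "latest_gc_cutoff_lsn": "0/1696070",
            "initdb_lsn": "0/1696070",
            "pg_version": 14
        }
    }"#;
    println!("{example}");
}
```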
Because of failure to include `alias = "metadata"` in #7693, one more follow-up is required to make the switch from the old name to `"metadata": `, but that affects only the field name in serialized format. In documentation and naming, an effort is made to add enough warning signs around TimelineMetadata so that it will receive no changes in the future. We can add those fields to `IndexPart` directly instead. Additionally, the path to cleaning up `metadata.rs` is documented in the `metadata.rs` module comment. If we must extend `TimelineMetadata` before that, the duplication suggested in [review comment] is the way to go. [review comment]: https://github.com/neondatabase/neon/pull/7699#pullrequestreview-2107081558 --- pageserver/ctl/src/index_part.rs | 22 +- pageserver/src/tenant/metadata.rs | 256 +++++++----------- .../tenant/remote_timeline_client/index.rs | 82 +++++- s3_scrubber/src/checks.rs | 13 +- s3_scrubber/src/scan_pageserver_metadata.rs | 2 +- .../regress/test_layers_from_future.py | 3 +- 6 files changed, 178 insertions(+), 200 deletions(-) diff --git a/pageserver/ctl/src/index_part.rs b/pageserver/ctl/src/index_part.rs index a33cae6769..20018846f8 100644 --- a/pageserver/ctl/src/index_part.rs +++ b/pageserver/ctl/src/index_part.rs @@ -1,11 +1,6 @@ -use std::collections::HashMap; - use anyhow::Context; use camino::Utf8PathBuf; -use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; -use pageserver::tenant::storage_layer::LayerName; -use pageserver::tenant::{metadata::TimelineMetadata, IndexPart}; -use utils::lsn::Lsn; +use pageserver::tenant::IndexPart; #[derive(clap::Subcommand)] pub(crate) enum IndexPartCmd { @@ -17,20 +12,7 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> { IndexPartCmd::Dump { path } => { let bytes = tokio::fs::read(path).await.context("read file")?; let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?; - #[derive(serde::Serialize)] - struct Output<'a> { - layer_metadata: &'a HashMap, - disk_consistent_lsn: Lsn, - timeline_metadata: &'a TimelineMetadata, - } - - let output = Output { - layer_metadata: &des.layer_metadata, - disk_consistent_lsn: des.metadata.disk_consistent_lsn(), - timeline_metadata: &des.metadata, - }; - - let output = serde_json::to_string_pretty(&output).context("serialize output")?; + let output = serde_json::to_string_pretty(&des).context("serialize output")?; println!("{output}"); Ok(()) } diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index c00672895a..6ba1bdef9b 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -1,15 +1,23 @@ -//! Every image of a certain timeline from [`crate::tenant::Tenant`] -//! has a metadata that needs to be stored persistently. +//! Describes the legacy now hopefully no longer modified per-timeline metadata stored in +//! `index_part.json` managed by [`remote_timeline_client`]. For many tenants and their timelines, +//! this struct and it's original serialization format is still needed because they were written a +//! long time ago. //! -//! Later, the file gets used in [`remote_timeline_client`] as a part of -//! external storage import and export operations. +//! Instead of changing and adding versioning to this, just change [`IndexPart`] with soft json +//! versioning. //! -//! The module contains all structs and related helper methods related to timeline metadata. +//! To clean up this module we need to migrate all index_part.json files to a later version. +//! 
While doing this, we need to be mindful about s3 based recovery as well, so it might take +//! however long we keep the old versions to be able to delete the old code. After that, we can +//! remove everything else than [`TimelineMetadataBodyV2`], rename it as `TimelineMetadata` and +//! move it to `index.rs`. Before doing all of this, we need to keep the structures for backwards +//! compatibility. //! //! [`remote_timeline_client`]: super::remote_timeline_client +//! [`IndexPart`]: super::remote_timeline_client::index::IndexPart use anyhow::ensure; -use serde::{de::Error, Deserialize, Serialize, Serializer}; +use serde::{Deserialize, Serialize}; use utils::bin_ser::SerializeError; use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn}; @@ -17,17 +25,37 @@ use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn}; const METADATA_FORMAT_VERSION: u16 = 4; /// Previous supported format versions. +/// +/// In practice, none of these should remain, all are [`METADATA_FORMAT_VERSION`], but confirming +/// that requires a scrubber run which is yet to be done. const METADATA_OLD_FORMAT_VERSION: u16 = 3; -/// We assume that a write of up to METADATA_MAX_SIZE bytes is atomic. +/// When the file existed on disk we assumed that a write of up to METADATA_MAX_SIZE bytes is atomic. /// /// This is the same assumption that PostgreSQL makes with the control file, +/// /// see PG_CONTROL_MAX_SAFE_SIZE const METADATA_MAX_SIZE: usize = 512; -/// Metadata stored on disk for each timeline +/// Legacy metadata stored as a component of `index_part.json` per timeline. /// -/// The fields correspond to the values we hold in memory, in Timeline. +/// Do not make new changes to this type or the module. In production, we have two different kinds +/// of serializations of this type: bincode and json. Bincode version reflects what used to be +/// stored on disk in earlier versions and does internal crc32 checksumming. +/// +/// This type should not implement `serde::Serialize` or `serde::Deserialize` because there would +/// be a confusion whether you want the old version ([`TimelineMetadata::from_bytes`]) or the modern +/// as-exists in `index_part.json` ([`self::modern_serde`]). 
+/// +/// ```compile_fail +/// #[derive(serde::Serialize)] +/// struct DoNotDoThis(pageserver::tenant::metadata::TimelineMetadata); +/// ``` +/// +/// ```compile_fail +/// #[derive(serde::Deserialize)] +/// struct NeitherDoThis(pageserver::tenant::metadata::TimelineMetadata); +/// ``` #[derive(Debug, Clone, PartialEq, Eq)] pub struct TimelineMetadata { hdr: TimelineMetadataHeader, @@ -40,6 +68,49 @@ struct TimelineMetadataHeader { size: u16, // size of serialized metadata format_version: u16, // metadata format version (used for compatibility checks) } + +impl TryFrom<&TimelineMetadataBodyV2> for TimelineMetadataHeader { + type Error = Crc32CalculationFailed; + + fn try_from(value: &TimelineMetadataBodyV2) -> Result { + #[derive(Default)] + struct Crc32Sink { + crc: u32, + count: usize, + } + + impl std::io::Write for Crc32Sink { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + self.crc = crc32c::crc32c_append(self.crc, buf); + self.count += buf.len(); + Ok(buf.len()) + } + + fn flush(&mut self) -> std::io::Result<()> { + Ok(()) + } + } + + // jump through hoops to calculate the crc32 so that TimelineMetadata::ne works + // across serialization versions + let mut sink = Crc32Sink::default(); + ::ser_into(value, &mut sink) + .map_err(Crc32CalculationFailed)?; + + let size = METADATA_HDR_SIZE + sink.count; + + Ok(TimelineMetadataHeader { + checksum: sink.crc, + size: size as u16, + format_version: METADATA_FORMAT_VERSION, + }) + } +} + +#[derive(thiserror::Error, Debug)] +#[error("re-serializing for crc32 failed")] +struct Crc32CalculationFailed(#[source] utils::bin_ser::SerializeError); + const METADATA_HDR_SIZE: usize = std::mem::size_of::(); #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] @@ -111,6 +182,12 @@ impl TimelineMetadata { } } + #[cfg(test)] + pub(crate) fn with_recalculated_checksum(mut self) -> anyhow::Result { + self.hdr = TimelineMetadataHeader::try_from(&self.body)?; + Ok(self) + } + fn upgrade_timeline_metadata(metadata_bytes: &[u8]) -> anyhow::Result { let mut hdr = TimelineMetadataHeader::des(&metadata_bytes[0..METADATA_HDR_SIZE])?; @@ -261,32 +338,8 @@ impl TimelineMetadata { } } -impl<'de> Deserialize<'de> for TimelineMetadata { - fn deserialize(deserializer: D) -> Result - where - D: serde::Deserializer<'de>, - { - let bytes = Vec::::deserialize(deserializer)?; - Self::from_bytes(bytes.as_slice()).map_err(D::Error::custom) - } -} - -impl Serialize for TimelineMetadata { - fn serialize(&self, serializer: S) -> Result - where - S: Serializer, - { - let bytes = self.to_bytes().map_err(serde::ser::Error::custom)?; - bytes.serialize(serializer) - } -} - pub(crate) mod modern_serde { - use crate::tenant::metadata::METADATA_FORMAT_VERSION; - - use super::{ - TimelineMetadata, TimelineMetadataBodyV2, TimelineMetadataHeader, METADATA_HDR_SIZE, - }; + use super::{TimelineMetadata, TimelineMetadataBodyV2, TimelineMetadataHeader}; use serde::{Deserialize, Serialize}; pub(crate) fn deserialize<'de, D>(deserializer: D) -> Result @@ -322,71 +375,15 @@ pub(crate) mod modern_serde { let de = serde::de::value::MapAccessDeserializer::new(map); let body = TimelineMetadataBodyV2::deserialize(de)?; + let hdr = TimelineMetadataHeader::try_from(&body).map_err(A::Error::custom)?; - // jump through hoops to calculate the crc32 so that TimelineMetadata::ne works - // across serialization versions - let mut sink = Crc32Sink::default(); - ::ser_into(&body, &mut sink) - .map_err(|e| A::Error::custom(Crc32CalculationFailed(e)))?; - - let size = METADATA_HDR_SIZE + sink.count; 
- - Ok(TimelineMetadata { - hdr: TimelineMetadataHeader { - checksum: sink.crc, - size: size as u16, - format_version: METADATA_FORMAT_VERSION, - }, - body, - }) + Ok(TimelineMetadata { hdr, body }) } } deserializer.deserialize_any(Visitor) } - #[derive(Default)] - struct Crc32Sink { - crc: u32, - count: usize, - } - - impl std::io::Write for Crc32Sink { - fn write(&mut self, buf: &[u8]) -> std::io::Result { - self.crc = crc32c::crc32c_append(self.crc, buf); - self.count += buf.len(); - Ok(buf.len()) - } - - fn flush(&mut self) -> std::io::Result<()> { - Ok(()) - } - } - - #[derive(thiserror::Error)] - #[error("re-serializing for crc32 failed")] - struct Crc32CalculationFailed(#[source] E); - - // this should be true for one release, after that we can change it to false - // remember to check the IndexPart::metadata field TODO comment as well - const LEGACY_BINCODED_BYTES: bool = true; - - #[derive(serde::Serialize)] - #[serde(transparent)] - struct LegacyPaddedBytes<'a>(&'a TimelineMetadata); - - struct JustTheBodyV2<'a>(&'a TimelineMetadata); - - impl serde::Serialize for JustTheBodyV2<'_> { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - // header is not needed, upon reading we've upgraded all v1 to v2 - self.0.body.serialize(serializer) - } - } - pub(crate) fn serialize( metadata: &TimelineMetadata, serializer: S, @@ -394,25 +391,23 @@ pub(crate) mod modern_serde { where S: serde::Serializer, { - // we cannot use TimelineMetadata::serialize for now because it'll do - // TimelineMetadata::to_bytes - if LEGACY_BINCODED_BYTES { - LegacyPaddedBytes(metadata).serialize(serializer) - } else { - JustTheBodyV2(metadata).serialize(serializer) - } + // header is not needed, upon reading we've upgraded all v1 to v2 + metadata.body.serialize(serializer) } #[test] fn deserializes_bytes_as_well_as_equivalent_body_v2() { #[derive(serde::Deserialize, serde::Serialize)] - struct Wrapper(#[serde(deserialize_with = "deserialize")] TimelineMetadata); + struct Wrapper( + #[serde(deserialize_with = "deserialize", serialize_with = "serialize")] + TimelineMetadata, + ); let too_many_bytes = "[216,111,252,208,0,54,0,4,0,0,0,0,1,73,253,144,1,0,0,0,0,1,73,253,24,0,0,0,0,0,0,0,0,0,0,0,0,0,1,73,253,24,0,0,0,0,1,73,253,24,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]"; let wrapper_from_bytes = serde_json::from_str::(too_many_bytes).unwrap(); - let serialized = serde_json::to_value(JustTheBodyV2(&wrapper_from_bytes.0)).unwrap(); + let serialized = serde_json::to_value(&wrapper_from_bytes).unwrap(); assert_eq!( serialized, @@ -553,59 +548,6 @@ mod tests { ); } - #[test] - fn test_metadata_bincode_serde() { - let 
original_metadata = TimelineMetadata::new( - Lsn(0x200), - Some(Lsn(0x100)), - Some(TIMELINE_ID), - Lsn(0), - Lsn(0), - Lsn(0), - // Any version will do here, so use the default - crate::DEFAULT_PG_VERSION, - ); - let metadata_bytes = original_metadata - .to_bytes() - .expect("Cannot create bytes array from metadata"); - - let metadata_bincode_be_bytes = original_metadata - .ser() - .expect("Cannot serialize the metadata"); - - // 8 bytes for the length of the vector - assert_eq!(metadata_bincode_be_bytes.len(), 8 + metadata_bytes.len()); - - let expected_bincode_bytes = { - let mut temp = vec![]; - let len_bytes = metadata_bytes.len().to_be_bytes(); - temp.extend_from_slice(&len_bytes); - temp.extend_from_slice(&metadata_bytes); - temp - }; - assert_eq!(metadata_bincode_be_bytes, expected_bincode_bytes); - - let deserialized_metadata = TimelineMetadata::des(&metadata_bincode_be_bytes).unwrap(); - // Deserialized metadata has the metadata header, which is different from the serialized one. - // Reference: TimelineMetaData::to_bytes() - let expected_metadata = { - let mut temp_metadata = original_metadata; - let body_bytes = temp_metadata - .body - .ser() - .expect("Cannot serialize the metadata body"); - let metadata_size = METADATA_HDR_SIZE + body_bytes.len(); - let hdr = TimelineMetadataHeader { - size: metadata_size as u16, - format_version: METADATA_FORMAT_VERSION, - checksum: crc32c::crc32c(&body_bytes), - }; - temp_metadata.hdr = hdr; - temp_metadata - }; - assert_eq!(deserialized_metadata, expected_metadata); - } - #[test] fn test_metadata_bincode_serde_ensure_roundtrip() { let original_metadata = TimelineMetadata::new( @@ -619,8 +561,6 @@ mod tests { crate::DEFAULT_PG_VERSION, ); let expected_bytes = vec![ - /* bincode length encoding bytes */ - 0, 0, 0, 0, 0, 0, 2, 0, // 8 bytes for the length of the serialized vector /* TimelineMetadataHeader */ 4, 37, 101, 34, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2) /* TimelineMetadataBodyV2 */ @@ -650,7 +590,7 @@ mod tests { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]; - let metadata_ser_bytes = original_metadata.ser().unwrap(); + let metadata_ser_bytes = original_metadata.to_bytes().unwrap(); assert_eq!(metadata_ser_bytes, expected_bytes); let expected_metadata = { @@ -668,7 +608,7 @@ mod tests { temp_metadata.hdr = hdr; temp_metadata }; - let des_metadata = TimelineMetadata::des(&metadata_ser_bytes).unwrap(); + let des_metadata = TimelineMetadata::from_bytes(&metadata_ser_bytes).unwrap(); assert_eq!(des_metadata, expected_metadata); } } diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 7d2e9b9a91..6233a3477e 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -38,14 +38,17 @@ pub struct IndexPart { /// that latest version stores. pub layer_metadata: HashMap, - // 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata. - // It's duplicated for convenience when reading the serialized structure, but is - // private because internally we would read from metadata instead. + /// Because of the trouble of eyeballing the legacy "metadata" field, we copied the + /// "disk_consistent_lsn" out. After version 7 this is no longer needed, but the name cannot be + /// reused. 
pub(super) disk_consistent_lsn: Lsn, - // TODO: later make this "rename" to "alias", rename field as "legacy_metadata" + // TODO: rename as "metadata" next week, keep the alias = "metadata_bytes", bump version Adding + // the "alias = metadata" was forgotten in #7693, so we have to use "rewrite = metadata_bytes" + // for backwards compatibility. #[serde( rename = "metadata_bytes", + alias = "metadata", with = "crate::tenant::metadata::modern_serde" )] pub metadata: TimelineMetadata, @@ -76,10 +79,11 @@ impl IndexPart { /// - 4: timeline_layers is fully removed. /// - 5: lineage was added /// - 6: last_aux_file_policy is added. - const LATEST_VERSION: usize = 6; + /// - 7: metadata_bytes is no longer written, but still read + const LATEST_VERSION: usize = 7; // Versions we may see when reading from a bucket. - pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6]; + pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7]; pub const FILE_NAME: &'static str = "index_part.json"; @@ -95,7 +99,7 @@ impl IndexPart { } } - pub fn get_version(&self) -> usize { + pub fn version(&self) -> usize { self.version } @@ -217,9 +221,9 @@ impl Lineage { #[cfg(test)] mod tests { - use std::str::FromStr; - use super::*; + use std::str::FromStr; + use utils::id::TimelineId; #[test] fn v1_indexpart_is_parsed() { @@ -338,8 +342,7 @@ mod tests { ]), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), - deleted_at: Some(chrono::NaiveDateTime::parse_from_str( - "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()), + deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), lineage: Lineage::default(), last_aux_file_policy: None, }; @@ -515,8 +518,7 @@ mod tests { ]), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: 
TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), - deleted_at: Some(chrono::NaiveDateTime::parse_from_str( - "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()), + deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), lineage: Lineage { reparenting_history_truncated: false, reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()], @@ -529,6 +531,60 @@ mod tests { assert_eq!(part, expected); } + #[test] + fn v7_indexpart_is_parsed() { + let example = r#"{ + "version": 7, + "layer_metadata":{ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } + }, + "disk_consistent_lsn":"0/16960E8", + "metadata": { + "disk_consistent_lsn": "0/16960E8", + "prev_record_lsn": "0/1696070", + "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e", + "ancestor_lsn": "0/0", + "latest_gc_cutoff_lsn": "0/1696070", + "initdb_lsn": "0/1696070", + "pg_version": 14 + }, + "deleted_at": "2023-07-31T09:00:00.123" + }"#; + + let expected = IndexPart { + version: 7, + layer_metadata: HashMap::from([ + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { + file_size: 25600000, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }), + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { + file_size: 9007199254741001, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }) + ]), + disk_consistent_lsn: "0/16960E8".parse::().unwrap(), + metadata: TimelineMetadata::new( + Lsn::from_str("0/16960E8").unwrap(), + Some(Lsn::from_str("0/1696070").unwrap()), + Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()), + Lsn::INVALID, + Lsn::from_str("0/1696070").unwrap(), + Lsn::from_str("0/1696070").unwrap(), + 14, + ).with_recalculated_checksum().unwrap(), + deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), + lineage: Default::default(), + last_aux_file_policy: Default::default(), + }; + + let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + assert_eq!(part, expected); + } + fn parse_naive_datetime(s: &str) -> NaiveDateTime { 
chrono::NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S.%f").unwrap() } diff --git a/s3_scrubber/src/checks.rs b/s3_scrubber/src/checks.rs index 44fb53696c..4eb8580e32 100644 --- a/s3_scrubber/src/checks.rs +++ b/s3_scrubber/src/checks.rs @@ -78,17 +78,16 @@ pub(crate) fn branch_cleanup_and_check_errors( index_part_generation: _index_part_generation, s3_layers: _s3_layers, } => { - if !IndexPart::KNOWN_VERSIONS.contains(&index_part.get_version()) { - result.errors.push(format!( - "index_part.json version: {}", - index_part.get_version() - )) + if !IndexPart::KNOWN_VERSIONS.contains(&index_part.version()) { + result + .errors + .push(format!("index_part.json version: {}", index_part.version())) } - if &index_part.get_version() != IndexPart::KNOWN_VERSIONS.last().unwrap() { + if &index_part.version() != IndexPart::KNOWN_VERSIONS.last().unwrap() { result.warnings.push(format!( "index_part.json version is not latest: {}", - index_part.get_version() + index_part.version() )) } diff --git a/s3_scrubber/src/scan_pageserver_metadata.rs b/s3_scrubber/src/scan_pageserver_metadata.rs index 6ff9783875..af74ffa4cd 100644 --- a/s3_scrubber/src/scan_pageserver_metadata.rs +++ b/s3_scrubber/src/scan_pageserver_metadata.rs @@ -125,7 +125,7 @@ impl MetadataSummary { { *self .indices_by_version - .entry(index_part.get_version()) + .entry(index_part.version()) .or_insert(0) += 1; if let Err(e) = self.update_histograms(index_part) { diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index 18e5111786..54d3b2d515 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -37,7 +37,8 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): """ neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - env = neon_env_builder.init_start() + env = neon_env_builder.init_configs() + env.start() env.pageserver.allowed_errors.extend( [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] ) From d3b892e9ad39c50e869ada51b7f892666d4bf476 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 11 Jun 2024 17:10:05 +0300 Subject: [PATCH 0950/1571] test: fix duplicated harness name (#8010) We need unique tenant harness names in case you want to inspect the results of the last failing run. We are not using any proc macros to get the test name as there is no stable way of doing that, and there will not be one in the future, so we need to fix these duplicates. Also, clean up the duplicated tests to not mix `?` and `unwrap/assert`. 
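For context, a minimal sketch of the clash being fixed (names as in the diff below; presumably the harness name keys the per-test on-disk state, which is why a duplicate makes the last failing run's artifacts ambiguous):

```rust
// Sketch only, assuming the pageserver test-module context used in the diff below.
#[tokio::test]
async fn test_metadata_tombstone_empty_image_creation() {
    // was: TenantHarness::create("test_metadata_tombstone_image_creation"), a duplicate
    // of the name already used by test_metadata_tombstone_image_creation.
    let harness =
        TenantHarness::create("test_metadata_tombstone_empty_image_creation").unwrap();
    let (_tenant, _ctx) = harness.load().await;
}
```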
--- pageserver/src/tenant.rs | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 2e3ce45c2b..10842c1504 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -6584,8 +6584,8 @@ mod tests { } #[tokio::test] - async fn test_metadata_tombstone_image_creation() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_metadata_tombstone_image_creation")?; + async fn test_metadata_tombstone_image_creation() { + let harness = TenantHarness::create("test_metadata_tombstone_image_creation").unwrap(); let (tenant, ctx) = harness.load().await; let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap(); @@ -6613,7 +6613,8 @@ mod tests { vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])], Lsn(0x30), ) - .await?; + .await + .unwrap(); let cancel = CancellationToken::new(); @@ -6628,23 +6629,24 @@ mod tests { }, &ctx, ) - .await?; + .await + .unwrap(); // Image layers are created at last_record_lsn let images = tline .inspect_image_layers(Lsn(0x30), &ctx) - .await? + .await + .unwrap() .into_iter() .filter(|(k, _)| k.is_metadata_key()) .collect::>(); assert_eq!(images.len(), 2); // the image layer should only contain two existing keys, tombstones should be removed. - - Ok(()) } #[tokio::test] - async fn test_metadata_tombstone_empty_image_creation() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_metadata_tombstone_image_creation")?; + async fn test_metadata_tombstone_empty_image_creation() { + let harness = + TenantHarness::create("test_metadata_tombstone_empty_image_creation").unwrap(); let (tenant, ctx) = harness.load().await; let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap(); @@ -6666,7 +6668,8 @@ mod tests { vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])], Lsn(0x30), ) - .await?; + .await + .unwrap(); let cancel = CancellationToken::new(); @@ -6681,17 +6684,17 @@ mod tests { }, &ctx, ) - .await?; + .await + .unwrap(); // Image layers are created at last_record_lsn let images = tline .inspect_image_layers(Lsn(0x30), &ctx) - .await? + .await + .unwrap() .into_iter() .filter(|(k, _)| k.is_metadata_key()) .collect::>(); assert_eq!(images.len(), 0); // the image layer should not contain tombstones, or it is not created - - Ok(()) } } From 4c2100794b97c1f635bf9dcc9013d2b2c8733de6 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Tue, 11 Jun 2024 10:14:51 -0400 Subject: [PATCH 0951/1571] feat(pageserver): initial code sketch & test case for combined gc+compaction at gc_horizon (#7948) A demo for a building block for compaction. The GC-compaction operation iterates all layers below/intersect with the GC horizon, and do a full layer rewrite of all of them. The end result will be image layer covering the full keyspace at GC-horizon, and a bunch of delta layers above the GC-horizon. This helps us collect the garbages of the test_gc_feedback test case to reduce space amplification. This operation can be manually triggered using an HTTP API or be triggered based on some metrics. Actual method TBD. The test is very basic and it's very likely that most part of the algorithm will be rewritten. I would like to get this merged so that I can have a basic skeleton for the algorithm and then make incremental changes. 
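A minimal sketch of driving the new operation, mirroring the unit test added below (it assumes the usual `TenantHarness` setup providing `tline` and `ctx`; in this sketch the gc cutoffs are what put layers at or below the horizon in scope for the rewrite):

```rust
// Mark everything at/below 0x30 as collectible, then run the combined gc+compaction.
{
    let mut guard = tline.gc_info.write().unwrap();
    guard.cutoffs.pitr = Lsn(0x30);
    guard.cutoffs.horizon = Lsn(0x30);
}
let cancel = CancellationToken::new();
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
// Expected outcome (asserted in the test): one image layer covering the keyspace at the
// gc horizon, plus the delta layers that remain above it.
```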
image --------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant.rs | 160 ++++++++++++++++ .../src/tenant/storage_layer/delta_layer.rs | 39 ++++ .../src/tenant/storage_layer/image_layer.rs | 28 +++ pageserver/src/tenant/storage_layer/layer.rs | 31 ++++ pageserver/src/tenant/timeline.rs | 13 ++ pageserver/src/tenant/timeline/compaction.rs | 172 ++++++++++++++++++ .../src/tenant/timeline/layer_manager.rs | 12 ++ 7 files changed, 455 insertions(+) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 10842c1504..f9ed6d3071 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -4044,10 +4044,12 @@ mod tests { use crate::DEFAULT_PG_VERSION; use bytes::{Bytes, BytesMut}; use hex_literal::hex; + use itertools::Itertools; use pageserver_api::key::{AUX_FILES_KEY, AUX_KEY_PREFIX, NON_INHERITED_RANGE}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings}; use rand::{thread_rng, Rng}; + use storage_layer::PersistentLayerKey; use tests::storage_layer::ValuesReconstructState; use tests::timeline::{GetVectoredError, ShutdownMode}; use utils::bin_ser::BeSer; @@ -6697,4 +6699,162 @@ mod tests { .collect::>(); assert_eq!(images.len(), 0); // the image layer should not contain tombstones, or it is not created } + + #[tokio::test] + async fn test_simple_bottom_most_compaction() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_simple_bottom_most_compaction")?; + let (tenant, ctx) = harness.load().await; + + fn get_key(id: u32) -> Key { + // using aux key here b/c they are guaranteed to be inside `collect_keyspace`. + let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + // We create one bottom-most image layer, a delta layer D1 crossing the GC horizon, D2 below the horizon, and D3 above the horizon. + // + // | D1 | | D3 | + // -| |-- gc horizon ----------------- + // | | | D2 | + // --------- img layer ------------------ + // + // What we should expact from this compaction is: + // | Part of D1 | | D3 | + // --------- img layer with D1+D2 at GC horizon------------------ + + // img layer at 0x10 + let img_layer = (0..10) + .map(|id| (get_key(id), test_img(&format!("value {id}@0x10")))) + .collect_vec(); + + let delta1 = vec![ + // TODO: we should test a real delta record here, which requires us to add a variant of NeonWalRecord for testing purpose. 
+ ( + get_key(1), + Lsn(0x20), + Value::Image(test_img("value 1@0x20")), + ), + ( + get_key(2), + Lsn(0x30), + Value::Image(test_img("value 2@0x30")), + ), + ( + get_key(3), + Lsn(0x40), + Value::Image(test_img("value 3@0x40")), + ), + ]; + let delta2 = vec![ + ( + get_key(5), + Lsn(0x20), + Value::Image(test_img("value 5@0x20")), + ), + ( + get_key(6), + Lsn(0x20), + Value::Image(test_img("value 6@0x20")), + ), + ]; + let delta3 = vec![ + ( + get_key(8), + Lsn(0x40), + Value::Image(test_img("value 8@0x40")), + ), + ( + get_key(9), + Lsn(0x40), + Value::Image(test_img("value 9@0x40")), + ), + ]; + + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + vec![delta1, delta2, delta3], // delta layers + vec![(Lsn(0x10), img_layer)], // image layers + Lsn(0x50), + ) + .await?; + { + // Update GC info + let mut guard = tline.gc_info.write().unwrap(); + guard.cutoffs.pitr = Lsn(0x30); + guard.cutoffs.horizon = Lsn(0x30); + } + + let cancel = CancellationToken::new(); + tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + + // Check if the image layer at the GC horizon contains exactly what we want + let image_at_gc_horizon = tline + .inspect_image_layers(Lsn(0x30), &ctx) + .await + .unwrap() + .into_iter() + .filter(|(k, _)| k.is_metadata_key()) + .collect::>(); + + assert_eq!(image_at_gc_horizon.len(), 10); + let expected_lsn = [0x10, 0x20, 0x30, 0x10, 0x10, 0x20, 0x20, 0x10, 0x10, 0x10]; + for idx in 0..10 { + assert_eq!( + image_at_gc_horizon[idx], + ( + get_key(idx as u32), + test_img(&format!("value {idx}@{:#x}", expected_lsn[idx])) + ) + ); + } + + // Check if old layers are removed / new layers have the expected LSN + let mut all_layers = tline.inspect_historic_layers().await.unwrap(); + all_layers.sort_by(|k1, k2| { + ( + k1.is_delta, + k1.key_range.start, + k1.key_range.end, + k1.lsn_range.start, + k1.lsn_range.end, + ) + .cmp(&( + k2.is_delta, + k2.key_range.start, + k2.key_range.end, + k2.lsn_range.start, + k2.lsn_range.end, + )) + }); + assert_eq!( + all_layers, + vec![ + // Image layer at GC horizon + PersistentLayerKey { + key_range: Key::MIN..get_key(10), + lsn_range: Lsn(0x30)..Lsn(0x31), + is_delta: false + }, + // The delta layer that is cut in the middle + PersistentLayerKey { + key_range: Key::MIN..get_key(9), + lsn_range: Lsn(0x30)..Lsn(0x41), + is_delta: true + }, + // The delta layer we created and should not be picked for the compaction + PersistentLayerKey { + key_range: get_key(8)..get_key(10), + lsn_range: Lsn(0x40)..Lsn(0x41), + is_delta: true + } + ] + ); + + Ok(()) + } } diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 999e2e8679..eb7cf81643 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -929,6 +929,45 @@ impl DeltaLayerInner { Ok(()) } + /// Load all key-values in the delta layer, should be replaced by an iterator-based interface in the future. 
+ #[cfg(test)] + pub(super) async fn load_key_values( + &self, + ctx: &RequestContext, + ) -> anyhow::Result> { + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + self.index_start_blk, + self.index_root_blk, + block_reader, + ); + let mut result = Vec::new(); + let mut stream = + Box::pin(self.stream_index_forwards(&index_reader, &[0; DELTA_KEY_SIZE], ctx)); + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let cursor = block_reader.block_cursor(); + let mut buf = Vec::new(); + while let Some(item) = stream.next().await { + let (key, lsn, pos) = item?; + // TODO: dedup code with get_reconstruct_value + // TODO: ctx handling and sharding + cursor + .read_blob_into_buf(pos.pos(), &mut buf, ctx) + .await + .with_context(|| { + format!("Failed to read blob from virtual file {}", self.file.path) + })?; + let val = Value::des(&buf).with_context(|| { + format!( + "Failed to deserialize file blob from virtual file {}", + self.file.path + ) + })?; + result.push((key, lsn, val)); + } + Ok(result) + } + async fn plan_reads( keyspace: &KeySpace, lsn_range: Range, diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 285618b146..06e2f09384 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -485,6 +485,34 @@ impl ImageLayerInner { Ok(()) } + /// Load all key-values in the delta layer, should be replaced by an iterator-based interface in the future. + #[cfg(test)] + pub(super) async fn load_key_values( + &self, + ctx: &RequestContext, + ) -> anyhow::Result> { + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let tree_reader = + DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader); + let mut result = Vec::new(); + let mut stream = Box::pin(tree_reader.get_stream_from(&[0; KEY_SIZE], ctx)); + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let cursor = block_reader.block_cursor(); + while let Some(item) = stream.next().await { + // TODO: dedup code with get_reconstruct_value + let (raw_key, offset) = item?; + let key = Key::from_slice(&raw_key[..KEY_SIZE]); + // TODO: ctx handling and sharding + let blob = cursor + .read_blob(offset, ctx) + .await + .with_context(|| format!("failed to read value from offset {}", offset))?; + let value = Bytes::from(blob); + result.push((key, self.lsn, Value::Image(value))); + } + Ok(result) + } + /// Traverse the layer's index to build read operations on the overlap of the input keyspace /// and the keys in this layer. /// diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 18f9ba4ef8..32acb3f0cd 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -388,6 +388,23 @@ impl Layer { }) } + /// Get all key/values in the layer. Should be replaced with an iterator-based API in the future. + #[cfg(test)] + pub(crate) async fn load_key_values( + &self, + ctx: &RequestContext, + ) -> anyhow::Result> { + let layer = self + .0 + .get_or_maybe_download(true, Some(ctx)) + .await + .map_err(|err| match err { + DownloadError::DownloadCancelled => GetVectoredError::Cancelled, + other => GetVectoredError::Other(anyhow::anyhow!(other)), + })?; + layer.load_key_values(&self.0, ctx).await + } + /// Download the layer if evicted. 
/// /// Will not error when the layer is already downloaded. @@ -1757,6 +1774,20 @@ impl DownloadedLayer { } } + #[cfg(test)] + async fn load_key_values( + &self, + owner: &Arc, + ctx: &RequestContext, + ) -> anyhow::Result> { + use LayerKind::*; + + match self.get(owner, ctx).await? { + Delta(d) => d.load_key_values(ctx).await, + Image(i) => i.load_key_values(ctx).await, + } + } + async fn dump(&self, owner: &Arc, ctx: &RequestContext) -> anyhow::Result<()> { use LayerKind::*; match self.get(owner, ctx).await? { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 6da0f9d91c..54a4ceeaf3 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -5549,6 +5549,19 @@ impl Timeline { all_data.sort(); Ok(all_data) } + + /// Get all historic layer descriptors in the layer map + #[cfg(test)] + pub(crate) async fn inspect_historic_layers( + self: &Arc, + ) -> anyhow::Result> { + let mut layers = Vec::new(); + let guard = self.layers.read().await; + for layer in guard.layer_map().iter_historic_layers() { + layers.push(layer.key()); + } + Ok(layers) + } } type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId); diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index d8de6aee7c..8a95029f33 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -952,6 +952,178 @@ impl Timeline { adaptor.flush_updates().await?; Ok(()) } + + /// An experimental compaction building block that combines compaction with garbage collection. + /// + /// The current implementation picks all delta + image layers that are below or intersecting with + /// the GC horizon without considering retain_lsns. Then, it does a full compaction over all these delta + /// layers and image layers, which generates image layers on the gc horizon, drop deltas below gc horizon, + /// and create delta layers with all deltas >= gc horizon. + #[cfg(test)] + pub(crate) async fn compact_with_gc( + self: &Arc, + _cancel: &CancellationToken, + ctx: &RequestContext, + ) -> Result<(), CompactionError> { + use crate::tenant::storage_layer::ValueReconstructState; + // Step 0: pick all delta layers + image layers below/intersect with the GC horizon. + // The layer selection has the following properties: + // 1. If a layer is in the selection, all layers below it are in the selection. + // 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection. + let (layer_selection, gc_cutoff) = { + let guard = self.layers.read().await; + let layers = guard.layer_map(); + let gc_info = self.gc_info.read().unwrap(); + let gc_cutoff = Lsn::min(gc_info.cutoffs.horizon, gc_info.cutoffs.pitr); + let mut selected_layers = Vec::new(); + // TODO: consider retain_lsns + drop(gc_info); + for desc in layers.iter_historic_layers() { + if desc.get_lsn_range().start <= gc_cutoff { + selected_layers.push(guard.get_from_desc(&desc)); + } + } + (selected_layers, gc_cutoff) + }; + // Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs. + let mut all_key_values = Vec::new(); + for layer in &layer_selection { + all_key_values.extend(layer.load_key_values(ctx).await?); + } + // Key small to large, LSN low to high, if the same LSN has both image and delta due to the merge of delta layers and + // image layers, make image appear later than delta. 
+ struct ValueWrapper<'a>(&'a crate::repository::Value); + impl Ord for ValueWrapper<'_> { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + use crate::repository::Value; + use std::cmp::Ordering; + match (self.0, other.0) { + (Value::Image(_), Value::WalRecord(_)) => Ordering::Greater, + (Value::WalRecord(_), Value::Image(_)) => Ordering::Less, + _ => Ordering::Equal, + } + } + } + impl PartialOrd for ValueWrapper<'_> { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } + } + impl PartialEq for ValueWrapper<'_> { + fn eq(&self, other: &Self) -> bool { + self.cmp(other) == std::cmp::Ordering::Equal + } + } + impl Eq for ValueWrapper<'_> {} + all_key_values.sort_by(|(k1, l1, v1), (k2, l2, v2)| { + (k1, l1, ValueWrapper(v1)).cmp(&(k2, l2, ValueWrapper(v2))) + }); + let max_lsn = all_key_values + .iter() + .map(|(_, lsn, _)| lsn) + .max() + .copied() + .unwrap() + + 1; + // Step 2: Produce images+deltas. TODO: ensure newly-produced delta does not overlap with other deltas. + // Data of the same key. + let mut accumulated_values = Vec::new(); + let mut last_key = all_key_values.first().unwrap().0; // TODO: assert all_key_values not empty + + /// Take a list of images and deltas, produce an image at the GC horizon, and a list of deltas above the GC horizon. + async fn flush_accumulated_states( + tline: &Arc, + key: Key, + accumulated_values: &[&(Key, Lsn, crate::repository::Value)], + horizon: Lsn, + ) -> anyhow::Result<(Vec<(Key, Lsn, crate::repository::Value)>, bytes::Bytes)> { + let mut base_image = None; + let mut keys_above_horizon = Vec::new(); + let mut delta_above_base_image = Vec::new(); + // We have a list of deltas/images. We want to create image layers while collect garbages. + for (key, lsn, val) in accumulated_values.iter().rev() { + if *lsn > horizon { + keys_above_horizon.push((*key, *lsn, val.clone())); // TODO: ensure one LSN corresponds to either delta or image instead of both + } else if *lsn <= horizon { + match val { + crate::repository::Value::Image(image) => { + if lsn <= &horizon { + base_image = Some((*lsn, image.clone())); + break; + } + } + crate::repository::Value::WalRecord(wal) => { + delta_above_base_image.push((*lsn, wal.clone())); + } + } + } + } + delta_above_base_image.reverse(); + keys_above_horizon.reverse(); + let state = ValueReconstructState { + img: base_image, + records: delta_above_base_image, + }; + let img = tline.reconstruct_value(key, horizon, state).await?; + Ok((keys_above_horizon, img)) + } + + let mut delta_layer_writer = DeltaLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + all_key_values.first().unwrap().0, + gc_cutoff..max_lsn, // TODO: off by one? 
+ ctx, + ) + .await?; + let mut image_layer_writer = ImageLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + &(all_key_values.first().unwrap().0..all_key_values.last().unwrap().0.next()), + gc_cutoff, + ctx, + ) + .await?; + + for item @ (key, _, _) in &all_key_values { + if &last_key == key { + accumulated_values.push(item); + } else { + let (deltas, image) = + flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff) + .await?; + image_layer_writer.put_image(last_key, image, ctx).await?; + for (key, lsn, val) in deltas { + delta_layer_writer.put_value(key, lsn, val, ctx).await?; + } + accumulated_values.clear(); + accumulated_values.push(item); + last_key = *key; + } + } + let (deltas, image) = + flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff).await?; + image_layer_writer.put_image(last_key, image, ctx).await?; + for (key, lsn, val) in deltas { + delta_layer_writer.put_value(key, lsn, val, ctx).await?; + } + accumulated_values.clear(); + // TODO: split layers + let delta_layer = delta_layer_writer.finish(last_key, self, ctx).await?; + let image_layer = image_layer_writer.finish(self, ctx).await?; + // Step 3: Place back to the layer map. + { + let mut guard = self.layers.write().await; + guard.finish_gc_compaction( + &layer_selection, + &[delta_layer.clone(), image_layer.clone()], + &self.metrics, + ) + }; + Ok(()) + } } struct TimelineAdaptor { diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 21e64d562a..550a9a567a 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -226,6 +226,18 @@ impl LayerManager { updates.flush(); } + /// Called when a GC-compaction is completed. + #[cfg(test)] + pub(crate) fn finish_gc_compaction( + &mut self, + compact_from: &[Layer], + compact_to: &[ResidentLayer], + metrics: &TimelineMetrics, + ) { + // We can simply reuse compact l0 logic. Use a different function name to indicate a different type of layer map modification. + self.finish_compact_l0(compact_from, compact_to, metrics) + } + /// Called when compaction is completed. pub(crate) fn rewrite_layers( &mut self, From 126bcc3794a41e3b776108f826c68c6871044876 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 11 Jun 2024 16:03:25 +0100 Subject: [PATCH 0952/1571] storcon: track number of attached shards for each node (#8011) ## Problem The storage controller does not track the number of shards attached to a given pageserver. This is a requirement for various scheduling operations (e.g. draining and filling will use this to figure out if the cluster is balanced) ## Summary of Changes Track the number of shards attached to each node. Related https://github.com/neondatabase/neon/issues/7387 --- storage_controller/src/scheduler.rs | 101 ++++++++++++++++++------- storage_controller/src/service.rs | 2 +- storage_controller/src/tenant_shard.rs | 56 +++++++++----- 3 files changed, 114 insertions(+), 45 deletions(-) diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index 3ff0d87988..4ab85509dc 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -29,6 +29,8 @@ pub enum MaySchedule { struct SchedulerNode { /// How many shards are currently scheduled on this node, via their [`crate::tenant_shard::IntentState`]. shard_count: usize, + /// How many shards are currently attached on this node, via their [`crate::tenant_shard::IntentState`]. 
+ attached_shard_count: usize, /// Whether this node is currently elegible to have new shards scheduled (this is derived /// from a node's availability state and scheduling policy). @@ -42,7 +44,9 @@ impl PartialEq for SchedulerNode { (MaySchedule::Yes(_), MaySchedule::Yes(_)) | (MaySchedule::No, MaySchedule::No) ); - may_schedule_matches && self.shard_count == other.shard_count + may_schedule_matches + && self.shard_count == other.shard_count + && self.attached_shard_count == other.attached_shard_count } } @@ -138,6 +142,15 @@ impl ScheduleContext { } } +pub(crate) enum RefCountUpdate { + PromoteSecondary, + Attach, + Detach, + DemoteAttached, + AddSecondary, + RemoveSecondary, +} + impl Scheduler { pub(crate) fn new<'a>(nodes: impl Iterator) -> Self { let mut scheduler_nodes = HashMap::new(); @@ -146,6 +159,7 @@ impl Scheduler { node.get_id(), SchedulerNode { shard_count: 0, + attached_shard_count: 0, may_schedule: node.may_schedule(), }, ); @@ -171,6 +185,7 @@ impl Scheduler { node.get_id(), SchedulerNode { shard_count: 0, + attached_shard_count: 0, may_schedule: node.may_schedule(), }, ); @@ -179,7 +194,10 @@ impl Scheduler { for shard in shards { if let Some(node_id) = shard.intent.get_attached() { match expect_nodes.get_mut(node_id) { - Some(node) => node.shard_count += 1, + Some(node) => { + node.shard_count += 1; + node.attached_shard_count += 1; + } None => anyhow::bail!( "Tenant {} references nonexistent node {}", shard.tenant_shard_id, @@ -227,31 +245,42 @@ impl Scheduler { Ok(()) } - /// Increment the reference count of a node. This reference count is used to guide scheduling - /// decisions, not for memory management: it represents one tenant shard whose IntentState targets - /// this node. + /// Update the reference counts of a node. These reference counts are used to guide scheduling + /// decisions, not for memory management: they represent the number of tenant shard whose IntentState + /// targets this node and the number of tenants shars whose IntentState is attached to this + /// node. /// /// It is an error to call this for a node that is not known to the scheduler (i.e. passed into /// [`Self::new`] or [`Self::node_upsert`]) - pub(crate) fn node_inc_ref(&mut self, node_id: NodeId) { - let Some(node) = self.nodes.get_mut(&node_id) else { - tracing::error!("Scheduler missing node {node_id}"); - debug_assert!(false); - return; - }; - - node.shard_count += 1; - } - - /// Decrement a node's reference count. Inverse of [`Self::node_inc_ref`]. 
- pub(crate) fn node_dec_ref(&mut self, node_id: NodeId) { + pub(crate) fn update_node_ref_counts(&mut self, node_id: NodeId, update: RefCountUpdate) { let Some(node) = self.nodes.get_mut(&node_id) else { debug_assert!(false); tracing::error!("Scheduler missing node {node_id}"); return; }; - node.shard_count -= 1; + match update { + RefCountUpdate::PromoteSecondary => { + node.attached_shard_count += 1; + } + RefCountUpdate::Attach => { + node.shard_count += 1; + node.attached_shard_count += 1; + } + RefCountUpdate::Detach => { + node.shard_count -= 1; + node.attached_shard_count -= 1; + } + RefCountUpdate::DemoteAttached => { + node.attached_shard_count -= 1; + } + RefCountUpdate::AddSecondary => { + node.shard_count += 1; + } + RefCountUpdate::RemoveSecondary => { + node.shard_count -= 1; + } + } } pub(crate) fn node_upsert(&mut self, node: &Node) { @@ -263,6 +292,7 @@ impl Scheduler { Vacant(entry) => { entry.insert(SchedulerNode { shard_count: 0, + attached_shard_count: 0, may_schedule: node.may_schedule(), }); } @@ -385,6 +415,11 @@ impl Scheduler { pub(crate) fn get_node_shard_count(&self, node_id: NodeId) -> usize { self.nodes.get(&node_id).unwrap().shard_count } + + #[cfg(test)] + pub(crate) fn get_node_attached_shard_count(&self, node_id: NodeId) -> usize { + self.nodes.get(&node_id).unwrap().attached_shard_count + } } #[cfg(test)] @@ -437,18 +472,28 @@ mod tests { let scheduled = scheduler.schedule_shard(&[], &context)?; t2_intent.set_attached(&mut scheduler, Some(scheduled)); - assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1); - assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1); + assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 1); + + assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 1); let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers(), &context)?; t1_intent.push_secondary(&mut scheduler, scheduled); - assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1); - assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 2); + assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 1); + + assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 1); t1_intent.clear(&mut scheduler); - assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 0); - assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1); + assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 0); + assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 1); + + let total_attached = scheduler.get_node_attached_shard_count(NodeId(1)) + + scheduler.get_node_attached_shard_count(NodeId(2)); + assert_eq!(total_attached, 1); if cfg!(debug_assertions) { // Dropping an IntentState without clearing it causes a panic in debug mode, @@ -459,8 +504,12 @@ mod tests { assert!(result.is_err()); } else { t2_intent.clear(&mut scheduler); - assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 0); - assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 0); + + assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 0); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 0); + + assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 0); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 0); } Ok(()) diff --git 
a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 756dc10a2a..1e81b5c5a2 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -4312,7 +4312,7 @@ impl Service { continue; } - if tenant_shard.intent.demote_attached(node_id) { + if tenant_shard.intent.demote_attached(scheduler, node_id) { tenant_shard.sequence = tenant_shard.sequence.next(); // TODO: populate a ScheduleContext including all shards in the same tenant_id (only matters diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index dda17f9887..77bbf4c604 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -8,7 +8,7 @@ use crate::{ metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome}, persistence::TenantShardPersistence, reconciler::ReconcileUnits, - scheduler::{AffinityScore, MaySchedule, ScheduleContext}, + scheduler::{AffinityScore, MaySchedule, RefCountUpdate, ScheduleContext}, }; use pageserver_api::controller_api::{PlacementPolicy, ShardSchedulingPolicy}; use pageserver_api::{ @@ -153,7 +153,7 @@ impl IntentState { } pub(crate) fn single(scheduler: &mut Scheduler, node_id: Option) -> Self { if let Some(node_id) = node_id { - scheduler.node_inc_ref(node_id); + scheduler.update_node_ref_counts(node_id, RefCountUpdate::Attach); } Self { attached: node_id, @@ -164,10 +164,10 @@ impl IntentState { pub(crate) fn set_attached(&mut self, scheduler: &mut Scheduler, new_attached: Option) { if self.attached != new_attached { if let Some(old_attached) = self.attached.take() { - scheduler.node_dec_ref(old_attached); + scheduler.update_node_ref_counts(old_attached, RefCountUpdate::Detach); } if let Some(new_attached) = &new_attached { - scheduler.node_inc_ref(*new_attached); + scheduler.update_node_ref_counts(*new_attached, RefCountUpdate::Attach); } self.attached = new_attached; } @@ -177,22 +177,27 @@ impl IntentState { /// secondary to attached while maintaining the scheduler's reference counts. pub(crate) fn promote_attached( &mut self, - _scheduler: &mut Scheduler, + scheduler: &mut Scheduler, promote_secondary: NodeId, ) { // If we call this with a node that isn't in secondary, it would cause incorrect // scheduler reference counting, since we assume the node is already referenced as a secondary. debug_assert!(self.secondary.contains(&promote_secondary)); - // TODO: when scheduler starts tracking attached + secondary counts separately, we will - // need to call into it here. 
self.secondary.retain(|n| n != &promote_secondary); + + let demoted = self.attached; self.attached = Some(promote_secondary); + + scheduler.update_node_ref_counts(promote_secondary, RefCountUpdate::PromoteSecondary); + if let Some(demoted) = demoted { + scheduler.update_node_ref_counts(demoted, RefCountUpdate::DemoteAttached); + } } pub(crate) fn push_secondary(&mut self, scheduler: &mut Scheduler, new_secondary: NodeId) { debug_assert!(!self.secondary.contains(&new_secondary)); - scheduler.node_inc_ref(new_secondary); + scheduler.update_node_ref_counts(new_secondary, RefCountUpdate::AddSecondary); self.secondary.push(new_secondary); } @@ -200,27 +205,27 @@ impl IntentState { pub(crate) fn remove_secondary(&mut self, scheduler: &mut Scheduler, node_id: NodeId) { let index = self.secondary.iter().position(|n| *n == node_id); if let Some(index) = index { - scheduler.node_dec_ref(node_id); + scheduler.update_node_ref_counts(node_id, RefCountUpdate::RemoveSecondary); self.secondary.remove(index); } } pub(crate) fn clear_secondary(&mut self, scheduler: &mut Scheduler) { for secondary in self.secondary.drain(..) { - scheduler.node_dec_ref(secondary); + scheduler.update_node_ref_counts(secondary, RefCountUpdate::RemoveSecondary); } } /// Remove the last secondary node from the list of secondaries pub(crate) fn pop_secondary(&mut self, scheduler: &mut Scheduler) { if let Some(node_id) = self.secondary.pop() { - scheduler.node_dec_ref(node_id); + scheduler.update_node_ref_counts(node_id, RefCountUpdate::RemoveSecondary); } } pub(crate) fn clear(&mut self, scheduler: &mut Scheduler) { if let Some(old_attached) = self.attached.take() { - scheduler.node_dec_ref(old_attached); + scheduler.update_node_ref_counts(old_attached, RefCountUpdate::Detach); } self.clear_secondary(scheduler); @@ -251,12 +256,11 @@ impl IntentState { /// forget the location on the offline node. /// /// Returns true if a change was made - pub(crate) fn demote_attached(&mut self, node_id: NodeId) -> bool { + pub(crate) fn demote_attached(&mut self, scheduler: &mut Scheduler, node_id: NodeId) -> bool { if self.attached == Some(node_id) { - // TODO: when scheduler starts tracking attached + secondary counts separately, we will - // need to call into it here. 
self.attached = None; self.secondary.push(node_id); + scheduler.update_node_ref_counts(node_id, RefCountUpdate::DemoteAttached); true } else { false @@ -593,7 +597,7 @@ impl TenantShard { Secondary => { if let Some(node_id) = self.intent.get_attached() { // Populate secondary by demoting the attached node - self.intent.demote_attached(*node_id); + self.intent.demote_attached(scheduler, *node_id); modified = true; } else if self.intent.secondary.is_empty() { // Populate secondary by scheduling a fresh node @@ -783,7 +787,7 @@ impl TenantShard { old_attached_node_id, new_attached_node_id, }) => { - self.intent.demote_attached(old_attached_node_id); + self.intent.demote_attached(scheduler, old_attached_node_id); self.intent .promote_attached(scheduler, new_attached_node_id); } @@ -1321,7 +1325,9 @@ pub(crate) mod tests { assert_ne!(attached_node_id, secondary_node_id); // Notifying the attached node is offline should demote it to a secondary - let changed = tenant_shard.intent.demote_attached(attached_node_id); + let changed = tenant_shard + .intent + .demote_attached(&mut scheduler, attached_node_id); assert!(changed); assert!(tenant_shard.intent.attached.is_none()); assert_eq!(tenant_shard.intent.secondary.len(), 2); @@ -1604,7 +1610,14 @@ pub(crate) mod tests { // We should see equal number of locations on the two nodes. assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 4); + // Scheduling does not consider the number of attachments picking the initial + // pageserver to attach to (hence the assertion that all primaries are on the + // same node) + // TODO: Tweak the scheduling to evenly distribute attachments for new shards. + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 4); + assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 4); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 0); // Add another two nodes: we should see the shards spread out when their optimize // methods are called @@ -1613,9 +1626,16 @@ pub(crate) mod tests { optimize_til_idle(&nodes, &mut scheduler, &mut shards); assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 2); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 1); + assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 1); + assert_eq!(scheduler.get_node_shard_count(NodeId(3)), 2); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(3)), 1); + assert_eq!(scheduler.get_node_shard_count(NodeId(4)), 2); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(4)), 1); for shard in shards.iter_mut() { shard.intent.clear(&mut scheduler); From 7121db3669349ad8be323f55d84906fe1f62af4f Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 11 Jun 2024 17:39:38 +0100 Subject: [PATCH 0953/1571] storcon_cli: add 'drain' command (#8007) ## Problem We need the ability to prepare a subset of storage controller managed pageservers for decommisioning. The storage controller cannot currently express this in terms of scheduling constraints (it's a pretty special case, so I'm not sure it even should). ## Summary of Changes A new `drain` command is added to `storcon_cli`. It takes a set of nodes to drain and migrates primary attachments outside of said set. Simple round robing assignment is used under the assumption that nodes outside of the draining set are evenly balanced. Note that secondary locations are not migrated. 
This is fine for staging, but the migration API will have to be extended for prod in order to allow migration of secondaries as well. I've tested this out against a neon local cluster. The immediate use for this command will be to migrate staging to ARM(Arch64) pageservers. Related https://github.com/neondatabase/cloud/issues/14029 --- Cargo.lock | 1 + control_plane/storcon_cli/Cargo.toml | 1 + control_plane/storcon_cli/src/main.rs | 208 ++++++++++++++++++++++++++ 3 files changed, 210 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index dbbf330cf9..66879fd743 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5820,6 +5820,7 @@ dependencies = [ "anyhow", "clap", "comfy-table", + "futures", "humantime", "hyper 0.14.26", "pageserver_api", diff --git a/control_plane/storcon_cli/Cargo.toml b/control_plane/storcon_cli/Cargo.toml index ed3462961f..f96f0084b2 100644 --- a/control_plane/storcon_cli/Cargo.toml +++ b/control_plane/storcon_cli/Cargo.toml @@ -9,6 +9,7 @@ license.workspace = true anyhow.workspace = true clap.workspace = true comfy-table.workspace = true +futures.workspace = true humantime.workspace = true hyper.workspace = true pageserver_api.workspace = true diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 05c4acdf90..8c84911d33 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -1,3 +1,4 @@ +use futures::StreamExt; use std::{collections::HashMap, str::FromStr, time::Duration}; use clap::{Parser, Subcommand}; @@ -148,6 +149,22 @@ enum Command { #[arg(long)] threshold: humantime::Duration, }, + // Drain a set of specified pageservers by moving the primary attachments to pageservers + // outside of the specified set. + Drain { + // Set of pageserver node ids to drain. + #[arg(long)] + nodes: Vec, + // Optional: migration concurrency (default is 8) + #[arg(long)] + concurrency: Option, + // Optional: maximum number of shards to migrate + #[arg(long)] + max_shards: Option, + // Optional: when set to true, nothing is migrated, but the plan is printed to stdout + #[arg(long)] + dry_run: Option, + }, } #[derive(Parser)] @@ -737,6 +754,197 @@ async fn main() -> anyhow::Result<()> { }) .await?; } + Command::Drain { + nodes, + concurrency, + max_shards, + dry_run, + } => { + // Load the list of nodes, split them up into the drained and filled sets, + // and validate that draining is possible. + let node_descs = storcon_client + .dispatch::<(), Vec>( + Method::GET, + "control/v1/node".to_string(), + None, + ) + .await?; + + let mut node_to_drain_descs = Vec::new(); + let mut node_to_fill_descs = Vec::new(); + + for desc in node_descs { + let to_drain = nodes.iter().any(|id| *id == desc.id); + if to_drain { + node_to_drain_descs.push(desc); + } else { + node_to_fill_descs.push(desc); + } + } + + if nodes.len() != node_to_drain_descs.len() { + anyhow::bail!("Drain requested for node which doesn't exist.") + } + + let can_fill = node_to_fill_descs + .iter() + .filter(|desc| { + matches!(desc.availability, NodeAvailabilityWrapper::Active) + && matches!( + desc.scheduling, + NodeSchedulingPolicy::Active | NodeSchedulingPolicy::Filling + ) + }) + .any(|_| true); + + if !can_fill { + anyhow::bail!("There are no nodes to drain to") + } + + // Set the node scheduling policy to draining for the nodes which + // we plan to drain. 
+ for node_desc in node_to_drain_descs.iter() { + let req = NodeConfigureRequest { + node_id: node_desc.id, + availability: None, + scheduling: Some(NodeSchedulingPolicy::Draining), + }; + + storcon_client + .dispatch::<_, ()>( + Method::PUT, + format!("control/v1/node/{}/config", node_desc.id), + Some(req), + ) + .await?; + } + + // Perform the drain: move each tenant shard scheduled on a node to + // be drained to a node which is being filled. A simple round robin + // strategy is used to pick the new node. + let tenants = storcon_client + .dispatch::<(), Vec>( + Method::GET, + "control/v1/tenant".to_string(), + None, + ) + .await?; + + let mut selected_node_idx = 0; + + struct DrainMove { + tenant_shard_id: TenantShardId, + from: NodeId, + to: NodeId, + } + + let mut moves: Vec = Vec::new(); + + let shards = tenants + .into_iter() + .flat_map(|tenant| tenant.shards.into_iter()); + for shard in shards { + if let Some(max_shards) = max_shards { + if moves.len() >= max_shards { + println!( + "Stop planning shard moves since the requested maximum was reached" + ); + break; + } + } + + let should_migrate = { + if let Some(attached_to) = shard.node_attached { + node_to_drain_descs + .iter() + .map(|desc| desc.id) + .any(|id| id == attached_to) + } else { + false + } + }; + + if !should_migrate { + continue; + } + + moves.push(DrainMove { + tenant_shard_id: shard.tenant_shard_id, + from: shard + .node_attached + .expect("We only migrate attached tenant shards"), + to: node_to_fill_descs[selected_node_idx].id, + }); + selected_node_idx = (selected_node_idx + 1) % node_to_fill_descs.len(); + } + + let total_moves = moves.len(); + + if dry_run == Some(true) { + println!("Dryrun requested. Planned {total_moves} moves:"); + for mv in &moves { + println!("{}: {} -> {}", mv.tenant_shard_id, mv.from, mv.to) + } + + return Ok(()); + } + + const DEFAULT_MIGRATE_CONCURRENCY: usize = 8; + let mut stream = futures::stream::iter(moves) + .map(|mv| { + let client = Client::new(cli.api.clone(), cli.jwt.clone()); + async move { + client + .dispatch::( + Method::PUT, + format!("control/v1/tenant/{}/migrate", mv.tenant_shard_id), + Some(TenantShardMigrateRequest { + tenant_shard_id: mv.tenant_shard_id, + node_id: mv.to, + }), + ) + .await + .map_err(|e| (mv.tenant_shard_id, mv.from, mv.to, e)) + } + }) + .buffered(concurrency.unwrap_or(DEFAULT_MIGRATE_CONCURRENCY)); + + let mut success = 0; + let mut failure = 0; + + while let Some(res) = stream.next().await { + match res { + Ok(_) => { + success += 1; + } + Err((tenant_shard_id, from, to, error)) => { + failure += 1; + println!( + "Failed to migrate {} from node {} to node {}: {}", + tenant_shard_id, from, to, error + ); + } + } + + if (success + failure) % 20 == 0 { + println!( + "Processed {}/{} shards: {} succeeded, {} failed", + success + failure, + total_moves, + success, + failure + ); + } + } + + println!( + "Processed {}/{} shards: {} succeeded, {} failed", + success + failure, + total_moves, + success, + failure + ); + } } Ok(()) From 78a59b94f59a9679a6c8f3759d43b05de238ecbd Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 11 Jun 2024 23:19:18 +0300 Subject: [PATCH 0954/1571] Copy editor config for the neon extension from PostgreSQL (#8009) This makes IDEs and github diff format the code the same way as PostgreSQL sources, which is the style we try to maintain. 
--- pgxn/.dir-locals.el | 19 +++++++++++++++++++ pgxn/.editorconfig | 14 ++++++++++++++ 2 files changed, 33 insertions(+) create mode 100644 pgxn/.dir-locals.el create mode 100644 pgxn/.editorconfig diff --git a/pgxn/.dir-locals.el b/pgxn/.dir-locals.el new file mode 100644 index 0000000000..ab6208b698 --- /dev/null +++ b/pgxn/.dir-locals.el @@ -0,0 +1,19 @@ +;; see also src/tools/editors/emacs.samples for more complete settings + +((c-mode . ((c-basic-offset . 4) + (c-file-style . "bsd") + (fill-column . 78) + (indent-tabs-mode . t) + (tab-width . 4))) + (nxml-mode . ((fill-column . 78) + (indent-tabs-mode . nil))) + (perl-mode . ((perl-indent-level . 4) + (perl-continued-statement-offset . 2) + (perl-continued-brace-offset . -2) + (perl-brace-offset . 0) + (perl-brace-imaginary-offset . 0) + (perl-label-offset . -2) + (indent-tabs-mode . t) + (tab-width . 4))) + (sgml-mode . ((fill-column . 78) + (indent-tabs-mode . nil)))) diff --git a/pgxn/.editorconfig b/pgxn/.editorconfig new file mode 100644 index 0000000000..d69a3d1dc4 --- /dev/null +++ b/pgxn/.editorconfig @@ -0,0 +1,14 @@ +root = true + +[*.{c,h,l,y,pl,pm}] +indent_style = tab +indent_size = tab +tab_width = 4 + +[*.{sgml,xml}] +indent_style = space +indent_size = 1 + +[*.xsl] +indent_style = space +indent_size = 2 From 27518676d7aebb26ad81bf0a926749c5ed9e75d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 12 Jun 2024 00:45:22 +0200 Subject: [PATCH 0955/1571] Rename S3 scrubber to storage scrubber (#8013) The S3 scrubber contains "S3" in its name, but we want to make it generic in terms of which storage is used (#7547). Therefore, rename it to "storage scrubber", following the naming scheme of already existing components "storage broker" and "storage controller". 
Part of #7547 --- .dockerignore | 2 +- Cargo.lock | 96 +++++++++---------- Cargo.toml | 2 +- {s3_scrubber => storage_scrubber}/Cargo.toml | 2 +- {s3_scrubber => storage_scrubber}/README.md | 2 +- .../src/checks.rs | 0 .../src/cloud_admin_api.rs | 0 .../src/garbage.rs | 0 {s3_scrubber => storage_scrubber}/src/lib.rs | 0 {s3_scrubber => storage_scrubber}/src/main.rs | 10 +- .../src/metadata_stream.rs | 0 .../src/pageserver_physical_gc.rs | 0 .../src/scan_pageserver_metadata.rs | 0 .../src/scan_safekeeper_metadata.rs | 0 .../src/tenant_snapshot.rs | 0 test_runner/fixtures/neon_fixtures.py | 8 +- .../regress/test_pageserver_generations.py | 4 +- .../regress/test_pageserver_secondary.py | 6 +- test_runner/regress/test_sharding.py | 4 +- ...3_scrubber.py => test_storage_scrubber.py} | 10 +- test_runner/regress/test_tenant_delete.py | 4 +- 21 files changed, 75 insertions(+), 75 deletions(-) rename {s3_scrubber => storage_scrubber}/Cargo.toml (98%) rename {s3_scrubber => storage_scrubber}/README.md (99%) rename {s3_scrubber => storage_scrubber}/src/checks.rs (100%) rename {s3_scrubber => storage_scrubber}/src/cloud_admin_api.rs (100%) rename {s3_scrubber => storage_scrubber}/src/garbage.rs (100%) rename {s3_scrubber => storage_scrubber}/src/lib.rs (100%) rename {s3_scrubber => storage_scrubber}/src/main.rs (96%) rename {s3_scrubber => storage_scrubber}/src/metadata_stream.rs (100%) rename {s3_scrubber => storage_scrubber}/src/pageserver_physical_gc.rs (100%) rename {s3_scrubber => storage_scrubber}/src/scan_pageserver_metadata.rs (100%) rename {s3_scrubber => storage_scrubber}/src/scan_safekeeper_metadata.rs (100%) rename {s3_scrubber => storage_scrubber}/src/tenant_snapshot.rs (100%) rename test_runner/regress/{test_s3_scrubber.py => test_storage_scrubber.py} (94%) diff --git a/.dockerignore b/.dockerignore index eead727994..c7a2f78e32 100644 --- a/.dockerignore +++ b/.dockerignore @@ -21,7 +21,7 @@ !patches/ !pgxn/ !proxy/ -!s3_scrubber/ +!storage_scrubber/ !safekeeper/ !storage_broker/ !storage_controller/ diff --git a/Cargo.lock b/Cargo.lock index 66879fd743..1c8a8b0c0f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5109,54 +5109,6 @@ version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041" -[[package]] -name = "s3_scrubber" -version = "0.1.0" -dependencies = [ - "anyhow", - "async-stream", - "aws-config", - "aws-sdk-s3", - "aws-smithy-async", - "bincode", - "bytes", - "camino", - "chrono", - "clap", - "crc32c", - "either", - "futures", - "futures-util", - "hex", - "histogram", - "humantime", - "itertools", - "once_cell", - "pageserver", - "pageserver_api", - "postgres_ffi", - "rand 0.8.5", - "remote_storage", - "reqwest 0.12.4", - "rustls 0.22.4", - "rustls-native-certs 0.7.0", - "serde", - "serde_json", - "serde_with", - "thiserror", - "tokio", - "tokio-postgres", - "tokio-postgres-rustls", - "tokio-rustls 0.25.0", - "tokio-stream", - "tokio-util", - "tracing", - "tracing-appender", - "tracing-subscriber", - "utils", - "workspace_hack", -] - [[package]] name = "safekeeper" version = "0.1.0" @@ -5813,6 +5765,54 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "storage_scrubber" +version = "0.1.0" +dependencies = [ + "anyhow", + "async-stream", + "aws-config", + "aws-sdk-s3", + "aws-smithy-async", + "bincode", + "bytes", + "camino", + "chrono", + "clap", + "crc32c", + "either", + "futures", + "futures-util", + "hex", + "histogram", + "humantime", + "itertools", + "once_cell", + 
"pageserver", + "pageserver_api", + "postgres_ffi", + "rand 0.8.5", + "remote_storage", + "reqwest 0.12.4", + "rustls 0.22.4", + "rustls-native-certs 0.7.0", + "serde", + "serde_json", + "serde_with", + "thiserror", + "tokio", + "tokio-postgres", + "tokio-postgres-rustls", + "tokio-rustls 0.25.0", + "tokio-stream", + "tokio-util", + "tracing", + "tracing-appender", + "tracing-subscriber", + "utils", + "workspace_hack", +] + [[package]] name = "storcon_cli" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 58715db32b..dc89c2341b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,7 +13,7 @@ members = [ "safekeeper", "storage_broker", "storage_controller", - "s3_scrubber", + "storage_scrubber", "workspace_hack", "trace", "libs/compute_api", diff --git a/s3_scrubber/Cargo.toml b/storage_scrubber/Cargo.toml similarity index 98% rename from s3_scrubber/Cargo.toml rename to storage_scrubber/Cargo.toml index 48b50ca21c..050be66483 100644 --- a/s3_scrubber/Cargo.toml +++ b/storage_scrubber/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "s3_scrubber" +name = "storage_scrubber" version = "0.1.0" edition.workspace = true license.workspace = true diff --git a/s3_scrubber/README.md b/storage_scrubber/README.md similarity index 99% rename from s3_scrubber/README.md rename to storage_scrubber/README.md index 8a96542ada..0930f343ec 100644 --- a/s3_scrubber/README.md +++ b/storage_scrubber/README.md @@ -1,4 +1,4 @@ -# Neon S3 scrubber +# Neon Storage Scrubber This tool directly accesses the S3 buckets used by the Neon `pageserver` and `safekeeper`, and does housekeeping such as cleaning up objects for tenants & timelines that no longer exist. diff --git a/s3_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs similarity index 100% rename from s3_scrubber/src/checks.rs rename to storage_scrubber/src/checks.rs diff --git a/s3_scrubber/src/cloud_admin_api.rs b/storage_scrubber/src/cloud_admin_api.rs similarity index 100% rename from s3_scrubber/src/cloud_admin_api.rs rename to storage_scrubber/src/cloud_admin_api.rs diff --git a/s3_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs similarity index 100% rename from s3_scrubber/src/garbage.rs rename to storage_scrubber/src/garbage.rs diff --git a/s3_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs similarity index 100% rename from s3_scrubber/src/lib.rs rename to storage_scrubber/src/lib.rs diff --git a/s3_scrubber/src/main.rs b/storage_scrubber/src/main.rs similarity index 96% rename from s3_scrubber/src/main.rs rename to storage_scrubber/src/main.rs index ade8ef7d7a..222bd10ed2 100644 --- a/s3_scrubber/src/main.rs +++ b/storage_scrubber/src/main.rs @@ -1,11 +1,11 @@ use anyhow::bail; use camino::Utf8PathBuf; use pageserver_api::shard::TenantShardId; -use s3_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; -use s3_scrubber::pageserver_physical_gc::GcMode; -use s3_scrubber::scan_pageserver_metadata::scan_metadata; -use s3_scrubber::tenant_snapshot::SnapshotDownloader; -use s3_scrubber::{ +use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; +use storage_scrubber::pageserver_physical_gc::GcMode; +use storage_scrubber::scan_pageserver_metadata::scan_metadata; +use storage_scrubber::tenant_snapshot::SnapshotDownloader; +use storage_scrubber::{ init_logging, pageserver_physical_gc::pageserver_physical_gc, scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig, NodeKind, TraversingDepth, diff --git a/s3_scrubber/src/metadata_stream.rs b/storage_scrubber/src/metadata_stream.rs similarity index 
100% rename from s3_scrubber/src/metadata_stream.rs rename to storage_scrubber/src/metadata_stream.rs diff --git a/s3_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs similarity index 100% rename from s3_scrubber/src/pageserver_physical_gc.rs rename to storage_scrubber/src/pageserver_physical_gc.rs diff --git a/s3_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs similarity index 100% rename from s3_scrubber/src/scan_pageserver_metadata.rs rename to storage_scrubber/src/scan_pageserver_metadata.rs diff --git a/s3_scrubber/src/scan_safekeeper_metadata.rs b/storage_scrubber/src/scan_safekeeper_metadata.rs similarity index 100% rename from s3_scrubber/src/scan_safekeeper_metadata.rs rename to storage_scrubber/src/scan_safekeeper_metadata.rs diff --git a/s3_scrubber/src/tenant_snapshot.rs b/storage_scrubber/src/tenant_snapshot.rs similarity index 100% rename from s3_scrubber/src/tenant_snapshot.rs rename to storage_scrubber/src/tenant_snapshot.rs diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 6fdad2188c..394f5283f3 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -833,7 +833,7 @@ class NeonEnvBuilder: def enable_scrub_on_exit(self): """ Call this if you would like the fixture to automatically run - s3_scrubber at the end of the test, as a bidirectional test + storage_scrubber at the end of the test, as a bidirectional test that the scrubber is working properly, and that the code within the test didn't produce any invalid remote state. """ @@ -948,7 +948,7 @@ class NeonEnvBuilder: if self.scrub_on_exit: try: - S3Scrubber(self).scan_metadata() + StorageScrubber(self).scan_metadata() except Exception as e: log.error(f"Error during remote storage scrub: {e}") cleanup_error = e @@ -3937,7 +3937,7 @@ class Safekeeper(LogUtils): wait_until(20, 0.5, paused) -class S3Scrubber: +class StorageScrubber: def __init__(self, env: NeonEnvBuilder, log_dir: Optional[Path] = None): self.env = env self.log_dir = log_dir or env.test_output_dir @@ -3957,7 +3957,7 @@ class S3Scrubber: if s3_storage.endpoint is not None: env.update({"AWS_ENDPOINT_URL": s3_storage.endpoint}) - base_args = [str(self.env.neon_binpath / "s3_scrubber")] + base_args = [str(self.env.neon_binpath / "storage_scrubber")] args = base_args + args (output_path, stdout, status_code) = subprocess_capture( diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 0235cf6d20..696af24e5c 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -22,7 +22,7 @@ from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, PgBin, - S3Scrubber, + StorageScrubber, generate_uploads_and_deletions, ) from fixtures.pageserver.common_types import parse_layer_file_name @@ -215,7 +215,7 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): # Having written a mixture of generation-aware and legacy index_part.json, # ensure the scrubber handles the situation as expected. 
- metadata_summary = S3Scrubber(neon_env_builder).scan_metadata() + metadata_summary = StorageScrubber(neon_env_builder).scan_metadata() assert metadata_summary["tenant_count"] == 1 # Scrubber should have seen our timeline assert metadata_summary["timeline_count"] == 1 assert metadata_summary["timeline_shard_count"] == 1 diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 757ea60882..2782d33e15 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -7,7 +7,7 @@ from typing import Any, Dict, Optional import pytest from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, S3Scrubber +from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, StorageScrubber from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.utils import ( assert_prefix_empty, @@ -214,7 +214,7 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): # Having done a bunch of attach/detach cycles, we will have generated some index garbage: check # that the scrubber sees it and cleans it up. We do this before the final attach+validate pass, # to also validate that the scrubber isn't breaking anything. - gc_summary = S3Scrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=1) + gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=1) assert gc_summary["remote_storage_errors"] == 0 assert gc_summary["indices_deleted"] > 0 @@ -536,7 +536,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): # Scrub the remote storage # ======================== # This confirms that the scrubber isn't upset by the presence of the heatmap - S3Scrubber(neon_env_builder).scan_metadata() + StorageScrubber(neon_env_builder).scan_metadata() # Detach secondary and delete tenant # =================================== diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 1996e99557..56075c5975 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -11,8 +11,8 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, - S3Scrubber, StorageControllerApiException, + StorageScrubber, last_flush_lsn_upload, tenant_get_shards, wait_for_last_flush_lsn, @@ -128,7 +128,7 @@ def test_sharding_smoke( # Check the scrubber isn't confused by sharded content, then disable # it during teardown because we'll have deleted by then - S3Scrubber(neon_env_builder).scan_metadata() + StorageScrubber(neon_env_builder).scan_metadata() neon_env_builder.scrub_on_exit = False env.storage_controller.pageserver_api().tenant_delete(tenant_id) diff --git a/test_runner/regress/test_s3_scrubber.py b/test_runner/regress/test_storage_scrubber.py similarity index 94% rename from test_runner/regress/test_s3_scrubber.py rename to test_runner/regress/test_storage_scrubber.py index 6baba190f3..35ae61c380 100644 --- a/test_runner/regress/test_s3_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -6,7 +6,7 @@ import pytest from fixtures.common_types import TenantId, TenantShardId, TimelineId from fixtures.neon_fixtures import ( NeonEnvBuilder, - S3Scrubber, + StorageScrubber, ) from fixtures.remote_storage import S3Storage, s3_storage from fixtures.workload import Workload @@ -60,7 +60,7 @@ def 
test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: output_path = neon_env_builder.test_output_dir / "snapshot" os.makedirs(output_path) - scrubber = S3Scrubber(neon_env_builder) + scrubber = StorageScrubber(neon_env_builder) scrubber.tenant_snapshot(tenant_id, output_path) assert len(os.listdir(output_path)) > 0 @@ -143,18 +143,18 @@ def test_scrubber_physical_gc(neon_env_builder: NeonEnvBuilder, shard_count: Opt workload.write_rows(1) # With a high min_age, the scrubber should decline to delete anything - gc_summary = S3Scrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=3600) + gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=3600) assert gc_summary["remote_storage_errors"] == 0 assert gc_summary["indices_deleted"] == 0 # If targeting a different tenant, the scrubber shouldn't do anything - gc_summary = S3Scrubber(neon_env_builder).pageserver_physical_gc( + gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc( min_age_secs=1, tenant_ids=[TenantId.generate()] ) assert gc_summary["remote_storage_errors"] == 0 assert gc_summary["indices_deleted"] == 0 # With a low min_age, the scrubber should go ahead and clean up all but the latest 2 generations - gc_summary = S3Scrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=1) + gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=1) assert gc_summary["remote_storage_errors"] == 0 assert gc_summary["indices_deleted"] == (expect_indices_per_shard - 2) * shard_count diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index fa7cead1bd..fd3cc45c3f 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -10,7 +10,7 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, PgBin, - S3Scrubber, + StorageScrubber, last_flush_lsn_upload, wait_for_last_flush_lsn, ) @@ -707,7 +707,7 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder) remote_storage_kind = RemoteStorageKind.MOCK_S3 neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) - scrubber = S3Scrubber(neon_env_builder) + scrubber = StorageScrubber(neon_env_builder) env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) ps_http = env.pageserver.http_client() From b7a0c2b61430eb5f88200b679acdcbee3503f15b Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Tue, 11 Jun 2024 17:59:32 -0700 Subject: [PATCH 0956/1571] Add On-demand WAL Download to logicalfuncs (#7960) We implemented on-demand WAL download for walsender, but other things that may want to read the WAL from safekeepers don't do that yet. This PR makes it do that by adding the same set of hooks to logicalfuncs. 
Addresses https://github.com/neondatabase/neon/issues/7959 Also relies on: https://github.com/neondatabase/postgres/pull/438 https://github.com/neondatabase/postgres/pull/437 https://github.com/neondatabase/postgres/pull/436 --- Makefile | 2 ++ pgxn/neon/neon.c | 2 ++ pgxn/neon/walsender_hooks.c | 27 ++++++++++++++++- .../regress/test_logical_replication.py | 30 +++++++++++++++++++ vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 6 ++-- 8 files changed, 66 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index dcbfdbcbc1..37bd19ba44 100644 --- a/Makefile +++ b/Makefile @@ -124,6 +124,8 @@ postgres-%: postgres-configure-% \ $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install +@echo "Compiling amcheck $*" $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/amcheck install + +@echo "Compiling test_decoding $*" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/test_decoding install .PHONY: postgres-clean-% postgres-clean-%: diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index b69a3819c9..276d1542fe 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -19,6 +19,7 @@ #include "catalog/pg_type.h" #include "postmaster/bgworker.h" #include "postmaster/interrupt.h" +#include "replication/logical.h" #include "replication/slot.h" #include "replication/walsender.h" #include "storage/procsignal.h" @@ -280,6 +281,7 @@ _PG_init(void) pg_init_libpagestore(); pg_init_walproposer(); WalSender_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; + LogicalFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; InitLogicalReplicationMonitor(); diff --git a/pgxn/neon/walsender_hooks.c b/pgxn/neon/walsender_hooks.c index 93dce9de84..8f8d1dfc01 100644 --- a/pgxn/neon/walsender_hooks.c +++ b/pgxn/neon/walsender_hooks.c @@ -24,8 +24,12 @@ #include "walproposer.h" static NeonWALReader *wal_reader = NULL; + +struct WalSnd; +extern struct WalSnd *MyWalSnd; extern XLogRecPtr WalSndWaitForWal(XLogRecPtr loc); extern bool GetDonorShmem(XLogRecPtr *donor_lsn); +extern XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI); static XLogRecPtr NeonWALReadWaitForWAL(XLogRecPtr loc) @@ -36,7 +40,28 @@ NeonWALReadWaitForWAL(XLogRecPtr loc) CHECK_FOR_INTERRUPTS(); } - return WalSndWaitForWal(loc); + // Walsender sends keepalives and stuff, so better use its normal wait + if (MyWalSnd != NULL) + return WalSndWaitForWal(loc); + + for (;;) + { + XLogRecPtr flush_ptr; + if (!RecoveryInProgress()) +#if PG_VERSION_NUM >= 150000 + flush_ptr = GetFlushRecPtr(NULL); +#else + flush_ptr = GetFlushRecPtr(); +#endif + else + flush_ptr = GetXLogReplayRecPtr(NULL); + + if (loc <= flush_ptr) + return flush_ptr; + + CHECK_FOR_INTERRUPTS(); + pg_usleep(1000); + } } static int diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index a657d5a035..ca3c81d6e5 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -221,6 +221,35 @@ def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg): wait_until(number_of_iterations=10, interval=2, func=partial(slot_removed, endpoint)) +def test_ondemand_wal_download_in_replication_slot_funcs(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + env.neon_cli.create_branch("init") + endpoint = env.endpoints.create_start("init") + + with endpoint.connect().cursor() as cur: + cur.execute("create table wal_generator 
(id serial primary key, data text)") + cur.execute( + "SELECT * FROM pg_create_logical_replication_slot('slotty_mcslotface', 'test_decoding')" + ) + cur.execute( + """ +INSERT INTO wal_generator (data) +SELECT repeat('A', 1024) -- Generates a kilobyte of data per row +FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of data +""" + ) + + endpoint.stop_and_destroy() + endpoint = env.endpoints.create_start("init") + + with endpoint.connect().cursor() as cur: + cur.execute( + "SELECT * FROM pg_logical_slot_peek_binary_changes('slotty_mcslotface', NULL, NULL, 'include-xids', '0')" + ) + + # Tests that walsender correctly blocks until WAL is downloaded from safekeepers def test_lr_with_slow_safekeeper(neon_env_builder: NeonEnvBuilder, vanilla_pg): neon_env_builder.num_safekeepers = 3 @@ -247,6 +276,7 @@ FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of connstr = endpoint.connstr().replace("'", "''") vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub") logical_replication_sync(vanilla_pg, endpoint) + vanilla_pg.stop() # Pause the safekeepers so that they can't send WAL (except to pageserver) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 17e0f5ff4e..4c51945a61 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 17e0f5ff4e1905691aa40e1e08f9b79b14c99652 +Subproject commit 4c51945a6167ca06c0169e7a4ca5a8e7ffa3faba diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index c2c3d40534..e22098d86d 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit c2c3d40534db97d83dd7e185d1971e707fa2f445 +Subproject commit e22098d86d6c40276b6bd75c29133a33fb283ab6 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index b228f20372..9837db1578 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit b228f20372ebcabfd7946647cb7adbd38bacb14a +Subproject commit 9837db157837fcf43ef7348be0017d3a2238cd27 diff --git a/vendor/revisions.json b/vendor/revisions.json index 5bf4e289ef..f945ea6d73 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "v16": ["16.3", "b228f20372ebcabfd7946647cb7adbd38bacb14a"], - "v15": ["15.7", "c2c3d40534db97d83dd7e185d1971e707fa2f445"], - "v14": ["14.12", "17e0f5ff4e1905691aa40e1e08f9b79b14c99652"] + "v16": ["16.3", "9837db157837fcf43ef7348be0017d3a2238cd27"], + "v15": ["15.7", "e22098d86d6c40276b6bd75c29133a33fb283ab6"], + "v14": ["14.12", "4c51945a6167ca06c0169e7a4ca5a8e7ffa3faba"] } From 9983ae291bf97fcd4a80fc3be6b00da39aca2663 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 12 Jun 2024 09:18:52 +0300 Subject: [PATCH 0957/1571] Another attempt at making test_vm_bits less flaky (#7989) - Split the first and second parts of the test to two separate tests - In the first test, disable the aggressive GC, compaction, and autovacuum. They are only needed by the second test. I'd like to get the first test to a point that the VM page is never all-zeros. Disabling autovacuum in the first test is hopefully enough to accomplish that. - Compare the full page images, don't skip page header. After fixing the previous point, there should be no discrepancy. LSN still won't match, though, because of commit 387a36874c. 
Fixes issue https://github.com/neondatabase/neon/issues/7984 --- test_runner/regress/test_vm_bits.py | 116 +++++++++++++++++++++------- 1 file changed, 86 insertions(+), 30 deletions(-) diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py index b549db1af6..225b952e73 100644 --- a/test_runner/regress/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -1,7 +1,9 @@ import time +from contextlib import closing from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, fork_at_current_lsn +from fixtures.utils import query_scalar # @@ -113,11 +115,88 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv): assert cur_new.fetchall() == [] -# -# Test that the ALL_FROZEN VM bit is cleared correctly at a HEAP_LOCK -# record. -# -def test_vm_bit_clear_on_heap_lock(neon_env_builder: NeonEnvBuilder): +def test_vm_bit_clear_on_heap_lock_whitebox(neon_env_builder: NeonEnvBuilder): + """ + Test that the ALL_FROZEN VM bit is cleared correctly at a HEAP_LOCK record. + + This is a repro for the bug fixed in commit 66fa176cc8. + """ + env = neon_env_builder.init_start() + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + # If auto-analyze runs at the same time that we run VACUUM FREEZE, it + # can hold a snasphot that prevent the tuples from being frozen. + "autovacuum=off", + "log_checkpoints=on", + ], + ) + + # Run the tests in a dedicated database, because the activity monitor + # periodically runs some queries on to the 'postgres' database. If that + # happens at the same time that we're trying to freeze, the activity + # monitor's queries can hold back the xmin horizon and prevent freezing. + with closing(endpoint.connect()) as pg_conn: + pg_conn.cursor().execute("CREATE DATABASE vmbitsdb") + pg_conn = endpoint.connect(dbname="vmbitsdb") + cur = pg_conn.cursor() + + # Install extension containing function needed for test + cur.execute("CREATE EXTENSION neon_test_utils") + cur.execute("CREATE EXTENSION pageinspect") + + # Create a test table and freeze it to set the all-frozen VM bit on all pages. + cur.execute("CREATE TABLE vmtest_lock (id integer PRIMARY KEY)") + cur.execute("BEGIN") + cur.execute("INSERT INTO vmtest_lock SELECT g FROM generate_series(1, 50000) g") + xid = int(query_scalar(cur, "SELECT txid_current()")) + cur.execute("COMMIT") + cur.execute("VACUUM (FREEZE, DISABLE_PAGE_SKIPPING true, VERBOSE) vmtest_lock") + for notice in pg_conn.notices: + log.info(f"{notice}") + + # This test has been flaky in the past, because background activity like + # auto-analyze and compute_ctl's activity monitor queries have prevented the + # tuples from being frozen. Check that they were frozen. + relfrozenxid = int( + query_scalar(cur, "SELECT relfrozenxid FROM pg_class WHERE relname='vmtest_lock'") + ) + assert ( + relfrozenxid > xid + ), f"Inserted rows were not frozen. This can be caused by concurrent activity in the database. (XID {xid}, relfrozenxid {relfrozenxid}" + + # Lock a row. This clears the all-frozen VM bit for that page. + cur.execute("BEGIN") + cur.execute("SELECT * FROM vmtest_lock WHERE id = 40000 FOR UPDATE") + cur.execute("COMMIT") + + # The VM page in shared buffer cache, and the same page as reconstructed by + # the pageserver, should be equal. 
Except for the LSN: Clearing a bit in the + # VM doesn't bump the LSN in PostgreSQL, but the pageserver updates the LSN + # when it replays the VM-bit clearing record (since commit 387a36874c) + # + # This is a bit fragile, we've had lot of flakiness in this test before. For + # example, because all the VM bits were not set because concurrent + # autoanalyze prevented the VACUUM FREEZE from freezing the tuples. Or + # because autoavacuum kicked in and re-froze the page between the + # get_raw_page() and get_raw_page_at_lsn() calls. We disable autovacuum now, + # which should make this deterministic. + cur.execute("select get_raw_page( 'vmtest_lock', 'vm', 0 )") + vm_page_in_cache = (cur.fetchall()[0][0])[8:100].hex() + cur.execute( + "select get_raw_page_at_lsn( 'vmtest_lock', 'vm', 0, pg_current_wal_insert_lsn(), NULL )" + ) + vm_page_at_pageserver = (cur.fetchall()[0][0])[8:100].hex() + + assert vm_page_at_pageserver == vm_page_in_cache + + +def test_vm_bit_clear_on_heap_lock_blackbox(neon_env_builder: NeonEnvBuilder): + """ + The previous test is enough to verify the bug that was fixed in + commit 66fa176cc8. But for good measure, we also reproduce the + original problem that the missing VM page update caused. + """ tenant_conf = { "checkpoint_distance": f"{128 * 1024}", "compaction_target_size": f"{128 * 1024}", @@ -130,9 +209,9 @@ def test_vm_bit_clear_on_heap_lock(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf) tenant_id = env.initial_tenant - timeline_id = env.neon_cli.create_branch("test_vm_bit_clear_on_heap_lock") + timeline_id = env.initial_timeline endpoint = env.endpoints.create_start( - "test_vm_bit_clear_on_heap_lock", + "main", config_lines=[ "log_autovacuum_min_duration = 0", # Perform anti-wraparound vacuuming aggressively @@ -146,12 +225,10 @@ def test_vm_bit_clear_on_heap_lock(neon_env_builder: NeonEnvBuilder): # Install extension containing function needed for test cur.execute("CREATE EXTENSION neon_test_utils") - cur.execute("CREATE EXTENSION pageinspect") # Create a test table and freeze it to set the all-frozen VM bit on all pages. cur.execute("CREATE TABLE vmtest_lock (id integer PRIMARY KEY)") cur.execute("INSERT INTO vmtest_lock SELECT g FROM generate_series(1, 50000) g") - cur.execute("VACUUM (FREEZE, DISABLE_PAGE_SKIPPING true) vmtest_lock") # Lock a row. This clears the all-frozen VM bit for that page. @@ -165,27 +242,6 @@ def test_vm_bit_clear_on_heap_lock(neon_env_builder: NeonEnvBuilder): cur.execute("COMMIT") - # The VM page in shared buffer cache, and the same page as reconstructed - # by the pageserver, should be equal. - # - # Ignore page header (24 bytes) of visibility map. - # If the dirty VM page is flushed from the cache for some reason, - # it gets WAL-logged, which changes the LSN on the page. - # Also in neon SMGR we can replace empty heap page with zero (uninitialized) heap page. - cur.execute("select get_raw_page( 'vmtest_lock', 'vm', 0 )") - vm_page_in_cache = (cur.fetchall()[0][0])[24:100].hex() - cur.execute( - "select get_raw_page_at_lsn( 'vmtest_lock', 'vm', 0, pg_current_wal_insert_lsn(), NULL )" - ) - vm_page_at_pageserver = (cur.fetchall()[0][0])[24:100].hex() - - assert vm_page_at_pageserver == vm_page_in_cache - - # The above assert is enough to verify the bug that was fixed in - # commit 66fa176cc8. But for good measure, we also reproduce the - # original problem that the missing VM page update caused. The - # rest of the test does that. 
- # Kill and restart postgres, to clear the buffer cache. # # NOTE: clear_buffer_cache() will not do, because it evicts the dirty pages From 69aa1aca356b1893a48b06627b31de4933a172f9 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 12 Jun 2024 09:19:24 +0300 Subject: [PATCH 0958/1571] Update default Postgres version in docker-compose.yml (#8019) Let's be modern. --- docker-compose/docker-compose.yml | 4 ++-- docs/docker.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml index a395f0331b..3f097f2700 100644 --- a/docker-compose/docker-compose.yml +++ b/docker-compose/docker-compose.yml @@ -159,12 +159,12 @@ services: context: ./compute_wrapper/ args: - REPOSITORY=${REPOSITORY:-neondatabase} - - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14} + - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-16} - TAG=${TAG:-latest} - http_proxy=$http_proxy - https_proxy=$https_proxy environment: - - PG_VERSION=${PG_VERSION:-14} + - PG_VERSION=${PG_VERSION:-16} #- RUST_BACKTRACE=1 # Mount the test files directly, for faster editing cycle. volumes: diff --git a/docs/docker.md b/docs/docker.md index cbf68be3a7..ccd2afc27a 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -34,12 +34,12 @@ You can see a [docker compose](https://docs.docker.com/compose/) example to crea 1. create containers You can specify version of neon cluster using following environment values. -- PG_VERSION: postgres version for compute (default is 14) +- PG_VERSION: postgres version for compute (default is 16) - TAG: the tag version of [docker image](https://registry.hub.docker.com/r/neondatabase/neon/tags) (default is latest), which is tagged in [CI test](/.github/workflows/build_and_test.yml) ``` $ cd docker-compose/ $ docker-compose down # remove the containers if exists -$ PG_VERSION=15 TAG=2937 docker-compose up --build -d # You can specify the postgres and image version +$ PG_VERSION=16 TAG=2937 docker-compose up --build -d # You can specify the postgres and image version Creating network "dockercompose_default" with the default driver Creating docker-compose_storage_broker_1 ... done (...omit...) From 0a256148b0345d4a04e63347947bc93bcacdfb24 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 12 Jun 2024 10:06:00 +0300 Subject: [PATCH 0959/1571] Update documentation on running locally with Docker (#8020) - Fix the dockerhub URLs - `neondatabase/compute-node` image has been replaced with Postgres version specific images like `neondatabase/compute-node-v16` - Use TAG=latest in the example, rather than some old tag. That's a sensible default for people to copy-past - For convenience, use a Postgres connection URL in the `psql` example that also includes the password. That way, there's no need to set up .pgpass - Update the image names in `docker ps` example to match what you get when you follow the example --- docs/docker.md | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/docs/docker.md b/docs/docker.md index ccd2afc27a..ce806c4e6c 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -4,18 +4,18 @@ Currently we build two main images: -- [neondatabase/neon](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile). 
-- [neondatabase/compute-node](https://hub.docker.com/repository/docker/zenithdb/compute-node) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). +- [neondatabase/neon](https://hub.docker.com/repository/docker/neondatabase/neon) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile). +- [neondatabase/compute-node-v16](https://hub.docker.com/repository/docker/neondatabase/compute-node-v16) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). Similar images exist for v15 and v14. And additional intermediate image: - [neondatabase/compute-tools](https://hub.docker.com/repository/docker/neondatabase/compute-tools) — compute node configuration management tools. -## Building pipeline +## Build pipeline We build all images after a successful `release` tests run and push automatically to Docker Hub with two parallel CI jobs -1. `neondatabase/compute-tools` and `neondatabase/compute-node` +1. `neondatabase/compute-tools` and `neondatabase/compute-node-v16` (and -v15 and -v14) 2. `neondatabase/neon` @@ -34,12 +34,12 @@ You can see a [docker compose](https://docs.docker.com/compose/) example to crea 1. create containers You can specify version of neon cluster using following environment values. -- PG_VERSION: postgres version for compute (default is 16) -- TAG: the tag version of [docker image](https://registry.hub.docker.com/r/neondatabase/neon/tags) (default is latest), which is tagged in [CI test](/.github/workflows/build_and_test.yml) +- PG_VERSION: postgres version for compute (default is 16 as of this writing) +- TAG: the tag version of [docker image](https://registry.hub.docker.com/r/neondatabase/neon/tags), which is tagged in [CI test](/.github/workflows/build_and_test.yml). Default is 'latest' ``` $ cd docker-compose/ $ docker-compose down # remove the containers if exists -$ PG_VERSION=16 TAG=2937 docker-compose up --build -d # You can specify the postgres and image version +$ PG_VERSION=16 TAG=latest docker-compose up --build -d # You can specify the postgres and image version Creating network "dockercompose_default" with the default driver Creating docker-compose_storage_broker_1 ... done (...omit...) @@ -47,29 +47,31 @@ Creating docker-compose_storage_broker_1 ... done 2. connect compute node ``` -$ echo "localhost:55433:postgres:cloud_admin:cloud_admin" >> ~/.pgpass -$ chmod 600 ~/.pgpass -$ psql -h localhost -p 55433 -U cloud_admin +$ psql postgresql://cloud_admin:cloud_admin@localhost:55433/postgres +psql (16.3) +Type "help" for help. + postgres=# CREATE TABLE t(key int primary key, value text); CREATE TABLE -postgres=# insert into t values(1,1); +postgres=# insert into t values(1, 1); INSERT 0 1 postgres=# select * from t; - key | value + key | value -----+------- 1 | 1 (1 row) + ``` 3. If you want to see the log, you can use `docker-compose logs` command. 
``` # check the container name you want to see $ docker ps -CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES -d6968a5ae912 dockercompose_compute "/shell/compute.sh" 5 minutes ago Up 5 minutes 0.0.0.0:3080->3080/tcp, 0.0.0.0:55433->55433/tcp dockercompose_compute_1 +CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES +3582f6d76227 docker-compose_compute "/shell/compute.sh" 2 minutes ago Up 2 minutes 0.0.0.0:3080->3080/tcp, :::3080->3080/tcp, 0.0.0.0:55433->55433/tcp, :::55433->55433/tcp docker-compose_compute_1 (...omit...) -$ docker logs -f dockercompose_compute_1 +$ docker logs -f docker-compose_compute_1 2022-10-21 06:15:48.757 GMT [56] LOG: connection authorized: user=cloud_admin database=postgres application_name=psql 2022-10-21 06:17:00.307 GMT [56] LOG: [NEON_SMGR] libpagestore: connected to 'host=pageserver port=6400' (...omit...) From f749437cec056bd4387dc8c17a5591003ef1fff6 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Wed, 12 Jun 2024 12:25:13 +0200 Subject: [PATCH 0960/1571] Resolve the problem the docker compose caused by the extensions tests (#8024) ## Problem The merging of #7818 caused the problem with the docker-compose file. Running docker compose is now impossible due to the unavailability of the neon-test-extensions:latest image ## Summary of changes Fix the problem: Add the latest tag to the neon-test-extensions image and use the profiles feature of the docker-compose file to avoid loading the neon-test-extensions container if it is not needed. --- .github/workflows/build_and_test.yml | 2 ++ docker-compose/docker-compose.yml | 1 + docker-compose/docker_compose_test.sh | 9 ++++----- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 79a0a77638..57635f4920 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1101,6 +1101,8 @@ jobs: $repo/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} done done + docker buildx imagetools create -t neondatabase/neon-test-extensions:latest \ + neondatabase/neon-test-extensions:${{ needs.tag.outputs.build-tag }} trigger-custom-extensions-build-and-wait: needs: [ check-permissions, tag ] diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml index 3f097f2700..5503b6611a 100644 --- a/docker-compose/docker-compose.yml +++ b/docker-compose/docker-compose.yml @@ -194,6 +194,7 @@ services: - compute neon-test-extensions: + profiles: ["test-extensions"] image: ${REPOSITORY:-neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-16}:${TAG:-latest} entrypoint: - "/bin/bash" diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index 71e73c2d0a..a00591afd0 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -15,7 +15,6 @@ set -eux -o pipefail COMPOSE_FILE='docker-compose.yml' cd $(dirname $0) -docker compose -f $COMPOSE_FILE COMPUTE_CONTAINER_NAME=docker-compose-compute-1 TEST_CONTAINER_NAME=docker-compose-neon-test-extensions-1 PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -d postgres" @@ -26,16 +25,16 @@ export http_proxy https_proxy cleanup() { echo "show container information" docker ps - docker compose -f $COMPOSE_FILE logs + docker compose --profile test-extensions -f $COMPOSE_FILE logs echo "stop containers..." 
- docker compose -f $COMPOSE_FILE down + docker compose --profile test-extensions -f $COMPOSE_FILE down } for pg_version in 14 15 16; do echo "clean up containers if exists" cleanup PG_TEST_VERSION=$(($pg_version < 16 ? 16 : $pg_version)) - PG_VERSION=$pg_version PG_TEST_VERSION=$PG_TEST_VERSION docker compose -f $COMPOSE_FILE up --build -d + PG_VERSION=$pg_version PG_TEST_VERSION=$PG_TEST_VERSION docker compose --profile test-extensions -f $COMPOSE_FILE up --build -d echo "wait until the compute is ready. timeout after 60s. " cnt=0 @@ -47,7 +46,7 @@ for pg_version in 14 15 16; do cleanup exit 1 fi - if docker compose -f $COMPOSE_FILE logs "compute_is_ready" | grep -q "accepting connections"; then + if docker compose --profile test-extensions -f $COMPOSE_FILE logs "compute_is_ready" | grep -q "accepting connections"; then echo "OK. The compute is ready to connect." echo "execute simple queries." docker exec $COMPUTE_CONTAINER_NAME /bin/bash -c "psql $PSQL_OPTION" From 3099e1a787693c7a7b2c694648462fad7eb2bcf5 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 12 Jun 2024 12:33:54 +0100 Subject: [PATCH 0961/1571] storcon_cli: do not drain to undesirable nodes (#8027) ## Problem The previous code would attempt to drain to unavailable or unschedulable nodes. ## Summary of Changes Remove such nodes from the list of nodes to fill. --- control_plane/storcon_cli/src/main.rs | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 8c84911d33..7b48b75c21 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -786,18 +786,15 @@ async fn main() -> anyhow::Result<()> { anyhow::bail!("Drain requested for node which doesn't exist.") } - let can_fill = node_to_fill_descs - .iter() - .filter(|desc| { - matches!(desc.availability, NodeAvailabilityWrapper::Active) - && matches!( - desc.scheduling, - NodeSchedulingPolicy::Active | NodeSchedulingPolicy::Filling - ) - }) - .any(|_| true); + node_to_fill_descs.retain(|desc| { + matches!(desc.availability, NodeAvailabilityWrapper::Active) + && matches!( + desc.scheduling, + NodeSchedulingPolicy::Active | NodeSchedulingPolicy::Filling + ) + }); - if !can_fill { + if node_to_fill_descs.is_empty() { anyhow::bail!("There are no nodes to drain to") } From 9ba9f32dfe35ad99335497f7d22c14ba02ebea9f Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Wed, 12 Jun 2024 16:10:57 +0200 Subject: [PATCH 0962/1571] Reactivate page bench test in CI after ignoring CopyFail error in pageserver (#8023) ## Problem Testcase page bench test_pageserver_max_throughput_getpage_at_latest_lsn had been deactivated because it was flaky. We now ignore copy fail error messages like in https://github.com/neondatabase/neon/blob/270d3be507643f068120b52838c497f6c1b45b61/test_runner/regress/test_pageserver_getpage_throttle.py#L17-L20 and want to reactivate it to see it it is still flaky ## Summary of changes - reactivate the test in CI - ignore CopyFail error message during page bench test cases ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --- ...geserver_max_throughput_getpage_at_latest_lsn.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py index 1a0012397c..772a39fe35 100644 --- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py +++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py @@ -1,5 +1,4 @@ import json -import os from pathlib import Path from typing import Any, Dict, Tuple @@ -35,10 +34,6 @@ from performance.pageserver.util import ( @pytest.mark.timeout( 10000 ) # TODO: this value is just "a really high number"; have this per instance type -@pytest.mark.skipif( - os.getenv("CI", "false") == "true", - reason="The test if flaky on CI: https://github.com/neondatabase/neon/issues/6724", -) def test_pageserver_max_throughput_getpage_at_latest_lsn( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, @@ -91,6 +86,14 @@ def test_pageserver_max_throughput_getpage_at_latest_lsn( n_tenants, setup_wrapper, ) + + env.pageserver.allowed_errors.append( + # https://github.com/neondatabase/neon/issues/6925 + # https://github.com/neondatabase/neon/issues/6390 + # https://github.com/neondatabase/neon/issues/6724 + r".*query handler for.*pagestream.*failed: unexpected message: CopyFail during COPY.*" + ) + run_benchmark_max_throughput_latest_lsn(env, pg_bin, record, duration) From 9dda13eccec8e2043566be1069709709dfec425a Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Wed, 12 Jun 2024 18:15:20 +0200 Subject: [PATCH 0963/1571] Add the image version to the neon-test-extensions image (#8032) ## Problem The version was missing in the image name causing the error during the workflow ## Summary of changes Added the version to the image name --- .github/workflows/build_and_test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 57635f4920..71ca7329ee 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1101,8 +1101,8 @@ jobs: $repo/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} done done - docker buildx imagetools create -t neondatabase/neon-test-extensions:latest \ - neondatabase/neon-test-extensions:${{ needs.tag.outputs.build-tag }} + docker buildx imagetools create -t neondatabase/neon-test-extensions-v16:latest \ + neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }} trigger-custom-extensions-build-and-wait: needs: [ check-permissions, tag ] From 836d1f4af79cb74b69178213c76ef66993903307 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 12 Jun 2024 13:42:43 -0400 Subject: [PATCH 0964/1571] test(pageserver): add test keyspace into collect_keyspace (#8016) Some test cases add random keys into the timeline, but it is not part of the `collect_keyspace`, this will cause compaction remove the keys. The pull request adds a field to supply extra keyspaces during unit tests. 
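A sketch of the intended usage inside a unit test (`tline`, `NUM_KEYS`, and the key constant are borrowed from the test context in the diff below; treat this as an illustration rather than a complete test):

```
// Keys placed directly by the test are not reported by collect_keyspace(),
// so compaction/GC would otherwise consider them garbage. Registering the
// range keeps the test keys alive for the rest of the test.
let test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
let mut test_key_end = test_key;
test_key_end.field6 = NUM_KEYS as u32;
tline.add_extra_test_dense_keyspace(KeySpace::single(test_key..test_key_end));
```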
--------- Signed-off-by: Alex Chi Z --- pageserver/src/pgdatadir_mapping.rs | 8 ++++++++ pageserver/src/tenant.rs | 8 ++++++-- pageserver/src/tenant/timeline.rs | 18 ++++++++++++++++++ 3 files changed, 32 insertions(+), 2 deletions(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 336d1c3fb8..25d00d6dfd 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -919,6 +919,14 @@ impl Timeline { result.add_key(AUX_FILES_KEY); } + #[cfg(test)] + { + let guard = self.extra_test_dense_keyspace.load(); + for kr in &guard.ranges { + result.add_range(kr.clone()); + } + } + Ok(( result.to_keyspace(), /* AUX sparse key space */ diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index f9ed6d3071..d556f72335 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -5264,6 +5264,9 @@ mod tests { let cancel = CancellationToken::new(); let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); + let mut test_key_end = test_key; + test_key_end.field6 = NUM_KEYS as u32; + tline.add_extra_test_dense_keyspace(KeySpace::single(test_key..test_key_end)); let mut keyspace = KeySpaceAccum::new(); @@ -6223,8 +6226,8 @@ mod tests { let cancel = CancellationToken::new(); - let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); - base_key.field1 = AUX_KEY_PREFIX; + let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap(); + assert_eq!(base_key.field1, AUX_KEY_PREFIX); // in case someone accidentally changed the prefix... let mut test_key = base_key; let mut lsn = Lsn(0x10); @@ -6329,6 +6332,7 @@ mod tests { Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN ) .await?; + tline.add_extra_test_dense_keyspace(KeySpace::single(base_key..(base_key_nonexist.next()))); let child = tenant .branch_timeline_test_with_layers( diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 54a4ceeaf3..28627e7911 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -426,6 +426,14 @@ pub struct Timeline { /// Indicate whether aux file v2 storage is enabled. pub(crate) last_aux_file_policy: AtomicAuxFilePolicy, + + /// Some test cases directly place keys into the timeline without actually modifying the directory + /// keys (i.e., DB_DIR). The test cases creating such keys will put the keyspaces here, so that + /// these keys won't get garbage-collected during compaction/GC. This field only modifies the dense + /// keyspace return value of `collect_keyspace`. For sparse keyspaces, use AUX keys for testing, and + /// in the future, add `extra_test_sparse_keyspace` if necessary. 
+ #[cfg(test)] + pub(crate) extra_test_dense_keyspace: ArcSwap, } pub struct WalReceiverInfo { @@ -2344,6 +2352,9 @@ impl Timeline { aux_file_size_estimator: AuxFileSizeEstimator::new(aux_file_metrics), last_aux_file_policy: AtomicAuxFilePolicy::new(aux_file_policy), + + #[cfg(test)] + extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())), }; result.repartition_threshold = result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE; @@ -5562,6 +5573,13 @@ impl Timeline { } Ok(layers) } + + #[cfg(test)] + pub(crate) fn add_extra_test_dense_keyspace(&self, ks: KeySpace) { + let mut keyspace = self.extra_test_dense_keyspace.load().as_ref().clone(); + keyspace.merge(&ks); + self.extra_test_dense_keyspace.store(Arc::new(keyspace)); + } } type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId); From ad0ab3b81bacc35e73a6902b6ce53db18f5e6293 Mon Sep 17 00:00:00 2001 From: MMeent Date: Wed, 12 Jun 2024 20:25:04 +0200 Subject: [PATCH 0965/1571] Fix query error in vm-image-spec.yaml (#8028) This query causes metrics exporter to complain about missing data because it can't find the correct column. Issue was introduced with https://github.com/neondatabase/neon/pull/7761 --- vm-image-spec.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 73a24c42d6..15f820bebd 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -304,7 +304,9 @@ files: - slot_name values: [restart_lsn] query: | - select slot_name, (restart_lsn - '0/0')::FLOAT8 from pg_replication_slots where slot_type = 'logical'; + select slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn + from pg_replication_slots + where slot_type = 'logical'; - metric_name: retained_wal type: gauge From dc2ab4407f8b9636a6e570818154f21fde14b9ce Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 13 Jun 2024 00:31:31 +0300 Subject: [PATCH 0966/1571] Fix on-demand SLRU download on standby starting at WAL segment boundary (#8031) If a standby is started right after switching to a new WAL segment, the request in the SLRU download request would point to the beginning of the segment (e.g. 0/5000000), while the not-modified-since LSN would point to just after the page header (e.g. 0/5000028). It's effectively the same position, as there cannot be any WAL records in between, but the pageserver rightly errors out on any request where the request LSN < not-modified since LSN. To fix, round down the not-modified since LSN to the beginning of the page like the request LSN. Fixes issue #8030 --- pgxn/neon/pagestore_smgr.c | 4 +-- .../regress/test_ondemand_slru_download.py | 30 +++++++++++++++++++ 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 6305f2ec92..8edaf65639 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -3112,12 +3112,12 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf request_lsn = UINT64_MAX; /* - * GetRedoStartLsn() returns LSN of basebackup. We know that the SLRU + * GetRedoStartLsn() returns LSN of the basebackup. We know that the SLRU * segment has not changed since the basebackup, because in order to * modify it, we would have had to download it already. And once * downloaded, we never evict SLRU segments from local disk. 
*/ - not_modified_since = GetRedoStartLsn(); + not_modified_since = nm_adjust_lsn(GetRedoStartLsn()); SlruKind kind; diff --git a/test_runner/regress/test_ondemand_slru_download.py b/test_runner/regress/test_ondemand_slru_download.py index 4af7dcdfc3..d6babe4393 100644 --- a/test_runner/regress/test_ondemand_slru_download.py +++ b/test_runner/regress/test_ondemand_slru_download.py @@ -129,3 +129,33 @@ def test_ondemand_download_replica(neon_env_builder: NeonEnvBuilder, shard_count cur_replica = conn_replica.cursor() cur_replica.execute("SELECT * FROM clogtest") assert cur_replica.fetchall() == [(1,), (3,)] + + +def test_ondemand_download_after_wal_switch(neon_env_builder: NeonEnvBuilder): + """ + Test on-demand SLRU download on standby, when starting right after + WAL segment switch. + + This is a repro for a bug in how the LSN at WAL page/segment + boundary was handled (https://github.com/neondatabase/neon/issues/8030) + """ + + tenant_conf = { + "lazy_slru_download": "true", + } + env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf) + + endpoint = env.endpoints.create_start("main") + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + # Create a test table + cur.execute("CREATE TABLE clogtest (id integer)") + cur.execute("INSERT INTO clogtest VALUES (1)") + + # Start standby at WAL segment boundary + cur.execute("SELECT pg_switch_wal()") + lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_insert_lsn()")) + _endpoint_at_lsn = env.endpoints.create_start( + branch_name="main", endpoint_id="ep-at-lsn", lsn=lsn + ) From fbccd1e6762f61686a810ba6b4654da7da1247d9 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Thu, 13 Jun 2024 14:42:26 +0200 Subject: [PATCH 0967/1571] Proxy process updated errors (#8026) ## Problem Respect errors classification from cplane --- proxy/src/console/messages.rs | 169 ++++++++++++++++++++++++++++- proxy/src/console/provider.rs | 133 ++++++++++------------- proxy/src/console/provider/neon.rs | 23 ++-- proxy/src/proxy/tests.rs | 20 ++-- proxy/src/proxy/wake_compute.rs | 101 ++++++++++++----- 5 files changed, 324 insertions(+), 122 deletions(-) diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs index 9869b95768..3b7d681a41 100644 --- a/proxy/src/console/messages.rs +++ b/proxy/src/console/messages.rs @@ -1,16 +1,183 @@ use measured::FixedCardinalityLabel; use serde::{Deserialize, Serialize}; -use std::fmt; +use std::fmt::{self, Display}; use crate::auth::IpPattern; use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}; +use crate::proxy::retry::ShouldRetry; /// Generic error response with human-readable description. /// Note that we can't always present it to user as is. #[derive(Debug, Deserialize)] pub struct ConsoleError { pub error: Box, + #[serde(skip)] + pub http_status_code: http::StatusCode, + pub status: Option, +} + +impl ConsoleError { + pub fn get_reason(&self) -> Reason { + self.status + .as_ref() + .and_then(|s| s.details.error_info.as_ref()) + .map(|e| e.reason) + .unwrap_or(Reason::Unknown) + } + pub fn get_user_facing_message(&self) -> String { + use super::provider::errors::REQUEST_FAILED; + self.status + .as_ref() + .and_then(|s| s.details.user_facing_message.as_ref()) + .map(|m| m.message.clone().into()) + .unwrap_or_else(|| { + // Ask @neondatabase/control-plane for review before adding more. + match self.http_status_code { + http::StatusCode::NOT_FOUND => { + // Status 404: failed to get a project-related resource. 
+ format!("{REQUEST_FAILED}: endpoint cannot be found") + } + http::StatusCode::NOT_ACCEPTABLE => { + // Status 406: endpoint is disabled (we don't allow connections). + format!("{REQUEST_FAILED}: endpoint is disabled") + } + http::StatusCode::LOCKED | http::StatusCode::UNPROCESSABLE_ENTITY => { + // Status 423: project might be in maintenance mode (or bad state), or quotas exceeded. + format!("{REQUEST_FAILED}: endpoint is temporarily unavailable. Check your quotas and/or contact our support.") + } + _ => REQUEST_FAILED.to_owned(), + } + }) + } +} + +impl Display for ConsoleError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let msg = self + .status + .as_ref() + .and_then(|s| s.details.user_facing_message.as_ref()) + .map(|m| m.message.as_ref()) + .unwrap_or_else(|| &self.error); + write!(f, "{}", msg) + } +} + +impl ShouldRetry for ConsoleError { + fn could_retry(&self) -> bool { + if self.status.is_none() || self.status.as_ref().unwrap().details.retry_info.is_none() { + // retry some temporary failures because the compute was in a bad state + // (bad request can be returned when the endpoint was in transition) + return match &self { + ConsoleError { + http_status_code: http::StatusCode::BAD_REQUEST, + .. + } => true, + // don't retry when quotas are exceeded + ConsoleError { + http_status_code: http::StatusCode::UNPROCESSABLE_ENTITY, + ref error, + .. + } => !error.contains("compute time quota of non-primary branches is exceeded"), + // locked can be returned when the endpoint was in transition + // or when quotas are exceeded. don't retry when quotas are exceeded + ConsoleError { + http_status_code: http::StatusCode::LOCKED, + ref error, + .. + } => { + !error.contains("quota exceeded") + && !error.contains("the limit for current plan reached") + } + _ => false, + }; + } + + // retry if the response has a retry delay + if let Some(retry_info) = self + .status + .as_ref() + .and_then(|s| s.details.retry_info.as_ref()) + { + retry_info.retry_delay_ms > 0 + } else { + false + } + } +} + +#[derive(Debug, Deserialize)] +pub struct Status { + pub code: Box, + pub message: Box, + pub details: Details, +} + +#[derive(Debug, Deserialize)] +pub struct Details { + pub error_info: Option, + pub retry_info: Option, + pub user_facing_message: Option, +} + +#[derive(Debug, Deserialize)] +pub struct ErrorInfo { + pub reason: Reason, + // Schema could also have `metadata` field, but it's not structured. Skip it for now. 
+} + +#[derive(Clone, Copy, Debug, Deserialize, Default)] +pub enum Reason { + #[serde(rename = "ROLE_PROTECTED")] + RoleProtected, + #[serde(rename = "RESOURCE_NOT_FOUND")] + ResourceNotFound, + #[serde(rename = "PROJECT_NOT_FOUND")] + ProjectNotFound, + #[serde(rename = "ENDPOINT_NOT_FOUND")] + EndpointNotFound, + #[serde(rename = "BRANCH_NOT_FOUND")] + BranchNotFound, + #[serde(rename = "RATE_LIMIT_EXCEEDED")] + RateLimitExceeded, + #[serde(rename = "NON_PRIMARY_BRANCH_COMPUTE_TIME_EXCEEDED")] + NonPrimaryBranchComputeTimeExceeded, + #[serde(rename = "ACTIVE_TIME_QUOTA_EXCEEDED")] + ActiveTimeQuotaExceeded, + #[serde(rename = "COMPUTE_TIME_QUOTA_EXCEEDED")] + ComputeTimeQuotaExceeded, + #[serde(rename = "WRITTEN_DATA_QUOTA_EXCEEDED")] + WrittenDataQuotaExceeded, + #[serde(rename = "DATA_TRANSFER_QUOTA_EXCEEDED")] + DataTransferQuotaExceeded, + #[serde(rename = "LOGICAL_SIZE_QUOTA_EXCEEDED")] + LogicalSizeQuotaExceeded, + #[default] + #[serde(other)] + Unknown, +} + +impl Reason { + pub fn is_not_found(&self) -> bool { + matches!( + self, + Reason::ResourceNotFound + | Reason::ProjectNotFound + | Reason::EndpointNotFound + | Reason::BranchNotFound + ) + } +} + +#[derive(Debug, Deserialize)] +pub struct RetryInfo { + pub retry_delay_ms: u64, +} + +#[derive(Debug, Deserialize)] +pub struct UserFacingMessage { + pub message: Box, } /// Response which holds client's auth secret, e.g. [`crate::scram::ServerSecret`]. diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 634ec9042c..915c2ee7a6 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -25,8 +25,8 @@ use tracing::info; pub mod errors { use crate::{ + console::messages::{self, ConsoleError}, error::{io_error, ReportableError, UserFacingError}, - http, proxy::retry::ShouldRetry, }; use thiserror::Error; @@ -34,17 +34,14 @@ pub mod errors { use super::ApiLockError; /// A go-to error message which doesn't leak any detail. - const REQUEST_FAILED: &str = "Console request failed"; + pub const REQUEST_FAILED: &str = "Console request failed"; /// Common console API error. #[derive(Debug, Error)] pub enum ApiError { /// Error returned by the console itself. - #[error("{REQUEST_FAILED} with {}: {}", .status, .text)] - Console { - status: http::StatusCode, - text: Box, - }, + #[error("{REQUEST_FAILED} with {0}")] + Console(ConsoleError), /// Various IO errors like broken pipe or malformed payload. #[error("{REQUEST_FAILED}: {0}")] @@ -53,11 +50,11 @@ pub mod errors { impl ApiError { /// Returns HTTP status code if it's the reason for failure. - pub fn http_status_code(&self) -> Option { + pub fn get_reason(&self) -> messages::Reason { use ApiError::*; match self { - Console { status, .. } => Some(*status), - _ => None, + Console(e) => e.get_reason(), + _ => messages::Reason::Unknown, } } } @@ -67,22 +64,7 @@ pub mod errors { use ApiError::*; match self { // To minimize risks, only select errors are forwarded to users. - // Ask @neondatabase/control-plane for review before adding more. - Console { status, .. } => match *status { - http::StatusCode::NOT_FOUND => { - // Status 404: failed to get a project-related resource. - format!("{REQUEST_FAILED}: endpoint cannot be found") - } - http::StatusCode::NOT_ACCEPTABLE => { - // Status 406: endpoint is disabled (we don't allow connections). 
- format!("{REQUEST_FAILED}: endpoint is disabled") - } - http::StatusCode::LOCKED | http::StatusCode::UNPROCESSABLE_ENTITY => { - // Status 423: project might be in maintenance mode (or bad state), or quotas exceeded. - format!("{REQUEST_FAILED}: endpoint is temporarily unavailable. Check your quotas and/or contact our support.") - } - _ => REQUEST_FAILED.to_owned(), - }, + Console(c) => c.get_user_facing_message(), _ => REQUEST_FAILED.to_owned(), } } @@ -91,29 +73,56 @@ pub mod errors { impl ReportableError for ApiError { fn get_error_kind(&self) -> crate::error::ErrorKind { match self { - ApiError::Console { - status: http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE, - .. - } => crate::error::ErrorKind::User, - ApiError::Console { - status: http::StatusCode::UNPROCESSABLE_ENTITY, - text, - } if text.contains("compute time quota of non-primary branches is exceeded") => { - crate::error::ErrorKind::User + ApiError::Console(e) => { + use crate::error::ErrorKind::*; + match e.get_reason() { + crate::console::messages::Reason::RoleProtected => User, + crate::console::messages::Reason::ResourceNotFound => User, + crate::console::messages::Reason::ProjectNotFound => User, + crate::console::messages::Reason::EndpointNotFound => User, + crate::console::messages::Reason::BranchNotFound => User, + crate::console::messages::Reason::RateLimitExceeded => ServiceRateLimit, + crate::console::messages::Reason::NonPrimaryBranchComputeTimeExceeded => { + User + } + crate::console::messages::Reason::ActiveTimeQuotaExceeded => User, + crate::console::messages::Reason::ComputeTimeQuotaExceeded => User, + crate::console::messages::Reason::WrittenDataQuotaExceeded => User, + crate::console::messages::Reason::DataTransferQuotaExceeded => User, + crate::console::messages::Reason::LogicalSizeQuotaExceeded => User, + crate::console::messages::Reason::Unknown => match &e { + ConsoleError { + http_status_code: + http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE, + .. + } => crate::error::ErrorKind::User, + ConsoleError { + http_status_code: http::StatusCode::UNPROCESSABLE_ENTITY, + error, + .. + } if error.contains( + "compute time quota of non-primary branches is exceeded", + ) => + { + crate::error::ErrorKind::User + } + ConsoleError { + http_status_code: http::StatusCode::LOCKED, + error, + .. + } if error.contains("quota exceeded") + || error.contains("the limit for current plan reached") => + { + crate::error::ErrorKind::User + } + ConsoleError { + http_status_code: http::StatusCode::TOO_MANY_REQUESTS, + .. + } => crate::error::ErrorKind::ServiceRateLimit, + ConsoleError { .. } => crate::error::ErrorKind::ControlPlane, + }, + } } - ApiError::Console { - status: http::StatusCode::LOCKED, - text, - } if text.contains("quota exceeded") - || text.contains("the limit for current plan reached") => - { - crate::error::ErrorKind::User - } - ApiError::Console { - status: http::StatusCode::TOO_MANY_REQUESTS, - .. - } => crate::error::ErrorKind::ServiceRateLimit, - ApiError::Console { .. } => crate::error::ErrorKind::ControlPlane, ApiError::Transport(_) => crate::error::ErrorKind::ControlPlane, } } @@ -124,31 +133,7 @@ pub mod errors { match self { // retry some transport errors Self::Transport(io) => io.could_retry(), - // retry some temporary failures because the compute was in a bad state - // (bad request can be returned when the endpoint was in transition) - Self::Console { - status: http::StatusCode::BAD_REQUEST, - .. 
- } => true, - // don't retry when quotas are exceeded - Self::Console { - status: http::StatusCode::UNPROCESSABLE_ENTITY, - ref text, - } => !text.contains("compute time quota of non-primary branches is exceeded"), - // locked can be returned when the endpoint was in transition - // or when quotas are exceeded. don't retry when quotas are exceeded - Self::Console { - status: http::StatusCode::LOCKED, - ref text, - } => { - // written data quota exceeded - // data transfer quota exceeded - // compute time quota exceeded - // logical size quota exceeded - !text.contains("quota exceeded") - && !text.contains("the limit for current plan reached") - } - _ => false, + Self::Console(e) => e.could_retry(), } } } @@ -509,7 +494,7 @@ impl ApiLocks { self.metrics .semaphore_acquire_seconds .observe(now.elapsed().as_secs_f64()); - + info!("acquired permit {:?}", now.elapsed().as_secs_f64()); Ok(WakeComputePermit { permit: permit? }) } diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index d72229b029..41bd2f4956 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -94,12 +94,14 @@ impl Api { let body = match parse_body::(response).await { Ok(body) => body, // Error 404 is special: it's ok not to have a secret. - Err(e) => match e.http_status_code() { - Some(http::StatusCode::NOT_FOUND) => { + // TODO(anna): retry + Err(e) => { + if e.get_reason().is_not_found() { return Ok(AuthInfo::default()); + } else { + return Err(e.into()); } - _otherwise => return Err(e.into()), - }, + } }; let secret = if body.role_secret.is_empty() { @@ -328,19 +330,24 @@ async fn parse_body serde::Deserialize<'a>>( info!("request succeeded, processing the body"); return Ok(response.json().await?); } + let s = response.bytes().await?; + // Log plaintext to be able to detect, whether there are some cases not covered by the error struct. + info!("response_error plaintext: {:?}", s); // Don't throw an error here because it's not as important // as the fact that the request itself has failed. 
- let body = response.json().await.unwrap_or_else(|e| { + let mut body = serde_json::from_slice(&s).unwrap_or_else(|e| { warn!("failed to parse error body: {e}"); ConsoleError { error: "reason unclear (malformed error message)".into(), + http_status_code: status, + status: None, } }); + body.http_status_code = status; - let text = body.error; - error!("console responded with an error ({status}): {text}"); - Err(ApiError::Console { status, text }) + error!("console responded with an error ({status}): {body:?}"); + Err(ApiError::Console(body)) } fn parse_host_port(input: &str) -> Option<(&str, u16)> { diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index ad48af0093..96683511fe 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -12,7 +12,7 @@ use crate::auth::backend::{ }; use crate::config::{CertResolver, RetryConfig}; use crate::console::caches::NodeInfoCache; -use crate::console::messages::MetricsAuxInfo; +use crate::console::messages::{ConsoleError, MetricsAuxInfo}; use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend}; use crate::console::{self, CachedNodeInfo, NodeInfo}; use crate::error::ErrorKind; @@ -484,18 +484,20 @@ impl TestBackend for TestConnectMechanism { match action { ConnectAction::Wake => Ok(helper_create_cached_node_info(self.cache)), ConnectAction::WakeFail => { - let err = console::errors::ApiError::Console { - status: http::StatusCode::FORBIDDEN, - text: "TEST".into(), - }; + let err = console::errors::ApiError::Console(ConsoleError { + http_status_code: http::StatusCode::FORBIDDEN, + error: "TEST".into(), + status: None, + }); assert!(!err.could_retry()); Err(console::errors::WakeComputeError::ApiError(err)) } ConnectAction::WakeRetry => { - let err = console::errors::ApiError::Console { - status: http::StatusCode::BAD_REQUEST, - text: "TEST".into(), - }; + let err = console::errors::ApiError::Console(ConsoleError { + http_status_code: http::StatusCode::BAD_REQUEST, + error: "TEST".into(), + status: None, + }); assert!(err.could_retry()); Err(console::errors::WakeComputeError::ApiError(err)) } diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index 94b03e1ccc..c166cf4389 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -1,4 +1,5 @@ use crate::config::RetryConfig; +use crate::console::messages::ConsoleError; use crate::console::{errors::WakeComputeError, provider::CachedNodeInfo}; use crate::context::RequestMonitoring; use crate::metrics::{ @@ -88,36 +89,76 @@ fn report_error(e: &WakeComputeError, retry: bool) { let kind = match e { WakeComputeError::BadComputeAddress(_) => WakeupFailureKind::BadComputeAddress, WakeComputeError::ApiError(ApiError::Transport(_)) => WakeupFailureKind::ApiTransportError, - WakeComputeError::ApiError(ApiError::Console { - status: StatusCode::LOCKED, - ref text, - }) if text.contains("written data quota exceeded") - || text.contains("the limit for current plan reached") => - { - WakeupFailureKind::QuotaExceeded - } - WakeComputeError::ApiError(ApiError::Console { - status: StatusCode::UNPROCESSABLE_ENTITY, - ref text, - }) if text.contains("compute time quota of non-primary branches is exceeded") => { - WakeupFailureKind::QuotaExceeded - } - WakeComputeError::ApiError(ApiError::Console { - status: StatusCode::LOCKED, - .. - }) => WakeupFailureKind::ApiConsoleLocked, - WakeComputeError::ApiError(ApiError::Console { - status: StatusCode::BAD_REQUEST, - .. 
- }) => WakeupFailureKind::ApiConsoleBadRequest, - WakeComputeError::ApiError(ApiError::Console { status, .. }) - if status.is_server_error() => - { - WakeupFailureKind::ApiConsoleOtherServerError - } - WakeComputeError::ApiError(ApiError::Console { .. }) => { - WakeupFailureKind::ApiConsoleOtherError - } + WakeComputeError::ApiError(ApiError::Console(e)) => match e.get_reason() { + crate::console::messages::Reason::RoleProtected => { + WakeupFailureKind::ApiConsoleBadRequest + } + crate::console::messages::Reason::ResourceNotFound => { + WakeupFailureKind::ApiConsoleBadRequest + } + crate::console::messages::Reason::ProjectNotFound => { + WakeupFailureKind::ApiConsoleBadRequest + } + crate::console::messages::Reason::EndpointNotFound => { + WakeupFailureKind::ApiConsoleBadRequest + } + crate::console::messages::Reason::BranchNotFound => { + WakeupFailureKind::ApiConsoleBadRequest + } + crate::console::messages::Reason::RateLimitExceeded => { + WakeupFailureKind::ApiConsoleLocked + } + crate::console::messages::Reason::NonPrimaryBranchComputeTimeExceeded => { + WakeupFailureKind::QuotaExceeded + } + crate::console::messages::Reason::ActiveTimeQuotaExceeded => { + WakeupFailureKind::QuotaExceeded + } + crate::console::messages::Reason::ComputeTimeQuotaExceeded => { + WakeupFailureKind::QuotaExceeded + } + crate::console::messages::Reason::WrittenDataQuotaExceeded => { + WakeupFailureKind::QuotaExceeded + } + crate::console::messages::Reason::DataTransferQuotaExceeded => { + WakeupFailureKind::QuotaExceeded + } + crate::console::messages::Reason::LogicalSizeQuotaExceeded => { + WakeupFailureKind::QuotaExceeded + } + crate::console::messages::Reason::Unknown => match e { + ConsoleError { + http_status_code: StatusCode::LOCKED, + ref error, + .. + } if error.contains("written data quota exceeded") + || error.contains("the limit for current plan reached") => + { + WakeupFailureKind::QuotaExceeded + } + ConsoleError { + http_status_code: StatusCode::UNPROCESSABLE_ENTITY, + ref error, + .. + } if error.contains("compute time quota of non-primary branches is exceeded") => { + WakeupFailureKind::QuotaExceeded + } + ConsoleError { + http_status_code: StatusCode::LOCKED, + .. + } => WakeupFailureKind::ApiConsoleLocked, + ConsoleError { + http_status_code: StatusCode::BAD_REQUEST, + .. + } => WakeupFailureKind::ApiConsoleBadRequest, + ConsoleError { + http_status_code, .. + } if http_status_code.is_server_error() => { + WakeupFailureKind::ApiConsoleOtherServerError + } + ConsoleError { .. } => WakeupFailureKind::ApiConsoleOtherError, + }, + }, WakeComputeError::TooManyConnections => WakeupFailureKind::ApiConsoleLocked, WakeComputeError::TooManyConnectionAttempts(_) => WakeupFailureKind::TimeoutError, }; From d25f7e3dd575878df49925bead4c797a61757751 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Thu, 13 Jun 2024 09:44:37 -0400 Subject: [PATCH 0968/1571] test(pageserver): add test wal record for unit testing (#8015) https://github.com/neondatabase/neon/issues/8002 We need mock WAL record to make it easier to write unit tests. This pull request adds such a record. It has `clear` flag and `append` field. The tests for legacy-enhanced compaction are not modified yet and will be part of the next pull request. 
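A sketch of how a unit test composes the new record (the `get_key` helper, `tline`, and `ctx` come from the test context in the diff below):

```
// Build a key history out of the test record: starting from the image "0x10",
// two appends replay through walredo to "0x10,0x20,0x30".
let delta = vec![
    (get_key(2), Lsn(0x10), Value::Image("0x10".into())),
    (get_key(2), Lsn(0x20), Value::WalRecord(NeonWalRecord::wal_append(",0x20"))),
    (get_key(2), Lsn(0x30), Value::WalRecord(NeonWalRecord::wal_append(",0x30"))),
];
// wal_clear() empties the page before appending; wal_init() additionally marks
// the record as not needing any earlier history to reconstruct.
```

After feeding `delta` into a test timeline, reading the key back should reconstruct the concatenated value, e.g. `tline.get(get_key(2), Lsn(0x50), &ctx).await?` returning `"0x10,0x20,0x30"`.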
--------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant.rs | 80 +++++++++++++++++++++++++++- pageserver/src/walrecord.rs | 43 ++++++++++++++- pageserver/src/walredo/apply_neon.rs | 14 +++++ 3 files changed, 134 insertions(+), 3 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index d556f72335..0bd3ece2e3 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -4041,6 +4041,7 @@ mod tests { use crate::repository::{Key, Value}; use crate::tenant::harness::*; use crate::tenant::timeline::CompactFlags; + use crate::walrecord::NeonWalRecord; use crate::DEFAULT_PG_VERSION; use bytes::{Bytes, BytesMut}; use hex_literal::hex; @@ -6705,8 +6706,8 @@ mod tests { } #[tokio::test] - async fn test_simple_bottom_most_compaction() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_simple_bottom_most_compaction")?; + async fn test_simple_bottom_most_compaction_images() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_simple_bottom_most_compaction_images")?; let (tenant, ctx) = harness.load().await; fn get_key(id: u32) -> Key { @@ -6861,4 +6862,79 @@ mod tests { Ok(()) } + + #[tokio::test] + async fn test_neon_test_record() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_neon_test_record")?; + let (tenant, ctx) = harness.load().await; + + fn get_key(id: u32) -> Key { + // using aux key here b/c they are guaranteed to be inside `collect_keyspace`. + let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + let delta1 = vec![ + ( + get_key(1), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append(",0x20")), + ), + ( + get_key(1), + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append(",0x30")), + ), + (get_key(2), Lsn(0x10), Value::Image("0x10".into())), + ( + get_key(2), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append(",0x20")), + ), + ( + get_key(2), + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append(",0x30")), + ), + (get_key(3), Lsn(0x10), Value::Image("0x10".into())), + ( + get_key(3), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_clear()), + ), + (get_key(4), Lsn(0x10), Value::Image("0x10".into())), + ( + get_key(4), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_init()), + ), + ]; + let image1 = vec![(get_key(1), "0x10".into())]; + + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + vec![delta1], // delta layers + vec![(Lsn(0x10), image1)], // image layers + Lsn(0x50), + ) + .await?; + + assert_eq!( + tline.get(get_key(1), Lsn(0x50), &ctx).await?, + Bytes::from_static(b"0x10,0x20,0x30") + ); + assert_eq!( + tline.get(get_key(2), Lsn(0x50), &ctx).await?, + Bytes::from_static(b"0x10,0x20,0x30") + ); + // assert_eq!(tline.get(get_key(3), Lsn(0x50), &ctx).await?, Bytes::new()); + // assert_eq!(tline.get(get_key(4), Lsn(0x50), &ctx).await?, Bytes::new()); + + Ok(()) + } } diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index 205f8dee4d..62a3a91b0b 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -49,6 +49,19 @@ pub enum NeonWalRecord { file_path: String, content: Option, }, + + /// A testing record for unit testing purposes. It supports append data to an existing image, or clear it. + #[cfg(test)] + Test { + /// Append a string to the image. + append: String, + /// Clear the image before appending. + clear: bool, + /// Treat this record as an init record. 
`clear` should be set to true if this field is set + /// to true. This record does not need the history WALs to reconstruct. See [`NeonWalRecord::will_init`] and + /// its references in `timeline.rs`. + will_init: bool, + }, } impl NeonWalRecord { @@ -58,11 +71,39 @@ impl NeonWalRecord { // If you change this function, you'll also need to change ValueBytes::will_init match self { NeonWalRecord::Postgres { will_init, rec: _ } => *will_init, - + #[cfg(test)] + NeonWalRecord::Test { will_init, .. } => *will_init, // None of the special neon record types currently initialize the page _ => false, } } + + #[cfg(test)] + pub(crate) fn wal_append(s: impl AsRef) -> Self { + Self::Test { + append: s.as_ref().to_string(), + clear: false, + will_init: false, + } + } + + #[cfg(test)] + pub(crate) fn wal_clear() -> Self { + Self::Test { + append: "".to_string(), + clear: true, + will_init: false, + } + } + + #[cfg(test)] + pub(crate) fn wal_init() -> Self { + Self::Test { + append: "".to_string(), + clear: true, + will_init: true, + } + } } /// DecodedBkpBlock represents per-page data contained in a WAL record. diff --git a/pageserver/src/walredo/apply_neon.rs b/pageserver/src/walredo/apply_neon.rs index 24e8d8b01c..facf01004c 100644 --- a/pageserver/src/walredo/apply_neon.rs +++ b/pageserver/src/walredo/apply_neon.rs @@ -244,6 +244,20 @@ pub(crate) fn apply_in_neon( let mut writer = page.writer(); dir.ser_into(&mut writer)?; } + #[cfg(test)] + NeonWalRecord::Test { + append, + clear, + will_init, + } => { + if *will_init { + assert!(*clear, "init record must be clear to ensure correctness"); + } + if *clear { + page.clear(); + } + page.put_slice(append.as_bytes()); + } } Ok(()) } From 82719542c617a74850e078c99c01a3c7f9e32beb Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 13 Jun 2024 20:20:47 +0200 Subject: [PATCH 0969/1571] fix: vectored get returns incorrect result on inexact materialized page cache hit (#8050) # Problem Suppose our vectored get starts with an inexact materialized page cache hit ("cached lsn") that is shadowed by a newer image layer image layer. Like so: ``` +-+ < delta layer | | -|-|----- < image layer | | | | -|-|----- < cached lsn for requested key +_+ ``` The correct visitation order is 1. inmemory layers 2. delta layer records in LSN range `[image_layer.lsn, oldest_inmemory_layer.lsn_range.start)` 3. image layer However, the vectored get code, when it visits the delta layer, it (incorrectly!) returns with state `Complete`. The reason why it returns is that it calls `on_lsn_advanced` with `self.lsn_range.start`, i.e., the layer's LSN range. Instead, it should use `lsn_range.start`, i.e., the LSN range from the correct visitation order listed above. # Solution Use `lsn_range.start` instead of `self.lsn_range.start`. 
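To illustrate the fix, here is a minimal sketch with LSNs modeled as plain integers; `DeltaLayerSketch` and `visit` are made-up names for illustration, not the actual `DeltaLayerInner` API. The point is that read progress must be reported relative to the planned visitation range, not the layer's own LSN range.

```
use std::ops::Range;

/// Illustrative stand-in for a delta layer, with LSNs as plain u64s.
struct DeltaLayerSketch {
    /// The layer's own LSN range, e.g. [10, 40).
    lsn_range: Range<u64>,
}

impl DeltaLayerSketch {
    /// `planned` is the visitation range chosen by the caller, e.g.
    /// [image_layer_lsn, oldest_inmemory_layer_start) = [20, 30).
    /// Returns the LSN the read has advanced down to.
    fn visit(&self, planned: Range<u64>) -> u64 {
        debug_assert!(self.lsn_range.start <= planned.start);
        // Buggy version: reporting `self.lsn_range.start` (10) claims the read
        // already covers LSNs below the shadowing image layer at 20, so the
        // image layer is never visited and the read completes too early.
        //
        // Fixed version: only advance to the start of the planned range.
        planned.start
    }
}

fn main() {
    let layer = DeltaLayerSketch { lsn_range: 10..40 };
    // After visiting the delta layer over [20, 30), the read still needs the
    // image layer at LSN 20, so it must not advance below 20.
    assert_eq!(layer.visit(20..30), 20);
}
```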
# Refs discovered by & fixes https://github.com/neondatabase/neon/issues/6967 Co-authored-by: Vlad Lazar --- pageserver/src/tenant/storage_layer/delta_layer.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index eb7cf81643..5e01ecd71d 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -219,7 +219,6 @@ pub struct DeltaLayerInner { // values copied from summary index_start_blk: u32, index_root_blk: u32, - lsn_range: Range, file: VirtualFile, file_id: FileId, @@ -785,7 +784,6 @@ impl DeltaLayerInner { file_id, index_start_blk: actual_summary.index_start_blk, index_root_blk: actual_summary.index_root_blk, - lsn_range: actual_summary.lsn_range, max_vectored_read_bytes, })) } @@ -911,7 +909,7 @@ impl DeltaLayerInner { let reads = Self::plan_reads( &keyspace, - lsn_range, + lsn_range.clone(), data_end_offset, index_reader, planner, @@ -924,7 +922,7 @@ impl DeltaLayerInner { self.do_reads_and_update_state(reads, reconstruct_state, ctx) .await; - reconstruct_state.on_lsn_advanced(&keyspace, self.lsn_range.start); + reconstruct_state.on_lsn_advanced(&keyspace, lsn_range.start); Ok(()) } From 0c3e3a8667294a3dc345b0f03364aa359a5154de Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Thu, 13 Jun 2024 10:31:58 -0500 Subject: [PATCH 0970/1571] Set application_name for internal connections to computes This will help when analyzing the origins of connections to a compute like in [0]. [0]: https://github.com/neondatabase/cloud/issues/14247 --- compute_tools/src/bin/compute_ctl.rs | 2 +- vm-image-spec.yaml | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 9295f091d5..7bf5db5a57 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -735,7 +735,7 @@ fn cli() -> clap::Command { Arg::new("filecache-connstr") .long("filecache-connstr") .default_value( - "host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable", + "host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable application_name=vm-monitor", ) .value_name("FILECACHE_CONNSTR"), ) diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 15f820bebd..99164645a7 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -18,7 +18,7 @@ commands: - name: postgres-exporter user: nobody sysvInitAction: respawn - shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres" /bin/postgres_exporter' + shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter' - name: sql-exporter user: nobody sysvInitAction: respawn @@ -93,7 +93,7 @@ files: target: # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL) # the schema gets dropped or replaced to match the driver expected DSN format. - data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable' + data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter' # Collectors (referenced by name) to execute on the target. # Glob patterns are supported (see for syntax). @@ -128,7 +128,7 @@ files: target: # Data source name always has a URI schema that matches the driver name. In some cases (e.g. 
MySQL) # the schema gets dropped or replaced to match the driver expected DSN format. - data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable' + data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter_autoscaling' # Collectors (referenced by name) to execute on the target. # Glob patterns are supported (see for syntax). From f67010109f488cd20e7aae9b8feaf0c2016b63e9 Mon Sep 17 00:00:00 2001 From: James Broadhead Date: Fri, 14 Jun 2024 09:17:43 +0100 Subject: [PATCH 0971/1571] extensions: pgvector-0.7.2 (#8037) Update pgvector to 0.7.2 Purely mechanical update to pgvector.patch, just as a place to start from --- Dockerfile.compute-node | 6 +++--- patches/pgvector.patch | 32 ++++++++------------------------ 2 files changed, 11 insertions(+), 27 deletions(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index a86fdd0bc3..3a73ac71b0 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -246,8 +246,8 @@ COPY patches/pgvector.patch /pgvector.patch # By default, pgvector Makefile uses `-march=native`. We don't want that, # because we build the images on different machines than where we run them. # Pass OPTFLAGS="" to remove it. -RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.1.tar.gz -O pgvector.tar.gz && \ - echo "fe6c8cb4e0cd1a8cb60f5badf9e1701e0fcabcfc260931c26d01e155c4dd21d1 pgvector.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.2.tar.gz -O pgvector.tar.gz && \ + echo "617fba855c9bcb41a2a9bc78a78567fd2e147c72afd5bf9d37b31b9591632b30 pgvector.tar.gz" | sha256sum --check && \ mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \ patch -p1 < /pgvector.patch && \ make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -979,7 +979,7 @@ RUN cd /ext-src/ && for f in *.tar.gz; \ do echo $f; dname=$(echo $f | sed 's/\.tar.*//')-src; \ rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \ || exit 1; rm -f $f; done -RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch +RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch # cmake is required for the h3 test RUN apt-get update && apt-get install -y cmake RUN patch -p1 < /ext-src/pg_hintplan.patch diff --git a/patches/pgvector.patch b/patches/pgvector.patch index 84ac6644c5..3e1ffcaaaf 100644 --- a/patches/pgvector.patch +++ b/patches/pgvector.patch @@ -1,19 +1,8 @@ -From 0b0194a57bd0f3598bd57dbedd0df3932330169d Mon Sep 17 00:00:00 2001 -From: Heikki Linnakangas -Date: Fri, 2 Feb 2024 22:26:45 +0200 -Subject: [PATCH 1/1] Make v0.6.0 work with Neon - -Now that the WAL-logging happens as a separate step at the end of the -build, we need a few neon-specific hints to make it work. 
---- - src/hnswbuild.c | 36 ++++++++++++++++++++++++++++++++++++ - 1 file changed, 36 insertions(+) - diff --git a/src/hnswbuild.c b/src/hnswbuild.c -index 680789b..ec54dea 100644 +index dcfb2bd..d5189ee 100644 --- a/src/hnswbuild.c +++ b/src/hnswbuild.c -@@ -840,9 +840,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc) +@@ -860,9 +860,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc) hnswarea = shm_toc_lookup(toc, PARALLEL_KEY_HNSW_AREA, false); @@ -31,7 +20,7 @@ index 680789b..ec54dea 100644 /* Close relations within worker */ index_close(indexRel, indexLockmode); table_close(heapRel, heapLockmode); -@@ -1089,13 +1097,41 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, +@@ -1117,12 +1125,38 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, SeedRandom(42); #endif @@ -43,14 +32,13 @@ index 680789b..ec54dea 100644 BuildGraph(buildstate, forkNum); +- if (RelationNeedsWAL(index) || forkNum == INIT_FORKNUM) +#ifdef NEON_SMGR + smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index)); +#endif + - if (RelationNeedsWAL(index)) -+ { - log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocks(index), true); - ++ if (RelationNeedsWAL(index) || forkNum == INIT_FORKNUM) { + log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocksInFork(index, forkNum), true); +#ifdef NEON_SMGR + { +#if PG_VERSION_NUM >= 160000 @@ -60,7 +48,7 @@ index 680789b..ec54dea 100644 +#endif + + SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator, -+ MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index)); ++ MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index)); + SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM); + } +#endif @@ -69,10 +57,6 @@ index 680789b..ec54dea 100644 +#ifdef NEON_SMGR + smgr_end_unlogged_build(RelationGetSmgr(index)); +#endif -+ + FreeBuildState(buildstate); } - --- -2.39.2 - From 425eed24e896a99d0ed03d118ee07fb0ae339bde Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 14 Jun 2024 09:39:31 +0100 Subject: [PATCH 0972/1571] pageserver: refine shutdown handling in secondary download (#8052) ## Problem Some code paths during secondary mode download are returning Ok() rather than UpdateError::Cancelled. This is functionally okay, but it means that the end of TenantDownloader::download has a sanity check that the progress is 100% on success, and prints a "Correcting drift..." warning if not. This warning can be emitted in a test, e.g. https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8049/9503642976/index.html#/testresult/fff1624ba6adae9e. ## Summary of changes - In secondary download cancellation paths, use Err(UpdateError::Cancelled) rather than Ok(), so that we drop out of the download function and do not reach the progress sanity check. --- pageserver/src/tenant/secondary/downloader.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 62803c7838..24176ecf19 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -513,7 +513,7 @@ impl<'a> TenantDownloader<'a> { // cover our access to local storage. 
let Ok(_guard) = self.secondary_state.gate.enter() else { // Shutting down - return Ok(()); + return Err(UpdateError::Cancelled); }; let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); @@ -846,7 +846,7 @@ impl<'a> TenantDownloader<'a> { for layer in timeline.layers { if self.secondary_state.cancel.is_cancelled() { tracing::debug!("Cancelled -- dropping out of layer loop"); - return Ok(()); + return Err(UpdateError::Cancelled); } // Existing on-disk layers: just update their access time. From 789196572e5e2921371a053ae8cd70dd31b27c5b Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 14 Jun 2024 11:51:12 +0300 Subject: [PATCH 0973/1571] Fix test_replica_query_race flakiness (#8038) This failed once with `relation "test" does not exist` when trying to run the query on the standby. It's possible that the standby is started before the CREATE TABLE is processed in the pageserver, and the standby opens up for queries before it has received the CREATE TABLE transaction from the primary. To fix, wait for the standby to catch up to the primary before starting to run the queries. https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8025/9483658488/index.html --- test_runner/regress/test_hot_standby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index 1d1b2fb485..8edc8c554c 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -300,7 +300,7 @@ def test_replica_query_race(neon_simple_env: NeonEnv): p_cur.execute("CREATE TABLE test AS SELECT 0 AS counter") standby_ep = env.endpoints.new_replica_start(origin=primary_ep, endpoint_id="standby") - time.sleep(1) + wait_replica_caughtup(primary_ep, standby_ep) # In primary, run a lot of UPDATEs on a single page finished = False From edc900028e5440bb500d16c1a05cef554d92f692 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 14 Jun 2024 10:24:13 +0100 Subject: [PATCH 0974/1571] CI: Update outdated GitHub Actions (#8042) ## Problem We have some amount of outdated action in the CI pipeline, GitHub complains about some of them. 
## Summary of changes - Update `actions/checkout@1` (a really old one) in `vm-compute-node-image` - Update `actions/checkout@3` in `build-build-tools-image` - Update `docker/setup-buildx-action` in all workflows / jobs, it was downgraded in https://github.com/neondatabase/neon/pull/7445, but it it seems it works fine now --- .github/workflows/build-build-tools-image.yml | 4 ++-- .github/workflows/build_and_test.yml | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index 9aacb09d10..da1efe9571 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -55,7 +55,7 @@ jobs: exit 1 fi - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings # The default value is ~/.docker @@ -64,7 +64,7 @@ jobs: mkdir -p /tmp/.docker-custom echo DOCKER_CONFIG=/tmp/.docker-custom >> $GITHUB_ENV - - uses: docker/setup-buildx-action@v2 + - uses: docker/setup-buildx-action@v3 - uses: docker/login-action@v2 with: diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 71ca7329ee..1b433a7033 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -744,7 +744,7 @@ jobs: run: | mkdir -p .docker-custom echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV - - uses: docker/setup-buildx-action@v2 + - uses: docker/setup-buildx-action@v3 - uses: docker/login-action@v3 with: @@ -822,11 +822,11 @@ jobs: run: | mkdir -p .docker-custom echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV - - uses: docker/setup-buildx-action@v2 + - uses: docker/setup-buildx-action@v3 with: # Disable parallelism for docker buildkit. # As we already build everything with `make -j$(nproc)`, running it in additional level of parallelisam blows up the Runner. - config-inline: | + buildkitd-config-inline: | [worker.oci] max-parallelism = 1 @@ -858,7 +858,7 @@ jobs: cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max tags: | neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} - + - name: Build neon extensions test image if: matrix.version == 'v16' uses: docker/build-push-action@v5 @@ -965,7 +965,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v1 + uses: actions/checkout@v4 with: fetch-depth: 0 From 6843fd8f89a24aa08ad71bcabbb320a3211c979e Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 14 Jun 2024 10:37:30 +0100 Subject: [PATCH 0975/1571] storage controller: always wait for tenant detach before delete (#8049) ## Problem This test could fail with a timeout waiting for tenant deletions. Tenant deletions could get tripped up on nodes transitioning from offline to online at the moment of the deletion. In a previous reconciliation, the reconciler would skip detaching a particular location because the node was offline, but then when we do the delete the node is marked as online and can be picked as the node to use for issuing a deletion request. This hits the "Unexpectedly still attached path", which would still work if the caller kept calling DELETE, but if a caller does a Delete,get,get,get poll, then it doesn't work because the GET calls fail after we've marked the tenant as detached. 
## Summary of changes Fix the undesirable storage controller behavior highlighted by this test failure: - Change tenant deletion flow to _always_ wait for reconciliation to succeed: it was unsound to proceed and return 202 if something was still attached, because after the 202 callers can no longer GET the tenant. Stabilize the test: - Add a reconcile_until_idle to the test, so that it will not have reconciliations running in the background while we mark a node online. This test is not meant to be a chaos test: we should test that kind of complexity elsewhere. - This reconcile_until_idle also fixes another failure mode where the test might see a None for a tenant location because a reconcile was mutating it (https://neon-github-public-dev.s3.amazonaws.com/reports/pr-7288/9500177581/index.html#suites/8fc5d1648d2225380766afde7c428d81/4acece42ae00c442/) It remains the case that a motivated tester could produce a situation where a DELETE gives a 500, when precisely the wrong node transitions from offline to available at the precise moment of a deletion (but the 500 is better than returning 202 and then failing all subsequent GETs). Note that nodes don't go through the offline state during normal restarts, so this is super rare. We should eventually fix this by making DELETE to the pageserver implicitly detach the tenant if it's attached, but that should wait until nobody is using the legacy-style deletes (the ones that use 202 + polling) --- storage_controller/src/service.rs | 26 +++++++++++-------- .../regress/test_storage_controller.py | 3 +++ 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 1e81b5c5a2..cf6a95bf0b 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -2409,11 +2409,17 @@ impl Service { (detach_waiters, shard_ids, node.clone()) }; - if let Err(e) = self.await_waiters(detach_waiters, RECONCILE_TIMEOUT).await { - // Failing to detach shouldn't hold up deletion, e.g. if a node is offline we should be able - // to use some other node to run the remote deletion. - tracing::warn!("Failed to detach some locations: {e}"); - } + // This reconcile wait can fail in a few ways: + // A there is a very long queue for the reconciler semaphore + // B some pageserver is failing to handle a detach promptly + // C some pageserver goes offline right at the moment we send it a request. + // + // A and C are transient: the semaphore will eventually become available, and once a node is marked offline + // the next attempt to reconcile will silently skip detaches for an offline node and succeed. If B happens, + // it's a bug, and needs resolving at the pageserver level (we shouldn't just leave attachments behind while + // deleting the underlying data). 
+ self.await_waiters(detach_waiters, RECONCILE_TIMEOUT) + .await?; let locations = shard_ids .into_iter() @@ -2431,13 +2437,11 @@ impl Service { for result in results { match result { Ok(StatusCode::ACCEPTED) => { - // This could happen if we failed detach above, and hit a pageserver where the tenant - // is still attached: it will accept the deletion in the background - tracing::warn!( - "Unexpectedly still attached on {}, client should retry", + // This should never happen: we waited for detaches to finish above + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Unexpectedly still attached on {}", node - ); - return Ok(StatusCode::ACCEPTED); + ))); } Ok(_) => {} Err(mgmt_api::Error::Cancelled) => { diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 2031feaa83..f41468210c 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -133,6 +133,9 @@ def test_storage_controller_smoke( wait_until(10, 1, lambda: node_evacuated(env.pageservers[0].id)) + # Let all the reconciliations after marking the node offline complete + env.storage_controller.reconcile_until_idle() + # Marking pageserver active should not migrate anything to it # immediately env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Active"}) From eb0ca9b648b745142969913a262b2aa4fccbf55a Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 14 Jun 2024 11:08:11 +0100 Subject: [PATCH 0976/1571] pageserver: improved synthetic size & find_gc_cutoff error handling (#8051) ## Problem This PR refactors some error handling to avoid log spam on tenant/timeline shutdown. - "ignoring failure to find gc cutoffs: timeline shutting down." logs (https://github.com/neondatabase/neon/issues/8012) - "synthetic_size_worker: failed to calculate synthetic size for tenant ...: Failed to refresh gc_info before gathering inputs: tenant shutting down", for example here: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8049/9502988669/index.html#suites/3fc871d9ee8127d8501d607e03205abb/1a074a66548bbcea Closes: https://github.com/neondatabase/neon/issues/8012 ## Summary of changes - Refactor: Add a PageReconstructError variant to GcError: this is the only kind of error that find_gc_cutoffs can emit. - Functional change: only ignore shutdown PageReconstructError variant: for other variants, treat it as a real error - Refactor: add a structured CalculateSyntheticSizeError type and use it instead of anyhow::Error in synthetic size calculations - Functional change: while iterating through timelines gathering logical sizes, only drop out if the whole tenant is cancelled: individual timeline cancellations indicate deletion in progress and we can just ignore those. --- pageserver/src/consumption_metrics.rs | 26 ++--- pageserver/src/http/routes.rs | 9 +- pageserver/src/tenant.rs | 37 +++--- pageserver/src/tenant/size.rs | 106 +++++++++++++----- pageserver/src/tenant/timeline.rs | 2 +- .../fixtures/pageserver/allowed_errors.py | 2 - test_runner/regress/test_tenant_size.py | 4 - 7 files changed, 115 insertions(+), 71 deletions(-) diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index 540d0d2e8c..18c1a6cd9b 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -2,10 +2,9 @@ //! and push them to a HTTP endpoint. 
use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; +use crate::tenant::size::CalculateSyntheticSizeError; use crate::tenant::tasks::BackgroundLoopKind; -use crate::tenant::{ - mgr::TenantManager, LogicalSizeCalculationCause, PageReconstructError, Tenant, -}; +use crate::tenant::{mgr::TenantManager, LogicalSizeCalculationCause, Tenant}; use camino::Utf8PathBuf; use consumption_metrics::EventType; use pageserver_api::models::TenantState; @@ -350,19 +349,12 @@ async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &Re // Same for the loop that fetches computed metrics. // By using the same limiter, we centralize metrics collection for "start" and "finished" counters, // which turns out is really handy to understand the system. - let Err(e) = tenant.calculate_synthetic_size(CAUSE, cancel, ctx).await else { - return; - }; - - // this error can be returned if timeline is shutting down, but it does not - // mean the synthetic size worker should terminate. - let shutting_down = matches!( - e.downcast_ref::(), - Some(PageReconstructError::Cancelled) - ); - - if !shutting_down { - let tenant_shard_id = tenant.tenant_shard_id(); - error!("failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}"); + match tenant.calculate_synthetic_size(CAUSE, cancel, ctx).await { + Ok(_) => {} + Err(CalculateSyntheticSizeError::Cancelled) => {} + Err(e) => { + let tenant_shard_id = tenant.tenant_shard_id(); + error!("failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}"); + } } } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 12d02c52fe..657708c0d6 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1135,7 +1135,10 @@ async fn tenant_size_handler( &ctx, ) .await - .map_err(ApiError::InternalServerError)?; + .map_err(|e| match e { + crate::tenant::size::CalculateSyntheticSizeError::Cancelled => ApiError::ShuttingDown, + other => ApiError::InternalServerError(anyhow::anyhow!(other)), + })?; let mut sizes = None; let accepts_html = headers @@ -1143,9 +1146,7 @@ async fn tenant_size_handler( .map(|v| v == "text/html") .unwrap_or_default(); if !inputs_only.unwrap_or(false) { - let storage_model = inputs - .calculate_model() - .map_err(ApiError::InternalServerError)?; + let storage_model = inputs.calculate_model(); let size = storage_model.calculate(); // If request header expects html, return html diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 0bd3ece2e3..a31fea1e58 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -509,11 +509,24 @@ pub(crate) enum GcError { #[error(transparent)] Remote(anyhow::Error), + // An error reading while calculating GC cutoffs + #[error(transparent)] + GcCutoffs(PageReconstructError), + // If GC was invoked for a particular timeline, this error means it didn't exist #[error("timeline not found")] TimelineNotFound, } +impl From for GcError { + fn from(value: PageReconstructError) -> Self { + match value { + PageReconstructError::Cancelled => Self::TimelineCancelled, + other => Self::GcCutoffs(other), + } + } +} + impl Tenant { /// Yet another helper for timeline initialization. 
/// @@ -2921,17 +2934,9 @@ impl Tenant { .checked_sub(horizon) .unwrap_or(Lsn(0)); - let res = timeline.find_gc_cutoffs(cutoff, pitr, cancel, ctx).await; - - match res { - Ok(cutoffs) => { - let old = gc_cutoffs.insert(timeline.timeline_id, cutoffs); - assert!(old.is_none()); - } - Err(e) => { - tracing::warn!(timeline_id = %timeline.timeline_id, "ignoring failure to find gc cutoffs: {e:#}"); - } - } + let cutoffs = timeline.find_gc_cutoffs(cutoff, pitr, cancel, ctx).await?; + let old = gc_cutoffs.insert(timeline.timeline_id, cutoffs); + assert!(old.is_none()); } if !self.is_active() || self.cancel.is_cancelled() { @@ -3553,7 +3558,7 @@ impl Tenant { cause: LogicalSizeCalculationCause, cancel: &CancellationToken, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { let logical_sizes_at_once = self .conf .concurrent_tenant_size_logical_size_queries @@ -3568,8 +3573,8 @@ impl Tenant { // See more for on the issue #2748 condenced out of the initial PR review. let mut shared_cache = tokio::select! { locked = self.cached_logical_sizes.lock() => locked, - _ = cancel.cancelled() => anyhow::bail!("cancelled"), - _ = self.cancel.cancelled() => anyhow::bail!("tenant is shutting down"), + _ = cancel.cancelled() => return Err(size::CalculateSyntheticSizeError::Cancelled), + _ = self.cancel.cancelled() => return Err(size::CalculateSyntheticSizeError::Cancelled), }; size::gather_inputs( @@ -3593,10 +3598,10 @@ impl Tenant { cause: LogicalSizeCalculationCause, cancel: &CancellationToken, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { let inputs = self.gather_size_inputs(None, cause, cancel, ctx).await?; - let size = inputs.calculate()?; + let size = inputs.calculate(); self.set_cached_synthetic_size(size); diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 64fff5536c..cdd5b0cbe7 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -3,7 +3,6 @@ use std::collections::hash_map::Entry; use std::collections::{HashMap, HashSet}; use std::sync::Arc; -use anyhow::{bail, Context}; use tokio::sync::oneshot::error::RecvError; use tokio::sync::Semaphore; use tokio_util::sync::CancellationToken; @@ -11,7 +10,7 @@ use tokio_util::sync::CancellationToken; use crate::context::RequestContext; use crate::pgdatadir_mapping::CalculateLogicalSizeError; -use super::{LogicalSizeCalculationCause, Tenant}; +use super::{GcError, LogicalSizeCalculationCause, Tenant}; use crate::tenant::Timeline; use utils::id::TimelineId; use utils::lsn::Lsn; @@ -43,6 +42,44 @@ pub struct SegmentMeta { pub kind: LsnKind, } +#[derive(thiserror::Error, Debug)] +pub(crate) enum CalculateSyntheticSizeError { + /// Something went wrong internally to the calculation of logical size at a particular branch point + #[error("Failed to calculated logical size on timeline {timeline_id} at {lsn}: {error}")] + LogicalSize { + timeline_id: TimelineId, + lsn: Lsn, + error: CalculateLogicalSizeError, + }, + + /// Something went wrong internally when calculating GC parameters at start of size calculation + #[error(transparent)] + GcInfo(GcError), + + /// Totally unexpected errors, like panics joining a task + #[error(transparent)] + Fatal(anyhow::Error), + + /// The LSN we are trying to calculate a size at no longer exists at the point we query it + #[error("Could not find size at {lsn} in timeline {timeline_id}")] + LsnNotFound { timeline_id: TimelineId, lsn: Lsn }, + + /// Tenant shut down while calculating size + #[error("Cancelled")] + Cancelled, +} + +impl From for 
CalculateSyntheticSizeError { + fn from(value: GcError) -> Self { + match value { + GcError::TenantCancelled | GcError::TimelineCancelled => { + CalculateSyntheticSizeError::Cancelled + } + other => CalculateSyntheticSizeError::GcInfo(other), + } + } +} + impl SegmentMeta { fn size_needed(&self) -> bool { match self.kind { @@ -116,12 +153,9 @@ pub(super) async fn gather_inputs( cause: LogicalSizeCalculationCause, cancel: &CancellationToken, ctx: &RequestContext, -) -> anyhow::Result { +) -> Result { // refresh is needed to update gc related pitr_cutoff and horizon_cutoff - tenant - .refresh_gc_info(cancel, ctx) - .await - .context("Failed to refresh gc_info before gathering inputs")?; + tenant.refresh_gc_info(cancel, ctx).await?; // Collect information about all the timelines let mut timelines = tenant.list_timelines(); @@ -327,6 +361,12 @@ pub(super) async fn gather_inputs( ) .await?; + if tenant.cancel.is_cancelled() { + // If we're shutting down, return an error rather than a sparse result that might include some + // timelines from before we started shutting down + return Err(CalculateSyntheticSizeError::Cancelled); + } + Ok(ModelInputs { segments, timeline_inputs, @@ -345,7 +385,7 @@ async fn fill_logical_sizes( logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>, cause: LogicalSizeCalculationCause, ctx: &RequestContext, -) -> anyhow::Result<()> { +) -> Result<(), CalculateSyntheticSizeError> { let timeline_hash: HashMap> = HashMap::from_iter( timelines .iter() @@ -387,7 +427,7 @@ async fn fill_logical_sizes( } // Perform the size lookups - let mut have_any_error = false; + let mut have_any_error = None; while let Some(res) = joinset.join_next().await { // each of these come with Result, JoinError> // because of spawn + spawn_blocking @@ -398,21 +438,36 @@ async fn fill_logical_sizes( Err(join_error) => { // cannot really do anything, as this panic is likely a bug error!("task that calls spawn_ondemand_logical_size_calculation panicked: {join_error:#}"); - have_any_error = true; + + have_any_error = Some(CalculateSyntheticSizeError::Fatal( + anyhow::anyhow!(join_error) + .context("task that calls spawn_ondemand_logical_size_calculation"), + )); } Ok(Err(recv_result_error)) => { // cannot really do anything, as this panic is likely a bug error!("failed to receive logical size query result: {recv_result_error:#}"); - have_any_error = true; + have_any_error = Some(CalculateSyntheticSizeError::Fatal( + anyhow::anyhow!(recv_result_error) + .context("Receiving logical size query result"), + )); } Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error)))) => { - if !matches!(error, CalculateLogicalSizeError::Cancelled) { + if matches!(error, CalculateLogicalSizeError::Cancelled) { + // Skip this: it's okay if one timeline among many is shutting down while we + // calculate inputs for the overall tenant. + continue; + } else { warn!( timeline_id=%timeline.timeline_id, "failed to calculate logical size at {lsn}: {error:#}" ); + have_any_error = Some(CalculateSyntheticSizeError::LogicalSize { + timeline_id: timeline.timeline_id, + lsn, + error, + }); } - have_any_error = true; } Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size)))) => { debug!(timeline_id=%timeline.timeline_id, %lsn, size, "size calculated"); @@ -426,10 +481,10 @@ async fn fill_logical_sizes( // prune any keys not needed anymore; we record every used key and added key. 
logical_size_cache.retain(|key, _| sizes_needed.contains_key(key)); - if have_any_error { + if let Some(error) = have_any_error { // we cannot complete this round, because we are missing data. // we have however cached all we were able to request calculation on. - anyhow::bail!("failed to calculate some logical_sizes"); + return Err(error); } // Insert the looked up sizes to the Segments @@ -444,32 +499,29 @@ async fn fill_logical_sizes( if let Some(Some(size)) = sizes_needed.get(&(timeline_id, lsn)) { seg.segment.size = Some(*size); } else { - bail!("could not find size at {} in timeline {}", lsn, timeline_id); + return Err(CalculateSyntheticSizeError::LsnNotFound { timeline_id, lsn }); } } Ok(()) } impl ModelInputs { - pub fn calculate_model(&self) -> anyhow::Result { + pub fn calculate_model(&self) -> tenant_size_model::StorageModel { // Convert SegmentMetas into plain Segments - let storage = StorageModel { + StorageModel { segments: self .segments .iter() .map(|seg| seg.segment.clone()) .collect(), - }; - - Ok(storage) + } } // calculate total project size - pub fn calculate(&self) -> anyhow::Result { - let storage = self.calculate_model()?; + pub fn calculate(&self) -> u64 { + let storage = self.calculate_model(); let sizes = storage.calculate(); - - Ok(sizes.total_size) + sizes.total_size } } @@ -656,7 +708,7 @@ fn verify_size_for_multiple_branches() { "#; let inputs: ModelInputs = serde_json::from_str(doc).unwrap(); - assert_eq!(inputs.calculate().unwrap(), 37_851_408); + assert_eq!(inputs.calculate(), 37_851_408); } #[test] @@ -711,7 +763,7 @@ fn verify_size_for_one_branch() { let model: ModelInputs = serde_json::from_str(doc).unwrap(); - let res = model.calculate_model().unwrap().calculate(); + let res = model.calculate_model().calculate(); println!("calculated synthetic size: {}", res.total_size); println!("result: {:?}", serde_json::to_string(&res.segments)); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 28627e7911..324d909dac 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4823,7 +4823,7 @@ impl Timeline { pitr: Duration, cancel: &CancellationToken, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { let _timer = self .metrics .find_gc_cutoffs_histo diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index ef412cade7..147d5705d3 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -94,8 +94,6 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( ".*WARN.*path=/v1/utilization .*request was dropped before completing", # Can happen during shutdown ".*scheduling deletion on drop failed: queue is in state Stopped.*", - # Can happen during shutdown - ".*ignoring failure to find gc cutoffs: timeline shutting down.*", ) diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index d3a228dbeb..a3dd422903 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -678,10 +678,6 @@ def test_synthetic_size_while_deleting(neon_env_builder: NeonEnvBuilder): with pytest.raises(PageserverApiException, match=matcher): completion.result() - # this happens on both cases - env.pageserver.allowed_errors.append( - ".*ignoring failure to find gc cutoffs: timeline shutting down.*" - ) # this happens only in the case of deletion (http response logging) env.pageserver.allowed_errors.append(".*Failed to refresh 
gc_info before gathering inputs.*") From e6eb0020a10163475115960398f3c206b601d0b8 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 14 Jun 2024 12:23:52 +0100 Subject: [PATCH 0977/1571] update rust to 1.79.0 (#8048) ## Problem rust 1.79 new enabled by default lints ## Summary of changes * update to rust 1.79 * `s/default_features/default-features/` * fix proxy dead code. * fix pageserver dead code. --- Cargo.toml | 6 ++-- Dockerfile.build-tools | 2 +- libs/tracing-utils/Cargo.toml | 2 +- pageserver/src/tenant.rs | 3 -- pageserver/src/tenant/timeline.rs | 3 +- pageserver/src/tenant/timeline/delete.rs | 3 -- .../src/rate_limiter/limit_algorithm/aimd.rs | 2 -- proxy/src/scram/messages.rs | 33 ++++++++++--------- rust-toolchain.toml | 2 +- 9 files changed, 25 insertions(+), 31 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index dc89c2341b..8fddaaef12 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -120,7 +120,7 @@ num_cpus = "1.15" num-traits = "0.2.15" once_cell = "1.13" opentelemetry = "0.20.0" -opentelemetry-otlp = { version = "0.13.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } +opentelemetry-otlp = { version = "0.13.0", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } opentelemetry-semantic-conventions = "0.12.0" parking_lot = "0.12" parquet = { version = "51.0.0", default-features = false, features = ["zstd"] } @@ -128,7 +128,7 @@ parquet_derive = "51.0.0" pbkdf2 = { version = "0.12.1", features = ["simple", "std"] } pin-project-lite = "0.2" procfs = "0.14" -prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency +prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency prost = "0.11" rand = "0.8" redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] } @@ -184,7 +184,7 @@ tower-service = "0.3.2" tracing = "0.1" tracing-error = "0.2.0" tracing-opentelemetry = "0.21.0" -tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] } +tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] } twox-hash = { version = "1.6.3", default-features = false } url = "2.2" urlencoding = "2.1" diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index 460b8c996d..e7c61ace0e 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -141,7 +141,7 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.78.0 +ENV RUSTC_VERSION=1.79.0 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \ diff --git a/libs/tracing-utils/Cargo.toml b/libs/tracing-utils/Cargo.toml index b285c9b5b0..512a748124 100644 --- a/libs/tracing-utils/Cargo.toml +++ b/libs/tracing-utils/Cargo.toml @@ -7,7 +7,7 @@ license.workspace = true [dependencies] hyper.workspace = true opentelemetry = { workspace = true, features=["rt-tokio"] } -opentelemetry-otlp = { workspace = true, default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } +opentelemetry-otlp = { workspace = true, default-features=false, features = ["http-proto", "trace", "http", 
"reqwest-client"] } opentelemetry-semantic-conventions.workspace = true reqwest = { workspace = true, default-features = false, features = ["rustls-tls"] } tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index a31fea1e58..801321e36d 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1046,7 +1046,6 @@ impl Tenant { remote_metadata, TimelineResources { remote_client, - deletion_queue_client: self.deletion_queue_client.clone(), timeline_get_throttle: self.timeline_get_throttle.clone(), }, ctx, @@ -1072,7 +1071,6 @@ impl Tenant { timeline_id, &index_part.metadata, remote_timeline_client, - self.deletion_queue_client.clone(), ) .instrument(tracing::info_span!("timeline_delete", %timeline_id)) .await @@ -3448,7 +3446,6 @@ impl Tenant { ); TimelineResources { remote_client, - deletion_queue_client: self.deletion_queue_client.clone(), timeline_get_throttle: self.timeline_get_throttle.clone(), } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 324d909dac..08bec329e1 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -62,6 +62,7 @@ use std::{ ops::ControlFlow, }; +use crate::metrics::GetKind; use crate::pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS; use crate::{ aux_file::AuxFileSizeEstimator, @@ -75,7 +76,6 @@ use crate::{ disk_usage_eviction_task::DiskUsageEvictionInfo, pgdatadir_mapping::CollectKeySpaceError, }; -use crate::{deletion_queue::DeletionQueueClient, metrics::GetKind}; use crate::{ disk_usage_eviction_task::finite_f32, tenant::storage_layer::{ @@ -205,7 +205,6 @@ fn drop_wlock(rlock: tokio::sync::RwLockWriteGuard<'_, T>) { /// The outward-facing resources required to build a Timeline pub struct TimelineResources { pub remote_client: RemoteTimelineClient, - pub deletion_queue_client: DeletionQueueClient, pub timeline_get_throttle: Arc< crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>, >, diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 5ca8544d49..441298f3e9 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -11,7 +11,6 @@ use utils::{crashsafe, fs_ext, id::TimelineId, pausable_failpoint}; use crate::{ config::PageServerConf, - deletion_queue::DeletionQueueClient, task_mgr::{self, TaskKind}, tenant::{ metadata::TimelineMetadata, @@ -263,7 +262,6 @@ impl DeleteTimelineFlow { timeline_id: TimelineId, local_metadata: &TimelineMetadata, remote_client: RemoteTimelineClient, - deletion_queue_client: DeletionQueueClient, ) -> anyhow::Result<()> { // Note: here we even skip populating layer map. Timeline is essentially uninitialized. // RemoteTimelineClient is the only functioning part. @@ -274,7 +272,6 @@ impl DeleteTimelineFlow { None, // Ancestor is not needed for deletion. TimelineResources { remote_client, - deletion_queue_client, timeline_get_throttle: tenant.timeline_get_throttle.clone(), }, // Important. We dont pass ancestor above because it can be missing. diff --git a/proxy/src/rate_limiter/limit_algorithm/aimd.rs b/proxy/src/rate_limiter/limit_algorithm/aimd.rs index ccc9c42420..b39740bb21 100644 --- a/proxy/src/rate_limiter/limit_algorithm/aimd.rs +++ b/proxy/src/rate_limiter/limit_algorithm/aimd.rs @@ -1,5 +1,3 @@ -use std::usize; - use super::{LimitAlgorithm, Outcome, Sample}; /// Loss-based congestion avoidance. 
diff --git a/proxy/src/scram/messages.rs b/proxy/src/scram/messages.rs index f9372540ca..cf677a3334 100644 --- a/proxy/src/scram/messages.rs +++ b/proxy/src/scram/messages.rs @@ -32,8 +32,6 @@ pub struct ClientFirstMessage<'a> { pub bare: &'a str, /// Channel binding mode. pub cbind_flag: ChannelBinding<&'a str>, - /// (Client username)[]. - pub username: &'a str, /// Client nonce. pub nonce: &'a str, } @@ -58,6 +56,14 @@ impl<'a> ClientFirstMessage<'a> { // In theory, these might be preceded by "reserved-mext" (i.e. "m=") let username = parts.next()?.strip_prefix("n=")?; + + // https://github.com/postgres/postgres/blob/f83908798f78c4cafda217ca875602c88ea2ae28/src/backend/libpq/auth-scram.c#L13-L14 + if !username.is_empty() { + tracing::warn!(username, "scram username provided, but is not expected") + // TODO(conrad): + // return None; + } + let nonce = parts.next()?.strip_prefix("r=")?; // Validate but ignore auth extensions @@ -66,7 +72,6 @@ impl<'a> ClientFirstMessage<'a> { Some(Self { bare, cbind_flag, - username, nonce, }) } @@ -188,19 +193,18 @@ mod tests { // (Almost) real strings captured during debug sessions let cases = [ - (NotSupportedClient, "n,,n=pepe,r=t8JwklwKecDLwSsA72rHmVju"), - (NotSupportedServer, "y,,n=pepe,r=t8JwklwKecDLwSsA72rHmVju"), + (NotSupportedClient, "n,,n=,r=t8JwklwKecDLwSsA72rHmVju"), + (NotSupportedServer, "y,,n=,r=t8JwklwKecDLwSsA72rHmVju"), ( Required("tls-server-end-point"), - "p=tls-server-end-point,,n=pepe,r=t8JwklwKecDLwSsA72rHmVju", + "p=tls-server-end-point,,n=,r=t8JwklwKecDLwSsA72rHmVju", ), ]; for (cb, input) in cases { let msg = ClientFirstMessage::parse(input).unwrap(); - assert_eq!(msg.bare, "n=pepe,r=t8JwklwKecDLwSsA72rHmVju"); - assert_eq!(msg.username, "pepe"); + assert_eq!(msg.bare, "n=,r=t8JwklwKecDLwSsA72rHmVju"); assert_eq!(msg.nonce, "t8JwklwKecDLwSsA72rHmVju"); assert_eq!(msg.cbind_flag, cb); } @@ -208,14 +212,13 @@ mod tests { #[test] fn parse_client_first_message_with_invalid_gs2_authz() { - assert!(ClientFirstMessage::parse("n,authzid,n=user,r=nonce").is_none()) + assert!(ClientFirstMessage::parse("n,authzid,n=,r=nonce").is_none()) } #[test] fn parse_client_first_message_with_extra_params() { - let msg = ClientFirstMessage::parse("n,,n=user,r=nonce,a=foo,b=bar,c=baz").unwrap(); - assert_eq!(msg.bare, "n=user,r=nonce,a=foo,b=bar,c=baz"); - assert_eq!(msg.username, "user"); + let msg = ClientFirstMessage::parse("n,,n=,r=nonce,a=foo,b=bar,c=baz").unwrap(); + assert_eq!(msg.bare, "n=,r=nonce,a=foo,b=bar,c=baz"); assert_eq!(msg.nonce, "nonce"); assert_eq!(msg.cbind_flag, ChannelBinding::NotSupportedClient); } @@ -223,9 +226,9 @@ mod tests { #[test] fn parse_client_first_message_with_extra_params_invalid() { // must be of the form `=<...>` - assert!(ClientFirstMessage::parse("n,,n=user,r=nonce,abc=foo").is_none()); - assert!(ClientFirstMessage::parse("n,,n=user,r=nonce,1=foo").is_none()); - assert!(ClientFirstMessage::parse("n,,n=user,r=nonce,a").is_none()); + assert!(ClientFirstMessage::parse("n,,n=,r=nonce,abc=foo").is_none()); + assert!(ClientFirstMessage::parse("n,,n=,r=nonce,1=foo").is_none()); + assert!(ClientFirstMessage::parse("n,,n=,r=nonce,a").is_none()); } #[test] diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 214de0a77d..dcae25a287 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.78.0" +channel = "1.79.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. 
# https://rust-lang.github.io/rustup/concepts/profiles.html From a71f58e69c762e55f0fe9055a088f6232facbf28 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 14 Jun 2024 05:47:09 +0300 Subject: [PATCH 0978/1571] Fix test_segment_init_failure. Graceful shutdown broke it. --- test_runner/regress/test_wal_acceptor_async.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index 715d22eed8..971fad787a 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -601,13 +601,16 @@ async def run_segment_init_failure(env: NeonEnv): conn = await ep.connect_async() ep.safe_psql("select pg_switch_wal()") # jump to the segment boundary # next insertion should hang until failpoint is disabled. - asyncio.create_task(conn.execute("insert into t select generate_series(1,1), 'payload'")) + bg_query = asyncio.create_task( + conn.execute("insert into t select generate_series(1,1), 'payload'") + ) sleep_sec = 2 await asyncio.sleep(sleep_sec) - # also restart ep at segment boundary to make test more interesting - ep.stop() # it must still be not finished - # assert not bg_query.done() + assert not bg_query.done() + # Also restart ep at segment boundary to make test more interesting. Do it in immediate mode; + # fast will hang because it will try to gracefully finish sending WAL. + ep.stop(mode="immediate") # Without segment rename during init (#6402) previous statement created # partially initialized 16MB segment, so sk restart also triggers #6401. sk.stop().start() From 83eb02b07af20c27842231c995fe883f6d9a6299 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 14 Jun 2024 12:43:51 +0100 Subject: [PATCH 0979/1571] CI: downgrade docker/setup-buildx-action (#8062) ## Problem I've bumped `docker/setup-buildx-action` in #8042 because I wasn't able to reproduce the issue from #7445. 
But now the issue appears again in https://github.com/neondatabase/neon/actions/runs/9514373620/job/26226626923?pr=8059 The steps to reproduce aren't clear, it required `docker/setup-buildx-action@v3` and rebuilding the image without cache, probably ## Summary of changes - Downgrade `docker/setup-buildx-action@v3` to `docker/setup-buildx-action@v2` --- .github/workflows/build-build-tools-image.yml | 2 +- .github/workflows/build_and_test.yml | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index da1efe9571..2c994b08ae 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -64,7 +64,7 @@ jobs: mkdir -p /tmp/.docker-custom echo DOCKER_CONFIG=/tmp/.docker-custom >> $GITHUB_ENV - - uses: docker/setup-buildx-action@v3 + - uses: docker/setup-buildx-action@v2 - uses: docker/login-action@v2 with: diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 1b433a7033..703fc8d145 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -744,7 +744,7 @@ jobs: run: | mkdir -p .docker-custom echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV - - uses: docker/setup-buildx-action@v3 + - uses: docker/setup-buildx-action@v2 - uses: docker/login-action@v3 with: @@ -822,11 +822,11 @@ jobs: run: | mkdir -p .docker-custom echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV - - uses: docker/setup-buildx-action@v3 + - uses: docker/setup-buildx-action@v2 with: # Disable parallelism for docker buildkit. # As we already build everything with `make -j$(nproc)`, running it in additional level of parallelisam blows up the Runner. 
- buildkitd-config-inline: | + config-inline: | [worker.oci] max-parallelism = 1 From 81892199f627a1021b3f4f5f8043d35281501c1a Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Fri, 14 Jun 2024 11:57:58 -0400 Subject: [PATCH 0980/1571] chore(pageserver): vectored get target_keyspace directly accums (#8055) follow up on https://github.com/neondatabase/neon/pull/7904 avoid a layer of indirection introduced by `Vec>` Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/keyspace.rs | 6 ++++++ pageserver/src/tenant/storage_layer.rs | 24 +++++++++++------------- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index 12c6dc3a6d..9a61f2ad81 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -558,6 +558,12 @@ impl KeySpaceRandomAccum { self.ranges.push(range); } + pub fn add_keyspace(&mut self, keyspace: KeySpace) { + for range in keyspace.ranges { + self.add_range(range); + } + } + pub fn to_keyspace(mut self) -> KeySpace { let mut ranges = Vec::new(); if !self.ranges.is_empty() { diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 0b3f841ccf..9607546ce0 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -318,7 +318,7 @@ pub(crate) struct LayerFringe { #[derive(Debug)] struct LayerKeyspace { layer: ReadableLayer, - target_keyspace: Vec, + target_keyspace: KeySpaceRandomAccum, } impl LayerFringe { @@ -342,17 +342,13 @@ impl LayerFringe { _, LayerKeyspace { layer, - target_keyspace, + mut target_keyspace, }, - )) => { - let mut keyspace = KeySpaceRandomAccum::new(); - for ks in target_keyspace { - for part in ks.ranges { - keyspace.add_range(part); - } - } - Some((layer, keyspace.consume_keyspace(), read_desc.lsn_range)) - } + )) => Some(( + layer, + target_keyspace.consume_keyspace(), + read_desc.lsn_range, + )), None => unreachable!("fringe internals are always consistent"), } } @@ -367,16 +363,18 @@ impl LayerFringe { let entry = self.layers.entry(layer_id.clone()); match entry { Entry::Occupied(mut entry) => { - entry.get_mut().target_keyspace.push(keyspace); + entry.get_mut().target_keyspace.add_keyspace(keyspace); } Entry::Vacant(entry) => { self.planned_reads_by_lsn.push(ReadDesc { lsn_range, layer_id: layer_id.clone(), }); + let mut accum = KeySpaceRandomAccum::new(); + accum.add_keyspace(keyspace); entry.insert(LayerKeyspace { layer, - target_keyspace: vec![keyspace], + target_keyspace: accum, }); } } From 46210035c551212a1b9383fe5249d547f284c39a Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Fri, 14 Jun 2024 18:36:50 +0200 Subject: [PATCH 0981/1571] add halfvec indexing and queries to periodic pgvector performance tests (#8057) ## Problem halfvec data type was introduced in pgvector 0.7.0 and is popular because it allows smaller vectors, smaller indexes and potentially better performance. So far we have not tested halfvec in our periodic performance tests. This PR adds halfvec indexing and halfvec queries to the test. 
--- .github/workflows/benchmarking.yml | 8 ++--- .../performance/pgvector/halfvec_build.sql | 15 +++++++++ ...custom_script_pgvector_halfvec_queries.sql | 13 ++++++++ .../pgvector/pgbench_hnsw_queries.sql | 13 -------- test_runner/performance/test_perf_olap.py | 5 +++ test_runner/performance/test_perf_pgbench.py | 31 +++++++++++++------ .../performance/test_perf_pgvector_queries.py | 24 ++++++++++++++ 7 files changed, 82 insertions(+), 27 deletions(-) create mode 100644 test_runner/performance/pgvector/halfvec_build.sql create mode 100644 test_runner/performance/pgvector/pgbench_custom_script_pgvector_halfvec_queries.sql delete mode 100644 test_runner/performance/pgvector/pgbench_hnsw_queries.sql create mode 100644 test_runner/performance/test_perf_pgvector_queries.py diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 57d24063bf..9eff483680 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -99,7 +99,7 @@ jobs: # Set --sparse-ordering option of pytest-order plugin # to ensure tests are running in order of appears in the file. # It's important for test_perf_pgbench.py::test_pgbench_remote_* tests - extra_params: -m remote_cluster --sparse-ordering --timeout 5400 --ignore test_runner/performance/test_perf_olap.py + extra_params: -m remote_cluster --sparse-ordering --timeout 5400 --ignore test_runner/performance/test_perf_olap.py --ignore test_runner/performance/test_perf_pgvector_queries.py env: BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -410,14 +410,14 @@ jobs: PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} - - name: Benchmark pgvector hnsw queries + - name: Benchmark pgvector queries uses: ./.github/actions/run-python-test-set with: build_type: ${{ env.BUILD_TYPE }} - test_selection: performance + test_selection: performance/test_perf_pgvector_queries.py run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} - extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_pgvector + extra_params: -m remote_cluster --timeout 21600 env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" diff --git a/test_runner/performance/pgvector/halfvec_build.sql b/test_runner/performance/pgvector/halfvec_build.sql new file mode 100644 index 0000000000..7e923e4bde --- /dev/null +++ b/test_runner/performance/pgvector/halfvec_build.sql @@ -0,0 +1,15 @@ +DROP TABLE IF EXISTS halfvec_test_table; + +CREATE TABLE halfvec_test_table ( + _id text NOT NULL, + title text, + text text, + embeddings halfvec(1536), + PRIMARY KEY (_id) +); + +INSERT INTO halfvec_test_table (_id, title, text, embeddings) +SELECT _id, title, text, embeddings::halfvec +FROM documents; + +CREATE INDEX documents_half_precision_hnsw_idx ON halfvec_test_table USING hnsw (embeddings halfvec_cosine_ops) WITH (m = 64, ef_construction = 128); \ No newline at end of file diff --git a/test_runner/performance/pgvector/pgbench_custom_script_pgvector_halfvec_queries.sql b/test_runner/performance/pgvector/pgbench_custom_script_pgvector_halfvec_queries.sql new file mode 100644 index 0000000000..70d0c18149 --- /dev/null +++ b/test_runner/performance/pgvector/pgbench_custom_script_pgvector_halfvec_queries.sql @@ -0,0 +1,13 @@ +-- run with pooled connection +-- pgbench -T 300 -c 100 -j20 -f pgbench_halfvec_queries.sql 
-postgresql://neondb_owner:@ep-floral-thunder-w1gzhaxi-pooler.eu-west-1.aws.neon.build/neondb?sslmode=require" + +with x (x) as ( + select "embeddings" as x + from halfvec_test_table + TABLESAMPLE SYSTEM (1) + LIMIT 1 +) +SELECT title, "embeddings" <=> (select x from x) as distance +FROM halfvec_test_table +ORDER BY 2 +LIMIT 30; \ No newline at end of file diff --git a/test_runner/performance/pgvector/pgbench_hnsw_queries.sql b/test_runner/performance/pgvector/pgbench_hnsw_queries.sql deleted file mode 100644 index 5034063c1b..0000000000 --- a/test_runner/performance/pgvector/pgbench_hnsw_queries.sql +++ /dev/null @@ -1,13 +0,0 @@ --- run with pooled connection --- pgbench -T 300 -c 100 -j20 -f pgbench_hnsw_queries.sql -postgresql://neondb_owner:@ep-floral-thunder-w1gzhaxi-pooler.eu-west-1.aws.neon.build/neondb?sslmode=require" - -with x (x) as ( - select "embeddings" as x - from hnsw_test_table - TABLESAMPLE SYSTEM (1) - LIMIT 1 -) -SELECT title, "embeddings" <=> (select x from x) as distance -FROM hnsw_test_table -ORDER BY 2 -LIMIT 30; diff --git a/test_runner/performance/test_perf_olap.py b/test_runner/performance/test_perf_olap.py index 2367676e67..aaa2f8fec2 100644 --- a/test_runner/performance/test_perf_olap.py +++ b/test_runner/performance/test_perf_olap.py @@ -106,6 +106,7 @@ QUERIES: Tuple[LabelledQuery, ...] = ( # Disable auto formatting for the list of queries so that it's easier to read # fmt: off PGVECTOR_QUERIES: Tuple[LabelledQuery, ...] = ( + LabelledQuery("PGVPREP", r"ALTER EXTENSION VECTOR UPDATE;"), LabelledQuery("PGV0", r"DROP TABLE IF EXISTS hnsw_test_table;"), LabelledQuery("PGV1", r"CREATE TABLE hnsw_test_table AS TABLE documents WITH NO DATA;"), LabelledQuery("PGV2", r"INSERT INTO hnsw_test_table SELECT * FROM documents;"), @@ -115,6 +116,10 @@ PGVECTOR_QUERIES: Tuple[LabelledQuery, ...] 
= ( LabelledQuery("PGV6", r"CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_l1_ops);"), LabelledQuery("PGV7", r"CREATE INDEX ON hnsw_test_table USING hnsw ((binary_quantize(embeddings)::bit(1536)) bit_hamming_ops);"), LabelledQuery("PGV8", r"CREATE INDEX ON hnsw_test_table USING hnsw ((binary_quantize(embeddings)::bit(1536)) bit_jaccard_ops);"), + LabelledQuery("PGV9", r"DROP TABLE IF EXISTS halfvec_test_table;"), + LabelledQuery("PGV10", r"CREATE TABLE halfvec_test_table (_id text NOT NULL, title text, text text, embeddings halfvec(1536), PRIMARY KEY (_id));"), + LabelledQuery("PGV11", r"INSERT INTO halfvec_test_table (_id, title, text, embeddings) SELECT _id, title, text, embeddings::halfvec FROM documents;"), + LabelledQuery("PGV12", r"CREATE INDEX documents_half_precision_hnsw_idx ON halfvec_test_table USING hnsw (embeddings halfvec_cosine_ops) WITH (m = 64, ef_construction = 128);"), ) # fmt: on diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py index d756d6eeca..6eaa29e4f8 100644 --- a/test_runner/performance/test_perf_pgbench.py +++ b/test_runner/performance/test_perf_pgbench.py @@ -18,6 +18,7 @@ class PgBenchLoadType(enum.Enum): SIMPLE_UPDATE = "simple-update" SELECT_ONLY = "select-only" PGVECTOR_HNSW = "pgvector-hnsw" + PGVECTOR_HALFVEC = "pgvector-halfvec" def utc_now_timestamp() -> int: @@ -153,6 +154,26 @@ def run_test_pgbench(env: PgCompare, scale: int, duration: int, workload_type: P password=password, ) + if workload_type == PgBenchLoadType.PGVECTOR_HALFVEC: + # Run simple-update workload + run_pgbench( + env, + "pgvector-halfvec", + [ + "pgbench", + "-f", + "test_runner/performance/pgvector/pgbench_custom_script_pgvector_halfvec_queries.sql", + "-c100", + "-j20", + f"-T{duration}", + "-P2", + "--protocol=prepared", + "--progress-timestamp", + connstr, + ], + password=password, + ) + env.report_size() @@ -222,13 +243,3 @@ def test_pgbench_remote_simple_update(remote_compare: PgCompare, scale: int, dur @pytest.mark.remote_cluster def test_pgbench_remote_select_only(remote_compare: PgCompare, scale: int, duration: int): run_test_pgbench(remote_compare, scale, duration, PgBenchLoadType.SELECT_ONLY) - - -# The following test runs on an existing database that has pgvector extension installed -# and a table with 1 million embedding vectors loaded and indexed with HNSW. -# -# Run this pgbench tests against an existing remote Postgres cluster with the necessary setup. -@pytest.mark.parametrize("duration", get_durations_matrix()) -@pytest.mark.remote_cluster -def test_pgbench_remote_pgvector(remote_compare: PgCompare, duration: int): - run_test_pgbench(remote_compare, 1, duration, PgBenchLoadType.PGVECTOR_HNSW) diff --git a/test_runner/performance/test_perf_pgvector_queries.py b/test_runner/performance/test_perf_pgvector_queries.py new file mode 100644 index 0000000000..bb3db16305 --- /dev/null +++ b/test_runner/performance/test_perf_pgvector_queries.py @@ -0,0 +1,24 @@ +import pytest +from fixtures.compare_fixtures import PgCompare + +from performance.test_perf_pgbench import PgBenchLoadType, get_durations_matrix, run_test_pgbench + + +# The following test runs on an existing database that has pgvector extension installed +# and a table with 1 million embedding vectors loaded and indexed with HNSW. +# +# Run this pgbench tests against an existing remote Postgres cluster with the necessary setup. 
+@pytest.mark.parametrize("duration", get_durations_matrix()) +@pytest.mark.remote_cluster +def test_pgbench_remote_pgvector_hnsw(remote_compare: PgCompare, duration: int): + run_test_pgbench(remote_compare, 1, duration, PgBenchLoadType.PGVECTOR_HNSW) + + +# The following test runs on an existing database that has pgvector extension installed +# and a table with 1 million embedding vectors loaded and indexed with halfvec. +# +# Run this pgbench tests against an existing remote Postgres cluster with the necessary setup. +@pytest.mark.parametrize("duration", get_durations_matrix()) +@pytest.mark.remote_cluster +def test_pgbench_remote_pgvector_halfvec(remote_compare: PgCompare, duration: int): + run_test_pgbench(remote_compare, 1, duration, PgBenchLoadType.PGVECTOR_HALFVEC) From 2ba414525e7605c8570eadaccef576321d601571 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 13 Jun 2024 22:56:01 +0300 Subject: [PATCH 0982/1571] Install rust binaries before running rust tests. cargo test (or nextest) might rebuild the binaries with different features/flags, so do install immediately after the build. Triggered by the particular case of nextest invocations missing $CARGO_FEATURES, which recompiled safekeeper without 'testing' feature which made python tests needing it (failpoints) not run in the CI. Also add CARGO_FEATURES to the nextest runs anyway because there doesn't seem to be an important reason not to. --- .github/workflows/build_and_test.yml | 56 ++++++++++++++-------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 703fc8d145..bd2996ec4c 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -337,34 +337,8 @@ jobs: run: | ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests - - name: Run rust tests - env: - NEXTEST_RETRIES: 3 - run: | - #nextest does not yet support running doctests - cargo test --doc $CARGO_FLAGS $CARGO_FEATURES - - for io_engine in std-fs tokio-epoll-uring ; do - NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES - done - - # Run separate tests for real S3 - export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty - export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests - export REMOTE_STORAGE_S3_REGION=eu-central-1 - # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - ${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 'test(test_real_s3)' - - # Run separate tests for real Azure Blob Storage - # XXX: replace region with `eu-central-1`-like region - export ENABLE_REAL_AZURE_REMOTE_STORAGE=y - export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}" - export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}" - export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}" - export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}" - # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - ${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 'test(test_real_azure)' - + # Do install *before* running rust tests because they might recompile the + # binaries with different features/flags. 
- name: Install rust binaries run: | # Install target binaries @@ -405,6 +379,32 @@ jobs: done fi + - name: Run rust tests + env: + NEXTEST_RETRIES: 3 + run: | + #nextest does not yet support running doctests + cargo test --doc $CARGO_FLAGS $CARGO_FEATURES + + for io_engine in std-fs tokio-epoll-uring ; do + NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES + done + + # Run separate tests for real S3 + export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty + export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests + export REMOTE_STORAGE_S3_REGION=eu-central-1 + ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_s3)' + + # Run separate tests for real Azure Blob Storage + # XXX: replace region with `eu-central-1`-like region + export ENABLE_REAL_AZURE_REMOTE_STORAGE=y + export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}" + export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}" + export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}" + export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}" + ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_azure)' + - name: Install postgres binaries run: cp -a pg_install /tmp/neon/pg_install From 16d80128eea32b0f2fd1051c90e93e8e0d537381 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Mon, 17 Jun 2024 11:40:35 +0100 Subject: [PATCH 0983/1571] storcon: handle entire cluster going unavailable correctly (#8060) ## Problem A period of unavailability for all pageservers in a cluster produced the following fallout in staging: all tenants became detached and required manual operation to re-attach. Manually restarting the storage controller re-attached all tenants due to a consistency bug. Turns out there are two related bugs which caused the issue: 1. Pageserver re-attach can be processed before the first heartbeat. Hence, when handling the availability delta produced by the heartbeater, `Node::get_availability_transition` claims that there's no need to reconfigure the node. 2. We would still attempt to reschedule tenant shards when handling offline transitions even if the entire cluster is down. This puts tenant shards into a state where the reconciler believes they have to be detached (no pageserver shows up in their intent state). This is doubly wrong because we don't mark the tenant shards as detached in the database, thus causing memory vs database consistency issues. Luckily, this bug allowed all tenant shards to re-attach after restart. ## Summary of changes * For (1), abuse the fact that re-attach requests do not contain an utilisation score and use that to differentiate from a node that replied to heartbeats. * For (2), introduce a special case that skips any rescheduling if the entire cluster is unavailable. * Update the storage controller heartbeat test with an extra scenario where the entire cluster goes for lunch. 
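To make fix (1) concrete, here is a minimal stand-alone sketch of the transition logic, using simplified stand-in types rather than the real `Node`/`NodeAvailability`/`UtilizationScore` definitions: a node that has only re-attached is recorded as `Active` with the worst possible utilization score, and the first heartbeat that reports a real score must be treated as a to-active transition so that shards with an unknown observed location get reconciled.

```rust
// Simplified sketch of the availability-transition idea; not the actual
// storage controller types, just enough to show why the extra match arm
// is needed after a re-attach.
#[derive(Clone, Copy, PartialEq, PartialOrd)]
struct UtilizationScore(u64);

impl UtilizationScore {
    // A re-attach request carries no utilization data, so the node is
    // recorded with the worst possible score until a heartbeat arrives.
    fn worst() -> Self {
        UtilizationScore(u64::MAX)
    }
}

#[derive(Clone, Copy)]
enum Availability {
    Active(UtilizationScore),
    Offline,
}

#[derive(Debug, PartialEq)]
enum Transition {
    ToActive,
    ToOffline,
    Unchanged,
}

fn availability_transition(current: Availability, next: Availability) -> Transition {
    use Availability::*;
    match (current, next) {
        (Offline, Active(_)) => Transition::ToActive,
        (Active(_), Offline) => Transition::ToOffline,
        // The node registered via re-attach (worst score) and has now answered
        // a heartbeat with a real score: treat it as coming online so its
        // shards get reconciled instead of reporting "Unchanged".
        (Active(cur), Active(new))
            if cur == UtilizationScore::worst() && new < UtilizationScore::worst() =>
        {
            Transition::ToActive
        }
        _ => Transition::Unchanged,
    }
}

fn main() {
    // A node that just re-attached: active, but with no real utilization data yet.
    let after_reattach = Availability::Active(UtilizationScore::worst());
    // The first real heartbeat reply carries an actual score.
    let first_heartbeat = Availability::Active(UtilizationScore(10));

    assert_eq!(
        availability_transition(after_reattach, first_heartbeat),
        Transition::ToActive
    );
    assert_eq!(
        availability_transition(Availability::Offline, first_heartbeat),
        Transition::ToActive
    );
}
```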
Fixes https://github.com/neondatabase/neon/issues/8044 --- storage_controller/src/heartbeater.rs | 4 + storage_controller/src/node.rs | 12 ++- storage_controller/src/service.rs | 88 +++++++++++++---- .../regress/test_storage_controller.py | 94 ++++++++++++------- 4 files changed, 141 insertions(+), 57 deletions(-) diff --git a/storage_controller/src/heartbeater.rs b/storage_controller/src/heartbeater.rs index 1ef97e78eb..14cda0a289 100644 --- a/storage_controller/src/heartbeater.rs +++ b/storage_controller/src/heartbeater.rs @@ -31,6 +31,7 @@ pub(crate) enum PageserverState { Available { last_seen_at: Instant, utilization: PageserverUtilization, + new: bool, }, Offline, } @@ -127,6 +128,7 @@ impl HeartbeaterTask { heartbeat_futs.push({ let jwt_token = self.jwt_token.clone(); let cancel = self.cancel.clone(); + let new_node = !self.state.contains_key(node_id); // Clone the node and mark it as available such that the request // goes through to the pageserver even when the node is marked offline. @@ -159,6 +161,7 @@ impl HeartbeaterTask { PageserverState::Available { last_seen_at: Instant::now(), utilization, + new: new_node, } } else { PageserverState::Offline @@ -220,6 +223,7 @@ impl HeartbeaterTask { } }, Vacant(_) => { + // This is a new node. Don't generate a delta for it. deltas.push((node_id, ps_state.clone())); } } diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index 7b5513c908..34dcf0c642 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -3,7 +3,7 @@ use std::{str::FromStr, time::Duration}; use pageserver_api::{ controller_api::{ NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy, - TenantLocateResponseShard, + TenantLocateResponseShard, UtilizationScore, }, shard::TenantShardId, }; @@ -116,6 +116,16 @@ impl Node { match (self.availability, availability) { (Offline, Active(_)) => ToActive, (Active(_), Offline) => ToOffline, + // Consider the case when the storage controller handles the re-attach of a node + // before the heartbeats detect that the node is back online. We still need + // [`Service::node_configure`] to attempt reconciliations for shards with an + // unknown observed location. + // The unsavoury match arm below handles this situation. + (Active(lhs), Active(rhs)) + if lhs == UtilizationScore::worst() && rhs < UtilizationScore::worst() => + { + ToActive + } _ => Unchanged, } } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index cf6a95bf0b..926332f946 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -12,7 +12,7 @@ use crate::{ id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, WrappedWriteGuard}, persistence::{AbortShardSplitStatus, TenantFilter}, reconciler::{ReconcileError, ReconcileUnits}, - scheduler::{ScheduleContext, ScheduleMode}, + scheduler::{MaySchedule, ScheduleContext, ScheduleMode}, tenant_shard::{ MigrateAttachment, ReconcileNeeded, ScheduleOptimization, ScheduleOptimizationAction, }, @@ -747,29 +747,61 @@ impl Service { let res = self.heartbeater.heartbeat(nodes).await; if let Ok(deltas) = res { for (node_id, state) in deltas.0 { - let new_availability = match state { - PageserverState::Available { utilization, .. } => NodeAvailability::Active( - UtilizationScore(utilization.utilization_score), + let (new_node, new_availability) = match state { + PageserverState::Available { + utilization, new, .. 
+ } => ( + new, + NodeAvailability::Active(UtilizationScore( + utilization.utilization_score, + )), ), - PageserverState::Offline => NodeAvailability::Offline, + PageserverState::Offline => (false, NodeAvailability::Offline), }; - let res = self - .node_configure(node_id, Some(new_availability), None) - .await; - match res { - Ok(()) => {} - Err(ApiError::NotFound(_)) => { - // This should be rare, but legitimate since the heartbeats are done - // on a snapshot of the nodes. - tracing::info!("Node {} was not found after heartbeat round", node_id); + if new_node { + // When the heartbeats detect a newly added node, we don't wish + // to attempt to reconcile the shards assigned to it. The node + // is likely handling it's re-attach response, so reconciling now + // would be counterproductive. + // + // Instead, update the in-memory state with the details learned about the + // node. + let mut locked = self.inner.write().unwrap(); + let (nodes, _tenants, scheduler) = locked.parts_mut(); + + let mut new_nodes = (**nodes).clone(); + + if let Some(node) = new_nodes.get_mut(&node_id) { + node.set_availability(new_availability); + scheduler.node_upsert(node); } - Err(err) => { - tracing::error!( - "Failed to update node {} after heartbeat round: {}", - node_id, - err - ); + + locked.nodes = Arc::new(new_nodes); + } else { + // This is the code path for geniune availability transitions (i.e node + // goes unavailable and/or comes back online). + let res = self + .node_configure(node_id, Some(new_availability), None) + .await; + + match res { + Ok(()) => {} + Err(ApiError::NotFound(_)) => { + // This should be rare, but legitimate since the heartbeats are done + // on a snapshot of the nodes. + tracing::info!( + "Node {} was not found after heartbeat round", + node_id + ); + } + Err(err) => { + tracing::error!( + "Failed to update node {} after heartbeat round: {}", + node_id, + err + ); + } } } } @@ -4316,6 +4348,16 @@ impl Service { continue; } + if !new_nodes + .values() + .any(|n| matches!(n.may_schedule(), MaySchedule::Yes(_))) + { + // Special case for when all nodes are unavailable and/or unschedulable: there is no point + // trying to reschedule since there's nowhere else to go. Without this + // branch we incorrectly detach tenants in response to node unavailability. + continue; + } + if tenant_shard.intent.demote_attached(scheduler, node_id) { tenant_shard.sequence = tenant_shard.sequence.next(); @@ -4353,6 +4395,12 @@ impl Service { // When a node comes back online, we must reconcile any tenant that has a None observed // location on the node. for tenant_shard in locked.tenants.values_mut() { + // If a reconciliation is already in progress, rely on the previous scheduling + // decision and skip triggering a new reconciliation. 
+ if tenant_shard.reconciler.is_some() { + continue; + } + if let Some(observed_loc) = tenant_shard.observed.locations.get_mut(&node_id) { if observed_loc.conf.is_none() { self.maybe_reconcile_shard(tenant_shard, &new_nodes); diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index f41468210c..8624a45f45 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -934,19 +934,27 @@ class Failure: def clear(self, env: NeonEnv): raise NotImplementedError() + def nodes(self): + raise NotImplementedError() + class NodeStop(Failure): - def __init__(self, pageserver_id, immediate): - self.pageserver_id = pageserver_id + def __init__(self, pageserver_ids, immediate): + self.pageserver_ids = pageserver_ids self.immediate = immediate def apply(self, env: NeonEnv): - pageserver = env.get_pageserver(self.pageserver_id) - pageserver.stop(immediate=self.immediate) + for ps_id in self.pageserver_ids: + pageserver = env.get_pageserver(ps_id) + pageserver.stop(immediate=self.immediate) def clear(self, env: NeonEnv): - pageserver = env.get_pageserver(self.pageserver_id) - pageserver.start() + for ps_id in self.pageserver_ids: + pageserver = env.get_pageserver(ps_id) + pageserver.start() + + def nodes(self): + return self.pageserver_ids class PageserverFailpoint(Failure): @@ -962,6 +970,9 @@ class PageserverFailpoint(Failure): pageserver = env.get_pageserver(self.pageserver_id) pageserver.http_client().configure_failpoints((self.failpoint, "off")) + def nodes(self): + return [self.pageserver_id] + def build_node_to_tenants_map(env: NeonEnv) -> dict[int, list[TenantId]]: tenants = env.storage_controller.tenant_list() @@ -985,8 +996,9 @@ def build_node_to_tenants_map(env: NeonEnv) -> dict[int, list[TenantId]]: @pytest.mark.parametrize( "failure", [ - NodeStop(pageserver_id=1, immediate=False), - NodeStop(pageserver_id=1, immediate=True), + NodeStop(pageserver_ids=[1], immediate=False), + NodeStop(pageserver_ids=[1], immediate=True), + NodeStop(pageserver_ids=[1, 2], immediate=True), PageserverFailpoint(pageserver_id=1, failpoint="get-utilization-http-handler"), ], ) @@ -1039,33 +1051,50 @@ def test_storage_controller_heartbeats( wait_until(10, 1, tenants_placed) # ... then we apply the failure - offline_node_id = failure.pageserver_id - online_node_id = (set(range(1, len(env.pageservers) + 1)) - {offline_node_id}).pop() - env.get_pageserver(offline_node_id).allowed_errors.append( - # In the case of the failpoint failure, the impacted pageserver - # still believes it has the tenant attached since location - # config calls into it will fail due to being marked offline. - ".*Dropped remote consistent LSN updates.*", - ) + offline_node_ids = set(failure.nodes()) + online_node_ids = set(range(1, len(env.pageservers) + 1)) - offline_node_ids + + for node_id in offline_node_ids: + env.get_pageserver(node_id).allowed_errors.append( + # In the case of the failpoint failure, the impacted pageserver + # still believes it has the tenant attached since location + # config calls into it will fail due to being marked offline. + ".*Dropped remote consistent LSN updates.*", + ) + + if len(offline_node_ids) > 1: + env.get_pageserver(node_id).allowed_errors.append( + ".*Scheduling error when marking pageserver.*offline.*", + ) failure.apply(env) # ... 
expecting the heartbeats to mark it offline - def node_offline(): + def nodes_offline(): nodes = env.storage_controller.node_list() log.info(f"{nodes=}") - target = next(n for n in nodes if n["id"] == offline_node_id) - assert target["availability"] == "Offline" + for node in nodes: + if node["id"] in offline_node_ids: + assert node["availability"] == "Offline" # A node is considered offline if the last successful heartbeat # was more than 10 seconds ago (hardcoded in the storage controller). - wait_until(20, 1, node_offline) + wait_until(20, 1, nodes_offline) # .. expecting the tenant on the offline node to be migrated def tenant_migrated(): + if len(online_node_ids) == 0: + time.sleep(5) + return + node_to_tenants = build_node_to_tenants_map(env) log.info(f"{node_to_tenants=}") - assert set(node_to_tenants[online_node_id]) == set(tenant_ids) + + observed_tenants = set() + for node_id in online_node_ids: + observed_tenants |= set(node_to_tenants[node_id]) + + assert observed_tenants == set(tenant_ids) wait_until(10, 1, tenant_migrated) @@ -1073,31 +1102,24 @@ def test_storage_controller_heartbeats( failure.clear(env) # ... expecting the offline node to become active again - def node_online(): + def nodes_online(): nodes = env.storage_controller.node_list() - target = next(n for n in nodes if n["id"] == offline_node_id) - assert target["availability"] == "Active" + for node in nodes: + if node["id"] in online_node_ids: + assert node["availability"] == "Active" - wait_until(10, 1, node_online) + wait_until(10, 1, nodes_online) time.sleep(5) - # ... then we create a new tenant - tid = TenantId.generate() - env.storage_controller.tenant_create(tid) - - # ... expecting it to be placed on the node that just came back online - tenants = env.storage_controller.tenant_list() - newest_tenant = next(t for t in tenants if t["tenant_shard_id"] == str(tid)) - locations = list(newest_tenant["observed"]["locations"].keys()) - locations = [int(node_id) for node_id in locations] - assert locations == [offline_node_id] + node_to_tenants = build_node_to_tenants_map(env) + log.info(f"Back online: {node_to_tenants=}") # ... expecting the storage controller to reach a consistent state def storage_controller_consistent(): env.storage_controller.consistency_check() - wait_until(10, 1, storage_controller_consistent) + wait_until(30, 1, storage_controller_consistent) def test_storage_controller_re_attach(neon_env_builder: NeonEnvBuilder): From b6e1c09c733a699cbaf76ff8508f9552eec1db7f Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 17 Jun 2024 12:47:20 +0100 Subject: [PATCH 0984/1571] CI(check-build-tools-image): change build-tools image persistent tag (#8059) ## Problem We don't rebuild `build-tools` image for changes in a workflow that builds this image itself (`.github/workflows/build-build-tools-image.yml`) or in a workflow that determines which tag to use (`.github/workflows/check-build-tools-image.yml`) ## Summary of changes - Use a hash of `Dockerfile.build-tools` and workflow files as a persistent tag instead of using a commit sha. 
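As an illustration of the idea (not part of the CI code): deriving the tag from the contents of the inputs means it only changes when one of those files changes. The workflow relies on GitHub Actions' built-in `hashFiles()`; a hypothetical sketch of the same scheme, with `DefaultHasher` standing in for the SHA-256 that `hashFiles()` actually uses, looks like this:

```rust
// Hypothetical helper, only to illustrate content-addressed image tags.
use std::collections::hash_map::DefaultHasher;
use std::fs;
use std::hash::{Hash, Hasher};
use std::io;

fn image_tag(inputs: &[&str]) -> io::Result<String> {
    let mut hasher = DefaultHasher::new();
    for path in inputs {
        // Hash the path as well as the bytes so renames also produce a new tag.
        path.hash(&mut hasher);
        fs::read(path)?.hash(&mut hasher);
    }
    Ok(format!("{:016x}", hasher.finish()))
}

fn main() -> io::Result<()> {
    let tag = image_tag(&[
        "Dockerfile.build-tools",
        ".github/workflows/check-build-tools-image.yml",
        ".github/workflows/build-build-tools-image.yml",
    ])?;
    println!("build-tools image tag: {tag}");
    Ok(())
}
```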
--- .github/workflows/build-build-tools-image.yml | 1 - .github/workflows/check-build-tools-image.yml | 23 ++++++------------- 2 files changed, 7 insertions(+), 17 deletions(-) diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index 2c994b08ae..6e90a80ab7 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -30,7 +30,6 @@ jobs: check-image: uses: ./.github/workflows/check-build-tools-image.yml - # This job uses older version of GitHub Actions because it's run on gen2 runners, which don't support node 20 (for newer versions) build-image: needs: [ check-image ] if: needs.check-image.outputs.found == 'false' diff --git a/.github/workflows/check-build-tools-image.yml b/.github/workflows/check-build-tools-image.yml index 97116940a0..807a9ef3bd 100644 --- a/.github/workflows/check-build-tools-image.yml +++ b/.github/workflows/check-build-tools-image.yml @@ -25,26 +25,17 @@ jobs: found: ${{ steps.check-image.outputs.found }} steps: + - uses: actions/checkout@v4 + - name: Get build-tools image tag for the current commit id: get-build-tools-tag env: - # Usually, for COMMIT_SHA, we use `github.event.pull_request.head.sha || github.sha`, but here, even for PRs, - # we want to use `github.sha` i.e. point to a phantom merge commit to determine the image tag correctly. - COMMIT_SHA: ${{ github.sha }} - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IMAGE_TAG: | + ${{ hashFiles('Dockerfile.build-tools', + '.github/workflows/check-build-tools-image.yml', + '.github/workflows/build-build-tools-image.yml') }} run: | - LAST_BUILD_TOOLS_SHA=$( - gh api \ - -H "Accept: application/vnd.github+json" \ - -H "X-GitHub-Api-Version: 2022-11-28" \ - --method GET \ - --field path=Dockerfile.build-tools \ - --field sha=${COMMIT_SHA} \ - --field per_page=1 \ - --jq ".[0].sha" \ - "/repos/${GITHUB_REPOSITORY}/commits" - ) - echo "image-tag=${LAST_BUILD_TOOLS_SHA}" | tee -a $GITHUB_OUTPUT + echo "image-tag=${IMAGE_TAG}" | tee -a $GITHUB_OUTPUT - name: Check if such tag found in the registry id: check-image From e729f282051a990e77edfe613c45749f9d3d5fbe Mon Sep 17 00:00:00 2001 From: MMeent Date: Mon, 17 Jun 2024 20:57:49 +0200 Subject: [PATCH 0985/1571] Fix log rates (#8035) ## Summary of changes - Stop logging HealthCheck message passing at INFO level (moved to DEBUG) - Stop logging /status accesses at INFO (moved to DEBUG) - Stop logging most occurances of `missing config file "compute_ctl_temp_override.conf"` - Log memory usage only when the data has changed significantly, or if we've not recently logged the data, rather than always every 2 seconds. --- compute_tools/src/compute.rs | 70 +++++++++++---------- compute_tools/src/config.rs | 19 +++--- compute_tools/src/http/api.rs | 4 +- libs/vm_monitor/src/cgroup.rs | 101 +++++++++++++++++++++++++++++- libs/vm_monitor/src/dispatcher.rs | 13 ++-- libs/vm_monitor/src/runner.rs | 23 ++++--- vm-image-spec.yaml | 5 +- 7 files changed, 176 insertions(+), 59 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 40060f4117..a79b666409 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -918,38 +918,39 @@ impl ComputeNode { // temporarily reset max_cluster_size in config // to avoid the possibility of hitting the limit, while we are reconfiguring: // creating new extensions, roles, etc... 
- config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?; - self.pg_reload_conf()?; + config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || { + self.pg_reload_conf()?; - let mut client = Client::connect(self.connstr.as_str(), NoTls)?; + let mut client = Client::connect(self.connstr.as_str(), NoTls)?; - // Proceed with post-startup configuration. Note, that order of operations is important. - // Disable DDL forwarding because control plane already knows about these roles/databases. - if spec.mode == ComputeMode::Primary { - client.simple_query("SET neon.forward_ddl = false")?; - cleanup_instance(&mut client)?; - handle_roles(&spec, &mut client)?; - handle_databases(&spec, &mut client)?; - handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?; - handle_grants( - &spec, - &mut client, - self.connstr.as_str(), - self.has_feature(ComputeFeature::AnonExtension), - )?; - handle_extensions(&spec, &mut client)?; - handle_extension_neon(&mut client)?; - // We can skip handle_migrations here because a new migration can only appear - // if we have a new version of the compute_ctl binary, which can only happen - // if compute got restarted, in which case we'll end up inside of apply_config - // instead of reconfigure. - } + // Proceed with post-startup configuration. Note, that order of operations is important. + // Disable DDL forwarding because control plane already knows about these roles/databases. + if spec.mode == ComputeMode::Primary { + client.simple_query("SET neon.forward_ddl = false")?; + cleanup_instance(&mut client)?; + handle_roles(&spec, &mut client)?; + handle_databases(&spec, &mut client)?; + handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?; + handle_grants( + &spec, + &mut client, + self.connstr.as_str(), + self.has_feature(ComputeFeature::AnonExtension), + )?; + handle_extensions(&spec, &mut client)?; + handle_extension_neon(&mut client)?; + // We can skip handle_migrations here because a new migration can only appear + // if we have a new version of the compute_ctl binary, which can only happen + // if compute got restarted, in which case we'll end up inside of apply_config + // instead of reconfigure. + } - // 'Close' connection - drop(client); + // 'Close' connection + drop(client); + + Ok(()) + })?; - // reset max_cluster_size in config back to original value and reload config - config::compute_ctl_temp_override_remove(pgdata_path)?; self.pg_reload_conf()?; let unknown_op = "unknown".to_string(); @@ -1040,12 +1041,17 @@ impl ComputeNode { // temporarily reset max_cluster_size in config // to avoid the possibility of hitting the limit, while we are applying config: // creating new extensions, roles, etc... 
- config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?; - self.pg_reload_conf()?; + config::with_compute_ctl_tmp_override( + pgdata_path, + "neon.max_cluster_size=-1", + || { + self.pg_reload_conf()?; - self.apply_config(&compute_state)?; + self.apply_config(&compute_state)?; - config::compute_ctl_temp_override_remove(pgdata_path)?; + Ok(()) + }, + )?; self.pg_reload_conf()?; } self.post_apply_config()?; diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 89c866b20c..2c4aec4116 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -131,18 +131,17 @@ pub fn write_postgres_conf( Ok(()) } -/// create file compute_ctl_temp_override.conf in pgdata_dir -/// add provided options to this file -pub fn compute_ctl_temp_override_create(pgdata_path: &Path, options: &str) -> Result<()> { +pub fn with_compute_ctl_tmp_override(pgdata_path: &Path, options: &str, exec: F) -> Result<()> +where + F: FnOnce() -> Result<()>, +{ let path = pgdata_path.join("compute_ctl_temp_override.conf"); let mut file = File::create(path)?; write!(file, "{}", options)?; - Ok(()) -} -/// remove file compute_ctl_temp_override.conf in pgdata_dir -pub fn compute_ctl_temp_override_remove(pgdata_path: &Path) -> Result<()> { - let path = pgdata_path.join("compute_ctl_temp_override.conf"); - std::fs::remove_file(path)?; - Ok(()) + let res = exec(); + + file.set_len(0)?; + + res } diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index 0286429cf2..43d29402bc 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -17,7 +17,7 @@ use hyper::header::CONTENT_TYPE; use hyper::service::{make_service_fn, service_fn}; use hyper::{Body, Method, Request, Response, Server, StatusCode}; use tokio::task; -use tracing::{error, info, warn}; +use tracing::{debug, error, info, warn}; use tracing_utils::http::OtelName; use utils::http::request::must_get_query_param; @@ -48,7 +48,7 @@ async fn routes(req: Request, compute: &Arc) -> Response { - info!("serving /status GET request"); + debug!("serving /status GET request"); let state = compute.state.lock().unwrap(); let status_response = status_response_from_state(&state); Response::new(Body::from(serde_json::to_string(&status_response).unwrap())) diff --git a/libs/vm_monitor/src/cgroup.rs b/libs/vm_monitor/src/cgroup.rs index 7160a42df2..3223765016 100644 --- a/libs/vm_monitor/src/cgroup.rs +++ b/libs/vm_monitor/src/cgroup.rs @@ -25,6 +25,8 @@ pub struct Config { /// /// For simplicity, this value must be greater than or equal to `memory_history_len`. memory_history_log_interval: usize, + /// The max number of iterations to skip before logging the next iteration + memory_history_log_noskip_interval: Duration, } impl Default for Config { @@ -33,6 +35,7 @@ impl Default for Config { memory_poll_interval: Duration::from_millis(100), memory_history_len: 5, // use 500ms of history for decision-making memory_history_log_interval: 20, // but only log every ~2s (otherwise it's spammy) + memory_history_log_noskip_interval: Duration::from_secs(15), // but only if it's changed, or 60 seconds have passed } } } @@ -85,7 +88,12 @@ impl CgroupWatcher { // buffer for samples that will be logged. once full, it remains so. 
let history_log_len = self.config.memory_history_log_interval; + let max_skip = self.config.memory_history_log_noskip_interval; let mut history_log_buf = vec![MemoryStatus::zeroed(); history_log_len]; + let mut last_logged_memusage = MemoryStatus::zeroed(); + + // Ensure that we're tracking a value that's definitely in the past, as Instant::now is only guaranteed to be non-decreasing on Rust's T1-supported systems. + let mut can_skip_logs_until = Instant::now() - max_skip; for t in 0_u64.. { ticker.tick().await; @@ -115,12 +123,24 @@ impl CgroupWatcher { // equal to the logging interval, we can just log the entire buffer every time we set // the last entry, which also means that for this log line, we can ignore that it's a // ring buffer (because all the entries are in order of increasing time). - if i == history_log_len - 1 { + // + // We skip logging the data if data hasn't meaningfully changed in a while, unless + // we've already ignored previous iterations for the last max_skip period. + if i == history_log_len - 1 + && (now > can_skip_logs_until + || !history_log_buf + .iter() + .all(|usage| last_logged_memusage.status_is_close_or_similar(usage))) + { info!( history = ?MemoryStatus::debug_slice(&history_log_buf), summary = ?summary, "Recent cgroup memory statistics history" ); + + can_skip_logs_until = now + max_skip; + + last_logged_memusage = *history_log_buf.last().unwrap(); } updates @@ -232,6 +252,24 @@ impl MemoryStatus { DS(slice) } + + /// Check if the other memory status is a close or similar result. + /// Returns true if the larger value is not larger than the smaller value + /// by 1/8 of the smaller value, and within 128MiB. + /// See tests::check_similarity_behaviour for examples of behaviour + fn status_is_close_or_similar(&self, other: &MemoryStatus) -> bool { + let margin; + let diff; + if self.non_reclaimable >= other.non_reclaimable { + margin = other.non_reclaimable / 8; + diff = self.non_reclaimable - other.non_reclaimable; + } else { + margin = self.non_reclaimable / 8; + diff = other.non_reclaimable - self.non_reclaimable; + } + + diff < margin && diff < 128 * 1024 * 1024 + } } #[cfg(test)] @@ -261,4 +299,65 @@ mod tests { assert_eq!(values(2, 4), [9, 0, 1, 2]); assert_eq!(values(2, 10), [3, 4, 5, 6, 7, 8, 9, 0, 1, 2]); } + + #[test] + fn check_similarity_behaviour() { + // This all accesses private methods, so we can't actually run this + // as doctests, because doctests run as an external crate. + let mut small = super::MemoryStatus { + non_reclaimable: 1024, + }; + let mut large = super::MemoryStatus { + non_reclaimable: 1024 * 1024 * 1024 * 1024, + }; + + // objects are self-similar, no matter the size + assert!(small.status_is_close_or_similar(&small)); + assert!(large.status_is_close_or_similar(&large)); + + // inequality is symmetric + assert!(!small.status_is_close_or_similar(&large)); + assert!(!large.status_is_close_or_similar(&small)); + + small.non_reclaimable = 64; + large.non_reclaimable = (small.non_reclaimable / 8) * 9; + + // objects are self-similar, no matter the size + assert!(small.status_is_close_or_similar(&small)); + assert!(large.status_is_close_or_similar(&large)); + + // values are similar if the larger value is larger by less than + // 12.5%, i.e. 1/8 of the smaller value. + // In the example above, large is exactly 12.5% larger, so this doesn't + // match. 
+ assert!(!small.status_is_close_or_similar(&large)); + assert!(!large.status_is_close_or_similar(&small)); + + large.non_reclaimable -= 1; + assert!(large.status_is_close_or_similar(&large)); + + assert!(small.status_is_close_or_similar(&large)); + assert!(large.status_is_close_or_similar(&small)); + + // The 1/8 rule only applies up to 128MiB of difference + small.non_reclaimable = 1024 * 1024 * 1024 * 1024; + large.non_reclaimable = small.non_reclaimable / 8 * 9; + assert!(small.status_is_close_or_similar(&small)); + assert!(large.status_is_close_or_similar(&large)); + + assert!(!small.status_is_close_or_similar(&large)); + assert!(!large.status_is_close_or_similar(&small)); + // the large value is put just above the threshold + large.non_reclaimable = small.non_reclaimable + 128 * 1024 * 1024; + assert!(large.status_is_close_or_similar(&large)); + + assert!(!small.status_is_close_or_similar(&large)); + assert!(!large.status_is_close_or_similar(&small)); + // now below + large.non_reclaimable -= 1; + assert!(large.status_is_close_or_similar(&large)); + + assert!(small.status_is_close_or_similar(&large)); + assert!(large.status_is_close_or_similar(&small)); + } } diff --git a/libs/vm_monitor/src/dispatcher.rs b/libs/vm_monitor/src/dispatcher.rs index c76baf04e7..6a965ace9b 100644 --- a/libs/vm_monitor/src/dispatcher.rs +++ b/libs/vm_monitor/src/dispatcher.rs @@ -12,11 +12,11 @@ use futures::{ stream::{SplitSink, SplitStream}, SinkExt, StreamExt, }; -use tracing::info; +use tracing::{debug, info}; use crate::protocol::{ - OutboundMsg, ProtocolRange, ProtocolResponse, ProtocolVersion, PROTOCOL_MAX_VERSION, - PROTOCOL_MIN_VERSION, + OutboundMsg, OutboundMsgKind, ProtocolRange, ProtocolResponse, ProtocolVersion, + PROTOCOL_MAX_VERSION, PROTOCOL_MIN_VERSION, }; /// The central handler for all communications in the monitor. @@ -118,7 +118,12 @@ impl Dispatcher { /// serialize the wrong thing and send it, since `self.sink.send` will take /// any string. pub async fn send(&mut self, message: OutboundMsg) -> anyhow::Result<()> { - info!(?message, "sending message"); + if matches!(&message.inner, OutboundMsgKind::HealthCheck { .. }) { + debug!(?message, "sending message"); + } else { + info!(?message, "sending message"); + } + let json = serde_json::to_string(&message).context("failed to serialize message")?; self.sink .send(Message::Text(json)) diff --git a/libs/vm_monitor/src/runner.rs b/libs/vm_monitor/src/runner.rs index ca02637ecf..36f8573a38 100644 --- a/libs/vm_monitor/src/runner.rs +++ b/libs/vm_monitor/src/runner.rs @@ -12,7 +12,7 @@ use axum::extract::ws::{Message, WebSocket}; use futures::StreamExt; use tokio::sync::{broadcast, watch}; use tokio_util::sync::CancellationToken; -use tracing::{error, info, warn}; +use tracing::{debug, error, info, warn}; use crate::cgroup::{self, CgroupWatcher}; use crate::dispatcher::Dispatcher; @@ -474,26 +474,29 @@ impl Runner { // there is a message from the agent msg = self.dispatcher.source.next() => { if let Some(msg) = msg { - // Don't use 'message' as a key as the string also uses - // that for its key - info!(?msg, "received message"); - match msg { + match &msg { Ok(msg) => { let message: InboundMsg = match msg { Message::Text(text) => { - serde_json::from_str(&text).context("failed to deserialize text message")? + serde_json::from_str(text).context("failed to deserialize text message")? 
} other => { warn!( // Don't use 'message' as a key as the // string also uses that for its key msg = ?other, - "agent should only send text messages but received different type" + "problem processing incoming message: agent should only send text messages but received different type" ); continue }, }; + if matches!(&message.inner, InboundMsgKind::HealthCheck { .. }) { + debug!(?msg, "received message"); + } else { + info!(?msg, "received message"); + } + let out = match self.process_message(message.clone()).await { Ok(Some(out)) => out, Ok(None) => continue, @@ -517,7 +520,11 @@ impl Runner { .await .context("failed to send message")?; } - Err(e) => warn!("{e}"), + Err(e) => warn!( + error = format!("{e}"), + msg = ?msg, + "received error message" + ), } } else { anyhow::bail!("dispatcher connection closed") diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 99164645a7..3c446ecdea 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -324,14 +324,15 @@ files: help: 'Whether or not the replication slot wal_status is lost' key_labels: - slot_name - values: [wal_status_is_lost] + values: [wal_is_lost] query: | SELECT slot_name, CASE WHEN wal_status = 'lost' THEN 1 ELSE 0 - END AS wal_status_is_lost + END AS wal_is_lost FROM pg_replication_slots; + - filename: neon_collector_autoscaling.yml content: | collector_name: neon_collector_autoscaling From 6c6a7f9acee6ee785c9993837bd6be349c294a92 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 18 Jun 2024 09:42:22 +0200 Subject: [PATCH 0986/1571] [v2] Include openssl and ICU statically linked (#8074) We had to revert the earlier static linking change due to libicu version incompatibilities: - original PR: https://github.com/neondatabase/neon/pull/7956 - revert PR: https://github.com/neondatabase/neon/pull/8003 Specifically, the problem manifests for existing projects as error ``` DETAIL: The collation in the database was created using version 153.120.42, but the operating system provides version 153.14.37. ``` So, this PR reintroduces the original change but with the exact same libicu version as in Debian `bullseye`, i.e., the libicu version that we're using today. This avoids the version incompatibility. Additional changes made by Christian ==================================== - `hashFiles` can take multiple arguments, use that feature - validation of the libicu tarball checksum - parallel build (`-j $(nproc)`) for openssl and libicu Follow-ups ========== Debian bullseye has a few patches on top of libicu: https://sources.debian.org/patches/icu/67.1-7/ We still decide whether we need to include these patches or not. => https://github.com/neondatabase/cloud/issues/14527 Eventually, we'll have to figure out an upgrade story for libicu. That work is tracked in epic https://github.com/neondatabase/cloud/issues/14525. The OpenSSL version in this PR is arbitrary. We should use `1.1.1w` + Debian patches if applicable. See https://github.com/neondatabase/cloud/issues/14526. 
Longer-term: * https://github.com/neondatabase/cloud/issues/14519 * https://github.com/neondatabase/cloud/issues/14525 Refs ==== Co-authored-by: Christian Schwarz refs https://github.com/neondatabase/cloud/issues/12648 --------- Co-authored-by: Rahul Patil --- .github/workflows/build_and_test.yml | 6 ++-- Dockerfile | 2 -- Dockerfile.build-tools | 42 ++++++++++++++++++++++++++++ Makefile | 15 +++++++++- 4 files changed, 59 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index bd2996ec4c..742716776e 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -299,21 +299,21 @@ jobs: uses: actions/cache@v4 with: path: pg_install/v14 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - name: Cache postgres v15 build id: cache_pg_15 uses: actions/cache@v4 with: path: pg_install/v15 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - name: Cache postgres v16 build id: cache_pg_16 uses: actions/cache@v4 with: path: pg_install/v16 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - name: Build postgres v14 if: steps.cache_pg_14.outputs.cache-hit != 'true' diff --git a/Dockerfile b/Dockerfile index 5f82df3e18..b4900d4a94 100644 --- a/Dockerfile +++ b/Dockerfile @@ -69,8 +69,6 @@ RUN set -e \ && apt install -y \ libreadline-dev \ libseccomp-dev \ - libicu67 \ - openssl \ ca-certificates \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \ && useradd -d /data neon \ diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index e7c61ace0e..5dd2c13c0e 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -112,6 +112,45 @@ RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JS && make install \ && rm -rf ../lcov.tar.gz +# Compile and install the static OpenSSL library +ENV OPENSSL_VERSION=3.2.2 +ENV OPENSSL_PREFIX=/usr/local/openssl +RUN wget -O /tmp/openssl-${OPENSSL_VERSION}.tar.gz https://www.openssl.org/source/openssl-${OPENSSL_VERSION}.tar.gz && \ + echo "197149c18d9e9f292c43f0400acaba12e5f52cacfe050f3d199277ea738ec2e7 /tmp/openssl-${OPENSSL_VERSION}.tar.gz" | sha256sum --check && \ + cd /tmp && \ + tar xzvf /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \ + rm /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \ + cd /tmp/openssl-${OPENSSL_VERSION} && \ + ./config --prefix=${OPENSSL_PREFIX} -static --static no-shared -fPIC && \ + make -j "$(nproc)" && \ + make install && \ + cd /tmp && \ + rm -rf /tmp/openssl-${OPENSSL_VERSION} + +# Use the same version of libicu as the compute nodes so that +# clusters created using inidb on pageserver can be used by computes. +# +# TODO: at this time, Dockerfile.compute-node uses the debian bullseye libicu +# package, which is 67.1. We're duplicating that knowledge here, and also, technically, +# Debian has a few patches on top of 67.1 that we're not adding here. 
+ENV ICU_VERSION=67.1 +ENV ICU_PREFIX=/usr/local/icu + +# Download and build static ICU +RUN wget -O /tmp/libicu-${ICU_VERSION}.tgz https://github.com/unicode-org/icu/releases/download/release-${ICU_VERSION//./-}/icu4c-${ICU_VERSION//./_}-src.tgz && \ + echo "94a80cd6f251a53bd2a997f6f1b5ac6653fe791dfab66e1eb0227740fb86d5dc /tmp/libicu-${ICU_VERSION}.tgz" | sha256sum --check && \ + mkdir /tmp/icu && \ + pushd /tmp/icu && \ + tar -xzf /tmp/libicu-${ICU_VERSION}.tgz && \ + pushd icu/source && \ + ./configure --prefix=${ICU_PREFIX} --enable-static --enable-shared=no CXXFLAGS="-fPIC" CFLAGS="-fPIC" && \ + make -j "$(nproc)" && \ + make install && \ + popd && \ + rm -rf icu && \ + rm -f /tmp/libicu-${ICU_VERSION}.tgz && \ + popd + # Switch to nonroot user USER nonroot:nonroot WORKDIR /home/nonroot @@ -170,3 +209,6 @@ RUN whoami \ && rustup --version --verbose \ && rustc --version --verbose \ && clang --version + +# Set following flag to check in Makefile if its running in Docker +RUN touch /home/nonroot/.docker_build diff --git a/Makefile b/Makefile index 37bd19ba44..942867d81a 100644 --- a/Makefile +++ b/Makefile @@ -3,6 +3,9 @@ ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Where to install Postgres, default is ./pg_install, maybe useful for package managers POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install/ +OPENSSL_PREFIX_DIR := /usr/local/openssl +ICU_PREFIX_DIR := /usr/local/icu + # # We differentiate between release / debug build types using the BUILD_TYPE # environment variable. @@ -20,6 +23,16 @@ else $(error Bad build type '$(BUILD_TYPE)', see Makefile for options) endif +ifeq ($(shell test -e /home/nonroot/.docker_build && echo -n yes),yes) + # Exclude static build openssl, icu for local build (MacOS, Linux) + # Only keep for build type release and debug + PG_CFLAGS += -I$(OPENSSL_PREFIX_DIR)/include + PG_CONFIGURE_OPTS += --with-icu + PG_CONFIGURE_OPTS += ICU_CFLAGS='-I/$(ICU_PREFIX_DIR)/include -DU_STATIC_IMPLEMENTATION' + PG_CONFIGURE_OPTS += ICU_LIBS='-L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -licui18n -licuuc -licudata -lstdc++ -Wl,-Bdynamic -lm' + PG_CONFIGURE_OPTS += LDFLAGS='-L$(OPENSSL_PREFIX_DIR)/lib -L$(OPENSSL_PREFIX_DIR)/lib64 -L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -Wl,-Bstatic -lssl -lcrypto -Wl,-Bdynamic -lrt -lm -ldl -lpthread' +endif + UNAME_S := $(shell uname -s) ifeq ($(UNAME_S),Linux) # Seccomp BPF is only available for Linux @@ -28,7 +41,7 @@ else ifeq ($(UNAME_S),Darwin) ifndef DISABLE_HOMEBREW # macOS with brew-installed openssl requires explicit paths # It can be configured with OPENSSL_PREFIX variable - OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3) + OPENSSL_PREFIX := $(shell brew --prefix openssl@3) PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig # macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure From ed9ffb9af2ce30eff88e9c6fcfe0c315d69e025b Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 18 Jun 2024 13:44:30 +0100 Subject: [PATCH 0987/1571] pageserver: eliminate CalculateSyntheticSizeError::LsnNotFound (`test_metric_collection` flake) (#8065) ## Problem ``` ERROR synthetic_size_worker: failed to calculate synthetic size for tenant ae449af30216ac56d2c1173f894b1122: Could not find size at 0/218CA70 in timeline d8da32b5e3e0bf18cfdb560f9de29638\n') ``` e.g. 
https://neon-github-public-dev.s3.amazonaws.com/reports/main/9518948590/index.html#/testresult/30a6d1e2471d2775 This test had allow lists but was disrupted by https://github.com/neondatabase/neon/pull/8051. In that PR, I had kept an error path in fill_logical_sizes that covered the case where we couldn't find sizes for some of the segments, but that path could only be hit in the case that some Timeline was shut down concurrently with a synthetic size calculation, so it makes sense to just leave the segment's size None in this case: the subsequent size calculations do not assume it is Some. ## Summary of changes - Remove `CalculateSyntheticSizeError::LsnNotFound` and just proceed in the case where we used to return it - Remove defunct allow list entries in `test_metric_collection` --- pageserver/src/tenant/size.rs | 11 ++--------- .../regress/test_pageserver_metric_collection.py | 6 ------ 2 files changed, 2 insertions(+), 15 deletions(-) diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index cdd5b0cbe7..b2338b620e 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -60,10 +60,6 @@ pub(crate) enum CalculateSyntheticSizeError { #[error(transparent)] Fatal(anyhow::Error), - /// The LSN we are trying to calculate a size at no longer exists at the point we query it - #[error("Could not find size at {lsn} in timeline {timeline_id}")] - LsnNotFound { timeline_id: TimelineId, lsn: Lsn }, - /// Tenant shut down while calculating size #[error("Cancelled")] Cancelled, @@ -375,9 +371,8 @@ pub(super) async fn gather_inputs( /// Augment 'segments' with logical sizes /// -/// this will probably conflict with on-demand downloaded layers, or at least force them all -/// to be downloaded -/// +/// This will leave segments' sizes as None if the Timeline associated with the segment is deleted concurrently +/// (i.e. we cannot read its logical size at a particular LSN). 
async fn fill_logical_sizes( timelines: &[Arc], segments: &mut [SegmentMeta], @@ -498,8 +493,6 @@ async fn fill_logical_sizes( if let Some(Some(size)) = sizes_needed.get(&(timeline_id, lsn)) { seg.segment.size = Some(*size); - } else { - return Err(CalculateSyntheticSizeError::LsnNotFound { timeline_id, lsn }); } } Ok(()) diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py index b0465f2a96..cea35a6acb 100644 --- a/test_runner/regress/test_pageserver_metric_collection.py +++ b/test_runner/regress/test_pageserver_metric_collection.py @@ -75,9 +75,6 @@ def test_metric_collection( env.pageserver.allowed_errors.extend( [ ".*metrics endpoint refused the sent metrics*", - # we have a fast rate of calculation, these can happen at shutdown - ".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*", - ".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes", ".*metrics_collection: failed to upload to S3: Failed to upload data of length .* to storage path.*", ] ) @@ -238,9 +235,6 @@ def test_metric_collection_cleans_up_tempfile( env.pageserver.allowed_errors.extend( [ ".*metrics endpoint refused the sent metrics*", - # we have a fast rate of calculation, these can happen at shutdown - ".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*", - ".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes", ] ) From d8b2a49c5574873eebeaf97cdc9d6d6531ff6f51 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 3 Jun 2024 15:47:21 +0300 Subject: [PATCH 0988/1571] safekeeper: streaming pull_timeline - Add /snapshot http endpoing streaming tar archive timeline contents up to flush_lsn. - Add check that term doesn't change, corresponding test passes now. - Also prepares infra to hold off WAL removal during the basebackup. - Sprinkle fsyncs to persist the pull_timeline result. 
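For context, the streaming part of the new `/snapshot` endpoint follows a common hyper pattern: a background task produces chunks, pushes them through an `mpsc` channel, and the receiver is exposed as a streaming `Body`. A simplified sketch of that pattern (a single file instead of a tar archive, minimal error handling, not the actual safekeeper code):

```rust
// Minimal sketch: stream a file's contents as an HTTP response body by pushing
// chunks through an mpsc channel wrapped as a hyper Body.
use std::io;

use bytes::Bytes;
use hyper::Body;
use tokio::io::AsyncReadExt;
use tokio::sync::mpsc;
use tokio_stream::wrappers::ReceiverStream;

async fn stream_file_as_body(path: std::path::PathBuf) -> Body {
    // Small channel: the producer task is back-pressured by the HTTP consumer.
    let (tx, rx) = mpsc::channel::<io::Result<Bytes>>(1);

    tokio::spawn(async move {
        let mut file = match tokio::fs::File::open(&path).await {
            Ok(f) => f,
            Err(e) => {
                let _ = tx.send(Err(e)).await;
                return;
            }
        };
        let mut buf = vec![0u8; 64 * 1024];
        loop {
            match file.read(&mut buf).await {
                Ok(0) => break, // EOF
                Ok(n) => {
                    if tx.send(Ok(Bytes::copy_from_slice(&buf[..n]))).await.is_err() {
                        break; // client went away
                    }
                }
                Err(e) => {
                    let _ = tx.send(Err(e)).await;
                    break;
                }
            }
        }
    });

    // hyper turns the stream of Result<Bytes, _> chunks into a response body.
    Body::wrap_stream(ReceiverStream::new(rx))
}
```

The actual handler instead spawns `pull_timeline::stream_snapshot`, which writes a tar archive of the timeline contents into the sending half of the channel.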
ref https://github.com/neondatabase/neon/issues/6340 --- Cargo.lock | 1 + safekeeper/Cargo.toml | 1 + safekeeper/src/control_file.rs | 2 +- safekeeper/src/http/routes.rs | 121 ++++---- safekeeper/src/pull_timeline.rs | 351 +++++++++++++++++------ safekeeper/src/safekeeper.rs | 3 + safekeeper/src/timeline.rs | 4 +- safekeeper/src/wal_storage.rs | 52 ++-- test_runner/regress/test_wal_acceptor.py | 27 +- 9 files changed, 367 insertions(+), 195 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1c8a8b0c0f..5eac648fd9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5158,6 +5158,7 @@ dependencies = [ "tokio-io-timeout", "tokio-postgres", "tokio-stream", + "tokio-tar", "tokio-util", "toml_edit", "tracing", diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index c8b732fee1..a650d5e207 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -46,6 +46,7 @@ tokio = { workspace = true, features = ["fs"] } tokio-util = { workspace = true } tokio-io-timeout.workspace = true tokio-postgres.workspace = true +tokio-tar.workspace = true toml_edit.workspace = true tracing.workspace = true url.workspace = true diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index e9bb5202da..9d65187350 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -23,7 +23,7 @@ pub const SK_MAGIC: u32 = 0xcafeceefu32; pub const SK_FORMAT_VERSION: u32 = 8; // contains persistent metadata for safekeeper -const CONTROL_FILE_NAME: &str = "safekeeper.control"; +pub const CONTROL_FILE_NAME: &str = "safekeeper.control"; // needed to atomically update the state using `rename` const CONTROL_FILE_NAME_PARTIAL: &str = "safekeeper.control.partial"; pub const CHECKSUM_SIZE: usize = std::mem::size_of::(); diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 1e29b21fac..40ac2c105d 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -1,38 +1,25 @@ use hyper::{Body, Request, Response, StatusCode, Uri}; - use once_cell::sync::Lazy; -use postgres_ffi::WAL_SEGMENT_SIZE; -use safekeeper_api::models::{SkTimelineInfo, TimelineCopyRequest}; use serde::{Deserialize, Serialize}; use std::collections::{HashMap, HashSet}; use std::fmt; +use std::io::Write as _; use std::str::FromStr; use std::sync::Arc; use storage_broker::proto::SafekeeperTimelineInfo; use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; -use tokio::fs::File; -use tokio::io::AsyncReadExt; +use tokio::sync::mpsc; +use tokio::task; +use tokio_stream::wrappers::ReceiverStream; use tokio_util::sync::CancellationToken; +use tracing::{info_span, Instrument}; use utils::failpoint_support::failpoints_handler; +use utils::http::endpoint::{prometheus_metrics_handler, request_span, ChannelWriter}; use utils::http::request::parse_query_param; -use std::io::Write as _; -use tokio::sync::mpsc; -use tokio_stream::wrappers::ReceiverStream; -use tracing::{info_span, Instrument}; -use utils::http::endpoint::{prometheus_metrics_handler, request_span, ChannelWriter}; - -use crate::debug_dump::TimelineDigestRequest; -use crate::receive_wal::WalReceiverState; -use crate::safekeeper::Term; -use crate::safekeeper::{ServerInfo, TermLsn}; -use crate::send_wal::WalSenderState; -use crate::timeline::PeerInfo; -use crate::{copy_timeline, debug_dump, patch_control_file, pull_timeline}; - -use crate::timelines_global_map::TimelineDeleteForceResult; -use crate::GlobalTimelines; -use crate::SafeKeeperConf; +use postgres_ffi::WAL_SEGMENT_SIZE; +use 
safekeeper_api::models::TimelineCreateRequest; +use safekeeper_api::models::{SkTimelineInfo, TimelineCopyRequest}; use utils::{ auth::SwappableJwtAuth, http::{ @@ -46,7 +33,16 @@ use utils::{ lsn::Lsn, }; -use super::models::TimelineCreateRequest; +use crate::debug_dump::TimelineDigestRequest; +use crate::receive_wal::WalReceiverState; +use crate::safekeeper::Term; +use crate::safekeeper::{ServerInfo, TermLsn}; +use crate::send_wal::WalSenderState; +use crate::timeline::PeerInfo; +use crate::timelines_global_map::TimelineDeleteForceResult; +use crate::GlobalTimelines; +use crate::SafeKeeperConf; +use crate::{copy_timeline, debug_dump, patch_control_file, pull_timeline}; #[derive(Debug, Serialize)] struct SafekeeperStatus { @@ -206,6 +202,42 @@ async fn timeline_pull_handler(mut request: Request) -> Result) -> Result, ApiError> { + let ttid = TenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "timeline_id")?, + ); + check_permission(&request, Some(ttid.tenant_id))?; + + let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?; + // Note: with evicted timelines it should work better then de-evict them and + // stream; probably start_snapshot would copy partial s3 file to dest path + // and stream control file, or return FullAccessTimeline if timeline is not + // evicted. + let tli = tli + .full_access_guard() + .await + .map_err(ApiError::InternalServerError)?; + + // To stream the body use wrap_stream which wants Stream of Result, + // so create the chan and write to it in another task. + let (tx, rx) = mpsc::channel(1); + + task::spawn(pull_timeline::stream_snapshot(tli, tx)); + + let rx_stream = ReceiverStream::new(rx); + let body = Body::wrap_stream(rx_stream); + + let response = Response::builder() + .status(200) + .header(hyper::header::CONTENT_TYPE, "application/octet-stream") + .body(body) + .unwrap(); + + Ok(response) +} + async fn timeline_copy_handler(mut request: Request) -> Result, ApiError> { check_permission(&request, None)?; @@ -260,41 +292,6 @@ async fn timeline_digest_handler(request: Request) -> Result) -> Result, ApiError> { - let ttid = TenantTimelineId::new( - parse_request_param(&request, "tenant_id")?, - parse_request_param(&request, "timeline_id")?, - ); - check_permission(&request, Some(ttid.tenant_id))?; - - let filename: String = parse_request_param(&request, "filename")?; - - let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?; - let tli = tli - .full_access_guard() - .await - .map_err(ApiError::InternalServerError)?; - - let filepath = tli.get_timeline_dir().join(filename); - let mut file = File::open(&filepath) - .await - .map_err(|e| ApiError::InternalServerError(e.into()))?; - - let mut content = Vec::new(); - // TODO: don't store files in memory - file.read_to_end(&mut content) - .await - .map_err(|e| ApiError::InternalServerError(e.into()))?; - - Response::builder() - .status(StatusCode::OK) - .header("Content-Type", "application/octet-stream") - .body(Body::from(content)) - .map_err(|e| ApiError::InternalServerError(e.into())) -} - /// Force persist control file. 
async fn timeline_checkpoint_handler(request: Request) -> Result, ApiError> { check_permission(&request, None)?; @@ -566,13 +563,13 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder .delete("/v1/tenant/:tenant_id", |r| { request_span(r, tenant_delete_handler) }) + .get( + "/v1/tenant/:tenant_id/timeline/:timeline_id/snapshot", + |r| request_span(r, timeline_snapshot_handler), + ) .post("/v1/pull_timeline", |r| { request_span(r, timeline_pull_handler) }) - .get( - "/v1/tenant/:tenant_id/timeline/:timeline_id/file/:filename", - |r| request_span(r, timeline_files_handler), - ) .post( "/v1/tenant/:tenant_id/timeline/:source_timeline_id/copy", |r| request_span(r, timeline_copy_handler), diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index 7b41c98cb8..4099a324f9 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -1,28 +1,223 @@ -use std::sync::Arc; - +use anyhow::{anyhow, bail, Context, Result}; +use bytes::Bytes; use camino::Utf8PathBuf; use camino_tempfile::Utf8TempDir; use chrono::{DateTime, Utc}; +use futures::{SinkExt, StreamExt, TryStreamExt}; +use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; use serde::{Deserialize, Serialize}; +use std::{ + cmp::min, + io::{self, ErrorKind}, + sync::Arc, +}; +use tokio::{ + fs::{File, OpenOptions}, + io::AsyncWrite, + sync::mpsc, +}; +use tokio_tar::{Archive, Builder}; +use tokio_util::{ + io::{CopyToBytes, SinkWriter}, + sync::PollSender, +}; +use tracing::{error, info, instrument}; -use anyhow::{bail, Context, Result}; -use tokio::io::AsyncWriteExt; -use tracing::info; +use crate::{ + control_file::{self, CONTROL_FILE_NAME}, + debug_dump, + http::routes::TimelineStatus, + safekeeper::Term, + timeline::{get_tenant_dir, get_timeline_dir, FullAccessTimeline, Timeline, TimelineError}, + wal_storage::{self, open_wal_file, Storage}, + GlobalTimelines, SafeKeeperConf, +}; use utils::{ + crashsafe::{durable_rename, fsync_async_opt}, id::{TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, pausable_failpoint, }; -use crate::{ - control_file, debug_dump, - http::routes::TimelineStatus, - timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError}, - wal_storage::{self, Storage}, - GlobalTimelines, SafeKeeperConf, -}; +/// Stream tar archive of timeline to tx. +#[instrument(name = "snapshot", skip_all, fields(ttid = %tli.ttid))] +pub async fn stream_snapshot(tli: FullAccessTimeline, tx: mpsc::Sender>) { + if let Err(e) = stream_snapshot_guts(tli, tx.clone()).await { + // Error type/contents don't matter as they won't can't reach the client + // (hyper likely doesn't do anything with it), but http stream will be + // prematurely terminated. It would be nice to try to send the error in + // trailers though. + tx.send(Err(anyhow!("snapshot failed"))).await.ok(); + error!("snapshot failed: {:#}", e); + } +} -/// Info about timeline on safekeeper ready for reporting. +/// State needed while streaming the snapshot. +pub struct SnapshotContext { + pub from_segno: XLogSegNo, // including + pub upto_segno: XLogSegNo, // including + pub term: Term, + pub last_log_term: Term, + pub flush_lsn: Lsn, + pub wal_seg_size: usize, + // used to remove WAL hold off in Drop. 
+ pub tli: FullAccessTimeline, +} + +impl Drop for SnapshotContext { + fn drop(&mut self) { + // todo: spawn task removing WAL gc hold off + } +} + +pub async fn stream_snapshot_guts( + tli: FullAccessTimeline, + tx: mpsc::Sender>, +) -> Result<()> { + // tokio-tar wants Write implementor, but we have mpsc tx >; + // use SinkWriter as a Write impl. That is, + // - create Sink from the tx. It returns PollSendError if chan is closed. + let sink = PollSender::new(tx); + // - SinkWriter needs sink error to be io one, map it. + let sink_io_err = sink.sink_map_err(|_| io::Error::from(ErrorKind::BrokenPipe)); + // - SinkWriter wants sink type to be just Bytes, not Result, so map + // it with with(). Note that with() accepts async function which we don't + // need and allows the map to fail, which we don't need either, but hence + // two Oks. + let oksink = sink_io_err.with(|b: Bytes| async { io::Result::Ok(Result::Ok(b)) }); + // - SinkWriter (not surprisingly) wants sink of &[u8], not bytes, so wrap + // into CopyToBytes. This is a data copy. + let copy_to_bytes = CopyToBytes::new(oksink); + let mut writer = SinkWriter::new(copy_to_bytes); + let pinned_writer = std::pin::pin!(writer); + + // Note that tokio_tar append_* funcs use tokio::io::copy with 8KB buffer + // which is also likely suboptimal. + let mut ar = Builder::new_non_terminated(pinned_writer); + + let bctx = tli.start_snapshot(&mut ar).await?; + pausable_failpoint!("sk-snapshot-after-list-pausable"); + + let tli_dir = tli.get_timeline_dir(); + info!( + "sending {} segments [{:#X}-{:#X}], term={}, last_log_term={}, flush_lsn={}", + bctx.upto_segno - bctx.from_segno + 1, + bctx.from_segno, + bctx.upto_segno, + bctx.term, + bctx.last_log_term, + bctx.flush_lsn, + ); + for segno in bctx.from_segno..=bctx.upto_segno { + let (mut sf, is_partial) = open_wal_file(&tli_dir, segno, bctx.wal_seg_size).await?; + let mut wal_file_name = XLogFileName(PG_TLI, segno, bctx.wal_seg_size); + if is_partial { + wal_file_name.push_str(".partial"); + } + ar.append_file(&wal_file_name, &mut sf).await?; + } + + // Do the term check before ar.finish to make archive corrupted in case of + // term change. Client shouldn't ignore abrupt stream end, but to be sure. + tli.finish_snapshot(&bctx).await?; + + ar.finish().await?; + + Ok(()) +} + +impl FullAccessTimeline { + /// Start streaming tar archive with timeline: + /// 1) stream control file under lock; + /// 2) hold off WAL removal; + /// 3) collect SnapshotContext to understand which WAL segments should be + /// streamed. + /// + /// Snapshot streams data up to flush_lsn. To make this safe, we must check + /// that term doesn't change during the procedure, or we risk sending mix of + /// WAL from different histories. Term is remembered in the SnapshotContext + /// and checked in finish_snapshot. Note that in the last segment some WAL + /// higher than flush_lsn set here might be streamed; that's fine as long as + /// terms doesn't change. + /// + /// Alternatively we could send only up to commit_lsn to get some valid + /// state which later will be recovered by compute, in this case term check + /// is not needed, but we likely don't want that as there might be no + /// compute which could perform the recovery. + /// + /// When returned SnapshotContext is dropped WAL hold is removed. 
+ async fn start_snapshot( + &self, + ar: &mut tokio_tar::Builder, + ) -> Result { + let shared_state = self.read_shared_state().await; + + let cf_path = self.get_timeline_dir().join(CONTROL_FILE_NAME); + let mut cf = File::open(cf_path).await?; + ar.append_file(CONTROL_FILE_NAME, &mut cf).await?; + + // We need to stream since the oldest segment someone (s3 or pageserver) + // still needs. This duplicates calc_horizon_lsn logic. + let from_lsn = min( + shared_state.sk.state.remote_consistent_lsn, + shared_state.sk.state.backup_lsn, + ); + if from_lsn == Lsn::INVALID { + // this is possible if snapshot is called before handling first + // elected message + bail!("snapshot is called on uninitialized timeline"); + } + let from_segno = from_lsn.segment_number(shared_state.get_wal_seg_size()); + let term = shared_state.sk.get_term(); + let last_log_term = shared_state.sk.get_last_log_term(); + let flush_lsn = shared_state.sk.flush_lsn(); + let upto_segno = flush_lsn.segment_number(shared_state.get_wal_seg_size()); + // have some limit on max number of segments as a sanity check + const MAX_ALLOWED_SEGS: u64 = 1000; + let num_segs = upto_segno - from_segno + 1; + if num_segs > MAX_ALLOWED_SEGS { + bail!( + "snapshot is called on timeline with {} segments, but the limit is {}", + num_segs, + MAX_ALLOWED_SEGS + ); + } + + // TODO: set WAL hold off. + + let bctx = SnapshotContext { + from_segno, + upto_segno, + term, + last_log_term, + flush_lsn, + wal_seg_size: shared_state.get_wal_seg_size(), + tli: self.clone(), + }; + + Ok(bctx) + } + + /// Finish snapshotting: check that term(s) hasn't changed. + /// + /// Note that WAL gc hold off is removed in Drop of SnapshotContext to not + /// forget this if snapshotting fails mid the way. + pub async fn finish_snapshot(&self, bctx: &SnapshotContext) -> Result<()> { + let shared_state = self.read_shared_state().await; + let term = shared_state.sk.get_term(); + let last_log_term = shared_state.sk.get_last_log_term(); + // There are some cases to relax this check (e.g. last_log_term might + // change, but as long as older history is strictly part of new that's + // fine), but there is no need to do it. + if bctx.term != term || bctx.last_log_term != last_log_term { + bail!("term(s) changed during snapshot: were term={}, last_log_term={}, now term={}, last_log_term={}", + bctx.term, bctx.last_log_term, term, last_log_term); + } + Ok(()) + } +} + +/// pull_timeline request body. #[derive(Debug, Serialize, Deserialize)] pub struct Request { pub tenant_id: TenantId, @@ -72,13 +267,15 @@ pub async fn handle_request(request: Request) -> Result { let mut statuses = Vec::new(); for (i, response) in responses.into_iter().enumerate() { - let response = response.context(format!("Failed to get status from {}", http_hosts[i]))?; + let response = response.context(format!("fetching status from {}", http_hosts[i]))?; + response + .error_for_status_ref() + .context(format!("checking status from {}", http_hosts[i]))?; let status: crate::http::routes::TimelineStatus = response.json().await?; statuses.push((status, i)); } // Find the most advanced safekeeper - // TODO: current logic may be wrong, fix it later let (status, i) = statuses .into_iter() .max_by_key(|(status, _)| { @@ -111,95 +308,59 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result let conf = &GlobalTimelines::get_global_config(); - let client = reqwest::Client::new(); - // TODO: don't use debug dump, it should be used only in tests. 
- // This is a proof of concept, we should figure out a way - // to use scp without implementing it manually. + let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?; - // Implementing our own scp over HTTP. - // At first, we need to fetch list of files from safekeeper. - let dump: DebugDumpResponse = client + let client = reqwest::Client::new(); + + // Request stream with basebackup archive. + let bb_resp = client .get(format!( - "{}/v1/debug_dump?dump_all=true&tenant_id={}&timeline_id={}", + "{}/v1/tenant/{}/timeline/{}/snapshot", host, status.tenant_id, status.timeline_id )) .send() - .await? - .json() .await?; + bb_resp.error_for_status_ref()?; - if dump.timelines.len() != 1 { - bail!( - "expected to fetch single timeline, got {} timelines", - dump.timelines.len() - ); - } + // Make Stream of Bytes from it... + let bb_stream = bb_resp.bytes_stream().map_err(std::io::Error::other); + // and turn it into StreamReader implementing AsyncRead. + let bb_reader = tokio_util::io::StreamReader::new(bb_stream); - let timeline = dump.timelines.into_iter().next().unwrap(); - let disk_content = timeline.disk_content.ok_or(anyhow::anyhow!( - "timeline {} doesn't have disk content", - ttid - ))?; - - let mut filenames = disk_content - .files - .iter() - .map(|file| file.name.clone()) - .collect::>(); - - // Sort filenames to make sure we pull files in correct order - // After sorting, we should have: - // - 000000010000000000000001 - // - ... - // - 000000010000000000000002.partial - // - safekeeper.control - filenames.sort(); - - // safekeeper.control should be the first file, so we need to move it to the beginning - let control_file_index = filenames - .iter() - .position(|name| name == "safekeeper.control") - .ok_or(anyhow::anyhow!("safekeeper.control not found"))?; - filenames.remove(control_file_index); - filenames.insert(0, "safekeeper.control".to_string()); - - pausable_failpoint!("sk-pull-timeline-after-list-pausable"); - - info!( - "downloading {} files from safekeeper {}", - filenames.len(), - host - ); - - let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?; - - // Note: some time happens between fetching list of files and fetching files themselves. - // It's possible that some files will be removed from safekeeper and we will fail to fetch them. - // This function will fail in this case, should be retried by the caller. - for filename in filenames { - let file_path = tli_dir_path.join(&filename); - // /v1/tenant/:tenant_id/timeline/:timeline_id/file/:filename - let http_url = format!( - "{}/v1/tenant/{}/timeline/{}/file/{}", - host, status.tenant_id, status.timeline_id, filename - ); - - let mut file = tokio::fs::File::create(&file_path).await?; - let mut response = client.get(&http_url).send().await?; - if response.status() != reqwest::StatusCode::OK { - bail!( - "pulling file {} failed: status is {}", - filename, - response.status() - ); - } - while let Some(chunk) = response.chunk().await? { - file.write_all(&chunk).await?; - file.flush().await?; + // Extract it on the fly to the disk. We don't use simple unpack() to fsync + // files. 
+ let mut entries = Archive::new(bb_reader).entries()?; + while let Some(base_tar_entry) = entries.next().await { + let mut entry = base_tar_entry?; + let header = entry.header(); + let file_path = header.path()?.into_owned(); + match header.entry_type() { + tokio_tar::EntryType::Regular => { + let utf8_file_path = + Utf8PathBuf::from_path_buf(file_path).expect("non-Unicode path"); + let dst_path = tli_dir_path.join(utf8_file_path); + let mut f = OpenOptions::new() + .create(true) + .truncate(true) + .write(true) + .open(&dst_path) + .await?; + tokio::io::copy(&mut entry, &mut f).await?; + // fsync the file + f.sync_all().await?; + } + _ => { + bail!( + "entry {} in backup tar archive is of unexpected type: {:?}", + file_path.display(), + header.entry_type() + ); + } } } - // TODO: fsync? + // fsync temp timeline directory to remember its contents. + fsync_async_opt(&tli_dir_path, !conf.no_sync).await?; // Let's create timeline from temp directory and verify that it's correct let (commit_lsn, flush_lsn) = validate_temp_timeline(conf, ttid, &tli_dir_path).await?; @@ -290,7 +451,9 @@ pub async fn load_temp_timeline( ttid, tmp_path, timeline_path ); tokio::fs::create_dir_all(get_tenant_dir(conf, &ttid.tenant_id)).await?; - tokio::fs::rename(tmp_path, &timeline_path).await?; + // fsync tenant dir creation + fsync_async_opt(&conf.workdir, !conf.no_sync).await?; + durable_rename(tmp_path, &timeline_path, !conf.no_sync).await?; let tli = GlobalTimelines::load_timeline(&guard, ttid) .await diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 563dbbe315..ae230960ae 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -780,6 +780,9 @@ where // Initializing backup_lsn is useful to avoid making backup think it should upload 0 segment. 
state.backup_lsn = max(state.backup_lsn, state.timeline_start_lsn); + // similar for remote_consistent_lsn + state.remote_consistent_lsn = + max(state.remote_consistent_lsn, state.timeline_start_lsn); state.acceptor_state.term_history = msg.term_history.clone(); self.state.finish_change(&state).await?; diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 148a7e90bd..e510a05a32 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -4,7 +4,7 @@ use anyhow::{anyhow, bail, Result}; use camino::Utf8PathBuf; use serde::{Deserialize, Serialize}; -use tokio::fs; +use tokio::fs::{self}; use tokio_util::sync::CancellationToken; use utils::id::TenantId; @@ -225,7 +225,7 @@ impl SharedState { }) } - fn get_wal_seg_size(&self) -> usize { + pub(crate) fn get_wal_seg_size(&self) -> usize { self.sk.state.server.wal_seg_size as usize } diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 45e27e1951..0c1731937c 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -684,13 +684,12 @@ impl WalReader { let xlogoff = self.pos.segment_offset(self.wal_seg_size); let segno = self.pos.segment_number(self.wal_seg_size); let wal_file_name = XLogFileName(PG_TLI, segno, self.wal_seg_size); - let wal_file_path = self.timeline_dir.join(&wal_file_name); // Try to open local file, if we may have WAL locally if self.pos >= self.local_start_lsn { - let res = Self::open_wal_file(&wal_file_path).await; + let res = open_wal_file(&self.timeline_dir, segno, self.wal_seg_size).await; match res { - Ok(mut file) => { + Ok((mut file, _)) => { file.seek(SeekFrom::Start(xlogoff as u64)).await?; return Ok(Box::pin(file)); } @@ -718,25 +717,6 @@ impl WalReader { bail!("WAL segment is not found") } - - /// Helper function for opening a wal file. - async fn open_wal_file(wal_file_path: &Utf8Path) -> Result { - // First try to open the .partial file. - let mut partial_path = wal_file_path.to_owned(); - partial_path.set_extension("partial"); - if let Ok(opened_file) = tokio::fs::File::open(&partial_path).await { - return Ok(opened_file); - } - - // If that failed, try it without the .partial extension. - tokio::fs::File::open(&wal_file_path) - .await - .with_context(|| format!("Failed to open WAL file {:?}", wal_file_path)) - .map_err(|e| { - warn!("{}", e); - e - }) - } } /// Zero block for filling created WAL segments. @@ -758,6 +738,34 @@ async fn write_zeroes(file: &mut File, mut count: usize) -> Result<()> { Ok(()) } +/// Helper function for opening WAL segment `segno` in `dir`. Returns file and +/// whether it is .partial. +pub(crate) async fn open_wal_file( + timeline_dir: &Utf8Path, + segno: XLogSegNo, + wal_seg_size: usize, +) -> Result<(tokio::fs::File, bool)> { + let (wal_file_path, wal_file_partial_path) = wal_file_paths(timeline_dir, segno, wal_seg_size)?; + + // First try to open the .partial file. + let mut partial_path = wal_file_path.to_owned(); + partial_path.set_extension("partial"); + if let Ok(opened_file) = tokio::fs::File::open(&wal_file_partial_path).await { + return Ok((opened_file, true)); + } + + // If that failed, try it without the .partial extension. + let pf = tokio::fs::File::open(&wal_file_path) + .await + .with_context(|| format!("failed to open WAL file {:#}", wal_file_path)) + .map_err(|e| { + warn!("{}", e); + e + })?; + + Ok((pf, false)) +} + /// Helper returning full path to WAL segment file and its .partial brother. 
pub fn wal_file_paths( timeline_dir: &Utf8Path, diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index dce30f5388..11aeb8f182 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -317,9 +317,9 @@ def test_broker(neon_env_builder: NeonEnvBuilder): time.sleep(1) # Ensure that safekeepers don't lose remote_consistent_lsn on restart. - # Control file is persisted each 5s. TODO: do that on shutdown and remove sleep. - time.sleep(6) for sk in env.safekeepers: + # force persist cfile + sk.http_client().checkpoint(tenant_id, timeline_id) sk.stop() sk.start() stat_after_restart = [cli.timeline_status(tenant_id, timeline_id) for cli in clients] @@ -1749,11 +1749,11 @@ def test_pull_timeline(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 4 env = neon_env_builder.init_start() tenant_id = env.initial_tenant - timeline_id = env.neon_cli.create_branch("test_pull_timeline") + timeline_id = env.initial_timeline log.info("Use only first 3 safekeepers") env.safekeepers[3].stop() - endpoint = env.endpoints.create("test_pull_timeline") + endpoint = env.endpoints.create("main") endpoint.active_safekeepers = [1, 2, 3] endpoint.start() @@ -1787,7 +1787,7 @@ def test_pull_timeline(neon_env_builder: NeonEnvBuilder): show_statuses(env.safekeepers, tenant_id, timeline_id) log.info("Restarting compute with new config to verify that it works") - endpoint.stop_and_destroy().create("test_pull_timeline") + endpoint.stop_and_destroy().create("main") endpoint.active_safekeepers = [1, 3, 4] endpoint.start() @@ -1836,14 +1836,14 @@ def test_pull_timeline_gc(neon_env_builder: NeonEnvBuilder): src_flush_lsn = src_sk.get_flush_lsn(tenant_id, timeline_id) log.info(f"flush_lsn on src before pull_timeline: {src_flush_lsn}") - dst_http = dst_sk.http_client() + src_http = src_sk.http_client() # run pull_timeline which will halt before downloading files - dst_http.configure_failpoints(("sk-pull-timeline-after-list-pausable", "pause")) + src_http.configure_failpoints(("sk-snapshot-after-list-pausable", "pause")) pt_handle = PropagatingThread( target=dst_sk.pull_timeline, args=([src_sk], tenant_id, timeline_id) ) pt_handle.start() - dst_sk.wait_until_paused("sk-pull-timeline-after-list-pausable") + src_sk.wait_until_paused("sk-snapshot-after-list-pausable") # ensure segment exists endpoint.safe_psql("insert into t select generate_series(1, 180000), 'papaya'") @@ -1854,7 +1854,7 @@ def test_pull_timeline_gc(neon_env_builder: NeonEnvBuilder): first_segment_p = src_sk.timeline_dir(tenant_id, timeline_id) / "000000010000000000000001" log.info(f"first segment exist={os.path.exists(first_segment_p)}") - dst_http.configure_failpoints(("sk-pull-timeline-after-list-pausable", "off")) + src_http.configure_failpoints(("sk-snapshot-after-list-pausable", "off")) pt_handle.join() timeline_start_lsn = src_sk.get_timeline_start_lsn(tenant_id, timeline_id) @@ -1883,7 +1883,6 @@ def test_pull_timeline_gc(neon_env_builder: NeonEnvBuilder): # enough, so it won't be affected by term change anymore. # # Expected to fail while term check is not implemented. 
-@pytest.mark.xfail def test_pull_timeline_term_change(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage()) @@ -1900,14 +1899,14 @@ def test_pull_timeline_term_change(neon_env_builder: NeonEnvBuilder): ep.safe_psql("create table t(key int, value text)") ep.safe_psql("insert into t select generate_series(1, 1000), 'pear'") - dst_http = dst_sk.http_client() + src_http = src_sk.http_client() # run pull_timeline which will halt before downloading files - dst_http.configure_failpoints(("sk-pull-timeline-after-list-pausable", "pause")) + src_http.configure_failpoints(("sk-snapshot-after-list-pausable", "pause")) pt_handle = PropagatingThread( target=dst_sk.pull_timeline, args=([src_sk], tenant_id, timeline_id) ) pt_handle.start() - dst_sk.wait_until_paused("sk-pull-timeline-after-list-pausable") + src_sk.wait_until_paused("sk-snapshot-after-list-pausable") src_http = src_sk.http_client() term_before = src_http.timeline_status(tenant_id, timeline_id).term @@ -1922,7 +1921,7 @@ def test_pull_timeline_term_change(neon_env_builder: NeonEnvBuilder): term_after = src_http.timeline_status(tenant_id, timeline_id).term assert term_after > term_before, f"term_after={term_after}, term_before={term_before}" - dst_http.configure_failpoints(("sk-pull-timeline-after-list-pausable", "off")) + src_http.configure_failpoints(("sk-snapshot-after-list-pausable", "off")) with pytest.raises(requests.exceptions.HTTPError): pt_handle.join() From 29a41fc7b913e76666702344b1a751bc4d1aab69 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 13 Jun 2024 16:10:05 +0300 Subject: [PATCH 0989/1571] Implement holding off WAL removal for pull_timeline. --- safekeeper/src/pull_timeline.rs | 18 +++++++++++++++--- safekeeper/src/timeline.rs | 5 +++++ safekeeper/src/timeline_manager.rs | 6 ++++-- test_runner/fixtures/neon_fixtures.py | 9 ++++++--- test_runner/regress/test_wal_acceptor.py | 6 ++++-- 5 files changed, 34 insertions(+), 10 deletions(-) diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index 4099a324f9..2c4cc836f7 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -15,6 +15,7 @@ use tokio::{ fs::{File, OpenOptions}, io::AsyncWrite, sync::mpsc, + task, }; use tokio_tar::{Archive, Builder}; use tokio_util::{ @@ -66,7 +67,11 @@ pub struct SnapshotContext { impl Drop for SnapshotContext { fn drop(&mut self) { - // todo: spawn task removing WAL gc hold off + let tli = self.tli.clone(); + task::spawn(async move { + let mut shared_state = tli.write_shared_state().await; + shared_state.wal_removal_on_hold = false; + }); } } @@ -150,7 +155,7 @@ impl FullAccessTimeline { &self, ar: &mut tokio_tar::Builder, ) -> Result { - let shared_state = self.read_shared_state().await; + let mut shared_state = self.write_shared_state().await; let cf_path = self.get_timeline_dir().join(CONTROL_FILE_NAME); let mut cf = File::open(cf_path).await?; @@ -183,7 +188,14 @@ impl FullAccessTimeline { ); } - // TODO: set WAL hold off. + // Prevent WAL removal while we're streaming data. + // + // Since this a flag, not a counter just bail out if already set; we + // shouldn't need concurrent snapshotting. 
+ if shared_state.wal_removal_on_hold { + bail!("wal_removal_on_hold is already true"); + } + shared_state.wal_removal_on_hold = true; let bctx = SnapshotContext { from_segno, diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index e510a05a32..544ffdbb36 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -168,6 +168,9 @@ pub struct SharedState { pub(crate) sk: SafeKeeper, /// In memory list containing state of peers sent in latest messages from them. pub(crate) peers_info: PeersInfo, + // True value hinders old WAL removal; this is used by snapshotting. We + // could make it a counter, but there is no need to. + pub(crate) wal_removal_on_hold: bool, } impl SharedState { @@ -205,6 +208,7 @@ impl SharedState { Ok(Self { sk, peers_info: PeersInfo(vec![]), + wal_removal_on_hold: false, }) } @@ -222,6 +226,7 @@ impl SharedState { Ok(Self { sk: SafeKeeper::new(control_store, wal_store, conf.my_id)?, peers_info: PeersInfo(vec![]), + wal_removal_on_hold: false, }) } diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index 087b988c69..592426bba3 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -39,6 +39,7 @@ pub struct StateSnapshot { // misc pub cfile_last_persist_at: Instant, pub inmem_flush_pending: bool, + pub wal_removal_on_hold: bool, pub peers: Vec, } @@ -54,6 +55,7 @@ impl StateSnapshot { cfile_backup_lsn: read_guard.sk.state.backup_lsn, cfile_last_persist_at: read_guard.sk.state.pers.last_persist_at(), inmem_flush_pending: Self::has_unflushed_inmem_state(&read_guard), + wal_removal_on_hold: read_guard.wal_removal_on_hold, peers: read_guard.get_peers(heartbeat_timeout), } } @@ -324,8 +326,8 @@ async fn update_wal_removal( last_removed_segno: u64, wal_removal_task: &mut Option>>, ) { - if wal_removal_task.is_some() { - // WAL removal is already in progress + if wal_removal_task.is_some() || state.wal_removal_on_hold { + // WAL removal is already in progress or hold off return; } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 394f5283f3..12fda5468f 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3897,11 +3897,13 @@ class Safekeeper(LogUtils): segments.sort() return segments - def checkpoint_up_to(self, tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn): + def checkpoint_up_to( + self, tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn, wait_wal_removal=True + ): """ Assuming pageserver(s) uploaded to s3 up to `lsn`, 1) wait for remote_consistent_lsn and wal_backup_lsn on safekeeper to reach it. - 2) checkpoint timeline on safekeeper, which should remove WAL before this LSN. + 2) checkpoint timeline on safekeeper, which should remove WAL before this LSN; optionally wait for that. 
""" cli = self.http_client() @@ -3925,7 +3927,8 @@ class Safekeeper(LogUtils): # pageserver to this safekeeper wait_until(30, 1, are_lsns_advanced) cli.checkpoint(tenant_id, timeline_id) - wait_until(30, 1, are_segments_removed) + if wait_wal_removal: + wait_until(30, 1, are_segments_removed) def wait_until_paused(self, failpoint: str): msg = f"at failpoint {failpoint}" diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 11aeb8f182..300a6b7115 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1816,7 +1816,6 @@ def test_pull_timeline(neon_env_builder: NeonEnvBuilder): # 4) Do some write, verify integrity with timeline_digest. # Expected to fail while holding off WAL gc plus fetching commit_lsn WAL # segment is not implemented. -@pytest.mark.xfail def test_pull_timeline_gc(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage()) @@ -1850,13 +1849,16 @@ def test_pull_timeline_gc(neon_env_builder: NeonEnvBuilder): lsn = last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) assert lsn > Lsn("0/2000000") # Checkpoint timeline beyond lsn. - src_sk.checkpoint_up_to(tenant_id, timeline_id, lsn) + src_sk.checkpoint_up_to(tenant_id, timeline_id, lsn, wait_wal_removal=False) first_segment_p = src_sk.timeline_dir(tenant_id, timeline_id) / "000000010000000000000001" log.info(f"first segment exist={os.path.exists(first_segment_p)}") src_http.configure_failpoints(("sk-snapshot-after-list-pausable", "off")) pt_handle.join() + # after pull_timeline is finished WAL should be removed on donor + src_sk.checkpoint_up_to(tenant_id, timeline_id, lsn, wait_wal_removal=True) + timeline_start_lsn = src_sk.get_timeline_start_lsn(tenant_id, timeline_id) dst_flush_lsn = dst_sk.get_flush_lsn(tenant_id, timeline_id) log.info(f"flush_lsn on dst after pull_timeline: {dst_flush_lsn}") From 4feb6ba29c11f7e5a945bdb7add681c5e4a04b9b Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 13 Jun 2024 22:47:58 +0300 Subject: [PATCH 0990/1571] Make pull_timeline work with auth enabled. - Make safekeeper read SAFEKEEPER_AUTH_TOKEN env variable with JWT token to connect to other safekeepers. - Set it in neon_local when auth is enabled. - Create simple rust http client supporting it, and use it in pull_timeline implementation. - Enable auth in all pull_timeline tests. - Make sk http_client() by default generate safekeeper wide token, it makes easier enabling auth in all tests by default. 
--- control_plane/src/safekeeper.rs | 15 +- safekeeper/src/bin/safekeeper.rs | 19 +++ safekeeper/src/http/client.rs | 139 ++++++++++++++++++ safekeeper/src/http/mod.rs | 1 + safekeeper/src/http/routes.rs | 3 +- safekeeper/src/lib.rs | 5 +- safekeeper/src/pull_timeline.rs | 60 ++++---- .../tests/walproposer_sim/safekeeper.rs | 1 + test_runner/fixtures/neon_fixtures.py | 18 ++- test_runner/regress/test_wal_acceptor.py | 25 +++- 10 files changed, 245 insertions(+), 41 deletions(-) create mode 100644 safekeeper/src/http/client.rs diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index d62a2e80b5..4a320ce53d 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -14,6 +14,7 @@ use camino::Utf8PathBuf; use postgres_connection::PgConnectionConfig; use reqwest::{IntoUrl, Method}; use thiserror::Error; +use utils::auth::{Claims, Scope}; use utils::{http::error::HttpErrorBody, id::NodeId}; use crate::{ @@ -197,7 +198,7 @@ impl SafekeeperNode { &datadir, &self.env.safekeeper_bin(), &args, - [], + self.safekeeper_env_variables()?, background_process::InitialPidFile::Expect(self.pid_file()), || async { match self.check_status().await { @@ -210,6 +211,18 @@ impl SafekeeperNode { .await } + fn safekeeper_env_variables(&self) -> anyhow::Result> { + // Generate a token to connect from safekeeper to peers + if self.conf.auth_enabled { + let token = self + .env + .generate_auth_token(&Claims::new(None, Scope::SafekeeperData))?; + Ok(vec![("SAFEKEEPER_AUTH_TOKEN".to_owned(), token)]) + } else { + Ok(Vec::new()) + } + } + /// /// Stop the server. /// diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 7476654426..86238c7292 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -13,7 +13,9 @@ use tokio::runtime::Handle; use tokio::signal::unix::{signal, SignalKind}; use tokio::task::JoinError; use toml_edit::Document; +use utils::logging::SecretString; +use std::env::{var, VarError}; use std::fs::{self, File}; use std::io::{ErrorKind, Write}; use std::str::FromStr; @@ -287,6 +289,22 @@ async fn main() -> anyhow::Result<()> { } }; + // Load JWT auth token to connect to other safekeepers for pull_timeline. + let sk_auth_token = match var("SAFEKEEPER_AUTH_TOKEN") { + Ok(v) => { + info!("loaded JWT token for authentication with safekeepers"); + Some(SecretString::from(v)) + } + Err(VarError::NotPresent) => { + info!("no JWT token for authentication with safekeepers detected"); + None + } + Err(_) => { + warn!("JWT token for authentication with safekeepers is not unicode"); + None + } + }; + let conf = SafeKeeperConf { workdir, my_id: id, @@ -307,6 +325,7 @@ async fn main() -> anyhow::Result<()> { pg_auth, pg_tenant_only_auth, http_auth, + sk_auth_token, current_thread_runtime: args.current_thread_runtime, walsenders_keep_horizon: args.walsenders_keep_horizon, partial_backup_enabled: args.partial_backup_enabled, diff --git a/safekeeper/src/http/client.rs b/safekeeper/src/http/client.rs new file mode 100644 index 0000000000..0bb31c200d --- /dev/null +++ b/safekeeper/src/http/client.rs @@ -0,0 +1,139 @@ +//! Safekeeper http client. +//! +//! Partially copied from pageserver client; some parts might be better to be +//! united. +//! +//! It would be also good to move it out to separate crate, but this needs +//! duplication of internal-but-reported structs like WalSenderState, ServerInfo +//! etc. 
+ +use reqwest::{IntoUrl, Method, StatusCode}; +use utils::{ + http::error::HttpErrorBody, + id::{TenantId, TimelineId}, + logging::SecretString, +}; + +use super::routes::TimelineStatus; + +#[derive(Debug, Clone)] +pub struct Client { + mgmt_api_endpoint: String, + authorization_header: Option, + client: reqwest::Client, +} + +#[derive(thiserror::Error, Debug)] +pub enum Error { + /// Failed to receive body (reqwest error). + #[error("receive body: {0}")] + ReceiveBody(reqwest::Error), + + /// Status is not ok, but failed to parse body as `HttpErrorBody`. + #[error("receive error body: {0}")] + ReceiveErrorBody(String), + + /// Status is not ok; parsed error in body as `HttpErrorBody`. + #[error("safekeeper API: {1}")] + ApiError(StatusCode, String), +} + +pub type Result = std::result::Result; + +pub trait ResponseErrorMessageExt: Sized { + fn error_from_body(self) -> impl std::future::Future> + Send; +} + +/// If status is not ok, try to extract error message from the body. +impl ResponseErrorMessageExt for reqwest::Response { + async fn error_from_body(self) -> Result { + let status = self.status(); + if !(status.is_client_error() || status.is_server_error()) { + return Ok(self); + } + + let url = self.url().to_owned(); + Err(match self.json::().await { + Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg), + Err(_) => { + Error::ReceiveErrorBody(format!("http error ({}) at {}.", status.as_u16(), url)) + } + }) + } +} + +impl Client { + pub fn new(mgmt_api_endpoint: String, jwt: Option) -> Self { + Self::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt) + } + + pub fn from_client( + client: reqwest::Client, + mgmt_api_endpoint: String, + jwt: Option, + ) -> Self { + Self { + mgmt_api_endpoint, + authorization_header: jwt + .map(|jwt| SecretString::from(format!("Bearer {}", jwt.get_contents()))), + client, + } + } + + pub async fn timeline_status( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}", + self.mgmt_api_endpoint, tenant_id, timeline_id + ); + let resp = self.get(&uri).await?; + resp.json().await.map_err(Error::ReceiveBody) + } + + pub async fn snapshot( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}/snapshot", + self.mgmt_api_endpoint, tenant_id, timeline_id + ); + self.get(&uri).await + } + + async fn get(&self, uri: U) -> Result { + self.request(Method::GET, uri, ()).await + } + + /// Send the request and check that the status code is good. + async fn request( + &self, + method: Method, + uri: U, + body: B, + ) -> Result { + let res = self.request_noerror(method, uri, body).await?; + let response = res.error_from_body().await?; + Ok(response) + } + + /// Just send the request. 
+ async fn request_noerror( + &self, + method: Method, + uri: U, + body: B, + ) -> Result { + let req = self.client.request(method, uri); + let req = if let Some(value) = &self.authorization_header { + req.header(reqwest::header::AUTHORIZATION, value.get_contents()) + } else { + req + }; + req.json(&body).send().await.map_err(Error::ReceiveBody) + } +} diff --git a/safekeeper/src/http/mod.rs b/safekeeper/src/http/mod.rs index 2a9570595f..52fb13ff5b 100644 --- a/safekeeper/src/http/mod.rs +++ b/safekeeper/src/http/mod.rs @@ -1,3 +1,4 @@ +pub mod client; pub mod routes; pub use routes::make_router; diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 40ac2c105d..3f2cd97ccd 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -195,8 +195,9 @@ async fn timeline_pull_handler(mut request: Request) -> Result>, pub pg_tenant_only_auth: Option>, pub http_auth: Option>, + /// JWT token to connect to other safekeepers with. + pub sk_auth_token: Option, pub current_thread_runtime: bool, pub walsenders_keep_horizon: bool, pub partial_backup_enabled: bool, @@ -114,6 +116,7 @@ impl SafeKeeperConf { pg_auth: None, pg_tenant_only_auth: None, http_auth: None, + sk_auth_token: None, heartbeat_timeout: Duration::new(5, 0), max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES, current_thread_runtime: false, diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index 2c4cc836f7..66c41f65ff 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -27,7 +27,10 @@ use tracing::{error, info, instrument}; use crate::{ control_file::{self, CONTROL_FILE_NAME}, debug_dump, - http::routes::TimelineStatus, + http::{ + client::{self, Client}, + routes::TimelineStatus, + }, safekeeper::Term, timeline::{get_tenant_dir, get_timeline_dir, FullAccessTimeline, Timeline, TimelineError}, wal_storage::{self, open_wal_file, Storage}, @@ -36,6 +39,7 @@ use crate::{ use utils::{ crashsafe::{durable_rename, fsync_async_opt}, id::{TenantId, TenantTimelineId, TimelineId}, + logging::SecretString, lsn::Lsn, pausable_failpoint, }; @@ -163,6 +167,11 @@ impl FullAccessTimeline { // We need to stream since the oldest segment someone (s3 or pageserver) // still needs. This duplicates calc_horizon_lsn logic. + // + // We know that WAL wasn't removed up to this point because it cannot be + // removed further than `backup_lsn`. Since we're holding shared_state + // lock and setting `wal_removal_on_hold` later, it guarantees that WAL + // won't be removed until we're done. let from_lsn = min( shared_state.sk.state.remote_consistent_lsn, shared_state.sk.state.backup_lsn, @@ -255,7 +264,10 @@ pub struct DebugDumpResponse { } /// Find the most advanced safekeeper and pull timeline from it. 
-pub async fn handle_request(request: Request) -> Result { +pub async fn handle_request( + request: Request, + sk_auth_token: Option, +) -> Result { let existing_tli = GlobalTimelines::get(TenantTimelineId::new( request.tenant_id, request.timeline_id, @@ -264,26 +276,22 @@ pub async fn handle_request(request: Request) -> Result { bail!("Timeline {} already exists", request.timeline_id); } - let client = reqwest::Client::new(); let http_hosts = request.http_hosts.clone(); - // Send request to /v1/tenant/:tenant_id/timeline/:timeline_id - let responses = futures::future::join_all(http_hosts.iter().map(|url| { - let url = format!( - "{}/v1/tenant/{}/timeline/{}", - url, request.tenant_id, request.timeline_id - ); - client.get(url).send() - })) - .await; + // Figure out statuses of potential donors. + let responses: Vec> = + futures::future::join_all(http_hosts.iter().map(|url| async { + let cclient = Client::new(url.clone(), sk_auth_token.clone()); + let info = cclient + .timeline_status(request.tenant_id, request.timeline_id) + .await?; + Ok(info) + })) + .await; let mut statuses = Vec::new(); for (i, response) in responses.into_iter().enumerate() { - let response = response.context(format!("fetching status from {}", http_hosts[i]))?; - response - .error_for_status_ref() - .context(format!("checking status from {}", http_hosts[i]))?; - let status: crate::http::routes::TimelineStatus = response.json().await?; + let status = response.context(format!("fetching status from {}", http_hosts[i]))?; statuses.push((status, i)); } @@ -303,10 +311,14 @@ pub async fn handle_request(request: Request) -> Result { assert!(status.tenant_id == request.tenant_id); assert!(status.timeline_id == request.timeline_id); - pull_timeline(status, safekeeper_host).await + pull_timeline(status, safekeeper_host, sk_auth_token).await } -async fn pull_timeline(status: TimelineStatus, host: String) -> Result { +async fn pull_timeline( + status: TimelineStatus, + host: String, + sk_auth_token: Option, +) -> Result { let ttid = TenantTimelineId::new(status.tenant_id, status.timeline_id); info!( "pulling timeline {} from safekeeper {}, commit_lsn={}, flush_lsn={}, term={}, epoch={}", @@ -322,17 +334,11 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?; - let client = reqwest::Client::new(); - + let client = Client::new(host.clone(), sk_auth_token.clone()); // Request stream with basebackup archive. let bb_resp = client - .get(format!( - "{}/v1/tenant/{}/timeline/{}/snapshot", - host, status.tenant_id, status.timeline_id - )) - .send() + .snapshot(status.tenant_id, status.timeline_id) .await?; - bb_resp.error_for_status_ref()?; // Make Stream of Bytes from it... 
let bb_stream = bb_resp.bytes_stream().map_err(std::io::Error::other); diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index 27e2a4453b..47539872a6 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -174,6 +174,7 @@ pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { pg_auth: None, pg_tenant_only_auth: None, http_auth: None, + sk_auth_token: None, current_thread_runtime: false, walsenders_keep_horizon: false, partial_backup_enabled: false, diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 12fda5468f..aa55b6e4cb 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3847,7 +3847,15 @@ class Safekeeper(LogUtils): assert isinstance(res, dict) return res - def http_client(self, auth_token: Optional[str] = None) -> SafekeeperHttpClient: + def http_client( + self, auth_token: Optional[str] = None, gen_sk_wide_token: bool = True + ) -> SafekeeperHttpClient: + """ + When auth_token is None but gen_sk_wide is True creates safekeeper wide + token, which is a reasonable default. + """ + if auth_token is None and gen_sk_wide_token: + auth_token = self.env.auth_keys.generate_safekeeper_token() is_testing_enabled = '"testing"' in self.env.get_binary_version("safekeeper") return SafekeeperHttpClient( port=self.port.http, auth_token=auth_token, is_testing_enabled=is_testing_enabled @@ -4450,6 +4458,7 @@ def wait_for_last_flush_lsn( tenant: TenantId, timeline: TimelineId, pageserver_id: Optional[int] = None, + auth_token: Optional[str] = None, ) -> Lsn: """Wait for pageserver to catch up the latest flush LSN, returns the last observed lsn.""" @@ -4463,7 +4472,7 @@ def wait_for_last_flush_lsn( f"wait_for_last_flush_lsn: waiting for {last_flush_lsn} on shard {tenant_shard_id} on pageserver {pageserver.id})" ) waited = wait_for_last_record_lsn( - pageserver.http_client(), tenant_shard_id, timeline, last_flush_lsn + pageserver.http_client(auth_token=auth_token), tenant_shard_id, timeline, last_flush_lsn ) assert waited >= last_flush_lsn @@ -4559,6 +4568,7 @@ def last_flush_lsn_upload( tenant_id: TenantId, timeline_id: TimelineId, pageserver_id: Optional[int] = None, + auth_token: Optional[str] = None, ) -> Lsn: """ Wait for pageserver to catch to the latest flush LSN of given endpoint, @@ -4566,11 +4576,11 @@ def last_flush_lsn_upload( reaching flush LSN). 
""" last_flush_lsn = wait_for_last_flush_lsn( - env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver_id + env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver_id, auth_token=auth_token ) shards = tenant_get_shards(env, tenant_id, pageserver_id) for tenant_shard_id, pageserver in shards: - ps_http = pageserver.http_client() + ps_http = pageserver.http_client(auth_token=auth_token) wait_for_last_record_lsn(ps_http, tenant_shard_id, timeline_id, last_flush_lsn) # force a checkpoint to trigger upload ps_http.timeline_checkpoint(tenant_shard_id, timeline_id) diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 300a6b7115..7bf208db54 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -374,7 +374,7 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): http_cli_other = env.safekeepers[0].http_client( auth_token=env.auth_keys.generate_tenant_token(TenantId.generate()) ) - http_cli_noauth = env.safekeepers[0].http_client() + http_cli_noauth = env.safekeepers[0].http_client(gen_sk_wide_token=False) # Pretend WAL is offloaded to s3. if auth_enabled: @@ -830,7 +830,7 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): auth_token=env.auth_keys.generate_tenant_token(TenantId.generate()) ) wa_http_cli_bad.check_status() - wa_http_cli_noauth = wa.http_client() + wa_http_cli_noauth = wa.http_client(gen_sk_wide_token=False) wa_http_cli_noauth.check_status() # debug endpoint requires safekeeper scope @@ -964,7 +964,7 @@ def test_sk_auth(neon_env_builder: NeonEnvBuilder): # By default, neon_local enables auth on all services if auth is configured, # so http must require the token. - sk_http_cli_noauth = sk.http_client() + sk_http_cli_noauth = sk.http_client(gen_sk_wide_token=False) sk_http_cli_auth = sk.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id)) with pytest.raises(sk_http_cli_noauth.HTTPError, match="Forbidden|Unauthorized"): sk_http_cli_noauth.timeline_status(tenant_id, timeline_id) @@ -1640,7 +1640,7 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): sk_http_other = sk.http_client( auth_token=env.auth_keys.generate_tenant_token(tenant_id_other) ) - sk_http_noauth = sk.http_client() + sk_http_noauth = sk.http_client(gen_sk_wide_token=False) assert (sk_data_dir / str(tenant_id) / str(timeline_id_1)).is_dir() assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir() assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() @@ -1723,7 +1723,10 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): cur.execute("INSERT INTO t (key) VALUES (123)") +# Basic pull_timeline test. 
def test_pull_timeline(neon_env_builder: NeonEnvBuilder): + neon_env_builder.auth_enabled = True + def execute_payload(endpoint: Endpoint): with closing(endpoint.connect()) as conn: with conn.cursor() as cur: @@ -1739,7 +1742,7 @@ def test_pull_timeline(neon_env_builder: NeonEnvBuilder): def show_statuses(safekeepers: List[Safekeeper], tenant_id: TenantId, timeline_id: TimelineId): for sk in safekeepers: - http_cli = sk.http_client() + http_cli = sk.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id)) try: status = http_cli.timeline_status(tenant_id, timeline_id) log.info(f"Safekeeper {sk.id} status: {status}") @@ -1769,7 +1772,7 @@ def test_pull_timeline(neon_env_builder: NeonEnvBuilder): res = ( env.safekeepers[3] - .http_client() + .http_client(auth_token=env.auth_keys.generate_safekeeper_token()) .pull_timeline( { "tenant_id": str(tenant_id), @@ -1817,6 +1820,7 @@ def test_pull_timeline(neon_env_builder: NeonEnvBuilder): # Expected to fail while holding off WAL gc plus fetching commit_lsn WAL # segment is not implemented. def test_pull_timeline_gc(neon_env_builder: NeonEnvBuilder): + neon_env_builder.auth_enabled = True neon_env_builder.num_safekeepers = 3 neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage()) env = neon_env_builder.init_start() @@ -1846,7 +1850,13 @@ def test_pull_timeline_gc(neon_env_builder: NeonEnvBuilder): # ensure segment exists endpoint.safe_psql("insert into t select generate_series(1, 180000), 'papaya'") - lsn = last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) + lsn = last_flush_lsn_upload( + env, + endpoint, + tenant_id, + timeline_id, + auth_token=env.auth_keys.generate_tenant_token(tenant_id), + ) assert lsn > Lsn("0/2000000") # Checkpoint timeline beyond lsn. src_sk.checkpoint_up_to(tenant_id, timeline_id, lsn, wait_wal_removal=False) @@ -1886,6 +1896,7 @@ def test_pull_timeline_gc(neon_env_builder: NeonEnvBuilder): # # Expected to fail while term check is not implemented. def test_pull_timeline_term_change(neon_env_builder: NeonEnvBuilder): + neon_env_builder.auth_enabled = True neon_env_builder.num_safekeepers = 3 neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage()) env = neon_env_builder.init_start() From 68a2298973603df5b21f230d813f59b73ae2ebb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 18 Jun 2024 16:03:23 +0200 Subject: [PATCH 0991/1571] Add support to specifying storage account in AzureConfig (#8090) We want to be able to specify the storage account via the toml configuration, so that we can connect to multiple storage accounts in the same process. https://neondb.slack.com/archives/C06SJG60FRB/p1718702144270139 --- docs/pageserver-services.md | 3 ++- libs/remote_storage/src/azure_blob.rs | 5 ++++- libs/remote_storage/src/lib.rs | 17 +++++++++++++++-- libs/remote_storage/tests/test_real_azure.rs | 1 + 4 files changed, 22 insertions(+), 4 deletions(-) diff --git a/docs/pageserver-services.md b/docs/pageserver-services.md index ba5d3c423e..11d984eb08 100644 --- a/docs/pageserver-services.md +++ b/docs/pageserver-services.md @@ -101,11 +101,12 @@ or ```toml [remote_storage] container_name = 'some-container-name' +storage_account = 'somestorageaccnt' container_region = 'us-east' prefix_in_container = '/test-prefix/' ``` -`AZURE_STORAGE_ACCOUNT` and `AZURE_STORAGE_ACCESS_KEY` env variables can be used to specify the azure credentials if needed. +The `AZURE_STORAGE_ACCESS_KEY` env variable can be used to specify the azure credentials if needed. 
## Repository background tasks diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 2aa05a9d30..dbd64fb5a6 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -54,7 +54,10 @@ impl AzureBlobStorage { azure_config.container_name ); - let account = env::var("AZURE_STORAGE_ACCOUNT").expect("missing AZURE_STORAGE_ACCOUNT"); + // Use the storage account from the config by default, fall back to env var if not present. + let account = azure_config.storage_account.clone().unwrap_or_else(|| { + env::var("AZURE_STORAGE_ACCOUNT").expect("missing AZURE_STORAGE_ACCOUNT") + }); // If the `AZURE_STORAGE_ACCESS_KEY` env var has an access key, use that, // otherwise try the token based credentials. diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 8c984abed2..72748e156c 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -466,7 +466,11 @@ impl GenericRemoteStorage { Self::AwsS3(Arc::new(S3Bucket::new(s3_config, timeout)?)) } RemoteStorageKind::AzureContainer(azure_config) => { - info!("Using azure container '{}' in region '{}' as a remote storage, prefix in container: '{:?}'", + let storage_account = azure_config + .storage_account + .as_deref() + .unwrap_or(""); + info!("Using azure container '{}' in account '{storage_account}' in region '{}' as a remote storage, prefix in container: '{:?}'", azure_config.container_name, azure_config.container_region, azure_config.prefix_in_container); Self::AzureBlob(Arc::new(AzureBlobStorage::new(azure_config, timeout)?)) } @@ -589,6 +593,8 @@ impl Debug for S3Config { pub struct AzureConfig { /// Name of the container to connect to. pub container_name: String, + /// Name of the storage account the container is inside of + pub storage_account: Option, /// The region where the bucket is located at. pub container_region: String, /// A "subfolder" in the container, to use the same container separately by multiple remote storage users at once. 
@@ -603,8 +609,9 @@ impl Debug for AzureConfig { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("AzureConfig") .field("bucket_name", &self.container_name) + .field("storage_account", &self.storage_account) .field("bucket_region", &self.container_region) - .field("prefix_in_bucket", &self.prefix_in_container) + .field("prefix_in_container", &self.prefix_in_container) .field("concurrency_limit", &self.concurrency_limit) .field( "max_keys_per_list_response", @@ -718,6 +725,12 @@ impl RemoteStorageConfig { (None, None, None, Some(container_name), Some(container_region)) => { RemoteStorageKind::AzureContainer(AzureConfig { container_name: parse_toml_string("container_name", container_name)?, + storage_account: toml + .get("storage_account") + .map(|storage_account| { + parse_toml_string("storage_account", storage_account) + }) + .transpose()?, container_region: parse_toml_string("container_region", container_region)?, prefix_in_container: toml .get("prefix_in_container") diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs index cd0b2be4b5..23628dfebe 100644 --- a/libs/remote_storage/tests/test_real_azure.rs +++ b/libs/remote_storage/tests/test_real_azure.rs @@ -212,6 +212,7 @@ fn create_azure_client( let remote_storage_config = RemoteStorageConfig { storage: RemoteStorageKind::AzureContainer(AzureConfig { container_name: remote_storage_azure_container, + storage_account: None, container_region: remote_storage_azure_region, prefix_in_container: Some(format!("test_{millis}_{random:08x}/")), concurrency_limit: NonZeroUsize::new(100).unwrap(), From cf60e4c0c5bdd7b719b999b3b6603b3d8ec6c04d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 18 Jun 2024 16:40:27 +0100 Subject: [PATCH 0992/1571] build(deps): bump ws from 8.16.0 to 8.17.1 in /test_runner/pg_clients/typescript/serverless-driver (#8087) --- .../typescript/serverless-driver/package-lock.json | 8 ++++---- .../pg_clients/typescript/serverless-driver/package.json | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/test_runner/pg_clients/typescript/serverless-driver/package-lock.json b/test_runner/pg_clients/typescript/serverless-driver/package-lock.json index 5a3ad3c238..f3b456f1ed 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/package-lock.json +++ b/test_runner/pg_clients/typescript/serverless-driver/package-lock.json @@ -6,7 +6,7 @@ "": { "dependencies": { "@neondatabase/serverless": "0.9.0", - "ws": "8.16.0" + "ws": "8.17.1" } }, "node_modules/@neondatabase/serverless": { @@ -96,9 +96,9 @@ } }, "node_modules/ws": { - "version": "8.16.0", - "resolved": "https://registry.npmjs.org/ws/-/ws-8.16.0.tgz", - "integrity": "sha512-HS0c//TP7Ina87TfiPUz1rQzMhHrl/SG2guqRcTOIUYD2q8uhUdNHZYJUaQ8aTGPzCh+c6oawMKW35nFl1dxyQ==", + "version": "8.17.1", + "resolved": "https://registry.npmjs.org/ws/-/ws-8.17.1.tgz", + "integrity": "sha512-6XQFvXTkbfUOZOKKILFG1PDK2NDQs4azKQl26T0YS5CxqWLgXajbPZ+h4gZekJyRqFU8pvnbAbbs/3TgRPy+GQ==", "engines": { "node": ">=10.0.0" }, diff --git a/test_runner/pg_clients/typescript/serverless-driver/package.json b/test_runner/pg_clients/typescript/serverless-driver/package.json index 9d9da0f42c..3ae7a8a6cf 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/package.json +++ b/test_runner/pg_clients/typescript/serverless-driver/package.json @@ -2,6 +2,6 @@ "type": "module", "dependencies": { "@neondatabase/serverless": "0.9.0", - "ws": 
"8.16.0" + "ws": "8.17.1" } } From 8a9fa0a4e41937c6b20474d89534699806c7c706 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 18 Jun 2024 16:40:46 +0100 Subject: [PATCH 0993/1571] build(deps): bump urllib3 from 1.26.18 to 1.26.19 (#8086) --- poetry.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index 25c0c7398d..7740388fb8 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2806,13 +2806,13 @@ files = [ [[package]] name = "urllib3" -version = "1.26.18" +version = "1.26.19" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" files = [ - {file = "urllib3-1.26.18-py2.py3-none-any.whl", hash = "sha256:34b97092d7e0a3a8cf7cd10e386f401b3737364026c45e622aa02903dffe0f07"}, - {file = "urllib3-1.26.18.tar.gz", hash = "sha256:f8ecc1bba5667413457c529ab955bf8c67b45db799d159066261719e328580a0"}, + {file = "urllib3-1.26.19-py2.py3-none-any.whl", hash = "sha256:37a0344459b199fce0e80b0d3569837ec6b6937435c5244e7fd73fa6006830f3"}, + {file = "urllib3-1.26.19.tar.gz", hash = "sha256:3e3d753a8618b86d7de333b4223005f68720bcd6a7d2bcb9fbd2229ec7c1e429"}, ] [package.extras] From 8ee67241678150f9bb8413161481376150a3a849 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 18 Jun 2024 13:30:53 +0300 Subject: [PATCH 0994/1571] Update overview section to reflect current code organization --- docs/core_changes.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/core_changes.md b/docs/core_changes.md index ea219adae9..f86d5133e8 100644 --- a/docs/core_changes.md +++ b/docs/core_changes.md @@ -11,9 +11,10 @@ page server. We currently use the same binary for both, with --wal-redo runtime the WAL redo mode. Some PostgreSQL changes are needed in the compute node, while others are just for the WAL redo process. -In addition to core PostgreSQL changes, there is a Neon extension in contrib/neon, to hook into the -smgr interface. Once all the core changes have been submitted to upstream or eliminated some other -way, the extension could live outside the postgres repository and build against vanilla PostgreSQL. +In addition to core PostgreSQL changes, there is a Neon extension in the pgxn/neon directory that +hooks into the smgr interface, and rmgr extension in pgxn/neon_rmgr. The extensions are loaded into +the Postgres processes with shared_preload_libraries. Most of the Neon-specific code is in the +extensions, and for any new features, that is preferred over modifying core PostgreSQL code. Below is a list of all the PostgreSQL source code changes, categorized into changes needed for compute, and changes needed for the WAL redo process: From 0396ed67f710739c3b3bd8f1dafbcd517432a467 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 18 Jun 2024 13:30:56 +0300 Subject: [PATCH 0995/1571] Update comments on various items To update things that have changed since this was written, and to reflect discussions at offsite meeting. 
--- docs/core_changes.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/docs/core_changes.md b/docs/core_changes.md index f86d5133e8..9dd4ea806b 100644 --- a/docs/core_changes.md +++ b/docs/core_changes.md @@ -38,6 +38,7 @@ The problem is that the XLOG_HEAP_INSERT record does not include the command id Bite the bullet and submit the patch to PostgreSQL, to add the t_cid to the WAL records. It makes the WAL records larger, which could make this unpopular in the PostgreSQL community. However, it might simplify some logical decoding code; Andres Freund briefly mentioned in PGCon 2022 discussion on Heikki's Neon presentation that logical decoding currently needs to jump through some hoops to reconstruct the same information. +Update from Heikki (2024-04-17): I tried to write an upstream patch for that, to use the t_cid field for logical decoding, but it was not as straightforward as it first sounded. ### Alternatives Perhaps we could write an extra WAL record with the t_cid information, when a page is evicted that contains rows that were touched a transaction that's still running. However, that seems very complicated. @@ -96,6 +97,8 @@ Maybe some bigger rewrite of FSM and VM would help to avoid WAL-logging FSM and also some changes in src/backend/storage/smgr/smgr.c ``` +pgvector 0.6.0 also needs a similar change, which would be very nice to get rid of too. + When a GIN index is built, for example, it is built by inserting the entries into the index more or less normally, but without WAL-logging anything. After the index has been built, we iterate through all pages and write them to the WAL. That doesn't work for Neon, because if a page is not WAL-logged @@ -110,6 +113,10 @@ an operation: `smgr_start_unlogged_build`, `smgr_finish_unlogged_build_phase_1` I think it would make sense to be more explicit about that in PostgreSQL too. So extract these changes to a patch and post to pgsql-hackers. +Perhaps we could deduce that an unlogged index build has started when we see a page being evicted +with zero LSN. How to be sure it's an unlogged index build rather than a bug? Currently we have a +check for that and PANIC if we see page with zero LSN being evicted. And how do we detect when the +index build has finished? See https://github.com/neondatabase/neon/pull/7440 for an attempt at that. ## Track last-written page LSN @@ -322,6 +329,8 @@ and finally WAL-log that the extension succeeded. Submit to upstream. This could be useful for the Disk Encryption patches too, or for compression. +We have submitted this to upstream, but it's moving at glacial a speed. +https://commitfest.postgresql.org/47/4428/ ## Added relpersistence argument to smgropen() From 33a09946fc5d0b5dec92ab84ce25803879e99358 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 18 Jun 2024 13:30:58 +0300 Subject: [PATCH 0996/1571] Prefetching has been implemented --- docs/core_changes.md | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/docs/core_changes.md b/docs/core_changes.md index 9dd4ea806b..57d1fdfe59 100644 --- a/docs/core_changes.md +++ b/docs/core_changes.md @@ -21,6 +21,18 @@ compute, and changes needed for the WAL redo process: # Changes for Compute node +## Prefetching + +There are changes in many places to perform prefetching, for example for sequential scans. Neon +doesn't benefit from OS readahead, and the latency to pageservers is quite high compared to local +disk, so prefetching is critical for performance, also for sequential scans. 
+ +### How to get rid of the patch + +Upcoming "streaming read" work in v17 might simplify this. And async I/O work in v18 will hopefully +do more. + + ## Add t_cid to heap WAL records ``` @@ -482,19 +494,6 @@ hint bits are set. Wal logging hint bits updates requires FPI which significantl Add special WAL record for setting page hints. -## Prefetching - -### Why? - -As far as pages in Neon are loaded on demand, to reduce node startup time -and also speedup some massive queries we need some mechanism for bulk loading to -reduce page request round-trip overhead. - -Currently Postgres is supporting prefetching only for bitmap scan. -In Neon we should also use prefetch for sequential and index scans, because the OS is not doing it for us. -For sequential scan we could prefetch some number of following pages. For index scan we could prefetch pages -of heap relation addressed by TIDs. - ## Prewarming ### Why? From b774ab54d428a82bfad1772917e237dfd3939eb3 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 18 Jun 2024 13:31:01 +0300 Subject: [PATCH 0997/1571] Remove obsolete ones - Relation size cache was moved to extension - the changes in visibilitymap.c and freespace.c became unnecessary with v16, thanks to changes in upstream code - WALProposer was moved to extension - The hack in ReadBuffer_common to not throw an error on unexpected data beyond EOF was removed in v16 rebase. We haven't seen such errors, so I guess that was some early issue that was fixed long time ago. - The ginfast.c diff was made unnecessary by upstream commit 56b662523f --- docs/core_changes.md | 155 ------------------------------------------- 1 file changed, 155 deletions(-) diff --git a/docs/core_changes.md b/docs/core_changes.md index 57d1fdfe59..60b54825c4 100644 --- a/docs/core_changes.md +++ b/docs/core_changes.md @@ -55,50 +55,6 @@ Update from Heikki (2024-04-17): I tried to write an upstream patch for that, to ### Alternatives Perhaps we could write an extra WAL record with the t_cid information, when a page is evicted that contains rows that were touched a transaction that's still running. However, that seems very complicated. -## ginfast.c - -``` -diff --git a/src/backend/access/gin/ginfast.c b/src/backend/access/gin/ginfast.c -index e0d9940946..2d964c02e9 100644 ---- a/src/backend/access/gin/ginfast.c -+++ b/src/backend/access/gin/ginfast.c -@@ -285,6 +285,17 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) - memset(&sublist, 0, sizeof(GinMetaPageData)); - makeSublist(index, collector->tuples, collector->ntuples, &sublist); - -+ if (metadata->head != InvalidBlockNumber) -+ { -+ /* -+ * ZENITH: Get buffer before XLogBeginInsert() to avoid recursive call -+ * of XLogBeginInsert(). Reading a new buffer might evict a dirty page from -+ * the buffer cache, and if that page happens to be an FSM or VM page, zenith_write() -+ * will try to WAL-log an image of the page. -+ */ -+ buffer = ReadBuffer(index, metadata->tail); -+ } -+ - if (needWal) - XLogBeginInsert(); - -@@ -316,7 +327,6 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) - data.prevTail = metadata->tail; - data.newRightlink = sublist.head; - -- buffer = ReadBuffer(index, metadata->tail); - LockBuffer(buffer, GIN_EXCLUSIVE); - page = BufferGetPage(buffer); -``` - -The problem is explained in the comment above - -### How to get rid of the patch - -Can we stop WAL-logging FSM or VM pages? Or delay the WAL logging until we're out of the critical -section or something. 
- -Maybe some bigger rewrite of FSM and VM would help to avoid WAL-logging FSM and VM page images? - - ## Mark index builds that use buffer manager without logging explicitly ``` @@ -160,57 +116,6 @@ The old method is still available, though. Wait until v15? -## Cache relation sizes - -The Neon extension contains a little cache for smgrnblocks() and smgrexists() calls, to avoid going -to the page server every time. It might be useful to cache those in PostgreSQL, maybe in the -relcache? (I think we do cache nblocks in relcache already, check why that's not good enough for -Neon) - - -## Use buffer manager when extending VM or FSM - -``` - src/backend/storage/freespace/freespace.c | 14 +- - src/backend/access/heap/visibilitymap.c | 15 +- - -diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c -index e198df65d8..addfe93eac 100644 ---- a/src/backend/access/heap/visibilitymap.c -+++ b/src/backend/access/heap/visibilitymap.c -@@ -652,10 +652,19 @@ vm_extend(Relation rel, BlockNumber vm_nblocks) - /* Now extend the file */ - while (vm_nblocks_now < vm_nblocks) - { -- PageSetChecksumInplace((Page) pg.data, vm_nblocks_now); -+ /* -+ * ZENITH: Initialize VM pages through buffer cache to prevent loading -+ * them from pageserver. -+ */ -+ Buffer buffer = ReadBufferExtended(rel, VISIBILITYMAP_FORKNUM, P_NEW, -+ RBM_ZERO_AND_LOCK, NULL); -+ Page page = BufferGetPage(buffer); -+ -+ PageInit((Page) page, BLCKSZ, 0); -+ PageSetChecksumInplace(page, vm_nblocks_now); -+ MarkBufferDirty(buffer); -+ UnlockReleaseBuffer(buffer); - -- smgrextend(rel->rd_smgr, VISIBILITYMAP_FORKNUM, vm_nblocks_now, -- pg.data, false); - vm_nblocks_now++; - } -``` - -### Problem we're trying to solve - -??? - -### How to get rid of the patch - -Maybe this would be a reasonable change in PostgreSQL too? - - ## Allow startup without reading checkpoint record In Neon, the compute node is stateless. So when we are launching compute node, we need to provide @@ -270,66 +175,6 @@ would be weird if the sequence moved backwards though, think of PITR. Or add a GUC for the amount to prefix to PostgreSQL, and force it to 1 in Neon. -## Walproposer - -``` - src/Makefile | 1 + - src/backend/replication/libpqwalproposer/Makefile | 37 + - src/backend/replication/libpqwalproposer/libpqwalproposer.c | 416 ++++++++++++ - src/backend/postmaster/bgworker.c | 4 + - src/backend/postmaster/postmaster.c | 6 + - src/backend/replication/Makefile | 4 +- - src/backend/replication/walproposer.c | 2350 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - src/backend/replication/walproposer_utils.c | 402 +++++++++++ - src/backend/replication/walreceiver.c | 7 + - src/backend/replication/walsender.c | 320 ++++++--- - src/backend/storage/ipc/ipci.c | 6 + - src/include/replication/walproposer.h | 565 ++++++++++++++++ -``` - -WAL proposer is communicating with safekeeper and ensures WAL durability by quorum writes. It is -currently implemented as patch to standard WAL sender. - -### How to get rid of the patch - -Refactor into an extension. Submit hooks or APIs into upstream if necessary. - -@MMeent did some work on this already: https://github.com/neondatabase/postgres/pull/96 - -## Ignore unexpected data beyond EOF in bufmgr.c - -``` -@@ -922,11 +928,14 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, - */ - bufBlock = isLocalBuf ? 
LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr); - if (!PageIsNew((Page) bufBlock)) -- ereport(ERROR, -+ { -+ // XXX-ZENITH -+ MemSet((char *) bufBlock, 0, BLCKSZ); -+ ereport(DEBUG1, - (errmsg("unexpected data beyond EOF in block %u of relation %s", - blockNum, relpath(smgr->smgr_rnode, forkNum)), - errhint("This has been seen to occur with buggy kernels; consider updating your system."))); -- -+ } - /* - * We *must* do smgrextend before succeeding, else the page will not - * be reserved by the kernel, and the next P_NEW call will decide to -``` - -PostgreSQL is a bit sloppy with extending relations. Usually, the relation is extended with zeros -first, then the page is filled, and finally the new page WAL-logged. But if multiple backends extend -a relation at the same time, the pages can be WAL-logged in different order. - -I'm not sure what scenario exactly required this change in Neon, though. - -### How to get rid of the patch - -Submit patches to pgsql-hackers, to tighten up the WAL-logging around relation extension. It's a bit -confusing even in PostgreSQL. Maybe WAL log the intention to extend first, then extend the relation, -and finally WAL-log that the extension succeeded. - ## Make smgr interface available to extensions ``` From 1c1b4b0c040b07a7bad2d873ad4189d84096c9e5 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 18 Jun 2024 13:31:03 +0300 Subject: [PATCH 0998/1571] Add a bunch of items for new changes that we've made --- docs/core_changes.md | 142 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 142 insertions(+) diff --git a/docs/core_changes.md b/docs/core_changes.md index 60b54825c4..6cc19d8b62 100644 --- a/docs/core_changes.md +++ b/docs/core_changes.md @@ -311,6 +311,148 @@ Ignore it. This is only needed for disaster recovery, so once we've eliminated a patches, we can just keep it around as a patch or as separate branch in a repo. +## pg_waldump flags to ignore errors + +After creating a new project or branch in Neon, the first timeline can begin in the middle of a WAL segment. pg_waldump chokes on that, so we added some flags to make it possible to ignore errors. + +### How to get rid of the patch + +Like previous one, ignore it. + + + +## Backpressure if pageserver doesn't ingest WAL fast enough + +``` +@@ -3200,6 +3202,7 @@ ProcessInterrupts(void) + return; + InterruptPending = false; + ++retry: + if (ProcDiePending) + { + ProcDiePending = false; +@@ -3447,6 +3450,13 @@ ProcessInterrupts(void) + + if (ParallelApplyMessagePending) + HandleParallelApplyMessages(); ++ ++ /* Call registered callback if any */ ++ if (ProcessInterruptsCallback) ++ { ++ if (ProcessInterruptsCallback()) ++ goto retry; ++ } + } +``` + + +### How to get rid of the patch + +Submit a patch to upstream, for a hook in ProcessInterrupts. Could be useful for other extensions +too. + + +## SLRU on-demand download + +``` + src/backend/access/transam/slru.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------- + 1 file changed, 92 insertions(+), 13 deletions(-) +``` + +### Problem we're trying to solve + +Previously, SLRU files were included in the basebackup, but the total size of them can be large, +several GB, and downloading them all made the startup time too long. + +### Alternatives + +FUSE hook or LD_PRELOAD trick to intercept the reads on SLRU files + + +## WAL-log an all-zeros page as one large hole + +- In XLogRecordAssemble() + +### Problem we're trying to solve + +This change was made in v16. 
Starting with v16, when PostgreSQL extends a relation, it first extends +it with zeros, and it can extend the relation more than one block at a time. The all-zeros page is WAL-ogged, but it's very wasteful to include 8 kB of zeros in the WAL for that. This hack was made so that we WAL logged a compact record with a whole-page "hole". However, PostgreSQL has assertions that prevent that such WAL records from being replayed, so this breaks compatibility such that unmodified PostreSQL cannot process Neon-generated WAL. + +### How to get rid of the patch + +Find another compact representation for a full-page image of an all-zeros page. A compressed image perhaps. + + +## Shut down walproposer after checkpointer + +``` ++ /* Neon: Also allow walproposer background worker to be treated like a WAL sender, so that it's shut down last */ ++ if ((bp->bkend_type == BACKEND_TYPE_NORMAL || bp->bkend_type == BACKEND_TYPE_BGWORKER) && +``` + +This changes was needed so that postmaster shuts down the walproposer process only after the shutdown checkpoint record is written. Otherwise, the shutdown record will never make it to the safekeepers. + +### How to get rid of the patch + +Do a bigger refactoring of the postmaster state machine, such that a background worker can specify +the shutdown ordering by itself. The postmaster state machine has grown pretty complicated, and +would benefit from a refactoring for the sake of readability anyway. + + +## EXPLAIN changes for prefetch and LFC + +### How to get rid of the patch + +Konstantin submitted a patch to -hackers already: https://commitfest.postgresql.org/47/4643/. Get that into a committable state. + + +## On-demand download of extensions + +### How to get rid of the patch + +FUSE or LD_PRELOAD trickery to intercept reads? + + +## Publication superuser checks + +We have hacked CreatePublication so that also neon_superuser can create them. + +### How to get rid of the patch + +Create an upstream patch with more fine-grained privileges for publications CREATE/DROP that can be GRANTed to users. + + +## WAL log replication slots + +### How to get rid of the patch + +Utilize the upcoming v17 "slot sync worker", or a similar neon-specific background worker process, to periodically WAL-log the slots, or to export them somewhere else. + + +## WAL-log replication snapshots + +### How to get rid of the patch + +WAL-log them periodically, from a backgound worker. + + +## WAL-log relmapper files + +Similarly to replications snapshot files, the CID mapping files generated during VACUUM FULL of a catalog table are WAL-logged + +### How to get rid of the patch + +WAL-log them periodically, from a backgound worker. + + +## XLogWaitForReplayOf() + +?? + + + + # Not currently committed but proposed ## Disable ring buffer buffer manager strategies From 560627b525d7abc0b25410fafc4f9fa523c3a078 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 18 Jun 2024 13:38:15 +0300 Subject: [PATCH 0999/1571] Replace a few references to Zenith with neon --- docs/core_changes.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/core_changes.md b/docs/core_changes.md index 6cc19d8b62..1388317728 100644 --- a/docs/core_changes.md +++ b/docs/core_changes.md @@ -156,7 +156,7 @@ index 0415df9ccb..9f9db3c8bc 100644 * crash we can lose (skip over) as many values as we pre-logged. */ -#define SEQ_LOG_VALS 32 -+/* Zenith XXX: to ensure sequence order of sequence in Zenith we need to WAL log each sequence update. 
*/ ++/* Neon XXX: to ensure sequence order of sequence in Zenith we need to WAL log each sequence update. */ +/* #define SEQ_LOG_VALS 32 */ +#define SEQ_LOG_VALS 0 ``` @@ -485,6 +485,6 @@ Add special WAL record for setting page hints. ### Why? -Short downtime (or, in other words, fast compute node restart time) is one of the key feature of Zenith. +Short downtime (or, in other words, fast compute node restart time) is one of the key feature of Neon. But overhead of request-response round-trip for loading pages on demand can make started node warm-up quite slow. We can capture state of compute node buffer cache and send bulk request for this pages at startup. From 30b890e378771c95a4ef08d7724d39ff1fad39a9 Mon Sep 17 00:00:00 2001 From: Yuchen Liang <70461588+yliang412@users.noreply.github.com> Date: Tue, 18 Jun 2024 13:37:06 -0400 Subject: [PATCH 1000/1571] feat(pageserver): use leases to temporarily block gc (#8084) Part of #7497, extracts from #7996, closes #8063. ## Problem With the LSN lease API introduced in https://github.com/neondatabase/neon/issues/7808, we want to implement the real lease logic so that GC will keep all the layers needed to reconstruct all pages at all the leased LSNs with valid leases at a given time. Signed-off-by: Yuchen Liang --- control_plane/src/pageserver.rs | 8 ++ libs/pageserver_api/src/models.rs | 16 +++ pageserver/src/http/routes.rs | 2 +- pageserver/src/page_service.rs | 2 +- pageserver/src/repository.rs | 2 + pageserver/src/tenant.rs | 110 ++++++++++++++++- pageserver/src/tenant/config.rs | 31 +++++ pageserver/src/tenant/tasks.rs | 24 ++++ pageserver/src/tenant/timeline.rs | 116 ++++++++++++++++-- .../regress/test_attach_tenant_config.py | 2 + 10 files changed, 294 insertions(+), 19 deletions(-) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 5a84763697..13e684da24 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -383,6 +383,10 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'switch_aux_file_policy'")?, + lsn_lease_length: settings.remove("lsn_lease_length").map(|x| x.to_string()), + lsn_lease_length_for_ts: settings + .remove("lsn_lease_length_for_ts") + .map(|x| x.to_string()), }; if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") @@ -506,6 +510,10 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'switch_aux_file_policy'")?, + lsn_lease_length: settings.remove("lsn_lease_length").map(|x| x.to_string()), + lsn_lease_length_for_ts: settings + .remove("lsn_lease_length_for_ts") + .map(|x| x.to_string()), } }; diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 9311dab33c..70db0b7344 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -177,6 +177,20 @@ serde_with::serde_conv!( |value: String| -> Result<_, humantime::TimestampError> { humantime::parse_rfc3339(&value) } ); +impl LsnLease { + /// The default length for an explicit LSN lease request (10 minutes). + pub const DEFAULT_LENGTH: Duration = Duration::from_secs(10 * 60); + + /// The default length for an implicit LSN lease granted during + /// `get_lsn_by_timestamp` request (1 minutes). + pub const DEFAULT_LENGTH_FOR_TS: Duration = Duration::from_secs(60); + + /// Checks whether the lease is expired. 
+ pub fn is_expired(&self, now: &SystemTime) -> bool { + now > &self.valid_until + } +} + /// The only [`TenantState`] variants we could be `TenantState::Activating` from. #[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub enum ActivatingFrom { @@ -322,6 +336,8 @@ pub struct TenantConfig { pub timeline_get_throttle: Option, pub image_layer_creation_check_threshold: Option, pub switch_aux_file_policy: Option, + pub lsn_lease_length: Option, + pub lsn_lease_length_for_ts: Option, } /// The policy for the aux file storage. It can be switched through `switch_aux_file_policy` diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 657708c0d6..482879630a 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1730,7 +1730,7 @@ async fn lsn_lease_handler( active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; let result = timeline - .make_lsn_lease(lsn, &ctx) + .make_lsn_lease(lsn, timeline.get_lsn_lease_length(), &ctx) .map_err(|e| ApiError::InternalServerError(e.context("lsn lease http handler")))?; json_response(StatusCode::OK, result) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index ae389826d5..ebc23e8945 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -935,7 +935,7 @@ impl PageServerHandler { let timeline = self .get_active_tenant_timeline(tenant_shard_id.tenant_id, timeline_id, shard_selector) .await?; - let lease = timeline.make_lsn_lease(lsn, ctx)?; + let lease = timeline.make_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx)?; let valid_until = lease .valid_until .duration_since(SystemTime::UNIX_EPOCH) diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 7b30c3ecf7..5a334d0290 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -240,6 +240,7 @@ pub struct GcResult { pub layers_needed_by_cutoff: u64, pub layers_needed_by_pitr: u64, pub layers_needed_by_branches: u64, + pub layers_needed_by_leases: u64, pub layers_not_updated: u64, pub layers_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files. 
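For orientation, the new `layers_needed_by_leases` counter is driven by a simple retention rule; the sketch below is a simplified restatement of the check added to `gc_timeline` later in this patch (the function name and standalone form are illustrative only):

```rust
use utils::lsn::Lsn;

// Simplified sketch: a layer is retained (and counted in `layers_needed_by_leases`)
// when its LSN range starts at or below the newest LSN that still holds a valid lease.
fn needed_by_lease(layer_lsn_range_start: Lsn, max_lsn_with_valid_lease: Option<Lsn>) -> bool {
    match max_lsn_with_valid_lease {
        Some(leased_lsn) => layer_lsn_range_start <= leased_lsn,
        None => false,
    }
}
```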
@@ -269,6 +270,7 @@ impl AddAssign for GcResult { self.layers_needed_by_pitr += other.layers_needed_by_pitr; self.layers_needed_by_cutoff += other.layers_needed_by_cutoff; self.layers_needed_by_branches += other.layers_needed_by_branches; + self.layers_needed_by_leases += other.layers_needed_by_leases; self.layers_not_updated += other.layers_not_updated; self.layers_removed += other.layers_removed; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 801321e36d..ca5765c99b 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -31,6 +31,7 @@ use remote_storage::DownloadError; use remote_storage::GenericRemoteStorage; use remote_storage::TimeoutOrCancel; use std::fmt; +use std::time::SystemTime; use storage_broker::BrokerClientChannel; use tokio::io::BufReader; use tokio::sync::watch; @@ -65,9 +66,9 @@ use self::timeline::uninit::TimelineCreateGuard; use self::timeline::uninit::TimelineExclusionError; use self::timeline::uninit::UninitializedTimeline; use self::timeline::EvictionTaskTenantState; +use self::timeline::GcCutoffs; use self::timeline::TimelineResources; use self::timeline::WaitLsnError; -use self::timeline::{GcCutoffs, GcInfo}; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::deletion_queue::DeletionQueueClient; @@ -2428,6 +2429,13 @@ impl Tenant { } } + pub fn get_lsn_lease_length(&self) -> Duration { + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); + tenant_conf + .lsn_lease_length + .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length) + } + pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) { // Use read-copy-update in order to avoid overwriting the location config // state if this races with [`Tenant::set_new_location_config`]. Note that @@ -3010,12 +3018,13 @@ impl Tenant { { let mut target = timeline.gc_info.write().unwrap(); + let now = SystemTime::now(); + target.leases.retain(|_, lease| !lease.is_expired(&now)); + match gc_cutoffs.remove(&timeline.timeline_id) { Some(cutoffs) => { - *target = GcInfo { - retain_lsns: branchpoints, - cutoffs, - }; + target.retain_lsns = branchpoints; + target.cutoffs = cutoffs; } None => { // reasons for this being unavailable: @@ -3833,6 +3842,8 @@ pub(crate) mod harness { tenant_conf.image_layer_creation_check_threshold, ), switch_aux_file_policy: Some(tenant_conf.switch_aux_file_policy), + lsn_lease_length: Some(tenant_conf.lsn_lease_length), + lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts), } } } @@ -6939,4 +6950,93 @@ mod tests { Ok(()) } + + #[tokio::test] + async fn test_lsn_lease() -> anyhow::Result<()> { + let (tenant, ctx) = TenantHarness::create("test_lsn_lease")?.load().await; + let key = Key::from_hex("010000000033333333444444445500000000").unwrap(); + + let end_lsn = Lsn(0x100); + let image_layers = (0x20..=0x90) + .step_by(0x10) + .map(|n| { + ( + Lsn(n), + vec![(key, test_img(&format!("data key at {:x}", n)))], + ) + }) + .collect(); + + let timeline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + Vec::new(), + image_layers, + end_lsn, + ) + .await?; + + let leased_lsns = [0x30, 0x50, 0x70]; + let mut leases = Vec::new(); + let _: anyhow::Result<_> = leased_lsns.iter().try_for_each(|n| { + leases.push(timeline.make_lsn_lease(Lsn(*n), timeline.get_lsn_lease_length(), &ctx)?); + Ok(()) + }); + + // Renewing with shorter lease should not change the lease. 
+ let updated_lease_0 = + timeline.make_lsn_lease(Lsn(leased_lsns[0]), Duration::from_secs(0), &ctx)?; + assert_eq!(updated_lease_0.valid_until, leases[0].valid_until); + + // Renewing with a long lease should renew lease with later expiration time. + let updated_lease_1 = timeline.make_lsn_lease( + Lsn(leased_lsns[1]), + timeline.get_lsn_lease_length() * 2, + &ctx, + )?; + + assert!(updated_lease_1.valid_until > leases[1].valid_until); + + // Force set disk consistent lsn so we can get the cutoff at `end_lsn`. + info!( + "latest_gc_cutoff_lsn: {}", + *timeline.get_latest_gc_cutoff_lsn() + ); + timeline.force_set_disk_consistent_lsn(end_lsn); + + let res = tenant + .gc_iteration( + Some(TIMELINE_ID), + 0, + Duration::ZERO, + &CancellationToken::new(), + &ctx, + ) + .await?; + + // Keeping everything <= Lsn(0x80) b/c leases: + // 0/10: initdb layer + // (0/20..=0/70).step_by(0x10): image layers added when creating the timeline. + assert_eq!(res.layers_needed_by_leases, 7); + // Keeping 0/90 b/c it is the latest layer. + assert_eq!(res.layers_not_updated, 1); + // Removed 0/80. + assert_eq!(res.layers_removed, 1); + + // Make lease on a already GC-ed LSN. + // 0/80 does not have a valid lease + is below latest_gc_cutoff + assert!(Lsn(0x80) < *timeline.get_latest_gc_cutoff_lsn()); + let res = timeline.make_lsn_lease(Lsn(0x80), timeline.get_lsn_lease_length(), &ctx); + assert!(res.is_err()); + + // Should still be able to renew a currently valid lease + // Assumption: original lease to is still valid for 0/50. + let _ = + timeline.make_lsn_lease(Lsn(leased_lsns[1]), timeline.get_lsn_lease_length(), &ctx)?; + + Ok(()) + } } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 342d705954..1b9be12642 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -13,6 +13,7 @@ use pageserver_api::models::AuxFilePolicy; use pageserver_api::models::CompactionAlgorithm; use pageserver_api::models::CompactionAlgorithmSettings; use pageserver_api::models::EvictionPolicy; +use pageserver_api::models::LsnLease; use pageserver_api::models::{self, ThrottleConfig}; use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}; use serde::de::IntoDeserializer; @@ -377,6 +378,16 @@ pub struct TenantConf { /// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux /// file is written. pub switch_aux_file_policy: AuxFilePolicy, + + /// The length for an explicit LSN lease request. + /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval. + #[serde(with = "humantime_serde")] + pub lsn_lease_length: Duration, + + /// The length for an implicit LSN lease granted as part of `get_lsn_by_timestamp` request. + /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval. 
+ #[serde(with = "humantime_serde")] + pub lsn_lease_length_for_ts: Duration, } /// Same as TenantConf, but this struct preserves the information about @@ -476,6 +487,16 @@ pub struct TenantConfOpt { #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] pub switch_aux_file_policy: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(with = "humantime_serde")] + #[serde(default)] + pub lsn_lease_length: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(with = "humantime_serde")] + #[serde(default)] + pub lsn_lease_length_for_ts: Option, } impl TenantConfOpt { @@ -538,6 +559,12 @@ impl TenantConfOpt { switch_aux_file_policy: self .switch_aux_file_policy .unwrap_or(global_conf.switch_aux_file_policy), + lsn_lease_length: self + .lsn_lease_length + .unwrap_or(global_conf.lsn_lease_length), + lsn_lease_length_for_ts: self + .lsn_lease_length_for_ts + .unwrap_or(global_conf.lsn_lease_length_for_ts), } } } @@ -582,6 +609,8 @@ impl Default for TenantConf { timeline_get_throttle: crate::tenant::throttle::Config::disabled(), image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD, switch_aux_file_policy: AuxFilePolicy::default_tenant_config(), + lsn_lease_length: LsnLease::DEFAULT_LENGTH, + lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS, } } } @@ -657,6 +686,8 @@ impl From for models::TenantConfig { timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from), image_layer_creation_check_threshold: value.image_layer_creation_check_threshold, switch_aux_file_policy: value.switch_aux_file_policy, + lsn_lease_length: value.lsn_lease_length.map(humantime), + lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime), } } } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index a6dfa84f35..d679b78f32 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -346,6 +346,7 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { // cutoff specified as time. let ctx = RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download); + let mut first = true; loop { tokio::select! { @@ -362,6 +363,14 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { if first { first = false; + + if delay_by_lease_length(tenant.get_lsn_lease_length(), &cancel) + .await + .is_err() + { + break; + } + if random_init_delay(period, &cancel).await.is_err() { break; } @@ -531,6 +540,21 @@ pub(crate) async fn random_init_delay( } } +/// Delays GC by defaul lease length at restart. +/// +/// We do this as the leases mapping are not persisted to disk. By delaying GC by default +/// length, we gurantees that all the leases we granted before the restart will expire +/// when we run GC for the first time after the restart. +pub(crate) async fn delay_by_lease_length( + length: Duration, + cancel: &CancellationToken, +) -> Result<(), Cancelled> { + match tokio::time::timeout(length, cancel.cancelled()).await { + Ok(_) => Err(Cancelled), + Err(_) => Ok(()), + } +} + /// Attention: the `task` and `period` beocme labels of a pageserver-wide prometheus metric. 
pub(crate) fn warn_when_period_overrun( elapsed: Duration, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 08bec329e1..a4f1108635 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -47,7 +47,6 @@ use utils::{ vec_map::VecMap, }; -use std::ops::{Deref, Range}; use std::pin::pin; use std::sync::atomic::Ordering as AtomicOrdering; use std::sync::{Arc, Mutex, RwLock, Weak}; @@ -61,6 +60,10 @@ use std::{ cmp::{max, min, Ordering}, ops::ControlFlow, }; +use std::{ + collections::btree_map::Entry, + ops::{Deref, Range}, +}; use crate::metrics::GetKind; use crate::pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS; @@ -454,6 +457,9 @@ pub(crate) struct GcInfo { /// The cutoff coordinates, which are combined by selecting the minimum. pub(crate) cutoffs: GcCutoffs, + + /// Leases granted to particular LSNs. + pub(crate) leases: BTreeMap, } impl GcInfo { @@ -1555,17 +1561,46 @@ impl Timeline { Ok(()) } - /// Obtains a temporary lease blocking garbage collection for the given LSN + /// Obtains a temporary lease blocking garbage collection for the given LSN. + /// + /// This function will error if the requesting LSN is less than the `latest_gc_cutoff_lsn` and there is also + /// no existing lease to renew. If there is an existing lease in the map, the lease will be renewed only if + /// the request extends the lease. The returned lease is therefore the maximum between the existing lease and + /// the requesting lease. pub(crate) fn make_lsn_lease( &self, - _lsn: Lsn, + lsn: Lsn, + length: Duration, _ctx: &RequestContext, ) -> anyhow::Result { - const LEASE_LENGTH: Duration = Duration::from_secs(5 * 60); - let lease = LsnLease { - valid_until: SystemTime::now() + LEASE_LENGTH, + let lease = { + let mut gc_info = self.gc_info.write().unwrap(); + + let valid_until = SystemTime::now() + length; + + let entry = gc_info.leases.entry(lsn); + + let lease = { + if let Entry::Occupied(mut occupied) = entry { + let existing_lease = occupied.get_mut(); + if valid_until > existing_lease.valid_until { + existing_lease.valid_until = valid_until; + } + existing_lease.clone() + } else { + // Reject already GC-ed LSN (lsn < latest_gc_cutoff) + let latest_gc_cutoff_lsn = self.get_latest_gc_cutoff_lsn(); + if lsn < *latest_gc_cutoff_lsn { + bail!("tried to request a page version that was garbage collected. 
requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn); + } + + entry.or_insert(LsnLease { valid_until }).clone() + } + }; + + lease }; - // TODO: dummy implementation + Ok(lease) } @@ -2082,6 +2117,24 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10; // Private functions impl Timeline { + pub(crate) fn get_lsn_lease_length(&self) -> Duration { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf + .lsn_lease_length + .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length) + } + + // TODO(yuchen): remove unused flag after implementing https://github.com/neondatabase/neon/issues/8072 + #[allow(unused)] + pub(crate) fn get_lsn_lease_length_for_ts(&self) -> Duration { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf + .lsn_lease_length_for_ts + .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length_for_ts) + } + pub(crate) fn get_switch_aux_file_policy(&self) -> AuxFilePolicy { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -4907,13 +4960,25 @@ impl Timeline { return Err(GcError::TimelineCancelled); } - let (horizon_cutoff, pitr_cutoff, retain_lsns) = { + let (horizon_cutoff, pitr_cutoff, retain_lsns, max_lsn_with_valid_lease) = { let gc_info = self.gc_info.read().unwrap(); let horizon_cutoff = min(gc_info.cutoffs.horizon, self.get_disk_consistent_lsn()); let pitr_cutoff = gc_info.cutoffs.pitr; let retain_lsns = gc_info.retain_lsns.clone(); - (horizon_cutoff, pitr_cutoff, retain_lsns) + + // Gets the maximum LSN that holds the valid lease. + // + // Caveat: `refresh_gc_info` is in charged of updating the lease map. + // Here, we do not check for stale leases again. + let max_lsn_with_valid_lease = gc_info.leases.last_key_value().map(|(lsn, _)| *lsn); + + ( + horizon_cutoff, + pitr_cutoff, + retain_lsns, + max_lsn_with_valid_lease, + ) }; let mut new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff); @@ -4944,7 +5009,13 @@ impl Timeline { .set(Lsn::INVALID.0 as i64); let res = self - .gc_timeline(horizon_cutoff, pitr_cutoff, retain_lsns, new_gc_cutoff) + .gc_timeline( + horizon_cutoff, + pitr_cutoff, + retain_lsns, + max_lsn_with_valid_lease, + new_gc_cutoff, + ) .instrument( info_span!("gc_timeline", timeline_id = %self.timeline_id, cutoff = %new_gc_cutoff), ) @@ -4961,6 +5032,7 @@ impl Timeline { horizon_cutoff: Lsn, pitr_cutoff: Lsn, retain_lsns: Vec, + max_lsn_with_valid_lease: Option, new_gc_cutoff: Lsn, ) -> Result { // FIXME: if there is an ongoing detach_from_ancestor, we should just skip gc @@ -5009,7 +5081,8 @@ impl Timeline { // 1. it is older than cutoff LSN; // 2. it is older than PITR interval; // 3. it doesn't need to be retained for 'retain_lsns'; - // 4. newer on-disk image layers cover the layer's whole key range + // 4. it does not need to be kept for LSNs holding valid leases. + // 5. newer on-disk image layers cover the layer's whole key range // // TODO holding a write lock is too agressive and avoidable let mut guard = self.layers.write().await; @@ -5060,7 +5133,21 @@ impl Timeline { } } - // 4. Is there a later on-disk layer for this relation? + // 4. Is there a valid lease that requires us to keep this layer? + if let Some(lsn) = &max_lsn_with_valid_lease { + // keep if layer start <= any of the lease + if &l.get_lsn_range().start <= lsn { + debug!( + "keeping {} because there is a valid lease preventing GC at {}", + l.layer_name(), + lsn, + ); + result.layers_needed_by_leases += 1; + continue 'outer; + } + } + + // 5. Is there a later on-disk layer for this relation? 
// // The end-LSN is exclusive, while disk_consistent_lsn is // inclusive. For example, if disk_consistent_lsn is 100, it is @@ -5438,6 +5525,11 @@ impl Timeline { self.last_record_lsn.advance(new_lsn); } + #[cfg(test)] + pub(super) fn force_set_disk_consistent_lsn(&self, new_value: Lsn) { + self.disk_consistent_lsn.store(new_value); + } + /// Force create an image layer and place it into the layer map. /// /// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`] diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 1d193b8999..f4667a82dc 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -195,6 +195,8 @@ def test_fully_custom_config(positive_env: NeonEnv): "walreceiver_connect_timeout": "13m", "image_layer_creation_check_threshold": 1, "switch_aux_file_policy": "cross-validation", + "lsn_lease_length": "1m", + "lsn_lease_length_for_ts": "5s", } ps_http = env.pageserver.http_client() From 6bb8b1d7c29a11d23b44a8f9a656223f0a410005 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 18 Jun 2024 16:13:47 +0300 Subject: [PATCH 1001/1571] Remove dead code from walproposer_pg.c Now that logical walsenders fetch WAL from safekeepers recovery in walproposer is not needed. Fixes warnings. --- pgxn/neon/walproposer_pg.c | 194 +------------------------------------ 1 file changed, 3 insertions(+), 191 deletions(-) diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 316e23a72e..da1a6f76f0 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -100,17 +100,12 @@ static void StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd); static void WalSndLoop(WalProposer *wp); static void XLogBroadcastWalProposer(WalProposer *wp); -static void XLogWalPropWrite(WalProposer *wp, char *buf, Size nbytes, XLogRecPtr recptr); -static void XLogWalPropClose(XLogRecPtr recptr); - static void add_nwr_event_set(Safekeeper *sk, uint32 events); static void update_nwr_event_set(Safekeeper *sk, uint32 events); static void rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk); static void CheckGracefulShutdown(WalProposer *wp); -static XLogRecPtr GetLogRepRestartLSN(WalProposer *wp); - static void init_walprop_config(bool syncSafekeepers) { @@ -1236,8 +1231,6 @@ StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd) static void WalSndLoop(WalProposer *wp) { - XLogRecPtr flushPtr; - /* Clear any already-pending wakeups */ ResetLatch(MyLatch); @@ -1333,8 +1326,9 @@ XLogBroadcastWalProposer(WalProposer *wp) } /* - Used to download WAL before basebackup for logical walsenders from sk, no longer - needed because walsender always uses neon_walreader. + Used to download WAL before basebackup for walproposer/logical walsenders. No + longer used, replaced by neon_walreader; but callback still exists because + simulation tests use it. */ static bool WalProposerRecovery(WalProposer *wp, Safekeeper *sk) @@ -1342,136 +1336,6 @@ WalProposerRecovery(WalProposer *wp, Safekeeper *sk) return true; } -/* - * These variables are used similarly to openLogFile/SegNo, - * but for walproposer to write the XLOG during recovery. walpropFileTLI is the TimeLineID - * corresponding the filename of walpropFile. - */ -static int walpropFile = -1; -static TimeLineID walpropFileTLI = 0; -static XLogSegNo walpropSegNo = 0; - -/* - * Write XLOG data to disk. 
- */ -static void -XLogWalPropWrite(WalProposer *wp, char *buf, Size nbytes, XLogRecPtr recptr) -{ - int startoff; - int byteswritten; - - /* - * Apart from walproposer, basebackup LSN page is also written out by - * postgres itself which writes WAL only in pages, and in basebackup it is - * inherently dummy (only safekeepers have historic WAL). Update WAL - * buffers here to avoid dummy page overwriting correct one we download - * here. Ugly, but alternatives are about the same ugly. We won't need - * that if we switch to on-demand WAL download from safekeepers, without - * writing to disk. - * - * https://github.com/neondatabase/neon/issues/5749 - */ - if (!wp->config->syncSafekeepers) - XLogUpdateWalBuffers(buf, recptr, nbytes); - - while (nbytes > 0) - { - int segbytes; - - /* Close the current segment if it's completed */ - if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)) - XLogWalPropClose(recptr); - - if (walpropFile < 0) - { -#if PG_VERSION_NUM >= 150000 - /* FIXME Is it ok to use hardcoded value here? */ - TimeLineID tli = 1; -#else - bool use_existent = true; -#endif - /* Create/use new log file */ - XLByteToSeg(recptr, walpropSegNo, wal_segment_size); -#if PG_VERSION_NUM >= 150000 - walpropFile = XLogFileInit(walpropSegNo, tli); - walpropFileTLI = tli; -#else - walpropFile = XLogFileInit(walpropSegNo, &use_existent, false); - walpropFileTLI = ThisTimeLineID; -#endif - } - - /* Calculate the start offset of the received logs */ - startoff = XLogSegmentOffset(recptr, wal_segment_size); - - if (startoff + nbytes > wal_segment_size) - segbytes = wal_segment_size - startoff; - else - segbytes = nbytes; - - /* OK to write the logs */ - errno = 0; - - byteswritten = pg_pwrite(walpropFile, buf, segbytes, (off_t) startoff); - if (byteswritten <= 0) - { - char xlogfname[MAXFNAMELEN]; - int save_errno; - - /* if write didn't set errno, assume no disk space */ - if (errno == 0) - errno = ENOSPC; - - save_errno = errno; - XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size); - errno = save_errno; - ereport(PANIC, - (errcode_for_file_access(), - errmsg("could not write to log segment %s " - "at offset %u, length %lu: %m", - xlogfname, startoff, (unsigned long) segbytes))); - } - - /* Update state for write */ - recptr += byteswritten; - - nbytes -= byteswritten; - buf += byteswritten; - } - - /* - * Close the current segment if it's fully written up in the last cycle of - * the loop. - */ - if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)) - { - XLogWalPropClose(recptr); - } -} - -/* - * Close the current segment. - */ -static void -XLogWalPropClose(XLogRecPtr recptr) -{ - Assert(walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)); - - if (close(walpropFile) != 0) - { - char xlogfname[MAXFNAMELEN]; - - XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size); - - ereport(PANIC, - (errcode_for_file_access(), - errmsg("could not close log segment %s: %m", - xlogfname))); - } - - walpropFile = -1; -} - static void walprop_pg_wal_reader_allocate(Safekeeper *sk) { @@ -1987,58 +1851,6 @@ walprop_pg_log_internal(WalProposer *wp, int level, const char *line) elog(FATAL, "unexpected log_internal message at level %d: %s", level, line); } -static XLogRecPtr -GetLogRepRestartLSN(WalProposer *wp) -{ - FILE *f; - XLogRecPtr lrRestartLsn = InvalidXLogRecPtr; - - /* We don't need to do anything in syncSafekeepers mode. 
*/ - if (wp->config->syncSafekeepers) - return InvalidXLogRecPtr; - - /* - * If there are active logical replication subscription we need to provide - * enough WAL for their WAL senders based on th position of their - * replication slots. - */ - f = fopen("restart.lsn", "rb"); - if (f != NULL) - { - size_t rc = fread(&lrRestartLsn, sizeof(lrRestartLsn), 1, f); - - fclose(f); - if (rc == 1 && lrRestartLsn != InvalidXLogRecPtr) - { - uint64 download_range_mb; - - wpg_log(LOG, "logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn)); - - /* - * If we need to download more than a max_slot_wal_keep_size, - * don't do it to avoid risk of exploding pg_wal. Logical - * replication won't work until recreated, but at least compute - * would start; this also follows max_slot_wal_keep_size - * semantics. - */ - download_range_mb = (wp->propEpochStartLsn - lrRestartLsn) / MB; - if (max_slot_wal_keep_size_mb > 0 && download_range_mb >= max_slot_wal_keep_size_mb) - { - wpg_log(WARNING, "not downloading WAL for logical replication since %X/%X as max_slot_wal_keep_size=%dMB", - LSN_FORMAT_ARGS(lrRestartLsn), max_slot_wal_keep_size_mb); - return InvalidXLogRecPtr; - } - - /* - * start from the beginning of the segment to fetch page headers - * verifed by XLogReader - */ - lrRestartLsn = lrRestartLsn - XLogSegmentOffset(lrRestartLsn, wal_segment_size); - } - } - return lrRestartLsn; -} - void SetNeonCurrentClusterSize(uint64 size) { From 68476bb4ba0565f68a01504a66a8ddb8fd2ac19b Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Tue, 18 Jun 2024 16:02:57 -0400 Subject: [PATCH 1002/1571] feat(pageserver): add iterator API for btree reader (#8083) The new image iterator and delta iterator uses an iterator-based API. https://github.com/neondatabase/neon/pull/8006 / part of https://github.com/neondatabase/neon/issues/8002 This requires the underlying thing (the btree) to have an iterator API, and the iterator should have a type name so that it can be stored somewhere. ```rust pub struct DeltaLayerIterator { index_iterator: BTreeIterator } ``` versus ```rust pub struct DeltaLayerIterator { index_iterator: impl Stream<....> } ``` (this requires nightly flag and still buggy in the Rust compiler) There are multiple ways to achieve this: 1. Either write a BTreeIterator from scratch that provides `async next`. This is the most efficient way to do that. 2. Or wrap the current `get_stream` API, which is the current approach in the pull request. In the future, we should do (1), and the `get_stream` API should be refactored to use the iterator API. With (2), we have to wrap the `get_stream` API with `Pin>`, where we have the overhead of dynamic dispatch. However, (2) needs a rewrite of the `visit` function, which would take some time to write and review. I'd like to define this iterator API first and work on a real iterator API later. ## Summary of changes Add `DiskBtreeIterator` and related tests. 
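As a minimal usage sketch (roughly mirroring the unit test added below; it assumes `reader` is a `DiskBtreeReader` over 16-byte keys and `ctx` is a `RequestContext`, inside an async fn that can propagate `DiskBtreeError` with `?`):

```rust
// Iterate all entries, starting from the lowest possible key.
let mut iter = reader.iter(&[0u8; 16], &ctx);
while let Some(res) = iter.next().await {
    // Entries come back in ascending key order as (Vec<u8>, u64) pairs.
    let (raw_key, value) = res?;
    let key = u128::from_be_bytes(raw_key.as_slice().try_into().unwrap());
    println!("{key:x} -> {value}");
}
```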
Signed-off-by: Alex Chi Z
---
 pageserver/src/tenant/disk_btree.rs | 36 ++++++++++++++++++++++++++++-
 1 file changed, 35 insertions(+), 1 deletion(-)

diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs
index 6d85d1e60e..119df3e6c4 100644
--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -22,7 +22,7 @@ use async_stream::try_stream;
 use byteorder::{ReadBytesExt, BE};
 use bytes::{BufMut, Bytes, BytesMut};
 use either::Either;
-use futures::Stream;
+use futures::{Stream, StreamExt};
 use hex;
 use std::{
     cmp::Ordering,
@@ -259,6 +259,16 @@ where
         Ok(result)
     }

+    pub fn iter<'a>(
+        &'a self,
+        start_key: &'a [u8; L],
+        ctx: &'a RequestContext,
+    ) -> DiskBtreeIterator<'a> {
+        DiskBtreeIterator {
+            stream: Box::pin(self.get_stream_from(start_key, ctx)),
+        }
+    }
+
     /// Return a stream which yields all key, value pairs from the index
     /// starting from the first key greater or equal to `start_key`.
     ///
@@ -496,6 +506,19 @@ where
     }
 }

+pub struct DiskBtreeIterator<'a> {
+    #[allow(clippy::type_complexity)]
+    stream: std::pin::Pin<
+        Box<dyn Stream<Item = std::result::Result<(Vec<u8>, u64), DiskBtreeError>> + 'a>,
+    >,
+}
+
+impl<'a> DiskBtreeIterator<'a> {
+    pub async fn next(&mut self) -> Option<std::result::Result<(Vec<u8>, u64), DiskBtreeError>> {
+        self.stream.next().await
+    }
+}
+
 ///
 /// Public builder object, for creating a new tree.
 ///
@@ -1088,6 +1111,17 @@ pub(crate) mod tests {
                 == all_data.get(&u128::MAX).cloned()
         );

+        // Test iterator and get_stream API
+        let mut iter = reader.iter(&[0; 16], &ctx);
+        let mut cnt = 0;
+        while let Some(res) = iter.next().await {
+            let (key, val) = res?;
+            let key = u128::from_be_bytes(key.as_slice().try_into().unwrap());
+            assert_eq!(val, *all_data.get(&key).unwrap());
+            cnt += 1;
+        }
+        assert_eq!(cnt, all_data.len());
+
         Ok(())
     }

From 4753b8f3902751bf0617be89ae1daccdfb60a7a9 Mon Sep 17 00:00:00 2001
From: Sergey Melnikov
Date: Wed, 19 Jun 2024 11:33:21 +0200
Subject: [PATCH 1003/1571] Copy release images to prod ECR (#8101)

## Problem

We want to have all released images in the production ECR repository

## Summary of changes

Copy all docker images to the production ECR repository

cc: https://github.com/neondatabase/cloud/issues/10177
---
 .github/workflows/build_and_test.yml | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 742716776e..8c8500260c 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -1070,7 +1070,8 @@ jobs:
           username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
           password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

-      - uses: docker/login-action@v3
+      - name: Login to dev ECR
+        uses: docker/login-action@v3
         with:
           registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
           username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
@@ -1104,6 +1105,22 @@ jobs:
           docker buildx imagetools create -t neondatabase/neon-test-extensions-v16:latest \
                                              neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }}

+      - name: Login to prod ECR
+        uses: docker/login-action@v3
+        if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
+        with:
+          registry: 093970136003.dkr.ecr.eu-central-1.amazonaws.com
+          username: ${{ secrets.PROD_GHA_RUNNER_LIMITED_AWS_ACCESS_KEY_ID }}
+          password: ${{ secrets.PROD_GHA_RUNNER_LIMITED_AWS_SECRET_ACCESS_KEY }}
+
+      - name: Copy all images to prod ECR
+        if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
+        run: |
+          for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16}; do
+
docker buildx imagetools create -t 093970136003.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} \ + 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} + done + trigger-custom-extensions-build-and-wait: needs: [ check-permissions, tag ] runs-on: ubuntu-22.04 From 5778d714f0a4a6f009d9b88bd30a6e35127a6410 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 19 Jun 2024 11:55:30 +0100 Subject: [PATCH 1004/1571] storcon: add drain and fill background operations for graceful cluster restarts (#8014) ## Problem Pageserver restarts cause read availablity downtime for tenants. See `Motivation` section in the [RFC](https://github.com/neondatabase/neon/pull/7704). ## Summary of changes * Introduce a new `NodeSchedulingPolicy`: `PauseForRestart` * Implement the first take of drain and fill algorithms * Add a node status endpoint which can be polled to figure out when an operation is done The implementation follows the RFC, so it might be useful to peek at it as you're reviewing. Since the PR is rather chunky, I've made sure all commits build (with warnings), so you can review by commit if you prefer that. RFC: https://github.com/neondatabase/neon/pull/7704 Related https://github.com/neondatabase/neon/issues/7387 --- Cargo.lock | 1 + libs/pageserver_api/src/controller_api.rs | 3 + storage_controller/Cargo.toml | 1 + .../src/background_node_operations.rs | 59 ++ storage_controller/src/http.rs | 43 ++ storage_controller/src/lib.rs | 1 + storage_controller/src/node.rs | 7 +- storage_controller/src/persistence.rs | 22 +- storage_controller/src/scheduler.rs | 39 ++ storage_controller/src/service.rs | 571 +++++++++++++++++- storage_controller/src/tenant_shard.rs | 36 +- test_runner/fixtures/neon_fixtures.py | 24 + .../regress/test_storage_controller.py | 119 +++- 13 files changed, 905 insertions(+), 21 deletions(-) create mode 100644 storage_controller/src/background_node_operations.rs diff --git a/Cargo.lock b/Cargo.lock index 5eac648fd9..cf8a0b3286 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5754,6 +5754,7 @@ dependencies = [ "r2d2", "reqwest 0.12.4", "routerify", + "scopeguard", "serde", "serde_json", "strum", diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 1278f17ad2..a0d10dc665 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -209,6 +209,7 @@ pub enum NodeSchedulingPolicy { Active, Filling, Pause, + PauseForRestart, Draining, } @@ -220,6 +221,7 @@ impl FromStr for NodeSchedulingPolicy { "active" => Ok(Self::Active), "filling" => Ok(Self::Filling), "pause" => Ok(Self::Pause), + "pause_for_restart" => Ok(Self::PauseForRestart), "draining" => Ok(Self::Draining), _ => Err(anyhow::anyhow!("Unknown scheduling state '{s}'")), } @@ -233,6 +235,7 @@ impl From for String { Active => "active", Filling => "filling", Pause => "pause", + PauseForRestart => "pause_for_restart", Draining => "draining", } .to_string() diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index 194619a496..b54dea5d47 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -40,6 +40,7 @@ tokio.workspace = true tokio-util.workspace = true tracing.workspace = true measured.workspace = true +scopeguard.workspace = true strum.workspace = true strum_macros.workspace = true diff --git a/storage_controller/src/background_node_operations.rs b/storage_controller/src/background_node_operations.rs new file mode 100644 
index 0000000000..74b7e7c849 --- /dev/null +++ b/storage_controller/src/background_node_operations.rs @@ -0,0 +1,59 @@ +use std::{borrow::Cow, fmt::Debug, fmt::Display}; + +use tokio_util::sync::CancellationToken; +use utils::id::NodeId; + +pub(crate) const MAX_RECONCILES_PER_OPERATION: usize = 10; + +#[derive(Copy, Clone)] +pub(crate) struct Drain { + pub(crate) node_id: NodeId, +} + +#[derive(Copy, Clone)] +pub(crate) struct Fill { + pub(crate) node_id: NodeId, +} + +#[derive(Copy, Clone)] +pub(crate) enum Operation { + Drain(Drain), + Fill(Fill), +} + +#[derive(Debug, thiserror::Error)] +pub(crate) enum OperationError { + #[error("Node state changed during operation: {0}")] + NodeStateChanged(Cow<'static, str>), + #[error("Operation finalize error: {0}")] + FinalizeError(Cow<'static, str>), + #[error("Operation cancelled")] + Cancelled, +} + +pub(crate) struct OperationHandler { + pub(crate) operation: Operation, + #[allow(unused)] + pub(crate) cancel: CancellationToken, +} + +impl Display for Drain { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "drain {}", self.node_id) + } +} + +impl Display for Fill { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "fill {}", self.node_id) + } +} + +impl Display for Operation { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + match self { + Operation::Drain(op) => write!(f, "{op}"), + Operation::Fill(op) => write!(f, "{op}"), + } + } +} diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index bbb6d2cb32..3e9951fb9e 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -480,6 +480,39 @@ async fn handle_node_configure(mut req: Request) -> Result, ) } +async fn handle_node_status(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + let node_id: NodeId = parse_request_param(&req, "node_id")?; + + let node_status = state.service.get_node(node_id).await?; + + json_response(StatusCode::OK, node_status) +} + +async fn handle_node_drain(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + let node_id: NodeId = parse_request_param(&req, "node_id")?; + + state.service.start_node_drain(node_id).await?; + + json_response(StatusCode::ACCEPTED, ()) +} + +async fn handle_node_fill(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + let node_id: NodeId = parse_request_param(&req, "node_id")?; + + state.service.start_node_fill(node_id).await?; + + json_response(StatusCode::ACCEPTED, ()) +} + async fn handle_tenant_shard_split( service: Arc, mut req: Request, @@ -832,6 +865,16 @@ pub fn make_router( RequestName("control_v1_node_config"), ) }) + .get("/control/v1/node/:node_id", |r| { + named_request_span(r, handle_node_status, RequestName("control_v1_node_status")) + }) + .put("/control/v1/node/:node_id/drain", |r| { + named_request_span(r, handle_node_drain, RequestName("control_v1_node_drain")) + }) + .put("/control/v1/node/:node_id/fill", |r| { + named_request_span(r, handle_node_fill, RequestName("control_v1_node_fill")) + }) + // TODO(vlad): endpoint for cancelling drain and fill // Tenant Shard operations .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| { tenant_service_handler( diff --git a/storage_controller/src/lib.rs b/storage_controller/src/lib.rs index 2ea490a14b..8caf638904 100644 --- a/storage_controller/src/lib.rs +++ 
b/storage_controller/src/lib.rs @@ -2,6 +2,7 @@ use serde::Serialize; use utils::seqwait::MonotonicCounter; mod auth; +mod background_node_operations; mod compute_hook; mod heartbeater; pub mod http; diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index 34dcf0c642..4d17dff9fe 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -59,6 +59,10 @@ impl Node { self.id } + pub(crate) fn get_scheduling(&self) -> NodeSchedulingPolicy { + self.scheduling + } + pub(crate) fn set_scheduling(&mut self, scheduling: NodeSchedulingPolicy) { self.scheduling = scheduling } @@ -151,6 +155,7 @@ impl Node { NodeSchedulingPolicy::Draining => MaySchedule::No, NodeSchedulingPolicy::Filling => MaySchedule::Yes(score), NodeSchedulingPolicy::Pause => MaySchedule::No, + NodeSchedulingPolicy::PauseForRestart => MaySchedule::No, } } @@ -167,7 +172,7 @@ impl Node { listen_http_port, listen_pg_addr, listen_pg_port, - scheduling: NodeSchedulingPolicy::Filling, + scheduling: NodeSchedulingPolicy::Active, availability: NodeAvailability::Offline, cancel: CancellationToken::new(), } diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 67c05296d5..47caf7ae81 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -442,13 +442,15 @@ impl Persistence { #[tracing::instrument(skip_all, fields(node_id))] pub(crate) async fn re_attach( &self, - node_id: NodeId, + input_node_id: NodeId, ) -> DatabaseResult> { + use crate::schema::nodes::dsl::scheduling_policy; + use crate::schema::nodes::dsl::*; use crate::schema::tenant_shards::dsl::*; let updated = self .with_measured_conn(DatabaseOperation::ReAttach, move |conn| { let rows_updated = diesel::update(tenant_shards) - .filter(generation_pageserver.eq(node_id.0 as i64)) + .filter(generation_pageserver.eq(input_node_id.0 as i64)) .set(generation.eq(generation + 1)) .execute(conn)?; @@ -457,9 +459,23 @@ impl Persistence { // TODO: UPDATE+SELECT in one query let updated = tenant_shards - .filter(generation_pageserver.eq(node_id.0 as i64)) + .filter(generation_pageserver.eq(input_node_id.0 as i64)) .select(TenantShardPersistence::as_select()) .load(conn)?; + + // If the node went through a drain and restart phase before re-attaching, + // then reset it's node scheduling policy to active. + diesel::update(nodes) + .filter(node_id.eq(input_node_id.0 as i64)) + .filter( + scheduling_policy + .eq(String::from(NodeSchedulingPolicy::PauseForRestart)) + .or(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Draining))) + .or(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Filling))), + ) + .set(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Active))) + .execute(conn)?; + Ok(updated) }) .await?; diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index 4ab85509dc..0bd2eeac35 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -1,4 +1,5 @@ use crate::{node::Node, tenant_shard::TenantShard}; +use itertools::Itertools; use pageserver_api::controller_api::UtilizationScore; use serde::Serialize; use std::collections::HashMap; @@ -283,6 +284,44 @@ impl Scheduler { } } + // Check if the number of shards attached to a given node is lagging below + // the cluster average. If that's the case, the node should be filled. 
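+    // Illustrative example (not from the patch itself): with three nodes whose
+    // attached shard counts are {A: 10, B: 7, C: 4}, the expected count per node
+    // is 21 / 3 = 7, so the fill requirement of C is 7 - 4 = 3, while A and B
+    // require 0.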
+ pub(crate) fn compute_fill_requirement(&self, node_id: NodeId) -> usize { + let Some(node) = self.nodes.get(&node_id) else { + debug_assert!(false); + tracing::error!("Scheduler missing node {node_id}"); + return 0; + }; + assert!(!self.nodes.is_empty()); + let expected_attached_shards_per_node = self.expected_attached_shard_count(); + + for (node_id, node) in self.nodes.iter() { + tracing::trace!(%node_id, "attached_shard_count={} shard_count={} expected={}", node.attached_shard_count, node.shard_count, expected_attached_shards_per_node); + } + + if node.attached_shard_count < expected_attached_shards_per_node { + expected_attached_shards_per_node - node.attached_shard_count + } else { + 0 + } + } + + pub(crate) fn expected_attached_shard_count(&self) -> usize { + let total_attached_shards: usize = + self.nodes.values().map(|n| n.attached_shard_count).sum(); + + assert!(!self.nodes.is_empty()); + total_attached_shards / self.nodes.len() + } + + pub(crate) fn nodes_by_attached_shard_count(&self) -> Vec<(NodeId, usize)> { + self.nodes + .iter() + .map(|(node_id, stats)| (*node_id, stats.attached_shard_count)) + .sorted_by(|lhs, rhs| Ord::cmp(&lhs.1, &rhs.1).reverse()) + .collect() + } + pub(crate) fn node_upsert(&mut self, node: &Node) { use std::collections::hash_map::Entry::*; match self.nodes.entry(node.get_id()) { diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 926332f946..c94af113db 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -8,13 +8,17 @@ use std::{ }; use crate::{ + background_node_operations::{ + Drain, Fill, Operation, OperationError, OperationHandler, MAX_RECONCILES_PER_OPERATION, + }, compute_hook::NotifyError, id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, WrappedWriteGuard}, persistence::{AbortShardSplitStatus, TenantFilter}, reconciler::{ReconcileError, ReconcileUnits}, scheduler::{MaySchedule, ScheduleContext, ScheduleMode}, tenant_shard::{ - MigrateAttachment, ReconcileNeeded, ScheduleOptimization, ScheduleOptimizationAction, + MigrateAttachment, ReconcileNeeded, ReconcilerStatus, ScheduleOptimization, + ScheduleOptimizationAction, }, }; use anyhow::Context; @@ -134,6 +138,11 @@ struct ServiceState { scheduler: Scheduler, + /// Ongoing background operation on the cluster if any is running. + /// Note that only one such operation may run at any given time, + /// hence the type choice. + ongoing_operation: Option, + /// Queue of tenants who are waiting for concurrency limits to permit them to reconcile delayed_reconcile_rx: tokio::sync::mpsc::Receiver, } @@ -185,6 +194,7 @@ impl ServiceState { tenants, nodes: Arc::new(nodes), scheduler, + ongoing_operation: None, delayed_reconcile_rx, } } @@ -296,6 +306,17 @@ impl From for ApiError { } } +impl From for ApiError { + fn from(value: OperationError) -> Self { + match value { + OperationError::NodeStateChanged(err) | OperationError::FinalizeError(err) => { + ApiError::InternalServerError(anyhow::anyhow!(err)) + } + OperationError::Cancelled => ApiError::Conflict("Operation was cancelled".into()), + } + } +} + #[allow(clippy::large_enum_variant)] enum TenantCreateOrUpdate { Create(TenantCreateRequest), @@ -1594,15 +1615,32 @@ impl Service { // Setting a node active unblocks any Reconcilers that might write to the location config API, // but those requests will not be accepted by the node until it has finished processing // the re-attach response. 
+ // + // Additionally, reset the nodes scheduling policy to match the conditional update done + // in [`Persistence::re_attach`]. if let Some(node) = nodes.get(&reattach_req.node_id) { - if !node.is_available() { + let reset_scheduling = matches!( + node.get_scheduling(), + NodeSchedulingPolicy::PauseForRestart + | NodeSchedulingPolicy::Draining + | NodeSchedulingPolicy::Filling + ); + + if !node.is_available() || reset_scheduling { let mut new_nodes = (**nodes).clone(); if let Some(node) = new_nodes.get_mut(&reattach_req.node_id) { - node.set_availability(NodeAvailability::Active(UtilizationScore::worst())); + if !node.is_available() { + node.set_availability(NodeAvailability::Active(UtilizationScore::worst())); + } + + if reset_scheduling { + node.set_scheduling(NodeSchedulingPolicy::Active); + } + scheduler.node_upsert(node); + let new_nodes = Arc::new(new_nodes); + *nodes = new_nodes; } - let new_nodes = Arc::new(new_nodes); - *nodes = new_nodes; } } @@ -1883,6 +1921,25 @@ impl Service { Ok(()) } + /// Same as [`Service::await_waiters`], but returns the waiters which are still + /// in progress + async fn await_waiters_remainder( + &self, + waiters: Vec, + timeout: Duration, + ) -> Vec { + let deadline = Instant::now().checked_add(timeout).unwrap(); + for waiter in waiters.iter() { + let timeout = deadline.duration_since(Instant::now()); + let _ = waiter.wait_timeout(timeout).await; + } + + waiters + .into_iter() + .filter(|waiter| matches!(waiter.get_status(), ReconcilerStatus::InProgress)) + .collect::>() + } + /// Part of [`Self::tenant_location_config`]: dissect an incoming location config request, /// and transform it into either a tenant creation of a series of shard updates. /// @@ -4164,6 +4221,18 @@ impl Service { Ok(nodes) } + pub(crate) async fn get_node(&self, node_id: NodeId) -> Result { + self.inner + .read() + .unwrap() + .nodes + .get(&node_id) + .cloned() + .ok_or(ApiError::NotFound( + format!("Node {node_id} not registered").into(), + )) + } + pub(crate) async fn node_register( &self, register_req: NodeRegisterRequest, @@ -4318,9 +4387,6 @@ impl Service { if let Some(scheduling) = scheduling { node.set_scheduling(scheduling); - - // TODO: once we have a background scheduling ticker for fill/drain, kick it - // to wake up and start working. 
} // Update the scheduler, in case the elegibility of the node for new shards has changed @@ -4411,7 +4477,7 @@ impl Service { // TODO: in the background, we should balance work back onto this pageserver } AvailabilityTransition::Unchanged => { - tracing::debug!("Node {} no change during config", node_id); + tracing::debug!("Node {} no availability change during config", node_id); } } @@ -4420,6 +4486,201 @@ impl Service { Ok(()) } + pub(crate) async fn start_node_drain( + self: &Arc, + node_id: NodeId, + ) -> Result<(), ApiError> { + let (ongoing_op, node_available, node_policy, schedulable_nodes_count) = { + let locked = self.inner.read().unwrap(); + let nodes = &locked.nodes; + let node = nodes.get(&node_id).ok_or(ApiError::NotFound( + anyhow::anyhow!("Node {} not registered", node_id).into(), + ))?; + let schedulable_nodes_count = nodes + .iter() + .filter(|(_, n)| matches!(n.may_schedule(), MaySchedule::Yes(_))) + .count(); + + ( + locked + .ongoing_operation + .as_ref() + .map(|ongoing| ongoing.operation), + node.is_available(), + node.get_scheduling(), + schedulable_nodes_count, + ) + }; + + if let Some(ongoing) = ongoing_op { + return Err(ApiError::PreconditionFailed( + format!("Background operation already ongoing for node: {}", ongoing).into(), + )); + } + + if !node_available { + return Err(ApiError::ResourceUnavailable( + format!("Node {node_id} is currently unavailable").into(), + )); + } + + if schedulable_nodes_count == 0 { + return Err(ApiError::PreconditionFailed( + "No other schedulable nodes to drain to".into(), + )); + } + + match node_policy { + NodeSchedulingPolicy::Active | NodeSchedulingPolicy::Pause => { + self.node_configure(node_id, None, Some(NodeSchedulingPolicy::Draining)) + .await?; + + let cancel = CancellationToken::new(); + + self.inner.write().unwrap().ongoing_operation = Some(OperationHandler { + operation: Operation::Drain(Drain { node_id }), + cancel: cancel.clone(), + }); + + tokio::task::spawn({ + let service = self.clone(); + let cancel = cancel.clone(); + async move { + scopeguard::defer! 
{ + let prev = service.inner.write().unwrap().ongoing_operation.take(); + + if let Some(Operation::Drain(removed_drain)) = prev.map(|h| h.operation) { + assert_eq!(removed_drain.node_id, node_id, "We always take the same operation"); + } else { + panic!("We always remove the same operation") + } + } + + tracing::info!(%node_id, "Drain background operation starting"); + let res = service.drain_node(node_id, cancel).await; + match res { + Ok(()) => { + tracing::info!(%node_id, "Drain background operation completed successfully"); + } + Err(OperationError::Cancelled) => { + tracing::info!(%node_id, "Drain background operation was cancelled"); + } + Err(err) => { + tracing::error!(%node_id, "Drain background operation encountered: {err}") + } + } + } + }); + } + NodeSchedulingPolicy::Draining => { + return Err(ApiError::Conflict(format!( + "Node {node_id} has drain in progress" + ))); + } + policy => { + return Err(ApiError::PreconditionFailed( + format!("Node {node_id} cannot be drained due to {policy:?} policy").into(), + )); + } + } + + Ok(()) + } + + pub(crate) async fn start_node_fill(self: &Arc, node_id: NodeId) -> Result<(), ApiError> { + let (ongoing_op, node_available, node_policy, total_nodes_count) = { + let locked = self.inner.read().unwrap(); + let nodes = &locked.nodes; + let node = nodes.get(&node_id).ok_or(ApiError::NotFound( + anyhow::anyhow!("Node {} not registered", node_id).into(), + ))?; + + ( + locked + .ongoing_operation + .as_ref() + .map(|ongoing| ongoing.operation), + node.is_available(), + node.get_scheduling(), + nodes.len(), + ) + }; + + if let Some(ongoing) = ongoing_op { + return Err(ApiError::PreconditionFailed( + format!("Background operation already ongoing for node: {}", ongoing).into(), + )); + } + + if !node_available { + return Err(ApiError::ResourceUnavailable( + format!("Node {node_id} is currently unavailable").into(), + )); + } + + if total_nodes_count <= 1 { + return Err(ApiError::PreconditionFailed( + "No other nodes to fill from".into(), + )); + } + + match node_policy { + NodeSchedulingPolicy::Active => { + self.node_configure(node_id, None, Some(NodeSchedulingPolicy::Filling)) + .await?; + + let cancel = CancellationToken::new(); + + self.inner.write().unwrap().ongoing_operation = Some(OperationHandler { + operation: Operation::Fill(Fill { node_id }), + cancel: cancel.clone(), + }); + + tokio::task::spawn({ + let service = self.clone(); + let cancel = cancel.clone(); + async move { + scopeguard::defer! 
{ + let prev = service.inner.write().unwrap().ongoing_operation.take(); + + if let Some(Operation::Fill(removed_fill)) = prev.map(|h| h.operation) { + assert_eq!(removed_fill.node_id, node_id, "We always take the same operation"); + } else { + panic!("We always remove the same operation") + } + } + + tracing::info!(%node_id, "Fill background operation starting"); + let res = service.fill_node(node_id, cancel).await; + match res { + Ok(()) => { + tracing::info!(%node_id, "Fill background operation completed successfully"); + } + Err(OperationError::Cancelled) => { + tracing::info!(%node_id, "Fill background operation was cancelled"); + } + Err(err) => { + tracing::error!(%node_id, "Fill background operation encountered: {err}") + } + } + } + }); + } + NodeSchedulingPolicy::Filling => { + return Err(ApiError::Conflict(format!( + "Node {node_id} has fill in progress" + ))); + } + policy => { + return Err(ApiError::PreconditionFailed( + format!("Node {node_id} cannot be filled due to {policy:?} policy").into(), + )); + } + } + + Ok(()) + } + /// Helper for methods that will try and call pageserver APIs for /// a tenant, such as timeline CRUD: they cannot proceed unless the tenant /// is attached somewhere. @@ -5004,4 +5265,296 @@ impl Service { // to complete. self.gate.close().await; } + + /// Drain a node by moving the shards attached to it as primaries. + /// This is a long running operation and it should run as a separate Tokio task. + pub(crate) async fn drain_node( + &self, + node_id: NodeId, + cancel: CancellationToken, + ) -> Result<(), OperationError> { + let mut last_inspected_shard: Option = None; + let mut inspected_all_shards = false; + let mut waiters = Vec::new(); + let mut schedule_context = ScheduleContext::default(); + + while !inspected_all_shards { + if cancel.is_cancelled() { + return Err(OperationError::Cancelled); + } + + { + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + + let node = nodes.get(&node_id).ok_or(OperationError::NodeStateChanged( + format!("node {node_id} was removed").into(), + ))?; + + let current_policy = node.get_scheduling(); + if !matches!(current_policy, NodeSchedulingPolicy::Draining) { + // TODO(vlad): maybe cancel pending reconciles before erroring out. 
need to think + // about it + return Err(OperationError::NodeStateChanged( + format!("node {node_id} changed state to {current_policy:?}").into(), + )); + } + + let mut cursor = tenants.iter_mut().skip_while({ + let skip_past = last_inspected_shard; + move |(tid, _)| match skip_past { + Some(last) => **tid != last, + None => false, + } + }); + + while waiters.len() < MAX_RECONCILES_PER_OPERATION { + let (tid, tenant_shard) = match cursor.next() { + Some(some) => some, + None => { + inspected_all_shards = true; + break; + } + }; + + if tenant_shard.intent.demote_attached(scheduler, node_id) { + match tenant_shard.schedule(scheduler, &mut schedule_context) { + Err(e) => { + tracing::warn!( + tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(), + "Scheduling error when draining pageserver {} : {e}", node_id + ); + } + Ok(()) => { + let scheduled_to = tenant_shard.intent.get_attached(); + tracing::info!( + tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(), + "Rescheduled shard while draining node {}: {} -> {:?}", + node_id, + node_id, + scheduled_to + ); + + let waiter = self.maybe_reconcile_shard(tenant_shard, nodes); + if let Some(some) = waiter { + waiters.push(some); + } + } + } + } + + last_inspected_shard = Some(*tid); + } + } + + waiters = self + .await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT) + .await; + } + + while !waiters.is_empty() { + tracing::info!("Awaiting {} pending drain reconciliations", waiters.len()); + + waiters = self + .await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT) + .await; + } + + // At this point we have done the best we could to drain shards from this node. + // Set the node scheduling policy to `[NodeSchedulingPolicy::PauseForRestart]` + // to complete the drain. + if let Err(err) = self + .node_configure(node_id, None, Some(NodeSchedulingPolicy::PauseForRestart)) + .await + { + // This is not fatal. Anything that is polling the node scheduling policy to detect + // the end of the drain operations will hang, but all such places should enforce an + // overall timeout. The scheduling policy will be updated upon node re-attach and/or + // by the counterpart fill operation. + return Err(OperationError::FinalizeError( + format!( + "Failed to finalise drain of {node_id} by setting scheduling policy to PauseForRestart: {err}" + ) + .into(), + )); + } + + Ok(()) + } + + /// Create a node fill plan (pick secondaries to promote) that meets the following requirements: + /// 1. The node should be filled until it reaches the expected cluster average of + /// attached shards. If there are not enough secondaries on the node, the plan stops early. + /// 2. Select tenant shards to promote such that the number of attached shards is balanced + /// throughout the cluster. We achieve this by picking tenant shards from each node, + /// starting from the ones with the largest number of attached shards, until the node + /// reaches the expected cluster average. 
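+    ///
+    /// Illustrative example (not from the patch itself): with attached shard counts
+    /// {A: 10, B: 8, C: 0} and node C being filled, the expected average is 18 / 3 = 6,
+    /// so C's fill requirement is 6; the plan takes up to 4 shards currently attached
+    /// to A (and with a secondary on C) first, then up to 2 attached to B.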
+ fn fill_node_plan(&self, node_id: NodeId) -> Vec { + let mut locked = self.inner.write().unwrap(); + let fill_requirement = locked.scheduler.compute_fill_requirement(node_id); + + let mut tids_by_node = locked + .tenants + .iter_mut() + .filter_map(|(tid, tenant_shard)| { + if tenant_shard.intent.get_secondary().contains(&node_id) { + if let Some(primary) = tenant_shard.intent.get_attached() { + return Some((*primary, *tid)); + } + } + + None + }) + .into_group_map(); + + let expected_attached = locked.scheduler.expected_attached_shard_count(); + let nodes_by_load = locked.scheduler.nodes_by_attached_shard_count(); + + let mut plan = Vec::new(); + for (node_id, attached) in nodes_by_load { + if plan.len() >= fill_requirement + || tids_by_node.is_empty() + || attached <= expected_attached + { + break; + } + + let can_take = attached - expected_attached; + let mut remove_node = false; + for _ in 0..can_take { + match tids_by_node.get_mut(&node_id) { + Some(tids) => match tids.pop() { + Some(tid) => { + plan.push(tid); + } + None => { + remove_node = true; + break; + } + }, + None => { + break; + } + } + } + + if remove_node { + tids_by_node.remove(&node_id); + } + } + + plan + } + + /// Fill a node by promoting its secondaries until the cluster is balanced + /// with regards to attached shard counts. Note that this operation only + /// makes sense as a counterpart to the drain implemented in [`Service::drain_node`]. + /// This is a long running operation and it should run as a separate Tokio task. + pub(crate) async fn fill_node( + &self, + node_id: NodeId, + cancel: CancellationToken, + ) -> Result<(), OperationError> { + // TODO(vlad): Currently this operates on the assumption that all + // secondaries are warm. This is not always true (e.g. we just migrated the + // tenant). Take that into consideration by checking the secondary status. + let mut tids_to_promote = self.fill_node_plan(node_id); + + let mut waiters = Vec::new(); + let mut schedule_context = ScheduleContext::default(); + + // Execute the plan we've composed above. Before aplying each move from the plan, + // we validate to ensure that it has not gone stale in the meantime. + while !tids_to_promote.is_empty() { + if cancel.is_cancelled() { + return Err(OperationError::Cancelled); + } + + { + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + + let node = nodes.get(&node_id).ok_or(OperationError::NodeStateChanged( + format!("node {node_id} was removed").into(), + ))?; + + let current_policy = node.get_scheduling(); + if !matches!(current_policy, NodeSchedulingPolicy::Filling) { + // TODO(vlad): maybe cancel pending reconciles before erroring out. need to think + // about it + return Err(OperationError::NodeStateChanged( + format!("node {node_id} changed state to {current_policy:?}").into(), + )); + } + + while waiters.len() < MAX_RECONCILES_PER_OPERATION { + if let Some(tid) = tids_to_promote.pop() { + if let Some(tenant_shard) = tenants.get_mut(&tid) { + // If the node being filled is not a secondary anymore, + // skip the promotion. 
+ if !tenant_shard.intent.get_secondary().contains(&node_id) { + continue; + } + + let previously_attached_to = *tenant_shard.intent.get_attached(); + + tenant_shard.intent.promote_attached(scheduler, node_id); + match tenant_shard.schedule(scheduler, &mut schedule_context) { + Err(e) => { + tracing::warn!( + tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(), + "Scheduling error when filling pageserver {} : {e}", node_id + ); + } + Ok(()) => { + tracing::info!( + tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(), + "Rescheduled shard while filling node {}: {:?} -> {}", + node_id, + previously_attached_to, + node_id + ); + + if let Some(waiter) = + self.maybe_reconcile_shard(tenant_shard, nodes) + { + waiters.push(waiter); + } + } + } + } + } else { + break; + } + } + } + + waiters = self + .await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT) + .await; + } + + while !waiters.is_empty() { + tracing::info!("Awaiting {} pending fill reconciliations", waiters.len()); + + waiters = self + .await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT) + .await; + } + + if let Err(err) = self + .node_configure(node_id, None, Some(NodeSchedulingPolicy::Active)) + .await + { + // This isn't a huge issue since the filling process starts upon request. However, it + // will prevent the next drain from starting. The only case in which this can fail + // is database unavailability. Such a case will require manual intervention. + return Err(OperationError::FinalizeError( + format!("Failed to finalise fill of {node_id} by setting scheduling policy to Active: {err}") + .into(), + )); + } + + Ok(()) + } } diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 77bbf4c604..d1b632755f 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -10,7 +10,9 @@ use crate::{ reconciler::ReconcileUnits, scheduler::{AffinityScore, MaySchedule, RefCountUpdate, ScheduleContext}, }; -use pageserver_api::controller_api::{PlacementPolicy, ShardSchedulingPolicy}; +use pageserver_api::controller_api::{ + NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, +}; use pageserver_api::{ models::{LocationConfig, LocationConfigMode, TenantConfig}, shard::{ShardIdentity, TenantShardId}, @@ -311,6 +313,12 @@ pub(crate) struct ReconcilerWaiter { seq: Sequence, } +pub(crate) enum ReconcilerStatus { + Done, + Failed, + InProgress, +} + #[derive(thiserror::Error, Debug)] pub(crate) enum ReconcileWaitError { #[error("Timeout waiting for shard {0}")] @@ -373,6 +381,16 @@ impl ReconcilerWaiter { Ok(()) } + + pub(crate) fn get_status(&self) -> ReconcilerStatus { + if self.seq_wait.would_wait_for(self.seq).is_err() { + ReconcilerStatus::Done + } else if self.error_seq_wait.would_wait_for(self.seq).is_err() { + ReconcilerStatus::Failed + } else { + ReconcilerStatus::InProgress + } + } } /// Having spawned a reconciler task, the tenant shard's state will carry enough @@ -652,13 +670,17 @@ impl TenantShard { let mut scores = all_pageservers .iter() .flat_map(|node_id| { - if matches!( - nodes - .get(node_id) - .map(|n| n.may_schedule()) - .unwrap_or(MaySchedule::No), - MaySchedule::No + let node = nodes.get(node_id); + if node.is_none() { + None + } else if matches!( + node.unwrap().get_scheduling(), + NodeSchedulingPolicy::Filling ) { + // If the node is currently filling, don't count it as a candidate to avoid, + // racing with the background fill. 
+ None + } else if matches!(node.unwrap().may_schedule(), MaySchedule::No) { None } else { let affinity_score = schedule_context.get_node_affinity(*node_id); diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index aa55b6e4cb..bad93ff39a 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2213,6 +2213,30 @@ class NeonStorageController(MetricsGetter, LogUtils): headers=self.headers(TokenScope.ADMIN), ) + def node_drain(self, node_id): + log.info(f"node_drain({node_id})") + self.request( + "PUT", + f"{self.env.storage_controller_api}/control/v1/node/{node_id}/drain", + headers=self.headers(TokenScope.ADMIN), + ) + + def node_fill(self, node_id): + log.info(f"node_fill({node_id})") + self.request( + "PUT", + f"{self.env.storage_controller_api}/control/v1/node/{node_id}/fill", + headers=self.headers(TokenScope.ADMIN), + ) + + def node_status(self, node_id): + response = self.request( + "GET", + f"{self.env.storage_controller_api}/control/v1/node/{node_id}", + headers=self.headers(TokenScope.ADMIN), + ) + return response.json() + def node_list(self): response = self.request( "GET", diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 8624a45f45..30f96ceee8 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -40,7 +40,7 @@ from werkzeug.wrappers.response import Response def get_node_shard_counts(env: NeonEnv, tenant_ids): - counts: defaultdict[str, int] = defaultdict(int) + counts: defaultdict[int, int] = defaultdict(int) for tid in tenant_ids: for shard in env.storage_controller.locate(tid): counts[shard["node_id"]] += 1 @@ -1502,3 +1502,120 @@ def test_tenant_import(neon_env_builder: NeonEnvBuilder, shard_count, remote_sto workload = Workload(env, tenant_id, timeline, branch_name=branch) workload.expect_rows = expect_rows workload.validate() + + +def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): + """ + Graceful reststart of storage controller clusters use the drain and + fill hooks in order to migrate attachments away from pageservers before + restarting. In practice, Ansible will drive this process. + """ + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + env.start() + + tenant_count = 5 + shard_count_per_tenant = 8 + total_shards = tenant_count * shard_count_per_tenant + tenant_ids = [] + + for _ in range(0, tenant_count): + tid = TenantId.generate() + tenant_ids.append(tid) + env.neon_cli.create_tenant( + tid, placement_policy='{"Attached":1}', shard_count=shard_count_per_tenant + ) + + # Give things a chance to settle. + # A call to `reconcile_until_idle` could be used here instead, + # however since all attachments are placed on the same node, + # we'd have to wait for a long time (2 minutes-ish) for optimizations + # to quiesce. + # TODO: once the initial attachment selection is fixed, update this + # to use `reconcile_until_idle`. 
+ time.sleep(2) + + nodes = env.storage_controller.node_list() + assert len(nodes) == 2 + + def retryable_node_operation(op, ps_id, max_attempts, backoff): + while max_attempts > 0: + try: + op(ps_id) + return + except StorageControllerApiException as e: + max_attempts -= 1 + log.info(f"Operation failed ({max_attempts} attempts left): {e}") + + if max_attempts == 0: + raise e + + time.sleep(backoff) + + def poll_node_status(node_id, desired_scheduling_policy, max_attempts, backoff): + log.info(f"Polling {node_id} for {desired_scheduling_policy} scheduling policy") + while max_attempts > 0: + try: + status = env.storage_controller.node_status(node_id) + policy = status["scheduling"] + if policy == desired_scheduling_policy: + return + else: + max_attempts -= 1 + log.info(f"Status call returned {policy=} ({max_attempts} attempts left)") + + if max_attempts == 0: + raise AssertionError( + f"Status for {node_id=} did not reach {desired_scheduling_policy=}" + ) + + time.sleep(backoff) + except StorageControllerApiException as e: + max_attempts -= 1 + log.info(f"Status call failed ({max_attempts} retries left): {e}") + + if max_attempts == 0: + raise e + + time.sleep(backoff) + + def assert_shard_counts_balanced(env: NeonEnv, shard_counts, total_shards): + # Assert that all nodes have some attached shards + assert len(shard_counts) == len(env.pageservers) + + min_shard_count = min(shard_counts.values()) + max_shard_count = max(shard_counts.values()) + + flake_factor = 5 / 100 + assert max_shard_count - min_shard_count <= int(total_shards * flake_factor) + + # Perform a graceful rolling restart + for ps in env.pageservers: + retryable_node_operation( + lambda ps_id: env.storage_controller.node_drain(ps_id), ps.id, max_attempts=3, backoff=2 + ) + poll_node_status(ps.id, "PauseForRestart", max_attempts=6, backoff=5) + + shard_counts = get_node_shard_counts(env, tenant_ids) + log.info(f"Shard counts after draining node {ps.id}: {shard_counts}") + # Assert that we've drained the node + assert shard_counts[ps.id] == 0 + # Assert that those shards actually went somewhere + assert sum(shard_counts.values()) == total_shards + + ps.restart() + poll_node_status(ps.id, "Active", max_attempts=10, backoff=1) + + retryable_node_operation( + lambda ps_id: env.storage_controller.node_fill(ps_id), ps.id, max_attempts=3, backoff=2 + ) + poll_node_status(ps.id, "Active", max_attempts=6, backoff=5) + + shard_counts = get_node_shard_counts(env, tenant_ids) + log.info(f"Shard counts after filling node {ps.id}: {shard_counts}") + assert_shard_counts_balanced(env, shard_counts, total_shards) + + # Now check that shards are reasonably balanced + shard_counts = get_node_shard_counts(env, tenant_ids) + log.info(f"Shard counts after rolling restart: {shard_counts}") + assert_shard_counts_balanced(env, shard_counts, total_shards) From e7d62a257d8c56e2289733a9890557f9dbff93cb Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 19 Jun 2024 11:55:59 +0100 Subject: [PATCH 1005/1571] test: fix tenant duplication utility generation numbers (#8096) ## Problem We have this set of test utilities which duplicate a tenant by copying everything that's in remote storage and then attaching a tenant to the pageserver and storage controller. When the "copied tenants" are created on the storage controller, they start off from generation number 0. This means that they can't see anything past that generation. 
This issues has existed ever since generation numbers have been introduced, but we've largely been lucky for the generation to stay stable during the template tenant creation. ## Summary of Changes Extend the storage controller debug attach hook to accept a generation override. Use that in the tenant duplication logic to set the generation number to something greater than the naturally reached generation. This allows the tenants to see all layer files. --- control_plane/src/storage_controller.rs | 2 ++ storage_controller/src/service.rs | 3 ++- test_runner/fixtures/neon_fixtures.py | 16 ++++++++++++++-- test_runner/fixtures/pageserver/many_tenants.py | 2 ++ 4 files changed, 20 insertions(+), 3 deletions(-) diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index b6b7ea7762..72948e203f 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -46,6 +46,7 @@ const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16; pub struct AttachHookRequest { pub tenant_shard_id: TenantShardId, pub node_id: Option, + pub generation_override: Option, } #[derive(Serialize, Deserialize)] @@ -440,6 +441,7 @@ impl StorageController { let request = AttachHookRequest { tenant_shard_id, node_id: Some(pageserver_id), + generation_override: None, }; let response = self diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index c94af113db..181e262638 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1234,13 +1234,14 @@ impl Service { let locked = self.inner.write().unwrap(); !locked.tenants.contains_key(&attach_req.tenant_shard_id) }; + if insert { let tsp = TenantShardPersistence { tenant_id: attach_req.tenant_shard_id.tenant_id.to_string(), shard_number: attach_req.tenant_shard_id.shard_number.0 as i32, shard_count: attach_req.tenant_shard_id.shard_count.literal() as i32, shard_stripe_size: 0, - generation: Some(0), + generation: attach_req.generation_override.or(Some(0)), generation_pageserver: None, placement_policy: serde_json::to_string(&PlacementPolicy::Attached(0)).unwrap(), config: serde_json::to_string(&TenantConfig::default()).unwrap(), diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index bad93ff39a..8994db8cf2 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2159,12 +2159,19 @@ class NeonStorageController(MetricsGetter, LogUtils): return time.time() - t1 def attach_hook_issue( - self, tenant_shard_id: Union[TenantId, TenantShardId], pageserver_id: int + self, + tenant_shard_id: Union[TenantId, TenantShardId], + pageserver_id: int, + generation_override: Optional[int] = None, ) -> int: + body = {"tenant_shard_id": str(tenant_shard_id), "node_id": pageserver_id} + if generation_override is not None: + body["generation_override"] = generation_override + response = self.request( "POST", f"{self.env.storage_controller_api}/debug/v1/attach-hook", - json={"tenant_shard_id": str(tenant_shard_id), "node_id": pageserver_id}, + json=body, headers=self.headers(TokenScope.ADMIN), ) gen = response.json()["gen"] @@ -2635,6 +2642,7 @@ class NeonPageserver(PgProtocol, LogUtils): config: None | Dict[str, Any] = None, config_null: bool = False, generation: Optional[int] = None, + override_storage_controller_generation: bool = False, ): """ Tenant attachment passes through here to acquire a generation number before proceeding @@ -2643,6 +2651,10 @@ class NeonPageserver(PgProtocol, 
LogUtils): client = self.http_client() if generation is None: generation = self.env.storage_controller.attach_hook_issue(tenant_id, self.id) + elif override_storage_controller_generation: + generation = self.env.storage_controller.attach_hook_issue( + tenant_id, self.id, generation + ) return client.tenant_attach( tenant_id, config, diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py index def80a1c3e..8730d8ef75 100644 --- a/test_runner/fixtures/pageserver/many_tenants.py +++ b/test_runner/fixtures/pageserver/many_tenants.py @@ -66,6 +66,8 @@ def single_timeline( env.pageserver.tenant_attach( tenant, config=template_config.copy(), + generation=100, + override_storage_controller_generation=True, ) time.sleep(0.1) wait_until_tenant_state(ps_http, tenant, "Broken", 10) From 438fd2aaf32a682b4cd1175f6e992190321490e1 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 19 Jun 2024 13:59:36 +0200 Subject: [PATCH 1006/1571] neon_local: `background_process`: launch all processes in repo dir (or `datadir`) (#8058) Before this PR, storage controller and broker would run in the PWD of neon_local, i.e., most likely the checkout of neon.git. With this PR, the shared infrastructure for background processes sets the PWD. Benefits: * easy listing of processes in a repo dir using `lsof`, see added comment in the code * coredumps go in the right directory (next to the process) * generally matching common expectations, I think Changes: * set the working directory in `background_process` module * drive-by: fix reliance of storage_controller on NEON_REPO_DIR being set by neon_local for the local compute hook to work correctly --- control_plane/src/background_process.rs | 11 +++++++- control_plane/src/bin/neon_local.rs | 6 +++-- control_plane/src/local_env.rs | 35 +++++++++++++++++-------- control_plane/src/storage_controller.rs | 10 ++++--- storage_controller/src/compute_hook.rs | 8 +++++- storage_controller/src/main.rs | 8 ++++++ storage_controller/src/service.rs | 4 +++ 7 files changed, 63 insertions(+), 19 deletions(-) diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index 94666f2870..3f4ddbdb2b 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -69,6 +69,9 @@ where // Not generic AsRef, otherwise empty `envs` prevents type inference EI: IntoIterator, { + if !datadir.metadata().context("stat datadir")?.is_dir() { + anyhow::bail!("`datadir` must be a directory when calling this function: {datadir:?}"); + } let log_path = datadir.join(format!("{process_name}.log")); let process_log_file = fs::OpenOptions::new() .create(true) @@ -85,7 +88,13 @@ where let background_command = command .stdout(process_log_file) .stderr(same_file_for_stderr) - .args(args); + .args(args) + // spawn all child processes in their datadir, useful for all kinds of things, + // not least cleaning up child processes e.g. 
after an unclean exit from the test suite: + // ``` + // lsof -d cwd -a +D Users/cs/src/neon/test_output + // ``` + .current_dir(datadir); let filled_cmd = fill_env_vars_prefixed_neon(fill_remote_storage_secrets_vars( fill_rust_env_vars(background_command), diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 18e395e2b5..8fe959792b 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -87,7 +87,8 @@ fn main() -> Result<()> { handle_init(sub_args).map(Some) } else { // all other commands need an existing config - let mut env = LocalEnv::load_config().context("Error loading config")?; + let mut env = + LocalEnv::load_config(&local_env::base_path()).context("Error loading config")?; let original_env = env.clone(); let rt = tokio::runtime::Builder::new_current_thread() @@ -364,7 +365,8 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result { LocalEnv::init(init_conf, force) .context("materialize initial neon_local environment on disk")?; - Ok(LocalEnv::load_config().expect("freshly written config should be loadable")) + Ok(LocalEnv::load_config(&local_env::base_path()) + .expect("freshly written config should be loadable")) } /// The default pageserver is the one where CLI tenant/timeline operations are sent by default. diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 0edcf1be4e..6634274d2a 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -42,8 +42,8 @@ pub struct LocalEnv { // compute endpoints). // // This is not stored in the config file. Rather, this is the path where the - // config file itself is. It is read from the NEON_REPO_DIR env variable or - // '.neon' if not given. + // config file itself is. It is read from the NEON_REPO_DIR env variable which + // must be an absolute path. If the env var is not set, $PWD/.neon is used. pub base_data_dir: PathBuf, // Path to postgres distribution. It's expected that "bin", "include", @@ -431,9 +431,7 @@ impl LocalEnv { } /// Construct `Self` from on-disk state. - pub fn load_config() -> anyhow::Result { - let repopath = base_path(); - + pub fn load_config(repopath: &Path) -> anyhow::Result { if !repopath.exists() { bail!( "Neon config is not found in {}. 
You need to run 'neon_local init' first", @@ -461,7 +459,7 @@ impl LocalEnv { branch_name_mappings, } = on_disk_config; LocalEnv { - base_data_dir: repopath.clone(), + base_data_dir: repopath.to_owned(), pg_distrib_dir, neon_distrib_dir, default_tenant_id, @@ -482,7 +480,7 @@ impl LocalEnv { "we ensure this during deserialization" ); env.pageservers = { - let iter = std::fs::read_dir(&repopath).context("open dir")?; + let iter = std::fs::read_dir(repopath).context("open dir")?; let mut pageservers = Vec::new(); for res in iter { let dentry = res?; @@ -719,10 +717,25 @@ impl LocalEnv { } pub fn base_path() -> PathBuf { - match std::env::var_os("NEON_REPO_DIR") { - Some(val) => PathBuf::from(val), - None => PathBuf::from(".neon"), - } + let path = match std::env::var_os("NEON_REPO_DIR") { + Some(val) => { + let path = PathBuf::from(val); + if !path.is_absolute() { + // repeat the env var in the error because our default is always absolute + panic!("NEON_REPO_DIR must be an absolute path, got {path:?}"); + } + path + } + None => { + let pwd = std::env::current_dir() + // technically this can fail but it's quite unlikeley + .expect("determine current directory"); + let pwd_abs = pwd.canonicalize().expect("canonicalize current directory"); + pwd_abs.join(".neon") + } + }; + assert!(path.is_absolute()); + path } /// Generate a public/private key pair for JWT authentication diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 72948e203f..4f9f0ba794 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -314,15 +314,17 @@ impl StorageController { args.push(format!("--split-threshold={split_threshold}")) } + args.push(format!( + "--neon-local-repo-dir={}", + self.env.base_data_dir.display() + )); + background_process::start_process( COMMAND, &self.env.base_data_dir, &self.env.storage_controller_bin(), args, - [( - "NEON_REPO_DIR".to_string(), - self.env.base_data_dir.to_string_lossy().to_string(), - )], + [], background_process::InitialPidFile::Create(self.pid_file()), || async { match self.ready().await { diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index 9d326ef82d..a1d051f150 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -283,7 +283,13 @@ impl ComputeHook { // all calls to this function let _locked = self.neon_local_lock.lock().await; - let env = match LocalEnv::load_config() { + let Some(repo_dir) = self.config.neon_local_repo_dir.as_deref() else { + tracing::warn!( + "neon_local_repo_dir not set, likely a bug in neon_local; skipping compute update" + ); + return Ok(()); + }; + let env = match LocalEnv::load_config(repo_dir) { Ok(e) => e, Err(e) => { tracing::warn!("Couldn't load neon_local config, skipping compute update ({e})"); diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index ce8f8d0cdd..f1eb0b30fc 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -4,6 +4,7 @@ use clap::Parser; use diesel::Connection; use metrics::launch_timestamp::LaunchTimestamp; use metrics::BuildInfo; +use std::path::PathBuf; use std::sync::Arc; use storage_controller::http::make_router; use storage_controller::metrics::preinitialize_metrics; @@ -77,6 +78,12 @@ struct Cli { /// How long to wait for the initial database connection to be available. 
#[arg(long, default_value = "5s")] db_connect_timeout: humantime::Duration, + + /// `neon_local` sets this to the path of the neon_local repo dir. + /// Only relevant for testing. + // TODO: make `cfg(feature = "testing")` + #[arg(long)] + neon_local_repo_dir: Option, } enum StrictMode { @@ -260,6 +267,7 @@ async fn async_main() -> anyhow::Result<()> { .reconciler_concurrency .unwrap_or(RECONCILER_CONCURRENCY_DEFAULT), split_threshold: args.split_threshold, + neon_local_repo_dir: args.neon_local_repo_dir, }; // After loading secrets & config, but before starting anything else, apply database migrations diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 181e262638..8475bf46d2 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -2,6 +2,7 @@ use std::{ borrow::Cow, cmp::Ordering, collections::{BTreeMap, HashMap, HashSet}, + path::PathBuf, str::FromStr, sync::Arc, time::{Duration, Instant}, @@ -236,6 +237,9 @@ pub struct Config { /// How large must a shard grow in bytes before we split it? /// None disables auto-splitting. pub split_threshold: Option, + + // TODO: make this cfg(feature = "testing") + pub neon_local_repo_dir: Option, } impl From for ApiError { From 76aa6936e8e4303fff7809c6c2e9b9467086a0dd Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 19 Jun 2024 13:14:50 +0100 Subject: [PATCH 1007/1571] tests: make Endpoint.stop() thread safe (occasional flakes in `test_multi_attach`) (#8110) ## Problem Tests using the `Workload` helper would occasionally fail in a strange way, where the endpoint appears to try and stop twice concurrently, and the second stop fails because the pidfile is already gone. `test_multi_attach` suffered from this. Workload has a `__del__` that stops the endpoint, and python is destroying this object in a different thread than NeonEnv.stop is called, resulting in racing stop() calls. Endpoint has a `running` attribute that avoids calling neon_local's stop twice, but that doesn't help in the concurrent case. ## Summary of changes - Make `Endpoint.stop` thread safe with a simple lock held across the updates to `running` and the actual act of stopping it. One could also work around this by letting Workload.endpoint outlive the Workload, or making Workload a context manager, but this change feels most robust, as it avoids all test code having to know that it must not try and stop an endpoint from a destructor. --- test_runner/fixtures/neon_fixtures.py | 53 +++++++++++++++++---------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 8994db8cf2..49857d5151 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3446,6 +3446,12 @@ class Endpoint(PgProtocol, LogUtils): self.active_safekeepers: List[int] = list(map(lambda sk: sk.id, env.safekeepers)) # path to conf is /endpoints//pgdata/postgresql.conf + # This lock prevents concurrent start & stop operations, keeping `self.running` consistent + # with whether we're really running. Tests generally wouldn't try and do these concurrently, + # but endpoints are also stopped during test teardown, which might happen concurrently with + # destruction of objects in tests. 
+ self.lock = threading.Lock() + def http_client( self, auth_token: Optional[str] = None, retries: Optional[Retry] = None ) -> EndpointHttpClient: @@ -3516,14 +3522,15 @@ class Endpoint(PgProtocol, LogUtils): log.info(f"Starting postgres endpoint {self.endpoint_id}") - self.env.neon_cli.endpoint_start( - self.endpoint_id, - safekeepers=self.active_safekeepers, - remote_ext_config=remote_ext_config, - pageserver_id=pageserver_id, - allow_multiple=allow_multiple, - ) - self.running = True + with self.lock: + self.env.neon_cli.endpoint_start( + self.endpoint_id, + safekeepers=self.active_safekeepers, + remote_ext_config=remote_ext_config, + pageserver_id=pageserver_id, + allow_multiple=allow_multiple, + ) + self.running = True return self @@ -3615,15 +3622,20 @@ class Endpoint(PgProtocol, LogUtils): def stop(self, mode: str = "fast") -> "Endpoint": """ Stop the Postgres instance if it's running. + + Because test teardown might try and stop an endpoint concurrently with test code + stopping the endpoint, this method is thread safe + Returns self. """ - if self.running: - assert self.endpoint_id is not None - self.env.neon_cli.endpoint_stop( - self.endpoint_id, check_return_code=self.check_stop_result, mode=mode - ) - self.running = False + with self.lock: + if self.running: + assert self.endpoint_id is not None + self.env.neon_cli.endpoint_stop( + self.endpoint_id, check_return_code=self.check_stop_result, mode=mode + ) + self.running = False return self @@ -3633,12 +3645,13 @@ class Endpoint(PgProtocol, LogUtils): Returns self. """ - assert self.endpoint_id is not None - self.env.neon_cli.endpoint_stop( - self.endpoint_id, True, check_return_code=self.check_stop_result, mode=mode - ) - self.endpoint_id = None - self.running = False + with self.lock: + assert self.endpoint_id is not None + self.env.neon_cli.endpoint_stop( + self.endpoint_id, True, check_return_code=self.check_stop_result, mode=mode + ) + self.endpoint_id = None + self.running = False return self From b998b703158923c493a9ce359e8c81ec89706653 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 19 Jun 2024 13:34:15 +0100 Subject: [PATCH 1008/1571] proxy: reduce some per-task memory usage (#8095) ## Problem Some tasks are using around upwards of 10KB of memory at all times, sometimes having buffers that swing them up to 30MB. ## Summary of changes Split some of the async tasks in selective places and box them as appropriate to try and reduce the constant memory usage. Especially in the locations where the large future is only a small part of the total runtime of the task. Also, reduces the size of the CopyBuffer buffer size from 8KB to 1KB. In my local testing and in staging this had a minor improvement. 
sadly not the improvement I was hoping for :/ Might have more impact in production --- proxy/src/proxy.rs | 42 ++++----- proxy/src/proxy/copy_bidirectional.rs | 2 +- proxy/src/serverless.rs | 120 ++++++++++++++++---------- proxy/src/serverless/backend.rs | 2 +- proxy/src/serverless/conn_pool.rs | 2 +- proxy/src/serverless/sql_over_http.rs | 44 +++++----- proxy/src/serverless/websocket.rs | 4 +- 7 files changed, 124 insertions(+), 92 deletions(-) diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 95b46ae002..072f51958f 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -91,7 +91,7 @@ pub async fn task_main( let endpoint_rate_limiter2 = endpoint_rate_limiter.clone(); connections.spawn(async move { - let (socket, peer_addr) = match read_proxy_protocol(socket).await{ + let (socket, peer_addr) = match read_proxy_protocol(socket).await { Ok((socket, Some(addr))) => (socket, addr.ip()), Err(e) => { error!("per-client task finished with an error: {e:#}"); @@ -101,36 +101,38 @@ pub async fn task_main( error!("missing required client IP"); return; } - Ok((socket, None)) => (socket, peer_addr.ip()) + Ok((socket, None)) => (socket, peer_addr.ip()), }; match socket.inner.set_nodelay(true) { - Ok(()) => {}, + Ok(()) => {} Err(e) => { error!("per-client task finished with an error: failed to set socket option: {e:#}"); return; - }, + } }; let mut ctx = RequestMonitoring::new( - session_id, - peer_addr, - crate::metrics::Protocol::Tcp, - &config.region, - ); + session_id, + peer_addr, + crate::metrics::Protocol::Tcp, + &config.region, + ); let span = ctx.span.clone(); - let res = handle_client( - config, - &mut ctx, - cancellation_handler, - socket, - ClientMode::Tcp, - endpoint_rate_limiter2, - conn_gauge, - ) - .instrument(span.clone()) - .await; + let startup = Box::pin( + handle_client( + config, + &mut ctx, + cancellation_handler, + socket, + ClientMode::Tcp, + endpoint_rate_limiter2, + conn_gauge, + ) + .instrument(span.clone()), + ); + let res = startup.await; match res { Err(e) => { diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs index 4b09ebd8dc..aaf3688f21 100644 --- a/proxy/src/proxy/copy_bidirectional.rs +++ b/proxy/src/proxy/copy_bidirectional.rs @@ -98,7 +98,7 @@ pub(super) struct CopyBuffer { amt: u64, buf: Box<[u8]>, } -const DEFAULT_BUF_SIZE: usize = 8 * 1024; +const DEFAULT_BUF_SIZE: usize = 1024; impl CopyBuffer { pub(super) fn new() -> Self { diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index 24ee749e6e..efa999ed7d 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -27,14 +27,14 @@ use rand::SeedableRng; pub use reqwest_middleware::{ClientWithMiddleware, Error}; pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; use tokio::time::timeout; -use tokio_rustls::TlsAcceptor; +use tokio_rustls::{server::TlsStream, TlsAcceptor}; use tokio_util::task::TaskTracker; use crate::cancellation::CancellationHandlerMain; use crate::config::ProxyConfig; use crate::context::RequestMonitoring; use crate::metrics::Metrics; -use crate::protocol2::read_proxy_protocol; +use crate::protocol2::{read_proxy_protocol, ChainRW}; use crate::proxy::run_until_cancelled; use crate::rate_limiter::EndpointRateLimiter; use crate::serverless::backend::PoolingBackend; @@ -102,8 +102,6 @@ pub async fn task_main( let connections = tokio_util::task::task_tracker::TaskTracker::new(); connections.close(); // allows `connections.wait to complete` - let server = Builder::new(TokioExecutor::new()); - while 
let Some(res) = run_until_cancelled(ws_listener.accept(), &cancellation_token).await { let (conn, peer_addr) = res.context("could not accept TCP stream")?; if let Err(e) = conn.set_nodelay(true) { @@ -127,24 +125,50 @@ pub async fn task_main( } let conn_token = cancellation_token.child_token(); - let conn = connection_handler( - config, - backend.clone(), - connections.clone(), - cancellation_handler.clone(), - endpoint_rate_limiter.clone(), - conn_token.clone(), - server.clone(), - tls_acceptor.clone(), - conn, - peer_addr, - ) - .instrument(http_conn_span); + let tls_acceptor = tls_acceptor.clone(); + let backend = backend.clone(); + let connections2 = connections.clone(); + let cancellation_handler = cancellation_handler.clone(); + let endpoint_rate_limiter = endpoint_rate_limiter.clone(); + connections.spawn( + async move { + let conn_token2 = conn_token.clone(); + let _cancel_guard = config.http_config.cancel_set.insert(conn_id, conn_token2); - connections.spawn(async move { - let _cancel_guard = config.http_config.cancel_set.insert(conn_id, conn_token); - conn.await - }); + let session_id = uuid::Uuid::new_v4(); + + let _gauge = Metrics::get() + .proxy + .client_connections + .guard(crate::metrics::Protocol::Http); + + let startup_result = Box::pin(connection_startup( + config, + tls_acceptor, + session_id, + conn, + peer_addr, + )) + .await; + let Some((conn, peer_addr)) = startup_result else { + return; + }; + + Box::pin(connection_handler( + config, + backend, + connections2, + cancellation_handler, + endpoint_rate_limiter, + conn_token, + conn, + peer_addr, + session_id, + )) + .await; + } + .instrument(http_conn_span), + ); } connections.wait().await; @@ -152,40 +176,22 @@ pub async fn task_main( Ok(()) } -/// Handles the TCP lifecycle. -/// +/// Handles the TCP startup lifecycle. /// 1. Parses PROXY protocol V2 /// 2. Handles TLS handshake -/// 3. Handles HTTP connection -/// 1. With graceful shutdowns -/// 2. With graceful request cancellation with connection failure -/// 3. With websocket upgrade support. -#[allow(clippy::too_many_arguments)] -async fn connection_handler( - config: &'static ProxyConfig, - backend: Arc, - connections: TaskTracker, - cancellation_handler: Arc, - endpoint_rate_limiter: Arc, - cancellation_token: CancellationToken, - server: Builder, +async fn connection_startup( + config: &ProxyConfig, tls_acceptor: TlsAcceptor, + session_id: uuid::Uuid, conn: TcpStream, peer_addr: SocketAddr, -) { - let session_id = uuid::Uuid::new_v4(); - - let _gauge = Metrics::get() - .proxy - .client_connections - .guard(crate::metrics::Protocol::Http); - +) -> Option<(TlsStream>, IpAddr)> { // handle PROXY protocol let (conn, peer) = match read_proxy_protocol(conn).await { Ok(c) => c, Err(e) => { tracing::error!(?session_id, %peer_addr, "failed to accept TCP connection: invalid PROXY protocol V2 header: {e:#}"); - return; + return None; } }; @@ -208,7 +214,7 @@ async fn connection_handler( Metrics::get().proxy.tls_handshake_failures.inc(); } warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}"); - return; + return None; } // The handshake timed out Err(e) => { @@ -216,16 +222,36 @@ async fn connection_handler( Metrics::get().proxy.tls_handshake_failures.inc(); } warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}"); - return; + return None; } }; + Some((conn, peer_addr)) +} + +/// Handles HTTP connection +/// 1. With graceful shutdowns +/// 2. With graceful request cancellation with connection failure +/// 3. 
With websocket upgrade support. +#[allow(clippy::too_many_arguments)] +async fn connection_handler( + config: &'static ProxyConfig, + backend: Arc, + connections: TaskTracker, + cancellation_handler: Arc, + endpoint_rate_limiter: Arc, + cancellation_token: CancellationToken, + conn: TlsStream>, + peer_addr: IpAddr, + session_id: uuid::Uuid, +) { let session_id = AtomicTake::new(session_id); // Cancel all current inflight HTTP requests if the HTTP connection is closed. let http_cancellation_token = CancellationToken::new(); let _cancel_connection = http_cancellation_token.clone().drop_guard(); + let server = Builder::new(TokioExecutor::new()); let conn = server.serve_connection_with_upgrades( hyper_util::rt::TokioIo::new(conn), hyper1::service::service_fn(move |req: hyper1::Request| { diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index a40c66a80d..86e64c0a38 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -104,7 +104,7 @@ impl PoolingBackend { ) -> Result, HttpConnError> { let maybe_client = if !force_new { info!("pool: looking for an existing connection"); - self.pool.get(ctx, &conn_info).await? + self.pool.get(ctx, &conn_info)? } else { info!("pool: pool is disabled"); None diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 5fa253acf8..170bda062e 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -375,7 +375,7 @@ impl GlobalConnPool { } } - pub async fn get( + pub fn get( self: &Arc, ctx: &mut RequestMonitoring, conn_info: &ConnInfo, diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 9d6a475aeb..7a99aeb759 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -533,27 +533,31 @@ async fn handle_inner( return Err(SqlOverHttpError::RequestTooLarge); } - let fetch_and_process_request = async { - let body = request.into_body().collect().await?.to_bytes(); - info!(length = body.len(), "request payload read"); - let payload: Payload = serde_json::from_slice(&body)?; - Ok::(payload) // Adjust error type accordingly - } - .map_err(SqlOverHttpError::from); + let fetch_and_process_request = Box::pin( + async { + let body = request.into_body().collect().await?.to_bytes(); + info!(length = body.len(), "request payload read"); + let payload: Payload = serde_json::from_slice(&body)?; + Ok::(payload) // Adjust error type accordingly + } + .map_err(SqlOverHttpError::from), + ); - let authenticate_and_connect = async { - let keys = backend - .authenticate(ctx, &config.authentication_config, &conn_info) - .await?; - let client = backend - .connect_to_compute(ctx, conn_info, keys, !allow_pool) - .await?; - // not strictly necessary to mark success here, - // but it's just insurance for if we forget it somewhere else - ctx.latency_timer.success(); - Ok::<_, HttpConnError>(client) - } - .map_err(SqlOverHttpError::from); + let authenticate_and_connect = Box::pin( + async { + let keys = backend + .authenticate(ctx, &config.authentication_config, &conn_info) + .await?; + let client = backend + .connect_to_compute(ctx, conn_info, keys, !allow_pool) + .await?; + // not strictly necessary to mark success here, + // but it's just insurance for if we forget it somewhere else + ctx.latency_timer.success(); + Ok::<_, HttpConnError>(client) + } + .map_err(SqlOverHttpError::from), + ); let (payload, mut client) = match run_until_cancelled( // Run both operations in parallel diff --git 
a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs
index 7d3153a3c1..0e9772733d 100644
--- a/proxy/src/serverless/websocket.rs
+++ b/proxy/src/serverless/websocket.rs
@@ -141,7 +141,7 @@ pub async fn serve_websocket(
         .client_connections
         .guard(crate::metrics::Protocol::Ws);

-    let res = handle_client(
+    let res = Box::pin(handle_client(
         config,
         &mut ctx,
         cancellation_handler,
@@ -149,7 +149,7 @@ pub async fn serve_websocket(
         ClientMode::Websockets { hostname },
         endpoint_rate_limiter,
         conn_gauge,
-    )
+    ))
     .await;

     match res {

From 56da62487015f78c2cfbb48132bc85cd6f1f93d3 Mon Sep 17 00:00:00 2001
From: Peter Bendel
Date: Wed, 19 Jun 2024 15:04:29 +0200
Subject: [PATCH 1009/1571] allow storage_controller error during pagebench (#8109)

## Problem

`test_pageserver_max_throughput_getpage_at_latest_lsn` is a pagebench testcase which creates several tenants/timelines to verify pageserver performance.

The test swaps environments around in the tenant duplication stage, so the storage controller uses two separate db instances (one in the duplication stage and another one in the benchmarking stage).

In the benchmarking stage, the storage controller starts without any knowledge of nodes, but with knowledge of tenants (via attachments.json). When we re-attach and attempt to update the scheduler stats, the scheduler rightfully complains about the node not being known.

The setup should preserve the storage controller across the two envs, but I think it's fine to just allow-list the error in this case.

## Summary of changes

Add the error message `2024-06-19T09:38:27.866085Z ERROR Scheduler missing node 1` to the list of allowed errors for storage_controller.
---
 ...est_pageserver_max_throughput_getpage_at_latest_lsn.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
index 772a39fe35..68f3d9dcbe 100644
--- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
+++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
@@ -209,3 +209,11 @@ def run_benchmark_max_throughput_latest_lsn(
         unit="ms",
         report=MetricReport.LOWER_IS_BETTER,
     )
+
+    env.storage_controller.allowed_errors.append(
+        # The test setup swaps NeonEnv instances, hence different
+        # pg instances are used for the storage controller db. This means
+        # the storage controller doesn't know about the nodes mentioned
+        # in attachments.json at start-up.
+        ".* Scheduler missing node 1",
+    )

From fd0b22f5cd11d5df2013bd0d9c79cb70086b3fa8 Mon Sep 17 00:00:00 2001
From: MMeent
Date: Wed, 19 Jun 2024 15:05:31 +0200
Subject: [PATCH 1010/1571] Make sure we can handle temporarily offline PS when we first connect (#8094)

Fixes https://github.com/neondatabase/neon/issues/7897

## Problem

`shard->delay_us` was potentially uninitialized when we connect to PS, as it wasn't set to a non-0 value until we've first connected to the shard's pageserver.

That caused the exponential backoff to use an initial value (multiplier) of 0 for the first connection attempt to that pageserver, thus causing a hot retry loop with connection attempts to the pageserver without significant delay. That in turn caused attempts to reconnect to fail quickly, rather than showing the expected 'wait until pageserver is available' behaviour.
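To make the failure mode concrete, here is a minimal sketch of the backoff computation (illustrative Rust rather than the actual pgxn C code; `min_us`/`max_us` stand in for the real reconnect-interval constants). With a starting delay of 0, doubling never escapes zero, so every retry fires immediately:

```rust
// Illustrative sketch only, not the libpagestore.c implementation.
fn next_delay_us(delay_us: u64, min_us: u64, max_us: u64) -> u64 {
    // The fix described below: clamp an uninitialized (zero) delay to the minimum
    // before applying the exponential step.
    let base = if delay_us == 0 { min_us } else { delay_us };
    base.saturating_mul(2).min(max_us)
}
```

Without the `== 0` clamp, the computed wait stays at zero on every attempt, which is exactly the hot retry loop described above.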
## Summary of changes We initialize shard->delay_us before connection initialization if we notice it is not initialized yet. --- pgxn/neon/libpagestore.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 5eae2d8204..a665cafafe 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -381,6 +381,15 @@ pageserver_connect(shardno_t shard_no, int elevel) us_since_last_attempt = (int64) (now - shard->last_reconnect_time); shard->last_reconnect_time = now; + /* + * Make sure we don't do exponential backoff with a constant multiplier + * of 0 us, as that doesn't really do much for timeouts... + * + * cf. https://github.com/neondatabase/neon/issues/7897 + */ + if (shard->delay_us == 0) + shard->delay_us = MIN_RECONNECT_INTERVAL_USEC; + /* * If we did other tasks between reconnect attempts, then we won't * need to wait as long as a full delay. From f0e2bb79b22cb04994eeca64fbb88a3e17a37779 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 19 Jun 2024 17:07:14 +0100 Subject: [PATCH 1011/1571] tests: use semaphore instead of lock for Endpoint.running (#8112) ## Problem Ahem, let's try this again. https://github.com/neondatabase/neon/pull/8110 had a spooky failure in test_multi_attach where a call to Endpoint.stop() timed out waiting for a lock, even though we can see an earlier call completing and releasing the lock. I suspect something weird is going on with the way pytest runs tests across processes, or use of asyncio perhaps. Anyway: the simplest fix is to just use a semaphore instead: if we don't lock we can't deadlock. ## Summary of changes - Make Endpoint.running a semaphore, where we add a unit to its counter when starting the process and atomically decrement it when stopping. --- test_runner/fixtures/neon_fixtures.py | 50 ++++++++++++++------------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 49857d5151..b5d9a69d55 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3446,11 +3446,12 @@ class Endpoint(PgProtocol, LogUtils): self.active_safekeepers: List[int] = list(map(lambda sk: sk.id, env.safekeepers)) # path to conf is /endpoints//pgdata/postgresql.conf - # This lock prevents concurrent start & stop operations, keeping `self.running` consistent - # with whether we're really running. Tests generally wouldn't try and do these concurrently, - # but endpoints are also stopped during test teardown, which might happen concurrently with - # destruction of objects in tests. - self.lock = threading.Lock() + # Semaphore is set to 1 when we start, and acquire'd back to zero when we stop + # + # We use a semaphore rather than a bool so that racing calls to stop() don't + # try and stop the same process twice, as stop() is called by test teardown and + # potentially by some __del__ chains in other threads. 
+ self._running = threading.Semaphore(0) def http_client( self, auth_token: Optional[str] = None, retries: Optional[Retry] = None @@ -3522,15 +3523,14 @@ class Endpoint(PgProtocol, LogUtils): log.info(f"Starting postgres endpoint {self.endpoint_id}") - with self.lock: - self.env.neon_cli.endpoint_start( - self.endpoint_id, - safekeepers=self.active_safekeepers, - remote_ext_config=remote_ext_config, - pageserver_id=pageserver_id, - allow_multiple=allow_multiple, - ) - self.running = True + self.env.neon_cli.endpoint_start( + self.endpoint_id, + safekeepers=self.active_safekeepers, + remote_ext_config=remote_ext_config, + pageserver_id=pageserver_id, + allow_multiple=allow_multiple, + ) + self._running.release(1) return self @@ -3578,9 +3578,12 @@ class Endpoint(PgProtocol, LogUtils): conf_file.write("\n".join(hba) + "\n") conf_file.write(data) - if self.running: + if self.is_running(): self.safe_psql("SELECT pg_reload_conf()") + def is_running(self): + return self._running._value > 0 + def reconfigure(self, pageserver_id: Optional[int] = None): assert self.endpoint_id is not None self.env.neon_cli.endpoint_reconfigure(self.endpoint_id, self.tenant_id, pageserver_id) @@ -3629,13 +3632,12 @@ class Endpoint(PgProtocol, LogUtils): Returns self. """ - with self.lock: - if self.running: - assert self.endpoint_id is not None - self.env.neon_cli.endpoint_stop( - self.endpoint_id, check_return_code=self.check_stop_result, mode=mode - ) - self.running = False + running = self._running.acquire(blocking=False) + if running: + assert self.endpoint_id is not None + self.env.neon_cli.endpoint_stop( + self.endpoint_id, check_return_code=self.check_stop_result, mode=mode + ) return self @@ -3645,13 +3647,13 @@ class Endpoint(PgProtocol, LogUtils): Returns self. """ - with self.lock: + running = self._running.acquire(blocking=False) + if running: assert self.endpoint_id is not None self.env.neon_cli.endpoint_stop( self.endpoint_id, True, check_return_code=self.check_stop_result, mode=mode ) self.endpoint_id = None - self.running = False return self From 558a57b15b61fe2b0f18218c1314116b11e14c11 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 19 Jun 2024 17:54:07 +0100 Subject: [PATCH 1012/1571] CI(test-images): add dockerhub auth (#8115) ## Problem ``` Unable to find image 'neondatabase/neon:9583413584' locally docker: Error response from daemon: toomanyrequests: You have reached your pull rate limit. You may increase the limit by authenticating and upgrading: https://www.docker.com/increase-rate-limit. ``` ## Summary of changes - add `docker/login-action@v3` for `test-images` job --- .github/workflows/build_and_test.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 8c8500260c..e9adf28b99 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1023,6 +1023,18 @@ jobs: with: fetch-depth: 0 + # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings + # The default value is ~/.docker + - name: Set custom docker config directory + run: | + mkdir -p .docker-custom + echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV + + - uses: docker/login-action@v3 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + # `neondatabase/neon` contains multiple binaries, all of them use the same input for the version into the same version formatting library. 
# Pick pageserver as currently the only binary with extra "version" features printed in the string to verify. # Regular pageserver version string looks like @@ -1057,6 +1069,11 @@ jobs: docker compose -f ./docker-compose/docker-compose.yml logs || 0 docker compose -f ./docker-compose/docker-compose.yml down + - name: Remove custom docker config directory + if: always() + run: | + rm -rf .docker-custom + promote-images: needs: [ check-permissions, tag, test-images, vm-compute-node-image ] runs-on: ubuntu-22.04 From c789ec21f6053d4c25d2419c4a34ed298d5f69f5 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 19 Jun 2024 19:21:09 +0100 Subject: [PATCH 1013/1571] CI: miscellaneous cleanups (#8073) ## Problem There are a couple of small CI cleanups that seem too small for dedicated PRs ## Summary of changes - Create release PR with the title that matches the title in the description - Tune error message for disallowing `ubuntu-latest` to explicitly mention what to do - Remove junit output from pytest, we use allure instead --- .github/actions/run-python-test-set/action.yml | 4 +--- .github/workflows/actionlint.yml | 9 +++++---- .github/workflows/release.yml | 12 ++++++++---- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index d9e543d4bb..d5c1fcf524 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -183,8 +183,7 @@ runs: # Run the tests. # - # The junit.xml file allows CI tools to display more fine-grained test information - # in its "Tests" tab in the results page. + # --alluredir saves test results in Allure format (in a specified directory) # --verbose prints name of each test (helpful when there are # multiple tests in one file) # -rA prints summary in the end @@ -193,7 +192,6 @@ runs: # mkdir -p $TEST_OUTPUT/allure/results "${cov_prefix[@]}" ./scripts/pytest \ - --junitxml=$TEST_OUTPUT/junit.xml \ --alluredir=$TEST_OUTPUT/allure/results \ --tb=short \ --verbose \ diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml index 078c7f88c4..34fd8b1d15 100644 --- a/.github/workflows/actionlint.yml +++ b/.github/workflows/actionlint.yml @@ -36,15 +36,16 @@ jobs: fail_on_error: true filter_mode: nofilter level: error - - run: | + + - name: Disallow 'ubuntu-latest' runners + run: | PAT='^\s*runs-on:.*-latest' - if grep -ERq $PAT .github/workflows - then + if grep -ERq $PAT .github/workflows; then grep -ERl $PAT .github/workflows |\ while read -r f do l=$(grep -nE $PAT .github/workflows/release.yml | awk -F: '{print $1}' | head -1) - echo "::error file=$f,line=$l::Please, do not use ubuntu-latest images to run on, use LTS instead." 
+ echo "::error file=$f,line=$l::Please use 'ubuntu-22.04' instead of 'ubuntu-latest'" done exit 1 fi diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 90a3aaaf2d..56ef6f4bbb 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -52,13 +52,15 @@ jobs: env: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} run: | + TITLE="Storage & Compute release ${RELEASE_DATE}" + cat << EOF > body.md - ## Storage & Compute release ${RELEASE_DATE} + ## ${TITLE} **Please merge this Pull Request using 'Create a merge commit' button** EOF - gh pr create --title "Release ${RELEASE_DATE}" \ + gh pr create --title "${TITLE}" \ --body-file "body.md" \ --head "${RELEASE_BRANCH}" \ --base "release" @@ -91,13 +93,15 @@ jobs: env: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} run: | + TITLE="Proxy release ${RELEASE_DATE}" + cat << EOF > body.md - ## Proxy release ${RELEASE_DATE} + ## ${TITLE} **Please merge this Pull Request using 'Create a merge commit' button** EOF - gh pr create --title "Proxy release ${RELEASE_DATE}" \ + gh pr create --title "${TITLE}" \ --body-file "body.md" \ --head "${RELEASE_BRANCH}" \ --base "release-proxy" From 79401638df8e4c3efb41a6013316cfd5c9061a0a Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 20 Jun 2024 11:56:14 +0200 Subject: [PATCH 1014/1571] remove materialized page cache (#8105) part of Epic https://github.com/neondatabase/neon/issues/7386 # Motivation The materialized page cache adds complexity to the code base, which increases the maintenance burden and risk for subtle and hard to reproduce bugs such as #8050. Further, the best hit rate that we currently achieve in production is ca 1% of materialized page cache lookups for `task_kind=PageRequestHandler`. Other task kinds have hit rates <0.2%. Last, caching page images in Pageserver rewards under-sized caches in Computes because reading from Pageserver's materialized page cache over the network is often sufficiently fast (low hundreds of microseconds). Such Computes should upscale their local caches to fit their working set, rather than repeatedly requesting the same page from Pageserver. Some more discussion and context in internal thread https://neondb.slack.com/archives/C033RQ5SPDH/p1718714037708459 # Changes This PR removes the materialized page cache code & metrics. The infrastructure for different key kinds in `PageCache` is left in place, even though the "Immutable" key kind is the only remaining one. This can be further simplified in a future commit. Some tests started failing because their total runtime was dependent on high materialized page cache hit rates. This test makes them fixed-runtime or raises pytest timeouts: * test_local_file_cache_unlink * test_physical_replication * test_pg_regress # Performance I focussed on ensuring that this PR will not result in a performance regression in prod. * **getpage** requests: our production metrics have shown the materialized page cache to be irrelevant (low hit rate). Also, Pageserver is the wrong place to cache page images, it should happen in compute. * **ingest** (`task_kind=WalReceiverConnectionHandler`): prod metrics show 0 percent hit rate, so, removing will not be a regression. * **get_lsn_by_timestamp**: important API for branch creation, used by control pane. The clog pages that this code uses are not materialize-page-cached because they're not 8k. No risk of introducing a regression here. We will watch the various nightly benchmarks closely for more results before shipping to prod. 
--- docs/pageserver-pagecache.md | 1 - docs/settings.md | 2 +- pageserver/src/metrics.rs | 62 +--- pageserver/src/page_cache.rs | 317 +----------------- pageserver/src/tenant/timeline.rs | 77 +---- test_runner/fixtures/metrics.py | 2 - test_runner/regress/test_local_file_cache.py | 27 +- test_runner/regress/test_pg_regress.py | 6 +- .../regress/test_physical_replication.py | 17 +- 9 files changed, 43 insertions(+), 468 deletions(-) diff --git a/docs/pageserver-pagecache.md b/docs/pageserver-pagecache.md index d9b120bbb9..d022742dff 100644 --- a/docs/pageserver-pagecache.md +++ b/docs/pageserver-pagecache.md @@ -5,4 +5,3 @@ TODO: - shared across tenants - store pages from layer files - store pages from "in-memory layer" -- store materialized pages diff --git a/docs/settings.md b/docs/settings.md index 817f97d8ba..12a6a4c171 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -134,7 +134,7 @@ depends on that, so if you change it, bad things will happen. #### page_cache_size -Size of the page cache, to hold materialized page versions. Unit is +Size of the page cache. Unit is number of 8 kB blocks. The default is 8192, which means 64 MB. #### max_file_descriptors diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index e8a1e063c5..2992fef561 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -145,14 +145,6 @@ impl ReconstructTimeMetrics { } } -pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy = Lazy::new(|| { - register_int_counter!( - "pageserver_materialized_cache_hits_direct_total", - "Number of cache hits from materialized page cache without redo", - ) - .expect("failed to define a metric") -}); - pub(crate) struct ReconstructDataTimeMetrics { singular: Histogram, vectored: Histogram, @@ -182,14 +174,6 @@ pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy = } }); -pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy = Lazy::new(|| { - register_int_counter!( - "pageserver_materialized_cache_hits_total", - "Number of cache hits from materialized page cache", - ) - .expect("failed to define a metric") -}); - pub(crate) struct GetVectoredLatency { map: EnumMap>, } @@ -298,12 +282,8 @@ pub(crate) static SCAN_LATENCY: Lazy = Lazy::new(|| { }); pub(crate) struct PageCacheMetricsForTaskKind { - pub read_accesses_materialized_page: IntCounter, pub read_accesses_immutable: IntCounter, - pub read_hits_immutable: IntCounter, - pub read_hits_materialized_page_exact: IntCounter, - pub read_hits_materialized_page_older_lsn: IntCounter, } pub(crate) struct PageCacheMetrics { @@ -336,16 +316,6 @@ pub(crate) static PAGE_CACHE: Lazy = Lazy::new(|| PageCacheMet let content_kind = ::from_usize(content_kind); let content_kind: &'static str = content_kind.into(); PageCacheMetricsForTaskKind { - read_accesses_materialized_page: { - PAGE_CACHE_READ_ACCESSES - .get_metric_with_label_values(&[ - task_kind, - "materialized_page", - content_kind, - ]) - .unwrap() - }, - read_accesses_immutable: { PAGE_CACHE_READ_ACCESSES .get_metric_with_label_values(&[task_kind, "immutable", content_kind]) @@ -357,28 +327,6 @@ pub(crate) static PAGE_CACHE: Lazy = Lazy::new(|| PageCacheMet .get_metric_with_label_values(&[task_kind, "immutable", content_kind, "-"]) .unwrap() }, - - read_hits_materialized_page_exact: { - PAGE_CACHE_READ_HITS - .get_metric_with_label_values(&[ - task_kind, - "materialized_page", - content_kind, - "exact", - ]) - .unwrap() - }, - - read_hits_materialized_page_older_lsn: { - PAGE_CACHE_READ_HITS - .get_metric_with_label_values(&[ - task_kind, - 
"materialized_page", - content_kind, - "older_lsn", - ]) - .unwrap() - }, } })) })), @@ -394,7 +342,6 @@ pub(crate) struct PageCacheSizeMetrics { pub max_bytes: UIntGauge, pub current_bytes_immutable: UIntGauge, - pub current_bytes_materialized_page: UIntGauge, } static PAGE_CACHE_SIZE_CURRENT_BYTES: Lazy = Lazy::new(|| { @@ -420,11 +367,6 @@ pub(crate) static PAGE_CACHE_SIZE: Lazy = .get_metric_with_label_values(&["immutable"]) .unwrap() }, - current_bytes_materialized_page: { - PAGE_CACHE_SIZE_CURRENT_BYTES - .get_metric_with_label_values(&["materialized_page"]) - .unwrap() - }, }); pub(crate) mod page_cache_eviction_metrics { @@ -2918,13 +2860,11 @@ pub fn preinitialize_metrics() { // FIXME(4813): make it so that we have no top level metrics as this fn will easily fall out of // order: // - global metrics reside in a Lazy - // - access via crate::metrics::PS_METRICS.materialized_page_cache_hit.inc() + // - access via crate::metrics::PS_METRICS.some_metric.inc() // - could move the statics into TimelineMetrics::new()? // counters [ - &MATERIALIZED_PAGE_CACHE_HIT, - &MATERIALIZED_PAGE_CACHE_HIT_DIRECT, &UNEXPECTED_ONDEMAND_DOWNLOADS, &WALRECEIVER_STARTED_CONNECTIONS, &WALRECEIVER_BROKER_UPDATES, diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 529fb9bb07..f386c825b8 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -17,7 +17,6 @@ //! //! Two types of pages are supported: //! -//! * **Materialized pages**, filled & used by page reconstruction //! * **Immutable File pages**, filled & used by [`crate::tenant::block_io`] and [`crate::tenant::ephemeral_file`]. //! //! Note that [`crate::tenant::ephemeral_file::EphemeralFile`] is generally mutable, but, it's append-only. @@ -28,9 +27,6 @@ //! Page cache maps from a cache key to a buffer slot. //! The cache key uniquely identifies the piece of data that is being cached. //! -//! The cache key for **materialized pages** is [`TenantShardId`], [`TimelineId`], [`Key`], and [`Lsn`]. -//! Use [`PageCache::memorize_materialized_page`] and [`PageCache::lookup_materialized_page`] for fill & access. -//! //! The cache key for **immutable file** pages is [`FileId`] and a block number. //! Users of page cache that wish to page-cache an arbitrary (immutable!) on-disk file do the following: //! * Have a mechanism to deterministically associate the on-disk file with a [`FileId`]. @@ -82,13 +78,10 @@ use std::{ use anyhow::Context; use once_cell::sync::OnceCell; -use pageserver_api::shard::TenantShardId; -use utils::{id::TimelineId, lsn::Lsn}; use crate::{ context::RequestContext, metrics::{page_cache_eviction_metrics, PageCacheSizeMetrics}, - repository::Key, }; static PAGE_CACHE: OnceCell = OnceCell::new(); @@ -139,33 +132,7 @@ pub fn next_file_id() -> FileId { #[derive(Debug, PartialEq, Eq, Clone)] #[allow(clippy::enum_variant_names)] enum CacheKey { - MaterializedPage { - hash_key: MaterializedPageHashKey, - lsn: Lsn, - }, - ImmutableFilePage { - file_id: FileId, - blkno: u32, - }, -} - -#[derive(Debug, PartialEq, Eq, Hash, Clone)] -struct MaterializedPageHashKey { - /// Why is this TenantShardId rather than TenantId? - /// - /// Usually, the materialized value of a page@lsn is identical on any shard in the same tenant. However, this - /// this not the case for certain internally-generated pages (e.g. relation sizes). In future, we may make this - /// key smaller by omitting the shard, if we ensure that reads to such pages always skip the cache, or are - /// special-cased in some other way. 
- tenant_shard_id: TenantShardId, - timeline_id: TimelineId, - key: Key, -} - -#[derive(Clone)] -struct Version { - lsn: Lsn, - slot_idx: usize, + ImmutableFilePage { file_id: FileId, blkno: u32 }, } struct Slot { @@ -236,17 +203,6 @@ impl SlotInner { } pub struct PageCache { - /// This contains the mapping from the cache key to buffer slot that currently - /// contains the page, if any. - /// - /// TODO: This is protected by a single lock. If that becomes a bottleneck, - /// this HashMap can be replaced with a more concurrent version, there are - /// plenty of such crates around. - /// - /// If you add support for caching different kinds of objects, each object kind - /// can have a separate mapping map, next to this field. - materialized_page_map: std::sync::RwLock>>, - immutable_page_map: std::sync::RwLock>, /// The actual buffers with their metadata. @@ -371,175 +327,14 @@ pub enum ReadBufResult<'a> { } impl PageCache { - // - // Section 1.1: Public interface functions for looking up and memorizing materialized page - // versions in the page cache - // - - /// Look up a materialized page version. - /// - /// The 'lsn' is an upper bound, this will return the latest version of - /// the given block, but not newer than 'lsn'. Returns the actual LSN of the - /// returned page. - pub async fn lookup_materialized_page( - &self, - tenant_shard_id: TenantShardId, - timeline_id: TimelineId, - key: &Key, - lsn: Lsn, - ctx: &RequestContext, - ) -> Option<(Lsn, PageReadGuard)> { - let Ok(permit) = self.try_get_pinned_slot_permit().await else { - return None; - }; - - crate::metrics::PAGE_CACHE - .for_ctx(ctx) - .read_accesses_materialized_page - .inc(); - - let mut cache_key = CacheKey::MaterializedPage { - hash_key: MaterializedPageHashKey { - tenant_shard_id, - timeline_id, - key: *key, - }, - lsn, - }; - - if let Some(guard) = self - .try_lock_for_read(&mut cache_key, &mut Some(permit)) - .await - { - if let CacheKey::MaterializedPage { - hash_key: _, - lsn: available_lsn, - } = cache_key - { - if available_lsn == lsn { - crate::metrics::PAGE_CACHE - .for_ctx(ctx) - .read_hits_materialized_page_exact - .inc(); - } else { - crate::metrics::PAGE_CACHE - .for_ctx(ctx) - .read_hits_materialized_page_older_lsn - .inc(); - } - Some((available_lsn, guard)) - } else { - panic!("unexpected key type in slot"); - } - } else { - None - } - } - - /// - /// Store an image of the given page in the cache. - /// - pub async fn memorize_materialized_page( - &self, - tenant_shard_id: TenantShardId, - timeline_id: TimelineId, - key: Key, - lsn: Lsn, - img: &[u8], - ) -> anyhow::Result<()> { - let cache_key = CacheKey::MaterializedPage { - hash_key: MaterializedPageHashKey { - tenant_shard_id, - timeline_id, - key, - }, - lsn, - }; - - let mut permit = Some(self.try_get_pinned_slot_permit().await?); - loop { - // First check if the key already exists in the cache. - if let Some(slot_idx) = self.search_mapping_exact(&cache_key) { - // The page was found in the mapping. 
Lock the slot, and re-check - // that it's still what we expected (because we don't released the mapping - // lock already, another thread could have evicted the page) - let slot = &self.slots[slot_idx]; - let inner = slot.inner.write().await; - if inner.key.as_ref() == Some(&cache_key) { - slot.inc_usage_count(); - debug_assert!( - { - let guard = inner.permit.lock().unwrap(); - guard.upgrade().is_none() - }, - "we hold a write lock, so, no one else should have a permit" - ); - debug_assert_eq!(inner.buf.len(), img.len()); - // We already had it in cache. Another thread must've put it there - // concurrently. Check that it had the same contents that we - // replayed. - assert!(inner.buf == img); - return Ok(()); - } - } - debug_assert!(permit.is_some()); - - // Not found. Find a victim buffer - let (slot_idx, mut inner) = self - .find_victim(permit.as_ref().unwrap()) - .await - .context("Failed to find evict victim")?; - - // Insert mapping for this. At this point, we may find that another - // thread did the same thing concurrently. In that case, we evicted - // our victim buffer unnecessarily. Put it into the free list and - // continue with the slot that the other thread chose. - if let Some(_existing_slot_idx) = self.try_insert_mapping(&cache_key, slot_idx) { - // TODO: put to free list - - // We now just loop back to start from beginning. This is not - // optimal, we'll perform the lookup in the mapping again, which - // is not really necessary because we already got - // 'existing_slot_idx'. But this shouldn't happen often enough - // to matter much. - continue; - } - - // Make the slot ready - let slot = &self.slots[slot_idx]; - inner.key = Some(cache_key.clone()); - slot.set_usage_count(1); - // Create a write guard for the slot so we go through the expected motions. - debug_assert!( - { - let guard = inner.permit.lock().unwrap(); - guard.upgrade().is_none() - }, - "we hold a write lock, so, no one else should have a permit" - ); - let mut write_guard = PageWriteGuard { - state: PageWriteGuardState::Invalid { - _permit: permit.take().unwrap(), - inner, - }, - }; - write_guard.copy_from_slice(img); - let _ = write_guard.mark_valid(); - return Ok(()); - } - } - - // Section 1.2: Public interface functions for working with immutable file pages. - pub async fn read_immutable_buf( &self, file_id: FileId, blkno: u32, ctx: &RequestContext, ) -> anyhow::Result { - let mut cache_key = CacheKey::ImmutableFilePage { file_id, blkno }; - - self.lock_for_read(&mut cache_key, ctx).await + self.lock_for_read(&(CacheKey::ImmutableFilePage { file_id, blkno }), ctx) + .await } // @@ -573,19 +368,11 @@ impl PageCache { /// Look up a page in the cache. /// - /// If the search criteria is not exact, *cache_key is updated with the key - /// for exact key of the returned page. (For materialized pages, that means - /// that the LSN in 'cache_key' is updated with the LSN of the returned page - /// version.) - /// - /// If no page is found, returns None and *cache_key is left unmodified. - /// async fn try_lock_for_read( &self, - cache_key: &mut CacheKey, + cache_key: &CacheKey, permit: &mut Option, ) -> Option { - let cache_key_orig = cache_key.clone(); if let Some(slot_idx) = self.search_mapping(cache_key) { // The page was found in the mapping. 
Lock the slot, and re-check // that it's still what we expected (because we released the mapping @@ -598,9 +385,6 @@ impl PageCache { _permit: inner.coalesce_readers_permit(permit.take().unwrap()), slot_guard: inner, }); - } else { - // search_mapping might have modified the search key; restore it. - *cache_key = cache_key_orig; } } None @@ -637,15 +421,12 @@ impl PageCache { /// async fn lock_for_read( &self, - cache_key: &mut CacheKey, + cache_key: &CacheKey, ctx: &RequestContext, ) -> anyhow::Result { let mut permit = Some(self.try_get_pinned_slot_permit().await?); let (read_access, hit) = match cache_key { - CacheKey::MaterializedPage { .. } => { - unreachable!("Materialized pages use lookup_materialized_page") - } CacheKey::ImmutableFilePage { .. } => ( &crate::metrics::PAGE_CACHE .for_ctx(ctx) @@ -717,52 +498,15 @@ impl PageCache { /// Search for a page in the cache using the given search key. /// - /// Returns the slot index, if any. If the search criteria is not exact, - /// *cache_key is updated with the actual key of the found page. + /// Returns the slot index, if any. /// /// NOTE: We don't hold any lock on the mapping on return, so the slot might /// get recycled for an unrelated page immediately after this function /// returns. The caller is responsible for re-checking that the slot still /// contains the page with the same key before using it. /// - fn search_mapping(&self, cache_key: &mut CacheKey) -> Option { + fn search_mapping(&self, cache_key: &CacheKey) -> Option { match cache_key { - CacheKey::MaterializedPage { hash_key, lsn } => { - let map = self.materialized_page_map.read().unwrap(); - let versions = map.get(hash_key)?; - - let version_idx = match versions.binary_search_by_key(lsn, |v| v.lsn) { - Ok(version_idx) => version_idx, - Err(0) => return None, - Err(version_idx) => version_idx - 1, - }; - let version = &versions[version_idx]; - *lsn = version.lsn; - Some(version.slot_idx) - } - CacheKey::ImmutableFilePage { file_id, blkno } => { - let map = self.immutable_page_map.read().unwrap(); - Some(*map.get(&(*file_id, *blkno))?) - } - } - } - - /// Search for a page in the cache using the given search key. - /// - /// Like 'search_mapping, but performs an "exact" search. Used for - /// allocating a new buffer. - fn search_mapping_exact(&self, key: &CacheKey) -> Option { - match key { - CacheKey::MaterializedPage { hash_key, lsn } => { - let map = self.materialized_page_map.read().unwrap(); - let versions = map.get(hash_key)?; - - if let Ok(version_idx) = versions.binary_search_by_key(lsn, |v| v.lsn) { - Some(versions[version_idx].slot_idx) - } else { - None - } - } CacheKey::ImmutableFilePage { file_id, blkno } => { let map = self.immutable_page_map.read().unwrap(); Some(*map.get(&(*file_id, *blkno))?) 
@@ -775,27 +519,6 @@ impl PageCache { /// fn remove_mapping(&self, old_key: &CacheKey) { match old_key { - CacheKey::MaterializedPage { - hash_key: old_hash_key, - lsn: old_lsn, - } => { - let mut map = self.materialized_page_map.write().unwrap(); - if let Entry::Occupied(mut old_entry) = map.entry(old_hash_key.clone()) { - let versions = old_entry.get_mut(); - - if let Ok(version_idx) = versions.binary_search_by_key(old_lsn, |v| v.lsn) { - versions.remove(version_idx); - self.size_metrics - .current_bytes_materialized_page - .sub_page_sz(1); - if versions.is_empty() { - old_entry.remove_entry(); - } - } - } else { - panic!("could not find old key in mapping") - } - } CacheKey::ImmutableFilePage { file_id, blkno } => { let mut map = self.immutable_page_map.write().unwrap(); map.remove(&(*file_id, *blkno)) @@ -812,30 +535,6 @@ impl PageCache { /// of the existing mapping and leaves it untouched. fn try_insert_mapping(&self, new_key: &CacheKey, slot_idx: usize) -> Option { match new_key { - CacheKey::MaterializedPage { - hash_key: new_key, - lsn: new_lsn, - } => { - let mut map = self.materialized_page_map.write().unwrap(); - let versions = map.entry(new_key.clone()).or_default(); - match versions.binary_search_by_key(new_lsn, |v| v.lsn) { - Ok(version_idx) => Some(versions[version_idx].slot_idx), - Err(version_idx) => { - versions.insert( - version_idx, - Version { - lsn: *new_lsn, - slot_idx, - }, - ); - self.size_metrics - .current_bytes_materialized_page - .add_page_sz(1); - None - } - } - } - CacheKey::ImmutableFilePage { file_id, blkno } => { let mut map = self.immutable_page_map.write().unwrap(); match map.entry((*file_id, *blkno)) { @@ -949,7 +648,6 @@ impl PageCache { let size_metrics = &crate::metrics::PAGE_CACHE_SIZE; size_metrics.max_bytes.set_page_sz(num_pages); size_metrics.current_bytes_immutable.set_page_sz(0); - size_metrics.current_bytes_materialized_page.set_page_sz(0); let slots = page_buffer .chunks_exact_mut(PAGE_SZ) @@ -968,7 +666,6 @@ impl PageCache { .collect(); Self { - materialized_page_map: Default::default(), immutable_page_map: Default::default(), slots, next_evict_slot: AtomicUsize::new(0), diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index a4f1108635..5398ad399c 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -101,9 +101,7 @@ use crate::{ use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace}; -use crate::metrics::{ - TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT, -}; +use crate::metrics::TimelineMetrics; use crate::pgdatadir_mapping::CalculateLogicalSizeError; use crate::tenant::config::TenantConfOpt; use pageserver_api::reltag::RelTag; @@ -120,7 +118,6 @@ use utils::{ simple_rcu::{Rcu, RcuReadGuard}, }; -use crate::page_cache; use crate::repository::GcResult; use crate::repository::{Key, Value}; use crate::task_mgr; @@ -134,7 +131,7 @@ use self::layer_manager::LayerManager; use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; -use super::{config::TenantConf, storage_layer::VectoredValueReconstructState}; +use super::config::TenantConf; use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer}; @@ -887,32 +884,11 @@ impl Timeline { self.timeline_get_throttle.throttle(ctx, 1).await; - 
// Check the page cache. We will get back the most recent page with lsn <= `lsn`. - // The cached image can be returned directly if there is no WAL between the cached image - // and requested LSN. The cached image can also be used to reduce the amount of WAL needed - // for redo. - let cached_page_img = match self.lookup_cached_page(&key, lsn, ctx).await { - Some((cached_lsn, cached_img)) => { - match cached_lsn.cmp(&lsn) { - Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check - Ordering::Equal => { - MATERIALIZED_PAGE_CACHE_HIT_DIRECT.inc(); - return Ok(cached_img); // exact LSN match, return the image - } - Ordering::Greater => { - unreachable!("the returned lsn should never be after the requested lsn") - } - } - Some((cached_lsn, cached_img)) - } - None => None, - }; - match self.conf.get_impl { GetImpl::Legacy => { let reconstruct_state = ValueReconstructState { records: Vec::new(), - img: cached_page_img, + img: None, }; self.get_impl(key, lsn, reconstruct_state, ctx).await @@ -926,13 +902,6 @@ impl Timeline { // entry returned above. let mut reconstruct_state = ValuesReconstructState::new(); - // Only add the cached image to the reconstruct state when it exists. - if cached_page_img.is_some() { - let mut key_state = VectoredValueReconstructState::default(); - key_state.img = cached_page_img; - reconstruct_state.keys.insert(key, Ok(key_state)); - } - let vectored_res = self .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx) .await; @@ -3240,7 +3209,6 @@ impl Timeline { ValueReconstructResult::Continue => { // If we reached an earlier cached page image, we're done. if cont_lsn == cached_lsn + 1 { - MATERIALIZED_PAGE_CACHE_HIT.inc_by(1); return Ok(traversal_path); } if let Some(prev) = prev_lsn { @@ -3614,26 +3582,6 @@ impl Timeline { }) } - /// # Cancel-safety - /// - /// This method is cancellation-safe. - async fn lookup_cached_page( - &self, - key: &Key, - lsn: Lsn, - ctx: &RequestContext, - ) -> Option<(Lsn, Bytes)> { - let cache = page_cache::get(); - - // FIXME: It's pointless to check the cache for things that are not 8kB pages. - // We should look at the key to determine if it's a cacheable object - let (lsn, read_guard) = cache - .lookup_materialized_page(self.tenant_shard_id, self.timeline_id, key, lsn, ctx) - .await?; - let img = Bytes::from(read_guard.to_vec()); - Some((lsn, img)) - } - async fn get_ready_ancestor_timeline( &self, ancestor: &Arc, @@ -5280,8 +5228,6 @@ impl Timeline { trace!("found {} WAL records that will init the page for {} at {}, performing WAL redo", data.records.len(), key, request_lsn); }; - let last_rec_lsn = data.records.last().unwrap().0; - let img = match self .walredo_mgr .as_ref() @@ -5295,23 +5241,6 @@ impl Timeline { Err(e) => return Err(PageReconstructError::WalRedo(e)), }; - if img.len() == page_cache::PAGE_SZ { - let cache = page_cache::get(); - if let Err(e) = cache - .memorize_materialized_page( - self.tenant_shard_id, - self.timeline_id, - key, - last_rec_lsn, - &img, - ) - .await - .context("Materialized page memoization failed") - { - return Err(PageReconstructError::from(e)); - } - } - Ok(img) } } diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 8b8075f8c1..e01bb6da51 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -118,8 +118,6 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] 
= ( "libmetrics_launch_timestamp", "libmetrics_build_info", "libmetrics_tracing_event_count_total", - "pageserver_materialized_cache_hits_total", - "pageserver_materialized_cache_hits_direct_total", "pageserver_page_cache_read_hits_total", "pageserver_page_cache_read_accesses_total", "pageserver_page_cache_size_current_bytes", diff --git a/test_runner/regress/test_local_file_cache.py b/test_runner/regress/test_local_file_cache.py index 76c6581448..3c404c3b23 100644 --- a/test_runner/regress/test_local_file_cache.py +++ b/test_runner/regress/test_local_file_cache.py @@ -1,4 +1,5 @@ import os +import queue import random import threading import time @@ -8,11 +9,7 @@ from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnvBuilder from fixtures.utils import query_scalar -def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder, build_type: str): - if build_type == "debug": - # Disable vectored read path cross validation since it makes the test time out. - neon_env_builder.pageserver_config_override = "validate_vectored_get=false" - +def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() cache_dir = os.path.join(env.repo_dir, "file_cache") @@ -33,11 +30,10 @@ def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder, build_type: s cur = endpoint.connect().cursor() + stop = threading.Event() n_rows = 100000 n_threads = 20 - n_updates_per_thread = 10000 n_updates_per_connection = 1000 - n_total_updates = n_threads * n_updates_per_thread cur.execute("CREATE TABLE lfctest (id int4 PRIMARY KEY, n int) WITH (fillfactor=10)") cur.execute(f"INSERT INTO lfctest SELECT g, 1 FROM generate_series(1, {n_rows}) g") @@ -48,11 +44,11 @@ def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder, build_type: s # performed (plus the initial 1 on each row). # # Furthermore, each thread will reconnect between every 1000 updates. 
- def run_updates(): + def run_updates(n_updates_performed_q: queue.Queue[int]): n_updates_performed = 0 conn = endpoint.connect() cur = conn.cursor() - for _ in range(n_updates_per_thread): + while not stop.is_set(): id = random.randint(1, n_rows) cur.execute(f"UPDATE lfctest SET n = n + 1 WHERE id = {id}") n_updates_performed += 1 @@ -61,19 +57,28 @@ def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder, build_type: s conn.close() conn = endpoint.connect() cur = conn.cursor() + n_updates_performed_q.put(n_updates_performed) + n_updates_performed_q: queue.Queue[int] = queue.Queue() threads: List[threading.Thread] = [] for _i in range(n_threads): - thread = threading.Thread(target=run_updates, args=(), daemon=True) + thread = threading.Thread(target=run_updates, args=(n_updates_performed_q,), daemon=True) thread.start() threads.append(thread) time.sleep(5) + # unlink, this is what we're actually testing new_cache_dir = os.path.join(env.repo_dir, "file_cache_new") os.rename(cache_dir, new_cache_dir) + time.sleep(10) + + stop.set() + + n_updates_performed = 0 for thread in threads: thread.join() + n_updates_performed += n_updates_performed_q.get() - assert query_scalar(cur, "SELECT SUM(n) FROM lfctest") == n_total_updates + n_rows + assert query_scalar(cur, "SELECT SUM(n) FROM lfctest") == n_rows + n_updates_performed diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 885a94a557..756a2c17c9 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -23,11 +23,11 @@ if TYPE_CHECKING: # Run the main PostgreSQL regression tests, in src/test/regress. # +@pytest.mark.timeout(600) @pytest.mark.parametrize("shard_count", [None, 4]) def test_pg_regress( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, - build_type: str, pg_bin: PgBin, capsys: CaptureFixture[str], base_dir: Path, @@ -43,10 +43,6 @@ def test_pg_regress( if shard_count is not None: neon_env_builder.num_pageservers = shard_count - if build_type == "debug": - # Disable vectored read path cross validation since it makes the test time out. - neon_env_builder.pageserver_config_override = "validate_vectored_get=false" - neon_env_builder.enable_pageserver_remote_storage(s3_storage()) neon_env_builder.enable_scrub_on_exit() env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) diff --git a/test_runner/regress/test_physical_replication.py b/test_runner/regress/test_physical_replication.py index 034f2b669d..a1bff32eed 100644 --- a/test_runner/regress/test_physical_replication.py +++ b/test_runner/regress/test_physical_replication.py @@ -6,7 +6,6 @@ from fixtures.neon_fixtures import NeonEnv def test_physical_replication(neon_simple_env: NeonEnv): env = neon_simple_env - n_records = 100000 with env.endpoints.create_start( branch_name="main", endpoint_id="primary", @@ -22,8 +21,20 @@ def test_physical_replication(neon_simple_env: NeonEnv): with p_con.cursor() as p_cur: with secondary.connect() as s_con: with s_con.cursor() as s_cur: - for pk in range(n_records): + runtime_secs = 30 + started_at = time.time() + pk = 0 + while True: + pk += 1 + now = time.time() + if now - started_at > runtime_secs: + break p_cur.execute("insert into t (pk) values (%s)", (pk,)) + # an earlier version of this test was based on a fixed number of loop iterations + # and selected for pk=(random.randrange(1, fixed number of loop iterations)). 
+ # => the probability of selection for a value that was never inserted changed from 99.9999% to 0% over the course of the test. + # + # We changed the test to where=(random.randrange(1, 2*pk)), which means the probability is now fixed to 50%. s_cur.execute( - "select * from t where pk=%s", (random.randrange(1, n_records),) + "select * from t where pk=%s", (random.randrange(1, 2 * pk),) ) From 02ecdd137bc5d142261f9bb9a4d93331d06acbd3 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 20 Jun 2024 17:50:43 +0200 Subject: [PATCH 1015/1571] fix: preinitialize `pageserver_basebackup_query_seconds` metric (#8121) Without this patch, the Pageserver 4 Golden Signals dashboard shows no data if there are no basebackups (observed in pre-prod). --- pageserver/src/metrics.rs | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 2992fef561..5c8f350f7b 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1347,17 +1347,23 @@ static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| { .map(|ms| (ms as f64) / 1000.0) }); -pub(crate) struct BasebackupQueryTime(HistogramVec); +pub(crate) struct BasebackupQueryTime { + ok: Histogram, + error: Histogram, +} + pub(crate) static BASEBACKUP_QUERY_TIME: Lazy = Lazy::new(|| { - BasebackupQueryTime({ - register_histogram_vec!( - "pageserver_basebackup_query_seconds", - "Histogram of basebackup queries durations, by result type", - &["result"], - COMPUTE_STARTUP_BUCKETS.to_vec(), - ) - .expect("failed to define a metric") - }) + let vec = register_histogram_vec!( + "pageserver_basebackup_query_seconds", + "Histogram of basebackup queries durations, by result type", + &["result"], + COMPUTE_STARTUP_BUCKETS.to_vec(), + ) + .expect("failed to define a metric"); + BasebackupQueryTime { + ok: vec.get_metric_with_label_values(&["ok"]).unwrap(), + error: vec.get_metric_with_label_values(&["error"]).unwrap(), + } }); pub(crate) struct BasebackupQueryTimeOngoingRecording<'a, 'c> { @@ -1412,12 +1418,11 @@ impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> { elapsed } }; - let label_value = if res.is_ok() { "ok" } else { "error" }; - let metric = self - .parent - .0 - .get_metric_with_label_values(&[label_value]) - .unwrap(); + let metric = if res.is_ok() { + &self.parent.ok + } else { + &self.parent.error + }; metric.observe(ex_throttled.as_secs_f64()); } } @@ -2926,4 +2931,5 @@ pub fn preinitialize_metrics() { // Custom Lazy::force(&RECONSTRUCT_TIME); Lazy::force(&tenant_throttling::TIMELINE_GET); + Lazy::force(&BASEBACKUP_QUERY_TIME); } From f8ac3b0e0ee492e4de793083dd0f9eaaf9c49eab Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 20 Jun 2024 17:32:01 +0100 Subject: [PATCH 1016/1571] storcon: use attached shard counts for initial shard placement (#8061) ## Problem When creating a new shard the storage controller schedules via Scheduler::schedule_shard. This does not take into account the number of attached shards. What it does take into account is the node affinity: when a shard is scheduled, all its nodes (primaries and secondaries) get their affinity incremented. For two node clusters and shards with one secondary we have a pathological case where all primaries are scheduled on the same node. Now that we track the count of attached shards per node, this is trivial to fix. Still, the "proper" fix is to use the pageserver's utilization score. 
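As a rough illustration of the fix summarized below (a simplified, self-contained sketch with plain tuples, not the controller's actual `Scheduler` types), breaking affinity ties by attached shard count keeps a fresh shard's attachment off the node that already holds the most primaries:

```rust
// Simplified sketch: each candidate is (node_id, affinity, total_shards, attached_shards).
// Sorting by (affinity, attached, total, node_id) means that, for equal affinity,
// the node with fewer attached shards is picked first.
fn pick_node(mut candidates: Vec<(u64, u64, usize, usize)>) -> Option<u64> {
    candidates.sort_by_key(|&(id, affinity, total, attached)| (affinity, attached, total, id));
    candidates.first().map(|&(id, _, _, _)| id)
}

fn main() {
    // Two-node cluster, equal affinity and total shard count, but node 1 already
    // holds all the attachments: the next attachment goes to node 2 instead.
    assert_eq!(pick_node(vec![(1, 0, 4, 4), (2, 0, 4, 0)]), Some(2));
}
```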
Closes https://github.com/neondatabase/neon/issues/8041 ## Summary of changes Use attached shard count when deciding which node to schedule a fresh shard on. --- storage_controller/src/scheduler.rs | 12 ++++++++---- storage_controller/src/tenant_shard.rs | 8 ++------ test_runner/regress/test_storage_controller.py | 8 +------- 3 files changed, 11 insertions(+), 17 deletions(-) diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index 0bd2eeac35..843159010d 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -391,7 +391,7 @@ impl Scheduler { return Err(ScheduleError::NoPageservers); } - let mut scores: Vec<(NodeId, AffinityScore, usize)> = self + let mut scores: Vec<(NodeId, AffinityScore, usize, usize)> = self .nodes .iter() .filter_map(|(k, v)| { @@ -402,6 +402,7 @@ impl Scheduler { *k, context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE), v.shard_count, + v.attached_shard_count, )) } }) @@ -409,9 +410,12 @@ impl Scheduler { // Sort by, in order of precedence: // 1st: Affinity score. We should never pick a higher-score node if a lower-score node is available - // 2nd: Utilization. Within nodes with the same affinity, use the least loaded nodes. - // 3rd: Node ID. This is a convenience to make selection deterministic in tests and empty systems. - scores.sort_by_key(|i| (i.1, i.2, i.0)); + // 2nd: Attached shard count. Within nodes with the same affinity, we always pick the node with + // the least number of attached shards. + // 3rd: Total shard count. Within nodes with the same affinity and attached shard count, use nodes + // with the lower total shard count. + // 4th: Node ID. This is a convenience to make selection deterministic in tests and empty systems. + scores.sort_by_key(|i| (i.1, i.3, i.2, i.0)); if scores.is_empty() { // After applying constraints, no pageservers were left. diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index d1b632755f..840bcbb81d 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -1632,14 +1632,10 @@ pub(crate) mod tests { // We should see equal number of locations on the two nodes. assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 4); - // Scheduling does not consider the number of attachments picking the initial - // pageserver to attach to (hence the assertion that all primaries are on the - // same node) - // TODO: Tweak the scheduling to evenly distribute attachments for new shards. - assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 4); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 2); assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 4); - assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 0); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 2); // Add another two nodes: we should see the shards spread out when their optimize // methods are called diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 30f96ceee8..c6450df186 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -1527,13 +1527,7 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): ) # Give things a chance to settle. 
- # A call to `reconcile_until_idle` could be used here instead, - # however since all attachments are placed on the same node, - # we'd have to wait for a long time (2 minutes-ish) for optimizations - # to quiesce. - # TODO: once the initial attachment selection is fixed, update this - # to use `reconcile_until_idle`. - time.sleep(2) + env.storage_controller.reconcile_until_idle(timeout_secs=30) nodes = env.storage_controller.node_list() assert len(nodes) == 2 From 0792bb6785b7c749b921086305e2612f2f93c0a1 Mon Sep 17 00:00:00 2001 From: Jure Bajic Date: Fri, 21 Jun 2024 10:47:04 +0200 Subject: [PATCH 1017/1571] Add tracing for shared locks in `id_lock_map` (#7618) ## Problem Storage controller shared locks do not print a warning when held for long time spans. ## Summary of changes Extension of issue https://github.com/neondatabase/neon/issues/7108 in tracing to exclusive lock in `id_lock_map` was added, to add the same for shared locks. It was mentioned in the comment https://github.com/neondatabase/neon/pull/7397#discussion_r1587961160 --- storage_controller/src/id_lock_map.rs | 78 +++++++++++++------ storage_controller/src/service.rs | 7 +- .../regress/test_storage_controller.py | 21 ++++- 3 files changed, 78 insertions(+), 28 deletions(-) diff --git a/storage_controller/src/id_lock_map.rs b/storage_controller/src/id_lock_map.rs index dff793289f..fcd3eb57e2 100644 --- a/storage_controller/src/id_lock_map.rs +++ b/storage_controller/src/id_lock_map.rs @@ -8,14 +8,15 @@ use crate::service::RECONCILE_TIMEOUT; const LOCK_TIMEOUT_ALERT_THRESHOLD: Duration = RECONCILE_TIMEOUT; -/// A wrapper around `OwnedRwLockWriteGuard` that when dropped changes the -/// current holding operation in lock. -pub struct WrappedWriteGuard { +/// A wrapper around `OwnedRwLockWriteGuard` used for tracking the +/// operation that holds the lock, and print a warning if it exceeds +/// the LOCK_TIMEOUT_ALERT_THRESHOLD time +pub struct TracingExclusiveGuard { guard: tokio::sync::OwnedRwLockWriteGuard>, start: Instant, } -impl WrappedWriteGuard { +impl TracingExclusiveGuard { pub fn new(guard: tokio::sync::OwnedRwLockWriteGuard>) -> Self { Self { guard, @@ -24,12 +25,12 @@ impl WrappedWriteGuard { } } -impl Drop for WrappedWriteGuard { +impl Drop for TracingExclusiveGuard { fn drop(&mut self) { let duration = self.start.elapsed(); if duration > LOCK_TIMEOUT_ALERT_THRESHOLD { tracing::warn!( - "Lock on {} was held for {:?}", + "Exclusive lock by {} was held for {:?}", self.guard.as_ref().unwrap(), duration ); @@ -38,6 +39,38 @@ impl Drop for WrappedWriteGuard { } } +// A wrapper around `OwnedRwLockReadGuard` used for tracking the +/// operation that holds the lock, and print a warning if it exceeds +/// the LOCK_TIMEOUT_ALERT_THRESHOLD time +pub struct TracingSharedGuard { + _guard: tokio::sync::OwnedRwLockReadGuard>, + operation: T, + start: Instant, +} + +impl TracingSharedGuard { + pub fn new(guard: tokio::sync::OwnedRwLockReadGuard>, operation: T) -> Self { + Self { + _guard: guard, + operation, + start: Instant::now(), + } + } +} + +impl Drop for TracingSharedGuard { + fn drop(&mut self) { + let duration = self.start.elapsed(); + if duration > LOCK_TIMEOUT_ALERT_THRESHOLD { + tracing::warn!( + "Shared lock by {} was held for {:?}", + self.operation, + duration + ); + } + } +} + /// A map of locks covering some arbitrary identifiers. Useful if you have a collection of objects but don't /// want to embed a lock in each one, or if your locking granularity is different to your object granularity. 
/// For example, used in the storage controller where the objects are tenant shards, but sometimes locking @@ -58,21 +91,22 @@ where pub(crate) fn shared( &self, key: T, - ) -> impl std::future::Future>> { + operation: I, + ) -> impl std::future::Future> { let mut locked = self.entities.lock().unwrap(); - let entry = locked.entry(key).or_default(); - entry.clone().read_owned() + let entry = locked.entry(key).or_default().clone(); + async move { TracingSharedGuard::new(entry.read_owned().await, operation) } } pub(crate) fn exclusive( &self, key: T, operation: I, - ) -> impl std::future::Future> { + ) -> impl std::future::Future> { let mut locked = self.entities.lock().unwrap(); let entry = locked.entry(key).or_default().clone(); async move { - let mut guard = WrappedWriteGuard::new(entry.clone().write_owned().await); + let mut guard = TracingExclusiveGuard::new(entry.write_owned().await); *guard.guard = Some(operation); guard } @@ -99,12 +133,12 @@ where pub async fn trace_exclusive_lock< T: Clone + Display + Eq + PartialEq + std::hash::Hash, - I: Display + Clone, + I: Clone + Display, >( op_locks: &IdLockMap, key: T, operation: I, -) -> WrappedWriteGuard { +) -> TracingExclusiveGuard { let start = Instant::now(); let guard = op_locks.exclusive(key.clone(), operation.clone()).await; @@ -123,14 +157,14 @@ pub async fn trace_exclusive_lock< pub async fn trace_shared_lock< T: Clone + Display + Eq + PartialEq + std::hash::Hash, - I: Display, + I: Clone + Display, >( op_locks: &IdLockMap, key: T, operation: I, -) -> tokio::sync::OwnedRwLockReadGuard> { +) -> TracingSharedGuard { let start = Instant::now(); - let guard = op_locks.shared(key.clone()).await; + let guard = op_locks.shared(key.clone(), operation.clone()).await; let duration = start.elapsed(); if duration > LOCK_TIMEOUT_ALERT_THRESHOLD { @@ -159,11 +193,11 @@ mod tests { async fn multiple_shared_locks() { let id_lock_map: IdLockMap = IdLockMap::default(); - let shared_lock_1 = id_lock_map.shared(1).await; - let shared_lock_2 = id_lock_map.shared(1).await; + let shared_lock_1 = id_lock_map.shared(1, Operations::Op1).await; + let shared_lock_2 = id_lock_map.shared(1, Operations::Op2).await; - assert!(shared_lock_1.is_none()); - assert!(shared_lock_2.is_none()); + assert_eq!(shared_lock_1.operation, Operations::Op1); + assert_eq!(shared_lock_2.operation, Operations::Op2); } #[tokio::test] @@ -183,7 +217,7 @@ mod tests { assert!(_ex_lock_2.is_err()); } - let shared_lock_1 = id_lock_map.shared(resource_id).await; - assert!(shared_lock_1.is_none()); + let shared_lock_1 = id_lock_map.shared(resource_id, Operations::Op1).await; + assert_eq!(shared_lock_1.operation, Operations::Op1); } } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 8475bf46d2..6ed6c16347 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -13,7 +13,7 @@ use crate::{ Drain, Fill, Operation, OperationError, OperationHandler, MAX_RECONCILES_PER_OPERATION, }, compute_hook::NotifyError, - id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, WrappedWriteGuard}, + id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard}, persistence::{AbortShardSplitStatus, TenantFilter}, reconciler::{ReconcileError, ReconcileUnits}, scheduler::{MaySchedule, ScheduleContext, ScheduleMode}, @@ -359,7 +359,7 @@ struct TenantShardSplitAbort { new_shard_count: ShardCount, new_stripe_size: Option, /// Until this abort op is complete, no other operations may be done on the tenant - 
_tenant_lock: WrappedWriteGuard, + _tenant_lock: TracingExclusiveGuard, } #[derive(thiserror::Error, Debug)] @@ -1429,7 +1429,7 @@ impl Service { async fn node_activate_reconcile( &self, mut node: Node, - _lock: &WrappedWriteGuard, + _lock: &TracingExclusiveGuard, ) -> Result<(), ApiError> { // This Node is a mutable local copy: we will set it active so that we can use its // API client to reconcile with the node. The Node in [`Self::nodes`] will get updated @@ -2658,6 +2658,7 @@ impl Service { TenantOperations::TimelineCreate, ) .await; + failpoint_support::sleep_millis_async!("tenant-create-timeline-shared-lock"); self.ensure_attached_wait(tenant_id).await?; diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index c6450df186..dffe5c89b9 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -1384,7 +1384,8 @@ def test_lock_time_tracing(neon_env_builder: NeonEnvBuilder): tenant_id = env.initial_tenant env.storage_controller.allowed_errors.extend( [ - ".*Lock on.*", + ".*Exclusive lock by.*", + ".*Shared lock by.*", ".*Scheduling is disabled by policy.*", f".*Operation TimelineCreate on key {tenant_id} has waited.*", ] @@ -1416,11 +1417,25 @@ def test_lock_time_tracing(neon_env_builder: NeonEnvBuilder): ) thread_update_tenant_policy.join() - env.storage_controller.assert_log_contains("Lock on UpdatePolicy was held for") - env.storage_controller.assert_log_contains( + env.storage_controller.assert_log_contains("Exclusive lock by UpdatePolicy was held for") + _, last_log_cursor = env.storage_controller.assert_log_contains( f"Operation TimelineCreate on key {tenant_id} has waited" ) + # Test out shared lock + env.storage_controller.configure_failpoints( + ("tenant-create-timeline-shared-lock", "return(31000)") + ) + + timeline_id = TimelineId.generate() + # This will hold the shared lock for enough time to cause an warning + env.storage_controller.pageserver_api().timeline_create( + pg_version=PgVersion.NOT_SET, tenant_id=tenant_id, new_timeline_id=timeline_id + ) + env.storage_controller.assert_log_contains( + "Shared lock by TimelineCreate was held for", offset=last_log_cursor + ) + @pytest.mark.parametrize("remote_storage", [RemoteStorageKind.LOCAL_FS, s3_storage()]) @pytest.mark.parametrize("shard_count", [None, 4]) From 01399621d53aa70edcc8f89976d2ae2fba5723e1 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 21 Jun 2024 10:19:01 +0100 Subject: [PATCH 1018/1571] storcon: avoid promoting too many shards of the same tenant (#8099) ## Problem The fill planner introduced in https://github.com/neondatabase/neon/pull/8014 selects tenant shards to promote strictly based on attached shard count load (tenant shards on nodes with the most attached shard counts are considered first). This approach runs the risk of migrating too many shards belonging to the same tenant on the same primary node. This is bad for availability and causes extra reconciles via the storage controller's background optimisations. Also see https://github.com/neondatabase/neon/pull/8014#discussion_r1642456241. ## Summary of changes Refine the fill plan to avoid promoting too many shards belonging to the same tenant on the same node. We allow for `max(1, shard_count / node_count)` shards belonging to the same tenant to be promoted. 
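As a standalone sketch of that bound (mirroring the expression used in the change below, not the controller's actual code):

```rust
// Upper bound on how many shards of a single tenant may be promoted onto the node
// currently being filled: shard count divided by node count, but never less than one.
fn max_promote_for_tenant(shard_count: usize, node_count: usize) -> usize {
    std::cmp::max(1, shard_count / node_count)
}

fn main() {
    // An 8-shard tenant on a 4-node cluster: at most 2 of its shards land on one node.
    assert_eq!(max_promote_for_tenant(8, 4), 2);
    // A single-shard tenant can still be promoted, even on a large cluster.
    assert_eq!(max_promote_for_tenant(1, 10), 1);
}
```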
--- storage_controller/src/service.rs | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 6ed6c16347..792f68cc5a 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -5395,6 +5395,9 @@ impl Service { /// throughout the cluster. We achieve this by picking tenant shards from each node, /// starting from the ones with the largest number of attached shards, until the node /// reaches the expected cluster average. + /// 3. Avoid promoting more shards of the same tenant than required. The upper bound + /// for the number of tenants from the same shard promoted to the node being filled is: + /// shard count for the tenant divided by the number of nodes in the cluster. fn fill_node_plan(&self, node_id: NodeId) -> Vec { let mut locked = self.inner.write().unwrap(); let fill_requirement = locked.scheduler.compute_fill_requirement(node_id); @@ -5416,8 +5419,18 @@ impl Service { let expected_attached = locked.scheduler.expected_attached_shard_count(); let nodes_by_load = locked.scheduler.nodes_by_attached_shard_count(); + let mut promoted_per_tenant: HashMap = HashMap::new(); let mut plan = Vec::new(); + for (node_id, attached) in nodes_by_load { + let available = locked + .nodes + .get(&node_id) + .map_or(false, |n| n.is_available()); + if !available { + continue; + } + if plan.len() >= fill_requirement || tids_by_node.is_empty() || attached <= expected_attached @@ -5425,13 +5438,22 @@ impl Service { break; } - let can_take = attached - expected_attached; + let mut can_take = attached - expected_attached; let mut remove_node = false; - for _ in 0..can_take { + while can_take > 0 { match tids_by_node.get_mut(&node_id) { Some(tids) => match tids.pop() { Some(tid) => { - plan.push(tid); + let max_promote_for_tenant = std::cmp::max( + tid.shard_count.count() as usize / locked.nodes.len(), + 1, + ); + let promoted = promoted_per_tenant.entry(tid.tenant_id).or_default(); + if *promoted < max_promote_for_tenant { + plan.push(tid); + *promoted += 1; + can_take -= 1; + } } None => { remove_node = true; From 59f949b4a842b0f27e4a1d622c9c5db293bb7901 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 21 Jun 2024 11:02:15 +0100 Subject: [PATCH 1019/1571] pageserver: remove unused load/ignore APIs (#8122) ## Problem These APIs have be unused for some time. They were superseded by /location_conf: the equivalent of ignoring a tenant is now to put it in secondary mode. ## Summary of changes - Remove APIs - Remove tests & helpers that used them - Remove error variants that are no longer needed. 
--- libs/pageserver_api/src/models.rs | 16 -- pageserver/src/config.rs | 9 +- pageserver/src/http/openapi_spec.yml | 42 ----- pageserver/src/http/routes.rs | 68 +------- pageserver/src/lib.rs | 7 - pageserver/src/tenant/mgr.rs | 165 +----------------- test_runner/fixtures/neon_fixtures.py | 6 - test_runner/fixtures/pageserver/http.py | 11 -- test_runner/regress/test_tenant_detach.py | 201 +--------------------- 9 files changed, 10 insertions(+), 515 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 70db0b7344..3db75b7d0e 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -293,22 +293,6 @@ pub struct TenantCreateRequest { pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it } -#[derive(Deserialize, Debug)] -#[serde(deny_unknown_fields)] -pub struct TenantLoadRequest { - #[serde(default)] - #[serde(skip_serializing_if = "Option::is_none")] - pub generation: Option, -} - -impl std::ops::Deref for TenantCreateRequest { - type Target = TenantConfig; - - fn deref(&self) -> &Self::Target { - &self.config - } -} - /// An alternative representation of `pageserver::tenant::TenantConf` with /// simpler types. #[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)] diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index b4a0d1ac02..badea48b98 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -39,8 +39,8 @@ use crate::tenant::{ use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine}; use crate::{tenant::config::TenantConf, virtual_file}; use crate::{ - IGNORED_TENANT_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME, - TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, + TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, + TIMELINE_DELETE_MARK_SUFFIX, }; use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP; @@ -811,11 +811,6 @@ impl PageServerConf { self.tenants_path().join(tenant_shard_id.to_string()) } - pub fn tenant_ignore_mark_file_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf { - self.tenant_path(tenant_shard_id) - .join(IGNORED_TENANT_FILE_NAME) - } - /// Points to a place in pageserver's local directory, /// where certain tenant's tenantconf file should be located. /// diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 71b486a4d3..4b6fe56b89 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -389,48 +389,6 @@ paths: application/json: schema: $ref: "#/components/schemas/ConflictError" - /v1/tenant/{tenant_id}/ignore: - parameters: - - name: tenant_id - in: path - required: true - schema: - type: string - post: - description: | - Remove tenant data (including all corresponding timelines) from pageserver's memory. - Files on local disk and remote storage are not affected. - - Future pageserver restarts won't load the data back until `load` is called on such tenant. - responses: - "200": - description: Tenant ignored - - - /v1/tenant/{tenant_id}/load: - parameters: - - name: tenant_id - in: path - required: true - schema: - type: string - post: - description: | - Schedules an operation that attempts to load a tenant from the local disk and - synchronise it with the remote storage (if enabled), repeating pageserver's restart logic for tenant load. 
- If the tenant was ignored before, removes the ignore mark and continues with load scheduling. - - Errors if the tenant is absent on disk, already present in memory or fails to schedule its load. - Scheduling a load does not mean that the tenant would load successfully, check tenant status to ensure load correctness. - requestBody: - required: false - content: - application/json: - schema: - $ref: "#/components/schemas/TenantLoadRequest" - responses: - "202": - description: Tenant scheduled to load successfully /v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive: parameters: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 482879630a..eb74ca637f 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -36,7 +36,7 @@ use pageserver_api::models::TopTenantShardsRequest; use pageserver_api::models::TopTenantShardsResponse; use pageserver_api::models::{ DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest, - TenantLoadRequest, TenantLocationConfigRequest, + TenantLocationConfigRequest, }; use pageserver_api::shard::ShardCount; use pageserver_api::shard::TenantShardId; @@ -205,7 +205,6 @@ impl From for ApiError { NotFound(tenant_id) => { ApiError::NotFound(anyhow::anyhow!("NotFound: tenant {tenant_id}").into()) } - e @ AlreadyExists(_, _) => ApiError::Conflict(format!("{e}")), InProgress => { ApiError::ResourceUnavailable("Tenant is being modified concurrently".into()) } @@ -891,8 +890,6 @@ async fn tenant_detach_handler( ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; - let detach_ignored: Option = parse_query_param(&request, "detach_ignored")?; - // This is a legacy API (`/location_conf` is the replacement). It only supports unsharded tenants let tenant_shard_id = TenantShardId::unsharded(tenant_id); @@ -900,12 +897,7 @@ async fn tenant_detach_handler( let conf = state.conf; state .tenant_manager - .detach_tenant( - conf, - tenant_shard_id, - detach_ignored.unwrap_or(false), - &state.deletion_queue_client, - ) + .detach_tenant(conf, tenant_shard_id, &state.deletion_queue_client) .instrument(info_span!("tenant_detach", %tenant_id, shard_id=%tenant_shard_id.shard_slug())) .await?; @@ -932,54 +924,6 @@ async fn tenant_reset_handler( json_response(StatusCode::OK, ()) } -async fn tenant_load_handler( - mut request: Request, - _cancel: CancellationToken, -) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; - - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); - - let maybe_body: Option = json_request_or_empty_body(&mut request).await?; - - let state = get_state(&request); - - // The /load request is only usable when control_plane_api is not set. Once it is set, callers - // should always use /attach instead. 
- let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?; - - mgr::load_tenant( - state.conf, - tenant_id, - generation, - state.broker_client.clone(), - state.remote_storage.clone(), - state.deletion_queue_client.clone(), - &ctx, - ) - .instrument(info_span!("load", %tenant_id)) - .await?; - - json_response(StatusCode::ACCEPTED, ()) -} - -async fn tenant_ignore_handler( - request: Request, - _cancel: CancellationToken, -) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; - - let state = get_state(&request); - let conf = state.conf; - mgr::ignore_tenant(conf, tenant_id) - .instrument(info_span!("ignore_tenant", %tenant_id)) - .await?; - - json_response(StatusCode::OK, ()) -} - async fn tenant_list_handler( request: Request, _cancel: CancellationToken, @@ -1507,7 +1451,7 @@ async fn put_tenant_location_config_handler( if let LocationConfigMode::Detached = request_data.config.mode { if let Err(e) = state .tenant_manager - .detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client) + .detach_tenant(conf, tenant_shard_id, &state.deletion_queue_client) .instrument(info_span!("tenant_detach", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug() @@ -2764,12 +2708,6 @@ pub fn make_router( .post("/v1/tenant/:tenant_shard_id/reset", |r| { api_handler(r, tenant_reset_handler) }) - .post("/v1/tenant/:tenant_id/load", |r| { - api_handler(r, tenant_load_handler) - }) - .post("/v1/tenant/:tenant_id/ignore", |r| { - api_handler(r, tenant_ignore_handler) - }) .post( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/preserve_initdb_archive", |r| api_handler(r, timeline_preserve_initdb_handler), diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index c69fb8c83b..9e64eafffc 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -136,13 +136,6 @@ pub(crate) const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit"; pub(crate) const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete"; -/// A marker file to prevent pageserver from loading a certain tenant on restart. -/// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding -/// `ignore` management API command, that expects the ignored tenant to be properly loaded -/// into pageserver's memory before being ignored. -/// Full path: `tenants//___ignored_tenant`. 
-pub const IGNORED_TENANT_FILE_NAME: &str = "___ignored_tenant"; - pub fn is_temporary(path: &Utf8Path) -> bool { match path.file_name() { Some(name) => name.ends_with(TEMP_FILE_SUFFIX), diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 4520bb9295..f61526f8c2 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -27,7 +27,6 @@ use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::*; -use remote_storage::GenericRemoteStorage; use utils::{completion, crashsafe}; use crate::config::PageServerConf; @@ -47,7 +46,7 @@ use crate::tenant::span::debug_assert_current_span_has_tenant_id; use crate::tenant::storage_layer::inmemory_layer; use crate::tenant::timeline::ShutdownMode; use crate::tenant::{AttachedTenantConf, GcError, SpawnMode, Tenant, TenantState}; -use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX}; +use crate::{InitializationOrder, TEMP_FILE_SUFFIX}; use utils::crashsafe::path_with_suffix_extension; use utils::fs_ext::PathExt; @@ -422,12 +421,6 @@ fn load_tenant_config( } }; - let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME); - if tenant_ignore_mark_file.exists() { - info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant"); - return Ok(None); - } - Ok(Some(( tenant_shard_id, Tenant::load_tenant_config(conf, &tenant_shard_id), @@ -713,12 +706,6 @@ fn tenant_spawn( "Cannot load tenant from empty directory {tenant_path:?}" ); - let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id); - anyhow::ensure!( - !conf.tenant_ignore_mark_file_path(&tenant_shard_id).exists(), - "Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}" - ); - let remote_storage = resources.remote_storage.clone(); let tenant = match Tenant::spawn( conf, @@ -1067,7 +1054,7 @@ impl TenantManager { // not do significant I/O, and shutdowns should be prompt via cancellation tokens. let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any) .map_err(|e| match e { - TenantSlotError::AlreadyExists(_, _) | TenantSlotError::NotFound(_) => { + TenantSlotError::NotFound(_) => { unreachable!("Called with mode Any") } TenantSlotError::InProgress => UpsertLocationError::InProgress, @@ -1901,17 +1888,10 @@ impl TenantManager { &self, conf: &'static PageServerConf, tenant_shard_id: TenantShardId, - detach_ignored: bool, deletion_queue_client: &DeletionQueueClient, ) -> Result<(), TenantStateError> { let tmp_path = self - .detach_tenant0( - conf, - &TENANTS, - tenant_shard_id, - detach_ignored, - deletion_queue_client, - ) + .detach_tenant0(conf, &TENANTS, tenant_shard_id, deletion_queue_client) .await?; spawn_background_purge(tmp_path); @@ -1923,7 +1903,6 @@ impl TenantManager { conf: &'static PageServerConf, tenants: &std::sync::RwLock, tenant_shard_id: TenantShardId, - detach_ignored: bool, deletion_queue_client: &DeletionQueueClient, ) -> Result { let tenant_dir_rename_operation = |tenant_id_to_clean: TenantShardId| async move { @@ -1946,26 +1925,6 @@ impl TenantManager { // before this tenant is potentially re-attached elsewhere. deletion_queue_client.flush_advisory(); - // Ignored tenants are not present in memory and will bail the removal from memory operation. - // Before returning the error, check for ignored tenant removal case — we only need to clean its local files then. 
- if detach_ignored - && matches!( - removal_result, - Err(TenantStateError::SlotError(TenantSlotError::NotFound(_))) - ) - { - let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id); - if tenant_ignore_mark.exists() { - info!("Detaching an ignored tenant"); - let tmp_path = tenant_dir_rename_operation(tenant_shard_id) - .await - .with_context(|| { - format!("Ignored tenant {tenant_shard_id} local directory rename") - })?; - return Ok(tmp_path); - } - } - removal_result } @@ -2222,97 +2181,6 @@ pub(crate) enum TenantStateError { Other(#[from] anyhow::Error), } -pub(crate) async fn load_tenant( - conf: &'static PageServerConf, - tenant_id: TenantId, - generation: Generation, - broker_client: storage_broker::BrokerClientChannel, - remote_storage: GenericRemoteStorage, - deletion_queue_client: DeletionQueueClient, - ctx: &RequestContext, -) -> Result<(), TenantMapInsertError> { - // This is a legacy API (replaced by `/location_conf`). It does not support sharding - let tenant_shard_id = TenantShardId::unsharded(tenant_id); - - let slot_guard = - tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?; - let tenant_path = conf.tenant_path(&tenant_shard_id); - - let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id); - if tenant_ignore_mark.exists() { - std::fs::remove_file(&tenant_ignore_mark).with_context(|| { - format!( - "Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading" - ) - })?; - } - - let resources = TenantSharedResources { - broker_client, - remote_storage, - deletion_queue_client, - }; - - let mut location_conf = - Tenant::load_tenant_config(conf, &tenant_shard_id).map_err(TenantMapInsertError::Other)?; - location_conf.attach_in_generation(AttachmentMode::Single, generation); - - Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?; - - let shard_identity = location_conf.shard; - let new_tenant = tenant_spawn( - conf, - tenant_shard_id, - &tenant_path, - resources, - AttachedTenantConf::try_from(location_conf)?, - shard_identity, - None, - &TENANTS, - SpawnMode::Eager, - ctx, - ) - .with_context(|| format!("Failed to schedule tenant processing in path {tenant_path:?}"))?; - - slot_guard.upsert(TenantSlot::Attached(new_tenant))?; - Ok(()) -} - -pub(crate) async fn ignore_tenant( - conf: &'static PageServerConf, - tenant_id: TenantId, -) -> Result<(), TenantStateError> { - ignore_tenant0(conf, &TENANTS, tenant_id).await -} - -#[instrument(skip_all, fields(shard_id))] -async fn ignore_tenant0( - conf: &'static PageServerConf, - tenants: &std::sync::RwLock, - tenant_id: TenantId, -) -> Result<(), TenantStateError> { - // This is a legacy API (replaced by `/location_conf`). 
It does not support sharding - let tenant_shard_id = TenantShardId::unsharded(tenant_id); - tracing::Span::current().record( - "shard_id", - tracing::field::display(tenant_shard_id.shard_slug()), - ); - - remove_tenant_from_memory(tenants, tenant_shard_id, async { - let ignore_mark_file = conf.tenant_ignore_mark_file_path(&tenant_shard_id); - fs::File::create(&ignore_mark_file) - .await - .context("Failed to create ignore mark file") - .and_then(|_| { - crashsafe::fsync_file_and_parent(&ignore_mark_file) - .context("Failed to fsync ignore mark file") - }) - .with_context(|| format!("Failed to crate ignore mark for tenant {tenant_shard_id}"))?; - Ok(()) - }) - .await -} - #[derive(Debug, thiserror::Error)] pub(crate) enum TenantMapListError { #[error("tenant map is still initiailizing")] @@ -2337,10 +2205,6 @@ pub(crate) enum TenantSlotError { #[error("Tenant {0} not found")] NotFound(TenantShardId), - /// When acquiring a slot with the expectation that the tenant does not already exist. - #[error("tenant {0} already exists, state: {1:?}")] - AlreadyExists(TenantShardId, TenantState), - // Tried to read a slot that is currently being mutated by another administrative // operation. #[error("tenant has a state change in progress, try again later")] @@ -2656,8 +2520,6 @@ enum TenantSlotAcquireMode { Any, /// Return an error if trying to acquire a slot and it doesn't already exist MustExist, - /// Return an error if trying to acquire a slot and it already exists - MustNotExist, } fn tenant_map_acquire_slot( @@ -2711,27 +2573,6 @@ fn tenant_map_acquire_slot_impl( tracing::debug!("Occupied, failing for InProgress"); Err(TenantSlotError::InProgress) } - (slot, MustNotExist) => match slot { - TenantSlot::Attached(tenant) => { - tracing::debug!("Attached && MustNotExist, return AlreadyExists"); - Err(TenantSlotError::AlreadyExists( - *tenant_shard_id, - tenant.current_state(), - )) - } - _ => { - // FIXME: the AlreadyExists error assumes that we have a Tenant - // to get the state from - tracing::debug!("Occupied & MustNotExist, return AlreadyExists"); - Err(TenantSlotError::AlreadyExists( - *tenant_shard_id, - TenantState::Broken { - reason: "Present but not attached".to_string(), - backtrace: "".to_string(), - }, - )) - } - }, _ => { // Happy case: the slot was not in any state that violated our mode let (completion, barrier) = utils::completion::channel(); diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b5d9a69d55..b5e40f5a46 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2700,12 +2700,6 @@ class NeonPageserver(PgProtocol, LogUtils): client = self.http_client(auth_token=auth_token) return client.tenant_create(tenant_id, conf, generation=generation) - def tenant_load(self, tenant_id: TenantId): - client = self.http_client() - return client.tenant_load( - tenant_id, generation=self.env.storage_controller.attach_hook_issue(tenant_id, self.id) - ) - def list_layers( self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId ) -> list[Path]: diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index d5441bd694..ecc83a9546 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -340,17 +340,6 @@ class PageserverHttpClient(requests.Session, MetricsGetter): self.verbose_error(res) return res - def tenant_load(self, tenant_id: TenantId, generation=None): - body = None - if generation is not None: - body = 
{"generation": generation} - res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/load", json=body) - self.verbose_error(res) - - def tenant_ignore(self, tenant_id: TenantId): - res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/ignore") - self.verbose_error(res) - def tenant_status( self, tenant_id: Union[TenantId, TenantShardId], activate: bool = False ) -> Dict[Any, Any]: diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 871351b2d5..4c49e6fb85 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -344,56 +344,6 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): pageserver_http.timeline_gc(tenant_id, timeline_id, 0) -# Creates and ignores a tenant, then detaches it: first, with no parameters (should fail), -# then with parameters to force ignored tenant detach (should not fail). -def test_tenant_detach_ignored_tenant(neon_simple_env: NeonEnv): - env = neon_simple_env - client = env.pageserver.http_client() - - # create a new tenant - tenant_id, _ = env.neon_cli.create_tenant() - - env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS) - - # assert tenant exists on disk - assert env.pageserver.tenant_dir(tenant_id).exists() - - endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) - # we rely upon autocommit after each statement - endpoint.safe_psql_many( - queries=[ - "CREATE TABLE t(key int primary key, value text)", - "INSERT INTO t SELECT generate_series(1,100000), 'payload'", - ] - ) - - # ignore tenant - client.tenant_ignore(tenant_id) - env.pageserver.allowed_errors.append(".*NotFound: tenant .*") - # ensure tenant couldn't be detached without the special flag for ignored tenant - log.info("detaching ignored tenant WITHOUT required flag") - with pytest.raises( - expected_exception=PageserverApiException, match=f"NotFound: tenant {tenant_id}" - ): - client.tenant_detach(tenant_id) - - log.info("tenant detached failed as expected") - - # ensure tenant is detached with ignore state - log.info("detaching ignored tenant with required flag") - client.tenant_detach(tenant_id, True) - log.info("ignored tenant detached without error") - - # check that nothing is left on disk for deleted tenant - assert not env.pageserver.tenant_dir(tenant_id).exists() - - # assert the tenant does not exists in the Pageserver - tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()] - assert ( - tenant_id not in tenants_after_detach - ), f"Ignored and then detached tenant {tenant_id} should not be present in pageserver's memory" - - # Creates a tenant, and detaches it with extra paremeter that forces ignored tenant detach. # Tenant should be detached without issues. def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv): @@ -500,153 +450,6 @@ def test_detach_while_attaching( cur.execute("SELECT COUNT(*) FROM foo") -# Tests that `ignore` and `get` operations' combination is able to remove and restore the tenant in pageserver's memory. 
-# * writes some data into tenant's timeline -# * ensures it's synced with the remote storage -# * `ignore` the tenant -# * verify that ignored tenant files are generally unchanged, only an ignored mark had appeared -# * verify the ignored tenant is gone from pageserver's memory -# * restart the pageserver and verify that ignored tenant is still not loaded -# * `load` the same tenant -# * ensure that it's status is `Active` and it's present in pageserver's memory with all timelines -def test_ignored_tenant_reattach(neon_env_builder: NeonEnvBuilder): - neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3) - env = neon_env_builder.init_start() - pageserver_http = env.pageserver.http_client() - - ignored_tenant_id, _ = env.neon_cli.create_tenant() - tenant_dir = env.pageserver.tenant_dir(ignored_tenant_id) - tenants_before_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()] - tenants_before_ignore.sort() - timelines_before_ignore = [ - timeline["timeline_id"] - for timeline in pageserver_http.timeline_list(tenant_id=ignored_tenant_id) - ] - files_before_ignore = [tenant_path for tenant_path in tenant_dir.glob("**/*")] - - # ignore the tenant and veirfy it's not present in pageserver replies, with its files still on disk - pageserver_http.tenant_ignore(ignored_tenant_id) - - files_after_ignore_with_retain = [tenant_path for tenant_path in tenant_dir.glob("**/*")] - new_files = set(files_after_ignore_with_retain) - set(files_before_ignore) - disappeared_files = set(files_before_ignore) - set(files_after_ignore_with_retain) - assert ( - len(disappeared_files) == 0 - ), f"Tenant ignore should not remove files from disk, missing: {disappeared_files}" - assert ( - len(new_files) == 1 - ), f"Only tenant ignore file should appear on disk but got: {new_files}" - - tenants_after_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()] - assert ignored_tenant_id not in tenants_after_ignore, "Ignored tenant should be missing" - assert len(tenants_after_ignore) + 1 == len( - tenants_before_ignore - ), "Only ignored tenant should be missing" - - # restart the pageserver to ensure we don't load the ignore timeline - env.pageserver.stop() - env.pageserver.start() - tenants_after_restart = [tenant["id"] for tenant in pageserver_http.tenant_list()] - tenants_after_restart.sort() - assert ( - tenants_after_restart == tenants_after_ignore - ), "Ignored tenant should not be reloaded after pageserver restart" - - # now, load it from the local files and expect it works - env.pageserver.tenant_load(tenant_id=ignored_tenant_id) - wait_until_tenant_state(pageserver_http, ignored_tenant_id, "Active", 5) - - tenants_after_attach = [tenant["id"] for tenant in pageserver_http.tenant_list()] - tenants_after_attach.sort() - assert tenants_after_attach == tenants_before_ignore, "Should have all tenants back" - - timelines_after_ignore = [ - timeline["timeline_id"] - for timeline in pageserver_http.timeline_list(tenant_id=ignored_tenant_id) - ] - assert timelines_before_ignore == timelines_after_ignore, "Should have all timelines back" - - -# Tests that it's possible to `load` tenants with missing layers and get them restored: -# * writes some data into tenant's timeline -# * ensures it's synced with the remote storage -# * `ignore` the tenant -# * removes all timeline's local layers -# * `load` the same tenant -# * ensure that it's status is `Active` -# * check that timeline data is restored -def test_ignored_tenant_download_missing_layers(neon_env_builder: NeonEnvBuilder): - 
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - env = neon_env_builder.init_start() - pageserver_http = env.pageserver.http_client() - endpoint = env.endpoints.create_start("main") - - tenant_id = env.initial_tenant - timeline_id = env.initial_timeline - - env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS) - - data_id = 1 - data_secret = "very secret secret" - insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, endpoint) - - tenants_before_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()] - tenants_before_ignore.sort() - timelines_before_ignore = [ - timeline["timeline_id"] for timeline in pageserver_http.timeline_list(tenant_id=tenant_id) - ] - - # ignore the tenant and remove its layers - pageserver_http.tenant_ignore(tenant_id) - timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id) - layers_removed = False - for dir_entry in timeline_dir.iterdir(): - if dir_entry.name.startswith("00000"): - # Looks like a layer file. Remove it - dir_entry.unlink() - layers_removed = True - assert layers_removed, f"Found no layers for tenant {timeline_dir}" - - # now, load it from the local files and expect it to work due to remote storage restoration - env.pageserver.tenant_load(tenant_id=tenant_id) - wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5) - - tenants_after_attach = [tenant["id"] for tenant in pageserver_http.tenant_list()] - tenants_after_attach.sort() - assert tenants_after_attach == tenants_before_ignore, "Should have all tenants back" - - timelines_after_ignore = [ - timeline["timeline_id"] for timeline in pageserver_http.timeline_list(tenant_id=tenant_id) - ] - assert timelines_before_ignore == timelines_after_ignore, "Should have all timelines back" - - endpoint.stop() - endpoint.start() - ensure_test_data(data_id, data_secret, endpoint) - - -# Tests that attach is never working on a tenant, ignored or not, as long as it's not absent locally -# Similarly, tests that it's not possible to schedule a `load` for tenat that's not ignored. -def test_load_negatives(neon_env_builder: NeonEnvBuilder): - neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - env = neon_env_builder.init_start() - pageserver_http = env.pageserver.http_client() - env.endpoints.create_start("main") - - tenant_id = env.initial_tenant - - env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS) - - env.pageserver.allowed_errors.append(".*tenant .*? 
already exists, state:.*") - with pytest.raises( - expected_exception=PageserverApiException, - match=f"tenant {tenant_id} already exists, state: Active", - ): - env.pageserver.tenant_load(tenant_id) - - pageserver_http.tenant_ignore(tenant_id) - - def test_detach_while_activating( neon_env_builder: NeonEnvBuilder, ): @@ -770,7 +573,7 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading( wait_until(10, 0.5, found_broken) - client.tenant_ignore(env.initial_tenant) + client.tenant_detach(env.initial_tenant) def found_cleaned_up(): m = client.get_metrics() @@ -782,7 +585,7 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading( wait_until(10, 0.5, found_cleaned_up) - env.pageserver.tenant_load(env.initial_tenant) + env.pageserver.tenant_attach(env.initial_tenant) def found_active(): m = client.get_metrics() From 82266a252c19f2848849b210a71e6c9a98a9e2e3 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Fri, 21 Jun 2024 12:36:12 +0200 Subject: [PATCH 1020/1571] Allow longer timeout for starting pageserver, safe keeper and storage controller in test cases to make test cases less flaky (#8079) ## Problem see https://github.com/neondatabase/neon/issues/8070 ## Summary of changes the neon_local subcommands to - start neon - start pageserver - start safekeeper - start storage controller get a new option -t=xx or --start-timeout=xx which allows to specify a longer timeout in seconds we wait for the process start. This is useful in test cases where the pageserver has to read a lot of layer data, like in pagebench test cases. In addition we exploit the new timeout option in the python test infrastructure (python fixtures) and modify the flaky testcase to increase the timeout from 10 seconds to 1 minute. Example from the test execution ```bash RUST_BACKTRACE=1 NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS=1 DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py ... 2024-06-19 09:29:34.590 INFO [neon_fixtures.py:1513] Running command "/instance_store/neon/target/release/neon_local storage_controller start --start-timeout=60s" 2024-06-19 09:29:36.365 INFO [broker.py:34] starting storage_broker to listen incoming connections at "127.0.0.1:15001" 2024-06-19 09:29:36.365 INFO [neon_fixtures.py:1513] Running command "/instance_store/neon/target/release/neon_local pageserver start --id=1 --start-timeout=60s" 2024-06-19 09:29:36.366 INFO [neon_fixtures.py:1513] Running command "/instance_store/neon/target/release/neon_local safekeeper start 1 --start-timeout=60s" ``` --- control_plane/src/background_process.rs | 33 ++++++---- control_plane/src/bin/neon_local.rs | 62 +++++++++++++++---- control_plane/src/broker.rs | 8 ++- control_plane/src/pageserver.rs | 12 ++-- control_plane/src/safekeeper.rs | 13 +++- control_plane/src/storage_controller.rs | 6 +- test_runner/fixtures/neon_fixtures.py | 58 +++++++++++++---- ...er_max_throughput_getpage_at_latest_lsn.py | 2 + test_runner/performance/pageserver/util.py | 5 +- 9 files changed, 147 insertions(+), 52 deletions(-) diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index 3f4ddbdb2b..a272c306e7 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -36,11 +36,11 @@ use utils::pid_file::{self, PidFileRead}; // it's waiting. If the process hasn't started/stopped after 5 seconds, // it prints a notice that it's taking long, but keeps waiting. 
// -const RETRY_UNTIL_SECS: u64 = 10; -const RETRIES: u64 = (RETRY_UNTIL_SECS * 1000) / RETRY_INTERVAL_MILLIS; -const RETRY_INTERVAL_MILLIS: u64 = 100; -const DOT_EVERY_RETRIES: u64 = 10; -const NOTICE_AFTER_RETRIES: u64 = 50; +const STOP_RETRY_TIMEOUT: Duration = Duration::from_secs(10); +const STOP_RETRIES: u128 = STOP_RETRY_TIMEOUT.as_millis() / RETRY_INTERVAL.as_millis(); +const RETRY_INTERVAL: Duration = Duration::from_millis(100); +const DOT_EVERY_RETRIES: u128 = 10; +const NOTICE_AFTER_RETRIES: u128 = 50; /// Argument to `start_process`, to indicate whether it should create pidfile or if the process creates /// it itself. @@ -52,6 +52,7 @@ pub enum InitialPidFile { } /// Start a background child process using the parameters given. +#[allow(clippy::too_many_arguments)] pub async fn start_process( process_name: &str, datadir: &Path, @@ -59,6 +60,7 @@ pub async fn start_process( args: AI, envs: EI, initial_pid_file: InitialPidFile, + retry_timeout: &Duration, process_status_check: F, ) -> anyhow::Result<()> where @@ -69,6 +71,7 @@ where // Not generic AsRef, otherwise empty `envs` prevents type inference EI: IntoIterator, { + let retries: u128 = retry_timeout.as_millis() / RETRY_INTERVAL.as_millis(); if !datadir.metadata().context("stat datadir")?.is_dir() { anyhow::bail!("`datadir` must be a directory when calling this function: {datadir:?}"); } @@ -130,7 +133,7 @@ where .unwrap(); }); - for retries in 0..RETRIES { + for retries in 0..retries { match process_started(pid, pid_file_to_check, &process_status_check).await { Ok(true) => { println!("\n{process_name} started and passed status check, pid: {pid}"); @@ -148,7 +151,7 @@ where print!("."); io::stdout().flush().unwrap(); } - thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS)); + thread::sleep(RETRY_INTERVAL); } Err(e) => { println!("error starting process {process_name:?}: {e:#}"); @@ -157,9 +160,10 @@ where } } println!(); - anyhow::bail!( - "{process_name} did not start+pass status checks within {RETRY_UNTIL_SECS} seconds" - ); + anyhow::bail!(format!( + "{} did not start+pass status checks within {:?} seconds", + process_name, retry_timeout + )); } /// Stops the process, using the pid file given. Returns Ok also if the process is already not running. 
@@ -215,7 +219,7 @@ pub fn stop_process( } pub fn wait_until_stopped(process_name: &str, pid: Pid) -> anyhow::Result<()> { - for retries in 0..RETRIES { + for retries in 0..STOP_RETRIES { match process_has_stopped(pid) { Ok(true) => { println!("\n{process_name} stopped"); @@ -231,7 +235,7 @@ pub fn wait_until_stopped(process_name: &str, pid: Pid) -> anyhow::Result<()> { print!("."); io::stdout().flush().unwrap(); } - thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS)); + thread::sleep(RETRY_INTERVAL); } Err(e) => { println!("{process_name} with pid {pid} failed to stop: {e:#}"); @@ -240,7 +244,10 @@ pub fn wait_until_stopped(process_name: &str, pid: Pid) -> anyhow::Result<()> { } } println!(); - anyhow::bail!("{process_name} with pid {pid} did not stop in {RETRY_UNTIL_SECS} seconds"); + anyhow::bail!(format!( + "{} with pid {} did not stop in {:?} seconds", + process_name, pid, STOP_RETRY_TIMEOUT + )); } fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command { diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 8fe959792b..3f656932d5 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -36,6 +36,7 @@ use std::collections::{BTreeSet, HashMap}; use std::path::PathBuf; use std::process::exit; use std::str::FromStr; +use std::time::Duration; use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR; use url::Host; use utils::{ @@ -99,7 +100,7 @@ fn main() -> Result<()> { let subcommand_result = match sub_name { "tenant" => rt.block_on(handle_tenant(sub_args, &mut env)), "timeline" => rt.block_on(handle_timeline(sub_args, &mut env)), - "start" => rt.block_on(handle_start_all(&env)), + "start" => rt.block_on(handle_start_all(&env, get_start_timeout(sub_args))), "stop" => rt.block_on(handle_stop_all(sub_args, &env)), "pageserver" => rt.block_on(handle_pageserver(sub_args, &env)), "storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)), @@ -1048,10 +1049,20 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result &Duration { + let humantime_duration = args + .get_one::("start-timeout") + .expect("invalid value for start-timeout"); + humantime_duration.as_ref() +} + async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { match sub_match.subcommand() { Some(("start", subcommand_args)) => { - if let Err(e) = get_pageserver(env, subcommand_args)?.start().await { + if let Err(e) = get_pageserver(env, subcommand_args)? 
+ .start(get_start_timeout(subcommand_args)) + .await + { eprintln!("pageserver start failed: {e}"); exit(1); } @@ -1077,7 +1088,7 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> exit(1); } - if let Err(e) = pageserver.start().await { + if let Err(e) = pageserver.start(get_start_timeout(sub_match)).await { eprintln!("pageserver start failed: {e}"); exit(1); } @@ -1105,8 +1116,8 @@ async fn handle_storage_controller( ) -> Result<()> { let svc = StorageController::from_env(env); match sub_match.subcommand() { - Some(("start", _start_match)) => { - if let Err(e) = svc.start().await { + Some(("start", start_match)) => { + if let Err(e) = svc.start(get_start_timeout(start_match)).await { eprintln!("start failed: {e}"); exit(1); } @@ -1165,7 +1176,10 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> "start" => { let extra_opts = safekeeper_extra_opts(sub_args); - if let Err(e) = safekeeper.start(extra_opts).await { + if let Err(e) = safekeeper + .start(extra_opts, get_start_timeout(sub_args)) + .await + { eprintln!("safekeeper start failed: {}", e); exit(1); } @@ -1191,7 +1205,10 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> } let extra_opts = safekeeper_extra_opts(sub_args); - if let Err(e) = safekeeper.start(extra_opts).await { + if let Err(e) = safekeeper + .start(extra_opts, get_start_timeout(sub_args)) + .await + { eprintln!("safekeeper start failed: {}", e); exit(1); } @@ -1204,15 +1221,18 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Ok(()) } -async fn handle_start_all(env: &local_env::LocalEnv) -> anyhow::Result<()> { +async fn handle_start_all( + env: &local_env::LocalEnv, + retry_timeout: &Duration, +) -> anyhow::Result<()> { // Endpoints are not started automatically - broker::start_broker_process(env).await?; + broker::start_broker_process(env, retry_timeout).await?; // Only start the storage controller if the pageserver is configured to need it if env.control_plane_api.is_some() { let storage_controller = StorageController::from_env(env); - if let Err(e) = storage_controller.start().await { + if let Err(e) = storage_controller.start(retry_timeout).await { eprintln!("storage_controller start failed: {:#}", e); try_stop_all(env, true).await; exit(1); @@ -1221,7 +1241,7 @@ async fn handle_start_all(env: &local_env::LocalEnv) -> anyhow::Result<()> { for ps_conf in &env.pageservers { let pageserver = PageServerNode::from_env(env, ps_conf); - if let Err(e) = pageserver.start().await { + if let Err(e) = pageserver.start(retry_timeout).await { eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e); try_stop_all(env, true).await; exit(1); @@ -1230,7 +1250,7 @@ async fn handle_start_all(env: &local_env::LocalEnv) -> anyhow::Result<()> { for node in env.safekeepers.iter() { let safekeeper = SafekeeperNode::from_env(env, node); - if let Err(e) = safekeeper.start(vec![]).await { + if let Err(e) = safekeeper.start(vec![], retry_timeout).await { eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e); try_stop_all(env, false).await; exit(1); @@ -1290,6 +1310,15 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { } fn cli() -> Command { + let timeout_arg = Arg::new("start-timeout") + .long("start-timeout") + .short('t') + .global(true) + .help("timeout until we fail the command, e.g. 
30s") + .value_parser(value_parser!(humantime::Duration)) + .default_value("10s") + .required(false); + let branch_name_arg = Arg::new("branch-name") .long("branch-name") .help("Name of the branch to be created or used as an alias for other services") @@ -1509,6 +1538,7 @@ fn cli() -> Command { .subcommand(Command::new("status")) .subcommand(Command::new("start") .about("Start local pageserver") + .arg(timeout_arg.clone()) ) .subcommand(Command::new("stop") .about("Stop local pageserver") @@ -1516,13 +1546,15 @@ fn cli() -> Command { ) .subcommand(Command::new("restart") .about("Restart local pageserver") + .arg(timeout_arg.clone()) ) ) .subcommand( Command::new("storage_controller") .arg_required_else_help(true) .about("Manage storage_controller") - .subcommand(Command::new("start").about("Start storage controller")) + .subcommand(Command::new("start").about("Start storage controller") + .arg(timeout_arg.clone())) .subcommand(Command::new("stop").about("Stop storage controller") .arg(stop_mode_arg.clone())) ) @@ -1534,6 +1566,7 @@ fn cli() -> Command { .about("Start local safekeeper") .arg(safekeeper_id_arg.clone()) .arg(safekeeper_extra_opt_arg.clone()) + .arg(timeout_arg.clone()) ) .subcommand(Command::new("stop") .about("Stop local safekeeper") @@ -1545,6 +1578,7 @@ fn cli() -> Command { .arg(safekeeper_id_arg) .arg(stop_mode_arg.clone()) .arg(safekeeper_extra_opt_arg) + .arg(timeout_arg.clone()) ) ) .subcommand( @@ -1579,6 +1613,7 @@ fn cli() -> Command { .arg(remote_ext_config_args) .arg(create_test_user) .arg(allow_multiple.clone()) + .arg(timeout_arg.clone()) ) .subcommand(Command::new("reconfigure") .about("Reconfigure the endpoint") @@ -1630,6 +1665,7 @@ fn cli() -> Command { .subcommand( Command::new("start") .about("Start page server and safekeepers") + .arg(timeout_arg.clone()) ) .subcommand( Command::new("stop") diff --git a/control_plane/src/broker.rs b/control_plane/src/broker.rs index f40705863b..c3cfc140da 100644 --- a/control_plane/src/broker.rs +++ b/control_plane/src/broker.rs @@ -5,13 +5,18 @@ //! ```text //! .neon/safekeepers/ //! 
``` +use std::time::Duration; + use anyhow::Context; use camino::Utf8PathBuf; use crate::{background_process, local_env}; -pub async fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { +pub async fn start_broker_process( + env: &local_env::LocalEnv, + retry_timeout: &Duration, +) -> anyhow::Result<()> { let broker = &env.broker; let listen_addr = &broker.listen_addr; @@ -27,6 +32,7 @@ pub async fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<( args, [], background_process::InitialPidFile::Create(storage_broker_pid_file_path(env)), + retry_timeout, || async { let url = broker.client_url(); let status_url = url.join("status").with_context(|| { diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 13e684da24..da4b987849 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -158,8 +158,8 @@ impl PageServerNode { .expect("non-Unicode path") } - pub async fn start(&self) -> anyhow::Result<()> { - self.start_node().await + pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> { + self.start_node(retry_timeout).await } fn pageserver_init(&self, conf: NeonLocalInitPageserverConf) -> anyhow::Result<()> { @@ -214,14 +214,15 @@ impl PageServerNode { Ok(()) } - async fn start_node(&self) -> anyhow::Result<()> { + async fn start_node(&self, retry_timeout: &Duration) -> anyhow::Result<()> { // TODO: using a thread here because start_process() is not async but we need to call check_status() let datadir = self.repo_path(); print!( - "Starting pageserver node {} at '{}' in {:?}", + "Starting pageserver node {} at '{}' in {:?}, retrying for {:?}", self.conf.id, self.pg_connection_config.raw_address(), - datadir + datadir, + retry_timeout ); io::stdout().flush().context("flush stdout")?; @@ -239,6 +240,7 @@ impl PageServerNode { args, self.pageserver_env_variables()?, background_process::InitialPidFile::Expect(self.pid_file()), + retry_timeout, || async { let st = self.check_status().await; match st { diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 4a320ce53d..a0a73f5609 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -7,6 +7,7 @@ //! 
``` use std::io::Write; use std::path::PathBuf; +use std::time::Duration; use std::{io, result}; use anyhow::Context; @@ -111,11 +112,16 @@ impl SafekeeperNode { .expect("non-Unicode path") } - pub async fn start(&self, extra_opts: Vec) -> anyhow::Result<()> { + pub async fn start( + &self, + extra_opts: Vec, + retry_timeout: &Duration, + ) -> anyhow::Result<()> { print!( - "Starting safekeeper at '{}' in '{}'", + "Starting safekeeper at '{}' in '{}', retrying for {:?}", self.pg_connection_config.raw_address(), - self.datadir_path().display() + self.datadir_path().display(), + retry_timeout, ); io::stdout().flush().unwrap(); @@ -200,6 +206,7 @@ impl SafekeeperNode { &args, self.safekeeper_env_variables()?, background_process::InitialPidFile::Expect(self.pid_file()), + retry_timeout, || async { match self.check_status().await { Ok(()) => Ok(true), diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 4f9f0ba794..1c56d5f80f 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -18,7 +18,7 @@ use pageserver_client::mgmt_api::ResponseErrorMessageExt; use postgres_backend::AuthType; use reqwest::Method; use serde::{de::DeserializeOwned, Deserialize, Serialize}; -use std::{fs, str::FromStr}; +use std::{fs, str::FromStr, time::Duration}; use tokio::process::Command; use tracing::instrument; use url::Url; @@ -224,7 +224,7 @@ impl StorageController { Ok(database_url) } - pub async fn start(&self) -> anyhow::Result<()> { + pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> { // Start a vanilla Postgres process used by the storage controller for persistence. let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone()) .unwrap() @@ -272,6 +272,7 @@ impl StorageController { db_start_args, [], background_process::InitialPidFile::Create(self.postgres_pid_file()), + retry_timeout, || self.pg_isready(&pg_bin_dir), ) .await?; @@ -326,6 +327,7 @@ impl StorageController { args, [], background_process::InitialPidFile::Create(self.pid_file()), + retry_timeout, || async { match self.ready().await { Ok(_) => Ok(true), diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b5e40f5a46..4ff1705ca4 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1177,10 +1177,10 @@ class NeonEnv: force=config.config_init_force, ) - def start(self): + def start(self, timeout_in_seconds: Optional[int] = None): # Storage controller starts first, so that pageserver /re-attach calls don't # bounce through retries on startup - self.storage_controller.start() + self.storage_controller.start(timeout_in_seconds=timeout_in_seconds) # Wait for storage controller readiness to prevent unnecessary post start-up # reconcile. 
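(Usage note, inferred from the fixture plumbing above and the `--start-timeout` flag added earlier in this patch: a test that needs a longer startup window can call e.g. `env.start(timeout_in_seconds=60)`; the CLI wrappers further below turn that into a `--start-timeout=60s` argument on the corresponding `neon_local ... start` invocation, while omitting the parameter falls back to the flag's `10s` default.)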
@@ -1196,10 +1196,18 @@ class NeonEnv: ) # The `or None` is for the linter for pageserver in self.pageservers: - futs.append(executor.submit(lambda ps=pageserver: ps.start())) + futs.append( + executor.submit( + lambda ps=pageserver: ps.start(timeout_in_seconds=timeout_in_seconds) + ) + ) for safekeeper in self.safekeepers: - futs.append(executor.submit(lambda sk=safekeeper: sk.start())) + futs.append( + executor.submit( + lambda sk=safekeeper: sk.start(timeout_in_seconds=timeout_in_seconds) + ) + ) for f in futs: f.result() @@ -1783,8 +1791,13 @@ class NeonCli(AbstractNeonCli): res.check_returncode() return res - def storage_controller_start(self): + def storage_controller_start( + self, + timeout_in_seconds: Optional[int] = None, + ): cmd = ["storage_controller", "start"] + if timeout_in_seconds is not None: + cmd.append(f"--start-timeout={timeout_in_seconds}s") return self.raw_cli(cmd) def storage_controller_stop(self, immediate: bool): @@ -1797,8 +1810,11 @@ class NeonCli(AbstractNeonCli): self, id: int, extra_env_vars: Optional[Dict[str, str]] = None, + timeout_in_seconds: Optional[int] = None, ) -> "subprocess.CompletedProcess[str]": start_args = ["pageserver", "start", f"--id={id}"] + if timeout_in_seconds is not None: + start_args.append(f"--start-timeout={timeout_in_seconds}s") storage = self.env.pageserver_remote_storage if isinstance(storage, S3Storage): @@ -1816,7 +1832,10 @@ class NeonCli(AbstractNeonCli): return self.raw_cli(cmd) def safekeeper_start( - self, id: int, extra_opts: Optional[List[str]] = None + self, + id: int, + extra_opts: Optional[List[str]] = None, + timeout_in_seconds: Optional[int] = None, ) -> "subprocess.CompletedProcess[str]": s3_env_vars = None if isinstance(self.env.safekeepers_remote_storage, S3Storage): @@ -1826,6 +1845,8 @@ class NeonCli(AbstractNeonCli): extra_opts = [f"-e={opt}" for opt in extra_opts] else: extra_opts = [] + if timeout_in_seconds is not None: + extra_opts.append(f"--start-timeout={timeout_in_seconds}s") return self.raw_cli( ["safekeeper", "start", str(id), *extra_opts], extra_env_vars=s3_env_vars ) @@ -2077,9 +2098,9 @@ class NeonStorageController(MetricsGetter, LogUtils): self.allowed_errors: list[str] = DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS self.logfile = self.workdir / "storage_controller.log" - def start(self): + def start(self, timeout_in_seconds: Optional[int] = None): assert not self.running - self.env.neon_cli.storage_controller_start() + self.env.neon_cli.storage_controller_start(timeout_in_seconds) self.running = True return self @@ -2531,6 +2552,7 @@ class NeonPageserver(PgProtocol, LogUtils): def start( self, extra_env_vars: Optional[Dict[str, str]] = None, + timeout_in_seconds: Optional[int] = None, ) -> "NeonPageserver": """ Start the page server. @@ -2539,7 +2561,9 @@ class NeonPageserver(PgProtocol, LogUtils): """ assert self.running is False - self.env.neon_cli.pageserver_start(self.id, extra_env_vars=extra_env_vars) + self.env.neon_cli.pageserver_start( + self.id, extra_env_vars=extra_env_vars, timeout_in_seconds=timeout_in_seconds + ) self.running = True return self @@ -2553,13 +2577,17 @@ class NeonPageserver(PgProtocol, LogUtils): self.running = False return self - def restart(self, immediate: bool = False): + def restart( + self, + immediate: bool = False, + timeout_in_seconds: Optional[int] = None, + ): """ High level wrapper for restart: restarts the process, and waits for tenant state to stabilize. 
""" self.stop(immediate=immediate) - self.start() + self.start(timeout_in_seconds=timeout_in_seconds) self.quiesce_tenants() def quiesce_tenants(self): @@ -3835,9 +3863,13 @@ class Safekeeper(LogUtils): self.running = running self.logfile = Path(self.data_dir) / f"safekeeper-{id}.log" - def start(self, extra_opts: Optional[List[str]] = None) -> "Safekeeper": + def start( + self, extra_opts: Optional[List[str]] = None, timeout_in_seconds: Optional[int] = None + ) -> "Safekeeper": assert self.running is False - self.env.neon_cli.safekeeper_start(self.id, extra_opts=extra_opts) + self.env.neon_cli.safekeeper_start( + self.id, extra_opts=extra_opts, timeout_in_seconds=timeout_in_seconds + ) self.running = True # wait for wal acceptor start by checking its status started_at = time.time() diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py index 68f3d9dcbe..1d579214b0 100644 --- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py +++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py @@ -85,6 +85,8 @@ def test_pageserver_max_throughput_getpage_at_latest_lsn( f"max_throughput_latest_lsn-{n_tenants}-{pgbench_scale}", n_tenants, setup_wrapper, + # https://github.com/neondatabase/neon/issues/8070 + timeout_in_seconds=60, ) env.pageserver.allowed_errors.append( diff --git a/test_runner/performance/pageserver/util.py b/test_runner/performance/pageserver/util.py index f31cd9a9f8..92e05663ce 100644 --- a/test_runner/performance/pageserver/util.py +++ b/test_runner/performance/pageserver/util.py @@ -2,7 +2,7 @@ Utilities used by all code in this sub-directory """ -from typing import Any, Callable, Dict, Tuple +from typing import Any, Callable, Dict, Optional, Tuple import fixtures.pageserver.many_tenants as many_tenants from fixtures.common_types import TenantId, TimelineId @@ -41,6 +41,7 @@ def setup_pageserver_with_tenants( name: str, n_tenants: int, setup: Callable[[NeonEnv], Tuple[TenantId, TimelineId, Dict[str, Any]]], + timeout_in_seconds: Optional[int] = None, ) -> NeonEnv: """ Utility function to set up a pageserver with a given number of identical tenants. @@ -50,6 +51,6 @@ def setup_pageserver_with_tenants( return many_tenants.single_timeline(neon_env_builder, setup, n_tenants) env = neon_env_builder.build_and_use_snapshot(name, doit) - env.start() + env.start(timeout_in_seconds=timeout_in_seconds) ensure_pageserver_ready_for_benchmarking(env, n_tenants) return env From f45cf28247cad5fc1d813881d4869efd10f1fef2 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Fri, 21 Jun 2024 13:15:02 +0100 Subject: [PATCH 1021/1571] Add eviction_state to control file (#8125) This is a preparation for #8022, to make the PR both backwards and foward compatible. This commit adds `eviction_state` field to control file. Adds support for reading it, but writes control file in old format where possible, to keep the disk format forward compatible. 
Note: in `patch_control_file`, new field gets serialized to json like this: - `"eviction_state": "Present"` - `"eviction_state": {"Offloaded": "0/8F"}` --- safekeeper/src/control_file.rs | 19 ++++-- safekeeper/src/control_file_upgrade.rs | 94 +++++++++++++++++++++++++- safekeeper/src/safekeeper.rs | 5 +- safekeeper/src/state.rs | 16 +++++ 4 files changed, 128 insertions(+), 6 deletions(-) diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index 9d65187350..8e9031fae4 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -12,15 +12,16 @@ use std::ops::Deref; use std::path::Path; use std::time::Instant; +use crate::control_file_upgrade::downgrade_v9_to_v8; use crate::metrics::PERSIST_CONTROL_FILE_SECONDS; -use crate::state::TimelinePersistentState; +use crate::state::{EvictionState, TimelinePersistentState}; use crate::{control_file_upgrade::upgrade_control_file, timeline::get_timeline_dir}; use utils::{bin_ser::LeSer, id::TenantTimelineId}; use crate::SafeKeeperConf; pub const SK_MAGIC: u32 = 0xcafeceefu32; -pub const SK_FORMAT_VERSION: u32 = 8; +pub const SK_FORMAT_VERSION: u32 = 9; // contains persistent metadata for safekeeper pub const CONTROL_FILE_NAME: &str = "safekeeper.control"; @@ -178,8 +179,18 @@ impl Storage for FileStorage { })?; let mut buf: Vec = Vec::new(); WriteBytesExt::write_u32::(&mut buf, SK_MAGIC)?; - WriteBytesExt::write_u32::(&mut buf, SK_FORMAT_VERSION)?; - s.ser_into(&mut buf)?; + + if s.eviction_state == EvictionState::Present { + // temp hack for forward compatibility + const PREV_FORMAT_VERSION: u32 = 8; + let prev = downgrade_v9_to_v8(s); + WriteBytesExt::write_u32::(&mut buf, PREV_FORMAT_VERSION)?; + prev.ser_into(&mut buf)?; + } else { + // otherwise, we write the current format version + WriteBytesExt::write_u32::(&mut buf, SK_FORMAT_VERSION)?; + s.ser_into(&mut buf)?; + } // calculate checksum before resize let checksum = crc32c::crc32c(&buf); diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index 8f4dfe9b43..a4b4670e42 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -1,7 +1,7 @@ //! Code to deal with safekeeper control file upgrades use crate::{ safekeeper::{AcceptorState, PgUuid, ServerInfo, Term, TermHistory, TermLsn}, - state::{PersistedPeers, TimelinePersistentState}, + state::{EvictionState, PersistedPeers, TimelinePersistentState}, wal_backup_partial, }; use anyhow::{bail, Result}; @@ -183,6 +183,55 @@ pub struct SafeKeeperStateV7 { pub peers: PersistedPeers, } +/// Persistent information stored on safekeeper node about timeline. +/// On disk data is prefixed by magic and format version and followed by checksum. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct SafeKeeperStateV8 { + #[serde(with = "hex")] + pub tenant_id: TenantId, + #[serde(with = "hex")] + pub timeline_id: TimelineId, + /// persistent acceptor state + pub acceptor_state: AcceptorState, + /// information about server + pub server: ServerInfo, + /// Unique id of the last *elected* proposer we dealt with. Not needed + /// for correctness, exists for monitoring purposes. + #[serde(with = "hex")] + pub proposer_uuid: PgUuid, + /// Since which LSN this timeline generally starts. Safekeeper might have + /// joined later. + pub timeline_start_lsn: Lsn, + /// Since which LSN safekeeper has (had) WAL for this timeline. 
+ /// All WAL segments next to one containing local_start_lsn are + /// filled with data from the beginning. + pub local_start_lsn: Lsn, + /// Part of WAL acknowledged by quorum *and available locally*. Always points + /// to record boundary. + pub commit_lsn: Lsn, + /// LSN that points to the end of the last backed up segment. Useful to + /// persist to avoid finding out offloading progress on boot. + pub backup_lsn: Lsn, + /// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn + /// of last record streamed to everyone). Persisting it helps skipping + /// recovery in walproposer, generally we compute it from peers. In + /// walproposer proto called 'truncate_lsn'. Updates are currently drived + /// only by walproposer. + pub peer_horizon_lsn: Lsn, + /// LSN of the oldest known checkpoint made by pageserver and successfully + /// pushed to s3. We don't remove WAL beyond it. Persisted only for + /// informational purposes, we receive it from pageserver (or broker). + pub remote_consistent_lsn: Lsn, + /// Peers and their state as we remember it. Knowing peers themselves is + /// fundamental; but state is saved here only for informational purposes and + /// obviously can be stale. (Currently not saved at all, but let's provision + /// place to have less file version upgrades). + pub peers: PersistedPeers, + /// Holds names of partial segments uploaded to remote storage. Used to + /// clean up old objects without leaving garbage in remote storage. + pub partial_backup: wal_backup_partial::State, +} + pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result { // migrate to storing full term history if version == 1 { @@ -213,6 +262,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result Result Result Result Result Result SafeKeeperStateV8 { + assert!(state.eviction_state == EvictionState::Present); + SafeKeeperStateV8 { + tenant_id: state.tenant_id, + timeline_id: state.timeline_id, + acceptor_state: state.acceptor_state.clone(), + server: state.server.clone(), + proposer_uuid: state.proposer_uuid, + timeline_start_lsn: state.timeline_start_lsn, + local_start_lsn: state.local_start_lsn, + commit_lsn: state.commit_lsn, + backup_lsn: state.backup_lsn, + peer_horizon_lsn: state.peer_horizon_lsn, + remote_consistent_lsn: state.remote_consistent_lsn, + peers: state.peers.clone(), + partial_backup: state.partial_backup.clone(), + } +} + #[cfg(test)] mod tests { use std::str::FromStr; diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index ae230960ae..666ffdf0ce 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -958,7 +958,7 @@ mod tests { use super::*; use crate::{ - state::{PersistedPeers, TimelinePersistentState}, + state::{EvictionState, PersistedPeers, TimelinePersistentState}, wal_storage::Storage, }; use std::{ops::Deref, str::FromStr, time::Instant}; @@ -1225,6 +1225,7 @@ mod tests { }, )]), partial_backup: crate::wal_backup_partial::State::default(), + eviction_state: EvictionState::Present, }; let ser = state.ser().unwrap(); @@ -1272,6 +1273,8 @@ mod tests { 0xb0, 0x01, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, // partial_backup 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + // eviction_state + 0x00, 0x00, 0x00, 0x00, ]; assert_eq!(Hex(&ser), Hex(&expected)); diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs index be5e516296..e0f7b65aef 100644 --- a/safekeeper/src/state.rs +++ b/safekeeper/src/state.rs @@ -63,11 +63,26 @@ pub struct TimelinePersistentState { /// Holds names of partial 
segments uploaded to remote storage. Used to /// clean up old objects without leaving garbage in remote storage. pub partial_backup: wal_backup_partial::State, + /// Eviction state of the timeline. If it's Offloaded, we should download + /// WAL files from remote storage to serve the timeline. + pub eviction_state: EvictionState, } #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub struct PersistedPeers(pub Vec<(NodeId, PersistedPeerInfo)>); +/// State of the local WAL files. Used to track current timeline state, +/// that can be either WAL files are present on disk or last partial segment +/// is offloaded to remote storage. +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq)] +pub enum EvictionState { + /// WAL files are present on disk. + Present, + /// Last partial segment is offloaded to remote storage. + /// Contains flush_lsn of the last offloaded segment. + Offloaded(Lsn), +} + impl TimelinePersistentState { pub fn new( ttid: &TenantTimelineId, @@ -98,6 +113,7 @@ impl TimelinePersistentState { .collect(), ), partial_backup: wal_backup_partial::State::default(), + eviction_state: EvictionState::Present, } } From 15728be0e1972c1b6971e41f599d5059d19ce5e0 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 21 Jun 2024 15:39:19 +0100 Subject: [PATCH 1022/1571] pageserver: always detach before deleting (#8082) In #7957 we enabled deletion without attachment, but retained the old-style deletion (return 202, delete in background) for attached tenants. In this PR, we remove the old-style deletion path, such that if the tenant delete API is invoked while a tenant is detached, it is simply detached before completing the deletion. This intentionally doesn't rip out all the old deletion code: in case a deletion was in progress at time of upgrade, we keep around the code for finishing it for one release cycle. The rest of the code removal happens in https://github.com/neondatabase/neon/pull/8091 Now that deletion will always be via the new path, the new path is also updated to use some retries around remote storage operations, to tripping up the control plane with 500s if S3 has an intermittent issue. --- pageserver/src/http/openapi_spec.yml | 25 +- pageserver/src/http/routes.rs | 16 +- pageserver/src/tenant/delete.rs | 240 +--------- pageserver/src/tenant/mgr.rs | 173 +++---- test_runner/fixtures/pageserver/utils.py | 46 -- test_runner/performance/test_bulk_insert.py | 2 - .../regress/test_pageserver_secondary.py | 7 +- test_runner/regress/test_s3_restore.py | 5 +- .../regress/test_storage_controller.py | 3 +- test_runner/regress/test_tenant_delete.py | 440 ++---------------- test_runner/regress/test_tenant_relocation.py | 4 - test_runner/regress/test_tenant_size.py | 3 +- .../regress/test_timeline_detach_ancestor.py | 3 +- test_runner/regress/test_timeline_size.py | 21 +- 14 files changed, 123 insertions(+), 865 deletions(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 4b6fe56b89..1bc8fe9066 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -78,29 +78,14 @@ paths: delete: description: | - Attempts to delete specified tenant. 500, 503 and 409 errors should be retried until 404 is retrieved. - 404 means that deletion successfully finished" + Attempts to delete specified tenant. 500, 503 and 409 errors should be retried. Deleting + a non-existent tenant is considered successful (returns 200). 
responses: "200": description: Tenant was successfully deleted, or was already not found. - "404": - description: Tenant not found. This is a success result, equivalent to 200. - content: - application/json: - schema: - $ref: "#/components/schemas/NotFoundError" - "409": - description: Deletion is already in progress, continue polling - content: - application/json: - schema: - $ref: "#/components/schemas/ConflictError" - "412": - description: Deletion may not proceed, tenant is not in Active state - content: - application/json: - schema: - $ref: "#/components/schemas/PreconditionFailedError" + "503": + description: Service is unavailable, or tenant is already being modified (perhaps concurrently deleted) + /v1/tenant/{tenant_id}/time_travel_remote_storage: parameters: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index eb74ca637f..b5713a8cb4 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -334,13 +334,10 @@ impl From for ApiError { use crate::tenant::delete::DeleteTenantError::*; match value { Get(g) => ApiError::from(g), - e @ AlreadyInProgress => ApiError::Conflict(e.to_string()), Timeline(t) => ApiError::from(t), - NotAttached => ApiError::NotFound(anyhow::anyhow!("Tenant is not attached").into()), SlotError(e) => e.into(), SlotUpsertError(e) => e.into(), Other(o) => ApiError::InternalServerError(o), - e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()), Cancelled => ApiError::ShuttingDown, } } @@ -1015,23 +1012,16 @@ async fn tenant_delete_handler( let state = get_state(&request); - let status = state + state .tenant_manager - .delete_tenant(tenant_shard_id, ACTIVE_TENANT_TIMEOUT) + .delete_tenant(tenant_shard_id) .instrument(info_span!("tenant_delete_handler", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug() )) .await?; - // Callers use 404 as success for deletions, for historical reasons. - if status == StatusCode::NOT_FOUND { - return Err(ApiError::NotFound( - anyhow::anyhow!("Deletion complete").into(), - )); - } - - json_response(status, ()) + json_response(StatusCode::OK, ()) } /// HTTP endpoint to query the current tenant_size of a tenant. diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index 8b36aa15e5..d9da3157b7 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -6,25 +6,23 @@ use pageserver_api::{models::TenantState, shard::TenantShardId}; use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use tokio::sync::OwnedMutexGuard; use tokio_util::sync::CancellationToken; -use tracing::{error, instrument, Instrument}; +use tracing::{error, Instrument}; use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId, pausable_failpoint}; use crate::{ config::PageServerConf, context::RequestContext, - task_mgr::{self, TaskKind}, + task_mgr::{self}, tenant::{ mgr::{TenantSlot, TenantsMapRemoveResult}, remote_timeline_client::remote_heatmap_path, - timeline::ShutdownMode, }, }; use super::{ mgr::{GetTenantError, TenantSlotError, TenantSlotUpsertError, TenantsMap}, remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD}, - span, timeline::delete::DeleteTimelineFlow, tree_sort_timelines, DeleteTimelineError, Tenant, TenantPreload, }; @@ -34,15 +32,6 @@ pub(crate) enum DeleteTenantError { #[error("GetTenant {0}")] Get(#[from] GetTenantError), - #[error("Tenant not attached")] - NotAttached, - - #[error("Invalid state {0}. 
Expected Active or Broken")] - InvalidState(TenantState), - - #[error("Tenant deletion is already in progress")] - AlreadyInProgress, - #[error("Tenant map slot error {0}")] SlotError(#[from] TenantSlotError), @@ -74,56 +63,6 @@ fn remote_tenant_delete_mark_path( Ok(tenant_remote_path.join(Utf8Path::new("timelines/deleted"))) } -async fn create_remote_delete_mark( - conf: &PageServerConf, - remote_storage: &GenericRemoteStorage, - tenant_shard_id: &TenantShardId, - cancel: &CancellationToken, -) -> Result<(), DeleteTenantError> { - let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?; - - let data: &[u8] = &[]; - backoff::retry( - || async { - let data = bytes::Bytes::from_static(data); - let stream = futures::stream::once(futures::future::ready(Ok(data))); - remote_storage - .upload(stream, 0, &remote_mark_path, None, cancel) - .await - }, - TimeoutOrCancel::caused_by_cancel, - FAILED_UPLOAD_WARN_THRESHOLD, - FAILED_REMOTE_OP_RETRIES, - "mark_upload", - cancel, - ) - .await - .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) - .and_then(|x| x) - .context("mark_upload")?; - - Ok(()) -} - -async fn create_local_delete_mark( - conf: &PageServerConf, - tenant_shard_id: &TenantShardId, -) -> Result<(), DeleteTenantError> { - let marker_path = conf.tenant_deleted_mark_file_path(tenant_shard_id); - - // Note: we're ok to replace existing file. - let _ = std::fs::OpenOptions::new() - .write(true) - .create(true) - .truncate(true) - .open(&marker_path) - .with_context(|| format!("could not create delete marker file {marker_path:?}"))?; - - crashsafe::fsync_file_and_parent(&marker_path).context("sync_mark")?; - - Ok(()) -} - async fn schedule_ordered_timeline_deletions( tenant: &Arc, ) -> Result>, TimelineId)>, DeleteTenantError> { @@ -262,21 +201,6 @@ async fn cleanup_remaining_fs_traces( Ok(()) } -/// Orchestrates tenant shut down of all tasks, removes its in-memory structures, -/// and deletes its data from both disk and s3. -/// The sequence of steps: -/// 1. Upload remote deletion mark. -/// 2. Create local mark file. -/// 3. Shutdown tasks -/// 4. Run ordered timeline deletions -/// 5. Wait for timeline deletion operations that were scheduled before tenant deletion was requested -/// 6. Remove remote mark -/// 7. Cleanup remaining fs traces, tenant dir, config, timelines dir, local delete mark -/// It is resumable from any step in case a crash/restart occurs. -/// There are two entrypoints to the process: -/// 1. [`DeleteTenantFlow::run`] this is the main one called by a management api handler. -/// 2. [`DeleteTenantFlow::resume_from_attach`] is called when deletion is resumed tenant is found to be deleted during attach process. -/// Note the only other place that messes around timeline delete mark is the `Tenant::spawn_load` function. #[derive(Default)] pub enum DeleteTenantFlow { #[default] @@ -286,91 +210,6 @@ pub enum DeleteTenantFlow { } impl DeleteTenantFlow { - // These steps are run in the context of management api request handler. - // Long running steps are continued to run in the background. - // NB: If this fails half-way through, and is retried, the retry will go through - // all the same steps again. Make sure the code here is idempotent, and don't - // error out if some of the shutdown tasks have already been completed! - // NOTE: static needed for background part. - // We assume that calling code sets up the span with tenant_id. 
- #[instrument(skip_all)] - pub(crate) async fn run( - conf: &'static PageServerConf, - remote_storage: GenericRemoteStorage, - tenants: &'static std::sync::RwLock, - tenant: Arc, - cancel: &CancellationToken, - ) -> Result<(), DeleteTenantError> { - span::debug_assert_current_span_has_tenant_id(); - - pausable_failpoint!("tenant-delete-before-run"); - - let mut guard = Self::prepare(&tenant).await?; - - if let Err(e) = Self::run_inner(&mut guard, conf, &remote_storage, &tenant, cancel).await { - tenant.set_broken(format!("{e:#}")).await; - return Err(e); - } - - Self::schedule_background(guard, conf, remote_storage, tenants, tenant); - - Ok(()) - } - - // Helper function needed to be able to match once on returned error and transition tenant into broken state. - // This is needed because tenant.shutwodn is not idempotent. If tenant state is set to stopping another call to tenant.shutdown - // will result in an error, but here we need to be able to retry shutdown when tenant deletion is retried. - // So the solution is to set tenant state to broken. - async fn run_inner( - guard: &mut OwnedMutexGuard, - conf: &'static PageServerConf, - remote_storage: &GenericRemoteStorage, - tenant: &Tenant, - cancel: &CancellationToken, - ) -> Result<(), DeleteTenantError> { - guard.mark_in_progress()?; - - fail::fail_point!("tenant-delete-before-create-remote-mark", |_| { - Err(anyhow::anyhow!( - "failpoint: tenant-delete-before-create-remote-mark" - ))? - }); - - create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel) - .await - .context("remote_mark")?; - - fail::fail_point!("tenant-delete-before-create-local-mark", |_| { - Err(anyhow::anyhow!( - "failpoint: tenant-delete-before-create-local-mark" - ))? - }); - - create_local_delete_mark(conf, &tenant.tenant_shard_id) - .await - .context("local delete mark")?; - - fail::fail_point!("tenant-delete-before-background", |_| { - Err(anyhow::anyhow!( - "failpoint: tenant-delete-before-background" - ))? - }); - - Ok(()) - } - - fn mark_in_progress(&mut self) -> anyhow::Result<()> { - match self { - Self::Finished => anyhow::bail!("Bug. Is in finished state"), - Self::InProgress { .. } => { /* We're in a retry */ } - Self::NotStarted => { /* Fresh start */ } - } - - *self = Self::InProgress; - - Ok(()) - } - pub(crate) async fn should_resume_deletion( conf: &'static PageServerConf, remote_mark_exists: bool, @@ -428,79 +267,6 @@ impl DeleteTenantFlow { .await } - /// Check whether background deletion of this tenant is currently in progress - pub(crate) fn is_in_progress(tenant: &Tenant) -> bool { - tenant.delete_progress.try_lock().is_err() - } - - async fn prepare( - tenant: &Arc, - ) -> Result, DeleteTenantError> { - // FIXME: unsure about active only. Our init jobs may not be cancellable properly, - // so at least for now allow deletions only for active tenants. TODO recheck - // Broken and Stopping is needed for retries. - if !matches!( - tenant.current_state(), - TenantState::Active | TenantState::Broken { .. } - ) { - return Err(DeleteTenantError::InvalidState(tenant.current_state())); - } - - let guard = Arc::clone(&tenant.delete_progress) - .try_lock_owned() - .map_err(|_| DeleteTenantError::AlreadyInProgress)?; - - fail::fail_point!("tenant-delete-before-shutdown", |_| { - Err(anyhow::anyhow!("failpoint: tenant-delete-before-shutdown"))? 
- }); - - // make pageserver shutdown not to wait for our completion - let (_, progress) = completion::channel(); - - // It would be good to only set stopping here and continue shutdown in the background, but shutdown is not idempotent. - // i e it is an error to do: - // tenant.set_stopping - // tenant.shutdown - // Its also bad that we're holding tenants.read here. - // TODO relax set_stopping to be idempotent? - if tenant.shutdown(progress, ShutdownMode::Hard).await.is_err() { - return Err(DeleteTenantError::Other(anyhow::anyhow!( - "tenant shutdown is already in progress" - ))); - } - - Ok(guard) - } - - fn schedule_background( - guard: OwnedMutexGuard, - conf: &'static PageServerConf, - remote_storage: GenericRemoteStorage, - tenants: &'static std::sync::RwLock, - tenant: Arc, - ) { - let tenant_shard_id = tenant.tenant_shard_id; - - task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), - TaskKind::TimelineDeletionWorker, - Some(tenant_shard_id), - None, - "tenant_delete", - false, - async move { - if let Err(err) = - Self::background(guard, conf, remote_storage, tenants, &tenant).await - { - error!("Error: {err:#}"); - tenant.set_broken(format!("{err:#}")).await; - }; - Ok(()) - } - .instrument(tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())), - ); - } - async fn background( mut guard: OwnedMutexGuard, conf: &PageServerConf, @@ -580,8 +346,6 @@ impl DeleteTenantFlow { .context("cleanup_remaining_fs_traces")?; { - pausable_failpoint!("tenant-delete-before-map-remove"); - // This block is simply removing the TenantSlot for this tenant. It requires a loop because // we might conflict with a TenantSlot::InProgress marker and need to wait for it. // diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index f61526f8c2..326086a3cc 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -3,7 +3,6 @@ use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf}; use futures::StreamExt; -use hyper::StatusCode; use itertools::Itertools; use pageserver_api::key::Key; use pageserver_api::models::LocationConfigMode; @@ -27,7 +26,7 @@ use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::{completion, crashsafe}; +use utils::{backoff, completion, crashsafe}; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; @@ -41,7 +40,6 @@ use crate::task_mgr::{self, TaskKind}; use crate::tenant::config::{ AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig, }; -use crate::tenant::delete::DeleteTenantFlow; use crate::tenant::span::debug_assert_current_span_has_tenant_id; use crate::tenant::storage_layer::inmemory_layer; use crate::tenant::timeline::ShutdownMode; @@ -1354,56 +1352,10 @@ impl TenantManager { } } - pub(crate) async fn delete_tenant( + async fn delete_tenant_remote( &self, tenant_shard_id: TenantShardId, - activation_timeout: Duration, - ) -> Result { - super::span::debug_assert_current_span_has_tenant_id(); - // We acquire a SlotGuard during this function to protect against concurrent - // changes while the ::prepare phase of DeleteTenantFlow executes, but then - // have to return the Tenant to the map while the background deletion runs. - // - // TODO: refactor deletion to happen outside the lifetime of a Tenant. 
- // Currently, deletion requires a reference to the tenants map in order to - // keep the Tenant in the map until deletion is complete, and then remove - // it at the end. - // - // See https://github.com/neondatabase/neon/issues/5080 - - // Tenant deletion can happen two ways: - // - Legacy: called on an attached location. The attached Tenant object stays alive in Stopping - // state until deletion is complete. - // - New: called on a pageserver without an attached location. We proceed with deletion from - // remote storage. - // - // See https://github.com/neondatabase/neon/issues/5080 for more context on this transition. - - let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?; - match &slot_guard.old_value { - Some(TenantSlot::Attached(tenant)) => { - // Legacy deletion flow: the tenant remains attached, goes to Stopping state, and - // deletion will be resumed across restarts. - let tenant = tenant.clone(); - return self - .delete_tenant_attached(slot_guard, tenant, activation_timeout) - .await; - } - Some(TenantSlot::Secondary(secondary_tenant)) => { - secondary_tenant.shutdown().await; - let local_tenant_directory = self.conf.tenant_path(&tenant_shard_id); - let tmp_dir = safe_rename_tenant_dir(&local_tenant_directory) - .await - .with_context(|| { - format!("local tenant directory {local_tenant_directory:?} rename") - })?; - spawn_background_purge(tmp_dir); - } - Some(TenantSlot::InProgress(_)) => unreachable!(), - None => {} - }; - - // Fall through: local state for this tenant is no longer present, proceed with remote delete + ) -> Result<(), DeleteTenantError> { let remote_path = remote_tenant_path(&tenant_shard_id); let keys = match self .resources @@ -1420,7 +1372,7 @@ impl TenantManager { Err(remote_storage::DownloadError::Cancelled) => { return Err(DeleteTenantError::Cancelled) } - Err(remote_storage::DownloadError::NotFound) => return Ok(StatusCode::NOT_FOUND), + Err(remote_storage::DownloadError::NotFound) => return Ok(()), Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))), }; @@ -1434,60 +1386,83 @@ impl TenantManager { .await?; } - // Callers use 404 as success for deletions, for historical reasons. - Ok(StatusCode::NOT_FOUND) + Ok(()) } - async fn delete_tenant_attached( + /// If a tenant is attached, detach it. Then remove its data from remote storage. + /// + /// A tenant is considered deleted once it is gone from remote storage. It is the caller's + /// responsibility to avoid trying to attach the tenant again or use it any way once deletion + /// has started: this operation is not atomic, and must be retried until it succeeds. + pub(crate) async fn delete_tenant( &self, - slot_guard: SlotGuard, - tenant: Arc, - activation_timeout: Duration, - ) -> Result { - match tenant.current_state() { - TenantState::Broken { .. } | TenantState::Stopping { .. } => { - // If deletion is already in progress, return success (the semantics of this - // function are to rerturn success afterr deletion is spawned in background). - // Otherwise fall through and let [`DeleteTenantFlow`] handle this state. 
- if DeleteTenantFlow::is_in_progress(&tenant) { - // The `delete_progress` lock is held: deletion is already happening - // in the bacckground - slot_guard.revert(); - return Ok(StatusCode::ACCEPTED); - } - } - _ => { - tenant - .wait_to_become_active(activation_timeout) - .await - .map_err(|e| match e { - GetActiveTenantError::WillNotBecomeActive(_) - | GetActiveTenantError::Broken(_) => { - DeleteTenantError::InvalidState(tenant.current_state()) - } - GetActiveTenantError::Cancelled => DeleteTenantError::Cancelled, - GetActiveTenantError::NotFound(_) => DeleteTenantError::NotAttached, - GetActiveTenantError::WaitForActiveTimeout { - latest_state: _latest_state, - wait_time: _wait_time, - } => DeleteTenantError::InvalidState(tenant.current_state()), - })?; - } + tenant_shard_id: TenantShardId, + ) -> Result<(), DeleteTenantError> { + super::span::debug_assert_current_span_has_tenant_id(); + + async fn delete_local( + conf: &PageServerConf, + tenant_shard_id: &TenantShardId, + ) -> anyhow::Result<()> { + let local_tenant_directory = conf.tenant_path(tenant_shard_id); + let tmp_dir = safe_rename_tenant_dir(&local_tenant_directory) + .await + .with_context(|| { + format!("local tenant directory {local_tenant_directory:?} rename") + })?; + spawn_background_purge(tmp_dir); + Ok(()) } - let result = DeleteTenantFlow::run( - self.conf, - self.resources.remote_storage.clone(), - &TENANTS, - tenant, + let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?; + match &slot_guard.old_value { + Some(TenantSlot::Attached(tenant)) => { + // Legacy deletion flow: the tenant remains attached, goes to Stopping state, and + // deletion will be resumed across restarts. + let tenant = tenant.clone(); + let (_guard, progress) = utils::completion::channel(); + match tenant.shutdown(progress, ShutdownMode::Hard).await { + Ok(()) => {} + Err(barrier) => { + info!("Shutdown already in progress, waiting for it to complete"); + barrier.wait().await; + } + } + delete_local(self.conf, &tenant_shard_id).await?; + } + Some(TenantSlot::Secondary(secondary_tenant)) => { + secondary_tenant.shutdown().await; + + delete_local(self.conf, &tenant_shard_id).await?; + } + Some(TenantSlot::InProgress(_)) => unreachable!(), + None => {} + }; + + // Fall through: local state for this tenant is no longer present, proceed with remote delete. + // - We use a retry wrapper here so that common transient S3 errors (e.g. 503, 429) do not result + // in 500 responses to delete requests. + // - We keep the `SlotGuard` during this I/O, so that if a concurrent delete request comes in, it will + // 503/retry, rather than kicking off a wasteful concurrent deletion. 
+ match backoff::retry( + || async move { self.delete_tenant_remote(tenant_shard_id).await }, + |e| match e { + DeleteTenantError::Cancelled => true, + DeleteTenantError::SlotError(_) => { + unreachable!("Remote deletion doesn't touch slots") + } + _ => false, + }, + 1, + 3, + &format!("delete_tenant[tenant_shard_id={tenant_shard_id}]"), &self.cancel, ) - .await; - - // The Tenant goes back into the map in Stopping state, it will eventually be removed by DeleteTenantFLow - slot_guard.revert(); - let () = result?; - Ok(StatusCode::ACCEPTED) + .await + { + Some(r) => r, + None => Err(DeleteTenantError::Cancelled), + } } #[instrument(skip_all, fields(tenant_id=%tenant.get_tenant_shard_id().tenant_id, shard_id=%tenant.get_tenant_shard_id().shard_slug(), new_shard_count=%new_shard_count.literal()))] diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 72384c138b..60535b7592 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -430,52 +430,6 @@ def enable_remote_storage_versioning( return response -def wait_tenant_status_404( - pageserver_http: PageserverHttpClient, - tenant_id: TenantId, - iterations: int, - interval: float = 0.250, -): - def tenant_is_missing(): - data = {} - try: - data = pageserver_http.tenant_status(tenant_id) - log.info(f"tenant status {data}") - except PageserverApiException as e: - log.debug(e) - if e.status_code == 404: - return - - raise RuntimeError(f"Timeline exists state {data.get('state')}") - - wait_until(iterations, interval=interval, func=tenant_is_missing) - - -def tenant_delete_wait_completed( - pageserver_http: PageserverHttpClient, - tenant_id: TenantId, - iterations: int, - ignore_errors: bool = False, -): - if not ignore_errors: - pageserver_http.tenant_delete(tenant_id=tenant_id) - else: - interval = 0.5 - - def delete_request_sent(): - try: - pageserver_http.tenant_delete(tenant_id=tenant_id) - except PageserverApiException as e: - log.debug(e) - if e.status_code == 404: - return - except Exception as e: - log.debug(e) - - wait_until(iterations, interval=interval, func=delete_request_sent) - wait_tenant_status_404(pageserver_http, tenant_id=tenant_id, iterations=iterations) - - MANY_SMALL_LAYERS_TENANT_CONFIG = { "gc_period": "0s", "compaction_period": "0s", diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py index 3f56da7c1d..3dad348976 100644 --- a/test_runner/performance/test_bulk_insert.py +++ b/test_runner/performance/test_bulk_insert.py @@ -4,7 +4,6 @@ import pytest from fixtures.benchmark_fixture import MetricReport from fixtures.common_types import Lsn from fixtures.compare_fixtures import NeonCompare, PgCompare -from fixtures.pageserver.utils import wait_tenant_status_404 from fixtures.pg_version import PgVersion @@ -68,7 +67,6 @@ def measure_recovery_time(env: NeonCompare): (attach_gen, _) = attach_status client.tenant_delete(env.tenant) - wait_tenant_status_404(client, env.tenant, iterations=60, interval=0.5) env.env.pageserver.tenant_create(tenant_id=env.tenant, generation=attach_gen) # Measure recovery time diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 2782d33e15..8431840dc0 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -11,8 +11,6 @@ from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, StorageScrubb from 
fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.utils import ( assert_prefix_empty, - poll_for_remote_storage_iterations, - tenant_delete_wait_completed, wait_for_upload_queue_empty, ) from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage, s3_storage @@ -363,8 +361,7 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder): # Check that deletion works properly on a tenant that was live-migrated # (reproduce https://github.com/neondatabase/neon/issues/6802) - iterations = poll_for_remote_storage_iterations(remote_storage_kind) - tenant_delete_wait_completed(pageserver_b.http_client(), tenant_id, iterations) + pageserver_b.http_client().tenant_delete(tenant_id) def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder): @@ -552,7 +549,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ) log.info("Deleting tenant...") - tenant_delete_wait_completed(ps_attached.http_client(), tenant_id, 10) + ps_attached.http_client().tenant_delete(tenant_id) assert_prefix_empty( neon_env_builder.pageserver_remote_storage, diff --git a/test_runner/regress/test_s3_restore.py b/test_runner/regress/test_s3_restore.py index 6383d24c57..9992647e56 100644 --- a/test_runner/regress/test_s3_restore.py +++ b/test_runner/regress/test_s3_restore.py @@ -11,8 +11,6 @@ from fixtures.pageserver.utils import ( MANY_SMALL_LAYERS_TENANT_CONFIG, assert_prefix_empty, enable_remote_storage_versioning, - poll_for_remote_storage_iterations, - tenant_delete_wait_completed, wait_for_upload, ) from fixtures.remote_storage import RemoteStorageKind, s3_storage @@ -83,8 +81,7 @@ def test_tenant_s3_restore( assert ( ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 ), "tenant removed before we deletion was issued" - iterations = poll_for_remote_storage_iterations(remote_storage_kind) - tenant_delete_wait_completed(ps_http, tenant_id, iterations) + ps_http.tenant_delete(tenant_id) ps_http.deletion_queue_flush(execute=True) assert ( ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 0 diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index dffe5c89b9..d72377e33e 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -24,7 +24,6 @@ from fixtures.pageserver.utils import ( enable_remote_storage_versioning, list_prefix, remote_storage_delete_key, - tenant_delete_wait_completed, timeline_delete_wait_completed, ) from fixtures.pg_version import PgVersion @@ -158,7 +157,7 @@ def test_storage_controller_smoke( # Delete all the tenants for tid in tenant_ids: - tenant_delete_wait_completed(env.storage_controller.pageserver_api(), tid, 10) + env.storage_controller.pageserver_api().tenant_delete(tid) env.storage_controller.consistency_check() diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index fd3cc45c3f..a3316f2f45 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -1,17 +1,11 @@ -import concurrent.futures -import enum -import os -import shutil from threading import Thread import pytest from fixtures.common_types import Lsn, TenantId, TimelineId -from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, PgBin, StorageScrubber, - last_flush_lsn_upload, wait_for_last_flush_lsn, ) from fixtures.pageserver.http import PageserverApiException @@ 
-19,18 +13,33 @@ from fixtures.pageserver.utils import ( MANY_SMALL_LAYERS_TENANT_CONFIG, assert_prefix_empty, assert_prefix_not_empty, - poll_for_remote_storage_iterations, - tenant_delete_wait_completed, wait_for_upload, - wait_tenant_status_404, - wait_until_tenant_active, - wait_until_tenant_state, ) -from fixtures.remote_storage import RemoteStorageKind, available_s3_storages, s3_storage +from fixtures.remote_storage import RemoteStorageKind, s3_storage from fixtures.utils import run_pg_bench_small, wait_until from requests.exceptions import ReadTimeout +def error_tolerant_delete(ps_http, tenant_id): + """ + For tests that inject 500 errors, we must retry repeatedly when issuing deletions + """ + while True: + try: + ps_http.tenant_delete(tenant_id=tenant_id) + except PageserverApiException as e: + if e.status_code == 500: + # This test uses failure injection, which can produce 500s as the pageserver expects + # the object store to always be available, and the ListObjects during deletion is generally + # an infallible operation + assert "simulated failure of remote operation" in e.message + else: + raise + else: + # Success, drop out + break + + def test_tenant_delete_smoke( neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, @@ -59,21 +68,7 @@ def test_tenant_delete_smoke( # Check that deleting a non-existent tenant gives the expected result: this is a loop because we # may need to retry on some remote storage errors injected by the test harness - while True: - try: - ps_http.tenant_delete(tenant_id=tenant_id) - except PageserverApiException as e: - if e.status_code == 500: - # This test uses failure injection, which can produce 500s as the pageserver expects - # the object store to always be available, and the ListObjects during deletion is generally - # an infallible operation - assert "simulated failure of remote operation" in e.message - elif e.status_code == 404: - # This is our expected result: trying to erase a non-existent tenant gives us 404 - assert "NotFound" in e.message - break - else: - raise + error_tolerant_delete(ps_http, tenant_id) env.neon_cli.create_tenant( tenant_id=tenant_id, @@ -108,10 +103,8 @@ def test_tenant_delete_smoke( # Upload a heatmap so that we exercise deletion of that too ps_http.tenant_heatmap_upload(tenant_id) - iterations = poll_for_remote_storage_iterations(remote_storage_kind) - assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 2 - tenant_delete_wait_completed(ps_http, tenant_id, iterations) + error_tolerant_delete(ps_http, tenant_id) assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 tenant_path = env.pageserver.tenant_dir(tenant_id) @@ -129,286 +122,7 @@ def test_tenant_delete_smoke( # Deletion updates the tenant count: the one default tenant remains assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 - - -class Check(enum.Enum): - RETRY_WITHOUT_RESTART = enum.auto() - RETRY_WITH_RESTART = enum.auto() - - -FAILPOINTS = [ - "tenant-delete-before-shutdown", - "tenant-delete-before-create-remote-mark", - "tenant-delete-before-create-local-mark", - "tenant-delete-before-background", - "tenant-delete-before-polling-ongoing-deletions", - "tenant-delete-before-cleanup-remaining-fs-traces", - "tenant-delete-before-remove-timelines-dir", - "tenant-delete-before-remove-deleted-mark", - "tenant-delete-before-remove-tenant-dir", - # Some failpoints from timeline deletion - "timeline-delete-before-index-deleted-at", - 
"timeline-delete-before-rm", - "timeline-delete-before-index-delete", -] - -FAILPOINTS_BEFORE_BACKGROUND = [ - "timeline-delete-before-schedule", - "tenant-delete-before-shutdown", - "tenant-delete-before-create-remote-mark", - "tenant-delete-before-create-local-mark", - "tenant-delete-before-background", -] - - -def combinations(): - result = [] - - remotes = available_s3_storages() - - for remote_storage_kind in remotes: - for delete_failpoint in FAILPOINTS: - # Simulate failures for only one type of remote storage - # to avoid log pollution and make tests run faster - if remote_storage_kind is RemoteStorageKind.MOCK_S3: - simulate_failures = True - else: - simulate_failures = False - result.append((remote_storage_kind, delete_failpoint, simulate_failures)) - return result - - -@pytest.mark.parametrize("check", list(Check)) -@pytest.mark.parametrize("remote_storage_kind, failpoint, simulate_failures", combinations()) -def test_delete_tenant_exercise_crash_safety_failpoints( - neon_env_builder: NeonEnvBuilder, - remote_storage_kind: RemoteStorageKind, - failpoint: str, - simulate_failures: bool, - check: Check, - pg_bin: PgBin, -): - if simulate_failures: - neon_env_builder.pageserver_config_override = "test_remote_failures=1" - - neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) - - env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) - - tenant_id = env.initial_tenant - - env.pageserver.allowed_errors.extend( - [ - # From deletion polling - f".*NotFound: tenant {env.initial_tenant}.*", - # allow errors caused by failpoints - f".*failpoint: {failpoint}", - # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped - ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", - # We may leave some upload tasks in the queue. They're likely deletes. - # For uploads we explicitly wait with `last_flush_lsn_upload` below. - # So by ignoring these instead of waiting for empty upload queue - # we execute more distinct code paths. - '.*stopping left-over name="remote upload".*', - # an on-demand is cancelled by shutdown - ".*initial size calculation failed: downloading failed, possibly for shutdown", - ] - ) - - if simulate_failures: - env.pageserver.allowed_errors.append( - # The deletion queue will complain when it encounters simulated S3 errors - ".*deletion executor: DeleteObjects request failed.*", - ) - - ps_http = env.pageserver.http_client() - - timeline_id = env.neon_cli.create_timeline("delete", tenant_id=tenant_id) - with env.endpoints.create_start("delete", tenant_id=tenant_id) as endpoint: - # generate enough layers - run_pg_bench_small(pg_bin, endpoint.connstr()) - last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) - - assert_prefix_not_empty( - neon_env_builder.pageserver_remote_storage, - prefix="/".join( - ( - "tenants", - str(tenant_id), - ) - ), - ) - - ps_http.configure_failpoints((failpoint, "return")) - - iterations = poll_for_remote_storage_iterations(remote_storage_kind) - - # These failpoints are earlier than background task is spawned. - # so they result in api request failure. 
- if failpoint in FAILPOINTS_BEFORE_BACKGROUND: - with pytest.raises(PageserverApiException, match=failpoint): - ps_http.tenant_delete(tenant_id) - - else: - ps_http.tenant_delete(tenant_id) - tenant_info = wait_until_tenant_state( - pageserver_http=ps_http, - tenant_id=tenant_id, - expected_state="Broken", - iterations=iterations, - ) - - reason = tenant_info["state"]["data"]["reason"] - log.info(f"tenant broken: {reason}") - - # failpoint may not be the only error in the stack - assert reason.endswith(f"failpoint: {failpoint}"), reason - - if check is Check.RETRY_WITH_RESTART: - env.pageserver.restart() - - if failpoint in ( - "tenant-delete-before-shutdown", - "tenant-delete-before-create-remote-mark", - ): - wait_until_tenant_active( - ps_http, tenant_id=tenant_id, iterations=iterations, period=0.25 - ) - tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations) - else: - # Pageserver should've resumed deletion after restart. - wait_tenant_status_404(ps_http, tenant_id, iterations=iterations + 10) - elif check is Check.RETRY_WITHOUT_RESTART: - # this should succeed - # this also checks that delete can be retried even when tenant is in Broken state - ps_http.configure_failpoints((failpoint, "off")) - - tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations) - - tenant_dir = env.pageserver.tenant_dir(tenant_id) - # Check local is empty - assert not tenant_dir.exists() - - # Check remote is empty - assert_prefix_empty( - neon_env_builder.pageserver_remote_storage, - prefix="/".join( - ( - "tenants", - str(tenant_id), - ) - ), - allowed_postfix="initdb.tar.zst", - ) - - -def test_tenant_delete_is_resumed_on_attach( - neon_env_builder: NeonEnvBuilder, - pg_bin: PgBin, -): - remote_storage_kind = s3_storage() - neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) - - env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) - env.pageserver.allowed_errors.append( - # lucky race with stopping from flushing a layer we fail to schedule any uploads - ".*layer flush task.+: could not flush frozen layer: update_metadata_file" - ) - - tenant_id = env.initial_tenant - - ps_http = env.pageserver.http_client() - # create two timelines - for timeline in ["first", "second"]: - timeline_id = env.neon_cli.create_timeline(timeline, tenant_id=tenant_id) - with env.endpoints.create_start(timeline, tenant_id=tenant_id) as endpoint: - run_pg_bench_small(pg_bin, endpoint.connstr()) - wait_for_last_flush_lsn(env, endpoint, tenant=tenant_id, timeline=timeline_id) - - # sanity check, data should be there - assert_prefix_not_empty( - neon_env_builder.pageserver_remote_storage, - prefix="/".join( - ( - "tenants", - str(tenant_id), - ) - ), - ) - - # failpoint before we remove index_part from s3 - failpoint = "timeline-delete-before-index-delete" - ps_http.configure_failpoints((failpoint, "return")) - - env.pageserver.allowed_errors.extend( - ( - # allow errors caused by failpoints - f".*failpoint: {failpoint}", - # From deletion polling - f".*NotFound: tenant {env.initial_tenant}.*", - # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped - ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", - # error from http response is also logged - ".*InternalServerError\\(Tenant is marked as deleted on remote storage.*", - '.*shutdown_pageserver{exit_code=0}: stopping left-over name="remote upload".*', - ) - 
) - - iterations = poll_for_remote_storage_iterations(remote_storage_kind) - - ps_http.tenant_delete(tenant_id) - - tenant_info = wait_until_tenant_state( - pageserver_http=ps_http, - tenant_id=tenant_id, - expected_state="Broken", - iterations=iterations, - ) - - assert_prefix_not_empty( - neon_env_builder.pageserver_remote_storage, - prefix="/".join( - ( - "tenants", - str(tenant_id), - ) - ), - ) - - reason = tenant_info["state"]["data"]["reason"] - # failpoint may not be the only error in the stack - assert reason.endswith(f"failpoint: {failpoint}"), reason - - # now we stop pageserver and remove local tenant state - env.endpoints.stop_all() - env.pageserver.stop() - - dir_to_clear = env.pageserver.tenant_dir() - shutil.rmtree(dir_to_clear) - os.mkdir(dir_to_clear) - - env.pageserver.start() - - # now we call attach - env.pageserver.tenant_attach(tenant_id=tenant_id) - - # delete should be resumed - wait_tenant_status_404(ps_http, tenant_id, iterations) - - # we shouldn've created tenant dir on disk - tenant_path = env.pageserver.tenant_dir(tenant_id) - assert not tenant_path.exists() - - ps_http.deletion_queue_flush(execute=True) - assert_prefix_empty( - neon_env_builder.pageserver_remote_storage, - prefix="/".join( - ( - "tenants", - str(tenant_id), - ) - ), - ) + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "inprogress"}) == 0 def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonEnvBuilder): @@ -483,105 +197,6 @@ def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonE deletion.join() -def test_tenant_delete_concurrent( - neon_env_builder: NeonEnvBuilder, - pg_bin: PgBin, -): - """ - Validate that concurrent delete requests to the same tenant behave correctly: - exactly one should execute: the rest should give 202 responses but not start - another deletion. - - This is a reproducer for https://github.com/neondatabase/neon/issues/5936 - """ - neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3) - env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) - ps_http = env.pageserver.http_client() - tenant_id = env.initial_tenant - timeline_id = env.initial_timeline - - # Populate some data - with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: - run_pg_bench_small(pg_bin, endpoint.connstr()) - last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) - - env.pageserver.allowed_errors.extend( - [ - # lucky race with stopping from flushing a layer we fail to schedule any uploads - ".*layer flush task.+: could not flush frozen layer: update_metadata_file", - ] - ) - - BEFORE_REMOVE_FAILPOINT = "tenant-delete-before-map-remove" - BEFORE_RUN_FAILPOINT = "tenant-delete-before-run" - - # We will let the initial delete run until right before it would remove - # the tenant's TenantSlot. This pauses it in a state where the tenant - # is visible in Stopping state, and concurrent requests should fail with 4xx. 
- ps_http.configure_failpoints((BEFORE_REMOVE_FAILPOINT, "pause")) - - def delete_tenant(): - return ps_http.tenant_delete(tenant_id) - - def hit_remove_failpoint(): - return env.pageserver.assert_log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}")[1] - - def hit_run_failpoint(): - env.pageserver.assert_log_contains(f"at failpoint {BEFORE_RUN_FAILPOINT}") - - with concurrent.futures.ThreadPoolExecutor() as executor: - background_200_req = executor.submit(delete_tenant) - assert background_200_req.result(timeout=10).status_code == 202 - - # Wait until the first request completes its work and is blocked on removing - # the TenantSlot from tenant manager. - log_cursor = wait_until(100, 0.1, hit_remove_failpoint) - assert log_cursor is not None - - # Start another request: this should succeed without actually entering the deletion code - ps_http.tenant_delete(tenant_id) - assert not env.pageserver.log_contains( - f"at failpoint {BEFORE_RUN_FAILPOINT}", offset=log_cursor - ) - - # Start another background request, which will pause after acquiring a TenantSlotGuard - # but before completing. - ps_http.configure_failpoints((BEFORE_RUN_FAILPOINT, "pause")) - background_4xx_req = executor.submit(delete_tenant) - wait_until(100, 0.1, hit_run_failpoint) - - # The TenantSlot is still present while the original request is hung before - # final removal - assert ( - ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 - ) - - # Permit the original request to run to success - ps_http.configure_failpoints((BEFORE_REMOVE_FAILPOINT, "off")) - - # Permit the duplicate background request to run to completion and fail. - ps_http.configure_failpoints((BEFORE_RUN_FAILPOINT, "off")) - background_4xx_req.result(timeout=10) - assert not env.pageserver.log_contains( - f"at failpoint {BEFORE_RUN_FAILPOINT}", offset=log_cursor - ) - - # Physical deletion should have happened - assert_prefix_empty( - neon_env_builder.pageserver_remote_storage, - prefix="/".join( - ( - "tenants", - str(tenant_id), - ) - ), - ) - - # Zero tenants remain (we deleted the default tenant) - assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 0 - assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "inprogress"}) == 0 - - def test_tenant_delete_races_timeline_creation( neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, @@ -674,9 +289,7 @@ def test_tenant_delete_races_timeline_creation( # Disable the failpoint and wait for deletion to finish ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "off")) - iterations = poll_for_remote_storage_iterations(remote_storage_kind) - - tenant_delete_wait_completed(ps_http, tenant_id, iterations, ignore_errors=True) + ps_http.tenant_delete(tenant_id) # Physical deletion should have happened assert_prefix_empty( @@ -727,8 +340,7 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder) env.start() ps_http = env.pageserver.http_client() - iterations = poll_for_remote_storage_iterations(remote_storage_kind) - tenant_delete_wait_completed(ps_http, tenant_id, iterations) + ps_http.tenant_delete(tenant_id) env.stop() scrubber.scan_metadata() diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index be289e03d6..9fe732e288 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -15,7 +15,6 @@ from fixtures.pageserver.utils import ( assert_tenant_state, 
wait_for_last_record_lsn, wait_for_upload, - wait_tenant_status_404, ) from fixtures.remote_storage import ( LocalFsStorage, @@ -348,9 +347,6 @@ def test_tenant_relocation( # is no longer involved, and if it is, we will see the error origin_http.tenant_detach(tenant_id) - # Wait a little, so that the detach operation has time to finish. - wait_tenant_status_404(origin_http, tenant_id, iterations=100, interval=1) - post_migration_check(ep_main, 500500, old_local_path_main) post_migration_check(ep_second, 1001000, old_local_path_second) diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index a3dd422903..6c85ddebbc 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -15,7 +15,6 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient from fixtures.pageserver.utils import ( - tenant_delete_wait_completed, timeline_delete_wait_completed, wait_until_tenant_active, ) @@ -669,7 +668,7 @@ def test_synthetic_size_while_deleting(neon_env_builder: NeonEnvBuilder): ), ) - tenant_delete_wait_completed(client, env.initial_tenant, 10) + client.tenant_delete(env.initial_tenant) client.configure_failpoints((failpoint, "off")) diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index f0b2f7d733..606ce203cd 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -14,7 +14,7 @@ from fixtures.neon_fixtures import ( wait_for_last_flush_lsn, ) from fixtures.pageserver.http import HistoricLayerInfo, PageserverApiException -from fixtures.pageserver.utils import wait_tenant_status_404, wait_timeline_detail_404 +from fixtures.pageserver.utils import wait_timeline_detail_404 from fixtures.remote_storage import LocalFsStorage from fixtures.utils import assert_pageserver_backups_equal @@ -578,7 +578,6 @@ def test_timeline_ancestor_errors(neon_env_builder: NeonEnvBuilder): assert info.value.status_code == 400 client.tenant_delete(env.initial_tenant) - wait_tenant_status_404(client, env.initial_tenant, 10, 1) with pytest.raises(PageserverApiException) as e: client.detach_ancestor(env.initial_tenant, first_branch) diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index db5297870e..3110833563 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -26,7 +26,6 @@ from fixtures.pageserver.utils import ( assert_tenant_state, timeline_delete_wait_completed, wait_for_upload_queue_empty, - wait_tenant_status_404, wait_until_tenant_active, ) from fixtures.pg_version import PgVersion @@ -864,39 +863,33 @@ def delete_lazy_activating( ): pageserver_http = pageserver.http_client() - # Deletion itself won't complete due to our failpoint: Tenant::shutdown can't complete while calculating - # logical size is paused in a failpoint. 
So instead we will use a log observation to check that - # on-demand activation was triggered by the tenant deletion - log_match = f".*attach{{tenant_id={delete_tenant_id} shard_id=0000 gen=[0-9a-f]+}}: Activating tenant \\(on-demand\\).*" - if expect_attaching: assert pageserver_http.tenant_status(delete_tenant_id)["state"]["slug"] == "Attaching" with concurrent.futures.ThreadPoolExecutor() as executor: log.info("Starting background delete") - def activated_on_demand(): - assert pageserver.log_contains(log_match) is not None + def shutting_down(): + assert pageserver.log_contains(".*Waiting for timelines.*") is not None def delete_tenant(): pageserver_http.tenant_delete(delete_tenant_id) background_delete = executor.submit(delete_tenant) - log.info(f"Waiting for activation message '{log_match}'") + # We expect deletion to enter shutdown of the tenant even though it's in the attaching state try: - wait_until(10, 1, activated_on_demand) + # Deletion will get to the point in shutdown where it's waiting for timeline shutdown, then + # hang because of our failpoint blocking activation. + wait_until(10, 1, shutting_down) finally: log.info("Clearing failpoint") pageserver_http.configure_failpoints(("timeline-calculate-logical-size-pause", "off")) - # Deletion should complete successfully now that failpoint is unblocked + # Deletion should complete successfully now that failpoint is unblocked and shutdown can complete log.info("Joining background delete") background_delete.result(timeout=10) - # Poll for deletion to complete - wait_tenant_status_404(pageserver_http, tenant_id=delete_tenant_id, iterations=40) - def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder): """ From ee3081863e05c9a1f3b9cf6614a62c08a6b3fb95 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 21 Jun 2024 17:13:51 +0100 Subject: [PATCH 1023/1571] storcon: implement endpoints for cancellation of drain and fill operations (#8029) ## Problem There's no way to cancel drain and fill operations. ## Summary of changes Implement HTTP endpoints to allow cancelling of background operations. When the operationis cancelled successfully, the node scheduling policy will revert to `Active`. 
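For illustration, a minimal sketch of how a caller might drive the new cancellation endpoints, assuming a `reqwest`-based admin client and a bearer token carrying the admin scope; the paths and the 202 Accepted response follow the handlers added below, everything else here is illustrative and not part of this change:

```rust
// Illustrative client for the new DELETE endpoint; assumes an async context,
// the `reqwest` and `anyhow` crates, and a JWT with admin scope (the handlers
// below call check_permissions(.., Scope::Admin)).
async fn cancel_drain(base_url: &str, admin_jwt: &str, node_id: u64) -> anyhow::Result<()> {
    let url = format!("{base_url}/control/v1/node/{node_id}/drain");
    let resp = reqwest::Client::new()
        .delete(url)
        .bearer_auth(admin_jwt)
        .send()
        .await?;
    // 202 means the cancellation was accepted; the drain task observes the
    // cancelled token asynchronously and resets the node policy to Active.
    anyhow::ensure!(
        resp.status() == reqwest::StatusCode::ACCEPTED,
        "unexpected status: {}",
        resp.status()
    );
    Ok(())
}
```

Cancelling a fill is the same call against `/control/v1/node/{node_id}/fill`; the Python fixtures added below (`cancel_node_drain` / `cancel_node_fill`) issue the equivalent DELETE requests from the test suite.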
--- storage_controller/src/http.rs | 36 ++++ storage_controller/src/service.rs | 156 +++++++++++++++++- test_runner/fixtures/neon_fixtures.py | 16 ++ .../regress/test_storage_controller.py | 130 ++++++++++----- 4 files changed, 290 insertions(+), 48 deletions(-) diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 3e9951fb9e..680e6f09c4 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -502,6 +502,17 @@ async fn handle_node_drain(req: Request) -> Result, ApiErro json_response(StatusCode::ACCEPTED, ()) } +async fn handle_cancel_node_drain(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + let node_id: NodeId = parse_request_param(&req, "node_id")?; + + state.service.cancel_node_drain(node_id).await?; + + json_response(StatusCode::ACCEPTED, ()) +} + async fn handle_node_fill(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; @@ -513,6 +524,17 @@ async fn handle_node_fill(req: Request) -> Result, ApiError json_response(StatusCode::ACCEPTED, ()) } +async fn handle_cancel_node_fill(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + let node_id: NodeId = parse_request_param(&req, "node_id")?; + + state.service.cancel_node_fill(node_id).await?; + + json_response(StatusCode::ACCEPTED, ()) +} + async fn handle_tenant_shard_split( service: Arc, mut req: Request, @@ -871,9 +893,23 @@ pub fn make_router( .put("/control/v1/node/:node_id/drain", |r| { named_request_span(r, handle_node_drain, RequestName("control_v1_node_drain")) }) + .delete("/control/v1/node/:node_id/drain", |r| { + named_request_span( + r, + handle_cancel_node_drain, + RequestName("control_v1_cancel_node_drain"), + ) + }) .put("/control/v1/node/:node_id/fill", |r| { named_request_span(r, handle_node_fill, RequestName("control_v1_node_fill")) }) + .delete("/control/v1/node/:node_id/fill", |r| { + named_request_span( + r, + handle_cancel_node_fill, + RequestName("control_v1_cancel_node_fill"), + ) + }) // TODO(vlad): endpoint for cancelling drain and fill // Tenant Shard operations .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| { diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 792f68cc5a..752fb2c161 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -4541,7 +4541,8 @@ impl Service { self.node_configure(node_id, None, Some(NodeSchedulingPolicy::Draining)) .await?; - let cancel = CancellationToken::new(); + let cancel = self.cancel.child_token(); + let gate_guard = self.gate.enter().map_err(|_| ApiError::ShuttingDown)?; self.inner.write().unwrap().ongoing_operation = Some(OperationHandler { operation: Operation::Drain(Drain { node_id }), @@ -4552,6 +4553,8 @@ impl Service { let service = self.clone(); let cancel = cancel.clone(); async move { + let _gate_guard = gate_guard; + scopeguard::defer! 
{ let prev = service.inner.write().unwrap().ongoing_operation.take(); @@ -4593,6 +4596,44 @@ impl Service { Ok(()) } + pub(crate) async fn cancel_node_drain(&self, node_id: NodeId) -> Result<(), ApiError> { + let (node_available, node_policy) = { + let locked = self.inner.read().unwrap(); + let nodes = &locked.nodes; + let node = nodes.get(&node_id).ok_or(ApiError::NotFound( + anyhow::anyhow!("Node {} not registered", node_id).into(), + ))?; + + (node.is_available(), node.get_scheduling()) + }; + + if !node_available { + return Err(ApiError::ResourceUnavailable( + format!("Node {node_id} is currently unavailable").into(), + )); + } + + if !matches!(node_policy, NodeSchedulingPolicy::Draining) { + return Err(ApiError::PreconditionFailed( + format!("Node {node_id} has no drain in progress").into(), + )); + } + + if let Some(op_handler) = self.inner.read().unwrap().ongoing_operation.as_ref() { + if let Operation::Drain(drain) = op_handler.operation { + if drain.node_id == node_id { + tracing::info!("Cancelling background drain operation for node {node_id}"); + op_handler.cancel.cancel(); + return Ok(()); + } + } + } + + Err(ApiError::PreconditionFailed( + format!("Node {node_id} has no drain in progress").into(), + )) + } + pub(crate) async fn start_node_fill(self: &Arc, node_id: NodeId) -> Result<(), ApiError> { let (ongoing_op, node_available, node_policy, total_nodes_count) = { let locked = self.inner.read().unwrap(); @@ -4635,7 +4676,8 @@ impl Service { self.node_configure(node_id, None, Some(NodeSchedulingPolicy::Filling)) .await?; - let cancel = CancellationToken::new(); + let cancel = self.cancel.child_token(); + let gate_guard = self.gate.enter().map_err(|_| ApiError::ShuttingDown)?; self.inner.write().unwrap().ongoing_operation = Some(OperationHandler { operation: Operation::Fill(Fill { node_id }), @@ -4646,6 +4688,8 @@ impl Service { let service = self.clone(); let cancel = cancel.clone(); async move { + let _gate_guard = gate_guard; + scopeguard::defer! { let prev = service.inner.write().unwrap().ongoing_operation.take(); @@ -4687,6 +4731,44 @@ impl Service { Ok(()) } + pub(crate) async fn cancel_node_fill(&self, node_id: NodeId) -> Result<(), ApiError> { + let (node_available, node_policy) = { + let locked = self.inner.read().unwrap(); + let nodes = &locked.nodes; + let node = nodes.get(&node_id).ok_or(ApiError::NotFound( + anyhow::anyhow!("Node {} not registered", node_id).into(), + ))?; + + (node.is_available(), node.get_scheduling()) + }; + + if !node_available { + return Err(ApiError::ResourceUnavailable( + format!("Node {node_id} is currently unavailable").into(), + )); + } + + if !matches!(node_policy, NodeSchedulingPolicy::Filling) { + return Err(ApiError::PreconditionFailed( + format!("Node {node_id} has no fill in progress").into(), + )); + } + + if let Some(op_handler) = self.inner.read().unwrap().ongoing_operation.as_ref() { + if let Operation::Fill(fill) = op_handler.operation { + if fill.node_id == node_id { + tracing::info!("Cancelling background drain operation for node {node_id}"); + op_handler.cancel.cancel(); + return Ok(()); + } + } + } + + Err(ApiError::PreconditionFailed( + format!("Node {node_id} has no fill in progress").into(), + )) + } + /// Helper for methods that will try and call pageserver APIs for /// a tenant, such as timeline CRUD: they cannot proceed unless the tenant /// is attached somewhere. 
@@ -5286,7 +5368,21 @@ impl Service { while !inspected_all_shards { if cancel.is_cancelled() { - return Err(OperationError::Cancelled); + match self + .node_configure(node_id, None, Some(NodeSchedulingPolicy::Active)) + .await + { + Ok(()) => return Err(OperationError::Cancelled), + Err(err) => { + return Err(OperationError::FinalizeError( + format!( + "Failed to finalise drain cancel of {} by setting scheduling policy to Active: {}", + node_id, err + ) + .into(), + )); + } + } } { @@ -5356,9 +5452,29 @@ impl Service { waiters = self .await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT) .await; + + failpoint_support::sleep_millis_async!("sleepy-drain-loop"); } while !waiters.is_empty() { + if cancel.is_cancelled() { + match self + .node_configure(node_id, None, Some(NodeSchedulingPolicy::Active)) + .await + { + Ok(()) => return Err(OperationError::Cancelled), + Err(err) => { + return Err(OperationError::FinalizeError( + format!( + "Failed to finalise drain cancel of {} by setting scheduling policy to Active: {}", + node_id, err + ) + .into(), + )); + } + } + } + tracing::info!("Awaiting {} pending drain reconciliations", waiters.len()); waiters = self @@ -5495,7 +5611,21 @@ impl Service { // we validate to ensure that it has not gone stale in the meantime. while !tids_to_promote.is_empty() { if cancel.is_cancelled() { - return Err(OperationError::Cancelled); + match self + .node_configure(node_id, None, Some(NodeSchedulingPolicy::Active)) + .await + { + Ok(()) => return Err(OperationError::Cancelled), + Err(err) => { + return Err(OperationError::FinalizeError( + format!( + "Failed to finalise drain cancel of {} by setting scheduling policy to Active: {}", + node_id, err + ) + .into(), + )); + } + } } { @@ -5563,6 +5693,24 @@ impl Service { } while !waiters.is_empty() { + if cancel.is_cancelled() { + match self + .node_configure(node_id, None, Some(NodeSchedulingPolicy::Active)) + .await + { + Ok(()) => return Err(OperationError::Cancelled), + Err(err) => { + return Err(OperationError::FinalizeError( + format!( + "Failed to finalise drain cancel of {} by setting scheduling policy to Active: {}", + node_id, err + ) + .into(), + )); + } + } + } + tracing::info!("Awaiting {} pending fill reconciliations", waiters.len()); waiters = self diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 4ff1705ca4..b624c84fad 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2249,6 +2249,14 @@ class NeonStorageController(MetricsGetter, LogUtils): headers=self.headers(TokenScope.ADMIN), ) + def cancel_node_drain(self, node_id): + log.info(f"cancel_node_drain({node_id})") + self.request( + "DELETE", + f"{self.env.storage_controller_api}/control/v1/node/{node_id}/drain", + headers=self.headers(TokenScope.ADMIN), + ) + def node_fill(self, node_id): log.info(f"node_fill({node_id})") self.request( @@ -2257,6 +2265,14 @@ class NeonStorageController(MetricsGetter, LogUtils): headers=self.headers(TokenScope.ADMIN), ) + def cancel_node_fill(self, node_id): + log.info(f"cancel_node_fill({node_id})") + self.request( + "DELETE", + f"{self.env.storage_controller_api}/control/v1/node/{node_id}/fill", + headers=self.headers(TokenScope.ADMIN), + ) + def node_status(self, node_id): response = self.request( "GET", diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index d72377e33e..9cc13ecfdb 100644 --- a/test_runner/regress/test_storage_controller.py +++ 
b/test_runner/regress/test_storage_controller.py @@ -1518,6 +1518,49 @@ def test_tenant_import(neon_env_builder: NeonEnvBuilder, shard_count, remote_sto workload.validate() +def retryable_node_operation(op, ps_id, max_attempts, backoff): + while max_attempts > 0: + try: + op(ps_id) + return + except StorageControllerApiException as e: + max_attempts -= 1 + log.info(f"Operation failed ({max_attempts} attempts left): {e}") + + if max_attempts == 0: + raise e + + time.sleep(backoff) + + +def poll_node_status(env, node_id, desired_scheduling_policy, max_attempts, backoff): + log.info(f"Polling {node_id} for {desired_scheduling_policy} scheduling policy") + while max_attempts > 0: + try: + status = env.storage_controller.node_status(node_id) + policy = status["scheduling"] + if policy == desired_scheduling_policy: + return + else: + max_attempts -= 1 + log.info(f"Status call returned {policy=} ({max_attempts} attempts left)") + + if max_attempts == 0: + raise AssertionError( + f"Status for {node_id=} did not reach {desired_scheduling_policy=}" + ) + + time.sleep(backoff) + except StorageControllerApiException as e: + max_attempts -= 1 + log.info(f"Status call failed ({max_attempts} retries left): {e}") + + if max_attempts == 0: + raise e + + time.sleep(backoff) + + def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): """ Graceful reststart of storage controller clusters use the drain and @@ -1546,47 +1589,6 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): nodes = env.storage_controller.node_list() assert len(nodes) == 2 - def retryable_node_operation(op, ps_id, max_attempts, backoff): - while max_attempts > 0: - try: - op(ps_id) - return - except StorageControllerApiException as e: - max_attempts -= 1 - log.info(f"Operation failed ({max_attempts} attempts left): {e}") - - if max_attempts == 0: - raise e - - time.sleep(backoff) - - def poll_node_status(node_id, desired_scheduling_policy, max_attempts, backoff): - log.info(f"Polling {node_id} for {desired_scheduling_policy} scheduling policy") - while max_attempts > 0: - try: - status = env.storage_controller.node_status(node_id) - policy = status["scheduling"] - if policy == desired_scheduling_policy: - return - else: - max_attempts -= 1 - log.info(f"Status call returned {policy=} ({max_attempts} attempts left)") - - if max_attempts == 0: - raise AssertionError( - f"Status for {node_id=} did not reach {desired_scheduling_policy=}" - ) - - time.sleep(backoff) - except StorageControllerApiException as e: - max_attempts -= 1 - log.info(f"Status call failed ({max_attempts} retries left): {e}") - - if max_attempts == 0: - raise e - - time.sleep(backoff) - def assert_shard_counts_balanced(env: NeonEnv, shard_counts, total_shards): # Assert that all nodes have some attached shards assert len(shard_counts) == len(env.pageservers) @@ -1602,7 +1604,7 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): retryable_node_operation( lambda ps_id: env.storage_controller.node_drain(ps_id), ps.id, max_attempts=3, backoff=2 ) - poll_node_status(ps.id, "PauseForRestart", max_attempts=6, backoff=5) + poll_node_status(env, ps.id, "PauseForRestart", max_attempts=6, backoff=5) shard_counts = get_node_shard_counts(env, tenant_ids) log.info(f"Shard counts after draining node {ps.id}: {shard_counts}") @@ -1612,12 +1614,12 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): assert sum(shard_counts.values()) == total_shards ps.restart() - poll_node_status(ps.id, "Active", max_attempts=10, 
backoff=1) + poll_node_status(env, ps.id, "Active", max_attempts=10, backoff=1) retryable_node_operation( lambda ps_id: env.storage_controller.node_fill(ps_id), ps.id, max_attempts=3, backoff=2 ) - poll_node_status(ps.id, "Active", max_attempts=6, backoff=5) + poll_node_status(env, ps.id, "Active", max_attempts=6, backoff=5) shard_counts = get_node_shard_counts(env, tenant_ids) log.info(f"Shard counts after filling node {ps.id}: {shard_counts}") @@ -1627,3 +1629,43 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): shard_counts = get_node_shard_counts(env, tenant_ids) log.info(f"Shard counts after rolling restart: {shard_counts}") assert_shard_counts_balanced(env, shard_counts, total_shards) + + +def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + env.start() + + tenant_count = 5 + shard_count_per_tenant = 8 + tenant_ids = [] + + for _ in range(0, tenant_count): + tid = TenantId.generate() + tenant_ids.append(tid) + env.neon_cli.create_tenant( + tid, placement_policy='{"Attached":1}', shard_count=shard_count_per_tenant + ) + + # See sleep comment in the test above. + time.sleep(2) + + nodes = env.storage_controller.node_list() + assert len(nodes) == 2 + + env.storage_controller.configure_failpoints(("sleepy-drain-loop", "return(2000)")) + + ps_id_to_drain = env.pageservers[0].id + + retryable_node_operation( + lambda ps_id: env.storage_controller.node_drain(ps_id), + ps_id_to_drain, + max_attempts=3, + backoff=2, + ) + + poll_node_status(env, ps_id_to_drain, "Draining", max_attempts=6, backoff=2) + + env.storage_controller.cancel_node_drain(ps_id_to_drain) + + poll_node_status(env, ps_id_to_drain, "Active", max_attempts=6, backoff=2) From b74232eb4d36ad16750a938c069da0dbfffed3ce Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 21 Jun 2024 18:23:31 +0100 Subject: [PATCH 1024/1571] tests: allow-list neon_local endpoint errors from storage controller (#8123) ## Problem For testing, the storage controller has a built-in hack that loads neon_local endpoint config from disk, and uses it to reconfigure endpoints when the attached pageserver changes. Some tests that stop an endpoint while the storage controller is running could occasionally fail on log errors from the controller trying to use its special test-mode calls into neon local Endpoint. Example: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8117/9592392425/index.html#/testresult/9d2bb8623d0d53f8 ## Summary of changes - Give NotifyError an explicit NeonLocal variant, to avoid munging these into generic 500s (I don't want to ignore 500s in general) - Allow-list errors related to the local notification hook. The expectation is that tests using endpoints/workloads should be independently checking that those endpoints work: if neon_local generates an error inside the storage controller, that's ignorable. 
--- storage_controller/src/compute_hook.rs | 10 +++++++--- test_runner/fixtures/pageserver/allowed_errors.py | 5 +++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index a1d051f150..4d0f8006aa 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -146,6 +146,9 @@ pub(crate) enum NotifyError { // A response indicates we will never succeed, such as 400 or 404 #[error("Non-retryable error {0}")] Fatal(StatusCode), + + #[error("neon_local error: {0}")] + NeonLocal(anyhow::Error), } enum MaybeSendResult { @@ -278,7 +281,7 @@ impl ComputeHook { async fn do_notify_local( &self, reconfigure_request: &ComputeHookNotifyRequest, - ) -> anyhow::Result<()> { + ) -> Result<(), NotifyError> { // neon_local updates are not safe to call concurrently, use a lock to serialize // all calls to this function let _locked = self.neon_local_lock.lock().await; @@ -321,7 +324,8 @@ impl ComputeHook { tracing::info!("Reconfiguring endpoint {}", endpoint_name,); endpoint .reconfigure(compute_pageservers.clone(), *stripe_size) - .await?; + .await + .map_err(NotifyError::NeonLocal)?; } } @@ -510,7 +514,7 @@ impl ComputeHook { } else { self.do_notify_local(&request).await.map_err(|e| { // This path is for testing only, so munge the error into our prod-style error type. - tracing::error!("Local notification hook failed: {e}"); + tracing::error!("neon_local notification hook failed: {e}"); NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR) }) }; diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index 147d5705d3..c5b09e3608 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -106,6 +106,11 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [ ".*startup_reconcile: Could not scan node.*", # Tests run in dev mode ".*Starting in dev mode.*", + # Tests that stop endpoints & use the storage controller's neon_local notification + # mechanism might fail (neon_local's stopping and endpoint isn't atomic wrt the storage + # controller's attempts to notify the endpoint). 
+ ".*reconciler.*neon_local notification hook failed.*", + ".*reconciler.*neon_local error.*", ] From 8776089c70394b6fd6c0ee607542dd3e5c120333 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Fri, 21 Jun 2024 13:51:07 +0100 Subject: [PATCH 1025/1571] Remove kq_imcx extension support per customer request neondatabase/cloud#13648 --- Dockerfile.compute-node | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 3a73ac71b0..7ab685625a 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -467,31 +467,6 @@ RUN case "${PG_VERSION}" in \ make install -j $(getconf _NPROCESSORS_ONLN) && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_hint_plan.control -######################################################################################### -# -# Layer "kq-imcx-pg-build" -# compile kq_imcx extension -# -######################################################################################### -FROM build-deps AS kq-imcx-pg-build -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ - -ENV PATH "/usr/local/pgsql/bin/:$PATH" -RUN apt-get update && \ - apt-get install -y git libgtk2.0-dev libpq-dev libpam-dev libxslt-dev libkrb5-dev cmake && \ - wget https://github.com/ketteq-neon/postgres-exts/archive/e0bd1a9d9313d7120c1b9c7bb15c48c0dede4c4e.tar.gz -O kq_imcx.tar.gz && \ - echo "dc93a97ff32d152d32737ba7e196d9687041cda15e58ab31344c2f2de8855336 kq_imcx.tar.gz" | sha256sum --check && \ - mkdir kq_imcx-src && cd kq_imcx-src && tar xzf ../kq_imcx.tar.gz --strip-components=1 -C . && \ - find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ - mkdir build && cd build && \ - cmake -DCMAKE_BUILD_TYPE=Release .. && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control && \ - find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\ - mkdir -p /extensions/kq_imcx && cp /usr/local/pgsql/share/extension/kq_imcx.control /extensions/kq_imcx && \ - sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \ - comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/kq_imcx.tar.zst -T - ######################################################################################### # @@ -840,7 +815,6 @@ COPY --from=hll-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=plpgsql-check-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=kq-imcx-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/ @@ -961,7 +935,6 @@ COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src #COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src COPY patches/pg_hintplan.patch /ext-src -#COPY --from=kq-imcx-pg-build /kq_imcx.tar.gz /ext-src COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src COPY patches/pg_cron.patch /ext-src #COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src From 8fe3f17c470ff6dfa9a8abc2dc9fd15b5e4a14a0 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Sat, 22 Jun 2024 
15:20:58 +0100 Subject: [PATCH 1026/1571] storcon: improve drain and fill shard placement (#8119) ## Problem While adapting the storage controller scale test to do graceful rolling restarts via drain and fill, I noticed that secondaries are also being rescheduled, which, in turn, caused the storage controller to optimise attachments. ## Summary of changes * Introduce a transactional looking rescheduling primitive (i.e. "try to schedule to this secondary, but leave everything as is if you can't") * Use it for the drain and fill stages to avoid calling into `Scheduler::schedule` and having secondaries move around. --- storage_controller/src/service.rs | 53 +++++++++++++------------- storage_controller/src/tenant_shard.rs | 42 ++++++++++++++++++++ 2 files changed, 68 insertions(+), 27 deletions(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 752fb2c161..388e0eadc8 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -5364,7 +5364,6 @@ impl Service { let mut last_inspected_shard: Option = None; let mut inspected_all_shards = false; let mut waiters = Vec::new(); - let mut schedule_context = ScheduleContext::default(); while !inspected_all_shards { if cancel.is_cancelled() { @@ -5419,28 +5418,32 @@ impl Service { } }; - if tenant_shard.intent.demote_attached(scheduler, node_id) { - match tenant_shard.schedule(scheduler, &mut schedule_context) { - Err(e) => { - tracing::warn!( - tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(), - "Scheduling error when draining pageserver {} : {e}", node_id - ); - } - Ok(()) => { - let scheduled_to = tenant_shard.intent.get_attached(); - tracing::info!( - tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(), - "Rescheduled shard while draining node {}: {} -> {:?}", - node_id, - node_id, - scheduled_to - ); + // If the shard is not attached to the node being drained, skip it. + if *tenant_shard.intent.get_attached() != Some(node_id) { + last_inspected_shard = Some(*tid); + continue; + } - let waiter = self.maybe_reconcile_shard(tenant_shard, nodes); - if let Some(some) = waiter { - waiters.push(some); - } + match tenant_shard.reschedule_to_secondary(None, scheduler) { + Err(e) => { + tracing::warn!( + tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(), + "Scheduling error when draining pageserver {} : {e}", node_id + ); + } + Ok(()) => { + let scheduled_to = tenant_shard.intent.get_attached(); + tracing::info!( + tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(), + "Rescheduled shard while draining node {}: {} -> {:?}", + node_id, + node_id, + scheduled_to + ); + + let waiter = self.maybe_reconcile_shard(tenant_shard, nodes); + if let Some(some) = waiter { + waiters.push(some); } } } @@ -5603,9 +5606,7 @@ impl Service { // secondaries are warm. This is not always true (e.g. we just migrated the // tenant). Take that into consideration by checking the secondary status. let mut tids_to_promote = self.fill_node_plan(node_id); - let mut waiters = Vec::new(); - let mut schedule_context = ScheduleContext::default(); // Execute the plan we've composed above. Before aplying each move from the plan, // we validate to ensure that it has not gone stale in the meantime. 
@@ -5655,9 +5656,7 @@ impl Service { } let previously_attached_to = *tenant_shard.intent.get_attached(); - - tenant_shard.intent.promote_attached(scheduler, node_id); - match tenant_shard.schedule(scheduler, &mut schedule_context) { + match tenant_shard.reschedule_to_secondary(Some(node_id), scheduler) { Err(e) => { tracing::warn!( tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(), diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 840bcbb81d..45295bc59b 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -646,6 +646,48 @@ impl TenantShard { Ok(()) } + /// Reschedule this tenant shard to one of its secondary locations. Returns a scheduling error + /// if the swap is not possible and leaves the intent state in its original state. + /// + /// Arguments: + /// `attached_to`: the currently attached location matching the intent state (may be None if the + /// shard is not attached) + /// `promote_to`: an optional secondary location of this tenant shard. If set to None, we ask + /// the scheduler to recommend a node + pub(crate) fn reschedule_to_secondary( + &mut self, + promote_to: Option, + scheduler: &mut Scheduler, + ) -> Result<(), ScheduleError> { + let promote_to = match promote_to { + Some(node) => node, + None => match scheduler.node_preferred(self.intent.get_secondary()) { + Some(node) => node, + None => { + return Err(ScheduleError::ImpossibleConstraint); + } + }, + }; + + assert!(self.intent.get_secondary().contains(&promote_to)); + + if let Some(node) = self.intent.get_attached() { + let demoted = self.intent.demote_attached(scheduler, *node); + if !demoted { + return Err(ScheduleError::ImpossibleConstraint); + } + } + + self.intent.promote_attached(scheduler, promote_to); + + // Increment the sequence number for the edge case where a + // reconciler is already running to avoid waiting on the + // current reconcile instead of spawning a new one. + self.sequence = self.sequence.next(); + + Ok(()) + } + /// Optimize attachments: if a shard has a secondary location that is preferable to /// its primary location based on soft constraints, switch that secondary location /// to be attached. From 75747cdbffeb0b6d2a2a311584368de68cd9aadc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Sat, 22 Jun 2024 19:57:09 +0200 Subject: [PATCH 1027/1571] Use serde for RemoteStorageConfig parsing (#8126) Adds a `Deserialize` impl to `RemoteStorageConfig`. We thus achieve the same as #7743 but with less repetitive code, by deriving `Deserialize` impls on `S3Config`, `AzureConfig`, and `RemoteStorageConfig`. The disadvantage is less useful error messages. The git history of this PR contains a state where we go via an intermediate representation, leveraging the `serde_json` crate, without it ever being actual json though. Also, the PR adds deserialization tests. Alternative to #7743 . 
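To make the error-message tradeoff concrete, a small self-contained sketch of the untagged-enum dispatch (stand-in types, not the real `RemoteStorageKind`): variant selection is driven purely by which keys are present, and a document that matches no variant yields a generic serde error instead of naming the offending key.

```rust
// Simplified stand-ins for RemoteStorageKind; requires serde (derive) and
// toml_edit with its "serde" feature, mirroring the `from_toml` path in the
// diff below.
use serde::Deserialize;

#[derive(Deserialize, Debug)]
#[serde(untagged)]
enum Kind {
    LocalFs { local_path: String },
    AwsS3 { bucket_name: String, bucket_region: String },
}

fn main() {
    // `local_path` is present, so the LocalFs variant is chosen.
    let doc: toml_edit::Document = "local_path = '.'".parse().unwrap();
    let ok: Kind = toml_edit::de::from_document(doc).unwrap();
    println!("{ok:?}");

    // Missing `bucket_region`: no variant matches, and the error reads
    // roughly "data did not match any variant of untagged enum Kind".
    let doc: toml_edit::Document = "bucket_name = 'foo-bar'".parse().unwrap();
    let err = toml_edit::de::from_document::<Kind>(doc).unwrap_err();
    println!("{err}");
}
```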
--- Cargo.lock | 5 + libs/remote_storage/Cargo.toml | 3 +- libs/remote_storage/src/lib.rs | 289 ++++++++++++++----------------- pageserver/src/config.rs | 2 +- pageserver/src/deletion_queue.rs | 4 +- pageserver/src/tenant.rs | 4 +- proxy/src/context/parquet.rs | 4 +- workspace_hack/Cargo.toml | 1 + 8 files changed, 144 insertions(+), 168 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cf8a0b3286..77bf012402 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1014,6 +1014,9 @@ name = "camino" version = "1.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c59e92b5a388f549b863a7bea62612c09f24c8393560709a54558a9abdfb3b9c" +dependencies = [ + "serde", +] [[package]] name = "camino-tempfile" @@ -4647,6 +4650,7 @@ dependencies = [ "futures-util", "http-types", "humantime", + "humantime-serde", "hyper 0.14.26", "itertools", "metrics", @@ -7367,6 +7371,7 @@ dependencies = [ "base64 0.21.1", "base64ct", "bytes", + "camino", "cc", "chrono", "clap", diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 78da01c9a0..23d82b90bd 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -14,8 +14,9 @@ aws-config.workspace = true aws-sdk-s3.workspace = true aws-credential-types.workspace = true bytes.workspace = true -camino.workspace = true +camino = { workspace = true, features = ["serde1"] } humantime.workspace = true +humantime-serde.workspace = true hyper = { workspace = true, features = ["stream"] } futures.workspace = true rand.workspace = true diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 72748e156c..e39ac581c7 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -36,7 +36,6 @@ use futures::stream::Stream; use serde::{Deserialize, Serialize}; use tokio::sync::Semaphore; use tokio_util::sync::CancellationToken; -use toml_edit::Item; use tracing::info; pub use self::{ @@ -451,7 +450,7 @@ impl GenericRemoteStorage { pub fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result { let timeout = storage_config.timeout; Ok(match &storage_config.storage { - RemoteStorageKind::LocalFs(path) => { + RemoteStorageKind::LocalFs { local_path: path } => { info!("Using fs root '{path}' as a remote storage"); Self::LocalFs(LocalFs::new(path.clone(), timeout)?) } @@ -527,21 +526,28 @@ impl From<[(&str, &str); N]> for StorageMetadata { } /// External backup storage configuration, enough for creating a client for that storage. -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone, PartialEq, Eq, Deserialize)] pub struct RemoteStorageConfig { /// The storage connection configuration. + #[serde(flatten)] pub storage: RemoteStorageKind, /// A common timeout enforced for all requests after concurrency limiter permit has been /// acquired. + #[serde(with = "humantime_serde", default = "default_timeout")] pub timeout: Duration, } +fn default_timeout() -> Duration { + RemoteStorageConfig::DEFAULT_TIMEOUT +} + /// A kind of a remote storage to connect to, with its connection configuration. -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone, PartialEq, Eq, Deserialize)] +#[serde(untagged)] pub enum RemoteStorageKind { /// Storage based on local file system. /// Specify a root folder to place all stored files into. 
- LocalFs(Utf8PathBuf), + LocalFs { local_path: Utf8PathBuf }, /// AWS S3 based storage, storing all files in the S3 bucket /// specified by the config AwsS3(S3Config), @@ -551,7 +557,7 @@ pub enum RemoteStorageKind { } /// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write). -#[derive(Clone, PartialEq, Eq)] +#[derive(Clone, PartialEq, Eq, serde::Deserialize)] pub struct S3Config { /// Name of the bucket to connect to. pub bucket_name: String, @@ -568,11 +574,24 @@ pub struct S3Config { pub endpoint: Option, /// AWS S3 has various limits on its API calls, we need not to exceed those. /// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details. + #[serde(default = "default_remote_storage_s3_concurrency_limit")] pub concurrency_limit: NonZeroUsize, + #[serde(default = "default_max_keys_per_list_response")] pub max_keys_per_list_response: Option, + #[serde(deserialize_with = "deserialize_storage_class", default)] pub upload_storage_class: Option, } +fn default_remote_storage_s3_concurrency_limit() -> NonZeroUsize { + DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT + .try_into() + .unwrap() +} + +fn default_max_keys_per_list_response() -> Option { + DEFAULT_MAX_KEYS_PER_LIST_RESPONSE +} + impl Debug for S3Config { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("S3Config") @@ -589,7 +608,7 @@ impl Debug for S3Config { } /// Azure bucket coordinates and access credentials to manage the bucket contents (read and write). -#[derive(Clone, PartialEq, Eq)] +#[derive(Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub struct AzureConfig { /// Name of the container to connect to. pub container_name: String, @@ -601,10 +620,16 @@ pub struct AzureConfig { pub prefix_in_container: Option, /// Azure has various limits on its API calls, we need not to exceed those. /// See [`DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT`] for more details. + #[serde(default = "default_remote_storage_azure_concurrency_limit")] pub concurrency_limit: NonZeroUsize, + #[serde(default = "default_max_keys_per_list_response")] pub max_keys_per_list_response: Option, } +fn default_remote_storage_azure_concurrency_limit() -> NonZeroUsize { + NonZeroUsize::new(DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT).unwrap() +} + impl Debug for AzureConfig { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("AzureConfig") @@ -621,167 +646,47 @@ impl Debug for AzureConfig { } } +fn deserialize_storage_class<'de, D: serde::Deserializer<'de>>( + deserializer: D, +) -> Result, D::Error> { + Option::::deserialize(deserializer).and_then(|s| { + if let Some(s) = s { + use serde::de::Error; + let storage_class = StorageClass::from_str(&s).expect("infallible"); + #[allow(deprecated)] + if matches!(storage_class, StorageClass::Unknown(_)) { + return Err(D::Error::custom(format!( + "Specified storage class unknown to SDK: '{s}'. 
Allowed values: {:?}", + StorageClass::values() + ))); + } + Ok(Some(storage_class)) + } else { + Ok(None) + } + }) +} + impl RemoteStorageConfig { pub const DEFAULT_TIMEOUT: Duration = std::time::Duration::from_secs(120); pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result> { - let local_path = toml.get("local_path"); - let bucket_name = toml.get("bucket_name"); - let bucket_region = toml.get("bucket_region"); - let container_name = toml.get("container_name"); - let container_region = toml.get("container_region"); - - let use_azure = container_name.is_some() && container_region.is_some(); - - let default_concurrency_limit = if use_azure { - DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT - } else { - DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT + let document: toml_edit::Document = match toml { + toml_edit::Item::Table(toml) => toml.clone().into(), + toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => { + toml.clone().into_table().into() + } + _ => bail!("toml not a table or inline table"), }; - let concurrency_limit = NonZeroUsize::new( - parse_optional_integer("concurrency_limit", toml)?.unwrap_or(default_concurrency_limit), - ) - .context("Failed to parse 'concurrency_limit' as a positive integer")?; - let max_keys_per_list_response = - parse_optional_integer::("max_keys_per_list_response", toml) - .context("Failed to parse 'max_keys_per_list_response' as a positive integer")? - .or(DEFAULT_MAX_KEYS_PER_LIST_RESPONSE); - - let endpoint = toml - .get("endpoint") - .map(|endpoint| parse_toml_string("endpoint", endpoint)) - .transpose()?; - - let timeout = toml - .get("timeout") - .map(|timeout| { - timeout - .as_str() - .ok_or_else(|| anyhow::Error::msg("timeout was not a string")) - }) - .transpose() - .and_then(|timeout| { - timeout - .map(humantime::parse_duration) - .transpose() - .map_err(anyhow::Error::new) - }) - .context("parse timeout")? - .unwrap_or(Self::DEFAULT_TIMEOUT); - - if timeout < Duration::from_secs(1) { - bail!("timeout was specified as {timeout:?} which is too low"); + if document.is_empty() { + return Ok(None); } - let storage = match ( - local_path, - bucket_name, - bucket_region, - container_name, - container_region, - ) { - // no 'local_path' nor 'bucket_name' options are provided, consider this remote storage disabled - (None, None, None, None, None) => return Ok(None), - (_, Some(_), None, ..) => { - bail!("'bucket_region' option is mandatory if 'bucket_name' is given ") - } - (_, None, Some(_), ..) => { - bail!("'bucket_name' option is mandatory if 'bucket_region' is given ") - } - (None, Some(bucket_name), Some(bucket_region), ..) => { - RemoteStorageKind::AwsS3(S3Config { - bucket_name: parse_toml_string("bucket_name", bucket_name)?, - bucket_region: parse_toml_string("bucket_region", bucket_region)?, - prefix_in_bucket: toml - .get("prefix_in_bucket") - .map(|prefix_in_bucket| { - parse_toml_string("prefix_in_bucket", prefix_in_bucket) - }) - .transpose()?, - endpoint, - concurrency_limit, - max_keys_per_list_response, - upload_storage_class: toml - .get("upload_storage_class") - .map(|prefix_in_bucket| -> anyhow::Result<_> { - let s = parse_toml_string("upload_storage_class", prefix_in_bucket)?; - let storage_class = StorageClass::from_str(&s).expect("infallible"); - #[allow(deprecated)] - if matches!(storage_class, StorageClass::Unknown(_)) { - bail!("Specified storage class unknown to SDK: '{s}'. 
Allowed values: {:?}", StorageClass::values()); - } - Ok(storage_class) - }) - .transpose()?, - }) - } - (_, _, _, Some(_), None) => { - bail!("'container_name' option is mandatory if 'container_region' is given ") - } - (_, _, _, None, Some(_)) => { - bail!("'container_name' option is mandatory if 'container_region' is given ") - } - (None, None, None, Some(container_name), Some(container_region)) => { - RemoteStorageKind::AzureContainer(AzureConfig { - container_name: parse_toml_string("container_name", container_name)?, - storage_account: toml - .get("storage_account") - .map(|storage_account| { - parse_toml_string("storage_account", storage_account) - }) - .transpose()?, - container_region: parse_toml_string("container_region", container_region)?, - prefix_in_container: toml - .get("prefix_in_container") - .map(|prefix_in_container| { - parse_toml_string("prefix_in_container", prefix_in_container) - }) - .transpose()?, - concurrency_limit, - max_keys_per_list_response, - }) - } - (Some(local_path), None, None, None, None) => RemoteStorageKind::LocalFs( - Utf8PathBuf::from(parse_toml_string("local_path", local_path)?), - ), - (Some(_), Some(_), ..) => { - bail!("'local_path' and 'bucket_name' are mutually exclusive") - } - (Some(_), _, _, Some(_), Some(_)) => { - bail!("local_path and 'container_name' are mutually exclusive") - } - }; - - Ok(Some(RemoteStorageConfig { storage, timeout })) + Ok(Some(toml_edit::de::from_document(document)?)) } } -// Helper functions to parse a toml Item -fn parse_optional_integer(name: &str, item: &toml_edit::Item) -> anyhow::Result> -where - I: TryFrom, - E: std::error::Error + Send + Sync + 'static, -{ - let toml_integer = match item.get(name) { - Some(item) => item - .as_integer() - .with_context(|| format!("configure option {name} is not an integer"))?, - None => return Ok(None), - }; - - I::try_from(toml_integer) - .map(Some) - .with_context(|| format!("configure option {name} is too large")) -} - -fn parse_toml_string(name: &str, item: &Item) -> anyhow::Result { - let s = item - .as_str() - .with_context(|| format!("configure option {name} is not a string"))?; - Ok(s.to_string()) -} - struct ConcurrencyLimiter { // Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded. // Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold. @@ -828,6 +733,11 @@ impl ConcurrencyLimiter { mod tests { use super::*; + fn parse(input: &str) -> anyhow::Result> { + let toml = input.parse::().unwrap(); + RemoteStorageConfig::from_toml(toml.as_item()) + } + #[test] fn test_object_name() { let k = RemotePath::new(Utf8Path::new("a/b/c")).unwrap(); @@ -855,18 +765,71 @@ mod tests { let input = "local_path = '.' 
timeout = '5s'"; - let toml = input.parse::().unwrap(); - - let config = RemoteStorageConfig::from_toml(toml.as_item()) - .unwrap() - .expect("it exists"); + let config = parse(input).unwrap().expect("it exists"); assert_eq!( config, RemoteStorageConfig { - storage: RemoteStorageKind::LocalFs(Utf8PathBuf::from(".")), + storage: RemoteStorageKind::LocalFs { + local_path: Utf8PathBuf::from(".") + }, timeout: Duration::from_secs(5) } ); } + + #[test] + fn test_s3_parsing() { + let toml = "\ + bucket_name = 'foo-bar' + bucket_region = 'eu-central-1' + upload_storage_class = 'INTELLIGENT_TIERING' + timeout = '7s' + "; + + let config = parse(toml).unwrap().expect("it exists"); + + assert_eq!( + config, + RemoteStorageConfig { + storage: RemoteStorageKind::AwsS3(S3Config { + bucket_name: "foo-bar".into(), + bucket_region: "eu-central-1".into(), + prefix_in_bucket: None, + endpoint: None, + concurrency_limit: default_remote_storage_s3_concurrency_limit(), + max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, + upload_storage_class: Some(StorageClass::IntelligentTiering), + }), + timeout: Duration::from_secs(7) + } + ); + } + + #[test] + fn test_azure_parsing() { + let toml = "\ + container_name = 'foo-bar' + container_region = 'westeurope' + upload_storage_class = 'INTELLIGENT_TIERING' + timeout = '7s' + "; + + let config = parse(toml).unwrap().expect("it exists"); + + assert_eq!( + config, + RemoteStorageConfig { + storage: RemoteStorageKind::AzureContainer(AzureConfig { + container_name: "foo-bar".into(), + storage_account: None, + container_region: "westeurope".into(), + prefix_in_container: None, + concurrency_limit: default_remote_storage_azure_concurrency_limit(), + max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, + }), + timeout: Duration::from_secs(7) + } + ); + } } diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index badea48b98..feb1363843 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -1463,7 +1463,7 @@ broker_endpoint = '{broker_endpoint}' assert_eq!( parsed_remote_storage_config, RemoteStorageConfig { - storage: RemoteStorageKind::LocalFs(local_storage_path.clone()), + storage: RemoteStorageKind::LocalFs { local_path: local_storage_path.clone() }, timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }, "Remote storage config should correctly parse the local FS config and fill other storage defaults" diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 3960fc1b99..e779729f8d 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -850,7 +850,9 @@ mod test { std::fs::create_dir_all(remote_fs_dir)?; let remote_fs_dir = harness.conf.workdir.join("remote_fs").canonicalize_utf8()?; let storage_config = RemoteStorageConfig { - storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()), + storage: RemoteStorageKind::LocalFs { + local_path: remote_fs_dir.clone(), + }, timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }; let storage = GenericRemoteStorage::from_config(&storage_config).unwrap(); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index ca5765c99b..ace95af10a 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3906,7 +3906,9 @@ pub(crate) mod harness { let remote_fs_dir = conf.workdir.join("localfs"); std::fs::create_dir_all(&remote_fs_dir).unwrap(); let config = RemoteStorageConfig { - storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()), + storage: RemoteStorageKind::LocalFs { + local_path: remote_fs_dir.clone(), + 
}, timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }; let remote_storage = GenericRemoteStorage::from_config(&config).unwrap(); diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 1355b7e1d8..e72bf199e3 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -543,7 +543,9 @@ mod tests { rx: impl Stream, ) -> Vec<(u64, usize, i64)> { let remote_storage_config = RemoteStorageConfig { - storage: RemoteStorageKind::LocalFs(tmpdir.to_path_buf()), + storage: RemoteStorageKind::LocalFs { + local_path: tmpdir.to_path_buf(), + }, timeout: std::time::Duration::from_secs(120), }; let storage = GenericRemoteStorage::from_config(&remote_storage_config).unwrap(); diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index df16c71789..139a5647c5 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -25,6 +25,7 @@ axum = { version = "0.6", features = ["ws"] } base64 = { version = "0.21", features = ["alloc"] } base64ct = { version = "1", default-features = false, features = ["std"] } bytes = { version = "1", features = ["serde"] } +camino = { version = "1", default-features = false, features = ["serde1"] } chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] } clap = { version = "4", features = ["derive", "string"] } clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] } From 78d9059fc7490e9c9374e80e04507a88861bd89a Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 24 Jun 2024 11:20:27 +0100 Subject: [PATCH 1028/1571] proxy: update tokio-postgres to allow arbitrary config params (#8076) ## Problem Fixes https://github.com/neondatabase/neon/issues/1287 ## Summary of changes tokio-postgres now supports arbitrary server params through the `param(key, value)` method. Some keys are special so we explicitly filter them out. 
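For illustration only (this is a sketch, not the proxy's actual code, and the key list below is abbreviated; the real handling is in `proxy/src/compute.rs` in the diff that follows): the pass-through amounts to filtering the client's startup parameters against a deny-list of keys that tokio-postgres interprets itself, and forwarding everything else via the new `param(key, value)` setter.

```rust
// Sketch of the filtering idea with a hypothetical, abbreviated deny-list.
const RESERVED: &[&str] = &[
    "user", "database", "dbname", "password", "sslmode", "host", "port",
    "connect_timeout", "options", "replication",
];

/// Keep only the startup params that are safe to forward verbatim.
fn forwardable<'a>(params: &[(&'a str, &'a str)]) -> Vec<(&'a str, &'a str)> {
    params
        .iter()
        .copied()
        .filter(|(k, _)| !RESERVED.contains(k))
        .collect()
}

fn main() {
    let startup = [
        ("user", "alice"),
        ("intervalstyle", "iso_8601"),
        ("application_name", "psql"),
        ("sslmode", "require"),
    ];
    // Only `intervalstyle` and `application_name` survive the filter here; those
    // are the ones that would then be forwarded with `config.param(key, value)`.
    println!("{:?}", forwardable(&startup));
}
```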
--- Cargo.lock | 8 +- libs/postgres_connection/src/lib.rs | 50 +++++----- proxy/src/compute.rs | 129 ++++++++++++++------------ proxy/src/serverless/backend.rs | 4 + proxy/src/serverless/sql_over_http.rs | 1 + test_runner/regress/test_proxy.py | 19 ++++ 6 files changed, 119 insertions(+), 92 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 77bf012402..70c837c146 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4005,7 +4005,7 @@ dependencies = [ [[package]] name = "postgres" version = "0.19.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295" dependencies = [ "bytes", "fallible-iterator", @@ -4018,7 +4018,7 @@ dependencies = [ [[package]] name = "postgres-protocol" version = "0.6.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295" dependencies = [ "base64 0.20.0", "byteorder", @@ -4037,7 +4037,7 @@ dependencies = [ [[package]] name = "postgres-types" version = "0.2.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295" dependencies = [ "bytes", "fallible-iterator", @@ -6210,7 +6210,7 @@ dependencies = [ [[package]] name = "tokio-postgres" version = "0.7.7" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295" dependencies = [ "async-trait", "byteorder", diff --git a/libs/postgres_connection/src/lib.rs b/libs/postgres_connection/src/lib.rs index 9f57f3d507..fdabcbacb2 100644 --- a/libs/postgres_connection/src/lib.rs +++ b/libs/postgres_connection/src/lib.rs @@ -144,20 +144,7 @@ impl PgConnectionConfig { // implement and this function is hardly a bottleneck. The function is only called around // establishing a new connection. #[allow(unstable_name_collisions)] - config.options( - &self - .options - .iter() - .map(|s| { - if s.contains(['\\', ' ']) { - Cow::Owned(s.replace('\\', "\\\\").replace(' ', "\\ ")) - } else { - Cow::Borrowed(s.as_str()) - } - }) - .intersperse(Cow::Borrowed(" ")) // TODO: use impl from std once it's stabilized - .collect::(), - ); + config.options(&encode_options(&self.options)); } config } @@ -178,6 +165,21 @@ impl PgConnectionConfig { } } +#[allow(unstable_name_collisions)] +fn encode_options(options: &[String]) -> String { + options + .iter() + .map(|s| { + if s.contains(['\\', ' ']) { + Cow::Owned(s.replace('\\', "\\\\").replace(' ', "\\ ")) + } else { + Cow::Borrowed(s.as_str()) + } + }) + .intersperse(Cow::Borrowed(" ")) // TODO: use impl from std once it's stabilized + .collect::() +} + impl fmt::Display for PgConnectionConfig { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { // The password is intentionally hidden and not part of this display string. 
@@ -206,7 +208,7 @@ impl fmt::Debug for PgConnectionConfig { #[cfg(test)] mod tests_pg_connection_config { - use crate::PgConnectionConfig; + use crate::{encode_options, PgConnectionConfig}; use once_cell::sync::Lazy; use url::Host; @@ -255,18 +257,12 @@ mod tests_pg_connection_config { #[test] fn test_with_options() { - let cfg = PgConnectionConfig::new_host_port(STUB_HOST.clone(), 123).extend_options([ - "hello", - "world", - "with space", - "and \\ backslashes", + let options = encode_options(&[ + "hello".to_owned(), + "world".to_owned(), + "with space".to_owned(), + "and \\ backslashes".to_owned(), ]); - assert_eq!(cfg.host(), &*STUB_HOST); - assert_eq!(cfg.port(), 123); - assert_eq!(cfg.raw_address(), "stub.host.example:123"); - assert_eq!( - cfg.to_tokio_postgres_config().get_options(), - Some("hello world with\\ space and\\ \\\\\\ backslashes") - ); + assert_eq!(options, "hello world with\\ space and\\ \\\\\\ backslashes"); } } diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index feb09d5638..a50a96e5e8 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -103,12 +103,8 @@ impl ConnCfg { /// Reuse password or auth keys from the other config. pub fn reuse_password(&mut self, other: Self) { - if let Some(password) = other.get_password() { - self.password(password); - } - - if let Some(keys) = other.get_auth_keys() { - self.auth_keys(keys); + if let Some(password) = other.get_auth() { + self.auth(password); } } @@ -124,48 +120,64 @@ impl ConnCfg { /// Apply startup message params to the connection config. pub fn set_startup_params(&mut self, params: &StartupMessageParams) { - // Only set `user` if it's not present in the config. - // Link auth flow takes username from the console's response. - if let (None, Some(user)) = (self.get_user(), params.get("user")) { - self.user(user); - } - - // Only set `dbname` if it's not present in the config. - // Link auth flow takes dbname from the console's response. - if let (None, Some(dbname)) = (self.get_dbname(), params.get("database")) { - self.dbname(dbname); - } - - // Don't add `options` if they were only used for specifying a project. - // Connection pools don't support `options`, because they affect backend startup. - if let Some(options) = filtered_options(params) { - self.options(&options); - } - - if let Some(app_name) = params.get("application_name") { - self.application_name(app_name); - } - - // TODO: This is especially ugly... - if let Some(replication) = params.get("replication") { - use tokio_postgres::config::ReplicationMode; - match replication { - "true" | "on" | "yes" | "1" => { - self.replication_mode(ReplicationMode::Physical); + let mut client_encoding = false; + for (k, v) in params.iter() { + match k { + "user" => { + // Only set `user` if it's not present in the config. + // Link auth flow takes username from the console's response. + if self.get_user().is_none() { + self.user(v); + } } "database" => { - self.replication_mode(ReplicationMode::Logical); + // Only set `dbname` if it's not present in the config. + // Link auth flow takes dbname from the console's response. + if self.get_dbname().is_none() { + self.dbname(v); + } + } + "options" => { + // Don't add `options` if they were only used for specifying a project. + // Connection pools don't support `options`, because they affect backend startup. 
+ if let Some(options) = filtered_options(v) { + self.options(&options); + } + } + + // the special ones in tokio-postgres that we don't want being set by the user + "dbname" => {} + "password" => {} + "sslmode" => {} + "host" => {} + "port" => {} + "connect_timeout" => {} + "keepalives" => {} + "keepalives_idle" => {} + "keepalives_interval" => {} + "keepalives_retries" => {} + "target_session_attrs" => {} + "channel_binding" => {} + "max_backend_message_size" => {} + + "client_encoding" => { + client_encoding = true; + // only error should be from bad null bytes, + // but we've already checked for those. + _ = self.param("client_encoding", v); + } + + _ => { + // only error should be from bad null bytes, + // but we've already checked for those. + _ = self.param(k, v); } - _other => {} } } - - // TODO: extend the list of the forwarded startup parameters. - // Currently, tokio-postgres doesn't allow us to pass - // arbitrary parameters, but the ones above are a good start. - // - // This and the reverse params problem can be better addressed - // in a bespoke connection machinery (a new library for that sake). + if !client_encoding { + // for compatibility since we removed it from tokio-postgres + self.param("client_encoding", "UTF8").unwrap(); + } } } @@ -338,10 +350,9 @@ impl ConnCfg { } /// Retrieve `options` from a startup message, dropping all proxy-secific flags. -fn filtered_options(params: &StartupMessageParams) -> Option { +fn filtered_options(options: &str) -> Option { #[allow(unstable_name_collisions)] - let options: String = params - .options_raw()? + let options: String = StartupMessageParams::parse_options_raw(options) .filter(|opt| parse_endpoint_param(opt).is_none() && neon_option(opt).is_none()) .intersperse(" ") // TODO: use impl from std once it's stabilized .collect(); @@ -413,27 +424,23 @@ mod tests { #[test] fn test_filtered_options() { // Empty options is unlikely to be useful anyway. - let params = StartupMessageParams::new([("options", "")]); - assert_eq!(filtered_options(¶ms), None); + assert_eq!(filtered_options(""), None); // It's likely that clients will only use options to specify endpoint/project. - let params = StartupMessageParams::new([("options", "project=foo")]); - assert_eq!(filtered_options(¶ms), None); + let params = "project=foo"; + assert_eq!(filtered_options(params), None); // Same, because unescaped whitespaces are no-op. 
- let params = StartupMessageParams::new([("options", " project=foo ")]); - assert_eq!(filtered_options(¶ms).as_deref(), None); + let params = " project=foo "; + assert_eq!(filtered_options(params), None); - let params = StartupMessageParams::new([("options", r"\ project=foo \ ")]); - assert_eq!(filtered_options(¶ms).as_deref(), Some(r"\ \ ")); + let params = r"\ project=foo \ "; + assert_eq!(filtered_options(params).as_deref(), Some(r"\ \ ")); - let params = StartupMessageParams::new([("options", "project = foo")]); - assert_eq!(filtered_options(¶ms).as_deref(), Some("project = foo")); + let params = "project = foo"; + assert_eq!(filtered_options(params).as_deref(), Some("project = foo")); - let params = StartupMessageParams::new([( - "options", - "project = foo neon_endpoint_type:read_write neon_lsn:0/2", - )]); - assert_eq!(filtered_options(¶ms).as_deref(), Some("project = foo")); + let params = "project = foo neon_endpoint_type:read_write neon_lsn:0/2"; + assert_eq!(filtered_options(params).as_deref(), Some("project = foo")); } } diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 86e64c0a38..05d6061238 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -231,6 +231,10 @@ impl ConnectMechanism for TokioMechanism { .dbname(&self.conn_info.dbname) .connect_timeout(timeout); + config + .param("client_encoding", "UTF8") + .expect("client encoding UTF8 is always valid"); + let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); let res = config.connect(tokio_postgres::NoTls).await; drop(pause); diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 7a99aeb759..583ff75f7c 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -202,6 +202,7 @@ fn get_conn_info( options = Some(NeonOptions::parse_options_raw(&value)); } } + ctx.set_db_options(params.freeze()); let user_info = ComputeUserInfo { endpoint, diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index f446f4f200..8ed44b1094 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -53,6 +53,25 @@ def test_proxy_select_1(static_proxy: NeonProxy): assert out[0][0] == 42 +def test_proxy_server_params(static_proxy: NeonProxy): + """ + Test that server params are passing through to postgres + """ + + out = static_proxy.safe_psql( + "select to_json('0 seconds'::interval)", options="-c intervalstyle=iso_8601" + ) + assert out[0][0] == "PT0S" + out = static_proxy.safe_psql( + "select to_json('0 seconds'::interval)", options="-c intervalstyle=sql_standard" + ) + assert out[0][0] == "0" + out = static_proxy.safe_psql( + "select to_json('0 seconds'::interval)", options="-c intervalstyle=postgres" + ) + assert out[0][0] == "00:00:00" + + def test_password_hack(static_proxy: NeonProxy): """ Check the PasswordHack auth flow: an alternative to SCRAM auth for From 5446e08891bd58a598aa427cb6208806154e3b41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 24 Jun 2024 12:29:54 +0200 Subject: [PATCH 1029/1571] Move remote_storage config related code into dedicated module (#8132) Moves `RemoteStorageConfig` and related structs and functions into a dedicated module. Also implements `Serialize` for the config structs (requested in #8126). 
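As a rough sketch of the serde shape involved (illustrative only, not the crate's real definitions, which also carry S3/Azure-specific fields, defaults, and timeouts): an `#[serde(untagged)]` enum lets the storage backend be selected purely by which TOML keys are present, and deriving `Serialize` makes the same config round-trippable.

```rust
use serde::{Deserialize, Serialize};

// Hypothetical stand-in for RemoteStorageKind; only the field names mirror the real ones.
#[derive(Debug, PartialEq, Deserialize, Serialize)]
#[serde(untagged)]
enum StorageKindSketch {
    LocalFs { local_path: String },
    AwsS3 { bucket_name: String, bucket_region: String },
}

fn main() -> anyhow::Result<()> {
    // Which variant deserializes is decided by the keys present; no tag is needed.
    let local: StorageKindSketch = toml::from_str("local_path = '.'")?;
    assert_eq!(
        local,
        StorageKindSketch::LocalFs { local_path: ".".into() }
    );

    let s3: StorageKindSketch = toml::from_str(
        "bucket_name = 'foo-bar'\nbucket_region = 'eu-central-1'",
    )?;
    // With Serialize derived, the config can also be written back out as TOML.
    println!("{}", toml::to_string(&s3)?);
    Ok(())
}
```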
Follow-up of #8126 --- libs/remote_storage/src/azure_blob.rs | 2 +- libs/remote_storage/src/config.rs | 277 ++++++++++++++++++++++++++ libs/remote_storage/src/lib.rs | 254 +---------------------- libs/remote_storage/src/s3_bucket.rs | 4 +- 4 files changed, 285 insertions(+), 252 deletions(-) create mode 100644 libs/remote_storage/src/config.rs diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index dbd64fb5a6..8e590b17c4 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -34,7 +34,7 @@ use utils::backoff; use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind}; use crate::{ - error::Cancelled, AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing, + config::AzureConfig, error::Cancelled, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata, TimeTravelError, TimeoutOrCancel, }; diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs new file mode 100644 index 0000000000..8a8f6212e9 --- /dev/null +++ b/libs/remote_storage/src/config.rs @@ -0,0 +1,277 @@ +use std::{fmt::Debug, num::NonZeroUsize, str::FromStr, time::Duration}; + +use anyhow::bail; +use aws_sdk_s3::types::StorageClass; +use camino::Utf8PathBuf; + +use serde::{Deserialize, Serialize}; + +use crate::{ + DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT, + DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, +}; + +/// External backup storage configuration, enough for creating a client for that storage. +#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)] +pub struct RemoteStorageConfig { + /// The storage connection configuration. + #[serde(flatten)] + pub storage: RemoteStorageKind, + /// A common timeout enforced for all requests after concurrency limiter permit has been + /// acquired. + #[serde( + with = "humantime_serde", + default = "default_timeout", + skip_serializing_if = "is_default_timeout" + )] + pub timeout: Duration, +} + +fn default_timeout() -> Duration { + RemoteStorageConfig::DEFAULT_TIMEOUT +} + +fn is_default_timeout(d: &Duration) -> bool { + *d == RemoteStorageConfig::DEFAULT_TIMEOUT +} + +/// A kind of a remote storage to connect to, with its connection configuration. +#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)] +#[serde(untagged)] +pub enum RemoteStorageKind { + /// Storage based on local file system. + /// Specify a root folder to place all stored files into. + LocalFs { local_path: Utf8PathBuf }, + /// AWS S3 based storage, storing all files in the S3 bucket + /// specified by the config + AwsS3(S3Config), + /// Azure Blob based storage, storing all files in the container + /// specified by the config + AzureContainer(AzureConfig), +} + +/// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write). +#[derive(Clone, PartialEq, Eq, Deserialize, Serialize)] +pub struct S3Config { + /// Name of the bucket to connect to. + pub bucket_name: String, + /// The region where the bucket is located at. + pub bucket_region: String, + /// A "subfolder" in the bucket, to use the same bucket separately by multiple remote storage users at once. + pub prefix_in_bucket: Option, + /// A base URL to send S3 requests to. + /// By default, the endpoint is derived from a region name, assuming it's + /// an AWS S3 region name, erroring on wrong region name. + /// Endpoint provides a way to support other S3 flavors and their regions. 
+ /// + /// Example: `http://127.0.0.1:5000` + pub endpoint: Option, + /// AWS S3 has various limits on its API calls, we need not to exceed those. + /// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details. + #[serde(default = "default_remote_storage_s3_concurrency_limit")] + pub concurrency_limit: NonZeroUsize, + #[serde(default = "default_max_keys_per_list_response")] + pub max_keys_per_list_response: Option, + #[serde( + deserialize_with = "deserialize_storage_class", + serialize_with = "serialize_storage_class", + default + )] + pub upload_storage_class: Option, +} + +fn default_remote_storage_s3_concurrency_limit() -> NonZeroUsize { + DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT + .try_into() + .unwrap() +} + +fn default_max_keys_per_list_response() -> Option { + DEFAULT_MAX_KEYS_PER_LIST_RESPONSE +} + +impl Debug for S3Config { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("S3Config") + .field("bucket_name", &self.bucket_name) + .field("bucket_region", &self.bucket_region) + .field("prefix_in_bucket", &self.prefix_in_bucket) + .field("concurrency_limit", &self.concurrency_limit) + .field( + "max_keys_per_list_response", + &self.max_keys_per_list_response, + ) + .finish() + } +} + +/// Azure bucket coordinates and access credentials to manage the bucket contents (read and write). +#[derive(Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct AzureConfig { + /// Name of the container to connect to. + pub container_name: String, + /// Name of the storage account the container is inside of + pub storage_account: Option, + /// The region where the bucket is located at. + pub container_region: String, + /// A "subfolder" in the container, to use the same container separately by multiple remote storage users at once. + pub prefix_in_container: Option, + /// Azure has various limits on its API calls, we need not to exceed those. + /// See [`DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT`] for more details. + #[serde(default = "default_remote_storage_azure_concurrency_limit")] + pub concurrency_limit: NonZeroUsize, + #[serde(default = "default_max_keys_per_list_response")] + pub max_keys_per_list_response: Option, +} + +fn default_remote_storage_azure_concurrency_limit() -> NonZeroUsize { + NonZeroUsize::new(DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT).unwrap() +} + +impl Debug for AzureConfig { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("AzureConfig") + .field("bucket_name", &self.container_name) + .field("storage_account", &self.storage_account) + .field("bucket_region", &self.container_region) + .field("prefix_in_container", &self.prefix_in_container) + .field("concurrency_limit", &self.concurrency_limit) + .field( + "max_keys_per_list_response", + &self.max_keys_per_list_response, + ) + .finish() + } +} + +fn deserialize_storage_class<'de, D: serde::Deserializer<'de>>( + deserializer: D, +) -> Result, D::Error> { + Option::::deserialize(deserializer).and_then(|s| { + if let Some(s) = s { + use serde::de::Error; + let storage_class = StorageClass::from_str(&s).expect("infallible"); + #[allow(deprecated)] + if matches!(storage_class, StorageClass::Unknown(_)) { + return Err(D::Error::custom(format!( + "Specified storage class unknown to SDK: '{s}'. 
Allowed values: {:?}", + StorageClass::values() + ))); + } + Ok(Some(storage_class)) + } else { + Ok(None) + } + }) +} + +fn serialize_storage_class( + val: &Option, + serializer: S, +) -> Result { + let val = val.as_ref().map(StorageClass::as_str); + Option::<&str>::serialize(&val, serializer) +} + +impl RemoteStorageConfig { + pub const DEFAULT_TIMEOUT: Duration = std::time::Duration::from_secs(120); + + pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result> { + let document: toml_edit::Document = match toml { + toml_edit::Item::Table(toml) => toml.clone().into(), + toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => { + toml.clone().into_table().into() + } + _ => bail!("toml not a table or inline table"), + }; + + if document.is_empty() { + return Ok(None); + } + + Ok(Some(toml_edit::de::from_document(document)?)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn parse(input: &str) -> anyhow::Result> { + let toml = input.parse::().unwrap(); + RemoteStorageConfig::from_toml(toml.as_item()) + } + + #[test] + fn parse_localfs_config_with_timeout() { + let input = "local_path = '.' +timeout = '5s'"; + + let config = parse(input).unwrap().expect("it exists"); + + assert_eq!( + config, + RemoteStorageConfig { + storage: RemoteStorageKind::LocalFs { + local_path: Utf8PathBuf::from(".") + }, + timeout: Duration::from_secs(5) + } + ); + } + + #[test] + fn test_s3_parsing() { + let toml = "\ + bucket_name = 'foo-bar' + bucket_region = 'eu-central-1' + upload_storage_class = 'INTELLIGENT_TIERING' + timeout = '7s' + "; + + let config = parse(toml).unwrap().expect("it exists"); + + assert_eq!( + config, + RemoteStorageConfig { + storage: RemoteStorageKind::AwsS3(S3Config { + bucket_name: "foo-bar".into(), + bucket_region: "eu-central-1".into(), + prefix_in_bucket: None, + endpoint: None, + concurrency_limit: default_remote_storage_s3_concurrency_limit(), + max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, + upload_storage_class: Some(StorageClass::IntelligentTiering), + }), + timeout: Duration::from_secs(7) + } + ); + } + + #[test] + fn test_azure_parsing() { + let toml = "\ + container_name = 'foo-bar' + container_region = 'westeurope' + upload_storage_class = 'INTELLIGENT_TIERING' + timeout = '7s' + "; + + let config = parse(toml).unwrap().expect("it exists"); + + assert_eq!( + config, + RemoteStorageConfig { + storage: RemoteStorageKind::AzureContainer(AzureConfig { + container_name: "foo-bar".into(), + storage_account: None, + container_region: "westeurope".into(), + prefix_in_container: None, + concurrency_limit: default_remote_storage_azure_concurrency_limit(), + max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, + }), + timeout: Duration::from_secs(7) + } + ); + } +} diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index e39ac581c7..d440c03a0e 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -10,6 +10,7 @@ #![deny(clippy::undocumented_unsafe_blocks)] mod azure_blob; +mod config; mod error; mod local_fs; mod metrics; @@ -18,17 +19,10 @@ mod simulate_failures; mod support; use std::{ - collections::HashMap, - fmt::Debug, - num::{NonZeroU32, NonZeroUsize}, - pin::Pin, - str::FromStr, - sync::Arc, - time::{Duration, SystemTime}, + collections::HashMap, fmt::Debug, num::NonZeroU32, pin::Pin, sync::Arc, time::SystemTime, }; -use anyhow::{bail, Context}; -use aws_sdk_s3::types::StorageClass; +use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; use bytes::Bytes; @@ -44,6 +38,8 @@ 
pub use self::{ }; use s3_bucket::RequestKind; +pub use crate::config::{AzureConfig, RemoteStorageConfig, RemoteStorageKind, S3Config}; + /// Azure SDK's ETag type is a simple String wrapper: we use this internally instead of repeating it here. pub use azure_core::Etag; @@ -525,168 +521,6 @@ impl From<[(&str, &str); N]> for StorageMetadata { } } -/// External backup storage configuration, enough for creating a client for that storage. -#[derive(Debug, Clone, PartialEq, Eq, Deserialize)] -pub struct RemoteStorageConfig { - /// The storage connection configuration. - #[serde(flatten)] - pub storage: RemoteStorageKind, - /// A common timeout enforced for all requests after concurrency limiter permit has been - /// acquired. - #[serde(with = "humantime_serde", default = "default_timeout")] - pub timeout: Duration, -} - -fn default_timeout() -> Duration { - RemoteStorageConfig::DEFAULT_TIMEOUT -} - -/// A kind of a remote storage to connect to, with its connection configuration. -#[derive(Debug, Clone, PartialEq, Eq, Deserialize)] -#[serde(untagged)] -pub enum RemoteStorageKind { - /// Storage based on local file system. - /// Specify a root folder to place all stored files into. - LocalFs { local_path: Utf8PathBuf }, - /// AWS S3 based storage, storing all files in the S3 bucket - /// specified by the config - AwsS3(S3Config), - /// Azure Blob based storage, storing all files in the container - /// specified by the config - AzureContainer(AzureConfig), -} - -/// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write). -#[derive(Clone, PartialEq, Eq, serde::Deserialize)] -pub struct S3Config { - /// Name of the bucket to connect to. - pub bucket_name: String, - /// The region where the bucket is located at. - pub bucket_region: String, - /// A "subfolder" in the bucket, to use the same bucket separately by multiple remote storage users at once. - pub prefix_in_bucket: Option, - /// A base URL to send S3 requests to. - /// By default, the endpoint is derived from a region name, assuming it's - /// an AWS S3 region name, erroring on wrong region name. - /// Endpoint provides a way to support other S3 flavors and their regions. - /// - /// Example: `http://127.0.0.1:5000` - pub endpoint: Option, - /// AWS S3 has various limits on its API calls, we need not to exceed those. - /// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details. - #[serde(default = "default_remote_storage_s3_concurrency_limit")] - pub concurrency_limit: NonZeroUsize, - #[serde(default = "default_max_keys_per_list_response")] - pub max_keys_per_list_response: Option, - #[serde(deserialize_with = "deserialize_storage_class", default)] - pub upload_storage_class: Option, -} - -fn default_remote_storage_s3_concurrency_limit() -> NonZeroUsize { - DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT - .try_into() - .unwrap() -} - -fn default_max_keys_per_list_response() -> Option { - DEFAULT_MAX_KEYS_PER_LIST_RESPONSE -} - -impl Debug for S3Config { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("S3Config") - .field("bucket_name", &self.bucket_name) - .field("bucket_region", &self.bucket_region) - .field("prefix_in_bucket", &self.prefix_in_bucket) - .field("concurrency_limit", &self.concurrency_limit) - .field( - "max_keys_per_list_response", - &self.max_keys_per_list_response, - ) - .finish() - } -} - -/// Azure bucket coordinates and access credentials to manage the bucket contents (read and write). 
-#[derive(Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] -pub struct AzureConfig { - /// Name of the container to connect to. - pub container_name: String, - /// Name of the storage account the container is inside of - pub storage_account: Option, - /// The region where the bucket is located at. - pub container_region: String, - /// A "subfolder" in the container, to use the same container separately by multiple remote storage users at once. - pub prefix_in_container: Option, - /// Azure has various limits on its API calls, we need not to exceed those. - /// See [`DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT`] for more details. - #[serde(default = "default_remote_storage_azure_concurrency_limit")] - pub concurrency_limit: NonZeroUsize, - #[serde(default = "default_max_keys_per_list_response")] - pub max_keys_per_list_response: Option, -} - -fn default_remote_storage_azure_concurrency_limit() -> NonZeroUsize { - NonZeroUsize::new(DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT).unwrap() -} - -impl Debug for AzureConfig { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("AzureConfig") - .field("bucket_name", &self.container_name) - .field("storage_account", &self.storage_account) - .field("bucket_region", &self.container_region) - .field("prefix_in_container", &self.prefix_in_container) - .field("concurrency_limit", &self.concurrency_limit) - .field( - "max_keys_per_list_response", - &self.max_keys_per_list_response, - ) - .finish() - } -} - -fn deserialize_storage_class<'de, D: serde::Deserializer<'de>>( - deserializer: D, -) -> Result, D::Error> { - Option::::deserialize(deserializer).and_then(|s| { - if let Some(s) = s { - use serde::de::Error; - let storage_class = StorageClass::from_str(&s).expect("infallible"); - #[allow(deprecated)] - if matches!(storage_class, StorageClass::Unknown(_)) { - return Err(D::Error::custom(format!( - "Specified storage class unknown to SDK: '{s}'. Allowed values: {:?}", - StorageClass::values() - ))); - } - Ok(Some(storage_class)) - } else { - Ok(None) - } - }) -} - -impl RemoteStorageConfig { - pub const DEFAULT_TIMEOUT: Duration = std::time::Duration::from_secs(120); - - pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result> { - let document: toml_edit::Document = match toml { - toml_edit::Item::Table(toml) => toml.clone().into(), - toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => { - toml.clone().into_table().into() - } - _ => bail!("toml not a table or inline table"), - }; - - if document.is_empty() { - return Ok(None); - } - - Ok(Some(toml_edit::de::from_document(document)?)) - } -} - struct ConcurrencyLimiter { // Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded. // Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold. @@ -733,11 +567,6 @@ impl ConcurrencyLimiter { mod tests { use super::*; - fn parse(input: &str) -> anyhow::Result> { - let toml = input.parse::().unwrap(); - RemoteStorageConfig::from_toml(toml.as_item()) - } - #[test] fn test_object_name() { let k = RemotePath::new(Utf8Path::new("a/b/c")).unwrap(); @@ -759,77 +588,4 @@ mod tests { let err = RemotePath::new(Utf8Path::new("/")).expect_err("Should fail on absolute paths"); assert_eq!(err.to_string(), "Path \"/\" is not relative"); } - - #[test] - fn parse_localfs_config_with_timeout() { - let input = "local_path = '.' 
-timeout = '5s'"; - - let config = parse(input).unwrap().expect("it exists"); - - assert_eq!( - config, - RemoteStorageConfig { - storage: RemoteStorageKind::LocalFs { - local_path: Utf8PathBuf::from(".") - }, - timeout: Duration::from_secs(5) - } - ); - } - - #[test] - fn test_s3_parsing() { - let toml = "\ - bucket_name = 'foo-bar' - bucket_region = 'eu-central-1' - upload_storage_class = 'INTELLIGENT_TIERING' - timeout = '7s' - "; - - let config = parse(toml).unwrap().expect("it exists"); - - assert_eq!( - config, - RemoteStorageConfig { - storage: RemoteStorageKind::AwsS3(S3Config { - bucket_name: "foo-bar".into(), - bucket_region: "eu-central-1".into(), - prefix_in_bucket: None, - endpoint: None, - concurrency_limit: default_remote_storage_s3_concurrency_limit(), - max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, - upload_storage_class: Some(StorageClass::IntelligentTiering), - }), - timeout: Duration::from_secs(7) - } - ); - } - - #[test] - fn test_azure_parsing() { - let toml = "\ - container_name = 'foo-bar' - container_region = 'westeurope' - upload_storage_class = 'INTELLIGENT_TIERING' - timeout = '7s' - "; - - let config = parse(toml).unwrap().expect("it exists"); - - assert_eq!( - config, - RemoteStorageConfig { - storage: RemoteStorageKind::AzureContainer(AzureConfig { - container_name: "foo-bar".into(), - storage_account: None, - container_region: "westeurope".into(), - prefix_in_container: None, - concurrency_limit: default_remote_storage_azure_concurrency_limit(), - max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, - }), - timeout: Duration::from_secs(7) - } - ); - } } diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 76cf3eac80..ef1bd2c047 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -46,12 +46,12 @@ use utils::backoff; use super::StorageMetadata; use crate::{ + config::S3Config, error::Cancelled, metrics::{start_counting_cancelled_wait, start_measuring_requests}, support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, - S3Config, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE, - REMOTE_STORAGE_PREFIX_SEPARATOR, + TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR, }; use crate::metrics::AttemptOutcome; From 188797f0486adb53b24edac39929e36bffdfe1b3 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 24 Jun 2024 11:41:11 +0100 Subject: [PATCH 1030/1571] pageserver: remove code that resumes tenant deletions after restarts (#8091) #8082 removed the legacy deletion path, but retained code for completing deletions that were started before a pageserver restart. This PR cleans up that remaining code, and removes all the pageserver code that dealt with tenant deletion markers and resuming tenant deletions. 
The release at https://github.com/neondatabase/neon/pull/8138 contains https://github.com/neondatabase/neon/pull/8082, so we can now merge this to `main` --- pageserver/src/config.rs | 12 +- pageserver/src/http/routes.rs | 9 +- pageserver/src/tenant.rs | 69 +--- pageserver/src/tenant/delete.rs | 426 ----------------------- pageserver/src/tenant/mgr.rs | 54 +-- pageserver/src/tenant/timeline/delete.rs | 5 - 6 files changed, 22 insertions(+), 553 deletions(-) delete mode 100644 pageserver/src/tenant/delete.rs diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index feb1363843..104234841c 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -33,9 +33,7 @@ use utils::{ use crate::tenant::timeline::GetVectoredImpl; use crate::tenant::vectored_blob_io::MaxVectoredReadBytes; use crate::tenant::{config::TenantConfOpt, timeline::GetImpl}; -use crate::tenant::{ - TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME, -}; +use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine}; use crate::{tenant::config::TenantConf, virtual_file}; use crate::{ @@ -855,14 +853,6 @@ impl PageServerConf { ) } - pub(crate) fn tenant_deleted_mark_file_path( - &self, - tenant_shard_id: &TenantShardId, - ) -> Utf8PathBuf { - self.tenant_path(tenant_shard_id) - .join(TENANT_DELETED_MARKER_FILE_NAME) - } - pub fn traces_path(&self) -> Utf8PathBuf { self.workdir.join("traces") } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index b5713a8cb4..cfa507fed0 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -329,14 +329,11 @@ impl From for ApiError { } } -impl From for ApiError { - fn from(value: crate::tenant::delete::DeleteTenantError) -> Self { - use crate::tenant::delete::DeleteTenantError::*; +impl From for ApiError { + fn from(value: crate::tenant::mgr::DeleteTenantError) -> Self { + use crate::tenant::mgr::DeleteTenantError::*; match value { - Get(g) => ApiError::from(g), - Timeline(t) => ApiError::from(t), SlotError(e) => e.into(), - SlotUpsertError(e) => e.into(), Other(o) => ApiError::InternalServerError(o), Cancelled => ApiError::ShuttingDown, } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index ace95af10a..6a748f61e7 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -55,11 +55,9 @@ use self::config::AttachedLocationConfig; use self::config::AttachmentMode; use self::config::LocationConf; use self::config::TenantConf; -use self::delete::DeleteTenantFlow; use self::metadata::TimelineMetadata; use self::mgr::GetActiveTenantError; use self::mgr::GetTenantError; -use self::mgr::TenantsMap; use self::remote_timeline_client::upload::upload_index_part; use self::remote_timeline_client::RemoteTimelineClient; use self::timeline::uninit::TimelineCreateGuard; @@ -137,7 +135,6 @@ pub mod remote_timeline_client; pub mod storage_layer; pub mod config; -pub mod delete; pub mod mgr; pub mod secondary; pub mod tasks; @@ -161,8 +158,6 @@ pub const TENANTS_SEGMENT_NAME: &str = "tenants"; /// Parts of the `.neon/tenants//timelines/` directory prefix. pub const TIMELINES_SEGMENT_NAME: &str = "timelines"; -pub const TENANT_DELETED_MARKER_FILE_NAME: &str = "deleted"; - /// References to shared objects that are passed into each tenant, such /// as the shared remote storage client and process initialization state. 
#[derive(Clone)] @@ -207,7 +202,6 @@ struct TimelinePreload { } pub(crate) struct TenantPreload { - deleting: bool, timelines: HashMap, } @@ -286,8 +280,6 @@ pub struct Tenant { /// background warmup. pub(crate) activate_now_sem: tokio::sync::Semaphore, - pub(crate) delete_progress: Arc>, - // Cancellation token fires when we have entered shutdown(). This is a parent of // Timelines' cancellation token. pub(crate) cancel: CancellationToken, @@ -654,7 +646,6 @@ impl Tenant { attached_conf: AttachedTenantConf, shard_identity: ShardIdentity, init_order: Option, - tenants: &'static std::sync::RwLock, mode: SpawnMode, ctx: &RequestContext, ) -> anyhow::Result> { @@ -828,52 +819,6 @@ impl Tenant { // Remote preload is complete. drop(remote_load_completion); - let pending_deletion = { - match DeleteTenantFlow::should_resume_deletion( - conf, - preload.as_ref().map(|p| p.deleting).unwrap_or(false), - &tenant_clone, - ) - .await - { - Ok(should_resume_deletion) => should_resume_deletion, - Err(err) => { - make_broken(&tenant_clone, anyhow::anyhow!(err), BrokenVerbosity::Error); - return Ok(()); - } - } - }; - - info!("pending_deletion {}", pending_deletion.is_some()); - - if let Some(deletion) = pending_deletion { - // as we are no longer loading, signal completion by dropping - // the completion while we resume deletion - drop(_completion); - let background_jobs_can_start = - init_order.as_ref().map(|x| &x.background_jobs_can_start); - if let Some(background) = background_jobs_can_start { - info!("waiting for backgound jobs barrier"); - background.clone().wait().await; - info!("ready for backgound jobs barrier"); - } - - let deleted = DeleteTenantFlow::resume_from_attach( - deletion, - &tenant_clone, - preload, - tenants, - &ctx, - ) - .await; - - if let Err(e) = deleted { - make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error); - } - - return Ok(()); - } - // We will time the duration of the attach phase unless this is a creation (attach will do no work) let attached = { let _attach_timer = match mode { @@ -931,21 +876,13 @@ impl Tenant { ) .await?; - let deleting = other_keys.contains(TENANT_DELETED_MARKER_FILE_NAME); - info!( - "found {} timelines, deleting={}", - remote_timeline_ids.len(), - deleting - ); + info!("found {} timelines", remote_timeline_ids.len(),); for k in other_keys { - if k != TENANT_DELETED_MARKER_FILE_NAME { - warn!("Unexpected non timeline key {k}"); - } + warn!("Unexpected non timeline key {k}"); } Ok(TenantPreload { - deleting, timelines: Self::load_timeline_metadata( self, remote_timeline_ids, @@ -974,7 +911,6 @@ impl Tenant { let preload = match (preload, mode) { (Some(p), _) => p, (None, SpawnMode::Create) => TenantPreload { - deleting: false, timelines: HashMap::new(), }, (None, _) => { @@ -2628,7 +2564,6 @@ impl Tenant { cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)), eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()), activate_now_sem: tokio::sync::Semaphore::new(0), - delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())), cancel: CancellationToken::default(), gate: Gate::default(), timeline_get_throttle: Arc::new(throttle::Throttle::new( diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs deleted file mode 100644 index d9da3157b7..0000000000 --- a/pageserver/src/tenant/delete.rs +++ /dev/null @@ -1,426 +0,0 @@ -use std::sync::Arc; - -use anyhow::Context; -use camino::{Utf8Path, Utf8PathBuf}; -use pageserver_api::{models::TenantState, 
shard::TenantShardId}; -use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; -use tokio::sync::OwnedMutexGuard; -use tokio_util::sync::CancellationToken; -use tracing::{error, Instrument}; - -use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId, pausable_failpoint}; - -use crate::{ - config::PageServerConf, - context::RequestContext, - task_mgr::{self}, - tenant::{ - mgr::{TenantSlot, TenantsMapRemoveResult}, - remote_timeline_client::remote_heatmap_path, - }, -}; - -use super::{ - mgr::{GetTenantError, TenantSlotError, TenantSlotUpsertError, TenantsMap}, - remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD}, - timeline::delete::DeleteTimelineFlow, - tree_sort_timelines, DeleteTimelineError, Tenant, TenantPreload, -}; - -#[derive(Debug, thiserror::Error)] -pub(crate) enum DeleteTenantError { - #[error("GetTenant {0}")] - Get(#[from] GetTenantError), - - #[error("Tenant map slot error {0}")] - SlotError(#[from] TenantSlotError), - - #[error("Tenant map slot upsert error {0}")] - SlotUpsertError(#[from] TenantSlotUpsertError), - - #[error("Timeline {0}")] - Timeline(#[from] DeleteTimelineError), - - #[error("Cancelled")] - Cancelled, - - #[error(transparent)] - Other(#[from] anyhow::Error), -} - -type DeletionGuard = tokio::sync::OwnedMutexGuard; - -fn remote_tenant_delete_mark_path( - conf: &PageServerConf, - tenant_shard_id: &TenantShardId, -) -> anyhow::Result { - let tenant_remote_path = conf - .tenant_path(tenant_shard_id) - .strip_prefix(&conf.workdir) - .context("Failed to strip workdir prefix") - .and_then(RemotePath::new) - .context("tenant path")?; - Ok(tenant_remote_path.join(Utf8Path::new("timelines/deleted"))) -} - -async fn schedule_ordered_timeline_deletions( - tenant: &Arc, -) -> Result>, TimelineId)>, DeleteTenantError> { - // Tenant is stopping at this point. We know it will be deleted. - // No new timelines should be created. - // Tree sort timelines to delete from leafs to the root. - // NOTE: by calling clone we release the mutex which creates a possibility for a race: pending deletion - // can complete and remove timeline from the map in between our call to clone - // and `DeleteTimelineFlow::run`, so `run` wont find timeline in `timelines` map. - // timelines.lock is currently synchronous so we cant hold it across await point. - // So just ignore NotFound error if we get it from `run`. - // Beware: in case it becomes async and we try to hold it here, `run` also locks it, which can create a deadlock. - let timelines = tenant.timelines.lock().unwrap().clone(); - let sorted = - tree_sort_timelines(timelines, |t| t.get_ancestor_timeline_id()).context("tree sort")?; - - let mut already_running_deletions = vec![]; - - for (timeline_id, _) in sorted.into_iter().rev() { - let span = tracing::info_span!("timeline_delete", %timeline_id); - let res = DeleteTimelineFlow::run(tenant, timeline_id, true) - .instrument(span) - .await; - if let Err(e) = res { - match e { - DeleteTimelineError::NotFound => { - // Timeline deletion finished after call to clone above but before call - // to `DeleteTimelineFlow::run` and removed timeline from the map. - continue; - } - DeleteTimelineError::AlreadyInProgress(guard) => { - already_running_deletions.push((guard, timeline_id)); - continue; - } - e => return Err(DeleteTenantError::Timeline(e)), - } - } - } - - Ok(already_running_deletions) -} - -async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), DeleteTenantError> { - // Assert timelines dir is empty. 
- if !fs_ext::is_directory_empty(timelines_path).await? { - // Display first 10 items in directory - let list = fs_ext::list_dir(timelines_path).await.context("list_dir")?; - let list = &list.into_iter().take(10).collect::>(); - return Err(DeleteTenantError::Other(anyhow::anyhow!( - "Timelines directory is not empty after all timelines deletion: {list:?}" - ))); - } - - Ok(()) -} - -async fn remove_tenant_remote_delete_mark( - conf: &PageServerConf, - remote_storage: &GenericRemoteStorage, - tenant_shard_id: &TenantShardId, - cancel: &CancellationToken, -) -> Result<(), DeleteTenantError> { - let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?; - backoff::retry( - || async { remote_storage.delete(&path, cancel).await }, - TimeoutOrCancel::caused_by_cancel, - FAILED_UPLOAD_WARN_THRESHOLD, - FAILED_REMOTE_OP_RETRIES, - "remove_tenant_remote_delete_mark", - cancel, - ) - .await - .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) - .and_then(|x| x) - .context("remove_tenant_remote_delete_mark")?; - Ok(()) -} - -// Cleanup fs traces: tenant config, timelines dir local delete mark, tenant dir -async fn cleanup_remaining_fs_traces( - conf: &PageServerConf, - tenant_shard_id: &TenantShardId, -) -> Result<(), DeleteTenantError> { - let rm = |p: Utf8PathBuf, is_dir: bool| async move { - if is_dir { - tokio::fs::remove_dir(&p).await - } else { - tokio::fs::remove_file(&p).await - } - .or_else(fs_ext::ignore_not_found) - .with_context(|| format!("failed to delete {p}")) - }; - - rm(conf.tenant_config_path(tenant_shard_id), false).await?; - rm(conf.tenant_location_config_path(tenant_shard_id), false).await?; - - fail::fail_point!("tenant-delete-before-remove-timelines-dir", |_| { - Err(anyhow::anyhow!( - "failpoint: tenant-delete-before-remove-timelines-dir" - ))? - }); - - rm(conf.timelines_path(tenant_shard_id), true).await?; - - fail::fail_point!("tenant-delete-before-remove-deleted-mark", |_| { - Err(anyhow::anyhow!( - "failpoint: tenant-delete-before-remove-deleted-mark" - ))? - }); - - // Make sure previous deletions are ordered before mark removal. - // Otherwise there is no guarantee that they reach the disk before mark deletion. - // So its possible for mark to reach disk first and for other deletions - // to be reordered later and thus missed if a crash occurs. - // Note that we dont need to sync after mark file is removed - // because we can tolerate the case when mark file reappears on startup. - let tenant_path = &conf.tenant_path(tenant_shard_id); - if tenant_path.exists() { - crashsafe::fsync_async(&conf.tenant_path(tenant_shard_id)) - .await - .context("fsync_pre_mark_remove")?; - } - - rm(conf.tenant_deleted_mark_file_path(tenant_shard_id), false).await?; - - rm(conf.tenant_heatmap_path(tenant_shard_id), false).await?; - - fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| { - Err(anyhow::anyhow!( - "failpoint: tenant-delete-before-remove-tenant-dir" - ))? 
- }); - - rm(conf.tenant_path(tenant_shard_id), true).await?; - - Ok(()) -} - -#[derive(Default)] -pub enum DeleteTenantFlow { - #[default] - NotStarted, - InProgress, - Finished, -} - -impl DeleteTenantFlow { - pub(crate) async fn should_resume_deletion( - conf: &'static PageServerConf, - remote_mark_exists: bool, - tenant: &Tenant, - ) -> Result, DeleteTenantError> { - let acquire = |t: &Tenant| { - Some( - Arc::clone(&t.delete_progress) - .try_lock_owned() - .expect("we're the only owner during init"), - ) - }; - - if remote_mark_exists { - return Ok(acquire(tenant)); - } - - // Check local mark first, if its there there is no need to go to s3 to check whether remote one exists. - if conf - .tenant_deleted_mark_file_path(&tenant.tenant_shard_id) - .exists() - { - Ok(acquire(tenant)) - } else { - Ok(None) - } - } - - pub(crate) async fn resume_from_attach( - guard: DeletionGuard, - tenant: &Arc, - preload: Option, - tenants: &'static std::sync::RwLock, - ctx: &RequestContext, - ) -> Result<(), DeleteTenantError> { - let (_, progress) = completion::channel(); - - tenant - .set_stopping(progress, false, true) - .await - .expect("cant be stopping or broken"); - - tenant - .attach(preload, super::SpawnMode::Eager, ctx) - .await - .context("attach")?; - - Self::background( - guard, - tenant.conf, - tenant.remote_storage.clone(), - tenants, - tenant, - ) - .await - } - - async fn background( - mut guard: OwnedMutexGuard, - conf: &PageServerConf, - remote_storage: GenericRemoteStorage, - tenants: &'static std::sync::RwLock, - tenant: &Arc, - ) -> Result<(), DeleteTenantError> { - // Tree sort timelines, schedule delete for them. Mention retries from the console side. - // Note that if deletion fails we dont mark timelines as broken, - // the whole tenant will become broken as by `Self::schedule_background` logic - let already_running_timeline_deletions = schedule_ordered_timeline_deletions(tenant) - .await - .context("schedule_ordered_timeline_deletions")?; - - fail::fail_point!("tenant-delete-before-polling-ongoing-deletions", |_| { - Err(anyhow::anyhow!( - "failpoint: tenant-delete-before-polling-ongoing-deletions" - ))? - }); - - // Wait for deletions that were already running at the moment when tenant deletion was requested. - // When we can lock deletion guard it means that corresponding timeline deletion finished. 
- for (guard, timeline_id) in already_running_timeline_deletions { - let flow = guard.lock().await; - if !flow.is_finished() { - return Err(DeleteTenantError::Other(anyhow::anyhow!( - "already running timeline deletion failed: {timeline_id}" - ))); - } - } - - // Remove top-level tenant objects that don't belong to a timeline, such as heatmap - let heatmap_path = remote_heatmap_path(&tenant.tenant_shard_id()); - if let Some(Err(e)) = backoff::retry( - || async { - remote_storage - .delete(&heatmap_path, &task_mgr::shutdown_token()) - .await - }, - TimeoutOrCancel::caused_by_cancel, - FAILED_UPLOAD_WARN_THRESHOLD, - FAILED_REMOTE_OP_RETRIES, - "remove_remote_tenant_heatmap", - &task_mgr::shutdown_token(), - ) - .await - { - tracing::warn!("Failed to delete heatmap at {heatmap_path}: {e}"); - } - - let timelines_path = conf.timelines_path(&tenant.tenant_shard_id); - // May not exist if we fail in cleanup_remaining_fs_traces after removing it - if timelines_path.exists() { - // sanity check to guard against layout changes - ensure_timelines_dir_empty(&timelines_path) - .await - .context("timelines dir not empty")?; - } - - remove_tenant_remote_delete_mark( - conf, - &remote_storage, - &tenant.tenant_shard_id, - &task_mgr::shutdown_token(), - ) - .await?; - - pausable_failpoint!("tenant-delete-before-cleanup-remaining-fs-traces-pausable"); - fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| { - Err(anyhow::anyhow!( - "failpoint: tenant-delete-before-cleanup-remaining-fs-traces" - ))? - }); - - cleanup_remaining_fs_traces(conf, &tenant.tenant_shard_id) - .await - .context("cleanup_remaining_fs_traces")?; - - { - // This block is simply removing the TenantSlot for this tenant. It requires a loop because - // we might conflict with a TenantSlot::InProgress marker and need to wait for it. - // - // This complexity will go away when we simplify how deletion works: - // https://github.com/neondatabase/neon/issues/5080 - loop { - // Under the TenantMap lock, try to remove the tenant. We usually succeed, but if - // we encounter an InProgress marker, yield the barrier it contains and wait on it. - let barrier = { - let mut locked = tenants.write().unwrap(); - let removed = locked.remove(tenant.tenant_shard_id); - - // FIXME: we should not be modifying this from outside of mgr.rs. - // This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080) - - // Update stats - match &removed { - TenantsMapRemoveResult::Occupied(slot) => { - crate::metrics::TENANT_MANAGER.slot_removed(slot); - } - TenantsMapRemoveResult::InProgress(barrier) => { - crate::metrics::TENANT_MANAGER - .slot_removed(&TenantSlot::InProgress(barrier.clone())); - } - TenantsMapRemoveResult::Vacant => { - // Nothing changed in map, no metric update - } - } - - match removed { - TenantsMapRemoveResult::Occupied(TenantSlot::Attached(tenant)) => { - match tenant.current_state() { - TenantState::Stopping { .. } | TenantState::Broken { .. } => { - // Expected: we put the tenant into stopping state before we start deleting it - } - state => { - // Unexpected state - tracing::warn!( - "Tenant in unexpected state {state} after deletion" - ); - } - } - break; - } - TenantsMapRemoveResult::Occupied(TenantSlot::Secondary(_)) => { - // This is unexpected: this secondary tenants should not have been created, and we - // are not in a position to shut it down from here. 
- tracing::warn!("Tenant transitioned to secondary mode while deleting!"); - break; - } - TenantsMapRemoveResult::Occupied(TenantSlot::InProgress(_)) => { - unreachable!("TenantsMap::remove handles InProgress separately, should never return it here"); - } - TenantsMapRemoveResult::Vacant => { - tracing::warn!( - "Tenant removed from TenantsMap before deletion completed" - ); - break; - } - TenantsMapRemoveResult::InProgress(barrier) => { - // An InProgress entry was found, we must wait on its barrier - barrier - } - } - }; - - tracing::info!( - "Waiting for competing operation to complete before deleting state for tenant" - ); - barrier.wait().await; - } - } - - *guard = Self::Finished; - - Ok(()) - } -} diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 326086a3cc..4fcdf14052 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -51,7 +51,6 @@ use utils::fs_ext::PathExt; use utils::generation::Generation; use utils::id::{TenantId, TimelineId}; -use super::delete::DeleteTenantError; use super::remote_timeline_client::remote_tenant_path; use super::secondary::SecondaryTenant; use super::timeline::detach_ancestor::PreparedTimelineDetach; @@ -109,12 +108,6 @@ pub(crate) enum TenantsMap { ShuttingDown(BTreeMap), } -pub(crate) enum TenantsMapRemoveResult { - Occupied(TenantSlot), - Vacant, - InProgress(utils::completion::Barrier), -} - /// When resolving a TenantId to a shard, we may be looking for the 0th /// shard, or we might be looking for whichever shard holds a particular page. #[derive(Copy, Clone)] @@ -191,26 +184,6 @@ impl TenantsMap { } } - /// Only for use from DeleteTenantFlow. This method directly removes a TenantSlot from the map. - /// - /// The normal way to remove a tenant is using a SlotGuard, which will gracefully remove the guarded - /// slot if the enclosed tenant is shutdown. - pub(crate) fn remove(&mut self, tenant_shard_id: TenantShardId) -> TenantsMapRemoveResult { - use std::collections::btree_map::Entry; - match self { - TenantsMap::Initializing => TenantsMapRemoveResult::Vacant, - TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => match m.entry(tenant_shard_id) { - Entry::Occupied(entry) => match entry.get() { - TenantSlot::InProgress(barrier) => { - TenantsMapRemoveResult::InProgress(barrier.clone()) - } - _ => TenantsMapRemoveResult::Occupied(entry.remove()), - }, - Entry::Vacant(_entry) => TenantsMapRemoveResult::Vacant, - }, - } - } - #[cfg(all(debug_assertions, not(test)))] pub(crate) fn len(&self) -> usize { match self { @@ -460,6 +433,18 @@ async fn init_load_tenant_configs( Ok(configs) } +#[derive(Debug, thiserror::Error)] +pub(crate) enum DeleteTenantError { + #[error("Tenant map slot error {0}")] + SlotError(#[from] TenantSlotError), + + #[error("Cancelled")] + Cancelled, + + #[error(transparent)] + Other(#[from] anyhow::Error), +} + /// Initialize repositories with locally available timelines. /// Timelines that are only partially available locally (remote storage has more data than this pageserver) /// are scheduled for download and added to the tenant once download is completed. 
@@ -629,7 +614,6 @@ pub async fn init_tenant_mgr( AttachedTenantConf::new(location_conf.tenant_conf, attached_conf), shard_identity, Some(init_order.clone()), - &TENANTS, SpawnMode::Lazy, &ctx, ) { @@ -685,7 +669,6 @@ fn tenant_spawn( location_conf: AttachedTenantConf, shard_identity: ShardIdentity, init_order: Option, - tenants: &'static std::sync::RwLock, mode: SpawnMode, ctx: &RequestContext, ) -> anyhow::Result> { @@ -712,7 +695,6 @@ fn tenant_spawn( location_conf, shard_identity, init_order, - tenants, mode, ctx, ) { @@ -1161,7 +1143,6 @@ impl TenantManager { attached_conf, shard_identity, None, - self.tenants, spawn_mode, ctx, )?; @@ -1283,7 +1264,6 @@ impl TenantManager { AttachedTenantConf::try_from(config)?, shard_identity, None, - self.tenants, SpawnMode::Eager, ctx, )?; @@ -1634,7 +1614,7 @@ impl TenantManager { for child_shard_id in &child_shards { let child_shard_id = *child_shard_id; let child_shard = { - let locked = TENANTS.read().unwrap(); + let locked = self.tenants.read().unwrap(); let peek_slot = tenant_map_peek_slot(&locked, &child_shard_id, TenantSlotPeekMode::Read)?; peek_slot.and_then(|s| s.get_attached()).cloned() @@ -1866,7 +1846,7 @@ impl TenantManager { deletion_queue_client: &DeletionQueueClient, ) -> Result<(), TenantStateError> { let tmp_path = self - .detach_tenant0(conf, &TENANTS, tenant_shard_id, deletion_queue_client) + .detach_tenant0(conf, tenant_shard_id, deletion_queue_client) .await?; spawn_background_purge(tmp_path); @@ -1876,7 +1856,6 @@ impl TenantManager { async fn detach_tenant0( &self, conf: &'static PageServerConf, - tenants: &std::sync::RwLock, tenant_shard_id: TenantShardId, deletion_queue_client: &DeletionQueueClient, ) -> Result { @@ -1890,7 +1869,7 @@ impl TenantManager { }; let removal_result = remove_tenant_from_memory( - tenants, + self.tenants, tenant_shard_id, tenant_dir_rename_operation(tenant_shard_id), ) @@ -1906,7 +1885,7 @@ impl TenantManager { pub(crate) fn list_tenants( &self, ) -> Result, TenantMapListError> { - let tenants = TENANTS.read().unwrap(); + let tenants = self.tenants.read().unwrap(); let m = match &*tenants { TenantsMap::Initializing => return Err(TenantMapListError::Initializing), TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m, @@ -2007,7 +1986,6 @@ impl TenantManager { AttachedTenantConf::try_from(config)?, shard_identity, None, - self.tenants, SpawnMode::Eager, ctx, )?; diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 441298f3e9..6d747d424d 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -255,7 +255,6 @@ impl DeleteTimelineFlow { } /// Shortcut to create Timeline in stopping state and spawn deletion task. - /// See corresponding parts of [`crate::tenant::delete::DeleteTenantFlow`] #[instrument(skip_all, fields(%timeline_id))] pub async fn resume_deletion( tenant: Arc, @@ -420,10 +419,6 @@ impl DeleteTimelineFlow { Ok(()) } - pub(crate) fn is_finished(&self) -> bool { - matches!(self, Self::Finished) - } - pub(crate) fn is_not_started(&self) -> bool { matches!(self, Self::NotStarted) } From de05f90735b3b54b6fa99b0b42817d03310ebf87 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 24 Jun 2024 11:53:43 +0100 Subject: [PATCH 1031/1571] pageserver: add more info-level logging in shard splits (#8137) ## Problem `test_sharding_autosplit` is occasionally failing on warnings about shard splits taking longer than expected (`Exclusive lock by ShardSplit was held for`...) 
It's not obvious which part is taking the time (I suspect remote storage uploads). Example: https://neon-github-public-dev.s3.amazonaws.com/reports/main/9618788427/index.html#testresult/b395294d5bdeb783/ ## Summary of changes - Since shard splits are infrequent events, we can afford to be very chatty: add a bunch of info-level logging throughout the process. --- pageserver/src/tenant.rs | 4 ++++ pageserver/src/tenant/mgr.rs | 12 ++++++++++++ 2 files changed, 16 insertions(+) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 6a748f61e7..76dc52fa16 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2151,6 +2151,7 @@ impl Tenant { // Upload an index from the parent: this is partly to provide freshness for the // child tenants that will copy it, and partly for general ease-of-debugging: there will // always be a parent shard index in the same generation as we wrote the child shard index. + tracing::info!(timeline_id=%timeline.timeline_id, "Uploading index"); timeline .remote_client .schedule_index_upload_for_file_changes()?; @@ -2158,12 +2159,14 @@ impl Tenant { // Shut down the timeline's remote client: this means that the indices we write // for child shards will not be invalidated by the parent shard deleting layers. + tracing::info!(timeline_id=%timeline.timeline_id, "Shutting down remote storage client"); timeline.remote_client.shutdown().await; // Download methods can still be used after shutdown, as they don't flow through the remote client's // queue. In principal the RemoteTimelineClient could provide this without downloading it, but this // operation is rare, so it's simpler to just download it (and robustly guarantees that the index // we use here really is the remotely persistent one). + tracing::info!(timeline_id=%timeline.timeline_id, "Downloading index_part from parent"); let result = timeline.remote_client .download_index_file(&self.cancel) .instrument(info_span!("download_index_file", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id)) @@ -2176,6 +2179,7 @@ impl Tenant { }; for child_shard in child_shards { + tracing::info!(timeline_id=%timeline.timeline_id, "Uploading index_part for child {}", child_shard.to_index()); upload_index_part( &self.remote_storage, child_shard, diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 4fcdf14052..1bc21d8b78 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1715,6 +1715,7 @@ impl TenantManager { let timelines = parent_shard.timelines.lock().unwrap().clone(); let parent_timelines = timelines.keys().cloned().collect::>(); for timeline in timelines.values() { + tracing::info!(timeline_id=%timeline.timeline_id, "Loading list of layers to hardlink"); let timeline_layers = timeline .layers .read() @@ -1754,7 +1755,12 @@ impl TenantManager { // Since we will do a large number of small filesystem metadata operations, batch them into // spawn_blocking calls rather than doing each one as a tokio::fs round-trip. + let span = tracing::Span::current(); let jh = tokio::task::spawn_blocking(move || -> anyhow::Result { + // Run this synchronous code in the same log context as the outer function that spawned it. 
+ let _span = span.enter(); + + tracing::info!("Creating {} directories", create_dirs.len()); for dir in &create_dirs { if let Err(e) = std::fs::create_dir_all(dir) { // Ignore AlreadyExists errors, drop out on all other errors @@ -1768,6 +1774,11 @@ impl TenantManager { } for child_prefix in child_prefixes { + tracing::info!( + "Hard-linking {} parent layers into child path {}", + parent_layers.len(), + child_prefix + ); for relative_layer in &parent_layers { let parent_path = parent_path.join(relative_layer); let child_path = child_prefix.join(relative_layer); @@ -1793,6 +1804,7 @@ impl TenantManager { // Durability is not required for correctness, but if we crashed during split and // then came restarted with empty timeline dirs, it would be very inefficient to // re-populate from remote storage. + tracing::info!("fsyncing {} directories", create_dirs.len()); for dir in create_dirs { if let Err(e) = crashsafe::fsync(&dir) { // Something removed a newly created timeline dir out from underneath us? Extremely From 47fdf93cf0d8c60434d1501a6047830b49d2f4b2 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 24 Jun 2024 14:54:54 +0100 Subject: [PATCH 1032/1571] tests: fix a flake in test_sharding_split_compaction (#8136) ## Problem This test could occasionally trigger a "removing local file ... because it has unexpected length log" when using the `compact-shard-ancestors-persistent` failpoint is in use, which is unexpected because that failpoint stops the process when the remote metadata is in sync with local files. It was because there are two shards on the same pageserver, and while the one being compacted explicitly stops at the failpoint, another shard was compacting in the background and failing at an unclean point. The test intends to disable background compaction, but was mistakenly revoking the value of `compaction_period` when it updated `pitr_interval`. Example failure: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8123/9602976462/index.html#/testresult/7dd6165da7daef40 ## Summary of changes - Update `TENANT_CONF` in the test to use properly typed values, so that it is usable in pageserver APIs as well as via neon_local. - When updating tenant config with `pitr_interval`, retain the overrides from the start of the test, so that there won't be any background compaction going on during the test. --- test_runner/regress/test_sharding.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 56075c5975..62a9f422ee 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -190,19 +190,20 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder, failpoint: """ Test that after a split, we clean up parent layer data in the child shards via compaction. """ + TENANT_CONF = { # small checkpointing and compaction targets to ensure we generate many upload operations - "checkpoint_distance": f"{128 * 1024}", - "compaction_threshold": "1", - "compaction_target_size": f"{128 * 1024}", + "checkpoint_distance": 128 * 1024, + "compaction_threshold": 1, + "compaction_target_size": 128 * 1024, # no PITR horizon, we specify the horizon when we request on-demand GC "pitr_interval": "3600s", # disable background compaction and GC. We invoke it manually when we want it to happen. 
"gc_period": "0s", "compaction_period": "0s", # create image layers eagerly, so that GC can remove some layers - "image_creation_threshold": "1", - "image_layer_creation_check_threshold": "0", + "image_creation_threshold": 1, + "image_layer_creation_check_threshold": 0, } neon_env_builder.storage_controller_config = { @@ -261,7 +262,9 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder, failpoint: env.pageserver.start() # Cleanup part 2: once layers are outside the PITR window, they will be rewritten if they are partially redundant - env.storage_controller.pageserver_api().set_tenant_config(tenant_id, {"pitr_interval": "0s"}) + updated_conf = TENANT_CONF.copy() + updated_conf["pitr_interval"] = "0s" + env.storage_controller.pageserver_api().set_tenant_config(tenant_id, updated_conf) env.storage_controller.reconcile_until_idle() for shard in shards: From a4db2af1f0667514ee5cbcb545a2f131b1b3538e Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Mon, 24 Jun 2024 15:07:59 +0100 Subject: [PATCH 1033/1571] Truncate waltmp file on creation (#8133) Previously in safekeeper code, new segment file was opened without truncate option. I don't think there is a reason to do it, this commit replaces it with `File::create` to make it simpler and remove `clippy::suspicious_open_options` linter warning. --- safekeeper/src/wal_storage.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 0c1731937c..2aead70ffd 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -231,11 +231,7 @@ impl PhysicalStorage { // half initialized segment, first bake it under tmp filename and // then rename. let tmp_path = self.timeline_dir.join("waltmp"); - #[allow(clippy::suspicious_open_options)] - let mut file = OpenOptions::new() - .create(true) - .write(true) - .open(&tmp_path) + let mut file = File::create(&tmp_path) .await .with_context(|| format!("Failed to open tmp wal file {:?}", &tmp_path))?; From d8ffe662a9bb9eb7a7a4c1ae0cc2b9837072a487 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Mon, 24 Jun 2024 11:31:06 -0400 Subject: [PATCH 1034/1571] fix(pageserver): handle version number in draw timeline (#8102) We now have a `vX` number in the file name, i.e., `000000067F0000000400000B150100000000-000000067F0000000400000D350100000000__00000000014B7AC8-v1-00000001` The related pull request for new-style path was merged a month ago https://github.com/neondatabase/neon/pull/7660 ## Summary of changes Fixed the draw timeline dir command to handle it. 
--------- Signed-off-by: Alex Chi Z --- pageserver/ctl/src/draw_timeline_dir.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pageserver/ctl/src/draw_timeline_dir.rs b/pageserver/ctl/src/draw_timeline_dir.rs index 389519c65a..bc939f9688 100644 --- a/pageserver/ctl/src/draw_timeline_dir.rs +++ b/pageserver/ctl/src/draw_timeline_dir.rs @@ -83,10 +83,18 @@ fn parse_filename(name: &str) -> (Range, Range) { let keys: Vec<&str> = split[0].split('-').collect(); let mut lsns: Vec<&str> = split[1].split('-').collect(); + // The current format of the layer file name: 000000067F0000000400000B150100000000-000000067F0000000400000D350100000000__00000000014B7AC8-v1-00000001 + + // Handle generation number `-00000001` part if lsns.last().expect("should").len() == 8 { lsns.pop(); } + // Handle version number `-v1` part + if lsns.last().expect("should").starts_with('v') { + lsns.pop(); + } + if lsns.len() == 1 { lsns.push(lsns[0]); } From 9211de0df7cec5910566189c99ee2131462eda16 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Mon, 24 Jun 2024 11:50:31 -0400 Subject: [PATCH 1035/1571] test(pageserver): add delta records tests for gc-compaction (#8078) Part of https://github.com/neondatabase/neon/issues/8002 This pull request adds tests for bottom-most gc-compaction with delta records. Also fixed a bug in the compaction process that creates overlapping delta layers by force splitting at the original delta layer boundary. --------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant.rs | 234 +++++++++++++++++-- pageserver/src/tenant/timeline/compaction.rs | 154 +++++++++--- 2 files changed, 339 insertions(+), 49 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 76dc52fa16..62f066862a 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -4007,6 +4007,7 @@ mod tests { use storage_layer::PersistentLayerKey; use tests::storage_layer::ValuesReconstructState; use tests::timeline::{GetVectoredError, ShutdownMode}; + use timeline::GcInfo; use utils::bin_ser::BeSer; use utils::id::TenantId; @@ -6684,49 +6685,48 @@ mod tests { // img layer at 0x10 let img_layer = (0..10) - .map(|id| (get_key(id), test_img(&format!("value {id}@0x10")))) + .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10")))) .collect_vec(); let delta1 = vec![ - // TODO: we should test a real delta record here, which requires us to add a variant of NeonWalRecord for testing purpose. 
( get_key(1), Lsn(0x20), - Value::Image(test_img("value 1@0x20")), + Value::Image(Bytes::from("value 1@0x20")), ), ( get_key(2), Lsn(0x30), - Value::Image(test_img("value 2@0x30")), + Value::Image(Bytes::from("value 2@0x30")), ), ( get_key(3), Lsn(0x40), - Value::Image(test_img("value 3@0x40")), + Value::Image(Bytes::from("value 3@0x40")), ), ]; let delta2 = vec![ ( get_key(5), Lsn(0x20), - Value::Image(test_img("value 5@0x20")), + Value::Image(Bytes::from("value 5@0x20")), ), ( get_key(6), Lsn(0x20), - Value::Image(test_img("value 6@0x20")), + Value::Image(Bytes::from("value 6@0x20")), ), ]; let delta3 = vec![ ( get_key(8), Lsn(0x40), - Value::Image(test_img("value 8@0x40")), + Value::Image(Bytes::from("value 8@0x40")), ), ( get_key(9), Lsn(0x40), - Value::Image(test_img("value 9@0x40")), + Value::Image(Bytes::from("value 9@0x40")), ), ]; @@ -6748,9 +6748,42 @@ mod tests { guard.cutoffs.horizon = Lsn(0x30); } + let expected_result = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x20"), + Bytes::from_static(b"value 2@0x30"), + Bytes::from_static(b"value 3@0x40"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x20"), + Bytes::from_static(b"value 6@0x20"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x40"), + Bytes::from_static(b"value 9@0x40"), + ]; + + for (idx, expected) in expected_result.iter().enumerate() { + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x50), &ctx) + .await + .unwrap(), + expected + ); + } + let cancel = CancellationToken::new(); tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + for (idx, expected) in expected_result.iter().enumerate() { + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x50), &ctx) + .await + .unwrap(), + expected + ); + } + // Check if the image layer at the GC horizon contains exactly what we want let image_at_gc_horizon = tline .inspect_image_layers(Lsn(0x30), &ctx) @@ -6761,14 +6794,22 @@ mod tests { .collect::>(); assert_eq!(image_at_gc_horizon.len(), 10); - let expected_lsn = [0x10, 0x20, 0x30, 0x10, 0x10, 0x20, 0x20, 0x10, 0x10, 0x10]; + let expected_result = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x20"), + Bytes::from_static(b"value 2@0x30"), + Bytes::from_static(b"value 3@0x10"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x20"), + Bytes::from_static(b"value 6@0x20"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; for idx in 0..10 { assert_eq!( image_at_gc_horizon[idx], - ( - get_key(idx as u32), - test_img(&format!("value {idx}@{:#x}", expected_lsn[idx])) - ) + (get_key(idx as u32), expected_result[idx].clone()) ); } @@ -6801,7 +6842,7 @@ mod tests { }, // The delta layer that is cut in the middle PersistentLayerKey { - key_range: Key::MIN..get_key(9), + key_range: get_key(3)..get_key(4), lsn_range: Lsn(0x30)..Lsn(0x41), is_delta: true }, @@ -6886,6 +6927,9 @@ mod tests { tline.get(get_key(2), Lsn(0x50), &ctx).await?, Bytes::from_static(b"0x10,0x20,0x30") ); + + // Need to remove the limit of "Neon WAL redo requires base image". 
+ // assert_eq!(tline.get(get_key(3), Lsn(0x50), &ctx).await?, Bytes::new()); // assert_eq!(tline.get(get_key(4), Lsn(0x50), &ctx).await?, Bytes::new()); @@ -6980,4 +7024,164 @@ mod tests { Ok(()) } + + #[tokio::test] + async fn test_simple_bottom_most_compaction_deltas() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_simple_bottom_most_compaction_deltas")?; + let (tenant, ctx) = harness.load().await; + + fn get_key(id: u32) -> Key { + // using aux key here b/c they are guaranteed to be inside `collect_keyspace`. + let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + // We create one bottom-most image layer, a delta layer D1 crossing the GC horizon, D2 below the horizon, and D3 above the horizon. + // + // | D1 | | D3 | + // -| |-- gc horizon ----------------- + // | | | D2 | + // --------- img layer ------------------ + // + // What we should expact from this compaction is: + // | Part of D1 | | D3 | + // --------- img layer with D1+D2 at GC horizon------------------ + + // img layer at 0x10 + let img_layer = (0..10) + .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10")))) + .collect_vec(); + + let delta1 = vec![ + ( + get_key(1), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ( + get_key(2), + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append("@0x30")), + ), + ( + get_key(3), + Lsn(0x40), + Value::WalRecord(NeonWalRecord::wal_append("@0x40")), + ), + ]; + let delta2 = vec![ + ( + get_key(5), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ( + get_key(6), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ]; + let delta3 = vec![ + ( + get_key(8), + Lsn(0x40), + Value::WalRecord(NeonWalRecord::wal_append("@0x40")), + ), + ( + get_key(9), + Lsn(0x40), + Value::WalRecord(NeonWalRecord::wal_append("@0x40")), + ), + ]; + + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + vec![delta1, delta2, delta3], // delta layers + vec![(Lsn(0x10), img_layer)], // image layers + Lsn(0x50), + ) + .await?; + { + // Update GC info + let mut guard = tline.gc_info.write().unwrap(); + *guard = GcInfo { + retain_lsns: vec![], + cutoffs: GcCutoffs { + pitr: Lsn(0x30), + horizon: Lsn(0x30), + }, + leases: Default::default(), + }; + } + + let expected_result = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20"), + Bytes::from_static(b"value 2@0x10@0x30"), + Bytes::from_static(b"value 3@0x10@0x40"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10@0x20"), + Bytes::from_static(b"value 6@0x10@0x20"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10@0x40"), + Bytes::from_static(b"value 9@0x10@0x40"), + ]; + + let expected_result_at_gc_horizon = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20"), + Bytes::from_static(b"value 2@0x10@0x30"), + Bytes::from_static(b"value 3@0x10"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10@0x20"), + Bytes::from_static(b"value 6@0x10@0x20"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + + for idx in 0..10 { + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x50), &ctx) + .await + .unwrap(), + &expected_result[idx] + ); + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x30), &ctx) + .await + .unwrap(), + 
&expected_result_at_gc_horizon[idx] + ); + } + + let cancel = CancellationToken::new(); + tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + + for idx in 0..10 { + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x50), &ctx) + .await + .unwrap(), + &expected_result[idx] + ); + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x30), &ctx) + .await + .unwrap(), + &expected_result_at_gc_horizon[idx] + ); + } + + Ok(()) + } } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 8a95029f33..de1263fadf 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -965,6 +965,8 @@ impl Timeline { _cancel: &CancellationToken, ctx: &RequestContext, ) -> Result<(), CompactionError> { + use std::collections::BTreeSet; + use crate::tenant::storage_layer::ValueReconstructState; // Step 0: pick all delta layers + image layers below/intersect with the GC horizon. // The layer selection has the following properties: @@ -986,20 +988,30 @@ impl Timeline { (selected_layers, gc_cutoff) }; // Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs. + // Also, collect the layer information to decide when to split the new delta layers. let mut all_key_values = Vec::new(); + let mut delta_split_points = BTreeSet::new(); for layer in &layer_selection { all_key_values.extend(layer.load_key_values(ctx).await?); + let desc = layer.layer_desc(); + if desc.is_delta() { + // TODO: is it correct to only record split points for deltas intersecting with the GC horizon? (exclude those below/above the horizon) + // so that we can avoid having too many small delta layers. + let key_range = desc.get_key_range(); + delta_split_points.insert(key_range.start); + delta_split_points.insert(key_range.end); + } } // Key small to large, LSN low to high, if the same LSN has both image and delta due to the merge of delta layers and - // image layers, make image appear later than delta. + // image layers, make image appear before than delta. struct ValueWrapper<'a>(&'a crate::repository::Value); impl Ord for ValueWrapper<'_> { fn cmp(&self, other: &Self) -> std::cmp::Ordering { use crate::repository::Value; use std::cmp::Ordering; match (self.0, other.0) { - (Value::Image(_), Value::WalRecord(_)) => Ordering::Greater, - (Value::WalRecord(_), Value::Image(_)) => Ordering::Less, + (Value::Image(_), Value::WalRecord(_)) => Ordering::Less, + (Value::WalRecord(_), Value::Image(_)) => Ordering::Greater, _ => Ordering::Equal, } } @@ -1018,13 +1030,6 @@ impl Timeline { all_key_values.sort_by(|(k1, l1, v1), (k2, l2, v2)| { (k1, l1, ValueWrapper(v1)).cmp(&(k2, l2, ValueWrapper(v2))) }); - let max_lsn = all_key_values - .iter() - .map(|(_, lsn, _)| lsn) - .max() - .copied() - .unwrap() - + 1; // Step 2: Produce images+deltas. TODO: ensure newly-produced delta does not overlap with other deltas. // Data of the same key. let mut accumulated_values = Vec::new(); @@ -1043,7 +1048,19 @@ impl Timeline { // We have a list of deltas/images. We want to create image layers while collect garbages. for (key, lsn, val) in accumulated_values.iter().rev() { if *lsn > horizon { - keys_above_horizon.push((*key, *lsn, val.clone())); // TODO: ensure one LSN corresponds to either delta or image instead of both + if let Some((_, prev_lsn, _)) = keys_above_horizon.last_mut() { + if *prev_lsn == *lsn { + // The case that we have an LSN with both data from the delta layer and the image layer. 
As + // `ValueWrapper` ensures that an image is ordered before a delta at the same LSN, we simply + // drop this delta and keep the image. + // + // For example, we have delta layer key1@0x10, key1@0x20, and image layer key1@0x10, we will + // keep the image for key1@0x10 and the delta for key1@0x20. key1@0x10 delta will be simply + // dropped. + continue; + } + } + keys_above_horizon.push((*key, *lsn, val.clone())); } else if *lsn <= horizon { match val { crate::repository::Value::Image(image) => { @@ -1068,15 +1085,59 @@ impl Timeline { Ok((keys_above_horizon, img)) } - let mut delta_layer_writer = DeltaLayerWriter::new( - self.conf, - self.timeline_id, - self.tenant_shard_id, - all_key_values.first().unwrap().0, - gc_cutoff..max_lsn, // TODO: off by one? - ctx, - ) - .await?; + async fn flush_deltas( + deltas: &mut Vec<(Key, Lsn, crate::repository::Value)>, + last_key: Key, + delta_split_points: &[Key], + current_delta_split_point: &mut usize, + tline: &Arc, + gc_cutoff: Lsn, + ctx: &RequestContext, + ) -> anyhow::Result> { + // Check if we need to split the delta layer. We split at the original delta layer boundary to avoid + // overlapping layers. + // + // If we have a structure like this: + // + // | Delta 1 | | Delta 4 | + // |---------| Delta 2 |---------| + // | Delta 3 | | Delta 5 | + // + // And we choose to compact delta 2+3+5. We will get an overlapping delta layer with delta 1+4. + // A simple solution here is to split the delta layers using the original boundary, while this + // might produce a lot of small layers. This should be improved and fixed in the future. + let mut need_split = false; + while *current_delta_split_point < delta_split_points.len() + && last_key >= delta_split_points[*current_delta_split_point] + { + *current_delta_split_point += 1; + need_split = true; + } + if !need_split { + return Ok(None); + } + let deltas = std::mem::take(deltas); + if deltas.is_empty() { + return Ok(None); + } + let end_lsn = deltas.iter().map(|(_, lsn, _)| lsn).max().copied().unwrap() + 1; + let mut delta_layer_writer = DeltaLayerWriter::new( + tline.conf, + tline.timeline_id, + tline.tenant_shard_id, + deltas.first().unwrap().0, + gc_cutoff..end_lsn, + ctx, + ) + .await?; + let key_end = deltas.last().unwrap().0.next(); + for (key, lsn, val) in deltas { + delta_layer_writer.put_value(key, lsn, val, ctx).await?; + } + let delta_layer = delta_layer_writer.finish(key_end, tline, ctx).await?; + Ok(Some(delta_layer)) + } + let mut image_layer_writer = ImageLayerWriter::new( self.conf, self.timeline_id, @@ -1087,6 +1148,10 @@ impl Timeline { ) .await?; + let mut delta_values = Vec::new(); + let delta_split_points = delta_split_points.into_iter().collect_vec(); + let mut current_delta_split_point = 0; + let mut delta_layers = Vec::new(); for item @ (key, _, _) in &all_key_values { if &last_key == key { accumulated_values.push(item); @@ -1094,33 +1159,54 @@ impl Timeline { let (deltas, image) = flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff) .await?; + // Put the image into the image layer. Currently we have a single big layer for the compaction. 
image_layer_writer.put_image(last_key, image, ctx).await?; - for (key, lsn, val) in deltas { - delta_layer_writer.put_value(key, lsn, val, ctx).await?; - } + delta_values.extend(deltas); + delta_layers.extend( + flush_deltas( + &mut delta_values, + last_key, + &delta_split_points, + &mut current_delta_split_point, + self, + gc_cutoff, + ctx, + ) + .await?, + ); accumulated_values.clear(); accumulated_values.push(item); last_key = *key; } } + + // TODO: move this part to the loop body let (deltas, image) = flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff).await?; + // Put the image into the image layer. Currently we have a single big layer for the compaction. image_layer_writer.put_image(last_key, image, ctx).await?; - for (key, lsn, val) in deltas { - delta_layer_writer.put_value(key, lsn, val, ctx).await?; - } - accumulated_values.clear(); - // TODO: split layers - let delta_layer = delta_layer_writer.finish(last_key, self, ctx).await?; + delta_values.extend(deltas); + delta_layers.extend( + flush_deltas( + &mut delta_values, + last_key, + &delta_split_points, + &mut current_delta_split_point, + self, + gc_cutoff, + ctx, + ) + .await?, + ); + let image_layer = image_layer_writer.finish(self, ctx).await?; + let mut compact_to = Vec::new(); + compact_to.extend(delta_layers); + compact_to.push(image_layer); // Step 3: Place back to the layer map. { let mut guard = self.layers.write().await; - guard.finish_gc_compaction( - &layer_selection, - &[delta_layer.clone(), image_layer.clone()], - &self.metrics, - ) + guard.finish_gc_compaction(&layer_selection, &compact_to, &self.metrics) }; Ok(()) } From 3d760938e12e463343cb97e2dd0e2a916c4f3943 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 24 Jun 2024 17:57:16 +0100 Subject: [PATCH 1036/1571] storcon_cli: remove old tenant-scatter command (#8127) ## Problem This command was used in the very early days of sharding, before the storage controller had anti-affinity + scheduling optimization to spread out shards. ## Summary of changes - Remove `storcon_cli tenant-scatter` --- control_plane/storcon_cli/src/main.rs | 92 +-------------------------- 1 file changed, 2 insertions(+), 90 deletions(-) diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 7b48b75c21..775aedb600 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -1,5 +1,5 @@ use futures::StreamExt; -use std::{collections::HashMap, str::FromStr, time::Duration}; +use std::{str::FromStr, time::Duration}; use clap::{Parser, Subcommand}; use pageserver_api::{ @@ -21,7 +21,7 @@ use utils::id::{NodeId, TenantId}; use pageserver_api::controller_api::{ NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy, - TenantLocateResponse, TenantShardMigrateRequest, TenantShardMigrateResponse, + TenantShardMigrateRequest, TenantShardMigrateResponse, }; #[derive(Subcommand, Debug)] @@ -110,12 +110,6 @@ enum Command { #[arg(long)] config: String, }, - /// Attempt to balance the locations for a tenant across pageservers. This is a client-side - /// alternative to the storage controller's scheduling optimization behavior. - TenantScatter { - #[arg(long)] - tenant_id: TenantId, - }, /// Print details about a particular tenant, including all its shards' states. 
TenantDescribe { #[arg(long)] @@ -498,88 +492,6 @@ async fn main() -> anyhow::Result<()> { }) .await?; } - Command::TenantScatter { tenant_id } => { - // Find the shards - let locate_response = storcon_client - .dispatch::<(), TenantLocateResponse>( - Method::GET, - format!("control/v1/tenant/{tenant_id}/locate"), - None, - ) - .await?; - let shards = locate_response.shards; - - let mut node_to_shards: HashMap> = HashMap::new(); - let shard_count = shards.len(); - for s in shards { - let entry = node_to_shards.entry(s.node_id).or_default(); - entry.push(s.shard_id); - } - - // Load list of available nodes - let nodes_resp = storcon_client - .dispatch::<(), Vec>( - Method::GET, - "control/v1/node".to_string(), - None, - ) - .await?; - - for node in nodes_resp { - if matches!(node.availability, NodeAvailabilityWrapper::Active) { - node_to_shards.entry(node.id).or_default(); - } - } - - let max_shard_per_node = shard_count / node_to_shards.len(); - - loop { - let mut migrate_shard = None; - for shards in node_to_shards.values_mut() { - if shards.len() > max_shard_per_node { - // Pick the emptiest - migrate_shard = Some(shards.pop().unwrap()); - } - } - let Some(migrate_shard) = migrate_shard else { - break; - }; - - // Pick the emptiest node to migrate to - let mut destinations = node_to_shards - .iter() - .map(|(k, v)| (k, v.len())) - .collect::>(); - destinations.sort_by_key(|i| i.1); - let (destination_node, destination_count) = *destinations.first().unwrap(); - if destination_count + 1 > max_shard_per_node { - // Even the emptiest destination doesn't have space: we're done - break; - } - let destination_node = *destination_node; - - node_to_shards - .get_mut(&destination_node) - .unwrap() - .push(migrate_shard); - - println!("Migrate {} -> {} ...", migrate_shard, destination_node); - - storcon_client - .dispatch::( - Method::PUT, - format!("control/v1/tenant/{migrate_shard}/migrate"), - Some(TenantShardMigrateRequest { - tenant_shard_id: migrate_shard, - node_id: destination_node, - }), - ) - .await?; - println!("Migrate {} -> {} OK", migrate_shard, destination_node); - } - - // Spread the shards across the nodes - } Command::TenantDescribe { tenant_id } => { let describe_response = storcon_client .dispatch::<(), TenantDescribeResponse>( From 1ea5d8b1327d2e93cbe11682f60a90e35d42d1ee Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 24 Jun 2024 18:03:53 +0100 Subject: [PATCH 1037/1571] tests: accomodate some messages that can fail tests (#8144) ## Problem - `test_storage_controller_many_tenants` can fail with warnings in the storage controller about tenant creation holding a lock for too long, because this test stresses the machine running the test with many concurrent timeline creations - `test_tenant_delete_smoke` can fail when synthetic remote storage errors show up ## Summary of changes - tolerate warnings about slow timeline creation in test_storage_controller_many_tenants - tolerate both possible errors during error_tolerant_delete --- .../performance/test_storage_controller_scale.py | 11 ++++++++++- test_runner/regress/test_tenant_delete.py | 8 ++++++-- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index cb013ae8c3..a4c8c8ac42 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -48,7 +48,16 @@ def test_storage_controller_many_tenants( # We will intentionally stress 
reconciler concurrrency, which triggers a warning when lots # of shards are hitting the delayed path. - env.storage_controller.allowed_errors.append(".*Many shards are waiting to reconcile") + env.storage_controller.allowed_errors.extend( + [ + # We will intentionally stress reconciler concurrrency, which triggers a warning when lots + # of shards are hitting the delayed path. + ".*Many shards are waiting to reconcile", + # We will create many timelines concurrently, so they might get slow enough to trip the warning + # that timeline creation is holding a lock too long. + ".*Shared lock by TimelineCreate.*was held.*", + ] + ) for ps in env.pageservers: # This can happen because when we do a loop over all pageservers and mark them offline/active, diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index a3316f2f45..d3fba32a19 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -31,8 +31,12 @@ def error_tolerant_delete(ps_http, tenant_id): if e.status_code == 500: # This test uses failure injection, which can produce 500s as the pageserver expects # the object store to always be available, and the ListObjects during deletion is generally - # an infallible operation - assert "simulated failure of remote operation" in e.message + # an infallible operation. This can show up as a clear simulated error, or as a general + # error during delete_objects() + assert ( + "simulated failure of remote operation" in e.message + or "failed to delete" in e.message + ) else: raise else: From 219e78f885486698a67da6ad62ef9f6d001b118a Mon Sep 17 00:00:00 2001 From: Yuchen Liang <70461588+yliang412@users.noreply.github.com> Date: Mon, 24 Jun 2024 16:12:24 -0400 Subject: [PATCH 1038/1571] feat(pageserver): add an optional lease to the get_lsn_by_timestamp API (#8104) Part of #7497, closes #8072. ## Problem Currently the `get_lsn_by_timestamp` and branch creation pageserver APIs do not provide a pleasant client experience where the looked-up LSN might be GC-ed between the two API calls. This PR attempts to prevent common races between GC and branch creation by making use of LSN leases provided in #8084. A lease can be optionally granted to a looked-up LSN. With the lease, GC will not touch layers needed to reconstruct all pages at this LSN for the duration of the lease. Signed-off-by: Yuchen Liang --- pageserver/src/http/openapi_spec.yml | 11 +++++++ pageserver/src/http/routes.rs | 27 +++++++++++++++- test_runner/fixtures/pageserver/http.py | 6 ++-- test_runner/regress/test_lsn_mapping.py | 43 ++++++++++++++++++++----- 4 files changed, 76 insertions(+), 11 deletions(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 1bc8fe9066..e583992a58 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -236,6 +236,13 @@ paths: type: string format: date-time description: A timestamp to get the LSN + - name: with_lease + in: query + required: false + schema: + type: boolean + description: Whether to grant a lease to the corresponding LSN. Default to false. + responses: "200": description: OK @@ -1029,6 +1036,10 @@ components: kind: type: string enum: [past, present, future, nodata] + valid_until: + type: string + format: date-time + description: The expiration time of the granted lease. 
LsnLease: type: object diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index cfa507fed0..450f89820e 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -21,6 +21,7 @@ use pageserver_api::models::IngestAuxFilesRequest; use pageserver_api::models::ListAuxFilesRequest; use pageserver_api::models::LocationConfig; use pageserver_api::models::LocationConfigListResponse; +use pageserver_api::models::LsnLease; use pageserver_api::models::ShardParameters; use pageserver_api::models::TenantDetails; use pageserver_api::models::TenantLocationConfigResponse; @@ -728,6 +729,8 @@ async fn get_lsn_by_timestamp_handler( .map_err(ApiError::BadRequest)?; let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp); + let with_lease = parse_query_param(&request, "with_lease")?.unwrap_or(false); + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let timeline = @@ -736,10 +739,15 @@ async fn get_lsn_by_timestamp_handler( let result = timeline .find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx) .await?; + #[derive(serde::Serialize, Debug)] struct Result { lsn: Lsn, kind: &'static str, + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(flatten)] + lease: Option, } let (lsn, kind) = match result { LsnForTimestamp::Present(lsn) => (lsn, "present"), @@ -747,11 +755,28 @@ async fn get_lsn_by_timestamp_handler( LsnForTimestamp::Past(lsn) => (lsn, "past"), LsnForTimestamp::NoData(lsn) => (lsn, "nodata"), }; - let result = Result { lsn, kind }; + + let lease = if with_lease { + timeline + .make_lsn_lease(lsn, timeline.get_lsn_lease_length_for_ts(), &ctx) + .inspect_err(|_| { + warn!("fail to grant a lease to {}", lsn); + }) + .ok() + } else { + None + }; + + let result = Result { lsn, kind, lease }; + let valid_until = result + .lease + .as_ref() + .map(|l| humantime::format_rfc3339_millis(l.valid_until).to_string()); tracing::info!( lsn=?result.lsn, kind=%result.kind, timestamp=%timestamp_raw, + valid_until=?valid_until, "lsn_by_timestamp finished" ); json_response(StatusCode::OK, result) diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index ecc83a9546..64c7ddee6c 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -619,13 +619,15 @@ class PageserverHttpClient(requests.Session, MetricsGetter): tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, timestamp: datetime, + with_lease: bool = False, **kwargs, ): log.info( - f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}" + f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}, {with_lease=}" ) + with_lease_query = f"{with_lease=}".lower() res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp.isoformat()}Z", + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp.isoformat()}Z&{with_lease_query}", **kwargs, ) self.verbose_error(res) diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index 263730a823..67e82f8d30 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -12,10 +12,24 @@ from fixtures.utils import query_scalar, wait_until from requests.exceptions import ReadTimeout -# -# Test pageserver get_lsn_by_timestamp API -# -def 
test_lsn_mapping(neon_env_builder: NeonEnvBuilder): +def assert_lsn_lease_granted(result, with_lease: bool): + """ + Asserts an LSN lease is granted when `with_lease` flag is turned on. + Always asserts no LSN lease is granted when `with_lease` flag is off. + """ + if with_lease: + assert result.get("valid_until") + else: + assert result.get("valid_until") is None + + +@pytest.mark.parametrize("with_lease", [True, False]) +def test_lsn_mapping(neon_env_builder: NeonEnvBuilder, with_lease: bool): + """ + Test pageserver get_lsn_by_timestamp API. + + :param with_lease: Whether to get a lease associated with returned LSN. + """ env = neon_env_builder.init_start() tenant_id, _ = env.neon_cli.create_tenant( @@ -67,23 +81,33 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): # Check edge cases # Timestamp is in the future probe_timestamp = tbl[-1][1] + timedelta(hours=1) - result = client.timeline_get_lsn_by_timestamp(tenant_id, timeline_id, probe_timestamp) + result = client.timeline_get_lsn_by_timestamp( + tenant_id, timeline_id, probe_timestamp, with_lease=with_lease + ) assert result["kind"] == "future" + assert_lsn_lease_granted(result, with_lease) # make sure that we return a well advanced lsn here assert Lsn(result["lsn"]) > start_lsn # Timestamp is in the unreachable past probe_timestamp = tbl[0][1] - timedelta(hours=10) - result = client.timeline_get_lsn_by_timestamp(tenant_id, timeline_id, probe_timestamp) + result = client.timeline_get_lsn_by_timestamp( + tenant_id, timeline_id, probe_timestamp, with_lease=with_lease + ) assert result["kind"] == "past" + assert_lsn_lease_granted(result, with_lease) + # make sure that we return the minimum lsn here at the start of the range assert Lsn(result["lsn"]) < start_lsn # Probe a bunch of timestamps in the valid range for i in range(1, len(tbl), 100): probe_timestamp = tbl[i][1] - result = client.timeline_get_lsn_by_timestamp(tenant_id, timeline_id, probe_timestamp) + result = client.timeline_get_lsn_by_timestamp( + tenant_id, timeline_id, probe_timestamp, with_lease=with_lease + ) assert result["kind"] not in ["past", "nodata"] + assert_lsn_lease_granted(result, with_lease) lsn = result["lsn"] # Call get_lsn_by_timestamp to get the LSN # Launch a new read-only node at that LSN, and check that only the rows @@ -105,8 +129,11 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): # Timestamp is in the unreachable past probe_timestamp = tbl[0][1] - timedelta(hours=10) - result = client.timeline_get_lsn_by_timestamp(tenant_id, timeline_id_child, probe_timestamp) + result = client.timeline_get_lsn_by_timestamp( + tenant_id, timeline_id_child, probe_timestamp, with_lease=with_lease + ) assert result["kind"] == "past" + assert_lsn_lease_granted(result, with_lease) # make sure that we return the minimum lsn here at the start of the range assert Lsn(result["lsn"]) >= last_flush_lsn From d502313841bf5f31d7aff629f93e09284d984fb3 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 25 Jun 2024 16:29:32 +0300 Subject: [PATCH 1039/1571] Fix MVCC bug with prepared xact with subxacts on standby (#8152) We did not recover the subtransaction IDs of prepared transactions when starting a hot standby from a shutdown checkpoint. As a result, such subtransactions were considered as aborted, rather than in-progress. That would lead to hint bits being set incorrectly, and the subtransactions suddenly becoming visible to old snapshots when the prepared transaction was committed. 
To fix, update pg_subtrans with prepared transactions's subxids when starting hot standby from a shutdown checkpoint. The snapshots taken from that state need to be marked as "suboverflowed", so that we also check the pg_subtrans. Discussion: https://www.postgresql.org/message-id/6b852e98-2d49-4ca1-9e95-db419a2696e0%40iki.fi NEON: cherry-picked from the upstream thread ahead of time, to unblock https://github.com/neondatabase/neon/pull/7288. I expect this to be committed to upstream in the next few days, superseding this. NOTE: I did not include the new regression test on v15 and v14 branches, because the test would need some adapting, and we don't run the perl tests on Neon anyway. --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 4c51945a61..aa88bd536b 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 4c51945a6167ca06c0169e7a4ca5a8e7ffa3faba +Subproject commit aa88bd536b48b22328aac748be0dcfff760135d0 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index e22098d86d..2092a6dcee 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit e22098d86d6c40276b6bd75c29133a33fb283ab6 +Subproject commit 2092a6dcee794bb0cb17471bd964690dd7c7355f diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 9837db1578..3bf9219f6e 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 9837db157837fcf43ef7348be0017d3a2238cd27 +Subproject commit 3bf9219f6ef5e943393e9430872e26184e92d1c6 diff --git a/vendor/revisions.json b/vendor/revisions.json index f945ea6d73..d48f1defec 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "v16": ["16.3", "9837db157837fcf43ef7348be0017d3a2238cd27"], - "v15": ["15.7", "e22098d86d6c40276b6bd75c29133a33fb283ab6"], - "v14": ["14.12", "4c51945a6167ca06c0169e7a4ca5a8e7ffa3faba"] + "v16": ["16.3", "3bf9219f6ef5e943393e9430872e26184e92d1c6"], + "v15": ["15.7", "2092a6dcee794bb0cb17471bd964690dd7c7355f"], + "v14": ["14.12", "aa88bd536b48b22328aac748be0dcfff760135d0"] } From 7026dde9eba4bb37f5ed0182c34ca95d27c014a6 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 25 Jun 2024 15:06:18 +0100 Subject: [PATCH 1040/1571] storcon: update db related dependencides (#8155) ## Problem Storage controller runs into memory corruption issue on the drain/fill code paths. ## Summary of changes Update db related depdencies in the unlikely case that the issue was fixed in diesel. 
--- Cargo.lock | 103 ++++++++++++++++++++++++++++---------- workspace_hack/Cargo.toml | 1 - 2 files changed, 76 insertions(+), 28 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 70c837c146..5393538c59 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1246,7 +1246,7 @@ dependencies = [ "tokio-postgres", "tokio-stream", "tokio-util", - "toml_edit", + "toml_edit 0.19.10", "tracing", "tracing-opentelemetry", "tracing-subscriber", @@ -1362,8 +1362,8 @@ dependencies = [ "tokio", "tokio-postgres", "tokio-util", - "toml", - "toml_edit", + "toml 0.7.4", + "toml_edit 0.19.10", "tracing", "url", "utils", @@ -1669,9 +1669,9 @@ dependencies = [ [[package]] name = "diesel" -version = "2.1.4" +version = "2.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62c6fcf842f17f8c78ecf7c81d75c5ce84436b41ee07e03f490fbb5f5a8731d8" +checksum = "62d6dcd069e7b5fe49a302411f759d4cf1cf2c27fe798ef46fb8baefc053dd2b" dependencies = [ "bitflags 2.4.1", "byteorder", @@ -1684,11 +1684,12 @@ dependencies = [ [[package]] name = "diesel_derives" -version = "2.1.2" +version = "2.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef8337737574f55a468005a83499da720f20c65586241ffea339db9ecdfd2b44" +checksum = "59de76a222c2b8059f789cbe07afbfd8deb8c31dd0bc2a21f85e256c1def8259" dependencies = [ "diesel_table_macro_syntax", + "dsl_auto_type", "proc-macro2", "quote", "syn 2.0.52", @@ -1696,9 +1697,9 @@ dependencies = [ [[package]] name = "diesel_migrations" -version = "2.1.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6036b3f0120c5961381b570ee20a02432d7e2d27ea60de9578799cf9156914ac" +checksum = "8a73ce704bad4231f001bff3314d91dce4aba0770cee8b233991859abc15c1f6" dependencies = [ "diesel", "migrations_internals", @@ -1707,9 +1708,9 @@ dependencies = [ [[package]] name = "diesel_table_macro_syntax" -version = "0.1.0" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc5557efc453706fed5e4fa85006fe9817c224c3f480a34c7e5959fd700921c5" +checksum = "209c735641a413bc68c4923a9d6ad4bcb3ca306b794edaa7eb0b3228a99ffb25" dependencies = [ "syn 2.0.52", ] @@ -1745,6 +1746,20 @@ dependencies = [ "const-random", ] +[[package]] +name = "dsl_auto_type" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0892a17df262a24294c382f0d5997571006e7a4348b4327557c4ff1cd4a8bccc" +dependencies = [ + "darling", + "either", + "heck 0.5.0", + "proc-macro2", + "quote", + "syn 2.0.52", +] + [[package]] name = "dyn-clone" version = "1.0.14" @@ -3084,19 +3099,19 @@ dependencies = [ [[package]] name = "migrations_internals" -version = "2.1.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f23f71580015254b020e856feac3df5878c2c7a8812297edd6c0a485ac9dada" +checksum = "fd01039851e82f8799046eabbb354056283fb265c8ec0996af940f4e85a380ff" dependencies = [ "serde", - "toml", + "toml 0.8.14", ] [[package]] name = "migrations_macros" -version = "2.1.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cce3325ac70e67bbab5bd837a31cae01f1a6db64e0e744a33cb03a543469ef08" +checksum = "ffb161cc72176cb37aa47f1fc520d3ef02263d67d661f44f05d05a079e1237fd" dependencies = [ "migrations_internals", "proc-macro2", @@ -3576,7 +3591,7 @@ dependencies = [ "thiserror", "tokio", "tokio-util", - "toml_edit", + "toml_edit 0.19.10", "utils", "workspace_hack", ] @@ -3659,7 +3674,7 @@ dependencies = [ 
"tokio-stream", "tokio-tar", "tokio-util", - "toml_edit", + "toml_edit 0.19.10", "tracing", "twox-hash", "url", @@ -4665,7 +4680,7 @@ dependencies = [ "tokio", "tokio-stream", "tokio-util", - "toml_edit", + "toml_edit 0.19.10", "tracing", "utils", "workspace_hack", @@ -5164,7 +5179,7 @@ dependencies = [ "tokio-stream", "tokio-tar", "tokio-util", - "toml_edit", + "toml_edit 0.19.10", "tracing", "tracing-subscriber", "url", @@ -5443,9 +5458,9 @@ dependencies = [ [[package]] name = "serde_spanned" -version = "0.6.2" +version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93107647184f6027e3b7dcb2e11034cf95ffa1e3a682c67951963ac69c1c007d" +checksum = "79e674e01f999af37c49f70a6ede167a8a60b2503e56c5599532a65baa5969a0" dependencies = [ "serde", ] @@ -6330,14 +6345,26 @@ dependencies = [ "serde", "serde_spanned", "toml_datetime", - "toml_edit", + "toml_edit 0.19.10", +] + +[[package]] +name = "toml" +version = "0.8.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f49eb2ab21d2f26bd6db7bf383edc527a7ebaee412d17af4d40fdccd442f335" +dependencies = [ + "serde", + "serde_spanned", + "toml_datetime", + "toml_edit 0.22.14", ] [[package]] name = "toml_datetime" -version = "0.6.2" +version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a76a9312f5ba4c2dec6b9161fdf25d87ad8a09256ccea5a556fef03c706a10f" +checksum = "4badfd56924ae69bcc9039335b2e017639ce3f9b001c393c1b2d1ef846ce2cbf" dependencies = [ "serde", ] @@ -6352,7 +6379,20 @@ dependencies = [ "serde", "serde_spanned", "toml_datetime", - "winnow", + "winnow 0.4.6", +] + +[[package]] +name = "toml_edit" +version = "0.22.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f21c7aaf97f1bd9ca9d4f9e73b0a6c74bd5afef56f2bc931943a6e1c37e04e38" +dependencies = [ + "indexmap 2.0.1", + "serde", + "serde_spanned", + "toml_datetime", + "winnow 0.6.13", ] [[package]] @@ -7335,6 +7375,15 @@ dependencies = [ "memchr", ] +[[package]] +name = "winnow" +version = "0.6.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59b5e5f6c299a3c7890b876a2a587f3115162487e704907d9b6cd29473052ba1" +dependencies = [ + "memchr", +] + [[package]] name = "winreg" version = "0.50.0" @@ -7424,7 +7473,7 @@ dependencies = [ "tokio-rustls 0.24.0", "tokio-util", "toml_datetime", - "toml_edit", + "toml_edit 0.19.10", "tonic", "tower", "tracing", diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 139a5647c5..f43076171f 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -115,7 +115,6 @@ syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-trai syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } time-macros = { version = "0.2", default-features = false, features = ["formatting", "parsing", "serde"] } toml_datetime = { version = "0.6", default-features = false, features = ["serde"] } -toml_edit = { version = "0.19", features = ["serde"] } zstd = { version = "0.13" } zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] } zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] } From 947f6da75e10042b2fd66a4ff523c8a1d5da3aeb Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 25 Jun 2024 17:04:44 +0200 Subject: [PATCH 1041/1571] L0 flush: avoid short-lived allocation when checking 
key_range empty (#8154) We only use `keys` to check if it's empty so we can bail out early. No need to collect the keys for that. Found this while doing research for https://github.com/neondatabase/neon/issues/7418 --- pageserver/src/tenant/storage_layer/inmemory_layer.rs | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 1ecc56ce99..6624fb7e6b 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -622,18 +622,16 @@ impl InMemoryLayer { let end_lsn = *self.end_lsn.get().unwrap(); - let keys: Vec<_> = if let Some(key_range) = key_range { + let key_count = if let Some(key_range) = key_range { inner .index .iter() .filter(|(k, _)| key_range.contains(k)) - .map(|(k, m)| (k.to_i128(), m)) - .collect() + .count() } else { - inner.index.iter().map(|(k, m)| (k.to_i128(), m)).collect() + inner.index.len() }; - - if keys.is_empty() { + if key_count == 0 { return Ok(None); } From 9b2f9419d9451514e6f11c79db7de3adaac2f0ba Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 25 Jun 2024 16:18:22 +0100 Subject: [PATCH 1042/1571] CI: upload docker cache only from main (#8157) ## Problem The Docker build cache gets invalidated by PRs ## Summary of changes - Upload cache only from the main branch --- .github/workflows/build-build-tools-image.yml | 2 +- .github/workflows/build_and_test.yml | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index 6e90a80ab7..5a94dd8e6f 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -78,7 +78,7 @@ jobs: pull: true file: Dockerfile.build-tools cache-from: type=registry,ref=neondatabase/build-tools:cache-${{ matrix.arch }} - cache-to: type=registry,ref=neondatabase/build-tools:cache-${{ matrix.arch }},mode=max + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/build-tools:cache-{0},mode=max', matrix.arch) || '' }} tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }} - name: Remove custom docker config directory diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index e9adf28b99..113b37ae51 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -763,7 +763,7 @@ jobs: pull: true file: Dockerfile cache-from: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }} - cache-to: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }},mode=max + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/neon:cache-{0},mode=max', matrix.arch) || '' }} tags: | neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} @@ -855,7 +855,7 @@ jobs: pull: true file: Dockerfile.compute-node cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }} - cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/compute-node-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }} tags: | neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} @@ -875,7 +875,7 @@ jobs: file: 
Dockerfile.compute-node target: neon-pg-ext-test cache-from: type=registry,ref=neondatabase/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }} - cache-to: type=registry,ref=neondatabase/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }} tags: | neondatabase/neon-test-extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}-${{ matrix.arch }} From 961fc0ba8f8355911a34151f40a105b29ba5002c Mon Sep 17 00:00:00 2001 From: Yuchen Liang <70461588+yliang412@users.noreply.github.com> Date: Tue, 25 Jun 2024 11:43:12 -0400 Subject: [PATCH 1043/1571] feat(pageserver): add metrics for number of valid leases after each refresh (#8147) Part of #7497, closes #8120. ## Summary of changes This PR adds a metric to track the number of valid leases after `GCInfo` gets refreshed each time. Besides this metric, we should also track disk space and synthetic size (after #8071 is closed) to make sure leases are used properly. Signed-off-by: Yuchen Liang --- pageserver/src/metrics.rs | 17 +++++++++++++++++ pageserver/src/tenant.rs | 5 +++++ test_runner/fixtures/metrics.py | 1 + 3 files changed, 23 insertions(+) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 5c8f350f7b..c6b1607331 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -545,6 +545,15 @@ static AUX_FILE_SIZE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static VALID_LSN_LEASE_COUNT: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_valid_lsn_lease_count", + "The number of valid leases after refreshing gc info.", + &["tenant_id", "shard_id", "timeline_id"], + ) + .expect("failed to define a metric") +}); + pub(crate) mod initial_logical_size { use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec}; use once_cell::sync::Lazy; @@ -2055,6 +2064,8 @@ pub(crate) struct TimelineMetrics { pub directory_entries_count_gauge: Lazy UIntGauge>>, pub evictions: IntCounter, pub evictions_with_low_residence_duration: std::sync::RwLock, + /// Number of valid LSN leases. 
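The metric wiring in this patch follows the usual vec-gauge lifecycle: the gauge is registered once as a labelled vec, resolved to a per-timeline child when `TimelineMetrics` is built, updated on every gc-info refresh, and removed again on timeline shutdown. A minimal sketch of that lifecycle, with placeholder label values rather than real tenant/shard/timeline IDs:

```rust
// Sketch only: register / resolve / set / remove for the new gauge.
// Label values are placeholders, not real tenant/shard/timeline IDs.
use metrics::{register_uint_gauge_vec, UIntGaugeVec};
use once_cell::sync::Lazy;

static VALID_LSN_LEASE_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_valid_lsn_lease_count",
        "The number of valid leases after refreshing gc info.",
        &["tenant_id", "shard_id", "timeline_id"],
    )
    .expect("failed to define a metric")
});

fn lease_gauge_lifecycle(valid_leases: usize) {
    let labels = ["tenant-placeholder", "shard-placeholder", "timeline-placeholder"];

    // Resolved once per timeline (see TimelineMetrics::new below).
    let gauge = VALID_LSN_LEASE_COUNT
        .get_metric_with_label_values(&labels)
        .unwrap();

    // Updated after each gc-info refresh (see the tenant.rs hunk below).
    gauge.set(valid_leases as u64);

    // Dropped when the timeline's metrics are shut down.
    let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&labels);
}
```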
+ pub valid_lsn_lease_count_gauge: UIntGauge, shutdown: std::sync::atomic::AtomicBool, } @@ -2153,6 +2164,10 @@ impl TimelineMetrics { let evictions_with_low_residence_duration = evictions_with_low_residence_duration_builder .build(&tenant_id, &shard_id, &timeline_id); + let valid_lsn_lease_count_gauge = VALID_LSN_LEASE_COUNT + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); + TimelineMetrics { tenant_id, shard_id, @@ -2175,6 +2190,7 @@ impl TimelineMetrics { evictions_with_low_residence_duration: std::sync::RwLock::new( evictions_with_low_residence_duration, ), + valid_lsn_lease_count_gauge, shutdown: std::sync::atomic::AtomicBool::default(), } } @@ -2224,6 +2240,7 @@ impl TimelineMetrics { } let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]); self.evictions_with_low_residence_duration .write() diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 62f066862a..4e03e09a9b 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2960,6 +2960,11 @@ impl Tenant { let now = SystemTime::now(); target.leases.retain(|_, lease| !lease.is_expired(&now)); + timeline + .metrics + .valid_lsn_lease_count_gauge + .set(target.leases.len() as u64); + match gc_cutoffs.remove(&timeline.timeline_id) { Some(cutoffs) => { target.retain_lsns = branchpoints; diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index e01bb6da51..41fa8e679f 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -149,6 +149,7 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( "pageserver_evictions_total", "pageserver_evictions_with_low_residence_duration_total", "pageserver_aux_file_estimated_size", + "pageserver_valid_lsn_lease_count", *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, # "pageserver_directory_entries_count", -- only used if above a certain threshold # "pageserver_broken_tenants_count" -- used only for broken From 64a4461191e17521f54cd9c334b034fcf7a12a0b Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 25 Jun 2024 19:05:13 +0300 Subject: [PATCH 1044/1571] Fix submodule references to match the REL_*_STABLE_neon branches (#8159) No code changes, just point to the correct commit SHAs. 
--- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index aa88bd536b..7845c122d5 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit aa88bd536b48b22328aac748be0dcfff760135d0 +Subproject commit 7845c122d51d3ebb547a984a640ac0310a2fadce diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 2092a6dcee..2ff5ecc67c 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 2092a6dcee794bb0cb17471bd964690dd7c7355f +Subproject commit 2ff5ecc67c64e5fe44b7dde598e64e4538e0c373 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 3bf9219f6e..d55e0aca10 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 3bf9219f6ef5e943393e9430872e26184e92d1c6 +Subproject commit d55e0aca104af0b611cf5565f1033b2acd2dcc1c diff --git a/vendor/revisions.json b/vendor/revisions.json index d48f1defec..e755cf2e9d 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "v16": ["16.3", "3bf9219f6ef5e943393e9430872e26184e92d1c6"], - "v15": ["15.7", "2092a6dcee794bb0cb17471bd964690dd7c7355f"], - "v14": ["14.12", "aa88bd536b48b22328aac748be0dcfff760135d0"] + "v16": ["16.3", "d55e0aca104af0b611cf5565f1033b2acd2dcc1c"], + "v15": ["15.7", "2ff5ecc67c64e5fe44b7dde598e64e4538e0c373"], + "v14": ["14.12", "7845c122d51d3ebb547a984a640ac0310a2fadce"] } From 07f21dd6b67e46d86ddc45eb69703f84b118fecb Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 25 Jun 2024 17:38:06 +0100 Subject: [PATCH 1045/1571] pageserver: remove attach/detach apis (#8134) ## Problem These APIs have been deprecated for some time, but were still used from test code. Closes: https://github.com/neondatabase/neon/issues/4282 ## Summary of changes - It is still convenient to do a "tenant_attach" from a test without having to write out a location_conf body, so those test methods have been retained with implementations that call through to their location_conf equivalent. --- libs/pageserver_api/src/models.rs | 37 --------- libs/utils/src/http/json.rs | 16 +--- pageserver/src/http/openapi_spec.yml | 13 +-- pageserver/src/http/routes.rs | 83 +------------------ storage_controller/src/service.rs | 7 ++ test_runner/fixtures/neon_fixtures.py | 2 - test_runner/fixtures/pageserver/http.py | 46 ++++------ .../regress/test_attach_tenant_config.py | 45 ++++------ test_runner/regress/test_remote_storage.py | 13 +-- test_runner/regress/test_tenant_detach.py | 48 ----------- test_runner/regress/test_timeline_size.py | 2 +- .../test_walredo_not_left_behind_on_detach.py | 2 +- 12 files changed, 54 insertions(+), 260 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 3db75b7d0e..b1e4525cc0 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -607,31 +607,6 @@ impl TenantConfigRequest { } } -#[derive(Debug, Deserialize)] -pub struct TenantAttachRequest { - #[serde(default)] - pub config: TenantAttachConfig, - #[serde(default)] - pub generation: Option, -} - -/// Newtype to enforce deny_unknown_fields on TenantConfig for -/// its usage inside `TenantAttachRequest`. 
-#[derive(Debug, Serialize, Deserialize, Default)] -#[serde(deny_unknown_fields)] -pub struct TenantAttachConfig { - #[serde(flatten)] - allowing_unknown_fields: TenantConfig, -} - -impl std::ops::Deref for TenantAttachConfig { - type Target = TenantConfig; - - fn deref(&self) -> &Self::Target { - &self.allowing_unknown_fields - } -} - /// See [`TenantState::attachment_status`] and the OpenAPI docs for context. #[derive(Serialize, Deserialize, Clone)] #[serde(tag = "slug", content = "data", rename_all = "snake_case")] @@ -1554,18 +1529,6 @@ mod tests { "expect unknown field `unknown_field` error, got: {}", err ); - - let attach_request = json!({ - "config": { - "unknown_field": "unknown_value".to_string(), - }, - }); - let err = serde_json::from_value::(attach_request).unwrap_err(); - assert!( - err.to_string().contains("unknown field `unknown_field`"), - "expect unknown field `unknown_field` error, got: {}", - err - ); } #[test] diff --git a/libs/utils/src/http/json.rs b/libs/utils/src/http/json.rs index 7ca62561fe..6c25440b42 100644 --- a/libs/utils/src/http/json.rs +++ b/libs/utils/src/http/json.rs @@ -8,22 +8,15 @@ use super::error::ApiError; pub async fn json_request Deserialize<'de>>( request: &mut Request, ) -> Result { - json_request_or_empty_body(request) - .await? - .context("missing request body") - .map_err(ApiError::BadRequest) -} - -/// Will be removed as part of -pub async fn json_request_or_empty_body Deserialize<'de>>( - request: &mut Request, -) -> Result, ApiError> { let body = hyper::body::aggregate(request.body_mut()) .await .context("Failed to read request body") .map_err(ApiError::BadRequest)?; + if body.remaining() == 0 { - return Ok(None); + return Err(ApiError::BadRequest(anyhow::anyhow!( + "missing request body" + ))); } let mut deser = serde_json::de::Deserializer::from_reader(body.reader()); @@ -31,7 +24,6 @@ pub async fn json_request_or_empty_body Deserialize<'de>>( serde_path_to_error::deserialize(&mut deser) // intentionally stringify because the debug version is not helpful in python logs .map_err(|e| anyhow::anyhow!("Failed to parse json request: {e}")) - .map(Some) .map_err(ApiError::BadRequest) } diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index e583992a58..58ff6e3f83 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -367,16 +367,7 @@ paths: $ref: "#/components/schemas/TenantLocationConfigResponse" "409": description: | - The tenant is already known to Pageserver in some way, - and hence this `/attach` call has been rejected. - - Some examples of how this can happen: - - tenant was created on this pageserver - - tenant attachment was started by an earlier call to `/attach`. - - Callers should poll the tenant status's `attachment_status` field, - like for status 202. See the longer description for `POST /attach` - for details. + The tenant is already being modified, perhaps by a concurrent call to this API content: application/json: schema: @@ -762,8 +753,6 @@ components: For example this can be caused by s3 being unreachable. The retry may be implemented with call to detach, though it would be better to not automate it and inspec failed state manually before proceeding with a retry. - - See the tenant `/attach` endpoint for more information. 
type: object required: - slug diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 450f89820e..d6ba9ee35e 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -31,13 +31,11 @@ use pageserver_api::models::TenantShardLocation; use pageserver_api::models::TenantShardSplitRequest; use pageserver_api::models::TenantShardSplitResponse; use pageserver_api::models::TenantSorting; -use pageserver_api::models::TenantState; use pageserver_api::models::TopTenantShardItem; use pageserver_api::models::TopTenantShardsRequest; use pageserver_api::models::TopTenantShardsResponse; use pageserver_api::models::{ - DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest, - TenantLocationConfigRequest, + DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantLocationConfigRequest, }; use pageserver_api::shard::ShardCount; use pageserver_api::shard::TenantShardId; @@ -51,7 +49,6 @@ use utils::auth::JwtAuth; use utils::failpoint_support::failpoints_handler; use utils::http::endpoint::prometheus_metrics_handler; use utils::http::endpoint::request_span; -use utils::http::json::json_request_or_empty_body; use utils::http::request::{get_request_param, must_get_query_param, parse_query_param}; use crate::context::{DownloadBehavior, RequestContext}; @@ -821,58 +818,6 @@ async fn get_timestamp_of_lsn_handler( } } -async fn tenant_attach_handler( - mut request: Request, - _cancel: CancellationToken, -) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; - - let maybe_body: Option = json_request_or_empty_body(&mut request).await?; - let tenant_conf = match &maybe_body { - Some(request) => TenantConfOpt::try_from(&*request.config).map_err(ApiError::BadRequest)?, - None => TenantConfOpt::default(), - }; - - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); - - info!("Handling tenant attach {tenant_id}"); - - let state = get_state(&request); - - let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?; - - let tenant_shard_id = TenantShardId::unsharded(tenant_id); - let shard_params = ShardParameters::default(); - let location_conf = LocationConf::attached_single(tenant_conf, generation, &shard_params); - - let tenant = state - .tenant_manager - .upsert_location(tenant_shard_id, location_conf, None, SpawnMode::Eager, &ctx) - .await?; - - let Some(tenant) = tenant else { - // This should never happen: indicates a bug in upsert_location - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "Upsert succeeded but didn't return tenant!" - ))); - }; - - // We might have successfully constructed a Tenant, but it could still - // end up in a broken state: - if let TenantState::Broken { - reason, - backtrace: _, - } = tenant.current_state() - { - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "Tenant state is Broken: {reason}" - ))); - } - - json_response(StatusCode::ACCEPTED, ()) -} - async fn timeline_delete_handler( request: Request, _cancel: CancellationToken, @@ -903,26 +848,6 @@ async fn timeline_delete_handler( json_response(StatusCode::ACCEPTED, ()) } -async fn tenant_detach_handler( - request: Request, - _cancel: CancellationToken, -) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; - // This is a legacy API (`/location_conf` is the replacement). 
It only supports unsharded tenants - let tenant_shard_id = TenantShardId::unsharded(tenant_id); - - let state = get_state(&request); - let conf = state.conf; - state - .tenant_manager - .detach_tenant(conf, tenant_shard_id, &state.deletion_queue_client) - .instrument(info_span!("tenant_detach", %tenant_id, shard_id=%tenant_shard_id.shard_slug())) - .await?; - - json_response(StatusCode::OK, ()) -} - async fn tenant_reset_handler( request: Request, _cancel: CancellationToken, @@ -2711,12 +2636,6 @@ pub fn make_router( .post("/v1/tenant/:tenant_shard_id/timeline", |r| { api_handler(r, timeline_create_handler) }) - .post("/v1/tenant/:tenant_id/attach", |r| { - api_handler(r, tenant_attach_handler) - }) - .post("/v1/tenant/:tenant_id/detach", |r| { - api_handler(r, tenant_detach_handler) - }) .post("/v1/tenant/:tenant_shard_id/reset", |r| { api_handler(r, tenant_reset_handler) }) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 388e0eadc8..e329f42dd6 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1231,6 +1231,13 @@ impl Service { &self, attach_req: AttachHookRequest, ) -> anyhow::Result { + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + attach_req.tenant_shard_id.tenant_id, + TenantOperations::ShardSplit, + ) + .await; + // This is a test hook. To enable using it on tenants that were created directly with // the pageserver API (not via this service), we will auto-create any missing tenant // shards with default state. diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b624c84fad..84fb1f7cb4 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2684,7 +2684,6 @@ class NeonPageserver(PgProtocol, LogUtils): self, tenant_id: TenantId, config: None | Dict[str, Any] = None, - config_null: bool = False, generation: Optional[int] = None, override_storage_controller_generation: bool = False, ): @@ -2702,7 +2701,6 @@ class NeonPageserver(PgProtocol, LogUtils): return client.tenant_attach( tenant_id, config, - config_null, generation=generation, ) diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 64c7ddee6c..2a7cbea200 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -1,6 +1,5 @@ from __future__ import annotations -import json import time from collections import defaultdict from dataclasses import dataclass @@ -253,39 +252,30 @@ class PageserverHttpClient(requests.Session, MetricsGetter): self, tenant_id: Union[TenantId, TenantShardId], config: None | Dict[str, Any] = None, - config_null: bool = False, generation: Optional[int] = None, ): - if config_null: - assert config is None - body: Any = None - else: - # null-config is prohibited by the API - config = config or {} - body = {"config": config} - if generation is not None: - body.update({"generation": generation}) + config = config or {} - res = self.post( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/attach", - data=json.dumps(body), - headers={"Content-Type": "application/json"}, + return self.tenant_location_conf( + tenant_id, + location_conf={ + "mode": "AttachedSingle", + "secondary_conf": None, + "tenant_conf": config, + "generation": generation, + }, ) - self.verbose_error(res) - def tenant_detach(self, tenant_id: TenantId, detach_ignored=False, timeout_secs=None): - params = {} - if detach_ignored: - params["detach_ignored"] = "true" - - 
kwargs = {} - if timeout_secs is not None: - kwargs["timeout"] = timeout_secs - - res = self.post( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params, **kwargs + def tenant_detach(self, tenant_id: TenantId): + return self.tenant_location_conf( + tenant_id, + location_conf={ + "mode": "Detached", + "secondary_conf": None, + "tenant_conf": {}, + "generation": None, + }, ) - self.verbose_error(res) def tenant_reset(self, tenant_id: Union[TenantId, TenantShardId], drop_cache: bool): params = {} diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index f4667a82dc..e117c2140f 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -7,7 +7,7 @@ from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, ) -from fixtures.pageserver.http import PageserverApiException, TenantConfig +from fixtures.pageserver.http import TenantConfig from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind from fixtures.utils import wait_until @@ -82,8 +82,8 @@ def test_null_body(negative_env: NegativeTests): tenant_id = negative_env.tenant_id ps_http = env.pageserver.http_client() - res = ps_http.post( - f"{ps_http.base_url}/v1/tenant/{tenant_id}/attach", + res = ps_http.put( + f"{ps_http.base_url}/v1/tenant/{tenant_id}/location_config", data=b"null", headers={"Content-Type": "application/json"}, ) @@ -99,35 +99,16 @@ def test_null_config(negative_env: NegativeTests): tenant_id = negative_env.tenant_id ps_http = env.pageserver.http_client() - res = ps_http.post( - f"{ps_http.base_url}/v1/tenant/{tenant_id}/attach", - data=b'{"config": null}', + res = ps_http.put( + f"{ps_http.base_url}/v1/tenant/{tenant_id}/location_config", + json={"mode": "AttachedSingle", "generation": 1, "tenant_conf": None}, headers={"Content-Type": "application/json"}, ) assert res.status_code == 400 -def test_config_with_unknown_keys_is_bad_request(negative_env: NegativeTests): - """ - If we send a config with unknown keys, the request should be rejected with status 400. 
- """ - - env = negative_env.neon_env - tenant_id = negative_env.tenant_id - - config_with_unknown_keys = { - "compaction_period": "1h", - "this_key_does_not_exist": "some value", - } - - with pytest.raises(PageserverApiException) as e: - env.pageserver.tenant_attach(tenant_id, config=config_with_unknown_keys) - assert e.type == PageserverApiException - assert e.value.status_code == 400 - - @pytest.mark.parametrize("content_type", [None, "application/json"]) -def test_no_config(positive_env: NeonEnv, content_type: Optional[str]): +def test_empty_config(positive_env: NeonEnv, content_type: Optional[str]): """ When the 'config' body attribute is omitted, the request should be accepted and the tenant should use the default configuration @@ -141,11 +122,13 @@ def test_no_config(positive_env: NeonEnv, content_type: Optional[str]): ps_http.tenant_detach(tenant_id) assert tenant_id not in [TenantId(t["id"]) for t in ps_http.tenant_list()] - body = {"generation": env.storage_controller.attach_hook_issue(tenant_id, env.pageserver.id)} - - ps_http.post( - f"{ps_http.base_url}/v1/tenant/{tenant_id}/attach", - json=body, + ps_http.put( + f"{ps_http.base_url}/v1/tenant/{tenant_id}/location_config", + json={ + "mode": "AttachedSingle", + "generation": env.storage_controller.attach_hook_issue(tenant_id, env.pageserver.id), + "tenant_conf": {}, + }, headers=None if content_type else {"Content-Type": "application/json"}, ).raise_for_status() diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 7f79bf5d5c..b26bd3422f 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -164,13 +164,14 @@ def test_remote_storage_backup_and_restore( "data": {"reason": "storage-sync-list-remote-timelines"}, } + # Even though the tenant is broken, subsequent calls to location_conf API will succeed, but + # the tenant will always end up in a broken state as a result of the failpoint. # Ensure that even though the tenant is broken, retrying the attachment fails - with pytest.raises(Exception, match="Tenant state is Broken"): - # Use same generation as in previous attempt - gen_state = env.storage_controller.inspect(tenant_id) - assert gen_state is not None - generation = gen_state[0] - env.pageserver.tenant_attach(tenant_id, generation=generation) + tenant_info = wait_until_tenant_state(pageserver_http, tenant_id, "Broken", 15) + gen_state = env.storage_controller.inspect(tenant_id) + assert gen_state is not None + generation = gen_state[0] + env.pageserver.tenant_attach(tenant_id, generation=generation) # Restart again, this implicitly clears the failpoint. # test_remote_failures=1 remains active, though, as it's in the pageserver config. 
diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 4c49e6fb85..2056840558 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -275,16 +275,6 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS) - # first check for non existing tenant - tenant_id = TenantId.generate() - with pytest.raises( - expected_exception=PageserverApiException, - match=f"NotFound: tenant {tenant_id}", - ) as excinfo: - pageserver_http.tenant_detach(tenant_id) - - assert excinfo.value.status_code == 404 - # create new nenant tenant_id, timeline_id = env.neon_cli.create_tenant() @@ -344,44 +334,6 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): pageserver_http.timeline_gc(tenant_id, timeline_id, 0) -# Creates a tenant, and detaches it with extra paremeter that forces ignored tenant detach. -# Tenant should be detached without issues. -def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv): - env = neon_simple_env - client = env.pageserver.http_client() - - # create a new tenant - tenant_id, _ = env.neon_cli.create_tenant() - - env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS) - - # assert tenant exists on disk - assert env.pageserver.tenant_dir(tenant_id).exists() - - endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) - # we rely upon autocommit after each statement - endpoint.safe_psql_many( - queries=[ - "CREATE TABLE t(key int primary key, value text)", - "INSERT INTO t SELECT generate_series(1,100000), 'payload'", - ] - ) - - log.info("detaching regular tenant with detach ignored flag") - client.tenant_detach(tenant_id, True) - - log.info("regular tenant detached without error") - - # check that nothing is left on disk for deleted tenant - assert not env.pageserver.tenant_dir(tenant_id).exists() - - # assert the tenant does not exists in the Pageserver - tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()] - assert ( - tenant_id not in tenants_after_detach - ), f"Ignored and then detached tenant {tenant_id} should not be present in pageserver's memory" - - def test_detach_while_attaching( neon_env_builder: NeonEnvBuilder, ): diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 3110833563..f47356839c 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -840,7 +840,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): # Detaching a stuck tenant should proceed promptly # (reproducer for https://github.com/neondatabase/neon/pull/6430) - env.pageserver.http_client().tenant_detach(detach_tenant_id, timeout_secs=10) + env.pageserver.http_client().tenant_detach(detach_tenant_id) tenant_ids.remove(detach_tenant_id) # FIXME: currently the mechanism for cancelling attach is to set state to broken, which is reported spuriously at error level env.pageserver.allowed_errors.append( diff --git a/test_runner/regress/test_walredo_not_left_behind_on_detach.py b/test_runner/regress/test_walredo_not_left_behind_on_detach.py index ad37807dba..375cfcb4fe 100644 --- a/test_runner/regress/test_walredo_not_left_behind_on_detach.py +++ b/test_runner/regress/test_walredo_not_left_behind_on_detach.py @@ -37,7 +37,7 @@ def test_walredo_not_left_behind_on_detach(neon_env_builder: NeonEnvBuilder): expected_exception=PageserverApiException, match=f"NotFound: tenant 
{tenant_id}", ): - pageserver_http.tenant_detach(tenant_id) + pageserver_http.tenant_status(tenant_id) # create new nenant tenant_id, _ = env.neon_cli.create_tenant() From cd9a550d97f5863f8c123e66d08fed6360a8c771 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 25 Jun 2024 20:03:27 +0200 Subject: [PATCH 1046/1571] clippy-deny the `todo!()` macro (#4340) `todo!()` shouldn't slip into prod code --- .neon_clippy_args | 3 ++- libs/walproposer/src/walproposer.rs | 2 ++ trace/src/main.rs | 8 -------- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/.neon_clippy_args b/.neon_clippy_args index 25e09c61a6..4db32cf35c 100644 --- a/.neon_clippy_args +++ b/.neon_clippy_args @@ -1,4 +1,5 @@ # * `-A unknown_lints` – do not warn about unknown lint suppressions # that people with newer toolchains might use # * `-D warnings` - fail on any warnings (`cargo` returns non-zero exit status) -export CLIPPY_COMMON_ARGS="--locked --workspace --all-targets -- -A unknown_lints -D warnings" +# * `-D clippy::todo` - don't let `todo!()` slip into `main` +export CLIPPY_COMMON_ARGS="--locked --workspace --all-targets -- -A unknown_lints -D warnings -D clippy::todo" diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs index f7b72b205f..37b1e0fa87 100644 --- a/libs/walproposer/src/walproposer.rs +++ b/libs/walproposer/src/walproposer.rs @@ -1,3 +1,5 @@ +#![allow(clippy::todo)] + use std::ffi::CString; use crate::{ diff --git a/trace/src/main.rs b/trace/src/main.rs index 049f922b6f..79e1df988d 100644 --- a/trace/src/main.rs +++ b/trace/src/main.rs @@ -38,12 +38,6 @@ enum Command { /// Print stats and anomalies about the traces Analyze, - - /// Draw the traces in svg format - Draw, - - /// Send the read requests to a pageserver - Replay, } // HACK This function will change and improve as we see what kind of analysis is useful. @@ -167,8 +161,6 @@ fn main() -> anyhow::Result<()> { analyze_trace(reader); } } - Command::Draw => todo!(), - Command::Replay => todo!(), } Ok(()) From 6c5d3b52634a3bb49f48eae24a9204cea67a7e77 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 25 Jun 2024 19:07:54 +0100 Subject: [PATCH 1047/1571] proxy fix wake compute console retry (#8141) ## Problem 1. Proxy is retrying errors from cplane that shouldn't be retried 2. ~~Proxy is not using the retry_after_ms value~~ ## Summary of changes 1. Correct the could_retry impl for ConsoleError. 2. ~~Update could_retry interface to support returning a fixed wait duration.~~ --- proxy/src/console/messages.rs | 111 ++++++++++++++++++----------- proxy/src/console/provider.rs | 48 ++++++++----- proxy/src/proxy/connect_compute.rs | 20 +++--- proxy/src/proxy/retry.rs | 54 ++++++++------ proxy/src/proxy/tests.rs | 25 +++++-- proxy/src/proxy/wake_compute.rs | 89 ++++++----------------- proxy/src/serverless/backend.rs | 13 ++-- 7 files changed, 193 insertions(+), 167 deletions(-) diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs index 3b7d681a41..d28d13ba69 100644 --- a/proxy/src/console/messages.rs +++ b/proxy/src/console/messages.rs @@ -5,7 +5,7 @@ use std::fmt::{self, Display}; use crate::auth::IpPattern; use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}; -use crate::proxy::retry::ShouldRetry; +use crate::proxy::retry::CouldRetry; /// Generic error response with human-readable description. /// Note that we can't always present it to user as is. 
@@ -64,45 +64,47 @@ impl Display for ConsoleError { } } -impl ShouldRetry for ConsoleError { +impl CouldRetry for ConsoleError { fn could_retry(&self) -> bool { - if self.status.is_none() || self.status.as_ref().unwrap().details.retry_info.is_none() { - // retry some temporary failures because the compute was in a bad state - // (bad request can be returned when the endpoint was in transition) - return match &self { - ConsoleError { - http_status_code: http::StatusCode::BAD_REQUEST, - .. - } => true, - // don't retry when quotas are exceeded - ConsoleError { - http_status_code: http::StatusCode::UNPROCESSABLE_ENTITY, - ref error, - .. - } => !error.contains("compute time quota of non-primary branches is exceeded"), - // locked can be returned when the endpoint was in transition - // or when quotas are exceeded. don't retry when quotas are exceeded - ConsoleError { - http_status_code: http::StatusCode::LOCKED, - ref error, - .. - } => { - !error.contains("quota exceeded") - && !error.contains("the limit for current plan reached") - } - _ => false, - }; + // If the error message does not have a status, + // the error is unknown and probably should not retry automatically + let Some(status) = &self.status else { + return false; + }; + + // retry if the retry info is set. + if status.details.retry_info.is_some() { + return true; } - // retry if the response has a retry delay - if let Some(retry_info) = self - .status - .as_ref() - .and_then(|s| s.details.retry_info.as_ref()) - { - retry_info.retry_delay_ms > 0 - } else { - false + // if no retry info set, attempt to use the error code to guess the retry state. + let reason = status + .details + .error_info + .map_or(Reason::Unknown, |e| e.reason); + match reason { + // not a transitive error + Reason::RoleProtected => false, + // on retry, it will still not be found + Reason::ResourceNotFound + | Reason::ProjectNotFound + | Reason::EndpointNotFound + | Reason::BranchNotFound => false, + // we were asked to go away + Reason::RateLimitExceeded + | Reason::NonDefaultBranchComputeTimeExceeded + | Reason::ActiveTimeQuotaExceeded + | Reason::ComputeTimeQuotaExceeded + | Reason::WrittenDataQuotaExceeded + | Reason::DataTransferQuotaExceeded + | Reason::LogicalSizeQuotaExceeded => false, + // transitive error. control plane is currently busy + // but might be ready soon + Reason::RunningOperations => true, + Reason::ConcurrencyLimitReached => true, + Reason::LockAlreadyTaken => true, + // unknown error. better not retry it. + Reason::Unknown => false, } } } @@ -121,7 +123,7 @@ pub struct Details { pub user_facing_message: Option, } -#[derive(Debug, Deserialize)] +#[derive(Copy, Clone, Debug, Deserialize)] pub struct ErrorInfo { pub reason: Reason, // Schema could also have `metadata` field, but it's not structured. Skip it for now. @@ -129,30 +131,59 @@ pub struct ErrorInfo { #[derive(Clone, Copy, Debug, Deserialize, Default)] pub enum Reason { + /// RoleProtected indicates that the role is protected and the attempted operation is not permitted on protected roles. #[serde(rename = "ROLE_PROTECTED")] RoleProtected, + /// ResourceNotFound indicates that a resource (project, endpoint, branch, etc.) wasn't found, + /// usually due to the provided ID not being correct or because the subject doesn't have enough permissions to + /// access the requested resource. + /// Prefer a more specific reason if possible, e.g., ProjectNotFound, EndpointNotFound, etc. 
#[serde(rename = "RESOURCE_NOT_FOUND")] ResourceNotFound, + /// ProjectNotFound indicates that the project wasn't found, usually due to the provided ID not being correct, + /// or that the subject doesn't have enough permissions to access the requested project. #[serde(rename = "PROJECT_NOT_FOUND")] ProjectNotFound, + /// EndpointNotFound indicates that the endpoint wasn't found, usually due to the provided ID not being correct, + /// or that the subject doesn't have enough permissions to access the requested endpoint. #[serde(rename = "ENDPOINT_NOT_FOUND")] EndpointNotFound, + /// BranchNotFound indicates that the branch wasn't found, usually due to the provided ID not being correct, + /// or that the subject doesn't have enough permissions to access the requested branch. #[serde(rename = "BRANCH_NOT_FOUND")] BranchNotFound, + /// RateLimitExceeded indicates that the rate limit for the operation has been exceeded. #[serde(rename = "RATE_LIMIT_EXCEEDED")] RateLimitExceeded, + /// NonDefaultBranchComputeTimeExceeded indicates that the compute time quota of non-default branches has been + /// exceeded. #[serde(rename = "NON_PRIMARY_BRANCH_COMPUTE_TIME_EXCEEDED")] - NonPrimaryBranchComputeTimeExceeded, + NonDefaultBranchComputeTimeExceeded, + /// ActiveTimeQuotaExceeded indicates that the active time quota was exceeded. #[serde(rename = "ACTIVE_TIME_QUOTA_EXCEEDED")] ActiveTimeQuotaExceeded, + /// ComputeTimeQuotaExceeded indicates that the compute time quota was exceeded. #[serde(rename = "COMPUTE_TIME_QUOTA_EXCEEDED")] ComputeTimeQuotaExceeded, + /// WrittenDataQuotaExceeded indicates that the written data quota was exceeded. #[serde(rename = "WRITTEN_DATA_QUOTA_EXCEEDED")] WrittenDataQuotaExceeded, + /// DataTransferQuotaExceeded indicates that the data transfer quota was exceeded. #[serde(rename = "DATA_TRANSFER_QUOTA_EXCEEDED")] DataTransferQuotaExceeded, + /// LogicalSizeQuotaExceeded indicates that the logical size quota was exceeded. #[serde(rename = "LOGICAL_SIZE_QUOTA_EXCEEDED")] LogicalSizeQuotaExceeded, + /// RunningOperations indicates that the project already has some running operations + /// and scheduling of new ones is prohibited. + #[serde(rename = "RUNNING_OPERATIONS")] + RunningOperations, + /// ConcurrencyLimitReached indicates that the concurrency limit for an action was reached. + #[serde(rename = "CONCURRENCY_LIMIT_REACHED")] + ConcurrencyLimitReached, + /// LockAlreadyTaken indicates that the we attempted to take a lock that was already taken. 
+ #[serde(rename = "LOCK_ALREADY_TAKEN")] + LockAlreadyTaken, #[default] #[serde(other)] Unknown, @@ -170,7 +201,7 @@ impl Reason { } } -#[derive(Debug, Deserialize)] +#[derive(Copy, Clone, Debug, Deserialize)] pub struct RetryInfo { pub retry_delay_ms: u64, } diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 915c2ee7a6..bec55a8343 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -25,9 +25,9 @@ use tracing::info; pub mod errors { use crate::{ - console::messages::{self, ConsoleError}, + console::messages::{self, ConsoleError, Reason}, error::{io_error, ReportableError, UserFacingError}, - proxy::retry::ShouldRetry, + proxy::retry::CouldRetry, }; use thiserror::Error; @@ -76,21 +76,22 @@ pub mod errors { ApiError::Console(e) => { use crate::error::ErrorKind::*; match e.get_reason() { - crate::console::messages::Reason::RoleProtected => User, - crate::console::messages::Reason::ResourceNotFound => User, - crate::console::messages::Reason::ProjectNotFound => User, - crate::console::messages::Reason::EndpointNotFound => User, - crate::console::messages::Reason::BranchNotFound => User, - crate::console::messages::Reason::RateLimitExceeded => ServiceRateLimit, - crate::console::messages::Reason::NonPrimaryBranchComputeTimeExceeded => { - User - } - crate::console::messages::Reason::ActiveTimeQuotaExceeded => User, - crate::console::messages::Reason::ComputeTimeQuotaExceeded => User, - crate::console::messages::Reason::WrittenDataQuotaExceeded => User, - crate::console::messages::Reason::DataTransferQuotaExceeded => User, - crate::console::messages::Reason::LogicalSizeQuotaExceeded => User, - crate::console::messages::Reason::Unknown => match &e { + Reason::RoleProtected => User, + Reason::ResourceNotFound => User, + Reason::ProjectNotFound => User, + Reason::EndpointNotFound => User, + Reason::BranchNotFound => User, + Reason::RateLimitExceeded => ServiceRateLimit, + Reason::NonDefaultBranchComputeTimeExceeded => User, + Reason::ActiveTimeQuotaExceeded => User, + Reason::ComputeTimeQuotaExceeded => User, + Reason::WrittenDataQuotaExceeded => User, + Reason::DataTransferQuotaExceeded => User, + Reason::LogicalSizeQuotaExceeded => User, + Reason::ConcurrencyLimitReached => ControlPlane, + Reason::LockAlreadyTaken => ControlPlane, + Reason::RunningOperations => ControlPlane, + Reason::Unknown => match &e { ConsoleError { http_status_code: http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE, @@ -128,7 +129,7 @@ pub mod errors { } } - impl ShouldRetry for ApiError { + impl CouldRetry for ApiError { fn could_retry(&self) -> bool { match self { // retry some transport errors @@ -239,6 +240,17 @@ pub mod errors { } } } + + impl CouldRetry for WakeComputeError { + fn could_retry(&self) -> bool { + match self { + WakeComputeError::BadComputeAddress(_) => false, + WakeComputeError::ApiError(e) => e.could_retry(), + WakeComputeError::TooManyConnections => false, + WakeComputeError::TooManyConnectionAttempts(_) => false, + } + } + } } /// Auth secret which is managed by the cloud. 
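Taken together, `CouldRetry`, `should_retry` and the existing `retry_after` backoff give both the TCP connect path and the wake-compute path the same loop shape. A condensed sketch of that shape (not the exact proxy code; `do_attempt` is a stand-in for the actual connect or wake call, and the real loops also record metrics and may refresh cached node info):

```rust
use crate::config::RetryConfig;
use crate::proxy::retry::{retry_after, should_retry, CouldRetry};

// Sketch only; error handling and metrics are elided.
async fn retry_loop<T, E>(
    mut do_attempt: impl FnMut() -> Result<T, E>,
    config: RetryConfig,
) -> Result<T, E>
where
    E: CouldRetry + std::fmt::Debug,
{
    let mut num_retries = 0;
    loop {
        match do_attempt() {
            Ok(value) => return Ok(value),
            // Give up if the error is not retryable or we are out of attempts.
            Err(err) if !should_retry(&err, num_retries, config) => return Err(err),
            Err(err) => tracing::warn!(error = ?err, num_retries, "retrying"),
        }
        let wait = retry_after(num_retries, config);
        num_retries += 1;
        tokio::time::sleep(wait).await;
    }
}
```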
diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 409d45b39a..82180aaee3 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -7,7 +7,7 @@ use crate::{ error::ReportableError, metrics::{ConnectOutcome, ConnectionFailureKind, Metrics, RetriesMetricGroup, RetryType}, proxy::{ - retry::{retry_after, ShouldRetry}, + retry::{retry_after, should_retry, CouldRetry}, wake_compute::wake_compute, }, Host, @@ -17,6 +17,8 @@ use pq_proto::StartupMessageParams; use tokio::time; use tracing::{error, info, warn}; +use super::retry::ShouldRetryWakeCompute; + const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2); /// If we couldn't connect, a cached connection info might be to blame @@ -104,7 +106,7 @@ pub async fn connect_to_compute( connect_to_compute_retry_config: RetryConfig, ) -> Result where - M::ConnectError: ShouldRetry + std::fmt::Debug, + M::ConnectError: CouldRetry + ShouldRetryWakeCompute + std::fmt::Debug, M::Error: From, { let mut num_retries = 0; @@ -139,10 +141,10 @@ where error!(error = ?err, "could not connect to compute node"); - let node_info = if !node_info.cached() || !err.should_retry_database_address() { + let node_info = if !node_info.cached() || !err.should_retry_wake_compute() { // If we just recieved this from cplane and dodn't get it from cache, we shouldn't retry. // Do not need to retrieve a new node_info, just return the old one. - if !err.should_retry(num_retries, connect_to_compute_retry_config) { + if should_retry(&err, num_retries, connect_to_compute_retry_config) { Metrics::get().proxy.retries_metric.observe( RetriesMetricGroup { outcome: ConnectOutcome::Failed, @@ -188,9 +190,8 @@ where return Ok(res); } Err(e) => { - let retriable = e.should_retry(num_retries, connect_to_compute_retry_config); - if !retriable { - error!(error = ?e, num_retries, retriable, "couldn't connect to compute node"); + if !should_retry(&e, num_retries, connect_to_compute_retry_config) { + error!(error = ?e, num_retries, retriable = false, "couldn't connect to compute node"); Metrics::get().proxy.retries_metric.observe( RetriesMetricGroup { outcome: ConnectOutcome::Failed, @@ -200,9 +201,10 @@ where ); return Err(e.into()); } - warn!(error = ?e, num_retries, retriable, "couldn't connect to compute node"); + + warn!(error = ?e, num_retries, retriable = true, "couldn't connect to compute node"); } - } + }; let wait_duration = retry_after(num_retries, connect_to_compute_retry_config); num_retries += 1; diff --git a/proxy/src/proxy/retry.rs b/proxy/src/proxy/retry.rs index 8dec1f1137..644b183a91 100644 --- a/proxy/src/proxy/retry.rs +++ b/proxy/src/proxy/retry.rs @@ -2,20 +2,22 @@ use crate::{compute, config::RetryConfig}; use std::{error::Error, io}; use tokio::time; -pub trait ShouldRetry { +pub trait CouldRetry { + /// Returns true if the error could be retried fn could_retry(&self) -> bool; - fn should_retry(&self, num_retries: u32, config: RetryConfig) -> bool { - match self { - _ if num_retries >= config.max_retries => false, - err => err.could_retry(), - } - } - fn should_retry_database_address(&self) -> bool { - true - } } -impl ShouldRetry for io::Error { +pub trait ShouldRetryWakeCompute { + /// Returns true if we need to invalidate the cache for this node. + /// If false, we can continue retrying with the current node cache. 
+ fn should_retry_wake_compute(&self) -> bool; +} + +pub fn should_retry(err: &impl CouldRetry, num_retries: u32, config: RetryConfig) -> bool { + num_retries < config.max_retries && err.could_retry() +} + +impl CouldRetry for io::Error { fn could_retry(&self) -> bool { use std::io::ErrorKind; matches!( @@ -25,7 +27,7 @@ impl ShouldRetry for io::Error { } } -impl ShouldRetry for tokio_postgres::error::DbError { +impl CouldRetry for tokio_postgres::error::DbError { fn could_retry(&self) -> bool { use tokio_postgres::error::SqlState; matches!( @@ -36,7 +38,9 @@ impl ShouldRetry for tokio_postgres::error::DbError { | &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION, ) } - fn should_retry_database_address(&self) -> bool { +} +impl ShouldRetryWakeCompute for tokio_postgres::error::DbError { + fn should_retry_wake_compute(&self) -> bool { use tokio_postgres::error::SqlState; // Here are errors that happens after the user successfully authenticated to the database. // TODO: there are pgbouncer errors that should be retried, but they are not listed here. @@ -53,7 +57,7 @@ impl ShouldRetry for tokio_postgres::error::DbError { } } -impl ShouldRetry for tokio_postgres::Error { +impl CouldRetry for tokio_postgres::Error { fn could_retry(&self) -> bool { if let Some(io_err) = self.source().and_then(|x| x.downcast_ref()) { io::Error::could_retry(io_err) @@ -63,29 +67,33 @@ impl ShouldRetry for tokio_postgres::Error { false } } - fn should_retry_database_address(&self) -> bool { - if let Some(io_err) = self.source().and_then(|x| x.downcast_ref()) { - io::Error::should_retry_database_address(io_err) - } else if let Some(db_err) = self.source().and_then(|x| x.downcast_ref()) { - tokio_postgres::error::DbError::should_retry_database_address(db_err) +} +impl ShouldRetryWakeCompute for tokio_postgres::Error { + fn should_retry_wake_compute(&self) -> bool { + if let Some(db_err) = self.source().and_then(|x| x.downcast_ref()) { + tokio_postgres::error::DbError::should_retry_wake_compute(db_err) } else { + // likely an IO error. Possible the compute has shutdown and the + // cache is stale. 
true } } } -impl ShouldRetry for compute::ConnectionError { +impl CouldRetry for compute::ConnectionError { fn could_retry(&self) -> bool { match self { compute::ConnectionError::Postgres(err) => err.could_retry(), compute::ConnectionError::CouldNotConnect(err) => err.could_retry(), + compute::ConnectionError::WakeComputeError(err) => err.could_retry(), _ => false, } } - fn should_retry_database_address(&self) -> bool { +} +impl ShouldRetryWakeCompute for compute::ConnectionError { + fn should_retry_wake_compute(&self) -> bool { match self { - compute::ConnectionError::Postgres(err) => err.should_retry_database_address(), - compute::ConnectionError::CouldNotConnect(err) => err.should_retry_database_address(), + compute::ConnectionError::Postgres(err) => err.should_retry_wake_compute(), // the cache entry was not checked for validity compute::ConnectionError::TooManyConnectionAttempts(_) => false, _ => true, diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 96683511fe..8119f39fae 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -5,21 +5,21 @@ mod mitm; use std::time::Duration; use super::connect_compute::ConnectMechanism; -use super::retry::ShouldRetry; +use super::retry::CouldRetry; use super::*; use crate::auth::backend::{ ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, MaybeOwned, TestBackend, }; use crate::config::{CertResolver, RetryConfig}; use crate::console::caches::NodeInfoCache; -use crate::console::messages::{ConsoleError, MetricsAuxInfo}; +use crate::console::messages::{ConsoleError, Details, MetricsAuxInfo, Status}; use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend}; use crate::console::{self, CachedNodeInfo, NodeInfo}; use crate::error::ErrorKind; -use crate::proxy::retry::retry_after; use crate::{http, sasl, scram, BranchId, EndpointId, ProjectId}; use anyhow::{bail, Context}; use async_trait::async_trait; +use retry::{retry_after, ShouldRetryWakeCompute}; use rstest::rstest; use rustls::pki_types; use tokio_postgres::config::SslMode; @@ -438,11 +438,16 @@ impl std::fmt::Display for TestConnectError { impl std::error::Error for TestConnectError {} -impl ShouldRetry for TestConnectError { +impl CouldRetry for TestConnectError { fn could_retry(&self) -> bool { self.retryable } } +impl ShouldRetryWakeCompute for TestConnectError { + fn should_retry_wake_compute(&self) -> bool { + true + } +} #[async_trait] impl ConnectMechanism for TestConnectMechanism { @@ -485,7 +490,7 @@ impl TestBackend for TestConnectMechanism { ConnectAction::Wake => Ok(helper_create_cached_node_info(self.cache)), ConnectAction::WakeFail => { let err = console::errors::ApiError::Console(ConsoleError { - http_status_code: http::StatusCode::FORBIDDEN, + http_status_code: http::StatusCode::BAD_REQUEST, error: "TEST".into(), status: None, }); @@ -496,7 +501,15 @@ impl TestBackend for TestConnectMechanism { let err = console::errors::ApiError::Console(ConsoleError { http_status_code: http::StatusCode::BAD_REQUEST, error: "TEST".into(), - status: None, + status: Some(Status { + code: "error".into(), + message: "error".into(), + details: Details { + error_info: None, + retry_info: Some(console::messages::RetryInfo { retry_delay_ms: 1 }), + user_facing_message: None, + }, + }), }); assert!(err.could_retry()); Err(console::errors::WakeComputeError::ApiError(err)) diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index c166cf4389..fef349aac0 100644 --- a/proxy/src/proxy/wake_compute.rs +++ 
b/proxy/src/proxy/wake_compute.rs @@ -1,18 +1,16 @@ use crate::config::RetryConfig; -use crate::console::messages::ConsoleError; +use crate::console::messages::{ConsoleError, Reason}; use crate::console::{errors::WakeComputeError, provider::CachedNodeInfo}; use crate::context::RequestMonitoring; use crate::metrics::{ ConnectOutcome, ConnectionFailuresBreakdownGroup, Metrics, RetriesMetricGroup, RetryType, WakeupFailureKind, }; -use crate::proxy::retry::retry_after; +use crate::proxy::retry::{retry_after, should_retry}; use hyper1::StatusCode; -use std::ops::ControlFlow; use tracing::{error, info, warn}; use super::connect_compute::ComputeConnectBackend; -use super::retry::ShouldRetry; pub async fn wake_compute( num_retries: &mut u32, @@ -22,9 +20,8 @@ pub async fn wake_compute( ) -> Result { let retry_type = RetryType::WakeCompute; loop { - let wake_res = api.wake_compute(ctx).await; - match handle_try_wake(wake_res, *num_retries, config) { - Err(e) => { + match api.wake_compute(ctx).await { + Err(e) if !should_retry(&e, *num_retries, config) => { error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node"); report_error(&e, false); Metrics::get().proxy.retries_metric.observe( @@ -36,11 +33,11 @@ pub async fn wake_compute( ); return Err(e); } - Ok(ControlFlow::Continue(e)) => { + Err(e) => { warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node"); report_error(&e, true); } - Ok(ControlFlow::Break(n)) => { + Ok(n) => { Metrics::get().proxy.retries_metric.observe( RetriesMetricGroup { outcome: ConnectOutcome::Success, @@ -63,70 +60,28 @@ pub async fn wake_compute( } } -/// Attempts to wake up the compute node. -/// * Returns Ok(Continue(e)) if there was an error waking but retries are acceptable -/// * Returns Ok(Break(node)) if the wakeup succeeded -/// * Returns Err(e) if there was an error -pub fn handle_try_wake( - result: Result, - num_retries: u32, - config: RetryConfig, -) -> Result, WakeComputeError> { - match result { - Err(err) => match &err { - WakeComputeError::ApiError(api) if api.should_retry(num_retries, config) => { - Ok(ControlFlow::Continue(err)) - } - _ => Err(err), - }, - // Ready to try again. 
- Ok(new) => Ok(ControlFlow::Break(new)), - } -} - fn report_error(e: &WakeComputeError, retry: bool) { use crate::console::errors::ApiError; let kind = match e { WakeComputeError::BadComputeAddress(_) => WakeupFailureKind::BadComputeAddress, WakeComputeError::ApiError(ApiError::Transport(_)) => WakeupFailureKind::ApiTransportError, WakeComputeError::ApiError(ApiError::Console(e)) => match e.get_reason() { - crate::console::messages::Reason::RoleProtected => { - WakeupFailureKind::ApiConsoleBadRequest - } - crate::console::messages::Reason::ResourceNotFound => { - WakeupFailureKind::ApiConsoleBadRequest - } - crate::console::messages::Reason::ProjectNotFound => { - WakeupFailureKind::ApiConsoleBadRequest - } - crate::console::messages::Reason::EndpointNotFound => { - WakeupFailureKind::ApiConsoleBadRequest - } - crate::console::messages::Reason::BranchNotFound => { - WakeupFailureKind::ApiConsoleBadRequest - } - crate::console::messages::Reason::RateLimitExceeded => { - WakeupFailureKind::ApiConsoleLocked - } - crate::console::messages::Reason::NonPrimaryBranchComputeTimeExceeded => { - WakeupFailureKind::QuotaExceeded - } - crate::console::messages::Reason::ActiveTimeQuotaExceeded => { - WakeupFailureKind::QuotaExceeded - } - crate::console::messages::Reason::ComputeTimeQuotaExceeded => { - WakeupFailureKind::QuotaExceeded - } - crate::console::messages::Reason::WrittenDataQuotaExceeded => { - WakeupFailureKind::QuotaExceeded - } - crate::console::messages::Reason::DataTransferQuotaExceeded => { - WakeupFailureKind::QuotaExceeded - } - crate::console::messages::Reason::LogicalSizeQuotaExceeded => { - WakeupFailureKind::QuotaExceeded - } - crate::console::messages::Reason::Unknown => match e { + Reason::RoleProtected => WakeupFailureKind::ApiConsoleBadRequest, + Reason::ResourceNotFound => WakeupFailureKind::ApiConsoleBadRequest, + Reason::ProjectNotFound => WakeupFailureKind::ApiConsoleBadRequest, + Reason::EndpointNotFound => WakeupFailureKind::ApiConsoleBadRequest, + Reason::BranchNotFound => WakeupFailureKind::ApiConsoleBadRequest, + Reason::RateLimitExceeded => WakeupFailureKind::ApiConsoleLocked, + Reason::NonDefaultBranchComputeTimeExceeded => WakeupFailureKind::QuotaExceeded, + Reason::ActiveTimeQuotaExceeded => WakeupFailureKind::QuotaExceeded, + Reason::ComputeTimeQuotaExceeded => WakeupFailureKind::QuotaExceeded, + Reason::WrittenDataQuotaExceeded => WakeupFailureKind::QuotaExceeded, + Reason::DataTransferQuotaExceeded => WakeupFailureKind::QuotaExceeded, + Reason::LogicalSizeQuotaExceeded => WakeupFailureKind::QuotaExceeded, + Reason::ConcurrencyLimitReached => WakeupFailureKind::ApiConsoleLocked, + Reason::LockAlreadyTaken => WakeupFailureKind::ApiConsoleLocked, + Reason::RunningOperations => WakeupFailureKind::ApiConsoleLocked, + Reason::Unknown => match e { ConsoleError { http_status_code: StatusCode::LOCKED, ref error, diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 05d6061238..6c34d48338 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -16,7 +16,10 @@ use crate::{ context::RequestMonitoring, error::{ErrorKind, ReportableError, UserFacingError}, intern::EndpointIdInt, - proxy::{connect_compute::ConnectMechanism, retry::ShouldRetry}, + proxy::{ + connect_compute::ConnectMechanism, + retry::{CouldRetry, ShouldRetryWakeCompute}, + }, rate_limiter::EndpointRateLimiter, Host, }; @@ -179,7 +182,7 @@ impl UserFacingError for HttpConnError { } } -impl ShouldRetry for HttpConnError { +impl CouldRetry for 
HttpConnError { fn could_retry(&self) -> bool { match self { HttpConnError::ConnectionError(e) => e.could_retry(), @@ -190,9 +193,11 @@ impl ShouldRetry for HttpConnError { HttpConnError::TooManyConnectionAttempts(_) => false, } } - fn should_retry_database_address(&self) -> bool { +} +impl ShouldRetryWakeCompute for HttpConnError { + fn should_retry_wake_compute(&self) -> bool { match self { - HttpConnError::ConnectionError(e) => e.should_retry_database_address(), + HttpConnError::ConnectionError(e) => e.should_retry_wake_compute(), // we never checked cache validity HttpConnError::TooManyConnectionAttempts(_) => false, _ => true, From 76864e6a2a67f1ae0480bffafbad7114d77c1826 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Tue, 25 Jun 2024 16:49:29 -0400 Subject: [PATCH 1048/1571] feat(pageserver): add image layer iterator (#8006) part of https://github.com/neondatabase/neon/issues/8002 ## Summary of changes This pull request adds the image layer iterator. It buffers a fixed amount of key-value pairs in memory, and give developer an iterator abstraction over the image layer. Once the buffer is exhausted, it will issue 1 I/O to fetch the next batch. Due to the Rust lifetime mysteries, the `get_stream_from` API has been refactored to `into_stream` and consumes `self`. Delta layer iterator implementation will be similar, therefore I'll add it after this pull request gets merged. --------- Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/key.rs | 7 +- pageserver/src/tenant/block_io.rs | 1 + pageserver/src/tenant/disk_btree.rs | 32 ++- .../src/tenant/storage_layer/delta_layer.rs | 28 +-- .../src/tenant/storage_layer/image_layer.rs | 209 +++++++++++++++++- pageserver/src/tenant/storage_layer/layer.rs | 14 +- pageserver/src/tenant/vectored_blob_io.rs | 110 ++++++++- 7 files changed, 363 insertions(+), 38 deletions(-) diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 997c1cc43a..cd430bfab7 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -160,8 +160,9 @@ impl Key { key } - /// Convert a 18B slice to a key. This function should not be used for metadata keys because field2 is handled differently. - /// Use [`Key::from_i128`] instead if you want to handle 16B keys (i.e., metadata keys). + /// Convert a 18B slice to a key. This function should not be used for 16B metadata keys because `field2` is handled differently. + /// Use [`Key::from_i128`] instead if you want to handle 16B keys (i.e., metadata keys). There are some restrictions on `field2`, + /// and therefore not all 18B slices are valid page server keys. pub fn from_slice(b: &[u8]) -> Self { Key { field1: b[0], @@ -173,7 +174,7 @@ impl Key { } } - /// Convert a key to a 18B slice. This function should not be used for metadata keys because field2 is handled differently. + /// Convert a key to a 18B slice. This function should not be used for getting a 16B metadata key because `field2` is handled differently. /// Use [`Key::to_i128`] instead if you want to get a 16B key (i.e., metadata keys). pub fn write_to_byte_slice(&self, buf: &mut [u8]) { buf[0] = self.field1; diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index 92928116c1..b406d50332 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -160,6 +160,7 @@ impl<'a> BlockCursor<'a> { /// /// The file is assumed to be immutable. This doesn't provide any functions /// for modifying the file, nor for invalidating the cache if it is modified. 
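The buffering strategy described in this patch's summary (keep a fixed batch of key-value pairs in memory and issue one read to refill it when it runs dry) boils down to the following shape. This is only an illustration; the real `ImageLayerIterator` is async, reads through the vectored blob IO path, and has a different signature (`load_next_batch` here is a hypothetical stand-in):

```rust
use std::collections::VecDeque;

// Illustrative sketch of a batch-buffered iterator; not the actual implementation.
struct BatchedIter<K, V> {
    buf: VecDeque<(K, V)>,
    // Hypothetical stand-in for "issue one I/O and return the next batch".
    load_next_batch: Box<dyn FnMut() -> Vec<(K, V)>>,
}

impl<K, V> Iterator for BatchedIter<K, V> {
    type Item = (K, V);

    fn next(&mut self) -> Option<(K, V)> {
        if self.buf.is_empty() {
            // Buffer exhausted: fetch the next fixed-size batch in one go.
            let batch = (self.load_next_batch)();
            self.buf.extend(batch);
        }
        // An empty refill means the layer is fully consumed.
        self.buf.pop_front()
    }
}
```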
+#[derive(Clone)] pub struct FileBlockReader<'a> { pub file: &'a VirtualFile, diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index 119df3e6c4..b76498b608 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -212,6 +212,7 @@ impl<'a, const L: usize> OnDiskNode<'a, L> { /// /// Public reader object, to search the tree. /// +#[derive(Clone)] pub struct DiskBtreeReader where R: BlockReader, @@ -259,27 +260,38 @@ where Ok(result) } - pub fn iter<'a>( - &'a self, - start_key: &'a [u8; L], - ctx: &'a RequestContext, - ) -> DiskBtreeIterator<'a> { + pub fn iter<'a>(self, start_key: &'a [u8; L], ctx: &'a RequestContext) -> DiskBtreeIterator<'a> + where + R: 'a, + { DiskBtreeIterator { - stream: Box::pin(self.get_stream_from(start_key, ctx)), + stream: Box::pin(self.into_stream(start_key, ctx)), } } /// Return a stream which yields all key, value pairs from the index /// starting from the first key greater or equal to `start_key`. /// - /// Note that this is a copy of [`Self::visit`]. + /// Note 1: that this is a copy of [`Self::visit`]. /// TODO: Once the sequential read path is removed this will become /// the only index traversal method. - pub fn get_stream_from<'a>( - &'a self, + /// + /// Note 2: this function used to take `&self` but it now consumes `self`. This is due to + /// the lifetime constraints of the reader and the stream / iterator it creates. Using `&self` + /// requires the reader to be present when the stream is used, and this creates a lifetime + /// dependency between the reader and the stream. Now if we want to create an iterator that + /// holds the stream, someone will need to keep a reference to the reader, which is inconvenient + /// to use from the image/delta layer APIs. + /// + /// Feel free to add the `&self` variant back if it's necessary. + pub fn into_stream<'a>( + self, start_key: &'a [u8; L], ctx: &'a RequestContext, - ) -> impl Stream, u64), DiskBtreeError>> + 'a { + ) -> impl Stream, u64), DiskBtreeError>> + 'a + where + R: 'a, + { try_stream! 
{ let mut stack = Vec::new(); stack.push((self.root_blk, None)); diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 5e01ecd71d..ab3ef4980f 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -941,7 +941,7 @@ impl DeltaLayerInner { ); let mut result = Vec::new(); let mut stream = - Box::pin(self.stream_index_forwards(&index_reader, &[0; DELTA_KEY_SIZE], ctx)); + Box::pin(self.stream_index_forwards(index_reader, &[0; DELTA_KEY_SIZE], ctx)); let block_reader = FileBlockReader::new(&self.file, self.file_id); let cursor = block_reader.block_cursor(); let mut buf = Vec::new(); @@ -976,7 +976,7 @@ impl DeltaLayerInner { ctx: &RequestContext, ) -> anyhow::Result> where - Reader: BlockReader, + Reader: BlockReader + Clone, { let ctx = RequestContextBuilder::extend(ctx) .page_content_kind(PageContentKind::DeltaLayerBtreeNode) @@ -986,7 +986,7 @@ impl DeltaLayerInner { let mut range_end_handled = false; let start_key = DeltaKey::from_key_lsn(&range.start, lsn_range.start); - let index_stream = index_reader.get_stream_from(&start_key.0, &ctx); + let index_stream = index_reader.clone().into_stream(&start_key.0, &ctx); let mut index_stream = std::pin::pin!(index_stream); while let Some(index_entry) = index_stream.next().await { @@ -1241,7 +1241,7 @@ impl DeltaLayerInner { block_reader, ); - let stream = self.stream_index_forwards(&tree_reader, &[0u8; DELTA_KEY_SIZE], ctx); + let stream = self.stream_index_forwards(tree_reader, &[0u8; DELTA_KEY_SIZE], ctx); let stream = stream.map_ok(|(key, lsn, pos)| Item::Actual(key, lsn, pos)); // put in a sentinel value for getting the end offset for last item, and not having to // repeat the whole read part @@ -1300,7 +1300,7 @@ impl DeltaLayerInner { offsets.start.pos(), offsets.end.pos(), meta, - max_read_size, + Some(max_read_size), )) } } else { @@ -1459,17 +1459,17 @@ impl DeltaLayerInner { fn stream_index_forwards<'a, R>( &'a self, - reader: &'a DiskBtreeReader, + reader: DiskBtreeReader, start: &'a [u8; DELTA_KEY_SIZE], ctx: &'a RequestContext, ) -> impl futures::stream::Stream< Item = Result<(Key, Lsn, BlobRef), crate::tenant::disk_btree::DiskBtreeError>, > + 'a where - R: BlockReader, + R: BlockReader + 'a, { use futures::stream::TryStreamExt; - let stream = reader.get_stream_from(start, ctx); + let stream = reader.into_stream(start, ctx); stream.map_ok(|(key, value)| { let key = DeltaKey::from_slice(&key); let (key, lsn) = (key.key(), key.lsn()); @@ -1857,7 +1857,7 @@ mod test { .finish(entries_meta.key_range.end, &timeline, &ctx) .await?; - let inner = resident.as_delta(&ctx).await?; + let inner = resident.get_as_delta(&ctx).await?; let file_size = inner.file.metadata().await?.len(); tracing::info!( @@ -2044,11 +2044,11 @@ mod test { let copied_layer = writer.finish(Key::MAX, &branch, ctx).await.unwrap(); - copied_layer.as_delta(ctx).await.unwrap(); + copied_layer.get_as_delta(ctx).await.unwrap(); assert_keys_and_values_eq( - new_layer.as_delta(ctx).await.unwrap(), - copied_layer.as_delta(ctx).await.unwrap(), + new_layer.get_as_delta(ctx).await.unwrap(), + copied_layer.get_as_delta(ctx).await.unwrap(), truncate_at, ctx, ) @@ -2073,7 +2073,7 @@ mod test { source.index_root_blk, &source_reader, ); - let source_stream = source.stream_index_forwards(&source_tree, &start_key, ctx); + let source_stream = source.stream_index_forwards(source_tree, &start_key, ctx); let source_stream = source_stream.filter(|res| match res 
{ Ok((_, lsn, _)) => ready(lsn < &truncated_at), _ => ready(true), @@ -2086,7 +2086,7 @@ mod test { truncated.index_root_blk, &truncated_reader, ); - let truncated_stream = truncated.stream_index_forwards(&truncated_tree, &start_key, ctx); + let truncated_stream = truncated.stream_index_forwards(truncated_tree, &start_key, ctx); let mut truncated_stream = std::pin::pin!(truncated_stream); let mut scratch_left = Vec::new(); diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 06e2f09384..99bce1890d 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -495,7 +495,7 @@ impl ImageLayerInner { let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader); let mut result = Vec::new(); - let mut stream = Box::pin(tree_reader.get_stream_from(&[0; KEY_SIZE], ctx)); + let mut stream = Box::pin(tree_reader.into_stream(&[0; KEY_SIZE], ctx)); let block_reader = FileBlockReader::new(&self.file, self.file_id); let cursor = block_reader.block_cursor(); while let Some(item) = stream.next().await { @@ -544,7 +544,7 @@ impl ImageLayerInner { let mut search_key: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; range.start.write_to_byte_slice(&mut search_key); - let index_stream = tree_reader.get_stream_from(&search_key, &ctx); + let index_stream = tree_reader.clone().into_stream(&search_key, &ctx); let mut index_stream = std::pin::pin!(index_stream); while let Some(index_entry) = index_stream.next().await { @@ -689,6 +689,24 @@ impl ImageLayerInner { }; } } + + #[cfg(test)] + pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> ImageLayerIterator<'a> { + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let tree_reader = + DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader); + ImageLayerIterator { + image_layer: self, + ctx, + index_iter: tree_reader.iter(&[0; KEY_SIZE], ctx), + key_values_batch: std::collections::VecDeque::new(), + is_end: false, + planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner::new( + 1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer. + 1024, // The default value. Unit tests might use a different value + ), + } + } } /// A builder object for constructing a new image layer. @@ -943,11 +961,77 @@ impl Drop for ImageLayerWriter { } } +#[cfg(test)] +pub struct ImageLayerIterator<'a> { + image_layer: &'a ImageLayerInner, + ctx: &'a RequestContext, + planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner, + index_iter: crate::tenant::disk_btree::DiskBtreeIterator<'a>, + key_values_batch: std::collections::VecDeque<(Key, Lsn, Value)>, + is_end: bool, +} + +#[cfg(test)] +impl<'a> ImageLayerIterator<'a> { + /// Retrieve a batch of key-value pairs into the iterator buffer. 
+ async fn next_batch(&mut self) -> anyhow::Result<()> { + assert!(self.key_values_batch.is_empty()); + assert!(!self.is_end); + + let plan = loop { + if let Some(res) = self.index_iter.next().await { + let (raw_key, offset) = res?; + if let Some(batch_plan) = self.planner.handle( + Key::from_slice(&raw_key[..KEY_SIZE]), + self.image_layer.lsn, + offset, + BlobFlag::None, + ) { + break batch_plan; + } + } else { + self.is_end = true; + let payload_end = self.image_layer.index_start_blk as u64 * PAGE_SZ as u64; + break self.planner.handle_range_end(payload_end); + } + }; + let vectored_blob_reader = VectoredBlobReader::new(&self.image_layer.file); + let mut next_batch = std::collections::VecDeque::new(); + let buf_size = plan.size(); + let buf = BytesMut::with_capacity(buf_size); + let blobs_buf = vectored_blob_reader + .read_blobs(&plan, buf, self.ctx) + .await?; + let frozen_buf: Bytes = blobs_buf.buf.freeze(); + for meta in blobs_buf.blobs.iter() { + let img_buf = frozen_buf.slice(meta.start..meta.end); + next_batch.push_back((meta.meta.key, self.image_layer.lsn, Value::Image(img_buf))); + } + self.key_values_batch = next_batch; + Ok(()) + } + + pub async fn next(&mut self) -> anyhow::Result> { + if self.key_values_batch.is_empty() { + if self.is_end { + return Ok(None); + } + self.next_batch().await?; + } + Ok(Some( + self.key_values_batch + .pop_front() + .expect("should not be empty"), + )) + } +} + #[cfg(test)] mod test { - use std::time::Duration; + use std::{sync::Arc, time::Duration}; use bytes::Bytes; + use itertools::Itertools; use pageserver_api::{ key::Key, shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}, @@ -959,11 +1043,19 @@ mod test { }; use crate::{ - tenant::{config::TenantConf, harness::TenantHarness}, + context::RequestContext, + repository::Value, + tenant::{ + config::TenantConf, + harness::{TenantHarness, TIMELINE_ID}, + storage_layer::ResidentLayer, + vectored_blob_io::StreamingVectoredReadPlanner, + Tenant, Timeline, + }, DEFAULT_PG_VERSION, }; - use super::ImageLayerWriter; + use super::{ImageLayerIterator, ImageLayerWriter}; #[tokio::test] async fn image_layer_rewrite() { @@ -1134,4 +1226,111 @@ mod test { } } } + + async fn produce_image_layer( + tenant: &Tenant, + tline: &Arc, + mut images: Vec<(Key, Bytes)>, + lsn: Lsn, + ctx: &RequestContext, + ) -> anyhow::Result { + images.sort(); + let (key_start, _) = images.first().unwrap(); + let (key_last, _) = images.last().unwrap(); + let key_end = key_last.next(); + let key_range = *key_start..key_end; + let mut writer = ImageLayerWriter::new( + tenant.conf, + tline.timeline_id, + tenant.tenant_shard_id, + &key_range, + lsn, + ctx, + ) + .await?; + + for (key, img) in images { + writer.put_image(key, img, ctx).await?; + } + let img_layer = writer.finish(tline, ctx).await?; + + Ok::<_, anyhow::Error>(img_layer) + } + + async fn assert_img_iter_equal( + img_iter: &mut ImageLayerIterator<'_>, + expect: &[(Key, Bytes)], + expect_lsn: Lsn, + ) { + let mut expect_iter = expect.iter(); + loop { + let o1 = img_iter.next().await.unwrap(); + let o2 = expect_iter.next(); + match (o1, o2) { + (None, None) => break, + (Some((k1, l1, v1)), Some((k2, i2))) => { + let Value::Image(i1) = v1 else { + panic!("expect Value::Image") + }; + assert_eq!(&k1, k2); + assert_eq!(l1, expect_lsn); + assert_eq!(&i1, i2); + } + (o1, o2) => panic!("iterators length mismatch: {:?}, {:?}", o1, o2), + } + } + } + + #[tokio::test] + async fn image_layer_iterator() { + let harness = 
TenantHarness::create("image_layer_iterator").unwrap(); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + fn get_key(id: u32) -> Key { + let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + const N: usize = 1000; + let test_imgs = (0..N) + .map(|idx| (get_key(idx as u32), Bytes::from(format!("img{idx:05}")))) + .collect_vec(); + let resident_layer = + produce_image_layer(&tenant, &tline, test_imgs.clone(), Lsn(0x10), &ctx) + .await + .unwrap(); + let img_layer = resident_layer.get_as_image(&ctx).await.unwrap(); + for max_read_size in [1, 1024] { + for batch_size in [1, 2, 4, 8, 3, 7, 13] { + println!("running with batch_size={batch_size} max_read_size={max_read_size}"); + // Test if the batch size is correctly determined + let mut iter = img_layer.iter(&ctx); + iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size); + let mut num_items = 0; + for _ in 0..3 { + iter.next_batch().await.unwrap(); + num_items += iter.key_values_batch.len(); + if max_read_size == 1 { + // every key should be a batch b/c the value is larger than max_read_size + assert_eq!(iter.key_values_batch.len(), 1); + } else { + assert_eq!(iter.key_values_batch.len(), batch_size); + } + if num_items >= N { + break; + } + iter.key_values_batch.clear(); + } + // Test if the result is correct + let mut iter = img_layer.iter(&ctx); + iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size); + assert_img_iter_equal(&mut iter, &test_imgs, Lsn(0x10)).await; + } + } + } } diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 32acb3f0cd..d856909f2e 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1905,7 +1905,7 @@ impl ResidentLayer { } #[cfg(test)] - pub(crate) async fn as_delta( + pub(crate) async fn get_as_delta( &self, ctx: &RequestContext, ) -> anyhow::Result<&delta_layer::DeltaLayerInner> { @@ -1915,6 +1915,18 @@ impl ResidentLayer { Image(_) => Err(anyhow::anyhow!("image layer")), } } + + #[cfg(test)] + pub(crate) async fn get_as_image( + &self, + ctx: &RequestContext, + ) -> anyhow::Result<&image_layer::ImageLayerInner> { + use LayerKind::*; + match self.downloaded.get(&self.owner.0, ctx).await? 
{ + Image(ref d) => Ok(d), + Delta(_) => Err(anyhow::anyhow!("delta layer")), + } + } } impl AsLayerDesc for ResidentLayer { diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 6e825760e3..1241a13902 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -77,7 +77,7 @@ pub(crate) struct VectoredReadBuilder { start: u64, end: u64, blobs_at: VecMap, - max_read_size: usize, + max_read_size: Option, } impl VectoredReadBuilder { @@ -90,7 +90,7 @@ impl VectoredReadBuilder { start_offset: u64, end_offset: u64, meta: BlobMeta, - max_read_size: usize, + max_read_size: Option, ) -> Self { let mut blobs_at = VecMap::default(); blobs_at @@ -111,7 +111,13 @@ impl VectoredReadBuilder { pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended { tracing::trace!(start, end, "trying to extend"); let size = (end - start) as usize; - if self.end == start && self.size() + size <= self.max_read_size { + if self.end == start && { + if let Some(max_read_size) = self.max_read_size { + self.size() + size <= max_read_size + } else { + true + } + } { self.end = end; self.blobs_at .append(start, meta) @@ -157,7 +163,7 @@ pub struct VectoredReadPlanner { // Arguments for previous blob passed into [`VectoredReadPlanner::handle`] prev: Option<(Key, Lsn, u64, BlobFlag)>, - max_read_size: usize, + max_read_size: Option, } impl VectoredReadPlanner { @@ -165,7 +171,20 @@ impl VectoredReadPlanner { Self { blobs: BTreeMap::new(), prev: None, - max_read_size, + max_read_size: Some(max_read_size), + } + } + + /// This function should *only* be used if the caller has a way to control the limit. e.g., in [`StreamingVectoredReadPlanner`], + /// it uses the vectored read planner to avoid duplicated logic on handling blob start/end, while expecting the vectored + /// read planner to give a single read to a continuous range of bytes in the image layer. Therefore, it does not need the + /// code path to split reads into chunks of `max_read_size`, and controls the read size itself. + #[cfg(test)] + pub(crate) fn new_caller_controlled_max_limit() -> Self { + Self { + blobs: BTreeMap::new(), + prev: None, + max_read_size: None, } } @@ -354,6 +373,87 @@ impl<'a> VectoredBlobReader<'a> { } } +/// Read planner used in [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`]. It provides a streaming API for +/// getting read blobs. It returns a batch when `handle` gets called and when the current key would exceed the read_size and +/// max_cnt constraints. Underlying it uses [`VectoredReadPlanner`]. +#[cfg(test)] +pub struct StreamingVectoredReadPlanner { + planner: VectoredReadPlanner, + /// Max read size per batch + max_read_size: u64, + /// Max item count per batch + max_cnt: usize, + /// The first offset of this batch + this_batch_first_offset: Option, + /// Size of the current batch + cnt: usize, +} + +#[cfg(test)] +impl StreamingVectoredReadPlanner { + pub fn new(max_read_size: u64, max_cnt: usize) -> Self { + assert!(max_cnt > 0); + assert!(max_read_size > 0); + Self { + // We want to have exactly one read syscall (plus several others for index lookup) for each `next_batch` call. + // Therefore, we enforce `self.max_read_size` by ourselves instead of using the VectoredReadPlanner's capability, + // to avoid splitting into two I/Os. 
+ planner: VectoredReadPlanner::new_caller_controlled_max_limit(), + max_cnt, + max_read_size, + this_batch_first_offset: None, + cnt: 0, + } + } + + fn emit(&mut self, this_batch_first_offset: u64) -> VectoredRead { + let planner = std::mem::replace( + &mut self.planner, + VectoredReadPlanner::new_caller_controlled_max_limit(), + ); + self.this_batch_first_offset = Some(this_batch_first_offset); + self.cnt = 1; + let mut batch = planner.finish(); + assert_eq!(batch.len(), 1, "should have exactly one read batch"); + batch.pop().unwrap() + } + + pub fn handle( + &mut self, + key: Key, + lsn: Lsn, + offset: u64, + flag: BlobFlag, + ) -> Option { + if let Some(begin_offset) = self.this_batch_first_offset { + // Each batch will have at least one item b/c `self.this_batch_first_offset` is set + // after one item gets processed + if offset - begin_offset > self.max_read_size { + self.planner.handle_range_end(offset); // End the current batch with the offset + let batch = self.emit(offset); // Produce a batch + self.planner.handle(key, lsn, offset, flag); // Add this key to the next batch + return Some(batch); + } + } else { + self.this_batch_first_offset = Some(offset) + } + if self.cnt >= self.max_cnt { + self.planner.handle_range_end(offset); // End the current batch with the offset + let batch = self.emit(offset); // Produce a batch + self.planner.handle(key, lsn, offset, flag); // Add this key to the next batch + return Some(batch); + } + self.planner.handle(key, lsn, offset, flag); // Add this key to the current batch + self.cnt += 1; + None + } + + pub fn handle_range_end(&mut self, offset: u64) -> VectoredRead { + self.planner.handle_range_end(offset); + self.emit(offset) + } +} + #[cfg(test)] mod tests { use super::*; From 9b98823d615c991422b6edd3ec3197192f763cf2 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Tue, 25 Jun 2024 19:00:14 -0400 Subject: [PATCH 1049/1571] bottom-most-compaction: use in test_gc_feedback + fix bugs (#8103) Adds manual compaction trigger; add gc compaction to test_gc_feedback Part of https://github.com/neondatabase/neon/issues/8002 ``` test_gc_feedback[debug-pg15].logical_size: 50 Mb test_gc_feedback[debug-pg15].physical_size: 2269 Mb test_gc_feedback[debug-pg15].physical/logical ratio: 44.5302 test_gc_feedback[debug-pg15].max_total_num_of_deltas: 7 test_gc_feedback[debug-pg15].max_num_of_deltas_above_image: 2 test_gc_feedback[debug-pg15].logical_size_after_bottom_most_compaction: 50 Mb test_gc_feedback[debug-pg15].physical_size_after_bottom_most_compaction: 287 Mb test_gc_feedback[debug-pg15].physical/logical ratio after bottom_most_compaction: 5.6312 test_gc_feedback[debug-pg15].max_total_num_of_deltas_after_bottom_most_compaction: 4 test_gc_feedback[debug-pg15].max_num_of_deltas_above_image_after_bottom_most_compaction: 1 ``` ## Summary of changes * Add the manual compaction trigger * Use in test_gc_feedback * Add a guard to avoid running it with retain_lsns * Fix: Do `schedule_compaction_update` after compaction * Fix: Supply deltas in the correct order to reconstruct value --------- Signed-off-by: Alex Chi Z --- pageserver/src/http/routes.rs | 8 ++++ .../src/tenant/storage_layer/delta_layer.rs | 1 - .../src/tenant/storage_layer/image_layer.rs | 1 - pageserver/src/tenant/storage_layer/layer.rs | 2 - pageserver/src/tenant/timeline.rs | 2 +- pageserver/src/tenant/timeline/compaction.rs | 41 ++++++++++++---- .../src/tenant/timeline/layer_manager.rs | 1 - test_runner/fixtures/pageserver/http.py | 3 ++ test_runner/performance/test_gc_feedback.py | 48 
++++++++++++++++++- 9 files changed, 92 insertions(+), 15 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index d6ba9ee35e..41d096d7bb 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1652,6 +1652,14 @@ async fn timeline_compact_handler( if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? { flags |= CompactFlags::ForceImageLayerCreation; } + if Some(true) == parse_query_param::<_, bool>(&request, "enhanced_gc_bottom_most_compaction")? { + if !cfg!(feature = "testing") { + return Err(ApiError::InternalServerError(anyhow!( + "enhanced_gc_bottom_most_compaction is only available in testing mode" + ))); + } + flags |= CompactFlags::EnhancedGcBottomMostCompaction; + } let wait_until_uploaded = parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false); diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index ab3ef4980f..bf5d9249eb 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -928,7 +928,6 @@ impl DeltaLayerInner { } /// Load all key-values in the delta layer, should be replaced by an iterator-based interface in the future. - #[cfg(test)] pub(super) async fn load_key_values( &self, ctx: &RequestContext, diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 99bce1890d..50aacbd9ad 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -486,7 +486,6 @@ impl ImageLayerInner { } /// Load all key-values in the delta layer, should be replaced by an iterator-based interface in the future. - #[cfg(test)] pub(super) async fn load_key_values( &self, ctx: &RequestContext, diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index d856909f2e..7eb42d8186 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -389,7 +389,6 @@ impl Layer { } /// Get all key/values in the layer. Should be replaced with an iterator-based API in the future. - #[cfg(test)] pub(crate) async fn load_key_values( &self, ctx: &RequestContext, @@ -1774,7 +1773,6 @@ impl DownloadedLayer { } } - #[cfg(test)] async fn load_key_values( &self, owner: &Arc, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 5398ad399c..1175b75017 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -686,6 +686,7 @@ pub enum GetLogicalSizePriority { pub(crate) enum CompactFlags { ForceRepartition, ForceImageLayerCreation, + EnhancedGcBottomMostCompaction, } impl std::fmt::Debug for Timeline { @@ -1096,7 +1097,6 @@ impl Timeline { /// scan iterator interface. We could optimize this interface later to avoid some checks in the vectored /// get path to maintain and split the probing and to-be-probe keyspace. We also need to ensure that /// the scan operation will not cause OOM in the future. 
- #[allow(dead_code)] pub(crate) async fn scan( &self, keyspace: KeySpace, diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index de1263fadf..efaa6144af 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -47,10 +47,14 @@ impl Timeline { /// TODO: cancellation pub(crate) async fn compact_legacy( self: &Arc, - _cancel: &CancellationToken, + cancel: &CancellationToken, flags: EnumSet, ctx: &RequestContext, ) -> Result<(), CompactionError> { + if flags.contains(CompactFlags::EnhancedGcBottomMostCompaction) { + return self.compact_with_gc(cancel, ctx).await; + } + // High level strategy for compaction / image creation: // // 1. First, calculate the desired "partitioning" of the @@ -959,15 +963,20 @@ impl Timeline { /// the GC horizon without considering retain_lsns. Then, it does a full compaction over all these delta /// layers and image layers, which generates image layers on the gc horizon, drop deltas below gc horizon, /// and create delta layers with all deltas >= gc horizon. - #[cfg(test)] pub(crate) async fn compact_with_gc( self: &Arc, _cancel: &CancellationToken, ctx: &RequestContext, ) -> Result<(), CompactionError> { + use crate::tenant::storage_layer::ValueReconstructState; use std::collections::BTreeSet; - use crate::tenant::storage_layer::ValueReconstructState; + info!("running enhanced gc bottom-most compaction"); + + scopeguard::defer! { + info!("done enhanced gc bottom-most compaction"); + }; + // Step 0: pick all delta layers + image layers below/intersect with the GC horizon. // The layer selection has the following properties: // 1. If a layer is in the selection, all layers below it are in the selection. @@ -976,6 +985,11 @@ impl Timeline { let guard = self.layers.read().await; let layers = guard.layer_map(); let gc_info = self.gc_info.read().unwrap(); + if !gc_info.retain_lsns.is_empty() || !gc_info.leases.is_empty() { + return Err(CompactionError::Other(anyhow!( + "enhanced legacy compaction currently does not support retain_lsns (branches)" + ))); + } let gc_cutoff = Lsn::min(gc_info.cutoffs.horizon, gc_info.cutoffs.pitr); let mut selected_layers = Vec::new(); // TODO: consider retain_lsns @@ -987,6 +1001,11 @@ impl Timeline { } (selected_layers, gc_cutoff) }; + info!( + "picked {} layers for compaction with gc_cutoff={}", + layer_selection.len(), + gc_cutoff + ); // Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs. // Also, collect the layer information to decide when to split the new delta layers. 
let mut all_key_values = Vec::new(); @@ -1064,10 +1083,8 @@ impl Timeline { } else if *lsn <= horizon { match val { crate::repository::Value::Image(image) => { - if lsn <= &horizon { - base_image = Some((*lsn, image.clone())); - break; - } + base_image = Some((*lsn, image.clone())); + break; } crate::repository::Value::WalRecord(wal) => { delta_above_base_image.push((*lsn, wal.clone())); @@ -1075,7 +1092,7 @@ impl Timeline { } } } - delta_above_base_image.reverse(); + // do not reverse delta_above_base_image, reconstruct state expects reversely-ordered records keys_above_horizon.reverse(); let state = ValueReconstructState { img: base_image, @@ -1200,6 +1217,11 @@ impl Timeline { ); let image_layer = image_layer_writer.finish(self, ctx).await?; + info!( + "produced {} delta layers and {} image layers", + delta_layers.len(), + 1 + ); let mut compact_to = Vec::new(); compact_to.extend(delta_layers); compact_to.push(image_layer); @@ -1208,6 +1230,9 @@ impl Timeline { let mut guard = self.layers.write().await; guard.finish_gc_compaction(&layer_selection, &compact_to, &self.metrics) }; + + self.remote_client + .schedule_compaction_update(&layer_selection, &compact_to)?; Ok(()) } } diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 550a9a567a..948237e06a 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -227,7 +227,6 @@ impl LayerManager { } /// Called when a GC-compaction is completed. - #[cfg(test)] pub(crate) fn finish_gc_compaction( &mut self, compact_from: &[Layer], diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 2a7cbea200..7949612714 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -573,6 +573,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): force_repartition=False, force_image_layer_creation=False, wait_until_uploaded=False, + enhanced_gc_bottom_most_compaction=False, ): self.is_testing_enabled_or_skip() query = {} @@ -582,6 +583,8 @@ class PageserverHttpClient(requests.Session, MetricsGetter): query["force_image_layer_creation"] = "true" if wait_until_uploaded: query["wait_until_uploaded"] = "true" + if enhanced_gc_bottom_most_compaction: + query["enhanced_gc_bottom_most_compaction"] = "true" log.info(f"Requesting compact: tenant {tenant_id}, timeline {timeline_id}") res = self.put( diff --git a/test_runner/performance/test_gc_feedback.py b/test_runner/performance/test_gc_feedback.py index 9a03994b29..4c326111c2 100644 --- a/test_runner/performance/test_gc_feedback.py +++ b/test_runner/performance/test_gc_feedback.py @@ -33,7 +33,7 @@ def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma "checkpoint_distance": f"{1024 ** 2}", "compaction_target_size": f"{1024 ** 2}", # set PITR interval to be small, so we can do GC - "pitr_interval": "10 s", + "pitr_interval": "60 s", # "compaction_threshold": "3", # "image_creation_threshold": "2", } @@ -99,6 +99,52 @@ def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma MetricReport.LOWER_IS_BETTER, ) + client.timeline_compact(tenant_id, timeline_id, enhanced_gc_bottom_most_compaction=True) + tline_detail = client.timeline_detail(tenant_id, timeline_id) + logical_size = tline_detail["current_logical_size"] + physical_size = tline_detail["current_physical_size"] + + max_num_of_deltas_above_image = 0 + max_total_num_of_deltas = 0 + for key_range 
in client.perf_info(tenant_id, timeline_id): + max_total_num_of_deltas = max(max_total_num_of_deltas, key_range["total_num_of_deltas"]) + max_num_of_deltas_above_image = max( + max_num_of_deltas_above_image, key_range["num_of_deltas_above_image"] + ) + zenbenchmark.record( + "logical_size_after_bottom_most_compaction", + logical_size // MB, + "Mb", + MetricReport.LOWER_IS_BETTER, + ) + zenbenchmark.record( + "physical_size_after_bottom_most_compaction", + physical_size // MB, + "Mb", + MetricReport.LOWER_IS_BETTER, + ) + zenbenchmark.record( + "physical/logical ratio after bottom_most_compaction", + physical_size / logical_size, + "", + MetricReport.LOWER_IS_BETTER, + ) + zenbenchmark.record( + "max_total_num_of_deltas_after_bottom_most_compaction", + max_total_num_of_deltas, + "", + MetricReport.LOWER_IS_BETTER, + ) + zenbenchmark.record( + "max_num_of_deltas_above_image_after_bottom_most_compaction", + max_num_of_deltas_above_image, + "", + MetricReport.LOWER_IS_BETTER, + ) + + with endpoint.cursor() as cur: + cur.execute("SELECT * FROM t") # ensure data is not corrupted + layer_map_path = env.repo_dir / "layer-map.json" log.info(f"Writing layer map to {layer_map_path}") with layer_map_path.open("w") as f: From 9b623d3a2cc8048de2b5b8475bb51a747037aa4b Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Wed, 26 Jun 2024 07:46:52 +0200 Subject: [PATCH 1050/1571] add commit hash to S3 object identifier for artifacts on S3 (#8161) In future we may want to run periodic tests on dedicated cloud instances that are not GitHub action runners. To allow these to download artifact binaries for a specific commit hash we want to make the search by commit hash possible and prefix the S3 objects with `artifacts/${GITHUB_SHA}/${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}` --------- Co-authored-by: Alexander Bayandin --- .github/actions/download/action.yml | 2 +- .github/actions/upload/action.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/actions/download/action.yml b/.github/actions/download/action.yml index ce26e7825b..01c216b1ac 100644 --- a/.github/actions/download/action.yml +++ b/.github/actions/download/action.yml @@ -26,7 +26,7 @@ runs: TARGET: ${{ inputs.path }} ARCHIVE: /tmp/downloads/${{ inputs.name }}.tar.zst SKIP_IF_DOES_NOT_EXIST: ${{ inputs.skip-if-does-not-exist }} - PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}', github.run_id, github.run_attempt) }} + PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}/{2}', github.event.pull_request.head.sha || github.sha, github.run_id, github.run_attempt) }} run: | BUCKET=neon-github-public-dev FILENAME=$(basename $ARCHIVE) diff --git a/.github/actions/upload/action.yml b/.github/actions/upload/action.yml index 63973dfbe7..edcece7d2b 100644 --- a/.github/actions/upload/action.yml +++ b/.github/actions/upload/action.yml @@ -8,7 +8,7 @@ inputs: description: "A directory or file to upload" required: true prefix: - description: "S3 prefix. Default is '${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'" + description: "S3 prefix. 
Default is '${GITHUB_SHA}/${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'" required: false runs: @@ -45,7 +45,7 @@ runs: env: SOURCE: ${{ inputs.path }} ARCHIVE: /tmp/uploads/${{ inputs.name }}.tar.zst - PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}', github.run_id, github.run_attempt) }} + PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}/{2}', github.event.pull_request.head.sha || github.sha, github.run_id , github.run_attempt) }} run: | BUCKET=neon-github-public-dev FILENAME=$(basename $ARCHIVE) From fdadd6a15216e97dc5ee55c74be92030087c06e1 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 26 Jun 2024 15:13:03 +0300 Subject: [PATCH 1051/1571] Remove primary_is_running (#8162) This was a half-finished mechanism to allow a replica to enter hot standby mode sooner, without waiting for a running-xacts record. It had issues, and we are working on a better mechanism to replace it. The control plane might still set the flag in the spec file, but compute_ctl will simply ignore it. --- compute_tools/src/config.rs | 6 ------ control_plane/src/endpoint.rs | 1 - libs/compute_api/src/spec.rs | 6 ------ pgxn/neon/neon.c | 10 ---------- test_runner/fixtures/neon_fixtures.py | 1 - 5 files changed, 24 deletions(-) diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 2c4aec4116..479100eb89 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -83,12 +83,6 @@ pub fn write_postgres_conf( ComputeMode::Replica => { // hot_standby is 'on' by default, but let's be explicit writeln!(file, "hot_standby=on")?; - - // Inform the replica about the primary state - // Default is 'false' - if let Some(primary_is_running) = spec.primary_is_running { - writeln!(file, "neon.primary_is_running={}", primary_is_running)?; - } } } diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 20371e1cb8..b928bbfc30 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -592,7 +592,6 @@ impl Endpoint { remote_extensions, pgbouncer_settings: None, shard_stripe_size: Some(shard_stripe_size), - primary_is_running: None, }; let spec_path = self.endpoint_path().join("spec.json"); std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?; diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 1c4ee2089f..883c624f71 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -96,12 +96,6 @@ pub struct ComputeSpec { // Stripe size for pageserver sharding, in pages #[serde(default)] pub shard_stripe_size: Option, - - // When we are starting a new replica in hot standby mode, - // we need to know if the primary is running. - // This is used to determine if replica should wait for - // RUNNING_XACTS from primary or not. - pub primary_is_running: Option, } /// Feature flag to signal `compute_ctl` to enable certain experimental functionality. diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 276d1542fe..b6b2db7e71 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -41,7 +41,6 @@ PG_MODULE_MAGIC; void _PG_init(void); static int logical_replication_max_snap_files = 300; -bool primary_is_running = false; static void InitLogicalReplicationMonitor(void) @@ -289,15 +288,6 @@ _PG_init(void) pg_init_extension_server(); - DefineCustomBoolVariable( - "neon.primary_is_running", - "true if the primary was running at replica startup. 
false otherwise", - NULL, - &primary_is_running, - false, - PGC_POSTMASTER, - 0, - NULL, NULL, NULL); /* * Important: This must happen after other parts of the extension are * loaded, otherwise any settings to GUCs that were set before the diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 84fb1f7cb4..d8da2a3a3e 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3539,7 +3539,6 @@ class Endpoint(PgProtocol, LogUtils): # and make tests more stable. config_lines = ["max_replication_write_lag=15MB"] + config_lines - config_lines = ["neon.primary_is_running=on"] + config_lines self.config(config_lines) return self From 5d2f9ffa89bf98290344aed0a22fcede04664831 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 26 Jun 2024 09:34:41 -0400 Subject: [PATCH 1052/1571] test(bottom-most-compaction): wal apply order (#8163) A follow-up on https://github.com/neondatabase/neon/pull/8103/. Previously, main branch fails with: ``` assertion `left == right` failed left: b"value 3@0x10@0x30@0x28@0x40" right: b"value 3@0x10@0x28@0x30@0x40" ``` This gets fixed after #8103 gets merged. Signed-off-by: Alex Chi Z --- pageserver/src/tenant.rs | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 4e03e09a9b..30e855eaa2 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -7069,6 +7069,16 @@ mod tests { Lsn(0x30), Value::WalRecord(NeonWalRecord::wal_append("@0x30")), ), + ( + get_key(3), + Lsn(0x28), + Value::WalRecord(NeonWalRecord::wal_append("@0x28")), + ), + ( + get_key(3), + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append("@0x30")), + ), ( get_key(3), Lsn(0x40), @@ -7128,7 +7138,7 @@ mod tests { Bytes::from_static(b"value 0@0x10"), Bytes::from_static(b"value 1@0x10@0x20"), Bytes::from_static(b"value 2@0x10@0x30"), - Bytes::from_static(b"value 3@0x10@0x40"), + Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"), Bytes::from_static(b"value 4@0x10"), Bytes::from_static(b"value 5@0x10@0x20"), Bytes::from_static(b"value 6@0x10@0x20"), @@ -7141,7 +7151,7 @@ mod tests { Bytes::from_static(b"value 0@0x10"), Bytes::from_static(b"value 1@0x10@0x20"), Bytes::from_static(b"value 2@0x10@0x30"), - Bytes::from_static(b"value 3@0x10"), + Bytes::from_static(b"value 3@0x10@0x28@0x30"), Bytes::from_static(b"value 4@0x10"), Bytes::from_static(b"value 5@0x10@0x20"), Bytes::from_static(b"value 6@0x10@0x20"), From 47e5bf3bbbb97b3f95d545a03bc0c20c782eb806 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Wed, 26 Jun 2024 15:26:52 +0100 Subject: [PATCH 1053/1571] Improve term reject message in walproposer (#8164) Co-authored-by: Tristan Partin --- pgxn/neon/walproposer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index dbc67a24f5..c53257923a 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -1447,7 +1447,7 @@ RecvAppendResponses(Safekeeper *sk) * core as this is kinda expected scenario. 
*/ disable_core_dump(); - wp_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "", + wp_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT ", meaning another compute is running at the same time, and it conflicts with us", sk->host, sk->port, sk->appendResponse.term, wp->propTerm); } From d7e349d33c019554402ff2b4e519a668638ec88f Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 26 Jun 2024 16:11:26 +0100 Subject: [PATCH 1054/1571] proxy: report blame for passthrough disconnect io errors (#8170) ## Problem Hard to debug the disconnection reason currently. ## Summary of changes Keep track of error-direction, and therefore error source (client vs compute) during passthrough. --- proxy/src/bin/pg_sni_router.rs | 9 ++-- proxy/src/proxy.rs | 8 +++- proxy/src/proxy/copy_bidirectional.rs | 66 ++++++++++++++++++++------- proxy/src/proxy/passthrough.rs | 10 ++-- proxy/src/serverless/websocket.rs | 8 +++- 5 files changed, 75 insertions(+), 26 deletions(-) diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index e1674049a6..44e880838e 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -10,7 +10,7 @@ use itertools::Itertools; use proxy::config::TlsServerEndPoint; use proxy::context::RequestMonitoring; use proxy::metrics::{Metrics, ThreadPoolMetrics}; -use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled}; +use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource}; use rustls::pki_types::PrivateKeyDer; use tokio::net::TcpListener; @@ -286,7 +286,10 @@ async fn handle_client( // Starting from here we only proxy the client's traffic. info!("performing the proxy pass..."); - let _ = copy_bidirectional_client_compute(&mut tls_stream, &mut client).await?; - Ok(()) + match copy_bidirectional_client_compute(&mut tls_stream, &mut client).await { + Ok(_) => Ok(()), + Err(ErrorSource::Client(err)) => Err(err).context("client"), + Err(ErrorSource::Compute(err)) => Err(err).context("compute"), + } } diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 072f51958f..3edefcf21a 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -8,6 +8,7 @@ pub mod passthrough; pub mod retry; pub mod wake_compute; pub use copy_bidirectional::copy_bidirectional_client_compute; +pub use copy_bidirectional::ErrorSource; use crate::{ auth, @@ -148,8 +149,11 @@ pub async fn task_main( ctx.log_connect(); match p.proxy_pass().instrument(span.clone()).await { Ok(()) => {} - Err(e) => { - error!(parent: &span, "per-client task finished with an error: {e:#}"); + Err(ErrorSource::Client(e)) => { + error!(parent: &span, "per-client task finished with an IO error from the client: {e:#}"); + } + Err(ErrorSource::Compute(e)) => { + error!(parent: &span, "per-client task finished with an IO error from the compute: {e:#}"); } } } diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs index aaf3688f21..3c45fff969 100644 --- a/proxy/src/proxy/copy_bidirectional.rs +++ b/proxy/src/proxy/copy_bidirectional.rs @@ -13,12 +13,39 @@ enum TransferState { Done(u64), } +#[derive(Debug)] +pub enum ErrorDirection { + Read(io::Error), + Write(io::Error), +} + +impl ErrorSource { + fn from_client(err: ErrorDirection) -> ErrorSource { + match err { + ErrorDirection::Read(client) => Self::Client(client), + ErrorDirection::Write(compute) => Self::Compute(compute), + } + } + fn from_compute(err: 
ErrorDirection) -> ErrorSource { + match err { + ErrorDirection::Write(client) => Self::Client(client), + ErrorDirection::Read(compute) => Self::Compute(compute), + } + } +} + +#[derive(Debug)] +pub enum ErrorSource { + Client(io::Error), + Compute(io::Error), +} + fn transfer_one_direction( cx: &mut Context<'_>, state: &mut TransferState, r: &mut A, w: &mut B, -) -> Poll> +) -> Poll> where A: AsyncRead + AsyncWrite + Unpin + ?Sized, B: AsyncRead + AsyncWrite + Unpin + ?Sized, @@ -32,7 +59,7 @@ where *state = TransferState::ShuttingDown(count); } TransferState::ShuttingDown(count) => { - ready!(w.as_mut().poll_shutdown(cx))?; + ready!(w.as_mut().poll_shutdown(cx)).map_err(ErrorDirection::Write)?; *state = TransferState::Done(*count); } TransferState::Done(count) => return Poll::Ready(Ok(*count)), @@ -44,7 +71,7 @@ where pub async fn copy_bidirectional_client_compute( client: &mut Client, compute: &mut Compute, -) -> Result<(u64, u64), std::io::Error> +) -> Result<(u64, u64), ErrorSource> where Client: AsyncRead + AsyncWrite + Unpin + ?Sized, Compute: AsyncRead + AsyncWrite + Unpin + ?Sized, @@ -54,9 +81,11 @@ where poll_fn(|cx| { let mut client_to_compute_result = - transfer_one_direction(cx, &mut client_to_compute, client, compute)?; + transfer_one_direction(cx, &mut client_to_compute, client, compute) + .map_err(ErrorSource::from_client)?; let mut compute_to_client_result = - transfer_one_direction(cx, &mut compute_to_client, compute, client)?; + transfer_one_direction(cx, &mut compute_to_client, compute, client) + .map_err(ErrorSource::from_compute)?; // Early termination checks from compute to client. if let TransferState::Done(_) = compute_to_client { @@ -65,18 +94,20 @@ where // Initiate shutdown client_to_compute = TransferState::ShuttingDown(buf.amt); client_to_compute_result = - transfer_one_direction(cx, &mut client_to_compute, client, compute)?; + transfer_one_direction(cx, &mut client_to_compute, client, compute) + .map_err(ErrorSource::from_client)?; } } - // Early termination checks from compute to client. + // Early termination checks from client to compute. 
if let TransferState::Done(_) = client_to_compute { if let TransferState::Running(buf) = &compute_to_client { info!("Client is done, terminate compute"); // Initiate shutdown compute_to_client = TransferState::ShuttingDown(buf.amt); compute_to_client_result = - transfer_one_direction(cx, &mut compute_to_client, client, compute)?; + transfer_one_direction(cx, &mut compute_to_client, compute, client) + .map_err(ErrorSource::from_compute)?; } } @@ -138,7 +169,7 @@ impl CopyBuffer { cx: &mut Context<'_>, mut reader: Pin<&mut R>, mut writer: Pin<&mut W>, - ) -> Poll> + ) -> Poll> where R: AsyncRead + ?Sized, W: AsyncWrite + ?Sized, @@ -149,11 +180,11 @@ impl CopyBuffer { // Top up the buffer towards full if we can read a bit more // data - this should improve the chances of a large write if !me.read_done && me.cap < me.buf.len() { - ready!(me.poll_fill_buf(cx, reader.as_mut()))?; + ready!(me.poll_fill_buf(cx, reader.as_mut())).map_err(ErrorDirection::Read)?; } Poll::Pending } - res => res, + res => res.map_err(ErrorDirection::Write), } } @@ -162,7 +193,7 @@ impl CopyBuffer { cx: &mut Context<'_>, mut reader: Pin<&mut R>, mut writer: Pin<&mut W>, - ) -> Poll> + ) -> Poll> where R: AsyncRead + ?Sized, W: AsyncWrite + ?Sized, @@ -176,12 +207,13 @@ impl CopyBuffer { match self.poll_fill_buf(cx, reader.as_mut()) { Poll::Ready(Ok(())) => (), - Poll::Ready(Err(err)) => return Poll::Ready(Err(err)), + Poll::Ready(Err(err)) => return Poll::Ready(Err(ErrorDirection::Read(err))), Poll::Pending => { // Try flushing when the reader has no progress to avoid deadlock // when the reader depends on buffered writer. if self.need_flush { - ready!(writer.as_mut().poll_flush(cx))?; + ready!(writer.as_mut().poll_flush(cx)) + .map_err(ErrorDirection::Write)?; self.need_flush = false; } @@ -194,10 +226,10 @@ impl CopyBuffer { while self.pos < self.cap { let i = ready!(self.poll_write_buf(cx, reader.as_mut(), writer.as_mut()))?; if i == 0 { - return Poll::Ready(Err(io::Error::new( + return Poll::Ready(Err(ErrorDirection::Write(io::Error::new( io::ErrorKind::WriteZero, "write zero byte into writer", - ))); + )))); } else { self.pos += i; self.amt += i as u64; @@ -216,7 +248,7 @@ impl CopyBuffer { // If we've written all the data and we've seen EOF, flush out the // data and finish the transfer. if self.pos == self.cap && self.read_done { - ready!(writer.as_mut().poll_flush(cx))?; + ready!(writer.as_mut().poll_flush(cx)).map_err(ErrorDirection::Write)?; return Poll::Ready(Ok(self.amt)); } } diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index 62de79946f..9942fac383 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -10,13 +10,15 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; use utils::measured_stream::MeasuredStream; +use super::copy_bidirectional::ErrorSource; + /// Forward bytes in both directions (client <-> compute). 
#[tracing::instrument(skip_all)] pub async fn proxy_pass( client: impl AsyncRead + AsyncWrite + Unpin, compute: impl AsyncRead + AsyncWrite + Unpin, aux: MetricsAuxInfo, -) -> anyhow::Result<()> { +) -> Result<(), ErrorSource> { let usage = USAGE_METRICS.register(Ids { endpoint_id: aux.endpoint_id, branch_id: aux.branch_id, @@ -66,9 +68,11 @@ pub struct ProxyPassthrough { } impl ProxyPassthrough { - pub async fn proxy_pass(self) -> anyhow::Result<()> { + pub async fn proxy_pass(self) -> Result<(), ErrorSource> { let res = proxy_pass(self.client, self.compute.stream, self.aux).await; - self.compute.cancel_closure.try_cancel_query().await?; + if let Err(err) = self.compute.cancel_closure.try_cancel_query().await { + tracing::error!(?err, "could not cancel the query in the database"); + } res } } diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index 0e9772733d..0d5b88f07b 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -1,3 +1,4 @@ +use crate::proxy::ErrorSource; use crate::{ cancellation::CancellationHandlerMain, config::ProxyConfig, @@ -7,6 +8,7 @@ use crate::{ proxy::{handle_client, ClientMode}, rate_limiter::EndpointRateLimiter, }; +use anyhow::Context as _; use bytes::{Buf, BufMut, Bytes, BytesMut}; use framed_websockets::{Frame, OpCode, WebSocketServer}; use futures::{Sink, Stream}; @@ -165,7 +167,11 @@ pub async fn serve_websocket( Ok(Some(p)) => { ctx.set_success(); ctx.log_connect(); - p.proxy_pass().await + match p.proxy_pass().await { + Ok(()) => Ok(()), + Err(ErrorSource::Client(err)) => Err(err).context("client"), + Err(ErrorSource::Compute(err)) => Err(err).context("compute"), + } } } } From 5af9660b9e4ad804433335662a3c1bf79cfeb637 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 26 Jun 2024 16:37:04 +0100 Subject: [PATCH 1055/1571] CI(build-tools): don't install Postgres 14 (#6540) ## Problem We install Postgres 14 in `build-tools` image, but we don't need it. We use Postgres binaries, which we build ourselves. 
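For reviewers, a minimal sketch of the lookup this change relies on (it mirrors the `PSQL` fixture change below; `POSTGRES_DISTRIB_DIR` and `DEFAULT_PG_VERSION` are the environment variables the test fixtures already consume, while `find_psql` is only an illustrative name and not part of the diff):

```python
import os
import shutil
from pathlib import Path

def find_psql(binary: str = "psql") -> str:
    # Prefer the Postgres distribution we build ourselves
    # (POSTGRES_DISTRIB_DIR/v{DEFAULT_PG_VERSION}/bin) over any
    # system-wide postgresql-client package in the build-tools image.
    search_path = None
    distrib_dir = os.getenv("POSTGRES_DISTRIB_DIR")
    pg_version = os.getenv("DEFAULT_PG_VERSION")
    if distrib_dir is not None and pg_version is not None:
        search_path = str(Path(distrib_dir) / f"v{pg_version}" / "bin")

    full_path = shutil.which(binary, path=search_path)
    assert full_path is not None, f"{binary} not found in the built Postgres distribution"
    return full_path
```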
## Summary of changes - Remove Postgresql 14 installation from `build-tools` image --- Dockerfile.build-tools | 7 ------- test_runner/fixtures/neon_fixtures.py | 11 +++++++++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index 5dd2c13c0e..a1483e550e 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -73,13 +73,6 @@ RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ && bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* -# PostgreSQL 14 -RUN curl -fsSL 'https://www.postgresql.org/media/keys/ACCC4CF8.asc' | apt-key add - \ - && echo 'deb http://apt.postgresql.org/pub/repos/apt bullseye-pgdg main' > /etc/apt/sources.list.d/pgdg.list \ - && apt update \ - && apt install -y postgresql-client-14 \ - && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* - # AWS CLI RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" \ && unzip -q awscliv2.zip \ diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index d8da2a3a3e..7453637218 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3075,9 +3075,16 @@ class PSQL: host: str = "127.0.0.1", port: int = 5432, ): - assert shutil.which(path) + search_path = None + if (d := os.getenv("POSTGRES_DISTRIB_DIR")) is not None and ( + v := os.getenv("DEFAULT_PG_VERSION") + ) is not None: + search_path = Path(d) / f"v{v}" / "bin" - self.path = path + full_path = shutil.which(path, path=search_path) + assert full_path is not None + + self.path = full_path self.database_url = f"postgres://{host}:{port}/main?options=project%3Dgeneric-project-name" async def run(self, query: Optional[str] = None) -> asyncio.subprocess.Process: From 3118c245213af0cdcd890cd559567366d7a85b0e Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Wed, 26 Jun 2024 16:46:14 +0100 Subject: [PATCH 1056/1571] Panic on unexpected error in simtests (#8169) --- safekeeper/tests/random_test.rs | 2 +- safekeeper/tests/walproposer_sim/safekeeper.rs | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/safekeeper/tests/random_test.rs b/safekeeper/tests/random_test.rs index 6c6f6a8c96..7bdee35cd7 100644 --- a/safekeeper/tests/random_test.rs +++ b/safekeeper/tests/random_test.rs @@ -10,7 +10,7 @@ use crate::walproposer_sim::{ pub mod walproposer_sim; // Generates 2000 random seeds and runs a schedule for each of them. -// If you seed this test fail, please report the last seed to the +// If you see this test fail, please report the last seed to the // @safekeeper team. 
#[test] fn test_random_schedules() -> anyhow::Result<()> { diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index 47539872a6..9c81d2eb4d 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -21,7 +21,7 @@ use safekeeper::{ wal_storage::Storage, SafeKeeperConf, }; -use tracing::{debug, info_span}; +use tracing::{debug, info_span, warn}; use utils::{ id::{NodeId, TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, @@ -247,7 +247,12 @@ pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { NetEvent::Message(msg) => { let res = conn.process_any(msg, &mut global); if res.is_err() { - debug!("conn {:?} error: {:#}", connection_id, res.unwrap_err()); + let e = res.unwrap_err(); + let estr = e.to_string(); + if !estr.contains("finished processing START_REPLICATION") { + warn!("conn {:?} error: {:?}", connection_id, e); + panic!("unexpected error at safekeeper: {:#}", e); + } conns.remove(&connection_id); break; } From 24ce73ffaf5b56004f4e2630ca773630b716d253 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 26 Jun 2024 19:19:27 +0300 Subject: [PATCH 1057/1571] Silence compiler warning (#8153) I saw this compiler warning on my laptop: pgxn/neon_walredo/walredoproc.c:178:10: warning: using the result of an assignment as a condition without parentheses [-Wparentheses] if (err = close_range_syscall(3, ~0U, 0)) { ~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ pgxn/neon_walredo/walredoproc.c:178:10: note: place parentheses around the assignment to silence this warning if (err = close_range_syscall(3, ~0U, 0)) { ^ ( ) pgxn/neon_walredo/walredoproc.c:178:10: note: use '==' to turn this assignment into an equality comparison if (err = close_range_syscall(3, ~0U, 0)) { ^ == 1 warning generated. I'm not sure what compiler version or options cause that, but it's a good warning. Write the call a little differently, to avoid the warning and to make it a little more clear anyway. (The 'err' variable wasn't used for anything, so I'm surprised we were not seeing a compiler warning on the unused value, too.) --- pgxn/neon_walredo/walredoproc.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pgxn/neon_walredo/walredoproc.c b/pgxn/neon_walredo/walredoproc.c index c4ab22636b..cc545393f5 100644 --- a/pgxn/neon_walredo/walredoproc.c +++ b/pgxn/neon_walredo/walredoproc.c @@ -168,16 +168,15 @@ close_range_syscall(unsigned int start_fd, unsigned int count, unsigned int flag static void enter_seccomp_mode(void) { - /* * The pageserver process relies on us to close all the file descriptors * it potentially leaked to us, _before_ we start processing potentially dangerous * wal records. See the comment in the Rust code that launches this process. */ - int err; - if (err = close_range_syscall(3, ~0U, 0)) { - ereport(FATAL, (errcode(ERRCODE_SYSTEM_ERROR), errmsg("seccomp: could not close files >= fd 3"))); - } + if (close_range_syscall(3, ~0U, 0) != 0) + ereport(FATAL, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("seccomp: could not close files >= fd 3"))); PgSeccompRule syscalls[] = { From 5b871802fd86c7b81fff0a99df3f1699ec8474b7 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 26 Jun 2024 19:53:03 +0300 Subject: [PATCH 1058/1571] Add counters for commands processed through the libpq page service API (#8089) I was looking for metrics on how many computes are still using protocol version 1 and 2. 
This provides counters for that as "pagestream" and "pagestream_v2" commands, but also all the other commands. The new metrics are global for the whole pageserver instance rather than per-tenant, so the additional metrics bloat should be fairly small. --- pageserver/src/metrics.rs | 41 ++++++++++++++++++++++++++++++++++ pageserver/src/page_service.rs | 39 +++++++++++++++++++++++++++++++- 2 files changed, 79 insertions(+), 1 deletion(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index c6b1607331..ca697afcf6 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1445,6 +1445,46 @@ pub(crate) static LIVE_CONNECTIONS_COUNT: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +#[derive(Clone, Copy, enum_map::Enum, IntoStaticStr)] +pub(crate) enum ComputeCommandKind { + PageStreamV2, + PageStream, + Basebackup, + GetLastRecordRlsn, + Fullbackup, + ImportBasebackup, + ImportWal, + LeaseLsn, + Show, +} + +pub(crate) struct ComputeCommandCounters { + map: EnumMap, +} + +pub(crate) static COMPUTE_COMMANDS_COUNTERS: Lazy = Lazy::new(|| { + let inner = register_int_counter_vec!( + "pageserver_compute_commands", + "Number of compute -> pageserver commands processed", + &["command"] + ) + .expect("failed to define a metric"); + + ComputeCommandCounters { + map: EnumMap::from_array(std::array::from_fn(|i| { + let command = ::from_usize(i); + let command_str: &'static str = command.into(); + inner.with_label_values(&[command_str]) + })), + } +}); + +impl ComputeCommandCounters { + pub(crate) fn for_command(&self, command: ComputeCommandKind) -> &IntCounter { + &self.map[command] + } +} + // remote storage metrics static REMOTE_TIMELINE_CLIENT_CALLS: Lazy = Lazy::new(|| { @@ -2949,4 +2989,5 @@ pub fn preinitialize_metrics() { Lazy::force(&RECONSTRUCT_TIME); Lazy::force(&tenant_throttling::TIMELINE_GET); Lazy::force(&BASEBACKUP_QUERY_TIME); + Lazy::force(&COMPUTE_COMMANDS_COUNTERS); } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index ebc23e8945..6ea5f396d0 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -55,7 +55,7 @@ use crate::basebackup::BasebackupError; use crate::context::{DownloadBehavior, RequestContext}; use crate::import_datadir::import_wal_from_tar; use crate::metrics; -use crate::metrics::LIVE_CONNECTIONS_COUNT; +use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS_COUNT}; use crate::pgdatadir_mapping::Version; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; @@ -1554,6 +1554,10 @@ where self.check_permission(Some(tenant_id))?; + COMPUTE_COMMANDS_COUNTERS + .for_command(ComputeCommandKind::PageStreamV2) + .inc(); + self.handle_pagerequests( pgb, tenant_id, @@ -1579,6 +1583,10 @@ where self.check_permission(Some(tenant_id))?; + COMPUTE_COMMANDS_COUNTERS + .for_command(ComputeCommandKind::PageStream) + .inc(); + self.handle_pagerequests( pgb, tenant_id, @@ -1605,6 +1613,10 @@ where self.check_permission(Some(tenant_id))?; + COMPUTE_COMMANDS_COUNTERS + .for_command(ComputeCommandKind::Basebackup) + .inc(); + let lsn = if let Some(lsn_str) = params.get(2) { Some( Lsn::from_str(lsn_str) @@ -1662,6 +1674,11 @@ where .record("timeline_id", field::display(timeline_id)); self.check_permission(Some(tenant_id))?; + + COMPUTE_COMMANDS_COUNTERS + .for_command(ComputeCommandKind::GetLastRecordRlsn) + .inc(); + async { let timeline = self 
.get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero) @@ -1723,6 +1740,10 @@ where self.check_permission(Some(tenant_id))?; + COMPUTE_COMMANDS_COUNTERS + .for_command(ComputeCommandKind::Fullbackup) + .inc(); + // Check that the timeline exists self.handle_basebackup_request( pgb, @@ -1771,6 +1792,10 @@ where self.check_permission(Some(tenant_id))?; + COMPUTE_COMMANDS_COUNTERS + .for_command(ComputeCommandKind::ImportBasebackup) + .inc(); + match self .handle_import_basebackup( pgb, @@ -1818,6 +1843,10 @@ where self.check_permission(Some(tenant_id))?; + COMPUTE_COMMANDS_COUNTERS + .for_command(ComputeCommandKind::ImportWal) + .inc(); + match self .handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn, ctx) .await @@ -1855,6 +1884,10 @@ where self.check_permission(Some(tenant_shard_id.tenant_id))?; + COMPUTE_COMMANDS_COUNTERS + .for_command(ComputeCommandKind::LeaseLsn) + .inc(); + // The caller is responsible for providing correct lsn. let lsn = Lsn::from_str(params[2]) .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?; @@ -1886,6 +1919,10 @@ where self.check_permission(Some(tenant_id))?; + COMPUTE_COMMANDS_COUNTERS + .for_command(ComputeCommandKind::Show) + .inc(); + let tenant = self .get_active_tenant_with_timeout( tenant_id, From dd3adc36933f86e19aa45c1da07e997970350435 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 26 Jun 2024 18:27:23 +0100 Subject: [PATCH 1059/1571] docker: downgrade openssl to 1.1.1w (#8168) ## Problem We have seen numerous segfault and memory corruption issue for clients using libpq and openssl 3.2.2. I don't know if this is a bug in openssl or libpq. Downgrading to 1.1.1w fixes the issues for the storage controller and pgbench. ## Summary of Changes: Use openssl 1.1.1w instead of 3.2.2 --- Dockerfile.build-tools | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index a1483e550e..f85706ef6a 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -106,10 +106,10 @@ RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JS && rm -rf ../lcov.tar.gz # Compile and install the static OpenSSL library -ENV OPENSSL_VERSION=3.2.2 +ENV OPENSSL_VERSION=1.1.1w ENV OPENSSL_PREFIX=/usr/local/openssl RUN wget -O /tmp/openssl-${OPENSSL_VERSION}.tar.gz https://www.openssl.org/source/openssl-${OPENSSL_VERSION}.tar.gz && \ - echo "197149c18d9e9f292c43f0400acaba12e5f52cacfe050f3d199277ea738ec2e7 /tmp/openssl-${OPENSSL_VERSION}.tar.gz" | sha256sum --check && \ + echo "cf3098950cb4d853ad95c0841f1f9c6d3dc102dccfcacd521d93925208b76ac8 /tmp/openssl-${OPENSSL_VERSION}.tar.gz" | sha256sum --check && \ cd /tmp && \ tar xzvf /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \ rm /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \ From 76fc3d4aa1deaa3f0e821d2dcdb67bdfb7b49281 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Wed, 26 Jun 2024 18:58:56 +0100 Subject: [PATCH 1060/1571] Evict WAL files from disk (#8022) Fixes https://github.com/neondatabase/neon/issues/6337 Add safekeeper support to switch between `Present` and `Offloaded(flush_lsn)` states. The offloading is disabled by default, but can be controlled using new cmdline arguments: ``` --enable-offload Enable automatic switching to offloaded state --delete-offloaded-wal Delete local WAL files after offloading. 
When disabled, they will be left on disk --control-file-save-interval Pending updates to control file will be automatically saved after this interval [default: 300s] ``` Manager watches state updates and detects when there are no actvity on the timeline and actual partial backup upload in remote storage. When all conditions are met, the state can be switched to offloaded. In `timeline.rs` there is `StateSK` enum to support switching between states. When offloaded, code can access only control file structure and cannot use `SafeKeeper` to accept new WAL. `FullAccessTimeline` is now renamed to `WalResidentTimeline`. This struct contains guard to notify manager about active tasks requiring on-disk WAL access. All guards are issued by the manager, all requests are sent via channel using `ManagerCtl`. When manager receives request to issue a guard, it unevicts timeline if it's currently evicted. Fixed a bug in partial WAL backup, it used `term` instead of `last_log_term` previously. After this commit is merged, next step is to roll this change out, as in issue #6338. --- safekeeper/src/bin/safekeeper.rs | 19 +- safekeeper/src/control_file.rs | 5 +- safekeeper/src/copy_timeline.rs | 8 +- safekeeper/src/debug_dump.rs | 6 +- safekeeper/src/http/routes.rs | 8 +- safekeeper/src/json_ctrl.rs | 10 +- safekeeper/src/lib.rs | 9 + safekeeper/src/pull_timeline.rs | 35 +- safekeeper/src/receive_wal.rs | 20 +- safekeeper/src/recovery.rs | 29 +- safekeeper/src/remove_wal.rs | 2 +- safekeeper/src/safekeeper.rs | 46 +- safekeeper/src/send_wal.rs | 15 +- safekeeper/src/timeline.rs | 458 ++++++++-- safekeeper/src/timeline_eviction.rs | 366 ++++++++ safekeeper/src/timeline_guard.rs | 71 ++ safekeeper/src/timeline_manager.rs | 779 ++++++++++++------ safekeeper/src/timelines_set.rs | 4 + safekeeper/src/wal_backup.rs | 51 +- safekeeper/src/wal_backup_partial.rs | 80 +- safekeeper/src/wal_storage.rs | 12 +- .../tests/walproposer_sim/safekeeper.rs | 13 +- test_runner/fixtures/neon_fixtures.py | 2 + test_runner/regress/test_wal_acceptor.py | 100 +++ .../regress/test_wal_acceptor_async.py | 5 +- 25 files changed, 1673 insertions(+), 480 deletions(-) create mode 100644 safekeeper/src/timeline_eviction.rs create mode 100644 safekeeper/src/timeline_guard.rs diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 86238c7292..20650490b1 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -28,8 +28,8 @@ use utils::pid_file; use metrics::set_build_info_metric; use safekeeper::defaults::{ - DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES, - DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, + DEFAULT_CONTROL_FILE_SAVE_INTERVAL, DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, + DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, }; use safekeeper::http; use safekeeper::wal_service; @@ -172,6 +172,7 @@ struct Args { walsenders_keep_horizon: bool, /// Enable partial backup. If disabled, safekeeper will not upload partial /// segments to remote storage. + /// TODO: now partial backup is always enabled, remove this flag. #[arg(long)] partial_backup_enabled: bool, /// Controls how long backup will wait until uploading the partial segment. @@ -181,6 +182,15 @@ struct Args { /// be used in tests. #[arg(long)] disable_periodic_broker_push: bool, + /// Enable automatic switching to offloaded state. + #[arg(long)] + enable_offload: bool, + /// Delete local WAL files after offloading. 
When disabled, they will be left on disk. + #[arg(long)] + delete_offloaded_wal: bool, + /// Pending updates to control file will be automatically saved after this interval. + #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_CONTROL_FILE_SAVE_INTERVAL)] + control_file_save_interval: Duration, } // Like PathBufValueParser, but allows empty string. @@ -328,9 +338,12 @@ async fn main() -> anyhow::Result<()> { sk_auth_token, current_thread_runtime: args.current_thread_runtime, walsenders_keep_horizon: args.walsenders_keep_horizon, - partial_backup_enabled: args.partial_backup_enabled, + partial_backup_enabled: true, partial_backup_timeout: args.partial_backup_timeout, disable_periodic_broker_push: args.disable_periodic_broker_push, + enable_offload: args.enable_offload, + delete_offloaded_wal: args.delete_offloaded_wal, + control_file_save_interval: args.control_file_save_interval, }; // initialize sentry if SENTRY_DSN is provided diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index 8e9031fae4..cd3c7fe526 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -72,6 +72,9 @@ impl FileStorage { conf: &SafeKeeperConf, state: TimelinePersistentState, ) -> Result { + // we don't support creating new timelines in offloaded state + assert!(matches!(state.eviction_state, EvictionState::Present)); + let store = FileStorage { timeline_dir, no_sync: conf.no_sync, @@ -103,7 +106,7 @@ impl FileStorage { } /// Load control file from given directory. - pub fn load_control_file_from_dir(timeline_dir: &Utf8Path) -> Result { + fn load_control_file_from_dir(timeline_dir: &Utf8Path) -> Result { let path = timeline_dir.join(CONTROL_FILE_NAME); Self::load_control_file(path) } diff --git a/safekeeper/src/copy_timeline.rs b/safekeeper/src/copy_timeline.rs index 51cf4db6b5..14bd3c03b8 100644 --- a/safekeeper/src/copy_timeline.rs +++ b/safekeeper/src/copy_timeline.rs @@ -15,7 +15,7 @@ use crate::{ control_file::{FileStorage, Storage}, pull_timeline::{create_temp_timeline_dir, load_temp_timeline, validate_temp_timeline}, state::TimelinePersistentState, - timeline::{FullAccessTimeline, Timeline, TimelineError}, + timeline::{Timeline, TimelineError, WalResidentTimeline}, wal_backup::copy_s3_segments, wal_storage::{wal_file_paths, WalReader}, GlobalTimelines, @@ -46,7 +46,7 @@ pub async fn handle_request(request: Request) -> Result<()> { } } - let source_tli = request.source.full_access_guard().await?; + let source_tli = request.source.wal_residence_guard().await?; let conf = &GlobalTimelines::get_global_config(); let ttid = request.destination_ttid; @@ -159,7 +159,7 @@ pub async fn handle_request(request: Request) -> Result<()> { } async fn copy_disk_segments( - tli: &FullAccessTimeline, + tli: &WalResidentTimeline, wal_seg_size: usize, start_lsn: Lsn, end_lsn: Lsn, @@ -183,7 +183,7 @@ async fn copy_disk_segments( let copy_end = copy_end - segment_start; let wal_file_path = { - let (normal, partial) = wal_file_paths(tli_dir_path, segment, wal_seg_size)?; + let (normal, partial) = wal_file_paths(tli_dir_path, segment, wal_seg_size); if segment == last_segment { partial diff --git a/safekeeper/src/debug_dump.rs b/safekeeper/src/debug_dump.rs index 062ff4b3db..15b0272cd9 100644 --- a/safekeeper/src/debug_dump.rs +++ b/safekeeper/src/debug_dump.rs @@ -28,7 +28,8 @@ use crate::send_wal::WalSenderState; use crate::state::TimelineMemState; use crate::state::TimelinePersistentState; use crate::timeline::get_timeline_dir; -use 
crate::timeline::FullAccessTimeline; +use crate::timeline::WalResidentTimeline; +use crate::timeline_manager; use crate::GlobalTimelines; use crate::SafeKeeperConf; @@ -168,6 +169,7 @@ pub struct Memory { pub last_removed_segno: XLogSegNo, pub epoch_start_lsn: Lsn, pub mem_state: TimelineMemState, + pub mgr_status: timeline_manager::Status, // PhysicalStorage state. pub write_lsn: Lsn, @@ -326,7 +328,7 @@ pub struct TimelineDigest { } pub async fn calculate_digest( - tli: &FullAccessTimeline, + tli: &WalResidentTimeline, request: TimelineDigestRequest, ) -> Result { if request.from_lsn > request.until_lsn { diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 3f2cd97ccd..fe6d325cee 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -214,10 +214,10 @@ async fn timeline_snapshot_handler(request: Request) -> Result) -> Result) -> Result( async fn prepare_safekeeper( ttid: TenantTimelineId, pg_version: u32, -) -> anyhow::Result { +) -> anyhow::Result { let tli = GlobalTimelines::create( ttid, ServerInfo { @@ -115,11 +115,11 @@ async fn prepare_safekeeper( ) .await?; - tli.full_access_guard().await + tli.wal_residence_guard().await } async fn send_proposer_elected( - tli: &FullAccessTimeline, + tli: &WalResidentTimeline, term: Term, lsn: Lsn, ) -> anyhow::Result<()> { @@ -151,7 +151,7 @@ pub struct InsertedWAL { /// Extend local WAL with new LogicalMessage record. To do that, /// create AppendRequest with new WAL and pass it to safekeeper. pub async fn append_logical_message( - tli: &FullAccessTimeline, + tli: &WalResidentTimeline, msg: &AppendLogicalMessage, ) -> anyhow::Result { let wal_data = encode_logical_message(&msg.lm_prefix, &msg.lm_message); diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index cbd67f0064..067e425570 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -28,6 +28,8 @@ pub mod safekeeper; pub mod send_wal; pub mod state; pub mod timeline; +pub mod timeline_eviction; +pub mod timeline_guard; pub mod timeline_manager; pub mod timelines_set; pub mod wal_backup; @@ -49,6 +51,7 @@ pub mod defaults { pub const DEFAULT_HEARTBEAT_TIMEOUT: &str = "5000ms"; pub const DEFAULT_MAX_OFFLOADER_LAG_BYTES: u64 = 128 * (1 << 20); pub const DEFAULT_PARTIAL_BACKUP_TIMEOUT: &str = "15m"; + pub const DEFAULT_CONTROL_FILE_SAVE_INTERVAL: &str = "300s"; } #[derive(Debug, Clone)] @@ -85,6 +88,9 @@ pub struct SafeKeeperConf { pub partial_backup_enabled: bool, pub partial_backup_timeout: Duration, pub disable_periodic_broker_push: bool, + pub enable_offload: bool, + pub delete_offloaded_wal: bool, + pub control_file_save_interval: Duration, } impl SafeKeeperConf { @@ -124,6 +130,9 @@ impl SafeKeeperConf { partial_backup_enabled: false, partial_backup_timeout: Duration::from_secs(0), disable_periodic_broker_push: false, + enable_offload: false, + delete_offloaded_wal: false, + control_file_save_interval: Duration::from_secs(1), } } } diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index 66c41f65ff..618c6b278f 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -32,7 +32,7 @@ use crate::{ routes::TimelineStatus, }, safekeeper::Term, - timeline::{get_tenant_dir, get_timeline_dir, FullAccessTimeline, Timeline, TimelineError}, + timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError, WalResidentTimeline}, wal_storage::{self, open_wal_file, Storage}, GlobalTimelines, SafeKeeperConf, }; @@ -46,7 +46,7 @@ use utils::{ /// Stream tar archive of 
timeline to tx. #[instrument(name = "snapshot", skip_all, fields(ttid = %tli.ttid))] -pub async fn stream_snapshot(tli: FullAccessTimeline, tx: mpsc::Sender>) { +pub async fn stream_snapshot(tli: WalResidentTimeline, tx: mpsc::Sender>) { if let Err(e) = stream_snapshot_guts(tli, tx.clone()).await { // Error type/contents don't matter as they won't can't reach the client // (hyper likely doesn't do anything with it), but http stream will be @@ -66,7 +66,7 @@ pub struct SnapshotContext { pub flush_lsn: Lsn, pub wal_seg_size: usize, // used to remove WAL hold off in Drop. - pub tli: FullAccessTimeline, + pub tli: WalResidentTimeline, } impl Drop for SnapshotContext { @@ -80,7 +80,7 @@ impl Drop for SnapshotContext { } pub async fn stream_snapshot_guts( - tli: FullAccessTimeline, + tli: WalResidentTimeline, tx: mpsc::Sender>, ) -> Result<()> { // tokio-tar wants Write implementor, but we have mpsc tx >; @@ -135,7 +135,7 @@ pub async fn stream_snapshot_guts( Ok(()) } -impl FullAccessTimeline { +impl WalResidentTimeline { /// Start streaming tar archive with timeline: /// 1) stream control file under lock; /// 2) hold off WAL removal; @@ -160,6 +160,7 @@ impl FullAccessTimeline { ar: &mut tokio_tar::Builder, ) -> Result { let mut shared_state = self.write_shared_state().await; + let wal_seg_size = shared_state.get_wal_seg_size(); let cf_path = self.get_timeline_dir().join(CONTROL_FILE_NAME); let mut cf = File::open(cf_path).await?; @@ -173,19 +174,19 @@ impl FullAccessTimeline { // lock and setting `wal_removal_on_hold` later, it guarantees that WAL // won't be removed until we're done. let from_lsn = min( - shared_state.sk.state.remote_consistent_lsn, - shared_state.sk.state.backup_lsn, + shared_state.sk.state().remote_consistent_lsn, + shared_state.sk.state().backup_lsn, ); if from_lsn == Lsn::INVALID { // this is possible if snapshot is called before handling first // elected message bail!("snapshot is called on uninitialized timeline"); } - let from_segno = from_lsn.segment_number(shared_state.get_wal_seg_size()); - let term = shared_state.sk.get_term(); - let last_log_term = shared_state.sk.get_last_log_term(); + let from_segno = from_lsn.segment_number(wal_seg_size); + let term = shared_state.sk.state().acceptor_state.term; + let last_log_term = shared_state.sk.last_log_term(); let flush_lsn = shared_state.sk.flush_lsn(); - let upto_segno = flush_lsn.segment_number(shared_state.get_wal_seg_size()); + let upto_segno = flush_lsn.segment_number(wal_seg_size); // have some limit on max number of segments as a sanity check const MAX_ALLOWED_SEGS: u64 = 1000; let num_segs = upto_segno - from_segno + 1; @@ -206,14 +207,18 @@ impl FullAccessTimeline { } shared_state.wal_removal_on_hold = true; + // Drop shared_state to release the lock, before calling wal_residence_guard(). + drop(shared_state); + + let tli_copy = self.wal_residence_guard().await?; let bctx = SnapshotContext { from_segno, upto_segno, term, last_log_term, flush_lsn, - wal_seg_size: shared_state.get_wal_seg_size(), - tli: self.clone(), + wal_seg_size, + tli: tli_copy, }; Ok(bctx) @@ -225,8 +230,8 @@ impl FullAccessTimeline { /// forget this if snapshotting fails mid the way. 
pub async fn finish_snapshot(&self, bctx: &SnapshotContext) -> Result<()> { let shared_state = self.read_shared_state().await; - let term = shared_state.sk.get_term(); - let last_log_term = shared_state.sk.get_last_log_term(); + let term = shared_state.sk.state().acceptor_state.term; + let last_log_term = shared_state.sk.last_log_term(); // There are some cases to relax this check (e.g. last_log_term might // change, but as long as older history is strictly part of new that's // fine), but there is no need to do it. diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 7943a2fd86..ab8c76dc17 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -6,7 +6,7 @@ use crate::handler::SafekeeperPostgresHandler; use crate::safekeeper::AcceptorProposerMessage; use crate::safekeeper::ProposerAcceptorMessage; use crate::safekeeper::ServerInfo; -use crate::timeline::FullAccessTimeline; +use crate::timeline::WalResidentTimeline; use crate::wal_service::ConnectionId; use crate::GlobalTimelines; use anyhow::{anyhow, Context}; @@ -213,7 +213,7 @@ impl SafekeeperPostgresHandler { &mut self, pgb: &mut PostgresBackend, ) -> Result<(), QueryError> { - let mut tli: Option = None; + let mut tli: Option = None; if let Err(end) = self.handle_start_wal_push_guts(pgb, &mut tli).await { // Log the result and probably send it to the client, closing the stream. let handle_end_fut = pgb.handle_copy_stream_end(end); @@ -233,7 +233,7 @@ impl SafekeeperPostgresHandler { pub async fn handle_start_wal_push_guts( &mut self, pgb: &mut PostgresBackend, - tli: &mut Option, + tli: &mut Option, ) -> Result<(), CopyStreamHandlerEnd> { // Notify the libpq client that it's allowed to send `CopyData` messages pgb.write_message(&BeMessage::CopyBothResponse).await?; @@ -269,11 +269,11 @@ impl SafekeeperPostgresHandler { .get_walreceivers() .pageserver_feedback_tx .subscribe(); - *tli = Some(timeline.clone()); + *tli = Some(timeline.wal_residence_guard().await?); tokio::select! { // todo: add read|write .context to these errors - r = network_reader.run(msg_tx, msg_rx, reply_tx, timeline.clone(), next_msg) => r, + r = network_reader.run(msg_tx, msg_rx, reply_tx, timeline, next_msg) => r, r = network_write(pgb, reply_rx, pageserver_feedback_rx) => r, } } else { @@ -323,7 +323,7 @@ struct NetworkReader<'a, IO> { impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> { async fn read_first_message( &mut self, - ) -> Result<(FullAccessTimeline, ProposerAcceptorMessage), CopyStreamHandlerEnd> { + ) -> Result<(WalResidentTimeline, ProposerAcceptorMessage), CopyStreamHandlerEnd> { // Receive information about server to create timeline, if not yet. let next_msg = read_message(self.pgb_reader).await?; let tli = match next_msg { @@ -340,7 +340,7 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> { let tli = GlobalTimelines::create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID) .await?; - tli.full_access_guard().await? + tli.wal_residence_guard().await? 
} _ => { return Err(CopyStreamHandlerEnd::Other(anyhow::anyhow!( @@ -356,7 +356,7 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> { msg_tx: Sender, msg_rx: Receiver, reply_tx: Sender, - tli: FullAccessTimeline, + tli: WalResidentTimeline, next_msg: ProposerAcceptorMessage, ) -> Result<(), CopyStreamHandlerEnd> { *self.acceptor_handle = Some(WalAcceptor::spawn( @@ -451,7 +451,7 @@ const KEEPALIVE_INTERVAL: Duration = Duration::from_secs(1); /// replies to reply_tx; reading from socket and writing to disk in parallel is /// beneficial for performance, this struct provides writing to disk part. pub struct WalAcceptor { - tli: FullAccessTimeline, + tli: WalResidentTimeline, msg_rx: Receiver, reply_tx: Sender, conn_id: Option, @@ -464,7 +464,7 @@ impl WalAcceptor { /// /// conn_id None means WalAcceptor is used by recovery initiated at this safekeeper. pub fn spawn( - tli: FullAccessTimeline, + tli: WalResidentTimeline, msg_rx: Receiver, reply_tx: Sender, conn_id: Option, diff --git a/safekeeper/src/recovery.rs b/safekeeper/src/recovery.rs index 80a630b1e1..a59ff07b96 100644 --- a/safekeeper/src/recovery.rs +++ b/safekeeper/src/recovery.rs @@ -21,7 +21,7 @@ use utils::{id::NodeId, lsn::Lsn, postgres_client::wal_stream_connection_config} use crate::receive_wal::{WalAcceptor, REPLY_QUEUE_SIZE}; use crate::safekeeper::{AppendRequest, AppendRequestHeader}; -use crate::timeline::FullAccessTimeline; +use crate::timeline::WalResidentTimeline; use crate::{ http::routes::TimelineStatus, receive_wal::MSG_QUEUE_SIZE, @@ -36,7 +36,7 @@ use crate::{ /// Entrypoint for per timeline task which always runs, checking whether /// recovery for this safekeeper is needed and starting it if so. #[instrument(name = "recovery task", skip_all, fields(ttid = %tli.ttid))] -pub async fn recovery_main(tli: FullAccessTimeline, conf: SafeKeeperConf) { +pub async fn recovery_main(tli: WalResidentTimeline, conf: SafeKeeperConf) { info!("started"); let cancel = tli.cancel.clone(); @@ -66,12 +66,12 @@ pub async fn recovery_main(tli: FullAccessTimeline, conf: SafeKeeperConf) { /// depending on assembled quorum (e.g. classic picture 8 from Raft paper). /// Thus we don't try to predict it here. async fn recovery_needed( - tli: &FullAccessTimeline, + tli: &WalResidentTimeline, heartbeat_timeout: Duration, ) -> RecoveryNeededInfo { let ss = tli.read_shared_state().await; - let term = ss.sk.state.acceptor_state.term; - let last_log_term = ss.sk.get_last_log_term(); + let term = ss.sk.state().acceptor_state.term; + let last_log_term = ss.sk.last_log_term(); let flush_lsn = ss.sk.flush_lsn(); // note that peers contain myself, but that's ok -- we are interested only in peers which are strictly ahead of us. let mut peers = ss.get_peers(heartbeat_timeout); @@ -195,7 +195,7 @@ impl From<&PeerInfo> for Donor { const CHECK_INTERVAL_MS: u64 = 2000; /// Check regularly whether we need to start recovery. 
-async fn recovery_main_loop(tli: FullAccessTimeline, conf: SafeKeeperConf) { +async fn recovery_main_loop(tli: WalResidentTimeline, conf: SafeKeeperConf) { let check_duration = Duration::from_millis(CHECK_INTERVAL_MS); loop { let recovery_needed_info = recovery_needed(&tli, conf.heartbeat_timeout).await; @@ -205,7 +205,12 @@ async fn recovery_main_loop(tli: FullAccessTimeline, conf: SafeKeeperConf) { "starting recovery from donor {}: {}", donor.sk_id, recovery_needed_info ); - match recover(tli.clone(), donor, &conf).await { + let res = tli.wal_residence_guard().await; + if let Err(e) = res { + warn!("failed to obtain guard: {}", e); + continue; + } + match recover(res.unwrap(), donor, &conf).await { // Note: 'write_wal rewrites WAL written before' error is // expected here and might happen if compute and recovery // concurrently write the same data. Eventually compute @@ -228,7 +233,7 @@ async fn recovery_main_loop(tli: FullAccessTimeline, conf: SafeKeeperConf) { /// Recover from the specified donor. Returns message explaining normal finish /// reason or error. async fn recover( - tli: FullAccessTimeline, + tli: WalResidentTimeline, donor: &Donor, conf: &SafeKeeperConf, ) -> anyhow::Result { @@ -314,7 +319,7 @@ async fn recover( // Pull WAL from donor, assuming handshake is already done. async fn recovery_stream( - tli: FullAccessTimeline, + tli: WalResidentTimeline, donor: &Donor, start_streaming_at: Lsn, conf: &SafeKeeperConf, @@ -364,10 +369,10 @@ async fn recovery_stream( // As in normal walreceiver, do networking and writing to disk in parallel. let (msg_tx, msg_rx) = channel(MSG_QUEUE_SIZE); let (reply_tx, reply_rx) = channel(REPLY_QUEUE_SIZE); - let wa = WalAcceptor::spawn(tli.clone(), msg_rx, reply_tx, None); + let wa = WalAcceptor::spawn(tli.wal_residence_guard().await?, msg_rx, reply_tx, None); let res = tokio::select! { - r = network_io(physical_stream, msg_tx, donor.clone(), tli.clone(), conf.clone()) => r, + r = network_io(physical_stream, msg_tx, donor.clone(), tli, conf.clone()) => r, r = read_replies(reply_rx, donor.term) => r.map(|()| None), }; @@ -398,7 +403,7 @@ async fn network_io( physical_stream: ReplicationStream, msg_tx: Sender, donor: Donor, - tli: FullAccessTimeline, + tli: WalResidentTimeline, conf: SafeKeeperConf, ) -> anyhow::Result> { let mut physical_stream = pin!(physical_stream); diff --git a/safekeeper/src/remove_wal.rs b/safekeeper/src/remove_wal.rs index b661e48cb5..16239d847b 100644 --- a/safekeeper/src/remove_wal.rs +++ b/safekeeper/src/remove_wal.rs @@ -8,7 +8,7 @@ use crate::timeline_manager::StateSnapshot; /// While it is safe to use inmem values for determining horizon, /// we use persistent to make possible normal states less surprising. /// All segments covering LSNs before horizon_lsn can be removed. -pub fn calc_horizon_lsn(state: &StateSnapshot, extra_horizon_lsn: Option) -> Lsn { +pub(crate) fn calc_horizon_lsn(state: &StateSnapshot, extra_horizon_lsn: Option) -> Lsn { use std::cmp::min; let mut horizon_lsn = min( diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 666ffdf0ce..4d0992e8bd 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -499,7 +499,11 @@ where /// Accepts a control file storage containing the safekeeper state. /// State must be initialized, i.e. contain filled `tenant_id`, `timeline_id` /// and `server` (`wal_seg_size` inside it) fields. 
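// Construction sketch (illustrative, mirrors the updated unit tests later in
// this patch): the control-file storage is now wrapped in `TimelineState`
// before being handed to the safekeeper.
//
//     let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap();
//
// The old form `SafeKeeper::new(storage, wal_store, NodeId(0))` no longer
// type-checks.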
- pub fn new(state: CTRL, wal_store: WAL, node_id: NodeId) -> Result> { + pub fn new( + state: TimelineState, + wal_store: WAL, + node_id: NodeId, + ) -> Result> { if state.tenant_id == TenantId::from([0u8; 16]) || state.timeline_id == TimelineId::from([0u8; 16]) { @@ -512,7 +516,7 @@ where Ok(SafeKeeper { term_start_lsn: Lsn(0), - state: TimelineState::new(state), + state, wal_store, node_id, }) @@ -526,11 +530,6 @@ where .up_to(self.flush_lsn()) } - /// Get current term. - pub fn get_term(&self) -> Term { - self.state.acceptor_state.term - } - pub fn get_last_log_term(&self) -> Term { self.state .acceptor_state @@ -912,10 +911,8 @@ where ))) } - /// Update timeline state with peer safekeeper data. + /// Update commit_lsn from peer safekeeper data. pub async fn record_safekeeper_info(&mut self, sk_info: &SafekeeperTimelineInfo) -> Result<()> { - let mut sync_control_file = false; - if (Lsn(sk_info.commit_lsn) != Lsn::INVALID) && (sk_info.last_log_term != INVALID_TERM) { // Note: the check is too restrictive, generally we can update local // commit_lsn if our history matches (is part of) history of advanced @@ -924,29 +921,6 @@ where self.update_commit_lsn(Lsn(sk_info.commit_lsn)).await?; } } - - self.state.inmem.backup_lsn = max(Lsn(sk_info.backup_lsn), self.state.inmem.backup_lsn); - sync_control_file |= self.state.backup_lsn + (self.state.server.wal_seg_size as u64) - < self.state.inmem.backup_lsn; - - self.state.inmem.remote_consistent_lsn = max( - Lsn(sk_info.remote_consistent_lsn), - self.state.inmem.remote_consistent_lsn, - ); - sync_control_file |= self.state.remote_consistent_lsn - + (self.state.server.wal_seg_size as u64) - < self.state.inmem.remote_consistent_lsn; - - self.state.inmem.peer_horizon_lsn = max( - Lsn(sk_info.peer_horizon_lsn), - self.state.inmem.peer_horizon_lsn, - ); - sync_control_file |= self.state.peer_horizon_lsn + (self.state.server.wal_seg_size as u64) - < self.state.inmem.peer_horizon_lsn; - - if sync_control_file { - self.state.flush().await?; - } Ok(()) } } @@ -1039,7 +1013,7 @@ mod tests { persisted_state: test_sk_state(), }; let wal_store = DummyWalStore { lsn: Lsn(0) }; - let mut sk = SafeKeeper::new(storage, wal_store, NodeId(0)).unwrap(); + let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap(); // check voting for 1 is ok let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: 1 }); @@ -1055,7 +1029,7 @@ mod tests { persisted_state: state, }; - sk = SafeKeeper::new(storage, sk.wal_store, NodeId(0)).unwrap(); + sk = SafeKeeper::new(TimelineState::new(storage), sk.wal_store, NodeId(0)).unwrap(); // and ensure voting second time for 1 is not ok vote_resp = sk.process_msg(&vote_request).await; @@ -1072,7 +1046,7 @@ mod tests { }; let wal_store = DummyWalStore { lsn: Lsn(0) }; - let mut sk = SafeKeeper::new(storage, wal_store, NodeId(0)).unwrap(); + let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap(); let mut ar_hdr = AppendRequestHeader { term: 1, diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index df75893838..90b1604adb 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -5,7 +5,7 @@ use crate::handler::SafekeeperPostgresHandler; use crate::metrics::RECEIVED_PS_FEEDBACKS; use crate::receive_wal::WalReceivers; use crate::safekeeper::{Term, TermLsn}; -use crate::timeline::FullAccessTimeline; +use crate::timeline::WalResidentTimeline; use crate::wal_service::ConnectionId; use crate::wal_storage::WalReader; use 
crate::GlobalTimelines; @@ -387,10 +387,10 @@ impl SafekeeperPostgresHandler { term: Option, ) -> Result<(), QueryError> { let tli = GlobalTimelines::get(self.ttid).map_err(|e| QueryError::Other(e.into()))?; - let full_access = tli.full_access_guard().await?; + let residence_guard = tli.wal_residence_guard().await?; if let Err(end) = self - .handle_start_replication_guts(pgb, start_pos, term, full_access) + .handle_start_replication_guts(pgb, start_pos, term, residence_guard) .await { let info = tli.get_safekeeper_info(&self.conf).await; @@ -407,7 +407,7 @@ impl SafekeeperPostgresHandler { pgb: &mut PostgresBackend, start_pos: Lsn, term: Option, - tli: FullAccessTimeline, + tli: WalResidentTimeline, ) -> Result<(), CopyStreamHandlerEnd> { let appname = self.appname.clone(); @@ -458,7 +458,8 @@ impl SafekeeperPostgresHandler { let mut sender = WalSender { pgb, - tli: tli.clone(), + // should succeed since we're already holding another guard + tli: tli.wal_residence_guard().await?, appname, start_pos, end_pos, @@ -527,7 +528,7 @@ impl EndWatch { /// A half driving sending WAL. struct WalSender<'a, IO> { pgb: &'a mut PostgresBackend, - tli: FullAccessTimeline, + tli: WalResidentTimeline, appname: Option, // Position since which we are sending next chunk. start_pos: Lsn, @@ -736,7 +737,7 @@ impl WalSender<'_, IO> { struct ReplyReader { reader: PostgresBackendReader, ws_guard: Arc, - tli: FullAccessTimeline, + tli: WalResidentTimeline, } impl ReplyReader { diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 544ffdbb36..f632cd6fb3 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -31,12 +31,15 @@ use crate::safekeeper::{ INVALID_TERM, }; use crate::send_wal::WalSenders; -use crate::state::{TimelineMemState, TimelinePersistentState}; +use crate::state::{EvictionState, TimelineMemState, TimelinePersistentState, TimelineState}; +use crate::timeline_guard::ResidenceGuard; +use crate::timeline_manager::{AtomicStatus, ManagerCtl}; use crate::timelines_set::TimelinesSet; use crate::wal_backup::{self}; +use crate::wal_backup_partial::PartialRemoteSegment; use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION}; -use crate::metrics::FullTimelineInfo; +use crate::metrics::{FullTimelineInfo, WalStorageMetrics}; use crate::wal_storage::{Storage as wal_storage_iface, WalReader}; use crate::{debug_dump, timeline_manager, wal_storage}; use crate::{GlobalTimelines, SafeKeeperConf}; @@ -132,8 +135,9 @@ impl<'a> DerefMut for WriteGuardSharedState<'a> { impl<'a> Drop for WriteGuardSharedState<'a> { fn drop(&mut self) { - let term_flush_lsn = TermLsn::from((self.guard.sk.get_term(), self.guard.sk.flush_lsn())); - let commit_lsn = self.guard.sk.state.inmem.commit_lsn; + let term_flush_lsn = + TermLsn::from((self.guard.sk.last_log_term(), self.guard.sk.flush_lsn())); + let commit_lsn = self.guard.sk.state().inmem.commit_lsn; let _ = self.tli.term_flush_lsn_watch_tx.send_if_modified(|old| { if *old != term_flush_lsn { @@ -162,10 +166,150 @@ impl<'a> Drop for WriteGuardSharedState<'a> { } } +/// This structure is stored in shared state and represents the state of the timeline. +/// Usually it holds SafeKeeper, but it also supports offloaded timeline state. In this +/// case, SafeKeeper is not available (because WAL is not present on disk) and all +/// operations can be done only with control file. +pub enum StateSK { + Loaded(SafeKeeper), + Offloaded(Box>), + // Not used, required for moving between states. 
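// Illustrative note: `Empty` is only a placeholder that lets the enum be moved
// out of the shared state with `std::mem::replace` while switching between
// `Loaded` and `Offloaded`, as `switch_to_offloaded` below does:
//
//     let prev_sk = std::mem::replace(&mut shared.sk, StateSK::Empty);
//     let cfile_state = prev_sk.take_state();
//     shared.sk = StateSK::Offloaded(Box::new(cfile_state));
//
// Nothing is expected to observe `StateSK::Empty` outside this swap, which is
// why the other methods treat it as unreachable.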
+ Empty, +} + +impl StateSK { + pub fn flush_lsn(&self) -> Lsn { + match self { + StateSK::Loaded(sk) => sk.wal_store.flush_lsn(), + StateSK::Offloaded(state) => match state.eviction_state { + EvictionState::Offloaded(flush_lsn) => flush_lsn, + _ => panic!("StateSK::Offloaded mismatches with eviction_state from control_file"), + }, + StateSK::Empty => unreachable!(), + } + } + + /// Get a reference to the control file's timeline state. + pub fn state(&self) -> &TimelineState { + match self { + StateSK::Loaded(sk) => &sk.state, + StateSK::Offloaded(ref s) => s, + StateSK::Empty => unreachable!(), + } + } + + pub fn state_mut(&mut self) -> &mut TimelineState { + match self { + StateSK::Loaded(sk) => &mut sk.state, + StateSK::Offloaded(ref mut s) => s, + StateSK::Empty => unreachable!(), + } + } + + pub fn last_log_term(&self) -> Term { + self.state() + .acceptor_state + .get_last_log_term(self.flush_lsn()) + } + + /// Close open WAL files to release FDs. + fn close_wal_store(&mut self) { + if let StateSK::Loaded(sk) = self { + sk.wal_store.close(); + } + } + + /// Update timeline state with peer safekeeper data. + pub async fn record_safekeeper_info(&mut self, sk_info: &SafekeeperTimelineInfo) -> Result<()> { + // update commit_lsn if safekeeper is loaded + match self { + StateSK::Loaded(sk) => sk.record_safekeeper_info(sk_info).await?, + StateSK::Offloaded(_) => {} + StateSK::Empty => unreachable!(), + } + + // update everything else, including remote_consistent_lsn and backup_lsn + let mut sync_control_file = false; + let state = self.state_mut(); + let wal_seg_size = state.server.wal_seg_size as u64; + + state.inmem.backup_lsn = max(Lsn(sk_info.backup_lsn), state.inmem.backup_lsn); + sync_control_file |= state.backup_lsn + wal_seg_size < state.inmem.backup_lsn; + + state.inmem.remote_consistent_lsn = max( + Lsn(sk_info.remote_consistent_lsn), + state.inmem.remote_consistent_lsn, + ); + sync_control_file |= + state.remote_consistent_lsn + wal_seg_size < state.inmem.remote_consistent_lsn; + + state.inmem.peer_horizon_lsn = + max(Lsn(sk_info.peer_horizon_lsn), state.inmem.peer_horizon_lsn); + sync_control_file |= state.peer_horizon_lsn + wal_seg_size < state.inmem.peer_horizon_lsn; + + if sync_control_file { + state.flush().await?; + } + Ok(()) + } + + /// Previously known as epoch_start_lsn. Needed only for reference in some APIs. + pub fn term_start_lsn(&self) -> Lsn { + match self { + StateSK::Loaded(sk) => sk.term_start_lsn, + StateSK::Offloaded(_) => Lsn(0), + StateSK::Empty => unreachable!(), + } + } + + /// Used for metrics only. + pub fn wal_storage_metrics(&self) -> WalStorageMetrics { + match self { + StateSK::Loaded(sk) => sk.wal_store.get_metrics(), + StateSK::Offloaded(_) => WalStorageMetrics::default(), + StateSK::Empty => unreachable!(), + } + } + + /// Returns WAL storage internal LSNs for debug dump. + pub fn wal_storage_internal_state(&self) -> (Lsn, Lsn, Lsn, bool) { + match self { + StateSK::Loaded(sk) => sk.wal_store.internal_state(), + StateSK::Offloaded(_) => { + let flush_lsn = self.flush_lsn(); + (flush_lsn, flush_lsn, flush_lsn, false) + } + StateSK::Empty => unreachable!(), + } + } + + /// Access to SafeKeeper object. Panics if offloaded, should be good to use from WalResidentTimeline. 
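// Usage sketch, matching the call sites later in this patch: code that runs
// under a `WalResidentTimeline` guard (so the timeline is known to be loaded)
// reaches the safekeeper through this accessor, e.g.
//
//     rmsg = shared_state.sk.safekeeper().process_msg(msg).await?;
//
// Code without such a guard must stick to `state()` / `state_mut()`.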
+ pub fn safekeeper( + &mut self, + ) -> &mut SafeKeeper { + match self { + StateSK::Loaded(sk) => sk, + StateSK::Offloaded(_) => { + panic!("safekeeper is offloaded, cannot be used") + } + StateSK::Empty => unreachable!(), + } + } + + /// Moves control file's state structure out of the enum. Used to switch states. + fn take_state(self) -> TimelineState { + match self { + StateSK::Loaded(sk) => sk.state, + StateSK::Offloaded(state) => *state, + StateSK::Empty => unreachable!(), + } + } +} + /// Shared state associated with database instance pub struct SharedState { /// Safekeeper object - pub(crate) sk: SafeKeeper, + pub(crate) sk: StateSK, /// In memory list containing state of peers sent in latest messages from them. pub(crate) peers_info: PeersInfo, // True value hinders old WAL removal; this is used by snapshotting. We @@ -203,10 +347,10 @@ impl SharedState { control_file::FileStorage::create_new(timeline_dir.clone(), conf, state)?; let wal_store = wal_storage::PhysicalStorage::new(ttid, timeline_dir, conf, &control_store)?; - let sk = SafeKeeper::new(control_store, wal_store, conf.my_id)?; + let sk = SafeKeeper::new(TimelineState::new(control_store), wal_store, conf.my_id)?; Ok(Self { - sk, + sk: StateSK::Loaded(sk), peers_info: PeersInfo(vec![]), wal_removal_on_hold: false, }) @@ -220,18 +364,30 @@ impl SharedState { bail!(TimelineError::UninitializedWalSegSize(*ttid)); } - let wal_store = - wal_storage::PhysicalStorage::new(ttid, timeline_dir, conf, &control_store)?; + let sk = match control_store.eviction_state { + EvictionState::Present => { + let wal_store = + wal_storage::PhysicalStorage::new(ttid, timeline_dir, conf, &control_store)?; + StateSK::Loaded(SafeKeeper::new( + TimelineState::new(control_store), + wal_store, + conf.my_id, + )?) 
+ } + EvictionState::Offloaded(_) => { + StateSK::Offloaded(Box::new(TimelineState::new(control_store))) + } + }; Ok(Self { - sk: SafeKeeper::new(control_store, wal_store, conf.my_id)?, + sk, peers_info: PeersInfo(vec![]), wal_removal_on_hold: false, }) } pub(crate) fn get_wal_seg_size(&self) -> usize { - self.sk.state.server.wal_seg_size as usize + self.sk.state().server.wal_seg_size as usize } fn get_safekeeper_info( @@ -246,20 +402,20 @@ impl SharedState { tenant_id: ttid.tenant_id.as_ref().to_owned(), timeline_id: ttid.timeline_id.as_ref().to_owned(), }), - term: self.sk.state.acceptor_state.term, - last_log_term: self.sk.get_last_log_term(), + term: self.sk.state().acceptor_state.term, + last_log_term: self.sk.last_log_term(), flush_lsn: self.sk.flush_lsn().0, // note: this value is not flushed to control file yet and can be lost - commit_lsn: self.sk.state.inmem.commit_lsn.0, - remote_consistent_lsn: self.sk.state.inmem.remote_consistent_lsn.0, - peer_horizon_lsn: self.sk.state.inmem.peer_horizon_lsn.0, + commit_lsn: self.sk.state().inmem.commit_lsn.0, + remote_consistent_lsn: self.sk.state().inmem.remote_consistent_lsn.0, + peer_horizon_lsn: self.sk.state().inmem.peer_horizon_lsn.0, safekeeper_connstr: conf .advertise_pg_addr .to_owned() .unwrap_or(conf.listen_pg_addr.clone()), http_connstr: conf.listen_http_addr.to_owned(), - backup_lsn: self.sk.state.inmem.backup_lsn.0, - local_start_lsn: self.sk.state.local_start_lsn.0, + backup_lsn: self.sk.state().inmem.backup_lsn.0, + local_start_lsn: self.sk.state().local_start_lsn.0, availability_zone: conf.availability_zone.clone(), standby_horizon: standby_apply_lsn.0, } @@ -335,6 +491,7 @@ pub struct Timeline { walsenders: Arc, walreceivers: Arc, timeline_dir: Utf8PathBuf, + manager_ctl: ManagerCtl, /// Delete/cancel will trigger this, background tasks should drop out as soon as it fires pub(crate) cancel: CancellationToken, @@ -343,6 +500,7 @@ pub struct Timeline { pub(crate) broker_active: AtomicBool, pub(crate) wal_backup_active: AtomicBool, pub(crate) last_removed_segno: AtomicU64, + pub(crate) mgr_status: AtomicStatus, } impl Timeline { @@ -352,9 +510,9 @@ impl Timeline { let shared_state = SharedState::restore(conf, &ttid)?; let (commit_lsn_watch_tx, commit_lsn_watch_rx) = - watch::channel(shared_state.sk.state.commit_lsn); + watch::channel(shared_state.sk.state().commit_lsn); let (term_flush_lsn_watch_tx, term_flush_lsn_watch_rx) = watch::channel(TermLsn::from(( - shared_state.sk.get_term(), + shared_state.sk.last_log_term(), shared_state.sk.flush_lsn(), ))); let (shared_state_version_tx, shared_state_version_rx) = watch::channel(0); @@ -373,9 +531,11 @@ impl Timeline { walreceivers, cancel: CancellationToken::default(), timeline_dir: get_timeline_dir(conf, &ttid), + manager_ctl: ManagerCtl::new(), broker_active: AtomicBool::new(false), wal_backup_active: AtomicBool::new(false), last_removed_segno: AtomicU64::new(0), + mgr_status: AtomicStatus::new(), }) } @@ -409,9 +569,11 @@ impl Timeline { walreceivers, cancel: CancellationToken::default(), timeline_dir: get_timeline_dir(conf, &ttid), + manager_ctl: ManagerCtl::new(), broker_active: AtomicBool::new(false), wal_backup_active: AtomicBool::new(false), last_removed_segno: AtomicU64::new(0), + mgr_status: AtomicStatus::new(), }) } @@ -442,7 +604,7 @@ impl Timeline { fs::create_dir_all(&self.timeline_dir).await?; // Write timeline to disk and start background tasks. 
- if let Err(e) = shared_state.sk.state.flush().await { + if let Err(e) = shared_state.sk.state_mut().flush().await { // Bootstrap failed, cancel timeline and remove timeline directory. self.cancel(shared_state); @@ -465,12 +627,16 @@ impl Timeline { conf: &SafeKeeperConf, broker_active_set: Arc, ) { + let (tx, rx) = self.manager_ctl.bootstrap_manager(); + // Start manager task which will monitor timeline state and update // background tasks. tokio::spawn(timeline_manager::main_task( - self.clone(), + ManagerTimeline { tli: self.clone() }, conf.clone(), broker_active_set, + tx, + rx, )); } @@ -507,7 +673,7 @@ impl Timeline { self.cancel.cancel(); // Close associated FDs. Nobody will be able to touch timeline data once // it is cancelled, so WAL storage won't be opened again. - shared_state.sk.wal_store.close(); + shared_state.sk.close_wal_store(); } /// Returns if timeline is cancelled. @@ -547,12 +713,15 @@ impl Timeline { /// Returns state of the timeline. pub async fn get_state(&self) -> (TimelineMemState, TimelinePersistentState) { let state = self.read_shared_state().await; - (state.sk.state.inmem.clone(), state.sk.state.clone()) + ( + state.sk.state().inmem.clone(), + TimelinePersistentState::clone(state.sk.state()), + ) } /// Returns latest backup_lsn. pub async fn get_wal_backup_lsn(&self) -> Lsn { - self.read_shared_state().await.sk.state.inmem.backup_lsn + self.read_shared_state().await.sk.state().inmem.backup_lsn } /// Sets backup_lsn to the given value. @@ -562,7 +731,7 @@ impl Timeline { } let mut state = self.write_shared_state().await; - state.sk.state.inmem.backup_lsn = max(state.sk.state.inmem.backup_lsn, backup_lsn); + state.sk.state_mut().inmem.backup_lsn = max(state.sk.state().inmem.backup_lsn, backup_lsn); // we should check whether to shut down offloader, but this will be done // soon by peer communication anyway. Ok(()) @@ -604,7 +773,7 @@ impl Timeline { /// Returns flush_lsn. pub async fn get_flush_lsn(&self) -> Lsn { - self.read_shared_state().await.sk.wal_store.flush_lsn() + self.read_shared_state().await.sk.flush_lsn() } /// Gather timeline data for metrics. 
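// Editorial summary of the accessor pattern applied throughout this file
// (illustrative, not an additional hunk): safekeeper internals are no longer
// reached directly, because an offloaded timeline has no WAL storage at all.
//
//     // before                               after
//     state.sk.state.inmem.commit_lsn    ->  state.sk.state().inmem.commit_lsn
//     state.sk.state.clone()             ->  TimelinePersistentState::clone(state.sk.state())
//     state.sk.wal_store.flush_lsn()     ->  state.sk.flush_lsn()
//     state.sk.wal_store.get_metrics()   ->  state.sk.wal_storage_metrics()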
@@ -623,11 +792,11 @@ impl Timeline { timeline_is_active: self.broker_active.load(Ordering::Relaxed), num_computes: self.walreceivers.get_num() as u32, last_removed_segno: self.last_removed_segno.load(Ordering::Relaxed), - epoch_start_lsn: state.sk.term_start_lsn, - mem_state: state.sk.state.inmem.clone(), - persisted_state: state.sk.state.clone(), - flush_lsn: state.sk.wal_store.flush_lsn(), - wal_storage: state.sk.wal_store.get_metrics(), + epoch_start_lsn: state.sk.term_start_lsn(), + mem_state: state.sk.state().inmem.clone(), + persisted_state: TimelinePersistentState::clone(state.sk.state()), + flush_lsn: state.sk.flush_lsn(), + wal_storage: state.sk.wal_storage_metrics(), }) } @@ -636,7 +805,7 @@ impl Timeline { let state = self.read_shared_state().await; let (write_lsn, write_record_lsn, flush_lsn, file_open) = - state.sk.wal_store.internal_state(); + state.sk.wal_storage_internal_state(); debug_dump::Memory { is_cancelled: self.is_cancelled(), @@ -646,8 +815,9 @@ impl Timeline { active: self.broker_active.load(Ordering::Relaxed), num_computes: self.walreceivers.get_num() as u32, last_removed_segno: self.last_removed_segno.load(Ordering::Relaxed), - epoch_start_lsn: state.sk.term_start_lsn, - mem_state: state.sk.state.inmem.clone(), + epoch_start_lsn: state.sk.term_start_lsn(), + mem_state: state.sk.state().inmem.clone(), + mgr_status: self.mgr_status.get(), write_lsn, write_record_lsn, flush_lsn, @@ -661,34 +831,77 @@ impl Timeline { f: impl FnOnce(&mut TimelinePersistentState) -> Result, ) -> Result { let mut state = self.write_shared_state().await; - let mut persistent_state = state.sk.state.start_change(); + let mut persistent_state = state.sk.state_mut().start_change(); // If f returns error, we abort the change and don't persist anything. let res = f(&mut persistent_state)?; // If persisting fails, we abort the change and return error. - state.sk.state.finish_change(&persistent_state).await?; + state + .sk + .state_mut() + .finish_change(&persistent_state) + .await?; Ok(res) } /// Get the timeline guard for reading/writing WAL files. - /// TODO: if WAL files are not present on disk (evicted), they will be - /// downloaded from S3. Also there will logic for preventing eviction - /// while someone is holding FullAccessTimeline guard. - pub async fn full_access_guard(self: &Arc) -> Result { + /// If WAL files are not present on disk (evicted), they will be automatically + /// downloaded from remote storage. This is done in the manager task, which is + /// responsible for issuing all guards. + /// + /// NB: don't use this function from timeline_manager, it will deadlock. + /// NB: don't use this function while holding shared_state lock. + pub async fn wal_residence_guard(self: &Arc) -> Result { if self.is_cancelled() { bail!(TimelineError::Cancelled(self.ttid)); } - Ok(FullAccessTimeline { tli: self.clone() }) + + debug!("requesting WalResidentTimeline guard"); + + // Wait 5 seconds for the guard to be acquired, should be enough for uneviction. + // If it times out, most likely there is a deadlock in the manager task. 
+ let res = tokio::time::timeout( + Duration::from_secs(5), + self.manager_ctl.wal_residence_guard(), + ) + .await; + + let guard = match res { + Ok(Ok(guard)) => guard, + Ok(Err(e)) => { + warn!( + "error while acquiring WalResidentTimeline guard (current state {:?}): {}", + self.mgr_status.get(), + e + ); + return Err(e); + } + Err(_) => { + warn!( + "timeout while acquiring WalResidentTimeline guard (current state {:?})", + self.mgr_status.get() + ); + anyhow::bail!("timeout while acquiring WalResidentTimeline guard"); + } + }; + + Ok(WalResidentTimeline::new(self.clone(), guard)) } } /// This is a guard that allows to read/write disk timeline state. -/// All tasks that are using the disk should use this guard. -#[derive(Clone)] -pub struct FullAccessTimeline { +/// All tasks that are trying to read/write WAL from disk should use this guard. +pub struct WalResidentTimeline { pub tli: Arc, + _guard: ResidenceGuard, } -impl Deref for FullAccessTimeline { +impl WalResidentTimeline { + pub fn new(tli: Arc, _guard: ResidenceGuard) -> Self { + WalResidentTimeline { tli, _guard } + } +} + +impl Deref for WalResidentTimeline { type Target = Arc; fn deref(&self) -> &Self::Target { @@ -696,7 +909,7 @@ impl Deref for FullAccessTimeline { } } -impl FullAccessTimeline { +impl WalResidentTimeline { /// Returns true if walsender should stop sending WAL to pageserver. We /// terminate it if remote_consistent_lsn reached commit_lsn and there is no /// computes. While there might be nothing to stream already, we learn about @@ -708,8 +921,8 @@ impl FullAccessTimeline { } let shared_state = self.read_shared_state().await; if self.walreceivers.get_num() == 0 { - return shared_state.sk.state.inmem.commit_lsn == Lsn(0) || // no data at all yet - reported_remote_consistent_lsn >= shared_state.sk.state.inmem.commit_lsn; + return shared_state.sk.state().inmem.commit_lsn == Lsn(0) || // no data at all yet + reported_remote_consistent_lsn >= shared_state.sk.state().inmem.commit_lsn; } false } @@ -717,11 +930,11 @@ impl FullAccessTimeline { /// Ensure that current term is t, erroring otherwise, and lock the state. pub async fn acquire_term(&self, t: Term) -> Result { let ss = self.read_shared_state().await; - if ss.sk.state.acceptor_state.term != t { + if ss.sk.state().acceptor_state.term != t { bail!( "failed to acquire term {}, current term {}", t, - ss.sk.state.acceptor_state.term + ss.sk.state().acceptor_state.term ); } Ok(ss) @@ -739,7 +952,7 @@ impl FullAccessTimeline { let mut rmsg: Option; { let mut shared_state = self.write_shared_state().await; - rmsg = shared_state.sk.process_msg(msg).await?; + rmsg = shared_state.sk.safekeeper().process_msg(msg).await?; // if this is AppendResponse, fill in proper hot standby feedback. if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg { @@ -769,8 +982,141 @@ impl FullAccessTimeline { /// Update in memory remote consistent lsn. pub async fn update_remote_consistent_lsn(&self, candidate: Lsn) { let mut shared_state = self.write_shared_state().await; - shared_state.sk.state.inmem.remote_consistent_lsn = - max(shared_state.sk.state.inmem.remote_consistent_lsn, candidate); + shared_state.sk.state_mut().inmem.remote_consistent_lsn = max( + shared_state.sk.state().inmem.remote_consistent_lsn, + candidate, + ); + } +} + +/// This struct contains methods that are used by timeline manager task. 
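// Illustrative usage sketch: the task body below is made up, but the calls are
// the ones introduced in this patch. Tasks that need on-disk WAL upgrade the
// shared `Arc<Timeline>` into a `WalResidentTimeline`; the embedded
// `ResidenceGuard` keeps the manager from evicting the timeline until the
// guard is dropped.
//
//     async fn wal_reading_task(tli: std::sync::Arc<Timeline>) -> anyhow::Result<()> {
//         let resident = tli.wal_residence_guard().await?; // unevicts if needed
//         let upto = resident.get_flush_lsn().await;
//         // ... read WAL up to `upto` from the timeline directory ...
//         drop(resident); // eviction becomes possible again
//         Ok(())
//     }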
+pub(crate) struct ManagerTimeline { + pub(crate) tli: Arc, +} + +impl Deref for ManagerTimeline { + type Target = Arc; + + fn deref(&self) -> &Self::Target { + &self.tli + } +} + +impl ManagerTimeline { + pub(crate) fn timeline_dir(&self) -> &Utf8PathBuf { + &self.tli.timeline_dir + } + + /// Manager requests this state on startup. + pub(crate) async fn bootstrap_mgr(&self) -> (bool, Option) { + let shared_state = self.read_shared_state().await; + let is_offloaded = matches!( + shared_state.sk.state().eviction_state, + EvictionState::Offloaded(_) + ); + let partial_backup_uploaded = shared_state.sk.state().partial_backup.uploaded_segment(); + + (is_offloaded, partial_backup_uploaded) + } + + /// Try to switch state Present->Offloaded. + pub(crate) async fn switch_to_offloaded( + &self, + partial: &PartialRemoteSegment, + ) -> anyhow::Result<()> { + let mut shared = self.write_shared_state().await; + + // updating control file + let mut pstate = shared.sk.state_mut().start_change(); + + if !matches!(pstate.eviction_state, EvictionState::Present) { + bail!( + "cannot switch to offloaded state, current state is {:?}", + pstate.eviction_state + ); + } + + if partial.flush_lsn != shared.sk.flush_lsn() { + bail!( + "flush_lsn mismatch in partial backup, expected {}, got {}", + shared.sk.flush_lsn(), + partial.flush_lsn + ); + } + + if partial.commit_lsn != pstate.commit_lsn { + bail!( + "commit_lsn mismatch in partial backup, expected {}, got {}", + pstate.commit_lsn, + partial.commit_lsn + ); + } + + if partial.term != shared.sk.last_log_term() { + bail!( + "term mismatch in partial backup, expected {}, got {}", + shared.sk.last_log_term(), + partial.term + ); + } + + pstate.eviction_state = EvictionState::Offloaded(shared.sk.flush_lsn()); + shared.sk.state_mut().finish_change(&pstate).await?; + // control file is now switched to Offloaded state + + // now we can switch shared.sk to Offloaded, shouldn't fail + let prev_sk = std::mem::replace(&mut shared.sk, StateSK::Empty); + let cfile_state = prev_sk.take_state(); + shared.sk = StateSK::Offloaded(Box::new(cfile_state)); + + Ok(()) + } + + /// Try to switch state Offloaded->Present. + pub(crate) async fn switch_to_present(&self) -> anyhow::Result<()> { + let conf = GlobalTimelines::get_global_config(); + let mut shared = self.write_shared_state().await; + + // trying to restore WAL storage + let wal_store = wal_storage::PhysicalStorage::new( + &self.ttid, + self.timeline_dir.clone(), + &conf, + shared.sk.state(), + )?; + + // updating control file + let mut pstate = shared.sk.state_mut().start_change(); + + if !matches!(pstate.eviction_state, EvictionState::Offloaded(_)) { + bail!( + "cannot switch to present state, current state is {:?}", + pstate.eviction_state + ); + } + + if wal_store.flush_lsn() != shared.sk.flush_lsn() { + bail!( + "flush_lsn mismatch in restored WAL, expected {}, got {}", + shared.sk.flush_lsn(), + wal_store.flush_lsn() + ); + } + + pstate.eviction_state = EvictionState::Present; + shared.sk.state_mut().finish_change(&pstate).await?; + + // now we can switch shared.sk to Present, shouldn't fail + let prev_sk = std::mem::replace(&mut shared.sk, StateSK::Empty); + let cfile_state = prev_sk.take_state(); + shared.sk = StateSK::Loaded(SafeKeeper::new(cfile_state, wal_store, conf.my_id)?); + + Ok(()) + } + + /// Update current manager state, useful for debugging manager deadlocks. 
+ pub(crate) fn set_status(&self, status: timeline_manager::Status) { + self.mgr_status.store(status, Ordering::Relaxed); } } @@ -784,13 +1130,13 @@ async fn delete_dir(path: &Utf8PathBuf) -> Result { } /// Get a path to the tenant directory. If you just need to get a timeline directory, -/// use FullAccessTimeline::get_timeline_dir instead. +/// use WalResidentTimeline::get_timeline_dir instead. pub(crate) fn get_tenant_dir(conf: &SafeKeeperConf, tenant_id: &TenantId) -> Utf8PathBuf { conf.workdir.join(tenant_id.to_string()) } /// Get a path to the timeline directory. If you need to read WAL files from disk, -/// use FullAccessTimeline::get_timeline_dir instead. This function does not check +/// use WalResidentTimeline::get_timeline_dir instead. This function does not check /// timeline eviction status and WAL files might not be present on disk. pub(crate) fn get_timeline_dir(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Utf8PathBuf { get_tenant_dir(conf, &ttid.tenant_id).join(ttid.timeline_id.to_string()) diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs new file mode 100644 index 0000000000..b303d41b7b --- /dev/null +++ b/safekeeper/src/timeline_eviction.rs @@ -0,0 +1,366 @@ +//! Code related to evicting WAL files to remote storage. The actual upload is done by the +//! partial WAL backup code. This file has code to delete and re-download WAL files, +//! cross-validate with partial WAL backup if local file is still present. + +use anyhow::Context; +use camino::Utf8PathBuf; +use remote_storage::RemotePath; +use tokio::{ + fs::File, + io::{AsyncRead, AsyncWriteExt}, +}; +use tracing::{debug, info, instrument, warn}; +use utils::crashsafe::durable_rename; + +use crate::{ + timeline_manager::{Manager, StateSnapshot}, + wal_backup, + wal_backup_partial::{self, PartialRemoteSegment}, + wal_storage::wal_file_paths, +}; + +impl Manager { + /// Returns true if the timeline is ready for eviction. + /// Current criteria: + /// - no active tasks + /// - control file is flushed (no next event scheduled) + /// - no WAL residence guards + /// - no pushes to the broker + /// - partial WAL backup is uploaded + pub(crate) fn ready_for_eviction( + &self, + next_event: &Option, + state: &StateSnapshot, + ) -> bool { + self.backup_task.is_none() + && self.recovery_task.is_none() + && self.wal_removal_task.is_none() + && self.partial_backup_task.is_none() + && self.partial_backup_uploaded.is_some() + && next_event.is_none() + && self.access_service.is_empty() + && !self.tli_broker_active.get() + && !wal_backup_partial::needs_uploading(state, &self.partial_backup_uploaded) + && self + .partial_backup_uploaded + .as_ref() + .unwrap() + .flush_lsn + .segment_number(self.wal_seg_size) + == self.last_removed_segno + 1 + } + + /// Evict the timeline to remote storage. + #[instrument(name = "evict_timeline", skip_all)] + pub(crate) async fn evict_timeline(&mut self) { + assert!(!self.is_offloaded); + let partial_backup_uploaded = match &self.partial_backup_uploaded { + Some(p) => p.clone(), + None => { + warn!("no partial backup uploaded, skipping eviction"); + return; + } + }; + + info!("starting eviction, using {:?}", partial_backup_uploaded); + + if let Err(e) = do_eviction(self, &partial_backup_uploaded).await { + warn!("failed to evict timeline: {:?}", e); + return; + } + + info!("successfully evicted timeline"); + } + + /// Restore evicted timeline from remote storage. 
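Both `switch_to_offloaded` and `switch_to_present` above move the previous `StateSK` value out from behind `&mut shared.sk` by parking the `Empty` placeholder there first. A stripped-down sketch of that ownership idiom, with toy stand-in types (`Sk`, `State` are not the patch's types):

```rust
// Sketch of the swap-out-via-placeholder idiom used for the StateSK transitions.
use std::mem;

#[derive(Debug)]
struct State(String); // stand-in for the control-file state carried by StateSK

#[derive(Debug)]
enum Sk {
    Loaded(State),
    Offloaded(Box<State>),
    Empty, // placeholder; only exists for the instant of the swap
}

impl Sk {
    fn take_state(self) -> State {
        match self {
            Sk::Loaded(s) => s,
            Sk::Offloaded(s) => *s,
            Sk::Empty => unreachable!("Empty is never observed outside the swap"),
        }
    }
}

fn switch_to_offloaded(slot: &mut Sk) {
    // We cannot move out of `*slot` directly, so park `Empty` there first...
    let prev = mem::replace(slot, Sk::Empty);
    // ...take ownership of the inner state, and write the new variant back.
    let state = prev.take_state();
    *slot = Sk::Offloaded(Box::new(state));
}

fn main() {
    let mut slot = Sk::Loaded(State("control file".into()));
    switch_to_offloaded(&mut slot);
    println!("{slot:?}");
}
```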
+ #[instrument(name = "unevict_timeline", skip_all)] + pub(crate) async fn unevict_timeline(&mut self) { + assert!(self.is_offloaded); + let partial_backup_uploaded = match &self.partial_backup_uploaded { + Some(p) => p.clone(), + None => { + warn!("no partial backup uploaded, cannot unevict"); + return; + } + }; + + info!("starting uneviction, using {:?}", partial_backup_uploaded); + + if let Err(e) = do_uneviction(self, &partial_backup_uploaded).await { + warn!("failed to unevict timeline: {:?}", e); + return; + } + + info!("successfully restored evicted timeline"); + } +} + +/// Ensure that content matches the remote partial backup, if local segment exists. +/// Then change state in control file and in-memory. If `delete_offloaded_wal` is set, +/// delete the local segment. +async fn do_eviction(mgr: &mut Manager, partial: &PartialRemoteSegment) -> anyhow::Result<()> { + compare_local_segment_with_remote(mgr, partial).await?; + + mgr.tli.switch_to_offloaded(partial).await?; + // switch manager state as soon as possible + mgr.is_offloaded = true; + + if mgr.conf.delete_offloaded_wal { + delete_local_segment(mgr, partial).await?; + } + + Ok(()) +} + +/// Ensure that content matches the remote partial backup, if local segment exists. +/// Then download segment to local disk and change state in control file and in-memory. +async fn do_uneviction(mgr: &mut Manager, partial: &PartialRemoteSegment) -> anyhow::Result<()> { + // if the local segment is present, validate it + compare_local_segment_with_remote(mgr, partial).await?; + + // atomically download the partial segment + redownload_partial_segment(mgr, partial).await?; + + mgr.tli.switch_to_present().await?; + // switch manager state as soon as possible + mgr.is_offloaded = false; + + Ok(()) +} + +/// Delete local WAL segment. +async fn delete_local_segment(mgr: &Manager, partial: &PartialRemoteSegment) -> anyhow::Result<()> { + let local_path = local_segment_path(mgr, partial); + + info!("deleting WAL file to evict: {}", local_path); + tokio::fs::remove_file(&local_path).await?; + Ok(()) +} + +/// Redownload partial segment from remote storage. +/// The segment is downloaded to a temporary file and then renamed to the final path. +async fn redownload_partial_segment( + mgr: &Manager, + partial: &PartialRemoteSegment, +) -> anyhow::Result<()> { + let tmp_file = mgr.tli.timeline_dir().join("remote_partial.tmp"); + let remote_segfile = remote_segment_path(mgr, partial)?; + + debug!( + "redownloading partial segment: {} -> {}", + remote_segfile, tmp_file + ); + + let mut reader = wal_backup::read_object(&remote_segfile, 0).await?; + let mut file = File::create(&tmp_file).await?; + + let actual_len = tokio::io::copy(&mut reader, &mut file).await?; + let expected_len = partial.flush_lsn.segment_offset(mgr.wal_seg_size); + + if actual_len != expected_len as u64 { + anyhow::bail!( + "partial downloaded {} bytes, expected {}", + actual_len, + expected_len + ); + } + + if actual_len > mgr.wal_seg_size as u64 { + anyhow::bail!( + "remote segment is too long: {} bytes, expected {}", + actual_len, + mgr.wal_seg_size + ); + } + file.set_len(mgr.wal_seg_size as u64).await?; + file.flush().await?; + + let final_path = local_segment_path(mgr, partial); + info!( + "downloaded {} bytes, renaming to {}", + final_path, final_path, + ); + if let Err(e) = durable_rename(&tmp_file, &final_path, !mgr.conf.no_sync).await { + // Probably rename succeeded, but fsync of it failed. Remove + // the file then to avoid using it. 
+ tokio::fs::remove_file(tmp_file) + .await + .or_else(utils::fs_ext::ignore_not_found)?; + return Err(e.into()); + } + + Ok(()) +} + +/// Compare local WAL segment with partial WAL backup in remote storage. +/// If the local segment is not present, the function does nothing. +/// If the local segment is present, it compares the local segment with the remote one. +async fn compare_local_segment_with_remote( + mgr: &Manager, + partial: &PartialRemoteSegment, +) -> anyhow::Result<()> { + let local_path = local_segment_path(mgr, partial); + + match File::open(&local_path).await { + Ok(mut local_file) => do_validation(mgr, &mut local_file, mgr.wal_seg_size, partial) + .await + .context("validation failed"), + Err(_) => { + info!( + "local WAL file {} is not present, skipping validation", + local_path + ); + Ok(()) + } + } +} + +/// Compare opened local WAL segment with partial WAL backup in remote storage. +/// Validate full content of both files. +async fn do_validation( + mgr: &Manager, + file: &mut File, + wal_seg_size: usize, + partial: &PartialRemoteSegment, +) -> anyhow::Result<()> { + let local_size = file.metadata().await?.len() as usize; + if local_size != wal_seg_size { + anyhow::bail!( + "local segment size is invalid: found {}, expected {}", + local_size, + wal_seg_size + ); + } + + let remote_segfile = remote_segment_path(mgr, partial)?; + let mut remote_reader: std::pin::Pin> = + wal_backup::read_object(&remote_segfile, 0).await?; + + // remote segment should have bytes excatly up to `flush_lsn` + let expected_remote_size = partial.flush_lsn.segment_offset(mgr.wal_seg_size); + // let's compare the first `expected_remote_size` bytes + compare_n_bytes(&mut remote_reader, file, expected_remote_size).await?; + // and check that the remote segment ends here + check_end(&mut remote_reader).await?; + + // if local segment is longer, the rest should be zeroes + read_n_zeroes(file, mgr.wal_seg_size - expected_remote_size).await?; + // and check that the local segment ends here + check_end(file).await?; + + Ok(()) +} + +fn local_segment_path(mgr: &Manager, partial: &PartialRemoteSegment) -> Utf8PathBuf { + let flush_lsn = partial.flush_lsn; + let segno = flush_lsn.segment_number(mgr.wal_seg_size); + let (_, local_partial_segfile) = + wal_file_paths(mgr.tli.timeline_dir(), segno, mgr.wal_seg_size); + local_partial_segfile +} + +fn remote_segment_path( + mgr: &Manager, + partial: &PartialRemoteSegment, +) -> anyhow::Result { + let remote_timeline_path = wal_backup::remote_timeline_path(&mgr.tli.ttid)?; + Ok(partial.remote_path(&remote_timeline_path)) +} + +/// Compare first `n` bytes of two readers. If the bytes differ, return an error. +/// If the readers are shorter than `n`, return an error. 
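The layout `do_validation` expects can be summarized as: the remote partial object is exactly the first `flush_lsn.segment_offset(wal_seg_size)` bytes of the local file, and everything after that offset in the full-size local file is zero padding. A self-contained sketch of those three checks over in-memory buffers; `seg_size` and `uploaded` are made-up numbers, and the real code streams in 32 KiB chunks rather than reading whole buffers at once.

```rust
// Sketch only: checks the same three properties as do_validation, on in-memory data.
use tokio::io::AsyncReadExt;

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let seg_size = 16usize; // stands in for wal_seg_size
    let uploaded = 10usize; // stands in for flush_lsn.segment_offset(wal_seg_size)

    // "local" segment: real WAL bytes followed by zero padding up to seg_size
    let mut local = vec![7u8; uploaded];
    local.resize(seg_size, 0);
    // "remote" partial object: only the uploaded prefix
    let remote = local[..uploaded].to_vec();

    let mut local_rd: &[u8] = &local; // &[u8] implements AsyncRead
    let mut remote_rd: &[u8] = &remote;

    // 1) the first `uploaded` bytes must be identical
    let (mut a, mut b) = (vec![0u8; uploaded], vec![0u8; uploaded]);
    local_rd.read_exact(&mut a).await?;
    remote_rd.read_exact(&mut b).await?;
    anyhow::ensure!(a == b, "mismatch in the uploaded prefix");

    // 2) the remote object must end exactly at that offset
    anyhow::ensure!(remote_rd.read_u8().await.is_err(), "remote object longer than expected");

    // 3) the remainder of the local segment must be zeroes, then EOF
    let mut tail = Vec::new();
    local_rd.read_to_end(&mut tail).await?;
    anyhow::ensure!(tail.len() == seg_size - uploaded && tail.iter().all(|&x| x == 0));

    println!("segment layout is consistent");
    Ok(())
}
```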
+async fn compare_n_bytes(reader1: &mut R1, reader2: &mut R2, n: usize) -> anyhow::Result<()> +where + R1: AsyncRead + Unpin, + R2: AsyncRead + Unpin, +{ + use tokio::io::AsyncReadExt; + + const BUF_SIZE: usize = 32 * 1024; + + let mut buffer1 = vec![0u8; BUF_SIZE]; + let mut buffer2 = vec![0u8; BUF_SIZE]; + + let mut offset = 0; + + while offset < n { + let bytes_to_read = std::cmp::min(BUF_SIZE, n - offset); + + let bytes_read1 = reader1 + .read(&mut buffer1[..bytes_to_read]) + .await + .with_context(|| format!("failed to read from reader1 at offset {}", offset))?; + if bytes_read1 == 0 { + anyhow::bail!("unexpected EOF from reader1 at offset {}", offset); + } + + let bytes_read2 = reader2 + .read_exact(&mut buffer2[..bytes_read1]) + .await + .with_context(|| { + format!( + "failed to read {} bytes from reader2 at offset {}", + bytes_read1, offset + ) + })?; + assert!(bytes_read2 == bytes_read1); + + if buffer1[..bytes_read1] != buffer2[..bytes_read2] { + let diff_offset = buffer1[..bytes_read1] + .iter() + .zip(buffer2[..bytes_read2].iter()) + .position(|(a, b)| a != b) + .expect("mismatched buffers, but no difference found"); + anyhow::bail!("mismatch at offset {}", offset + diff_offset); + } + + offset += bytes_read1; + } + + Ok(()) +} + +async fn check_end(mut reader: R) -> anyhow::Result<()> +where + R: AsyncRead + Unpin, +{ + use tokio::io::AsyncReadExt; + + let mut buffer = [0u8; 1]; + let bytes_read = reader.read(&mut buffer).await?; + if bytes_read != 0 { + anyhow::bail!("expected EOF, found bytes"); + } + Ok(()) +} + +async fn read_n_zeroes(reader: &mut R, n: usize) -> anyhow::Result<()> +where + R: AsyncRead + Unpin, +{ + use tokio::io::AsyncReadExt; + + const BUF_SIZE: usize = 32 * 1024; + let mut buffer = vec![0u8; BUF_SIZE]; + let mut offset = 0; + + while offset < n { + let bytes_to_read = std::cmp::min(BUF_SIZE, n - offset); + + let bytes_read = reader + .read(&mut buffer[..bytes_to_read]) + .await + .context("expected zeroes, got read error")?; + if bytes_read == 0 { + anyhow::bail!("expected zeroes, got EOF"); + } + + if buffer[..bytes_read].iter().all(|&b| b == 0) { + offset += bytes_read; + } else { + anyhow::bail!("non-zero byte found"); + } + } + + Ok(()) +} diff --git a/safekeeper/src/timeline_guard.rs b/safekeeper/src/timeline_guard.rs new file mode 100644 index 0000000000..e249c859b4 --- /dev/null +++ b/safekeeper/src/timeline_guard.rs @@ -0,0 +1,71 @@ +//! Timeline residence guard is needed to ensure that WAL segments are present on disk, +//! as long as the code is holding the guard. This file implements guard logic, to issue +//! and drop guards, and to notify the manager when the guard is dropped. + +use std::collections::HashSet; + +use tracing::{debug, warn}; + +use crate::timeline_manager::ManagerCtlMessage; + +#[derive(Debug, Clone, Copy)] +pub struct GuardId(u64); + +pub struct ResidenceGuard { + manager_tx: tokio::sync::mpsc::UnboundedSender, + guard_id: GuardId, +} + +impl Drop for ResidenceGuard { + fn drop(&mut self) { + // notify the manager that the guard is dropped + let res = self + .manager_tx + .send(ManagerCtlMessage::GuardDrop(self.guard_id)); + if let Err(e) = res { + warn!("failed to send GuardDrop message: {:?}", e); + } + } +} + +/// AccessService is responsible for issuing and dropping residence guards. +/// All guards are stored in the `guards` set. +/// TODO: it's possible to add `String` name to each guard, for better observability. 
+pub(crate) struct AccessService { + next_guard_id: u64, + guards: HashSet, + manager_tx: tokio::sync::mpsc::UnboundedSender, +} + +impl AccessService { + pub(crate) fn new(manager_tx: tokio::sync::mpsc::UnboundedSender) -> Self { + Self { + next_guard_id: 0, + guards: HashSet::new(), + manager_tx, + } + } + + pub(crate) fn is_empty(&self) -> bool { + self.guards.is_empty() + } + + pub(crate) fn create_guard(&mut self) -> ResidenceGuard { + let guard_id = self.next_guard_id; + self.next_guard_id += 1; + self.guards.insert(guard_id); + + let guard_id = GuardId(guard_id); + debug!("issued a new guard {:?}", guard_id); + + ResidenceGuard { + manager_tx: self.manager_tx.clone(), + guard_id, + } + } + + pub(crate) fn drop_guard(&mut self, guard_id: GuardId) { + debug!("dropping guard {:?}", guard_id); + assert!(self.guards.remove(&guard_id.0)); + } +} diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index 592426bba3..c3abeac644 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -2,66 +2,83 @@ //! It is spawned alongside each timeline and exits when the timeline is deleted. //! It watches for changes in the timeline state and decides when to spawn or kill background tasks. //! It also can manage some reactive state, like should the timeline be active for broker pushes or not. +//! +//! Be aware that you need to be extra careful with manager code, because it is not respawned on panic. +//! Also, if it will stuck in some branch, it will prevent any further progress in the timeline. use std::{ - sync::Arc, - time::{Duration, Instant}, + sync::{atomic::AtomicUsize, Arc}, + time::Duration, }; use postgres_ffi::XLogSegNo; -use tokio::task::{JoinError, JoinHandle}; -use tracing::{info, info_span, instrument, warn, Instrument}; +use serde::{Deserialize, Serialize}; +use tokio::{ + task::{JoinError, JoinHandle}, + time::Instant, +}; +use tracing::{debug, info, info_span, instrument, warn, Instrument}; use utils::lsn::Lsn; use crate::{ - control_file::Storage, + control_file::{FileStorage, Storage}, metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL}, recovery::recovery_main, remove_wal::calc_horizon_lsn, + safekeeper::Term, send_wal::WalSenders, - timeline::{PeerInfo, ReadGuardSharedState, Timeline}, + state::TimelineState, + timeline::{ManagerTimeline, PeerInfo, ReadGuardSharedState, StateSK, WalResidentTimeline}, + timeline_guard::{AccessService, GuardId, ResidenceGuard}, timelines_set::{TimelineSetGuard, TimelinesSet}, wal_backup::{self, WalBackupTaskHandle}, - wal_backup_partial, SafeKeeperConf, + wal_backup_partial::{self, PartialRemoteSegment}, + SafeKeeperConf, }; -pub struct StateSnapshot { +pub(crate) struct StateSnapshot { // inmem values - pub commit_lsn: Lsn, - pub backup_lsn: Lsn, - pub remote_consistent_lsn: Lsn, + pub(crate) commit_lsn: Lsn, + pub(crate) backup_lsn: Lsn, + pub(crate) remote_consistent_lsn: Lsn, // persistent control file values - pub cfile_peer_horizon_lsn: Lsn, - pub cfile_remote_consistent_lsn: Lsn, - pub cfile_backup_lsn: Lsn, + pub(crate) cfile_peer_horizon_lsn: Lsn, + pub(crate) cfile_remote_consistent_lsn: Lsn, + pub(crate) cfile_backup_lsn: Lsn, + + // latest state + pub(crate) flush_lsn: Lsn, + pub(crate) last_log_term: Term, // misc - pub cfile_last_persist_at: Instant, - pub inmem_flush_pending: bool, - pub wal_removal_on_hold: bool, - pub peers: Vec, + pub(crate) cfile_last_persist_at: std::time::Instant, + pub(crate) inmem_flush_pending: bool, + pub(crate) wal_removal_on_hold: 
bool, + pub(crate) peers: Vec, } impl StateSnapshot { /// Create a new snapshot of the timeline state. fn new(read_guard: ReadGuardSharedState, heartbeat_timeout: Duration) -> Self { + let state = read_guard.sk.state(); Self { - commit_lsn: read_guard.sk.state.inmem.commit_lsn, - backup_lsn: read_guard.sk.state.inmem.backup_lsn, - remote_consistent_lsn: read_guard.sk.state.inmem.remote_consistent_lsn, - cfile_peer_horizon_lsn: read_guard.sk.state.peer_horizon_lsn, - cfile_remote_consistent_lsn: read_guard.sk.state.remote_consistent_lsn, - cfile_backup_lsn: read_guard.sk.state.backup_lsn, - cfile_last_persist_at: read_guard.sk.state.pers.last_persist_at(), - inmem_flush_pending: Self::has_unflushed_inmem_state(&read_guard), + commit_lsn: state.inmem.commit_lsn, + backup_lsn: state.inmem.backup_lsn, + remote_consistent_lsn: state.inmem.remote_consistent_lsn, + cfile_peer_horizon_lsn: state.peer_horizon_lsn, + cfile_remote_consistent_lsn: state.remote_consistent_lsn, + cfile_backup_lsn: state.backup_lsn, + flush_lsn: read_guard.sk.flush_lsn(), + last_log_term: read_guard.sk.last_log_term(), + cfile_last_persist_at: state.pers.last_persist_at(), + inmem_flush_pending: Self::has_unflushed_inmem_state(state), wal_removal_on_hold: read_guard.wal_removal_on_hold, peers: read_guard.get_peers(heartbeat_timeout), } } - fn has_unflushed_inmem_state(read_guard: &ReadGuardSharedState) -> bool { - let state = &read_guard.sk.state; + fn has_unflushed_inmem_state(state: &TimelineState) -> bool { state.inmem.commit_lsn > state.commit_lsn || state.inmem.backup_lsn > state.backup_lsn || state.inmem.peer_horizon_lsn > state.peer_horizon_lsn @@ -73,314 +90,560 @@ impl StateSnapshot { /// There is no need to check for updates more often than this. const REFRESH_INTERVAL: Duration = Duration::from_millis(300); -/// How often to save the control file if the is no other activity. -const CF_SAVE_INTERVAL: Duration = Duration::from_secs(300); +pub enum ManagerCtlMessage { + /// Request to get a guard for WalResidentTimeline, with WAL files available locally. + GuardRequest(tokio::sync::oneshot::Sender>), + /// Request to drop the guard. + GuardDrop(GuardId), +} + +impl std::fmt::Debug for ManagerCtlMessage { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + ManagerCtlMessage::GuardRequest(_) => write!(f, "GuardRequest"), + ManagerCtlMessage::GuardDrop(id) => write!(f, "GuardDrop({:?})", id), + } + } +} + +pub struct ManagerCtl { + manager_tx: tokio::sync::mpsc::UnboundedSender, + + // this is used to initialize manager, it will be moved out in bootstrap(). + init_manager_rx: + std::sync::Mutex>>, +} + +impl Default for ManagerCtl { + fn default() -> Self { + Self::new() + } +} + +impl ManagerCtl { + pub fn new() -> Self { + let (tx, rx) = tokio::sync::mpsc::unbounded_channel(); + Self { + manager_tx: tx, + init_manager_rx: std::sync::Mutex::new(Some(rx)), + } + } + + /// Issue a new guard and wait for manager to prepare the timeline. + /// Sends a message to the manager and waits for the response. + /// Can be blocked indefinitely if the manager is stuck. + pub async fn wal_residence_guard(&self) -> anyhow::Result { + let (tx, rx) = tokio::sync::oneshot::channel(); + self.manager_tx.send(ManagerCtlMessage::GuardRequest(tx))?; + + // wait for the manager to respond with the guard + rx.await + .map_err(|e| anyhow::anyhow!("response read fail: {:?}", e)) + .and_then(std::convert::identity) + } + + /// Must be called exactly once to bootstrap the manager. 
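The guard protocol has two halves: `ManagerCtl` hands out `ResidenceGuard`s through a oneshot reply, and each guard reports its own drop back over the `ManagerCtlMessage` channel so `AccessService` keeps an accurate count. A minimal sketch of the drop-notification half, with simplified stand-ins (`Guard`, `CtlMsg`) rather than the patch's types; here `GuardId` derives `Hash`/`Eq` so it can go straight into the set, which the real code does not need.

```rust
// Sketch only: an RAII guard that reports its own drop to a manager task.
use std::collections::HashSet;
use tokio::sync::mpsc;

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
struct GuardId(u64);

enum CtlMsg {
    GuardDrop(GuardId),
}

struct Guard {
    id: GuardId,
    tx: mpsc::UnboundedSender<CtlMsg>,
}

impl Drop for Guard {
    fn drop(&mut self) {
        // Best effort: the manager may already be gone during shutdown.
        let _ = self.tx.send(CtlMsg::GuardDrop(self.id));
    }
}

#[tokio::main]
async fn main() {
    let (tx, mut rx) = mpsc::unbounded_channel();

    // the "access service" bookkeeping lives in the manager task
    let manager = tokio::spawn(async move {
        let mut live: HashSet<GuardId> = HashSet::new();
        live.insert(GuardId(0)); // issued below
        while let Some(CtlMsg::GuardDrop(id)) = rx.recv().await {
            assert!(live.remove(&id), "unknown guard dropped");
            if live.is_empty() {
                // with no guards left, the timeline would be eligible for eviction
                break;
            }
        }
    });

    let guard = Guard { id: GuardId(0), tx };
    drop(guard); // Drop notifies the manager
    manager.await.unwrap();
}
```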
+ pub fn bootstrap_manager( + &self, + ) -> ( + tokio::sync::mpsc::UnboundedSender, + tokio::sync::mpsc::UnboundedReceiver, + ) { + let rx = self + .init_manager_rx + .lock() + .expect("mutex init_manager_rx poisoned") + .take() + .expect("manager already bootstrapped"); + + (self.manager_tx.clone(), rx) + } +} + +pub(crate) struct Manager { + // configuration & dependencies + pub(crate) tli: ManagerTimeline, + pub(crate) conf: SafeKeeperConf, + pub(crate) wal_seg_size: usize, + pub(crate) walsenders: Arc, + + // current state + pub(crate) state_version_rx: tokio::sync::watch::Receiver, + pub(crate) num_computes_rx: tokio::sync::watch::Receiver, + pub(crate) tli_broker_active: TimelineSetGuard, + pub(crate) last_removed_segno: XLogSegNo, + pub(crate) is_offloaded: bool, + + // background tasks + pub(crate) backup_task: Option, + pub(crate) recovery_task: Option>, + pub(crate) wal_removal_task: Option>>, + + // partial backup + pub(crate) partial_backup_task: Option>>, + pub(crate) partial_backup_uploaded: Option, + + // misc + pub(crate) access_service: AccessService, +} /// This task gets spawned alongside each timeline and is responsible for managing the timeline's /// background tasks. /// Be careful, this task is not respawned on panic, so it should not panic. #[instrument(name = "manager", skip_all, fields(ttid = %tli.ttid))] pub async fn main_task( - tli: Arc, + tli: ManagerTimeline, conf: SafeKeeperConf, broker_active_set: Arc, + manager_tx: tokio::sync::mpsc::UnboundedSender, + mut manager_rx: tokio::sync::mpsc::UnboundedReceiver, ) { + tli.set_status(Status::Started); + + let defer_tli = tli.tli.clone(); scopeguard::defer! { - if tli.is_cancelled() { + if defer_tli.is_cancelled() { info!("manager task finished"); } else { warn!("manager task finished prematurely"); } }; - // configuration & dependencies - let wal_seg_size = tli.get_wal_seg_size().await; - let heartbeat_timeout = conf.heartbeat_timeout; - let walsenders = tli.get_walsenders(); - let walreceivers = tli.get_walreceivers(); - - // current state - let mut state_version_rx = tli.get_state_version_rx(); - let mut num_computes_rx = walreceivers.get_num_rx(); - let mut tli_broker_active = broker_active_set.guard(tli.clone()); - let mut last_removed_segno = 0 as XLogSegNo; - - // list of background tasks - let mut backup_task: Option = None; - let mut recovery_task: Option> = None; - let mut partial_backup_task: Option> = None; - let mut wal_removal_task: Option>> = None; + let mut mgr = Manager::new(tli, conf, broker_active_set, manager_tx).await; // Start recovery task which always runs on the timeline. - if conf.peer_recovery_enabled { - match tli.full_access_guard().await { - Ok(tli) => { - recovery_task = Some(tokio::spawn(recovery_main(tli, conf.clone()))); - } - Err(e) => { - warn!("failed to start recovery task: {:?}", e); - } - } - } - - // Start partial backup task which always runs on the timeline. 
- if conf.is_wal_backup_enabled() && conf.partial_backup_enabled { - match tli.full_access_guard().await { - Ok(tli) => { - partial_backup_task = Some(tokio::spawn(wal_backup_partial::main_task( - tli, - conf.clone(), - ))); - } - Err(e) => { - warn!("failed to start partial backup task: {:?}", e); - } - } + if !mgr.is_offloaded && mgr.conf.peer_recovery_enabled { + let tli = mgr.wal_resident_timeline(); + mgr.recovery_task = Some(tokio::spawn(recovery_main(tli, mgr.conf.clone()))); } let last_state = 'outer: loop { MANAGER_ITERATIONS_TOTAL.inc(); - let state_snapshot = StateSnapshot::new(tli.read_shared_state().await, heartbeat_timeout); - let num_computes = *num_computes_rx.borrow(); + mgr.set_status(Status::StateSnapshot); + let state_snapshot = mgr.state_snapshot().await; - let is_wal_backup_required = update_backup( - &conf, - &tli, - wal_seg_size, - num_computes, - &state_snapshot, - &mut backup_task, - ) - .await; + let mut next_event: Option = None; + if !mgr.is_offloaded { + let num_computes = *mgr.num_computes_rx.borrow(); - let _is_active = update_is_active( - is_wal_backup_required, - num_computes, - &state_snapshot, - &mut tli_broker_active, - &tli, - ); + mgr.set_status(Status::UpdateBackup); + let is_wal_backup_required = mgr.update_backup(num_computes, &state_snapshot).await; + mgr.update_is_active(is_wal_backup_required, num_computes, &state_snapshot); - let next_cfile_save = update_control_file_save(&state_snapshot, &tli).await; + mgr.set_status(Status::UpdateControlFile); + mgr.update_control_file_save(&state_snapshot, &mut next_event) + .await; - update_wal_removal( - &conf, - walsenders, - &tli, - wal_seg_size, - &state_snapshot, - last_removed_segno, - &mut wal_removal_task, - ) - .await; + mgr.set_status(Status::UpdateWalRemoval); + mgr.update_wal_removal(&state_snapshot).await; + mgr.set_status(Status::UpdatePartialBackup); + mgr.update_partial_backup(&state_snapshot).await; + + if mgr.conf.enable_offload && mgr.ready_for_eviction(&next_event, &state_snapshot) { + mgr.set_status(Status::EvictTimeline); + mgr.evict_timeline().await; + } + } + + mgr.set_status(Status::Wait); // wait until something changes. tx channels are stored under Arc, so they will not be // dropped until the manager task is finished. tokio::select! { - _ = tli.cancel.cancelled() => { + _ = mgr.tli.cancel.cancelled() => { // timeline was deleted break 'outer state_snapshot; } _ = async { // don't wake up on every state change, but at most every REFRESH_INTERVAL tokio::time::sleep(REFRESH_INTERVAL).await; - let _ = state_version_rx.changed().await; + let _ = mgr.state_version_rx.changed().await; } => { // state was updated } - _ = num_computes_rx.changed() => { + _ = mgr.num_computes_rx.changed() => { // number of connected computes was updated } - _ = async { - if let Some(timeout) = next_cfile_save { - tokio::time::sleep_until(timeout).await - } else { - futures::future::pending().await - } - } => { - // it's time to save the control file + _ = sleep_until(&next_event) => { + // we were waiting for some event (e.g. 
cfile save) } - res = async { - if let Some(task) = &mut wal_removal_task { - task.await - } else { - futures::future::pending().await - } - } => { + res = await_task_finish(&mut mgr.wal_removal_task) => { // WAL removal task finished - wal_removal_task = None; - update_wal_removal_end(res, &tli, &mut last_removed_segno); + mgr.wal_removal_task = None; + mgr.update_wal_removal_end(res); + } + res = await_task_finish(&mut mgr.partial_backup_task) => { + // partial backup task finished + mgr.partial_backup_task = None; + mgr.update_partial_backup_end(res); + } + + msg = manager_rx.recv() => { + mgr.set_status(Status::HandleMessage); + mgr.handle_message(msg).await; } } }; + mgr.set_status(Status::Exiting); // remove timeline from the broker active set sooner, before waiting for background tasks - tli_broker_active.set(false); + mgr.tli_broker_active.set(false); // shutdown background tasks - if conf.is_wal_backup_enabled() { - wal_backup::update_task(&conf, &tli, false, &last_state, &mut backup_task).await; + if mgr.conf.is_wal_backup_enabled() { + wal_backup::update_task(&mut mgr, false, &last_state).await; } - if let Some(recovery_task) = recovery_task { + if let Some(recovery_task) = &mut mgr.recovery_task { if let Err(e) = recovery_task.await { warn!("recovery task failed: {:?}", e); } } - if let Some(partial_backup_task) = partial_backup_task { + if let Some(partial_backup_task) = &mut mgr.partial_backup_task { if let Err(e) = partial_backup_task.await { warn!("partial backup task failed: {:?}", e); } } - if let Some(wal_removal_task) = wal_removal_task { + if let Some(wal_removal_task) = &mut mgr.wal_removal_task { let res = wal_removal_task.await; - update_wal_removal_end(res, &tli, &mut last_removed_segno); + mgr.update_wal_removal_end(res); } + + mgr.set_status(Status::Finished); } -/// Spawns/kills backup task and returns true if backup is required. -async fn update_backup( - conf: &SafeKeeperConf, - tli: &Arc, - wal_seg_size: usize, - num_computes: usize, - state: &StateSnapshot, - backup_task: &mut Option, -) -> bool { - let is_wal_backup_required = - wal_backup::is_wal_backup_required(wal_seg_size, num_computes, state); - - if conf.is_wal_backup_enabled() { - wal_backup::update_task(conf, tli, is_wal_backup_required, state, backup_task).await; +impl Manager { + async fn new( + tli: ManagerTimeline, + conf: SafeKeeperConf, + broker_active_set: Arc, + manager_tx: tokio::sync::mpsc::UnboundedSender, + ) -> Manager { + let (is_offloaded, partial_backup_uploaded) = tli.bootstrap_mgr().await; + Manager { + conf, + wal_seg_size: tli.get_wal_seg_size().await, + walsenders: tli.get_walsenders().clone(), + state_version_rx: tli.get_state_version_rx(), + num_computes_rx: tli.get_walreceivers().get_num_rx(), + tli_broker_active: broker_active_set.guard(tli.clone()), + last_removed_segno: 0, + is_offloaded, + backup_task: None, + recovery_task: None, + wal_removal_task: None, + partial_backup_task: None, + partial_backup_uploaded, + access_service: AccessService::new(manager_tx), + tli, + } } - // update the state in Arc - tli.wal_backup_active - .store(backup_task.is_some(), std::sync::atomic::Ordering::Relaxed); - is_wal_backup_required -} - -/// Update is_active flag and returns its value. 
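The select! loop above relies on `sleep_until(&Option<Instant>)` pending forever when nothing is scheduled, so the other arms keep driving the loop. A tiny self-contained sketch of that pattern; the channel contents and the 20 ms deadline are made up, and `std::future::pending` is used where the patch's helper (defined later in this file) uses `futures::future::pending`.

```rust
// Sketch only: the "optional deadline" select pattern used in the manager loop.
use tokio::sync::mpsc;
use tokio::time::{Duration, Instant};

async fn sleep_until(deadline: &Option<Instant>) {
    match deadline {
        Some(t) => tokio::time::sleep_until(*t).await,
        None => std::future::pending::<()>().await, // nothing scheduled: never wake this arm
    }
}

#[tokio::main]
async fn main() {
    let (tx, mut rx) = mpsc::unbounded_channel::<&'static str>();
    // pretend the control file needs flushing 20 ms from now
    let next_event = Some(Instant::now() + Duration::from_millis(20));
    tx.send("guard dropped").unwrap();

    loop {
        tokio::select! {
            _ = sleep_until(&next_event) => {
                println!("scheduled event fired (e.g. control file save)");
                break;
            }
            msg = rx.recv() => {
                // rx.recv() yields None only once every sender is dropped
                println!("control message: {:?}", msg);
            }
        }
    }
}
```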
-fn update_is_active( - is_wal_backup_required: bool, - num_computes: usize, - state: &StateSnapshot, - tli_broker_active: &mut TimelineSetGuard, - tli: &Arc, -) -> bool { - let is_active = is_wal_backup_required - || num_computes > 0 - || state.remote_consistent_lsn < state.commit_lsn; - - // update the broker timeline set - if tli_broker_active.set(is_active) { - // write log if state has changed - info!( - "timeline active={} now, remote_consistent_lsn={}, commit_lsn={}", - is_active, state.remote_consistent_lsn, state.commit_lsn, - ); - - MANAGER_ACTIVE_CHANGES.inc(); + fn set_status(&self, status: Status) { + self.tli.set_status(status); } - // update the state in Arc - tli.broker_active - .store(is_active, std::sync::atomic::Ordering::Relaxed); - is_active -} - -/// Save control file if needed. Returns Instant if we should persist the control file in the future. -async fn update_control_file_save( - state: &StateSnapshot, - tli: &Arc, -) -> Option { - if !state.inmem_flush_pending { - return None; + /// Get a WalResidentTimeline. + /// Manager code must use this function instead of one from `Timeline` + /// directly, because it will deadlock. + pub(crate) fn wal_resident_timeline(&mut self) -> WalResidentTimeline { + assert!(!self.is_offloaded); + let guard = self.access_service.create_guard(); + WalResidentTimeline::new(self.tli.clone(), guard) } - if state.cfile_last_persist_at.elapsed() > CF_SAVE_INTERVAL { - let mut write_guard = tli.write_shared_state().await; - // this can be done in the background because it blocks manager task, but flush() should - // be fast enough not to be a problem now - if let Err(e) = write_guard.sk.state.flush().await { - warn!("failed to save control file: {:?}", e); + /// Get a snapshot of the timeline state. + async fn state_snapshot(&self) -> StateSnapshot { + StateSnapshot::new( + self.tli.read_shared_state().await, + self.conf.heartbeat_timeout, + ) + } + + /// Spawns/kills backup task and returns true if backup is required. + async fn update_backup(&mut self, num_computes: usize, state: &StateSnapshot) -> bool { + let is_wal_backup_required = + wal_backup::is_wal_backup_required(self.wal_seg_size, num_computes, state); + + if self.conf.is_wal_backup_enabled() { + wal_backup::update_task(self, is_wal_backup_required, state).await; } - None - } else { - // we should wait until next CF_SAVE_INTERVAL - Some((state.cfile_last_persist_at + CF_SAVE_INTERVAL).into()) - } -} - -/// Spawns WAL removal task if needed. -async fn update_wal_removal( - conf: &SafeKeeperConf, - walsenders: &Arc, - tli: &Arc, - wal_seg_size: usize, - state: &StateSnapshot, - last_removed_segno: u64, - wal_removal_task: &mut Option>>, -) { - if wal_removal_task.is_some() || state.wal_removal_on_hold { - // WAL removal is already in progress or hold off - return; - } - - // If enabled, we use LSN of the most lagging walsender as a WAL removal horizon. - // This allows to get better read speed for pageservers that are lagging behind, - // at the cost of keeping more WAL on disk. 
- let replication_horizon_lsn = if conf.walsenders_keep_horizon { - walsenders.laggard_lsn() - } else { - None - }; - - let removal_horizon_lsn = calc_horizon_lsn(state, replication_horizon_lsn); - let removal_horizon_segno = removal_horizon_lsn - .segment_number(wal_seg_size) - .saturating_sub(1); - - if removal_horizon_segno > last_removed_segno { - // we need to remove WAL - let remover = crate::wal_storage::Storage::remove_up_to( - &tli.read_shared_state().await.sk.wal_store, - removal_horizon_segno, + // update the state in Arc + self.tli.wal_backup_active.store( + self.backup_task.is_some(), + std::sync::atomic::Ordering::Relaxed, ); - *wal_removal_task = Some(tokio::spawn( - async move { - remover.await?; - Ok(removal_horizon_segno) + is_wal_backup_required + } + + /// Update is_active flag and returns its value. + fn update_is_active( + &mut self, + is_wal_backup_required: bool, + num_computes: usize, + state: &StateSnapshot, + ) { + let is_active = is_wal_backup_required + || num_computes > 0 + || state.remote_consistent_lsn < state.commit_lsn; + + // update the broker timeline set + if self.tli_broker_active.set(is_active) { + // write log if state has changed + info!( + "timeline active={} now, remote_consistent_lsn={}, commit_lsn={}", + is_active, state.remote_consistent_lsn, state.commit_lsn, + ); + + MANAGER_ACTIVE_CHANGES.inc(); + } + + // update the state in Arc + self.tli + .broker_active + .store(is_active, std::sync::atomic::Ordering::Relaxed); + } + + /// Save control file if needed. Returns Instant if we should persist the control file in the future. + async fn update_control_file_save( + &self, + state: &StateSnapshot, + next_event: &mut Option, + ) { + if !state.inmem_flush_pending { + return; + } + + if state.cfile_last_persist_at.elapsed() > self.conf.control_file_save_interval { + let mut write_guard = self.tli.write_shared_state().await; + // it should be done in the background because it blocks manager task, but flush() should + // be fast enough not to be a problem now + if let Err(e) = write_guard.sk.state_mut().flush().await { + warn!("failed to save control file: {:?}", e); } - .instrument(info_span!("WAL removal", ttid=%tli.ttid)), - )); + } else { + // we should wait until some time passed until the next save + update_next_event( + next_event, + (state.cfile_last_persist_at + self.conf.control_file_save_interval).into(), + ); + } + } + + /// Spawns WAL removal task if needed. + async fn update_wal_removal(&mut self, state: &StateSnapshot) { + if self.wal_removal_task.is_some() || state.wal_removal_on_hold { + // WAL removal is already in progress or hold off + return; + } + + // If enabled, we use LSN of the most lagging walsender as a WAL removal horizon. + // This allows to get better read speed for pageservers that are lagging behind, + // at the cost of keeping more WAL on disk. 
+ let replication_horizon_lsn = if self.conf.walsenders_keep_horizon { + self.walsenders.laggard_lsn() + } else { + None + }; + + let removal_horizon_lsn = calc_horizon_lsn(state, replication_horizon_lsn); + let removal_horizon_segno = removal_horizon_lsn + .segment_number(self.wal_seg_size) + .saturating_sub(1); + + if removal_horizon_segno > self.last_removed_segno { + // we need to remove WAL + let remover = match self.tli.read_shared_state().await.sk { + StateSK::Loaded(ref sk) => { + crate::wal_storage::Storage::remove_up_to(&sk.wal_store, removal_horizon_segno) + } + StateSK::Offloaded(_) => { + // we can't remove WAL if it's not loaded + warn!("unexpectedly trying to run WAL removal on offloaded timeline"); + return; + } + StateSK::Empty => unreachable!(), + }; + + self.wal_removal_task = Some(tokio::spawn( + async move { + remover.await?; + Ok(removal_horizon_segno) + } + .instrument(info_span!("WAL removal", ttid=%self.tli.ttid)), + )); + } + } + + /// Update the state after WAL removal task finished. + fn update_wal_removal_end(&mut self, res: Result, JoinError>) { + let new_last_removed_segno = match res { + Ok(Ok(segno)) => segno, + Err(e) => { + warn!("WAL removal task failed: {:?}", e); + return; + } + Ok(Err(e)) => { + warn!("WAL removal task failed: {:?}", e); + return; + } + }; + + self.last_removed_segno = new_last_removed_segno; + // update the state in Arc + self.tli + .last_removed_segno + .store(new_last_removed_segno, std::sync::atomic::Ordering::Relaxed); + } + + /// Spawns partial WAL backup task if needed. + async fn update_partial_backup(&mut self, state: &StateSnapshot) { + // check if partial backup is enabled and should be started + if !self.conf.is_wal_backup_enabled() || !self.conf.partial_backup_enabled { + return; + } + + if self.partial_backup_task.is_some() { + // partial backup is already running + return; + } + + if !wal_backup_partial::needs_uploading(state, &self.partial_backup_uploaded) { + // nothing to upload + return; + } + + // Get WalResidentTimeline and start partial backup task. + self.partial_backup_task = Some(tokio::spawn(wal_backup_partial::main_task( + self.wal_resident_timeline(), + self.conf.clone(), + ))); + } + + /// Update the state after partial WAL backup task finished. + fn update_partial_backup_end(&mut self, res: Result, JoinError>) { + match res { + Ok(new_upload_state) => { + self.partial_backup_uploaded = new_upload_state; + } + Err(e) => { + warn!("partial backup task panicked: {:?}", e); + } + } + } + + /// Handle message arrived from ManagerCtl. + async fn handle_message(&mut self, msg: Option) { + debug!("received manager message: {:?}", msg); + match msg { + Some(ManagerCtlMessage::GuardRequest(tx)) => { + if self.is_offloaded { + // trying to unevict timeline, but without gurarantee that it will be successful + self.unevict_timeline().await; + } + + let guard = if self.is_offloaded { + Err(anyhow::anyhow!("timeline is offloaded, can't get a guard")) + } else { + Ok(self.access_service.create_guard()) + }; + + if tx.send(guard).is_err() { + warn!("failed to reply with a guard, receiver dropped"); + } + } + Some(ManagerCtlMessage::GuardDrop(guard_id)) => { + self.access_service.drop_guard(guard_id); + } + None => { + // can't happen, we're holding the sender + unreachable!(); + } + } } } -/// Update the state after WAL removal task finished. 
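The removal horizon computed in `update_wal_removal` is plain segment arithmetic: an LSN divided by the WAL segment size gives a segment number, and the `saturating_sub(1)` presumably keeps the segment that still contains the horizon LSN. A toy illustration with made-up numbers and the standard 16 MiB segment size; `segment_number` here is a stand-in for the `Lsn` method of the same name.

```rust
// Sketch only: the segment arithmetic behind the removal horizon.
fn segment_number(lsn: u64, wal_seg_size: u64) -> u64 {
    lsn / wal_seg_size
}

fn main() {
    let wal_seg_size: u64 = 16 * 1024 * 1024; // default 16 MiB WAL segments
    let horizon_lsn: u64 = 0x0400_0123; // somewhere inside segment 4
    let last_removed_segno: u64 = 1;

    let removal_horizon_segno = segment_number(horizon_lsn, wal_seg_size).saturating_sub(1);
    assert_eq!(removal_horizon_segno, 3);

    // The removal task would be spawned only if this moves the horizon forward.
    if removal_horizon_segno > last_removed_segno {
        println!("remove segments up to and including segno {removal_horizon_segno}");
    }
}
```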
-fn update_wal_removal_end( - res: Result, JoinError>, - tli: &Arc, - last_removed_segno: &mut u64, -) { - let new_last_removed_segno = match res { - Ok(Ok(segno)) => segno, - Err(e) => { - warn!("WAL removal task failed: {:?}", e); - return; - } - Ok(Err(e)) => { - warn!("WAL removal task failed: {:?}", e); - return; - } - }; - - *last_removed_segno = new_last_removed_segno; - // update the state in Arc - tli.last_removed_segno - .store(new_last_removed_segno, std::sync::atomic::Ordering::Relaxed); +// utility functions +async fn sleep_until(option: &Option) { + if let Some(timeout) = option { + tokio::time::sleep_until(*timeout).await; + } else { + futures::future::pending::<()>().await; + } +} + +async fn await_task_finish(option: &mut Option>) -> Result { + if let Some(task) = option { + task.await + } else { + futures::future::pending().await + } +} + +/// Update next_event if candidate is earlier. +fn update_next_event(next_event: &mut Option, candidate: Instant) { + if let Some(next) = next_event { + if candidate < *next { + *next = candidate; + } + } else { + *next_event = Some(candidate); + } +} + +#[repr(usize)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub enum Status { + NotStarted, + Started, + StateSnapshot, + UpdateBackup, + UpdateControlFile, + UpdateWalRemoval, + UpdatePartialBackup, + EvictTimeline, + Wait, + HandleMessage, + Exiting, + Finished, +} + +/// AtomicStatus is a wrapper around AtomicUsize adapted for the Status enum. +pub struct AtomicStatus { + inner: AtomicUsize, +} + +impl Default for AtomicStatus { + fn default() -> Self { + Self::new() + } +} + +impl AtomicStatus { + pub fn new() -> Self { + AtomicStatus { + inner: AtomicUsize::new(Status::NotStarted as usize), + } + } + + pub fn load(&self, order: std::sync::atomic::Ordering) -> Status { + // Safety: This line of code uses `std::mem::transmute` to reinterpret the loaded value as `Status`. + // It is safe to use `transmute` in this context because `Status` is a repr(usize) enum, + // which means it has the same memory layout as usize. + // However, it is important to ensure that the loaded value is a valid variant of `Status`, + // otherwise, the behavior will be undefined. 
+ unsafe { std::mem::transmute(self.inner.load(order)) } + } + + pub fn get(&self) -> Status { + self.load(std::sync::atomic::Ordering::Relaxed) + } + + pub fn store(&self, val: Status, order: std::sync::atomic::Ordering) { + self.inner.store(val as usize, order); + } } diff --git a/safekeeper/src/timelines_set.rs b/safekeeper/src/timelines_set.rs index ea8e23bb72..d6eea79f82 100644 --- a/safekeeper/src/timelines_set.rs +++ b/safekeeper/src/timelines_set.rs @@ -80,6 +80,10 @@ impl TimelineSetGuard { self.timelines_set.set_present(self.tli.clone(), present); true } + + pub fn get(&self) -> bool { + self.is_present + } } impl Drop for TimelineSetGuard { diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 58591aecfa..9ea048a3c7 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -12,7 +12,6 @@ use std::cmp::min; use std::collections::HashSet; use std::num::NonZeroU32; use std::pin::Pin; -use std::sync::Arc; use std::time::Duration; use postgres_ffi::v14::xlog_utils::XLogSegNoOffsetToRecPtr; @@ -30,8 +29,8 @@ use tracing::*; use utils::{id::TenantTimelineId, lsn::Lsn}; use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS, WAL_BACKUP_TASKS}; -use crate::timeline::{FullAccessTimeline, PeerInfo, Timeline}; -use crate::timeline_manager::StateSnapshot; +use crate::timeline::{PeerInfo, WalResidentTimeline}; +use crate::timeline_manager::{Manager, StateSnapshot}; use crate::{SafeKeeperConf, WAL_BACKUP_RUNTIME}; use once_cell::sync::OnceCell; @@ -48,7 +47,7 @@ pub struct WalBackupTaskHandle { } /// Do we have anything to upload to S3, i.e. should safekeepers run backup activity? -pub fn is_wal_backup_required( +pub(crate) fn is_wal_backup_required( wal_seg_size: usize, num_computes: usize, state: &StateSnapshot, @@ -61,35 +60,33 @@ pub fn is_wal_backup_required( /// Based on peer information determine which safekeeper should offload; if it /// is me, run (per timeline) task, if not yet. OTOH, if it is not me and task /// is running, kill it. 
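`AtomicStatus` above stores a `repr(usize)` enum in an `AtomicUsize` and decodes it with `transmute`, justified by the safety comment. For comparison, here is a sketch of the same cell decoded with an explicit match instead: a little more boilerplate, no `unsafe`. The variant list is a shortened stand-in for the real `Status` enum, and the orderings mirror the relaxed ones used above.

```rust
// Sketch only: an AtomicUsize-backed status cell without the transmute.
use std::sync::atomic::{AtomicUsize, Ordering};

#[repr(usize)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Status {
    NotStarted = 0,
    Started = 1,
    Wait = 2,
    Exiting = 3,
}

struct AtomicStatus(AtomicUsize);

impl AtomicStatus {
    fn new() -> Self {
        AtomicStatus(AtomicUsize::new(Status::NotStarted as usize))
    }
    fn store(&self, val: Status) {
        self.0.store(val as usize, Ordering::Relaxed);
    }
    fn load(&self) -> Status {
        match self.0.load(Ordering::Relaxed) {
            0 => Status::NotStarted,
            1 => Status::Started,
            2 => Status::Wait,
            3 => Status::Exiting,
            other => unreachable!("invalid status discriminant {other}"),
        }
    }
}

fn main() {
    let status = AtomicStatus::new();
    status.store(Status::Wait);
    assert_eq!(status.load(), Status::Wait);
    println!("{:?}", status.load());
}
```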
-pub async fn update_task( - conf: &SafeKeeperConf, - tli: &Arc, - need_backup: bool, - state: &StateSnapshot, - entry: &mut Option, -) { +pub(crate) async fn update_task(mgr: &mut Manager, need_backup: bool, state: &StateSnapshot) { let (offloader, election_dbg_str) = - determine_offloader(&state.peers, state.backup_lsn, tli.ttid, conf); - let elected_me = Some(conf.my_id) == offloader; + determine_offloader(&state.peers, state.backup_lsn, mgr.tli.ttid, &mgr.conf); + let elected_me = Some(mgr.conf.my_id) == offloader; let should_task_run = need_backup && elected_me; // start or stop the task - if should_task_run != (entry.is_some()) { + if should_task_run != (mgr.backup_task.is_some()) { if should_task_run { info!("elected for backup: {}", election_dbg_str); let (shutdown_tx, shutdown_rx) = mpsc::channel(1); - let async_task = backup_task_main(tli.clone(), conf.backup_parallel_jobs, shutdown_rx); + let async_task = backup_task_main( + mgr.wal_resident_timeline(), + mgr.conf.backup_parallel_jobs, + shutdown_rx, + ); - let handle = if conf.current_thread_runtime { + let handle = if mgr.conf.current_thread_runtime { tokio::spawn(async_task) } else { WAL_BACKUP_RUNTIME.spawn(async_task) }; - *entry = Some(WalBackupTaskHandle { + mgr.backup_task = Some(WalBackupTaskHandle { shutdown_tx, handle, }); @@ -101,7 +98,7 @@ pub async fn update_task( // someone else has been elected info!("stepping down from backup: {}", election_dbg_str); } - shut_down_task(entry).await; + shut_down_task(&mut mgr.backup_task).await; } } } @@ -191,7 +188,7 @@ pub fn init_remote_storage(conf: &SafeKeeperConf) { } struct WalBackupTask { - timeline: FullAccessTimeline, + timeline: WalResidentTimeline, timeline_dir: Utf8PathBuf, wal_seg_size: usize, parallel_jobs: usize, @@ -200,16 +197,12 @@ struct WalBackupTask { /// Offload single timeline. #[instrument(name = "WAL backup", skip_all, fields(ttid = %tli.ttid))] -async fn backup_task_main(tli: Arc, parallel_jobs: usize, mut shutdown_rx: Receiver<()>) { +async fn backup_task_main( + tli: WalResidentTimeline, + parallel_jobs: usize, + mut shutdown_rx: Receiver<()>, +) { let _guard = WAL_BACKUP_TASKS.guard(); - - let tli = match tli.full_access_guard().await { - Ok(tli) => tli, - Err(e) => { - error!("backup error: {}", e); - return; - } - }; info!("started"); let mut wb = WalBackupTask { @@ -304,7 +297,7 @@ impl WalBackupTask { } async fn backup_lsn_range( - timeline: &FullAccessTimeline, + timeline: &WalResidentTimeline, backup_lsn: &mut Lsn, end_lsn: Lsn, wal_seg_size: usize, diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index ed5ddb71f5..9c7cd0888d 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -29,18 +29,22 @@ use utils::lsn::Lsn; use crate::{ metrics::{PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS}, safekeeper::Term, - timeline::FullAccessTimeline, + timeline::WalResidentTimeline, + timeline_manager::StateSnapshot, wal_backup::{self, remote_timeline_path}, SafeKeeperConf, }; #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub enum UploadStatus { - /// Upload is in progress + /// Upload is in progress. This status should be used only for garbage collection, + /// don't read data from the remote storage with this status. InProgress, - /// Upload is finished + /// Upload is finished. There is always at most one segment with this status. + /// It means that the segment is actual and can be used. 
Uploaded, - /// Deletion is in progress + /// Deletion is in progress. This status should be used only for garbage collection, + /// don't read data from the remote storage with this status. Deleting, } @@ -50,6 +54,10 @@ pub struct PartialRemoteSegment { pub name: String, pub commit_lsn: Lsn, pub flush_lsn: Lsn, + // We should use last_log_term here, otherwise it's possible to have inconsistent data in the + // remote storage. + // + // More info here: https://github.com/neondatabase/neon/pull/8022#discussion_r1654738405 pub term: Term, } @@ -60,6 +68,10 @@ impl PartialRemoteSegment { && self.flush_lsn == other.flush_lsn && self.term == other.term } + + pub(crate) fn remote_path(&self, remote_timeline_path: &RemotePath) -> RemotePath { + remote_timeline_path.join(&self.name) + } } // NB: these structures are a part of a control_file, you can't change them without @@ -71,7 +83,7 @@ pub struct State { impl State { /// Find an Uploaded segment. There should be only one Uploaded segment at a time. - fn uploaded_segment(&self) -> Option { + pub(crate) fn uploaded_segment(&self) -> Option { self.segments .iter() .find(|seg| seg.status == UploadStatus::Uploaded) @@ -81,7 +93,7 @@ impl State { struct PartialBackup { wal_seg_size: usize, - tli: FullAccessTimeline, + tli: WalResidentTimeline, conf: SafeKeeperConf, local_prefix: Utf8PathBuf, remote_timeline_path: RemotePath, @@ -128,17 +140,17 @@ impl PartialBackup { let sk_info = self.tli.get_safekeeper_info(&self.conf).await; let flush_lsn = Lsn(sk_info.flush_lsn); let commit_lsn = Lsn(sk_info.commit_lsn); - let term = sk_info.term; + let last_log_term = sk_info.last_log_term; let segno = self.segno(flush_lsn); - let name = self.remote_segment_name(segno, term, commit_lsn, flush_lsn); + let name = self.remote_segment_name(segno, last_log_term, commit_lsn, flush_lsn); PartialRemoteSegment { status: UploadStatus::InProgress, name, commit_lsn, flush_lsn, - term, + term: last_log_term, } } @@ -151,7 +163,7 @@ impl PartialBackup { let backup_bytes = flush_lsn.segment_offset(self.wal_seg_size); let local_path = self.local_prefix.join(self.local_segment_name(segno)); - let remote_path = self.remote_timeline_path.join(&prepared.name); + let remote_path = prepared.remote_path(&self.remote_timeline_path); // Upload first `backup_bytes` bytes of the segment to the remote storage. wal_backup::backup_partial_segment(&local_path, &remote_path, backup_bytes).await?; @@ -161,7 +173,7 @@ impl PartialBackup { // If the term changed, we cannot guarantee the validity of the uploaded data. // If the term is the same, we know the data is not corrupted. let sk_info = self.tli.get_safekeeper_info(&self.conf).await; - if sk_info.term != prepared.term { + if sk_info.last_log_term != prepared.term { anyhow::bail!("term changed during upload"); } assert!(prepared.commit_lsn <= Lsn(sk_info.commit_lsn)); @@ -270,8 +282,32 @@ impl PartialBackup { } } +/// Check if everything is uploaded and partial backup task doesn't need to run. +pub(crate) fn needs_uploading( + state: &StateSnapshot, + uploaded: &Option, +) -> bool { + match uploaded { + Some(uploaded) => { + uploaded.status != UploadStatus::Uploaded + || uploaded.flush_lsn != state.flush_lsn + || uploaded.commit_lsn != state.commit_lsn + || uploaded.term != state.last_log_term + } + None => true, + } +} + +/// Main task for partial backup. It waits for the flush_lsn to change and then uploads the +/// partial segment to the remote storage. It also does garbage collection of old segments. 
+/// +/// When there is nothing more to do and the last segment was successfully uploaded, the task +/// returns PartialRemoteSegment, to signal readiness for offloading the timeline. #[instrument(name = "Partial backup", skip_all, fields(ttid = %tli.ttid))] -pub async fn main_task(tli: FullAccessTimeline, conf: SafeKeeperConf) { +pub async fn main_task( + tli: WalResidentTimeline, + conf: SafeKeeperConf, +) -> Option { debug!("started"); let await_duration = conf.partial_backup_timeout; @@ -285,7 +321,7 @@ pub async fn main_task(tli: FullAccessTimeline, conf: SafeKeeperConf) { Ok(path) => path, Err(e) => { error!("failed to create remote path: {:?}", e); - return; + return None; } }; @@ -320,19 +356,13 @@ pub async fn main_task(tli: FullAccessTimeline, conf: SafeKeeperConf) { // wait until we have something to upload let uploaded_segment = backup.state.uploaded_segment(); if let Some(seg) = &uploaded_segment { - // if we already uploaded something, wait until we have something new - while flush_lsn_rx.borrow().lsn == seg.flush_lsn + // check if uploaded segment matches the current state + if flush_lsn_rx.borrow().lsn == seg.flush_lsn && *commit_lsn_rx.borrow() == seg.commit_lsn && flush_lsn_rx.borrow().term == seg.term { - tokio::select! { - _ = backup.tli.cancel.cancelled() => { - info!("timeline canceled"); - return; - } - _ = commit_lsn_rx.changed() => {} - _ = flush_lsn_rx.changed() => {} - } + // we have nothing to do, the last segment is already uploaded + return Some(seg.clone()); } } @@ -341,7 +371,7 @@ pub async fn main_task(tli: FullAccessTimeline, conf: SafeKeeperConf) { tokio::select! { _ = backup.tli.cancel.cancelled() => { info!("timeline canceled"); - return; + return None; } _ = flush_lsn_rx.changed() => {} } @@ -358,7 +388,7 @@ pub async fn main_task(tli: FullAccessTimeline, conf: SafeKeeperConf) { tokio::select! { _ = backup.tli.cancel.cancelled() => { info!("timeline canceled"); - return; + return None; } _ = commit_lsn_rx.changed() => {} _ = flush_lsn_rx.changed() => { diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 2aead70ffd..74c4693ccd 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -211,7 +211,7 @@ impl PhysicalStorage { /// Returns `file` and `is_partial`. 
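The eviction decision ultimately hinges on the `needs_uploading` predicate added to wal_backup_partial.rs above. A sketch of its shape over stand-in types; `Snapshot`, `Uploaded` and the plain `u64` LSN/term fields are simplifications, not the patch's types.

```rust
// Sketch only: the partial-backup upload is skipped only when the last uploaded
// segment is in the Uploaded state and still matches the current flush/commit
// LSNs and the last log term.
#[allow(dead_code)]
#[derive(PartialEq)]
enum UploadStatus {
    InProgress,
    Uploaded,
    Deleting,
}

struct Uploaded {
    status: UploadStatus,
    flush_lsn: u64,
    commit_lsn: u64,
    term: u64,
}

struct Snapshot {
    flush_lsn: u64,
    commit_lsn: u64,
    last_log_term: u64,
}

fn needs_uploading(state: &Snapshot, uploaded: &Option<Uploaded>) -> bool {
    match uploaded {
        Some(u) => {
            u.status != UploadStatus::Uploaded
                || u.flush_lsn != state.flush_lsn
                || u.commit_lsn != state.commit_lsn
                || u.term != state.last_log_term
        }
        None => true, // never uploaded anything for this timeline yet
    }
}

fn main() {
    let state = Snapshot { flush_lsn: 200, commit_lsn: 180, last_log_term: 3 };
    let uploaded = Some(Uploaded {
        status: UploadStatus::Uploaded,
        flush_lsn: 200,
        commit_lsn: 180,
        term: 3,
    });
    assert!(!needs_uploading(&state, &uploaded)); // everything matches: nothing to do
    println!("partial backup is up to date");
}
```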
async fn open_or_create(&mut self, segno: XLogSegNo) -> Result<(File, bool)> { let (wal_file_path, wal_file_partial_path) = - wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?; + wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size); // Try to open already completed segment if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_path).await { @@ -276,7 +276,7 @@ impl PhysicalStorage { // Rename partial file to completed file let (wal_file_path, wal_file_partial_path) = - wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?; + wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size); fs::rename(wal_file_partial_path, wal_file_path).await?; } else { // otherwise, file can be reused later @@ -461,7 +461,7 @@ impl Storage for PhysicalStorage { if !is_partial { // Make segment partial once again let (wal_file_path, wal_file_partial_path) = - wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?; + wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size); fs::rename(wal_file_path, wal_file_partial_path).await?; } @@ -741,7 +741,7 @@ pub(crate) async fn open_wal_file( segno: XLogSegNo, wal_seg_size: usize, ) -> Result<(tokio::fs::File, bool)> { - let (wal_file_path, wal_file_partial_path) = wal_file_paths(timeline_dir, segno, wal_seg_size)?; + let (wal_file_path, wal_file_partial_path) = wal_file_paths(timeline_dir, segno, wal_seg_size); // First try to open the .partial file. let mut partial_path = wal_file_path.to_owned(); @@ -767,9 +767,9 @@ pub fn wal_file_paths( timeline_dir: &Utf8Path, segno: XLogSegNo, wal_seg_size: usize, -) -> Result<(Utf8PathBuf, Utf8PathBuf)> { +) -> (Utf8PathBuf, Utf8PathBuf) { let wal_file_name = XLogFileName(PG_TLI, segno, wal_seg_size); let wal_file_path = timeline_dir.join(wal_file_name.clone()); let wal_file_partial_path = timeline_dir.join(wal_file_name + ".partial"); - Ok((wal_file_path, wal_file_partial_path)) + (wal_file_path, wal_file_partial_path) } diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index 9c81d2eb4d..43835c7f44 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -16,7 +16,7 @@ use desim::{ use hyper::Uri; use safekeeper::{ safekeeper::{ProposerAcceptorMessage, SafeKeeper, ServerInfo, UNKNOWN_SERVER_VERSION}, - state::TimelinePersistentState, + state::{TimelinePersistentState, TimelineState}, timeline::TimelineError, wal_storage::Storage, SafeKeeperConf, @@ -68,7 +68,7 @@ impl GlobalMap { let control_store = DiskStateStorage::new(disk.clone()); let wal_store = DiskWALStorage::new(disk.clone(), &control_store)?; - let sk = SafeKeeper::new(control_store, wal_store, conf.my_id)?; + let sk = SafeKeeper::new(TimelineState::new(control_store), wal_store, conf.my_id)?; timelines.insert( ttid, SharedState { @@ -118,7 +118,11 @@ impl GlobalMap { let control_store = DiskStateStorage::new(disk_timeline.clone()); let wal_store = DiskWALStorage::new(disk_timeline.clone(), &control_store)?; - let sk = SafeKeeper::new(control_store, wal_store, self.conf.my_id)?; + let sk = SafeKeeper::new( + TimelineState::new(control_store), + wal_store, + self.conf.my_id, + )?; self.timelines.insert( ttid, @@ -180,6 +184,9 @@ pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { partial_backup_enabled: false, partial_backup_timeout: Duration::from_secs(0), disable_periodic_broker_push: false, + enable_offload: false, + delete_offloaded_wal: false, + control_file_save_interval: 
Duration::from_secs(1), }; let mut global = GlobalMap::new(disk, conf.clone())?; diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 7453637218..6a29df6f13 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3916,6 +3916,8 @@ class Safekeeper(LogUtils): def assert_no_errors(self): assert not self.log_contains("manager task finished prematurely") + assert not self.log_contains("error while acquiring WalResidentTimeline guard") + assert not self.log_contains("timeout while acquiring WalResidentTimeline guard") def append_logical_message( self, tenant_id: TenantId, timeline_id: TimelineId, request: Dict[str, Any] diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 7bf208db54..ac1a3bef67 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1,4 +1,5 @@ import filecmp +import logging import os import random import shutil @@ -2178,3 +2179,102 @@ def test_broker_discovery(neon_env_builder: NeonEnvBuilder): do_something() do_something() + + +# Test creates 5 endpoints and tries to wake them up randomly. All timeouts are +# configured to be very short, so that we expect that: +# - pageserver will update remote_consistent_lsn very often +# - safekeepers will upload partial WAL segments very often +# - safekeeper will try to evict and unevict timelines +# +# Test checks that there are no critical errors while doing this. Also it checks +# that every safekeeper has at least one successful eviction. +@pytest.mark.parametrize("delete_offloaded_wal", [False, True]) +@pytest.mark.parametrize("restart_chance", [0.0, 0.2]) +def test_s3_eviction( + neon_env_builder: NeonEnvBuilder, delete_offloaded_wal: bool, restart_chance: float +): + neon_env_builder.num_safekeepers = 3 + neon_env_builder.enable_safekeeper_remote_storage(RemoteStorageKind.LOCAL_FS) + env = neon_env_builder.init_start( + initial_tenant_conf={ + "checkpoint_timeout": "100ms", + } + ) + + extra_opts = [ + "--enable-offload", + "--partial-backup-timeout", + "50ms", + "--control-file-save-interval", + "1s", + ] + if delete_offloaded_wal: + extra_opts.append("--delete-offloaded-wal") + + for sk in env.safekeepers: + sk.stop().start(extra_opts=extra_opts) + + n_timelines = 5 + + branch_names = [f"branch{tlin}" for tlin in range(n_timelines)] + timelines = [] + ps_client = env.pageservers[0].http_client() + + # start postgres on each timeline + endpoints: list[Endpoint] = [] + for branch_name in branch_names: + timeline_id = env.neon_cli.create_branch(branch_name) + timelines.append(timeline_id) + + endpoints.append(env.endpoints.create_start(branch_name)) + endpoints[-1].safe_psql("CREATE TABLE t(i int)") + endpoints[-1].safe_psql("INSERT INTO t VALUES (0)") + + lsn = endpoints[-1].safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0] + log.info(f"{branch_name}: LSN={lsn}") + + endpoints[-1].stop() + + # update remote_consistent_lsn on pageserver + ps_client.timeline_checkpoint(env.initial_tenant, timelines[-1], wait_until_uploaded=True) + + check_values = [0] * n_timelines + + n_iters = 20 + for _ in range(n_iters): + if log.isEnabledFor(logging.DEBUG): + for j in range(n_timelines): + detail = ps_client.timeline_detail(env.initial_tenant, timelines[j]) + log.debug( + f'{branch_names[j]}: RCL={detail["remote_consistent_lsn"]}, LRL={detail["last_record_lsn"]}' + ) + + i = random.randint(0, n_timelines - 1) + log.info(f"Starting endpoint {i}") + 
endpoints[i].start() + check_values[i] += 1 + res = endpoints[i].safe_psql("UPDATE t SET i = i + 1 RETURNING i") + assert res[0][0] == check_values[i] + + lsn = endpoints[i].safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0] + log.info(f"{branch_names[i]}: LSN={lsn}") + + endpoints[i].stop() + + # update remote_consistent_lsn on pageserver + ps_client.timeline_checkpoint(env.initial_tenant, timelines[i], wait_until_uploaded=True) + + # restarting random safekeepers + for sk in env.safekeepers: + if random.random() < restart_chance: + sk.stop().start(extra_opts=extra_opts) + time.sleep(0.5) + + # require at least one successful eviction in at least one safekeeper + # TODO: require eviction in each safekeeper after https://github.com/neondatabase/neon/issues/8148 is fixed + assert any( + sk.log_contains("successfully evicted timeline") + and sk.log_contains("successfully restored evicted timeline") + for sk in env.safekeepers + ) diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index 971fad787a..3f0a4a2ff8 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -200,9 +200,8 @@ async def run_restarts_under_load( # assert that at least one transaction has completed in every worker stats.check_progress() - # testing #6530, temporary here - # TODO: remove afer partial backup is enabled by default - victim.start(extra_opts=["--partial-backup-enabled", "--partial-backup-timeout=2s"]) + # testing #6530 + victim.start(extra_opts=["--partial-backup-timeout=2s"]) log.info("Iterations are finished, exiting coroutines...") stats.running = False From c39d5b03e81683717bd95c87615d68b0b23e887d Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 26 Jun 2024 20:53:59 +0100 Subject: [PATCH 1061/1571] pageserver: remove legacy tenant config code, clean up redundant generation none/broken usages (#7947) ## Problem In https://github.com/neondatabase/neon/pull/5299, the new config-v1 tenant config file was added to hold the LocationConf type. We left the old config file in place for forward compat, and because running without generations (therefore without LocationConf) as still useful before the storage controller was ready for prime-time. Closes: https://github.com/neondatabase/neon/issues/5388 ## Summary of changes - Remove code for reading and writing the legacy config file - Remove Generation::Broken: it was unused. - Treat missing config file on disk as an error loading a tenant, rather than defaulting it. We can now remove LocationConf::default, and thereby guarantee that we never construct a tenant with a None generation. - Update some comments + add some assertions to clarify that Generation::None is only used in layer metadata, not in the state of a running tenant. 
- Update docker compose test to create tenants with a generation --- docker-compose/README.md | 10 ++ .../compute_wrapper/shell/compute.sh | 7 +- libs/pageserver_api/src/models.rs | 13 +- libs/utils/src/generation.rs | 32 +---- pageserver/src/config.rs | 19 +-- pageserver/src/deletion_queue.rs | 11 -- pageserver/src/http/routes.rs | 9 +- pageserver/src/lib.rs | 6 +- pageserver/src/tenant.rs | 136 +++--------------- pageserver/src/tenant/config.rs | 16 --- pageserver/src/tenant/mgr.rs | 24 +--- .../src/tenant/secondary/heatmap_uploader.rs | 5 +- pageserver/src/tenant/storage_layer/layer.rs | 16 +-- 13 files changed, 67 insertions(+), 237 deletions(-) create mode 100644 docker-compose/README.md diff --git a/docker-compose/README.md b/docker-compose/README.md new file mode 100644 index 0000000000..bd47805a67 --- /dev/null +++ b/docker-compose/README.md @@ -0,0 +1,10 @@ + +# Example docker compose configuration + +The configuration in this directory is used for testing Neon docker images: it is +not intended for deploying a usable system. To run a development environment where +you can experiment with a miniature Neon system, use `cargo neon` rather than container images. + +This configuration does not start the storage controller, because the controller +needs a way to reconfigure running computes, and no such thing exists in this setup. + diff --git a/docker-compose/compute_wrapper/shell/compute.sh b/docker-compose/compute_wrapper/shell/compute.sh index 22660a63ce..f646e36f59 100755 --- a/docker-compose/compute_wrapper/shell/compute.sh +++ b/docker-compose/compute_wrapper/shell/compute.sh @@ -23,11 +23,10 @@ echo "Page server is ready." echo "Create a tenant and timeline" generate_id tenant_id PARAMS=( - -sb - -X POST + -X PUT -H "Content-Type: application/json" - -d "{\"new_tenant_id\": \"${tenant_id}\"}" - http://pageserver:9898/v1/tenant/ + -d "{\"mode\": \"AttachedSingle\", \"generation\": 1, \"tenant_conf\": {}}" + "http://pageserver:9898/v1/tenant/${tenant_id}/location_config" ) result=$(curl "${PARAMS[@]}") echo $result | jq . diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index b1e4525cc0..4875f49495 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -625,8 +625,7 @@ pub struct TenantInfo { /// If a layer is present in both local FS and S3, it counts only once.
pub current_physical_size: Option, // physical size is only included in `tenant_status` endpoint pub attachment_status: TenantAttachmentStatus, - #[serde(skip_serializing_if = "Option::is_none")] - pub generation: Option, + pub generation: u32, } #[derive(Serialize, Deserialize, Clone)] @@ -1453,7 +1452,7 @@ mod tests { state: TenantState::Active, current_physical_size: Some(42), attachment_status: TenantAttachmentStatus::Attached, - generation: None, + generation: 1, }; let expected_active = json!({ "id": original_active.id.to_string(), @@ -1463,7 +1462,8 @@ mod tests { "current_physical_size": 42, "attachment_status": { "slug":"attached", - } + }, + "generation" : 1 }); let original_broken = TenantInfo { @@ -1474,7 +1474,7 @@ mod tests { }, current_physical_size: Some(42), attachment_status: TenantAttachmentStatus::Attached, - generation: None, + generation: 1, }; let expected_broken = json!({ "id": original_broken.id.to_string(), @@ -1488,7 +1488,8 @@ mod tests { "current_physical_size": 42, "attachment_status": { "slug":"attached", - } + }, + "generation" : 1 }); assert_eq!( diff --git a/libs/utils/src/generation.rs b/libs/utils/src/generation.rs index b703e883de..5970836033 100644 --- a/libs/utils/src/generation.rs +++ b/libs/utils/src/generation.rs @@ -9,20 +9,11 @@ use serde::{Deserialize, Serialize}; /// numbers are used. #[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)] pub enum Generation { - // Generations with this magic value will not add a suffix to S3 keys, and will not - // be included in persisted index_part.json. This value is only to be used - // during migration from pre-generation metadata to generation-aware metadata, - // and should eventually go away. - // - // A special Generation is used rather than always wrapping Generation in an Option, - // so that code handling generations doesn't have to be aware of the legacy - // case everywhere it touches a generation. + // The None Generation is used in the metadata of layers written before generations were + // introduced. A running Tenant always has a valid generation, but the layer metadata may + // include None generations. None, - // Generations with this magic value may never be used to construct S3 keys: - // we will panic if someone tries to. This is for Tenants in the "Broken" state, - // so that we can satisfy their constructor with a Generation without risking - // a code bug using it in an S3 write (broken tenants should never write) - Broken, + Valid(u32), } @@ -42,11 +33,6 @@ impl Generation { Self::None } - // Create a new generation that will panic if you try to use get_suffix - pub fn broken() -> Self { - Self::Broken - } - pub const fn new(v: u32) -> Self { Self::Valid(v) } @@ -60,9 +46,6 @@ impl Generation { match self { Self::Valid(v) => GenerationFileSuffix(Some(*v)), Self::None => GenerationFileSuffix(None), - Self::Broken => { - panic!("Tried to use a broken generation"); - } } } @@ -86,7 +69,6 @@ impl Generation { } } Self::None => Self::None, - Self::Broken => panic!("Attempted to use a broken generation"), } } @@ -95,7 +77,6 @@ impl Generation { match self { Self::Valid(n) => Self::Valid(*n + 1), Self::None => Self::Valid(1), - Self::Broken => panic!("Attempted to use a broken generation"), } } @@ -128,7 +109,7 @@ impl Serialize for Generation { if let Self::Valid(v) = self { v.serialize(serializer) } else { - // We should never be asked to serialize a None or Broken. Structures + // We should never be asked to serialize a None. 
Structures // that include an optional generation should convert None to an // Option::None Err(serde::ser::Error::custom( @@ -159,9 +140,6 @@ impl Debug for Generation { Self::None => { write!(f, "") } - Self::Broken => { - write!(f, "") - } } } } diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 104234841c..f36e63f035 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -36,10 +36,7 @@ use crate::tenant::{config::TenantConfOpt, timeline::GetImpl}; use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine}; use crate::{tenant::config::TenantConf, virtual_file}; -use crate::{ - TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, - TIMELINE_DELETE_MARK_SUFFIX, -}; +use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX}; use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP; @@ -810,15 +807,11 @@ impl PageServerConf { } /// Points to a place in pageserver's local directory, - /// where certain tenant's tenantconf file should be located. - /// - /// Legacy: superseded by tenant_location_config_path. Eventually - /// remove this function. - pub fn tenant_config_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf { - self.tenant_path(tenant_shard_id).join(TENANT_CONFIG_NAME) - } - - pub fn tenant_location_config_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf { + /// where certain tenant's LocationConf be stored. + pub(crate) fn tenant_location_config_path( + &self, + tenant_shard_id: &TenantShardId, + ) -> Utf8PathBuf { self.tenant_path(tenant_shard_id) .join(TENANT_LOCATION_CONFIG_NAME) } diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index e779729f8d..3e48552ace 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -382,17 +382,6 @@ pub enum DeletionQueueError { } impl DeletionQueueClient { - pub(crate) fn broken() -> Self { - // Channels whose receivers are immediately dropped. - let (tx, _rx) = tokio::sync::mpsc::unbounded_channel(); - let (executor_tx, _executor_rx) = tokio::sync::mpsc::channel(1); - Self { - tx, - executor_tx, - lsn_table: Arc::default(), - } - } - /// This is cancel-safe. If you drop the future before it completes, the message /// is not pushed, although in the context of the deletion queue it doesn't matter: once /// we decide to do a deletion the decision is always final. 
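For reference (not part of the patch): with the legacy config path and `Generation::Broken` gone, a tenant is only ever attached through the `location_config` API with an explicit generation. Below is a standalone Python sketch of the call that the docker-compose `compute.sh` above now makes; the pageserver address and the `generation: 1` value are assumptions copied from that script.

import uuid

import requests

# Assumed pageserver address from the docker-compose setup in this patch.
PAGESERVER_HTTP = "http://pageserver:9898"

# A tenant id is a 16-byte hex string; a random UUID's hex form has the right shape.
tenant_id = uuid.uuid4().hex

# Attach the tenant with an explicit generation instead of the removed
# legacy `POST /v1/tenant/` default-config path.
resp = requests.put(
    f"{PAGESERVER_HTTP}/v1/tenant/{tenant_id}/location_config",
    json={"mode": "AttachedSingle", "generation": 1, "tenant_conf": {}},
)
resp.raise_for_status()
print(resp.status_code, resp.text)
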
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 41d096d7bb..5ebd34a406 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -887,7 +887,9 @@ async fn tenant_list_handler( state: state.clone(), current_physical_size: None, attachment_status: state.attachment_status(), - generation: (*gen).into(), + generation: (*gen) + .into() + .expect("Tenants are always attached with a generation"), }) .collect::>(); @@ -935,7 +937,10 @@ async fn tenant_status( state: state.clone(), current_physical_size: Some(current_physical_size), attachment_status: state.attachment_status(), - generation: tenant.generation().into(), + generation: tenant + .generation() + .into() + .expect("Tenants are always attached with a generation"), }, walredo: tenant.wal_redo_manager_status(), timelines: tenant.list_timeline_ids(), diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 9e64eafffc..353f97264c 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -113,11 +113,7 @@ pub async fn shutdown_pageserver( } /// Per-tenant configuration file. -/// Full path: `tenants//config`. -pub(crate) const TENANT_CONFIG_NAME: &str = "config"; - -/// Per-tenant configuration file. -/// Full path: `tenants//config`. +/// Full path: `tenants//config-v1`. pub(crate) const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1"; /// Per-tenant copy of their remote heatmap, downloaded into the local diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 30e855eaa2..45e542a336 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -648,7 +648,7 @@ impl Tenant { init_order: Option, mode: SpawnMode, ctx: &RequestContext, - ) -> anyhow::Result> { + ) -> Arc { let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new( conf, tenant_shard_id, @@ -856,7 +856,7 @@ impl Tenant { } .instrument(tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), gen=?generation)), ); - Ok(tenant) + tenant } #[instrument(skip_all)] @@ -1147,30 +1147,6 @@ impl Tenant { .await } - /// Create a placeholder Tenant object for a broken tenant - pub fn create_broken_tenant( - conf: &'static PageServerConf, - tenant_shard_id: TenantShardId, - remote_storage: GenericRemoteStorage, - reason: String, - ) -> Arc { - Arc::new(Tenant::new( - TenantState::Broken { - reason, - backtrace: String::new(), - }, - conf, - AttachedTenantConf::try_from(LocationConf::default()).unwrap(), - // Shard identity isn't meaningful for a broken tenant: it's just a placeholder - // to occupy the slot for this TenantShardId. 
- ShardIdentity::broken(tenant_shard_id.shard_number, tenant_shard_id.shard_count), - None, - tenant_shard_id, - remote_storage, - DeletionQueueClient::broken(), - )) - } - async fn load_timeline_metadata( self: &Arc, timeline_ids: HashSet, @@ -2494,6 +2470,10 @@ impl Tenant { remote_storage: GenericRemoteStorage, deletion_queue_client: DeletionQueueClient, ) -> Tenant { + debug_assert!( + !attached_conf.location.generation.is_none() || conf.control_plane_api.is_none() + ); + let (state, mut rx) = watch::channel(state); tokio::spawn(async move { @@ -2584,45 +2564,22 @@ impl Tenant { conf: &'static PageServerConf, tenant_shard_id: &TenantShardId, ) -> anyhow::Result { - let legacy_config_path = conf.tenant_config_path(tenant_shard_id); let config_path = conf.tenant_location_config_path(tenant_shard_id); if config_path.exists() { // New-style config takes precedence let deserialized = Self::read_config(&config_path)?; Ok(toml_edit::de::from_document::(deserialized)?) - } else if legacy_config_path.exists() { - // Upgrade path: found an old-style configuration only - let deserialized = Self::read_config(&legacy_config_path)?; - - let mut tenant_conf = TenantConfOpt::default(); - for (key, item) in deserialized.iter() { - match key { - "tenant_config" => { - tenant_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("Failed to parse config from file '{legacy_config_path}' as pageserver config"))?; - } - _ => bail!( - "config file {legacy_config_path} has unrecognized pageserver option '{key}'" - ), - } - } - - // Legacy configs are implicitly in attached state, and do not support sharding - Ok(LocationConf::attached_single( - tenant_conf, - Generation::none(), - &models::ShardParameters::default(), - )) } else { - // FIXME If the config file is not found, assume that we're attaching - // a detached tenant and config is passed via attach command. - // https://github.com/neondatabase/neon/issues/1555 - // OR: we're loading after incomplete deletion that managed to remove config. - info!( - "tenant config not found in {} or {}", - config_path, legacy_config_path - ); - Ok(LocationConf::default()) + // The config should almost always exist for a tenant directory: + // - When attaching a tenant, the config is the first thing we write + // - When detaching a tenant, we atomically move the directory to a tmp location + // before deleting contents. + // + // The very rare edge case that can result in a missing config is if we crash during attach + // between creating directory and writing config. Callers should handle that as if the + // directory didn't exist. 
+ anyhow::bail!("tenant config not found in {}", config_path); } } @@ -2644,47 +2601,17 @@ impl Tenant { tenant_shard_id: &TenantShardId, location_conf: &LocationConf, ) -> anyhow::Result<()> { - let legacy_config_path = conf.tenant_config_path(tenant_shard_id); let config_path = conf.tenant_location_config_path(tenant_shard_id); - Self::persist_tenant_config_at( - tenant_shard_id, - &config_path, - &legacy_config_path, - location_conf, - ) - .await + Self::persist_tenant_config_at(tenant_shard_id, &config_path, location_conf).await } #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))] pub(super) async fn persist_tenant_config_at( tenant_shard_id: &TenantShardId, config_path: &Utf8Path, - legacy_config_path: &Utf8Path, location_conf: &LocationConf, ) -> anyhow::Result<()> { - if let LocationMode::Attached(attach_conf) = &location_conf.mode { - // The modern-style LocationConf config file requires a generation to be set. In case someone - // is running a pageserver without the infrastructure to set generations, write out the legacy-style - // config file that only contains TenantConf. - // - // This will eventually be removed in https://github.com/neondatabase/neon/issues/5388 - - if attach_conf.generation.is_none() { - tracing::info!( - "Running without generations, writing legacy-style tenant config file" - ); - Self::persist_tenant_config_legacy( - tenant_shard_id, - legacy_config_path, - &location_conf.tenant_conf, - ) - .await?; - - return Ok(()); - } - } - debug!("persisting tenantconf to {config_path}"); let mut conf_content = r#"# This file contains a specific per-tenant's config. @@ -2711,37 +2638,6 @@ impl Tenant { Ok(()) } - #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))] - async fn persist_tenant_config_legacy( - tenant_shard_id: &TenantShardId, - target_config_path: &Utf8Path, - tenant_conf: &TenantConfOpt, - ) -> anyhow::Result<()> { - debug!("persisting tenantconf to {target_config_path}"); - - let mut conf_content = r#"# This file contains a specific per-tenant's config. -# It is read in case of pageserver restart. - -[tenant_config] -"# - .to_string(); - - // Convert the config to a toml file. - conf_content += &toml_edit::ser::to_string(&tenant_conf)?; - - let temp_path = path_with_suffix_extension(target_config_path, TEMP_FILE_SUFFIX); - - let tenant_shard_id = *tenant_shard_id; - let target_config_path = target_config_path.to_owned(); - let conf_content = conf_content.into_bytes(); - VirtualFile::crashsafe_overwrite(target_config_path.clone(), temp_path, conf_content) - .await - .with_context(|| { - format!("write tenant {tenant_shard_id} config to {target_config_path}") - })?; - Ok(()) - } - // // How garbage collection works: // diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 1b9be12642..5b532e4830 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -281,22 +281,6 @@ impl LocationConf { } } -impl Default for LocationConf { - // TODO: this should be removed once tenant loading can guarantee that we are never - // loading from a directory without a configuration. 
- // => tech debt since https://github.com/neondatabase/neon/issues/1555 - fn default() -> Self { - Self { - mode: LocationMode::Attached(AttachedLocationConfig { - generation: Generation::none(), - attach_mode: AttachmentMode::Single, - }), - tenant_conf: TenantConfOpt::default(), - shard: ShardIdentity::unsharded(), - } - } -} - /// A tenant's calcuated configuration, which is the result of merging a /// tenant's TenantConfOpt with the global TenantConf from PageServerConf. /// diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 1bc21d8b78..08c3f19b6f 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -495,17 +495,8 @@ pub async fn init_tenant_mgr( let mut location_conf = match location_conf { Ok(l) => l, Err(e) => { - warn!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Marking tenant broken, failed to {e:#}"); - - tenants.insert( - tenant_shard_id, - TenantSlot::Attached(Tenant::create_broken_tenant( - conf, - tenant_shard_id, - resources.remote_storage.clone(), - format!("{}", e), - )), - ); + // This should only happen in the case of a serialization bug or critical local I/O error: we cannot load this tenant + error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to load tenant config, failed to {e:#}"); continue; } }; @@ -687,8 +678,7 @@ fn tenant_spawn( "Cannot load tenant from empty directory {tenant_path:?}" ); - let remote_storage = resources.remote_storage.clone(); - let tenant = match Tenant::spawn( + let tenant = Tenant::spawn( conf, tenant_shard_id, resources, @@ -697,13 +687,7 @@ fn tenant_spawn( init_order, mode, ctx, - ) { - Ok(tenant) => tenant, - Err(e) => { - error!("Failed to spawn tenant {tenant_shard_id}, reason: {e:#}"); - Tenant::create_broken_tenant(conf, tenant_shard_id, remote_storage, format!("{e:#}")) - } - }; + ); Ok(tenant) } diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs index 9c7a9c4234..0aad5bf392 100644 --- a/pageserver/src/tenant/secondary/heatmap_uploader.rs +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -367,10 +367,9 @@ async fn upload_tenant_heatmap( debug_assert_current_span_has_tenant_id(); let generation = tenant.get_generation(); + debug_assert!(!generation.is_none()); if generation.is_none() { - // We do not expect this: generations were implemented before heatmap uploads. 
However, - // handle it so that we don't have to make the generation in the heatmap an Option<> - // (Generation::none is not serializable) + // We do not expect this: None generations should only appear in historic layer metadata, not in running Tenants tracing::warn!("Skipping heatmap upload for tenant with generation==None"); return Ok(UploadHeatmapOutcome::Skipped); } diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 7eb42d8186..5dd9472535 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -93,16 +93,12 @@ pub(crate) struct Layer(Arc); impl std::fmt::Display for Layer { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - if matches!(self.0.generation, Generation::Broken) { - write!(f, "{}-broken", self.layer_desc().short_id()) - } else { - write!( - f, - "{}{}", - self.layer_desc().short_id(), - self.0.generation.get_suffix() - ) - } + write!( + f, + "{}{}", + self.layer_desc().short_id(), + self.0.generation.get_suffix() + ) } } From 04b2ac3fed635bf32b66fa2f9212ce6f5644c8a5 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 26 Jun 2024 16:33:15 -0400 Subject: [PATCH 1062/1571] test: use aux file v2 policy in benchmarks (#8174) Use aux file v2 in benchmarks. Signed-off-by: Alex Chi Z --- test_runner/fixtures/neon_fixtures.py | 12 +++++------- test_runner/performance/test_logical_replication.py | 3 ++- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 6a29df6f13..c5fc7ee351 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -581,7 +581,7 @@ class NeonEnvBuilder: timeline_id=env.initial_timeline, shard_count=initial_tenant_shard_count, shard_stripe_size=initial_tenant_shard_stripe_size, - aux_file_v2=self.pageserver_aux_file_policy, + aux_file_policy=self.pageserver_aux_file_policy, ) assert env.initial_tenant == initial_tenant assert env.initial_timeline == initial_timeline @@ -1604,7 +1604,7 @@ class NeonCli(AbstractNeonCli): shard_stripe_size: Optional[int] = None, placement_policy: Optional[str] = None, set_default: bool = False, - aux_file_v2: Optional[AuxFileStore] = None, + aux_file_policy: Optional[AuxFileStore] = None, ) -> Tuple[TenantId, TimelineId]: """ Creates a new tenant, returns its id and its initial timeline's id. 
@@ -1629,13 +1629,11 @@ class NeonCli(AbstractNeonCli): ) ) - if aux_file_v2 is AuxFileStore.V2: + if aux_file_policy is AuxFileStore.V2: args.extend(["-c", "switch_aux_file_policy:v2"]) - - if aux_file_v2 is AuxFileStore.V1: + elif aux_file_policy is AuxFileStore.V1: args.extend(["-c", "switch_aux_file_policy:v1"]) - - if aux_file_v2 is AuxFileStore.CrossValidation: + elif aux_file_policy is AuxFileStore.CrossValidation: args.extend(["-c", "switch_aux_file_policy:cross-validation"]) if set_default: diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index b799f7248f..7d11facc29 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -2,9 +2,10 @@ import time import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, PgBin, logical_replication_sync +from fixtures.neon_fixtures import AuxFileStore, NeonEnv, PgBin, logical_replication_sync +@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.V2]) @pytest.mark.timeout(1000) def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg): env = neon_simple_env From d2753719e322e3efae50a49bd3935cfd465d1434 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 27 Jun 2024 00:54:29 +0300 Subject: [PATCH 1063/1571] test: Add helper function for importing a Postgres cluster (#8025) Also, modify the "neon_local timeline import" command so that it doesn't create the endpoint any more. I don't see any reason to bundle that in the same command, the "timeline create" and "timeline branch" commands don't do that either. I plan to add more tests similar to 'test_import_at_2bil', this will help to reduce the copy-pasting. 
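For reference (not part of the patch itself), here is a sketch of how a test is expected to call the new helper, mirroring the `test_import_at_2bil` changes further below. The test name, branch name, and table contents are illustrative, and the usual `test_output_dir`, `pg_bin`, and `vanilla_pg` fixtures are assumed.

from fixtures.common_types import TenantId, TimelineId
from fixtures.neon_fixtures import (
    NeonEnvBuilder,
    PgBin,
    import_timeline_from_vanilla_postgres,
)


def test_import_example(test_output_dir, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, vanilla_pg):
    env = neon_env_builder.init_start()

    # Populate the stand-alone (vanilla) Postgres cluster that will be imported.
    vanilla_pg.start()
    vanilla_pg.safe_psql("CREATE TABLE t AS SELECT generate_series(1, 1000) AS i")
    vanilla_pg.safe_psql("CHECKPOINT")

    tenant_id = TenantId.generate()
    env.pageserver.tenant_create(tenant_id)
    timeline_id = TimelineId.generate()

    # Take a basebackup of the vanilla cluster and import it as a new timeline.
    import_timeline_from_vanilla_postgres(
        test_output_dir,
        env,
        pg_bin,
        tenant_id,
        timeline_id,
        "imported_branch",
        vanilla_pg.connstr(),
    )
    vanilla_pg.stop()  # the original server is no longer needed

    # "timeline import" no longer creates an endpoint, so start one explicitly.
    endpoint = env.endpoints.create_start("imported_branch", tenant_id=tenant_id)
    assert endpoint.safe_psql("SELECT count(*) FROM t")[0][0] == 1000
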
--- control_plane/src/bin/neon_local.rs | 29 ++------- test_runner/fixtures/neon_fixtures.py | 64 +++++++++++++++++++ test_runner/regress/test_import.py | 16 ++--- test_runner/regress/test_next_xid.py | 91 ++++++++------------------- 4 files changed, 102 insertions(+), 98 deletions(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 3f656932d5..f381337346 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -600,13 +600,9 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local Some(("import", import_match)) => { let tenant_id = get_tenant_id(import_match, env)?; let timeline_id = parse_timeline_id(import_match)?.expect("No timeline id provided"); - let name = import_match - .get_one::("node-name") - .ok_or_else(|| anyhow!("No node name provided"))?; - let update_catalog = import_match - .get_one::("update-catalog") - .cloned() - .unwrap_or_default(); + let branch_name = import_match + .get_one::("branch-name") + .ok_or_else(|| anyhow!("No branch name provided"))?; // Parse base inputs let base_tarfile = import_match @@ -633,24 +629,11 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local .copied() .context("Failed to parse postgres version from the argument string")?; - let mut cplane = ComputeControlPlane::load(env.clone())?; println!("Importing timeline into pageserver ..."); pageserver .timeline_import(tenant_id, timeline_id, base, pg_wal, pg_version) .await?; - env.register_branch_mapping(name.to_string(), tenant_id, timeline_id)?; - - println!("Creating endpoint for imported timeline ..."); - cplane.new_endpoint( - name, - tenant_id, - timeline_id, - None, - None, - pg_version, - ComputeMode::Primary, - !update_catalog, - )?; + env.register_branch_mapping(branch_name.to_string(), tenant_id, timeline_id)?; println!("Done"); } Some(("branch", branch_match)) => { @@ -1487,8 +1470,7 @@ fn cli() -> Command { .about("Import timeline from basebackup directory") .arg(tenant_id_arg.clone()) .arg(timeline_id_arg.clone()) - .arg(Arg::new("node-name").long("node-name") - .help("Name to assign to the imported timeline")) + .arg(branch_name_arg.clone()) .arg(Arg::new("base-tarfile") .long("base-tarfile") .value_parser(value_parser!(PathBuf)) @@ -1504,7 +1486,6 @@ fn cli() -> Command { .arg(Arg::new("end-lsn").long("end-lsn") .help("Lsn the basebackup ends at")) .arg(pg_version_arg.clone()) - .arg(update_catalog.clone()) ) ).subcommand( Command::new("tenant") diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index c5fc7ee351..6bfe1afd1f 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4659,6 +4659,70 @@ def fork_at_current_lsn( return env.neon_cli.create_branch(new_branch_name, ancestor_branch_name, tenant_id, current_lsn) +def import_timeline_from_vanilla_postgres( + test_output_dir: Path, + env: NeonEnv, + pg_bin: PgBin, + tenant_id: TenantId, + timeline_id: TimelineId, + branch_name: str, + vanilla_pg_connstr: str, +): + """ + Create a new timeline, by importing an existing PostgreSQL cluster. + + This works by taking a physical backup of the running PostgreSQL cluster, and importing that. 
+ """ + + # Take backup of the existing PostgreSQL server with pg_basebackup + basebackup_dir = os.path.join(test_output_dir, "basebackup") + base_tar = os.path.join(basebackup_dir, "base.tar") + wal_tar = os.path.join(basebackup_dir, "pg_wal.tar") + os.mkdir(basebackup_dir) + pg_bin.run( + [ + "pg_basebackup", + "-F", + "tar", + "-d", + vanilla_pg_connstr, + "-D", + basebackup_dir, + ] + ) + + # Extract start_lsn and end_lsn form the backup manifest file + with open(os.path.join(basebackup_dir, "backup_manifest")) as f: + manifest = json.load(f) + start_lsn = manifest["WAL-Ranges"][0]["Start-LSN"] + end_lsn = manifest["WAL-Ranges"][0]["End-LSN"] + + # Import the backup tarballs into the pageserver + env.neon_cli.raw_cli( + [ + "timeline", + "import", + "--tenant-id", + str(tenant_id), + "--timeline-id", + str(timeline_id), + "--branch-name", + branch_name, + "--base-lsn", + start_lsn, + "--base-tarfile", + base_tar, + "--end-lsn", + end_lsn, + "--wal-tarfile", + wal_tar, + "--pg-version", + env.pg_version, + ] + ) + wait_for_last_record_lsn(env.pageserver.http_client(), tenant_id, timeline_id, Lsn(end_lsn)) + + def last_flush_lsn_upload( env: NeonEnv, endpoint: Endpoint, diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index ac27a4cf36..d97e882a70 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -76,7 +76,7 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build start_lsn = manifest["WAL-Ranges"][0]["Start-LSN"] end_lsn = manifest["WAL-Ranges"][0]["End-LSN"] - endpoint_id = "ep-import_from_vanilla" + branch_name = "import_from_vanilla" tenant = TenantId.generate() timeline = TimelineId.generate() @@ -106,8 +106,8 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build str(tenant), "--timeline-id", str(timeline), - "--node-name", - endpoint_id, + "--branch-name", + branch_name, "--base-lsn", start_lsn, "--base-tarfile", @@ -146,7 +146,7 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build wait_for_upload(client, tenant, timeline, Lsn(end_lsn)) # Check it worked - endpoint = env.endpoints.create_start(endpoint_id, tenant_id=tenant) + endpoint = env.endpoints.create_start(branch_name, tenant_id=tenant) assert endpoint.safe_psql("select count(*) from t") == [(300000,)] vanilla_pg.stop() @@ -265,7 +265,7 @@ def _import( tenant = TenantId.generate() # Import to pageserver - endpoint_id = "ep-import_from_pageserver" + branch_name = "import_from_pageserver" client = env.pageserver.http_client() env.pageserver.tenant_create(tenant) env.neon_cli.raw_cli( @@ -276,8 +276,8 @@ def _import( str(tenant), "--timeline-id", str(timeline), - "--node-name", - endpoint_id, + "--branch-name", + branch_name, "--base-lsn", str(lsn), "--base-tarfile", @@ -292,7 +292,7 @@ def _import( wait_for_upload(client, tenant, timeline, lsn) # Check it worked - endpoint = env.endpoints.create_start(endpoint_id, tenant_id=tenant, lsn=lsn) + endpoint = env.endpoints.create_start(branch_name, tenant_id=tenant, lsn=lsn) assert endpoint.safe_psql("select count(*) from tbl") == [(expected_num_rows,)] # Take another fullbackup diff --git a/test_runner/regress/test_next_xid.py b/test_runner/regress/test_next_xid.py index 98fb06a0d6..b9e7e642b5 100644 --- a/test_runner/regress/test_next_xid.py +++ b/test_runner/regress/test_next_xid.py @@ -1,13 +1,14 @@ -import json import os import time from pathlib import Path -from fixtures.common_types import Lsn, TenantId, 
TimelineId +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_wal_insert_lsn -from fixtures.pageserver.utils import ( - wait_for_last_record_lsn, +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + PgBin, + import_timeline_from_vanilla_postgres, + wait_for_wal_insert_lsn, ) from fixtures.remote_storage import RemoteStorageKind from fixtures.utils import query_scalar @@ -76,7 +77,6 @@ def test_import_at_2bil( ): neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() - ps_http = env.pageserver.http_client() # Reset the vanilla Postgres instance to somewhat before 2 billion transactions. pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, "pg_resetwal") @@ -92,68 +92,28 @@ def test_import_at_2bil( assert vanilla_pg.safe_psql("select count(*) from tt") == [(300000,)] vanilla_pg.safe_psql("CREATE TABLE t (t text);") vanilla_pg.safe_psql("INSERT INTO t VALUES ('inserted in vanilla')") - - endpoint_id = "ep-import_from_vanilla" - tenant = TenantId.generate() - timeline = TimelineId.generate() - - env.pageserver.tenant_create(tenant) - - # Take basebackup - basebackup_dir = os.path.join(test_output_dir, "basebackup") - base_tar = os.path.join(basebackup_dir, "base.tar") - wal_tar = os.path.join(basebackup_dir, "pg_wal.tar") - os.mkdir(basebackup_dir) vanilla_pg.safe_psql("CHECKPOINT") - pg_bin.run( - [ - "pg_basebackup", - "-F", - "tar", - "-d", - vanilla_pg.connstr(), - "-D", - basebackup_dir, - ] + + tenant_id = TenantId.generate() + env.pageserver.tenant_create(tenant_id) + timeline_id = TimelineId.generate() + + # Import the cluster to Neon + import_timeline_from_vanilla_postgres( + test_output_dir, + env, + pg_bin, + tenant_id, + timeline_id, + "imported_2bil_xids", + vanilla_pg.connstr(), ) + vanilla_pg.stop() # don't need the original server anymore - # Get start_lsn and end_lsn - with open(os.path.join(basebackup_dir, "backup_manifest")) as f: - manifest = json.load(f) - start_lsn = manifest["WAL-Ranges"][0]["Start-LSN"] - end_lsn = manifest["WAL-Ranges"][0]["End-LSN"] - - def import_tar(base, wal): - env.neon_cli.raw_cli( - [ - "timeline", - "import", - "--tenant-id", - str(tenant), - "--timeline-id", - str(timeline), - "--node-name", - endpoint_id, - "--base-lsn", - start_lsn, - "--base-tarfile", - base, - "--end-lsn", - end_lsn, - "--wal-tarfile", - wal, - "--pg-version", - env.pg_version, - ] - ) - - # Importing correct backup works - import_tar(base_tar, wal_tar) - wait_for_last_record_lsn(ps_http, tenant, timeline, Lsn(end_lsn)) - + # Check that it works endpoint = env.endpoints.create_start( - endpoint_id, - tenant_id=tenant, + "imported_2bil_xids", + tenant_id=tenant_id, config_lines=[ "log_autovacuum_min_duration = 0", "autovacuum_naptime='5 s'", @@ -161,7 +121,6 @@ def test_import_at_2bil( ) assert endpoint.safe_psql("select count(*) from t") == [(1,)] - # Ok, consume conn = endpoint.connect() cur = conn.cursor() @@ -213,7 +172,7 @@ def test_import_at_2bil( cur.execute("checkpoint") # wait until pageserver receives that data - wait_for_wal_insert_lsn(env, endpoint, tenant, timeline) + wait_for_wal_insert_lsn(env, endpoint, tenant_id, timeline_id) # Restart endpoint endpoint.stop() From 32b75e7c7361d57671c037651c6fa943f18e94a7 Mon Sep 17 00:00:00 2001 From: Cihan Demirci <128653800+fcdm@users.noreply.github.com> Date: Wed, 26 Jun 2024 23:36:41 +0100 Subject: [PATCH 1064/1571] CI: additional trigger on merge to 
main (#8176) Before we consolidate workflows we want to be triggered by merges to main. https://github.com/neondatabase/cloud/issues/14862 --- .github/workflows/build_and_test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 113b37ae51..87f04996fd 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1245,6 +1245,7 @@ jobs: run: | if [[ "$GITHUB_REF_NAME" == "main" ]]; then gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false + gh workflow --repo neondatabase/azure run deploy.yml -f dockerTag=${{needs.tag.outputs.build-tag}} elif [[ "$GITHUB_REF_NAME" == "release" ]]; then gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \ -f deployPgSniRouter=false \ From d55700267503f68e707e137710d1ee7a0dc4d693 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 27 Jun 2024 11:56:57 +0100 Subject: [PATCH 1065/1571] strocon: don't overcommit when making node fill plan (#8171) ## Problem The fill requirement was not taken into account when looking through the shards of a given node to fill from. ## Summary of Changes Ensure that we do not fill a node past the recommendation from `Scheduler::compute_fill_requirement`. --- storage_controller/src/service.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index e329f42dd6..a94575b428 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -5564,9 +5564,12 @@ impl Service { break; } - let mut can_take = attached - expected_attached; + let can_take = attached - expected_attached; + let needed = fill_requirement - plan.len(); + let mut take = std::cmp::min(can_take, needed); + let mut remove_node = false; - while can_take > 0 { + while take > 0 { match tids_by_node.get_mut(&node_id) { Some(tids) => match tids.pop() { Some(tid) => { @@ -5578,7 +5581,7 @@ impl Service { if *promoted < max_promote_for_tenant { plan.push(tid); *promoted += 1; - can_take -= 1; + take -= 1; } } None => { From 6f20a18e8e20c7cdf689b728fccd7de88eb29ca7 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 17 Jun 2024 16:23:07 +0300 Subject: [PATCH 1066/1571] Allow to change compute safekeeper list without restart. - Add --safekeepers option to neon_local reconfigure - Add it to python Endpoint reconfigure - Implement config reload in walproposer by restarting the whole bgw when safekeeper list changes. ref https://github.com/neondatabase/neon/issues/6341 --- control_plane/src/bin/neon_local.rs | 46 +++++++++++++-------- control_plane/src/endpoint.rs | 37 +++++++++++------ pgxn/neon/walproposer_pg.c | 52 ++++++++++++++++++++++-- storage_controller/src/compute_hook.rs | 2 +- test_runner/fixtures/neon_fixtures.py | 22 +++++++++- test_runner/regress/test_wal_acceptor.py | 35 +++++++--------- 6 files changed, 139 insertions(+), 55 deletions(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index f381337346..2c05938f44 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -848,20 +848,13 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re let allow_multiple = sub_args.get_flag("allow-multiple"); - // If --safekeepers argument is given, use only the listed safekeeper nodes. 
- let safekeepers = - if let Some(safekeepers_str) = sub_args.get_one::("safekeepers") { - let mut safekeepers: Vec = Vec::new(); - for sk_id in safekeepers_str.split(',').map(str::trim) { - let sk_id = NodeId(u64::from_str(sk_id).map_err(|_| { - anyhow!("invalid node ID \"{sk_id}\" in --safekeepers list") - })?); - safekeepers.push(sk_id); - } - safekeepers - } else { - env.safekeepers.iter().map(|sk| sk.id).collect() - }; + // If --safekeepers argument is given, use only the listed + // safekeeper nodes; otherwise all from the env. + let safekeepers = if let Some(safekeepers) = parse_safekeepers(sub_args)? { + safekeepers + } else { + env.safekeepers.iter().map(|sk| sk.id).collect() + }; let endpoint = cplane .endpoints @@ -965,7 +958,10 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re }) .collect::>() }; - endpoint.reconfigure(pageservers, None).await?; + // If --safekeepers argument is given, use only the listed + // safekeeper nodes; otherwise all from the env. + let safekeepers = parse_safekeepers(sub_args)?; + endpoint.reconfigure(pageservers, None, safekeepers).await?; } "stop" => { let endpoint_id = sub_args @@ -987,6 +983,23 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re Ok(()) } +/// Parse --safekeepers as list of safekeeper ids. +fn parse_safekeepers(sub_args: &ArgMatches) -> Result>> { + if let Some(safekeepers_str) = sub_args.get_one::("safekeepers") { + let mut safekeepers: Vec = Vec::new(); + for sk_id in safekeepers_str.split(',').map(str::trim) { + let sk_id = NodeId( + u64::from_str(sk_id) + .map_err(|_| anyhow!("invalid node ID \"{sk_id}\" in --safekeepers list"))?, + ); + safekeepers.push(sk_id); + } + Ok(Some(safekeepers)) + } else { + Ok(None) + } +} + fn handle_mappings(sub_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> { let (sub_name, sub_args) = match sub_match.subcommand() { Some(ep_subcommand_data) => ep_subcommand_data, @@ -1590,7 +1603,7 @@ fn cli() -> Command { .about("Start postgres.\n If the endpoint doesn't exist yet, it is created.") .arg(endpoint_id_arg.clone()) .arg(endpoint_pageserver_id_arg.clone()) - .arg(safekeepers_arg) + .arg(safekeepers_arg.clone()) .arg(remote_ext_config_args) .arg(create_test_user) .arg(allow_multiple.clone()) @@ -1599,6 +1612,7 @@ fn cli() -> Command { .subcommand(Command::new("reconfigure") .about("Reconfigure the endpoint") .arg(endpoint_pageserver_id_arg) + .arg(safekeepers_arg) .arg(endpoint_id_arg.clone()) .arg(tenant_id_arg.clone()) ) diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index b928bbfc30..f9bb2da7e7 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -499,6 +499,23 @@ impl Endpoint { .join(",") } + /// Map safekeepers ids to the actual connection strings. 
+ fn build_safekeepers_connstrs(&self, sk_ids: Vec) -> Result> { + let mut safekeeper_connstrings = Vec::new(); + if self.mode == ComputeMode::Primary { + for sk_id in sk_ids { + let sk = self + .env + .safekeepers + .iter() + .find(|node| node.id == sk_id) + .ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?; + safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.get_compute_port())); + } + } + Ok(safekeeper_connstrings) + } + pub async fn start( &self, auth_token: &Option, @@ -523,18 +540,7 @@ impl Endpoint { let pageserver_connstring = Self::build_pageserver_connstr(&pageservers); assert!(!pageserver_connstring.is_empty()); - let mut safekeeper_connstrings = Vec::new(); - if self.mode == ComputeMode::Primary { - for sk_id in safekeepers { - let sk = self - .env - .safekeepers - .iter() - .find(|node| node.id == sk_id) - .ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?; - safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.get_compute_port())); - } - } + let safekeeper_connstrings = self.build_safekeepers_connstrs(safekeepers)?; // check for file remote_extensions_spec.json // if it is present, read it and pass to compute_ctl @@ -740,6 +746,7 @@ impl Endpoint { &self, mut pageservers: Vec<(Host, u16)>, stripe_size: Option, + safekeepers: Option>, ) -> Result<()> { let mut spec: ComputeSpec = { let spec_path = self.endpoint_path().join("spec.json"); @@ -774,6 +781,12 @@ impl Endpoint { spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize); } + // If safekeepers are not specified, don't change them. + if let Some(safekeepers) = safekeepers { + let safekeeper_connstrings = self.build_safekeepers_connstrs(safekeepers)?; + spec.safekeeper_connstrings = safekeeper_connstrings; + } + let client = reqwest::Client::builder() .timeout(Duration::from_secs(30)) .build() diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index da1a6f76f0..944b316344 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -63,6 +63,8 @@ char *wal_acceptors_list = ""; int wal_acceptor_reconnect_timeout = 1000; int wal_acceptor_connection_timeout = 10000; +/* Set to true in the walproposer bgw. 
*/ +static bool am_walproposer; static WalproposerShmemState *walprop_shared; static WalProposerConfig walprop_config; static XLogRecPtr sentPtr = InvalidXLogRecPtr; @@ -76,6 +78,7 @@ static HotStandbyFeedback agg_hs_feedback; static void nwp_shmem_startup_hook(void); static void nwp_register_gucs(void); +static void assign_neon_safekeepers(const char *newval, void *extra); static void nwp_prepare_shmem(void); static uint64 backpressure_lag_impl(void); static bool backpressure_throttling_impl(void); @@ -111,7 +114,8 @@ init_walprop_config(bool syncSafekeepers) { walprop_config.neon_tenant = neon_tenant; walprop_config.neon_timeline = neon_timeline; - walprop_config.safekeepers_list = wal_acceptors_list; + /* WalProposerCreate scribbles directly on it, so pstrdup */ + walprop_config.safekeepers_list = pstrdup(wal_acceptors_list); walprop_config.safekeeper_reconnect_timeout = wal_acceptor_reconnect_timeout; walprop_config.safekeeper_connection_timeout = wal_acceptor_connection_timeout; walprop_config.wal_segment_size = wal_segment_size; @@ -151,6 +155,7 @@ WalProposerMain(Datum main_arg) init_walprop_config(false); walprop_pg_init_bgworker(); + am_walproposer = true; walprop_pg_load_libpqwalreceiver(); wp = WalProposerCreate(&walprop_config, walprop_pg); @@ -189,10 +194,10 @@ nwp_register_gucs(void) NULL, /* long_desc */ &wal_acceptors_list, /* valueAddr */ "", /* bootValue */ - PGC_POSTMASTER, + PGC_SIGHUP, GUC_LIST_INPUT, /* extensions can't use* * GUC_LIST_QUOTE */ - NULL, NULL, NULL); + NULL, assign_neon_safekeepers, NULL); DefineCustomIntVariable( "neon.safekeeper_reconnect_timeout", @@ -215,6 +220,33 @@ nwp_register_gucs(void) NULL, NULL, NULL); } +/* + * GUC assign_hook for neon.safekeepers. Restarts walproposer through FATAL if + * the list changed. + */ +static void +assign_neon_safekeepers(const char *newval, void *extra) +{ + if (!am_walproposer) + return; + + if (!newval) { + /* should never happen */ + wpg_log(FATAL, "neon.safekeepers is empty"); + } + + /* + * TODO: restarting through FATAL is stupid and introduces 1s delay before + * next bgw start. We should refactor walproposer to allow graceful exit and + * thus remove this delay. + */ + if (strcmp(wal_acceptors_list, newval) != 0) + { + wpg_log(FATAL, "restarting walproposer to change safekeeper list from %s to %s", + wal_acceptors_list, newval); + } +} + /* Check if we need to suspend inserts because of lagging replication. */ static uint64 backpressure_lag_impl(void) @@ -363,7 +395,7 @@ walprop_register_bgworker(void) snprintf(bgw.bgw_function_name, BGW_MAXLEN, "WalProposerMain"); snprintf(bgw.bgw_name, BGW_MAXLEN, "WAL proposer"); snprintf(bgw.bgw_type, BGW_MAXLEN, "WAL proposer"); - bgw.bgw_restart_time = 5; + bgw.bgw_restart_time = 1; bgw.bgw_notify_pid = 0; bgw.bgw_main_arg = (Datum) 0; @@ -1639,6 +1671,18 @@ walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32 late_cv_trigger = ConditionVariableCancelSleep(); #endif + /* + * Process config if requested. This restarts walproposer if safekeepers + * list changed. Don't do that for sync-safekeepers because quite probably + * it (re-reading config) won't work without some effort, and + * sync-safekeepers should be quick to finish anyway. + */ + if (!wp->config->syncSafekeepers && ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + } + /* * If wait is terminated by latch set (walsenders' latch is set on each * wal flush). 
(no need for pm death check due to WL_EXIT_ON_PM_DEATH) diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index 4d0f8006aa..c46539485c 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -323,7 +323,7 @@ impl ComputeHook { if endpoint.tenant_id == *tenant_id && endpoint.status() == EndpointStatus::Running { tracing::info!("Reconfiguring endpoint {}", endpoint_name,); endpoint - .reconfigure(compute_pageservers.clone(), *stripe_size) + .reconfigure(compute_pageservers.clone(), *stripe_size, None) .await .map_err(NotifyError::NeonLocal)?; } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 6bfe1afd1f..a3f83abd3e 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1933,6 +1933,7 @@ class NeonCli(AbstractNeonCli): endpoint_id: str, tenant_id: Optional[TenantId] = None, pageserver_id: Optional[int] = None, + safekeepers: Optional[List[int]] = None, check_return_code=True, ) -> "subprocess.CompletedProcess[str]": args = ["endpoint", "reconfigure", endpoint_id] @@ -1940,6 +1941,8 @@ class NeonCli(AbstractNeonCli): args.extend(["--tenant-id", str(tenant_id)]) if pageserver_id is not None: args.extend(["--pageserver-id", str(pageserver_id)]) + if safekeepers is not None: + args.extend(["--safekeepers", (",".join(map(str, safekeepers)))]) return self.raw_cli(args, check_return_code=check_return_code) def endpoint_stop( @@ -3484,6 +3487,7 @@ class Endpoint(PgProtocol, LogUtils): self.pg_port = pg_port self.http_port = http_port self.check_stop_result = check_stop_result + # passed to endpoint create and endpoint reconfigure self.active_safekeepers: List[int] = list(map(lambda sk: sk.id, env.safekeepers)) # path to conf is /endpoints//pgdata/postgresql.conf @@ -3552,6 +3556,7 @@ class Endpoint(PgProtocol, LogUtils): self, remote_ext_config: Optional[str] = None, pageserver_id: Optional[int] = None, + safekeepers: Optional[List[int]] = None, allow_multiple: bool = False, ) -> "Endpoint": """ @@ -3561,6 +3566,11 @@ class Endpoint(PgProtocol, LogUtils): assert self.endpoint_id is not None + # If `safekeepers` is not None, they are remember them as active and use + # in the following commands. + if safekeepers is not None: + self.active_safekeepers = safekeepers + log.info(f"Starting postgres endpoint {self.endpoint_id}") self.env.neon_cli.endpoint_start( @@ -3624,9 +3634,17 @@ class Endpoint(PgProtocol, LogUtils): def is_running(self): return self._running._value > 0 - def reconfigure(self, pageserver_id: Optional[int] = None): + def reconfigure( + self, pageserver_id: Optional[int] = None, safekeepers: Optional[List[int]] = None + ): assert self.endpoint_id is not None - self.env.neon_cli.endpoint_reconfigure(self.endpoint_id, self.tenant_id, pageserver_id) + # If `safekeepers` is not None, they are remember them as active and use + # in the following commands. 
+ if safekeepers is not None: + self.active_safekeepers = safekeepers + self.env.neon_cli.endpoint_reconfigure( + self.endpoint_id, self.tenant_id, pageserver_id, self.active_safekeepers + ) def respec(self, **kwargs): """Update the endpoint.json file used by control_plane.""" diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index ac1a3bef67..febfc10293 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1725,7 +1725,10 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): # Basic pull_timeline test. -def test_pull_timeline(neon_env_builder: NeonEnvBuilder): +# When live_sk_change is False, compute is restarted to change set of +# safekeepers; otherwise it is live reload. +@pytest.mark.parametrize("live_sk_change", [False, True]) +def test_pull_timeline(neon_env_builder: NeonEnvBuilder, live_sk_change: bool): neon_env_builder.auth_enabled = True def execute_payload(endpoint: Endpoint): @@ -1758,8 +1761,7 @@ def test_pull_timeline(neon_env_builder: NeonEnvBuilder): log.info("Use only first 3 safekeepers") env.safekeepers[3].stop() endpoint = env.endpoints.create("main") - endpoint.active_safekeepers = [1, 2, 3] - endpoint.start() + endpoint.start(safekeepers=[1, 2, 3]) execute_payload(endpoint) show_statuses(env.safekeepers, tenant_id, timeline_id) @@ -1771,29 +1773,22 @@ def test_pull_timeline(neon_env_builder: NeonEnvBuilder): log.info("Initialize new safekeeper 4, pull data from 1 & 3") env.safekeepers[3].start() - res = ( - env.safekeepers[3] - .http_client(auth_token=env.auth_keys.generate_safekeeper_token()) - .pull_timeline( - { - "tenant_id": str(tenant_id), - "timeline_id": str(timeline_id), - "http_hosts": [ - f"http://localhost:{env.safekeepers[0].port.http}", - f"http://localhost:{env.safekeepers[2].port.http}", - ], - } - ) + res = env.safekeepers[3].pull_timeline( + [env.safekeepers[0], env.safekeepers[2]], tenant_id, timeline_id ) log.info("Finished pulling timeline") log.info(res) show_statuses(env.safekeepers, tenant_id, timeline_id) - log.info("Restarting compute with new config to verify that it works") - endpoint.stop_and_destroy().create("main") - endpoint.active_safekeepers = [1, 3, 4] - endpoint.start() + action = "reconfiguing" if live_sk_change else "restarting" + log.info(f"{action} compute with new config to verify that it works") + new_sks = [1, 3, 4] + if not live_sk_change: + endpoint.stop_and_destroy().create("main") + endpoint.start(safekeepers=new_sks) + else: + endpoint.reconfigure(safekeepers=new_sks) execute_payload(endpoint) show_statuses(env.safekeepers, tenant_id, timeline_id) From 54a06de4b5ef7dc0ceadfe3cc553e164490e6ae4 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 27 Jun 2024 13:56:03 +0100 Subject: [PATCH 1067/1571] CI: Use `runner.arch` in cache keys along with `runner.os` (#8175) ## Problem The cache keys that we use on CI are the same for X64 and ARM64 (`runner.arch`) ## Summary of changes - Include `runner.arch` along with `runner.os` into cache keys --- .../actions/allure-report-generate/action.yml | 2 +- .../actions/run-python-test-set/action.yml | 6 ++--- .github/workflows/benchmarking.yml | 12 +++++----- .github/workflows/build_and_test.yml | 22 +++++++++---------- .github/workflows/pg_clients.yml | 4 ++-- 5 files changed, 23 insertions(+), 23 deletions(-) diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index f84beff20c..11adc8df86 
100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -183,7 +183,7 @@ runs: uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs - key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} + key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }} - name: Store Allure test stat in the DB (new) if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }} diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index d5c1fcf524..c6ea52ba88 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -56,14 +56,14 @@ runs: if: inputs.build_type != 'remote' uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-${{ inputs.build_type }}-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact path: /tmp/neon - name: Download Neon binaries for the previous release if: inputs.build_type != 'remote' uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-${{ inputs.build_type }}-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact path: /tmp/neon-previous prefix: latest @@ -89,7 +89,7 @@ runs: uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs - key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} + key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }} - name: Install Python deps shell: bash -euxo pipefail {0} diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 9eff483680..db4209500f 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -77,7 +77,7 @@ jobs: - name: Download Neon artifact uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-release-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest @@ -235,7 +235,7 @@ jobs: - name: Download Neon artifact uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-release-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest @@ -373,7 +373,7 @@ jobs: - name: Download Neon artifact uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-release-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest @@ -473,7 +473,7 @@ jobs: - name: Download Neon artifact uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-release-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest @@ -576,7 +576,7 @@ jobs: - name: Download Neon artifact uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-release-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest @@ -677,7 +677,7 @@ jobs: - name: Download Neon artifact uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-release-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 87f04996fd..9cea9f4148 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -109,7 +109,7 @@ jobs: uses: actions/cache@v4 with: path: 
~/.cache/pypoetry/virtualenvs - key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} + key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }} - name: Install Python deps run: ./scripts/pysync @@ -149,7 +149,7 @@ jobs: # !~/.cargo/registry/src # ~/.cargo/git/ # target/ -# key: v1-${{ runner.os }}-cargo-clippy-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} +# key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-clippy-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} # Some of our rust modules use FFI and need those to be checked - name: Get postgres headers @@ -291,29 +291,29 @@ jobs: # target/ # # Fall back to older versions of the key, if no cache for current Cargo.lock was found # key: | -# v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} -# v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}- +# v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} +# v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}- - name: Cache postgres v14 build id: cache_pg_14 uses: actions/cache@v4 with: path: pg_install/v14 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - name: Cache postgres v15 build id: cache_pg_15 uses: actions/cache@v4 with: path: pg_install/v15 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - name: Cache postgres v16 build id: cache_pg_16 uses: actions/cache@v4 with: path: pg_install/v16 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - name: Build postgres v14 if: steps.cache_pg_14.outputs.cache-hit != 'true' @@ -411,7 +411,7 @@ jobs: - name: Upload Neon artifact uses: ./.github/actions/upload with: - name: neon-${{ runner.os }}-${{ matrix.build_type }}-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-artifact path: /tmp/neon # XXX: keep this after the binaries.list is formed, so the coverage can properly work later @@ -490,7 +490,7 @@ jobs: uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs - key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }} - name: Install Python deps run: ./scripts/pysync @@ -639,7 +639,7 @@ jobs: - name: Get Neon artifact uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-${{ matrix.build_type }}-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-artifact path: /tmp/neon - name: Get coverage artifact @@ -1340,7 +1340,7 @@ jobs: 
# Update Neon artifact for the release (reuse already uploaded artifact) for build_type in debug release; do OLD_PREFIX=artifacts/${GITHUB_RUN_ID} - FILENAME=neon-${{ runner.os }}-${build_type}-artifact.tar.zst + FILENAME=neon-${{ runner.os }}-${{ runner.arch }}-${build_type}-artifact.tar.zst S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true) if [ -z "${S3_KEY}" ]; then diff --git a/.github/workflows/pg_clients.yml b/.github/workflows/pg_clients.yml index fef3aec754..dd09abddb8 100644 --- a/.github/workflows/pg_clients.yml +++ b/.github/workflows/pg_clients.yml @@ -41,7 +41,7 @@ jobs: uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs - key: v2-${{ runner.os }}-python-deps-ubunutu-latest-${{ hashFiles('poetry.lock') }} + key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-ubunutu-latest-${{ hashFiles('poetry.lock') }} - name: Install Python deps shell: bash -euxo pipefail {0} @@ -85,7 +85,7 @@ jobs: uses: actions/upload-artifact@v4 with: retention-days: 7 - name: python-test-pg_clients-${{ runner.os }}-stage-logs + name: python-test-pg_clients-${{ runner.os }}-${{ runner.arch }}-stage-logs path: ${{ env.TEST_OUTPUT }} - name: Post to a Slack channel From 89cf8df93bae771e92b65a510ce8ff33801437a6 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 27 Jun 2024 14:16:41 +0100 Subject: [PATCH 1068/1571] stocon: bump number of concurrent reconciles per operation (#8179) ## Problem Background node operations take a long time for loaded nodes. ## Summary of changes Increase number of concurrent reconciles an operation is allowed to spawn. This should make drain and fill operations faster and the new value is still well below the total limit of concurrent reconciles. --- storage_controller/src/background_node_operations.rs | 2 +- test_runner/regress/test_storage_controller.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/storage_controller/src/background_node_operations.rs b/storage_controller/src/background_node_operations.rs index 74b7e7c849..6f1355eb68 100644 --- a/storage_controller/src/background_node_operations.rs +++ b/storage_controller/src/background_node_operations.rs @@ -3,7 +3,7 @@ use std::{borrow::Cow, fmt::Debug, fmt::Display}; use tokio_util::sync::CancellationToken; use utils::id::NodeId; -pub(crate) const MAX_RECONCILES_PER_OPERATION: usize = 10; +pub(crate) const MAX_RECONCILES_PER_OPERATION: usize = 32; #[derive(Copy, Clone)] pub(crate) struct Drain { diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 9cc13ecfdb..139a100872 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -1636,7 +1636,7 @@ def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_configs() env.start() - tenant_count = 5 + tenant_count = 10 shard_count_per_tenant = 8 tenant_ids = [] From 66b0bf41a1c9ac140a5af8ab61a94f66e821ae8d Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 27 Jun 2024 15:58:28 +0200 Subject: [PATCH 1069/1571] fix: shutdown does not kill walredo processes (#8150) While investigating Pageserver logs from the cases where systemd hangs during shutdown (https://github.com/neondatabase/cloud/issues/11387), I noticed that even if Pageserver shuts down cleanly[^1], there are lingering walredo processes. 
[^1]: Meaning, pageserver finishes its shutdown procedure and calls `exit(0)` on its own terms, instead of hitting the systemd unit's `TimeoutSec=` limit and getting SIGKILLed. While systemd should never lock up like it does, maybe we can avoid hitting that bug by cleaning up properly. Changes ------- This PR adds a shutdown method to `WalRedoManager` and hooks it up to tenant shutdown. We keep track of intent to shut down through the new `enum ProcessOnceCell` stored inside the pre-existing `redo_process` field. A gate is added to keep track of running processes, using the new type `struct Process`. Future Work ----------- Requests that don't need the redo process will not observe the shutdown (see doc comment). Doing so would be nice for completeness' sake, but doesn't provide much benefit because `Tenant` and `Timeline` already shut down all walredo users. Testing ------- I did manual testing to confirm that the problem exists before this PR and that it's gone after. Setup: * `neon_local` with a single tenant, create some data using `pgbench` * ensure walredo process is running, note its pid * watch `strace -e kill,wait4 -f -p "$(pgrep pageserver)"` * `neon_local pageserver stop` With this PR, we always observe ``` $ strace -e kill,wait4 -f -p "$(pgrep pageserver)" ... [pid 591120] --- SIGTERM {si_signo=SIGTERM, si_code=SI_USER, si_pid=591215, si_uid=1000} --- [pid 591134] kill(591174, SIGKILL) = 0 [pid 591134] wait4(591174, [pid 591142] --- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_KILLED, si_pid=591174, si_uid=1000, si_status=SIGKILL, si_utime=0, si_stime=0} --- [pid 591134] <... wait4 resumed>[{WIFSIGNALED(s) && WTERMSIG(s) == SIGKILL}], 0, NULL) = 591174 ... +++ exited with 0 +++ ``` Before this PR, we'd usually observe just ``` ... [pid 596239] --- SIGTERM {si_signo=SIGTERM, si_code=SI_USER, si_pid=596455, si_uid=1000} --- ... +++ exited with 0 +++ ``` Refs ---- refs https://github.com/neondatabase/cloud/issues/11387 --- pageserver/benches/bench_walredo.rs | 2 + pageserver/src/tenant.rs | 19 ++- pageserver/src/walredo.rs | 190 ++++++++++++++++++++++------ 3 files changed, 169 insertions(+), 42 deletions(-) diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index 5aab10e5d9..edc09d0bf2 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -48,6 +48,7 @@ //! medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms] //!
``` +use anyhow::Context; use bytes::{Buf, Bytes}; use criterion::{BenchmarkId, Criterion}; use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager}; @@ -188,6 +189,7 @@ impl Request { manager .request_redo(*key, *lsn, base_img.clone(), records.clone(), *pg_version) .await + .context("request_redo") } fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 45e542a336..22d6804861 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -88,6 +88,7 @@ use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart; use crate::tenant::remote_timeline_client::INITDB_PATH; use crate::tenant::storage_layer::DeltaLayer; use crate::tenant::storage_layer::ImageLayer; +use crate::walredo; use crate::InitializationOrder; use std::collections::hash_map::Entry; use std::collections::BTreeSet; @@ -323,6 +324,16 @@ impl From for WalRedoManager { } impl WalRedoManager { + pub(crate) async fn shutdown(&self) { + match self { + Self::Prod(mgr) => mgr.shutdown().await, + #[cfg(test)] + Self::Test(_) => { + // Not applicable to test redo manager + } + } + } + pub(crate) fn maybe_quiesce(&self, idle_timeout: Duration) { match self { Self::Prod(mgr) => mgr.maybe_quiesce(idle_timeout), @@ -343,7 +354,7 @@ impl WalRedoManager { base_img: Option<(Lsn, bytes::Bytes)>, records: Vec<(Lsn, crate::walrecord::NeonWalRecord)>, pg_version: u32, - ) -> anyhow::Result { + ) -> Result { match self { Self::Prod(mgr) => { mgr.request_redo(key, lsn, base_img, records, pg_version) @@ -1853,6 +1864,10 @@ impl Tenant { tracing::debug!("Waiting for tasks..."); task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), None).await; + if let Some(walredo_mgr) = self.walredo_mgr.as_ref() { + walredo_mgr.shutdown().await; + } + // Wait for any in-flight operations to complete self.gate.close().await; @@ -3854,7 +3869,7 @@ pub(crate) mod harness { base_img: Option<(Lsn, Bytes)>, records: Vec<(Lsn, NeonWalRecord)>, _pg_version: u32, - ) -> anyhow::Result { + ) -> Result { let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1)); if records_neon { // For Neon wal records, we can decode without spawning postgres, so do so. diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index d562540bde..5095beefd7 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -40,6 +40,7 @@ use std::time::Duration; use std::time::Instant; use tracing::*; use utils::lsn::Lsn; +use utils::sync::gate::GateError; use utils::sync::heavier_once_cell; /// @@ -53,10 +54,18 @@ pub struct PostgresRedoManager { tenant_shard_id: TenantShardId, conf: &'static PageServerConf, last_redo_at: std::sync::Mutex>, - /// The current [`process::WalRedoProcess`] that is used by new redo requests. - /// We use [`heavier_once_cell`] for coalescing the spawning, but the redo - /// requests don't use the [`heavier_once_cell::Guard`] to keep ahold of the + /// We use [`heavier_once_cell`] for + /// + /// 1. coalescing the lazy spawning of walredo processes ([`ProcessOnceCell::Spawned`]) + /// 2. prevent new processes from being spawned on [`Self::shutdown`] (=> [`ProcessOnceCell::ManagerShutDown`]). + /// + /// # Spawning + /// + /// Redo requests use the once cell to coalesce onto one call to [`process::WalRedoProcess::launch`]. + /// + /// Notably, requests don't use the [`heavier_once_cell::Guard`] to keep ahold of the /// their process object; we use [`Arc::clone`] for that. 
+ /// /// This is primarily because earlier implementations that didn't use [`heavier_once_cell`] /// had that behavior; it's probably unnecessary. /// The only merit of it is that if one walredo process encounters an error, @@ -65,7 +74,63 @@ pub struct PostgresRedoManager { /// still be using the old redo process. But, those other tasks will most likely /// encounter an error as well, and errors are an unexpected condition anyway. /// So, probably we could get rid of the `Arc` in the future. - redo_process: heavier_once_cell::OnceCell>, + /// + /// # Shutdown + /// + /// See [`Self::launched_processes`]. + redo_process: heavier_once_cell::OnceCell, + + /// Gate that is entered when launching a walredo process and held open + /// until the process has been `kill()`ed and `wait()`ed upon. + /// + /// Manager shutdown waits for this gate to close after setting the + /// [`ProcessOnceCell::ManagerShutDown`] state in [`Self::redo_process`]. + /// + /// This type of usage is a bit unusual because gates usually keep track of + /// concurrent operations, e.g., every [`Self::request_redo`] that is inflight. + /// But we use it here to keep track of the _processes_ that we have launched, + /// which may outlive any individual redo request because + /// - we keep walredo process around until its quiesced to amortize spawn cost and + /// - the Arc may be held by multiple concurrent redo requests, so, just because + /// you replace the [`Self::redo_process`] cell's content doesn't mean the + /// process gets killed immediately. + /// + /// We could simplify this by getting rid of the [`Arc`]. + /// See the comment on [`Self::redo_process`] for more details. + launched_processes: utils::sync::gate::Gate, +} + +/// See [`PostgresRedoManager::redo_process`]. +enum ProcessOnceCell { + Spawned(Arc), + ManagerShutDown, +} + +struct Process { + _launched_processes_guard: utils::sync::gate::GateGuard, + process: process::WalRedoProcess, +} + +impl std::ops::Deref for Process { + type Target = process::WalRedoProcess; + + fn deref(&self) -> &Self::Target { + &self.process + } +} + +#[derive(Debug, thiserror::Error)] +pub enum Error { + #[error("cancelled")] + Cancelled, + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +macro_rules! bail { + ($($arg:tt)*) => { + return Err($crate::walredo::Error::Other(::anyhow::anyhow!($($arg)*))); + } } /// @@ -88,9 +153,9 @@ impl PostgresRedoManager { base_img: Option<(Lsn, Bytes)>, records: Vec<(Lsn, NeonWalRecord)>, pg_version: u32, - ) -> anyhow::Result { + ) -> Result { if records.is_empty() { - anyhow::bail!("invalid WAL redo request with no records"); + bail!("invalid WAL redo request with no records"); } let base_img_lsn = base_img.as_ref().map(|p| p.0).unwrap_or(Lsn::INVALID); @@ -148,10 +213,10 @@ impl PostgresRedoManager { chrono::Utc::now().checked_sub_signed(chrono::Duration::from_std(age).ok()?) }) }, - process: self - .redo_process - .get() - .map(|p| WalRedoManagerProcessStatus { pid: p.id() }), + process: self.redo_process.get().and_then(|p| match &*p { + ProcessOnceCell::Spawned(p) => Some(WalRedoManagerProcessStatus { pid: p.id() }), + ProcessOnceCell::ManagerShutDown => None, + }), } } } @@ -170,9 +235,39 @@ impl PostgresRedoManager { conf, last_redo_at: std::sync::Mutex::default(), redo_process: heavier_once_cell::OnceCell::default(), + launched_processes: utils::sync::gate::Gate::default(), } } + /// Shut down the WAL redo manager. 
+ /// + /// After this future completes + /// - no redo process is running + /// - no new redo process will be spawned + /// - redo requests that need walredo process will fail with [`Error::Cancelled`] + /// - [`apply_neon`]-only redo requests may still work, but this may change in the future + /// + /// # Cancel-Safety + /// + /// This method is cancellation-safe. + pub async fn shutdown(&self) { + // prevent new processes from being spawned + let permit = match self.redo_process.get_or_init_detached().await { + Ok(guard) => { + let (proc, permit) = guard.take_and_deinit(); + drop(proc); // this just drops the Arc, its refcount may not be zero yet + permit + } + Err(permit) => permit, + }; + self.redo_process + .set(ProcessOnceCell::ManagerShutDown, permit); + // wait for ongoing requests to drain and the refcounts of all Arc that + // we ever launched to drop to zero, which when it happens synchronously kill()s & wait()s + // for the underlying process. + self.launched_processes.close().await; + } + /// This type doesn't have its own background task to check for idleness: we /// rely on our owner calling this function periodically in its own housekeeping /// loops. @@ -203,38 +298,48 @@ impl PostgresRedoManager { records: &[(Lsn, NeonWalRecord)], wal_redo_timeout: Duration, pg_version: u32, - ) -> anyhow::Result { + ) -> Result { *(self.last_redo_at.lock().unwrap()) = Some(Instant::now()); let (rel, blknum) = key.to_rel_block().context("invalid record")?; const MAX_RETRY_ATTEMPTS: u32 = 1; let mut n_attempts = 0u32; loop { - let proc: Arc = - match self.redo_process.get_or_init_detached().await { - Ok(guard) => Arc::clone(&guard), - Err(permit) => { - // don't hold poison_guard, the launch code can bail - let start = Instant::now(); - let proc = Arc::new( - process::WalRedoProcess::launch( + let proc: Arc = match self.redo_process.get_or_init_detached().await { + Ok(guard) => match &*guard { + ProcessOnceCell::Spawned(proc) => Arc::clone(proc), + ProcessOnceCell::ManagerShutDown => { + return Err(Error::Cancelled); + } + }, + Err(permit) => { + let start = Instant::now(); + let proc = Arc::new(Process { + _launched_processes_guard: match self.launched_processes.enter() { + Ok(guard) => guard, + Err(GateError::GateClosed) => unreachable!( + "shutdown sets the once cell to `ManagerShutDown` state before closing the gate" + ), + }, + process: process::WalRedoProcess::launch( self.conf, self.tenant_shard_id, pg_version, ) .context("launch walredo process")?, - ); - let duration = start.elapsed(); - WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64()); - info!( - duration_ms = duration.as_millis(), - pid = proc.id(), - "launched walredo process" - ); - self.redo_process.set(Arc::clone(&proc), permit); - proc - } - }; + }); + let duration = start.elapsed(); + WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64()); + info!( + duration_ms = duration.as_millis(), + pid = proc.id(), + "launched walredo process" + ); + self.redo_process + .set(ProcessOnceCell::Spawned(Arc::clone(&proc)), permit); + proc + } + }; let started_at = std::time::Instant::now(); @@ -299,12 +404,17 @@ impl PostgresRedoManager { match self.redo_process.get() { None => (), Some(guard) => { - if Arc::ptr_eq(&proc, &*guard) { - // We're the first to observe an error from `proc`, it's our job to take it out of rotation. - guard.take_and_deinit(); - } else { - // Another task already spawned another redo process (further up in this method) - // and put it into `redo_process`. 
Do nothing, our view of the world is behind. + match &*guard { + ProcessOnceCell::ManagerShutDown => {} + ProcessOnceCell::Spawned(guard_proc) => { + if Arc::ptr_eq(&proc, guard_proc) { + // We're the first to observe an error from `proc`, it's our job to take it out of rotation. + guard.take_and_deinit(); + } else { + // Another task already spawned another redo process (further up in this method) + // and put it into `redo_process`. Do nothing, our view of the world is behind. + } + } } } } @@ -315,7 +425,7 @@ impl PostgresRedoManager { } n_attempts += 1; if n_attempts > MAX_RETRY_ATTEMPTS || result.is_ok() { - return result; + return result.map_err(Error::Other); } } } @@ -329,7 +439,7 @@ impl PostgresRedoManager { lsn: Lsn, base_img: Option, records: &[(Lsn, NeonWalRecord)], - ) -> anyhow::Result { + ) -> Result { let start_time = Instant::now(); let mut page = BytesMut::new(); @@ -338,7 +448,7 @@ impl PostgresRedoManager { page.extend_from_slice(&fpi[..]); } else { // All the current WAL record types that we can handle require a base image. - anyhow::bail!("invalid neon WAL redo request with no base image"); + bail!("invalid neon WAL redo request with no base image"); } // Apply all the WAL records in the batch From 23827c6b0d400cbb9a972d4d05d49834816c40d1 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Thu, 27 Jun 2024 12:03:48 -0400 Subject: [PATCH 1070/1571] feat(pageserver): add delta layer iterator (#8064) part of https://github.com/neondatabase/neon/issues/8002 ## Summary of changes Add delta layer iterator and tests. --------- Signed-off-by: Alex Chi Z --- .../src/tenant/storage_layer/delta_layer.rs | 197 ++++++++++++++++++ pageserver/src/tenant/timeline.rs | 8 +- 2 files changed, 201 insertions(+), 4 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index bf5d9249eb..c2d4a2776b 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -1492,6 +1492,24 @@ impl DeltaLayerInner { ); offset } + + #[cfg(test)] + pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> DeltaLayerIterator<'a> { + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let tree_reader = + DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader); + DeltaLayerIterator { + delta_layer: self, + ctx, + index_iter: tree_reader.iter(&[0; DELTA_KEY_SIZE], ctx), + key_values_batch: std::collections::VecDeque::new(), + is_end: false, + planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner::new( + 1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer. + 1024, // The default value. Unit tests might use a different value + ), + } + } } /// A set of data associated with a delta layer key and its value @@ -1551,6 +1569,70 @@ impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for Del } } +#[cfg(test)] +pub struct DeltaLayerIterator<'a> { + delta_layer: &'a DeltaLayerInner, + ctx: &'a RequestContext, + planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner, + index_iter: crate::tenant::disk_btree::DiskBtreeIterator<'a>, + key_values_batch: std::collections::VecDeque<(Key, Lsn, Value)>, + is_end: bool, +} + +#[cfg(test)] +impl<'a> DeltaLayerIterator<'a> { + /// Retrieve a batch of key-value pairs into the iterator buffer. 
+ async fn next_batch(&mut self) -> anyhow::Result<()> { + assert!(self.key_values_batch.is_empty()); + assert!(!self.is_end); + + let plan = loop { + if let Some(res) = self.index_iter.next().await { + let (raw_key, value) = res?; + let key = Key::from_slice(&raw_key[..KEY_SIZE]); + let lsn = DeltaKey::extract_lsn_from_buf(&raw_key); + let blob_ref = BlobRef(value); + let offset = blob_ref.pos(); + if let Some(batch_plan) = self.planner.handle(key, lsn, offset, BlobFlag::None) { + break batch_plan; + } + } else { + self.is_end = true; + let data_end_offset = self.delta_layer.index_start_offset(); + break self.planner.handle_range_end(data_end_offset); + } + }; + let vectored_blob_reader = VectoredBlobReader::new(&self.delta_layer.file); + let mut next_batch = std::collections::VecDeque::new(); + let buf_size = plan.size(); + let buf = BytesMut::with_capacity(buf_size); + let blobs_buf = vectored_blob_reader + .read_blobs(&plan, buf, self.ctx) + .await?; + let frozen_buf = blobs_buf.buf.freeze(); + for meta in blobs_buf.blobs.iter() { + let value = Value::des(&frozen_buf[meta.start..meta.end])?; + next_batch.push_back((meta.meta.key, meta.meta.lsn, value)); + } + self.key_values_batch = next_batch; + Ok(()) + } + + pub async fn next(&mut self) -> anyhow::Result> { + if self.key_values_batch.is_empty() { + if self.is_end { + return Ok(None); + } + self.next_batch().await?; + } + Ok(Some( + self.key_values_batch + .pop_front() + .expect("should not be empty"), + )) + } +} + #[cfg(test)] mod test { use std::collections::BTreeMap; @@ -1560,6 +1642,9 @@ mod test { use rand::RngCore; use super::*; + use crate::tenant::harness::TIMELINE_ID; + use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner; + use crate::tenant::Tenant; use crate::{ context::DownloadBehavior, task_mgr::TaskKind, @@ -2126,4 +2211,116 @@ mod test { assert_eq!(utils::Hex(&scratch_left), utils::Hex(&scratch_right)); } } + + async fn produce_delta_layer( + tenant: &Tenant, + tline: &Arc, + mut deltas: Vec<(Key, Lsn, Value)>, + ctx: &RequestContext, + ) -> anyhow::Result { + deltas.sort_by(|(k1, l1, _), (k2, l2, _)| (k1, l1).cmp(&(k2, l2))); + let (key_start, _, _) = deltas.first().unwrap(); + let (key_max, _, _) = deltas.first().unwrap(); + let lsn_min = deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap(); + let lsn_max = deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap(); + let lsn_end = Lsn(lsn_max.0 + 1); + let mut writer = DeltaLayerWriter::new( + tenant.conf, + tline.timeline_id, + tenant.tenant_shard_id, + *key_start, + (*lsn_min)..lsn_end, + ctx, + ) + .await?; + let key_end = key_max.next(); + + for (key, lsn, value) in deltas { + writer.put_value(key, lsn, value, ctx).await?; + } + let delta_layer = writer.finish(key_end, tline, ctx).await?; + + Ok::<_, anyhow::Error>(delta_layer) + } + + async fn assert_delta_iter_equal( + delta_iter: &mut DeltaLayerIterator<'_>, + expect: &[(Key, Lsn, Value)], + ) { + let mut expect_iter = expect.iter(); + loop { + let o1 = delta_iter.next().await.unwrap(); + let o2 = expect_iter.next(); + assert_eq!(o1.is_some(), o2.is_some()); + if o1.is_none() && o2.is_none() { + break; + } + let (k1, l1, v1) = o1.unwrap(); + let (k2, l2, v2) = o2.unwrap(); + assert_eq!(&k1, k2); + assert_eq!(l1, *l2); + assert_eq!(&v1, v2); + } + } + + #[tokio::test] + async fn delta_layer_iterator() { + use crate::repository::Value; + use bytes::Bytes; + + let harness = TenantHarness::create("delta_layer_iterator").unwrap(); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + 
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + fn get_key(id: u32) -> Key { + let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + const N: usize = 1000; + let test_deltas = (0..N) + .map(|idx| { + ( + get_key(idx as u32 / 10), + Lsn(0x10 * ((idx as u64) % 10 + 1)), + Value::Image(Bytes::from(format!("img{idx:05}"))), + ) + }) + .collect_vec(); + let resident_layer = produce_delta_layer(&tenant, &tline, test_deltas.clone(), &ctx) + .await + .unwrap(); + let delta_layer = resident_layer.get_as_delta(&ctx).await.unwrap(); + for max_read_size in [1, 1024] { + for batch_size in [1, 2, 4, 8, 3, 7, 13] { + println!("running with batch_size={batch_size} max_read_size={max_read_size}"); + // Test if the batch size is correctly determined + let mut iter = delta_layer.iter(&ctx); + iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size); + let mut num_items = 0; + for _ in 0..3 { + iter.next_batch().await.unwrap(); + num_items += iter.key_values_batch.len(); + if max_read_size == 1 { + // every key should be a batch b/c the value is larger than max_read_size + assert_eq!(iter.key_values_batch.len(), 1); + } else { + assert_eq!(iter.key_values_batch.len(), batch_size); + } + if num_items >= N { + break; + } + iter.key_values_batch.clear(); + } + // Test if the result is correct + let mut iter = delta_layer.iter(&ctx); + iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size); + assert_delta_iter_equal(&mut iter, &test_deltas).await; + } + } + } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 1175b75017..8dd0a23f46 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -5481,12 +5481,12 @@ impl Timeline { } images.sort_unstable_by(|(ka, _), (kb, _)| ka.cmp(kb)); let min_key = *images.first().map(|(k, _)| k).unwrap(); - let max_key = images.last().map(|(k, _)| k).unwrap().next(); + let end_key = images.last().map(|(k, _)| k).unwrap().next(); let mut image_layer_writer = ImageLayerWriter::new( self.conf, self.timeline_id, self.tenant_shard_id, - &(min_key..max_key), + &(min_key..end_key), lsn, ctx, ) @@ -5518,7 +5518,7 @@ impl Timeline { let last_record_lsn = self.get_last_record_lsn(); deltas.sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb))); let min_key = *deltas.first().map(|(k, _, _)| k).unwrap(); - let max_key = deltas.last().map(|(k, _, _)| k).unwrap().next(); + let end_key = deltas.last().map(|(k, _, _)| k).unwrap().next(); let min_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap(); let max_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap(); assert!( @@ -5541,7 +5541,7 @@ impl Timeline { for (key, lsn, val) in deltas { delta_layer_writer.put_value(key, lsn, val, ctx).await?; } - let delta_layer = delta_layer_writer.finish(max_key, self, ctx).await?; + let delta_layer = delta_layer_writer.finish(end_key, self, ctx).await?; { let mut guard = self.layers.write().await; From 1d66ca79a9d50ae6423349c6c184c43c78a27113 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 27 Jun 2024 18:39:43 +0100 Subject: [PATCH 1071/1571] Improve slow operations observability in safekeepers (#8188) After https://github.com/neondatabase/neon/pull/8022 was deployed to staging, I noticed many cases of timeouts. After inspecting the logs, I realized that some operations are taking ~20 seconds and they're doing while holding shared state lock. 
Usually it happens right after redeploy, because compute reconnections put high load on disks. This commit tries to improve observability around slow operations. Non-observability changes: - `TimelineState::finish_change` now skips update if nothing has changed - `wal_residence_guard()` timeout is set to 30s --- libs/metrics/src/lib.rs | 7 ++++--- safekeeper/src/metrics.rs | 32 ++++++++++++++++++++++-------- safekeeper/src/safekeeper.rs | 5 +++++ safekeeper/src/state.rs | 7 ++++++- safekeeper/src/timeline.rs | 32 ++++++++++++++++++++---------- safekeeper/src/timeline_manager.rs | 6 +++++- safekeeper/src/wal_storage.rs | 16 ++++++++++++++- 7 files changed, 81 insertions(+), 24 deletions(-) diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 141d8a6d01..0ff8ec8be3 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -103,9 +103,10 @@ static MAXRSS_KB: Lazy = Lazy::new(|| { .expect("Failed to register maxrss_kb int gauge") }); -pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[ - 0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5, -]; +/// Most common fsync latency is 50 µs - 100 µs, but it can be much higher, +/// especially during many concurrent disk operations. +pub const DISK_FSYNC_SECONDS_BUCKETS: &[f64] = + &[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 30.0]; pub struct BuildInfo { pub revision: &'static str, diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 1e965393e3..a484c45af8 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -5,15 +5,15 @@ use std::{ time::{Instant, SystemTime}, }; -use ::metrics::{register_histogram, GaugeVec, Histogram, IntGauge, DISK_WRITE_SECONDS_BUCKETS}; +use ::metrics::{register_histogram, GaugeVec, Histogram, IntGauge, DISK_FSYNC_SECONDS_BUCKETS}; use anyhow::Result; use futures::Future; use metrics::{ core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts}, proto::MetricFamily, - register_int_counter, register_int_counter_pair, register_int_counter_pair_vec, - register_int_counter_vec, Gauge, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec, - IntGaugeVec, + register_histogram_vec, register_int_counter, register_int_counter_pair, + register_int_counter_pair_vec, register_int_counter_vec, Gauge, HistogramVec, IntCounter, + IntCounterPair, IntCounterPairVec, IntCounterVec, IntGaugeVec, }; use once_cell::sync::Lazy; @@ -48,7 +48,7 @@ pub static WRITE_WAL_SECONDS: Lazy = Lazy::new(|| { register_histogram!( "safekeeper_write_wal_seconds", "Seconds spent writing and syncing WAL to a disk in a single request", - DISK_WRITE_SECONDS_BUCKETS.to_vec() + DISK_FSYNC_SECONDS_BUCKETS.to_vec() ) .expect("Failed to register safekeeper_write_wal_seconds histogram") }); @@ -56,7 +56,7 @@ pub static FLUSH_WAL_SECONDS: Lazy = Lazy::new(|| { register_histogram!( "safekeeper_flush_wal_seconds", "Seconds spent syncing WAL to a disk", - DISK_WRITE_SECONDS_BUCKETS.to_vec() + DISK_FSYNC_SECONDS_BUCKETS.to_vec() ) .expect("Failed to register safekeeper_flush_wal_seconds histogram") }); @@ -64,10 +64,26 @@ pub static PERSIST_CONTROL_FILE_SECONDS: Lazy = Lazy::new(|| { register_histogram!( "safekeeper_persist_control_file_seconds", "Seconds to persist and sync control file", - DISK_WRITE_SECONDS_BUCKETS.to_vec() + DISK_FSYNC_SECONDS_BUCKETS.to_vec() ) .expect("Failed to register safekeeper_persist_control_file_seconds histogram vec") }); +pub static WAL_STORAGE_OPERATION_SECONDS: Lazy = Lazy::new(|| { + register_histogram_vec!( + 
"safekeeper_wal_storage_operation_seconds", + "Seconds spent on WAL storage operations", + &["operation"] + ) + .expect("Failed to register safekeeper_wal_storage_operation_seconds histogram vec") +}); +pub static MISC_OPERATION_SECONDS: Lazy = Lazy::new(|| { + register_histogram_vec!( + "safekeeper_misc_operation_seconds", + "Seconds spent on miscellaneous operations", + &["operation"] + ) + .expect("Failed to register safekeeper_misc_operation_seconds histogram vec") +}); pub static PG_IO_BYTES: Lazy = Lazy::new(|| { register_int_counter_vec!( "safekeeper_pg_io_bytes_total", @@ -126,7 +142,7 @@ pub static BROKER_PUSH_ALL_UPDATES_SECONDS: Lazy = Lazy::new(|| { register_histogram!( "safekeeper_broker_push_update_seconds", "Seconds to push all timeline updates to the broker", - DISK_WRITE_SECONDS_BUCKETS.to_vec() + DISK_FSYNC_SECONDS_BUCKETS.to_vec() ) .expect("Failed to register safekeeper_broker_push_update_seconds histogram vec") }); diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 4d0992e8bd..33ec39b852 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -15,6 +15,7 @@ use storage_broker::proto::SafekeeperTimelineInfo; use tracing::*; use crate::control_file; +use crate::metrics::MISC_OPERATION_SECONDS; use crate::send_wal::HotStandbyFeedback; use crate::state::TimelineState; @@ -696,6 +697,10 @@ where &mut self, msg: &ProposerElected, ) -> Result> { + let _timer = MISC_OPERATION_SECONDS + .with_label_values(&["handle_elected"]) + .start_timer(); + info!("received ProposerElected {:?}", msg); if self.state.acceptor_state.term < msg.term { let mut state = self.state.start_change(); diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs index e0f7b65aef..dca6414082 100644 --- a/safekeeper/src/state.rs +++ b/safekeeper/src/state.rs @@ -189,7 +189,12 @@ where /// Persist given state. c.f. start_change. pub async fn finish_change(&mut self, s: &TimelinePersistentState) -> Result<()> { - self.pers.persist(s).await?; + if s.eq(&*self.pers) { + // nothing to do if state didn't change + } else { + self.pers.persist(s).await?; + } + // keep in memory values up to date self.inmem.commit_lsn = s.commit_lsn; self.inmem.backup_lsn = s.backup_lsn; diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index f632cd6fb3..6b83270c18 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -39,7 +39,7 @@ use crate::wal_backup::{self}; use crate::wal_backup_partial::PartialRemoteSegment; use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION}; -use crate::metrics::{FullTimelineInfo, WalStorageMetrics}; +use crate::metrics::{FullTimelineInfo, WalStorageMetrics, MISC_OPERATION_SECONDS}; use crate::wal_storage::{Storage as wal_storage_iface, WalReader}; use crate::{debug_dump, timeline_manager, wal_storage}; use crate::{GlobalTimelines, SafeKeeperConf}; @@ -856,28 +856,40 @@ impl Timeline { } debug!("requesting WalResidentTimeline guard"); + let started_at = Instant::now(); + let status_before = self.mgr_status.get(); - // Wait 5 seconds for the guard to be acquired, should be enough for uneviction. - // If it times out, most likely there is a deadlock in the manager task. - let res = tokio::time::timeout( - Duration::from_secs(5), + // Wait 30 seconds for the guard to be acquired. It can time out if someone is + // holding the lock (e.g. during `SafeKeeper::process_msg()`) or manager task + // is stuck. 
+ let res = tokio::time::timeout_at( + started_at + Duration::from_secs(30), self.manager_ctl.wal_residence_guard(), ) .await; let guard = match res { - Ok(Ok(guard)) => guard, + Ok(Ok(guard)) => { + let finished_at = Instant::now(); + let elapsed = finished_at - started_at; + MISC_OPERATION_SECONDS + .with_label_values(&["wal_residence_guard"]) + .observe(elapsed.as_secs_f64()); + + guard + } Ok(Err(e)) => { warn!( - "error while acquiring WalResidentTimeline guard (current state {:?}): {}", - self.mgr_status.get(), - e + "error while acquiring WalResidentTimeline guard, statuses {:?} => {:?}", + status_before, + self.mgr_status.get() ); return Err(e); } Err(_) => { warn!( - "timeout while acquiring WalResidentTimeline guard (current state {:?})", + "timeout while acquiring WalResidentTimeline guard, statuses {:?} => {:?}", + status_before, self.mgr_status.get() ); anyhow::bail!("timeout while acquiring WalResidentTimeline guard"); diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index c3abeac644..66c62ce197 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -22,7 +22,7 @@ use utils::lsn::Lsn; use crate::{ control_file::{FileStorage, Storage}, - metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL}, + metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL, MISC_OPERATION_SECONDS}, recovery::recovery_main, remove_wal::calc_horizon_lsn, safekeeper::Term, @@ -357,6 +357,10 @@ impl Manager { /// Get a snapshot of the timeline state. async fn state_snapshot(&self) -> StateSnapshot { + let _timer = MISC_OPERATION_SECONDS + .with_label_values(&["state_snapshot"]) + .start_timer(); + StateSnapshot::new( self.tli.read_shared_state().await, self.conf.heartbeat_timeout, diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 74c4693ccd..ded8571a3e 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -23,7 +23,9 @@ use tokio::io::{AsyncReadExt, AsyncSeekExt}; use tracing::*; use utils::crashsafe::durable_rename; -use crate::metrics::{time_io_closure, WalStorageMetrics, REMOVED_WAL_SEGMENTS}; +use crate::metrics::{ + time_io_closure, WalStorageMetrics, REMOVED_WAL_SEGMENTS, WAL_STORAGE_OPERATION_SECONDS, +}; use crate::state::TimelinePersistentState; use crate::wal_backup::{read_object, remote_timeline_path}; use crate::SafeKeeperConf; @@ -331,6 +333,10 @@ impl Storage for PhysicalStorage { } async fn initialize_first_segment(&mut self, init_lsn: Lsn) -> Result<()> { + let _timer = WAL_STORAGE_OPERATION_SECONDS + .with_label_values(&["initialize_first_segment"]) + .start_timer(); + let segno = init_lsn.segment_number(self.wal_seg_size); let (mut file, _) = self.open_or_create(segno).await?; let major_pg_version = self.pg_version / 10000; @@ -422,6 +428,10 @@ impl Storage for PhysicalStorage { /// Truncate written WAL by removing all WAL segments after the given LSN. /// end_pos must point to the end of the WAL record. 
async fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> { + let _timer = WAL_STORAGE_OPERATION_SECONDS + .with_label_values(&["truncate_wal"]) + .start_timer(); + // Streaming must not create a hole, so truncate cannot be called on non-written lsn if self.write_lsn != Lsn(0) && end_pos > self.write_lsn { bail!( @@ -497,6 +507,10 @@ async fn remove_segments_from_disk( wal_seg_size: usize, remove_predicate: impl Fn(XLogSegNo) -> bool, ) -> Result<()> { + let _timer = WAL_STORAGE_OPERATION_SECONDS + .with_label_values(&["remove_segments_from_disk"]) + .start_timer(); + let mut n_removed = 0; let mut min_removed = u64::MAX; let mut max_removed = u64::MIN; From 5700233a47ffc2fb040d862976873e273ae180a7 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Thu, 27 Jun 2024 10:27:56 -0500 Subject: [PATCH 1072/1571] Add application_name to compute activity monitor connection string This was missed in my previous attempt to mark every connection string with an application name. See 0c3e3a8667294a3dc345b0f03364aa359a5154de. --- compute_tools/src/monitor.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index 872a3f7750..d7127aac32 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -17,7 +17,11 @@ const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500); // should be handled gracefully. fn watch_compute_activity(compute: &ComputeNode) { // Suppose that `connstr` doesn't change - let connstr = compute.connstr.as_str(); + let mut connstr = compute.connstr.clone(); + connstr + .query_pairs_mut() + .append_pair("application_name", "compute_activity_monitor"); + let connstr = connstr.as_str(); // During startup and configuration we connect to every Postgres database, // but we don't want to count this as some user activity. So wait until From 063553a51b2b866c97fff6a818d2c35d8a9ee13c Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 28 Jun 2024 09:14:19 +0100 Subject: [PATCH 1073/1571] pageserver: remove tenant create API (#8135) ## Problem For some time, we have created tenants with calls to location_conf. The legacy "POST /v1/tenant" path was only used in some tests. 
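For context, the replacement path (already used by the control plane and storage controller, and by the rewritten `NeonPageserver.tenant_create` test helper in this patch) looks roughly like the sketch below; the client method and field names are taken from the fixture diff further down, while the surrounding variable names are illustrative:

```python
# Sketch: creating a tenant via the location_config API, mirroring the
# updated NeonPageserver.tenant_create helper in this patch.
client = pageserver.http_client(auth_token=auth_token)
client.tenant_location_conf(
    tenant_id,
    {
        "mode": "AttachedSingle",
        # generation is issued by the storage controller's attach hook
        "generation": generation,
        "tenant_conf": conf or {},
        "secondary_conf": None,
    },
)
```
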
## Summary of changes - Remove the API - Relocate TenantCreateRequest to the controller API file (this used to be used in both pageserver and controller APIs) - Rewrite tenant_create test helper to use location_config API, as control plane and storage controller do - Update docker-compose test script to create tenants with location_config API (this small commit is also present in https://github.com/neondatabase/neon/pull/7947) --- control_plane/src/bin/neon_local.rs | 6 +- control_plane/src/pageserver.rs | 25 +----- control_plane/src/storage_controller.rs | 7 +- control_plane/storcon_cli/src/main.rs | 26 ++++--- libs/pageserver_api/src/controller_api.rs | 36 +++++++++ libs/pageserver_api/src/models.rs | 39 ---------- pageserver/client/src/mgmt_api.rs | 9 --- pageserver/src/http/routes.rs | 76 +------------------ pageserver/src/metrics.rs | 3 - pageserver/src/tenant.rs | 25 ++---- storage_controller/src/http.rs | 3 +- storage_controller/src/service.rs | 15 ++-- test_runner/fixtures/neon_fixtures.py | 14 +++- test_runner/fixtures/pageserver/http.py | 28 ------- test_runner/regress/test_pageserver_api.py | 6 +- .../regress/test_storage_controller.py | 2 +- 16 files changed, 91 insertions(+), 229 deletions(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 2c05938f44..4bf1b29785 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -21,10 +21,8 @@ use pageserver_api::config::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT, DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT, }; -use pageserver_api::controller_api::PlacementPolicy; -use pageserver_api::models::{ - ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo, -}; +use pageserver_api::controller_api::{PlacementPolicy, TenantCreateRequest}; +use pageserver_api::models::{ShardParameters, TimelineCreateRequest, TimelineInfo}; use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId}; use postgres_backend::AuthType; use postgres_connection::parse_host_port; diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index da4b987849..983f78577c 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -17,8 +17,7 @@ use anyhow::{bail, Context}; use camino::Utf8PathBuf; use futures::SinkExt; use pageserver_api::models::{ - self, AuxFilePolicy, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, - TimelineInfo, + self, AuxFilePolicy, LocationConfig, TenantHistorySize, TenantInfo, TimelineInfo, }; use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api; @@ -397,28 +396,6 @@ impl PageServerNode { } } - pub async fn tenant_create( - &self, - new_tenant_id: TenantId, - generation: Option, - settings: HashMap<&str, &str>, - ) -> anyhow::Result { - let config = Self::parse_config(settings.clone())?; - - let request = models::TenantCreateRequest { - new_tenant_id: TenantShardId::unsharded(new_tenant_id), - generation, - config, - shard_parameters: ShardParameters::default(), - // Placement policy is not meaningful for creations not done via storage controller - placement_policy: None, - }; - if !settings.is_empty() { - bail!("Unrecognized tenant settings: {settings:?}") - } - Ok(self.http_client.tenant_create(&request).await?) 
- } - pub async fn tenant_config( &self, tenant_id: TenantId, diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 1c56d5f80f..5ca1b13b2a 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -5,12 +5,11 @@ use crate::{ use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::{ controller_api::{ - NodeConfigureRequest, NodeRegisterRequest, TenantCreateResponse, TenantLocateResponse, - TenantShardMigrateRequest, TenantShardMigrateResponse, + NodeConfigureRequest, NodeRegisterRequest, TenantCreateRequest, TenantCreateResponse, + TenantLocateResponse, TenantShardMigrateRequest, TenantShardMigrateResponse, }, models::{ - TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse, - TimelineCreateRequest, TimelineInfo, + TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo, }, shard::{ShardStripeSize, TenantShardId}, }; diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 775aedb600..b2c5dfe58a 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -4,13 +4,13 @@ use std::{str::FromStr, time::Duration}; use clap::{Parser, Subcommand}; use pageserver_api::{ controller_api::{ - NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy, + NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest, }, models::{ EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary, - ShardParameters, TenantConfig, TenantConfigRequest, TenantCreateRequest, - TenantShardSplitRequest, TenantShardSplitResponse, + ShardParameters, TenantConfig, TenantConfigRequest, TenantShardSplitRequest, + TenantShardSplitResponse, }, shard::{ShardStripeSize, TenantShardId}, }; @@ -336,14 +336,18 @@ async fn main() -> anyhow::Result<()> { .await?; } Command::TenantCreate { tenant_id } => { - vps_client - .tenant_create(&TenantCreateRequest { - new_tenant_id: TenantShardId::unsharded(tenant_id), - generation: None, - shard_parameters: ShardParameters::default(), - placement_policy: Some(PlacementPolicy::Attached(1)), - config: TenantConfig::default(), - }) + storcon_client + .dispatch( + Method::POST, + "v1/tenant".to_string(), + Some(TenantCreateRequest { + new_tenant_id: TenantShardId::unsharded(tenant_id), + generation: None, + shard_parameters: ShardParameters::default(), + placement_policy: Some(PlacementPolicy::Attached(1)), + config: TenantConfig::default(), + }), + ) .await?; } Command::TenantDelete { tenant_id } => { diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index a0d10dc665..f05c1315ea 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -11,6 +11,27 @@ use crate::{ shard::{ShardStripeSize, TenantShardId}, }; +#[derive(Serialize, Deserialize, Debug)] +#[serde(deny_unknown_fields)] +pub struct TenantCreateRequest { + pub new_tenant_id: TenantShardId, + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub generation: Option, + + // If omitted, create a single shard with TenantShardId::unsharded() + #[serde(default)] + #[serde(skip_serializing_if = "ShardParameters::is_unsharded")] + pub shard_parameters: ShardParameters, + + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub placement_policy: Option, + + #[serde(flatten)] + pub config: 
TenantConfig, // as we have a flattened field, we should reject all unknown fields in it +} + #[derive(Serialize, Deserialize)] pub struct TenantCreateResponseShard { pub shard_id: TenantShardId, @@ -280,4 +301,19 @@ mod test { assert_eq!(serde_json::from_str::(&encoded)?, v); Ok(()) } + + #[test] + fn test_reject_unknown_field() { + let id = TenantId::generate(); + let create_request = serde_json::json!({ + "new_tenant_id": id.to_string(), + "unknown_field": "unknown_value".to_string(), + }); + let err = serde_json::from_value::(create_request).unwrap_err(); + assert!( + err.to_string().contains("unknown field `unknown_field`"), + "expect unknown field `unknown_field` error, got: {}", + err + ); + } } diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 4875f49495..61a255cdbc 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -25,7 +25,6 @@ use utils::{ serde_system_time, }; -use crate::controller_api::PlacementPolicy; use crate::{ reltag::RelTag, shard::{ShardCount, ShardStripeSize, TenantShardId}, @@ -271,28 +270,6 @@ impl Default for ShardParameters { } } -#[derive(Serialize, Deserialize, Debug)] -#[serde(deny_unknown_fields)] -pub struct TenantCreateRequest { - pub new_tenant_id: TenantShardId, - #[serde(default)] - #[serde(skip_serializing_if = "Option::is_none")] - pub generation: Option, - - // If omitted, create a single shard with TenantShardId::unsharded() - #[serde(default)] - #[serde(skip_serializing_if = "ShardParameters::is_unsharded")] - pub shard_parameters: ShardParameters, - - // This parameter is only meaningful in requests sent to the storage controller - #[serde(default)] - #[serde(skip_serializing_if = "Option::is_none")] - pub placement_policy: Option, - - #[serde(flatten)] - pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it -} - /// An alternative representation of `pageserver::tenant::TenantConf` with /// simpler types. #[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)] @@ -547,10 +524,6 @@ pub struct LocationConfigListResponse { pub tenant_shards: Vec<(TenantShardId, Option)>, } -#[derive(Serialize, Deserialize)] -#[serde(transparent)] -pub struct TenantCreateResponse(pub TenantId); - #[derive(Serialize)] pub struct StatusResponse { pub id: NodeId, @@ -1507,18 +1480,6 @@ mod tests { #[test] fn test_reject_unknown_field() { - let id = TenantId::generate(); - let create_request = json!({ - "new_tenant_id": id.to_string(), - "unknown_field": "unknown_value".to_string(), - }); - let err = serde_json::from_value::(create_request).unwrap_err(); - assert!( - err.to_string().contains("unknown field `unknown_field`"), - "expect unknown field `unknown_field` error, got: {}", - err - ); - let id = TenantId::generate(); let config_request = json!({ "tenant_id": id.to_string(), diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 69b86d9c46..48b27775cb 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -205,15 +205,6 @@ impl Client { Ok(()) } - pub async fn tenant_create(&self, req: &TenantCreateRequest) -> Result { - let uri = format!("{}/v1/tenant", self.mgmt_api_endpoint); - self.request(Method::POST, &uri, req) - .await? - .json() - .await - .map_err(Error::ReceiveBody) - } - /// The tenant deletion API can return 202 if deletion is incomplete, or /// 404 if it is complete. Callers are responsible for checking the status /// code and retrying. 
Error codes other than 404 will return Err(). diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 5ebd34a406..1fda2eaa85 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -53,7 +53,6 @@ use utils::http::request::{get_request_param, must_get_query_param, parse_query_ use crate::context::{DownloadBehavior, RequestContext}; use crate::deletion_queue::DeletionQueueClient; -use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL}; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; use crate::tenant::config::{LocationConf, TenantConfOpt}; @@ -75,13 +74,12 @@ use crate::tenant::timeline::CompactFlags; use crate::tenant::timeline::CompactionError; use crate::tenant::timeline::Timeline; use crate::tenant::GetTimelineError; -use crate::tenant::SpawnMode; use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError}; use crate::{config::PageServerConf, tenant::mgr}; use crate::{disk_usage_eviction_task, tenant}; use pageserver_api::models::{ - StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo, - TimelineCreateRequest, TimelineGcRequest, TimelineInfo, + StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest, + TimelineInfo, }; use utils::{ auth::SwappableJwtAuth, @@ -1237,75 +1235,6 @@ pub fn html_response(status: StatusCode, data: String) -> Result, Ok(response) } -/// Helper for requests that may take a generation, which is mandatory -/// when control_plane_api is set, but otherwise defaults to Generation::none() -fn get_request_generation(state: &State, req_gen: Option) -> Result { - if state.conf.control_plane_api.is_some() { - req_gen - .map(Generation::new) - .ok_or(ApiError::BadRequest(anyhow!( - "generation attribute missing" - ))) - } else { - // Legacy mode: all tenants operate with no generation - Ok(Generation::none()) - } -} - -async fn tenant_create_handler( - mut request: Request, - _cancel: CancellationToken, -) -> Result, ApiError> { - let request_data: TenantCreateRequest = json_request(&mut request).await?; - let target_tenant_id = request_data.new_tenant_id; - check_permission(&request, None)?; - - let _timer = STORAGE_TIME_GLOBAL - .get_metric_with_label_values(&[StorageTimeOperation::CreateTenant.into()]) - .expect("bug") - .start_timer(); - - let tenant_conf = - TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?; - - let state = get_state(&request); - - let generation = get_request_generation(state, request_data.generation)?; - - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); - - let location_conf = - LocationConf::attached_single(tenant_conf, generation, &request_data.shard_parameters); - - let new_tenant = state - .tenant_manager - .upsert_location( - target_tenant_id, - location_conf, - None, - SpawnMode::Create, - &ctx, - ) - .await?; - - let Some(new_tenant) = new_tenant else { - // This should never happen: indicates a bug in upsert_location - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "Upsert succeeded but didn't return tenant!" - ))); - }; - // We created the tenant. Existing API semantics are that the tenant - // is Active when this function returns. 
- new_tenant - .wait_to_become_active(ACTIVE_TENANT_TIMEOUT) - .await?; - - json_response( - StatusCode::CREATED, - TenantCreateResponse(new_tenant.tenant_shard_id().tenant_id), - ) -} - async fn get_tenant_config_handler( request: Request, _cancel: CancellationToken, @@ -2611,7 +2540,6 @@ pub fn make_router( api_handler(r, reload_auth_validation_keys_handler) }) .get("/v1/tenant", |r| api_handler(r, tenant_list_handler)) - .post("/v1/tenant", |r| api_handler(r, tenant_create_handler)) .get("/v1/tenant/:tenant_shard_id", |r| { api_handler(r, tenant_status) }) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index ca697afcf6..f5aca6dfb3 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -53,9 +53,6 @@ pub(crate) enum StorageTimeOperation { #[strum(serialize = "find gc cutoffs")] FindGcCutoffs, - - #[strum(serialize = "create tenant")] - CreateTenant, } pub(crate) static STORAGE_TIME_SUM_PER_TIMELINE: Lazy = Lazy::new(|| { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 22d6804861..92d9c5b143 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -213,8 +213,6 @@ pub(crate) enum SpawnMode { Eager, /// Lazy activation in the background, with the option to skip the queue if the need comes up Lazy, - /// Tenant has been created during the lifetime of this process - Create, } /// @@ -808,9 +806,6 @@ impl Tenant { }; let preload = match &mode { - SpawnMode::Create => { - None - }, SpawnMode::Eager | SpawnMode::Lazy => { let _preload_timer = TENANT.preload.start_timer(); let res = tenant_clone @@ -832,11 +827,8 @@ impl Tenant { // We will time the duration of the attach phase unless this is a creation (attach will do no work) let attached = { - let _attach_timer = match mode { - SpawnMode::Create => None, - SpawnMode::Eager | SpawnMode::Lazy => Some(TENANT.attach.start_timer()), - }; - tenant_clone.attach(preload, mode, &ctx).await + let _attach_timer = Some(TENANT.attach.start_timer()); + tenant_clone.attach(preload, &ctx).await }; match attached { @@ -912,21 +904,14 @@ impl Tenant { async fn attach( self: &Arc, preload: Option, - mode: SpawnMode, ctx: &RequestContext, ) -> anyhow::Result<()> { span::debug_assert_current_span_has_tenant_id(); failpoint_support::sleep_millis_async!("before-attaching-tenant"); - let preload = match (preload, mode) { - (Some(p), _) => p, - (None, SpawnMode::Create) => TenantPreload { - timelines: HashMap::new(), - }, - (None, _) => { - anyhow::bail!("local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624"); - } + let Some(preload) = preload else { + anyhow::bail!("local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624"); }; let mut timelines_to_resume_deletions = vec![]; @@ -3841,7 +3826,7 @@ pub(crate) mod harness { let preload = tenant .preload(&self.remote_storage, CancellationToken::new()) .await?; - tenant.attach(Some(preload), SpawnMode::Eager, ctx).await?; + tenant.attach(Some(preload), ctx).await?; tenant.state.send_replace(TenantState::Active); for timeline in tenant.timelines.lock().unwrap().values() { diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 680e6f09c4..7446ad53a2 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -10,8 +10,9 @@ use hyper::header::CONTENT_TYPE; use hyper::{Body, Request, Response}; use hyper::{StatusCode, Uri}; use metrics::{BuildInfo, NeonMetrics}; +use pageserver_api::controller_api::TenantCreateRequest; 
use pageserver_api::models::{ - TenantConfigRequest, TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest, + TenantConfigRequest, TenantLocationConfigRequest, TenantShardSplitRequest, TenantTimeTravelRequest, TimelineCreateRequest, }; use pageserver_api::shard::TenantShardId; diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index a94575b428..bcc40c69a2 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -32,10 +32,10 @@ use itertools::Itertools; use pageserver_api::{ controller_api::{ NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy, - ShardSchedulingPolicy, TenantCreateResponse, TenantCreateResponseShard, - TenantDescribeResponse, TenantDescribeResponseShard, TenantLocateResponse, - TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse, - UtilizationScore, + ShardSchedulingPolicy, TenantCreateRequest, TenantCreateResponse, + TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard, + TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest, + TenantShardMigrateResponse, UtilizationScore, }, models::{SecondaryProgress, TenantConfigRequest, TopTenantShardsRequest}, }; @@ -46,10 +46,9 @@ use crate::pageserver_client::PageserverClient; use pageserver_api::{ models::{ self, LocationConfig, LocationConfigListResponse, LocationConfigMode, - PageserverUtilization, ShardParameters, TenantConfig, TenantCreateRequest, - TenantLocationConfigRequest, TenantLocationConfigResponse, TenantShardLocation, - TenantShardSplitRequest, TenantShardSplitResponse, TenantTimeTravelRequest, - TimelineCreateRequest, TimelineInfo, + PageserverUtilization, ShardParameters, TenantConfig, TenantLocationConfigRequest, + TenantLocationConfigResponse, TenantShardLocation, TenantShardSplitRequest, + TenantShardSplitResponse, TenantTimeTravelRequest, TimelineCreateRequest, TimelineInfo, }, shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId}, upcall_api::{ diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index a3f83abd3e..4911917bf4 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2741,7 +2741,19 @@ class NeonPageserver(PgProtocol, LogUtils): if generation is None: generation = self.env.storage_controller.attach_hook_issue(tenant_id, self.id) client = self.http_client(auth_token=auth_token) - return client.tenant_create(tenant_id, conf, generation=generation) + + conf = conf or {} + + client.tenant_location_conf( + tenant_id, + { + "mode": "AttachedSingle", + "generation": generation, + "tenant_conf": conf, + "secondary_conf": None, + }, + ) + return tenant_id def list_layers( self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 7949612714..3da0be8021 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -220,34 +220,6 @@ class PageserverHttpClient(requests.Session, MetricsGetter): assert isinstance(res_json, list) return res_json - def tenant_create( - self, - new_tenant_id: Union[TenantId, TenantShardId], - conf: Optional[Dict[str, Any]] = None, - generation: Optional[int] = None, - ) -> TenantId: - if conf is not None: - assert "new_tenant_id" not in conf.keys() - - body: Dict[str, Any] = { - "new_tenant_id": str(new_tenant_id), - **(conf or {}), - } - - if 
generation is not None: - body.update({"generation": generation}) - - res = self.post( - f"http://localhost:{self.port}/v1/tenant", - json=body, - ) - self.verbose_error(res) - if res.status_code == 409: - raise Exception(f"could not create tenant: already exists for id {new_tenant_id}") - new_tenant_id = res.json() - assert isinstance(new_tenant_id, str) - return TenantId(new_tenant_id) - def tenant_attach( self, tenant_id: Union[TenantId, TenantShardId], diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index abbea59113..caeae7fd15 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -85,8 +85,10 @@ def check_client(env: NeonEnv, client: PageserverHttpClient): # create new tenant and check it is also there tenant_id = TenantId.generate() - client.tenant_create( - tenant_id, generation=env.storage_controller.attach_hook_issue(tenant_id, env.pageserver.id) + env.pageserver.tenant_create( + tenant_id, + generation=env.storage_controller.attach_hook_issue(tenant_id, env.pageserver.id), + auth_token=client.auth_token, ) assert tenant_id in {TenantId(t["id"]) for t in client.tenant_list()} diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 139a100872..1b294fb2d0 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -315,7 +315,7 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up # Create a tenant directly via pageserver HTTP API, skipping the storage controller tenant_id = TenantId.generate() generation = 123 - origin_ps.http_client().tenant_create(tenant_id, generation=generation) + origin_ps.tenant_create(tenant_id, generation=generation) # As if doing a live migration, first configure origin into stale mode r = origin_ps.http_client().tenant_location_conf( From deec3bc5789a3daa1bf6c41e1487549e0d1c7dc1 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 28 Jun 2024 11:20:37 +0200 Subject: [PATCH 1074/1571] virtual_file: take a `Slice` in the read APIs, eliminate `read_exact_at_n`, fix UB for engine `std-fs` (#8186) part of https://github.com/neondatabase/neon/issues/7418 I reviewed how the VirtualFile API's `read` methods look like and came to the conclusion that we've been using `IoBufMut` / `BoundedBufMut` / `Slice` wrong. This patch rectifies the situation. # Change 1: take `tokio_epoll_uring::Slice` in the read APIs Before, we took an `IoBufMut`, which is too low of a primitive and while it _seems_ convenient to be able to pass in a `Vec` without any fuzz, it's actually very unclear at the callsite that we're going to fill up that `Vec` up to its `capacity()`, because that's what `IoBuf::bytes_total()` returns and that's what `VirtualFile::read_exact_at` fills. By passing a `Slice` instead, a caller that "just wants to read into a `Vec`" is forced to be explicit about it, adding either `slice_full()` or `slice(x..y)`, and these methods panic if the read is outside of the bounds of the `Vec::capacity()`. Last, passing slices is more similar to what the `std::io` APIs look like. # Change 2: fix UB in `virtual_file_io_engine=std-fs` While reviewing call sites, I noticed that the `io_engine::IoEngine::read_at` method for `StdFs` mode has been constructing an `&mut[u8]` from raw parts that were uninitialized. 
We then used `std::fs::File::read_exact` to initialize that memory, but, IIUC we must not even be constructing an `&mut[u8]` where some of the memory isn't initialized. So, stop doing that and add a helper ext trait on `Slice` to do the zero-initialization. # Change 3: eliminate `read_exact_at_n` The `read_exact_at_n` doesn't make sense because the caller can just 1. `slice = buf.slice()` the exact memory it wants to fill 2. `slice = read_exact_at(slice)` 3. `buf = slice.into_inner()` Again, the `std::io` APIs specify the length of the read via the Rust slice length. We should do the same for the owned buffers IO APIs, i.e., via `Slice::bytes_total()`. # Change 4: simplify filling of `PageWriteGuard` The `PageWriteGuardBuf::init_up_to` was never necessary. Remove it. See changes to doc comment for more details. --- Reviewers should probably look at the added test case first, it illustrates my case a bit. --- pageserver/src/tenant/vectored_blob_io.rs | 6 +- pageserver/src/virtual_file.rs | 181 ++++++++---------- pageserver/src/virtual_file/io_engine.rs | 33 ++-- .../virtual_file/owned_buffers_io/slice.rs | 121 ++++++++++++ 4 files changed, 219 insertions(+), 122 deletions(-) create mode 100644 pageserver/src/virtual_file/owned_buffers_io/slice.rs diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 1241a13902..7ad8446e04 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -20,6 +20,7 @@ use std::num::NonZeroUsize; use bytes::BytesMut; use pageserver_api::key::Key; +use tokio_epoll_uring::BoundedBuf; use utils::lsn::Lsn; use utils::vec_map::VecMap; @@ -316,8 +317,9 @@ impl<'a> VectoredBlobReader<'a> { ); let buf = self .file - .read_exact_at_n(buf, read.start, read.size(), ctx) - .await?; + .read_exact_at(buf.slice(0..read.size()), read.start, ctx) + .await? + .into_inner(); let blobs_at = read.blobs_at.as_slice(); let start_offset = blobs_at.first().expect("VectoredRead is never empty").0; diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 04d9386fab..51b0c420c3 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -13,7 +13,7 @@ use crate::context::RequestContext; use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC}; -use crate::page_cache::PageWriteGuard; +use crate::page_cache::{PageWriteGuard, PAGE_SZ}; use crate::tenant::TENANTS_SEGMENT_NAME; use camino::{Utf8Path, Utf8PathBuf}; use once_cell::sync::OnceCell; @@ -48,6 +48,7 @@ pub(crate) mod owned_buffers_io { //! but for the time being we're proving out the primitives in the neon.git repo //! for faster iteration. + pub(crate) mod slice; pub(crate) mod write; pub(crate) mod util { pub(crate) mod size_tracking_writer; @@ -143,16 +144,17 @@ struct SlotInner { /// Impl of [`tokio_epoll_uring::IoBuf`] and [`tokio_epoll_uring::IoBufMut`] for [`PageWriteGuard`]. struct PageWriteGuardBuf { page: PageWriteGuard<'static>, - init_up_to: usize, } // Safety: the [`PageWriteGuard`] gives us exclusive ownership of the page cache slot, // and the location remains stable even if [`Self`] or the [`PageWriteGuard`] is moved. +// Page cache pages are zero-initialized, so, wrt uninitialized memory we're good. +// (Page cache tracks separately whether the contents are valid, see `PageWriteGuard::mark_valid`.) 
unsafe impl tokio_epoll_uring::IoBuf for PageWriteGuardBuf { fn stable_ptr(&self) -> *const u8 { self.page.as_ptr() } fn bytes_init(&self) -> usize { - self.init_up_to + self.page.len() } fn bytes_total(&self) -> usize { self.page.len() @@ -166,8 +168,8 @@ unsafe impl tokio_epoll_uring::IoBufMut for PageWriteGuardBuf { } unsafe fn set_init(&mut self, pos: usize) { + // There shouldn't really be any reason to call this API since bytes_init() == bytes_total(). assert!(pos <= self.page.len()); - self.init_up_to = pos; } } @@ -585,37 +587,37 @@ impl VirtualFile { Ok(self.pos) } - pub async fn read_exact_at( + /// Read the file contents in range `offset..(offset + slice.bytes_total())` into `slice[0..slice.bytes_total()]`. + /// + /// The returned `Slice` is equivalent to the input `slice`, i.e., it's the same view into the same buffer. + pub async fn read_exact_at( &self, - buf: B, + slice: Slice, offset: u64, ctx: &RequestContext, - ) -> Result + ) -> Result, Error> where - B: IoBufMut + Send, + Buf: IoBufMut + Send, { - let (buf, res) = read_exact_at_impl(buf, offset, None, |buf, offset| { - self.read_at(buf, offset, ctx) - }) - .await; - res.map(|()| buf) - } + let assert_we_return_original_bounds = if cfg!(debug_assertions) { + Some((slice.stable_ptr() as usize, slice.bytes_total())) + } else { + None + }; - pub async fn read_exact_at_n( - &self, - buf: B, - offset: u64, - count: usize, - ctx: &RequestContext, - ) -> Result - where - B: IoBufMut + Send, - { - let (buf, res) = read_exact_at_impl(buf, offset, Some(count), |buf, offset| { - self.read_at(buf, offset, ctx) - }) - .await; - res.map(|()| buf) + let original_bounds = slice.bounds(); + let (buf, res) = + read_exact_at_impl(slice, offset, |buf, offset| self.read_at(buf, offset, ctx)).await; + let res = res.map(|_| buf.slice(original_bounds)); + + if let Some(original_bounds) = assert_we_return_original_bounds { + if let Ok(slice) = &res { + let returned_bounds = (slice.stable_ptr() as usize, slice.bytes_total()); + assert_eq!(original_bounds, returned_bounds); + } + } + + res } /// Like [`Self::read_exact_at`] but for [`PageWriteGuard`]. @@ -625,13 +627,11 @@ impl VirtualFile { offset: u64, ctx: &RequestContext, ) -> Result, Error> { - let buf = PageWriteGuardBuf { - page, - init_up_to: 0, - }; - let res = self.read_exact_at(buf, offset, ctx).await; - res.map(|PageWriteGuardBuf { page, .. 
}| page) - .map_err(|e| Error::new(ErrorKind::Other, e)) + let buf = PageWriteGuardBuf { page }.slice_full(); + debug_assert_eq!(buf.bytes_total(), PAGE_SZ); + self.read_exact_at(buf, offset, ctx) + .await + .map(|slice| slice.into_inner().page) } // Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#219-235 @@ -722,14 +722,14 @@ impl VirtualFile { (buf, Ok(n)) } - pub(crate) async fn read_at( + pub(crate) async fn read_at( &self, - buf: B, + buf: tokio_epoll_uring::Slice, offset: u64, _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */ - ) -> (B, Result) + ) -> (tokio_epoll_uring::Slice, Result) where - B: tokio_epoll_uring::BoundedBufMut + Send, + Buf: tokio_epoll_uring::IoBufMut + Send, { let file_guard = match self.lock_file().await { Ok(file_guard) => file_guard, @@ -781,26 +781,16 @@ impl VirtualFile { } // Adapted from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135 -pub async fn read_exact_at_impl( - buf: B, +pub async fn read_exact_at_impl( + mut buf: tokio_epoll_uring::Slice, mut offset: u64, - count: Option, mut read_at: F, -) -> (B, std::io::Result<()>) +) -> (Buf, std::io::Result<()>) where - B: IoBufMut + Send, - F: FnMut(tokio_epoll_uring::Slice, u64) -> Fut, - Fut: std::future::Future, std::io::Result)>, + Buf: IoBufMut + Send, + F: FnMut(tokio_epoll_uring::Slice, u64) -> Fut, + Fut: std::future::Future, std::io::Result)>, { - let mut buf: tokio_epoll_uring::Slice = match count { - Some(count) => { - assert!(count <= buf.bytes_total()); - assert!(count > 0); - buf.slice(..count) // may include uninitialized memory - } - None => buf.slice_full(), // includes all the uninitialized memory - }; - while buf.bytes_total() != 0 { let res; (buf, res) = read_at(buf, offset).await; @@ -882,7 +872,7 @@ mod test_read_exact_at_impl { #[tokio::test] async fn test_basic() { - let buf = Vec::with_capacity(5); + let buf = Vec::with_capacity(5).slice_full(); let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { expectations: VecDeque::from(vec![Expectation { offset: 0, @@ -890,7 +880,7 @@ mod test_read_exact_at_impl { result: Ok(vec![b'a', b'b', b'c', b'd', b'e']), }]), })); - let (buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| { + let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { let mock_read_at = Arc::clone(&mock_read_at); async move { mock_read_at.lock().await.read_at(buf, offset).await } }) @@ -899,33 +889,13 @@ mod test_read_exact_at_impl { assert_eq!(buf, vec![b'a', b'b', b'c', b'd', b'e']); } - #[tokio::test] - async fn test_with_count() { - let buf = Vec::with_capacity(5); - let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { - expectations: VecDeque::from(vec![Expectation { - offset: 0, - bytes_total: 3, - result: Ok(vec![b'a', b'b', b'c']), - }]), - })); - - let (buf, res) = read_exact_at_impl(buf, 0, Some(3), |buf, offset| { - let mock_read_at = Arc::clone(&mock_read_at); - async move { mock_read_at.lock().await.read_at(buf, offset).await } - }) - .await; - assert!(res.is_ok()); - assert_eq!(buf, vec![b'a', b'b', b'c']); - } - #[tokio::test] async fn test_empty_buf_issues_no_syscall() { - let buf = Vec::new(); + let buf = Vec::new().slice_full(); let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { expectations: VecDeque::new(), })); - let (_buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| { + let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { let mock_read_at = Arc::clone(&mock_read_at); async move { 
mock_read_at.lock().await.read_at(buf, offset).await } }) @@ -935,7 +905,7 @@ mod test_read_exact_at_impl { #[tokio::test] async fn test_two_read_at_calls_needed_until_buf_filled() { - let buf = Vec::with_capacity(4); + let buf = Vec::with_capacity(4).slice_full(); let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { expectations: VecDeque::from(vec![ Expectation { @@ -950,7 +920,7 @@ mod test_read_exact_at_impl { }, ]), })); - let (buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| { + let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { let mock_read_at = Arc::clone(&mock_read_at); async move { mock_read_at.lock().await.read_at(buf, offset).await } }) @@ -961,7 +931,7 @@ mod test_read_exact_at_impl { #[tokio::test] async fn test_eof_before_buffer_full() { - let buf = Vec::with_capacity(3); + let buf = Vec::with_capacity(3).slice_full(); let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { expectations: VecDeque::from(vec![ Expectation { @@ -981,7 +951,7 @@ mod test_read_exact_at_impl { }, ]), })); - let (_buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| { + let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { let mock_read_at = Arc::clone(&mock_read_at); async move { mock_read_at.lock().await.read_at(buf, offset).await } }) @@ -1051,27 +1021,29 @@ impl VirtualFile { ctx: &RequestContext, ) -> Result, std::io::Error> { use crate::page_cache::PAGE_SZ; - let buf = vec![0; PAGE_SZ]; - let buf = self - .read_exact_at(buf, blknum as u64 * (PAGE_SZ as u64), ctx) + let slice = Vec::with_capacity(PAGE_SZ).slice_full(); + assert_eq!(slice.bytes_total(), PAGE_SZ); + let slice = self + .read_exact_at(slice, blknum as u64 * (PAGE_SZ as u64), ctx) .await?; - Ok(crate::tenant::block_io::BlockLease::Vec(buf)) + Ok(crate::tenant::block_io::BlockLease::Vec(slice.into_inner())) } async fn read_to_end(&mut self, buf: &mut Vec, ctx: &RequestContext) -> Result<(), Error> { let mut tmp = vec![0; 128]; loop { - let res; - (tmp, res) = self.read_at(tmp, self.pos, ctx).await; + let slice = tmp.slice(..128); + let (slice, res) = self.read_at(slice, self.pos, ctx).await; match res { Ok(0) => return Ok(()), Ok(n) => { self.pos += n as u64; - buf.extend_from_slice(&tmp[..n]); + buf.extend_from_slice(&slice[..n]); } Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} Err(e) => return Err(e), } + tmp = slice.into_inner(); } } } @@ -1185,6 +1157,7 @@ mod tests { use crate::task_mgr::TaskKind; use super::*; + use owned_buffers_io::slice::SliceExt; use rand::seq::SliceRandom; use rand::thread_rng; use rand::Rng; @@ -1206,13 +1179,16 @@ mod tests { impl MaybeVirtualFile { async fn read_exact_at( &self, - mut buf: Vec, + mut slice: tokio_epoll_uring::Slice>, offset: u64, ctx: &RequestContext, - ) -> Result, Error> { + ) -> Result>, Error> { match self { - MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(buf, offset, ctx).await, - MaybeVirtualFile::File(file) => file.read_exact_at(&mut buf, offset).map(|()| buf), + MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(slice, offset, ctx).await, + MaybeVirtualFile::File(file) => { + let rust_slice: &mut [u8] = slice.as_mut_rust_slice_full_zeroed(); + file.read_exact_at(rust_slice, offset).map(|()| slice) + } } } async fn write_all_at, Buf: IoBuf + Send>( @@ -1286,9 +1262,12 @@ mod tests { len: usize, ctx: &RequestContext, ) -> Result { - let buf = vec![0; len]; - let buf = self.read_exact_at(buf, pos, ctx).await?; - Ok(String::from_utf8(buf).unwrap()) + let slice = 
Vec::with_capacity(len).slice_full(); + assert_eq!(slice.bytes_total(), len); + let slice = self.read_exact_at(slice, pos, ctx).await?; + let vec = slice.into_inner(); + assert_eq!(vec.len(), len); + Ok(String::from_utf8(vec).unwrap()) } } @@ -1507,7 +1486,11 @@ mod tests { let mut rng = rand::rngs::OsRng; for _ in 1..1000 { let f = &files[rng.gen_range(0..files.len())]; - buf = f.read_exact_at(buf, 0, &ctx).await.unwrap(); + buf = f + .read_exact_at(buf.slice_full(), 0, &ctx) + .await + .unwrap() + .into_inner(); assert!(buf == SAMPLE); } }); diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index 7a27be2ca1..2820cea097 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -107,7 +107,7 @@ use std::{ sync::atomic::{AtomicU8, Ordering}, }; -use super::{FileGuard, Metadata}; +use super::{owned_buffers_io::slice::SliceExt, FileGuard, Metadata}; #[cfg(target_os = "linux")] fn epoll_uring_error_to_std(e: tokio_epoll_uring::Error) -> std::io::Error { @@ -120,38 +120,29 @@ fn epoll_uring_error_to_std(e: tokio_epoll_uring::Error) -> std: } impl IoEngine { - pub(super) async fn read_at( + pub(super) async fn read_at( &self, file_guard: FileGuard, offset: u64, - mut buf: B, - ) -> ((FileGuard, B), std::io::Result) + mut slice: tokio_epoll_uring::Slice, + ) -> ( + (FileGuard, tokio_epoll_uring::Slice), + std::io::Result, + ) where - B: tokio_epoll_uring::BoundedBufMut + Send, + Buf: tokio_epoll_uring::IoBufMut + Send, { match self { IoEngine::NotSet => panic!("not initialized"), IoEngine::StdFs => { - // SAFETY: `dst` only lives at most as long as this match arm, during which buf remains valid memory. - let dst = unsafe { - std::slice::from_raw_parts_mut(buf.stable_mut_ptr(), buf.bytes_total()) - }; - let res = file_guard.with_std_file(|std_file| std_file.read_at(dst, offset)); - if let Ok(nbytes) = &res { - assert!(*nbytes <= buf.bytes_total()); - // SAFETY: see above assertion - unsafe { - buf.set_init(*nbytes); - } - } - #[allow(dropping_references)] - drop(dst); - ((file_guard, buf), res) + let rust_slice = slice.as_mut_rust_slice_full_zeroed(); + let res = file_guard.with_std_file(|std_file| std_file.read_at(rust_slice, offset)); + ((file_guard, slice), res) } #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { let system = tokio_epoll_uring_ext::thread_local_system().await; - let (resources, res) = system.read(file_guard, offset, buf).await; + let (resources, res) = system.read(file_guard, offset, slice).await; (resources, res.map_err(epoll_uring_error_to_std)) } } diff --git a/pageserver/src/virtual_file/owned_buffers_io/slice.rs b/pageserver/src/virtual_file/owned_buffers_io/slice.rs new file mode 100644 index 0000000000..d19e5ddffe --- /dev/null +++ b/pageserver/src/virtual_file/owned_buffers_io/slice.rs @@ -0,0 +1,121 @@ +use tokio_epoll_uring::BoundedBuf; +use tokio_epoll_uring::BoundedBufMut; +use tokio_epoll_uring::IoBufMut; +use tokio_epoll_uring::Slice; + +pub(crate) trait SliceExt { + /// Get a `&mut[0..self.bytes_total()`] slice, for when you need to do borrow-based IO. 
+ /// + /// See the test case `test_slice_full_zeroed` for the difference to just doing `&slice[..]` + fn as_mut_rust_slice_full_zeroed(&mut self) -> &mut [u8]; +} + +impl SliceExt for Slice +where + B: IoBufMut, +{ + #[inline(always)] + fn as_mut_rust_slice_full_zeroed(&mut self) -> &mut [u8] { + // zero-initialize the uninitialized parts of the buffer so we can create a Rust slice + // + // SAFETY: we own `slice`, don't write outside the bounds + unsafe { + let to_init = self.bytes_total() - self.bytes_init(); + self.stable_mut_ptr() + .add(self.bytes_init()) + .write_bytes(0, to_init); + self.set_init(self.bytes_total()); + }; + let bytes_total = self.bytes_total(); + &mut self[0..bytes_total] + } +} + +#[cfg(test)] +mod tests { + use std::io::Read; + + use super::*; + use bytes::Buf; + use tokio_epoll_uring::Slice; + + #[test] + fn test_slice_full_zeroed() { + let make_fake_file = || bytes::BytesMut::from(&b"12345"[..]).reader(); + + // before we start the test, let's make sure we have a shared understanding of what slice_full does + { + let buf = Vec::with_capacity(3); + let slice: Slice<_> = buf.slice_full(); + assert_eq!(slice.bytes_init(), 0); + assert_eq!(slice.bytes_total(), 3); + let rust_slice = &slice[..]; + assert_eq!( + rust_slice.len(), + 0, + "Slice only derefs to a &[u8] of the initialized part" + ); + } + + // and also let's establish a shared understanding of .slice() + { + let buf = Vec::with_capacity(3); + let slice: Slice<_> = buf.slice(0..2); + assert_eq!(slice.bytes_init(), 0); + assert_eq!(slice.bytes_total(), 2); + let rust_slice = &slice[..]; + assert_eq!( + rust_slice.len(), + 0, + "Slice only derefs to a &[u8] of the initialized part" + ); + } + + // the above leads to the easy mistake of using slice[..] for borrow-based IO like so: + { + let buf = Vec::with_capacity(3); + let mut slice: Slice<_> = buf.slice_full(); + assert_eq!(slice[..].len(), 0); + let mut file = make_fake_file(); + file.read_exact(&mut slice[..]).unwrap(); // one might think this reads 3 bytes but it reads 0 + assert_eq!(&slice[..] as &[u8], &[][..] as &[u8]); + } + + // With owned buffers IO like with VirtualFilem, you could totally + // pass in a `Slice` with bytes_init()=0 but bytes_total()=5 + // and it will read 5 bytes into the slice, and return a slice that has bytes_init()=5. + { + // TODO: demo + } + + // + // Ok, now that we have a shared understanding let's demo how to use the extension trait. + // + + // slice_full() + { + let buf = Vec::with_capacity(3); + let mut slice: Slice<_> = buf.slice_full(); + let rust_slice = slice.as_mut_rust_slice_full_zeroed(); + assert_eq!(rust_slice.len(), 3); + assert_eq!(rust_slice, &[0, 0, 0]); + let mut file = make_fake_file(); + file.read_exact(rust_slice).unwrap(); + assert_eq!(rust_slice, b"123"); + assert_eq!(&slice[..], b"123"); + } + + // .slice(..) + { + let buf = Vec::with_capacity(3); + let mut slice: Slice<_> = buf.slice(0..2); + let rust_slice = slice.as_mut_rust_slice_full_zeroed(); + assert_eq!(rust_slice.len(), 2); + assert_eq!(rust_slice, &[0, 0]); + let mut file = make_fake_file(); + file.read_exact(rust_slice).unwrap(); + assert_eq!(rust_slice, b"12"); + assert_eq!(&slice[..], b"12"); + } + } +} From c22c6a6c9ece6c1067d75402f6161c6758289484 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Fri, 28 Jun 2024 11:09:11 +0100 Subject: [PATCH 1075/1571] Add buckets to safekeeper ops metrics (#8194) In #8188 I forgot to specify buckets for new operations metrics. This commit fixes that. 
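
For context: when `register_histogram_vec!` is called without a bucket argument, the histogram falls back to the prometheus default buckets (11 buckets spanning roughly 5ms–10s), so the sub-millisecond disk operations these metrics track all collapse into the first bucket. A minimal sketch of the difference, using the prometheus macro directly rather than the project's `metrics` wrapper, with made-up metric names and bucket values (not the actual `DISK_FSYNC_SECONDS_BUCKETS`):

```rust
use once_cell::sync::Lazy;
use prometheus::{register_histogram_vec, HistogramVec};

// No bucket argument: prometheus uses its default buckets (5ms .. 10s),
// which lose all resolution for sub-millisecond fsync/flush latencies.
static COARSE_OP_SECONDS: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "example_operation_seconds_default_buckets",
        "Seconds spent on example operations (default buckets)",
        &["operation"]
    )
    .expect("failed to register histogram")
});

// Explicit bucket vector (illustrative values only) restores resolution where it matters.
static OP_SECONDS: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "example_operation_seconds",
        "Seconds spent on example operations",
        &["operation"],
        vec![0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0]
    )
    .expect("failed to register histogram")
});

fn main() {
    // "flush_wal" is an illustrative label value.
    OP_SECONDS.with_label_values(&["flush_wal"]).observe(0.0008);
    COARSE_OP_SECONDS.with_label_values(&["flush_wal"]).observe(0.0008);
}
```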
--- safekeeper/src/metrics.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index a484c45af8..539ecf826b 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -72,7 +72,8 @@ pub static WAL_STORAGE_OPERATION_SECONDS: Lazy = Lazy::new(|| { register_histogram_vec!( "safekeeper_wal_storage_operation_seconds", "Seconds spent on WAL storage operations", - &["operation"] + &["operation"], + DISK_FSYNC_SECONDS_BUCKETS.to_vec() ) .expect("Failed to register safekeeper_wal_storage_operation_seconds histogram vec") }); @@ -80,7 +81,8 @@ pub static MISC_OPERATION_SECONDS: Lazy = Lazy::new(|| { register_histogram_vec!( "safekeeper_misc_operation_seconds", "Seconds spent on miscellaneous operations", - &["operation"] + &["operation"], + DISK_FSYNC_SECONDS_BUCKETS.to_vec() ) .expect("Failed to register safekeeper_misc_operation_seconds histogram vec") }); From ca2f7d06b230525df62864aa0cc9ebc8ee67aeaf Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 28 Jun 2024 16:47:05 +0300 Subject: [PATCH 1076/1571] Cherry-pick upstream fix for TruncateMultiXact assertion (#8195) We hit that bug in a new test being added in PR #6528. We'd get the fix from upstream with the next minor release anyway, but cherry-pick it now to unblock PR #6528. Upstream commit b1ffe3ff0b. See https://github.com/neondatabase/neon/pull/6528#issuecomment-2167367910 --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 7845c122d5..223dd92595 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 7845c122d51d3ebb547a984a640ac0310a2fadce +Subproject commit 223dd925959f8124711dd3d867dc8ba6629d52c0 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 2ff5ecc67c..f54d7373eb 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 2ff5ecc67c64e5fe44b7dde598e64e4538e0c373 +Subproject commit f54d7373eb0de5a54bce2becdb1c801026c7edff diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index d55e0aca10..e06bebc753 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit d55e0aca104af0b611cf5565f1033b2acd2dcc1c +Subproject commit e06bebc75306b583e758b52c95946d41109239b2 diff --git a/vendor/revisions.json b/vendor/revisions.json index e755cf2e9d..574e371934 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "v16": ["16.3", "d55e0aca104af0b611cf5565f1033b2acd2dcc1c"], - "v15": ["15.7", "2ff5ecc67c64e5fe44b7dde598e64e4538e0c373"], - "v14": ["14.12", "7845c122d51d3ebb547a984a640ac0310a2fadce"] + "v16": ["16.3", "e06bebc75306b583e758b52c95946d41109239b2"], + "v15": ["15.7", "f54d7373eb0de5a54bce2becdb1c801026c7edff"], + "v14": ["14.12", "223dd925959f8124711dd3d867dc8ba6629d52c0"] } From babbe125dabdd528843d78c97874833ae67c314e Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 28 Jun 2024 18:05:09 +0100 Subject: [PATCH 1077/1571] pageserver: drop out of secondary download if iteration time has passed (#8198) ## Problem Very long running downloads can be wasteful, because the heatmap they're using is outdated after a few minutes. 
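
Reduced to a sketch (illustrative types and names, not the actual downloader code — see the summary of changes below for the real shape): compute a deadline from the scheduling period up front, drop out of the layer loop with a restart-style error once it passes, and still hand back whatever was already downloaded so it gets recorded:

```rust
use std::time::{Duration, Instant};

// Stand-ins for the real downloader types; names are illustrative.
struct Layer;

enum UpdateError {
    // Not a true failure: ask the scheduler to run us again with a fresh heatmap.
    Restart,
}

fn download_layers(
    layers: Vec<Layer>,
    period: Duration,
) -> (Result<(), UpdateError>, Vec<Layer>) {
    // Double the scheduling period: not a promise to finish in time, just a bound
    // on how stale the heatmap we are working from is allowed to get.
    let deadline = Instant::now() + period * 2;
    let mut touched = Vec::new();

    for layer in layers {
        if Instant::now() > deadline {
            // Return what was already downloaded so it is recorded before the
            // restart; the scheduler will pick up a newer heatmap next round.
            return (Err(UpdateError::Restart), touched);
        }
        // ... fetch `layer` from remote storage here ...
        touched.push(layer);
    }
    (Ok(()), touched)
}

fn main() {
    let (res, touched) = download_layers(vec![Layer, Layer], Duration::from_secs(60));
    assert!(res.is_ok());
    assert_eq!(touched.len(), 2);
}
```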
Closes: https://github.com/neondatabase/neon/issues/8182 ## Summary of changes - Impose a deadline on timeline downloads, using the same period as we use for scheduling, and returning an UpdateError::Restart when it is reached. This restart will involve waiting for a scheduling interval, but that's a good thing: it helps let other tenants proceed. - Refactor download_timeline so that the part where we update the state for local layers is done even if we fall out of the layer download loop with an error: this is important, especially for big tenants, because only layers in the SecondaryDetail state will be considered for eviction. --- pageserver/src/tenant/secondary/downloader.rs | 126 ++++++++++++++---- 1 file changed, 97 insertions(+), 29 deletions(-) diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 24176ecf19..f6f30641db 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -262,6 +262,7 @@ impl scheduler::RunningJob for RunningDownload { struct CompleteDownload { secondary_state: Arc, completed_at: Instant, + result: Result<(), UpdateError>, } impl scheduler::Completion for CompleteDownload { @@ -286,21 +287,33 @@ impl JobGenerator { + // Start downloading again as soon as we can. This will involve waiting for the scheduler's + // scheduling interval. This slightly reduces the peak download speed of tenants that hit their + // deadline and keep restarting, but that also helps give other tenants a chance to execute rather + // that letting one big tenant dominate for a long time. + detail.next_download = Some(Instant::now()); + } + _ => { + let period = detail + .last_download + .as_ref() + .map(|d| d.upload_period) + .unwrap_or(DEFAULT_DOWNLOAD_INTERVAL); - // We advance next_download irrespective of errors: we don't want error cases to result in - // expensive busy-polling. - detail.next_download = Some(Instant::now() + period_jitter(period, 5)); + // We advance next_download irrespective of errors: we don't want error cases to result in + // expensive busy-polling. + detail.next_download = Some(Instant::now() + period_jitter(period, 5)); + } + } } async fn schedule(&mut self) -> SchedulingResult { @@ -396,9 +409,10 @@ impl JobGenerator { tracing::info!("No heatmap found for tenant. This is fine if it is new."); @@ -415,6 +429,9 @@ impl JobGenerator { tracing::error!("Error while downloading tenant: {e}"); }, + Err(UpdateError::Restart) => { + tracing::info!("Download reached deadline & will restart to update heatmap") + } Ok(()) => {} }; @@ -436,6 +453,7 @@ impl JobGenerator { /// Errors that may be encountered while updating a tenant #[derive(thiserror::Error, Debug)] enum UpdateError { + /// This is not a true failure, but it's how a download indicates that it would like to be restarted by + /// the scheduler, to pick up the latest heatmap + #[error("Reached deadline, restarting downloads")] + Restart, + #[error("No remote data found")] NoData, #[error("Insufficient local storage space")] @@ -603,6 +626,26 @@ impl<'a> TenantDownloader<'a> { self.prepare_timelines(&heatmap, heatmap_mtime).await?; } + // Calculate a deadline for downloads: if downloading takes longer than this, it is useful to drop out and start again, + // so that we are always using reasonably a fresh heatmap. Otherwise, if we had really huge content to download, we might + // spend 10s of minutes downloading layers we don't need. 
+ // (see https://github.com/neondatabase/neon/issues/8182) + let deadline = { + let period = self + .secondary_state + .detail + .lock() + .unwrap() + .last_download + .as_ref() + .map(|d| d.upload_period) + .unwrap_or(DEFAULT_DOWNLOAD_INTERVAL); + + // Use double the period: we are not promising to complete within the period, this is just a heuristic + // to keep using a "reasonably fresh" heatmap. + Instant::now() + period * 2 + }; + // Download the layers in the heatmap for timeline in heatmap.timelines { let timeline_state = timeline_states @@ -618,7 +661,7 @@ impl<'a> TenantDownloader<'a> { } let timeline_id = timeline.timeline_id; - self.download_timeline(timeline, timeline_state, ctx) + self.download_timeline(timeline, timeline_state, deadline, ctx) .instrument(tracing::info_span!( "secondary_download_timeline", tenant_id=%tenant_shard_id.tenant_id, @@ -827,26 +870,28 @@ impl<'a> TenantDownloader<'a> { .and_then(|x| x) } - async fn download_timeline( + /// Download heatmap layers that are not present on local disk, or update their + /// access time if they are already present. + async fn download_timeline_layers( &self, + tenant_shard_id: &TenantShardId, timeline: HeatMapTimeline, timeline_state: SecondaryDetailTimeline, + deadline: Instant, ctx: &RequestContext, - ) -> Result<(), UpdateError> { - debug_assert_current_span_has_tenant_and_timeline_id(); - let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); - + ) -> (Result<(), UpdateError>, Vec) { // Accumulate updates to the state let mut touched = Vec::new(); - tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len()); - - // Download heatmap layers that are not present on local disk, or update their - // access time if they are already present. for layer in timeline.layers { if self.secondary_state.cancel.is_cancelled() { tracing::debug!("Cancelled -- dropping out of layer loop"); - return Err(UpdateError::Cancelled); + return (Err(UpdateError::Cancelled), touched); + } + + if Instant::now() > deadline { + // We've been running downloads for a while, restart to download latest heatmap. + return (Err(UpdateError::Restart), touched); } // Existing on-disk layers: just update their access time. @@ -916,20 +961,43 @@ impl<'a> TenantDownloader<'a> { match self .download_layer(tenant_shard_id, &timeline.timeline_id, layer, ctx) - .await? + .await { - Some(layer) => touched.push(layer), - None => { + Ok(Some(layer)) => touched.push(layer), + Ok(None) => { // Not an error but we didn't download it: remote layer is missing. Don't add it to the list of // things to consider touched. } + Err(e) => { + return (Err(e), touched); + } } } - // Write updates to state to record layers we just downloaded or touched. 
+ (Ok(()), touched) + } + + async fn download_timeline( + &self, + timeline: HeatMapTimeline, + timeline_state: SecondaryDetailTimeline, + deadline: Instant, + ctx: &RequestContext, + ) -> Result<(), UpdateError> { + debug_assert_current_span_has_tenant_and_timeline_id(); + let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); + let timeline_id = timeline.timeline_id; + + tracing::debug!(timeline_id=%timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len()); + + let (result, touched) = self + .download_timeline_layers(tenant_shard_id, timeline, timeline_state, deadline, ctx) + .await; + + // Write updates to state to record layers we just downloaded or touched, irrespective of whether the overall result was successful { let mut detail = self.secondary_state.detail.lock().unwrap(); - let timeline_detail = detail.timelines.entry(timeline.timeline_id).or_default(); + let timeline_detail = detail.timelines.entry(timeline_id).or_default(); tracing::info!("Wrote timeline_detail for {} touched layers", touched.len()); @@ -943,14 +1011,14 @@ impl<'a> TenantDownloader<'a> { let local_path = local_layer_path( self.conf, tenant_shard_id, - &timeline.timeline_id, + &timeline_id, &t.name, &t.metadata.generation, ); e.insert(OnDiskState::new( self.conf, tenant_shard_id, - &timeline.timeline_id, + &timeline_id, t.name, t.metadata.clone(), t.access_time, @@ -961,7 +1029,7 @@ impl<'a> TenantDownloader<'a> { } } - Ok(()) + result } /// Call this during timeline download if a layer will _not_ be downloaded, to update progress statistics From e1a06b40b7690e4d622b4588d946eacd5b601ce2 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Fri, 28 Jun 2024 18:16:21 +0100 Subject: [PATCH 1078/1571] Add rate limiter for partial uploads (#8203) Too many concurrect partial uploads can hurt disk performance, this commit adds a limiter. Context: https://neondb.slack.com/archives/C04KGFVUWUQ/p1719489018814669?thread_ts=1719440183.134739&cid=C04KGFVUWUQ --- safekeeper/src/bin/safekeeper.rs | 7 +++- safekeeper/src/lib.rs | 3 ++ safekeeper/src/timeline.rs | 7 ++-- safekeeper/src/timeline_manager.rs | 16 +++++++-- safekeeper/src/timelines_global_map.rs | 34 +++++++++++++----- safekeeper/src/wal_backup_partial.rs | 35 ++++++++++++++++++- .../tests/walproposer_sim/safekeeper.rs | 1 + 7 files changed, 89 insertions(+), 14 deletions(-) diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 20650490b1..c81373c77c 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -29,7 +29,8 @@ use utils::pid_file; use metrics::set_build_info_metric; use safekeeper::defaults::{ DEFAULT_CONTROL_FILE_SAVE_INTERVAL, DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, - DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, + DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_PARTIAL_BACKUP_CONCURRENCY, + DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, }; use safekeeper::http; use safekeeper::wal_service; @@ -191,6 +192,9 @@ struct Args { /// Pending updates to control file will be automatically saved after this interval. #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_CONTROL_FILE_SAVE_INTERVAL)] control_file_save_interval: Duration, + /// Number of allowed concurrent uploads of partial segments to remote storage. + #[arg(long, default_value = DEFAULT_PARTIAL_BACKUP_CONCURRENCY)] + partial_backup_concurrency: usize, } // Like PathBufValueParser, but allows empty string. 
@@ -344,6 +348,7 @@ async fn main() -> anyhow::Result<()> { enable_offload: args.enable_offload, delete_offloaded_wal: args.delete_offloaded_wal, control_file_save_interval: args.control_file_save_interval, + partial_backup_concurrency: args.partial_backup_concurrency, }; // initialize sentry if SENTRY_DSN is provided diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 067e425570..5cd676d857 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -52,6 +52,7 @@ pub mod defaults { pub const DEFAULT_MAX_OFFLOADER_LAG_BYTES: u64 = 128 * (1 << 20); pub const DEFAULT_PARTIAL_BACKUP_TIMEOUT: &str = "15m"; pub const DEFAULT_CONTROL_FILE_SAVE_INTERVAL: &str = "300s"; + pub const DEFAULT_PARTIAL_BACKUP_CONCURRENCY: &str = "5"; } #[derive(Debug, Clone)] @@ -91,6 +92,7 @@ pub struct SafeKeeperConf { pub enable_offload: bool, pub delete_offloaded_wal: bool, pub control_file_save_interval: Duration, + pub partial_backup_concurrency: usize, } impl SafeKeeperConf { @@ -133,6 +135,7 @@ impl SafeKeeperConf { enable_offload: false, delete_offloaded_wal: false, control_file_save_interval: Duration::from_secs(1), + partial_backup_concurrency: 1, } } } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 6b83270c18..132e5ec32f 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -36,7 +36,7 @@ use crate::timeline_guard::ResidenceGuard; use crate::timeline_manager::{AtomicStatus, ManagerCtl}; use crate::timelines_set::TimelinesSet; use crate::wal_backup::{self}; -use crate::wal_backup_partial::PartialRemoteSegment; +use crate::wal_backup_partial::{PartialRemoteSegment, RateLimiter}; use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION}; use crate::metrics::{FullTimelineInfo, WalStorageMetrics, MISC_OPERATION_SECONDS}; @@ -587,6 +587,7 @@ impl Timeline { shared_state: &mut WriteGuardSharedState<'_>, conf: &SafeKeeperConf, broker_active_set: Arc, + partial_backup_rate_limiter: RateLimiter, ) -> Result<()> { match fs::metadata(&self.timeline_dir).await { Ok(_) => { @@ -617,7 +618,7 @@ impl Timeline { return Err(e); } - self.bootstrap(conf, broker_active_set); + self.bootstrap(conf, broker_active_set, partial_backup_rate_limiter); Ok(()) } @@ -626,6 +627,7 @@ impl Timeline { self: &Arc, conf: &SafeKeeperConf, broker_active_set: Arc, + partial_backup_rate_limiter: RateLimiter, ) { let (tx, rx) = self.manager_ctl.bootstrap_manager(); @@ -637,6 +639,7 @@ impl Timeline { broker_active_set, tx, rx, + partial_backup_rate_limiter, )); } diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index 66c62ce197..62142162de 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -32,7 +32,7 @@ use crate::{ timeline_guard::{AccessService, GuardId, ResidenceGuard}, timelines_set::{TimelineSetGuard, TimelinesSet}, wal_backup::{self, WalBackupTaskHandle}, - wal_backup_partial::{self, PartialRemoteSegment}, + wal_backup_partial::{self, PartialRemoteSegment, RateLimiter}, SafeKeeperConf, }; @@ -185,6 +185,7 @@ pub(crate) struct Manager { // misc pub(crate) access_service: AccessService, + pub(crate) partial_backup_rate_limiter: RateLimiter, } /// This task gets spawned alongside each timeline and is responsible for managing the timeline's @@ -197,6 +198,7 @@ pub async fn main_task( broker_active_set: Arc, manager_tx: tokio::sync::mpsc::UnboundedSender, mut manager_rx: tokio::sync::mpsc::UnboundedReceiver, + partial_backup_rate_limiter: RateLimiter, ) { tli.set_status(Status::Started); @@ 
-209,7 +211,14 @@ pub async fn main_task( } }; - let mut mgr = Manager::new(tli, conf, broker_active_set, manager_tx).await; + let mut mgr = Manager::new( + tli, + conf, + broker_active_set, + manager_tx, + partial_backup_rate_limiter, + ) + .await; // Start recovery task which always runs on the timeline. if !mgr.is_offloaded && mgr.conf.peer_recovery_enabled { @@ -321,6 +330,7 @@ impl Manager { conf: SafeKeeperConf, broker_active_set: Arc, manager_tx: tokio::sync::mpsc::UnboundedSender, + partial_backup_rate_limiter: RateLimiter, ) -> Manager { let (is_offloaded, partial_backup_uploaded) = tli.bootstrap_mgr().await; Manager { @@ -339,6 +349,7 @@ impl Manager { partial_backup_uploaded, access_service: AccessService::new(manager_tx), tli, + partial_backup_rate_limiter, } } @@ -525,6 +536,7 @@ impl Manager { self.partial_backup_task = Some(tokio::spawn(wal_backup_partial::main_task( self.wal_resident_timeline(), self.conf.clone(), + self.partial_backup_rate_limiter.clone(), ))); } diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index 45e08ede3c..9ce1112cec 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -5,6 +5,7 @@ use crate::safekeeper::ServerInfo; use crate::timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError}; use crate::timelines_set::TimelinesSet; +use crate::wal_backup_partial::RateLimiter; use crate::SafeKeeperConf; use anyhow::{bail, Context, Result}; use camino::Utf8PathBuf; @@ -23,6 +24,7 @@ struct GlobalTimelinesState { conf: Option, broker_active_set: Arc, load_lock: Arc>, + partial_backup_rate_limiter: RateLimiter, } // Used to prevent concurrent timeline loading. @@ -37,8 +39,12 @@ impl GlobalTimelinesState { } /// Get dependencies for a timeline constructor. - fn get_dependencies(&self) -> (SafeKeeperConf, Arc) { - (self.get_conf().clone(), self.broker_active_set.clone()) + fn get_dependencies(&self) -> (SafeKeeperConf, Arc, RateLimiter) { + ( + self.get_conf().clone(), + self.broker_active_set.clone(), + self.partial_backup_rate_limiter.clone(), + ) } /// Insert timeline into the map. Returns error if timeline with the same id already exists. @@ -66,6 +72,7 @@ static TIMELINES_STATE: Lazy> = Lazy::new(|| { conf: None, broker_active_set: Arc::new(TimelinesSet::default()), load_lock: Arc::new(tokio::sync::Mutex::new(TimelineLoadLock)), + partial_backup_rate_limiter: RateLimiter::new(1), }) }); @@ -79,6 +86,7 @@ impl GlobalTimelines { // lock, so use explicit block let tenants_dir = { let mut state = TIMELINES_STATE.lock().unwrap(); + state.partial_backup_rate_limiter = RateLimiter::new(conf.partial_backup_concurrency); state.conf = Some(conf); // Iterate through all directories and load tenants for all directories @@ -122,7 +130,7 @@ impl GlobalTimelines { /// this function is called during init when nothing else is running, so /// this is fine. async fn load_tenant_timelines(tenant_id: TenantId) -> Result<()> { - let (conf, broker_active_set) = { + let (conf, broker_active_set, partial_backup_rate_limiter) = { let state = TIMELINES_STATE.lock().unwrap(); state.get_dependencies() }; @@ -145,7 +153,11 @@ impl GlobalTimelines { .unwrap() .timelines .insert(ttid, tli.clone()); - tli.bootstrap(&conf, broker_active_set.clone()); + tli.bootstrap( + &conf, + broker_active_set.clone(), + partial_backup_rate_limiter.clone(), + ); } // If we can't load a timeline, it's most likely because of a corrupted // directory. 
We will log an error and won't allow to delete/recreate @@ -178,7 +190,8 @@ impl GlobalTimelines { _guard: &tokio::sync::MutexGuard<'a, TimelineLoadLock>, ttid: TenantTimelineId, ) -> Result> { - let (conf, broker_active_set) = TIMELINES_STATE.lock().unwrap().get_dependencies(); + let (conf, broker_active_set, partial_backup_rate_limiter) = + TIMELINES_STATE.lock().unwrap().get_dependencies(); match Timeline::load_timeline(&conf, ttid) { Ok(timeline) => { @@ -191,7 +204,7 @@ impl GlobalTimelines { .timelines .insert(ttid, tli.clone()); - tli.bootstrap(&conf, broker_active_set); + tli.bootstrap(&conf, broker_active_set, partial_backup_rate_limiter); Ok(tli) } @@ -222,7 +235,7 @@ impl GlobalTimelines { commit_lsn: Lsn, local_start_lsn: Lsn, ) -> Result> { - let (conf, broker_active_set) = { + let (conf, broker_active_set, partial_backup_rate_limiter) = { let state = TIMELINES_STATE.lock().unwrap(); if let Ok(timeline) = state.get(&ttid) { // Timeline already exists, return it. @@ -257,7 +270,12 @@ impl GlobalTimelines { // Bootstrap is transactional, so if it fails, the timeline will be deleted, // and the state on disk should remain unchanged. if let Err(e) = timeline - .init_new(&mut shared_state, &conf, broker_active_set) + .init_new( + &mut shared_state, + &conf, + broker_active_set, + partial_backup_rate_limiter, + ) .await { // Note: the most likely reason for init failure is that the timeline diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index 9c7cd0888d..825851c97c 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -18,6 +18,8 @@ //! This way control file stores information about all potentially existing //! remote partial segments and can clean them up after uploading a newer version. +use std::sync::Arc; + use camino::Utf8PathBuf; use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; use remote_storage::RemotePath; @@ -27,7 +29,7 @@ use tracing::{debug, error, info, instrument, warn}; use utils::lsn::Lsn; use crate::{ - metrics::{PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS}, + metrics::{MISC_OPERATION_SECONDS, PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS}, safekeeper::Term, timeline::WalResidentTimeline, timeline_manager::StateSnapshot, @@ -35,6 +37,30 @@ use crate::{ SafeKeeperConf, }; +#[derive(Clone)] +pub struct RateLimiter { + semaphore: Arc, +} + +impl RateLimiter { + pub fn new(permits: usize) -> Self { + Self { + semaphore: Arc::new(tokio::sync::Semaphore::new(permits)), + } + } + + async fn acquire_owned(&self) -> tokio::sync::OwnedSemaphorePermit { + let _timer = MISC_OPERATION_SECONDS + .with_label_values(&["partial_permit_acquire"]) + .start_timer(); + self.semaphore + .clone() + .acquire_owned() + .await + .expect("semaphore is closed") + } +} + #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub enum UploadStatus { /// Upload is in progress. This status should be used only for garbage collection, @@ -208,6 +234,9 @@ impl PartialBackup { /// Upload the latest version of the partial segment and garbage collect older versions. 
#[instrument(name = "upload", skip_all, fields(name = %prepared.name))] async fn do_upload(&mut self, prepared: &PartialRemoteSegment) -> anyhow::Result<()> { + let _timer = MISC_OPERATION_SECONDS + .with_label_values(&["partial_do_upload"]) + .start_timer(); info!("starting upload {:?}", prepared); let state_0 = self.state.clone(); @@ -307,6 +336,7 @@ pub(crate) fn needs_uploading( pub async fn main_task( tli: WalResidentTimeline, conf: SafeKeeperConf, + limiter: RateLimiter, ) -> Option { debug!("started"); let await_duration = conf.partial_backup_timeout; @@ -411,6 +441,9 @@ pub async fn main_task( continue 'outer; } + // limit concurrent uploads + let _upload_permit = limiter.acquire_owned().await; + let prepared = backup.prepare_upload().await; if let Some(seg) = &uploaded_segment { if seg.eq_without_status(&prepared) { diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index 43835c7f44..6bbf96d71d 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -187,6 +187,7 @@ pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { enable_offload: false, delete_offloaded_wal: false, control_file_save_interval: Duration::from_secs(1), + partial_backup_concurrency: 1, }; let mut global = GlobalMap::new(disk, conf.clone())?; From b8bbaafc0352237ffd90b91f646df886739593b2 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 28 Jun 2024 18:27:13 +0100 Subject: [PATCH 1079/1571] storage controller: fix heatmaps getting disabled during shard split (#8197) ## Problem At the start of do_tenant_shard_split, we drop any secondary location for the parent shards. The reconciler uses presence of secondary locations as a condition for enabling heatmaps. On the pageserver, child shards inherit their configuration from parents, but the storage controller assumes the child's ObservedState is the same as the parent's config from the prepare phase. The result is that some child shards end up with inaccurate ObservedState, and until something next migrates or restarts, those tenant shards aren't uploading heatmaps, so their secondary locations are downloading everything that was resident at the moment of the split (including ancestor layers which are often cleaned up shortly after the split). Closes: https://github.com/neondatabase/neon/issues/8189 ## Summary of changes - Use PlacementPolicy to control enablement of heatmap upload, rather than the literal presence of secondaries in IntentState: this way we avoid switching them off during shard split - test: during tenant split test, assert that the child shards have heatmap uploads enabled. --- storage_controller/src/reconciler.rs | 13 +++++++++++-- storage_controller/src/service.rs | 4 ++-- storage_controller/src/tenant_shard.rs | 9 +++------ test_runner/regress/test_sharding.py | 7 +++++++ 4 files changed, 23 insertions(+), 10 deletions(-) diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index fe97f724c1..886ceae90f 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -1,6 +1,7 @@ use crate::pageserver_client::PageserverClient; use crate::persistence::Persistence; use crate::service; +use pageserver_api::controller_api::PlacementPolicy; use pageserver_api::models::{ LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, }; @@ -29,6 +30,7 @@ pub(super) struct Reconciler { /// of a tenant's state from when we spawned a reconcile task. 
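
In sketch form, the fix (detailed in the summary below) is to derive "does this shard want heatmap uploads" from the declared placement policy instead of from the transient set of secondaries in IntentState; the enum here is an illustrative stand-in, not the exact controller type:

```rust
// Illustrative stand-in for the controller's PlacementPolicy; not the exact definition.
enum PlacementPolicy {
    // Attached somewhere, plus N secondary locations.
    Attached(usize),
    Secondary,
    Detached,
}

// Follow the *policy*, not the momentary intent: during a shard split the parent's
// secondaries are dropped briefly, but an Attached(n > 0) policy still wants heatmaps.
fn wants_heatmap_uploads(policy: &PlacementPolicy) -> bool {
    match policy {
        PlacementPolicy::Attached(n) => *n > 0,
        PlacementPolicy::Secondary | PlacementPolicy::Detached => false,
    }
}

fn main() {
    // Mid-split the intent may show zero secondaries, but the policy is unchanged:
    assert!(wants_heatmap_uploads(&PlacementPolicy::Attached(1)));
    assert!(!wants_heatmap_uploads(&PlacementPolicy::Attached(0)));
    assert!(!wants_heatmap_uploads(&PlacementPolicy::Secondary));
}
```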
pub(super) tenant_shard_id: TenantShardId, pub(crate) shard: ShardIdentity, + pub(crate) placement_policy: PlacementPolicy, pub(crate) generation: Option, pub(crate) intent: TargetState, @@ -641,7 +643,7 @@ impl Reconciler { generation, &self.shard, &self.config, - !self.intent.secondary.is_empty(), + &self.placement_policy, ); match self.observed.locations.get(&node.get_id()) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => { @@ -801,8 +803,15 @@ pub(crate) fn attached_location_conf( generation: Generation, shard: &ShardIdentity, config: &TenantConfig, - has_secondaries: bool, + policy: &PlacementPolicy, ) -> LocationConfig { + let has_secondaries = match policy { + PlacementPolicy::Attached(0) | PlacementPolicy::Detached | PlacementPolicy::Secondary => { + false + } + PlacementPolicy::Attached(_) => true, + }; + LocationConfig { mode: LocationConfigMode::AttachedSingle, generation: generation.into(), diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index bcc40c69a2..3965d7453d 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1390,7 +1390,7 @@ impl Service { tenant_shard.generation.unwrap(), &tenant_shard.shard, &tenant_shard.config, - false, + &PlacementPolicy::Attached(0), )), }, )]); @@ -3321,7 +3321,7 @@ impl Service { generation, &child_shard, &config, - matches!(policy, PlacementPolicy::Attached(n) if n > 0), + &policy, )), }, ); diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 45295bc59b..3fcf31ac10 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -908,12 +908,8 @@ impl TenantShard { .generation .expect("Attempted to enter attached state without a generation"); - let wanted_conf = attached_location_conf( - generation, - &self.shard, - &self.config, - !self.intent.secondary.is_empty(), - ); + let wanted_conf = + attached_location_conf(generation, &self.shard, &self.config, &self.policy); match self.observed.locations.get(&node_id) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {} Some(_) | None => { @@ -1099,6 +1095,7 @@ impl TenantShard { let mut reconciler = Reconciler { tenant_shard_id: self.tenant_shard_id, shard: self.shard, + placement_policy: self.policy.clone(), generation: self.generation, intent: reconciler_intent, detach, diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 62a9f422ee..8267d3f36c 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -542,6 +542,13 @@ def test_sharding_split_smoke( for k, v in non_default_tenant_config.items(): assert config.effective_config[k] == v + # Check that heatmap uploads remain enabled after shard split + # (https://github.com/neondatabase/neon/issues/8189) + assert ( + config.effective_config["heatmap_period"] + and config.effective_config["heatmap_period"] != "0s" + ) + # Validate pageserver state: expect every child shard to have an attached and secondary location (total, attached) = get_node_shard_counts(env, tenant_ids=[tenant_id]) assert sum(attached.values()) == split_shard_count From bc704917a38b824e683f8f1a3c05f1ae496ddf53 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Fri, 28 Jun 2024 15:13:25 -0400 Subject: [PATCH 1080/1571] fix(pageserver): ensure tenant harness has different names (#8205) rename the tenant test harness name Signed-off-by: Alex Chi Z --- pageserver/src/tenant.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 92d9c5b143..3ffbaf98c6 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -6264,7 +6264,7 @@ mod tests { #[tokio::test] async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_vectored_missing_data_key_reads")?; + let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads")?; let (tenant, ctx) = harness.load().await; let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap(); From 30027d94a26ad6624e1b0f55d3819a1c4cb8f59d Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 1 Jul 2024 01:49:49 +0300 Subject: [PATCH 1081/1571] Fix tracking of the nextMulti in the pageserver's copy of CheckPoint (#6528) Whenever we see an XLOG_MULTIXACT_CREATE_ID WAL record, we need to update the nextMulti and NextMultiOffset fields in the pageserver's copy of the CheckPoint struct, to cover the new multi-XID. In PostgreSQL, this is done by updating an in-memory struct during WAL replay, but because in Neon you can start a compute node at any LSN, we need to have an up-to-date value pre-calculated in the pageserver at all times. We do the same for nextXid. However, we had a bug in WAL ingestion code that does that: the multi-XIDs will wrap around at 2^32, just like XIDs, so we need to do the comparisons in a wraparound-aware fashion. Fix that, and add tests. Fixes issue #6520 Co-authored-by: Konstantin Knizhnik --- libs/postgres_ffi/src/xlog_utils.rs | 22 ++ .../wal_craft/src/xlog_utils_test.rs | 47 +++ pageserver/src/walingest.rs | 29 +- test_runner/regress/test_next_xid.py | 273 ++++++++++++++++++ 4 files changed, 365 insertions(+), 6 deletions(-) diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 0bbb91afc2..d25b23663b 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -356,6 +356,28 @@ impl CheckPoint { } false } + + /// Advance next multi-XID/offset to those given in arguments. + /// + /// It's important that this handles wraparound correctly. This should match the + /// MultiXactAdvanceNextMXact() logic in PostgreSQL's xlog_redo() function. + /// + /// Returns 'true' if the Checkpoint was updated. 
+ pub fn update_next_multixid(&mut self, multi_xid: u32, multi_offset: u32) -> bool { + let mut modified = false; + + if multi_xid.wrapping_sub(self.nextMulti) as i32 > 0 { + self.nextMulti = multi_xid; + modified = true; + } + + if multi_offset.wrapping_sub(self.nextMultiOffset) as i32 > 0 { + self.nextMultiOffset = multi_offset; + modified = true; + } + + modified + } } /// Generate new, empty WAL segment, with correct block headers at the first diff --git a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs index 496458b2e4..750affc94e 100644 --- a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs +++ b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs @@ -202,6 +202,53 @@ pub fn test_update_next_xid() { assert_eq!(checkpoint.nextXid.value, 2048); } +#[test] +pub fn test_update_next_multixid() { + let checkpoint_buf = [0u8; std::mem::size_of::()]; + let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap(); + + // simple case + checkpoint.nextMulti = 20; + checkpoint.nextMultiOffset = 20; + checkpoint.update_next_multixid(1000, 2000); + assert_eq!(checkpoint.nextMulti, 1000); + assert_eq!(checkpoint.nextMultiOffset, 2000); + + // No change + checkpoint.update_next_multixid(500, 900); + assert_eq!(checkpoint.nextMulti, 1000); + assert_eq!(checkpoint.nextMultiOffset, 2000); + + // Close to wraparound, but not wrapped around yet + checkpoint.nextMulti = 0xffff0000; + checkpoint.nextMultiOffset = 0xfffe0000; + checkpoint.update_next_multixid(0xffff00ff, 0xfffe00ff); + assert_eq!(checkpoint.nextMulti, 0xffff00ff); + assert_eq!(checkpoint.nextMultiOffset, 0xfffe00ff); + + // Wraparound + checkpoint.update_next_multixid(1, 900); + assert_eq!(checkpoint.nextMulti, 1); + assert_eq!(checkpoint.nextMultiOffset, 900); + + // Wraparound nextMulti to 0. + // + // It's a bit surprising that nextMulti can be 0, because that's a special value + // (InvalidMultiXactId). However, that's how Postgres does it at multi-xid wraparound: + // nextMulti wraps around to 0, but then when the next multi-xid is assigned, it skips + // the 0 and the next multi-xid actually assigned is 1. + checkpoint.nextMulti = 0xffff0000; + checkpoint.nextMultiOffset = 0xfffe0000; + checkpoint.update_next_multixid(0, 0xfffe00ff); + assert_eq!(checkpoint.nextMulti, 0); + assert_eq!(checkpoint.nextMultiOffset, 0xfffe00ff); + + // Wraparound nextMultiOffset to 0 + checkpoint.update_next_multixid(0, 0); + assert_eq!(checkpoint.nextMulti, 0); + assert_eq!(checkpoint.nextMultiOffset, 0); +} + #[test] pub fn test_encode_logical_message() { let expected = [ diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 4f26f2f6d1..fb10bca5a6 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1384,14 +1384,31 @@ impl WalIngest { // Note: The multixact members can wrap around, even within one WAL record. 
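[Editor's note - illustration only, not part of the patch.] The wraparound-aware comparison used by update_next_multixid() above (a wrapping subtraction reinterpreted as a signed 32-bit value) can be pictured with this small Python sketch; the helper name xid_follows is invented for the example:

    def xid_follows(a: int, b: int) -> bool:
        """True if 32-bit (multi-)XID 'a' is logically newer than 'b', wraparound-aware."""
        diff = (a - b) & 0xFFFFFFFF        # 32-bit wrapping subtraction
        if diff >= 0x80000000:             # reinterpret the result as a signed i32
            diff -= 0x100000000
        return diff > 0

    assert xid_follows(1000, 20)           # simple case
    assert not xid_follows(500, 1000)      # older value, no update
    assert xid_follows(1, 0xFFFF0000)      # 1 is "newer" once the counter has wrapped

This mirrors multi_xid.wrapping_sub(self.nextMulti) as i32 > 0 in the Rust code above and MultiXactIdPrecedes() in PostgreSQL.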
offset = offset.wrapping_add(n_this_page as u32); } - if xlrec.mid >= self.checkpoint.nextMulti { - self.checkpoint.nextMulti = xlrec.mid + 1; - self.checkpoint_modified = true; - } - if xlrec.moff + xlrec.nmembers > self.checkpoint.nextMultiOffset { - self.checkpoint.nextMultiOffset = xlrec.moff + xlrec.nmembers; + let next_offset = offset; + assert!(xlrec.moff.wrapping_add(xlrec.nmembers) == next_offset); + + // Update next-multi-xid and next-offset + // + // NB: In PostgreSQL, the next-multi-xid stored in the control file is allowed to + // go to 0, and it's fixed up by skipping to FirstMultiXactId in functions that + // read it, like GetNewMultiXactId(). This is different from how nextXid is + // incremented! nextXid skips over < FirstNormalTransactionId when the the value + // is stored, so it's never 0 in a checkpoint. + // + // I don't know why it's done that way, it seems less error-prone to skip over 0 + // when the value is stored rather than when it's read. But let's do it the same + // way here. + let next_multi_xid = xlrec.mid.wrapping_add(1); + + if self + .checkpoint + .update_next_multixid(next_multi_xid, next_offset) + { self.checkpoint_modified = true; } + + // Also update the next-xid with the highest member. According to the comments in + // multixact_redo(), this shouldn't be necessary, but let's do the same here. let max_mbr_xid = xlrec.members.iter().fold(None, |acc, mbr| { if let Some(max_xid) = acc { if mbr.xid.wrapping_sub(max_xid) as i32 > 0 { diff --git a/test_runner/regress/test_next_xid.py b/test_runner/regress/test_next_xid.py index b9e7e642b5..51e847135e 100644 --- a/test_runner/regress/test_next_xid.py +++ b/test_runner/regress/test_next_xid.py @@ -7,6 +7,7 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, PgBin, + VanillaPostgres, import_timeline_from_vanilla_postgres, wait_for_wal_insert_lsn, ) @@ -182,3 +183,275 @@ def test_import_at_2bil( cur = conn.cursor() cur.execute("SELECT count(*) from t") assert cur.fetchone() == (10000 + 1 + 1,) + + +# Constants and macros copied from PostgreSQL multixact.c and headers. These are needed to +# calculate the SLRU segments that a particular multixid or multixid-offsets falls into. +BLCKSZ = 8192 +MULTIXACT_OFFSETS_PER_PAGE = int(BLCKSZ / 4) +SLRU_PAGES_PER_SEGMENT = int(32) +MXACT_MEMBER_BITS_PER_XACT = 8 +MXACT_MEMBER_FLAGS_PER_BYTE = 1 +MULTIXACT_FLAGBYTES_PER_GROUP = 4 +MULTIXACT_MEMBERS_PER_MEMBERGROUP = MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE +MULTIXACT_MEMBERGROUP_SIZE = 4 * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP +MULTIXACT_MEMBERGROUPS_PER_PAGE = int(BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE) +MULTIXACT_MEMBERS_PER_PAGE = MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP + + +def MultiXactIdToOffsetSegment(xid: int): + return int(xid / (SLRU_PAGES_PER_SEGMENT * MULTIXACT_OFFSETS_PER_PAGE)) + + +def MXOffsetToMemberSegment(off: int): + return int(off / (SLRU_PAGES_PER_SEGMENT * MULTIXACT_MEMBERS_PER_PAGE)) + + +def advance_multixid_to( + pg_bin: PgBin, vanilla_pg: VanillaPostgres, next_multi_xid: int, next_multi_offset: int +): + """ + Use pg_resetwal to advance the nextMulti and nextMultiOffset values in a stand-alone + Postgres cluster. This is useful to get close to wraparound or some other interesting + value, without having to burn a lot of time consuming the (multi-)XIDs one by one. + + The new values should be higher than the old ones, in a wraparound-aware sense. 
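[Editor's note - illustration only, not part of the patch.] Using the constants and helper functions defined in the test file above (intermediate constants collapsed for brevity), the dummy SLRU segment names that the later advance_multixid_to(pg_bin, vanilla_pg, 0xFFFFFF00, 0xFFFFFF00) call has to create work out as follows:

    BLCKSZ = 8192
    SLRU_PAGES_PER_SEGMENT = 32
    MULTIXACT_OFFSETS_PER_PAGE = BLCKSZ // 4            # 2048 offsets per page
    MULTIXACT_MEMBERS_PER_PAGE = (BLCKSZ // 20) * 4     # 409 member groups per page, 4 members each

    def MultiXactIdToOffsetSegment(xid: int) -> int:
        return xid // (SLRU_PAGES_PER_SEGMENT * MULTIXACT_OFFSETS_PER_PAGE)

    def MXOffsetToMemberSegment(off: int) -> int:
        return off // (SLRU_PAGES_PER_SEGMENT * MULTIXACT_MEMBERS_PER_PAGE)

    print("%04X" % MultiXactIdToOffsetSegment(0xFFFFFF00))   # pg_multixact/offsets/FFFF
    print("%04X" % MXOffsetToMemberSegment(0xFFFFFF00))      # pg_multixact/members/14078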
+ + On entry, the server should be running. It will be shut down and restarted. + """ + + # Read old values from the last checkpoint. We will pass the old oldestMultiXid value + # back to pg_resetwal, there's no option to leave it alone. + with vanilla_pg.connect() as conn: + with conn.cursor() as cur: + # Make sure the oldest-multi-xid value in the control file is up-to-date + cur.execute("checkpoint") + cur.execute("select oldest_multi_xid, next_multixact_id from pg_control_checkpoint()") + rec = cur.fetchone() + assert rec is not None + (ckpt_oldest_multi_xid, ckpt_next_multi_xid) = rec + log.info(f"oldestMultiXid was {ckpt_oldest_multi_xid}, nextMultiXid was {ckpt_next_multi_xid}") + log.info(f"Resetting to {next_multi_xid}") + + # Use pg_resetwal to reset the next multiXid and multiOffset to given values. + vanilla_pg.stop() + pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, "pg_resetwal") + cmd = [ + pg_resetwal_path, + f"--multixact-ids={next_multi_xid},{ckpt_oldest_multi_xid}", + f"--multixact-offset={next_multi_offset}", + "-D", + str(vanilla_pg.pgdatadir), + ] + pg_bin.run_capture(cmd) + + # Because we skip over a lot of values, Postgres hasn't created the SLRU segments for + # the new values yet. Create them manually, to allow Postgres to start up. + # + # This leaves "gaps" in the SLRU where segments between old value and new value are + # missing. That's OK for our purposes. Autovacuum will print some warnings about the + # missing segments, but will clean it up by truncating the SLRUs up to the new value, + # closing the gap. + segname = "%04X" % MultiXactIdToOffsetSegment(next_multi_xid) + log.info(f"Creating dummy segment pg_multixact/offsets/{segname}") + with open(vanilla_pg.pgdatadir / "pg_multixact" / "offsets" / segname, "w") as of: + of.write("\0" * SLRU_PAGES_PER_SEGMENT * BLCKSZ) + of.flush() + + segname = "%04X" % MXOffsetToMemberSegment(next_multi_offset) + log.info(f"Creating dummy segment pg_multixact/members/{segname}") + with open(vanilla_pg.pgdatadir / "pg_multixact" / "members" / segname, "w") as of: + of.write("\0" * SLRU_PAGES_PER_SEGMENT * BLCKSZ) + of.flush() + + # Start Postgres again and wait until autovacuum has processed all the databases + # + # This allows truncating the SLRUs, fixing the gaps with missing segments. + vanilla_pg.start() + with vanilla_pg.connect().cursor() as cur: + for _ in range(1000): + datminmxid = int( + query_scalar(cur, "select min(datminmxid::text::int8) from pg_database") + ) + log.info(f"datminmxid {datminmxid}") + if next_multi_xid - datminmxid < 1_000_000: # not wraparound-aware! + break + time.sleep(0.5) + + +def test_multixid_wraparound_import( + neon_env_builder: NeonEnvBuilder, + test_output_dir: Path, + pg_bin: PgBin, + vanilla_pg, +): + """ + Test that the wraparound of the "next-multi-xid" counter is handled correctly in + pageserver, And multi-offsets as well + """ + env = neon_env_builder.init_start() + + # In order to to test multixid wraparound, we need to first advance the counter to + # within spitting distance of the wraparound, that is 2^32 multi-XIDs. We could simply + # run a workload that consumes a lot of multi-XIDs until we approach that, but that + # takes a very long time. So we cheat. + # + # Our strategy is to create a vanilla Postgres cluster, and use pg_resetwal to + # directly set the multi-xid counter a higher value. However, we cannot directly set + # it to just before 2^32 (~ 4 billion), because that would make the exisitng + # 'relminmxid' values to look like they're in the future. 
It's not clear how the + # system would behave in that situation. So instead, we bump it up ~ 1 billion + # multi-XIDs at a time, and let autovacuum to process all the relations and update + # 'relminmxid' between each run. + # + # XXX: For the multi-offsets, most of the bump is done in the last call. This is + # because advancing it ~ 1 billion at a time hit a pathological case in the + # MultiXactMemberFreezeThreshold() function, causing autovacuum not trigger multixid + # freezing. See + # https://www.postgresql.org/message-id/85fb354c-f89f-4d47-b3a2-3cbd461c90a3%40iki.fi + # Multi-offsets don't have the same wraparound problems at 2 billion mark as + # multi-xids do, so one big jump is fine. + vanilla_pg.configure( + [ + "log_autovacuum_min_duration = 0", + # Perform anti-wraparound vacuuming aggressively + "autovacuum_naptime='1 s'", + "autovacuum_freeze_max_age = 1000000", + "autovacuum_multixact_freeze_max_age = 1000000", + ], + ) + vanilla_pg.start() + advance_multixid_to(pg_bin, vanilla_pg, 0x40000000, 0x10000000) + advance_multixid_to(pg_bin, vanilla_pg, 0x80000000, 0x20000000) + advance_multixid_to(pg_bin, vanilla_pg, 0xC0000000, 0x30000000) + advance_multixid_to(pg_bin, vanilla_pg, 0xFFFFFF00, 0xFFFFFF00) + + vanilla_pg.safe_psql("create user cloud_admin with password 'postgres' superuser") + vanilla_pg.safe_psql("create table tt as select g as id from generate_series(1, 10) g") + vanilla_pg.safe_psql("CHECKPOINT") + + # Import the cluster to the pageserver + tenant_id = TenantId.generate() + env.pageserver.tenant_create(tenant_id) + timeline_id = TimelineId.generate() + import_timeline_from_vanilla_postgres( + test_output_dir, + env, + pg_bin, + tenant_id, + timeline_id, + "imported_multixid_wraparound_test", + vanilla_pg.connstr(), + ) + vanilla_pg.stop() + + endpoint = env.endpoints.create_start( + "imported_multixid_wraparound_test", + tenant_id=tenant_id, + config_lines=[ + "log_autovacuum_min_duration = 0", + "autovacuum_naptime='5 s'", + "autovacuum=off", + ], + ) + conn = endpoint.connect() + cur = conn.cursor() + assert query_scalar(cur, "select count(*) from tt") == 10 # sanity check + + # Install extension containing function needed for test + cur.execute("CREATE EXTENSION neon_test_utils") + + # Consume a lot of XIDs, just to advance the XIDs to different range than the + # multi-xids. That avoids confusion while debugging + cur.execute("select test_consume_xids(100000)") + cur.execute("select pg_switch_wal()") + cur.execute("checkpoint") + + # Use subtransactions so that each row in 'tt' is stamped with different XID. Leave + # the transaction open. + cur.execute("BEGIN") + cur.execute( + """ +do $$ +declare + idvar int; +begin + for idvar in select id from tt loop + begin + update tt set id = idvar where id = idvar; + exception when others then + raise 'didn''t expect an error: %', sqlerrm; + end; + end loop; +end; +$$; +""" + ) + + # In a different transaction, acquire a FOR KEY SHARE lock on each row. This generates + # a new multixid for each row, with the previous xmax and this transaction's XID as the + # members. + # + # Repeat this until the multi-xid counter wraps around. + conn3 = endpoint.connect() + cur3 = conn3.cursor() + next_multixact_id_before_restart = 0 + observed_before_wraparound = False + while True: + cur3.execute("BEGIN") + cur3.execute("SELECT * FROM tt FOR KEY SHARE") + + # Get the xmax of one of the rows we locked. It should be a multi-xid. It might + # not be the latest one, but close enough. 
+ row_xmax = int(query_scalar(cur3, "SELECT xmax FROM tt LIMIT 1")) + cur3.execute("COMMIT") + log.info(f"observed a row with xmax {row_xmax}") + + # High value means not wrapped around yet + if row_xmax >= 0xFFFFFF00: + observed_before_wraparound = True + continue + + # xmax should not be a regular XID. (We bumped up the regular XID range earlier + # to around 100000 and above.) + assert row_xmax < 100 + + # xmax values < FirstNormalTransactionId (== 3) could be special XID values, or + # multixid values after wraparound. We don't know for sure which, so keep going to + # be sure we see value that's unambiguously a wrapped-around multixid + if row_xmax < 3: + continue + + next_multixact_id_before_restart = row_xmax + log.info( + f"next_multixact_id is now at {next_multixact_id_before_restart} or a little higher" + ) + break + + # We should have observed the state before wraparound + assert observed_before_wraparound + + cur.execute("COMMIT") + + # Wait until pageserver has received all the data, and restart the endpoint + wait_for_wal_insert_lsn(env, endpoint, tenant_id, timeline_id) + endpoint.stop(mode="immediate") # 'immediate' to avoid writing shutdown checkpoint + endpoint.start() + + # Check that the next-multixid value wrapped around correctly + conn = endpoint.connect() + cur = conn.cursor() + cur.execute("select next_multixact_id from pg_control_checkpoint()") + next_multixact_id_after_restart = int( + query_scalar(cur, "select next_multixact_id from pg_control_checkpoint()") + ) + log.info(f"next_multixact_id after restart: {next_multixact_id_after_restart}") + assert next_multixact_id_after_restart >= next_multixact_id_before_restart + + # The multi-offset should wrap around as well + cur.execute("select next_multi_offset from pg_control_checkpoint()") + next_multi_offset_after_restart = int( + query_scalar(cur, "select next_multi_offset from pg_control_checkpoint()") + ) + log.info(f"next_multi_offset after restart: {next_multi_offset_after_restart}") + assert next_multi_offset_after_restart < 100000 From 57535c039c938f7c179693d9db8b052912019823 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 1 Jul 2024 11:23:31 +0300 Subject: [PATCH 1082/1571] tests: remove a leftover 'running' flag (#8216) The 'running' boolean was replaced with a semaphore in commit f0e2bb79b2, but this initialization was missed. Remove it so that if a test tries to access it, you get an error rather than always claiming that the endpoint is not running. 
Spotted by Arseny at https://github.com/neondatabase/neon/pull/7288#discussion_r1660068657 --- test_runner/fixtures/neon_fixtures.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 4911917bf4..a1cb1b5195 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3491,7 +3491,6 @@ class Endpoint(PgProtocol, LogUtils): ): super().__init__(host="localhost", port=pg_port, user="cloud_admin", dbname="postgres") self.env = env - self.running = False self.branch_name: Optional[str] = None # dubious self.endpoint_id: Optional[str] = None # dubious, see asserts below self.pgdata_dir: Optional[str] = None # Path to computenode PGDATA From 75c84c846a2517cbbe414ae5f3e0649f4a359036 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 1 Jul 2024 12:58:08 +0300 Subject: [PATCH 1083/1571] tests: Make neon_xlogflush() flush all WAL, if you omit the LSN arg This makes it much more convenient to use in the common case that you want to flush all the WAL. (Passing pg_current_wal_insert_lsn() as the argument doesn't work for the same reasons as explained in the comments: we need to back off to the beginning of a page if the previous record ended at a page boundary.) I plan to use this to fix the issue that Arseny Sher called out at https://github.com/neondatabase/neon/pull/7288#discussion_r1660063852 --- pgxn/neon_test_utils/Makefile | 2 +- ...tils--1.1.sql => neon_test_utils--1.2.sql} | 2 +- pgxn/neon_test_utils/neon_test_utils.control | 2 +- pgxn/neon_test_utils/neontest.c | 38 ++++++++++++++++++- 4 files changed, 40 insertions(+), 4 deletions(-) rename pgxn/neon_test_utils/{neon_test_utils--1.1.sql => neon_test_utils--1.2.sql} (96%) diff --git a/pgxn/neon_test_utils/Makefile b/pgxn/neon_test_utils/Makefile index 1ee87357e5..1371272439 100644 --- a/pgxn/neon_test_utils/Makefile +++ b/pgxn/neon_test_utils/Makefile @@ -7,7 +7,7 @@ OBJS = \ neontest.o EXTENSION = neon_test_utils -DATA = neon_test_utils--1.1.sql +DATA = neon_test_utils--1.2.sql PGFILEDESC = "neon_test_utils - helpers for neon testing and debugging" PG_CONFIG = pg_config diff --git a/pgxn/neon_test_utils/neon_test_utils--1.1.sql b/pgxn/neon_test_utils/neon_test_utils--1.2.sql similarity index 96% rename from pgxn/neon_test_utils/neon_test_utils--1.1.sql rename to pgxn/neon_test_utils/neon_test_utils--1.2.sql index 534784f319..f84a24ec8d 100644 --- a/pgxn/neon_test_utils/neon_test_utils--1.1.sql +++ b/pgxn/neon_test_utils/neon_test_utils--1.2.sql @@ -41,7 +41,7 @@ RETURNS bytea AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn_ex' LANGUAGE C PARALLEL UNSAFE; -CREATE FUNCTION neon_xlogflush(lsn pg_lsn) +CREATE FUNCTION neon_xlogflush(lsn pg_lsn DEFAULT NULL) RETURNS VOID AS 'MODULE_PATHNAME', 'neon_xlogflush' LANGUAGE C PARALLEL UNSAFE; diff --git a/pgxn/neon_test_utils/neon_test_utils.control b/pgxn/neon_test_utils/neon_test_utils.control index 5f6d640835..c7b9191ddc 100644 --- a/pgxn/neon_test_utils/neon_test_utils.control +++ b/pgxn/neon_test_utils/neon_test_utils.control @@ -1,6 +1,6 @@ # neon_test_utils extension comment = 'helpers for neon testing and debugging' -default_version = '1.1' +default_version = '1.2' module_pathname = '$libdir/neon_test_utils' relocatable = true trusted = true diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c index 47f245fbf1..944936d395 100644 --- a/pgxn/neon_test_utils/neontest.c +++ b/pgxn/neon_test_utils/neontest.c @@ -15,6 +15,7 @@ #include
"access/relation.h" #include "access/xact.h" #include "access/xlog.h" +#include "access/xlog_internal.h" #include "catalog/namespace.h" #include "fmgr.h" #include "funcapi.h" @@ -444,11 +445,46 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) /* * Directly calls XLogFlush(lsn) to flush WAL buffers. + * + * If 'lsn' is not specified (is NULL), flush all generated WAL. */ Datum neon_xlogflush(PG_FUNCTION_ARGS) { - XLogRecPtr lsn = PG_GETARG_LSN(0); + XLogRecPtr lsn; + + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("cannot flush WAL during recovery."))); + + if (!PG_ARGISNULL(0)) + lsn = PG_GETARG_LSN(0); + else + { + lsn = GetXLogInsertRecPtr(); + + /*--- + * The LSN returned by GetXLogInsertRecPtr() is the position where the + * next inserted record would begin. If the last record ended just at + * the page boundary, the next record will begin after the page header + * on the next page, and that's what GetXLogInsertRecPtr().returns, + * but the page header has not been written yet. If we tried to flush + * it, XLogFlush() would throw an error: + * + * ERROR : xlog flush request %X/%X is not satisfied --- flushed only to %X/%X + * + * To avoid that, if the insert position points to just after the page + * header, back off to page boundary. + */ + if (lsn % XLOG_BLCKSZ == SizeOfXLogShortPHD && + XLogSegmentOffset(lsn, wal_segment_size) > XLOG_BLCKSZ) + lsn -= SizeOfXLogShortPHD; + else if (lsn % XLOG_BLCKSZ == SizeOfXLogLongPHD && + XLogSegmentOffset(lsn, wal_segment_size) < XLOG_BLCKSZ) + lsn -= SizeOfXLogLongPHD; + } XLogFlush(lsn); PG_RETURN_VOID(); From 9ce193082a26714400a788f96e0c0cf95c7879df Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 1 Jul 2024 12:58:12 +0300 Subject: [PATCH 1084/1571] Restore running xacts from CLOG on replica startup (#7288) We have one pretty serious MVCC visibility bug with hot standby replicas. We incorrectly treat any transactions that are in progress in the primary, when the standby is started, as aborted. That can break MVCC for queries running concurrently in the standby. It can also lead to hint bits being set incorrectly, and that damage can last until the replica is restarted. The fundamental bug was that we treated any replica start as starting from a shut down server. The fix for that is straightforward: we need to set 'wasShutdown = false' in InitWalRecovery() (see changes in the postgres repo). However, that introduces a new problem: with wasShutdown = false, the standby will not open up for queries until it receives a running-xacts WAL record from the primary. That's correct, and that's how Postgres hot standby always works. But it's a problem for Neon, because: * It changes the historical behavior for existing users. Currently, the standby immediately opens up for queries, so if they now need to wait, we can breka existing use cases that were working fine (assuming you don't hit the MVCC issues). * The problem is much worse for Neon than it is for standalone PostgreSQL, because in Neon, we can start a replica from an arbitrary LSN. In standalone PostgreSQL, the replica always starts WAL replay from a checkpoint record, and the primary arranges things so that there is always a running-xacts record soon after each checkpoint record. You can still hit this issue with PostgreSQL if you have a transaction with lots of subtransactions running in the primary, but it's pretty rare in practice. 
To mitigate that, we introduce another way to collect the running-xacts information at startup, without waiting for the running-xacts WAL record: We can scan the CLOG for XIDs that haven't been marked as committed or aborted. It has limitations with subtransactions too, but should mitigate the problem for most users. See https://github.com/neondatabase/neon/issues/7236. Co-authored-by: Konstantin Knizhnik --- pageserver/src/walingest.rs | 40 +- pgxn/neon/neon.c | 293 ++++ test_runner/fixtures/neon_fixtures.py | 4 +- test_runner/fixtures/pageserver/utils.py | 2 +- test_runner/regress/test_replica_start.py | 646 ++++++++++++++++++ test_runner/regress/test_replication_start.py | 32 - vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 6 +- 10 files changed, 981 insertions(+), 48 deletions(-) create mode 100644 test_runner/regress/test_replica_start.py delete mode 100644 test_runner/regress/test_replication_start.py diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index fb10bca5a6..07c90385e6 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -343,7 +343,33 @@ impl WalIngest { xlog_checkpoint.oldestActiveXid, self.checkpoint.oldestActiveXid ); - self.checkpoint.oldestActiveXid = xlog_checkpoint.oldestActiveXid; + + // A shutdown checkpoint has `oldestActiveXid == InvalidTransactionid`, + // because at shutdown, all in-progress transactions will implicitly + // end. Postgres startup code knows that, and allows hot standby to start + // immediately from a shutdown checkpoint. + // + // In Neon, Postgres hot standby startup always behaves as if starting from + // an online checkpoint. It needs a valid `oldestActiveXid` value, so + // instead of overwriting self.checkpoint.oldestActiveXid with + // InvalidTransactionid from the checkpoint WAL record, update it to a + // proper value, knowing that there are no in-progress transactions at this + // point, except for prepared transactions. + // + // See also the neon code changes in the InitWalRecovery() function. + if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID + && info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN + { + let mut oldest_active_xid = self.checkpoint.nextXid.value as u32; + for xid in modification.tline.list_twophase_files(lsn, ctx).await? { + if (xid.wrapping_sub(oldest_active_xid) as i32) < 0 { + oldest_active_xid = xid; + } + } + self.checkpoint.oldestActiveXid = oldest_active_xid; + } else { + self.checkpoint.oldestActiveXid = xlog_checkpoint.oldestActiveXid; + } // Write a new checkpoint key-value pair on every checkpoint record, even // if nothing really changed. Not strictly required, but it seems nice to @@ -375,6 +401,7 @@ impl WalIngest { if info == pg_constants::XLOG_RUNNING_XACTS { let xlrec = crate::walrecord::XlRunningXacts::decode(&mut buf); self.checkpoint.oldestActiveXid = xlrec.oldest_running_xid; + self.checkpoint_modified = true; } } pg_constants::RM_REPLORIGIN_ID => { @@ -1277,13 +1304,10 @@ impl WalIngest { xlrec.pageno, xlrec.oldest_xid, xlrec.oldest_xid_db ); - // Here we treat oldestXid and oldestXidDB - // differently from postgres redo routines. - // In postgres checkpoint.oldestXid lags behind xlrec.oldest_xid - // until checkpoint happens and updates the value. - // Here we can use the most recent value. - // It's just an optimization, though and can be deleted. - // TODO Figure out if there will be any issues with replica.
+ // In Postgres, oldestXid and oldestXidDB are updated in memory when the CLOG is + // truncated, but a checkpoint record with the updated values isn't written until + // later. In Neon, a server can start at any LSN, not just on a checkpoint record, + // so we keep the oldestXid and oldestXidDB up-to-date. self.checkpoint.oldestXid = xlrec.oldest_xid; self.checkpoint.oldestXidDB = xlrec.oldest_xid_db; self.checkpoint_modified = true; diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index b6b2db7e71..e4968bdf89 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -12,6 +12,8 @@ #include "fmgr.h" #include "miscadmin.h" +#include "access/subtrans.h" +#include "access/twophase.h" #include "access/xact.h" #include "access/xlog.h" #include "storage/buf_internals.h" @@ -22,10 +24,12 @@ #include "replication/logical.h" #include "replication/slot.h" #include "replication/walsender.h" +#include "storage/proc.h" #include "storage/procsignal.h" #include "tcop/tcopprot.h" #include "funcapi.h" #include "access/htup_details.h" +#include "utils/builtins.h" #include "utils/pg_lsn.h" #include "utils/guc.h" #include "utils/wait_event.h" @@ -266,6 +270,293 @@ LogicalSlotsMonitorMain(Datum main_arg) } } +/* + * XXX: These private to procarray.c, but we need them here. + */ +#define PROCARRAY_MAXPROCS (MaxBackends + max_prepared_xacts) +#define TOTAL_MAX_CACHED_SUBXIDS \ + ((PGPROC_MAX_CACHED_SUBXIDS + 1) * PROCARRAY_MAXPROCS) + +/* + * Restore running-xact information by scanning the CLOG at startup. + * + * In PostgreSQL, a standby always has to wait for a running-xacts WAL record + * to arrive before it can start accepting queries. Furthermore, if there are + * transactions with too many subxids (> 64) open to fit in the in-memory + * subxids cache, the running-xacts record will be marked as "suboverflowed", + * and the standby will need to also wait for the currently in-progress + * transactions to finish. + * + * That's not great in PostgreSQL, because a hot standby does not necessary + * open up for queries immediately as you might expect. But it's worse in + * Neon: A standby in Neon doesn't need to start WAL replay from a checkpoint + * record; it can start at any LSN. Postgres arranges things so that there is + * a running-xacts record soon after every checkpoint record, but when you + * start from an arbitrary LSN, that doesn't help. If the primary is idle, or + * not running at all, it might never write a new running-xacts record, + * leaving the replica in a limbo where it can never start accepting queries. + * + * To mitigate that, we have an additional mechanism to find the running-xacts + * information: we scan the CLOG, making note of any XIDs not marked as + * committed or aborted. They are added to the Postgres known-assigned XIDs + * array by calling ProcArrayApplyRecoveryInfo() in the caller of this + * function. + * + * There is one big limitation with that mechanism: The size of the + * known-assigned XIDs is limited, so if there are a lot of in-progress XIDs, + * we have to give up. Furthermore, we don't know how many of the in-progress + * XIDs are subtransactions, and if we use up all the space in the + * known-assigned XIDs array for subtransactions, we might run out of space in + * the array later during WAL replay, causing the replica to shut down with + * "ERROR: too many KnownAssignedXids". 
The safe # of XIDs that we can add to + * the known-assigned array without risking that error later is very low, + * merely PGPROC_MAX_CACHED_SUBXIDS == 64, so we take our chances and use up + * to half of the known-assigned XIDs array for the subtransactions, even + * though that risks getting the error later. + * + * Note: It's OK if the recovered list of XIDs includes some transactions that + * have crashed in the primary, and hence will never commit. They will be seen + * as in-progress, until we see a new running-xacts record with an + * oldestActiveXid that invalidates them. That's how the known-assigned XIDs + * array always works. + * + * If scraping the CLOG doesn't succeed for some reason, like the subxid + * overflow, Postgres will fall back to waiting for a running-xacts record + * like usual. + * + * Returns true if a complete list of in-progress XIDs was scraped. + */ +static bool +RestoreRunningXactsFromClog(CheckPoint *checkpoint, TransactionId **xids, int *nxids) +{ + TransactionId from; + TransactionId till; + int max_xcnt; + TransactionId *prepared_xids = NULL; + int n_prepared_xids; + TransactionId *restored_xids = NULL; + int n_restored_xids; + int next_prepared_idx; + + Assert(*xids == NULL); + + /* + * If the checkpoint doesn't have a valid oldestActiveXid, bail out. We + * don't know where to start the scan. + * + * This shouldn't happen, because the pageserver always maintains a valid + * oldestActiveXid nowadays. Except when starting at an old point in time + * that was ingested before the pageserver was taught to do that. + */ + if (!TransactionIdIsValid(checkpoint->oldestActiveXid)) + { + elog(LOG, "cannot restore running-xacts from CLOG because oldestActiveXid is not set"); + goto fail; + } + + /* + * We will scan the CLOG starting from the oldest active XID. + * + * In some corner cases, the oldestActiveXid from the last checkpoint + * might already have been truncated from the CLOG. That is, + * oldestActiveXid might be older than oldestXid. That's possible because + * oldestActiveXid is only updated at checkpoints. After the last + * checkpoint, the oldest transaction might have committed, and the CLOG + * might also have been already truncated. So if oldestActiveXid is older + * than oldestXid, start at oldestXid instead. (Otherwise we'd try to + * access CLOG segments that have already been truncated away.) + */ + from = TransactionIdPrecedes(checkpoint->oldestXid, checkpoint->oldestActiveXid) + ? checkpoint->oldestActiveXid : checkpoint->oldestXid; + till = XidFromFullTransactionId(checkpoint->nextXid); + + /* + * To avoid "too many KnownAssignedXids" error later during replay, we + * limit the number of collected transactions. This is a tradeoff: if we are + * willing to consume more of the KnownAssignedXids space for the XIDs + * now, that allows us to start up, but we might run out of space later. + * + * The size of the KnownAssignedXids array is TOTAL_MAX_CACHED_SUBXIDS, + * which is (PGPROC_MAX_CACHED_SUBXIDS + 1) * PROCARRAY_MAXPROCS. In + * PostgreSQL, that's always enough because the primary will always write + * an XLOG_XACT_ASSIGNMENT record if a transaction has more than + * PGPROC_MAX_CACHED_SUBXIDS subtransactions. Seeing that record allows + * the standby to mark the XIDs in pg_subtrans and remove them from the + * KnownAssignedXids array. + * + * Here, we don't know which XIDs belong to subtransactions that have + * already been WAL-logged with an XLOG_XACT_ASSIGNMENT record.
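[Editor's note - illustrative arithmetic, not part of the patch.] Plugging representative numbers into the formula above, simplifying MaxBackends to max_connections = 100 (in reality it also counts autovacuum and background workers) with max_prepared_transactions = 5:

    PGPROC_MAX_CACHED_SUBXIDS = 64
    PROCARRAY_MAXPROCS = 100 + 5                    # MaxBackends + max_prepared_xacts (assumed values)
    TOTAL_MAX_CACHED_SUBXIDS = (PGPROC_MAX_CACHED_SUBXIDS + 1) * PROCARRAY_MAXPROCS   # 6825
    max_xcnt = TOTAL_MAX_CACHED_SUBXIDS // 2        # 3412 XIDs may be consumed by the CLOG scan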
If we + * wanted to be totally safe and avoid the possibility of getting a "too + * many KnownAssignedXids" error later, we would have to limit ourselves + * to PGPROC_MAX_CACHED_SUBXIDS, which is not much. And that includes top + * transaction IDs too, because we cannot distinguish between top + * transaction IDs and subtransactions here. + * + * Somewhat arbitrarily, we use up to half of KnownAssignedXids. That + * strikes a sensible balance between being useful, and risking a "too + * many KnownAssignedXids" error later. + */ + max_xcnt = TOTAL_MAX_CACHED_SUBXIDS / 2; + + /* + * Collect XIDs of prepared transactions in an array. This includes only + * their top-level XIDs. We assume that StandbyRecoverPreparedTransactions + * has already been called, so we can find all the sub-transactions in + * pg_subtrans. + */ + PrescanPreparedTransactions(&prepared_xids, &n_prepared_xids); + qsort(prepared_xids, n_prepared_xids, sizeof(TransactionId), xidLogicalComparator); + + /* + * Scan the CLOG, collecting in-progress XIDs into 'restored_xids'. + */ + elog(DEBUG1, "scanning CLOG between %u and %u for in-progress XIDs", from, till); + restored_xids = (TransactionId *) palloc(max_xcnt * sizeof(TransactionId)); + n_restored_xids = 0; + next_prepared_idx = 0; + for (TransactionId xid = from; xid != till;) + { + XLogRecPtr xidlsn; + XidStatus xidstatus; + + xidstatus = TransactionIdGetStatus(xid, &xidlsn); + + /* + * "Merge" the prepared transactions into the restored_xids array as + * we go. The prepared transactions array is sorted. This is mostly + * a sanity check to ensure that all the prpeared transactions are + * seen as in-progress. (There is a check after the loop that we didn't + * miss any.) + */ + if (next_prepared_idx < n_prepared_xids && xid == prepared_xids[next_prepared_idx]) + { + /* + * This is a top-level transaction ID of a prepared transaction. + * Include it in the array. + */ + + /* sanity check */ + if (xidstatus != TRANSACTION_STATUS_IN_PROGRESS) + { + elog(LOG, "prepared transaction %u has unexpected status %X, cannot restore running-xacts from CLOG", + xid, xidstatus); + Assert(false); + goto fail; + } + + elog(DEBUG1, "XID %u: was next prepared xact (%d / %d)", xid, next_prepared_idx, n_prepared_xids); + next_prepared_idx++; + } + else if (xidstatus == TRANSACTION_STATUS_COMMITTED) + { + elog(DEBUG1, "XID %u: was committed", xid); + goto skip; + } + else if (xidstatus == TRANSACTION_STATUS_ABORTED) + { + elog(DEBUG1, "XID %u: was aborted", xid); + goto skip; + } + else if (xidstatus == TRANSACTION_STATUS_IN_PROGRESS) + { + /* + * In-progress transactions are included in the array. + * + * Except subtransactions of the prepared transactions. They are + * already set in pg_subtrans, and hence don't need to be tracked + * in the known-assigned XIDs array. + */ + if (n_prepared_xids > 0) + { + TransactionId parent = SubTransGetParent(xid); + + if (TransactionIdIsValid(parent)) + { + /* + * This is a subtransaction belonging to a prepared + * transaction. + * + * Sanity check that it is in the prepared XIDs array. It + * should be, because StandbyRecoverPreparedTransactions + * populated pg_subtrans, and no other XID should be set + * in it yet. (This also relies on the fact that + * StandbyRecoverPreparedTransactions sets the parent of + * each subxid to point directly to the top-level XID, + * rather than restoring the original subtransaction + * hierarchy.) 
+ */ + if (bsearch(&parent, prepared_xids, next_prepared_idx, + sizeof(TransactionId), xidLogicalComparator) == NULL) + { + elog(LOG, "sub-XID %u has unexpected parent %u, cannot restore running-xacts from CLOG", + xid, parent); + Assert(false); + goto fail; + } + elog(DEBUG1, "XID %u: was a subtransaction of prepared xid %u", xid, parent); + goto skip; + } + } + + /* include it in the array */ + elog(DEBUG1, "XID %u: is in progress", xid); + } + else + { + /* + * SUB_COMMITTED is a transient state used at commit. We don't + * expect to see that here. + */ + elog(LOG, "XID %u has unexpected status %X in pg_xact, cannot restore running-xacts from CLOG", + xid, xidstatus); + Assert(false); + goto fail; + } + + if (n_restored_xids >= max_xcnt) + { + /* + * Overflowed. We won't be able to install the RunningTransactions + * snapshot. + */ + elog(LOG, "too many running xacts to restore from the CLOG; oldestXid=%u oldestActiveXid=%u nextXid %u", + checkpoint->oldestXid, checkpoint->oldestActiveXid, + XidFromFullTransactionId(checkpoint->nextXid)); + goto fail; + } + + restored_xids[n_restored_xids++] = xid; + + skip: + TransactionIdAdvance(xid); + continue; + } + + /* sanity check */ + if (next_prepared_idx != n_prepared_xids) + { + elog(LOG, "prepared transaction ID %u was not visited in the CLOG scan, cannot restore running-xacts from CLOG", + prepared_xids[next_prepared_idx]); + Assert(false); + goto fail; + } + + elog(LOG, "restored %d running xacts by scanning the CLOG; oldestXid=%u oldestActiveXid=%u nextXid %u", + n_restored_xids, checkpoint->oldestXid, checkpoint->oldestActiveXid, XidFromFullTransactionId(checkpoint->nextXid)); + *nxids = n_restored_xids; + *xids = restored_xids; + return true; + + fail: + *nxids = 0; + *xids = NULL; + if (restored_xids) + pfree(restored_xids); + if (prepared_xids) + pfree(prepared_xids); + return false; +} + void _PG_init(void) { @@ -288,6 +579,8 @@ _PG_init(void) pg_init_extension_server(); + restore_running_xacts_callback = RestoreRunningXactsFromClog; + /* * Important: This must happen after other parts of the extension are * loaded, otherwise any settings to GUCs that were set before the diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index a1cb1b5195..e1c8514351 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3856,7 +3856,9 @@ class EndpointFactory: return self - def new_replica(self, origin: Endpoint, endpoint_id: str, config_lines: Optional[List[str]]): + def new_replica( + self, origin: Endpoint, endpoint_id: str, config_lines: Optional[List[str]] = None + ): branch_name = origin.branch_name assert origin in self.endpoints assert branch_name is not None diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 60535b7592..b75a480a63 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -198,7 +198,7 @@ def wait_for_last_record_lsn( lsn: Lsn, ) -> Lsn: """waits for pageserver to catch up to a certain lsn, returns the last observed lsn.""" - for i in range(100): + for i in range(1000): current_lsn = last_record_lsn(pageserver_http, tenant, timeline) if current_lsn >= lsn: return current_lsn diff --git a/test_runner/regress/test_replica_start.py b/test_runner/regress/test_replica_start.py new file mode 100644 index 0000000000..17d476a8a6 --- /dev/null +++ b/test_runner/regress/test_replica_start.py @@ -0,0 +1,646 @@ +""" +In PostgreSQL, a standby always has to 
wait for a running-xacts WAL record to +arrive before it can start accepting queries. Furthermore, if there are +transactions with too many subxids (> 64) open to fit in the in-memory subxids +cache, the running-xacts record will be marked as "suboverflowed", and the +standby will need to also wait for the currently in-progress transactions to +finish. + +In Neon, we have an additional mechanism that scans the CLOG at server startup +to determine the list of running transactions, so that the standby can start up +immediately without waiting for the running-xacts record, but that mechanism +only works if the # of active (sub-)transactions is reasonably small. Otherwise +it falls back to waiting. Furthermore, it's somewhat optimistic in using up the +known-assigned XIDs array: if too many transactions with subxids are started in +the primary later, the replay in the replica will crash with "too many +KnownAssignedXids" error. + +This module contains tests for those various cases at standby startup: starting +from shutdown checkpoint, using the CLOG scanning mechanism, waiting for +running-xacts record and for in-progress transactions to finish etc. +""" + +import threading +from contextlib import closing + +import psycopg2 +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, wait_for_last_flush_lsn, wait_replica_caughtup +from fixtures.pg_version import PgVersion +from fixtures.utils import query_scalar, wait_until + +CREATE_SUBXACTS_FUNC = """ +create or replace function create_subxacts(n integer) returns void as $$ +declare + i integer; +begin + for i in 1..n loop + begin + insert into t (payload) values (0); + exception + when others then + raise exception 'caught something: %', sqlerrm; + end; + end loop; +end; $$ language plpgsql +""" + + +def test_replica_start_scan_clog(neon_simple_env: NeonEnv): + """ + Test the CLOG-scanning mechanism at hot standby startup. There is one + transaction active in the primary when the standby is started. The primary + is killed before it has a chance to write a running-xacts record. The + CLOG-scanning at neon startup allows the standby to start up anyway. + + See the module docstring for background. + """ + + # Initialize the primary, a test table, and a helper function to create lots + # of subtransactions. + env = neon_simple_env + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("CREATE EXTENSION neon_test_utils") + primary_cur.execute("create table t(pk serial primary key, payload integer)") + primary_cur.execute(CREATE_SUBXACTS_FUNC) + primary_cur.execute("select pg_switch_wal()") + + # Start a transaction in the primary. Leave the transaction open. + # + # The transaction has some subtransactions, but not too many to cause the + # CLOG-scanning mechanism to give up. + primary_cur.execute("begin") + primary_cur.execute("select create_subxacts(50)") + + # Wait for the WAL to be flushed, but then immediately kill the primary, + # before it has a chance to generate a running-xacts record. + primary_cur.execute("select neon_xlogflush()") + wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) + primary.stop(mode="immediate") + + # Create a replica. It should start up normally, thanks to the CLOG-scanning + # mechanism. 
+ secondary = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") + + # The transaction did not commit, so it should not be visible in the secondary + secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (0,) + + +def test_replica_start_scan_clog_crashed_xids(neon_simple_env: NeonEnv): + """ + Test the CLOG-scanning mechanism at hot standby startup, after + leaving behind crashed transactions. + + See the module docstring for background. + """ + + # Initialize the primary, a test table, and a helper function to create lots + # of subtransactions. + env = neon_simple_env + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("create table t(pk serial primary key, payload integer)") + primary_cur.execute(CREATE_SUBXACTS_FUNC) + primary_cur.execute("select pg_switch_wal()") + + # Consume a lot of XIDs, then kill Postgres without giving it a + # chance to write abort records for them. + primary_cur.execute("begin") + primary_cur.execute("select create_subxacts(100000)") + primary.stop(mode="immediate") + + # Restart the primary. Do some light work, and shut it down cleanly + primary.start() + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("insert into t (payload) values (0)") + primary.stop(mode="fast") + + # Create a replica. It should start up normally, thanks to the CLOG-scanning + # mechanism. (Restarting the primary writes a checkpoint and/or running-xacts + # record, which allows the standby to know that the crashed XIDs are aborted) + secondary = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") + + secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (1,) + + +def test_replica_start_at_running_xacts(neon_simple_env: NeonEnv, pg_version): + """ + Test that starting a replica works right after the primary has + created a running-xacts record. This may seem like a trivial case, + but during development, we had a bug that was triggered by having + oldestActiveXid == nextXid. Starting right after a running-xacts + record is one way to test that case. + + See the module docstring for background. + """ + env = neon_simple_env + + if env.pg_version == PgVersion.V14 or env.pg_version == PgVersion.V15: + pytest.skip("pg_log_standby_snapshot() function is available only in PG16") + + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + + primary_cur.execute("CREATE EXTENSION neon_test_utils") + primary_cur.execute("select pg_log_standby_snapshot()") + primary_cur.execute("select neon_xlogflush()") + wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) + + secondary = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") + + secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select 123") + assert secondary_cur.fetchone() == (123,) + + +def test_replica_start_wait_subxids_finish(neon_simple_env: NeonEnv): + """ + Test replica startup when there are a lot of (sub)transactions active in the + primary. 
That's too many for the CLOG-scanning mechanism to handle, so the + replica has to wait for the large transaction to finish before it starts to + accept queries. + + After replica startup, test MVCC with transactions that were in-progress + when the replica was started. + + See the module docstring for background. + """ + + # Initialize the primary, a test table, and a helper function to create + # lots of subtransactions. + env = neon_simple_env + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("create table t(pk serial primary key, payload integer)") + primary_cur.execute(CREATE_SUBXACTS_FUNC) + + # Start a transaction with 100000 subtransactions, and leave it open. That's + # too many to fit in the "known-assigned XIDs array" in the replica, and + # also too many to fit in the subxid caches so the running-xacts record will + # also overflow. + primary_cur.execute("begin") + primary_cur.execute("select create_subxacts(100000)") + + # Start another, smaller transaction in the primary. We'll come back to this + # later. + primary_conn2 = primary.connect() + primary_cur2 = primary_conn2.cursor() + primary_cur2.execute("begin") + primary_cur2.execute("insert into t (payload) values (0)") + + # Create a replica. but before that, wait for the wal to be flushed to + # safekeepers, so that the replica is started at a point where the large + # transaction is already active. (The whole transaction might not be flushed + # yet, but that's OK.) + # + # Start it in a separate thread, so that we can do other stuff while it's + # blocked waiting for the startup to finish. + wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) + secondary = env.endpoints.new_replica(origin=primary, endpoint_id="secondary") + start_secondary_thread = threading.Thread(target=secondary.start) + start_secondary_thread.start() + + # Verify that the replica has otherwise started up, but cannot start + # accepting queries yet. + log.info("Waiting 5 s to verify that the secondary does not start") + start_secondary_thread.join(5) + assert secondary.log_contains("consistent recovery state reached") + assert secondary.log_contains("started streaming WAL from primary") + # The "redo starts" message is printed when the first WAL record is + # received. It might or might not be present in the log depending on how + # far exactly the WAL was flushed when the replica was started, and whether + # background activity caused any more WAL records to be flushed on the + # primary afterwards. + # + # assert secondary.log_contains("redo # starts") + + # should not be open for connections yet + assert start_secondary_thread.is_alive() + assert not secondary.is_running() + assert not secondary.log_contains("database system is ready to accept read-only connections") + + # Commit the large transaction in the primary. + # + # Within the next 15 s, the primary should write a new running-xacts record + # to the WAL which shows the transaction as completed. Once the replica + # replays that record, it will start accepting queries. + primary_cur.execute("commit") + start_secondary_thread.join() + + # Verify that the large transaction is correctly visible in the secondary + # (but not the second, small transaction, which is still in-progress!) 
+ secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (100000,) + + # Perform some more MVCC testing using the second transaction that was + # started in the primary before the replica was created + primary_cur2.execute("select create_subxacts(10000)") + + # The second transaction still hasn't committed + wait_replica_caughtup(primary, secondary) + secondary_cur.execute("BEGIN ISOLATION LEVEL REPEATABLE READ") + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (100000,) + + # Commit the second transaction in the primary + primary_cur2.execute("commit") + + # Should still be invisible to the old snapshot + wait_replica_caughtup(primary, secondary) + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (100000,) + + # Commit the REPEATABLE READ transaction in the replica. Both + # primary transactions should now be visible to a new snapshot. + secondary_cur.execute("commit") + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (110001,) + + +def test_replica_too_many_known_assigned_xids(neon_simple_env: NeonEnv): + """ + The CLOG-scanning mechanism fills the known-assigned XIDs array + optimistically at standby startup, betting that it can still fit + upcoming transactions replayed later from the WAL in the + array. This test tests what happens when that bet fails and the + known-assigned XID array fills up after the standby has already + been started. The WAL redo will fail with an error: + + FATAL: too many KnownAssignedXids + CONTEXT: WAL redo at 0/1895CB0 for neon/INSERT: off: 25, flags: 0x08; blkref #0: rel 1663/5/16385, blk 64 + + which causes the standby to shut down. + + See the module docstring for background. + """ + + # Initialize the primary, a test table, and a helper function to create lots + # of subtransactions. + env = neon_simple_env + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("CREATE EXTENSION neon_test_utils") + primary_cur.execute("create table t(pk serial primary key, payload integer)") + primary_cur.execute(CREATE_SUBXACTS_FUNC) + + # Determine how many connections we can use + primary_cur.execute("show max_connections") + max_connections = int(primary_cur.fetchall()[0][0]) + primary_cur.execute("show superuser_reserved_connections") + superuser_reserved_connections = int(primary_cur.fetchall()[0][0]) + n_connections = max_connections - superuser_reserved_connections + n_subxids = 200 + + # Start one top transaction in primary, with lots of subtransactions. This + # uses up much of the known-assigned XIDs space in the standby, but doesn't + # cause it to overflow. 
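[Editor's note - illustrative arithmetic, not part of the patch, assuming the default max_connections = 100 and superuser_reserved_connections = 3, and ignoring the extra background-worker slots in MaxBackends.] Why the first big transaction fits in the known-assigned XIDs array but the later round-robin load does not:

    capacity = (64 + 1) * 100          # roughly TOTAL_MAX_CACHED_SUBXIDS: ~6500 known-assigned slots
    clog_scan_budget = capacity // 2   # ~3250: what the startup CLOG scan is willing to use
    first_xact = 100 * 30              # 3000 subxids held open when the replica starts: fits
    later_load = (100 - 3) * 200       # ~19400 more XIDs replayed while everything is still open
    assert first_xact < clog_scan_budget
    assert first_xact + later_load > capacity      # hence "too many KnownAssignedXids"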
+ large_p_conn = primary.connect() + large_p_cur = large_p_conn.cursor() + large_p_cur.execute("begin") + large_p_cur.execute(f"select create_subxacts({max_connections} * 30)") + + with closing(primary.connect()) as small_p_conn: + with small_p_conn.cursor() as small_p_cur: + small_p_cur.execute("select create_subxacts(1)") + + # Create a replica at this LSN + primary_cur.execute("select neon_xlogflush()") + wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) + secondary = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") + secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + + # The transaction in primary has not committed yet. + wait_replica_caughtup(primary, secondary) + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (1,) + + # Start max number of top transactions in primary, with a lot of + # subtransactions each. We add the subtransactions to each top transaction + # in a round-robin fashion, instead of adding a lot of subtransactions to + # one top transaction at a time. This way, we will have the max number of + # subtransactions in the in-memory subxid cache of each top transaction, + # until they all overflow. + # + # Currently, PGPROC_MAX_CACHED_SUBXIDS == 64, so this will overflow the all + # the subxid caches after creating 64 subxids in each top transaction. The + # point just before the caches have overflowed is the most interesting point + # in time, but we'll keep going beyond that, to ensure that this test is + # robust even if PGPROC_MAX_CACHED_SUBXIDS changes. + p_curs = [] + for _ in range(0, n_connections): + p_cur = primary.connect().cursor() + p_cur.execute("begin") + p_curs.append(p_cur) + + for _subxid in range(0, n_subxids): + for i in range(0, n_connections): + p_curs[i].execute("select create_subxacts(1)") + + # Commit all the transactions in the primary + for i in range(0, n_connections): + p_curs[i].execute("commit") + large_p_cur.execute("commit") + + # Wait until the replica crashes with "too many KnownAssignedXids" error. + def check_replica_crashed(): + try: + secondary.connect() + except psycopg2.Error: + # Once the connection fails, return success + return None + raise RuntimeError("connection succeeded") + + wait_until(20, 0.5, check_replica_crashed) + assert secondary.log_contains("too many KnownAssignedXids") + + # Replica is crashed, so ignore stop result + secondary.check_stop_result = False + + +def test_replica_start_repro_visibility_bug(neon_simple_env: NeonEnv): + """ + Before PR #7288, a hot standby in neon incorrectly started up + immediately, before it had received a running-xacts record. That + led to visibility bugs if there were active transactions in the + primary. This test reproduces the incorrect query results and + incorrectly set hint bits, before that was fixed. + """ + env = neon_simple_env + + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + p_cur = primary.connect().cursor() + + p_cur.execute("begin") + p_cur.execute("create table t(pk integer primary key, payload integer)") + p_cur.execute("insert into t values (generate_series(1,100000), 0)") + + secondary = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") + wait_replica_caughtup(primary, secondary) + s_cur = secondary.connect().cursor() + + # Set hint bits for pg_class tuples. 
If primary's transaction is + # not marked as in-progress in MVCC snapshot, then XMIN_INVALID + # hint bit will be set for table's 't' tuple, making it invisible + # even after the commit record is replayed later. + s_cur.execute("select * from pg_class") + + p_cur.execute("commit") + wait_replica_caughtup(primary, secondary) + s_cur.execute("select * from t where pk = 1") + assert s_cur.fetchone() == (1, 0) + + +@pytest.mark.parametrize("shutdown", [True, False]) +def test_replica_start_with_prepared_xacts(neon_simple_env: NeonEnv, shutdown: bool): + """ + Test the CLOG-scanning mechanism at hot standby startup in the presence of + prepared transactions. + + This test is run in two variants: one where the primary server is shut down + before starting the secondary, or not. + """ + + # Initialize the primary, a test table, and a helper function to create lots + # of subtransactions. + env = neon_simple_env + primary = env.endpoints.create_start( + branch_name="main", endpoint_id="primary", config_lines=["max_prepared_transactions=5"] + ) + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("CREATE EXTENSION neon_test_utils") + primary_cur.execute("create table t(pk serial primary key, payload integer)") + primary_cur.execute("create table t1(pk integer primary key)") + primary_cur.execute("create table t2(pk integer primary key)") + primary_cur.execute(CREATE_SUBXACTS_FUNC) + + # Prepare a transaction for two-phase commit + primary_cur.execute("begin") + primary_cur.execute("insert into t1 values (1)") + primary_cur.execute("prepare transaction 't1'") + + # Prepare another transaction for two-phase commit, with a subtransaction + primary_cur.execute("begin") + primary_cur.execute("insert into t2 values (2)") + primary_cur.execute("savepoint sp") + primary_cur.execute("insert into t2 values (3)") + primary_cur.execute("prepare transaction 't2'") + + # Start a transaction in the primary. Leave the transaction open. + # + # The transaction has some subtransactions, but not too many to cause the + # CLOG-scanning mechanism to give up. + primary_cur.execute("begin") + primary_cur.execute("select create_subxacts(50)") + + # Wait for the WAL to be flushed + primary_cur.execute("select neon_xlogflush()") + wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) + + if shutdown: + primary.stop(mode="fast") + + # Create a replica. It should start up normally, thanks to the CLOG-scanning + # mechanism. 
+ secondary = env.endpoints.new_replica_start( + origin=primary, endpoint_id="secondary", config_lines=["max_prepared_transactions=5"] + ) + + # The transaction did not commit, so it should not be visible in the secondary + secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (0,) + secondary_cur.execute("select count(*) from t1") + assert secondary_cur.fetchone() == (0,) + secondary_cur.execute("select count(*) from t2") + assert secondary_cur.fetchone() == (0,) + + if shutdown: + primary.start() + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + else: + primary_cur.execute("commit") + primary_cur.execute("commit prepared 't1'") + primary_cur.execute("commit prepared 't2'") + + wait_replica_caughtup(primary, secondary) + + secondary_cur.execute("select count(*) from t") + if shutdown: + assert secondary_cur.fetchone() == (0,) + else: + assert secondary_cur.fetchone() == (50,) + secondary_cur.execute("select * from t1") + assert secondary_cur.fetchall() == [(1,)] + secondary_cur.execute("select * from t2") + assert secondary_cur.fetchall() == [(2,), (3,)] + + +def test_replica_start_with_prepared_xacts_with_subxacts(neon_simple_env: NeonEnv): + """ + Test the CLOG-scanning mechanism at hot standby startup in the presence of + prepared transactions, with subtransactions. + """ + + # Initialize the primary, a test table, and a helper function to create lots + # of subtransactions. + env = neon_simple_env + primary = env.endpoints.create_start( + branch_name="main", endpoint_id="primary", config_lines=["max_prepared_transactions=5"] + ) + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + + # Install extension containing function needed for test + primary_cur.execute("CREATE EXTENSION neon_test_utils") + + primary_cur.execute("create table t(pk serial primary key, payload integer)") + primary_cur.execute(CREATE_SUBXACTS_FUNC) + + # Advance nextXid close to the beginning of the next pg_subtrans segment (2^16 XIDs) + # + # This is interesting, because it tests that pg_subtrans is initialized correctly + # at standby startup. (We had a bug where it didn't at one point during development.) + while True: + xid = int(query_scalar(primary_cur, "SELECT txid_current()")) + log.info(f"xid now {xid}") + # Consume 500 transactions at a time until we get close + if xid < 65535 - 600: + primary_cur.execute("select test_consume_xids(500);") + else: + break + primary_cur.execute("checkpoint") + + # Prepare a transaction for two-phase commit + primary_cur.execute("begin") + primary_cur.execute("select create_subxacts(1000)") + primary_cur.execute("prepare transaction 't1'") + + # Wait for the WAL to be flushed, and stop the primary + wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) + primary.stop(mode="fast") + + # Create a replica. It should start up normally, thanks to the CLOG-scanning + # mechanism. 
+ secondary = env.endpoints.new_replica_start( + origin=primary, endpoint_id="secondary", config_lines=["max_prepared_transactions=5"] + ) + + # The transaction did not commit, so it should not be visible in the secondary + secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (0,) + + primary.start() + + # Open a lot of subtransactions in the primary, causing the subxids cache to overflow + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("select create_subxacts(100000)") + + wait_replica_caughtup(primary, secondary) + + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (100000,) + + primary_cur.execute("commit prepared 't1'") + + wait_replica_caughtup(primary, secondary) + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (101000,) + + +def test_replica_start_with_prepared_xacts_with_many_subxacts(neon_simple_env: NeonEnv): + """ + Test the CLOG-scanning mechanism at hot standby startup in the presence of + prepared transactions, with lots of subtransactions. + + Like test_replica_start_with_prepared_xacts_with_subxacts, but with more + subxacts, to test that the prepared transaction's subxids don't consume + space in the known-assigned XIDs array. (They are set in pg_subtrans + instead) + """ + + # Initialize the primary, a test table, and a helper function to create lots + # of subtransactions. + env = neon_simple_env + primary = env.endpoints.create_start( + branch_name="main", endpoint_id="primary", config_lines=["max_prepared_transactions=5"] + ) + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + + # Install extension containing function needed for test + primary_cur.execute("CREATE EXTENSION neon_test_utils") + + primary_cur.execute("create table t(pk serial primary key, payload integer)") + primary_cur.execute(CREATE_SUBXACTS_FUNC) + + # Prepare a transaction for two-phase commit, with lots of subxids + primary_cur.execute("begin") + primary_cur.execute("select create_subxacts(50000)") + + # to make things a bit more varied, intersperse a few other XIDs in between + # the prepared transaction's sub-XIDs + with primary.connect().cursor() as primary_cur2: + primary_cur2.execute("insert into t (payload) values (123)") + primary_cur2.execute("begin; insert into t (payload) values (-1); rollback") + + primary_cur.execute("select create_subxacts(50000)") + primary_cur.execute("prepare transaction 't1'") + + # Wait for the WAL to be flushed + wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) + + primary.stop(mode="fast") + + # Create a replica. It should start up normally, thanks to the CLOG-scanning + # mechanism. 
+ secondary = env.endpoints.new_replica_start( + origin=primary, endpoint_id="secondary", config_lines=["max_prepared_transactions=5"] + ) + + # The transaction did not commit, so it should not be visible in the secondary + secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (1,) + + primary.start() + + # Open a lot of subtransactions in the primary, causing the subxids cache to overflow + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("select create_subxacts(100000)") + + wait_replica_caughtup(primary, secondary) + + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (100001,) + + primary_cur.execute("commit prepared 't1'") + + wait_replica_caughtup(primary, secondary) + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (200001,) diff --git a/test_runner/regress/test_replication_start.py b/test_runner/regress/test_replication_start.py deleted file mode 100644 index 2360745990..0000000000 --- a/test_runner/regress/test_replication_start.py +++ /dev/null @@ -1,32 +0,0 @@ -import pytest -from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, wait_replica_caughtup - - -@pytest.mark.xfail -def test_replication_start(neon_simple_env: NeonEnv): - env = neon_simple_env - - with env.endpoints.create_start(branch_name="main", endpoint_id="primary") as primary: - with primary.connect() as p_con: - with p_con.cursor() as p_cur: - p_cur.execute("begin") - p_cur.execute("create table t(pk integer primary key, payload integer)") - p_cur.execute("insert into t values (generate_series(1,100000), 0)") - p_cur.execute("select txid_current()") - xid = p_cur.fetchall()[0][0] - log.info(f"Master transaction {xid}") - with env.endpoints.new_replica_start( - origin=primary, endpoint_id="secondary" - ) as secondary: - wait_replica_caughtup(primary, secondary) - with secondary.connect() as s_con: - with s_con.cursor() as s_cur: - # Enforce setting hint bits for pg_class tuples. - # If master's transaction is not marked as in-progress in MVCC snapshot, - # then XMIN_INVALID hint bit will be set for table's 't' tuple makeing it invisible. 
- s_cur.execute("select * from pg_class") - p_cur.execute("commit") - wait_replica_caughtup(primary, secondary) - s_cur.execute("select * from t where pk = 1") - assert s_cur.fetchone() == (1, 0) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 223dd92595..ad73770c44 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 223dd925959f8124711dd3d867dc8ba6629d52c0 +Subproject commit ad73770c446ea361f43e4f0404798b7e5e7a62d8 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index f54d7373eb..4874c8e52e 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit f54d7373eb0de5a54bce2becdb1c801026c7edff +Subproject commit 4874c8e52ed349a9f8290bbdcd91eb92677a5d24 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index e06bebc753..b810fdfcbb 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit e06bebc75306b583e758b52c95946d41109239b2 +Subproject commit b810fdfcbb59afea7ea7bbe0cf94eaccb55a2ea2 diff --git a/vendor/revisions.json b/vendor/revisions.json index 574e371934..da49ff19c3 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "v16": ["16.3", "e06bebc75306b583e758b52c95946d41109239b2"], - "v15": ["15.7", "f54d7373eb0de5a54bce2becdb1c801026c7edff"], - "v14": ["14.12", "223dd925959f8124711dd3d867dc8ba6629d52c0"] + "v16": ["16.3", "b810fdfcbb59afea7ea7bbe0cf94eaccb55a2ea2"], + "v15": ["15.7", "4874c8e52ed349a9f8290bbdcd91eb92677a5d24"], + "v14": ["14.12", "ad73770c446ea361f43e4f0404798b7e5e7a62d8"] } From aea5cfe21e62b4df285c0c55c12f79df8fbde1a4 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 1 Jul 2024 12:48:20 +0100 Subject: [PATCH 1085/1571] pageserver: add metric `pageserver_secondary_resident_physical_size` (#8204) ## Problem We lack visibility of how much local disk space is used by secondary tenant locations Close: https://github.com/neondatabase/neon/issues/8181 ## Summary of changes - Add `pageserver_secondary_resident_physical_size`, tagged by tenant - Register & de-register label sets from SecondaryTenant - Add+use wrappers in SecondaryDetail that update metrics when adding+removing layers/timelines --- pageserver/src/metrics.rs | 11 +- pageserver/src/tenant/secondary.rs | 37 +++- pageserver/src/tenant/secondary/downloader.rs | 173 ++++++++++++++---- 3 files changed, 171 insertions(+), 50 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index f5aca6dfb3..9cd7ffa042 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -476,7 +476,7 @@ static STANDBY_HORIZON: Lazy = Lazy::new(|| { static RESIDENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_resident_physical_size", - "The size of the layer files present in the pageserver's filesystem.", + "The size of the layer files present in the pageserver's filesystem, for attached locations.", &["tenant_id", "shard_id", "timeline_id"] ) .expect("failed to define a metric") @@ -1691,6 +1691,15 @@ pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| { } }); +pub(crate) static SECONDARY_RESIDENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_secondary_resident_physical_size", + "The size of the layer files present in the pageserver's filesystem, for secondary locations.", + &["tenant_id", "shard_id"] + ) + .expect("failed to define a metric") +}); + #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum RemoteOpKind { Upload, diff --git a/pageserver/src/tenant/secondary.rs 
b/pageserver/src/tenant/secondary.rs index af6840f525..a233d11c4a 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -23,6 +23,8 @@ use super::{ storage_layer::LayerName, }; +use crate::metrics::SECONDARY_RESIDENT_PHYSICAL_SIZE; +use metrics::UIntGauge; use pageserver_api::{ models, shard::{ShardIdentity, TenantShardId}, @@ -99,6 +101,17 @@ pub(crate) struct SecondaryTenant { // Public state indicating overall progress of downloads relative to the last heatmap seen pub(crate) progress: std::sync::Mutex, + + // Sum of layer sizes on local disk + pub(super) resident_size_metric: UIntGauge, +} + +impl Drop for SecondaryTenant { + fn drop(&mut self) { + let tenant_id = self.tenant_shard_id.tenant_id.to_string(); + let shard_id = format!("{}", self.tenant_shard_id.shard_slug()); + let _ = SECONDARY_RESIDENT_PHYSICAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]); + } } impl SecondaryTenant { @@ -108,6 +121,12 @@ impl SecondaryTenant { tenant_conf: TenantConfOpt, config: &SecondaryLocationConfig, ) -> Arc { + let tenant_id = tenant_shard_id.tenant_id.to_string(); + let shard_id = format!("{}", tenant_shard_id.shard_slug()); + let resident_size_metric = SECONDARY_RESIDENT_PHYSICAL_SIZE + .get_metric_with_label_values(&[&tenant_id, &shard_id]) + .unwrap(); + Arc::new(Self { tenant_shard_id, // todo: shall we make this a descendent of the @@ -123,6 +142,8 @@ impl SecondaryTenant { detail: std::sync::Mutex::new(SecondaryDetail::new(config.clone())), progress: std::sync::Mutex::default(), + + resident_size_metric, }) } @@ -211,16 +232,12 @@ impl SecondaryTenant { // have to 100% match what is on disk, because it's a best-effort warming // of the cache. let mut detail = this.detail.lock().unwrap(); - if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) { - let removed = timeline_detail.on_disk_layers.remove(&name); - - // We might race with removal of the same layer during downloads, if it was removed - // from the heatmap. If we see that the OnDiskState is gone, then no need to - // do a physical deletion or store in evicted_at. - if let Some(removed) = removed { - removed.remove_blocking(); - timeline_detail.evicted_at.insert(name, now); - } + if let Some(removed) = + detail.evict_layer(name, &timeline_id, now, &this.resident_size_metric) + { + // We might race with removal of the same layer during downloads, so finding the layer we + // were trying to remove is optional. Only issue the disk I/O to remove it if we found it. + removed.remove_blocking(); } }) .await diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index f6f30641db..27439d4f03 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -46,6 +46,7 @@ use crate::tenant::{ use camino::Utf8PathBuf; use chrono::format::{DelayedFormat, StrftimeItems}; use futures::Future; +use metrics::UIntGauge; use pageserver_api::models::SecondaryProgress; use pageserver_api::shard::TenantShardId; use remote_storage::{DownloadError, Etag, GenericRemoteStorage}; @@ -131,16 +132,66 @@ impl OnDiskState { .or_else(fs_ext::ignore_not_found) .fatal_err("Deleting secondary layer") } + + pub(crate) fn file_size(&self) -> u64 { + self.metadata.file_size + } } #[derive(Debug, Clone, Default)] pub(super) struct SecondaryDetailTimeline { - pub(super) on_disk_layers: HashMap, + on_disk_layers: HashMap, /// We remember when layers were evicted, to prevent re-downloading them. 
pub(super) evicted_at: HashMap, } +impl SecondaryDetailTimeline { + pub(super) fn remove_layer( + &mut self, + name: &LayerName, + resident_metric: &UIntGauge, + ) -> Option { + let removed = self.on_disk_layers.remove(name); + if let Some(removed) = &removed { + resident_metric.sub(removed.file_size()); + } + removed + } + + /// `local_path` + fn touch_layer( + &mut self, + conf: &'static PageServerConf, + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, + touched: &HeatMapLayer, + resident_metric: &UIntGauge, + local_path: F, + ) where + F: FnOnce() -> Utf8PathBuf, + { + use std::collections::hash_map::Entry; + match self.on_disk_layers.entry(touched.name.clone()) { + Entry::Occupied(mut v) => { + v.get_mut().access_time = touched.access_time; + } + Entry::Vacant(e) => { + e.insert(OnDiskState::new( + conf, + tenant_shard_id, + timeline_id, + touched.name.clone(), + touched.metadata.clone(), + touched.access_time, + local_path(), + )); + resident_metric.add(touched.metadata.file_size); + } + } + } +} + // Aspects of a heatmap that we remember after downloading it #[derive(Clone, Debug)] struct DownloadSummary { @@ -158,7 +209,7 @@ pub(super) struct SecondaryDetail { last_download: Option, next_download: Option, - pub(super) timelines: HashMap, + timelines: HashMap, } /// Helper for logging SystemTime @@ -191,6 +242,38 @@ impl SecondaryDetail { } } + pub(super) fn evict_layer( + &mut self, + name: LayerName, + timeline_id: &TimelineId, + now: SystemTime, + resident_metric: &UIntGauge, + ) -> Option { + let timeline = self.timelines.get_mut(timeline_id)?; + let removed = timeline.remove_layer(&name, resident_metric); + if removed.is_some() { + timeline.evicted_at.insert(name, now); + } + removed + } + + pub(super) fn remove_timeline( + &mut self, + timeline_id: &TimelineId, + resident_metric: &UIntGauge, + ) { + let removed = self.timelines.remove(timeline_id); + if let Some(removed) = removed { + resident_metric.sub( + removed + .on_disk_layers + .values() + .map(|l| l.metadata.file_size) + .sum(), + ); + } + } + /// Additionally returns the total number of layers, used for more stable relative access time /// based eviction. pub(super) fn get_layers_for_eviction( @@ -601,8 +684,13 @@ impl<'a> TenantDownloader<'a> { Some(t) => t, None => { // We have no existing state: need to scan local disk for layers first. - let timeline_state = - init_timeline_state(self.conf, tenant_shard_id, timeline).await; + let timeline_state = init_timeline_state( + self.conf, + tenant_shard_id, + timeline, + &self.secondary_state.resident_size_metric, + ) + .await; // Re-acquire detail lock now that we're done with async load from local FS self.secondary_state @@ -671,6 +759,25 @@ impl<'a> TenantDownloader<'a> { .await?; } + // Metrics consistency check in testing builds + if cfg!(feature = "testing") { + let detail = self.secondary_state.detail.lock().unwrap(); + let resident_size = detail + .timelines + .values() + .map(|tl| { + tl.on_disk_layers + .values() + .map(|v| v.metadata.file_size) + .sum::() + }) + .sum::(); + assert_eq!( + resident_size, + self.secondary_state.resident_size_metric.get() + ); + } + // Only update last_etag after a full successful download: this way will not skip // the next download, even if the heatmap's actual etag is unchanged. 
self.secondary_state.detail.lock().unwrap().last_download = Some(DownloadSummary { @@ -783,7 +890,7 @@ impl<'a> TenantDownloader<'a> { for delete_timeline in &delete_timelines { // We haven't removed from disk yet, but optimistically remove from in-memory state: if removal // from disk fails that will be a fatal error. - detail.timelines.remove(delete_timeline); + detail.remove_timeline(delete_timeline, &self.secondary_state.resident_size_metric); } } @@ -801,7 +908,7 @@ impl<'a> TenantDownloader<'a> { let Some(timeline_state) = detail.timelines.get_mut(&timeline_id) else { continue; }; - timeline_state.on_disk_layers.remove(&layer_name); + timeline_state.remove_layer(&layer_name, &self.secondary_state.resident_size_metric); } for timeline_id in delete_timelines { @@ -1000,33 +1107,24 @@ impl<'a> TenantDownloader<'a> { let timeline_detail = detail.timelines.entry(timeline_id).or_default(); tracing::info!("Wrote timeline_detail for {} touched layers", touched.len()); - - for t in touched { - use std::collections::hash_map::Entry; - match timeline_detail.on_disk_layers.entry(t.name.clone()) { - Entry::Occupied(mut v) => { - v.get_mut().access_time = t.access_time; - } - Entry::Vacant(e) => { - let local_path = local_layer_path( + touched.into_iter().for_each(|t| { + timeline_detail.touch_layer( + self.conf, + tenant_shard_id, + &timeline_id, + &t, + &self.secondary_state.resident_size_metric, + || { + local_layer_path( self.conf, tenant_shard_id, &timeline_id, &t.name, &t.metadata.generation, - ); - e.insert(OnDiskState::new( - self.conf, - tenant_shard_id, - &timeline_id, - t.name, - t.metadata.clone(), - t.access_time, - local_path, - )); - } - } - } + ) + }, + ) + }); } result @@ -1135,6 +1233,7 @@ async fn init_timeline_state( conf: &'static PageServerConf, tenant_shard_id: &TenantShardId, heatmap: &HeatMapTimeline, + resident_metric: &UIntGauge, ) -> SecondaryDetailTimeline { let timeline_path = conf.timeline_path(tenant_shard_id, &heatmap.timeline_id); let mut detail = SecondaryDetailTimeline::default(); @@ -1210,17 +1309,13 @@ async fn init_timeline_state( } else { // We expect the access time to be initialized immediately afterwards, when // the latest heatmap is applied to the state. 
- detail.on_disk_layers.insert( - name.clone(), - OnDiskState::new( - conf, - tenant_shard_id, - &heatmap.timeline_id, - name, - remote_meta.metadata.clone(), - remote_meta.access_time, - file_path, - ), + detail.touch_layer( + conf, + tenant_shard_id, + &heatmap.timeline_id, + remote_meta, + resident_metric, + || file_path, ); } } From e823b9294714d0c5048942907c06b678c4a6c4a0 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 1 Jul 2024 13:11:55 +0100 Subject: [PATCH 1086/1571] CI(build-tools): Remove libpq from build image (#8206) ## Problem We use `build-tools` image as a base image to build other images, and it has a pretty old `libpq-dev` installed (v13; it wasn't that old until I removed system Postgres 14 from `build-tools` image in https://github.com/neondatabase/neon/pull/6540) ## Summary of changes - Remove `libpq-dev` from `build-tools` image - Set `LD_LIBRARY_PATH` for tests (for different Postgres binaries that we use, like psql and pgbench) - Set `PQ_LIB_DIR` to build Storage Controller - Set `LD_LIBRARY_PATH`/`DYLD_LIBRARY_PATH` in the Storage Controller where it calls Postgres binaries --- .../actions/run-python-test-set/action.yml | 1 + .github/workflows/benchmarking.yml | 4 +++ .github/workflows/build-build-tools-image.yml | 1 + .github/workflows/build_and_test.yml | 7 ++++ .github/workflows/neon_extra_builds.yml | 7 ++++ Dockerfile | 3 +- Dockerfile.build-tools | 1 - control_plane/src/local_env.rs | 11 ++++-- control_plane/src/storage_controller.rs | 34 +++++++++++++++---- 9 files changed, 57 insertions(+), 12 deletions(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index c6ea52ba88..a2aae0772b 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -114,6 +114,7 @@ runs: export PLATFORM=${PLATFORM:-github-actions-selfhosted} export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install} export DEFAULT_PG_VERSION=${PG_VERSION#v} + export LD_LIBRARY_PATH=${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/lib if [ "${BUILD_TYPE}" = "remote" ]; then export REMOTE_ENV=1 diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index db4209500f..0e748adeb6 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -379,6 +379,10 @@ jobs: - name: Add Postgres binaries to PATH run: | + LD_LIBRARY_PATH="${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/lib" + export LD_LIBRARY_PATH + echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> $GITHUB_ENV + ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index 5a94dd8e6f..f1c39e7e4f 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -82,6 +82,7 @@ jobs: tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }} - name: Remove custom docker config directory + if: always() run: | rm -rf /tmp/.docker-custom diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 9cea9f4148..24ad26205b 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -335,6 +335,8 @@ jobs: - name: Run cargo build run: | + PQ_LIB_DIR=$(pwd)/pg_install/v16/lib + export PQ_LIB_DIR ${cov_prefix} mold -run cargo build $CARGO_FLAGS 
$CARGO_FEATURES --bins --tests # Do install *before* running rust tests because they might recompile the @@ -383,6 +385,11 @@ jobs: env: NEXTEST_RETRIES: 3 run: | + PQ_LIB_DIR=$(pwd)/pg_install/v16/lib + export PQ_LIB_DIR + LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib + export LD_LIBRARY_PATH + #nextest does not yet support running doctests cargo test --doc $CARGO_FLAGS $CARGO_FEATURES diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 7d2187e59c..330d858c0e 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -232,12 +232,19 @@ jobs: - name: Run cargo build run: | + PQ_LIB_DIR=$(pwd)/pg_install/v16/lib + export PQ_LIB_DIR mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests -j$(nproc) - name: Run cargo test env: NEXTEST_RETRIES: 3 run: | + PQ_LIB_DIR=$(pwd)/pg_install/v16/lib + export PQ_LIB_DIR + LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib + export LD_LIBRARY_PATH + cargo nextest run $CARGO_FEATURES -j$(nproc) # Run separate tests for real S3 diff --git a/Dockerfile b/Dockerfile index b4900d4a94..f0197758e4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -42,12 +42,13 @@ ARG CACHEPOT_BUCKET=neon-github-dev COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server +COPY --from=pg-build /home/nonroot/pg_install/v16/lib pg_install/v16/lib COPY --chown=nonroot . . # Show build caching stats to check if it was used in the end. # Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats. 
RUN set -e \ - && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \ + && PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \ --bin pg_sni_router \ --bin pageserver \ --bin pagectl \ diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index f85706ef6a..30314376ef 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -26,7 +26,6 @@ RUN set -e \ liblzma-dev \ libncurses5-dev \ libncursesw5-dev \ - libpq-dev \ libreadline-dev \ libseccomp-dev \ libsqlite3-dev \ diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 6634274d2a..3ac3ce21df 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -325,11 +325,16 @@ impl LocalEnv { } } - pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result { - Ok(self.pg_distrib_dir(pg_version)?.join("bin")) + pub fn pg_dir(&self, pg_version: u32, dir_name: &str) -> anyhow::Result { + Ok(self.pg_distrib_dir(pg_version)?.join(dir_name)) } + + pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result { + self.pg_dir(pg_version, "bin") + } + pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result { - Ok(self.pg_distrib_dir(pg_version)?.join("lib")) + self.pg_dir(pg_version, "lib") } pub fn pageserver_bin(&self) -> PathBuf { diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 5ca1b13b2a..47103a2e0a 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -155,16 +155,16 @@ impl StorageController { .expect("non-Unicode path") } - /// Find the directory containing postgres binaries, such as `initdb` and `pg_ctl` + /// Find the directory containing postgres subdirectories, such `bin` and `lib` /// /// This usually uses STORAGE_CONTROLLER_POSTGRES_VERSION of postgres, but will fall back /// to other versions if that one isn't found. Some automated tests create circumstances /// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`. - pub async fn get_pg_bin_dir(&self) -> anyhow::Result { + async fn get_pg_dir(&self, dir_name: &str) -> anyhow::Result { let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 15, 14]; for v in prefer_versions { - let path = Utf8PathBuf::from_path_buf(self.env.pg_bin_dir(v)?).unwrap(); + let path = Utf8PathBuf::from_path_buf(self.env.pg_dir(v, dir_name)?).unwrap(); if tokio::fs::try_exists(&path).await? { return Ok(path); } @@ -172,11 +172,20 @@ impl StorageController { // Fall through anyhow::bail!( - "Postgres binaries not found in {}", - self.env.pg_distrib_dir.display() + "Postgres directory '{}' not found in {}", + dir_name, + self.env.pg_distrib_dir.display(), ); } + pub async fn get_pg_bin_dir(&self) -> anyhow::Result { + self.get_pg_dir("bin").await + } + + pub async fn get_pg_lib_dir(&self) -> anyhow::Result { + self.get_pg_dir("lib").await + } + /// Readiness check for our postgres process async fn pg_isready(&self, pg_bin_dir: &Utf8Path) -> anyhow::Result { let bin_path = pg_bin_dir.join("pg_isready"); @@ -229,12 +238,17 @@ impl StorageController { .unwrap() .join("storage_controller_db"); let pg_bin_dir = self.get_pg_bin_dir().await?; + let pg_lib_dir = self.get_pg_lib_dir().await?; let pg_log_path = pg_data_path.join("postgres.log"); if !tokio::fs::try_exists(&pg_data_path).await? 
{ // Initialize empty database let initdb_path = pg_bin_dir.join("initdb"); let mut child = Command::new(&initdb_path) + .envs(vec![ + ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ]) .args(["-D", pg_data_path.as_ref()]) .spawn() .expect("Failed to spawn initdb"); @@ -269,7 +283,10 @@ impl StorageController { &self.env.base_data_dir, pg_bin_dir.join("pg_ctl").as_std_path(), db_start_args, - [], + vec![ + ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ], background_process::InitialPidFile::Create(self.postgres_pid_file()), retry_timeout, || self.pg_isready(&pg_bin_dir), @@ -324,7 +341,10 @@ impl StorageController { &self.env.base_data_dir, &self.env.storage_controller_bin(), args, - [], + vec![ + ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ], background_process::InitialPidFile::Create(self.pid_file()), retry_timeout, || async { From b02aafdfda4d410a33f11bd8d5f785c7cdccd740 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Mon, 1 Jul 2024 10:36:49 -0400 Subject: [PATCH 1087/1571] fix(pageserver): include aux file in basebackup only once (#8207) Extracted from https://github.com/neondatabase/neon/pull/6560, currently we include multiple copies of aux files in the basebackup. ## Summary of changes Fix the loop. Signed-off-by: Alex Chi Z Co-authored-by: Konstantin Knizhnik --- pageserver/src/basebackup.rs | 57 ++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 0f057a4368..207f781e1b 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -348,35 +348,36 @@ where self.add_rel(rel, rel).await?; } } - - for (path, content) in self - .timeline - .list_aux_files(self.lsn, self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))? - { - if path.starts_with("pg_replslot") { - let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN; - let restart_lsn = Lsn(u64::from_le_bytes( - content[offs..offs + 8].try_into().unwrap(), - )); - info!("Replication slot {} restart LSN={}", path, restart_lsn); - min_restart_lsn = Lsn::min(min_restart_lsn, restart_lsn); - } else if path == "pg_logical/replorigin_checkpoint" { - // replorigin_checkoint is written only on compute shutdown, so it contains - // deteriorated values. So we generate our own version of this file for the particular LSN - // based on information about replorigins extracted from transaction commit records. - // In future we will not generate AUX record for "pg_logical/replorigin_checkpoint" at all, - // but now we should handle (skip) it for backward compatibility. - continue; - } - let header = new_tar_header(&path, content.len() as u64)?; - self.ar - .append(&header, &*content) - .await - .context("could not add aux file to basebackup tarball")?; - } } + + for (path, content) in self + .timeline + .list_aux_files(self.lsn, self.ctx) + .await + .map_err(|e| BasebackupError::Server(e.into()))? 
+ { + if path.starts_with("pg_replslot") { + let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN; + let restart_lsn = Lsn(u64::from_le_bytes( + content[offs..offs + 8].try_into().unwrap(), + )); + info!("Replication slot {} restart LSN={}", path, restart_lsn); + min_restart_lsn = Lsn::min(min_restart_lsn, restart_lsn); + } else if path == "pg_logical/replorigin_checkpoint" { + // replorigin_checkoint is written only on compute shutdown, so it contains + // deteriorated values. So we generate our own version of this file for the particular LSN + // based on information about replorigins extracted from transaction commit records. + // In future we will not generate AUX record for "pg_logical/replorigin_checkpoint" at all, + // but now we should handle (skip) it for backward compatibility. + continue; + } + let header = new_tar_header(&path, content.len() as u64)?; + self.ar + .append(&header, &*content) + .await + .context("could not add aux file to basebackup tarball")?; + } + if min_restart_lsn != Lsn::MAX { info!( "Min restart LSN for logical replication is {}", From 9c32604aa98f86089b2f74863bebb7aad67424d9 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 1 Jul 2024 16:42:23 +0100 Subject: [PATCH 1088/1571] CI(gather-rust-build-stats): fix build with libpq (#8219) ## Problem I've missed setting `PQ_LIB_DIR` in https://github.com/neondatabase/neon/pull/8206 in `gather-rust-build-stats` job and it fails now: ``` = note: /usr/bin/ld: cannot find -lpq collect2: error: ld returned 1 exit status error: could not compile `storage_controller` (bin "storage_controller") due to 1 previous error ``` https://github.com/neondatabase/neon/actions/runs/9743960062/job/26888597735 ## Summary of changes - Set `PQ_LIB_DIR` for `gather-rust-build-stats` job --- .github/workflows/neon_extra_builds.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 330d858c0e..11ff634b6c 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -385,7 +385,7 @@ jobs: run: make walproposer-lib -j$(nproc) - name: Produce the build stats - run: cargo build --all --release --timings -j$(nproc) + run: PQ_LIB_DIR=$(pwd)/pg_install/v16/lib cargo build --all --release --timings -j$(nproc) - name: Upload the build stats id: upload-stats From 0789160ffad0cd13b1e378fa5f19250fbd908afd Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 1 Jul 2024 18:55:18 +0300 Subject: [PATCH 1089/1571] tests: Make neon_xlogflush() flush all WAL, if you omit the LSN arg (#8215) This makes it much more convenient to use in the common case that you want to flush all the WAL. (Passing pg_current_wal_insert_lsn() as the argument doesn't work for the same reasons as explained in the comments: we need to be back off to the beginning of a page if the previous record ended at page boundary.) I plan to use this to fix the issue that Arseny Sher called out at https://github.com/neondatabase/neon/pull/7288#discussion_r1660063852 --- pgxn/neon_test_utils/neontest.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c index 944936d395..071dc122ed 100644 --- a/pgxn/neon_test_utils/neontest.c +++ b/pgxn/neon_test_utils/neontest.c @@ -469,9 +469,9 @@ neon_xlogflush(PG_FUNCTION_ARGS) * The LSN returned by GetXLogInsertRecPtr() is the position where the * next inserted record would begin. 
If the last record ended just at * the page boundary, the next record will begin after the page header - * on the next page, and that's what GetXLogInsertRecPtr().returns, - * but the page header has not been written yet. If we tried to flush - * it, XLogFlush() would throw an error: + * on the next page, but the next page's page header has not been + * written yet. If we tried to flush it, XLogFlush() would throw an + * error: * * ERROR : xlog flush request %X/%X is not satisfied --- flushed only to %X/%X * From 9882ac8e0690c69df9091b48243cbde52153c492 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Mon, 1 Jul 2024 18:44:28 +0100 Subject: [PATCH 1090/1571] docs: Graceful storage controller cluster restarts RFC (#7704) RFC for "Graceful Restarts of Storage Controller Managed Clusters". Related https://github.com/neondatabase/neon/issues/7387 --- .../033-storage-controller-drain-and-fill.md | 345 ++++++++++++++++++ 1 file changed, 345 insertions(+) create mode 100644 docs/rfcs/033-storage-controller-drain-and-fill.md diff --git a/docs/rfcs/033-storage-controller-drain-and-fill.md b/docs/rfcs/033-storage-controller-drain-and-fill.md new file mode 100644 index 0000000000..77c84cd2a5 --- /dev/null +++ b/docs/rfcs/033-storage-controller-drain-and-fill.md @@ -0,0 +1,345 @@ +# Graceful Restarts of Storage Controller Managed Clusters + +## Summary +This RFC describes new storage controller APIs for draining and filling tenant shards from/on pageserver nodes. +It also covers how these new APIs should be used by an orchestrator (e.g. Ansible) in order to implement +graceful cluster restarts. + +## Motivation + +Pageserver restarts cause read availablity downtime for tenants. + +For example pageserver-3 @ us-east-1 was unavailable for a randomly +picked tenant (which requested on-demand activation) for around 30 seconds +during the restart at 2024-04-03 16:37 UTC. + +Note that lots of shutdowns on loaded pageservers do not finish within the +[10 second systemd enforced timeout](https://github.com/neondatabase/aws/blob/0a5280b383e43c063d43cbf87fa026543f6d6ad4/.github/ansible/systemd/pageserver.service#L16). This means we are shutting down without flushing ephemeral layers +and have to reingest data in order to serve requests after restarting, potentially making first request latencies worse. + +This problem is not yet very acutely felt in storage controller managed pageservers since +tenant density is much lower there. However, we are planning on eventually migrating all +pageservers to storage controller management, so it makes sense to solve the issue proactively. + +## Requirements + +- Pageserver re-deployments cause minimal downtime for tenants +- The storage controller exposes HTTP API hooks for draining and filling tenant shards +from a given pageserver. Said hooks can be used by an orchestrator proces or a human operator. +- The storage controller exposes some HTTP API to cancel draining and filling background operations. +- Failures to drain or fill the node should not be fatal. In such cases, cluster restarts should proceed +as usual (with downtime). +- Progress of draining/filling is visible through metrics + +## Non Goals + +- Integration with the control plane +- Graceful restarts for large non-HA tenants. + +## Impacted Components + +- storage controller +- deployment orchestrator (i.e. 
Ansible) +- pageserver (indirectly) + +## Terminology + +** Draining ** is the process through which all tenant shards that can be migrated from a given pageserver +are distributed across the rest of the cluster. + +** Filling ** is the symmetric opposite of draining. In this process tenant shards are migrated onto a given +pageserver until the cluster reaches a resonable, quiescent distribution of tenant shards across pageservers. + +** Node scheduling policies ** act as constraints to the scheduler. For instance, when a +node is set in the `Paused` policy, no further shards will be scheduled on it. + +** Node ** is a pageserver. Term is used interchangeably in this RFC. + +** Deployment orchestrator ** is a generic term for whatever drives our deployments. +Currently, it's an Ansible playbook. + +## Background + +### Storage Controller Basics (skip if already familiar) + +Fundamentally, the storage controller is a reconciler which aims to move from the observed mapping between pageservers and tenant shards to an intended mapping. Pageserver nodes and tenant shards metadata is durably persisted in a database, but note that the mapping between the two entities is not durably persisted. Instead, this mapping (*observed state*) is constructed at startup by sending `GET location_config` requests to registered pageservers. + +An internal scheduler maps tenant shards to pageservers while respecting certain constraints. The result of scheduling is the *intent state*. When the intent state changes, a *reconciliation* will inform pageservers about the new assigment via `PUT location_config` requests and will notify the compute via the configured hook. + +### Background Optimizations + +The storage controller performs scheduling optimizations in the background. It will +migrate attachments to warm secondaries and replace secondaries in order to balance +the cluster out. + +### Reconciliations Concurrency Limiting + +There's a hard limit on the number of reconciles that the storage controller +can have in flight at any given time. To get an idea of scales, the limit is +128 at the time of writing. + +## Implementation + +Note: this section focuses on the core functionality of the graceful restart process. +It doesn't neccesarily describe the most efficient approach. Optimizations are described +separately in a later section. + +### Overall Flow + +This section describes how to implement graceful restarts from the perspective +of Ansible, the deployment orchestrator. Pageservers are already restarted sequentially. +The orchestrator shall implement the following epilogue and prologue steps for each +pageserver restart: + +#### Prologue + +The orchestrator shall first fetch the pageserver node id from the control plane or +the pageserver it aims to restart directly. Next, it issues an HTTP request +to the storage controller in order to start the drain of said pageserver node. +All error responses are retried with a short back-off. When a 202 (Accepted) +HTTP code is returned, the drain has started. Now the orchestrator polls the +node status endpoint exposed by the storage controller in order to await the +end of the drain process. When the `policy` field of the node status response +becomes `PauseForRestart`, the drain has completed and the orchestrator can +proceed with restarting the pageserver. + +The prologue is subject to an overall timeout. It will have a value in the ballpark +of minutes. As storage controller managed pageservers become more loaded this timeout +will likely have to increase. 
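+
+To make the prologue concrete, below is a minimal, non-normative sketch of the
+orchestrator side. The drain path and the 202 acceptance code are the ones
+described under "Draining/Filling APIs" below; the node status URL, the JSON
+field name and the exact policy strings are illustrative assumptions only.
+
+```python
+import time
+
+import requests
+
+
+def drain_prologue(storcon: str, node_id: int, timeout_s: float = 300) -> bool:
+    """Request a drain and wait until the node reaches PauseForRestart.
+
+    Returns False on timeout; the orchestrator then restarts the node anyway,
+    accepting the downtime for this pageserver.
+    """
+    deadline = time.monotonic() + timeout_s
+
+    # Retry the drain request with a short back-off until it is accepted (202).
+    while time.monotonic() < deadline:
+        if requests.put(f"{storcon}/v1/control/node/{node_id}/drain").status_code == 202:
+            break
+        time.sleep(2)
+    else:
+        return False
+
+    # Poll the node status until the drain has completed.
+    while time.monotonic() < deadline:
+        node = requests.get(f"{storcon}/v1/control/node/{node_id}").json()
+        if node.get("policy") == "PauseForRestart":
+            return True
+        time.sleep(5)
+    return False
+```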
+ +#### Epilogue + +After restarting the pageserver, the orchestrator issues an HTTP request +to the storage controller to kick off the filling process. This API call +may be retried for all error codes with a short backoff. This also serves +as a synchronization primitive as the fill will be refused if the pageserver +has not yet re-attached to the storage controller. When a 202(Accepted) HTTP +code is returned, the fill has started. Now the orchestrator polls the node +status endpoint exposed by the storage controller in order to await the end of +the filling process. When the `policy` field of the node status response becomes +`Active`, the fill has completed and the orchestrator may proceed to the next pageserver. + +Again, the epilogue is subject to an overall timeout. We can start off with +using the same timeout as for the prologue, but can also consider relying on +the storage controller's background optimizations with a shorter timeout. + +In the case that the deployment orchestrator times out, it attempts to cancel +the fill. This operation shall be retried with a short back-off. If it ultimately +fails it will require manual intervention to set the nodes scheduling policy to +`NodeSchedulingPolicy::Active`. Not doing that is not immediately problematic, +but it constrains the scheduler as mentioned previously. + +### Node Scheduling Policy State Machine + +The state machine below encodes the behaviours discussed above and +the various failover situations described in a later section. + +Assuming no failures and/or timeouts the flow should be: +`Active -> Draining -> PauseForRestart -> Active -> Filling -> Active` + +``` + Operator requested drain + +-----------------------------------------+ + | | + +-------+-------+ +-------v-------+ + | | | | + | Pause | +-----------> Draining +----------+ + | | | | | | + +---------------+ | +-------+-------+ | + | | | + | | | + Drain requested| | | + | |Drain complete | Drain failed + | | | Cancelled/PS reattach/Storcon restart + | | | + +-------+-------+ | | + | | | | + +-------------+ Active <-----------+------------------+ + | | | | +Fill requested | +---^---^-------+ | + | | | | + | | | | + | | | | + | Fill completed| | | + | | |PS reattach | + | | |after restart | + +-------v-------+ | | +-------v-------+ + | | | | | | + | Filling +---------+ +-----------+PauseForRestart| + | | | | + +---------------+ +---------------+ +``` + +### Draining/Filling APIs + +The storage controller API to trigger the draining of a given node is: +`PUT /v1/control/node/:node_id/{drain,fill}`. + +The following HTTP non-success return codes are used. +All of them are safely retriable from the perspective of the storage controller. +- 404: Requested node was not found +- 503: Requested node is known to the storage controller, but unavailable +- 412: Drain precondition failed: there is no other node to drain to or the node's schedulling policy forbids draining +- 409: A {drain, fill} is already in progress. Only one such background operation +is allowed per node. + +When the drain is accepted and commenced a 202 HTTP code is returned. + +Drains and fills shall be cancellable by the deployment orchestrator or a +human operator via: `DELETE /v1/control/node/:node_id/{drain,fill}`. A 200 +response is returned when the cancelation is successful. Errors are retriable. 
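+
+For the epilogue described above, a matching sketch is shown below. Only the
+`PUT`/`DELETE` paths and the status codes listed in this section come from the
+RFC; the node status URL, the response shape and the retry cadence are again
+illustrative assumptions.
+
+```python
+import time
+
+import requests
+
+
+def fill_epilogue(storcon: str, node_id: int, timeout_s: float = 300) -> None:
+    """Start a fill after the pageserver restart and wait for it to finish."""
+    deadline = time.monotonic() + timeout_s
+
+    # A 202 means the fill started; the error codes above (404/503/412/409)
+    # are all safely retriable with a short back-off.
+    while time.monotonic() < deadline:
+        if requests.put(f"{storcon}/v1/control/node/{node_id}/fill").status_code == 202:
+            break
+        time.sleep(2)
+    else:
+        return  # never accepted: nothing to wait for or cancel
+
+    # Wait for the node to return to the Active scheduling policy.
+    while time.monotonic() < deadline:
+        node = requests.get(f"{storcon}/v1/control/node/{node_id}").json()
+        if node.get("policy") == "Active":
+            return
+        time.sleep(5)
+
+    # Timed out: try to cancel the fill so the node is not left in Filling;
+    # if cancellation also fails, an operator resets the policy by hand.
+    for _ in range(5):
+        if requests.delete(f"{storcon}/v1/control/node/{node_id}/fill").status_code == 200:
+            return
+        time.sleep(2)
+```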
+
+### Drain Process
+
+Before accepting a drain request the following validations are applied:
+* Ensure that the node is known to the storage controller
+* Ensure that the scheduling policy is `NodeSchedulingPolicy::Active` or `NodeSchedulingPolicy::Pause`
+* Ensure that another drain or fill is not already running on the node
+* Ensure that a drain is possible (i.e. check that there is at least one
+schedulable node to drain to)
+
+After accepting the drain, the scheduling policy of the node is set to
+`NodeSchedulingPolicy::Draining` and persisted in both memory and the database.
+This disallows the optimizer from adding or removing shards from the node, which
+is desirable to avoid them racing.
+
+Next, a separate Tokio task is spawned to manage the draining. For each tenant
+shard attached to the node being drained, demote the node to a secondary and
+attempt to schedule the node away. Scheduling might fail due to unsatisfiable
+constraints, but that is fine. Draining is a best-effort process since it might
+not always be possible to cut over all shards.
+
+Importantly, this task manages the concurrency of issued reconciles in order to
+avoid drowning out the target pageservers and to allow other important reconciles
+to proceed.
+
+Once the triggered reconciles have finished or timed out, set the node's scheduling
+policy to `NodeSchedulingPolicy::PauseForRestart` to signal the end of the drain.
+
+A note on non-HA tenants: these tenants do not have secondaries, so by the description
+above, they would not be migrated. It makes sense to skip them (especially the large ones)
+since, depending on tenant size, this might be more disruptive than the restart, because the
+pageserver we've moved to will need to on-demand download the entire working set for the tenant.
+We can consider expanding to small non-HA tenants in the future.
+
+### Fill Process
+
+Before accepting a fill request the following validations are applied:
+* Ensure that the node is known to the storage controller
+* Ensure that the scheduling policy is `NodeSchedulingPolicy::Active`.
+This is the only acceptable policy for the fill starting state. When a node re-attaches,
+it sets the scheduling policy to `NodeSchedulingPolicy::Active` if it was equal to
+`NodeSchedulingPolicy::PauseForRestart` or `NodeSchedulingPolicy::Draining` (possible end states for a node drain).
+* Ensure that another drain or fill is not already running on the node
+
+After accepting the fill, the scheduling policy of the node is set to
+`NodeSchedulingPolicy::Filling` and persisted in both memory and the database.
+This disallows the optimizer from adding or removing shards from the node, which
+is desirable to avoid them racing.
+
+Next, a separate Tokio task is spawned to manage the filling. For each tenant
+shard where the filled node is a secondary, promote the secondary. This is done
+until we run out of shards or the counts of attached shards become balanced across
+the cluster.
+
+Like for draining, the concurrency of spawned reconciles is limited.
+
+### Failure Modes & Handling
+
+Failures are generally handled by transitioning back into the `Active`
+(neutral) state. This simplifies the implementation greatly at the
+cost of adding transitions to the state machine. For example, we
+could detect the `Draining` state upon restart and proceed with a drain,
+but how would the storage controller know that this is still what the
+orchestrator needs?
+ +#### Storage Controller Crash + +When the storage controller starts up reset the node scheduling policy +of all nodes in states `Draining`, `Filling` or `PauseForRestart` to +`Active`. The rationale is that when the storage controller restarts, +we have lost context of what the deployment orchestrator wants. It also +has the benefit of making things easier to reason about. + +#### Pageserver Crash During Drain + +The pageserver will attempt to re-attach during restart at which +point the node scheduling policy will be set back to `Active`, thus +reenabling the scheduler to use the node. + +#### Non-drained Pageserver Crash During Drain + +What should happen when a pageserver we are draining to crashes during the +process. Two reasonable options are: cancel the drain and focus on the failover +*or* do both, but prioritise failover. Since the number of concurrent reconciles +produced by drains/fills are limited, we get the later behaviour for free. +My suggestion is we take this approach, but the cancellation option is trivial +to implement as well. + +#### Pageserver Crash During Fill + +The pageserver will attempt to re-attach during restart at which +point the node scheduling policy will be set back to `Active`, thus +reenabling the scheduler to use the node. + +#### Pageserver Goes unavailable During Drain/Fill + +The drain and fill jobs handle this by stopping early. When the pageserver +is detected as online by storage controller heartbeats, reset its scheduling +policy to `Active`. If a restart happens instead, see the pageserver crash +failure mode. + +#### Orchestrator Drain Times Out + +Orchestrator will still proceed with the restart. +When the pageserver re-attaches, the scheduling policy is set back to +`Active`. + +#### Orchestrator Fill Times Out + +Orchestrator will attempt to cancel the fill operation. If that fails, +the fill will continue until it quiesces and the node will be left +in the `Filling` scheduling policy. This hinders the scheduler, but is +otherwise harmless. A human operator can handle this by setting the scheduling +policy to `Active`, or we can bake in a fill timeout into the storage controller. + +## Optimizations + +### Location Warmth + +When cutting over to a secondary, the storage controller will wait for it to +become "warm" (i.e. download enough of the tenants data). This means that some +reconciliations can take significantly longer than others and hold up precious +reconciliations units. As an optimization, the drain stage can only cut over +tenants that are already "warm". Similarly, the fill stage can prioritise the +"warmest" tenants in the fill. + +Given that the number of tenants by the storage controller will be fairly low +for the foreseable future, the first implementation could simply query the tenants +for secondary status. This doesn't scale well with increasing tenant counts, so +eventually we will need new pageserver API endpoints to report the sets of +"warm" and "cold" nodes. + +## Alternatives Considered + +### Draining and Filling Purely as Scheduling Constraints + +At its core, the storage controller is a big background loop that detects changes +in the environment and reacts on them. One could express draining and filling +of nodes purely in terms of constraining the scheduler (as opposed to having +such background tasks). + +While theoretically nice, I think that's harder to implement and more importantly operate and reason about. +Consider cancellation of a drain/fill operation. 
We would have to update the scheduler state, create +an entirely new schedule (intent state) and start work on applying that. It gets trickier if we wish +to cancel the reconciliation tasks spawned by drain/fill nodes. How would we know which ones belong +to the conceptual drain/fill? One could add labels to reconciliations, but it gets messy in my opinion. + +It would also mean that reconciliations themselves have side effects that persist in the database +(persist something to the databse when the drain is done), which I'm not conceptually fond of. + +## Proof of Concept + +This RFC is accompanied by a POC which implements nearly everything mentioned here +apart from the optimizations and some of the failure handling: +https://github.com/neondatabase/neon/pull/7682 From 0497b99f3abbb95d07fd80727da5c565afd72e0a Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 2 Jul 2024 06:56:10 +0300 Subject: [PATCH 1091/1571] Check status of connection after PQconnectStartParams (#8210) ## Problem See https://github.com/neondatabase/cloud/issues/14289 ## Summary of changes Check connection status after calling PQconnectStartParams ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik --- pgxn/neon/libpagestore.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index a665cafafe..a3fdcc537e 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -432,7 +432,17 @@ pageserver_connect(shardno_t shard_no, int elevel) neon_shard_log(shard_no, elevel, "Failed to connect to pageserver: out of memory"); return false; } - + if (PQstatus(shard->conn) == CONNECTION_BAD) + { + char *msg = pchomp(PQerrorMessage(shard->conn)); + CLEANUP_AND_DISCONNECT(shard); + ereport(elevel, + (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), + errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no), + errdetail_internal("%s", msg))); + pfree(msg); + return false; + } shard->state = PS_Connecting_Startup; /* fallthrough */ } From 7dcdbaa25e00233f79199a30748e08f8b5d72c33 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 2 Jul 2024 12:53:08 +0200 Subject: [PATCH 1092/1571] remote_storage config: move handling of empty inline table `{}` to callers (#8193) Before this PR, `RemoteStorageConfig::from_toml` would support deserializing an empty `{}` TOML inline table to a `None`, otherwise try `Some()`. We can instead let * in proxy: let clap derive handle the Option * in PS & SK: assume that if the field is specified, it must be a valid RemtoeStorageConfig (This PR started with a much simpler goal of factoring out the `deserialize_item` function because I need that in another PR). 
--- Cargo.lock | 1 + libs/remote_storage/src/config.rs | 25 ++++++------------------- libs/utils/Cargo.toml | 1 + libs/utils/src/lib.rs | 2 ++ libs/utils/src/toml_edit_ext.rs | 22 ++++++++++++++++++++++ pageserver/ctl/src/main.rs | 2 +- pageserver/src/config.rs | 19 ++++++++++++++++--- proxy/src/bin/proxy.rs | 9 ++++----- proxy/src/config.rs | 8 ++------ proxy/src/context/parquet.rs | 15 ++++++--------- safekeeper/src/bin/safekeeper.rs | 13 ++----------- test_runner/fixtures/neon_fixtures.py | 4 +++- 12 files changed, 66 insertions(+), 55 deletions(-) create mode 100644 libs/utils/src/toml_edit_ext.rs diff --git a/Cargo.lock b/Cargo.lock index 5393538c59..6dae8e3403 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6811,6 +6811,7 @@ dependencies = [ "tokio-stream", "tokio-tar", "tokio-util", + "toml_edit 0.19.10", "tracing", "tracing-error", "tracing-subscriber", diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs index 8a8f6212e9..fa3f2cba58 100644 --- a/libs/remote_storage/src/config.rs +++ b/libs/remote_storage/src/config.rs @@ -1,6 +1,5 @@ use std::{fmt::Debug, num::NonZeroUsize, str::FromStr, time::Duration}; -use anyhow::bail; use aws_sdk_s3::types::StorageClass; use camino::Utf8PathBuf; @@ -176,20 +175,8 @@ fn serialize_storage_class( impl RemoteStorageConfig { pub const DEFAULT_TIMEOUT: Duration = std::time::Duration::from_secs(120); - pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result> { - let document: toml_edit::Document = match toml { - toml_edit::Item::Table(toml) => toml.clone().into(), - toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => { - toml.clone().into_table().into() - } - _ => bail!("toml not a table or inline table"), - }; - - if document.is_empty() { - return Ok(None); - } - - Ok(Some(toml_edit::de::from_document(document)?)) + pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result { + Ok(utils::toml_edit_ext::deserialize_item(toml)?) } } @@ -197,7 +184,7 @@ impl RemoteStorageConfig { mod tests { use super::*; - fn parse(input: &str) -> anyhow::Result> { + fn parse(input: &str) -> anyhow::Result { let toml = input.parse::().unwrap(); RemoteStorageConfig::from_toml(toml.as_item()) } @@ -207,7 +194,7 @@ mod tests { let input = "local_path = '.' 
timeout = '5s'"; - let config = parse(input).unwrap().expect("it exists"); + let config = parse(input).unwrap(); assert_eq!( config, @@ -229,7 +216,7 @@ timeout = '5s'"; timeout = '7s' "; - let config = parse(toml).unwrap().expect("it exists"); + let config = parse(toml).unwrap(); assert_eq!( config, @@ -257,7 +244,7 @@ timeout = '5s'"; timeout = '7s' "; - let config = parse(toml).unwrap().expect("it exists"); + let config = parse(toml).unwrap(); assert_eq!( config, diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index a6a081c5c1..261ca2cc1a 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -40,6 +40,7 @@ thiserror.workspace = true tokio.workspace = true tokio-tar.workspace = true tokio-util.workspace = true +toml_edit.workspace = true tracing.workspace = true tracing-error.workspace = true tracing-subscriber = { workspace = true, features = ["json", "registry"] } diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 2953f0aad4..2a397d97d2 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -94,6 +94,8 @@ pub mod env; pub mod poison; +pub mod toml_edit_ext; + /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages /// /// we have several cases: diff --git a/libs/utils/src/toml_edit_ext.rs b/libs/utils/src/toml_edit_ext.rs new file mode 100644 index 0000000000..ab5f7bdd95 --- /dev/null +++ b/libs/utils/src/toml_edit_ext.rs @@ -0,0 +1,22 @@ +#[derive(Debug, thiserror::Error)] +pub enum Error { + #[error("item is not a document")] + ItemIsNotADocument, + #[error(transparent)] + Serde(toml_edit::de::Error), +} + +pub fn deserialize_item(item: &toml_edit::Item) -> Result +where + T: serde::de::DeserializeOwned, +{ + let document: toml_edit::Document = match item { + toml_edit::Item::Table(toml) => toml.clone().into(), + toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => { + toml.clone().into_table().into() + } + _ => return Err(Error::ItemIsNotADocument), + }; + + toml_edit::de::from_document(document).map_err(Error::Serde) +} diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index 50c3ac4c61..ea09a011e5 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -178,7 +178,7 @@ async fn main() -> anyhow::Result<()> { let toml_item = toml_document .get("remote_storage") .expect("need remote_storage"); - let config = RemoteStorageConfig::from_toml(toml_item)?.expect("incomplete config"); + let config = RemoteStorageConfig::from_toml(toml_item)?; let storage = remote_storage::GenericRemoteStorage::from_config(&config); let cancel = CancellationToken::new(); storage diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index f36e63f035..2b698b75dc 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -159,7 +159,7 @@ pub mod defaults { #ephemeral_bytes_per_memory_kb = {DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB} -[remote_storage] +#[remote_storage] "# ); @@ -918,7 +918,7 @@ impl PageServerConf { "http_auth_type" => builder.http_auth_type(parse_toml_from_str(key, item)?), "pg_auth_type" => builder.pg_auth_type(parse_toml_from_str(key, item)?), "remote_storage" => { - builder.remote_storage_config(RemoteStorageConfig::from_toml(item)?) 
+ builder.remote_storage_config(Some(RemoteStorageConfig::from_toml(item).context("remote_storage")?)) } "tenant_config" => { t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?; @@ -946,7 +946,7 @@ impl PageServerConf { builder.metric_collection_endpoint(Some(endpoint)); }, "metric_collection_bucket" => { - builder.metric_collection_bucket(RemoteStorageConfig::from_toml(item)?) + builder.metric_collection_bucket(Some(RemoteStorageConfig::from_toml(item)?)) } "synthetic_size_calculation_interval" => builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?), @@ -1681,6 +1681,19 @@ threshold = "20m" } } + #[test] + fn empty_remote_storage_is_error() { + let tempdir = tempdir().unwrap(); + let (workdir, _) = prepare_fs(&tempdir).unwrap(); + let input = r#" +remote_storage = {} + "#; + let doc = toml_edit::Document::from_str(input).unwrap(); + let err = PageServerConf::parse_and_validate(&doc, &workdir) + .expect_err("empty remote_storage field should fail, don't specify it if you want no remote_storage"); + assert!(format!("{err}").contains("remote_storage"), "{err}"); + } + fn prepare_fs(tempdir: &Utf8TempDir) -> anyhow::Result<(Utf8PathBuf, Utf8PathBuf)> { let tempdir_path = tempdir.path(); diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index dffebf5580..7f4cb2c010 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -35,6 +35,7 @@ use proxy::usage_metrics; use anyhow::bail; use proxy::config::{self, ProxyConfig}; use proxy::serverless; +use remote_storage::RemoteStorageConfig; use std::net::SocketAddr; use std::pin::pin; use std::sync::Arc; @@ -205,8 +206,8 @@ struct ProxyCliArgs { /// remote storage configuration for backup metric collection /// Encoded as toml (same format as pageservers), eg /// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}` - #[clap(long, default_value = "{}")] - metric_backup_collection_remote_storage: String, + #[clap(long, value_parser = remote_storage_from_toml)] + metric_backup_collection_remote_storage: Option, /// chunk size for backup metric collection /// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression. #[clap(long, default_value = "4194304")] @@ -511,9 +512,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { } let backup_metric_collection_config = config::MetricBackupCollectionConfig { interval: args.metric_backup_collection_interval, - remote_storage_config: remote_storage_from_toml( - &args.metric_backup_collection_remote_storage, - )?, + remote_storage_config: args.metric_backup_collection_remote_storage.clone(), chunk_size: args.metric_backup_collection_chunk_size, }; diff --git a/proxy/src/config.rs b/proxy/src/config.rs index f4707a33aa..af5511d7ec 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -399,15 +399,11 @@ impl FromStr for EndpointCacheConfig { #[derive(Debug)] pub struct MetricBackupCollectionConfig { pub interval: Duration, - pub remote_storage_config: OptRemoteStorageConfig, + pub remote_storage_config: Option, pub chunk_size: usize, } -/// Hack to avoid clap being smarter. If you don't use this type alias, clap assumes more about the optional state and you get -/// runtime type errors from the value parser we use. 
-pub type OptRemoteStorageConfig = Option; - -pub fn remote_storage_from_toml(s: &str) -> anyhow::Result { +pub fn remote_storage_from_toml(s: &str) -> anyhow::Result { RemoteStorageConfig::from_toml(&s.parse()?) } diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index e72bf199e3..cfc1f8e89e 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -14,17 +14,14 @@ use parquet::{ record::RecordWriter, }; use pq_proto::StartupMessageParams; -use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; +use remote_storage::{GenericRemoteStorage, RemotePath, RemoteStorageConfig, TimeoutOrCancel}; use serde::ser::SerializeMap; use tokio::{sync::mpsc, time}; use tokio_util::sync::CancellationToken; use tracing::{debug, info, Span}; use utils::backoff; -use crate::{ - config::{remote_storage_from_toml, OptRemoteStorageConfig}, - context::LOG_CHAN_DISCONNECT, -}; +use crate::{config::remote_storage_from_toml, context::LOG_CHAN_DISCONNECT}; use super::{RequestMonitoring, LOG_CHAN}; @@ -33,11 +30,11 @@ pub struct ParquetUploadArgs { /// Storage location to upload the parquet files to. /// Encoded as toml (same format as pageservers), eg /// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}` - #[clap(long, default_value = "{}", value_parser = remote_storage_from_toml)] - parquet_upload_remote_storage: OptRemoteStorageConfig, + #[clap(long, value_parser = remote_storage_from_toml)] + parquet_upload_remote_storage: Option, - #[clap(long, default_value = "{}", value_parser = remote_storage_from_toml)] - parquet_upload_disconnect_events_remote_storage: OptRemoteStorageConfig, + #[clap(long, value_parser = remote_storage_from_toml)] + parquet_upload_disconnect_events_remote_storage: Option, /// How many rows to include in a row group #[clap(long, default_value_t = 8192)] diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index c81373c77c..d25b8722ac 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -12,7 +12,6 @@ use sd_notify::NotifyState; use tokio::runtime::Handle; use tokio::signal::unix::{signal, SignalKind}; use tokio::task::JoinError; -use toml_edit::Document; use utils::logging::SecretString; use std::env::{var, VarError}; @@ -126,7 +125,7 @@ struct Args { peer_recovery: bool, /// Remote storage configuration for WAL backup (offloading to s3) as TOML /// inline table, e.g. - /// {"max_concurrent_syncs" = 17, "max_sync_errors": 13, "bucket_name": "", "bucket_region":"", "concurrency_limit": 119} + /// {max_concurrent_syncs = 17, max_sync_errors = 13, bucket_name = "", bucket_region = "", concurrency_limit = 119} /// Safekeeper offloads WAL to /// [prefix_in_bucket/]//, mirroring /// structure on the file system. @@ -553,16 +552,8 @@ fn set_id(workdir: &Utf8Path, given_id: Option) -> Result { Ok(my_id) } -// Parse RemoteStorage from TOML table. 
fn parse_remote_storage(storage_conf: &str) -> anyhow::Result { - // funny toml doesn't consider plain inline table as valid document, so wrap in a key to parse - let storage_conf_toml = format!("remote_storage = {storage_conf}"); - let parsed_toml = storage_conf_toml.parse::()?; // parse - let (_, storage_conf_parsed_toml) = parsed_toml.iter().next().unwrap(); // and strip key off again - RemoteStorageConfig::from_toml(storage_conf_parsed_toml).and_then(|parsed_config| { - // XXX: Don't print the original toml here, there might be some sensitive data - parsed_config.context("Incorrectly parsed remote storage toml as no remote storage config") - }) + RemoteStorageConfig::from_toml(&storage_conf.parse()?) } #[test] diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index e1c8514351..565aaba6e0 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1167,7 +1167,9 @@ class NeonEnv: if config.auth_enabled: sk_cfg["auth_enabled"] = True if self.safekeepers_remote_storage is not None: - sk_cfg["remote_storage"] = self.safekeepers_remote_storage.to_toml_inline_table() + sk_cfg[ + "remote_storage" + ] = self.safekeepers_remote_storage.to_toml_inline_table().strip() self.safekeepers.append(Safekeeper(env=self, id=id, port=port)) cfg["safekeepers"].append(sk_cfg) From 1a0f545c16de5e105a3b22990ce0953e078ac1dc Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 2 Jul 2024 13:45:04 +0100 Subject: [PATCH 1093/1571] pageserver: simpler, stricter config error handling (#8177) ## Problem Tenant attachment has error paths for failures to write local configuration, but these types of local storage I/O errors should be considered fatal for the process. Related thread on an earlier PR that touched this code: https://github.com/neondatabase/neon/pull/7947#discussion_r1655134114 ## Summary of changes - Make errors writing tenant config fatal (abort process) - When reading tenant config, make all I/O errors except ENOENT fatal - Replace use of bare anyhow errors with `LoadConfigError` --- pageserver/src/http/routes.rs | 4 +- pageserver/src/tenant.rs | 78 ++++++------ pageserver/src/tenant/mgr.rs | 191 +++++++++++++--------------- test_runner/regress/test_tenants.py | 25 +++- 4 files changed, 154 insertions(+), 144 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 1fda2eaa85..f726ba115d 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -227,7 +227,7 @@ impl From for ApiError { BadRequest(e) => ApiError::BadRequest(e), Unavailable(_) => ApiError::ShuttingDown, e @ InProgress => ApiError::Conflict(format!("{e}")), - Flush(e) | Other(e) => ApiError::InternalServerError(e), + Flush(e) | InternalError(e) => ApiError::InternalServerError(e), } } } @@ -1296,7 +1296,7 @@ async fn update_tenant_config_handler( crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf) .await - .map_err(ApiError::InternalServerError)?; + .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; tenant.set_new_tenant_config(new_tenant_conf); json_response(StatusCode::OK, ()) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 3ffbaf98c6..116481a1eb 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -529,6 +529,15 @@ impl From for GcError { } } +#[derive(thiserror::Error, Debug)] +pub(crate) enum LoadConfigError { + #[error("TOML deserialization error: '{0}'")] + DeserializeToml(#[from] 
toml_edit::de::Error), + + #[error("Config not found at {0}")] + NotFound(Utf8PathBuf), +} + impl Tenant { /// Yet another helper for timeline initialization. /// @@ -2563,36 +2572,35 @@ impl Tenant { pub(super) fn load_tenant_config( conf: &'static PageServerConf, tenant_shard_id: &TenantShardId, - ) -> anyhow::Result { + ) -> Result { let config_path = conf.tenant_location_config_path(tenant_shard_id); - if config_path.exists() { - // New-style config takes precedence - let deserialized = Self::read_config(&config_path)?; - Ok(toml_edit::de::from_document::(deserialized)?) - } else { - // The config should almost always exist for a tenant directory: - // - When attaching a tenant, the config is the first thing we write - // - When detaching a tenant, we atomically move the directory to a tmp location - // before deleting contents. - // - // The very rare edge case that can result in a missing config is if we crash during attach - // between creating directory and writing config. Callers should handle that as if the - // directory didn't exist. - anyhow::bail!("tenant config not found in {}", config_path); - } - } - - fn read_config(path: &Utf8Path) -> anyhow::Result { - info!("loading tenant configuration from {path}"); + info!("loading tenant configuration from {config_path}"); // load and parse file - let config = fs::read_to_string(path) - .with_context(|| format!("Failed to load config from path '{path}'"))?; + let config = fs::read_to_string(&config_path).map_err(|e| { + match e.kind() { + std::io::ErrorKind::NotFound => { + // The config should almost always exist for a tenant directory: + // - When attaching a tenant, the config is the first thing we write + // - When detaching a tenant, we atomically move the directory to a tmp location + // before deleting contents. + // + // The very rare edge case that can result in a missing config is if we crash during attach + // between creating directory and writing config. Callers should handle that as if the + // directory didn't exist. - config - .parse::() - .with_context(|| format!("Failed to parse config from file '{path}' as toml file")) + LoadConfigError::NotFound(config_path) + } + _ => { + // No IO errors except NotFound are acceptable here: other kinds of error indicate local storage or permissions issues + // that we cannot cleanly recover + crate::virtual_file::on_fatal_io_error(&e, "Reading tenant config file") + } + } + })?; + + Ok(toml_edit::de::from_str::(&config)?) } #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))] @@ -2600,7 +2608,7 @@ impl Tenant { conf: &'static PageServerConf, tenant_shard_id: &TenantShardId, location_conf: &LocationConf, - ) -> anyhow::Result<()> { + ) -> std::io::Result<()> { let config_path = conf.tenant_location_config_path(tenant_shard_id); Self::persist_tenant_config_at(tenant_shard_id, &config_path, location_conf).await @@ -2611,7 +2619,7 @@ impl Tenant { tenant_shard_id: &TenantShardId, config_path: &Utf8Path, location_conf: &LocationConf, - ) -> anyhow::Result<()> { + ) -> std::io::Result<()> { debug!("persisting tenantconf to {config_path}"); let mut conf_content = r#"# This file contains a specific per-tenant's config. @@ -2620,22 +2628,20 @@ impl Tenant { .to_string(); fail::fail_point!("tenant-config-before-write", |_| { - anyhow::bail!("tenant-config-before-write"); + Err(std::io::Error::new( + std::io::ErrorKind::Other, + "tenant-config-before-write", + )) }); // Convert the config to a toml file. 
- conf_content += &toml_edit::ser::to_string_pretty(&location_conf)?; + conf_content += + &toml_edit::ser::to_string_pretty(&location_conf).expect("Config serialization failed"); let temp_path = path_with_suffix_extension(config_path, TEMP_FILE_SUFFIX); - let tenant_shard_id = *tenant_shard_id; - let config_path = config_path.to_owned(); let conf_content = conf_content.into_bytes(); - VirtualFile::crashsafe_overwrite(config_path.clone(), temp_path, conf_content) - .await - .with_context(|| format!("write tenant {tenant_shard_id} config to {config_path}"))?; - - Ok(()) + VirtualFile::crashsafe_overwrite(config_path.to_owned(), temp_path, conf_content).await } // diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 08c3f19b6f..c1da1d2c55 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -43,7 +43,8 @@ use crate::tenant::config::{ use crate::tenant::span::debug_assert_current_span_has_tenant_id; use crate::tenant::storage_layer::inmemory_layer; use crate::tenant::timeline::ShutdownMode; -use crate::tenant::{AttachedTenantConf, GcError, SpawnMode, Tenant, TenantState}; +use crate::tenant::{AttachedTenantConf, GcError, LoadConfigError, SpawnMode, Tenant, TenantState}; +use crate::virtual_file::MaybeFatalIo; use crate::{InitializationOrder, TEMP_FILE_SUFFIX}; use utils::crashsafe::path_with_suffix_extension; @@ -272,7 +273,7 @@ pub struct TenantManager { } fn emergency_generations( - tenant_confs: &HashMap>, + tenant_confs: &HashMap>, ) -> HashMap { tenant_confs .iter() @@ -296,7 +297,7 @@ fn emergency_generations( async fn init_load_generations( conf: &'static PageServerConf, - tenant_confs: &HashMap>, + tenant_confs: &HashMap>, resources: &TenantSharedResources, cancel: &CancellationToken, ) -> anyhow::Result>> { @@ -346,56 +347,32 @@ async fn init_load_generations( /// Given a directory discovered in the pageserver's tenants/ directory, attempt /// to load a tenant config from it. /// -/// If file is missing, return Ok(None) +/// If we cleaned up something expected (like an empty dir or a temp dir), return None. 
fn load_tenant_config( conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, dentry: Utf8DirEntry, -) -> anyhow::Result)>> { +) -> Option> { let tenant_dir_path = dentry.path().to_path_buf(); if crate::is_temporary(&tenant_dir_path) { info!("Found temporary tenant directory, removing: {tenant_dir_path}"); // No need to use safe_remove_tenant_dir_all because this is already // a temporary path - if let Err(e) = std::fs::remove_dir_all(&tenant_dir_path) { - error!( - "Failed to remove temporary directory '{}': {:?}", - tenant_dir_path, e - ); - } - return Ok(None); + std::fs::remove_dir_all(&tenant_dir_path).fatal_err("Deleting temporary tenant dir"); + return None; } // This case happens if we crash during attachment before writing a config into the dir let is_empty = tenant_dir_path .is_empty_dir() - .with_context(|| format!("Failed to check whether {tenant_dir_path:?} is an empty dir"))?; + .fatal_err("Checking for empty tenant dir"); if is_empty { info!("removing empty tenant directory {tenant_dir_path:?}"); - if let Err(e) = std::fs::remove_dir(&tenant_dir_path) { - error!( - "Failed to remove empty tenant directory '{}': {e:#}", - tenant_dir_path - ) - } - return Ok(None); + std::fs::remove_dir(&tenant_dir_path).fatal_err("Deleting empty tenant dir"); + return None; } - let tenant_shard_id = match tenant_dir_path - .file_name() - .unwrap_or_default() - .parse::() - { - Ok(id) => id, - Err(_) => { - warn!("Invalid tenant path (garbage in our repo directory?): {tenant_dir_path}",); - return Ok(None); - } - }; - - Ok(Some(( - tenant_shard_id, - Tenant::load_tenant_config(conf, &tenant_shard_id), - ))) + Some(Tenant::load_tenant_config(conf, &tenant_shard_id)) } /// Initial stage of load: walk the local tenants directory, clean up any temp files, @@ -405,32 +382,51 @@ fn load_tenant_config( /// seconds even on reasonably fast drives. async fn init_load_tenant_configs( conf: &'static PageServerConf, -) -> anyhow::Result>> { +) -> HashMap> { let tenants_dir = conf.tenants_path(); - let dentries = tokio::task::spawn_blocking(move || -> anyhow::Result> { - let dir_entries = tenants_dir - .read_dir_utf8() - .with_context(|| format!("Failed to list tenants dir {tenants_dir:?}"))?; + let dentries = tokio::task::spawn_blocking(move || -> Vec { + let context = format!("Reading tenants dir {tenants_dir}"); + let dir_entries = tenants_dir.read_dir_utf8().fatal_err(&context); - Ok(dir_entries.collect::, std::io::Error>>()?) + dir_entries + .collect::, std::io::Error>>() + .fatal_err(&context) }) - .await??; + .await + .expect("Config load task panicked"); let mut configs = HashMap::new(); let mut join_set = JoinSet::new(); for dentry in dentries { - join_set.spawn_blocking(move || load_tenant_config(conf, dentry)); + let tenant_shard_id = match dentry.file_name().parse::() { + Ok(id) => id, + Err(_) => { + warn!( + "Invalid tenant path (garbage in our repo directory?): '{}'", + dentry.file_name() + ); + continue; + } + }; + + join_set.spawn_blocking(move || { + ( + tenant_shard_id, + load_tenant_config(conf, tenant_shard_id, dentry), + ) + }); } while let Some(r) = join_set.join_next().await { - if let Some((tenant_id, tenant_config)) = r?? 
{ - configs.insert(tenant_id, tenant_config); + let (tenant_shard_id, tenant_config) = r.expect("Panic in config load task"); + if let Some(tenant_config) = tenant_config { + configs.insert(tenant_shard_id, tenant_config); } } - Ok(configs) + configs } #[derive(Debug, thiserror::Error)] @@ -472,7 +468,7 @@ pub async fn init_tenant_mgr( ); // Scan local filesystem for attached tenants - let tenant_configs = init_load_tenant_configs(conf).await?; + let tenant_configs = init_load_tenant_configs(conf).await; // Determine which tenants are to be secondary or attached, and in which generation let tenant_modes = init_load_generations(conf, &tenant_configs, &resources, &cancel).await?; @@ -590,31 +586,23 @@ pub async fn init_tenant_mgr( ); // For those shards that have live configurations, construct `Tenant` or `SecondaryTenant` objects and start them running for (tenant_shard_id, location_conf, config_write_result) in config_write_results { - // Errors writing configs are fatal - config_write_result?; + // Writing a config to local disk is foundational to startup up tenants: panic if we can't. + config_write_result.fatal_err("writing tenant shard config file"); let tenant_dir_path = conf.tenant_path(&tenant_shard_id); let shard_identity = location_conf.shard; let slot = match location_conf.mode { - LocationMode::Attached(attached_conf) => { - match tenant_spawn( - conf, - tenant_shard_id, - &tenant_dir_path, - resources.clone(), - AttachedTenantConf::new(location_conf.tenant_conf, attached_conf), - shard_identity, - Some(init_order.clone()), - SpawnMode::Lazy, - &ctx, - ) { - Ok(tenant) => TenantSlot::Attached(tenant), - Err(e) => { - error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}"); - continue; - } - } - } + LocationMode::Attached(attached_conf) => TenantSlot::Attached(tenant_spawn( + conf, + tenant_shard_id, + &tenant_dir_path, + resources.clone(), + AttachedTenantConf::new(location_conf.tenant_conf, attached_conf), + shard_identity, + Some(init_order.clone()), + SpawnMode::Lazy, + &ctx, + )), LocationMode::Secondary(secondary_conf) => { info!( tenant_id = %tenant_shard_id.tenant_id, @@ -649,8 +637,7 @@ pub async fn init_tenant_mgr( }) } -/// Wrapper for Tenant::spawn that checks invariants before running, and inserts -/// a broken tenant in the map if Tenant::spawn fails. +/// Wrapper for Tenant::spawn that checks invariants before running #[allow(clippy::too_many_arguments)] fn tenant_spawn( conf: &'static PageServerConf, @@ -662,23 +649,18 @@ fn tenant_spawn( init_order: Option, mode: SpawnMode, ctx: &RequestContext, -) -> anyhow::Result> { - anyhow::ensure!( - tenant_path.is_dir(), - "Cannot load tenant from path {tenant_path:?}, it either does not exist or not a directory" - ); - anyhow::ensure!( - !crate::is_temporary(tenant_path), - "Cannot load tenant from temporary path {tenant_path:?}" - ); - anyhow::ensure!( - !tenant_path.is_empty_dir().with_context(|| { - format!("Failed to check whether {tenant_path:?} is an empty dir") - })?, - "Cannot load tenant from empty directory {tenant_path:?}" - ); +) -> Arc { + // All these conditions should have been satisfied by our caller: the tenant dir exists, is a well formed + // path, and contains a configuration file. Assertions that do synchronous I/O are limited to debug mode + // to avoid impacting prod runtime performance. 
+ assert!(!crate::is_temporary(tenant_path)); + debug_assert!(tenant_path.is_dir()); + debug_assert!(conf + .tenant_location_config_path(&tenant_shard_id) + .try_exists() + .unwrap()); - let tenant = Tenant::spawn( + Tenant::spawn( conf, tenant_shard_id, resources, @@ -687,9 +669,7 @@ fn tenant_spawn( init_order, mode, ctx, - ); - - Ok(tenant) + ) } async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { @@ -840,8 +820,9 @@ pub(crate) enum UpsertLocationError { #[error("Failed to flush: {0}")] Flush(anyhow::Error), + /// This error variant is for unexpected situations (soft assertions) where the system is in an unexpected state. #[error("Internal error: {0}")] - Other(#[from] anyhow::Error), + InternalError(anyhow::Error), } impl TenantManager { @@ -971,7 +952,8 @@ impl TenantManager { match fast_path_taken { Some(FastPathModified::Attached(tenant)) => { Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) - .await?; + .await + .fatal_err("writing tenant shard config"); // Transition to AttachedStale means we may well hold a valid generation // still, and have been requested to go stale as part of a migration. If @@ -1001,7 +983,8 @@ impl TenantManager { } Some(FastPathModified::Secondary(_secondary_tenant)) => { Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) - .await?; + .await + .fatal_err("writing tenant shard config"); return Ok(None); } @@ -1067,7 +1050,7 @@ impl TenantManager { Some(TenantSlot::InProgress(_)) => { // This should never happen: acquire_slot should error out // if the contents of a slot were InProgress. - return Err(UpsertLocationError::Other(anyhow::anyhow!( + return Err(UpsertLocationError::InternalError(anyhow::anyhow!( "Acquired an InProgress slot, this is a bug." ))); } @@ -1086,12 +1069,14 @@ impl TenantManager { // Does not need to be fsync'd because local storage is just a cache. tokio::fs::create_dir_all(&timelines_path) .await - .with_context(|| format!("Creating {timelines_path}"))?; + .fatal_err("creating timelines/ dir"); // Before activating either secondary or attached mode, persist the // configuration, so that on restart we will re-attach (or re-start // secondary) on the tenant. - Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config).await?; + Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) + .await + .fatal_err("writing tenant shard config"); let new_slot = match &new_location_config.mode { LocationMode::Secondary(secondary_config) => { @@ -1110,13 +1095,15 @@ impl TenantManager { // from upserts. This enables creating generation-less tenants even though neon_local // always uses generations when calling the location conf API. let attached_conf = if cfg!(feature = "testing") { - let mut conf = AttachedTenantConf::try_from(new_location_config)?; + let mut conf = AttachedTenantConf::try_from(new_location_config) + .map_err(UpsertLocationError::BadRequest)?; if self.conf.control_plane_api.is_none() { conf.location.generation = Generation::none(); } conf } else { - AttachedTenantConf::try_from(new_location_config)? + AttachedTenantConf::try_from(new_location_config) + .map_err(UpsertLocationError::BadRequest)? 
}; let tenant = tenant_spawn( @@ -1129,7 +1116,7 @@ impl TenantManager { None, spawn_mode, ctx, - )?; + ); TenantSlot::Attached(tenant) } @@ -1143,7 +1130,7 @@ impl TenantManager { match slot_guard.upsert(new_slot) { Err(TenantSlotUpsertError::InternalError(e)) => { - Err(UpsertLocationError::Other(anyhow::anyhow!(e))) + Err(UpsertLocationError::InternalError(anyhow::anyhow!(e))) } Err(TenantSlotUpsertError::MapState(e)) => Err(UpsertLocationError::Unavailable(e)), Err(TenantSlotUpsertError::ShuttingDown((new_slot, _completion))) => { @@ -1250,7 +1237,7 @@ impl TenantManager { None, SpawnMode::Eager, ctx, - )?; + ); slot_guard.upsert(TenantSlot::Attached(tenant))?; @@ -1984,7 +1971,7 @@ impl TenantManager { None, SpawnMode::Eager, ctx, - )?; + ); slot_guard.upsert(TenantSlot::Attached(tenant))?; diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 93e9ad3673..3705406c2f 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -41,18 +41,35 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv): neon_simple_env.storage_controller.allowed_errors.extend(error_regexes) pageserver_http = neon_simple_env.pageserver.http_client() + + # Failure to write a config to local disk makes the pageserver assume that local disk is bad and abort the process pageserver_http.configure_failpoints(("tenant-config-before-write", "return")) - with pytest.raises(Exception, match="tenant-config-before-write"): + + # Storage controller will see a torn TCP connection when the crash point is reached, and follow an unclean 500 error path + neon_simple_env.storage_controller.allowed_errors.extend( + [ + ".*Reconcile not done yet while creating tenant.*", + ".*Reconcile error: receive body: error sending request.*", + ".*Error processing HTTP request: InternalServerError.*", + ] + ) + + with pytest.raises(Exception, match="error sending request"): _ = neon_simple_env.neon_cli.create_tenant() + # Any files left behind on disk during failed creation do not prevent + # a retry from succeeding. Restart pageserver with no failpoints. + neon_simple_env.pageserver.running = False + neon_simple_env.pageserver.start() + + # The failed creation should not be present in list of tenants, as when we start up we'll see + # an empty tenant dir with no config in it. + neon_simple_env.pageserver.allowed_errors.append(".*Failed to load tenant config.*") new_tenants = sorted( map(lambda t: t.split()[0], neon_simple_env.neon_cli.list_tenants().stdout.splitlines()) ) assert initial_tenants == new_tenants, "should not create new tenants" - # Any files left behind on disk during failed creation do not prevent - # a retry from succeeding. 
- pageserver_http.configure_failpoints(("tenant-config-before-write", "off")) neon_simple_env.neon_cli.create_tenant() From 9b4b4bbf6f4b801250ea3b683cc91a626392d12f Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 2 Jul 2024 15:13:27 +0200 Subject: [PATCH 1094/1571] fix: noisy logging when download gets cancelled during shutdown (#8224) Before this PR, during timeline shutdown, we'd occasionally see log lines like this one: ``` 2024-06-26T18:28:11.063402Z INFO initial_size_calculation{tenant_id=$TENANT,shard_id=0000 timeline_id=$TIMELINE}:logical_size_calculation_task:get_or_maybe_download{layer=000000000000000000000000000000000000-000000067F0001A3950001C1630100000000__0000000D88265898}: layer file download failed, and caller has been cancelled: Cancelled, shutting down Stack backtrace: 0: as core::ops::try_trait::FromResidual>>::from_residual at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/core/src/result.rs:1964:27 pageserver::tenant::remote_timeline_client::RemoteTimelineClient::download_layer_file::{{closure}} at /home/nonroot/pageserver/src/tenant/remote_timeline_client.rs:531:13 pageserver::tenant::storage_layer::layer::LayerInner::download_and_init::{{closure}} at /home/nonroot/pageserver/src/tenant/storage_layer/layer.rs:1136:14 pageserver::tenant::storage_layer::layer::LayerInner::download_init_and_wait::{{closure}}::{{closure}} at /home/nonroot/pageserver/src/tenant/storage_layer/layer.rs:1082:74 ``` We can eliminate the anyhow backtrace with no loss of information because the conversion to anyhow::Error happens in exactly one place. refs #7427 --- pageserver/src/tenant/remote_timeline_client.rs | 2 +- pageserver/src/tenant/storage_layer/layer.rs | 17 ++++------------- 2 files changed, 5 insertions(+), 14 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index e33e4b84aa..bc9364de61 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -519,7 +519,7 @@ impl RemoteTimelineClient { local_path: &Utf8Path, cancel: &CancellationToken, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { let downloaded_size = { let _unfinished_gauge_guard = self.metrics.call_begin( &RemoteOpFileKind::Layer, diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 5dd9472535..02069c29d2 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1096,19 +1096,10 @@ impl LayerInner { match rx.await { Ok(Ok(res)) => Ok(res), - Ok(Err(e)) => { - // sleep already happened in the spawned task, if it was not cancelled - match e.downcast_ref::() { - // If the download failed due to its cancellation token, - // propagate the cancellation error upstream. - Some(remote_storage::DownloadError::Cancelled) => { - Err(DownloadError::DownloadCancelled) - } - // FIXME: this is not embedding the error because historically it would had - // been output to compute, however that is no longer the case. 
- _ => Err(DownloadError::DownloadFailed), - } + Ok(Err(remote_storage::DownloadError::Cancelled)) => { + Err(DownloadError::DownloadCancelled) } + Ok(Err(_)) => Err(DownloadError::DownloadFailed), Err(_gone) => Err(DownloadError::DownloadCancelled), } } @@ -1118,7 +1109,7 @@ impl LayerInner { timeline: Arc, permit: heavier_once_cell::InitPermit, ctx: &RequestContext, - ) -> anyhow::Result> { + ) -> Result, remote_storage::DownloadError> { let result = timeline .remote_client .download_layer_file( From 28929d9cfa03a003cc96925458a434ac31ec8f27 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 2 Jul 2024 14:14:10 +0100 Subject: [PATCH 1095/1571] pageserver: rate limit log for loads of layers visited (#8228) ## Problem At high percentiles we see more than 800 layers being visited by the read path. We need the tenant/timeline to investigate. ## Summary of changes Add a rate limited log line when the average number of layers visited per key is in the last specified histogram bucket. I plan to use this to identify tenants in us-east-2 staging that exhibit this behaviour. Will revert before next week's release. --- libs/pageserver_api/src/keyspace.rs | 10 ++++++++++ pageserver/src/tenant/timeline.rs | 22 +++++++++++++++++++--- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index 9a61f2ad81..401887d362 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -17,6 +17,16 @@ pub struct KeySpace { pub ranges: Vec>, } +impl std::fmt::Display for KeySpace { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "[")?; + for range in &self.ranges { + write!(f, "{}..{},", range.start, range.end)?; + } + write!(f, "]") + } +} + /// A wrapper type for sparse keyspaces. #[derive(Clone, Debug, Default, PartialEq, Eq)] pub struct SparseKeySpace(pub KeySpace); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 8dd0a23f46..ec94ed3a56 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -996,6 +996,7 @@ impl Timeline { } pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32; + pub(crate) const VEC_GET_LAYERS_VISITED_WARN_THRESH: f64 = 512.0; /// Look up multiple page versions at a given LSN /// @@ -1228,7 +1229,7 @@ impl Timeline { let get_data_timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME .for_get_kind(get_kind) .start_timer(); - self.get_vectored_reconstruct_data(keyspace, lsn, reconstruct_state, ctx) + self.get_vectored_reconstruct_data(keyspace.clone(), lsn, reconstruct_state, ctx) .await?; get_data_timer.stop_and_record(); @@ -1258,11 +1259,26 @@ impl Timeline { // (this is a requirement, not a bug). Skip updating the metric in these cases // to avoid infinite results. if !results.is_empty() { + let avg = layers_visited as f64 / results.len() as f64; + if avg >= Self::VEC_GET_LAYERS_VISITED_WARN_THRESH { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); + let mut rate_limit = LOGGED.lock().unwrap(); + rate_limit.call(|| { + tracing::info!( + tenant_id = %self.tenant_shard_id.tenant_id, + shard_id = %self.tenant_shard_id.shard_slug(), + timeline_id = %self.timeline_id, + "Vectored read for {} visited {} layers on average per key and {} in total. {}/{} pages were returned", + keyspace, avg, layers_visited, results.len(), keyspace.total_raw_size()); + }); + } + // Note that this is an approximation. 
Tracking the exact number of layers visited // per key requires virtually unbounded memory usage and is inefficient // (i.e. segment tree tracking each range queried from a layer) - crate::metrics::VEC_READ_NUM_LAYERS_VISITED - .observe(layers_visited as f64 / results.len() as f64); + crate::metrics::VEC_READ_NUM_LAYERS_VISITED.observe(avg); } Ok(results) From 25eefdeb1fe2f217ec4e3b8f4d2dff9fd702ab60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 2 Jul 2024 16:14:12 +0200 Subject: [PATCH 1096/1571] Add support for reading and writing compressed blobs (#8106) Add support for reading and writing zstd-compressed blobs for use in image layer generation, but maybe one day useful also for delta layers. The reading of them is unconditional while the writing is controlled by the `image_compression` config variable allowing for experiments. For the on-disk format, we re-use some of the bitpatterns we currently keep reserved for blobs larger than 256 MiB. This assumes that we have never ever written any such large blobs to image layers. After the preparation in #7852, we now are unable to read blobs with a size larger than 256 MiB (or write them). A non-goal of this PR is to come up with good heuristics of when to compress a bitpattern. This is left for future work. Parts of the PR were inspired by #7091. cc #7879 Part of #5431 --- libs/pageserver_api/src/models.rs | 18 ++ pageserver/src/config.rs | 21 ++- pageserver/src/tenant/blob_io.rs | 155 +++++++++++++++--- .../src/tenant/storage_layer/delta_layer.rs | 7 +- 4 files changed, 177 insertions(+), 24 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 61a255cdbc..959e161c16 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -432,6 +432,24 @@ pub enum CompactionAlgorithm { Tiered, } +#[derive( + Debug, + Clone, + Copy, + PartialEq, + Eq, + Serialize, + Deserialize, + strum_macros::FromRepr, + strum_macros::EnumString, +)] +#[strum(serialize_all = "kebab-case")] +pub enum ImageCompressionAlgorithm { + /// Zstandard compression. Level 0 means and None mean the same (default level). Levels can be negative as well. + /// For details, see the [manual](http://facebook.github.io/zstd/zstd_manual.html). + Zstd { level: Option }, +} + #[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)] pub struct CompactionAlgorithmSettings { pub kind: CompactionAlgorithm, diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 2b698b75dc..470e941c33 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -5,7 +5,7 @@ //! See also `settings.md` for better description on every parameter. 
use anyhow::{anyhow, bail, ensure, Context, Result}; -use pageserver_api::shard::TenantShardId; +use pageserver_api::{models::ImageCompressionAlgorithm, shard::TenantShardId}; use remote_storage::{RemotePath, RemoteStorageConfig}; use serde; use serde::de::IntoDeserializer; @@ -50,6 +50,7 @@ pub mod defaults { DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR, DEFAULT_PG_LISTEN_PORT, }; + use pageserver_api::models::ImageCompressionAlgorithm; pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT; pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s"; @@ -90,6 +91,8 @@ pub mod defaults { pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB + pub const DEFAULT_IMAGE_COMPRESSION: Option = None; + pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true; pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0; @@ -285,6 +288,8 @@ pub struct PageServerConf { pub validate_vectored_get: bool, + pub image_compression: Option, + /// How many bytes of ephemeral layer content will we allow per kilobyte of RAM. When this /// is exceeded, we start proactively closing ephemeral layers to limit the total amount /// of ephemeral data. @@ -395,6 +400,8 @@ struct PageServerConfigBuilder { validate_vectored_get: BuilderValue, + image_compression: BuilderValue>, + ephemeral_bytes_per_memory_kb: BuilderValue, } @@ -482,6 +489,7 @@ impl PageServerConfigBuilder { max_vectored_read_bytes: Set(MaxVectoredReadBytes( NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(), )), + image_compression: Set(DEFAULT_IMAGE_COMPRESSION), validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET), ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), } @@ -667,6 +675,10 @@ impl PageServerConfigBuilder { self.validate_vectored_get = BuilderValue::Set(value); } + pub fn get_image_compression(&mut self, value: Option) { + self.image_compression = BuilderValue::Set(value); + } + pub fn get_ephemeral_bytes_per_memory_kb(&mut self, value: usize) { self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value); } @@ -727,6 +739,7 @@ impl PageServerConfigBuilder { get_impl, max_vectored_read_bytes, validate_vectored_get, + image_compression, ephemeral_bytes_per_memory_kb, } CUSTOM LOGIC @@ -1004,6 +1017,9 @@ impl PageServerConf { "validate_vectored_get" => { builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?) } + "image_compression" => { + builder.get_image_compression(Some(parse_toml_from_str("image_compression", item)?)) + } "ephemeral_bytes_per_memory_kb" => { builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? 
as usize) } @@ -1088,6 +1104,7 @@ impl PageServerConf { NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) .expect("Invalid default constant"), ), + image_compression: defaults::DEFAULT_IMAGE_COMPRESSION, validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, } @@ -1328,6 +1345,7 @@ background_task_maximum_delay = '334 s' .expect("Invalid default constant") ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, + image_compression: defaults::DEFAULT_IMAGE_COMPRESSION, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, }, "Correct defaults should be used when no config values are provided" @@ -1401,6 +1419,7 @@ background_task_maximum_delay = '334 s' .expect("Invalid default constant") ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, + image_compression: defaults::DEFAULT_IMAGE_COMPRESSION, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, }, "Should be able to parse all basic config values correctly" diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 2be8816cef..022801b17f 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -6,12 +6,18 @@ //! is written as a one byte. If it's larger than that, the length //! is written as a four-byte integer, in big-endian, with the high //! bit set. This way, we can detect whether it's 1- or 4-byte header -//! by peeking at the first byte. +//! by peeking at the first byte. For blobs larger than 128 bits, +//! we also specify three reserved bits, only one of the three bit +//! patterns is currently in use (0b011) and signifies compression +//! with zstd. //! //! len < 128: 0XXXXXXX -//! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX +//! len >= 128: 1CCCXXXX XXXXXXXX XXXXXXXX XXXXXXXX //! 
+use async_compression::Level; use bytes::{BufMut, BytesMut}; +use pageserver_api::models::ImageCompressionAlgorithm; +use tokio::io::AsyncWriteExt; use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; use crate::context::RequestContext; @@ -66,12 +72,29 @@ impl<'a> BlockCursor<'a> { len_buf.copy_from_slice(&buf[off..off + 4]); off += 4; } - len_buf[0] &= 0x7f; + len_buf[0] &= !LEN_COMPRESSION_BIT_MASK; u32::from_be_bytes(len_buf) as usize }; + let compression_bits = first_len_byte & LEN_COMPRESSION_BIT_MASK; - dstbuf.clear(); - dstbuf.reserve(len); + let mut tmp_buf = Vec::new(); + let buf_to_write; + let compression = if compression_bits <= BYTE_UNCOMPRESSED { + buf_to_write = dstbuf; + None + } else if compression_bits == BYTE_ZSTD { + buf_to_write = &mut tmp_buf; + Some(dstbuf) + } else { + let error = std::io::Error::new( + std::io::ErrorKind::InvalidData, + format!("invalid compression byte {compression_bits:x}"), + ); + return Err(error); + }; + + buf_to_write.clear(); + buf_to_write.reserve(len); // Read the payload let mut remain = len; @@ -85,14 +108,35 @@ impl<'a> BlockCursor<'a> { page_remain = PAGE_SZ; } let this_blk_len = min(remain, page_remain); - dstbuf.extend_from_slice(&buf[off..off + this_blk_len]); + buf_to_write.extend_from_slice(&buf[off..off + this_blk_len]); remain -= this_blk_len; off += this_blk_len; } + + if let Some(dstbuf) = compression { + if compression_bits == BYTE_ZSTD { + let mut decoder = async_compression::tokio::write::ZstdDecoder::new(dstbuf); + decoder.write_all(buf_to_write).await?; + decoder.flush().await?; + } else { + unreachable!("already checked above") + } + } + Ok(()) } } +/// Reserved bits for length and compression +const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0; + +/// The maximum size of blobs we support. The highest few bits +/// are reserved for compression and other further uses. +const MAX_SUPPORTED_LEN: usize = 0x0fff_ffff; + +const BYTE_UNCOMPRESSED: u8 = 0x80; +const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10; + /// A wrapper of `VirtualFile` that allows users to write blobs. /// /// If a `BlobWriter` is dropped, the internal buffer will be @@ -219,6 +263,17 @@ impl BlobWriter { &mut self, srcbuf: B, ctx: &RequestContext, + ) -> (B::Buf, Result) { + self.write_blob_maybe_compressed(srcbuf, ctx, None).await + } + + /// Write a blob of data. Returns the offset that it was written to, + /// which can be used to retrieve the data later. + pub async fn write_blob_maybe_compressed, Buf: IoBuf + Send>( + &mut self, + srcbuf: B, + ctx: &RequestContext, + algorithm: Option, ) -> (B::Buf, Result) { let offset = self.offset; @@ -226,29 +281,58 @@ impl BlobWriter { let mut io_buf = self.io_buf.take().expect("we always put it back below"); io_buf.clear(); - let (io_buf, hdr_res) = async { + let mut compressed_buf = None; + let ((io_buf, hdr_res), srcbuf) = async { if len < 128 { // Short blob. 
Write a 1-byte length header io_buf.put_u8(len as u8); - self.write_all(io_buf, ctx).await + ( + self.write_all(io_buf, ctx).await, + srcbuf.slice_full().into_inner(), + ) } else { // Write a 4-byte length header - if len > 0x7fff_ffff { + if len > MAX_SUPPORTED_LEN { return ( - io_buf, - Err(Error::new( - ErrorKind::Other, - format!("blob too large ({len} bytes)"), - )), + ( + io_buf, + Err(Error::new( + ErrorKind::Other, + format!("blob too large ({len} bytes)"), + )), + ), + srcbuf.slice_full().into_inner(), ); } - if len > 0x0fff_ffff { - tracing::warn!("writing blob above future limit ({len} bytes)"); - } - let mut len_buf = (len as u32).to_be_bytes(); - len_buf[0] |= 0x80; + let (high_bit_mask, len_written, srcbuf) = match algorithm { + Some(ImageCompressionAlgorithm::Zstd { level }) => { + let mut encoder = if let Some(level) = level { + async_compression::tokio::write::ZstdEncoder::with_quality( + Vec::new(), + Level::Precise(level.into()), + ) + } else { + async_compression::tokio::write::ZstdEncoder::new(Vec::new()) + }; + let slice = srcbuf.slice_full(); + encoder.write_all(&slice[..]).await.unwrap(); + encoder.shutdown().await.unwrap(); + let compressed = encoder.into_inner(); + if compressed.len() < len { + let compressed_len = compressed.len(); + compressed_buf = Some(compressed); + (BYTE_ZSTD, compressed_len, slice.into_inner()) + } else { + (BYTE_UNCOMPRESSED, len, slice.into_inner()) + } + } + None => (BYTE_UNCOMPRESSED, len, srcbuf.slice_full().into_inner()), + }; + let mut len_buf = (len_written as u32).to_be_bytes(); + assert_eq!(len_buf[0] & 0xf0, 0); + len_buf[0] |= high_bit_mask; io_buf.extend_from_slice(&len_buf[..]); - self.write_all(io_buf, ctx).await + (self.write_all(io_buf, ctx).await, srcbuf) } } .await; @@ -257,7 +341,12 @@ impl BlobWriter { Ok(_) => (), Err(e) => return (Slice::into_inner(srcbuf.slice(..)), Err(e)), } - let (srcbuf, res) = self.write_all(srcbuf, ctx).await; + let (srcbuf, res) = if let Some(compressed_buf) = compressed_buf { + let (_buf, res) = self.write_all(compressed_buf, ctx).await; + (Slice::into_inner(srcbuf.slice(..)), res) + } else { + self.write_all(srcbuf, ctx).await + }; (srcbuf, res.map(|_| offset)) } } @@ -295,6 +384,12 @@ mod tests { use rand::{Rng, SeedableRng}; async fn round_trip_test(blobs: &[Vec]) -> Result<(), Error> { + round_trip_test_compressed::(blobs).await + } + + async fn round_trip_test_compressed( + blobs: &[Vec], + ) -> Result<(), Error> { let temp_dir = camino_tempfile::tempdir()?; let pathbuf = temp_dir.path().join("file"); let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); @@ -305,7 +400,18 @@ mod tests { let file = VirtualFile::create(pathbuf.as_path(), &ctx).await?; let mut wtr = BlobWriter::::new(file, 0); for blob in blobs.iter() { - let (_, res) = wtr.write_blob(blob.clone(), &ctx).await; + let (_, res) = match COMPRESSION { + 0 => wtr.write_blob(blob.clone(), &ctx).await, + 1 => { + wtr.write_blob_maybe_compressed( + blob.clone(), + &ctx, + Some(ImageCompressionAlgorithm::Zstd { level: Some(1) }), + ) + .await + } + _ => unreachable!("Invalid compression {COMPRESSION}"), + }; let offs = res?; offsets.push(offs); } @@ -361,10 +467,15 @@ mod tests { let blobs = &[ b"test".to_vec(), random_array(10 * PAGE_SZ), + b"hello".to_vec(), + random_array(66 * PAGE_SZ), + vec![0xf3; 24 * PAGE_SZ], b"foobar".to_vec(), ]; round_trip_test::(blobs).await?; round_trip_test::(blobs).await?; + round_trip_test_compressed::(blobs).await?; + round_trip_test_compressed::(blobs).await?; Ok(()) } diff 
--git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index c2d4a2776b..e6a4d6d5c4 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -452,7 +452,12 @@ impl DeltaLayerWriterInner {
         ctx: &RequestContext,
     ) -> (Vec<u8>, anyhow::Result<()>) {
         assert!(self.lsn_range.start <= lsn);
-        let (val, res) = self.blob_writer.write_blob(val, ctx).await;
+        // We don't want to use compression in delta layer creation
+        let compression = None;
+        let (val, res) = self
+            .blob_writer
+            .write_blob_maybe_compressed(val, ctx, compression)
+            .await;
         let off = match res {
             Ok(off) => off,
             Err(e) => return (val, Err(anyhow::anyhow!(e))),

From 5de896e7d890271362966ebb6a42f16b5b8cd966 Mon Sep 17 00:00:00 2001
From: Christian Schwarz
Date: Tue, 2 Jul 2024 16:29:09 +0200
Subject: [PATCH 1097/1571] L0 flush: opt-in mechanism to bypass PageCache reads and writes (#8190)

part of https://github.com/neondatabase/neon/issues/7418

# Motivation

(reproducing #7418)

When we do an `InMemoryLayer::write_to_disk`, there is a tremendous
amount of random read I/O, as deltas from the ephemeral file (written in
LSN order) are written out to the delta layer in key order.

In benchmarks (https://github.com/neondatabase/neon/pull/7409) we can see
that this delta layer writing phase is substantially more expensive than
the initial ingest of data, and that within the delta layer write a
significant amount of the CPU time is spent traversing the page cache.

# High-Level Changes

Add a new mode for L0 flush that works as follows:

* Read the full ephemeral file into memory -- layers are much smaller
  than total memory, so this is affordable
* Do all the random reads directly from this in-memory buffer instead of
  using blob IO/page cache/disk reads.
* Add a semaphore to limit how many timelines may concurrently do this
  (limit peak memory).
* Make the semaphore configurable via PS config.

# Implementation Details

The new `BlobReaderRef::Slice` is a temporary hack until we can ditch
`blob_io` for `InMemoryLayer` => Plan for this is laid out in
https://github.com/neondatabase/neon/issues/8183

# Correctness

The correctness of this change is quite obvious to me: we do what we did
before (`blob_io`) but read from memory instead of going to disk.

The highest bug potential is in doing owned-buffers IO. I refactored the
API a bit in preliminary PR https://github.com/neondatabase/neon/pull/8186
to make it less error-prone, but still, careful review is requested.

# Performance

I manually measured single-client ingest performance from `pgbench -i ...`.

Full report:
https://neondatabase.notion.site/2024-06-28-benchmarking-l0-flush-performance-e98cff3807f94cb38f2054d8c818fe84?pvs=4

tl;dr:

* no speed improvements during ingest, but
* significantly lower pressure on PS PageCache (eviction rate drops to 1/3)
  * (that's why I'm working on this)
* noticeable but modestly lower CPU time

This is good enough for merging this PR because the changes require
opt-in. We'll do more testing in staging & pre-prod.

# Stability / Monitoring

**memory consumption**: there's no _hard_ limit on max `InMemoryLayer`
size (aka "checkpoint distance"), hence there's no hard limit on the
memory allocation we do for flushing.
In practice, we a) [log a warning](https://github.com/neondatabase/neon/blob/23827c6b0d400cbb9a972d4d05d49834816c40d1/pageserver/src/tenant/timeline.rs#L5741-L5743) when we flush oversized layers, so we'd know which tenant is to blame and b) if we were to put a hard limit in place, we would have to decide what to do if there is an InMemoryLayer that exceeds the limit. It seems like a better option to guarantee a max size for frozen layer, dependent on `checkpoint_distance`. Then limit concurrency based on that. **metrics**: we do have the [flush_time_histo](https://github.com/neondatabase/neon/blob/23827c6b0d400cbb9a972d4d05d49834816c40d1/pageserver/src/tenant/timeline.rs#L3725-L3726), but that includes the wait time for the semaphore. We could add a separate metric for the time spent after acquiring the semaphore, so one can infer the wait time. Seems unnecessary at this point, though. --- pageserver/src/bin/pageserver.rs | 5 + pageserver/src/config.rs | 18 ++- pageserver/src/l0_flush.rs | 46 ++++++ pageserver/src/lib.rs | 1 + pageserver/src/tenant.rs | 13 ++ pageserver/src/tenant/block_io.rs | 22 +++ pageserver/src/tenant/ephemeral_file.rs | 8 +- .../src/tenant/ephemeral_file/page_caching.rs | 146 +++++++++++++----- .../ephemeral_file/zero_padded_read_write.rs | 15 ++ .../tenant/storage_layer/inmemory_layer.rs | 94 ++++++++--- pageserver/src/tenant/timeline.rs | 10 +- pageserver/src/tenant/timeline/delete.rs | 1 + 12 files changed, 322 insertions(+), 57 deletions(-) create mode 100644 pageserver/src/l0_flush.rs diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index ba5b2608bd..39d4e46c96 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -421,6 +421,10 @@ fn start_pageserver( background_jobs_can_start: background_jobs_barrier.clone(), }; + info!(config=?conf.l0_flush, "using l0_flush config"); + let l0_flush_global_state = + pageserver::l0_flush::L0FlushGlobalState::new(conf.l0_flush.clone()); + // Scan the local 'tenants/' directory and start loading the tenants let deletion_queue_client = deletion_queue.new_client(); let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr( @@ -429,6 +433,7 @@ fn start_pageserver( broker_client: broker_client.clone(), remote_storage: remote_storage.clone(), deletion_queue_client, + l0_flush_global_state, }, order, shutdown_pageserver.clone(), diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 470e941c33..fa7f7d8d97 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -30,11 +30,11 @@ use utils::{ logging::LogFormat, }; -use crate::tenant::timeline::GetVectoredImpl; use crate::tenant::vectored_blob_io::MaxVectoredReadBytes; use crate::tenant::{config::TenantConfOpt, timeline::GetImpl}; use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine}; +use crate::{l0_flush::L0FlushConfig, tenant::timeline::GetVectoredImpl}; use crate::{tenant::config::TenantConf, virtual_file}; use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX}; @@ -296,6 +296,8 @@ pub struct PageServerConf { /// /// Setting this to zero disables limits on total ephemeral layer size. 
pub ephemeral_bytes_per_memory_kb: usize, + + pub l0_flush: L0FlushConfig, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -403,6 +405,8 @@ struct PageServerConfigBuilder { image_compression: BuilderValue>, ephemeral_bytes_per_memory_kb: BuilderValue, + + l0_flush: BuilderValue, } impl PageServerConfigBuilder { @@ -492,6 +496,7 @@ impl PageServerConfigBuilder { image_compression: Set(DEFAULT_IMAGE_COMPRESSION), validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET), ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), + l0_flush: Set(L0FlushConfig::default()), } } } @@ -683,6 +688,10 @@ impl PageServerConfigBuilder { self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value); } + pub fn l0_flush(&mut self, value: L0FlushConfig) { + self.l0_flush = BuilderValue::Set(value); + } + pub fn build(self) -> anyhow::Result { let default = Self::default_values(); @@ -741,6 +750,7 @@ impl PageServerConfigBuilder { validate_vectored_get, image_compression, ephemeral_bytes_per_memory_kb, + l0_flush, } CUSTOM LOGIC { @@ -1023,6 +1033,9 @@ impl PageServerConf { "ephemeral_bytes_per_memory_kb" => { builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize) } + "l0_flush" => { + builder.l0_flush(utils::toml_edit_ext::deserialize_item(item).context("l0_flush")?) + } _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -1107,6 +1120,7 @@ impl PageServerConf { image_compression: defaults::DEFAULT_IMAGE_COMPRESSION, validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, + l0_flush: L0FlushConfig::default(), } } } @@ -1347,6 +1361,7 @@ background_task_maximum_delay = '334 s' validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, image_compression: defaults::DEFAULT_IMAGE_COMPRESSION, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, + l0_flush: L0FlushConfig::default(), }, "Correct defaults should be used when no config values are provided" ); @@ -1421,6 +1436,7 @@ background_task_maximum_delay = '334 s' validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, image_compression: defaults::DEFAULT_IMAGE_COMPRESSION, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, + l0_flush: L0FlushConfig::default(), }, "Should be able to parse all basic config values correctly" ); diff --git a/pageserver/src/l0_flush.rs b/pageserver/src/l0_flush.rs new file mode 100644 index 0000000000..7fe8fedc63 --- /dev/null +++ b/pageserver/src/l0_flush.rs @@ -0,0 +1,46 @@ +use std::{num::NonZeroUsize, sync::Arc}; + +use crate::tenant::ephemeral_file; + +#[derive(Default, Debug, PartialEq, Eq, Clone, serde::Deserialize)] +#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] +pub enum L0FlushConfig { + #[default] + PageCached, + #[serde(rename_all = "snake_case")] + Direct { max_concurrency: NonZeroUsize }, +} + +#[derive(Clone)] +pub struct L0FlushGlobalState(Arc); + +pub(crate) enum Inner { + PageCached, + Direct { semaphore: tokio::sync::Semaphore }, +} + +impl L0FlushGlobalState { + pub fn new(config: L0FlushConfig) -> Self { + match config { + L0FlushConfig::PageCached => Self(Arc::new(Inner::PageCached)), + L0FlushConfig::Direct { max_concurrency } => { + let semaphore = tokio::sync::Semaphore::new(max_concurrency.get()); + Self(Arc::new(Inner::Direct { semaphore })) + } + } + } + + pub(crate) fn inner(&self) -> &Arc { + &self.0 
+ } +} + +impl L0FlushConfig { + pub(crate) fn prewarm_on_write(&self) -> ephemeral_file::PrewarmPageCacheOnWrite { + use L0FlushConfig::*; + match self { + PageCached => ephemeral_file::PrewarmPageCacheOnWrite::Yes, + Direct { .. } => ephemeral_file::PrewarmPageCacheOnWrite::No, + } + } +} diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 353f97264c..ac6b9b4f2a 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -11,6 +11,7 @@ pub mod deletion_queue; pub mod disk_usage_eviction_task; pub mod http; pub mod import_datadir; +pub mod l0_flush; pub use pageserver_api::keyspace; pub mod aux_file; pub mod metrics; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 116481a1eb..89bf89471c 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -73,6 +73,7 @@ use crate::deletion_queue::DeletionQueueClient; use crate::deletion_queue::DeletionQueueError; use crate::import_datadir; use crate::is_uninit_mark; +use crate::l0_flush::L0FlushGlobalState; use crate::metrics::TENANT; use crate::metrics::{ remove_tenant_metrics, BROKEN_TENANTS_SET, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, @@ -166,6 +167,7 @@ pub struct TenantSharedResources { pub broker_client: storage_broker::BrokerClientChannel, pub remote_storage: GenericRemoteStorage, pub deletion_queue_client: DeletionQueueClient, + pub l0_flush_global_state: L0FlushGlobalState, } /// A [`Tenant`] is really an _attached_ tenant. The configuration @@ -294,6 +296,8 @@ pub struct Tenant { /// An ongoing timeline detach must be checked during attempts to GC or compact a timeline. ongoing_timeline_detach: std::sync::Mutex>, + + l0_flush_global_state: L0FlushGlobalState, } impl std::fmt::Debug for Tenant { @@ -676,6 +680,7 @@ impl Tenant { broker_client, remote_storage, deletion_queue_client, + l0_flush_global_state, } = resources; let attach_mode = attached_conf.location.attach_mode; @@ -690,6 +695,7 @@ impl Tenant { tenant_shard_id, remote_storage.clone(), deletion_queue_client, + l0_flush_global_state, )); // The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if @@ -989,6 +995,7 @@ impl Tenant { TimelineResources { remote_client, timeline_get_throttle: self.timeline_get_throttle.clone(), + l0_flush_global_state: self.l0_flush_global_state.clone(), }, ctx, ) @@ -2478,6 +2485,7 @@ impl Tenant { tenant_shard_id: TenantShardId, remote_storage: GenericRemoteStorage, deletion_queue_client: DeletionQueueClient, + l0_flush_global_state: L0FlushGlobalState, ) -> Tenant { debug_assert!( !attached_conf.location.generation.is_none() || conf.control_plane_api.is_none() @@ -2565,6 +2573,7 @@ impl Tenant { )), tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)), ongoing_timeline_detach: std::sync::Mutex::default(), + l0_flush_global_state, } } @@ -3302,6 +3311,7 @@ impl Tenant { TimelineResources { remote_client, timeline_get_throttle: self.timeline_get_throttle.clone(), + l0_flush_global_state: self.l0_flush_global_state.clone(), } } @@ -3638,6 +3648,7 @@ pub(crate) mod harness { use utils::logging; use crate::deletion_queue::mock::MockDeletionQueue; + use crate::l0_flush::L0FlushConfig; use crate::walredo::apply_neon; use crate::{repository::Key, walrecord::NeonWalRecord}; @@ -3827,6 +3838,8 @@ pub(crate) mod harness { self.tenant_shard_id, self.remote_storage.clone(), self.deletion_queue.new_client(), + // TODO: ideally we should run all unit tests with both configs + L0FlushGlobalState::new(L0FlushConfig::default()), )); let preload = tenant 
diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index b406d50332..85f3b1c799 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -37,6 +37,7 @@ where pub enum BlockLease<'a> { PageReadGuard(PageReadGuard<'static>), EphemeralFileMutableTail(&'a [u8; PAGE_SZ]), + Slice(&'a [u8; PAGE_SZ]), #[cfg(test)] Arc(std::sync::Arc<[u8; PAGE_SZ]>), #[cfg(test)] @@ -63,6 +64,7 @@ impl<'a> Deref for BlockLease<'a> { match self { BlockLease::PageReadGuard(v) => v.deref(), BlockLease::EphemeralFileMutableTail(v) => v, + BlockLease::Slice(v) => v, #[cfg(test)] BlockLease::Arc(v) => v.deref(), #[cfg(test)] @@ -81,6 +83,7 @@ pub(crate) enum BlockReaderRef<'a> { FileBlockReader(&'a FileBlockReader<'a>), EphemeralFile(&'a EphemeralFile), Adapter(Adapter<&'a DeltaLayerInner>), + Slice(&'a [u8]), #[cfg(test)] TestDisk(&'a super::disk_btree::tests::TestDisk), #[cfg(test)] @@ -99,6 +102,7 @@ impl<'a> BlockReaderRef<'a> { FileBlockReader(r) => r.read_blk(blknum, ctx).await, EphemeralFile(r) => r.read_blk(blknum, ctx).await, Adapter(r) => r.read_blk(blknum, ctx).await, + Slice(s) => Self::read_blk_slice(s, blknum), #[cfg(test)] TestDisk(r) => r.read_blk(blknum), #[cfg(test)] @@ -107,6 +111,24 @@ impl<'a> BlockReaderRef<'a> { } } +impl<'a> BlockReaderRef<'a> { + fn read_blk_slice(slice: &[u8], blknum: u32) -> std::io::Result { + let start = (blknum as usize).checked_mul(PAGE_SZ).unwrap(); + let end = start.checked_add(PAGE_SZ).unwrap(); + if end > slice.len() { + return Err(std::io::Error::new( + std::io::ErrorKind::UnexpectedEof, + format!("slice too short, len={} end={}", slice.len(), end), + )); + } + let slice = &slice[start..end]; + let page_sized: &[u8; PAGE_SZ] = slice + .try_into() + .expect("we add PAGE_SZ to start, so the slice must have PAGE_SZ"); + Ok(BlockLease::Slice(page_sized)) + } +} + /// /// A "cursor" for efficiently reading multiple pages from a BlockReader /// diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 79cc7bf153..bb65ae24fc 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -21,6 +21,7 @@ pub struct EphemeralFile { } mod page_caching; +pub(crate) use page_caching::PrewarmOnWrite as PrewarmPageCacheOnWrite; mod zero_padded_read_write; impl EphemeralFile { @@ -53,7 +54,7 @@ impl EphemeralFile { Ok(EphemeralFile { _tenant_shard_id: tenant_shard_id, _timeline_id: timeline_id, - rw: page_caching::RW::new(file), + rw: page_caching::RW::new(file, conf.l0_flush.prewarm_on_write()), }) } @@ -65,6 +66,11 @@ impl EphemeralFile { self.rw.page_cache_file_id() } + /// See [`self::page_caching::RW::load_to_vec`]. 
+ pub(crate) async fn load_to_vec(&self, ctx: &RequestContext) -> Result, io::Error> { + self.rw.load_to_vec(ctx).await + } + pub(crate) async fn read_blk( &self, blknum: u32, diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs index 276ac87064..43b9fff28d 100644 --- a/pageserver/src/tenant/ephemeral_file/page_caching.rs +++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs @@ -8,6 +8,7 @@ use crate::virtual_file::VirtualFile; use once_cell::sync::Lazy; use std::io::{self, ErrorKind}; +use std::ops::{Deref, Range}; use tokio_epoll_uring::BoundedBuf; use tracing::*; @@ -19,14 +20,23 @@ pub struct RW { rw: super::zero_padded_read_write::RW, } +/// When we flush a block to the underlying [`crate::virtual_file::VirtualFile`], +/// should we pre-warm the [`crate::page_cache`] with the contents? +#[derive(Clone, Copy)] +pub enum PrewarmOnWrite { + Yes, + No, +} + impl RW { - pub fn new(file: VirtualFile) -> Self { + pub fn new(file: VirtualFile, prewarm_on_write: PrewarmOnWrite) -> Self { let page_cache_file_id = page_cache::next_file_id(); Self { page_cache_file_id, rw: super::zero_padded_read_write::RW::new(PreWarmingWriter::new( page_cache_file_id, file, + prewarm_on_write, )), } } @@ -49,6 +59,43 @@ impl RW { self.rw.bytes_written() } + /// Load all blocks that can be read via [`Self::read_blk`] into a contiguous memory buffer. + /// + /// This includes the blocks that aren't yet flushed to disk by the internal buffered writer. + /// The last block is zero-padded to [`PAGE_SZ`], so, the returned buffer is always a multiple of [`PAGE_SZ`]. + pub(super) async fn load_to_vec(&self, ctx: &RequestContext) -> Result, io::Error> { + // round up to the next PAGE_SZ multiple, required by blob_io + let size = { + let s = usize::try_from(self.bytes_written()).unwrap(); + if s % PAGE_SZ == 0 { + s + } else { + s.checked_add(PAGE_SZ - (s % PAGE_SZ)).unwrap() + } + }; + let vec = Vec::with_capacity(size); + + // read from disk what we've already flushed + let writer = self.rw.as_writer(); + let flushed_range = writer.written_range(); + let mut vec = writer + .file + .read_exact_at( + vec.slice(0..(flushed_range.end - flushed_range.start)), + u64::try_from(flushed_range.start).unwrap(), + ctx, + ) + .await? + .into_inner(); + + // copy from in-memory buffer what we haven't flushed yet but would return when accessed via read_blk + let buffered = self.rw.get_tail_zero_padded(); + vec.extend_from_slice(buffered); + assert_eq!(vec.len(), size); + assert_eq!(vec.len() % PAGE_SZ, 0); + Ok(vec) + } + pub(crate) async fn read_blk( &self, blknum: u32, @@ -116,19 +163,40 @@ impl Drop for RW { } struct PreWarmingWriter { + prewarm_on_write: PrewarmOnWrite, nwritten_blocks: u32, page_cache_file_id: page_cache::FileId, file: VirtualFile, } impl PreWarmingWriter { - fn new(page_cache_file_id: page_cache::FileId, file: VirtualFile) -> Self { + fn new( + page_cache_file_id: page_cache::FileId, + file: VirtualFile, + prewarm_on_write: PrewarmOnWrite, + ) -> Self { Self { + prewarm_on_write, nwritten_blocks: 0, page_cache_file_id, file, } } + + /// Return the byte range within `file` that has been written though `write_all`. + /// + /// The returned range would be invalidated by another `write_all`. To prevent that, we capture `&_`. 
+ fn written_range(&self) -> (impl Deref> + '_) { + let nwritten_blocks = usize::try_from(self.nwritten_blocks).unwrap(); + struct Wrapper(Range); + impl Deref for Wrapper { + type Target = Range; + fn deref(&self) -> &Range { + &self.0 + } + } + Wrapper(0..nwritten_blocks * PAGE_SZ) + } } impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmingWriter { @@ -178,45 +246,51 @@ impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmi assert_eq!(&check_bounds_stuff_works, &*buf); } - // Pre-warm page cache with the contents. - // At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming - // benefits the code that writes InMemoryLayer=>L0 layers. let nblocks = buflen / PAGE_SZ; let nblocks32 = u32::try_from(nblocks).unwrap(); - let cache = page_cache::get(); - static CTX: Lazy = Lazy::new(|| { - RequestContext::new( - crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache, - crate::context::DownloadBehavior::Error, - ) - }); - for blknum_in_buffer in 0..nblocks { - let blk_in_buffer = &buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ]; - let blknum = self - .nwritten_blocks - .checked_add(blknum_in_buffer as u32) - .unwrap(); - match cache - .read_immutable_buf(self.page_cache_file_id, blknum, &CTX) - .await - { - Err(e) => { - error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}"); - // fail gracefully, it's not the end of the world if we can't pre-warm the cache here - } - Ok(v) => match v { - page_cache::ReadBufResult::Found(_guard) => { - // This function takes &mut self, so, it shouldn't be possible to reach this point. - unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \ + + if matches!(self.prewarm_on_write, PrewarmOnWrite::Yes) { + // Pre-warm page cache with the contents. + // At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming + // benefits the code that writes InMemoryLayer=>L0 layers. + + let cache = page_cache::get(); + static CTX: Lazy = Lazy::new(|| { + RequestContext::new( + crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache, + crate::context::DownloadBehavior::Error, + ) + }); + for blknum_in_buffer in 0..nblocks { + let blk_in_buffer = + &buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ]; + let blknum = self + .nwritten_blocks + .checked_add(blknum_in_buffer as u32) + .unwrap(); + match cache + .read_immutable_buf(self.page_cache_file_id, blknum, &CTX) + .await + { + Err(e) => { + error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}"); + // fail gracefully, it's not the end of the world if we can't pre-warm the cache here + } + Ok(v) => match v { + page_cache::ReadBufResult::Found(_guard) => { + // This function takes &mut self, so, it shouldn't be possible to reach this point. 
+ unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \ and this function takes &mut self, so, no concurrent read_blk is possible"); - } - page_cache::ReadBufResult::NotFound(mut write_guard) => { - write_guard.copy_from_slice(blk_in_buffer); - let _ = write_guard.mark_valid(); - } - }, + } + page_cache::ReadBufResult::NotFound(mut write_guard) => { + write_guard.copy_from_slice(blk_in_buffer); + let _ = write_guard.mark_valid(); + } + }, + } } } + self.nwritten_blocks = self.nwritten_blocks.checked_add(nblocks32).unwrap(); Ok((buflen, buf.into_inner())) } diff --git a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs index b37eafb52c..fe310acab8 100644 --- a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs +++ b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs @@ -75,6 +75,21 @@ where flushed_offset + u64::try_from(buffer.pending()).unwrap() } + /// Get a slice of all blocks that [`Self::read_blk`] would return as [`ReadResult::ServedFromZeroPaddedMutableTail`]. + pub fn get_tail_zero_padded(&self) -> &[u8] { + let buffer: &zero_padded::Buffer = self.buffered_writer.inspect_buffer(); + let buffer_written_up_to = buffer.pending(); + // pad to next page boundary + let read_up_to = if buffer_written_up_to % PAGE_SZ == 0 { + buffer_written_up_to + } else { + buffer_written_up_to + .checked_add(PAGE_SZ - (buffer_written_up_to % PAGE_SZ)) + .unwrap() + }; + &buffer.as_zero_padded_slice()[0..read_up_to] + } + pub(crate) async fn read_blk(&self, blknum: u32) -> Result, std::io::Error> { let flushed_offset = self.buffered_writer.as_inner().bytes_written(); let buffer: &zero_padded::Buffer = self.buffered_writer.inspect_buffer(); diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 6624fb7e6b..e1eaea90af 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -6,13 +6,14 @@ //! use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; +use crate::page_cache::PAGE_SZ; use crate::repository::{Key, Value}; -use crate::tenant::block_io::BlockReader; +use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef}; use crate::tenant::ephemeral_file::EphemeralFile; use crate::tenant::storage_layer::ValueReconstructResult; use crate::tenant::timeline::GetVectoredError; use crate::tenant::{PageReconstructError, Timeline}; -use crate::{page_cache, walrecord}; +use crate::{l0_flush, page_cache, walrecord}; use anyhow::{anyhow, ensure, Result}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::InMemoryLayerInfo; @@ -410,6 +411,7 @@ impl InMemoryLayer { continue; } + // TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183 let buf = reader.read_blob(block_read.block_offset, &ctx).await; if let Err(e) = buf { reconstruct_state @@ -620,6 +622,13 @@ impl InMemoryLayer { // rare though, so we just accept the potential latency hit for now. let inner = self.inner.read().await; + let l0_flush_global_state = timeline.l0_flush_global_state.inner().clone(); + use l0_flush::Inner; + let _concurrency_permit = match &*l0_flush_global_state { + Inner::PageCached => None, + Inner::Direct { semaphore, .. 
} => Some(semaphore.acquire().await), + }; + let end_lsn = *self.end_lsn.get().unwrap(); let key_count = if let Some(key_range) = key_range { @@ -645,28 +654,77 @@ impl InMemoryLayer { ) .await?; - let mut buf = Vec::new(); + match &*l0_flush_global_state { + l0_flush::Inner::PageCached => { + let ctx = RequestContextBuilder::extend(ctx) + .page_content_kind(PageContentKind::InMemoryLayer) + .build(); - let cursor = inner.file.block_cursor(); + let mut buf = Vec::new(); - let ctx = RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::InMemoryLayer) - .build(); - for (key, vec_map) in inner.index.iter() { - // Write all page versions - for (lsn, pos) in vec_map.as_slice() { - cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?; - let will_init = Value::des(&buf)?.will_init(); - let res; - (buf, res) = delta_layer_writer - .put_value_bytes(*key, *lsn, buf, will_init, &ctx) - .await; - res?; + let cursor = inner.file.block_cursor(); + + for (key, vec_map) in inner.index.iter() { + // Write all page versions + for (lsn, pos) in vec_map.as_slice() { + cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?; + let will_init = Value::des(&buf)?.will_init(); + let res; + (buf, res) = delta_layer_writer + .put_value_bytes(*key, *lsn, buf, will_init, &ctx) + .await; + res?; + } + } + } + l0_flush::Inner::Direct { .. } => { + let file_contents: Vec = inner.file.load_to_vec(ctx).await?; + assert_eq!( + file_contents.len() % PAGE_SZ, + 0, + "needed by BlockReaderRef::Slice" + ); + assert_eq!(file_contents.len(), { + let written = usize::try_from(inner.file.len()).unwrap(); + if written % PAGE_SZ == 0 { + written + } else { + written.checked_add(PAGE_SZ - (written % PAGE_SZ)).unwrap() + } + }); + + let cursor = BlockCursor::new(BlockReaderRef::Slice(&file_contents)); + + let mut buf = Vec::new(); + + for (key, vec_map) in inner.index.iter() { + // Write all page versions + for (lsn, pos) in vec_map.as_slice() { + // TODO: once we have blob lengths in the in-memory index, we can + // 1. get rid of the blob_io / BlockReaderRef::Slice business and + // 2. load the file contents into a Bytes and + // 3. the use `Bytes::slice` to get the `buf` that is our blob + // 4. pass that `buf` into `put_value_bytes` + // => https://github.com/neondatabase/neon/issues/8183 + cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?; + let will_init = Value::des(&buf)?.will_init(); + let res; + (buf, res) = delta_layer_writer + .put_value_bytes(*key, *lsn, buf, will_init, ctx) + .await; + res?; + } + } + + // Hold the permit until the IO is done; if we didn't, one could drop this future, + // thereby releasing the permit, but the Vec remains allocated until the IO completes. + // => we'd have more concurrenct Vec than allowed as per the semaphore. 
+ drop(_concurrency_permit); } } // MAX is used here because we identify L0 layers by full key range - let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, &ctx).await?; + let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, ctx).await?; Ok(Some(delta_layer)) } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index ec94ed3a56..de9361d721 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -65,7 +65,6 @@ use std::{ ops::{Deref, Range}, }; -use crate::metrics::GetKind; use crate::pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS; use crate::{ aux_file::AuxFileSizeEstimator, @@ -90,6 +89,10 @@ use crate::{ use crate::{ disk_usage_eviction_task::EvictionCandidate, tenant::storage_layer::delta_layer::DeltaEntry, }; +use crate::{ + l0_flush::{self, L0FlushGlobalState}, + metrics::GetKind, +}; use crate::{ metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize, }; @@ -208,6 +211,7 @@ pub struct TimelineResources { pub timeline_get_throttle: Arc< crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>, >, + pub l0_flush_global_state: l0_flush::L0FlushGlobalState, } pub(crate) struct AuxFilesState { @@ -433,6 +437,8 @@ pub struct Timeline { /// in the future, add `extra_test_sparse_keyspace` if necessary. #[cfg(test)] pub(crate) extra_test_dense_keyspace: ArcSwap, + + pub(crate) l0_flush_global_state: L0FlushGlobalState, } pub struct WalReceiverInfo { @@ -2392,6 +2398,8 @@ impl Timeline { #[cfg(test)] extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())), + + l0_flush_global_state: resources.l0_flush_global_state, }; result.repartition_threshold = result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE; diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 6d747d424d..b0088f4ea2 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -272,6 +272,7 @@ impl DeleteTimelineFlow { TimelineResources { remote_client, timeline_get_throttle: tenant.timeline_get_throttle.clone(), + l0_flush_global_state: tenant.l0_flush_global_state.clone(), }, // Important. We dont pass ancestor above because it can be missing. // Thus we need to skip the validation here. From 6216df776549ab79e45f50c7e1befcc9593960bb Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 2 Jul 2024 16:21:23 +0100 Subject: [PATCH 1098/1571] CI(benchmarking): move psql queries to actions/run-python-test-set (#8230) ## Problem Some of the Nightly benchmarks fail with the error ``` + /tmp/neon/pg_install/v14/bin/pgbench --version /tmp/neon/pg_install/v14/bin/pgbench: error while loading shared libraries: libpq.so.5: cannot open shared object file: No such file or directory ``` Originally, we added the `pgbench --version` call to check that `pgbench` is installed and to fail earlier if it's not. The failure happens because we don't have `LD_LIBRARY_PATH` set for every job, and it also affects `psql` command. We can move it to `actions/run-python-test-set` so as not to duplicate code (as it already have `LD_LIBRARY_PATH` set). 
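For illustration, the failure is just the dynamic linker not finding the bundled `libpq`; the workaround that individual jobs had been applying by hand (and that `actions/run-python-test-set` already performs) boils down to a couple of shell lines. This is only a sketch, reusing the `POSTGRES_DISTRIB_DIR` / `DEFAULT_PG_VERSION` variables the workflows already define:

```
# make the libpq bundled with the Postgres distribution visible to pgbench/psql
export LD_LIBRARY_PATH="${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/lib"
"${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench" --version   # now loads libpq.so.5
```

Centralizing this in the shared action avoids repeating the export in every benchmarking job.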
## Summary of changes - Remove `pgbench --version` call - Move `psql` commands to common `actions/run-python-test-set` --- .../actions/run-python-test-set/action.yml | 10 ++- .github/workflows/benchmarking.yml | 83 +------------------ 2 files changed, 12 insertions(+), 81 deletions(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index a2aae0772b..7f843de1a5 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -179,7 +179,15 @@ runs: # Wake up the cluster if we use remote neon instance if [ "${{ inputs.build_type }}" = "remote" ] && [ -n "${BENCHMARK_CONNSTR}" ]; then - ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();" + QUERIES=("SELECT version()") + if [[ "${PLATFORM}" = "neon"* ]]; then + QUERIES+=("SHOW neon.tenant_id") + QUERIES+=("SHOW neon.timeline_id") + fi + + for q in "${QUERIES[@]}"; do + ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/psql ${BENCHMARK_CONNSTR} -c "${q}" + done fi # Run the tests. diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 0e748adeb6..db04b5de7d 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -239,11 +239,6 @@ jobs: path: /tmp/neon/ prefix: latest - - name: Add Postgres binaries to PATH - run: | - ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version - echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH - - name: Create Neon Project if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier"]'), matrix.platform) id: create-neon-project @@ -282,16 +277,6 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERIES=("SELECT version()") - if [[ "${PLATFORM}" = "neon"* ]]; then - QUERIES+=("SHOW neon.tenant_id") - QUERIES+=("SHOW neon.timeline_id") - fi - - for q in "${QUERIES[@]}"; do - psql ${CONNSTR} -c "${q}" - done - - name: Benchmark init uses: ./.github/actions/run-python-test-set with: @@ -377,29 +362,12 @@ jobs: path: /tmp/neon/ prefix: latest - - name: Add Postgres binaries to PATH - run: | - LD_LIBRARY_PATH="${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/lib" - export LD_LIBRARY_PATH - echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> $GITHUB_ENV - - ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version - echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH - - name: Set up Connection String id: set-up-connstr run: | CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }} - - echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERIES=("SELECT version()") - QUERIES+=("SHOW neon.tenant_id") - QUERIES+=("SHOW neon.timeline_id") - - for q in "${QUERIES[@]}"; do - psql ${CONNSTR} -c "${q}" - done + echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - name: Benchmark pgvector hnsw indexing uses: ./.github/actions/run-python-test-set @@ -421,12 +389,12 @@ jobs: test_selection: performance/test_perf_pgvector_queries.py run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} - extra_params: -m remote_cluster --timeout 21600 + extra_params: -m remote_cluster --timeout 21600 env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" - + - name: Create Allure report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate @@ -481,11 
+449,6 @@ jobs: path: /tmp/neon/ prefix: latest - - name: Add Postgres binaries to PATH - run: | - ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version - echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH - - name: Set up Connection String id: set-up-connstr run: | @@ -507,16 +470,6 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERIES=("SELECT version()") - if [[ "${PLATFORM}" = "neon"* ]]; then - QUERIES+=("SHOW neon.tenant_id") - QUERIES+=("SHOW neon.timeline_id") - fi - - for q in "${QUERIES[@]}"; do - psql ${CONNSTR} -c "${q}" - done - - name: ClickBench benchmark uses: ./.github/actions/run-python-test-set with: @@ -584,11 +537,6 @@ jobs: path: /tmp/neon/ prefix: latest - - name: Add Postgres binaries to PATH - run: | - ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version - echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH - - name: Get Connstring Secret Name run: | case "${PLATFORM}" in @@ -617,16 +565,6 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERIES=("SELECT version()") - if [[ "${PLATFORM}" = "neon"* ]]; then - QUERIES+=("SHOW neon.tenant_id") - QUERIES+=("SHOW neon.timeline_id") - fi - - for q in "${QUERIES[@]}"; do - psql ${CONNSTR} -c "${q}" - done - - name: Run TPC-H benchmark uses: ./.github/actions/run-python-test-set with: @@ -685,11 +623,6 @@ jobs: path: /tmp/neon/ prefix: latest - - name: Add Postgres binaries to PATH - run: | - ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version - echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH - - name: Set up Connection String id: set-up-connstr run: | @@ -711,16 +644,6 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERIES=("SELECT version()") - if [[ "${PLATFORM}" = "neon"* ]]; then - QUERIES+=("SHOW neon.tenant_id") - QUERIES+=("SHOW neon.timeline_id") - fi - - for q in "${QUERIES[@]}"; do - psql ${CONNSTR} -c "${q}" - done - - name: Run user examples uses: ./.github/actions/run-python-test-set with: From f5832329ac57e4a08c4d3b26b61864c2f1552ddf Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 2 Jul 2024 17:17:22 +0100 Subject: [PATCH 1099/1571] tense of errors (#8234) I forgot a commit when merging https://github.com/neondatabase/neon/pull/8177 --- pageserver/src/tenant/mgr.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index c1da1d2c55..b0159e22bf 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -358,7 +358,7 @@ fn load_tenant_config( info!("Found temporary tenant directory, removing: {tenant_dir_path}"); // No need to use safe_remove_tenant_dir_all because this is already // a temporary path - std::fs::remove_dir_all(&tenant_dir_path).fatal_err("Deleting temporary tenant dir"); + std::fs::remove_dir_all(&tenant_dir_path).fatal_err("delete temporary tenant dir"); return None; } @@ -368,7 +368,7 @@ fn load_tenant_config( .fatal_err("Checking for empty tenant dir"); if is_empty { info!("removing empty tenant directory {tenant_dir_path:?}"); - std::fs::remove_dir(&tenant_dir_path).fatal_err("Deleting empty tenant dir"); + std::fs::remove_dir(&tenant_dir_path).fatal_err("delete empty tenant dir"); return None; } @@ -386,7 +386,7 @@ async fn init_load_tenant_configs( let tenants_dir = conf.tenants_path(); let dentries = tokio::task::spawn_blocking(move || -> Vec { - let context = format!("Reading tenants dir {tenants_dir}"); + let context = 
format!("read tenants dir {tenants_dir}"); let dir_entries = tenants_dir.read_dir_utf8().fatal_err(&context); dir_entries @@ -587,7 +587,7 @@ pub async fn init_tenant_mgr( // For those shards that have live configurations, construct `Tenant` or `SecondaryTenant` objects and start them running for (tenant_shard_id, location_conf, config_write_result) in config_write_results { // Writing a config to local disk is foundational to startup up tenants: panic if we can't. - config_write_result.fatal_err("writing tenant shard config file"); + config_write_result.fatal_err("write tenant shard config file"); let tenant_dir_path = conf.tenant_path(&tenant_shard_id); let shard_identity = location_conf.shard; @@ -953,7 +953,7 @@ impl TenantManager { Some(FastPathModified::Attached(tenant)) => { Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) .await - .fatal_err("writing tenant shard config"); + .fatal_err("write tenant shard config"); // Transition to AttachedStale means we may well hold a valid generation // still, and have been requested to go stale as part of a migration. If @@ -984,7 +984,7 @@ impl TenantManager { Some(FastPathModified::Secondary(_secondary_tenant)) => { Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) .await - .fatal_err("writing tenant shard config"); + .fatal_err("write tenant shard config"); return Ok(None); } @@ -1069,14 +1069,14 @@ impl TenantManager { // Does not need to be fsync'd because local storage is just a cache. tokio::fs::create_dir_all(&timelines_path) .await - .fatal_err("creating timelines/ dir"); + .fatal_err("create timelines/ dir"); // Before activating either secondary or attached mode, persist the // configuration, so that on restart we will re-attach (or re-start // secondary) on the tenant. Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) .await - .fatal_err("writing tenant shard config"); + .fatal_err("write tenant shard config"); let new_slot = match &new_location_config.mode { LocationMode::Secondary(secondary_config) => { From 891cb5a9a8eb90242997f4517a4d06cd635fd931 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Tue, 2 Jul 2024 12:54:32 -0400 Subject: [PATCH 1100/1571] fix(pageserver): comments about metadata key range (#8236) Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/key.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index cd430bfab7..0acd83753e 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -29,7 +29,7 @@ pub const KEY_SIZE: usize = 18; /// See [`Key::to_i128`] for more information on the encoding. pub const METADATA_KEY_SIZE: usize = 16; -/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x40 is a metadata key. +/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x60 is a metadata key. 
pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x60; pub const METADATA_KEY_END_PREFIX: u8 = 0x7F; From 4a0c2aebe03dc388aeefc4cbd62006ae3eb8fc60 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 2 Jul 2024 21:45:42 +0300 Subject: [PATCH 1101/1571] Add test for proper handling of connection failure to avoid 'cannot wait on socket event without a socket' error (#8231) ## Problem See https://github.com/neondatabase/cloud/issues/14289 and PR #8210 ## Summary of changes Add test for problems fixed in #8210 ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik --- pgxn/neon/libpagestore.c | 5 ---- .../regress/test_pageserver_reconnect.py | 24 +++++++++++++++++++ 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index a3fdcc537e..73a001b6ba 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -427,11 +427,6 @@ pageserver_connect(shardno_t shard_no, int elevel) values[n_pgsql_params] = NULL; shard->conn = PQconnectStartParams(keywords, values, 1); - if (!shard->conn) - { - neon_shard_log(shard_no, elevel, "Failed to connect to pageserver: out of memory"); - return false; - } if (PQstatus(shard->conn) == CONNECTION_BAD) { char *msg = pchomp(PQerrorMessage(shard->conn)); diff --git a/test_runner/regress/test_pageserver_reconnect.py b/test_runner/regress/test_pageserver_reconnect.py index aecfcdd262..37ff923632 100644 --- a/test_runner/regress/test_pageserver_reconnect.py +++ b/test_runner/regress/test_pageserver_reconnect.py @@ -2,6 +2,7 @@ import threading import time from contextlib import closing +import psycopg2.errors from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, PgBin @@ -40,3 +41,26 @@ def test_pageserver_reconnect(neon_simple_env: NeonEnv, pg_bin: PgBin): c.execute("select pg_reload_conf()") thread.join() + + +# Test handling errors during page server reconnect +def test_pageserver_reconnect_failure(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("test_pageserver_reconnect") + endpoint = env.endpoints.create_start("test_pageserver_reconnect") + + con = endpoint.connect() + cur = con.cursor() + + cur.execute("set statement_timeout='2s'") + cur.execute("SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'") + connstring = cur.fetchall()[0][0] + cur.execute( + f"alter system set neon.pageserver_connstring='{connstring}?some_invalid_param=xyz'" + ) + cur.execute("select pg_reload_conf()") + try: + cur.execute("select count(*) from pg_class") + except psycopg2.errors.QueryCanceled: + log.info("Connection to PS failed") + assert not endpoint.log_contains("ERROR: cannot wait on socket event without a socket.*") From 4273309962df6b8921c0f50de2d9dc4226a28636 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 3 Jul 2024 04:48:56 -0400 Subject: [PATCH 1102/1571] docker: add storage_scrubber into the docker image (#8239) ## Problem We will run this tool in the k8s cluster. To make it accessible from k8s, we need to package it into the docker image. 
part of https://github.com/neondatabase/cloud/issues/14024 --- Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Dockerfile b/Dockerfile index f0197758e4..a41598ef72 100644 --- a/Dockerfile +++ b/Dockerfile @@ -57,6 +57,7 @@ RUN set -e \ --bin storage_controller \ --bin proxy \ --bin neon_local \ + --bin storage_scrubber \ --locked --release \ && cachepot -s @@ -83,6 +84,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_controller /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_scrubber /usr/local/bin COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/ COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/ From dae55badf343627599f7dcb94086a981d54f082c Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 3 Jul 2024 13:22:53 +0300 Subject: [PATCH 1103/1571] Simplify test_wal_page_boundary_start test (#8214) All the code to ensure the WAL record lands at a page boundary was unnecessary for reproducing the original problem. In fact, it's a pretty basic test that checks that outbound replication (= neon as publisher) still works after restarting the endpoint. It just used to be very broken before commit 5ceccdc7de, which also added this test. To verify that: 1. Check out commit f3af5f4660 (because the next commit, 7dd58e1449, fixed the same bug in a different way, making it infeasible to revert the bug fix in an easy way) 2. Revert the bug fix from commit 5ceccdc7de with this: ``` diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 7debb6325..9f03bbd99 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -1437,8 +1437,10 @@ XLogWalPropWrite(WalProposer *wp, char *buf, Size nbytes, XLogRecPtr recptr) * * https://github.com/neondatabase/neon/issues/5749 */ +#if 0 if (!wp->config->syncSafekeepers) XLogUpdateWalBuffers(buf, recptr, nbytes); +#endif while (nbytes > 0) { ``` 3. Run the test_wal_page_boundary_start regression test. It fails, as expected 4. Apply this commit to the test, and run it again. It still fails, with the same error mentioned in issue #5749: ``` PG:2024-06-30 20:49:08.805 GMT [1248196] STATEMENT: START_REPLICATION SLOT "sub1" LOGICAL 0/0 (proto_version '4', origin 'any', publication_names '"pub1"') PG:2024-06-30 21:37:52.567 GMT [1467972] LOG: starting logical decoding for slot "sub1" PG:2024-06-30 21:37:52.567 GMT [1467972] DETAIL: Streaming transactions committing after 0/1532330, reading WAL from 0/1531C78. PG:2024-06-30 21:37:52.567 GMT [1467972] STATEMENT: START_REPLICATION SLOT "sub1" LOGICAL 0/0 (proto_version '4', origin 'any', publication_names '"pub1"') PG:2024-06-30 21:37:52.567 GMT [1467972] LOG: logical decoding found consistent point at 0/1531C78 PG:2024-06-30 21:37:52.567 GMT [1467972] DETAIL: There are no running transactions. 
PG:2024-06-30 21:37:52.567 GMT [1467972] STATEMENT: START_REPLICATION SLOT "sub1" LOGICAL 0/0 (proto_version '4', origin 'any', publication_names '"pub1"') PG:2024-06-30 21:37:52.568 GMT [1467972] ERROR: could not find record while sending logically-decoded data: invalid contrecord length 312 (expected 6) at 0/1533FD8 ``` --- .../regress/test_logical_replication.py | 60 +++---------------- 1 file changed, 9 insertions(+), 51 deletions(-) diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index ca3c81d6e5..41283e4d2c 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -4,7 +4,6 @@ from random import choice from string import ascii_lowercase import pytest -from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import ( AuxFileStore, @@ -13,7 +12,7 @@ from fixtures.neon_fixtures import ( logical_replication_sync, wait_for_last_flush_lsn, ) -from fixtures.utils import query_scalar, wait_until +from fixtures.utils import wait_until def random_string(n: int): @@ -326,12 +325,17 @@ FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of assert "could not receive data from WAL stream" not in logs -# Test compute start at LSN page of which starts with contrecord -# https://github.com/neondatabase/neon/issues/5749 +# Test replication of WAL record spanning page boundary (with contrecord) after +# compute restart and WAL write of the page. +# +# See https://github.com/neondatabase/neon/issues/5749 +# +# Most pages start with a contrecord, so we don't do anything special +# to ensure that. @pytest.mark.parametrize( "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation] ) -def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg): +def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env env.neon_cli.create_branch("init") @@ -356,52 +360,6 @@ def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg): logical_replication_sync(vanilla_pg, endpoint) vanilla_pg.stop() - with endpoint.cursor() as cur: - # measure how much space logical message takes. Sometimes first attempt - # creates huge message and then it stabilizes, have no idea why. - for _ in range(3): - lsn_before = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) - log.info(f"current_lsn={lsn_before}") - # Non-transactional logical message doesn't write WAL, only XLogInsert's - # it, so use transactional. Which is a bit problematic as transactional - # necessitates commit record. Alternatively we can do smth like - # select neon_xlogflush(pg_current_wal_insert_lsn()); - # but isn't much better + that particular call complains on 'xlog flush - # request 0/282C018 is not satisfied' as pg_current_wal_insert_lsn skips - # page headers. 
- payload = "blahblah" - cur.execute(f"select pg_logical_emit_message(true, 'pref', '{payload}')") - lsn_after_by_curr_wal_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) - lsn_diff = lsn_after_by_curr_wal_lsn - lsn_before - logical_message_base = lsn_after_by_curr_wal_lsn - lsn_before - len(payload) - log.info( - f"before {lsn_before}, after {lsn_after_by_curr_wal_lsn}, lsn diff is {lsn_diff}, base {logical_message_base}" - ) - - # and write logical message spanning exactly as we want - lsn_before = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) - log.info(f"current_lsn={lsn_before}") - curr_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) - offs = int(curr_lsn) % 8192 - till_page = 8192 - offs - payload_len = ( - till_page - logical_message_base - 8 - ) # not sure why 8 is here, it is deduced from experiments - log.info(f"current_lsn={curr_lsn}, offs {offs}, till_page {till_page}") - - # payload_len above would go exactly till the page boundary; but we want contrecord, so make it slightly longer - payload_len += 8 - - cur.execute(f"select pg_logical_emit_message(true, 'pref', 'f{'a' * payload_len}')") - supposedly_contrecord_end = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) - log.info(f"supposedly_page_boundary={supposedly_contrecord_end}") - # The calculations to hit the page boundary are very fuzzy, so just - # ignore test if we fail to reach it. - if not (int(supposedly_contrecord_end) % 8192 == 32): - pytest.skip("missed page boundary, bad luck") - - cur.execute("insert into replication_example values (2, 3)") - wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) endpoint.stop().start() From aae38763188203ad1937cead961e7809e679ccfd Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 3 Jul 2024 12:19:13 +0100 Subject: [PATCH 1104/1571] CI: update docker/* actions to latest versions (#7694) ## Problem GitHub Actions complain that we use actions that depend on deprecated Node 16: ``` Node.js 16 actions are deprecated. Please update the following actions to use Node.js 20: docker/setup-buildx-action@v2 ``` But also, the latest `docker/setup-buildx-action` fails with the following error: ``` /nvme/actions-runner/_work/_actions/docker/setup-buildx-action/v3/webpack:/docker-setup-buildx/node_modules/@actions/cache/lib/cache.js:175 throw new Error(`Path Validation Error: Path(s) specified in the action for caching do(es) not exist, hence no cache is being saved.`); ^ Error: Path Validation Error: Path(s) specified in the action for caching do(es) not exist, hence no cache is being saved. 
at Object.rejected (/nvme/actions-runner/_work/_actions/docker/setup-buildx-action/v3/webpack:/docker-setup-buildx/node_modules/@actions/cache/lib/cache.js:175:1) at Generator.next () at fulfilled (/nvme/actions-runner/_work/_actions/docker/setup-buildx-action/v3/webpack:/docker-setup-buildx/node_modules/@actions/cache/lib/cache.js:29:1) ``` We can work this around by setting `cache-binary: false` for `uses: docker/setup-buildx-action@v3` ## Summary of changes - Update `docker/setup-buildx-action` from `v2` to `v3`, set `cache-binary: false` - Update `docker/login-action` from `v2` to `v3` - Update `docker/build-push-action` from `v4`/`v5` to `v6` --- .github/workflows/build-build-tools-image.yml | 8 +++++--- .github/workflows/build_and_test.yml | 17 ++++++++++------- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index f1c39e7e4f..a69686bf2a 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -63,14 +63,16 @@ jobs: mkdir -p /tmp/.docker-custom echo DOCKER_CONFIG=/tmp/.docker-custom >> $GITHUB_ENV - - uses: docker/setup-buildx-action@v2 + - uses: docker/setup-buildx-action@v3 + with: + cache-binary: false - - uses: docker/login-action@v2 + - uses: docker/login-action@v3 with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - uses: docker/build-push-action@v4 + - uses: docker/build-push-action@v6 with: context: . provenance: false diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 24ad26205b..5ac8c6ec27 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -751,14 +751,16 @@ jobs: run: | mkdir -p .docker-custom echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV - - uses: docker/setup-buildx-action@v2 + - uses: docker/setup-buildx-action@v3 + with: + cache-binary: false - uses: docker/login-action@v3 with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - uses: docker/build-push-action@v5 + - uses: docker/build-push-action@v6 with: context: . build-args: | @@ -829,11 +831,12 @@ jobs: run: | mkdir -p .docker-custom echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV - - uses: docker/setup-buildx-action@v2 + - uses: docker/setup-buildx-action@v3 with: + cache-binary: false # Disable parallelism for docker buildkit. # As we already build everything with `make -j$(nproc)`, running it in additional level of parallelisam blows up the Runner. - config-inline: | + buildkitd-config-inline: | [worker.oci] max-parallelism = 1 @@ -849,7 +852,7 @@ jobs: password: ${{ secrets.AWS_SECRET_KEY_DEV }} - name: Build compute-node image - uses: docker/build-push-action@v5 + uses: docker/build-push-action@v6 with: context: . build-args: | @@ -868,7 +871,7 @@ jobs: - name: Build neon extensions test image if: matrix.version == 'v16' - uses: docker/build-push-action@v5 + uses: docker/build-push-action@v6 with: context: . build-args: | @@ -889,7 +892,7 @@ jobs: - name: Build compute-tools image # compute-tools are Postgres independent, so build it only once if: matrix.version == 'v16' - uses: docker/build-push-action@v5 + uses: docker/build-push-action@v6 with: target: compute-tools-image context: . 
From 97f7188a07a7992cb058d654a79c91acf4a1b975 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 3 Jul 2024 14:13:06 +0100 Subject: [PATCH 1105/1571] pageserver: don't try to flush if shutdown during attach (#8235) ## Problem test_location_conf_churn fails on log errors when it tries to shutdown a pageserver immediately after starting a tenant attach, like this: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8224/9761000525/index.html#/testresult/15fb6beca5c7327c ``` shutdown:shutdown{tenant_id=35f5c55eb34e7e5e12288c5d8ab8b909 shard_id=0000}:timeline_shutdown{timeline_id=30936747043353a98661735ad09cbbfe shutdown_mode=FreezeAndFlush}: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited\n') ``` This is happening because Tenant::shutdown fires its cancellation token early if the tenant is not fully attached by the time shutdown is called, so the flush loop is shutdown by the time we try and flush. ## Summary of changes - In the early-cancellation case, also set the shutdown mode to Hard to skip trying to do a flush that will fail. --- pageserver/src/tenant.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 89bf89471c..0c911939e8 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1816,9 +1816,15 @@ impl Tenant { // If we're still attaching, fire the cancellation token early to drop out: this // will prevent us flushing, but ensures timely shutdown if some I/O during attach // is very slow. - if matches!(self.current_state(), TenantState::Attaching) { + let shutdown_mode = if matches!(self.current_state(), TenantState::Attaching) { self.cancel.cancel(); - } + + // Having fired our cancellation token, do not try and flush timelines: their cancellation tokens + // are children of ours, so their flush loops will have shut down already + timeline::ShutdownMode::Hard + } else { + shutdown_mode + }; match self.set_stopping(shutdown_progress, false, false).await { Ok(()) => {} From e0891ec8c8d07d1e2f91413a56c961103d4ef245 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 3 Jul 2024 18:02:10 +0200 Subject: [PATCH 1106/1571] Only support compressed reads if the compression setting is present (#8238) PR #8106 was created with the assumption that no blob is larger than `256 MiB`. Due to #7852 we have checking for *writes* of blobs larger than that limit, but we didn't have checking for *reads* of such large blobs: in theory, we could be reading these blobs every day but we just don't happen to write the blobs for some reason. Therefore, we now add a warning for *reads* of such large blobs as well. To make deploying compression less dangerous, we therefore only assume a blob is compressed if the compression setting is present in the config. This also means that we can't back out of compression once we enabled it. 
Part of https://github.com/neondatabase/neon/issues/5431 --- pageserver/src/tenant/blob_io.rs | 45 +++++++++++-------- pageserver/src/tenant/block_io.rs | 31 +++++++++++-- .../src/tenant/storage_layer/image_layer.rs | 28 ++++++++---- pageserver/src/tenant/storage_layer/layer.rs | 1 + 4 files changed, 75 insertions(+), 30 deletions(-) diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 022801b17f..de74066b81 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -19,6 +19,7 @@ use bytes::{BufMut, BytesMut}; use pageserver_api::models::ImageCompressionAlgorithm; use tokio::io::AsyncWriteExt; use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; +use tracing::warn; use crate::context::RequestContext; use crate::page_cache::PAGE_SZ; @@ -72,14 +73,22 @@ impl<'a> BlockCursor<'a> { len_buf.copy_from_slice(&buf[off..off + 4]); off += 4; } - len_buf[0] &= !LEN_COMPRESSION_BIT_MASK; + let bit_mask = if self.read_compressed { + !LEN_COMPRESSION_BIT_MASK + } else { + 0x7f + }; + len_buf[0] &= bit_mask; u32::from_be_bytes(len_buf) as usize }; let compression_bits = first_len_byte & LEN_COMPRESSION_BIT_MASK; let mut tmp_buf = Vec::new(); let buf_to_write; - let compression = if compression_bits <= BYTE_UNCOMPRESSED { + let compression = if compression_bits <= BYTE_UNCOMPRESSED || !self.read_compressed { + if compression_bits > BYTE_UNCOMPRESSED { + warn!("reading key above future limit ({len} bytes)"); + } buf_to_write = dstbuf; None } else if compression_bits == BYTE_ZSTD { @@ -384,10 +393,10 @@ mod tests { use rand::{Rng, SeedableRng}; async fn round_trip_test(blobs: &[Vec]) -> Result<(), Error> { - round_trip_test_compressed::(blobs).await + round_trip_test_compressed::(blobs).await } - async fn round_trip_test_compressed( + async fn round_trip_test_compressed( blobs: &[Vec], ) -> Result<(), Error> { let temp_dir = camino_tempfile::tempdir()?; @@ -400,17 +409,15 @@ mod tests { let file = VirtualFile::create(pathbuf.as_path(), &ctx).await?; let mut wtr = BlobWriter::::new(file, 0); for blob in blobs.iter() { - let (_, res) = match COMPRESSION { - 0 => wtr.write_blob(blob.clone(), &ctx).await, - 1 => { - wtr.write_blob_maybe_compressed( - blob.clone(), - &ctx, - Some(ImageCompressionAlgorithm::Zstd { level: Some(1) }), - ) - .await - } - _ => unreachable!("Invalid compression {COMPRESSION}"), + let (_, res) = if COMPRESSION { + wtr.write_blob_maybe_compressed( + blob.clone(), + &ctx, + Some(ImageCompressionAlgorithm::Zstd { level: Some(1) }), + ) + .await + } else { + wtr.write_blob(blob.clone(), &ctx).await }; let offs = res?; offsets.push(offs); @@ -425,7 +432,7 @@ mod tests { let file = VirtualFile::open(pathbuf.as_path(), &ctx).await?; let rdr = BlockReaderRef::VirtualFile(&file); - let rdr = BlockCursor::new(rdr); + let rdr = BlockCursor::new_with_compression(rdr, COMPRESSION); for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() { let blob_read = rdr.read_blob(*offset, &ctx).await?; assert_eq!( @@ -459,6 +466,8 @@ mod tests { ]; round_trip_test::(blobs).await?; round_trip_test::(blobs).await?; + round_trip_test_compressed::(blobs).await?; + round_trip_test_compressed::(blobs).await?; Ok(()) } @@ -474,8 +483,8 @@ mod tests { ]; round_trip_test::(blobs).await?; round_trip_test::(blobs).await?; - round_trip_test_compressed::(blobs).await?; - round_trip_test_compressed::(blobs).await?; + round_trip_test_compressed::(blobs).await?; + round_trip_test_compressed::(blobs).await?; Ok(()) } diff --git 
a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index 85f3b1c799..3324e840ec 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -149,16 +149,24 @@ impl<'a> BlockReaderRef<'a> { /// ``` /// pub struct BlockCursor<'a> { + pub(super) read_compressed: bool, reader: BlockReaderRef<'a>, } impl<'a> BlockCursor<'a> { pub(crate) fn new(reader: BlockReaderRef<'a>) -> Self { - BlockCursor { reader } + Self::new_with_compression(reader, false) + } + pub(crate) fn new_with_compression(reader: BlockReaderRef<'a>, read_compressed: bool) -> Self { + BlockCursor { + read_compressed, + reader, + } } // Needed by cli pub fn new_fileblockreader(reader: &'a FileBlockReader) -> Self { BlockCursor { + read_compressed: false, reader: BlockReaderRef::FileBlockReader(reader), } } @@ -188,11 +196,25 @@ pub struct FileBlockReader<'a> { /// Unique ID of this file, used as key in the page cache. file_id: page_cache::FileId, + + compressed_reads: bool, } impl<'a> FileBlockReader<'a> { pub fn new(file: &'a VirtualFile, file_id: FileId) -> Self { - FileBlockReader { file_id, file } + Self::new_with_compression(file, file_id, false) + } + + pub fn new_with_compression( + file: &'a VirtualFile, + file_id: FileId, + compressed_reads: bool, + ) -> Self { + FileBlockReader { + file_id, + file, + compressed_reads, + } } /// Read a page from the underlying file into given buffer. @@ -239,7 +261,10 @@ impl<'a> FileBlockReader<'a> { impl BlockReader for FileBlockReader<'_> { fn block_cursor(&self) -> BlockCursor<'_> { - BlockCursor::new(BlockReaderRef::FileBlockReader(self)) + BlockCursor::new_with_compression( + BlockReaderRef::FileBlockReader(self), + self.compressed_reads, + ) } } diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 50aacbd9ad..4a1b3a0237 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -165,6 +165,7 @@ pub struct ImageLayerInner { file_id: FileId, max_vectored_read_bytes: Option, + compressed_reads: bool, } impl std::fmt::Debug for ImageLayerInner { @@ -178,7 +179,8 @@ impl std::fmt::Debug for ImageLayerInner { impl ImageLayerInner { pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> { - let block_reader = FileBlockReader::new(&self.file, self.file_id); + let block_reader = + FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads); let tree_reader = DiskBtreeReader::<_, KEY_SIZE>::new( self.index_start_blk, self.index_root_blk, @@ -266,9 +268,10 @@ impl ImageLayer { async fn load_inner(&self, ctx: &RequestContext) -> Result { let path = self.path(); - let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, ctx) - .await - .and_then(|res| res)?; + let loaded = + ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, false, ctx) + .await + .and_then(|res| res)?; // not production code let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap(); @@ -377,6 +380,7 @@ impl ImageLayerInner { lsn: Lsn, summary: Option

, max_vectored_read_bytes: Option, + support_compressed_reads: bool, ctx: &RequestContext, ) -> Result, anyhow::Error> { let file = match VirtualFile::open(path, ctx).await { @@ -420,6 +424,7 @@ impl ImageLayerInner { file, file_id, max_vectored_read_bytes, + compressed_reads: support_compressed_reads, key_range: actual_summary.key_range, })) } @@ -430,7 +435,8 @@ impl ImageLayerInner { reconstruct_state: &mut ValueReconstructState, ctx: &RequestContext, ) -> anyhow::Result { - let block_reader = FileBlockReader::new(&self.file, self.file_id); + let block_reader = + FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads); let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader); @@ -490,12 +496,14 @@ impl ImageLayerInner { &self, ctx: &RequestContext, ) -> anyhow::Result> { - let block_reader = FileBlockReader::new(&self.file, self.file_id); + let block_reader = + FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads); let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader); let mut result = Vec::new(); let mut stream = Box::pin(tree_reader.into_stream(&[0; KEY_SIZE], ctx)); - let block_reader = FileBlockReader::new(&self.file, self.file_id); + let block_reader = + FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads); let cursor = block_reader.block_cursor(); while let Some(item) = stream.next().await { // TODO: dedup code with get_reconstruct_value @@ -530,7 +538,8 @@ impl ImageLayerInner { .into(), ); - let block_reader = FileBlockReader::new(&self.file, self.file_id); + let block_reader = + FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads); let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader); @@ -691,7 +700,8 @@ impl ImageLayerInner { #[cfg(test)] pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> ImageLayerIterator<'a> { - let block_reader = FileBlockReader::new(&self.file, self.file_id); + let block_reader = + FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads); let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader); ImageLayerIterator { diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 02069c29d2..d1f5cc8f43 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1685,6 +1685,7 @@ impl DownloadedLayer { lsn, summary, Some(owner.conf.max_vectored_read_bytes), + owner.conf.image_compression.is_some(), ctx, ) .await From 392a58bdce6ffda454fe6e78f6158f817d6effc3 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Wed, 3 Jul 2024 18:22:33 +0200 Subject: [PATCH 1107/1571] add pagebench test cases for periodic pagebench on dedicated hardware (#8233) we want to run some specific pagebench test cases on dedicated hardware to get reproducible results run1: 1 client per tenant => characterize throughput with n tenants. - 500 tenants - scale 13 (200 MB database) - 1 hour duration - ca 380 GB layer snapshot files run2.singleclient: 1 client per tenant => characterize latencies run2.manyclient: N clients per tenant => characterize throughput scalability within one tenant. 
- 1 tenant with 1 client for latencies - 1 tenant with 64 clients because typically for a high number of connections we recommend the connection pooler which by default uses 64 connections (for scalability) - scale 136 (2048 MB database) - 20 minutes each --- .github/workflows/periodic_pagebench.yml | 144 ++++++++++++++++++ ...er_max_throughput_getpage_at_latest_lsn.py | 86 ++++++++--- test_runner/performance/pageserver/util.py | 2 +- 3 files changed, 212 insertions(+), 20 deletions(-) create mode 100644 .github/workflows/periodic_pagebench.yml diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml new file mode 100644 index 0000000000..c0219599a2 --- /dev/null +++ b/.github/workflows/periodic_pagebench.yml @@ -0,0 +1,144 @@ +name: Periodic pagebench performance test on dedicated EC2 machine in eu-central-1 region + +on: + schedule: + # * is a special character in YAML so you have to quote this string + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + - cron: '0 18 * * *' # Runs at 6 PM UTC every day + workflow_dispatch: # Allows manual triggering of the workflow + inputs: + commit_hash: + type: string + description: 'The long neon repo commit hash for the system under test (pageserver) to be tested.' + required: false + default: '' + +defaults: + run: + shell: bash -euo pipefail {0} + +concurrency: + group: ${{ github.workflow }} + cancel-in-progress: false + +jobs: + trigger_bench_on_ec2_machine_in_eu_central_1: + runs-on: [ self-hosted, gen3, small ] + container: + image: neondatabase/build-tools:pinned + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init + timeout-minutes: 360 # Set the timeout to 6 hours + env: + API_KEY: ${{ secrets.PERIODIC_PAGEBENCH_EC2_RUNNER_API_KEY }} + RUN_ID: ${{ github.run_id }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY : ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_SECRET }} + AWS_DEFAULT_REGION : "eu-central-1" + AWS_INSTANCE_ID : "i-02a59a3bf86bc7e74" + steps: + - name: Show my own (github runner) external IP address - usefull for IP allowlisting + run: curl https://ifconfig.me + + - name: Start EC2 instance and wait for the instance to boot up + run: | + aws ec2 start-instances --instance-ids $AWS_INSTANCE_ID + aws ec2 wait instance-running --instance-ids $AWS_INSTANCE_ID + sleep 60 # sleep some time to allow cloudinit and our API server to start up + + - name: Determine public IP of the EC2 instance and set env variable EC2_MACHINE_URL_US + run: | + public_ip=$(aws ec2 describe-instances --instance-ids $AWS_INSTANCE_ID --query 'Reservations[*].Instances[*].PublicIpAddress' --output text) + echo "Public IP of the EC2 instance: $public_ip" + echo "EC2_MACHINE_URL_US=https://${public_ip}:8443" >> $GITHUB_ENV + + - name: Determine commit hash + env: + INPUT_COMMIT_HASH: ${{ github.event.inputs.commit_hash }} + run: | + if [ -z "$INPUT_COMMIT_HASH" ]; then + echo "COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')" >> $GITHUB_ENV + else + echo "COMMIT_HASH=$INPUT_COMMIT_HASH" >> $GITHUB_ENV + fi + + - name: Start Bench with run_id + run: | + curl -k -X 'POST' \ + "${EC2_MACHINE_URL_US}/start_test/${GITHUB_RUN_ID}" \ + -H 'accept: application/json' \ + -H 
'Content-Type: application/json' \ + -H "Authorization: Bearer $API_KEY" \ + -d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\"}" + + - name: Poll Test Status + id: poll_step + run: | + status="" + while [[ "$status" != "failure" && "$status" != "success" ]]; do + response=$(curl -k -X 'GET' \ + "${EC2_MACHINE_URL_US}/test_status/${GITHUB_RUN_ID}" \ + -H 'accept: application/json' \ + -H "Authorization: Bearer $API_KEY") + echo "Response: $response" + set +x + status=$(echo $response | jq -r '.status') + echo "Test status: $status" + if [[ "$status" == "failure" || "$status" == "success" || "$status" == "null" ]]; then + break + fi + if [[ "$status" == "too_many_runs" ]]; then + echo "Too many runs already running" + echo "too_many_runs=true" >> "$GITHUB_OUTPUT" + exit 1 + fi + + sleep 60 # Poll every 60 seconds + done + + - name: Retrieve Test Logs + run: | + curl -k -X 'GET' \ + "${EC2_MACHINE_URL_US}/test_log/${GITHUB_RUN_ID}" \ + -H 'accept: application/gzip' \ + -H "Authorization: Bearer $API_KEY" \ + --output "test_log_${GITHUB_RUN_ID}.gz" + + - name: Unzip Test Log and Print it into this job's log + run: | + gzip -d "test_log_${GITHUB_RUN_ID}.gz" + cat "test_log_${GITHUB_RUN_ID}" + + - name: Create Allure report + if: ${{ !cancelled() }} + uses: ./.github/actions/allure-report-generate + + - name: Post to a Slack channel + if: ${{ github.event.schedule && failure() }} + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C033QLM5P7D" # dev-staging-stream + slack-message: "Periodic pagebench testing on dedicated hardware: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + + - name: Cleanup Test Resources + if: always() + run: | + curl -k -X 'POST' \ + "${EC2_MACHINE_URL_US}/cleanup_test/${GITHUB_RUN_ID}" \ + -H 'accept: application/json' \ + -H "Authorization: Bearer $API_KEY" \ + -d '' + + - name: Stop EC2 instance and wait for the instance to be stopped + if: always() && steps.poll_step.outputs.too_many_runs != 'true' + run: | + aws ec2 stop-instances --instance-ids $AWS_INSTANCE_ID + aws ec2 wait instance-stopped --instance-ids $AWS_INSTANCE_ID diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py index 1d579214b0..a8f48fe675 100644 --- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py +++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py @@ -1,4 +1,5 @@ import json +import os from pathlib import Path from typing import Any, Dict, Tuple @@ -17,30 +18,74 @@ from performance.pageserver.util import ( setup_pageserver_with_tenants, ) +# The following tests use pagebench "getpage at latest LSN" to characterize the throughput of the pageserver. +# originally there was a single test named `test_pageserver_max_throughput_getpage_at_latest_lsn`` +# so you still see some references to this name in the code. +# To avoid recreating the snapshots for each test, we continue to use the name `max_throughput_latest_lsn` +# for some files and metrics. 
+ # For reference, the space usage of the snapshots: -# admin@ip-172-31-13-23:[~/neon-main]: sudo du -hs /instance_store/test_output/shared-snapshots -# 137G /instance_store/test_output/shared-snapshots -# admin@ip-172-31-13-23:[~/neon-main]: sudo du -hs /instance_store/test_output/shared-snapshots/* -# 1.8G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-1-13 -# 1.1G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-1-6 -# 8.5G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-10-13 -# 5.1G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-10-6 -# 76G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-100-13 -# 46G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-100-6 -@pytest.mark.parametrize("duration", [30]) -@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(s) for s in [100, 200]]) -@pytest.mark.parametrize("n_tenants", [1, 10]) -@pytest.mark.timeout( - 10000 -) # TODO: this value is just "a really high number"; have this per instance type -def test_pageserver_max_throughput_getpage_at_latest_lsn( +# sudo du -hs /instance_store/neon/test_output/shared-snapshots/* +# 416G /instance_store/neon/test_output/shared-snapshots/max_throughput_latest_lsn-500-13 +@pytest.mark.parametrize("duration", [60 * 60]) +@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(200)]) +@pytest.mark.parametrize("n_tenants", [500]) +@pytest.mark.timeout(10000) +@pytest.mark.skipif( + os.getenv("CI", "false") == "true", + reason="This test needs lot of resources and should run on dedicated HW, not in github action runners as part of CI", +) +def test_pageserver_characterize_throughput_with_n_tenants( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, pg_bin: PgBin, n_tenants: int, pgbench_scale: int, duration: int, +): + setup_and_run_pagebench_benchmark( + neon_env_builder, zenbenchmark, pg_bin, n_tenants, pgbench_scale, duration, 1 + ) + + +# For reference, the space usage of the snapshots: +# sudo du -hs /instance_store/neon/test_output/shared-snapshots/* +# 19G /instance_store/neon/test_output/shared-snapshots/max_throughput_latest_lsn-1-136 +@pytest.mark.parametrize("duration", [20 * 60]) +@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(2048)]) +# we use 1 client to characterize latencies, and 64 clients to characterize throughput/scalability +# we use 64 clients because typically for a high number of connections we recommend the connection pooler +# which by default uses 64 connections +@pytest.mark.parametrize("n_clients", [1, 64]) +@pytest.mark.parametrize("n_tenants", [1]) +@pytest.mark.timeout(2400) +@pytest.mark.skipif( + os.getenv("CI", "false") == "true", + reason="This test needs lot of resources and should run on dedicated HW, not in github action runners as part of CI", +) +def test_pageserver_characterize_latencies_with_1_client_and_throughput_with_many_clients_one_tenant( + neon_env_builder: NeonEnvBuilder, + zenbenchmark: NeonBenchmarker, + pg_bin: PgBin, + n_tenants: int, + pgbench_scale: int, + duration: int, + n_clients: int, +): + setup_and_run_pagebench_benchmark( + neon_env_builder, zenbenchmark, pg_bin, n_tenants, pgbench_scale, duration, n_clients + ) + + +def setup_and_run_pagebench_benchmark( + neon_env_builder: NeonEnvBuilder, + zenbenchmark: NeonBenchmarker, + pg_bin: PgBin, + n_tenants: int, + pgbench_scale: int, + duration: int, + n_clients: int, ): def record(metric, **kwargs): zenbenchmark.record( @@ -55,6 
+100,7 @@ def test_pageserver_max_throughput_getpage_at_latest_lsn( "n_tenants": (n_tenants, {"unit": ""}), "pgbench_scale": (pgbench_scale, {"unit": ""}), "duration": (duration, {"unit": "s"}), + "n_clients": (n_clients, {"unit": ""}), } ) @@ -96,7 +142,7 @@ def test_pageserver_max_throughput_getpage_at_latest_lsn( r".*query handler for.*pagestream.*failed: unexpected message: CopyFail during COPY.*" ) - run_benchmark_max_throughput_latest_lsn(env, pg_bin, record, duration) + run_pagebench_benchmark(env, pg_bin, record, duration, n_clients) def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int): @@ -157,8 +203,8 @@ def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int): return (template_tenant, template_timeline, config) -def run_benchmark_max_throughput_latest_lsn( - env: NeonEnv, pg_bin: PgBin, record, duration_secs: int +def run_pagebench_benchmark( + env: NeonEnv, pg_bin: PgBin, record, duration_secs: int, n_clients: int ): """ Benchmark `env.pageserver` for max throughput @ latest LSN and record results in `zenbenchmark`. @@ -172,6 +218,8 @@ def run_benchmark_max_throughput_latest_lsn( ps_http.base_url, "--page-service-connstring", env.pageserver.connstr(password=None), + "--num-clients", + str(n_clients), "--runtime", f"{duration_secs}s", # don't specify the targets explicitly, let pagebench auto-discover them diff --git a/test_runner/performance/pageserver/util.py b/test_runner/performance/pageserver/util.py index 92e05663ce..88296a7fbd 100644 --- a/test_runner/performance/pageserver/util.py +++ b/test_runner/performance/pageserver/util.py @@ -22,7 +22,7 @@ def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int): log.info("wait for all tenants to become active") wait_until_all_tenants_state( - ps_http, "Active", iterations=n_tenants, period=1, http_error_ok=False + ps_http, "Active", iterations=10 + n_tenants, period=1, http_error_ok=False ) # ensure all layers are resident for predictiable performance From ea0b22a9b0e5dd03605a285ce6560926299628d8 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 3 Jul 2024 17:27:34 +0100 Subject: [PATCH 1108/1571] pageserver: reduce ops tracked at per-timeline detail (#8245) ## Problem We record detailed histograms for all page_service op types, which mostly aren't very interesting, but make our prometheus scrapes huge. Closes: #8223 ## Summary of changes - Only track GetPageAtLsn histograms on a per-timeline granularity. For all other operation types, rely on existing node-wide histograms. 
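The shape of the change is roughly the following. This is a simplified, self-contained sketch with stand-in types, not the actual prometheus-based code in `pageserver/src/metrics.rs`, which also covers more operation kinds.

```rust
// Stand-in histogram type for illustration only.
#[derive(Default)]
struct Histo {
    count: u64,
    sum_secs: f64,
}

impl Histo {
    fn observe(&mut self, secs: f64) {
        self.count += 1;
        self.sum_secs += secs;
    }
}

#[allow(dead_code)]
#[derive(Clone, Copy, PartialEq, Eq)]
enum SmgrOp {
    GetRelExists,
    GetRelSize,
    GetPageAtLsn,
    GetDbSize,
}
const N_OPS: usize = 4;

struct SmgrTimers {
    /// One node-wide series per operation kind: bounded cardinality.
    global: [Histo; N_OPS],
    /// Only the hot GetPageAtLsn path keeps a per-tenant/timeline series.
    per_timeline_getpage: Histo,
}

impl SmgrTimers {
    fn new() -> Self {
        SmgrTimers {
            global: std::array::from_fn(|_| Histo::default()),
            per_timeline_getpage: Histo::default(),
        }
    }

    fn observe(&mut self, op: SmgrOp, secs: f64) {
        self.global[op as usize].observe(secs);
        if op == SmgrOp::GetPageAtLsn {
            self.per_timeline_getpage.observe(secs);
        }
    }
}

fn main() {
    let mut timers = SmgrTimers::new();
    timers.observe(SmgrOp::GetPageAtLsn, 0.002); // recorded globally and per-timeline
    timers.observe(SmgrOp::GetRelSize, 0.001); // recorded globally only
    assert_eq!(timers.per_timeline_getpage.count, 1);
    assert_eq!(timers.global[SmgrOp::GetRelSize as usize].count, 1);
}
```

The point is cardinality: per-tenant/timeline label sets now exist only for the single GetPageAtLsn series instead of one per operation kind.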
--- pageserver/src/metrics.rs | 107 ++++++++++++++++++++------------------ 1 file changed, 55 insertions(+), 52 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 9cd7ffa042..a21d8780cf 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -8,7 +8,7 @@ use metrics::{ }; use once_cell::sync::Lazy; use pageserver_api::shard::TenantShardId; -use strum::{EnumCount, IntoEnumIterator, VariantNames}; +use strum::{EnumCount, VariantNames}; use strum_macros::{EnumVariantNames, IntoStaticStr}; use tracing::warn; use utils::id::TimelineId; @@ -1076,21 +1076,12 @@ pub(crate) mod virtual_file_io_engine { }); } -#[derive(Debug)] -struct GlobalAndPerTimelineHistogram { - global: Histogram, - per_tenant_timeline: Histogram, -} - -impl GlobalAndPerTimelineHistogram { - fn observe(&self, value: f64) { - self.global.observe(value); - self.per_tenant_timeline.observe(value); - } -} - struct GlobalAndPerTimelineHistogramTimer<'a, 'c> { - h: &'a GlobalAndPerTimelineHistogram, + global_metric: &'a Histogram, + + // Optional because not all op types are tracked per-timeline + timeline_metric: Option<&'a Histogram>, + ctx: &'c RequestContext, start: std::time::Instant, op: SmgrQueryType, @@ -1121,7 +1112,10 @@ impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> { elapsed } }; - self.h.observe(ex_throttled.as_secs_f64()); + self.global_metric.observe(ex_throttled.as_secs_f64()); + if let Some(timeline_metric) = self.timeline_metric { + timeline_metric.observe(ex_throttled.as_secs_f64()); + } } } @@ -1146,7 +1140,8 @@ pub enum SmgrQueryType { #[derive(Debug)] pub(crate) struct SmgrQueryTimePerTimeline { - metrics: [GlobalAndPerTimelineHistogram; SmgrQueryType::COUNT], + global_metrics: [Histogram; SmgrQueryType::COUNT], + per_timeline_getpage: Histogram, } static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy = Lazy::new(|| { @@ -1224,27 +1219,32 @@ impl SmgrQueryTimePerTimeline { let tenant_id = tenant_shard_id.tenant_id.to_string(); let shard_slug = format!("{}", tenant_shard_id.shard_slug()); let timeline_id = timeline_id.to_string(); - let metrics = std::array::from_fn(|i| { + let global_metrics = std::array::from_fn(|i| { let op = SmgrQueryType::from_repr(i).unwrap(); - let global = SMGR_QUERY_TIME_GLOBAL + SMGR_QUERY_TIME_GLOBAL .get_metric_with_label_values(&[op.into()]) - .unwrap(); - let per_tenant_timeline = SMGR_QUERY_TIME_PER_TENANT_TIMELINE - .get_metric_with_label_values(&[op.into(), &tenant_id, &shard_slug, &timeline_id]) - .unwrap(); - GlobalAndPerTimelineHistogram { - global, - per_tenant_timeline, - } + .unwrap() }); - Self { metrics } + + let per_timeline_getpage = SMGR_QUERY_TIME_PER_TENANT_TIMELINE + .get_metric_with_label_values(&[ + SmgrQueryType::GetPageAtLsn.into(), + &tenant_id, + &shard_slug, + &timeline_id, + ]) + .unwrap(); + Self { + global_metrics, + per_timeline_getpage, + } } pub(crate) fn start_timer<'c: 'a, 'a>( &'a self, op: SmgrQueryType, ctx: &'c RequestContext, - ) -> impl Drop + '_ { - let metric = &self.metrics[op as usize]; + ) -> Option { + let global_metric = &self.global_metrics[op as usize]; let start = Instant::now(); match ctx.micros_spent_throttled.open() { Ok(()) => (), @@ -1263,12 +1263,20 @@ impl SmgrQueryTimePerTimeline { }); } } - GlobalAndPerTimelineHistogramTimer { - h: metric, + + let timeline_metric = if matches!(op, SmgrQueryType::GetPageAtLsn) { + Some(&self.per_timeline_getpage) + } else { + None + }; + + Some(GlobalAndPerTimelineHistogramTimer { + global_metric, + timeline_metric, ctx, start, 
op, - } + }) } } @@ -1315,17 +1323,9 @@ mod smgr_query_time_tests { let get_counts = || { let global: u64 = ops .iter() - .map(|op| metrics.metrics[*op as usize].global.get_sample_count()) + .map(|op| metrics.global_metrics[*op as usize].get_sample_count()) .sum(); - let per_tenant_timeline: u64 = ops - .iter() - .map(|op| { - metrics.metrics[*op as usize] - .per_tenant_timeline - .get_sample_count() - }) - .sum(); - (global, per_tenant_timeline) + (global, metrics.per_timeline_getpage.get_sample_count()) }; let (pre_global, pre_per_tenant_timeline) = get_counts(); @@ -1336,7 +1336,12 @@ mod smgr_query_time_tests { drop(timer); let (post_global, post_per_tenant_timeline) = get_counts(); - assert_eq!(post_per_tenant_timeline, 1); + if matches!(op, super::SmgrQueryType::GetPageAtLsn) { + // getpage ops are tracked per-timeline, others aren't + assert_eq!(post_per_tenant_timeline, 1); + } else { + assert_eq!(post_per_tenant_timeline, 0); + } assert!(post_global > pre_global); } } @@ -2317,14 +2322,12 @@ impl TimelineMetrics { let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]); } - for op in SmgrQueryType::iter() { - let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[ - op.into(), - tenant_id, - shard_id, - timeline_id, - ]); - } + let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[ + SmgrQueryType::GetPageAtLsn.into(), + tenant_id, + shard_id, + timeline_id, + ]); } } From cdaed4d79c7ac592d909cd958c909fd1795da65c Mon Sep 17 00:00:00 2001 From: Japin Li Date: Thu, 4 Jul 2024 01:55:36 +0800 Subject: [PATCH 1109/1571] Fix outdated comment (#8149) Commit 97b48c23f changes the log wait timeout from 1 second to 100 milliseconds but forgets to update the comment. --- compute_tools/src/compute.rs | 5 ++--- compute_tools/src/pg_helpers.rs | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index a79b666409..41a52ef5b6 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -873,9 +873,8 @@ impl ComputeNode { Ok(()) } - // We could've wrapped this around `pg_ctl reload`, but right now we don't use - // `pg_ctl` for start / stop, so this just seems much easier to do as we already - // have opened connection to Postgres and superuser access. + // Wrapped this around `pg_ctl reload`, but right now we don't use + // `pg_ctl` for start / stop. #[instrument(skip_all)] fn pg_reload_conf(&self) -> Result<()> { let pgctl_bin = Path::new(&self.pgbin).parent().unwrap().join("pg_ctl"); diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index fa0822748b..863fa9468f 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -489,7 +489,7 @@ pub fn handle_postgres_logs(stderr: std::process::ChildStderr) -> JoinHandle<()> /// Read Postgres logs from `stderr` until EOF. 
Buffer is flushed on one of the following conditions: /// - next line starts with timestamp /// - EOF -/// - no new lines were written for the last second +/// - no new lines were written for the last 100 milliseconds async fn handle_postgres_logs_async(stderr: tokio::process::ChildStderr) -> Result<()> { let mut lines = tokio::io::BufReader::new(stderr).lines(); let timeout_duration = Duration::from_millis(100); From a85aa03d18a788d7d4954f44099e14179ad6489f Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 3 Jul 2024 20:05:01 +0200 Subject: [PATCH 1110/1571] page_service: stop exposing `get_last_record_rlsn` (#8244) Compute doesn't use it, let's eliminate it. Ref to Slack thread: https://neondb.slack.com/archives/C033RQ5SPDH/p1719920261995529 --- pageserver/src/metrics.rs | 1 - pageserver/src/page_service.rs | 47 -------------------------------- test_runner/regress/test_auth.py | 2 +- 3 files changed, 1 insertion(+), 49 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index a21d8780cf..87ff8f4d64 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1452,7 +1452,6 @@ pub(crate) enum ComputeCommandKind { PageStreamV2, PageStream, Basebackup, - GetLastRecordRlsn, Fullbackup, ImportBasebackup, ImportWal, diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 6ea5f396d0..a440ad6378 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1656,53 +1656,6 @@ where metric_recording.observe(&res); res?; } - // return pair of prev_lsn and last_lsn - else if let Some(params) = parts.strip_prefix(&["get_last_record_rlsn"]) { - if params.len() != 2 { - return Err(QueryError::Other(anyhow::anyhow!( - "invalid param number for get_last_record_rlsn command" - ))); - } - - let tenant_id = TenantId::from_str(params[0]) - .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; - let timeline_id = TimelineId::from_str(params[1]) - .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; - - tracing::Span::current() - .record("tenant_id", field::display(tenant_id)) - .record("timeline_id", field::display(timeline_id)); - - self.check_permission(Some(tenant_id))?; - - COMPUTE_COMMANDS_COUNTERS - .for_command(ComputeCommandKind::GetLastRecordRlsn) - .inc(); - - async { - let timeline = self - .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero) - .await?; - - let end_of_timeline = timeline.get_last_record_rlsn(); - - pgb.write_message_noflush(&BeMessage::RowDescription(&[ - RowDescriptor::text_col(b"prev_lsn"), - RowDescriptor::text_col(b"last_lsn"), - ]))? - .write_message_noflush(&BeMessage::DataRow(&[ - Some(end_of_timeline.prev.to_string().as_bytes()), - Some(end_of_timeline.last.to_string().as_bytes()), - ]))? 
- .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; - anyhow::Ok(()) - } - .instrument(info_span!( - "handle_get_last_record_lsn", - shard_id = tracing::field::Empty - )) - .await?; - } // same as basebackup, but result includes relational data as well else if let Some(params) = parts.strip_prefix(&["fullbackup"]) { if params.len() < 2 { diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index 035ab2796f..922a21a999 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -211,7 +211,7 @@ def test_auth_failures(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): def check_pageserver(expect_success: bool, **conn_kwargs): check_connection( env.pageserver, - f"get_last_record_rlsn {env.initial_tenant} {timeline_id}", + f"show {env.initial_tenant}", expect_success, **conn_kwargs, ) From 90b51dcf1614614340fafaf61957b645fac34903 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 3 Jul 2024 14:46:58 -0400 Subject: [PATCH 1111/1571] fix(pageserver): ensure test creates valid layer map (#8191) I'd like to add some constraints to the layer map we generate in tests. (1) is the layer map that the current compaction algorithm will produce. There is a property that for all delta layer, all delta layer overlaps with it on the LSN axis will have the same LSN range. (2) is the layer map that cannot be produced with the legacy compaction algorithm. (3) is the layer map that will be produced by the future tiered-compaction algorithm. The current validator does not allow that but we can modify the algorithm to allow it in the future. ## Summary of changes Add a validator to check if the layer map is valid and refactor the test cases to include delta layer start/end LSN. --------- Signed-off-by: Alex Chi Z Co-authored-by: Christian Schwarz --- pageserver/src/tenant.rs | 177 ++++++++++++++++-------------- pageserver/src/tenant/timeline.rs | 92 +++++++++++++--- 2 files changed, 172 insertions(+), 97 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 0c911939e8..adf492ace7 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1365,7 +1365,7 @@ impl Tenant { initdb_lsn: Lsn, pg_version: u32, ctx: &RequestContext, - delta_layer_desc: Vec>, + delta_layer_desc: Vec, image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>, end_lsn: Lsn, ) -> anyhow::Result> { @@ -2933,7 +2933,7 @@ impl Tenant { dst_id: TimelineId, ancestor_lsn: Option, ctx: &RequestContext, - delta_layer_desc: Vec>, + delta_layer_desc: Vec, image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>, end_lsn: Lsn, ) -> anyhow::Result> { @@ -3933,7 +3933,7 @@ mod tests { use storage_layer::PersistentLayerKey; use tests::storage_layer::ValuesReconstructState; use tests::timeline::{GetVectoredError, ShutdownMode}; - use timeline::GcInfo; + use timeline::{DeltaLayerTestDesc, GcInfo}; use utils::bin_ser::BeSer; use utils::id::TenantId; @@ -6229,27 +6229,6 @@ mod tests { .await .unwrap(); - async fn get_vectored_impl_wrapper( - tline: &Arc, - key: Key, - lsn: Lsn, - ctx: &RequestContext, - ) -> Result, GetVectoredError> { - let mut reconstruct_state = ValuesReconstructState::new(); - let mut res = tline - .get_vectored_impl( - KeySpace::single(key..key.next()), - lsn, - &mut reconstruct_state, - ctx, - ) - .await?; - Ok(res.pop_last().map(|(k, v)| { - assert_eq!(k, key); - v.unwrap() - })) - } - let lsn = Lsn(0x30); // test vectored get on parent timeline @@ -6325,27 +6304,6 @@ mod tests { 
.await .unwrap(); - async fn get_vectored_impl_wrapper( - tline: &Arc, - key: Key, - lsn: Lsn, - ctx: &RequestContext, - ) -> Result, GetVectoredError> { - let mut reconstruct_state = ValuesReconstructState::new(); - let mut res = tline - .get_vectored_impl( - KeySpace::single(key..key.next()), - lsn, - &mut reconstruct_state, - ctx, - ) - .await?; - Ok(res.pop_last().map(|(k, v)| { - assert_eq!(k, key); - v.unwrap() - })) - } - let lsn = Lsn(0x30); // test vectored get on parent timeline @@ -6421,9 +6379,18 @@ mod tests { &ctx, // delta layers vec![ - vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], - vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], - vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x10)..Lsn(0x20), + vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], + ), ], // image layers vec![ @@ -6489,17 +6456,29 @@ mod tests { &ctx, // delta layers vec![ - vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], - vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], - vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], - vec![ - (key0, Lsn(0x30), Value::Image(test_img("metadata key 0"))), - (key3, Lsn(0x30), Value::Image(test_img("metadata key 3"))), - ], + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x10)..Lsn(0x20), + vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x30)..Lsn(0x40), + vec![ + (key0, Lsn(0x30), Value::Image(test_img("metadata key 0"))), + (key3, Lsn(0x30), Value::Image(test_img("metadata key 3"))), + ], + ), ], // image layers vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])], - Lsn(0x30), + Lsn(0x40), ) .await .unwrap(); @@ -6522,7 +6501,7 @@ mod tests { // Image layers are created at last_record_lsn let images = tline - .inspect_image_layers(Lsn(0x30), &ctx) + .inspect_image_layers(Lsn(0x40), &ctx) .await .unwrap() .into_iter() @@ -6548,9 +6527,18 @@ mod tests { &ctx, // delta layers vec![ - vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], - vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], - vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x10)..Lsn(0x20), + vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], + ), ], // image layers vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])], @@ -6598,15 +6586,21 @@ mod tests { key } - // We create one bottom-most image layer, a delta layer D1 crossing the GC horizon, D2 below the horizon, and D3 above the horizon. 
+ // We create + // - one bottom-most image layer, + // - a delta layer D1 crossing the GC horizon with data below and above the horizon, + // - a delta layer D2 crossing the GC horizon with data only below the horizon, + // - a delta layer D3 above the horizon. // - // | D1 | | D3 | + // | D3 | + // | D1 | // -| |-- gc horizon ----------------- // | | | D2 | // --------- img layer ------------------ // // What we should expact from this compaction is: - // | Part of D1 | | D3 | + // | D3 | + // | Part of D1 | // --------- img layer with D1+D2 at GC horizon------------------ // img layer at 0x10 @@ -6646,13 +6640,13 @@ mod tests { let delta3 = vec![ ( get_key(8), - Lsn(0x40), - Value::Image(Bytes::from("value 8@0x40")), + Lsn(0x48), + Value::Image(Bytes::from("value 8@0x48")), ), ( get_key(9), - Lsn(0x40), - Value::Image(Bytes::from("value 9@0x40")), + Lsn(0x48), + Value::Image(Bytes::from("value 9@0x48")), ), ]; @@ -6662,7 +6656,11 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, - vec![delta1, delta2, delta3], // delta layers + vec![ + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3), + ], // delta layers vec![(Lsn(0x10), img_layer)], // image layers Lsn(0x50), ) @@ -6683,8 +6681,8 @@ mod tests { Bytes::from_static(b"value 5@0x20"), Bytes::from_static(b"value 6@0x20"), Bytes::from_static(b"value 7@0x10"), - Bytes::from_static(b"value 8@0x40"), - Bytes::from_static(b"value 9@0x40"), + Bytes::from_static(b"value 8@0x48"), + Bytes::from_static(b"value 9@0x48"), ]; for (idx, expected) in expected_result.iter().enumerate() { @@ -6772,10 +6770,10 @@ mod tests { lsn_range: Lsn(0x30)..Lsn(0x41), is_delta: true }, - // The delta layer we created and should not be picked for the compaction + // The delta3 layer that should not be picked for the compaction PersistentLayerKey { key_range: get_key(8)..get_key(10), - lsn_range: Lsn(0x40)..Lsn(0x41), + lsn_range: Lsn(0x48)..Lsn(0x50), is_delta: true } ] @@ -6839,7 +6837,10 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, - vec![delta1], // delta layers + vec![DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x10)..Lsn(0x40), + delta1, + )], // delta layers vec![(Lsn(0x10), image1)], // image layers Lsn(0x50), ) @@ -6963,15 +6964,21 @@ mod tests { key } - // We create one bottom-most image layer, a delta layer D1 crossing the GC horizon, D2 below the horizon, and D3 above the horizon. + // We create + // - one bottom-most image layer, + // - a delta layer D1 crossing the GC horizon with data below and above the horizon, + // - a delta layer D2 crossing the GC horizon with data only below the horizon, + // - a delta layer D3 above the horizon. 
// - // | D1 | | D3 | + // | D3 | + // | D1 | // -| |-- gc horizon ----------------- // | | | D2 | // --------- img layer ------------------ // // What we should expact from this compaction is: - // | Part of D1 | | D3 | + // | D3 | + // | Part of D1 | // --------- img layer with D1+D2 at GC horizon------------------ // img layer at 0x10 @@ -7021,13 +7028,13 @@ mod tests { let delta3 = vec![ ( get_key(8), - Lsn(0x40), - Value::WalRecord(NeonWalRecord::wal_append("@0x40")), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), ), ( get_key(9), - Lsn(0x40), - Value::WalRecord(NeonWalRecord::wal_append("@0x40")), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), ), ]; @@ -7037,7 +7044,11 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, - vec![delta1, delta2, delta3], // delta layers + vec![ + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3), + ], // delta layers vec![(Lsn(0x10), img_layer)], // image layers Lsn(0x50), ) @@ -7064,8 +7075,8 @@ mod tests { Bytes::from_static(b"value 5@0x10@0x20"), Bytes::from_static(b"value 6@0x10@0x20"), Bytes::from_static(b"value 7@0x10"), - Bytes::from_static(b"value 8@0x10@0x40"), - Bytes::from_static(b"value 9@0x10@0x40"), + Bytes::from_static(b"value 8@0x10@0x48"), + Bytes::from_static(b"value 9@0x10@0x48"), ]; let expected_result_at_gc_horizon = [ diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index de9361d721..df4d252ad2 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4735,6 +4735,42 @@ impl DurationRecorder { } } +/// Descriptor for a delta layer used in testing infra. The start/end key/lsn range of the +/// delta layer might be different from the min/max key/lsn in the delta layer. Therefore, +/// the layer descriptor requires the user to provide the ranges, which should cover all +/// keys specified in the `data` field. 
+#[cfg(test)] +pub struct DeltaLayerTestDesc { + pub lsn_range: Range, + pub key_range: Range, + pub data: Vec<(Key, Lsn, Value)>, +} + +#[cfg(test)] +impl DeltaLayerTestDesc { + #[allow(dead_code)] + pub fn new(lsn_range: Range, key_range: Range, data: Vec<(Key, Lsn, Value)>) -> Self { + Self { + lsn_range, + key_range, + data, + } + } + + pub fn new_with_inferred_key_range( + lsn_range: Range, + data: Vec<(Key, Lsn, Value)>, + ) -> Self { + let key_min = data.iter().map(|(key, _, _)| key).min().unwrap(); + let key_max = data.iter().map(|(key, _, _)| key).max().unwrap(); + Self { + key_range: (*key_min)..(key_max.next()), + lsn_range, + data, + } + } +} + impl Timeline { async fn finish_compact_batch( self: &Arc, @@ -5535,37 +5571,65 @@ impl Timeline { #[cfg(test)] pub(super) async fn force_create_delta_layer( self: &Arc, - mut deltas: Vec<(Key, Lsn, Value)>, + mut deltas: DeltaLayerTestDesc, check_start_lsn: Option, ctx: &RequestContext, ) -> anyhow::Result<()> { let last_record_lsn = self.get_last_record_lsn(); - deltas.sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb))); - let min_key = *deltas.first().map(|(k, _, _)| k).unwrap(); - let end_key = deltas.last().map(|(k, _, _)| k).unwrap().next(); - let min_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap(); - let max_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap(); + deltas + .data + .sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb))); + assert!(deltas.data.first().unwrap().0 >= deltas.key_range.start); + assert!(deltas.data.last().unwrap().0 < deltas.key_range.end); + for (_, lsn, _) in &deltas.data { + assert!(deltas.lsn_range.start <= *lsn && *lsn < deltas.lsn_range.end); + } assert!( - max_lsn <= last_record_lsn, - "advance last record lsn before inserting a layer, max_lsn={max_lsn}, last_record_lsn={last_record_lsn}" + deltas.lsn_range.end <= last_record_lsn, + "advance last record lsn before inserting a layer, end_lsn={}, last_record_lsn={}", + deltas.lsn_range.end, + last_record_lsn ); - let end_lsn = Lsn(max_lsn.0 + 1); if let Some(check_start_lsn) = check_start_lsn { - assert!(min_lsn >= check_start_lsn); + assert!(deltas.lsn_range.start >= check_start_lsn); + } + // check if the delta layer does not violate the LSN invariant, the legacy compaction should always produce a batch of + // layers of the same start/end LSN, and so should the force inserted layer + { + /// Checks if a overlaps with b, assume a/b = [start, end). 
+ pub fn overlaps_with(a: &Range, b: &Range) -> bool { + !(a.end <= b.start || b.end <= a.start) + } + + let guard = self.layers.read().await; + for layer in guard.layer_map().iter_historic_layers() { + if layer.is_delta() + && overlaps_with(&layer.lsn_range, &deltas.lsn_range) + && layer.lsn_range != deltas.lsn_range + { + // If a delta layer overlaps with another delta layer AND their LSN range is not the same, panic + panic!( + "inserted layer violates delta layer LSN invariant: current_lsn_range={}..{}, conflict_lsn_range={}..{}", + deltas.lsn_range.start, deltas.lsn_range.end, layer.lsn_range.start, layer.lsn_range.end + ); + } + } } let mut delta_layer_writer = DeltaLayerWriter::new( self.conf, self.timeline_id, self.tenant_shard_id, - min_key, - min_lsn..end_lsn, + deltas.key_range.start, + deltas.lsn_range, ctx, ) .await?; - for (key, lsn, val) in deltas { + for (key, lsn, val) in deltas.data { delta_layer_writer.put_value(key, lsn, val, ctx).await?; } - let delta_layer = delta_layer_writer.finish(end_key, self, ctx).await?; + let delta_layer = delta_layer_writer + .finish(deltas.key_range.end, self, ctx) + .await?; { let mut guard = self.layers.write().await; From 778787d8e97243945d58515cbe48606c947498c8 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 3 Jul 2024 22:29:43 +0100 Subject: [PATCH 1112/1571] pageserver: add supplementary branch usage stats (#8131) ## Problem The metrics we have today aren't convenient for planning around the impact of timeline archival on costs. Closes: https://github.com/neondatabase/neon/issues/8108 ## Summary of changes - Add metric `pageserver_archive_size`, which indicates the logical bytes of data which we would expect to write into an archived branch. - Add metric `pageserver_pitr_history_size`, which indicates the distance between last_record_lsn and the PITR cutoff. These metrics are somewhat temporary: when we implement #8088 and associated consumption metric changes, these will reach a final form. For now, an "archived" branch is just any branch outside of its parent's PITR window: later, archival will become an explicit state (which will _usually_ correspond to falling outside the parent's PITR window). The overall volume of timeline metrics is something to watch, but we are removing many more in https://github.com/neondatabase/neon/pull/8245 than this PR is adding. --- libs/pageserver_api/src/models.rs | 10 +++++++++ pageserver/src/http/routes.rs | 4 ++++ pageserver/src/metrics.rs | 35 +++++++++++++++++++++++++++++++ pageserver/src/tenant.rs | 27 ++++++++++++++++++++++++ pageserver/src/tenant/timeline.rs | 15 +++++++++++++ test_runner/fixtures/metrics.py | 2 ++ 6 files changed, 93 insertions(+) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 959e161c16..9228953761 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -661,6 +661,16 @@ pub struct TimelineInfo { pub current_physical_size: Option, // is None when timeline is Unloaded pub current_logical_size_non_incremental: Option, + /// How many bytes of WAL are within this branch's pitr_interval. If the pitr_interval goes + /// beyond the branch's branch point, we only count up to the branch point. + pub pitr_history_size: u64, + + /// Whether this branch's branch point is within its ancestor's PITR interval (i.e. any + /// ancestor data used by this branch would have been retained anyway). 
If this is false, then + /// this branch may be imposing a cost on the ancestor by causing it to retain layers that it would + /// otherwise be able to GC. + pub within_ancestor_pitr: bool, + pub timeline_dir_layer_file_size_sum: Option, pub wal_source_connstr: Option, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index f726ba115d..6a6f17604d 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -406,6 +406,8 @@ async fn build_timeline_info_common( let walreceiver_status = timeline.walreceiver_status(); + let (pitr_history_size, within_ancestor_pitr) = timeline.get_pitr_history_stats(); + let info = TimelineInfo { tenant_id: timeline.tenant_shard_id, timeline_id: timeline.timeline_id, @@ -426,6 +428,8 @@ async fn build_timeline_info_common( directory_entries_counts: timeline.get_directory_metrics().to_vec(), current_physical_size, current_logical_size_non_incremental: None, + pitr_history_size, + within_ancestor_pitr, timeline_dir_layer_file_size_sum: None, wal_source_connstr, last_received_msg_lsn, diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 87ff8f4d64..9e9fe7fbb8 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -464,6 +464,24 @@ static LAST_RECORD_LSN: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static PITR_HISTORY_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_pitr_history_size", + "Data written since PITR cutoff on this timeline", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + +static TIMELINE_ARCHIVE_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_archive_size", + "Timeline's logical size if it is considered eligible for archival (outside PITR window), else zero", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + static STANDBY_HORIZON: Lazy = Lazy::new(|| { register_int_gauge_vec!( "pageserver_standby_horizon", @@ -2106,6 +2124,8 @@ pub(crate) struct TimelineMetrics { pub garbage_collect_histo: StorageTimeMetrics, pub find_gc_cutoffs_histo: StorageTimeMetrics, pub last_record_gauge: IntGauge, + pub pitr_history_size: UIntGauge, + pub archival_size: UIntGauge, pub standby_horizon_gauge: IntGauge, pub resident_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size @@ -2179,6 +2199,15 @@ impl TimelineMetrics { let last_record_gauge = LAST_RECORD_LSN .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); + + let pitr_history_size = PITR_HISTORY_SIZE + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); + + let archival_size = TIMELINE_ARCHIVE_SIZE + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); + let standby_horizon_gauge = STANDBY_HORIZON .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); @@ -2231,6 +2260,8 @@ impl TimelineMetrics { find_gc_cutoffs_histo, load_layer_map_histo, last_record_gauge, + pitr_history_size, + archival_size, standby_horizon_gauge, resident_physical_size_gauge, current_logical_size_gauge, @@ -2288,6 +2319,10 @@ impl TimelineMetrics { if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) { let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]); } + + let _ = TIMELINE_ARCHIVE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = PITR_HISTORY_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = 
EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index adf492ace7..eef8dc104c 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2874,6 +2874,7 @@ impl Tenant { { let mut target = timeline.gc_info.write().unwrap(); + // Cull any expired leases let now = SystemTime::now(); target.leases.retain(|_, lease| !lease.is_expired(&now)); @@ -2882,6 +2883,31 @@ impl Tenant { .valid_lsn_lease_count_gauge .set(target.leases.len() as u64); + // Look up parent's PITR cutoff to update the child's knowledge of whether it is within parent's PITR + if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() { + if let Some(ancestor_gc_cutoffs) = gc_cutoffs.get(&ancestor_id) { + target.within_ancestor_pitr = + timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.pitr; + } + } + + // Update metrics that depend on GC state + timeline + .metrics + .archival_size + .set(if target.within_ancestor_pitr { + timeline.metrics.current_logical_size_gauge.get() + } else { + 0 + }); + timeline.metrics.pitr_history_size.set( + timeline + .get_last_record_lsn() + .checked_sub(target.cutoffs.pitr) + .unwrap_or(Lsn(0)) + .0, + ); + match gc_cutoffs.remove(&timeline.timeline_id) { Some(cutoffs) => { target.retain_lsns = branchpoints; @@ -7063,6 +7089,7 @@ mod tests { horizon: Lsn(0x30), }, leases: Default::default(), + within_ancestor_pitr: false, }; } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index df4d252ad2..54bbdef56e 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -463,6 +463,9 @@ pub(crate) struct GcInfo { /// Leases granted to particular LSNs. pub(crate) leases: BTreeMap, + + /// Whether our branch point is within our ancestor's PITR interval (for cost estimation) + pub(crate) within_ancestor_pitr: bool, } impl GcInfo { @@ -851,6 +854,18 @@ impl Timeline { .map(|ancestor| ancestor.timeline_id) } + /// Get the bytes written since the PITR cutoff on this branch, and + /// whether this branch's ancestor_lsn is within its parent's PITR. + pub(crate) fn get_pitr_history_stats(&self) -> (u64, bool) { + let gc_info = self.gc_info.read().unwrap(); + let history = self + .get_last_record_lsn() + .checked_sub(gc_info.cutoffs.pitr) + .unwrap_or(Lsn(0)) + .0; + (history, gc_info.within_ancestor_pitr) + } + /// Lock and get timeline's GC cutoff pub(crate) fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard { self.latest_gc_cutoff_lsn.read() diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 41fa8e679f..c019cbbc77 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -144,6 +144,8 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( "pageserver_smgr_query_seconds_bucket", "pageserver_smgr_query_seconds_count", "pageserver_smgr_query_seconds_sum", + "pageserver_archive_size", + "pageserver_pitr_history_size", "pageserver_storage_operations_seconds_count_total", "pageserver_storage_operations_seconds_sum_total", "pageserver_evictions_total", From bbb2fa7cdd1284376155fcbbdf34191b335df4e6 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 4 Jul 2024 06:04:19 +0100 Subject: [PATCH 1113/1571] tests: perform graceful rolling restarts in storcon scale test (#8173) ## Problem Scale test doesn't exercise drain & fill. 
## Summary of changes Make scale test exercise drain & fill --- test_runner/fixtures/neon_fixtures.py | 47 +++++++ .../test_storage_controller_scale.py | 124 ++++++++++++++++-- .../regress/test_storage_controller.py | 59 ++------- 3 files changed, 171 insertions(+), 59 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 565aaba6e0..c002e11c1c 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2113,6 +2113,21 @@ class NeonStorageController(MetricsGetter, LogUtils): self.running = False return self + @staticmethod + def retryable_node_operation(op, ps_id, max_attempts, backoff): + while max_attempts > 0: + try: + op(ps_id) + return + except StorageControllerApiException as e: + max_attempts -= 1 + log.info(f"Operation failed ({max_attempts} attempts left): {e}") + + if max_attempts == 0: + raise e + + time.sleep(backoff) + @staticmethod def raise_api_exception(res: requests.Response): try: @@ -2453,6 +2468,38 @@ class NeonStorageController(MetricsGetter, LogUtils): ) log.info("storage controller passed consistency check") + def poll_node_status( + self, node_id: int, desired_scheduling_policy: str, max_attempts: int, backoff: int + ): + """ + Poll the node status until it reaches 'desired_scheduling_policy' or 'max_attempts' have been exhausted + """ + log.info(f"Polling {node_id} for {desired_scheduling_policy} scheduling policy") + while max_attempts > 0: + try: + status = self.node_status(node_id) + policy = status["scheduling"] + if policy == desired_scheduling_policy: + return + else: + max_attempts -= 1 + log.info(f"Status call returned {policy=} ({max_attempts} attempts left)") + + if max_attempts == 0: + raise AssertionError( + f"Status for {node_id=} did not reach {desired_scheduling_policy=}" + ) + + time.sleep(backoff) + except StorageControllerApiException as e: + max_attempts -= 1 + log.info(f"Status call failed ({max_attempts} retries left): {e}") + + if max_attempts == 0: + raise e + + time.sleep(backoff) + def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]): if isinstance(config_strings, tuple): pairs = [config_strings] diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index a4c8c8ac42..d65a66b010 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -1,18 +1,89 @@ import concurrent.futures import random import time +from collections import defaultdict +from typing import Any, Dict import pytest from fixtures.common_types import TenantId, TenantShardId, TimelineId from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.log_helper import log -from fixtures.neon_fixtures import ( - NeonEnvBuilder, -) +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder from fixtures.pageserver.http import PageserverHttpClient from fixtures.pg_version import PgVersion +def get_consistent_node_shard_counts(env: NeonEnv, total_shards) -> defaultdict[str, int]: + """ + Get the number of shards attached to each node. + This function takes into account the intersection of the intent and the observed state. + If they do not match, it asserts out. 
+ """ + tenants = env.storage_controller.tenant_list() + + intent = dict() + observed = dict() + + tenant_placement: defaultdict[str, Dict[str, Any]] = defaultdict( + lambda: { + "observed": {"attached": None, "secondary": []}, + "intent": {"attached": None, "secondary": []}, + } + ) + + for t in tenants: + for node_id, loc_state in t["observed"]["locations"].items(): + if ( + loc_state is not None + and "conf" in loc_state + and loc_state["conf"] is not None + and loc_state["conf"]["mode"] + in set(["AttachedSingle", "AttachedMulti", "AttachedStale"]) + ): + observed[t["tenant_shard_id"]] = int(node_id) + tenant_placement[t["tenant_shard_id"]]["observed"]["attached"] = int(node_id) + + if ( + loc_state is not None + and "conf" in loc_state + and loc_state["conf"] is not None + and loc_state["conf"]["mode"] == "Secondary" + ): + tenant_placement[t["tenant_shard_id"]]["observed"]["secondary"].append(int(node_id)) + + if "attached" in t["intent"]: + intent[t["tenant_shard_id"]] = t["intent"]["attached"] + tenant_placement[t["tenant_shard_id"]]["intent"]["attached"] = t["intent"]["attached"] + + if "secondary" in t["intent"]: + tenant_placement[t["tenant_shard_id"]]["intent"]["secondary"] += t["intent"][ + "secondary" + ] + + log.info(f"{tenant_placement=}") + + matching = { + tid: intent[tid] for tid in observed if tid in intent and intent[tid] == observed[tid] + } + assert len(matching) == total_shards + + attached_per_node: defaultdict[str, int] = defaultdict(int) + for node_id in matching.values(): + attached_per_node[node_id] += 1 + + return attached_per_node + + +def assert_consistent_balanced_attachments(env: NeonEnv, total_shards): + attached_per_node = get_consistent_node_shard_counts(env, total_shards) + + min_shard_count = min(attached_per_node.values()) + max_shard_count = max(attached_per_node.values()) + + flake_factor = 5 / 100 + assert max_shard_count - min_shard_count <= int(total_shards * flake_factor) + + @pytest.mark.timeout(3600) # super long running test: should go down as we optimize def test_storage_controller_many_tenants( neon_env_builder: NeonEnvBuilder, compute_reconfigure_listener: ComputeReconfigure @@ -44,7 +115,8 @@ def test_storage_controller_many_tenants( # A small sleep on each call into the notify hook, to simulate the latency of doing a database write compute_reconfigure_listener.register_on_notify(lambda body: time.sleep(0.01)) - env = neon_env_builder.init_start() + env = neon_env_builder.init_configs() + neon_env_builder.start() # We will intentionally stress reconciler concurrrency, which triggers a warning when lots # of shards are hitting the delayed path. 
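For context on the balance check added above: `assert_consistent_balanced_attachments` only counts shards whose intent and observed attachment agree, and then tolerates a 5% spread between the most- and least-loaded pageservers. A minimal sketch of that arithmetic (the shard total and per-node counts here are illustrative, not the values the test actually uses):

```python
# Spread tolerated by assert_consistent_balanced_attachments: 5% of the total shard count.
total_shards = 1000            # illustrative; the test derives this from tenant_count * shard_count
flake_factor = 5 / 100
allowed_spread = int(total_shards * flake_factor)   # -> 50

# Example per-node attachment counts after a drain/fill cycle (hypothetical numbers).
attached_per_node = {"1": 260, "2": 240, "3": 255, "4": 245}
assert max(attached_per_node.values()) - min(attached_per_node.values()) <= allowed_spread
```

The intersection of intent and observed state is what makes the count stable: shards mid-migration are excluded, and the assertion that the intersection covers all shards guarantees nothing is silently in flight.
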
@@ -79,6 +151,8 @@ def test_storage_controller_many_tenants( shard_count = 2 stripe_size = 1024 + total_shards = tenant_count * shard_count + tenants = set(TenantId.generate() for _i in range(0, tenant_count)) virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) @@ -195,10 +269,44 @@ def test_storage_controller_many_tenants( env.storage_controller.consistency_check() check_memory() - # Restart pageservers: this exercises the /re-attach API - for pageserver in env.pageservers: - pageserver.stop() - pageserver.start() + shard_counts = get_consistent_node_shard_counts(env, total_shards) + log.info(f"Shard counts before rolling restart: {shard_counts}") + + assert_consistent_balanced_attachments(env, total_shards) + + # Restart pageservers gracefully: this exercises the /re-attach pageserver API + # and the storage controller drain and fill API + for ps in env.pageservers: + env.storage_controller.retryable_node_operation( + lambda ps_id: env.storage_controller.node_drain(ps_id), ps.id, max_attempts=3, backoff=2 + ) + + env.storage_controller.poll_node_status( + ps.id, "PauseForRestart", max_attempts=24, backoff=5 + ) + + shard_counts = get_consistent_node_shard_counts(env, total_shards) + log.info(f"Shard counts after draining node {ps.id}: {shard_counts}") + # Assert that we've drained the node + assert shard_counts[str(ps.id)] == 0 + # Assert that those shards actually went somewhere + assert sum(shard_counts.values()) == total_shards + + ps.restart() + env.storage_controller.poll_node_status(ps.id, "Active", max_attempts=24, backoff=1) + + env.storage_controller.retryable_node_operation( + lambda ps_id: env.storage_controller.node_fill(ps_id), ps.id, max_attempts=3, backoff=2 + ) + env.storage_controller.poll_node_status(ps.id, "Active", max_attempts=24, backoff=5) + + shard_counts = get_consistent_node_shard_counts(env, total_shards) + log.info(f"Shard counts after filling node {ps.id}: {shard_counts}") + + assert_consistent_balanced_attachments(env, total_shards) + + env.storage_controller.reconcile_until_idle() + env.storage_controller.consistency_check() # Consistency check is safe here: restarting pageservers should not have caused any Reconcilers to spawn, # as they were not offline long enough to trigger any scheduling changes. 
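For reference, the per-pageserver drain/fill cycle that the scale test now drives boils down to the following sketch. It only uses the `NeonStorageController` helpers introduced in this patch (`retryable_node_operation`, `poll_node_status`, `node_drain`, `node_fill`); the attempt counts and backoffs are illustrative rather than tuned values.

```python
def rolling_restart(env, pageservers):
    """Drain, restart and refill each pageserver in turn (sketch; timeouts are illustrative)."""
    for ps in pageservers:
        # Move attached shards off the node, then wait until the storage controller
        # parks it in the PauseForRestart scheduling policy.
        env.storage_controller.retryable_node_operation(
            lambda ps_id: env.storage_controller.node_drain(ps_id), ps.id, max_attempts=3, backoff=2
        )
        env.storage_controller.poll_node_status(ps.id, "PauseForRestart", max_attempts=24, backoff=5)

        # Restart the pageserver and wait for it to become Active again.
        ps.restart()
        env.storage_controller.poll_node_status(ps.id, "Active", max_attempts=24, backoff=1)

        # Fill the node back up and wait for the fill operation to complete.
        env.storage_controller.retryable_node_operation(
            lambda ps_id: env.storage_controller.node_fill(ps_id), ps.id, max_attempts=3, backoff=2
        )
        env.storage_controller.poll_node_status(ps.id, "Active", max_attempts=24, backoff=5)
```

Checking shard counts between the drain and fill steps, as the test does, is what distinguishes a graceful rolling restart from the previous plain stop/start loop, which only exercised the `/re-attach` path.
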
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 1b294fb2d0..a78f566f0e 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -1518,49 +1518,6 @@ def test_tenant_import(neon_env_builder: NeonEnvBuilder, shard_count, remote_sto workload.validate() -def retryable_node_operation(op, ps_id, max_attempts, backoff): - while max_attempts > 0: - try: - op(ps_id) - return - except StorageControllerApiException as e: - max_attempts -= 1 - log.info(f"Operation failed ({max_attempts} attempts left): {e}") - - if max_attempts == 0: - raise e - - time.sleep(backoff) - - -def poll_node_status(env, node_id, desired_scheduling_policy, max_attempts, backoff): - log.info(f"Polling {node_id} for {desired_scheduling_policy} scheduling policy") - while max_attempts > 0: - try: - status = env.storage_controller.node_status(node_id) - policy = status["scheduling"] - if policy == desired_scheduling_policy: - return - else: - max_attempts -= 1 - log.info(f"Status call returned {policy=} ({max_attempts} attempts left)") - - if max_attempts == 0: - raise AssertionError( - f"Status for {node_id=} did not reach {desired_scheduling_policy=}" - ) - - time.sleep(backoff) - except StorageControllerApiException as e: - max_attempts -= 1 - log.info(f"Status call failed ({max_attempts} retries left): {e}") - - if max_attempts == 0: - raise e - - time.sleep(backoff) - - def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): """ Graceful reststart of storage controller clusters use the drain and @@ -1601,10 +1558,10 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): # Perform a graceful rolling restart for ps in env.pageservers: - retryable_node_operation( + env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_drain(ps_id), ps.id, max_attempts=3, backoff=2 ) - poll_node_status(env, ps.id, "PauseForRestart", max_attempts=6, backoff=5) + env.storage_controller.poll_node_status(ps.id, "PauseForRestart", max_attempts=6, backoff=5) shard_counts = get_node_shard_counts(env, tenant_ids) log.info(f"Shard counts after draining node {ps.id}: {shard_counts}") @@ -1614,12 +1571,12 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): assert sum(shard_counts.values()) == total_shards ps.restart() - poll_node_status(env, ps.id, "Active", max_attempts=10, backoff=1) + env.storage_controller.poll_node_status(ps.id, "Active", max_attempts=10, backoff=1) - retryable_node_operation( + env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_fill(ps_id), ps.id, max_attempts=3, backoff=2 ) - poll_node_status(env, ps.id, "Active", max_attempts=6, backoff=5) + env.storage_controller.poll_node_status(ps.id, "Active", max_attempts=6, backoff=5) shard_counts = get_node_shard_counts(env, tenant_ids) log.info(f"Shard counts after filling node {ps.id}: {shard_counts}") @@ -1657,15 +1614,15 @@ def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder): ps_id_to_drain = env.pageservers[0].id - retryable_node_operation( + env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_drain(ps_id), ps_id_to_drain, max_attempts=3, backoff=2, ) - poll_node_status(env, ps_id_to_drain, "Draining", max_attempts=6, backoff=2) + env.storage_controller.poll_node_status(ps_id_to_drain, "Draining", max_attempts=6, backoff=2) 
env.storage_controller.cancel_node_drain(ps_id_to_drain) - poll_node_status(env, ps_id_to_drain, "Active", max_attempts=6, backoff=2) + env.storage_controller.poll_node_status(ps_id_to_drain, "Active", max_attempts=6, backoff=2) From e03c3c9893acbc6052184a5be8cc6b9f893a4d4e Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 4 Jul 2024 09:03:03 +0100 Subject: [PATCH 1114/1571] proxy: cache certain non-retriable console errors for a short time (#8201) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem If there's a quota error, it makes sense to cache it for a short window of time. Many clients do not handle database connection errors gracefully, so just spam retry 🤡 ## Summary of changes Updates the node_info cache to support storing console errors. Store console errors if they cannot be retried (using our own heuristic. should only trigger for quota exceeded errors). --- proxy/src/cache/common.rs | 7 +++ proxy/src/cache/timed_lru.rs | 38 ++++++++++++- proxy/src/console/messages.rs | 62 +++++++++++---------- proxy/src/console/provider.rs | 6 +- proxy/src/console/provider/neon.rs | 89 ++++++++++++++++++++++-------- proxy/src/proxy/tests.rs | 4 +- 6 files changed, 146 insertions(+), 60 deletions(-) diff --git a/proxy/src/cache/common.rs b/proxy/src/cache/common.rs index bc1c37512b..4e393fddb2 100644 --- a/proxy/src/cache/common.rs +++ b/proxy/src/cache/common.rs @@ -53,6 +53,13 @@ impl Cached { ) } + pub fn map(self, f: impl FnOnce(V) -> U) -> Cached { + Cached { + token: self.token, + value: f(self.value), + } + } + /// Drop this entry from a cache if it's still there. pub fn invalidate(self) -> V { if let Some((cache, info)) = &self.token { diff --git a/proxy/src/cache/timed_lru.rs b/proxy/src/cache/timed_lru.rs index 3b21381bb9..c5c4f6a1ed 100644 --- a/proxy/src/cache/timed_lru.rs +++ b/proxy/src/cache/timed_lru.rs @@ -65,6 +65,8 @@ impl Cache for TimedLru { struct Entry { created_at: Instant, expires_at: Instant, + ttl: Duration, + update_ttl_on_retrieval: bool, value: T, } @@ -122,7 +124,6 @@ impl TimedLru { Q: Hash + Eq + ?Sized, { let now = Instant::now(); - let deadline = now.checked_add(self.ttl).expect("time overflow"); // Do costly things before taking the lock. let mut cache = self.cache.lock(); @@ -142,7 +143,8 @@ impl TimedLru { let (created_at, expires_at) = (entry.created_at, entry.expires_at); // Update the deadline and the entry's position in the LRU list. - if self.update_ttl_on_retrieval { + let deadline = now.checked_add(raw_entry.get().ttl).expect("time overflow"); + if raw_entry.get().update_ttl_on_retrieval { raw_entry.get_mut().expires_at = deadline; } raw_entry.to_back(); @@ -162,12 +164,27 @@ impl TimedLru { /// existed, return the previous value and its creation timestamp. #[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)] fn insert_raw(&self, key: K, value: V) -> (Instant, Option) { + self.insert_raw_ttl(key, value, self.ttl, self.update_ttl_on_retrieval) + } + + /// Insert an entry to the cache. If an entry with the same key already + /// existed, return the previous value and its creation timestamp. 
+ #[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)] + fn insert_raw_ttl( + &self, + key: K, + value: V, + ttl: Duration, + update: bool, + ) -> (Instant, Option) { let created_at = Instant::now(); - let expires_at = created_at.checked_add(self.ttl).expect("time overflow"); + let expires_at = created_at.checked_add(ttl).expect("time overflow"); let entry = Entry { created_at, expires_at, + ttl, + update_ttl_on_retrieval: update, value, }; @@ -190,6 +207,21 @@ impl TimedLru { } impl TimedLru { + pub fn insert_ttl(&self, key: K, value: V, ttl: Duration) { + self.insert_raw_ttl(key, value, ttl, false); + } + + pub fn insert_unit(&self, key: K, value: V) -> (Option, Cached<&Self, ()>) { + let (created_at, old) = self.insert_raw(key.clone(), value); + + let cached = Cached { + token: Some((self, LookupInfo { created_at, key })), + value: (), + }; + + (old, cached) + } + pub fn insert(&self, key: K, value: V) -> (Option, Cached<&Self>) { let (created_at, old) = self.insert_raw(key.clone(), value.clone()); diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs index d28d13ba69..9abf24ab7f 100644 --- a/proxy/src/console/messages.rs +++ b/proxy/src/console/messages.rs @@ -9,7 +9,7 @@ use crate::proxy::retry::CouldRetry; /// Generic error response with human-readable description. /// Note that we can't always present it to user as is. -#[derive(Debug, Deserialize)] +#[derive(Debug, Deserialize, Clone)] pub struct ConsoleError { pub error: Box, #[serde(skip)] @@ -82,41 +82,19 @@ impl CouldRetry for ConsoleError { .details .error_info .map_or(Reason::Unknown, |e| e.reason); - match reason { - // not a transitive error - Reason::RoleProtected => false, - // on retry, it will still not be found - Reason::ResourceNotFound - | Reason::ProjectNotFound - | Reason::EndpointNotFound - | Reason::BranchNotFound => false, - // we were asked to go away - Reason::RateLimitExceeded - | Reason::NonDefaultBranchComputeTimeExceeded - | Reason::ActiveTimeQuotaExceeded - | Reason::ComputeTimeQuotaExceeded - | Reason::WrittenDataQuotaExceeded - | Reason::DataTransferQuotaExceeded - | Reason::LogicalSizeQuotaExceeded => false, - // transitive error. control plane is currently busy - // but might be ready soon - Reason::RunningOperations => true, - Reason::ConcurrencyLimitReached => true, - Reason::LockAlreadyTaken => true, - // unknown error. better not retry it. - Reason::Unknown => false, - } + + reason.can_retry() } } -#[derive(Debug, Deserialize)] +#[derive(Debug, Deserialize, Clone)] pub struct Status { pub code: Box, pub message: Box, pub details: Details, } -#[derive(Debug, Deserialize)] +#[derive(Debug, Deserialize, Clone)] pub struct Details { pub error_info: Option, pub retry_info: Option, @@ -199,6 +177,34 @@ impl Reason { | Reason::BranchNotFound ) } + + pub fn can_retry(&self) -> bool { + match self { + // do not retry role protected errors + // not a transitive error + Reason::RoleProtected => false, + // on retry, it will still not be found + Reason::ResourceNotFound + | Reason::ProjectNotFound + | Reason::EndpointNotFound + | Reason::BranchNotFound => false, + // we were asked to go away + Reason::RateLimitExceeded + | Reason::NonDefaultBranchComputeTimeExceeded + | Reason::ActiveTimeQuotaExceeded + | Reason::ComputeTimeQuotaExceeded + | Reason::WrittenDataQuotaExceeded + | Reason::DataTransferQuotaExceeded + | Reason::LogicalSizeQuotaExceeded => false, + // transitive error. 
control plane is currently busy + // but might be ready soon + Reason::RunningOperations + | Reason::ConcurrencyLimitReached + | Reason::LockAlreadyTaken => true, + // unknown error. better not retry it. + Reason::Unknown => false, + } + } } #[derive(Copy, Clone, Debug, Deserialize)] @@ -206,7 +212,7 @@ pub struct RetryInfo { pub retry_delay_ms: u64, } -#[derive(Debug, Deserialize)] +#[derive(Debug, Deserialize, Clone)] pub struct UserFacingMessage { pub message: Box, } diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index bec55a8343..7a9637066f 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -2,7 +2,7 @@ pub mod mock; pub mod neon; -use super::messages::MetricsAuxInfo; +use super::messages::{ConsoleError, MetricsAuxInfo}; use crate::{ auth::{ backend::{ComputeCredentialKeys, ComputeUserInfo}, @@ -317,8 +317,8 @@ impl NodeInfo { } } -pub type NodeInfoCache = TimedLru; -pub type CachedNodeInfo = Cached<&'static NodeInfoCache>; +pub type NodeInfoCache = TimedLru>>; +pub type CachedNodeInfo = Cached<&'static NodeInfoCache, NodeInfo>; pub type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option>; pub type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc>>; diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 41bd2f4956..a6e67be22f 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -9,7 +9,7 @@ use super::{ use crate::{ auth::backend::ComputeUserInfo, compute, - console::messages::ColdStartInfo, + console::messages::{ColdStartInfo, Reason}, http, metrics::{CacheOutcome, Metrics}, rate_limiter::EndpointRateLimiter, @@ -17,10 +17,10 @@ use crate::{ }; use crate::{cache::Cached, context::RequestMonitoring}; use futures::TryFutureExt; -use std::sync::Arc; +use std::{sync::Arc, time::Duration}; use tokio::time::Instant; use tokio_postgres::config::SslMode; -use tracing::{error, info, info_span, warn, Instrument}; +use tracing::{debug, error, info, info_span, warn, Instrument}; pub struct Api { endpoint: http::Endpoint, @@ -273,26 +273,34 @@ impl super::Api for Api { ) -> Result { let key = user_info.endpoint_cache_key(); + macro_rules! check_cache { + () => { + if let Some(cached) = self.caches.node_info.get(&key) { + let (cached, info) = cached.take_value(); + let info = info.map_err(|c| { + info!(key = &*key, "found cached wake_compute error"); + WakeComputeError::ApiError(ApiError::Console(*c)) + })?; + + debug!(key = &*key, "found cached compute node info"); + ctx.set_project(info.aux.clone()); + return Ok(cached.map(|()| info)); + } + }; + } + // Every time we do a wakeup http request, the compute node will stay up // for some time (highly depends on the console's scale-to-zero policy); // The connection info remains the same during that period of time, // which means that we might cache it to reduce the load and latency. 
- if let Some(cached) = self.caches.node_info.get(&key) { - info!(key = &*key, "found cached compute node info"); - ctx.set_project(cached.aux.clone()); - return Ok(cached); - } + check_cache!(); let permit = self.locks.get_permit(&key).await?; // after getting back a permit - it's possible the cache was filled // double check if permit.should_check_cache() { - if let Some(cached) = self.caches.node_info.get(&key) { - info!(key = &*key, "found cached compute node info"); - ctx.set_project(cached.aux.clone()); - return Ok(cached); - } + check_cache!(); } // check rate limit @@ -300,23 +308,56 @@ impl super::Api for Api { .wake_compute_endpoint_rate_limiter .check(user_info.endpoint.normalize_intern(), 1) { - info!(key = &*key, "found cached compute node info"); return Err(WakeComputeError::TooManyConnections); } - let mut node = permit.release_result(self.do_wake_compute(ctx, user_info).await)?; - ctx.set_project(node.aux.clone()); - let cold_start_info = node.aux.cold_start_info; - info!("woken up a compute node"); + let node = permit.release_result(self.do_wake_compute(ctx, user_info).await); + match node { + Ok(node) => { + ctx.set_project(node.aux.clone()); + debug!(key = &*key, "created a cache entry for woken compute node"); - // store the cached node as 'warm' - node.aux.cold_start_info = ColdStartInfo::WarmCached; - let (_, mut cached) = self.caches.node_info.insert(key.clone(), node); - cached.aux.cold_start_info = cold_start_info; + let mut stored_node = node.clone(); + // store the cached node as 'warm_cached' + stored_node.aux.cold_start_info = ColdStartInfo::WarmCached; - info!(key = &*key, "created a cache entry for compute node info"); + let (_, cached) = self.caches.node_info.insert_unit(key, Ok(stored_node)); - Ok(cached) + Ok(cached.map(|()| node)) + } + Err(err) => match err { + WakeComputeError::ApiError(ApiError::Console(err)) => { + let Some(status) = &err.status else { + return Err(WakeComputeError::ApiError(ApiError::Console(err))); + }; + + let reason = status + .details + .error_info + .map_or(Reason::Unknown, |x| x.reason); + + // if we can retry this error, do not cache it. + if reason.can_retry() { + return Err(WakeComputeError::ApiError(ApiError::Console(err))); + } + + // at this point, we should only have quota errors. + debug!( + key = &*key, + "created a cache entry for the wake compute error" + ); + + self.caches.node_info.insert_ttl( + key, + Err(Box::new(err.clone())), + Duration::from_secs(30), + ); + + Err(WakeComputeError::ApiError(ApiError::Console(err))) + } + err => return Err(err), + }, + } } } diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 8119f39fae..5186a9e1b0 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -540,8 +540,8 @@ fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeIn }, allow_self_signed_compute: false, }; - let (_, node) = cache.insert("key".into(), node); - node + let (_, node2) = cache.insert_unit("key".into(), Ok(node.clone())); + node2.map(|()| node) } fn helper_create_connect_info( From 5b69b32dc5fa1500fda12e53471809d5e6082f6f Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 4 Jul 2024 09:20:01 +0100 Subject: [PATCH 1115/1571] CI(build-and-test): add conclusion job (#8246) ## Problem Currently, if you need to rename a job and the job is listed in [branch protection rules](https://github.com/neondatabase/neon/settings/branch_protection_rules), the PR won't be allowed to merge. 
## Summary of changes - Add `conclusion` job that fails if any of its dependencies don't finish successfully --- .github/workflows/build_and_test.yml | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 5ac8c6ec27..9b75d0bf3c 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1368,3 +1368,31 @@ jobs: with: from-tag: ${{ needs.build-build-tools-image.outputs.image-tag }} secrets: inherit + + # This job simplifies setting branch protection rules (in GitHub UI) + # by allowing to set only this job instead of listing many others. + # It also makes it easier to rename or parametrise jobs (using matrix) + # which requires changes in branch protection rules + # + # Note, that we can't add external check (like `neon-cloud-e2e`) we still need to use GitHub UI for that. + # + # https://github.com/neondatabase/neon/settings/branch_protection_rules + conclusion: + if: always() + # Format `needs` differently to make the list more readable. + # Usually we do `needs: [...]` + needs: + - check-codestyle-python + - check-codestyle-rust + - regress-tests + - test-images + runs-on: ubuntu-22.04 + steps: + # The list of possible results: + # https://docs.github.com/en/actions/learn-github-actions/contexts#needs-context + - name: Fail the job if any of the dependencies do not succeed + run: exit 1 + if: | + contains(needs.*.result, 'failure') + || contains(needs.*.result, 'cancelled') + || contains(needs.*.result, 'skipped') From a46253766bf59d65c0b24f1e626787316e23ca80 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 4 Jul 2024 13:22:33 +0100 Subject: [PATCH 1116/1571] pageserver: increase rate limit duration for layer visit log (#8263) ## Problem I'd like to keep this in the tree since it might be useful in prod as well. It's a bit too noisy as is and missing the lsn. ## Summary of changes Add an lsn field and and increase the rate limit duration. --- pageserver/src/tenant/timeline.rs | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 54bbdef56e..bbf0d0a4bf 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1284,15 +1284,14 @@ impl Timeline { if avg >= Self::VEC_GET_LAYERS_VISITED_WARN_THRESH { use utils::rate_limit::RateLimit; static LOGGED: Lazy> = - Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60)))); let mut rate_limit = LOGGED.lock().unwrap(); rate_limit.call(|| { tracing::info!( - tenant_id = %self.tenant_shard_id.tenant_id, - shard_id = %self.tenant_shard_id.shard_slug(), - timeline_id = %self.timeline_id, - "Vectored read for {} visited {} layers on average per key and {} in total. {}/{} pages were returned", - keyspace, avg, layers_visited, results.len(), keyspace.total_raw_size()); + shard_id = %self.tenant_shard_id.shard_slug(), + lsn = %lsn, + "Vectored read for {} visited {} layers on average per key and {} in total. 
{}/{} pages were returned", + keyspace, avg, layers_visited, results.len(), keyspace.total_raw_size()); }); } From a004d27fcae6b263a0878b24794514e8f5273dac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 4 Jul 2024 15:04:08 +0200 Subject: [PATCH 1117/1571] Use bool param for round_trip_test_compressed (#8252) As per @koivunej 's request in https://github.com/neondatabase/neon/pull/8238#discussion_r1663892091 , use a runtime param instead of monomorphizing the function based on the value. Part of https://github.com/neondatabase/neon/issues/5431 --- pageserver/src/tenant/blob_io.rs | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index de74066b81..1a6a5702f1 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -393,11 +393,12 @@ mod tests { use rand::{Rng, SeedableRng}; async fn round_trip_test(blobs: &[Vec]) -> Result<(), Error> { - round_trip_test_compressed::(blobs).await + round_trip_test_compressed::(blobs, false).await } - async fn round_trip_test_compressed( + async fn round_trip_test_compressed( blobs: &[Vec], + compression: bool, ) -> Result<(), Error> { let temp_dir = camino_tempfile::tempdir()?; let pathbuf = temp_dir.path().join("file"); @@ -409,7 +410,7 @@ mod tests { let file = VirtualFile::create(pathbuf.as_path(), &ctx).await?; let mut wtr = BlobWriter::::new(file, 0); for blob in blobs.iter() { - let (_, res) = if COMPRESSION { + let (_, res) = if compression { wtr.write_blob_maybe_compressed( blob.clone(), &ctx, @@ -432,7 +433,7 @@ mod tests { let file = VirtualFile::open(pathbuf.as_path(), &ctx).await?; let rdr = BlockReaderRef::VirtualFile(&file); - let rdr = BlockCursor::new_with_compression(rdr, COMPRESSION); + let rdr = BlockCursor::new_with_compression(rdr, compression); for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() { let blob_read = rdr.read_blob(*offset, &ctx).await?; assert_eq!( @@ -466,8 +467,8 @@ mod tests { ]; round_trip_test::(blobs).await?; round_trip_test::(blobs).await?; - round_trip_test_compressed::(blobs).await?; - round_trip_test_compressed::(blobs).await?; + round_trip_test_compressed::(blobs, true).await?; + round_trip_test_compressed::(blobs, true).await?; Ok(()) } @@ -483,8 +484,8 @@ mod tests { ]; round_trip_test::(blobs).await?; round_trip_test::(blobs).await?; - round_trip_test_compressed::(blobs).await?; - round_trip_test_compressed::(blobs).await?; + round_trip_test_compressed::(blobs, true).await?; + round_trip_test_compressed::(blobs, true).await?; Ok(()) } From bf9fc7706190ecd7cbd04fc56864086ced717327 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 4 Jul 2024 14:58:01 +0100 Subject: [PATCH 1118/1571] CI(pg-clients): unify workflow with build-and-test (#8160) ## Problem `pg-clients` workflow looks different from the main `build-and-test` workflow for historical reasons (it was my very first task at Neon, and back then I wasn't really familiar with the rest of the CI pipelines). 
This PR unifies `pg-clients` workflow with `build-and-test` ## Summary of changes - Rename `pg_clients.yml` to `pg-clients.yml` - Run the workflow on changes in relevant files - Create Allure report for tests - Send slack notifications to `#on-call-qa-staging-stream` channel (instead of `#on-call-staging-stream`) - Update Client libraries once we're here --- .github/workflows/build_and_test.yml | 2 +- .github/workflows/pg-clients.yml | 115 ++++++++ .github/workflows/pg_clients.yml | 98 ------- Dockerfile.build-tools | 22 +- test_runner/pg_clients/java/jdbc/Dockerfile | 2 +- .../pg_clients/python/pg8000/requirements.txt | 2 +- .../pg_clients/rust/tokio-postgres/Cargo.lock | 273 +++++++++--------- .../pg_clients/rust/tokio-postgres/Cargo.toml | 4 +- .../pg_clients/rust/tokio-postgres/Dockerfile | 2 +- .../swift/PostgresClientKitExample/Dockerfile | 4 +- .../PostgresClientKitExample/Package.resolved | 12 +- .../PostgresClientKitExample/Package.swift | 2 +- .../swift/PostgresNIOExample/Dockerfile | 4 +- .../swift/PostgresNIOExample/Package.resolved | 25 +- .../swift/PostgresNIOExample/Package.swift | 4 +- .../typescript/postgresql-client/Dockerfile | 2 +- .../postgresql-client/package-lock.json | 12 +- .../typescript/postgresql-client/package.json | 2 +- .../typescript/serverless-driver/Dockerfile | 2 +- .../serverless-driver/package-lock.json | 144 +++++---- .../typescript/serverless-driver/package.json | 2 +- 21 files changed, 403 insertions(+), 332 deletions(-) create mode 100644 .github/workflows/pg-clients.yml delete mode 100644 .github/workflows/pg_clients.yml diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 9b75d0bf3c..a3246987e2 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -30,7 +30,7 @@ jobs: if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }} uses: ./.github/workflows/check-permissions.yml with: - github-event-name: ${{ github.event_name}} + github-event-name: ${{ github.event_name }} cancel-previous-e2e-tests: needs: [ check-permissions ] diff --git a/.github/workflows/pg-clients.yml b/.github/workflows/pg-clients.yml new file mode 100644 index 0000000000..e21e45c929 --- /dev/null +++ b/.github/workflows/pg-clients.yml @@ -0,0 +1,115 @@ +name: Test Postgres client libraries + +on: + schedule: + # * is a special character in YAML so you have to quote this string + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + - cron: '23 02 * * *' # run once a day, timezone is utc + pull_request: + paths: + - '.github/workflows/pg-clients.yml' + - 'test_runner/pg_clients/**' + - 'poetry.lock' + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref_name }} + cancel-in-progress: ${{ github.event_name == 'pull_request' }} + +defaults: + run: + shell: bash -euxo pipefail {0} + +env: + DEFAULT_PG_VERSION: 16 + PLATFORM: neon-captest-new + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} + AWS_DEFAULT_REGION: eu-central-1 + +jobs: + check-permissions: + if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }} + uses: ./.github/workflows/check-permissions.yml + with: + github-event-name: ${{ github.event_name }} + + check-build-tools-image: + needs: [ check-permissions ] + uses: 
./.github/workflows/check-build-tools-image.yml + + build-build-tools-image: + needs: [ check-build-tools-image ] + uses: ./.github/workflows/build-build-tools-image.yml + with: + image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }} + secrets: inherit + + test-postgres-client-libs: + needs: [ build-build-tools-image ] + runs-on: ubuntu-22.04 + + container: + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init --user root + + steps: + - uses: actions/checkout@v4 + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact + path: /tmp/neon/ + prefix: latest + + - name: Create Neon Project + id: create-neon-project + uses: ./.github/actions/neon-project-create + with: + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + postgres_version: ${{ env.DEFAULT_PG_VERSION }} + + - name: Run tests + uses: ./.github/actions/run-python-test-set + with: + build_type: remote + test_selection: pg_clients + run_in_parallel: false + extra_params: -m remote_cluster + pg_version: ${{ env.DEFAULT_PG_VERSION }} + env: + BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} + + - name: Delete Neon Project + if: always() + uses: ./.github/actions/neon-project-delete + with: + project_id: ${{ steps.create-neon-project.outputs.project_id }} + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Create Allure report + if: ${{ !cancelled() }} + id: create-allure-report + uses: ./.github/actions/allure-report-generate + with: + store-test-results-into-db: true + env: + REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} + + - name: Post to a Slack channel + if: github.event.schedule && failure() + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream + slack-message: | + Testing Postgres clients: <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|${{ job.status }}> (<${{ steps.create-allure-report.outputs.report-url }}|test report>) + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/.github/workflows/pg_clients.yml b/.github/workflows/pg_clients.yml deleted file mode 100644 index dd09abddb8..0000000000 --- a/.github/workflows/pg_clients.yml +++ /dev/null @@ -1,98 +0,0 @@ -name: Test Postgres client libraries - -on: - schedule: - # * is a special character in YAML so you have to quote this string - # ┌───────────── minute (0 - 59) - # │ ┌───────────── hour (0 - 23) - # │ │ ┌───────────── day of the month (1 - 31) - # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) - # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - - cron: '23 02 * * *' # run once a day, timezone is utc - - workflow_dispatch: - -concurrency: - # Allow only one workflow per any non-`main` branch. 
- group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} - cancel-in-progress: true - -jobs: - test-postgres-client-libs: - # TODO: switch to gen2 runner, requires docker - runs-on: ubuntu-22.04 - - env: - DEFAULT_PG_VERSION: 14 - TEST_OUTPUT: /tmp/test_output - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - uses: actions/setup-python@v4 - with: - python-version: 3.9 - - - name: Install Poetry - uses: snok/install-poetry@v1 - - - name: Cache poetry deps - uses: actions/cache@v4 - with: - path: ~/.cache/pypoetry/virtualenvs - key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-ubunutu-latest-${{ hashFiles('poetry.lock') }} - - - name: Install Python deps - shell: bash -euxo pipefail {0} - run: ./scripts/pysync - - - name: Create Neon Project - id: create-neon-project - uses: ./.github/actions/neon-project-create - with: - api_key: ${{ secrets.NEON_STAGING_API_KEY }} - postgres_version: ${{ env.DEFAULT_PG_VERSION }} - - - name: Run pytest - env: - REMOTE_ENV: 1 - BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} - POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - shell: bash -euxo pipefail {0} - run: | - # Test framework expects we have psql binary; - # but since we don't really need it in this test, let's mock it - mkdir -p "$POSTGRES_DISTRIB_DIR/v${DEFAULT_PG_VERSION}/bin" && touch "$POSTGRES_DISTRIB_DIR/v${DEFAULT_PG_VERSION}/bin/psql"; - ./scripts/pytest \ - --junitxml=$TEST_OUTPUT/junit.xml \ - --tb=short \ - --verbose \ - -m "remote_cluster" \ - -rA "test_runner/pg_clients" - - - name: Delete Neon Project - if: ${{ always() }} - uses: ./.github/actions/neon-project-delete - with: - project_id: ${{ steps.create-neon-project.outputs.project_id }} - api_key: ${{ secrets.NEON_STAGING_API_KEY }} - - # We use GitHub's action upload-artifact because `ubuntu-latest` doesn't have configured AWS CLI. - # It will be fixed after switching to gen2 runner - - name: Upload python test logs - if: always() - uses: actions/upload-artifact@v4 - with: - retention-days: 7 - name: python-test-pg_clients-${{ runner.os }}-${{ runner.arch }}-stage-logs - path: ${{ env.TEST_OUTPUT }} - - - name: Post to a Slack channel - if: ${{ github.event.schedule && failure() }} - uses: slackapi/slack-github-action@v1 - with: - channel-id: "C033QLM5P7D" # dev-staging-stream - slack-message: "Testing Postgres clients: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" - env: - SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index 30314376ef..4826b7914e 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -1,5 +1,13 @@ FROM debian:bullseye-slim +# Use ARG as a build-time environment variable here to allow. +# It's not supposed to be set outside. +# Alternatively it can be obtained using the following command +# ``` +# . 
/etc/os-release && echo "${VERSION_CODENAME}" +# ``` +ARG DEBIAN_VERSION_CODENAME=bullseye + # Add nonroot user RUN useradd -ms /bin/bash nonroot -b /home SHELL ["/bin/bash", "-c"] @@ -66,12 +74,24 @@ RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/ # LLVM ENV LLVM_VERSION=18 RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ - && echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \ + && echo "deb http://apt.llvm.org/${DEBIAN_VERSION_CODENAME}/ llvm-toolchain-${DEBIAN_VERSION_CODENAME}-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \ && apt update \ && apt install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \ && bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* +# Install docker +RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg \ + && echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian ${DEBIAN_VERSION_CODENAME} stable" > /etc/apt/sources.list.d/docker.list \ + && apt update \ + && apt install -y docker-ce docker-ce-cli \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + +# Configure sudo & docker +RUN usermod -aG sudo nonroot && \ + echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers && \ + usermod -aG docker nonroot + # AWS CLI RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" \ && unzip -q awscliv2.zip \ diff --git a/test_runner/pg_clients/java/jdbc/Dockerfile b/test_runner/pg_clients/java/jdbc/Dockerfile index 7e074e07b8..7c2b1b40e0 100644 --- a/test_runner/pg_clients/java/jdbc/Dockerfile +++ b/test_runner/pg_clients/java/jdbc/Dockerfile @@ -1,4 +1,4 @@ -FROM openjdk:21 +FROM openjdk:22 WORKDIR /source COPY . . 
diff --git a/test_runner/pg_clients/python/pg8000/requirements.txt b/test_runner/pg_clients/python/pg8000/requirements.txt index e086a937e6..099a4ade2c 100644 --- a/test_runner/pg_clients/python/pg8000/requirements.txt +++ b/test_runner/pg_clients/python/pg8000/requirements.txt @@ -1,2 +1,2 @@ -pg8000==1.30.5 +pg8000==1.31.2 scramp>=1.4.3 diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock index a4a2426b97..32c1c52eea 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock @@ -4,9 +4,9 @@ version = 3 [[package]] name = "addr2line" -version = "0.21.0" +version = "0.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb" +checksum = "6e4503c46a5c0c7844e948c9a4d6acd9f50cccb4de1c48eb9e291ea17470c678" dependencies = [ "gimli", ] @@ -19,9 +19,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "async-trait" -version = "0.1.77" +version = "0.1.80" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c980ee35e870bd1a4d2c8294d4c04d0499e67bca1e4b5cefcc693c2fa00caea9" +checksum = "c6fa2087f2753a7da8cc1c0dbfcf89579dd57458e36769de5ac750b4671737ca" dependencies = [ "proc-macro2", "quote", @@ -30,15 +30,15 @@ dependencies = [ [[package]] name = "autocfg" -version = "1.1.0" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" [[package]] name = "backtrace" -version = "0.3.69" +version = "0.3.73" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837" +checksum = "5cc23269a4f8976d0a4d2e7109211a419fe30e8d88d677cd60b6bc79c5732e0a" dependencies = [ "addr2line", "cc", @@ -63,9 +63,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.4.2" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed570934406eb16438a4e976b1b4500774099c13b8cb96eec99f620f05090ddf" +checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" [[package]] name = "block-buffer" @@ -78,9 +78,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.15.3" +version = "3.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ea184aa71bb362a1157c896979544cc23974e08fd265f29ea96b59f0b4a555b" +checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" [[package]] name = "byteorder" @@ -90,15 +90,15 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.5.0" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" +checksum = "514de17de45fdb8dc022b1a7975556c53c86f9f0aa5f534b98977b171857c2c9" [[package]] name = "cc" -version = "1.0.89" +version = "1.0.101" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0ba8f7aaa012f30d5b2861462f6708eccd49c3c39863fe083a308035f63d723" +checksum = "ac367972e516d45567c7eafc73d24e1c193dcf200a8d94e9db7b3d38b349572d" [[package]] name = "cfg-if" @@ -154,9 
+154,9 @@ dependencies = [ [[package]] name = "errno" -version = "0.3.8" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" +checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" dependencies = [ "libc", "windows-sys 0.52.0", @@ -170,15 +170,9 @@ checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7" [[package]] name = "fastrand" -version = "2.0.1" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" - -[[package]] -name = "finl_unicode" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fcfdc7a0362c9f4444381a9e697c79d435fe65b52a37466fc2c1184cee9edc6" +checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" [[package]] name = "foreign-types" @@ -296,9 +290,9 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.12" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5" +checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" dependencies = [ "cfg-if", "libc", @@ -307,9 +301,9 @@ dependencies = [ [[package]] name = "gimli" -version = "0.28.1" +version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" +checksum = "40ecd4077b5ae9fd2e9e169b102c6c330d0605168eb0e8bf79952b256dbefffd" [[package]] name = "hmac" @@ -329,29 +323,23 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "lazy_static" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" - [[package]] name = "libc" -version = "0.2.153" +version = "0.2.155" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" +checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" [[package]] name = "linux-raw-sys" -version = "0.4.13" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" +checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" [[package]] name = "lock_api" -version = "0.4.11" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" +checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" dependencies = [ "autocfg", "scopeguard", @@ -375,15 +363,15 @@ dependencies = [ [[package]] name = "memchr" -version = "2.7.1" +version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" [[package]] name = "miniz_oxide" -version = "0.7.2" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7" +checksum = "b8a240ddb74feaf34a79a7add65a741f3167852fba007066dcac1ca548d89c08" dependencies = [ "adler", ] @@ -401,11 
+389,10 @@ dependencies = [ [[package]] name = "native-tls" -version = "0.2.11" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e" +checksum = "a8614eb2c83d59d1c8cc974dd3f920198647674a0a035e1af1fa58707e317466" dependencies = [ - "lazy_static", "libc", "log", "openssl", @@ -419,9 +406,9 @@ dependencies = [ [[package]] name = "object" -version = "0.32.2" +version = "0.36.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" +checksum = "576dfe1fc8f9df304abb159d767a29d0476f7750fbf8aa7ad07816004a207434" dependencies = [ "memchr", ] @@ -438,7 +425,7 @@ version = "0.10.64" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95a0481286a310808298130d22dd1fef0fa571e05a8f44ec801801e84b216b1f" dependencies = [ - "bitflags 2.4.2", + "bitflags 2.6.0", "cfg-if", "foreign-types", "libc", @@ -466,9 +453,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.101" +version = "0.9.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dda2b0f344e78efc2facf7d195d098df0dd72151b26ab98da807afc26c198dff" +checksum = "c597637d56fbc83893a35eb0dd04b2b8e7a50c91e64e9493e398b5df4fb45fa2" dependencies = [ "cc", "libc", @@ -478,9 +465,9 @@ dependencies = [ [[package]] name = "parking_lot" -version = "0.12.1" +version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" +checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" dependencies = [ "lock_api", "parking_lot_core", @@ -488,15 +475,15 @@ dependencies = [ [[package]] name = "parking_lot_core" -version = "0.9.9" +version = "0.9.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" dependencies = [ "cfg-if", "libc", - "redox_syscall", + "redox_syscall 0.5.2", "smallvec", - "windows-targets 0.48.5", + "windows-targets 0.52.5", ] [[package]] @@ -525,9 +512,9 @@ dependencies = [ [[package]] name = "pin-project-lite" -version = "0.2.13" +version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58" +checksum = "bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02" [[package]] name = "pin-utils" @@ -591,18 +578,18 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "proc-macro2" -version = "1.0.78" +version = "1.0.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" +checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.35" +version = "1.0.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" +checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" dependencies = [ "proc-macro2", ] @@ -646,6 +633,15 @@ dependencies = [ "bitflags 1.3.2", ] +[[package]] +name = "redox_syscall" +version = 
"0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c82cf8cff14456045f55ec4241383baeff27af886adb72ffb2162f99911de0fd" +dependencies = [ + "bitflags 2.6.0", +] + [[package]] name = "rust-neon-example" version = "0.1.0" @@ -658,17 +654,17 @@ dependencies = [ [[package]] name = "rustc-demangle" -version = "0.1.23" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" +checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" [[package]] name = "rustix" -version = "0.38.31" +version = "0.38.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949" +checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" dependencies = [ - "bitflags 2.4.2", + "bitflags 2.6.0", "errno", "libc", "linux-raw-sys", @@ -692,11 +688,11 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "security-framework" -version = "2.9.2" +version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05b64fb303737d99b81884b2c63433e9ae28abebe5eb5045dcdd175dc2ecf4de" +checksum = "c627723fd09706bacdb5cf41499e95098555af3c3c29d014dc3c458ef6be11c0" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.6.0", "core-foundation", "core-foundation-sys", "libc", @@ -705,9 +701,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.9.1" +version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e932934257d3b408ed8f30db49d85ea163bfe74961f017f405b025af298f0c7a" +checksum = "317936bbbd05227752583946b9e66d7ce3b489f84e11a94a510b4437fef407d7" dependencies = [ "core-foundation-sys", "libc", @@ -741,15 +737,15 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.13.1" +version = "1.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" +checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" [[package]] name = "socket2" -version = "0.5.6" +version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05ffd9c0a93b7543e062e759284fcf5f5e3b098501104bfbdde4d404db792871" +checksum = "ce305eb0b4296696835b71df73eb912e0f1ffd2556a501fcede6e0c50349191c" dependencies = [ "libc", "windows-sys 0.52.0", @@ -757,26 +753,26 @@ dependencies = [ [[package]] name = "stringprep" -version = "0.1.4" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb41d74e231a107a1b4ee36bd1214b11285b77768d2e3824aedafa988fd36ee6" +checksum = "7b4df3d392d81bd458a8a621b8bffbd2302a12ffe288a9d931670948749463b1" dependencies = [ - "finl_unicode", "unicode-bidi", "unicode-normalization", + "unicode-properties", ] [[package]] name = "subtle" -version = "2.5.0" +version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "syn" -version = "2.0.52" +version = "2.0.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07" +checksum = "901fa70d88b9d6c98022e23b4136f9f3e54e4662c3bc1bd1d84a42a9a0f0c1e9" 
dependencies = [ "proc-macro2", "quote", @@ -797,9 +793,9 @@ dependencies = [ [[package]] name = "tinyvec" -version = "1.6.0" +version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" +checksum = "c55115c6fbe2d2bef26eb09ad74bde02d8255476fc0c7b515ef09fbb35742d82" dependencies = [ "tinyvec_macros", ] @@ -812,9 +808,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.36.0" +version = "1.38.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931" +checksum = "ba4f4a02a7a80d6f274636f0aa95c7e383b912d41fe721a31f29e29698585a4a" dependencies = [ "backtrace", "bytes", @@ -828,9 +824,9 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "2.2.0" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" +checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a" dependencies = [ "proc-macro2", "quote", @@ -875,35 +871,15 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.10" +version = "0.7.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5419f34732d9eb6ee4c3578b7989078579b7f039cbbb9ca2c4da015749371e15" +checksum = "9cf6b47b3771c49ac75ad09a6162f53ad4b8088b76ac60e8ec1455b31a189fe1" dependencies = [ "bytes", "futures-core", "futures-sink", "pin-project-lite", "tokio", - "tracing", -] - -[[package]] -name = "tracing" -version = "0.1.40" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" -dependencies = [ - "pin-project-lite", - "tracing-core", -] - -[[package]] -name = "tracing-core" -version = "0.1.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" -dependencies = [ - "once_cell", ] [[package]] @@ -933,6 +909,12 @@ dependencies = [ "tinyvec", ] +[[package]] +name = "unicode-properties" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4259d9d4425d9f0661581b804cb85fe66a4c631cadd8f490d1c13a35d5d9291" + [[package]] name = "vcpkg" version = "0.2.15" @@ -1023,11 +1005,11 @@ dependencies = [ [[package]] name = "whoami" -version = "1.5.0" +version = "1.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fec781d48b41f8163426ed18e8fc2864c12937df9ce54c88ede7bd47270893e" +checksum = "a44ab49fad634e88f55bf8f9bb3abd2f27d7204172a112c7c9987e01c1c94ea9" dependencies = [ - "redox_syscall", + "redox_syscall 0.4.1", "wasite", "web-sys", ] @@ -1047,7 +1029,7 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets 0.52.4", + "windows-targets 0.52.5", ] [[package]] @@ -1067,17 +1049,18 @@ dependencies = [ [[package]] name = "windows-targets" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" +checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb" dependencies = [ - "windows_aarch64_gnullvm 0.52.4", - 
"windows_aarch64_msvc 0.52.4", - "windows_i686_gnu 0.52.4", - "windows_i686_msvc 0.52.4", - "windows_x86_64_gnu 0.52.4", - "windows_x86_64_gnullvm 0.52.4", - "windows_x86_64_msvc 0.52.4", + "windows_aarch64_gnullvm 0.52.5", + "windows_aarch64_msvc 0.52.5", + "windows_i686_gnu 0.52.5", + "windows_i686_gnullvm", + "windows_i686_msvc 0.52.5", + "windows_x86_64_gnu 0.52.5", + "windows_x86_64_gnullvm 0.52.5", + "windows_x86_64_msvc 0.52.5", ] [[package]] @@ -1088,9 +1071,9 @@ checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" [[package]] name = "windows_aarch64_gnullvm" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" +checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263" [[package]] name = "windows_aarch64_msvc" @@ -1100,9 +1083,9 @@ checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" [[package]] name = "windows_aarch64_msvc" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" +checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6" [[package]] name = "windows_i686_gnu" @@ -1112,9 +1095,15 @@ checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" [[package]] name = "windows_i686_gnu" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" +checksum = "88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9" [[package]] name = "windows_i686_msvc" @@ -1124,9 +1113,9 @@ checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" [[package]] name = "windows_i686_msvc" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" +checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf" [[package]] name = "windows_x86_64_gnu" @@ -1136,9 +1125,9 @@ checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" [[package]] name = "windows_x86_64_gnu" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" +checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9" [[package]] name = "windows_x86_64_gnullvm" @@ -1148,9 +1137,9 @@ checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" [[package]] name = "windows_x86_64_gnullvm" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" +checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596" [[package]] name = "windows_x86_64_msvc" @@ -1160,6 +1149,6 @@ checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" [[package]] name = "windows_x86_64_msvc" -version = "0.52.4" +version = "0.52.5" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" +checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0" diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml b/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml index 0f420e5b06..27d01810bd 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml @@ -7,9 +7,9 @@ publish = false # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -native-tls = "0.2.11" +native-tls = "0.2.12" postgres-native-tls = "0.5.0" -tokio = { version = "1.36", features=["rt", "macros"] } +tokio = { version = "1.38", features=["rt", "macros"] } tokio-postgres = "0.7.10" diff --git a/test_runner/pg_clients/rust/tokio-postgres/Dockerfile b/test_runner/pg_clients/rust/tokio-postgres/Dockerfile index 8611e66cbb..3e214de785 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Dockerfile +++ b/test_runner/pg_clients/rust/tokio-postgres/Dockerfile @@ -1,4 +1,4 @@ -FROM rust:1.76 +FROM rust:1.79 WORKDIR /source COPY . . diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile b/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile index 0402838820..6006e61ee2 100644 --- a/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile @@ -1,11 +1,11 @@ -FROM swift:5.9 AS build +FROM swift:5.10 AS build RUN apt-get -q update && apt-get -q install -y libssl-dev WORKDIR /source COPY . . RUN swift build --configuration release -FROM swift:5.9 +FROM swift:5.10 WORKDIR /app COPY --from=build /source/.build/release . 
CMD ["/app/PostgresClientKitExample"] diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Package.resolved b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.resolved index 767443a9dd..6e8613095f 100644 --- a/test_runner/pg_clients/swift/PostgresClientKitExample/Package.resolved +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.resolved @@ -1,4 +1,5 @@ { + "originHash" : "8eff8c577ba246ce7824d3434839acefced2b1a1d2b1ad700554502538a50558", "pins" : [ { "identity" : "bluesocket", @@ -18,15 +19,6 @@ "version" : "2.0.2" } }, - { - "identity" : "openssl", - "kind" : "remoteSourceControl", - "location" : "https://github.com/Kitura/OpenSSL.git", - "state" : { - "revision" : "5dc8cb4f971135c17343e3c6df4f28904a0600e2", - "version" : "2.3.1" - } - }, { "identity" : "postgresclientkit", "kind" : "remoteSourceControl", @@ -37,5 +29,5 @@ } } ], - "version" : 2 + "version" : 3 } diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift index 48320dd023..a66d09c542 100644 --- a/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift @@ -1,4 +1,4 @@ -// swift-tools-version:5.8 +// swift-tools-version:5.10 import PackageDescription let package = Package( diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile b/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile index 9130e0973f..d6815fbb5f 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile @@ -1,10 +1,10 @@ -FROM swift:5.9 AS build +FROM swift:5.10 AS build WORKDIR /source COPY . . RUN swift build --configuration release -FROM swift:5.9 +FROM swift:5.10 WORKDIR /app COPY --from=build /source/.build/release . 
CMD ["/app/PostgresNIOExample"] diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved b/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved index 023e03a7b1..0e5dfdafcb 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved @@ -1,12 +1,22 @@ { + "originHash" : "11b5dcece349a3e56a7a9a7d0af6d0f5b83dff321b43124a01b158ed7aac5302", "pins" : [ { "identity" : "postgres-nio", "kind" : "remoteSourceControl", "location" : "https://github.com/vapor/postgres-nio.git", "state" : { - "revision" : "69ccfdf4c80144d845e3b439961b7ec6cd7ae33f", - "version" : "1.20.2" + "revision" : "5c268768890b062803a49f1358becc478f954265", + "version" : "1.21.5" + } + }, + { + "identity" : "swift-async-algorithms", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-async-algorithms.git", + "state" : { + "revision" : "da4e36f86544cdf733a40d59b3a2267e3a7bbf36", + "version" : "1.0.0" } }, { @@ -81,6 +91,15 @@ "version" : "1.20.1" } }, + { + "identity" : "swift-service-lifecycle", + "kind" : "remoteSourceControl", + "location" : "https://github.com/swift-server/swift-service-lifecycle.git", + "state" : { + "revision" : "d58e6bf2b1ae2884cf204a8b5bcaaa7aae3c1ff0", + "version" : "2.6.0" + } + }, { "identity" : "swift-system", "kind" : "remoteSourceControl", @@ -91,5 +110,5 @@ } } ], - "version" : 2 + "version" : 3 } diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift b/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift index 637eb4bc9d..20bb10f76c 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift @@ -1,10 +1,10 @@ -// swift-tools-version:5.9 +// swift-tools-version:5.10 import PackageDescription let package = Package( name: "PostgresNIOExample", dependencies: [ - .package(url: "https://github.com/vapor/postgres-nio.git", from: "1.20.2") + .package(url: "https://github.com/vapor/postgres-nio.git", from: "1.21.5") ], targets: [ .executableTarget( diff --git a/test_runner/pg_clients/typescript/postgresql-client/Dockerfile b/test_runner/pg_clients/typescript/postgresql-client/Dockerfile index 004b383749..45e8753f7e 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/Dockerfile +++ b/test_runner/pg_clients/typescript/postgresql-client/Dockerfile @@ -1,4 +1,4 @@ -FROM node:21 +FROM node:22 WORKDIR /source COPY . . 
diff --git a/test_runner/pg_clients/typescript/postgresql-client/package-lock.json b/test_runner/pg_clients/typescript/postgresql-client/package-lock.json index b4f8587eac..19311808b6 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/package-lock.json +++ b/test_runner/pg_clients/typescript/postgresql-client/package-lock.json @@ -5,7 +5,7 @@ "packages": { "": { "dependencies": { - "postgresql-client": "2.10.5" + "postgresql-client": "2.11.0" } }, "node_modules/doublylinked": { @@ -42,9 +42,10 @@ } }, "node_modules/postgresql-client": { - "version": "2.10.5", - "resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.10.5.tgz", - "integrity": "sha512-R3EC16pUdbgrzk1J2MQLj7jY2TepWurJHoK90nOeLZj1XTpL/+wL1VCneTmclRVKDuKVjFHr+FASV47KrLpAbw==", + "version": "2.11.0", + "resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.11.0.tgz", + "integrity": "sha512-QSPHcWVaiBG+JyASaDojOXvhRmsc2n8j2COdIjUDENFAtFls16Zy240asY2ENzZRQJUMAA8vpR8w4SAdI8jdbw==", + "license": "MIT", "dependencies": { "doublylinked": "^2.5.4", "lightning-pool": "^4.2.2", @@ -55,8 +56,7 @@ "putil-varhelpers": "^1.6.5" }, "engines": { - "node": ">=16.0", - "npm": ">=7.0.0" + "node": ">=16.0" } }, "node_modules/power-tasks": { diff --git a/test_runner/pg_clients/typescript/postgresql-client/package.json b/test_runner/pg_clients/typescript/postgresql-client/package.json index 07ec100d0d..d2bba23d29 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/package.json +++ b/test_runner/pg_clients/typescript/postgresql-client/package.json @@ -1,6 +1,6 @@ { "type": "module", "dependencies": { - "postgresql-client": "2.10.5" + "postgresql-client": "2.11.0" } } diff --git a/test_runner/pg_clients/typescript/serverless-driver/Dockerfile b/test_runner/pg_clients/typescript/serverless-driver/Dockerfile index 004b383749..45e8753f7e 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/Dockerfile +++ b/test_runner/pg_clients/typescript/serverless-driver/Dockerfile @@ -1,4 +1,4 @@ -FROM node:21 +FROM node:22 WORKDIR /source COPY . . 
diff --git a/test_runner/pg_clients/typescript/serverless-driver/package-lock.json b/test_runner/pg_clients/typescript/serverless-driver/package-lock.json index f3b456f1ed..7f3f7f2e84 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/package-lock.json +++ b/test_runner/pg_clients/typescript/serverless-driver/package-lock.json @@ -5,96 +5,138 @@ "packages": { "": { "dependencies": { - "@neondatabase/serverless": "0.9.0", + "@neondatabase/serverless": "0.9.4", "ws": "8.17.1" } }, "node_modules/@neondatabase/serverless": { - "version": "0.9.0", - "resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.9.0.tgz", - "integrity": "sha512-mmJnUAzlzvxNSZuuhI6kgJjH+JgFdBMYUWxihtq/nj0Tjt+Y5UU3W+SvRFoucnd5NObYkuLYQzk+zV5DGFKGJg==", + "version": "0.9.4", + "resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.9.4.tgz", + "integrity": "sha512-D0AXgJh6xkf+XTlsO7iwE2Q1w8981E1cLCPAALMU2YKtkF/1SF6BiAzYARZFYo175ON+b1RNIy9TdSFHm5nteg==", + "license": "MIT", "dependencies": { - "@types/pg": "8.6.6" + "@types/pg": "8.11.6" } }, "node_modules/@types/node": { - "version": "18.16.3", - "resolved": "https://registry.npmjs.org/@types/node/-/node-18.16.3.tgz", - "integrity": "sha512-OPs5WnnT1xkCBiuQrZA4+YAV4HEJejmHneyraIaxsbev5yCEr6KMwINNFP9wQeFIw8FWcoTqF3vQsa5CDaI+8Q==" + "version": "20.14.9", + "resolved": "https://registry.npmjs.org/@types/node/-/node-20.14.9.tgz", + "integrity": "sha512-06OCtnTXtWOZBJlRApleWndH4JsRVs1pDCc8dLSQp+7PpUpX3ePdHyeNSFTeSe7FtKyQkrlPvHwJOW3SLd8Oyg==", + "license": "MIT", + "dependencies": { + "undici-types": "~5.26.4" + } }, "node_modules/@types/pg": { - "version": "8.6.6", - "resolved": "https://registry.npmjs.org/@types/pg/-/pg-8.6.6.tgz", - "integrity": "sha512-O2xNmXebtwVekJDD+02udOncjVcMZQuTEQEMpKJ0ZRf5E7/9JJX3izhKUcUifBkyKpljyUM6BTgy2trmviKlpw==", + "version": "8.11.6", + "resolved": "https://registry.npmjs.org/@types/pg/-/pg-8.11.6.tgz", + "integrity": "sha512-/2WmmBXHLsfRqzfHW7BNZ8SbYzE8OSk7i3WjFYvfgRHj7S1xj+16Je5fUKv3lVdVzk/zn9TXOqf+avFCFIE0yQ==", + "license": "MIT", "dependencies": { "@types/node": "*", "pg-protocol": "*", - "pg-types": "^2.2.0" + "pg-types": "^4.0.1" } }, + "node_modules/obuf": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/obuf/-/obuf-1.1.2.tgz", + "integrity": "sha512-PX1wu0AmAdPqOL1mWhqmlOd8kOIZQwGZw6rh7uby9fTc5lhaOWFLX3I6R1hrF9k3zUY40e6igsLGkDXK92LJNg==", + "license": "MIT" + }, "node_modules/pg-int8": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/pg-int8/-/pg-int8-1.0.1.tgz", "integrity": "sha512-WCtabS6t3c8SkpDBUlb1kjOs7l66xsGdKpIPZsg4wR+B3+u9UAum2odSsF9tnvxg80h4ZxLWMy4pRjOsFIqQpw==", + "license": "ISC", "engines": { "node": ">=4.0.0" } }, - "node_modules/pg-protocol": { - "version": "1.6.0", - "resolved": "https://registry.npmjs.org/pg-protocol/-/pg-protocol-1.6.0.tgz", - "integrity": "sha512-M+PDm637OY5WM307051+bsDia5Xej6d9IR4GwJse1qA1DIhiKlksvrneZOYQq42OM+spubpcNYEo2FcKQrDk+Q==" - }, - "node_modules/pg-types": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/pg-types/-/pg-types-2.2.0.tgz", - "integrity": "sha512-qTAAlrEsl8s4OiEQY69wDvcMIdQN6wdz5ojQiOy6YRMuynxenON0O5oCpJI6lshc6scgAY8qvJ2On/p+CXY0GA==", - "dependencies": { - "pg-int8": "1.0.1", - "postgres-array": "~2.0.0", - "postgres-bytea": "~1.0.0", - "postgres-date": "~1.0.4", - "postgres-interval": "^1.1.0" - }, + "node_modules/pg-numeric": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/pg-numeric/-/pg-numeric-1.0.2.tgz", + "integrity": 
"sha512-BM/Thnrw5jm2kKLE5uJkXqqExRUY/toLHda65XgFTBTFYZyopbKjBe29Ii3RbkvlsMoFwD+tHeGaCjjv0gHlyw==", + "license": "ISC", "engines": { "node": ">=4" } }, + "node_modules/pg-protocol": { + "version": "1.6.1", + "resolved": "https://registry.npmjs.org/pg-protocol/-/pg-protocol-1.6.1.tgz", + "integrity": "sha512-jPIlvgoD63hrEuihvIg+tJhoGjUsLPn6poJY9N5CnlPd91c2T18T/9zBtLxZSb1EhYxBRoZJtzScCaWlYLtktg==", + "license": "MIT" + }, + "node_modules/pg-types": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/pg-types/-/pg-types-4.0.2.tgz", + "integrity": "sha512-cRL3JpS3lKMGsKaWndugWQoLOCoP+Cic8oseVcbr0qhPzYD5DWXK+RZ9LY9wxRf7RQia4SCwQlXk0q6FCPrVng==", + "license": "MIT", + "dependencies": { + "pg-int8": "1.0.1", + "pg-numeric": "1.0.2", + "postgres-array": "~3.0.1", + "postgres-bytea": "~3.0.0", + "postgres-date": "~2.1.0", + "postgres-interval": "^3.0.0", + "postgres-range": "^1.1.1" + }, + "engines": { + "node": ">=10" + } + }, "node_modules/postgres-array": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/postgres-array/-/postgres-array-2.0.0.tgz", - "integrity": "sha512-VpZrUqU5A69eQyW2c5CA1jtLecCsN2U/bD6VilrFDWq5+5UIEVO7nazS3TEcHf1zuPYO/sqGvUvW62g86RXZuA==", + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/postgres-array/-/postgres-array-3.0.2.tgz", + "integrity": "sha512-6faShkdFugNQCLwucjPcY5ARoW1SlbnrZjmGl0IrrqewpvxvhSLHimCVzqeuULCbG0fQv7Dtk1yDbG3xv7Veog==", + "license": "MIT", "engines": { - "node": ">=4" + "node": ">=12" } }, "node_modules/postgres-bytea": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/postgres-bytea/-/postgres-bytea-1.0.0.tgz", - "integrity": "sha512-xy3pmLuQqRBZBXDULy7KbaitYqLcmxigw14Q5sj8QBVLqEwXfeybIKVWiqAXTlcvdvb0+xkOtDbfQMOf4lST1w==", + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/postgres-bytea/-/postgres-bytea-3.0.0.tgz", + "integrity": "sha512-CNd4jim9RFPkObHSjVHlVrxoVQXz7quwNFpz7RY1okNNme49+sVyiTvTRobiLV548Hx/hb1BG+iE7h9493WzFw==", + "license": "MIT", + "dependencies": { + "obuf": "~1.1.2" + }, "engines": { - "node": ">=0.10.0" + "node": ">= 6" } }, "node_modules/postgres-date": { - "version": "1.0.7", - "resolved": "https://registry.npmjs.org/postgres-date/-/postgres-date-1.0.7.tgz", - "integrity": "sha512-suDmjLVQg78nMK2UZ454hAG+OAW+HQPZ6n++TNDUX+L0+uUlLywnoxJKDou51Zm+zTCjrCl0Nq6J9C5hP9vK/Q==", + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/postgres-date/-/postgres-date-2.1.0.tgz", + "integrity": "sha512-K7Juri8gtgXVcDfZttFKVmhglp7epKb1K4pgrkLxehjqkrgPhfG6OO8LHLkfaqkbpjNRnra018XwAr1yQFWGcA==", + "license": "MIT", "engines": { - "node": ">=0.10.0" + "node": ">=12" } }, "node_modules/postgres-interval": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/postgres-interval/-/postgres-interval-1.2.0.tgz", - "integrity": "sha512-9ZhXKM/rw350N1ovuWHbGxnGh/SNJ4cnxHiM0rxE4VN41wsg8P8zWn9hv/buK00RP4WvlOyr/RBDiptyxVbkZQ==", - "dependencies": { - "xtend": "^4.0.0" - }, + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/postgres-interval/-/postgres-interval-3.0.0.tgz", + "integrity": "sha512-BSNDnbyZCXSxgA+1f5UU2GmwhoI0aU5yMxRGO8CdFEcY2BQF9xm/7MqKnYoM1nJDk8nONNWDk9WeSmePFhQdlw==", + "license": "MIT", "engines": { - "node": ">=0.10.0" + "node": ">=12" } }, + "node_modules/postgres-range": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/postgres-range/-/postgres-range-1.1.4.tgz", + "integrity": "sha512-i/hbxIE9803Alj/6ytL7UHQxRvZkI9O4Sy+J3HGc4F4oo/2eQAjTSNJ0bfxyse3bH0nuVesCk+3IRLaMtG3H6w==", + "license": "MIT" + }, + 
"node_modules/undici-types": { + "version": "5.26.5", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", + "integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==", + "license": "MIT" + }, "node_modules/ws": { "version": "8.17.1", "resolved": "https://registry.npmjs.org/ws/-/ws-8.17.1.tgz", @@ -114,14 +156,6 @@ "optional": true } } - }, - "node_modules/xtend": { - "version": "4.0.2", - "resolved": "https://registry.npmjs.org/xtend/-/xtend-4.0.2.tgz", - "integrity": "sha512-LKYU1iAXJXUgAXn9URjiu+MWhyUXHsvfp7mcuYm9dSUKK0/CjtrUwFAxD82/mCWbtLsGjFIad0wIsod4zrTAEQ==", - "engines": { - "node": ">=0.4" - } } } } diff --git a/test_runner/pg_clients/typescript/serverless-driver/package.json b/test_runner/pg_clients/typescript/serverless-driver/package.json index 3ae7a8a6cf..f791d184c5 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/package.json +++ b/test_runner/pg_clients/typescript/serverless-driver/package.json @@ -1,7 +1,7 @@ { "type": "module", "dependencies": { - "@neondatabase/serverless": "0.9.0", + "@neondatabase/serverless": "0.9.4", "ws": "8.17.1" } } From c9e6dd45d343ffcb502023857a814e7500a6d3f3 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 4 Jul 2024 15:05:41 +0100 Subject: [PATCH 1119/1571] pageserver: downgrade stale generation messages to INFO (#8256) ## Problem When generations were new, these messages were an important way of noticing if something unexpected was going on. We found some real issues when investigating tests that unexpectedly tripped them. At time has gone on, this code is now pretty battle-tested, and as we do more live migrations etc, it's fairly normal to see the occasional message from a node with a stale generation. At this point the cognitive load on developers to selectively allow-list these logs outweighs the benefit of having them at warn severity. Closes: https://github.com/neondatabase/neon/issues/8080 ## Summary of changes - Downgrade "Dropped remote consistent LSN updates" and "Dropping stale deletions" messages to INFO - Remove all the allow-list entries for these logs. 
--- pageserver/src/deletion_queue/validator.rs | 4 ++-- test_runner/fixtures/pageserver/many_tenants.py | 4 ---- .../interactive/test_many_small_tenants.py | 4 ---- .../pagebench/test_large_slru_basebackup.py | 4 ---- ...server_max_throughput_getpage_at_latest_lsn.py | 4 ---- .../performance/test_storage_controller_scale.py | 8 -------- test_runner/regress/test_attach_tenant_config.py | 6 ------ test_runner/regress/test_change_pageserver.py | 5 ----- test_runner/regress/test_layers_from_future.py | 3 --- .../regress/test_pageserver_generations.py | 13 ------------- test_runner/regress/test_pageserver_secondary.py | 3 --- test_runner/regress/test_remote_storage.py | 7 ------- test_runner/regress/test_sharding.py | 4 ---- test_runner/regress/test_storage_controller.py | 15 --------------- test_runner/regress/test_tenant_conf.py | 4 ---- test_runner/regress/test_tenant_detach.py | 12 ------------ test_runner/regress/test_tenant_relocation.py | 2 -- test_runner/regress/test_tenants.py | 4 ---- 18 files changed, 2 insertions(+), 104 deletions(-) diff --git a/pageserver/src/deletion_queue/validator.rs b/pageserver/src/deletion_queue/validator.rs index bf06c78e67..d215fd2b7d 100644 --- a/pageserver/src/deletion_queue/validator.rs +++ b/pageserver/src/deletion_queue/validator.rs @@ -190,7 +190,7 @@ where } } else { // If we failed validation, then do not apply any of the projected updates - warn!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation); + info!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation); metrics::DELETION_QUEUE.dropped_lsn_updates.inc(); } } @@ -225,7 +225,7 @@ where && (tenant.generation == *validated_generation); if !this_list_valid { - warn!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation); + info!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation); metrics::DELETION_QUEUE.keys_dropped.inc_by(tenant.len() as u64); mutated = true; } else { diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py index 8730d8ef75..c437258c6f 100644 --- a/test_runner/fixtures/pageserver/many_tenants.py +++ b/test_runner/fixtures/pageserver/many_tenants.py @@ -42,10 +42,6 @@ def single_timeline( log.info("detach template tenant form pageserver") env.pageserver.tenant_detach(template_tenant) - env.pageserver.allowed_errors.append( - # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely - ".*Dropped remote consistent LSN updates.*", - ) log.info(f"duplicating template tenant {ncopies} times in S3") tenants = fixtures.pageserver.remote_storage.duplicate_tenant(env, template_tenant, ncopies) diff --git a/test_runner/performance/pageserver/interactive/test_many_small_tenants.py b/test_runner/performance/pageserver/interactive/test_many_small_tenants.py index 0ff9c8fdaa..33848b06d3 100644 --- a/test_runner/performance/pageserver/interactive/test_many_small_tenants.py +++ b/test_runner/performance/pageserver/interactive/test_many_small_tenants.py @@ -55,10 +55,6 @@ def setup_env( } template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True) env.pageserver.tenant_detach(template_tenant) - env.pageserver.allowed_errors.append( - # tenant detach causes this because the underlying attach-hook removes the tenant from 
storage controller entirely - ".*Dropped remote consistent LSN updates.*", - ) env.pageserver.tenant_attach(template_tenant, config) ep = env.endpoints.create_start("main", tenant_id=template_tenant) ep.safe_psql("create table foo(b text)") diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py index b66db4d0ab..b41ae60197 100644 --- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py +++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py @@ -86,10 +86,6 @@ def setup_tenant_template(env: NeonEnv, n_txns: int): template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True) env.pageserver.tenant_detach(template_tenant) - env.pageserver.allowed_errors.append( - # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely - ".*Dropped remote consistent LSN updates.*", - ) env.pageserver.tenant_attach(template_tenant, config) ps_http = env.pageserver.http_client() diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py index a8f48fe675..60861cf939 100644 --- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py +++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py @@ -164,10 +164,6 @@ def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int): } template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True) env.pageserver.tenant_detach(template_tenant) - env.pageserver.allowed_errors.append( - # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely - ".*Dropped remote consistent LSN updates.*", - ) env.pageserver.tenant_attach(template_tenant, config) ps_http = env.pageserver.http_client() with env.endpoints.create_start("main", tenant_id=template_tenant) as ep: diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index d65a66b010..3a6113706f 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -132,14 +132,6 @@ def test_storage_controller_many_tenants( ) for ps in env.pageservers: - # This can happen because when we do a loop over all pageservers and mark them offline/active, - # reconcilers might get cancelled, and the next reconcile can follow a not-so-elegant path of - # bumping generation before other attachments are detached. - # - # We could clean this up by making reconcilers respect the .observed of their predecessor, if - # we spawn with a wait for the predecessor. - ps.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - # Storage controller is allowed to drop pageserver requests when the cancellation token # for a Reconciler fires. 
ps.allowed_errors.append(".*request was dropped before completing.*") diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index e117c2140f..f2ee2b70aa 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -21,8 +21,6 @@ def positive_env(neon_env_builder: NeonEnvBuilder) -> NeonEnv: [ # eviction might be the first one after an attach to access the layers ".*unexpectedly on-demand downloading remote layer .* for task kind Eviction", - # detach can happen before we get to validate the generation number - ".*deletion backend: Dropped remote consistent LSN updates for tenant.*", ] ) assert isinstance(env.pageserver_remote_storage, LocalFsStorage) @@ -58,10 +56,6 @@ def negative_env(neon_env_builder: NeonEnvBuilder) -> Generator[NegativeTests, N env.pageserver.allowed_errors.extend( [ - # This fixture detaches the tenant, and tests using it will tend to re-attach it - # shortly after. There may be un-processed deletion_queue validations from the - # initial attachment - ".*Dropped remote consistent LSN updates.*", # This fixture is for tests that will intentionally generate 400 responses ".*Error processing HTTP request: Bad request", ] diff --git a/test_runner/regress/test_change_pageserver.py b/test_runner/regress/test_change_pageserver.py index 97ab69049d..4d2cdb8e32 100644 --- a/test_runner/regress/test_change_pageserver.py +++ b/test_runner/regress/test_change_pageserver.py @@ -14,11 +14,6 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder): ) env = neon_env_builder.init_start() - for pageserver in env.pageservers: - # This test dual-attaches a tenant, one of the pageservers will therefore - # be running with a stale generation. - pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - env.neon_cli.create_branch("test_change_pageserver") endpoint = env.endpoints.create_start("test_change_pageserver") diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index 54d3b2d515..3b2218dd9b 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -39,9 +39,6 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_configs() env.start() - env.pageserver.allowed_errors.extend( - [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] - ) ps_http = env.pageserver.http_client() diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 696af24e5c..7ce38c5c3c 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -249,10 +249,6 @@ def test_deferred_deletion(neon_env_builder: NeonEnvBuilder): assert timeline["remote_consistent_lsn"] == timeline["remote_consistent_lsn_visible"] assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0 - main_pageserver.allowed_errors.extend( - [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] - ) - # Now advance the generation in the control plane: subsequent validations # from the running pageserver will fail. No more deletions should happen. env.storage_controller.attach_hook_issue(env.initial_tenant, other_pageserver.id) @@ -397,8 +393,6 @@ def test_deletion_queue_recovery( # validated before restart. 
assert get_deletion_queue_executed(ps_http) == before_restart_depth else: - main_pageserver.allowed_errors.extend([".*Dropping stale deletions.*"]) - # If we lost the attachment, we should have dropped our pre-restart deletions. assert get_deletion_queue_dropped(ps_http) == before_restart_depth @@ -553,13 +547,6 @@ def test_multi_attach( tenant_id = env.initial_tenant timeline_id = env.initial_timeline - # We will intentionally create situations where stale deletions happen from non-latest-generation - # nodes when the tenant is multiply-attached - for ps in env.pageservers: - ps.allowed_errors.extend( - [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] - ) - # Initially, the tenant will be attached to the first pageserver (first is default in our test harness) wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[0], tenant_id, "Active")) _detail = http_clients[0].timeline_detail(tenant_id, timeline_id) diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 8431840dc0..4c828b86b0 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -83,9 +83,6 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): for ps in env.pageservers: ps.allowed_errors.extend( [ - # We will make no effort to avoid stale attachments - ".*Dropped remote consistent LSN updates.*", - ".*Dropping stale deletions.*", # page_service_conn_main{peer_addr=[::1]:41176}: query handler for 'pagestream 3b19aec5038c796f64b430b30a555121 d07776761d44050b8aab511df1657d83' failed: Tenant 3b19aec5038c796f64b430b30a555121 not found ".*query handler.*Tenant.*not found.*", # page_service_conn_main{peer_addr=[::1]:45552}: query handler for 'pagestream 414ede7ad50f775a8e7d9ba0e43b9efc a43884be16f44b3626482b6981b2c745' failed: Tenant 414ede7ad50f775a8e7d9ba0e43b9efc is not active diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index b26bd3422f..fac7fe9dee 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -355,13 +355,6 @@ def test_remote_storage_upload_queue_retries( env.pageserver.stop(immediate=True) env.endpoints.stop_all() - # We are about to forcibly drop local dirs. Storage controller will increment generation in re-attach before - # we later increment when actually attaching it again, leading to skipping a generation and potentially getting - # these warnings if there was a durable but un-executed deletion list at time of restart. 
- env.pageserver.allowed_errors.extend( - [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] - ) - dir_to_clear = env.pageserver.tenant_dir() shutil.rmtree(dir_to_clear) os.mkdir(dir_to_clear) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 8267d3f36c..d414f986e6 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -1144,10 +1144,6 @@ def test_sharding_split_failures( ) for ps in env.pageservers: - # When we do node failures and abandon a shard, it will de-facto have old generation and - # thereby be unable to publish remote consistent LSN updates - ps.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - # If we're using a failure that will panic the storage controller, all background # upcalls from the pageserver can fail ps.allowed_errors.append(".*calling control plane generation validation API failed.*") diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index a78f566f0e..d37f7aae3d 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -60,11 +60,6 @@ def test_storage_controller_smoke( neon_env_builder.num_pageservers = 3 env = neon_env_builder.init_configs() - for pageserver in env.pageservers: - # This test detaches tenants during migration, which can race with deletion queue operations, - # during detach we only do an advisory flush, we don't wait for it. - pageserver.allowed_errors.extend([".*Dropped remote consistent LSN updates.*"]) - # Start services by hand so that we can skip a pageserver (this will start + register later) env.broker.try_start() env.storage_controller.start() @@ -484,9 +479,6 @@ def test_storage_controller_compute_hook( # Start running env = neon_env_builder.init_start() - # We will to an unclean migration, which will result in deletion queue warnings - env.pageservers[0].allowed_errors.append(".*Dropped remote consistent LSN updates for tenant.*") - # Initial notification from tenant creation assert len(notifications) == 1 expect: Dict[str, Union[List[Dict[str, int]], str, None, int]] = { @@ -1054,13 +1046,6 @@ def test_storage_controller_heartbeats( online_node_ids = set(range(1, len(env.pageservers) + 1)) - offline_node_ids for node_id in offline_node_ids: - env.get_pageserver(node_id).allowed_errors.append( - # In the case of the failpoint failure, the impacted pageserver - # still believes it has the tenant attached since location - # config calls into it will fail due to being marked offline. - ".*Dropped remote consistent LSN updates.*", - ) - if len(offline_node_ids) > 1: env.get_pageserver(node_id).allowed_errors.append( ".*Scheduling error when marking pageserver.*offline.*", diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index 2cbb036c0d..80fb2b55b8 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -320,10 +320,6 @@ def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder): assert not config_path.exists(), "detach did not remove config file" - # The re-attach's increment of the generation number may invalidate deletion queue - # updates in flight from the previous attachment. 
- env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - env.pageserver.tenant_attach(tenant_id) wait_until( number_of_iterations=5, diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 2056840558..b165588636 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -76,10 +76,6 @@ def test_tenant_reattach(neon_env_builder: NeonEnvBuilder, mode: str): env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS) - # Our re-attach may race with the deletion queue processing LSN updates - # from the original attachment. - env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: with endpoint.cursor() as cur: cur.execute("CREATE TABLE t(key int primary key, value text)") @@ -349,10 +345,6 @@ def test_detach_while_attaching( env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS) - # Our re-attach may race with the deletion queue processing LSN updates - # from the original attachment. - env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - # Create table, and insert some rows. Make it big enough that it doesn't fit in # shared_buffers, otherwise the SELECT after restart will just return answer # from shared_buffers without hitting the page server, which defeats the point @@ -422,10 +414,6 @@ def test_detach_while_activating( env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS) - # Our re-attach may race with the deletion queue processing LSN updates - # from the original attachment. - env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - data_id = 1 data_secret = "very secret secret" insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, endpoint) diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index 9fe732e288..43e9a0d36e 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -203,8 +203,6 @@ def test_tenant_relocation( [ # Needed for detach polling on the original pageserver f".*NotFound: tenant {tenant_id}.*", - # We will dual-attach in this test, so stale generations are expected - ".*Dropped remote consistent LSN updates.*", ] ) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 3705406c2f..04b3fdd80f 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -386,10 +386,6 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder): # generation nubmers out of order. env.pageserver.allowed_errors.append(".*Generation .+ is less than existing .+") - # Our multiple creation requests will advance generation quickly, and when we skip - # a generation number we can generate these warnings - env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates for tenant .+") - # Timeline::flush_and_shutdown cannot tell if it is hitting a failure because of # an incomplete attach, or some other problem. In the field this should be rare, # so we allow it to log at WARN, even if it is occasionally a false positive. 
From e579bc0819998f234277f2f29d10f2a444154753 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 4 Jul 2024 17:07:16 +0200 Subject: [PATCH 1120/1571] Add find-large-objects subcommand to scrubber (#8257) Adds a find-large-objects subcommand to the scrubber to allow listing layer objects larger than a specific size. To be used like: ``` AWS_PROFILE=dev REGION=us-east-2 BUCKET=neon-dev-storage-us-east-2 cargo run -p storage_scrubber -- find-large-objects --min-size 250000000 --ignore-deltas ``` Part of #5431 --- storage_scrubber/src/checks.rs | 2 +- storage_scrubber/src/find_large_objects.rs | 97 ++++++++++++++++++++++ storage_scrubber/src/lib.rs | 1 + storage_scrubber/src/main.rs | 18 ++++ 4 files changed, 117 insertions(+), 1 deletion(-) create mode 100644 storage_scrubber/src/find_large_objects.rs diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index 4eb8580e32..f687b24320 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -259,7 +259,7 @@ pub(crate) enum BlobDataParseResult { Incorrect(Vec), } -fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generation), String> { +pub(crate) fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generation), String> { match name.rsplit_once('-') { // FIXME: this is gross, just use a regex? Some((layer_filename, gen)) if gen.len() == 8 => { diff --git a/storage_scrubber/src/find_large_objects.rs b/storage_scrubber/src/find_large_objects.rs new file mode 100644 index 0000000000..24668b6516 --- /dev/null +++ b/storage_scrubber/src/find_large_objects.rs @@ -0,0 +1,97 @@ +use futures::StreamExt; +use pageserver::tenant::storage_layer::LayerName; +use serde::{Deserialize, Serialize}; + +use crate::{ + checks::parse_layer_object_name, init_remote, list_objects_with_retries, + metadata_stream::stream_tenants, BucketConfig, NodeKind, +}; + +#[derive(Serialize, Deserialize, Clone, Copy, PartialEq, Eq)] +enum LargeObjectKind { + DeltaLayer, + ImageLayer, + Other, +} + +impl LargeObjectKind { + fn from_key(key: &str) -> Self { + let fname = key.split('/').last().unwrap(); + + let Ok((layer_name, _generation)) = parse_layer_object_name(fname) else { + return LargeObjectKind::Other; + }; + + match layer_name { + LayerName::Image(_) => LargeObjectKind::ImageLayer, + LayerName::Delta(_) => LargeObjectKind::DeltaLayer, + } + } +} + +#[derive(Serialize, Deserialize)] +pub struct LargeObject { + pub key: String, + pub size: u64, + kind: LargeObjectKind, +} + +#[derive(Serialize, Deserialize)] +pub struct LargeObjectListing { + pub objects: Vec, +} + +pub async fn find_large_objects( + bucket_config: BucketConfig, + min_size: u64, + ignore_deltas: bool, +) -> anyhow::Result { + let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver)?; + let mut tenants = std::pin::pin!(stream_tenants(&s3_client, &target)); + let mut objects = Vec::new(); + let mut tenant_ctr = 0u64; + let mut object_ctr = 0u64; + while let Some(tenant_shard_id) = tenants.next().await { + let tenant_shard_id = tenant_shard_id?; + let mut tenant_root = target.tenant_root(&tenant_shard_id); + // We want the objects and not just common prefixes + tenant_root.delimiter.clear(); + let mut continuation_token = None; + loop { + let fetch_response = + list_objects_with_retries(&s3_client, &tenant_root, continuation_token.clone()) + .await?; + for obj in fetch_response.contents().iter().filter(|o| { + if let Some(obj_size) = o.size { + min_size as i64 <= obj_size + } else { + false + } + 
}) { + let key = obj.key().expect("couldn't get key").to_owned(); + let kind = LargeObjectKind::from_key(&key); + if ignore_deltas && kind == LargeObjectKind::DeltaLayer { + continue; + } + objects.push(LargeObject { + key, + size: obj.size.unwrap() as u64, + kind, + }) + } + object_ctr += fetch_response.contents().len() as u64; + match fetch_response.next_continuation_token { + Some(new_token) => continuation_token = Some(new_token), + None => break, + } + } + + tenant_ctr += 1; + if tenant_ctr % 50 == 0 { + tracing::info!( + "Scanned {tenant_ctr} shards. objects={object_ctr}, found={}, current={tenant_shard_id}.", objects.len() + ); + } + } + Ok(LargeObjectListing { objects }) +} diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index 64273432fc..6adaa5d38f 100644 --- a/storage_scrubber/src/lib.rs +++ b/storage_scrubber/src/lib.rs @@ -2,6 +2,7 @@ #![deny(clippy::undocumented_unsafe_blocks)] pub mod checks; pub mod cloud_admin_api; +pub mod find_large_objects; pub mod garbage; pub mod metadata_stream; pub mod pageserver_physical_gc; diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs index 222bd10ed2..10699edd3c 100644 --- a/storage_scrubber/src/main.rs +++ b/storage_scrubber/src/main.rs @@ -1,6 +1,7 @@ use anyhow::bail; use camino::Utf8PathBuf; use pageserver_api::shard::TenantShardId; +use storage_scrubber::find_large_objects; use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; use storage_scrubber::pageserver_physical_gc::GcMode; use storage_scrubber::scan_pageserver_metadata::scan_metadata; @@ -72,6 +73,12 @@ enum Command { #[arg(short, long, default_value_t = GcMode::IndicesOnly)] mode: GcMode, }, + FindLargeObjects { + #[arg(long = "min-size")] + min_size: u64, + #[arg(short, long, default_value_t = false)] + ignore_deltas: bool, + }, } #[tokio::main] @@ -86,6 +93,7 @@ async fn main() -> anyhow::Result<()> { Command::PurgeGarbage { .. } => "purge-garbage", Command::TenantSnapshot { .. } => "tenant-snapshot", Command::PageserverPhysicalGc { .. } => "pageserver-physical-gc", + Command::FindLargeObjects { .. } => "find-large-objects", }; let _guard = init_logging(&format!( "{}_{}_{}_{}.log", @@ -199,5 +207,15 @@ async fn main() -> anyhow::Result<()> { println!("{}", serde_json::to_string(&summary).unwrap()); Ok(()) } + Command::FindLargeObjects { + min_size, + ignore_deltas, + } => { + let summary = + find_large_objects::find_large_objects(bucket_config, min_size, ignore_deltas) + .await?; + println!("{}", serde_json::to_string(&summary).unwrap()); + Ok(()) + } } } From 19accfee4e677ed8fabc4dd1f370389038978499 Mon Sep 17 00:00:00 2001 From: Yuchen Liang <70461588+yliang412@users.noreply.github.com> Date: Thu, 4 Jul 2024 11:09:05 -0400 Subject: [PATCH 1121/1571] feat(pageserver): integrate lsn lease into synthetic size (#8220) Part of #7497, closes #8071. (accidentally closed #8208, reopened here) ## Problem After the changes in #8084, we need synthetic size to also account for leased LSNs so that users do not get free retention by running a small ephemeral endpoint for a long time. ## Summary of changes This PR integrates LSN leases into the synthetic size calculation. We model leases as read-only branches started at the leased LSN (except it does not have a timeline id). Other changes: - Add new unit tests testing whether a lease behaves like a read-only branch. - Change `/size_debug` response to include lease point in the SVG visualization. - Fix `/lsn_lease` HTTP API to do proper parsing for POST. 
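
As a rough illustration (not part of the original patch text): after this change the leased LSN is carried in a JSON request body rather than a query parameter. Below is a minimal sketch of a client call, mirroring the `timeline_lsn_lease` test fixture added later in this patch; the management address, tenant id and timeline id are placeholders.

```python
# Sketch only: assumes the pageserver management API is reachable at this
# address and that the tenant/timeline exist; the IDs below are placeholders.
import requests

pageserver_http = "http://localhost:9898"
tenant_id = "3b19aec5038c796f64b430b30a555121"
timeline_id = "d07776761d44050b8aab511df1657d83"

res = requests.post(
    f"{pageserver_http}/v1/tenant/{tenant_id}/timeline/{timeline_id}/lsn_lease",
    # Hex-formatted LSN to obtain the lease for, matching the `LsnLeaseRequest` body.
    json={"lsn": "0/2210CD0"},
)
res.raise_for_status()
# The response is expected to describe the lease, e.g. its `valid_until` expiry
# (see the `LsnLease` model).
print(res.json())
```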
Signed-off-by: Yuchen Liang Co-authored-by: Joonas Koivunen Co-authored-by: Christian Schwarz --- libs/pageserver_api/src/models.rs | 5 ++ libs/tenant_size_model/src/calculation.rs | 4 +- libs/tenant_size_model/src/svg.rs | 36 ++++++++-- pageserver/src/http/openapi_spec.yml | 22 +++--- pageserver/src/http/routes.rs | 18 +++-- pageserver/src/tenant/size.rs | 85 ++++++++++++++++++++-- pageserver/src/tenant/timeline.rs | 9 +++ test_runner/fixtures/pageserver/http.py | 16 +++++ test_runner/regress/test_tenant_size.py | 88 +++++++++++++++++++++++ 9 files changed, 256 insertions(+), 27 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 9228953761..ad65602f54 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -228,6 +228,11 @@ pub struct TimelineCreateRequest { pub pg_version: Option, } +#[derive(Serialize, Deserialize, Clone)] +pub struct LsnLeaseRequest { + pub lsn: Lsn, +} + #[derive(Serialize, Deserialize)] pub struct TenantShardSplitRequest { pub new_shard_count: u8, diff --git a/libs/tenant_size_model/src/calculation.rs b/libs/tenant_size_model/src/calculation.rs index f05997ee65..be00562219 100644 --- a/libs/tenant_size_model/src/calculation.rs +++ b/libs/tenant_size_model/src/calculation.rs @@ -34,10 +34,10 @@ struct SegmentSize { } struct SizeAlternatives { - // cheapest alternative if parent is available. + /// cheapest alternative if parent is available. incremental: SegmentSize, - // cheapest alternative if parent node is not available + /// cheapest alternative if parent node is not available non_incremental: Option, } diff --git a/libs/tenant_size_model/src/svg.rs b/libs/tenant_size_model/src/svg.rs index f26d3aa79d..0de2890bb4 100644 --- a/libs/tenant_size_model/src/svg.rs +++ b/libs/tenant_size_model/src/svg.rs @@ -3,10 +3,17 @@ use std::fmt::Write; const SVG_WIDTH: f32 = 500.0; +/// Different branch kind for SVG drawing. +#[derive(PartialEq)] +pub enum SvgBranchKind { + Timeline, + Lease, +} + struct SvgDraw<'a> { storage: &'a StorageModel, branches: &'a [String], - seg_to_branch: &'a [usize], + seg_to_branch: &'a [(usize, SvgBranchKind)], sizes: &'a [SegmentSizeResult], // layout @@ -42,13 +49,18 @@ fn draw_legend(result: &mut String) -> anyhow::Result<()> { "" )?; writeln!(result, "WAL not retained")?; + writeln!( + result, + "" + )?; + writeln!(result, "LSN lease")?; Ok(()) } pub fn draw_svg( storage: &StorageModel, branches: &[String], - seg_to_branch: &[usize], + seg_to_branch: &[(usize, SvgBranchKind)], sizes: &SizeResult, ) -> anyhow::Result { let mut draw = SvgDraw { @@ -100,7 +112,7 @@ impl<'a> SvgDraw<'a> { // Layout the timelines on Y dimension. 
// TODO - let mut y = 100.0; + let mut y = 120.0; let mut branch_y_coordinates = Vec::new(); for _branch in self.branches { branch_y_coordinates.push(y); @@ -109,7 +121,7 @@ impl<'a> SvgDraw<'a> { // Calculate coordinates for each point let seg_coordinates = std::iter::zip(segments, self.seg_to_branch) - .map(|(seg, branch_id)| { + .map(|(seg, (branch_id, _))| { let x = (seg.lsn - min_lsn) as f32 / xscale; let y = branch_y_coordinates[*branch_id]; (x, y) @@ -175,6 +187,22 @@ impl<'a> SvgDraw<'a> { // draw a snapshot point if it's needed let (coord_x, coord_y) = self.seg_coordinates[seg_id]; + + let (_, kind) = &self.seg_to_branch[seg_id]; + if kind == &SvgBranchKind::Lease { + let (x1, y1) = (coord_x, coord_y - 10.0); + let (x2, y2) = (coord_x, coord_y + 10.0); + + let style = "stroke-width=\"3\" stroke=\"blue\""; + + writeln!( + result, + "", + )?; + writeln!(result, " leased lsn at {}", seg.lsn)?; + writeln!(result, "")?; + } + if self.sizes[seg_id].method == SegmentMethod::SnapshotHere { writeln!( result, diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 58ff6e3f83..5ba329f05e 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -265,15 +265,19 @@ paths: type: string format: hex post: - description: Obtain lease for the given LSN - parameters: - - name: lsn - in: query - required: true - schema: - type: string - format: hex - description: A LSN to obtain the lease for + description: Obtains a lease for the given LSN. + requestBody: + content: + application/json: + schema: + type: object + required: + - lsn + properties: + lsn: + description: A LSN to obtain the lease for. + type: string + format: hex responses: "200": description: OK diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 6a6f17604d..893302b7d6 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -22,6 +22,7 @@ use pageserver_api::models::ListAuxFilesRequest; use pageserver_api::models::LocationConfig; use pageserver_api::models::LocationConfigListResponse; use pageserver_api::models::LsnLease; +use pageserver_api::models::LsnLeaseRequest; use pageserver_api::models::ShardParameters; use pageserver_api::models::TenantDetails; use pageserver_api::models::TenantLocationConfigResponse; @@ -42,7 +43,7 @@ use pageserver_api::shard::TenantShardId; use remote_storage::DownloadError; use remote_storage::GenericRemoteStorage; use remote_storage::TimeTravelError; -use tenant_size_model::{SizeResult, StorageModel}; +use tenant_size_model::{svg::SvgBranchKind, SizeResult, StorageModel}; use tokio_util::sync::CancellationToken; use tracing::*; use utils::auth::JwtAuth; @@ -1195,10 +1196,15 @@ fn synthetic_size_html_response( timeline_map.insert(ti.timeline_id, index); timeline_ids.push(ti.timeline_id.to_string()); } - let seg_to_branch: Vec = inputs + let seg_to_branch: Vec<(usize, SvgBranchKind)> = inputs .segments .iter() - .map(|seg| *timeline_map.get(&seg.timeline_id).unwrap()) + .map(|seg| { + ( + *timeline_map.get(&seg.timeline_id).unwrap(), + seg.kind.into(), + ) + }) .collect(); let svg = @@ -1531,15 +1537,13 @@ async fn handle_tenant_break( // Obtains an lsn lease on the given timeline. 
async fn lsn_lease_handler( - request: Request, + mut request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; - - let lsn: Lsn = parse_query_param(&request, "lsn")? - .ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'lsn' query parameter")))?; + let lsn = json_request::(&mut request).await?.lsn; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index b2338b620e..23354417e7 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -3,6 +3,7 @@ use std::collections::hash_map::Entry; use std::collections::{HashMap, HashSet}; use std::sync::Arc; +use tenant_size_model::svg::SvgBranchKind; use tokio::sync::oneshot::error::RecvError; use tokio::sync::Semaphore; use tokio_util::sync::CancellationToken; @@ -87,6 +88,9 @@ impl SegmentMeta { LsnKind::BranchPoint => true, LsnKind::GcCutOff => true, LsnKind::BranchEnd => false, + LsnKind::LeasePoint => true, + LsnKind::LeaseStart => false, + LsnKind::LeaseEnd => false, } } } @@ -103,6 +107,21 @@ pub enum LsnKind { GcCutOff, /// Last record LSN BranchEnd, + /// A LSN lease is granted here. + LeasePoint, + /// A lease starts from here. + LeaseStart, + /// Last record LSN for the lease (should have the same LSN as the previous [`LsnKind::LeaseStart`]). + LeaseEnd, +} + +impl From for SvgBranchKind { + fn from(kind: LsnKind) -> Self { + match kind { + LsnKind::LeasePoint | LsnKind::LeaseStart | LsnKind::LeaseEnd => SvgBranchKind::Lease, + _ => SvgBranchKind::Timeline, + } + } } /// Collect all relevant LSNs to the inputs. These will only be helpful in the serialized form as @@ -124,6 +143,9 @@ pub struct TimelineInputs { /// Cutoff point calculated from the user-supplied 'max_retention_period' retention_param_cutoff: Option, + + /// Lease points on the timeline + lease_points: Vec, } /// Gathers the inputs for the tenant sizing model. @@ -234,6 +256,13 @@ pub(super) async fn gather_inputs( None }; + let lease_points = gc_info + .leases + .keys() + .filter(|&&lsn| lsn > ancestor_lsn) + .copied() + .collect::>(); + // next_gc_cutoff in parent branch are not of interest (right now at least), nor do we // want to query any logical size before initdb_lsn. let branch_start_lsn = cmp::max(ancestor_lsn, timeline.initdb_lsn); @@ -248,6 +277,8 @@ pub(super) async fn gather_inputs( .map(|lsn| (lsn, LsnKind::BranchPoint)) .collect::>(); + lsns.extend(lease_points.iter().map(|&lsn| (lsn, LsnKind::LeasePoint))); + drop(gc_info); // Add branch points we collected earlier, just in case there were any that were @@ -296,6 +327,7 @@ pub(super) async fn gather_inputs( if kind == LsnKind::BranchPoint { branchpoint_segments.insert((timeline_id, lsn), segments.len()); } + segments.push(SegmentMeta { segment: Segment { parent: Some(parent), @@ -306,7 +338,45 @@ pub(super) async fn gather_inputs( timeline_id: timeline.timeline_id, kind, }); - parent += 1; + + parent = segments.len() - 1; + + if kind == LsnKind::LeasePoint { + // Needs `LeaseStart` and `LeaseEnd` as well to model lease as a read-only branch that never writes data + // (i.e. it's lsn has not advanced from ancestor_lsn), and therefore the three segments have the same LSN + // value. 
Without the other two segments, the calculation code would not count the leased LSN as a point + // to be retained. + // Did not use `BranchStart` or `BranchEnd` so we can differentiate branches and leases during debug. + // + // Alt Design: rewrite the entire calculation code to be independent of timeline id. Both leases and + // branch points can be given a synthetic id so we can unite them. + let mut lease_parent = parent; + + // Start of a lease. + segments.push(SegmentMeta { + segment: Segment { + parent: Some(lease_parent), + lsn: lsn.0, + size: None, // Filled in later, if necessary + needed: lsn > next_gc_cutoff, // only needed if the point is within rentention. + }, + timeline_id: timeline.timeline_id, + kind: LsnKind::LeaseStart, + }); + lease_parent += 1; + + // End of the lease. + segments.push(SegmentMeta { + segment: Segment { + parent: Some(lease_parent), + lsn: lsn.0, + size: None, // Filled in later, if necessary + needed: true, // everything at the lease LSN must be readable => is needed + }, + timeline_id: timeline.timeline_id, + kind: LsnKind::LeaseEnd, + }); + } } // Current end of the timeline @@ -332,6 +402,7 @@ pub(super) async fn gather_inputs( pitr_cutoff, next_gc_cutoff, retention_param_cutoff, + lease_points, }); } @@ -674,7 +745,8 @@ fn verify_size_for_multiple_branches() { "horizon_cutoff": "0/2210CD0", "pitr_cutoff": "0/2210CD0", "next_gc_cutoff": "0/2210CD0", - "retention_param_cutoff": null + "retention_param_cutoff": null, + "lease_points": [] }, { "timeline_id": "454626700469f0a9914949b9d018e876", @@ -684,7 +756,8 @@ fn verify_size_for_multiple_branches() { "horizon_cutoff": "0/1817770", "pitr_cutoff": "0/1817770", "next_gc_cutoff": "0/1817770", - "retention_param_cutoff": null + "retention_param_cutoff": null, + "lease_points": [] }, { "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f", @@ -694,7 +767,8 @@ fn verify_size_for_multiple_branches() { "horizon_cutoff": "0/18B3D98", "pitr_cutoff": "0/18B3D98", "next_gc_cutoff": "0/18B3D98", - "retention_param_cutoff": null + "retention_param_cutoff": null, + "lease_points": [] } ] } @@ -749,7 +823,8 @@ fn verify_size_for_one_branch() { "horizon_cutoff": "47/240A5860", "pitr_cutoff": "47/240A5860", "next_gc_cutoff": "47/240A5860", - "retention_param_cutoff": "0/0" + "retention_param_cutoff": "0/0", + "lease_points": [] } ] }"#; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index bbf0d0a4bf..42e55ab269 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -14,6 +14,7 @@ use anyhow::{anyhow, bail, ensure, Context, Result}; use arc_swap::ArcSwap; use bytes::Bytes; use camino::Utf8Path; +use chrono::{DateTime, Utc}; use enumset::EnumSet; use fail::fail_point; use once_cell::sync::Lazy; @@ -1590,7 +1591,13 @@ impl Timeline { let existing_lease = occupied.get_mut(); if valid_until > existing_lease.valid_until { existing_lease.valid_until = valid_until; + let dt: DateTime = valid_until.into(); + info!("lease extended to {}", dt); + } else { + let dt: DateTime = existing_lease.valid_until.into(); + info!("existing lease covers greater length, valid until {}", dt); } + existing_lease.clone() } else { // Reject already GC-ed LSN (lsn < latest_gc_cutoff) @@ -1599,6 +1606,8 @@ impl Timeline { bail!("tried to request a page version that was garbage collected. 
requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn); } + let dt: DateTime = valid_until.into(); + info!("lease created, valid until {}", dt); entry.or_insert(LsnLease { valid_until }).clone() } }; diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 3da0be8021..03aee9e5c5 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -599,6 +599,22 @@ class PageserverHttpClient(requests.Session, MetricsGetter): res_json = res.json() return res_json + def timeline_lsn_lease( + self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, lsn: Lsn + ): + data = { + "lsn": str(lsn), + } + + log.info(f"Requesting lsn lease for {lsn=}, {tenant_id=}, {timeline_id=}") + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/lsn_lease", + json=data, + ) + self.verbose_error(res) + res_json = res.json() + return res_json + def timeline_get_timestamp_of_lsn( self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, lsn: Lsn ): diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 6c85ddebbc..70e8fe67d5 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -10,6 +10,7 @@ from fixtures.neon_fixtures import ( Endpoint, NeonEnv, NeonEnvBuilder, + flush_ep_to_pageserver, wait_for_last_flush_lsn, wait_for_wal_insert_lsn, ) @@ -710,3 +711,90 @@ def mask_model_inputs(x): return newlist else: return x + + +@pytest.mark.parametrize("zero_gc", [True, False]) +def test_lsn_lease_size(neon_env_builder: NeonEnvBuilder, test_output_dir: Path, zero_gc: bool): + """ + Compare a LSN lease to a read-only branch for synthetic size calculation. + They should have the same effect. + """ + + conf = { + "pitr_interval": "0s" if zero_gc else "3600s", + "gc_period": "0s", + } + + env = neon_env_builder.init_start(initial_tenant_conf=conf) + + ro_branch_res = insert_with_action( + env, env.initial_tenant, env.initial_timeline, test_output_dir, action="branch" + ) + + tenant, timeline = env.neon_cli.create_tenant(conf=conf) + lease_res = insert_with_action(env, tenant, timeline, test_output_dir, action="lease") + + assert_size_approx_equal(lease_res, ro_branch_res) + + +def insert_with_action( + env: NeonEnv, + tenant: TenantId, + timeline: TimelineId, + test_output_dir: Path, + action: str, +) -> int: + """ + Inserts some data on the timeline, perform an action, and insert more data on the same timeline. + Returns the size at the end of the insertion. + + Valid actions: + - "lease": Acquires a lease. + - "branch": Creates a child branch but never writes to it. 
+ """ + + client = env.pageserver.http_client() + with env.endpoints.create_start("main", tenant_id=tenant) as ep: + initial_size = client.tenant_size(tenant) + log.info(f"initial size: {initial_size}") + + with ep.cursor() as cur: + cur.execute( + "CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)" + ) + last_flush_lsn = wait_for_last_flush_lsn(env, ep, tenant, timeline) + + if action == "lease": + res = client.timeline_lsn_lease(tenant, timeline, last_flush_lsn) + log.info(f"result from lsn_lease api: {res}") + elif action == "branch": + ro_branch = env.neon_cli.create_branch( + "ro_branch", tenant_id=tenant, ancestor_start_lsn=last_flush_lsn + ) + log.info(f"{ro_branch=} created") + else: + raise AssertionError("Invalid action type, only `lease` and `branch`are accepted") + + with ep.cursor() as cur: + cur.execute( + "CREATE TABLE t1 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)" + ) + cur.execute( + "CREATE TABLE t2 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)" + ) + cur.execute( + "CREATE TABLE t3 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)" + ) + + last_flush_lsn = wait_for_last_flush_lsn(env, ep, tenant, timeline) + + # Avoid flakiness when calculating logical size. + flush_ep_to_pageserver(env, ep, tenant, timeline) + + size_after_action_and_insert = client.tenant_size(tenant) + log.info(f"{size_after_action_and_insert=}") + + size_debug_file = open(test_output_dir / f"size_debug_{action}.html", "w") + size_debug = client.tenant_size_debug(tenant) + size_debug_file.write(size_debug) + return size_after_action_and_insert From adde0ecfe03ff2e352650c2b807bcef4d8a2dc49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 4 Jul 2024 18:59:19 +0200 Subject: [PATCH 1122/1571] Flatten compression algorithm setting (#8265) This flattens the compression algorithm setting, removing the `Option<_>` wrapping layer and making handling of the setting easier. It also adds a specific setting for *disabled* compression with the continued ability to read copmressed data, giving us the option to more easily back out of a compression rollout, should the need arise, which was one of the limitations of #8238. Implements my suggestion from https://github.com/neondatabase/neon/pull/8238#issuecomment-2206181594 , inspired by Christian's review in https://github.com/neondatabase/neon/pull/8238#pullrequestreview-2156460268 . Part of #5431 --- libs/pageserver_api/src/models.rs | 15 ++++++++++++++- pageserver/src/config.rs | 11 ++++++----- pageserver/src/tenant/blob_io.rs | 18 +++++++++++++----- .../src/tenant/storage_layer/delta_layer.rs | 4 ++-- pageserver/src/tenant/storage_layer/layer.rs | 2 +- 5 files changed, 36 insertions(+), 14 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index ad65602f54..ecc543917e 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -450,9 +450,22 @@ pub enum CompactionAlgorithm { )] #[strum(serialize_all = "kebab-case")] pub enum ImageCompressionAlgorithm { + /// Disabled for writes, and never decompress during reading. + /// Never set this after you've enabled compression once! + DisabledNoDecompress, + // Disabled for writes, support decompressing during read path + Disabled, /// Zstandard compression. Level 0 means and None mean the same (default level). Levels can be negative as well. /// For details, see the [manual](http://facebook.github.io/zstd/zstd_manual.html). 
- Zstd { level: Option }, + Zstd { + level: Option, + }, +} + +impl ImageCompressionAlgorithm { + pub fn allow_decompression(&self) -> bool { + !matches!(self, ImageCompressionAlgorithm::DisabledNoDecompress) + } } #[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)] diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index fa7f7d8d97..b7c9af2244 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -91,7 +91,8 @@ pub mod defaults { pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB - pub const DEFAULT_IMAGE_COMPRESSION: Option = None; + pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm = + ImageCompressionAlgorithm::DisabledNoDecompress; pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true; @@ -288,7 +289,7 @@ pub struct PageServerConf { pub validate_vectored_get: bool, - pub image_compression: Option, + pub image_compression: ImageCompressionAlgorithm, /// How many bytes of ephemeral layer content will we allow per kilobyte of RAM. When this /// is exceeded, we start proactively closing ephemeral layers to limit the total amount @@ -402,7 +403,7 @@ struct PageServerConfigBuilder { validate_vectored_get: BuilderValue, - image_compression: BuilderValue>, + image_compression: BuilderValue, ephemeral_bytes_per_memory_kb: BuilderValue, @@ -680,7 +681,7 @@ impl PageServerConfigBuilder { self.validate_vectored_get = BuilderValue::Set(value); } - pub fn get_image_compression(&mut self, value: Option) { + pub fn get_image_compression(&mut self, value: ImageCompressionAlgorithm) { self.image_compression = BuilderValue::Set(value); } @@ -1028,7 +1029,7 @@ impl PageServerConf { builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?) } "image_compression" => { - builder.get_image_compression(Some(parse_toml_from_str("image_compression", item)?)) + builder.get_image_compression(parse_toml_from_str("image_compression", item)?) } "ephemeral_bytes_per_memory_kb" => { builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize) diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 1a6a5702f1..0705182d5d 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -273,7 +273,12 @@ impl BlobWriter { srcbuf: B, ctx: &RequestContext, ) -> (B::Buf, Result) { - self.write_blob_maybe_compressed(srcbuf, ctx, None).await + self.write_blob_maybe_compressed( + srcbuf, + ctx, + ImageCompressionAlgorithm::DisabledNoDecompress, + ) + .await } /// Write a blob of data. 
Returns the offset that it was written to, @@ -282,7 +287,7 @@ impl BlobWriter { &mut self, srcbuf: B, ctx: &RequestContext, - algorithm: Option, + algorithm: ImageCompressionAlgorithm, ) -> (B::Buf, Result) { let offset = self.offset; @@ -314,7 +319,7 @@ impl BlobWriter { ); } let (high_bit_mask, len_written, srcbuf) = match algorithm { - Some(ImageCompressionAlgorithm::Zstd { level }) => { + ImageCompressionAlgorithm::Zstd { level } => { let mut encoder = if let Some(level) = level { async_compression::tokio::write::ZstdEncoder::with_quality( Vec::new(), @@ -335,7 +340,10 @@ impl BlobWriter { (BYTE_UNCOMPRESSED, len, slice.into_inner()) } } - None => (BYTE_UNCOMPRESSED, len, srcbuf.slice_full().into_inner()), + ImageCompressionAlgorithm::Disabled + | ImageCompressionAlgorithm::DisabledNoDecompress => { + (BYTE_UNCOMPRESSED, len, srcbuf.slice_full().into_inner()) + } }; let mut len_buf = (len_written as u32).to_be_bytes(); assert_eq!(len_buf[0] & 0xf0, 0); @@ -414,7 +422,7 @@ mod tests { wtr.write_blob_maybe_compressed( blob.clone(), &ctx, - Some(ImageCompressionAlgorithm::Zstd { level: Some(1) }), + ImageCompressionAlgorithm::Zstd { level: Some(1) }, ) .await } else { diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index e6a4d6d5c4..685f6dce60 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -49,7 +49,7 @@ use camino::{Utf8Path, Utf8PathBuf}; use futures::StreamExt; use itertools::Itertools; use pageserver_api::keyspace::KeySpace; -use pageserver_api::models::LayerAccessKind; +use pageserver_api::models::{ImageCompressionAlgorithm, LayerAccessKind}; use pageserver_api::shard::TenantShardId; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; @@ -453,7 +453,7 @@ impl DeltaLayerWriterInner { ) -> (Vec, anyhow::Result<()>) { assert!(self.lsn_range.start <= lsn); // We don't want to use compression in delta layer creation - let compression = None; + let compression = ImageCompressionAlgorithm::DisabledNoDecompress; let (val, res) = self .blob_writer .write_blob_maybe_compressed(val, ctx, compression) diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index d1f5cc8f43..afd11780e7 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1685,7 +1685,7 @@ impl DownloadedLayer { lsn, summary, Some(owner.conf.max_vectored_read_bytes), - owner.conf.image_compression.is_some(), + owner.conf.image_compression.allow_decompression(), ctx, ) .await From 88b13d4552fb538ded52624c3daa0883ae272583 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 4 Jul 2024 22:03:58 +0300 Subject: [PATCH 1123/1571] implement rolling hyper-log-log algorithm (#8068) ## Problem See #7466 ## Summary of changes Implement algorithm descried in https://hal.science/hal-00465313/document Now new GUC is added: `neon.wss_max_duration` which specifies size of sliding window (in seconds). Default value is 1 hour. It is possible to request estimation of working set sizes (within this window using new function `approximate_working_set_size_seconds`. Old function `approximate_working_set_size` is preserved for backward compatibility. But its scope is also limited by `neon.wss_max_duration`. Version of Neon extension is changed to 1.4 ## Checklist before requesting a review - [ ] I have performed a self-review of my code. 
- [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik Co-authored-by: Matthias van de Meent --- pgxn/neon/Makefile | 3 +- pgxn/neon/file_cache.c | 42 ++-- pgxn/neon/hll.c | 193 ++++++++++++++++++ pgxn/neon/hll.h | 86 ++++++++ pgxn/neon/neon--1.3--1.4.sql | 9 + pgxn/neon/neon--1.4--1.3.sql | 1 + .../test_lfc_working_set_approximation.py | 44 ++++ test_runner/regress/test_neon_extension.py | 2 +- 8 files changed, 363 insertions(+), 17 deletions(-) create mode 100644 pgxn/neon/hll.c create mode 100644 pgxn/neon/hll.h create mode 100644 pgxn/neon/neon--1.3--1.4.sql create mode 100644 pgxn/neon/neon--1.4--1.3.sql diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index cd316dbb91..3b755bb042 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -6,6 +6,7 @@ OBJS = \ $(WIN32RES) \ extension_server.o \ file_cache.o \ + hll.o \ libpagestore.o \ neon.o \ neon_utils.o \ @@ -22,7 +23,7 @@ SHLIB_LINK_INTERNAL = $(libpq) SHLIB_LINK = -lcurl EXTENSION = neon -DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql +DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql neon--1.3--1.4.sql neon--1.4--1.3.sql PGFILEDESC = "neon - cloud storage for PostgreSQL" EXTRA_CLEAN = \ diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 25275ef31f..1894e8c72a 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -26,7 +26,6 @@ #include "miscadmin.h" #include "pagestore_client.h" #include "common/hashfn.h" -#include "lib/hyperloglog.h" #include "pgstat.h" #include "postmaster/bgworker.h" #include RELFILEINFO_HDR @@ -40,6 +39,8 @@ #include "utils/dynahash.h" #include "utils/guc.h" +#include "hll.h" + /* * Local file cache is used to temporary store relations pages in local file system. * All blocks of all relations are stored inside one file and addressed using shared hash map. 
@@ -62,7 +63,6 @@ #define BLOCKS_PER_CHUNK 128 /* 1Mb chunk */ #define MB ((uint64)1024*1024) -#define HYPER_LOG_LOG_BIT_WIDTH 10 #define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK)) typedef struct FileCacheEntry @@ -87,8 +87,7 @@ typedef struct FileCacheControl uint64 writes; dlist_head lru; /* double linked list for LRU replacement * algorithm */ - hyperLogLogState wss_estimation; /* estimation of wroking set size */ - uint8_t hyperloglog_hashes[(1 << HYPER_LOG_LOG_BIT_WIDTH) + 1]; + HyperLogLogState wss_estimation; /* estimation of working set size */ } FileCacheControl; static HTAB *lfc_hash; @@ -238,12 +237,7 @@ lfc_shmem_startup(void) dlist_init(&lfc_ctl->lru); /* Initialize hyper-log-log structure for estimating working set size */ - initHyperLogLog(&lfc_ctl->wss_estimation, HYPER_LOG_LOG_BIT_WIDTH); - - /* We need hashes in shared memory */ - pfree(lfc_ctl->wss_estimation.hashesArr); - memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes); - lfc_ctl->wss_estimation.hashesArr = lfc_ctl->hyperloglog_hashes; + initSHLL(&lfc_ctl->wss_estimation); /* Recreate file cache on restart */ fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC); @@ -545,7 +539,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, /* Approximate working set */ tag.blockNum = blkno; - addHyperLogLog(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); + addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); if (entry == NULL || (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) == 0) { @@ -986,20 +980,38 @@ local_cache_pages(PG_FUNCTION_ARGS) SRF_RETURN_DONE(funcctx); } +PG_FUNCTION_INFO_V1(approximate_working_set_size_seconds); + +Datum +approximate_working_set_size_seconds(PG_FUNCTION_ARGS) +{ + if (lfc_size_limit != 0) + { + int32 dc; + time_t duration = PG_ARGISNULL(0) ? (time_t)-1 : PG_GETARG_INT32(0); + LWLockAcquire(lfc_lock, LW_SHARED); + dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, duration); + LWLockRelease(lfc_lock); + PG_RETURN_INT32(dc); + } + PG_RETURN_NULL(); +} + PG_FUNCTION_INFO_V1(approximate_working_set_size); Datum approximate_working_set_size(PG_FUNCTION_ARGS) { - int32 dc = -1; if (lfc_size_limit != 0) { + int32 dc; bool reset = PG_GETARG_BOOL(0); LWLockAcquire(lfc_lock, reset ? LW_EXCLUSIVE : LW_SHARED); - dc = (int32) estimateHyperLogLog(&lfc_ctl->wss_estimation); + dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, (time_t)-1); if (reset) - memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes); + memset(lfc_ctl->wss_estimation.regs, 0, sizeof lfc_ctl->wss_estimation.regs); LWLockRelease(lfc_lock); + PG_RETURN_INT32(dc); } - PG_RETURN_INT32(dc); + PG_RETURN_NULL(); } diff --git a/pgxn/neon/hll.c b/pgxn/neon/hll.c new file mode 100644 index 0000000000..f8496b3125 --- /dev/null +++ b/pgxn/neon/hll.c @@ -0,0 +1,193 @@ +/*------------------------------------------------------------------------- + * + * hll.c + * Sliding HyperLogLog cardinality estimator + * + * Portions Copyright (c) 2014-2023, PostgreSQL Global Development Group + * + * Implements https://hal.science/hal-00465313/document + * + * Based on Hideaki Ohno's C++ implementation. 
This is probably not ideally + * suited to estimating the cardinality of very large sets; in particular, we + * have not attempted to further optimize the implementation as described in + * the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic + * Engineering of a State of The Art Cardinality Estimation Algorithm". + * + * A sparse representation of HyperLogLog state is used, with fixed space + * overhead. + * + * The copyright terms of Ohno's original version (the MIT license) follow. + * + * IDENTIFICATION + * src/backend/lib/hyperloglog.c + * + *------------------------------------------------------------------------- + */ + +/* + * Copyright (c) 2013 Hideaki Ohno + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the 'Software'), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include + +#include "postgres.h" +#include "funcapi.h" +#include "port/pg_bitutils.h" +#include "utils/timestamp.h" +#include "hll.h" + + +#define POW_2_32 (4294967296.0) +#define NEG_POW_2_32 (-4294967296.0) + +#define ALPHA_MM ((0.7213 / (1.0 + 1.079 / HLL_N_REGISTERS)) * HLL_N_REGISTERS * HLL_N_REGISTERS) + +/* + * Worker for addHyperLogLog(). + * + * Calculates the position of the first set bit in first b bits of x argument + * starting from the first, reading from most significant to least significant + * bits. + * + * Example (when considering fist 10 bits of x): + * + * rho(x = 0b1000000000) returns 1 + * rho(x = 0b0010000000) returns 3 + * rho(x = 0b0000000000) returns b + 1 + * + * "The binary address determined by the first b bits of x" + * + * Return value "j" used to index bit pattern to watch. + */ +static inline uint8 +rho(uint32 x, uint8 b) +{ + uint8 j = 1; + + if (x == 0) + return b + 1; + + j = 32 - pg_leftmost_one_pos32(x); + + if (j > b) + return b + 1; + + return j; +} + +/* + * Initialize HyperLogLog track state + */ +void +initSHLL(HyperLogLogState *cState) +{ + memset(cState->regs, 0, sizeof(cState->regs)); +} + +/* + * Adds element to the estimator, from caller-supplied hash. + * + * It is critical that the hash value passed be an actual hash value, typically + * generated using hash_any(). The algorithm relies on a specific bit-pattern + * observable in conjunction with stochastic averaging. There must be a + * uniform distribution of bits in hash values for each distinct original value + * observed. 
+ */ +void +addSHLL(HyperLogLogState *cState, uint32 hash) +{ + uint8 count; + uint32 index; + size_t i; + size_t j; + + TimestampTz now = GetCurrentTimestamp(); + /* Use the first "k" (registerWidth) bits as a zero based index */ + index = hash >> HLL_C_BITS; + + /* Compute the rank of the remaining 32 - "k" (registerWidth) bits */ + count = rho(hash << HLL_BIT_WIDTH, HLL_C_BITS); + + cState->regs[index][count] = now; +} + +static uint8 +getMaximum(const TimestampTz* reg, TimestampTz since) +{ + uint8 max = 0; + + for (size_t i = 0; i < HLL_C_BITS + 1; i++) + { + if (reg[i] >= since) + { + max = i; + } + } + + return max; +} + + +/* + * Estimates cardinality, based on elements added so far + */ +double +estimateSHLL(HyperLogLogState *cState, time_t duration) +{ + double result; + double sum = 0.0; + size_t i; + uint8 R[HLL_N_REGISTERS]; + /* 0 indicates uninitialized timestamp, so if we need to cover the whole range than starts with 1 */ + TimestampTz since = duration == (time_t)-1 ? 1 : GetCurrentTimestamp() - duration * USECS_PER_SEC; + + for (i = 0; i < HLL_N_REGISTERS; i++) + { + R[i] = getMaximum(cState->regs[i], since); + sum += 1.0 / pow(2.0, R[i]); + } + + /* result set to "raw" HyperLogLog estimate (E in the HyperLogLog paper) */ + result = ALPHA_MM / sum; + + if (result <= (5.0 / 2.0) * HLL_N_REGISTERS) + { + /* Small range correction */ + int zero_count = 0; + + for (i = 0; i < HLL_N_REGISTERS; i++) + { + zero_count += R[i] == 0; + } + + if (zero_count != 0) + result = HLL_N_REGISTERS * log((double) HLL_N_REGISTERS / + zero_count); + } + else if (result > (1.0 / 30.0) * POW_2_32) + { + /* Large range correction */ + result = NEG_POW_2_32 * log(1.0 - (result / POW_2_32)); + } + + return result; +} + diff --git a/pgxn/neon/hll.h b/pgxn/neon/hll.h new file mode 100644 index 0000000000..9256cb9afa --- /dev/null +++ b/pgxn/neon/hll.h @@ -0,0 +1,86 @@ +/*------------------------------------------------------------------------- + * + * hll.h + * Sliding HyperLogLog cardinality estimator + * + * Portions Copyright (c) 2014-2023, PostgreSQL Global Development Group + * + * Implements https://hal.science/hal-00465313/document + * + * Based on Hideaki Ohno's C++ implementation. This is probably not ideally + * suited to estimating the cardinality of very large sets; in particular, we + * have not attempted to further optimize the implementation as described in + * the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic + * Engineering of a State of The Art Cardinality Estimation Algorithm". + * + * A sparse representation of HyperLogLog state is used, with fixed space + * overhead. + * + * The copyright terms of Ohno's original version (the MIT license) follow. + * + * IDENTIFICATION + * src/backend/lib/hyperloglog.c + * + *------------------------------------------------------------------------- + */ + +/* + * Copyright (c) 2013 Hideaki Ohno + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the 'Software'), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef HLL_H +#define HLL_H + +#define HLL_BIT_WIDTH 10 +#define HLL_C_BITS (32 - HLL_BIT_WIDTH) +#define HLL_N_REGISTERS (1 << HLL_BIT_WIDTH) + +/* + * HyperLogLog is an approximate technique for computing the number of distinct + * entries in a set. Importantly, it does this by using a fixed amount of + * memory. See the 2007 paper "HyperLogLog: the analysis of a near-optimal + * cardinality estimation algorithm" for more. + * + * Instead of a single counter for every bits register, we have a timestamp + * for every valid number of bits we can encounter. Every time we encounter + * a certain number of bits, we update the timestamp in those registers to + * the current timestamp. + * + * We can query the sketch's stored cardinality for the range of some timestamp + * up to now: For each register, we return the highest bits bucket that has a + * modified timestamp >= the query timestamp. This value is the number of bits + * for this register in the normal HLL calculation. + * + * The memory usage is 2^B * (C + 1) * sizeof(TimetampTz), or 184kiB. + * Usage could be halved if we decide to reduce the required time dimension + * precision; as 32 bits in second precision should be enough for statistics. + * However, that is not yet implemented. + */ +typedef struct HyperLogLogState +{ + TimestampTz regs[HLL_N_REGISTERS][HLL_C_BITS + 1]; +} HyperLogLogState; + +extern void initSHLL(HyperLogLogState *cState); +extern void addSHLL(HyperLogLogState *cState, uint32 hash); +extern double estimateSHLL(HyperLogLogState *cState, time_t dutration); + +#endif diff --git a/pgxn/neon/neon--1.3--1.4.sql b/pgxn/neon/neon--1.3--1.4.sql new file mode 100644 index 0000000000..042effe346 --- /dev/null +++ b/pgxn/neon/neon--1.3--1.4.sql @@ -0,0 +1,9 @@ +\echo Use "ALTER EXTENSION neon UPDATE TO '1.4'" to load this file. 
\quit + +CREATE FUNCTION approximate_working_set_size_seconds(duration integer default null) +RETURNS integer +AS 'MODULE_PATHNAME', 'approximate_working_set_size_seconds' +LANGUAGE C PARALLEL SAFE; + +GRANT EXECUTE ON FUNCTION approximate_working_set_size_seconds(integer) TO pg_monitor; + diff --git a/pgxn/neon/neon--1.4--1.3.sql b/pgxn/neon/neon--1.4--1.3.sql new file mode 100644 index 0000000000..bea72d1a6b --- /dev/null +++ b/pgxn/neon/neon--1.4--1.3.sql @@ -0,0 +1 @@ +DROP FUNCTION IF EXISTS approximate_working_set_size_seconds(integer) CASCADE; diff --git a/test_runner/regress/test_lfc_working_set_approximation.py b/test_runner/regress/test_lfc_working_set_approximation.py index a6f05fe0f7..6465bdfd21 100644 --- a/test_runner/regress/test_lfc_working_set_approximation.py +++ b/test_runner/regress/test_lfc_working_set_approximation.py @@ -1,3 +1,4 @@ +import time from pathlib import Path from fixtures.log_helper import log @@ -72,3 +73,46 @@ WITH (fillfactor='100'); blocks = query_scalar(cur, "select approximate_working_set_size(true)") log.info(f"working set size after some index access of a few select pages only {blocks}") assert blocks < 10 + + +def test_sliding_working_set_approximation(neon_simple_env: NeonEnv): + env = neon_simple_env + + endpoint = env.endpoints.create_start( + branch_name="main", + config_lines=[ + "autovacuum = off", + "shared_buffers=1MB", + "neon.max_file_cache_size=256MB", + "neon.file_cache_size_limit=245MB", + ], + ) + conn = endpoint.connect() + cur = conn.cursor() + cur.execute("create extension neon version '1.4'") + cur.execute( + "create table t(pk integer primary key, count integer default 0, payload text default repeat('?', 128))" + ) + cur.execute("insert into t (pk) values (generate_series(1,1000000))") + time.sleep(2) + before_10k = time.monotonic() + cur.execute("select sum(count) from t where pk between 10000 and 20000") + time.sleep(2) + before_1k = time.monotonic() + cur.execute("select sum(count) from t where pk between 1000 and 2000") + after = time.monotonic() + + cur.execute(f"select approximate_working_set_size_seconds({int(after - before_1k + 1)})") + estimation_1k = cur.fetchall()[0][0] + log.info(f"Working set size for selecting 1k records {estimation_1k}") + + cur.execute(f"select approximate_working_set_size_seconds({int(after - before_10k + 1)})") + estimation_10k = cur.fetchall()[0][0] + log.info(f"Working set size for selecting 10k records {estimation_10k}") + + cur.execute("select pg_table_size('t')") + size = cur.fetchall()[0][0] // 8192 + log.info(f"Table size {size} blocks") + + assert estimation_1k >= 20 and estimation_1k <= 40 + assert estimation_10k >= 200 and estimation_10k <= 400 diff --git a/test_runner/regress/test_neon_extension.py b/test_runner/regress/test_neon_extension.py index 39b4865026..e83aaf91c6 100644 --- a/test_runner/regress/test_neon_extension.py +++ b/test_runner/regress/test_neon_extension.py @@ -50,7 +50,7 @@ def test_neon_extension_compatibility(neon_env_builder: NeonEnvBuilder): # Ensure that the default version is also updated in the neon.control file assert cur.fetchone() == ("1.3",) cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE") - all_versions = ["1.3", "1.2", "1.1", "1.0"] + all_versions = ["1.4", "1.3", "1.2", "1.1", "1.0"] current_version = "1.3" for idx, begin_version in enumerate(all_versions): for target_version in all_versions[idx + 1 :]: From 711716c72506cdf05ce3a4cd755b007439de86e9 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Thu, 4 Jul 2024 22:17:45 +0200 Subject: [PATCH 
1124/1571] add checkout depth1 to workflow to access local github actions like generate allure report (#8259) ## Problem job step to create allure report fails https://github.com/neondatabase/neon/actions/runs/9781886710/job/27006997416#step:11:1 ## Summary of changes Shallow checkout of sources to get access to local github action needed in the job step ## Example run example run with this change https://github.com/neondatabase/neon/actions/runs/9790647724 do not merge this PR until the job is clean --------- Co-authored-by: Alexander Bayandin --- .github/workflows/periodic_pagebench.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml index c0219599a2..a8baf6bf7a 100644 --- a/.github/workflows/periodic_pagebench.yml +++ b/.github/workflows/periodic_pagebench.yml @@ -43,6 +43,10 @@ jobs: AWS_DEFAULT_REGION : "eu-central-1" AWS_INSTANCE_ID : "i-02a59a3bf86bc7e74" steps: + # we don't need the neon source code because we run everything remotely + # however we still need the local github actions to run the allure step below + - uses: actions/checkout@v4 + - name: Show my own (github runner) external IP address - usefull for IP allowlisting run: curl https://ifconfig.me @@ -116,6 +120,9 @@ jobs: cat "test_log_${GITHUB_RUN_ID}" - name: Create Allure report + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate From e25ac31fc9d18d312ec83decb3ceed82cbbf6119 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 5 Jul 2024 10:09:15 +0100 Subject: [PATCH 1125/1571] tests: extend allow list in deletion test (#8268) ## Problem 1ea5d8b1327d2e93cbe11682f60a90e35d42d1ee tolerated this as an error message, but it can show up in logs as well. Example failure: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8201/9780147712/index.html#testresult/263422f5f5f292ea/retries ## Summary of changes - Tolerate "failed to delete 1 objects" in pageserver logs, this occurs occasionally when injected failures exhaust deletion's retries. 
--- test_runner/regress/test_tenant_delete.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index d3fba32a19..1d7c8b8e31 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -67,8 +67,9 @@ def test_tenant_delete_smoke( # first try to delete non existing tenant tenant_id = TenantId.generate() - env.pageserver.allowed_errors.append(".*NotFound.*") - env.pageserver.allowed_errors.append(".*simulated failure.*") + env.pageserver.allowed_errors.extend( + [".*NotFound.*", ".*simulated failure.*", ".*failed to delete .+ objects.*"] + ) # Check that deleting a non-existent tenant gives the expected result: this is a loop because we # may need to retry on some remote storage errors injected by the test harness From 6876f0d06616851a694ad36bfec11d83e71cc49a Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Fri, 5 Jul 2024 11:23:46 +0200 Subject: [PATCH 1126/1571] correct error handling for periodic pagebench runner status (#8274) ## Problem the following periodic pagebench run was failed but was still shown as successful https://github.com/neondatabase/neon/actions/runs/9798909458/job/27058179993#step:9:47 ## Summary of changes if the ec2 test runner reports a failure fail the job step and thus the workflow --------- Co-authored-by: Alexander Bayandin --- .github/workflows/periodic_pagebench.yml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml index a8baf6bf7a..ed4e6be712 100644 --- a/.github/workflows/periodic_pagebench.yml +++ b/.github/workflows/periodic_pagebench.yml @@ -94,10 +94,12 @@ jobs: set +x status=$(echo $response | jq -r '.status') echo "Test status: $status" - if [[ "$status" == "failure" || "$status" == "success" || "$status" == "null" ]]; then + if [[ "$status" == "failure" ]]; then + echo "Test failed" + exit 1 # Fail the job step if status is failure + elif [[ "$status" == "success" || "$status" == "null" ]]; then break - fi - if [[ "$status" == "too_many_runs" ]]; then + elif [[ "$status" == "too_many_runs" ]]; then echo "Too many runs already running" echo "too_many_runs=true" >> "$GITHUB_OUTPUT" exit 1 @@ -107,6 +109,7 @@ jobs: done - name: Retrieve Test Logs + if: always() && steps.poll_step.outputs.too_many_runs != 'true' run: | curl -k -X 'GET' \ "${EC2_MACHINE_URL_US}/test_log/${GITHUB_RUN_ID}" \ @@ -115,6 +118,7 @@ jobs: --output "test_log_${GITHUB_RUN_ID}.gz" - name: Unzip Test Log and Print it into this job's log + if: always() && steps.poll_step.outputs.too_many_runs != 'true' run: | gzip -d "test_log_${GITHUB_RUN_ID}.gz" cat "test_log_${GITHUB_RUN_ID}" From 5aae80640b5d0fe20502c0c3b32dd6ffa02456b9 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 5 Jul 2024 10:34:16 +0100 Subject: [PATCH 1127/1571] tests: make location_conf_churn more robust (#8271) ## Problem This test directly manages locations on pageservers and configuration of an endpoint. However, it did not switch off the parts of the storage controller that attempt to do the same: occasionally, the test would fail in a strange way such as a compute failing to accept a reconfiguration request. ## Summary of changes - Wire up the storage controller's compute notification hook to a no-op handler - Configure the tenant's scheduling policy to Stop. 
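Condensed from the test changes below, the two pieces of plumbing look roughly like this in isolation (a sketch that assumes the standard `neon_env_builder`, `make_httpserver`, `env`, and `tenant_id` fixtures/variables from `test_location_conf_churn`; the real edits are interspersed through the test):

```python
from fixtures.log_helper import log
from werkzeug.wrappers.request import Request
from werkzeug.wrappers.response import Response

# 1. Point the storage controller's compute notification hook at a no-op handler,
#    so the controller cannot reconfigure the endpoint behind the test's back.
neon_env_builder.control_plane_compute_hook_api = (
    f"http://{make_httpserver.host}:{make_httpserver.port}/notify-attach"
)

def ignore_notify(request: Request):
    log.info(f"Ignoring storage controller compute notification: {request.json}")
    return Response(status=200)

make_httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler(
    ignore_notify
)

# 2. After the environment starts, park the tenant so the controller's own
#    scheduler stops issuing location changes, and allow the log line it emits.
env.storage_controller.tenant_policy_update(tenant_id, {"scheduling": "Stop"})
env.storage_controller.allowed_errors.append(".*Scheduling is disabled by policy Stop.*")
```
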
--- .../regress/test_pageserver_secondary.py | 27 ++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 4c828b86b0..0416078ebc 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -16,6 +16,8 @@ from fixtures.pageserver.utils import ( from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage, s3_storage from fixtures.utils import wait_until from fixtures.workload import Workload +from werkzeug.wrappers.request import Request +from werkzeug.wrappers.response import Response # A tenant configuration that is convenient for generating uploads and deletions # without a large amount of postgres traffic. @@ -59,7 +61,7 @@ def evict_random_layers( @pytest.mark.parametrize("seed", [1, 2, 3]) -def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): +def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, make_httpserver, seed: int): """ Issue many location configuration changes, ensure that tenants remain readable & we don't get any unexpected errors. We should @@ -73,6 +75,20 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): neon_env_builder.enable_pageserver_remote_storage( remote_storage_kind=s3_storage(), ) + neon_env_builder.control_plane_compute_hook_api = ( + f"http://{make_httpserver.host}:{make_httpserver.port}/notify-attach" + ) + + def ignore_notify(request: Request): + # This test does all its own compute configuration (by passing explicit pageserver ID to Workload functions), + # so we send controller notifications to /dev/null to prevent it fighting the test for control of the compute. + log.info(f"Ignoring storage controller compute notification: {request.json}") + return Response(status=200) + + make_httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler( + ignore_notify + ) + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) pageservers = env.pageservers @@ -99,6 +115,15 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): workload.init(env.pageservers[0].id) workload.write_rows(256, env.pageservers[0].id) + # Discourage the storage controller from interfering with the changes we will make directly on the pageserver + env.storage_controller.tenant_policy_update( + tenant_id, + { + "scheduling": "Stop", + }, + ) + env.storage_controller.allowed_errors.append(".*Scheduling is disabled by policy Stop.*") + # We use a fixed seed to make the test reproducible: we want a randomly # chosen order, but not to change the order every time we run the test. rng = random.Random(seed) From 6849ae4810e9a678dfc301f7118c4ce152a0c484 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 5 Jul 2024 11:17:44 +0100 Subject: [PATCH 1128/1571] safekeeper: add separate `tombstones` map for deleted timelines (#8253) ## Problem Safekeepers left running for a long time use a lot of memory (up to the point of OOMing, on small nodes) for deleted timelines, because the `Timeline` struct is kept alive as a guard against recreating deleted timelines. Closes: https://github.com/neondatabase/neon/issues/6810 ## Summary of changes - Create separate tombstones that just record a ttid and when the timeline was deleted. 
- Add a periodic housekeeping task that cleans up tombstones older than a hardcoded TTL (24h) I think this also makes https://github.com/neondatabase/neon/pull/6766 un-needed, as the tombstone is also checked during deletion. I considered making the overall timeline map use an enum type containing active or deleted, but having a separate map of tombstones avoids bloating that map, so that calls like `get()` can still go straight to a timeline without having to walk a hashmap that also contains tombstones. --- safekeeper/src/bin/safekeeper.rs | 13 +++ safekeeper/src/timelines_global_map.rs | 105 +++++++++++++++++-------- 2 files changed, 87 insertions(+), 31 deletions(-) diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index d25b8722ac..4d580e57ed 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -445,6 +445,19 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { .map(|res| ("WAL service main".to_owned(), res)); tasks_handles.push(Box::pin(wal_service_handle)); + let timeline_housekeeping_handle = current_thread_rt + .as_ref() + .unwrap_or_else(|| WAL_SERVICE_RUNTIME.handle()) + .spawn(async move { + const TOMBSTONE_TTL: Duration = Duration::from_secs(3600 * 24); + loop { + tokio::time::sleep(TOMBSTONE_TTL).await; + GlobalTimelines::housekeeping(&TOMBSTONE_TTL); + } + }) + .map(|res| ("Timeline map housekeeping".to_owned(), res)); + tasks_handles.push(Box::pin(timeline_housekeeping_handle)); + if let Some(pg_listener_tenant_only) = pg_listener_tenant_only { let conf_ = conf.clone(); let wal_service_handle = current_thread_rt diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index 9ce1112cec..f57da5c7cb 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -15,12 +15,19 @@ use std::collections::HashMap; use std::str::FromStr; use std::sync::atomic::Ordering; use std::sync::{Arc, Mutex}; +use std::time::{Duration, Instant}; use tracing::*; use utils::id::{TenantId, TenantTimelineId, TimelineId}; use utils::lsn::Lsn; struct GlobalTimelinesState { timelines: HashMap>, + + // A tombstone indicates this timeline used to exist has been deleted. These are used to prevent + // on-demand timeline creation from recreating deleted timelines. This is only soft-enforced, as + // this map is dropped on restart. + tombstones: HashMap, + conf: Option, broker_active_set: Arc, load_lock: Arc>, @@ -64,11 +71,17 @@ impl GlobalTimelinesState { .cloned() .ok_or(TimelineError::NotFound(*ttid)) } + + fn delete(&mut self, ttid: TenantTimelineId) { + self.timelines.remove(&ttid); + self.tombstones.insert(ttid, Instant::now()); + } } static TIMELINES_STATE: Lazy> = Lazy::new(|| { Mutex::new(GlobalTimelinesState { timelines: HashMap::new(), + tombstones: HashMap::new(), conf: None, broker_active_set: Arc::new(TimelinesSet::default()), load_lock: Arc::new(tokio::sync::Mutex::new(TimelineLoadLock)), @@ -198,11 +211,17 @@ impl GlobalTimelines { let tli = Arc::new(timeline); // TODO: prevent concurrent timeline creation/loading - TIMELINES_STATE - .lock() - .unwrap() - .timelines - .insert(ttid, tli.clone()); + { + let mut state = TIMELINES_STATE.lock().unwrap(); + + // We may be have been asked to load a timeline that was previously deleted (e.g. from `pull_timeline.rs`). We trust + // that the human doing this manual intervention knows what they are doing, and remove its tombstone. 
+ if state.tombstones.remove(&ttid).is_some() { + warn!("Un-deleted timeline {ttid}"); + } + + state.timelines.insert(ttid, tli.clone()); + } tli.bootstrap(&conf, broker_active_set, partial_backup_rate_limiter); @@ -229,7 +248,7 @@ impl GlobalTimelines { /// Create a new timeline with the given id. If the timeline already exists, returns /// an existing timeline. - pub async fn create( + pub(crate) async fn create( ttid: TenantTimelineId, server_info: ServerInfo, commit_lsn: Lsn, @@ -241,6 +260,11 @@ impl GlobalTimelines { // Timeline already exists, return it. return Ok(timeline); } + + if state.tombstones.contains_key(&ttid) { + anyhow::bail!("Timeline {ttid} is deleted, refusing to recreate"); + } + state.get_dependencies() }; @@ -300,17 +324,19 @@ impl GlobalTimelines { /// Get a timeline from the global map. If it's not present, it doesn't exist on disk, /// or was corrupted and couldn't be loaded on startup. Returned timeline is always valid, /// i.e. loaded in memory and not cancelled. - pub fn get(ttid: TenantTimelineId) -> Result, TimelineError> { - let res = TIMELINES_STATE.lock().unwrap().get(&ttid); - - match res { + pub(crate) fn get(ttid: TenantTimelineId) -> Result, TimelineError> { + let tli_res = { + let state = TIMELINES_STATE.lock().unwrap(); + state.get(&ttid) + }; + match tli_res { Ok(tli) => { if tli.is_cancelled() { return Err(TimelineError::Cancelled(ttid)); } Ok(tli) } - _ => res, + _ => tli_res, } } @@ -339,12 +365,26 @@ impl GlobalTimelines { /// Cancels timeline, then deletes the corresponding data directory. /// If only_local, doesn't remove WAL segments in remote storage. - pub async fn delete( + pub(crate) async fn delete( ttid: &TenantTimelineId, only_local: bool, ) -> Result { - let tli_res = TIMELINES_STATE.lock().unwrap().get(ttid); - match tli_res { + let tli_res = { + let state = TIMELINES_STATE.lock().unwrap(); + + if state.tombstones.contains_key(ttid) { + // Presence of a tombstone guarantees that a previous deletion has completed and there is no work to do. + info!("Timeline {ttid} was already deleted"); + return Ok(TimelineDeleteForceResult { + dir_existed: false, + was_active: false, + }); + } + + state.get(ttid) + }; + + let result = match tli_res { Ok(timeline) => { let was_active = timeline.broker_active.load(Ordering::Relaxed); @@ -354,11 +394,6 @@ impl GlobalTimelines { info!("deleting timeline {}, only_local={}", ttid, only_local); let dir_existed = timeline.delete(&mut shared_state, only_local).await?; - // Remove timeline from the map. - // FIXME: re-enable it once we fix the issue with recreation of deleted timelines - // https://github.com/neondatabase/neon/issues/3146 - // TIMELINES_STATE.lock().unwrap().timelines.remove(ttid); - Ok(TimelineDeleteForceResult { dir_existed, was_active, // TODO: we probably should remove this field @@ -374,7 +409,14 @@ impl GlobalTimelines { was_active: false, }) } - } + }; + + // Finalize deletion, by dropping Timeline objects and storing smaller tombstones. The tombstones + // are used to prevent still-running computes from re-creating the same timeline when they send data, + // and to speed up repeated deletion calls by avoiding re-listing objects. + TIMELINES_STATE.lock().unwrap().delete(*ttid); + + result } /// Deactivates and deletes all timelines for the tenant. 
Returns map of all timelines which @@ -420,19 +462,20 @@ impl GlobalTimelines { tenant_id, ))?; - // FIXME: we temporarily disabled removing timelines from the map, see `delete_force` - // let tlis_after_delete = Self::get_all_for_tenant(*tenant_id); - // if !tlis_after_delete.is_empty() { - // // Some timelines were created while we were deleting them, returning error - // // to the caller, so it can retry later. - // bail!( - // "failed to delete all timelines for tenant {}: some timelines were created while we were deleting them", - // tenant_id - // ); - // } - Ok(deleted) } + + pub fn housekeeping(tombstone_ttl: &Duration) { + let mut state = TIMELINES_STATE.lock().unwrap(); + + // We keep tombstones long enough to have a good chance of preventing rogue computes from re-creating deleted + // timelines. If a compute kept running for longer than this TTL (or across a safekeeper restart) then they + // may recreate a deleted timeline. + let now = Instant::now(); + state + .tombstones + .retain(|_, v| now.duration_since(*v) < *tombstone_ttl); + } } #[derive(Clone, Copy, Serialize)] From 7dd2e447d3aa44b8e3e55a6f4cca39c295dc80e7 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 5 Jul 2024 14:02:02 +0100 Subject: [PATCH 1129/1571] pageserver: add time based image layer creation check (#8247) ## Problem Assume a timeline with the following workload: very slow ingest of updates to a small number of keys that fit within the same partition (as decided by `KeySpace::partition`). These tenants will create small L0 layers since due to time based rolling, and, consequently, the L1 layers will also be small. Currently, by default, we need to ingest 512 MiB of WAL before checking if an image layer is required. This scheme works fine under the assumption that L1s are roughly of checkpoint distance size, but as the first paragraph explained, that's not the case for all workloads. ## Summary of changes Check if new image layers are required at least once every checkpoint timeout interval. --- pageserver/src/tenant/timeline.rs | 71 ++++++++++++++++++++++++------- 1 file changed, 55 insertions(+), 16 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 42e55ab269..92baf1073a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -365,6 +365,7 @@ pub struct Timeline { repartition_threshold: u64, last_image_layer_creation_check_at: AtomicLsn, + last_image_layer_creation_check_instant: std::sync::Mutex>, /// Current logical size of the "datadir", at the last LSN. current_logical_size: LogicalSize, @@ -2384,6 +2385,7 @@ impl Timeline { )), repartition_threshold: 0, last_image_layer_creation_check_at: AtomicLsn::new(0), + last_image_layer_creation_check_instant: Mutex::new(None), last_received_wal: Mutex::new(None), rel_size_cache: RwLock::new(RelSizeCache { @@ -4464,6 +4466,58 @@ impl Timeline { } } + /// Predicate function which indicates whether we should check if new image layers + /// are required. Since checking if new image layers are required is expensive in + /// terms of CPU, we only do it in the following cases: + /// 1. If the timeline has ingested sufficient WAL to justify the cost + /// 2. If enough time has passed since the last check + /// 2.1. For large tenants, we wish to perform the check more often since they + /// suffer from the lack of image layers + /// 2.2. 
For small tenants (that can mostly fit in RAM), we use a much longer interval + fn should_check_if_image_layers_required(self: &Arc, lsn: Lsn) -> bool { + const LARGE_TENANT_THRESHOLD: u64 = 2 * 1024 * 1024 * 1024; + + let last_checks_at = self.last_image_layer_creation_check_at.load(); + let distance = lsn + .checked_sub(last_checks_at) + .expect("Attempt to compact with LSN going backwards"); + let min_distance = + self.get_image_layer_creation_check_threshold() as u64 * self.get_checkpoint_distance(); + + let distance_based_decision = distance.0 >= min_distance; + + let mut time_based_decision = false; + let mut last_check_instant = self.last_image_layer_creation_check_instant.lock().unwrap(); + if let CurrentLogicalSize::Exact(logical_size) = self.current_logical_size.current_size() { + let check_required_after = if Into::::into(&logical_size) >= LARGE_TENANT_THRESHOLD + { + self.get_checkpoint_timeout() + } else { + Duration::from_secs(3600 * 48) + }; + + time_based_decision = match *last_check_instant { + Some(last_check) => { + let elapsed = last_check.elapsed(); + elapsed >= check_required_after + } + None => true, + }; + } + + // Do the expensive delta layer counting only if this timeline has ingested sufficient + // WAL since the last check or a checkpoint timeout interval has elapsed since the last + // check. + let decision = distance_based_decision || time_based_decision; + + if decision { + self.last_image_layer_creation_check_at.store(lsn); + *last_check_instant = Some(Instant::now()); + } + + decision + } + #[tracing::instrument(skip_all, fields(%lsn, %mode))] async fn create_image_layers( self: &Arc, @@ -4486,22 +4540,7 @@ impl Timeline { // image layers <100000000..100000099> and <200000000..200000199> are not completely covering it. let mut start = Key::MIN; - let check_for_image_layers = { - let last_checks_at = self.last_image_layer_creation_check_at.load(); - let distance = lsn - .checked_sub(last_checks_at) - .expect("Attempt to compact with LSN going backwards"); - let min_distance = self.get_image_layer_creation_check_threshold() as u64 - * self.get_checkpoint_distance(); - - // Skip the expensive delta layer counting if this timeline has not ingested sufficient - // WAL since the last check. 
- distance.0 >= min_distance - }; - - if check_for_image_layers { - self.last_image_layer_creation_check_at.store(lsn); - } + let check_for_image_layers = self.should_check_if_image_layers_required(lsn); for partition in partitioning.parts.iter() { let img_range = start..partition.ranges.last().unwrap().end; From c9fd8d76937c2031fd4fea1cdf661d6cf4f00dc3 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 5 Jul 2024 15:12:01 +0100 Subject: [PATCH 1130/1571] =?UTF-8?q?SELECT=20=F0=9F=92=A3();=20(#8270)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem We want to be able to test how our infrastructure reacts on segfaults in Postgres (for example, we collect cores, and get some required logs/metrics, etc) ## Summary of changes - Add `trigger_segfauls` function to `neon_test_utils` to trigger a segfault in Postgres - Add `trigger_panic` function to `neon_test_utils` to trigger SIGABRT (by using `elog(PANIC, ...)) - Fix cleanup logic in regression tests in endpoint crashed --- pgxn/neon_test_utils/Makefile | 2 +- ...tils--1.2.sql => neon_test_utils--1.3.sql} | 18 +++++++++++++++ pgxn/neon_test_utils/neon_test_utils.control | 2 +- pgxn/neon_test_utils/neontest.c | 23 +++++++++++++++++++ test_runner/fixtures/neon_fixtures.py | 18 +++++++++++---- test_runner/regress/test_endpoint_crash.py | 23 +++++++++++++++++++ 6 files changed, 80 insertions(+), 6 deletions(-) rename pgxn/neon_test_utils/{neon_test_utils--1.2.sql => neon_test_utils--1.3.sql} (77%) create mode 100644 test_runner/regress/test_endpoint_crash.py diff --git a/pgxn/neon_test_utils/Makefile b/pgxn/neon_test_utils/Makefile index 1371272439..252810b5b0 100644 --- a/pgxn/neon_test_utils/Makefile +++ b/pgxn/neon_test_utils/Makefile @@ -7,7 +7,7 @@ OBJS = \ neontest.o EXTENSION = neon_test_utils -DATA = neon_test_utils--1.2.sql +DATA = neon_test_utils--1.3.sql PGFILEDESC = "neon_test_utils - helpers for neon testing and debugging" PG_CONFIG = pg_config diff --git a/pgxn/neon_test_utils/neon_test_utils--1.2.sql b/pgxn/neon_test_utils/neon_test_utils--1.3.sql similarity index 77% rename from pgxn/neon_test_utils/neon_test_utils--1.2.sql rename to pgxn/neon_test_utils/neon_test_utils--1.3.sql index f84a24ec8d..3b8794a8cf 100644 --- a/pgxn/neon_test_utils/neon_test_utils--1.2.sql +++ b/pgxn/neon_test_utils/neon_test_utils--1.3.sql @@ -45,3 +45,21 @@ CREATE FUNCTION neon_xlogflush(lsn pg_lsn DEFAULT NULL) RETURNS VOID AS 'MODULE_PATHNAME', 'neon_xlogflush' LANGUAGE C PARALLEL UNSAFE; + +CREATE FUNCTION trigger_panic() +RETURNS VOID +AS 'MODULE_PATHNAME', 'trigger_panic' +LANGUAGE C PARALLEL UNSAFE; + +CREATE FUNCTION trigger_segfault() +RETURNS VOID +AS 'MODULE_PATHNAME', 'trigger_segfault' +LANGUAGE C PARALLEL UNSAFE; + +-- Alias for `trigger_segfault`, just because `SELECT 💣()` looks fun +CREATE OR REPLACE FUNCTION 💣() RETURNS void +LANGUAGE plpgsql AS $$ +BEGIN + PERFORM trigger_segfault(); +END; +$$; diff --git a/pgxn/neon_test_utils/neon_test_utils.control b/pgxn/neon_test_utils/neon_test_utils.control index c7b9191ddc..f22afd70c4 100644 --- a/pgxn/neon_test_utils/neon_test_utils.control +++ b/pgxn/neon_test_utils/neon_test_utils.control @@ -1,6 +1,6 @@ # neon_test_utils extension comment = 'helpers for neon testing and debugging' -default_version = '1.2' +default_version = '1.3' module_pathname = '$libdir/neon_test_utils' relocatable = true trusted = true diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c index 071dc122ed..650ef7405d 100644 --- 
a/pgxn/neon_test_utils/neontest.c +++ b/pgxn/neon_test_utils/neontest.c @@ -42,6 +42,8 @@ PG_FUNCTION_INFO_V1(clear_buffer_cache); PG_FUNCTION_INFO_V1(get_raw_page_at_lsn); PG_FUNCTION_INFO_V1(get_raw_page_at_lsn_ex); PG_FUNCTION_INFO_V1(neon_xlogflush); +PG_FUNCTION_INFO_V1(trigger_panic); +PG_FUNCTION_INFO_V1(trigger_segfault); /* * Linkage to functions in neon module. @@ -489,3 +491,24 @@ neon_xlogflush(PG_FUNCTION_ARGS) XLogFlush(lsn); PG_RETURN_VOID(); } + +/* + * Function to trigger panic. + */ +Datum +trigger_panic(PG_FUNCTION_ARGS) +{ + elog(PANIC, "neon_test_utils: panic"); + PG_RETURN_VOID(); +} + +/* + * Function to trigger a segfault. + */ +Datum +trigger_segfault(PG_FUNCTION_ARGS) +{ + int *ptr = NULL; + *ptr = 42; + PG_RETURN_VOID(); +} diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index c002e11c1c..5fb4d94817 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -943,6 +943,8 @@ class NeonEnvBuilder: # if the test threw an exception, don't check for errors # as a failing assertion would cause the cleanup below to fail ps_assert_metric_no_errors=(exc_type is None), + # do not fail on endpoint errors to allow the rest of cleanup to proceed + fail_on_endpoint_errors=False, ) cleanup_error = None @@ -1214,11 +1216,11 @@ class NeonEnv: for f in futs: f.result() - def stop(self, immediate=False, ps_assert_metric_no_errors=False): + def stop(self, immediate=False, ps_assert_metric_no_errors=False, fail_on_endpoint_errors=True): """ After this method returns, there should be no child processes running. """ - self.endpoints.stop_all() + self.endpoints.stop_all(fail_on_endpoint_errors) # Stop storage controller before pageservers: we don't want it to spuriously # detect a pageserver "failure" during test teardown @@ -3899,9 +3901,17 @@ class EndpointFactory: pageserver_id=pageserver_id, ) - def stop_all(self) -> "EndpointFactory": + def stop_all(self, fail_on_error=True) -> "EndpointFactory": + exception = None for ep in self.endpoints: - ep.stop() + try: + ep.stop() + except Exception as e: + log.error(f"Failed to stop endpoint {ep.endpoint_id}: {e}") + exception = e + + if fail_on_error and exception is not None: + raise exception return self diff --git a/test_runner/regress/test_endpoint_crash.py b/test_runner/regress/test_endpoint_crash.py new file mode 100644 index 0000000000..ae3dded437 --- /dev/null +++ b/test_runner/regress/test_endpoint_crash.py @@ -0,0 +1,23 @@ +import pytest +from fixtures.neon_fixtures import NeonEnvBuilder + + +@pytest.mark.parametrize( + "sql_func", + [ + "trigger_panic", + "trigger_segfault", + "💣", # calls `trigger_segfault` internally + ], +) +def test_endpoint_crash(neon_env_builder: NeonEnvBuilder, sql_func: str): + """ + Test that triggering crash from neon_test_utils crashes the endpoint + """ + env = neon_env_builder.init_start() + env.neon_cli.create_branch("test_endpoint_crash") + endpoint = env.endpoints.create_start("test_endpoint_crash") + + endpoint.safe_psql("CREATE EXTENSION neon_test_utils;") + with pytest.raises(Exception, match="This probably means the server terminated abnormally"): + endpoint.safe_psql(f"SELECT {sql_func}();") From 13522fb722bdf09a920e8c99b6128490ccf9205b Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 5 Jul 2024 20:39:10 +0300 Subject: [PATCH 1131/1571] Increase timeout for wating subscriber caught-up (#8118) ## Problem test_subscriber_restart has quit large failure rate' 
https://neonprod.grafana.net/d/fddp4rvg7k2dcf/regression-test-failures?orgId=1&var-test_name=test_subscriber_restart&var-max_count=100&var-restrict=false
It can be caused by a too-small timeout (5 seconds) for waiting until the
changes are propagated.

Related to #8097

## Summary of changes

Increase timeout to 30 seconds.

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

Co-authored-by: Konstantin Knizhnik
---
 test_runner/regress/test_subscriber_restart.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_runner/regress/test_subscriber_restart.py b/test_runner/regress/test_subscriber_restart.py
index d7f3962620..91caad7220 100644
--- a/test_runner/regress/test_subscriber_restart.py
+++ b/test_runner/regress/test_subscriber_restart.py
@@ -54,4 +54,4 @@ def test_subscriber_restart(neon_simple_env: NeonEnv):
             pcur.execute(f"INSERT into t values ({n_records}, 0)")
             n_records += 1
     with sub.cursor() as scur:
-        wait_until(10, 0.5, check_that_changes_propagated)
+        wait_until(60, 0.5, check_that_changes_propagated)

From f0d29a0f3ea68159a02d07f7010416b89cacce56 Mon Sep 17 00:00:00 2001
From: Christian Schwarz
Date: Fri, 5 Jul 2024 22:17:05 +0200
Subject: [PATCH 1132/1571] pageserver_live_connections: track as counter pair (#8227)

Generally counter pairs are preferred over gauges.

In this case, I found myself asking what the typical rate of accepted
page_service connections on a pageserver is, and I couldn't answer it
with the gauge metric.

There are a few dashboards using this metric:
https://github.com/search?q=repo%3Aneondatabase%2Fgrafana-dashboard-export%20pageserver_live_connections&type=code

I'll convert them to use the new metric once this PR reaches prod.
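For reference, the started/finished counter-pair idea is language-agnostic. Below is a minimal, illustrative Python sketch using `prometheus_client`; it is not the pageserver implementation, and `live_connection_guard` is a made-up helper name:

```python
from contextlib import contextmanager

from prometheus_client import Counter

# Two monotonically increasing counters instead of one gauge: the number of
# live connections is (started - finished), and the rate of "started" answers
# "how many connections per second do we accept?".
STARTED = Counter(
    "pageserver_live_connections_started",
    "Number of network connections that we started handling",
    ["pageserver_connection_kind"],
)
FINISHED = Counter(
    "pageserver_live_connections_finished",
    "Number of network connections that we finished handling",
    ["pageserver_connection_kind"],
)


@contextmanager
def live_connection_guard(kind: str):
    # Mirrors the RAII guard used in the Rust code: bump "started" up front,
    # bump "finished" when the scope exits, even on error.
    STARTED.labels(kind).inc()
    try:
        yield
    finally:
        FINISHED.labels(kind).inc()
```

The `.guard()` call in the diff below plays the same role as `live_connection_guard` here.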
refs https://github.com/neondatabase/neon/issues/7427 --- pageserver/src/metrics.rs | 10 ++++++---- pageserver/src/page_service.rs | 13 ++++--------- .../timeline/walreceiver/walreceiver_connection.rs | 13 ++++--------- 3 files changed, 14 insertions(+), 22 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 9e9fe7fbb8..59b7293631 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1456,10 +1456,12 @@ impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> { } } -pub(crate) static LIVE_CONNECTIONS_COUNT: Lazy = Lazy::new(|| { - register_int_gauge_vec!( - "pageserver_live_connections", - "Number of live network connections", +pub(crate) static LIVE_CONNECTIONS: Lazy = Lazy::new(|| { + register_int_counter_pair_vec!( + "pageserver_live_connections_started", + "Number of network connections that we started handling", + "pageserver_live_connections_finished", + "Number of network connections that we finished handling", &["pageserver_connection_kind"] ) .expect("failed to define a metric") diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index a440ad6378..07365b5eb8 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -55,7 +55,7 @@ use crate::basebackup::BasebackupError; use crate::context::{DownloadBehavior, RequestContext}; use crate::import_datadir::import_wal_from_tar; use crate::metrics; -use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS_COUNT}; +use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS}; use crate::pgdatadir_mapping::Version; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; @@ -215,14 +215,9 @@ async fn page_service_conn_main( auth_type: AuthType, connection_ctx: RequestContext, ) -> anyhow::Result<()> { - // Immediately increment the gauge, then create a job to decrement it on task exit. - // One of the pros of `defer!` is that this will *most probably* - // get called, even in presence of panics. - let gauge = LIVE_CONNECTIONS_COUNT.with_label_values(&["page_service"]); - gauge.inc(); - scopeguard::defer! { - gauge.dec(); - } + let _guard = LIVE_CONNECTIONS + .with_label_values(&["page_service"]) + .guard(); socket .set_nodelay(true) diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index c6ee6b90c4..a66900522a 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -26,7 +26,7 @@ use tracing::{debug, error, info, trace, warn, Instrument}; use super::TaskStateUpdate; use crate::{ context::RequestContext, - metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST}, + metrics::{LIVE_CONNECTIONS, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST}, task_mgr::TaskKind, task_mgr::WALRECEIVER_RUNTIME, tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo}, @@ -208,14 +208,9 @@ pub(super) async fn handle_walreceiver_connection( .instrument(tracing::info_span!("poller")), ); - // Immediately increment the gauge, then create a job to decrement it on task exit. - // One of the pros of `defer!` is that this will *most probably* - // get called, even in presence of panics. 
- let gauge = LIVE_CONNECTIONS_COUNT.with_label_values(&["wal_receiver"]); - gauge.inc(); - scopeguard::defer! { - gauge.dec(); - } + let _guard = LIVE_CONNECTIONS + .with_label_values(&["wal_receiver"]) + .guard(); let identify = identify_system(&replication_client).await?; info!("{identify:?}"); From b8d031cd0cff8bc155d962e35a781ed934999a58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 5 Jul 2024 22:18:05 +0200 Subject: [PATCH 1133/1571] Improve parsing of `ImageCompressionAlgorithm` (#8281) Improve parsing of the `ImageCompressionAlgorithm` enum to allow level customization like `zstd(1)`, as strum only takes `Default::default()`, i.e. `None` as the level. Part of #5431 --- libs/pageserver_api/src/models.rs | 64 +++++++++++++++++++++++++------ 1 file changed, 52 insertions(+), 12 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index ecc543917e..49c942938d 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -9,6 +9,7 @@ use std::{ collections::HashMap, io::{BufRead, Read}, num::{NonZeroU64, NonZeroUsize}, + str::FromStr, sync::atomic::AtomicUsize, time::{Duration, SystemTime}, }; @@ -437,18 +438,7 @@ pub enum CompactionAlgorithm { Tiered, } -#[derive( - Debug, - Clone, - Copy, - PartialEq, - Eq, - Serialize, - Deserialize, - strum_macros::FromRepr, - strum_macros::EnumString, -)] -#[strum(serialize_all = "kebab-case")] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] pub enum ImageCompressionAlgorithm { /// Disabled for writes, and never decompress during reading. /// Never set this after you've enabled compression once! @@ -468,6 +458,31 @@ impl ImageCompressionAlgorithm { } } +impl FromStr for ImageCompressionAlgorithm { + type Err = anyhow::Error; + fn from_str(s: &str) -> Result { + let mut components = s.split(['(', ')']); + let first = components + .next() + .ok_or_else(|| anyhow::anyhow!("empty string"))?; + match first { + "disabled-no-decompress" => Ok(ImageCompressionAlgorithm::DisabledNoDecompress), + "disabled" => Ok(ImageCompressionAlgorithm::Disabled), + "zstd" => { + let level = if let Some(v) = components.next() { + let v: i8 = v.parse()?; + Some(v) + } else { + None + }; + + Ok(ImageCompressionAlgorithm::Zstd { level }) + } + _ => anyhow::bail!("invalid specifier '{first}'"), + } + } +} + #[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)] pub struct CompactionAlgorithmSettings { pub kind: CompactionAlgorithm, @@ -1660,4 +1675,29 @@ mod tests { AuxFilePolicy::CrossValidation ); } + + #[test] + fn test_image_compression_algorithm_parsing() { + use ImageCompressionAlgorithm::*; + assert_eq!( + ImageCompressionAlgorithm::from_str("disabled").unwrap(), + Disabled + ); + assert_eq!( + ImageCompressionAlgorithm::from_str("disabled-no-decompress").unwrap(), + DisabledNoDecompress + ); + assert_eq!( + ImageCompressionAlgorithm::from_str("zstd").unwrap(), + Zstd { level: None } + ); + assert_eq!( + ImageCompressionAlgorithm::from_str("zstd(18)").unwrap(), + Zstd { level: Some(18) } + ); + assert_eq!( + ImageCompressionAlgorithm::from_str("zstd(-3)").unwrap(), + Zstd { level: Some(-3) } + ); + } } From 0a937b7f91646d942eb2717239578d96b8e854ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 5 Jul 2024 22:36:28 +0200 Subject: [PATCH 1134/1571] Add concurrency to the find-large-objects scrubber subcommand (#8291) The find-large-objects scrubber subcommand is quite fast if you run it in an environment with low 
latency to the S3 bucket (say an EC2 instance in the same region). However, the higher the latency gets, the slower the command becomes. Therefore, add a concurrency param and make it parallelized. This doesn't change that general relationship, but at least lets us do multiple requests in parallel and therefore hopefully faster. Running with concurrency of 64 (default): ``` 2024-07-05T17:30:22.882959Z INFO lazy_load_identity [...] [...] 2024-07-05T17:30:28.289853Z INFO Scanned 500 shards. [...] ``` With concurrency of 1, simulating state before this PR: ``` 2024-07-05T17:31:43.375153Z INFO lazy_load_identity [...] [...] 2024-07-05T17:33:51.987092Z INFO Scanned 500 shards. [...] ``` In other words, to list 500 shards, speed is increased from 2:08 minutes to 6 seconds. Follow-up of #8257, part of #5431 --- storage_scrubber/src/find_large_objects.rs | 101 +++++++++++++-------- storage_scrubber/src/main.rs | 13 ++- 2 files changed, 72 insertions(+), 42 deletions(-) diff --git a/storage_scrubber/src/find_large_objects.rs b/storage_scrubber/src/find_large_objects.rs index 24668b6516..1422545f2f 100644 --- a/storage_scrubber/src/find_large_objects.rs +++ b/storage_scrubber/src/find_large_objects.rs @@ -1,4 +1,4 @@ -use futures::StreamExt; +use futures::{StreamExt, TryStreamExt}; use pageserver::tenant::storage_layer::LayerName; use serde::{Deserialize, Serialize}; @@ -29,7 +29,7 @@ impl LargeObjectKind { } } -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Clone)] pub struct LargeObject { pub key: String, pub size: u64, @@ -45,53 +45,76 @@ pub async fn find_large_objects( bucket_config: BucketConfig, min_size: u64, ignore_deltas: bool, + concurrency: usize, ) -> anyhow::Result { let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver)?; - let mut tenants = std::pin::pin!(stream_tenants(&s3_client, &target)); + let tenants = std::pin::pin!(stream_tenants(&s3_client, &target)); + + let objects_stream = tenants.map_ok(|tenant_shard_id| { + let mut tenant_root = target.tenant_root(&tenant_shard_id); + let s3_client = s3_client.clone(); + async move { + let mut objects = Vec::new(); + let mut total_objects_ctr = 0u64; + // We want the objects and not just common prefixes + tenant_root.delimiter.clear(); + let mut continuation_token = None; + loop { + let fetch_response = + list_objects_with_retries(&s3_client, &tenant_root, continuation_token.clone()) + .await?; + for obj in fetch_response.contents().iter().filter(|o| { + if let Some(obj_size) = o.size { + min_size as i64 <= obj_size + } else { + false + } + }) { + let key = obj.key().expect("couldn't get key").to_owned(); + let kind = LargeObjectKind::from_key(&key); + if ignore_deltas && kind == LargeObjectKind::DeltaLayer { + continue; + } + objects.push(LargeObject { + key, + size: obj.size.unwrap() as u64, + kind, + }) + } + total_objects_ctr += fetch_response.contents().len() as u64; + match fetch_response.next_continuation_token { + Some(new_token) => continuation_token = Some(new_token), + None => break, + } + } + + Ok((tenant_shard_id, objects, total_objects_ctr)) + } + }); + let mut objects_stream = std::pin::pin!(objects_stream.try_buffer_unordered(concurrency)); + let mut objects = Vec::new(); + let mut tenant_ctr = 0u64; let mut object_ctr = 0u64; - while let Some(tenant_shard_id) = tenants.next().await { - let tenant_shard_id = tenant_shard_id?; - let mut tenant_root = target.tenant_root(&tenant_shard_id); - // We want the objects and not just common prefixes - tenant_root.delimiter.clear(); 
- let mut continuation_token = None; - loop { - let fetch_response = - list_objects_with_retries(&s3_client, &tenant_root, continuation_token.clone()) - .await?; - for obj in fetch_response.contents().iter().filter(|o| { - if let Some(obj_size) = o.size { - min_size as i64 <= obj_size - } else { - false - } - }) { - let key = obj.key().expect("couldn't get key").to_owned(); - let kind = LargeObjectKind::from_key(&key); - if ignore_deltas && kind == LargeObjectKind::DeltaLayer { - continue; - } - objects.push(LargeObject { - key, - size: obj.size.unwrap() as u64, - kind, - }) - } - object_ctr += fetch_response.contents().len() as u64; - match fetch_response.next_continuation_token { - Some(new_token) => continuation_token = Some(new_token), - None => break, - } - } + while let Some(res) = objects_stream.next().await { + let (tenant_shard_id, objects_slice, total_objects_ctr) = res?; + objects.extend_from_slice(&objects_slice); + object_ctr += total_objects_ctr; tenant_ctr += 1; - if tenant_ctr % 50 == 0 { + if tenant_ctr % 100 == 0 { tracing::info!( - "Scanned {tenant_ctr} shards. objects={object_ctr}, found={}, current={tenant_shard_id}.", objects.len() + "Scanned {tenant_ctr} shards. objects={object_ctr}, found={}, current={tenant_shard_id}.", + objects.len() ); } } + + let bucket_name = target.bucket_name(); + tracing::info!( + "Scan of {bucket_name} finished. Scanned {tenant_ctr} shards. objects={object_ctr}, found={}.", + objects.len() + ); Ok(LargeObjectListing { objects }) } diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs index 10699edd3c..16a26613d2 100644 --- a/storage_scrubber/src/main.rs +++ b/storage_scrubber/src/main.rs @@ -78,6 +78,8 @@ enum Command { min_size: u64, #[arg(short, long, default_value_t = false)] ignore_deltas: bool, + #[arg(long = "concurrency", short = 'j', default_value_t = 64)] + concurrency: usize, }, } @@ -210,10 +212,15 @@ async fn main() -> anyhow::Result<()> { Command::FindLargeObjects { min_size, ignore_deltas, + concurrency, } => { - let summary = - find_large_objects::find_large_objects(bucket_config, min_size, ignore_deltas) - .await?; + let summary = find_large_objects::find_large_objects( + bucket_config, + min_size, + ignore_deltas, + concurrency, + ) + .await?; println!("{}", serde_json::to_string(&summary).unwrap()); Ok(()) } From 27fe7f8963e5227d24cdd56aab419fa973dba369 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 6 Jul 2024 17:41:54 +0100 Subject: [PATCH 1135/1571] build(deps): bump certifi from 2023.7.22 to 2024.7.4 (#8301) --- poetry.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index 7740388fb8..bf16aaf55d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "aiohttp" @@ -734,13 +734,13 @@ typing-extensions = ">=4.1.0" [[package]] name = "certifi" -version = "2023.7.22" +version = "2024.7.4" description = "Python package for providing Mozilla's CA Bundle." 
optional = false python-versions = ">=3.6" files = [ - {file = "certifi-2023.7.22-py3-none-any.whl", hash = "sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9"}, - {file = "certifi-2023.7.22.tar.gz", hash = "sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082"}, + {file = "certifi-2024.7.4-py3-none-any.whl", hash = "sha256:c198e21b1289c2ab85ee4e67bb4b4ef3ead0892059901a8d5b622f24a1101e90"}, + {file = "certifi-2024.7.4.tar.gz", hash = "sha256:5a1e7645bc0ec61a09e26c36f6106dd4cf40c6db3a1fb6352b0244e7fb057c7b"}, ] [[package]] From 154ba5e1b440bda455c8962b53688268a2161d4b Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Mon, 8 Jul 2024 09:05:49 -0400 Subject: [PATCH 1136/1571] fix(pageserver): ensure sparse keyspace is ordered (#8285) ## Problem Sparse keyspaces were constructed with ranges out of order: this didn't break things obviously, but meant that users of KeySpace functions that assume ordering would assert out. Closes https://github.com/neondatabase/neon/issues/8277 ## Summary of changes make sure the sparse keyspace has ordered keyspace parts --- pageserver/src/pgdatadir_mapping.rs | 52 +++++++++++++++++++++++++---- 1 file changed, 45 insertions(+), 7 deletions(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 25d00d6dfd..fefd8d88ff 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -919,6 +919,9 @@ impl Timeline { result.add_key(AUX_FILES_KEY); } + // Add extra keyspaces in the test cases. Some test cases write keys into the storage without + // creating directory keys. These test cases will add such keyspaces into `extra_test_dense_keyspace` + // and the keys will not be garbage-colllected. #[cfg(test)] { let guard = self.extra_test_dense_keyspace.load(); @@ -927,13 +930,48 @@ impl Timeline { } } - Ok(( - result.to_keyspace(), - /* AUX sparse key space */ - SparseKeySpace(KeySpace { - ranges: vec![repl_origin_key_range(), Key::metadata_aux_key_range()], - }), - )) + let dense_keyspace = result.to_keyspace(); + let sparse_keyspace = SparseKeySpace(KeySpace { + ranges: vec![Key::metadata_aux_key_range(), repl_origin_key_range()], + }); + + if cfg!(debug_assertions) { + // Verify if the sparse keyspaces are ordered and non-overlapping. + + // We do not use KeySpaceAccum for sparse_keyspace because we want to ensure each + // category of sparse keys are split into their own image/delta files. If there + // are overlapping keyspaces, they will be automatically merged by keyspace accum, + // and we want the developer to keep the keyspaces separated. 
+ + let ranges = &sparse_keyspace.0.ranges; + + // TODO: use a single overlaps_with across the codebase + fn overlaps_with(a: &Range, b: &Range) -> bool { + !(a.end <= b.start || b.end <= a.start) + } + for i in 0..ranges.len() { + for j in 0..i { + if overlaps_with(&ranges[i], &ranges[j]) { + panic!( + "overlapping sparse keyspace: {}..{} and {}..{}", + ranges[i].start, ranges[i].end, ranges[j].start, ranges[j].end + ); + } + } + } + for i in 1..ranges.len() { + assert!( + ranges[i - 1].end <= ranges[i].start, + "unordered sparse keyspace: {}..{} and {}..{}", + ranges[i - 1].start, + ranges[i - 1].end, + ranges[i].start, + ranges[i].end + ); + } + } + + Ok((dense_keyspace, sparse_keyspace)) } /// Get cached size of relation if it not updated after specified LSN From 1121a1cbac0059369870d943bf144f0a221db65c Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 8 Jul 2024 14:10:42 +0100 Subject: [PATCH 1137/1571] pageserver: switch to jemalloc (#8307) ## Problem - Resident memory on long running pageserver processes tends to climb: memory fragmentation is suspected. - Total resident memory may be a limiting factor for running on smaller nodes. ## Summary of changes - As a low-energy experiment, switch the pageserver to use jemalloc (not a net-new dependency, proxy already use it) - Decide at end of week whether to revert before next release. --- Cargo.lock | 2 ++ pageserver/Cargo.toml | 1 + pageserver/src/bin/pageserver.rs | 3 +++ workspace_hack/Cargo.toml | 1 + 4 files changed, 7 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 6dae8e3403..716b6690d9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3667,6 +3667,7 @@ dependencies = [ "sysinfo", "tenant_size_model", "thiserror", + "tikv-jemallocator", "tokio", "tokio-epoll-uring", "tokio-io-timeout", @@ -7468,6 +7469,7 @@ dependencies = [ "syn 1.0.109", "syn 2.0.52", "sync_wrapper", + "tikv-jemalloc-sys", "time", "time-macros", "tokio", diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 4335f38f1e..0d9343d643 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -62,6 +62,7 @@ sync_wrapper.workspace = true sysinfo.workspace = true tokio-tar.workspace = true thiserror.workspace = true +tikv-jemallocator.workspace = true tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] } tokio-epoll-uring.workspace = true tokio-io-timeout.workspace = true diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 39d4e46c96..2763352a21 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -47,6 +47,9 @@ use utils::{ project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + const PID_FILE_NAME: &str = "pageserver.pid"; const FEATURES: &[&str] = &[ diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index f43076171f..e1b1806bc8 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -69,6 +69,7 @@ sha2 = { version = "0.10", features = ["asm"] } smallvec = { version = "1", default-features = false, features = ["const_new", "write"] } subtle = { version = "2" } sync_wrapper = { version = "0.1", default-features = false, features = ["futures"] } +tikv-jemalloc-sys = { version = "0.5" } time = { version = "0.3", features = ["macros", "serde-well-known"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] } 
tokio-rustls = { version = "0.24" } From 2a3410d1c3f4d1cfec3c3959311962872c8fdb87 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 3 Jul 2024 14:57:17 -0500 Subject: [PATCH 1138/1571] Hide import behind TYPE_CHECKING No need to import it if we aren't type checking anything. --- test_runner/performance/test_logical_replication.py | 7 ++++++- test_runner/regress/test_physical_replication.py | 6 +++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index 7d11facc29..570bd11b6f 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -1,8 +1,13 @@ +from __future__ import annotations + import time import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import AuxFileStore, NeonEnv, PgBin, logical_replication_sync +from fixtures.neon_fixtures import AuxFileStore, logical_replication_sync + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv, PgBin @pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.V2]) diff --git a/test_runner/regress/test_physical_replication.py b/test_runner/regress/test_physical_replication.py index a1bff32eed..043aff686b 100644 --- a/test_runner/regress/test_physical_replication.py +++ b/test_runner/regress/test_physical_replication.py @@ -1,7 +1,11 @@ +from __future__ import annotations + import random import time +from typing import TYPE_CHECKING -from fixtures.neon_fixtures import NeonEnv +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv def test_physical_replication(neon_simple_env: NeonEnv): From f2ec5429542f4aa4d5be6c2f6551cde8727c2829 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 3 Jul 2024 14:54:49 -0500 Subject: [PATCH 1139/1571] Add Neon HTTP API test fixture This is a Python binding to the Neon HTTP API. It isn't complete, but can be extended as necessary. 
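As a rough illustration of how the fixture is meant to be consumed, here is a hypothetical test that is not part of this patch; it only uses methods the fixture below actually defines, and it requires `NEON_API_KEY` to be set:

```python
from fixtures.neon_api import NeonAPI, connection_parameters_to_env
from fixtures.pg_version import PgVersion


def test_project_lifecycle(neon_api: NeonAPI, pg_version: PgVersion):
    # Create a throwaway project, wait for the operation queue to drain,
    # and always clean it up at the end.
    project = neon_api.create_project(pg_version)
    project_id = project["project"]["id"]
    neon_api.wait_for_operation_to_finish(project_id)
    try:
        env = connection_parameters_to_env(
            project["connection_uris"][0]["connection_parameters"]
        )
        # env now holds PGHOST/PGDATABASE/PGUSER/PGPASSWORD, ready to be
        # passed to pg binaries or psycopg2.
        assert env["PGHOST"]
    finally:
        neon_api.delete_project(project_id)
```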
Co-authored-by: Sasha Krassovsky --- test_runner/fixtures/neon_api.py | 263 ++++++++++++++++++++++++++ test_runner/fixtures/neon_fixtures.py | 21 ++ 2 files changed, 284 insertions(+) create mode 100644 test_runner/fixtures/neon_api.py diff --git a/test_runner/fixtures/neon_api.py b/test_runner/fixtures/neon_api.py new file mode 100644 index 0000000000..39baf5fab6 --- /dev/null +++ b/test_runner/fixtures/neon_api.py @@ -0,0 +1,263 @@ +from __future__ import annotations + +import time +from typing import TYPE_CHECKING, cast + +import requests + +if TYPE_CHECKING: + from typing import Any, Dict, Literal, Optional, Union + + from fixtures.pg_version import PgVersion + + +def connection_parameters_to_env(params: Dict[str, str]) -> Dict[str, str]: + return { + "PGHOST": params["host"], + "PGDATABASE": params["database"], + "PGUSER": params["role"], + "PGPASSWORD": params["password"], + } + + +class NeonAPI: + def __init__(self, neon_api_key: str, neon_api_base_url: str): + self.__neon_api_key = neon_api_key + self.__neon_api_base_url = neon_api_base_url.strip("/") + + def __request( + self, method: Union[str, bytes], endpoint: str, **kwargs: Any + ) -> requests.Response: + if "headers" not in kwargs: + kwargs["headers"] = {} + kwargs["headers"]["Authorization"] = f"Bearer {self.__neon_api_key}" + + return requests.request(method, f"{self.__neon_api_base_url}{endpoint}", **kwargs) + + def create_project( + self, + pg_version: Optional[PgVersion] = None, + name: Optional[str] = None, + branch_name: Optional[str] = None, + branch_role_name: Optional[str] = None, + branch_database_name: Optional[str] = None, + ) -> Dict[str, Any]: + data: Dict[str, Any] = { + "project": { + "branch": {}, + }, + } + if name: + data["project"]["name"] = name + if pg_version: + data["project"]["pg_version"] = int(pg_version) + if branch_name: + data["project"]["branch"]["name"] = branch_name + if branch_role_name: + data["project"]["branch"]["role_name"] = branch_role_name + if branch_database_name: + data["project"]["branch"]["database_name"] = branch_database_name + + resp = self.__request( + "POST", + "/projects", + headers={ + "Accept": "application/json", + "Content-Type": "application/json", + }, + json=data, + ) + + assert resp.status_code == 201 + + return cast("Dict[str, Any]", resp.json()) + + def get_project_details(self, project_id: str) -> Dict[str, Any]: + resp = self.__request( + "GET", + f"/projects/{project_id}", + headers={ + "Accept": "application/json", + "Content-Type": "application/json", + }, + ) + assert resp.status_code == 200 + return cast("Dict[str, Any]", resp.json()) + + def delete_project( + self, + project_id: str, + ) -> Dict[str, Any]: + resp = self.__request( + "DELETE", + f"/projects/{project_id}", + headers={ + "Accept": "application/json", + "Content-Type": "application/json", + }, + ) + + assert resp.status_code == 200 + + return cast("Dict[str, Any]", resp.json()) + + def start_endpoint( + self, + project_id: str, + endpoint_id: str, + ) -> Dict[str, Any]: + resp = self.__request( + "POST", + f"/projects/{project_id}/endpoints/{endpoint_id}/start", + headers={ + "Accept": "application/json", + }, + ) + + assert resp.status_code == 200 + + return cast("Dict[str, Any]", resp.json()) + + def suspend_endpoint( + self, + project_id: str, + endpoint_id: str, + ) -> Dict[str, Any]: + resp = self.__request( + "POST", + f"/projects/{project_id}/endpoints/{endpoint_id}/suspend", + headers={ + "Accept": "application/json", + }, + ) + + assert resp.status_code == 200 + + return 
cast("Dict[str, Any]", resp.json()) + + def restart_endpoint( + self, + project_id: str, + endpoint_id: str, + ) -> Dict[str, Any]: + resp = self.__request( + "POST", + f"/projects/{project_id}/endpoints/{endpoint_id}/restart", + headers={ + "Accept": "application/json", + }, + ) + + assert resp.status_code == 200 + + return cast("Dict[str, Any]", resp.json()) + + def create_endpoint( + self, + project_id: str, + branch_id: str, + endpoint_type: Literal["read_write", "read_only"], + settings: Dict[str, Any], + ) -> Dict[str, Any]: + data: Dict[str, Any] = { + "endpoint": { + "branch_id": branch_id, + }, + } + + if endpoint_type: + data["endpoint"]["type"] = endpoint_type + if settings: + data["endpoint"]["settings"] = settings + + resp = self.__request( + "POST", + f"/projects/{project_id}/endpoints", + headers={ + "Accept": "application/json", + "Content-Type": "application/json", + }, + json=data, + ) + + assert resp.status_code == 201 + + return cast("Dict[str, Any]", resp.json()) + + def get_connection_uri( + self, + project_id: str, + branch_id: Optional[str] = None, + endpoint_id: Optional[str] = None, + database_name: str = "neondb", + role_name: str = "neondb_owner", + pooled: bool = True, + ) -> Dict[str, Any]: + resp = self.__request( + "GET", + f"/projects/{project_id}/connection_uri", + params={ + "branch_id": branch_id, + "endpoint_id": endpoint_id, + "database_name": database_name, + "role_name": role_name, + "pooled": pooled, + }, + headers={ + "Accept": "application/json", + }, + ) + + assert resp.status_code == 200 + + return cast("Dict[str, Any]", resp.json()) + + def get_branches(self, project_id: str) -> Dict[str, Any]: + resp = self.__request( + "GET", + f"/projects/{project_id}/branches", + headers={ + "Accept": "application/json", + }, + ) + + assert resp.status_code == 200 + + return cast("Dict[str, Any]", resp.json()) + + def get_endpoints(self, project_id: str) -> Dict[str, Any]: + resp = self.__request( + "GET", + f"/projects/{project_id}/endpoints", + headers={ + "Accept": "application/json", + }, + ) + + assert resp.status_code == 200 + + return cast("Dict[str, Any]", resp.json()) + + def get_operations(self, project_id: str) -> Dict[str, Any]: + resp = self.__request( + "GET", + f"/projects/{project_id}/operations", + headers={ + "Accept": "application/json", + "Authorization": f"Bearer {self.__neon_api_key}", + }, + ) + + assert resp.status_code == 200 + + return cast("Dict[str, Any]", resp.json()) + + def wait_for_operation_to_finish(self, project_id: str): + has_running = True + while has_running: + has_running = False + operations = self.get_operations(project_id)["operations"] + for op in operations: + if op["status"] in {"scheduling", "running", "cancelling"}: + has_running = True + time.sleep(0.5) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 5fb4d94817..ac2fcd8ade 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -87,6 +87,8 @@ from fixtures.utils import ( ) from fixtures.utils import AuxFileStore as AuxFileStore # reexport +from .neon_api import NeonAPI + """ This file contains pytest fixtures. A fixture is a test resource that can be summoned by placing its name in the test's arguments. 
@@ -184,6 +186,25 @@ def versioned_pg_distrib_dir(pg_distrib_dir: Path, pg_version: PgVersion) -> Ite yield versioned_dir +@pytest.fixture(scope="session") +def neon_api_key() -> str: + api_key = os.getenv("NEON_API_KEY") + if not api_key: + raise AssertionError("Set the NEON_API_KEY environment variable") + + return api_key + + +@pytest.fixture(scope="session") +def neon_api_base_url() -> str: + return os.getenv("NEON_API_BASE_URL", "https://console-stage.neon.build/api/v2") + + +@pytest.fixture(scope="session") +def neon_api(neon_api_key: str, neon_api_base_url: str) -> NeonAPI: + return NeonAPI(neon_api_key, neon_api_base_url) + + def shareable_scope(fixture_name: str, config: Config) -> Literal["session", "function"]: """Return either session of function scope, depending on TEST_SHARED_FIXTURES envvar. From 118847cd41bcf5f84126f371f8e322d51eeed1f7 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 3 Jul 2024 14:59:19 -0500 Subject: [PATCH 1140/1571] Log PG environment variables when a PgBin runs Useful for debugging situations like connecting to databases. Co-authored-by: Sasha Krassovsky --- test_runner/fixtures/neon_fixtures.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index ac2fcd8ade..532e7bcce5 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2883,6 +2883,13 @@ class PgBin: env.update(env_add) return env + def _log_env(self, env: dict[str, str]) -> None: + env_s = {} + for k, v in env.items(): + if k.startswith("PG") and k != "PGPASSWORD": + env_s[k] = v + log.debug(f"Environment: {env_s}") + def run( self, command: List[str], @@ -2905,6 +2912,7 @@ class PgBin: self._fixpath(command) log.info(f"Running command '{' '.join(command)}'") env = self._build_env(env) + self._log_env(env) subprocess.run(command, env=env, cwd=cwd, check=True) def run_capture( @@ -2925,6 +2933,7 @@ class PgBin: self._fixpath(command) log.info(f"Running command '{' '.join(command)}'") env = self._build_env(env) + self._log_env(env) base_path, _, _ = subprocess_capture( self.log_dir, command, From b54dd9af1575169ce008e6bc1e3f44d7ab22413f Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 3 Jul 2024 15:04:57 -0500 Subject: [PATCH 1141/1571] Add PgBin.run_nonblocking() Allows a process to run without blocking program execution, which can be useful for certain test scenarios. Co-authored-by: Sasha Krassovsky --- test_runner/fixtures/neon_fixtures.py | 32 ++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 532e7bcce5..cae2e422c1 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2890,14 +2890,14 @@ class PgBin: env_s[k] = v log.debug(f"Environment: {env_s}") - def run( + def run_nonblocking( self, command: List[str], env: Optional[Env] = None, cwd: Optional[Union[str, Path]] = None, - ): + ) -> subprocess.Popen[Any]: """ - Run one of the postgres binaries. + Run one of the postgres binaries, not waiting for it to finish The command should be in list form, e.g. ['pgbench', '-p', '55432'] @@ -2908,12 +2908,34 @@ class PgBin: If you want stdout/stderr captured to files, use `run_capture` instead. 
""" - self._fixpath(command) log.info(f"Running command '{' '.join(command)}'") env = self._build_env(env) self._log_env(env) - subprocess.run(command, env=env, cwd=cwd, check=True) + return subprocess.Popen(command, env=env, cwd=cwd, stdout=subprocess.PIPE, text=True) + + def run( + self, + command: List[str], + env: Optional[Env] = None, + cwd: Optional[Union[str, Path]] = None, + ) -> None: + """ + Run one of the postgres binaries, waiting for it to finish + + The command should be in list form, e.g. ['pgbench', '-p', '55432'] + + All the necessary environment variables will be set. + + If the first argument (the command name) doesn't include a path (no '/' + characters present), then it will be edited to include the correct path. + + If you want stdout/stderr captured to files, use `run_capture` instead. + """ + proc = self.run_nonblocking(command, env, cwd) + proc.wait() + if proc.returncode != 0: + raise subprocess.CalledProcessError(proc.returncode, proc.args) def run_capture( self, From 1c57f6bac34c2e97a1929cd5e96af1156bdc240d Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 3 Jul 2024 15:22:42 -0500 Subject: [PATCH 1142/1571] Add long running replication tests These tests will help verify that replication, both physical and logical, works as expected in Neon. Co-authored-by: Sasha Krassovsky --- .../actions/run-python-test-set/action.yml | 1 + .github/workflows/benchmarking.yml | 72 ++++- .../performance/test_logical_replication.py | 295 ++++++++++++++++- .../performance/test_physical_replication.py | 296 ++++++++++++++++++ 4 files changed, 662 insertions(+), 2 deletions(-) create mode 100644 test_runner/performance/test_physical_replication.py diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 7f843de1a5..daaedf6d11 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -115,6 +115,7 @@ runs: export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install} export DEFAULT_PG_VERSION=${PG_VERSION#v} export LD_LIBRARY_PATH=${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/lib + export BENCHMARK_CONNSTR=${BENCHMARK_CONNSTR:-} if [ "${BUILD_TYPE}" = "remote" ]; then export REMOTE_ENV=1 diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index db04b5de7d..899cae2b86 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -99,7 +99,14 @@ jobs: # Set --sparse-ordering option of pytest-order plugin # to ensure tests are running in order of appears in the file. 
# It's important for test_perf_pgbench.py::test_pgbench_remote_* tests - extra_params: -m remote_cluster --sparse-ordering --timeout 5400 --ignore test_runner/performance/test_perf_olap.py --ignore test_runner/performance/test_perf_pgvector_queries.py + extra_params: + -m remote_cluster + --sparse-ordering + --timeout 5400 + --ignore test_runner/performance/test_perf_olap.py + --ignore test_runner/performance/test_perf_pgvector_queries.py + --ignore test_runner/performance/test_logical_replication.py + --ignore test_runner/performance/test_physical_replication.py env: BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -125,6 +132,69 @@ jobs: env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + replication-tests: + env: + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install + DEFAULT_PG_VERSION: 14 + TEST_OUTPUT: /tmp/test_output + BUILD_TYPE: remote + SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} + PLATFORM: "neon-staging" + + runs-on: [ self-hosted, us-east-2, x64 ] + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned + options: --init + + steps: + - uses: actions/checkout@v4 + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact + path: /tmp/neon/ + prefix: latest + + - name: Run benchmark + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance/test_logical_replication.py + run_in_parallel: false + save_perf_report: ${{ env.SAVE_PERF_REPORT }} + extra_params: -m remote_cluster --timeout 5400 + env: + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Run benchmark + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance/test_physical_replication.py + run_in_parallel: false + save_perf_report: ${{ env.SAVE_PERF_REPORT }} + extra_params: -m remote_cluster --timeout 5400 + env: + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Create Allure report + if: ${{ !cancelled() }} + uses: ./.github/actions/allure-report-generate + + - name: Post to a Slack channel + if: ${{ github.event.schedule && failure() }} + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C033QLM5P7D" # dev-staging-stream + slack-message: "Periodic replication testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + generate-matrices: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} # Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday) diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index 570bd11b6f..5ab83dd31d 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -1,13 +1,24 @@ from __future__ import annotations import time +import traceback +from typing import TYPE_CHECKING +import psycopg2 
+import psycopg2.extras import pytest +from fixtures.benchmark_fixture import MetricReport +from fixtures.common_types import Lsn from fixtures.log_helper import log +from fixtures.neon_api import connection_parameters_to_env from fixtures.neon_fixtures import AuxFileStore, logical_replication_sync +from fixtures.pg_version import PgVersion if TYPE_CHECKING: + from fixtures.benchmark_fixture import NeonBenchmarker + from fixtures.neon_api import NeonAPI from fixtures.neon_fixtures import NeonEnv, PgBin + from fixtures.pg_version import PgVersion @pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.V2]) @@ -31,7 +42,6 @@ def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg vanilla_pg.safe_psql("truncate table pgbench_history") connstr = endpoint.connstr().replace("'", "''") - print(f"connstr='{connstr}'") vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") # Wait logical replication channel to be established @@ -47,3 +57,286 @@ def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg sum_master = endpoint.safe_psql("select sum(abalance) from pgbench_accounts")[0][0] sum_replica = vanilla_pg.safe_psql("select sum(abalance) from pgbench_accounts")[0][0] assert sum_master == sum_replica + + +def check_pgbench_still_running(pgbench, label=""): + rc = pgbench.poll() + if rc is not None: + raise RuntimeError(f"{label} pgbench terminated early with return code {rc}") + + +def measure_logical_replication_lag(sub_cur, pub_cur, timeout_sec=600): + start = time.time() + pub_cur.execute("SELECT pg_current_wal_flush_lsn()") + pub_lsn = Lsn(pub_cur.fetchall()[0][0]) + while (time.time() - start) < timeout_sec: + sub_cur.execute("SELECT latest_end_lsn FROM pg_catalog.pg_stat_subscription") + res = sub_cur.fetchall()[0][0] + if res: + log.info(f"subscriber_lsn={res}") + sub_lsn = Lsn(res) + log.info(f"Subscriber LSN={sub_lsn}, publisher LSN={pub_lsn}") + if sub_lsn >= pub_lsn: + return time.time() - start + time.sleep(0.5) + raise TimeoutError(f"Logical replication sync took more than {timeout_sec} sec") + + +@pytest.mark.remote_cluster +@pytest.mark.timeout(2 * 60 * 60) +def test_subscriber_lag( + pg_bin: PgBin, + neon_api: NeonAPI, + pg_version: PgVersion, + zenbenchmark: NeonBenchmarker, +): + """ + Creates a publisher and subscriber, runs pgbench inserts on publisher and pgbench selects + on subscriber. Periodically restarts subscriber while still running the inserts, and + measures how long sync takes after restart. 
+ """ + test_duration_min = 60 + sync_interval_min = 5 + pgbench_duration = f"-T{test_duration_min * 60 * 2}" + + pub_project = neon_api.create_project(pg_version) + pub_project_id = pub_project["project"]["id"] + neon_api.wait_for_operation_to_finish(pub_project_id) + error_occurred = False + try: + sub_project = neon_api.create_project(pg_version) + sub_project_id = sub_project["project"]["id"] + sub_endpoint_id = sub_project["endpoints"][0]["id"] + neon_api.wait_for_operation_to_finish(sub_project_id) + try: + pub_env = connection_parameters_to_env( + pub_project["connection_uris"][0]["connection_parameters"] + ) + sub_env = connection_parameters_to_env( + sub_project["connection_uris"][0]["connection_parameters"] + ) + pub_connstr = pub_project["connection_uris"][0]["connection_uri"] + sub_connstr = sub_project["connection_uris"][0]["connection_uri"] + + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) + + pub_conn = psycopg2.connect(pub_connstr) + sub_conn = psycopg2.connect(sub_connstr) + pub_conn.autocommit = True + sub_conn.autocommit = True + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + sub_cur.execute("truncate table pgbench_accounts") + sub_cur.execute("truncate table pgbench_history") + + pub_cur.execute( + "create publication pub1 for table pgbench_accounts, pgbench_history" + ) + sub_cur.execute( + f"create subscription sub1 connection '{pub_connstr}' publication pub1" + ) + + initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur) + pub_conn.close() + sub_conn.close() + + zenbenchmark.record( + "initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER + ) + + pub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env + ) + try: + sub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-S"], + env=sub_env, + ) + try: + start = time.time() + while time.time() - start < test_duration_min * 60: + time.sleep(sync_interval_min * 60) + check_pgbench_still_running(pub_workload, "pub") + check_pgbench_still_running(sub_workload, "sub") + + with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect( + sub_connstr + ) as sub_conn: + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + lag = measure_logical_replication_lag(sub_cur, pub_cur) + + log.info(f"Replica lagged behind master by {lag} seconds") + zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) + sub_workload.terminate() + neon_api.restart_endpoint( + sub_project_id, + sub_endpoint_id, + ) + neon_api.wait_for_operation_to_finish(sub_project_id) + sub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-S"], + env=sub_env, + ) + + # Measure storage to make sure replication information isn't bloating storage + sub_storage = neon_api.get_project_details(sub_project_id)["project"][ + "synthetic_storage_size" + ] + pub_storage = neon_api.get_project_details(pub_project_id)["project"][ + "synthetic_storage_size" + ] + zenbenchmark.record( + "sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER + ) + zenbenchmark.record( + "pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER + ) + + finally: + sub_workload.terminate() + finally: + pub_workload.terminate() + except Exception as e: + error_occurred = True + log.error(f"Caught exception {e}") + log.error(traceback.format_exc()) + finally: + if not error_occurred: + neon_api.delete_project(sub_project_id) + except 
Exception as e: + error_occurred = True + log.error(f"Caught exception {e}") + log.error(traceback.format_exc()) + finally: + assert not error_occurred + neon_api.delete_project(pub_project_id) + + +@pytest.mark.remote_cluster +@pytest.mark.timeout(2 * 60 * 60) +def test_publisher_restart( + pg_bin: PgBin, + neon_api: NeonAPI, + pg_version: PgVersion, + zenbenchmark: NeonBenchmarker, +): + """ + Creates a publisher and subscriber, runs pgbench inserts on publisher and pgbench selects + on subscriber. Periodically restarts publisher (to exercise on-demand WAL download), and + measures how long sync takes after restart. + """ + test_duration_min = 60 + sync_interval_min = 5 + pgbench_duration = f"-T{test_duration_min * 60 * 2}" + + pub_project = neon_api.create_project(pg_version) + pub_project_id = pub_project["project"]["id"] + pub_endpoint_id = pub_project["endpoints"][0]["id"] + neon_api.wait_for_operation_to_finish(pub_project_id) + error_occurred = False + try: + sub_project = neon_api.create_project(pg_version) + sub_project_id = sub_project["project"]["id"] + neon_api.wait_for_operation_to_finish(sub_project_id) + try: + pub_env = connection_parameters_to_env( + pub_project["connection_uris"][0]["connection_parameters"] + ) + sub_env = connection_parameters_to_env( + sub_project["connection_uris"][0]["connection_parameters"] + ) + pub_connstr = pub_project["connection_uris"][0]["connection_uri"] + sub_connstr = sub_project["connection_uris"][0]["connection_uri"] + + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) + + pub_conn = psycopg2.connect(pub_connstr) + sub_conn = psycopg2.connect(sub_connstr) + pub_conn.autocommit = True + sub_conn.autocommit = True + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + sub_cur.execute("truncate table pgbench_accounts") + sub_cur.execute("truncate table pgbench_history") + + pub_cur.execute( + "create publication pub1 for table pgbench_accounts, pgbench_history" + ) + sub_cur.execute( + f"create subscription sub1 connection '{pub_connstr}' publication pub1" + ) + + initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur) + pub_conn.close() + sub_conn.close() + + zenbenchmark.record( + "initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER + ) + + pub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env + ) + try: + sub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-S"], + env=sub_env, + ) + try: + start = time.time() + while time.time() - start < test_duration_min * 60: + time.sleep(sync_interval_min * 60) + check_pgbench_still_running(pub_workload, "pub") + check_pgbench_still_running(sub_workload, "sub") + + pub_workload.terminate() + neon_api.restart_endpoint( + pub_project_id, + pub_endpoint_id, + ) + neon_api.wait_for_operation_to_finish(pub_project_id) + pub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-Mprepared"], + env=pub_env, + ) + with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect( + sub_connstr + ) as sub_conn: + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + lag = measure_logical_replication_lag(sub_cur, pub_cur) + + log.info(f"Replica lagged behind master by {lag} seconds") + zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) + + # Measure storage to make sure replication information isn't bloating storage + sub_storage = 
neon_api.get_project_details(sub_project_id)["project"][ + "synthetic_storage_size" + ] + pub_storage = neon_api.get_project_details(pub_project_id)["project"][ + "synthetic_storage_size" + ] + zenbenchmark.record( + "sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER + ) + zenbenchmark.record( + "pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER + ) + + finally: + sub_workload.terminate() + finally: + pub_workload.terminate() + except Exception as e: + error_occurred = True + log.error(f"Caught exception {e}") + log.error(traceback.format_exc()) + finally: + if not error_occurred: + neon_api.delete_project(sub_project_id) + except Exception as e: + error_occurred = True + log.error(f"Caught exception {e}") + log.error(traceback.format_exc()) + finally: + assert not error_occurred + neon_api.delete_project(pub_project_id) diff --git a/test_runner/performance/test_physical_replication.py b/test_runner/performance/test_physical_replication.py new file mode 100644 index 0000000000..7e16197211 --- /dev/null +++ b/test_runner/performance/test_physical_replication.py @@ -0,0 +1,296 @@ +from __future__ import annotations + +import csv +import os +import subprocess +import time +import traceback +from pathlib import Path +from typing import TYPE_CHECKING + +import psycopg2 +import psycopg2.extras +import pytest +from fixtures.benchmark_fixture import MetricReport +from fixtures.common_types import Lsn +from fixtures.log_helper import log +from fixtures.neon_api import connection_parameters_to_env +from fixtures.pg_version import PgVersion + +if TYPE_CHECKING: + from typing import Any, List, Optional + + from fixtures.benchmark_fixture import NeonBenchmarker + from fixtures.neon_api import NeonAPI + from fixtures.neon_fixtures import PgBin + + +# Granularity of ~0.5 sec +def measure_replication_lag(master, replica, timeout_sec=600): + start = time.time() + master.execute("SELECT pg_current_wal_flush_lsn()") + master_lsn = Lsn(master.fetchall()[0][0]) + while (time.time() - start) < timeout_sec: + replica.execute("select pg_last_wal_replay_lsn()") + replica_lsn = replica.fetchall()[0][0] + if replica_lsn: + if Lsn(replica_lsn) >= master_lsn: + return time.time() - start + time.sleep(0.5) + raise TimeoutError(f"Replication sync took more than {timeout_sec} sec") + + +def check_pgbench_still_running(pgbench): + rc = pgbench.poll() + if rc is not None: + raise RuntimeError(f"Pgbench terminated early with return code {rc}") + + +@pytest.mark.remote_cluster +@pytest.mark.timeout(2 * 60 * 60) +def test_ro_replica_lag( + pg_bin: PgBin, + neon_api: NeonAPI, + pg_version: PgVersion, + zenbenchmark: NeonBenchmarker, +): + test_duration_min = 60 + sync_interval_min = 10 + + pgbench_duration = f"-T{test_duration_min * 60 * 2}" + + project = neon_api.create_project(pg_version) + project_id = project["project"]["id"] + neon_api.wait_for_operation_to_finish(project_id) + error_occurred = False + try: + branch_id = project["branch"]["id"] + master_connstr = project["connection_uris"][0]["connection_uri"] + master_env = connection_parameters_to_env( + project["connection_uris"][0]["connection_parameters"] + ) + + replica = neon_api.create_endpoint( + project_id, + branch_id, + endpoint_type="read_only", + settings={"pg_settings": {"hot_standby_feedback": "on"}}, + ) + replica_env = master_env.copy() + replica_env["PGHOST"] = replica["endpoint"]["host"] + neon_api.wait_for_operation_to_finish(project_id) + + replica_connstr = neon_api.get_connection_uri( + project_id, + 
endpoint_id=replica["endpoint"]["id"], + )["uri"] + + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=master_env) + + master_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-Mprepared"], + env=master_env, + ) + try: + replica_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-S"], + env=replica_env, + ) + try: + start = time.time() + while time.time() - start < test_duration_min * 60: + check_pgbench_still_running(master_workload) + check_pgbench_still_running(replica_workload) + time.sleep(sync_interval_min * 60) + with psycopg2.connect(master_connstr) as conn_master, psycopg2.connect( + replica_connstr + ) as conn_replica: + with conn_master.cursor() as cur_master, conn_replica.cursor() as cur_replica: + lag = measure_replication_lag(cur_master, cur_replica) + log.info(f"Replica lagged behind master by {lag} seconds") + zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) + finally: + replica_workload.terminate() + finally: + master_workload.terminate() + except Exception as e: + error_occurred = True + log.error(f"Caught exception: {e}") + log.error(traceback.format_exc()) + finally: + assert not error_occurred # Fail the test if an error occurred + neon_api.delete_project(project_id) + + +def report_pgbench_aggregate_intervals( + output_dir: Path, + prefix: str, + zenbenchmark: NeonBenchmarker, +): + for filename in os.listdir(output_dir): + if filename.startswith(prefix): + # The file will be in the form _. + # So we first lop off the ., and then lop off the prefix and the _ + node = filename.split(".")[0][len(prefix) + 1 :] + with open(output_dir / filename) as f: + reader = csv.reader(f, delimiter=" ") + for line in reader: + num_transactions = int(line[1]) + if num_transactions == 0: + continue + sum_latency = int(line[2]) + sum_lag = int(line[3]) + zenbenchmark.record( + f"{node}_num_txns", num_transactions, "txns", MetricReport.HIGHER_IS_BETTER + ) + zenbenchmark.record( + f"{node}_avg_latency", + sum_latency / num_transactions, + "s", + MetricReport.LOWER_IS_BETTER, + ) + zenbenchmark.record( + f"{node}_avg_lag", + sum_lag / num_transactions, + "s", + MetricReport.LOWER_IS_BETTER, + ) + + +@pytest.mark.remote_cluster +@pytest.mark.timeout(2 * 60 * 60) +def test_replication_start_stop( + pg_bin: PgBin, + test_output_dir: Path, + neon_api: NeonAPI, + pg_version: PgVersion, + zenbenchmark: NeonBenchmarker, +): + """ + Cycles through different configurations of read replicas being enabled disabled. The whole time, + there's a pgbench read/write workload going on the master. For each replica, we either turn it + on or off, and see how long it takes to catch up after some set amount of time of replicating + the pgbench. 
+ """ + + prefix = "pgbench_agg" + num_replicas = 2 + configuration_test_time_sec = 10 * 60 + pgbench_duration = f"-T{2 ** num_replicas * configuration_test_time_sec}" + error_occurred = False + + project = neon_api.create_project(pg_version) + project_id = project["project"]["id"] + neon_api.wait_for_operation_to_finish(project_id) + try: + branch_id = project["branch"]["id"] + master_connstr = project["connection_uris"][0]["connection_uri"] + master_env = connection_parameters_to_env( + project["connection_uris"][0]["connection_parameters"] + ) + + replicas = [] + for _ in range(num_replicas): + replicas.append( + neon_api.create_endpoint( + project_id, + branch_id, + endpoint_type="read_only", + settings={"pg_settings": {"hot_standby_feedback": "on"}}, + ) + ) + neon_api.wait_for_operation_to_finish(project_id) + + replica_connstr = [ + neon_api.get_connection_uri( + project_id, + endpoint_id=replicas[i]["endpoint"]["id"], + )["uri"] + for i in range(num_replicas) + ] + replica_env = [master_env.copy() for _ in range(num_replicas)] + for i in range(num_replicas): + replica_env[i]["PGHOST"] = replicas[i]["endpoint"]["host"] + + pg_bin.run_capture(["pgbench", "-i", "-s10"], env=master_env) + + # Sync replicas + with psycopg2.connect(master_connstr) as conn_master: + with conn_master.cursor() as cur_master: + for i in range(num_replicas): + conn_replica = psycopg2.connect(replica_connstr[i]) + measure_replication_lag(cur_master, conn_replica.cursor()) + + master_pgbench = pg_bin.run_nonblocking( + [ + "pgbench", + "-c10", + pgbench_duration, + "-Mprepared", + "--log", + f"--log-prefix={test_output_dir}/{prefix}_master", + f"--aggregate-interval={configuration_test_time_sec}", + ], + env=master_env, + ) + replica_pgbench: List[Optional[subprocess.Popen[Any]]] = [None for _ in range(num_replicas)] + + # Use the bits of iconfig to tell us which configuration we are on. For example + # a iconfig of 2 is 10 in binary, indicating replica 0 is suspended and replica 1 is + # alive. 
+ for iconfig in range((1 << num_replicas) - 1, -1, -1): + + def replica_enabled(iconfig: int = iconfig): + return bool((iconfig >> 1) & 1) + + # Change configuration + for ireplica in range(num_replicas): + if replica_enabled() and replica_pgbench[ireplica] is None: + replica_pgbench[ireplica] = pg_bin.run_nonblocking( + [ + "pgbench", + "-c10", + "-S", + pgbench_duration, + "--log", + f"--log-prefix={test_output_dir}/{prefix}_replica_{ireplica}", + f"--aggregate-interval={configuration_test_time_sec}", + ], + env=replica_env[ireplica], + ) + elif not replica_enabled() and replica_pgbench[ireplica] is not None: + pgb = replica_pgbench[ireplica] + assert pgb is not None + pgb.terminate() + pgb.wait() + replica_pgbench[ireplica] = None + + neon_api.suspend_endpoint( + project_id, + replicas[ireplica]["endpoint"]["id"], + ) + neon_api.wait_for_operation_to_finish(project_id) + + time.sleep(configuration_test_time_sec) + + with psycopg2.connect(master_connstr) as conn_master: + with conn_master.cursor() as cur_master: + for ireplica in range(num_replicas): + replica_conn = psycopg2.connect(replica_connstr[ireplica]) + lag = measure_replication_lag(cur_master, replica_conn.cursor()) + zenbenchmark.record( + f"Replica {ireplica} lag", lag, "s", MetricReport.LOWER_IS_BETTER + ) + log.info( + f"Replica {ireplica} lagging behind master by {lag} seconds after configuration {iconfig:>b}" + ) + master_pgbench.terminate() + except Exception as e: + error_occurred = True + log.error(f"Caught exception {e}") + log.error(traceback.format_exc()) + finally: + assert not error_occurred + neon_api.delete_project(project_id) + # Only report results if we didn't error out + report_pgbench_aggregate_intervals(test_output_dir, prefix, zenbenchmark) From fcdf060816b50efe840907748fe3d856277a4e80 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 8 Jul 2024 15:39:41 +0100 Subject: [PATCH 1143/1571] pageserver: respect has_relmap_file in collect_keyspace (#8276) ## Problem Rarely, a dbdir entry can exist with no `relmap_file_key` data. This causes compaction to fail, because it assumes that if the database exists, then so does the relmap file. Basebackup already handled this using a boolean to record whether such a key exists, but `collect_keyspace` didn't. 
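The fix amounts to making the relmap key conditional on the per-database flag. Below is a minimal, self-contained sketch of that guard; `Oid`, the two key constructors, and `collect_db_keys` are stand-ins rather than the real pageserver API, and the actual change to `collect_keyspace` appears in the diff further down:

```rust
use std::collections::BTreeMap;

// Stand-in for the postgres OID type used by the real code.
type Oid = u32;

// Stand-ins for the real key constructors in pgdatadir_mapping.rs.
fn relmap_file_key(spcnode: Oid, dbnode: Oid) -> String {
    format!("relmap_file:{spcnode}:{dbnode}")
}

fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> String {
    format!("rel_dir:{spcnode}:{dbnode}")
}

/// Collect per-database keys from dbdir entries; the bool records whether a
/// relmap file was ever written for that database.
fn collect_db_keys(dbdirs: &BTreeMap<(Oid, Oid), bool>) -> Vec<String> {
    let mut keys = Vec::new();
    for (&(spcnode, dbnode), &has_relmap_file) in dbdirs {
        // Only claim the relmap key if the entry actually has one,
        // mirroring what basebackup already does.
        if has_relmap_file {
            keys.push(relmap_file_key(spcnode, dbnode));
        }
        // The rel-directory key is expected for every database.
        keys.push(rel_dir_to_key(spcnode, dbnode));
    }
    keys
}
```

Only the relmap key becomes conditional; the rel-dir key and the rest of the keyspace walk are unchanged.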
## Summary of changes - Respect the flag for whether a relfilemap exists in collect_keyspace - The reproducer for this issue will merge separately in https://github.com/neondatabase/neon/pull/8232 --- pageserver/src/pgdatadir_mapping.rs | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index fefd8d88ff..8a6cfea92b 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -854,13 +854,14 @@ impl Timeline { result.add_key(DBDIR_KEY); // Fetch list of database dirs and iterate them - let buf = self.get(DBDIR_KEY, lsn, ctx).await?; - let dbdir = DbDirectory::des(&buf)?; + let dbdir = self.list_dbdirs(lsn, ctx).await?; + let mut dbs: Vec<((Oid, Oid), bool)> = dbdir.into_iter().collect(); - let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect(); - dbs.sort_unstable(); - for (spcnode, dbnode) in dbs { - result.add_key(relmap_file_key(spcnode, dbnode)); + dbs.sort_unstable_by(|(k_a, _), (k_b, _)| k_a.cmp(k_b)); + for ((spcnode, dbnode), has_relmap_file) in dbs { + if has_relmap_file { + result.add_key(relmap_file_key(spcnode, dbnode)); + } result.add_key(rel_dir_to_key(spcnode, dbnode)); let mut rels: Vec = self From a68edad913fa54d3d12f0cbd6816b7b3ab8d7676 Mon Sep 17 00:00:00 2001 From: Yuchen Liang <70461588+yliang412@users.noreply.github.com> Date: Mon, 8 Jul 2024 10:43:10 -0400 Subject: [PATCH 1144/1571] refactor: move part of sharding API from `pageserver_api` to `utils` (#8254) ## Problem LSN Leases introduced in #8084 is a new API that is made shard-aware from day 1. To support ephemeral endpoint in #7994 without linking Postgres C API against `compute_ctl`, part of the sharding needs to reside in `utils`. ## Summary of changes - Create a new `shard` module in utils crate. - Move more interface related part of tenant sharding API to utils and re-export them in pageserver_api. Signed-off-by: Yuchen Liang --- libs/pageserver_api/src/shard.rs | 516 +++---------------------------- libs/utils/src/lib.rs | 2 + libs/utils/src/shard.rs | 451 +++++++++++++++++++++++++++ 3 files changed, 490 insertions(+), 479 deletions(-) create mode 100644 libs/utils/src/shard.rs diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 8c5a4e6168..e83cf4c855 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -1,59 +1,42 @@ -use std::{ops::RangeInclusive, str::FromStr}; +//! See docs/rfcs/031-sharding-static.md for an overview of sharding. +//! +//! This module contains a variety of types used to represent the concept of sharding +//! a Neon tenant across multiple physical shards. Since there are quite a few of these, +//! we provide an summary here. +//! +//! Types used to describe shards: +//! - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value +//! which identifies a tenant which is not shard-aware. This means its storage paths do not include +//! a shard suffix. +//! - [`ShardNumber`] is simply the zero-based index of a shard within a tenant. +//! - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId` +//! without the tenant ID. This is useful for things that are implicitly scoped to a particular +//! tenant, such as layer files. +//! - [`ShardIdentity`]` is the full description of a particular shard's parameters, in sufficient +//! 
detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read. +//! - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as +//! four hex digits. An unsharded tenant is `0000`. +//! - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant +//! +//! Types used to describe the parameters for data distribution in a sharded tenant: +//! - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across +//! multiple shards. Its value is given in 8kiB pages. +//! - [`ShardLayout`] describes the data distribution scheme, and at time of writing is +//! always zero: this is provided for future upgrades that might introduce different +//! data distribution schemes. +//! +//! Examples: +//! - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000 +//! - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001 +//! - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive), +//! and their slugs are 0004, 0104, 0204, and 0304. use crate::{key::Key, models::ShardParameters}; -use hex::FromHex; use postgres_ffi::relfile_utils::INIT_FORKNUM; use serde::{Deserialize, Serialize}; -use utils::id::TenantId; -/// See docs/rfcs/031-sharding-static.md for an overview of sharding. -/// -/// This module contains a variety of types used to represent the concept of sharding -/// a Neon tenant across multiple physical shards. Since there are quite a few of these, -/// we provide an summary here. -/// -/// Types used to describe shards: -/// - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value -/// which identifies a tenant which is not shard-aware. This means its storage paths do not include -/// a shard suffix. -/// - [`ShardNumber`] is simply the zero-based index of a shard within a tenant. -/// - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId` -/// without the tenant ID. This is useful for things that are implicitly scoped to a particular -/// tenant, such as layer files. -/// - [`ShardIdentity`]` is the full description of a particular shard's parameters, in sufficient -/// detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read. -/// - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as -/// four hex digits. An unsharded tenant is `0000`. -/// - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant -/// -/// Types used to describe the parameters for data distribution in a sharded tenant: -/// - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across -/// multiple shards. Its value is given in 8kiB pages. -/// - [`ShardLayout`] describes the data distribution scheme, and at time of writing is -/// always zero: this is provided for future upgrades that might introduce different -/// data distribution schemes. -/// -/// Examples: -/// - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000 -/// - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001 -/// - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive), -/// and their slugs are 0004, 0104, 0204, and 0304. 
- -#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] -pub struct ShardNumber(pub u8); - -#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] -pub struct ShardCount(u8); - -/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant, -/// when we need to know which shard we're dealing with, but do not need to know the full -/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know -/// the fully qualified TenantShardId. -#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] -pub struct ShardIndex { - pub shard_number: ShardNumber, - pub shard_count: ShardCount, -} +#[doc(inline)] +pub use ::utils::shard::*; /// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`], /// and to check whether that [`ShardNumber`] is the same as the current shard. @@ -65,362 +48,6 @@ pub struct ShardIdentity { layout: ShardLayout, } -/// Formatting helper, for generating the `shard_id` label in traces. -struct ShardSlug<'a>(&'a TenantShardId); - -/// TenantShardId globally identifies a particular shard in a particular tenant. -/// -/// These are written as `-`, for example: -/// # The second shard in a two-shard tenant -/// 072f1291a5310026820b2fe4b2968934-0102 -/// -/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without -/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables -/// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`]. -/// -/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs, -/// is both forward and backward compatible with TenantId: a legacy TenantId can be -/// decoded as a TenantShardId, and when re-encoded it will be parseable -/// as a TenantId. -#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] -pub struct TenantShardId { - pub tenant_id: TenantId, - pub shard_number: ShardNumber, - pub shard_count: ShardCount, -} - -impl ShardCount { - pub const MAX: Self = Self(u8::MAX); - - /// The internal value of a ShardCount may be zero, which means "1 shard, but use - /// legacy format for TenantShardId that excludes the shard suffix", also known - /// as [`TenantShardId::unsharded`]. - /// - /// This method returns the actual number of shards, i.e. if our internal value is - /// zero, we return 1 (unsharded tenants have 1 shard). - pub fn count(&self) -> u8 { - if self.0 > 0 { - self.0 - } else { - 1 - } - } - - /// The literal internal value: this is **not** the number of shards in the - /// tenant, as we have a special zero value for legacy unsharded tenants. Use - /// [`Self::count`] if you want to know the cardinality of shards. - pub fn literal(&self) -> u8 { - self.0 - } - - /// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but - /// uses the legacy format for `TenantShardId`. See also the documentation for - /// [`Self::count`]. - pub fn is_unsharded(&self) -> bool { - self.0 == 0 - } - - /// `v` may be zero, or the number of shards in the tenant. `v` is what - /// [`Self::literal`] would return. - pub const fn new(val: u8) -> Self { - Self(val) - } -} - -impl ShardNumber { - pub const MAX: Self = Self(u8::MAX); -} - -impl TenantShardId { - pub fn unsharded(tenant_id: TenantId) -> Self { - Self { - tenant_id, - shard_number: ShardNumber(0), - shard_count: ShardCount(0), - } - } - - /// The range of all TenantShardId that belong to a particular TenantId. 
This is useful when - /// you have a BTreeMap of TenantShardId, and are querying by TenantId. - pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive { - RangeInclusive::new( - Self { - tenant_id, - shard_number: ShardNumber(0), - shard_count: ShardCount(0), - }, - Self { - tenant_id, - shard_number: ShardNumber::MAX, - shard_count: ShardCount::MAX, - }, - ) - } - - pub fn shard_slug(&self) -> impl std::fmt::Display + '_ { - ShardSlug(self) - } - - /// Convenience for code that has special behavior on the 0th shard. - pub fn is_shard_zero(&self) -> bool { - self.shard_number == ShardNumber(0) - } - - /// The "unsharded" value is distinct from simply having a single shard: it represents - /// a tenant which is not shard-aware at all, and whose storage paths will not include - /// a shard suffix. - pub fn is_unsharded(&self) -> bool { - self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded() - } - - /// Convenience for dropping the tenant_id and just getting the ShardIndex: this - /// is useful when logging from code that is already in a span that includes tenant ID, to - /// keep messages reasonably terse. - pub fn to_index(&self) -> ShardIndex { - ShardIndex { - shard_number: self.shard_number, - shard_count: self.shard_count, - } - } - - /// Calculate the children of this TenantShardId when splitting the overall tenant into - /// the given number of shards. - pub fn split(&self, new_shard_count: ShardCount) -> Vec { - let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1); - let mut child_shards = Vec::new(); - for shard_number in 0..ShardNumber(new_shard_count.0).0 { - // Key mapping is based on a round robin mapping of key hash modulo shard count, - // so our child shards are the ones which the same keys would map to. - if shard_number % effective_old_shard_count == self.shard_number.0 { - child_shards.push(TenantShardId { - tenant_id: self.tenant_id, - shard_number: ShardNumber(shard_number), - shard_count: new_shard_count, - }) - } - } - - child_shards - } -} - -impl<'a> std::fmt::Display for ShardSlug<'a> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "{:02x}{:02x}", - self.0.shard_number.0, self.0.shard_count.0 - ) - } -} - -impl std::fmt::Display for TenantShardId { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - if self.shard_count != ShardCount(0) { - write!(f, "{}-{}", self.tenant_id, self.shard_slug()) - } else { - // Legacy case (shard_count == 0) -- format as just the tenant id. Note that this - // is distinct from the normal single shard case (shard count == 1). 
- self.tenant_id.fmt(f) - } - } -} - -impl std::fmt::Debug for TenantShardId { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - // Debug is the same as Display: the compact hex representation - write!(f, "{}", self) - } -} - -impl std::str::FromStr for TenantShardId { - type Err = hex::FromHexError; - - fn from_str(s: &str) -> Result { - // Expect format: 16 byte TenantId, '-', 1 byte shard number, 1 byte shard count - if s.len() == 32 { - // Legacy case: no shard specified - Ok(Self { - tenant_id: TenantId::from_str(s)?, - shard_number: ShardNumber(0), - shard_count: ShardCount(0), - }) - } else if s.len() == 37 { - let bytes = s.as_bytes(); - let tenant_id = TenantId::from_hex(&bytes[0..32])?; - let mut shard_parts: [u8; 2] = [0u8; 2]; - hex::decode_to_slice(&bytes[33..37], &mut shard_parts)?; - Ok(Self { - tenant_id, - shard_number: ShardNumber(shard_parts[0]), - shard_count: ShardCount(shard_parts[1]), - }) - } else { - Err(hex::FromHexError::InvalidStringLength) - } - } -} - -impl From<[u8; 18]> for TenantShardId { - fn from(b: [u8; 18]) -> Self { - let tenant_id_bytes: [u8; 16] = b[0..16].try_into().unwrap(); - - Self { - tenant_id: TenantId::from(tenant_id_bytes), - shard_number: ShardNumber(b[16]), - shard_count: ShardCount(b[17]), - } - } -} - -impl ShardIndex { - pub fn new(number: ShardNumber, count: ShardCount) -> Self { - Self { - shard_number: number, - shard_count: count, - } - } - pub fn unsharded() -> Self { - Self { - shard_number: ShardNumber(0), - shard_count: ShardCount(0), - } - } - - /// The "unsharded" value is distinct from simply having a single shard: it represents - /// a tenant which is not shard-aware at all, and whose storage paths will not include - /// a shard suffix. - pub fn is_unsharded(&self) -> bool { - self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0) - } - - /// For use in constructing remote storage paths: concatenate this with a TenantId - /// to get a fully qualified TenantShardId. - /// - /// Backward compat: this function returns an empty string if Self::is_unsharded, such - /// that the legacy pre-sharding remote key format is preserved. 
- pub fn get_suffix(&self) -> String { - if self.is_unsharded() { - "".to_string() - } else { - format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0) - } - } -} - -impl std::fmt::Display for ShardIndex { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0) - } -} - -impl std::fmt::Debug for ShardIndex { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - // Debug is the same as Display: the compact hex representation - write!(f, "{}", self) - } -} - -impl std::str::FromStr for ShardIndex { - type Err = hex::FromHexError; - - fn from_str(s: &str) -> Result { - // Expect format: 1 byte shard number, 1 byte shard count - if s.len() == 4 { - let bytes = s.as_bytes(); - let mut shard_parts: [u8; 2] = [0u8; 2]; - hex::decode_to_slice(bytes, &mut shard_parts)?; - Ok(Self { - shard_number: ShardNumber(shard_parts[0]), - shard_count: ShardCount(shard_parts[1]), - }) - } else { - Err(hex::FromHexError::InvalidStringLength) - } - } -} - -impl From<[u8; 2]> for ShardIndex { - fn from(b: [u8; 2]) -> Self { - Self { - shard_number: ShardNumber(b[0]), - shard_count: ShardCount(b[1]), - } - } -} - -impl Serialize for TenantShardId { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - if serializer.is_human_readable() { - serializer.collect_str(self) - } else { - // Note: while human encoding of [`TenantShardId`] is backward and forward - // compatible, this binary encoding is not. - let mut packed: [u8; 18] = [0; 18]; - packed[0..16].clone_from_slice(&self.tenant_id.as_arr()); - packed[16] = self.shard_number.0; - packed[17] = self.shard_count.0; - - packed.serialize(serializer) - } - } -} - -impl<'de> Deserialize<'de> for TenantShardId { - fn deserialize(deserializer: D) -> Result - where - D: serde::Deserializer<'de>, - { - struct IdVisitor { - is_human_readable_deserializer: bool, - } - - impl<'de> serde::de::Visitor<'de> for IdVisitor { - type Value = TenantShardId; - - fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { - if self.is_human_readable_deserializer { - formatter.write_str("value in form of hex string") - } else { - formatter.write_str("value in form of integer array([u8; 18])") - } - } - - fn visit_seq(self, seq: A) -> Result - where - A: serde::de::SeqAccess<'de>, - { - let s = serde::de::value::SeqAccessDeserializer::new(seq); - let id: [u8; 18] = Deserialize::deserialize(s)?; - Ok(TenantShardId::from(id)) - } - - fn visit_str(self, v: &str) -> Result - where - E: serde::de::Error, - { - TenantShardId::from_str(v).map_err(E::custom) - } - } - - if deserializer.is_human_readable() { - deserializer.deserialize_str(IdVisitor { - is_human_readable_deserializer: true, - }) - } else { - deserializer.deserialize_tuple( - 18, - IdVisitor { - is_human_readable_deserializer: false, - }, - ) - } - } -} - /// Stripe size in number of pages #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] pub struct ShardStripeSize(pub u32); @@ -585,77 +212,6 @@ impl ShardIdentity { } } -impl Serialize for ShardIndex { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - if serializer.is_human_readable() { - serializer.collect_str(self) - } else { - // Binary encoding is not used in index_part.json, but is included in anticipation of - // switching various structures (e.g. inter-process communication, remote metadata) to more - // compact binary encodings in future. 
- let mut packed: [u8; 2] = [0; 2]; - packed[0] = self.shard_number.0; - packed[1] = self.shard_count.0; - packed.serialize(serializer) - } - } -} - -impl<'de> Deserialize<'de> for ShardIndex { - fn deserialize(deserializer: D) -> Result - where - D: serde::Deserializer<'de>, - { - struct IdVisitor { - is_human_readable_deserializer: bool, - } - - impl<'de> serde::de::Visitor<'de> for IdVisitor { - type Value = ShardIndex; - - fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { - if self.is_human_readable_deserializer { - formatter.write_str("value in form of hex string") - } else { - formatter.write_str("value in form of integer array([u8; 2])") - } - } - - fn visit_seq(self, seq: A) -> Result - where - A: serde::de::SeqAccess<'de>, - { - let s = serde::de::value::SeqAccessDeserializer::new(seq); - let id: [u8; 2] = Deserialize::deserialize(s)?; - Ok(ShardIndex::from(id)) - } - - fn visit_str(self, v: &str) -> Result - where - E: serde::de::Error, - { - ShardIndex::from_str(v).map_err(E::custom) - } - } - - if deserializer.is_human_readable() { - deserializer.deserialize_str(IdVisitor { - is_human_readable_deserializer: true, - }) - } else { - deserializer.deserialize_tuple( - 2, - IdVisitor { - is_human_readable_deserializer: false, - }, - ) - } - } -} - /// Whether this key is always held on shard 0 (e.g. shard 0 holds all SLRU keys /// in order to be able to serve basebackup requests without peer communication). fn key_is_shard0(key: &Key) -> bool { @@ -737,7 +293,9 @@ pub fn describe( #[cfg(test)] mod tests { - use utils::Hex; + use std::str::FromStr; + + use utils::{id::TenantId, Hex}; use super::*; diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 2a397d97d2..711e617801 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -26,6 +26,8 @@ pub mod auth; // utility functions and helper traits for unified unique id generation/serialization etc. pub mod id; +pub mod shard; + mod hex; pub use hex::Hex; diff --git a/libs/utils/src/shard.rs b/libs/utils/src/shard.rs new file mode 100644 index 0000000000..4f9ac6bdb4 --- /dev/null +++ b/libs/utils/src/shard.rs @@ -0,0 +1,451 @@ +//! See `pageserver_api::shard` for description on sharding. + +use std::{ops::RangeInclusive, str::FromStr}; + +use hex::FromHex; +use serde::{Deserialize, Serialize}; + +use crate::id::TenantId; + +#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] +pub struct ShardNumber(pub u8); + +#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] +pub struct ShardCount(pub u8); + +/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant, +/// when we need to know which shard we're dealing with, but do not need to know the full +/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know +/// the fully qualified TenantShardId. +#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] +pub struct ShardIndex { + pub shard_number: ShardNumber, + pub shard_count: ShardCount, +} + +/// Formatting helper, for generating the `shard_id` label in traces. +pub struct ShardSlug<'a>(&'a TenantShardId); + +/// TenantShardId globally identifies a particular shard in a particular tenant. 
+/// +/// These are written as `-`, for example: +/// # The second shard in a two-shard tenant +/// 072f1291a5310026820b2fe4b2968934-0102 +/// +/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without +/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables +/// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`]. +/// +/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs, +/// is both forward and backward compatible with TenantId: a legacy TenantId can be +/// decoded as a TenantShardId, and when re-encoded it will be parseable +/// as a TenantId. +#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] +pub struct TenantShardId { + pub tenant_id: TenantId, + pub shard_number: ShardNumber, + pub shard_count: ShardCount, +} + +impl ShardCount { + pub const MAX: Self = Self(u8::MAX); + + /// The internal value of a ShardCount may be zero, which means "1 shard, but use + /// legacy format for TenantShardId that excludes the shard suffix", also known + /// as [`TenantShardId::unsharded`]. + /// + /// This method returns the actual number of shards, i.e. if our internal value is + /// zero, we return 1 (unsharded tenants have 1 shard). + pub fn count(&self) -> u8 { + if self.0 > 0 { + self.0 + } else { + 1 + } + } + + /// The literal internal value: this is **not** the number of shards in the + /// tenant, as we have a special zero value for legacy unsharded tenants. Use + /// [`Self::count`] if you want to know the cardinality of shards. + pub fn literal(&self) -> u8 { + self.0 + } + + /// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but + /// uses the legacy format for `TenantShardId`. See also the documentation for + /// [`Self::count`]. + pub fn is_unsharded(&self) -> bool { + self.0 == 0 + } + + /// `v` may be zero, or the number of shards in the tenant. `v` is what + /// [`Self::literal`] would return. + pub const fn new(val: u8) -> Self { + Self(val) + } +} + +impl ShardNumber { + pub const MAX: Self = Self(u8::MAX); +} + +impl TenantShardId { + pub fn unsharded(tenant_id: TenantId) -> Self { + Self { + tenant_id, + shard_number: ShardNumber(0), + shard_count: ShardCount(0), + } + } + + /// The range of all TenantShardId that belong to a particular TenantId. This is useful when + /// you have a BTreeMap of TenantShardId, and are querying by TenantId. + pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive { + RangeInclusive::new( + Self { + tenant_id, + shard_number: ShardNumber(0), + shard_count: ShardCount(0), + }, + Self { + tenant_id, + shard_number: ShardNumber::MAX, + shard_count: ShardCount::MAX, + }, + ) + } + + pub fn shard_slug(&self) -> impl std::fmt::Display + '_ { + ShardSlug(self) + } + + /// Convenience for code that has special behavior on the 0th shard. + pub fn is_shard_zero(&self) -> bool { + self.shard_number == ShardNumber(0) + } + + /// The "unsharded" value is distinct from simply having a single shard: it represents + /// a tenant which is not shard-aware at all, and whose storage paths will not include + /// a shard suffix. + pub fn is_unsharded(&self) -> bool { + self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded() + } + + /// Convenience for dropping the tenant_id and just getting the ShardIndex: this + /// is useful when logging from code that is already in a span that includes tenant ID, to + /// keep messages reasonably terse. 
+ pub fn to_index(&self) -> ShardIndex { + ShardIndex { + shard_number: self.shard_number, + shard_count: self.shard_count, + } + } + + /// Calculate the children of this TenantShardId when splitting the overall tenant into + /// the given number of shards. + pub fn split(&self, new_shard_count: ShardCount) -> Vec { + let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1); + let mut child_shards = Vec::new(); + for shard_number in 0..ShardNumber(new_shard_count.0).0 { + // Key mapping is based on a round robin mapping of key hash modulo shard count, + // so our child shards are the ones which the same keys would map to. + if shard_number % effective_old_shard_count == self.shard_number.0 { + child_shards.push(TenantShardId { + tenant_id: self.tenant_id, + shard_number: ShardNumber(shard_number), + shard_count: new_shard_count, + }) + } + } + + child_shards + } +} + +impl<'a> std::fmt::Display for ShardSlug<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{:02x}{:02x}", + self.0.shard_number.0, self.0.shard_count.0 + ) + } +} + +impl std::fmt::Display for TenantShardId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if self.shard_count != ShardCount(0) { + write!(f, "{}-{}", self.tenant_id, self.shard_slug()) + } else { + // Legacy case (shard_count == 0) -- format as just the tenant id. Note that this + // is distinct from the normal single shard case (shard count == 1). + self.tenant_id.fmt(f) + } + } +} + +impl std::fmt::Debug for TenantShardId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + // Debug is the same as Display: the compact hex representation + write!(f, "{}", self) + } +} + +impl std::str::FromStr for TenantShardId { + type Err = hex::FromHexError; + + fn from_str(s: &str) -> Result { + // Expect format: 16 byte TenantId, '-', 1 byte shard number, 1 byte shard count + if s.len() == 32 { + // Legacy case: no shard specified + Ok(Self { + tenant_id: TenantId::from_str(s)?, + shard_number: ShardNumber(0), + shard_count: ShardCount(0), + }) + } else if s.len() == 37 { + let bytes = s.as_bytes(); + let tenant_id = TenantId::from_hex(&bytes[0..32])?; + let mut shard_parts: [u8; 2] = [0u8; 2]; + hex::decode_to_slice(&bytes[33..37], &mut shard_parts)?; + Ok(Self { + tenant_id, + shard_number: ShardNumber(shard_parts[0]), + shard_count: ShardCount(shard_parts[1]), + }) + } else { + Err(hex::FromHexError::InvalidStringLength) + } + } +} + +impl From<[u8; 18]> for TenantShardId { + fn from(b: [u8; 18]) -> Self { + let tenant_id_bytes: [u8; 16] = b[0..16].try_into().unwrap(); + + Self { + tenant_id: TenantId::from(tenant_id_bytes), + shard_number: ShardNumber(b[16]), + shard_count: ShardCount(b[17]), + } + } +} + +impl ShardIndex { + pub fn new(number: ShardNumber, count: ShardCount) -> Self { + Self { + shard_number: number, + shard_count: count, + } + } + pub fn unsharded() -> Self { + Self { + shard_number: ShardNumber(0), + shard_count: ShardCount(0), + } + } + + /// The "unsharded" value is distinct from simply having a single shard: it represents + /// a tenant which is not shard-aware at all, and whose storage paths will not include + /// a shard suffix. + pub fn is_unsharded(&self) -> bool { + self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0) + } + + /// For use in constructing remote storage paths: concatenate this with a TenantId + /// to get a fully qualified TenantShardId. 
+ /// + /// Backward compat: this function returns an empty string if Self::is_unsharded, such + /// that the legacy pre-sharding remote key format is preserved. + pub fn get_suffix(&self) -> String { + if self.is_unsharded() { + "".to_string() + } else { + format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0) + } + } +} + +impl std::fmt::Display for ShardIndex { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0) + } +} + +impl std::fmt::Debug for ShardIndex { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + // Debug is the same as Display: the compact hex representation + write!(f, "{}", self) + } +} + +impl std::str::FromStr for ShardIndex { + type Err = hex::FromHexError; + + fn from_str(s: &str) -> Result { + // Expect format: 1 byte shard number, 1 byte shard count + if s.len() == 4 { + let bytes = s.as_bytes(); + let mut shard_parts: [u8; 2] = [0u8; 2]; + hex::decode_to_slice(bytes, &mut shard_parts)?; + Ok(Self { + shard_number: ShardNumber(shard_parts[0]), + shard_count: ShardCount(shard_parts[1]), + }) + } else { + Err(hex::FromHexError::InvalidStringLength) + } + } +} + +impl From<[u8; 2]> for ShardIndex { + fn from(b: [u8; 2]) -> Self { + Self { + shard_number: ShardNumber(b[0]), + shard_count: ShardCount(b[1]), + } + } +} + +impl Serialize for TenantShardId { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + if serializer.is_human_readable() { + serializer.collect_str(self) + } else { + // Note: while human encoding of [`TenantShardId`] is backward and forward + // compatible, this binary encoding is not. + let mut packed: [u8; 18] = [0; 18]; + packed[0..16].clone_from_slice(&self.tenant_id.as_arr()); + packed[16] = self.shard_number.0; + packed[17] = self.shard_count.0; + + packed.serialize(serializer) + } + } +} + +impl<'de> Deserialize<'de> for TenantShardId { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + struct IdVisitor { + is_human_readable_deserializer: bool, + } + + impl<'de> serde::de::Visitor<'de> for IdVisitor { + type Value = TenantShardId; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + if self.is_human_readable_deserializer { + formatter.write_str("value in form of hex string") + } else { + formatter.write_str("value in form of integer array([u8; 18])") + } + } + + fn visit_seq(self, seq: A) -> Result + where + A: serde::de::SeqAccess<'de>, + { + let s = serde::de::value::SeqAccessDeserializer::new(seq); + let id: [u8; 18] = Deserialize::deserialize(s)?; + Ok(TenantShardId::from(id)) + } + + fn visit_str(self, v: &str) -> Result + where + E: serde::de::Error, + { + TenantShardId::from_str(v).map_err(E::custom) + } + } + + if deserializer.is_human_readable() { + deserializer.deserialize_str(IdVisitor { + is_human_readable_deserializer: true, + }) + } else { + deserializer.deserialize_tuple( + 18, + IdVisitor { + is_human_readable_deserializer: false, + }, + ) + } + } +} + +impl Serialize for ShardIndex { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + if serializer.is_human_readable() { + serializer.collect_str(self) + } else { + // Binary encoding is not used in index_part.json, but is included in anticipation of + // switching various structures (e.g. inter-process communication, remote metadata) to more + // compact binary encodings in future. 
+ let mut packed: [u8; 2] = [0; 2]; + packed[0] = self.shard_number.0; + packed[1] = self.shard_count.0; + packed.serialize(serializer) + } + } +} + +impl<'de> Deserialize<'de> for ShardIndex { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + struct IdVisitor { + is_human_readable_deserializer: bool, + } + + impl<'de> serde::de::Visitor<'de> for IdVisitor { + type Value = ShardIndex; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + if self.is_human_readable_deserializer { + formatter.write_str("value in form of hex string") + } else { + formatter.write_str("value in form of integer array([u8; 2])") + } + } + + fn visit_seq(self, seq: A) -> Result + where + A: serde::de::SeqAccess<'de>, + { + let s = serde::de::value::SeqAccessDeserializer::new(seq); + let id: [u8; 2] = Deserialize::deserialize(s)?; + Ok(ShardIndex::from(id)) + } + + fn visit_str(self, v: &str) -> Result + where + E: serde::de::Error, + { + ShardIndex::from_str(v).map_err(E::custom) + } + } + + if deserializer.is_human_readable() { + deserializer.deserialize_str(IdVisitor { + is_human_readable_deserializer: true, + }) + } else { + deserializer.deserialize_tuple( + 2, + IdVisitor { + is_human_readable_deserializer: false, + }, + ) + } + } +} From 84b039e615e9e7391e22e97fc5ee306cac29385b Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Mon, 8 Jul 2024 19:54:02 +0200 Subject: [PATCH 1145/1571] compute_ctl: Use 'fast' shutdown for Postgres termination (#8289) ## Problem We currently use 'immediate' mode in the most commonly used shutdown path, when the control plane calls a `compute_ctl` API to terminate Postgres inside compute without waiting for the actual pod / VM termination. Yet, 'immediate' shutdown doesn't create a shutdown checkpoint and ROs have bad times figuring out the list of running xacts during next start. ## Summary of changes Use 'fast' mode, which creates a shutdown checkpoint that is important for ROs to get a list of running xacts faster instead of going through the CLOG. On the control plane side, we poll this `compute_ctl` termination API for 10s, it should be enough as we don't really write any data at checkpoint time. If it times out, we anyway switch to the slow k8s-based termination. See https://www.postgresql.org/docs/current/server-shutdown.html for the list of modes and signals. The default VM shutdown hook already uses `fast` mode, see [1] [1] https://github.com/neondatabase/neon/blob/c9fd8d76937c2031fd4fea1cdf661d6cf4f00dc3/vm-image-spec.yaml#L30-L31 Related to #6211 --- compute_tools/src/compute.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 41a52ef5b6..1fa2b9f71d 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -1386,7 +1386,9 @@ pub fn forward_termination_signal() { let pg_pid = PG_PID.load(Ordering::SeqCst); if pg_pid != 0 { let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32); - // use 'immediate' shutdown (SIGQUIT): https://www.postgresql.org/docs/current/server-shutdown.html - kill(pg_pid, Signal::SIGQUIT).ok(); + // Use 'fast' shutdown (SIGINT) because it also creates a shutdown checkpoint, which is important for + // ROs to get a list of running xacts faster instead of going through the CLOG. + // See https://www.postgresql.org/docs/current/server-shutdown.html for the list of modes and signals. 
+ kill(pg_pid, Signal::SIGINT).ok(); } } From daea26a22f98ca2399f55c0db7eb8932865d7ede Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 8 Jul 2024 20:05:35 +0100 Subject: [PATCH 1146/1571] tests: use smaller layers in test_pg_regress (#8232) ## Problem Debug-mode runs of test_pg_regress are rather slow since https://github.com/neondatabase/neon/pull/8105, and occasionally exceed their 600s timeout. ## Summary of changes - Use 8MiB layer files, avoiding large ephemeral layers On a hetzner AX102, this takes the runtime from 230s to 190s. Which hopefully will be enough to get the runtime on github runners more reliably below its 600s timeout. This has the side benefit of exercising more of the pageserver stack (including compaction) under a workload that exercises a more diverse set of postgres functionality than most of our tests. --- pageserver/src/tenant/timeline.rs | 3 + test_runner/regress/test_pg_regress.py | 182 ++++++++++++++++--------- 2 files changed, 118 insertions(+), 67 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 92baf1073a..541704e8d6 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -728,6 +728,9 @@ impl From for CompactionError { fn from(e: CreateImageLayersError) -> Self { match e { CreateImageLayersError::Cancelled => CompactionError::ShuttingDown, + CreateImageLayersError::Other(e) => { + CompactionError::Other(e.context("create image layers")) + } _ => CompactionError::Other(e.into()), } } diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 756a2c17c9..54b493ec70 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -8,8 +8,11 @@ from typing import TYPE_CHECKING, cast import pytest from fixtures.neon_fixtures import ( + Endpoint, + NeonEnv, NeonEnvBuilder, check_restored_datadir_content, + tenant_get_shards, ) from fixtures.pg_version import PgVersion from fixtures.remote_storage import s3_storage @@ -21,6 +24,97 @@ if TYPE_CHECKING: from pytest import CaptureFixture +TENANT_CONF = { + # Scaled down thresholds so that we are exercising the pageserver beyond just writing + # ephemeral/L0 layers, and because debug-mode code is slow to read from full sized ephemeral layer files. + "pitr_interval": "60s", + "checkpoint_distance": f"{8 * 1024 * 1024}", + "compaction_target_size": f"{8 * 1024 * 1024}", +} + +# # Ensure that compaction works, on a timeline containing all the diversity that postgres regression tests create. +# # There should have been compactions mid-test as well, this final check is in addition those. +# for (shard, pageserver) in tenant_get_shards(env, env.initial_tenant): +# pageserver.http_client().timeline_checkpoint(env.initial_tenant, env.initial_timeline, force_repartition=True, force_image_layer_creation=True) + + +def post_checks(env: NeonEnv, test_output_dir: Path, db_name: str, endpoint: Endpoint): + """ + After running some opaque tests that create interesting content in a timeline, run + some generic integrity checks that the storage stack is able to reproduce the written + data properly. + """ + + ignored_files: Optional[list[str]] = None + + # Neon handles unlogged relations in a special manner. During a + # basebackup, we ship the init fork as the main fork. This presents a + # problem in that the endpoint's data directory and the basebackup will + # have differences and will fail the eventual file comparison. 
+ # + # Unlogged tables were introduced in version 9.1. ALTER TABLE grew + # support for setting the persistence of a table in 9.5. The reason that + # this doesn't affect versions < 15 (but probably would between 9.1 and + # 9.5) is that all the regression tests that deal with unlogged tables + # up until that point dropped the unlogged tables or set them to logged + # at some point during the test. + # + # In version 15, Postgres grew support for unlogged sequences, and with + # that came a few more regression tests. These tests did not all drop + # the unlogged tables/sequences prior to finishing. + # + # But unlogged sequences came with a bug in that, sequences didn't + # inherit the persistence of their "parent" tables if they had one. This + # was fixed and backported to 15, thus exacerbating our problem a bit. + # + # So what we can do is just ignore file differences between the data + # directory and basebackup for unlogged relations. + results = cast( + "list[tuple[str, str]]", + endpoint.safe_psql( + """ + SELECT + relkind, + pg_relation_filepath( + pg_filenode_relation(reltablespace, relfilenode) + ) AS unlogged_relation_paths + FROM pg_class + WHERE relpersistence = 'u' + """, + dbname=db_name, + ), + ) + + unlogged_relation_files: list[str] = [] + for r in results: + unlogged_relation_files.append(r[1]) + # This is related to the following Postgres commit: + # + # commit ccadf73163ca88bdaa74b8223d4dde05d17f550b + # Author: Heikki Linnakangas + # Date: 2023-08-23 09:21:31 -0500 + # + # Use the buffer cache when initializing an unlogged index. + # + # This patch was backpatched to 16. Without it, the LSN in the + # page header would be 0/0 in the data directory, which wouldn't + # match the LSN generated during the basebackup, thus creating + # a difference. + if env.pg_version <= PgVersion.V15 and r[0] == "i": + unlogged_relation_files.append(f"{r[1]}_init") + + ignored_files = unlogged_relation_files + + check_restored_datadir_content(test_output_dir, env, endpoint, ignored_files=ignored_files) + + # Ensure that compaction works, on a timeline containing all the diversity that postgres regression tests create. + # There should have been compactions mid-test as well, this final check is in addition those. + for shard, pageserver in tenant_get_shards(env, env.initial_tenant): + pageserver.http_client().timeline_checkpoint( + shard, env.initial_timeline, force_repartition=True, force_image_layer_creation=True + ) + + # Run the main PostgreSQL regression tests, in src/test/regress. # @pytest.mark.timeout(600) @@ -45,7 +139,10 @@ def test_pg_regress( neon_env_builder.enable_pageserver_remote_storage(s3_storage()) neon_env_builder.enable_scrub_on_exit() - env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + env = neon_env_builder.init_start( + initial_tenant_conf=TENANT_CONF, + initial_tenant_shard_count=shard_count, + ) # Connect to postgres and create a database called "regression". endpoint = env.endpoints.create_start("main") @@ -84,67 +181,7 @@ def test_pg_regress( with capsys.disabled(): pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath) - ignored_files: Optional[list[str]] = None - - # Neon handles unlogged relations in a special manner. During a - # basebackup, we ship the init fork as the main fork. This presents a - # problem in that the endpoint's data directory and the basebackup will - # have differences and will fail the eventual file comparison. - # - # Unlogged tables were introduced in version 9.1. 
ALTER TABLE grew - # support for setting the persistence of a table in 9.5. The reason that - # this doesn't affect versions < 15 (but probably would between 9.1 and - # 9.5) is that all the regression tests that deal with unlogged tables - # up until that point dropped the unlogged tables or set them to logged - # at some point during the test. - # - # In version 15, Postgres grew support for unlogged sequences, and with - # that came a few more regression tests. These tests did not all drop - # the unlogged tables/sequences prior to finishing. - # - # But unlogged sequences came with a bug in that, sequences didn't - # inherit the persistence of their "parent" tables if they had one. This - # was fixed and backported to 15, thus exacerbating our problem a bit. - # - # So what we can do is just ignore file differences between the data - # directory and basebackup for unlogged relations. - results = cast( - "list[tuple[str, str]]", - endpoint.safe_psql( - """ - SELECT - relkind, - pg_relation_filepath( - pg_filenode_relation(reltablespace, relfilenode) - ) AS unlogged_relation_paths - FROM pg_class - WHERE relpersistence = 'u' - """, - dbname=DBNAME, - ), - ) - - unlogged_relation_files: list[str] = [] - for r in results: - unlogged_relation_files.append(r[1]) - # This is related to the following Postgres commit: - # - # commit ccadf73163ca88bdaa74b8223d4dde05d17f550b - # Author: Heikki Linnakangas - # Date: 2023-08-23 09:21:31 -0500 - # - # Use the buffer cache when initializing an unlogged index. - # - # This patch was backpatched to 16. Without it, the LSN in the - # page header would be 0/0 in the data directory, which wouldn't - # match the LSN generated during the basebackup, thus creating - # a difference. - if env.pg_version <= PgVersion.V15 and r[0] == "i": - unlogged_relation_files.append(f"{r[1]}_init") - - ignored_files = unlogged_relation_files - - check_restored_datadir_content(test_output_dir, env, endpoint, ignored_files=ignored_files) + post_checks(env, test_output_dir, DBNAME, endpoint) # Run the PostgreSQL "isolation" tests, in src/test/isolation. @@ -159,16 +196,20 @@ def test_isolation( pg_distrib_dir: Path, shard_count: Optional[int], ): + DBNAME = "isolation_regression" + if shard_count is not None: neon_env_builder.num_pageservers = shard_count neon_env_builder.enable_pageserver_remote_storage(s3_storage()) neon_env_builder.enable_scrub_on_exit() - env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + env = neon_env_builder.init_start( + initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count + ) # Connect to postgres and create a database called "regression". # isolation tests use prepared transactions, so enable them endpoint = env.endpoints.create_start("main", config_lines=["max_prepared_transactions=100"]) - endpoint.safe_psql("CREATE DATABASE isolation_regression") + endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") # Create some local directories for pg_isolation_regress to run in. runpath = test_output_dir / "regress" @@ -202,6 +243,9 @@ def test_isolation( with capsys.disabled(): pg_bin.run(pg_isolation_regress_command, env=env_vars, cwd=runpath) + # This fails with a mismatch on `pg_multixact/offsets/0000` + # post_checks(env, test_output_dir, DBNAME, endpoint) + # Run extra Neon-specific pg_regress-based tests. The tests and their # schedule file are in the sql_regress/ directory. 
@@ -215,15 +259,19 @@ def test_sql_regress( pg_distrib_dir: Path, shard_count: Optional[int], ): + DBNAME = "regression" + if shard_count is not None: neon_env_builder.num_pageservers = shard_count neon_env_builder.enable_pageserver_remote_storage(s3_storage()) neon_env_builder.enable_scrub_on_exit() - env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + env = neon_env_builder.init_start( + initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count + ) # Connect to postgres and create a database called "regression". endpoint = env.endpoints.create_start("main") - endpoint.safe_psql("CREATE DATABASE regression") + endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") # Create some local directories for pg_regress to run in. runpath = test_output_dir / "regress" @@ -258,4 +306,4 @@ def test_sql_regress( with capsys.disabled(): pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath) - check_restored_datadir_content(test_output_dir, env, endpoint) + post_checks(env, test_output_dir, DBNAME, endpoint) From df3dc6e4c1f13a36567813f7f445734dd3a8b902 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Mon, 8 Jul 2024 15:05:59 -0400 Subject: [PATCH 1147/1571] fix(pageserver): write to both v1+v2 for aux tenant import (#8316) close https://github.com/neondatabase/neon/issues/8202 ref https://github.com/neondatabase/neon/pull/6560 For tenant imports, we now write the aux files into both v1+v2 storage, so that the test case can pick either one for testing. Given the API is only used for testing, this looks like a safe change. Signed-off-by: Alex Chi Z --- storage_controller/src/service.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 3965d7453d..78f0848c24 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -4062,7 +4062,14 @@ impl Service { placement_policy: Some(PlacementPolicy::Attached(0)), // No secondaries, for convenient debug/hacking // There is no way to know what the tenant's config was: revert to defaults - config: TenantConfig::default(), + // + // TODO: remove `switch_aux_file_policy` once we finish auxv2 migration + // + // we write to both v1+v2 storage, so that the test case can use either storage format for testing + config: TenantConfig { + switch_aux_file_policy: Some(models::AuxFilePolicy::CrossValidation), + ..TenantConfig::default() + }, }) .await?; From 811eb88b89207be4342e2e8d4a7d6fc2328e6141 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 8 Jul 2024 21:06:34 +0100 Subject: [PATCH 1148/1571] tests: stabilize test_timeline_size_quota_on_startup (#8255) ## Problem `test_timeline_size_quota_on_startup` assumed that writing data beyond the size limit would always be blocked. This is not so: the limit is only enforced if feedback makes it back from the pageserver to the safekeeper + compute. Closes: https://github.com/neondatabase/neon/issues/6562 ## Summary of changes - Modify the test to wait for the pageserver to catch up. The size limit was never actually being enforced robustly, the original version of this test was just writing much more than 30MB and about 98% of the time getting lucky such that the feedback happened to arrive before the tests for loop was done. - If the test fails, log the logical size as seen by the pageserver. 
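Condensed, the stabilized test boils down to: write one batch past the limit, wait for the pageserver to ingest it so that feedback can make it back to the safekeeper and compute, then write another batch and require the `DiskFull` error. The sketch below is illustrative only and reuses names that appear in the diff that follows (`wait_for_last_flush_lsn`, `timeline_detail`, `current_logical_size`, `DiskFull`); the helper functions, their signatures, the `pytest.raises` form, and the import paths are assumptions, and the real test keeps its try/except structure:

```python
import psycopg2.errors
import pytest
from fixtures.neon_fixtures import Endpoint, NeonEnv, wait_for_last_flush_lsn


def write_rows(cur, batches: int) -> None:
    # Each batch inserts 100 moderately sized rows into `foo`.
    for _ in range(batches):
        cur.execute(
            "INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 100) g"
        )


def expect_size_limit_enforced(env: NeonEnv, endpoint: Endpoint, cur, timeline_id, size_limit_mb: int):
    with pytest.raises(psycopg2.errors.DiskFull):
        # First batch: may or may not be rejected, depending on when feedback arrives.
        write_rows(cur, 2500)
        # Let the pageserver catch up, so the logical size it reports (and the
        # feedback derived from it) reflects what was just written.
        wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, timeline_id)
        detail = env.pageserver.http_client().timeline_detail(env.initial_tenant, timeline_id)
        assert detail["current_logical_size"] > size_limit_mb * 1024 * 1024
        # Second batch: with feedback established, this write must hit the limit.
        write_rows(cur, 2500)
```

The essential point is the wait between the two batches: without it, enforcement of `neon.max_cluster_size` is racy, which is exactly the flakiness described above.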
--- test_runner/regress/test_timeline_size.py | 46 +++++++++++++++++------ 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index f47356839c..5e9a42f6b4 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -152,10 +152,12 @@ def test_timeline_size_quota_on_startup(neon_env_builder: NeonEnvBuilder): client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id) + size_limit_mb = 30 + endpoint_main = env.endpoints.create( "test_timeline_size_quota_on_startup", # Set small limit for the test - config_lines=["neon.max_cluster_size=30MB"], + config_lines=[f"neon.max_cluster_size={size_limit_mb}MB"], ) endpoint_main.start() @@ -165,17 +167,39 @@ def test_timeline_size_quota_on_startup(neon_env_builder: NeonEnvBuilder): # Insert many rows. This query must fail because of space limit try: - for _i in range(5000): - cur.execute( - """ - INSERT INTO foo - SELECT 'long string to consume some space' || g - FROM generate_series(1, 100) g - """ - ) - # If we get here, the timeline size limit failed - log.error("Query unexpectedly succeeded") + def write_rows(count): + for _i in range(count): + cur.execute( + """ + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100) g + """ + ) + + # Write some data that exceeds limit, then let the pageserver ingest it to guarantee that some feedback has made it to + # the safekeeper, then try to write some more. We expect either the initial writes or the ones after + # the wait_for_last_flush_lsn to generate an exception. + # + # Without the wait_for_last_flush_lsn, the size limit sometimes isn't enforced (see https://github.com/neondatabase/neon/issues/6562) + write_rows(2500) + wait_for_last_flush_lsn(env, endpoint_main, env.initial_tenant, new_timeline_id) + logical_size = env.pageserver.http_client().timeline_detail( + env.initial_tenant, new_timeline_id + )["current_logical_size"] + assert logical_size > size_limit_mb * 1024 * 1024 + write_rows(2500) + + # If we get here, the timeline size limit failed. Find out from the pageserver how large it + # thinks the timeline is. + wait_for_last_flush_lsn(env, endpoint_main, env.initial_tenant, new_timeline_id) + logical_size = env.pageserver.http_client().timeline_detail( + env.initial_tenant, new_timeline_id + )["current_logical_size"] + log.error( + f"Query unexpectedly succeeded, pageserver logical size is {logical_size}" + ) raise AssertionError() except psycopg2.errors.DiskFull as err: From d9c1068cf465c508205d58f5f0c962d6757babda Mon Sep 17 00:00:00 2001 From: Yuchen Liang <70461588+yliang412@users.noreply.github.com> Date: Mon, 8 Jul 2024 16:50:13 -0400 Subject: [PATCH 1149/1571] tests: increase approx size equal threshold to avoid `test_lsn_lease_size` flakiness (#8282) ## Summary of changes Increase the `assert_size_approx_equal` threshold to avoid flakiness of `test_lsn_lease_size`. Still needs more investigation to fully resolve #8293. - Also set `autovacuum=off` for the endpoint we are running in the test. 
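To make the relaxed comparison concrete, it amounts to the following (a sketch only; the helper name and the 22 * 8272 threshold are the ones added in the diff below):

```python
import pytest

def assert_size_approx_equal_for_lease_test(size_lease: int, size_branch: int) -> None:
    # 22 * 8272 bytes ≈ 178 KiB of allowed slack between the size measured via
    # an LSN lease and the size measured via a read-only branch.
    threshold = 22 * 8272  # = 181_984 bytes
    # pytest.approx(expected, abs=N) accepts any value within +/- N of expected.
    assert size_lease == pytest.approx(size_branch, abs=threshold)
```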
Signed-off-by: Yuchen Liang --- test_runner/regress/test_tenant_size.py | 29 +++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 70e8fe67d5..b1ade77a14 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -720,9 +720,30 @@ def test_lsn_lease_size(neon_env_builder: NeonEnvBuilder, test_output_dir: Path, They should have the same effect. """ + def assert_size_approx_equal_for_lease_test(size_lease, size_branch): + """ + Tests that evaluate sizes are checking the pageserver space consumption + that sits many layers below the user input. The exact space needed + varies slightly depending on postgres behavior. + + Rather than expecting postgres to be determinstic and occasionally + failing the test, we permit sizes for the same data to vary by a few pages. + """ + + # FIXME(yuchen): The delta is too large, used as temp solution to pass the test reliably. + # Investigate and reduce the threshold. + threshold = 22 * 8272 + + log.info( + f"delta: size_branch({size_branch}) - size_lease({size_lease}) = {size_branch - size_lease}" + ) + + assert size_lease == pytest.approx(size_branch, abs=threshold) + conf = { "pitr_interval": "0s" if zero_gc else "3600s", "gc_period": "0s", + "compaction_period": "0s", } env = neon_env_builder.init_start(initial_tenant_conf=conf) @@ -734,7 +755,7 @@ def test_lsn_lease_size(neon_env_builder: NeonEnvBuilder, test_output_dir: Path, tenant, timeline = env.neon_cli.create_tenant(conf=conf) lease_res = insert_with_action(env, tenant, timeline, test_output_dir, action="lease") - assert_size_approx_equal(lease_res, ro_branch_res) + assert_size_approx_equal_for_lease_test(lease_res, ro_branch_res) def insert_with_action( @@ -754,7 +775,11 @@ def insert_with_action( """ client = env.pageserver.http_client() - with env.endpoints.create_start("main", tenant_id=tenant) as ep: + with env.endpoints.create_start( + "main", + tenant_id=tenant, + config_lines=["autovacuum=off"], + ) as ep: initial_size = client.tenant_size(tenant) log.info(f"initial size: {initial_size}") From 8b15864f5927a3881e94c46a7b88f058a0659c2b Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 9 Jul 2024 09:39:10 +0100 Subject: [PATCH 1150/1571] CI(promote-compatibility-data): take into account commit sha (#8283) ## Problem In https://github.com/neondatabase/neon/pull/8161, we changed the path to Neon artefacts by adding commit sha to it, but we missed adding these changes to `promote-compatibility-data` job that we use for backward/forward- compatibility testing. 
## Summary of changes - Add commit sha to `promote-compatibility-data` --- .github/workflows/build_and_test.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index a3246987e2..cb7655e039 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1336,6 +1336,7 @@ jobs: env: BUCKET: neon-github-public-dev PREFIX: artifacts/latest + COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} run: | # Update compatibility snapshot for the release for pg_version in v14 v15 v16; do @@ -1349,7 +1350,7 @@ jobs: # Update Neon artifact for the release (reuse already uploaded artifact) for build_type in debug release; do - OLD_PREFIX=artifacts/${GITHUB_RUN_ID} + OLD_PREFIX=artifacts/${COMMIT_SHA}/${GITHUB_RUN_ID} FILENAME=neon-${{ runner.os }}-${{ runner.arch }}-${build_type}-artifact.tar.zst S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true) From c196cf6ac15170910c8deff40e5830379c31edbe Mon Sep 17 00:00:00 2001 From: Luca BRUNO Date: Tue, 9 Jul 2024 10:43:42 +0200 Subject: [PATCH 1151/1571] proxy/http: avoid spurious vector reallocations This tweaks the rows-to-JSON rendering logic in order to avoid allocating 0-sized temporary vectors and later growing them to insert elements. As the exact size is known in advance, both vectors can be built with an exact capacity upfront. This will avoid further vector growing/reallocation in the rendering hotpath. Signed-off-by: Luca BRUNO --- proxy/src/serverless/sql_over_http.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 583ff75f7c..8118ae5ea8 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -838,8 +838,9 @@ async fn query_to_json( "finished reading rows" ); - let mut fields = vec![]; - let mut columns = vec![]; + let columns_len = row_stream.columns().len(); + let mut fields = Vec::with_capacity(columns_len); + let mut columns = Vec::with_capacity(columns_len); for c in row_stream.columns() { fields.push(json!({ From 73fa3c014bf4717615a453ccf0e50bca98ba64cf Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Tue, 9 Jul 2024 12:11:37 -0400 Subject: [PATCH 1152/1571] chore(storage-scrubber): allow disable file logging (#8297) part of https://github.com/neondatabase/cloud/issues/14024, k8s does not always have a volume available for logging, and I'm running into weird permission errors... While I could spend time figuring out how to create temp directories for logging, I think it would be better to just disable file logging as k8s containers are ephemeral and we cannot retrieve anything on the fs after the container gets removed. 
## Summary of changes `PAGESERVER_DISABLE_FILE_LOGGING=1` -> file logging disabled Signed-off-by: Alex Chi Z --- storage_scrubber/src/lib.rs | 40 ++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index 6adaa5d38f..8f567b22e0 100644 --- a/storage_scrubber/src/lib.rs +++ b/storage_scrubber/src/lib.rs @@ -242,24 +242,36 @@ impl ConsoleConfig { } } -pub fn init_logging(file_name: &str) -> WorkerGuard { - let (file_writer, guard) = - tracing_appender::non_blocking(tracing_appender::rolling::never("./logs/", file_name)); - - let file_logs = fmt::Layer::new() - .with_target(false) - .with_ansi(false) - .with_writer(file_writer); +pub fn init_logging(file_name: &str) -> Option { let stderr_logs = fmt::Layer::new() .with_target(false) .with_writer(std::io::stderr); - tracing_subscriber::registry() - .with(EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info"))) - .with(file_logs) - .with(stderr_logs) - .init(); - guard + let disable_file_logging = match std::env::var("PAGESERVER_DISABLE_FILE_LOGGING") { + Ok(s) => s == "1" || s.to_lowercase() == "true", + Err(_) => false, + }; + + if disable_file_logging { + tracing_subscriber::registry() + .with(EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info"))) + .with(stderr_logs) + .init(); + None + } else { + let (file_writer, guard) = + tracing_appender::non_blocking(tracing_appender::rolling::never("./logs/", file_name)); + let file_logs = fmt::Layer::new() + .with_target(false) + .with_ansi(false) + .with_writer(file_writer); + tracing_subscriber::registry() + .with(EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info"))) + .with(stderr_logs) + .with(file_logs) + .init(); + Some(guard) + } } pub fn init_s3_client(bucket_region: Region) -> Client { From 4a5b55c8346fc10ebbf7de3040d605c42dce31d3 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 9 Jul 2024 18:25:49 +0100 Subject: [PATCH 1153/1571] chore: fix nightly build (#8142) ## Problem `cargo +nightly check` fails ## Summary of changes Updates `measured`, `time`, and `crc32c`. * `measured`: updated to fix https://github.com/rust-lang/rust/issues/125763. 
* `time`: updated to fix https://github.com/rust-lang/rust/issues/125319 * `crc32c`: updated to remove some nightly feature detection with a removed nightly feature --- Cargo.lock | 65 ++++++++++++++++++++++++++++----------- Cargo.toml | 4 +-- libs/metrics/src/hll.rs | 14 ++++----- libs/metrics/src/lib.rs | 27 ++++------------ proxy/src/jemalloc.rs | 6 ++-- proxy/src/metrics.rs | 28 ++++++++++++++++- workspace_hack/Cargo.toml | 3 ++ 7 files changed, 94 insertions(+), 53 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 716b6690d9..63628160d1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1397,9 +1397,9 @@ dependencies = [ [[package]] name = "crc32c" -version = "0.6.5" +version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89254598aa9b9fa608de44b3ae54c810f0f06d755e24c50177f1f8f31ff50ce2" +checksum = "3a47af21622d091a8f0fb295b88bc886ac74efcc613efc19f5d0b21de5c89e47" dependencies = [ "rustc_version", ] @@ -1651,6 +1651,16 @@ dependencies = [ "rusticata-macros", ] +[[package]] +name = "deranged" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4" +dependencies = [ + "powerfmt", + "serde", +] + [[package]] name = "desim" version = "0.1.0" @@ -3008,9 +3018,9 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" [[package]] name = "measured" -version = "0.0.21" +version = "0.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "652bc741286361c06de8cb4d89b21a6437f120c508c51713663589eeb9928ac5" +checksum = "3051f3a030d55d680cdef6ca50e80abd1182f8da29f2344a7c9cb575721138f0" dependencies = [ "bytes", "crossbeam-utils", @@ -3026,9 +3036,9 @@ dependencies = [ [[package]] name = "measured-derive" -version = "0.0.21" +version = "0.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ea497f33e1e856a376c32ad916f69a0bd3c597db1f912a399f842b01a4a685d" +checksum = "b9e6777fc80a575f9503d908c8b498782a6c3ee88a06cb416dc3941401e43b94" dependencies = [ "heck 0.5.0", "proc-macro2", @@ -3038,9 +3048,9 @@ dependencies = [ [[package]] name = "measured-process" -version = "0.0.21" +version = "0.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b364ccb66937a814b6b2ad751d1a2f7a9d5a78c761144036825fb36bb0771000" +checksum = "7c4b80445aeb08e832d87bf1830049a924cdc1d6b7ef40b6b9b365bff17bf8ec" dependencies = [ "libc", "measured", @@ -3275,6 +3285,12 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-conv" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" + [[package]] name = "num-integer" version = "0.1.45" @@ -4118,6 +4134,12 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + [[package]] name = "ppv-lite86" version = "0.2.17" @@ -5397,9 +5419,9 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.183" +version = "1.0.203" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32ac8da02677876d532745a130fc9d8e6edfa81a269b107c5b00829b91d8eb3c" +checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094" dependencies = [ "serde_derive", ] 
@@ -5416,9 +5438,9 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.183" +version = "1.0.203" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816" +checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba" dependencies = [ "proc-macro2", "quote", @@ -6108,12 +6130,15 @@ dependencies = [ [[package]] name = "time" -version = "0.3.21" +version = "0.3.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f3403384eaacbca9923fa06940178ac13e4edb725486d70e8e15881d0c836cc" +checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885" dependencies = [ + "deranged", "itoa", "js-sys", + "num-conv", + "powerfmt", "serde", "time-core", "time-macros", @@ -6121,16 +6146,17 @@ dependencies = [ [[package]] name = "time-core" -version = "0.1.1" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb" +checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" [[package]] name = "time-macros" -version = "0.2.9" +version = "0.2.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "372950940a5f07bf38dbe211d7283c9e6d7327df53794992d293e534c733d09b" +checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf" dependencies = [ + "num-conv", "time-core", ] @@ -7428,6 +7454,7 @@ dependencies = [ "clap", "clap_builder", "crossbeam-utils", + "deranged", "either", "fail", "futures-channel", @@ -7452,7 +7479,9 @@ dependencies = [ "num-traits", "once_cell", "parquet", + "proc-macro2", "prost", + "quote", "rand 0.8.5", "regex", "regex-automata 0.4.3", diff --git a/Cargo.toml b/Cargo.toml index 8fddaaef12..fc3dd51809 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -111,8 +111,8 @@ lasso = "0.7" leaky-bucket = "1.0.1" libc = "0.2" md5 = "0.7.0" -measured = { version = "0.0.21", features=["lasso"] } -measured-process = { version = "0.0.21" } +measured = { version = "0.0.22", features=["lasso"] } +measured-process = { version = "0.0.22" } memoffset = "0.8" nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] } notify = "6.0.0" diff --git a/libs/metrics/src/hll.rs b/libs/metrics/src/hll.rs index f53511ab5c..723916a742 100644 --- a/libs/metrics/src/hll.rs +++ b/libs/metrics/src/hll.rs @@ -13,11 +13,7 @@ use std::{ use measured::{ label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor}, - metric::{ - group::{Encoding, MetricValue}, - name::MetricNameEncoder, - Metric, MetricType, MetricVec, - }, + metric::{counter::CounterState, name::MetricNameEncoder, Metric, MetricType, MetricVec}, text::TextEncoder, LabelGroup, }; @@ -144,6 +140,7 @@ impl HyperLogLogState { }) } } + impl measured::metric::MetricEncoding> for HyperLogLogState { @@ -182,12 +179,13 @@ impl measured::metric::MetricEncoding( labels: impl LabelGroup, name: impl MetricNameEncoder, enc: &mut Enc, -) -> Result<(), Enc::Err> { - enc.write_metric_value(name, labels, MetricValue::Int(x)) +) -> Result<(), Enc::Err> +where + GaugeState: MetricEncoding, +{ + GaugeState::new(x).collect_into(&(), labels, name, enc) } #[derive(Default)] @@ -544,15 +547,6 @@ impl Encoding for Inc { fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> { self.0.write_help(name, help) } - - fn write_metric_value( - &mut self, - name: impl MetricNameEncoder, - labels: impl LabelGroup, 
- value: MetricValue, - ) -> Result<(), Self::Err> { - self.0.write_metric_value(name, labels, value) - } } impl MetricEncoding> for MeasuredCounterPairState @@ -579,15 +573,6 @@ impl Encoding for Dec { fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> { self.0.write_help(name, help) } - - fn write_metric_value( - &mut self, - name: impl MetricNameEncoder, - labels: impl LabelGroup, - value: MetricValue, - ) -> Result<(), Self::Err> { - self.0.write_metric_value(name, labels, value) - } } /// Write the dec counter to the encoder diff --git a/proxy/src/jemalloc.rs b/proxy/src/jemalloc.rs index 3243e6a140..d307d80f4a 100644 --- a/proxy/src/jemalloc.rs +++ b/proxy/src/jemalloc.rs @@ -3,8 +3,8 @@ use std::marker::PhantomData; use measured::{ label::NoLabels, metric::{ - gauge::GaugeState, group::Encoding, group::MetricValue, name::MetricNameEncoder, - MetricEncoding, MetricFamilyEncoding, MetricType, + gauge::GaugeState, group::Encoding, name::MetricNameEncoder, MetricEncoding, + MetricFamilyEncoding, MetricType, }, text::TextEncoder, LabelGroup, MetricGroup, @@ -100,7 +100,7 @@ macro_rules! jemalloc_gauge { enc: &mut TextEncoder, ) -> Result<(), std::io::Error> { if let Ok(v) = mib.read() { - enc.write_metric_value(name, labels, MetricValue::Int(v as i64))?; + GaugeState::new(v as i64).collect_into(&(), labels, name, enc)?; } Ok(()) } diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index e2a75a8720..db25ac0311 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -2,7 +2,7 @@ use std::sync::{Arc, OnceLock}; use lasso::ThreadedRodeo; use measured::{ - label::{FixedCardinalitySet, LabelName, LabelSet, LabelValue, StaticLabelSet}, + label::{FixedCardinalitySet, LabelGroupSet, LabelName, LabelSet, LabelValue, StaticLabelSet}, metric::{histogram::Thresholds, name::MetricName}, Counter, CounterVec, FixedCardinalityLabel, Gauge, GaugeVec, Histogram, HistogramVec, LabelGroup, MetricGroup, @@ -577,6 +577,32 @@ impl LabelGroup for ThreadPoolWorkerId { } } +impl LabelGroupSet for ThreadPoolWorkers { + type Group<'a> = ThreadPoolWorkerId; + + fn cardinality(&self) -> Option { + Some(self.0) + } + + fn encode_dense(&self, value: Self::Unique) -> Option { + Some(value) + } + + fn decode_dense(&self, value: usize) -> Self::Group<'_> { + ThreadPoolWorkerId(value) + } + + type Unique = usize; + + fn encode(&self, value: Self::Group<'_>) -> Option { + Some(value.0) + } + + fn decode(&self, value: &Self::Unique) -> Self::Group<'_> { + ThreadPoolWorkerId(*value) + } +} + impl LabelSet for ThreadPoolWorkers { type Value<'a> = ThreadPoolWorkerId; diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index e1b1806bc8..7f57585994 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -30,6 +30,7 @@ chrono = { version = "0.4", default-features = false, features = ["clock", "serd clap = { version = "4", features = ["derive", "string"] } clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] } crossbeam-utils = { version = "0.8" } +deranged = { version = "0.3", default-features = false, features = ["powerfmt", "serde", "std"] } either = { version = "1" } fail = { version = "0.5", default-features = false, features = ["failpoints"] } futures-channel = { version = "0.3", features = ["sink"] } @@ -107,7 +108,9 @@ num-integer = { version = "0.1", features = ["i128"] } num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } 
parquet = { git = "https://github.com/apache/arrow-rs", branch = "master", default-features = false, features = ["zstd"] } +proc-macro2 = { version = "1" } prost = { version = "0.11" } +quote = { version = "1" } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } regex-syntax = { version = "0.8" } From b1fe8259b44ba0d0f0ce4d777edbc0e7e76ebd62 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Tue, 9 Jul 2024 13:41:37 -0400 Subject: [PATCH 1154/1571] fix(storage-scrubber): use default AWS authentication (#8299) part of https://github.com/neondatabase/cloud/issues/14024 close https://github.com/neondatabase/neon/issues/7665 Things running in k8s container use this authentication: https://docs.aws.amazon.com/sdkref/latest/guide/feature-container-credentials.html while we did not configure the client to use it. This pull request simply uses the default s3 client credential chain for storage scrubber. It might break compatibility with minio. ## Summary of changes * Use default AWS credential provider chain. * Improvements for s3 errors, we now have detailed errors and correct backtrace on last trial of the operation. --------- Signed-off-by: Alex Chi Z Co-authored-by: Joonas Koivunen --- storage_scrubber/src/find_large_objects.rs | 2 +- storage_scrubber/src/garbage.rs | 4 +- storage_scrubber/src/lib.rs | 89 +++++-------------- storage_scrubber/src/main.rs | 2 +- .../src/pageserver_physical_gc.rs | 2 +- .../src/scan_pageserver_metadata.rs | 2 +- .../src/scan_safekeeper_metadata.rs | 2 +- storage_scrubber/src/tenant_snapshot.rs | 7 +- 8 files changed, 33 insertions(+), 77 deletions(-) diff --git a/storage_scrubber/src/find_large_objects.rs b/storage_scrubber/src/find_large_objects.rs index 1422545f2f..2ef802229d 100644 --- a/storage_scrubber/src/find_large_objects.rs +++ b/storage_scrubber/src/find_large_objects.rs @@ -47,7 +47,7 @@ pub async fn find_large_objects( ignore_deltas: bool, concurrency: usize, ) -> anyhow::Result { - let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver)?; + let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?; let tenants = std::pin::pin!(stream_tenants(&s3_client, &target)); let objects_stream = tenants.map_ok(|tenant_shard_id| { diff --git a/storage_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs index ce0ff10ec6..0450851988 100644 --- a/storage_scrubber/src/garbage.rs +++ b/storage_scrubber/src/garbage.rs @@ -140,7 +140,7 @@ async fn find_garbage_inner( node_kind: NodeKind, ) -> anyhow::Result { // Construct clients for S3 and for Console API - let (s3_client, target) = init_remote(bucket_config.clone(), node_kind)?; + let (s3_client, target) = init_remote(bucket_config.clone(), node_kind).await?; let cloud_admin_api_client = Arc::new(CloudAdminApiClient::new(console_config)); // Build a set of console-known tenants, for quickly eliminating known-active tenants without having @@ -432,7 +432,7 @@ pub async fn purge_garbage( ); let (s3_client, target) = - init_remote(garbage_list.bucket_config.clone(), garbage_list.node_kind)?; + init_remote(garbage_list.bucket_config.clone(), garbage_list.node_kind).await?; // Sanity checks on the incoming list if garbage_list.active_tenant_count == 0 { diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index 8f567b22e0..9102ad9906 100644 --- a/storage_scrubber/src/lib.rs +++ 
b/storage_scrubber/src/lib.rs @@ -15,17 +15,10 @@ use std::fmt::Display; use std::sync::Arc; use std::time::Duration; -use anyhow::Context; -use aws_config::environment::EnvironmentVariableCredentialsProvider; -use aws_config::imds::credentials::ImdsCredentialsProvider; -use aws_config::meta::credentials::CredentialsProviderChain; -use aws_config::profile::ProfileFileCredentialsProvider; -use aws_config::retry::RetryConfig; -use aws_config::sso::SsoCredentialsProvider; -use aws_config::BehaviorVersion; -use aws_sdk_s3::config::{AsyncSleep, Region, SharedAsyncSleep}; -use aws_sdk_s3::{Client, Config}; -use aws_smithy_async::rt::sleep::TokioSleep; +use anyhow::{anyhow, Context}; +use aws_sdk_s3::config::Region; +use aws_sdk_s3::error::DisplayErrorContext; +use aws_sdk_s3::Client; use camino::{Utf8Path, Utf8PathBuf}; use clap::ValueEnum; @@ -274,65 +267,21 @@ pub fn init_logging(file_name: &str) -> Option { } } -pub fn init_s3_client(bucket_region: Region) -> Client { - let credentials_provider = { - // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" - let chain = CredentialsProviderChain::first_try( - "env", - EnvironmentVariableCredentialsProvider::new(), - ) - // uses "AWS_PROFILE" / `aws sso login --profile ` - .or_else( - "profile-sso", - ProfileFileCredentialsProvider::builder().build(), - ); - - // Use SSO if we were given an account ID - match std::env::var("SSO_ACCOUNT_ID").ok() { - Some(sso_account) => chain.or_else( - "sso", - SsoCredentialsProvider::builder() - .account_id(sso_account) - .role_name("PowerUserAccess") - .start_url("https://neondb.awsapps.com/start") - .region(bucket_region.clone()) - .build(), - ), - None => chain, - } - .or_else( - // Finally try IMDS - "imds", - ImdsCredentialsProvider::builder().build(), - ) - }; - - let sleep_impl: Arc = Arc::new(TokioSleep::new()); - - let mut builder = Config::builder() - .behavior_version( - #[allow(deprecated)] /* TODO: https://github.com/neondatabase/neon/issues/7665 */ - BehaviorVersion::v2023_11_09(), - ) +pub async fn init_s3_client(bucket_region: Region) -> Client { + let config = aws_config::defaults(aws_config::BehaviorVersion::v2024_03_28()) .region(bucket_region) - .retry_config(RetryConfig::adaptive().with_max_attempts(3)) - .sleep_impl(SharedAsyncSleep::from(sleep_impl)) - .credentials_provider(credentials_provider); - - if let Ok(endpoint) = env::var("AWS_ENDPOINT_URL") { - builder = builder.endpoint_url(endpoint) - } - - Client::from_conf(builder.build()) + .load() + .await; + Client::new(&config) } -fn init_remote( +async fn init_remote( bucket_config: BucketConfig, node_kind: NodeKind, ) -> anyhow::Result<(Arc, RootTarget)> { let bucket_region = Region::new(bucket_config.region); let delimiter = "/".to_string(); - let s3_client = Arc::new(init_s3_client(bucket_region)); + let s3_client = Arc::new(init_s3_client(bucket_region).await); let s3_root = match node_kind { NodeKind::Pageserver => RootTarget::Pageserver(S3Target { @@ -357,7 +306,7 @@ async fn list_objects_with_retries( s3_target: &S3Target, continuation_token: Option, ) -> anyhow::Result { - for _ in 0..MAX_RETRIES { + for trial in 0..MAX_RETRIES { match s3_client .list_objects_v2() .bucket(&s3_target.bucket_name) @@ -369,16 +318,22 @@ async fn list_objects_with_retries( { Ok(response) => return Ok(response), Err(e) => { + if trial == MAX_RETRIES - 1 { + return Err(e) + .with_context(|| format!("Failed to list objects {MAX_RETRIES} times")); + } error!( - "list_objects_v2 query failed: {e}, bucket_name={}, prefix={}, delimiter={}", - 
s3_target.bucket_name, s3_target.prefix_in_bucket, s3_target.delimiter + "list_objects_v2 query failed: bucket_name={}, prefix={}, delimiter={}, error={}", + s3_target.bucket_name, + s3_target.prefix_in_bucket, + s3_target.delimiter, + DisplayErrorContext(e), ); tokio::time::sleep(Duration::from_secs(1)).await; } } } - - anyhow::bail!("Failed to list objects {MAX_RETRIES} times") + Err(anyhow!("unreachable unless MAX_RETRIES==0")) } async fn download_object_with_retries( diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs index 16a26613d2..d816121192 100644 --- a/storage_scrubber/src/main.rs +++ b/storage_scrubber/src/main.rs @@ -196,7 +196,7 @@ async fn main() -> anyhow::Result<()> { concurrency, } => { let downloader = - SnapshotDownloader::new(bucket_config, tenant_id, output_path, concurrency)?; + SnapshotDownloader::new(bucket_config, tenant_id, output_path, concurrency).await?; downloader.download().await } Command::PageserverPhysicalGc { diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index 0146433128..fb8fbc1635 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -160,7 +160,7 @@ pub async fn pageserver_physical_gc( min_age: Duration, mode: GcMode, ) -> anyhow::Result { - let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver)?; + let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?; let tenants = if tenant_ids.is_empty() { futures::future::Either::Left(stream_tenants(&s3_client, &target)) diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs index af74ffa4cd..df4f29acf7 100644 --- a/storage_scrubber/src/scan_pageserver_metadata.rs +++ b/storage_scrubber/src/scan_pageserver_metadata.rs @@ -199,7 +199,7 @@ pub async fn scan_metadata( bucket_config: BucketConfig, tenant_ids: Vec, ) -> anyhow::Result { - let (s3_client, target) = init_remote(bucket_config, NodeKind::Pageserver)?; + let (s3_client, target) = init_remote(bucket_config, NodeKind::Pageserver).await?; let tenants = if tenant_ids.is_empty() { futures::future::Either::Left(stream_tenants(&s3_client, &target)) diff --git a/storage_scrubber/src/scan_safekeeper_metadata.rs b/storage_scrubber/src/scan_safekeeper_metadata.rs index 24051b03de..553adf8f46 100644 --- a/storage_scrubber/src/scan_safekeeper_metadata.rs +++ b/storage_scrubber/src/scan_safekeeper_metadata.rs @@ -106,7 +106,7 @@ pub async fn scan_safekeeper_metadata( let timelines = client.query(&query, &[]).await?; info!("loaded {} timelines", timelines.len()); - let (s3_client, target) = init_remote(bucket_config, NodeKind::Safekeeper)?; + let (s3_client, target) = init_remote(bucket_config, NodeKind::Safekeeper).await?; let console_config = ConsoleConfig::from_env()?; let cloud_admin_api_client = CloudAdminApiClient::new(console_config); diff --git a/storage_scrubber/src/tenant_snapshot.rs b/storage_scrubber/src/tenant_snapshot.rs index 450b337235..5a75f8d40e 100644 --- a/storage_scrubber/src/tenant_snapshot.rs +++ b/storage_scrubber/src/tenant_snapshot.rs @@ -28,13 +28,13 @@ pub struct SnapshotDownloader { } impl SnapshotDownloader { - pub fn new( + pub async fn new( bucket_config: BucketConfig, tenant_id: TenantId, output_path: Utf8PathBuf, concurrency: usize, ) -> anyhow::Result { - let (s3_client, s3_root) = init_remote(bucket_config.clone(), NodeKind::Pageserver)?; + let (s3_client, s3_root) = 
init_remote(bucket_config.clone(), NodeKind::Pageserver).await?; Ok(Self { s3_client, s3_root, @@ -215,7 +215,8 @@ impl SnapshotDownloader { } pub async fn download(&self) -> anyhow::Result<()> { - let (s3_client, target) = init_remote(self.bucket_config.clone(), NodeKind::Pageserver)?; + let (s3_client, target) = + init_remote(self.bucket_config.clone(), NodeKind::Pageserver).await?; // Generate a stream of TenantShardId let shards = stream_tenant_shards(&s3_client, &target, self.tenant_id).await?; From 6d3cb222ee340f11666031081d08965b19ccb317 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 1 Jul 2024 13:45:42 -0500 Subject: [PATCH 1155/1571] Refactor how migrations are ran Just a small improvement I noticed while looking at fixing CVE-2024-4317 in Neon. --- compute_tools/src/lib.rs | 1 + compute_tools/src/migration.rs | 100 +++++++++++++++++++++++++++++++++ compute_tools/src/spec.rs | 65 +-------------------- 3 files changed, 103 insertions(+), 63 deletions(-) create mode 100644 compute_tools/src/migration.rs diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index 18c228ba54..543d4462ed 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -11,6 +11,7 @@ pub mod logger; pub mod catalog; pub mod compute; pub mod extension_server; +mod migration; pub mod monitor; pub mod params; pub mod pg_helpers; diff --git a/compute_tools/src/migration.rs b/compute_tools/src/migration.rs new file mode 100644 index 0000000000..61dcf01c84 --- /dev/null +++ b/compute_tools/src/migration.rs @@ -0,0 +1,100 @@ +use anyhow::{Context, Result}; +use postgres::Client; +use tracing::info; + +pub(crate) struct MigrationRunner<'m> { + client: &'m mut Client, + migrations: &'m [&'m str], +} + +impl<'m> MigrationRunner<'m> { + pub fn new(client: &'m mut Client, migrations: &'m [&'m str]) -> Self { + Self { client, migrations } + } + + fn get_migration_id(&mut self) -> Result { + let query = "SELECT id FROM neon_migration.migration_id"; + let row = self + .client + .query_one(query, &[]) + .context("run_migrations get migration_id")?; + + Ok(row.get::<&str, i64>("id")) + } + + fn update_migration_id(&mut self) -> Result<()> { + let setval = format!( + "UPDATE neon_migration.migration_id SET id={}", + self.migrations.len() + ); + + self.client + .simple_query(&setval) + .context("run_migrations update id")?; + + Ok(()) + } + + fn prepare_migrations(&mut self) -> Result<()> { + let query = "CREATE SCHEMA IF NOT EXISTS neon_migration"; + self.client.simple_query(query)?; + + let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)"; + self.client.simple_query(query)?; + + let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING"; + self.client.simple_query(query)?; + + let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin"; + self.client.simple_query(query)?; + + let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC"; + self.client.simple_query(query)?; + + Ok(()) + } + + pub fn run_migrations(mut self) -> Result<()> { + self.prepare_migrations()?; + + let mut current_migration: usize = self.get_migration_id()? 
as usize; + let starting_migration_id = current_migration; + + let query = "BEGIN"; + self.client + .simple_query(query) + .context("run_migrations begin")?; + + while current_migration < self.migrations.len() { + let migration = self.migrations[current_migration]; + + if migration.starts_with("-- SKIP") { + info!("Skipping migration id={}", current_migration); + } else { + info!( + "Running migration id={}:\n{}\n", + current_migration, migration + ); + self.client.simple_query(migration).with_context(|| { + format!("run_migration current_migration={}", current_migration) + })?; + } + + current_migration += 1; + } + + self.update_migration_id()?; + + let query = "COMMIT"; + self.client + .simple_query(query) + .context("run_migrations commit")?; + + info!( + "Ran {} migrations", + (self.migrations.len() - starting_migration_id) + ); + + Ok(()) + } +} diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 143f6c1e5f..37090b08fd 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -10,6 +10,7 @@ use tracing::{error, info, info_span, instrument, span_enabled, warn, Level}; use crate::config; use crate::logger::inlinify; +use crate::migration::MigrationRunner; use crate::params::PG_HBA_ALL_MD5; use crate::pg_helpers::*; @@ -791,69 +792,7 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> { include_str!("./migrations/0008-revoke_replication_for_previously_allowed_roles.sql"), ]; - let mut func = || { - let query = "CREATE SCHEMA IF NOT EXISTS neon_migration"; - client.simple_query(query)?; - - let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)"; - client.simple_query(query)?; - - let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING"; - client.simple_query(query)?; - - let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin"; - client.simple_query(query)?; - - let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC"; - client.simple_query(query)?; - Ok::<_, anyhow::Error>(()) - }; - func().context("handle_migrations prepare")?; - - let query = "SELECT id FROM neon_migration.migration_id"; - let row = client - .query_one(query, &[]) - .context("handle_migrations get migration_id")?; - let mut current_migration: usize = row.get::<&str, i64>("id") as usize; - let starting_migration_id = current_migration; - - let query = "BEGIN"; - client - .simple_query(query) - .context("handle_migrations begin")?; - - while current_migration < migrations.len() { - let migration = &migrations[current_migration]; - if migration.starts_with("-- SKIP") { - info!("Skipping migration id={}", current_migration); - } else { - info!( - "Running migration id={}:\n{}\n", - current_migration, migration - ); - client.simple_query(migration).with_context(|| { - format!("handle_migrations current_migration={}", current_migration) - })?; - } - current_migration += 1; - } - let setval = format!( - "UPDATE neon_migration.migration_id SET id={}", - migrations.len() - ); - client - .simple_query(&setval) - .context("handle_migrations update id")?; - - let query = "COMMIT"; - client - .simple_query(query) - .context("handle_migrations commit")?; - - info!( - "Ran {} migrations", - (migrations.len() - starting_migration_id) - ); + MigrationRunner::new(client, &migrations).run_migrations()?; Ok(()) } From abc330e095687909c7daea515d27340b15be3810 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 9 Jul 2024 10:21:23 -0500 Subject: [PATCH 1156/1571] Add an 
application_name to more Neon connections Helps identify connections in the logs. --- compute_tools/src/compute.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 1fa2b9f71d..eced6fc0b2 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -798,7 +798,11 @@ impl ComputeNode { // In this case we need to connect with old `zenith_admin` name // and create new user. We cannot simply rename connected user, // but we can create a new one and grant it all privileges. - let connstr = self.connstr.clone(); + let mut connstr = self.connstr.clone(); + connstr + .query_pairs_mut() + .append_pair("application_name", "apply_config"); + let mut client = match Client::connect(connstr.as_str(), NoTls) { Err(e) => match e.code() { Some(&SqlState::INVALID_PASSWORD) @@ -867,6 +871,11 @@ impl ComputeNode { // Run migrations separately to not hold up cold starts thread::spawn(move || { + let mut connstr = connstr.clone(); + connstr + .query_pairs_mut() + .append_pair("application_name", "migrations"); + let mut client = Client::connect(connstr.as_str(), NoTls)?; handle_migrations(&mut client).context("apply_config handle_migrations") }); From 3f7aebb01cd59f8c7ea9e7801832c7fb190a550c Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 9 Jul 2024 20:11:11 +0200 Subject: [PATCH 1157/1571] refactor: postgres_backend: replace abstract shutdown_watcher with CancellationToken (#8295) Preliminary refactoring while working on https://github.com/neondatabase/neon/issues/7427 and specifically https://github.com/neondatabase/neon/pull/8286 --- Cargo.lock | 3 +- libs/postgres_backend/Cargo.toml | 3 +- libs/postgres_backend/src/lib.rs | 33 +++++++------------- libs/postgres_backend/tests/simple_select.rs | 7 +++-- pageserver/src/page_service.rs | 2 +- proxy/src/console/mgmt.rs | 7 +++-- safekeeper/src/wal_service.rs | 5 +-- workspace_hack/Cargo.toml | 2 -- 8 files changed, 28 insertions(+), 34 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 63628160d1..776d95c3c7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4094,6 +4094,7 @@ dependencies = [ "tokio-postgres", "tokio-postgres-rustls", "tokio-rustls 0.25.0", + "tokio-util", "tracing", "workspace_hack", ] @@ -7458,10 +7459,8 @@ dependencies = [ "either", "fail", "futures-channel", - "futures-core", "futures-executor", "futures-io", - "futures-sink", "futures-util", "getrandom 0.2.11", "hashbrown 0.14.5", diff --git a/libs/postgres_backend/Cargo.toml b/libs/postgres_backend/Cargo.toml index 8e249c09f7..c7611b9f21 100644 --- a/libs/postgres_backend/Cargo.toml +++ b/libs/postgres_backend/Cargo.toml @@ -13,6 +13,7 @@ rustls.workspace = true serde.workspace = true thiserror.workspace = true tokio.workspace = true +tokio-util.workspace = true tokio-rustls.workspace = true tracing.workspace = true @@ -23,4 +24,4 @@ workspace_hack.workspace = true once_cell.workspace = true rustls-pemfile.workspace = true tokio-postgres.workspace = true -tokio-postgres-rustls.workspace = true \ No newline at end of file +tokio-postgres-rustls.workspace = true diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 6c41b7f347..c79ee4e053 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -16,6 +16,7 @@ use std::{fmt, io}; use std::{future::Future, str::FromStr}; use tokio::io::{AsyncRead, AsyncWrite}; use tokio_rustls::TlsAcceptor; +use tokio_util::sync::CancellationToken; use tracing::{debug, error, 
info, trace, warn}; use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter}; @@ -400,21 +401,15 @@ impl PostgresBackend { } /// Wrapper for run_message_loop() that shuts down socket when we are done - pub async fn run( + pub async fn run( mut self, handler: &mut impl Handler, - shutdown_watcher: F, - ) -> Result<(), QueryError> - where - F: Fn() -> S + Clone, - S: Future, - { - let ret = self - .run_message_loop(handler, shutdown_watcher.clone()) - .await; + cancel: &CancellationToken, + ) -> Result<(), QueryError> { + let ret = self.run_message_loop(handler, cancel).await; tokio::select! { - _ = shutdown_watcher() => { + _ = cancel.cancelled() => { // do nothing; we most likely got already stopped by shutdown and will log it next. } _ = self.framed.shutdown() => { @@ -444,21 +439,17 @@ impl PostgresBackend { } } - async fn run_message_loop( + async fn run_message_loop( &mut self, handler: &mut impl Handler, - shutdown_watcher: F, - ) -> Result<(), QueryError> - where - F: Fn() -> S, - S: Future, - { + cancel: &CancellationToken, + ) -> Result<(), QueryError> { trace!("postgres backend to {:?} started", self.peer_addr); tokio::select!( biased; - _ = shutdown_watcher() => { + _ = cancel.cancelled() => { // We were requested to shut down. tracing::info!("shutdown request received during handshake"); return Err(QueryError::Shutdown) @@ -473,7 +464,7 @@ impl PostgresBackend { let mut query_string = Bytes::new(); while let Some(msg) = tokio::select!( biased; - _ = shutdown_watcher() => { + _ = cancel.cancelled() => { // We were requested to shut down. tracing::info!("shutdown request received in run_message_loop"); return Err(QueryError::Shutdown) @@ -485,7 +476,7 @@ impl PostgresBackend { let result = self.process_message(handler, msg, &mut query_string).await; tokio::select!( biased; - _ = shutdown_watcher() => { + _ = cancel.cancelled() => { // We were requested to shut down. 
tracing::info!("shutdown request received during response flush"); diff --git a/libs/postgres_backend/tests/simple_select.rs b/libs/postgres_backend/tests/simple_select.rs index 80df9db858..7ec85f0dbe 100644 --- a/libs/postgres_backend/tests/simple_select.rs +++ b/libs/postgres_backend/tests/simple_select.rs @@ -3,13 +3,14 @@ use once_cell::sync::Lazy; use postgres_backend::{AuthType, Handler, PostgresBackend, QueryError}; use pq_proto::{BeMessage, RowDescriptor}; use std::io::Cursor; -use std::{future, sync::Arc}; +use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::net::{TcpListener, TcpStream}; use tokio_postgres::config::SslMode; use tokio_postgres::tls::MakeTlsConnect; use tokio_postgres::{Config, NoTls, SimpleQueryMessage}; use tokio_postgres_rustls::MakeRustlsConnect; +use tokio_util::sync::CancellationToken; // generate client, server test streams async fn make_tcp_pair() -> (TcpStream, TcpStream) { @@ -50,7 +51,7 @@ async fn simple_select() { tokio::spawn(async move { let mut handler = TestHandler {}; - pgbackend.run(&mut handler, future::pending::<()>).await + pgbackend.run(&mut handler, &CancellationToken::new()).await }); let conf = Config::new(); @@ -102,7 +103,7 @@ async fn simple_select_ssl() { tokio::spawn(async move { let mut handler = TestHandler {}; - pgbackend.run(&mut handler, future::pending::<()>).await + pgbackend.run(&mut handler, &CancellationToken::new()).await }); let client_cfg = rustls::ClientConfig::builder() diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 07365b5eb8..975c912970 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -267,7 +267,7 @@ async fn page_service_conn_main( let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; match pgbackend - .run(&mut conn_handler, task_mgr::shutdown_watcher) + .run(&mut conn_handler, &task_mgr::shutdown_token()) .await { Ok(()) => { diff --git a/proxy/src/console/mgmt.rs b/proxy/src/console/mgmt.rs index c7a2d467c0..befe7d7510 100644 --- a/proxy/src/console/mgmt.rs +++ b/proxy/src/console/mgmt.rs @@ -6,8 +6,9 @@ use anyhow::Context; use once_cell::sync::Lazy; use postgres_backend::{AuthType, PostgresBackend, PostgresBackendTCP, QueryError}; use pq_proto::{BeMessage, SINGLE_COL_ROWDESC}; -use std::{convert::Infallible, future}; +use std::convert::Infallible; use tokio::net::{TcpListener, TcpStream}; +use tokio_util::sync::CancellationToken; use tracing::{error, info, info_span, Instrument}; static CPLANE_WAITERS: Lazy> = Lazy::new(Default::default); @@ -67,7 +68,9 @@ pub async fn task_main(listener: TcpListener) -> anyhow::Result { async fn handle_connection(socket: TcpStream) -> Result<(), QueryError> { let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None)?; - pgbackend.run(&mut MgmtHandler, future::pending::<()>).await + pgbackend + .run(&mut MgmtHandler, &CancellationToken::new()) + .await } /// A message received by `mgmt` when a compute node is ready. diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index 4a97eb3993..091571111e 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -4,9 +4,10 @@ //! 
use anyhow::{Context, Result}; use postgres_backend::QueryError; -use std::{future, time::Duration}; +use std::time::Duration; use tokio::net::TcpStream; use tokio_io_timeout::TimeoutReader; +use tokio_util::sync::CancellationToken; use tracing::*; use utils::{auth::Scope, measured_stream::MeasuredStream}; @@ -100,7 +101,7 @@ async fn handle_socket( // libpq protocol between safekeeper and walproposer / pageserver // We don't use shutdown. pgbackend - .run(&mut conn_handler, future::pending::<()>) + .run(&mut conn_handler, &CancellationToken::new()) .await } diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 7f57585994..832fe06bf6 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -34,10 +34,8 @@ deranged = { version = "0.3", default-features = false, features = ["powerfmt", either = { version = "1" } fail = { version = "0.5", default-features = false, features = ["failpoints"] } futures-channel = { version = "0.3", features = ["sink"] } -futures-core = { version = "0.3" } futures-executor = { version = "0.3" } futures-io = { version = "0.3" } -futures-sink = { version = "0.3" } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } getrandom = { version = "0.2", default-features = false, features = ["std"] } hashbrown = { version = "0.14", features = ["raw"] } From 9bb16c8780da435e6de9fac08e11d4e0c2f5c682 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 9 Jul 2024 20:58:48 +0200 Subject: [PATCH 1158/1571] fix(l0_flush): drops permit before fsync, potential cause for OOMs (#8327) ## Problem Slack thread: https://neondb.slack.com/archives/C033RQ5SPDH/p1720511577862519 We're seeing OOMs in staging on a pageserver that has l0_flush.mode=Direct enabled. There's a strong correlation between jumps in `maxrss_kb` and `pageserver_timeline_ephemeral_bytes`, so, it's quite likely that l0_flush.mode=Direct is the culprit. Notably, the expected max memory usage on that staging server by the l0_flush.mode=Direct is ~2GiB but we're seeing as much as 24GiB max RSS before the OOM kill. One hypothesis is that we're dropping the semaphore permit before all the dirtied pages have been flushed to disk. (The flushing to disk likely happens in the fsync inside the `.finish()` call, because we're using ext4 in data=ordered mode). ## Summary of changes Hold the permit until after we're done with `.finish()`. --- .../src/tenant/storage_layer/inmemory_layer.rs | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index e1eaea90af..5941a52e98 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -715,16 +715,22 @@ impl InMemoryLayer { res?; } } - - // Hold the permit until the IO is done; if we didn't, one could drop this future, - // thereby releasing the permit, but the Vec remains allocated until the IO completes. - // => we'd have more concurrenct Vec than allowed as per the semaphore. - drop(_concurrency_permit); } } // MAX is used here because we identify L0 layers by full key range let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, ctx).await?; + + // Hold the permit until all the IO is done, including the fsync in `delta_layer_writer.finish()``. 
+ // + // If we didn't and our caller drops this future, tokio-epoll-uring would extend the lifetime of + // the `file_contents: Vec` until the IO is done, but not the permit's lifetime. + // Thus, we'd have more concurrenct `Vec` in existence than the semaphore allows. + // + // We hold across the fsync so that on ext4 mounted with data=ordered, all the kernel page cache pages + // we dirtied when writing to the filesystem have been flushed and marked !dirty. + drop(_concurrency_permit); + Ok(Some(delta_layer)) } } From 1a49f1c15c7e728812d7a46191b5d3f194d98999 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 9 Jul 2024 23:17:42 +0200 Subject: [PATCH 1159/1571] pageserver: move `page_service`'s `import basebackup` / `import wal` to mgmt API (#8292) I want to fix bugs in `page_service` ([issue](https://github.com/neondatabase/neon/issues/7427)) and the `import basebackup` / `import wal` stand in the way / make the refactoring more complicated. We don't use these methods anyway in practice, but, there have been some objections to removing the functionality completely. So, this PR preserves the existing functionality but moves it into the HTTP management API. Note that I don't try to fix existing bugs in the code, specifically not fixing * it only ever worked correctly for unsharded tenants * it doesn't clean up on error All errors are mapped to `ApiError::InternalServerError`. --- control_plane/src/pageserver.rs | 58 ++--- libs/utils/src/http/request.rs | 9 + pageserver/client/Cargo.toml | 2 +- pageserver/client/src/mgmt_api.rs | 79 ++++++- pageserver/src/bin/pageserver.rs | 1 - pageserver/src/http/routes.rs | 194 ++++++++++++++++ pageserver/src/metrics.rs | 2 - pageserver/src/page_service.rs | 357 +---------------------------- storage_controller/src/node.rs | 2 +- storage_controller/src/service.rs | 4 + test_runner/regress/test_import.py | 3 +- 11 files changed, 302 insertions(+), 409 deletions(-) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 983f78577c..f0403b1796 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -15,7 +15,6 @@ use std::time::Duration; use anyhow::{bail, Context}; use camino::Utf8PathBuf; -use futures::SinkExt; use pageserver_api::models::{ self, AuxFilePolicy, LocationConfig, TenantHistorySize, TenantInfo, TimelineInfo, }; @@ -566,60 +565,39 @@ impl PageServerNode { pg_wal: Option<(Lsn, PathBuf)>, pg_version: u32, ) -> anyhow::Result<()> { - let (client, conn) = self.page_server_psql_client().await?; - // The connection object performs the actual communication with the database, - // so spawn it off to run on its own. 
- tokio::spawn(async move { - if let Err(e) = conn.await { - eprintln!("connection error: {}", e); - } - }); - let client = std::pin::pin!(client); - // Init base reader let (start_lsn, base_tarfile_path) = base; let base_tarfile = tokio::fs::File::open(base_tarfile_path).await?; - let base_tarfile = tokio_util::io::ReaderStream::new(base_tarfile); + let base_tarfile = + mgmt_api::ReqwestBody::wrap_stream(tokio_util::io::ReaderStream::new(base_tarfile)); // Init wal reader if necessary let (end_lsn, wal_reader) = if let Some((end_lsn, wal_tarfile_path)) = pg_wal { let wal_tarfile = tokio::fs::File::open(wal_tarfile_path).await?; - let wal_reader = tokio_util::io::ReaderStream::new(wal_tarfile); + let wal_reader = + mgmt_api::ReqwestBody::wrap_stream(tokio_util::io::ReaderStream::new(wal_tarfile)); (end_lsn, Some(wal_reader)) } else { (start_lsn, None) }; - let copy_in = |reader, cmd| { - let client = &client; - async move { - let writer = client.copy_in(&cmd).await?; - let writer = std::pin::pin!(writer); - let mut writer = writer.sink_map_err(|e| { - std::io::Error::new(std::io::ErrorKind::Other, format!("{e}")) - }); - let mut reader = std::pin::pin!(reader); - writer.send_all(&mut reader).await?; - writer.into_inner().finish().await?; - anyhow::Ok(()) - } - }; - // Import base - copy_in( - base_tarfile, - format!( - "import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn} {pg_version}" - ), - ) - .await?; - // Import wal if necessary - if let Some(wal_reader) = wal_reader { - copy_in( - wal_reader, - format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}"), + self.http_client + .import_basebackup( + tenant_id, + timeline_id, + start_lsn, + end_lsn, + pg_version, + base_tarfile, ) .await?; + + // Import wal if necessary + if let Some(wal_reader) = wal_reader { + self.http_client + .import_wal(tenant_id, timeline_id, start_lsn, end_lsn, wal_reader) + .await?; } Ok(()) diff --git a/libs/utils/src/http/request.rs b/libs/utils/src/http/request.rs index 766bbfc9df..8b8ed5a67f 100644 --- a/libs/utils/src/http/request.rs +++ b/libs/utils/src/http/request.rs @@ -74,6 +74,15 @@ pub fn parse_query_param>( .transpose() } +pub fn must_parse_query_param>( + request: &Request, + param_name: &str, +) -> Result { + parse_query_param(request, param_name)?.ok_or_else(|| { + ApiError::BadRequest(anyhow!("no {param_name} specified in query parameters")) + }) +} + pub async fn ensure_no_body(request: &mut Request) -> Result<(), ApiError> { match request.body_mut().data().await { Some(_) => Err(ApiError::BadRequest(anyhow!("Unexpected request body"))), diff --git a/pageserver/client/Cargo.toml b/pageserver/client/Cargo.toml index 0ed27602cd..a938367334 100644 --- a/pageserver/client/Cargo.toml +++ b/pageserver/client/Cargo.toml @@ -8,7 +8,7 @@ license.workspace = true pageserver_api.workspace = true thiserror.workspace = true async-trait.workspace = true -reqwest.workspace = true +reqwest = { workspace = true, features = [ "stream" ] } utils.workspace = true serde.workspace = true workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 48b27775cb..e3ddb446fa 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -9,6 +9,8 @@ use utils::{ lsn::Lsn, }; +pub use reqwest::Body as ReqwestBody; + pub mod util; #[derive(Debug, Clone)] @@ -20,6 +22,9 @@ pub struct Client { #[derive(thiserror::Error, Debug)] pub enum Error { + #[error("send request: {0}")] + 
SendRequest(reqwest::Error), + #[error("receive body: {0}")] ReceiveBody(reqwest::Error), @@ -173,19 +178,30 @@ impl Client { self.request(Method::GET, uri, ()).await } + fn start_request( + &self, + method: Method, + uri: U, + ) -> reqwest::RequestBuilder { + let req = self.client.request(method, uri); + if let Some(value) = &self.authorization_header { + req.header(reqwest::header::AUTHORIZATION, value) + } else { + req + } + } + async fn request_noerror( &self, method: Method, uri: U, body: B, ) -> Result { - let req = self.client.request(method, uri); - let req = if let Some(value) = &self.authorization_header { - req.header(reqwest::header::AUTHORIZATION, value) - } else { - req - }; - req.json(&body).send().await.map_err(Error::ReceiveBody) + self.start_request(method, uri) + .json(&body) + .send() + .await + .map_err(Error::ReceiveBody) } async fn request( @@ -609,4 +625,53 @@ impl Client { }), } } + + pub async fn import_basebackup( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + base_lsn: Lsn, + end_lsn: Lsn, + pg_version: u32, + basebackup_tarball: ReqwestBody, + ) -> Result<()> { + let uri = format!( + "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/import_basebackup?base_lsn={base_lsn}&end_lsn={end_lsn}&pg_version={pg_version}", + self.mgmt_api_endpoint, + ); + self.start_request(Method::PUT, uri) + .body(basebackup_tarball) + .send() + .await + .map_err(Error::SendRequest)? + .error_from_body() + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + + pub async fn import_wal( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + start_lsn: Lsn, + end_lsn: Lsn, + wal_tarball: ReqwestBody, + ) -> Result<()> { + let uri = format!( + "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/import_wal?start_lsn={start_lsn}&end_lsn={end_lsn}", + self.mgmt_api_endpoint, + ); + self.start_request(Method::PUT, uri) + .body(wal_tarball) + .send() + .await + .map_err(Error::SendRequest)? + .error_from_body() + .await? 
+ .json() + .await + .map_err(Error::ReceiveBody) + } } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 2763352a21..9f705f0bc9 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -660,7 +660,6 @@ fn start_pageserver( async move { page_service::libpq_listener_main( tenant_manager, - broker_client, pg_auth, pageserver_listener, conf.pg_auth_type, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 893302b7d6..6f8f3e6389 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -10,6 +10,7 @@ use std::time::Duration; use anyhow::{anyhow, Context, Result}; use enumset::EnumSet; +use futures::StreamExt; use futures::TryFutureExt; use humantime::format_rfc3339; use hyper::header; @@ -44,12 +45,14 @@ use remote_storage::DownloadError; use remote_storage::GenericRemoteStorage; use remote_storage::TimeTravelError; use tenant_size_model::{svg::SvgBranchKind, SizeResult, StorageModel}; +use tokio_util::io::StreamReader; use tokio_util::sync::CancellationToken; use tracing::*; use utils::auth::JwtAuth; use utils::failpoint_support::failpoints_handler; use utils::http::endpoint::prometheus_metrics_handler; use utils::http::endpoint::request_span; +use utils::http::request::must_parse_query_param; use utils::http::request::{get_request_param, must_get_query_param, parse_query_param}; use crate::context::{DownloadBehavior, RequestContext}; @@ -2404,6 +2407,189 @@ async fn post_top_tenants( ) } +async fn put_tenant_timeline_import_basebackup( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let base_lsn: Lsn = must_parse_query_param(&request, "base_lsn")?; + let end_lsn: Lsn = must_parse_query_param(&request, "end_lsn")?; + let pg_version: u32 = must_parse_query_param(&request, "pg_version")?; + + check_permission(&request, Some(tenant_id))?; + + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); + + let span = info_span!("import_basebackup", tenant_id=%tenant_id, timeline_id=%timeline_id, base_lsn=%base_lsn, end_lsn=%end_lsn, pg_version=%pg_version); + async move { + let state = get_state(&request); + let tenant = state + .tenant_manager + .get_attached_tenant_shard(TenantShardId::unsharded(tenant_id))?; + + let broker_client = state.broker_client.clone(); + + let mut body = StreamReader::new(request.into_body().map(|res| { + res.map_err(|error| { + std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error)) + }) + })); + + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + + let timeline = tenant + .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx) + .map_err(ApiError::InternalServerError) + .await?; + + // TODO mark timeline as not ready until it reaches end_lsn. + // We might have some wal to import as well, and we should prevent compute + // from connecting before that and writing conflicting wal. + // + // This is not relevant for pageserver->pageserver migrations, since there's + // no wal to import. But should be fixed if we want to import from postgres. + + // TODO leave clean state on error. For now you can use detach to clean + // up broken state from a failed import. 
+ + // Import basebackup provided via CopyData + info!("importing basebackup"); + + timeline + .import_basebackup_from_tar(tenant.clone(), &mut body, base_lsn, broker_client, &ctx) + .await + .map_err(ApiError::InternalServerError)?; + + // Read the end of the tar archive. + read_tar_eof(body) + .await + .map_err(ApiError::InternalServerError)?; + + // TODO check checksum + // Meanwhile you can verify client-side by taking fullbackup + // and checking that it matches in size with what was imported. + // It wouldn't work if base came from vanilla postgres though, + // since we discard some log files. + + info!("done"); + json_response(StatusCode::OK, ()) + } + .instrument(span) + .await +} + +async fn put_tenant_timeline_import_wal( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let start_lsn: Lsn = must_parse_query_param(&request, "start_lsn")?; + let end_lsn: Lsn = must_parse_query_param(&request, "end_lsn")?; + + check_permission(&request, Some(tenant_id))?; + + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); + + let span = info_span!("import_wal", tenant_id=%tenant_id, timeline_id=%timeline_id, start_lsn=%start_lsn, end_lsn=%end_lsn); + async move { + let state = get_state(&request); + + let timeline = active_timeline_of_active_tenant(&state.tenant_manager, TenantShardId::unsharded(tenant_id), timeline_id).await?; + + let mut body = StreamReader::new(request.into_body().map(|res| { + res.map_err(|error| { + std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error)) + }) + })); + + let last_record_lsn = timeline.get_last_record_lsn(); + if last_record_lsn != start_lsn { + return Err(ApiError::InternalServerError(anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}"))); + } + + // TODO leave clean state on error. For now you can use detach to clean + // up broken state from a failed import. + + // Import wal provided via CopyData + info!("importing wal"); + crate::import_datadir::import_wal_from_tar(&timeline, &mut body, start_lsn, end_lsn, &ctx).await.map_err(ApiError::InternalServerError)?; + info!("wal import complete"); + + // Read the end of the tar archive. + read_tar_eof(body).await.map_err(ApiError::InternalServerError)?; + + // TODO Does it make sense to overshoot? + if timeline.get_last_record_lsn() < end_lsn { + return Err(ApiError::InternalServerError(anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}"))); + } + + // Flush data to disk, then upload to s3. No need for a forced checkpoint. + // We only want to persist the data, and it doesn't matter if it's in the + // shape of deltas or images. + info!("flushing layers"); + timeline.freeze_and_flush().await.map_err(|e| match e { + tenant::timeline::FlushLayerError::Cancelled => ApiError::ShuttingDown, + other => ApiError::InternalServerError(anyhow::anyhow!(other)), + })?; + + info!("done"); + + json_response(StatusCode::OK, ()) + }.instrument(span).await +} + +/// Read the end of a tar archive. +/// +/// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each. +/// `tokio_tar` already read the first such block. Read the second all-zeros block, +/// and check that there is no more data after the EOF marker. 
+/// +/// 'tar' command can also write extra blocks of zeros, up to a record +/// size, controlled by the --record-size argument. Ignore them too. +async fn read_tar_eof(mut reader: (impl tokio::io::AsyncRead + Unpin)) -> anyhow::Result<()> { + use tokio::io::AsyncReadExt; + let mut buf = [0u8; 512]; + + // Read the all-zeros block, and verify it + let mut total_bytes = 0; + while total_bytes < 512 { + let nbytes = reader.read(&mut buf[total_bytes..]).await?; + total_bytes += nbytes; + if nbytes == 0 { + break; + } + } + if total_bytes < 512 { + anyhow::bail!("incomplete or invalid tar EOF marker"); + } + if !buf.iter().all(|&x| x == 0) { + anyhow::bail!("invalid tar EOF marker"); + } + + // Drain any extra zero-blocks after the EOF marker + let mut trailing_bytes = 0; + let mut seen_nonzero_bytes = false; + loop { + let nbytes = reader.read(&mut buf).await?; + trailing_bytes += nbytes; + if !buf.iter().all(|&x| x == 0) { + seen_nonzero_bytes = true; + } + if nbytes == 0 { + break; + } + } + if seen_nonzero_bytes { + anyhow::bail!("unexpected non-zero bytes after the tar archive"); + } + if trailing_bytes % 512 != 0 { + anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive"); + } + Ok(()) +} + /// Common functionality of all the HTTP API handlers. /// /// - Adds a tracing span to each request (by `request_span`) @@ -2698,5 +2884,13 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/perf_info", |r| testing_api_handler("perf_info", r, perf_info), ) + .put( + "/v1/tenant/:tenant_id/timeline/:timeline_id/import_basebackup", + |r| api_handler(r, put_tenant_timeline_import_basebackup), + ) + .put( + "/v1/tenant/:tenant_id/timeline/:timeline_id/import_wal", + |r| api_handler(r, put_tenant_timeline_import_wal), + ) .any(handler_404)) } diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 59b7293631..e67fa656d0 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1473,8 +1473,6 @@ pub(crate) enum ComputeCommandKind { PageStream, Basebackup, Fullbackup, - ImportBasebackup, - ImportWal, LeaseLsn, Show, } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 975c912970..c10c2f2a0f 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -4,9 +4,7 @@ use anyhow::Context; use async_compression::tokio::write::GzipEncoder; use bytes::Buf; -use bytes::Bytes; use futures::stream::FuturesUnordered; -use futures::Stream; use futures::StreamExt; use pageserver_api::key::Key; use pageserver_api::models::TenantState; @@ -28,7 +26,6 @@ use std::borrow::Cow; use std::collections::HashMap; use std::io; use std::net::TcpListener; -use std::pin::pin; use std::str; use std::str::FromStr; use std::sync::Arc; @@ -37,7 +34,6 @@ use std::time::Instant; use std::time::SystemTime; use tokio::io::AsyncWriteExt; use tokio::io::{AsyncRead, AsyncWrite}; -use tokio_util::io::StreamReader; use tokio_util::sync::CancellationToken; use tracing::*; use utils::id::ConnectionId; @@ -53,7 +49,6 @@ use crate::auth::check_permission; use crate::basebackup; use crate::basebackup::BasebackupError; use crate::context::{DownloadBehavior, RequestContext}; -use crate::import_datadir::import_wal_from_tar; use crate::metrics; use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS}; use crate::pgdatadir_mapping::Version; @@ -66,7 +61,6 @@ use crate::tenant::mgr::GetTenantError; use crate::tenant::mgr::ShardResolveResult; use 
crate::tenant::mgr::ShardSelector; use crate::tenant::mgr::TenantManager; -use crate::tenant::timeline::FlushLayerError; use crate::tenant::timeline::WaitLsnError; use crate::tenant::GetTimelineError; use crate::tenant::PageReconstructError; @@ -82,56 +76,6 @@ use postgres_ffi::BLCKSZ; // is not yet in state [`TenantState::Active`]. const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000); -/// Read the end of a tar archive. -/// -/// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each. -/// `tokio_tar` already read the first such block. Read the second all-zeros block, -/// and check that there is no more data after the EOF marker. -/// -/// 'tar' command can also write extra blocks of zeros, up to a record -/// size, controlled by the --record-size argument. Ignore them too. -async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()> { - use tokio::io::AsyncReadExt; - let mut buf = [0u8; 512]; - - // Read the all-zeros block, and verify it - let mut total_bytes = 0; - while total_bytes < 512 { - let nbytes = reader.read(&mut buf[total_bytes..]).await?; - total_bytes += nbytes; - if nbytes == 0 { - break; - } - } - if total_bytes < 512 { - anyhow::bail!("incomplete or invalid tar EOF marker"); - } - if !buf.iter().all(|&x| x == 0) { - anyhow::bail!("invalid tar EOF marker"); - } - - // Drain any extra zero-blocks after the EOF marker - let mut trailing_bytes = 0; - let mut seen_nonzero_bytes = false; - loop { - let nbytes = reader.read(&mut buf).await?; - trailing_bytes += nbytes; - if !buf.iter().all(|&x| x == 0) { - seen_nonzero_bytes = true; - } - if nbytes == 0 { - break; - } - } - if seen_nonzero_bytes { - anyhow::bail!("unexpected non-zero bytes after the tar archive"); - } - if trailing_bytes % 512 != 0 { - anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive"); - } - Ok(()) -} - /////////////////////////////////////////////////////////////////////////////// /// @@ -141,7 +85,6 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<() /// pub async fn libpq_listener_main( tenant_manager: Arc, - broker_client: storage_broker::BrokerClientChannel, auth: Option>, listener: TcpListener, auth_type: AuthType, @@ -186,7 +129,6 @@ pub async fn libpq_listener_main( false, page_service_conn_main( tenant_manager.clone(), - broker_client.clone(), local_auth, socket, auth_type, @@ -209,7 +151,6 @@ pub async fn libpq_listener_main( #[instrument(skip_all, fields(peer_addr))] async fn page_service_conn_main( tenant_manager: Arc, - broker_client: storage_broker::BrokerClientChannel, auth: Option>, socket: tokio::net::TcpStream, auth_type: AuthType, @@ -262,8 +203,7 @@ async fn page_service_conn_main( // and create a child per-query context when it invokes process_query. // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler // and create the per-query context in process_query ourselves. 
- let mut conn_handler = - PageServerHandler::new(tenant_manager, broker_client, auth, connection_ctx); + let mut conn_handler = PageServerHandler::new(tenant_manager, auth, connection_ctx); let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; match pgbackend @@ -294,7 +234,6 @@ struct HandlerTimeline { } struct PageServerHandler { - broker_client: storage_broker::BrokerClientChannel, auth: Option>, claims: Option, @@ -386,13 +325,11 @@ impl From for QueryError { impl PageServerHandler { pub fn new( tenant_manager: Arc, - broker_client: storage_broker::BrokerClientChannel, auth: Option>, connection_ctx: RequestContext, ) -> Self { PageServerHandler { tenant_manager, - broker_client, auth, claims: None, connection_ctx, @@ -475,73 +412,6 @@ impl PageServerHandler { ) } - fn copyin_stream<'a, IO>( - &'a self, - pgb: &'a mut PostgresBackend, - cancel: &'a CancellationToken, - ) -> impl Stream> + 'a - where - IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, - { - async_stream::try_stream! { - loop { - let msg = tokio::select! { - biased; - - _ = cancel.cancelled() => { - // We were requested to shut down. - let msg = "pageserver is shutting down"; - let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None)); - Err(QueryError::Shutdown) - } - - msg = pgb.read_message() => { msg.map_err(QueryError::from)} - }; - - match msg { - Ok(Some(message)) => { - let copy_data_bytes = match message { - FeMessage::CopyData(bytes) => bytes, - FeMessage::CopyDone => { break }, - FeMessage::Sync => continue, - FeMessage::Terminate => { - let msg = "client terminated connection with Terminate message during COPY"; - let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg))); - // error can't happen here, ErrorResponse serialization should be always ok - pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?; - Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?; - break; - } - m => { - let msg = format!("unexpected message {m:?}"); - // error can't happen here, ErrorResponse serialization should be always ok - pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None)).map_err(|e| e.into_io_error())?; - Err(io::Error::new(io::ErrorKind::Other, msg))?; - break; - } - }; - - yield copy_data_bytes; - } - Ok(None) => { - let msg = "client closed connection during COPY"; - let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg))); - // error can't happen here, ErrorResponse serialization should be always ok - pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?; - self.flush_cancellable(pgb, cancel).await.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?; - Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?; - } - Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => { - Err(io_error)?; - } - Err(other) => { - Err(io::Error::new(io::ErrorKind::Other, other.to_string()))?; - } - }; - } - } - } - #[instrument(skip_all)] async fn handle_pagerequests( &mut self, @@ -713,128 +583,6 @@ impl PageServerHandler { Ok(()) } - #[allow(clippy::too_many_arguments)] - #[instrument(skip_all, fields(%base_lsn, end_lsn=%_end_lsn, %pg_version))] - async fn handle_import_basebackup( - &self, - pgb: &mut PostgresBackend, - tenant_id: TenantId, - timeline_id: TimelineId, - base_lsn: Lsn, - 
_end_lsn: Lsn, - pg_version: u32, - ctx: RequestContext, - ) -> Result<(), QueryError> - where - IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, - { - debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); - - // Create empty timeline - info!("creating new timeline"); - let tenant = self - .get_active_tenant_with_timeout(tenant_id, ShardSelector::Zero, ACTIVE_TENANT_TIMEOUT) - .await?; - let timeline = tenant - .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx) - .await?; - - // TODO mark timeline as not ready until it reaches end_lsn. - // We might have some wal to import as well, and we should prevent compute - // from connecting before that and writing conflicting wal. - // - // This is not relevant for pageserver->pageserver migrations, since there's - // no wal to import. But should be fixed if we want to import from postgres. - - // TODO leave clean state on error. For now you can use detach to clean - // up broken state from a failed import. - - // Import basebackup provided via CopyData - info!("importing basebackup"); - pgb.write_message_noflush(&BeMessage::CopyInResponse)?; - self.flush_cancellable(pgb, &tenant.cancel).await?; - - let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &tenant.cancel))); - timeline - .import_basebackup_from_tar( - tenant.clone(), - &mut copyin_reader, - base_lsn, - self.broker_client.clone(), - &ctx, - ) - .await?; - - // Read the end of the tar archive. - read_tar_eof(copyin_reader).await?; - - // TODO check checksum - // Meanwhile you can verify client-side by taking fullbackup - // and checking that it matches in size with what was imported. - // It wouldn't work if base came from vanilla postgres though, - // since we discard some log files. - - info!("done"); - Ok(()) - } - - #[instrument(skip_all, fields(shard_id, %start_lsn, %end_lsn))] - async fn handle_import_wal( - &self, - pgb: &mut PostgresBackend, - tenant_id: TenantId, - timeline_id: TimelineId, - start_lsn: Lsn, - end_lsn: Lsn, - ctx: RequestContext, - ) -> Result<(), QueryError> - where - IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, - { - let timeline = self - .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero) - .await?; - let last_record_lsn = timeline.get_last_record_lsn(); - if last_record_lsn != start_lsn { - return Err(QueryError::Other( - anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}")) - ); - } - - // TODO leave clean state on error. For now you can use detach to clean - // up broken state from a failed import. - - // Import wal provided via CopyData - info!("importing wal"); - pgb.write_message_noflush(&BeMessage::CopyInResponse)?; - self.flush_cancellable(pgb, &timeline.cancel).await?; - let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &timeline.cancel))); - import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?; - info!("wal import complete"); - - // Read the end of the tar archive. - read_tar_eof(copyin_reader).await?; - - // TODO Does it make sense to overshoot? - if timeline.get_last_record_lsn() < end_lsn { - return Err(QueryError::Other( - anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}")) - ); - } - - // Flush data to disk, then upload to s3. No need for a forced checkpoint. - // We only want to persist the data, and it doesn't matter if it's in the - // shape of deltas or images. 
- info!("flushing layers"); - timeline.freeze_and_flush().await.map_err(|e| match e { - FlushLayerError::Cancelled => QueryError::Shutdown, - other => QueryError::Other(other.into()), - })?; - - info!("done"); - Ok(()) - } - /// Helper function to handle the LSN from client request. /// /// Each GetPage (and Exists and Nblocks) request includes information about @@ -1705,109 +1453,6 @@ where ) .await?; pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; - } else if query_string.starts_with("import basebackup ") { - // Import the `base` section (everything but the wal) of a basebackup. - // Assumes the tenant already exists on this pageserver. - // - // Files are scheduled to be persisted to remote storage, and the - // caller should poll the http api to check when that is done. - // - // Example import command: - // 1. Get start/end LSN from backup_manifest file - // 2. Run: - // cat my_backup/base.tar | psql -h $PAGESERVER \ - // -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION" - let params = &parts[2..]; - if params.len() != 5 { - return Err(QueryError::Other(anyhow::anyhow!( - "invalid param number for import basebackup command" - ))); - } - let tenant_id = TenantId::from_str(params[0]) - .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; - let timeline_id = TimelineId::from_str(params[1]) - .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; - let base_lsn = Lsn::from_str(params[2]) - .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?; - let end_lsn = Lsn::from_str(params[3]) - .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?; - let pg_version = u32::from_str(params[4]) - .with_context(|| format!("Failed to parse pg_version from {}", params[4]))?; - - tracing::Span::current() - .record("tenant_id", field::display(tenant_id)) - .record("timeline_id", field::display(timeline_id)); - - self.check_permission(Some(tenant_id))?; - - COMPUTE_COMMANDS_COUNTERS - .for_command(ComputeCommandKind::ImportBasebackup) - .inc(); - - match self - .handle_import_basebackup( - pgb, - tenant_id, - timeline_id, - base_lsn, - end_lsn, - pg_version, - ctx, - ) - .await - { - Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?, - Err(e) => { - error!("error importing base backup between {base_lsn} and {end_lsn}: {e:?}"); - pgb.write_message_noflush(&BeMessage::ErrorResponse( - &e.to_string(), - Some(e.pg_error_code()), - ))? - } - }; - } else if query_string.starts_with("import wal ") { - // Import the `pg_wal` section of a basebackup. - // - // Files are scheduled to be persisted to remote storage, and the - // caller should poll the http api to check when that is done. 
- let params = &parts[2..]; - if params.len() != 4 { - return Err(QueryError::Other(anyhow::anyhow!( - "invalid param number for import wal command" - ))); - } - let tenant_id = TenantId::from_str(params[0]) - .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; - let timeline_id = TimelineId::from_str(params[1]) - .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; - let start_lsn = Lsn::from_str(params[2]) - .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?; - let end_lsn = Lsn::from_str(params[3]) - .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?; - - tracing::Span::current() - .record("tenant_id", field::display(tenant_id)) - .record("timeline_id", field::display(timeline_id)); - - self.check_permission(Some(tenant_id))?; - - COMPUTE_COMMANDS_COUNTERS - .for_command(ComputeCommandKind::ImportWal) - .inc(); - - match self - .handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn, ctx) - .await - { - Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?, - Err(e) => { - error!("error importing WAL between {start_lsn} and {end_lsn}: {e:?}"); - pgb.write_message_noflush(&BeMessage::ErrorResponse( - &e.to_string(), - Some(e.pg_error_code()), - ))? - } - }; } else if query_string.to_ascii_lowercase().starts_with("set ") { // important because psycopg2 executes "SET datestyle TO 'ISO'" // on connect diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index 4d17dff9fe..fff44aaf26 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -226,7 +226,7 @@ impl Node { fn is_fatal(e: &mgmt_api::Error) -> bool { use mgmt_api::Error::*; match e { - ReceiveBody(_) | ReceiveErrorBody(_) => false, + SendRequest(_) | ReceiveBody(_) | ReceiveErrorBody(_) => false, ApiError(StatusCode::SERVICE_UNAVAILABLE, _) | ApiError(StatusCode::GATEWAY_TIMEOUT, _) | ApiError(StatusCode::REQUEST_TIMEOUT, _) => false, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 78f0848c24..aada1939ee 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -151,6 +151,10 @@ struct ServiceState { /// controller API. 
fn passthrough_api_error(node: &Node, e: mgmt_api::Error) -> ApiError { match e { + mgmt_api::Error::SendRequest(e) => { + // Presume errors sending requests are connectivity/availability issues + ApiError::ResourceUnavailable(format!("{node} error sending request: {e}").into()) + } mgmt_api::Error::ReceiveErrorBody(str) => { // Presume errors receiving body are connectivity/availability issues ApiError::ResourceUnavailable( diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index d97e882a70..4dae9176b8 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -88,7 +88,8 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build env.pageserver.allowed_errors.extend( [ - ".*error importing base backup .*", + ".*Failed to import basebackup.*", + ".*unexpected non-zero bytes after the tar archive.*", ".*Timeline got dropped without initializing, cleaning its files.*", ".*InternalServerError.*timeline not found.*", ".*InternalServerError.*Tenant .* not found.*", From fe13fccdc2a0e097bb785edb4ff3913aee35789f Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 10 Jul 2024 09:10:29 +0100 Subject: [PATCH 1160/1571] proxy: pg17 fixes (#8321) ## Problem #7809 - we do not support sslnegotiation=direct #7810 - we do not support negotiating down the protocol extensions. ## Summary of changes 1. Same as postgres, check the first startup packet byte for tls header `0x16`, and check the ALPN. 2. Tell clients using protocol >3.0 to downgrade --- libs/postgres_backend/src/lib.rs | 12 ++- libs/pq_proto/src/framed.rs | 6 +- libs/pq_proto/src/lib.rs | 91 ++++++++++++++---- proxy/src/bin/pg_sni_router.rs | 3 +- proxy/src/config.rs | 12 ++- proxy/src/proxy/handshake.rs | 152 ++++++++++++++++++++++++++----- 6 files changed, 222 insertions(+), 54 deletions(-) diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index c79ee4e053..7c7c6535b3 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -663,11 +663,17 @@ impl PostgresBackend { assert!(self.state < ProtoState::Authentication); let have_tls = self.tls_config.is_some(); match msg { - FeStartupPacket::SslRequest => { + FeStartupPacket::SslRequest { direct } => { debug!("SSL requested"); - self.write_message(&BeMessage::EncryptionResponse(have_tls)) - .await?; + if !direct { + self.write_message(&BeMessage::EncryptionResponse(have_tls)) + .await?; + } else if !have_tls { + return Err(QueryError::Other(anyhow::anyhow!( + "direct SSL negotiation but no TLS support" + ))); + } if have_tls { self.start_tls().await?; diff --git a/libs/pq_proto/src/framed.rs b/libs/pq_proto/src/framed.rs index 6e97b8c2a0..ccbb90e384 100644 --- a/libs/pq_proto/src/framed.rs +++ b/libs/pq_proto/src/framed.rs @@ -44,9 +44,9 @@ impl ConnectionError { /// Wraps async io `stream`, providing messages to write/flush + read Postgres /// messages. 
pub struct Framed { - stream: S, - read_buf: BytesMut, - write_buf: BytesMut, + pub stream: S, + pub read_buf: BytesMut, + pub write_buf: BytesMut, } impl Framed { diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index cee3742017..a01191bd5d 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -39,14 +39,39 @@ pub enum FeMessage { PasswordMessage(Bytes), } +#[derive(Clone, Copy, PartialEq, PartialOrd)] +pub struct ProtocolVersion(u32); + +impl ProtocolVersion { + pub const fn new(major: u16, minor: u16) -> Self { + Self((major as u32) << 16 | minor as u32) + } + pub const fn minor(self) -> u16 { + self.0 as u16 + } + pub const fn major(self) -> u16 { + (self.0 >> 16) as u16 + } +} + +impl fmt::Debug for ProtocolVersion { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_list() + .entry(&self.major()) + .entry(&self.minor()) + .finish() + } +} + #[derive(Debug)] pub enum FeStartupPacket { CancelRequest(CancelKeyData), - SslRequest, + SslRequest { + direct: bool, + }, GssEncRequest, StartupMessage { - major_version: u32, - minor_version: u32, + version: ProtocolVersion, params: StartupMessageParams, }, } @@ -301,11 +326,23 @@ impl FeStartupPacket { /// different from [`FeMessage::parse`] because startup messages don't have /// message type byte; otherwise, its comments apply. pub fn parse(buf: &mut BytesMut) -> Result, ProtocolError> { + /// const MAX_STARTUP_PACKET_LENGTH: usize = 10000; - const RESERVED_INVALID_MAJOR_VERSION: u32 = 1234; - const CANCEL_REQUEST_CODE: u32 = 5678; - const NEGOTIATE_SSL_CODE: u32 = 5679; - const NEGOTIATE_GSS_CODE: u32 = 5680; + const RESERVED_INVALID_MAJOR_VERSION: u16 = 1234; + /// + const CANCEL_REQUEST_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5678); + /// + const NEGOTIATE_SSL_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5679); + /// + const NEGOTIATE_GSS_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5680); + + // + // First byte indicates standard SSL handshake message + // (It can't be a Postgres startup length because in network byte order + // that would be a startup packet hundreds of megabytes long) + if buf.first() == Some(&0x16) { + return Ok(Some(FeStartupPacket::SslRequest { direct: true })); + } // need at least 4 bytes with packet len if buf.len() < 4 { @@ -338,12 +375,10 @@ impl FeStartupPacket { let mut msg = buf.split_to(len).freeze(); msg.advance(4); // consume len - let request_code = msg.get_u32(); - let req_hi = request_code >> 16; - let req_lo = request_code & ((1 << 16) - 1); + let request_code = ProtocolVersion(msg.get_u32()); // StartupMessage, CancelRequest, SSLRequest etc are differentiated by request code. 
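As a sanity check on the new encoding, a small illustrative test (not part of the patch; it assumes it sits in the same module as `ProtocolVersion`, since it peeks at the private `u32`):

```rust
#[test]
fn protocol_version_packing() {
    // The major version lives in the high 16 bits, the minor in the low 16 bits.
    let v = ProtocolVersion::new(3, 0);
    assert_eq!((v.major(), v.minor()), (3, 0));
    // The reserved major 1234 carries the special request codes, e.g.
    // (1234 << 16) | 5679 == 80877103, the long-standing SSLRequest code.
    assert_eq!(ProtocolVersion::new(1234, 5679).0, 80_877_103);
}
```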
- let message = match (req_hi, req_lo) { - (RESERVED_INVALID_MAJOR_VERSION, CANCEL_REQUEST_CODE) => { + let message = match request_code { + CANCEL_REQUEST_CODE => { if msg.remaining() != 8 { return Err(ProtocolError::BadMessage( "CancelRequest message is malformed, backend PID / secret key missing" @@ -355,21 +390,22 @@ impl FeStartupPacket { cancel_key: msg.get_i32(), }) } - (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => { + NEGOTIATE_SSL_CODE => { // Requested upgrade to SSL (aka TLS) - FeStartupPacket::SslRequest + FeStartupPacket::SslRequest { direct: false } } - (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_GSS_CODE) => { + NEGOTIATE_GSS_CODE => { // Requested upgrade to GSSAPI FeStartupPacket::GssEncRequest } - (RESERVED_INVALID_MAJOR_VERSION, unrecognized_code) => { + version if version.major() == RESERVED_INVALID_MAJOR_VERSION => { return Err(ProtocolError::Protocol(format!( - "Unrecognized request code {unrecognized_code}" + "Unrecognized request code {}", + version.minor() ))); } // TODO bail if protocol major_version is not 3? - (major_version, minor_version) => { + version => { // StartupMessage let s = str::from_utf8(&msg).map_err(|_e| { @@ -382,8 +418,7 @@ impl FeStartupPacket { })?; FeStartupPacket::StartupMessage { - major_version, - minor_version, + version, params: StartupMessageParams { params: msg.slice_ref(s.as_bytes()), }, @@ -522,6 +557,10 @@ pub enum BeMessage<'a> { RowDescription(&'a [RowDescriptor<'a>]), XLogData(XLogDataBody<'a>), NoticeResponse(&'a str), + NegotiateProtocolVersion { + version: ProtocolVersion, + options: &'a [&'a str], + }, KeepAlive(WalSndKeepAlive), } @@ -945,6 +984,18 @@ impl<'a> BeMessage<'a> { buf.put_u8(u8::from(req.request_reply)); }); } + + BeMessage::NegotiateProtocolVersion { version, options } => { + buf.put_u8(b'v'); + write_body(buf, |buf| { + buf.put_u32(version.0); + buf.put_u32(options.len() as u32); + for option in options.iter() { + write_cstr(option, buf)?; + } + Ok(()) + })? + } } Ok(()) } diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 44e880838e..d7a3eb9a4d 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -216,10 +216,11 @@ async fn ssl_handshake( use pq_proto::FeStartupPacket::*; match msg { - SslRequest => { + SslRequest { direct: false } => { stream .write_message(&pq_proto::BeMessage::EncryptionResponse(true)) .await?; + // Upgrade raw stream into a secure TLS-backed stream. // NOTE: We've consumed `tls`; this fact will be used later. diff --git a/proxy/src/config.rs b/proxy/src/config.rs index af5511d7ec..6504919760 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -75,6 +75,9 @@ impl TlsConfig { } } +/// +pub const PG_ALPN_PROTOCOL: &[u8] = b"postgresql"; + /// Configure TLS for the main endpoint. 
pub fn configure_tls( key_path: &str, @@ -111,16 +114,17 @@ pub fn configure_tls( let cert_resolver = Arc::new(cert_resolver); // allow TLS 1.2 to be compatible with older client libraries - let config = rustls::ServerConfig::builder_with_protocol_versions(&[ + let mut config = rustls::ServerConfig::builder_with_protocol_versions(&[ &rustls::version::TLS13, &rustls::version::TLS12, ]) .with_no_client_auth() - .with_cert_resolver(cert_resolver.clone()) - .into(); + .with_cert_resolver(cert_resolver.clone()); + + config.alpn_protocols = vec![PG_ALPN_PROTOCOL.to_vec()]; Ok(TlsConfig { - config, + config: Arc::new(config), common_names, cert_resolver, }) diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs index dd935cc245..d488aea927 100644 --- a/proxy/src/proxy/handshake.rs +++ b/proxy/src/proxy/handshake.rs @@ -1,11 +1,17 @@ -use pq_proto::{BeMessage as Be, CancelKeyData, FeStartupPacket, StartupMessageParams}; +use bytes::Buf; +use pq_proto::{ + framed::Framed, BeMessage as Be, CancelKeyData, FeStartupPacket, ProtocolVersion, + StartupMessageParams, +}; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::info; +use tracing::{info, warn}; use crate::{ - config::TlsConfig, + auth::endpoint_sni, + config::{TlsConfig, PG_ALPN_PROTOCOL}, error::ReportableError, + metrics::Metrics, proxy::ERR_INSECURE_CONNECTION, stream::{PqStream, Stream, StreamUpgradeError}, }; @@ -68,6 +74,9 @@ pub async fn handshake( // Client may try upgrading to each protocol only once let (mut tried_ssl, mut tried_gss) = (false, false); + const PG_PROTOCOL_EARLIEST: ProtocolVersion = ProtocolVersion::new(3, 0); + const PG_PROTOCOL_LATEST: ProtocolVersion = ProtocolVersion::new(3, 0); + let mut stream = PqStream::new(Stream::from_raw(stream)); loop { let msg = stream.read_startup_packet().await?; @@ -75,40 +84,96 @@ pub async fn handshake( use FeStartupPacket::*; match msg { - SslRequest => match stream.get_ref() { + SslRequest { direct } => match stream.get_ref() { Stream::Raw { .. } if !tried_ssl => { tried_ssl = true; // We can't perform TLS handshake without a config - let enc = tls.is_some(); - stream.write_message(&Be::EncryptionResponse(enc)).await?; + let have_tls = tls.is_some(); + if !direct { + stream + .write_message(&Be::EncryptionResponse(have_tls)) + .await?; + } else if !have_tls { + return Err(HandshakeError::ProtocolViolation); + } + if let Some(tls) = tls.take() { // Upgrade raw stream into a secure TLS-backed stream. // NOTE: We've consumed `tls`; this fact will be used later. - let (raw, read_buf) = stream.into_inner(); - // TODO: Normally, client doesn't send any data before - // server says TLS handshake is ok and read_buf is empy. - // However, you could imagine pipelining of postgres - // SSLRequest + TLS ClientHello in one hunk similar to - // pipelining in our node js driver. We should probably - // support that by chaining read_buf with the stream. 
+ let Framed { + stream: raw, + read_buf, + write_buf, + } = stream.framed; + + let Stream::Raw { raw } = raw else { + return Err(HandshakeError::StreamUpgradeError( + StreamUpgradeError::AlreadyTls, + )); + }; + + let mut read_buf = read_buf.reader(); + let mut res = Ok(()); + let accept = tokio_rustls::TlsAcceptor::from(tls.to_server_config()) + .accept_with(raw, |session| { + // push the early data to the tls session + while !read_buf.get_ref().is_empty() { + match session.read_tls(&mut read_buf) { + Ok(_) => {} + Err(e) => { + res = Err(e); + break; + } + } + } + }); + + res?; + + let read_buf = read_buf.into_inner(); if !read_buf.is_empty() { return Err(HandshakeError::EarlyData); } - let tls_stream = raw - .upgrade(tls.to_server_config(), record_handshake_error) - .await?; + + let tls_stream = accept.await.inspect_err(|_| { + if record_handshake_error { + Metrics::get().proxy.tls_handshake_failures.inc() + } + })?; + + let conn_info = tls_stream.get_ref().1; + + // check the ALPN, if exists, as required. + match conn_info.alpn_protocol() { + None | Some(PG_ALPN_PROTOCOL) => {} + Some(other) => { + // try parse ep for better error + let ep = conn_info.server_name().and_then(|sni| { + endpoint_sni(sni, &tls.common_names).ok().flatten() + }); + let alpn = String::from_utf8_lossy(other); + warn!(?ep, %alpn, "unexpected ALPN"); + return Err(HandshakeError::ProtocolViolation); + } + } let (_, tls_server_end_point) = tls .cert_resolver - .resolve(tls_stream.get_ref().1.server_name()) + .resolve(conn_info.server_name()) .ok_or(HandshakeError::MissingCertificate)?; - stream = PqStream::new(Stream::Tls { - tls: Box::new(tls_stream), - tls_server_end_point, - }); + stream = PqStream { + framed: Framed { + stream: Stream::Tls { + tls: Box::new(tls_stream), + tls_server_end_point, + }, + read_buf, + write_buf, + }, + }; } } _ => return Err(HandshakeError::ProtocolViolation), @@ -122,7 +187,9 @@ pub async fn handshake( } _ => return Err(HandshakeError::ProtocolViolation), }, - StartupMessage { params, .. } => { + StartupMessage { params, version } + if PG_PROTOCOL_EARLIEST <= version && version <= PG_PROTOCOL_LATEST => + { // Check that the config has been consumed during upgrade // OR we didn't provide it at all (for dev purposes). if tls.is_some() { @@ -131,9 +198,48 @@ pub async fn handshake( .await?; } - info!(session_type = "normal", "successful handshake"); + info!(?version, session_type = "normal", "successful handshake"); break Ok(HandshakeData::Startup(stream, params)); } + // downgrade protocol version + StartupMessage { params, version } + if version.major() == 3 && version > PG_PROTOCOL_LATEST => + { + warn!(?version, "unsupported minor version"); + + // no protocol extensions are supported. + // + let mut unsupported = vec![]; + for (k, _) in params.iter() { + if k.starts_with("_pq_.") { + unsupported.push(k); + } + } + + // TODO: remove unsupported options so we don't send them to compute. + + stream + .write_message(&Be::NegotiateProtocolVersion { + version: PG_PROTOCOL_LATEST, + options: &unsupported, + }) + .await?; + + info!( + ?version, + session_type = "normal", + "successful handshake; unsupported minor version requested" + ); + break Ok(HandshakeData::Startup(stream, params)); + } + StartupMessage { version, .. 
} => { + warn!( + ?version, + session_type = "normal", + "unsuccessful handshake; unsupported version" + ); + return Err(HandshakeError::ProtocolViolation); + } CancelRequest(cancel_key_data) => { info!(session_type = "cancellation", "successful handshake"); break Ok(HandshakeData::Cancel(cancel_key_data)); From e89ec55ea571c1f7ca0d722cd2ade07b6c2753cb Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 10 Jul 2024 14:14:10 +0100 Subject: [PATCH 1161/1571] tests: stabilize test_sharding_split_compaction (#8318) ## Problem This test incorrectly assumed that a post-split compaction would only drop content. This was easily destabilized by any changes to image generation rules. ## Summary of changes - Before split, do a full image layer generation pass, to guarantee that post-split compaction should only drop data, never create it. - Fix the force_image_layer_creation mode of compaction that we use from tests like this: previously it would try and generate image layers even if one already existed with the same layer key, which caused compaction to fail. --- pageserver/src/tenant/timeline.rs | 19 ++++++++++++++++++- .../src/tenant/timeline/layer_manager.rs | 8 ++++++++ test_runner/regress/test_sharding.py | 6 ++++++ 3 files changed, 32 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 541704e8d6..762e903bf8 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -66,12 +66,12 @@ use std::{ ops::{Deref, Range}, }; -use crate::pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS; use crate::{ aux_file::AuxFileSizeEstimator, tenant::{ layer_map::{LayerMap, SearchResult}, metadata::TimelineMetadata, + storage_layer::PersistentLayerDesc, }, }; use crate::{ @@ -98,6 +98,7 @@ use crate::{ metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize, }; use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind}; +use crate::{pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::storage_layer::PersistentLayerKey}; use crate::{ pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind}, virtual_file::{MaybeFatalIo, VirtualFile}, @@ -4572,6 +4573,22 @@ impl Timeline { start = img_range.end; continue; } + } else if let ImageLayerCreationMode::Force = mode { + // When forced to create image layers, we might try and create them where they already + // exist. This mode is only used in tests/debug. 
+ let layers = self.layers.read().await; + if layers.contains_key(&PersistentLayerKey { + key_range: img_range.clone(), + lsn_range: PersistentLayerDesc::image_layer_lsn_range(lsn), + is_delta: false, + }) { + tracing::info!( + "Skipping image layer at {lsn} {}..{}, already exists", + img_range.start, + img_range.end + ); + continue; + } } let image_layer_writer = ImageLayerWriter::new( diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 948237e06a..a43ff873ac 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -339,6 +339,10 @@ impl LayerManager { self.layer_fmgr.contains(layer) } + pub(crate) fn contains_key(&self, key: &PersistentLayerKey) -> bool { + self.layer_fmgr.contains_key(key) + } + pub(crate) fn all_persistent_layers(&self) -> Vec { self.layer_fmgr.0.keys().cloned().collect_vec() } @@ -363,6 +367,10 @@ impl LayerFileManager { .clone() } + fn contains_key(&self, key: &PersistentLayerKey) -> bool { + self.0.contains_key(key) + } + pub(crate) fn insert(&mut self, layer: T) { let present = self.0.insert(layer.layer_desc().key(), layer.clone()); if present.is_some() && cfg!(debug_assertions) { diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index d414f986e6..4471237900 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -225,6 +225,12 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder, failpoint: workload.validate() workload.stop() + # Do a full image layer generation before splitting, so that when we compact after splitting + # we should only see sizes decrease (from post-split drops/rewrites), not increase (from image layer generation) + env.get_tenant_pageserver(tenant_id).http_client().timeline_compact( + tenant_id, timeline_id, force_image_layer_creation=True, wait_until_uploaded=True + ) + # Split one shard into two shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=2) From 1afab13ccb95ed083397c5bff1e31ae1631b1091 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 10 Jul 2024 15:05:25 +0100 Subject: [PATCH 1162/1571] proxy: remove some trace logs (#8334) --- proxy/src/http.rs | 41 +---------------------------------------- proxy/src/logging.rs | 3 ++- 2 files changed, 3 insertions(+), 41 deletions(-) diff --git a/proxy/src/http.rs b/proxy/src/http.rs index fc7400869f..dd7164181d 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -4,14 +4,11 @@ pub mod health_server; -use std::{str::FromStr, sync::Arc, time::Duration}; +use std::time::Duration; -use futures::FutureExt; pub use reqwest::{Request, Response, StatusCode}; pub use reqwest_middleware::{ClientWithMiddleware, Error}; pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; -use tokio::time::Instant; -use tracing::trace; use crate::{ metrics::{ConsoleRequest, Metrics}, @@ -24,8 +21,6 @@ use reqwest_middleware::RequestBuilder; /// We deliberately don't want to replace this with a public static. 
pub fn new_client() -> ClientWithMiddleware { let client = reqwest::ClientBuilder::new() - .dns_resolver(Arc::new(GaiResolver::default())) - .connection_verbose(true) .build() .expect("Failed to create http client"); @@ -36,8 +31,6 @@ pub fn new_client() -> ClientWithMiddleware { pub fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware { let timeout_client = reqwest::ClientBuilder::new() - .dns_resolver(Arc::new(GaiResolver::default())) - .connection_verbose(true) .timeout(default_timout) .build() .expect("Failed to create http client with timeout"); @@ -103,38 +96,6 @@ impl Endpoint { } } -use hyper_util::client::legacy::connect::dns::{ - GaiResolver as HyperGaiResolver, Name as HyperName, -}; -use reqwest::dns::{Addrs, Name, Resolve, Resolving}; -/// https://docs.rs/reqwest/0.11.18/src/reqwest/dns/gai.rs.html -use tower_service::Service; -#[derive(Debug)] -pub struct GaiResolver(HyperGaiResolver); - -impl Default for GaiResolver { - fn default() -> Self { - Self(HyperGaiResolver::new()) - } -} - -impl Resolve for GaiResolver { - fn resolve(&self, name: Name) -> Resolving { - let this = &mut self.0.clone(); - let hyper_name = HyperName::from_str(name.as_str()).expect("name should be valid"); - let start = Instant::now(); - Box::pin( - Service::::call(this, hyper_name).map(move |result| { - let resolve_duration = start.elapsed(); - trace!(duration = ?resolve_duration, addr = %name.as_str(), "resolve host complete"); - result - .map(|addrs| -> Addrs { Box::new(addrs) }) - .map_err(|err| -> Box { Box::new(err) }) - }), - ) - } -} - #[cfg(test)] mod tests { use super::*; diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index 3405b8cbc6..3b30ad8b46 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -15,7 +15,8 @@ use tracing_subscriber::{ pub async fn init() -> anyhow::Result { let env_filter = EnvFilter::builder() .with_default_directive(LevelFilter::INFO.into()) - .from_env_lossy(); + .from_env_lossy() + .add_directive("azure_core::policies::transport=off".parse().unwrap()); let fmt_layer = tracing_subscriber::fmt::layer() .with_ansi(false) From 98387d6fb1a125a5e9676534cb46dca88e3252fd Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 9 Jul 2024 18:12:57 +0000 Subject: [PATCH 1163/1571] build(deps-dev): bump zipp from 3.8.1 to 3.19.1 Bumps [zipp](https://github.com/jaraco/zipp) from 3.8.1 to 3.19.1. - [Release notes](https://github.com/jaraco/zipp/releases) - [Changelog](https://github.com/jaraco/zipp/blob/main/NEWS.rst) - [Commits](https://github.com/jaraco/zipp/compare/v3.8.1...v3.19.1) --- updated-dependencies: - dependency-name: zipp dependency-type: indirect ... 
Signed-off-by: dependabot[bot] --- poetry.lock | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/poetry.lock b/poetry.lock index bf16aaf55d..8091141411 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3133,18 +3133,18 @@ multidict = ">=4.0" [[package]] name = "zipp" -version = "3.8.1" +version = "3.19.1" description = "Backport of pathlib-compatible object wrapper for zip files" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "zipp-3.8.1-py3-none-any.whl", hash = "sha256:47c40d7fe183a6f21403a199b3e4192cca5774656965b0a4988ad2f8feb5f009"}, - {file = "zipp-3.8.1.tar.gz", hash = "sha256:05b45f1ee8f807d0cc928485ca40a07cb491cf092ff587c0df9cb1fd154848d2"}, + {file = "zipp-3.19.1-py3-none-any.whl", hash = "sha256:2828e64edb5386ea6a52e7ba7cdb17bb30a73a858f5eb6eb93d8d36f5ea26091"}, + {file = "zipp-3.19.1.tar.gz", hash = "sha256:35427f6d5594f4acf82d25541438348c26736fa9b3afa2754bcd63cdb99d8e8f"}, ] [package.extras] -docs = ["jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx"] -testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] +doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +test = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy", "pytest-ruff (>=0.2.1)"] [[package]] name = "zstandard" From e78341e1c220625d9bfa3f08632bd5cfb8e6a876 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 10 Jul 2024 18:09:19 +0200 Subject: [PATCH 1164/1571] Remove ImageCompressionAlgorithm::DisabledNoDecompress (#8300) Removes the `ImageCompressionAlgorithm::DisabledNoDecompress` variant. We now assume any blob with the specific bits set is actually a compressed blob. The `ImageCompressionAlgorithm::Disabled` variant still remains and is the new default. Reverts large parts of #8238 , as originally intended in that PR. Part of #5431 --- libs/pageserver_api/src/models.rs | 14 ---------- pageserver/src/config.rs | 2 +- pageserver/src/tenant/blob_io.rs | 11 ++------ pageserver/src/tenant/block_io.rs | 10 +------ .../src/tenant/storage_layer/delta_layer.rs | 2 +- .../src/tenant/storage_layer/image_layer.rs | 28 ++++++------------- pageserver/src/tenant/storage_layer/layer.rs | 1 - 7 files changed, 15 insertions(+), 53 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 49c942938d..d360cc6e87 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -440,9 +440,6 @@ pub enum CompactionAlgorithm { #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] pub enum ImageCompressionAlgorithm { - /// Disabled for writes, and never decompress during reading. - /// Never set this after you've enabled compression once! - DisabledNoDecompress, // Disabled for writes, support decompressing during read path Disabled, /// Zstandard compression. Level 0 means and None mean the same (default level). Levels can be negative as well. 
@@ -452,12 +449,6 @@ pub enum ImageCompressionAlgorithm { }, } -impl ImageCompressionAlgorithm { - pub fn allow_decompression(&self) -> bool { - !matches!(self, ImageCompressionAlgorithm::DisabledNoDecompress) - } -} - impl FromStr for ImageCompressionAlgorithm { type Err = anyhow::Error; fn from_str(s: &str) -> Result { @@ -466,7 +457,6 @@ impl FromStr for ImageCompressionAlgorithm { .next() .ok_or_else(|| anyhow::anyhow!("empty string"))?; match first { - "disabled-no-decompress" => Ok(ImageCompressionAlgorithm::DisabledNoDecompress), "disabled" => Ok(ImageCompressionAlgorithm::Disabled), "zstd" => { let level = if let Some(v) = components.next() { @@ -1683,10 +1673,6 @@ mod tests { ImageCompressionAlgorithm::from_str("disabled").unwrap(), Disabled ); - assert_eq!( - ImageCompressionAlgorithm::from_str("disabled-no-decompress").unwrap(), - DisabledNoDecompress - ); assert_eq!( ImageCompressionAlgorithm::from_str("zstd").unwrap(), Zstd { level: None } diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index b7c9af2244..17bc427b2c 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -92,7 +92,7 @@ pub mod defaults { pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm = - ImageCompressionAlgorithm::DisabledNoDecompress; + ImageCompressionAlgorithm::Disabled; pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true; diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 0705182d5d..e98ed66ef9 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -273,12 +273,8 @@ impl BlobWriter { srcbuf: B, ctx: &RequestContext, ) -> (B::Buf, Result) { - self.write_blob_maybe_compressed( - srcbuf, - ctx, - ImageCompressionAlgorithm::DisabledNoDecompress, - ) - .await + self.write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled) + .await } /// Write a blob of data. 
Returns the offset that it was written to, @@ -340,8 +336,7 @@ impl BlobWriter { (BYTE_UNCOMPRESSED, len, slice.into_inner()) } } - ImageCompressionAlgorithm::Disabled - | ImageCompressionAlgorithm::DisabledNoDecompress => { + ImageCompressionAlgorithm::Disabled => { (BYTE_UNCOMPRESSED, len, srcbuf.slice_full().into_inner()) } }; diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index 3324e840ec..601b095155 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -202,18 +202,10 @@ pub struct FileBlockReader<'a> { impl<'a> FileBlockReader<'a> { pub fn new(file: &'a VirtualFile, file_id: FileId) -> Self { - Self::new_with_compression(file, file_id, false) - } - - pub fn new_with_compression( - file: &'a VirtualFile, - file_id: FileId, - compressed_reads: bool, - ) -> Self { FileBlockReader { file_id, file, - compressed_reads, + compressed_reads: true, } } diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 685f6dce60..000076d7c0 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -453,7 +453,7 @@ impl DeltaLayerWriterInner { ) -> (Vec, anyhow::Result<()>) { assert!(self.lsn_range.start <= lsn); // We don't want to use compression in delta layer creation - let compression = ImageCompressionAlgorithm::DisabledNoDecompress; + let compression = ImageCompressionAlgorithm::Disabled; let (val, res) = self .blob_writer .write_blob_maybe_compressed(val, ctx, compression) diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 4a1b3a0237..50aacbd9ad 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -165,7 +165,6 @@ pub struct ImageLayerInner { file_id: FileId, max_vectored_read_bytes: Option, - compressed_reads: bool, } impl std::fmt::Debug for ImageLayerInner { @@ -179,8 +178,7 @@ impl std::fmt::Debug for ImageLayerInner { impl ImageLayerInner { pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> { - let block_reader = - FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads); + let block_reader = FileBlockReader::new(&self.file, self.file_id); let tree_reader = DiskBtreeReader::<_, KEY_SIZE>::new( self.index_start_blk, self.index_root_blk, @@ -268,10 +266,9 @@ impl ImageLayer { async fn load_inner(&self, ctx: &RequestContext) -> Result { let path = self.path(); - let loaded = - ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, false, ctx) - .await - .and_then(|res| res)?; + let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, ctx) + .await + .and_then(|res| res)?; // not production code let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap(); @@ -380,7 +377,6 @@ impl ImageLayerInner { lsn: Lsn, summary: Option, max_vectored_read_bytes: Option, - support_compressed_reads: bool, ctx: &RequestContext, ) -> Result, anyhow::Error> { let file = match VirtualFile::open(path, ctx).await { @@ -424,7 +420,6 @@ impl ImageLayerInner { file, file_id, max_vectored_read_bytes, - compressed_reads: support_compressed_reads, key_range: actual_summary.key_range, })) } @@ -435,8 +430,7 @@ impl ImageLayerInner { reconstruct_state: &mut ValueReconstructState, ctx: &RequestContext, ) -> anyhow::Result { - let block_reader = - 
FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads); + let block_reader = FileBlockReader::new(&self.file, self.file_id); let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader); @@ -496,14 +490,12 @@ impl ImageLayerInner { &self, ctx: &RequestContext, ) -> anyhow::Result> { - let block_reader = - FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads); + let block_reader = FileBlockReader::new(&self.file, self.file_id); let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader); let mut result = Vec::new(); let mut stream = Box::pin(tree_reader.into_stream(&[0; KEY_SIZE], ctx)); - let block_reader = - FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads); + let block_reader = FileBlockReader::new(&self.file, self.file_id); let cursor = block_reader.block_cursor(); while let Some(item) = stream.next().await { // TODO: dedup code with get_reconstruct_value @@ -538,8 +530,7 @@ impl ImageLayerInner { .into(), ); - let block_reader = - FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads); + let block_reader = FileBlockReader::new(&self.file, self.file_id); let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader); @@ -700,8 +691,7 @@ impl ImageLayerInner { #[cfg(test)] pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> ImageLayerIterator<'a> { - let block_reader = - FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads); + let block_reader = FileBlockReader::new(&self.file, self.file_id); let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader); ImageLayerIterator { diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index afd11780e7..02069c29d2 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1685,7 +1685,6 @@ impl DownloadedLayer { lsn, summary, Some(owner.conf.max_vectored_read_bytes), - owner.conf.image_compression.allow_decompression(), ctx, ) .await From 9f4511c5545e86a492966abb4887bcac22fd01d4 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 10 Jul 2024 14:11:27 -0400 Subject: [PATCH 1165/1571] feat(pageserver): add k-merge layer iterator with lazy loading (#8053) Part of https://github.com/neondatabase/neon/issues/8002. This pull request adds a k-merge iterator for bottom-most compaction. ## Summary of changes * Added back lsn_range / key_range in delta layer inner. This was removed due to https://github.com/neondatabase/neon/pull/8050, but added back because iterators need that information to process lazy loading. * Added lazy-loading k-merge iterator. * Added iterator wrapper as a unified iterator type for image+delta iterator. The current status and test should cover the use case for L0 compaction so that the L0 compaction process can bypass page cache and have a fixed amount of memory usage. The next step is to integrate this with the new bottom-most compaction. 
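For orientation, here is a minimal usage sketch of the new iterator, mirroring the `merge_in_between` test added in this patch. It assumes an async test context in which the `resident_layer_*` handles and `ctx` already exist, exactly as in that test:

```rust
// Build a k-merge over two delta layers and drain it in (key, lsn) order.
// Layers are loaded lazily: a layer's real iterator is only created once the heap
// needs to look past the (key, lsn) lower bound advertised in its metadata.
let mut merge_iter = MergeIterator::create(
    &[
        resident_layer_1.get_as_delta(&ctx).await.unwrap(),
        resident_layer_2.get_as_delta(&ctx).await.unwrap(),
    ],
    &[], // no image layers in this example
    &ctx,
);
while let Some((key, lsn, value)) = merge_iter.next().await.unwrap() {
    println!("{key:?} @ {lsn:?}: {value:?}");
}
```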
--------- Signed-off-by: Alex Chi Z Co-authored-by: Christian Schwarz --- pageserver/src/tenant/storage_layer.rs | 3 + .../src/tenant/storage_layer/delta_layer.rs | 30 +- .../src/tenant/storage_layer/image_layer.rs | 10 + .../tenant/storage_layer/merge_iterator.rs | 412 ++++++++++++++++++ 4 files changed, 452 insertions(+), 3 deletions(-) create mode 100644 pageserver/src/tenant/storage_layer/merge_iterator.rs diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 9607546ce0..62730f88b2 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -7,6 +7,9 @@ pub(crate) mod layer; mod layer_desc; mod layer_name; +#[cfg(test)] +pub mod merge_iterator; + use crate::context::{AccessStatsBehavior, RequestContext}; use crate::repository::Value; use crate::task_mgr::TaskKind; diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 000076d7c0..dfd0196c87 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -223,6 +223,11 @@ pub struct DeltaLayerInner { file: VirtualFile, file_id: FileId, + #[allow(dead_code)] + layer_key_range: Range, + #[allow(dead_code)] + layer_lsn_range: Range, + max_vectored_read_bytes: Option, } @@ -742,6 +747,16 @@ impl DeltaLayer { } impl DeltaLayerInner { + #[cfg(test)] + pub(crate) fn key_range(&self) -> &Range { + &self.layer_key_range + } + + #[cfg(test)] + pub(crate) fn lsn_range(&self) -> &Range { + &self.layer_lsn_range + } + /// Returns nested result following Result, Critical>: /// - inner has the success or transient failure /// - outer has the permanent failure @@ -790,6 +805,8 @@ impl DeltaLayerInner { index_start_blk: actual_summary.index_start_blk, index_root_blk: actual_summary.index_root_blk, max_vectored_read_bytes, + layer_key_range: actual_summary.key_range, + layer_lsn_range: actual_summary.lsn_range, })) } @@ -1639,7 +1656,7 @@ impl<'a> DeltaLayerIterator<'a> { } #[cfg(test)] -mod test { +pub(crate) mod test { use std::collections::BTreeMap; use itertools::MinMaxResult; @@ -2217,13 +2234,20 @@ mod test { } } - async fn produce_delta_layer( + pub(crate) fn sort_delta( + (k1, l1, _): &(Key, Lsn, Value), + (k2, l2, _): &(Key, Lsn, Value), + ) -> std::cmp::Ordering { + (k1, l1).cmp(&(k2, l2)) + } + + pub(crate) async fn produce_delta_layer( tenant: &Tenant, tline: &Arc, mut deltas: Vec<(Key, Lsn, Value)>, ctx: &RequestContext, ) -> anyhow::Result { - deltas.sort_by(|(k1, l1, _), (k2, l2, _)| (k1, l1).cmp(&(k2, l2))); + deltas.sort_by(sort_delta); let (key_start, _, _) = deltas.first().unwrap(); let (key_max, _, _) = deltas.first().unwrap(); let lsn_min = deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap(); diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 50aacbd9ad..1e03e1a58c 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -369,6 +369,16 @@ impl ImageLayer { } impl ImageLayerInner { + #[cfg(test)] + pub(crate) fn key_range(&self) -> &Range { + &self.key_range + } + + #[cfg(test)] + pub(crate) fn lsn(&self) -> Lsn { + self.lsn + } + /// Returns nested result following Result, Critical>: /// - inner has the success or transient failure /// - outer has the permanent failure diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs 
b/pageserver/src/tenant/storage_layer/merge_iterator.rs new file mode 100644 index 0000000000..36386c87c9 --- /dev/null +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -0,0 +1,412 @@ +use std::{ + cmp::Ordering, + collections::{binary_heap, BinaryHeap}, +}; + +use pageserver_api::key::Key; +use utils::lsn::Lsn; + +use crate::{context::RequestContext, repository::Value}; + +use super::{ + delta_layer::{DeltaLayerInner, DeltaLayerIterator}, + image_layer::{ImageLayerInner, ImageLayerIterator}, +}; + +#[derive(Clone, Copy)] +enum LayerRef<'a> { + Image(&'a ImageLayerInner), + Delta(&'a DeltaLayerInner), +} + +impl<'a> LayerRef<'a> { + fn iter(self, ctx: &'a RequestContext) -> LayerIterRef<'a> { + match self { + Self::Image(x) => LayerIterRef::Image(x.iter(ctx)), + Self::Delta(x) => LayerIterRef::Delta(x.iter(ctx)), + } + } +} + +enum LayerIterRef<'a> { + Image(ImageLayerIterator<'a>), + Delta(DeltaLayerIterator<'a>), +} + +impl LayerIterRef<'_> { + async fn next(&mut self) -> anyhow::Result> { + match self { + Self::Delta(x) => x.next().await, + Self::Image(x) => x.next().await, + } + } +} + +/// This type plays several roles at once +/// 1. Unified iterator for image and delta layers. +/// 2. `Ord` for use in [`MergeIterator::heap`] (for the k-merge). +/// 3. Lazy creation of the real delta/image iterator. +enum IteratorWrapper<'a> { + NotLoaded { + ctx: &'a RequestContext, + first_key_lower_bound: (Key, Lsn), + layer: LayerRef<'a>, + }, + Loaded { + iter: PeekableLayerIterRef<'a>, + }, +} + +struct PeekableLayerIterRef<'a> { + iter: LayerIterRef<'a>, + peeked: Option<(Key, Lsn, Value)>, // None == end +} + +impl<'a> PeekableLayerIterRef<'a> { + async fn create(mut iter: LayerIterRef<'a>) -> anyhow::Result { + let peeked = iter.next().await?; + Ok(Self { iter, peeked }) + } + + fn peek(&self) -> &Option<(Key, Lsn, Value)> { + &self.peeked + } + + async fn next(&mut self) -> anyhow::Result> { + let result = self.peeked.take(); + self.peeked = self.iter.next().await?; + Ok(result) + } +} + +impl<'a> std::cmp::PartialEq for IteratorWrapper<'a> { + fn eq(&self, other: &Self) -> bool { + self.cmp(other) == Ordering::Equal + } +} + +impl<'a> std::cmp::Eq for IteratorWrapper<'a> {} + +impl<'a> std::cmp::PartialOrd for IteratorWrapper<'a> { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl<'a> std::cmp::Ord for IteratorWrapper<'a> { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + use std::cmp::Ordering; + let a = self.peek_next_key_lsn(); + let b = other.peek_next_key_lsn(); + match (a, b) { + (Some((k1, l1)), Some((k2, l2))) => { + let loaded_1 = if self.is_loaded() { 1 } else { 0 }; + let loaded_2 = if other.is_loaded() { 1 } else { 0 }; + // When key_lsn are the same, the unloaded iter will always appear before the loaded one. + // And note that we do a reverse at the end of the comparison, so it works with the max heap. 
+ (k1, l1, loaded_1).cmp(&(k2, l2, loaded_2)) + } + (Some(_), None) => Ordering::Less, + (None, Some(_)) => Ordering::Greater, + (None, None) => Ordering::Equal, + } + .reverse() + } +} + +impl<'a> IteratorWrapper<'a> { + pub fn create_from_image_layer( + image_layer: &'a ImageLayerInner, + ctx: &'a RequestContext, + ) -> Self { + Self::NotLoaded { + layer: LayerRef::Image(image_layer), + first_key_lower_bound: (image_layer.key_range().start, image_layer.lsn()), + ctx, + } + } + + pub fn create_from_delta_layer( + delta_layer: &'a DeltaLayerInner, + ctx: &'a RequestContext, + ) -> Self { + Self::NotLoaded { + layer: LayerRef::Delta(delta_layer), + first_key_lower_bound: (delta_layer.key_range().start, delta_layer.lsn_range().start), + ctx, + } + } + + fn peek_next_key_lsn(&self) -> Option<(&Key, Lsn)> { + match self { + Self::Loaded { iter } => iter.peek().as_ref().map(|(key, lsn, _)| (key, *lsn)), + Self::NotLoaded { + first_key_lower_bound: (key, lsn), + .. + } => Some((key, *lsn)), + } + } + + // CORRECTNESS: this function must always take `&mut self`, never `&self`. + // + // The reason is that `impl Ord for Self` evaluates differently after this function + // returns. We're called through a `PeekMut::deref_mut`, which causes heap repair when + // the PeekMut gets returned. So, it's critical that we actually run through `PeekMut::deref_mut` + // and not just `PeekMut::deref` + // If we don't take `&mut self` + async fn load(&mut self) -> anyhow::Result<()> { + assert!(!self.is_loaded()); + let Self::NotLoaded { + ctx, + first_key_lower_bound, + layer, + } = self + else { + unreachable!() + }; + let iter = layer.iter(ctx); + let iter = PeekableLayerIterRef::create(iter).await?; + if let Some((k1, l1, _)) = iter.peek() { + let (k2, l2) = first_key_lower_bound; + debug_assert!((k1, l1) >= (k2, l2)); + } + *self = Self::Loaded { iter }; + Ok(()) + } + + fn is_loaded(&self) -> bool { + matches!(self, Self::Loaded { .. }) + } + + /// Correctness: must load the iterator before using. + /// + /// Given this iterator wrapper is private to the merge iterator, users won't be able to mis-use it. + /// The public interfaces to use are [`crate::tenant::storage_layer::delta_layer::DeltaLayerIterator`] and + /// [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`]. + async fn next(&mut self) -> anyhow::Result> { + let Self::Loaded { iter } = self else { + panic!("must load the iterator before using") + }; + iter.next().await + } +} + +pub struct MergeIterator<'a> { + heap: BinaryHeap>, +} + +impl<'a> MergeIterator<'a> { + pub fn create( + deltas: &[&'a DeltaLayerInner], + images: &[&'a ImageLayerInner], + ctx: &'a RequestContext, + ) -> Self { + let mut heap = Vec::with_capacity(images.len() + deltas.len()); + for image in images { + heap.push(IteratorWrapper::create_from_image_layer(image, ctx)); + } + for delta in deltas { + heap.push(IteratorWrapper::create_from_delta_layer(delta, ctx)); + } + Self { + heap: BinaryHeap::from(heap), + } + } + + pub async fn next(&mut self) -> anyhow::Result> { + while let Some(mut iter) = self.heap.peek_mut() { + if !iter.is_loaded() { + // Once we load the iterator, we can know the real first key-value pair in the iterator. + // We put it back into the heap so that a potentially unloaded layer may have a key between + // [potential_first_key, loaded_first_key). + iter.load().await?; + continue; + } + let Some(item) = iter.next().await? else { + // If the iterator returns None, we pop this iterator. 
Actually, in the current implementation, + // we order None > Some, and all the rest of the iterators should return None. + binary_heap::PeekMut::pop(iter); + continue; + }; + return Ok(Some(item)); + } + Ok(None) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use itertools::Itertools; + use pageserver_api::key::Key; + use utils::lsn::Lsn; + + use crate::{ + tenant::{ + harness::{TenantHarness, TIMELINE_ID}, + storage_layer::delta_layer::test::{produce_delta_layer, sort_delta}, + }, + DEFAULT_PG_VERSION, + }; + + async fn assert_merge_iter_equal( + merge_iter: &mut MergeIterator<'_>, + expect: &[(Key, Lsn, Value)], + ) { + let mut expect_iter = expect.iter(); + loop { + let o1 = merge_iter.next().await.unwrap(); + let o2 = expect_iter.next(); + assert_eq!(o1.is_some(), o2.is_some()); + if o1.is_none() && o2.is_none() { + break; + } + let (k1, l1, v1) = o1.unwrap(); + let (k2, l2, v2) = o2.unwrap(); + assert_eq!(&k1, k2); + assert_eq!(l1, *l2); + assert_eq!(&v1, v2); + } + } + + #[tokio::test] + async fn merge_in_between() { + use crate::repository::Value; + use bytes::Bytes; + + let harness = TenantHarness::create("merge_iterator_delta_merge").unwrap(); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + fn get_key(id: u32) -> Key { + let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + let test_deltas1 = vec![ + ( + get_key(0), + Lsn(0x10), + Value::Image(Bytes::copy_from_slice(b"test")), + ), + ( + get_key(5), + Lsn(0x10), + Value::Image(Bytes::copy_from_slice(b"test")), + ), + ]; + let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx) + .await + .unwrap(); + let test_deltas2 = vec![ + ( + get_key(3), + Lsn(0x10), + Value::Image(Bytes::copy_from_slice(b"test")), + ), + ( + get_key(4), + Lsn(0x10), + Value::Image(Bytes::copy_from_slice(b"test")), + ), + ]; + let resident_layer_2 = produce_delta_layer(&tenant, &tline, test_deltas2.clone(), &ctx) + .await + .unwrap(); + let mut merge_iter = MergeIterator::create( + &[ + resident_layer_2.get_as_delta(&ctx).await.unwrap(), + resident_layer_1.get_as_delta(&ctx).await.unwrap(), + ], + &[], + &ctx, + ); + let mut expect = Vec::new(); + expect.extend(test_deltas1); + expect.extend(test_deltas2); + expect.sort_by(sort_delta); + assert_merge_iter_equal(&mut merge_iter, &expect).await; + } + + #[tokio::test] + async fn delta_merge() { + use crate::repository::Value; + use bytes::Bytes; + + let harness = TenantHarness::create("merge_iterator_delta_merge").unwrap(); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + fn get_key(id: u32) -> Key { + let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + const N: usize = 1000; + let test_deltas1 = (0..N) + .map(|idx| { + ( + get_key(idx as u32 / 10), + Lsn(0x20 * ((idx as u64) % 10 + 1)), + Value::Image(Bytes::from(format!("img{idx:05}"))), + ) + }) + .collect_vec(); + let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx) + .await + .unwrap(); + let test_deltas2 = (0..N) + .map(|idx| { + ( + get_key(idx as u32 / 10), + Lsn(0x20 * ((idx as u64) % 10 + 1) + 0x10), + Value::Image(Bytes::from(format!("img{idx:05}"))), + ) + }) + .collect_vec(); + let resident_layer_2 = produce_delta_layer(&tenant, 
&tline, test_deltas2.clone(), &ctx) + .await + .unwrap(); + let test_deltas3 = (0..N) + .map(|idx| { + ( + get_key(idx as u32 / 10 + N as u32), + Lsn(0x10 * ((idx as u64) % 10 + 1)), + Value::Image(Bytes::from(format!("img{idx:05}"))), + ) + }) + .collect_vec(); + let resident_layer_3 = produce_delta_layer(&tenant, &tline, test_deltas3.clone(), &ctx) + .await + .unwrap(); + let mut merge_iter = MergeIterator::create( + &[ + resident_layer_1.get_as_delta(&ctx).await.unwrap(), + resident_layer_2.get_as_delta(&ctx).await.unwrap(), + resident_layer_3.get_as_delta(&ctx).await.unwrap(), + ], + &[], + &ctx, + ); + let mut expect = Vec::new(); + expect.extend(test_deltas1); + expect.extend(test_deltas2); + expect.extend(test_deltas3); + expect.sort_by(sort_delta); + assert_merge_iter_equal(&mut merge_iter, &expect).await; + + // TODO: test layers are loaded only when needed, reducing num of active iterators in k-merge + } + + // TODO: image layer merge, delta+image mixed merge + // TODO: is it possible to have duplicated delta at same LSN now? we might need to test that +} From 24f8133e890f6c44089291f4211171a3d4428738 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 10 Jul 2024 19:38:14 +0100 Subject: [PATCH 1166/1571] safekeeper: add eviction_min_resident to stop evictions thrashing (#8335) ## Problem - The condition for eviction is not time-based: it is possible for a timeline to be restored in response to a client, that client times out, and then as soon as the timeline is restored it is immediately evicted again. - There is no delay on eviction at startup of the safekeeper, so when it starts up and sees many idle timelines, it does many evictions which will likely be immediately restored when someone uses the timeline. ## Summary of changes - Add `eviction_min_resident` parameter, and use it in `ready_for_eviction` to avoid evictions if the timeline has been resident for less than this period. - This also implicitly delays evictions at startup for `eviction_min_resident` - Set this to a very low number for the existing eviction test, which expects immediate eviction. The default period is 15 minutes. The general reasoning for that is that in the worst case where we thrash ~10k timelines on one safekeeper, downloading 16MB for each one, we should set a period that would not overwhelm the node's bandwidth. 
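As a rough sanity check of that reasoning (using the figures assumed above, not measurements): 10,000 timelines × 16 MiB is about 156 GiB to re-download per thrash cycle, which over the 15-minute default works out to roughly 178 MiB/s of sustained download bandwidth. The required bandwidth scales inversely with `eviction_min_resident`, so a much shorter default would risk saturating the node.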
--- safekeeper/src/bin/safekeeper.rs | 11 ++++++++-- safekeeper/src/lib.rs | 7 +++++++ safekeeper/src/timeline_eviction.rs | 4 ++++ safekeeper/src/timeline_manager.rs | 5 +++++ .../tests/walproposer_sim/safekeeper.rs | 1 + test_runner/fixtures/neon_fixtures.py | 21 +++++++++++++++++-- test_runner/regress/test_wal_acceptor.py | 21 ++++++++++--------- 7 files changed, 56 insertions(+), 14 deletions(-) diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 4d580e57ed..9eb6546d6b 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -27,8 +27,8 @@ use utils::pid_file; use metrics::set_build_info_metric; use safekeeper::defaults::{ - DEFAULT_CONTROL_FILE_SAVE_INTERVAL, DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, - DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_PARTIAL_BACKUP_CONCURRENCY, + DEFAULT_CONTROL_FILE_SAVE_INTERVAL, DEFAULT_EVICTION_MIN_RESIDENT, DEFAULT_HEARTBEAT_TIMEOUT, + DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_PARTIAL_BACKUP_CONCURRENCY, DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, }; use safekeeper::http; @@ -194,6 +194,12 @@ struct Args { /// Number of allowed concurrent uploads of partial segments to remote storage. #[arg(long, default_value = DEFAULT_PARTIAL_BACKUP_CONCURRENCY)] partial_backup_concurrency: usize, + /// How long a timeline must be resident before it is eligible for eviction. + /// Usually, timeline eviction has to wait for `partial_backup_timeout` before being eligible for eviction, + /// but if a timeline is un-evicted and then _not_ written to, it would immediately flap to evicting again, + /// if it weren't for `eviction_min_resident` preventing that. + #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_EVICTION_MIN_RESIDENT)] + eviction_min_resident: Duration, } // Like PathBufValueParser, but allows empty string. @@ -348,6 +354,7 @@ async fn main() -> anyhow::Result<()> { delete_offloaded_wal: args.delete_offloaded_wal, control_file_save_interval: args.control_file_save_interval, partial_backup_concurrency: args.partial_backup_concurrency, + eviction_min_resident: args.eviction_min_resident, }; // initialize sentry if SENTRY_DSN is provided diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 5cd676d857..af83feb77f 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -53,6 +53,11 @@ pub mod defaults { pub const DEFAULT_PARTIAL_BACKUP_TIMEOUT: &str = "15m"; pub const DEFAULT_CONTROL_FILE_SAVE_INTERVAL: &str = "300s"; pub const DEFAULT_PARTIAL_BACKUP_CONCURRENCY: &str = "5"; + + // By default, our required residency before eviction is the same as the period that passes + // before uploading a partial segment, so that in normal operation the eviction can happen + // as soon as we have done the partial segment upload. 
+ pub const DEFAULT_EVICTION_MIN_RESIDENT: &str = DEFAULT_PARTIAL_BACKUP_TIMEOUT; } #[derive(Debug, Clone)] @@ -93,6 +98,7 @@ pub struct SafeKeeperConf { pub delete_offloaded_wal: bool, pub control_file_save_interval: Duration, pub partial_backup_concurrency: usize, + pub eviction_min_resident: Duration, } impl SafeKeeperConf { @@ -136,6 +142,7 @@ impl SafeKeeperConf { delete_offloaded_wal: false, control_file_save_interval: Duration::from_secs(1), partial_backup_concurrency: 1, + eviction_min_resident: Duration::ZERO, } } } diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs index b303d41b7b..e4ab65290d 100644 --- a/safekeeper/src/timeline_eviction.rs +++ b/safekeeper/src/timeline_eviction.rs @@ -5,6 +5,7 @@ use anyhow::Context; use camino::Utf8PathBuf; use remote_storage::RemotePath; +use std::time::Instant; use tokio::{ fs::File, io::{AsyncRead, AsyncWriteExt}, @@ -48,6 +49,7 @@ impl Manager { .flush_lsn .segment_number(self.wal_seg_size) == self.last_removed_segno + 1 + && self.resident_since.elapsed() >= self.conf.eviction_min_resident } /// Evict the timeline to remote storage. @@ -91,6 +93,8 @@ impl Manager { return; } + self.resident_since = Instant::now(); + info!("successfully restored evicted timeline"); } } diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index 62142162de..debf8c824f 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -186,6 +186,10 @@ pub(crate) struct Manager { // misc pub(crate) access_service: AccessService, pub(crate) partial_backup_rate_limiter: RateLimiter, + + // Anti-flapping state: we evict timelines eagerly if they are inactive, but should not + // evict them if they go inactive very soon after being restored. 
+ pub(crate) resident_since: std::time::Instant, } /// This task gets spawned alongside each timeline and is responsible for managing the timeline's @@ -350,6 +354,7 @@ impl Manager { access_service: AccessService::new(manager_tx), tli, partial_backup_rate_limiter, + resident_since: std::time::Instant::now(), } } diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index 6bbf96d71d..0c6d97ddfa 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -188,6 +188,7 @@ pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { delete_offloaded_wal: false, control_file_save_interval: Duration::from_secs(1), partial_backup_concurrency: 1, + eviction_min_resident: Duration::ZERO, }; let mut global = GlobalMap::new(disk, conf.clone())?; diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index cae2e422c1..5ca31644a9 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -492,6 +492,7 @@ class NeonEnvBuilder: pageserver_virtual_file_io_engine: Optional[str] = None, pageserver_aux_file_policy: Optional[AuxFileStore] = None, pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]] = None, + safekeeper_extra_opts: Optional[list[str]] = None, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override @@ -557,6 +558,8 @@ class NeonEnvBuilder: self.pageserver_aux_file_policy = pageserver_aux_file_policy + self.safekeeper_extra_opts = safekeeper_extra_opts + assert test_name.startswith( "test_" ), "Unexpectedly instantiated from outside a test function" @@ -1193,7 +1196,9 @@ class NeonEnv: sk_cfg[ "remote_storage" ] = self.safekeepers_remote_storage.to_toml_inline_table().strip() - self.safekeepers.append(Safekeeper(env=self, id=id, port=port)) + self.safekeepers.append( + Safekeeper(env=self, id=id, port=port, extra_opts=config.safekeeper_extra_opts) + ) cfg["safekeepers"].append(sk_cfg) log.info(f"Config: {cfg}") @@ -4016,16 +4021,28 @@ class Safekeeper(LogUtils): id: int running: bool = False - def __init__(self, env: NeonEnv, port: SafekeeperPort, id: int, running: bool = False): + def __init__( + self, + env: NeonEnv, + port: SafekeeperPort, + id: int, + running: bool = False, + extra_opts: Optional[List[str]] = None, + ): self.env = env self.port = port self.id = id self.running = running self.logfile = Path(self.data_dir) / f"safekeeper-{id}.log" + self.extra_opts = extra_opts def start( self, extra_opts: Optional[List[str]] = None, timeout_in_seconds: Optional[int] = None ) -> "Safekeeper": + if extra_opts is None: + # Apply either the extra_opts passed in, or the ones from our constructor: we do not merge the two. 
+ extra_opts = self.extra_opts + assert self.running is False self.env.neon_cli.safekeeper_start( self.id, extra_opts=extra_opts, timeout_in_seconds=timeout_in_seconds diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index febfc10293..7efd86e349 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -2191,24 +2191,25 @@ def test_s3_eviction( ): neon_env_builder.num_safekeepers = 3 neon_env_builder.enable_safekeeper_remote_storage(RemoteStorageKind.LOCAL_FS) - env = neon_env_builder.init_start( - initial_tenant_conf={ - "checkpoint_timeout": "100ms", - } - ) - extra_opts = [ + neon_env_builder.safekeeper_extra_opts = [ "--enable-offload", "--partial-backup-timeout", "50ms", "--control-file-save-interval", "1s", + # Safekeepers usually wait a while before evicting something: for this test we want them to + # evict things as soon as they are inactive. + "--eviction-min-resident=100ms", ] if delete_offloaded_wal: - extra_opts.append("--delete-offloaded-wal") + neon_env_builder.safekeeper_extra_opts.append("--delete-offloaded-wal") - for sk in env.safekeepers: - sk.stop().start(extra_opts=extra_opts) + env = neon_env_builder.init_start( + initial_tenant_conf={ + "checkpoint_timeout": "100ms", + } + ) n_timelines = 5 @@ -2263,7 +2264,7 @@ def test_s3_eviction( # restarting random safekeepers for sk in env.safekeepers: if random.random() < restart_chance: - sk.stop().start(extra_opts=extra_opts) + sk.stop().start() time.sleep(0.5) # require at least one successful eviction in at least one safekeeper From 6bbd34a216accdea3c6a3bd30df8ab28386afdde Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Thu, 11 Jul 2024 10:20:14 +0300 Subject: [PATCH 1167/1571] Enable core dumps for postgres (#8272) Set core rmilit to ulimited in compute_ctl, so that all child processes inherit it. We could also set rlimit in relevant startup script, but that way we would depend on external setup and might inadvertently disable it again (core dumping worked in pods, but not in VMs with inittab-based startup). 
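Concretely, this is a single call early in compute_ctl's `main`, as the diff below shows; because rlimits are inherited across fork/exec, the postgres processes started by compute_ctl pick up the raised limit as well:

```rust
use rlimit::{setrlimit, Resource};

// Raise the soft and hard RLIMIT_CORE to unlimited for this process;
// all child processes inherit the limit.
setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?;
```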
--- Cargo.lock | 10 ++++++++++ compute_tools/Cargo.toml | 1 + compute_tools/src/bin/compute_ctl.rs | 4 ++++ compute_tools/src/compute.rs | 2 +- 4 files changed, 16 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index 776d95c3c7..9fb3f5385d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1236,6 +1236,7 @@ dependencies = [ "regex", "remote_storage", "reqwest 0.12.4", + "rlimit", "rust-ini", "serde", "serde_json", @@ -4901,6 +4902,15 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "rlimit" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3560f70f30a0f16d11d01ed078a07740fe6b489667abc7c7b029155d9f21c3d8" +dependencies = [ + "libc", +] + [[package]] name = "routerify" version = "3.0.0" diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 8f96530a9d..8ceb8f2ad2 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -44,3 +44,4 @@ vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" } zstd = "0.13" bytes = "1.0" rust-ini = "0.20.0" +rlimit = "0.10.1" diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 7bf5db5a57..f4c396a85d 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -64,6 +64,7 @@ use compute_tools::monitor::launch_monitor; use compute_tools::params::*; use compute_tools::spec::*; use compute_tools::swap::resize_swap; +use rlimit::{setrlimit, Resource}; // this is an arbitrary build tag. Fine as a default / for testing purposes // in-case of not-set environment var @@ -72,6 +73,9 @@ const BUILD_TAG_DEFAULT: &str = "latest"; fn main() -> Result<()> { let (build_tag, clap_args) = init()?; + // enable core dumping for all child processes + setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?; + let (pg_handle, start_pg_result) = { // Enter startup tracing context let _startup_context_guard = startup_context_from_env(); diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index eced6fc0b2..1112795d30 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -1116,7 +1116,7 @@ impl ComputeNode { // EKS worker nodes have following core dump settings: // /proc/sys/kernel/core_pattern -> core // /proc/sys/kernel/core_uses_pid -> 1 - // ulimint -c -> unlimited + // ulimit -c -> unlimited // which results in core dumps being written to postgres data directory as core.. // // Use that as a default location and pattern, except macos where core dumps are written From 69b6675da04ff81f6e2bfe5071e414cc2831e3ed Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 11 Jul 2024 08:23:51 +0100 Subject: [PATCH 1168/1571] rfcs: add RFC for timeline archival (#8221) A design for a cheap low-resource state for idle timelines: - #8088 --- docs/rfcs/034-timeline-archive.md | 507 ++++++++++++++++++++++++++++++ 1 file changed, 507 insertions(+) create mode 100644 docs/rfcs/034-timeline-archive.md diff --git a/docs/rfcs/034-timeline-archive.md b/docs/rfcs/034-timeline-archive.md new file mode 100644 index 0000000000..c834216962 --- /dev/null +++ b/docs/rfcs/034-timeline-archive.md @@ -0,0 +1,507 @@ +# Timeline Archival + +## Summary + +This RFC describes a mechanism for pageservers to eliminate local storage + compute work +for timelines which are not in use, in response to external API calls to "archive" a timeline. 
+ +The archived state roughly corresponds to fully offloading a timeline to object storage, such +that its cost is purely the cost of that object storage. + +## Motivation + +Archived timelines serve multiple purposes: +- Act as a 'snapshot' for workloads that would like to retain restorable copies of their + database from longer ago than their PITR window. +- Enable users to create huge numbers of branches (e.g. one per github PR) without having + to diligently clean them up later to avoid overloading the pageserver (currently we support + up to ~500 branches per tenant). + +### Prior art + +Most storage and database systems have some form of snapshot, which can be implemented several ways: +1. full copies of data (e.g. an EBS snapshot to S3) +2. shallow snapshots which are CoW relative to the original version of the data, e.g. on a typical NFS appliance, or a filesystem like CephFS. +3. a series of snapshots which are CoW or de-duplicated relative to one another. + +Today's Neon branches are approximately like `2.`, although due to implementation details branches +often end up storing much more data than they really need, as parent branches assume that all data +at the branch point is needed. The layers pinned in the parent branch may have a much larger size +than the physical size of a compressed image layer representing the data at the branch point. + +## Requirements + +- Enter & exit the archived state in response to external admin API calls +- API calls to modify the archived state are atomic and durable +- An archived timeline should eventually (once out of PITR window) use an efficient compressed + representation, and avoid retaining arbitrarily large data in its parent branch. +- Remote object GETs during tenant start may be O(N) with the number of _active_ branches, + but must not scale with the number of _archived_ branches. +- Background I/O for archived branches should only be done a limited number of times to evolve them + to a long-term-efficient state (e.g. rewriting to image layers). There should be no ongoing "housekeeping" + overhead for archived branches, including operations related to calculating sizes for billing. +- The pageserver should put no load on the safekeeper for archived branches. +- Performance of un-archiving a branch must make good use of S3/disk bandwidth to restore the branch + to a performant state in a short time (linear with the branch's logical size) + +## Non Goals + +- Archived branches are not a literal `fullbackup` postgres snapshot: they are still stored + in Neon's internal format. +- Compute cold starts after activating an archived branch will not have comparable performance to + cold starts on an active branch. +- Archived branches will not use any new/additional compression or de-duplication beyond what + is already implemented for image layers (zstd per page). +- The pageserver will not "auto start" archived branches in response to page_service API requests: they + are only activated explicitly via the HTTP API. +- We will not implement a total offload of archived timelines from safekeepers: their control file (small) will + remain on local disk, although existing eviction mechanisms will remove any segments from local disk. +- We will not expose any prometheus metrics for archived timelines, or make them visible in any + detailed HTTP APIs other than the specific API for listing archived timelines. +- A parent branch may not be archived unless all its children are. 
+ +## Impacted Components + +pageserver, storage controller + +## Terminology + +**Archived**: a branch is _archived_ when an HTTP API request to archive it has succeeded: the caller +may assume that this branch is now very cheap to store, although this may not be physically so until the +branch proceeds to the offloaded state. + +**Active** branches are branches which are available for use by page_service clients, and have a relatively +high cost due to consuming local storage. + +**Offloaded** branches are a subset of _archived_ branches, which have had their local state removed such +that they now consume minimal runtime resources and have a cost similar to the cost of object storage. + +**Activate** (verb): transition from Archived to Active + +**Archive** (verb): transition from Active to Archived + +**Offload** (verb): transition from Archived to Offloaded + +**Offload manifest**: an object stored in S3 that describes timelines which pageservers do not load. + +**Warm up** (verb): operation done on an active branch, by downloading its active layers. Once a branch is +warmed up, good performance will be available to page_service clients. + +## Implementation + +### High level flow + +We may think of a timeline which is archived and then activated as proceeding through a series of states: + +```mermaid +stateDiagram + [*] --> Active(warm) + Active(warm) --> Archived + Archived --> Offloaded + Archived --> Active(warm) + Offloaded --> Active(cold) + Active(cold) --> Active(warm) +``` + +Note that the transition from Archived to Active(warm) is expected to be fairly rare: the most common lifecycles +of branches will be: +- Very frequent: Short lived branches: Active -> Deleted +- Frequent: Long-lived branches: Active -> Archived -> Offloaded -> Deleted +- Rare: Branches used to restore old state: Active ->Archived -> Offloaded -> Active + +These states are _not_ all stored as a single physical state on the timeline, but rather represent the combination +of: +- the timeline's lifecycle state: active or archived, stored in the timeline's index +- its offload state: whether pageserver has chosen to drop local storage of the timeline and write it into the + manifest of offloaded timelines. +- cache state (whether it's warm or cold). + +### Storage format changes + +There are two storage format changes: +1. `index_part.json` gets a new attribute `state` that describes whether the timeline is to + be considered active or archived. +2. A new tenant-level _manifest_ object `tenant_manifest-v1.json` describes which timelines a tenant does not need to load + at startup (and is available for storing other small, rarely changing tenant-wide attributes in future) + +The manifest object will have a format like this: +``` +{ + "offload_timelines": [ + { + "timeline_id": ... + "last_record_lsn": ... + "last_record_lsn_time": ... + "pitr_interval": ... + "last_gc_lsn": ... # equal to last_record_lsn if this branch has no history (i.e. a snapshot) + "logical_size": ... # The size at last_record_lsn + "physical_size" ... + "parent": Option<{ + "timeline_id"... + "lsn"... 
# Branch point LSN on the parent + "requires_data": bool # True if this branch depends on layers in its parent, identify it here + + }> + } + ] +} +``` + +The information about a timeline in its offload state is intentionally minimal: just enough to decide: +- Whether it requires [archive optimization](#archive-branch-optimization) by rewriting as a set of image layers: we may infer this + by checking if now > last_record_lsn_time - pitr_interval, and pitr_lsn < last_record_lsn. +- Whether a parent branch should include this offloaded branch in its GC inputs to avoid removing + layers that the archived branch depends on +- Whether requests to delete this `timeline_id` should be executed (i.e. if a deletion request + is received for a timeline_id that isn't in the site of live `Timelines` or in the manifest, then + we don't need to go to S3 for the deletion. +- How much archived space to report in consumption metrics + +The contents of the manifest's offload list will also be stored as an attribute of `Tenant`, such that the total +set of timelines may be found by the union of `Tenant::timelines` (non-offloaded timelines) and `Tenant::offloaded` +(offloaded timelines). + +For split-brain protection, the manifest object will be written with a generation suffix, in the same way as +index_part objects are (see [generation numbers RFC](025-generation-numbers.md)). This will add some complexity, but +give us total safety against two pageservers with the same tenant attached fighting over the object. Existing code +for finding the latest generation and for cleaning up old generations (in the scrubber) will be generalized to cover +the manifest file. + +### API & Timeline state + +Timelines will store a lifecycle state (enum of Active or Archived) in their IndexPart. This will +be controlled by a new per-timeline `configure` endpoint. This is intentionally generic naming, which +may be used in future to control other per-timeline attributes (e.g. in future we may make PITR interval +a per-timeline configuration). + +`PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configure` +``` +{ + 'state': 'active|archive' +} +``` + +When archiving a timeline, this API will complete as soon as the timeline's state has been set in index_part, and that index has been uploaded. + +When activating a timeline, this API will complete as soon as the timeline's state has been set in index_part, +**and** the `Timeline` object has been instantiated and activated. This will require reading the timeline's +index, but not any data: it should be about as fast as a couple of small S3 requests. + +The API will be available with identical path via the storage controller: calling this on a sharded tenant +will simply map the API call to all the shards. + +Archived timelines may never have descendent timelines which are active. This will be enforced at the API level, +such that activating a timeline requires that all its ancestors are active, and archiving a timeline requires +that all its descendents are archived. It is the callers responsibility to walk the hierarchy of timelines +in the proper order if they would like to archive whole trees of branches. + +Because archive timelines will be excluded from the usual timeline listing APIs, a new API specifically +for archived timelines will be added: this is for use in support/debug: + +``` +GET /v1/tenants/{tenant_id}/archived_timelines + +{ + ...same per-timeline content as the tenant manifest... 
+} + +``` + +### Tenant attach changes + +Currently, during Tenant::spawn we list all the timelines in the S3 bucket, and then for each timeline +we load their index_part.json. To avoid the number of GETs scaling linearly with the number of archived +timelines, we must have a single object that tells us which timelines do not need to be loaded. The +number of ListObjects requests while listing timelines will still scale O(N), but this is less problematic +because each request covers 1000 timelines. + +This is **not** literally the same as the set of timelines who have state=archived. Rather, it is +the set of timelines which have been offloaded in the background after their state was set to archived. + +We may simply skip loading these timelines: there will be no special state of `Timeline`, they just won't +exist from the perspective of an active `Tenant` apart from in deletion: timeline deletion will need +to check for offloaded timelines as well as active timelines, to avoid wrongly returning 404 on trying +to delete an offloaded timeline. + +### Warm-up API + +`PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/download?wait_ms=1234` + +This API will be similar to the existing `download_remote_layers` API, but smarter: +- It will not download _all_ remote layers, just the visible set (i.e. layers needed for a read) +- It will download layers in the visible set until reaching `wait_ms`, then return a struct describing progress + of downloads, so that the caller can poll. + +The _visible set_ mentioned above will be calculated by the pageserver in the background, by taking the set +of readable LSNs (i.e. branch points and heads of branches), and walking the layer map to work out which layers +can possibly be read from these LSNs. This concept of layer visibility is more generally useful for cache +eviction and heatmaps, as well as in this specific case of warming up a timeline. + +The caller does not have to wait for the warm up API, or call it at all. But it is strongly advised +to call it, because otherwise populating local contents for a timeline can take a long time when waiting +for SQL queries to coincidentally hit all the layers, and during that time query latency remains quite +volatile. + +### Background work + +Archived branches are not subject to normal compaction. Instead, when the compaction loop encounters +an archived branch, it will consider rewriting the branch to just image layers if the branch has no history +([archive branch optimization](#archive-branch-optimization)), or offloading the timeline from local disk +if its state permits that. + +Additionally, the tenant compaction task will walk the state of already offloaded timelines to consider +optimizing their storage, e.g. if a timeline had some history when offloaded, but since then its PITR +has elapsed and it can now be rewritten to image layers. + +#### Archive branch offload + +Recall that when we archive a timeline via the HTTP API, this only sets a state: it doesn't do +any actual work. + +This work is done in the background compaction loop. It makes sense to tag this work on to the compaction +loop, because it is spiritually aligned: offloading data for archived branches improves storage efficiency. + +The condition for offload is simple: + - a `Timeline` object exists with state `Archived` + - the timeline does not have any non-offloaded children. 
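+
+In pseudo-Rust, the check made by the compaction loop is roughly the following (a sketch with
+stand-in types for illustration, not the actual pageserver structs):
+
+```rust
+#[derive(PartialEq)]
+enum LifecycleState {
+    Active,
+    Archived,
+}
+
+struct TimelineInfo {
+    state: LifecycleState,
+    /// Number of child timelines that still have local, non-offloaded state.
+    non_offloaded_children: usize,
+}
+
+/// The offload condition described above: archived, and every child already offloaded.
+fn ready_to_offload(t: &TimelineInfo) -> bool {
+    t.state == LifecycleState::Archived && t.non_offloaded_children == 0
+}
+```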
+ + Regarding the condition that children must be offloaded, this will always be eventually true, because + we enforce at the API level that children of archived timelines must themselves be archived, and all + archived timelines will eventually be offloaded. + +Offloading a timeline is simple: +- Read the timeline's attributes that we will store in its offloaded state (especially its logical size) +- Call `shutdown()` on the timeline and remove it from the `Tenant` (as if we were about to delete it) +- Erase all the timeline's content from local storage (`remove_dir_all` on its path) +- Write the tenant manifest to S3 to prevent this timeline being loaded on next start. + +#### Archive branch optimization (flattening) + +When we offloaded a branch, it might have had some history that prevented rewriting it to a single +point in time set of image layers. For example, a branch might have several days of writes and a 7 +day PITR: when we archive it, it still has those days of history. + +Once the PITR has expired, we have an opportunity to reduce the physical footprint of the branch by: +- Writing compressed image layers within the archived branch, as these are more efficient as a way of storing + a point in time compared with delta layers +- Updating the branch's offload metadata to indicate that this branch no longer depends on its ancestor + for data, i.e. the ancestor is free to GC layers files at+below the branch point + +Fully compacting an archived branch into image layers at a single LSN may be thought of as *flattening* the +branch, such that it is now a one-dimensional keyspace rather than a two-dimensional key/lsn space. It becomes +a true snapshot at that LSN. + +It is not always more efficient to flatten a branch than to keep some extra history on the parent: this +is described in more detail in [optimizations](#delaying-storage-optimization-if-retaining-parent-layers-is-cheaper) + +Archive branch optimization should be done _before_ background offloads during compaction, because there may +be timelines which are ready to be offloaded but also would benefit from the optimization step before +being offloaded. For example, a branch which has already fallen out of PITR window and has no history +of its own may be immediately re-written as a series of image layers before being offloaded. + +### Consumption metrics + +Archived timelines and offloaded timelines will be excluded from the synthetic size calculation, in anticipating +that billing structures based on consumption metrics are highly likely to apply different $/GB rates to archived +vs. ordinary content. + +Archived and offloaded timelines' logical size will be reported under the existing `timeline_logical_size` +variant of `MetricsKey`: receivers are then free to bill on this metric as they please. + +### Secondary locations + +Archived timelines (including offloaded timelines) will be excluded from heatmaps, and thereby +when a timeline is archived, after the next cycle of heatmap upload & secondary download, its contents +will be dropped from secondary locations. + +### Sharding + +Archiving or activating a timeline will be done symmetrically across all shards in a tenant, in +the same way that timeline creation and deletion is done. There are no special rules about ordering: +the storage controller may dispatch concurrent calls to all shards when archiving or activating a timeline. 
+ +Since consumption metrics are only transmitted from shard zero, the state of archival on this shard +will be authoritative for consumption metrics. + +## Error cases + +### Errors in sharded tenants + +If one shard in a tenant fails an operation but others succeed, the tenant may end up in a mixed +state, where a timeline is archived on some shards but not on others. + +We will not bother implementing a rollback mechanism for this: errors in archiving/activating a timeline +are either transient (e.g. S3 unavailable, shutting down), or the fault of the caller (NotFound, BadRequest). +In the transient case callers are expected to retry until success, or to make appropriate API calls to clear +up their mistake. We rely on this good behavior of callers to eventually get timelines into a consistent +state across all shards. If callers do leave a timeline in an inconsistent state across shards, this doesn't +break anything, it's just "weird". + +This is similar to the status quo for timeline creation and deletion: callers are expected to retry +these operations until they succeed. + +### Archiving/activating + +Archiving/activating a timeline can fail in a limited number of ways: +1. I/O error storing/reading the timeline's updated index + - These errors are always retryable: a fundamental design assumption of the pageserver is that remote + storage errors are always transient. +2. NotFound if the timeline doesn't exist + - Callers of the API are expected to avoid calling deletion and archival APIs concurrently. + - The storage controller has runtime locking to prevent races such as deleting a timeline while + archiving it. +3. BadRequest if the rules around ancestors/descendents of archived timelines would be violated + - Callers are expected to do their own checks to avoid hitting this case. If they make + a mistake and encounter this error, they should give up. + +### Offloading + +Offloading can only fail if remote storage is unavailable, which would prevent us from writing the +tenant manifest. In such error cases, we give up in the expectation that offloading will be tried +again at the next iteration of the compaction loop. + +### Archive branch optimization + +Optimization is a special form of compaction, so can encounter all the same errors as regular compaction +can: it should return Result<(), CompactionError>, and as with compaction it will be retried on +the next iteration of the compaction loop. + +## Optimizations + +### Delaying storage optimization if retaining parent layers is cheaper + +Optimizing archived branches to image layers and thereby enabling parent branch GC to progress +is a safe default: archived branches cannot over-fill a pageserver's local disk, and once they +are offloaded to S3 they're totally safe, inert things. + +However, in some cases it can be advantageous to retain extra history on their parent branch rather +than flattening the archived branch. For example, if a 1TB parent branch is rather slow-changing (1GB +of data per day), and archive branches are being created nightly, then writing out full 1TB image layers +for each nightly branch is inefficient compared with just keeping more history on the main branch. + +Getting this right requires consideration of: +- Compaction: if keeping more history on the main branch is going to prompt the main branch's compaction to + write out extra image layers, then it might make more sense to just write out the image layers on + the archived branch. 
+- Metadata bloat: keeping extra history on a parent branch doesn't just cost GB of storage, it makes + the layer map (and index_part) bigger. There are practical limits beyond which writing an indefinitely + large layer map can cause problems elsewhere. + +This optimization can probably be implemented quite cheaply with some basic heuristics like: +- don't bother doing optimization on an archive branch if the LSN distance between + its branch point and the end of the PITR window is <5% of the logical size of the archive branch. +- ...but, Don't keep more history on the main branch than double the PITR + +### Creating a timeline in archived state (a snapshot) + +Sometimes, one might want to create a branch with no history, which will not be written to +before it is archived. This is a snapshot, although we do not require a special snapshot API, +since a snapshot can be represented as a timeline with no history. + +This can be accomplished by simply creating a timeline and then immediately archiving it, but +that is somewhat wasteful: this timeline it will spin up various tasks and open a connection to the storage +broker to try and ingest WAL, before being shutdown in the subsequent archival call. To explicitly +support this common special case, we may add a parameter to the timeline creation API which +creates a timeline directly into the archived state. + +Such a timeline creation will do exactly two I/Os at creation time: +- write the index_part object to record the timeline's existence +- when the timeline is offloaded in the next iteration of the compaction loop (~20s later), + write the tenant manifest. + +Later, when the timeline falls off the end of the PITR interval, the usual offload logic will wake +up the 'snapshot' branch and write out image layers. + +## Future Work + +### Enabling `fullbackup` dumps from archive branches + +It would be useful to be able to export an archive branch to another system, or for use in a local +postgres database. + +This could be implemented as a general capability for all branches, in which case it would "just work" +for archive branches by activating them. However, downloading all the layers in a branch just to generate +a fullbackup is a bit inefficient: we could implement a special case for flattened archived branches +which streams image layers from S3 and outputs the fullbackup stream without writing the layers out to disk. + +Implementing `fullbackup` is a bit more complicated than this because of sharding, but solving that problem +is unrelated to the topic of archived branches (it probably involves having each shard write out a fullbackup +stream to S3 in an intermediate format and, then having one node stitch them together). + +### Tagging layers from archived branches + +When we know a layer is an image layer written for an archived branch that has fallen off the PITR window, +we may add tags to the S3 objects to enable writing lifecycle policies that transition such layers to even +cheaper storage. + +This could be done for all archived layers, or it could be driven by the archival API, to give the pageserver +external hints on which branches are likely to be reactivated, and which branches are good candidates for +tagging for low performance storage. + +Tagging+lifecycles is just one mechanism: one might also directly use S3 storage classes. Other clouds' object +stores have similar mechanisms. 
+ +### Storing sequences of archive branches as deltas + +When archived branches are used as scheduled snapshots, we could store them even more efficiently +by encoding them as deltas relative to each other (i.e. for nightly snapshots, when we do the +storage optimization for Tuesday's snapshot, we would read Monday's snapshot and store only the modified +pages). This is the kind of encoding that many backup storage systems use. + +The utility of this depends a lot on the churn rate of the data, and the cost of doing the delta encoding +vs. just writing out a simple stream of the entire database. For smaller databases, writing out a full +copy is pretty trivial (e.g. writing a compressed copy of a 10GiB database to S3 can take under 10 seconds, +so the complexity tradeoff of diff-encoding it is dubious). + +One does not necessarily have to read-back the previous snapshot in order to encoded the next one: if the +pageserver knows about the schedule, it can intentionally retain extra history on the main branch so that +we can say: "A branch exists from Monday night. I have Monday night's data still active in the main branch, +so now I can read at the Monday LSN and the Tuesday LSN, calculate the delta, and store it as Tuesday's +delta snapshot". + +Clearly this all requires careful housekeeping to retain the relationship between branches that depend on +each other: perhaps this would be done by making the archive branches have child/parent relationships with +each other, or perhaps we would permit them to remain children of their original parent, but additionally +have a relationship with the snapshot they're encoded relative to. + +Activating a branch that is diff-encoded may require activating several earlier branches too, so figuring +out how frequently to write a full copy is important. This is essentially a zoomed-out version of what +we do with delta layers and image layers within a timeline, except each "layer" is a whole timeline. + + +## FAQ/Alternatives + +### Store all timelines in the tenant manifest + +Rather than special-casing offloaded timelines in the offload manifest, we could store a total +manifest of all timelines, eliminating the need for the pageserver to list timelines in S3 on +startup. + +That would be a more invasive change (require hooking in to timeline creation), and would +generate much more I/O to this manifest for tenants that had many branches _and_ frequent +create/delete cycles for short lived branches. Restricting the manifest to offloaded timelines +means that we only have to cope with the rate at which long-lived timelines are archived, rather +than the rate at which sort lived timelines are created & destroyed. + +### Automatically archiving/activating timelines without external API calls + +We could implement TTL driven offload of timelines, waking them up when a page request +arrives. + +This has downsides: +- Opacity: if we do TTL-driven offload inside the pageserver, then the end user doesn't + know which of their branches are in this state, and might get a surprise when they try + to use such a branch. +- Price fluctuation: if the archival of a branch is used in end user pricing, then users + prefer clarity & consistency. Ideally a branch's storage should cost the same from the moment it + is created, rather than having a usage-dependency storage price. +- Complexity: enabling the page service to call up into the Tenant to activate a timeline + would be awkward, compared with an external entry point. 
+ +### Make offloaded a state of Timeline + +To reduce the operator-facing complexity of having some timelines APIs that only return +non-offloaded timelines, we could build the offloaded state into the Timeline type. + +`timeline.rs` is already one of the most egregiously long source files in the tree, so +this is rejected on the basis that we need to avoid making that complexity worse. \ No newline at end of file From c11b9cb43dfccffd2ce0c48a31119d29ecd28b0f Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Thu, 11 Jul 2024 11:07:12 +0200 Subject: [PATCH 1169/1571] Run Performance bench on more platforms (#8312) ## Problem https://github.com/neondatabase/cloud/issues/14721 ## Summary of changes add one more platform to benchmarking job https://github.com/neondatabase/neon/blob/57535c039c938f7c179693d9db8b052912019823/.github/workflows/benchmarking.yml#L57C3-L126 Run with pg 16, provisioner k8-neonvm by default on the new platform. Adjust some test cases to - not depend on database client <-> database server latency by pushing loops into server side pl/pgSQL functions - increase statement and test timeouts First successful run of these job steps https://github.com/neondatabase/neon/actions/runs/9869817756/job/27254280428 --- .github/workflows/benchmarking.yml | 21 +++++++++++++---- test_runner/performance/test_hot_page.py | 28 +++++++++++++++++------ test_runner/performance/test_hot_table.py | 21 +++++++++++++---- 3 files changed, 55 insertions(+), 15 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 899cae2b86..d038f64f15 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -56,15 +56,26 @@ concurrency: jobs: bench: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} + strategy: + matrix: + include: + - DEFAULT_PG_VERSION: 14 + PLATFORM: "neon-staging" + region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} + provisioner: 'k8s-pod' + - DEFAULT_PG_VERSION: 16 + PLATFORM: "azure-staging" + region_id: 'azure-eastus2' + provisioner: 'k8s-neonvm' env: TEST_PG_BENCH_DURATIONS_MATRIX: "300" TEST_PG_BENCH_SCALES_MATRIX: "10,100" POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: 14 + DEFAULT_PG_VERSION: ${{ matrix.DEFAULT_PG_VERSION }} TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} - PLATFORM: "neon-staging" + PLATFORM: ${{ matrix.PLATFORM }} runs-on: [ self-hosted, us-east-2, x64 ] container: @@ -85,9 +96,10 @@ jobs: id: create-neon-project uses: ./.github/actions/neon-project-create with: - region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} + region_id: ${{ matrix.region_id }} postgres_version: ${{ env.DEFAULT_PG_VERSION }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} + provisioner: ${{ matrix.provisioner }} - name: Run benchmark uses: ./.github/actions/run-python-test-set @@ -96,13 +108,14 @@ jobs: test_selection: performance run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} + pg_version: ${{ env.DEFAULT_PG_VERSION }} # Set --sparse-ordering option of pytest-order plugin # to ensure tests are running in order of appears in the file. 
# It's important for test_perf_pgbench.py::test_pgbench_remote_* tests extra_params: -m remote_cluster --sparse-ordering - --timeout 5400 + --timeout 14400 --ignore test_runner/performance/test_perf_olap.py --ignore test_runner/performance/test_perf_pgvector_queries.py --ignore test_runner/performance/test_logical_replication.py diff --git a/test_runner/performance/test_hot_page.py b/test_runner/performance/test_hot_page.py index d9785dd87e..5e97c7cddf 100644 --- a/test_runner/performance/test_hot_page.py +++ b/test_runner/performance/test_hot_page.py @@ -16,20 +16,34 @@ from pytest_lazyfixture import lazy_fixture ) def test_hot_page(env: PgCompare): # Update the same page many times, then measure read performance - num_writes = 1000000 with closing(env.pg.connect()) as conn: with conn.cursor() as cur: cur.execute("drop table if exists t, f;") + num_writes = 1000000 - # Write many updates to the same row + # Use a PL/pgSQL block to perform many updates to the same row + # without depending on the latency between database client and postgres + # server + # - however a single staement should not run into a timeout so we increase it + cur.execute("SET statement_timeout = '4h';") with env.record_duration("write"): - cur.execute("create table t (i integer);") - cur.execute("insert into t values (0);") - for i in range(num_writes): - cur.execute(f"update t set i = {i};") + cur.execute( + f""" + DO $$ + BEGIN + create table t (i integer); + insert into t values (0); - # Write 3-4 MB to evict t from compute cache + FOR j IN 1..{num_writes} LOOP + update t set i = j; + END LOOP; + END $$; + """ + ) + + # Write ca 350 MB to evict t from compute shared buffers (128 MB) + # however it will still be in LFC, so I do not really understand the point of this test cur.execute("create table f (i integer);") cur.execute("insert into f values (generate_series(1,100000));") diff --git a/test_runner/performance/test_hot_table.py b/test_runner/performance/test_hot_table.py index 5fcffc8afb..9a78c92ec0 100644 --- a/test_runner/performance/test_hot_table.py +++ b/test_runner/performance/test_hot_table.py @@ -16,8 +16,8 @@ from pytest_lazyfixture import lazy_fixture ) def test_hot_table(env: PgCompare): # Update a small table many times, then measure read performance - num_rows = 100000 # Slightly larger than shared buffers size TODO validate - num_writes = 1000000 + num_rows = 100000 # initial table size only about 4 MB + num_writes = 10000000 # write approximately 349 MB blocks > 128 MB shared_buffers num_reads = 10 with closing(env.pg.connect()) as conn: @@ -28,8 +28,21 @@ def test_hot_table(env: PgCompare): with env.record_duration("write"): cur.execute("create table t (i integer primary key);") cur.execute(f"insert into t values (generate_series(1,{num_rows}));") - for i in range(num_writes): - cur.execute(f"update t set i = {i + num_rows} WHERE i = {i};") + # PL/pgSQL block to perform updates (and avoid latency between client and server) + # - however a single staement should not run into a timeout so we increase it + cur.execute("SET statement_timeout = '4h';") + cur.execute( + f""" + DO $$ + DECLARE + r integer := {num_rows}; + BEGIN + FOR j IN 1..{num_writes} LOOP + UPDATE t SET i = j + r WHERE i = j; + END LOOP; + END $$; + """ + ) # Read the table with env.record_duration("read"): From e26ef640c1004306c7be192e7afece93f2f529c0 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 11 Jul 2024 15:17:07 +0200 Subject: [PATCH 1170/1571] pageserver: remove `trace_read_requests` (#8338) 
`trace_read_requests` is a per `Tenant`-object option. But the `handle_pagerequests` loop doesn't know which `Tenant` object (i.e., which shard) the request is for. The remaining use of the `Tenant` object is to check `tenant.cancel`. That check is incorrect [if the pageserver hosts multiple shards](https://github.com/neondatabase/neon/issues/7427#issuecomment-2220577518). I'll fix that in a future PR where I completely eliminate the holding of `Tenant/Timeline` objects across requests. See [my code RFC](https://github.com/neondatabase/neon/pull/8286) for the high level idea. Note that we can always bring the tracing functionality if we need it. But since it's actually about logging the `page_service` wire bytes, it should be a `page_service`-level config option, not per-Tenant. And for enabling tracing on a single connection, we can implement a `set pageserver_trace_connection;` option. --- Cargo.lock | 11 -- Cargo.toml | 1 - control_plane/src/pageserver.rs | 10 -- libs/pageserver_api/src/models.rs | 1 - libs/utils/src/id.rs | 11 -- pageserver/src/config.rs | 45 ----- pageserver/src/http/openapi_spec.yml | 2 - pageserver/src/lib.rs | 1 - pageserver/src/page_service.rs | 19 -- pageserver/src/tenant.rs | 8 - pageserver/src/tenant/config.rs | 10 -- pageserver/src/trace.rs | 36 ---- test_runner/fixtures/compare_fixtures.py | 2 - .../regress/test_attach_tenant_config.py | 1 - test_runner/regress/test_read_trace.py | 39 ---- trace/Cargo.toml | 13 -- trace/src/main.rs | 167 ------------------ 17 files changed, 377 deletions(-) delete mode 100644 pageserver/src/trace.rs delete mode 100644 test_runner/regress/test_read_trace.py delete mode 100644 trace/Cargo.toml delete mode 100644 trace/src/main.rs diff --git a/Cargo.lock b/Cargo.lock index 9fb3f5385d..4b1525edee 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6510,17 +6510,6 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" -[[package]] -name = "trace" -version = "0.1.0" -dependencies = [ - "anyhow", - "clap", - "pageserver_api", - "utils", - "workspace_hack", -] - [[package]] name = "tracing" version = "0.1.37" diff --git a/Cargo.toml b/Cargo.toml index fc3dd51809..6bad8e3b20 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,7 +15,6 @@ members = [ "storage_controller", "storage_scrubber", "workspace_hack", - "trace", "libs/compute_api", "libs/pageserver_api", "libs/postgres_ffi", diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index f0403b1796..5f2373e95a 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -349,11 +349,6 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?, - trace_read_requests: settings - .remove("trace_read_requests") - .map(|x| x.parse::()) - .transpose() - .context("Failed to parse 'trace_read_requests' as bool")?, eviction_policy: settings .remove("eviction_policy") .map(serde_json::from_str) @@ -454,11 +449,6 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?, - trace_read_requests: settings - .remove("trace_read_requests") - .map(|x| x.parse::()) - .transpose() - .context("Failed to parse 'trace_read_requests' as bool")?, eviction_policy: settings .remove("eviction_policy") .map(serde_json::from_str) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 
d360cc6e87..6abdcb88d0 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -294,7 +294,6 @@ pub struct TenantConfig { pub walreceiver_connect_timeout: Option, pub lagging_wal_timeout: Option, pub max_lsn_wal_lag: Option, - pub trace_read_requests: Option, pub eviction_policy: Option, pub min_resident_size_override: Option, pub evictions_low_residence_duration_metric_threshold: Option, diff --git a/libs/utils/src/id.rs b/libs/utils/src/id.rs index 0409001f4f..db468e3054 100644 --- a/libs/utils/src/id.rs +++ b/libs/utils/src/id.rs @@ -302,17 +302,6 @@ pub struct TenantId(Id); id_newtype!(TenantId); -/// Neon Connection Id identifies long-lived connections (for example a pagestream -/// connection with the page_service). Is used for better logging and tracing -/// -/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look -/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`. -/// See [`Id`] for alternative ways to serialize it. -#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)] -pub struct ConnectionId(Id); - -id_newtype!(ConnectionId); - // A pair uniquely identifying Neon instance. #[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)] pub struct TenantTimelineId { diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 17bc427b2c..5b103b551f 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -12,7 +12,6 @@ use serde::de::IntoDeserializer; use std::env; use storage_broker::Uri; use utils::crashsafe::path_with_suffix_extension; -use utils::id::ConnectionId; use utils::logging::SecretString; use once_cell::sync::OnceCell; @@ -870,22 +869,6 @@ impl PageServerConf { ) } - pub fn traces_path(&self) -> Utf8PathBuf { - self.workdir.join("traces") - } - - pub fn trace_path( - &self, - tenant_shard_id: &TenantShardId, - timeline_id: &TimelineId, - connection_id: &ConnectionId, - ) -> Utf8PathBuf { - self.traces_path() - .join(tenant_shard_id.to_string()) - .join(timeline_id.to_string()) - .join(connection_id.to_string()) - } - /// Turns storage remote path of a file into its local path. 
pub fn local_path(&self, remote_path: &RemotePath) -> Utf8PathBuf { remote_path.with_base(&self.workdir) @@ -1560,34 +1543,6 @@ broker_endpoint = '{broker_endpoint}' Ok(()) } - #[test] - fn parse_tenant_config() -> anyhow::Result<()> { - let tempdir = tempdir()?; - let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; - - let broker_endpoint = "http://127.0.0.1:7777"; - let trace_read_requests = true; - - let config_string = format!( - r#"{ALL_BASE_VALUES_TOML} -pg_distrib_dir='{pg_distrib_dir}' -broker_endpoint = '{broker_endpoint}' - -[tenant_config] -trace_read_requests = {trace_read_requests}"#, - ); - - let toml = config_string.parse()?; - - let conf = PageServerConf::parse_and_validate(&toml, &workdir)?; - assert_eq!( - conf.default_tenant_conf.trace_read_requests, trace_read_requests, - "Tenant config from pageserver config file should be parsed and udpated values used as defaults for all tenants", - ); - - Ok(()) - } - #[test] fn parse_incorrect_tenant_config() -> anyhow::Result<()> { let config_string = r#" diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 5ba329f05e..ae109ec1e7 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -873,8 +873,6 @@ components: type: string max_lsn_wal_lag: type: integer - trace_read_requests: - type: boolean heatmap_period: type: string TenantConfigResponse: diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index ac6b9b4f2a..63c677574f 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -23,7 +23,6 @@ pub mod span; pub(crate) mod statvfs; pub mod task_mgr; pub mod tenant; -pub mod trace; pub mod utilization; pub mod virtual_file; pub mod walingest; diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index c10c2f2a0f..f94b0d335e 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -36,7 +36,6 @@ use tokio::io::AsyncWriteExt; use tokio::io::{AsyncRead, AsyncWrite}; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::id::ConnectionId; use utils::sync::gate::GateGuard; use utils::{ auth::{Claims, Scope, SwappableJwtAuth}, @@ -66,7 +65,6 @@ use crate::tenant::GetTimelineError; use crate::tenant::PageReconstructError; use crate::tenant::Tenant; use crate::tenant::Timeline; -use crate::trace::Tracer; use pageserver_api::key::rel_block_to_key; use pageserver_api::reltag::SlruKind; use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; @@ -430,18 +428,6 @@ impl PageServerHandler { .get_active_tenant_with_timeout(tenant_id, ShardSelector::First, ACTIVE_TENANT_TIMEOUT) .await?; - // Make request tracer if needed - let mut tracer = if tenant.get_trace_read_requests() { - let connection_id = ConnectionId::generate(); - let path = - tenant - .conf - .trace_path(&tenant.tenant_shard_id(), &timeline_id, &connection_id); - Some(Tracer::new(path)) - } else { - None - }; - // switch client to COPYBOTH pgb.write_message_noflush(&BeMessage::CopyBothResponse)?; self.flush_cancellable(pgb, &tenant.cancel).await?; @@ -473,11 +459,6 @@ impl PageServerHandler { trace!("query: {copy_data_bytes:?}"); fail::fail_point!("ps::handle-pagerequest-message"); - // Trace request if needed - if let Some(t) = tracer.as_mut() { - t.trace(©_data_bytes) - } - let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index eef8dc104c..bf23513527 100644 --- a/pageserver/src/tenant.rs +++ 
b/pageserver/src/tenant.rs @@ -2341,13 +2341,6 @@ impl Tenant { .unwrap_or(self.conf.default_tenant_conf.pitr_interval) } - pub fn get_trace_read_requests(&self) -> bool { - let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); - tenant_conf - .trace_read_requests - .unwrap_or(self.conf.default_tenant_conf.trace_read_requests) - } - pub fn get_min_resident_size_override(&self) -> Option { let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf @@ -3718,7 +3711,6 @@ pub(crate) mod harness { walreceiver_connect_timeout: Some(tenant_conf.walreceiver_connect_timeout), lagging_wal_timeout: Some(tenant_conf.lagging_wal_timeout), max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag), - trace_read_requests: Some(tenant_conf.trace_read_requests), eviction_policy: Some(tenant_conf.eviction_policy), min_resident_size_override: tenant_conf.min_resident_size_override, evictions_low_residence_duration_metric_threshold: Some( diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 5b532e4830..48ff17db94 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -335,7 +335,6 @@ pub struct TenantConf { /// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update, /// to avoid eager reconnects. pub max_lsn_wal_lag: NonZeroU64, - pub trace_read_requests: bool, pub eviction_policy: EvictionPolicy, pub min_resident_size_override: Option, // See the corresponding metric's help string. @@ -436,10 +435,6 @@ pub struct TenantConfOpt { #[serde(default)] pub max_lsn_wal_lag: Option, - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(default)] - pub trace_read_requests: Option, - #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] pub eviction_policy: Option, @@ -519,9 +514,6 @@ impl TenantConfOpt { .lagging_wal_timeout .unwrap_or(global_conf.lagging_wal_timeout), max_lsn_wal_lag: self.max_lsn_wal_lag.unwrap_or(global_conf.max_lsn_wal_lag), - trace_read_requests: self - .trace_read_requests - .unwrap_or(global_conf.trace_read_requests), eviction_policy: self.eviction_policy.unwrap_or(global_conf.eviction_policy), min_resident_size_override: self .min_resident_size_override @@ -581,7 +573,6 @@ impl Default for TenantConf { .expect("cannot parse default walreceiver lagging wal timeout"), max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG) .expect("cannot parse default max walreceiver Lsn wal lag"), - trace_read_requests: false, eviction_policy: EvictionPolicy::NoEviction, min_resident_size_override: None, evictions_low_residence_duration_metric_threshold: humantime::parse_duration( @@ -659,7 +650,6 @@ impl From for models::TenantConfig { walreceiver_connect_timeout: value.walreceiver_connect_timeout.map(humantime), lagging_wal_timeout: value.lagging_wal_timeout.map(humantime), max_lsn_wal_lag: value.max_lsn_wal_lag, - trace_read_requests: value.trace_read_requests, eviction_policy: value.eviction_policy, min_resident_size_override: value.min_resident_size_override, evictions_low_residence_duration_metric_threshold: value diff --git a/pageserver/src/trace.rs b/pageserver/src/trace.rs deleted file mode 100644 index 18ec269198..0000000000 --- a/pageserver/src/trace.rs +++ /dev/null @@ -1,36 +0,0 @@ -use bytes::Bytes; -use camino::Utf8PathBuf; -use std::{ - fs::{create_dir_all, File}, - io::{BufWriter, Write}, -}; - -pub struct Tracer { - writer: BufWriter, -} - -impl Drop for Tracer { - fn drop(&mut self) { - self.flush() - } -} - -impl Tracer { - pub 
fn new(path: Utf8PathBuf) -> Self { - let parent = path.parent().expect("failed to parse parent path"); - create_dir_all(parent).expect("failed to create trace dir"); - - let file = File::create(path).expect("failed to create trace file"); - Tracer { - writer: BufWriter::new(file), - } - } - - pub fn trace(&mut self, msg: &Bytes) { - self.writer.write_all(msg).expect("failed to write trace"); - } - - pub fn flush(&mut self) { - self.writer.flush().expect("failed to flush trace file"); - } -} diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 429b6af548..08215438e1 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -109,8 +109,6 @@ class NeonCompare(PgCompare): # Create tenant tenant_conf: Dict[str, str] = {} - if False: # TODO add pytest setting for this - tenant_conf["trace_read_requests"] = "true" self.tenant, _ = self.env.neon_cli.create_tenant(conf=tenant_conf) # Create timeline diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index f2ee2b70aa..a7eda73d4c 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -168,7 +168,6 @@ def test_fully_custom_config(positive_env: NeonEnv): "refill_amount": 1000, "max": 1000, }, - "trace_read_requests": True, "walreceiver_connect_timeout": "13m", "image_layer_creation_check_threshold": 1, "switch_aux_file_policy": "cross-validation", diff --git a/test_runner/regress/test_read_trace.py b/test_runner/regress/test_read_trace.py deleted file mode 100644 index cc5853b727..0000000000 --- a/test_runner/regress/test_read_trace.py +++ /dev/null @@ -1,39 +0,0 @@ -from contextlib import closing - -from fixtures.common_types import Lsn -from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.pageserver.utils import wait_for_last_record_lsn -from fixtures.utils import query_scalar - - -# This test demonstrates how to collect a read trace. It's useful until -# it gets replaced by a test that actually does stuff with the trace. -# -# Additionally, tests that pageserver is able to create tenants with custom configs. 
-def test_read_request_tracing(neon_env_builder: NeonEnvBuilder): - neon_env_builder.num_safekeepers = 1 - env = neon_env_builder.init_start( - initial_tenant_conf={ - "trace_read_requests": "true", - } - ) - - tenant_id = env.initial_tenant - timeline_id = env.initial_timeline - endpoint = env.endpoints.create_start("main") - - with closing(endpoint.connect()) as conn: - with conn.cursor() as cur: - cur.execute("create table t (i integer);") - cur.execute(f"insert into t values (generate_series(1,{10000}));") - cur.execute("select count(*) from t;") - current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) - # wait until pageserver receives that data - pageserver_http = env.pageserver.http_client() - wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) - - # Stop postgres so we drop the connection and flush the traces - endpoint.stop() - - trace_path = env.pageserver.workdir / "traces" / str(tenant_id) / str(timeline_id) - assert trace_path.exists() diff --git a/trace/Cargo.toml b/trace/Cargo.toml deleted file mode 100644 index d6eed3f49c..0000000000 --- a/trace/Cargo.toml +++ /dev/null @@ -1,13 +0,0 @@ -[package] -name = "trace" -version = "0.1.0" -edition.workspace = true -license.workspace = true - -[dependencies] -clap.workspace = true -anyhow.workspace = true - -pageserver_api.workspace = true -utils.workspace = true -workspace_hack.workspace = true diff --git a/trace/src/main.rs b/trace/src/main.rs deleted file mode 100644 index 79e1df988d..0000000000 --- a/trace/src/main.rs +++ /dev/null @@ -1,167 +0,0 @@ -//! A tool for working with read traces generated by the pageserver. -use std::collections::HashMap; -use std::path::PathBuf; -use std::str::FromStr; -use std::{ - fs::{read_dir, File}, - io::BufReader, -}; - -use pageserver_api::models::{ - PagestreamFeMessage, PagestreamGetPageRequest, PagestreamProtocolVersion, -}; -use utils::id::{ConnectionId, TenantId, TimelineId}; - -use clap::{Parser, Subcommand}; - -/// Utils for working with pageserver read traces. For generating -/// traces, see the `trace_read_requests` tenant config option. -#[derive(Parser, Debug)] -#[command(author, version, about, long_about = None)] -struct Args { - /// Path of trace directory - #[arg(short, long)] - path: PathBuf, - - #[command(subcommand)] - command: Command, -} - -/// What to do with the read trace -#[derive(Subcommand, Debug)] -enum Command { - /// List traces in the directory - List, - - /// Print the traces in text format - Dump, - - /// Print stats and anomalies about the traces - Analyze, -} - -// HACK This function will change and improve as we see what kind of analysis is useful. -// Currently it collects the difference in blkno of consecutive GetPage requests, -// and counts the frequency of each value. 
This information is useful in order to: -// - see how sequential a workload is by seeing how often the delta is 1 -// - detect any prefetching anomalies by looking for negative deltas during seqscan -fn analyze_trace(mut reader: R) { - let mut total = 0; // Total requests traced - let mut cross_rel = 0; // Requests that ask for different rel than previous request - let mut deltas = HashMap::::new(); // Consecutive blkno differences - let mut prev: Option = None; - - // Compute stats - while let Ok(msg) = PagestreamFeMessage::parse(&mut reader, PagestreamProtocolVersion::V2) { - match msg { - PagestreamFeMessage::Exists(_) => {} - PagestreamFeMessage::Nblocks(_) => {} - PagestreamFeMessage::GetSlruSegment(_) => {} - PagestreamFeMessage::GetPage(req) => { - total += 1; - - if let Some(prev) = prev { - if prev.rel == req.rel { - let delta = (req.blkno as i32) - (prev.blkno as i32); - deltas.entry(delta).and_modify(|c| *c += 1).or_insert(1); - } else { - cross_rel += 1; - } - } - prev = Some(req); - } - PagestreamFeMessage::DbSize(_) => {} - }; - } - - // Print stats. - let mut other = deltas.len(); - deltas.retain(|_, count| *count > 300); - other -= deltas.len(); - dbg!(total); - dbg!(cross_rel); - dbg!(other); - dbg!(deltas); -} - -fn dump_trace(mut reader: R) { - while let Ok(msg) = PagestreamFeMessage::parse(&mut reader, PagestreamProtocolVersion::V2) { - println!("{msg:?}"); - } -} - -#[derive(Debug)] -struct TraceFile { - #[allow(dead_code)] - pub tenant_id: TenantId, - - #[allow(dead_code)] - pub timeline_id: TimelineId, - - #[allow(dead_code)] - pub connection_id: ConnectionId, - - pub path: PathBuf, -} - -fn get_trace_files(traces_dir: &PathBuf) -> anyhow::Result> { - let mut trace_files = Vec::::new(); - - // Trace files are organized as {tenant_id}/{timeline_id}/{connection_id} - for tenant_dir in read_dir(traces_dir)? { - let entry = tenant_dir?; - let path = entry.path(); - let tenant_id = TenantId::from_str(path.file_name().unwrap().to_str().unwrap())?; - - for timeline_dir in read_dir(path)? { - let entry = timeline_dir?; - let path = entry.path(); - let timeline_id = TimelineId::from_str(path.file_name().unwrap().to_str().unwrap())?; - - for trace_dir in read_dir(path)? { - let entry = trace_dir?; - let path = entry.path(); - let connection_id = - ConnectionId::from_str(path.file_name().unwrap().to_str().unwrap())?; - - trace_files.push(TraceFile { - tenant_id, - timeline_id, - connection_id, - path, - }); - } - } - } - - Ok(trace_files) -} - -fn main() -> anyhow::Result<()> { - let args = Args::parse(); - - match args.command { - Command::List => { - for trace_file in get_trace_files(&args.path)? { - println!("{trace_file:?}"); - } - } - Command::Dump => { - for trace_file in get_trace_files(&args.path)? { - let file = File::open(trace_file.path.clone())?; - let reader = BufReader::new(file); - dump_trace(reader); - } - } - Command::Analyze => { - for trace_file in get_trace_files(&args.path)? { - println!("analyzing {trace_file:?}"); - let file = File::open(trace_file.path.clone())?; - let reader = BufReader::new(file); - analyze_trace(reader); - } - } - } - - Ok(()) -} From d9a82468e27e185fb1f18d4da0d63ac18e37ac2d Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 11 Jul 2024 15:43:28 +0100 Subject: [PATCH 1171/1571] storage_controller: fix ReconcilerWaiter::get_status (#8341) ## Problem SeqWait::would_wait_for returns Ok in the case when we would not wait for the sequence number and Err otherwise. ReconcilerWaiter::get_status uses it the wrong way around. 
This can cause the storage controller to go into a busy loop and make it look unavailable to the k8s controller. ## Summary of changes Use `SeqWait::would_wait_for` correctly. --- storage_controller/src/tenant_shard.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 3fcf31ac10..2ddab58aaf 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -383,9 +383,9 @@ impl ReconcilerWaiter { } pub(crate) fn get_status(&self) -> ReconcilerStatus { - if self.seq_wait.would_wait_for(self.seq).is_err() { + if self.seq_wait.would_wait_for(self.seq).is_ok() { ReconcilerStatus::Done - } else if self.error_seq_wait.would_wait_for(self.seq).is_err() { + } else if self.error_seq_wait.would_wait_for(self.seq).is_ok() { ReconcilerStatus::Failed } else { ReconcilerStatus::InProgress From 0159ae9536d6b9e0a9cb27b0ced3fd244faf63d0 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 11 Jul 2024 17:05:35 +0100 Subject: [PATCH 1172/1571] safekeeper: eviction metrics (#8348) ## Problem Follow up to https://github.com/neondatabase/neon/pull/8335, to improve observability of how many evict/restores we are doing. ## Summary of changes - Add `safekeeper_eviction_events_started_total` and `safekeeper_eviction_events_completed_total`, with a "kind" label of evict or restore. This gives us rates, and also ability to calculate how many are in progress. - Generalize SafekeeperMetrics test type to use the same helpers as pageserver, and enable querying any metric. - Read the new metrics at the end of the eviction test. --- Cargo.lock | 2 + safekeeper/Cargo.toml | 2 + safekeeper/src/metrics.rs | 26 +++++++++++++ safekeeper/src/timeline_eviction.rs | 19 ++++++++++ test_runner/fixtures/safekeeper/http.py | 48 +++++++++++------------- test_runner/regress/test_wal_acceptor.py | 24 +++++++++++- 6 files changed, 92 insertions(+), 29 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4b1525edee..b31ac69e6c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5206,6 +5206,8 @@ dependencies = [ "sha2", "signal-hook", "storage_broker", + "strum", + "strum_macros", "thiserror", "tokio", "tokio-io-timeout", diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index a650d5e207..9f32016fd9 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -41,6 +41,8 @@ serde.workspace = true serde_json.workspace = true serde_with.workspace = true signal-hook.workspace = true +strum.workspace = true +strum_macros.workspace = true thiserror.workspace = true tokio = { workspace = true, features = ["fs"] } tokio-util = { workspace = true } diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 539ecf826b..aa2bafbe92 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -205,6 +205,32 @@ pub static WAL_BACKUP_TASKS: Lazy = Lazy::new(|| { .expect("Failed to register safekeeper_wal_backup_tasks_finished_total counter") }); +// Metrics collected on operations on the storage repository. 
+#[derive(strum_macros::EnumString, strum_macros::Display, strum_macros::IntoStaticStr)] +#[strum(serialize_all = "kebab_case")] +pub(crate) enum EvictionEvent { + Evict, + Restore, +} + +pub(crate) static EVICTION_EVENTS_STARTED: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "safekeeper_eviction_events_started_total", + "Number of eviction state changes, incremented when they start", + &["kind"] + ) + .expect("Failed to register metric") +}); + +pub(crate) static EVICTION_EVENTS_COMPLETED: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "safekeeper_eviction_events_completed_total", + "Number of eviction state changes, incremented when they complete", + &["kind"] + ) + .expect("Failed to register metric") +}); + pub const LABEL_UNKNOWN: &str = "unknown"; /// Labels for traffic metrics. diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs index e4ab65290d..0b8d58ee8a 100644 --- a/safekeeper/src/timeline_eviction.rs +++ b/safekeeper/src/timeline_eviction.rs @@ -14,6 +14,7 @@ use tracing::{debug, info, instrument, warn}; use utils::crashsafe::durable_rename; use crate::{ + metrics::{EvictionEvent, EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED}, timeline_manager::{Manager, StateSnapshot}, wal_backup, wal_backup_partial::{self, PartialRemoteSegment}, @@ -66,6 +67,15 @@ impl Manager { info!("starting eviction, using {:?}", partial_backup_uploaded); + EVICTION_EVENTS_STARTED + .with_label_values(&[EvictionEvent::Evict.into()]) + .inc(); + let _guard = scopeguard::guard((), |_| { + EVICTION_EVENTS_COMPLETED + .with_label_values(&[EvictionEvent::Evict.into()]) + .inc(); + }); + if let Err(e) = do_eviction(self, &partial_backup_uploaded).await { warn!("failed to evict timeline: {:?}", e); return; @@ -88,6 +98,15 @@ impl Manager { info!("starting uneviction, using {:?}", partial_backup_uploaded); + EVICTION_EVENTS_STARTED + .with_label_values(&[EvictionEvent::Restore.into()]) + .inc(); + let _guard = scopeguard::guard((), |_| { + EVICTION_EVENTS_COMPLETED + .with_label_values(&[EvictionEvent::Restore.into()]) + .inc(); + }); + if let Err(e) = do_uneviction(self, &partial_backup_uploaded).await { warn!("failed to unevict timeline: {:?}", e); return; diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index 11e6fef28f..a51b89744b 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -1,6 +1,5 @@ import json -import re -from dataclasses import dataclass, field +from dataclasses import dataclass from typing import Any, Dict, List, Optional, Tuple, Union import pytest @@ -8,6 +7,7 @@ import requests from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log +from fixtures.metrics import Metrics, MetricsGetter, parse_metrics # Walreceiver as returned by sk's timeline status endpoint. @@ -31,15 +31,26 @@ class SafekeeperTimelineStatus: walreceivers: List[Walreceiver] -@dataclass -class SafekeeperMetrics: +class SafekeeperMetrics(Metrics): + # Helpers to get metrics from tests without hardcoding the metric names there. # These are metrics from Prometheus which uses float64 internally. # As a consequence, values may differ from real original int64s. 
- flush_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict) - commit_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict) + + def __init__(self, m: Metrics): + self.metrics = m.metrics + + def flush_lsn_inexact(self, tenant_id: TenantId, timeline_id: TimelineId): + return self.query_one( + "safekeeper_flush_lsn", {"tenant_id": str(tenant_id), "timeline_id": str(timeline_id)} + ).value + + def commit_lsn_inexact(self, tenant_id: TenantId, timeline_id: TimelineId): + return self.query_one( + "safekeeper_commit_lsn", {"tenant_id": str(tenant_id), "timeline_id": str(timeline_id)} + ).value -class SafekeeperHttpClient(requests.Session): +class SafekeeperHttpClient(requests.Session, MetricsGetter): HTTPError = requests.HTTPError def __init__(self, port: int, auth_token: Optional[str] = None, is_testing_enabled=False): @@ -209,28 +220,11 @@ class SafekeeperHttpClient(requests.Session): return res_json def get_metrics_str(self) -> str: + """You probably want to use get_metrics() instead.""" request_result = self.get(f"http://localhost:{self.port}/metrics") request_result.raise_for_status() return request_result.text def get_metrics(self) -> SafekeeperMetrics: - all_metrics_text = self.get_metrics_str() - - metrics = SafekeeperMetrics() - for match in re.finditer( - r'^safekeeper_flush_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$', - all_metrics_text, - re.MULTILINE, - ): - metrics.flush_lsn_inexact[(TenantId(match.group(1)), TimelineId(match.group(2)))] = int( - match.group(3) - ) - for match in re.finditer( - r'^safekeeper_commit_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$', - all_metrics_text, - re.MULTILINE, - ): - metrics.commit_lsn_inexact[ - (TenantId(match.group(1)), TimelineId(match.group(2))) - ] = int(match.group(3)) - return metrics + res = self.get_metrics_str() + return SafekeeperMetrics(parse_metrics(res)) diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 7efd86e349..e0ad4fdd5c 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -147,8 +147,8 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): last_record_lsn=Lsn(timeline_detail["last_record_lsn"]), ) for sk_m in sk_metrics: - m.flush_lsns.append(Lsn(sk_m.flush_lsn_inexact[(tenant_id, timeline_id)])) - m.commit_lsns.append(Lsn(sk_m.commit_lsn_inexact[(tenant_id, timeline_id)])) + m.flush_lsns.append(Lsn(int(sk_m.flush_lsn_inexact(tenant_id, timeline_id)))) + m.commit_lsns.append(Lsn(int(sk_m.commit_lsn_inexact(tenant_id, timeline_id)))) for flush_lsn, commit_lsn in zip(m.flush_lsns, m.commit_lsns): # Invariant. May be < when transaction is in progress. 
@@ -2274,3 +2274,23 @@ def test_s3_eviction( and sk.log_contains("successfully restored evicted timeline") for sk in env.safekeepers ) + + assert any( + sk.http_client().get_metric_value( + "safekeeper_eviction_events_started_total", {"kind": "evict"} + ) + or 0 > 0 + and sk.http_client().get_metric_value( + "safekeeper_eviction_events_completed_total", {"kind": "evict"} + ) + or 0 > 0 + and sk.http_client().get_metric_value( + "safekeeper_eviction_events_started_total", {"kind": "restore"} + ) + or 0 > 0 + and sk.http_client().get_metric_value( + "safekeeper_eviction_events_completed_total", {"kind": "restore"} + ) + or 0 > 0 + for sk in env.safekeepers + ) From 814c8e8f683ee8fdddc86de99bf33900f423b4d4 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 11 Jul 2024 17:05:47 +0100 Subject: [PATCH 1173/1571] storage controller: add node deletion API (#8226) ## Problem In anticipation of later adding a really nice drain+delete API, I initially only added an intentionally basic `/drop` API that is just about usable for deleting nodes in a pinch, but requires some ugly storage controller restarts to persuade it to restart secondaries. ## Summary of changes I started making a few tiny fixes, and ended up writing the delete API... - Quality of life nit: ordering of node + tenant listings in storcon_cli - Papercut: Fix the attach_hook using the wrong operation type for reporting slow locks - Make Service::spawn tolerate `generation_pageserver` columns that point to nonexistent node IDs. I started out thinking of this as a general resilience thing, but when implementing the delete API I realized it was actually a legitimate end state after the delete API is called (as that API doesn't wait for all reconciles to succeed). - Add a `DELETE` API for nodes, which does not gracefully drain, but does reschedule everything. This becomes safe to use when the system is in any state, but will incur availability gaps for any tenants that weren't already live-migrated away. If tenants have already been drained, this becomes a totally clean + safe way to decom a node. - Add a test and a storcon_cli wrapper for it This is meant to be a robust initial API that lets us remove nodes without doing ugly things like restarting the storage controller -- it's not quite a totally graceful node-draining routine yet. There's more work in https://github.com/neondatabase/neon/issues/8333 to get to our end-end state. 
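
For reference, a minimal sketch (not part of this change) of driving the new API over plain HTTP with
Python `requests`; the controller address, auth header and node id below are hypothetical, and steps 1-3
(pause scheduling, mark the node offline, wait for shards to migrate away) are assumed to have been done
already, e.g. via `storcon_cli`:

```python
import requests

api = "http://127.0.0.1:1234"  # hypothetical storage controller address
headers = {"Authorization": "Bearer <admin-scoped token>"}  # hypothetical auth header
node_id = 42  # hypothetical pageserver node id

# Step 4: delete the node; shards still referring to it are rescheduled, without
# waiting for the resulting reconciles to complete.
requests.delete(f"{api}/control/v1/node/{node_id}", headers=headers).raise_for_status()

# Step 5 is stopping the pageserver itself; afterwards the node is gone from the listing.
nodes = requests.get(f"{api}/control/v1/node", headers=headers).json()
assert node_id not in [n["id"] for n in nodes]
```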
--- control_plane/storcon_cli/src/main.rs | 19 ++- storage_controller/src/http.rs | 11 ++ storage_controller/src/service.rs | 121 +++++++++++++++++- storage_controller/src/tenant_shard.rs | 19 ++- test_runner/fixtures/neon_fixtures.py | 8 ++ test_runner/regress/test_compatibility.py | 25 ++++ .../regress/test_storage_controller.py | 88 +++++++++++++ 7 files changed, 277 insertions(+), 14 deletions(-) diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index b2c5dfe58a..815f5c940f 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -56,6 +56,10 @@ enum Command { #[arg(long)] scheduling: Option, }, + NodeDelete { + #[arg(long)] + node_id: NodeId, + }, /// Modify a tenant's policies in the storage controller TenantPolicy { #[arg(long)] @@ -357,13 +361,16 @@ async fn main() -> anyhow::Result<()> { tracing::info!("Delete status: {}", status); } Command::Nodes {} => { - let resp = storcon_client + let mut resp = storcon_client .dispatch::<(), Vec>( Method::GET, "control/v1/node".to_string(), None, ) .await?; + + resp.sort_by(|a, b| a.listen_http_addr.cmp(&b.listen_http_addr)); + let mut table = comfy_table::Table::new(); table.set_header(["Id", "Hostname", "Scheduling", "Availability"]); for node in resp { @@ -395,13 +402,16 @@ async fn main() -> anyhow::Result<()> { .await?; } Command::Tenants {} => { - let resp = storcon_client + let mut resp = storcon_client .dispatch::<(), Vec>( Method::GET, "control/v1/tenant".to_string(), None, ) .await?; + + resp.sort_by(|a, b| a.tenant_id.cmp(&b.tenant_id)); + let mut table = comfy_table::Table::new(); table.set_header([ "TenantId", @@ -650,6 +660,11 @@ async fn main() -> anyhow::Result<()> { .dispatch::<(), ()>(Method::POST, format!("debug/v1/node/{node_id}/drop"), None) .await?; } + Command::NodeDelete { node_id } => { + storcon_client + .dispatch::<(), ()>(Method::DELETE, format!("control/v1/node/{node_id}"), None) + .await?; + } Command::TenantSetTimeBasedEviction { tenant_id, period, diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 7446ad53a2..3a62c0dd4f 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -456,6 +456,14 @@ async fn handle_node_drop(req: Request) -> Result, ApiError json_response(StatusCode::OK, state.service.node_drop(node_id).await?) } +async fn handle_node_delete(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + let node_id: NodeId = parse_request_param(&req, "node_id")?; + json_response(StatusCode::OK, state.service.node_delete(node_id).await?) 
+} + async fn handle_node_configure(mut req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; @@ -878,6 +886,9 @@ pub fn make_router( .post("/control/v1/node", |r| { named_request_span(r, handle_node_register, RequestName("control_v1_node")) }) + .delete("/control/v1/node/:node_id", |r| { + named_request_span(r, handle_node_delete, RequestName("control_v1_node_delete")) + }) .get("/control/v1/node", |r| { named_request_span(r, handle_node_list, RequestName("control_v1_node")) }) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index aada1939ee..b6e2b53191 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -2,6 +2,7 @@ use std::{ borrow::Cow, cmp::Ordering, collections::{BTreeMap, HashMap, HashSet}, + ops::Deref, path::PathBuf, str::FromStr, sync::Arc, @@ -115,12 +116,14 @@ enum TenantOperations { SecondaryDownload, TimelineCreate, TimelineDelete, + AttachHook, } #[derive(Clone, strum_macros::Display)] enum NodeOperations { Register, Configure, + Delete, } pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128; @@ -845,9 +848,10 @@ impl Service { tenant_id=%result.tenant_shard_id.tenant_id, shard_id=%result.tenant_shard_id.shard_slug(), sequence=%result.sequence ))] - fn process_result(&self, result: ReconcileResult) { + fn process_result(&self, mut result: ReconcileResult) { let mut locked = self.inner.write().unwrap(); - let Some(tenant) = locked.tenants.get_mut(&result.tenant_shard_id) else { + let (nodes, tenants, _scheduler) = locked.parts_mut(); + let Some(tenant) = tenants.get_mut(&result.tenant_shard_id) else { // A reconciliation result might race with removing a tenant: drop results for // tenants that aren't in our map. return; @@ -864,6 +868,13 @@ impl Service { // Let the TenantShard know it is idle. tenant.reconcile_complete(result.sequence); + // In case a node was deleted while this reconcile is in flight, filter it out of the update we will + // make to the tenant + result + .observed + .locations + .retain(|node_id, _loc| nodes.contains_key(node_id)); + match result.result { Ok(()) => { for (node_id, loc) in &result.observed.locations { @@ -873,6 +884,7 @@ impl Service { tracing::info!("Setting observed location {} to None", node_id,) } } + tenant.observed = result.observed; tenant.waiter.advance(result.sequence); } @@ -1109,8 +1121,16 @@ impl Service { // We will populate intent properly later in [`Self::startup_reconcile`], initially populate // it with what we can infer: the node for which a generation was most recently issued. let mut intent = IntentState::new(); - if let Some(generation_pageserver) = tsp.generation_pageserver { - intent.set_attached(&mut scheduler, Some(NodeId(generation_pageserver as u64))); + if let Some(generation_pageserver) = tsp.generation_pageserver.map(|n| NodeId(n as u64)) + { + if nodes.contains_key(&generation_pageserver) { + intent.set_attached(&mut scheduler, Some(generation_pageserver)); + } else { + // If a node was removed before being completely drained, it is legal for it to leave behind a `generation_pageserver` referring + // to a non-existent node, because node deletion doesn't block on completing the reconciliations that will issue new generations + // on different pageservers. 
+ tracing::warn!("Tenant shard {tenant_shard_id} references non-existent node {generation_pageserver} in database, will be rescheduled"); + } } let new_tenant = TenantShard::from_persistent(tsp, intent)?; @@ -1237,7 +1257,7 @@ impl Service { let _tenant_lock = trace_exclusive_lock( &self.tenant_op_locks, attach_req.tenant_shard_id.tenant_id, - TenantOperations::ShardSplit, + TenantOperations::AttachHook, ) .await; @@ -4210,8 +4230,6 @@ impl Service { /// This is for debug/support only: we simply drop all state for a tenant, without /// detaching or deleting it on pageservers. We do not try and re-schedule any /// tenants that were on this node. - /// - /// TODO: proper node deletion API that unhooks things more gracefully pub(crate) async fn node_drop(&self, node_id: NodeId) -> Result<(), ApiError> { self.persistence.delete_node(node_id).await?; @@ -4219,6 +4237,7 @@ impl Service { for shard in locked.tenants.values_mut() { shard.deref_node(node_id); + shard.observed.locations.remove(&node_id); } let mut nodes = (*locked.nodes).clone(); @@ -4230,6 +4249,94 @@ impl Service { Ok(()) } + /// If a node has any work on it, it will be rescheduled: this is "clean" in the sense + /// that we don't leave any bad state behind in the storage controller, but unclean + /// in the sense that we are not carefully draining the node. + pub(crate) async fn node_delete(&self, node_id: NodeId) -> Result<(), ApiError> { + let _node_lock = + trace_exclusive_lock(&self.node_op_locks, node_id, NodeOperations::Delete).await; + + // 1. Atomically update in-memory state: + // - set the scheduling state to Pause to make subsequent scheduling ops skip it + // - update shards' intents to exclude the node, and reschedule any shards whose intents we modified. + // - drop the node from the main nodes map, so that when running reconciles complete they do not + // re-insert references to this node into the ObservedState of shards + // - drop the node from the scheduler + { + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + + { + let mut nodes_mut = (*nodes).deref().clone(); + match nodes_mut.get_mut(&node_id) { + Some(node) => { + // We do not bother setting this in the database, because we're about to delete the row anyway, and + // if we crash it would not be desirable to leave the node paused after a restart. + node.set_scheduling(NodeSchedulingPolicy::Pause); + } + None => { + tracing::info!( + "Node not found: presuming this is a retry and returning success" + ); + return Ok(()); + } + } + + *nodes = Arc::new(nodes_mut); + } + + for (tenant_shard_id, shard) in tenants { + if shard.deref_node(node_id) { + // FIXME: we need to build a ScheduleContext that reflects this shard's peers, otherwise + // it won't properly do anti-affinity. + let mut schedule_context = ScheduleContext::default(); + + if let Err(e) = shard.schedule(scheduler, &mut schedule_context) { + // TODO: implement force flag to remove a node even if we can't reschedule + // a tenant + tracing::error!("Refusing to delete node, shard {tenant_shard_id} can't be rescheduled: {e}"); + return Err(e.into()); + } else { + tracing::info!( + "Rescheduled shard {tenant_shard_id} away from node during deletion" + ) + } + + self.maybe_reconcile_shard(shard, nodes); + } + + // Here we remove an existing observed location for the node we're removing, and it will + // not be re-added by a reconciler's completion because we filter out removed nodes in + // process_result. 
+ // + // Note that we update the shard's observed state _after_ calling maybe_reconcile_shard: that + // means any reconciles we spawned will know about the node we're deleting, enabling them + // to do live migrations if it's still online. + shard.observed.locations.remove(&node_id); + } + + scheduler.node_remove(node_id); + + { + let mut nodes_mut = (**nodes).clone(); + nodes_mut.remove(&node_id); + *nodes = Arc::new(nodes_mut); + } + } + + // Note: some `generation_pageserver` columns on tenant shards in the database may still refer to + // the removed node, as this column means "The pageserver to which this generation was issued", and + // their generations won't get updated until the reconcilers moving them away from this node complete. + // That is safe because in Service::spawn we only use generation_pageserver if it refers to a node + // that exists. + + // 2. Actually delete the node from the database and from in-memory state + tracing::info!("Deleting node from database"); + self.persistence.delete_node(node_id).await?; + + Ok(()) + } + pub(crate) async fn node_list(&self) -> Result, ApiError> { let nodes = { self.inner diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 2ddab58aaf..2574dc297a 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -1229,18 +1229,27 @@ impl TenantShard { } } - // If we had any state at all referring to this node ID, drop it. Does not - // attempt to reschedule. - pub(crate) fn deref_node(&mut self, node_id: NodeId) { + /// If we had any state at all referring to this node ID, drop it. Does not + /// attempt to reschedule. + /// + /// Returns true if we modified the node's intent state. + pub(crate) fn deref_node(&mut self, node_id: NodeId) -> bool { + let mut intent_modified = false; + + // Drop if this node was our attached intent if self.intent.attached == Some(node_id) { self.intent.attached = None; + intent_modified = true; } + // Drop from the list of secondaries, and check if we modified it + let had_secondaries = self.intent.secondary.len(); self.intent.secondary.retain(|n| n != &node_id); - - self.observed.locations.remove(&node_id); + intent_modified |= self.intent.secondary.len() != had_secondaries; debug_assert!(!self.intent.all_pageservers().contains(&node_id)); + + intent_modified } pub(crate) fn set_scheduling_policy(&mut self, p: ShardSchedulingPolicy) { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 5ca31644a9..463e4a3b01 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2287,6 +2287,14 @@ class NeonStorageController(MetricsGetter, LogUtils): headers=self.headers(TokenScope.ADMIN), ) + def node_delete(self, node_id): + log.info(f"node_delete({node_id})") + self.request( + "DELETE", + f"{self.env.storage_controller_api}/control/v1/node/{node_id}", + headers=self.headers(TokenScope.ADMIN), + ) + def node_drain(self, node_id): log.info(f"node_drain({node_id})") self.request( diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 65649e0c0a..1e5e320e0e 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -93,6 +93,29 @@ check_ondisk_data_compatibility_if_enabled = pytest.mark.skipif( ) +def fixup_storage_controller(env: NeonEnv): + """ + After importing a repo_dir, we need to massage the storage controller's state a bit: it will have + initially 
started up with no nodes, but some tenants, and thereby those tenants won't be scheduled + anywhere. + + After NeonEnv.start() is done (i.e. nodes are started + registered), call this function to get + the storage controller into a good state. + + This function should go away once compat tests carry the controller database in their snapshots, so + that the controller properly remembers nodes between creating + restoring the snapshot. + """ + env.storage_controller.allowed_errors.extend( + [ + ".*Tenant shard .+ references non-existent node.*", + ".*Failed to schedule tenant .+ at startup.*", + ] + ) + env.storage_controller.stop() + env.storage_controller.start() + env.storage_controller.reconcile_until_idle() + + @pytest.mark.xdist_group("compatibility") @pytest.mark.order(before="test_forward_compatibility") def test_create_snapshot( @@ -175,6 +198,7 @@ def test_backward_compatibility( neon_env_builder.num_safekeepers = 3 env = neon_env_builder.from_repo_dir(compatibility_snapshot_dir / "repo") neon_env_builder.start() + fixup_storage_controller(env) check_neon_works( env, @@ -263,6 +287,7 @@ def test_forward_compatibility( assert not env.pageserver.log_contains("git-env:" + prev_pageserver_version) neon_env_builder.start() + fixup_storage_controller(env) # ensure the specified pageserver is running assert env.pageserver.log_contains("git-env:" + prev_pageserver_version) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index d37f7aae3d..741f16685e 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -1611,3 +1611,91 @@ def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder): env.storage_controller.cancel_node_drain(ps_id_to_drain) env.storage_controller.poll_node_status(ps_id_to_drain, "Active", max_attempts=6, backoff=2) + + +@pytest.mark.parametrize("while_offline", [True, False]) +def test_storage_controller_node_deletion( + neon_env_builder: NeonEnvBuilder, + compute_reconfigure_listener: ComputeReconfigure, + while_offline: bool, +): + """ + Test that deleting a node works & properly reschedules everything that was on the node. + """ + neon_env_builder.num_pageservers = 3 + env = neon_env_builder.init_configs() + env.start() + + tenant_count = 10 + shard_count_per_tenant = 8 + tenant_ids = [] + for _ in range(0, tenant_count): + tid = TenantId.generate() + tenant_ids.append(tid) + env.neon_cli.create_tenant( + tid, placement_policy='{"Attached":1}', shard_count=shard_count_per_tenant + ) + + victim = env.pageservers[-1] + + # The procedure a human would follow is: + # 1. Mark pageserver scheduling=pause + # 2. Mark pageserver availability=offline to trigger migrations away from it + # 3. Wait for attachments to all move elsewhere + # 4. Call deletion API + # 5. Stop the node. 
+ + env.storage_controller.node_configure(victim.id, {"scheduling": "Pause"}) + + if while_offline: + victim.stop(immediate=True) + env.storage_controller.node_configure(victim.id, {"availability": "Offline"}) + + def assert_shards_migrated(): + counts = get_node_shard_counts(env, tenant_ids) + elsewhere = sum(v for (k, v) in counts.items() if k != victim.id) + log.info(f"Shards on nodes other than on victim: {elsewhere}") + assert elsewhere == tenant_count * shard_count_per_tenant + + wait_until(30, 1, assert_shards_migrated) + + log.info(f"Deleting pageserver {victim.id}") + env.storage_controller.node_delete(victim.id) + + if not while_offline: + + def assert_victim_evacuated(): + counts = get_node_shard_counts(env, tenant_ids) + count = counts[victim.id] + log.info(f"Shards on node {victim.id}: {count}") + assert count == 0 + + wait_until(30, 1, assert_victim_evacuated) + + # The node should be gone from the list API + assert victim.id not in [n["id"] for n in env.storage_controller.node_list()] + + # No tenants should refer to the node in their intent + for tenant_id in tenant_ids: + describe = env.storage_controller.tenant_describe(tenant_id) + for shard in describe["shards"]: + assert shard["node_attached"] != victim.id + assert victim.id not in shard["node_secondary"] + + # Reconciles running during deletion should all complete + # FIXME: this currently doesn't work because the deletion schedules shards without a proper ScheduleContext, resulting + # in states that background_reconcile wants to optimize, but can't proceed with migrations yet because this is a short3 + # test that hasn't uploaded any heatmaps for secondaries. + # In the interim, just do a reconcile_all to enable the consistency check. + # env.storage_controller.reconcile_until_idle() + env.storage_controller.reconcile_all() + + # Controller should pass its own consistency checks + env.storage_controller.consistency_check() + + # The node should stay gone across a restart + env.storage_controller.stop() + env.storage_controller.start() + assert victim.id not in [n["id"] for n in env.storage_controller.node_list()] + env.storage_controller.reconcile_all() # FIXME: workaround for optimizations happening on startup, see FIXME above. + env.storage_controller.consistency_check() From cd29156927474219b92d2e5d8fda5f045a58d7af Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 11 Jul 2024 19:14:49 +0300 Subject: [PATCH 1174/1571] Fix memory context of NeonWALReader allocation. Allocating it in short living context is wrong because it is reused during backend lifetime. --- pgxn/neon/neon_walreader.c | 9 +++++---- test_runner/regress/test_logical_replication.py | 6 ++++++ 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/pgxn/neon/neon_walreader.c b/pgxn/neon/neon_walreader.c index 60eb8e1fc9..0f76514b86 100644 --- a/pgxn/neon/neon_walreader.c +++ b/pgxn/neon/neon_walreader.c @@ -109,11 +109,12 @@ NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_ { NeonWALReader *reader; + /* + * Note: we allocate in TopMemoryContext, reusing the reader for all process + * reads. 
+ */ reader = (NeonWALReader *) - palloc_extended(sizeof(NeonWALReader), - MCXT_ALLOC_NO_OOM | MCXT_ALLOC_ZERO); - if (!reader) - return NULL; + MemoryContextAllocZero(TopMemoryContext, sizeof(NeonWALReader)); reader->available_lsn = available_lsn; reader->seg.ws_file = -1; diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index 41283e4d2c..66afe9ddfd 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -247,6 +247,12 @@ FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of cur.execute( "SELECT * FROM pg_logical_slot_peek_binary_changes('slotty_mcslotface', NULL, NULL, 'include-xids', '0')" ) + # do the peek second time: we've had a bug using wrong memory context + # for NeonWALReader leading to the crash in this case. + log.info("peek_changes again") + cur.execute( + "SELECT * FROM pg_logical_slot_peek_binary_changes('slotty_mcslotface', NULL, NULL, 'include-xids', '0')" + ) # Tests that walsender correctly blocks until WAL is downloaded from safekeepers From 38b4ed297eeb50ad2d97e58b8328d7f8c46fdb6f Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Thu, 11 Jul 2024 14:28:16 -0400 Subject: [PATCH 1175/1571] feat(pageserver): rewrite streaming vectored read planner (#8242) Rewrite streaming vectored read planner to be a separate struct. The API is designed to produce batches around `max_read_size` instead of exactly less than that so that `handle_XX` returns one batch a time. --------- Signed-off-by: Alex Chi Z --- .../src/tenant/storage_layer/delta_layer.rs | 10 +- .../src/tenant/storage_layer/image_layer.rs | 7 +- pageserver/src/tenant/vectored_blob_io.rs | 273 +++++++++++++----- 3 files changed, 218 insertions(+), 72 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index dfd0196c87..2d36ac7442 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -1321,7 +1321,7 @@ impl DeltaLayerInner { offsets.start.pos(), offsets.end.pos(), meta, - Some(max_read_size), + max_read_size, )) } } else { @@ -1615,13 +1615,17 @@ impl<'a> DeltaLayerIterator<'a> { let lsn = DeltaKey::extract_lsn_from_buf(&raw_key); let blob_ref = BlobRef(value); let offset = blob_ref.pos(); - if let Some(batch_plan) = self.planner.handle(key, lsn, offset, BlobFlag::None) { + if let Some(batch_plan) = self.planner.handle(key, lsn, offset) { break batch_plan; } } else { self.is_end = true; let data_end_offset = self.delta_layer.index_start_offset(); - break self.planner.handle_range_end(data_end_offset); + if let Some(item) = self.planner.handle_range_end(data_end_offset) { + break item; + } else { + return Ok(()); // TODO: test empty iterator + } } }; let vectored_blob_reader = VectoredBlobReader::new(&self.delta_layer.file); diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 1e03e1a58c..1440c0db84 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -994,14 +994,17 @@ impl<'a> ImageLayerIterator<'a> { Key::from_slice(&raw_key[..KEY_SIZE]), self.image_layer.lsn, offset, - BlobFlag::None, ) { break batch_plan; } } else { self.is_end = true; let payload_end = self.image_layer.index_start_blk as u64 * PAGE_SZ as u64; - break self.planner.handle_range_end(payload_end); + if let Some(item) = 
self.planner.handle_range_end(payload_end) { + break item; + } else { + return Ok(()); // TODO: a test case on empty iterator + } } }; let vectored_blob_reader = VectoredBlobReader::new(&self.image_layer.file); diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 7ad8446e04..1b470034db 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -68,7 +68,7 @@ impl VectoredRead { } } -#[derive(Eq, PartialEq)] +#[derive(Eq, PartialEq, Debug)] pub(crate) enum VectoredReadExtended { Yes, No, @@ -91,7 +91,7 @@ impl VectoredReadBuilder { start_offset: u64, end_offset: u64, meta: BlobMeta, - max_read_size: Option, + max_read_size: usize, ) -> Self { let mut blobs_at = VecMap::default(); blobs_at @@ -102,10 +102,9 @@ impl VectoredReadBuilder { start: start_offset, end: end_offset, blobs_at, - max_read_size, + max_read_size: Some(max_read_size), } } - /// Attempt to extend the current read with a new blob if the start /// offset matches with the current end of the vectored read /// and the resuting size is below the max read size @@ -164,7 +163,7 @@ pub struct VectoredReadPlanner { // Arguments for previous blob passed into [`VectoredReadPlanner::handle`] prev: Option<(Key, Lsn, u64, BlobFlag)>, - max_read_size: Option, + max_read_size: usize, } impl VectoredReadPlanner { @@ -172,20 +171,7 @@ impl VectoredReadPlanner { Self { blobs: BTreeMap::new(), prev: None, - max_read_size: Some(max_read_size), - } - } - - /// This function should *only* be used if the caller has a way to control the limit. e.g., in [`StreamingVectoredReadPlanner`], - /// it uses the vectored read planner to avoid duplicated logic on handling blob start/end, while expecting the vectored - /// read planner to give a single read to a continuous range of bytes in the image layer. Therefore, it does not need the - /// code path to split reads into chunks of `max_read_size`, and controls the read size itself. - #[cfg(test)] - pub(crate) fn new_caller_controlled_max_limit() -> Self { - Self { - blobs: BTreeMap::new(), - prev: None, - max_read_size: None, + max_read_size, } } @@ -376,17 +362,18 @@ impl<'a> VectoredBlobReader<'a> { } /// Read planner used in [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`]. It provides a streaming API for -/// getting read blobs. It returns a batch when `handle` gets called and when the current key would exceed the read_size and -/// max_cnt constraints. Underlying it uses [`VectoredReadPlanner`]. +/// getting read blobs. It returns a batch when `handle` gets called and when the current key would just exceed the read_size and +/// max_cnt constraints. #[cfg(test)] pub struct StreamingVectoredReadPlanner { - planner: VectoredReadPlanner, - /// Max read size per batch + read_builder: Option, + // Arguments for previous blob passed into [`StreamingVectoredReadPlanner::handle`] + prev: Option<(Key, Lsn, u64)>, + /// Max read size per batch. This is not a strict limit. If there are [0, 100) and [100, 200), while the `max_read_size` is 150, + /// we will produce a single batch instead of split them. 
max_read_size: u64, /// Max item count per batch max_cnt: usize, - /// The first offset of this batch - this_batch_first_offset: Option, /// Size of the current batch cnt: usize, } @@ -397,62 +384,88 @@ impl StreamingVectoredReadPlanner { assert!(max_cnt > 0); assert!(max_read_size > 0); Self { - // We want to have exactly one read syscall (plus several others for index lookup) for each `next_batch` call. - // Therefore, we enforce `self.max_read_size` by ourselves instead of using the VectoredReadPlanner's capability, - // to avoid splitting into two I/Os. - planner: VectoredReadPlanner::new_caller_controlled_max_limit(), + read_builder: None, + prev: None, max_cnt, max_read_size, - this_batch_first_offset: None, cnt: 0, } } - fn emit(&mut self, this_batch_first_offset: u64) -> VectoredRead { - let planner = std::mem::replace( - &mut self.planner, - VectoredReadPlanner::new_caller_controlled_max_limit(), - ); - self.this_batch_first_offset = Some(this_batch_first_offset); - self.cnt = 1; - let mut batch = planner.finish(); - assert_eq!(batch.len(), 1, "should have exactly one read batch"); - batch.pop().unwrap() + pub fn handle(&mut self, key: Key, lsn: Lsn, offset: u64) -> Option { + // Implementation note: internally lag behind by one blob such that + // we have a start and end offset when initialising [`VectoredRead`] + let (prev_key, prev_lsn, prev_offset) = match self.prev { + None => { + self.prev = Some((key, lsn, offset)); + return None; + } + Some(prev) => prev, + }; + + let res = self.add_blob(prev_key, prev_lsn, prev_offset, offset, false); + + self.prev = Some((key, lsn, offset)); + + res } - pub fn handle( + pub fn handle_range_end(&mut self, offset: u64) -> Option { + let res = if let Some((prev_key, prev_lsn, prev_offset)) = self.prev { + self.add_blob(prev_key, prev_lsn, prev_offset, offset, true) + } else { + None + }; + + self.prev = None; + + res + } + + fn add_blob( &mut self, key: Key, lsn: Lsn, - offset: u64, - flag: BlobFlag, + start_offset: u64, + end_offset: u64, + is_last_blob_in_read: bool, ) -> Option { - if let Some(begin_offset) = self.this_batch_first_offset { - // Each batch will have at least one item b/c `self.this_batch_first_offset` is set - // after one item gets processed - if offset - begin_offset > self.max_read_size { - self.planner.handle_range_end(offset); // End the current batch with the offset - let batch = self.emit(offset); // Produce a batch - self.planner.handle(key, lsn, offset, flag); // Add this key to the next batch - return Some(batch); + match &mut self.read_builder { + Some(read_builder) => { + let extended = read_builder.extend(start_offset, end_offset, BlobMeta { key, lsn }); + assert_eq!(extended, VectoredReadExtended::Yes); } - } else { - self.this_batch_first_offset = Some(offset) - } - if self.cnt >= self.max_cnt { - self.planner.handle_range_end(offset); // End the current batch with the offset - let batch = self.emit(offset); // Produce a batch - self.planner.handle(key, lsn, offset, flag); // Add this key to the next batch - return Some(batch); - } - self.planner.handle(key, lsn, offset, flag); // Add this key to the current batch - self.cnt += 1; - None - } + None => { + self.read_builder = { + let mut blobs_at = VecMap::default(); + blobs_at + .append(start_offset, BlobMeta { key, lsn }) + .expect("First insertion always succeeds"); - pub fn handle_range_end(&mut self, offset: u64) -> VectoredRead { - self.planner.handle_range_end(offset); - self.emit(offset) + Some(VectoredReadBuilder { + start: start_offset, + end: 
end_offset, + blobs_at, + max_read_size: None, + }) + }; + } + } + let read_builder = self.read_builder.as_mut().unwrap(); + self.cnt += 1; + if is_last_blob_in_read + || read_builder.size() >= self.max_read_size as usize + || self.cnt >= self.max_cnt + { + let prev_read_builder = self.read_builder.take(); + self.cnt = 0; + + // `current_read_builder` is None in the first iteration + if let Some(read_builder) = prev_read_builder { + return Some(read_builder.build()); + } + } + None } } @@ -509,8 +522,11 @@ mod tests { planner.handle_range_end(652 * 1024); let reads = planner.finish(); + assert_eq!(reads.len(), 6); + // TODO: could remove zero reads to produce 5 reads here + for (idx, read) in reads.iter().enumerate() { validate_read(read, ranges[idx]); } @@ -548,4 +564,127 @@ mod tests { validate_read(read, ranges[idx]); } } + + #[test] + fn streaming_planner_max_read_size_test() { + let max_read_size = 128 * 1024; + let key = Key::MIN; + let lsn = Lsn(0); + + let blob_descriptions = vec![ + (key, lsn, 0, BlobFlag::None), + (key, lsn, 32 * 1024, BlobFlag::None), + (key, lsn, 96 * 1024, BlobFlag::None), + (key, lsn, 128 * 1024, BlobFlag::None), + (key, lsn, 198 * 1024, BlobFlag::None), + (key, lsn, 268 * 1024, BlobFlag::None), + (key, lsn, 396 * 1024, BlobFlag::None), + (key, lsn, 652 * 1024, BlobFlag::None), + ]; + + let ranges = [ + &blob_descriptions[0..3], + &blob_descriptions[3..5], + &blob_descriptions[5..6], + &blob_descriptions[6..7], + &blob_descriptions[7..], + ]; + + let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1000); + let mut reads = Vec::new(); + for (key, lsn, offset, _) in blob_descriptions.clone() { + reads.extend(planner.handle(key, lsn, offset)); + } + reads.extend(planner.handle_range_end(652 * 1024)); + + assert_eq!(reads.len(), ranges.len()); + + for (idx, read) in reads.iter().enumerate() { + validate_read(read, ranges[idx]); + } + } + + #[test] + fn streaming_planner_max_cnt_test() { + let max_read_size = 1024 * 1024; + let key = Key::MIN; + let lsn = Lsn(0); + + let blob_descriptions = vec![ + (key, lsn, 0, BlobFlag::None), + (key, lsn, 32 * 1024, BlobFlag::None), + (key, lsn, 96 * 1024, BlobFlag::None), + (key, lsn, 128 * 1024, BlobFlag::None), + (key, lsn, 198 * 1024, BlobFlag::None), + (key, lsn, 268 * 1024, BlobFlag::None), + (key, lsn, 396 * 1024, BlobFlag::None), + (key, lsn, 652 * 1024, BlobFlag::None), + ]; + + let ranges = [ + &blob_descriptions[0..2], + &blob_descriptions[2..4], + &blob_descriptions[4..6], + &blob_descriptions[6..], + ]; + + let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 2); + let mut reads = Vec::new(); + for (key, lsn, offset, _) in blob_descriptions.clone() { + reads.extend(planner.handle(key, lsn, offset)); + } + reads.extend(planner.handle_range_end(652 * 1024)); + + assert_eq!(reads.len(), ranges.len()); + + for (idx, read) in reads.iter().enumerate() { + validate_read(read, ranges[idx]); + } + } + + #[test] + fn streaming_planner_edge_test() { + let max_read_size = 1024 * 1024; + let key = Key::MIN; + let lsn = Lsn(0); + { + let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1); + let mut reads = Vec::new(); + reads.extend(planner.handle_range_end(652 * 1024)); + assert!(reads.is_empty()); + } + { + let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1); + let mut reads = Vec::new(); + reads.extend(planner.handle(key, lsn, 0)); + reads.extend(planner.handle_range_end(652 * 1024)); + assert_eq!(reads.len(), 1); + validate_read(&reads[0], &[(key, lsn, 0, 
BlobFlag::None)]); + } + { + let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1); + let mut reads = Vec::new(); + reads.extend(planner.handle(key, lsn, 0)); + reads.extend(planner.handle(key, lsn, 128 * 1024)); + reads.extend(planner.handle_range_end(652 * 1024)); + assert_eq!(reads.len(), 2); + validate_read(&reads[0], &[(key, lsn, 0, BlobFlag::None)]); + validate_read(&reads[1], &[(key, lsn, 128 * 1024, BlobFlag::None)]); + } + { + let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 2); + let mut reads = Vec::new(); + reads.extend(planner.handle(key, lsn, 0)); + reads.extend(planner.handle(key, lsn, 128 * 1024)); + reads.extend(planner.handle_range_end(652 * 1024)); + assert_eq!(reads.len(), 1); + validate_read( + &reads[0], + &[ + (key, lsn, 0, BlobFlag::None), + (key, lsn, 128 * 1024, BlobFlag::None), + ], + ); + } + } } From 4a87bac036f7d21545183dd1894df00e960179ad Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 11 Jul 2024 22:03:35 +0300 Subject: [PATCH 1176/1571] test: limit `test_layer_download_timeouted` to MOCK_S3 (#8331) Requests against REAL_S3 on CI can consistently take longer than 1s; testing the short timeouts against it made no sense in hindsight, as MOCK_S3 works just as well. evidence: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8229/9857994025/index.html#suites/b97efae3a617afb71cb8142f5afa5224/6828a50921660a32 --- test_runner/regress/test_ondemand_download.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 4a25dfd874..c8249bb2ce 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -764,7 +764,9 @@ def test_layer_download_timeouted(neon_env_builder: NeonEnvBuilder): """ Pause using a pausable_failpoint longer than the client timeout to simulate the timeout happening. 
""" - neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + # running this test is not reliable against REAL_S3, because operations can + # take longer than 1s we want to use as a timeout + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3) assert isinstance(neon_env_builder.pageserver_remote_storage, S3Storage) neon_env_builder.pageserver_remote_storage.custom_timeout = "1s" From 82b9a44ab48a1658fce7942e60dc61f68bd29945 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Thu, 11 Jul 2024 13:29:35 -0700 Subject: [PATCH 1177/1571] Grant execute on snapshot functions to neon_superuser (#8346) ## Problem I need `neon_superuser` to be allowed to create snapshots for replication tests ## Summary of changes Adds a migration that grants these functions to neon_superuser --- ...nt_snapshot_synchronization_funcs_to_neon_superuser.sql | 7 +++++++ compute_tools/src/spec.rs | 3 +++ test_runner/regress/test_migrations.py | 2 +- 3 files changed, 11 insertions(+), 1 deletion(-) create mode 100644 compute_tools/src/migrations/0009-grant_snapshot_synchronization_funcs_to_neon_superuser.sql diff --git a/compute_tools/src/migrations/0009-grant_snapshot_synchronization_funcs_to_neon_superuser.sql b/compute_tools/src/migrations/0009-grant_snapshot_synchronization_funcs_to_neon_superuser.sql new file mode 100644 index 0000000000..28750e00dd --- /dev/null +++ b/compute_tools/src/migrations/0009-grant_snapshot_synchronization_funcs_to_neon_superuser.sql @@ -0,0 +1,7 @@ +DO $$ +BEGIN + IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN + EXECUTE 'GRANT EXECUTE ON FUNCTION pg_export_snapshot TO neon_superuser'; + EXECUTE 'GRANT EXECUTE ON FUNCTION pg_log_standby_snapshot TO neon_superuser'; + END IF; +END $$; diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 37090b08fd..1d12b88c7c 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -790,6 +790,9 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> { "./migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql" ), include_str!("./migrations/0008-revoke_replication_for_previously_allowed_roles.sql"), + include_str!( + "./migrations/0009-grant_snapshot_synchronization_funcs_to_neon_superuser.sql" + ), ]; MigrationRunner::new(client, &migrations).run_migrations()?; diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py index 5637f160cf..91bd3ea50c 100644 --- a/test_runner/regress/test_migrations.py +++ b/test_runner/regress/test_migrations.py @@ -13,7 +13,7 @@ def test_migrations(neon_simple_env: NeonEnv): endpoint.wait_for_migrations() - num_migrations = 9 + num_migrations = 10 with endpoint.cursor() as cur: cur.execute("SELECT id FROM neon_migration.migration_id") From 30bbfde50d2e2e224cb8e4d9c0113b000111657b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 12 Jul 2024 01:43:44 +0200 Subject: [PATCH 1178/1571] Pass configured compression param to image generation (#8363) We need to pass on the configured compression param during image layer generation. This was an oversight of #8106, and the likely cause why #8288 didn't bring any interesting regressions. 
Part of https://github.com/neondatabase/neon/issues/5431 --- pageserver/src/tenant/storage_layer/image_layer.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 1440c0db84..a88a1e6429 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -809,7 +809,11 @@ impl ImageLayerWriterInner { ctx: &RequestContext, ) -> anyhow::Result<()> { ensure!(self.key_range.contains(&key)); - let (_img, res) = self.blob_writer.write_blob(img, ctx).await; + let compression = self.conf.image_compression; + let (_img, res) = self + .blob_writer + .write_blob_maybe_compressed(img, ctx, compression) + .await; // TODO: re-use the buffer for `img` further upstack let off = res?; From 2e37aa3fe80bc9b60f90e12365da348ed108e4fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 12 Jul 2024 04:32:34 +0200 Subject: [PATCH 1179/1571] Implement decompression for vectored reads (#8302) Implement decompression of images for vectored reads. This doesn't implement support for still treating blobs as uncompressed with the bits we reserved for compression, as we have removed that functionality in #8300 anyways. Part of #5431 --- pageserver/src/tenant/blob_io.rs | 40 ++++--- pageserver/src/tenant/vectored_blob_io.rs | 127 +++++++++++++++++++--- 2 files changed, 139 insertions(+), 28 deletions(-) diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index e98ed66ef9..791eefebe9 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -137,14 +137,14 @@ impl<'a> BlockCursor<'a> { } /// Reserved bits for length and compression -const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0; +pub(super) const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0; /// The maximum size of blobs we support. The highest few bits /// are reserved for compression and other further uses. const MAX_SUPPORTED_LEN: usize = 0x0fff_ffff; -const BYTE_UNCOMPRESSED: u8 = 0x80; -const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10; +pub(super) const BYTE_UNCOMPRESSED: u8 = 0x80; +pub(super) const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10; /// A wrapper of `VirtualFile` that allows users to write blobs. 
/// @@ -390,51 +390,63 @@ impl BlobWriter { } #[cfg(test)] -mod tests { +pub(crate) mod tests { use super::*; use crate::{context::DownloadBehavior, task_mgr::TaskKind, tenant::block_io::BlockReaderRef}; + use camino::Utf8PathBuf; + use camino_tempfile::Utf8TempDir; use rand::{Rng, SeedableRng}; async fn round_trip_test(blobs: &[Vec]) -> Result<(), Error> { round_trip_test_compressed::(blobs, false).await } - async fn round_trip_test_compressed( + pub(crate) async fn write_maybe_compressed( blobs: &[Vec], compression: bool, - ) -> Result<(), Error> { + ctx: &RequestContext, + ) -> Result<(Utf8TempDir, Utf8PathBuf, Vec), Error> { let temp_dir = camino_tempfile::tempdir()?; let pathbuf = temp_dir.path().join("file"); - let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); // Write part (in block to drop the file) let mut offsets = Vec::new(); { - let file = VirtualFile::create(pathbuf.as_path(), &ctx).await?; + let file = VirtualFile::create(pathbuf.as_path(), ctx).await?; let mut wtr = BlobWriter::::new(file, 0); for blob in blobs.iter() { let (_, res) = if compression { wtr.write_blob_maybe_compressed( blob.clone(), - &ctx, + ctx, ImageCompressionAlgorithm::Zstd { level: Some(1) }, ) .await } else { - wtr.write_blob(blob.clone(), &ctx).await + wtr.write_blob(blob.clone(), ctx).await }; let offs = res?; offsets.push(offs); } // Write out one page worth of zeros so that we can // read again with read_blk - let (_, res) = wtr.write_blob(vec![0; PAGE_SZ], &ctx).await; + let (_, res) = wtr.write_blob(vec![0; PAGE_SZ], ctx).await; let offs = res?; println!("Writing final blob at offs={offs}"); - wtr.flush_buffer(&ctx).await?; + wtr.flush_buffer(ctx).await?; } + Ok((temp_dir, pathbuf, offsets)) + } - let file = VirtualFile::open(pathbuf.as_path(), &ctx).await?; + async fn round_trip_test_compressed( + blobs: &[Vec], + compression: bool, + ) -> Result<(), Error> { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + let (_temp_dir, pathbuf, offsets) = + write_maybe_compressed::(blobs, compression, &ctx).await?; + + let file = VirtualFile::open(pathbuf, &ctx).await?; let rdr = BlockReaderRef::VirtualFile(&file); let rdr = BlockCursor::new_with_compression(rdr, compression); for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() { @@ -447,7 +459,7 @@ mod tests { Ok(()) } - fn random_array(len: usize) -> Vec { + pub(crate) fn random_array(len: usize) -> Vec { let mut rng = rand::thread_rng(); (0..len).map(|_| rng.gen()).collect::<_>() } diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 1b470034db..cb81f1d76d 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -20,11 +20,13 @@ use std::num::NonZeroUsize; use bytes::BytesMut; use pageserver_api::key::Key; +use tokio::io::AsyncWriteExt; use tokio_epoll_uring::BoundedBuf; use utils::lsn::Lsn; use utils::vec_map::VecMap; use crate::context::RequestContext; +use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, LEN_COMPRESSION_BIT_MASK}; use crate::virtual_file::VirtualFile; #[derive(Copy, Clone, Debug, PartialEq, Eq)] @@ -301,7 +303,7 @@ impl<'a> VectoredBlobReader<'a> { read.size(), buf.capacity() ); - let buf = self + let mut buf = self .file .read_exact_at(buf.slice(0..read.size()), read.start, ctx) .await? 
@@ -323,38 +325,68 @@ impl<'a> VectoredBlobReader<'a> { .chain(std::iter::once(None)), ); + // Some scratch space, put here for reusing the allocation + let mut decompressed_vec = Vec::new(); + for ((offset, meta), next) in pairs { let offset_in_buf = offset - start_offset; let first_len_byte = buf[offset_in_buf as usize]; - // Each blob is prefixed by a header containing it's size. + // Each blob is prefixed by a header containing its size and compression information. // Extract the size and skip that header to find the start of the data. // The size can be 1 or 4 bytes. The most significant bit is 0 in the // 1 byte case and 1 in the 4 byte case. - let (size_length, blob_size) = if first_len_byte < 0x80 { - (1, first_len_byte as u64) + let (size_length, blob_size, compression_bits) = if first_len_byte < 0x80 { + (1, first_len_byte as u64, BYTE_UNCOMPRESSED) } else { let mut blob_size_buf = [0u8; 4]; let offset_in_buf = offset_in_buf as usize; blob_size_buf.copy_from_slice(&buf[offset_in_buf..offset_in_buf + 4]); - blob_size_buf[0] &= 0x7f; - (4, u32::from_be_bytes(blob_size_buf) as u64) + blob_size_buf[0] &= !LEN_COMPRESSION_BIT_MASK; + + let compression_bits = first_len_byte & LEN_COMPRESSION_BIT_MASK; + ( + 4, + u32::from_be_bytes(blob_size_buf) as u64, + compression_bits, + ) }; - let start = offset_in_buf + size_length; - let end = match next { + let start_raw = offset_in_buf + size_length; + let end_raw = match next { Some((next_blob_start_offset, _)) => next_blob_start_offset - start_offset, - None => start + blob_size, + None => start_raw + blob_size, }; - - assert_eq!(end - start, blob_size); + assert_eq!(end_raw - start_raw, blob_size); + let (start, end); + if compression_bits == BYTE_UNCOMPRESSED { + start = start_raw as usize; + end = end_raw as usize; + } else if compression_bits == BYTE_ZSTD { + let mut decoder = + async_compression::tokio::write::ZstdDecoder::new(&mut decompressed_vec); + decoder + .write_all(&buf[start_raw as usize..end_raw as usize]) + .await?; + decoder.flush().await?; + start = buf.len(); + buf.extend_from_slice(&decompressed_vec); + end = buf.len(); + decompressed_vec.clear(); + } else { + let error = std::io::Error::new( + std::io::ErrorKind::InvalidData, + format!("invalid compression byte {compression_bits:x}"), + ); + return Err(error); + } metas.push(VectoredBlob { - start: start as usize, - end: end as usize, + start, + end, meta: *meta, - }) + }); } Ok(VectoredBlobsBuf { buf, blobs: metas }) @@ -471,6 +503,13 @@ impl StreamingVectoredReadPlanner { #[cfg(test)] mod tests { + use anyhow::Error; + + use crate::context::DownloadBehavior; + use crate::page_cache::PAGE_SZ; + use crate::task_mgr::TaskKind; + + use super::super::blob_io::tests::{random_array, write_maybe_compressed}; use super::*; fn validate_read(read: &VectoredRead, offset_range: &[(Key, Lsn, u64, BlobFlag)]) { @@ -687,4 +726,64 @@ mod tests { ); } } + + async fn round_trip_test_compressed(blobs: &[Vec], compression: bool) -> Result<(), Error> { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + let (_temp_dir, pathbuf, offsets) = + write_maybe_compressed::(blobs, compression, &ctx).await?; + + let file = VirtualFile::open(&pathbuf, &ctx).await?; + let file_len = std::fs::metadata(&pathbuf)?.len(); + + // Multiply by two (compressed data might need more space), and add a few bytes for the header + let reserved_bytes = blobs.iter().map(|bl| bl.len()).max().unwrap() * 2 + 16; + let mut buf = BytesMut::with_capacity(reserved_bytes); + + let 
vectored_blob_reader = VectoredBlobReader::new(&file); + let meta = BlobMeta { + key: Key::MIN, + lsn: Lsn(0), + }; + + for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() { + let end = offsets.get(idx + 1).unwrap_or(&file_len); + if idx + 1 == offsets.len() { + continue; + } + let read_builder = VectoredReadBuilder::new(*offset, *end, meta, 16 * 4096); + let read = read_builder.build(); + let result = vectored_blob_reader.read_blobs(&read, buf, &ctx).await?; + assert_eq!(result.blobs.len(), 1); + let read_blob = &result.blobs[0]; + let read_buf = &result.buf[read_blob.start..read_blob.end]; + assert_eq!(blob, read_buf, "mismatch for idx={idx} at offset={offset}"); + buf = result.buf; + } + Ok(()) + } + + #[tokio::test] + async fn test_really_big_array() -> Result<(), Error> { + let blobs = &[ + b"test".to_vec(), + random_array(10 * PAGE_SZ), + b"hello".to_vec(), + random_array(66 * PAGE_SZ), + vec![0xf3; 24 * PAGE_SZ], + b"foobar".to_vec(), + ]; + round_trip_test_compressed(blobs, false).await?; + round_trip_test_compressed(blobs, true).await?; + Ok(()) + } + + #[tokio::test] + async fn test_arrays_inc() -> Result<(), Error> { + let blobs = (0..PAGE_SZ / 8) + .map(|v| random_array(v * 16)) + .collect::>(); + round_trip_test_compressed(&blobs, false).await?; + round_trip_test_compressed(&blobs, true).await?; + Ok(()) + } } From 86d6ef305a6cfe5ab91febb876f6a9bae0dd982f Mon Sep 17 00:00:00 2001 From: Japin Li Date: Fri, 12 Jul 2024 17:56:06 +0800 Subject: [PATCH 1180/1571] Remove fs2 dependency (#8350) The fs2 dependency is not needed anymore after commit d42700280. --- Cargo.lock | 11 ----------- Cargo.toml | 1 - safekeeper/Cargo.toml | 1 - 3 files changed, 13 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b31ac69e6c..bab0b4dd1f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2028,16 +2028,6 @@ dependencies = [ "tokio-util", ] -[[package]] -name = "fs2" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213" -dependencies = [ - "libc", - "winapi", -] - [[package]] name = "fsevent-sys" version = "4.1.0" @@ -5179,7 +5169,6 @@ dependencies = [ "crc32c", "desim", "fail", - "fs2", "futures", "git-version", "hex", diff --git a/Cargo.toml b/Cargo.toml index 6bad8e3b20..670e3241d5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -83,7 +83,6 @@ enumset = "1.0.12" fail = "0.5.0" fallible-iterator = "0.2" framed-websockets = { version = "0.1.0", git = "https://github.com/neondatabase/framed-websockets" } -fs2 = "0.4.3" futures = "0.3" futures-core = "0.3" futures-util = "0.3" diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 9f32016fd9..0fdb3147bf 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -23,7 +23,6 @@ clap = { workspace = true, features = ["derive"] } const_format.workspace = true crc32c.workspace = true fail.workspace = true -fs2.workspace = true git-version.workspace = true hex.workspace = true humantime.workspace = true From 0645ae318e49115055b5903791dcd9294ce67521 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 12 Jul 2024 12:04:02 +0100 Subject: [PATCH 1181/1571] pageserver: circuit breaker on compaction (#8359) ## Problem We already back off on compaction retries, but the impact of a failing compaction can be so great that backing off up to 300s isn't enough. The impact is consuming a lot of I/O+CPU in the case of image layer generation for large tenants, and potentially also leaking disk space. 
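In caller terms, the mitigation added further down in this patch looks roughly like the sketch below: compaction consults a breaker before doing any work, reports failures into it, and resets it on success. This is a deliberately stripped-down illustration; the real `CircuitBreaker` added in `libs/utils` also carries a name, an optional reset period, and Prometheus counters.

```rust
// Minimal caller-side illustration; not the actual type added by this patch.
struct Breaker {
    fail_count: usize,
    fail_threshold: usize,
    broken: bool,
}

impl Breaker {
    fn is_broken(&self) -> bool {
        self.broken
    }
    fn fail(&mut self) {
        self.fail_count += 1;
        if self.fail_count >= self.fail_threshold {
            self.broken = true; // stop retrying the expensive operation
        }
    }
    fn success(&mut self) {
        self.fail_count = 0;
        self.broken = false;
    }
}

fn compaction_iteration(breaker: &mut Breaker) {
    // Check before doing any I/O work, mirroring the tenant.rs hunk below.
    if breaker.is_broken() {
        return; // skip; alerting happens via the "broken" metric instead of log spam
    }
    match try_compact() {
        Ok(()) => breaker.success(),
        Err(_e) => breaker.fail(),
    }
}

fn try_compact() -> Result<(), std::io::Error> {
    Ok(()) // stand-in for the real compaction pass
}
```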
Compaction failures are extremely rare and almost always indicate a bug, frequently a bug that will not let compaction to proceed until it is fixed. Related: https://github.com/neondatabase/neon/issues/6738 ## Summary of changes - Introduce a CircuitBreaker type - Add a circuit breaker for compaction, with a policy that after 5 failures, compaction will not be attempted again for 24 hours. - Add metrics that we can alert on: any >0 value for `pageserver_circuit_breaker_broken_total` should generate an alert. - Add a test that checks this works as intended. Couple notes to reviewers: - Circuit breakers are intrinsically a defense-in-depth measure: this is not the solution to any underlying issues, it is just a general mitigation for "unknown unknowns" that might be encountered in future. - This PR isn't primarily about writing a perfect CircuitBreaker type: the one in this PR is meant to be just enough to mitigate issues in compaction, and make it easy to monitor/alert on these failures. We can refine this type in future as/when we want to use it elsewhere. --- libs/utils/src/circuit_breaker.rs | 114 +++++++++++++++++++++++++ libs/utils/src/lib.rs | 2 + pageserver/src/metrics.rs | 16 ++++ pageserver/src/tenant.rs | 36 +++++++- test_runner/regress/test_compaction.py | 63 ++++++++++++++ 5 files changed, 229 insertions(+), 2 deletions(-) create mode 100644 libs/utils/src/circuit_breaker.rs diff --git a/libs/utils/src/circuit_breaker.rs b/libs/utils/src/circuit_breaker.rs new file mode 100644 index 0000000000..720ea39d4f --- /dev/null +++ b/libs/utils/src/circuit_breaker.rs @@ -0,0 +1,114 @@ +use std::{ + fmt::Display, + time::{Duration, Instant}, +}; + +use metrics::IntCounter; + +/// Circuit breakers are for operations that are expensive and fallible: if they fail repeatedly, +/// we will stop attempting them for some period of time, to avoid denial-of-service from retries, and +/// to mitigate the log spam from repeated failures. +pub struct CircuitBreaker { + /// An identifier that enables us to log useful errors when a circuit is broken + name: String, + + /// Consecutive failures since last success + fail_count: usize, + + /// How many consecutive failures before we break the circuit + fail_threshold: usize, + + /// If circuit is broken, when was it broken? + broken_at: Option, + + /// If set, we will auto-reset the circuit this long after it was broken. If None, broken + /// circuits stay broken forever, or until success() is called. + reset_period: Option, + + /// If this is true, no actual circuit-breaking happens. This is for overriding a circuit breaker + /// to permit something to keep running even if it would otherwise have tripped it. + short_circuit: bool, +} + +impl CircuitBreaker { + pub fn new(name: String, fail_threshold: usize, reset_period: Option) -> Self { + Self { + name, + fail_count: 0, + fail_threshold, + broken_at: None, + reset_period, + short_circuit: false, + } + } + + /// Construct an unbreakable circuit breaker, for use in unit tests etc. 
+ pub fn short_circuit() -> Self { + Self { + name: String::new(), + fail_threshold: 0, + fail_count: 0, + broken_at: None, + reset_period: None, + short_circuit: true, + } + } + + pub fn fail(&mut self, metric: &IntCounter, error: E) + where + E: Display, + { + if self.short_circuit { + return; + } + + self.fail_count += 1; + if self.broken_at.is_none() && self.fail_count >= self.fail_threshold { + self.break_circuit(metric, error); + } + } + + /// Call this after successfully executing an operation + pub fn success(&mut self, metric: &IntCounter) { + self.fail_count = 0; + if let Some(broken_at) = &self.broken_at { + tracing::info!(breaker=%self.name, "Circuit breaker failure ended (was broken for {})", + humantime::format_duration(broken_at.elapsed())); + self.broken_at = None; + metric.inc(); + } + } + + /// Call this before attempting an operation, and skip the operation if we are currently broken. + pub fn is_broken(&mut self) -> bool { + if self.short_circuit { + return false; + } + + if let Some(broken_at) = self.broken_at { + match self.reset_period { + Some(reset_period) if broken_at.elapsed() > reset_period => { + self.reset_circuit(); + false + } + _ => true, + } + } else { + false + } + } + + fn break_circuit(&mut self, metric: &IntCounter, error: E) + where + E: Display, + { + self.broken_at = Some(Instant::now()); + tracing::error!(breaker=%self.name, "Circuit breaker broken! Last error: {error}"); + metric.inc(); + } + + fn reset_circuit(&mut self) { + self.broken_at = None; + self.fail_count = 0; + } +} diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 711e617801..9ad1752fb7 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -98,6 +98,8 @@ pub mod poison; pub mod toml_edit_ext; +pub mod circuit_breaker; + /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages /// /// we have several cases: diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index e67fa656d0..9b3bb481b9 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -569,6 +569,22 @@ static VALID_LSN_LEASE_COUNT: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +pub(crate) static CIRCUIT_BREAKERS_BROKEN: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_circuit_breaker_broken", + "How many times a circuit breaker has broken" + ) + .expect("failed to define a metric") +}); + +pub(crate) static CIRCUIT_BREAKERS_UNBROKEN: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_circuit_breaker_unbroken", + "How many times a circuit breaker has been un-broken (recovered)" + ) + .expect("failed to define a metric") +}); + pub(crate) mod initial_logical_size { use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec}; use once_cell::sync::Lazy; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index bf23513527..6333fd3b63 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -39,6 +39,7 @@ use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::*; use utils::backoff; +use utils::circuit_breaker::CircuitBreaker; use utils::completion; use utils::crashsafe::path_with_suffix_extension; use utils::failpoint_support; @@ -76,7 +77,8 @@ use crate::is_uninit_mark; use crate::l0_flush::L0FlushGlobalState; use crate::metrics::TENANT; use crate::metrics::{ - remove_tenant_metrics, BROKEN_TENANTS_SET, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, + remove_tenant_metrics, BROKEN_TENANTS_SET, 
CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, + TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, }; use crate::repository::GcResult; use crate::task_mgr; @@ -276,6 +278,10 @@ pub struct Tenant { eviction_task_tenant_state: tokio::sync::Mutex, + /// Track repeated failures to compact, so that we can back off. + /// Overhead of mutex is acceptable because compaction is done with a multi-second period. + compaction_circuit_breaker: std::sync::Mutex, + /// If the tenant is in Activating state, notify this to encourage it /// to proceed to Active as soon as possible, rather than waiting for lazy /// background warmup. @@ -1641,13 +1647,31 @@ impl Tenant { timelines_to_compact }; + // Before doing any I/O work, check our circuit breaker + if self.compaction_circuit_breaker.lock().unwrap().is_broken() { + info!("Skipping compaction due to previous failures"); + return Ok(()); + } + for (timeline_id, timeline) in &timelines_to_compact { timeline .compact(cancel, EnumSet::empty(), ctx) .instrument(info_span!("compact_timeline", %timeline_id)) - .await?; + .await + .map_err(|e| { + self.compaction_circuit_breaker + .lock() + .unwrap() + .fail(&CIRCUIT_BREAKERS_BROKEN, &e); + e + })?; } + self.compaction_circuit_breaker + .lock() + .unwrap() + .success(&CIRCUIT_BREAKERS_UNBROKEN); + Ok(()) } @@ -2563,6 +2587,14 @@ impl Tenant { cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()), cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)), eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()), + compaction_circuit_breaker: std::sync::Mutex::new(CircuitBreaker::new( + format!("compaction-{tenant_shard_id}"), + 5, + // Compaction can be a very expensive operation, and might leak disk space. It also ought + // to be infallible, as long as remote storage is available. So if it repeatedly fails, + // use an extremely long backoff. 
+ Some(Duration::from_secs(3600 * 24)), + )), activate_now_sem: tokio::sync::Semaphore::new(0), cancel: CancellationToken::default(), gate: Gate::default(), diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 49dcb9b86a..f321c09b27 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -1,12 +1,14 @@ import enum import json import os +import time from typing import Optional import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, generate_uploads_and_deletions from fixtures.pageserver.http import PageserverApiException +from fixtures.utils import wait_until from fixtures.workload import Workload AGGRESIVE_COMPACTION_TENANT_CONF = { @@ -257,3 +259,64 @@ def test_uploads_and_deletions( found_allowed_error = any(env.pageserver.log_contains(e) for e in allowed_errors) if not found_allowed_error: raise Exception("None of the allowed_errors occured in the log") + + +def test_pageserver_compaction_circuit_breaker(neon_env_builder: NeonEnvBuilder): + """ + Check that repeated failures in compaction result in a circuit breaker breaking + """ + TENANT_CONF = { + # Very frequent runs to rack up failures quickly + "compaction_period": "100ms", + # Small checkpoint distance to create many layers + "checkpoint_distance": 1024 * 128, + # Compact small layers + "compaction_target_size": 1024 * 128, + "image_creation_threshold": 1, + } + + FAILPOINT = "delta-layer-writer-fail-before-finish" + BROKEN_LOG = ".*Circuit breaker broken!.*" + + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) + + workload = Workload(env, env.initial_tenant, env.initial_timeline) + workload.init() + + # Set a failpoint that will prevent compaction succeeding + env.pageserver.http_client().configure_failpoints((FAILPOINT, "return")) + + # Write some data to trigger compaction + workload.write_rows(1024, upload=False) + workload.write_rows(1024, upload=False) + workload.write_rows(1024, upload=False) + + def assert_broken(): + env.pageserver.assert_log_contains(BROKEN_LOG) + assert ( + env.pageserver.http_client().get_metric_value("pageserver_circuit_breaker_broken_total") + or 0 + ) == 1 + assert ( + env.pageserver.http_client().get_metric_value( + "pageserver_circuit_breaker_unbroken_total" + ) + or 0 + ) == 0 + + # Wait for enough failures to break the circuit breaker + # This wait is fairly long because we back off on compaction failures, so 5 retries takes ~30s + wait_until(60, 1, assert_broken) + + # Sleep for a while, during which time we expect that compaction will _not_ be retried + time.sleep(10) + + assert ( + env.pageserver.http_client().get_metric_value("pageserver_circuit_breaker_broken_total") + or 0 + ) == 1 + assert ( + env.pageserver.http_client().get_metric_value("pageserver_circuit_breaker_unbroken_total") + or 0 + ) == 0 + assert not env.pageserver.log_contains(".*Circuit breaker failure ended.*") From 411a130675363bd2e06be937926803390d748319 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 12 Jul 2024 13:58:04 +0100 Subject: [PATCH 1182/1571] Fix nightly warnings 2024 june (#8151) ## Problem new clippy warnings on nightly. ## Summary of changes broken up each commit by warning type. 1. Remove some unnecessary refs. 2. In edition 2024, inference will default to `!` and not `()`. 3. Clippy complains about doc comment indentation 4. Fix `Trait + ?Sized` where `Trait: Sized`. 5. 
diesel_derives triggering `non_local_defintions` --- compute_tools/src/bin/compute_ctl.rs | 3 +-- compute_tools/src/compute.rs | 1 + control_plane/storcon_cli/src/main.rs | 2 +- libs/utils/src/http/endpoint.rs | 14 +++++------ pageserver/compaction/src/interface.rs | 2 +- pageserver/src/context.rs | 1 + pageserver/src/pgdatadir_mapping.rs | 2 +- pageserver/src/tenant/disk_btree.rs | 6 ++--- .../src/tenant/storage_layer/layer_desc.rs | 2 +- pageserver/src/tenant/timeline.rs | 11 +++++---- pageserver/src/tenant/timeline/delete.rs | 6 +++-- .../src/tenant/timeline/logical_size.rs | 24 +++++++++---------- pageserver/src/tenant/timeline/walreceiver.rs | 10 ++++---- pageserver/src/tenant/vectored_blob_io.rs | 4 ++-- .../virtual_file/owned_buffers_io/write.rs | 1 + proxy/src/compute.rs | 2 +- proxy/src/redis/cancellation_publisher.rs | 2 +- .../connection_with_credentials_provider.rs | 2 +- proxy/src/redis/notifications.rs | 2 +- proxy/src/serverless/backend.rs | 2 +- proxy/src/serverless/conn_pool.rs | 2 +- proxy/src/waiters.rs | 2 +- safekeeper/src/wal_backup.rs | 1 + storage_controller/src/persistence.rs | 1 + storage_controller/src/service.rs | 14 +++++------ storage_controller/src/tenant_shard.rs | 1 + 26 files changed, 64 insertions(+), 56 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index f4c396a85d..0ba2c1aeb4 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -6,7 +6,7 @@ //! - Every start is a fresh start, so the data directory is removed and //! initialized again on each run. //! - If remote_extension_config is provided, it will be used to fetch extensions list -//! and download `shared_preload_libraries` from the remote storage. +//! and download `shared_preload_libraries` from the remote storage. //! - Next it will put configuration files into the `PGDATA` directory. //! - Sync safekeepers and get commit LSN. //! - Get `basebackup` from pageserver using the returned on the previous step LSN. @@ -33,7 +33,6 @@ //! -b /usr/local/bin/postgres \ //! -r http://pg-ext-s3-gateway \ //! ``` -//! 
use std::collections::HashMap; use std::fs::File; use std::path::Path; diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 1112795d30..91855d954d 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -56,6 +56,7 @@ pub struct ComputeNode { /// - we push new spec and it does reconfiguration /// - but then something happens and compute pod / VM is destroyed, /// so k8s controller starts it again with the **old** spec + /// /// and the same for empty computes: /// - we started compute without any spec /// - we push spec and it does configuration diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 815f5c940f..777a717a73 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -341,7 +341,7 @@ async fn main() -> anyhow::Result<()> { } Command::TenantCreate { tenant_id } => { storcon_client - .dispatch( + .dispatch::<_, ()>( Method::POST, "v1/tenant".to_string(), Some(TenantCreateRequest { diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index f8a5f68131..8ee5abd434 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -52,17 +52,17 @@ struct RequestId(String); /// There could be other ways to implement similar functionality: /// /// * procmacros placed on top of all handler methods -/// With all the drawbacks of procmacros, brings no difference implementation-wise, -/// and little code reduction compared to the existing approach. +/// With all the drawbacks of procmacros, brings no difference implementation-wise, +/// and little code reduction compared to the existing approach. /// /// * Another `TraitExt` with e.g. the `get_with_span`, `post_with_span` methods to do similar logic, -/// implemented for [`RouterBuilder`]. -/// Could be simpler, but we don't want to depend on [`routerify`] more, targeting to use other library later. +/// implemented for [`RouterBuilder`]. +/// Could be simpler, but we don't want to depend on [`routerify`] more, targeting to use other library later. /// /// * In theory, a span guard could've been created in a pre-request middleware and placed into a global collection, to be dropped -/// later, in a post-response middleware. -/// Due to suspendable nature of the futures, would give contradictive results which is exactly the opposite of what `tracing-futures` -/// tries to achive with its `.instrument` used in the current approach. +/// later, in a post-response middleware. +/// Due to suspendable nature of the futures, would give contradictive results which is exactly the opposite of what `tracing-futures` +/// tries to achive with its `.instrument` used in the current approach. /// /// If needed, a declarative macro to substitute the |r| ... closure boilerplate could be introduced. pub async fn request_span(request: Request, handler: H) -> R::Output diff --git a/pageserver/compaction/src/interface.rs b/pageserver/compaction/src/interface.rs index 35519b5d0a..5bc9b5ca1d 100644 --- a/pageserver/compaction/src/interface.rs +++ b/pageserver/compaction/src/interface.rs @@ -131,7 +131,7 @@ impl CompactionKey for Key { pub type CompactionKeySpace = Vec>; /// Functions needed from all layers. 
-pub trait CompactionLayer { +pub trait CompactionLayer { fn key_range(&self) -> &Range; fn lsn_range(&self) -> &Range; diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs index 86d0390c30..0b07e07524 100644 --- a/pageserver/src/context.rs +++ b/pageserver/src/context.rs @@ -59,6 +59,7 @@ //! 1. It should be easy to forward the context to callees. //! 2. To propagate more data from high-level to low-level code, the functions in //! the middle should not need to be modified. +//! //! The solution is to have a container structure ([`RequestContext`]) that //! carries the information. Functions that don't care about what's in it //! pass it along to callees. diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 8a6cfea92b..a821b824d0 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -522,7 +522,7 @@ impl Timeline { ctx: &RequestContext, ) -> Result, PageReconstructError> { let mut max: Option = None; - self.map_all_timestamps(probe_lsn, ctx, |timestamp| { + self.map_all_timestamps::<()>(probe_lsn, ctx, |timestamp| { if let Some(max_prev) = max { max = Some(max_prev.max(timestamp)); } else { diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index b76498b608..251d2ab4ad 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -550,10 +550,10 @@ where /// We maintain the length of the stack to be always greater than zero. /// Two exceptions are: /// 1. `Self::flush_node`. The method will push the new node if it extracted the last one. - /// So because other methods cannot see the intermediate state invariant still holds. + /// So because other methods cannot see the intermediate state invariant still holds. /// 2. `Self::finish`. It consumes self and does not return it back, - /// which means that this is where the structure is destroyed. - /// Thus stack of zero length cannot be observed by other methods. + /// which means that this is where the structure is destroyed. + /// Thus stack of zero length cannot be observed by other methods. stack: Vec>, /// Last key that was appended to the tree. Used to sanity check that append diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs index a89b66e4a1..bd765560e4 100644 --- a/pageserver/src/tenant/storage_layer/layer_desc.rs +++ b/pageserver/src/tenant/storage_layer/layer_desc.rs @@ -25,7 +25,7 @@ pub struct PersistentLayerDesc { /// /// - For an open in-memory layer, the end bound is MAX_LSN /// - For a frozen in-memory layer or a delta layer, the end bound is a valid lsn after the - /// range start + /// range start /// - An image layer represents snapshot at one LSN, so end_lsn is always the snapshot LSN + 1 pub lsn_range: Range, /// Whether this is a delta layer, and also, is this incremental. diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 762e903bf8..a3ddb3a1d1 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3408,6 +3408,7 @@ impl Timeline { } } + #[allow(clippy::doc_lazy_continuation)] /// Get the data needed to reconstruct all keys in the provided keyspace /// /// The algorithm is as follows: @@ -4474,10 +4475,10 @@ impl Timeline { /// are required. Since checking if new image layers are required is expensive in /// terms of CPU, we only do it in the following cases: /// 1. 
If the timeline has ingested sufficient WAL to justify the cost - /// 2. If enough time has passed since the last check - /// 2.1. For large tenants, we wish to perform the check more often since they - /// suffer from the lack of image layers - /// 2.2. For small tenants (that can mostly fit in RAM), we use a much longer interval + /// 2. If enough time has passed since the last check: + /// 1. For large tenants, we wish to perform the check more often since they + /// suffer from the lack of image layers + /// 2. For small tenants (that can mostly fit in RAM), we use a much longer interval fn should_check_if_image_layers_required(self: &Arc, lsn: Lsn) -> bool { const LARGE_TENANT_THRESHOLD: u64 = 2 * 1024 * 1024 * 1024; @@ -4719,7 +4720,7 @@ impl Timeline { /// Requires a timeline that: /// - has an ancestor to detach from /// - the ancestor does not have an ancestor -- follows from the original RFC limitations, not - /// a technical requirement + /// a technical requirement /// /// After the operation has been started, it cannot be canceled. Upon restart it needs to be /// polled again until completion. diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index b0088f4ea2..d32945d9e4 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -182,13 +182,15 @@ async fn remove_timeline_from_tenant( /// 5. Delete index part /// 6. Delete meta, timeline directory /// 7. Delete mark file +/// /// It is resumable from any step in case a crash/restart occurs. /// There are three entrypoints to the process: /// 1. [`DeleteTimelineFlow::run`] this is the main one called by a management api handler. /// 2. [`DeleteTimelineFlow::resume_deletion`] is called during restarts when local metadata is still present -/// and we possibly neeed to continue deletion of remote files. +/// and we possibly neeed to continue deletion of remote files. /// 3. [`DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`] is used when we deleted remote -/// index but still have local metadata, timeline directory and delete mark. +/// index but still have local metadata, timeline directory and delete mark. +/// /// Note the only other place that messes around timeline delete mark is the logic that scans directory with timelines during tenant load. #[derive(Default)] pub enum DeleteTimelineFlow { diff --git a/pageserver/src/tenant/timeline/logical_size.rs b/pageserver/src/tenant/timeline/logical_size.rs index 8f9ca0e29f..b0d6c4a27a 100644 --- a/pageserver/src/tenant/timeline/logical_size.rs +++ b/pageserver/src/tenant/timeline/logical_size.rs @@ -11,11 +11,11 @@ use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering}; /// Calculation consists of two stages: /// /// 1. Initial size calculation. That might take a long time, because it requires -/// reading all layers containing relation sizes at `initial_part_end`. +/// reading all layers containing relation sizes at `initial_part_end`. /// /// 2. Collecting an incremental part and adding that to the initial size. -/// Increments are appended on walreceiver writing new timeline data, -/// which result in increase or decrease of the logical size. +/// Increments are appended on walreceiver writing new timeline data, +/// which result in increase or decrease of the logical size. pub(super) struct LogicalSize { /// Size, potentially slow to compute. Calculating this might require reading multiple /// layers, and even ancestor's layers. 
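The `LogicalSize` doc comments being re-indented in this file describe a two-stage scheme: an expensive baseline computed once, plus a cheap signed increment maintained as WAL is ingested. Purely as an illustration of that pattern (the field and method names below are not the actual pageserver API):

```rust
use std::sync::atomic::{AtomicI64, Ordering};
use std::sync::OnceLock;

struct ApproxLogicalSize {
    /// Stage 1: slow to compute (requires reading layers), filled in once.
    initial: OnceLock<u64>,
    /// Stage 2: incremental adjustments; signed, since it may transiently go
    /// negative while the baseline is still being computed.
    size_added_after_initial: AtomicI64,
}

impl ApproxLogicalSize {
    fn record(&self, delta_bytes: i64) {
        self.size_added_after_initial
            .fetch_add(delta_bytes, Ordering::Relaxed);
    }

    fn current(&self) -> Option<u64> {
        let base = *self.initial.get()? as i64;
        let sum = base.saturating_add(self.size_added_after_initial.load(Ordering::Relaxed));
        // Clamp to zero rather than underflow; the real accessor does the same and
        // logs an error if the value ever goes negative.
        Some(sum.max(0) as u64)
    }
}
```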
@@ -45,17 +45,17 @@ pub(super) struct LogicalSize { /// Size shouldn't ever be negative, but this is signed for two reasons: /// /// 1. If we initialized the "baseline" size lazily, while we already - /// process incoming WAL, the incoming WAL records could decrement the - /// variable and temporarily make it negative. (This is just future-proofing; - /// the initialization is currently not done lazily.) + /// process incoming WAL, the incoming WAL records could decrement the + /// variable and temporarily make it negative. (This is just future-proofing; + /// the initialization is currently not done lazily.) /// /// 2. If there is a bug and we e.g. forget to increment it in some cases - /// when size grows, but remember to decrement it when it shrinks again, the - /// variable could go negative. In that case, it seems better to at least - /// try to keep tracking it, rather than clamp or overflow it. Note that - /// get_current_logical_size() will clamp the returned value to zero if it's - /// negative, and log an error. Could set it permanently to zero or some - /// special value to indicate "broken" instead, but this will do for now. + /// when size grows, but remember to decrement it when it shrinks again, the + /// variable could go negative. In that case, it seems better to at least + /// try to keep tracking it, rather than clamp or overflow it. Note that + /// get_current_logical_size() will clamp the returned value to zero if it's + /// negative, and log an error. Could set it permanently to zero or some + /// special value to indicate "broken" instead, but this will do for now. /// /// Note that we also expose a copy of this value as a prometheus metric, /// see `current_logical_size_gauge`. Use the `update_current_logical_size` diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index a085154a5a..4a3a5c621b 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -2,13 +2,13 @@ //! To do so, a current implementation needs to do the following: //! //! * acknowledge the timelines that it needs to stream WAL into. -//! Pageserver is able to dynamically (un)load tenants on attach and detach, -//! hence WAL receiver needs to react on such events. +//! Pageserver is able to dynamically (un)load tenants on attach and detach, +//! hence WAL receiver needs to react on such events. //! //! * get a broker subscription, stream data from it to determine that a timeline needs WAL streaming. -//! For that, it watches specific keys in storage_broker and pulls the relevant data periodically. -//! The data is produced by safekeepers, that push it periodically and pull it to synchronize between each other. -//! Without this data, no WAL streaming is possible currently. +//! For that, it watches specific keys in storage_broker and pulls the relevant data periodically. +//! The data is produced by safekeepers, that push it periodically and pull it to synchronize between each other. +//! Without this data, no WAL streaming is possible currently. //! //! Only one active WAL streaming connection is allowed at a time. //! The connection is supposed to be updated periodically, based on safekeeper timeline data. 
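Nearly every hunk in this patch is the same mechanical change: continuation lines of doc-comment list items get indented so that rustdoc, and the new `clippy::doc_lazy_continuation` lint that the timeline.rs hunk allows in one place, keep them inside the item. A tiny self-contained example of the convention being applied (the wording is adapted from the walreceiver comment above):

```rust
/// * get a broker subscription and stream data from it to decide which timelines
///   need WAL streaming; these continuation lines are indented under the bullet so
///   they are parsed as part of the list item rather than as a lazy continuation
///   paragraph.
///
/// A genuinely new paragraph is separated by an empty doc line, as above.
pub struct DocListIndentation;
```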
diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index cb81f1d76d..5a0986ea12 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -191,9 +191,9 @@ impl VectoredReadPlanner { /// /// The `flag` argument has two interesting values: /// * [`BlobFlag::ReplaceAll`]: The blob for this key should replace all existing blobs. - /// This is used for WAL records that `will_init`. + /// This is used for WAL records that `will_init`. /// * [`BlobFlag::Ignore`]: This blob should not be included in the read. This happens - /// if the blob is cached. + /// if the blob is cached. pub fn handle(&mut self, key: Key, lsn: Lsn, offset: u64, flag: BlobFlag) { // Implementation note: internally lag behind by one blob such that // we have a start and end offset when initialising [`VectoredRead`] diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index 885a9221c5..8599d95cdf 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -33,6 +33,7 @@ pub struct BufferedWriter { /// invariant: always remains Some(buf) except /// - while IO is ongoing => goes back to Some() once the IO completed successfully /// - after an IO error => stays `None` forever + /// /// In these exceptional cases, it's `None`. buf: Option, } diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index a50a96e5e8..f91693c704 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -319,7 +319,7 @@ impl ConnCfg { let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); let (client, connection) = self.0.connect_raw(stream, tls).await?; drop(pause); - tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id())); + tracing::Span::current().record("pid", tracing::field::display(client.get_process_id())); let stream = connection.stream.into_inner(); info!( diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs index 7baf104374..c9a946fa4a 100644 --- a/proxy/src/redis/cancellation_publisher.rs +++ b/proxy/src/redis/cancellation_publisher.rs @@ -106,7 +106,7 @@ impl RedisPublisherClient { cancel_key_data, session_id, }))?; - self.client.publish(PROXY_CHANNEL_NAME, payload).await?; + let _: () = self.client.publish(PROXY_CHANNEL_NAME, payload).await?; Ok(()) } pub async fn try_connect(&mut self) -> anyhow::Result<()> { diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs index 3a90d911c2..b02ce472c0 100644 --- a/proxy/src/redis/connection_with_credentials_provider.rs +++ b/proxy/src/redis/connection_with_credentials_provider.rs @@ -178,7 +178,7 @@ impl ConnectionWithCredentialsProvider { credentials_provider: Arc, ) -> anyhow::Result<()> { let (user, password) = credentials_provider.provide_credentials().await?; - redis::cmd("AUTH") + let _: () = redis::cmd("AUTH") .arg(user) .arg(password) .query_async(con) diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 87d723d17e..efd7437d5d 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -127,7 +127,7 @@ impl MessageHandler { Cancel(cancel_session) => { tracing::Span::current().record( "session_id", - &tracing::field::display(cancel_session.session_id), + tracing::field::display(cancel_session.session_id), ); Metrics::get() 
.proxy diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 6c34d48338..3b86c1838c 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -245,7 +245,7 @@ impl ConnectMechanism for TokioMechanism { drop(pause); let (client, connection) = permit.release_result(res)?; - tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id())); + tracing::Span::current().record("pid", tracing::field::display(client.get_process_id())); Ok(poll_client( self.pool.clone(), ctx, diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 170bda062e..dbc58d48ec 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -403,7 +403,7 @@ impl GlobalConnPool { tracing::Span::current().record("conn_id", tracing::field::display(client.conn_id)); tracing::Span::current().record( "pid", - &tracing::field::display(client.inner.get_process_id()), + tracing::field::display(client.inner.get_process_id()), ); info!( cold_start_info = ColdStartInfo::HttpPoolHit.as_str(), diff --git a/proxy/src/waiters.rs b/proxy/src/waiters.rs index bba5494cfe..888ad38048 100644 --- a/proxy/src/waiters.rs +++ b/proxy/src/waiters.rs @@ -111,7 +111,7 @@ mod tests { let waiters = Arc::clone(&waiters); let notifier = tokio::spawn(async move { - waiters.notify(key, Default::default())?; + waiters.notify(key, ())?; Ok(()) }); diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 9ea048a3c7..5a590689c3 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -119,6 +119,7 @@ async fn shut_down_task(entry: &mut Option) { /// time we have several ones as they PUT the same files. Also, /// - frequently changing the offloader would be bad; /// - electing seriously lagging safekeeper is undesirable; +/// /// So we deterministically choose among the reasonably caught up candidates. /// TODO: take into account failed attempts to deal with hypothetical situation /// where s3 is unreachable only for some sks. diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 47caf7ae81..9f7b2f775e 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -542,6 +542,7 @@ impl Persistence { Ok(Generation::new(g as u32)) } + #[allow(non_local_definitions)] /// For use when updating a persistent property of a tenant, such as its config or placement_policy. /// /// Do not use this for settting generation, unless in the special onboarding code path (/location_config) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index b6e2b53191..deaac83ea5 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -5070,7 +5070,7 @@ impl Service { /// we did the split, but are probably better placed elsewhere. /// - Creating new secondary locations if it improves the spreading of a sharded tenant /// * e.g. after a shard split, some locations will be on the same node (where the split - /// happened), and will probably be better placed elsewhere. + /// happened), and will probably be better placed elsewhere. 
/// /// To put it more briefly: whereas the scheduler respects soft constraints in a ScheduleContext at /// the time of scheduling, this function looks for cases where a better-scoring location is available @@ -5633,14 +5633,14 @@ impl Service { /// Create a node fill plan (pick secondaries to promote) that meets the following requirements: /// 1. The node should be filled until it reaches the expected cluster average of - /// attached shards. If there are not enough secondaries on the node, the plan stops early. + /// attached shards. If there are not enough secondaries on the node, the plan stops early. /// 2. Select tenant shards to promote such that the number of attached shards is balanced - /// throughout the cluster. We achieve this by picking tenant shards from each node, - /// starting from the ones with the largest number of attached shards, until the node - /// reaches the expected cluster average. + /// throughout the cluster. We achieve this by picking tenant shards from each node, + /// starting from the ones with the largest number of attached shards, until the node + /// reaches the expected cluster average. /// 3. Avoid promoting more shards of the same tenant than required. The upper bound - /// for the number of tenants from the same shard promoted to the node being filled is: - /// shard count for the tenant divided by the number of nodes in the cluster. + /// for the number of tenants from the same shard promoted to the node being filled is: + /// shard count for the tenant divided by the number of nodes in the cluster. fn fill_node_plan(&self, node_id: NodeId) -> Vec { let mut locked = self.inner.write().unwrap(); let fill_requirement = locked.scheduler.compute_fill_requirement(node_id); diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 2574dc297a..ee2ba6c4ee 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -124,6 +124,7 @@ pub(crate) struct TenantShard { /// - ReconcileWaiters need to Arc-clone the overall object to read it later /// - ReconcileWaitError needs to use an `Arc` because we can construct /// many waiters for one shard, and the underlying error types are not Clone. + /// /// TODO: generalize to an array of recent events /// TOOD: use a ArcSwap instead of mutex for faster reads? 
#[serde(serialize_with = "read_last_error")] From 4184685721f5bd0e70ee9587d569b09bce0f306c Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Fri, 12 Jul 2024 09:28:13 -0400 Subject: [PATCH 1183/1571] fix(pageserver): unique test harness name for merge_in_between (#8366) As title, there should be a way to detect duplicated harness names in the future :( Signed-off-by: Alex Chi Z --- pageserver/src/tenant/storage_layer/merge_iterator.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs index 36386c87c9..68759f7585 100644 --- a/pageserver/src/tenant/storage_layer/merge_iterator.rs +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -275,7 +275,7 @@ mod tests { use crate::repository::Value; use bytes::Bytes; - let harness = TenantHarness::create("merge_iterator_delta_merge").unwrap(); + let harness = TenantHarness::create("merge_iterator_merge_in_between").unwrap(); let (tenant, ctx) = harness.load().await; let tline = tenant From b329b1c610e7166fc28a1499375666da7723ae24 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 12 Jul 2024 17:31:17 +0100 Subject: [PATCH 1184/1571] tests: allow list breaching min resident size in statvfs test (#8358) ## Problem This test would sometimes violate the min resident size during disk eviction and fail due to the generated warning log. Disk usage candidate collection only takes into account active tenants. However, the statvfs call takes into account the entire tenants directory, which includes tenants which haven't become active yet. After re-starting the pageserver, disk usage eviction may kick in *before* both tenants have become active. Hence, the logic will try to satisfy the disk usage requirements by evicting everything belonging to the active tenant, and hence violating the tenant minimum resident size. ## Summary of changes Allow the warning --- test_runner/regress/test_disk_usage_eviction.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 7722828c79..fb8b7b22fa 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -794,6 +794,16 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv): wait_until(2, 2, less_than_max_usage_pct) + # Disk usage candidate collection only takes into account active tenants. + # However, the statvfs call takes into account the entire tenants directory, + # which includes tenants which haven't become active yet. + # + # After re-starting the pageserver, disk usage eviction may kick in *before* + # both tenants have become active. Hence, the logic will try to satisfy the + # disk usage requirements by evicting everything belonging to the active tenant, + # and hence violating the tenant minimum resident size. + env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE) + def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv): """ From 8da3b547f834497377b7b5db727c10bd88697cde Mon Sep 17 00:00:00 2001 From: Luca Bruno Date: Mon, 15 Jul 2024 13:38:52 +0200 Subject: [PATCH 1185/1571] proxy/http: switch to typed_json (#8377) ## Summary of changes This switches JSON rendering logic to `typed_json` in order to reduce the number of allocations in the HTTP responder path. Followup from https://github.com/neondatabase/neon/pull/8319#issuecomment-2216991760.
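For context, a hedged minimal sketch (not taken from this patch; the function name and fields are made up) of the pattern being adopted: `serde_json::json!` eagerly builds an intermediate `serde_json::Value` tree, while `typed_json::json!` (as used in the diff below) yields a value that implements `Serialize` directly, so the JSON is written only once when the response body is serialized.

```rust
// Illustrative only; assumes the `typed_json` and `serde_json` crates as dependencies.
fn render_row_count(row_count: u64) -> String {
    // Before: serde_json::json!({ ... }) would allocate a Value tree here.
    // After: typed_json::json!({ ... }) defers all work to the final serialization step.
    let body = typed_json::json!({
        "command": "SELECT",
        "rowCount": row_count
    });
    serde_json::to_string(&body).expect("json serialization should not fail")
}
```

The diff below applies the same idea to the SQL-over-HTTP result path: build the response with `typed_json::json!` and serialize it exactly once into the output string.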
--------- Co-authored-by: Conrad Ludgate --- Cargo.lock | 11 +++ Cargo.toml | 1 + proxy/Cargo.toml | 1 + proxy/src/serverless/sql_over_http.rs | 97 +++++++++++++-------------- 4 files changed, 59 insertions(+), 51 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bab0b4dd1f..8897364701 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4404,6 +4404,7 @@ dependencies = [ "tracing-opentelemetry", "tracing-subscriber", "tracing-utils", + "typed-json", "url", "urlencoding", "utils", @@ -6665,6 +6666,16 @@ dependencies = [ "static_assertions", ] +[[package]] +name = "typed-json" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6024a8d0025400b3f6b189366e9aa92012cf9c4fe1cd2620848dd61425c49eed" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "typenum" version = "1.16.0" diff --git a/Cargo.toml b/Cargo.toml index 670e3241d5..4f42203683 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -184,6 +184,7 @@ tracing-error = "0.2.0" tracing-opentelemetry = "0.21.0" tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] } twox-hash = { version = "1.6.3", default-features = false } +typed-json = "0.1" url = "2.2" urlencoding = "2.1" uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] } diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 288f7769fe..2f18b5fbc6 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -92,6 +92,7 @@ tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true tracing-utils.workspace = true tracing.workspace = true +typed-json.workspace = true url.workspace = true urlencoding.workspace = true utils.workspace = true diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 8118ae5ea8..6400e4ac7b 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -18,7 +18,7 @@ use hyper1::Response; use hyper1::StatusCode; use hyper1::{HeaderMap, Request}; use pq_proto::StartupMessageParamsBuilder; -use serde_json::json; +use serde::Serialize; use serde_json::Value; use tokio::time; use tokio_postgres::error::DbError; @@ -32,6 +32,7 @@ use tokio_postgres::Transaction; use tokio_util::sync::CancellationToken; use tracing::error; use tracing::info; +use typed_json::json; use url::Url; use utils::http::error::ApiError; @@ -263,13 +264,8 @@ pub async fn handle( | SqlOverHttpError::Postgres(e) => e.as_db_error(), _ => None, }; - fn get<'a, T: serde::Serialize>( - db: Option<&'a DbError>, - x: impl FnOnce(&'a DbError) -> T, - ) -> Value { - db.map(x) - .and_then(|t| serde_json::to_value(t).ok()) - .unwrap_or_default() + fn get<'a, T: Default>(db: Option<&'a DbError>, x: impl FnOnce(&'a DbError) -> T) -> T { + db.map(x).unwrap_or_default() } if let Some(db_error) = db_error { @@ -278,17 +274,11 @@ pub async fn handle( let position = db_error.and_then(|db| db.position()); let (position, internal_position, internal_query) = match position { - Some(ErrorPosition::Original(position)) => ( - Value::String(position.to_string()), - Value::Null, - Value::Null, - ), - Some(ErrorPosition::Internal { position, query }) => ( - Value::Null, - Value::String(position.to_string()), - Value::String(query.clone()), - ), - None => (Value::Null, Value::Null, Value::Null), + Some(ErrorPosition::Original(position)) => (Some(position.to_string()), None, None), + Some(ErrorPosition::Internal { position, query }) => { + (None, Some(position.to_string()), 
Some(query.clone())) + } + None => (None, None, None), }; let code = get(db_error, |db| db.code().code()); @@ -578,10 +568,8 @@ async fn handle_inner( .status(StatusCode::OK) .header(header::CONTENT_TYPE, "application/json"); - // - // Now execute the query and return the result - // - let result = match payload { + // Now execute the query and return the result. + let json_output = match payload { Payload::Single(stmt) => stmt.process(cancel, &mut client, parsed_headers).await?, Payload::Batch(statements) => { if parsed_headers.txn_read_only { @@ -605,11 +593,9 @@ async fn handle_inner( let metrics = client.metrics(); - // how could this possibly fail - let body = serde_json::to_string(&result).expect("json serialization should not fail"); - let len = body.len(); + let len = json_output.len(); let response = response - .body(Full::new(Bytes::from(body))) + .body(Full::new(Bytes::from(json_output))) // only fails if invalid status code or invalid header/values are given. // these are not user configurable so it cannot fail dynamically .expect("building response payload should not fail"); @@ -631,7 +617,7 @@ impl QueryData { cancel: CancellationToken, client: &mut Client, parsed_headers: HttpHeaders, - ) -> Result { + ) -> Result { let (inner, mut discard) = client.inner(); let cancel_token = inner.cancel_token(); @@ -644,7 +630,10 @@ impl QueryData { // The query successfully completed. Either::Left((Ok((status, results)), __not_yet_cancelled)) => { discard.check_idle(status); - Ok(results) + + let json_output = + serde_json::to_string(&results).expect("json serialization should not fail"); + Ok(json_output) } // The query failed with an error Either::Left((Err(e), __not_yet_cancelled)) => { @@ -662,7 +651,10 @@ impl QueryData { // query successed before it was cancelled. Ok(Ok((status, results))) => { discard.check_idle(status); - Ok(results) + + let json_output = serde_json::to_string(&results) + .expect("json serialization should not fail"); + Ok(json_output) } // query failed or was cancelled. 
Ok(Err(error)) => { @@ -696,7 +688,7 @@ impl BatchQueryData { cancel: CancellationToken, client: &mut Client, parsed_headers: HttpHeaders, - ) -> Result { + ) -> Result { info!("starting transaction"); let (inner, mut discard) = client.inner(); let cancel_token = inner.cancel_token(); @@ -718,9 +710,9 @@ impl BatchQueryData { e })?; - let results = + let json_output = match query_batch(cancel.child_token(), &transaction, self, parsed_headers).await { - Ok(results) => { + Ok(json_output) => { info!("commit"); let status = transaction.commit().await.map_err(|e| { // if we cannot commit - for now don't return connection to pool @@ -729,7 +721,7 @@ impl BatchQueryData { e })?; discard.check_idle(status); - results + json_output } Err(SqlOverHttpError::Cancelled(_)) => { if let Err(err) = cancel_token.cancel_query(NoTls).await { @@ -753,7 +745,7 @@ impl BatchQueryData { } }; - Ok(json!({ "results": results })) + Ok(json_output) } } @@ -762,7 +754,7 @@ async fn query_batch( transaction: &Transaction<'_>, queries: BatchQueryData, parsed_headers: HttpHeaders, -) -> Result, SqlOverHttpError> { +) -> Result { let mut results = Vec::with_capacity(queries.queries.len()); let mut current_size = 0; for stmt in queries.queries { @@ -787,7 +779,11 @@ async fn query_batch( } } } - Ok(results) + + let results = json!({ "results": results }); + let json_output = serde_json::to_string(&results).expect("json serialization should not fail"); + + Ok(json_output) } async fn query_to_json( @@ -795,7 +791,7 @@ async fn query_to_json( data: QueryData, current_size: &mut usize, parsed_headers: HttpHeaders, -) -> Result<(ReadyForQueryStatus, Value), SqlOverHttpError> { +) -> Result<(ReadyForQueryStatus, impl Serialize), SqlOverHttpError> { info!("executing query"); let query_params = data.params; let mut row_stream = std::pin::pin!(client.query_raw_txt(&data.query, query_params).await?); @@ -844,8 +840,8 @@ async fn query_to_json( for c in row_stream.columns() { fields.push(json!({ - "name": Value::String(c.name().to_owned()), - "dataTypeID": Value::Number(c.type_().oid().into()), + "name": c.name().to_owned(), + "dataTypeID": c.type_().oid(), "tableID": c.table_oid(), "columnID": c.column_id(), "dataTypeSize": c.type_size(), @@ -863,15 +859,14 @@ async fn query_to_json( .map(|row| pg_text_row_to_json(row, &columns, parsed_headers.raw_output, array_mode)) .collect::, _>>()?; - // resulting JSON format is based on the format of node-postgres result - Ok(( - ready, - json!({ - "command": command_tag_name, - "rowCount": command_tag_count, - "rows": rows, - "fields": fields, - "rowAsArray": array_mode, - }), - )) + // Resulting JSON format is based on the format of node-postgres result. + let results = json!({ + "command": command_tag_name.to_string(), + "rowCount": command_tag_count, + "rows": rows, + "fields": fields, + "rowAsArray": array_mode, + }); + + Ok((ready, results)) } From 4bdfb96078951e3eb471d0ebd668777db048fb67 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 11 Jul 2024 16:35:31 +0300 Subject: [PATCH 1186/1571] Fix test_timeline_copy flakiness. 
fixes https://github.com/neondatabase/neon/issues/8355 --- safekeeper/src/copy_timeline.rs | 10 ++++++++-- test_runner/regress/test_wal_acceptor.py | 5 +++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/safekeeper/src/copy_timeline.rs b/safekeeper/src/copy_timeline.rs index 14bd3c03b8..220988c3ce 100644 --- a/safekeeper/src/copy_timeline.rs +++ b/safekeeper/src/copy_timeline.rs @@ -74,10 +74,16 @@ pub async fn handle_request(request: Request) -> Result<()> { assert!(flush_lsn >= start_lsn); if request.until_lsn > flush_lsn { - bail!("requested LSN is beyond the end of the timeline"); + bail!(format!( + "requested LSN {} is beyond the end of the timeline {}", + request.until_lsn, flush_lsn + )); } if request.until_lsn < start_lsn { - bail!("requested LSN is before the start of the timeline"); + bail!(format!( + "requested LSN {} is before the start of the timeline {}", + request.until_lsn, start_lsn + )); } if request.until_lsn > commit_lsn { diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index e0ad4fdd5c..2e906e6160 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -2065,6 +2065,11 @@ def test_timeline_copy(neon_env_builder: NeonEnvBuilder, insert_rows: int): log.info(f"Original digest: {orig_digest}") for sk in env.safekeepers: + wait( + partial(is_flush_lsn_caught_up, sk, tenant_id, timeline_id, lsn), + f"sk_id={sk.id} to flush {lsn}", + ) + sk.http_client().copy_timeline( tenant_id, timeline_id, From 8a8b83df27383a07bb7dbba519325c15d2f46357 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 15 Jul 2024 15:52:00 +0300 Subject: [PATCH 1187/1571] Add neon.running_xacts_overflow_policy to make it possible for RO replica to startup without primary even in case running xacts overflow (#8323) ## Problem Right now, if there are too many running xacts to be restored from CLOG at replica startup, the replica does not try to restore them and waits for a non-overflown running-xacts WAL record from the primary. But if the primary is not active, then the replica will not start at all. Too many running xacts can be caused by transactions with a large number of subtransactions. But right now it can also be caused by two other reasons: - Lack of a shutdown checkpoint which updates `oldestRunningXid` (because of immediate shutdown) - nextXid alignment on a 1024 boundary (which causes losing ~1k XIDs on each restart) Both problems are somehow addressed now. But we have existing customers with "sparse" CLOG and lack of checkpoints. To be able to start RO replicas for such customers, I suggest adding a GUC which allows the replica to start even in case of subxacts overflow. ## Summary of changes Add `neon.running_xacts_overflow_policy` with the following values: - ignore: restore the last N XIDs from CLOG and accept connections - skip: do not restore any XIDs from CLOG but still accept connections - wait: wait for a non-overflown running xacts record from the primary node ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section.
## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik --- pgxn/neon/neon.c | 47 +++++++++++++++++++++-- test_runner/regress/test_replica_start.py | 46 +++++++++++++++++++++- 2 files changed, 88 insertions(+), 5 deletions(-) diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index e4968bdf89..3197a7e715 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -46,6 +46,21 @@ void _PG_init(void); static int logical_replication_max_snap_files = 300; +static int running_xacts_overflow_policy; + +enum RunningXactsOverflowPolicies { + OP_IGNORE, + OP_SKIP, + OP_WAIT +}; + +static const struct config_enum_entry running_xacts_overflow_policies[] = { + {"ignore", OP_IGNORE, false}, + {"skip", OP_SKIP, false}, + {"wait", OP_WAIT, false}, + {NULL, 0, false} +}; + static void InitLogicalReplicationMonitor(void) { @@ -414,6 +429,7 @@ RestoreRunningXactsFromClog(CheckPoint *checkpoint, TransactionId **xids, int *n restored_xids = (TransactionId *) palloc(max_xcnt * sizeof(TransactionId)); n_restored_xids = 0; next_prepared_idx = 0; + for (TransactionId xid = from; xid != till;) { XLogRecPtr xidlsn; @@ -424,7 +440,7 @@ RestoreRunningXactsFromClog(CheckPoint *checkpoint, TransactionId **xids, int *n /* * "Merge" the prepared transactions into the restored_xids array as * we go. The prepared transactions array is sorted. This is mostly - * a sanity check to ensure that all the prpeared transactions are + * a sanity check to ensure that all the prepared transactions are * seen as in-progress. (There is a check after the loop that we didn't * miss any.) */ @@ -522,14 +538,23 @@ RestoreRunningXactsFromClog(CheckPoint *checkpoint, TransactionId **xids, int *n elog(LOG, "too many running xacts to restore from the CLOG; oldestXid=%u oldestActiveXid=%u nextXid %u", checkpoint->oldestXid, checkpoint->oldestActiveXid, XidFromFullTransactionId(checkpoint->nextXid)); - goto fail; + + switch (running_xacts_overflow_policy) + { + case OP_WAIT: + goto fail; + case OP_IGNORE: + goto success; + case OP_SKIP: + n_restored_xids = 0; + goto success; + } } restored_xids[n_restored_xids++] = xid; skip: TransactionIdAdvance(xid); - continue; } /* sanity check */ @@ -540,11 +565,13 @@ RestoreRunningXactsFromClog(CheckPoint *checkpoint, TransactionId **xids, int *n Assert(false); goto fail; } - + success: elog(LOG, "restored %d running xacts by scanning the CLOG; oldestXid=%u oldestActiveXid=%u nextXid %u", n_restored_xids, checkpoint->oldestXid, checkpoint->oldestActiveXid, XidFromFullTransactionId(checkpoint->nextXid)); *nxids = n_restored_xids; *xids = restored_xids; + if (prepared_xids) + pfree(prepared_xids); return true; fail: @@ -581,6 +608,18 @@ _PG_init(void) restore_running_xacts_callback = RestoreRunningXactsFromClog; + + DefineCustomEnumVariable( + "neon.running_xacts_overflow_policy", + "Action performed on snapshot overflow when restoring runnings xacts from CLOG", + NULL, + &running_xacts_overflow_policy, + OP_IGNORE, + running_xacts_overflow_policies, + PGC_POSTMASTER, + 0, + NULL, NULL, NULL); + /* * Important: This must happen after other parts of the extension are * loaded, otherwise any settings to GUCs that were set before the diff --git a/test_runner/regress/test_replica_start.py b/test_runner/regress/test_replica_start.py index 17d476a8a6..0d95109d6b 100644 --- a/test_runner/regress/test_replica_start.py +++ b/test_runner/regress/test_replica_start.py @@ -210,7 +210,11 @@ def 
test_replica_start_wait_subxids_finish(neon_simple_env: NeonEnv): # Start it in a separate thread, so that we can do other stuff while it's # blocked waiting for the startup to finish. wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) - secondary = env.endpoints.new_replica(origin=primary, endpoint_id="secondary") + secondary = env.endpoints.new_replica( + origin=primary, + endpoint_id="secondary", + config_lines=["neon.running_xacts_overflow_policy='wait'"], + ) start_secondary_thread = threading.Thread(target=secondary.start) start_secondary_thread.start() @@ -644,3 +648,43 @@ def test_replica_start_with_prepared_xacts_with_many_subxacts(neon_simple_env: N wait_replica_caughtup(primary, secondary) secondary_cur.execute("select count(*) from t") assert secondary_cur.fetchone() == (200001,) + + +def test_replica_start_with_too_many_unused_xids(neon_simple_env: NeonEnv): + """ + Test the CLOG-scanning mechanism at hot standby startup in the presence of + large number of unsued XIDs, caused by XID alignment and frequent primary restarts + """ + n_restarts = 50 + + # Initialize the primary and a test table + env = neon_simple_env + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + with primary.cursor() as primary_cur: + primary_cur.execute("create table t(pk serial primary key, payload integer)") + + for _ in range(n_restarts): + with primary.cursor() as primary_cur: + primary_cur.execute("insert into t (payload) values (0)") + # restart primary + primary.stop("immediate") + primary.start() + + # Wait for the WAL to be flushed + wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) + + # stop primary to check that we can start replica without it + primary.stop(mode="immediate") + + # Create a replica. It should start up normally, because of ignore policy + # mechanism. + secondary = env.endpoints.new_replica_start( + origin=primary, + endpoint_id="secondary", + config_lines=["neon.running_xacts_overflow_policy='ignore'"], + ) + + # Check that replica see all changes + with secondary.cursor() as secondary_cur: + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (n_restarts,) From b49b450dc4b607bf6d1aa267a16d8ff8180c998f Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 15 Jul 2024 16:33:56 +0200 Subject: [PATCH 1188/1571] remove page_service `show ` (#8372) This operation isn't used in practice, so let's remove it. Context: in https://github.com/neondatabase/neon/pull/8339 --- pageserver/src/metrics.rs | 1 - pageserver/src/page_service.rs | 60 ---------------- test_runner/regress/test_auth.py | 2 +- test_runner/regress/test_tenant_conf.py | 96 ++----------------------- 4 files changed, 5 insertions(+), 154 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 9b3bb481b9..abad4b44b8 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1490,7 +1490,6 @@ pub(crate) enum ComputeCommandKind { Basebackup, Fullbackup, LeaseLsn, - Show, } pub(crate) struct ComputeCommandCounters { diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index f94b0d335e..00147a8ca6 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1479,66 +1479,6 @@ where ))? 
} }; - } else if let Some(params) = parts.strip_prefix(&["show"]) { - // show - if params.len() != 1 { - return Err(QueryError::Other(anyhow::anyhow!( - "invalid param number for config command" - ))); - } - let tenant_id = TenantId::from_str(params[0]) - .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; - - tracing::Span::current().record("tenant_id", field::display(tenant_id)); - - self.check_permission(Some(tenant_id))?; - - COMPUTE_COMMANDS_COUNTERS - .for_command(ComputeCommandKind::Show) - .inc(); - - let tenant = self - .get_active_tenant_with_timeout( - tenant_id, - ShardSelector::Zero, - ACTIVE_TENANT_TIMEOUT, - ) - .await?; - pgb.write_message_noflush(&BeMessage::RowDescription(&[ - RowDescriptor::int8_col(b"checkpoint_distance"), - RowDescriptor::int8_col(b"checkpoint_timeout"), - RowDescriptor::int8_col(b"compaction_target_size"), - RowDescriptor::int8_col(b"compaction_period"), - RowDescriptor::int8_col(b"compaction_threshold"), - RowDescriptor::int8_col(b"gc_horizon"), - RowDescriptor::int8_col(b"gc_period"), - RowDescriptor::int8_col(b"image_creation_threshold"), - RowDescriptor::int8_col(b"pitr_interval"), - ]))? - .write_message_noflush(&BeMessage::DataRow(&[ - Some(tenant.get_checkpoint_distance().to_string().as_bytes()), - Some( - tenant - .get_checkpoint_timeout() - .as_secs() - .to_string() - .as_bytes(), - ), - Some(tenant.get_compaction_target_size().to_string().as_bytes()), - Some( - tenant - .get_compaction_period() - .as_secs() - .to_string() - .as_bytes(), - ), - Some(tenant.get_compaction_threshold().to_string().as_bytes()), - Some(tenant.get_gc_horizon().to_string().as_bytes()), - Some(tenant.get_gc_period().as_secs().to_string().as_bytes()), - Some(tenant.get_image_creation_threshold().to_string().as_bytes()), - Some(tenant.get_pitr_interval().as_secs().to_string().as_bytes()), - ]))? 
- .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; } else { return Err(QueryError::Other(anyhow::anyhow!( "unknown command {query_string}" diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index 922a21a999..7cb85e3dd1 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -211,7 +211,7 @@ def test_auth_failures(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): def check_pageserver(expect_success: bool, **conn_kwargs): check_connection( env.pageserver, - f"show {env.initial_tenant}", + f"pagestream {env.initial_tenant} {env.initial_timeline}", expect_success, **conn_kwargs, ) diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index 80fb2b55b8..1a8bc3b983 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -1,10 +1,7 @@ import json -from contextlib import closing from typing import Any, Dict -import psycopg2.extras from fixtures.common_types import Lsn -from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, ) @@ -63,25 +60,6 @@ def test_tenant_config(neon_env_builder: NeonEnvBuilder): # check the configuration of the default tenant # it should match global configuration - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: - log.info(f"show {env.initial_tenant}") - pscur.execute(f"show {env.initial_tenant}") - res = pscur.fetchone() - assert res is not None - assert all( - i in res.items() - for i in { - "checkpoint_distance": 10000, - "compaction_target_size": 1048576, - "compaction_period": 20, - "compaction_threshold": 10, - "gc_horizon": 67108864, - "gc_period": 60 * 60, - "image_creation_threshold": 3, - "pitr_interval": 604800, # 7 days - }.items() - ), f"Unexpected res: {res}" default_tenant_config = http_client.tenant_config(tenant_id=env.initial_tenant) assert ( not default_tenant_config.tenant_specific_overrides @@ -103,25 +81,6 @@ def test_tenant_config(neon_env_builder: NeonEnvBuilder): } # check the configuration of the new tenant - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: - pscur.execute(f"show {tenant}") - res = pscur.fetchone() - log.info(f"res: {res}") - assert res is not None - assert all( - i in res.items() - for i in { - "checkpoint_distance": 20000, - "compaction_target_size": 1048576, - "compaction_period": 20, - "compaction_threshold": 10, - "gc_horizon": 67108864, - "gc_period": 30, - "image_creation_threshold": 3, - "pitr_interval": 604800, - }.items() - ), f"Unexpected res: {res}" new_tenant_config = http_client.tenant_config(tenant_id=tenant) new_specific_config = new_tenant_config.tenant_specific_overrides assert new_specific_config["checkpoint_distance"] == 20000 @@ -166,25 +125,6 @@ def test_tenant_config(neon_env_builder: NeonEnvBuilder): conf=conf_update, ) - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: - pscur.execute(f"show {tenant}") - res = pscur.fetchone() - log.info(f"after config res: {res}") - assert res is not None - assert all( - i in res.items() - for i in { - "checkpoint_distance": 15000, - "compaction_target_size": 1048576, - "compaction_period": 80, - "compaction_threshold": 10, - "gc_horizon": 67108864, - "gc_period": 80, - "image_creation_threshold": 2, - "pitr_interval": 604800, - }.items() 
- ), f"Unexpected res: {res}" updated_tenant_config = http_client.tenant_config(tenant_id=tenant) updated_specific_config = updated_tenant_config.tenant_specific_overrides assert updated_specific_config["checkpoint_distance"] == 15000 @@ -222,25 +162,6 @@ def test_tenant_config(neon_env_builder: NeonEnvBuilder): env.pageserver.stop() env.pageserver.start() - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: - pscur.execute(f"show {tenant}") - res = pscur.fetchone() - log.info(f"after restart res: {res}") - assert res is not None - assert all( - i in res.items() - for i in { - "checkpoint_distance": 15000, - "compaction_target_size": 1048576, - "compaction_period": 80, - "compaction_threshold": 10, - "gc_horizon": 67108864, - "gc_period": 80, - "image_creation_threshold": 2, - "pitr_interval": 604800, - }.items() - ), f"Unexpected res: {res}" restarted_tenant_config = http_client.tenant_config(tenant_id=tenant) assert ( restarted_tenant_config == updated_tenant_config @@ -283,19 +204,10 @@ def test_tenant_config(neon_env_builder: NeonEnvBuilder): env.pageserver.stop() env.pageserver.start() - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: - pscur.execute(f"show {tenant}") - res = pscur.fetchone() - log.info(f"after restart res: {res}") - assert res is not None - assert all( - i in res.items() - for i in { - "compaction_period": 20, - "pitr_interval": 60, - }.items() - ), f"Unexpected res: {res}" + restarted_final_tenant_config = http_client.tenant_config(tenant_id=tenant) + assert ( + restarted_final_tenant_config == final_tenant_config + ), "Updated config should not change after the restart" def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder): From 324e4e008fe4994ec84f96312aead9430afa178c Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 15 Jul 2024 18:08:24 +0300 Subject: [PATCH 1189/1571] feat(storcon): timeline detach ancestor passthrough (#8353) Currently storage controller does not support forwarding timeline detach ancestor requests to pageservers. Add support for forwarding `PUT .../:tenant_id/timelines/:timeline_id/detach_ancestor`. Implement the support mostly as is, because the timeline detach ancestor will be made (mostly) idempotent in future PR. 
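For illustration only (nothing below is part of this change), a minimal sketch of how a client might call the forwarded route; the address and IDs are made up, auth is omitted, and the response body is the serialized `AncestorDetached` (`reparented_timelines`) that the shards return.

```rust
// Hypothetical example; only the URL shape and the response field name come from this change.
async fn detach_ancestor_example() -> anyhow::Result<()> {
    let tenant_id = "3aa8fcc61f6d357410b7de754b1d9001"; // made-up tenant id
    let timeline_id = "de200bd42b49cc1814412c7e592dd6e9"; // made-up timeline id
    let url = format!(
        "http://127.0.0.1:1234/v1/tenant/{tenant_id}/timeline/{timeline_id}/detach_ancestor"
    );
    // PUT the request to the storage controller, which forwards it to the pageservers.
    let resp = reqwest::Client::new()
        .put(url)
        .send()
        .await?
        .error_for_status()?;
    let body: serde_json::Value = resp.json().await?;
    println!("reparented timelines: {}", body["reparented_timelines"]);
    Ok(())
}
```

Internally, the new handler fans the request out to every attached shard, compares the per-shard results, and returns one of them, logging if the shards disagree.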
Cc: #6994 --- .../src/models/detach_ancestor.rs | 2 +- pageserver/client/src/mgmt_api.rs | 18 +++ storage_controller/src/http.rs | 26 ++++ storage_controller/src/pageserver_client.rs | 22 ++- storage_controller/src/service.rs | 140 ++++++++++++++++-- test_runner/fixtures/neon_fixtures.py | 2 +- .../regress/test_timeline_detach_ancestor.py | 97 +++++++++++- 7 files changed, 281 insertions(+), 26 deletions(-) diff --git a/libs/pageserver_api/src/models/detach_ancestor.rs b/libs/pageserver_api/src/models/detach_ancestor.rs index fc1f10e734..ae5a21bab9 100644 --- a/libs/pageserver_api/src/models/detach_ancestor.rs +++ b/libs/pageserver_api/src/models/detach_ancestor.rs @@ -1,6 +1,6 @@ use utils::id::TimelineId; -#[derive(Default, serde::Serialize)] +#[derive(Debug, Default, PartialEq, serde::Serialize, serde::Deserialize)] pub struct AncestorDetached { pub reparented_timelines: Vec, } diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index e3ddb446fa..ac3ff1bb89 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -1,6 +1,7 @@ use std::collections::HashMap; use bytes::Bytes; +use detach_ancestor::AncestorDetached; use pageserver_api::{models::*, shard::TenantShardId}; use reqwest::{IntoUrl, Method, StatusCode}; use utils::{ @@ -418,6 +419,23 @@ impl Client { } } + pub async fn timeline_detach_ancestor( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor", + self.mgmt_api_endpoint + ); + + self.request(Method::PUT, &uri, ()) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> { let uri = format!( "{}/v1/tenant/{}/reset", diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 3a62c0dd4f..9ddf98eb3b 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -330,6 +330,22 @@ async fn handle_tenant_timeline_delete( .await } +async fn handle_tenant_timeline_detach_ancestor( + service: Arc, + req: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + + let res = service + .tenant_timeline_detach_ancestor(tenant_id, timeline_id) + .await?; + + json_response(StatusCode::OK, res) +} + async fn handle_tenant_timeline_passthrough( service: Arc, req: Request, @@ -1006,6 +1022,16 @@ pub fn make_router( RequestName("v1_tenant_timeline"), ) }) + .put( + "/v1/tenant/:tenant_id/timeline/:timeline_id/detach_ancestor", + |r| { + tenant_service_handler( + r, + handle_tenant_timeline_detach_ancestor, + RequestName("v1_tenant_timeline_detach_ancestor"), + ) + }, + ) // Tenant detail GET passthrough to shard zero: .get("/v1/tenant/:tenant_id", |r| { tenant_service_handler( diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index 769aba80ca..8d64201cd9 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -1,8 +1,9 @@ use pageserver_api::{ models::{ - LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress, - TenantScanRemoteStorageResponse, TenantShardSplitRequest, TenantShardSplitResponse, - TimelineCreateRequest, TimelineInfo, TopTenantShardsRequest, 
TopTenantShardsResponse, + detach_ancestor::AncestorDetached, LocationConfig, LocationConfigListResponse, + PageserverUtilization, SecondaryProgress, TenantScanRemoteStorageResponse, + TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo, + TopTenantShardsRequest, TopTenantShardsResponse, }, shard::TenantShardId, }; @@ -226,6 +227,21 @@ impl PageserverClient { ) } + pub(crate) async fn timeline_detach_ancestor( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + ) -> Result { + measured_request!( + "timeline_detach_ancestor", + crate::metrics::Method::Put, + &self.node_id_label, + self.inner + .timeline_detach_ancestor(tenant_shard_id, timeline_id) + .await + ) + } + pub(crate) async fn get_utilization(&self) -> Result { measured_request!( "utilization", diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index deaac83ea5..95522525cb 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -117,6 +117,7 @@ enum TenantOperations { TimelineCreate, TimelineDelete, AttachHook, + TimelineDetachAncestor, } #[derive(Clone, strum_macros::Display)] @@ -2376,18 +2377,18 @@ impl Service { tracing::info!("Doing time travel recovery for shard {tenant_shard_id}",); client - .tenant_time_travel_remote_storage( - tenant_shard_id, - ×tamp, - &done_if_after, - ) - .await - .map_err(|e| { - ApiError::InternalServerError(anyhow::anyhow!( - "Error doing time travel recovery for shard {tenant_shard_id} on node {}: {e}", - node - )) - })?; + .tenant_time_travel_remote_storage( + tenant_shard_id, + ×tamp, + &done_if_after, + ) + .await + .map_err(|e| { + ApiError::InternalServerError(anyhow::anyhow!( + "Error doing time travel recovery for shard {tenant_shard_id} on node {}: {e}", + node + )) + })?; } } Ok(()) @@ -2757,7 +2758,7 @@ impl Service { // Create timeline on remaining shards with number >0 if !targets.is_empty() { // If we had multiple shards, issue requests for the remainder now. 
- let jwt = self.config.jwt_token.clone(); + let jwt = &self.config.jwt_token; self.tenant_for_shards(targets, |tenant_shard_id: TenantShardId, node: Node| { let create_req = create_req.clone(); Box::pin(create_one(tenant_shard_id, node, jwt.clone(), create_req)) @@ -2768,6 +2769,115 @@ impl Service { Ok(timeline_info) } + pub(crate) async fn tenant_timeline_detach_ancestor( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Result { + tracing::info!("Detaching timeline {tenant_id}/{timeline_id}",); + + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::TimelineDetachAncestor, + ) + .await; + + self.ensure_attached_wait(tenant_id).await?; + + let targets = { + let locked = self.inner.read().unwrap(); + let mut targets = Vec::new(); + + for (tenant_shard_id, shard) in + locked.tenants.range(TenantShardId::tenant_range(tenant_id)) + { + let node_id = shard.intent.get_attached().ok_or_else(|| { + ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled")) + })?; + let node = locked + .nodes + .get(&node_id) + .expect("Pageservers may not be deleted while referenced"); + + targets.push((*tenant_shard_id, node.clone())); + } + targets + }; + + if targets.is_empty() { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant not found").into(), + )); + } + + async fn detach_one( + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + node: Node, + jwt: Option, + ) -> Result<(ShardNumber, models::detach_ancestor::AncestorDetached), ApiError> { + tracing::info!( + "Detaching timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}", + ); + + let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); + client + .timeline_detach_ancestor(tenant_shard_id, timeline_id) + .await + .map_err(|e| { + use mgmt_api::Error; + + match e { + // no ancestor (ever) + Error::ApiError(StatusCode::CONFLICT, msg) => { + ApiError::Conflict(format!("{node}: {msg}")) + } + // too many ancestors + Error::ApiError(StatusCode::BAD_REQUEST, msg) => { + ApiError::BadRequest(anyhow::anyhow!("{node}: {msg}")) + } + // rest can be mapped + other => passthrough_api_error(&node, other), + } + }) + .map(|res| (tenant_shard_id.shard_number, res)) + } + + // no shard needs to go first/last; the operation should be idempotent + // TODO: it would be great to ensure that all shards return the same error + let mut results = self + .tenant_for_shards(targets, |tenant_shard_id, node| { + futures::FutureExt::boxed(detach_one( + tenant_shard_id, + timeline_id, + node, + self.config.jwt_token.clone(), + )) + }) + .await?; + + let any = results.pop().expect("we must have at least one response"); + + // FIXME: the ordering is not stable yet on pageserver, should be (ancestor_lsn, + // TimelineId) + let mismatching = results + .iter() + .filter(|(_, res)| res != &any.1) + .collect::>(); + if !mismatching.is_empty() { + let matching = results.len() - mismatching.len(); + tracing::error!( + matching, + compared_against=?any, + ?mismatching, + "shards returned different results" + ); + } + + Ok(any.1) + } + /// Helper for concurrently calling a pageserver API on a number of shards, such as timeline creation. /// /// On success, the returned vector contains exactly the same number of elements as the input `locations`. 
@@ -2894,8 +3004,8 @@ impl Service { .await .map_err(|e| { ApiError::InternalServerError(anyhow::anyhow!( - "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}", - )) + "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}", + )) }) } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 463e4a3b01..90ed838e1d 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2400,7 +2400,7 @@ class NeonStorageController(MetricsGetter, LogUtils): def locate(self, tenant_id: TenantId) -> list[dict[str, Any]]: """ - :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr: str, "listen_http_port: int} + :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr": str, "listen_http_port": int} """ response = self.request( "GET", diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 606ce203cd..803fcac583 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -11,11 +11,12 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, PgBin, + flush_ep_to_pageserver, wait_for_last_flush_lsn, ) from fixtures.pageserver.http import HistoricLayerInfo, PageserverApiException -from fixtures.pageserver.utils import wait_timeline_detail_404 -from fixtures.remote_storage import LocalFsStorage +from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_timeline_detail_404 +from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind from fixtures.utils import assert_pageserver_backups_equal @@ -559,11 +560,24 @@ def test_compaction_induced_by_detaches_in_history( assert_pageserver_backups_equal(fullbackup_before, fullbackup_after, set()) -def test_timeline_ancestor_errors(neon_env_builder: NeonEnvBuilder): - env = neon_env_builder.init_start() - env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) +@pytest.mark.parametrize("sharded", [True, False]) +def test_timeline_ancestor_detach_errors(neon_env_builder: NeonEnvBuilder, sharded: bool): + shards = 2 if sharded else 1 - client = env.pageserver.http_client() + neon_env_builder.num_pageservers = shards + env = neon_env_builder.init_start(initial_tenant_shard_count=shards if sharded else None) + + pageservers = dict((int(p.id), p) for p in env.pageservers) + + for ps in pageservers.values(): + ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + if sharded: + # FIXME: should this be in the neon_env_builder.init_start? 
+ env.storage_controller.reconcile_until_idle() + client = env.storage_controller.pageserver_api() + else: + client = env.pageserver.http_client() with pytest.raises(PageserverApiException, match=".* no ancestors") as info: client.detach_ancestor(env.initial_tenant, env.initial_timeline) @@ -577,6 +591,17 @@ def test_timeline_ancestor_errors(neon_env_builder: NeonEnvBuilder): client.detach_ancestor(env.initial_tenant, second_branch) assert info.value.status_code == 400 + client.detach_ancestor(env.initial_tenant, first_branch) + + # FIXME: this should be done by the http req handler + for ps in pageservers.values(): + ps.quiesce_tenants() + + with pytest.raises(PageserverApiException, match=".* no ancestors") as info: + client.detach_ancestor(env.initial_tenant, first_branch) + # FIXME: this should be 200 OK because we've already completed it + assert info.value.status_code == 409 + client.tenant_delete(env.initial_tenant) with pytest.raises(PageserverApiException) as e: @@ -584,6 +609,58 @@ def test_timeline_ancestor_errors(neon_env_builder: NeonEnvBuilder): assert e.value.status_code == 404 +def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder): + branch_name = "soon_detached" + shard_count = 4 + neon_env_builder.num_pageservers = shard_count + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3) + + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + for ps in env.pageservers: + ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + # FIXME: should this be in the neon_env_builder.init_start? + env.storage_controller.reconcile_until_idle() + shards = env.storage_controller.locate(env.initial_tenant) + + branch_timeline_id = env.neon_cli.create_branch(branch_name, tenant_id=env.initial_tenant) + + with env.endpoints.create_start(branch_name, tenant_id=env.initial_tenant) as ep: + ep.safe_psql( + "create table foo as select 1::bigint, i::bigint from generate_series(1, 10000) v(i)" + ) + lsn = flush_ep_to_pageserver(env, ep, env.initial_tenant, branch_timeline_id) + + pageservers = dict((int(p.id), p) for p in env.pageservers) + + for shard_info in shards: + node_id = int(shard_info["node_id"]) + shard_id = shard_info["shard_id"] + detail = pageservers[node_id].http_client().timeline_detail(shard_id, branch_timeline_id) + + assert Lsn(detail["last_record_lsn"]) >= lsn + assert Lsn(detail["initdb_lsn"]) < lsn + assert TimelineId(detail["ancestor_timeline_id"]) == env.initial_timeline + + env.storage_controller.pageserver_api().detach_ancestor(env.initial_tenant, branch_timeline_id) + + for shard_info in shards: + node_id = int(shard_info["node_id"]) + shard_id = shard_info["shard_id"] + + # TODO: ensure quescing is done on pageserver? + pageservers[node_id].quiesce_tenants() + detail = pageservers[node_id].http_client().timeline_detail(shard_id, branch_timeline_id) + wait_for_last_record_lsn( + pageservers[node_id].http_client(), shard_id, branch_timeline_id, lsn + ) + assert detail.get("ancestor_timeline_id") is None + + with env.endpoints.create_start(branch_name, tenant_id=env.initial_tenant) as ep: + count = int(ep.safe_psql("select count(*) from foo")[0][0]) + assert count == 10000 + + # TODO: # - after starting the operation, tenant is deleted # - after starting the operation, pageserver is shutdown, restarted @@ -591,3 +668,11 @@ def test_timeline_ancestor_errors(neon_env_builder: NeonEnvBuilder): # - deletion of reparented while reparenting should fail once, then succeed (?) 
# - branch near existing L1 boundary, image layers? # - investigate: why are layers started at uneven lsn? not just after branching, but in general. +# +# TEST: 1. tad which partially succeeds, one returns 500 +# 2. create branch below timeline? or delete timeline below +# 3. on retry all should report the same reparented timelines +# +# TEST: 1. tad is started, one node stalls, other restarts +# 2. client timeout before stall over +# 3. on retry with stalled and other being able to proceed From 04448ac3231deaae9f418812b96c60ed2bfa5bd1 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 15 Jul 2024 17:43:05 +0100 Subject: [PATCH 1190/1571] pageserver: use PITR GC cutoffs as authoritative (#8365) ## Problem Pageserver GC uses a size-based condition (GC "horizon" in addition to time-based "PITR"). Eventually we plan to retire the size-based condition: https://github.com/neondatabase/neon/issues/6374 Currently, we always apply the more conservative of the two, meaning that tenants always retain at least 64MB of history (default horizon), even after a very long time has passed. This is particularly acute in cases where someone has dropped tables/databases, and then leaves a database idle: the horizon can prevent GCing very large quantities of historical data (we already account for this in synthetic size by ignoring gc horizon). We're not entirely removing GC horizon right now because we don't want to 100% rely on standby_horizon for robustness of physical replication, but we can tweak our logic to avoid retaining that 64MB LSN length indefinitely. ## Summary of changes - Rework `Timeline::find_gc_cutoffs`, with new logic: - If there is no PITR set, then use `DEFAULT_PITR_INTERVAL` (1 week) to calculate a time threshold. Retain either the horizon or up to that thresholds, whichever requires less data. - When there is a PITR set, and we have unambiguously resolved the timestamp to an LSN, then ignore the GC horizon entirely. For typical PITRs (1 day, 1 week), this will still easily retain enough data to avoid stressing read only replicas. The key property we end up with, whether a PITR is set or not, is that after enough time has passed, our GC cutoff on an idle timeline will catch up with the last_record_lsn. Using `DEFAULT_PITR_INTERVAL` is a bit of an arbitrary hack, but this feels like it isn't really worth the noise of exposing in TenantConfig. We could just make it a different named constant though. The end-end state will be that there is no gc_horizon at all, and that tenants with pitr_interval=0 would truly retain no history, so this constant would go away. --- pageserver/src/tenant/timeline.rs | 150 +++++++++++++--------- test_runner/regress/test_branch_and_gc.py | 4 +- 2 files changed, 91 insertions(+), 63 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index a3ddb3a1d1..0996616a67 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -69,6 +69,7 @@ use std::{ use crate::{ aux_file::AuxFileSizeEstimator, tenant::{ + config::defaults::DEFAULT_PITR_INTERVAL, layer_map::{LayerMap, SearchResult}, metadata::TimelineMetadata, storage_layer::PersistentLayerDesc, @@ -4945,20 +4946,17 @@ impl Timeline { } /// Find the Lsns above which layer files need to be retained on - /// garbage collection. This is separate from actually performing the GC, - /// and is updated more frequently, so that compaction can remove obsolete - /// page versions more aggressively. + /// garbage collection. 
/// - /// TODO: that's wishful thinking, compaction doesn't actually do that - /// currently. + /// We calculate two cutoffs, one based on time and one based on WAL size. `pitr` + /// controls the time cutoff (or ZERO to disable time-based retention), and `cutoff_horizon` controls + /// the space-based retention. /// - /// The 'cutoff_horizon' point is used to retain recent versions that might still be - /// needed by read-only nodes. (As of this writing, the caller just passes - /// the latest LSN subtracted by a constant, and doesn't do anything smart - /// to figure out what read-only nodes might actually need.) - /// - /// The 'pitr' duration is used to calculate a 'pitr_cutoff', which can be used to determine - /// whether a record is needed for PITR. + /// This function doesn't simply to calculate time & space based retention: it treats time-based + /// retention as authoritative if enabled, and falls back to space-based retention if calculating + /// the LSN for a time point isn't possible. Therefore the GcCutoffs::horizon in the response might + /// be different to the `cutoff_horizon` input. Callers should treat the min() of the two cutoffs + /// in the response as the GC cutoff point for the timeline. #[instrument(skip_all, fields(timeline_id=%self.timeline_id))] pub(super) async fn find_gc_cutoffs( &self, @@ -4975,58 +4973,88 @@ impl Timeline { pausable_failpoint!("Timeline::find_gc_cutoffs-pausable"); - // First, calculate pitr_cutoff_timestamp and then convert it to LSN. - // - // Some unit tests depend on garbage-collection working even when - // CLOG data is missing, so that find_lsn_for_timestamp() doesn't - // work, so avoid calling it altogether if time-based retention is not - // configured. It would be pointless anyway. - let pitr_cutoff = if pitr != Duration::ZERO { - let now = SystemTime::now(); - if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) { - let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp); - - match self - .find_lsn_for_timestamp(pitr_timestamp, cancel, ctx) - .await? - { - LsnForTimestamp::Present(lsn) => lsn, - LsnForTimestamp::Future(lsn) => { - // The timestamp is in the future. That sounds impossible, - // but what it really means is that there hasn't been - // any commits since the cutoff timestamp. - // - // In this case we should use the LSN of the most recent commit, - // which is implicitly the last LSN in the log. - debug!("future({})", lsn); - self.get_last_record_lsn() - } - LsnForTimestamp::Past(lsn) => { - debug!("past({})", lsn); - // conservative, safe default is to remove nothing, when we - // have no commit timestamp data available - *self.get_latest_gc_cutoff_lsn() - } - LsnForTimestamp::NoData(lsn) => { - debug!("nodata({})", lsn); - // conservative, safe default is to remove nothing, when we - // have no commit timestamp data available - *self.get_latest_gc_cutoff_lsn() - } - } - } else { - // If we don't have enough data to convert to LSN, - // play safe and don't remove any layers. - *self.get_latest_gc_cutoff_lsn() + if cfg!(test) { + // Unit tests which specify zero PITR interval expect to avoid doing any I/O for timestamp lookup + if pitr == Duration::ZERO { + return Ok(GcCutoffs { + pitr: self.get_last_record_lsn(), + horizon: cutoff_horizon, + }); + } + } + + // Calculate a time-based limit on how much to retain: + // - if PITR interval is set, then this is our cutoff. 
+ // - if PITR interval is not set, then we do a lookup + // based on DEFAULT_PITR_INTERVAL, so that size-based retention (horizon) + // does not result in keeping history around permanently on idle databases. + let time_cutoff = { + let now = SystemTime::now(); + let time_range = if pitr == Duration::ZERO { + humantime::parse_duration(DEFAULT_PITR_INTERVAL).expect("constant is invalid") + } else { + pitr + }; + + // If PITR is so large or `now` is so small that this underflows, we will retain no history (highly unexpected case) + let time_cutoff = now.checked_sub(time_range).unwrap_or(now); + let timestamp = to_pg_timestamp(time_cutoff); + + match self.find_lsn_for_timestamp(timestamp, cancel, ctx).await? { + LsnForTimestamp::Present(lsn) => Some(lsn), + LsnForTimestamp::Future(lsn) => { + // The timestamp is in the future. That sounds impossible, + // but what it really means is that there hasn't been + // any commits since the cutoff timestamp. + // + // In this case we should use the LSN of the most recent commit, + // which is implicitly the last LSN in the log. + debug!("future({})", lsn); + Some(self.get_last_record_lsn()) + } + LsnForTimestamp::Past(lsn) => { + debug!("past({})", lsn); + None + } + LsnForTimestamp::NoData(lsn) => { + debug!("nodata({})", lsn); + None + } } - } else { - // No time-based retention was configured. Interpret this as "keep no history". - self.get_last_record_lsn() }; - Ok(GcCutoffs { - horizon: cutoff_horizon, - pitr: pitr_cutoff, + Ok(match (pitr, time_cutoff) { + (Duration::ZERO, Some(time_cutoff)) => { + // PITR is not set. Retain the size-based limit, or the default time retention, + // whichever requires less data. + GcCutoffs { + pitr: std::cmp::max(time_cutoff, cutoff_horizon), + horizon: std::cmp::max(time_cutoff, cutoff_horizon), + } + } + (Duration::ZERO, None) => { + // PITR is not set, and time lookup failed + GcCutoffs { + pitr: self.get_last_record_lsn(), + horizon: cutoff_horizon, + } + } + (_, None) => { + // PITR interval is set & we didn't look up a timestamp successfully. Conservatively assume PITR + // cannot advance beyond what was already GC'd, and respect space-based retention + GcCutoffs { + pitr: *self.get_latest_gc_cutoff_lsn(), + horizon: cutoff_horizon, + } + } + (_, Some(time_cutoff)) => { + // PITR interval is set and we looked up timestamp successfully. Ignore + // size based retention and make time cutoff authoritative + GcCutoffs { + pitr: time_cutoff, + horizon: time_cutoff, + } + } }) } diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py index eb503ddbfa..f2e3855c12 100644 --- a/test_runner/regress/test_branch_and_gc.py +++ b/test_runner/regress/test_branch_and_gc.py @@ -65,8 +65,8 @@ def test_branch_and_gc(neon_simple_env: NeonEnv, build_type: str): "compaction_period": "1 s", "compaction_threshold": "2", "image_creation_threshold": "1", - # set PITR interval to be small, so we can do GC - "pitr_interval": "1 s", + # Disable PITR, this test will set an explicit space-based GC limit + "pitr_interval": "0 s", } ) From 730db859c741f6e782f721de12e8ec776c4ceb0a Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 15 Jul 2024 20:47:53 +0300 Subject: [PATCH 1191/1571] feat(timeline_detach_ancestor): success idempotency (#8354) Right now timeline detach ancestor reports an error (409, "no ancestor") on a new attempt after successful completion. This makes it troublesome for storage controller retries. 
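To make the retry problem concrete, here is a minimal, hypothetical sketch of the loop a caller such as the storage controller effectively runs around the detach request. The helper name `send_detach_request` and the `ApiError` stand-in are assumptions for illustration only, not the real controller code:

```python
# Hypothetical caller-side sketch: retrying a detach_ancestor request.
import time


class ApiError(Exception):
    """Stand-in for an HTTP error carrying the response status code."""

    def __init__(self, status_code: int, msg: str):
        super().__init__(msg)
        self.status_code = status_code


def detach_with_retries(send_detach_request, attempts: int = 5):
    # send_detach_request() issues the detach_ancestor request and either
    # returns the list of reparented timeline ids (200 OK) or raises ApiError.
    for attempt in range(attempts):
        try:
            # With success idempotency, repeating an already-completed detach
            # also lands here, returning 200 OK and the reparented timelines.
            return send_detach_request()
        except ApiError as e:
            if e.status_code == 503 and attempt + 1 < attempts:
                # transient (tenant activating / shutting down): retry
                time.sleep(1)
                continue
            # Previously a retry after success surfaced as 409 "no ancestor",
            # indistinguishable from the genuine error; with this change a
            # 409/400 can safely be treated as a real failure.
            raise
```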
Fix it to respond with `200 OK` as if the operation had just completed quickly. Additionally, the returned timeline identifiers in the 200 OK response are now ordered so that responses between different nodes for error comparison are done by the storage controller added in #8353. Design-wise, this PR introduces a new strategy for accessing the latest uploaded IndexPart: `RemoteTimelineClient::initialized_upload_queue(&self) -> Result, NotInitialized>`. It should be a more scalable way to query the latest uploaded `IndexPart` than to add a query method for each question directly on `RemoteTimelineClient`. GC blocking will need to be introduced to make the operation fully idempotent. However, it is idempotent for the cases demonstrated by tests. Cc: #6994 --- pageserver/src/http/routes.rs | 45 +- .../src/tenant/remote_timeline_client.rs | 27 +- .../tenant/remote_timeline_client/index.rs | 26 ++ pageserver/src/tenant/timeline.rs | 8 +- .../src/tenant/timeline/detach_ancestor.rs | 130 +++++- pageserver/src/tenant/upload_queue.rs | 10 +- storage_controller/src/service.rs | 9 +- test_runner/fixtures/pageserver/http.py | 21 +- .../regress/test_timeline_detach_ancestor.py | 430 ++++++++++++++++-- 9 files changed, 632 insertions(+), 74 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 6f8f3e6389..d7ef70477f 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1721,7 +1721,9 @@ async fn timeline_detach_ancestor_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - use crate::tenant::timeline::detach_ancestor::Options; + use crate::tenant::timeline::detach_ancestor; + use pageserver_api::models::detach_ancestor::AncestorDetached; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; @@ -1729,7 +1731,7 @@ async fn timeline_detach_ancestor_handler( let span = tracing::info_span!("detach_ancestor", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id); async move { - let mut options = Options::default(); + let mut options = detach_ancestor::Options::default(); let rewrite_concurrency = parse_query_param::<_, std::num::NonZeroUsize>(&request, "rewrite_concurrency")?; @@ -1757,27 +1759,36 @@ async fn timeline_detach_ancestor_handler( let timeline = tenant.get_timeline(timeline_id, true)?; - let (_guard, prepared) = timeline + let progress = timeline .prepare_to_detach_from_ancestor(&tenant, options, ctx) .await?; - let res = state - .tenant_manager - .complete_detaching_timeline_ancestor(tenant_shard_id, timeline_id, prepared, ctx) - .await; + // uncomment to allow early as possible Tenant::drop + // drop(tenant); - match res { - Ok(reparented_timelines) => { - let resp = pageserver_api::models::detach_ancestor::AncestorDetached { + let resp = match progress { + detach_ancestor::Progress::Prepared(_guard, prepared) => { + // it would be great to tag the guard on to the tenant activation future + let reparented_timelines = state + .tenant_manager + .complete_detaching_timeline_ancestor( + tenant_shard_id, + timeline_id, + prepared, + ctx, + ) + .await + .context("timeline detach ancestor completion") + .map_err(ApiError::InternalServerError)?; + + AncestorDetached { reparented_timelines, - }; - - json_response(StatusCode::OK, resp) + } } - Err(e) => Err(ApiError::InternalServerError( - 
e.context("timeline detach completion"), - )), - } + detach_ancestor::Progress::Done(resp) => resp, + }; + + json_response(StatusCode::OK, resp) } .instrument(span) .await diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index bc9364de61..66b759c8e0 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -241,7 +241,7 @@ use self::index::IndexPart; use super::metadata::MetadataUpdate; use super::storage_layer::{Layer, LayerName, ResidentLayer}; -use super::upload_queue::SetDeletedFlagProgress; +use super::upload_queue::{NotInitialized, SetDeletedFlagProgress}; use super::Generation; pub(crate) use download::{ @@ -1930,6 +1930,31 @@ impl RemoteTimelineClient { } } } + + /// Returns an accessor which will hold the UploadQueue mutex for accessing the upload queue + /// externally to RemoteTimelineClient. + pub(crate) fn initialized_upload_queue( + &self, + ) -> Result, NotInitialized> { + let mut inner = self.upload_queue.lock().unwrap(); + inner.initialized_mut()?; + Ok(UploadQueueAccessor { inner }) + } +} + +pub(crate) struct UploadQueueAccessor<'a> { + inner: std::sync::MutexGuard<'a, UploadQueue>, +} + +impl<'a> UploadQueueAccessor<'a> { + pub(crate) fn latest_uploaded_index_part(&self) -> &IndexPart { + match &*self.inner { + UploadQueue::Initialized(x) => &x.clean.0, + UploadQueue::Uninitialized | UploadQueue::Stopped(_) => { + unreachable!("checked before constructing") + } + } + } } pub fn remote_tenant_path(tenant_shard_id: &TenantShardId) -> RemotePath { diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 6233a3477e..b439df8edb 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -176,6 +176,24 @@ pub(crate) struct Lineage { /// /// If you are adding support for detaching from a hierarchy, consider changing the ancestry /// into a `Vec<(TimelineId, Lsn)>` to be a path instead. + // FIXME: this is insufficient even for path of two timelines for future wal recovery + // purposes: + // + // assuming a "old main" which has received most of the WAL, and has a branch "new main", + // starting a bit before "old main" last_record_lsn. the current version works fine, + // because we will know to replay wal and branch at the recorded Lsn to do wal recovery. + // + // then assuming "new main" would similarly receive a branch right before its last_record_lsn, + // "new new main". the current implementation would just store ("new main", ancestor_lsn, _) + // here. 
however, we cannot recover from WAL using only that information, we would need the + // whole ancestry here: + // + // ```json + // [ + // ["old main", ancestor_lsn("new main"), _], + // ["new main", ancestor_lsn("new new main"), _] + // ] + // ``` #[serde(skip_serializing_if = "Option::is_none", default)] original_ancestor: Option<(TimelineId, Lsn, NaiveDateTime)>, } @@ -217,6 +235,14 @@ impl Lineage { self.original_ancestor .is_some_and(|(_, ancestor_lsn, _)| ancestor_lsn == lsn) } + + pub(crate) fn is_detached_from_original_ancestor(&self) -> bool { + self.original_ancestor.is_some() + } + + pub(crate) fn is_reparented(&self) -> bool { + !self.reparenting_history.is_empty() + } } #[cfg(test)] diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 0996616a67..239dce8786 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4733,13 +4733,7 @@ impl Timeline { tenant: &crate::tenant::Tenant, options: detach_ancestor::Options, ctx: &RequestContext, - ) -> Result< - ( - completion::Completion, - detach_ancestor::PreparedTimelineDetach, - ), - detach_ancestor::Error, - > { + ) -> Result { detach_ancestor::prepare(self, tenant, options, ctx).await } diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 4fc89330ba..49ce3db3e6 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -10,6 +10,7 @@ use crate::{ }, virtual_file::{MaybeFatalIo, VirtualFile}, }; +use pageserver_api::models::detach_ancestor::AncestorDetached; use tokio_util::sync::CancellationToken; use tracing::Instrument; use utils::{completion, generation::Generation, http::error::ApiError, id::TimelineId, lsn::Lsn}; @@ -39,6 +40,9 @@ pub(crate) enum Error { #[error("unexpected error")] Unexpected(#[source] anyhow::Error), + + #[error("failpoint: {}", .0)] + Failpoint(&'static str), } impl From for ApiError { @@ -57,11 +61,41 @@ impl From for ApiError { | e @ Error::CopyDeltaPrefix(_) | e @ Error::UploadRewritten(_) | e @ Error::CopyFailed(_) - | e @ Error::Unexpected(_) => ApiError::InternalServerError(e.into()), + | e @ Error::Unexpected(_) + | e @ Error::Failpoint(_) => ApiError::InternalServerError(e.into()), } } } +impl From for Error { + fn from(_: crate::tenant::upload_queue::NotInitialized) -> Self { + // treat all as shutting down signals, even though that is not entirely correct + // (uninitialized state) + Error::ShuttingDown + } +} + +impl From for Error { + fn from(value: FlushLayerError) -> Self { + match value { + FlushLayerError::Cancelled => Error::ShuttingDown, + FlushLayerError::NotRunning(_) => { + // FIXME(#6424): technically statically unreachable right now, given how we never + // drop the sender + Error::ShuttingDown + } + FlushLayerError::CreateImageLayersError(_) | FlushLayerError::Other(_) => { + Error::FlushAncestor(value) + } + } + } +} + +pub(crate) enum Progress { + Prepared(completion::Completion, PreparedTimelineDetach), + Done(AncestorDetached), +} + pub(crate) struct PreparedTimelineDetach { layers: Vec, } @@ -88,7 +122,7 @@ pub(super) async fn prepare( tenant: &Tenant, options: Options, ctx: &RequestContext, -) -> Result<(completion::Completion, PreparedTimelineDetach), Error> { +) -> Result { use Error::*; let Some((ancestor, ancestor_lsn)) = detached @@ -96,15 +130,67 @@ pub(super) async fn prepare( .as_ref() .map(|tl| (tl.clone(), detached.ancestor_lsn)) else { - // TODO: check if we have 
already been detached; for this we need to read the stored data - // on remote client, for that we need a follow-up which makes uploads cheaper and maintains - // a projection of the commited data. + { + let accessor = detached.remote_client.initialized_upload_queue()?; + + // we are safe to inspect the latest uploaded, because we can only witness this after + // restart is complete and ancestor is no more. + let latest = accessor.latest_uploaded_index_part(); + if !latest.lineage.is_detached_from_original_ancestor() { + return Err(NoAncestor); + } + } + + // detached has previously been detached; let's inspect each of the current timelines and + // report back the timelines which have been reparented by our detach + let mut all_direct_children = tenant + .timelines + .lock() + .unwrap() + .values() + .filter(|tl| matches!(tl.ancestor_timeline.as_ref(), Some(ancestor) if Arc::ptr_eq(ancestor, detached))) + .map(|tl| (tl.ancestor_lsn, tl.clone())) + .collect::>(); + + let mut any_shutdown = false; + + all_direct_children.retain( + |(_, tl)| match tl.remote_client.initialized_upload_queue() { + Ok(accessor) => accessor + .latest_uploaded_index_part() + .lineage + .is_reparented(), + Err(_shutdownalike) => { + // not 100% a shutdown, but let's bail early not to give inconsistent results in + // sharded enviroment. + any_shutdown = true; + true + } + }, + ); + + if any_shutdown { + // it could be one or many being deleted; have client retry + return Err(Error::ShuttingDown); + } + + let mut reparented = all_direct_children; + // why this instead of hashset? there is a reason, but I've forgotten it many times. // - // the error is wrong per openapi - return Err(NoAncestor); + // maybe if this was a hashset we would not be able to distinguish some race condition. + reparented.sort_unstable_by_key(|(lsn, tl)| (*lsn, tl.timeline_id)); + + return Ok(Progress::Done(AncestorDetached { + reparented_timelines: reparented + .into_iter() + .map(|(_, tl)| tl.timeline_id) + .collect(), + })); }; if !ancestor_lsn.is_valid() { + // rare case, probably wouldn't even load + tracing::error!("ancestor is set, but ancestor_lsn is invalid, this timeline needs fixing"); return Err(NoAncestor); } @@ -131,6 +217,15 @@ pub(super) async fn prepare( let _gate_entered = detached.gate.enter().map_err(|_| ShuttingDown)?; + utils::pausable_failpoint!("timeline-detach-ancestor::before_starting_after_locking_pausable"); + + fail::fail_point!( + "timeline-detach-ancestor::before_starting_after_locking", + |_| Err(Error::Failpoint( + "timeline-detach-ancestor::before_starting_after_locking" + )) + ); + if ancestor_lsn >= ancestor.get_disk_consistent_lsn() { let span = tracing::info_span!("freeze_and_flush", ancestor_timeline_id=%ancestor.timeline_id); @@ -151,7 +246,7 @@ pub(super) async fn prepare( } }; - res.map_err(FlushAncestor)?; + res?; // we do not need to wait for uploads to complete but we do need `struct Layer`, // copying delta prefix is unsupported currently for `InMemoryLayer`. 
@@ -159,7 +254,7 @@ pub(super) async fn prepare( elapsed_ms = started_at.elapsed().as_millis(), "froze and flushed the ancestor" ); - Ok(()) + Ok::<_, Error>(()) } .instrument(span) .await?; @@ -283,7 +378,7 @@ pub(super) async fn prepare( let prepared = PreparedTimelineDetach { layers: new_layers }; - Ok((guard, prepared)) + Ok(Progress::Prepared(guard, prepared)) } fn partition_work( @@ -350,7 +445,11 @@ async fn copy_lsn_prefix( target_timeline: &Arc, ctx: &RequestContext, ) -> Result, Error> { - use Error::{CopyDeltaPrefix, RewrittenDeltaDownloadFailed}; + use Error::{CopyDeltaPrefix, RewrittenDeltaDownloadFailed, ShuttingDown}; + + if target_timeline.cancel.is_cancelled() { + return Err(ShuttingDown); + } tracing::debug!(%layer, %end_lsn, "copying lsn prefix"); @@ -529,7 +628,7 @@ pub(super) async fn complete( match res { Ok(Some(timeline)) => { tracing::info!(reparented=%timeline.timeline_id, "reparenting done"); - reparented.push(timeline.timeline_id); + reparented.push((timeline.ancestor_lsn, timeline.timeline_id)); } Ok(None) => { // lets just ignore this for now. one or all reparented timelines could had @@ -551,5 +650,12 @@ pub(super) async fn complete( tracing::info!("failed to reparent some candidates"); } + reparented.sort_unstable(); + + let reparented = reparented + .into_iter() + .map(|(_, timeline_id)| timeline_id) + .collect(); + Ok(reparented) } diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index 50c977a950..f7440ecdae 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -228,18 +228,20 @@ impl UploadQueue { Ok(self.initialized_mut().expect("we just set it")) } - pub(crate) fn initialized_mut(&mut self) -> anyhow::Result<&mut UploadQueueInitialized> { + pub(crate) fn initialized_mut( + &mut self, + ) -> Result<&mut UploadQueueInitialized, NotInitialized> { use UploadQueue::*; match self { - Uninitialized => Err(NotInitialized::Uninitialized.into()), + Uninitialized => Err(NotInitialized::Uninitialized), Initialized(x) => { if x.shutting_down { - Err(NotInitialized::ShuttingDown.into()) + Err(NotInitialized::ShuttingDown) } else { Ok(x) } } - Stopped(_) => Err(NotInitialized::Stopped.into()), + Stopped(_) => Err(NotInitialized::Stopped), } } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 95522525cb..3c24433c42 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -2830,9 +2830,10 @@ impl Service { match e { // no ancestor (ever) - Error::ApiError(StatusCode::CONFLICT, msg) => { - ApiError::Conflict(format!("{node}: {msg}")) - } + Error::ApiError(StatusCode::CONFLICT, msg) => ApiError::Conflict(format!( + "{node}: {}", + msg.strip_prefix("Conflict: ").unwrap_or(&msg) + )), // too many ancestors Error::ApiError(StatusCode::BAD_REQUEST, msg) => { ApiError::BadRequest(anyhow::anyhow!("{node}: {msg}")) @@ -2859,8 +2860,6 @@ impl Service { let any = results.pop().expect("we must have at least one response"); - // FIXME: the ordering is not stable yet on pageserver, should be (ancestor_lsn, - // TimelineId) let mismatching = results .iter() .filter(|(_, res)| res != &any.1) diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 03aee9e5c5..d66b94948a 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -172,6 +172,21 @@ class PageserverHttpClient(requests.Session, MetricsGetter): if auth_token is not None: 
self.headers["Authorization"] = f"Bearer {auth_token}" + def without_status_retrying(self) -> PageserverHttpClient: + retries = Retry( + status=0, + connect=5, + read=False, + backoff_factor=0.2, + status_forcelist=[], + allowed_methods=None, + remove_headers_on_redirect=[], + ) + + return PageserverHttpClient( + self.port, self.is_testing_enabled_or_skip, self.auth_token, retries + ) + @property def base_url(self) -> str: return f"http://localhost:{self.port}" @@ -814,17 +829,19 @@ class PageserverHttpClient(requests.Session, MetricsGetter): tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, batch_size: int | None = None, - ) -> Set[TimelineId]: + **kwargs, + ) -> List[TimelineId]: params = {} if batch_size is not None: params["batch_size"] = batch_size res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/detach_ancestor", params=params, + **kwargs, ) self.verbose_error(res) json = res.json() - return set(map(TimelineId, json["reparented_timelines"])) + return list(map(TimelineId, json["reparented_timelines"])) def evict_layer( self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 803fcac583..d75ab4c060 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -1,5 +1,7 @@ import datetime import enum +import threading +import time from concurrent.futures import ThreadPoolExecutor from queue import Empty, Queue from threading import Barrier @@ -9,6 +11,7 @@ import pytest from fixtures.common_types import Lsn, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( + LogCursor, NeonEnvBuilder, PgBin, flush_ep_to_pageserver, @@ -17,7 +20,8 @@ from fixtures.neon_fixtures import ( from fixtures.pageserver.http import HistoricLayerInfo, PageserverApiException from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_timeline_detail_404 from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind -from fixtures.utils import assert_pageserver_backups_equal +from fixtures.utils import assert_pageserver_backups_equal, wait_until +from requests import ReadTimeout def by_end_lsn(info: HistoricLayerInfo) -> Lsn: @@ -161,7 +165,7 @@ def test_ancestor_detach_branched_from( ) all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id) - assert all_reparented == set() + assert all_reparented == [] if restart_after: env.pageserver.stop() @@ -270,7 +274,7 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder): after = env.neon_cli.create_branch("after", "main", env.initial_tenant, ancestor_start_lsn=None) all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id) - assert all_reparented == {reparented, same_branchpoint} + assert set(all_reparented) == {reparented, same_branchpoint} env.pageserver.quiesce_tenants() @@ -530,7 +534,7 @@ def test_compaction_induced_by_detaches_in_history( for _, timeline_id in skip_main: reparented = client.detach_ancestor(env.initial_tenant, timeline_id) - assert reparented == set(), "we have no earlier branches at any level" + assert reparented == [], "we have no earlier branches at any level" post_detach_l0s = list(filter(lambda x: x.l0, delta_layers(branch_timeline_id))) assert len(post_detach_l0s) == 5, "should had inherited 4 L0s, have 5 in total" @@ -561,7 +565,9 @@ def 
test_compaction_induced_by_detaches_in_history( @pytest.mark.parametrize("sharded", [True, False]) -def test_timeline_ancestor_detach_errors(neon_env_builder: NeonEnvBuilder, sharded: bool): +def test_timeline_ancestor_detach_idempotent_success( + neon_env_builder: NeonEnvBuilder, sharded: bool +): shards = 2 if sharded else 1 neon_env_builder.num_pageservers = shards @@ -579,28 +585,28 @@ def test_timeline_ancestor_detach_errors(neon_env_builder: NeonEnvBuilder, shard else: client = env.pageserver.http_client() - with pytest.raises(PageserverApiException, match=".* no ancestors") as info: - client.detach_ancestor(env.initial_tenant, env.initial_timeline) - assert info.value.status_code == 409 - first_branch = env.neon_cli.create_branch("first_branch") - second_branch = env.neon_cli.create_branch("second_branch", ancestor_branch_name="first_branch") - # funnily enough this does not have a prefix - with pytest.raises(PageserverApiException, match="too many ancestors") as info: - client.detach_ancestor(env.initial_tenant, second_branch) - assert info.value.status_code == 400 + _ = env.neon_cli.create_branch("second_branch", ancestor_branch_name="first_branch") - client.detach_ancestor(env.initial_tenant, first_branch) + # these two will be reparented, and they should be returned in stable order + # from pageservers OR otherwise there will be an `error!` logging from + # storage controller + reparented1 = env.neon_cli.create_branch("first_reparented", ancestor_branch_name="main") + reparented2 = env.neon_cli.create_branch("second_reparented", ancestor_branch_name="main") + + first_reparenting_response = client.detach_ancestor(env.initial_tenant, first_branch) + assert set(first_reparenting_response) == {reparented1, reparented2} # FIXME: this should be done by the http req handler for ps in pageservers.values(): ps.quiesce_tenants() - with pytest.raises(PageserverApiException, match=".* no ancestors") as info: - client.detach_ancestor(env.initial_tenant, first_branch) - # FIXME: this should be 200 OK because we've already completed it - assert info.value.status_code == 409 + for _ in range(5): + # once completed, we can retry this how many times + assert ( + client.detach_ancestor(env.initial_tenant, first_branch) == first_reparenting_response + ) client.tenant_delete(env.initial_tenant) @@ -609,7 +615,50 @@ def test_timeline_ancestor_detach_errors(neon_env_builder: NeonEnvBuilder, shard assert e.value.status_code == 404 +@pytest.mark.parametrize("sharded", [True, False]) +def test_timeline_ancestor_detach_errors(neon_env_builder: NeonEnvBuilder, sharded: bool): + # the test is split from test_timeline_ancestor_detach_idempotent_success as only these error cases should create "request was dropped before completing", + # given the current first error handling + shards = 2 if sharded else 1 + + neon_env_builder.num_pageservers = shards + env = neon_env_builder.init_start(initial_tenant_shard_count=shards if sharded else None) + + pageservers = dict((int(p.id), p) for p in env.pageservers) + + for ps in pageservers.values(): + ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + ps.allowed_errors.append( + ".* WARN .* path=/v1/tenant/.*/timeline/.*/detach_ancestor request_id=.*: request was dropped before completing" + ) + + client = ( + env.pageserver.http_client() if not sharded else env.storage_controller.pageserver_api() + ) + + with pytest.raises(PageserverApiException, match=".* no ancestors") as info: + client.detach_ancestor(env.initial_tenant, env.initial_timeline) + assert 
info.value.status_code == 409 + + _ = env.neon_cli.create_branch("first_branch") + + second_branch = env.neon_cli.create_branch("second_branch", ancestor_branch_name="first_branch") + + # funnily enough this does not have a prefix + with pytest.raises(PageserverApiException, match="too many ancestors") as info: + client.detach_ancestor(env.initial_tenant, second_branch) + assert info.value.status_code == 400 + + def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder): + """ + Sharded timeline detach ancestor; 4 nodes: 1 stuck, 1 restarted, 2 normal. + + Stuck node gets stuck on a pause failpoint for first storage controller request. + Restarted node remains stuck until explicit restart from test code. + + We retry the request until storage controller gets 200 OK from all nodes. + """ branch_name = "soon_detached" shard_count = 4 neon_env_builder.num_pageservers = shard_count @@ -621,8 +670,15 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder): # FIXME: should this be in the neon_env_builder.init_start? env.storage_controller.reconcile_until_idle() + # as we will stop a node, make sure there is no clever rebalancing + env.storage_controller.tenant_policy_update(env.initial_tenant, body={"scheduling": "Stop"}) + env.storage_controller.allowed_errors.append(".*: Scheduling is disabled by policy Stop .*") + shards = env.storage_controller.locate(env.initial_tenant) + utilized_pageservers = {x["node_id"] for x in shards} + assert len(utilized_pageservers) > 1, "all shards got placed on single pageserver?" + branch_timeline_id = env.neon_cli.create_branch(branch_name, tenant_id=env.initial_tenant) with env.endpoints.create_start(branch_name, tenant_id=env.initial_tenant) as ep: @@ -642,7 +698,79 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder): assert Lsn(detail["initdb_lsn"]) < lsn assert TimelineId(detail["ancestor_timeline_id"]) == env.initial_timeline - env.storage_controller.pageserver_api().detach_ancestor(env.initial_tenant, branch_timeline_id) + # make one of the nodes get stuck, but continue the initial operation + # make another of the nodes get stuck, then restart + + stuck = pageservers[int(shards[0]["node_id"])] + stuck.allowed_errors.append(".*: request was dropped before completing") + env.storage_controller.allowed_errors.append(".*: request was dropped before completing") + stuck_http = stuck.http_client() + stuck_http.configure_failpoints( + ("timeline-detach-ancestor::before_starting_after_locking_pausable", "pause") + ) + + restarted = pageservers[int(shards[1]["node_id"])] + restarted.allowed_errors.extend( + [ + ".*: request was dropped before completing", + ".*: Cancelled request finished with an error: ShuttingDown", + ] + ) + assert restarted.id != stuck.id + restarted_http = restarted.http_client() + restarted_http.configure_failpoints( + [ + ("timeline-detach-ancestor::before_starting_after_locking_pausable", "pause"), + ] + ) + + target = env.storage_controller.pageserver_api() + + with pytest.raises(ReadTimeout): + target.detach_ancestor(env.initial_tenant, branch_timeline_id, timeout=1) + + stuck_http.configure_failpoints( + ("timeline-detach-ancestor::before_starting_after_locking_pausable", "off") + ) + + barrier = threading.Barrier(2) + + def restart_restarted(): + barrier.wait() + # graceful shutdown should just work, because simultaneously unpaused + restarted.stop() + # this does not happen always, depends how fast we exit after unpausing + # restarted.assert_log_contains("Cancelled 
request finished with an error: ShuttingDown") + restarted.start() + + with ThreadPoolExecutor(max_workers=1) as pool: + fut = pool.submit(restart_restarted) + barrier.wait() + # we have 10s, lets use 1/2 of that to help the shutdown start + time.sleep(5) + restarted_http.configure_failpoints( + ("timeline-detach-ancestor::before_starting_after_locking_pausable", "off") + ) + fut.result() + + # detach ancestor request handling is not sensitive to http cancellation. + # this means that the "stuck" is on its way to complete the detach, but the restarted is off + # now it can either be complete on all nodes, or still in progress with + # one. + without_retrying = target.without_status_retrying() + + # this retry loop will be long enough that the tenant can always activate + reparented = None + for _ in range(10): + try: + reparented = without_retrying.detach_ancestor(env.initial_tenant, branch_timeline_id) + except PageserverApiException as info: + assert info.status_code == 503 + time.sleep(2) + else: + break + + assert reparented == [], "too many retries (None) or unexpected reparentings" for shard_info in shards: node_id = int(shard_info["node_id"]) @@ -661,8 +789,262 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder): assert count == 10000 +@pytest.mark.parametrize("mode", ["delete_timeline", "delete_tenant"]) +@pytest.mark.parametrize("sharded", [False, True]) +def test_timeline_detach_ancestor_interrupted_by_deletion( + neon_env_builder: NeonEnvBuilder, mode: str, sharded: bool +): + """ + Timeline ancestor detach interrupted by deleting either: + - the detached timeline + - the whole tenant + + after starting the detach. + + What remains not tested by this: + - shutdown winning over complete + + Shutdown winning over complete needs gc blocking and reparenting any left-overs on retry. 
+ """ + + if sharded and mode == "delete_tenant": + # the shared/exclusive lock for tenant is blocking this: + # timeline detach ancestor takes shared, delete tenant takes exclusive + pytest.skip( + "tenant deletion while timeline ancestor detach is underway is not supported yet" + ) + + shard_count = 2 if sharded else 1 + + neon_env_builder.num_pageservers = shard_count + + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count if sharded else None) + + for ps in env.pageservers: + ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + pageservers = dict((int(p.id), p) for p in env.pageservers) + + detached_timeline = env.neon_cli.create_branch("detached soon", "main") + + failpoint = "timeline-detach-ancestor::before_starting_after_locking_pausable" + + env.storage_controller.reconcile_until_idle() + shards = env.storage_controller.locate(env.initial_tenant) + + assert len(set(info["node_id"] for info in shards)) == shard_count + + target = env.storage_controller.pageserver_api() if sharded else env.pageserver.http_client() + target = target.without_status_retrying() + + victim = pageservers[int(shards[-1]["node_id"])] + victim_http = victim.http_client() + victim_http.configure_failpoints((failpoint, "pause")) + + def detach_ancestor(): + target.detach_ancestor(env.initial_tenant, detached_timeline) + + def at_failpoint() -> Tuple[str, LogCursor]: + return victim.assert_log_contains(f"at failpoint {failpoint}") + + def start_delete(): + if mode == "delete_timeline": + target.timeline_delete(env.initial_tenant, detached_timeline) + elif mode == "delete_tenant": + target.tenant_delete(env.initial_tenant) + else: + raise RuntimeError(f"unimplemented mode {mode}") + + def at_waiting_on_gate_close(start_offset: LogCursor) -> LogCursor: + _, offset = victim.assert_log_contains( + "closing is taking longer than expected", offset=start_offset + ) + return offset + + def is_deleted(): + try: + if mode == "delete_timeline": + target.timeline_detail(env.initial_tenant, detached_timeline) + elif mode == "delete_tenant": + target.tenant_status(env.initial_tenant) + else: + return False + except PageserverApiException as e: + assert e.status_code == 404 + return True + else: + raise RuntimeError("waiting for 404") + + with ThreadPoolExecutor(max_workers=2) as pool: + try: + fut = pool.submit(detach_ancestor) + _, offset = wait_until(10, 1.0, at_failpoint) + + delete = pool.submit(start_delete) + + wait_until(10, 1.0, lambda: at_waiting_on_gate_close(offset)) + + victim_http.configure_failpoints((failpoint, "off")) + + delete.result() + + assert wait_until(10, 1.0, is_deleted), f"unimplemented mode {mode}" + + with pytest.raises(PageserverApiException) as exc: + fut.result() + assert exc.value.status_code == 503 + finally: + victim_http.configure_failpoints((failpoint, "off")) + + +@pytest.mark.parametrize("mode", ["delete_reparentable_timeline"]) +def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnvBuilder, mode: str): + """ + Technically possible storage controller concurrent interleaving timeline + deletion with timeline detach. + + Deletion is fine, as any sharded pageservers reach the same end state, but + creating reparentable timeline would create an issue as the two nodes would + never agree. There is a solution though: the created reparentable timeline + must be detached. 
+ """ + + assert ( + mode == "delete_reparentable_timeline" + ), "only one now, but we could have the create just as well, need gc blocking" + + shard_count = 2 + neon_env_builder.num_pageservers = shard_count + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + + for ps in env.pageservers: + ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + pageservers = dict((int(p.id), p) for p in env.pageservers) + + env.storage_controller.reconcile_until_idle() + shards = env.storage_controller.locate(env.initial_tenant) + assert len(set(x["node_id"] for x in shards)) == shard_count + + with env.endpoints.create_start("main") as ep: + ep.safe_psql("create table foo as select i::bigint from generate_series(1, 1000) t(i)") + + # as the interleaved operation, we will delete this timeline, which was reparenting candidate + first_branch_lsn = wait_for_last_flush_lsn( + env, ep, env.initial_tenant, env.initial_timeline + ) + for ps, shard_id in [(pageservers[int(x["node_id"])], x["shard_id"]) for x in shards]: + ps.http_client().timeline_checkpoint(shard_id, env.initial_timeline) + + ep.safe_psql("create table bar as select i::bigint from generate_series(1, 2000) t(i)") + detached_branch_lsn = flush_ep_to_pageserver( + env, ep, env.initial_tenant, env.initial_timeline + ) + + for ps, shard_id in [(pageservers[int(x["node_id"])], x["shard_id"]) for x in shards]: + ps.http_client().timeline_checkpoint(shard_id, env.initial_timeline) + + first_branch = env.neon_cli.create_branch( + "first_branch", ancestor_branch_name="main", ancestor_start_lsn=first_branch_lsn + ) + detached_branch = env.neon_cli.create_branch( + "detached_branch", ancestor_branch_name="main", ancestor_start_lsn=detached_branch_lsn + ) + + pausepoint = "timeline-detach-ancestor::before_starting_after_locking_pausable" + + stuck = pageservers[int(shards[0]["node_id"])] + stuck_http = stuck.http_client().without_status_retrying() + stuck_http.configure_failpoints((pausepoint, "pause")) + + victim = pageservers[int(shards[-1]["node_id"])] + victim_http = victim.http_client().without_status_retrying() + victim_http.configure_failpoints( + (pausepoint, "pause"), + ) + + # noticed a surprising 409 if the other one would fail instead + # victim_http.configure_failpoints([ + # (pausepoint, "pause"), + # ("timeline-detach-ancestor::before_starting_after_locking", "return"), + # ]) + + # interleaving a create_timeline which could be reparented will produce two + # permanently different reparentings: one node has reparented, other has + # not + # + # with deletion there is no such problem + def detach_timeline(): + env.storage_controller.pageserver_api().detach_ancestor(env.initial_tenant, detached_branch) + + def paused_at_failpoint(): + stuck.assert_log_contains(f"at failpoint {pausepoint}") + victim.assert_log_contains(f"at failpoint {pausepoint}") + + def first_completed(): + detail = stuck_http.timeline_detail(shards[0]["shard_id"], detached_branch) + log.info(detail) + assert detail.get("ancestor_lsn") is None + + def first_branch_gone(): + try: + env.storage_controller.pageserver_api().timeline_detail( + env.initial_tenant, first_branch + ) + except PageserverApiException as e: + log.info(f"error {e}") + assert e.status_code == 404 + else: + log.info("still ok") + raise RuntimeError("not done yet") + + with ThreadPoolExecutor(max_workers=1) as pool: + try: + fut = pool.submit(detach_timeline) + wait_until(10, 1.0, paused_at_failpoint) + + # let stuck complete + stuck_http.configure_failpoints((pausepoint, "off")) + 
wait_until(10, 1.0, first_completed) + + # if we would let victim fail, for some reason there'd be a 409 response instead of 500 + # victim_http.configure_failpoints((pausepoint, "off")) + # with pytest.raises(PageserverApiException, match=".* 500 Internal Server Error failpoint: timeline-detach-ancestor::before_starting_after_locking") as exc: + # fut.result() + # assert exc.value.status_code == 409 + + env.storage_controller.pageserver_api().timeline_delete( + env.initial_tenant, first_branch + ) + victim_http.configure_failpoints((pausepoint, "off")) + wait_until(10, 1.0, first_branch_gone) + + # it now passes, and we should get an error messages about mixed reparenting as the stuck still had something to reparent + fut.result() + + msg, offset = env.storage_controller.assert_log_contains( + ".*/timeline/\\S+/detach_ancestor.*: shards returned different results matching=0 .*" + ) + log.info(f"expected error message: {msg}") + env.storage_controller.allowed_errors.append( + ".*: shards returned different results matching=0 .*" + ) + + detach_timeline() + + # FIXME: perhaps the above should be automatically retried, if we get mixed results? + not_found = env.storage_controller.log_contains( + ".*/timeline/\\S+/detach_ancestor.*: shards returned different results matching=0 .*", + offset=offset, + ) + + assert not_found is None + finally: + stuck_http.configure_failpoints((pausepoint, "off")) + victim_http.configure_failpoints((pausepoint, "off")) + + # TODO: -# - after starting the operation, tenant is deleted # - after starting the operation, pageserver is shutdown, restarted # - after starting the operation, bottom-most timeline is deleted, pageserver is restarted, gc is inhibited # - deletion of reparented while reparenting should fail once, then succeed (?) @@ -670,9 +1052,5 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder): # - investigate: why are layers started at uneven lsn? not just after branching, but in general. # # TEST: 1. tad which partially succeeds, one returns 500 -# 2. create branch below timeline? or delete timeline below +# 2. create branch below timeline? ~or delete reparented timeline~ (done) # 3. on retry all should report the same reparented timelines -# -# TEST: 1. tad is started, one node stalls, other restarts -# 2. client timeout before stall over -# 3. 
on retry with stalled and other being able to proceed From 7eb37fea26ab7ed3312a82617cef33af03476999 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Mon, 15 Jul 2024 14:55:57 -0700 Subject: [PATCH 1192/1571] Allow reusing projects between runs of logical replication benchmarks (#8393) --- test_runner/fixtures/neon_api.py | 44 +++ test_runner/fixtures/neon_fixtures.py | 14 +- .../performance/test_logical_replication.py | 335 +++++++----------- 3 files changed, 179 insertions(+), 214 deletions(-) diff --git a/test_runner/fixtures/neon_api.py b/test_runner/fixtures/neon_api.py index 39baf5fab6..658ed119a1 100644 --- a/test_runner/fixtures/neon_api.py +++ b/test_runner/fixtures/neon_api.py @@ -261,3 +261,47 @@ class NeonAPI: if op["status"] in {"scheduling", "running", "cancelling"}: has_running = True time.sleep(0.5) + + +class NeonApiEndpoint: + def __init__(self, neon_api: NeonAPI, pg_version: PgVersion, project_id: Optional[str]): + self.neon_api = neon_api + if project_id is None: + project = neon_api.create_project(pg_version) + neon_api.wait_for_operation_to_finish(project["project"]["id"]) + self.project_id = project["project"]["id"] + self.endpoint_id = project["endpoints"][0]["id"] + self.connstr = project["connection_uris"][0]["connection_uri"] + self.pgbench_env = connection_parameters_to_env( + project["connection_uris"][0]["connection_parameters"] + ) + self.is_new = True + else: + project = neon_api.get_project_details(project_id) + if int(project["project"]["pg_version"]) != int(pg_version): + raise Exception( + f"A project with the provided ID exists, but it's not of the specified version (expected {pg_version}, got {project['project']['pg_version']})" + ) + self.project_id = project_id + eps = neon_api.get_endpoints(project_id)["endpoints"] + self.endpoint_id = eps[0]["id"] + self.connstr = neon_api.get_connection_uri(project_id, endpoint_id=self.endpoint_id)[ + "uri" + ] + pw = self.connstr.split("@")[0].split(":")[-1] + self.pgbench_env = { + "PGHOST": eps[0]["host"], + "PGDATABASE": "neondb", + "PGUSER": "neondb_owner", + "PGPASSWORD": pw, + } + self.is_new = False + + def restart(self): + self.neon_api.restart_endpoint(self.project_id, self.endpoint_id) + self.neon_api.wait_for_operation_to_finish(self.project_id) + + def get_synthetic_storage_size(self) -> int: + return int( + self.neon_api.get_project_details(self.project_id)["project"]["synthetic_storage_size"] + ) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 90ed838e1d..fe4a334458 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -87,7 +87,7 @@ from fixtures.utils import ( ) from fixtures.utils import AuxFileStore as AuxFileStore # reexport -from .neon_api import NeonAPI +from .neon_api import NeonAPI, NeonApiEndpoint """ This file contains pytest fixtures. 
A fixture is a test resource that can be @@ -3158,6 +3158,18 @@ class RemotePostgres(PgProtocol): pass +@pytest.fixture(scope="function") +def benchmark_project_pub(neon_api: NeonAPI, pg_version: PgVersion) -> NeonApiEndpoint: + project_id = os.getenv("BENCHMARK_PROJECT_ID_PUB") + return NeonApiEndpoint(neon_api, pg_version, project_id) + + +@pytest.fixture(scope="function") +def benchmark_project_sub(neon_api: NeonAPI, pg_version: PgVersion) -> NeonApiEndpoint: + project_id = os.getenv("BENCHMARK_PROJECT_ID_SUB") + return NeonApiEndpoint(neon_api, pg_version, project_id) + + @pytest.fixture(scope="function") def remote_pg( test_output_dir: Path, pg_distrib_dir: Path, pg_version: PgVersion diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index 5ab83dd31d..53bb29a659 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -1,7 +1,6 @@ from __future__ import annotations import time -import traceback from typing import TYPE_CHECKING import psycopg2 @@ -10,15 +9,12 @@ import pytest from fixtures.benchmark_fixture import MetricReport from fixtures.common_types import Lsn from fixtures.log_helper import log -from fixtures.neon_api import connection_parameters_to_env from fixtures.neon_fixtures import AuxFileStore, logical_replication_sync -from fixtures.pg_version import PgVersion if TYPE_CHECKING: from fixtures.benchmark_fixture import NeonBenchmarker - from fixtures.neon_api import NeonAPI + from fixtures.neon_api import NeonApiEndpoint from fixtures.neon_fixtures import NeonEnv, PgBin - from fixtures.pg_version import PgVersion @pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.V2]) @@ -86,8 +82,8 @@ def measure_logical_replication_lag(sub_cur, pub_cur, timeout_sec=600): @pytest.mark.timeout(2 * 60 * 60) def test_subscriber_lag( pg_bin: PgBin, - neon_api: NeonAPI, - pg_version: PgVersion, + benchmark_project_pub: NeonApiEndpoint, + benchmark_project_sub: NeonApiEndpoint, zenbenchmark: NeonBenchmarker, ): """ @@ -99,125 +95,82 @@ def test_subscriber_lag( sync_interval_min = 5 pgbench_duration = f"-T{test_duration_min * 60 * 2}" - pub_project = neon_api.create_project(pg_version) - pub_project_id = pub_project["project"]["id"] - neon_api.wait_for_operation_to_finish(pub_project_id) - error_occurred = False + pub_env = benchmark_project_pub.pgbench_env + sub_env = benchmark_project_sub.pgbench_env + pub_connstr = benchmark_project_pub.connstr + sub_connstr = benchmark_project_sub.connstr + + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) + + pub_conn = psycopg2.connect(pub_connstr) + sub_conn = psycopg2.connect(sub_connstr) + pub_conn.autocommit = True + sub_conn.autocommit = True + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + if benchmark_project_pub.is_new: + pub_cur.execute("create publication pub1 for table pgbench_accounts, pgbench_history") + + if benchmark_project_sub.is_new: + sub_cur.execute("truncate table pgbench_accounts") + sub_cur.execute("truncate table pgbench_history") + + sub_cur.execute(f"create subscription sub1 connection '{pub_connstr}' publication pub1") + + initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur) + pub_conn.close() + sub_conn.close() + + zenbenchmark.record("initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER) + + pub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", 
pgbench_duration, "-Mprepared"], env=pub_env + ) try: - sub_project = neon_api.create_project(pg_version) - sub_project_id = sub_project["project"]["id"] - sub_endpoint_id = sub_project["endpoints"][0]["id"] - neon_api.wait_for_operation_to_finish(sub_project_id) + sub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-S"], + env=sub_env, + ) try: - pub_env = connection_parameters_to_env( - pub_project["connection_uris"][0]["connection_parameters"] - ) - sub_env = connection_parameters_to_env( - sub_project["connection_uris"][0]["connection_parameters"] - ) - pub_connstr = pub_project["connection_uris"][0]["connection_uri"] - sub_connstr = sub_project["connection_uris"][0]["connection_uri"] + start = time.time() + while time.time() - start < test_duration_min * 60: + time.sleep(sync_interval_min * 60) + check_pgbench_still_running(pub_workload, "pub") + check_pgbench_still_running(sub_workload, "sub") - pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) - pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) + with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect( + sub_connstr + ) as sub_conn: + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + lag = measure_logical_replication_lag(sub_cur, pub_cur) - pub_conn = psycopg2.connect(pub_connstr) - sub_conn = psycopg2.connect(sub_connstr) - pub_conn.autocommit = True - sub_conn.autocommit = True - with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: - sub_cur.execute("truncate table pgbench_accounts") - sub_cur.execute("truncate table pgbench_history") + log.info(f"Replica lagged behind master by {lag} seconds") + zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) + sub_workload.terminate() + benchmark_project_sub.restart() - pub_cur.execute( - "create publication pub1 for table pgbench_accounts, pgbench_history" - ) - sub_cur.execute( - f"create subscription sub1 connection '{pub_connstr}' publication pub1" - ) - - initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur) - pub_conn.close() - sub_conn.close() - - zenbenchmark.record( - "initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER - ) - - pub_workload = pg_bin.run_nonblocking( - ["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env - ) - try: sub_workload = pg_bin.run_nonblocking( ["pgbench", "-c10", pgbench_duration, "-S"], env=sub_env, ) - try: - start = time.time() - while time.time() - start < test_duration_min * 60: - time.sleep(sync_interval_min * 60) - check_pgbench_still_running(pub_workload, "pub") - check_pgbench_still_running(sub_workload, "sub") - with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect( - sub_connstr - ) as sub_conn: - with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: - lag = measure_logical_replication_lag(sub_cur, pub_cur) - - log.info(f"Replica lagged behind master by {lag} seconds") - zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) - sub_workload.terminate() - neon_api.restart_endpoint( - sub_project_id, - sub_endpoint_id, - ) - neon_api.wait_for_operation_to_finish(sub_project_id) - sub_workload = pg_bin.run_nonblocking( - ["pgbench", "-c10", pgbench_duration, "-S"], - env=sub_env, - ) - - # Measure storage to make sure replication information isn't bloating storage - sub_storage = neon_api.get_project_details(sub_project_id)["project"][ - "synthetic_storage_size" - ] - pub_storage = neon_api.get_project_details(pub_project_id)["project"][ - 
"synthetic_storage_size" - ] - zenbenchmark.record( - "sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER - ) - zenbenchmark.record( - "pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER - ) - - finally: - sub_workload.terminate() - finally: - pub_workload.terminate() - except Exception as e: - error_occurred = True - log.error(f"Caught exception {e}") - log.error(traceback.format_exc()) + # Measure storage to make sure replication information isn't bloating storage + sub_storage = benchmark_project_sub.get_synthetic_storage_size() + pub_storage = benchmark_project_pub.get_synthetic_storage_size() + zenbenchmark.record("sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER) + zenbenchmark.record("pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER) finally: - if not error_occurred: - neon_api.delete_project(sub_project_id) - except Exception as e: - error_occurred = True - log.error(f"Caught exception {e}") - log.error(traceback.format_exc()) + sub_workload.terminate() finally: - assert not error_occurred - neon_api.delete_project(pub_project_id) + pub_workload.terminate() @pytest.mark.remote_cluster @pytest.mark.timeout(2 * 60 * 60) def test_publisher_restart( pg_bin: PgBin, - neon_api: NeonAPI, - pg_version: PgVersion, + benchmark_project_pub: NeonApiEndpoint, + benchmark_project_sub: NeonApiEndpoint, zenbenchmark: NeonBenchmarker, ): """ @@ -229,114 +182,70 @@ def test_publisher_restart( sync_interval_min = 5 pgbench_duration = f"-T{test_duration_min * 60 * 2}" - pub_project = neon_api.create_project(pg_version) - pub_project_id = pub_project["project"]["id"] - pub_endpoint_id = pub_project["endpoints"][0]["id"] - neon_api.wait_for_operation_to_finish(pub_project_id) - error_occurred = False + pub_env = benchmark_project_pub.pgbench_env + sub_env = benchmark_project_sub.pgbench_env + pub_connstr = benchmark_project_pub.connstr + sub_connstr = benchmark_project_sub.connstr + + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) + + pub_conn = psycopg2.connect(pub_connstr) + sub_conn = psycopg2.connect(sub_connstr) + pub_conn.autocommit = True + sub_conn.autocommit = True + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + if benchmark_project_pub.is_new: + pub_cur.execute("create publication pub1 for table pgbench_accounts, pgbench_history") + + if benchmark_project_sub.is_new: + sub_cur.execute("truncate table pgbench_accounts") + sub_cur.execute("truncate table pgbench_history") + + sub_cur.execute(f"create subscription sub1 connection '{pub_connstr}' publication pub1") + + initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur) + pub_conn.close() + sub_conn.close() + + zenbenchmark.record("initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER) + + pub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env + ) try: - sub_project = neon_api.create_project(pg_version) - sub_project_id = sub_project["project"]["id"] - neon_api.wait_for_operation_to_finish(sub_project_id) + sub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-S"], + env=sub_env, + ) try: - pub_env = connection_parameters_to_env( - pub_project["connection_uris"][0]["connection_parameters"] - ) - sub_env = connection_parameters_to_env( - sub_project["connection_uris"][0]["connection_parameters"] - ) - pub_connstr = pub_project["connection_uris"][0]["connection_uri"] - sub_connstr = 
sub_project["connection_uris"][0]["connection_uri"] + start = time.time() + while time.time() - start < test_duration_min * 60: + time.sleep(sync_interval_min * 60) + check_pgbench_still_running(pub_workload, "pub") + check_pgbench_still_running(sub_workload, "sub") - pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) - pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) - - pub_conn = psycopg2.connect(pub_connstr) - sub_conn = psycopg2.connect(sub_connstr) - pub_conn.autocommit = True - sub_conn.autocommit = True - with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: - sub_cur.execute("truncate table pgbench_accounts") - sub_cur.execute("truncate table pgbench_history") - - pub_cur.execute( - "create publication pub1 for table pgbench_accounts, pgbench_history" - ) - sub_cur.execute( - f"create subscription sub1 connection '{pub_connstr}' publication pub1" - ) - - initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur) - pub_conn.close() - sub_conn.close() - - zenbenchmark.record( - "initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER - ) - - pub_workload = pg_bin.run_nonblocking( - ["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env - ) - try: - sub_workload = pg_bin.run_nonblocking( - ["pgbench", "-c10", pgbench_duration, "-S"], - env=sub_env, - ) - try: - start = time.time() - while time.time() - start < test_duration_min * 60: - time.sleep(sync_interval_min * 60) - check_pgbench_still_running(pub_workload, "pub") - check_pgbench_still_running(sub_workload, "sub") - - pub_workload.terminate() - neon_api.restart_endpoint( - pub_project_id, - pub_endpoint_id, - ) - neon_api.wait_for_operation_to_finish(pub_project_id) - pub_workload = pg_bin.run_nonblocking( - ["pgbench", "-c10", pgbench_duration, "-Mprepared"], - env=pub_env, - ) - with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect( - sub_connstr - ) as sub_conn: - with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: - lag = measure_logical_replication_lag(sub_cur, pub_cur) - - log.info(f"Replica lagged behind master by {lag} seconds") - zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) - - # Measure storage to make sure replication information isn't bloating storage - sub_storage = neon_api.get_project_details(sub_project_id)["project"][ - "synthetic_storage_size" - ] - pub_storage = neon_api.get_project_details(pub_project_id)["project"][ - "synthetic_storage_size" - ] - zenbenchmark.record( - "sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER - ) - zenbenchmark.record( - "pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER - ) - - finally: - sub_workload.terminate() - finally: pub_workload.terminate() - except Exception as e: - error_occurred = True - log.error(f"Caught exception {e}") - log.error(traceback.format_exc()) + benchmark_project_pub.restart() + pub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-Mprepared"], + env=pub_env, + ) + with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect( + sub_connstr + ) as sub_conn: + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + lag = measure_logical_replication_lag(sub_cur, pub_cur) + + log.info(f"Replica lagged behind master by {lag} seconds") + zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) + + # Measure storage to make sure replication information isn't bloating storage + sub_storage = benchmark_project_sub.get_synthetic_storage_size() + pub_storage = 
benchmark_project_pub.get_synthetic_storage_size() + zenbenchmark.record("sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER) + zenbenchmark.record("pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER) finally: - if not error_occurred: - neon_api.delete_project(sub_project_id) - except Exception as e: - error_occurred = True - log.error(f"Caught exception {e}") - log.error(traceback.format_exc()) + sub_workload.terminate() finally: - assert not error_occurred - neon_api.delete_project(pub_project_id) + pub_workload.terminate() From ee263e6a622c38369110bfa8fae1ba044c48ce0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 16 Jul 2024 02:16:18 +0200 Subject: [PATCH 1193/1571] Allow the new clippy::doc_lazy_continuation lint (#8388) The `doc_lazy_continuation` lint of clippy is still unknown on latest rust stable. Fixes fall-out from #8151. --- pageserver/src/tenant/timeline.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 239dce8786..58c6257c65 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3409,6 +3409,7 @@ impl Timeline { } } + #[allow(unknown_lints)] // doc_lazy_continuation is still a new lint #[allow(clippy::doc_lazy_continuation)] /// Get the data needed to reconstruct all keys in the provided keyspace /// From 83e07c1a5bcc8f4075474ba8b5e4731a078f6dd7 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 16 Jul 2024 08:52:49 +0100 Subject: [PATCH 1194/1571] pageserver: un-Arc Timeline::layers (#8386) ## Problem This structure was in an Arc<> unnecessarily, making it harder to reason about its lifetime (i.e. it was superficially possible for LayerManager to outlive timeline, even though no code used it that way) ## Summary of changes - Remove the Arc<> --- pageserver/src/tenant/timeline.rs | 4 ++-- pageserver/src/tenant/timeline/compaction.rs | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 58c6257c65..48a5b2d32b 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -198,7 +198,7 @@ impl PartialOrd for Hole { /// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things. /// Can be removed after all refactors are done. -fn drop_rlock(rlock: tokio::sync::OwnedRwLockReadGuard) { +fn drop_rlock(rlock: tokio::sync::RwLockReadGuard) { drop(rlock) } @@ -271,7 +271,7 @@ pub struct Timeline { /// /// In the future, we'll be able to split up the tuple of LayerMap and `LayerFileManager`, /// so that e.g. on-demand-download/eviction, and layer spreading, can operate just on `LayerFileManager`. - pub(crate) layers: Arc>, + pub(crate) layers: tokio::sync::RwLock, last_freeze_at: AtomicLsn, // Atomic would be more appropriate here. 
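As an aside on the lifetime point above, the difference between the two guard types is easy to see in a small toy program. This is only an illustrative sketch built on tokio's `RwLock`; `TimelineToy` and `LayerManagerToy` are made-up stand-ins, not the pageserver types. With `Arc<RwLock<_>>`, `read_owned()` hands out a guard that keeps the inner value alive by itself, which is exactly the lifetime laxity this change removes; a plain `RwLock<_>` field only yields guards borrowed from the struct that owns it.

```rust
use std::sync::Arc;
use tokio::sync::RwLock;

struct LayerManagerToy(u32);

struct TimelineToy {
    layers: RwLock<LayerManagerToy>,
}

#[tokio::main]
async fn main() {
    // Owned guard: holds its own Arc, so it can outlive every other handle.
    let shared = Arc::new(RwLock::new(LayerManagerToy(1)));
    let owned = Arc::clone(&shared).read_owned().await;
    drop(shared); // the inner LayerManagerToy is still alive through `owned`
    println!("owned guard sees {}", owned.0);

    // Borrowed guard: tied to the TimelineToy that owns the lock.
    let timeline = TimelineToy {
        layers: RwLock::new(LayerManagerToy(2)),
    };
    let guard = timeline.layers.read().await;
    println!("borrowed guard sees {}", guard.0); // cannot outlive `timeline`
}
```

This is also why, in the compaction diff below, `compact_level0_phase1` switches from an `OwnedRwLockReadGuard` parameter to a lifetime-bound `RwLockReadGuard<'a, LayerManager>`.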
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index efaa6144af..eec5e5e53c 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -27,8 +27,8 @@ use utils::id::TimelineId; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc}; -use crate::tenant::timeline::{drop_rlock, Hole, ImageLayerCreationOutcome}; -use crate::tenant::timeline::{DeltaLayerWriter, ImageLayerWriter}; +use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter}; +use crate::tenant::timeline::{Hole, ImageLayerCreationOutcome}; use crate::tenant::timeline::{Layer, ResidentLayer}; use crate::tenant::DeltaLayer; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; @@ -379,7 +379,7 @@ impl Timeline { }; let begin = tokio::time::Instant::now(); - let phase1_layers_locked = Arc::clone(&self.layers).read_owned().await; + let phase1_layers_locked = self.layers.read().await; let now = tokio::time::Instant::now(); stats.read_lock_acquisition_micros = DurationRecorder::Recorded(RecordedDuration(now - begin), now); @@ -399,9 +399,9 @@ impl Timeline { } /// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment. - async fn compact_level0_phase1( - self: &Arc, - guard: tokio::sync::OwnedRwLockReadGuard, + async fn compact_level0_phase1<'a>( + self: &'a Arc, + guard: tokio::sync::RwLockReadGuard<'a, LayerManager>, mut stats: CompactLevel0Phase1StatsBuilder, target_file_size: u64, ctx: &RequestContext, From e6dadcd2f35ce4dd2702acef2bdebe75d583677f Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Mon, 15 Jul 2024 12:48:53 +0100 Subject: [PATCH 1195/1571] Compute: add compatibility patch for rum Fixes #8251 --- Dockerfile.compute-node | 3 +++ patches/rum.patch | 54 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) create mode 100644 patches/rum.patch diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 7ab685625a..48a52bfc6d 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -311,9 +311,12 @@ RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz FROM build-deps AS rum-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY patches/rum.patch /rum.patch + RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \ echo "6ab370532c965568df6210bd844ac6ba649f53055e48243525b0b7e5c4d69a7d rum.tar.gz" | sha256sum --check && \ mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . 
&& \ + patch -p1 < /rum.patch && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control diff --git a/patches/rum.patch b/patches/rum.patch new file mode 100644 index 0000000000..3041f8df81 --- /dev/null +++ b/patches/rum.patch @@ -0,0 +1,54 @@ +commit 68f3b3b0d594f08aacc4a082ee210749ed5677eb +Author: Anastasia Lubennikova +Date: Mon Jul 15 12:31:56 2024 +0100 + + Neon: fix unlogged index build patch + +diff --git a/src/ruminsert.c b/src/ruminsert.c +index e8b209d..e89bf2a 100644 +--- a/src/ruminsert.c ++++ b/src/ruminsert.c +@@ -628,6 +628,10 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo) + elog(ERROR, "index \"%s\" already contains data", + RelationGetRelationName(index)); + ++#ifdef NEON_SMGR ++ smgr_start_unlogged_build(index->rd_smgr); ++#endif ++ + initRumState(&buildstate.rumstate, index); + buildstate.rumstate.isBuild = true; + buildstate.indtuples = 0; +@@ -693,6 +697,10 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo) + buildstate.buildStats.nTotalPages = RelationGetNumberOfBlocks(index); + rumUpdateStats(index, &buildstate.buildStats, buildstate.rumstate.isBuild); + ++#ifdef NEON_SMGR ++ smgr_finish_unlogged_build_phase_1(index->rd_smgr); ++#endif ++ + /* + * Write index to xlog + */ +@@ -713,6 +721,21 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo) + UnlockReleaseBuffer(buffer); + } + ++#ifdef NEON_SMGR ++ { ++#if PG_VERSION_NUM >= 160000 ++ RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator; ++#else ++ RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node; ++#endif ++ ++ SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index)); ++ SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM); ++ ++ smgr_end_unlogged_build(index->rd_smgr); ++ } ++#endif ++ + /* + * Return statistics + */ From 66337097de074de3a2e2e19bf0b1c304a21b273c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 16 Jul 2024 12:19:28 +0200 Subject: [PATCH 1196/1571] Avoid the storage controller in test_tenant_creation_fails (#8392) As described in #8385, the likely source for flakiness in test_tenant_creation_fails is the following sequence of events: 1. test instructs the storage controller to create the tenant 2. storage controller adds the tenant and persists it to the database. issues a creation request 3. the pageserver restarts with the failpoint disabled 4. storage controller's background reconciliation still wants to create the tenant 5. pageserver gets new request to create the tenant from background reconciliation This commit just avoids the storage controller entirely. It has its own set of issues, as the re-attach request will obviously not include the tenant, but it's still useful to test for non-existence of the tenant. The generation is also not optional any more during tenant attachment. If you omit it, the pageserver yields an error. We change the signature of `tenant_attach` to reflect that. 
Alternative to #8385 Fixes #8266 --- test_runner/fixtures/neon_fixtures.py | 2 +- test_runner/fixtures/pageserver/http.py | 2 +- test_runner/regress/test_tenants.py | 13 +++---------- 3 files changed, 5 insertions(+), 12 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index fe4a334458..625e9096f5 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2786,8 +2786,8 @@ class NeonPageserver(PgProtocol, LogUtils): ) return client.tenant_attach( tenant_id, + generation, config, - generation=generation, ) def tenant_detach(self, tenant_id: TenantId): diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index d66b94948a..f1e3d1a309 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -238,8 +238,8 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def tenant_attach( self, tenant_id: Union[TenantId, TenantShardId], + generation: int, config: None | Dict[str, Any] = None, - generation: Optional[int] = None, ): config = config or {} diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 04b3fdd80f..0ebf714de0 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -45,17 +45,10 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv): # Failure to write a config to local disk makes the pageserver assume that local disk is bad and abort the process pageserver_http.configure_failpoints(("tenant-config-before-write", "return")) - # Storage controller will see a torn TCP connection when the crash point is reached, and follow an unclean 500 error path - neon_simple_env.storage_controller.allowed_errors.extend( - [ - ".*Reconcile not done yet while creating tenant.*", - ".*Reconcile error: receive body: error sending request.*", - ".*Error processing HTTP request: InternalServerError.*", - ] - ) + tenant_id = TenantId.generate() - with pytest.raises(Exception, match="error sending request"): - _ = neon_simple_env.neon_cli.create_tenant() + with pytest.raises(requests.exceptions.ConnectionError, match="Connection aborted"): + neon_simple_env.pageserver.http_client().tenant_attach(tenant_id=tenant_id, generation=1) # Any files left behind on disk during failed creation do not prevent # a retry from succeeding. Restart pageserver with no failpoints. From d2ee760eb2ad2ad637d10e5ab1bc44e9215bc2fd Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 16 Jul 2024 12:20:23 +0200 Subject: [PATCH 1197/1571] build(deps): bump setuptools from 65.5.1 to 70.0.0 (#8387) Bumps [setuptools](https://github.com/pypa/setuptools) from 65.5.1 to 70.0.0. 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: a-masterov <72613290+a-masterov@users.noreply.github.com> --- poetry.lock | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/poetry.lock b/poetry.lock index 8091141411..5192a574cc 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2641,19 +2641,18 @@ pbr = "*" [[package]] name = "setuptools" -version = "65.5.1" +version = "70.0.0" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "setuptools-65.5.1-py3-none-any.whl", hash = "sha256:d0b9a8433464d5800cbe05094acf5c6d52a91bfac9b52bcfc4d41382be5d5d31"}, - {file = "setuptools-65.5.1.tar.gz", hash = "sha256:e197a19aa8ec9722928f2206f8de752def0e4c9fc6953527360d1c36d94ddb2f"}, + {file = "setuptools-70.0.0-py3-none-any.whl", hash = "sha256:54faa7f2e8d2d11bcd2c07bed282eef1046b5c080d1c32add737d7b5817b1ad4"}, + {file = "setuptools-70.0.0.tar.gz", hash = "sha256:f211a66637b8fa059bb28183da127d4e86396c991a942b028c6650d4319c3fd0"}, ] [package.extras] -docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"] -testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] -testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] +docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"] +testing = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mypy (==1.9)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.1)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] [[package]] name = "six" From a40b402957a99de5a484284d22462cd3191b4bb1 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 16 Jul 2024 14:54:54 +0100 Subject: [PATCH 1198/1571] pageserver: clean up GcCutoffs names (#8379) - `horizon` is a confusing term, it's not at all obvious that this means space-based retention limit, rather than the total GC history limit. Rename to `GcCutoffs::space`. - `pitr` is less confusing, but still an unecessary level of indirection from what we really mean: a time-based condition. 
The fact that we use that that time-history for Point In Time Recovery doesn't mean we have to refer to time as "pitr" everywhere. Rename to `GcCutoffs::time`. --- pageserver/src/tenant.rs | 14 +-- pageserver/src/tenant/size.rs | 61 +++++-------- pageserver/src/tenant/timeline.rs | 94 +++++++++----------- pageserver/src/tenant/timeline/compaction.rs | 4 +- 4 files changed, 75 insertions(+), 98 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 6333fd3b63..dc6f42eaeb 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2912,7 +2912,7 @@ impl Tenant { if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() { if let Some(ancestor_gc_cutoffs) = gc_cutoffs.get(&ancestor_id) { target.within_ancestor_pitr = - timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.pitr; + timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.time; } } @@ -2928,7 +2928,7 @@ impl Tenant { timeline.metrics.pitr_history_size.set( timeline .get_last_record_lsn() - .checked_sub(target.cutoffs.pitr) + .checked_sub(target.cutoffs.time) .unwrap_or(Lsn(0)) .0, ); @@ -4262,7 +4262,7 @@ mod tests { .source() .unwrap() .to_string() - .contains("is earlier than latest GC horizon")); + .contains("is earlier than latest GC cutoff")); } } @@ -6718,8 +6718,8 @@ mod tests { { // Update GC info let mut guard = tline.gc_info.write().unwrap(); - guard.cutoffs.pitr = Lsn(0x30); - guard.cutoffs.horizon = Lsn(0x30); + guard.cutoffs.time = Lsn(0x30); + guard.cutoffs.space = Lsn(0x30); } let expected_result = [ @@ -7109,8 +7109,8 @@ mod tests { *guard = GcInfo { retain_lsns: vec![], cutoffs: GcCutoffs { - pitr: Lsn(0x30), - horizon: Lsn(0x30), + time: Lsn(0x30), + space: Lsn(0x30), }, leases: Default::default(), within_ancestor_pitr: false, diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 23354417e7..e4728ca8a8 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -135,11 +135,9 @@ pub struct TimelineInputs { ancestor_lsn: Lsn, last_record: Lsn, latest_gc_cutoff: Lsn, - horizon_cutoff: Lsn, - pitr_cutoff: Lsn, /// Cutoff point based on GC settings - next_gc_cutoff: Lsn, + next_pitr_cutoff: Lsn, /// Cutoff point calculated from the user-supplied 'max_retention_period' retention_param_cutoff: Option, @@ -150,7 +148,7 @@ pub struct TimelineInputs { /// Gathers the inputs for the tenant sizing model. /// -/// Tenant size does not consider the latest state, but only the state until next_gc_cutoff, which +/// Tenant size does not consider the latest state, but only the state until next_pitr_cutoff, which /// is updated on-demand, during the start of this calculation and separate from the /// [`TimelineInputs::latest_gc_cutoff`]. /// @@ -158,11 +156,8 @@ pub struct TimelineInputs { /// /// ```text /// 0-----|---------|----|------------| · · · · · |·> lsn -/// initdb_lsn branchpoints* next_gc_cutoff latest +/// initdb_lsn branchpoints* next_pitr_cutoff latest /// ``` -/// -/// Until gc_horizon_cutoff > `Timeline::last_record_lsn` for any of the tenant's timelines, the -/// tenant size will be zero. 
pub(super) async fn gather_inputs( tenant: &Tenant, limit: &Arc, @@ -172,7 +167,7 @@ pub(super) async fn gather_inputs( cancel: &CancellationToken, ctx: &RequestContext, ) -> Result { - // refresh is needed to update gc related pitr_cutoff and horizon_cutoff + // refresh is needed to update [`timeline::GcCutoffs`] tenant.refresh_gc_info(cancel, ctx).await?; // Collect information about all the timelines @@ -236,20 +231,18 @@ pub(super) async fn gather_inputs( // we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not // actually removing files. // - // We only consider [`GcInfo::pitr_cutoff`], and not [`GcInfo::horizon_cutoff`], because from + // We only consider [`timeline::GcCutoffs::time`], and not [`timeline::GcCutoffs::space`], because from // a user's perspective they have only requested retention up to the time bound (pitr_cutoff), rather - // than a space bound (horizon cutoff). This means that if someone drops a database and waits for their + // than our internal space cutoff. This means that if someone drops a database and waits for their // PITR interval, they will see synthetic size decrease, even if we are still storing data inside - // horizon_cutoff. - let pitr_cutoff = gc_info.cutoffs.pitr; - let horizon_cutoff = gc_info.cutoffs.horizon; - let mut next_gc_cutoff = pitr_cutoff; + // the space cutoff. + let mut next_pitr_cutoff = gc_info.cutoffs.time; // If the caller provided a shorter retention period, use that instead of the GC cutoff. let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period { let param_cutoff = Lsn(last_record_lsn.0.saturating_sub(max_retention_period)); - if next_gc_cutoff < param_cutoff { - next_gc_cutoff = param_cutoff; + if next_pitr_cutoff < param_cutoff { + next_pitr_cutoff = param_cutoff; } Some(param_cutoff) } else { @@ -263,7 +256,7 @@ pub(super) async fn gather_inputs( .copied() .collect::>(); - // next_gc_cutoff in parent branch are not of interest (right now at least), nor do we + // next_pitr_cutoff in parent branch are not of interest (right now at least), nor do we // want to query any logical size before initdb_lsn. let branch_start_lsn = cmp::max(ancestor_lsn, timeline.initdb_lsn); @@ -291,10 +284,10 @@ pub(super) async fn gather_inputs( ) } - // Add a point for the GC cutoff - let branch_start_needed = next_gc_cutoff <= branch_start_lsn; + // Add a point for the PITR cutoff + let branch_start_needed = next_pitr_cutoff <= branch_start_lsn; if !branch_start_needed { - lsns.push((next_gc_cutoff, LsnKind::GcCutOff)); + lsns.push((next_pitr_cutoff, LsnKind::GcCutOff)); } lsns.sort_unstable(); @@ -333,7 +326,7 @@ pub(super) async fn gather_inputs( parent: Some(parent), lsn: lsn.0, size: None, - needed: lsn > next_gc_cutoff, + needed: lsn > next_pitr_cutoff, }, timeline_id: timeline.timeline_id, kind, @@ -357,8 +350,8 @@ pub(super) async fn gather_inputs( segment: Segment { parent: Some(lease_parent), lsn: lsn.0, - size: None, // Filled in later, if necessary - needed: lsn > next_gc_cutoff, // only needed if the point is within rentention. + size: None, // Filled in later, if necessary + needed: lsn > next_pitr_cutoff, // only needed if the point is within rentention. 
}, timeline_id: timeline.timeline_id, kind: LsnKind::LeaseStart, @@ -398,9 +391,7 @@ pub(super) async fn gather_inputs( last_record: last_record_lsn, // this is not used above, because it might not have updated recently enough latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(), - horizon_cutoff, - pitr_cutoff, - next_gc_cutoff, + next_pitr_cutoff, retention_param_cutoff, lease_points, }); @@ -742,9 +733,7 @@ fn verify_size_for_multiple_branches() { "ancestor_lsn": "0/18D3D98", "last_record": "0/2230CD0", "latest_gc_cutoff": "0/1698C48", - "horizon_cutoff": "0/2210CD0", - "pitr_cutoff": "0/2210CD0", - "next_gc_cutoff": "0/2210CD0", + "next_pitr_cutoff": "0/2210CD0", "retention_param_cutoff": null, "lease_points": [] }, @@ -753,9 +742,7 @@ fn verify_size_for_multiple_branches() { "ancestor_lsn": "0/176D998", "last_record": "0/1837770", "latest_gc_cutoff": "0/1698C48", - "horizon_cutoff": "0/1817770", - "pitr_cutoff": "0/1817770", - "next_gc_cutoff": "0/1817770", + "next_pitr_cutoff": "0/1817770", "retention_param_cutoff": null, "lease_points": [] }, @@ -764,9 +751,7 @@ fn verify_size_for_multiple_branches() { "ancestor_lsn": "0/0", "last_record": "0/18D3D98", "latest_gc_cutoff": "0/1698C48", - "horizon_cutoff": "0/18B3D98", - "pitr_cutoff": "0/18B3D98", - "next_gc_cutoff": "0/18B3D98", + "next_pitr_cutoff": "0/18B3D98", "retention_param_cutoff": null, "lease_points": [] } @@ -820,9 +805,7 @@ fn verify_size_for_one_branch() { "ancestor_lsn": "0/0", "last_record": "47/280A5860", "latest_gc_cutoff": "47/240A5860", - "horizon_cutoff": "47/240A5860", - "pitr_cutoff": "47/240A5860", - "next_gc_cutoff": "47/240A5860", + "next_pitr_cutoff": "47/240A5860", "retention_param_cutoff": "0/0", "lease_points": [] } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 48a5b2d32b..3d3d3ac34d 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -478,37 +478,32 @@ impl GcInfo { } } -/// The `GcInfo` component describing which Lsns need to be retained. +/// The `GcInfo` component describing which Lsns need to be retained. Functionally, this +/// is a single number (the oldest LSN which we must retain), but it internally distinguishes +/// between time-based and space-based retention for observability and consumption metrics purposes. #[derive(Debug)] pub(crate) struct GcCutoffs { - /// Keep everything newer than this point. - /// - /// This is calculated by subtracting 'gc_horizon' setting from - /// last-record LSN - /// - /// FIXME: is this inclusive or exclusive? - pub(crate) horizon: Lsn, + /// Calculated from the [`TenantConf::gc_horizon`], this LSN indicates how much + /// history we must keep to retain a specified number of bytes of WAL. + pub(crate) space: Lsn, - /// In addition to 'retain_lsns' and 'horizon_cutoff', keep everything newer than this - /// point. - /// - /// This is calculated by finding a number such that a record is needed for PITR - /// if only if its LSN is larger than 'pitr_cutoff'. - pub(crate) pitr: Lsn, + /// Calculated from [`TenantConf::pitr_interval`], this LSN indicates how much + /// history we must keep to enable reading back at least the PITR interval duration. 
+ pub(crate) time: Lsn, } impl Default for GcCutoffs { fn default() -> Self { Self { - horizon: Lsn::INVALID, - pitr: Lsn::INVALID, + space: Lsn::INVALID, + time: Lsn::INVALID, } } } impl GcCutoffs { fn select_min(&self) -> Lsn { - std::cmp::min(self.horizon, self.pitr) + std::cmp::min(self.space, self.time) } } @@ -867,7 +862,7 @@ impl Timeline { let gc_info = self.gc_info.read().unwrap(); let history = self .get_last_record_lsn() - .checked_sub(gc_info.cutoffs.pitr) + .checked_sub(gc_info.cutoffs.time) .unwrap_or(Lsn(0)) .0; (history, gc_info.within_ancestor_pitr) @@ -1566,7 +1561,7 @@ impl Timeline { ) -> anyhow::Result<()> { ensure!( lsn >= **latest_gc_cutoff_lsn, - "LSN {} is earlier than latest GC horizon {} (we might've already garbage collected needed data)", + "LSN {} is earlier than latest GC cutoff {} (we might've already garbage collected needed data)", lsn, **latest_gc_cutoff_lsn, ); @@ -4944,18 +4939,18 @@ impl Timeline { /// garbage collection. /// /// We calculate two cutoffs, one based on time and one based on WAL size. `pitr` - /// controls the time cutoff (or ZERO to disable time-based retention), and `cutoff_horizon` controls + /// controls the time cutoff (or ZERO to disable time-based retention), and `space_cutoff` controls /// the space-based retention. /// /// This function doesn't simply to calculate time & space based retention: it treats time-based /// retention as authoritative if enabled, and falls back to space-based retention if calculating /// the LSN for a time point isn't possible. Therefore the GcCutoffs::horizon in the response might - /// be different to the `cutoff_horizon` input. Callers should treat the min() of the two cutoffs + /// be different to the `space_cutoff` input. Callers should treat the min() of the two cutoffs /// in the response as the GC cutoff point for the timeline. #[instrument(skip_all, fields(timeline_id=%self.timeline_id))] pub(super) async fn find_gc_cutoffs( &self, - cutoff_horizon: Lsn, + space_cutoff: Lsn, pitr: Duration, cancel: &CancellationToken, ctx: &RequestContext, @@ -4972,8 +4967,8 @@ impl Timeline { // Unit tests which specify zero PITR interval expect to avoid doing any I/O for timestamp lookup if pitr == Duration::ZERO { return Ok(GcCutoffs { - pitr: self.get_last_record_lsn(), - horizon: cutoff_horizon, + time: self.get_last_record_lsn(), + space: space_cutoff, }); } } @@ -4981,8 +4976,7 @@ impl Timeline { // Calculate a time-based limit on how much to retain: // - if PITR interval is set, then this is our cutoff. // - if PITR interval is not set, then we do a lookup - // based on DEFAULT_PITR_INTERVAL, so that size-based retention (horizon) - // does not result in keeping history around permanently on idle databases. + // based on DEFAULT_PITR_INTERVAL, so that size-based retention does not result in keeping history around permanently on idle databases. let time_cutoff = { let now = SystemTime::now(); let time_range = if pitr == Duration::ZERO { @@ -5023,31 +5017,31 @@ impl Timeline { // PITR is not set. Retain the size-based limit, or the default time retention, // whichever requires less data. 
GcCutoffs { - pitr: std::cmp::max(time_cutoff, cutoff_horizon), - horizon: std::cmp::max(time_cutoff, cutoff_horizon), + time: self.get_last_record_lsn(), + space: std::cmp::max(time_cutoff, space_cutoff), } } (Duration::ZERO, None) => { // PITR is not set, and time lookup failed GcCutoffs { - pitr: self.get_last_record_lsn(), - horizon: cutoff_horizon, + time: self.get_last_record_lsn(), + space: space_cutoff, } } (_, None) => { // PITR interval is set & we didn't look up a timestamp successfully. Conservatively assume PITR // cannot advance beyond what was already GC'd, and respect space-based retention GcCutoffs { - pitr: *self.get_latest_gc_cutoff_lsn(), - horizon: cutoff_horizon, + time: *self.get_latest_gc_cutoff_lsn(), + space: space_cutoff, } } (_, Some(time_cutoff)) => { // PITR interval is set and we looked up timestamp successfully. Ignore // size based retention and make time cutoff authoritative GcCutoffs { - pitr: time_cutoff, - horizon: time_cutoff, + time: time_cutoff, + space: time_cutoff, } } }) @@ -5074,11 +5068,11 @@ impl Timeline { return Err(GcError::TimelineCancelled); } - let (horizon_cutoff, pitr_cutoff, retain_lsns, max_lsn_with_valid_lease) = { + let (space_cutoff, time_cutoff, retain_lsns, max_lsn_with_valid_lease) = { let gc_info = self.gc_info.read().unwrap(); - let horizon_cutoff = min(gc_info.cutoffs.horizon, self.get_disk_consistent_lsn()); - let pitr_cutoff = gc_info.cutoffs.pitr; + let space_cutoff = min(gc_info.cutoffs.space, self.get_disk_consistent_lsn()); + let time_cutoff = gc_info.cutoffs.time; let retain_lsns = gc_info.retain_lsns.clone(); // Gets the maximum LSN that holds the valid lease. @@ -5088,14 +5082,14 @@ impl Timeline { let max_lsn_with_valid_lease = gc_info.leases.last_key_value().map(|(lsn, _)| *lsn); ( - horizon_cutoff, - pitr_cutoff, + space_cutoff, + time_cutoff, retain_lsns, max_lsn_with_valid_lease, ) }; - let mut new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff); + let mut new_gc_cutoff = Lsn::min(space_cutoff, time_cutoff); let standby_horizon = self.standby_horizon.load(); // Hold GC for the standby, but as a safety guard do it only within some // reasonable lag. @@ -5124,8 +5118,8 @@ impl Timeline { let res = self .gc_timeline( - horizon_cutoff, - pitr_cutoff, + space_cutoff, + time_cutoff, retain_lsns, max_lsn_with_valid_lease, new_gc_cutoff, @@ -5143,8 +5137,8 @@ impl Timeline { async fn gc_timeline( &self, - horizon_cutoff: Lsn, - pitr_cutoff: Lsn, + space_cutoff: Lsn, + time_cutoff: Lsn, retain_lsns: Vec, max_lsn_with_valid_lease: Option, new_gc_cutoff: Lsn, @@ -5205,22 +5199,22 @@ impl Timeline { result.layers_total += 1; // 1. Is it newer than GC horizon cutoff point? - if l.get_lsn_range().end > horizon_cutoff { + if l.get_lsn_range().end > space_cutoff { debug!( - "keeping {} because it's newer than horizon_cutoff {}", + "keeping {} because it's newer than space_cutoff {}", l.layer_name(), - horizon_cutoff, + space_cutoff, ); result.layers_needed_by_cutoff += 1; continue 'outer; } // 2. It is newer than PiTR cutoff point? 
- if l.get_lsn_range().end > pitr_cutoff { + if l.get_lsn_range().end > time_cutoff { debug!( - "keeping {} because it's newer than pitr_cutoff {}", + "keeping {} because it's newer than time_cutoff {}", l.layer_name(), - pitr_cutoff, + time_cutoff, ); result.layers_needed_by_pitr += 1; continue 'outer; diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index eec5e5e53c..cbb3303341 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -195,7 +195,7 @@ impl Timeline { tracing::info!( "latest_gc_cutoff: {}, pitr cutoff {}", *latest_gc_cutoff, - self.gc_info.read().unwrap().cutoffs.pitr + self.gc_info.read().unwrap().cutoffs.time ); let layers = self.layers.read().await; @@ -990,7 +990,7 @@ impl Timeline { "enhanced legacy compaction currently does not support retain_lsns (branches)" ))); } - let gc_cutoff = Lsn::min(gc_info.cutoffs.horizon, gc_info.cutoffs.pitr); + let gc_cutoff = gc_info.cutoffs.select_min(); let mut selected_layers = Vec::new(); // TODO: consider retain_lsns drop(gc_info); From b5ab0555265d72b2cdd86ee259d84847409ad8ad Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Fri, 12 Jul 2024 13:46:14 -0500 Subject: [PATCH 1199/1571] Rename compute migrations to start at 1 This matches what we put into the neon_migration.migration_id table. --- compute_tools/src/migration.rs | 17 +++++++++++++--- ...sql => 0001-neon_superuser_bypass_rls.sql} | 0 ...1-alter_roles.sql => 0002-alter_roles.sql} | 0 ...create_subscription_to_neon_superuser.sql} | 0 ...04-grant_pg_monitor_to_neon_superuser.sql} | 0 ...grant_all_on_tables_to_neon_superuser.sql} | 0 ...nt_all_on_sequences_to_neon_superuser.sql} | 0 ...s_to_neon_superuser_with_grant_option.sql} | 0 ...s_to_neon_superuser_with_grant_option.sql} | 0 ...lication_for_previously_allowed_roles.sql} | 0 ...nchronization_funcs_to_neon_superuser.sql} | 0 compute_tools/src/spec.rs | 20 +++++++++---------- 12 files changed, 24 insertions(+), 13 deletions(-) rename compute_tools/src/migrations/{0000-neon_superuser_bypass_rls.sql => 0001-neon_superuser_bypass_rls.sql} (100%) rename compute_tools/src/migrations/{0001-alter_roles.sql => 0002-alter_roles.sql} (100%) rename compute_tools/src/migrations/{0002-grant_pg_create_subscription_to_neon_superuser.sql => 0003-grant_pg_create_subscription_to_neon_superuser.sql} (100%) rename compute_tools/src/migrations/{0003-grant_pg_monitor_to_neon_superuser.sql => 0004-grant_pg_monitor_to_neon_superuser.sql} (100%) rename compute_tools/src/migrations/{0004-grant_all_on_tables_to_neon_superuser.sql => 0005-grant_all_on_tables_to_neon_superuser.sql} (100%) rename compute_tools/src/migrations/{0005-grant_all_on_sequences_to_neon_superuser.sql => 0006-grant_all_on_sequences_to_neon_superuser.sql} (100%) rename compute_tools/src/migrations/{0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql => 0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql} (100%) rename compute_tools/src/migrations/{0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql => 0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql} (100%) rename compute_tools/src/migrations/{0008-revoke_replication_for_previously_allowed_roles.sql => 0009-revoke_replication_for_previously_allowed_roles.sql} (100%) rename compute_tools/src/migrations/{0009-grant_snapshot_synchronization_funcs_to_neon_superuser.sql => 0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql} (100%) diff --git 
a/compute_tools/src/migration.rs b/compute_tools/src/migration.rs index 61dcf01c84..241ccd4100 100644 --- a/compute_tools/src/migration.rs +++ b/compute_tools/src/migration.rs @@ -66,17 +66,28 @@ impl<'m> MigrationRunner<'m> { .context("run_migrations begin")?; while current_migration < self.migrations.len() { + macro_rules! migration_id { + ($cm:expr) => { + ($cm + 1) as i64 + }; + } + let migration = self.migrations[current_migration]; if migration.starts_with("-- SKIP") { - info!("Skipping migration id={}", current_migration); + info!("Skipping migration id={}", migration_id!(current_migration)); } else { info!( "Running migration id={}:\n{}\n", - current_migration, migration + migration_id!(current_migration), + migration ); + self.client.simple_query(migration).with_context(|| { - format!("run_migration current_migration={}", current_migration) + format!( + "run_migration migration id={}", + migration_id!(current_migration) + ) })?; } diff --git a/compute_tools/src/migrations/0000-neon_superuser_bypass_rls.sql b/compute_tools/src/migrations/0001-neon_superuser_bypass_rls.sql similarity index 100% rename from compute_tools/src/migrations/0000-neon_superuser_bypass_rls.sql rename to compute_tools/src/migrations/0001-neon_superuser_bypass_rls.sql diff --git a/compute_tools/src/migrations/0001-alter_roles.sql b/compute_tools/src/migrations/0002-alter_roles.sql similarity index 100% rename from compute_tools/src/migrations/0001-alter_roles.sql rename to compute_tools/src/migrations/0002-alter_roles.sql diff --git a/compute_tools/src/migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql b/compute_tools/src/migrations/0003-grant_pg_create_subscription_to_neon_superuser.sql similarity index 100% rename from compute_tools/src/migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql rename to compute_tools/src/migrations/0003-grant_pg_create_subscription_to_neon_superuser.sql diff --git a/compute_tools/src/migrations/0003-grant_pg_monitor_to_neon_superuser.sql b/compute_tools/src/migrations/0004-grant_pg_monitor_to_neon_superuser.sql similarity index 100% rename from compute_tools/src/migrations/0003-grant_pg_monitor_to_neon_superuser.sql rename to compute_tools/src/migrations/0004-grant_pg_monitor_to_neon_superuser.sql diff --git a/compute_tools/src/migrations/0004-grant_all_on_tables_to_neon_superuser.sql b/compute_tools/src/migrations/0005-grant_all_on_tables_to_neon_superuser.sql similarity index 100% rename from compute_tools/src/migrations/0004-grant_all_on_tables_to_neon_superuser.sql rename to compute_tools/src/migrations/0005-grant_all_on_tables_to_neon_superuser.sql diff --git a/compute_tools/src/migrations/0005-grant_all_on_sequences_to_neon_superuser.sql b/compute_tools/src/migrations/0006-grant_all_on_sequences_to_neon_superuser.sql similarity index 100% rename from compute_tools/src/migrations/0005-grant_all_on_sequences_to_neon_superuser.sql rename to compute_tools/src/migrations/0006-grant_all_on_sequences_to_neon_superuser.sql diff --git a/compute_tools/src/migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql b/compute_tools/src/migrations/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql similarity index 100% rename from compute_tools/src/migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql rename to compute_tools/src/migrations/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql diff --git 
a/compute_tools/src/migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql b/compute_tools/src/migrations/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql similarity index 100% rename from compute_tools/src/migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql rename to compute_tools/src/migrations/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql diff --git a/compute_tools/src/migrations/0008-revoke_replication_for_previously_allowed_roles.sql b/compute_tools/src/migrations/0009-revoke_replication_for_previously_allowed_roles.sql similarity index 100% rename from compute_tools/src/migrations/0008-revoke_replication_for_previously_allowed_roles.sql rename to compute_tools/src/migrations/0009-revoke_replication_for_previously_allowed_roles.sql diff --git a/compute_tools/src/migrations/0009-grant_snapshot_synchronization_funcs_to_neon_superuser.sql b/compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql similarity index 100% rename from compute_tools/src/migrations/0009-grant_snapshot_synchronization_funcs_to_neon_superuser.sql rename to compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 1d12b88c7c..6a87263821 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -777,21 +777,21 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> { // Add new migrations in numerical order. let migrations = [ - include_str!("./migrations/0000-neon_superuser_bypass_rls.sql"), - include_str!("./migrations/0001-alter_roles.sql"), - include_str!("./migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql"), - include_str!("./migrations/0003-grant_pg_monitor_to_neon_superuser.sql"), - include_str!("./migrations/0004-grant_all_on_tables_to_neon_superuser.sql"), - include_str!("./migrations/0005-grant_all_on_sequences_to_neon_superuser.sql"), + include_str!("./migrations/0001-neon_superuser_bypass_rls.sql"), + include_str!("./migrations/0002-alter_roles.sql"), + include_str!("./migrations/0003-grant_pg_create_subscription_to_neon_superuser.sql"), + include_str!("./migrations/0004-grant_pg_monitor_to_neon_superuser.sql"), + include_str!("./migrations/0005-grant_all_on_tables_to_neon_superuser.sql"), + include_str!("./migrations/0006-grant_all_on_sequences_to_neon_superuser.sql"), include_str!( - "./migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql" + "./migrations/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql" ), include_str!( - "./migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql" + "./migrations/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql" ), - include_str!("./migrations/0008-revoke_replication_for_previously_allowed_roles.sql"), + include_str!("./migrations/0009-revoke_replication_for_previously_allowed_roles.sql"), include_str!( - "./migrations/0009-grant_snapshot_synchronization_funcs_to_neon_superuser.sql" + "./migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql" ), ]; From ba17025a57bc4916b3efeb0fd068f2ada7f668a8 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Fri, 12 Jul 2024 13:38:51 -0500 Subject: [PATCH 1200/1571] Run each migration in its own transaction Previously, every migration was run in the same transaction. This is preparatory work for fixing CVE-2024-4317. 
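The shape of that change is easy to show in isolation. The sketch below is not the actual `MigrationRunner` (that follows in the diff); it is a minimal stand-alone illustration of the per-migration transaction pattern using the `postgres` crate's `simple_query`, with the `neon_migration.migration_id` table name taken from the migration code and error handling reduced to `?`.

```rust
use anyhow::Context;
use postgres::Client;

/// Apply each pending migration in its own transaction, recording the new
/// migration id only after that migration's statements have succeeded.
fn apply_migrations(
    client: &mut Client,
    migrations: &[&str],
    mut current: usize,
) -> anyhow::Result<()> {
    while current < migrations.len() {
        let id = (current + 1) as i64; // migration ids are 1-based

        client.simple_query("BEGIN").context("begin migration")?;
        client
            .simple_query(migrations[current])
            .with_context(|| format!("apply migration id={id}"))?;
        client
            .simple_query(&format!(
                "UPDATE neon_migration.migration_id SET id={id}"
            ))
            .context("record migration id")?;
        client.simple_query("COMMIT").context("commit migration")?;

        current += 1;
    }
    Ok(())
}
```

Compared with one wrapping transaction, a failure part-way through now leaves the earlier migrations committed and `migration_id` pointing at the last successful one, so a restarted compute simply resumes from the failed migration.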
--- compute_tools/src/migration.rs | 46 +++++++++++--------------- test_runner/fixtures/neon_fixtures.py | 6 ++-- test_runner/regress/test_migrations.py | 7 +--- 3 files changed, 24 insertions(+), 35 deletions(-) diff --git a/compute_tools/src/migration.rs b/compute_tools/src/migration.rs index 241ccd4100..22ab145eda 100644 --- a/compute_tools/src/migration.rs +++ b/compute_tools/src/migration.rs @@ -9,6 +9,9 @@ pub(crate) struct MigrationRunner<'m> { impl<'m> MigrationRunner<'m> { pub fn new(client: &'m mut Client, migrations: &'m [&'m str]) -> Self { + // The neon_migration.migration_id::id column is a bigint, which is equivalent to an i64 + assert!(migrations.len() + 1 < i64::MAX as usize); + Self { client, migrations } } @@ -22,11 +25,8 @@ impl<'m> MigrationRunner<'m> { Ok(row.get::<&str, i64>("id")) } - fn update_migration_id(&mut self) -> Result<()> { - let setval = format!( - "UPDATE neon_migration.migration_id SET id={}", - self.migrations.len() - ); + fn update_migration_id(&mut self, migration_id: i64) -> Result<()> { + let setval = format!("UPDATE neon_migration.migration_id SET id={}", migration_id); self.client .simple_query(&setval) @@ -57,14 +57,7 @@ impl<'m> MigrationRunner<'m> { pub fn run_migrations(mut self) -> Result<()> { self.prepare_migrations()?; - let mut current_migration: usize = self.get_migration_id()? as usize; - let starting_migration_id = current_migration; - - let query = "BEGIN"; - self.client - .simple_query(query) - .context("run_migrations begin")?; - + let mut current_migration = self.get_migration_id()? as usize; while current_migration < self.migrations.len() { macro_rules! migration_id { ($cm:expr) => { @@ -83,29 +76,30 @@ impl<'m> MigrationRunner<'m> { migration ); + self.client + .simple_query("BEGIN") + .context("begin migration")?; + self.client.simple_query(migration).with_context(|| { format!( - "run_migration migration id={}", + "run_migrations migration id={}", migration_id!(current_migration) ) })?; + + // Migration IDs start at 1 + self.update_migration_id(migration_id!(current_migration))?; + + self.client + .simple_query("COMMIT") + .context("commit migration")?; + + info!("Finished migration id={}", migration_id!(current_migration)); } current_migration += 1; } - self.update_migration_id()?; - - let query = "COMMIT"; - self.client - .simple_query(query) - .context("run_migrations commit")?; - - info!( - "Ran {} migrations", - (self.migrations.len() - starting_migration_id) - ); - Ok(()) } } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 625e9096f5..4766b72516 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3798,13 +3798,13 @@ class Endpoint(PgProtocol, LogUtils): json.dump(dict(data_dict, **kwargs), file, indent=4) # Please note: Migrations only run if pg_skip_catalog_updates is false - def wait_for_migrations(self): + def wait_for_migrations(self, num_migrations: int = 10): with self.cursor() as cur: def check_migrations_done(): cur.execute("SELECT id FROM neon_migration.migration_id") - migration_id = cur.fetchall()[0][0] - assert migration_id != 0 + migration_id: int = cur.fetchall()[0][0] + assert migration_id >= num_migrations wait_until(20, 0.5, check_migrations_done) diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py index 91bd3ea50c..880dead4e8 100644 --- a/test_runner/regress/test_migrations.py +++ b/test_runner/regress/test_migrations.py @@ -11,17 +11,14 @@ def 
test_migrations(neon_simple_env: NeonEnv): endpoint.respec(skip_pg_catalog_updates=False) endpoint.start() - endpoint.wait_for_migrations() - num_migrations = 10 + endpoint.wait_for_migrations(num_migrations=num_migrations) with endpoint.cursor() as cur: cur.execute("SELECT id FROM neon_migration.migration_id") migration_id = cur.fetchall() assert migration_id[0][0] == num_migrations - endpoint.assert_log_contains(f"INFO handle_migrations: Ran {num_migrations} migrations") - endpoint.stop() endpoint.start() # We don't have a good way of knowing that the migrations code path finished executing @@ -31,5 +28,3 @@ def test_migrations(neon_simple_env: NeonEnv): cur.execute("SELECT id FROM neon_migration.migration_id") migration_id = cur.fetchall() assert migration_id[0][0] == num_migrations - - endpoint.assert_log_contains("INFO handle_migrations: Ran 0 migrations") From b197cc20fc4d2c474eec03d57ce855203e24c704 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 15 Jul 2024 10:30:04 -0500 Subject: [PATCH 1201/1571] Hide import behind TYPE_CHECKING --- test_runner/regress/test_migrations.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py index 880dead4e8..bdc5ca907e 100644 --- a/test_runner/regress/test_migrations.py +++ b/test_runner/regress/test_migrations.py @@ -1,6 +1,10 @@ -import time +from __future__ import annotations -from fixtures.neon_fixtures import NeonEnv +import time +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv def test_migrations(neon_simple_env: NeonEnv): From 7cf59ae5b4b2ebf5a7685976cb74ae28dd25db08 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 15 Jul 2024 10:35:49 -0500 Subject: [PATCH 1202/1571] Add some typing to Endpoint.respec() --- test_runner/fixtures/neon_fixtures.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 4766b72516..2765ff916e 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3785,12 +3785,12 @@ class Endpoint(PgProtocol, LogUtils): self.endpoint_id, self.tenant_id, pageserver_id, self.active_safekeepers ) - def respec(self, **kwargs): + def respec(self, **kwargs: Any) -> None: """Update the endpoint.json file used by control_plane.""" # Read config config_path = os.path.join(self.endpoint_path(), "endpoint.json") with open(config_path, "r") as f: - data_dict = json.load(f) + data_dict: dict[str, Any] = json.load(f) # Write it back updated with open(config_path, "w") as file: From 0950866fa8728896d04ac0fdf707813299f1d621 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." Date: Tue, 16 Jul 2024 15:43:24 -0400 Subject: [PATCH 1203/1571] fix(pageserver): limit num of delta layers for l0 compaction (#8391) ## Problem close https://github.com/neondatabase/neon/issues/8389 ## Summary of changes A quick mitigation for tenants with fast writes. We compact at most 60 delta layers at a time, expecting a memory footprint of 15GB. We will pick the oldest 60 L0 layers. This should be a relatively safe change so no test is added. Question is whether to make this parameter configurable via tenant config. 
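For reference, the quoted memory figure is just the layer-count cap multiplied by the per-layer size target: assuming the 256 MiB checkpoint distance that the numbers above imply, 60 L0 deltas bound one compaction pass at 60 × 256 MiB = 15 GiB. The constants below restate the figures from this description; they are not read from pageserver configuration.

```rust
// Back-of-the-envelope check of the figures quoted above.
const MAX_L0_DELTAS_PER_PASS: u64 = 60; // figure from the description above
const ASSUMED_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024; // 256 MiB per L0 delta (assumed)

fn main() {
    let limit_bytes = MAX_L0_DELTAS_PER_PASS * ASSUMED_CHECKPOINT_DISTANCE;
    println!("{} GiB", limit_bytes / (1024 * 1024 * 1024)); // prints "15 GiB"
}
```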
--------- Signed-off-by: Alex Chi Z Co-authored-by: John Spray --- pageserver/src/tenant/timeline/compaction.rs | 31 ++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index cbb3303341..f251b667c2 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -26,6 +26,7 @@ use utils::id::TimelineId; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; +use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD}; use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc}; use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter}; use crate::tenant::timeline::{Hole, ImageLayerCreationOutcome}; @@ -415,6 +416,7 @@ impl Timeline { .map(|x| guard.get_from_desc(&x)) .collect_vec(); stats.level0_deltas_count = Some(level0_deltas.len()); + // Only compact if enough layers have accumulated. let threshold = self.get_compaction_threshold(); if level0_deltas.is_empty() || level0_deltas.len() < threshold { @@ -445,6 +447,22 @@ impl Timeline { let mut prev_lsn_end = first_level0_delta.layer_desc().lsn_range.end; let mut deltas_to_compact = Vec::with_capacity(level0_deltas.len()); + // Accumulate the size of layers in `deltas_to_compact` + let mut deltas_to_compact_bytes = 0; + + // Under normal circumstances, we will accumulate up to compaction_interval L0s of size + // checkpoint_distance each. To avoid edge cases using extra system resources, bound our + // work in this function to only operate on this much delta data at once. + // + // Take the max of the configured value & the default, so that tests that configure tiny values + // can still use a sensible amount of memory, but if a deployed system configures bigger values we + // still let them compact a full stack of L0s in one go. + let delta_size_limit = std::cmp::max( + self.get_compaction_threshold(), + DEFAULT_COMPACTION_THRESHOLD, + ) as u64 + * std::cmp::max(self.get_checkpoint_distance(), DEFAULT_CHECKPOINT_DISTANCE); + deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?); for l in level0_deltas_iter { let lsn_range = &l.layer_desc().lsn_range; @@ -453,7 +471,20 @@ impl Timeline { break; } deltas_to_compact.push(l.download_and_keep_resident().await?); + deltas_to_compact_bytes += l.metadata().file_size; prev_lsn_end = lsn_range.end; + + if deltas_to_compact_bytes >= delta_size_limit { + info!( + l0_deltas_selected = deltas_to_compact.len(), + l0_deltas_total = level0_deltas.len(), + "L0 compaction picker hit max delta layer size limit: {}", + delta_size_limit + ); + + // Proceed with compaction, but only a subset of L0s + break; + } } let lsn_range = Range { start: deltas_to_compact From f4f0869dc841374921e7fb3ff353ecbc2b2267a0 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 16 Jul 2024 20:55:29 +0100 Subject: [PATCH 1204/1571] pageserver: exclude un-read layers from short residence statistic (#8396) ## Problem The `evictions_with_low_residence_duration` is used as an indicator of cache thrashing. However, there are situations where it is quite legitimate to only have a short residence during compaction, where a delta is downloaded, used to generate an image layer, and then discarded. This can lead to false positive alerts. 
## Summary of changes - Only track low residence duration for layers that have been accessed at least once (compaction doesn't count as an access). This will give us a metric that indicates thrashing on layers that the _user_ is using, rather than those we're downloading for housekeeping purposes. Once we add "layer visibility" as an explicit property of layers, this can also be used as a cleaner condition (residence of non-visible layers should never be alertable) --- pageserver/src/tenant/storage_layer.rs | 20 ++++++++++++++++++++ pageserver/src/tenant/storage_layer/layer.rs | 20 ++++++++++++++------ test_runner/regress/test_tenant_conf.py | 11 +++++++++++ 3 files changed, 45 insertions(+), 6 deletions(-) diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 62730f88b2..2f0c45317d 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -676,6 +676,26 @@ impl LayerAccessStats { }, } } + + /// Whether this layer has been accessed (excluding in [`AccessStatsBehavior::Skip`]). + /// + /// This indicates whether the layer has been used for some purpose that would motivate + /// us to keep it on disk, such as for serving a getpage request. + fn accessed(&self) -> bool { + let locked = self.0.lock().unwrap(); + let inner = &locked.for_eviction_policy; + + // Consider it accessed if the most recent access is more recent than + // the most recent change in residence status. + match ( + inner.last_accesses.recent(), + inner.last_residence_changes.recent(), + ) { + (None, _) => false, + (Some(_), None) => true, + (Some(a), Some(r)) => a.when >= r.timestamp, + } + } } /// Get a layer descriptor from a layer. diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 02069c29d2..4500bc94dd 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1469,14 +1469,22 @@ impl LayerInner { let duration = SystemTime::now().duration_since(local_layer_mtime); match duration { Ok(elapsed) => { - timeline - .metrics - .evictions_with_low_residence_duration - .read() - .unwrap() - .observe(elapsed); + let accessed = self.access_stats.accessed(); + if accessed { + // Only layers used for reads contribute to our "low residence" metric that is used + // to detect thrashing. Layers promoted for other reasons (e.g. compaction) are allowed + // to be rapidly evicted without contributing to this metric. 
+ timeline + .metrics + .evictions_with_low_residence_duration + .read() + .unwrap() + .observe(elapsed); + } + tracing::info!( residence_millis = elapsed.as_millis(), + accessed, "evicted layer after known residence period" ); } diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index 1a8bc3b983..9fb7324fa1 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -8,6 +8,7 @@ from fixtures.neon_fixtures import ( from fixtures.pageserver.utils import assert_tenant_state, wait_for_upload from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind from fixtures.utils import wait_until +from fixtures.workload import Workload def test_tenant_config(neon_env_builder: NeonEnvBuilder): @@ -265,6 +266,13 @@ def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold( (tenant_id, timeline_id) = env.initial_tenant, env.initial_timeline ps_http = env.pageserver.http_client() + # When we evict/download layers, we will use this Workload to generate getpage requests + # that touch some layers, as otherwise the pageserver doesn't report totally unused layers + # as problems when they have short residence duration. + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(100) + def get_metric(): metrics = ps_http.get_metrics() metric = metrics.query_one( @@ -285,6 +293,7 @@ def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold( assert default_value == "1day" ps_http.download_all_layers(tenant_id, timeline_id) + workload.validate() ps_http.evict_all_layers(tenant_id, timeline_id) metric = get_metric() assert int(metric.value) > 0, "metric is updated" @@ -305,6 +314,7 @@ def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold( assert int(metric.value) == 0 ps_http.download_all_layers(tenant_id, timeline_id) + workload.validate() ps_http.evict_all_layers(tenant_id, timeline_id) metric = get_metric() assert int(metric.labels["low_threshold_secs"]) == 2 * 24 * 60 * 60 @@ -318,6 +328,7 @@ def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold( assert int(metric.value) == 0, "value resets if label changes" ps_http.download_all_layers(tenant_id, timeline_id) + workload.validate() ps_http.evict_all_layers(tenant_id, timeline_id) metric = get_metric() assert int(metric.labels["low_threshold_secs"]) == 2 * 60 * 60 From 4a90423292a2c6abec84a75d8c4cb2c3306baeed Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 16 Jul 2024 21:36:17 +0100 Subject: [PATCH 1205/1571] pageserver: reduce size of delta layer ValueRef (#8401) ## Problem ValueRef is an unnecessarily large structure, because it carries a cursor. L0 compaction currently instantiates gigabytes of these under some circumstances. ## Summary of changes - Carry a ref to the parent layer instead of a cursor, and construct a cursor on demand. This reduces RSS high watermark during L0 compaction by about 20%. 
--- .../src/tenant/storage_layer/delta_layer.rs | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 2d36ac7442..64412fe4af 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -1180,9 +1180,7 @@ impl DeltaLayerInner { let delta_key = DeltaKey::from_slice(key); let val_ref = ValueRef { blob_ref: BlobRef(value), - reader: BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter( - Adapter(self), - )), + layer: self, }; let pos = BlobRef(value).pos(); if let Some(last) = all_keys.last_mut() { @@ -1426,7 +1424,7 @@ impl DeltaLayerInner { let keys = self.load_keys(ctx).await?; async fn dump_blob(val: &ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result { - let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?; + let buf = val.load_raw(ctx).await?; let val = Value::des(&buf)?; let desc = match val { Value::Image(img) => { @@ -1461,8 +1459,7 @@ impl DeltaLayerInner { use pageserver_api::key::CHECKPOINT_KEY; use postgres_ffi::CheckPoint; if key == CHECKPOINT_KEY { - let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?; - let val = Value::des(&buf)?; + let val = val.load(ctx).await?; match val { Value::Image(img) => { let checkpoint = CheckPoint::decode(&img)?; @@ -1547,17 +1544,24 @@ pub struct DeltaEntry<'a> { /// Reference to an on-disk value pub struct ValueRef<'a> { blob_ref: BlobRef, - reader: BlockCursor<'a>, + layer: &'a DeltaLayerInner, } impl<'a> ValueRef<'a> { /// Loads the value from disk pub async fn load(&self, ctx: &RequestContext) -> Result { - // theoretically we *could* record an access time for each, but it does not really matter - let buf = self.reader.read_blob(self.blob_ref.pos(), ctx).await?; + let buf = self.load_raw(ctx).await?; let val = Value::des(&buf)?; Ok(val) } + + async fn load_raw(&self, ctx: &RequestContext) -> Result> { + let reader = BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter(Adapter( + self.layer, + ))); + let buf = reader.read_blob(self.blob_ref.pos(), ctx).await?; + Ok(buf) + } } pub(crate) struct Adapter(T); From f7131834eb55efc2d49a4e660a763d590c74a0a2 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 17 Jul 2024 15:25:35 +0100 Subject: [PATCH 1206/1571] docs/rfcs: timeline ancestor detach API (#6888) ## Problem When a tenant creates a new timeline that they will treat as their 'main' history, it is awkward to permanently retain an 'old main' timeline as its ancestor. Currently this is necessary because it is forbidden to delete a timeline which has descendents. ## Summary of changes A new pageserver API is proposed to 'adopt' data from a parent timeline into one of its children, such that the link between ancestor and child can be severed, leaving the parent in a state where it may then be deleted. 
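As a rough illustration (not part of the patch itself), a control-plane client could drive the proposed endpoint as sketched below. Only the URL shape comes from the RFC; the listen address, the placeholder IDs and the follow-up deletion step are assumptions.

```python
# Sketch only: the URL shape is taken from the RFC; the host/port, the placeholder
# IDs and the follow-up deletion note are illustrative assumptions.
import requests

PAGESERVER = "http://localhost:9898"          # assumed pageserver HTTP listen address
TENANT_ID = "<tenant id>"                     # placeholder
NEW_MAIN_TIMELINE_ID = "<child timeline id>"  # the timeline being detached

# Retryable: safe to call again if it times out part-way through.
resp = requests.put(
    f"{PAGESERVER}/v1/tenant/{TENANT_ID}/timeline/{NEW_MAIN_TIMELINE_ID}/detach_ancestor"
)
resp.raise_for_status()

# The parent is *not* deleted by this call; once nothing depends on it any more,
# it must still be deleted explicitly via the existing timeline deletion API.
```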
--------- Co-authored-by: Joonas Koivunen --- docs/rfcs/034-ancestor-deletion.md | 252 +++++++++++++++++++++++++++++ 1 file changed, 252 insertions(+) create mode 100644 docs/rfcs/034-ancestor-deletion.md diff --git a/docs/rfcs/034-ancestor-deletion.md b/docs/rfcs/034-ancestor-deletion.md new file mode 100644 index 0000000000..7341d930e2 --- /dev/null +++ b/docs/rfcs/034-ancestor-deletion.md @@ -0,0 +1,252 @@ +# Ancestor Timeline Deletion + +Created on: 2024-02-23 + +Author: John Spray + +# Summary + +When a tenant creates a new timeline that they will treat as their 'main' history, +it is awkward to permanently retain an 'old main' timeline as its ancestor. Currently +this is necessary because it is forbidden to delete a timeline which has descendents. + +A new pageserver API is proposed to 'adopt' data from a parent timeline into +one of its children, such that the link between ancestor and child can be severed, +leaving the parent in a state where it may then be deleted. + +# Motivation + +Retaining parent timelines currently has two costs: + +- Cognitive load on users, who have to remember which is the "real" main timeline. +- Storage capacity cost, as the parent timeline will retain layers up to the + child's timeline point, even if the child fully covers its keyspace with image + layers and will never actually read from the parent. + +# Solution + +A new pageserver API `PUT /v1/tenant/:tenant_id/timeline/:timeline_id/detach_ancestor` +will be added. The `timeline_id` in this URL is that of the _child_ timeline that we +wish to detach from its parent. + +On success, this API will leave the following state: + +- The detached child timeline will no longer have an ancestor, and will contain all + the data needed to service reads without recursing into an ancestor. +- Any other children of the parent whose timeline points were at a lower LSN than + the detached child timeline will be modified to have the child timeline as their + new parent. +- The parent timeline will still exist, but the child will no longer have it as an + ancestor. If this was the last timeline that depended on the parent, then the + parent will become deletable. + +This API's implementation will consist of a series of retryable steps, such that +on failures/timeout it can safely be called again to reach the target state. + +## Example + +### Before + +The user has "rolled back" their project to LSN X, resulting in a "new main" +timeline. The parent "old main" timeline still exists, and they would like +to clean it up. + +They have two other timelines A and B. A is from before the rollback point, +and B is from after the rollback point. + +``` +----"old main" timeline-------X--------------------------------------------> + | | | + |-> child A | | + |-> "new main" timeline | + -> child B + +``` + +### After calling detach ancestor API + +The "new main" timeline is no longer dependent on old main, and neither +is child A, because it had a branch point before X. + +The user may now choose to delete child B and "old main" to get to +a pristine state. Child B is likely to be unwanted since the user +chose to roll back to X, and it branches from after X. However, we +don't assume this in the API; it is up to the user to delete it. 
+ +``` +|----"old main" timeline----------------------------------------------------> + | + | + | + -> child B + +|----"new main" timeline---------> + | + |-> child A + + +``` + +### After removing timelines + +We end up with a totally clean state that leaves no trace that a rollback +ever happened: there is only one root timeline. + +``` +| ----"new main" timeline-----------> + | + |-> child A + + +``` + +## Caveats + +Important things for API users to bear in mind: + +- this API does not delete the parent timeline: you must still do that explicitly. +- if there are other child timelines ahead of the branch point of the detached + child, the parent won't be deletable: you must either delete or detach those + children. +- do _not_ simply loop over all children and detach them all: this can have an + extremely high storage cost. The detach ancestor API is intended for use on a single + timeline to make it the new "main". +- The detach ancestor API should also not be + exposed directly to the user as button/API, because they might decide + to click it for all the children and thereby generate many copies of the + parent's data -- the detach ancestor API should be used as part + of a high level "clean up after rollback" feature. + +## `detach_ancestor` API implementation + +Terms used in the following sections: + +- "the child": the timeline whose ID is specified in the detach ancestor API URL, also + called "new main" in the example. +- "the parent": the parent of "the child". Also called "old main" in the example. +- "the branch point" the ancestor_lsn of "the child" + +### Phase 1: write out adopted layers to S3 + +The child will "adopt" layers from the parent, such that its end state contains +all the parent's history as well as its own. + +For all layers in the parent's layer map whose high LSN is below the branch +point, issue S3 CopyObject requests to duplicate them into the child timeline's +prefix. Do not add them to the child's layer map yet. + +For delta layers in the parent's layer map which straddle the branch point, read them +and write out only content up to the branch point into new layer objects. + +This is a long running operation if the parent has many layers: it should be +implemented in a way that resumes rather than restarting from scratch, if the API +times out and is called again. + +As an optimization, if there are no other timelines that will be adopted into +the child, _and_ the child's image layers already full cover the branch LSN, +then we may skip adopting layers. + +### Phase 2: update the child's index + +Having written out all needed layers in phase 1, atomically link them all +into the child's IndexPart and upload to S3. This may be done while the +child Timeline is still running. + +### Phase 3: modify timelines ancestry + +Modify the child's ancestor to None, and upload its IndexPart to persist the change. + +For all timelines which have the same parent as the child, and have a branch +point lower than our branch point, switch their ancestor_timeline to the child, +and upload their IndexPart to persist the change. + +## Alternatives considered + +### Generate full image layer on child, rather than adopting parent deltas + +This would work for the case of a single child, but would prevent re-targeting +other timelines that depended on the parent. If we detached many children this +way, the storage cost would become prohibitive (consider a 1TB database with +100 child timelines: it would cost 100TiB if they all generated their own image layers). 
+ +### Don't rewrite anything: just fake it in the API + +We could add a layer of indirection that let a child "pretend" that it had no +ancestor, when in reality it still had the parent. The pageserver API could +accept deletion of ancestor timelines, and just update child metadata to make +them look like they have no ancestor. + +This would not achieve the desired reduction in storage cost, and may well be more +complex to maintain than simply implementing the API described in this RFC. + +### Avoid copying objects: enable child index to use parent layers directly + +We could teach IndexPart to store a TimelineId for each layer, such that a child +timeline could reference a parent's layers directly, rather than copying them +into the child's prefix. + +This would impose a cost for the normal case of indices that only target the +timeline's own layers, add complexity, and break the useful simplifying +invariant that timelines "own" their own path. If child timelines were +referencing layers from the parent, we would have to ensure that the parent +never runs GC/compaction again, which would make the API less flexible (the +proposal in this RFC enables deletion of the parent but doesn't require it.) + +## Performance + +### Adopting layers + +- CopyObject is a relatively cheap operation, but we may need to issue tens of thousands + of such requests: this can take up to tens of seconds and will compete for RemoteStorage + semaphore units with other activity on the pageserver. +- If we are running on storage backend that doesn't implement CopyObject, then + this part will be much more expensive as we would stream all layer content + through the pageserver. This is no different to issuing a lot + of reads to a timeline that does not have a warm local cache: it will move + a lot of gigabytes, but that shouldn't break anything. +- Generating truncated layers for delta that straddle the branch point will + require streaming read/write of all the layers in question. + +### Updating timeline ancestry + +The simplest way to update timeline ancestry will probably be to stop and start +all the Timeline objects: this is preferable to the complexity of making their +ancestry mutable at runtime. + +There will be a corresponding "stutter" in the availability of the timelines, +of the order 10-100ms, which is the time taken to upload their IndexPart, and +restart the Timeline. + +# Interaction with other features + +## Concurrent timeline creation + +If new historic timelines are created using the parent as an ancestor while the +detach ancestor API is running, they will not be re-parented to the child. This +doesn't break anything, but it leaves the parent in a state where it might not +be possible to delete it. + +Since timeline creations are an explicit user action, this is not something we need to +worry about as the storage layer: a user who wants to delete their parent timeline will not create +new children, and if they do, they can choose to delete those children to +enable deleting the parent. + +For the least surprise to the user, before starting the detach ancestor branch +operation, the control plane should wait until all branches are created and not +allow any branches to be created before the branch point on the ancestor branch +while the operation is ongoing. + +## WAL based disaster recovery + +WAL based disaster recovery currently supports only restoring of the main +branch. 
Enabling WAL based disaster recovery in the future requires that we +keep a record which timeline generated the WAL and at which LSN was a parent +detached. Keep a list of timeline ids and the LSN in which they were detached in +the `index_part.json`. Limit the size of the list to 100 first entries, after +which the WAL disaster recovery will not be possible. + +## Sharded tenants + +For sharded tenants, calls to the detach ancestor API will pass through the storage +controller, which will handle them the same as timeline creations: invoke first +on shard zero, and then on all the other shards. From f2b8e390e77c157d8f7ebef573bb226a313a8478 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Wed, 17 Jul 2024 16:56:32 +0200 Subject: [PATCH 1207/1571] Bodobolero/pgbench compare azure (#8409) ## Problem We want to run performance tests on all supported cloud providers. We want to run most tests on the postgres version which is default for new projects in production, currently (July 24) this is postgres version 16 ## Summary of changes - change default postgres version for some (performance) tests to 16 (which is our default for new projects in prod anyhow) - add azure region to pgbench_compare jobs - add azure region to pgvector benchmarking jobs - re-used project `weathered-snowflake-88107345` was prepared with 1 million embeddings running on 7 minCU 7 maxCU in azure region to compare with AWS region (pgvector indexing and hnsw queries) - see job pgbench-pgvector - Note we now have a 11 environments combinations where we run pgbench-compare and 5 are for k8s-pod (deprecated) which we can remove in the future once auto-scaling team approves. ## Logs A current run with the changes from this pull request is running here https://github.com/neondatabase/neon/actions/runs/9972096222 Note that we currently expect some failures due to - https://github.com/neondatabase/neon/issues/8275 - instability of projects on azure region --- .../actions/neon-project-create/action.yml | 4 +- .github/workflows/benchmarking.yml | 70 ++++++++++++++----- 2 files changed, 56 insertions(+), 18 deletions(-) diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index 16759ad038..d4029bd37c 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -9,8 +9,8 @@ inputs: description: 'Region ID, if not set the project will be created in the default region' default: aws-us-east-2 postgres_version: - description: 'Postgres version; default is 15' - default: '15' + description: 'Postgres version; default is 16' + default: '16' api_host: description: 'Neon API host' default: console-stage.neon.build diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index d038f64f15..d785156a29 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -59,7 +59,7 @@ jobs: strategy: matrix: include: - - DEFAULT_PG_VERSION: 14 + - DEFAULT_PG_VERSION: 16 PLATFORM: "neon-staging" region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} provisioner: 'k8s-pod' @@ -146,6 +146,7 @@ jobs: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} replication-tests: + if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install DEFAULT_PG_VERSION: 14 @@ -190,6 +191,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster 
--timeout 5400 + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -215,11 +217,14 @@ jobs: # Available platforms: # - neon-captest-new: Freshly created project (1 CU) # - neon-captest-freetier: Use freetier-sized compute (0.25 CU) + # - neonvm-captest-azure-new: Freshly created project (1 CU) in azure region + # - neonvm-captest-azure-freetier: Use freetier-sized compute (0.25 CU) in azure region # - neon-captest-reuse: Reusing existing project # - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs # - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage env: RUN_AWS_RDS_AND_AURORA: ${{ github.event.inputs.run_AWS_RDS_AND_AURORA || 'false' }} + DEFAULT_REGION_ID: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} runs-on: ubuntu-22.04 outputs: pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }} @@ -230,23 +235,33 @@ jobs: - name: Generate matrix for pgbench benchmark id: pgbench-compare-matrix run: | + region_id_default=${{ env.DEFAULT_REGION_ID }} matrix='{ + "pg_version" : [ + 16 + ], + "region_id" : [ + "'"$region_id_default"'" + ], "platform": [ "neon-captest-new", "neon-captest-reuse", "neonvm-captest-new" ], "db_size": [ "10gb" ], - "include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" }, - { "platform": "neon-captest-new", "db_size": "50gb" }, - { "platform": "neonvm-captest-freetier", "db_size": "3gb" }, - { "platform": "neonvm-captest-new", "db_size": "50gb" }, - { "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }] + "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-freetier", "db_size": "3gb" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-new", "db_size": "50gb" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }] }' if [ "$(date +%A)" = "Saturday" ]; then - matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"}, - { "platform": "rds-aurora", "db_size": "50gb"}]') + matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb"}, + { "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-aurora", "db_size": "50gb"}]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT @@ -298,7 +313,7 @@ jobs: TEST_PG_BENCH_DURATIONS_MATRIX: "60m" TEST_PG_BENCH_SCALES_MATRIX: ${{ matrix.db_size }} POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: 14 + DEFAULT_PG_VERSION: ${{ matrix.pg_version }} TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} @@ 
-323,14 +338,14 @@ jobs: prefix: latest - name: Create Neon Project - if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier"]'), matrix.platform) + if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform) id: create-neon-project uses: ./.github/actions/neon-project-create with: - region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} + region_id: ${{ matrix.region_id }} postgres_version: ${{ env.DEFAULT_PG_VERSION }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} - compute_units: ${{ (matrix.platform == 'neon-captest-freetier' && '[0.25, 0.25]') || '[1, 1]' }} + compute_units: ${{ (contains(matrix.platform, 'captest-freetier') && '[0.25, 0.25]') || '[1, 1]' }} provisioner: ${{ (contains(matrix.platform, 'neonvm-') && 'k8s-neonvm') || 'k8s-pod' }} - name: Set up Connection String @@ -343,7 +358,7 @@ jobs: neonvm-captest-sharding-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }} ;; - neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier) + neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier) CONNSTR=${{ steps.create-neon-project.outputs.dsn }} ;; rds-aurora) @@ -368,6 +383,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -381,6 +397,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -394,6 +411,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -420,6 +438,12 @@ jobs: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} pgbench-pgvector: + strategy: + matrix: + include: + - PLATFORM: "neon-captest-pgvector" + - PLATFORM: "azure-captest-pgvector" + env: TEST_PG_BENCH_DURATIONS_MATRIX: "15m" TEST_PG_BENCH_SCALES_MATRIX: "1" @@ -428,7 +452,7 @@ jobs: TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} - PLATFORM: "neon-captest-pgvector" + PLATFORM: ${{ matrix.PLATFORM }} runs-on: [ self-hosted, us-east-2, x64 ] container: @@ -448,7 +472,18 @@ jobs: - name: Set up Connection String id: set-up-connstr run: | - CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }} + case "${PLATFORM}" in + neon-captest-pgvector) + CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }} + ;; + azure-captest-pgvector) + CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR_AZURE }} + ;; + *) + echo >&2 "Unknown PLATFORM=${PLATFORM}" + exit 1 + ;; + esac echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT @@ -460,6 +495,7 @@ jobs: run_in_parallel: false 
save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgvector_indexing + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -473,6 +509,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -487,7 +524,7 @@ jobs: uses: slackapi/slack-github-action@v1 with: channel-id: "C033QLM5P7D" # dev-staging-stream - slack-message: "Periodic perf testing neon-captest-pgvector: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + slack-message: "Periodic perf testing ${PLATFORM}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} @@ -735,6 +772,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_user_examples + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" From 839a5724a4d28b775fbcab03c9e3b3643e2f0086 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." Date: Wed, 17 Jul 2024 11:22:38 -0400 Subject: [PATCH 1208/1571] test(pageserver): more k-merge tests on duplicated keys (#8404) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Existing tenants and some selection of layers might produce duplicated keys. Add tests to ensure the k-merge iterator handles it correctly. We also enforced ordering of the k-merge iterator to put images before deltas. part of https://github.com/neondatabase/neon/issues/8002 --------- Signed-off-by: Alex Chi Z Co-authored-by: Arpad Müller --- .../src/tenant/storage_layer/delta_layer.rs | 16 +- .../tenant/storage_layer/merge_iterator.rs | 163 ++++++++++++++++-- 2 files changed, 163 insertions(+), 16 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 64412fe4af..43941b6e17 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -1672,6 +1672,7 @@ pub(crate) mod test { use rand::RngCore; use super::*; + use crate::repository::Value; use crate::tenant::harness::TIMELINE_ID; use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner; use crate::tenant::Tenant; @@ -1681,6 +1682,7 @@ pub(crate) mod test { tenant::{disk_btree::tests::TestDisk, harness::TenantHarness}, DEFAULT_PG_VERSION, }; + use bytes::Bytes; /// Construct an index for a fictional delta layer and and then /// traverse in order to plan vectored reads for a query. 
Finally, @@ -2249,6 +2251,15 @@ pub(crate) mod test { (k1, l1).cmp(&(k2, l2)) } + pub(crate) fn sort_delta_value( + (k1, l1, v1): &(Key, Lsn, Value), + (k2, l2, v2): &(Key, Lsn, Value), + ) -> std::cmp::Ordering { + let order_1 = if v1.is_image() { 0 } else { 1 }; + let order_2 = if v2.is_image() { 0 } else { 1 }; + (k1, l1, order_1).cmp(&(k2, l2, order_2)) + } + pub(crate) async fn produce_delta_layer( tenant: &Tenant, tline: &Arc, @@ -2257,7 +2268,7 @@ pub(crate) mod test { ) -> anyhow::Result { deltas.sort_by(sort_delta); let (key_start, _, _) = deltas.first().unwrap(); - let (key_max, _, _) = deltas.first().unwrap(); + let (key_max, _, _) = deltas.last().unwrap(); let lsn_min = deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap(); let lsn_max = deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap(); let lsn_end = Lsn(lsn_max.0 + 1); @@ -2302,9 +2313,6 @@ pub(crate) mod test { #[tokio::test] async fn delta_layer_iterator() { - use crate::repository::Value; - use bytes::Bytes; - let harness = TenantHarness::create("delta_layer_iterator").unwrap(); let (tenant, ctx) = harness.load().await; diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs index 68759f7585..0edfd4bd40 100644 --- a/pageserver/src/tenant/storage_layer/merge_iterator.rs +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -96,15 +96,22 @@ impl<'a> std::cmp::PartialOrd for IteratorWrapper<'a> { impl<'a> std::cmp::Ord for IteratorWrapper<'a> { fn cmp(&self, other: &Self) -> std::cmp::Ordering { use std::cmp::Ordering; - let a = self.peek_next_key_lsn(); - let b = other.peek_next_key_lsn(); + let a = self.peek_next_key_lsn_value(); + let b = other.peek_next_key_lsn_value(); match (a, b) { - (Some((k1, l1)), Some((k2, l2))) => { - let loaded_1 = if self.is_loaded() { 1 } else { 0 }; - let loaded_2 = if other.is_loaded() { 1 } else { 0 }; + (Some((k1, l1, v1)), Some((k2, l2, v2))) => { + fn map_value_to_num(val: &Option<&Value>) -> usize { + match val { + None => 0, + Some(Value::Image(_)) => 1, + Some(Value::WalRecord(_)) => 2, + } + } + let order_1 = map_value_to_num(&v1); + let order_2 = map_value_to_num(&v2); // When key_lsn are the same, the unloaded iter will always appear before the loaded one. // And note that we do a reverse at the end of the comparison, so it works with the max heap. - (k1, l1, loaded_1).cmp(&(k2, l2, loaded_2)) + (k1, l1, order_1).cmp(&(k2, l2, order_2)) } (Some(_), None) => Ordering::Less, (None, Some(_)) => Ordering::Greater, @@ -137,13 +144,16 @@ impl<'a> IteratorWrapper<'a> { } } - fn peek_next_key_lsn(&self) -> Option<(&Key, Lsn)> { + fn peek_next_key_lsn_value(&self) -> Option<(&Key, Lsn, Option<&Value>)> { match self { - Self::Loaded { iter } => iter.peek().as_ref().map(|(key, lsn, _)| (key, *lsn)), + Self::Loaded { iter } => iter + .peek() + .as_ref() + .map(|(key, lsn, val)| (key, *lsn, Some(val))), Self::NotLoaded { first_key_lower_bound: (key, lsn), .. - } => Some((key, *lsn)), + } => Some((key, *lsn, None)), } } @@ -191,6 +201,13 @@ impl<'a> IteratorWrapper<'a> { } } +/// A merge iterator over delta/image layer iterators. When duplicated records are +/// found, the iterator will not perform any deduplication, and the caller should handle +/// these situation. By saying duplicated records, there are many possibilities: +/// * Two same delta at the same LSN. +/// * Two same image at the same LSN. +/// * Delta/image at the same LSN where the image has already applied the delta. 
+/// The iterator will always put the image before the delta. pub struct MergeIterator<'a> { heap: BinaryHeap>, } @@ -245,8 +262,9 @@ mod tests { use crate::{ tenant::{ harness::{TenantHarness, TIMELINE_ID}, - storage_layer::delta_layer::test::{produce_delta_layer, sort_delta}, + storage_layer::delta_layer::test::{produce_delta_layer, sort_delta, sort_delta_value}, }, + walrecord::NeonWalRecord, DEFAULT_PG_VERSION, }; @@ -407,6 +425,127 @@ mod tests { // TODO: test layers are loaded only when needed, reducing num of active iterators in k-merge } - // TODO: image layer merge, delta+image mixed merge - // TODO: is it possible to have duplicated delta at same LSN now? we might need to test that + #[tokio::test] + async fn delta_image_mixed_merge() { + use crate::repository::Value; + use bytes::Bytes; + + let harness = TenantHarness::create("merge_iterator_delta_image_mixed_merge").unwrap(); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + fn get_key(id: u32) -> Key { + let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + // In this test case, we want to test if the iterator still works correctly with multiple copies + // of a delta+image at the same LSN, for example, the following sequence a@10=+a, a@10=+a, a@10=ab, a@10=ab. + // Duplicated deltas/images are possible for old tenants before the full L0 compaction file name fix. + // An incomplete compaction could produce multiple exactly-the-same delta layers. Force image generation + // could produce overlapping images. Apart from duplicated deltas/images, in the current storage implementation + // one key-lsn could have a delta in the delta layer and one image in the image layer. The iterator should + // correctly process these situations and return everything as-is, and the upper layer of the system + // will handle duplicated LSNs. 
+ let test_deltas1 = vec![ + ( + get_key(0), + Lsn(0x10), + Value::WalRecord(NeonWalRecord::wal_init()), + ), + ( + get_key(0), + Lsn(0x18), + Value::WalRecord(NeonWalRecord::wal_append("a")), + ), + ( + get_key(5), + Lsn(0x10), + Value::WalRecord(NeonWalRecord::wal_init()), + ), + ( + get_key(5), + Lsn(0x18), + Value::WalRecord(NeonWalRecord::wal_append("b")), + ), + ]; + let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx) + .await + .unwrap(); + let mut test_deltas2 = test_deltas1.clone(); + test_deltas2.push(( + get_key(10), + Lsn(0x20), + Value::Image(Bytes::copy_from_slice(b"test")), + )); + let resident_layer_2 = produce_delta_layer(&tenant, &tline, test_deltas2.clone(), &ctx) + .await + .unwrap(); + let test_deltas3 = vec![ + ( + get_key(0), + Lsn(0x10), + Value::Image(Bytes::copy_from_slice(b"")), + ), + ( + get_key(5), + Lsn(0x18), + Value::Image(Bytes::copy_from_slice(b"b")), + ), + ( + get_key(15), + Lsn(0x20), + Value::Image(Bytes::copy_from_slice(b"test")), + ), + ]; + let resident_layer_3 = produce_delta_layer(&tenant, &tline, test_deltas3.clone(), &ctx) + .await + .unwrap(); + let mut test_deltas4 = test_deltas3.clone(); + test_deltas4.push(( + get_key(20), + Lsn(0x20), + Value::Image(Bytes::copy_from_slice(b"test")), + )); + let resident_layer_4 = produce_delta_layer(&tenant, &tline, test_deltas4.clone(), &ctx) + .await + .unwrap(); + let mut expect = Vec::new(); + expect.extend(test_deltas1); + expect.extend(test_deltas2); + expect.extend(test_deltas3); + expect.extend(test_deltas4); + expect.sort_by(sort_delta_value); + + // Test with different layer order for MergeIterator::create to ensure the order + // is stable. + + let mut merge_iter = MergeIterator::create( + &[ + resident_layer_4.get_as_delta(&ctx).await.unwrap(), + resident_layer_1.get_as_delta(&ctx).await.unwrap(), + resident_layer_3.get_as_delta(&ctx).await.unwrap(), + resident_layer_2.get_as_delta(&ctx).await.unwrap(), + ], + &[], + &ctx, + ); + assert_merge_iter_equal(&mut merge_iter, &expect).await; + + let mut merge_iter = MergeIterator::create( + &[ + resident_layer_1.get_as_delta(&ctx).await.unwrap(), + resident_layer_4.get_as_delta(&ctx).await.unwrap(), + resident_layer_3.get_as_delta(&ctx).await.unwrap(), + resident_layer_2.get_as_delta(&ctx).await.unwrap(), + ], + &[], + &ctx, + ); + assert_merge_iter_equal(&mut merge_iter, &expect).await; + } } From 975f8ac658243640c7d695e2bcc0acad3e72ccdb Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 17 Jul 2024 18:35:27 +0100 Subject: [PATCH 1209/1571] tests: add test_compaction_l0_memory (#8403) This test reproduces the case of a writer creating a deep stack of L0 layers. It uses realistic layer sizes and writes several gigabytes of data, therefore runs as a performance test although it is validating memory footprint rather than performance per se. It acts a regression test for two recent fixes: - https://github.com/neondatabase/neon/pull/8401 - https://github.com/neondatabase/neon/pull/8391 In future it will demonstrate the larger improvement of using a k-merge iterator for L0 compaction (#8184) This test can be extended to enforce limits on the memory consumption of other housekeeping steps, by restarting the pageserver and then running other things to do the same "how much did RSS increase" measurement. 
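The measurement itself boils down to something like the following sketch; only `get_metric_value("libmetrics_maxrss_kb")` is taken from the test below, the other names are placeholders.

```python
# Sketch of the "how much did RSS increase" measurement: restart the pageserver
# first so the RSS high-water mark is reset, then wrap the step under test.
def rss_delta(pageserver_http, run_step) -> int:
    def rss_hwm() -> int:
        v = pageserver_http.get_metric_value("libmetrics_maxrss_kb")
        assert v is not None and v > 0
        return int(v) * 1024  # the gauge is reported in KiB

    before = rss_hwm()
    run_step()  # e.g. trigger compaction, image layer creation, ...
    return rss_hwm() - before
```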
--- test_runner/fixtures/pageserver/http.py | 3 + test_runner/performance/test_compaction.py | 96 ++++++++++++++++++++++ 2 files changed, 99 insertions(+) diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index f1e3d1a309..c7cea4ec04 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -117,6 +117,9 @@ class LayerMapInfo: def image_layers(self) -> List[HistoricLayerInfo]: return [x for x in self.historic_layers if x.kind == "Image"] + def delta_l0_layers(self) -> List[HistoricLayerInfo]: + return [x for x in self.historic_layers if x.kind == "Delta" and x.l0] + def historic_by_name(self) -> Set[str]: return set(x.layer_file_name for x in self.historic_layers) diff --git a/test_runner/performance/test_compaction.py b/test_runner/performance/test_compaction.py index 326c4f5c6f..077b76104c 100644 --- a/test_runner/performance/test_compaction.py +++ b/test_runner/performance/test_compaction.py @@ -2,6 +2,7 @@ from contextlib import closing import pytest from fixtures.compare_fixtures import NeonCompare +from fixtures.log_helper import log from fixtures.neon_fixtures import wait_for_last_flush_lsn @@ -56,3 +57,98 @@ def test_compaction(neon_compare: NeonCompare): pageserver_http.timeline_compact(tenant_id, timeline_id) neon_compare.report_size() + + +def test_compaction_l0_memory(neon_compare: NeonCompare): + """ + Generate a large stack of L0s pending compaction into L1s, and + measure the pageserver's peak RSS while doing so + """ + + env = neon_compare.env + pageserver_http = env.pageserver.http_client() + + tenant_id, timeline_id = env.neon_cli.create_tenant( + conf={ + # Initially disable compaction so that we will build up a stack of L0s + "compaction_period": "0s", + "gc_period": "0s", + } + ) + neon_compare.tenant = tenant_id + neon_compare.timeline = timeline_id + + endpoint = env.endpoints.create_start( + "main", tenant_id=tenant_id, config_lines=["shared_buffers=512MB"] + ) + + # Read tenant effective config and assert on checkpoint_distance and compaction_threshold, + # as we do want to test with defaults (to be same as the field), but this test's workload size makes assumptions about them. + # + # If these assertions fail, it probably means we changed the default. 
+ tenant_conf = pageserver_http.tenant_config(tenant_id) + assert tenant_conf.effective_config["checkpoint_distance"] == 256 * 1024 * 1024 + assert tenant_conf.effective_config["compaction_threshold"] == 10 + + # Aim to write about 20 L0s, so that we will hit the limit on how many + # to compact at once + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + for i in range(200): + cur.execute(f"create table tbl{i} (i int, j int);") + cur.execute(f"insert into tbl{i} values (generate_series(1, 1000), 0);") + for j in range(100): + cur.execute(f"update tbl{i} set j = {j};") + + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) + endpoint.stop() + + # Check we have generated the L0 stack we expected + layers = pageserver_http.layer_map_info(tenant_id, timeline_id) + initial_l0s = len(layers.delta_l0_layers()) + initial_l0s_size = sum(x.layer_file_size for x in layers.delta_l0_layers()) + log.info(f"l0s before compaction {initial_l0s} ({initial_l0s_size})") + + def rss_hwm(): + v = pageserver_http.get_metric_value("libmetrics_maxrss_kb") + assert v is not None + assert v > 0 + return v * 1024 + + before = rss_hwm() + pageserver_http.timeline_compact(tenant_id, timeline_id) + after = rss_hwm() + + log.info(f"RSS across compaction: {before} -> {after} (grew {after - before})") + + layers = pageserver_http.layer_map_info(tenant_id, timeline_id) + final_l0s_size = sum(x.layer_file_size for x in layers.delta_l0_layers()) + log.info(f"l0s after compaction {len(layers.delta_l0_layers())} ({final_l0s_size})") + + assert after > before # If we didn't use some memory the test is probably buggy + compaction_mapped_rss = after - before + + # During L0 compaction, we require as much memory as the physical size of what we compacted, and then some, + # because the key->value mapping in L0s compaction is exhaustive, non-streaming, and does not de-duplicate + # repeated references to the same key. + # + # To be fixed in https://github.com/neondatabase/neon/issues/8184, after which + # this memory estimate can be revised far downwards to something that doesn't scale + # linearly with the layer sizes. + MEMORY_ESTIMATE = (initial_l0s_size - final_l0s_size) * 1.25 + + # If we find that compaction is using more memory, this may indicate a regression + assert compaction_mapped_rss < MEMORY_ESTIMATE + + # If we find that compaction is using <0.5 the expected memory then: + # - maybe we made a big efficiency improvement, in which case update the test + # - maybe something is functionally wrong with the test and it's not driving the system as expected + assert compaction_mapped_rss > MEMORY_ESTIMATE / 2 + + # We should have compacted some but not all of the l0s, based on the limit on how much + # l0 to compact in one go + assert len(layers.delta_l0_layers()) > 0 + assert len(layers.delta_l0_layers()) < initial_l0s + + # The pageserver should have logged when it hit the compaction size limit + env.pageserver.assert_log_contains(".*hit max delta layer size limit.*") From da84a250c69b82362af56360eeae9117d82fb94a Mon Sep 17 00:00:00 2001 From: Yuchen Liang <70461588+yliang412@users.noreply.github.com> Date: Wed, 17 Jul 2024 15:19:40 -0400 Subject: [PATCH 1210/1571] docs: update storage controller db name in doc (#8411) The db name was renamed to storage_controller from attachment_service. Doc was stale. 
--- docs/storage_controller.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/storage_controller.md b/docs/storage_controller.md index daf4d0c8b7..6d2ef929a4 100644 --- a/docs/storage_controller.md +++ b/docs/storage_controller.md @@ -44,7 +44,7 @@ If you need to modify the database schema, here’s how to create a migration: - Use `diesel migration generate ` to create a new migration - Populate the SQL files in the `migrations/` subdirectory - Use `DATABASE_URL=... diesel migration run` to apply the migration you just wrote: this will update the `[schema.rs](http://schema.rs)` file automatically. - - This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/attachment_service` + - This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/storage_controller` - Commit the migration files and the changes to schema.rs - If you need to iterate, you can rewind migrations with `diesel migration revert -a` and then `diesel migration run` again. - The migrations are build into the storage controller binary, and automatically run at startup after it is deployed, so once you’ve committed a migration no further steps are needed. From 0c236fa465f1f4691f9b814208edc7437f92fa4b Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 17 Jul 2024 21:55:20 +0100 Subject: [PATCH 1211/1571] pageserver: layer count & size metrics (#8410) ## Problem We lack insight into: - How much of a tenant's physical size is image vs. delta layers - Average sizes of image vs. delta layers - Total layer counts per timeline, indicating size of index_part object As well as general observability love, this is motivated by https://github.com/neondatabase/neon/issues/6738, where we need to define some sensible thresholds for storage amplification, and using total physical size may not work well (if someone does a lot of DROPs then it's legitimate for the physical-synthetic ratio to be huge), but the ratio between image layer size and delta layer size may be a better indicator of whether we're generating unreasonable quantities of image layers. ## Summary of changes - Add pageserver_layer_bytes and pageserver_layer_count metrics, labelled by timeline and `kind` (delta or image) - Add & subtract these with LayerInner's lifetime. I'm intentionally avoiding using a generic metric RAII guard object, to avoid bloating LayerInner: it already has all the information it needs to update metric on new+drop. 
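For example, the image-vs-delta amplification ratio mentioned above could be derived from the new gauges roughly as sketched below; the metric and label names come from this patch, while the scrape address and the use of `prometheus_client` for parsing are assumptions.

```python
# Sketch: image/delta byte ratio per timeline from the new pageserver_layer_bytes
# gauge. Metric and label names are from this patch; scrape URL and parser assumed.
from collections import defaultdict

import requests
from prometheus_client.parser import text_string_to_metric_families

text = requests.get("http://localhost:9898/metrics").text  # assumed listen address
layer_bytes = defaultdict(dict)
for family in text_string_to_metric_families(text):
    if family.name != "pageserver_layer_bytes":
        continue
    for s in family.samples:
        key = (s.labels["tenant_id"], s.labels["shard_id"], s.labels["timeline_id"])
        layer_bytes[key][s.labels["kind"]] = s.value

for key, by_kind in layer_bytes.items():
    delta = by_kind.get("delta", 0.0)
    image = by_kind.get("image", 0.0)
    ratio = image / delta if delta else float("inf")
    print("/".join(key), f"image/delta bytes ratio = {ratio:.2f}")
```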
--- pageserver/src/metrics.rs | 94 ++++++++++++++++++++ pageserver/src/tenant/storage_layer/layer.rs | 21 +++++ test_runner/fixtures/metrics.py | 2 + 3 files changed, 117 insertions(+) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index abad4b44b8..753f5524c5 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -473,6 +473,31 @@ static PITR_HISTORY_SIZE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +#[derive(strum_macros::EnumString, strum_macros::Display, strum_macros::IntoStaticStr)] +#[strum(serialize_all = "kebab_case")] +pub(crate) enum MetricLayerKind { + Delta, + Image, +} + +static TIMELINE_LAYER_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_layer_bytes", + "Sum of layer physical sizes in bytes", + &["tenant_id", "shard_id", "timeline_id", "kind"] + ) + .expect("failed to define a metric") +}); + +static TIMELINE_LAYER_COUNT: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_layer_count", + "Number of layers that exist", + &["tenant_id", "shard_id", "timeline_id", "kind"] + ) + .expect("failed to define a metric") +}); + static TIMELINE_ARCHIVE_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_archive_size", @@ -2141,6 +2166,10 @@ pub(crate) struct TimelineMetrics { pub last_record_gauge: IntGauge, pub pitr_history_size: UIntGauge, pub archival_size: UIntGauge, + pub(crate) layer_size_image: UIntGauge, + pub(crate) layer_count_image: UIntGauge, + pub(crate) layer_size_delta: UIntGauge, + pub(crate) layer_count_delta: UIntGauge, pub standby_horizon_gauge: IntGauge, pub resident_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size @@ -2223,6 +2252,42 @@ impl TimelineMetrics { .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); + let layer_size_image = TIMELINE_LAYER_SIZE + .get_metric_with_label_values(&[ + &tenant_id, + &shard_id, + &timeline_id, + MetricLayerKind::Image.into(), + ]) + .unwrap(); + + let layer_count_image = TIMELINE_LAYER_COUNT + .get_metric_with_label_values(&[ + &tenant_id, + &shard_id, + &timeline_id, + MetricLayerKind::Image.into(), + ]) + .unwrap(); + + let layer_size_delta = TIMELINE_LAYER_SIZE + .get_metric_with_label_values(&[ + &tenant_id, + &shard_id, + &timeline_id, + MetricLayerKind::Delta.into(), + ]) + .unwrap(); + + let layer_count_delta = TIMELINE_LAYER_COUNT + .get_metric_with_label_values(&[ + &tenant_id, + &shard_id, + &timeline_id, + MetricLayerKind::Delta.into(), + ]) + .unwrap(); + let standby_horizon_gauge = STANDBY_HORIZON .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); @@ -2277,6 +2342,10 @@ impl TimelineMetrics { last_record_gauge, pitr_history_size, archival_size, + layer_size_image, + layer_count_image, + layer_size_delta, + layer_count_delta, standby_horizon_gauge, resident_physical_size_gauge, current_logical_size_gauge, @@ -2338,6 +2407,31 @@ impl TimelineMetrics { let _ = TIMELINE_ARCHIVE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = PITR_HISTORY_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + MetricLayerKind::Image.into(), + ]); + let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + MetricLayerKind::Image.into(), + ]); + let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + MetricLayerKind::Delta.into(), + ]); + let _ = 
TIMELINE_LAYER_COUNT.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + MetricLayerKind::Delta.into(), + ]); + let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]); diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 4500bc94dd..dbf6c60aae 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -693,6 +693,18 @@ impl Drop for LayerInner { // and we could be delaying shutdown for nothing. } + if let Some(timeline) = self.timeline.upgrade() { + // Only need to decrement metrics if the timeline still exists: otherwise + // it will have already de-registered these metrics via TimelineMetrics::shutdown + if self.desc.is_delta() { + timeline.metrics.layer_count_delta.dec(); + timeline.metrics.layer_size_delta.sub(self.desc.file_size); + } else { + timeline.metrics.layer_count_image.dec(); + timeline.metrics.layer_size_image.sub(self.desc.file_size); + } + } + if !*self.wanted_deleted.get_mut() { return; } @@ -791,6 +803,15 @@ impl LayerInner { (heavier_once_cell::OnceCell::default(), 0, Status::Evicted) }; + // This object acts as a RAII guard on these metrics: increment on construction + if desc.is_delta() { + timeline.metrics.layer_count_delta.inc(); + timeline.metrics.layer_size_delta.add(desc.file_size); + } else { + timeline.metrics.layer_count_image.inc(); + timeline.metrics.layer_size_image.add(desc.file_size); + } + LayerInner { conf, debug_str: { diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index c019cbbc77..4836d42db5 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -146,6 +146,8 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( "pageserver_smgr_query_seconds_sum", "pageserver_archive_size", "pageserver_pitr_history_size", + "pageserver_layer_bytes", + "pageserver_layer_count", "pageserver_storage_operations_seconds_count_total", "pageserver_storage_operations_seconds_sum_total", "pageserver_evictions_total", From e250b9e063b27db724032e2c0f0971cc67bb7130 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 18 Jul 2024 00:03:02 +0300 Subject: [PATCH 1212/1571] test: allow requests to any pageserver get cancelled (#8413) Fix flakyness on `test_sharded_timeline_detach_ancestor` which does not reproduce on a fast enough runner by allowing cancelled request before completing on all pageservers. It was only allowed on half of the pageservers. 
Failure evidence: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8352/9972357040/index.html#suites/a1c2be32556270764423c495fad75d47/7cca3e3d94fe12f2 --- .../regress/test_timeline_detach_ancestor.py | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index d75ab4c060..38f8dfa885 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -702,20 +702,16 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder): # make another of the nodes get stuck, then restart stuck = pageservers[int(shards[0]["node_id"])] - stuck.allowed_errors.append(".*: request was dropped before completing") - env.storage_controller.allowed_errors.append(".*: request was dropped before completing") + log.info(f"stuck pageserver is id={stuck.id}") stuck_http = stuck.http_client() stuck_http.configure_failpoints( ("timeline-detach-ancestor::before_starting_after_locking_pausable", "pause") ) restarted = pageservers[int(shards[1]["node_id"])] - restarted.allowed_errors.extend( - [ - ".*: request was dropped before completing", - ".*: Cancelled request finished with an error: ShuttingDown", - ] - ) + log.info(f"restarted pageserver is id={restarted.id}") + # this might be hit; see `restart_restarted` + restarted.allowed_errors.append(".*: Cancelled request finished with an error: ShuttingDown") assert restarted.id != stuck.id restarted_http = restarted.http_client() restarted_http.configure_failpoints( @@ -724,6 +720,14 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder): ] ) + for info in shards: + pageserver = pageservers[int(info["node_id"])] + # the first request can cause these, but does not repeatedly + pageserver.allowed_errors.append(".*: request was dropped before completing") + + # first request again + env.storage_controller.allowed_errors.append(".*: request was dropped before completing") + target = env.storage_controller.pageserver_api() with pytest.raises(ReadTimeout): From 1303d477789a4f131cf04c974b6d0846be88a0f5 Mon Sep 17 00:00:00 2001 From: dotdister Date: Thu, 18 Jul 2024 17:33:46 +0900 Subject: [PATCH 1213/1571] Fix comment in Control Plane (#8406) ## Problem There are something wrong in the comment of `control_plane/src/broker.rs` and `control_plane/src/pageserver.rs` ## Summary of changes Fixed the comment about component name and their data path in `control_plane/src/broker.rs` and `control_plane/src/pageserver.rs`. --- control_plane/src/broker.rs | 4 ++-- control_plane/src/pageserver.rs | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/control_plane/src/broker.rs b/control_plane/src/broker.rs index c3cfc140da..c8ac5d8981 100644 --- a/control_plane/src/broker.rs +++ b/control_plane/src/broker.rs @@ -1,9 +1,9 @@ //! Code to manage the storage broker //! -//! In the local test environment, the data for each safekeeper is stored in +//! In the local test environment, the storage broker stores its data directly in //! //! ```text -//! .neon/safekeepers/ +//! .neon //! ``` use std::time::Duration; diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 5f2373e95a..e3d1d0e110 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -1,8 +1,10 @@ //! Code to manage pageservers //! -//! 
In the local test environment, the pageserver stores its data directly in +//! In the local test environment, the data for each pageserver is stored in //! -//! .neon/ +//! ```text +//! .neon/pageserver_ +//! ``` //! use std::collections::HashMap; From a2d170b6d06a1ccc8eba3aadfaf7bbf16007978c Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 18 Jul 2024 10:56:07 +0200 Subject: [PATCH 1214/1571] NeonEnv.from_repo_dir: use storage_controller_db instead of `attachments.json` (#8382) When `NeonEnv.from_repo_dir` was introduced, storage controller stored its state exclusively `attachments.json`. Since then, it has moved to using Postgres, which stores its state in `storage_controller_db`. But `NeonEnv.from_repo_dir` wasn't adjusted to do this. This PR rectifies the situation. Context for this is failures in `test_pageserver_characterize_throughput_with_n_tenants` CF: https://neondb.slack.com/archives/C033RQ5SPDH/p1721035799502239?thread_ts=1720901332.293769&cid=C033RQ5SPDH Notably, `from_repo_dir` is also used by the backwards- and forwards-compatibility. Thus, the changes in this PR affect those tests as well. However, it turns out that the compatibility snapshot already contains the `storage_controller_db`. Thus, it should just work and in fact we can remove hacks like `fixup_storage_controller`. Follow-ups created as part of this work: * https://github.com/neondatabase/neon/issues/8399 * https://github.com/neondatabase/neon/issues/8400 --- Cargo.lock | 27 +++++ Cargo.toml | 1 + control_plane/Cargo.toml | 1 + control_plane/src/storage_controller.rs | 87 +++++++++++---- storage_controller/src/main.rs | 19 +--- storage_controller/src/persistence.rs | 100 ++---------------- test_runner/fixtures/neon_fixtures.py | 27 ++++- ...er_max_throughput_getpage_at_latest_lsn.py | 8 -- test_runner/regress/test_compatibility.py | 25 ----- 9 files changed, 133 insertions(+), 162 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8897364701..d08da0babd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1368,6 +1368,7 @@ dependencies = [ "tracing", "url", "utils", + "whoami", "workspace_hack", ] @@ -4603,6 +4604,15 @@ dependencies = [ "bitflags 1.3.2", ] +[[package]] +name = "redox_syscall" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" +dependencies = [ + "bitflags 1.3.2", +] + [[package]] name = "regex" version = "1.10.2" @@ -6972,6 +6982,12 @@ version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" +[[package]] +name = "wasite" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b" + [[package]] name = "wasm-bindgen" version = "0.2.92" @@ -7124,6 +7140,17 @@ dependencies = [ "once_cell", ] +[[package]] +name = "whoami" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a44ab49fad634e88f55bf8f9bb3abd2f27d7204172a112c7c9987e01c1c94ea9" +dependencies = [ + "redox_syscall 0.4.1", + "wasite", + "web-sys", +] + [[package]] name = "winapi" version = "0.3.9" diff --git a/Cargo.toml b/Cargo.toml index 4f42203683..b9b4bafb4f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -191,6 +191,7 @@ uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] } walkdir = "2.3.2" rustls-native-certs = "0.7" x509-parser = "0.15" 
+whoami = "1.5.1" ## TODO replace this with tracing env_logger = "0.10" diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index e62f3b8a47..487ac8f047 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -40,6 +40,7 @@ safekeeper_api.workspace = true postgres_connection.workspace = true storage_broker.workspace = true utils.workspace = true +whoami.workspace = true compute_api.workspace = true workspace_hack.workspace = true diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 47103a2e0a..d7aedd711a 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -29,7 +29,6 @@ use utils::{ pub struct StorageController { env: LocalEnv, listen: String, - path: Utf8PathBuf, private_key: Option>, public_key: Option, postgres_port: u16, @@ -41,6 +40,8 @@ const COMMAND: &str = "storage_controller"; const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16; +const DB_NAME: &str = "storage_controller"; + #[derive(Serialize, Deserialize)] pub struct AttachHookRequest { pub tenant_shard_id: TenantShardId, @@ -65,10 +66,6 @@ pub struct InspectResponse { impl StorageController { pub fn from_env(env: &LocalEnv) -> Self { - let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone()) - .unwrap() - .join("attachments.json"); - // Makes no sense to construct this if pageservers aren't going to use it: assume // pageservers have control plane API set let listen_url = env.control_plane_api.clone().unwrap(); @@ -128,7 +125,6 @@ impl StorageController { Self { env: env.clone(), - path, listen, private_key, public_key, @@ -203,7 +199,6 @@ impl StorageController { /// /// Returns the database url pub async fn setup_database(&self) -> anyhow::Result { - const DB_NAME: &str = "storage_controller"; let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port); let pg_bin_dir = self.get_pg_bin_dir().await?; @@ -232,6 +227,30 @@ impl StorageController { Ok(database_url) } + pub async fn connect_to_database( + &self, + ) -> anyhow::Result<( + tokio_postgres::Client, + tokio_postgres::Connection, + )> { + tokio_postgres::Config::new() + .host("localhost") + .port(self.postgres_port) + // The user is the ambient operating system user name. + // That is an impurity which we want to fix in => TODO https://github.com/neondatabase/neon/issues/8400 + // + // Until we get there, use the ambient operating system user name. + // Recent tokio-postgres versions default to this if the user isn't specified. + // But tokio-postgres fork doesn't have this upstream commit: + // https://github.com/sfackler/rust-postgres/commit/cb609be758f3fb5af537f04b584a2ee0cebd5e79 + // => we should rebase our fork => TODO https://github.com/neondatabase/neon/issues/8399 + .user(&whoami::username()) + .dbname(DB_NAME) + .connect(tokio_postgres::NoTls) + .await + .map_err(anyhow::Error::new) + } + pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> { // Start a vanilla Postgres process used by the storage controller for persistence. let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone()) @@ -256,18 +275,21 @@ impl StorageController { if !status.success() { anyhow::bail!("initdb failed with status {status}"); } - - // Write a minimal config file: - // - Specify the port, since this is chosen dynamically - // - Switch off fsync, since we're running on lightweight test environments and when e.g. 
scale testing - // the storage controller we don't want a slow local disk to interfere with that. - tokio::fs::write( - &pg_data_path.join("postgresql.conf"), - format!("port = {}\nfsync=off\n", self.postgres_port), - ) - .await?; }; + // Write a minimal config file: + // - Specify the port, since this is chosen dynamically + // - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing + // the storage controller we don't want a slow local disk to interfere with that. + // + // NB: it's important that we rewrite this file on each start command so we propagate changes + // from `LocalEnv`'s config file (`.neon/config`). + tokio::fs::write( + &pg_data_path.join("postgresql.conf"), + format!("port = {}\nfsync=off\n", self.postgres_port), + ) + .await?; + println!("Starting storage controller database..."); let db_start_args = [ "-w", @@ -296,11 +318,38 @@ impl StorageController { // Run migrations on every startup, in case something changed. let database_url = self.setup_database().await?; + // We support running a startup SQL script to fiddle with the database before we launch storcon. + // This is used by the test suite. + let startup_script_path = self + .env + .base_data_dir + .join("storage_controller_db.startup.sql"); + let startup_script = match tokio::fs::read_to_string(&startup_script_path).await { + Ok(script) => { + tokio::fs::remove_file(startup_script_path).await?; + script + } + Err(e) => { + if e.kind() == std::io::ErrorKind::NotFound { + // always run some startup script so that this code path doesn't bit rot + "BEGIN; COMMIT;".to_string() + } else { + anyhow::bail!("Failed to read startup script: {e}") + } + } + }; + let (mut client, conn) = self.connect_to_database().await?; + let conn = tokio::spawn(conn); + let tx = client.build_transaction(); + let tx = tx.start().await?; + tx.batch_execute(&startup_script).await?; + tx.commit().await?; + drop(client); + conn.await??; + let mut args = vec![ "-l", &self.listen, - "-p", - self.path.as_ref(), "--dev", "--database-url", &database_url, diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index f1eb0b30fc..4bf6b528f4 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -1,5 +1,4 @@ use anyhow::{anyhow, Context}; -use camino::Utf8PathBuf; use clap::Parser; use diesel::Connection; use metrics::launch_timestamp::LaunchTimestamp; @@ -51,10 +50,6 @@ struct Cli { #[arg(long)] compute_hook_url: Option, - /// Path to the .json file to store state (will be created if it doesn't exist) - #[arg(short, long)] - path: Option, - /// URL to connect to postgres, like postgresql://localhost:1234/storage_controller #[arg(long)] database_url: Option, @@ -206,11 +201,10 @@ async fn async_main() -> anyhow::Result<()> { let args = Cli::parse(); tracing::info!( - "version: {}, launch_timestamp: {}, build_tag {}, state at {}, listening on {}", + "version: {}, launch_timestamp: {}, build_tag {}, listening on {}", GIT_VERSION, launch_ts.to_string(), BUILD_TAG, - args.path.as_ref().unwrap_or(&Utf8PathBuf::from("")), args.listen ); @@ -277,8 +271,7 @@ async fn async_main() -> anyhow::Result<()> { .await .context("Running database migrations")?; - let json_path = args.path; - let persistence = Arc::new(Persistence::new(secrets.database_url, json_path.clone())); + let persistence = Arc::new(Persistence::new(secrets.database_url)); let service = Service::spawn(config, persistence.clone()).await?; @@ -316,14 +309,6 @@ async fn async_main() -> anyhow::Result<()> { 
} tracing::info!("Terminating on signal"); - if json_path.is_some() { - // Write out a JSON dump on shutdown: this is used in compat tests to avoid passing - // full postgres dumps around. - if let Err(e) = persistence.write_tenants_json().await { - tracing::error!("Failed to write JSON on shutdown: {e}") - } - } - // Stop HTTP server first, so that we don't have to service requests // while shutting down Service server_shutdown.cancel(); diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 9f7b2f775e..d8f31e86e5 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -5,8 +5,6 @@ use std::time::Duration; use std::time::Instant; use self::split_state::SplitState; -use camino::Utf8Path; -use camino::Utf8PathBuf; use diesel::pg::PgConnection; use diesel::prelude::*; use diesel::Connection; @@ -55,11 +53,6 @@ use crate::node::Node; /// we can UPDATE a node's scheduling mode reasonably quickly to mark a bad node offline. pub struct Persistence { connection_pool: diesel::r2d2::Pool>, - - // In test environments, we support loading+saving a JSON file. This is temporary, for the benefit of - // test_compatibility.py, so that we don't have to commit to making the database contents fully backward/forward - // compatible just yet. - json_path: Option, } /// Legacy format, for use in JSON compat objects in test environment @@ -124,7 +117,7 @@ impl Persistence { const IDLE_CONNECTION_TIMEOUT: Duration = Duration::from_secs(10); const MAX_CONNECTION_LIFETIME: Duration = Duration::from_secs(60); - pub fn new(database_url: String, json_path: Option) -> Self { + pub fn new(database_url: String) -> Self { let manager = diesel::r2d2::ConnectionManager::::new(database_url); // We will use a connection pool: this is primarily to _limit_ our connection count, rather than to optimize time @@ -139,10 +132,7 @@ impl Persistence { .build(manager) .expect("Could not build connection pool"); - Self { - connection_pool, - json_path, - } + Self { connection_pool } } /// A helper for use during startup, where we would like to tolerate concurrent restarts of the @@ -302,85 +292,13 @@ impl Persistence { /// At startup, load the high level state for shards, such as their config + policy. This will /// be enriched at runtime with state discovered on pageservers. pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult> { - let loaded = self - .with_measured_conn( - DatabaseOperation::ListTenantShards, - move |conn| -> DatabaseResult<_> { - Ok(crate::schema::tenant_shards::table.load::(conn)?) - }, - ) - .await?; - - if loaded.is_empty() { - if let Some(path) = &self.json_path { - if tokio::fs::try_exists(path) - .await - .map_err(|e| DatabaseError::Logical(format!("Error stat'ing JSON file: {e}")))? 
- { - tracing::info!("Importing from legacy JSON format at {path}"); - return self.list_tenant_shards_json(path).await; - } - } - } - Ok(loaded) - } - - /// Shim for automated compatibility tests: load tenants from a JSON file instead of database - pub(crate) async fn list_tenant_shards_json( - &self, - path: &Utf8Path, - ) -> DatabaseResult> { - let bytes = tokio::fs::read(path) - .await - .map_err(|e| DatabaseError::Logical(format!("Failed to load JSON: {e}")))?; - - let mut decoded = serde_json::from_slice::(&bytes) - .map_err(|e| DatabaseError::Logical(format!("Deserialization error: {e}")))?; - for shard in decoded.tenants.values_mut() { - if shard.placement_policy == "\"Single\"" { - // Backward compat for test data after PR https://github.com/neondatabase/neon/pull/7165 - shard.placement_policy = "{\"Attached\":0}".to_string(); - } - - if shard.scheduling_policy.is_empty() { - shard.scheduling_policy = - serde_json::to_string(&ShardSchedulingPolicy::default()).unwrap(); - } - } - - let tenants: Vec = decoded.tenants.into_values().collect(); - - // Synchronize database with what is in the JSON file - self.insert_tenant_shards(tenants.clone()).await?; - - Ok(tenants) - } - - /// For use in testing environments, where we dump out JSON on shutdown. - pub async fn write_tenants_json(&self) -> anyhow::Result<()> { - let Some(path) = &self.json_path else { - anyhow::bail!("Cannot write JSON if path isn't set (test environment bug)"); - }; - tracing::info!("Writing state to {path}..."); - let tenants = self.list_tenant_shards().await?; - let mut tenants_map = HashMap::new(); - for tsp in tenants { - let tenant_shard_id = TenantShardId { - tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?, - shard_number: ShardNumber(tsp.shard_number as u8), - shard_count: ShardCount::new(tsp.shard_count as u8), - }; - - tenants_map.insert(tenant_shard_id, tsp); - } - let json = serde_json::to_string(&JsonPersistence { - tenants: tenants_map, - })?; - - tokio::fs::write(path, &json).await?; - tracing::info!("Wrote {} bytes to {path}...", json.len()); - - Ok(()) + self.with_measured_conn( + DatabaseOperation::ListTenantShards, + move |conn| -> DatabaseResult<_> { + Ok(crate::schema::tenant_shards::table.load::(conn)?) + }, + ) + .await } /// Tenants must be persisted before we schedule them for the first time. 
This enables us diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 2765ff916e..fcfd4ea676 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -31,6 +31,7 @@ import backoff import httpx import jwt import psycopg2 +import psycopg2.sql import pytest import requests import toml @@ -727,8 +728,30 @@ class NeonEnvBuilder: self.repo_dir / "local_fs_remote_storage", ) - if (attachments_json := Path(repo_dir / "attachments.json")).exists(): - shutil.copyfile(attachments_json, self.repo_dir / attachments_json.name) + # restore storage controller (the db is small, don't bother with overlayfs) + storcon_db_from_dir = repo_dir / "storage_controller_db" + storcon_db_to_dir = self.repo_dir / "storage_controller_db" + log.info(f"Copying storage_controller_db from {storcon_db_from_dir} to {storcon_db_to_dir}") + assert storcon_db_from_dir.is_dir() + assert not storcon_db_to_dir.exists() + + def ignore_postgres_log(path: str, _names): + if Path(path) == storcon_db_from_dir: + return {"postgres.log"} + return set() + + shutil.copytree(storcon_db_from_dir, storcon_db_to_dir, ignore=ignore_postgres_log) + assert not (storcon_db_to_dir / "postgres.log").exists() + # NB: neon_local rewrites postgresql.conf on each start based on neon_local config. No need to patch it. + # However, in this new NeonEnv, the pageservers listen on different ports, and the storage controller + # will currently reject re-attach requests from them because the NodeMetadata isn't identical. + # So, from_repo_dir patches up the the storcon database. + patch_script_path = self.repo_dir / "storage_controller_db.startup.sql" + assert not patch_script_path.exists() + patch_script = "" + for ps in self.env.pageservers: + patch_script += f"UPDATE nodes SET listen_http_port={ps.service_port.http}, listen_pg_port={ps.service_port.pg} WHERE node_id = '{ps.id}';" + patch_script_path.write_text(patch_script) # Update the config with info about tenants and timelines with (self.repo_dir / "config").open("r") as f: diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py index 60861cf939..949813c984 100644 --- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py +++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py @@ -255,11 +255,3 @@ def run_pagebench_benchmark( unit="ms", report=MetricReport.LOWER_IS_BETTER, ) - - env.storage_controller.allowed_errors.append( - # The test setup swaps NeonEnv instances, hence different - # pg instances are used for the storage controller db. This means - # the storage controller doesn't know about the nodes mentioned - # in attachments.json at start-up. - ".* Scheduler missing node 1", - ) diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 1e5e320e0e..65649e0c0a 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -93,29 +93,6 @@ check_ondisk_data_compatibility_if_enabled = pytest.mark.skipif( ) -def fixup_storage_controller(env: NeonEnv): - """ - After importing a repo_dir, we need to massage the storage controller's state a bit: it will have - initially started up with no nodes, but some tenants, and thereby those tenants won't be scheduled - anywhere. 
- - After NeonEnv.start() is done (i.e. nodes are started + registered), call this function to get - the storage controller into a good state. - - This function should go away once compat tests carry the controller database in their snapshots, so - that the controller properly remembers nodes between creating + restoring the snapshot. - """ - env.storage_controller.allowed_errors.extend( - [ - ".*Tenant shard .+ references non-existent node.*", - ".*Failed to schedule tenant .+ at startup.*", - ] - ) - env.storage_controller.stop() - env.storage_controller.start() - env.storage_controller.reconcile_until_idle() - - @pytest.mark.xdist_group("compatibility") @pytest.mark.order(before="test_forward_compatibility") def test_create_snapshot( @@ -198,7 +175,6 @@ def test_backward_compatibility( neon_env_builder.num_safekeepers = 3 env = neon_env_builder.from_repo_dir(compatibility_snapshot_dir / "repo") neon_env_builder.start() - fixup_storage_controller(env) check_neon_works( env, @@ -287,7 +263,6 @@ def test_forward_compatibility( assert not env.pageserver.log_contains("git-env:" + prev_pageserver_version) neon_env_builder.start() - fixup_storage_controller(env) # ensure the specified pageserver is running assert env.pageserver.log_contains("git-env:" + prev_pageserver_version) From 7672e49ab530eed265b39bf62c3d44e7750f8303 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 18 Jul 2024 10:14:56 +0100 Subject: [PATCH 1215/1571] tests: fix metrics check in test_s3_eviction (#8419) ## Problem This test would occasionally fail its metric check. This could happen in the rare case that the nodes had all been restarted before their most recent eviction. The metric check was added in https://github.com/neondatabase/neon/pull/8348 ## Summary of changes - Check metrics before each restart, accumulate into a bool that we assert on at the end of the test --- test_runner/regress/test_wal_acceptor.py | 43 +++++++++++++----------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 2e906e6160..f02f19c588 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -2242,6 +2242,8 @@ def test_s3_eviction( check_values = [0] * n_timelines + event_metrics_seen = False + n_iters = 20 for _ in range(n_iters): if log.isEnabledFor(logging.DEBUG): @@ -2266,6 +2268,27 @@ def test_s3_eviction( # update remote_consistent_lsn on pageserver ps_client.timeline_checkpoint(env.initial_tenant, timelines[i], wait_until_uploaded=True) + # Do metrics check before restarts, since these will reset to zero across a restart + event_metrics_seen |= any( + sk.http_client().get_metric_value( + "safekeeper_eviction_events_started_total", {"kind": "evict"} + ) + or 0 > 0 + and sk.http_client().get_metric_value( + "safekeeper_eviction_events_completed_total", {"kind": "evict"} + ) + or 0 > 0 + and sk.http_client().get_metric_value( + "safekeeper_eviction_events_started_total", {"kind": "restore"} + ) + or 0 > 0 + and sk.http_client().get_metric_value( + "safekeeper_eviction_events_completed_total", {"kind": "restore"} + ) + or 0 > 0 + for sk in env.safekeepers + ) + # restarting random safekeepers for sk in env.safekeepers: if random.random() < restart_chance: @@ -2280,22 +2303,4 @@ def test_s3_eviction( for sk in env.safekeepers ) - assert any( - sk.http_client().get_metric_value( - "safekeeper_eviction_events_started_total", {"kind": "evict"} - ) - or 0 > 0 - and 
sk.http_client().get_metric_value( - "safekeeper_eviction_events_completed_total", {"kind": "evict"} - ) - or 0 > 0 - and sk.http_client().get_metric_value( - "safekeeper_eviction_events_started_total", {"kind": "restore"} - ) - or 0 > 0 - and sk.http_client().get_metric_value( - "safekeeper_eviction_events_completed_total", {"kind": "restore"} - ) - or 0 > 0 - for sk in env.safekeepers - ) + assert event_metrics_seen From 9ded2556dfe104c76793e04e7b3fde44b83714d3 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 18 Jul 2024 10:23:17 +0100 Subject: [PATCH 1216/1571] tests: increase test_pg_regress and test_isolation timeouts (#8418) ## Problem These tests time out ~1 in 50 runs when in debug mode. There is no indication of a real issue: they're just wrappers that have large numbers of individual tests contained within on pytest case. ## Summary of changes - Bump pg_regress timeout from 600 to 900s - Bump test_isolation timeout from 300s (default) to 600s In future it would be nice to break out these tests to run individual cases (or batches thereof) as separate tests, rather than this monolith. --- test_runner/regress/test_pg_regress.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 54b493ec70..d5b5ac3f75 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -117,7 +117,7 @@ def post_checks(env: NeonEnv, test_output_dir: Path, db_name: str, endpoint: End # Run the main PostgreSQL regression tests, in src/test/regress. # -@pytest.mark.timeout(600) +@pytest.mark.timeout(900) # Contains many sub-tests, is slow in debug builds @pytest.mark.parametrize("shard_count", [None, 4]) def test_pg_regress( neon_env_builder: NeonEnvBuilder, @@ -186,6 +186,7 @@ def test_pg_regress( # Run the PostgreSQL "isolation" tests, in src/test/isolation. # +@pytest.mark.timeout(600) # Contains many sub-tests, is slow in debug builds @pytest.mark.parametrize("shard_count", [None, 4]) def test_isolation( neon_env_builder: NeonEnvBuilder, From b46175532678fc650bd32a2bbd281a3813e4773e Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 18 Jul 2024 12:59:14 +0100 Subject: [PATCH 1217/1571] tests: turn on safekeeper eviction by default (#8352) ## Problem Ahead of enabling eviction in the field, where it will become the normal/default mode, let's enable it by default throughout our tests in case any issues become visible there. ## Summary of changes - Make default `extra_opts` for safekeepers enable offload & deletion - Set low timeouts in `extra_opts` so that tests running for tens of seconds have a chance to hit some of these background operations. --- test_runner/fixtures/neon_fixtures.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index fcfd4ea676..567ca532f9 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4077,6 +4077,22 @@ class Safekeeper(LogUtils): self.id = id self.running = running self.logfile = Path(self.data_dir) / f"safekeeper-{id}.log" + + if extra_opts is None: + # Testing defaults: enable everything, and set short timeouts so that background + # work will happen during short tests. + # **Note**: Any test that explicitly sets extra_opts will not get these defaults. 
+ extra_opts = [ + "--enable-offload", + "--delete-offloaded-wal", + "--partial-backup-timeout", + "10s", + "--control-file-save-interval", + "1s", + "--eviction-min-resident", + "10s", + ] + self.extra_opts = extra_opts def start( From d263b1804e70d3adf482e740fb9ed20e3fbcbe09 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 18 Jul 2024 13:46:00 +0100 Subject: [PATCH 1218/1571] Fix partial upload bug with invalid remote state (#8383) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We have an issue that some partial uploaded segments can be actually missing in remote storage. I found this issue when was looking at the logs in staging, and it can be triggered by failed uploads: 1. Code tries to upload `SEG_TERM_LSN_LSN_sk5.partial`, but receives error from S3 2. The failed attempt is saved to `segments` vec 3. After some time, the code tries to upload `SEG_TERM_LSN_LSN_sk5.partial` again 4. This time the upload is successful and code calls `gc()` to delete previous uploads 5. Since new object and old object share the same name, uploaded data gets deleted from remote storage This commit fixes the issue by patching `gc()` not to delete objects with the same name as currently uploaded. --------- Co-authored-by: Arpad Müller --- safekeeper/src/timeline_eviction.rs | 5 +---- safekeeper/src/wal_backup_partial.rs | 12 ++++++++++++ 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs index 0b8d58ee8a..7947d83eb4 100644 --- a/safekeeper/src/timeline_eviction.rs +++ b/safekeeper/src/timeline_eviction.rs @@ -199,10 +199,7 @@ async fn redownload_partial_segment( file.flush().await?; let final_path = local_segment_path(mgr, partial); - info!( - "downloaded {} bytes, renaming to {}", - final_path, final_path, - ); + info!("downloaded {actual_len} bytes, renaming to {final_path}"); if let Err(e) = durable_rename(&tmp_file, &final_path, !mgr.conf.no_sync).await { // Probably rename succeeded, but fsync of it failed. Remove // the file then to avoid using it. diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index 825851c97c..b1efa9749f 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -289,6 +289,18 @@ impl PartialBackup { }) .collect(); + if new_segments.len() == 1 { + // we have an uploaded segment, it must not be deleted from remote storage + segments_to_delete.retain(|name| name != &new_segments[0].name); + } else { + // there should always be zero or one uploaded segment + assert!( + new_segments.is_empty(), + "too many uploaded segments: {:?}", + new_segments + ); + } + info!("deleting objects: {:?}", segments_to_delete); let mut objects_to_delete = vec![]; for seg in segments_to_delete.iter() { From a4434cf1c0b42133de1196c8adc5468637bbb8eb Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." Date: Thu, 18 Jul 2024 12:16:44 -0400 Subject: [PATCH 1219/1571] pageserver: integrate k-merge with bottom-most compaction (#8415) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the k-merge iterator in the compaction process to reduce memory footprint. 
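For intuition, a k-way merge over per-layer streams that are already sorted by (key, lsn) only needs to keep one "head" entry per input in memory, instead of collecting every key/value pair up front. Below is a minimal standalone sketch of that general technique using a binary heap — an illustration only, not the actual `MergeIterator` API (which is async and also has to settle ordering between images and deltas at the same (key, lsn)):

```rust
// Minimal sketch of a k-way merge over sorted inputs; illustrative only,
// not the pageserver's MergeIterator implementation.
use std::cmp::Reverse;
use std::collections::BinaryHeap;

/// Merge individually sorted (key, lsn) streams into one globally sorted
/// stream, buffering only one head element per input.
fn k_merge(inputs: Vec<Vec<(u64, u64)>>) -> impl Iterator<Item = (u64, u64)> {
    let mut iters: Vec<_> = inputs.into_iter().map(|v| v.into_iter()).collect();
    let mut heap = BinaryHeap::new();
    // Seed the heap with the first element of every input.
    for (idx, it) in iters.iter_mut().enumerate() {
        if let Some(head) = it.next() {
            heap.push(Reverse((head, idx)));
        }
    }
    std::iter::from_fn(move || {
        // Pop the globally smallest head, then refill from the same input.
        let Reverse((item, idx)) = heap.pop()?;
        if let Some(next) = iters[idx].next() {
            heap.push(Reverse((next, idx)));
        }
        Some(item)
    })
}

fn main() {
    let merged: Vec<_> =
        k_merge(vec![vec![(1, 10), (3, 20)], vec![(2, 15), (3, 5)]]).collect();
    // Sorted by (key, lsn): [(1, 10), (2, 15), (3, 5), (3, 20)]
    println!("{merged:?}");
}
```
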
part of https://github.com/neondatabase/neon/issues/8002 ## Summary of changes * refactor the bottom-most compaction code to use k-merge iterator * add Send bound on some structs as it is used across the await points --------- Signed-off-by: Alex Chi Z Co-authored-by: Arpad Müller --- pageserver/src/tenant.rs | 2 +- pageserver/src/tenant/disk_btree.rs | 4 +- pageserver/src/tenant/storage_layer.rs | 2 - .../src/tenant/storage_layer/delta_layer.rs | 21 +++--- .../src/tenant/storage_layer/image_layer.rs | 23 +++--- pageserver/src/tenant/storage_layer/layer.rs | 5 +- .../tenant/storage_layer/merge_iterator.rs | 4 ++ pageserver/src/tenant/timeline/compaction.rs | 70 ++++++++----------- pageserver/src/tenant/vectored_blob_io.rs | 2 - 9 files changed, 62 insertions(+), 71 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index dc6f42eaeb..637051413f 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -6810,7 +6810,7 @@ mod tests { vec![ // Image layer at GC horizon PersistentLayerKey { - key_range: Key::MIN..get_key(10), + key_range: Key::MIN..Key::MAX, lsn_range: Lsn(0x30)..Lsn(0x31), is_delta: false }, diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index 251d2ab4ad..1583a3826a 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -262,7 +262,7 @@ where pub fn iter<'a>(self, start_key: &'a [u8; L], ctx: &'a RequestContext) -> DiskBtreeIterator<'a> where - R: 'a, + R: 'a + Send, { DiskBtreeIterator { stream: Box::pin(self.into_stream(start_key, ctx)), @@ -521,7 +521,7 @@ where pub struct DiskBtreeIterator<'a> { #[allow(clippy::type_complexity)] stream: std::pin::Pin< - Box, u64), DiskBtreeError>> + 'a>, + Box, u64), DiskBtreeError>> + 'a + Send>, >, } diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 2f0c45317d..a389358f0d 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -6,8 +6,6 @@ pub(crate) mod inmemory_layer; pub(crate) mod layer; mod layer_desc; mod layer_name; - -#[cfg(test)] pub mod merge_iterator; use crate::context::{AccessStatsBehavior, RequestContext}; diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 43941b6e17..c34923320a 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -33,11 +33,14 @@ use crate::page_cache::{self, FileId, PAGE_SZ}; use crate::repository::{Key, Value, KEY_SIZE}; use crate::tenant::blob_io::BlobWriter; use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader}; -use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; +use crate::tenant::disk_btree::{ + DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection, +}; use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; use crate::tenant::timeline::GetVectoredError; use crate::tenant::vectored_blob_io::{ - BlobFlag, MaxVectoredReadBytes, VectoredBlobReader, VectoredRead, VectoredReadPlanner, + BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, + VectoredReadPlanner, }; use crate::tenant::{PageReconstructError, Timeline}; use crate::virtual_file::{self, VirtualFile}; @@ -53,6 +56,7 @@ use pageserver_api::models::{ImageCompressionAlgorithm, LayerAccessKind}; use pageserver_api::shard::TenantShardId; 
use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; +use std::collections::VecDeque; use std::fs::File; use std::io::SeekFrom; use std::ops::Range; @@ -747,12 +751,10 @@ impl DeltaLayer { } impl DeltaLayerInner { - #[cfg(test)] pub(crate) fn key_range(&self) -> &Range { &self.layer_key_range } - #[cfg(test)] pub(crate) fn lsn_range(&self) -> &Range { &self.layer_lsn_range } @@ -1512,7 +1514,6 @@ impl DeltaLayerInner { offset } - #[cfg(test)] pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> DeltaLayerIterator<'a> { let block_reader = FileBlockReader::new(&self.file, self.file_id); let tree_reader = @@ -1523,7 +1524,7 @@ impl DeltaLayerInner { index_iter: tree_reader.iter(&[0; DELTA_KEY_SIZE], ctx), key_values_batch: std::collections::VecDeque::new(), is_end: false, - planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner::new( + planner: StreamingVectoredReadPlanner::new( 1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer. 1024, // The default value. Unit tests might use a different value ), @@ -1595,17 +1596,15 @@ impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for Del } } -#[cfg(test)] pub struct DeltaLayerIterator<'a> { delta_layer: &'a DeltaLayerInner, ctx: &'a RequestContext, - planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner, - index_iter: crate::tenant::disk_btree::DiskBtreeIterator<'a>, - key_values_batch: std::collections::VecDeque<(Key, Lsn, Value)>, + planner: StreamingVectoredReadPlanner, + index_iter: DiskBtreeIterator<'a>, + key_values_batch: VecDeque<(Key, Lsn, Value)>, is_end: bool, } -#[cfg(test)] impl<'a> DeltaLayerIterator<'a> { /// Retrieve a batch of key-value pairs into the iterator buffer. 
async fn next_batch(&mut self) -> anyhow::Result<()> { diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index a88a1e6429..c7f41b66be 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -29,13 +29,16 @@ use crate::page_cache::{self, FileId, PAGE_SZ}; use crate::repository::{Key, Value, KEY_SIZE}; use crate::tenant::blob_io::BlobWriter; use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader}; -use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; +use crate::tenant::disk_btree::{ + DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection, +}; use crate::tenant::storage_layer::{ LayerAccessStats, ValueReconstructResult, ValueReconstructState, }; use crate::tenant::timeline::GetVectoredError; use crate::tenant::vectored_blob_io::{ - BlobFlag, MaxVectoredReadBytes, VectoredBlobReader, VectoredRead, VectoredReadPlanner, + BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, + VectoredReadPlanner, }; use crate::tenant::{PageReconstructError, Timeline}; use crate::virtual_file::{self, VirtualFile}; @@ -50,6 +53,7 @@ use pageserver_api::models::LayerAccessKind; use pageserver_api::shard::{ShardIdentity, TenantShardId}; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; +use std::collections::VecDeque; use std::fs::File; use std::io::SeekFrom; use std::ops::Range; @@ -369,12 +373,10 @@ impl ImageLayer { } impl ImageLayerInner { - #[cfg(test)] pub(crate) fn key_range(&self) -> &Range { &self.key_range } - #[cfg(test)] pub(crate) fn lsn(&self) -> Lsn { self.lsn } @@ -699,7 +701,6 @@ impl ImageLayerInner { } } - #[cfg(test)] pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> ImageLayerIterator<'a> { let block_reader = FileBlockReader::new(&self.file, self.file_id); let tree_reader = @@ -708,9 +709,9 @@ impl ImageLayerInner { image_layer: self, ctx, index_iter: tree_reader.iter(&[0; KEY_SIZE], ctx), - key_values_batch: std::collections::VecDeque::new(), + key_values_batch: VecDeque::new(), is_end: false, - planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner::new( + planner: StreamingVectoredReadPlanner::new( 1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer. 1024, // The default value. Unit tests might use a different value ), @@ -974,17 +975,15 @@ impl Drop for ImageLayerWriter { } } -#[cfg(test)] pub struct ImageLayerIterator<'a> { image_layer: &'a ImageLayerInner, ctx: &'a RequestContext, - planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner, - index_iter: crate::tenant::disk_btree::DiskBtreeIterator<'a>, - key_values_batch: std::collections::VecDeque<(Key, Lsn, Value)>, + planner: StreamingVectoredReadPlanner, + index_iter: DiskBtreeIterator<'a>, + key_values_batch: VecDeque<(Key, Lsn, Value)>, is_end: bool, } -#[cfg(test)] impl<'a> ImageLayerIterator<'a> { /// Retrieve a batch of key-value pairs into the iterator buffer. async fn next_batch(&mut self) -> anyhow::Result<()> { diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index dbf6c60aae..d9cbaba529 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -385,6 +385,7 @@ impl Layer { } /// Get all key/values in the layer. 
Should be replaced with an iterator-based API in the future. + #[allow(dead_code)] pub(crate) async fn load_key_values( &self, ctx: &RequestContext, @@ -1918,7 +1919,7 @@ impl ResidentLayer { self.owner.metadata() } - #[cfg(test)] + /// Cast the layer to a delta, return an error if it is an image layer. pub(crate) async fn get_as_delta( &self, ctx: &RequestContext, @@ -1930,7 +1931,7 @@ impl ResidentLayer { } } - #[cfg(test)] + /// Cast the layer to an image, return an error if it is a delta layer. pub(crate) async fn get_as_image( &self, ctx: &RequestContext, diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs index 0edfd4bd40..6f59b2fd77 100644 --- a/pageserver/src/tenant/storage_layer/merge_iterator.rs +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -547,5 +547,9 @@ mod tests { &ctx, ); assert_merge_iter_equal(&mut merge_iter, &expect).await; + + is_send(merge_iter); } + + fn is_send(_: impl Send) {} } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index f251b667c2..a648432b4d 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -27,6 +27,7 @@ use utils::id::TimelineId; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD}; +use crate::tenant::storage_layer::merge_iterator::MergeIterator; use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc}; use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter}; use crate::tenant::timeline::{Hole, ImageLayerCreationOutcome}; @@ -1039,10 +1040,12 @@ impl Timeline { ); // Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs. // Also, collect the layer information to decide when to split the new delta layers. - let mut all_key_values = Vec::new(); + let mut downloaded_layers = Vec::new(); let mut delta_split_points = BTreeSet::new(); for layer in &layer_selection { - all_key_values.extend(layer.load_key_values(ctx).await?); + let resident_layer = layer.download_and_keep_resident().await?; + downloaded_layers.push(resident_layer); + let desc = layer.layer_desc(); if desc.is_delta() { // TODO: is it correct to only record split points for deltas intersecting with the GC horizon? (exclude those below/above the horizon) @@ -1052,44 +1055,28 @@ impl Timeline { delta_split_points.insert(key_range.end); } } - // Key small to large, LSN low to high, if the same LSN has both image and delta due to the merge of delta layers and - // image layers, make image appear before than delta. 
- struct ValueWrapper<'a>(&'a crate::repository::Value); - impl Ord for ValueWrapper<'_> { - fn cmp(&self, other: &Self) -> std::cmp::Ordering { - use crate::repository::Value; - use std::cmp::Ordering; - match (self.0, other.0) { - (Value::Image(_), Value::WalRecord(_)) => Ordering::Less, - (Value::WalRecord(_), Value::Image(_)) => Ordering::Greater, - _ => Ordering::Equal, - } + let mut delta_layers = Vec::new(); + let mut image_layers = Vec::new(); + for resident_layer in &downloaded_layers { + if resident_layer.layer_desc().is_delta() { + let layer = resident_layer.get_as_delta(ctx).await?; + delta_layers.push(layer); + } else { + let layer = resident_layer.get_as_image(ctx).await?; + image_layers.push(layer); } } - impl PartialOrd for ValueWrapper<'_> { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } - } - impl PartialEq for ValueWrapper<'_> { - fn eq(&self, other: &Self) -> bool { - self.cmp(other) == std::cmp::Ordering::Equal - } - } - impl Eq for ValueWrapper<'_> {} - all_key_values.sort_by(|(k1, l1, v1), (k2, l2, v2)| { - (k1, l1, ValueWrapper(v1)).cmp(&(k2, l2, ValueWrapper(v2))) - }); + let mut merge_iter = MergeIterator::create(&delta_layers, &image_layers, ctx); // Step 2: Produce images+deltas. TODO: ensure newly-produced delta does not overlap with other deltas. // Data of the same key. let mut accumulated_values = Vec::new(); - let mut last_key = all_key_values.first().unwrap().0; // TODO: assert all_key_values not empty + let mut last_key: Option = None; /// Take a list of images and deltas, produce an image at the GC horizon, and a list of deltas above the GC horizon. async fn flush_accumulated_states( tline: &Arc, key: Key, - accumulated_values: &[&(Key, Lsn, crate::repository::Value)], + accumulated_values: &[(Key, Lsn, crate::repository::Value)], horizon: Lsn, ) -> anyhow::Result<(Vec<(Key, Lsn, crate::repository::Value)>, bytes::Bytes)> { let mut base_image = None; @@ -1190,7 +1177,7 @@ impl Timeline { self.conf, self.timeline_id, self.tenant_shard_id, - &(all_key_values.first().unwrap().0..all_key_values.last().unwrap().0.next()), + &(Key::MIN..Key::MAX), // covers the full key range gc_cutoff, ctx, ) @@ -1200,20 +1187,24 @@ impl Timeline { let delta_split_points = delta_split_points.into_iter().collect_vec(); let mut current_delta_split_point = 0; let mut delta_layers = Vec::new(); - for item @ (key, _, _) in &all_key_values { - if &last_key == key { - accumulated_values.push(item); + while let Some((key, lsn, val)) = merge_iter.next().await? { + if last_key.is_none() || last_key.as_ref() == Some(&key) { + if last_key.is_none() { + last_key = Some(key); + } + accumulated_values.push((key, lsn, val)); } else { + let last_key = last_key.as_mut().unwrap(); let (deltas, image) = - flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff) + flush_accumulated_states(self, *last_key, &accumulated_values, gc_cutoff) .await?; // Put the image into the image layer. Currently we have a single big layer for the compaction. 
- image_layer_writer.put_image(last_key, image, ctx).await?; + image_layer_writer.put_image(*last_key, image, ctx).await?; delta_values.extend(deltas); delta_layers.extend( flush_deltas( &mut delta_values, - last_key, + *last_key, &delta_split_points, &mut current_delta_split_point, self, @@ -1223,11 +1214,12 @@ impl Timeline { .await?, ); accumulated_values.clear(); - accumulated_values.push(item); - last_key = *key; + *last_key = key; + accumulated_values.push((key, lsn, val)); } } + let last_key = last_key.expect("no keys produced during compaction"); // TODO: move this part to the loop body let (deltas, image) = flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff).await?; diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 5a0986ea12..54a3ad789b 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -396,7 +396,6 @@ impl<'a> VectoredBlobReader<'a> { /// Read planner used in [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`]. It provides a streaming API for /// getting read blobs. It returns a batch when `handle` gets called and when the current key would just exceed the read_size and /// max_cnt constraints. -#[cfg(test)] pub struct StreamingVectoredReadPlanner { read_builder: Option, // Arguments for previous blob passed into [`StreamingVectoredReadPlanner::handle`] @@ -410,7 +409,6 @@ pub struct StreamingVectoredReadPlanner { cnt: usize, } -#[cfg(test)] impl StreamingVectoredReadPlanner { pub fn new(max_read_size: u64, max_cnt: usize) -> Self { assert!(max_cnt > 0); From 841b76ea7cd7a6f8cf47cbacdbbc1b4fefce331f Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Thu, 18 Jul 2024 18:18:18 +0200 Subject: [PATCH 1220/1571] Temporarily use vanilla pgbench and psql (client) for running pgvector benchmark (#8422) ## Problem https://github.com/neondatabase/neon/issues/8275 is not yet fixed Periodic benchmarking fails with SIGABRT in pgvector step, see https://github.com/neondatabase/neon/actions/runs/9967453263/job/27541159738#step:7:393 ## Summary of changes Instead of using pgbench and psql from Neon artifacts, download vanilla postgres binaries into the container and use those to run the client side of the test. 
--- .github/workflows/benchmarking.yml | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index d785156a29..833a4ce33c 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -457,17 +457,21 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned - options: --init + options: --init --user root steps: - uses: actions/checkout@v4 - - name: Download Neon artifact - uses: ./.github/actions/download - with: - name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact - path: /tmp/neon/ - prefix: latest + # until https://github.com/neondatabase/neon/issues/8275 is fixed we temporarily install postgresql-16 + # instead of using Neon artifacts containing pgbench + - name: Install postgresql-16 where pytest expects it + run: | + apt-get update && apt-get install -y postgresql-common + /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh -y + apt-get -y install postgresql-16 + mkdir -p /tmp/neon/pg_install/v16/bin + ln -s /usr/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench + ln -s /usr/bin/psql /tmp/neon/pg_install/v16/bin/psql - name: Set up Connection String id: set-up-connstr From 5a772761ee799d1d4537fcb2ab5a973e7be4d754 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 18 Jul 2024 17:26:27 +0100 Subject: [PATCH 1221/1571] Change log level for GuardDrop error (#8305) The error means that manager exited earlier than `ResidenceGuard` and it's not unexpected with current deletion implementation. This commit changes log level to reduse noise. --- safekeeper/src/timeline_guard.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/safekeeper/src/timeline_guard.rs b/safekeeper/src/timeline_guard.rs index e249c859b4..dbdf46412d 100644 --- a/safekeeper/src/timeline_guard.rs +++ b/safekeeper/src/timeline_guard.rs @@ -4,7 +4,7 @@ use std::collections::HashSet; -use tracing::{debug, warn}; +use tracing::debug; use crate::timeline_manager::ManagerCtlMessage; @@ -23,7 +23,7 @@ impl Drop for ResidenceGuard { .manager_tx .send(ManagerCtlMessage::GuardDrop(self.guard_id)); if let Err(e) = res { - warn!("failed to send GuardDrop message: {:?}", e); + debug!("failed to send GuardDrop message: {:?}", e); } } } From c96e8012ce2472964f7dff13110d57f7ba5db2b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 18 Jul 2024 20:09:57 +0200 Subject: [PATCH 1222/1571] Enable zstd in tests (#8368) Successor of #8288 , just enable zstd in tests. Also adds a test that creates easily compressable data. 
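For a rough sense of why a payload built by repeating a short pattern 500 times per row leaves plenty of headroom under the ratio bound the new test asserts, here is a small standalone sketch — assuming the `zstd` crate is available; this is not the pageserver's actual compression path, which goes through the image layer writer and its metrics:

```rust
// Back-of-the-envelope check: a short pattern repeated 500 times compresses
// to a small fraction of its original size with zstd at the default level.
fn main() -> std::io::Result<()> {
    // Roughly the payload shape the new test stores per row.
    let row: String = "abcde042".repeat(500);
    let compressed = zstd::encode_all(row.as_bytes(), 0)?; // 0 = default level
    let ratio = compressed.len() as f64 / row.len() as f64;
    // For input this repetitive the ratio lands far below the 0.2 upper bound
    // the test checks against the pageserver's compression metrics.
    println!("in={} out={} ratio={ratio:.4}", row.len(), compressed.len());
    Ok(())
}
```
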
Part of #5431 --------- Co-authored-by: John Spray Co-authored-by: Joonas Koivunen --- pageserver/src/metrics.rs | 16 ++++ .../src/tenant/storage_layer/image_layer.rs | 10 ++ test_runner/fixtures/neon_fixtures.py | 1 + test_runner/regress/test_compaction.py | 93 ++++++++++++++++++- .../regress/test_disk_usage_eviction.py | 3 + 5 files changed, 122 insertions(+), 1 deletion(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 753f5524c5..c03567f6ef 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -610,6 +610,22 @@ pub(crate) static CIRCUIT_BREAKERS_UNBROKEN: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_compression_image_in_bytes_total", + "Size of uncompressed data written into image layers" + ) + .expect("failed to define a metric") +}); + +pub(crate) static COMPRESSION_IMAGE_OUTPUT_BYTES: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_compression_image_out_bytes_total", + "Size of compressed image layer written" + ) + .expect("failed to define a metric") +}); + pub(crate) mod initial_logical_size { use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec}; use once_cell::sync::Lazy; diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index c7f41b66be..45b47bb62b 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -738,6 +738,9 @@ struct ImageLayerWriterInner { key_range: Range, lsn: Lsn, + // Total uncompressed bytes passed into put_image + uncompressed_bytes: u64, + blob_writer: BlobWriter, tree: DiskBtreeBuilder, } @@ -793,6 +796,7 @@ impl ImageLayerWriterInner { lsn, tree: tree_builder, blob_writer, + uncompressed_bytes: 0, }; Ok(writer) @@ -811,6 +815,7 @@ impl ImageLayerWriterInner { ) -> anyhow::Result<()> { ensure!(self.key_range.contains(&key)); let compression = self.conf.image_compression; + self.uncompressed_bytes += img.len() as u64; let (_img, res) = self .blob_writer .write_blob_maybe_compressed(img, ctx, compression) @@ -836,6 +841,11 @@ impl ImageLayerWriterInner { let index_start_blk = ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; + // Calculate compression ratio + let compressed_size = self.blob_writer.size() - PAGE_SZ as u64; // Subtract PAGE_SZ for header + crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES.inc_by(self.uncompressed_bytes); + crate::metrics::COMPRESSION_IMAGE_OUTPUT_BYTES.inc_by(compressed_size); + let mut file = self.blob_writer.into_inner(); // Write out the index diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 567ca532f9..db7269ad41 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1158,6 +1158,7 @@ class NeonEnv: "listen_http_addr": f"localhost:{pageserver_port.http}", "pg_auth_type": pg_auth_type, "http_auth_type": http_auth_type, + "image_compression": "zstd", } if self.pageserver_virtual_file_io_engine is not None: ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index f321c09b27..be787e0642 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -6,7 +6,10 @@ from typing import Optional import pytest from 
fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, generate_uploads_and_deletions +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + generate_uploads_and_deletions, +) from fixtures.pageserver.http import PageserverApiException from fixtures.utils import wait_until from fixtures.workload import Workload @@ -142,6 +145,10 @@ def test_sharding_compaction( "image_layer_creation_check_threshold": 0, } + # Disable compression, as we can't estimate the size of layers with compression enabled + # TODO: implement eager layer cutting during compaction + neon_env_builder.pageserver_config_override = "image_compression='disabled'" + neon_env_builder.num_pageservers = 1 if shard_count is None else shard_count env = neon_env_builder.init_start( initial_tenant_conf=TENANT_CONF, @@ -320,3 +327,87 @@ def test_pageserver_compaction_circuit_breaker(neon_env_builder: NeonEnvBuilder) or 0 ) == 0 assert not env.pageserver.log_contains(".*Circuit breaker failure ended.*") + + +@pytest.mark.parametrize("enabled", [True, False]) +def test_image_layer_compression(neon_env_builder: NeonEnvBuilder, enabled: bool): + tenant_conf = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": f"{128 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{128 * 1024}", + # no PITR horizon, we specify the horizon when we request on-demand GC + "pitr_interval": "0s", + # disable background compaction and GC. We invoke it manually when we want it to happen. + "gc_period": "0s", + "compaction_period": "0s", + # create image layers as eagerly as possible + "image_creation_threshold": "1", + "image_layer_creation_check_threshold": "0", + } + + # Explicitly enable/disable compression, rather than using default + if enabled: + neon_env_builder.pageserver_config_override = "image_compression='zstd'" + else: + neon_env_builder.pageserver_config_override = "image_compression='disabled'" + + env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + pageserver = env.pageserver + ps_http = env.pageserver.http_client() + with env.endpoints.create_start( + "main", tenant_id=tenant_id, pageserver_id=pageserver.id + ) as endpoint: + endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)") + # Generate around 800k worth of easily compressible data to store + for v in range(100): + endpoint.safe_psql( + f"INSERT INTO foo (id, val) VALUES ({v}, repeat('abcde{v:0>3}', 500))" + ) + # run compaction to create image layers + ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True) + + layer_map = ps_http.layer_map_info(tenant_id, timeline_id) + image_layer_count = 0 + delta_layer_count = 0 + for layer in layer_map.historic_layers: + if layer.kind == "Image": + image_layer_count += 1 + elif layer.kind == "Delta": + delta_layer_count += 1 + assert image_layer_count > 0 + assert delta_layer_count > 0 + + log.info(f"images: {image_layer_count}, deltas: {delta_layer_count}") + + bytes_in = pageserver.http_client().get_metric_value( + "pageserver_compression_image_in_bytes_total" + ) + bytes_out = pageserver.http_client().get_metric_value( + "pageserver_compression_image_out_bytes_total" + ) + assert bytes_in is not None + assert bytes_out is not None + log.info(f"Compression ratio: {bytes_out/bytes_in} ({bytes_out} in, {bytes_out} out)") + + if enabled: + # We are writing high compressible repetitive plain text, 
expect excellent compression + EXPECT_RATIO = 0.2 + assert bytes_out / bytes_in < EXPECT_RATIO + else: + # Nothing should be compressed if we disabled it. + assert bytes_out >= bytes_in + + # Destroy the endpoint and create a new one to resetthe caches + with env.endpoints.create_start( + "main", tenant_id=tenant_id, pageserver_id=pageserver.id + ) as endpoint: + for v in range(100): + res = endpoint.safe_psql( + f"SELECT count(*) FROM foo WHERE id={v} and val=repeat('abcde{v:0>3}', 500)" + ) + assert res[0][0] == 1 diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index fb8b7b22fa..3c834f430b 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -230,6 +230,9 @@ def _eviction_env( neon_env_builder.num_pageservers = num_pageservers neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) + # Disable compression support for EvictionEnv to get larger layer sizes + neon_env_builder.pageserver_config_override = "image_compression='disabled'" + # initial tenant will not be present on this pageserver env = neon_env_builder.init_configs() env.start() From 392d3524f955de375c119f1cdf99a9069843dc67 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Fri, 19 Jul 2024 15:40:55 +0200 Subject: [PATCH 1223/1571] Bodobolero/fix root permissions (#8429) ## Problem My prior PR https://github.com/neondatabase/neon/pull/8422 caused leftovers in the GitHub action runner work directory with root permission. As an example see here https://github.com/neondatabase/neon/actions/runs/10001857641/job/27646237324#step:3:37 To work-around we install vanilla postgres as non-root using deb packages in /home/nonroot user directory ## Summary of changes - since we cannot use root we install the deb pkgs directly and create symbolic links for psql, pgbench and libs in expected places - continue jobs an aws even if azure jobs fail (because this region is currently unreliable) --- .github/workflows/benchmarking.yml | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 833a4ce33c..c132b5b513 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -57,6 +57,7 @@ jobs: bench: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} strategy: + fail-fast: false matrix: include: - DEFAULT_PG_VERSION: 16 @@ -439,6 +440,7 @@ jobs: pgbench-pgvector: strategy: + fail-fast: false matrix: include: - PLATFORM: "neon-captest-pgvector" @@ -451,13 +453,14 @@ jobs: DEFAULT_PG_VERSION: 16 TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote + LD_LIBRARY_PATH: /home/nonroot/pg/usr/lib/x86_64-linux-gnu SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} PLATFORM: ${{ matrix.PLATFORM }} runs-on: [ self-hosted, us-east-2, x64 ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned - options: --init --user root + options: --init steps: - uses: actions/checkout@v4 @@ -466,12 +469,19 @@ jobs: # instead of using Neon artifacts containing pgbench - name: Install postgresql-16 where pytest expects it run: | - apt-get update && apt-get install -y postgresql-common - /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh -y - apt-get -y install postgresql-16 + cd /home/nonroot + wget -q 
https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/libpq5_16.3-1.pgdg110%2B1_amd64.deb + wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.3-1.pgdg110%2B1_amd64.deb + wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.3-1.pgdg110%2B1_amd64.deb + dpkg -x libpq5_16.3-1.pgdg110+1_amd64.deb pg + dpkg -x postgresql-client-16_16.3-1.pgdg110+1_amd64.deb pg + dpkg -x postgresql-16_16.3-1.pgdg110+1_amd64.deb pg mkdir -p /tmp/neon/pg_install/v16/bin - ln -s /usr/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench - ln -s /usr/bin/psql /tmp/neon/pg_install/v16/bin/psql + ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench + ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql + ln -s /home/nonroot/pg/usr/lib/x86_64-linux-gnu /tmp/neon/pg_install/v16/lib + /tmp/neon/pg_install/v16/bin/pgbench --version + /tmp/neon/pg_install/v16/bin/psql --version - name: Set up Connection String id: set-up-connstr @@ -532,7 +542,6 @@ jobs: env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} - clickbench-compare: # ClichBench DB for rds-aurora and rds-Postgres deployed to the same clusters # we use for performance testing in pgbench-compare. From 16071e57c642bddfcbe2aabc60acd9a788e2fadb Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 19 Jul 2024 18:01:02 +0200 Subject: [PATCH 1224/1571] pageserver: remove obsolete cached_metric_collection_interval (#8370) We're removing the usage of this long-meaningless config field in https://github.com/neondatabase/aws/pull/1599 Once that PR has been deployed to staging and prod, we can merge this PR. --- pageserver/src/bin/pageserver.rs | 1 - pageserver/src/config.rs | 24 ------------------- pageserver/src/consumption_metrics.rs | 7 ------ .../test_pageserver_metric_collection.py | 2 -- 4 files changed, 34 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 9f705f0bc9..fceddfb757 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -622,7 +622,6 @@ fn start_pageserver( metric_collection_endpoint, &conf.metric_collection_bucket, conf.metric_collection_interval, - conf.cached_metric_collection_interval, conf.synthetic_size_calculation_interval, conf.id, local_disk_storage, diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 5b103b551f..35b4e79365 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -68,7 +68,6 @@ pub mod defaults { super::ConfigurableSemaphore::DEFAULT_INITIAL.get(); pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min"; - pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "0s"; pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option = None; pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min"; pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s"; @@ -123,7 +122,6 @@ pub mod defaults { #concurrent_tenant_warmup = '{DEFAULT_CONCURRENT_TENANT_WARMUP}' #metric_collection_interval = '{DEFAULT_METRIC_COLLECTION_INTERVAL}' -#cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}' #synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}' #disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}} @@ -238,7 +236,6 @@ pub struct PageServerConf { // How often to collect metrics and send them to the metrics endpoint. 
pub metric_collection_interval: Duration, // How often to send unchanged cached metrics to the metrics endpoint. - pub cached_metric_collection_interval: Duration, pub metric_collection_endpoint: Option, pub metric_collection_bucket: Option, pub synthetic_size_calculation_interval: Duration, @@ -370,7 +367,6 @@ struct PageServerConfigBuilder { concurrent_tenant_size_logical_size_queries: BuilderValue, metric_collection_interval: BuilderValue, - cached_metric_collection_interval: BuilderValue, metric_collection_endpoint: BuilderValue>, synthetic_size_calculation_interval: BuilderValue, metric_collection_bucket: BuilderValue>, @@ -454,10 +450,6 @@ impl PageServerConfigBuilder { DEFAULT_METRIC_COLLECTION_INTERVAL, ) .expect("cannot parse default metric collection interval")), - cached_metric_collection_interval: Set(humantime::parse_duration( - DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL, - ) - .expect("cannot parse default cached_metric_collection_interval")), synthetic_size_calculation_interval: Set(humantime::parse_duration( DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL, ) @@ -589,14 +581,6 @@ impl PageServerConfigBuilder { self.metric_collection_interval = BuilderValue::Set(metric_collection_interval) } - pub fn cached_metric_collection_interval( - &mut self, - cached_metric_collection_interval: Duration, - ) { - self.cached_metric_collection_interval = - BuilderValue::Set(cached_metric_collection_interval) - } - pub fn metric_collection_endpoint(&mut self, metric_collection_endpoint: Option) { self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint) } @@ -730,7 +714,6 @@ impl PageServerConfigBuilder { broker_keepalive_interval, log_format, metric_collection_interval, - cached_metric_collection_interval, metric_collection_endpoint, metric_collection_bucket, synthetic_size_calculation_interval, @@ -947,7 +930,6 @@ impl PageServerConf { NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")? 
}), "metric_collection_interval" => builder.metric_collection_interval(parse_toml_duration(key, item)?), - "cached_metric_collection_interval" => builder.cached_metric_collection_interval(parse_toml_duration(key, item)?), "metric_collection_endpoint" => { let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?; builder.metric_collection_endpoint(Some(endpoint)); @@ -1080,7 +1062,6 @@ impl PageServerConf { eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default( ), metric_collection_interval: Duration::from_secs(60), - cached_metric_collection_interval: Duration::from_secs(60 * 60), metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, metric_collection_bucket: None, synthetic_size_calculation_interval: Duration::from_secs(60), @@ -1259,7 +1240,6 @@ initial_superuser_name = 'zzzz' id = 10 metric_collection_interval = '222 s' -cached_metric_collection_interval = '22200 s' metric_collection_endpoint = 'http://localhost:80/metrics' synthetic_size_calculation_interval = '333 s' @@ -1315,9 +1295,6 @@ background_task_maximum_delay = '334 s' metric_collection_interval: humantime::parse_duration( defaults::DEFAULT_METRIC_COLLECTION_INTERVAL )?, - cached_metric_collection_interval: humantime::parse_duration( - defaults::DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL - )?, metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, metric_collection_bucket: None, synthetic_size_calculation_interval: humantime::parse_duration( @@ -1396,7 +1373,6 @@ background_task_maximum_delay = '334 s' eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default(), metric_collection_interval: Duration::from_secs(222), - cached_metric_collection_interval: Duration::from_secs(22200), metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?), metric_collection_bucket: None, synthetic_size_calculation_interval: Duration::from_secs(333), diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index 18c1a6cd9b..6861adad2c 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -46,19 +46,12 @@ pub async fn collect_metrics( metric_collection_endpoint: &Url, metric_collection_bucket: &Option, metric_collection_interval: Duration, - _cached_metric_collection_interval: Duration, synthetic_size_calculation_interval: Duration, node_id: NodeId, local_disk_storage: Utf8PathBuf, cancel: CancellationToken, ctx: RequestContext, ) -> anyhow::Result<()> { - if _cached_metric_collection_interval != Duration::ZERO { - tracing::warn!( - "cached_metric_collection_interval is no longer used, please set it to zero." 
- ) - } - // spin up background worker that caclulates tenant sizes let worker_ctx = ctx.detached_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download); diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py index cea35a6acb..24a37b04ec 100644 --- a/test_runner/regress/test_pageserver_metric_collection.py +++ b/test_runner/regress/test_pageserver_metric_collection.py @@ -58,7 +58,6 @@ def test_metric_collection( metric_collection_interval="1s" metric_collection_endpoint="{metric_collection_endpoint}" metric_collection_bucket={remote_storage_to_toml_inline_table(neon_env_builder.pageserver_remote_storage)} - cached_metric_collection_interval="0s" synthetic_size_calculation_interval="3s" """ @@ -216,7 +215,6 @@ def test_metric_collection_cleans_up_tempfile( neon_env_builder.pageserver_config_override = f""" metric_collection_interval="1s" metric_collection_endpoint="{metric_collection_endpoint}" - cached_metric_collection_interval="0s" synthetic_size_calculation_interval="3s" """ From 44781518d044de46f6fd1d58d9aece7bf399bc40 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 19 Jul 2024 17:07:59 +0100 Subject: [PATCH 1225/1571] storage scrubber: GC ancestor shard layers (#8196) ## Problem After a shard split, the pageserver leaves the ancestor shard's content in place. It may be referenced by child shards, but eventually child shards will de-reference most ancestor layers as they write their own data and do GC. We would like to eventually clean up those ancestor layers to reclaim space. ## Summary of changes - Extend the physical GC command with `--mode=full`, which includes cleaning up unreferenced ancestor shard layers - Add test `test_scrubber_physical_gc_ancestors` - Remove colored log output: in testing this is irritating ANSI code spam in logs, and in interactive use doesn't add much. - Refactor storage controller API client code out of storcon_client into a `storage_controller/client` crate - During physical GC of ancestors, call into the storage controller to check that the latest shards seen in S3 reflect the latest state of the tenant, and there is no shard split in progress. 
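
As an illustration of the cross-shard reference check above, the idea reduces roughly to the sketch below (hypothetical names, Python for brevity; this is not the shipped code). A layer whose metadata points at a shard other than the one whose index we are reading is a reference into an ancestor shard, and only ancestor layers with zero such references are deletion candidates. The real implementation is the Rust `TenantRefAccumulator` / `gc_ancestor` code in this patch, which additionally verifies with the storage controller that no split is in progress and applies a minimum object age before deleting anything.

```python
from collections import Counter

def gc_candidate_ancestor_layers(ancestor_layers, child_shard_indices):
    """Return ancestor-shard layers that no child shard references any more.

    ancestor_layers: layer names physically stored under the ancestor shard's prefix.
    child_shard_indices: one dict per child shard, mapping layer name -> the shard
        that owns the layer file (as recorded in that child's index_part.json).
    """
    refs = Counter()
    for index in child_shard_indices:
        for layer_name, owning_shard in index.items():
            if owning_shard != "self":
                # Metadata points at another shard: a cross-shard ref into an ancestor.
                refs[layer_name] += 1
    # Zero references => candidate for deletion (still subject to the age threshold
    # and the controller's split-in-progress check described above).
    return [layer for layer in ancestor_layers if refs[layer] == 0]

# Example: "delta_A" is still referenced by a child shard, "delta_B" is not.
children = [
    {"delta_A": "ancestor", "image_0": "self"},
    {"image_1": "self"},
]
print(gc_candidate_ancestor_layers(["delta_A", "delta_B"], children))  # ['delta_B']
```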
--- Cargo.lock | 41 +- Cargo.toml | 4 +- control_plane/storcon_cli/Cargo.toml | 1 + control_plane/storcon_cli/src/main.rs | 62 +-- libs/pageserver_api/src/controller_api.rs | 4 +- libs/utils/src/auth.rs | 4 + pageserver/src/auth.rs | 16 +- safekeeper/src/auth.rs | 16 +- storage_controller/client/Cargo.toml | 23 + storage_controller/client/src/control_api.rs | 62 +++ storage_controller/client/src/lib.rs | 1 + storage_controller/src/http.rs | 2 +- storage_controller/src/main.rs | 18 +- storage_controller/src/service.rs | 2 + storage_scrubber/Cargo.toml | 1 + storage_scrubber/src/lib.rs | 33 +- storage_scrubber/src/main.rs | 46 +- .../src/pageserver_physical_gc.rs | 481 +++++++++++++++--- test_runner/fixtures/neon_fixtures.py | 24 +- .../regress/test_pageserver_generations.py | 3 +- .../regress/test_pageserver_secondary.py | 6 +- test_runner/regress/test_sharding.py | 3 +- test_runner/regress/test_storage_scrubber.py | 237 ++++++++- test_runner/regress/test_tenant_delete.py | 6 +- 24 files changed, 905 insertions(+), 191 deletions(-) create mode 100644 storage_controller/client/Cargo.toml create mode 100644 storage_controller/client/src/control_api.rs create mode 100644 storage_controller/client/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index d08da0babd..2505d4d3ed 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3234,16 +3234,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "nu-ansi-term" -version = "0.46.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" -dependencies = [ - "overload", - "winapi", -] - [[package]] name = "num" version = "0.4.1" @@ -3539,12 +3529,6 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a" -[[package]] -name = "overload" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" - [[package]] name = "p256" version = "0.11.1" @@ -5822,6 +5806,28 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "storage_controller_client" +version = "0.1.0" +dependencies = [ + "anyhow", + "async-trait", + "bytes", + "futures", + "pageserver_api", + "pageserver_client", + "postgres", + "reqwest 0.12.4", + "serde", + "thiserror", + "tokio", + "tokio-postgres", + "tokio-stream", + "tokio-util", + "utils", + "workspace_hack", +] + [[package]] name = "storage_scrubber" version = "0.1.0" @@ -5856,6 +5862,7 @@ dependencies = [ "serde", "serde_json", "serde_with", + "storage_controller_client", "thiserror", "tokio", "tokio-postgres", @@ -5885,6 +5892,7 @@ dependencies = [ "reqwest 0.12.4", "serde", "serde_json", + "storage_controller_client", "thiserror", "tokio", "tracing", @@ -6611,7 +6619,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77" dependencies = [ "matchers", - "nu-ansi-term", "once_cell", "regex", "serde", diff --git a/Cargo.toml b/Cargo.toml index b9b4bafb4f..615f5472ec 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,6 +13,7 @@ members = [ "safekeeper", "storage_broker", "storage_controller", + "storage_controller/client", "storage_scrubber", "workspace_hack", "libs/compute_api", @@ -182,7 +183,7 @@ tower-service = "0.3.2" tracing = "0.1" tracing-error = "0.2.0" tracing-opentelemetry = "0.21.0" -tracing-subscriber = { version = "0.3", default-features = 
false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] } +tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] } twox-hash = { version = "1.6.3", default-features = false } typed-json = "0.1" url = "2.2" @@ -221,6 +222,7 @@ remote_storage = { version = "0.1", path = "./libs/remote_storage/" } safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" } desim = { version = "0.1", path = "./libs/desim" } storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy. +storage_controller_client = { path = "./storage_controller/client" } tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" } tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" } utils = { version = "0.1", path = "./libs/utils/" } diff --git a/control_plane/storcon_cli/Cargo.toml b/control_plane/storcon_cli/Cargo.toml index f96f0084b2..be69208d0d 100644 --- a/control_plane/storcon_cli/Cargo.toml +++ b/control_plane/storcon_cli/Cargo.toml @@ -17,6 +17,7 @@ pageserver_client.workspace = true reqwest.workspace = true serde.workspace = true serde_json = { workspace = true, features = ["raw_value"] } +storage_controller_client.workspace = true thiserror.workspace = true tokio.workspace = true tracing.workspace = true diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 777a717a73..5c1add070a 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -14,15 +14,15 @@ use pageserver_api::{ }, shard::{ShardStripeSize, TenantShardId}, }; -use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt}; +use pageserver_client::mgmt_api::{self}; use reqwest::{Method, StatusCode, Url}; -use serde::{de::DeserializeOwned, Serialize}; use utils::id::{NodeId, TenantId}; use pageserver_api::controller_api::{ NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy, TenantShardMigrateRequest, TenantShardMigrateResponse, }; +use storage_controller_client::control_api::Client; #[derive(Subcommand, Debug)] enum Command { @@ -249,64 +249,6 @@ impl FromStr for NodeAvailabilityArg { } } -struct Client { - base_url: Url, - jwt_token: Option, - client: reqwest::Client, -} - -impl Client { - fn new(base_url: Url, jwt_token: Option) -> Self { - Self { - base_url, - jwt_token, - client: reqwest::ClientBuilder::new() - .build() - .expect("Failed to construct http client"), - } - } - - /// Simple HTTP request wrapper for calling into storage controller - async fn dispatch( - &self, - method: Method, - path: String, - body: Option, - ) -> mgmt_api::Result - where - RQ: Serialize + Sized, - RS: DeserializeOwned + Sized, - { - // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out - // for general purpose API access. 
- let url = Url::from_str(&format!( - "http://{}:{}/{path}", - self.base_url.host_str().unwrap(), - self.base_url.port().unwrap() - )) - .unwrap(); - - let mut builder = self.client.request(method, url); - if let Some(body) = body { - builder = builder.json(&body) - } - if let Some(jwt_token) = &self.jwt_token { - builder = builder.header( - reqwest::header::AUTHORIZATION, - format!("Bearer {jwt_token}"), - ); - } - - let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?; - let response = response.error_from_body().await?; - - response - .json() - .await - .map_err(pageserver_client::mgmt_api::Error::ReceiveBody) - } -} - #[tokio::main] async fn main() -> anyhow::Result<()> { let cli = Cli::parse(); diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index f05c1315ea..d0e1eb6b28 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -87,7 +87,7 @@ pub struct TenantLocateResponse { pub shard_params: ShardParameters, } -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Debug)] pub struct TenantDescribeResponse { pub tenant_id: TenantId, pub shards: Vec, @@ -110,7 +110,7 @@ pub struct NodeDescribeResponse { pub listen_pg_port: u16, } -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Debug)] pub struct TenantDescribeResponseShard { pub tenant_shard_id: TenantShardId, diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index 03e65f74fe..a1170a460d 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -33,6 +33,10 @@ pub enum Scope { GenerationsApi, // Allows access to control plane managment API and some storage controller endpoints. Admin, + + /// Allows access to storage controller APIs used by the scrubber, to interrogate the state + /// of a tenant & post scrub results. + Scrubber, } /// JWT payload. 
See docs/authentication.md for the format diff --git a/pageserver/src/auth.rs b/pageserver/src/auth.rs index 4785c8c4c5..9e3dedb75a 100644 --- a/pageserver/src/auth.rs +++ b/pageserver/src/auth.rs @@ -14,12 +14,14 @@ pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result< } (Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope (Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope - (Scope::Admin | Scope::SafekeeperData | Scope::GenerationsApi, _) => Err(AuthError( - format!( - "JWT scope '{:?}' is ineligible for Pageserver auth", - claims.scope - ) - .into(), - )), + (Scope::Admin | Scope::SafekeeperData | Scope::GenerationsApi | Scope::Scrubber, _) => { + Err(AuthError( + format!( + "JWT scope '{:?}' is ineligible for Pageserver auth", + claims.scope + ) + .into(), + )) + } } } diff --git a/safekeeper/src/auth.rs b/safekeeper/src/auth.rs index dd9058c468..b8bc3f3e06 100644 --- a/safekeeper/src/auth.rs +++ b/safekeeper/src/auth.rs @@ -12,13 +12,15 @@ pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result< } Ok(()) } - (Scope::Admin | Scope::PageServerApi | Scope::GenerationsApi, _) => Err(AuthError( - format!( - "JWT scope '{:?}' is ineligible for Safekeeper auth", - claims.scope - ) - .into(), - )), + (Scope::Admin | Scope::PageServerApi | Scope::GenerationsApi | Scope::Scrubber, _) => { + Err(AuthError( + format!( + "JWT scope '{:?}' is ineligible for Safekeeper auth", + claims.scope + ) + .into(), + )) + } (Scope::SafekeeperData, _) => Ok(()), } } diff --git a/storage_controller/client/Cargo.toml b/storage_controller/client/Cargo.toml new file mode 100644 index 0000000000..c3bfe2bfd2 --- /dev/null +++ b/storage_controller/client/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "storage_controller_client" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +pageserver_api.workspace = true +pageserver_client.workspace = true +thiserror.workspace = true +async-trait.workspace = true +reqwest.workspace = true +utils.workspace = true +serde.workspace = true +workspace_hack = { version = "0.1", path = "../../workspace_hack" } +tokio-postgres.workspace = true +tokio-stream.workspace = true +tokio.workspace = true +futures.workspace = true +tokio-util.workspace = true +anyhow.workspace = true +postgres.workspace = true +bytes.workspace = true diff --git a/storage_controller/client/src/control_api.rs b/storage_controller/client/src/control_api.rs new file mode 100644 index 0000000000..a981b5020e --- /dev/null +++ b/storage_controller/client/src/control_api.rs @@ -0,0 +1,62 @@ +use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt}; +use reqwest::{Method, Url}; +use serde::{de::DeserializeOwned, Serialize}; +use std::str::FromStr; + +pub struct Client { + base_url: Url, + jwt_token: Option, + client: reqwest::Client, +} + +impl Client { + pub fn new(base_url: Url, jwt_token: Option) -> Self { + Self { + base_url, + jwt_token, + client: reqwest::ClientBuilder::new() + .build() + .expect("Failed to construct http client"), + } + } + + /// Simple HTTP request wrapper for calling into storage controller + pub async fn dispatch( + &self, + method: Method, + path: String, + body: Option, + ) -> mgmt_api::Result + where + RQ: Serialize + Sized, + RS: DeserializeOwned + Sized, + { + // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out + // for general purpose API access. 
+ let url = Url::from_str(&format!( + "http://{}:{}/{path}", + self.base_url.host_str().unwrap(), + self.base_url.port().unwrap() + )) + .unwrap(); + + let mut builder = self.client.request(method, url); + if let Some(body) = body { + builder = builder.json(&body) + } + if let Some(jwt_token) = &self.jwt_token { + builder = builder.header( + reqwest::header::AUTHORIZATION, + format!("Bearer {jwt_token}"), + ); + } + + let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?; + let response = response.error_from_body().await?; + + response + .json() + .await + .map_err(pageserver_client::mgmt_api::Error::ReceiveBody) + } +} diff --git a/storage_controller/client/src/lib.rs b/storage_controller/client/src/lib.rs new file mode 100644 index 0000000000..6d5e202942 --- /dev/null +++ b/storage_controller/client/src/lib.rs @@ -0,0 +1 @@ +pub mod control_api; diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 9ddf98eb3b..8fb4be93e0 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -430,7 +430,7 @@ async fn handle_tenant_describe( service: Arc, req: Request, ) -> Result, ApiError> { - check_permissions(&req, Scope::Admin)?; + check_permissions(&req, Scope::Scrubber)?; let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; json_response(StatusCode::OK, service.tenant_describe(tenant_id)?) diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 4bf6b528f4..789f96beb3 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -5,6 +5,7 @@ use metrics::launch_timestamp::LaunchTimestamp; use metrics::BuildInfo; use std::path::PathBuf; use std::sync::Arc; +use std::time::Duration; use storage_controller::http::make_router; use storage_controller::metrics::preinitialize_metrics; use storage_controller::persistence::Persistence; @@ -310,12 +311,21 @@ async fn async_main() -> anyhow::Result<()> { tracing::info!("Terminating on signal"); // Stop HTTP server first, so that we don't have to service requests - // while shutting down Service + // while shutting down Service. server_shutdown.cancel(); - if let Err(e) = server_task.await { - tracing::error!("Error joining HTTP server task: {e}") + match tokio::time::timeout(Duration::from_secs(5), server_task).await { + Ok(Ok(_)) => { + tracing::info!("Joined HTTP server task"); + } + Ok(Err(e)) => { + tracing::error!("Error joining HTTP server task: {e}") + } + Err(_) => { + tracing::warn!("Timed out joining HTTP server task"); + // We will fall through and shut down the service anyway, any request handlers + // in flight will experience cancellation & their clients will see a torn connection. 
+ } } - tracing::info!("Joined HTTP server task"); service.shutdown().await; tracing::info!("Service shutdown complete"); diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 3c24433c42..a163453dca 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -3956,6 +3956,8 @@ impl Service { "failpoint".to_string() ))); + failpoint_support::sleep_millis_async!("shard-split-post-remote-sleep", &self.cancel); + tracing::info!( "Split {} into {}", parent_id, diff --git a/storage_scrubber/Cargo.toml b/storage_scrubber/Cargo.toml index 050be66483..5233afbebe 100644 --- a/storage_scrubber/Cargo.toml +++ b/storage_scrubber/Cargo.toml @@ -34,6 +34,7 @@ camino.workspace = true rustls.workspace = true rustls-native-certs.workspace = true once_cell.workspace = true +storage_controller_client.workspace = true tokio = { workspace = true, features = ["macros", "rt-multi-thread"] } chrono = { workspace = true, default-features = false, features = ["clock", "serde"] } diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index 9102ad9906..a0b6d7ea30 100644 --- a/storage_scrubber/src/lib.rs +++ b/storage_scrubber/src/lib.rs @@ -24,6 +24,7 @@ use camino::{Utf8Path, Utf8PathBuf}; use clap::ValueEnum; use pageserver::tenant::TENANTS_SEGMENT_NAME; use pageserver_api::shard::TenantShardId; +use remote_storage::RemotePath; use reqwest::Url; use serde::{Deserialize, Serialize}; use tokio::io::AsyncReadExt; @@ -31,7 +32,7 @@ use tracing::error; use tracing_appender::non_blocking::WorkerGuard; use tracing_subscriber::{fmt, prelude::*, EnvFilter}; use utils::fs_ext; -use utils::id::{TenantId, TimelineId}; +use utils::id::{TenantId, TenantTimelineId, TimelineId}; const MAX_RETRIES: usize = 20; const CLOUD_ADMIN_API_TOKEN_ENV_VAR: &str = "CLOUD_ADMIN_API_TOKEN"; @@ -54,7 +55,7 @@ pub struct S3Target { /// in the pageserver, as all timeline objects existing in the scope of a particular /// tenant: the scrubber is different in that it handles collections of data referring to many /// TenantShardTimelineIds in on place. -#[derive(Serialize, Deserialize, Debug, Clone, Copy, Hash, PartialEq, Eq)] +#[derive(Serialize, Deserialize, Debug, Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)] pub struct TenantShardTimelineId { tenant_shard_id: TenantShardId, timeline_id: TimelineId, @@ -67,6 +68,10 @@ impl TenantShardTimelineId { timeline_id, } } + + fn as_tenant_timeline_id(&self) -> TenantTimelineId { + TenantTimelineId::new(self.tenant_shard_id.tenant_id, self.timeline_id) + } } impl Display for TenantShardTimelineId { @@ -179,6 +184,22 @@ impl RootTarget { .with_sub_segment(&id.timeline_id.to_string()) } + /// Given RemotePath "tenants/foo/timelines/bar/layerxyz", prefix it to a literal + /// key in the S3 bucket. + pub fn absolute_key(&self, key: &RemotePath) -> String { + let root = match self { + Self::Pageserver(root) => root, + Self::Safekeeper(root) => root, + }; + + let prefix = &root.prefix_in_bucket; + if prefix.ends_with('/') { + format!("{prefix}{key}") + } else { + format!("{prefix}/{key}") + } + } + pub fn bucket_name(&self) -> &str { match self { Self::Pageserver(root) => &root.bucket_name, @@ -216,6 +237,14 @@ impl BucketConfig { } } +pub struct ControllerClientConfig { + /// URL to storage controller. e.g. http://127.0.0.1:1234 when using `neon_local` + pub controller_api: Url, + + /// JWT token for authenticating with storage controller. Requires scope 'scrubber' or 'admin'. 
+ pub controller_jwt: String, +} + pub struct ConsoleConfig { pub token: String, pub base_url: Url, diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs index d816121192..b3ed6f6451 100644 --- a/storage_scrubber/src/main.rs +++ b/storage_scrubber/src/main.rs @@ -1,11 +1,12 @@ -use anyhow::bail; +use anyhow::{anyhow, bail}; use camino::Utf8PathBuf; use pageserver_api::shard::TenantShardId; -use storage_scrubber::find_large_objects; +use reqwest::Url; use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; use storage_scrubber::pageserver_physical_gc::GcMode; use storage_scrubber::scan_pageserver_metadata::scan_metadata; use storage_scrubber::tenant_snapshot::SnapshotDownloader; +use storage_scrubber::{find_large_objects, ControllerClientConfig}; use storage_scrubber::{ init_logging, pageserver_physical_gc::pageserver_physical_gc, scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig, NodeKind, @@ -24,6 +25,14 @@ struct Cli { #[arg(short, long, default_value_t = false)] delete: bool, + + #[arg(long)] + /// URL to storage controller. e.g. http://127.0.0.1:1234 when using `neon_local` + controller_api: Option, + + #[arg(long)] + /// JWT token for authenticating with storage controller. Requires scope 'scrubber' or 'admin'. + controller_jwt: Option, } #[derive(Subcommand, Debug)] @@ -204,8 +213,37 @@ async fn main() -> anyhow::Result<()> { min_age, mode, } => { - let summary = - pageserver_physical_gc(bucket_config, tenant_ids, min_age.into(), mode).await?; + let controller_client_conf = cli.controller_api.map(|controller_api| { + ControllerClientConfig { + controller_api, + // Default to no key: this is a convenience when working in a development environment + controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()), + } + }); + + match (&controller_client_conf, mode) { + (Some(_), _) => { + // Any mode may run when controller API is set + } + (None, GcMode::Full) => { + // The part of physical GC where we erase ancestor layers cannot be done safely without + // confirming the most recent complete shard split with the controller. Refuse to run, rather + // than doing it unsafely. + return Err(anyhow!("Full physical GC requires `--controller-api` and `--controller-jwt` to run")); + } + (None, GcMode::DryRun | GcMode::IndicesOnly) => { + // These GcModes do not require the controller to run. 
+ } + } + + let summary = pageserver_physical_gc( + bucket_config, + controller_client_conf, + tenant_ids, + min_age.into(), + mode, + ) + .await?; println!("{}", serde_json::to_string(&summary).unwrap()); Ok(()) } diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index fb8fbc1635..e977fd49f7 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -1,22 +1,50 @@ -use std::time::{Duration, UNIX_EPOCH}; +use std::collections::{BTreeMap, HashMap}; +use std::sync::Arc; +use std::time::{Duration, SystemTime}; use crate::checks::{list_timeline_blobs, BlobDataParseResult}; use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; -use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId}; +use crate::{ + init_remote, BucketConfig, ControllerClientConfig, NodeKind, RootTarget, TenantShardTimelineId, +}; use aws_sdk_s3::Client; use futures_util::{StreamExt, TryStreamExt}; -use pageserver::tenant::remote_timeline_client::parse_remote_index_path; +use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; +use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path}; +use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::IndexPart; -use pageserver_api::shard::TenantShardId; +use pageserver_api::controller_api::TenantDescribeResponse; +use pageserver_api::shard::{ShardIndex, TenantShardId}; use remote_storage::RemotePath; +use reqwest::Method; use serde::Serialize; +use storage_controller_client::control_api; use tracing::{info_span, Instrument}; use utils::generation::Generation; +use utils::id::{TenantId, TenantTimelineId}; #[derive(Serialize, Default)] pub struct GcSummary { indices_deleted: usize, remote_storage_errors: usize, + controller_api_errors: usize, + ancestor_layers_deleted: usize, +} + +impl GcSummary { + fn merge(&mut self, other: Self) { + let Self { + indices_deleted, + remote_storage_errors, + ancestor_layers_deleted, + controller_api_errors, + } = other; + + self.indices_deleted += indices_deleted; + self.remote_storage_errors += remote_storage_errors; + self.ancestor_layers_deleted += ancestor_layers_deleted; + self.controller_api_errors += controller_api_errors; + } } #[derive(clap::ValueEnum, Debug, Clone, Copy)] @@ -26,9 +54,9 @@ pub enum GcMode { // Enable only removing old-generation indices IndicesOnly, + // Enable all forms of GC - // TODO: this will be used when shard split ancestor layer deletion is added - // All, + Full, } impl std::fmt::Display for GcMode { @@ -36,10 +64,232 @@ impl std::fmt::Display for GcMode { match self { GcMode::DryRun => write!(f, "dry-run"), GcMode::IndicesOnly => write!(f, "indices-only"), + GcMode::Full => write!(f, "full"), } } } +mod refs { + use super::*; + // Map of cross-shard layer references, giving a refcount for each layer in each shard that is referenced by some other + // shard in the same tenant. This is sparse! The vast majority of timelines will have no cross-shard refs, and those that + // do have cross shard refs should eventually drop most of them via compaction. + // + // In our inner map type, the TTID in the key is shard-agnostic, and the ShardIndex in the value refers to the _ancestor + // which is is referenced_. 
+ #[derive(Default)] + pub(super) struct AncestorRefs( + BTreeMap>, + ); + + impl AncestorRefs { + /// Insert references for layers discovered in a particular shard-timeline that refer to an ancestral shard-timeline. + pub(super) fn update( + &mut self, + ttid: TenantShardTimelineId, + layers: Vec<(LayerName, LayerFileMetadata)>, + ) { + let ttid_refs = self.0.entry(ttid.as_tenant_timeline_id()).or_default(); + for (layer_name, layer_metadata) in layers { + // Increment refcount of this layer in the ancestor shard + *(ttid_refs + .entry((layer_metadata.shard, layer_name)) + .or_default()) += 1; + } + } + + /// For a particular TTID, return the map of all ancestor layers referenced by a descendent to their refcount + /// + /// The `ShardIndex` in the result's key is the index of the _ancestor_, not the descendent. + pub(super) fn get_ttid_refcounts( + &self, + ttid: &TenantTimelineId, + ) -> Option<&HashMap<(ShardIndex, LayerName), usize>> { + self.0.get(ttid) + } + } +} + +use refs::AncestorRefs; + +// As we see shards for a tenant, acccumulate knowledge needed for cross-shard GC: +// - Are there any ancestor shards? +// - Are there any refs to ancestor shards' layers? +#[derive(Default)] +struct TenantRefAccumulator { + shards_seen: HashMap>, + + // For each shard that has refs to an ancestor's layers, the set of ancestor layers referred to + ancestor_ref_shards: AncestorRefs, +} + +impl TenantRefAccumulator { + fn update(&mut self, ttid: TenantShardTimelineId, index_part: &IndexPart) { + let this_shard_idx = ttid.tenant_shard_id.to_index(); + (*self + .shards_seen + .entry(ttid.tenant_shard_id.tenant_id) + .or_default()) + .push(this_shard_idx); + + let mut ancestor_refs = Vec::new(); + for (layer_name, layer_metadata) in &index_part.layer_metadata { + if layer_metadata.shard != this_shard_idx { + // This is a reference from this shard to a layer in an ancestor shard: we must track this + // as a marker to not GC this layer from the parent. + ancestor_refs.push((layer_name.clone(), layer_metadata.clone())); + } + } + + if !ancestor_refs.is_empty() { + tracing::info!(%ttid, "Found {} ancestor refs", ancestor_refs.len()); + self.ancestor_ref_shards.update(ttid, ancestor_refs); + } + } + + /// Consume Self and return a vector of ancestor tenant shards that should be GC'd, and map of referenced ancestor layers to preserve + async fn into_gc_ancestors( + self, + controller_client: &control_api::Client, + summary: &mut GcSummary, + ) -> (Vec, AncestorRefs) { + let mut ancestors_to_gc = Vec::new(); + for (tenant_id, mut shard_indices) in self.shards_seen { + // Find the highest shard count + let latest_count = shard_indices + .iter() + .map(|i| i.shard_count) + .max() + .expect("Always at least one shard"); + + let (mut latest_shards, ancestor_shards) = { + let at = + itertools::partition(&mut shard_indices, |i| i.shard_count == latest_count); + (shard_indices[0..at].to_owned(), &shard_indices[at..]) + }; + // Sort shards, as we will later compare them with a sorted list from the controller + latest_shards.sort(); + + // Check that we have a complete view of the latest shard count: this should always be the case unless we happened + // to scan the S3 bucket halfway through a shard split. + if latest_shards.len() != latest_count.count() as usize { + // This should be extremely rare, so we warn on it. 
+ tracing::warn!(%tenant_id, "Missed some shards at count {:?}", latest_count); + continue; + } + + // Check if we have any non-latest-count shards + if ancestor_shards.is_empty() { + tracing::debug!(%tenant_id, "No ancestor shards to clean up"); + continue; + } + + // Based on S3 view, this tenant looks like it might have some ancestor shard work to do. We + // must only do this work if the tenant is not currently being split: otherwise, it is not safe + // to GC ancestors, because if the split fails then the controller will try to attach ancestor + // shards again. + match controller_client + .dispatch::<(), TenantDescribeResponse>( + Method::GET, + format!("control/v1/tenant/{tenant_id}"), + None, + ) + .await + { + Err(e) => { + // We were not able to learn the latest shard split state from the controller, so we will not + // do ancestor GC on this tenant. + tracing::warn!(%tenant_id, "Failed to query storage controller, will not do ancestor GC: {e}"); + summary.controller_api_errors += 1; + continue; + } + Ok(desc) => { + // We expect to see that the latest shard count matches the one we saw in S3, and that none + // of the shards indicate splitting in progress. + + let controller_indices: Vec = desc + .shards + .iter() + .map(|s| s.tenant_shard_id.to_index()) + .collect(); + if controller_indices != latest_shards { + tracing::info!(%tenant_id, "Latest shards seen in S3 ({latest_shards:?}) don't match controller state ({controller_indices:?})"); + continue; + } + + if desc.shards.iter().any(|s| s.is_splitting) { + tracing::info!(%tenant_id, "One or more shards is currently splitting"); + continue; + } + + // This shouldn't be too noisy, because we only log this for tenants that have some ancestral refs. + tracing::info!(%tenant_id, "Validated state with controller: {desc:?}"); + } + } + + // GC ancestor shards + for ancestor_shard in ancestor_shards.iter().map(|idx| TenantShardId { + tenant_id, + shard_count: idx.shard_count, + shard_number: idx.shard_number, + }) { + ancestors_to_gc.push(ancestor_shard); + } + } + + (ancestors_to_gc, self.ancestor_ref_shards) + } +} + +async fn is_old_enough( + s3_client: &Client, + bucket_config: &BucketConfig, + min_age: &Duration, + key: &str, + summary: &mut GcSummary, +) -> bool { + // Validation: we will only GC indices & layers after a time threshold (e.g. one week) so that during an incident + // it is easier to read old data for analysis, and easier to roll back shard splits without having to un-delete any objects. 
+ let age: Duration = match s3_client + .head_object() + .bucket(&bucket_config.bucket) + .key(key) + .send() + .await + { + Ok(response) => match response.last_modified { + None => { + tracing::warn!("Missing last_modified"); + summary.remote_storage_errors += 1; + return false; + } + Some(last_modified) => match SystemTime::try_from(last_modified).map(|t| t.elapsed()) { + Ok(Ok(e)) => e, + Err(_) | Ok(Err(_)) => { + tracing::warn!("Bad last_modified time: {last_modified:?}"); + return false; + } + }, + }, + Err(e) => { + tracing::warn!("Failed to HEAD {key}: {e}"); + summary.remote_storage_errors += 1; + return false; + } + }; + let old_enough = &age > min_age; + + if !old_enough { + tracing::info!( + "Skipping young object {} < {}", + humantime::format_duration(age), + humantime::format_duration(*min_age) + ); + } + + old_enough +} + async fn maybe_delete_index( s3_client: &Client, bucket_config: &BucketConfig, @@ -79,45 +329,7 @@ async fn maybe_delete_index( return; } - // Validation: we will only delete indices after one week, so that during incidents we will have - // easy access to recent indices. - let age: Duration = match s3_client - .head_object() - .bucket(&bucket_config.bucket) - .key(key) - .send() - .await - { - Ok(response) => match response.last_modified { - None => { - tracing::warn!("Missing last_modified"); - summary.remote_storage_errors += 1; - return; - } - Some(last_modified) => { - let last_modified = - UNIX_EPOCH + Duration::from_secs_f64(last_modified.as_secs_f64()); - match last_modified.elapsed() { - Ok(e) => e, - Err(_) => { - tracing::warn!("Bad last_modified time: {last_modified:?}"); - return; - } - } - } - }, - Err(e) => { - tracing::warn!("Failed to HEAD {key}: {e}"); - summary.remote_storage_errors += 1; - return; - } - }; - if &age < min_age { - tracing::info!( - "Skipping young object {} < {}", - age.as_secs_f64(), - min_age.as_secs_f64() - ); + if !is_old_enough(s3_client, bucket_config, min_age, key, summary).await { return; } @@ -145,6 +357,108 @@ async fn maybe_delete_index( } } +#[allow(clippy::too_many_arguments)] +async fn gc_ancestor( + s3_client: &Client, + bucket_config: &BucketConfig, + root_target: &RootTarget, + min_age: &Duration, + ancestor: TenantShardId, + refs: &AncestorRefs, + mode: GcMode, + summary: &mut GcSummary, +) -> anyhow::Result<()> { + // Scan timelines in the ancestor + let timelines = stream_tenant_timelines(s3_client, root_target, ancestor).await?; + let mut timelines = std::pin::pin!(timelines); + + // Build a list of keys to retain + + while let Some(ttid) = timelines.next().await { + let ttid = ttid?; + + let data = list_timeline_blobs(s3_client, ttid, root_target).await?; + + let s3_layers = match data.blob_data { + BlobDataParseResult::Parsed { + index_part: _, + index_part_generation: _, + s3_layers, + } => s3_layers, + BlobDataParseResult::Relic => { + // Post-deletion tenant location: don't try and GC it. 
+ continue; + } + BlobDataParseResult::Incorrect(reasons) => { + // Our primary purpose isn't to report on bad data, but log this rather than skipping silently + tracing::warn!( + "Skipping ancestor GC for timeline {ttid}, bad metadata: {reasons:?}" + ); + continue; + } + }; + + let ttid_refs = refs.get_ttid_refcounts(&ttid.as_tenant_timeline_id()); + let ancestor_shard_index = ttid.tenant_shard_id.to_index(); + + for (layer_name, layer_gen) in s3_layers { + let ref_count = ttid_refs + .and_then(|m| m.get(&(ancestor_shard_index, layer_name.clone()))) + .copied() + .unwrap_or(0); + + if ref_count > 0 { + tracing::debug!(%ttid, "Ancestor layer {layer_name} has {ref_count} refs"); + continue; + } + + tracing::info!(%ttid, "Ancestor layer {layer_name} is not referenced"); + + // Build the key for the layer we are considering deleting + let key = root_target.absolute_key(&remote_layer_path( + &ttid.tenant_shard_id.tenant_id, + &ttid.timeline_id, + ancestor_shard_index, + &layer_name, + layer_gen, + )); + + // We apply a time threshold to GCing objects that are un-referenced: this preserves our ability + // to roll back a shard split if we have to, by avoiding deleting ancestor layers right away + if !is_old_enough(s3_client, bucket_config, min_age, &key, summary).await { + continue; + } + + if !matches!(mode, GcMode::Full) { + tracing::info!("Dry run: would delete key {key}"); + continue; + } + + // All validations passed: erase the object + match s3_client + .delete_object() + .bucket(&bucket_config.bucket) + .key(&key) + .send() + .await + { + Ok(_) => { + tracing::info!("Successfully deleted unreferenced ancestor layer {key}"); + summary.ancestor_layers_deleted += 1; + } + Err(e) => { + tracing::warn!("Failed to delete layer {key}: {e}"); + summary.remote_storage_errors += 1; + } + } + } + + // TODO: if all the layers are gone, clean up the whole timeline dir (remove index) + } + + Ok(()) +} + /// Physical garbage collection: removing unused S3 objects. This is distinct from the garbage collection /// done inside the pageserver, which operates at a higher level (keys, layers). This type of garbage collection /// is about removing: @@ -156,22 +470,26 @@ async fn maybe_delete_index( /// make sure that object listings don't get slowed down by large numbers of garbage objects. pub async fn pageserver_physical_gc( bucket_config: BucketConfig, - tenant_ids: Vec, + controller_client_conf: Option, + tenant_shard_ids: Vec, min_age: Duration, mode: GcMode, ) -> anyhow::Result { let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?; - let tenants = if tenant_ids.is_empty() { + let tenants = if tenant_shard_ids.is_empty() { futures::future::Either::Left(stream_tenants(&s3_client, &target)) } else { - futures::future::Either::Right(futures::stream::iter(tenant_ids.into_iter().map(Ok))) + futures::future::Either::Right(futures::stream::iter(tenant_shard_ids.into_iter().map(Ok))) }; // How many tenants to process in parallel. We need to be mindful of pageservers // accessing the same per tenant prefixes, so use a lower setting than pageservers. 
const CONCURRENCY: usize = 32; + // Accumulate information about each tenant for cross-shard GC step we'll do at the end + let accumulator = Arc::new(std::sync::Mutex::new(TenantRefAccumulator::default())); + // Generate a stream of TenantTimelineId let timelines = tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, t)); let timelines = timelines.try_buffered(CONCURRENCY); @@ -185,16 +503,17 @@ pub async fn pageserver_physical_gc( target: &RootTarget, mode: GcMode, ttid: TenantShardTimelineId, + accumulator: &Arc>, ) -> anyhow::Result { let mut summary = GcSummary::default(); let data = list_timeline_blobs(s3_client, ttid, target).await?; - let (latest_gen, candidates) = match &data.blob_data { + let (index_part, latest_gen, candidates) = match &data.blob_data { BlobDataParseResult::Parsed { - index_part: _index_part, + index_part, index_part_generation, s3_layers: _s3_layers, - } => (*index_part_generation, data.unused_index_keys), + } => (index_part, *index_part_generation, data.unused_index_keys), BlobDataParseResult::Relic => { // Post-deletion tenant location: don't try and GC it. return Ok(summary); @@ -206,6 +525,8 @@ pub async fn pageserver_physical_gc( } }; + accumulator.lock().unwrap().update(ttid, index_part); + for key in candidates { maybe_delete_index( s3_client, @@ -222,17 +543,61 @@ pub async fn pageserver_physical_gc( Ok(summary) } - let timelines = timelines - .map_ok(|ttid| gc_timeline(&s3_client, &bucket_config, &min_age, &target, mode, ttid)); - let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY)); let mut summary = GcSummary::default(); - while let Some(i) = timelines.next().await { - let tl_summary = i?; + // Drain futures for per-shard GC, populating accumulator as a side effect + { + let timelines = timelines.map_ok(|ttid| { + gc_timeline( + &s3_client, + &bucket_config, + &min_age, + &target, + mode, + ttid, + &accumulator, + ) + }); + let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY)); - summary.indices_deleted += tl_summary.indices_deleted; - summary.remote_storage_errors += tl_summary.remote_storage_errors; + while let Some(i) = timelines.next().await { + summary.merge(i?); + } + } + + // Execute cross-shard GC, using the accumulator's full view of all the shards built in the per-shard GC + let Some(controller_client) = controller_client_conf.as_ref().map(|c| { + let ControllerClientConfig { + controller_api, + controller_jwt, + } = c; + control_api::Client::new(controller_api.clone(), Some(controller_jwt.clone())) + }) else { + tracing::info!("Skipping ancestor layer GC, because no `--controller-api` was specified"); + return Ok(summary); + }; + + let (ancestor_shards, ancestor_refs) = Arc::into_inner(accumulator) + .unwrap() + .into_inner() + .unwrap() + .into_gc_ancestors(&controller_client, &mut summary) + .await; + + for ancestor_shard in ancestor_shards { + gc_ancestor( + &s3_client, + &bucket_config, + &target, + &min_age, + ancestor_shard, + &ancestor_refs, + mode, + &mut summary, + ) + .instrument(info_span!("gc_ancestor", %ancestor_shard)) + .await?; } Ok(summary) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index db7269ad41..9e39457c06 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -997,7 +997,7 @@ class NeonEnvBuilder: if self.scrub_on_exit: try: - StorageScrubber(self).scan_metadata() + self.env.storage_scrubber.scan_metadata() except Exception as e: log.error(f"Error during remote storage scrub: {e}") 
cleanup_error = e @@ -1225,6 +1225,9 @@ class NeonEnv: ) cfg["safekeepers"].append(sk_cfg) + # Scrubber instance for tests that use it, and for use during teardown checks + self.storage_scrubber = StorageScrubber(self, log_dir=config.test_output_dir) + log.info(f"Config: {cfg}") self.neon_cli.init( cfg, @@ -4265,9 +4268,9 @@ class Safekeeper(LogUtils): class StorageScrubber: - def __init__(self, env: NeonEnvBuilder, log_dir: Optional[Path] = None): + def __init__(self, env: NeonEnv, log_dir: Path): self.env = env - self.log_dir = log_dir or env.test_output_dir + self.log_dir = log_dir def scrubber_cli(self, args: list[str], timeout) -> str: assert isinstance(self.env.pageserver_remote_storage, S3Storage) @@ -4284,11 +4287,14 @@ class StorageScrubber: if s3_storage.endpoint is not None: env.update({"AWS_ENDPOINT_URL": s3_storage.endpoint}) - base_args = [str(self.env.neon_binpath / "storage_scrubber")] + base_args = [ + str(self.env.neon_binpath / "storage_scrubber"), + f"--controller-api={self.env.storage_controller_api}", + ] args = base_args + args (output_path, stdout, status_code) = subprocess_capture( - self.env.test_output_dir, + self.log_dir, args, echo_stderr=True, echo_stdout=True, @@ -4327,7 +4333,10 @@ class StorageScrubber: log.info(f"tenant-snapshot output: {stdout}") def pageserver_physical_gc( - self, min_age_secs: int, tenant_ids: Optional[list[TenantId]] = None + self, + min_age_secs: int, + tenant_ids: Optional[list[TenantId]] = None, + mode: Optional[str] = None, ): args = ["pageserver-physical-gc", "--min-age", f"{min_age_secs}s"] @@ -4337,6 +4346,9 @@ class StorageScrubber: for tenant_id in tenant_ids: args.extend(["--tenant-id", str(tenant_id)]) + if mode is not None: + args.extend(["--mode", mode]) + stdout = self.scrubber_cli( args, timeout=30, diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 7ce38c5c3c..041942cda3 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -22,7 +22,6 @@ from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, PgBin, - StorageScrubber, generate_uploads_and_deletions, ) from fixtures.pageserver.common_types import parse_layer_file_name @@ -215,7 +214,7 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): # Having written a mixture of generation-aware and legacy index_part.json, # ensure the scrubber handles the situation as expected. 
- metadata_summary = StorageScrubber(neon_env_builder).scan_metadata() + metadata_summary = env.storage_scrubber.scan_metadata() assert metadata_summary["tenant_count"] == 1 # Scrubber should have seen our timeline assert metadata_summary["timeline_count"] == 1 assert metadata_summary["timeline_shard_count"] == 1 diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 0416078ebc..58d61eab0d 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -7,7 +7,7 @@ from typing import Any, Dict, Optional import pytest from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, StorageScrubber +from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.utils import ( assert_prefix_empty, @@ -234,7 +234,7 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, make_httpserver, # Having done a bunch of attach/detach cycles, we will have generated some index garbage: check # that the scrubber sees it and cleans it up. We do this before the final attach+validate pass, # to also validate that the scrubber isn't breaking anything. - gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=1) + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=1) assert gc_summary["remote_storage_errors"] == 0 assert gc_summary["indices_deleted"] > 0 @@ -555,7 +555,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): # Scrub the remote storage # ======================== # This confirms that the scrubber isn't upset by the presence of the heatmap - StorageScrubber(neon_env_builder).scan_metadata() + env.storage_scrubber.scan_metadata() # Detach secondary and delete tenant # =================================== diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 4471237900..90c6e26d01 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -12,7 +12,6 @@ from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, StorageControllerApiException, - StorageScrubber, last_flush_lsn_upload, tenant_get_shards, wait_for_last_flush_lsn, @@ -128,7 +127,7 @@ def test_sharding_smoke( # Check the scrubber isn't confused by sharded content, then disable # it during teardown because we'll have deleted by then - StorageScrubber(neon_env_builder).scan_metadata() + env.storage_scrubber.scan_metadata() neon_env_builder.scrub_on_exit = False env.storage_controller.pageserver_api().tenant_delete(tenant_id) diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 35ae61c380..635690fc7f 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -1,14 +1,19 @@ import os import shutil +import threading +import time +from concurrent.futures import ThreadPoolExecutor from typing import Optional import pytest from fixtures.common_types import TenantId, TenantShardId, TimelineId +from fixtures.log_helper import log from fixtures.neon_fixtures import ( + NeonEnv, NeonEnvBuilder, - StorageScrubber, ) from fixtures.remote_storage import S3Storage, s3_storage +from fixtures.utils import wait_until from fixtures.workload import Workload @@ -60,8 +65,7 @@ def 
test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: output_path = neon_env_builder.test_output_dir / "snapshot" os.makedirs(output_path) - scrubber = StorageScrubber(neon_env_builder) - scrubber.tenant_snapshot(tenant_id, output_path) + env.storage_scrubber.tenant_snapshot(tenant_id, output_path) assert len(os.listdir(output_path)) > 0 @@ -111,6 +115,14 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: workload.validate() +def drop_local_state(env: NeonEnv, tenant_id: TenantId): + env.storage_controller.tenant_policy_update(tenant_id, {"placement": "Detached"}) + env.storage_controller.reconcile_until_idle() + + env.storage_controller.tenant_policy_update(tenant_id, {"placement": {"Attached": 0}}) + env.storage_controller.reconcile_until_idle() + + @pytest.mark.parametrize("shard_count", [None, 4]) def test_scrubber_physical_gc(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]): neon_env_builder.enable_pageserver_remote_storage(s3_storage()) @@ -133,28 +145,231 @@ def test_scrubber_physical_gc(neon_env_builder: NeonEnvBuilder, shard_count: Opt # For each cycle, detach and attach the tenant to bump the generation, and do some writes to generate uploads for _i in range(0, n_cycles): - env.storage_controller.tenant_policy_update(tenant_id, {"placement": "Detached"}) - env.storage_controller.reconcile_until_idle() - - env.storage_controller.tenant_policy_update(tenant_id, {"placement": {"Attached": 0}}) - env.storage_controller.reconcile_until_idle() + drop_local_state(env, tenant_id) # This write includes remote upload, will generate an index in this generation workload.write_rows(1) # With a high min_age, the scrubber should decline to delete anything - gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=3600) + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=3600) assert gc_summary["remote_storage_errors"] == 0 assert gc_summary["indices_deleted"] == 0 # If targeting a different tenant, the scrubber shouldn't do anything - gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc( + gc_summary = env.storage_scrubber.pageserver_physical_gc( min_age_secs=1, tenant_ids=[TenantId.generate()] ) assert gc_summary["remote_storage_errors"] == 0 assert gc_summary["indices_deleted"] == 0 # With a low min_age, the scrubber should go ahead and clean up all but the latest 2 generations - gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=1) + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=1) assert gc_summary["remote_storage_errors"] == 0 assert gc_summary["indices_deleted"] == (expect_indices_per_shard - 2) * shard_count + + +@pytest.mark.parametrize("shard_count", [None, 2]) +def test_scrubber_physical_gc_ancestors( + neon_env_builder: NeonEnvBuilder, shard_count: Optional[int] +): + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.num_pageservers = 2 + + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant( + tenant_id, + timeline_id, + shard_count=shard_count, + conf={ + # Small layers and low compaction thresholds, so that when we split we can expect some to + # be dropped by child shards + "checkpoint_distance": f"{1024 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{1024 * 1024}", + "image_creation_threshold": "2", + 
"image_layer_creation_check_threshold": "0", + # Disable background compaction, we will do it explicitly + "compaction_period": "0s", + # No PITR, so that as soon as child shards generate an image layer, it covers ancestor deltas + # and makes them GC'able + "pitr_interval": "0s", + }, + ) + + # Make sure the original shard has some layers + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(100) + + new_shard_count = 4 + assert shard_count is None or new_shard_count > shard_count + shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count) + + # Make sure child shards have some layers + workload.write_rows(100) + + # Flush deletion queue so that we don't leave any orphan layers in the parent that will confuse subsequent checks: once + # a shard is split, any layers in its prefix that aren't referenced by a child will be considered GC'able, even + # if they were logically deleted before the shard split, just not physically deleted yet because of the queue. + for ps in env.pageservers: + ps.http_client().deletion_queue_flush(execute=True) + + # Before compacting, all the layers in the ancestor should still be referenced by the children: the scrubber + # should not erase any ancestor layers + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=1, mode="full") + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == 0 + assert gc_summary["ancestor_layers_deleted"] == 0 + + # Write some data and compact: compacting, some ancestor layers should no longer be needed by children + # (the compaction is part of the checkpoint that Workload does for us) + workload.churn_rows(100) + workload.churn_rows(100) + workload.churn_rows(100) + for shard in shards: + ps = env.get_tenant_pageserver(shard) + ps.http_client().timeline_compact(shard, timeline_id) + ps.http_client().timeline_gc(shard, timeline_id, 0) + + # We will use a min_age_secs=1 threshold for deletion, let it pass + time.sleep(2) + + # Our time threshold should be respected: check that with a high threshold we delete nothing + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=3600, mode="full") + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == 0 + assert gc_summary["ancestor_layers_deleted"] == 0 + + # Now run with a low time threshold: deletions of ancestor layers should be executed + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=1, mode="full") + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == 0 + assert gc_summary["ancestor_layers_deleted"] > 0 + + # We deleted some layers: now check we didn't corrupt the tenant by doing so. Detach and + # attach it, to drop any local state, then check it's still readable. + workload.stop() + drop_local_state(env, tenant_id) + + workload.validate() + + +def test_scrubber_physical_gc_ancestors_split(neon_env_builder: NeonEnvBuilder): + """ + Exercise ancestor GC while a tenant is partly split: this test ensures that if we have some child shards + which don't reference an ancestor, but some child shards that don't exist yet, then we do not incorrectly + GC any ancestor layers. 
+ """ + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.num_pageservers = 2 + + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + initial_shard_count = 2 + env.neon_cli.create_tenant( + tenant_id, + timeline_id, + shard_count=initial_shard_count, + conf={ + # Small layers and low compaction thresholds, so that when we split we can expect some to + # be dropped by child shards + "checkpoint_distance": f"{1024 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{1024 * 1024}", + "image_creation_threshold": "2", + "image_layer_creation_check_threshold": "0", + # Disable background compaction, we will do it explicitly + "compaction_period": "0s", + # No PITR, so that as soon as child shards generate an image layer, it covers ancestor deltas + # and makes them GC'able + "pitr_interval": "0s", + }, + ) + + unstuck = threading.Event() + + def stuck_split(): + # Pause our shard split after the first shard but before the second, such that when we run + # the scrub, the S3 bucket contains shards 0002, 0101, 0004, 0204 (but not 0104, 0304). + env.storage_controller.configure_failpoints( + ("shard-split-post-remote-sleep", "return(3600000)") + ) + try: + split_response = env.storage_controller.tenant_shard_split(tenant_id, shard_count=4) + except Exception as e: + log.info(f"Split failed with {e}") + else: + if not unstuck.is_set(): + raise RuntimeError(f"Split succeeded unexpectedly ({split_response})") + + with ThreadPoolExecutor(max_workers=1) as threads: + log.info("Starting hung shard split") + stuck_split_fut = threads.submit(stuck_split) + + # Let the controller reach the failpoint + wait_until( + 10, + 1, + lambda: env.storage_controller.assert_log_contains( + 'failpoint "shard-split-post-remote-sleep": sleeping' + ), + ) + + # Run compaction on the new child shards, so that they drop some refs to their parent + child_shards = [ + TenantShardId(tenant_id, 0, 4), + TenantShardId(tenant_id, 2, 4), + ] + log.info("Compacting first two children") + for child in child_shards: + env.get_tenant_pageserver( + TenantShardId(tenant_id, 0, initial_shard_count) + ).http_client().timeline_compact(child, timeline_id) + + # Check that the other child shards weren't created + assert env.get_tenant_pageserver(TenantShardId(tenant_id, 1, 4)) is None + assert env.get_tenant_pageserver(TenantShardId(tenant_id, 3, 4)) is None + + # Run scrubber: it should not incorrectly interpret the **04 shards' lack of refs to all + # ancestor layers as a reason to GC them, because it should realize that a split is in progress. + # (GC requires that controller does not indicate split in progress, and that if we see the highest + # shard count N, then there are N shards present with that shard count). 
+ gc_output = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full") + log.info(f"Ran physical GC partway through split: {gc_output}") + assert gc_output["ancestor_layers_deleted"] == 0 + assert gc_output["remote_storage_errors"] == 0 + assert gc_output["controller_api_errors"] == 0 + + # Storage controller shutdown lets our split request client complete + log.info("Stopping storage controller") + unstuck.set() + env.storage_controller.allowed_errors.append(".*Timed out joining HTTP server task.*") + env.storage_controller.stop() + stuck_split_fut.result() + + # Restart the controller and retry the split with the failpoint disabled, this should + # complete successfully and result in an S3 state that allows the scrubber to proceed with removing ancestor layers + log.info("Starting & retrying split") + env.storage_controller.start() + env.storage_controller.tenant_shard_split(tenant_id, shard_count=4) + + # The other child shards exist now, we can compact them to drop refs to ancestor + log.info("Compacting second two children") + for child in [ + TenantShardId(tenant_id, 1, 4), + TenantShardId(tenant_id, 3, 4), + ]: + env.get_tenant_pageserver(child).http_client().timeline_compact(child, timeline_id) + + gc_output = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full") + log.info(f"Ran physical GC after split completed: {gc_output}") + assert gc_output["ancestor_layers_deleted"] > 0 + assert gc_output["remote_storage_errors"] == 0 + assert gc_output["controller_api_errors"] == 0 diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 1d7c8b8e31..6d20b3d0de 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -5,7 +5,6 @@ from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.neon_fixtures import ( NeonEnvBuilder, PgBin, - StorageScrubber, wait_for_last_flush_lsn, ) from fixtures.pageserver.http import PageserverApiException @@ -325,7 +324,6 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder) remote_storage_kind = RemoteStorageKind.MOCK_S3 neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) - scrubber = StorageScrubber(neon_env_builder) env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) ps_http = env.pageserver.http_client() @@ -340,7 +338,7 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder) wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn) env.stop() - result = scrubber.scan_metadata() + result = env.storage_scrubber.scan_metadata() assert result["with_warnings"] == [] env.start() @@ -348,5 +346,5 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder) ps_http.tenant_delete(tenant_id) env.stop() - scrubber.scan_metadata() + env.storage_scrubber.scan_metadata() assert result["with_warnings"] == [] From 39aeb10cfc453172cd189c7a43877194ab0dc4a8 Mon Sep 17 00:00:00 2001 From: Shinya Kato <37682778+shinyaaa@users.noreply.github.com> Date: Sat, 20 Jul 2024 02:10:19 +0900 Subject: [PATCH 1226/1571] safekeeper: remove unused safekeeper runtimes (#8433) There are unused safekeeper runtimes `WAL_REMOVER_RUNTIME` and `METRICS_SHIFTER_RUNTIME`. `WAL_REMOVER_RUNTIME` was implemented in [#4119](https://github.com/neondatabase/neon/pull/4119) and removed in [#7887](https://github.com/neondatabase/neon/pull/7887). 
`METRICS_SHIFTER_RUNTIME` was also implemented in [#4119](https://github.com/neondatabase/neon/pull/4119) but has never been used. I removed unused safekeeper runtimes `WAL_REMOVER_RUNTIME` and `METRICS_SHIFTER_RUNTIME`. --- safekeeper/src/lib.rs | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index af83feb77f..8f2920ada3 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -173,15 +173,6 @@ pub static BROKER_RUNTIME: Lazy = Lazy::new(|| { .expect("Failed to create broker runtime") }); -pub static WAL_REMOVER_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("WAL remover") - .worker_threads(1) - .enable_all() - .build() - .expect("Failed to create broker runtime") -}); - pub static WAL_BACKUP_RUNTIME: Lazy = Lazy::new(|| { tokio::runtime::Builder::new_multi_thread() .thread_name("WAL backup worker") @@ -189,12 +180,3 @@ pub static WAL_BACKUP_RUNTIME: Lazy = Lazy::new(|| { .build() .expect("Failed to create WAL backup runtime") }); - -pub static METRICS_SHIFTER_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("metric shifter") - .worker_threads(1) - .enable_all() - .build() - .expect("Failed to create broker runtime") -}); From a4fa250c9226ede43c7183345c19815ae6f6b61c Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 19 Jul 2024 18:30:28 +0100 Subject: [PATCH 1227/1571] tests: longer timeouts in test_timeline_deletion_with_files_stuck_in_upload_queue (#8438) ## Problem This test had two locations with 2 second timeouts, which is rather low when we run on a highly contended test machine running lots of tests in parallel. It usually passes, but today I've seen both of these locations time out on separate PRs. Example failure: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8432/10007868041/index.html#suites/837740b64a53e769572c4ed7b7a7eeeb/6c6a092be083d27c ## Summary of changes - Change 2 second timeouts to 20 second timeouts --- test_runner/regress/test_remote_storage.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index fac7fe9dee..09f941f582 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -577,7 +577,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( > 0 ) - wait_until(20, 0.1, assert_compacted_and_uploads_queued) + wait_until(200, 0.1, assert_compacted_and_uploads_queued) # Regardless, give checkpoint some time to block for good. # Not strictly necessary, but might help uncover failure modes in the future. @@ -619,7 +619,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( ) # timeline deletion should be unblocking checkpoint ops - checkpoint_thread.join(2.0) + checkpoint_thread.join(20.0) assert not checkpoint_thread.is_alive() # Just to be sure, unblock ongoing uploads. 
If the previous assert was incorrect, or the prometheus metric broken, From 3fbb84d74115dadf0ba68ccf98da777c70d97400 Mon Sep 17 00:00:00 2001 From: Shinya Kato <37682778+shinyaaa@users.noreply.github.com> Date: Sat, 20 Jul 2024 03:20:57 +0900 Subject: [PATCH 1228/1571] Fix openapi specification (#8273) ## Problem There are some swagger errors in `pageserver/src/http/openapi_spec.yml` ``` Error 431 15000 Object includes not allowed fields Error 569 3100401 should always have a 'required' Error 569 15000 Object includes not allowed fields Error 1111 10037 properties members must be schemas ``` ## Summary of changes Fixed the above errors. --- pageserver/src/http/openapi_spec.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index ae109ec1e7..4d243ddeb9 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -377,7 +377,7 @@ paths: schema: $ref: "#/components/schemas/ConflictError" - /v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive: + /v1/tenant/{tenant_id}/timeline/{timeline_id}/preserve_initdb_archive: parameters: - name: tenant_id in: path @@ -429,7 +429,9 @@ paths: schema: $ref: "#/components/schemas/SyntheticSizeResponse" text/html: - description: SVG representation of the tenant and it's timelines. + schema: + type: string + description: SVG representation of the tenant and its timelines. "401": description: Unauthorized Error content: @@ -568,7 +570,7 @@ paths: type: string - name: timeline_id in: path - ŕequired: true + required: true schema: type: string @@ -774,15 +776,13 @@ components: TenantCreateRequest: allOf: - $ref: '#/components/schemas/TenantConfig' + - $ref: '#/components/schemas/TenantLoadRequest' - type: object required: - new_tenant_id properties: new_tenant_id: type: string - generation: - type: integer - description: Attachment generation number. TenantLoadRequest: type: object properties: @@ -1106,7 +1106,7 @@ components: reparented_timelines: type: array description: Set of reparented timeline ids - properties: + items: type: string format: hex description: TimelineId From 3d582b212a8003d599f3fa2ce5d13670a3cb70e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 19 Jul 2024 21:01:59 +0200 Subject: [PATCH 1229/1571] Add archival_config endpoint to pageserver (#8414) This adds an archival_config endpoint to the pageserver. Currently it has no effect, and always "works", but later the intent is that it will make a timeline archived/unarchived. 
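As a rough illustration, the new endpoint could be exercised against a running pageserver like this (host/port and IDs are placeholders; the yml spec below declares the operation as `put`, while the router change registers the handler via `post`, which is what this sketch uses):

```python
import requests

# Placeholders: point this at the pageserver management API and substitute real IDs.
PAGESERVER_HTTP = "http://127.0.0.1:9898"
TENANT_SHARD_ID = "<tenant_shard_id>"
TIMELINE_ID = "<timeline_id>"

# Request body mirrors TimelineArchivalConfigRequest / TimelineArchivalState below.
resp = requests.post(
    f"{PAGESERVER_HTTP}/v1/tenant/{TENANT_SHARD_ID}/timeline/{TIMELINE_ID}/archival_config",
    json={"state": "Archived"},  # or "Unarchived"
)
resp.raise_for_status()  # expect 200; currently a no-op on the pageserver side
```
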
- [x] add yml spec - [x] add endpoint handler Part of https://github.com/neondatabase/neon/issues/8088 --- libs/pageserver_api/src/models.rs | 11 ++++++ pageserver/src/http/openapi_spec.yml | 54 ++++++++++++++++++++++++++++ pageserver/src/http/routes.rs | 44 +++++++++++++++++++++-- pageserver/src/tenant.rs | 9 +++++ 4 files changed, 115 insertions(+), 3 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 6abdcb88d0..231a604b47 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -651,6 +651,17 @@ pub struct TenantDetails { pub timelines: Vec, } +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Copy, Debug)] +pub enum TimelineArchivalState { + Archived, + Unarchived, +} + +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)] +pub struct TimelineArchivalConfigRequest { + pub state: TimelineArchivalState, +} + /// This represents the output of the "timeline_detail" and "timeline_list" API calls. #[derive(Debug, Serialize, Deserialize, Clone)] pub struct TimelineInfo { diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 4d243ddeb9..087d281a0c 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -397,6 +397,51 @@ paths: "202": description: Tenant scheduled to load successfully + /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/archival_config: + parameters: + - name: tenant_shard_id + in: path + required: true + schema: + type: string + - name: timeline_id + in: path + required: true + schema: + type: string + put: + description: | + Either archives or unarchives the given timeline. + An archived timeline may not have any non-archived children. + requestBody: + required: false + content: + application/json: + schema: + $ref: "#/components/schemas/ArchivalConfigRequest" + responses: + "200": + description: Timeline (un)archived successfully + "409": + description: | + The tenant/timeline is already being modified, perhaps by a concurrent call to this API + content: + application/json: + schema: + $ref: "#/components/schemas/ConflictError" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + "503": + description: Temporarily unavailable, please retry. + content: + application/json: + schema: + $ref: "#/components/schemas/ServiceUnavailableError" + /v1/tenant/{tenant_id}/synthetic_size: parameters: - name: tenant_id @@ -846,6 +891,15 @@ components: warm: type: boolean description: Whether to poll remote storage for layers to download. If false, secondary locations don't download anything. 
+ ArchivalConfigRequest: + type: object + required + - state + properties: + state: + description: The archival state of a timeline + type: string + enum: ["Archived", "Unarchived"] TenantConfig: type: object properties: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index d7ef70477f..b8063eb5a2 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -18,14 +18,17 @@ use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use metrics::launch_timestamp::LaunchTimestamp; use pageserver_api::models::AuxFilePolicy; +use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest; use pageserver_api::models::IngestAuxFilesRequest; use pageserver_api::models::ListAuxFilesRequest; use pageserver_api::models::LocationConfig; use pageserver_api::models::LocationConfigListResponse; +use pageserver_api::models::LocationConfigMode; use pageserver_api::models::LsnLease; use pageserver_api::models::LsnLeaseRequest; use pageserver_api::models::ShardParameters; use pageserver_api::models::TenantDetails; +use pageserver_api::models::TenantLocationConfigRequest; use pageserver_api::models::TenantLocationConfigResponse; use pageserver_api::models::TenantScanRemoteStorageResponse; use pageserver_api::models::TenantScanRemoteStorageShard; @@ -33,12 +36,10 @@ use pageserver_api::models::TenantShardLocation; use pageserver_api::models::TenantShardSplitRequest; use pageserver_api::models::TenantShardSplitResponse; use pageserver_api::models::TenantSorting; +use pageserver_api::models::TimelineArchivalConfigRequest; use pageserver_api::models::TopTenantShardItem; use pageserver_api::models::TopTenantShardsRequest; use pageserver_api::models::TopTenantShardsResponse; -use pageserver_api::models::{ - DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantLocationConfigRequest, -}; use pageserver_api::shard::ShardCount; use pageserver_api::shard::TenantShardId; use remote_storage::DownloadError; @@ -664,6 +665,39 @@ async fn timeline_preserve_initdb_handler( json_response(StatusCode::OK, ()) } +async fn timeline_archival_config_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + + let request_data: TimelineArchivalConfigRequest = json_request(&mut request).await?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); + + async { + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + + tenant + .apply_timeline_archival_config(timeline_id, request_data.state) + .await + .context("applying archival config") + .map_err(ApiError::InternalServerError)?; + Ok::<_, ApiError>(()) + } + .instrument(info_span!("timeline_archival_config", + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug(), + state = ?request_data.state, + %timeline_id)) + .await?; + + json_response(StatusCode::OK, ()) +} + async fn timeline_detail_handler( request: Request, _cancel: CancellationToken, @@ -2789,6 +2823,10 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/preserve_initdb_archive", |r| api_handler(r, timeline_preserve_initdb_handler), ) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/archival_config", + |r| api_handler(r, timeline_archival_config_handler), + ) .get("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| { 
api_handler(r, timeline_detail_handler) }) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 637051413f..01f7ac626b 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -21,6 +21,7 @@ use futures::FutureExt; use futures::StreamExt; use pageserver_api::models; use pageserver_api::models::AuxFilePolicy; +use pageserver_api::models::TimelineArchivalState; use pageserver_api::models::TimelineState; use pageserver_api::models::TopTenantShardItem; use pageserver_api::models::WalRedoManagerStatus; @@ -1228,6 +1229,14 @@ impl Tenant { Ok(timeline_preloads) } + pub async fn apply_timeline_archival_config( + &self, + _timeline_id: TimelineId, + _config: TimelineArchivalState, + ) -> anyhow::Result<()> { + Ok(()) + } + pub(crate) fn tenant_shard_id(&self) -> TenantShardId { self.tenant_shard_id } From 4e547e6274c362bf2779df90db2b0f1c445f9e13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 19 Jul 2024 21:19:30 +0200 Subject: [PATCH 1230/1571] Use DefaultCredentialsChain AWS authentication in remote_storage (#8440) PR #8299 has switched the storage scrubber to use `DefaultCredentialsChain`. Now we do this for `remote_storage`, as it allows us to use `remote_storage` from inside kubernetes. Most of the diff is due to `GenericRemoteStorage::from_config` becoming `async fn`. --- libs/remote_storage/src/lib.rs | 4 +- libs/remote_storage/src/s3_bucket.rs | 64 ++++------ libs/remote_storage/tests/test_real_azure.rs | 7 +- libs/remote_storage/tests/test_real_s3.rs | 7 +- pageserver/ctl/src/main.rs | 2 +- pageserver/src/bin/pageserver.rs | 6 +- pageserver/src/consumption_metrics.rs | 2 +- pageserver/src/deletion_queue.rs | 20 ++- pageserver/src/pgdatadir_mapping.rs | 2 +- pageserver/src/tenant.rs | 116 +++++++++++------- pageserver/src/tenant/mgr.rs | 4 +- .../src/tenant/remote_timeline_client.rs | 2 +- .../src/tenant/storage_layer/delta_layer.rs | 8 +- .../src/tenant/storage_layer/image_layer.rs | 4 +- .../src/tenant/storage_layer/layer/tests.rs | 25 ++-- .../tenant/storage_layer/merge_iterator.rs | 12 +- pageserver/src/tenant/timeline.rs | 5 +- .../walreceiver/connection_manager.rs | 17 +-- pageserver/src/walingest.rs | 16 ++- proxy/src/context/parquet.rs | 10 +- proxy/src/usage_metrics.rs | 14 ++- safekeeper/src/bin/safekeeper.rs | 2 +- safekeeper/src/wal_backup.rs | 26 ++-- 23 files changed, 219 insertions(+), 156 deletions(-) diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index d440c03a0e..3381c4296f 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -443,7 +443,7 @@ impl GenericRemoteStorage> { } impl GenericRemoteStorage { - pub fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result { + pub async fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result { let timeout = storage_config.timeout; Ok(match &storage_config.storage { RemoteStorageKind::LocalFs { local_path: path } => { @@ -458,7 +458,7 @@ impl GenericRemoteStorage { std::env::var("AWS_ACCESS_KEY_ID").unwrap_or_else(|_| "".into()); info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}', profile: {profile}, access_key_id: {access_key_id}", s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint); - Self::AwsS3(Arc::new(S3Bucket::new(s3_config, timeout)?)) + Self::AwsS3(Arc::new(S3Bucket::new(s3_config, timeout).await?)) } RemoteStorageKind::AzureContainer(azure_config) => { let storage_account 
= azure_config diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index ef1bd2c047..b65d8b7e9e 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -16,16 +16,10 @@ use std::{ use anyhow::{anyhow, Context as _}; use aws_config::{ - environment::credentials::EnvironmentVariableCredentialsProvider, - imds::credentials::ImdsCredentialsProvider, - meta::credentials::CredentialsProviderChain, - profile::ProfileFileCredentialsProvider, - provider_config::ProviderConfig, + default_provider::credentials::DefaultCredentialsChain, retry::{RetryConfigBuilder, RetryMode}, - web_identity_token::WebIdentityTokenCredentialsProvider, BehaviorVersion, }; -use aws_credential_types::provider::SharedCredentialsProvider; use aws_sdk_s3::{ config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep}, error::SdkError, @@ -76,40 +70,27 @@ struct GetObjectRequest { } impl S3Bucket { /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided. - pub fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result { + pub async fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result { tracing::debug!( "Creating s3 remote storage for S3 bucket {}", remote_storage_config.bucket_name ); - let region = Some(Region::new(remote_storage_config.bucket_region.clone())); + let region = Region::new(remote_storage_config.bucket_region.clone()); + let region_opt = Some(region.clone()); - let provider_conf = ProviderConfig::without_region().with_region(region.clone()); - - let credentials_provider = { - // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" - CredentialsProviderChain::first_try( - "env", - EnvironmentVariableCredentialsProvider::new(), - ) - // uses "AWS_PROFILE" / `aws sso login --profile ` - .or_else( - "profile-sso", - ProfileFileCredentialsProvider::builder() - .configure(&provider_conf) - .build(), - ) - // uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME" - // needed to access remote extensions bucket - .or_else( - "token", - WebIdentityTokenCredentialsProvider::builder() - .configure(&provider_conf) - .build(), - ) - // uses imds v2 - .or_else("imds", ImdsCredentialsProvider::builder().build()) - }; + // https://docs.aws.amazon.com/sdkref/latest/guide/standardized-credentials.html + // https://docs.rs/aws-config/latest/aws_config/default_provider/credentials/struct.DefaultCredentialsChain.html + // Incomplete list of auth methods used by this: + // * "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" + // * "AWS_PROFILE" / `aws sso login --profile ` + // * "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME" + // * http (ECS/EKS) container credentials + // * imds v2 + let credentials_provider = DefaultCredentialsChain::builder() + .region(region) + .build() + .await; // AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off let sleep_impl: Arc = Arc::new(TokioSleep::new()); @@ -118,9 +99,9 @@ impl S3Bucket { #[allow(deprecated)] /* TODO: https://github.com/neondatabase/neon/issues/7665 */ BehaviorVersion::v2023_11_09(), ) - .region(region) + .region(region_opt) .identity_cache(IdentityCache::lazy().build()) - .credentials_provider(SharedCredentialsProvider::new(credentials_provider)) + .credentials_provider(credentials_provider) .sleep_impl(SharedAsyncSleep::from(sleep_impl)); let sdk_config: aws_config::SdkConfig = std::thread::scope(|s| { @@ -1041,8 +1022,8 @@ mod tests { use crate::{RemotePath, 
S3Bucket, S3Config}; - #[test] - fn relative_path() { + #[tokio::test] + async fn relative_path() { let all_paths = ["", "some/path", "some/path/"]; let all_paths: Vec = all_paths .iter() @@ -1085,8 +1066,9 @@ mod tests { max_keys_per_list_response: Some(5), upload_storage_class: None, }; - let storage = - S3Bucket::new(&config, std::time::Duration::ZERO).expect("remote storage init"); + let storage = S3Bucket::new(&config, std::time::Duration::ZERO) + .await + .expect("remote storage init"); for (test_path_idx, test_path) in all_paths.iter().enumerate() { let result = storage.relative_path_to_s3_object(test_path); let expected = expected_outputs[prefix_idx][test_path_idx]; diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs index 23628dfebe..3a20649490 100644 --- a/libs/remote_storage/tests/test_real_azure.rs +++ b/libs/remote_storage/tests/test_real_azure.rs @@ -31,6 +31,7 @@ struct EnabledAzure { impl EnabledAzure { async fn setup(max_keys_in_list_response: Option) -> Self { let client = create_azure_client(max_keys_in_list_response) + .await .context("Azure client creation") .expect("Azure client creation failed"); @@ -187,7 +188,7 @@ impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs { } } -fn create_azure_client( +async fn create_azure_client( max_keys_per_list_response: Option, ) -> anyhow::Result> { use rand::Rng; @@ -221,6 +222,8 @@ fn create_azure_client( timeout: Duration::from_secs(120), }; Ok(Arc::new( - GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?, + GenericRemoteStorage::from_config(&remote_storage_config) + .await + .context("remote storage init")?, )) } diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index a273abe867..342bc6da0b 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -197,6 +197,7 @@ struct EnabledS3 { impl EnabledS3 { async fn setup(max_keys_in_list_response: Option) -> Self { let client = create_s3_client(max_keys_in_list_response) + .await .context("S3 client creation") .expect("S3 client creation failed"); @@ -352,7 +353,7 @@ impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs { } } -fn create_s3_client( +async fn create_s3_client( max_keys_per_list_response: Option, ) -> anyhow::Result> { use rand::Rng; @@ -385,7 +386,9 @@ fn create_s3_client( timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }; Ok(Arc::new( - GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?, + GenericRemoteStorage::from_config(&remote_storage_config) + .await + .context("remote storage init")?, )) } diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index ea09a011e5..3fabf62987 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -179,7 +179,7 @@ async fn main() -> anyhow::Result<()> { .get("remote_storage") .expect("need remote_storage"); let config = RemoteStorageConfig::from_toml(toml_item)?; - let storage = remote_storage::GenericRemoteStorage::from_config(&config); + let storage = remote_storage::GenericRemoteStorage::from_config(&config).await; let cancel = CancellationToken::new(); storage .unwrap() diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index fceddfb757..ec1ceb54ce 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -385,7 +385,7 @@ fn start_pageserver( let shutdown_pageserver = 
tokio_util::sync::CancellationToken::new(); // Set up remote storage client - let remote_storage = create_remote_storage_client(conf)?; + let remote_storage = BACKGROUND_RUNTIME.block_on(create_remote_storage_client(conf))?; // Set up deletion queue let (deletion_queue, deletion_workers) = DeletionQueue::new( @@ -701,7 +701,7 @@ fn start_pageserver( } } -fn create_remote_storage_client( +async fn create_remote_storage_client( conf: &'static PageServerConf, ) -> anyhow::Result { let config = if let Some(config) = &conf.remote_storage_config { @@ -711,7 +711,7 @@ fn create_remote_storage_client( }; // Create the client - let mut remote_storage = GenericRemoteStorage::from_config(config)?; + let mut remote_storage = GenericRemoteStorage::from_config(config).await?; // If `test_remote_failures` is non-zero, wrap the client with a // wrapper that simulates failures. diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index 6861adad2c..9104da6072 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -96,7 +96,7 @@ pub async fn collect_metrics( .expect("Failed to create http client with timeout"); let bucket_client = if let Some(bucket_config) = metric_collection_bucket { - match GenericRemoteStorage::from_config(bucket_config) { + match GenericRemoteStorage::from_config(bucket_config).await { Ok(client) => Some(client), Err(e) => { // Non-fatal error: if we were given an invalid config, we will proceed diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 3e48552ace..22f7d5b824 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -828,9 +828,9 @@ mod test { } } - fn setup(test_name: &str) -> anyhow::Result { + async fn setup(test_name: &str) -> anyhow::Result { let test_name = Box::leak(Box::new(format!("deletion_queue__{test_name}"))); - let harness = TenantHarness::create(test_name)?; + let harness = TenantHarness::create(test_name).await?; // We do not load() the harness: we only need its config and remote_storage @@ -844,7 +844,9 @@ mod test { }, timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }; - let storage = GenericRemoteStorage::from_config(&storage_config).unwrap(); + let storage = GenericRemoteStorage::from_config(&storage_config) + .await + .unwrap(); let mock_control_plane = MockControlPlane::new(); @@ -922,7 +924,9 @@ mod test { #[tokio::test] async fn deletion_queue_smoke() -> anyhow::Result<()> { // Basic test that the deletion queue processes the deletions we pass into it - let ctx = setup("deletion_queue_smoke").expect("Failed test setup"); + let ctx = setup("deletion_queue_smoke") + .await + .expect("Failed test setup"); let client = ctx.deletion_queue.new_client(); client.recover(HashMap::new())?; @@ -992,7 +996,9 @@ mod test { #[tokio::test] async fn deletion_queue_validation() -> anyhow::Result<()> { - let ctx = setup("deletion_queue_validation").expect("Failed test setup"); + let ctx = setup("deletion_queue_validation") + .await + .expect("Failed test setup"); let client = ctx.deletion_queue.new_client(); client.recover(HashMap::new())?; @@ -1051,7 +1057,9 @@ mod test { #[tokio::test] async fn deletion_queue_recovery() -> anyhow::Result<()> { // Basic test that the deletion queue processes the deletions we pass into it - let mut ctx = setup("deletion_queue_recovery").expect("Failed test setup"); + let mut ctx = setup("deletion_queue_recovery") + .await + .expect("Failed test setup"); let client = 
ctx.deletion_queue.new_client(); client.recover(HashMap::new())?; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index a821b824d0..3bbd084ab4 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -2031,7 +2031,7 @@ mod tests { #[tokio::test] async fn aux_files_round_trip() -> anyhow::Result<()> { let name = "aux_files_round_trip"; - let harness = TenantHarness::create(name)?; + let harness = TenantHarness::create(name).await?; pub const TIMELINE_ID: TimelineId = TimelineId::from_array(hex!("11223344556677881122334455667788")); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 01f7ac626b..6d59752606 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3797,7 +3797,7 @@ pub(crate) mod harness { } impl TenantHarness { - pub fn create_custom( + pub async fn create_custom( test_name: &'static str, tenant_conf: TenantConf, tenant_id: TenantId, @@ -3833,7 +3833,7 @@ pub(crate) mod harness { }, timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }; - let remote_storage = GenericRemoteStorage::from_config(&config).unwrap(); + let remote_storage = GenericRemoteStorage::from_config(&config).await.unwrap(); let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone())); Ok(Self { @@ -3848,7 +3848,7 @@ pub(crate) mod harness { }) } - pub fn create(test_name: &'static str) -> anyhow::Result { + pub async fn create(test_name: &'static str) -> anyhow::Result { // Disable automatic GC and compaction to make the unit tests more deterministic. // The tests perform them manually if needed. let tenant_conf = TenantConf { @@ -3865,6 +3865,7 @@ pub(crate) mod harness { shard, Generation::new(0xdeadbeef), ) + .await } pub fn span(&self) -> tracing::Span { @@ -4001,7 +4002,7 @@ mod tests { #[tokio::test] async fn test_basic() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("test_basic")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_basic").await?.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; @@ -4048,7 +4049,8 @@ mod tests { #[tokio::test] async fn no_duplicate_timelines() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("no_duplicate_timelines")? + let (tenant, ctx) = TenantHarness::create("no_duplicate_timelines") + .await? .load() .await; let _ = tenant @@ -4080,7 +4082,7 @@ mod tests { async fn test_branch() -> anyhow::Result<()> { use std::str::from_utf8; - let (tenant, ctx) = TenantHarness::create("test_branch")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_branch").await?.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; @@ -4202,7 +4204,8 @@ mod tests { #[tokio::test] async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> { let (tenant, ctx) = - TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")? + TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data") + .await? .load() .await; let tline = tenant @@ -4249,7 +4252,8 @@ mod tests { #[tokio::test] async fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> anyhow::Result<()> { let (tenant, ctx) = - TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")? + TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn") + .await? 
.load() .await; @@ -4304,7 +4308,8 @@ mod tests { #[tokio::test] async fn test_get_branchpoints_from_an_inactive_timeline() -> anyhow::Result<()> { let (tenant, ctx) = - TenantHarness::create("test_get_branchpoints_from_an_inactive_timeline")? + TenantHarness::create("test_get_branchpoints_from_an_inactive_timeline") + .await? .load() .await; let tline = tenant @@ -4361,7 +4366,8 @@ mod tests { #[tokio::test] async fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> { let (tenant, ctx) = - TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")? + TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child") + .await? .load() .await; let tline = tenant @@ -4391,10 +4397,10 @@ mod tests { } #[tokio::test] async fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> { - let (tenant, ctx) = - TenantHarness::create("test_parent_keeps_data_forever_after_branching")? - .load() - .await; + let (tenant, ctx) = TenantHarness::create("test_parent_keeps_data_forever_after_branching") + .await? + .load() + .await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; @@ -4432,7 +4438,7 @@ mod tests { #[tokio::test] async fn timeline_load() -> anyhow::Result<()> { const TEST_NAME: &str = "timeline_load"; - let harness = TenantHarness::create(TEST_NAME)?; + let harness = TenantHarness::create(TEST_NAME).await?; { let (tenant, ctx) = harness.load().await; let tline = tenant @@ -4459,7 +4465,7 @@ mod tests { #[tokio::test] async fn timeline_load_with_ancestor() -> anyhow::Result<()> { const TEST_NAME: &str = "timeline_load_with_ancestor"; - let harness = TenantHarness::create(TEST_NAME)?; + let harness = TenantHarness::create(TEST_NAME).await?; // create two timelines { let (tenant, ctx) = harness.load().await; @@ -4507,7 +4513,10 @@ mod tests { #[tokio::test] async fn delta_layer_dumping() -> anyhow::Result<()> { use storage_layer::AsLayerDesc; - let (tenant, ctx) = TenantHarness::create("test_layer_dumping")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_layer_dumping") + .await? + .load() + .await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; @@ -4534,7 +4543,7 @@ mod tests { #[tokio::test] async fn test_images() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("test_images")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_images").await?.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; @@ -4705,7 +4714,7 @@ mod tests { // #[tokio::test] async fn test_bulk_insert() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_bulk_insert")?; + let harness = TenantHarness::create("test_bulk_insert").await?; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) @@ -4736,7 +4745,7 @@ mod tests { // so the search can stop at the first delta layer and doesn't traverse any deeper. 
#[tokio::test] async fn test_get_vectored() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_get_vectored")?; + let harness = TenantHarness::create("test_get_vectored").await?; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) @@ -4814,7 +4823,7 @@ mod tests { #[tokio::test] async fn test_get_vectored_aux_files() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_get_vectored_aux_files")?; + let harness = TenantHarness::create("test_get_vectored_aux_files").await?; let (tenant, ctx) = harness.load().await; let tline = tenant @@ -4900,7 +4909,8 @@ mod tests { TenantId::generate(), ShardIdentity::unsharded(), Generation::new(0xdeadbeef), - )?; + ) + .await?; let (tenant, ctx) = harness.load().await; let mut current_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); @@ -5043,7 +5053,7 @@ mod tests { // ``` #[tokio::test] async fn test_get_vectored_ancestor_descent() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_get_vectored_on_lsn_axis")?; + let harness = TenantHarness::create("test_get_vectored_on_lsn_axis").await?; let (tenant, ctx) = harness.load().await; let start_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); @@ -5192,7 +5202,7 @@ mod tests { name: &'static str, compaction_algorithm: CompactionAlgorithm, ) -> anyhow::Result<()> { - let mut harness = TenantHarness::create(name)?; + let mut harness = TenantHarness::create(name).await?; harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings { kind: compaction_algorithm, }; @@ -5276,7 +5286,8 @@ mod tests { #[tokio::test] async fn test_traverse_branches() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("test_traverse_branches")? + let (tenant, ctx) = TenantHarness::create("test_traverse_branches") + .await? .load() .await; let mut tline = tenant @@ -5366,7 +5377,8 @@ mod tests { #[tokio::test] async fn test_traverse_ancestors() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("test_traverse_ancestors")? + let (tenant, ctx) = TenantHarness::create("test_traverse_ancestors") + .await? .load() .await; let mut tline = tenant @@ -5432,7 +5444,8 @@ mod tests { #[tokio::test] async fn test_write_at_initdb_lsn_takes_optimization_code_path() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("test_empty_test_timeline_is_usable")? + let (tenant, ctx) = TenantHarness::create("test_empty_test_timeline_is_usable") + .await? 
.load() .await; @@ -5501,7 +5514,7 @@ mod tests { #[tokio::test] async fn test_create_guard_crash() -> anyhow::Result<()> { let name = "test_create_guard_crash"; - let harness = TenantHarness::create(name)?; + let harness = TenantHarness::create(name).await?; { let (tenant, ctx) = harness.load().await; let tline = tenant @@ -5554,7 +5567,7 @@ mod tests { name: &'static str, compaction_algorithm: CompactionAlgorithm, ) -> anyhow::Result<()> { - let mut harness = TenantHarness::create(name)?; + let mut harness = TenantHarness::create(name).await?; harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings { kind: compaction_algorithm, }; @@ -5578,7 +5591,7 @@ mod tests { #[tokio::test] async fn test_metadata_scan() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_metadata_scan")?; + let harness = TenantHarness::create("test_metadata_scan").await?; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) @@ -5697,7 +5710,7 @@ mod tests { #[tokio::test] async fn test_metadata_compaction_trigger() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_metadata_compaction_trigger")?; + let harness = TenantHarness::create("test_metadata_compaction_trigger").await?; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) @@ -5756,7 +5769,9 @@ mod tests { #[tokio::test] async fn test_branch_copies_dirty_aux_file_flag() { - let harness = TenantHarness::create("test_branch_copies_dirty_aux_file_flag").unwrap(); + let harness = TenantHarness::create("test_branch_copies_dirty_aux_file_flag") + .await + .unwrap(); // the default aux file policy to switch is v1 if not set by the admins assert_eq!( @@ -5858,7 +5873,9 @@ mod tests { #[tokio::test] async fn aux_file_policy_switch() { - let mut harness = TenantHarness::create("aux_file_policy_switch").unwrap(); + let mut harness = TenantHarness::create("aux_file_policy_switch") + .await + .unwrap(); harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::CrossValidation; // set to cross-validation mode let (tenant, ctx) = harness.load().await; @@ -6032,7 +6049,9 @@ mod tests { #[tokio::test] async fn aux_file_policy_force_switch() { - let mut harness = TenantHarness::create("aux_file_policy_force_switch").unwrap(); + let mut harness = TenantHarness::create("aux_file_policy_force_switch") + .await + .unwrap(); harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V1; let (tenant, ctx) = harness.load().await; @@ -6093,7 +6112,9 @@ mod tests { #[tokio::test] async fn aux_file_policy_auto_detect() { - let mut harness = TenantHarness::create("aux_file_policy_auto_detect").unwrap(); + let mut harness = TenantHarness::create("aux_file_policy_auto_detect") + .await + .unwrap(); harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V2; // set to cross-validation mode let (tenant, ctx) = harness.load().await; @@ -6156,7 +6177,7 @@ mod tests { #[tokio::test] async fn test_metadata_image_creation() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_metadata_image_creation")?; + let harness = TenantHarness::create("test_metadata_image_creation").await?; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) @@ -6255,7 +6276,7 @@ mod tests { #[tokio::test] async fn test_vectored_missing_data_key_reads() -> anyhow::Result<()> { - let harness = 
TenantHarness::create("test_vectored_missing_data_key_reads")?; + let harness = TenantHarness::create("test_vectored_missing_data_key_reads").await?; let (tenant, ctx) = harness.load().await; let base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); @@ -6327,7 +6348,7 @@ mod tests { #[tokio::test] async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads")?; + let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads").await?; let (tenant, ctx) = harness.load().await; let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap(); @@ -6419,7 +6440,7 @@ mod tests { #[tokio::test] async fn test_metadata_tombstone_reads() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_metadata_tombstone_reads")?; + let harness = TenantHarness::create("test_metadata_tombstone_reads").await?; let (tenant, ctx) = harness.load().await; let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap(); let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap(); @@ -6499,7 +6520,9 @@ mod tests { #[tokio::test] async fn test_metadata_tombstone_image_creation() { - let harness = TenantHarness::create("test_metadata_tombstone_image_creation").unwrap(); + let harness = TenantHarness::create("test_metadata_tombstone_image_creation") + .await + .unwrap(); let (tenant, ctx) = harness.load().await; let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap(); @@ -6571,8 +6594,9 @@ mod tests { #[tokio::test] async fn test_metadata_tombstone_empty_image_creation() { - let harness = - TenantHarness::create("test_metadata_tombstone_empty_image_creation").unwrap(); + let harness = TenantHarness::create("test_metadata_tombstone_empty_image_creation") + .await + .unwrap(); let (tenant, ctx) = harness.load().await; let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap(); @@ -6635,7 +6659,7 @@ mod tests { #[tokio::test] async fn test_simple_bottom_most_compaction_images() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_simple_bottom_most_compaction_images")?; + let harness = TenantHarness::create("test_simple_bottom_most_compaction_images").await?; let (tenant, ctx) = harness.load().await; fn get_key(id: u32) -> Key { @@ -6843,7 +6867,7 @@ mod tests { #[tokio::test] async fn test_neon_test_record() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_neon_test_record")?; + let harness = TenantHarness::create("test_neon_test_record").await?; let (tenant, ctx) = harness.load().await; fn get_key(id: u32) -> Key { @@ -6924,7 +6948,7 @@ mod tests { #[tokio::test] async fn test_lsn_lease() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("test_lsn_lease")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_lsn_lease").await?.load().await; let key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let end_lsn = Lsn(0x100); @@ -7013,7 +7037,7 @@ mod tests { #[tokio::test] async fn test_simple_bottom_most_compaction_deltas() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_simple_bottom_most_compaction_deltas")?; + let harness = TenantHarness::create("test_simple_bottom_most_compaction_deltas").await?; let (tenant, ctx) = harness.load().await; fn get_key(id: u32) -> Key { diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index b0159e22bf..4912608677 100644 --- a/pageserver/src/tenant/mgr.rs +++ 
b/pageserver/src/tenant/mgr.rs @@ -2698,7 +2698,9 @@ mod tests { // Test that if an InProgress tenant is in the map during shutdown, the shutdown will gracefully // wait for it to complete before proceeding. - let h = TenantHarness::create("shutdown_awaits_in_progress_tenant").unwrap(); + let h = TenantHarness::create("shutdown_awaits_in_progress_tenant") + .await + .unwrap(); let (t, _ctx) = h.load().await; // harness loads it to active, which is forced and nothing is running on the tenant diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 66b759c8e0..bb42fbeebf 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -2128,7 +2128,7 @@ mod tests { impl TestSetup { async fn new(test_name: &str) -> anyhow::Result { let test_name = Box::leak(Box::new(format!("remote_timeline_client__{test_name}"))); - let harness = TenantHarness::create(test_name)?; + let harness = TenantHarness::create(test_name).await?; let (tenant, ctx) = harness.load().await; let timeline = tenant diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index c34923320a..512e9e86fa 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -1934,7 +1934,7 @@ pub(crate) mod test { #[tokio::test] async fn test_delta_layer_vectored_read_end_to_end() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_delta_layer_oversized_vectored_read")?; + let harness = TenantHarness::create("test_delta_layer_oversized_vectored_read").await?; let (tenant, ctx) = harness.load().await; let timeline_id = TimelineId::generate(); @@ -2034,7 +2034,9 @@ pub(crate) mod test { use crate::walrecord::NeonWalRecord; use bytes::Bytes; - let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke").unwrap(); + let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke") + .await + .unwrap(); let (tenant, ctx) = h.load().await; let ctx = &ctx; let timeline = tenant @@ -2312,7 +2314,7 @@ pub(crate) mod test { #[tokio::test] async fn delta_layer_iterator() { - let harness = TenantHarness::create("delta_layer_iterator").unwrap(); + let harness = TenantHarness::create("delta_layer_iterator").await.unwrap(); let (tenant, ctx) = harness.load().await; let tline = tenant diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 45b47bb62b..19e4e9e2e9 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -1111,6 +1111,7 @@ mod test { ShardIdentity::unsharded(), get_next_gen(), ) + .await .unwrap(); let (tenant, ctx) = harness.load().await; let timeline = tenant @@ -1177,6 +1178,7 @@ mod test { // But here, all we care about is that the gen number is unique. 
get_next_gen(), ) + .await .unwrap(); let (tenant, ctx) = harness.load().await; let timeline = tenant @@ -1308,7 +1310,7 @@ mod test { #[tokio::test] async fn image_layer_iterator() { - let harness = TenantHarness::create("image_layer_iterator").unwrap(); + let harness = TenantHarness::create("image_layer_iterator").await.unwrap(); let (tenant, ctx) = harness.load().await; let tline = tenant diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index 3a7aca7a6c..8a3737f8a7 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -22,7 +22,7 @@ const FOREVER: std::time::Duration = std::time::Duration::from_secs(ADVANCE.as_s async fn smoke_test() { let handle = tokio::runtime::Handle::current(); - let h = TenantHarness::create("smoke_test").unwrap(); + let h = TenantHarness::create("smoke_test").await.unwrap(); let span = h.span(); let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); let (tenant, _) = h.load().await; @@ -176,7 +176,9 @@ async fn evict_and_wait_on_wanted_deleted() { // this is the runtime on which Layer spawns the blocking tasks on let handle = tokio::runtime::Handle::current(); - let h = TenantHarness::create("evict_and_wait_on_wanted_deleted").unwrap(); + let h = TenantHarness::create("evict_and_wait_on_wanted_deleted") + .await + .unwrap(); utils::logging::replace_panic_hook_with_tracing_panic_hook().forget(); let (tenant, ctx) = h.load().await; @@ -258,7 +260,9 @@ fn read_wins_pending_eviction() { rt.block_on(async move { // this is the runtime on which Layer spawns the blocking tasks on let handle = tokio::runtime::Handle::current(); - let h = TenantHarness::create("read_wins_pending_eviction").unwrap(); + let h = TenantHarness::create("read_wins_pending_eviction") + .await + .unwrap(); let (tenant, ctx) = h.load().await; let span = h.span(); let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); @@ -390,7 +394,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) { rt.block_on(async move { // this is the runtime on which Layer spawns the blocking tasks on let handle = tokio::runtime::Handle::current(); - let h = TenantHarness::create(name).unwrap(); + let h = TenantHarness::create(name).await.unwrap(); let (tenant, ctx) = h.load().await; let span = h.span(); let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); @@ -559,8 +563,9 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) { #[tokio::test(start_paused = true)] async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() { let handle = tokio::runtime::Handle::current(); - let h = - TenantHarness::create("cancelled_get_or_maybe_download_does_not_cancel_eviction").unwrap(); + let h = TenantHarness::create("cancelled_get_or_maybe_download_does_not_cancel_eviction") + .await + .unwrap(); let (tenant, ctx) = h.load().await; let timeline = tenant @@ -636,7 +641,9 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() { #[tokio::test(start_paused = true)] async fn evict_and_wait_does_not_wait_for_download() { // let handle = tokio::runtime::Handle::current(); - let h = TenantHarness::create("evict_and_wait_does_not_wait_for_download").unwrap(); + let h = TenantHarness::create("evict_and_wait_does_not_wait_for_download") + .await + .unwrap(); let (tenant, ctx) = h.load().await; let span = h.span(); let 
download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); @@ -733,7 +740,9 @@ async fn eviction_cancellation_on_drop() { // this is the runtime on which Layer spawns the blocking tasks on let handle = tokio::runtime::Handle::current(); - let h = TenantHarness::create("eviction_cancellation_on_drop").unwrap(); + let h = TenantHarness::create("eviction_cancellation_on_drop") + .await + .unwrap(); utils::logging::replace_panic_hook_with_tracing_panic_hook().forget(); let (tenant, ctx) = h.load().await; diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs index 6f59b2fd77..eb4a1f28a1 100644 --- a/pageserver/src/tenant/storage_layer/merge_iterator.rs +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -293,7 +293,9 @@ mod tests { use crate::repository::Value; use bytes::Bytes; - let harness = TenantHarness::create("merge_iterator_merge_in_between").unwrap(); + let harness = TenantHarness::create("merge_iterator_merge_in_between") + .await + .unwrap(); let (tenant, ctx) = harness.load().await; let tline = tenant @@ -356,7 +358,9 @@ mod tests { use crate::repository::Value; use bytes::Bytes; - let harness = TenantHarness::create("merge_iterator_delta_merge").unwrap(); + let harness = TenantHarness::create("merge_iterator_delta_merge") + .await + .unwrap(); let (tenant, ctx) = harness.load().await; let tline = tenant @@ -430,7 +434,9 @@ mod tests { use crate::repository::Value; use bytes::Bytes; - let harness = TenantHarness::create("merge_iterator_delta_image_mixed_merge").unwrap(); + let harness = TenantHarness::create("merge_iterator_delta_image_mixed_merge") + .await + .unwrap(); let (tenant, ctx) = harness.load().await; let tline = tenant diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 3d3d3ac34d..19b1396981 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -6046,8 +6046,9 @@ mod tests { #[tokio::test] async fn two_layer_eviction_attempts_at_the_same_time() { - let harness = - TenantHarness::create("two_layer_eviction_attempts_at_the_same_time").unwrap(); + let harness = TenantHarness::create("two_layer_eviction_attempts_at_the_same_time") + .await + .unwrap(); let (tenant, ctx) = harness.load().await; let timeline = tenant diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 1d2ffec08f..de50f217d8 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -1118,7 +1118,7 @@ mod tests { #[tokio::test] async fn no_connection_no_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("no_connection_no_candidate")?; + let harness = TenantHarness::create("no_connection_no_candidate").await?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -1151,7 +1151,7 @@ mod tests { #[tokio::test] async fn connection_no_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("connection_no_candidate")?; + let harness = TenantHarness::create("connection_no_candidate").await?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -1216,7 +1216,7 @@ mod tests { #[tokio::test] async fn no_connection_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("no_connection_candidate")?; + let harness = 
TenantHarness::create("no_connection_candidate").await?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -1279,7 +1279,7 @@ mod tests { #[tokio::test] async fn candidate_with_many_connection_failures() -> anyhow::Result<()> { - let harness = TenantHarness::create("candidate_with_many_connection_failures")?; + let harness = TenantHarness::create("candidate_with_many_connection_failures").await?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -1319,7 +1319,7 @@ mod tests { #[tokio::test] async fn lsn_wal_over_threshold_current_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("lsn_wal_over_threshcurrent_candidate")?; + let harness = TenantHarness::create("lsn_wal_over_threshcurrent_candidate").await?; let mut state = dummy_state(&harness).await; let current_lsn = Lsn(100_000).align(); let now = Utc::now().naive_utc(); @@ -1385,7 +1385,8 @@ mod tests { #[tokio::test] async fn timeout_connection_threshold_current_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("timeout_connection_threshold_current_candidate")?; + let harness = + TenantHarness::create("timeout_connection_threshold_current_candidate").await?; let mut state = dummy_state(&harness).await; let current_lsn = Lsn(100_000).align(); let now = Utc::now().naive_utc(); @@ -1448,7 +1449,7 @@ mod tests { #[tokio::test] async fn timeout_wal_over_threshold_current_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("timeout_wal_over_threshold_current_candidate")?; + let harness = TenantHarness::create("timeout_wal_over_threshold_current_candidate").await?; let mut state = dummy_state(&harness).await; let current_lsn = Lsn(100_000).align(); let new_lsn = Lsn(100_100).align(); @@ -1550,7 +1551,7 @@ mod tests { // and pageserver should prefer to connect to it. let test_az = Some("test_az".to_owned()); - let harness = TenantHarness::create("switch_to_same_availability_zone")?; + let harness = TenantHarness::create("switch_to_same_availability_zone").await?; let mut state = dummy_state(&harness).await; state.conf.availability_zone.clone_from(&test_az); let current_lsn = Lsn(100_000).align(); diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 07c90385e6..dff3a8f52d 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1754,7 +1754,7 @@ mod tests { #[tokio::test] async fn test_relsize() -> Result<()> { - let (tenant, ctx) = TenantHarness::create("test_relsize")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_relsize").await?.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) .await?; @@ -1975,7 +1975,10 @@ mod tests { // and then created it again within the same layer. #[tokio::test] async fn test_drop_extend() -> Result<()> { - let (tenant, ctx) = TenantHarness::create("test_drop_extend")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_drop_extend") + .await? + .load() + .await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) .await?; @@ -2046,7 +2049,10 @@ mod tests { // and then extended it again within the same layer. #[tokio::test] async fn test_truncate_extend() -> Result<()> { - let (tenant, ctx) = TenantHarness::create("test_truncate_extend")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_truncate_extend") + .await? 
+ .load() + .await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) .await?; @@ -2188,7 +2194,7 @@ mod tests { /// split into multiple 1 GB segments in Postgres. #[tokio::test] async fn test_large_rel() -> Result<()> { - let (tenant, ctx) = TenantHarness::create("test_large_rel")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_large_rel").await?.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) .await?; @@ -2296,7 +2302,7 @@ mod tests { let startpoint = Lsn::from_hex("14AEC08").unwrap(); let _endpoint = Lsn::from_hex("1FFFF98").unwrap(); - let harness = TenantHarness::create("test_ingest_real_wal").unwrap(); + let harness = TenantHarness::create("test_ingest_real_wal").await.unwrap(); let (tenant, ctx) = harness.load().await; let remote_initdb_path = diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index cfc1f8e89e..543a458274 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -181,8 +181,9 @@ pub async fn worker( let rx = futures::stream::poll_fn(move |cx| rx.poll_recv(cx)); let rx = rx.map(RequestData::from); - let storage = - GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?; + let storage = GenericRemoteStorage::from_config(&remote_storage_config) + .await + .context("remote storage init")?; let properties = WriterProperties::builder() .set_data_page_size_limit(config.parquet_upload_page_size) @@ -217,6 +218,7 @@ pub async fn worker( let storage_disconnect = GenericRemoteStorage::from_config(&disconnect_events_storage_config) + .await .context("remote storage for disconnect events init")?; let parquet_config_disconnect = parquet_config.clone(); tokio::try_join!( @@ -545,7 +547,9 @@ mod tests { }, timeout: std::time::Duration::from_secs(120), }; - let storage = GenericRemoteStorage::from_config(&remote_storage_config).unwrap(); + let storage = GenericRemoteStorage::from_config(&remote_storage_config) + .await + .unwrap(); worker_inner(storage, rx, config).await.unwrap(); diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index 56ed2145dc..a8735fe0bb 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -357,11 +357,15 @@ pub async fn task_backup( info!("metrics backup has shut down"); } // Even if the remote storage is not configured, we still want to clear the metrics. 
- let storage = backup_config - .remote_storage_config - .as_ref() - .map(|config| GenericRemoteStorage::from_config(config).context("remote storage init")) - .transpose()?; + let storage = if let Some(config) = backup_config.remote_storage_config.as_ref() { + Some( + GenericRemoteStorage::from_config(config) + .await + .context("remote storage init")?, + ) + } else { + None + }; let mut ticker = tokio::time::interval(backup_config.interval); let mut prev = Utc::now(); let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned(); diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 9eb6546d6b..2365fd0587 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -418,7 +418,7 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { let timeline_collector = safekeeper::metrics::TimelineCollector::new(); metrics::register_internal(Box::new(timeline_collector))?; - wal_backup::init_remote_storage(&conf); + wal_backup::init_remote_storage(&conf).await; // Keep handles to main tasks to die if any of them disappears. let mut tasks_handles: FuturesUnordered> = diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 5a590689c3..7ecee178f3 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -22,7 +22,7 @@ use tokio::fs::File; use tokio::select; use tokio::sync::mpsc::{self, Receiver, Sender}; -use tokio::sync::watch; +use tokio::sync::{watch, OnceCell}; use tokio::time::sleep; use tracing::*; @@ -33,8 +33,6 @@ use crate::timeline::{PeerInfo, WalResidentTimeline}; use crate::timeline_manager::{Manager, StateSnapshot}; use crate::{SafeKeeperConf, WAL_BACKUP_RUNTIME}; -use once_cell::sync::OnceCell; - const UPLOAD_FAILURE_RETRY_MIN_MS: u64 = 10; const UPLOAD_FAILURE_RETRY_MAX_MS: u64 = 5000; @@ -167,7 +165,7 @@ fn determine_offloader( } } -static REMOTE_STORAGE: OnceCell> = OnceCell::new(); +static REMOTE_STORAGE: OnceCell> = OnceCell::const_new(); // Storage must be configured and initialized when this is called. fn get_configured_remote_storage() -> &'static GenericRemoteStorage { @@ -178,14 +176,22 @@ fn get_configured_remote_storage() -> &'static GenericRemoteStorage { .unwrap() } -pub fn init_remote_storage(conf: &SafeKeeperConf) { +pub async fn init_remote_storage(conf: &SafeKeeperConf) { // TODO: refactor REMOTE_STORAGE to avoid using global variables, and provide // dependencies to all tasks instead. - REMOTE_STORAGE.get_or_init(|| { - conf.remote_storage - .as_ref() - .map(|c| GenericRemoteStorage::from_config(c).expect("failed to create remote storage")) - }); + REMOTE_STORAGE + .get_or_init(|| async { + if let Some(conf) = conf.remote_storage.as_ref() { + Some( + GenericRemoteStorage::from_config(conf) + .await + .expect("failed to create remote storage"), + ) + } else { + None + } + }) + .await; } struct WalBackupTask { From 7996bce6d66cc1b879cfefd7ff9826b14949259c Mon Sep 17 00:00:00 2001 From: Anton Chaporgin Date: Mon, 22 Jul 2024 11:02:22 +0300 Subject: [PATCH 1231/1571] [proxy/redis] impr: use redis_auth_type to switch between auth types (#8428) ## Problem On Azure we need to use username-password authentication in proxy for regional redis client. ## Summary of changes This adds `redis_auth_type` to the config with default value of "irsa". Not specifying it will enforce the `regional_redis_client` to be configured with IRSA redis (as it's done now). 
If "plain" is specified, then the regional client is configured with `redis_notifications`, consuming username:password auth from the URI. We plan to do that for the Azure cloud. Configuring `regional_redis_client` is now required; there is no opt-out from configuring it. https://github.com/neondatabase/cloud/issues/14462 --- proxy/src/bin/proxy.rs | 43 +++++++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 7f4cb2c010..7314710508 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -176,6 +176,9 @@ struct ProxyCliArgs { /// redis url for notifications (if empty, redis_host:port will be used for both notifications and streaming connections) #[clap(long)] redis_notifications: Option, + /// what from the available authentications type to use for the regional redis we have. Supported are "irsa" and "plain". + #[clap(long, default_value = "irsa")] + redis_auth_type: String, /// redis host for streaming connections (might be different from the notifications host) #[clap(long)] redis_host: Option, @@ -319,24 +322,38 @@ async fn main() -> anyhow::Result<()> { ), aws_credentials_provider, )); - let regional_redis_client = match (args.redis_host, args.redis_port) { - (Some(host), Some(port)) => Some( - ConnectionWithCredentialsProvider::new_with_credentials_provider( - host, - port, - elasticache_credentials_provider.clone(), + let regional_redis_client = match (args.redis_auth_type.as_str(), &args.redis_notifications) { + ("plain", redis_url) => match redis_url { + None => { + bail!("plain auth requires redis_notifications to be set"); + } + Some(url) => Some( + ConnectionWithCredentialsProvider::new_with_static_credentials(url.to_string()), ), - ), - (None, None) => { - warn!("Redis events from console are disabled"); - None - } + }, + ("irsa", _) => match (&args.redis_host, args.redis_port) { + (Some(host), Some(port)) => Some( + ConnectionWithCredentialsProvider::new_with_credentials_provider( + host.to_string(), + port, + elasticache_credentials_provider.clone(), + ), + ), + (None, None) => { + warn!("irsa auth requires redis-host and redis-port to be set, continuing without regional_redis_client"); + None + } + _ => { + bail!("redis-host and redis-port must be specified together"); + } + }, _ => { - bail!("redis-host and redis-port must be specified together"); + bail!("unknown auth type given"); } }; + let redis_notifications_client = if let Some(url) = args.redis_notifications { - Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url)) + Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url.to_string())) } else { regional_redis_client.clone() }; From fd8a7a722351a985390885f791b3ec19c5afaab8 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." Date: Mon, 22 Jul 2024 04:22:07 -0400 Subject: [PATCH 1232/1571] fix(docs): race on monotonic rfc id (#8445) ## Problem We have two RFCs numbered 34. ## Summary of changes ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section.
## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist Signed-off-by: Alex Chi Z --- docs/rfcs/{034-timeline-archive.md => 035-timeline-archive.md} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename docs/rfcs/{034-timeline-archive.md => 035-timeline-archive.md} (100%) diff --git a/docs/rfcs/034-timeline-archive.md b/docs/rfcs/035-timeline-archive.md similarity index 100% rename from docs/rfcs/034-timeline-archive.md rename to docs/rfcs/035-timeline-archive.md From ebda667ef8780ada13f3549306ac65f8b456440d Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 22 Jul 2024 11:50:30 +0100 Subject: [PATCH 1233/1571] tests: more generous memory allowance in test_compaction_l0_memory (#8446) ## Problem This test is new, the limit was set experimentally and it turns out the memory consumption in CI runs varies more than expected. Example failure: https://neon-github-public-dev.s3.amazonaws.com/reports/main/10010912745/index.html#suites/9eebd1154fe19f9311ca7613f38156a1/82e40cf86a243ad5/ --- test_runner/performance/test_compaction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/performance/test_compaction.py b/test_runner/performance/test_compaction.py index 077b76104c..3c6f0b0131 100644 --- a/test_runner/performance/test_compaction.py +++ b/test_runner/performance/test_compaction.py @@ -135,7 +135,7 @@ def test_compaction_l0_memory(neon_compare: NeonCompare): # To be fixed in https://github.com/neondatabase/neon/issues/8184, after which # this memory estimate can be revised far downwards to something that doesn't scale # linearly with the layer sizes. - MEMORY_ESTIMATE = (initial_l0s_size - final_l0s_size) * 1.25 + MEMORY_ESTIMATE = (initial_l0s_size - final_l0s_size) * 1.5 # If we find that compaction is using more memory, this may indicate a regression assert compaction_mapped_rss < MEMORY_ESTIMATE From 98af1e365bbbf71f21a3317313dc2407d7f8937a Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 22 Jul 2024 13:15:55 +0100 Subject: [PATCH 1234/1571] pageserver: remove absolute-order disk usage eviction (#8454) ## Problem Deployed pageserver configurations are all like this: ``` disk_usage_based_eviction: max_usage_pct: 85 min_avail_bytes: 0 period: "10s" eviction_order: type: "RelativeAccessed" args: highest_layer_count_loses_first: true ``` But we're maintaining this optional absolute order eviction, with test cases etc. ## Summary of changes - Remove absolute order eviction. Make the default eviction policy the same as how we really deploy pageservers. 
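As an illustrative aside (not part of the patch itself): the sketch below shows how an adjacently tagged `eviction_order` value like the one above maps onto a Rust enum and what the new default resolves to. It is a minimal, self-contained example that assumes `serde` (with the `derive` feature) and `serde_json` as dependencies; `serde_json` stands in here only for illustration, since the pageserver parses its own config format.

```
use serde::{Deserialize, Serialize};

// Shape mirrors the `type`/`args` tagging used for `eviction_order`; the only
// remaining variant after this patch is `RelativeAccessed`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "type", content = "args")]
enum EvictionOrder {
    RelativeAccessed { highest_layer_count_loses_first: bool },
}

impl Default for EvictionOrder {
    fn default() -> Self {
        Self::RelativeAccessed {
            highest_layer_count_loses_first: true,
        }
    }
}

fn main() {
    // Leaving `eviction_order` unset now resolves to the relative policy.
    assert_eq!(
        EvictionOrder::default(),
        EvictionOrder::RelativeAccessed {
            highest_layer_count_loses_first: true
        }
    );

    // The explicit config shown in the problem statement deserializes to the same value.
    let parsed: EvictionOrder = serde_json::from_str(
        r#"{"type": "RelativeAccessed", "args": {"highest_layer_count_loses_first": true}}"#,
    )
    .expect("valid eviction_order config");
    assert_eq!(parsed, EvictionOrder::default());
    println!("eviction order: {parsed:?}");
}
```

The practical effect is that an unset `eviction_order` now behaves exactly like the deployed configuration, rather than silently selecting a policy that is no longer used anywhere.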
--- pageserver/src/config.rs | 2 +- pageserver/src/disk_usage_eviction_task.rs | 24 ++--- .../regress/test_disk_usage_eviction.py | 96 ++++++------------- 3 files changed, 40 insertions(+), 82 deletions(-) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 35b4e79365..6a78d126cf 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -1601,7 +1601,7 @@ threshold = "20m" period: Duration::from_secs(10), #[cfg(feature = "testing")] mock_statvfs: None, - eviction_order: crate::disk_usage_eviction_task::EvictionOrder::AbsoluteAccessed, + eviction_order: Default::default(), }) ); diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 90bd4294bb..103e549d22 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -83,17 +83,9 @@ pub struct DiskUsageEvictionTaskConfig { /// Selects the sort order for eviction candidates *after* per tenant `min_resident_size` /// partitioning. -#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] #[serde(tag = "type", content = "args")] pub enum EvictionOrder { - /// Order the layers to be evicted by how recently they have been accessed in absolute - /// time. - /// - /// This strategy is unfair when some tenants grow faster than others towards the slower - /// growing. - #[default] - AbsoluteAccessed, - /// Order the layers to be evicted by how recently they have been accessed relatively within /// the set of resident layers of a tenant. RelativeAccessed { @@ -108,6 +100,14 @@ pub enum EvictionOrder { }, } +impl Default for EvictionOrder { + fn default() -> Self { + Self::RelativeAccessed { + highest_layer_count_loses_first: true, + } + } +} + fn default_highest_layer_count_loses_first() -> bool { true } @@ -117,11 +117,6 @@ impl EvictionOrder { use EvictionOrder::*; match self { - AbsoluteAccessed => { - candidates.sort_unstable_by_key(|(partition, candidate)| { - (*partition, candidate.last_activity_ts) - }); - } RelativeAccessed { .. 
} => candidates.sort_unstable_by_key(|(partition, candidate)| { (*partition, candidate.relative_last_activity) }), @@ -134,7 +129,6 @@ impl EvictionOrder { use EvictionOrder::*; match self { - AbsoluteAccessed => finite_f32::FiniteF32::ZERO, RelativeAccessed { highest_layer_count_loses_first, } => { diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 3c834f430b..930fb14947 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -67,14 +67,11 @@ def test_min_resident_size_override_handling( @enum.unique class EvictionOrder(str, enum.Enum): - ABSOLUTE_ORDER = "absolute" RELATIVE_ORDER_EQUAL = "relative_equal" RELATIVE_ORDER_SPARE = "relative_spare" def config(self) -> Dict[str, Any]: - if self == EvictionOrder.ABSOLUTE_ORDER: - return {"type": "AbsoluteAccessed"} - elif self == EvictionOrder.RELATIVE_ORDER_EQUAL: + if self == EvictionOrder.RELATIVE_ORDER_EQUAL: return { "type": "RelativeAccessed", "args": {"highest_layer_count_loses_first": False}, @@ -384,7 +381,7 @@ def test_broken_tenants_are_skipped(eviction_env: EvictionEnv): @pytest.mark.parametrize( "order", - [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL], + [EvictionOrder.RELATIVE_ORDER_EQUAL], ) def test_pageserver_evicts_until_pressure_is_relieved( eviction_env: EvictionEnv, order: EvictionOrder @@ -418,7 +415,7 @@ def test_pageserver_evicts_until_pressure_is_relieved( @pytest.mark.parametrize( "order", - [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL], + [EvictionOrder.RELATIVE_ORDER_EQUAL], ) def test_pageserver_respects_overridden_resident_size( eviction_env: EvictionEnv, order: EvictionOrder @@ -495,7 +492,7 @@ def test_pageserver_respects_overridden_resident_size( @pytest.mark.parametrize( "order", - [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL], + [EvictionOrder.RELATIVE_ORDER_EQUAL], ) def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv, order: EvictionOrder): """ @@ -526,7 +523,6 @@ def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv, order: E @pytest.mark.parametrize( "order", [ - EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL, EvictionOrder.RELATIVE_ORDER_SPARE, ], @@ -572,63 +568,38 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder): later_tenant_usage < du_by_timeline[tenant] ), "all tenants should have lost some layers" - warm_size = later_du_by_timeline[warm] - cold_size = later_du_by_timeline[cold] + # with relative order what matters is the amount of layers, with a + # fudge factor of whether the eviction bothers tenants with highest + # layer count the most. last accessed times between tenants does not + # matter. + assert order in [EvictionOrder.RELATIVE_ORDER_EQUAL, EvictionOrder.RELATIVE_ORDER_SPARE] + layers_now = env.count_layers_per_tenant(env.pageserver) - if order == EvictionOrder.ABSOLUTE_ORDER: - # bounds for warmed_size - warm_lower = 0.5 * du_by_timeline[warm] + expected_ratio = later_total_on_disk / total_on_disk + log.info( + f"freed up {100 * expected_ratio}%, expecting the layer counts to decrease in similar ratio" + ) - # We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room. - # So, check for up to 3 here. 
- warm_upper = warm_lower + 3 * env.layer_size + for tenant_id, original_count in tenant_layers.items(): + count_now = layers_now[tenant_id] + ratio = count_now / original_count + abs_diff = abs(ratio - expected_ratio) + assert original_count > count_now - cold_upper = 2 * env.layer_size - log.info(f"tenants: warm={warm[0]}, cold={cold[0]}") + expectation = 0.06 log.info( - f"expecting for warm tenant: {human_bytes(warm_lower)} < {human_bytes(warm_size)} < {human_bytes(warm_upper)}" + f"tenant {tenant_id} layer count {original_count} -> {count_now}, ratio: {ratio}, expecting {abs_diff} < {expectation}" ) - log.info(f"expecting for cold tenant: {human_bytes(cold_size)} < {human_bytes(cold_upper)}") - - assert warm_size > warm_lower, "warmed up tenant should be at about half size (lower)" - assert warm_size < warm_upper, "warmed up tenant should be at about half size (upper)" - - assert ( - cold_size < cold_upper - ), "the cold tenant should be evicted to its min_resident_size, i.e., max layer file size" - else: - # with relative order what matters is the amount of layers, with a - # fudge factor of whether the eviction bothers tenants with highest - # layer count the most. last accessed times between tenants does not - # matter. - layers_now = env.count_layers_per_tenant(env.pageserver) - - expected_ratio = later_total_on_disk / total_on_disk - log.info( - f"freed up {100 * expected_ratio}%, expecting the layer counts to decrease in similar ratio" - ) - - for tenant_id, original_count in tenant_layers.items(): - count_now = layers_now[tenant_id] - ratio = count_now / original_count - abs_diff = abs(ratio - expected_ratio) - assert original_count > count_now - - expectation = 0.06 - log.info( - f"tenant {tenant_id} layer count {original_count} -> {count_now}, ratio: {ratio}, expecting {abs_diff} < {expectation}" - ) - # in this test case both relative_spare and relative_equal produce - # the same outcomes; this must be a quantization effect of similar - # sizes (-s4 and -s6) and small (5MB) layer size. - # for pg15 and pg16 the absdiff is < 0.01, for pg14 it is closer to 0.02 - assert abs_diff < expectation + # in this test case both relative_spare and relative_equal produce + # the same outcomes; this must be a quantization effect of similar + # sizes (-s4 and -s6) and small (5MB) layer size. 
+ # for pg15 and pg16 the absdiff is < 0.01, for pg14 it is closer to 0.02 + assert abs_diff < expectation @pytest.mark.parametrize( "order", [ - EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL, EvictionOrder.RELATIVE_ORDER_SPARE, ], @@ -680,14 +651,7 @@ def test_fast_growing_tenant(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, or ), "rest of the assertions expect 3 + 1 timelines, ratios, scales, all in order" log.info(f"{ratios}") - if order == EvictionOrder.ABSOLUTE_ORDER: - # first tenant loses most - assert ratios[0] <= ratios[1], "first should lose the most" - assert ratios[1] < ratios[2], "second should lose some" - assert ratios[1] < 1.0 - assert ratios[2] <= ratios[3], "third might not lose" - assert ratios[3] == 1.0, "tenant created last does not lose" - elif order == EvictionOrder.RELATIVE_ORDER_EQUAL: + if order == EvictionOrder.RELATIVE_ORDER_EQUAL: assert all([x for x in ratios if x < 1.0]), "all tenants lose layers" elif order == EvictionOrder.RELATIVE_ORDER_SPARE: # with different layer sizes and pg versions, there are different combinations @@ -750,7 +714,7 @@ def test_statvfs_error_handling(eviction_env: EvictionEnv): "type": "Failure", "mocked_error": "EIO", }, - eviction_order=EvictionOrder.ABSOLUTE_ORDER, + eviction_order=EvictionOrder.RELATIVE_ORDER_SPARE, ) env.neon_env.pageserver.assert_log_contains(".*statvfs failed.*EIO") @@ -784,7 +748,7 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv): # This avoids accounting for metadata files & tenant conf in the tests. "name_filter": ".*__.*", }, - eviction_order=EvictionOrder.ABSOLUTE_ORDER, + eviction_order=EvictionOrder.RELATIVE_ORDER_SPARE, ) wait_until( @@ -837,7 +801,7 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv): # This avoids accounting for metadata files & tenant conf in the tests. "name_filter": ".*__.*", }, - eviction_order=EvictionOrder.ABSOLUTE_ORDER, + eviction_order=EvictionOrder.RELATIVE_ORDER_SPARE, ) wait_until( From 8d948f2e076332c06e4cc59ed47fe45ea8374140 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 22 Jul 2024 14:17:02 +0100 Subject: [PATCH 1235/1571] tests: make test_change_pageserver more robust (#8442) ## Problem This test predates the storage controller. It stops pageservers and reconfigures computes, but that races with the storage controller's node failure detection, which can result in restarting nodes not getting the attachments they expect, and the test failing ## Summary of changes - Configure the storage controller to use a compute notify hook that does nothing, so that it cannot interfere with the test's configuration of computes. - Instead of using the attach hook, just notify the storage controller that nodes are offline, and reconcile tenants so that they will automatically be attached to the other node. 
--- test_runner/regress/test_change_pageserver.py | 33 ++++++++++++++++--- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/test_runner/regress/test_change_pageserver.py b/test_runner/regress/test_change_pageserver.py index 4d2cdb8e32..34791e5988 100644 --- a/test_runner/regress/test_change_pageserver.py +++ b/test_runner/regress/test_change_pageserver.py @@ -3,9 +3,16 @@ import asyncio from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.remote_storage import RemoteStorageKind +from werkzeug.wrappers.request import Request +from werkzeug.wrappers.response import Response -def test_change_pageserver(neon_env_builder: NeonEnvBuilder): +def test_change_pageserver(neon_env_builder: NeonEnvBuilder, make_httpserver): + """ + A relatively low level test of reconfiguring a compute's pageserver at runtime. Usually this + is all done via the storage controller, but this test will disable the storage controller's compute + notifications, and instead update endpoints directly. + """ num_connections = 3 neon_env_builder.num_pageservers = 2 @@ -14,9 +21,24 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder): ) env = neon_env_builder.init_start() + neon_env_builder.control_plane_compute_hook_api = ( + f"http://{make_httpserver.host}:{make_httpserver.port}/notify-attach" + ) + + def ignore_notify(request: Request): + # This test does direct updates to compute configuration: disable the storage controller's notification + log.info(f"Ignoring storage controller compute notification: {request.json}") + return Response(status=200) + + make_httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler( + ignore_notify + ) + env.neon_cli.create_branch("test_change_pageserver") endpoint = env.endpoints.create_start("test_change_pageserver") + # Put this tenant into a dual-attached state + assert env.get_tenant_pageserver(env.initial_tenant) == env.pageservers[0] alt_pageserver_id = env.pageservers[1].id env.pageservers[1].tenant_attach(env.initial_tenant) @@ -72,6 +94,7 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder): env.pageservers[ 0 ].stop() # Stop the old pageserver just to make sure we're reading from the new one + env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Offline"}) execute("SELECT count(*) FROM foo") assert fetchone() == (100000,) @@ -82,9 +105,10 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder): # # Since we're dual-attached, need to tip-off storage controller to treat the one we're # about to start as the attached pageserver - env.storage_controller.attach_hook_issue(env.initial_tenant, env.pageservers[0].id) env.pageservers[0].start() env.pageservers[1].stop() + env.storage_controller.node_configure(env.pageservers[1].id, {"availability": "Offline"}) + env.storage_controller.reconcile_until_idle() endpoint.reconfigure(pageserver_id=env.pageservers[0].id) @@ -92,10 +116,9 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder): assert fetchone() == (100000,) env.pageservers[0].stop() - # Since we're dual-attached, need to tip-off storage controller to treat the one we're - # about to start as the attached pageserver - env.storage_controller.attach_hook_issue(env.initial_tenant, env.pageservers[1].id) env.pageservers[1].start() + env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Offline"}) + env.storage_controller.reconcile_until_idle() # Test a (former) bug where a child process spins without updating its 
connection string # by executing a query separately. This query will hang until we issue the reconfigure. From 204bb8faa32975cdbf3546e0a78f3387b511e589 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 22 Jul 2024 15:49:30 +0200 Subject: [PATCH 1236/1571] Start using remote_storage in S3 scrubber for PurgeGarbage (#7932) Starts using the `remote_storage` crate in the S3 scrubber for the `PurgeGarbage` subcommand. The `remote_storage` crate is generic over various backends and thus using it gives us the ability to run the scrubber against all supported backends. Start with the `PurgeGarbage` subcommand as it doesn't use `stream_tenants`. Part of #7547. --- libs/remote_storage/src/azure_blob.rs | 6 ++ libs/remote_storage/src/lib.rs | 10 +++ libs/remote_storage/src/s3_bucket.rs | 4 ++ storage_scrubber/src/garbage.rs | 99 +++++++++++++-------------- storage_scrubber/src/lib.rs | 51 ++++++++++++-- 5 files changed, 114 insertions(+), 56 deletions(-) diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 8e590b17c4..d0146238da 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -40,6 +40,7 @@ use crate::{ pub struct AzureBlobStorage { client: ContainerClient, + container_name: String, prefix_in_container: Option, max_keys_per_list_response: Option, concurrency_limiter: ConcurrencyLimiter, @@ -85,6 +86,7 @@ impl AzureBlobStorage { Ok(AzureBlobStorage { client, + container_name: azure_config.container_name.to_owned(), prefix_in_container: azure_config.prefix_in_container.to_owned(), max_keys_per_list_response, concurrency_limiter: ConcurrencyLimiter::new(azure_config.concurrency_limit.get()), @@ -238,6 +240,10 @@ impl AzureBlobStorage { _ = cancel.cancelled() => Err(Cancelled), } } + + pub fn container_name(&self) -> &str { + &self.container_name + } } fn to_azure_metadata(metadata: StorageMetadata) -> Metadata { diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 3381c4296f..3ee7d15a76 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -504,6 +504,16 @@ impl GenericRemoteStorage { None => self.download(from, cancel).await, } } + + /// The name of the bucket/container/etc. + pub fn bucket_name(&self) -> Option<&str> { + match self { + Self::LocalFs(_s) => None, + Self::AwsS3(s) => Some(s.bucket_name()), + Self::AzureBlob(s) => Some(s.container_name()), + Self::Unreliable(_s) => None, + } + } } /// Extra set of key-value pairs that contain arbitrary metadata about the storage entry. diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index b65d8b7e9e..056646a01e 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -386,6 +386,10 @@ impl S3Bucket { } Ok(()) } + + pub fn bucket_name(&self) -> &str { + &self.bucket_name + } } pin_project_lite::pin_project! 
{ diff --git a/storage_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs index 0450851988..c7e21d7e26 100644 --- a/storage_scrubber/src/garbage.rs +++ b/storage_scrubber/src/garbage.rs @@ -8,21 +8,19 @@ use std::{ }; use anyhow::Context; -use aws_sdk_s3::{ - types::{Delete, ObjectIdentifier}, - Client, -}; use futures_util::TryStreamExt; use pageserver_api::shard::TenantShardId; +use remote_storage::{GenericRemoteStorage, ListingMode, RemotePath}; use serde::{Deserialize, Serialize}; use tokio_stream::StreamExt; +use tokio_util::sync::CancellationToken; use utils::id::TenantId; use crate::{ cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData}, - init_remote, - metadata_stream::{stream_listing, stream_tenant_timelines, stream_tenants}, - BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId, TraversingDepth, + init_remote, init_remote_generic, + metadata_stream::{stream_tenant_timelines, stream_tenants}, + BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth, }; #[derive(Serialize, Deserialize, Debug)] @@ -324,41 +322,45 @@ impl std::fmt::Display for PurgeMode { } pub async fn get_tenant_objects( - s3_client: &Arc, - target: RootTarget, + s3_client: &GenericRemoteStorage, tenant_shard_id: TenantShardId, -) -> anyhow::Result> { +) -> anyhow::Result> { tracing::debug!("Listing objects in tenant {tenant_shard_id}"); + let tenant_root = super::remote_tenant_path(&tenant_shard_id); + // TODO: apply extra validation based on object modification time. Don't purge // tenants where any timeline's index_part.json has been touched recently. - let mut tenant_root = target.tenant_root(&tenant_shard_id); - - // Remove delimiter, so that object listing lists all keys in the prefix and not just - // common prefixes. - tenant_root.delimiter = String::new(); - - let key_stream = stream_listing(s3_client, &tenant_root); - key_stream.try_collect().await + let list = s3_client + .list( + Some(&tenant_root), + ListingMode::NoDelimiter, + None, + &CancellationToken::new(), + ) + .await?; + Ok(list.keys) } pub async fn get_timeline_objects( - s3_client: &Arc, - target: RootTarget, + s3_client: &GenericRemoteStorage, ttid: TenantShardTimelineId, -) -> anyhow::Result> { +) -> anyhow::Result> { tracing::debug!("Listing objects in timeline {ttid}"); - let mut timeline_root = target.timeline_root(&ttid); + let timeline_root = super::remote_timeline_path_id(&ttid); // TODO: apply extra validation based on object modification time. Don't purge // timelines whose index_part.json has been touched recently. - // Remove delimiter, so that object listing lists all keys in the prefix and not just - // common prefixes. - timeline_root.delimiter = String::new(); - let key_stream = stream_listing(s3_client, &timeline_root); - - key_stream.try_collect().await + let list = s3_client + .list( + Some(&timeline_root), + ListingMode::NoDelimiter, + None, + &CancellationToken::new(), + ) + .await?; + Ok(list.keys) } const MAX_KEYS_PER_DELETE: usize = 1000; @@ -369,16 +371,17 @@ const MAX_KEYS_PER_DELETE: usize = 1000; /// MAX_KEYS_PER_DELETE keys are left. /// `num_deleted` returns number of deleted keys. 
async fn do_delete( - s3_client: &Arc, - bucket_name: &str, - keys: &mut Vec, + remote_client: &GenericRemoteStorage, + keys: &mut Vec, dry_run: bool, drain: bool, progress_tracker: &mut DeletionProgressTracker, ) -> anyhow::Result<()> { + let cancel = CancellationToken::new(); while (!keys.is_empty() && drain) || (keys.len() >= MAX_KEYS_PER_DELETE) { let request_keys = keys.split_off(keys.len() - (std::cmp::min(MAX_KEYS_PER_DELETE, keys.len()))); + let num_deleted = request_keys.len(); if dry_run { tracing::info!("Dry-run deletion of objects: "); @@ -386,14 +389,10 @@ async fn do_delete( tracing::info!(" {k:?}"); } } else { - let delete_request = s3_client - .delete_objects() - .bucket(bucket_name) - .delete(Delete::builder().set_objects(Some(request_keys)).build()?); - delete_request - .send() + remote_client + .delete_objects(&request_keys, &cancel) .await - .context("DeleteObjects request")?; + .context("deletetion request")?; progress_tracker.register(num_deleted); } } @@ -431,8 +430,13 @@ pub async fn purge_garbage( input_path ); - let (s3_client, target) = - init_remote(garbage_list.bucket_config.clone(), garbage_list.node_kind).await?; + let remote_client = + init_remote_generic(garbage_list.bucket_config.clone(), garbage_list.node_kind).await?; + + assert_eq!( + &garbage_list.bucket_config.bucket, + remote_client.bucket_name().unwrap() + ); // Sanity checks on the incoming list if garbage_list.active_tenant_count == 0 { @@ -464,16 +468,13 @@ pub async fn purge_garbage( let items = tokio_stream::iter(filtered_items.map(Ok)); let get_objects_results = items.map_ok(|i| { - let s3_client = s3_client.clone(); - let target = target.clone(); + let remote_client = remote_client.clone(); async move { match i.entity { GarbageEntity::Tenant(tenant_id) => { - get_tenant_objects(&s3_client, target, tenant_id).await - } - GarbageEntity::Timeline(ttid) => { - get_timeline_objects(&s3_client, target, ttid).await + get_tenant_objects(&remote_client, tenant_id).await } + GarbageEntity::Timeline(ttid) => get_timeline_objects(&remote_client, ttid).await, } } }); @@ -487,8 +488,7 @@ pub async fn purge_garbage( objects_to_delete.append(&mut object_list); if objects_to_delete.len() >= MAX_KEYS_PER_DELETE { do_delete( - &s3_client, - &garbage_list.bucket_config.bucket, + &remote_client, &mut objects_to_delete, dry_run, false, @@ -499,8 +499,7 @@ pub async fn purge_garbage( } do_delete( - &s3_client, - &garbage_list.bucket_config.bucket, + &remote_client, &mut objects_to_delete, dry_run, true, diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index a0b6d7ea30..5c64e7e459 100644 --- a/storage_scrubber/src/lib.rs +++ b/storage_scrubber/src/lib.rs @@ -22,9 +22,13 @@ use aws_sdk_s3::Client; use camino::{Utf8Path, Utf8PathBuf}; use clap::ValueEnum; +use pageserver::tenant::remote_timeline_client::{remote_tenant_path, remote_timeline_path}; use pageserver::tenant::TENANTS_SEGMENT_NAME; use pageserver_api::shard::TenantShardId; -use remote_storage::RemotePath; +use remote_storage::{ + GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config, + DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, +}; use reqwest::Url; use serde::{Deserialize, Serialize}; use tokio::io::AsyncReadExt; @@ -215,6 +219,10 @@ impl RootTarget { } } +pub fn remote_timeline_path_id(id: &TenantShardTimelineId) -> RemotePath { + remote_timeline_path(&id.tenant_shard_id, &id.timeline_id) +} + #[derive(Debug, Clone, Serialize, Deserialize)] 
#[serde(deny_unknown_fields)] pub struct BucketConfig { @@ -296,7 +304,7 @@ pub fn init_logging(file_name: &str) -> Option { } } -pub async fn init_s3_client(bucket_region: Region) -> Client { +async fn init_s3_client(bucket_region: Region) -> Client { let config = aws_config::defaults(aws_config::BehaviorVersion::v2024_03_28()) .region(bucket_region) .load() @@ -304,6 +312,13 @@ pub async fn init_s3_client(bucket_region: Region) -> Client { Client::new(&config) } +fn default_prefix_in_bucket(node_kind: NodeKind) -> &'static str { + match node_kind { + NodeKind::Pageserver => "pageserver/v1/", + NodeKind::Safekeeper => "wal/", + } +} + async fn init_remote( bucket_config: BucketConfig, node_kind: NodeKind, @@ -311,18 +326,17 @@ async fn init_remote( let bucket_region = Region::new(bucket_config.region); let delimiter = "/".to_string(); let s3_client = Arc::new(init_s3_client(bucket_region).await); + let default_prefix = default_prefix_in_bucket(node_kind).to_string(); let s3_root = match node_kind { NodeKind::Pageserver => RootTarget::Pageserver(S3Target { bucket_name: bucket_config.bucket, - prefix_in_bucket: bucket_config - .prefix_in_bucket - .unwrap_or("pageserver/v1".to_string()), + prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or(default_prefix), delimiter, }), NodeKind::Safekeeper => RootTarget::Safekeeper(S3Target { bucket_name: bucket_config.bucket, - prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or("wal/".to_string()), + prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or(default_prefix), delimiter, }), }; @@ -330,6 +344,31 @@ async fn init_remote( Ok((s3_client, s3_root)) } +async fn init_remote_generic( + bucket_config: BucketConfig, + node_kind: NodeKind, +) -> anyhow::Result { + let endpoint = env::var("AWS_ENDPOINT_URL").ok(); + let default_prefix = default_prefix_in_bucket(node_kind).to_string(); + let prefix_in_bucket = Some(bucket_config.prefix_in_bucket.unwrap_or(default_prefix)); + let storage = S3Config { + bucket_name: bucket_config.bucket, + bucket_region: bucket_config.region, + prefix_in_bucket, + endpoint, + concurrency_limit: DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT + .try_into() + .unwrap(), + max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, + upload_storage_class: None, + }; + let storage_config = RemoteStorageConfig { + storage: RemoteStorageKind::AwsS3(storage), + timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, + }; + GenericRemoteStorage::from_config(&storage_config).await +} + async fn list_objects_with_retries( s3_client: &Client, s3_target: &S3Target, From 595c450036782a1ab494b57880e88e20a453fd48 Mon Sep 17 00:00:00 2001 From: Yuchen Liang <70461588+yliang412@users.noreply.github.com> Date: Mon, 22 Jul 2024 09:53:33 -0400 Subject: [PATCH 1237/1571] fix(scrubber): more robust metadata consistency checks (#8344) Part of #8128. ## Problem Scrubber uses the `scan_metadata` command to flag metadata inconsistencies. To trust it at scale, we need to make sure the errors we emit are a reflection of real scenarios. One check performed in the scrubber is to see whether the layers listed in the latest `index_part.json` are present in the object listing. Currently, the scrubber does not robustly handle the case where objects are uploaded/deleted during the scan. ## Summary of changes **Condition for success:** An object in the index is (1) in the object listing we acquire from S3 or (2) found in a HeadObject request (new object). - Add `HeadObject` requests for the layers missing from the object listing. 
- Keep the order of first getting the object listing and then downloading the layers. - Update check to only consider shards with highest shard count. - Skip analyzing a timeline if `deleted_at` tombstone is marked in `index_part.json`. - Add new test to see if scrubber actually detect the metadata inconsistency. _Misc_ - A timeline with no ancestor should always have some layers. - Removed experimental histograms _Caveat_ - Ancestor layer is not cleaned until #8308 is implemented. If ancestor layers reference non-existing layers in the index, the scrubber will emit false positives. Signed-off-by: Yuchen Liang --- Cargo.lock | 11 -- libs/utils/src/shard.rs | 1 + pageserver/src/tenant/layer_map.rs | 11 +- pageserver/src/tenant/storage_layer/layer.rs | 2 +- .../src/tenant/storage_layer/layer_name.rs | 8 + pageserver/src/tenant/timeline.rs | 2 +- storage_scrubber/Cargo.toml | 1 - storage_scrubber/src/checks.rs | 73 +++++-- .../src/scan_pageserver_metadata.rs | 179 +++++++----------- test_runner/fixtures/common_types.py | 3 + test_runner/fixtures/remote_storage.py | 34 +++- test_runner/regress/test_storage_scrubber.py | 74 ++++++++ 12 files changed, 251 insertions(+), 148 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2505d4d3ed..b03bd57631 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2384,16 +2384,6 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6fe2267d4ed49bc07b63801559be28c718ea06c4738b7a03c94df7386d2cde46" -[[package]] -name = "histogram" -version = "0.7.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e673d137229619d5c2c8903b6ed5852b43636c0017ff2e66b1aafb8ccf04b80b" -dependencies = [ - "serde", - "thiserror", -] - [[package]] name = "hmac" version = "0.12.1" @@ -5847,7 +5837,6 @@ dependencies = [ "futures", "futures-util", "hex", - "histogram", "humantime", "itertools", "once_cell", diff --git a/libs/utils/src/shard.rs b/libs/utils/src/shard.rs index 4f9ac6bdb4..f6b430657e 100644 --- a/libs/utils/src/shard.rs +++ b/libs/utils/src/shard.rs @@ -49,6 +49,7 @@ pub struct TenantShardId { impl ShardCount { pub const MAX: Self = Self(u8::MAX); + pub const MIN: Self = Self(0); /// The internal value of a ShardCount may be zero, which means "1 shard, but use /// legacy format for TenantShardId that excludes the shard suffix", also known diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 2724a5cc07..72167d02ab 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -463,7 +463,7 @@ impl LayerMap { pub(self) fn insert_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) { // TODO: See #3869, resulting #4088, attempted fix and repro #4094 - if Self::is_l0(&layer_desc) { + if Self::is_l0(&layer_desc.key_range) { self.l0_delta_layers.push(layer_desc.clone().into()); } @@ -482,7 +482,7 @@ impl LayerMap { self.historic .remove(historic_layer_coverage::LayerKey::from(layer_desc)); let layer_key = layer_desc.key(); - if Self::is_l0(layer_desc) { + if Self::is_l0(&layer_desc.key_range) { let len_before = self.l0_delta_layers.len(); let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers); l0_delta_layers.retain(|other| other.key() != layer_key); @@ -598,8 +598,9 @@ impl LayerMap { coverage } - pub fn is_l0(layer: &PersistentLayerDesc) -> bool { - layer.get_key_range() == (Key::MIN..Key::MAX) + /// Check if the key range resembles that of an L0 layer. 
+ pub fn is_l0(key_range: &Range) -> bool { + key_range == &(Key::MIN..Key::MAX) } /// This function determines which layers are counted in `count_deltas`: @@ -626,7 +627,7 @@ impl LayerMap { /// than just the current partition_range. pub fn is_reimage_worthy(layer: &PersistentLayerDesc, partition_range: &Range) -> bool { // Case 1 - if !Self::is_l0(layer) { + if !Self::is_l0(&layer.key_range) { return true; } diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index d9cbaba529..d1c9173f1c 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1298,7 +1298,7 @@ impl LayerInner { lsn_end: lsn_range.end, remote: !resident, access_stats, - l0: crate::tenant::layer_map::LayerMap::is_l0(self.layer_desc()), + l0: crate::tenant::layer_map::LayerMap::is_l0(&self.layer_desc().key_range), } } else { let lsn = self.desc.image_layer_lsn(); diff --git a/pageserver/src/tenant/storage_layer/layer_name.rs b/pageserver/src/tenant/storage_layer/layer_name.rs index da26e1eeb7..f33ca076ab 100644 --- a/pageserver/src/tenant/storage_layer/layer_name.rs +++ b/pageserver/src/tenant/storage_layer/layer_name.rs @@ -248,6 +248,14 @@ impl LayerName { Image(_) => "image", } } + + /// Gets the key range encoded in the layer name. + pub fn key_range(&self) -> &Range { + match &self { + LayerName::Image(layer) => &layer.key_range, + LayerName::Delta(layer) => &layer.key_range, + } + } } impl fmt::Display for LayerName { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 19b1396981..b312a1e43d 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4868,7 +4868,7 @@ impl Timeline { // for compact_level0_phase1 creating an L0, which does not happen in practice // because we have not implemented L0 => L0 compaction. 
duplicated_layers.insert(l.layer_desc().key()); - } else if LayerMap::is_l0(l.layer_desc()) { + } else if LayerMap::is_l0(&l.layer_desc().key_range) { bail!("compaction generates a L0 layer file as output, which will cause infinite compaction."); } else { insert_layers.push(l.clone()); diff --git a/storage_scrubber/Cargo.toml b/storage_scrubber/Cargo.toml index 5233afbebe..7d5b7d10b9 100644 --- a/storage_scrubber/Cargo.toml +++ b/storage_scrubber/Cargo.toml @@ -49,6 +49,5 @@ tracing.workspace = true tracing-subscriber.workspace = true clap.workspace = true tracing-appender = "0.2" -histogram = "0.7" futures.workspace = true diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index f687b24320..421a848f67 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -2,6 +2,7 @@ use std::collections::{HashMap, HashSet}; use anyhow::Context; use aws_sdk_s3::Client; +use pageserver::tenant::layer_map::LayerMap; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use pageserver_api::shard::ShardIndex; use tracing::{error, info, warn}; @@ -12,7 +13,7 @@ use crate::cloud_admin_api::BranchData; use crate::metadata_stream::stream_listing; use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId}; use futures_util::StreamExt; -use pageserver::tenant::remote_timeline_client::parse_remote_index_path; +use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path}; use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::IndexPart; use remote_storage::RemotePath; @@ -41,7 +42,9 @@ impl TimelineAnalysis { } } -pub(crate) fn branch_cleanup_and_check_errors( +pub(crate) async fn branch_cleanup_and_check_errors( + s3_client: &Client, + target: &RootTarget, id: &TenantShardTimelineId, tenant_objects: &mut TenantObjectListing, s3_active_branch: Option<&BranchData>, @@ -85,15 +88,17 @@ pub(crate) fn branch_cleanup_and_check_errors( } if &index_part.version() != IndexPart::KNOWN_VERSIONS.last().unwrap() { - result.warnings.push(format!( + info!( "index_part.json version is not latest: {}", index_part.version() - )) + ); } if index_part.metadata.disk_consistent_lsn() != index_part.duplicated_disk_consistent_lsn() { + // Tech debt: let's get rid of one of these, they are redundant + // https://github.com/neondatabase/neon/issues/8343 result.errors.push(format!( "Mismatching disk_consistent_lsn in TimelineMetadata ({}) and in the index_part ({})", index_part.metadata.disk_consistent_lsn(), @@ -102,8 +107,16 @@ pub(crate) fn branch_cleanup_and_check_errors( } if index_part.layer_metadata.is_empty() { - // not an error, can happen for branches with zero writes, but notice that - info!("index_part.json has no layers"); + if index_part.metadata.ancestor_timeline().is_none() { + // The initial timeline with no ancestor should ALWAYS have layers. + result.errors.push( + "index_part.json has no layers (ancestor_timeline=None)" + .to_string(), + ); + } else { + // Not an error, can happen for branches with zero writes, but notice that + info!("index_part.json has no layers (ancestor_timeline exists)"); + } } for (layer, metadata) in index_part.layer_metadata { @@ -114,16 +127,41 @@ pub(crate) fn branch_cleanup_and_check_errors( } if !tenant_objects.check_ref(id.timeline_id, &layer, &metadata) { - // FIXME: this will emit false positives if an index was - // uploaded concurrently with our scan. 
To make this check - // correct, we need to try sending a HEAD request for the - // layer we think is missing. - result.errors.push(format!( - "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage", - layer, - metadata.generation.get_suffix(), - metadata.shard - )) + let path = remote_layer_path( + &id.tenant_shard_id.tenant_id, + &id.timeline_id, + metadata.shard, + &layer, + metadata.generation, + ); + + // HEAD request used here to address a race condition when an index was uploaded concurrently + // with our scan. We check if the object is uploaded to S3 after taking the listing snapshot. + let response = s3_client + .head_object() + .bucket(target.bucket_name()) + .key(path.get_path().as_str()) + .send() + .await; + + if response.is_err() { + // Object is not present. + let is_l0 = LayerMap::is_l0(layer.key_range()); + + let msg = format!( + "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {})", + layer, + metadata.generation.get_suffix(), + metadata.shard, + is_l0, + ); + + if is_l0 { + result.warnings.push(msg); + } else { + result.errors.push(msg); + } + } } } } @@ -303,6 +341,9 @@ pub(crate) async fn list_timeline_blobs( tracing::debug!("initdb archive {key}"); initdb_archive = true; } + Some("initdb-preserved.tar.zst") => { + tracing::info!("initdb archive preserved {key}"); + } Some(maybe_layer_name) => match parse_layer_object_name(maybe_layer_name) { Ok((new_layer, gen)) => { tracing::debug!("Parsed layer key: {} {:?}", new_layer, gen); diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs index df4f29acf7..fbd60f93bb 100644 --- a/storage_scrubber/src/scan_pageserver_metadata.rs +++ b/storage_scrubber/src/scan_pageserver_metadata.rs @@ -8,12 +8,11 @@ use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId}; use aws_sdk_s3::Client; use futures_util::{StreamExt, TryStreamExt}; -use histogram::Histogram; use pageserver::tenant::remote_timeline_client::remote_layer_path; -use pageserver::tenant::IndexPart; use pageserver_api::shard::TenantShardId; use serde::Serialize; use utils::id::TenantId; +use utils::shard::ShardCount; #[derive(Serialize)] pub struct MetadataSummary { @@ -24,66 +23,6 @@ pub struct MetadataSummary { with_warnings: HashSet, with_orphans: HashSet, indices_by_version: HashMap, - - layer_count: MinMaxHisto, - timeline_size_bytes: MinMaxHisto, - layer_size_bytes: MinMaxHisto, -} - -/// A histogram plus minimum and maximum tracking -#[derive(Serialize)] -struct MinMaxHisto { - #[serde(skip)] - histo: Histogram, - min: u64, - max: u64, -} - -impl MinMaxHisto { - fn new() -> Self { - Self { - histo: histogram::Histogram::builder() - .build() - .expect("Bad histogram params"), - min: u64::MAX, - max: 0, - } - } - - fn sample(&mut self, v: u64) -> Result<(), histogram::Error> { - self.min = std::cmp::min(self.min, v); - self.max = std::cmp::max(self.max, v); - let r = self.histo.increment(v, 1); - - if r.is_err() { - tracing::warn!("Bad histogram sample: {v}"); - } - - r - } - - fn oneline(&self) -> String { - let percentiles = match self.histo.percentiles(&[1.0, 10.0, 50.0, 90.0, 99.0]) { - Ok(p) => p, - Err(e) => return format!("No data: {}", e), - }; - - let percentiles: Vec = percentiles - .iter() - .map(|p| p.bucket().low() + p.bucket().high() / 2) - .collect(); - - format!( - "min {}, 1% {}, 10% {}, 50% {}, 90% {}, 99% {}, 
max {}", - self.min, - percentiles[0], - percentiles[1], - percentiles[2], - percentiles[3], - percentiles[4], - self.max, - ) - } } impl MetadataSummary { @@ -96,25 +35,9 @@ impl MetadataSummary { with_warnings: HashSet::new(), with_orphans: HashSet::new(), indices_by_version: HashMap::new(), - layer_count: MinMaxHisto::new(), - timeline_size_bytes: MinMaxHisto::new(), - layer_size_bytes: MinMaxHisto::new(), } } - fn update_histograms(&mut self, index_part: &IndexPart) -> Result<(), histogram::Error> { - self.layer_count - .sample(index_part.layer_metadata.len() as u64)?; - let mut total_size: u64 = 0; - for meta in index_part.layer_metadata.values() { - total_size += meta.file_size; - self.layer_size_bytes.sample(meta.file_size)?; - } - self.timeline_size_bytes.sample(total_size)?; - - Ok(()) - } - fn update_data(&mut self, data: &S3TimelineBlobData) { self.timeline_shard_count += 1; if let BlobDataParseResult::Parsed { @@ -127,14 +50,6 @@ impl MetadataSummary { .indices_by_version .entry(index_part.version()) .or_insert(0) += 1; - - if let Err(e) = self.update_histograms(index_part) { - // Value out of range? Warn that the results are untrustworthy - tracing::warn!( - "Error updating histograms, summary stats may be wrong: {}", - e - ); - } } } @@ -169,9 +84,6 @@ With errors: {} With warnings: {} With orphan layers: {} Index versions: {version_summary} -Timeline size bytes: {} -Layer size bytes: {} -Timeline layer count: {} ", self.tenant_count, self.timeline_count, @@ -179,9 +91,6 @@ Timeline layer count: {} self.with_errors.len(), self.with_warnings.len(), self.with_orphans.len(), - self.timeline_size_bytes.oneline(), - self.layer_size_bytes.oneline(), - self.layer_count.oneline(), ) } @@ -235,33 +144,60 @@ pub async fn scan_metadata( let mut tenant_objects = TenantObjectListing::default(); let mut tenant_timeline_results = Vec::new(); - fn analyze_tenant( + async fn analyze_tenant( + s3_client: &Client, + target: &RootTarget, tenant_id: TenantId, summary: &mut MetadataSummary, mut tenant_objects: TenantObjectListing, timelines: Vec<(TenantShardTimelineId, S3TimelineBlobData)>, + highest_shard_count: ShardCount, ) { summary.tenant_count += 1; let mut timeline_ids = HashSet::new(); let mut timeline_generations = HashMap::new(); for (ttid, data) in timelines { - timeline_ids.insert(ttid.timeline_id); - // Stash the generation of each timeline, for later use identifying orphan layers - if let BlobDataParseResult::Parsed { - index_part: _index_part, - index_part_generation, - s3_layers: _s3_layers, - } = &data.blob_data - { - timeline_generations.insert(ttid, *index_part_generation); - } + if ttid.tenant_shard_id.shard_count == highest_shard_count { + // Only analyze `TenantShardId`s with highest shard count. - // Apply checks to this timeline shard's metadata, and in the process update `tenant_objects` - // reference counts for layers across the tenant. - let analysis = - branch_cleanup_and_check_errors(&ttid, &mut tenant_objects, None, None, Some(data)); - summary.update_analysis(&ttid, &analysis); + // Stash the generation of each timeline, for later use identifying orphan layers + if let BlobDataParseResult::Parsed { + index_part, + index_part_generation, + s3_layers: _s3_layers, + } = &data.blob_data + { + if index_part.deleted_at.is_some() { + // skip deleted timeline. 
+ tracing::info!("Skip analysis of {} b/c timeline is already deleted", ttid); + continue; + } + timeline_generations.insert(ttid, *index_part_generation); + } + + // Apply checks to this timeline shard's metadata, and in the process update `tenant_objects` + // reference counts for layers across the tenant. + let analysis = branch_cleanup_and_check_errors( + s3_client, + target, + &ttid, + &mut tenant_objects, + None, + None, + Some(data), + ) + .await; + summary.update_analysis(&ttid, &analysis); + + timeline_ids.insert(ttid.timeline_id); + } else { + tracing::info!( + "Skip analysis of {} b/c a lower shard count than {}", + ttid, + highest_shard_count.0, + ); + } } summary.timeline_count += timeline_ids.len(); @@ -309,18 +245,35 @@ pub async fn scan_metadata( // all results for the same tenant will be adjacent. We accumulate these, // and then call `analyze_tenant` to flush, when we see the next tenant ID. let mut summary = MetadataSummary::new(); + let mut highest_shard_count = ShardCount::MIN; while let Some(i) = timelines.next().await { let (ttid, data) = i?; summary.update_data(&data); match tenant_id { - None => tenant_id = Some(ttid.tenant_shard_id.tenant_id), + None => { + tenant_id = Some(ttid.tenant_shard_id.tenant_id); + highest_shard_count = highest_shard_count.max(ttid.tenant_shard_id.shard_count); + } Some(prev_tenant_id) => { if prev_tenant_id != ttid.tenant_shard_id.tenant_id { + // New tenant: analyze this tenant's timelines, clear accumulated tenant_timeline_results let tenant_objects = std::mem::take(&mut tenant_objects); let timelines = std::mem::take(&mut tenant_timeline_results); - analyze_tenant(prev_tenant_id, &mut summary, tenant_objects, timelines); + analyze_tenant( + &s3_client, + &target, + prev_tenant_id, + &mut summary, + tenant_objects, + timelines, + highest_shard_count, + ) + .await; tenant_id = Some(ttid.tenant_shard_id.tenant_id); + highest_shard_count = ttid.tenant_shard_id.shard_count; + } else { + highest_shard_count = highest_shard_count.max(ttid.tenant_shard_id.shard_count); } } } @@ -338,11 +291,15 @@ pub async fn scan_metadata( if !tenant_timeline_results.is_empty() { analyze_tenant( + &s3_client, + &target, tenant_id.expect("Must be set if results are present"), &mut summary, tenant_objects, tenant_timeline_results, - ); + highest_shard_count, + ) + .await; } Ok(summary) diff --git a/test_runner/fixtures/common_types.py b/test_runner/fixtures/common_types.py index 147264762c..b63dfd4e47 100644 --- a/test_runner/fixtures/common_types.py +++ b/test_runner/fixtures/common_types.py @@ -143,6 +143,9 @@ class TimelineId(Id): def __repr__(self) -> str: return f'TimelineId("{self.id.hex()}")' + def __str__(self) -> str: + return self.id.hex() + # Workaround for compat with python 3.9, which does not have `typing.Self` TTenantShardId = TypeVar("TTenantShardId", bound="TenantShardId") diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index 6f6526d3fc..0f2a997b1e 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -12,8 +12,9 @@ import boto3 import toml from mypy_boto3_s3 import S3Client -from fixtures.common_types import TenantId, TimelineId +from fixtures.common_types import TenantId, TenantShardId, TimelineId from fixtures.log_helper import log +from fixtures.pageserver.common_types import IndexPartDump TIMELINE_INDEX_PART_FILE_NAME = "index_part.json" TENANT_HEATMAP_FILE_NAME = "heatmap-v1.json" @@ -265,9 +266,38 @@ class S3Storage: def tenants_path(self) -> 
str: return f"{self.prefix_in_bucket}/tenants" - def tenant_path(self, tenant_id: TenantId) -> str: + def tenant_path(self, tenant_id: Union[TenantShardId, TenantId]) -> str: return f"{self.tenants_path()}/{tenant_id}" + def timeline_path( + self, tenant_id: Union[TenantShardId, TenantId], timeline_id: TimelineId + ) -> str: + return f"{self.tenant_path(tenant_id)}/timelines/{timeline_id}" + + def get_latest_index_key(self, index_keys: List[str]) -> str: + """ + Gets the latest index file key. + + @param index_keys: A list of index keys of different generations. + """ + + def parse_gen(index_key: str) -> int: + parts = index_key.split("index_part.json-") + return int(parts[-1], base=16) if len(parts) == 2 else -1 + + return max(index_keys, key=parse_gen) + + def download_index_part(self, index_key: str) -> IndexPartDump: + """ + Downloads the index content from remote storage. + + @param index_key: index key in remote storage. + """ + response = self.client.get_object(Bucket=self.bucket_name, Key=index_key) + body = response["Body"].read().decode("utf-8") + log.info(f"index_part.json: {body}") + return IndexPartDump.from_json(json.loads(body)) + def heatmap_key(self, tenant_id: TenantId) -> str: return f"{self.tenant_path(tenant_id)}/{TENANT_HEATMAP_FILE_NAME}" diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 635690fc7f..a9f12f09b7 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -1,4 +1,5 @@ import os +import pprint import shutil import threading import time @@ -373,3 +374,76 @@ def test_scrubber_physical_gc_ancestors_split(neon_env_builder: NeonEnvBuilder): assert gc_output["ancestor_layers_deleted"] > 0 assert gc_output["remote_storage_errors"] == 0 assert gc_output["controller_api_errors"] == 0 + + +@pytest.mark.parametrize("shard_count", [None, 4]) +def test_scrubber_scan_pageserver_metadata( + neon_env_builder: NeonEnvBuilder, shard_count: Optional[int] +): + """ + Create some layers. Delete an object listed in index. Run scrubber and see if it detects the defect. + """ + + # Use s3_storage so we could test out scrubber. + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.num_pageservers = shard_count if shard_count is not None else 1 + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + + # Create some layers. + + workload = Workload(env, env.initial_tenant, env.initial_timeline) + workload.init() + + for _ in range(3): + workload.write_rows(128) + + for pageserver in env.pageservers: + pageserver.stop() + pageserver.start() + + for _ in range(3): + workload.write_rows(128) + + # Get the latest index for a particular timeline. 
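+    # Index uploads are keyed as "index_part.json-<generation>" with a hex generation suffix;
+    # the key with the highest generation is the one currently in use.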
+ + tenant_shard_id = TenantShardId(env.initial_tenant, 0, shard_count if shard_count else 0) + + assert isinstance(env.pageserver_remote_storage, S3Storage) + timeline_path = env.pageserver_remote_storage.timeline_path( + tenant_shard_id, env.initial_timeline + ) + + client = env.pageserver_remote_storage.client + bucket = env.pageserver_remote_storage.bucket_name + objects = client.list_objects_v2(Bucket=bucket, Prefix=f"{timeline_path}/", Delimiter="").get( + "Contents", [] + ) + keys = [obj["Key"] for obj in objects] + index_keys = list(filter(lambda s: s.startswith(f"{timeline_path}/index_part"), keys)) + assert len(index_keys) > 0 + + latest_index_key = env.pageserver_remote_storage.get_latest_index_key(index_keys) + log.info(f"{latest_index_key=}") + + index = env.pageserver_remote_storage.download_index_part(latest_index_key) + + assert len(index.layer_metadata) > 0 + it = iter(index.layer_metadata.items()) + + scan_summary = env.storage_scrubber.scan_metadata() + assert not scan_summary["with_warnings"] + assert not scan_summary["with_errors"] + + # Delete a layer file that is listed in the index. + layer, metadata = next(it) + log.info(f"Deleting {timeline_path}/{layer.to_str()}") + delete_response = client.delete_object( + Bucket=bucket, + Key=f"{timeline_path}/{layer.to_str()}-{metadata.generation:08x}", + ) + log.info(f"delete response: {delete_response}") + + # Check scan summary. Expect it to be a L0 layer so only emit warnings. + scan_summary = env.storage_scrubber.scan_metadata() + log.info(f"{pprint.pformat(scan_summary)}") + assert len(scan_summary["with_warnings"]) > 0 From 631a9c372fe3aba8625ed3fa976b5ff4655bc702 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." Date: Mon, 22 Jul 2024 09:59:02 -0400 Subject: [PATCH 1238/1571] fix(docs): clearify the admin URL and token used in scrubber (#8441) We were not clear about which token and admin URL to use for this tool. Signed-off-by: Alex Chi Z --- storage_scrubber/README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/storage_scrubber/README.md b/storage_scrubber/README.md index 0930f343ec..5b137950d9 100644 --- a/storage_scrubber/README.md +++ b/storage_scrubber/README.md @@ -45,7 +45,11 @@ processing by the `purge-garbage` subcommand. Example: -`env AWS_PROFILE=dev REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- find-garbage --node-kind=pageserver --depth=tenant --output-path=eu-west-1-garbage.json` +`env AWS_PROFILE=dev REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=[client_key] CLOUD_ADMIN_API_URL=[url] cargo run --release -- find-garbage --node-kind=pageserver --depth=tenant --output-path=eu-west-1-garbage.json` + +Note that `CLOUD_ADMIN_API_TOKEN` can be obtained from https://console-stage.neon.build/app/settings/api-keys (for staging) or https://console.neon.tech/app/settings/api-keys for production. This is not the control plane admin JWT key. The env var name is confusing. Though anyone can generate that API key, you still need admin permission in order to access all projects in the region. + +And note that `CLOUD_ADMIN_API_URL` should include the region in the admin URL due to the control plane / console split. For example, `https://console-stage.neon.build/regions/aws-us-east-2/api/v1/admin` for the staging us-east-2 region. 
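+
+For example, a `find-garbage` run against the staging us-east-2 region might look like this (the bucket name is a placeholder):
+
+`env AWS_PROFILE=dev REGION=us-east-2 BUCKET=my-staging-bucket CLOUD_ADMIN_API_TOKEN=[console_api_key] CLOUD_ADMIN_API_URL=https://console-stage.neon.build/regions/aws-us-east-2/api/v1/admin cargo run --release -- find-garbage --node-kind=pageserver --depth=tenant --output-path=us-east-2-garbage.json`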
#### `purge-garbage` From e8523014d46c4a491c9a36d686285ac2eef2d6ef Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 22 Jul 2024 17:25:06 +0200 Subject: [PATCH 1239/1571] refactor(pageserver) remove `task_mgr` for most global tasks (#8449) ## Motivation & Context We want to move away from `task_mgr` towards explicit tracking of child tasks. This PR is extracted from https://github.com/neondatabase/neon/pull/8339 where I refactor `PageRequestHandler` to not depend on task_mgr anymore. ## Changes This PR refactors all global tasks but `PageRequestHandler` to use some combination of `JoinHandle`/`JoinSet` + `CancellationToken`. The `task_mgr::spawn(.., shutdown_process_on_error)` functionality is preserved through the new `exit_on_panic_or_error` wrapper. Some global tasks were not using it before, but as of this PR, they are. The rationale is that all global tasks are relevant for correct operation of the overall Neon system in one way or another. ## Future Work After #8339, we can make `task_mgr::spawn` require a `TenantId` instead of an `Option` which concludes this step of cleanup work and will help discourage future usage of task_mgr for global tasks. --- pageserver/src/bin/pageserver.rs | 162 ++++++++---------- pageserver/src/consumption_metrics.rs | 100 +++++++---- pageserver/src/disk_usage_eviction_task.rs | 41 +++-- pageserver/src/lib.rs | 60 ++++++- pageserver/src/page_service.rs | 1 - pageserver/src/task_mgr.rs | 118 ++++++------- pageserver/src/tenant.rs | 1 - pageserver/src/tenant/mgr.rs | 129 +++++++++++--- .../src/tenant/remote_timeline_client.rs | 1 - pageserver/src/tenant/secondary.rs | 104 ++++++----- pageserver/src/tenant/tasks.rs | 3 - pageserver/src/tenant/timeline.rs | 4 - pageserver/src/tenant/timeline/delete.rs | 1 - .../src/tenant/timeline/eviction_task.rs | 1 - 14 files changed, 429 insertions(+), 297 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index ec1ceb54ce..978b81d498 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -18,9 +18,13 @@ use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_evicti use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING}; use pageserver::task_mgr::WALRECEIVER_RUNTIME; use pageserver::tenant::{secondary, TenantSharedResources}; +use pageserver::{ + CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, LibpqEndpointListener, +}; use remote_storage::GenericRemoteStorage; use tokio::signal::unix::SignalKind; use tokio::time::Instant; +use tokio_util::sync::CancellationToken; use tracing::*; use metrics::set_build_info_metric; @@ -430,8 +434,10 @@ fn start_pageserver( // Scan the local 'tenants/' directory and start loading the tenants let deletion_queue_client = deletion_queue.new_client(); + let background_purges = mgr::BackgroundPurges::default(); let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr( conf, + background_purges.clone(), TenantSharedResources { broker_client: broker_client.clone(), remote_storage: remote_storage.clone(), @@ -523,7 +529,7 @@ fn start_pageserver( } }); - let secondary_controller = secondary::spawn_tasks( + let (secondary_controller, secondary_controller_tasks) = secondary::spawn_tasks( tenant_manager.clone(), remote_storage.clone(), background_jobs_barrier.clone(), @@ -536,18 +542,19 @@ fn start_pageserver( // been configured. 
let disk_usage_eviction_state: Arc = Arc::default(); - launch_disk_usage_global_eviction_task( + let disk_usage_eviction_task = launch_disk_usage_global_eviction_task( conf, remote_storage.clone(), disk_usage_eviction_state.clone(), tenant_manager.clone(), background_jobs_barrier.clone(), - )?; + ); // Start up the service to handle HTTP mgmt API request. We created the // listener earlier already. - { - let _rt_guard = MGMT_REQUEST_RUNTIME.enter(); + let http_endpoint_listener = { + let _rt_guard = MGMT_REQUEST_RUNTIME.enter(); // for hyper + let cancel = CancellationToken::new(); let router_state = Arc::new( http::routes::State::new( @@ -568,77 +575,44 @@ fn start_pageserver( let service = utils::http::RouterService::new(router).unwrap(); let server = hyper::Server::from_tcp(http_listener)? .serve(service) - .with_graceful_shutdown(task_mgr::shutdown_watcher()); + .with_graceful_shutdown({ + let cancel = cancel.clone(); + async move { cancel.clone().cancelled().await } + }); - task_mgr::spawn( - MGMT_REQUEST_RUNTIME.handle(), - TaskKind::HttpEndpointListener, - None, - None, + let task = MGMT_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( "http endpoint listener", - true, - async { - server.await?; - Ok(()) - }, - ); - } + server, + )); + HttpEndpointListener(CancellableTask { task, cancel }) + }; - if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint { - let metrics_ctx = RequestContext::todo_child( - TaskKind::MetricsCollection, - // This task itself shouldn't download anything. - // The actual size calculation does need downloads, and - // creates a child context with the right DownloadBehavior. - DownloadBehavior::Error, - ); + let consumption_metrics_tasks = { + let cancel = shutdown_pageserver.child_token(); + let task = crate::BACKGROUND_RUNTIME.spawn({ + let tenant_manager = tenant_manager.clone(); + let cancel = cancel.clone(); + async move { + // first wait until background jobs are cleared to launch. + // + // this is because we only process active tenants and timelines, and the + // Timeline::get_current_logical_size will spawn the logical size calculation, + // which will not be rate-limited. + tokio::select! { + _ = cancel.cancelled() => { return; }, + _ = background_jobs_barrier.wait() => {} + }; - let local_disk_storage = conf.workdir.join("last_consumption_metrics.json"); - - task_mgr::spawn( - crate::BACKGROUND_RUNTIME.handle(), - TaskKind::MetricsCollection, - None, - None, - "consumption metrics collection", - true, - { - let tenant_manager = tenant_manager.clone(); - async move { - // first wait until background jobs are cleared to launch. - // - // this is because we only process active tenants and timelines, and the - // Timeline::get_current_logical_size will spawn the logical size calculation, - // which will not be rate-limited. - let cancel = task_mgr::shutdown_token(); - - tokio::select! 
{ - _ = cancel.cancelled() => { return Ok(()); }, - _ = background_jobs_barrier.wait() => {} - }; - - pageserver::consumption_metrics::collect_metrics( - tenant_manager, - metric_collection_endpoint, - &conf.metric_collection_bucket, - conf.metric_collection_interval, - conf.synthetic_size_calculation_interval, - conf.id, - local_disk_storage, - cancel, - metrics_ctx, - ) - .instrument(info_span!("metrics_collection")) - .await?; - Ok(()) - } - }, - ); - } + pageserver::consumption_metrics::run(conf, tenant_manager, cancel).await; + } + }); + ConsumptionMetricsTasks(CancellableTask { task, cancel }) + }; // Spawn a task to listen for libpq connections. It will spawn further tasks // for each connection. We created the listener earlier already. - { + let libpq_listener = { + let cancel = CancellationToken::new(); let libpq_ctx = RequestContext::todo_child( TaskKind::LibpqEndpointListener, // listener task shouldn't need to download anything. (We will @@ -647,29 +621,20 @@ fn start_pageserver( // accept connections.) DownloadBehavior::Error, ); - task_mgr::spawn( - COMPUTE_REQUEST_RUNTIME.handle(), - TaskKind::LibpqEndpointListener, - None, - None, - "libpq endpoint listener", - true, - { - let tenant_manager = tenant_manager.clone(); - async move { - page_service::libpq_listener_main( - tenant_manager, - pg_auth, - pageserver_listener, - conf.pg_auth_type, - libpq_ctx, - task_mgr::shutdown_token(), - ) - .await - } - }, - ); - } + + let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( + "libpq listener", + page_service::libpq_listener_main( + tenant_manager.clone(), + pg_auth, + pageserver_listener, + conf.pg_auth_type, + libpq_ctx, + cancel.clone(), + ), + )); + LibpqEndpointListener(CancellableTask { task, cancel }) + }; let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard()); @@ -695,7 +660,18 @@ fn start_pageserver( // Right now that tree doesn't reach very far, and `task_mgr` is used instead. // The plan is to change that over time. shutdown_pageserver.take(); - pageserver::shutdown_pageserver(&tenant_manager, deletion_queue.clone(), 0).await; + pageserver::shutdown_pageserver( + http_endpoint_listener, + libpq_listener, + consumption_metrics_tasks, + disk_usage_eviction_task, + &tenant_manager, + background_purges, + deletion_queue.clone(), + secondary_controller_tasks, + 0, + ) + .await; unreachable!() }) } diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index 9104da6072..f94d945d46 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -1,5 +1,6 @@ //! Periodically collect consumption metrics for all active tenants //! and push them to a HTTP endpoint. +use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; use crate::tenant::size::CalculateSyntheticSizeError; @@ -39,49 +40,74 @@ type RawMetric = (MetricsKey, (EventType, u64)); /// for deduplication, but that is no longer needed. type Cache = HashMap; +pub async fn run( + conf: &'static PageServerConf, + tenant_manager: Arc, + cancel: CancellationToken, +) { + let Some(metric_collection_endpoint) = conf.metric_collection_endpoint.as_ref() else { + return; + }; + + let local_disk_storage = conf.workdir.join("last_consumption_metrics.json"); + + let metrics_ctx = RequestContext::todo_child( + TaskKind::MetricsCollection, + // This task itself shouldn't download anything. 
+ // The actual size calculation does need downloads, and + // creates a child context with the right DownloadBehavior. + DownloadBehavior::Error, + ); + let collect_metrics = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( + "consumption metrics collection", + collect_metrics( + tenant_manager.clone(), + metric_collection_endpoint, + &conf.metric_collection_bucket, + conf.metric_collection_interval, + conf.id, + local_disk_storage, + cancel.clone(), + metrics_ctx, + ) + .instrument(info_span!("metrics_collection")), + )); + + let worker_ctx = + RequestContext::todo_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download); + let synthetic_size_worker = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( + "synthetic size calculation", + calculate_synthetic_size_worker( + tenant_manager.clone(), + conf.synthetic_size_calculation_interval, + cancel.clone(), + worker_ctx, + ) + .instrument(info_span!("synthetic_size_worker")), + )); + + let (collect_metrics, synthetic_size_worker) = + futures::future::join(collect_metrics, synthetic_size_worker).await; + collect_metrics + .expect("unreachable: exit_on_panic_or_error would catch the panic and exit the process"); + synthetic_size_worker + .expect("unreachable: exit_on_panic_or_error would catch the panic and exit the process"); +} + /// Main thread that serves metrics collection #[allow(clippy::too_many_arguments)] -pub async fn collect_metrics( +async fn collect_metrics( tenant_manager: Arc, metric_collection_endpoint: &Url, metric_collection_bucket: &Option, metric_collection_interval: Duration, - synthetic_size_calculation_interval: Duration, node_id: NodeId, local_disk_storage: Utf8PathBuf, cancel: CancellationToken, ctx: RequestContext, ) -> anyhow::Result<()> { - // spin up background worker that caclulates tenant sizes - let worker_ctx = - ctx.detached_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download); - task_mgr::spawn( - BACKGROUND_RUNTIME.handle(), - TaskKind::CalculateSyntheticSize, - None, - None, - "synthetic size calculation", - false, - { - let tenant_manager = tenant_manager.clone(); - async move { - calculate_synthetic_size_worker( - tenant_manager, - synthetic_size_calculation_interval, - &cancel, - &worker_ctx, - ) - .instrument(info_span!("synthetic_size_worker")) - .await?; - Ok(()) - } - }, - ); - let path: Arc = Arc::new(local_disk_storage); - let cancel = task_mgr::shutdown_token(); - let restore_and_reschedule = restore_and_reschedule(&path, metric_collection_interval); let mut cached_metrics = tokio::select! { @@ -168,11 +194,9 @@ pub async fn collect_metrics( BackgroundLoopKind::ConsumptionMetricsCollectMetrics, ); - let res = tokio::time::timeout_at( - started_at + metric_collection_interval, - task_mgr::shutdown_token().cancelled(), - ) - .await; + let res = + tokio::time::timeout_at(started_at + metric_collection_interval, cancel.cancelled()) + .await; if res.is_ok() { return Ok(()); } @@ -272,8 +296,8 @@ async fn reschedule( async fn calculate_synthetic_size_worker( tenant_manager: Arc, synthetic_size_calculation_interval: Duration, - cancel: &CancellationToken, - ctx: &RequestContext, + cancel: CancellationToken, + ctx: RequestContext, ) -> anyhow::Result<()> { info!("starting calculate_synthetic_size_worker"); scopeguard::defer! 
{ @@ -313,7 +337,7 @@ async fn calculate_synthetic_size_worker( // there is never any reason to exit calculate_synthetic_size_worker following any // return value -- we don't need to care about shutdown because no tenant is found when // pageserver is shut down. - calculate_and_log(&tenant, cancel, ctx).await; + calculate_and_log(&tenant, &cancel, &ctx).await; } crate::tenant::tasks::warn_when_period_overrun( diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 103e549d22..92dcf6ee61 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -59,13 +59,14 @@ use utils::{completion, id::TimelineId}; use crate::{ config::PageServerConf, metrics::disk_usage_based_eviction::METRICS, - task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, + task_mgr::{self, BACKGROUND_RUNTIME}, tenant::{ mgr::TenantManager, remote_timeline_client::LayerFileMetadata, secondary::SecondaryTenant, storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName}, }, + CancellableTask, DiskUsageEvictionTask, }; #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] @@ -186,36 +187,34 @@ pub fn launch_disk_usage_global_eviction_task( state: Arc, tenant_manager: Arc, background_jobs_barrier: completion::Barrier, -) -> anyhow::Result<()> { +) -> Option { let Some(task_config) = &conf.disk_usage_based_eviction else { info!("disk usage based eviction task not configured"); - return Ok(()); + return None; }; info!("launching disk usage based eviction task"); - task_mgr::spawn( - BACKGROUND_RUNTIME.handle(), - TaskKind::DiskUsageEviction, - None, - None, + let cancel = CancellationToken::new(); + let task = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( "disk usage based eviction", - false, - async move { - let cancel = task_mgr::shutdown_token(); + { + let cancel = cancel.clone(); + async move { + // wait until initial load is complete, because we cannot evict from loading tenants. + tokio::select! { + _ = cancel.cancelled() => { return anyhow::Ok(()); }, + _ = background_jobs_barrier.wait() => { } + }; - // wait until initial load is complete, because we cannot evict from loading tenants. - tokio::select! 
{ - _ = cancel.cancelled() => { return Ok(()); }, - _ = background_jobs_barrier.wait() => { } - }; - - disk_usage_eviction_task(&state, task_config, &storage, tenant_manager, cancel).await; - Ok(()) + disk_usage_eviction_task(&state, task_config, &storage, tenant_manager, cancel) + .await; + anyhow::Ok(()) + } }, - ); + )); - Ok(()) + Some(DiskUsageEvictionTask(CancellableTask { cancel, task })) } #[instrument(skip_all)] diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 63c677574f..d944019641 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -13,6 +13,7 @@ pub mod http; pub mod import_datadir; pub mod l0_flush; pub use pageserver_api::keyspace; +use tokio_util::sync::CancellationToken; pub mod aux_file; pub mod metrics; pub mod page_cache; @@ -32,7 +33,10 @@ pub mod walredo; use crate::task_mgr::TaskKind; use camino::Utf8Path; use deletion_queue::DeletionQueue; -use tenant::mgr::TenantManager; +use tenant::{ + mgr::{BackgroundPurges, TenantManager}, + secondary, +}; use tracing::info; /// Current storage format version @@ -54,17 +58,39 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]); pub use crate::metrics::preinitialize_metrics; +pub struct CancellableTask { + pub task: tokio::task::JoinHandle<()>, + pub cancel: CancellationToken, +} +pub struct HttpEndpointListener(pub CancellableTask); +pub struct LibpqEndpointListener(pub CancellableTask); +pub struct ConsumptionMetricsTasks(pub CancellableTask); +pub struct DiskUsageEvictionTask(pub CancellableTask); +impl CancellableTask { + pub async fn shutdown(self) { + self.cancel.cancel(); + self.task.await.unwrap(); + } +} + #[tracing::instrument(skip_all, fields(%exit_code))] +#[allow(clippy::too_many_arguments)] pub async fn shutdown_pageserver( + http_listener: HttpEndpointListener, + libpq_listener: LibpqEndpointListener, + consumption_metrics_worker: ConsumptionMetricsTasks, + disk_usage_eviction_task: Option, tenant_manager: &TenantManager, + background_purges: BackgroundPurges, mut deletion_queue: DeletionQueue, + secondary_controller_tasks: secondary::GlobalTasks, exit_code: i32, ) { use std::time::Duration; // Shut down the libpq endpoint task. This prevents new connections from // being accepted. timed( - task_mgr::shutdown_tasks(Some(TaskKind::LibpqEndpointListener), None, None), + libpq_listener.0.shutdown(), "shutdown LibpqEndpointListener", Duration::from_secs(1), ) @@ -91,16 +117,44 @@ pub async fn shutdown_pageserver( // Best effort to persist any outstanding deletions, to avoid leaking objects deletion_queue.shutdown(Duration::from_secs(5)).await; + timed( + consumption_metrics_worker.0.shutdown(), + "shutdown consumption metrics", + Duration::from_secs(1), + ) + .await; + + timed( + futures::future::OptionFuture::from(disk_usage_eviction_task.map(|t| t.0.shutdown())), + "shutdown disk usage eviction", + Duration::from_secs(1), + ) + .await; + + timed( + background_purges.shutdown(), + "shutdown background purges", + Duration::from_secs(1), + ) + .await; + // Shut down the HTTP endpoint last, so that you can still check the server's // status while it's shutting down. // FIXME: We should probably stop accepting commands like attach/detach earlier. 
timed( - task_mgr::shutdown_tasks(Some(TaskKind::HttpEndpointListener), None, None), + http_listener.0.shutdown(), "shutdown http", Duration::from_secs(1), ) .await; + timed( + secondary_controller_tasks.wait(), // cancellation happened in caller + "secondary controller wait", + Duration::from_secs(1), + ) + .await; + // There should be nothing left, but let's be sure timed( task_mgr::shutdown_tasks(None, None, None), diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 00147a8ca6..6353f713e0 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -124,7 +124,6 @@ pub async fn libpq_listener_main( None, None, "serving compute connection task", - false, page_service_conn_main( tenant_manager.clone(), local_auth, diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 5f46ce3d69..5cd78874c1 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -408,7 +408,6 @@ pub fn spawn( tenant_shard_id: Option, timeline_id: Option, name: &str, - shutdown_process_on_error: bool, future: F, ) -> PageserverTaskId where @@ -437,7 +436,6 @@ where task_id, task_cloned, cancel, - shutdown_process_on_error, future, )); task_mut.join_handle = Some(join_handle); @@ -454,82 +452,78 @@ async fn task_wrapper( task_id: u64, task: Arc, shutdown_token: CancellationToken, - shutdown_process_on_error: bool, future: F, ) where F: Future> + Send + 'static, { debug!("Starting task '{}'", task_name); - let result = SHUTDOWN_TOKEN - .scope( - shutdown_token, - CURRENT_TASK.scope(task, { - // We use AssertUnwindSafe here so that the payload function - // doesn't need to be UnwindSafe. We don't do anything after the - // unwinding that would expose us to unwind-unsafe behavior. - AssertUnwindSafe(future).catch_unwind() - }), - ) - .await; - task_finish(result, task_name, task_id, shutdown_process_on_error).await; -} - -async fn task_finish( - result: std::result::Result< - anyhow::Result<()>, - std::boxed::Box, - >, - task_name: String, - task_id: u64, - shutdown_process_on_error: bool, -) { - // Remove our entry from the global hashmap. - let task = TASKS - .lock() - .unwrap() - .remove(&task_id) - .expect("no task in registry"); - - let mut shutdown_process = false; - { + // wrap the future so we log panics and errors + let tenant_shard_id = task.tenant_shard_id; + let timeline_id = task.timeline_id; + let fut = async move { + // We use AssertUnwindSafe here so that the payload function + // doesn't need to be UnwindSafe. We don't do anything after the + // unwinding that would expose us to unwind-unsafe behavior. 
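+        // Unlike the old `shutdown_process_on_error = true` mode, a failure here is only
+        // logged: per-tenant/timeline tasks no longer bring down the whole pageserver.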
+ let result = AssertUnwindSafe(future).catch_unwind().await; match result { Ok(Ok(())) => { debug!("Task '{}' exited normally", task_name); } Ok(Err(err)) => { - if shutdown_process_on_error { - error!( - "Shutting down: task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}", - task_name, task.tenant_shard_id, task.timeline_id, err - ); - shutdown_process = true; - } else { - error!( - "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}", - task_name, task.tenant_shard_id, task.timeline_id, err - ); - } + error!( + "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}", + task_name, tenant_shard_id, timeline_id, err + ); } Err(err) => { - if shutdown_process_on_error { - error!( - "Shutting down: task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}", - task_name, task.tenant_shard_id, task.timeline_id, err - ); - shutdown_process = true; - } else { - error!( - "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}", - task_name, task.tenant_shard_id, task.timeline_id, err - ); - } + error!( + "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}", + task_name, tenant_shard_id, timeline_id, err + ); } } - } + }; - if shutdown_process { - std::process::exit(1); + // add the task-locals + let fut = CURRENT_TASK.scope(task, fut); + let fut = SHUTDOWN_TOKEN.scope(shutdown_token, fut); + + // poll future to completion + fut.await; + + // Remove our entry from the global hashmap. + TASKS + .lock() + .unwrap() + .remove(&task_id) + .expect("no task in registry"); +} + +pub async fn exit_on_panic_or_error( + task_name: &'static str, + future: impl Future>, +) -> T +where + E: std::fmt::Debug, +{ + // We use AssertUnwindSafe here so that the payload function + // doesn't need to be UnwindSafe. We don't do anything after the + // unwinding that would expose us to unwind-unsafe behavior. 
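+    // Any error or panic caught here is fatal for the whole pageserver process, preserving
+    // the old `task_mgr::spawn(.., shutdown_process_on_error = true)` semantics for global tasks.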
+ let result = AssertUnwindSafe(future).catch_unwind().await; + match result { + Ok(Ok(val)) => val, + Ok(Err(err)) => { + error!( + task_name, + "Task exited with error, exiting process: {err:?}" + ); + std::process::exit(1); + } + Err(panic_obj) => { + error!(task_name, "Task panicked, exiting process: {panic_obj:?}"); + std::process::exit(1); + } } } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 6d59752606..7c6464dab3 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -721,7 +721,6 @@ impl Tenant { Some(tenant_shard_id), None, "attach tenant", - false, async move { info!( diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 4912608677..f23e6ff9d6 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -36,7 +36,7 @@ use crate::control_plane_client::{ use crate::deletion_queue::DeletionQueueClient; use crate::http::routes::ACTIVE_TENANT_TIMEOUT; use crate::metrics::{TENANT, TENANT_MANAGER as METRICS}; -use crate::task_mgr::{self, TaskKind}; +use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; use crate::tenant::config::{ AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig, }; @@ -225,26 +225,98 @@ async fn safe_rename_tenant_dir(path: impl AsRef) -> std::io::Result>); +enum BackgroundPurgesInner { + Open(tokio::task::JoinSet<()>), + // we use the async mutex for coalescing + ShuttingDown(Arc>>), +} - task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), - TaskKind::MgmtRequest, - task_tenant_id, - None, - "tenant_files_delete", - false, - async move { - fs::remove_dir_all(tmp_path.as_path()) - .await - .with_context(|| format!("tenant directory {:?} deletion", tmp_path)) - }, - ); +impl Default for BackgroundPurges { + fn default() -> Self { + Self(Arc::new(std::sync::Mutex::new( + BackgroundPurgesInner::Open(JoinSet::new()), + ))) + } +} + +impl BackgroundPurges { + /// When we have moved a tenant's content to a temporary directory, we may delete it lazily in + /// the background, and thereby avoid blocking any API requests on this deletion completing. + /// + /// Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory. + /// Thus the [`BackgroundPurges`] type to keep track of these tasks. + pub fn spawn(&self, tmp_path: Utf8PathBuf) { + let mut guard = self.0.lock().unwrap(); + let jset = match &mut *guard { + BackgroundPurgesInner::Open(ref mut jset) => jset, + BackgroundPurgesInner::ShuttingDown(_) => { + warn!("trying to spawn background purge during shutdown, ignoring"); + return; + } + }; + jset.spawn_on( + async move { + if let Err(error) = fs::remove_dir_all(tmp_path.as_path()).await { + // should we fatal_io_error here? + warn!(%error, path=%tmp_path, "failed to purge tenant directory"); + } + } + .instrument(info_span!(parent: None, "background_purge")), + BACKGROUND_RUNTIME.handle(), + ); + } + + /// When this future completes, all background purges have completed. + /// The first poll of the future will already lock out new background purges spawned via [`Self::spawn`]. + /// + /// Concurrent calls will coalesce. + /// + /// # Cancellation-Safety + /// + /// If this future is dropped before polled to completion, concurrent and subsequent + /// instances of this future will continue to be correct. 
+ #[instrument(skip_all)] + pub async fn shutdown(&self) { + let jset = { + let mut guard = self.0.lock().unwrap(); + match &mut *guard { + BackgroundPurgesInner::Open(jset) => { + *guard = BackgroundPurgesInner::ShuttingDown(Arc::new(tokio::sync::Mutex::new( + std::mem::take(jset), + ))) + } + BackgroundPurgesInner::ShuttingDown(_) => { + // calling shutdown multiple times is most likely a bug in pageserver shutdown code + warn!("already shutting down"); + } + }; + match &mut *guard { + BackgroundPurgesInner::ShuttingDown(ref mut jset) => jset.clone(), + BackgroundPurgesInner::Open(_) => { + unreachable!("above code transitions into shut down state"); + } + } + }; + let mut jset = jset.lock().await; // concurrent callers coalesce here + while let Some(res) = jset.join_next().await { + match res { + Ok(()) => {} + Err(e) if e.is_panic() => { + // If it panicked, the error is already logged by the panic hook. + } + Err(e) if e.is_cancelled() => { + unreachable!("we don't cancel the joinset or runtime") + } + Err(e) => { + // No idea when this can happen, but let's log it. + warn!(%e, "background purge task failed or panicked"); + } + } + } + } } static TENANTS: Lazy> = @@ -270,6 +342,8 @@ pub struct TenantManager { // tenants have their own cancellation tokens, which we fire individually in [`Self::shutdown`], or // when the tenant detaches. cancel: CancellationToken, + + background_purges: BackgroundPurges, } fn emergency_generations( @@ -447,6 +521,7 @@ pub(crate) enum DeleteTenantError { #[instrument(skip_all)] pub async fn init_tenant_mgr( conf: &'static PageServerConf, + background_purges: BackgroundPurges, resources: TenantSharedResources, init_order: InitializationOrder, cancel: CancellationToken, @@ -512,7 +587,7 @@ pub async fn init_tenant_mgr( match safe_rename_tenant_dir(&tenant_dir_path).await { Ok(tmp_path) => { - spawn_background_purge(tmp_path); + background_purges.spawn(tmp_path); } Err(e) => { error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), @@ -634,6 +709,7 @@ pub async fn init_tenant_mgr( tenants: &TENANTS, resources, cancel: CancellationToken::new(), + background_purges, }) } @@ -1353,6 +1429,7 @@ impl TenantManager { async fn delete_local( conf: &PageServerConf, + background_purges: &BackgroundPurges, tenant_shard_id: &TenantShardId, ) -> anyhow::Result<()> { let local_tenant_directory = conf.tenant_path(tenant_shard_id); @@ -1361,7 +1438,7 @@ impl TenantManager { .with_context(|| { format!("local tenant directory {local_tenant_directory:?} rename") })?; - spawn_background_purge(tmp_dir); + background_purges.spawn(tmp_dir); Ok(()) } @@ -1379,12 +1456,12 @@ impl TenantManager { barrier.wait().await; } } - delete_local(self.conf, &tenant_shard_id).await?; + delete_local(self.conf, &self.background_purges, &tenant_shard_id).await?; } Some(TenantSlot::Secondary(secondary_tenant)) => { secondary_tenant.shutdown().await; - delete_local(self.conf, &tenant_shard_id).await?; + delete_local(self.conf, &self.background_purges, &tenant_shard_id).await?; } Some(TenantSlot::InProgress(_)) => unreachable!(), None => {} @@ -1655,7 +1732,7 @@ impl TenantManager { let tmp_path = safe_rename_tenant_dir(&local_tenant_directory) .await .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?; - spawn_background_purge(tmp_path); + self.background_purges.spawn(tmp_path); fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!( "failpoint" @@ -1831,7 +1908,7 @@ impl TenantManager { let tmp_path = self 
.detach_tenant0(conf, tenant_shard_id, deletion_queue_client) .await?; - spawn_background_purge(tmp_path); + self.background_purges.spawn(tmp_path); Ok(()) } diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index bb42fbeebf..c75d1eaa5e 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1525,7 +1525,6 @@ impl RemoteTimelineClient { Some(self.tenant_shard_id), Some(self.timeline_id), "remote upload", - false, async move { self_rc.perform_upload_task(task).await; Ok(()) diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index a233d11c4a..3132a28b12 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -31,6 +31,7 @@ use pageserver_api::{ }; use remote_storage::GenericRemoteStorage; +use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::instrument; use utils::{completion::Barrier, id::TimelineId, sync::gate::Gate}; @@ -293,15 +294,50 @@ impl SecondaryController { } } +pub struct GlobalTasks { + cancel: CancellationToken, + uploader: JoinHandle<()>, + downloader: JoinHandle<()>, +} + +impl GlobalTasks { + /// Caller is responsible for requesting shutdown via the cancellation token that was + /// passed to [`spawn_tasks`]. + /// + /// # Panics + /// + /// This method panics if that token is not cancelled. + /// This is low-risk because we're calling this during process shutdown, so, a panic + /// will be informative but not cause undue downtime. + pub async fn wait(self) { + let Self { + cancel, + uploader, + downloader, + } = self; + assert!( + cancel.is_cancelled(), + "must cancel cancellation token, otherwise the tasks will not shut down" + ); + + let (uploader, downloader) = futures::future::join(uploader, downloader).await; + uploader.expect( + "unreachable: exit_on_panic_or_error would catch the panic and exit the process", + ); + downloader.expect( + "unreachable: exit_on_panic_or_error would catch the panic and exit the process", + ); + } +} + pub fn spawn_tasks( tenant_manager: Arc, remote_storage: GenericRemoteStorage, background_jobs_can_start: Barrier, cancel: CancellationToken, -) -> SecondaryController { +) -> (SecondaryController, GlobalTasks) { let mgr_clone = tenant_manager.clone(); let storage_clone = remote_storage.clone(); - let cancel_clone = cancel.clone(); let bg_jobs_clone = background_jobs_can_start.clone(); let (download_req_tx, download_req_rx) = @@ -309,17 +345,9 @@ pub fn spawn_tasks( let (upload_req_tx, upload_req_rx) = tokio::sync::mpsc::channel::>(16); - let downloader_task_ctx = RequestContext::new( - TaskKind::SecondaryDownloads, - crate::context::DownloadBehavior::Download, - ); - task_mgr::spawn( - BACKGROUND_RUNTIME.handle(), - downloader_task_ctx.task_kind(), - None, - None, + let cancel_clone = cancel.clone(); + let downloader = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( "secondary tenant downloads", - false, async move { downloader_task( mgr_clone, @@ -327,49 +355,41 @@ pub fn spawn_tasks( download_req_rx, bg_jobs_clone, cancel_clone, - downloader_task_ctx, + RequestContext::new( + TaskKind::SecondaryDownloads, + crate::context::DownloadBehavior::Download, + ), ) .await; - - Ok(()) + anyhow::Ok(()) }, - ); + )); - task_mgr::spawn( - BACKGROUND_RUNTIME.handle(), - TaskKind::SecondaryUploads, - None, - None, + let cancel_clone = cancel.clone(); + let uploader = 
BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( "heatmap uploads", - false, async move { heatmap_uploader_task( tenant_manager, remote_storage, upload_req_rx, background_jobs_can_start, - cancel, + cancel_clone, ) .await; - - Ok(()) + anyhow::Ok(()) }, - ); + )); - SecondaryController { - download_req_tx, - upload_req_tx, - } -} - -/// For running with remote storage disabled: a SecondaryController that is connected to nothing. -pub fn null_controller() -> SecondaryController { - let (download_req_tx, _download_req_rx) = - tokio::sync::mpsc::channel::>(16); - let (upload_req_tx, _upload_req_rx) = - tokio::sync::mpsc::channel::>(16); - SecondaryController { - upload_req_tx, - download_req_tx, - } + ( + SecondaryController { + upload_req_tx, + download_req_tx, + }, + GlobalTasks { + cancel, + uploader, + downloader, + }, + ) } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index d679b78f32..7f59e54eb7 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -101,7 +101,6 @@ pub fn start_background_loops( Some(tenant_shard_id), None, &format!("compactor for tenant {tenant_shard_id}"), - false, { let tenant = Arc::clone(tenant); let background_jobs_can_start = background_jobs_can_start.cloned(); @@ -125,7 +124,6 @@ pub fn start_background_loops( Some(tenant_shard_id), None, &format!("garbage collector for tenant {tenant_shard_id}"), - false, { let tenant = Arc::clone(tenant); let background_jobs_can_start = background_jobs_can_start.cloned(); @@ -149,7 +147,6 @@ pub fn start_background_loops( Some(tenant_shard_id), None, &format!("ingest housekeeping for tenant {tenant_shard_id}"), - false, { let tenant = Arc::clone(tenant); let background_jobs_can_start = background_jobs_can_start.cloned(); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b312a1e43d..8866e1af5c 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2477,7 +2477,6 @@ impl Timeline { Some(self.tenant_shard_id), Some(self.timeline_id), "layer flush task", - false, async move { let _guard = guard; let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error); @@ -2822,7 +2821,6 @@ impl Timeline { Some(self.tenant_shard_id), Some(self.timeline_id), "initial size calculation", - false, // NB: don't log errors here, task_mgr will do that. 
async move { let cancel = task_mgr::shutdown_token(); @@ -2991,7 +2989,6 @@ impl Timeline { Some(self.tenant_shard_id), Some(self.timeline_id), "ondemand logical size calculation", - false, async move { let res = self_clone .logical_size_calculation_task(lsn, cause, &ctx) @@ -5435,7 +5432,6 @@ impl Timeline { Some(self.tenant_shard_id), Some(self.timeline_id), "download all remote layers task", - false, async move { self_clone.download_all_remote_layers(request).await; let mut status_guard = self_clone.download_all_remote_layers_task_info.write().unwrap(); diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index d32945d9e4..02124ad852 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -391,7 +391,6 @@ impl DeleteTimelineFlow { Some(tenant_shard_id), Some(timeline_id), "timeline_delete", - false, async move { if let Err(err) = Self::background(guard, conf, &tenant, &timeline).await { error!("Error: {err:#}"); diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 8a8c38d0ce..972ac48cda 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -65,7 +65,6 @@ impl Timeline { "layer eviction for {}/{}", self.tenant_shard_id, self.timeline_id ), - false, async move { tokio::select! { _ = self_clone.cancel.cancelled() => { return Ok(()); } From 6237322a2ee57dffed0ef300749dd7e155a1ecd6 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 22 Jul 2024 17:32:25 +0200 Subject: [PATCH 1240/1571] build: mark `target/` and `pg_install/` with `CACHEDIR.TAG` (#8448) Backup tools such as `tar` and `restic` recognize this. More info: https://bford.info/cachedir/ NB: cargo _should_ create the tag file in the `target/` directory but doesn't if the directory already exists, which happens frequently if rust-analyzer is launched by your IDE before you can type `cargo build`. Hence, create the file manually here. => https://github.com/rust-lang/cargo/issues/14281 --- Makefile | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 942867d81a..de298303e3 100644 --- a/Makefile +++ b/Makefile @@ -69,6 +69,8 @@ CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1 # Set PQ_LIB_DIR to make sure `storage_controller` get linked with bundled libpq (through diesel) CARGO_CMD_PREFIX += PQ_LIB_DIR=$(POSTGRES_INSTALL_DIR)/v16/lib +CACHEDIR_TAG_CONTENTS := "Signature: 8a477f597d28d172789f06886806bc55" + # # Top level Makefile to build Neon and PostgreSQL # @@ -79,15 +81,24 @@ all: neon postgres neon-pg-ext # # The 'postgres_ffi' depends on the Postgres headers. .PHONY: neon -neon: postgres-headers walproposer-lib +neon: postgres-headers walproposer-lib cargo-target-dir +@echo "Compiling Neon" $(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS) +.PHONY: cargo-target-dir +cargo-target-dir: + # https://github.com/rust-lang/cargo/issues/14281 + mkdir -p target + test -e target/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > target/CACHEDIR.TAG ### PostgreSQL parts # Some rules are duplicated for Postgres v14 and 15. We may want to refactor # to avoid the duplication in the future, but it's tolerable for now. 
# $(POSTGRES_INSTALL_DIR)/build/%/config.status: + + mkdir -p $(POSTGRES_INSTALL_DIR) + test -e $(POSTGRES_INSTALL_DIR)/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > $(POSTGRES_INSTALL_DIR)/CACHEDIR.TAG + +@echo "Configuring Postgres $* build" @test -s $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure || { \ echo "\nPostgres submodule not found in $(ROOT_PROJECT_DIR)/vendor/postgres-$*/, execute "; \ From f17fe75169029dc4f2cd778d02d3aac57aae2341 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 22 Jul 2024 18:39:18 +0200 Subject: [PATCH 1241/1571] Mark body of archival_config endpoint as required (#8458) As pointed out in https://github.com/neondatabase/neon/pull/8414#discussion_r1684881525 Part of https://github.com/neondatabase/neon/issues/8088 --- pageserver/src/http/openapi_spec.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 087d281a0c..59e646d0ca 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -414,7 +414,7 @@ paths: Either archives or unarchives the given timeline. An archived timeline may not have any non-archived children. requestBody: - required: false + required: true content: application/json: schema: From a868e342d48704e5a3ca3c4a8178d9b46242aaa9 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 17 Jul 2024 10:04:46 +0300 Subject: [PATCH 1242/1571] Change default version of Neon extensio to 1.4 --- pgxn/neon/neon.control | 2 +- test_runner/regress/test_lfc_working_set_approximation.py | 2 +- test_runner/regress/test_neon_extension.py | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pgxn/neon/neon.control b/pgxn/neon/neon.control index cee2f336f2..03bdb9a0b4 100644 --- a/pgxn/neon/neon.control +++ b/pgxn/neon/neon.control @@ -1,6 +1,6 @@ # neon extension comment = 'cloud storage for PostgreSQL' -default_version = '1.3' +default_version = '1.4' module_pathname = '$libdir/neon' relocatable = true trusted = true diff --git a/test_runner/regress/test_lfc_working_set_approximation.py b/test_runner/regress/test_lfc_working_set_approximation.py index 6465bdfd21..4c53e4e2fd 100644 --- a/test_runner/regress/test_lfc_working_set_approximation.py +++ b/test_runner/regress/test_lfc_working_set_approximation.py @@ -89,7 +89,7 @@ def test_sliding_working_set_approximation(neon_simple_env: NeonEnv): ) conn = endpoint.connect() cur = conn.cursor() - cur.execute("create extension neon version '1.4'") + cur.execute("create extension neon") cur.execute( "create table t(pk integer primary key, count integer default 0, payload text default repeat('?', 128))" ) diff --git a/test_runner/regress/test_neon_extension.py b/test_runner/regress/test_neon_extension.py index e83aaf91c6..bb844244e3 100644 --- a/test_runner/regress/test_neon_extension.py +++ b/test_runner/regress/test_neon_extension.py @@ -24,7 +24,7 @@ def test_neon_extension(neon_env_builder: NeonEnvBuilder): # IMPORTANT: # If the version has changed, the test should be updated. # Ensure that the default version is also updated in the neon.control file - assert cur.fetchone() == ("1.3",) + assert cur.fetchone() == ("1.4",) cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE") res = cur.fetchall() log.info(res) @@ -48,10 +48,10 @@ def test_neon_extension_compatibility(neon_env_builder: NeonEnvBuilder): # IMPORTANT: # If the version has changed, the test should be updated. 
# Ensure that the default version is also updated in the neon.control file - assert cur.fetchone() == ("1.3",) + assert cur.fetchone() == ("1.4",) cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE") all_versions = ["1.4", "1.3", "1.2", "1.1", "1.0"] - current_version = "1.3" + current_version = "1.4" for idx, begin_version in enumerate(all_versions): for target_version in all_versions[idx + 1 :]: if current_version != begin_version: From d6753e9ee4925166cced51cd1b1426fbee75ccad Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Mon, 22 Jul 2024 11:28:08 -0700 Subject: [PATCH 1243/1571] vm-image: Expose new LFC working set size metrics (#8298) In general, replace: * 'lfc_approximate_working_set_size' with * 'lfc_approximate_working_set_size_windows' For the "main" metrics that are actually scraped and used internally, the old one is just marked as deprecated. For the "autoscaling" metrics, we're not currently using the old one, so we can get away with just replacing it. Also, for the user-visible metrics we'll only store & expose a few different time windows, to avoid making the UI overly busy or bloating our internal metrics storage. But for the autoscaling-related scraper, we aren't storing the metrics, and it's useful to be able to programmatically operate on the trendline of how WSS increases (or doesn't!) with window size. So there, we can just output datapoints for each minute. Part of neondatabase/autoscaling#872 See also https://www.notion.so/neondatabase/cca38138fadd45eaa753d81b859490c6 --- vm-image-spec.yaml | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 3c446ecdea..224e9847f3 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -236,6 +236,7 @@ files: query: | select sum(pg_database_size(datname)) as total from pg_database; + # DEPRECATED - metric_name: lfc_approximate_working_set_size type: gauge help: 'Approximate working set size in pages of 8192 bytes' @@ -244,6 +245,20 @@ files: query: | select neon.approximate_working_set_size(false) as approximate_working_set_size; + - metric_name: lfc_approximate_working_set_size_windows + type: gauge + help: 'Approximate working set size in pages of 8192 bytes' + key_labels: [duration] + values: [size] + # NOTE: This is the "public" / "human-readable" version. Here, we supply a small selection + # of durations in a pretty-printed form. + query: | + select + x as duration, + neon.approximate_working_set_size_seconds(extract('epoch' from x::interval)::int) as size + from + (values ('5m'),('15m'),('1h')) as t (x); + - metric_name: current_lsn type: gauge help: 'Current LSN of the database' @@ -377,13 +392,19 @@ files: query: | select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit; - - metric_name: lfc_approximate_working_set_size + - metric_name: lfc_approximate_working_set_size_windows type: gauge help: 'Approximate working set size in pages of 8192 bytes' - key_labels: - values: [approximate_working_set_size] + key_labels: [duration_seconds] + values: [size] + # NOTE: This is the "internal" / "machine-readable" version. This outputs the working set + # size looking back 1..60 minutes, labeled with the number of minutes. 
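+      # The query below emits one datapoint per window size, with `duration_seconds` taking the
+      # values 60, 120, ..., 3600 (i.e. 1..60 minutes expressed in seconds).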
query: | - select neon.approximate_working_set_size(false) as approximate_working_set_size; + select + x::text as duration_seconds, + neon.approximate_working_set_size_seconds(x) as size + from + (select generate_series * 60 as x from generate_series(1, 60)); build: | # Build cgroup-tools # From 3cd888f173d3326f2f3ef134a16877dee7dd8bb6 Mon Sep 17 00:00:00 2001 From: Yuchen Liang <70461588+yliang412@users.noreply.github.com> Date: Mon, 22 Jul 2024 15:02:25 -0400 Subject: [PATCH 1244/1571] fix(docs): remove incorrect flags for scrubber purge-garbage command (#8463) Scrubber purge-garbage command does not take `--node-kind` and `--depth`. --- storage_scrubber/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storage_scrubber/README.md b/storage_scrubber/README.md index 5b137950d9..9fbd92feef 100644 --- a/storage_scrubber/README.md +++ b/storage_scrubber/README.md @@ -65,7 +65,7 @@ to pass them on the command line Example: -`env AWS_PROFILE=dev cargo run --release -- purge-garbage --node-kind=pageserver --depth=tenant --input-path=eu-west-1-garbage.json` +`env AWS_PROFILE=dev cargo run --release -- purge-garbage --input-path=eu-west-1-garbage.json` Add the `--delete` argument before `purge-garbage` to enable deletion. This is intentionally not provided inline in the example above to avoid accidents. Without the `--delete` flag From 35854928d96ee43ea621df5158d958443e33a773 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 23 Jul 2024 11:41:12 +0100 Subject: [PATCH 1245/1571] pageserver: use identity file as node id authority and remove init command and config-override flags (#7766) Ansible will soon write the node id to `identity.toml` in the work dir for new pageservers. On the pageserver side, we read the node id from the identity file if it is present and use that as the source of truth. If the identity file is missing, cannot be read, or does not deserialise, start-up is aborted. This PR also removes the `--init` mode and the `--config-override` flag from the `pageserver` binary. The neon_local is already not using these flags anymore. Ansible still uses them until the linked change is merged & deployed, so, this PR has to land simultaneously or after the Ansible change due to that. Related Ansible change: https://github.com/neondatabase/aws/pull/1322 Cplane change to remove config-override usages: https://github.com/neondatabase/cloud/pull/13417 Closes: https://github.com/neondatabase/neon/issues/7736 Overall plan: https://www.notion.so/neondatabase/Rollout-Plan-simplified-pageserver-initialization-f935ae02b225444e8a41130b7d34e4ea?pvs=4 Co-authored-by: Christian Schwarz --- Dockerfile | 18 ++-- control_plane/src/pageserver.rs | 18 ++++ .../compute_wrapper/shell/compute.sh | 2 +- docker-compose/docker-compose.yml | 15 +-- .../pageserver_config/identity.toml | 1 + .../pageserver_config/pageserver.toml | 5 + pageserver/src/bin/pageserver.rs | 101 ++++++------------ pageserver/src/config.rs | 68 ++++++++---- test_runner/regress/test_pageserver_api.py | 64 ----------- 9 files changed, 117 insertions(+), 175 deletions(-) create mode 100644 docker-compose/pageserver_config/identity.toml create mode 100644 docker-compose/pageserver_config/pageserver.toml diff --git a/Dockerfile b/Dockerfile index a41598ef72..ace112cccf 100644 --- a/Dockerfile +++ b/Dockerfile @@ -93,13 +93,14 @@ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/ # By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config. 
# Now, when `docker run ... pageserver` is run, it can start without errors, yet will have some default dummy values. -RUN mkdir -p /data/.neon/ && chown -R neon:neon /data/.neon/ \ - && /usr/local/bin/pageserver -D /data/.neon/ --init \ - -c "id=1234" \ - -c "broker_endpoint='http://storage_broker:50051'" \ - -c "pg_distrib_dir='/usr/local/'" \ - -c "listen_pg_addr='0.0.0.0:6400'" \ - -c "listen_http_addr='0.0.0.0:9898'" +RUN mkdir -p /data/.neon/ && \ + echo "id=1234" > "/data/.neon/identity.toml" && \ + echo "broker_endpoint='http://storage_broker:50051'\n" \ + "pg_distrib_dir='/usr/local/'\n" \ + "listen_pg_addr='0.0.0.0:6400'\n" \ + "listen_http_addr='0.0.0.0:9898'\n" \ + > /data/.neon/pageserver.toml && \ + chown -R neon:neon /data/.neon # When running a binary that links with libpq, default to using our most recent postgres version. Binaries # that want a particular postgres version will select it explicitly: this is just a default. @@ -110,3 +111,6 @@ VOLUME ["/data"] USER neon EXPOSE 6400 EXPOSE 9898 + +CMD /usr/local/bin/pageserver -D /data/.neon + diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index e3d1d0e110..ba4f98d945 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -25,6 +25,7 @@ use pageserver_client::mgmt_api; use postgres_backend::AuthType; use postgres_connection::{parse_host_port, PgConnectionConfig}; use utils::auth::{Claims, Scope}; +use utils::id::NodeId; use utils::{ id::{TenantId, TimelineId}, lsn::Lsn, @@ -74,6 +75,10 @@ impl PageServerNode { } } + fn pageserver_make_identity_toml(&self, node_id: NodeId) -> toml_edit::Document { + toml_edit::Document::from_str(&format!("id={node_id}")).unwrap() + } + fn pageserver_init_make_toml( &self, conf: NeonLocalInitPageserverConf, @@ -186,6 +191,19 @@ impl PageServerNode { .write_all(config.to_string().as_bytes()) .context("write pageserver toml")?; drop(config_file); + + let identity_file_path = datadir.join("identity.toml"); + let mut identity_file = std::fs::OpenOptions::new() + .create_new(true) + .write(true) + .open(identity_file_path) + .with_context(|| format!("open identity toml for write: {config_file_path:?}"))?; + let identity_toml = self.pageserver_make_identity_toml(node_id); + identity_file + .write_all(identity_toml.to_string().as_bytes()) + .context("write identity toml")?; + drop(identity_toml); + // TODO: invoke a TBD config-check command to validate that pageserver will start with the written config // Write metadata file, used by pageserver on startup to register itself with diff --git a/docker-compose/compute_wrapper/shell/compute.sh b/docker-compose/compute_wrapper/shell/compute.sh index f646e36f59..33455e458a 100755 --- a/docker-compose/compute_wrapper/shell/compute.sh +++ b/docker-compose/compute_wrapper/shell/compute.sh @@ -33,7 +33,7 @@ echo $result | jq . 
generate_id timeline_id PARAMS=( - -sb + -sbf -X POST -H "Content-Type: application/json" -d "{\"new_timeline_id\": \"${timeline_id}\", \"pg_version\": ${PG_VERSION}}" diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml index 5503b6611a..6e15fdbe0d 100644 --- a/docker-compose/docker-compose.yml +++ b/docker-compose/docker-compose.yml @@ -31,25 +31,14 @@ services: restart: always image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest} environment: - - BROKER_ENDPOINT='http://storage_broker:50051' - AWS_ACCESS_KEY_ID=minio - AWS_SECRET_ACCESS_KEY=password #- RUST_BACKTRACE=1 ports: #- 6400:6400 # pg protocol handler - 9898:9898 # http endpoints - entrypoint: - - "/bin/sh" - - "-c" - command: - - "/usr/local/bin/pageserver -D /data/.neon/ - -c \"broker_endpoint=$$BROKER_ENDPOINT\" - -c \"listen_pg_addr='0.0.0.0:6400'\" - -c \"listen_http_addr='0.0.0.0:9898'\" - -c \"remote_storage={endpoint='http://minio:9000', - bucket_name='neon', - bucket_region='eu-north-1', - prefix_in_bucket='/pageserver/'}\"" + volumes: + - ./pageserver_config:/data/.neon/ depends_on: - storage_broker - minio_create_buckets diff --git a/docker-compose/pageserver_config/identity.toml b/docker-compose/pageserver_config/identity.toml new file mode 100644 index 0000000000..20121327c7 --- /dev/null +++ b/docker-compose/pageserver_config/identity.toml @@ -0,0 +1 @@ +id=1234 diff --git a/docker-compose/pageserver_config/pageserver.toml b/docker-compose/pageserver_config/pageserver.toml new file mode 100644 index 0000000000..76935453b6 --- /dev/null +++ b/docker-compose/pageserver_config/pageserver.toml @@ -0,0 +1,5 @@ +broker_endpoint='http://storage_broker:50051' +pg_distrib_dir='/usr/local/' +listen_pg_addr='0.0.0.0:6400' +listen_http_addr='0.0.0.0:9898' +remote_storage={ endpoint='http://minio:9000', bucket_name='neon', bucket_region='eu-north-1', prefix_in_bucket='/pageserver' } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 978b81d498..db27a77ec6 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -2,17 +2,18 @@ //! Main entry point for the Page Server executable. +use std::env; use std::env::{var, VarError}; use std::io::Read; use std::sync::Arc; use std::time::Duration; -use std::{env, ops::ControlFlow, str::FromStr}; use anyhow::{anyhow, Context}; use camino::Utf8Path; use clap::{Arg, ArgAction, Command}; use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp}; +use pageserver::config::PageserverIdentity; use pageserver::control_plane_client::ControlPlaneClient; use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task}; use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING}; @@ -29,7 +30,7 @@ use tracing::*; use metrics::set_build_info_metric; use pageserver::{ - config::{defaults::*, PageServerConf}, + config::PageServerConf, context::{DownloadBehavior, RequestContext}, deletion_queue::DeletionQueue, http, page_cache, page_service, task_mgr, @@ -88,18 +89,13 @@ fn main() -> anyhow::Result<()> { .with_context(|| format!("Error opening workdir '{workdir}'"))?; let cfg_file_path = workdir.join("pageserver.toml"); + let identity_file_path = workdir.join("identity.toml"); // Set CWD to workdir for non-daemon modes env::set_current_dir(&workdir) .with_context(|| format!("Failed to set application's current dir to '{workdir}'"))?; - let conf = match initialize_config(&cfg_file_path, arg_matches, &workdir)? 
{
-        ControlFlow::Continue(conf) => conf,
-        ControlFlow::Break(()) => {
-            info!("Pageserver config init successful");
-            return Ok(());
-        }
-    };
+    let conf = initialize_config(&identity_file_path, &cfg_file_path, &workdir)?;
 
     // Initialize logging.
     //
@@ -154,70 +150,55 @@ fn main() -> anyhow::Result<()> {
 }
 
 fn initialize_config(
+    identity_file_path: &Utf8Path,
     cfg_file_path: &Utf8Path,
-    arg_matches: clap::ArgMatches,
     workdir: &Utf8Path,
-) -> anyhow::Result<ControlFlow<(), &'static PageServerConf>> {
-    let init = arg_matches.get_flag("init");
-
-    let file_contents: Option<toml_edit::Document> = match std::fs::File::open(cfg_file_path) {
+) -> anyhow::Result<&'static PageServerConf> {
+    // The deployment orchestrator writes out an identity file containing the node id
+    // for all pageservers. This file is the source of truth for the node id. In order
+    // to allow for rolling back pageserver releases, the node id is also included in
+    // the pageserver config that the deployment orchestrator writes to disk for the pageserver.
+    // A rolled back version of the pageserver will get the node id from the pageserver.toml
+    // config file.
+    let identity = match std::fs::File::open(identity_file_path) {
        Ok(mut f) => {
-            if init {
-                anyhow::bail!("config file already exists: {cfg_file_path}");
+            let md = f.metadata().context("stat config file")?;
+            if !md.is_file() {
+                anyhow::bail!("Pageserver found identity file but it is a dir entry: {identity_file_path}. Aborting start up ...");
             }
+
+            let mut s = String::new();
+            f.read_to_string(&mut s).context("read identity file")?;
+            toml_edit::de::from_str::<PageserverIdentity>(&s)?
+        }
+        Err(e) => {
+            anyhow::bail!("Pageserver could not read identity file: {identity_file_path}: {e}. Aborting start up ...");
+        }
+    };
+
+    let config: toml_edit::Document = match std::fs::File::open(cfg_file_path) {
+        Ok(mut f) => {
             let md = f.metadata().context("stat config file")?;
             if md.is_file() {
                 let mut s = String::new();
                 f.read_to_string(&mut s).context("read config file")?;
-                Some(s.parse().context("parse config file toml")?)
+                s.parse().context("parse config file toml")?
} else { anyhow::bail!("directory entry exists but is not a file: {cfg_file_path}"); } } - Err(e) if e.kind() == std::io::ErrorKind::NotFound => None, Err(e) => { anyhow::bail!("open pageserver config: {e}: {cfg_file_path}"); } }; - let mut effective_config = file_contents.unwrap_or_else(|| { - DEFAULT_CONFIG_FILE - .parse() - .expect("unit tests ensure this works") - }); - - // Patch with overrides from the command line - if let Some(values) = arg_matches.get_many::("config-override") { - for option_line in values { - let doc = toml_edit::Document::from_str(option_line).with_context(|| { - format!("Option '{option_line}' could not be parsed as a toml document") - })?; - - for (key, item) in doc.iter() { - effective_config.insert(key, item.clone()); - } - } - } - - debug!("Resulting toml: {effective_config}"); + debug!("Using pageserver toml: {config}"); // Construct the runtime representation - let conf = PageServerConf::parse_and_validate(&effective_config, workdir) + let conf = PageServerConf::parse_and_validate(identity.id, &config, workdir) .context("Failed to parse pageserver configuration")?; - if init { - info!("Writing pageserver config to '{cfg_file_path}'"); - - std::fs::write(cfg_file_path, effective_config.to_string()) - .with_context(|| format!("Failed to write pageserver config to '{cfg_file_path}'"))?; - info!("Config successfully written to '{cfg_file_path}'") - } - - Ok(if init { - ControlFlow::Break(()) - } else { - ControlFlow::Continue(Box::leak(Box::new(conf))) - }) + Ok(Box::leak(Box::new(conf))) } struct WaitForPhaseResult { @@ -710,28 +691,12 @@ fn cli() -> Command { Command::new("Neon page server") .about("Materializes WAL stream to pages and serves them to the postgres") .version(version()) - .arg( - Arg::new("init") - .long("init") - .action(ArgAction::SetTrue) - .help("Initialize pageserver with all given config overrides"), - ) .arg( Arg::new("workdir") .short('D') .long("workdir") .help("Working directory for the pageserver"), ) - // See `settings.md` for more details on the extra configuration patameters pageserver can process - .arg( - Arg::new("config-override") - .long("config-override") - .short('c') - .num_args(1) - .action(ArgAction::Append) - .help("Additional configuration overrides of the ones from the toml config file (or new ones to add there). \ - Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"), - ) .arg( Arg::new("enabled-features") .long("enabled-features") diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 6a78d126cf..20e78b1d85 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -7,8 +7,8 @@ use anyhow::{anyhow, bail, ensure, Context, Result}; use pageserver_api::{models::ImageCompressionAlgorithm, shard::TenantShardId}; use remote_storage::{RemotePath, RemoteStorageConfig}; -use serde; use serde::de::IntoDeserializer; +use serde::{self, Deserialize}; use std::env; use storage_broker::Uri; use utils::crashsafe::path_with_suffix_extension; @@ -406,6 +406,13 @@ struct PageServerConfigBuilder { } impl PageServerConfigBuilder { + fn new(node_id: NodeId) -> Self { + let mut this = Self::default(); + this.id(node_id); + + this + } + #[inline(always)] fn default_values() -> Self { use self::BuilderValue::*; @@ -881,8 +888,12 @@ impl PageServerConf { /// validating the input and failing on errors. /// /// This leaves any options not present in the file in the built-in defaults. 
- pub fn parse_and_validate(toml: &Document, workdir: &Utf8Path) -> anyhow::Result { - let mut builder = PageServerConfigBuilder::default(); + pub fn parse_and_validate( + node_id: NodeId, + toml: &Document, + workdir: &Utf8Path, + ) -> anyhow::Result { + let mut builder = PageServerConfigBuilder::new(node_id); builder.workdir(workdir.to_owned()); let mut t_conf = TenantConfOpt::default(); @@ -913,7 +924,8 @@ impl PageServerConf { "tenant_config" => { t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?; } - "id" => builder.id(NodeId(parse_toml_u64(key, item)?)), + "id" => {}, // Ignoring `id` field in pageserver.toml - using identity.toml as the source of truth + // Logging is not set up yet, so we can't do it. "broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?), "broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?), "log_format" => builder.log_format( @@ -1090,6 +1102,12 @@ impl PageServerConf { } } +#[derive(Deserialize)] +#[serde(deny_unknown_fields)] +pub struct PageserverIdentity { + pub id: NodeId, +} + // Helper functions to parse a toml Item fn parse_toml_string(name: &str, item: &Item) -> Result { @@ -1259,7 +1277,7 @@ background_task_maximum_delay = '334 s' ); let toml = config_string.parse()?; - let parsed_config = PageServerConf::parse_and_validate(&toml, &workdir) + let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir) .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}")); assert_eq!( @@ -1341,7 +1359,7 @@ background_task_maximum_delay = '334 s' ); let toml = config_string.parse()?; - let parsed_config = PageServerConf::parse_and_validate(&toml, &workdir) + let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir) .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}")); assert_eq!( @@ -1431,12 +1449,13 @@ broker_endpoint = '{broker_endpoint}' let toml = config_string.parse()?; - let parsed_remote_storage_config = PageServerConf::parse_and_validate(&toml, &workdir) - .unwrap_or_else(|e| { - panic!("Failed to parse config '{config_string}', reason: {e:?}") - }) - .remote_storage_config - .expect("Should have remote storage config for the local FS"); + let parsed_remote_storage_config = + PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir) + .unwrap_or_else(|e| { + panic!("Failed to parse config '{config_string}', reason: {e:?}") + }) + .remote_storage_config + .expect("Should have remote storage config for the local FS"); assert_eq!( parsed_remote_storage_config, @@ -1492,12 +1511,13 @@ broker_endpoint = '{broker_endpoint}' let toml = config_string.parse()?; - let parsed_remote_storage_config = PageServerConf::parse_and_validate(&toml, &workdir) - .unwrap_or_else(|e| { - panic!("Failed to parse config '{config_string}', reason: {e:?}") - }) - .remote_storage_config - .expect("Should have remote storage config for S3"); + let parsed_remote_storage_config = + PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir) + .unwrap_or_else(|e| { + panic!("Failed to parse config '{config_string}', reason: {e:?}") + }) + .remote_storage_config + .expect("Should have remote storage config for S3"); assert_eq!( parsed_remote_storage_config, @@ -1576,7 +1596,7 @@ threshold = "20m" "#, ); let toml: Document = pageserver_conf_toml.parse()?; - let conf = PageServerConf::parse_and_validate(&toml, 
&workdir)?; + let conf = PageServerConf::parse_and_validate(NodeId(333), &toml, &workdir)?; assert_eq!(conf.pg_distrib_dir, pg_distrib_dir); assert_eq!( @@ -1592,7 +1612,11 @@ threshold = "20m" .evictions_low_residence_duration_metric_threshold, Duration::from_secs(20 * 60) ); - assert_eq!(conf.id, NodeId(222)); + + // Assert that the node id provided by the indentity file (threaded + // through the call to [`PageServerConf::parse_and_validate`] is + // used. + assert_eq!(conf.id, NodeId(333)); assert_eq!( conf.disk_usage_based_eviction, Some(DiskUsageEvictionTaskConfig { @@ -1637,7 +1661,7 @@ threshold = "20m" "#, ); let toml: Document = pageserver_conf_toml.parse().unwrap(); - let conf = PageServerConf::parse_and_validate(&toml, &workdir).unwrap(); + let conf = PageServerConf::parse_and_validate(NodeId(222), &toml, &workdir).unwrap(); match &conf.default_tenant_conf.eviction_policy { EvictionPolicy::OnlyImitiate(t) => { @@ -1656,7 +1680,7 @@ threshold = "20m" remote_storage = {} "#; let doc = toml_edit::Document::from_str(input).unwrap(); - let err = PageServerConf::parse_and_validate(&doc, &workdir) + let err = PageServerConf::parse_and_validate(NodeId(222), &doc, &workdir) .expect_err("empty remote_storage field should fail, don't specify it if you want no remote_storage"); assert!(format!("{err}").contains("remote_storage"), "{err}"); } diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index caeae7fd15..28dbf40bed 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -1,8 +1,5 @@ -import subprocess -from pathlib import Path from typing import Optional -import toml from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.neon_fixtures import ( DEFAULT_BRANCH_NAME, @@ -13,67 +10,6 @@ from fixtures.pageserver.http import PageserverHttpClient from fixtures.utils import wait_until -def test_pageserver_init_node_id(neon_simple_env: NeonEnv, neon_binpath: Path): - """ - NB: The neon_local doesn't use `--init` mode anymore, but our production - deployment still does => https://github.com/neondatabase/aws/pull/1322 - """ - workdir = neon_simple_env.pageserver.workdir - pageserver_config = workdir / "pageserver.toml" - pageserver_bin = neon_binpath / "pageserver" - - def run_pageserver(args): - return subprocess.run( - [str(pageserver_bin), "-D", str(workdir), *args], - check=False, - universal_newlines=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - - neon_simple_env.pageserver.stop() - - with open(neon_simple_env.pageserver.config_toml_path, "r") as f: - ps_config = toml.load(f) - - required_config_keys = [ - "pg_distrib_dir", - "listen_pg_addr", - "listen_http_addr", - "pg_auth_type", - "http_auth_type", - # TODO: only needed for NEON_PAGESERVER_PANIC_ON_UNSPECIFIED_COMPACTION_ALGORITHM in https://github.com/neondatabase/neon/pull/7748 - # "tenant_config", - ] - required_config_overrides = [ - f"--config-override={toml.dumps({k: ps_config[k]})}" for k in required_config_keys - ] - - pageserver_config.unlink() - - bad_init = run_pageserver(["--init", *required_config_overrides]) - assert ( - bad_init.returncode == 1 - ), "pageserver should not be able to init new config without the node id" - assert 'missing config value "id"' in bad_init.stderr - assert not pageserver_config.exists(), "config file should not be created after init error" - - good_init_cmd = [ - "--init", - f"--config-override=id={ps_config['id']}", - *required_config_overrides, - ] - 
completed_init = run_pageserver(good_init_cmd) - assert ( - completed_init.returncode == 0 - ), "pageserver should be able to create a new config with the node id given" - assert pageserver_config.exists(), "config file should be created successfully" - - bad_reinit = run_pageserver(good_init_cmd) - assert bad_reinit.returncode == 1, "pageserver refuses to init if already exists" - assert "config file already exists" in bad_reinit.stderr - - def check_client(env: NeonEnv, client: PageserverHttpClient): pg_version = env.pg_version initial_tenant = env.initial_tenant From 80c8ceacbcb3cbd4dbc51c14e3b95b9c4934260f Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 23 Jul 2024 12:57:57 +0100 Subject: [PATCH 1246/1571] tests: make `test_scrubber_physical_gc_ancestors` more stable (#8453) ## Problem This test sometimes found that ancestors were getting cleaned up before it had done any compaction. Compaction was happening implicitly via Workload. Example: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8298/10032173390/index.html#testresult/fb04786402f80822/retries ## Summary of changes - Set upload=False when writing data after shard split, to avoid doing a checkpoint - Add a checkpoint_period & explicit wait for uploads so that we ensure data lands in S3 without doing a checkpoint --- pageserver/src/http/routes.rs | 24 ++++++++++++-------- test_runner/fixtures/pageserver/http.py | 4 ++++ test_runner/regress/test_storage_scrubber.py | 12 ++++++++-- 3 files changed, 29 insertions(+), 11 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index b8063eb5a2..d63c240365 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1676,6 +1676,10 @@ async fn timeline_checkpoint_handler( if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? { flags |= CompactFlags::ForceImageLayerCreation; } + + // By default, checkpoints come with a compaction, but this may be optionally disabled by tests that just want to flush + upload. 
+    let compact = parse_query_param::<_, bool>(&request, "compact")?.unwrap_or(true);
+
     let wait_until_uploaded =
         parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
 
@@ -1692,15 +1696,17 @@ async fn timeline_checkpoint_handler(
             }
         })?;
 
-    timeline
-        .compact(&cancel, flags, &ctx)
-        .await
-        .map_err(|e|
-            match e {
-                CompactionError::ShuttingDown => ApiError::ShuttingDown,
-                CompactionError::Other(e) => ApiError::InternalServerError(e)
-            }
-        )?;
+    if compact {
+        timeline
+            .compact(&cancel, flags, &ctx)
+            .await
+            .map_err(|e|
+                match e {
+                    CompactionError::ShuttingDown => ApiError::ShuttingDown,
+                    CompactionError::Other(e) => ApiError::InternalServerError(e)
+                }
+            )?;
+    }
 
     if wait_until_uploaded {
         timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?;
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index c7cea4ec04..c6df6b5baf 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -662,6 +662,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         force_repartition=False,
         force_image_layer_creation=False,
         wait_until_uploaded=False,
+        compact: Optional[bool] = None,
     ):
         self.is_testing_enabled_or_skip()
         query = {}
@@ -672,6 +673,9 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         if wait_until_uploaded:
             query["wait_until_uploaded"] = "true"
 
+        if compact is not None:
+            query["compact"] = "true" if compact else "false"
+
         log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}")
         res = self.put(
             f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint",
diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py
index a9f12f09b7..7c411a6b84 100644
--- a/test_runner/regress/test_storage_scrubber.py
+++ b/test_runner/regress/test_storage_scrubber.py
@@ -209,9 +209,17 @@ def test_scrubber_physical_gc_ancestors(
     new_shard_count = 4
     assert shard_count is None or new_shard_count > shard_count
     shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count)
+    env.storage_controller.reconcile_until_idle()  # Move shards to their final locations immediately
 
-    # Make sure child shards have some layers
-    workload.write_rows(100)
+    # Make sure child shards have some layers. Do not force upload, because the test helper calls checkpoint, which
+    # compacts, and we only want to do that explicitly later in the test.
+ workload.write_rows(100, upload=False) + for shard in shards: + ps = env.get_tenant_pageserver(shard) + log.info(f"Waiting for shard {shard} on pageserver {ps.id}") + ps.http_client().timeline_checkpoint( + shard, timeline_id, compact=False, wait_until_uploaded=True + ) # Flush deletion queue so that we don't leave any orphan layers in the parent that will confuse subsequent checks: once # a shard is split, any layers in its prefix that aren't referenced by a child will be considered GC'able, even From 129f348aae46a0af4c8bac67cea17bd46de81d78 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 23 Jul 2024 14:05:07 +0000 Subject: [PATCH 1247/1571] build(deps): bump openssl from 0.10.64 to 0.10.66 in /test_runner/pg_clients/rust/tokio-postgres (#8464) --- test_runner/pg_clients/rust/tokio-postgres/Cargo.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock index 32c1c52eea..354fc15745 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock @@ -421,9 +421,9 @@ checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "openssl" -version = "0.10.64" +version = "0.10.66" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95a0481286a310808298130d22dd1fef0fa571e05a8f44ec801801e84b216b1f" +checksum = "9529f4786b70a3e8c61e11179af17ab6188ad8d0ded78c5529441ed39d4bd9c1" dependencies = [ "bitflags 2.6.0", "cfg-if", @@ -453,9 +453,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.102" +version = "0.9.103" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c597637d56fbc83893a35eb0dd04b2b8e7a50c91e64e9493e398b5df4fb45fa2" +checksum = "7f9e8deee91df40a943c71b917e5874b951d32a802526c85721ce3b776c929d6" dependencies = [ "cc", "libc", From 1a4c1eba92780f29f7b42449ab0f952d1630d771 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 23 Jul 2024 15:37:12 +0100 Subject: [PATCH 1248/1571] pageserver: add LayerVisibilityHint (#8432) ## Problem As described in https://github.com/neondatabase/neon/issues/8398, layer visibility is a new hint that will help us manage disk space more efficiently. ## Summary of changes - Introduce LayerVisibilityHint and store it as part of access stats - Automatically mark a layer visible if it is accessed, or when it is created. The impact on the access stats size will be reversed in https://github.com/neondatabase/neon/pull/8431 This is functionally a no-op change: subsequent PRs will add the logic that sets layers to Covered, and which uses the layer visibility as an input to eviction and heatmap generation. 
---------

Co-authored-by: Joonas Koivunen
---
 pageserver/src/tenant/storage_layer.rs       | 37 ++++++++++++++++++-
 pageserver/src/tenant/storage_layer/layer.rs |  2 +
 .../src/tenant/storage_layer/layer/tests.rs  |  4 +-
 .../src/tenant/timeline/layer_manager.rs     |  8 ++++
 4 files changed, 48 insertions(+), 3 deletions(-)

diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index a389358f0d..3404308e56 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -458,6 +458,26 @@ pub enum ValueReconstructResult {
     Missing,
 }
 
+/// Layers contain a hint indicating whether they are likely to be used for reads. This is a hint rather
+/// than an authoritative value, so that we do not have to update it synchronously when changing the visibility
+/// of layers (for example when creating a branch that makes some previously covered layers visible). It should
+/// be used for cache management but not for correctness-critical checks.
+#[derive(Default, Debug, Clone)]
+pub(crate) enum LayerVisibilityHint {
+    /// A Visible layer might be read while serving a read, because there is not an image layer between it
+    /// and a readable LSN (the tip of the branch or a child's branch point)
+    Visible,
+    /// A Covered layer probably won't be read right now, but _can_ be read in future if someone creates
+    /// a branch or ephemeral endpoint at an LSN below the layer that covers this.
+    #[allow(unused)]
+    Covered,
+    /// Calculating layer visibility requires I/O, so until this has happened layers are loaded
+    /// in this state. Note that newly written layers may be called Visible immediately; this uninitialized
+    /// state is for when existing layers are constructed while loading a timeline.
+    #[default]
+    Uninitialized,
+}
+
 #[derive(Debug)]
 pub struct LayerAccessStats(Mutex<LayerAccessStatsLocked>);
 
@@ -469,6 +489,7 @@ pub struct LayerAccessStats(Mutex<LayerAccessStatsLocked>);
 struct LayerAccessStatsLocked {
     for_scraping_api: LayerAccessStatsInner,
     for_eviction_policy: LayerAccessStatsInner,
+    visibility: LayerVisibilityHint,
 }
 
 impl LayerAccessStatsLocked {
@@ -592,7 +613,13 @@ impl LayerAccessStats {
             inner.count_by_access_kind[access_kind] += 1;
             inner.task_kind_flag |= ctx.task_kind();
             inner.last_accesses.write(this_access);
-        })
+        });
+
+        // We may access a layer marked as Covered, if a new branch was created that depends on
+        // this layer, and background updates to layer visibility didn't notice it yet
+        if !matches!(locked.visibility, LayerVisibilityHint::Visible) {
+            locked.visibility = LayerVisibilityHint::Visible;
+        }
     }
 
     fn as_api_model(
@@ -694,6 +721,14 @@ impl LayerAccessStats {
             (Some(a), Some(r)) => a.when >= r.timestamp,
         }
     }
+
+    pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) {
+        self.0.lock().unwrap().visibility = visibility;
+    }
+
+    pub(crate) fn visibility(&self) -> LayerVisibilityHint {
+        self.0.lock().unwrap().visibility.clone()
+    }
 }
 
 /// Get a layer descriptor from a layer.
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index d1c9173f1c..25d8ee6b2b 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -250,6 +250,8 @@ impl Layer {
             LayerResidenceStatus::Resident,
             LayerResidenceEventReason::LayerCreate,
         );
+        // Newly created layers are marked visible by default: the usual case is that they were created to be read.
+        access_stats.set_visibility(super::LayerVisibilityHint::Visible);
 
         let local_path = local_layer_path(
             conf,
diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs
index 8a3737f8a7..66a4493218 100644
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -826,9 +826,9 @@ async fn eviction_cancellation_on_drop() {
 #[test]
 #[cfg(target_arch = "x86_64")]
 fn layer_size() {
-    assert_eq!(std::mem::size_of::<LayerAccessStats>(), 2040);
+    assert_eq!(std::mem::size_of::<LayerAccessStats>(), 2048);
     assert_eq!(std::mem::size_of::<PersistentLayerDesc>(), 104);
-    assert_eq!(std::mem::size_of::<LayerInner>(), 2344);
+    assert_eq!(std::mem::size_of::<LayerInner>(), 2352);
     // it also has the utf8 path
 }
 
diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs
index a43ff873ac..1e4edd34ad 100644
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -255,6 +255,14 @@ impl LayerManager {
             new_layer.layer_desc().lsn_range
         );
 
+        // Transfer visibility hint from old to new layer, since the new layer covers the same key space. This is not guaranteed to
+        // be accurate (as the new layer may cover a different subset of the key range), but is a sensible default, and prevents
+        // always marking rewritten layers as visible.
+        new_layer
+            .as_ref()
+            .access_stats()
+            .set_visibility(old_layer.access_stats().visibility());
+
         // Safety: we may never rewrite the same file in-place. Callers are responsible
         // for ensuring that they only rewrite layers after something changes the path,
         // such as an increment in the generation number.

From 563d73d923e7d8b5880df14c9a50ca01c777d2b6 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik
Date: Tue, 23 Jul 2024 18:41:55 +0300
Subject: [PATCH 1249/1571] Use smgrexists() instead of access() to enforce uniqueness of generated relfilenumber (#7992)

## Problem

Postgres is using the `access()` function in `GetNewRelFileNumber` to check that an assigned relfilenumber is not used for any other relation. This check will not work in Neon, because we do not have all files in local storage.

## Summary of changes

Use smgrexists() instead, which will check at the page server whether such a relfilenode is used.

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section.
## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik --- pageserver/src/pgdatadir_mapping.rs | 10 +++++ pgxn/neon_test_utils/neon_test_utils--1.3.sql | 6 +++ pgxn/neon_test_utils/neontest.c | 16 +++++++ test_runner/regress/test_oid_overflow.py | 45 +++++++++++++++++++ vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 6 +-- 8 files changed, 83 insertions(+), 6 deletions(-) create mode 100644 test_runner/regress/test_oid_overflow.py diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 3bbd084ab4..85f3a6e0fb 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -284,6 +284,16 @@ impl Timeline { if let Some(_nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) { return Ok(true); } + // then check if the database was already initialized. + // get_rel_exists can be called before dbdir is created. + let buf = version.get(self, DBDIR_KEY, ctx).await?; + let dbdirs = match DbDirectory::des(&buf).context("deserialization failure") { + Ok(dir) => Ok(dir.dbdirs), + Err(e) => Err(PageReconstructError::from(e)), + }?; + if !dbdirs.contains_key(&(tag.spcnode, tag.dbnode)) { + return Ok(false); + } // fetch directory listing let key = rel_dir_to_key(tag.spcnode, tag.dbnode); let buf = version.get(self, key, ctx).await?; diff --git a/pgxn/neon_test_utils/neon_test_utils--1.3.sql b/pgxn/neon_test_utils/neon_test_utils--1.3.sql index 3b8794a8cf..9a9b41c3a3 100644 --- a/pgxn/neon_test_utils/neon_test_utils--1.3.sql +++ b/pgxn/neon_test_utils/neon_test_utils--1.3.sql @@ -7,6 +7,12 @@ AS 'MODULE_PATHNAME', 'test_consume_xids' LANGUAGE C STRICT PARALLEL UNSAFE; +CREATE FUNCTION test_consume_oids(oid int) +RETURNS VOID +AS 'MODULE_PATHNAME', 'test_consume_oids' +LANGUAGE C STRICT +PARALLEL UNSAFE; + CREATE FUNCTION test_consume_cpu(seconds int) RETURNS VOID AS 'MODULE_PATHNAME', 'test_consume_cpu' diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c index 650ef7405d..0b5499ca53 100644 --- a/pgxn/neon_test_utils/neontest.c +++ b/pgxn/neon_test_utils/neontest.c @@ -35,6 +35,7 @@ PG_MODULE_MAGIC; extern void _PG_init(void); PG_FUNCTION_INFO_V1(test_consume_xids); +PG_FUNCTION_INFO_V1(test_consume_oids); PG_FUNCTION_INFO_V1(test_consume_cpu); PG_FUNCTION_INFO_V1(test_consume_memory); PG_FUNCTION_INFO_V1(test_release_memory); @@ -74,6 +75,21 @@ _PG_init(void) #define neon_read_at_lsn neon_read_at_lsn_ptr +/* + * test_consume_oids(int4), for rapidly consuming OIDs, to test wraparound. + * Unlike test_consume_xids which is passed number of xids to be consumed, + * this function is given the target Oid. + */ +Datum +test_consume_oids(PG_FUNCTION_ARGS) +{ + int32 oid = PG_GETARG_INT32(0); + + while (oid != GetNewObjectId()); + + PG_RETURN_VOID(); +} + /* * test_consume_xids(int4), for rapidly consuming XIDs, to test wraparound. 
*/ diff --git a/test_runner/regress/test_oid_overflow.py b/test_runner/regress/test_oid_overflow.py new file mode 100644 index 0000000000..a94ae99ed9 --- /dev/null +++ b/test_runner/regress/test_oid_overflow.py @@ -0,0 +1,45 @@ +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder + + +def test_oid_overflow(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + + endpoint = env.endpoints.create_start("main") + + conn = endpoint.connect() + cur = conn.cursor() + + cur.execute("CREATE EXTENSION neon_test_utils") + + cur.execute("CREATE TABLE t1(x integer)") + cur.execute("INSERT INTO t1 values (1)") + cur.execute("CREATE TABLE t2(x integer)") + cur.execute("INSERT INTO t2 values (2)") + + cur.execute("SELECT x from t1") + assert cur.fetchone() == (1,) + cur.execute("SELECT x from t2") + assert cur.fetchone() == (2,) + + cur.execute("VACUUM FULL t1") + cur.execute("VACUUM FULL t1") + cur.execute("vacuum pg_class") + cur.execute("SELECT relfilenode FROM pg_class where relname='t1'") + oid = cur.fetchall()[0][0] + log.info(f"t1.relfilenode={oid}") + + cur.execute("set statement_timeout=0") + cur.execute(f"select test_consume_oids({oid-1})") + cur.execute("VACUUM FULL t2") + + cur.execute("SELECT relfilenode FROM pg_class where relname='t2'") + oid = cur.fetchall()[0][0] + log.info(f"t2.relfilenode={oid}") + + cur.execute("SELECT clear_buffer_cache()") + + cur.execute("SELECT x from t1") + assert cur.fetchone() == (1,) + cur.execute("SELECT x from t2") + assert cur.fetchone() == (2,) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index ad73770c44..dbd0e6428b 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit ad73770c446ea361f43e4f0404798b7e5e7a62d8 +Subproject commit dbd0e6428b9274d72a10ac29bd3e3162faf109d4 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 4874c8e52e..035b73a9c5 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 4874c8e52ed349a9f8290bbdcd91eb92677a5d24 +Subproject commit 035b73a9c5998f9a0ef35cc8df1bae680bf770fc diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index b810fdfcbb..b39f316137 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit b810fdfcbb59afea7ea7bbe0cf94eaccb55a2ea2 +Subproject commit b39f316137fdd29e2da15d2af2fdd1cfd18163be diff --git a/vendor/revisions.json b/vendor/revisions.json index da49ff19c3..eeebd646f5 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "v16": ["16.3", "b810fdfcbb59afea7ea7bbe0cf94eaccb55a2ea2"], - "v15": ["15.7", "4874c8e52ed349a9f8290bbdcd91eb92677a5d24"], - "v14": ["14.12", "ad73770c446ea361f43e4f0404798b7e5e7a62d8"] + "v16": ["16.3", "b39f316137fdd29e2da15d2af2fdd1cfd18163be"], + "v15": ["15.7", "035b73a9c5998f9a0ef35cc8df1bae680bf770fc"], + "v14": ["14.12", "dbd0e6428b9274d72a10ac29bd3e3162faf109d4"] } From d47c94b336c42129861e52d4243ffd67940ce7f8 Mon Sep 17 00:00:00 2001 From: Shinya Kato <37682778+shinyaaa@users.noreply.github.com> Date: Wed, 24 Jul 2024 00:46:05 +0900 Subject: [PATCH 1250/1571] Fix to use a tab instead of spaces (#8394) ## Problem There were spaces instead of a tab in the C source file. ## Summary of changes I fixed to use a tab instead of spaces. 
--- pgxn/neon/neon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 3197a7e715..d107cdc1c2 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -597,7 +597,7 @@ _PG_init(void) pg_init_libpagestore(); pg_init_walproposer(); - WalSender_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; + WalSender_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; LogicalFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; InitLogicalReplicationMonitor(); From 9e23410074b0d48a923fc0f2cc7dabee3bfd41ff Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 23 Jul 2024 21:09:05 +0100 Subject: [PATCH 1251/1571] tests: allow-list a controller heartbeat error (#8471) ## Problem `test_change_pageserver` stops pageservers in a way that can overlap with the controller's heartbeats: the controller can get a heartbeat success and then immediately find the node unavailable. This particular situation triggers a log that isn't in our current allow-list of messages for nodes offline Example: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8339/10048487700/index.html#testresult/19678f27810231df/retries ## Summary of changes - Add the message to the allow list --- storage_controller/src/service.rs | 2 ++ test_runner/fixtures/pageserver/allowed_errors.py | 1 + 2 files changed, 3 insertions(+) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index a163453dca..2a6d5d3578 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -828,6 +828,8 @@ impl Service { ); } Err(err) => { + // Transition to active involves reconciling: if a node responds to a heartbeat then + // becomes unavailable again, we may get an error here. tracing::error!( "Failed to update node {} after heartbeat round: {}", node_id, diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index c5b09e3608..dff002bd4b 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -102,6 +102,7 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [ # failing to connect to them. ".*Call to node.*management API.*failed.*receive body.*", ".*Call to node.*management API.*failed.*ReceiveBody.*", + ".*Failed to update node .+ after heartbeat round.*error sending request for url.*", # Many tests will start up with a node offline ".*startup_reconcile: Could not scan node.*", # Tests run in dev mode From 39a35671df3e653ba6ecf275a98b354eb106a90b Mon Sep 17 00:00:00 2001 From: Andrey Taranik Date: Wed, 24 Jul 2024 00:36:28 +0300 Subject: [PATCH 1252/1571] temporarily disable cache saving in the registry as it is very slow (#8475) ## Problem `compute-node-image-arch` jobs are very slow and block development. 
## Summary of changes Temporary disable cache saving --- .github/workflows/build_and_test.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index cb7655e039..6407809432 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -865,7 +865,8 @@ jobs: pull: true file: Dockerfile.compute-node cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/compute-node-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }} + # 23.07.2024 temporarily disable cache saving in the registry as it is very slow + # cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/compute-node-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }} tags: | neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} From 18cf5cfefded5f910c1c3259578f6e3124124760 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." Date: Tue, 23 Jul 2024 19:28:43 -0400 Subject: [PATCH 1253/1571] feat(pageserver): support retain_lsn in bottommost gc-compaction (#8328) part of https://github.com/neondatabase/neon/issues/8002 The main thing in this pull request is the new `generate_key_retention` function. It decides which deltas to retain and generate images for a given key based on its history + retain_lsn + horizon. On that, we generate a flat single level of delta layers over all deltas included in the compaction. In the future, we can decide whether to split them over the LSN axis as described in the RFC. --------- Signed-off-by: Alex Chi Z Co-authored-by: Christian Schwarz --- pageserver/src/tenant.rs | 317 +++++++++++++++ .../src/tenant/storage_layer/delta_layer.rs | 7 +- pageserver/src/tenant/timeline.rs | 2 +- pageserver/src/tenant/timeline/compaction.rs | 373 ++++++++++++++---- 4 files changed, 622 insertions(+), 77 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 7c6464dab3..cb3ca9c8b9 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3992,6 +3992,7 @@ mod tests { use storage_layer::PersistentLayerKey; use tests::storage_layer::ValuesReconstructState; use tests::timeline::{GetVectoredError, ShutdownMode}; + use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn}; use timeline::{DeltaLayerTestDesc, GcInfo}; use utils::bin_ser::BeSer; use utils::id::TenantId; @@ -7214,4 +7215,320 @@ mod tests { Ok(()) } + + #[tokio::test] + async fn test_generate_key_retention() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_generate_key_retention").await?; + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; + tline.force_advance_lsn(Lsn(0x70)); + let key = Key::from_hex("010000000033333333444444445500000000").unwrap(); + let history = vec![ + ( + key, + Lsn(0x10), + Value::Image(Bytes::copy_from_slice(b"0x10")), + ), + ( + key, + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append(";0x20")), + ), + ( + key, + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append(";0x30")), + ), + ( + key, + Lsn(0x40), + Value::WalRecord(NeonWalRecord::wal_append(";0x40")), + ), + ( + key, + Lsn(0x50), + Value::WalRecord(NeonWalRecord::wal_append(";0x50")), + ), + ( + key, + Lsn(0x60), + Value::WalRecord(NeonWalRecord::wal_append(";0x60")), + ), 
+ ( + key, + Lsn(0x70), + Value::WalRecord(NeonWalRecord::wal_append(";0x70")), + ), + ( + key, + Lsn(0x80), + Value::WalRecord(NeonWalRecord::wal_append(";0x80")), + ), + ( + key, + Lsn(0x90), + Value::WalRecord(NeonWalRecord::wal_append(";0x90")), + ), + ]; + let res = tline + .generate_key_retention( + key, + &history, + Lsn(0x60), + &[Lsn(0x20), Lsn(0x40), Lsn(0x50)], + 3, + ) + .await + .unwrap(); + let expected_res = KeyHistoryRetention { + below_horizon: vec![ + ( + Lsn(0x20), + KeyLogAtLsn(vec![( + Lsn(0x20), + Value::Image(Bytes::copy_from_slice(b"0x10;0x20")), + )]), + ), + ( + Lsn(0x40), + KeyLogAtLsn(vec![ + ( + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append(";0x30")), + ), + ( + Lsn(0x40), + Value::WalRecord(NeonWalRecord::wal_append(";0x40")), + ), + ]), + ), + ( + Lsn(0x50), + KeyLogAtLsn(vec![( + Lsn(0x50), + Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40;0x50")), + )]), + ), + ( + Lsn(0x60), + KeyLogAtLsn(vec![( + Lsn(0x60), + Value::WalRecord(NeonWalRecord::wal_append(";0x60")), + )]), + ), + ], + above_horizon: KeyLogAtLsn(vec![ + ( + Lsn(0x70), + Value::WalRecord(NeonWalRecord::wal_append(";0x70")), + ), + ( + Lsn(0x80), + Value::WalRecord(NeonWalRecord::wal_append(";0x80")), + ), + ( + Lsn(0x90), + Value::WalRecord(NeonWalRecord::wal_append(";0x90")), + ), + ]), + }; + assert_eq!(res, expected_res); + // TODO: more tests with mixed image + delta, adding with k-merge test cases; e2e compaction test + Ok(()) + } + + #[tokio::test] + async fn test_simple_bottom_most_compaction_with_retain_lsns() -> anyhow::Result<()> { + let harness = + TenantHarness::create("test_simple_bottom_most_compaction_with_retain_lsns").await?; + let (tenant, ctx) = harness.load().await; + + fn get_key(id: u32) -> Key { + // using aux key here b/c they are guaranteed to be inside `collect_keyspace`. 
+ let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + let img_layer = (0..10) + .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10")))) + .collect_vec(); + + let delta1 = vec![ + ( + get_key(1), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ( + get_key(2), + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append("@0x30")), + ), + ( + get_key(3), + Lsn(0x28), + Value::WalRecord(NeonWalRecord::wal_append("@0x28")), + ), + ( + get_key(3), + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append("@0x30")), + ), + ( + get_key(3), + Lsn(0x40), + Value::WalRecord(NeonWalRecord::wal_append("@0x40")), + ), + ]; + let delta2 = vec![ + ( + get_key(5), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ( + get_key(6), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ]; + let delta3 = vec![ + ( + get_key(8), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), + ), + ( + get_key(9), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), + ), + ]; + + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + vec![ + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3), + ], // delta layers + vec![(Lsn(0x10), img_layer)], // image layers + Lsn(0x50), + ) + .await?; + { + // Update GC info + let mut guard = tline.gc_info.write().unwrap(); + *guard = GcInfo { + retain_lsns: vec![Lsn(0x10), Lsn(0x20)], + cutoffs: GcCutoffs { + time: Lsn(0x30), + space: Lsn(0x30), + }, + leases: Default::default(), + within_ancestor_pitr: false, + }; + } + + let expected_result = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20"), + Bytes::from_static(b"value 2@0x10@0x30"), + Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10@0x20"), + Bytes::from_static(b"value 6@0x10@0x20"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10@0x48"), + Bytes::from_static(b"value 9@0x10@0x48"), + ]; + + let expected_result_at_gc_horizon = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20"), + Bytes::from_static(b"value 2@0x10@0x30"), + Bytes::from_static(b"value 3@0x10@0x28@0x30"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10@0x20"), + Bytes::from_static(b"value 6@0x10@0x20"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + + let expected_result_at_lsn_20 = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20"), + Bytes::from_static(b"value 2@0x10"), + Bytes::from_static(b"value 3@0x10"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10@0x20"), + Bytes::from_static(b"value 6@0x10@0x20"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + + let expected_result_at_lsn_10 = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10"), + Bytes::from_static(b"value 2@0x10"), + Bytes::from_static(b"value 3@0x10"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10"), + Bytes::from_static(b"value 
6@0x10"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + + let verify_result = || async { + for idx in 0..10 { + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x50), &ctx) + .await + .unwrap(), + &expected_result[idx] + ); + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x30), &ctx) + .await + .unwrap(), + &expected_result_at_gc_horizon[idx] + ); + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x20), &ctx) + .await + .unwrap(), + &expected_result_at_lsn_20[idx] + ); + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x10), &ctx) + .await + .unwrap(), + &expected_result_at_lsn_10[idx] + ); + } + }; + + verify_result().await; + + let cancel = CancellationToken::new(); + tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + + verify_result().await; + + Ok(()) + } } diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 512e9e86fa..c73059c34a 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -460,7 +460,12 @@ impl DeltaLayerWriterInner { will_init: bool, ctx: &RequestContext, ) -> (Vec, anyhow::Result<()>) { - assert!(self.lsn_range.start <= lsn); + assert!( + self.lsn_range.start <= lsn, + "lsn_start={}, lsn={}", + self.lsn_range.start, + lsn + ); // We don't want to use compression in delta layer creation let compression = ImageCompressionAlgorithm::Disabled; let (val, res) = self diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 8866e1af5c..4bfcdc43e8 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1,5 +1,5 @@ pub(crate) mod analysis; -mod compaction; +pub(crate) mod compaction; pub mod delete; pub(crate) mod detach_ancestor; mod eviction_task; diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index a648432b4d..fb8c125b60 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -28,7 +28,7 @@ use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder} use crate::page_cache; use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD}; use crate::tenant::storage_layer::merge_iterator::MergeIterator; -use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc}; +use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc, ValueReconstructState}; use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter}; use crate::tenant::timeline::{Hole, ImageLayerCreationOutcome}; use crate::tenant::timeline::{Layer, ResidentLayer}; @@ -36,7 +36,7 @@ use crate::tenant::DeltaLayer; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; use crate::keyspace::KeySpace; -use crate::repository::Key; +use crate::repository::{Key, Value}; use utils::lsn::Lsn; @@ -45,6 +45,60 @@ use pageserver_compaction::interface::*; use super::CompactionError; +/// Maximum number of deltas before generating an image layer in bottom-most compaction. +const COMPACTION_DELTA_THRESHOLD: usize = 5; + +/// The result of bottom-most compaction for a single key at each LSN. +#[derive(Debug)] +#[cfg_attr(test, derive(PartialEq))] +pub struct KeyLogAtLsn(pub Vec<(Lsn, Value)>); + +/// The result of bottom-most compaction. 
+#[derive(Debug)] +#[cfg_attr(test, derive(PartialEq))] +pub(crate) struct KeyHistoryRetention { + /// Stores logs to reconstruct the value at the given LSN, that is to say, logs <= LSN or image == LSN. + pub(crate) below_horizon: Vec<(Lsn, KeyLogAtLsn)>, + /// Stores logs to reconstruct the value at any LSN above the horizon, that is to say, log > LSN. + pub(crate) above_horizon: KeyLogAtLsn, +} + +impl KeyHistoryRetention { + async fn pipe_to( + self, + key: Key, + delta_writer: &mut Vec<(Key, Lsn, Value)>, + image_writer: &mut ImageLayerWriter, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + let mut first_batch = true; + for (_, KeyLogAtLsn(logs)) in self.below_horizon { + if first_batch { + if logs.len() == 1 && logs[0].1.is_image() { + let Value::Image(img) = &logs[0].1 else { + unreachable!() + }; + image_writer.put_image(key, img.clone(), ctx).await?; + } else { + for (lsn, val) in logs { + delta_writer.push((key, lsn, val)); + } + } + first_batch = false; + } else { + for (lsn, val) in logs { + delta_writer.push((key, lsn, val)); + } + } + } + let KeyLogAtLsn(above_horizon_logs) = self.above_horizon; + for (lsn, val) in above_horizon_logs { + delta_writer.push((key, lsn, val)); + } + Ok(()) + } +} + impl Timeline { /// TODO: cancellation pub(crate) async fn compact_legacy( @@ -989,6 +1043,188 @@ impl Timeline { Ok(()) } + /// Take a list of images and deltas, produce images and deltas according to GC horizon and retain_lsns. + /// + /// It takes a key, the values of the key within the compaction process, a GC horizon, and all retain_lsns below the horizon. + /// For now, it requires the `accumulated_values` contains the full history of the key (i.e., the key with the lowest LSN is + /// an image or a WAL not requiring a base image). This restriction will be removed once we implement gc-compaction on branch. + /// + /// The function returns the deltas and the base image that need to be placed at each of the retain LSN. For example, we have: + /// + /// A@0x10, +B@0x20, +C@0x30, +D@0x40, +E@0x50, +F@0x60 + /// horizon = 0x50, retain_lsn = 0x20, 0x40, delta_threshold=3 + /// + /// The function will produce: + /// + /// ```plain + /// 0x20(retain_lsn) -> img=AB@0x20 always produce a single image below the lowest retain LSN + /// 0x40(retain_lsn) -> deltas=[+C@0x30, +D@0x40] two deltas since the last base image, keeping the deltas + /// 0x50(horizon) -> deltas=[ABCDE@0x50] three deltas since the last base image, generate an image but put it in the delta + /// above_horizon -> deltas=[+F@0x60] full history above the horizon + /// ``` + /// + /// Note that `accumulated_values` must be sorted by LSN and should belong to a single key. 
+ pub(crate) async fn generate_key_retention( + self: &Arc, + key: Key, + history: &[(Key, Lsn, Value)], + horizon: Lsn, + retain_lsn_below_horizon: &[Lsn], + delta_threshold_cnt: usize, + ) -> anyhow::Result { + // Pre-checks for the invariants + if cfg!(debug_assertions) { + for (log_key, _, _) in history { + assert_eq!(log_key, &key, "mismatched key"); + } + for i in 1..history.len() { + assert!(history[i - 1].1 <= history[i].1, "unordered LSN"); + if history[i - 1].1 == history[i].1 { + assert!( + matches!(history[i - 1].2, Value::Image(_)), + "unordered delta/image, or duplicated delta" + ); + } + } + if let Value::WalRecord(rec) = &history[0].2 { + assert!(rec.will_init(), "no base image"); + } + for lsn in retain_lsn_below_horizon { + assert!(lsn < &horizon, "retain lsn must be below horizon") + } + for i in 1..retain_lsn_below_horizon.len() { + assert!( + retain_lsn_below_horizon[i - 1] <= retain_lsn_below_horizon[i], + "unordered LSN" + ); + } + } + // Step 1: split history into len(retain_lsn_below_horizon) + 2 buckets, where the last bucket is for all deltas above the horizon, + // and the second-to-last bucket is for the horizon. Each bucket contains lsn_last_bucket < deltas <= lsn_this_bucket. + let (mut split_history, lsn_split_points) = { + let mut split_history = Vec::new(); + split_history.resize_with(retain_lsn_below_horizon.len() + 2, Vec::new); + let mut lsn_split_points = Vec::with_capacity(retain_lsn_below_horizon.len() + 1); + for lsn in retain_lsn_below_horizon { + lsn_split_points.push(*lsn); + } + lsn_split_points.push(horizon); + let mut current_idx = 0; + for item @ (_, lsn, _) in history { + while current_idx < lsn_split_points.len() && *lsn > lsn_split_points[current_idx] { + current_idx += 1; + } + split_history[current_idx].push(item); + } + (split_history, lsn_split_points) + }; + // Step 2: filter out duplicated records due to the k-merge of image/delta layers + for split_for_lsn in &mut split_history { + let mut prev_lsn = None; + let mut new_split_for_lsn = Vec::with_capacity(split_for_lsn.len()); + for record @ (_, lsn, _) in std::mem::take(split_for_lsn) { + if let Some(prev_lsn) = &prev_lsn { + if *prev_lsn == lsn { + // The case that we have an LSN with both data from the delta layer and the image layer. As + // `ValueWrapper` ensures that an image is ordered before a delta at the same LSN, we simply + // drop this delta and keep the image. + // + // For example, we have delta layer key1@0x10, key1@0x20, and image layer key1@0x10, we will + // keep the image for key1@0x10 and the delta for key1@0x20. key1@0x10 delta will be simply + // dropped. 
+ continue; + } + } + prev_lsn = Some(lsn); + new_split_for_lsn.push(record); + } + *split_for_lsn = new_split_for_lsn; + } + // Step 3: generate images when necessary + let mut retention = Vec::with_capacity(split_history.len()); + let mut records_since_last_image = 0; + let batch_cnt = split_history.len(); + assert!( + batch_cnt >= 2, + "should have at least below + above horizon batches" + ); + let mut replay_history: Vec<(Key, Lsn, Value)> = Vec::new(); + for (i, split_for_lsn) in split_history.into_iter().enumerate() { + records_since_last_image += split_for_lsn.len(); + let generate_image = if i == 0 { + // We always generate images for the first batch (below horizon / lowest retain_lsn) + true + } else if i == batch_cnt - 1 { + // Do not generate images for the last batch (above horizon) + false + } else if records_since_last_image >= delta_threshold_cnt { + // Generate images when there are too many records + true + } else { + false + }; + replay_history.extend(split_for_lsn.iter().map(|x| (*x).clone())); + if let Some((_, _, val)) = replay_history.first() { + assert!(val.will_init(), "invalid history, no base image"); + } + // Only retain the items after the last image record + for idx in (0..replay_history.len()).rev() { + if replay_history[idx].2.will_init() { + replay_history = replay_history[idx..].to_vec(); + break; + } + } + if generate_image && records_since_last_image > 0 { + records_since_last_image = 0; + let history = std::mem::take(&mut replay_history); + let mut img = None; + let mut records = Vec::with_capacity(history.len()); + if let (_, lsn, Value::Image(val)) = history.first().as_ref().unwrap() { + img = Some((*lsn, val.clone())); + for (_, lsn, val) in history.into_iter().skip(1) { + let Value::WalRecord(rec) = val else { + panic!("invalid record") + }; + records.push((lsn, rec)); + } + } else { + for (_, lsn, val) in history.into_iter() { + let Value::WalRecord(rec) = val else { + panic!("invalid record") + }; + records.push((lsn, rec)); + } + } + records.reverse(); + let state = ValueReconstructState { img, records }; + let request_lsn = lsn_split_points[i]; // last batch does not generate image so i is always in range + let img = self.reconstruct_value(key, request_lsn, state).await?; + replay_history.push((key, request_lsn, Value::Image(img.clone()))); + retention.push(vec![(request_lsn, Value::Image(img))]); + } else { + retention.push( + split_for_lsn + .iter() + .map(|(_, lsn, value)| (*lsn, value.clone())) + .collect(), + ); + } + } + let mut result = Vec::with_capacity(retention.len()); + assert_eq!(retention.len(), lsn_split_points.len() + 1); + for (idx, logs) in retention.into_iter().enumerate() { + if idx == lsn_split_points.len() { + return Ok(KeyHistoryRetention { + below_horizon: result, + above_horizon: KeyLogAtLsn(logs), + }); + } else { + result.push((lsn_split_points[idx], KeyLogAtLsn(logs))); + } + } + unreachable!() + } + /// An experimental compaction building block that combines compaction with garbage collection. /// /// The current implementation picks all delta + image layers that are below or intersecting with @@ -1000,7 +1236,6 @@ impl Timeline { _cancel: &CancellationToken, ctx: &RequestContext, ) -> Result<(), CompactionError> { - use crate::tenant::storage_layer::ValueReconstructState; use std::collections::BTreeSet; info!("running enhanced gc bottom-most compaction"); @@ -1013,30 +1248,51 @@ impl Timeline { // The layer selection has the following properties: // 1. 
If a layer is in the selection, all layers below it are in the selection. // 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection. - let (layer_selection, gc_cutoff) = { + let (layer_selection, gc_cutoff, retain_lsns_below_horizon) = { let guard = self.layers.read().await; let layers = guard.layer_map(); let gc_info = self.gc_info.read().unwrap(); - if !gc_info.retain_lsns.is_empty() || !gc_info.leases.is_empty() { - return Err(CompactionError::Other(anyhow!( - "enhanced legacy compaction currently does not support retain_lsns (branches)" - ))); - } + let mut retain_lsns_below_horizon = Vec::new(); let gc_cutoff = gc_info.cutoffs.select_min(); + for lsn in &gc_info.retain_lsns { + if lsn < &gc_cutoff { + retain_lsns_below_horizon.push(*lsn); + } + } + for lsn in gc_info.leases.keys() { + if lsn < &gc_cutoff { + retain_lsns_below_horizon.push(*lsn); + } + } let mut selected_layers = Vec::new(); - // TODO: consider retain_lsns drop(gc_info); for desc in layers.iter_historic_layers() { if desc.get_lsn_range().start <= gc_cutoff { selected_layers.push(guard.get_from_desc(&desc)); } } - (selected_layers, gc_cutoff) + retain_lsns_below_horizon.sort(); + (selected_layers, gc_cutoff, retain_lsns_below_horizon) }; + let lowest_retain_lsn = retain_lsns_below_horizon + .first() + .copied() + .unwrap_or(gc_cutoff); + if cfg!(debug_assertions) { + assert_eq!( + lowest_retain_lsn, + retain_lsns_below_horizon + .iter() + .min() + .copied() + .unwrap_or(gc_cutoff) + ); + } info!( - "picked {} layers for compaction with gc_cutoff={}", + "picked {} layers for compaction with gc_cutoff={} lowest_retain_lsn={}", layer_selection.len(), - gc_cutoff + gc_cutoff, + lowest_retain_lsn ); // Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs. // Also, collect the layer information to decide when to split the new delta layers. @@ -1072,61 +1328,13 @@ impl Timeline { let mut accumulated_values = Vec::new(); let mut last_key: Option = None; - /// Take a list of images and deltas, produce an image at the GC horizon, and a list of deltas above the GC horizon. - async fn flush_accumulated_states( - tline: &Arc, - key: Key, - accumulated_values: &[(Key, Lsn, crate::repository::Value)], - horizon: Lsn, - ) -> anyhow::Result<(Vec<(Key, Lsn, crate::repository::Value)>, bytes::Bytes)> { - let mut base_image = None; - let mut keys_above_horizon = Vec::new(); - let mut delta_above_base_image = Vec::new(); - // We have a list of deltas/images. We want to create image layers while collect garbages. - for (key, lsn, val) in accumulated_values.iter().rev() { - if *lsn > horizon { - if let Some((_, prev_lsn, _)) = keys_above_horizon.last_mut() { - if *prev_lsn == *lsn { - // The case that we have an LSN with both data from the delta layer and the image layer. As - // `ValueWrapper` ensures that an image is ordered before a delta at the same LSN, we simply - // drop this delta and keep the image. - // - // For example, we have delta layer key1@0x10, key1@0x20, and image layer key1@0x10, we will - // keep the image for key1@0x10 and the delta for key1@0x20. key1@0x10 delta will be simply - // dropped. 
- continue; - } - } - keys_above_horizon.push((*key, *lsn, val.clone())); - } else if *lsn <= horizon { - match val { - crate::repository::Value::Image(image) => { - base_image = Some((*lsn, image.clone())); - break; - } - crate::repository::Value::WalRecord(wal) => { - delta_above_base_image.push((*lsn, wal.clone())); - } - } - } - } - // do not reverse delta_above_base_image, reconstruct state expects reversely-ordered records - keys_above_horizon.reverse(); - let state = ValueReconstructState { - img: base_image, - records: delta_above_base_image, - }; - let img = tline.reconstruct_value(key, horizon, state).await?; - Ok((keys_above_horizon, img)) - } - async fn flush_deltas( deltas: &mut Vec<(Key, Lsn, crate::repository::Value)>, last_key: Key, delta_split_points: &[Key], current_delta_split_point: &mut usize, tline: &Arc, - gc_cutoff: Lsn, + lowest_retain_lsn: Lsn, ctx: &RequestContext, ) -> anyhow::Result> { // Check if we need to split the delta layer. We split at the original delta layer boundary to avoid @@ -1161,7 +1369,7 @@ impl Timeline { tline.timeline_id, tline.tenant_shard_id, deltas.first().unwrap().0, - gc_cutoff..end_lsn, + lowest_retain_lsn..end_lsn, ctx, ) .await?; @@ -1178,7 +1386,7 @@ impl Timeline { self.timeline_id, self.tenant_shard_id, &(Key::MIN..Key::MAX), // covers the full key range - gc_cutoff, + lowest_retain_lsn, ctx, ) .await?; @@ -1195,12 +1403,19 @@ impl Timeline { accumulated_values.push((key, lsn, val)); } else { let last_key = last_key.as_mut().unwrap(); - let (deltas, image) = - flush_accumulated_states(self, *last_key, &accumulated_values, gc_cutoff) - .await?; + let retention = self + .generate_key_retention( + *last_key, + &accumulated_values, + gc_cutoff, + &retain_lsns_below_horizon, + COMPACTION_DELTA_THRESHOLD, + ) + .await?; // Put the image into the image layer. Currently we have a single big layer for the compaction. - image_layer_writer.put_image(*last_key, image, ctx).await?; - delta_values.extend(deltas); + retention + .pipe_to(*last_key, &mut delta_values, &mut image_layer_writer, ctx) + .await?; delta_layers.extend( flush_deltas( &mut delta_values, @@ -1208,7 +1423,7 @@ impl Timeline { &delta_split_points, &mut current_delta_split_point, self, - gc_cutoff, + lowest_retain_lsn, ctx, ) .await?, @@ -1221,11 +1436,19 @@ impl Timeline { let last_key = last_key.expect("no keys produced during compaction"); // TODO: move this part to the loop body - let (deltas, image) = - flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff).await?; + let retention = self + .generate_key_retention( + last_key, + &accumulated_values, + gc_cutoff, + &retain_lsns_below_horizon, + COMPACTION_DELTA_THRESHOLD, + ) + .await?; // Put the image into the image layer. Currently we have a single big layer for the compaction. 
- image_layer_writer.put_image(last_key, image, ctx).await?; - delta_values.extend(deltas); + retention + .pipe_to(last_key, &mut delta_values, &mut image_layer_writer, ctx) + .await?; delta_layers.extend( flush_deltas( &mut delta_values, @@ -1233,7 +1456,7 @@ impl Timeline { &delta_split_points, &mut current_delta_split_point, self, - gc_cutoff, + lowest_retain_lsn, ctx, ) .await?, From 2c0d311a54927dabea9ae4f97559a0d878f36d9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 24 Jul 2024 02:09:01 +0200 Subject: [PATCH 1254/1571] remote_storage: add list_streaming API call (#8466) This adds the ability to list many prefixes in a streaming fashion to both the `RemoteStorage` trait as well as `GenericRemoteStorage`. * The `list` function of the `RemoteStorage` trait is implemented by default in terms of `list_streaming`. * For the production users (S3, Azure), `list_streaming` is implemented and the default `list` implementation is used. * For `LocalFs`, we keep the `list` implementation and make `list_streaming` call it. The `list_streaming` function is implemented for both S3 and Azure. A TODO for later is retries, which the scrubber currently has while the `list_streaming` implementations lack them. part of #8457 and #7547 --- Cargo.lock | 1 + libs/remote_storage/Cargo.toml | 1 + libs/remote_storage/src/azure_blob.rs | 55 +++---- libs/remote_storage/src/lib.rs | 54 ++++++- libs/remote_storage/src/local_fs.rs | 11 ++ libs/remote_storage/src/s3_bucket.rs | 162 ++++++++++--------- libs/remote_storage/src/simulate_failures.rs | 18 +++ 7 files changed, 185 insertions(+), 117 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b03bd57631..df9efbf7cc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4648,6 +4648,7 @@ name = "remote_storage" version = "0.1.0" dependencies = [ "anyhow", + "async-stream", "async-trait", "aws-config", "aws-credential-types", diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 23d82b90bd..414bce1b26 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -7,6 +7,7 @@ license.workspace = true [dependencies] anyhow.workspace = true async-trait.workspace = true +async-stream.workspace = true once_cell.workspace = true aws-smithy-async.workspace = true aws-smithy-types.workspace = true diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index d0146238da..266a1f6584 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -267,30 +267,30 @@ fn to_download_error(error: azure_core::Error) -> DownloadError { } impl RemoteStorage for AzureBlobStorage { - async fn list( + fn list_streaming( &self, prefix: Option<&RemotePath>, mode: ListingMode, max_keys: Option, cancel: &CancellationToken, - ) -> anyhow::Result { - let _permit = self.permit(RequestKind::List, cancel).await?; + ) -> impl Stream> { + // get the passed prefix or if it is not set use prefix_in_bucket value + let list_prefix = prefix + .map(|p| self.relative_path_to_name(p)) + .or_else(|| self.prefix_in_container.clone()) + .map(|mut p| { + // required to end with a separator + // otherwise request will return only the entry of a prefix + if matches!(mode, ListingMode::WithDelimiter) + && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) + { + p.push(REMOTE_STORAGE_PREFIX_SEPARATOR); + } + p + }); - let op = async { - // get the passed prefix or if it is not set use prefix_in_bucket value - let list_prefix = prefix - .map(|p| self.relative_path_to_name(p)) - 
.or_else(|| self.prefix_in_container.clone()) - .map(|mut p| { - // required to end with a separator - // otherwise request will return only the entry of a prefix - if matches!(mode, ListingMode::WithDelimiter) - && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) - { - p.push(REMOTE_STORAGE_PREFIX_SEPARATOR); - } - p - }); + async_stream::stream! { + let _permit = self.permit(RequestKind::List, cancel).await?; let mut builder = self.client.list_blobs(); @@ -316,10 +316,12 @@ impl RemoteStorage for AzureBlobStorage { let mut response = std::pin::pin!(response); - let mut res = Listing::default(); - let mut max_keys = max_keys.map(|mk| mk.get()); - while let Some(entry) = response.next().await { + 'outer: while let Some(entry) = tokio::select! { + op = response.next() => Ok(op), + _ = cancel.cancelled() => Err(DownloadError::Cancelled), + }? { + let mut res = Listing::default(); let entry = entry?; let prefix_iter = entry .blobs @@ -339,19 +341,14 @@ impl RemoteStorage for AzureBlobStorage { assert!(mk > 0); mk -= 1; if mk == 0 { - return Ok(res); // limit reached + yield Ok(res); // limit reached + break 'outer; } max_keys = Some(mk); } } + yield Ok(res); } - - Ok(res) - }; - - tokio::select! { - res = op => res, - _ = cancel.cancelled() => Err(DownloadError::Cancelled), } } diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 3ee7d15a76..201e2fb178 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -26,7 +26,7 @@ use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; use bytes::Bytes; -use futures::stream::Stream; +use futures::{stream::Stream, StreamExt}; use serde::{Deserialize, Serialize}; use tokio::sync::Semaphore; use tokio_util::sync::CancellationToken; @@ -160,13 +160,15 @@ pub struct Listing { /// providing basic CRUD operations for storage files. #[allow(async_fn_in_trait)] pub trait RemoteStorage: Send + Sync + 'static { - /// List objects in remote storage, with semantics matching AWS S3's ListObjectsV2. - /// (see ``) + /// List objects in remote storage, with semantics matching AWS S3's [`ListObjectsV2`]. + /// + /// The stream is guaranteed to return at least one element, even in the case of errors + /// (in that case it's an `Err()`), or an empty `Listing`. /// /// Note that the prefix is relative to any `prefix_in_bucket` configured for the client, not /// from the absolute root of the bucket. /// - /// `mode` configures whether to use a delimiter. Without a delimiter all keys + /// `mode` configures whether to use a delimiter. Without a delimiter, all keys /// within the prefix are listed in the `keys` of the result. With a delimiter, any "directories" at the top level of /// the prefix are returned in the `prefixes` of the result, and keys in the top level of the prefix are /// returned in `keys` (). @@ -175,13 +177,31 @@ pub trait RemoteStorage: Send + Sync + 'static { /// will iteratively call listobjects until it runs out of keys. Note that this is not safe to use on /// unlimted size buckets, as the full list of objects is allocated into a monolithic data structure. 
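As a usage note for the `list`/`list_streaming` semantics described above, here is a minimal self-contained sketch of the batch-combining pattern that the default `list` implementation below is built on. The `Vec<String>` pages stand in for `Listing` batches, and the snippet assumes `futures` and `tokio` are available as dependencies.

```rust
// Minimal sketch of draining a paginated listing stream and merging the
// batches; `Vec<String>` pages stand in for `Listing` values.
use futures::{stream, Stream, StreamExt};

async fn collect_pages<E>(
    pages: impl Stream<Item = Result<Vec<String>, E>>,
) -> Result<Vec<String>, E> {
    let mut pages = std::pin::pin!(pages);
    let mut combined = Vec::new();
    while let Some(page) = pages.next().await {
        // Any error terminates the listing, mirroring the `?` in `list` below.
        combined.extend(page?);
    }
    Ok(combined)
}

#[tokio::main]
async fn main() -> Result<(), &'static str> {
    // Three "pages", roughly as a ListObjectsV2-style API would return them.
    let pages: Vec<Result<Vec<String>, &'static str>> = vec![
        Ok(vec!["a/1".into(), "a/2".into()]),
        Ok(vec!["a/3".into()]),
        Ok(vec![]),
    ];
    let keys = collect_pages(stream::iter(pages)).await?;
    assert_eq!(keys, ["a/1", "a/2", "a/3"]);
    Ok(())
}
```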
/// + /// [`ListObjectsV2`]: + fn list_streaming( + &self, + prefix: Option<&RemotePath>, + mode: ListingMode, + max_keys: Option, + cancel: &CancellationToken, + ) -> impl Stream>; + async fn list( &self, prefix: Option<&RemotePath>, - _mode: ListingMode, + mode: ListingMode, max_keys: Option, cancel: &CancellationToken, - ) -> Result; + ) -> Result { + let mut stream = std::pin::pin!(self.list_streaming(prefix, mode, max_keys, cancel)); + let mut combined = stream.next().await.expect("At least one item required")?; + while let Some(list) = stream.next().await { + let list = list?; + combined.keys.extend_from_slice(&list.keys); + combined.prefixes.extend_from_slice(&list.prefixes); + } + Ok(combined) + } /// Streams the local file contents into remote into the remote storage entry. /// @@ -288,8 +308,8 @@ impl Debug for Download { /// Every storage, currently supported. /// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics. -#[derive(Clone)] // Require Clone for `Other` due to https://github.com/rust-lang/rust/issues/26925 +#[derive(Clone)] pub enum GenericRemoteStorage> { LocalFs(LocalFs), AwsS3(Arc), @@ -298,13 +318,14 @@ pub enum GenericRemoteStorage> { } impl GenericRemoteStorage> { + // See [`RemoteStorage::list`]. pub async fn list( &self, prefix: Option<&RemotePath>, mode: ListingMode, max_keys: Option, cancel: &CancellationToken, - ) -> anyhow::Result { + ) -> Result { match self { Self::LocalFs(s) => s.list(prefix, mode, max_keys, cancel).await, Self::AwsS3(s) => s.list(prefix, mode, max_keys, cancel).await, @@ -313,6 +334,23 @@ impl GenericRemoteStorage> { } } + // See [`RemoteStorage::list_streaming`]. + pub fn list_streaming<'a>( + &'a self, + prefix: Option<&'a RemotePath>, + mode: ListingMode, + max_keys: Option, + cancel: &'a CancellationToken, + ) -> impl Stream> + 'a { + match self { + Self::LocalFs(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)) + as Pin>>>, + Self::AwsS3(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)), + Self::AzureBlob(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)), + Self::Unreliable(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)), + } + } + /// See [`RemoteStorage::upload`] pub async fn upload( &self, diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 1f7bcfc982..a4857b0bba 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -331,6 +331,17 @@ impl LocalFs { } impl RemoteStorage for LocalFs { + fn list_streaming( + &self, + prefix: Option<&RemotePath>, + mode: ListingMode, + max_keys: Option, + cancel: &CancellationToken, + ) -> impl Stream> { + let listing = self.list(prefix, mode, max_keys, cancel); + futures::stream::once(listing) + } + async fn list( &self, prefix: Option<&RemotePath>, diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 056646a01e..39106a4e53 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -467,17 +467,16 @@ impl>> Stream for TimedDownload { } impl RemoteStorage for S3Bucket { - async fn list( + fn list_streaming( &self, prefix: Option<&RemotePath>, mode: ListingMode, max_keys: Option, cancel: &CancellationToken, - ) -> Result { + ) -> impl Stream> { let kind = RequestKind::List; // s3 sdk wants i32 let mut max_keys = max_keys.map(|mk| mk.get() as i32); - let mut result = Listing::default(); // get the passed prefix or if it is not set use 
prefix_in_bucket value let list_prefix = prefix @@ -489,89 +488,92 @@ impl RemoteStorage for S3Bucket { }) }); - let _permit = self.permit(kind, cancel).await?; + async_stream::stream! { + let _permit = self.permit(kind, cancel).await?; - let mut continuation_token = None; + let mut continuation_token = None; + 'outer: loop { + let started_at = start_measuring_requests(kind); - loop { - let started_at = start_measuring_requests(kind); + // min of two Options, returning Some if one is value and another is + // None (None is smaller than anything, so plain min doesn't work). + let request_max_keys = self + .max_keys_per_list_response + .into_iter() + .chain(max_keys.into_iter()) + .min(); + let mut request = self + .client + .list_objects_v2() + .bucket(self.bucket_name.clone()) + .set_prefix(list_prefix.clone()) + .set_continuation_token(continuation_token) + .set_max_keys(request_max_keys); - // min of two Options, returning Some if one is value and another is - // None (None is smaller than anything, so plain min doesn't work). - let request_max_keys = self - .max_keys_per_list_response - .into_iter() - .chain(max_keys.into_iter()) - .min(); - let mut request = self - .client - .list_objects_v2() - .bucket(self.bucket_name.clone()) - .set_prefix(list_prefix.clone()) - .set_continuation_token(continuation_token) - .set_max_keys(request_max_keys); - - if let ListingMode::WithDelimiter = mode { - request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()); - } - - let request = request.send(); - - let response = tokio::select! { - res = request => res, - _ = tokio::time::sleep(self.timeout) => return Err(DownloadError::Timeout), - _ = cancel.cancelled() => return Err(DownloadError::Cancelled), - }; - - let response = response - .context("Failed to list S3 prefixes") - .map_err(DownloadError::Other); - - let started_at = ScopeGuard::into_inner(started_at); - - crate::metrics::BUCKET_METRICS - .req_seconds - .observe_elapsed(kind, &response, started_at); - - let response = response?; - - let keys = response.contents(); - let empty = Vec::new(); - let prefixes = response.common_prefixes.as_ref().unwrap_or(&empty); - - tracing::debug!("list: {} prefixes, {} keys", prefixes.len(), keys.len()); - - for object in keys { - let object_path = object.key().expect("response does not contain a key"); - let remote_path = self.s3_object_to_relative_path(object_path); - result.keys.push(remote_path); - if let Some(mut mk) = max_keys { - assert!(mk > 0); - mk -= 1; - if mk == 0 { - return Ok(result); // limit reached - } - max_keys = Some(mk); + if let ListingMode::WithDelimiter = mode { + request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()); } + + let request = request.send(); + + let response = tokio::select! 
{ + res = request => Ok(res), + _ = tokio::time::sleep(self.timeout) => Err(DownloadError::Timeout), + _ = cancel.cancelled() => Err(DownloadError::Cancelled), + }?; + + let response = response + .context("Failed to list S3 prefixes") + .map_err(DownloadError::Other); + + let started_at = ScopeGuard::into_inner(started_at); + + crate::metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, &response, started_at); + + let response = response?; + + let keys = response.contents(); + let prefixes = response.common_prefixes.as_deref().unwrap_or_default(); + + tracing::debug!("list: {} prefixes, {} keys", prefixes.len(), keys.len()); + let mut result = Listing::default(); + + for object in keys { + let object_path = object.key().expect("response does not contain a key"); + let remote_path = self.s3_object_to_relative_path(object_path); + result.keys.push(remote_path); + if let Some(mut mk) = max_keys { + assert!(mk > 0); + mk -= 1; + if mk == 0 { + // limit reached + yield Ok(result); + break 'outer; + } + max_keys = Some(mk); + } + } + + // S3 gives us prefixes like "foo/", we return them like "foo" + result.prefixes.extend(prefixes.iter().filter_map(|o| { + Some( + self.s3_object_to_relative_path( + o.prefix()? + .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR), + ), + ) + })); + + yield Ok(result); + + continuation_token = match response.next_continuation_token { + Some(new_token) => Some(new_token), + None => break, + }; } - - // S3 gives us prefixes like "foo/", we return them like "foo" - result.prefixes.extend(prefixes.iter().filter_map(|o| { - Some( - self.s3_object_to_relative_path( - o.prefix()? - .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR), - ), - ) - })); - - continuation_token = match response.next_continuation_token { - Some(new_token) => Some(new_token), - None => break, - }; } - - Ok(result) } async fn upload( diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index c467a2d196..67e5be2955 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -3,6 +3,7 @@ //! testing purposes. use bytes::Bytes; use futures::stream::Stream; +use futures::StreamExt; use std::collections::HashMap; use std::num::NonZeroU32; use std::sync::Mutex; @@ -107,6 +108,23 @@ impl UnreliableWrapper { type VoidStorage = crate::LocalFs; impl RemoteStorage for UnreliableWrapper { + fn list_streaming( + &self, + prefix: Option<&RemotePath>, + mode: ListingMode, + max_keys: Option, + cancel: &CancellationToken, + ) -> impl Stream> { + async_stream::stream! { + self.attempt(RemoteOp::ListPrefixes(prefix.cloned())) + .map_err(DownloadError::Other)?; + let mut stream = self.inner + .list_streaming(prefix, mode, max_keys, cancel); + while let Some(item) = stream.next().await { + yield item; + } + } + } async fn list( &self, prefix: Option<&RemotePath>, From b037ce07ec9d94aaa7e428acaef5ecf6b9eec624 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 24 Jul 2024 03:17:52 +0200 Subject: [PATCH 1255/1571] followup(#8475): also disable 'cache-to' for neon-image-arch and neon-test-extensions (#8478) PR #8475 only disabled it for compute-node-image-arch. Those are fast now, but we use cache-to in other places. 
--- .github/workflows/build_and_test.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 6407809432..fb456ce3ff 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -772,7 +772,8 @@ jobs: pull: true file: Dockerfile cache-from: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/neon:cache-{0},mode=max', matrix.arch) || '' }} + # 23.07.2024 temporarily disable cache saving in the registry as it is very slow + # cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/neon:cache-{0},mode=max', matrix.arch) || '' }} tags: | neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} @@ -886,7 +887,8 @@ jobs: file: Dockerfile.compute-node target: neon-pg-ext-test cache-from: type=registry,ref=neondatabase/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }} + # 23.07.2024 temporarily disable cache saving in the registry as it is very slow + # cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }} tags: | neondatabase/neon-test-extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}-${{ matrix.arch }} From 925c5ad1e8f007a4cb58c15fa19fc641a9ab696e Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 24 Jul 2024 09:59:18 +0300 Subject: [PATCH 1256/1571] Make async connect work on MacOS: it is necessary top call WaitLatchOrSocket before PQconnectPoll (#8472) ## Problem While investigating problem with test_subscriber_restart flukyness, I found out that this test is not passed at all for PG 14/15 at MacOS (while working for PG16). ## Summary of changes Rewrite async connect state machine exactly in the same way as in Vanilla: call `WaitLatchOrSocket` with `WL_SOCKETR_WRTEABLE` before calling `PQconnectPoll`. Please notice that most likely it will not fix flukyness of test_subscriber_restart. ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist Co-authored-by: Konstantin Knizhnik --- pgxn/neon/neon_walreader.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pgxn/neon/neon_walreader.c b/pgxn/neon/neon_walreader.c index 0f76514b86..b575712dbe 100644 --- a/pgxn/neon/neon_walreader.c +++ b/pgxn/neon/neon_walreader.c @@ -220,7 +220,8 @@ NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size cou return NEON_WALREAD_ERROR; } /* we'll poll immediately */ - state->rem_state = RS_CONNECTING_READ; + state->rem_state = RS_CONNECTING_WRITE; + return NEON_WALREAD_WOULDBLOCK; } if (state->rem_state == RS_CONNECTING_READ || state->rem_state == RS_CONNECTING_WRITE) From f5db655447b59366875adde2fc1176bf27dcd313 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 24 Jul 2024 08:17:28 +0100 Subject: [PATCH 1257/1571] pageserver: simplify LayerAccessStats (#8431) ## Problem LayerAccessStats contains a lot of detail that we don't use: short histories of most recent accesses, specifics on what kind of task accessed a layer, etc. This is all stored inside a Mutex, which is locked every time something accesses a layer. ## Summary of changes - Store timestamps at a very low resolution (to the nearest second), sufficient for use on the timescales of eviction. - Pack access time and last residence change time into a single u64 - Use the high bits of the u64 for other flags, including the new layer visibility concept. - Simplify the external-facing model for access stats to just include what we now track. Note that the `HistoryBufferWithDropCounter` is removed here because it is no longer used. I do not dislike this type, we just happen not to use it for anything else at present. 
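As a rough illustration of the packing scheme summarized above (not the actual `LayerAccessStats` implementation; the constants here are assumptions that mirror the diff below), two roughly one-second-resolution timestamps and a visibility flag can share a single `AtomicU64`:

```rust
// Hypothetical standalone model of the idea: two ~1s-resolution unix
// timestamps plus a visibility flag packed into one AtomicU64, so recording an
// access never takes a lock.
use std::sync::atomic::{AtomicU64, Ordering};
use std::time::{Duration, SystemTime, UNIX_EPOCH};

// Unix timestamps fit in 31 bits until 2038, and bits 29 and 30 of the seconds
// value have been 1 since early 2021, so 29 low bits round-trip a current
// timestamp; the dropped bits are restored on read.
const TS_MASK: u64 = 0x1f_ff_ff_ff;
const TS_ONES: u64 = 0x60_00_00_00;
const ATIME_SHIFT: u32 = 0; // last access time
const RTIME_SHIFT: u32 = 29; // last residence change time
const VISIBLE_BIT: u64 = 1 << 58; // one of the spare high bits

struct PackedAccessStats(AtomicU64);

impl PackedAccessStats {
    fn pack(time: SystemTime) -> u64 {
        time.duration_since(UNIX_EPOCH).unwrap().as_secs() & TS_MASK
    }

    fn set_bits(&self, mask: u64, value: u64) {
        let _ = self
            .0
            .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |v| {
                Some((v & !mask) | (value & mask))
            });
    }

    fn record_access(&self, now: SystemTime) {
        // An accessed layer is also marked visible, in the same atomic update.
        self.set_bits(
            (TS_MASK << ATIME_SHIFT) | VISIBLE_BIT,
            (Self::pack(now) << ATIME_SHIFT) | VISIBLE_BIT,
        );
    }

    fn record_residence_change(&self, now: SystemTime) {
        self.set_bits(TS_MASK << RTIME_SHIFT, Self::pack(now) << RTIME_SHIFT);
    }

    fn latest_access(&self) -> Option<SystemTime> {
        let bits = (self.0.load(Ordering::Relaxed) >> ATIME_SHIFT) & TS_MASK;
        (bits != 0).then(|| UNIX_EPOCH + Duration::from_secs(bits | TS_ONES))
    }

    fn visible(&self) -> bool {
        self.0.load(Ordering::Relaxed) & VISIBLE_BIT != 0
    }
}

fn main() {
    let stats = PackedAccessStats(AtomicU64::new(0));
    assert!(stats.latest_access().is_none() && !stats.visible());

    stats.record_residence_change(SystemTime::now());
    let now = SystemTime::now();
    stats.record_access(now);

    // Round-trips at one-second resolution and sets the visibility flag.
    let seen = stats.latest_access().unwrap();
    assert!(now.duration_since(seen).unwrap() < Duration::from_secs(2));
    assert!(stats.visible());
}
```

The point of the single-word encoding is that hot paths can record an access with one relaxed atomic update instead of locking a per-layer mutex on every read.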
Co-authored-by: Christian Schwarz --- Cargo.lock | 46 --- Cargo.toml | 3 - libs/pageserver_api/src/models.rs | 78 +---- libs/utils/Cargo.toml | 1 - libs/utils/src/history_buffer.rs | 196 ----------- libs/utils/src/lib.rs | 2 - pageserver/src/tenant/storage_layer.rs | 311 +++++++----------- .../src/tenant/storage_layer/delta_layer.rs | 14 +- .../src/tenant/storage_layer/image_layer.rs | 13 +- pageserver/src/tenant/storage_layer/layer.rs | 54 +-- .../src/tenant/storage_layer/layer/tests.rs | 51 ++- pageserver/src/tenant/timeline.rs | 4 +- .../src/tenant/timeline/eviction_task.rs | 2 +- .../regress/test_disk_usage_eviction.py | 9 + .../regress/test_threshold_based_eviction.py | 4 +- 15 files changed, 209 insertions(+), 579 deletions(-) delete mode 100644 libs/utils/src/history_buffer.rs diff --git a/Cargo.lock b/Cargo.lock index df9efbf7cc..2b56095bc8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -261,15 +261,6 @@ version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c59bdb34bc650a32731b31bd8f0829cc15d24a708ee31559e0bb34f2bc320cba" -[[package]] -name = "atomic-polyfill" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c314e70d181aa6053b26e3f7fbf86d1dfff84f816a6175b967666b3506ef7289" -dependencies = [ - "critical-section", -] - [[package]] name = "atomic-take" version = "1.1.0" @@ -1451,12 +1442,6 @@ dependencies = [ "itertools", ] -[[package]] -name = "critical-section" -version = "1.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6548a0ad5d2549e111e1f6a11a6c2e2d00ce6a3dafe22948d67c2b443f775e52" - [[package]] name = "crossbeam-channel" version = "0.5.8" @@ -2282,15 +2267,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "hash32" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "47d60b12902ba28e2730cd37e95b8c9223af2808df9e902d4df49588d1470606" -dependencies = [ - "byteorder", -] - [[package]] name = "hashbrown" version = "0.12.3" @@ -2339,18 +2315,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "heapless" -version = "0.8.0" -source = "git+https://github.com/japaric/heapless.git?rev=644653bf3b831c6bb4963be2de24804acf5e5001#644653bf3b831c6bb4963be2de24804acf5e5001" -dependencies = [ - "atomic-polyfill", - "hash32", - "rustc_version", - "spin 0.9.8", - "stable_deref_trait", -] - [[package]] name = "heck" version = "0.4.1" @@ -5691,9 +5655,6 @@ name = "spin" version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" -dependencies = [ - "lock_api", -] [[package]] name = "spki" @@ -5715,12 +5676,6 @@ dependencies = [ "der 0.7.8", ] -[[package]] -name = "stable_deref_trait" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" - [[package]] name = "static_assertions" version = "1.1.0" @@ -6817,7 +6772,6 @@ dependencies = [ "criterion", "fail", "futures", - "heapless", "hex", "hex-literal", "humantime", diff --git a/Cargo.toml b/Cargo.toml index 615f5472ec..7749378114 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -204,9 +204,6 @@ postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } -## Other git 
libraries -heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending - ## Local libraries compute_api = { version = "0.1", path = "./libs/compute_api/" } consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" } diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 231a604b47..591c45d908 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -5,7 +5,6 @@ pub mod utilization; pub use utilization::PageserverUtilization; use std::{ - borrow::Cow, collections::HashMap, io::{BufRead, Read}, num::{NonZeroU64, NonZeroUsize}, @@ -20,7 +19,6 @@ use serde::{Deserialize, Serialize}; use serde_with::serde_as; use utils::{ completion, - history_buffer::HistoryBufferWithDropCounter, id::{NodeId, TenantId, TimelineId}, lsn::Lsn, serde_system_time, @@ -726,58 +724,7 @@ pub struct LayerMapInfo { pub historic_layers: Vec, } -#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy, Serialize, Deserialize, enum_map::Enum)] -#[repr(usize)] -pub enum LayerAccessKind { - GetValueReconstructData, - Iter, - KeyIter, - Dump, -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct LayerAccessStatFullDetails { - pub when_millis_since_epoch: u64, - pub task_kind: Cow<'static, str>, - pub access_kind: LayerAccessKind, -} - -/// An event that impacts the layer's residence status. -#[serde_as] -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct LayerResidenceEvent { - /// The time when the event occurred. - /// NB: this timestamp is captured while the residence status changes. - /// So, it might be behind/ahead of the actual residence change by a short amount of time. - /// - #[serde(rename = "timestamp_millis_since_epoch")] - #[serde_as(as = "serde_with::TimestampMilliSeconds")] - pub timestamp: SystemTime, - /// The new residence status of the layer. - pub status: LayerResidenceStatus, - /// The reason why we had to record this event. - pub reason: LayerResidenceEventReason, -} - -/// The reason for recording a given [`LayerResidenceEvent`]. -#[derive(Debug, Clone, Copy, Serialize, Deserialize)] -pub enum LayerResidenceEventReason { - /// The layer map is being populated, e.g. during timeline load or attach. - /// This includes [`RemoteLayer`] objects created in [`reconcile_with_remote`]. - /// We need to record such events because there is no persistent storage for the events. - /// - // https://github.com/rust-lang/rust/issues/74481 - /// [`RemoteLayer`]: ../../tenant/storage_layer/struct.RemoteLayer.html - /// [`reconcile_with_remote`]: ../../tenant/struct.Timeline.html#method.reconcile_with_remote - LayerLoad, - /// We just created the layer (e.g., freeze_and_flush or compaction). - /// Such layers are always [`LayerResidenceStatus::Resident`]. - LayerCreate, - /// We on-demand downloaded or evicted the given layer. - ResidenceChange, -} - -/// The residence status of the layer, after the given [`LayerResidenceEvent`]. +/// The residence status of a layer #[derive(Debug, Clone, Copy, Serialize, Deserialize)] pub enum LayerResidenceStatus { /// Residence status for a layer file that exists locally. 
@@ -787,23 +734,16 @@ pub enum LayerResidenceStatus { Evicted, } -impl LayerResidenceEvent { - pub fn new(status: LayerResidenceStatus, reason: LayerResidenceEventReason) -> Self { - Self { - status, - reason, - timestamp: SystemTime::now(), - } - } -} - +#[serde_as] #[derive(Debug, Clone, Serialize, Deserialize)] pub struct LayerAccessStats { - pub access_count_by_access_kind: HashMap, - pub task_kind_access_flag: Vec>, - pub first: Option, - pub accesses_history: HistoryBufferWithDropCounter, - pub residence_events_history: HistoryBufferWithDropCounter, + #[serde_as(as = "serde_with::TimestampMilliSeconds")] + pub access_time: SystemTime, + + #[serde_as(as = "serde_with::TimestampMilliSeconds")] + pub residence_time: SystemTime, + + pub visible: bool, } #[derive(Debug, Clone, Serialize, Deserialize)] diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 261ca2cc1a..ec05f849cf 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -20,7 +20,6 @@ bincode.workspace = true bytes.workspace = true camino.workspace = true chrono.workspace = true -heapless.workspace = true hex = { workspace = true, features = ["serde"] } humantime.workspace = true hyper = { workspace = true, features = ["full"] } diff --git a/libs/utils/src/history_buffer.rs b/libs/utils/src/history_buffer.rs deleted file mode 100644 index bd35e2bad6..0000000000 --- a/libs/utils/src/history_buffer.rs +++ /dev/null @@ -1,196 +0,0 @@ -//! A heapless buffer for events of sorts. - -use std::ops; - -use heapless::HistoryBuffer; - -#[derive(Debug, Clone)] -pub struct HistoryBufferWithDropCounter { - buffer: HistoryBuffer, - drop_count: u64, -} - -impl HistoryBufferWithDropCounter { - pub fn write(&mut self, data: T) { - let len_before = self.buffer.len(); - self.buffer.write(data); - let len_after = self.buffer.len(); - self.drop_count += u64::from(len_before == len_after); - } - pub fn drop_count(&self) -> u64 { - self.drop_count - } - pub fn map U>(&self, f: F) -> HistoryBufferWithDropCounter { - let mut buffer = HistoryBuffer::new(); - buffer.extend(self.buffer.oldest_ordered().map(f)); - HistoryBufferWithDropCounter:: { - buffer, - drop_count: self.drop_count, - } - } -} - -impl Default for HistoryBufferWithDropCounter { - fn default() -> Self { - Self { - buffer: HistoryBuffer::default(), - drop_count: 0, - } - } -} - -impl ops::Deref for HistoryBufferWithDropCounter { - type Target = HistoryBuffer; - - fn deref(&self) -> &Self::Target { - &self.buffer - } -} - -#[derive(serde::Serialize, serde::Deserialize)] -struct SerdeRepr { - buffer: Vec, - buffer_size: usize, - drop_count: u64, -} - -impl<'a, T, const L: usize> From<&'a HistoryBufferWithDropCounter> for SerdeRepr -where - T: Clone + serde::Serialize, -{ - fn from(value: &'a HistoryBufferWithDropCounter) -> Self { - let HistoryBufferWithDropCounter { buffer, drop_count } = value; - SerdeRepr { - buffer: buffer.iter().cloned().collect(), - buffer_size: L, - drop_count: *drop_count, - } - } -} - -impl serde::Serialize for HistoryBufferWithDropCounter -where - T: Clone + serde::Serialize, -{ - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - SerdeRepr::from(self).serialize(serializer) - } -} - -impl<'de, T, const L: usize> serde::de::Deserialize<'de> for HistoryBufferWithDropCounter -where - T: Clone + serde::Deserialize<'de>, -{ - fn deserialize(deserializer: D) -> Result - where - D: serde::Deserializer<'de>, - { - let SerdeRepr { - buffer: des_buffer, - drop_count, - buffer_size, - } = 
SerdeRepr::::deserialize(deserializer)?; - if buffer_size != L { - use serde::de::Error; - return Err(D::Error::custom(format!( - "invalid buffer_size, expecting {L} got {buffer_size}" - ))); - } - let mut buffer = HistoryBuffer::new(); - buffer.extend(des_buffer); - Ok(HistoryBufferWithDropCounter { buffer, drop_count }) - } -} - -#[cfg(test)] -mod test { - use super::HistoryBufferWithDropCounter; - - #[test] - fn test_basics() { - let mut b = HistoryBufferWithDropCounter::::default(); - b.write(1); - b.write(2); - b.write(3); - assert!(b.iter().any(|e| *e == 2)); - assert!(b.iter().any(|e| *e == 3)); - assert!(!b.iter().any(|e| *e == 1)); - - // round-trip serde - let round_tripped: HistoryBufferWithDropCounter = - serde_json::from_str(&serde_json::to_string(&b).unwrap()).unwrap(); - assert_eq!( - round_tripped.iter().cloned().collect::>(), - b.iter().cloned().collect::>() - ); - } - - #[test] - fn test_drop_count_works() { - let mut b = HistoryBufferWithDropCounter::<_, 2>::default(); - b.write(1); - assert_eq!(b.drop_count(), 0); - b.write(2); - assert_eq!(b.drop_count(), 0); - b.write(3); - assert_eq!(b.drop_count(), 1); - b.write(4); - assert_eq!(b.drop_count(), 2); - } - - #[test] - fn test_clone_works() { - let mut b = HistoryBufferWithDropCounter::<_, 2>::default(); - b.write(1); - b.write(2); - b.write(3); - assert_eq!(b.drop_count(), 1); - let mut c = b.clone(); - assert_eq!(c.drop_count(), 1); - assert!(c.iter().any(|e| *e == 2)); - assert!(c.iter().any(|e| *e == 3)); - assert!(!c.iter().any(|e| *e == 1)); - - c.write(4); - assert!(c.iter().any(|e| *e == 4)); - assert!(!b.iter().any(|e| *e == 4)); - } - - #[test] - fn test_map() { - let mut b = HistoryBufferWithDropCounter::<_, 2>::default(); - - b.write(1); - assert_eq!(b.drop_count(), 0); - { - let c = b.map(|i| i + 10); - assert_eq!(c.oldest_ordered().cloned().collect::>(), vec![11]); - assert_eq!(c.drop_count(), 0); - } - - b.write(2); - assert_eq!(b.drop_count(), 0); - { - let c = b.map(|i| i + 10); - assert_eq!( - c.oldest_ordered().cloned().collect::>(), - vec![11, 12] - ); - assert_eq!(c.drop_count(), 0); - } - - b.write(3); - assert_eq!(b.drop_count(), 1); - { - let c = b.map(|i| i + 10); - assert_eq!( - c.oldest_ordered().cloned().collect::>(), - vec![12, 13] - ); - assert_eq!(c.drop_count(), 1); - } - } -} diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 9ad1752fb7..a46d68ef33 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -59,8 +59,6 @@ pub mod signals; pub mod fs_ext; -pub mod history_buffer; - pub mod measured_stream; pub mod serde_percent; diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 3404308e56..f931341aca 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -10,29 +10,18 @@ pub mod merge_iterator; use crate::context::{AccessStatsBehavior, RequestContext}; use crate::repository::Value; -use crate::task_mgr::TaskKind; use crate::walrecord::NeonWalRecord; use bytes::Bytes; -use enum_map::EnumMap; -use enumset::EnumSet; -use once_cell::sync::Lazy; use pageserver_api::key::Key; use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; -use pageserver_api::models::{ - LayerAccessKind, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus, -}; -use std::borrow::Cow; use std::cmp::{Ordering, Reverse}; use std::collections::hash_map::Entry; use std::collections::{BinaryHeap, HashMap}; use std::ops::Range; -use std::sync::{Arc, Mutex}; +use std::sync::Arc; use 
std::time::{Duration, SystemTime, UNIX_EPOCH}; -use tracing::warn; -use utils::history_buffer::HistoryBufferWithDropCounter; -use utils::rate_limit::RateLimit; -use utils::{id::TimelineId, lsn::Lsn}; +use utils::lsn::Lsn; pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef}; pub use image_layer::{ImageLayer, ImageLayerWriter}; @@ -75,9 +64,9 @@ where /// call, to collect more records. /// #[derive(Debug, Default)] -pub struct ValueReconstructState { - pub records: Vec<(Lsn, NeonWalRecord)>, - pub img: Option<(Lsn, Bytes)>, +pub(crate) struct ValueReconstructState { + pub(crate) records: Vec<(Lsn, NeonWalRecord)>, + pub(crate) img: Option<(Lsn, Bytes)>, } #[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] @@ -462,7 +451,7 @@ pub enum ValueReconstructResult { /// than an authoritative value, so that we do not have to update it synchronously when changing the visibility /// of layers (for example when creating a branch that makes some previously covered layers visible). It should /// be used for cache management but not for correctness-critical checks. -#[derive(Default, Debug, Clone)] +#[derive(Default, Debug, Clone, PartialEq, Eq)] pub(crate) enum LayerVisibilityHint { /// A Visible layer might be read while serving a read, because there is not an image layer between it /// and a readable LSN (the tip of the branch or a child's branch point) @@ -478,95 +467,72 @@ pub(crate) enum LayerVisibilityHint { Uninitialized, } -#[derive(Debug)] -pub struct LayerAccessStats(Mutex); - -/// This struct holds two instances of [`LayerAccessStatsInner`]. -/// Accesses are recorded to both instances. -/// The `for_scraping_api`instance can be reset from the management API via [`LayerAccessStatsReset`]. -/// The `for_eviction_policy` is never reset. -#[derive(Debug, Default, Clone)] -struct LayerAccessStatsLocked { - for_scraping_api: LayerAccessStatsInner, - for_eviction_policy: LayerAccessStatsInner, - visibility: LayerVisibilityHint, -} - -impl LayerAccessStatsLocked { - fn iter_mut(&mut self) -> impl Iterator { - [&mut self.for_scraping_api, &mut self.for_eviction_policy].into_iter() - } -} - -#[derive(Debug, Default, Clone)] -struct LayerAccessStatsInner { - first_access: Option, - count_by_access_kind: EnumMap, - task_kind_flag: EnumSet, - last_accesses: HistoryBufferWithDropCounter, - last_residence_changes: HistoryBufferWithDropCounter, -} - -#[derive(Debug, Clone, Copy)] -pub(crate) struct LayerAccessStatFullDetails { - pub(crate) when: SystemTime, - pub(crate) task_kind: TaskKind, - pub(crate) access_kind: LayerAccessKind, -} +pub(crate) struct LayerAccessStats(std::sync::atomic::AtomicU64); #[derive(Clone, Copy, strum_macros::EnumString)] -pub enum LayerAccessStatsReset { +pub(crate) enum LayerAccessStatsReset { NoReset, - JustTaskKindFlags, AllStats, } -fn system_time_to_millis_since_epoch(ts: &SystemTime) -> u64 { - ts.duration_since(UNIX_EPOCH) - .expect("better to die in this unlikely case than report false stats") - .as_millis() - .try_into() - .expect("64 bits is enough for few more years") -} +impl Default for LayerAccessStats { + fn default() -> Self { + // Default value is to assume resident since creation time, and visible. 
+ let (_mask, mut value) = Self::to_low_res_timestamp(Self::RTIME_SHIFT, SystemTime::now()); + value |= 0x1 << Self::VISIBILITY_SHIFT; -impl LayerAccessStatFullDetails { - fn as_api_model(&self) -> pageserver_api::models::LayerAccessStatFullDetails { - let Self { - when, - task_kind, - access_kind, - } = self; - pageserver_api::models::LayerAccessStatFullDetails { - when_millis_since_epoch: system_time_to_millis_since_epoch(when), - task_kind: Cow::Borrowed(task_kind.into()), // into static str, powered by strum_macros - access_kind: *access_kind, - } + Self(std::sync::atomic::AtomicU64::new(value)) } } +// Efficient store of two very-low-resolution timestamps and some bits. Used for storing last access time and +// last residence change time. impl LayerAccessStats { - /// Create an empty stats object. - /// - /// The caller is responsible for recording a residence event - /// using [`record_residence_event`] before calling `latest_activity`. - /// If they don't, [`latest_activity`] will return `None`. - /// - /// [`record_residence_event`]: Self::record_residence_event - /// [`latest_activity`]: Self::latest_activity - pub(crate) fn empty_will_record_residence_event_later() -> Self { - LayerAccessStats(Mutex::default()) + // How many high bits to drop from a u32 timestamp? + // - Only storing up to a u32 timestamp will work fine until 2038 (if this code is still in use + // after that, this software has been very successful!) + // - Dropping the top bit is implicitly safe because unix timestamps are meant to be + // stored in an i32, so they never used it. + // - Dropping the next two bits is safe because this code is only running on systems in + // years >= 2024, and these bits have been 1 since 2021 + // + // Therefore we may store only 28 bits for a timestamp with one second resolution. We do + // this truncation to make space for some flags in the high bits of our u64. + const TS_DROP_HIGH_BITS: u32 = u32::count_ones(Self::TS_ONES) + 1; + const TS_MASK: u32 = 0x1f_ff_ff_ff; + const TS_ONES: u32 = 0x60_00_00_00; + + const ATIME_SHIFT: u32 = 0; + const RTIME_SHIFT: u32 = 32 - Self::TS_DROP_HIGH_BITS; + const VISIBILITY_SHIFT: u32 = 64 - 2 * Self::TS_DROP_HIGH_BITS; + + fn write_bits(&self, mask: u64, value: u64) -> u64 { + self.0 + .fetch_update( + // TODO: decide what orderings are correct + std::sync::atomic::Ordering::Relaxed, + std::sync::atomic::Ordering::Relaxed, + |v| Some((v & !mask) | (value & mask)), + ) + .expect("Inner function is infallible") } - /// Create an empty stats object and record a [`LayerLoad`] event with the given residence status. - /// - /// See [`record_residence_event`] for why you need to do this while holding the layer map lock. 
- /// - /// [`LayerLoad`]: LayerResidenceEventReason::LayerLoad - /// [`record_residence_event`]: Self::record_residence_event - pub(crate) fn for_loading_layer(status: LayerResidenceStatus) -> Self { - let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default())); - new.record_residence_event(status, LayerResidenceEventReason::LayerLoad); - new + fn to_low_res_timestamp(shift: u32, time: SystemTime) -> (u64, u64) { + // Drop the low three bits of the timestamp, for an ~8s accuracy + let timestamp = time.duration_since(UNIX_EPOCH).unwrap().as_secs() & (Self::TS_MASK as u64); + + ((Self::TS_MASK as u64) << shift, timestamp << shift) + } + + fn read_low_res_timestamp(&self, shift: u32) -> Option { + let read = self.0.load(std::sync::atomic::Ordering::Relaxed); + + let ts_bits = (read & ((Self::TS_MASK as u64) << shift)) >> shift; + if ts_bits == 0 { + None + } else { + Some(UNIX_EPOCH + Duration::from_secs(ts_bits | (Self::TS_ONES as u64))) + } } /// Record a change in layer residency. @@ -582,123 +548,64 @@ impl LayerAccessStats { /// - Eviction: imitate access logical size calculation. This accesses the L0 layers because the L1 layer is not yet in the layer map. /// - Compact: Grab layer map lock, add the new L1 to layer map and remove the L0s, release layer map lock. /// - Eviction: observes the new L1 layer whose only activity timestamp is the LayerCreate event. - /// - pub(crate) fn record_residence_event( - &self, - status: LayerResidenceStatus, - reason: LayerResidenceEventReason, - ) { - let mut locked = self.0.lock().unwrap(); - locked.iter_mut().for_each(|inner| { - inner - .last_residence_changes - .write(LayerResidenceEvent::new(status, reason)) - }); + pub(crate) fn record_residence_event_at(&self, now: SystemTime) { + let (mask, value) = Self::to_low_res_timestamp(Self::RTIME_SHIFT, now); + self.write_bits(mask, value); } - fn record_access(&self, access_kind: LayerAccessKind, ctx: &RequestContext) { + pub(crate) fn record_residence_event(&self) { + self.record_residence_event_at(SystemTime::now()) + } + + pub(crate) fn record_access_at(&self, now: SystemTime) { + let (mut mask, mut value) = Self::to_low_res_timestamp(Self::ATIME_SHIFT, now); + + // A layer which is accessed must be visible. 
+ mask |= 0x1 << Self::VISIBILITY_SHIFT; + value |= 0x1 << Self::VISIBILITY_SHIFT; + + self.write_bits(mask, value); + } + + pub(crate) fn record_access(&self, ctx: &RequestContext) { if ctx.access_stats_behavior() == AccessStatsBehavior::Skip { return; } - let this_access = LayerAccessStatFullDetails { - when: SystemTime::now(), - task_kind: ctx.task_kind(), - access_kind, - }; - - let mut locked = self.0.lock().unwrap(); - locked.iter_mut().for_each(|inner| { - inner.first_access.get_or_insert(this_access); - inner.count_by_access_kind[access_kind] += 1; - inner.task_kind_flag |= ctx.task_kind(); - inner.last_accesses.write(this_access); - }); - - // We may access a layer marked as Covered, if a new branch was created that depends on - // this layer, and background updates to layer visibility didn't notice it yet - if !matches!(locked.visibility, LayerVisibilityHint::Visible) { - locked.visibility = LayerVisibilityHint::Visible; - } + self.record_access_at(SystemTime::now()) } fn as_api_model( &self, reset: LayerAccessStatsReset, ) -> pageserver_api::models::LayerAccessStats { - let mut locked = self.0.lock().unwrap(); - let inner = &mut locked.for_scraping_api; - let LayerAccessStatsInner { - first_access, - count_by_access_kind, - task_kind_flag, - last_accesses, - last_residence_changes, - } = inner; let ret = pageserver_api::models::LayerAccessStats { - access_count_by_access_kind: count_by_access_kind - .iter() - .map(|(kind, count)| (kind, *count)) - .collect(), - task_kind_access_flag: task_kind_flag - .iter() - .map(|task_kind| Cow::Borrowed(task_kind.into())) // into static str, powered by strum_macros - .collect(), - first: first_access.as_ref().map(|a| a.as_api_model()), - accesses_history: last_accesses.map(|m| m.as_api_model()), - residence_events_history: last_residence_changes.clone(), + access_time: self + .read_low_res_timestamp(Self::ATIME_SHIFT) + .unwrap_or(UNIX_EPOCH), + residence_time: self + .read_low_res_timestamp(Self::RTIME_SHIFT) + .unwrap_or(UNIX_EPOCH), + visible: matches!(self.visibility(), LayerVisibilityHint::Visible), }; match reset { - LayerAccessStatsReset::NoReset => (), - LayerAccessStatsReset::JustTaskKindFlags => { - inner.task_kind_flag.clear(); - } + LayerAccessStatsReset::NoReset => {} LayerAccessStatsReset::AllStats => { - *inner = LayerAccessStatsInner::default(); + self.write_bits((Self::TS_MASK as u64) << Self::ATIME_SHIFT, 0x0); + self.write_bits((Self::TS_MASK as u64) << Self::RTIME_SHIFT, 0x0); } } ret } - /// Get the latest access timestamp, falling back to latest residence event, further falling - /// back to `SystemTime::now` for a usable timestamp for eviction. - pub(crate) fn latest_activity_or_now(&self) -> SystemTime { - self.latest_activity().unwrap_or_else(SystemTime::now) - } - - /// Get the latest access timestamp, falling back to latest residence event. - /// - /// This function can only return `None` if there has not yet been a call to the - /// [`record_residence_event`] method. That would generally be considered an - /// implementation error. This function logs a rate-limited warning in that case. - /// - /// TODO: use type system to avoid the need for `fallback`. - /// The approach in - /// could be used to enforce that a residence event is recorded - /// before a layer is added to the layer map. We could also have - /// a layer wrapper type that holds the LayerAccessStats, and ensure - /// that that type can only be produced by inserting into the layer map. 
- /// - /// [`record_residence_event`]: Self::record_residence_event - fn latest_activity(&self) -> Option { - let locked = self.0.lock().unwrap(); - let inner = &locked.for_eviction_policy; - match inner.last_accesses.recent() { - Some(a) => Some(a.when), - None => match inner.last_residence_changes.recent() { - Some(e) => Some(e.timestamp), - None => { - static WARN_RATE_LIMIT: Lazy> = - Lazy::new(|| Mutex::new((0, RateLimit::new(Duration::from_secs(10))))); - let mut guard = WARN_RATE_LIMIT.lock().unwrap(); - guard.0 += 1; - let occurences = guard.0; - guard.1.call(move || { - warn!(parent: None, occurences, "latest_activity not available, this is an implementation bug, using fallback value"); - }); - None - } - }, + /// Get the latest access timestamp, falling back to latest residence event. The latest residence event + /// will be this Layer's construction time, if its residence hasn't changed since then. + pub(crate) fn latest_activity(&self) -> SystemTime { + if let Some(t) = self.read_low_res_timestamp(Self::ATIME_SHIFT) { + t + } else { + self.read_low_res_timestamp(Self::RTIME_SHIFT) + .expect("Residence time is set on construction") } } @@ -707,38 +614,46 @@ impl LayerAccessStats { /// This indicates whether the layer has been used for some purpose that would motivate /// us to keep it on disk, such as for serving a getpage request. fn accessed(&self) -> bool { - let locked = self.0.lock().unwrap(); - let inner = &locked.for_eviction_policy; - // Consider it accessed if the most recent access is more recent than // the most recent change in residence status. match ( - inner.last_accesses.recent(), - inner.last_residence_changes.recent(), + self.read_low_res_timestamp(Self::ATIME_SHIFT), + self.read_low_res_timestamp(Self::RTIME_SHIFT), ) { (None, _) => false, (Some(_), None) => true, - (Some(a), Some(r)) => a.when >= r.timestamp, + (Some(a), Some(r)) => a >= r, } } pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) { - self.0.lock().unwrap().visibility = visibility; + let value = match visibility { + LayerVisibilityHint::Visible => 0x1 << Self::VISIBILITY_SHIFT, + LayerVisibilityHint::Covered | LayerVisibilityHint::Uninitialized => 0x0, + }; + + self.write_bits(0x1 << Self::VISIBILITY_SHIFT, value); } pub(crate) fn visibility(&self) -> LayerVisibilityHint { - self.0.lock().unwrap().visibility.clone() + let read = self.0.load(std::sync::atomic::Ordering::Relaxed); + match (read >> Self::VISIBILITY_SHIFT) & 0x1 { + 1 => LayerVisibilityHint::Visible, + 0 => LayerVisibilityHint::Covered, + _ => unreachable!(), + } } } /// Get a layer descriptor from a layer. -pub trait AsLayerDesc { +pub(crate) trait AsLayerDesc { /// Get the layer descriptor. 
fn layer_desc(&self) -> &PersistentLayerDesc; } pub mod tests { use pageserver_api::shard::TenantShardId; + use utils::id::TimelineId; use super::*; diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index c73059c34a..586a7b7836 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -52,7 +52,7 @@ use camino::{Utf8Path, Utf8PathBuf}; use futures::StreamExt; use itertools::Itertools; use pageserver_api::keyspace::KeySpace; -use pageserver_api::models::{ImageCompressionAlgorithm, LayerAccessKind}; +use pageserver_api::models::ImageCompressionAlgorithm; use pageserver_api::shard::TenantShardId; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; @@ -265,7 +265,7 @@ impl DeltaLayer { return Ok(()); } - let inner = self.load(LayerAccessKind::Dump, ctx).await?; + let inner = self.load(ctx).await?; inner.dump(ctx).await } @@ -298,12 +298,8 @@ impl DeltaLayer { /// Open the underlying file and read the metadata into memory, if it's /// not loaded already. /// - async fn load( - &self, - access_kind: LayerAccessKind, - ctx: &RequestContext, - ) -> Result<&Arc> { - self.access_stats.record_access(access_kind, ctx); + async fn load(&self, ctx: &RequestContext) -> Result<&Arc> { + self.access_stats.record_access(ctx); // Quick exit if already loaded self.inner .get_or_try_init(|| self.load_inner(ctx)) @@ -356,7 +352,7 @@ impl DeltaLayer { summary.lsn_range, metadata.len(), ), - access_stats: LayerAccessStats::empty_will_record_residence_event_later(), + access_stats: Default::default(), inner: OnceCell::new(), }) } diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 19e4e9e2e9..e5e7f71928 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -49,7 +49,6 @@ use camino::{Utf8Path, Utf8PathBuf}; use hex; use itertools::Itertools; use pageserver_api::keyspace::KeySpace; -use pageserver_api::models::LayerAccessKind; use pageserver_api::shard::{ShardIdentity, TenantShardId}; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; @@ -228,7 +227,7 @@ impl ImageLayer { return Ok(()); } - let inner = self.load(LayerAccessKind::Dump, ctx).await?; + let inner = self.load(ctx).await?; inner.dump(ctx).await?; @@ -255,12 +254,8 @@ impl ImageLayer { /// Open the underlying file and read the metadata into memory, if it's /// not loaded already. /// - async fn load( - &self, - access_kind: LayerAccessKind, - ctx: &RequestContext, - ) -> Result<&ImageLayerInner> { - self.access_stats.record_access(access_kind, ctx); + async fn load(&self, ctx: &RequestContext) -> Result<&ImageLayerInner> { + self.access_stats.record_access(ctx); self.inner .get_or_try_init(|| self.load_inner(ctx)) .await @@ -312,7 +307,7 @@ impl ImageLayer { metadata.len(), ), // Now we assume image layer ALWAYS covers the full range. This may change in the future. 
lsn: summary.lsn, - access_stats: LayerAccessStats::empty_will_record_residence_event_later(), + access_stats: Default::default(), inner: OnceCell::new(), }) } diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 25d8ee6b2b..1db3e7c675 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1,9 +1,7 @@ use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::keyspace::KeySpace; -use pageserver_api::models::{ - HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus, -}; +use pageserver_api::models::HistoricLayerInfo; use pageserver_api::shard::{ShardIdentity, ShardIndex, TenantShardId}; use std::ops::Range; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; @@ -160,13 +158,10 @@ impl Layer { metadata.file_size, ); - let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted); - let owner = Layer(Arc::new(LayerInner::new( conf, timeline, local_path, - access_stats, desc, None, metadata.generation, @@ -193,8 +188,6 @@ impl Layer { metadata.file_size, ); - let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident); - let mut resident = None; let owner = Layer(Arc::new_cyclic(|owner| { @@ -209,7 +202,6 @@ impl Layer { conf, timeline, local_path, - access_stats, desc, Some(inner), metadata.generation, @@ -245,13 +237,6 @@ impl Layer { version: 0, }); resident = Some(inner.clone()); - let access_stats = LayerAccessStats::empty_will_record_residence_event_later(); - access_stats.record_residence_event( - LayerResidenceStatus::Resident, - LayerResidenceEventReason::LayerCreate, - ); - // Newly created layers are marked visible by default: the usual case is that they were created to be read. - access_stats.set_visibility(super::LayerVisibilityHint::Visible); let local_path = local_layer_path( conf, @@ -261,16 +246,22 @@ impl Layer { &timeline.generation, ); - LayerInner::new( + let layer = LayerInner::new( conf, timeline, local_path, - access_stats, desc, Some(inner), timeline.generation, timeline.get_shard_index(), - ) + ); + + // Newly created layers are marked visible by default: the usual case is that they were created to be read. 
+ layer + .access_stats + .set_visibility(super::LayerVisibilityHint::Visible); + + layer })); let downloaded = resident.expect("just initialized"); @@ -334,9 +325,7 @@ impl Layer { use anyhow::ensure; let layer = self.0.get_or_maybe_download(true, Some(ctx)).await?; - self.0 - .access_stats - .record_access(LayerAccessKind::GetValueReconstructData, ctx); + self.0.access_stats.record_access(ctx); if self.layer_desc().is_delta { ensure!(lsn_range.start >= self.layer_desc().lsn_range.start); @@ -370,9 +359,7 @@ impl Layer { other => GetVectoredError::Other(anyhow::anyhow!(other)), })?; - self.0 - .access_stats - .record_access(LayerAccessKind::GetValueReconstructData, ctx); + self.0.access_stats.record_access(ctx); layer .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx) @@ -788,7 +775,6 @@ impl LayerInner { conf: &'static PageServerConf, timeline: &Arc, local_path: Utf8PathBuf, - access_stats: LayerAccessStats, desc: PersistentLayerDesc, downloaded: Option>, generation: Generation, @@ -823,7 +809,7 @@ impl LayerInner { path: local_path, desc, timeline: Arc::downgrade(timeline), - access_stats, + access_stats: Default::default(), wanted_deleted: AtomicBool::new(false), inner, version: AtomicUsize::new(version), @@ -1178,10 +1164,7 @@ impl LayerInner { LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction); } - self.access_stats.record_residence_event( - LayerResidenceStatus::Resident, - LayerResidenceEventReason::ResidenceChange, - ); + self.access_stats.record_residence_event(); Ok(self.initialize_after_layer_is_on_disk(permit)) } @@ -1535,10 +1518,7 @@ impl LayerInner { } } - self.access_stats.record_residence_event( - LayerResidenceStatus::Evicted, - LayerResidenceEventReason::ResidenceChange, - ); + self.access_stats.record_residence_event(); self.status.as_ref().unwrap().send_replace(Status::Evicted); @@ -1864,9 +1844,7 @@ impl ResidentLayer { // this is valid because the DownloadedLayer::kind is a OnceCell, not a // Mutex, so we cannot go and deinitialize the value with OnceCell::take // while it's being held. - owner - .access_stats - .record_access(LayerAccessKind::KeyIter, ctx); + owner.access_stats.record_access(ctx); delta_layer::DeltaLayerInner::load_keys(d, ctx) .await diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index 66a4493218..d5d2f748a9 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -1,3 +1,5 @@ +use std::time::UNIX_EPOCH; + use pageserver_api::key::CONTROLFILE_KEY; use tokio::task::JoinSet; use utils::{ @@ -7,7 +9,7 @@ use utils::{ use super::failpoints::{Failpoint, FailpointKind}; use super::*; -use crate::context::DownloadBehavior; +use crate::{context::DownloadBehavior, tenant::storage_layer::LayerVisibilityHint}; use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness}; /// Used in tests to advance a future to wanted await point, and not futher. 
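// Illustrative sketch (not from the patch): the next hunk asserts that the access-stats
// structure shrank from 2048 to 8 bytes, i.e. a single 64-bit atomic. One way two
// low-resolution timestamps plus a visibility bit can share one word is shown below.
// The bit widths and shift constants here are assumptions for illustration only; the
// patch's actual TS_MASK/ATIME_SHIFT/RTIME_SHIFT/VISIBILITY_SHIFT values may differ.
use std::sync::atomic::{AtomicU64, Ordering};
use std::time::{Duration, SystemTime, UNIX_EPOCH};

const TS_BITS: u32 = 31; // assumed: ~31 bits of seconds, which runs out around 2038
const TS_MASK: u64 = (1u64 << TS_BITS) - 1;
const ATIME_SHIFT: u32 = 0;
const RTIME_SHIFT: u32 = TS_BITS;
const VISIBILITY_SHIFT: u32 = 2 * TS_BITS;

#[derive(Default)]
struct PackedAccessStats(AtomicU64);

impl PackedAccessStats {
    // Read-modify-write of only the bits selected by `mask`, leaving the rest untouched.
    fn write_bits(&self, mask: u64, value: u64) {
        let mut old = self.0.load(Ordering::Relaxed);
        loop {
            let new = (old & !mask) | (value & mask);
            match self
                .0
                .compare_exchange_weak(old, new, Ordering::Relaxed, Ordering::Relaxed)
            {
                Ok(_) => break,
                Err(cur) => old = cur,
            }
        }
    }

    fn record_access_at(&self, at: SystemTime) {
        let secs = at.duration_since(UNIX_EPOCH).unwrap().as_secs() & TS_MASK;
        self.write_bits(TS_MASK << ATIME_SHIFT, secs << ATIME_SHIFT);
    }

    fn record_residence_event_at(&self, at: SystemTime) {
        let secs = at.duration_since(UNIX_EPOCH).unwrap().as_secs() & TS_MASK;
        self.write_bits(TS_MASK << RTIME_SHIFT, secs << RTIME_SHIFT);
    }

    fn set_visible(&self, visible: bool) {
        self.write_bits(1u64 << VISIBILITY_SHIFT, (visible as u64) << VISIBILITY_SHIFT);
    }

    // Returns None while the field is still zero, mirroring "never recorded".
    fn read_low_res_timestamp(&self, shift: u32) -> Option<SystemTime> {
        let secs = (self.0.load(Ordering::Relaxed) >> shift) & TS_MASK;
        (secs != 0).then(|| UNIX_EPOCH + Duration::from_secs(secs))
    }
}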
@@ -826,9 +828,9 @@ async fn eviction_cancellation_on_drop() { #[test] #[cfg(target_arch = "x86_64")] fn layer_size() { - assert_eq!(std::mem::size_of::(), 2048); + assert_eq!(std::mem::size_of::(), 8); assert_eq!(std::mem::size_of::(), 104); - assert_eq!(std::mem::size_of::(), 2352); + assert_eq!(std::mem::size_of::(), 312); // it also has the utf8 path } @@ -968,3 +970,46 @@ fn spawn_blocking_pool_helper_actually_works() { println!("joined"); }); } + +/// Drop the low bits from a time, to emulate the precision loss in LayerAccessStats +fn lowres_time(hires: SystemTime) -> SystemTime { + let ts = hires.duration_since(UNIX_EPOCH).unwrap().as_secs(); + UNIX_EPOCH + Duration::from_secs(ts) +} + +#[test] +fn access_stats() { + let access_stats = LayerAccessStats::default(); + // Default is visible + assert_eq!(access_stats.visibility(), LayerVisibilityHint::Visible); + + access_stats.set_visibility(LayerVisibilityHint::Covered); + assert_eq!(access_stats.visibility(), LayerVisibilityHint::Covered); + access_stats.set_visibility(LayerVisibilityHint::Visible); + assert_eq!(access_stats.visibility(), LayerVisibilityHint::Visible); + + let rtime = UNIX_EPOCH + Duration::from_secs(2000000000); + access_stats.record_residence_event_at(rtime); + assert_eq!(access_stats.latest_activity(), lowres_time(rtime)); + + let atime = UNIX_EPOCH + Duration::from_secs(2100000000); + access_stats.record_access_at(atime); + assert_eq!(access_stats.latest_activity(), lowres_time(atime)); + + // Setting visibility doesn't clobber access time + access_stats.set_visibility(LayerVisibilityHint::Covered); + assert_eq!(access_stats.latest_activity(), lowres_time(atime)); + access_stats.set_visibility(LayerVisibilityHint::Visible); + assert_eq!(access_stats.latest_activity(), lowres_time(atime)); +} + +#[test] +fn access_stats_2038() { + // The access stats structure uses a timestamp representation that will run out + // of bits in 2038. One year before that, this unit test will start failing. 
+ + let one_year_from_now = SystemTime::now().duration_since(UNIX_EPOCH).unwrap() + + Duration::from_secs(3600 * 24 * 365); + + assert!(one_year_from_now.as_secs() < (2 << 31)); +} diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 4bfcdc43e8..82e8ff02ca 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3155,7 +3155,7 @@ impl Timeline { let guard = self.layers.read().await; let resident = guard.likely_resident_layers().map(|layer| { - let last_activity_ts = layer.access_stats().latest_activity_or_now(); + let last_activity_ts = layer.access_stats().latest_activity(); HeatMapLayer::new( layer.layer_desc().layer_name(), @@ -5582,7 +5582,7 @@ impl Timeline { let file_size = layer.layer_desc().file_size; max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size))); - let last_activity_ts = layer.access_stats().latest_activity_or_now(); + let last_activity_ts = layer.access_stats().latest_activity(); EvictionCandidate { layer: layer.into(), diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 972ac48cda..fec66aabc1 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -225,7 +225,7 @@ impl Timeline { continue; } - let last_activity_ts = layer.access_stats().latest_activity_or_now(); + let last_activity_ts = layer.access_stats().latest_activity(); let no_activity_for = match now.duration_since(last_activity_ts) { Ok(d) => d, diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 930fb14947..91c7b97fdd 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -21,6 +21,10 @@ from fixtures.utils import human_bytes, wait_until GLOBAL_LRU_LOG_LINE = "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy" +# access times in the pageserver are stored at a very low resolution: to generate meaningfully different +# values, tests must inject sleeps +ATIME_RESOLUTION = 2 + @pytest.mark.parametrize("config_level_override", [None, 400]) def test_min_resident_size_override_handling( @@ -546,6 +550,7 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder): (tenant_id, timeline_id) = warm # make picked tenant more recently used than the other one + time.sleep(ATIME_RESOLUTION) env.warm_up_tenant(tenant_id) # Build up enough pressure to require evictions from both tenants, @@ -622,6 +627,10 @@ def test_fast_growing_tenant(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, or for scale in [1, 1, 1, 4]: timelines.append((pgbench_init_tenant(layer_size, scale, env, pg_bin), scale)) + # Eviction times are stored at a low resolution. We must ensure that the time between + # tenants is long enough for the pageserver to distinguish them. 
+ time.sleep(ATIME_RESOLUTION) + env.neon_cli.safekeeper_stop() for (tenant_id, timeline_id), scale in timelines: diff --git a/test_runner/regress/test_threshold_based_eviction.py b/test_runner/regress/test_threshold_based_eviction.py index 7bf49a0874..b62398d427 100644 --- a/test_runner/regress/test_threshold_based_eviction.py +++ b/test_runner/regress/test_threshold_based_eviction.py @@ -52,8 +52,8 @@ def test_threshold_based_eviction( "kind": "NoEviction" } - eviction_threshold = 5 - eviction_period = 1 + eviction_threshold = 10 + eviction_period = 2 ps_http.set_tenant_config( tenant_id, { From c698b7b010133564f2211afbc293d5d949fdecd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 24 Jul 2024 11:43:05 +0200 Subject: [PATCH 1258/1571] Implement retry support for list_streaming (#8481) Implements the TODO from #8466 about retries: now the user of the stream returned by `list_streaming` is able to obtain the next item in the stream as often as they want, and retry it if it is an error. Also adds extends the test for paginated listing to include a dedicated test for `list_streaming`. follow-up of #8466 fixes #8457 part of #7547 --------- Co-authored-by: Joonas Koivunen --- libs/remote_storage/src/azure_blob.rs | 55 ++++++++++++++++------- libs/remote_storage/src/lib.rs | 4 ++ libs/remote_storage/src/s3_bucket.rs | 11 ++++- libs/remote_storage/tests/common/mod.rs | 2 +- libs/remote_storage/tests/common/tests.rs | 44 ++++++++++++++++-- 5 files changed, 94 insertions(+), 22 deletions(-) diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 266a1f6584..acd95a5255 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -15,7 +15,7 @@ use std::time::SystemTime; use super::REMOTE_STORAGE_PREFIX_SEPARATOR; use anyhow::Result; use azure_core::request_options::{MaxResults, Metadata, Range}; -use azure_core::RetryOptions; +use azure_core::{Continuable, RetryOptions}; use azure_identity::DefaultAzureCredential; use azure_storage::StorageCredentials; use azure_storage_blobs::blob::CopyStatus; @@ -306,23 +306,43 @@ impl RemoteStorage for AzureBlobStorage { builder = builder.max_results(MaxResults::new(limit)); } - let response = builder.into_stream(); - let response = response.into_stream().map_err(to_download_error); - let response = tokio_stream::StreamExt::timeout(response, self.timeout); - let response = response.map(|res| match res { - Ok(res) => res, - Err(_elapsed) => Err(DownloadError::Timeout), - }); + let mut next_marker = None; - let mut response = std::pin::pin!(response); + 'outer: loop { + let mut builder = builder.clone(); + if let Some(marker) = next_marker.clone() { + builder = builder.marker(marker); + } + let response = builder.into_stream(); + let response = response.into_stream().map_err(to_download_error); + let response = tokio_stream::StreamExt::timeout(response, self.timeout); + let response = response.map(|res| match res { + Ok(res) => res, + Err(_elapsed) => Err(DownloadError::Timeout), + }); + + let mut response = std::pin::pin!(response); + + let mut max_keys = max_keys.map(|mk| mk.get()); + let next_item = tokio::select! { + op = response.next() => Ok(op), + _ = cancel.cancelled() => Err(DownloadError::Cancelled), + }?; + let Some(entry) = next_item else { + // The list is complete, so yield it. + break; + }; - let mut max_keys = max_keys.map(|mk| mk.get()); - 'outer: while let Some(entry) = tokio::select! 
{ - op = response.next() => Ok(op), - _ = cancel.cancelled() => Err(DownloadError::Cancelled), - }? { let mut res = Listing::default(); - let entry = entry?; + let entry = match entry { + Ok(entry) => entry, + Err(e) => { + // The error is potentially retryable, so we must rewind the loop after yielding. + yield Err(e); + continue; + } + }; + next_marker = entry.continuation(); let prefix_iter = entry .blobs .prefixes() @@ -348,6 +368,11 @@ impl RemoteStorage for AzureBlobStorage { } } yield Ok(res); + + // We are done here + if next_marker.is_none() { + break; + } } } } diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 201e2fb178..0fed86f4b8 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -165,6 +165,9 @@ pub trait RemoteStorage: Send + Sync + 'static { /// The stream is guaranteed to return at least one element, even in the case of errors /// (in that case it's an `Err()`), or an empty `Listing`. /// + /// The stream is not ending if it returns an error, as long as [`is_permanent`] returns false on the error. + /// The `next` function can be retried, and maybe in a future retry, there will be success. + /// /// Note that the prefix is relative to any `prefix_in_bucket` configured for the client, not /// from the absolute root of the bucket. /// @@ -178,6 +181,7 @@ pub trait RemoteStorage: Send + Sync + 'static { /// unlimted size buckets, as the full list of objects is allocated into a monolithic data structure. /// /// [`ListObjectsV2`]: + /// [`is_permanent`]: DownloadError::is_permanent fn list_streaming( &self, prefix: Option<&RemotePath>, diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 39106a4e53..90ed48e06c 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -507,7 +507,7 @@ impl RemoteStorage for S3Bucket { .list_objects_v2() .bucket(self.bucket_name.clone()) .set_prefix(list_prefix.clone()) - .set_continuation_token(continuation_token) + .set_continuation_token(continuation_token.clone()) .set_max_keys(request_max_keys); if let ListingMode::WithDelimiter = mode { @@ -532,7 +532,14 @@ impl RemoteStorage for S3Bucket { .req_seconds .observe_elapsed(kind, &response, started_at); - let response = response?; + let response = match response { + Ok(response) => response, + Err(e) => { + // The error is potentially retryable, so we must rewind the loop after yielding. 
+ yield Err(e); + continue 'outer; + }, + }; let keys = response.contents(); let prefixes = response.common_prefixes.as_deref().unwrap_or_default(); diff --git a/libs/remote_storage/tests/common/mod.rs b/libs/remote_storage/tests/common/mod.rs index da9dc08d8d..daab05d91a 100644 --- a/libs/remote_storage/tests/common/mod.rs +++ b/libs/remote_storage/tests/common/mod.rs @@ -152,7 +152,7 @@ pub(crate) async fn upload_remote_data( let mut upload_tasks = JoinSet::new(); let cancel = CancellationToken::new(); - for i in 1..upload_tasks_count + 1 { + for i in 1..=upload_tasks_count { let task_client = Arc::clone(client); let cancel = cancel.clone(); diff --git a/libs/remote_storage/tests/common/tests.rs b/libs/remote_storage/tests/common/tests.rs index 673151c8ef..38c316397a 100644 --- a/libs/remote_storage/tests/common/tests.rs +++ b/libs/remote_storage/tests/common/tests.rs @@ -1,5 +1,6 @@ use anyhow::Context; use camino::Utf8Path; +use futures::StreamExt; use remote_storage::ListingMode; use remote_storage::RemotePath; use std::sync::Arc; @@ -29,10 +30,10 @@ use super::{ /// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only /// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}` /// -/// With the real S3 enabled and `#[cfg(test)]` Rust configuration used, the S3 client test adds a `max-keys` param to limit the response keys. -/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3, -/// since current default AWS S3 pagination limit is 1000. -/// (see https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax) +/// In the `MaybeEnabledStorageWithTestBlobs::setup`, we set the `max_keys_in_list_response` param to limit the keys in a single response. +/// This way, we are able to test the pagination, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3, +/// as the current default AWS S3 pagination limit is 1000. +/// (see ). /// /// Lastly, the test attempts to clean up and remove all uploaded S3 files. /// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished. @@ -87,6 +88,41 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a "remote storage nested prefixes list mismatches with the uploads. 
Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}", ); + // list_streaming + + let prefix_with_slash = base_prefix.add_trailing_slash(); + let mut nested_remote_prefixes_st = test_client.list_streaming( + Some(&prefix_with_slash), + ListingMode::WithDelimiter, + None, + &cancel, + ); + let mut nested_remote_prefixes_combined = HashSet::new(); + let mut segments = 0; + let mut segment_max_size = 0; + while let Some(st) = nested_remote_prefixes_st.next().await { + let st = st?; + segment_max_size = segment_max_size.max(st.prefixes.len()); + nested_remote_prefixes_combined.extend(st.prefixes.into_iter()); + segments += 1; + } + assert!(segments > 1, "less than 2 segments: {segments}"); + assert!( + segment_max_size * 2 <= nested_remote_prefixes_combined.len(), + "double of segment_max_size={segment_max_size} larger number of remote prefixes of {}", + nested_remote_prefixes_combined.len() + ); + let remote_only_prefixes = nested_remote_prefixes_combined + .difference(&expected_remote_prefixes) + .collect::>(); + let missing_uploaded_prefixes = expected_remote_prefixes + .difference(&nested_remote_prefixes_combined) + .collect::>(); + assert_eq!( + remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0, + "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}", + ); + Ok(()) } From 842c3d8c10e08dcebe76e55bc06d2cac065bc6a6 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 24 Jul 2024 11:26:24 +0100 Subject: [PATCH 1259/1571] tests: simplify code around unstable `test_basebackup_with_high_slru_count` (#8477) ## Problem In `test_basebackup_with_high_slru_count`, the pageserver is sometimes mysteriously hanging on startup, having been started+stopped earlier in the test setup while populating template tenant data. - #7586 We can't see why this is hanging in this particular test. The test does some weird stuff though, like attaching a load of broken tenants and then doing a SIGQUIT kill of a pageserver. ## Summary of changes - Attach tenants normally instead of doing a failpoint dance to attach them as broken - Shut the pageserver down gracefully during init instead of using immediate mode - Remove the "sequential" variant of the unstable test, as this is going away soon anyway - Log before trying to acquire lock file, so that if it hangs we have a clearer sense of if that's really where it's hanging. It seems like it is, but that code does a non-blocking flock so it's surprising. --- pageserver/src/bin/pageserver.rs | 1 + .../fixtures/pageserver/many_tenants.py | 35 +++++++------------ .../pagebench/test_large_slru_basebackup.py | 4 +-- 3 files changed, 15 insertions(+), 25 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index db27a77ec6..7a96c86ded 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -290,6 +290,7 @@ fn start_pageserver( // Create and lock PID file. This ensures that there cannot be more than one // pageserver process running at the same time. 
let lock_file_path = conf.workdir.join(PID_FILE_NAME); + info!("Claiming pid file at {lock_file_path:?}..."); let lock_file = utils::pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?; info!("Claimed pid file at {lock_file_path:?}"); diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py index c437258c6f..3e0ffabf74 100644 --- a/test_runner/fixtures/pageserver/many_tenants.py +++ b/test_runner/fixtures/pageserver/many_tenants.py @@ -1,5 +1,4 @@ import concurrent.futures -import time from typing import Any, Callable, Dict, Tuple import fixtures.pageserver.remote_storage @@ -9,9 +8,6 @@ from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, ) -from fixtures.pageserver.utils import ( - wait_until_tenant_state, -) from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind @@ -46,38 +42,33 @@ def single_timeline( log.info(f"duplicating template tenant {ncopies} times in S3") tenants = fixtures.pageserver.remote_storage.duplicate_tenant(env, template_tenant, ncopies) + # In theory we could just attach all the tenants, force on-demand downloads via mgmt API, and be done. + # However, on-demand downloads are quite slow ATM. + # => do the on-demand downloads in Python. + log.info("python-side on-demand download the layer files into local tenant dir") + tenant_timelines = list(map(lambda tenant: (tenant, template_timeline), tenants)) + fixtures.pageserver.remote_storage.copy_all_remote_layer_files_to_local_tenant_dir( + env, tenant_timelines + ) + log.info("attach duplicated tenants to pageserver") # In theory we could just attach all the tenants, force on-demand downloads via mgmt API, and be done. # However, on-demand downloads are quite slow ATM. # => do the on-demand downloads in Python. 
assert ps_http.tenant_list() == [] - # make the attach fail after it created enough on-disk state to retry loading - # the tenant next startup, but before it can start background loops that would start download - ps_http.configure_failpoints(("attach-before-activate", "return")) - env.pageserver.allowed_errors.append( - ".*attach failed, setting tenant state to Broken: attach-before-activate.*" - ) - def attach_broken(tenant): + def attach(tenant): env.pageserver.tenant_attach( tenant, config=template_config.copy(), generation=100, override_storage_controller_generation=True, ) - time.sleep(0.1) - wait_until_tenant_state(ps_http, tenant, "Broken", 10) with concurrent.futures.ThreadPoolExecutor(max_workers=22) as executor: - executor.map(attach_broken, tenants) + executor.map(attach, tenants) - env.pageserver.stop( - immediate=True - ) # clears the failpoint as a side-effect; immediate to avoid hitting neon_local's timeout - tenant_timelines = list(map(lambda tenant: (tenant, template_timeline), tenants)) - log.info("python-side on-demand download the layer files into local tenant dir") - fixtures.pageserver.remote_storage.copy_all_remote_layer_files_to_local_tenant_dir( - env, tenant_timelines - ) + # Benchmarks will start the pageserver explicitly themselves + env.pageserver.stop() return env diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py index b41ae60197..3258d4dcfa 100644 --- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py +++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py @@ -17,13 +17,11 @@ from performance.pageserver.util import ( @pytest.mark.parametrize("duration", [30]) @pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(200)]) @pytest.mark.parametrize("n_tenants", [10]) -@pytest.mark.parametrize("get_vectored_impl", ["sequential", "vectored"]) @pytest.mark.timeout(1000) def test_basebackup_with_high_slru_count( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, pg_bin: PgBin, - get_vectored_impl: str, n_tenants: int, pgbench_scale: int, duration: int, @@ -47,7 +45,7 @@ def test_basebackup_with_high_slru_count( max_file_descriptors = 500000 neon_env_builder.pageserver_config_override = ( f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}; " - f"get_vectored_impl='{get_vectored_impl}'; validate_vectored_get=false" + f"get_vectored_impl='vectored'; validate_vectored_get=false" ) params.update( { From 2ef8e57f865773437a1350964f7d2e83bbab6ad5 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 24 Jul 2024 11:33:44 +0100 Subject: [PATCH 1260/1571] pageserver: maintain gc_info incrementally (#8427) ## Problem Previously, Timeline::gc_info was only updated in a batch operation at the start of GC. That means that timelines didn't generally have accurate information about who their children were before the first GC, or between GC cycles. 
Knowledge of child branches is important for calculating layer visibility in #8398 ## Summary of changes - Split out part of refresh_gc_info into initialize_gc_info, which is now called early in startup - Include TimelineId in retain_lsns so that we can later add/remove the LSNs for particular children - When timelines are added/removed, update their parent's retain_lsns --- pageserver/src/tenant.rs | 183 ++++++++++++++--------- pageserver/src/tenant/size.rs | 4 +- pageserver/src/tenant/timeline.rs | 36 ++++- pageserver/src/tenant/timeline/delete.rs | 8 +- 4 files changed, 152 insertions(+), 79 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index cb3ca9c8b9..a98a32de35 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -31,6 +31,7 @@ use pageserver_api::shard::TenantShardId; use remote_storage::DownloadError; use remote_storage::GenericRemoteStorage; use remote_storage::TimeoutOrCancel; +use std::collections::BTreeMap; use std::fmt; use std::time::SystemTime; use storage_broker::BrokerClientChannel; @@ -95,14 +96,12 @@ use crate::tenant::storage_layer::ImageLayer; use crate::walredo; use crate::InitializationOrder; use std::collections::hash_map::Entry; -use std::collections::BTreeSet; use std::collections::HashMap; use std::collections::HashSet; use std::fmt::Debug; use std::fmt::Display; use std::fs; use std::fs::File; -use std::ops::Bound::Included; use std::sync::atomic::AtomicU64; use std::sync::atomic::Ordering; use std::sync::Arc; @@ -1765,6 +1764,9 @@ impl Tenant { .values() .filter(|timeline| !(timeline.is_broken() || timeline.is_stopping())); + // Before activation, populate each Timeline's GcInfo with information about its children + self.initialize_gc_info(&timelines_accessor); + // Spawn gc and compaction loops. The loops will shut themselves // down when they notice that the tenant is inactive. tasks::start_background_loops(self, background_jobs_can_start); @@ -2798,6 +2800,55 @@ impl Tenant { .await } + /// Populate all Timelines' `GcInfo` with information about their children. We do not set the + /// PITR cutoffs here, because that requires I/O: this is done later, before GC, by [`Self::refresh_gc_info_internal`] + /// + /// Subsequently, parent-child relationships are updated incrementally during timeline creation/deletion. + fn initialize_gc_info( + &self, + timelines: &std::sync::MutexGuard>>, + ) { + // This function must be called before activation: after activation timeline create/delete operations + // might happen, and this function is not safe to run concurrently with those. + assert!(!self.is_active()); + + // Scan all timelines. For each timeline, remember the timeline ID and + // the branch point where it was created. 
+ let mut all_branchpoints: BTreeMap> = BTreeMap::new(); + timelines.iter().for_each(|(timeline_id, timeline_entry)| { + if let Some(ancestor_timeline_id) = &timeline_entry.get_ancestor_timeline_id() { + let ancestor_children = all_branchpoints.entry(*ancestor_timeline_id).or_default(); + ancestor_children.push((timeline_entry.get_ancestor_lsn(), *timeline_id)); + } + }); + + // The number of bytes we always keep, irrespective of PITR: this is a constant across timelines + let horizon = self.get_gc_horizon(); + + // Populate each timeline's GcInfo with information about its child branches + for timeline in timelines.values() { + let mut branchpoints: Vec<(Lsn, TimelineId)> = all_branchpoints + .remove(&timeline.timeline_id) + .unwrap_or_default(); + + branchpoints.sort_by_key(|b| b.0); + + let mut target = timeline.gc_info.write().unwrap(); + + target.retain_lsns = branchpoints; + + let space_cutoff = timeline + .get_last_record_lsn() + .checked_sub(horizon) + .unwrap_or(Lsn(0)); + + target.cutoffs = GcCutoffs { + space: space_cutoff, + time: Lsn::INVALID, + }; + } + } + async fn refresh_gc_info_internal( &self, target_timeline_id: Option, @@ -2820,6 +2871,11 @@ impl Tenant { .cloned() .collect::>(); + if target_timeline_id.is_some() && timelines.is_empty() { + // We were to act on a particular timeline and it wasn't found + return Err(GcError::TimelineNotFound); + } + let mut gc_cutoffs: HashMap = HashMap::with_capacity(timelines.len()); @@ -2842,68 +2898,63 @@ impl Tenant { // because that will stall branch creation. let gc_cs = self.gc_cs.lock().await; - // Scan all timelines. For each timeline, remember the timeline ID and - // the branch point where it was created. - let (all_branchpoints, timelines): (BTreeSet<(TimelineId, Lsn)>, _) = { - let timelines = self.timelines.lock().unwrap(); - let mut all_branchpoints = BTreeSet::new(); - let timelines = { - if let Some(target_timeline_id) = target_timeline_id.as_ref() { - if timelines.get(target_timeline_id).is_none() { - return Err(GcError::TimelineNotFound); + // Paranoia check: it is critical that GcInfo's list of child timelines is correct, to avoid incorrectly GC'ing data they + // depend on. So although GcInfo is updated continuously by Timeline::new and Timeline::drop, we also calculate it here + // and fail out if it's inaccurate. + // (this can be removed later, it's a risk mitigation for https://github.com/neondatabase/neon/pull/8427) + { + let mut all_branchpoints: BTreeMap> = + BTreeMap::new(); + timelines.iter().for_each(|timeline| { + if let Some(ancestor_timeline_id) = &timeline.get_ancestor_timeline_id() { + let ancestor_children = + all_branchpoints.entry(*ancestor_timeline_id).or_default(); + ancestor_children.push((timeline.get_ancestor_lsn(), timeline.timeline_id)); + } + }); + + for timeline in &timelines { + let mut branchpoints: Vec<(Lsn, TimelineId)> = all_branchpoints + .remove(&timeline.timeline_id) + .unwrap_or_default(); + + branchpoints.sort_by_key(|b| b.0); + + let target = timeline.gc_info.read().unwrap(); + + // We require that retain_lsns contains everything in `branchpoints`, but not that + // they are exactly equal: timeline deletions can race with us, so retain_lsns + // may contain some extra stuff. It is safe to have extra timelines in there, because it + // just means that we retain slightly more data than we otherwise might. 
+ let have_branchpoints = target.retain_lsns.iter().copied().collect::>(); + for b in &branchpoints { + if !have_branchpoints.contains(b) { + tracing::error!( + "Bug: `retain_lsns` is set incorrectly. Expected be {:?}, but found {:?}", + branchpoints, + target.retain_lsns + ); + debug_assert!(false); + // Do not GC based on bad information! + // (ab-use an existing GcError type rather than adding a new one, since this is a + // "should never happen" check that will be removed soon). + return Err(GcError::Remote(anyhow::anyhow!( + "retain_lsns failed validation!" + ))); } - }; - - timelines - .iter() - .map(|(_timeline_id, timeline_entry)| { - if let Some(ancestor_timeline_id) = - &timeline_entry.get_ancestor_timeline_id() - { - // If target_timeline is specified, we only need to know branchpoints of its children - if let Some(timeline_id) = target_timeline_id { - if ancestor_timeline_id == &timeline_id { - all_branchpoints.insert(( - *ancestor_timeline_id, - timeline_entry.get_ancestor_lsn(), - )); - } - } - // Collect branchpoints for all timelines - else { - all_branchpoints.insert(( - *ancestor_timeline_id, - timeline_entry.get_ancestor_lsn(), - )); - } - } - - timeline_entry.clone() - }) - .collect::>() - }; - (all_branchpoints, timelines) - }; + } + } + } // Ok, we now know all the branch points. // Update the GC information for each timeline. let mut gc_timelines = Vec::with_capacity(timelines.len()); for timeline in timelines { - // If target_timeline is specified, ignore all other timelines + // We filtered the timeline list above if let Some(target_timeline_id) = target_timeline_id { - if timeline.timeline_id != target_timeline_id { - continue; - } + assert_eq!(target_timeline_id, timeline.timeline_id); } - let branchpoints: Vec = all_branchpoints - .range(( - Included((timeline.timeline_id, Lsn(0))), - Included((timeline.timeline_id, Lsn(u64::MAX))), - )) - .map(|&x| x.1) - .collect(); - { let mut target = timeline.gc_info.write().unwrap(); @@ -2941,20 +2992,12 @@ impl Tenant { .0, ); - match gc_cutoffs.remove(&timeline.timeline_id) { - Some(cutoffs) => { - target.retain_lsns = branchpoints; - target.cutoffs = cutoffs; - } - None => { - // reasons for this being unavailable: - // - this timeline was created while we were finding cutoffs - // - lsn for timestamp search fails for this timeline repeatedly - // - // in both cases, refreshing the branchpoints is correct. - target.retain_lsns = branchpoints; - } - }; + // Apply the cutoffs we found to the Timeline's GcInfo. Why might we _not_ have cutoffs for a timeline? 
+ // - this timeline was created while we were finding cutoffs + // - lsn for timestamp search fails for this timeline repeatedly + if let Some(cutoffs) = gc_cutoffs.get(&timeline.timeline_id) { + target.cutoffs = cutoffs.clone(); + } } gc_timelines.push(timeline); @@ -4343,7 +4386,7 @@ mod tests { { let branchpoints = &tline.gc_info.read().unwrap().retain_lsns; assert_eq!(branchpoints.len(), 1); - assert_eq!(branchpoints[0], Lsn(0x40)); + assert_eq!(branchpoints[0], (Lsn(0x40), NEW_TIMELINE_ID)); } // You can read the key from the child branch even though the parent is diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index e4728ca8a8..41d558d3f6 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -264,10 +264,10 @@ pub(super) async fn gather_inputs( let mut lsns: Vec<(Lsn, LsnKind)> = gc_info .retain_lsns .iter() - .filter(|&&lsn| lsn > ancestor_lsn) + .filter(|(lsn, _child_id)| lsn > &ancestor_lsn) .copied() // this assumes there are no other retain_lsns than the branchpoints - .map(|lsn| (lsn, LsnKind::BranchPoint)) + .map(|(lsn, _child_id)| (lsn, LsnKind::BranchPoint)) .collect::>(); lsns.extend(lease_points.iter().map(|&lsn| (lsn, LsnKind::LeasePoint))); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 82e8ff02ca..178b707aa7 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -460,7 +460,7 @@ pub(crate) struct GcInfo { /// Currently, this includes all points where child branches have /// been forked off from. In the future, could also include /// explicit user-defined snapshot points. - pub(crate) retain_lsns: Vec, + pub(crate) retain_lsns: Vec<(Lsn, TimelineId)>, /// The cutoff coordinates, which are combined by selecting the minimum. pub(crate) cutoffs: GcCutoffs, @@ -476,12 +476,21 @@ impl GcInfo { pub(crate) fn min_cutoff(&self) -> Lsn { self.cutoffs.select_min() } + + pub(super) fn insert_child(&mut self, child_id: TimelineId, child_lsn: Lsn) { + self.retain_lsns.push((child_lsn, child_id)); + self.retain_lsns.sort_by_key(|i| i.0); + } + + pub(super) fn remove_child(&mut self, child_id: TimelineId) { + self.retain_lsns.retain(|i| i.1 != child_id); + } } /// The `GcInfo` component describing which Lsns need to be retained. Functionally, this /// is a single number (the oldest LSN which we must retain), but it internally distinguishes /// between time-based and space-based retention for observability and consumption metrics purposes. -#[derive(Debug)] +#[derive(Debug, Clone)] pub(crate) struct GcCutoffs { /// Calculated from the [`TenantConf::gc_horizon`], this LSN indicates how much /// history we must keep to retain a specified number of bytes of WAL. @@ -2307,6 +2316,11 @@ impl Timeline { ) }; + if let Some(ancestor) = &ancestor { + let mut ancestor_gc_info = ancestor.gc_info.write().unwrap(); + ancestor_gc_info.insert_child(timeline_id, metadata.ancestor_lsn()); + } + Arc::new_cyclic(|myself| { let metrics = TimelineMetrics::new( &tenant_shard_id, @@ -4753,6 +4767,18 @@ impl Timeline { } } +impl Drop for Timeline { + fn drop(&mut self) { + if let Some(ancestor) = &self.ancestor_timeline { + // This lock should never be poisoned, but in case it is we do a .map() instead of + // an unwrap(), to avoid panicking in a destructor and thereby aborting the process. + if let Ok(mut gc_info) = ancestor.gc_info.write() { + gc_info.remove_child(self.timeline_id) + } + } + } +} + /// Top-level failure to compact. 
#[derive(Debug, thiserror::Error)] pub(crate) enum CompactionError { @@ -5070,7 +5096,11 @@ impl Timeline { let space_cutoff = min(gc_info.cutoffs.space, self.get_disk_consistent_lsn()); let time_cutoff = gc_info.cutoffs.time; - let retain_lsns = gc_info.retain_lsns.clone(); + let retain_lsns = gc_info + .retain_lsns + .iter() + .map(|(lsn, _child_id)| *lsn) + .collect(); // Gets the maximum LSN that holds the valid lease. // diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 02124ad852..ab6a5f20ba 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -148,14 +148,14 @@ async fn cleanup_remaining_timeline_fs_traces( /// For more context see comments in [`DeleteTimelineFlow::prepare`] async fn remove_timeline_from_tenant( tenant: &Tenant, - timeline_id: TimelineId, + timeline: &Timeline, _: &DeletionGuard, // using it as a witness ) -> anyhow::Result<()> { // Remove the timeline from the map. let mut timelines = tenant.timelines.lock().unwrap(); let children_exist = timelines .iter() - .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id)); + .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline.timeline_id)); // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`. // We already deleted the layer files, so it's probably best to panic. // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart) @@ -164,7 +164,7 @@ async fn remove_timeline_from_tenant( } timelines - .remove(&timeline_id) + .remove(&timeline.timeline_id) .expect("timeline that we were deleting was concurrently removed from 'timelines' map"); drop(timelines); @@ -414,7 +414,7 @@ impl DeleteTimelineFlow { pausable_failpoint!("in_progress_delete"); - remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?; + remove_timeline_from_tenant(tenant, timeline, &guard).await?; *guard = Self::Finished; From 6ca41d34380826d64ba5cec61d0a2514968e7d8d Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 24 Jul 2024 12:28:37 +0100 Subject: [PATCH 1261/1571] proxy: switch to leaky bucket (#8470) ## Problem The current bucket based rate limiter is not very intuitive and has some bad failure cases. ## Summary of changes Switches from fixed interval buckets to leaky bucket impl. A single bucket per endpoint, drains over time. Drains by checking the time since the last check, and draining tokens en-masse. Garbage collection works similar to before, it drains a shard (1/64th of the set) every 2048 checks, and it only removes buckets that are empty. To be compatible with the existing config, I've faffed to make it take the min and the max rps of each as the sustained rps and the max bucket size which should be roughly equivalent. 
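To make the draining behaviour concrete, here is a minimal, self-contained sketch of the
idea (simplified; the actual implementation added below in
`proxy/src/rate_limiter/leaky_bucket.rs` additionally does per-endpoint sharding and GC):

```rust
use std::time::Instant;

// One bucket: `filled` tokens drain at `rps` per second, capped at a `max` burst size.
struct Bucket {
    filled: f64,
    last_checked: Instant,
}

impl Bucket {
    fn check(&mut self, rps: f64, max: f64, cost: f64) -> bool {
        // Lazily drain everything that leaked out since the last call.
        let now = Instant::now();
        let drained = now.duration_since(self.last_checked).as_secs_f64() * rps;
        self.filled = (self.filled - drained).max(0.0);
        self.last_checked = now;

        // Admit the request only if it still fits under the burst cap.
        if self.filled + cost > max {
            return false;
        }
        self.filled += cost;
        true
    }
}

fn main() {
    // With the defaults this patch picks (600 rps sustained, 1500 burst), a cold
    // bucket admits a burst of 1500 requests and then roughly 600 per second.
    let mut bucket = Bucket { filled: 0.0, last_checked: Instant::now() };
    assert!(bucket.check(600.0, 1500.0, 1.0));
}
```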
--- proxy/src/auth/backend.rs | 18 ++- proxy/src/bin/proxy.rs | 25 +++- proxy/src/console/provider/neon.rs | 6 +- proxy/src/rate_limiter.rs | 6 +- proxy/src/rate_limiter/leaky_bucket.rs | 171 +++++++++++++++++++++++++ proxy/src/rate_limiter/limiter.rs | 12 +- 6 files changed, 220 insertions(+), 18 deletions(-) create mode 100644 proxy/src/rate_limiter/leaky_bucket.rs diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index f757a15fbb..67c4dd019e 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -717,8 +717,10 @@ mod tests { _ => panic!("wrong message"), } }); - let endpoint_rate_limiter = - Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET)); + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( + EndpointRateLimiter::DEFAULT, + 64, + )); let _creds = auth_quirks( &mut ctx, @@ -767,8 +769,10 @@ mod tests { frontend::password_message(b"my-secret-password", &mut write).unwrap(); client.write_all(&write).await.unwrap(); }); - let endpoint_rate_limiter = - Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET)); + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( + EndpointRateLimiter::DEFAULT, + 64, + )); let _creds = auth_quirks( &mut ctx, @@ -818,8 +822,10 @@ mod tests { client.write_all(&write).await.unwrap(); }); - let endpoint_rate_limiter = - Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET)); + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( + EndpointRateLimiter::DEFAULT, + 64, + )); let creds = auth_quirks( &mut ctx, diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 7314710508..c1fd6dfd80 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -22,7 +22,9 @@ use proxy::http; use proxy::http::health_server::AppMetrics; use proxy::metrics::Metrics; use proxy::rate_limiter::EndpointRateLimiter; +use proxy::rate_limiter::LeakyBucketConfig; use proxy::rate_limiter::RateBucketInfo; +use proxy::rate_limiter::WakeComputeRateLimiter; use proxy::redis::cancellation_publisher::RedisPublisherClient; use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; use proxy::redis::elasticache; @@ -390,9 +392,24 @@ async fn main() -> anyhow::Result<()> { proxy::metrics::CancellationSource::FromClient, )); - let mut endpoint_rps_limit = args.endpoint_rps_limit.clone(); - RateBucketInfo::validate(&mut endpoint_rps_limit)?; - let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(endpoint_rps_limit)); + // bit of a hack - find the min rps and max rps supported and turn it into + // leaky bucket config instead + let max = args + .endpoint_rps_limit + .iter() + .map(|x| x.rps()) + .max_by(f64::total_cmp) + .unwrap_or(EndpointRateLimiter::DEFAULT.max); + let rps = args + .endpoint_rps_limit + .iter() + .map(|x| x.rps()) + .min_by(f64::total_cmp) + .unwrap_or(EndpointRateLimiter::DEFAULT.rps); + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( + LeakyBucketConfig { rps, max }, + 64, + )); // client facing tasks. 
these will exit on error or on cancellation // cancellation returns Ok(()) @@ -594,7 +611,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); RateBucketInfo::validate(&mut wake_compute_rps_limit)?; let wake_compute_endpoint_rate_limiter = - Arc::new(EndpointRateLimiter::new(wake_compute_rps_limit)); + Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit)); let api = console::provider::neon::Api::new( endpoint, caches, diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index a6e67be22f..768cd2fdfa 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -12,7 +12,7 @@ use crate::{ console::messages::{ColdStartInfo, Reason}, http, metrics::{CacheOutcome, Metrics}, - rate_limiter::EndpointRateLimiter, + rate_limiter::WakeComputeRateLimiter, scram, EndpointCacheKey, }; use crate::{cache::Cached, context::RequestMonitoring}; @@ -26,7 +26,7 @@ pub struct Api { endpoint: http::Endpoint, pub caches: &'static ApiCaches, pub locks: &'static ApiLocks, - pub wake_compute_endpoint_rate_limiter: Arc, + pub wake_compute_endpoint_rate_limiter: Arc, jwt: String, } @@ -36,7 +36,7 @@ impl Api { endpoint: http::Endpoint, caches: &'static ApiCaches, locks: &'static ApiLocks, - wake_compute_endpoint_rate_limiter: Arc, + wake_compute_endpoint_rate_limiter: Arc, ) -> Self { let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") { Ok(v) => v, diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs index be9072dd8c..222cd431d2 100644 --- a/proxy/src/rate_limiter.rs +++ b/proxy/src/rate_limiter.rs @@ -3,4 +3,8 @@ mod limiter; pub use limit_algorithm::{ aimd::Aimd, DynamicLimiter, Outcome, RateLimitAlgorithm, RateLimiterConfig, Token, }; -pub use limiter::{BucketRateLimiter, EndpointRateLimiter, GlobalRateLimiter, RateBucketInfo}; +pub use limiter::{BucketRateLimiter, GlobalRateLimiter, RateBucketInfo, WakeComputeRateLimiter}; +mod leaky_bucket; +pub use leaky_bucket::{ + EndpointRateLimiter, LeakyBucketConfig, LeakyBucketRateLimiter, LeakyBucketState, +}; diff --git a/proxy/src/rate_limiter/leaky_bucket.rs b/proxy/src/rate_limiter/leaky_bucket.rs new file mode 100644 index 0000000000..2d5e056540 --- /dev/null +++ b/proxy/src/rate_limiter/leaky_bucket.rs @@ -0,0 +1,171 @@ +use std::{ + hash::Hash, + sync::atomic::{AtomicUsize, Ordering}, +}; + +use ahash::RandomState; +use dashmap::DashMap; +use rand::{thread_rng, Rng}; +use tokio::time::Instant; +use tracing::info; + +use crate::intern::EndpointIdInt; + +// Simple per-endpoint rate limiter. +pub type EndpointRateLimiter = LeakyBucketRateLimiter; + +pub struct LeakyBucketRateLimiter { + map: DashMap, + config: LeakyBucketConfig, + access_count: AtomicUsize, +} + +impl LeakyBucketRateLimiter { + pub const DEFAULT: LeakyBucketConfig = LeakyBucketConfig { + rps: 600.0, + max: 1500.0, + }; + + pub fn new_with_shards(config: LeakyBucketConfig, shards: usize) -> Self { + Self { + map: DashMap::with_hasher_and_shard_amount(RandomState::new(), shards), + config, + access_count: AtomicUsize::new(0), + } + } + + /// Check that number of connections to the endpoint is below `max_rps` rps. 
+ pub fn check(&self, key: K, n: u32) -> bool { + let now = Instant::now(); + + if self.access_count.fetch_add(1, Ordering::AcqRel) % 2048 == 0 { + self.do_gc(now); + } + + let mut entry = self.map.entry(key).or_insert_with(|| LeakyBucketState { + time: now, + filled: 0.0, + }); + + entry.check(&self.config, now, n as f64) + } + + fn do_gc(&self, now: Instant) { + info!( + "cleaning up bucket rate limiter, current size = {}", + self.map.len() + ); + let n = self.map.shards().len(); + let shard = thread_rng().gen_range(0..n); + self.map.shards()[shard] + .write() + .retain(|_, value| !value.get_mut().update(&self.config, now)); + } +} + +pub struct LeakyBucketConfig { + pub rps: f64, + pub max: f64, +} + +pub struct LeakyBucketState { + filled: f64, + time: Instant, +} + +impl LeakyBucketConfig { + pub fn new(rps: f64, max: f64) -> Self { + assert!(rps > 0.0, "rps must be positive"); + assert!(max > 0.0, "max must be positive"); + Self { rps, max } + } +} + +impl LeakyBucketState { + pub fn new() -> Self { + Self { + filled: 0.0, + time: Instant::now(), + } + } + + /// updates the timer and returns true if the bucket is empty + fn update(&mut self, info: &LeakyBucketConfig, now: Instant) -> bool { + let drain = now.duration_since(self.time); + let drain = drain.as_secs_f64() * info.rps; + + self.filled = (self.filled - drain).clamp(0.0, info.max); + self.time = now; + + self.filled == 0.0 + } + + pub fn check(&mut self, info: &LeakyBucketConfig, now: Instant, n: f64) -> bool { + self.update(info, now); + + if self.filled + n > info.max { + return false; + } + self.filled += n; + + true + } +} + +impl Default for LeakyBucketState { + fn default() -> Self { + Self::new() + } +} + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use tokio::time::Instant; + + use super::{LeakyBucketConfig, LeakyBucketState}; + + #[tokio::test(start_paused = true)] + async fn check() { + let info = LeakyBucketConfig::new(500.0, 2000.0); + let mut bucket = LeakyBucketState::new(); + + // should work for 2000 requests this second + for _ in 0..2000 { + assert!(bucket.check(&info, Instant::now(), 1.0)); + } + assert!(!bucket.check(&info, Instant::now(), 1.0)); + assert_eq!(bucket.filled, 2000.0); + + // in 1ms we should drain 0.5 tokens. 
+ // make sure we don't lose any tokens + tokio::time::advance(Duration::from_millis(1)).await; + assert!(!bucket.check(&info, Instant::now(), 1.0)); + tokio::time::advance(Duration::from_millis(1)).await; + assert!(bucket.check(&info, Instant::now(), 1.0)); + + // in 10ms we should drain 5 tokens + tokio::time::advance(Duration::from_millis(10)).await; + for _ in 0..5 { + assert!(bucket.check(&info, Instant::now(), 1.0)); + } + assert!(!bucket.check(&info, Instant::now(), 1.0)); + + // in 10s we should drain 5000 tokens + // but cap is only 2000 + tokio::time::advance(Duration::from_secs(10)).await; + for _ in 0..2000 { + assert!(bucket.check(&info, Instant::now(), 1.0)); + } + assert!(!bucket.check(&info, Instant::now(), 1.0)); + + // should sustain 500rps + for _ in 0..2000 { + tokio::time::advance(Duration::from_millis(10)).await; + for _ in 0..5 { + assert!(bucket.check(&info, Instant::now(), 1.0)); + } + } + } +} diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index b8c9490696..5db4efed37 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -61,7 +61,7 @@ impl GlobalRateLimiter { // Purposefully ignore user name and database name as clients can reconnect // with different names, so we'll end up sending some http requests to // the control plane. -pub type EndpointRateLimiter = BucketRateLimiter; +pub type WakeComputeRateLimiter = BucketRateLimiter; pub struct BucketRateLimiter { map: DashMap, Hasher>, @@ -103,7 +103,7 @@ pub struct RateBucketInfo { impl std::fmt::Display for RateBucketInfo { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let rps = (self.max_rpi as u64) * 1000 / self.interval.as_millis() as u64; + let rps = self.rps().floor() as u64; write!(f, "{rps}@{}", humantime::format_duration(self.interval)) } } @@ -140,6 +140,10 @@ impl RateBucketInfo { Self::new(200, Duration::from_secs(600)), ]; + pub fn rps(&self) -> f64 { + (self.max_rpi as f64) / self.interval.as_secs_f64() + } + pub fn validate(info: &mut [Self]) -> anyhow::Result<()> { info.sort_unstable_by_key(|info| info.interval); let invalid = info @@ -245,7 +249,7 @@ mod tests { use rustc_hash::FxHasher; use tokio::time; - use super::{BucketRateLimiter, EndpointRateLimiter}; + use super::{BucketRateLimiter, WakeComputeRateLimiter}; use crate::{intern::EndpointIdInt, rate_limiter::RateBucketInfo, EndpointId}; #[test] @@ -293,7 +297,7 @@ mod tests { .map(|s| s.parse().unwrap()) .collect(); RateBucketInfo::validate(&mut rates).unwrap(); - let limiter = EndpointRateLimiter::new(rates); + let limiter = WakeComputeRateLimiter::new(rates); let endpoint = EndpointId::from("ep-my-endpoint-1234"); let endpoint = EndpointIdInt::from(endpoint); From 6f22de5fc9f9e1bd2409d6aaba130bbc6c950d5e Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 24 Jul 2024 13:43:31 +0100 Subject: [PATCH 1262/1571] CI(build-and-test): move part of the pipeline to a reusable workflow (#8241) ## Problem - `build-and-test` workflow is pretty big - jobs that depend on the matrix job don't start before all variations are done. I.e. `regress-tests` depend on `build-neon`, but we can't start `regress-tests` on the release configuration until `build-neon` is done on release **and debug** configurations. This will be more visible once we add ARM to the matrix. 
## Summary of changes - Move jobs related to building (`build-neon`) and testing (`regress-tests`) to a separate job --- .github/workflows/_build-and-test-locally.yml | 285 ++++++++++++++++ .github/workflows/build_and_test.yml | 307 ++---------------- 2 files changed, 304 insertions(+), 288 deletions(-) create mode 100644 .github/workflows/_build-and-test-locally.yml diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml new file mode 100644 index 0000000000..843cc1aa48 --- /dev/null +++ b/.github/workflows/_build-and-test-locally.yml @@ -0,0 +1,285 @@ +name: Build and Test Locally + +on: + workflow_call: + inputs: + build-tag: + description: 'build tag' + required: true + type: string + build-tools-image: + description: 'build-tools image' + required: true + type: string + build-type: + description: 'debug or release' + required: true + type: string + +defaults: + run: + shell: bash -euxo pipefail {0} + +env: + RUST_BACKTRACE: 1 + COPT: '-Werror' + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} + +jobs: + build-neon: + runs-on: [ self-hosted, gen3, large ] + container: + image: ${{ inputs.build-tools-image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + # Raise locked memory limit for tokio-epoll-uring. + # On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12), + # io_uring will account the memory of the CQ and SQ as locked. + # More details: https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391 + options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 + env: + BUILD_TYPE: ${{ inputs.build-type }} + GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }} + BUILD_TAG: ${{ inputs.build-tag }} + + steps: + - name: Fix git ownership + run: | + # Workaround for `fatal: detected dubious ownership in repository at ...` + # + # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers + # Ref https://github.com/actions/checkout/issues/785 + # + git config --global --add safe.directory ${{ github.workspace }} + git config --global --add safe.directory ${GITHUB_WORKSPACE} + for r in 14 15 16; do + git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" + git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" + done + + - uses: actions/checkout@v4 + with: + submodules: true + fetch-depth: 1 + + - name: Set pg 14 revision for caching + id: pg_v14_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT + + - name: Set pg 15 revision for caching + id: pg_v15_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT + + - name: Set pg 16 revision for caching + id: pg_v16_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT + + # Set some environment variables used by all the steps. + # + # CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc. + # It also includes --features, if any + # + # CARGO_FEATURES is passed to "cargo metadata". It is separate from CARGO_FLAGS, + # because "cargo metadata" doesn't accept --release or --debug options + # + # We run tests with addtional features, that are turned off by default (e.g. in release builds), see + # corresponding Cargo.toml files for their descriptions. 
+ - name: Set env variables + run: | + CARGO_FEATURES="--features testing" + if [[ $BUILD_TYPE == "debug" ]]; then + cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run" + CARGO_FLAGS="--locked" + elif [[ $BUILD_TYPE == "release" ]]; then + cov_prefix="" + CARGO_FLAGS="--locked --release" + fi + { + echo "cov_prefix=${cov_prefix}" + echo "CARGO_FEATURES=${CARGO_FEATURES}" + echo "CARGO_FLAGS=${CARGO_FLAGS}" + echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" + } >> $GITHUB_ENV + + - name: Cache postgres v14 build + id: cache_pg_14 + uses: actions/cache@v4 + with: + path: pg_install/v14 + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + + - name: Cache postgres v15 build + id: cache_pg_15 + uses: actions/cache@v4 + with: + path: pg_install/v15 + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + + - name: Cache postgres v16 build + id: cache_pg_16 + uses: actions/cache@v4 + with: + path: pg_install/v16 + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + + - name: Build postgres v14 + if: steps.cache_pg_14.outputs.cache-hit != 'true' + run: mold -run make postgres-v14 -j$(nproc) + + - name: Build postgres v15 + if: steps.cache_pg_15.outputs.cache-hit != 'true' + run: mold -run make postgres-v15 -j$(nproc) + + - name: Build postgres v16 + if: steps.cache_pg_16.outputs.cache-hit != 'true' + run: mold -run make postgres-v16 -j$(nproc) + + - name: Build neon extensions + run: mold -run make neon-pg-ext -j$(nproc) + + - name: Build walproposer-lib + run: mold -run make walproposer-lib -j$(nproc) + + - name: Run cargo build + run: | + PQ_LIB_DIR=$(pwd)/pg_install/v16/lib + export PQ_LIB_DIR + ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests + + # Do install *before* running rust tests because they might recompile the + # binaries with different features/flags. + - name: Install rust binaries + run: | + # Install target binaries + mkdir -p /tmp/neon/bin/ + binaries=$( + ${cov_prefix} cargo metadata $CARGO_FEATURES --format-version=1 --no-deps | + jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name' + ) + for bin in $binaries; do + SRC=target/$BUILD_TYPE/$bin + DST=/tmp/neon/bin/$bin + cp "$SRC" "$DST" + done + + # Install test executables and write list of all binaries (for code coverage) + if [[ $BUILD_TYPE == "debug" ]]; then + # Keep bloated coverage data files away from the rest of the artifact + mkdir -p /tmp/coverage/ + + mkdir -p /tmp/neon/test_bin/ + + test_exe_paths=$( + ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES --message-format=json --no-run | + jq -r '.executable | select(. != null)' + ) + for bin in $test_exe_paths; do + SRC=$bin + DST=/tmp/neon/test_bin/$(basename $bin) + + # We don't need debug symbols for code coverage, so strip them out to make + # the artifact smaller. 
+ strip "$SRC" -o "$DST" + echo "$DST" >> /tmp/coverage/binaries.list + done + + for bin in $binaries; do + echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list + done + fi + + - name: Run rust tests + env: + NEXTEST_RETRIES: 3 + run: | + PQ_LIB_DIR=$(pwd)/pg_install/v16/lib + export PQ_LIB_DIR + LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib + export LD_LIBRARY_PATH + + #nextest does not yet support running doctests + cargo test --doc $CARGO_FLAGS $CARGO_FEATURES + + for io_engine in std-fs tokio-epoll-uring ; do + NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES + done + + # Run separate tests for real S3 + export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty + export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests + export REMOTE_STORAGE_S3_REGION=eu-central-1 + ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_s3)' + + # Run separate tests for real Azure Blob Storage + # XXX: replace region with `eu-central-1`-like region + export ENABLE_REAL_AZURE_REMOTE_STORAGE=y + export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}" + export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}" + export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}" + export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}" + ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_azure)' + + - name: Install postgres binaries + run: cp -a pg_install /tmp/neon/pg_install + + - name: Upload Neon artifact + uses: ./.github/actions/upload + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-artifact + path: /tmp/neon + + # XXX: keep this after the binaries.list is formed, so the coverage can properly work later + - name: Merge and upload coverage data + if: inputs.build-type == 'debug' + uses: ./.github/actions/save-coverage-data + + regress-tests: + needs: [ build-neon ] + runs-on: [ self-hosted, gen3, large ] + container: + image: ${{ inputs.build-tools-image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + # for changed limits, see comments on `options:` earlier in this file + options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 + strategy: + fail-fast: false + matrix: + pg_version: [ v14, v15, v16 ] + steps: + - uses: actions/checkout@v4 + with: + submodules: true + fetch-depth: 1 + + - name: Pytest regression tests + uses: ./.github/actions/run-python-test-set + timeout-minutes: 60 + with: + build_type: ${{ inputs.build-type }} + test_selection: regress + needs_postgres_source: true + run_with_real_s3: true + real_s3_bucket: neon-github-ci-tests + real_s3_region: eu-central-1 + rerun_flaky: true + pg_version: ${{ matrix.pg_version }} + env: + TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} + CHECK_ONDISK_DATA_COMPATIBILITY: nonempty + BUILD_TAG: ${{ inputs.build-tag }} + PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring + PAGESERVER_GET_VECTORED_IMPL: vectored + PAGESERVER_GET_IMPL: vectored + PAGESERVER_VALIDATE_VEC_GET: true + + # Temporary disable this step until we figure out why it's so flaky + # Ref https://github.com/neondatabase/neon/issues/4540 + - name: Merge and upload coverage data + if: | + false && + inputs.build-type == 'debug' && matrix.pg_version == 'v14' + uses: ./.github/actions/save-coverage-data diff 
--git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index fb456ce3ff..dd8820c865 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -193,291 +193,23 @@ jobs: if: ${{ !cancelled() }} run: cargo deny check --hide-inclusion-graph - build-neon: - needs: [ check-permissions, tag, build-build-tools-image ] - runs-on: [ self-hosted, gen3, large ] - container: - image: ${{ needs.build-build-tools-image.outputs.image }} - credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - # Raise locked memory limit for tokio-epoll-uring. - # On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12), - # io_uring will account the memory of the CQ and SQ as locked. - # More details: https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391 - options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 + build-and-test-locally: + needs: [ tag, build-build-tools-image ] strategy: fail-fast: false matrix: - build_type: [ debug, release ] - env: - BUILD_TYPE: ${{ matrix.build_type }} - GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }} - BUILD_TAG: ${{ needs.tag.outputs.build-tag }} + build-type: [ debug, release ] - steps: - - name: Fix git ownership - run: | - # Workaround for `fatal: detected dubious ownership in repository at ...` - # - # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers - # Ref https://github.com/actions/checkout/issues/785 - # - git config --global --add safe.directory ${{ github.workspace }} - git config --global --add safe.directory ${GITHUB_WORKSPACE} - for r in 14 15 16; do - git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" - git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" - done - - - name: Checkout - uses: actions/checkout@v4 - with: - submodules: true - fetch-depth: 1 - - - name: Set pg 14 revision for caching - id: pg_v14_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT - - - name: Set pg 15 revision for caching - id: pg_v15_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT - - - name: Set pg 16 revision for caching - id: pg_v16_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT - - # Set some environment variables used by all the steps. - # - # CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc. - # It also includes --features, if any - # - # CARGO_FEATURES is passed to "cargo metadata". It is separate from CARGO_FLAGS, - # because "cargo metadata" doesn't accept --release or --debug options - # - # We run tests with addtional features, that are turned off by default (e.g. in release builds), see - # corresponding Cargo.toml files for their descriptions. - - name: Set env variables - run: | - CARGO_FEATURES="--features testing" - if [[ $BUILD_TYPE == "debug" ]]; then - cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run" - CARGO_FLAGS="--locked" - elif [[ $BUILD_TYPE == "release" ]]; then - cov_prefix="" - CARGO_FLAGS="--locked --release" - fi - { - echo "cov_prefix=${cov_prefix}" - echo "CARGO_FEATURES=${CARGO_FEATURES}" - echo "CARGO_FLAGS=${CARGO_FLAGS}" - echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" - } >> $GITHUB_ENV - - # Disabled for now - # Don't include the ~/.cargo/registry/src directory. 
It contains just - # uncompressed versions of the crates in ~/.cargo/registry/cache - # directory, and it's faster to let 'cargo' to rebuild it from the - # compressed crates. -# - name: Cache cargo deps -# id: cache_cargo -# uses: actions/cache@v4 -# with: -# path: | -# ~/.cargo/registry/ -# !~/.cargo/registry/src -# ~/.cargo/git/ -# target/ -# # Fall back to older versions of the key, if no cache for current Cargo.lock was found -# key: | -# v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} -# v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}- - - - name: Cache postgres v14 build - id: cache_pg_14 - uses: actions/cache@v4 - with: - path: pg_install/v14 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - - - name: Cache postgres v15 build - id: cache_pg_15 - uses: actions/cache@v4 - with: - path: pg_install/v15 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - - - name: Cache postgres v16 build - id: cache_pg_16 - uses: actions/cache@v4 - with: - path: pg_install/v16 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - - - name: Build postgres v14 - if: steps.cache_pg_14.outputs.cache-hit != 'true' - run: mold -run make postgres-v14 -j$(nproc) - - - name: Build postgres v15 - if: steps.cache_pg_15.outputs.cache-hit != 'true' - run: mold -run make postgres-v15 -j$(nproc) - - - name: Build postgres v16 - if: steps.cache_pg_16.outputs.cache-hit != 'true' - run: mold -run make postgres-v16 -j$(nproc) - - - name: Build neon extensions - run: mold -run make neon-pg-ext -j$(nproc) - - - name: Build walproposer-lib - run: mold -run make walproposer-lib -j$(nproc) - - - name: Run cargo build - run: | - PQ_LIB_DIR=$(pwd)/pg_install/v16/lib - export PQ_LIB_DIR - ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests - - # Do install *before* running rust tests because they might recompile the - # binaries with different features/flags. - - name: Install rust binaries - run: | - # Install target binaries - mkdir -p /tmp/neon/bin/ - binaries=$( - ${cov_prefix} cargo metadata $CARGO_FEATURES --format-version=1 --no-deps | - jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name' - ) - for bin in $binaries; do - SRC=target/$BUILD_TYPE/$bin - DST=/tmp/neon/bin/$bin - cp "$SRC" "$DST" - done - - # Install test executables and write list of all binaries (for code coverage) - if [[ $BUILD_TYPE == "debug" ]]; then - # Keep bloated coverage data files away from the rest of the artifact - mkdir -p /tmp/coverage/ - - mkdir -p /tmp/neon/test_bin/ - - test_exe_paths=$( - ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES --message-format=json --no-run | - jq -r '.executable | select(. != null)' - ) - for bin in $test_exe_paths; do - SRC=$bin - DST=/tmp/neon/test_bin/$(basename $bin) - - # We don't need debug symbols for code coverage, so strip them out to make - # the artifact smaller. 
- strip "$SRC" -o "$DST" - echo "$DST" >> /tmp/coverage/binaries.list - done - - for bin in $binaries; do - echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list - done - fi - - - name: Run rust tests - env: - NEXTEST_RETRIES: 3 - run: | - PQ_LIB_DIR=$(pwd)/pg_install/v16/lib - export PQ_LIB_DIR - LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib - export LD_LIBRARY_PATH - - #nextest does not yet support running doctests - cargo test --doc $CARGO_FLAGS $CARGO_FEATURES - - for io_engine in std-fs tokio-epoll-uring ; do - NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES - done - - # Run separate tests for real S3 - export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty - export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests - export REMOTE_STORAGE_S3_REGION=eu-central-1 - ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_s3)' - - # Run separate tests for real Azure Blob Storage - # XXX: replace region with `eu-central-1`-like region - export ENABLE_REAL_AZURE_REMOTE_STORAGE=y - export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}" - export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}" - export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}" - export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}" - ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_azure)' - - - name: Install postgres binaries - run: cp -a pg_install /tmp/neon/pg_install - - - name: Upload Neon artifact - uses: ./.github/actions/upload - with: - name: neon-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-artifact - path: /tmp/neon - - # XXX: keep this after the binaries.list is formed, so the coverage can properly work later - - name: Merge and upload coverage data - if: matrix.build_type == 'debug' - uses: ./.github/actions/save-coverage-data - - regress-tests: - needs: [ check-permissions, build-neon, build-build-tools-image, tag ] - runs-on: [ self-hosted, gen3, large ] - container: - image: ${{ needs.build-build-tools-image.outputs.image }} - credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - # for changed limits, see comments on `options:` earlier in this file - options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 - strategy: - fail-fast: false - matrix: - build_type: [ debug, release ] - pg_version: [ v14, v15, v16 ] - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - submodules: true - fetch-depth: 1 - - - name: Pytest regression tests - uses: ./.github/actions/run-python-test-set - timeout-minutes: 60 - with: - build_type: ${{ matrix.build_type }} - test_selection: regress - needs_postgres_source: true - run_with_real_s3: true - real_s3_bucket: neon-github-ci-tests - real_s3_region: eu-central-1 - rerun_flaky: true - pg_version: ${{ matrix.pg_version }} - env: - TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} - CHECK_ONDISK_DATA_COMPATIBILITY: nonempty - BUILD_TAG: ${{ needs.tag.outputs.build-tag }} - PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring - PAGESERVER_GET_VECTORED_IMPL: vectored - PAGESERVER_GET_IMPL: vectored - PAGESERVER_VALIDATE_VEC_GET: true - - # Temporary disable this step until we figure out why it's so flaky - # Ref https://github.com/neondatabase/neon/issues/4540 - - name: Merge and upload coverage data - 
if: | - false && - matrix.build_type == 'debug' && matrix.pg_version == 'v14' - uses: ./.github/actions/save-coverage-data + uses: ./.github/workflows/_build-and-test-locally.yml + with: + build-tools-image: ${{ needs.build-build-tools-image.outputs.image }} + build-tag: ${{ needs.tag.outputs.build-tag }} + build-type: ${{ matrix.build-type }} + secrets: inherit + # Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking get-benchmarks-durations: + if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') outputs: json: ${{ steps.get-benchmark-durations.outputs.json }} needs: [ check-permissions, build-build-tools-image ] @@ -488,7 +220,6 @@ jobs: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init - if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') steps: - name: Checkout uses: actions/checkout@v4 @@ -513,7 +244,8 @@ jobs: echo "json=$(jq --compact-output '.' /tmp/benchmark_durations.json)" >> $GITHUB_OUTPUT benchmarks: - needs: [ check-permissions, build-neon, build-build-tools-image, get-benchmarks-durations ] + if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') + needs: [ check-permissions, build-and-test-locally, build-build-tools-image, get-benchmarks-durations ] runs-on: [ self-hosted, gen3, small ] container: image: ${{ needs.build-build-tools-image.outputs.image }} @@ -522,7 +254,6 @@ jobs: password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} # for changed limits, see comments on `options:` earlier in this file options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 - if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') strategy: fail-fast: false matrix: @@ -570,7 +301,7 @@ jobs: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} create-test-report: - needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-build-tools-image ] + needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image ] if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }} outputs: report-url: ${{ steps.create-allure-report.outputs.report-url }} @@ -621,7 +352,7 @@ jobs: }) coverage-report: - needs: [ check-permissions, regress-tests, build-build-tools-image ] + needs: [ check-permissions, build-build-tools-image, build-and-test-locally ] runs-on: [ self-hosted, gen3, small ] container: image: ${{ needs.build-build-tools-image.outputs.image }} @@ -1226,7 +957,7 @@ jobs: exit 1 deploy: - needs: [ check-permissions, promote-images, tag, regress-tests, trigger-custom-extensions-build-and-wait ] + needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait ] if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy' runs-on: [ self-hosted, gen3, small ] @@ -1327,7 +1058,7 @@ jobs: }) promote-compatibility-data: - needs: [ check-permissions, promote-images, tag, regress-tests ] + needs: [ check-permissions, promote-images, tag, build-and-test-locally ] if: github.ref_name == 'release' runs-on: [ self-hosted, gen3, small ] @@ -1366,7 +1097,7 @@ jobs: done pin-build-tools-image: - needs: [ build-build-tools-image, promote-images, regress-tests ] + needs: [ build-build-tools-image, promote-images, build-and-test-locally ] if: 
github.ref_name == 'main' uses: ./.github/workflows/pin-build-tools-image.yml with: @@ -1388,7 +1119,7 @@ jobs: needs: - check-codestyle-python - check-codestyle-rust - - regress-tests + - build-and-test-locally - test-images runs-on: ubuntu-22.04 steps: From 2723a8156ac6df877a01ea4b16b0cd9577e4e1e0 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 24 Jul 2024 14:23:03 +0100 Subject: [PATCH 1263/1571] pageserver: faster and simpler inmem layer vec read (#8469) ## Problem The in-memory layer vectored read was very slow in some conditions (walingest::test_large_rel) test. Upon profiling, I realised that 80% of the time was spent building up the binary heap of reads. This stage isn't actually needed. ## Summary of changes Remove the planning stage as we never took advantage of it in order to merge reads. There should be no functional change from this patch. --- .../tenant/storage_layer/inmemory_layer.rs | 70 ++++++------------- 1 file changed, 22 insertions(+), 48 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 5941a52e98..f9010ae8a6 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -18,7 +18,7 @@ use anyhow::{anyhow, ensure, Result}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; -use std::collections::{BTreeMap, BinaryHeap, HashSet}; +use std::collections::BTreeMap; use std::sync::{Arc, OnceLock}; use std::time::Instant; use tracing::*; @@ -375,15 +375,6 @@ impl InMemoryLayer { let inner = self.inner.read().await; let reader = inner.file.block_cursor(); - #[derive(Eq, PartialEq, Ord, PartialOrd)] - struct BlockRead { - key: Key, - lsn: Lsn, - block_offset: u64, - } - - let mut planned_block_reads = BinaryHeap::new(); - for range in keyspace.ranges.iter() { for (key, vec_map) in inner.index.range(range.start..range.end) { let lsn_range = match reconstruct_state.get_cached_lsn(key) { @@ -392,49 +383,32 @@ impl InMemoryLayer { }; let slice = vec_map.slice_range(lsn_range); + for (entry_lsn, pos) in slice.iter().rev() { - planned_block_reads.push(BlockRead { - key: *key, - lsn: *entry_lsn, - block_offset: *pos, - }); + // TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183 + let buf = reader.read_blob(*pos, &ctx).await; + if let Err(e) = buf { + reconstruct_state + .on_key_error(*key, PageReconstructError::from(anyhow!(e))); + break; + } + + let value = Value::des(&buf.unwrap()); + if let Err(e) = value { + reconstruct_state + .on_key_error(*key, PageReconstructError::from(anyhow!(e))); + break; + } + + let key_situation = + reconstruct_state.update_key(key, *entry_lsn, value.unwrap()); + if key_situation == ValueReconstructSituation::Complete { + break; + } } } } - let keyspace_size = keyspace.total_raw_size(); - - let mut completed_keys = HashSet::new(); - while completed_keys.len() < keyspace_size && !planned_block_reads.is_empty() { - let block_read = planned_block_reads.pop().unwrap(); - if completed_keys.contains(&block_read.key) { - continue; - } - - // TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183 - let buf = reader.read_blob(block_read.block_offset, &ctx).await; - if let Err(e) = buf { - reconstruct_state - .on_key_error(block_read.key, PageReconstructError::from(anyhow!(e))); - completed_keys.insert(block_read.key); - continue; - } - - let value = 
Value::des(&buf.unwrap()); - if let Err(e) = value { - reconstruct_state - .on_key_error(block_read.key, PageReconstructError::from(anyhow!(e))); - completed_keys.insert(block_read.key); - continue; - } - - let key_situation = - reconstruct_state.update_key(&block_read.key, block_read.lsn, value.unwrap()); - if key_situation == ValueReconstructSituation::Complete { - completed_keys.insert(block_read.key); - } - } - reconstruct_state.on_lsn_advanced(&keyspace, self.start_lsn); Ok(()) From 5f4e14d27d46a0c35bf4efc25d6a78ccbc7b3097 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 24 Jul 2024 16:37:15 +0100 Subject: [PATCH 1264/1571] pageserver: fix a compilation error (#8487) ## Problem PR that modified compaction raced with PR that modified the GcInfo structure ## Summary of changes Fix it Co-authored-by: Vlad Lazar --- pageserver/src/tenant.rs | 5 ++++- pageserver/src/tenant/timeline/compaction.rs | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index a98a32de35..f83c7021e3 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -7470,7 +7470,10 @@ mod tests { // Update GC info let mut guard = tline.gc_info.write().unwrap(); *guard = GcInfo { - retain_lsns: vec![Lsn(0x10), Lsn(0x20)], + retain_lsns: vec![ + (Lsn(0x10), tline.timeline_id), + (Lsn(0x20), tline.timeline_id), + ], cutoffs: GcCutoffs { time: Lsn(0x30), space: Lsn(0x30), diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index fb8c125b60..d0a74e3924 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1254,7 +1254,7 @@ impl Timeline { let gc_info = self.gc_info.read().unwrap(); let mut retain_lsns_below_horizon = Vec::new(); let gc_cutoff = gc_info.cutoffs.select_min(); - for lsn in &gc_info.retain_lsns { + for (lsn, _timeline_id) in &gc_info.retain_lsns { if lsn < &gc_cutoff { retain_lsns_below_horizon.push(*lsn); } From 99b1a1dfb6ee64973552f0169fe1e5e07096ec8c Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 24 Jul 2024 18:50:49 +0300 Subject: [PATCH 1265/1571] devx: nicer diff hunk headers (#8482) By default git does not find a nice hunk header with rust. New(er) versions ship with a handy xfuncname pattern, so lets enable that for all developers. Example of how this should help: https://github.com/rust-lang/rust/commit/39046172ab91805efb79a55870c2ced2d61cfc3a --- .gitattributes | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 .gitattributes diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000..6ba6b3c887 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +# allows for nicer hunk headers with git show +*.rs diff=rust From 6fc27265683d6d1bcfb123b6268f3c33f002aa0c Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 24 Jul 2024 23:05:32 +0100 Subject: [PATCH 1266/1571] CI: Run ARM checks in the main pipeline (#8185) ## Problem Jobs `check-linux-arm-build` and `check-codestyle-rust-arm` (from `.github/workflows/neon_extra_builds.yml`) duplicate `build-neon` and `check-codestyle-rust` jobs in the main pipeline. 
## Summary of changes - Move `check-linux-arm-build` and `check-codestyle-rust-arm` from extra builds to the main pipeline --- .github/workflows/_build-and-test-locally.yml | 10 +- .github/workflows/build_and_test.yml | 12 +- .github/workflows/neon_extra_builds.yml | 215 ------------------ 3 files changed, 18 insertions(+), 219 deletions(-) diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 843cc1aa48..35c6251304 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -3,6 +3,10 @@ name: Build and Test Locally on: workflow_call: inputs: + arch: + description: 'x64 or arm64' + required: true + type: string build-tag: description: 'build tag' required: true @@ -28,7 +32,7 @@ env: jobs: build-neon: - runs-on: [ self-hosted, gen3, large ] + runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }} container: image: ${{ inputs.build-tools-image }} credentials: @@ -236,8 +240,10 @@ jobs: uses: ./.github/actions/save-coverage-data regress-tests: + # Run test on x64 only + if: inputs.arch == 'x64' needs: [ build-neon ] - runs-on: [ self-hosted, gen3, large ] + runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }} container: image: ${{ inputs.build-tools-image }} credentials: diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index dd8820c865..d4af174fc5 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -125,7 +125,11 @@ jobs: check-codestyle-rust: needs: [ check-permissions, build-build-tools-image ] - runs-on: [ self-hosted, gen3, small ] + strategy: + matrix: + arch: [ x64, arm64 ] + runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} + container: image: ${{ needs.build-build-tools-image.outputs.image }} credentials: @@ -198,10 +202,14 @@ jobs: strategy: fail-fast: false matrix: + arch: [ x64 ] build-type: [ debug, release ] - + include: + - build-type: release + arch: arm64 uses: ./.github/workflows/_build-and-test-locally.yml with: + arch: ${{ matrix.arch }} build-tools-image: ${{ needs.build-build-tools-image.outputs.image }} build-tag: ${{ needs.tag.outputs.build-tag }} build-type: ${{ matrix.build-type }} diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 11ff634b6c..d4870e16ad 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -133,221 +133,6 @@ jobs: - name: Check that no warnings are produced run: ./run_clippy.sh - check-linux-arm-build: - needs: [ check-permissions, build-build-tools-image ] - timeout-minutes: 90 - runs-on: [ self-hosted, small-arm64 ] - - env: - # Use release build only, to have less debug info around - # Hence keeping target/ (and general cache size) smaller - BUILD_TYPE: release - CARGO_FEATURES: --features testing - CARGO_FLAGS: --release - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} - - container: - image: ${{ needs.build-build-tools-image.outputs.image }} - credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - options: --init - - steps: - - name: Fix git ownership - run: | - # Workaround for `fatal: detected dubious ownership in repository at ...` - # - # Use both ${{ 
github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers - # Ref https://github.com/actions/checkout/issues/785 - # - git config --global --add safe.directory ${{ github.workspace }} - git config --global --add safe.directory ${GITHUB_WORKSPACE} - for r in 14 15 16; do - git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" - git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" - done - - - name: Checkout - uses: actions/checkout@v4 - with: - submodules: true - fetch-depth: 1 - - - name: Set pg 14 revision for caching - id: pg_v14_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT - - - name: Set pg 15 revision for caching - id: pg_v15_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT - - - name: Set pg 16 revision for caching - id: pg_v16_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT - - - name: Set env variables - run: | - echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" >> $GITHUB_ENV - - - name: Cache postgres v14 build - id: cache_pg_14 - uses: actions/cache@v4 - with: - path: pg_install/v14 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - - name: Cache postgres v15 build - id: cache_pg_15 - uses: actions/cache@v4 - with: - path: pg_install/v15 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - - name: Cache postgres v16 build - id: cache_pg_16 - uses: actions/cache@v4 - with: - path: pg_install/v16 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - - name: Build postgres v14 - if: steps.cache_pg_14.outputs.cache-hit != 'true' - run: mold -run make postgres-v14 -j$(nproc) - - - name: Build postgres v15 - if: steps.cache_pg_15.outputs.cache-hit != 'true' - run: mold -run make postgres-v15 -j$(nproc) - - - name: Build postgres v16 - if: steps.cache_pg_16.outputs.cache-hit != 'true' - run: mold -run make postgres-v16 -j$(nproc) - - - name: Build neon extensions - run: mold -run make neon-pg-ext -j$(nproc) - - - name: Build walproposer-lib - run: mold -run make walproposer-lib -j$(nproc) - - - name: Run cargo build - run: | - PQ_LIB_DIR=$(pwd)/pg_install/v16/lib - export PQ_LIB_DIR - mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests -j$(nproc) - - - name: Run cargo test - env: - NEXTEST_RETRIES: 3 - run: | - PQ_LIB_DIR=$(pwd)/pg_install/v16/lib - export PQ_LIB_DIR - LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib - export LD_LIBRARY_PATH - - cargo nextest run $CARGO_FEATURES -j$(nproc) - - # Run separate tests for real S3 - export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty - export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests - export REMOTE_STORAGE_S3_REGION=eu-central-1 - # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - cargo nextest run --package remote_storage --test test_real_s3 -j$(nproc) - - # Run separate tests for real Azure Blob Storage - # XXX: replace region with `eu-central-1`-like region - export ENABLE_REAL_AZURE_REMOTE_STORAGE=y - export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}" - export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}" - export REMOTE_STORAGE_AZURE_CONTAINER="${{ 
vars.REMOTE_STORAGE_AZURE_CONTAINER }}" - export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}" - # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - cargo nextest run --package remote_storage --test test_real_azure -j$(nproc) - - check-codestyle-rust-arm: - needs: [ check-permissions, build-build-tools-image ] - timeout-minutes: 90 - runs-on: [ self-hosted, small-arm64 ] - - container: - image: ${{ needs.build-build-tools-image.outputs.image }} - credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - options: --init - - strategy: - fail-fast: false - matrix: - build_type: [ debug, release ] - - steps: - - name: Fix git ownership - run: | - # Workaround for `fatal: detected dubious ownership in repository at ...` - # - # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers - # Ref https://github.com/actions/checkout/issues/785 - # - git config --global --add safe.directory ${{ github.workspace }} - git config --global --add safe.directory ${GITHUB_WORKSPACE} - for r in 14 15 16; do - git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" - git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" - done - - - name: Checkout - uses: actions/checkout@v4 - with: - submodules: true - fetch-depth: 1 - - # Some of our rust modules use FFI and need those to be checked - - name: Get postgres headers - run: make postgres-headers -j$(nproc) - - # cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations. - # This will catch compiler & clippy warnings in all feature combinations. - # TODO: use cargo hack for build and test as well, but, that's quite expensive. 
- # NB: keep clippy args in sync with ./run_clippy.sh - - run: | - CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")" - if [ "$CLIPPY_COMMON_ARGS" = "" ]; then - echo "No clippy args found in .neon_clippy_args" - exit 1 - fi - echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV - - - name: Run cargo clippy (debug) - if: matrix.build_type == 'debug' - run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS - - name: Run cargo clippy (release) - if: matrix.build_type == 'release' - run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS - - - name: Check documentation generation - if: matrix.build_type == 'release' - run: cargo doc --workspace --no-deps --document-private-items -j$(nproc) - env: - RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links" - - # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run - - name: Check formatting - if: ${{ !cancelled() && matrix.build_type == 'release' }} - run: cargo fmt --all -- --check - - # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci - - name: Check rust dependencies - if: ${{ !cancelled() && matrix.build_type == 'release' }} - run: | - cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date - cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack - - # https://github.com/EmbarkStudios/cargo-deny - - name: Check rust licenses/bans/advisories/sources - if: ${{ !cancelled() && matrix.build_type == 'release' }} - run: cargo deny check - gather-rust-build-stats: needs: [ check-permissions, build-build-tools-image ] if: | From d57412aaab4e6cf0d42bd13325935233e2c561cd Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 25 Jul 2024 10:24:28 +0200 Subject: [PATCH 1267/1571] followup(#8359): pre-initialize circuitbreaker metrics (#8491) --- pageserver/src/metrics.rs | 2 ++ test_runner/fixtures/metrics.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index c03567f6ef..9aff5220f5 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -3104,6 +3104,8 @@ pub fn preinitialize_metrics() { &tokio_epoll_uring::THREAD_LOCAL_LAUNCH_SUCCESSES, &REMOTE_ONDEMAND_DOWNLOADED_LAYERS, &REMOTE_ONDEMAND_DOWNLOADED_BYTES, + &CIRCUIT_BREAKERS_BROKEN, + &CIRCUIT_BREAKERS_UNBROKEN, ] .into_iter() .for_each(|c| { diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 4836d42db5..509f41366b 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -133,6 +133,8 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = ( *histogram("pageserver_remote_operation_seconds"), *histogram("pageserver_io_operations_seconds"), "pageserver_tenant_states_count", + "pageserver_circuit_breaker_broken_total", + "pageserver_circuit_breaker_unbroken_total", ) PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( From a1256b2a67c2ddf363594a8cb8e73739bef64c9b Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 25 Jul 2024 10:44:31 +0200 Subject: [PATCH 1268/1571] fix: remote timeline client shutdown trips circuit breaker (#8495) Before this PR 1.The circuit breaker would trip on CompactionError::Shutdown. That's wrong, we want to ignore those cases. 2. remote timeline client shutdown would not be mapped to CompactionError::Shutdown in all circumstances. 
We observed this in staging, see https://neondb.slack.com/archives/C033RQ5SPDH/p1721829745384449 This PR fixes (1) with a simple `match` statement, and (2) by switching a bunch of `anyhow` usage over to distinguished errors that ultimately get mapped to `CompactionError::Shutdown`. I removed the implicit `#[from]` conversion from `anyhow::Error` to `CompactionError::Other` to discover all the places that were mapping remote timeline client shutdown to `anyhow::Error`. In my opinion `#[from]` is an antipattern and we should avoid it, especially for `anyhow::Error`. If some callee is going to return anyhow, the very least the caller should to is to acknowledge, through a `map_err(MyError::Other)` that they're conflating different failure reasons. --- pageserver/src/http/routes.rs | 8 +- pageserver/src/tenant.rs | 22 +++--- pageserver/src/tenant/layer_map.rs | 4 +- .../src/tenant/remote_timeline_client.rs | 38 +++++++--- pageserver/src/tenant/storage_layer/layer.rs | 24 ++++-- pageserver/src/tenant/timeline.rs | 42 +++++++++-- pageserver/src/tenant/timeline/compaction.rs | 74 ++++++++++++++----- .../src/tenant/timeline/detach_ancestor.rs | 2 +- pageserver/src/tenant/upload_queue.rs | 2 +- 9 files changed, 158 insertions(+), 58 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index d63c240365..7935aeb5e9 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1650,7 +1650,9 @@ async fn timeline_compact_handler( .await .map_err(|e| ApiError::InternalServerError(e.into()))?; if wait_until_uploaded { - timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?; + timeline.remote_client.wait_completion().await + // XXX map to correct ApiError for the cases where it's due to shutdown + .context("wait completion").map_err(ApiError::InternalServerError)?; } json_response(StatusCode::OK, ()) } @@ -1709,7 +1711,9 @@ async fn timeline_checkpoint_handler( } if wait_until_uploaded { - timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?; + timeline.remote_client.wait_completion().await + // XXX map to correct ApiError for the cases where it's due to shutdown + .context("wait completion").map_err(ApiError::InternalServerError)?; } json_response(StatusCode::OK, ()) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index f83c7021e3..f359326cc0 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1620,7 +1620,7 @@ impl Tenant { &self, cancel: &CancellationToken, ctx: &RequestContext, - ) -> anyhow::Result<(), timeline::CompactionError> { + ) -> Result<(), timeline::CompactionError> { // Don't start doing work during shutdown, or when broken, we do not need those in the logs if !self.is_active() { return Ok(()); @@ -1665,12 +1665,14 @@ impl Tenant { .compact(cancel, EnumSet::empty(), ctx) .instrument(info_span!("compact_timeline", %timeline_id)) .await - .map_err(|e| { - self.compaction_circuit_breaker - .lock() - .unwrap() - .fail(&CIRCUIT_BREAKERS_BROKEN, &e); - e + .inspect_err(|e| match e { + timeline::CompactionError::ShuttingDown => (), + timeline::CompactionError::Other(e) => { + self.compaction_circuit_breaker + .lock() + .unwrap() + .fail(&CIRCUIT_BREAKERS_BROKEN, e); + } })?; } @@ -4568,7 +4570,7 @@ mod tests { let layer_map = tline.layers.read().await; let level0_deltas = layer_map .layer_map() - .get_level0_deltas()? 
+ .get_level0_deltas() .into_iter() .map(|desc| layer_map.get_from_desc(&desc)) .collect::>(); @@ -5787,7 +5789,7 @@ mod tests { .read() .await .layer_map() - .get_level0_deltas()? + .get_level0_deltas() .len(); tline.compact(&cancel, EnumSet::empty(), &ctx).await?; @@ -5797,7 +5799,7 @@ mod tests { .read() .await .layer_map() - .get_level0_deltas()? + .get_level0_deltas() .len(); assert!(after_num_l0_delta_files < before_num_l0_delta_files, "after_num_l0_delta_files={after_num_l0_delta_files}, before_num_l0_delta_files={before_num_l0_delta_files}"); diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 72167d02ab..6f150a2d5c 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -845,8 +845,8 @@ impl LayerMap { } /// Return all L0 delta layers - pub fn get_level0_deltas(&self) -> Result>> { - Ok(self.l0_delta_layers.to_vec()) + pub fn get_level0_deltas(&self) -> Vec> { + self.l0_delta_layers.to_vec() } /// debugging function to print out the contents of the layer map diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index c75d1eaa5e..8b26f122cf 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -287,6 +287,14 @@ pub enum PersistIndexPartWithDeletedFlagError { Other(#[from] anyhow::Error), } +#[derive(Debug, thiserror::Error)] +pub enum WaitCompletionError { + #[error(transparent)] + NotInitialized(NotInitialized), + #[error("wait_completion aborted because upload queue was stopped")] + UploadQueueShutDownOrStopped, +} + /// A client for accessing a timeline's data in remote storage. /// /// This takes care of managing the number of connections, and balancing them @@ -630,7 +638,7 @@ impl RemoteTimelineClient { /// /// Like schedule_index_upload_for_metadata_update(), this merely adds /// the upload to the upload queue and returns quickly. 
- pub fn schedule_index_upload_for_file_changes(self: &Arc) -> anyhow::Result<()> { + pub fn schedule_index_upload_for_file_changes(self: &Arc) -> Result<(), NotInitialized> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; @@ -645,7 +653,7 @@ impl RemoteTimelineClient { fn schedule_index_upload( self: &Arc, upload_queue: &mut UploadQueueInitialized, - ) -> anyhow::Result<()> { + ) -> Result<(), NotInitialized> { let disk_consistent_lsn = upload_queue.dirty.metadata.disk_consistent_lsn(); // fix up the duplicated field upload_queue.dirty.disk_consistent_lsn = disk_consistent_lsn; @@ -653,7 +661,7 @@ impl RemoteTimelineClient { // make sure it serializes before doing it in perform_upload_task so that it doesn't // look like a retryable error let void = std::io::sink(); - serde_json::to_writer(void, &upload_queue.dirty).context("serialize index_part.json")?; + serde_json::to_writer(void, &upload_queue.dirty).expect("serialize index_part.json"); let index_part = &upload_queue.dirty; @@ -699,7 +707,9 @@ impl RemoteTimelineClient { self.schedule_barrier0(upload_queue) }; - Self::wait_completion0(receiver).await + Self::wait_completion0(receiver) + .await + .context("wait completion") } /// Schedules uploading a new version of `index_part.json` with the given layers added, @@ -732,7 +742,9 @@ impl RemoteTimelineClient { barrier }; - Self::wait_completion0(barrier).await + Self::wait_completion0(barrier) + .await + .context("wait completion") } /// Launch an upload operation in the background; the file is added to be included in next @@ -740,7 +752,7 @@ impl RemoteTimelineClient { pub(crate) fn schedule_layer_file_upload( self: &Arc, layer: ResidentLayer, - ) -> anyhow::Result<()> { + ) -> Result<(), NotInitialized> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; @@ -826,7 +838,7 @@ impl RemoteTimelineClient { self: &Arc, upload_queue: &mut UploadQueueInitialized, names: I, - ) -> anyhow::Result> + ) -> Result, NotInitialized> where I: IntoIterator, { @@ -952,7 +964,7 @@ impl RemoteTimelineClient { self: &Arc, compacted_from: &[Layer], compacted_to: &[ResidentLayer], - ) -> anyhow::Result<()> { + ) -> Result<(), NotInitialized> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; @@ -969,10 +981,12 @@ impl RemoteTimelineClient { } /// Wait for all previously scheduled uploads/deletions to complete - pub(crate) async fn wait_completion(self: &Arc) -> anyhow::Result<()> { + pub(crate) async fn wait_completion(self: &Arc) -> Result<(), WaitCompletionError> { let receiver = { let mut guard = self.upload_queue.lock().unwrap(); - let upload_queue = guard.initialized_mut()?; + let upload_queue = guard + .initialized_mut() + .map_err(WaitCompletionError::NotInitialized)?; self.schedule_barrier0(upload_queue) }; @@ -981,9 +995,9 @@ impl RemoteTimelineClient { async fn wait_completion0( mut receiver: tokio::sync::watch::Receiver<()>, - ) -> anyhow::Result<()> { + ) -> Result<(), WaitCompletionError> { if receiver.changed().await.is_err() { - anyhow::bail!("wait_completion aborted because upload queue was stopped"); + return Err(WaitCompletionError::UploadQueueShutDownOrStopped); } Ok(()) diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 1db3e7c675..619c4d044d 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -17,7 +17,7 @@ use 
crate::context::{DownloadBehavior, RequestContext}; use crate::repository::Key; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::task_mgr::TaskKind; -use crate::tenant::timeline::GetVectoredError; +use crate::tenant::timeline::{CompactionError, GetVectoredError}; use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline}; use super::delta_layer::{self, DeltaEntry}; @@ -426,7 +426,7 @@ impl Layer { } /// Downloads if necessary and creates a guard, which will keep this layer from being evicted. - pub(crate) async fn download_and_keep_resident(&self) -> anyhow::Result { + pub(crate) async fn download_and_keep_resident(&self) -> Result { let downloaded = self.0.get_or_maybe_download(true, None).await?; Ok(ResidentLayer { @@ -1862,12 +1862,24 @@ impl ResidentLayer { shard_identity: &ShardIdentity, writer: &mut ImageLayerWriter, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { use LayerKind::*; - match self.downloaded.get(&self.owner.0, ctx).await? { - Delta(_) => anyhow::bail!(format!("cannot filter() on a delta layer {self}")), - Image(i) => i.filter(shard_identity, writer, ctx).await, + match self + .downloaded + .get(&self.owner.0, ctx) + .await + .map_err(CompactionError::Other)? + { + Delta(_) => { + return Err(CompactionError::Other(anyhow::anyhow!(format!( + "cannot filter() on a delta layer {self}" + )))); + } + Image(i) => i + .filter(shard_identity, writer, ctx) + .await + .map_err(CompactionError::Other), } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 178b707aa7..8829040c70 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4786,7 +4786,7 @@ pub(crate) enum CompactionError { ShuttingDown, /// Compaction cannot be done right now; page reconstruction and so on. #[error(transparent)] - Other(#[from] anyhow::Error), + Other(anyhow::Error), } impl From for CompactionError { @@ -4801,6 +4801,38 @@ impl From for CompactionError { } } +impl From for CompactionError { + fn from(value: super::upload_queue::NotInitialized) -> Self { + match value { + super::upload_queue::NotInitialized::Uninitialized + | super::upload_queue::NotInitialized::Stopped => { + CompactionError::Other(anyhow::anyhow!(value)) + } + super::upload_queue::NotInitialized::ShuttingDown => CompactionError::ShuttingDown, + } + } +} + +impl CompactionError { + /// We cannot do compaction because we could not download a layer that is input to the compaction. + pub(crate) fn input_layer_download_failed( + e: super::storage_layer::layer::DownloadError, + ) -> Self { + match e { + super::storage_layer::layer::DownloadError::TimelineShutdown | + /* TODO DownloadCancelled correct here? 
*/ + super::storage_layer::layer::DownloadError::DownloadCancelled => CompactionError::ShuttingDown, + super::storage_layer::layer::DownloadError::ContextAndConfigReallyDeniesDownloads | + super::storage_layer::layer::DownloadError::DownloadRequired | + super::storage_layer::layer::DownloadError::NotFile(_) | + super::storage_layer::layer::DownloadError::DownloadFailed | + super::storage_layer::layer::DownloadError::PreStatFailed(_)=>CompactionError::Other(anyhow::anyhow!(e)), + #[cfg(test)] + super::storage_layer::layer::DownloadError::Failpoint(_) => CompactionError::Other(anyhow::anyhow!(e)), + } + } +} + #[serde_as] #[derive(serde::Serialize)] struct RecordedDuration(#[serde_as(as = "serde_with::DurationMicroSeconds")] Duration); @@ -4874,7 +4906,7 @@ impl Timeline { new_deltas: &[ResidentLayer], new_images: &[ResidentLayer], layers_to_remove: &[Layer], - ) -> anyhow::Result<()> { + ) -> Result<(), CompactionError> { let mut guard = self.layers.write().await; let mut duplicated_layers = HashSet::new(); @@ -4892,7 +4924,7 @@ impl Timeline { // because we have not implemented L0 => L0 compaction. duplicated_layers.insert(l.layer_desc().key()); } else if LayerMap::is_l0(&l.layer_desc().key_range) { - bail!("compaction generates a L0 layer file as output, which will cause infinite compaction."); + return Err(CompactionError::Other(anyhow::anyhow!("compaction generates a L0 layer file as output, which will cause infinite compaction."))); } else { insert_layers.push(l.clone()); } @@ -4924,7 +4956,7 @@ impl Timeline { self: &Arc, mut replace_layers: Vec<(Layer, ResidentLayer)>, mut drop_layers: Vec, - ) -> anyhow::Result<()> { + ) -> Result<(), super::upload_queue::NotInitialized> { let mut guard = self.layers.write().await; // Trim our lists in case our caller (compaction) raced with someone else (GC) removing layers: we want @@ -4946,7 +4978,7 @@ impl Timeline { fn upload_new_image_layers( self: &Arc, new_images: impl IntoIterator, - ) -> anyhow::Result<()> { + ) -> Result<(), super::upload_queue::NotInitialized> { for layer in new_images { self.remote_client.schedule_layer_file_upload(layer)?; } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index d0a74e3924..487ff6cd80 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -27,6 +27,7 @@ use utils::id::TimelineId; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD}; +use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::storage_layer::merge_iterator::MergeIterator; use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc, ValueReconstructState}; use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter}; @@ -108,7 +109,10 @@ impl Timeline { ctx: &RequestContext, ) -> Result<(), CompactionError> { if flags.contains(CompactFlags::EnhancedGcBottomMostCompaction) { - return self.compact_with_gc(cancel, ctx).await; + return self + .compact_with_gc(cancel, ctx) + .await + .map_err(CompactionError::Other); } // High level strategy for compaction / image creation: @@ -236,7 +240,7 @@ impl Timeline { self: &Arc, rewrite_max: usize, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), CompactionError> { let mut drop_layers = Vec::new(); let mut layers_to_rewrite: Vec = Vec::new(); @@ -357,7 +361,8 @@ impl Timeline { 
layer.layer_desc().image_layer_lsn(), ctx, ) - .await?; + .await + .map_err(CompactionError::Other)?; // Safety of layer rewrites: // - We are writing to a different local file path than we are reading from, so the old Layer @@ -372,14 +377,20 @@ impl Timeline { // - We do not run concurrently with other kinds of compaction, so the only layer map writes we race with are: // - GC, which at worst witnesses us "undelete" a layer that they just deleted. // - ingestion, which only inserts layers, therefore cannot collide with us. - let resident = layer.download_and_keep_resident().await?; + let resident = layer + .download_and_keep_resident() + .await + .map_err(CompactionError::input_layer_download_failed)?; let keys_written = resident .filter(&self.shard_identity, &mut image_layer_writer, ctx) .await?; if keys_written > 0 { - let new_layer = image_layer_writer.finish(self, ctx).await?; + let new_layer = image_layer_writer + .finish(self, ctx) + .await + .map_err(CompactionError::Other)?; tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes", layer.metadata().file_size, new_layer.metadata().file_size); @@ -407,7 +418,13 @@ impl Timeline { // necessary for correctness, but it simplifies testing, and avoids proceeding with another // Timeline's compaction while this timeline's uploads may be generating lots of disk I/O // load. - self.remote_client.wait_completion().await?; + match self.remote_client.wait_completion().await { + Ok(()) => (), + Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)), + Err(WaitCompletionError::UploadQueueShutDownOrStopped) => { + return Err(CompactionError::ShuttingDown) + } + } fail::fail_point!("compact-shard-ancestors-persistent"); @@ -465,7 +482,7 @@ impl Timeline { stats.read_lock_held_spawn_blocking_startup_micros = stats.read_lock_acquisition_micros.till_now(); // set by caller let layers = guard.layer_map(); - let level0_deltas = layers.get_level0_deltas()?; + let level0_deltas = layers.get_level0_deltas(); let mut level0_deltas = level0_deltas .into_iter() .map(|x| guard.get_from_desc(&x)) @@ -518,14 +535,23 @@ impl Timeline { ) as u64 * std::cmp::max(self.get_checkpoint_distance(), DEFAULT_CHECKPOINT_DISTANCE); - deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?); + deltas_to_compact.push( + first_level0_delta + .download_and_keep_resident() + .await + .map_err(CompactionError::input_layer_download_failed)?, + ); for l in level0_deltas_iter { let lsn_range = &l.layer_desc().lsn_range; if lsn_range.start != prev_lsn_end { break; } - deltas_to_compact.push(l.download_and_keep_resident().await?); + deltas_to_compact.push( + l.download_and_keep_resident() + .await + .map_err(CompactionError::input_layer_download_failed)?, + ); deltas_to_compact_bytes += l.metadata().file_size; prev_lsn_end = lsn_range.end; @@ -584,7 +610,7 @@ impl Timeline { let mut all_keys = Vec::new(); for l in deltas_to_compact.iter() { - all_keys.extend(l.load_keys(ctx).await?); + all_keys.extend(l.load_keys(ctx).await.map_err(CompactionError::Other)?); } // FIXME: should spawn_blocking the rest of this function @@ -706,7 +732,7 @@ impl Timeline { key, lsn, ref val, .. 
} in all_values_iter { - let value = val.load(ctx).await?; + let value = val.load(ctx).await.map_err(CompactionError::Other)?; let same_key = prev_key.map_or(false, |prev_key| prev_key == key); // We need to check key boundaries once we reach next key or end of layer with the same key if !same_key || lsn == dup_end_lsn { @@ -763,7 +789,8 @@ impl Timeline { .take() .unwrap() .finish(prev_key.unwrap().next(), self, ctx) - .await?, + .await + .map_err(CompactionError::Other)?, ); writer = None; @@ -801,7 +828,8 @@ impl Timeline { }, ctx, ) - .await?, + .await + .map_err(CompactionError::Other)?, ); } @@ -809,7 +837,8 @@ impl Timeline { .as_mut() .unwrap() .put_value(key, lsn, value, ctx) - .await?; + .await + .map_err(CompactionError::Other)?; } else { debug!( "Dropping key {} during compaction (it belongs on shard {:?})", @@ -825,7 +854,12 @@ impl Timeline { prev_key = Some(key); } if let Some(writer) = writer { - new_layers.push(writer.finish(prev_key.unwrap().next(), self, ctx).await?); + new_layers.push( + writer + .finish(prev_key.unwrap().next(), self, ctx) + .await + .map_err(CompactionError::Other)?, + ); } // Sync layers @@ -1007,7 +1041,7 @@ impl Timeline { let guard = self.layers.read().await; let layers = guard.layer_map(); - let l0_deltas = layers.get_level0_deltas()?; + let l0_deltas = layers.get_level0_deltas(); drop(guard); // As an optimization, if we find that there are too few L0 layers, @@ -1037,7 +1071,9 @@ impl Timeline { fanout, ctx, ) - .await?; + .await + // TODO: compact_tiered needs to return CompactionError + .map_err(CompactionError::Other)?; adaptor.flush_updates().await?; Ok(()) @@ -1235,7 +1271,7 @@ impl Timeline { self: &Arc, _cancel: &CancellationToken, ctx: &RequestContext, - ) -> Result<(), CompactionError> { + ) -> anyhow::Result<()> { use std::collections::BTreeSet; info!("running enhanced gc bottom-most compaction"); @@ -1504,7 +1540,7 @@ impl TimelineAdaptor { } } - pub async fn flush_updates(&mut self) -> anyhow::Result<()> { + pub async fn flush_updates(&mut self) -> Result<(), CompactionError> { let layers_to_delete = { let guard = self.timeline.layers.read().await; self.layers_to_delete diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 49ce3db3e6..ee5f8cd52a 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -26,7 +26,7 @@ pub(crate) enum Error { #[error("flushing failed")] FlushAncestor(#[source] FlushLayerError), #[error("layer download failed")] - RewrittenDeltaDownloadFailed(#[source] anyhow::Error), + RewrittenDeltaDownloadFailed(#[source] crate::tenant::storage_layer::layer::DownloadError), #[error("copying LSN prefix locally failed")] CopyDeltaPrefix(#[source] anyhow::Error), #[error("upload rewritten layer")] diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index f7440ecdae..592f41cb21 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -130,7 +130,7 @@ pub(super) enum UploadQueueStopped { } #[derive(thiserror::Error, Debug)] -pub(crate) enum NotInitialized { +pub enum NotInitialized { #[error("queue is in state Uninitialized")] Uninitialized, #[error("queue is in state Stopped")] From f76a4e0ad2a3a87a4992ca5404eaca0527fdefc8 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Thu, 25 Jul 2024 10:51:20 +0200 Subject: [PATCH 1269/1571] Temporarily remove week-end test for res-aurora from pgbench-compare benchmarking 
runs (#8493) ## Problem The rds-aurora endpoint connection cannot be reached from GitHub action runners. Temporarily remove this DBMS from the pgbench comparison runs. ## Summary of changes On Saturday we normally run Neon in comparison with AWS RDS-Postgres and AWS RDS-Aurora. Remove Aurora until we have a working setup --- .github/workflows/benchmarking.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index c132b5b513..5ffdb29fe6 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -261,8 +261,7 @@ jobs: }' if [ "$(date +%A)" = "Saturday" ]; then - matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb"}, - { "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-aurora", "db_size": "50gb"}]') + matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb"}]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT From 9c5ad2134123f4757febe8c4b46837254e4062fb Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 25 Jul 2024 14:09:12 +0100 Subject: [PATCH 1270/1571] storcon: make heartbeats restart aware (#8222) ## Problem Re-attach blocks the pageserver http server from starting up. Hence, it can't reply to heartbeats until that's done. This makes the storage controller mark the node off-line (not good). We worked around this by setting the interval after which nodes are marked offline to 5 minutes. This isn't a long term solution. ## Summary of changes * Introduce a new `NodeAvailability` state: `WarmingUp`. 
This state models the following time interval: * From receiving the re-attach request until the pageserver replies to the first heartbeat post re-attach * The heartbeat delta generator becomes aware of this state and uses a separate longer interval * Flag `max-warming-up-interval` now models the longer timeout and `max-offline-interval` the shorter one to match the names of the states Closes https://github.com/neondatabase/neon/issues/7552 --- control_plane/src/bin/neon_local.rs | 65 +++++++- control_plane/src/local_env.rs | 13 +- control_plane/src/storage_controller.rs | 20 ++- libs/pageserver_api/src/controller_api.rs | 16 +- pageserver/src/control_plane_client.rs | 4 +- storage_controller/src/heartbeater.rs | 89 ++++++---- storage_controller/src/main.rs | 20 ++- storage_controller/src/node.rs | 40 +++-- storage_controller/src/service.rs | 157 ++++++++++-------- test_runner/fixtures/neon_fixtures.py | 65 +++++++- .../test_storage_controller_scale.py | 32 +++- .../regress/test_pageserver_generations.py | 23 ++- .../regress/test_pageserver_restart.py | 6 +- .../regress/test_pageserver_secondary.py | 7 +- test_runner/regress/test_sharding.py | 3 +- .../regress/test_storage_controller.py | 112 +++++++++++-- .../regress/test_threshold_based_eviction.py | 15 +- 17 files changed, 508 insertions(+), 179 deletions(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 4bf1b29785..51e9a51a57 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -21,7 +21,9 @@ use pageserver_api::config::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT, DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT, }; -use pageserver_api::controller_api::{PlacementPolicy, TenantCreateRequest}; +use pageserver_api::controller_api::{ + NodeAvailabilityWrapper, PlacementPolicy, TenantCreateRequest, +}; use pageserver_api::models::{ShardParameters, TimelineCreateRequest, TimelineInfo}; use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId}; use postgres_backend::AuthType; @@ -1250,9 +1252,70 @@ async fn handle_start_all( exit(1); } } + + neon_start_status_check(env, retry_timeout).await?; + Ok(()) } +async fn neon_start_status_check( + env: &local_env::LocalEnv, + retry_timeout: &Duration, +) -> anyhow::Result<()> { + const RETRY_INTERVAL: Duration = Duration::from_millis(100); + const NOTICE_AFTER_RETRIES: Duration = Duration::from_secs(5); + + if env.control_plane_api.is_none() { + return Ok(()); + } + + let storcon = StorageController::from_env(env); + + let retries = retry_timeout.as_millis() / RETRY_INTERVAL.as_millis(); + let notice_after_retries = retry_timeout.as_millis() / NOTICE_AFTER_RETRIES.as_millis(); + + println!("\nRunning neon status check"); + + for retry in 0..retries { + if retry == notice_after_retries { + println!("\nNeon status check has not passed yet, continuing to wait") + } + + let mut passed = true; + let mut nodes = storcon.node_list().await?; + let mut pageservers = env.pageservers.clone(); + + if nodes.len() != pageservers.len() { + continue; + } + + nodes.sort_by_key(|ps| ps.id); + pageservers.sort_by_key(|ps| ps.id); + + for (idx, pageserver) in pageservers.iter().enumerate() { + let node = &nodes[idx]; + if node.id != pageserver.id { + passed = false; + break; + } + + if !matches!(node.availability, NodeAvailabilityWrapper::Active) { + passed = false; + break; + } + } + + if passed { + println!("\nNeon started and passed status check"); + return Ok(()); + } + + 
tokio::time::sleep(RETRY_INTERVAL).await; + } + + anyhow::bail!("\nNeon passed status check") +} + async fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { let immediate = sub_match.get_one::("stop-mode").map(|s| s.as_str()) == Some("immediate"); diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 3ac3ce21df..d7830a5e70 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -151,7 +151,10 @@ pub struct NeonBroker { pub struct NeonStorageControllerConf { /// Heartbeat timeout before marking a node offline #[serde(with = "humantime_serde")] - pub max_unavailable: Duration, + pub max_offline: Duration, + + #[serde(with = "humantime_serde")] + pub max_warming_up: Duration, /// Threshold for auto-splitting a tenant into shards pub split_threshold: Option, @@ -159,14 +162,16 @@ pub struct NeonStorageControllerConf { impl NeonStorageControllerConf { // Use a shorter pageserver unavailability interval than the default to speed up tests. - const DEFAULT_MAX_UNAVAILABLE_INTERVAL: std::time::Duration = - std::time::Duration::from_secs(10); + const DEFAULT_MAX_OFFLINE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10); + + const DEFAULT_MAX_WARMING_UP_INTERVAL: std::time::Duration = std::time::Duration::from_secs(30); } impl Default for NeonStorageControllerConf { fn default() -> Self { Self { - max_unavailable: Self::DEFAULT_MAX_UNAVAILABLE_INTERVAL, + max_offline: Self::DEFAULT_MAX_OFFLINE_INTERVAL, + max_warming_up: Self::DEFAULT_MAX_WARMING_UP_INTERVAL, split_threshold: None, } } diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index d7aedd711a..e054e9ee57 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -5,8 +5,9 @@ use crate::{ use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::{ controller_api::{ - NodeConfigureRequest, NodeRegisterRequest, TenantCreateRequest, TenantCreateResponse, - TenantLocateResponse, TenantShardMigrateRequest, TenantShardMigrateResponse, + NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest, + TenantCreateResponse, TenantLocateResponse, TenantShardMigrateRequest, + TenantShardMigrateResponse, }, models::{ TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo, @@ -353,8 +354,10 @@ impl StorageController { "--dev", "--database-url", &database_url, - "--max-unavailable-interval", - &humantime::Duration::from(self.config.max_unavailable).to_string(), + "--max-offline-interval", + &humantime::Duration::from(self.config.max_offline).to_string(), + "--max-warming-up-interval", + &humantime::Duration::from(self.config.max_warming_up).to_string(), ] .into_iter() .map(|s| s.to_string()) @@ -625,6 +628,15 @@ impl StorageController { .await } + pub async fn node_list(&self) -> anyhow::Result> { + self.dispatch::<(), Vec>( + Method::GET, + "control/v1/node".to_string(), + None, + ) + .await + } + #[instrument(skip(self))] pub async fn ready(&self) -> anyhow::Result<()> { self.dispatch::<(), ()>(Method::GET, "ready".to_string(), None) diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index d0e1eb6b28..474f796040 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -1,4 +1,5 @@ use std::str::FromStr; +use std::time::Instant; /// Request/response types for the storage controller /// API (`/control/v1` prefix). 
Implemented by the server @@ -150,11 +151,16 @@ impl UtilizationScore { } } -#[derive(Serialize, Deserialize, Clone, Copy, Debug)] +#[derive(Serialize, Clone, Copy, Debug)] #[serde(into = "NodeAvailabilityWrapper")] pub enum NodeAvailability { // Normal, happy state Active(UtilizationScore), + // Node is warming up, but we expect it to become available soon. Covers + // the time span between the re-attach response being composed on the storage controller + // and the first successful heartbeat after the processing of the re-attach response + // finishes on the pageserver. + WarmingUp(Instant), // Offline: Tenants shouldn't try to attach here, but they may assume that their // secondary locations on this node still exist. Newly added nodes are in this // state until we successfully contact them. @@ -164,7 +170,10 @@ pub enum NodeAvailability { impl PartialEq for NodeAvailability { fn eq(&self, other: &Self) -> bool { use NodeAvailability::*; - matches!((self, other), (Active(_), Active(_)) | (Offline, Offline)) + matches!( + (self, other), + (Active(_), Active(_)) | (Offline, Offline) | (WarmingUp(_), WarmingUp(_)) + ) } } @@ -176,6 +185,7 @@ impl Eq for NodeAvailability {} #[derive(Serialize, Deserialize, Clone, Copy, Debug)] pub enum NodeAvailabilityWrapper { Active, + WarmingUp, Offline, } @@ -185,6 +195,7 @@ impl From for NodeAvailability { // Assume the worst utilisation score to begin with. It will later be updated by // the heartbeats. NodeAvailabilityWrapper::Active => NodeAvailability::Active(UtilizationScore::worst()), + NodeAvailabilityWrapper::WarmingUp => NodeAvailability::WarmingUp(Instant::now()), NodeAvailabilityWrapper::Offline => NodeAvailability::Offline, } } @@ -194,6 +205,7 @@ impl From for NodeAvailabilityWrapper { fn from(val: NodeAvailability) -> Self { match val { NodeAvailability::Active(_) => NodeAvailabilityWrapper::Active, + NodeAvailability::WarmingUp(_) => NodeAvailabilityWrapper::WarmingUp, NodeAvailability::Offline => NodeAvailabilityWrapper::Offline, } } diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs index 26e7cc7ef8..b5d9267d79 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/control_plane_client.rs @@ -171,14 +171,14 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient { register, }; - fail::fail_point!("control-plane-client-re-attach"); - let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?; tracing::info!( "Received re-attach response with {} tenants", response.tenants.len() ); + failpoint_support::sleep_millis_async!("control-plane-client-re-attach"); + Ok(response .tenants .into_iter() diff --git a/storage_controller/src/heartbeater.rs b/storage_controller/src/heartbeater.rs index 14cda0a289..1bb9c17f30 100644 --- a/storage_controller/src/heartbeater.rs +++ b/storage_controller/src/heartbeater.rs @@ -22,7 +22,8 @@ struct HeartbeaterTask { state: HashMap, - max_unavailable_interval: Duration, + max_offline_interval: Duration, + max_warming_up_interval: Duration, jwt_token: Option, } @@ -31,7 +32,9 @@ pub(crate) enum PageserverState { Available { last_seen_at: Instant, utilization: PageserverUtilization, - new: bool, + }, + WarmingUp { + started_at: Instant, }, Offline, } @@ -57,12 +60,18 @@ pub(crate) struct Heartbeater { impl Heartbeater { pub(crate) fn new( jwt_token: Option, - max_unavailable_interval: Duration, + max_offline_interval: Duration, + max_warming_up_interval: Duration, cancel: CancellationToken, ) -> Self { let (sender, 
receiver) = tokio::sync::mpsc::unbounded_channel::(); - let mut heartbeater = - HeartbeaterTask::new(receiver, jwt_token, max_unavailable_interval, cancel); + let mut heartbeater = HeartbeaterTask::new( + receiver, + jwt_token, + max_offline_interval, + max_warming_up_interval, + cancel, + ); tokio::task::spawn(async move { heartbeater.run().await }); Self { sender } @@ -88,14 +97,16 @@ impl HeartbeaterTask { fn new( receiver: tokio::sync::mpsc::UnboundedReceiver, jwt_token: Option, - max_unavailable_interval: Duration, + max_offline_interval: Duration, + max_warming_up_interval: Duration, cancel: CancellationToken, ) -> Self { Self { receiver, cancel, state: HashMap::new(), - max_unavailable_interval, + max_offline_interval, + max_warming_up_interval, jwt_token, } } @@ -128,16 +139,15 @@ impl HeartbeaterTask { heartbeat_futs.push({ let jwt_token = self.jwt_token.clone(); let cancel = self.cancel.clone(); - let new_node = !self.state.contains_key(node_id); // Clone the node and mark it as available such that the request // goes through to the pageserver even when the node is marked offline. // This doesn't impact the availability observed by [`crate::service::Service`]. - let mut node = node.clone(); - node.set_availability(NodeAvailability::Active(UtilizationScore::worst())); + let mut node_clone = node.clone(); + node_clone.set_availability(NodeAvailability::Active(UtilizationScore::worst())); async move { - let response = node + let response = node_clone .with_client_retries( |client| async move { client.get_utilization().await }, &jwt_token, @@ -161,7 +171,12 @@ impl HeartbeaterTask { PageserverState::Available { last_seen_at: Instant::now(), utilization, - new: new_node, + } + } else if let NodeAvailability::WarmingUp(last_seen_at) = + node.get_availability() + { + PageserverState::WarmingUp { + started_at: last_seen_at, } } else { PageserverState::Offline @@ -187,53 +202,67 @@ impl HeartbeaterTask { } } } + + let mut warming_up = 0; + let mut offline = 0; + for state in new_state.values() { + match state { + PageserverState::WarmingUp { .. } => { + warming_up += 1; + } + PageserverState::Offline { .. } => offline += 1, + PageserverState::Available { .. } => {} + } + } + tracing::info!( - "Heartbeat round complete for {} nodes, {} offline", + "Heartbeat round complete for {} nodes, {} warming-up, {} offline", new_state.len(), - new_state - .values() - .filter(|s| match s { - PageserverState::Available { .. } => { - false - } - PageserverState::Offline => true, - }) - .count() + warming_up, + offline ); let mut deltas = Vec::new(); let now = Instant::now(); - for (node_id, ps_state) in new_state { + for (node_id, ps_state) in new_state.iter_mut() { use std::collections::hash_map::Entry::*; - let entry = self.state.entry(node_id); + let entry = self.state.entry(*node_id); let mut needs_update = false; match entry { Occupied(ref occ) => match (occ.get(), &ps_state) { (PageserverState::Offline, PageserverState::Offline) => {} (PageserverState::Available { last_seen_at, .. 
}, PageserverState::Offline) => { - if now - *last_seen_at >= self.max_unavailable_interval { - deltas.push((node_id, ps_state.clone())); + if now - *last_seen_at >= self.max_offline_interval { + deltas.push((*node_id, ps_state.clone())); needs_update = true; } } + (_, PageserverState::WarmingUp { started_at }) => { + if now - *started_at >= self.max_warming_up_interval { + *ps_state = PageserverState::Offline; + } + + deltas.push((*node_id, ps_state.clone())); + needs_update = true; + } _ => { - deltas.push((node_id, ps_state.clone())); + deltas.push((*node_id, ps_state.clone())); needs_update = true; } }, Vacant(_) => { // This is a new node. Don't generate a delta for it. - deltas.push((node_id, ps_state.clone())); + deltas.push((*node_id, ps_state.clone())); } } match entry { Occupied(mut occ) if needs_update => { - (*occ.get_mut()) = ps_state; + (*occ.get_mut()) = ps_state.clone(); } Vacant(vac) => { - vac.insert(ps_state); + vac.insert(ps_state.clone()); } _ => {} } diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 789f96beb3..adbf5c6496 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -10,7 +10,8 @@ use storage_controller::http::make_router; use storage_controller::metrics::preinitialize_metrics; use storage_controller::persistence::Persistence; use storage_controller::service::{ - Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, + Config, Service, MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT, + RECONCILER_CONCURRENCY_DEFAULT, }; use tokio::signal::unix::SignalKind; use tokio_util::sync::CancellationToken; @@ -61,7 +62,12 @@ struct Cli { /// Grace period before marking unresponsive pageserver offline #[arg(long)] - max_unavailable_interval: Option, + max_offline_interval: Option, + + /// More tolerant grace period before marking unresponsive pagserver offline used + /// around pageserver restarts + #[arg(long)] + max_warming_up_interval: Option, /// Size threshold for automatically splitting shards (disabled by default) #[arg(long)] @@ -254,10 +260,14 @@ async fn async_main() -> anyhow::Result<()> { jwt_token: secrets.jwt_token, control_plane_jwt_token: secrets.control_plane_jwt_token, compute_hook_url: args.compute_hook_url, - max_unavailable_interval: args - .max_unavailable_interval + max_offline_interval: args + .max_offline_interval .map(humantime::Duration::into) - .unwrap_or(MAX_UNAVAILABLE_INTERVAL_DEFAULT), + .unwrap_or(MAX_OFFLINE_INTERVAL_DEFAULT), + max_warming_up_interval: args + .max_warming_up_interval + .map(humantime::Duration::into) + .unwrap_or(MAX_WARMING_UP_INTERVAL_DEFAULT), reconciler_concurrency: args .reconciler_concurrency .unwrap_or(RECONCILER_CONCURRENCY_DEFAULT), diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index fff44aaf26..ea765ca123 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -3,7 +3,7 @@ use std::{str::FromStr, time::Duration}; use pageserver_api::{ controller_api::{ NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy, - TenantLocateResponseShard, UtilizationScore, + TenantLocateResponseShard, }, shard::TenantShardId, }; @@ -46,6 +46,8 @@ pub(crate) struct Node { /// whether/how they changed it. 
pub(crate) enum AvailabilityTransition { ToActive, + ToWarmingUpFromActive, + ToWarmingUpFromOffline, ToOffline, Unchanged, } @@ -90,22 +92,34 @@ impl Node { } } + pub(crate) fn get_availability(&self) -> NodeAvailability { + self.availability + } + pub(crate) fn set_availability(&mut self, availability: NodeAvailability) { + use AvailabilityTransition::*; + use NodeAvailability::WarmingUp; + match self.get_availability_transition(availability) { - AvailabilityTransition::ToActive => { + ToActive => { // Give the node a new cancellation token, effectively resetting it to un-cancelled. Any // users of previously-cloned copies of the node will still see the old cancellation // state. For example, Reconcilers in flight will have to complete and be spawned // again to realize that the node has become available. self.cancel = CancellationToken::new(); } - AvailabilityTransition::ToOffline => { + ToOffline | ToWarmingUpFromActive => { // Fire the node's cancellation token to cancel any in-flight API requests to it self.cancel.cancel(); } - AvailabilityTransition::Unchanged => {} + Unchanged | ToWarmingUpFromOffline => {} + } + + if let (WarmingUp(crnt), WarmingUp(proposed)) = (self.availability, availability) { + self.availability = WarmingUp(std::cmp::max(crnt, proposed)); + } else { + self.availability = availability; } - self.availability = availability; } /// Without modifying the availability of the node, convert the intended availability @@ -120,16 +134,10 @@ impl Node { match (self.availability, availability) { (Offline, Active(_)) => ToActive, (Active(_), Offline) => ToOffline, - // Consider the case when the storage controller handles the re-attach of a node - // before the heartbeats detect that the node is back online. We still need - // [`Service::node_configure`] to attempt reconciliations for shards with an - // unknown observed location. - // The unsavoury match arm below handles this situation. - (Active(lhs), Active(rhs)) - if lhs == UtilizationScore::worst() && rhs < UtilizationScore::worst() => - { - ToActive - } + (Active(_), WarmingUp(_)) => ToWarmingUpFromActive, + (WarmingUp(_), Offline) => ToOffline, + (WarmingUp(_), Active(_)) => ToActive, + (Offline, WarmingUp(_)) => ToWarmingUpFromOffline, _ => Unchanged, } } @@ -147,7 +155,7 @@ impl Node { pub(crate) fn may_schedule(&self) -> MaySchedule { let score = match self.availability { NodeAvailability::Active(score) => score, - NodeAvailability::Offline => return MaySchedule::No, + NodeAvailability::Offline | NodeAvailability::WarmingUp(_) => return MaySchedule::No, }; match self.scheduling { diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 2a6d5d3578..860fe4802a 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -100,9 +100,13 @@ pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); /// How long a node may be unresponsive to heartbeats before we declare it offline. /// This must be long enough to cover node restarts as well as normal operations: in future -/// it should be separated into distinct timeouts for startup vs. normal operation -/// (``) -pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(300); +pub const MAX_OFFLINE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30); + +/// How long a node may be unresponsive to heartbeats during start up before we declare it +/// offline. 
This is much more lenient than [`MAX_OFFLINE_INTERVAL_DEFAULT`] since the pageserver's +/// handling of the re-attach response may take a long time and blocks heartbeats from +/// being handled on the pageserver side. +pub const MAX_WARMING_UP_INTERVAL_DEFAULT: Duration = Duration::from_secs(300); #[derive(Clone, strum_macros::Display)] enum TenantOperations { @@ -236,7 +240,12 @@ pub struct Config { /// Grace period within which a pageserver does not respond to heartbeats, but is still /// considered active. Once the grace period elapses, the next heartbeat failure will /// mark the pagseserver offline. - pub max_unavailable_interval: Duration, + pub max_offline_interval: Duration, + + /// Extended grace period within which pageserver may not respond to heartbeats. + /// This extended grace period kicks in after the node has been drained for restart + /// and/or upon handling the re-attach request from a node. + pub max_warming_up_interval: Duration, /// How many Reconcilers may be spawned concurrently pub reconciler_concurrency: usize, @@ -587,6 +596,9 @@ impl Service { online_nodes.insert(node_id, utilization); } PageserverState::Offline => {} + PageserverState::WarmingUp { .. } => { + unreachable!("Nodes are never marked warming-up during startup reconcile") + } } } } @@ -779,63 +791,54 @@ impl Service { let res = self.heartbeater.heartbeat(nodes).await; if let Ok(deltas) = res { for (node_id, state) in deltas.0 { - let (new_node, new_availability) = match state { - PageserverState::Available { - utilization, new, .. - } => ( - new, - NodeAvailability::Active(UtilizationScore( - utilization.utilization_score, - )), + let new_availability = match state { + PageserverState::Available { utilization, .. } => NodeAvailability::Active( + UtilizationScore(utilization.utilization_score), ), - PageserverState::Offline => (false, NodeAvailability::Offline), + PageserverState::WarmingUp { started_at } => { + NodeAvailability::WarmingUp(started_at) + } + PageserverState::Offline => { + // The node might have been placed in the WarmingUp state + // while the heartbeat round was on-going. Hence, filter out + // offline transitions for WarmingUp nodes that are still within + // their grace period. + if let Ok(NodeAvailability::WarmingUp(started_at)) = + self.get_node(node_id).await.map(|n| n.get_availability()) + { + let now = Instant::now(); + if now - started_at >= self.config.max_warming_up_interval { + NodeAvailability::Offline + } else { + NodeAvailability::WarmingUp(started_at) + } + } else { + NodeAvailability::Offline + } + } }; - if new_node { - // When the heartbeats detect a newly added node, we don't wish - // to attempt to reconcile the shards assigned to it. The node - // is likely handling it's re-attach response, so reconciling now - // would be counterproductive. - // - // Instead, update the in-memory state with the details learned about the - // node. - let mut locked = self.inner.write().unwrap(); - let (nodes, _tenants, scheduler) = locked.parts_mut(); + // This is the code path for geniune availability transitions (i.e node + // goes unavailable and/or comes back online). 
+ let res = self + .node_configure(node_id, Some(new_availability), None) + .await; - let mut new_nodes = (**nodes).clone(); - - if let Some(node) = new_nodes.get_mut(&node_id) { - node.set_availability(new_availability); - scheduler.node_upsert(node); + match res { + Ok(()) => {} + Err(ApiError::NotFound(_)) => { + // This should be rare, but legitimate since the heartbeats are done + // on a snapshot of the nodes. + tracing::info!("Node {} was not found after heartbeat round", node_id); } - - locked.nodes = Arc::new(new_nodes); - } else { - // This is the code path for geniune availability transitions (i.e node - // goes unavailable and/or comes back online). - let res = self - .node_configure(node_id, Some(new_availability), None) - .await; - - match res { - Ok(()) => {} - Err(ApiError::NotFound(_)) => { - // This should be rare, but legitimate since the heartbeats are done - // on a snapshot of the nodes. - tracing::info!( - "Node {} was not found after heartbeat round", - node_id - ); - } - Err(err) => { - // Transition to active involves reconciling: if a node responds to a heartbeat then - // becomes unavailable again, we may get an error here. - tracing::error!( - "Failed to update node {} after heartbeat round: {}", - node_id, - err - ); - } + Err(err) => { + // Transition to active involves reconciling: if a node responds to a heartbeat then + // becomes unavailable again, we may get an error here. + tracing::error!( + "Failed to update node {} after heartbeat round: {}", + node_id, + err + ); } } } @@ -1152,7 +1155,8 @@ impl Service { let cancel = CancellationToken::new(); let heartbeater = Heartbeater::new( config.jwt_token.clone(), - config.max_unavailable_interval, + config.max_offline_interval, + config.max_warming_up_interval, cancel.clone(), ); let this = Arc::new(Self { @@ -1664,21 +1668,23 @@ impl Service { | NodeSchedulingPolicy::Filling ); - if !node.is_available() || reset_scheduling { - let mut new_nodes = (**nodes).clone(); - if let Some(node) = new_nodes.get_mut(&reattach_req.node_id) { - if !node.is_available() { - node.set_availability(NodeAvailability::Active(UtilizationScore::worst())); - } - - if reset_scheduling { - node.set_scheduling(NodeSchedulingPolicy::Active); - } - - scheduler.node_upsert(node); - let new_nodes = Arc::new(new_nodes); - *nodes = new_nodes; + let mut new_nodes = (**nodes).clone(); + if let Some(node) = new_nodes.get_mut(&reattach_req.node_id) { + if reset_scheduling { + node.set_scheduling(NodeSchedulingPolicy::Active); } + + tracing::info!("Marking {} warming-up on reattach", reattach_req.node_id); + node.set_availability(NodeAvailability::WarmingUp(std::time::Instant::now())); + + scheduler.node_upsert(node); + let new_nodes = Arc::new(new_nodes); + *nodes = new_nodes; + } else { + tracing::error!( + "Reattaching node {} was removed while processing the request", + reattach_req.node_id + ); } } @@ -4719,6 +4725,15 @@ impl Service { // TODO: in the background, we should balance work back onto this pageserver } + // No action required for the intermediate unavailable state. + // When we transition into active or offline from the unavailable state, + // the correct handling above will kick in. 
+ AvailabilityTransition::ToWarmingUpFromActive => { + tracing::info!("Node {} transition to unavailable from active", node_id); + } + AvailabilityTransition::ToWarmingUpFromOffline => { + tracing::info!("Node {} transition to unavailable from offline", node_id); + } AvailabilityTransition::Unchanged => { tracing::debug!("Node {} no availability change during config", node_id); } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 9e39457c06..76ab46b01a 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2148,6 +2148,23 @@ class StorageControllerApiException(Exception): self.status_code = status_code +# See libs/pageserver_api/src/controller_api.rs +# for the rust definitions of the enums below +# TODO: Replace with `StrEnum` when we upgrade to python 3.11 +class PageserverAvailability(str, Enum): + ACTIVE = "Active" + UNAVAILABLE = "Unavailable" + OFFLINE = "Offline" + + +class PageserverSchedulingPolicy(str, Enum): + ACTIVE = "Active" + DRAINING = "Draining" + FILLING = "Filling" + PAUSE = "Pause" + PAUSE_FOR_RESTART = "PauseForRestart" + + class NeonStorageController(MetricsGetter, LogUtils): def __init__(self, env: NeonEnv, auth_enabled: bool): self.env = env @@ -2531,26 +2548,54 @@ class NeonStorageController(MetricsGetter, LogUtils): ) log.info("storage controller passed consistency check") + def node_registered(self, node_id: int) -> bool: + """ + Returns true if the storage controller can confirm + it knows of pageserver with 'node_id' + """ + try: + self.node_status(node_id) + except StorageControllerApiException as e: + if e.status_code == 404: + return False + else: + raise e + + return True + def poll_node_status( - self, node_id: int, desired_scheduling_policy: str, max_attempts: int, backoff: int + self, + node_id: int, + desired_availability: Optional[PageserverAvailability], + desired_scheduling_policy: Optional[PageserverSchedulingPolicy], + max_attempts: int, + backoff: int, ): """ - Poll the node status until it reaches 'desired_scheduling_policy' or 'max_attempts' have been exhausted + Poll the node status until it reaches 'desired_scheduling_policy' and 'desired_availability' + or 'max_attempts' have been exhausted """ - log.info(f"Polling {node_id} for {desired_scheduling_policy} scheduling policy") + log.info( + f"Polling {node_id} for {desired_scheduling_policy} scheduling policy and {desired_availability} availability" + ) while max_attempts > 0: try: status = self.node_status(node_id) policy = status["scheduling"] - if policy == desired_scheduling_policy: + availability = status["availability"] + if (desired_scheduling_policy is None or policy == desired_scheduling_policy) and ( + desired_availability is None or availability == desired_availability + ): return else: max_attempts -= 1 - log.info(f"Status call returned {policy=} ({max_attempts} attempts left)") + log.info( + f"Status call returned {policy=} {availability=} ({max_attempts} attempts left)" + ) if max_attempts == 0: raise AssertionError( - f"Status for {node_id=} did not reach {desired_scheduling_policy=}" + f"Status for {node_id=} did not reach {desired_scheduling_policy=} {desired_availability=}" ) time.sleep(backoff) @@ -2694,6 +2739,14 @@ class NeonPageserver(PgProtocol, LogUtils): self.id, extra_env_vars=extra_env_vars, timeout_in_seconds=timeout_in_seconds ) self.running = True + + if self.env.storage_controller.running and self.env.storage_controller.node_registered( + self.id + ): + 
self.env.storage_controller.poll_node_status( + self.id, PageserverAvailability.ACTIVE, None, max_attempts=20, backoff=1 + ) + return self def stop(self, immediate: bool = False) -> "NeonPageserver": diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index 3a6113706f..281c9271e9 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -8,7 +8,12 @@ import pytest from fixtures.common_types import TenantId, TenantShardId, TimelineId from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + PageserverAvailability, + PageserverSchedulingPolicy, +) from fixtures.pageserver.http import PageserverHttpClient from fixtures.pg_version import PgVersion @@ -106,7 +111,8 @@ def test_storage_controller_many_tenants( # Default neon_local uses a small timeout: use a longer one to tolerate longer pageserver restarts. # TODO: tune this down as restarts get faster (https://github.com/neondatabase/neon/pull/7553), to # guard against regressions in restart time. - "max_unavailable": "300s" + "max_offline": "30s", + "max_warming_up": "300s", } neon_env_builder.control_plane_compute_hook_api = ( compute_reconfigure_listener.control_plane_compute_hook_api @@ -274,7 +280,11 @@ def test_storage_controller_many_tenants( ) env.storage_controller.poll_node_status( - ps.id, "PauseForRestart", max_attempts=24, backoff=5 + ps.id, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.PAUSE_FOR_RESTART, + max_attempts=24, + backoff=5, ) shard_counts = get_consistent_node_shard_counts(env, total_shards) @@ -285,12 +295,24 @@ def test_storage_controller_many_tenants( assert sum(shard_counts.values()) == total_shards ps.restart() - env.storage_controller.poll_node_status(ps.id, "Active", max_attempts=24, backoff=1) + env.storage_controller.poll_node_status( + ps.id, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.ACTIVE, + max_attempts=24, + backoff=1, + ) env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_fill(ps_id), ps.id, max_attempts=3, backoff=2 ) - env.storage_controller.poll_node_status(ps.id, "Active", max_attempts=24, backoff=5) + env.storage_controller.poll_node_status( + ps.id, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.ACTIVE, + max_attempts=24, + backoff=5, + ) shard_counts = get_consistent_node_shard_counts(env, total_shards) log.info(f"Shard counts after filling node {ps.id}: {shard_counts}") diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 041942cda3..8941ddd281 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -596,19 +596,26 @@ def test_multi_attach( for ps in pageservers: ps.stop() - # Returning to a normal healthy state: all pageservers will start, but only the one most - # recently attached via the control plane will re-attach on startup + # Returning to a normal healthy state: all pageservers will start for ps in pageservers: ps.start() - with pytest.raises(PageserverApiException): - _detail = http_clients[0].timeline_detail(tenant_id, timeline_id) - with pytest.raises(PageserverApiException): - _detail = http_clients[1].timeline_detail(tenant_id, timeline_id) 
- _detail = http_clients[2].timeline_detail(tenant_id, timeline_id) + # Pageservers are marked offline by the storage controller during the rolling restart + # above. This may trigger a reschedulling, so there's no guarantee that the tenant + # shard ends up attached to the most recent ps. + raised = 0 + serving_ps_idx = None + for idx, http_client in enumerate(http_clients): + try: + _detail = http_client.timeline_detail(tenant_id, timeline_id) + serving_ps_idx = idx + except PageserverApiException: + raised += 1 + + assert raised == 2 and serving_ps_idx is not None # All data we wrote while multi-attached remains readable - workload.validate(pageservers[2].id) + workload.validate(pageservers[serving_ps_idx].id) def test_upgrade_generationless_local_file_paths( diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index 4ce53df214..dccc1264e3 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -15,6 +15,10 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): neon_env_builder.enable_pageserver_remote_storage(s3_storage()) neon_env_builder.enable_scrub_on_exit() + # We inject a delay of 15 seconds for tenant activation below. + # Hence, bump the max delay here to not skip over the activation. + neon_env_builder.pageserver_config_override = 'background_task_maximum_delay="20s"' + env = neon_env_builder.init_start() endpoint = env.endpoints.create_start("main") @@ -70,7 +74,7 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): # pageserver does if a compute node connects and sends a request for the tenant # while it's still in Loading state. (It waits for the loading to finish, and then # processes the request.) - tenant_load_delay_ms = 5000 + tenant_load_delay_ms = 15000 env.pageserver.stop() env.pageserver.start( extra_env_vars={"FAILPOINTS": f"before-attaching-tenant=return({tenant_load_delay_ms})"} diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 58d61eab0d..f43141c2d8 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -122,7 +122,12 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, make_httpserver, "scheduling": "Stop", }, ) - env.storage_controller.allowed_errors.append(".*Scheduling is disabled by policy Stop.*") + env.storage_controller.allowed_errors.extend( + [ + ".*Scheduling is disabled by policy Stop.*", + ".*Skipping reconcile for policy Stop.*", + ] + ) # We use a fixed seed to make the test reproducible: we want a randomly # chosen order, but not to change the order every time we run the test. diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 90c6e26d01..9c45af7c1b 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -207,7 +207,8 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder, failpoint: neon_env_builder.storage_controller_config = { # Default neon_local uses a small timeout: use a longer one to tolerate longer pageserver restarts. 
- "max_unavailable": "300s" + "max_offline": "30s", + "max_warming_up": "300s", } env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 741f16685e..9a47d7d651 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -12,6 +12,8 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, + PageserverAvailability, + PageserverSchedulingPolicy, PgBin, StorageControllerApiException, TokenScope, @@ -918,6 +920,8 @@ def test_storage_controller_tenant_deletion( class Failure: pageserver_id: int + offline_timeout: int + must_detect_after: int def apply(self, env: NeonEnv): raise NotImplementedError() @@ -930,9 +934,11 @@ class Failure: class NodeStop(Failure): - def __init__(self, pageserver_ids, immediate): + def __init__(self, pageserver_ids, immediate, offline_timeout, must_detect_after): self.pageserver_ids = pageserver_ids self.immediate = immediate + self.offline_timeout = offline_timeout + self.must_detect_after = must_detect_after def apply(self, env: NeonEnv): for ps_id in self.pageserver_ids: @@ -948,10 +954,42 @@ class NodeStop(Failure): return self.pageserver_ids +class NodeRestartWithSlowReattach(Failure): + def __init__(self, pageserver_id, offline_timeout, must_detect_after): + self.pageserver_id = pageserver_id + self.offline_timeout = offline_timeout + self.must_detect_after = must_detect_after + self.thread = None + + def apply(self, env: NeonEnv): + pageserver = env.get_pageserver(self.pageserver_id) + pageserver.stop(immediate=False) + + def start_ps(): + pageserver.start( + extra_env_vars={"FAILPOINTS": "control-plane-client-re-attach=return(30000)"} + ) + + self.thread = threading.Thread(target=start_ps) + self.thread.start() + + def clear(self, env: NeonEnv): + if self.thread is not None: + self.thread.join() + + pageserver = env.get_pageserver(self.pageserver_id) + pageserver.http_client().configure_failpoints(("control-plane-client-re-attach", "off")) + + def nodes(self): + return [self.pageserver_id] + + class PageserverFailpoint(Failure): - def __init__(self, failpoint, pageserver_id): + def __init__(self, failpoint, pageserver_id, offline_timeout, must_detect_after): self.failpoint = failpoint self.pageserver_id = pageserver_id + self.offline_timeout = offline_timeout + self.must_detect_after = must_detect_after def apply(self, env: NeonEnv): pageserver = env.get_pageserver(self.pageserver_id) @@ -987,15 +1025,28 @@ def build_node_to_tenants_map(env: NeonEnv) -> dict[int, list[TenantId]]: @pytest.mark.parametrize( "failure", [ - NodeStop(pageserver_ids=[1], immediate=False), - NodeStop(pageserver_ids=[1], immediate=True), - NodeStop(pageserver_ids=[1, 2], immediate=True), - PageserverFailpoint(pageserver_id=1, failpoint="get-utilization-http-handler"), + NodeStop(pageserver_ids=[1], immediate=False, offline_timeout=20, must_detect_after=5), + NodeStop(pageserver_ids=[1], immediate=True, offline_timeout=20, must_detect_after=5), + NodeStop(pageserver_ids=[1, 2], immediate=True, offline_timeout=20, must_detect_after=5), + PageserverFailpoint( + pageserver_id=1, + failpoint="get-utilization-http-handler", + offline_timeout=20, + must_detect_after=5, + ), + # Instrument a scenario where the node is slow to re-attach. The re-attach request itself + # should serve as a signal to the storage controller to use a more lenient heartbeat timeout. 
+ NodeRestartWithSlowReattach(pageserver_id=1, offline_timeout=60, must_detect_after=15), ], ) def test_storage_controller_heartbeats( neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, failure: Failure ): + neon_env_builder.storage_controller_config = { + "max_offline": "10s", + "max_warming_up": "20s", + } + neon_env_builder.num_pageservers = 2 env = neon_env_builder.init_configs() env.start() @@ -1061,9 +1112,12 @@ def test_storage_controller_heartbeats( if node["id"] in offline_node_ids: assert node["availability"] == "Offline" - # A node is considered offline if the last successful heartbeat - # was more than 10 seconds ago (hardcoded in the storage controller). - wait_until(20, 1, nodes_offline) + start = time.time() + wait_until(failure.offline_timeout, 1, nodes_offline) + detected_after = time.time() - start + log.info(f"Detected node failures after {detected_after}s") + + assert detected_after >= failure.must_detect_after # .. expecting the tenant on the offline node to be migrated def tenant_migrated(): @@ -1546,7 +1600,13 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_drain(ps_id), ps.id, max_attempts=3, backoff=2 ) - env.storage_controller.poll_node_status(ps.id, "PauseForRestart", max_attempts=6, backoff=5) + env.storage_controller.poll_node_status( + ps.id, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.PAUSE_FOR_RESTART, + max_attempts=6, + backoff=5, + ) shard_counts = get_node_shard_counts(env, tenant_ids) log.info(f"Shard counts after draining node {ps.id}: {shard_counts}") @@ -1556,12 +1616,24 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): assert sum(shard_counts.values()) == total_shards ps.restart() - env.storage_controller.poll_node_status(ps.id, "Active", max_attempts=10, backoff=1) + env.storage_controller.poll_node_status( + ps.id, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.ACTIVE, + max_attempts=10, + backoff=1, + ) env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_fill(ps_id), ps.id, max_attempts=3, backoff=2 ) - env.storage_controller.poll_node_status(ps.id, "Active", max_attempts=6, backoff=5) + env.storage_controller.poll_node_status( + ps.id, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.ACTIVE, + max_attempts=6, + backoff=5, + ) shard_counts = get_node_shard_counts(env, tenant_ids) log.info(f"Shard counts after filling node {ps.id}: {shard_counts}") @@ -1606,11 +1678,23 @@ def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder): backoff=2, ) - env.storage_controller.poll_node_status(ps_id_to_drain, "Draining", max_attempts=6, backoff=2) + env.storage_controller.poll_node_status( + ps_id_to_drain, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.DRAINING, + max_attempts=6, + backoff=2, + ) env.storage_controller.cancel_node_drain(ps_id_to_drain) - env.storage_controller.poll_node_status(ps_id_to_drain, "Active", max_attempts=6, backoff=2) + env.storage_controller.poll_node_status( + ps_id_to_drain, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.ACTIVE, + max_attempts=6, + backoff=2, + ) @pytest.mark.parametrize("while_offline", [True, False]) diff --git a/test_runner/regress/test_threshold_based_eviction.py b/test_runner/regress/test_threshold_based_eviction.py index b62398d427..840c7159ad 100644 --- a/test_runner/regress/test_threshold_based_eviction.py +++ 
b/test_runner/regress/test_threshold_based_eviction.py @@ -48,13 +48,12 @@ def test_threshold_based_eviction( tenant_id, timeline_id = env.initial_tenant, env.initial_timeline ps_http = env.pageserver.http_client() - assert ps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == { - "kind": "NoEviction" - } + vps_http = env.storage_controller.pageserver_api() + assert vps_http.tenant_config(tenant_id).effective_config["eviction_policy"] is None eviction_threshold = 10 eviction_period = 2 - ps_http.set_tenant_config( + vps_http.set_tenant_config( tenant_id, { "eviction_policy": { @@ -64,7 +63,7 @@ def test_threshold_based_eviction( }, }, ) - assert ps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == { + assert vps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == { "kind": "LayerAccessThreshold", "threshold": f"{eviction_threshold}s", "period": f"{eviction_period}s", @@ -73,7 +72,7 @@ def test_threshold_based_eviction( # restart because changing tenant config is not instant env.pageserver.restart() - assert ps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == { + assert vps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == { "kind": "LayerAccessThreshold", "threshold": f"{eviction_threshold}s", "period": f"{eviction_period}s", @@ -81,7 +80,7 @@ def test_threshold_based_eviction( # create a bunch of L1s, only the least of which will need to be resident compaction_threshold = 3 # create L1 layers quickly - ps_http.patch_tenant_config_client_side( + vps_http.patch_tenant_config_client_side( tenant_id, inserts={ # Disable gc and compaction to avoid on-demand downloads from their side. @@ -154,7 +153,7 @@ def test_threshold_based_eviction( while time.time() - started_waiting_at < observation_window: current = ( time.time(), - MapInfoProjection(ps_http.layer_map_info(tenant_id, timeline_id)), + MapInfoProjection(vps_http.layer_map_info(tenant_id, timeline_id)), ) last = map_info_changes[-1] if map_info_changes else (0, None) if last[1] is None or current[1] != last[1]: From 24ea9f9f600d588bbae8812ce2c8e6570fad67f0 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 25 Jul 2024 14:19:38 +0100 Subject: [PATCH 1271/1571] tests: always scrub on test exit when using S3Storage (#8437) ## Problem Currently, tests may have a scrub during teardown if they ask for it, but most tests don't request it. To detect "unknown unknowns", let's run it at the end of every test where possible. This is similar to asserting that there are no errors in the log at the end of tests. ## Summary of changes - Remove explicit `enable_scrub_on_exit` - Always scrub if remote storage is an S3Storage. 
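The opt-out model described above boils down to a single conditional in the fixture teardown: scrub only when the remote storage is S3-compatible and the test has not opted out. The sketch below is illustrative only and uses placeholder names (`maybe_scrub_on_exit`, `run_scrubber`, simplified storage classes) rather than the real fixture API; the actual change is in the diff that follows.

```python
# Illustrative sketch only: `S3Storage`, `LocalFsStorage`, and `maybe_scrub_on_exit`
# are placeholder names standing in for the real fixture machinery.
from dataclasses import dataclass
from typing import Callable, Union


@dataclass
class S3Storage:
    bucket: str


@dataclass
class LocalFsStorage:
    root: str


def maybe_scrub_on_exit(
    remote_storage: Union[S3Storage, LocalFsStorage],
    enable_scrub_on_exit: bool,
    run_scrubber: Callable[[], None],
) -> None:
    # The scrubber needs an S3-compatible endpoint, so LocalFs runs are skipped,
    # and tests that intentionally leave the bucket empty or corrupt opt out
    # by setting enable_scrub_on_exit to False.
    if isinstance(remote_storage, S3Storage) and enable_scrub_on_exit:
        run_scrubber()


# Example teardown calls: scrubs for the S3-backed run, no-op for the LocalFs run.
maybe_scrub_on_exit(S3Storage(bucket="test-bucket"), True, lambda: print("scan_metadata"))
maybe_scrub_on_exit(LocalFsStorage(root="/tmp/remote"), True, lambda: print("scan_metadata"))
```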
--- test_runner/fixtures/neon_fixtures.py | 33 ++++++++----------- .../regress/test_pageserver_restart.py | 2 -- .../regress/test_pageserver_secondary.py | 6 ++++ test_runner/regress/test_pg_regress.py | 3 -- test_runner/regress/test_sharding.py | 3 -- test_runner/regress/test_tenant_delete.py | 3 ++ test_runner/regress/test_timeline_delete.py | 6 ++++ 7 files changed, 29 insertions(+), 27 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 76ab46b01a..d6718fca39 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -523,7 +523,7 @@ class NeonEnvBuilder: self.preserve_database_files = preserve_database_files self.initial_tenant = initial_tenant or TenantId.generate() self.initial_timeline = initial_timeline or TimelineId.generate() - self.scrub_on_exit = False + self.enable_scrub_on_exit = True self.test_output_dir = test_output_dir self.test_overlay_dir = test_overlay_dir self.overlay_mounts_created_by_us: List[Tuple[str, Path]] = [] @@ -852,6 +852,13 @@ class NeonEnvBuilder: ) ident_state_dir.rmdir() # should be empty since we moved `upper` out + def disable_scrub_on_exit(self): + """ + Some tests intentionally leave the remote storage contents empty or corrupt, + so it doesn't make sense to do the usual scrub at the end of the test. + """ + self.enable_scrub_on_exit = False + def overlay_cleanup_teardown(self): """ Unmount the overlayfs mounts created by `self.overlay_mount()`. @@ -877,23 +884,6 @@ class NeonEnvBuilder: # assert all overlayfs mounts in our test directory are gone assert [] == list(overlayfs.iter_mounts_beneath(self.test_overlay_dir)) - def enable_scrub_on_exit(self): - """ - Call this if you would like the fixture to automatically run - storage_scrubber at the end of the test, as a bidirectional test - that the scrubber is working properly, and that the code within - the test didn't produce any invalid remote state. - """ - - if not isinstance(self.pageserver_remote_storage, S3Storage): - # The scrubber can't talk to e.g. LocalFS -- it needs - # an HTTP endpoint (mock is fine) to connect to. - raise RuntimeError( - "Cannot scrub with remote_storage={self.pageserver_remote_storage}, require an S3 endpoint" - ) - - self.scrub_on_exit = True - def enable_pageserver_remote_storage( self, remote_storage_kind: RemoteStorageKind, @@ -995,7 +985,12 @@ class NeonEnvBuilder: ) cleanup_error = None - if self.scrub_on_exit: + # If we are running with S3Storage (required by the scrubber), check that whatever the test + # did does not generate any corruption + if ( + isinstance(self.env.pageserver_remote_storage, S3Storage) + and self.enable_scrub_on_exit + ): try: self.env.storage_scrubber.scan_metadata() except Exception as e: diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index dccc1264e3..68a45f957c 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -13,7 +13,6 @@ from fixtures.utils import wait_until # running. def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - neon_env_builder.enable_scrub_on_exit() # We inject a delay of 15 seconds for tenant activation below. # Hence, bump the max delay here to not skip over the activation. 
@@ -161,7 +160,6 @@ def test_pageserver_chaos( pytest.skip("times out in debug builds") neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - neon_env_builder.enable_scrub_on_exit() if shard_count is not None: neon_env_builder.num_pageservers = shard_count diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index f43141c2d8..53f69b5b26 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -390,6 +390,9 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder): # (reproduce https://github.com/neondatabase/neon/issues/6802) pageserver_b.http_client().tenant_delete(tenant_id) + # We deleted our only tenant, and the scrubber fails if it detects nothing + neon_env_builder.disable_scrub_on_exit() + def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder): """ @@ -589,6 +592,9 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ) workload.stop() + # We deleted our only tenant, and the scrubber fails if it detects nothing + neon_env_builder.disable_scrub_on_exit() + def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): """ diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index d5b5ac3f75..6f7ea0092a 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -138,7 +138,6 @@ def test_pg_regress( neon_env_builder.num_pageservers = shard_count neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - neon_env_builder.enable_scrub_on_exit() env = neon_env_builder.init_start( initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count, @@ -202,7 +201,6 @@ def test_isolation( if shard_count is not None: neon_env_builder.num_pageservers = shard_count neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - neon_env_builder.enable_scrub_on_exit() env = neon_env_builder.init_start( initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count ) @@ -265,7 +263,6 @@ def test_sql_regress( if shard_count is not None: neon_env_builder.num_pageservers = shard_count neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - neon_env_builder.enable_scrub_on_exit() env = neon_env_builder.init_start( initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count ) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 9c45af7c1b..bc43bc77fa 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -47,7 +47,6 @@ def test_sharding_smoke( # Use S3-compatible remote storage so that we can scrub: this test validates # that the scrubber doesn't barf when it sees a sharded tenant. neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - neon_env_builder.enable_scrub_on_exit() neon_env_builder.preserve_database_files = True @@ -128,7 +127,6 @@ def test_sharding_smoke( # Check the scrubber isn't confused by sharded content, then disable # it during teardown because we'll have deleted by then env.storage_scrubber.scan_metadata() - neon_env_builder.scrub_on_exit = False env.storage_controller.pageserver_api().tenant_delete(tenant_id) assert_prefix_empty( @@ -373,7 +371,6 @@ def test_sharding_split_smoke( # Use S3-compatible remote storage so that we can scrub: this test validates # that the scrubber doesn't barf when it sees a sharded tenant. 
neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - neon_env_builder.enable_scrub_on_exit() neon_env_builder.preserve_database_files = True diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 6d20b3d0de..c343b349cf 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -315,6 +315,9 @@ def test_tenant_delete_races_timeline_creation( # Zero tenants remain (we deleted the default tenant) assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 0 + # We deleted our only tenant, and the scrubber fails if it detects nothing + neon_env_builder.disable_scrub_on_exit() + def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder): """ diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index da37f469b3..6d96dda391 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -485,6 +485,9 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild lambda: assert_prefix_empty(neon_env_builder.pageserver_remote_storage), ) + # We deleted our only tenant, and the scrubber fails if it detects nothing + neon_env_builder.disable_scrub_on_exit() + @pytest.mark.parametrize( "stuck_failpoint", @@ -703,6 +706,9 @@ def test_timeline_delete_works_for_remote_smoke( # Assume it is mock server inconsistency and check twice. wait_until(2, 0.5, lambda: assert_prefix_empty(neon_env_builder.pageserver_remote_storage)) + # We deleted our only tenant, and the scrubber fails if it detects nothing + neon_env_builder.disable_scrub_on_exit() + def test_delete_orphaned_objects( neon_env_builder: NeonEnvBuilder, From 775c0c88928f4c94de7dc52043b982a9791f6c48 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 25 Jul 2024 15:00:42 +0100 Subject: [PATCH 1272/1571] tests: adjust threshold in test_partial_evict_tenant (#8509) ## Problem This test was destabilized by https://github.com/neondatabase/neon/pull/8431. The threshold is arbitrary & failures are still quite close to it. At a high level the test is asserting "eviction was approximately fair to these tenants", which appears to still be the case when the abs diff between ratios is slightly higher at ~0.6-0.7. ## Summary of changes - Change threshold from 0.06 to 0.065. Based on the last ~10 failures that should be sufficient. --- test_runner/regress/test_disk_usage_eviction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 91c7b97fdd..85616c3fe2 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -591,7 +591,7 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder): abs_diff = abs(ratio - expected_ratio) assert original_count > count_now - expectation = 0.06 + expectation = 0.065 log.info( f"tenant {tenant_id} layer count {original_count} -> {count_now}, ratio: {ratio}, expecting {abs_diff} < {expectation}" ) From 3977e0a7a3d8a8c205d35dde621c90b39e51dbf2 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 25 Jul 2024 15:13:34 +0100 Subject: [PATCH 1273/1571] storcon: shutdown with clean observed state (#8494) ## Problem Storcon shutdown did not produce a clean observed state. 
This is not a problem at the moment, but we will need to stop all reconciles with clean observed state for rolling restarts. I tried to test this by collecting the observed state during shutdown and comparing it with the in-memory observed state, but it doesn't work because a lot of tests use the cursed attach hook to create tenants directly through the ps. ## Summary of Changes Rework storcon shutdown as follows: * Reconcilers get a separate cancellation token which is a child token of the global `Service::cancel`. * Reconcilers get a separate gate * Add a mechanism to drain the reconciler result queue before * Put all of this together into a clean shutdown sequence Related https://github.com/neondatabase/cloud/issues/14701 --- storage_controller/src/service.rs | 58 ++++++++++++++++---------- storage_controller/src/tenant_shard.rs | 7 +++- 2 files changed, 42 insertions(+), 23 deletions(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 860fe4802a..e890c5e45e 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -278,7 +278,7 @@ pub struct Service { config: Config, persistence: Arc, compute_hook: Arc, - result_tx: tokio::sync::mpsc::UnboundedSender, + result_tx: tokio::sync::mpsc::UnboundedSender, heartbeater: Heartbeater, @@ -308,9 +308,15 @@ pub struct Service { // Process shutdown will fire this token cancel: CancellationToken, + // Child token of [`Service::cancel`] used by reconcilers + reconcilers_cancel: CancellationToken, + // Background tasks will hold this gate gate: Gate, + // Reconcilers background tasks will hold this gate + reconcilers_gate: Gate, + /// This waits for initial reconciliation with pageservers to complete. Until this barrier /// passes, it isn't safe to do any actions that mutate tenants. pub(crate) startup_complete: Barrier, @@ -397,6 +403,11 @@ struct ShardUpdate { generation: Option, } +pub(crate) enum ReconcileResultRequest { + ReconcileResult(ReconcileResult), + Stop, +} + impl Service { pub fn get_config(&self) -> &Config { &self.config @@ -753,7 +764,7 @@ impl Service { const BACKGROUND_RECONCILE_PERIOD: Duration = Duration::from_secs(20); let mut interval = tokio::time::interval(BACKGROUND_RECONCILE_PERIOD); - while !self.cancel.is_cancelled() { + while !self.reconcilers_cancel.is_cancelled() { tokio::select! { _ = interval.tick() => { let reconciles_spawned = self.reconcile_all(); @@ -766,7 +777,7 @@ impl Service { } } } - _ = self.cancel.cancelled() => return + _ = self.reconcilers_cancel.cancelled() => return } } } @@ -937,7 +948,7 @@ impl Service { async fn process_results( &self, - mut result_rx: tokio::sync::mpsc::UnboundedReceiver, + mut result_rx: tokio::sync::mpsc::UnboundedReceiver, mut bg_compute_hook_result_rx: tokio::sync::mpsc::Receiver< Result<(), (TenantShardId, NotifyError)>, >, @@ -947,8 +958,8 @@ impl Service { tokio::select! 
{ r = result_rx.recv() => { match r { - Some(result) => {self.process_result(result);}, - None => {break;} + Some(ReconcileResultRequest::ReconcileResult(result)) => {self.process_result(result);}, + None | Some(ReconcileResultRequest::Stop) => {break;} } } _ = async{ @@ -974,9 +985,6 @@ impl Service { } }; } - - // We should only fall through on shutdown - assert!(self.cancel.is_cancelled()); } async fn process_aborts( @@ -1153,6 +1161,8 @@ impl Service { tokio::sync::mpsc::channel(MAX_DELAYED_RECONCILES); let cancel = CancellationToken::new(); + let reconcilers_cancel = cancel.child_token(); + let heartbeater = Heartbeater::new( config.jwt_token.clone(), config.max_offline_interval, @@ -1178,7 +1188,9 @@ impl Service { abort_tx, startup_complete: startup_complete.clone(), cancel, + reconcilers_cancel, gate: Gate::default(), + reconcilers_gate: Gate::default(), tenant_op_locks: Default::default(), node_op_locks: Default::default(), }); @@ -5132,7 +5144,7 @@ impl Service { } }; - let Ok(gate_guard) = self.gate.enter() else { + let Ok(gate_guard) = self.reconcilers_gate.enter() else { // Gate closed: we're shutting down, drop out. return None; }; @@ -5145,7 +5157,7 @@ impl Service { &self.persistence, units, gate_guard, - &self.cancel, + &self.reconcilers_cancel, ) } @@ -5592,17 +5604,21 @@ impl Service { } pub async fn shutdown(&self) { - // Note that this already stops processing any results from reconciles: so - // we do not expect that our [`TenantShard`] objects will reach a neat - // final state. + // Cancel all on-going reconciles and wait for them to exit the gate. + tracing::info!("Shutting down: cancelling and waiting for in-flight reconciles"); + self.reconcilers_cancel.cancel(); + self.reconcilers_gate.close().await; + + // Signal the background loop in [`Service::process_results`] to exit once + // it has proccessed the results from all the reconciles we cancelled earlier. + tracing::info!("Shutting down: processing results from previously in-flight reconciles"); + self.result_tx.send(ReconcileResultRequest::Stop).ok(); + self.result_tx.closed().await; + + // Background tasks hold gate guards: this notifies them of the cancellation and + // waits for them all to complete. + tracing::info!("Shutting down: cancelling and waiting for background tasks to exit"); self.cancel.cancel(); - - // The cancellation tokens in [`crate::reconciler::Reconciler`] are children - // of our cancellation token, so we do not need to explicitly cancel each of - // them. - - // Background tasks and reconcilers hold gate guards: this waits for them all - // to complete. 
self.gate.close().await; } diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index ee2ba6c4ee..670efae154 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -9,6 +9,7 @@ use crate::{ persistence::TenantShardPersistence, reconciler::ReconcileUnits, scheduler::{AffinityScore, MaySchedule, RefCountUpdate, ScheduleContext}, + service::ReconcileResultRequest, }; use pageserver_api::controller_api::{ NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, @@ -1059,7 +1060,7 @@ impl TenantShard { #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] pub(crate) fn spawn_reconciler( &mut self, - result_tx: &tokio::sync::mpsc::UnboundedSender, + result_tx: &tokio::sync::mpsc::UnboundedSender, pageservers: &Arc>, compute_hook: &Arc, service_config: &service::Config, @@ -1183,7 +1184,9 @@ impl TenantShard { pending_compute_notification: reconciler.compute_notify_failure, }; - result_tx.send(result).ok(); + result_tx + .send(ReconcileResultRequest::ReconcileResult(result)) + .ok(); } .instrument(reconciler_span), ); From bea0468f1f62d7573e4c5167bc9b19e43f500ced Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." Date: Thu, 25 Jul 2024 12:56:37 -0400 Subject: [PATCH 1274/1571] fix(pageserver): allow incomplete history in btm-gc-compaction (#8500) This pull request (should) fix the failure of test_gc_feedback. See the explanation in the newly-added test case. Part of https://github.com/neondatabase/neon/issues/8002 Allow incomplete history for the compaction algorithm. Signed-off-by: Alex Chi Z --- pageserver/src/tenant.rs | 121 ++++++++++++++++++- pageserver/src/tenant/timeline/compaction.rs | 13 +- 2 files changed, 125 insertions(+), 9 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index f359326cc0..41d8a40941 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -7309,7 +7309,9 @@ mod tests { ( key, Lsn(0x80), - Value::WalRecord(NeonWalRecord::wal_append(";0x80")), + Value::Image(Bytes::copy_from_slice( + b"0x10;0x20;0x30;0x40;0x50;0x60;0x70;0x80", + )), ), ( key, @@ -7371,7 +7373,9 @@ mod tests { ), ( Lsn(0x80), - Value::WalRecord(NeonWalRecord::wal_append(";0x80")), + Value::Image(Bytes::copy_from_slice( + b"0x10;0x20;0x30;0x40;0x50;0x60;0x70;0x80", + )), ), ( Lsn(0x90), @@ -7380,7 +7384,118 @@ mod tests { ]), }; assert_eq!(res, expected_res); - // TODO: more tests with mixed image + delta, adding with k-merge test cases; e2e compaction test + + // We expect GC-compaction to run with the original GC. This would create a situation that + // the original GC algorithm removes some delta layers b/c there are full image coverage, + // therefore causing some keys to have an incomplete history below the lowest retain LSN. + // For example, we have + // ```plain + // init delta @ 0x10, image @ 0x20, delta @ 0x30 (gc_horizon), image @ 0x40. + // ``` + // Now the GC horizon moves up, and we have + // ```plain + // init delta @ 0x10, image @ 0x20, delta @ 0x30, image @ 0x40 (gc_horizon) + // ``` + // The original GC algorithm kicks in, and removes delta @ 0x10, image @ 0x20. + // We will end up with + // ```plain + // delta @ 0x30, image @ 0x40 (gc_horizon) + // ``` + // Now we run the GC-compaction, and this key does not have a full history. + // We should be able to handle this partial history and drop everything before the + // gc_horizon image. 
+ + let history = vec![ + ( + key, + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append(";0x20")), + ), + ( + key, + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append(";0x30")), + ), + ( + key, + Lsn(0x40), + Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40")), + ), + ( + key, + Lsn(0x50), + Value::WalRecord(NeonWalRecord::wal_append(";0x50")), + ), + ( + key, + Lsn(0x60), + Value::WalRecord(NeonWalRecord::wal_append(";0x60")), + ), + ( + key, + Lsn(0x70), + Value::WalRecord(NeonWalRecord::wal_append(";0x70")), + ), + ( + key, + Lsn(0x80), + Value::Image(Bytes::copy_from_slice( + b"0x10;0x20;0x30;0x40;0x50;0x60;0x70;0x80", + )), + ), + ( + key, + Lsn(0x90), + Value::WalRecord(NeonWalRecord::wal_append(";0x90")), + ), + ]; + let res = tline + .generate_key_retention(key, &history, Lsn(0x60), &[Lsn(0x40), Lsn(0x50)], 3) + .await + .unwrap(); + let expected_res = KeyHistoryRetention { + below_horizon: vec![ + ( + Lsn(0x40), + KeyLogAtLsn(vec![( + Lsn(0x40), + Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40")), + )]), + ), + ( + Lsn(0x50), + KeyLogAtLsn(vec![( + Lsn(0x50), + Value::WalRecord(NeonWalRecord::wal_append(";0x50")), + )]), + ), + ( + Lsn(0x60), + KeyLogAtLsn(vec![( + Lsn(0x60), + Value::WalRecord(NeonWalRecord::wal_append(";0x60")), + )]), + ), + ], + above_horizon: KeyLogAtLsn(vec![ + ( + Lsn(0x70), + Value::WalRecord(NeonWalRecord::wal_append(";0x70")), + ), + ( + Lsn(0x80), + Value::Image(Bytes::copy_from_slice( + b"0x10;0x20;0x30;0x40;0x50;0x60;0x70;0x80", + )), + ), + ( + Lsn(0x90), + Value::WalRecord(NeonWalRecord::wal_append(";0x90")), + ), + ]), + }; + assert_eq!(res, expected_res); + Ok(()) } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 487ff6cd80..2c7ae911df 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1122,9 +1122,10 @@ impl Timeline { ); } } - if let Value::WalRecord(rec) = &history[0].2 { - assert!(rec.will_init(), "no base image"); - } + // There was an assertion for no base image that checks if the first + // record in the history is `will_init` before, but it was removed. + // This is explained in the test cases for generate_key_retention. + // Search "incomplete history" for more information. 
for lsn in retain_lsn_below_horizon { assert!(lsn < &horizon, "retain lsn must be below horizon") } @@ -1200,9 +1201,6 @@ impl Timeline { false }; replay_history.extend(split_for_lsn.iter().map(|x| (*x).clone())); - if let Some((_, _, val)) = replay_history.first() { - assert!(val.will_init(), "invalid history, no base image"); - } // Only retain the items after the last image record for idx in (0..replay_history.len()).rev() { if replay_history[idx].2.will_init() { @@ -1210,6 +1208,9 @@ impl Timeline { break; } } + if let Some((_, _, val)) = replay_history.first() { + assert!(val.will_init(), "invalid history, no base image"); + } if generate_image && records_since_last_image > 0 { records_since_last_image = 0; let history = std::mem::take(&mut replay_history); From 9bfa180f2e9fc08f70d985cdac093b680ec6a18f Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 25 Jul 2024 21:21:58 +0100 Subject: [PATCH 1275/1571] Update pgrx to v 0.11.3 (#8515) update pg_jsonschema extension to v 0.3.1 update pg_graphql extension to v1.5.7 update pgx_ulid extension to v0.1.5 update pg_tiktoken extension, patch Cargo.toml to use new pgrx --- Dockerfile.compute-node | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 48a52bfc6d..5e53a55316 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -657,7 +657,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux chmod +x rustup-init && \ ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ rm rustup-init && \ - cargo install --locked --version 0.10.2 cargo-pgrx && \ + cargo install --locked --version 0.11.3 cargo-pgrx && \ /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' USER root @@ -672,10 +672,15 @@ USER root FROM rust-extensions-build AS pg-jsonschema-pg-build ARG PG_VERSION -RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.2.0.tar.gz -O pg_jsonschema.tar.gz && \ - echo "9118fc508a6e231e7a39acaa6f066fcd79af17a5db757b47d2eefbe14f7794f0 pg_jsonschema.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.1.tar.gz -O pg_jsonschema.tar.gz && \ + echo "61df3db1ed83cf24f6aa39c826f8818bfa4f0bd33b587fd6b2b1747985642297 pg_jsonschema.tar.gz" | sha256sum --check && \ mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \ - sed -i 's/pgrx = "0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + # see commit 252b3685a27a0f4c31a0f91e983c6314838e89e8 + # `unsafe-postgres` feature allows to build pgx extensions + # against postgres forks that decided to change their ABI name (like us). + # With that we can build extensions without forking them and using stock + # pgx. As this feature is new few manual version bumps were required. + sed -i 's/pgrx = "0.11.3"/pgrx = { version = "0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control @@ -689,10 +694,10 @@ RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.2.0.tar. 
FROM rust-extensions-build AS pg-graphql-pg-build ARG PG_VERSION -RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.4.0.tar.gz -O pg_graphql.tar.gz && \ - echo "bd8dc7230282b3efa9ae5baf053a54151ed0e66881c7c53750e2d0c765776edc pg_graphql.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.7.tar.gz -O pg_graphql.tar.gz && \ + echo "2b3e567a5b31019cb97ae0e33263c1bcc28580be5a444ac4c8ece5c4be2aea41 pg_graphql.tar.gz" | sha256sum --check && \ mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . && \ - sed -i 's/pgrx = "=0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ # it's needed to enable extension because it uses untrusted C language sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \ @@ -712,6 +717,9 @@ ARG PG_VERSION RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \ echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \ mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \ + # TODO update pgrx version in the pg_tiktoken repo and remove this line + sed -i 's/pgrx = { version = "=0.10.2",/pgrx = { version = "0.11.3",/g' Cargo.toml && \ + sed -i 's/pgrx-tests = "=0.10.2"/pgrx-tests = "0.11.3"/g' Cargo.toml && \ cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control @@ -725,14 +733,10 @@ RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6 FROM rust-extensions-build AS pg-pgx-ulid-build ARG PG_VERSION -RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.3.tar.gz -O pgx_ulid.tar.gz && \ - echo "ee5db82945d2d9f2d15597a80cf32de9dca67b897f605beb830561705f12683c pgx_ulid.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -O pgx_ulid.tar.gz && \ + echo "9d1659a2da65af0133d5451c454de31b37364e3502087dadf579f790bc8bef17 pgx_ulid.tar.gz" | sha256sum --check && \ mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . 
&& \ - echo "******************* Apply a patch for Postgres 16 support; delete in the next release ******************" && \ - wget https://github.com/pksunkara/pgx_ulid/commit/f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \ - patch -p1 < f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \ - echo "********************************************************************************************************" && \ - sed -i 's/pgrx = "=0.10.2"/pgrx = { version = "=0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "^0.11.2"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control From 857a1823fef4d0254eb132caa6fd9e6d6c08a632 Mon Sep 17 00:00:00 2001 From: Mihai Bojin Date: Fri, 26 Jul 2024 01:14:12 +0100 Subject: [PATCH 1276/1571] Update links in synthetic-size.md (#8501) --- docs/synthetic-size.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/synthetic-size.md b/docs/synthetic-size.md index 3acb4e18cb..b6b90d90c2 100644 --- a/docs/synthetic-size.md +++ b/docs/synthetic-size.md @@ -21,9 +21,9 @@ implementation where we keep more data than we would need to, do not change the synthetic size or incur any costs to the user. The synthetic size is calculated for the whole project. It is not -straightforward to attribute size to individual branches. See "What is -the size of an individual branch?" for discussion on those -difficulties. +straightforward to attribute size to individual branches. See [What is +the size of an individual branch?](#what-is-the-size-of-an-individual-branch) +for a discussion of those difficulties. The synthetic size is designed to: @@ -40,8 +40,9 @@ The synthetic size is designed to: - logical size is the size of a branch *at a given point in time*. It's the total size of all tables in all databases, as you see with "\l+" in psql for example, plus the Postgres SLRUs and some - small amount of metadata. NOTE that currently, Neon does not include - the SLRUs and metadata in the logical size. See comment to `get_current_logical_size_non_incremental()`. + small amount of metadata. Note that currently, Neon does not include + the SLRUs and metadata in the logical size. Refer to the comment in + [`get_current_logical_size_non_incremental()`](/pageserver/src/pgdatadir_mapping.rs#L813-L814). - a "point in time" is defined as an LSN value. You can convert a timestamp to an LSN, but the storage internally works with LSNs. From 8e02db1ab926718a959ea0aa09ac069527953857 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 26 Jul 2024 02:16:10 +0200 Subject: [PATCH 1277/1571] Handle NotInitialized::ShuttingDown error in shard split (#8506) There is a race condition between timeline shutdown and the split task. Timeline shutdown first shuts down the upload queue, and only then fires the cancellation token. A parallel running timeline split operation might thus encounter a cancelled upload queue before the cancellation token is fired, and print a noisy error. Fix this by mapping `anyhow::Error{ NotInitialized::ShuttingDown }) to `FlushLayerError::Cancelled` instead of `FlushLayerError::Other(_)`. 
Fixes #8496 --- pageserver/src/tenant/timeline.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 8829040c70..6a29fc1ce1 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -137,7 +137,7 @@ use self::layer_manager::LayerManager; use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; -use super::config::TenantConf; +use super::{config::TenantConf, upload_queue::NotInitialized}; use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer}; @@ -642,7 +642,13 @@ impl FlushLayerError { // When crossing from generic anyhow errors to this error type, we explicitly check // for timeline cancellation to avoid logging inoffensive shutdown errors as warn/err. fn from_anyhow(timeline: &Timeline, err: anyhow::Error) -> Self { - if timeline.cancel.is_cancelled() { + let cancelled = timeline.cancel.is_cancelled() + // The upload queue might have been shut down before the official cancellation of the timeline. + || err + .downcast_ref::() + .map(NotInitialized::is_stopping) + .unwrap_or_default(); + if cancelled { Self::Cancelled } else { Self::Other(Arc::new(err)) From 8182bfdf0126fb3bb2fbdf7761e71f63083df779 Mon Sep 17 00:00:00 2001 From: Andrey Taranik Date: Fri, 26 Jul 2024 10:55:57 +0300 Subject: [PATCH 1278/1571] Using own registry to cache layers in docker build workflows (#8521) ## Problem follow up for #8475 ## Summary of changes Using own private docker registry in `cache-from` and `cache-to` settings in docker build-push actions --- .github/workflows/build-build-tools-image.yml | 10 +++++-- .github/workflows/build_and_test.yml | 27 ++++++++++++------- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index a69686bf2a..76fc58151a 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -72,6 +72,12 @@ jobs: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + - uses: docker/login-action@v3 + with: + registry: cache.neon.build + username: ${{ secrets.NEON_CI_DOCKERCACHE_USERNAME }} + password: ${{ secrets.NEON_CI_DOCKERCACHE_PASSWORD }} + - uses: docker/build-push-action@v6 with: context: . 
@@ -79,8 +85,8 @@ jobs: push: true pull: true file: Dockerfile.build-tools - cache-from: type=registry,ref=neondatabase/build-tools:cache-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/build-tools:cache-{0},mode=max', matrix.arch) || '' }} + cache-from: type=registry,ref=cache.neon.build/build-tools:cache-${{ matrix.arch }} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/build-tools:cache-{0},mode=max', matrix.arch) || '' }} tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }} - name: Remove custom docker config directory diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index d4af174fc5..885f4058d0 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -499,6 +499,12 @@ jobs: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + - uses: docker/login-action@v3 + with: + registry: cache.neon.build + username: ${{ secrets.NEON_CI_DOCKERCACHE_USERNAME }} + password: ${{ secrets.NEON_CI_DOCKERCACHE_PASSWORD }} + - uses: docker/build-push-action@v6 with: context: . @@ -510,9 +516,8 @@ jobs: push: true pull: true file: Dockerfile - cache-from: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }} - # 23.07.2024 temporarily disable cache saving in the registry as it is very slow - # cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/neon:cache-{0},mode=max', matrix.arch) || '' }} + cache-from: type=registry,ref=cache.neon.build/neon:cache-${{ matrix.arch }} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon:cache-{0},mode=max', matrix.arch) || '' }} tags: | neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} @@ -591,6 +596,12 @@ jobs: username: ${{ secrets.AWS_ACCESS_KEY_DEV }} password: ${{ secrets.AWS_SECRET_KEY_DEV }} + - uses: docker/login-action@v3 + with: + registry: cache.neon.build + username: ${{ secrets.NEON_CI_DOCKERCACHE_USERNAME }} + password: ${{ secrets.NEON_CI_DOCKERCACHE_PASSWORD }} + - name: Build compute-node image uses: docker/build-push-action@v6 with: @@ -604,9 +615,8 @@ jobs: push: true pull: true file: Dockerfile.compute-node - cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }} - # 23.07.2024 temporarily disable cache saving in the registry as it is very slow - # cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/compute-node-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }} + cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }} tags: | neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} @@ -625,9 +635,8 @@ jobs: pull: true file: Dockerfile.compute-node target: neon-pg-ext-test - cache-from: type=registry,ref=neondatabase/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }} - # 23.07.2024 temporarily disable cache saving in the registry as it is very slow - # cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }} + cache-from: 
type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }} tags: | neondatabase/neon-test-extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}-${{ matrix.arch }} From 6711087ddf3f4f3bcbe2a89e026f436d5fe415d3 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 26 Jul 2024 08:57:52 +0100 Subject: [PATCH 1279/1571] remote_storage: expose last_modified in listings (#8497) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem The scrubber would like to check the highest mtime in a tenant's objects as a safety check during purges. It recently switched to use GenericRemoteStorage, so we need to expose that in the listing methods. ## Summary of changes - In Listing.keys, return a ListingObject{} including a last_modified field, instead of a RemotePath --------- Co-authored-by: Arpad Müller --- libs/remote_storage/src/azure_blob.rs | 7 ++- libs/remote_storage/src/lib.rs | 10 ++++- libs/remote_storage/src/local_fs.rs | 45 ++++++++++++------- libs/remote_storage/src/s3_bucket.rs | 28 +++++++++--- libs/remote_storage/tests/common/tests.rs | 2 + libs/remote_storage/tests/test_real_s3.rs | 1 + pageserver/src/tenant/mgr.rs | 1 + .../src/tenant/remote_timeline_client.rs | 19 ++++---- .../tenant/remote_timeline_client/download.rs | 9 ++-- safekeeper/src/wal_backup.rs | 7 ++- storage_scrubber/src/garbage.rs | 10 +++-- 11 files changed, 96 insertions(+), 43 deletions(-) diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index acd95a5255..6ca4ae43f2 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -33,6 +33,7 @@ use tracing::debug; use utils::backoff; use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind}; +use crate::ListingObject; use crate::{ config::AzureConfig, error::Cancelled, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata, TimeTravelError, TimeoutOrCancel, @@ -352,7 +353,11 @@ impl RemoteStorage for AzureBlobStorage { let blob_iter = entry .blobs .blobs() - .map(|k| self.name_to_relative_path(&k.name)); + .map(|k| ListingObject{ + key: self.name_to_relative_path(&k.name), + last_modified: k.properties.last_modified.into() + } + ); for key in blob_iter { res.keys.push(key); diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 0fed86f4b8..75aa28233b 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -149,10 +149,16 @@ pub enum ListingMode { NoDelimiter, } +#[derive(PartialEq, Eq, Debug)] +pub struct ListingObject { + pub key: RemotePath, + pub last_modified: SystemTime, +} + #[derive(Default)] pub struct Listing { pub prefixes: Vec, - pub keys: Vec, + pub keys: Vec, } /// Storage (potentially remote) API to manage its state. 
@@ -201,7 +207,7 @@ pub trait RemoteStorage: Send + Sync + 'static { let mut combined = stream.next().await.expect("At least one item required")?; while let Some(list) = stream.next().await { let list = list?; - combined.keys.extend_from_slice(&list.keys); + combined.keys.extend(list.keys.into_iter()); combined.prefixes.extend_from_slice(&list.prefixes); } Ok(combined) diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index a4857b0bba..bc6b10aa51 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -23,8 +23,8 @@ use tokio_util::{io::ReaderStream, sync::CancellationToken}; use utils::crashsafe::path_with_suffix_extension; use crate::{ - Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel, - REMOTE_STORAGE_PREFIX_SEPARATOR, + Download, DownloadError, Listing, ListingMode, ListingObject, RemotePath, TimeTravelError, + TimeoutOrCancel, REMOTE_STORAGE_PREFIX_SEPARATOR, }; use super::{RemoteStorage, StorageMetadata}; @@ -357,19 +357,28 @@ impl RemoteStorage for LocalFs { .list_recursive(prefix) .await .map_err(DownloadError::Other)?; - let keys = keys + let objects = keys .into_iter() - .filter(|k| { + .filter_map(|k| { let path = k.with_base(&self.storage_root); - !path.is_dir() + if path.is_dir() { + None + } else { + Some(ListingObject { + key: k.clone(), + // LocalFs is just for testing, so just specify a dummy time + last_modified: SystemTime::now(), + }) + } }) .collect(); if let ListingMode::NoDelimiter = mode { - result.keys = keys; + result.keys = objects; } else { let mut prefixes = HashSet::new(); - for key in keys { + for object in objects { + let key = object.key; // If the part after the prefix includes a "/", take only the first part and put it in `prefixes`. 
let relative_key = if let Some(prefix) = prefix { let mut prefix = prefix.clone(); @@ -398,9 +407,11 @@ impl RemoteStorage for LocalFs { .to_owned(); prefixes.insert(first_part); } else { - result - .keys - .push(RemotePath::from_string(&relative_key).unwrap()); + result.keys.push(ListingObject { + key: RemotePath::from_string(&relative_key).unwrap(), + // LocalFs is just for testing + last_modified: SystemTime::now(), + }); } } result.prefixes = prefixes @@ -950,7 +961,11 @@ mod fs_tests { .await?; assert!(listing.prefixes.is_empty()); assert_eq!( - listing.keys.into_iter().collect::>(), + listing + .keys + .into_iter() + .map(|o| o.key) + .collect::>(), HashSet::from([uncle.clone(), child.clone(), child_sibling.clone()]) ); @@ -975,7 +990,7 @@ mod fs_tests { ) .await?; assert_eq!( - listing.keys, + listing.keys.into_iter().map(|o| o.key).collect::>(), [RemotePath::from_string("uncle").unwrap()].to_vec() ); assert_eq!( @@ -992,7 +1007,7 @@ mod fs_tests { &cancel, ) .await?; - assert_eq!(listing.keys, [].to_vec()); + assert_eq!(listing.keys, vec![]); assert_eq!( listing.prefixes, [RemotePath::from_string("grandparent").unwrap()].to_vec() @@ -1007,7 +1022,7 @@ mod fs_tests { &cancel, ) .await?; - assert_eq!(listing.keys, [].to_vec()); + assert_eq!(listing.keys, vec![]); assert_eq!( listing.prefixes, [RemotePath::from_string("grandparent").unwrap()].to_vec() @@ -1040,7 +1055,7 @@ mod fs_tests { &cancel, ) .await?; - assert_eq!(listing.keys, [].to_vec()); + assert_eq!(listing.keys, vec![]); let mut found_prefixes = listing.prefixes.clone(); found_prefixes.sort(); diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 90ed48e06c..412f307445 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -44,8 +44,9 @@ use crate::{ error::Cancelled, metrics::{start_counting_cancelled_wait, start_measuring_requests}, support::PermitCarrying, - ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, - TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR, + ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, ListingObject, RemotePath, + RemoteStorage, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE, + REMOTE_STORAGE_PREFIX_SEPARATOR, }; use crate::metrics::AttemptOutcome; @@ -548,9 +549,26 @@ impl RemoteStorage for S3Bucket { let mut result = Listing::default(); for object in keys { - let object_path = object.key().expect("response does not contain a key"); - let remote_path = self.s3_object_to_relative_path(object_path); - result.keys.push(remote_path); + let key = object.key().expect("response does not contain a key"); + let key = self.s3_object_to_relative_path(key); + + let last_modified = match object.last_modified.map(SystemTime::try_from) { + Some(Ok(t)) => t, + Some(Err(_)) => { + tracing::warn!("Remote storage last_modified {:?} for {} is out of bounds", + object.last_modified, key + ); + SystemTime::now() + }, + None => { + SystemTime::now() + } + }; + + result.keys.push(ListingObject{ + key, + last_modified + }); if let Some(mut mk) = max_keys { assert!(mk > 0); mk -= 1; diff --git a/libs/remote_storage/tests/common/tests.rs b/libs/remote_storage/tests/common/tests.rs index 38c316397a..86c55872c1 100644 --- a/libs/remote_storage/tests/common/tests.rs +++ b/libs/remote_storage/tests/common/tests.rs @@ -156,6 +156,7 @@ async fn list_no_delimiter_works( .context("client list root files failure")? 
.keys .into_iter() + .map(|o| o.key) .collect::>(); assert_eq!( root_files, @@ -182,6 +183,7 @@ async fn list_no_delimiter_works( .context("client list nested files failure")? .keys .into_iter() + .map(|o| o.key) .collect::>(); let trim_remote_blobs: HashSet<_> = ctx .remote_blobs diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index 342bc6da0b..b893beeebd 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -81,6 +81,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: .context("list root files failure")? .keys .into_iter() + .map(|o| o.key) .collect::>(), ) } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index f23e6ff9d6..75c8682c97 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1407,6 +1407,7 @@ impl TenantManager { tracing::info!("Remote storage already deleted"); } else { tracing::info!("Deleting {} keys from remote storage", keys.len()); + let keys = keys.into_iter().map(|o| o.key).collect::>(); self.resources .remote_storage .delete_objects(&keys, &self.cancel) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 8b26f122cf..2f3c6c188b 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1380,12 +1380,13 @@ impl RemoteTimelineClient { // marker via its deleted_at attribute let latest_index = remaining .iter() - .filter(|p| { - p.object_name() + .filter(|o| { + o.key + .object_name() .map(|n| n.starts_with(IndexPart::FILE_NAME)) .unwrap_or(false) }) - .filter_map(|path| parse_remote_index_path(path.clone()).map(|gen| (path, gen))) + .filter_map(|o| parse_remote_index_path(o.key.clone()).map(|gen| (o.key.clone(), gen))) .max_by_key(|i| i.1) .map(|i| i.0.clone()) .unwrap_or( @@ -1396,14 +1397,12 @@ impl RemoteTimelineClient { let remaining_layers: Vec = remaining .into_iter() - .filter(|p| { - if p == &latest_index { - return false; + .filter_map(|o| { + if o.key == latest_index || o.key.object_name() == Some(INITDB_PRESERVED_PATH) { + None + } else { + Some(o.key) } - if p.object_name() == Some(INITDB_PRESERVED_PATH) { - return false; - } - true }) .inspect(|path| { if let Some(name) = path.object_name() { diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index d0385e4aee..a17b32c983 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -295,10 +295,11 @@ where }; } - for key in listing.keys { - let object_name = key + for object in listing.keys { + let object_name = object + .key .object_name() - .ok_or_else(|| anyhow::anyhow!("object name for key {key}"))?; + .ok_or_else(|| anyhow::anyhow!("object name for key {}", object.key))?; other_prefixes.insert(object_name.to_string()); } @@ -459,7 +460,7 @@ pub(crate) async fn download_index_part( // is <= our own. 
See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md let max_previous_generation = indices .into_iter() - .filter_map(parse_remote_index_path) + .filter_map(|o| parse_remote_index_path(o.key)) .filter(|g| g <= &my_generation) .max(); diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 7ecee178f3..234273e133 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -545,7 +545,10 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> { &cancel, ) .await? - .keys; + .keys + .into_iter() + .map(|o| o.key) + .collect::>(); if files.is_empty() { return Ok(()); // done } @@ -613,7 +616,7 @@ pub async fn copy_s3_segments( let uploaded_segments = &files .iter() - .filter_map(|file| file.object_name().map(ToOwned::to_owned)) + .filter_map(|o| o.key.object_name().map(ToOwned::to_owned)) .collect::>(); debug!( diff --git a/storage_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs index c7e21d7e26..333269ec7e 100644 --- a/storage_scrubber/src/garbage.rs +++ b/storage_scrubber/src/garbage.rs @@ -10,7 +10,7 @@ use std::{ use anyhow::Context; use futures_util::TryStreamExt; use pageserver_api::shard::TenantShardId; -use remote_storage::{GenericRemoteStorage, ListingMode, RemotePath}; +use remote_storage::{GenericRemoteStorage, ListingMode, ListingObject, RemotePath}; use serde::{Deserialize, Serialize}; use tokio_stream::StreamExt; use tokio_util::sync::CancellationToken; @@ -324,7 +324,7 @@ impl std::fmt::Display for PurgeMode { pub async fn get_tenant_objects( s3_client: &GenericRemoteStorage, tenant_shard_id: TenantShardId, -) -> anyhow::Result> { +) -> anyhow::Result> { tracing::debug!("Listing objects in tenant {tenant_shard_id}"); let tenant_root = super::remote_tenant_path(&tenant_shard_id); @@ -345,7 +345,7 @@ pub async fn get_tenant_objects( pub async fn get_timeline_objects( s3_client: &GenericRemoteStorage, ttid: TenantShardTimelineId, -) -> anyhow::Result> { +) -> anyhow::Result> { tracing::debug!("Listing objects in timeline {ttid}"); let timeline_root = super::remote_timeline_path_id(&ttid); @@ -372,7 +372,7 @@ const MAX_KEYS_PER_DELETE: usize = 1000; /// `num_deleted` returns number of deleted keys. async fn do_delete( remote_client: &GenericRemoteStorage, - keys: &mut Vec, + keys: &mut Vec, dry_run: bool, drain: bool, progress_tracker: &mut DeletionProgressTracker, @@ -382,6 +382,8 @@ async fn do_delete( let request_keys = keys.split_off(keys.len() - (std::cmp::min(MAX_KEYS_PER_DELETE, keys.len()))); + let request_keys: Vec = request_keys.into_iter().map(|o| o.key).collect(); + let num_deleted = request_keys.len(); if dry_run { tracing::info!("Dry-run deletion of objects: "); From bb2a3f9b025318f1cc600dfa3836ba5984e99d61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 26 Jul 2024 11:17:33 +0200 Subject: [PATCH 1280/1571] Update Rust to 1.80.0 (#8518) We keep the practice of keeping the compiler up to date, pointing to the latest release. This is done by many other projects in the Rust ecosystem as well. [Release notes](https://github.com/rust-lang/rust/blob/master/RELEASES.md#version-180-2024-07-25). 
Prior update was in #8048 --- Dockerfile.build-tools | 2 +- libs/postgres_ffi/src/controlfile_utils.rs | 2 +- libs/postgres_ffi/src/pg_constants.rs | 4 ++-- libs/postgres_ffi/src/xlog_utils.rs | 8 ++++---- libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs | 4 ++-- pageserver/src/tenant/metadata.rs | 2 +- pageserver/src/tenant/storage_layer/layer/tests.rs | 6 +++--- pageserver/src/tenant/storage_layer/merge_iterator.rs | 2 ++ pageserver/src/tenant/timeline.rs | 1 - pageserver/src/walingest.rs | 8 ++++---- proxy/src/proxy/tests/mitm.rs | 2 +- proxy/src/scram/countmin.rs | 2 +- rust-toolchain.toml | 2 +- safekeeper/src/control_file.rs | 2 +- safekeeper/tests/walproposer_sim/walproposer_disk.rs | 2 +- 15 files changed, 25 insertions(+), 24 deletions(-) diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index 4826b7914e..dfaab1cb2e 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -192,7 +192,7 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.79.0 +ENV RUSTC_VERSION=1.80.0 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \ diff --git a/libs/postgres_ffi/src/controlfile_utils.rs b/libs/postgres_ffi/src/controlfile_utils.rs index 0918d15001..eaa9450294 100644 --- a/libs/postgres_ffi/src/controlfile_utils.rs +++ b/libs/postgres_ffi/src/controlfile_utils.rs @@ -29,7 +29,7 @@ use anyhow::{bail, Result}; use bytes::{Bytes, BytesMut}; /// Equivalent to sizeof(ControlFileData) in C -const SIZEOF_CONTROLDATA: usize = std::mem::size_of::(); +const SIZEOF_CONTROLDATA: usize = size_of::(); impl ControlFileData { /// Compute the offset of the `crc` field within the `ControlFileData` struct. 
diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs index 54b032d138..6ce855c78e 100644 --- a/libs/postgres_ffi/src/pg_constants.rs +++ b/libs/postgres_ffi/src/pg_constants.rs @@ -31,7 +31,7 @@ pub const SMGR_TRUNCATE_FSM: u32 = 0x0004; // // Assumes 8 byte alignment -const SIZEOF_PAGE_HEADER_DATA: usize = std::mem::size_of::(); +const SIZEOF_PAGE_HEADER_DATA: usize = size_of::(); pub const MAXALIGN_SIZE_OF_PAGE_HEADER_DATA: usize = (SIZEOF_PAGE_HEADER_DATA + 7) & !7; // @@ -191,7 +191,7 @@ pub const XLR_RMGR_INFO_MASK: u8 = 0xF0; pub const XLOG_TBLSPC_CREATE: u8 = 0x00; pub const XLOG_TBLSPC_DROP: u8 = 0x10; -pub const SIZEOF_XLOGRECORD: u32 = std::mem::size_of::() as u32; +pub const SIZEOF_XLOGRECORD: u32 = size_of::() as u32; // // from xlogrecord.h diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index d25b23663b..9fe7e8198b 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -42,9 +42,9 @@ pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001; pub const XLP_REM_LEN_OFFS: usize = 2 + 2 + 4 + 8; pub const XLOG_RECORD_CRC_OFFS: usize = 4 + 4 + 8 + 1 + 1 + 2; -pub const XLOG_SIZE_OF_XLOG_SHORT_PHD: usize = std::mem::size_of::(); -pub const XLOG_SIZE_OF_XLOG_LONG_PHD: usize = std::mem::size_of::(); -pub const XLOG_SIZE_OF_XLOG_RECORD: usize = std::mem::size_of::(); +pub const XLOG_SIZE_OF_XLOG_SHORT_PHD: usize = size_of::(); +pub const XLOG_SIZE_OF_XLOG_LONG_PHD: usize = size_of::(); +pub const XLOG_SIZE_OF_XLOG_RECORD: usize = size_of::(); #[allow(clippy::identity_op)] pub const SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT: usize = 1 * 2; @@ -311,7 +311,7 @@ impl XLogLongPageHeaderData { } } -pub const SIZEOF_CHECKPOINT: usize = std::mem::size_of::(); +pub const SIZEOF_CHECKPOINT: usize = size_of::(); impl CheckPoint { pub fn encode(&self) -> Result { diff --git a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs index 750affc94e..79d45de67a 100644 --- a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs +++ b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs @@ -178,7 +178,7 @@ pub fn test_find_end_of_wal_last_crossing_segment() { /// currently 1024. 
#[test] pub fn test_update_next_xid() { - let checkpoint_buf = [0u8; std::mem::size_of::()]; + let checkpoint_buf = [0u8; size_of::()]; let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap(); checkpoint.nextXid = FullTransactionId { value: 10 }; @@ -204,7 +204,7 @@ pub fn test_update_next_xid() { #[test] pub fn test_update_next_multixid() { - let checkpoint_buf = [0u8; std::mem::size_of::()]; + let checkpoint_buf = [0u8; size_of::()]; let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap(); // simple case diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 6ba1bdef9b..bbc070a81b 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -111,7 +111,7 @@ impl TryFrom<&TimelineMetadataBodyV2> for TimelineMetadataHeader { #[error("re-serializing for crc32 failed")] struct Crc32CalculationFailed(#[source] utils::bin_ser::SerializeError); -const METADATA_HDR_SIZE: usize = std::mem::size_of::(); +const METADATA_HDR_SIZE: usize = size_of::(); #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] struct TimelineMetadataBodyV2 { diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index d5d2f748a9..423cde001c 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -828,9 +828,9 @@ async fn eviction_cancellation_on_drop() { #[test] #[cfg(target_arch = "x86_64")] fn layer_size() { - assert_eq!(std::mem::size_of::(), 8); - assert_eq!(std::mem::size_of::(), 104); - assert_eq!(std::mem::size_of::(), 312); + assert_eq!(size_of::(), 8); + assert_eq!(size_of::(), 104); + assert_eq!(size_of::(), 312); // it also has the utf8 path } diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs index eb4a1f28a1..b4bd976033 100644 --- a/pageserver/src/tenant/storage_layer/merge_iterator.rs +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -204,9 +204,11 @@ impl<'a> IteratorWrapper<'a> { /// A merge iterator over delta/image layer iterators. When duplicated records are /// found, the iterator will not perform any deduplication, and the caller should handle /// these situation. By saying duplicated records, there are many possibilities: +/// /// * Two same delta at the same LSN. /// * Two same image at the same LSN. /// * Delta/image at the same LSN where the image has already applied the delta. +/// /// The iterator will always put the image before the delta. 
pub struct MergeIterator<'a> { heap: BinaryHeap>, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 6a29fc1ce1..757a859f55 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3421,7 +3421,6 @@ impl Timeline { } } - #[allow(unknown_lints)] // doc_lazy_continuation is still a new lint #[allow(clippy::doc_lazy_continuation)] /// Get the data needed to reconstruct all keys in the provided keyspace /// diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index dff3a8f52d..804c7fca97 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -618,7 +618,7 @@ impl WalIngest { // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set 0 } else { - std::mem::size_of::() * xlrec.ntuples as usize + size_of::() * xlrec.ntuples as usize }; assert_eq!(offset_array_len, buf.remaining()); @@ -685,7 +685,7 @@ impl WalIngest { // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set 0 } else { - std::mem::size_of::() * xlrec.ntuples as usize + size_of::() * xlrec.ntuples as usize }; assert_eq!(offset_array_len, buf.remaining()); @@ -752,7 +752,7 @@ impl WalIngest { // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set 0 } else { - std::mem::size_of::() * xlrec.ntuples as usize + size_of::() * xlrec.ntuples as usize }; assert_eq!(offset_array_len, buf.remaining()); @@ -920,7 +920,7 @@ impl WalIngest { // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set 0 } else { - std::mem::size_of::() * xlrec.ntuples as usize + size_of::() * xlrec.ntuples as usize }; assert_eq!(offset_array_len, buf.remaining()); diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index cbfc9f1358..d96dd0947b 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -1,7 +1,7 @@ //! Man-in-the-middle tests //! //! Channel binding should prevent a proxy server -//! - that has access to create valid certificates - +//! *that has access to create valid certificates* //! from controlling the TLS connection. use std::fmt::Debug; diff --git a/proxy/src/scram/countmin.rs b/proxy/src/scram/countmin.rs index f2b794e5fe..e8e7ef5c86 100644 --- a/proxy/src/scram/countmin.rs +++ b/proxy/src/scram/countmin.rs @@ -158,7 +158,7 @@ mod tests { let N = 1021 * 4096; let sketch = CountMinSketch::with_params(p / N as f64, 1.0 - q); - let memory = std::mem::size_of::() * sketch.buckets.len(); + let memory = size_of::() * sketch.buckets.len(); let time = sketch.depth; (memory, time) } diff --git a/rust-toolchain.toml b/rust-toolchain.toml index dcae25a287..3510359591 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.79.0" +channel = "1.80.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. # https://rust-lang.github.io/rustup/concepts/profiles.html diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index cd3c7fe526..d574bb438f 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -27,7 +27,7 @@ pub const SK_FORMAT_VERSION: u32 = 9; pub const CONTROL_FILE_NAME: &str = "safekeeper.control"; // needed to atomically update the state using `rename` const CONTROL_FILE_NAME_PARTIAL: &str = "safekeeper.control.partial"; -pub const CHECKSUM_SIZE: usize = std::mem::size_of::(); +pub const CHECKSUM_SIZE: usize = size_of::(); /// Storage should keep actual state inside of it. 
It should implement Deref /// trait to access state fields and have persist method for updating that state. diff --git a/safekeeper/tests/walproposer_sim/walproposer_disk.rs b/safekeeper/tests/walproposer_sim/walproposer_disk.rs index aa329bd2f0..123cd6bad6 100644 --- a/safekeeper/tests/walproposer_sim/walproposer_disk.rs +++ b/safekeeper/tests/walproposer_sim/walproposer_disk.rs @@ -172,7 +172,7 @@ fn write_walrecord_to_disk( let mut freespace = insert_freespace(curr_ptr); let mut written: usize = 0; - assert!(freespace >= std::mem::size_of::()); + assert!(freespace >= size_of::()); for mut rdata in rdatas { while rdata.len() >= freespace { From 65868258d2c8a46001c7b9426368941f87cec968 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 26 Jul 2024 11:03:44 +0100 Subject: [PATCH 1281/1571] tests: checkpoint instead of compact in test_sharding_split_compaction (#8473) ## Problem This test relies on writing image layers before the split. It can fail to do so durably if the image layers are written ahead of the remote consistent LSN, so we should have been doing a checkpoint rather than just a compaction --- test_runner/regress/test_sharding.py | 6 +++--- test_runner/regress/test_storage_scrubber.py | 6 ++++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index bc43bc77fa..f8770e70fe 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -198,8 +198,8 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder, failpoint: # disable background compaction and GC. We invoke it manually when we want it to happen. "gc_period": "0s", "compaction_period": "0s", - # create image layers eagerly, so that GC can remove some layers - "image_creation_threshold": 1, + # Disable automatic creation of image layers, as we will create them explicitly when we want them + "image_creation_threshold": 9999, "image_layer_creation_check_threshold": 0, } @@ -225,7 +225,7 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder, failpoint: # Do a full image layer generation before splitting, so that when we compact after splitting # we should only see sizes decrease (from post-split drops/rewrites), not increase (from image layer generation) - env.get_tenant_pageserver(tenant_id).http_client().timeline_compact( + env.get_tenant_pageserver(tenant_id).http_client().timeline_checkpoint( tenant_id, timeline_id, force_image_layer_creation=True, wait_until_uploaded=True ) diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 7c411a6b84..a45430ca86 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -191,7 +191,9 @@ def test_scrubber_physical_gc_ancestors( "checkpoint_distance": f"{1024 * 1024}", "compaction_threshold": "1", "compaction_target_size": f"{1024 * 1024}", - "image_creation_threshold": "2", + # Disable automatic creation of image layers, as future image layers can result in layers in S3 that + # aren't referenced by children, earlier than the test expects such layers to exist + "image_creation_threshold": "9999", "image_layer_creation_check_threshold": "0", # Disable background compaction, we will do it explicitly "compaction_period": "0s", @@ -241,7 +243,7 @@ def test_scrubber_physical_gc_ancestors( workload.churn_rows(100) for shard in shards: ps = env.get_tenant_pageserver(shard) - ps.http_client().timeline_compact(shard, timeline_id) + 
ps.http_client().timeline_compact(shard, timeline_id, force_image_layer_creation=True) ps.http_client().timeline_gc(shard, timeline_id, 0) # We will use a min_age_secs=1 threshold for deletion, let it pass From 3cecbfc04dde12da0e7ee38614c3105e3b05ee78 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 26 Jul 2024 11:55:37 +0100 Subject: [PATCH 1282/1571] .github: reduce test concurrency (#8444) ## Problem This is an experiment to see if 16x concurrency is actually helping, or if it's just giving us very noisy results. If the total runtime with a lower concurrency is similar, then a lower concurrency is preferable to reduce the impact of resource-hungry tests running concurrently. --- .github/actions/run-python-test-set/action.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index daaedf6d11..9d39ab6ad7 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -131,8 +131,8 @@ runs: exit 1 fi if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then - # -n16 uses sixteen processes to run tests via pytest-xdist - EXTRA_PARAMS="-n16 $EXTRA_PARAMS" + # -n sets the number of parallel processes that pytest-xdist will run + EXTRA_PARAMS="-n12 $EXTRA_PARAMS" # --dist=loadgroup points tests marked with @pytest.mark.xdist_group # to the same worker to make @pytest.mark.order work with xdist From cdaa2816e7464c2517fee2f56f65b9b401ea946f Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 26 Jul 2024 14:19:52 +0100 Subject: [PATCH 1283/1571] pageserver: make vectored get the default read path for the pageserver (#8384) ## Problem Vectored get is already enabled in all prod regions without validation. The pageserver defaults are out of sync however. ## Summary of changes Update the pageserver defaults to match the prod config. Also means that when running tests locally, people don't have to use the env vars to get the prod config. 
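
Editor's aside (not part of the patch): the practical effect on test setups can be sketched with a tiny helper. The config key names and old default values below are taken from the fixture and `pageserver/src/config.rs` code changed in this commit; the helper function itself is hypothetical.

```python
# Hedged sketch, not part of the patch: with the defaults flipped to the prod
# config, tests no longer need PAGESERVER_GET_IMPL et al. A test that still
# wants the pre-#8384 read path could pin it via explicit per-pageserver
# config overrides instead.
LEGACY_READ_PATH_OVERRIDES = {
    "get_impl": "legacy",
    "get_vectored_impl": "sequential",
    "validate_vectored_get": True,
}

def with_legacy_read_path(ps_cfg: dict) -> dict:
    """Return a copy of a pageserver config dict pinned to the old read path."""
    merged = dict(ps_cfg)
    merged.update(LEGACY_READ_PATH_OVERRIDES)
    return merged
```
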
--- .github/workflows/_build-and-test-locally.yml | 3 --- .github/workflows/build_and_test.yml | 3 --- pageserver/src/config.rs | 6 +++--- test_runner/fixtures/neon_fixtures.py | 21 ------------------- test_runner/regress/test_broken_timeline.py | 9 ++------ test_runner/regress/test_compatibility.py | 6 ------ 6 files changed, 5 insertions(+), 43 deletions(-) diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 35c6251304..26e234a04d 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -278,9 +278,6 @@ jobs: CHECK_ONDISK_DATA_COMPATIBILITY: nonempty BUILD_TAG: ${{ inputs.build-tag }} PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring - PAGESERVER_GET_VECTORED_IMPL: vectored - PAGESERVER_GET_IMPL: vectored - PAGESERVER_VALIDATE_VEC_GET: true # Temporary disable this step until we figure out why it's so flaky # Ref https://github.com/neondatabase/neon/issues/4540 diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 885f4058d0..872c1fbb39 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -286,9 +286,6 @@ jobs: PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}" PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring - PAGESERVER_GET_VECTORED_IMPL: vectored - PAGESERVER_GET_IMPL: vectored - PAGESERVER_VALIDATE_VEC_GET: false # XXX: no coverage data handling here, since benchmarks are run on release builds, # while coverage is currently collected for the debug ones diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 20e78b1d85..614bbf3392 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -83,16 +83,16 @@ pub mod defaults { #[cfg(not(target_os = "linux"))] pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs"; - pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential"; + pub const DEFAULT_GET_VECTORED_IMPL: &str = "vectored"; - pub const DEFAULT_GET_IMPL: &str = "legacy"; + pub const DEFAULT_GET_IMPL: &str = "vectored"; pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm = ImageCompressionAlgorithm::Disabled; - pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true; + pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false; pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0; diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index d6718fca39..09c28148b4 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -542,21 +542,6 @@ class NeonEnvBuilder: f"Overriding pageserver default compaction algorithm to {self.pageserver_default_tenant_config_compaction_algorithm}" ) - self.pageserver_get_vectored_impl: Optional[str] = None - if os.getenv("PAGESERVER_GET_VECTORED_IMPL", "") == "vectored": - self.pageserver_get_vectored_impl = "vectored" - log.debug('Overriding pageserver get_vectored_impl config to "vectored"') - - self.pageserver_get_impl: Optional[str] = None - if os.getenv("PAGESERVER_GET_IMPL", "") == "vectored": - self.pageserver_get_impl = "vectored" - log.debug('Overriding pageserver get_impl config to "vectored"') - - self.pageserver_validate_vectored_get: Optional[bool] = None - if (validate := os.getenv("PAGESERVER_VALIDATE_VEC_GET")) is not None: - self.pageserver_validate_vectored_get 
= bool(validate) - log.debug(f'Overriding pageserver validate_vectored_get config to "{validate}"') - self.pageserver_aux_file_policy = pageserver_aux_file_policy self.safekeeper_extra_opts = safekeeper_extra_opts @@ -1157,12 +1142,6 @@ class NeonEnv: } if self.pageserver_virtual_file_io_engine is not None: ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine - if config.pageserver_get_vectored_impl is not None: - ps_cfg["get_vectored_impl"] = config.pageserver_get_vectored_impl - if config.pageserver_get_impl is not None: - ps_cfg["get_impl"] = config.pageserver_get_impl - if config.pageserver_validate_vectored_get is not None: - ps_cfg["validate_vectored_get"] = config.pageserver_validate_vectored_get if config.pageserver_default_tenant_config_compaction_algorithm is not None: tenant_config = ps_cfg.setdefault("tenant_config", {}) tenant_config[ diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 61afd820ca..976ac09335 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -17,16 +17,11 @@ from fixtures.pg_version import PgVersion # Test restarting page server, while safekeeper and compute node keep # running. def test_local_corruption(neon_env_builder: NeonEnvBuilder): - if neon_env_builder.pageserver_get_impl == "vectored": - reconstruct_function_name = "get_values_reconstruct_data" - else: - reconstruct_function_name = "get_value_reconstruct_data" - env = neon_env_builder.init_start() env.pageserver.allowed_errors.extend( [ - f".*{reconstruct_function_name} for layer .*", + ".*get_values_reconstruct_data for layer .*", ".*could not find data for key.*", ".*is not active. Current state: Broken.*", ".*will not become active. Current state: Broken.*", @@ -79,7 +74,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): # (We don't check layer file contents on startup, when loading the timeline) # # This will change when we implement checksums for layers - with pytest.raises(Exception, match=f"{reconstruct_function_name} for layer ") as err: + with pytest.raises(Exception, match="get_values_reconstruct_data for layer ") as err: pg1.start() log.info( f"As expected, compute startup failed for timeline {tenant1}/{timeline1} with corrupt layers: {err}" diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 65649e0c0a..411b20b2c4 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -227,12 +227,6 @@ def test_forward_compatibility( ) try: - # Previous version neon_local and pageserver are not aware - # of the new config. - # TODO: remove these once the previous version of neon local supports them - neon_env_builder.pageserver_get_impl = None - neon_env_builder.pageserver_validate_vectored_get = None - neon_env_builder.num_safekeepers = 3 # Use previous version's production binaries (pageserver, safekeeper, pg_distrib_dir, etc.). 
From eddfd623333ebb2f6270f784ca10979da90de955 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 26 Jul 2024 14:29:59 +0100 Subject: [PATCH 1284/1571] CODEOWNERS: collapse safekeepers into storage (#8510) ## Problem - The intersection of "safekeepers" and "storage" is just one person --- CODEOWNERS | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CODEOWNERS b/CODEOWNERS index af2fa6088e..606dbb4e22 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,13 +1,13 @@ /compute_tools/ @neondatabase/control-plane @neondatabase/compute /storage_controller @neondatabase/storage /libs/pageserver_api/ @neondatabase/storage -/libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers +/libs/postgres_ffi/ @neondatabase/compute @neondatabase/storage /libs/remote_storage/ @neondatabase/storage -/libs/safekeeper_api/ @neondatabase/safekeepers +/libs/safekeeper_api/ @neondatabase/storage /libs/vm_monitor/ @neondatabase/autoscaling /pageserver/ @neondatabase/storage /pgxn/ @neondatabase/compute -/pgxn/neon/ @neondatabase/compute @neondatabase/safekeepers +/pgxn/neon/ @neondatabase/compute @neondatabase/storage /proxy/ @neondatabase/proxy -/safekeeper/ @neondatabase/safekeepers +/safekeeper/ @neondatabase/storage /vendor/ @neondatabase/compute From 7a796a996357295dc06fdb94698f95171717301c Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 26 Jul 2024 14:54:09 +0100 Subject: [PATCH 1285/1571] storcon: introduce step down primitive (#8512) ## Problem We are missing the step-down primitive required to implement rolling restarts of the storage controller. ## Summary of changes Add `/control/v1/step_down` endpoint which puts the storage controller into a state where it rejects all API requests apart from `/control/v1/step_down`, `/status` and `/metrics`. When receiving the request, storage controller cancels all pending reconciles and waits for them to exit gracefully. The response contains a snapshot of the in-memory observed state. 
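
Editor's aside (not part of the patch): a minimal sketch of driving the new endpoint directly. The URL path and PUT verb come from the handler added in this commit; the bearer-token header is an assumption standing in for the admin-scope auth used by the test fixture, and host/token handling are placeholders.

```python
# Hedged sketch, not part of the patch: ask a storage controller to step down
# and capture the observed-state snapshot it returns.
import requests

def step_down(api_base: str, admin_token: str) -> dict:
    """PUT /control/v1/step_down and return the controller's global observed state."""
    resp = requests.put(
        f"{api_base}/control/v1/step_down",
        headers={"Authorization": f"Bearer {admin_token}"},
    )
    # Once stepped down, the controller answers 503 to regular requests;
    # only /control/v1/step_down, /status and /metrics stay available.
    resp.raise_for_status()
    return resp.json()
```
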
Related: * https://github.com/neondatabase/cloud/issues/14701 * https://github.com/neondatabase/neon/issues/7797 * https://github.com/neondatabase/neon/pull/8310 --- storage_controller/src/http.rs | 54 +++++++- storage_controller/src/metrics.rs | 13 +- storage_controller/src/reconciler.rs | 3 + storage_controller/src/service.rs | 120 +++++++++++++++++- storage_controller/src/tenant_shard.rs | 6 +- test_runner/fixtures/neon_fixtures.py | 11 ++ .../regress/test_storage_controller.py | 75 +++++++++++ 7 files changed, 274 insertions(+), 8 deletions(-) diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 8fb4be93e0..c77918827f 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -3,7 +3,7 @@ use crate::metrics::{ METRICS_REGISTRY, }; use crate::reconciler::ReconcileError; -use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT}; +use crate::service::{LeadershipStatus, Service, STARTUP_RECONCILE_TIMEOUT}; use anyhow::Context; use futures::Future; use hyper::header::CONTENT_TYPE; @@ -607,6 +607,13 @@ async fn handle_tenant_update_policy(mut req: Request) -> Result) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + json_response(StatusCode::OK, state.service.step_down().await) +} + async fn handle_tenant_drop(req: Request) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; @@ -734,6 +741,47 @@ struct RequestMeta { at: Instant, } +pub fn prologue_leadership_status_check_middleware< + B: hyper::body::HttpBody + Send + Sync + 'static, +>() -> Middleware { + Middleware::pre(move |req| async move { + let state = get_state(&req); + let leadership_status = state.service.get_leadership_status(); + + enum AllowedRoutes<'a> { + All, + Some(Vec<&'a str>), + } + + let allowed_routes = match leadership_status { + LeadershipStatus::Leader => AllowedRoutes::All, + LeadershipStatus::SteppedDown => { + // TODO: does it make sense to allow /status here? + AllowedRoutes::Some(["/control/v1/step_down", "/status", "/metrics"].to_vec()) + } + LeadershipStatus::Candidate => { + AllowedRoutes::Some(["/ready", "/status", "/metrics"].to_vec()) + } + }; + + let uri = req.uri().to_string(); + match allowed_routes { + AllowedRoutes::All => Ok(req), + AllowedRoutes::Some(allowed) if allowed.contains(&uri.as_str()) => Ok(req), + _ => { + tracing::info!( + "Request {} not allowed due to current leadership state", + req.uri() + ); + + Err(ApiError::ResourceUnavailable( + format!("Current leadership status is {leadership_status}").into(), + )) + } + } + }) +} + fn prologue_metrics_middleware( ) -> Middleware { Middleware::pre(move |req| async move { @@ -820,6 +868,7 @@ pub fn make_router( build_info: BuildInfo, ) -> RouterBuilder { let mut router = endpoint::make_router() + .middleware(prologue_leadership_status_check_middleware()) .middleware(prologue_metrics_middleware()) .middleware(epilogue_metrics_middleware()); if auth.is_some() { @@ -971,6 +1020,9 @@ pub fn make_router( RequestName("control_v1_tenant_policy"), ) }) + .put("/control/v1/step_down", |r| { + named_request_span(r, handle_step_down, RequestName("control_v1_step_down")) + }) // Tenant operations // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity. 
diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index ac9f22c739..a1a4b8543d 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -13,7 +13,10 @@ use metrics::NeonMetrics; use once_cell::sync::Lazy; use std::sync::Mutex; -use crate::persistence::{DatabaseError, DatabaseOperation}; +use crate::{ + persistence::{DatabaseError, DatabaseOperation}, + service::LeadershipStatus, +}; pub(crate) static METRICS_REGISTRY: Lazy = Lazy::new(StorageControllerMetrics::default); @@ -81,6 +84,8 @@ pub(crate) struct StorageControllerMetricGroup { #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] pub(crate) storage_controller_database_query_latency: measured::HistogramVec, + + pub(crate) storage_controller_leadership_status: measured::GaugeVec, } impl StorageControllerMetrics { @@ -156,6 +161,12 @@ pub(crate) struct DatabaseQueryLatencyLabelGroup { pub(crate) operation: DatabaseOperation, } +#[derive(measured::LabelGroup)] +#[label(set = LeadershipStatusGroupSet)] +pub(crate) struct LeadershipStatusGroup { + pub(crate) status: LeadershipStatus, +} + #[derive(FixedCardinalityLabel, Clone, Copy)] pub(crate) enum ReconcileOutcome { #[label(rename = "ok")] diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 886ceae90f..12dea2c7ef 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -12,6 +12,7 @@ use std::collections::HashMap; use std::sync::Arc; use std::time::{Duration, Instant}; use tokio_util::sync::CancellationToken; +use utils::failpoint_support; use utils::generation::Generation; use utils::id::{NodeId, TimelineId}; use utils::lsn::Lsn; @@ -749,6 +750,8 @@ impl Reconciler { self.location_config(&node, conf, None, false).await?; } + failpoint_support::sleep_millis_async!("sleep-on-reconcile-epilogue"); + Ok(()) } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index e890c5e45e..821f45d0c0 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -15,6 +15,7 @@ use crate::{ }, compute_hook::NotifyError, id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard}, + metrics::LeadershipStatusGroup, persistence::{AbortShardSplitStatus, TenantFilter}, reconciler::{ReconcileError, ReconcileUnits}, scheduler::{MaySchedule, ScheduleContext, ScheduleMode}, @@ -81,6 +82,7 @@ use crate::{ ReconcilerWaiter, TenantShard, }, }; +use serde::{Deserialize, Serialize}; // For operations that should be quick, like attaching a new tenant const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5); @@ -131,6 +133,24 @@ enum NodeOperations { Delete, } +/// The leadership status for the storage controller process. +/// Allowed transitions are: +/// 1. Leader -> SteppedDown +/// 2. Candidate -> Leader +#[derive(Copy, Clone, strum_macros::Display, measured::FixedCardinalityLabel)] +#[strum(serialize_all = "snake_case")] +pub(crate) enum LeadershipStatus { + /// This is the steady state where the storage controller can produce + /// side effects in the cluster. + Leader, + /// We've been notified to step down by another candidate. No reconciliations + /// take place in this state. + SteppedDown, + /// Initial state for a new storage controller instance. Will attempt to assume leadership. 
+ #[allow(unused)] + Candidate, +} + pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128; // Depth of the channel used to enqueue shards for reconciliation when they can't do it immediately. @@ -140,6 +160,8 @@ const MAX_DELAYED_RECONCILES: usize = 10000; // Top level state available to all HTTP handlers struct ServiceState { + leadership_status: LeadershipStatus, + tenants: BTreeMap, nodes: Arc>, @@ -202,7 +224,21 @@ impl ServiceState { scheduler: Scheduler, delayed_reconcile_rx: tokio::sync::mpsc::Receiver, ) -> Self { + let status = &crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_leadership_status; + + status.set( + LeadershipStatusGroup { + status: LeadershipStatus::Leader, + }, + 1, + ); + Self { + // TODO: Starting up as Leader is a transient state. Once we enable rolling + // upgrades on the k8s side, we should start up as Candidate. + leadership_status: LeadershipStatus::Leader, tenants, nodes: Arc::new(nodes), scheduler, @@ -220,6 +256,37 @@ impl ServiceState { ) { (&mut self.nodes, &mut self.tenants, &mut self.scheduler) } + + fn get_leadership_status(&self) -> LeadershipStatus { + self.leadership_status + } + + fn step_down(&mut self) { + self.leadership_status = LeadershipStatus::SteppedDown; + + let status = &crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_leadership_status; + + status.set( + LeadershipStatusGroup { + status: LeadershipStatus::SteppedDown, + }, + 1, + ); + status.set( + LeadershipStatusGroup { + status: LeadershipStatus::Leader, + }, + 0, + ); + status.set( + LeadershipStatusGroup { + status: LeadershipStatus::Candidate, + }, + 0, + ); + } } #[derive(Clone)] @@ -403,11 +470,30 @@ struct ShardUpdate { generation: Option, } +enum StopReconciliationsReason { + ShuttingDown, + SteppingDown, +} + +impl std::fmt::Display for StopReconciliationsReason { + fn fmt(&self, writer: &mut std::fmt::Formatter) -> std::fmt::Result { + let s = match self { + Self::ShuttingDown => "Shutting down", + Self::SteppingDown => "Stepping down", + }; + write!(writer, "{}", s) + } +} + pub(crate) enum ReconcileResultRequest { ReconcileResult(ReconcileResult), Stop, } +// TODO: move this into the storcon peer client when that gets added +#[derive(Serialize, Deserialize, Debug, Default)] +pub(crate) struct GlobalObservedState(HashMap); + impl Service { pub fn get_config(&self) -> &Config { &self.config @@ -5603,17 +5689,22 @@ impl Service { Ok(std::cmp::max(waiter_count, reconciles_spawned)) } - pub async fn shutdown(&self) { + async fn stop_reconciliations(&self, reason: StopReconciliationsReason) { // Cancel all on-going reconciles and wait for them to exit the gate. - tracing::info!("Shutting down: cancelling and waiting for in-flight reconciles"); + tracing::info!("{reason}: cancelling and waiting for in-flight reconciles"); self.reconcilers_cancel.cancel(); self.reconcilers_gate.close().await; // Signal the background loop in [`Service::process_results`] to exit once // it has proccessed the results from all the reconciles we cancelled earlier. 
- tracing::info!("Shutting down: processing results from previously in-flight reconciles"); + tracing::info!("{reason}: processing results from previously in-flight reconciles"); self.result_tx.send(ReconcileResultRequest::Stop).ok(); self.result_tx.closed().await; + } + + pub async fn shutdown(&self) { + self.stop_reconciliations(StopReconciliationsReason::ShuttingDown) + .await; // Background tasks hold gate guards: this notifies them of the cancellation and // waits for them all to complete. @@ -6003,4 +6094,27 @@ impl Service { Ok(()) } + + pub(crate) fn get_leadership_status(&self) -> LeadershipStatus { + self.inner.read().unwrap().get_leadership_status() + } + + pub(crate) async fn step_down(&self) -> GlobalObservedState { + tracing::info!("Received step down request from peer"); + + self.inner.write().unwrap().step_down(); + // TODO: would it make sense to have a time-out for this? + self.stop_reconciliations(StopReconciliationsReason::SteppingDown) + .await; + + let mut global_observed = GlobalObservedState::default(); + let locked = self.inner.read().unwrap(); + for (tid, tenant_shard) in locked.tenants.iter() { + global_observed + .0 + .insert(*tid, tenant_shard.observed.clone()); + } + + global_observed + } } diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 670efae154..e250f29f98 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -18,7 +18,7 @@ use pageserver_api::{ models::{LocationConfig, LocationConfigMode, TenantConfig}, shard::{ShardIdentity, TenantShardId}, }; -use serde::Serialize; +use serde::{Deserialize, Serialize}; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::{instrument, Instrument}; @@ -284,7 +284,7 @@ impl Drop for IntentState { } } -#[derive(Default, Clone, Serialize)] +#[derive(Default, Clone, Serialize, Deserialize, Debug)] pub(crate) struct ObservedState { pub(crate) locations: HashMap, } @@ -298,7 +298,7 @@ pub(crate) struct ObservedState { /// what it is (e.g. we failed partway through configuring it) /// * Instance exists with conf==Some: this tells us what we last successfully configured on this node, /// and that configuration will still be present unless something external interfered. -#[derive(Clone, Serialize)] +#[derive(Clone, Serialize, Deserialize, Debug)] pub(crate) struct ObservedStateLocation { /// If None, it means we do not know the status of this shard's location on this node, but /// we know that we might have some state on this node. 
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 09c28148b4..0a06398391 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2582,6 +2582,17 @@ class NeonStorageController(MetricsGetter, LogUtils): time.sleep(backoff) + def step_down(self): + log.info("Asking storage controller to step down") + response = self.request( + "PUT", + f"{self.env.storage_controller_api}/control/v1/step_down", + headers=self.headers(TokenScope.ADMIN), + ) + + response.raise_for_status() + return response.json() + def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]): if isinstance(config_strings, tuple): pairs = [config_strings] diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 9a47d7d651..da638ac233 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -1783,3 +1783,78 @@ def test_storage_controller_node_deletion( assert victim.id not in [n["id"] for n in env.storage_controller.node_list()] env.storage_controller.reconcile_all() # FIXME: workaround for optimizations happening on startup, see FIXME above. env.storage_controller.consistency_check() + + +def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder): + """ + Test the `/control/v1/step_down` storage controller API. Upon receiving such + a request, the storage controller cancels any on-going reconciles and replies + with 503 to all requests apart from `/control/v1/step_down`, `/status` and `/metrics`. + """ + env = neon_env_builder.init_configs() + env.start() + + tid = TenantId.generate() + tsid = str(TenantShardId(tid, shard_number=0, shard_count=0)) + env.storage_controller.tenant_create(tid) + + env.storage_controller.reconcile_until_idle() + env.storage_controller.configure_failpoints(("sleep-on-reconcile-epilogue", "return(10000)")) + + # Make a change to the tenant config to trigger a slow reconcile + virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) + virtual_ps_http.patch_tenant_config_client_side(tid, {"compaction_threshold": 5}, None) + env.storage_controller.allowed_errors.append( + ".*Accepted configuration update but reconciliation failed.*" + ) + + observed_state = env.storage_controller.step_down() + log.info(f"Storage controller stepped down with {observed_state=}") + + # Validate that we waited for the slow reconcile to complete + # and updated the observed state in the storcon before stepping down. + node_id = str(env.pageserver.id) + assert tsid in observed_state + assert node_id in observed_state[tsid]["locations"] + assert "conf" in observed_state[tsid]["locations"][node_id] + assert "tenant_conf" in observed_state[tsid]["locations"][node_id]["conf"] + + tenant_conf = observed_state[tsid]["locations"][node_id]["conf"]["tenant_conf"] + assert "compaction_threshold" in tenant_conf + assert tenant_conf["compaction_threshold"] == 5 + + # Validate that we propagated the change to the pageserver + ps_tenant_conf = env.pageserver.http_client().tenant_config(tid) + assert "compaction_threshold" in ps_tenant_conf.effective_config + assert ps_tenant_conf.effective_config["compaction_threshold"] == 5 + + # Validate that the storcon is not replying to the usual requests + # once it has stepped down. 
+ with pytest.raises(StorageControllerApiException, match="stepped_down"): + env.storage_controller.tenant_list() + + # Validate that we can step down multiple times and the observed state + # doesn't change. + observed_state_again = env.storage_controller.step_down() + assert observed_state == observed_state_again + + assert ( + env.storage_controller.get_metric_value( + "storage_controller_leadership_status", filter={"status": "leader"} + ) + == 0 + ) + + assert ( + env.storage_controller.get_metric_value( + "storage_controller_leadership_status", filter={"status": "stepped_down"} + ) + == 1 + ) + + assert ( + env.storage_controller.get_metric_value( + "storage_controller_leadership_status", filter={"status": "candidate"} + ) + == 0 + ) From 240ba7e10cb731526d24f31f559c10d0e3f64c3b Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Fri, 26 Jul 2024 07:08:13 -0700 Subject: [PATCH 1286/1571] Fix sql-exporter-autoscaling for pg < 16 (#8523) The lfc_approximate_working_set_size_windows query was failing on pg14 and pg15 with pq: subquery in FROM must have an alias Because aliases in that position became optional only in pg16. Some context here: https://neondb.slack.com/archives/C04DGM6SMTM/p1721970322601679?thread_ts=1721921122.528849 --- vm-image-spec.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 224e9847f3..2767710bad 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -404,7 +404,7 @@ files: x::text as duration_seconds, neon.approximate_working_set_size_seconds(x) as size from - (select generate_series * 60 as x from generate_series(1, 60)); + (select generate_series * 60 as x from generate_series(1, 60)) as t (x); build: | # Build cgroup-tools # From 8154e887325ede131e0fdb4500269f6b51884ef8 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 26 Jul 2024 16:48:44 +0200 Subject: [PATCH 1287/1571] refactor(layer load API): all errors are permanent (#8527) I am not aware of a case of "transient" VirtualFile errors as mentioned in https://github.com/neondatabase/neon/pull/5880 Private DM with Joonas discussing this: https://neondb.slack.com/archives/D049K7HJ9JM/p1721836424615799 --- .../src/tenant/storage_layer/delta_layer.rs | 31 ++++++------- .../src/tenant/storage_layer/image_layer.rs | 26 +++++------ pageserver/src/tenant/storage_layer/layer.rs | 44 ++++++++++--------- test_runner/regress/test_broken_timeline.py | 2 +- 4 files changed, 49 insertions(+), 54 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 586a7b7836..229d1e3608 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -307,12 +307,10 @@ impl DeltaLayer { .with_context(|| format!("Failed to load delta layer {}", self.path())) } - async fn load_inner(&self, ctx: &RequestContext) -> Result> { + async fn load_inner(&self, ctx: &RequestContext) -> anyhow::Result> { let path = self.path(); - let loaded = DeltaLayerInner::load(&path, None, None, ctx) - .await - .and_then(|res| res)?; + let loaded = DeltaLayerInner::load(&path, None, None, ctx).await?; // not production code let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap(); @@ -760,27 +758,24 @@ impl DeltaLayerInner { &self.layer_lsn_range } - /// Returns nested result following Result, Critical>: - /// - inner has the success or transient failure - /// - outer has the permanent failure pub(super) async fn load( 
path: &Utf8Path, summary: Option, max_vectored_read_bytes: Option, ctx: &RequestContext, - ) -> Result, anyhow::Error> { - let file = match VirtualFile::open(path, ctx).await { - Ok(file) => file, - Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))), - }; + ) -> anyhow::Result { + let file = VirtualFile::open(path, ctx) + .await + .context("open layer file")?; + let file_id = page_cache::next_file_id(); let block_reader = FileBlockReader::new(&file, file_id); - let summary_blk = match block_reader.read_blk(0, ctx).await { - Ok(blk) => blk, - Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))), - }; + let summary_blk = block_reader + .read_blk(0, ctx) + .await + .context("read first block")?; // TODO: this should be an assertion instead; see ImageLayerInner::load let actual_summary = @@ -802,7 +797,7 @@ impl DeltaLayerInner { } } - Ok(Ok(DeltaLayerInner { + Ok(DeltaLayerInner { file, file_id, index_start_blk: actual_summary.index_start_blk, @@ -810,7 +805,7 @@ impl DeltaLayerInner { max_vectored_read_bytes, layer_key_range: actual_summary.key_range, layer_lsn_range: actual_summary.lsn_range, - })) + }) } pub(super) async fn get_value_reconstruct_data( diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index e5e7f71928..44ba685490 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -265,9 +265,8 @@ impl ImageLayer { async fn load_inner(&self, ctx: &RequestContext) -> Result { let path = self.path(); - let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, ctx) - .await - .and_then(|res| res)?; + let loaded = + ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, ctx).await?; // not production code let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap(); @@ -385,17 +384,16 @@ impl ImageLayerInner { summary: Option, max_vectored_read_bytes: Option, ctx: &RequestContext, - ) -> Result, anyhow::Error> { - let file = match VirtualFile::open(path, ctx).await { - Ok(file) => file, - Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))), - }; + ) -> anyhow::Result { + let file = VirtualFile::open(path, ctx) + .await + .context("open layer file")?; let file_id = page_cache::next_file_id(); let block_reader = FileBlockReader::new(&file, file_id); - let summary_blk = match block_reader.read_blk(0, ctx).await { - Ok(blk) => blk, - Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))), - }; + let summary_blk = block_reader + .read_blk(0, ctx) + .await + .context("read first block")?; // length is the only way how this could fail, so it's not actually likely at all unless // read_blk returns wrong sized block. 
@@ -420,7 +418,7 @@ impl ImageLayerInner { } } - Ok(Ok(ImageLayerInner { + Ok(ImageLayerInner { index_start_blk: actual_summary.index_start_blk, index_root_blk: actual_summary.index_root_blk, lsn, @@ -428,7 +426,7 @@ impl ImageLayerInner { file_id, max_vectored_read_bytes, key_range: actual_summary.key_range, - })) + }) } pub(super) async fn get_value_reconstruct_data( diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 619c4d044d..1075feb1d1 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1651,8 +1651,9 @@ impl Drop for DownloadedLayer { } impl DownloadedLayer { - /// Initializes the `DeltaLayerInner` or `ImageLayerInner` within [`LayerKind`], or fails to - /// initialize it permanently. + /// Initializes the `DeltaLayerInner` or `ImageLayerInner` within [`LayerKind`]. + /// Failure to load the layer is sticky, i.e., future `get()` calls will return + /// the initial load failure immediately. /// /// `owner` parameter is a strong reference at the same `LayerInner` as the /// `DownloadedLayer::owner` would be when upgraded. Given how this method ends up called, @@ -1683,7 +1684,7 @@ impl DownloadedLayer { ctx, ) .await - .map(|res| res.map(LayerKind::Delta)) + .map(LayerKind::Delta) } else { let lsn = owner.desc.image_layer_lsn(); let summary = Some(image_layer::Summary::expected( @@ -1700,32 +1701,29 @@ impl DownloadedLayer { ctx, ) .await - .map(|res| res.map(LayerKind::Image)) + .map(LayerKind::Image) }; match res { - Ok(Ok(layer)) => Ok(Ok(layer)), - Ok(Err(transient)) => Err(transient), - Err(permanent) => { + Ok(layer) => Ok(layer), + Err(err) => { LAYER_IMPL_METRICS.inc_permanent_loading_failures(); - // TODO(#5815): we are not logging all errors, so temporarily log them **once** - // here as well - let permanent = permanent.context("load layer"); - tracing::error!("layer loading failed permanently: {permanent:#}"); - Ok(Err(permanent)) + // We log this message once over the lifetime of `Self` + // => Ok and good to log backtrace and path here. + tracing::error!( + "layer load failed, assuming permanent failure: {}: {err:?}", + owner.path + ); + Err(err) } } }; self.kind - .get_or_try_init(init) - // return transient errors using `?` - .await? + .get_or_init(init) + .await .as_ref() - .map_err(|e| { - // errors are not clonabled, cannot but stringify - // test_broken_timeline matches this string - anyhow::anyhow!("layer loading failed: {e:#}") - }) + // We already logged the full backtrace above, once. Don't repeat that here. + .map_err(|e| anyhow::anyhow!("layer load failed earlier: {e}")) } async fn get_value_reconstruct_data( @@ -1760,7 +1758,11 @@ impl DownloadedLayer { ) -> Result<(), GetVectoredError> { use LayerKind::*; - match self.get(owner, ctx).await.map_err(GetVectoredError::from)? { + match self + .get(owner, ctx) + .await + .map_err(GetVectoredError::Other)? + { Delta(d) => { d.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, ctx) .await diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 976ac09335..5ec9a22ba1 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -27,7 +27,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): ".*will not become active. 
Current state: Broken.*", ".*failed to load metadata.*", ".*load failed.*load local timeline.*", - ".*layer loading failed permanently: load layer: .*", + ".*: layer load failed, assuming permanent failure:.*", ] ) From 68241f5a3e2c8b23f2db5a1100066fd19f3890e4 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 26 Jul 2024 17:44:57 +0200 Subject: [PATCH 1288/1571] raise wait_lsn timeout from 60s to 300s (#8529) Problem ------- wait_lsn timeouts result in a user-facing errors like ``` $ /tmp/neon/pg_install/v16/bin/pgbench -s3424 -i -I dtGvp user=neondb_owner dbname=neondb host=ep-tiny-wave-w23owa37.eastus2.azure.neon.build sslmode=require options='-cstatement_timeout=0 ' dropping old tables... NOTICE: table "pgbench_accounts" does not exist, skipping NOTICE: table "pgbench_branches" does not exist, skipping NOTICE: table "pgbench_history" does not exist, skipping NOTICE: table "pgbench_tellers" does not exist, skipping creating tables... generating data (server-side)... vacuuming... pgbench: error: query failed: ERROR: [NEON_SMGR] [shard 0] could not read block 214338 in rel 1663/16389/16839.0 from page server at lsn C/E1C12828 DETAIL: page server returned error: LSN timeout: Timed out while waiting for WAL record at LSN C/E1418528 to arrive, last_record_lsn 6/999D9CA8 disk consistent LSN=6/999D9CA8, WalReceiver status: (update 2024-07-25 08:30:07): connecting to node 25, safekeeper candidates (id|update_time|commit_lsn): [(21|08:30:16|C/E1C129E0), (23|08:30:16|C/E1C129E0), (25|08:30:17|C/E1C129E0)] CONTEXT: while scanning block 214338 of relation "public.pgbench_accounts" pgbench: detail: Query was: vacuum analyze pgbench_accounts ``` Solution -------- Its better to be slow than to fail the queries. If the app has a deadline, it can use `statement_timeout`. In the long term, we want to eliminate wait_lsn timeout. In the short term (this PR), we bump the wait_lsn timeout to a larger value to reduce the frequency at which these wait_lsn timeouts occur. We will observe SLOs and specifically `pageserver_wait_lsn_seconds_bucket` before we eliminate the timeout completely. --- pageserver/src/config.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 614bbf3392..100c6c1ac5 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -52,7 +52,7 @@ pub mod defaults { use pageserver_api::models::ImageCompressionAlgorithm; pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT; - pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s"; + pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s"; pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s"; pub const DEFAULT_SUPERUSER: &str = "cloud_admin"; From dd40b19db4739aadebfd7f11182016bbb3c464be Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." Date: Fri, 26 Jul 2024 14:09:55 -0400 Subject: [PATCH 1289/1571] fix(pageserver): give L0 compaction priorities over image layer creation (#8443) close https://github.com/neondatabase/neon/issues/8435 ## Summary of changes If L0 compaction did not include all L0 layers, skip image generation. There are multiple possible solutions to the original issue, i.e., an alternative is to wrap the partial L0 compaction in a loop until it compacts all L0 layers. However, considering that we should weight all tenants equally, the current solution can ensure everyone gets a chance to run compaction, and those who write too much won't get a chance to create image layers. 
This creates a natural backpressure feedback that they get a slower read due to no image layers are created, slowing down their writes, and eventually compaction could keep up with their writes + generate image layers. Consider deployment, we should add an alert on "skipping image layer generation", so that we won't run into the case that image layers are not generated => incidents again. --------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant.rs | 16 +++-- pageserver/src/tenant/tasks.rs | 40 ++++++----- pageserver/src/tenant/timeline.rs | 16 +++-- pageserver/src/tenant/timeline/compaction.rs | 71 ++++++++++++-------- 4 files changed, 85 insertions(+), 58 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 41d8a40941..99f4e2d7cd 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1616,21 +1616,23 @@ impl Tenant { /// This function is periodically called by compactor task. /// Also it can be explicitly requested per timeline through page server /// api's 'compact' command. + /// + /// Returns whether we have pending compaction task. async fn compaction_iteration( &self, cancel: &CancellationToken, ctx: &RequestContext, - ) -> Result<(), timeline::CompactionError> { + ) -> Result { // Don't start doing work during shutdown, or when broken, we do not need those in the logs if !self.is_active() { - return Ok(()); + return Ok(false); } { let conf = self.tenant_conf.load(); if !conf.location.may_delete_layers_hint() || !conf.location.may_upload_layers_hint() { info!("Skipping compaction in location state {:?}", conf.location); - return Ok(()); + return Ok(false); } } @@ -1657,11 +1659,13 @@ impl Tenant { // Before doing any I/O work, check our circuit breaker if self.compaction_circuit_breaker.lock().unwrap().is_broken() { info!("Skipping compaction due to previous failures"); - return Ok(()); + return Ok(false); } + let mut has_pending_task = false; + for (timeline_id, timeline) in &timelines_to_compact { - timeline + has_pending_task |= timeline .compact(cancel, EnumSet::empty(), ctx) .instrument(info_span!("compact_timeline", %timeline_id)) .await @@ -1681,7 +1685,7 @@ impl Tenant { .unwrap() .success(&CIRCUIT_BREAKERS_UNBROKEN); - Ok(()) + Ok(has_pending_task) } // Call through to all timelines to freeze ephemeral layers if needed. 
Usually diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 7f59e54eb7..230362d81a 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -210,24 +210,28 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { Duration::from_secs(10) } else { // Run compaction - if let Err(e) = tenant.compaction_iteration(&cancel, &ctx).await { - let wait_duration = backoff::exponential_backoff_duration_seconds( - error_run_count + 1, - 1.0, - MAX_BACKOFF_SECS, - ); - error_run_count += 1; - let wait_duration = Duration::from_secs_f64(wait_duration); - log_compaction_error( - &e, - error_run_count, - &wait_duration, - cancel.is_cancelled(), - ); - wait_duration - } else { - error_run_count = 0; - period + match tenant.compaction_iteration(&cancel, &ctx).await { + Err(e) => { + let wait_duration = backoff::exponential_backoff_duration_seconds( + error_run_count + 1, + 1.0, + MAX_BACKOFF_SECS, + ); + error_run_count += 1; + let wait_duration = Duration::from_secs_f64(wait_duration); + log_compaction_error( + &e, + error_run_count, + &wait_duration, + cancel.is_cancelled(), + ); + wait_duration + } + Ok(has_pending_task) => { + error_run_count = 0; + // schedule the next compaction immediately in case there is a pending compaction task + if has_pending_task { Duration::from_secs(0) } else { period } + } } }; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 757a859f55..017d598d96 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1769,13 +1769,14 @@ impl Timeline { } } - /// Outermost timeline compaction operation; downloads needed layers. + /// Outermost timeline compaction operation; downloads needed layers. Returns whether we have pending + /// compaction tasks. pub(crate) async fn compact( self: &Arc, cancel: &CancellationToken, flags: EnumSet, ctx: &RequestContext, - ) -> Result<(), CompactionError> { + ) -> Result { // most likely the cancellation token is from background task, but in tests it could be the // request task as well. @@ -1795,8 +1796,8 @@ impl Timeline { // compaction task goes over it's period (20s) which is quite often in production. let (_guard, _permit) = tokio::select! { tuple = prepare => { tuple }, - _ = self.cancel.cancelled() => return Ok(()), - _ = cancel.cancelled() => return Ok(()), + _ = self.cancel.cancelled() => return Ok(false), + _ = cancel.cancelled() => return Ok(false), }; let last_record_lsn = self.get_last_record_lsn(); @@ -1804,11 +1805,14 @@ impl Timeline { // Last record Lsn could be zero in case the timeline was just created if !last_record_lsn.is_valid() { warn!("Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}"); - return Ok(()); + return Ok(false); } match self.get_compaction_algorithm_settings().kind { - CompactionAlgorithm::Tiered => self.compact_tiered(cancel, ctx).await, + CompactionAlgorithm::Tiered => { + self.compact_tiered(cancel, ctx).await?; + Ok(false) + } CompactionAlgorithm::Legacy => self.compact_legacy(cancel, flags, ctx).await, } } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 2c7ae911df..497d631f4f 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -102,17 +102,19 @@ impl KeyHistoryRetention { impl Timeline { /// TODO: cancellation + /// + /// Returns whether the compaction has pending tasks. 
pub(crate) async fn compact_legacy( self: &Arc, cancel: &CancellationToken, flags: EnumSet, ctx: &RequestContext, - ) -> Result<(), CompactionError> { + ) -> Result { if flags.contains(CompactFlags::EnhancedGcBottomMostCompaction) { - return self - .compact_with_gc(cancel, ctx) + self.compact_with_gc(cancel, ctx) .await - .map_err(CompactionError::Other); + .map_err(CompactionError::Other)?; + return Ok(false); } // High level strategy for compaction / image creation: @@ -160,7 +162,7 @@ impl Timeline { // Define partitioning schema if needed // FIXME: the match should only cover repartitioning, not the next steps - let partition_count = match self + let (partition_count, has_pending_tasks) = match self .repartition( self.get_last_record_lsn(), self.get_compaction_target_size(), @@ -177,30 +179,35 @@ impl Timeline { // 2. Compact let timer = self.metrics.compact_time_histo.start_timer(); - self.compact_level0(target_file_size, ctx).await?; + let fully_compacted = self.compact_level0(target_file_size, ctx).await?; timer.stop_and_record(); - // 3. Create new image layers for partitions that have been modified - // "enough". let mut partitioning = dense_partitioning; partitioning .parts .extend(sparse_partitioning.into_dense().parts); - let image_layers = self - .create_image_layers( - &partitioning, - lsn, - if flags.contains(CompactFlags::ForceImageLayerCreation) { - ImageLayerCreationMode::Force - } else { - ImageLayerCreationMode::Try - }, - &image_ctx, - ) - .await?; - self.upload_new_image_layers(image_layers)?; - partitioning.parts.len() + // 3. Create new image layers for partitions that have been modified + // "enough". Skip image layer creation if L0 compaction cannot keep up. + if fully_compacted { + let image_layers = self + .create_image_layers( + &partitioning, + lsn, + if flags.contains(CompactFlags::ForceImageLayerCreation) { + ImageLayerCreationMode::Force + } else { + ImageLayerCreationMode::Try + }, + &image_ctx, + ) + .await?; + + self.upload_new_image_layers(image_layers)?; + } else { + info!("skipping image layer generation due to L0 compaction did not include all layers."); + } + (partitioning.parts.len(), !fully_compacted) } Err(err) => { // no partitioning? This is normal, if the timeline was just created @@ -212,7 +219,7 @@ impl Timeline { if !self.cancel.is_cancelled() { tracing::error!("could not compact, repartitioning keyspace failed: {err:?}"); } - 1 + (1, false) } }; @@ -225,7 +232,7 @@ impl Timeline { self.compact_shard_ancestors(rewrite_max, ctx).await?; } - Ok(()) + Ok(has_pending_tasks) } /// Check for layers that are elegible to be rewritten: @@ -432,15 +439,16 @@ impl Timeline { } /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as - /// as Level 1 files. + /// as Level 1 files. Returns whether the L0 layers are fully compacted. async fn compact_level0( self: &Arc, target_file_size: u64, ctx: &RequestContext, - ) -> Result<(), CompactionError> { + ) -> Result { let CompactLevel0Phase1Result { new_layers, deltas_to_compact, + fully_compacted, } = { let phase1_span = info_span!("compact_level0_phase1"); let ctx = ctx.attached_child(); @@ -463,12 +471,12 @@ impl Timeline { if new_layers.is_empty() && deltas_to_compact.is_empty() { // nothing to do - return Ok(()); + return Ok(true); } self.finish_compact_batch(&new_layers, &Vec::new(), &deltas_to_compact) .await?; - Ok(()) + Ok(fully_compacted) } /// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment. 
@@ -535,6 +543,8 @@ impl Timeline { ) as u64 * std::cmp::max(self.get_checkpoint_distance(), DEFAULT_CHECKPOINT_DISTANCE); + let mut fully_compacted = true; + deltas_to_compact.push( first_level0_delta .download_and_keep_resident() @@ -562,6 +572,7 @@ impl Timeline { "L0 compaction picker hit max delta layer size limit: {}", delta_size_limit ); + fully_compacted = false; // Proceed with compaction, but only a subset of L0s break; @@ -923,6 +934,7 @@ impl Timeline { .into_iter() .map(|x| x.drop_eviction_guard()) .collect::>(), + fully_compacted, }) } } @@ -931,6 +943,9 @@ impl Timeline { struct CompactLevel0Phase1Result { new_layers: Vec, deltas_to_compact: Vec, + // Whether we have included all L0 layers, or selected only part of them due to the + // L0 compaction size limit. + fully_compacted: bool, } #[derive(Default)] From b5e95f68b5f9fee018a068fd9ac6ed270f085443 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Sat, 27 Jul 2024 04:32:05 +0200 Subject: [PATCH 1290/1571] Persist archival information (#8479) Persists whether a timeline is archived or not in `index_part.json`. We only return success if the upload has actually worked successfully. Also introduces a new `index_part.json` version number. Fixes #8459 Part of #8088 --- pageserver/src/tenant.rs | 27 +++++-- .../src/tenant/remote_timeline_client.rs | 57 +++++++++++++- .../tenant/remote_timeline_client/index.rs | 74 ++++++++++++++++++- pageserver/src/tenant/timeline.rs | 5 ++ storage_scrubber/src/checks.rs | 3 +- 5 files changed, 156 insertions(+), 10 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 99f4e2d7cd..e5ac6725ad 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -102,8 +102,7 @@ use std::fmt::Debug; use std::fmt::Display; use std::fs; use std::fs::File; -use std::sync::atomic::AtomicU64; -use std::sync::atomic::Ordering; +use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::Arc; use std::sync::Mutex; use std::time::{Duration, Instant}; @@ -1227,11 +1226,29 @@ impl Tenant { Ok(timeline_preloads) } - pub async fn apply_timeline_archival_config( + pub(crate) async fn apply_timeline_archival_config( &self, - _timeline_id: TimelineId, - _config: TimelineArchivalState, + timeline_id: TimelineId, + state: TimelineArchivalState, ) -> anyhow::Result<()> { + let timeline = self + .get_timeline(timeline_id, false) + .context("Cannot apply timeline archival config to inexistent timeline")?; + + let upload_needed = timeline + .remote_client + .schedule_index_upload_for_timeline_archival_state(state)?; + + if upload_needed { + const MAX_WAIT: Duration = Duration::from_secs(10); + let Ok(v) = + tokio::time::timeout(MAX_WAIT, timeline.remote_client.wait_completion()).await + else { + tracing::warn!("reached timeout for waiting on upload queue"); + bail!("reached timeout for upload queue flush"); + }; + v?; + } Ok(()) } diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 2f3c6c188b..fed666ca45 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -187,7 +187,7 @@ use camino::Utf8Path; use chrono::{NaiveDateTime, Utc}; pub(crate) use download::download_initdb_tar_zst; -use pageserver_api::models::AuxFilePolicy; +use pageserver_api::models::{AuxFilePolicy, TimelineArchivalState}; use pageserver_api::shard::{ShardIndex, TenantShardId}; use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; @@ -457,6 +457,17 @@ impl 
RemoteTimelineClient { .unwrap_or(false) } + /// Returns whether the timeline is archived. + /// Return None if the remote index_part hasn't been downloaded yet. + pub(crate) fn is_archived(&self) -> Option { + self.upload_queue + .lock() + .unwrap() + .initialized_mut() + .map(|q| q.clean.0.archived_at.is_some()) + .ok() + } + fn update_remote_physical_size_gauge(&self, current_remote_index_part: Option<&IndexPart>) { let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part { current_remote_index_part @@ -617,7 +628,7 @@ impl RemoteTimelineClient { Ok(()) } - /// Launch an index-file upload operation in the background, with only aux_file_policy flag updated. + /// Launch an index-file upload operation in the background, with only the `aux_file_policy` flag updated. pub(crate) fn schedule_index_upload_for_aux_file_policy_update( self: &Arc, last_aux_file_policy: Option, @@ -628,6 +639,48 @@ impl RemoteTimelineClient { self.schedule_index_upload(upload_queue)?; Ok(()) } + + /// Launch an index-file upload operation in the background, with only the `archived_at` field updated. + /// + /// Returns whether it is required to wait for the queue to be empty to ensure that the change is uploaded, + /// so either if the change is already sitting in the queue, but not commited yet, or the change has not + /// been in the queue yet. + pub(crate) fn schedule_index_upload_for_timeline_archival_state( + self: &Arc, + state: TimelineArchivalState, + ) -> anyhow::Result { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + + /// Returns Some(_) if a change is needed, and Some(true) if it's a + /// change needed to set archived_at. + fn need_change( + archived_at: &Option, + state: TimelineArchivalState, + ) -> Option { + match (archived_at, state) { + (Some(_), TimelineArchivalState::Archived) + | (None, TimelineArchivalState::Unarchived) => { + // Nothing to do + tracing::info!("intended state matches present state"); + None + } + (None, TimelineArchivalState::Archived) => Some(true), + (Some(_), TimelineArchivalState::Unarchived) => Some(false), + } + } + let need_upload_scheduled = need_change(&upload_queue.dirty.archived_at, state); + + if let Some(archived_at_set) = need_upload_scheduled { + let intended_archived_at = archived_at_set.then(|| Utc::now().naive_utc()); + upload_queue.dirty.archived_at = intended_archived_at; + self.schedule_index_upload(upload_queue)?; + } + + let need_wait = need_change(&upload_queue.clean.0.archived_at, state).is_some(); + Ok(need_wait) + } + /// /// Launch an index-file upload operation in the background, if necessary. /// diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index b439df8edb..3075df022e 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -32,6 +32,10 @@ pub struct IndexPart { #[serde(skip_serializing_if = "Option::is_none")] pub deleted_at: Option, + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub archived_at: Option, + /// Per layer file name metadata, which can be present for a present or missing layer file. /// /// Older versions of `IndexPart` will not have this property or have only a part of metadata @@ -80,10 +84,11 @@ impl IndexPart { /// - 5: lineage was added /// - 6: last_aux_file_policy is added. 
/// - 7: metadata_bytes is no longer written, but still read - const LATEST_VERSION: usize = 7; + /// - 8: added `archived_at` + const LATEST_VERSION: usize = 8; // Versions we may see when reading from a bucket. - pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7]; + pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8]; pub const FILE_NAME: &'static str = "index_part.json"; @@ -94,6 +99,7 @@ impl IndexPart { disk_consistent_lsn: metadata.disk_consistent_lsn(), metadata, deleted_at: None, + archived_at: None, lineage: Default::default(), last_aux_file_policy: None, } @@ -284,6 +290,7 @@ mod tests { disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: None, + archived_at: None, lineage: Lineage::default(), last_aux_file_policy: None, }; @@ -326,6 +333,7 @@ mod tests { disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: None, + archived_at: None, lineage: Lineage::default(), last_aux_file_policy: None, }; @@ -369,6 +377,7 @@ mod tests { disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: 
TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), + archived_at: None, lineage: Lineage::default(), last_aux_file_policy: None, }; @@ -415,6 +424,7 @@ mod tests { ]) .unwrap(), deleted_at: None, + archived_at: None, lineage: Lineage::default(), last_aux_file_policy: None, }; @@ -456,6 +466,7 @@ mod tests { disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), + archived_at: None, lineage: Lineage::default(), last_aux_file_policy: None, }; @@ -496,6 +507,7 @@ mod tests { disk_consistent_lsn: Lsn::from_str("0/15A7618").unwrap(), metadata: 
TimelineMetadata::from_bytes(&[226,88,25,241,0,46,0,4,0,0,0,0,1,90,118,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,78,244,32,0,0,0,0,1,78,244,32,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: None, + archived_at: None, lineage: Lineage { reparenting_history_truncated: false, reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()], @@ -545,6 +557,7 @@ mod tests { disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), + archived_at: None, lineage: Lineage { reparenting_history_truncated: false, reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()], @@ -603,6 +616,63 @@ mod tests { 14, ).with_recalculated_checksum().unwrap(), deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), + archived_at: None, + lineage: Default::default(), + last_aux_file_policy: Default::default(), + }; + + let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + assert_eq!(part, expected); + } + + #[test] + fn v8_indexpart_is_parsed() { + let example = r#"{ + "version": 8, + "layer_metadata":{ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } + }, + "disk_consistent_lsn":"0/16960E8", + "metadata": { + "disk_consistent_lsn": 
"0/16960E8", + "prev_record_lsn": "0/1696070", + "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e", + "ancestor_lsn": "0/0", + "latest_gc_cutoff_lsn": "0/1696070", + "initdb_lsn": "0/1696070", + "pg_version": 14 + }, + "deleted_at": "2023-07-31T09:00:00.123", + "archived_at": "2023-04-29T09:00:00.123" + }"#; + + let expected = IndexPart { + version: 8, + layer_metadata: HashMap::from([ + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { + file_size: 25600000, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }), + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { + file_size: 9007199254741001, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }) + ]), + disk_consistent_lsn: "0/16960E8".parse::().unwrap(), + metadata: TimelineMetadata::new( + Lsn::from_str("0/16960E8").unwrap(), + Some(Lsn::from_str("0/1696070").unwrap()), + Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()), + Lsn::INVALID, + Lsn::from_str("0/1696070").unwrap(), + Lsn::from_str("0/1696070").unwrap(), + 14, + ).with_recalculated_checksum().unwrap(), + deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), + archived_at: Some(parse_naive_datetime("2023-04-29T09:00:00.123000000")), lineage: Default::default(), last_aux_file_policy: Default::default(), }; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 017d598d96..862ca42188 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2001,6 +2001,11 @@ impl Timeline { self.current_state() == TimelineState::Active } + #[allow(unused)] + pub(crate) fn is_archived(&self) -> Option { + self.remote_client.is_archived() + } + pub(crate) fn is_stopping(&self) -> bool { self.current_state() == TimelineState::Stopping } diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index 421a848f67..a35a58aedd 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -87,7 +87,8 @@ pub(crate) async fn branch_cleanup_and_check_errors( .push(format!("index_part.json version: {}", index_part.version())) } - if &index_part.version() != IndexPart::KNOWN_VERSIONS.last().unwrap() { + let mut newest_versions = IndexPart::KNOWN_VERSIONS.iter().rev().take(2); + if !newest_versions.any(|ip| ip == &index_part.version()) { info!( "index_part.json version is not latest: {}", index_part.version() From 6cad0455b07b9270dd0d9650c3e655fe6fa53f1e Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Sat, 27 Jul 2024 20:01:10 +0100 Subject: [PATCH 1291/1571] CI(test_runner): Upload all test artifacts if preserve_database_files is enabled (#7990) ## Problem There's a `NeonEnvBuilder#preserve_database_files` parameter that allows you to keep database files for debugging purposes (by default, files get cleaned up), but there's no way to get these files from a CI run. This PR adds handling of `NeonEnvBuilder#preserve_database_files` and adds the compressed test output directory to Allure reports (for tests with this parameter enabled). 
Ref https://github.com/neondatabase/neon/issues/6967 ## Summary of changes - Compress and add the whole test output directory to Allure reports - Currently works only with `neon_env_builder` fixture - Remove `preserve_database_files = True` from sharding tests as unneeded --------- Co-authored-by: Christian Schwarz --- test_runner/README.md | 2 +- test_runner/fixtures/neon_fixtures.py | 20 +++++++++++++++++--- test_runner/fixtures/utils.py | 11 ++++++++++- test_runner/regress/test_sharding.py | 4 ---- 4 files changed, 28 insertions(+), 9 deletions(-) diff --git a/test_runner/README.md b/test_runner/README.md index 7d95634ea8..e2f26a19ce 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -81,7 +81,7 @@ should go. Useful parameters and commands: `--preserve-database-files` to preserve pageserver (layer) and safekeer (segment) timeline files on disk -after running a test suite. Such files might be large, so removed by default; but might be useful for debugging or creation of svg images with layer file contents. +after running a test suite. Such files might be large, so removed by default; but might be useful for debugging or creation of svg images with layer file contents. If `NeonEnvBuilder#preserve_database_files` set to `True` for a particular test, the whole `repo` directory will be attached to Allure report (thus uploaded to S3) as `everything.tar.zst` for this test. Let stdout, stderr and `INFO` log messages go to the terminal instead of capturing them: `./scripts/pytest -s --log-cli-level=INFO ...` diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 0a06398391..d98b2564df 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1394,7 +1394,7 @@ def _shared_simple_env( pg_distrib_dir=pg_distrib_dir, pg_version=pg_version, run_id=run_id, - preserve_database_files=pytestconfig.getoption("--preserve-database-files"), + preserve_database_files=cast(bool, pytestconfig.getoption("--preserve-database-files")), test_name=request.node.name, test_output_dir=test_output_dir, pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine, @@ -1469,7 +1469,7 @@ def neon_env_builder( pg_version=pg_version, broker=default_broker, run_id=run_id, - preserve_database_files=pytestconfig.getoption("--preserve-database-files"), + preserve_database_files=cast(bool, pytestconfig.getoption("--preserve-database-files")), pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine, test_name=request.node.name, test_output_dir=test_output_dir, @@ -1478,6 +1478,11 @@ def neon_env_builder( pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm, ) as builder: yield builder + # Propogate `preserve_database_files` to make it possible to use in other fixtures, + # like `test_output_dir` fixture for attaching all database files to Allure report. + request.node.user_properties.append( + ("preserve_database_files", builder.preserve_database_files) + ) @dataclass @@ -4478,7 +4483,16 @@ def test_output_dir( yield test_dir - allure_attach_from_dir(test_dir) + preserve_database_files = False + for k, v in request.node.user_properties: + # NB: the neon_env_builder fixture uses this fixture (test_output_dir). + # So, neon_env_builder's cleanup runs before here. + # The cleanup propagates NeonEnvBuilder.preserve_database_files into this user property. 
+ if k == "preserve_database_files": + assert isinstance(v, bool) + preserve_database_files = v + + allure_attach_from_dir(test_dir, preserve_database_files) class FileAndThreadLock: diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 0989dc1893..7f54eb0b0a 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -240,9 +240,18 @@ ATTACHMENT_NAME_REGEX: re.Pattern = re.compile( # type: ignore[type-arg] ) -def allure_attach_from_dir(dir: Path): +def allure_attach_from_dir(dir: Path, preserve_database_files: bool = False): """Attach all non-empty files from `dir` that matches `ATTACHMENT_NAME_REGEX` to Allure report""" + if preserve_database_files: + zst_file = dir.with_suffix(".tar.zst") + with zst_file.open("wb") as zst: + cctx = zstandard.ZstdCompressor() + with cctx.stream_writer(zst) as compressor: + with tarfile.open(fileobj=compressor, mode="w") as tar: + tar.add(dir, arcname="") + allure.attach.file(zst_file, "everything.tar.zst", "application/zstd", "tar.zst") + for attachment in Path(dir).glob("**/*"): if ATTACHMENT_NAME_REGEX.fullmatch(attachment.name) and attachment.stat().st_size > 0: name = str(attachment.relative_to(dir)) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index f8770e70fe..7f30b2d7a7 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -48,8 +48,6 @@ def test_sharding_smoke( # that the scrubber doesn't barf when it sees a sharded tenant. neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - neon_env_builder.preserve_database_files = True - env = neon_env_builder.init_start( initial_tenant_shard_count=shard_count, initial_tenant_shard_stripe_size=stripe_size ) @@ -372,8 +370,6 @@ def test_sharding_split_smoke( # that the scrubber doesn't barf when it sees a sharded tenant. 
neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - neon_env_builder.preserve_database_files = True - non_default_tenant_config = {"gc_horizon": 77 * 1024 * 1024} env = neon_env_builder.init_configs(True) From 2416da337e9ea9281e0c55d61718373fd46e424b Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Sun, 28 Jul 2024 21:36:59 +0200 Subject: [PATCH 1292/1571] safekeeper: include application name in wal service root span (#8525) For IDENTIFY_SYSTEM in particular, application name gives away whether the client is * walproposer => Some(wal_proposer_recovery) * safekeeper => Some(safekeeper) * pageserver => Some(pageserver) Context: https://neondb.slack.com/archives/C06SJG60FRB/p1721987794673429?thread_ts=1721981056.451599&cid=C06SJG60FRB --- safekeeper/src/handler.rs | 7 ++++++- safekeeper/src/wal_service.rs | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index f45bfb95fa..2c519433ef 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -143,7 +143,12 @@ impl postgres_backend::Handler self.tenant_id.unwrap_or(TenantId::from([0u8; 16])), self.timeline_id.unwrap_or(TimelineId::from([0u8; 16])), ); - tracing::Span::current().record("ttid", tracing::field::display(ttid)); + tracing::Span::current() + .record("ttid", tracing::field::display(ttid)) + .record( + "application_name", + tracing::field::debug(self.appname.clone()), + ); Ok(()) } else { diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index 091571111e..16f7748eb4 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -43,7 +43,7 @@ pub async fn task_main( error!("connection handler exited: {}", err); } } - .instrument(info_span!("", cid = %conn_id, ttid = field::Empty)), + .instrument(info_span!("", cid = %conn_id, ttid = field::Empty, application_name = field::Empty)), ); } } From da6bdff8934280d5bdec7042e90becb58697545a Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 29 Jul 2024 11:00:33 +0100 Subject: [PATCH 1293/1571] test_runner: fix user_property usage in benchmarks (#8531) ## Problem After https://github.com/neondatabase/neon/pull/7990 `regress_test` job started to fail with an error: ``` ... 
File "/__w/neon/neon/test_runner/fixtures/benchmark_fixture.py", line 485, in pytest_terminal_summary terminalreporter.write(f"{test_report.head_line}.{recorded_property['name']}: ") TypeError: 'bool' object is not subscriptable ``` https://github.com/neondatabase/neon/actions/runs/10125750938/job/28002582582 It happens because the current implementation doesn't expect pytest's `user_properties` can be used for anything else but benchmarks (and https://github.com/neondatabase/neon/pull/7990 started to use it for tracking `preserve_database_files` parameter) ## Summary of changes - Make NeonBenchmarker use only records with`neon_benchmarker_` prefix --- test_runner/fixtures/benchmark_fixture.py | 23 +++++++++++++++++++---- test_runner/fixtures/neon_fixtures.py | 5 ++--- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index 038f557cc8..0c36cd6ef7 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -222,6 +222,8 @@ class NeonBenchmarker: function by the zenbenchmark fixture """ + PROPERTY_PREFIX = "neon_benchmarker_" + def __init__(self, property_recorder: Callable[[str, object], None]): # property recorder here is a pytest fixture provided by junitxml module # https://docs.pytest.org/en/6.2.x/reference.html#pytest.junitxml.record_property @@ -238,7 +240,7 @@ class NeonBenchmarker: Record a benchmark result. """ # just to namespace the value - name = f"neon_benchmarker_{metric_name}" + name = f"{self.PROPERTY_PREFIX}_{metric_name}" self.property_recorder( name, { @@ -249,6 +251,18 @@ class NeonBenchmarker: }, ) + @classmethod + def records( + cls, user_properties: list[tuple[str, object]] + ) -> Iterator[tuple[str, dict[str, object]]]: + """ + Yield all records related to benchmarks + """ + for property_name, recorded_property in user_properties: + if property_name.startswith(cls.PROPERTY_PREFIX): + assert isinstance(recorded_property, dict) + yield recorded_property["name"], recorded_property + @contextmanager def record_duration(self, metric_name: str) -> Iterator[None]: """ @@ -425,10 +439,11 @@ def zenbenchmark( yield benchmarker results = {} - for _, recorded_property in request.node.user_properties: + for _, recorded_property in NeonBenchmarker.records(request.node.user_properties): name = recorded_property["name"] value = str(recorded_property["value"]) - if (unit := recorded_property["unit"].strip()) != "": + unit = str(recorded_property["unit"]).strip() + if unit != "": value += f" {unit}" results[name] = value @@ -477,7 +492,7 @@ def pytest_terminal_summary( for test_report in terminalreporter.stats.get("passed", []): result_entry = [] - for _, recorded_property in test_report.user_properties: + for _, recorded_property in NeonBenchmarker.records(test_report.user_properties): if not is_header_printed: terminalreporter.section("Benchmark results", "-") is_header_printed = True diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index d98b2564df..c5fffc2af6 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1441,6 +1441,7 @@ def neon_env_builder( pageserver_virtual_file_io_engine: str, pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]], pageserver_aux_file_policy: Optional[AuxFileStore], + record_property: Callable[[str, object], None], ) -> Iterator[NeonEnvBuilder]: """ Fixture to create a Neon environment for test. 
@@ -1480,9 +1481,7 @@ def neon_env_builder( yield builder # Propogate `preserve_database_files` to make it possible to use in other fixtures, # like `test_output_dir` fixture for attaching all database files to Allure report. - request.node.user_properties.append( - ("preserve_database_files", builder.preserve_database_files) - ) + record_property("preserve_database_files", builder.preserve_database_files) @dataclass From 859f01918529d5e6547ac4ff8e05a4e5775520a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 29 Jul 2024 12:05:18 +0200 Subject: [PATCH 1294/1571] Adopt list_streaming in tenant deletion (#8504) Uses the Stream based `list_streaming` function added by #8457 in tenant deletion, as suggested in https://github.com/neondatabase/neon/pull/7932#issuecomment-2150480180 . We don't have to worry about retries, as the function is wrapped inside an outer retry block. If there is a retryable error either during the listing or during deletion, we just do a fresh start. Also adds `+ Send` bounds as they are required by the `delete_tenant_remote` function. --- libs/remote_storage/src/lib.rs | 6 +-- libs/remote_storage/src/simulate_failures.rs | 2 +- pageserver/src/tenant/mgr.rs | 52 ++++++++++---------- 3 files changed, 29 insertions(+), 31 deletions(-) diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 75aa28233b..031548bbec 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -194,7 +194,7 @@ pub trait RemoteStorage: Send + Sync + 'static { mode: ListingMode, max_keys: Option, cancel: &CancellationToken, - ) -> impl Stream>; + ) -> impl Stream> + Send; async fn list( &self, @@ -351,10 +351,10 @@ impl GenericRemoteStorage> { mode: ListingMode, max_keys: Option, cancel: &'a CancellationToken, - ) -> impl Stream> + 'a { + ) -> impl Stream> + 'a + Send { match self { Self::LocalFs(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)) - as Pin>>>, + as Pin> + Send>>, Self::AwsS3(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)), Self::AzureBlob(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)), Self::Unreliable(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)), diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index 67e5be2955..13f873dcdb 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -114,7 +114,7 @@ impl RemoteStorage for UnreliableWrapper { mode: ListingMode, max_keys: Option, cancel: &CancellationToken, - ) -> impl Stream> { + ) -> impl Stream> + Send { async_stream::stream! 
{ self.attempt(RemoteOp::ListPrefixes(prefix.cloned())) .map_err(DownloadError::Other)?; diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 75c8682c97..5e1f69f4c1 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1384,34 +1384,32 @@ impl TenantManager { tenant_shard_id: TenantShardId, ) -> Result<(), DeleteTenantError> { let remote_path = remote_tenant_path(&tenant_shard_id); - let keys = match self - .resources - .remote_storage - .list( - Some(&remote_path), - remote_storage::ListingMode::NoDelimiter, - None, - &self.cancel, - ) - .await - { - Ok(listing) => listing.keys, - Err(remote_storage::DownloadError::Cancelled) => { - return Err(DeleteTenantError::Cancelled) - } - Err(remote_storage::DownloadError::NotFound) => return Ok(()), - Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))), - }; + let mut keys_stream = self.resources.remote_storage.list_streaming( + Some(&remote_path), + remote_storage::ListingMode::NoDelimiter, + None, + &self.cancel, + ); + while let Some(chunk) = keys_stream.next().await { + let keys = match chunk { + Ok(listing) => listing.keys, + Err(remote_storage::DownloadError::Cancelled) => { + return Err(DeleteTenantError::Cancelled) + } + Err(remote_storage::DownloadError::NotFound) => return Ok(()), + Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))), + }; - if keys.is_empty() { - tracing::info!("Remote storage already deleted"); - } else { - tracing::info!("Deleting {} keys from remote storage", keys.len()); - let keys = keys.into_iter().map(|o| o.key).collect::>(); - self.resources - .remote_storage - .delete_objects(&keys, &self.cancel) - .await?; + if keys.is_empty() { + tracing::info!("Remote storage already deleted"); + } else { + tracing::info!("Deleting {} keys from remote storage", keys.len()); + let keys = keys.into_iter().map(|o| o.key).collect::>(); + self.resources + .remote_storage + .delete_objects(&keys, &self.cancel) + .await?; + } } Ok(()) From 1d8cf5b3a9e3dd0013e5cc8c846c51558abecbfd Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Mon, 29 Jul 2024 14:35:12 +0200 Subject: [PATCH 1295/1571] Add a test for clickhouse as a logical replication consumer (#8408) ## Problem We need to test logical replication with 3rd-party tools regularly. 
## Summary of changes Added a test using ClickHouse as a client Co-authored-by: Alexander Bayandin --- .github/workflows/pg-clients.yml | 72 ++++++++++ poetry.lock | 153 +++++++++++++++++++++- pyproject.toml | 1 + test_runner/logical_repl/test_log_repl.py | 88 +++++++++++++ 4 files changed, 313 insertions(+), 1 deletion(-) create mode 100644 test_runner/logical_repl/test_log_repl.py diff --git a/.github/workflows/pg-clients.yml b/.github/workflows/pg-clients.yml index e21e45c929..55b68ccdb5 100644 --- a/.github/workflows/pg-clients.yml +++ b/.github/workflows/pg-clients.yml @@ -13,6 +13,7 @@ on: paths: - '.github/workflows/pg-clients.yml' - 'test_runner/pg_clients/**' + - 'test_runner/logical_repl/**' - 'poetry.lock' workflow_dispatch: @@ -49,6 +50,77 @@ jobs: image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }} secrets: inherit + test-logical-replication: + needs: [ build-build-tools-image ] + runs-on: ubuntu-22.04 + + container: + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init --user root + services: + clickhouse: + image: clickhouse/clickhouse-server:24.6.3.64 + ports: + - 9000:9000 + - 8123:8123 + + steps: + - uses: actions/checkout@v4 + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact + path: /tmp/neon/ + prefix: latest + + - name: Create Neon Project + id: create-neon-project + uses: ./.github/actions/neon-project-create + with: + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + postgres_version: ${{ env.DEFAULT_PG_VERSION }} + + - name: Run tests + uses: ./.github/actions/run-python-test-set + with: + build_type: remote + test_selection: logical_repl + run_in_parallel: false + extra_params: -m remote_cluster + pg_version: ${{ env.DEFAULT_PG_VERSION }} + env: + BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} + + - name: Delete Neon Project + if: always() + uses: ./.github/actions/neon-project-delete + with: + project_id: ${{ steps.create-neon-project.outputs.project_id }} + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Create Allure report + if: ${{ !cancelled() }} + id: create-allure-report + uses: ./.github/actions/allure-report-generate + with: + store-test-results-into-db: true + env: + REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} + + - name: Post to a Slack channel + if: github.event.schedule && failure() + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream + slack-message: | + Testing the logical replication: <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|${{ job.status }}> (<${{ steps.create-allure-report.outputs.report-url }}|test report>) + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + test-postgres-client-libs: needs: [ build-build-tools-image ] runs-on: ubuntu-22.04 diff --git a/poetry.lock b/poetry.lock index 5192a574cc..d7a3dde65b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -870,6 +870,96 @@ files = [ [package.dependencies] colorama = {version = "*", markers = "platform_system == \"Windows\""} +[[package]] +name = "clickhouse-connect" +version = "0.7.17" +description = "ClickHouse Database Core Driver for Python, Pandas, and Superset" +optional = false +python-versions = "~=3.8" +files = [ + {file = "clickhouse-connect-0.7.17.tar.gz", hash = 
"sha256:854f1f9f3e024e7f89ae5d57cd3289d7a4c3dc91a9f24c4d233014f0ea19cb2d"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:aca36f5f28be1ada2981fce87724bbf451f267c918015baec59e527de3c9c882"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:66209e4634f457604c263bea176336079d26c284e251e68a8435b0b80c1a25ff"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e4d86c5a561a2a99321c8b4af22257461b8e67142f34cfea6e70f39b45b1f406"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d200c9afa2725a96f9f3718221f641276b80c11bf504d8a2fbaafb5a05b2f0d3"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:004d867b1005445a46e6742db1054bf2a717a451372663b46e09b5e9e90a31e3"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:4ef94a4a8e008882259151833c3c47cfbb9c8f08de0f100aaf3b95c366dcfb24"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ee732c3df50c8b07d16b5836ff85e6b84569922455c03837c3add5cf1388fe1f"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d9dbe1235465bb946e24b90b0ca5b8800b5d645acb2d7d6ee819448c3e2fd959"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-win32.whl", hash = "sha256:e5db0d68dfb63db0297d44dc91406bcfd7d333708d7cd55086c8550fbf870b78"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-win_amd64.whl", hash = "sha256:800750f568c097ea312887785025006d6098bffd8ed2dd6a57048fb3ced6d778"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4eb390623b3d15dc9cda78f5c68f83ef9ad11743797e70af8fabc384b015a73c"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:35f172ca950f218f63072024c81d5b4ff6e5399620c255506c321ccc7b17c9a5"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ae7918f060f7576fc931c692e0122b1b07576fabd81444af22e1f8582300d200"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ff2881b93c7a1afb9c99fb59ad5fd666850421325d0931e2b77f3f4ba872303d"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8a4d9b4f97271addf66aadbaf7f154f19a0ad6c22026d575a995c55ebd8576db"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e431469b1ff2d5c3e4c406d55c6afdf7102f5d2524c2ceb5481b94ac24412aa3"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:2b6f80115176559f181a6b3ecad11aa3d70ef6014c3d2905b90fcef3f27d25c2"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d8ac694f40dfafc8a3cc877116b4bc73e8877ebf66d4d96ee092484ee4c0b481"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-win32.whl", hash = "sha256:78b7a3f6b0fad4eaf8afb5f9a2e855bde53e82ea5804960e9cf779538f4606a1"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-win_amd64.whl", hash = "sha256:efd390cc045334ecc3f2a9c18cc07c041d0288b145967805fdcab65abeefa75f"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9228334a17dc0a7842222f54ba5b89fc563532424aad4f66be799df70ab37e9f"}, + {file 
= "clickhouse_connect-0.7.17-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e432a42bb788bda77e88eda2774392a60fbbb5ee2a79cb2881d182d26c45fe49"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c85152ed2879965ee1fa2bd5e31fb27d281fd5f50d6e86a401efd95cd85b29ef"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:29a126104aa5e11df570cbd89fca4988784084602ba77d17b2396b334c54fd75"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:882d8f9570549258e6eb6a97915fbf64ed29fe395d5e360866ea8d42c8283a35"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:06ebf99111171442f462fb8b357364c3e276da3e8f8557b2e8fee9eb55ab37d1"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:e0cf6f99b2777b0d164bf8b65ec39104cdc0789a56bcb52d98289bbd6f5cc70e"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ee46c508fddfff3b7ac52326788e0c6dd8dfb416b6d7e02e5d30e8110749dac2"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-win32.whl", hash = "sha256:eb708b590a37d56b069a6088254ffa55d73b8cb65527339df81ef03fe67ffdec"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-win_amd64.whl", hash = "sha256:17f00dccddaeaf43733faa1fa21f7d24641454a73669fda862545ba7c88627f5"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ab5d4b37a6dcc39e94c63beac0f22d9dda914f5eb865d166c64cf04dfadb7d16"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:32aa90387f45f34cbc5a984789ed4c12760a3c0056c190ab0123ceafc36b1002"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21277b6bdd6c8ff14170bfcd52125c5c39f442ec4bafbb643ad7d0ca915f0029"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca68d8b7dee3fb4e7229e06152f5b0faaccafb4c87d9c2d48fa5bd117a3cc1c0"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:841c56282102b2fba1e0b332bb1c7a0c50992fbc321746af8d3e0e6ca2450e8b"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:8d7ffde5a4b95d8fe9ed38e08e504e497310e3d7a17691bd40bf65734648fdfc"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:055960086b6b92b6e44f5ba04c81c40c10b038588e4b3908b033c99f66125332"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:36491fec63ceb8503b6344c23477647030139f346b749dc5ee672c505939dbbe"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-win32.whl", hash = "sha256:8779a907e026db32e6bc0bc0c8d5de0e2e3afd166afc2d4adcc0603399af5539"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-win_amd64.whl", hash = "sha256:309854fa197885c6278438ddd032ab52e6fec56f162074e343c3635ca7266078"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e8009f94550178dc971aeb4f8787ba7a5b473c22647490428b7229f540a51d2b"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:70f8422f407b13a404b3670fd097855abd5adaf890c710d6678d2b46ab61ac48"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:082783eb1e8baf7b3465dd045132dc5cb5a91432c899dc4e19891c5f782d8d23"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1c30aad2a9c7584c4ee19e646a087b3bbd2d4daab3d88a2afeeae1a7f6febf9"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fc8e245a9f4f0dce39f155e626405f60f1d3cf4d1e52dd2c793ea6b603ca111b"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:802372cb8a69c9ffdf4260e9f01616c8601ba531825ed6f08834827e0b880cd1"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:193a60271a3b105cdbde96fb20b40eab8a50fca3bb1f397546f7a18b53d9aa9c"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:59d58932916792fdbd09cb961a245a0c2d87b07b8296f9138915b998f4522941"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-win32.whl", hash = "sha256:3cfd0edabb589f640636a97ffc38d1b3d760faef208d44e50829cc1ad3f0d3e5"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-win_amd64.whl", hash = "sha256:5661b4629aac228481219abf2e149119af1a71d897f191665e182d9d192d7033"}, + {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:7429d309109e7e4a70fd867d69fcfea9ddcb1a1e910caa6b0e2c3776b71f4613"}, + {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5ae619151006da84a0b1585a9bcc81be32459d8061aeb2e116bad5bbaa7d108"}, + {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec0c84a0880621cb2389656a89886ef3133f0b3f8dc016eee6f25bbb49ff6f70"}, + {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:705464c23f821666b76f8f619cf2870225156276562756b3933aaa24708e0ff8"}, + {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:1822016f4b769e89264fe26cefe0bc5e50e4c3ca0747d89bb52d57dc4f1e5ffb"}, + {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:6c92b0c342c1fbfa666010e8175e05026dc570a7ef91d8fa81ce503180f318aa"}, + {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d2e106536540e906c3c866f8615fcf870a9a77c1bfab9ef4b042febfd2fdb953"}, + {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bac9a32e62384b4341ba51a451084eb3b00c6e59aaac1499145dd8b897cb585c"}, + {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0feed93b9912b7862a8c41be1febcd44b68a824a5c1059b19d5c567afdaa6273"}, + {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:2e2dd6db52e799f065fd565143fde5a872cfe903de1bee7775bc3a349856a790"}, + {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ed13add5d579a5960155f3000420544368501c9703d2fb94f103b4a6126081f6"}, + {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c257a23ed3bf1858593fb03927d9d073fbbdfa24dc2afee537c3314bd66b4e24"}, + {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:d47866f64cbdc2d5cc4f8a7a8c49e3ee90c9e487091b9eda7c3a3576418e1cbe"}, + {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9b850e2f17e0a0b5a37d996d3fb728050227489d64d271d678d166abea94f26e"}, + {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:349682288987dc84ac7695f7cd6b510be8d0ec0eee7c1b72dbf2146b4e9efdb8"}, +] + +[package.dependencies] +certifi = "*" +lz4 = "*" +pytz = "*" +urllib3 = ">=1.26" +zstandard = "*" + +[package.extras] +arrow = ["pyarrow"] +numpy = ["numpy"] +orjson = ["orjson"] +pandas = ["pandas"] +sqlalchemy = ["sqlalchemy (>1.3.21,<2.0)"] +tzlocal = ["tzlocal (>=4.0)"] + [[package]] name = "colorama" version = "0.4.5" @@ -1470,6 +1560,56 @@ files = [ {file = "lazy_object_proxy-1.10.0-pp310.pp311.pp312.pp38.pp39-none-any.whl", hash = "sha256:80fa48bd89c8f2f456fc0765c11c23bf5af827febacd2f523ca5bc1893fcc09d"}, ] +[[package]] +name = "lz4" +version = "4.3.3" +description = "LZ4 Bindings for Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "lz4-4.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b891880c187e96339474af2a3b2bfb11a8e4732ff5034be919aa9029484cd201"}, + {file = "lz4-4.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:222a7e35137d7539c9c33bb53fcbb26510c5748779364014235afc62b0ec797f"}, + {file = "lz4-4.3.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f76176492ff082657ada0d0f10c794b6da5800249ef1692b35cf49b1e93e8ef7"}, + {file = "lz4-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1d18718f9d78182c6b60f568c9a9cec8a7204d7cb6fad4e511a2ef279e4cb05"}, + {file = "lz4-4.3.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6cdc60e21ec70266947a48839b437d46025076eb4b12c76bd47f8e5eb8a75dcc"}, + {file = "lz4-4.3.3-cp310-cp310-win32.whl", hash = "sha256:c81703b12475da73a5d66618856d04b1307e43428a7e59d98cfe5a5d608a74c6"}, + {file = "lz4-4.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:43cf03059c0f941b772c8aeb42a0813d68d7081c009542301637e5782f8a33e2"}, + {file = "lz4-4.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:30e8c20b8857adef7be045c65f47ab1e2c4fabba86a9fa9a997d7674a31ea6b6"}, + {file = "lz4-4.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2f7b1839f795315e480fb87d9bc60b186a98e3e5d17203c6e757611ef7dcef61"}, + {file = "lz4-4.3.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:edfd858985c23523f4e5a7526ca6ee65ff930207a7ec8a8f57a01eae506aaee7"}, + {file = "lz4-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e9c410b11a31dbdc94c05ac3c480cb4b222460faf9231f12538d0074e56c563"}, + {file = "lz4-4.3.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d2507ee9c99dbddd191c86f0e0c8b724c76d26b0602db9ea23232304382e1f21"}, + {file = "lz4-4.3.3-cp311-cp311-win32.whl", hash = "sha256:f180904f33bdd1e92967923a43c22899e303906d19b2cf8bb547db6653ea6e7d"}, + {file = "lz4-4.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:b14d948e6dce389f9a7afc666d60dd1e35fa2138a8ec5306d30cd2e30d36b40c"}, + {file = "lz4-4.3.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:e36cd7b9d4d920d3bfc2369840da506fa68258f7bb176b8743189793c055e43d"}, + {file = "lz4-4.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:31ea4be9d0059c00b2572d700bf2c1bc82f241f2c3282034a759c9a4d6ca4dc2"}, + {file = 
"lz4-4.3.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:33c9a6fd20767ccaf70649982f8f3eeb0884035c150c0b818ea660152cf3c809"}, + {file = "lz4-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bca8fccc15e3add173da91be8f34121578dc777711ffd98d399be35487c934bf"}, + {file = "lz4-4.3.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e7d84b479ddf39fe3ea05387f10b779155fc0990125f4fb35d636114e1c63a2e"}, + {file = "lz4-4.3.3-cp312-cp312-win32.whl", hash = "sha256:337cb94488a1b060ef1685187d6ad4ba8bc61d26d631d7ba909ee984ea736be1"}, + {file = "lz4-4.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:5d35533bf2cee56f38ced91f766cd0038b6abf46f438a80d50c52750088be93f"}, + {file = "lz4-4.3.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:363ab65bf31338eb364062a15f302fc0fab0a49426051429866d71c793c23394"}, + {file = "lz4-4.3.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0a136e44a16fc98b1abc404fbabf7f1fada2bdab6a7e970974fb81cf55b636d0"}, + {file = "lz4-4.3.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:abc197e4aca8b63f5ae200af03eb95fb4b5055a8f990079b5bdf042f568469dd"}, + {file = "lz4-4.3.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:56f4fe9c6327adb97406f27a66420b22ce02d71a5c365c48d6b656b4aaeb7775"}, + {file = "lz4-4.3.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f0e822cd7644995d9ba248cb4b67859701748a93e2ab7fc9bc18c599a52e4604"}, + {file = "lz4-4.3.3-cp38-cp38-win32.whl", hash = "sha256:24b3206de56b7a537eda3a8123c644a2b7bf111f0af53bc14bed90ce5562d1aa"}, + {file = "lz4-4.3.3-cp38-cp38-win_amd64.whl", hash = "sha256:b47839b53956e2737229d70714f1d75f33e8ac26e52c267f0197b3189ca6de24"}, + {file = "lz4-4.3.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6756212507405f270b66b3ff7f564618de0606395c0fe10a7ae2ffcbbe0b1fba"}, + {file = "lz4-4.3.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ee9ff50557a942d187ec85462bb0960207e7ec5b19b3b48949263993771c6205"}, + {file = "lz4-4.3.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b901c7784caac9a1ded4555258207d9e9697e746cc8532129f150ffe1f6ba0d"}, + {file = "lz4-4.3.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b6d9ec061b9eca86e4dcc003d93334b95d53909afd5a32c6e4f222157b50c071"}, + {file = "lz4-4.3.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f4c7bf687303ca47d69f9f0133274958fd672efaa33fb5bcde467862d6c621f0"}, + {file = "lz4-4.3.3-cp39-cp39-win32.whl", hash = "sha256:054b4631a355606e99a42396f5db4d22046a3397ffc3269a348ec41eaebd69d2"}, + {file = "lz4-4.3.3-cp39-cp39-win_amd64.whl", hash = "sha256:eac9af361e0d98335a02ff12fb56caeb7ea1196cf1a49dbf6f17828a131da807"}, + {file = "lz4-4.3.3.tar.gz", hash = "sha256:01fe674ef2889dbb9899d8a67361e0c4a2c833af5aeb37dd505727cf5d2a131e"}, +] + +[package.extras] +docs = ["sphinx (>=1.6.0)", "sphinx-bootstrap-theme"] +flake8 = ["flake8"] +tests = ["psutil", "pytest (!=3.3.0)", "pytest-cov"] + [[package]] name = "markupsafe" version = "2.1.1" @@ -2361,6 +2501,17 @@ files = [ [package.dependencies] six = ">=1.5" +[[package]] +name = "pytz" +version = "2024.1" +description = "World timezone definitions, modern and historical" +optional = false +python-versions = "*" +files = [ + {file = "pytz-2024.1-py2.py3-none-any.whl", hash = 
"sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319"}, + {file = "pytz-2024.1.tar.gz", hash = "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"}, +] + [[package]] name = "pywin32" version = "301" @@ -3206,4 +3357,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "16ebd6a46768be7f67dbdb4ee5903b167d94edc9965f29252f038c67e9e907b0" +content-hash = "7cee6a8c30bc7f4bfb0a87c6bad3952dfb4da127fad853d2710a93ac3eab8a00" diff --git a/pyproject.toml b/pyproject.toml index c7f1a07512..0d5782ac7c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,6 +41,7 @@ zstandard = "^0.21.0" httpx = {extras = ["http2"], version = "^0.26.0"} pytest-repeat = "^0.9.3" websockets = "^12.0" +clickhouse-connect = "^0.7.16" [tool.poetry.group.dev.dependencies] mypy = "==1.3.0" diff --git a/test_runner/logical_repl/test_log_repl.py b/test_runner/logical_repl/test_log_repl.py new file mode 100644 index 0000000000..0a1aecfe2b --- /dev/null +++ b/test_runner/logical_repl/test_log_repl.py @@ -0,0 +1,88 @@ +""" +Test the logical replication in Neon with the different consumers +""" + +import hashlib +import time + +import clickhouse_connect +import psycopg2 +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import RemotePostgres +from fixtures.utils import wait_until + + +def query_clickhouse( + client, + query: str, + digest: str, +) -> None: + """ + Run the query on the client + return answer if successful, raise an exception otherwise + """ + log.debug("Query: %s", query) + res = client.query(query) + log.debug(res.result_rows) + m = hashlib.sha1() + m.update(repr(tuple(res.result_rows)).encode()) + hash_res = m.hexdigest() + log.debug("Hash: %s", hash_res) + if hash_res == digest: + return + raise ValueError("Hash mismatch") + + +@pytest.mark.remote_cluster +def test_clickhouse(remote_pg: RemotePostgres): + """ + Test the logical replication having ClickHouse as a client + """ + conn_options = remote_pg.conn_options() + for _ in range(5): + try: + conn = psycopg2.connect(remote_pg.connstr()) + except psycopg2.OperationalError as perr: + log.debug(perr) + time.sleep(1) + else: + break + raise TimeoutError + cur = conn.cursor() + cur.execute("DROP TABLE IF EXISTS table1") + cur.execute("CREATE TABLE table1 (id integer primary key, column1 varchar(10));") + cur.execute("INSERT INTO table1 (id, column1) VALUES (1, 'abc'), (2, 'def');") + conn.commit() + client = clickhouse_connect.get_client(host="clickhouse") + client.command("SET allow_experimental_database_materialized_postgresql=1") + client.command( + "CREATE DATABASE db1_postgres ENGINE = " + f"MaterializedPostgreSQL('{conn_options['host']}', " + f"'{conn_options['dbname']}', " + f"'{conn_options['user']}', '{conn_options['password']}') " + "SETTINGS materialized_postgresql_tables_list = 'table1';" + ) + wait_until( + 120, + 0.5, + lambda: query_clickhouse( + client, + "select * from db1_postgres.table1 order by 1", + "ee600d8f7cd05bd0b169fa81f44300a9dd10085a", + ), + ) + cur.execute("INSERT INTO table1 (id, column1) VALUES (3, 'ghi'), (4, 'jkl');") + conn.commit() + wait_until( + 120, + 0.5, + lambda: query_clickhouse( + client, + "select * from db1_postgres.table1 order by 1", + "9eba2daaf7e4d7d27ac849525f68b562ab53947d", + ), + ) + log.debug("Sleeping before final checking if Neon is still alive") + time.sleep(3) + cur.execute("SELECT 1") From bdfc9ca7e986308d57d8d2bd122e0d0306652aba Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 
29 Jul 2024 15:41:06 +0300 Subject: [PATCH 1296/1571] test: deflake test_duplicate_creation (#8536) By including comparison of `remote_consistent_lsn_visible` we risk flakyness coming from outside of timeline creation. Mask out the `remote_consistent_lsn_visible` for the comparison. Evidence: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8489/10142336315/index.html#suites/ffbb7f9930a77115316b58ff32b7c719/89ff0270bf58577a --- test_runner/regress/test_branching.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index 03d6946c15..190b624a54 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -389,6 +389,11 @@ def test_duplicate_creation(neon_env_builder: NeonEnvBuilder): repeat_result = ps_http.timeline_create( env.pg_version, env.initial_tenant, success_timeline, timeout=60 ) + # remote_consistent_lsn_visible will be published only after we've + # confirmed the generation, which is not part of what we await during + # timeline creation (uploads). mask it out here to avoid flakyness. + del success_result["remote_consistent_lsn_visible"] + del repeat_result["remote_consistent_lsn_visible"] assert repeat_result == success_result finally: env.pageserver.stop(immediate=True) From 5775662276cbeb1b7cdcfcc0dca1ad59880825f1 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 29 Jul 2024 15:05:30 +0100 Subject: [PATCH 1297/1571] pageserver: fix return code from secondary_download_handler (#8508) ## Problem The secondary download HTTP API is meant to return 200 if the download is complete, and 202 if it is still in progress. In #8198 the download implementation was changed to drop out with success early if it over-runs a time budget, which resulted in 200 responses for incomplete downloads. This breaks storcon_cli's "tenant-warmup" command, which uses the OK status to indicate download complete. ## Summary of changes - Only return 200 if we get an Ok() _and_ the progress stats indicate the download is complete. --- pageserver/src/http/routes.rs | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 7935aeb5e9..9222123ad3 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2129,14 +2129,24 @@ async fn secondary_download_handler( let timeout = wait.unwrap_or(Duration::MAX); - let status = match tokio::time::timeout( + let result = tokio::time::timeout( timeout, state.secondary_controller.download_tenant(tenant_shard_id), ) - .await - { - // Download job ran to completion. - Ok(Ok(())) => StatusCode::OK, + .await; + + let progress = secondary_tenant.progress.lock().unwrap().clone(); + + let status = match result { + Ok(Ok(())) => { + if progress.layers_downloaded >= progress.layers_total { + // Download job ran to completion + StatusCode::OK + } else { + // Download dropped out without errors because it ran out of time budget + StatusCode::ACCEPTED + } + } // Edge case: downloads aren't usually fallible: things like a missing heatmap are considered // okay. We could get an error here in the unlikely edge case that the tenant // was detached between our check above and executing the download job. 
@@ -2146,8 +2156,6 @@ async fn secondary_download_handler( Err(_) => StatusCode::ACCEPTED, }; - let progress = secondary_tenant.progress.lock().unwrap().clone(); - json_response(status, progress) } From d09dad0ea2bfc1485dba537275d8fe906dab8c77 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 29 Jul 2024 16:16:32 +0200 Subject: [PATCH 1298/1571] pageserver: fail if `id` is present in pageserver.toml (#8489) Overall plan: https://www.notion.so/neondatabase/Rollout-Plan-simplified-pageserver-initialization-f935ae02b225444e8a41130b7d34e4ea?pvs=4 --- `identity.toml` is the authoritative place for `id` as of https://github.com/neondatabase/neon/pull/7766 refs https://github.com/neondatabase/neon/issues/7736 --- control_plane/src/local_env.rs | 19 +++++++++++++++---- control_plane/src/pageserver.rs | 11 +++++++---- pageserver/src/config.rs | 32 ++++++++------------------------ 3 files changed, 30 insertions(+), 32 deletions(-) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index d7830a5e70..505d157efd 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -514,7 +514,6 @@ impl LocalEnv { #[derive(serde::Serialize, serde::Deserialize)] // (allow unknown fields, unlike PageServerConf) struct PageserverConfigTomlSubset { - id: NodeId, listen_pg_addr: String, listen_http_addr: String, pg_auth_type: AuthType, @@ -526,18 +525,30 @@ impl LocalEnv { .with_context(|| format!("read {:?}", config_toml_path))?, ) .context("parse pageserver.toml")?; + let identity_toml_path = dentry.path().join("identity.toml"); + #[derive(serde::Serialize, serde::Deserialize)] + struct IdentityTomlSubset { + id: NodeId, + } + let identity_toml: IdentityTomlSubset = toml_edit::de::from_str( + &std::fs::read_to_string(&identity_toml_path) + .with_context(|| format!("read {:?}", identity_toml_path))?, + ) + .context("parse identity.toml")?; let PageserverConfigTomlSubset { - id: config_toml_id, listen_pg_addr, listen_http_addr, pg_auth_type, http_auth_type, } = config_toml; + let IdentityTomlSubset { + id: identity_toml_id, + } = identity_toml; let conf = PageServerConf { id: { anyhow::ensure!( - config_toml_id == id, - "id mismatch: config_toml.id={config_toml_id} id={id}", + identity_toml_id == id, + "id mismatch: identity.toml:id={identity_toml_id} pageserver_(.*) id={id}", ); id }, diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index ba4f98d945..399b1c2653 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -127,10 +127,13 @@ impl PageServerNode { } // Apply the user-provided overrides - overrides.push( - toml_edit::ser::to_string_pretty(&conf) - .expect("we deserialized this from toml earlier"), - ); + overrides.push({ + let mut doc = + toml_edit::ser::to_document(&conf).expect("we deserialized this from toml earlier"); + // `id` is written out to `identity.toml` instead of `pageserver.toml` + doc.remove("id").expect("it's part of the struct"); + doc.to_string() + }); // Turn `overrides` into a toml document. // TODO: above code is legacy code, it should be refactored to use toml_edit directly. 
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 100c6c1ac5..f71881683d 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -356,8 +356,6 @@ struct PageServerConfigBuilder { auth_validation_public_key_path: BuilderValue>, remote_storage_config: BuilderValue>, - id: BuilderValue, - broker_endpoint: BuilderValue, broker_keepalive_interval: BuilderValue, @@ -406,11 +404,8 @@ struct PageServerConfigBuilder { } impl PageServerConfigBuilder { - fn new(node_id: NodeId) -> Self { - let mut this = Self::default(); - this.id(node_id); - - this + fn new() -> Self { + Self::default() } #[inline(always)] @@ -438,7 +433,6 @@ impl PageServerConfigBuilder { pg_auth_type: Set(AuthType::Trust), auth_validation_public_key_path: Set(None), remote_storage_config: Set(None), - id: NotSet, broker_endpoint: Set(storage_broker::DEFAULT_ENDPOINT .parse() .expect("failed to parse default broker endpoint")), @@ -568,10 +562,6 @@ impl PageServerConfigBuilder { self.broker_keepalive_interval = BuilderValue::Set(broker_keepalive_interval) } - pub fn id(&mut self, node_id: NodeId) { - self.id = BuilderValue::Set(node_id) - } - pub fn log_format(&mut self, log_format: LogFormat) { self.log_format = BuilderValue::Set(log_format) } @@ -683,7 +673,7 @@ impl PageServerConfigBuilder { self.l0_flush = BuilderValue::Set(value); } - pub fn build(self) -> anyhow::Result { + pub fn build(self, id: NodeId) -> anyhow::Result { let default = Self::default_values(); macro_rules! conf { @@ -716,7 +706,6 @@ impl PageServerConfigBuilder { pg_auth_type, auth_validation_public_key_path, remote_storage_config, - id, broker_endpoint, broker_keepalive_interval, log_format, @@ -744,6 +733,7 @@ impl PageServerConfigBuilder { } CUSTOM LOGIC { + id: id, // TenantConf is handled separately default_tenant_conf: TenantConf::default(), concurrent_tenant_warmup: ConfigurableSemaphore::new({ @@ -893,7 +883,7 @@ impl PageServerConf { toml: &Document, workdir: &Utf8Path, ) -> anyhow::Result { - let mut builder = PageServerConfigBuilder::new(node_id); + let mut builder = PageServerConfigBuilder::new(); builder.workdir(workdir.to_owned()); let mut t_conf = TenantConfOpt::default(); @@ -924,8 +914,6 @@ impl PageServerConf { "tenant_config" => { t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?; } - "id" => {}, // Ignoring `id` field in pageserver.toml - using identity.toml as the source of truth - // Logging is not set up yet, so we can't do it. 
"broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?), "broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?), "log_format" => builder.log_format( @@ -1018,7 +1006,7 @@ impl PageServerConf { } } - let mut conf = builder.build().context("invalid config")?; + let mut conf = builder.build(node_id).context("invalid config")?; if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT { let auth_validation_public_key_path = conf @@ -1255,7 +1243,6 @@ max_file_descriptors = 333 # initial superuser role name to use when creating a new tenant initial_superuser_name = 'zzzz' -id = 10 metric_collection_interval = '222 s' metric_collection_endpoint = 'http://localhost:80/metrics' @@ -1272,9 +1259,8 @@ background_task_maximum_delay = '334 s' let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; let broker_endpoint = storage_broker::DEFAULT_ENDPOINT; // we have to create dummy values to overcome the validation errors - let config_string = format!( - "pg_distrib_dir='{pg_distrib_dir}'\nid=10\nbroker_endpoint = '{broker_endpoint}'", - ); + let config_string = + format!("pg_distrib_dir='{pg_distrib_dir}'\nbroker_endpoint = '{broker_endpoint}'",); let toml = config_string.parse()?; let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir) @@ -1579,7 +1565,6 @@ broker_endpoint = '{broker_endpoint}' r#"pg_distrib_dir = "{pg_distrib_dir}" metric_collection_endpoint = "http://sample.url" metric_collection_interval = "10min" -id = 222 [disk_usage_based_eviction] max_usage_pct = 80 @@ -1649,7 +1634,6 @@ threshold = "20m" r#"pg_distrib_dir = "{pg_distrib_dir}" metric_collection_endpoint = "http://sample.url" metric_collection_interval = "10min" -id = 222 [tenant_config] evictions_low_residence_duration_metric_threshold = "20m" From 4be58522fbd61e74e7fead19d106c8182f1cefba Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 29 Jul 2024 16:49:22 +0200 Subject: [PATCH 1299/1571] l0_flush: use mode=direct by default => coverage in automated tests (#8534) Testing in staging and pre-prod has been [going well](https://github.com/neondatabase/neon/issues/7418#issuecomment-2255474917). 
This PR enables mode=direct by default, thereby providing additional coverage in the automated tests: - Rust tests - Integration tests - Nightly pagebench (likely irrelevant because it's read-only) Production deployments continue to use `mode=page-cache` for the time being: https://github.com/neondatabase/aws/pull/1655 refs https://github.com/neondatabase/neon/issues/7418 --- pageserver/src/l0_flush.rs | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/pageserver/src/l0_flush.rs b/pageserver/src/l0_flush.rs index 7fe8fedc63..8945e5accd 100644 --- a/pageserver/src/l0_flush.rs +++ b/pageserver/src/l0_flush.rs @@ -2,13 +2,23 @@ use std::{num::NonZeroUsize, sync::Arc}; use crate::tenant::ephemeral_file; -#[derive(Default, Debug, PartialEq, Eq, Clone, serde::Deserialize)] +#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize)] #[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] pub enum L0FlushConfig { - #[default] PageCached, #[serde(rename_all = "snake_case")] - Direct { max_concurrency: NonZeroUsize }, + Direct { + max_concurrency: NonZeroUsize, + }, +} + +impl Default for L0FlushConfig { + fn default() -> Self { + Self::Direct { + // TODO: using num_cpus results in different peak memory usage on different instance types. + max_concurrency: NonZeroUsize::new(usize::max(1, num_cpus::get())).unwrap(), + } + } } #[derive(Clone)] From 52b02d95c801855dfd462f767bf551cbe2142663 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 29 Jul 2024 17:50:44 +0100 Subject: [PATCH 1300/1571] scrubber: enable cleaning up garbage tenants from known deletion bugs, add object age safety check (#8461) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem Old storage buckets can contain a lot of tenants that aren't known to the control plane at all, because they belonged to test jobs that get their control plane state cleaned up shortly after running. In general, it's somewhat unsafe to purge these, as it's hard to distinguish "control plane doesn't know about this, so it's garbage" from "control plane said it didn't know about this, which is a bug in the scrubber, control plane, or API URL configured". However, the most common case is that we see only a small husk of a tenant in S3 from a specific old behavior of the software, for example: - We had a bug where heatmaps weren't deleted on tenant delete - When WAL DR was first deployed, we didn't delete initdb.tar.zst on tenant deletion ## Summary of changes - Add a KnownBug variant for the garbage reason - Include such cases in the "safe" deletion mode (`--mode=deleted`) - Add code that inspects tenants missing in control plane to identify cases of known bugs (this is kind of slow, but should go away once we've cleaned all these up) - Add an additional `-min-age` safety check similar to physical GC, where even if everything indicates objects aren't needed, we won't delete something that has been modified too recently. 
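As an illustration of the new `-min-age` safety check, here is a minimal, self-contained sketch (hypothetical helper, not the scrubber's actual listing types): purging a garbage prefix is only allowed if even its *newest* object is older than the configured threshold.

```rust
use std::time::{Duration, SystemTime};

/// Sketch of the age gate: purge only if the newest object under the prefix
/// is older than `min_age`. (Hypothetical signature, for illustration only.)
fn passes_min_age_check(last_modified: &[SystemTime], min_age: Duration) -> bool {
    let Some(max_mtime) = last_modified.iter().max() else {
        // Empty listing: nothing to judge; skip, like the real code does.
        return false;
    };
    match max_mtime.elapsed() {
        // Bad or future timestamp: be conservative and skip.
        Err(_) => false,
        // Something under this prefix was written recently; skip out of caution.
        Ok(age) if age < min_age => false,
        Ok(_) => true,
    }
}

fn main() {
    let now = SystemTime::now();
    let mtimes = [now - Duration::from_secs(3600), now - Duration::from_secs(60)];
    // With `-min-age 10m`, this prefix is skipped: one object is only a minute old.
    assert!(!passes_min_age_check(&mtimes, Duration::from_secs(600)));
}
```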
--------- Co-authored-by: Yuchen Liang <70461588+yliang412@users.noreply.github.com> Co-authored-by: Arpad Müller --- storage_scrubber/src/garbage.rs | 118 ++++++++++++++++++++++++++++++-- storage_scrubber/src/main.rs | 10 ++- 2 files changed, 121 insertions(+), 7 deletions(-) diff --git a/storage_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs index 333269ec7e..78ecfc7232 100644 --- a/storage_scrubber/src/garbage.rs +++ b/storage_scrubber/src/garbage.rs @@ -5,6 +5,7 @@ use std::{ collections::{HashMap, HashSet}, sync::Arc, + time::Duration, }; use anyhow::Context; @@ -18,7 +19,7 @@ use utils::id::TenantId; use crate::{ cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData}, - init_remote, init_remote_generic, + init_remote, init_remote_generic, list_objects_with_retries, metadata_stream::{stream_tenant_timelines, stream_tenants}, BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth, }; @@ -27,6 +28,11 @@ use crate::{ enum GarbageReason { DeletedInConsole, MissingInConsole, + + // The remaining data relates to a known deletion issue, and we're sure that purging this + // will not delete any real data, for example https://github.com/neondatabase/neon/pull/7928 where + // there is nothing in a tenant path apart from a heatmap file. + KnownBug, } #[derive(Serialize, Deserialize, Debug)] @@ -72,6 +78,15 @@ impl GarbageList { } } + /// If an entity has been identified as requiring purge due to a known bug, e.g. + /// a particular type of object left behind after an incomplete deletion. + fn append_buggy(&mut self, entity: GarbageEntity) { + self.items.push(GarbageItem { + entity, + reason: GarbageReason::KnownBug, + }); + } + /// Return true if appended, false if not. False means the result was not garbage. fn maybe_append(&mut self, entity: GarbageEntity, result: Option) -> bool where @@ -219,6 +234,71 @@ async fn find_garbage_inner( assert!(project.tenant == tenant_shard_id.tenant_id); } + // Special case: If it's missing in console, check for known bugs that would enable us to conclusively + // identify it as purge-able anyway + if console_result.is_none() { + let timelines = stream_tenant_timelines(&s3_client, &target, tenant_shard_id) + .await? + .collect::>() + .await; + if timelines.is_empty() { + // No timelines, but a heatmap: the deletion bug where we deleted everything but heatmaps + let tenant_objects = list_objects_with_retries( + &s3_client, + &target.tenant_root(&tenant_shard_id), + None, + ) + .await?; + let object = tenant_objects.contents.as_ref().unwrap().first().unwrap(); + if object.key.as_ref().unwrap().ends_with("heatmap-v1.json") { + tracing::info!("Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)"); + garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id)); + continue; + } else { + tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key.as_ref().unwrap()); + } + } else { + // A console-unknown tenant with timelines: check if these timelines only contain initdb.tar.zst, from the initial + // rollout of WAL DR in which we never deleted these. + let mut any_non_initdb = false; + + for timeline_r in timelines { + let timeline = timeline_r?; + let timeline_objects = list_objects_with_retries( + &s3_client, + &target.timeline_root(&timeline), + None, + ) + .await?; + if timeline_objects + .common_prefixes + .as_ref() + .map(|v| v.len()) + .unwrap_or(0) + > 0 + { + // Sub-paths? 
Unexpected + any_non_initdb = true; + } else { + let object = timeline_objects.contents.as_ref().unwrap().first().unwrap(); + if object.key.as_ref().unwrap().ends_with("initdb.tar.zst") { + tracing::info!("Timeline {timeline} contains only initdb.tar.zst"); + } else { + any_non_initdb = true; + } + } + } + + if any_non_initdb { + tracing::info!("Tenant {tenant_shard_id}: is missing in console and contains timelines, one or more of which are more than just initdb"); + } else { + tracing::info!("Tenant {tenant_shard_id}: is missing in console and contains only timelines that only contain initdb"); + garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id)); + continue; + } + } + } + if garbage.maybe_append(GarbageEntity::Tenant(tenant_shard_id), console_result) { tracing::debug!("Tenant {tenant_shard_id} is garbage"); } else { @@ -349,9 +429,6 @@ pub async fn get_timeline_objects( tracing::debug!("Listing objects in timeline {ttid}"); let timeline_root = super::remote_timeline_path_id(&ttid); - // TODO: apply extra validation based on object modification time. Don't purge - // timelines whose index_part.json has been touched recently. - let list = s3_client .list( Some(&timeline_root), @@ -422,6 +499,7 @@ impl DeletionProgressTracker { pub async fn purge_garbage( input_path: String, mode: PurgeMode, + min_age: Duration, dry_run: bool, ) -> anyhow::Result<()> { let list_bytes = tokio::fs::read(&input_path).await?; @@ -459,6 +537,7 @@ pub async fn purge_garbage( .filter(|i| match (&mode, &i.reason) { (PurgeMode::DeletedAndMissing, _) => true, (PurgeMode::DeletedOnly, GarbageReason::DeletedInConsole) => true, + (PurgeMode::DeletedOnly, GarbageReason::KnownBug) => true, (PurgeMode::DeletedOnly, GarbageReason::MissingInConsole) => false, }); @@ -487,6 +566,37 @@ pub async fn purge_garbage( let mut progress_tracker = DeletionProgressTracker::default(); while let Some(result) = get_objects_results.next().await { let mut object_list = result?; + + // Extra safety check: even if a collection of objects is garbage, check max() of modification + // times before purging, so that if we incorrectly marked a live tenant as garbage then we would + // notice that its index has been written recently and would omit deleting it. + if object_list.is_empty() { + // Simplify subsequent code by ensuring list always has at least one item + // Usually, this only occurs if there is parallel deletions racing us, as there is no empty prefixes + continue; + } + let max_mtime = object_list.iter().map(|o| o.last_modified).max().unwrap(); + let age = max_mtime.elapsed(); + match age { + Err(_) => { + tracing::warn!("Bad last_modified time"); + continue; + } + Ok(a) if a < min_age => { + // Failed age check. This doesn't mean we did something wrong: a tenant might really be garbage and recently + // written, but out of an abundance of caution we still don't purge it. 
+ tracing::info!( + "Skipping tenant with young objects {}..{}", + object_list.first().as_ref().unwrap().key, + object_list.last().as_ref().unwrap().key + ); + continue; + } + Ok(_) => { + // Passed age check + } + } + objects_to_delete.append(&mut object_list); if objects_to_delete.len() >= MAX_KEYS_PER_DELETE { do_delete( diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs index b3ed6f6451..346829b7c9 100644 --- a/storage_scrubber/src/main.rs +++ b/storage_scrubber/src/main.rs @@ -50,6 +50,8 @@ enum Command { input_path: String, #[arg(short, long, default_value_t = PurgeMode::DeletedOnly)] mode: PurgeMode, + #[arg(long = "min-age")] + min_age: humantime::Duration, }, #[command(verbatim_doc_comment)] ScanMetadata { @@ -196,9 +198,11 @@ async fn main() -> anyhow::Result<()> { let console_config = ConsoleConfig::from_env()?; find_garbage(bucket_config, console_config, depth, node_kind, output_path).await } - Command::PurgeGarbage { input_path, mode } => { - purge_garbage(input_path, mode, !cli.delete).await - } + Command::PurgeGarbage { + input_path, + mode, + min_age, + } => purge_garbage(input_path, mode, min_age.into(), !cli.delete).await, Command::TenantSnapshot { tenant_id, output_path, From 1c7b06c98895074ebc88557a1a632319a147b51d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 30 Jul 2024 09:59:15 +0200 Subject: [PATCH 1301/1571] Add metrics for input data considered and taken for compression (#8522) If compression is enabled, we currently try compressing each image larger than a specific size and if the compressed version is smaller, we write that one, otherwise we use the uncompressed image. However, this might sometimes be a wasteful process, if there is a substantial amount of images that don't compress well. The compression metrics added in #8420 `pageserver_compression_image_in_bytes_total` and `pageserver_compression_image_out_bytes_total` are well designed for answering the question how space efficient the total compression process is end-to-end, which helps one to decide whether to enable it or not. To answer the question of how much waste there is in terms of trial compression, so CPU time, we add two metrics: * one about the images that have been trial-compressed (considered), and * one about the images where the compressed image has actually been written (chosen). There is different ways of weighting them, like for example one could look at the count, or the compressed data. But the main contributor to compression CPU usage is amount of data processed, so we weight the images by their *uncompressed* size. 
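As a rough sketch of that weighting (hypothetical names, not the actual writer code): every image that gets trial-compressed adds its uncompressed length to a "considered" counter, and only images whose compressed form is actually written also add it to a "chosen" counter.

```rust
/// Minimal sketch of the accounting; the real counters are Prometheus metrics.
#[derive(Default)]
struct CompressionCounters {
    considered_uncompressed_bytes: u64, // images we spent CPU trial-compressing
    chosen_uncompressed_bytes: u64,     // subset whose compressed form was written
}

impl CompressionCounters {
    fn record(&mut self, uncompressed_len: u64, trial_compressed_len: Option<u64>) {
        if let Some(compressed_len) = trial_compressed_len {
            // Trial compression ran: the CPU cost scales with the uncompressed size.
            self.considered_uncompressed_bytes += uncompressed_len;
            if compressed_len < uncompressed_len {
                // Compressed form is smaller, so that is what gets written out.
                self.chosen_uncompressed_bytes += uncompressed_len;
            }
        }
    }
}

fn main() {
    let mut c = CompressionCounters::default();
    c.record(8192, Some(1000)); // compresses well: considered and chosen
    c.record(8192, Some(9000)); // compresses badly: considered, uncompressed form kept
    c.record(512, None);        // below the size threshold: never trial-compressed
    assert_eq!((c.considered_uncompressed_bytes, c.chosen_uncompressed_bytes), (16384, 8192));
}
```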
In other words, the two metrics are: * `pageserver_compression_image_in_bytes_considered` * `pageserver_compression_image_in_bytes_chosen` Part of #5431 --- pageserver/src/metrics.rs | 18 +++++++++- pageserver/src/tenant/blob_io.rs | 36 +++++++++++++------ .../src/tenant/storage_layer/delta_layer.rs | 2 +- .../src/tenant/storage_layer/image_layer.rs | 26 ++++++++++++-- 4 files changed, 68 insertions(+), 14 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 9aff5220f5..ede6b41a75 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -613,7 +613,23 @@ pub(crate) static CIRCUIT_BREAKERS_UNBROKEN: Lazy = Lazy::new(|| { pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_compression_image_in_bytes_total", - "Size of uncompressed data written into image layers" + "Size of data written into image layers before compression" + ) + .expect("failed to define a metric") +}); + +pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES_CONSIDERED: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_compression_image_in_bytes_considered", + "Size of potentially compressible data written into image layers before compression" + ) + .expect("failed to define a metric") +}); + +pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES_CHOSEN: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_compression_image_in_bytes_chosen", + "Size of data whose compressed form was written into image layers" ) .expect("failed to define a metric") }); diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 791eefebe9..8e9d349ca8 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -28,6 +28,12 @@ use crate::virtual_file::VirtualFile; use std::cmp::min; use std::io::{Error, ErrorKind}; +#[derive(Copy, Clone, Debug)] +pub struct CompressionInfo { + pub written_compressed: bool, + pub compressed_size: Option, +} + impl<'a> BlockCursor<'a> { /// Read a blob into a new buffer. pub async fn read_blob( @@ -273,8 +279,10 @@ impl BlobWriter { srcbuf: B, ctx: &RequestContext, ) -> (B::Buf, Result) { - self.write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled) - .await + let (buf, res) = self + .write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled) + .await; + (buf, res.map(|(off, _compression_info)| off)) } /// Write a blob of data. 
Returns the offset that it was written to, @@ -284,8 +292,12 @@ impl BlobWriter { srcbuf: B, ctx: &RequestContext, algorithm: ImageCompressionAlgorithm, - ) -> (B::Buf, Result) { + ) -> (B::Buf, Result<(u64, CompressionInfo), Error>) { let offset = self.offset; + let mut compression_info = CompressionInfo { + written_compressed: false, + compressed_size: None, + }; let len = srcbuf.bytes_init(); @@ -328,7 +340,9 @@ impl BlobWriter { encoder.write_all(&slice[..]).await.unwrap(); encoder.shutdown().await.unwrap(); let compressed = encoder.into_inner(); + compression_info.compressed_size = Some(compressed.len()); if compressed.len() < len { + compression_info.written_compressed = true; let compressed_len = compressed.len(); compressed_buf = Some(compressed); (BYTE_ZSTD, compressed_len, slice.into_inner()) @@ -359,7 +373,7 @@ impl BlobWriter { } else { self.write_all(srcbuf, ctx).await }; - (srcbuf, res.map(|_| offset)) + (srcbuf, res.map(|_| (offset, compression_info))) } } @@ -416,12 +430,14 @@ pub(crate) mod tests { let mut wtr = BlobWriter::::new(file, 0); for blob in blobs.iter() { let (_, res) = if compression { - wtr.write_blob_maybe_compressed( - blob.clone(), - ctx, - ImageCompressionAlgorithm::Zstd { level: Some(1) }, - ) - .await + let res = wtr + .write_blob_maybe_compressed( + blob.clone(), + ctx, + ImageCompressionAlgorithm::Zstd { level: Some(1) }, + ) + .await; + (res.0, res.1.map(|(off, _)| off)) } else { wtr.write_blob(blob.clone(), ctx).await }; diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 229d1e3608..f9becf53ff 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -467,7 +467,7 @@ impl DeltaLayerWriterInner { .write_blob_maybe_compressed(val, ctx, compression) .await; let off = match res { - Ok(off) => off, + Ok((off, _)) => off, Err(e) => return (val, Err(anyhow::anyhow!(e))), }; diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 44ba685490..08db27514a 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -734,6 +734,14 @@ struct ImageLayerWriterInner { // Total uncompressed bytes passed into put_image uncompressed_bytes: u64, + // Like `uncompressed_bytes`, + // but only of images we might consider for compression + uncompressed_bytes_eligible: u64, + + // Like `uncompressed_bytes`, but only of images + // where we have chosen their compressed form + uncompressed_bytes_chosen: u64, + blob_writer: BlobWriter, tree: DiskBtreeBuilder, } @@ -790,6 +798,8 @@ impl ImageLayerWriterInner { tree: tree_builder, blob_writer, uncompressed_bytes: 0, + uncompressed_bytes_eligible: 0, + uncompressed_bytes_chosen: 0, }; Ok(writer) @@ -808,13 +818,22 @@ impl ImageLayerWriterInner { ) -> anyhow::Result<()> { ensure!(self.key_range.contains(&key)); let compression = self.conf.image_compression; - self.uncompressed_bytes += img.len() as u64; + let uncompressed_len = img.len() as u64; + self.uncompressed_bytes += uncompressed_len; let (_img, res) = self .blob_writer .write_blob_maybe_compressed(img, ctx, compression) .await; // TODO: re-use the buffer for `img` further upstack - let off = res?; + let (off, compression_info) = res?; + if compression_info.compressed_size.is_some() { + // The image has been considered for compression at least + self.uncompressed_bytes_eligible += uncompressed_len; + } + if 
compression_info.written_compressed { + // The image has been compressed + self.uncompressed_bytes_chosen += uncompressed_len; + } let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; key.write_to_byte_slice(&mut keybuf); @@ -837,6 +856,9 @@ impl ImageLayerWriterInner { // Calculate compression ratio let compressed_size = self.blob_writer.size() - PAGE_SZ as u64; // Subtract PAGE_SZ for header crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES.inc_by(self.uncompressed_bytes); + crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CONSIDERED + .inc_by(self.uncompressed_bytes_eligible); + crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CHOSEN.inc_by(self.uncompressed_bytes_chosen); crate::metrics::COMPRESSION_IMAGE_OUTPUT_BYTES.inc_by(compressed_size); let mut file = self.blob_writer.into_inner(); From 9fabdda2dcaa67536bdec9e65303d22674dbb9b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 30 Jul 2024 11:00:37 +0200 Subject: [PATCH 1302/1571] scrubber: add remote_storage based listing APIs and use them in find-large-objects (#8541) Add two new functions `stream_objects_with_retries` and `stream_tenants_generic` and use them in the `find-large-objects` subcommand, migrating it to `remote_storage`. Also adds the `size` field to the `ListingObject` struct. Part of #7547 --- libs/remote_storage/src/azure_blob.rs | 3 +- libs/remote_storage/src/lib.rs | 1 + libs/remote_storage/src/local_fs.rs | 2 + libs/remote_storage/src/s3_bucket.rs | 5 +- storage_scrubber/src/find_large_objects.rs | 44 +++++------ storage_scrubber/src/garbage.rs | 2 +- storage_scrubber/src/lib.rs | 90 +++++++++++++++++----- storage_scrubber/src/metadata_stream.rs | 33 +++++++- 8 files changed, 133 insertions(+), 47 deletions(-) diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 6ca4ae43f2..3c77d5a227 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -355,7 +355,8 @@ impl RemoteStorage for AzureBlobStorage { .blobs() .map(|k| ListingObject{ key: self.name_to_relative_path(&k.name), - last_modified: k.properties.last_modified.into() + last_modified: k.properties.last_modified.into(), + size: k.properties.content_length, } ); diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 031548bbec..794e696769 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -153,6 +153,7 @@ pub enum ListingMode { pub struct ListingObject { pub key: RemotePath, pub last_modified: SystemTime, + pub size: u64, } #[derive(Default)] diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index bc6b10aa51..99b4aa4061 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -368,6 +368,7 @@ impl RemoteStorage for LocalFs { key: k.clone(), // LocalFs is just for testing, so just specify a dummy time last_modified: SystemTime::now(), + size: 0, }) } }) @@ -411,6 +412,7 @@ impl RemoteStorage for LocalFs { key: RemotePath::from_string(&relative_key).unwrap(), // LocalFs is just for testing last_modified: SystemTime::now(), + size: 0, }); } } diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 412f307445..1f25da813d 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -565,9 +565,12 @@ impl RemoteStorage for S3Bucket { } }; + let size = object.size.unwrap_or(0) as u64; + result.keys.push(ListingObject{ key, - last_modified + last_modified, + size, }); 
if let Some(mut mk) = max_keys { assert!(mk > 0); diff --git a/storage_scrubber/src/find_large_objects.rs b/storage_scrubber/src/find_large_objects.rs index 2ef802229d..f5bb7e088a 100644 --- a/storage_scrubber/src/find_large_objects.rs +++ b/storage_scrubber/src/find_large_objects.rs @@ -1,10 +1,13 @@ +use std::pin::pin; + use futures::{StreamExt, TryStreamExt}; use pageserver::tenant::storage_layer::LayerName; +use remote_storage::ListingMode; use serde::{Deserialize, Serialize}; use crate::{ - checks::parse_layer_object_name, init_remote, list_objects_with_retries, - metadata_stream::stream_tenants, BucketConfig, NodeKind, + checks::parse_layer_object_name, init_remote_generic, metadata_stream::stream_tenants_generic, + stream_objects_with_retries, BucketConfig, NodeKind, }; #[derive(Serialize, Deserialize, Clone, Copy, PartialEq, Eq)] @@ -47,45 +50,38 @@ pub async fn find_large_objects( ignore_deltas: bool, concurrency: usize, ) -> anyhow::Result { - let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?; - let tenants = std::pin::pin!(stream_tenants(&s3_client, &target)); + let (remote_client, target) = + init_remote_generic(bucket_config.clone(), NodeKind::Pageserver).await?; + let tenants = pin!(stream_tenants_generic(&remote_client, &target)); let objects_stream = tenants.map_ok(|tenant_shard_id| { let mut tenant_root = target.tenant_root(&tenant_shard_id); - let s3_client = s3_client.clone(); + let remote_client = remote_client.clone(); async move { let mut objects = Vec::new(); let mut total_objects_ctr = 0u64; // We want the objects and not just common prefixes tenant_root.delimiter.clear(); - let mut continuation_token = None; - loop { - let fetch_response = - list_objects_with_retries(&s3_client, &tenant_root, continuation_token.clone()) - .await?; - for obj in fetch_response.contents().iter().filter(|o| { - if let Some(obj_size) = o.size { - min_size as i64 <= obj_size - } else { - false - } - }) { - let key = obj.key().expect("couldn't get key").to_owned(); + let mut objects_stream = pin!(stream_objects_with_retries( + &remote_client, + ListingMode::NoDelimiter, + &tenant_root + )); + while let Some(listing) = objects_stream.next().await { + let listing = listing?; + for obj in listing.keys.iter().filter(|obj| min_size <= obj.size) { + let key = obj.key.to_string(); let kind = LargeObjectKind::from_key(&key); if ignore_deltas && kind == LargeObjectKind::DeltaLayer { continue; } objects.push(LargeObject { key, - size: obj.size.unwrap() as u64, + size: obj.size, kind, }) } - total_objects_ctr += fetch_response.contents().len() as u64; - match fetch_response.next_continuation_token { - Some(new_token) => continuation_token = Some(new_token), - None => break, - } + total_objects_ctr += listing.keys.len() as u64; } Ok((tenant_shard_id, objects, total_objects_ctr)) diff --git a/storage_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs index 78ecfc7232..73479c3658 100644 --- a/storage_scrubber/src/garbage.rs +++ b/storage_scrubber/src/garbage.rs @@ -510,7 +510,7 @@ pub async fn purge_garbage( input_path ); - let remote_client = + let (remote_client, _target) = init_remote_generic(garbage_list.bucket_config.clone(), garbage_list.node_kind).await?; assert_eq!( diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index 5c64e7e459..c7900f9b02 100644 --- a/storage_scrubber/src/lib.rs +++ b/storage_scrubber/src/lib.rs @@ -22,16 +22,18 @@ use aws_sdk_s3::Client; use camino::{Utf8Path, Utf8PathBuf}; use clap::ValueEnum; +use 
futures::{Stream, StreamExt}; use pageserver::tenant::remote_timeline_client::{remote_tenant_path, remote_timeline_path}; use pageserver::tenant::TENANTS_SEGMENT_NAME; use pageserver_api::shard::TenantShardId; use remote_storage::{ - GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config, - DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, + GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorageConfig, RemoteStorageKind, + S3Config, DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, }; use reqwest::Url; use serde::{Deserialize, Serialize}; use tokio::io::AsyncReadExt; +use tokio_util::sync::CancellationToken; use tracing::error; use tracing_appender::non_blocking::WorkerGuard; use tracing_subscriber::{fmt, prelude::*, EnvFilter}; @@ -319,27 +321,35 @@ fn default_prefix_in_bucket(node_kind: NodeKind) -> &'static str { } } +fn make_root_target( + bucket_name: String, + prefix_in_bucket: String, + node_kind: NodeKind, +) -> RootTarget { + let s3_target = S3Target { + bucket_name, + prefix_in_bucket, + delimiter: "/".to_string(), + }; + match node_kind { + NodeKind::Pageserver => RootTarget::Pageserver(s3_target), + NodeKind::Safekeeper => RootTarget::Safekeeper(s3_target), + } +} + async fn init_remote( bucket_config: BucketConfig, node_kind: NodeKind, ) -> anyhow::Result<(Arc, RootTarget)> { let bucket_region = Region::new(bucket_config.region); - let delimiter = "/".to_string(); let s3_client = Arc::new(init_s3_client(bucket_region).await); let default_prefix = default_prefix_in_bucket(node_kind).to_string(); - let s3_root = match node_kind { - NodeKind::Pageserver => RootTarget::Pageserver(S3Target { - bucket_name: bucket_config.bucket, - prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or(default_prefix), - delimiter, - }), - NodeKind::Safekeeper => RootTarget::Safekeeper(S3Target { - bucket_name: bucket_config.bucket, - prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or(default_prefix), - delimiter, - }), - }; + let s3_root = make_root_target( + bucket_config.bucket, + bucket_config.prefix_in_bucket.unwrap_or(default_prefix), + node_kind, + ); Ok((s3_client, s3_root)) } @@ -347,12 +357,12 @@ async fn init_remote( async fn init_remote_generic( bucket_config: BucketConfig, node_kind: NodeKind, -) -> anyhow::Result { +) -> anyhow::Result<(GenericRemoteStorage, RootTarget)> { let endpoint = env::var("AWS_ENDPOINT_URL").ok(); let default_prefix = default_prefix_in_bucket(node_kind).to_string(); let prefix_in_bucket = Some(bucket_config.prefix_in_bucket.unwrap_or(default_prefix)); let storage = S3Config { - bucket_name: bucket_config.bucket, + bucket_name: bucket_config.bucket.clone(), bucket_region: bucket_config.region, prefix_in_bucket, endpoint, @@ -366,7 +376,13 @@ async fn init_remote_generic( storage: RemoteStorageKind::AwsS3(storage), timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }; - GenericRemoteStorage::from_config(&storage_config).await + + // We already pass the prefix to the remote client above + let prefix_in_root_target = String::new(); + let s3_root = make_root_target(bucket_config.bucket, prefix_in_root_target, node_kind); + + let client = GenericRemoteStorage::from_config(&storage_config).await?; + Ok((client, s3_root)) } async fn list_objects_with_retries( @@ -404,6 +420,44 @@ async fn list_objects_with_retries( Err(anyhow!("unreachable unless MAX_RETRIES==0")) } +fn stream_objects_with_retries<'a>( + storage_client: &'a GenericRemoteStorage, + 
listing_mode: ListingMode, + s3_target: &'a S3Target, +) -> impl Stream> + 'a { + async_stream::stream! { + let mut trial = 0; + let cancel = CancellationToken::new(); + let prefix_str = &s3_target + .prefix_in_bucket + .strip_prefix("/") + .unwrap_or(&s3_target.prefix_in_bucket); + let prefix = RemotePath::from_string(prefix_str)?; + let mut list_stream = + storage_client.list_streaming(Some(&prefix), listing_mode, None, &cancel); + while let Some(res) = list_stream.next().await { + if let Err(err) = res { + let yield_err = if err.is_permanent() { + true + } else { + let backoff_time = 1 << trial.max(5); + tokio::time::sleep(Duration::from_secs(backoff_time)).await; + trial += 1; + trial == MAX_RETRIES - 1 + }; + if yield_err { + yield Err(err) + .with_context(|| format!("Failed to list objects {MAX_RETRIES} times")); + break; + } + } else { + trial = 0; + yield res.map_err(anyhow::Error::from); + } + } + } +} + async fn download_object_with_retries( s3_client: &Client, bucket_name: &str, diff --git a/storage_scrubber/src/metadata_stream.rs b/storage_scrubber/src/metadata_stream.rs index c05874f556..91dba3c992 100644 --- a/storage_scrubber/src/metadata_stream.rs +++ b/storage_scrubber/src/metadata_stream.rs @@ -1,12 +1,41 @@ -use anyhow::Context; +use std::str::FromStr; + +use anyhow::{anyhow, Context}; use async_stream::{stream, try_stream}; use aws_sdk_s3::{types::ObjectIdentifier, Client}; +use futures::StreamExt; +use remote_storage::{GenericRemoteStorage, ListingMode}; use tokio_stream::Stream; -use crate::{list_objects_with_retries, RootTarget, S3Target, TenantShardTimelineId}; +use crate::{ + list_objects_with_retries, stream_objects_with_retries, RootTarget, S3Target, + TenantShardTimelineId, +}; use pageserver_api::shard::TenantShardId; use utils::id::{TenantId, TimelineId}; +/// Given a remote storage and a target, output a stream of TenantIds discovered via listing prefixes +pub fn stream_tenants_generic<'a>( + remote_client: &'a GenericRemoteStorage, + target: &'a RootTarget, +) -> impl Stream> + 'a { + try_stream! 
{ + let tenants_target = target.tenants_root(); + let mut tenants_stream = + std::pin::pin!(stream_objects_with_retries(remote_client, ListingMode::WithDelimiter, &tenants_target)); + while let Some(chunk) = tenants_stream.next().await { + let chunk = chunk?; + let entry_ids = chunk.prefixes.iter() + .map(|prefix| prefix.get_path().file_name().ok_or_else(|| anyhow!("no final component in path '{prefix}'"))); + for dir_name_res in entry_ids { + let dir_name = dir_name_res?; + let id = TenantShardId::from_str(dir_name)?; + yield id; + } + } + } +} + /// Given an S3 bucket, output a stream of TenantIds discovered via ListObjectsv2 pub fn stream_tenants<'a>( s3_client: &'a Client, From f72fe686266e6c81d9890390b28be8c65e0afc73 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 30 Jul 2024 13:38:23 +0100 Subject: [PATCH 1303/1571] CI(benchmarking): make neonvm default provisioner (#8538) ## Problem We don't allow regular end-users to use `k8s-pod` provisioner, but we still use it in nightly benchmarks ## Summary of changes - Remove `provisioner` input from `neon-create-project` action, use `k8s-neonvm` as a default provioner - Change `neon-` platform prefix to `neonvm-` - Remove `neon-captest-freetier` and `neon-captest-new` as we already have their `neonvm` counterparts --- .../actions/neon-project-create/action.yml | 12 +---- .github/workflows/benchmarking.yml | 52 ++++++++----------- 2 files changed, 25 insertions(+), 39 deletions(-) diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index d4029bd37c..f4a194639f 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -14,11 +14,8 @@ inputs: api_host: description: 'Neon API host' default: console-stage.neon.build - provisioner: - description: 'k8s-pod or k8s-neonvm' - default: 'k8s-pod' compute_units: - description: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal' + description: '[Min, Max] compute units' default: '[1, 1]' outputs: @@ -37,10 +34,6 @@ runs: # A shell without `set -x` to not to expose password/dsn in logs shell: bash -euo pipefail {0} run: | - if [ "${PROVISIONER}" == "k8s-pod" ] && [ "${MIN_CU}" != "${MAX_CU}" ]; then - echo >&2 "For k8s-pod provisioner MIN_CU should be equal to MAX_CU" - fi - project=$(curl \ "https://${API_HOST}/api/v2/projects" \ --fail \ @@ -52,7 +45,7 @@ runs: \"name\": \"Created by actions/neon-project-create; GITHUB_RUN_ID=${GITHUB_RUN_ID}\", \"pg_version\": ${POSTGRES_VERSION}, \"region_id\": \"${REGION_ID}\", - \"provisioner\": \"${PROVISIONER}\", + \"provisioner\": \"k8s-neonvm\", \"autoscaling_limit_min_cu\": ${MIN_CU}, \"autoscaling_limit_max_cu\": ${MAX_CU}, \"settings\": { } @@ -75,6 +68,5 @@ runs: API_KEY: ${{ inputs.api_key }} REGION_ID: ${{ inputs.region_id }} POSTGRES_VERSION: ${{ inputs.postgres_version }} - PROVISIONER: ${{ inputs.provisioner }} MIN_CU: ${{ fromJSON(inputs.compute_units)[0] }} MAX_CU: ${{ fromJSON(inputs.compute_units)[1] }} diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 5ffdb29fe6..f7ea534fb9 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -63,11 +63,9 @@ jobs: - DEFAULT_PG_VERSION: 16 PLATFORM: "neon-staging" region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} - provisioner: 'k8s-pod' - DEFAULT_PG_VERSION: 16 PLATFORM: "azure-staging" region_id: 'azure-eastus2' - 
provisioner: 'k8s-neonvm' env: TEST_PG_BENCH_DURATIONS_MATRIX: "300" TEST_PG_BENCH_SCALES_MATRIX: "10,100" @@ -100,7 +98,6 @@ jobs: region_id: ${{ matrix.region_id }} postgres_version: ${{ env.DEFAULT_PG_VERSION }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} - provisioner: ${{ matrix.provisioner }} - name: Run benchmark uses: ./.github/actions/run-python-test-set @@ -216,11 +213,11 @@ jobs: # Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday) # # Available platforms: - # - neon-captest-new: Freshly created project (1 CU) - # - neon-captest-freetier: Use freetier-sized compute (0.25 CU) + # - neonvm-captest-new: Freshly created project (1 CU) + # - neonvm-captest-freetier: Use freetier-sized compute (0.25 CU) # - neonvm-captest-azure-new: Freshly created project (1 CU) in azure region # - neonvm-captest-azure-freetier: Use freetier-sized compute (0.25 CU) in azure region - # - neon-captest-reuse: Reusing existing project + # - neonvm-captest-reuse: Reusing existing project # - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs # - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage env: @@ -245,18 +242,16 @@ jobs: "'"$region_id_default"'" ], "platform": [ - "neon-captest-new", - "neon-captest-reuse", + "neonvm-captest-new", + "neonvm-captest-reuse", "neonvm-captest-new" ], "db_size": [ "10gb" ], - "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-freetier", "db_size": "3gb" }, - { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-new", "db_size": "50gb" }, - { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" }, + "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" }, { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb" }, - { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" }, - { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb" }, - { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb" }, { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }] }' @@ -271,7 +266,7 @@ jobs: run: | matrix='{ "platform": [ - "neon-captest-reuse" + "neonvm-captest-reuse" ] }' @@ -287,7 +282,7 @@ jobs: run: | matrix='{ "platform": [ - "neon-captest-reuse" + "neonvm-captest-reuse" ], "scale": [ "10" @@ -338,7 +333,7 @@ jobs: prefix: latest - name: Create Neon Project - if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform) + if: contains(fromJson('["neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform) id: create-neon-project uses: 
./.github/actions/neon-project-create with: @@ -346,19 +341,18 @@ jobs: postgres_version: ${{ env.DEFAULT_PG_VERSION }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} compute_units: ${{ (contains(matrix.platform, 'captest-freetier') && '[0.25, 0.25]') || '[1, 1]' }} - provisioner: ${{ (contains(matrix.platform, 'neonvm-') && 'k8s-neonvm') || 'k8s-pod' }} - name: Set up Connection String id: set-up-connstr run: | case "${PLATFORM}" in - neon-captest-reuse) + neonvm-captest-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} ;; neonvm-captest-sharding-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }} ;; - neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier) + neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier) CONNSTR=${{ steps.create-neon-project.outputs.dsn }} ;; rds-aurora) @@ -442,9 +436,9 @@ jobs: fail-fast: false matrix: include: - - PLATFORM: "neon-captest-pgvector" + - PLATFORM: "neonvm-captest-pgvector" - PLATFORM: "azure-captest-pgvector" - + env: TEST_PG_BENCH_DURATIONS_MATRIX: "15m" TEST_PG_BENCH_SCALES_MATRIX: "1" @@ -486,7 +480,7 @@ jobs: id: set-up-connstr run: | case "${PLATFORM}" in - neon-captest-pgvector) + neonvm-captest-pgvector) CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }} ;; azure-captest-pgvector) @@ -585,7 +579,7 @@ jobs: id: set-up-connstr run: | case "${PLATFORM}" in - neon-captest-reuse) + neonvm-captest-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }} ;; rds-aurora) @@ -595,7 +589,7 @@ jobs: CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CLICKBENCH_10M_CONNSTR }} ;; *) - echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'" + echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'" exit 1 ;; esac @@ -672,7 +666,7 @@ jobs: - name: Get Connstring Secret Name run: | case "${PLATFORM}" in - neon-captest-reuse) + neonvm-captest-reuse) ENV_PLATFORM=CAPTEST_TPCH ;; rds-aurora) @@ -682,7 +676,7 @@ jobs: ENV_PLATFORM=RDS_AURORA_TPCH ;; *) - echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'" + echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'" exit 1 ;; esac @@ -759,7 +753,7 @@ jobs: id: set-up-connstr run: | case "${PLATFORM}" in - neon-captest-reuse) + neonvm-captest-reuse) CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }} ;; rds-aurora) @@ -769,7 +763,7 @@ jobs: CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_POSTGRES_CONNSTR }} ;; *) - echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'" + echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'" exit 1 ;; esac From 9ceaf9a98665a99115a2ae1720c204957a276b6d Mon Sep 17 00:00:00 2001 From: Anton Chaporgin Date: Tue, 30 Jul 2024 16:15:53 +0300 Subject: [PATCH 1304/1571] [neon/acr] impr: push to ACR while building images (#8545) This tests the ability to push into ACR using OIDC. Proved it worked by running slightly modified YAML. In `promote-images` we push the following images `neon compute-tools {vm-,}compute-node-{v14,v15,v16}` into `neoneastus2`. 
https://github.com/neondatabase/cloud/issues/14640 --- .github/workflows/build_and_test.yml | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 872c1fbb39..3cf40e6153 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -833,6 +833,9 @@ jobs: rm -rf .docker-custom promote-images: + permissions: + contents: read # This is required for actions/checkout + id-token: write # This is required for Azure Login to work. needs: [ check-permissions, tag, test-images, vm-compute-node-image ] runs-on: ubuntu-22.04 @@ -859,6 +862,28 @@ jobs: neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} done + - name: Azure login + if: github.ref_name == 'main' + uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1 + with: + client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }} + tenant-id: ${{ secrets.AZURE_TENANT_ID }} + subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + + - name: Login to ACR + if: github.ref_name == 'main' + run: | + az acr login --name=neoneastus2 + + - name: Copy docker images to ACR-dev + if: github.ref_name == 'main' + run: | + for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16}; do + docker buildx imagetools create \ + -t neoneastus2.azurecr.io/neondatabase/${image}:${{ needs.tag.outputs.build-tag }} \ + neondatabase/${image}:${{ needs.tag.outputs.build-tag }} + done + - name: Add latest tag to images if: github.ref_name == 'main' run: | From e374d6778ed4da3e7437975b77ef69fdfea80470 Mon Sep 17 00:00:00 2001 From: Yuchen Liang <70461588+yliang412@users.noreply.github.com> Date: Tue, 30 Jul 2024 09:32:00 -0400 Subject: [PATCH 1305/1571] feat(storcon): store scrubber metadata scan result (#8480) Part of #8128, followed by #8502. ## Problem Currently we lack mechanism to alert unhealthy `scan_metadata` status if we start running this scrubber command as part of a cronjob. With the storage controller client introduced to storage scrubber in #8196, it is viable to set up alert by storing health status in the storage controller database. We intentionally do not store the full output to the database as the json blobs potentially makes the table really huge. Instead, only a health status and a timestamp recording the last time metadata health status is posted on a tenant shard. 
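To make the intended flow concrete, here is a hedged sketch (assuming `serde`/`serde_json`) of the payload shape the scrubber could POST to the new `/control/v1/metadata_health/update` endpoint, authorized with the scrubber scope. Field names follow the request type added below, but the tenant shard ids are made up and shown as plain strings rather than real `TenantShardId` values.

```rust
use serde::Serialize;

#[derive(Serialize)]
struct MetadataHealthUpdateRequest {
    healthy_tenant_shards: Vec<String>,
    unhealthy_tenant_shards: Vec<String>,
}

fn main() {
    // Made-up shard ids, purely for illustration.
    let body = MetadataHealthUpdateRequest {
        healthy_tenant_shards: vec!["1f359dd625e519a1a4e8d7509690f6fc-0002".to_string()],
        unhealthy_tenant_shards: vec!["1f359dd625e519a1a4e8d7509690f6fc-0102".to_string()],
    };
    // The storage controller upserts one `metadata_health` row per shard,
    // setting `healthy` accordingly and `last_scrubbed_at` to the current time.
    println!("{}", serde_json::to_string_pretty(&body).unwrap());
}
```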
Signed-off-by: Yuchen Liang --- Cargo.lock | 2 + libs/pageserver_api/src/controller_api.rs | 38 +++- libs/utils/src/auth.rs | 16 +- storage_controller/Cargo.toml | 9 +- .../down.sql | 1 + .../up.sql | 14 ++ storage_controller/src/http.rs | 73 ++++++- storage_controller/src/persistence.rs | 180 +++++++++++++++++- storage_controller/src/schema.rs | 12 +- storage_controller/src/service.rs | 74 ++++++- test_runner/fixtures/neon_fixtures.py | 46 +++++ .../regress/test_storage_controller.py | 122 +++++++++++- 12 files changed, 560 insertions(+), 27 deletions(-) create mode 100644 storage_controller/migrations/2024-07-23-191537_create_metadata_health/down.sql create mode 100644 storage_controller/migrations/2024-07-23-191537_create_metadata_health/up.sql diff --git a/Cargo.lock b/Cargo.lock index 2b56095bc8..2186d55e9c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1672,6 +1672,7 @@ checksum = "62d6dcd069e7b5fe49a302411f759d4cf1cf2c27fe798ef46fb8baefc053dd2b" dependencies = [ "bitflags 2.4.1", "byteorder", + "chrono", "diesel_derives", "itoa", "pq-sys", @@ -5718,6 +5719,7 @@ dependencies = [ "aws-config", "bytes", "camino", + "chrono", "clap", "control_plane", "diesel", diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 474f796040..36b1bd95ff 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -1,5 +1,5 @@ use std::str::FromStr; -use std::time::Instant; +use std::time::{Duration, Instant}; /// Request/response types for the storage controller /// API (`/control/v1` prefix). Implemented by the server @@ -294,6 +294,42 @@ pub enum PlacementPolicy { #[derive(Serialize, Deserialize, Debug)] pub struct TenantShardMigrateResponse {} +/// Metadata health record posted from scrubber. +#[derive(Serialize, Deserialize, Debug)] +pub struct MetadataHealthRecord { + pub tenant_shard_id: TenantShardId, + pub healthy: bool, + pub last_scrubbed_at: chrono::DateTime, +} + +#[derive(Serialize, Deserialize, Debug)] +pub struct MetadataHealthUpdateRequest { + pub healthy_tenant_shards: Vec, + pub unhealthy_tenant_shards: Vec, +} + +#[derive(Serialize, Deserialize, Debug)] +pub struct MetadataHealthUpdateResponse {} + +#[derive(Serialize, Deserialize, Debug)] + +pub struct MetadataHealthListUnhealthyResponse { + pub unhealthy_tenant_shards: Vec, +} + +#[derive(Serialize, Deserialize, Debug)] + +pub struct MetadataHealthListOutdatedRequest { + #[serde(with = "humantime_serde")] + pub not_scrubbed_for: Duration, +} + +#[derive(Serialize, Deserialize, Debug)] + +pub struct MetadataHealthListOutdatedResponse { + pub health_records: Vec, +} + #[cfg(test)] mod test { use super::*; diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index a1170a460d..7b735875b7 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -18,20 +18,20 @@ const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA; #[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq)] #[serde(rename_all = "lowercase")] pub enum Scope { - // Provides access to all data for a specific tenant (specified in `struct Claims` below) + /// Provides access to all data for a specific tenant (specified in `struct Claims` below) // TODO: join these two? Tenant, - // Provides blanket access to all tenants on the pageserver plus pageserver-wide APIs. - // Should only be used e.g. for status check/tenant creation/list. + /// Provides blanket access to all tenants on the pageserver plus pageserver-wide APIs. + /// Should only be used e.g. 
for status check/tenant creation/list. PageServerApi, - // Provides blanket access to all data on the safekeeper plus safekeeper-wide APIs. - // Should only be used e.g. for status check. - // Currently also used for connection from any pageserver to any safekeeper. + /// Provides blanket access to all data on the safekeeper plus safekeeper-wide APIs. + /// Should only be used e.g. for status check. + /// Currently also used for connection from any pageserver to any safekeeper. SafekeeperData, - // The scope used by pageservers in upcalls to storage controller and cloud control plane + /// The scope used by pageservers in upcalls to storage controller and cloud control plane #[serde(rename = "generations_api")] GenerationsApi, - // Allows access to control plane managment API and some storage controller endpoints. + /// Allows access to control plane managment API and some storage controller endpoints. Admin, /// Allows access to storage controller APIs used by the scrubber, to interrogate the state diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index b54dea5d47..d14b235046 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -18,6 +18,7 @@ anyhow.workspace = true aws-config.workspace = true bytes.workspace = true camino.workspace = true +chrono.workspace = true clap.workspace = true fail.workspace = true futures.workspace = true @@ -44,7 +45,12 @@ scopeguard.workspace = true strum.workspace = true strum_macros.workspace = true -diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] } +diesel = { version = "2.1.4", features = [ + "serde_json", + "postgres", + "r2d2", + "chrono", +] } diesel_migrations = { version = "2.1.0" } r2d2 = { version = "0.8.10" } @@ -52,4 +58,3 @@ utils = { path = "../libs/utils/" } metrics = { path = "../libs/metrics/" } control_plane = { path = "../control_plane" } workspace_hack = { version = "0.1", path = "../workspace_hack" } - diff --git a/storage_controller/migrations/2024-07-23-191537_create_metadata_health/down.sql b/storage_controller/migrations/2024-07-23-191537_create_metadata_health/down.sql new file mode 100644 index 0000000000..1ecfc8786f --- /dev/null +++ b/storage_controller/migrations/2024-07-23-191537_create_metadata_health/down.sql @@ -0,0 +1 @@ +DROP TABLE metadata_health; \ No newline at end of file diff --git a/storage_controller/migrations/2024-07-23-191537_create_metadata_health/up.sql b/storage_controller/migrations/2024-07-23-191537_create_metadata_health/up.sql new file mode 100644 index 0000000000..fa87eda119 --- /dev/null +++ b/storage_controller/migrations/2024-07-23-191537_create_metadata_health/up.sql @@ -0,0 +1,14 @@ +CREATE TABLE metadata_health ( + tenant_id VARCHAR NOT NULL, + shard_number INTEGER NOT NULL, + shard_count INTEGER NOT NULL, + PRIMARY KEY(tenant_id, shard_number, shard_count), + -- Rely on cascade behavior for delete + FOREIGN KEY(tenant_id, shard_number, shard_count) REFERENCES tenant_shards ON DELETE CASCADE, + healthy BOOLEAN NOT NULL DEFAULT TRUE, + last_scrubbed_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + + +INSERT INTO metadata_health(tenant_id, shard_number, shard_count) +SELECT tenant_id, shard_number, shard_count FROM tenant_shards; diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index c77918827f..e8513b31eb 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -10,7 +10,11 @@ use hyper::header::CONTENT_TYPE; use hyper::{Body, Request, Response}; use hyper::{StatusCode, 
Uri}; use metrics::{BuildInfo, NeonMetrics}; -use pageserver_api::controller_api::TenantCreateRequest; +use pageserver_api::controller_api::{ + MetadataHealthListOutdatedRequest, MetadataHealthListOutdatedResponse, + MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse, + TenantCreateRequest, +}; use pageserver_api::models::{ TenantConfigRequest, TenantLocationConfigRequest, TenantShardSplitRequest, TenantTimeTravelRequest, TimelineCreateRequest, @@ -560,6 +564,51 @@ async fn handle_cancel_node_fill(req: Request) -> Result, A json_response(StatusCode::ACCEPTED, ()) } +async fn handle_metadata_health_update(mut req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Scrubber)?; + + let update_req = json_request::(&mut req).await?; + let state = get_state(&req); + + state.service.metadata_health_update(update_req).await?; + + json_response(StatusCode::OK, MetadataHealthUpdateResponse {}) +} + +async fn handle_metadata_health_list_unhealthy( + req: Request, +) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + let unhealthy_tenant_shards = state.service.metadata_health_list_unhealthy().await?; + + json_response( + StatusCode::OK, + MetadataHealthListUnhealthyResponse { + unhealthy_tenant_shards, + }, + ) +} + +async fn handle_metadata_health_list_outdated( + mut req: Request, +) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let list_outdated_req = json_request::(&mut req).await?; + let state = get_state(&req); + let health_records = state + .service + .metadata_health_list_outdated(list_outdated_req.not_scrubbed_for) + .await?; + + json_response( + StatusCode::OK, + MetadataHealthListOutdatedResponse { health_records }, + ) +} + async fn handle_tenant_shard_split( service: Arc, mut req: Request, @@ -987,6 +1036,28 @@ pub fn make_router( RequestName("control_v1_cancel_node_fill"), ) }) + // Metadata health operations + .post("/control/v1/metadata_health/update", |r| { + named_request_span( + r, + handle_metadata_health_update, + RequestName("control_v1_metadata_health_update"), + ) + }) + .get("/control/v1/metadata_health/unhealthy", |r| { + named_request_span( + r, + handle_metadata_health_list_unhealthy, + RequestName("control_v1_metadata_health_list_unhealthy"), + ) + }) + .post("/control/v1/metadata_health/outdated", |r| { + named_request_span( + r, + handle_metadata_health_list_outdated, + RequestName("control_v1_metadata_health_list_outdated"), + ) + }) // TODO(vlad): endpoint for cancelling drain and fill // Tenant Shard operations .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| { diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index d8f31e86e5..64a3e597ce 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -8,6 +8,7 @@ use self::split_state::SplitState; use diesel::pg::PgConnection; use diesel::prelude::*; use diesel::Connection; +use pageserver_api::controller_api::MetadataHealthRecord; use pageserver_api::controller_api::ShardSchedulingPolicy; use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy}; use pageserver_api::models::TenantConfig; @@ -90,6 +91,10 @@ pub(crate) enum DatabaseOperation { UpdateTenantShard, DeleteTenant, UpdateTenantConfig, + UpdateMetadataHealth, + ListMetadataHealth, + ListMetadataHealthUnhealthy, + ListMetadataHealthOutdated, } #[must_use] @@ -307,15 +312,32 @@ impl Persistence { &self, shards: Vec, ) -> 
DatabaseResult<()> { - use crate::schema::tenant_shards::dsl::*; + use crate::schema::metadata_health; + use crate::schema::tenant_shards; + + let now = chrono::Utc::now(); + + let metadata_health_records = shards + .iter() + .map(|t| MetadataHealthPersistence { + tenant_id: t.tenant_id.clone(), + shard_number: t.shard_number, + shard_count: t.shard_count, + healthy: true, + last_scrubbed_at: now, + }) + .collect::>(); + self.with_measured_conn( DatabaseOperation::InsertTenantShards, move |conn| -> DatabaseResult<()> { - for tenant in &shards { - diesel::insert_into(tenant_shards) - .values(tenant) - .execute(conn)?; - } + diesel::insert_into(tenant_shards::table) + .values(&shards) + .execute(conn)?; + + diesel::insert_into(metadata_health::table) + .values(&metadata_health_records) + .execute(conn)?; Ok(()) }, ) @@ -329,10 +351,10 @@ impl Persistence { self.with_measured_conn( DatabaseOperation::DeleteTenant, move |conn| -> DatabaseResult<()> { + // `metadata_health` status (if exists) is also deleted based on the cascade behavior. diesel::delete(tenant_shards) .filter(tenant_id.eq(del_tenant_id.to_string())) .execute(conn)?; - Ok(()) }, ) @@ -675,6 +697,94 @@ impl Persistence { ) .await } + + /// Stores all the latest metadata health updates durably. Updates existing entry on conflict. + /// + /// **Correctness:** `metadata_health_updates` should all belong the tenant shards managed by the storage controller. + #[allow(dead_code)] + pub(crate) async fn update_metadata_health_records( + &self, + healthy_records: Vec, + unhealthy_records: Vec, + now: chrono::DateTime, + ) -> DatabaseResult<()> { + use crate::schema::metadata_health::dsl::*; + + self.with_measured_conn( + DatabaseOperation::UpdateMetadataHealth, + move |conn| -> DatabaseResult<_> { + diesel::insert_into(metadata_health) + .values(&healthy_records) + .on_conflict((tenant_id, shard_number, shard_count)) + .do_update() + .set((healthy.eq(true), last_scrubbed_at.eq(now))) + .execute(conn)?; + + diesel::insert_into(metadata_health) + .values(&unhealthy_records) + .on_conflict((tenant_id, shard_number, shard_count)) + .do_update() + .set((healthy.eq(false), last_scrubbed_at.eq(now))) + .execute(conn)?; + Ok(()) + }, + ) + .await + } + + /// Lists all the metadata health records. + #[allow(dead_code)] + pub(crate) async fn list_metadata_health_records( + &self, + ) -> DatabaseResult> { + self.with_measured_conn( + DatabaseOperation::ListMetadataHealth, + move |conn| -> DatabaseResult<_> { + Ok( + crate::schema::metadata_health::table + .load::(conn)?, + ) + }, + ) + .await + } + + /// Lists all the metadata health records that is unhealthy. + #[allow(dead_code)] + pub(crate) async fn list_unhealthy_metadata_health_records( + &self, + ) -> DatabaseResult> { + use crate::schema::metadata_health::dsl::*; + self.with_measured_conn( + DatabaseOperation::ListMetadataHealthUnhealthy, + move |conn| -> DatabaseResult<_> { + Ok(crate::schema::metadata_health::table + .filter(healthy.eq(false)) + .load::(conn)?) + }, + ) + .await + } + + /// Lists all the metadata health records that have not been updated since an `earlier` time. 
+ #[allow(dead_code)] + pub(crate) async fn list_outdated_metadata_health_records( + &self, + earlier: chrono::DateTime, + ) -> DatabaseResult> { + use crate::schema::metadata_health::dsl::*; + + self.with_measured_conn( + DatabaseOperation::ListMetadataHealthOutdated, + move |conn| -> DatabaseResult<_> { + let query = metadata_health.filter(last_scrubbed_at.lt(earlier)); + let res = query.load::(conn)?; + + Ok(res) + }, + ) + .await + } } /// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably @@ -744,3 +854,59 @@ pub(crate) struct NodePersistence { pub(crate) listen_pg_addr: String, pub(crate) listen_pg_port: i32, } + +/// Tenant metadata health status that are stored durably. +#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)] +#[diesel(table_name = crate::schema::metadata_health)] +pub(crate) struct MetadataHealthPersistence { + #[serde(default)] + pub(crate) tenant_id: String, + #[serde(default)] + pub(crate) shard_number: i32, + #[serde(default)] + pub(crate) shard_count: i32, + + pub(crate) healthy: bool, + pub(crate) last_scrubbed_at: chrono::DateTime, +} + +impl MetadataHealthPersistence { + pub fn new( + tenant_shard_id: TenantShardId, + healthy: bool, + last_scrubbed_at: chrono::DateTime, + ) -> Self { + let tenant_id = tenant_shard_id.tenant_id.to_string(); + let shard_number = tenant_shard_id.shard_number.0 as i32; + let shard_count = tenant_shard_id.shard_count.literal() as i32; + + MetadataHealthPersistence { + tenant_id, + shard_number, + shard_count, + healthy, + last_scrubbed_at, + } + } + + #[allow(dead_code)] + pub(crate) fn get_tenant_shard_id(&self) -> Result { + Ok(TenantShardId { + tenant_id: TenantId::from_str(self.tenant_id.as_str())?, + shard_number: ShardNumber(self.shard_number as u8), + shard_count: ShardCount::new(self.shard_count as u8), + }) + } +} + +impl From for MetadataHealthRecord { + fn from(value: MetadataHealthPersistence) -> Self { + MetadataHealthRecord { + tenant_shard_id: value + .get_tenant_shard_id() + .expect("stored tenant id should be valid"), + healthy: value.healthy, + last_scrubbed_at: value.last_scrubbed_at, + } + } +} diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs index ff37d0fe77..cb5ba3f38b 100644 --- a/storage_controller/src/schema.rs +++ b/storage_controller/src/schema.rs @@ -1,5 +1,15 @@ // @generated automatically by Diesel CLI. +diesel::table! { + metadata_health (tenant_id, shard_number, shard_count) { + tenant_id -> Varchar, + shard_number -> Int4, + shard_count -> Int4, + healthy -> Bool, + last_scrubbed_at -> Timestamptz, + } +} + diesel::table! { nodes (node_id) { node_id -> Int8, @@ -26,4 +36,4 @@ diesel::table! 
{ } } -diesel::allow_tables_to_appear_in_same_query!(nodes, tenant_shards,); +diesel::allow_tables_to_appear_in_same_query!(metadata_health, nodes, tenant_shards,); diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 821f45d0c0..ea515f67da 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -16,7 +16,7 @@ use crate::{ compute_hook::NotifyError, id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard}, metrics::LeadershipStatusGroup, - persistence::{AbortShardSplitStatus, TenantFilter}, + persistence::{AbortShardSplitStatus, MetadataHealthPersistence, TenantFilter}, reconciler::{ReconcileError, ReconcileUnits}, scheduler::{MaySchedule, ScheduleContext, ScheduleMode}, tenant_shard::{ @@ -33,11 +33,11 @@ use futures::{stream::FuturesUnordered, StreamExt}; use itertools::Itertools; use pageserver_api::{ controller_api::{ - NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy, - ShardSchedulingPolicy, TenantCreateRequest, TenantCreateResponse, - TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard, - TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest, - TenantShardMigrateResponse, UtilizationScore, + MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, NodeRegisterRequest, + NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, TenantCreateRequest, + TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse, + TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest, + TenantShardMigrateRequest, TenantShardMigrateResponse, UtilizationScore, }, models::{SecondaryProgress, TenantConfigRequest, TopTenantShardsRequest}, }; @@ -6095,6 +6095,68 @@ impl Service { Ok(()) } + /// Updates scrubber metadata health check results. + pub(crate) async fn metadata_health_update( + &self, + update_req: MetadataHealthUpdateRequest, + ) -> Result<(), ApiError> { + let now = chrono::offset::Utc::now(); + let (healthy_records, unhealthy_records) = { + let locked = self.inner.read().unwrap(); + let healthy_records = update_req + .healthy_tenant_shards + .into_iter() + // Retain only health records associated with tenant shards managed by storage controller. + .filter(|tenant_shard_id| locked.tenants.contains_key(tenant_shard_id)) + .map(|tenant_shard_id| MetadataHealthPersistence::new(tenant_shard_id, true, now)) + .collect(); + let unhealthy_records = update_req + .unhealthy_tenant_shards + .into_iter() + .filter(|tenant_shard_id| locked.tenants.contains_key(tenant_shard_id)) + .map(|tenant_shard_id| MetadataHealthPersistence::new(tenant_shard_id, false, now)) + .collect(); + + (healthy_records, unhealthy_records) + }; + + self.persistence + .update_metadata_health_records(healthy_records, unhealthy_records, now) + .await?; + Ok(()) + } + + /// Lists the tenant shards that has unhealthy metadata status. + pub(crate) async fn metadata_health_list_unhealthy( + &self, + ) -> Result, ApiError> { + let result = self + .persistence + .list_unhealthy_metadata_health_records() + .await? + .iter() + .map(|p| p.get_tenant_shard_id().unwrap()) + .collect(); + + Ok(result) + } + + /// Lists the tenant shards that have not been scrubbed for some duration. 
+ pub(crate) async fn metadata_health_list_outdated( + &self, + not_scrubbed_for: Duration, + ) -> Result, ApiError> { + let earlier = chrono::offset::Utc::now() - not_scrubbed_for; + let result = self + .persistence + .list_outdated_metadata_health_records(earlier) + .await? + .into_iter() + .map(|record| record.into()) + .collect(); + Ok(result) + } + pub(crate) fn get_leadership_status(&self) -> LeadershipStatus { self.inner.read().unwrap().get_leadership_status() } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index c5fffc2af6..5b2ebea794 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -449,6 +449,7 @@ class TokenScope(str, Enum): GENERATIONS_API = "generations_api" SAFEKEEPER_DATA = "safekeeperdata" TENANT = "tenant" + SCRUBBER = "scrubber" class NeonEnvBuilder: @@ -2586,6 +2587,51 @@ class NeonStorageController(MetricsGetter, LogUtils): time.sleep(backoff) + def metadata_health_update(self, healthy: List[TenantShardId], unhealthy: List[TenantShardId]): + body: Dict[str, Any] = { + "healthy_tenant_shards": [str(t) for t in healthy], + "unhealthy_tenant_shards": [str(t) for t in unhealthy], + } + + self.request( + "POST", + f"{self.env.storage_controller_api}/control/v1/metadata_health/update", + json=body, + headers=self.headers(TokenScope.SCRUBBER), + ) + + def metadata_health_list_unhealthy(self): + response = self.request( + "GET", + f"{self.env.storage_controller_api}/control/v1/metadata_health/unhealthy", + headers=self.headers(TokenScope.ADMIN), + ) + return response.json() + + def metadata_health_list_outdated(self, duration: str): + body: Dict[str, Any] = {"not_scrubbed_for": duration} + + response = self.request( + "POST", + f"{self.env.storage_controller_api}/control/v1/metadata_health/outdated", + json=body, + headers=self.headers(TokenScope.ADMIN), + ) + return response.json() + + def metadata_health_is_healthy(self, outdated_duration: str = "1h") -> bool: + """Metadata is healthy if there is no unhealthy or outdated health records.""" + + unhealthy = self.metadata_health_list_unhealthy() + outdated = self.metadata_health_list_outdated(outdated_duration) + + healthy = ( + len(unhealthy["unhealthy_tenant_shards"]) == 0 and len(outdated["health_records"]) == 0 + ) + if not healthy: + log.info(f"{unhealthy=}, {outdated=}") + return healthy + def step_down(self): log.info("Asking storage controller to step down") response = self.request( diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index da638ac233..eb2cdccdb9 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -3,7 +3,7 @@ import threading import time from collections import defaultdict from datetime import datetime, timezone -from typing import Any, Dict, List, Union +from typing import Any, Dict, List, Optional, Set, Tuple, Union import pytest from fixtures.common_types import TenantId, TenantShardId, TimelineId @@ -1785,6 +1785,126 @@ def test_storage_controller_node_deletion( env.storage_controller.consistency_check() +@pytest.mark.parametrize("shard_count", [None, 2]) +def test_storage_controller_metadata_health( + neon_env_builder: NeonEnvBuilder, + shard_count: Optional[int], +): + """ + Create three tenants A, B, C. + + Phase 1: + - A: Post healthy status. + - B: Post unhealthy status. + - C: No updates. + + Phase 2: + - B: Post healthy status. + - C: Post healthy status. 
+ + Phase 3: + - A: Post unhealthy status. + + Phase 4: + - Delete tenant A, metadata health status should be deleted as well. + """ + + def update_and_query_metadata_health( + env: NeonEnv, + healthy: List[TenantShardId], + unhealthy: List[TenantShardId], + outdated_duration: str = "1h", + ) -> Tuple[Set[str], Set[str]]: + """ + Update metadata health. Then list tenant shards with unhealthy and + outdated metadata health status. + """ + if healthy or unhealthy: + env.storage_controller.metadata_health_update(healthy, unhealthy) + result = env.storage_controller.metadata_health_list_unhealthy() + unhealthy_res = set(result["unhealthy_tenant_shards"]) + result = env.storage_controller.metadata_health_list_outdated(outdated_duration) + outdated_res = set(record["tenant_shard_id"] for record in result["health_records"]) + + return unhealthy_res, outdated_res + + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_start() + + # Mock tenant (`initial_tenant``) with healthy scrubber scan result + tenant_a_shard_ids = ( + env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=shard_count) + if shard_count is not None + else [TenantShardId(env.initial_tenant, 0, 0)] + ) + + # Mock tenant with unhealthy scrubber scan result + tenant_b, _ = env.neon_cli.create_tenant(shard_count=shard_count) + tenant_b_shard_ids = ( + env.storage_controller.tenant_shard_split(tenant_b, shard_count=shard_count) + if shard_count is not None + else [TenantShardId(tenant_b, 0, 0)] + ) + + # Mock tenant that never gets a health update from scrubber + tenant_c, _ = env.neon_cli.create_tenant(shard_count=shard_count) + + tenant_c_shard_ids = ( + env.storage_controller.tenant_shard_split(tenant_c, shard_count=shard_count) + if shard_count is not None + else [TenantShardId(tenant_c, 0, 0)] + ) + + # Metadata health table also updated as tenant shards are created. + assert env.storage_controller.metadata_health_is_healthy() + + # post "fake" updates to storage controller db + + unhealthy, outdated = update_and_query_metadata_health( + env, healthy=tenant_a_shard_ids, unhealthy=tenant_b_shard_ids + ) + + log.info(f"After Phase 1: {unhealthy=}, {outdated=}") + assert len(unhealthy) == len(tenant_b_shard_ids) + for t in tenant_b_shard_ids: + assert str(t) in unhealthy + assert len(outdated) == 0 + + unhealthy, outdated = update_and_query_metadata_health( + env, healthy=tenant_b_shard_ids + tenant_c_shard_ids, unhealthy=[] + ) + + log.info(f"After Phase 2: {unhealthy=}, {outdated=}") + assert len(unhealthy) == 0 + assert len(outdated) == 0 + + unhealthy, outdated = update_and_query_metadata_health( + env, healthy=[], unhealthy=tenant_a_shard_ids + ) + + log.info(f"After Phase 3: {unhealthy=}, {outdated=}") + assert len(unhealthy) == len(tenant_a_shard_ids) + for t in tenant_a_shard_ids: + assert str(t) in unhealthy + assert len(outdated) == 0 + + # Phase 4: Delete A + env.storage_controller.pageserver_api().tenant_delete(env.initial_tenant) + + # A's unhealthy metadata health status should be deleted as well. + assert env.storage_controller.metadata_health_is_healthy() + + # All shards from B and C are not fresh if set outdated duration to 0 seconds. 
+ unhealthy, outdated = update_and_query_metadata_health( + env, healthy=[], unhealthy=tenant_a_shard_ids, outdated_duration="0s" + ) + assert len(unhealthy) == 0 + for t in tenant_b_shard_ids + tenant_c_shard_ids: + assert str(t) in outdated + + def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder): """ Test the `/control/v1/step_down` storage controller API. Upon receiving such From 85bef9f05d68b1c91436e6de9ef5c6cc05e8a6f0 Mon Sep 17 00:00:00 2001 From: Yuchen Liang <70461588+yliang412@users.noreply.github.com> Date: Tue, 30 Jul 2024 11:07:34 -0400 Subject: [PATCH 1306/1571] feat(scrubber): post `scan_metadata` results to storage controller (#8502) Part of #8128, followup to #8480. closes #8421. Enable scrubber to optionally post metadata scan health results to storage controller. Signed-off-by: Yuchen Liang --- libs/pageserver_api/src/controller_api.rs | 5 ++- storage_scrubber/src/checks.rs | 5 +++ storage_scrubber/src/lib.rs | 7 ++++ storage_scrubber/src/main.rs | 40 ++++++++++++++----- .../src/pageserver_physical_gc.rs | 8 +--- .../src/scan_pageserver_metadata.rs | 32 ++++++++++----- test_runner/fixtures/neon_fixtures.py | 9 +++-- test_runner/regress/test_storage_scrubber.py | 16 +++++++- 8 files changed, 88 insertions(+), 34 deletions(-) diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 36b1bd95ff..a5b452da83 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -1,3 +1,4 @@ +use std::collections::HashSet; use std::str::FromStr; use std::time::{Duration, Instant}; @@ -304,8 +305,8 @@ pub struct MetadataHealthRecord { #[derive(Serialize, Deserialize, Debug)] pub struct MetadataHealthUpdateRequest { - pub healthy_tenant_shards: Vec, - pub unhealthy_tenant_shards: Vec, + pub healthy_tenant_shards: HashSet, + pub unhealthy_tenant_shards: HashSet, } #[derive(Serialize, Deserialize, Debug)] diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index a35a58aedd..5aa9e88c40 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -40,6 +40,11 @@ impl TimelineAnalysis { garbage_keys: Vec::new(), } } + + /// Whether a timeline is healthy. 
+ pub(crate) fn is_healthy(&self) -> bool { + self.errors.is_empty() && self.warnings.is_empty() + } } pub(crate) async fn branch_cleanup_and_check_errors( diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index c7900f9b02..e0f154def3 100644 --- a/storage_scrubber/src/lib.rs +++ b/storage_scrubber/src/lib.rs @@ -32,6 +32,7 @@ use remote_storage::{ }; use reqwest::Url; use serde::{Deserialize, Serialize}; +use storage_controller_client::control_api; use tokio::io::AsyncReadExt; use tokio_util::sync::CancellationToken; use tracing::error; @@ -255,6 +256,12 @@ pub struct ControllerClientConfig { pub controller_jwt: String, } +impl ControllerClientConfig { + pub fn build_client(self) -> control_api::Client { + control_api::Client::new(self.controller_api, Some(self.controller_jwt)) + } +} + pub struct ConsoleConfig { pub token: String, pub base_url: Url, diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs index 346829b7c9..4c804c00c1 100644 --- a/storage_scrubber/src/main.rs +++ b/storage_scrubber/src/main.rs @@ -1,7 +1,8 @@ use anyhow::{anyhow, bail}; use camino::Utf8PathBuf; +use pageserver_api::controller_api::{MetadataHealthUpdateRequest, MetadataHealthUpdateResponse}; use pageserver_api::shard::TenantShardId; -use reqwest::Url; +use reqwest::{Method, Url}; use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; use storage_scrubber::pageserver_physical_gc::GcMode; use storage_scrubber::scan_pageserver_metadata::scan_metadata; @@ -61,6 +62,8 @@ enum Command { json: bool, #[arg(long = "tenant-id", num_args = 0..)] tenant_ids: Vec, + #[arg(long = "post", default_value_t = false)] + post_to_storage_controller: bool, #[arg(long, default_value = None)] /// For safekeeper node_kind only, points to db with debug dump dump_db_connstr: Option, @@ -116,11 +119,20 @@ async fn main() -> anyhow::Result<()> { chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S") )); + let controller_client_conf = cli.controller_api.map(|controller_api| { + ControllerClientConfig { + controller_api, + // Default to no key: this is a convenience when working in a development environment + controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()), + } + }); + match cli.command { Command::ScanMetadata { json, tenant_ids, node_kind, + post_to_storage_controller, dump_db_connstr, dump_db_table, } => { @@ -159,6 +171,9 @@ async fn main() -> anyhow::Result<()> { } Ok(()) } else { + if controller_client_conf.is_none() && post_to_storage_controller { + return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run")); + } match scan_metadata(bucket_config.clone(), tenant_ids).await { Err(e) => { tracing::error!("Failed: {e}"); @@ -170,6 +185,21 @@ async fn main() -> anyhow::Result<()> { } else { println!("{}", summary.summary_string()); } + + if post_to_storage_controller { + if let Some(conf) = controller_client_conf { + let controller_client = conf.build_client(); + let body = summary.build_health_update_request(); + controller_client + .dispatch::( + Method::POST, + "control/v1/metadata_health/update".to_string(), + Some(body), + ) + .await?; + } + } + if summary.is_fatal() { Err(anyhow::anyhow!("Fatal scrub errors detected")) } else if summary.is_empty() { @@ -217,14 +247,6 @@ async fn main() -> anyhow::Result<()> { min_age, mode, } => { - let controller_client_conf = cli.controller_api.map(|controller_api| { - ControllerClientConfig { - controller_api, - // Default to no key: this is a convenience 
when working in a development environment - controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()), - } - }); - match (&controller_client_conf, mode) { (Some(_), _) => { // Any mode may run when controller API is set diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index e977fd49f7..69896caa82 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -567,13 +567,7 @@ pub async fn pageserver_physical_gc( } // Execute cross-shard GC, using the accumulator's full view of all the shards built in the per-shard GC - let Some(controller_client) = controller_client_conf.as_ref().map(|c| { - let ControllerClientConfig { - controller_api, - controller_jwt, - } = c; - control_api::Client::new(controller_api.clone(), Some(controller_jwt.clone())) - }) else { + let Some(controller_client) = controller_client_conf.map(|c| c.build_client()) else { tracing::info!("Skipping ancestor layer GC, because no `--controller-api` was specified"); return Ok(summary); }; diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs index fbd60f93bb..dc410bde41 100644 --- a/storage_scrubber/src/scan_pageserver_metadata.rs +++ b/storage_scrubber/src/scan_pageserver_metadata.rs @@ -9,12 +9,13 @@ use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimeline use aws_sdk_s3::Client; use futures_util::{StreamExt, TryStreamExt}; use pageserver::tenant::remote_timeline_client::remote_layer_path; +use pageserver_api::controller_api::MetadataHealthUpdateRequest; use pageserver_api::shard::TenantShardId; use serde::Serialize; use utils::id::TenantId; use utils::shard::ShardCount; -#[derive(Serialize)] +#[derive(Serialize, Default)] pub struct MetadataSummary { tenant_count: usize, timeline_count: usize, @@ -23,19 +24,16 @@ pub struct MetadataSummary { with_warnings: HashSet, with_orphans: HashSet, indices_by_version: HashMap, + + #[serde(skip)] + pub(crate) healthy_tenant_shards: HashSet, + #[serde(skip)] + pub(crate) unhealthy_tenant_shards: HashSet, } impl MetadataSummary { fn new() -> Self { - Self { - tenant_count: 0, - timeline_count: 0, - timeline_shard_count: 0, - with_errors: HashSet::new(), - with_warnings: HashSet::new(), - with_orphans: HashSet::new(), - indices_by_version: HashMap::new(), - } + Self::default() } fn update_data(&mut self, data: &S3TimelineBlobData) { @@ -54,6 +52,13 @@ impl MetadataSummary { } fn update_analysis(&mut self, id: &TenantShardTimelineId, analysis: &TimelineAnalysis) { + if analysis.is_healthy() { + self.healthy_tenant_shards.insert(id.tenant_shard_id); + } else { + self.healthy_tenant_shards.remove(&id.tenant_shard_id); + self.unhealthy_tenant_shards.insert(id.tenant_shard_id); + } + if !analysis.errors.is_empty() { self.with_errors.insert(*id); } @@ -101,6 +106,13 @@ Index versions: {version_summary} pub fn is_empty(&self) -> bool { self.timeline_shard_count == 0 } + + pub fn build_health_update_request(&self) -> MetadataHealthUpdateRequest { + MetadataHealthUpdateRequest { + healthy_tenant_shards: self.healthy_tenant_shards.clone(), + unhealthy_tenant_shards: self.unhealthy_tenant_shards.clone(), + } + } } /// Scan the pageserver metadata in an S3 bucket, reporting errors and statistics. 
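(For reference, a minimal sketch of the wire format used by the `--post` path above: with `--post`, `scan-metadata` turns its summary into a `MetadataHealthUpdateRequest` via `build_health_update_request` and dispatches it as a POST to the storage controller's `control/v1/metadata_health/update` endpoint, which requires a scrubber-scoped token. The struct below is a hypothetical stand-in, not the real `pageserver_api` type, with tenant shard ids shown as plain strings to keep the sketch self-contained.)

```rust
// Hypothetical stand-in for pageserver_api's MetadataHealthUpdateRequest
// (illustration only); tenant shard ids are represented as plain strings here.
use std::collections::HashSet;

use serde::Serialize;

#[derive(Serialize)]
struct MetadataHealthUpdateRequestSketch {
    healthy_tenant_shards: HashSet<String>,
    unhealthy_tenant_shards: HashSet<String>,
}

fn main() {
    let body = MetadataHealthUpdateRequestSketch {
        healthy_tenant_shards: HashSet::from(["<tenant_shard_id>".to_string()]),
        unhealthy_tenant_shards: HashSet::new(),
    };
    // POSTed to {controller_api}/control/v1/metadata_health/update with a
    // scrubber-scoped JWT; the controller answers with MetadataHealthUpdateResponse {}.
    println!("{}", serde_json::to_string_pretty(&body).unwrap());
}
```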
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 5b2ebea794..0c33dec784 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4401,10 +4401,11 @@ class StorageScrubber: assert stdout is not None return stdout - def scan_metadata(self) -> Any: - stdout = self.scrubber_cli( - ["scan-metadata", "--node-kind", "pageserver", "--json"], timeout=30 - ) + def scan_metadata(self, post_to_storage_controller: bool = False) -> Any: + args = ["scan-metadata", "--node-kind", "pageserver", "--json"] + if post_to_storage_controller: + args.append("--post") + stdout = self.scrubber_cli(args, timeout=30) try: return json.loads(stdout) diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index a45430ca86..fadf438788 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -440,10 +440,12 @@ def test_scrubber_scan_pageserver_metadata( assert len(index.layer_metadata) > 0 it = iter(index.layer_metadata.items()) - scan_summary = env.storage_scrubber.scan_metadata() + scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True) assert not scan_summary["with_warnings"] assert not scan_summary["with_errors"] + assert env.storage_controller.metadata_health_is_healthy() + # Delete a layer file that is listed in the index. layer, metadata = next(it) log.info(f"Deleting {timeline_path}/{layer.to_str()}") @@ -453,7 +455,17 @@ def test_scrubber_scan_pageserver_metadata( ) log.info(f"delete response: {delete_response}") - # Check scan summary. Expect it to be a L0 layer so only emit warnings. + # Check scan summary without posting to storage controller. Expect it to be a L0 layer so only emit warnings. 
scan_summary = env.storage_scrubber.scan_metadata() log.info(f"{pprint.pformat(scan_summary)}") assert len(scan_summary["with_warnings"]) > 0 + + assert env.storage_controller.metadata_health_is_healthy() + + # Now post to storage controller, expect seeing one unhealthy health record + scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True) + log.info(f"{pprint.pformat(scan_summary)}") + assert len(scan_summary["with_warnings"]) > 0 + + unhealthy = env.storage_controller.metadata_health_list_unhealthy()["unhealthy_tenant_shards"] + assert len(unhealthy) == 1 and unhealthy[0] == str(tenant_shard_id) From d95b46f3f35eaae3aaec4f218986ece33acab052 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 30 Jul 2024 18:13:18 +0200 Subject: [PATCH 1307/1571] cleanup(compact_level0_phase1): some commentary and wrapping into block expressions (#8544) Byproduct of scouting done for https://github.com/neondatabase/neon/issues/8184 refs https://github.com/neondatabase/neon/issues/8184 --- pageserver/src/tenant/timeline.rs | 21 +--- pageserver/src/tenant/timeline/compaction.rs | 126 ++++++++++++------- 2 files changed, 80 insertions(+), 67 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 862ca42188..2b205db6e1 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -58,7 +58,7 @@ use std::{ sync::atomic::AtomicU64, }; use std::{ - cmp::{max, min, Ordering}, + cmp::{max, min}, ops::ControlFlow, }; use std::{ @@ -177,25 +177,6 @@ impl std::fmt::Display for ImageLayerCreationMode { } } -/// Wrapper for key range to provide reverse ordering by range length for BinaryHeap -#[derive(Debug, Clone, PartialEq, Eq)] -pub(crate) struct Hole { - key_range: Range, - coverage_size: usize, -} - -impl Ord for Hole { - fn cmp(&self, other: &Self) -> Ordering { - other.coverage_size.cmp(&self.coverage_size) // inverse order - } -} - -impl PartialOrd for Hole { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - /// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things. /// Can be removed after all refactors are done. fn drop_rlock(rlock: tokio::sync::RwLockReadGuard) { diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 497d631f4f..3292b4a121 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -30,8 +30,8 @@ use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPA use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::storage_layer::merge_iterator::MergeIterator; use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc, ValueReconstructState}; +use crate::tenant::timeline::ImageLayerCreationOutcome; use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter}; -use crate::tenant::timeline::{Hole, ImageLayerCreationOutcome}; use crate::tenant::timeline::{Layer, ResidentLayer}; use crate::tenant::DeltaLayer; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; @@ -608,62 +608,93 @@ impl Timeline { .read_lock_held_spawn_blocking_startup_micros .till_now(); - // Determine N largest holes where N is number of compacted layers. 
- let max_holes = deltas_to_compact.len(); - let last_record_lsn = self.get_last_record_lsn(); - let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128; - let min_hole_coverage_size = 3; // TODO: something more flexible? - - // min-heap (reserve space for one more element added before eviction) - let mut heap: BinaryHeap = BinaryHeap::with_capacity(max_holes + 1); - let mut prev: Option = None; - - let mut all_keys = Vec::new(); - - for l in deltas_to_compact.iter() { - all_keys.extend(l.load_keys(ctx).await.map_err(CompactionError::Other)?); - } - - // FIXME: should spawn_blocking the rest of this function - - // The current stdlib sorting implementation is designed in a way where it is - // particularly fast where the slice is made up of sorted sub-ranges. - all_keys.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn)); + // TODO: replace with streaming k-merge + let all_keys = { + let mut all_keys = Vec::new(); + for l in deltas_to_compact.iter() { + all_keys.extend(l.load_keys(ctx).await.map_err(CompactionError::Other)?); + } + // The current stdlib sorting implementation is designed in a way where it is + // particularly fast where the slice is made up of sorted sub-ranges. + all_keys.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn)); + all_keys + }; stats.read_lock_held_key_sort_micros = stats.read_lock_held_prerequisites_micros.till_now(); - for &DeltaEntry { key: next_key, .. } in all_keys.iter() { - if let Some(prev_key) = prev { - // just first fast filter, do not create hole entries for metadata keys. The last hole in the - // compaction is the gap between data key and metadata keys. - if next_key.to_i128() - prev_key.to_i128() >= min_hole_range - && !Key::is_metadata_key(&prev_key) - { - let key_range = prev_key..next_key; - // Measuring hole by just subtraction of i128 representation of key range boundaries - // has not so much sense, because largest holes will corresponds field1/field2 changes. - // But we are mostly interested to eliminate holes which cause generation of excessive image layers. - // That is why it is better to measure size of hole as number of covering image layers. - let coverage_size = layers.image_coverage(&key_range, last_record_lsn).len(); - if coverage_size >= min_hole_coverage_size { - heap.push(Hole { - key_range, - coverage_size, - }); - if heap.len() > max_holes { - heap.pop(); // remove smallest hole + // Determine N largest holes where N is number of compacted layers. The vec is sorted by key range start. + // + // A hole is a key range for which this compaction doesn't have any WAL records. + // Our goal in this compaction iteration is to avoid creating L1s that, in terms of their key range, + // cover the hole, but actually don't contain any WAL records for that key range. + // The reason is that the mere stack of L1s (`count_deltas`) triggers image layer creation (`create_image_layers`). + // That image layer creation would be useless for a hole range covered by L1s that don't contain any WAL records. + // + // The algorithm chooses holes as follows. + // - Slide a 2-window over the keys in key orde to get the hole range (=distance between two keys). 
+ // - Filter: min threshold on range length + // - Rank: by coverage size (=number of image layers required to reconstruct each key in the range for which we have any data) + // + // For more details, intuition, and some ASCII art see https://github.com/neondatabase/neon/pull/3597#discussion_r1112704451 + #[derive(PartialEq, Eq)] + struct Hole { + key_range: Range, + coverage_size: usize, + } + let holes: Vec = { + use std::cmp::Ordering; + impl Ord for Hole { + fn cmp(&self, other: &Self) -> Ordering { + self.coverage_size.cmp(&other.coverage_size).reverse() + } + } + impl PartialOrd for Hole { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } + } + let max_holes = deltas_to_compact.len(); + let last_record_lsn = self.get_last_record_lsn(); + let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128; + let min_hole_coverage_size = 3; // TODO: something more flexible? + // min-heap (reserve space for one more element added before eviction) + let mut heap: BinaryHeap = BinaryHeap::with_capacity(max_holes + 1); + let mut prev: Option = None; + + for &DeltaEntry { key: next_key, .. } in all_keys.iter() { + if let Some(prev_key) = prev { + // just first fast filter, do not create hole entries for metadata keys. The last hole in the + // compaction is the gap between data key and metadata keys. + if next_key.to_i128() - prev_key.to_i128() >= min_hole_range + && !Key::is_metadata_key(&prev_key) + { + let key_range = prev_key..next_key; + // Measuring hole by just subtraction of i128 representation of key range boundaries + // has not so much sense, because largest holes will corresponds field1/field2 changes. + // But we are mostly interested to eliminate holes which cause generation of excessive image layers. + // That is why it is better to measure size of hole as number of covering image layers. + let coverage_size = + layers.image_coverage(&key_range, last_record_lsn).len(); + if coverage_size >= min_hole_coverage_size { + heap.push(Hole { + key_range, + coverage_size, + }); + if heap.len() > max_holes { + heap.pop(); // remove smallest hole + } } } } + prev = Some(next_key.next()); } - prev = Some(next_key.next()); - } + let mut holes = heap.into_vec(); + holes.sort_unstable_by_key(|hole| hole.key_range.start); + holes + }; stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now(); drop_rlock(guard); stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now(); - let mut holes = heap.into_vec(); - holes.sort_unstable_by_key(|hole| hole.key_range.start); - let mut next_hole = 0; // index of next hole in holes vector // This iterator walks through all key-value pairs from all the layers // we're compacting, in key, LSN order. @@ -738,6 +769,7 @@ impl Timeline { let mut key_values_total_size = 0u64; let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key + let mut next_hole = 0; // index of next hole in holes vector for &DeltaEntry { key, lsn, ref val, .. From a4df3c8488024b9c4dba532be517cfbc3d7f8852 Mon Sep 17 00:00:00 2001 From: Cihan Demirci <128653800+fcdm@users.noreply.github.com> Date: Tue, 30 Jul 2024 22:34:15 +0300 Subject: [PATCH 1308/1571] cicd: change Azure storage details [1/2] (#8553) Change Azure storage configuration to point to new variables/secrets. 
They have the `_NEW` suffix in order not to disrupt any tests while we complete the switch. --- .github/actionlint.yml | 1 + .github/workflows/_build-and-test-locally.yml | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 37983798b7..f086008d34 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -9,5 +9,6 @@ self-hosted-runner: - us-east-2 config-variables: - REMOTE_STORAGE_AZURE_CONTAINER + - REMOTE_STORAGE_AZURE_CONTAINER_NEW - REMOTE_STORAGE_AZURE_REGION - SLACK_UPCOMING_RELEASE_CHANNEL_ID diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 26e234a04d..7751f9e8c9 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -219,9 +219,9 @@ jobs: # Run separate tests for real Azure Blob Storage # XXX: replace region with `eu-central-1`-like region export ENABLE_REAL_AZURE_REMOTE_STORAGE=y - export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}" - export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}" - export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}" + export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV_NEW }}" + export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV_NEW }}" + export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER_NEW }}" export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}" ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_azure)' From 4825b0fec31c06d57a573fb19854502a4d04e2c0 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 31 Jul 2024 14:17:59 +0200 Subject: [PATCH 1309/1571] compaction_level0_phase1: bypass PS PageCache for data blocks (#8543) part of https://github.com/neondatabase/neon/issues/8184 # Problem We want to bypass PS PageCache for all data block reads, but `compact_level0_phase1` currently uses `ValueRef::load` to load the WAL records from delta layers. Internally, that maps to `FileBlockReader:read_blk` which hits the PageCache [here](https://github.com/neondatabase/neon/blob/e78341e1c220625d9bfa3f08632bd5cfb8e6a876/pageserver/src/tenant/block_io.rs#L229-L236). # Solution This PR adds a mode for `compact_level0_phase1` that uses the `MergeIterator` for reading the `Value`s from the delta layer files. `MergeIterator` is a streaming k-merge that uses vectored blob_io under the hood, which bypasses the PS PageCache for data blocks. Other notable changes: * change the `DiskBtreeReader::into_stream` to buffer the node, instead of holding a `PageCache` `PageReadGuard`. * Without this, we run out of page cache slots in `test_pageserver_compaction_smoke`. * Generally, `PageReadGuard`s aren't supposed to be held across await points, so, this is a general bugfix. # Testing / Validation / Performance `MergeIterator` has not yet been used in production; it's being developed as part of * https://github.com/neondatabase/neon/issues/8002 Therefore, this PR adds a validation mode that compares the existing approach's value iterator with the new approach's stream output, item by item. If they're not identical, we log a warning / fail the unit/regression test. To avoid flooding the logs, we apply a global rate limit of once per 10 seconds. In any case, we use the existing approach's value. 
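As a concrete illustration of the new knob (a sketch, not the pageserver code): the enums below mirror `CompactL0Phase1ValueAccess` / `CompactL0BypassPageCacheValidation` from this PR only to show the kebab-case values that a `compact_level0_phase1_value_access` override selects. A prod override in `pageserver.toml` would presumably be spelled `compact_level0_phase1_value_access = { mode = "page-cached-blob-io" }`; the exact TOML syntax is an assumption here.

```rust
// Local mirror of the enums added in this PR (illustration only, not the real types),
// showing the serde representation implied by `tag = "mode"` + kebab-case renaming.
use serde::{Deserialize, Serialize};

#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case")]
enum Validation {
    KeyLsn,
    KeyLsnValue,
}

#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "mode", rename_all = "kebab-case")]
enum ValueAccess {
    PageCachedBlobIo,
    StreamingKmerge { validate: Option<Validation> },
}

fn main() {
    // Old behavior (what the prod override is expected to keep for now):
    let old: ValueAccess = serde_json::from_str(r#"{ "mode": "page-cached-blob-io" }"#).unwrap();
    assert_eq!(old, ValueAccess::PageCachedBlobIo);

    // The default introduced by this PR: new read path plus full validation.
    let new: ValueAccess =
        serde_json::from_str(r#"{ "mode": "streaming-kmerge", "validate": "key-lsn-value" }"#)
            .unwrap();
    assert_eq!(
        new,
        ValueAccess::StreamingKmerge { validate: Some(Validation::KeyLsnValue) }
    );
}
```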
Expected performance impact that will be monitored in staging / nightly benchmarks / eventually pre-prod: * with validation: * increased CPU usage * ~doubled VirtualFile read bytes/second metric * no change in disk IO usage because the kernel page cache will likely have the pages buffered on the second read * without validation: * slightly higher DRAM usage because each iterator participating in the k-merge has a dedicated buffer (as opposed to before, where compactions would rely on the PS PageCaceh as a shared evicting buffer) * less disk IO if previously there were repeat PageCache misses (likely case on a busy production Pageserver) * lower CPU usage: PageCache out of the picture, fewer syscalls are made (vectored blob io batches reads) # Rollout The new code is used with validation mode enabled-by-default. This gets us validation everywhere by default, specifically in - Rust unit tests - Python tests - Nightly pagebench (shouldn't really matter) - Staging Before the next release, I'll merge the following aws.git PR that configures prod to continue using the existing behavior: * https://github.com/neondatabase/aws/pull/1663 # Interactions With Other Features This work & rollout should complete before Direct IO is enabled because Direct IO would double the IOPS & latency for each compaction read (#8240). # Future Work The streaming k-merge's memory usage is proportional to the amount of memory per participating layer. But `compact_level0_phase1` still loads all keys into memory for `all_keys_iter`. Thus, it continues to have active memory usage proportional to the number of keys involved in the compaction. Future work should replace `all_keys_iter` with a streaming keys iterator. This PR has a draft in its first commit, which I later reverted because it's not necessary to achieve the goal of this PR / issue #8184. --- pageserver/src/bin/pageserver.rs | 1 + pageserver/src/config.rs | 19 ++ pageserver/src/repository.rs | 3 +- pageserver/src/tenant/disk_btree.rs | 13 +- pageserver/src/tenant/timeline/compaction.rs | 184 ++++++++++++++++++- 5 files changed, 210 insertions(+), 10 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 7a96c86ded..2d00f311fb 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -129,6 +129,7 @@ fn main() -> anyhow::Result<()> { info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); info!(?conf.get_impl, "starting with get page implementation"); info!(?conf.get_vectored_impl, "starting with vectored get page implementation"); + info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access"); let tenants_path = conf.tenants_path(); if !tenants_path.exists() { diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index f71881683d..41c2fe0af3 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -29,6 +29,7 @@ use utils::{ logging::LogFormat, }; +use crate::tenant::timeline::compaction::CompactL0Phase1ValueAccess; use crate::tenant::vectored_blob_io::MaxVectoredReadBytes; use crate::tenant::{config::TenantConfOpt, timeline::GetImpl}; use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; @@ -295,6 +296,10 @@ pub struct PageServerConf { pub ephemeral_bytes_per_memory_kb: usize, pub l0_flush: L0FlushConfig, + + /// This flag is temporary and will be removed after gradual rollout. + /// See . 
+ pub compact_level0_phase1_value_access: CompactL0Phase1ValueAccess, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -401,6 +406,8 @@ struct PageServerConfigBuilder { ephemeral_bytes_per_memory_kb: BuilderValue, l0_flush: BuilderValue, + + compact_level0_phase1_value_access: BuilderValue, } impl PageServerConfigBuilder { @@ -490,6 +497,7 @@ impl PageServerConfigBuilder { validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET), ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), l0_flush: Set(L0FlushConfig::default()), + compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()), } } } @@ -673,6 +681,10 @@ impl PageServerConfigBuilder { self.l0_flush = BuilderValue::Set(value); } + pub fn compact_level0_phase1_value_access(&mut self, value: CompactL0Phase1ValueAccess) { + self.compact_level0_phase1_value_access = BuilderValue::Set(value); + } + pub fn build(self, id: NodeId) -> anyhow::Result { let default = Self::default_values(); @@ -730,6 +742,7 @@ impl PageServerConfigBuilder { image_compression, ephemeral_bytes_per_memory_kb, l0_flush, + compact_level0_phase1_value_access, } CUSTOM LOGIC { @@ -1002,6 +1015,9 @@ impl PageServerConf { "l0_flush" => { builder.l0_flush(utils::toml_edit_ext::deserialize_item(item).context("l0_flush")?) } + "compact_level0_phase1_value_access" => { + builder.compact_level0_phase1_value_access(utils::toml_edit_ext::deserialize_item(item).context("compact_level0_phase1_value_access")?) + } _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -1086,6 +1102,7 @@ impl PageServerConf { validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, l0_flush: L0FlushConfig::default(), + compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), } } } @@ -1327,6 +1344,7 @@ background_task_maximum_delay = '334 s' image_compression: defaults::DEFAULT_IMAGE_COMPRESSION, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, l0_flush: L0FlushConfig::default(), + compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), }, "Correct defaults should be used when no config values are provided" ); @@ -1401,6 +1419,7 @@ background_task_maximum_delay = '334 s' image_compression: defaults::DEFAULT_IMAGE_COMPRESSION, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, l0_flush: L0FlushConfig::default(), + compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), }, "Should be able to parse all basic config values correctly" ); diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 5a334d0290..e4ebafd927 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -8,8 +8,7 @@ use std::time::Duration; pub use pageserver_api::key::{Key, KEY_SIZE}; /// A 'value' stored for a one Key. 
-#[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(test, derive(PartialEq))] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub enum Value { /// An Image value contains a full copy of the value Image(Bytes), diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index 1583a3826a..0107b0ac7e 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -296,13 +296,19 @@ where let mut stack = Vec::new(); stack.push((self.root_blk, None)); let block_cursor = self.reader.block_cursor(); + let mut node_buf = [0_u8; PAGE_SZ]; while let Some((node_blknum, opt_iter)) = stack.pop() { - // Locate the node. - let node_buf = block_cursor + // Read the node, through the PS PageCache, into local variable `node_buf`. + // We could keep the page cache read guard alive, but, at the time of writing, + // we run quite small PS PageCache s => can't risk running out of + // PageCache space because this stream isn't consumed fast enough. + let page_read_guard = block_cursor .read_blk(self.start_blk + node_blknum, ctx) .await?; + node_buf.copy_from_slice(page_read_guard.as_ref()); + drop(page_read_guard); // drop page cache read guard early - let node = OnDiskNode::deparse(node_buf.as_ref())?; + let node = OnDiskNode::deparse(&node_buf)?; let prefix_len = node.prefix_len as usize; let suffix_len = node.suffix_len as usize; @@ -345,6 +351,7 @@ where Either::Left(idx..node.num_children.into()) }; + // idx points to the first match now. Keep going from there while let Some(idx) = iter.next() { let key_off = idx * suffix_len; diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 3292b4a121..7bfa8e9d35 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -698,7 +698,140 @@ impl Timeline { // This iterator walks through all key-value pairs from all the layers // we're compacting, in key, LSN order. - let all_values_iter = all_keys.iter(); + // If there's both a Value::Image and Value::WalRecord for the same (key,lsn), + // then the Value::Image is ordered before Value::WalRecord. + // + // TODO(https://github.com/neondatabase/neon/issues/8184): remove the page cached blob_io + // option and validation code once we've reached confidence. + enum AllValuesIter<'a> { + PageCachedBlobIo { + all_keys_iter: VecIter<'a>, + }, + StreamingKmergeBypassingPageCache { + merge_iter: MergeIterator<'a>, + }, + ValidatingStreamingKmergeBypassingPageCache { + mode: CompactL0BypassPageCacheValidation, + merge_iter: MergeIterator<'a>, + all_keys_iter: VecIter<'a>, + }, + } + type VecIter<'a> = std::slice::Iter<'a, DeltaEntry<'a>>; // TODO: distinguished lifetimes + impl AllValuesIter<'_> { + async fn next_all_keys_iter( + iter: &mut VecIter<'_>, + ctx: &RequestContext, + ) -> anyhow::Result> { + let Some(DeltaEntry { + key, + lsn, + val: value_ref, + .. 
+ }) = iter.next() + else { + return Ok(None); + }; + let value = value_ref.load(ctx).await?; + Ok(Some((*key, *lsn, value))) + } + async fn next( + &mut self, + ctx: &RequestContext, + ) -> anyhow::Result> { + match self { + AllValuesIter::PageCachedBlobIo { all_keys_iter: iter } => { + Self::next_all_keys_iter(iter, ctx).await + } + AllValuesIter::StreamingKmergeBypassingPageCache { merge_iter } => merge_iter.next().await, + AllValuesIter::ValidatingStreamingKmergeBypassingPageCache { mode, merge_iter, all_keys_iter } => async { + // advance both iterators + let all_keys_iter_item = Self::next_all_keys_iter(all_keys_iter, ctx).await; + let merge_iter_item = merge_iter.next().await; + // compare results & log warnings as needed + macro_rules! rate_limited_warn { + ($($arg:tt)*) => {{ + if cfg!(debug_assertions) || cfg!(feature = "testing") { + warn!($($arg)*); + panic!("CompactL0BypassPageCacheValidation failure, check logs"); + } + use once_cell::sync::Lazy; + use utils::rate_limit::RateLimit; + use std::sync::Mutex; + use std::time::Duration; + static LOGGED: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); + let mut rate_limit = LOGGED.lock().unwrap(); + rate_limit.call(|| { + warn!($($arg)*); + }); + }} + } + match (&all_keys_iter_item, &merge_iter_item) { + (Err(_), Err(_)) => { + // don't bother asserting equivality of the errors + } + (Err(all_keys), Ok(merge)) => { + rate_limited_warn!(?merge, "all_keys_iter returned an error where merge did not: {all_keys:?}"); + }, + (Ok(all_keys), Err(merge)) => { + rate_limited_warn!(?all_keys, "merge returned an error where all_keys_iter did not: {merge:?}"); + }, + (Ok(None), Ok(None)) => { } + (Ok(Some(all_keys)), Ok(None)) => { + rate_limited_warn!(?all_keys, "merge returned None where all_keys_iter returned Some"); + } + (Ok(None), Ok(Some(merge))) => { + rate_limited_warn!(?merge, "all_keys_iter returned None where merge returned Some"); + } + (Ok(Some((all_keys_key, all_keys_lsn, all_keys_value))), Ok(Some((merge_key, merge_lsn, merge_value)))) => { + match mode { + // TODO: in this mode, we still load the value from disk for both iterators, even though we only need the all_keys_iter one + CompactL0BypassPageCacheValidation::KeyLsn => { + let all_keys = (all_keys_key, all_keys_lsn); + let merge = (merge_key, merge_lsn); + if all_keys != merge { + rate_limited_warn!(?all_keys, ?merge, "merge returned a different (Key,LSN) than all_keys_iter"); + } + } + CompactL0BypassPageCacheValidation::KeyLsnValue => { + let all_keys = (all_keys_key, all_keys_lsn, all_keys_value); + let merge = (merge_key, merge_lsn, merge_value); + if all_keys != merge { + rate_limited_warn!(?all_keys, ?merge, "merge returned a different (Key,LSN,Value) than all_keys_iter"); + } + } + } + } + } + // in case of mismatch, trust the legacy all_keys_iter_item + all_keys_iter_item + }.instrument(info_span!("next")).await + } + } + } + let mut all_values_iter = match &self.conf.compact_level0_phase1_value_access { + CompactL0Phase1ValueAccess::PageCachedBlobIo => AllValuesIter::PageCachedBlobIo { + all_keys_iter: all_keys.iter(), + }, + CompactL0Phase1ValueAccess::StreamingKmerge { validate } => { + let merge_iter = { + let mut deltas = Vec::with_capacity(deltas_to_compact.len()); + for l in deltas_to_compact.iter() { + let l = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?; + deltas.push(l); + } + MergeIterator::create(&deltas, &[], ctx) + }; + match validate { + None => AllValuesIter::StreamingKmergeBypassingPageCache { merge_iter 
}, + Some(validate) => AllValuesIter::ValidatingStreamingKmergeBypassingPageCache { + mode: validate.clone(), + merge_iter, + all_keys_iter: all_keys.iter(), + }, + } + } + }; // This iterator walks through all keys and is needed to calculate size used by each key let mut all_keys_iter = all_keys @@ -771,11 +904,11 @@ impl Timeline { let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key let mut next_hole = 0; // index of next hole in holes vector - for &DeltaEntry { - key, lsn, ref val, .. - } in all_values_iter + while let Some((key, lsn, value)) = all_values_iter + .next(ctx) + .await + .map_err(CompactionError::Other)? { - let value = val.load(ctx).await.map_err(CompactionError::Other)?; let same_key = prev_key.map_or(false, |prev_key| prev_key == key); // We need to check key boundaries once we reach next key or end of layer with the same key if !same_key || lsn == dup_end_lsn { @@ -960,6 +1093,10 @@ impl Timeline { } } + // Without this, rustc complains about deltas_to_compact still + // being borrowed when we `.into_iter()` below. + drop(all_values_iter); + Ok(CompactLevel0Phase1Result { new_layers, deltas_to_compact: deltas_to_compact @@ -1067,6 +1204,43 @@ impl TryFrom for CompactLevel0Phase1Stats { } } +#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)] +#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] +pub enum CompactL0Phase1ValueAccess { + /// The old way. + PageCachedBlobIo, + /// The new way. + StreamingKmerge { + /// If set, we run both the old way and the new way, validate that + /// they are identical (=> [`CompactL0BypassPageCacheValidation`]), + /// and if the validation fails, + /// - in tests: fail them with a panic or + /// - in prod, log a rate-limited warning and use the old way's results. + /// + /// If not set, we only run the new way and trust its results. + validate: Option, + }, +} + +/// See [`CompactL0Phase1ValueAccess::StreamingKmerge`]. +#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)] +#[serde(rename_all = "kebab-case")] +pub enum CompactL0BypassPageCacheValidation { + /// Validate that the series of (key, lsn) pairs are the same. + KeyLsn, + /// Validate that the entire output of old and new way is identical. + KeyLsnValue, +} + +impl Default for CompactL0Phase1ValueAccess { + fn default() -> Self { + CompactL0Phase1ValueAccess::StreamingKmerge { + // TODO(https://github.com/neondatabase/neon/issues/8184): change to None once confident + validate: Some(CompactL0BypassPageCacheValidation::KeyLsnValue), + } + } +} + impl Timeline { /// Entry point for new tiered compaction algorithm. /// From d21246c8bdb2b0d7b515c122fce55fb3a9528908 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 31 Jul 2024 15:10:27 +0100 Subject: [PATCH 1310/1571] CI(regress-tests): run less regression tests (#8561) ## Problem We run regression tests on `release` & `debug` builds for each of the three supported Postgres versions (6 in total). With upcoming ARM support and Postgres 17, the number of jobs will jump to 16, which is a lot. 
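(3 Postgres versions × 2 build types = 6 jobs today; with arm64 and Postgres 17 in the matrix, 4 versions × 2 build types × 2 architectures = 16.)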
See the internal discussion here: https://neondb.slack.com/archives/C033A2WE6BZ/p1722365908404329 ## Summary of changes - Run `regress-tests` job in debug builds only with the latest Postgres version - Do not run `debug` builds on release branches --- .github/workflows/_build-and-test-locally.yml | 8 ++++++-- .github/workflows/build_and_test.yml | 5 ++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 7751f9e8c9..182e96a8ca 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -19,6 +19,10 @@ on: description: 'debug or release' required: true type: string + pg-versions: + description: 'a json array of postgres versions to run regression tests on' + required: true + type: string defaults: run: @@ -254,7 +258,7 @@ jobs: strategy: fail-fast: false matrix: - pg_version: [ v14, v15, v16 ] + pg_version: ${{ fromJson(inputs.pg-versions) }} steps: - uses: actions/checkout@v4 with: @@ -284,5 +288,5 @@ jobs: - name: Merge and upload coverage data if: | false && - inputs.build-type == 'debug' && matrix.pg_version == 'v14' + inputs.build-type == 'debug' && matrix.pg_version == 'v16' uses: ./.github/actions/save-coverage-data diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 3cf40e6153..c4df98f585 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -203,7 +203,8 @@ jobs: fail-fast: false matrix: arch: [ x64 ] - build-type: [ debug, release ] + # Do not build or run tests in debug for release branches + build-type: ${{ fromJson((startsWith(github.ref_name, 'release') && github.event_name == 'push') && '["release"]' || '["debug", "release"]') }} include: - build-type: release arch: arm64 @@ -213,6 +214,8 @@ with: build-tools-image: ${{ needs.build-build-tools-image.outputs.image }} build-tag: ${{ needs.tag.outputs.build-tag }} build-type: ${{ matrix.build-type }} + # Run tests on all Postgres versions in release builds and only on the latest version in debug builds + pg-versions: ${{ matrix.build-type == 'release' && '["v14", "v15", "v16"]' || '["v16"]' }} secrets: inherit # Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking From 61a65f61f394b6fee52bb04b239a2a0d7548da19 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." Date: Wed, 31 Jul 2024 10:48:48 -0400 Subject: [PATCH 1311/1571] feat(pageserver): support btm-gc-compaction for child branches (#8519) part of https://github.com/neondatabase/neon/issues/8002 For child branches, we will pull the image of the modified keys from the parent into the child branch, which creates a full history for generating key retention. If there are not enough delta keys, the image will not end up being written, and we will only keep the deltas inside the child branch. A simplified sketch of this idea is shown below.
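To make the idea concrete, here is a minimal, self-contained sketch (not the pageserver code; `Record`, `reconstruct_at_horizon`, and the toy values are made up for illustration): a child branch may hold only deltas for a key, so the ancestor image taken at the branch LSN has to be prepended before the history below the GC horizon can be folded into a full value.

```rust
#[derive(Debug)]
enum Record {
    /// A full page image.
    Image(String),
    /// An append-style delta, e.g. "@0x20".
    Delta(String),
}

/// Prepend the ancestor image (if any) to the records the child branch holds for a key,
/// then fold everything at or below `horizon` into a single image.
/// Returns None if the history cannot be replayed without a base image.
fn reconstruct_at_horizon(
    ancestor_image: Option<(u64, String)>, // (LSN at the branch point, image)
    child_history: &[(u64, Record)],       // (LSN, record), sorted by LSN
    horizon: u64,
) -> Option<String> {
    let mut value: Option<String> = ancestor_image
        .filter(|(lsn, _)| *lsn <= horizon)
        .map(|(_, img)| img);
    for (lsn, rec) in child_history {
        if *lsn > horizon {
            break;
        }
        match rec {
            Record::Image(img) => value = Some(img.clone()),
            Record::Delta(d) => match &mut value {
                Some(v) => v.push_str(d),
                // Without the ancestor image, a delta-only history cannot be replayed.
                None => return None,
            },
        }
    }
    value
}

fn main() {
    // The child branch only has deltas for this key; the base image lives in the parent.
    let ancestor = Some((0x10, "value 3@0x10".to_string()));
    let child = vec![
        (0x28, Record::Delta("@0x28".to_string())),
        (0x30, Record::Delta("@0x30".to_string())),
        (0x40, Record::Delta("@0x40".to_string())),
    ];
    assert_eq!(
        reconstruct_at_horizon(ancestor, &child, 0x50),
        Some("value 3@0x10@0x28@0x30@0x40".to_string())
    );
    // Without pulling the ancestor image, the history is incomplete.
    assert_eq!(reconstruct_at_horizon(None, &child, 0x50), None);
    // A key that got a full image on the child branch does not need the ancestor.
    let child_with_image = vec![
        (0x20, Record::Image("value 8@0x20".to_string())),
        (0x48, Record::Delta("@0x48".to_string())),
    ];
    assert_eq!(
        reconstruct_at_horizon(None, &child_with_image, 0x50),
        Some("value 8@0x20@0x48".to_string())
    );
    println!("all reconstructions match");
}
```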
We could avoid the wasteful work to pull the image from the parent if we can know the number of deltas in advance, in the future (currently we always pull image for all modified keys in the child branch) --------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant.rs | 293 ++++++++++++++++++- pageserver/src/tenant/timeline.rs | 7 +- pageserver/src/tenant/timeline/compaction.rs | 135 ++++++--- 3 files changed, 400 insertions(+), 35 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index e5ac6725ad..48c1851a3a 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -7347,6 +7347,7 @@ mod tests { Lsn(0x60), &[Lsn(0x20), Lsn(0x40), Lsn(0x50)], 3, + None, ) .await .unwrap(); @@ -7471,7 +7472,7 @@ mod tests { ), ]; let res = tline - .generate_key_retention(key, &history, Lsn(0x60), &[Lsn(0x40), Lsn(0x50)], 3) + .generate_key_retention(key, &history, Lsn(0x60), &[Lsn(0x40), Lsn(0x50)], 3, None) .await .unwrap(); let expected_res = KeyHistoryRetention { @@ -7517,6 +7518,114 @@ mod tests { }; assert_eq!(res, expected_res); + // In case of branch compaction, the branch itself does not have the full history, and we need to provide + // the ancestor image in the test case. + + let history = vec![ + ( + key, + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append(";0x20")), + ), + ( + key, + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append(";0x30")), + ), + ( + key, + Lsn(0x40), + Value::WalRecord(NeonWalRecord::wal_append(";0x40")), + ), + ( + key, + Lsn(0x70), + Value::WalRecord(NeonWalRecord::wal_append(";0x70")), + ), + ]; + let res = tline + .generate_key_retention( + key, + &history, + Lsn(0x60), + &[], + 3, + Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))), + ) + .await + .unwrap(); + let expected_res = KeyHistoryRetention { + below_horizon: vec![( + Lsn(0x60), + KeyLogAtLsn(vec![( + Lsn(0x60), + Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40")), // use the ancestor image to reconstruct the page + )]), + )], + above_horizon: KeyLogAtLsn(vec![( + Lsn(0x70), + Value::WalRecord(NeonWalRecord::wal_append(";0x70")), + )]), + }; + assert_eq!(res, expected_res); + + let history = vec![ + ( + key, + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append(";0x20")), + ), + ( + key, + Lsn(0x40), + Value::WalRecord(NeonWalRecord::wal_append(";0x40")), + ), + ( + key, + Lsn(0x60), + Value::WalRecord(NeonWalRecord::wal_append(";0x60")), + ), + ( + key, + Lsn(0x70), + Value::WalRecord(NeonWalRecord::wal_append(";0x70")), + ), + ]; + let res = tline + .generate_key_retention( + key, + &history, + Lsn(0x60), + &[Lsn(0x30)], + 3, + Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))), + ) + .await + .unwrap(); + let expected_res = KeyHistoryRetention { + below_horizon: vec![ + ( + Lsn(0x30), + KeyLogAtLsn(vec![( + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append(";0x20")), + )]), + ), + ( + Lsn(0x60), + KeyLogAtLsn(vec![( + Lsn(0x60), + Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x40;0x60")), + )]), + ), + ], + above_horizon: KeyLogAtLsn(vec![( + Lsn(0x70), + Value::WalRecord(NeonWalRecord::wal_append(";0x70")), + )]), + }; + assert_eq!(res, expected_res); + Ok(()) } @@ -7715,4 +7824,186 @@ mod tests { Ok(()) } + + #[tokio::test] + async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?; + let (tenant, ctx) = harness.load().await; + + fn get_key(id: u32) -> Key { + let mut key = 
Key::from_hex("000000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + let img_layer = (0..10) + .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10")))) + .collect_vec(); + + let delta1 = vec![ + ( + get_key(1), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ( + get_key(2), + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append("@0x30")), + ), + ( + get_key(3), + Lsn(0x28), + Value::WalRecord(NeonWalRecord::wal_append("@0x28")), + ), + ( + get_key(3), + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append("@0x30")), + ), + ( + get_key(3), + Lsn(0x40), + Value::WalRecord(NeonWalRecord::wal_append("@0x40")), + ), + ]; + let delta2 = vec![ + ( + get_key(5), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ( + get_key(6), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ]; + let delta3 = vec![ + ( + get_key(8), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), + ), + ( + get_key(9), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), + ), + ]; + + let parent_tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + vec![], // delta layers + vec![(Lsn(0x18), img_layer)], // image layers + Lsn(0x18), + ) + .await?; + + parent_tline.add_extra_test_dense_keyspace(KeySpace::single(get_key(0)..get_key(10))); + + let branch_tline = tenant + .branch_timeline_test_with_layers( + &parent_tline, + NEW_TIMELINE_ID, + Some(Lsn(0x18)), + &ctx, + vec![ + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3), + ], // delta layers + vec![], // image layers + Lsn(0x50), + ) + .await?; + + branch_tline.add_extra_test_dense_keyspace(KeySpace::single(get_key(0)..get_key(10))); + + { + // Update GC info + let mut guard = parent_tline.gc_info.write().unwrap(); + *guard = GcInfo { + retain_lsns: vec![(Lsn(0x18), branch_tline.timeline_id)], + cutoffs: GcCutoffs { + time: Lsn(0x10), + space: Lsn(0x10), + }, + leases: Default::default(), + within_ancestor_pitr: false, + }; + } + + { + // Update GC info + let mut guard = branch_tline.gc_info.write().unwrap(); + *guard = GcInfo { + retain_lsns: vec![(Lsn(0x40), branch_tline.timeline_id)], + cutoffs: GcCutoffs { + time: Lsn(0x50), + space: Lsn(0x50), + }, + leases: Default::default(), + within_ancestor_pitr: false, + }; + } + + let expected_result_at_gc_horizon = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20"), + Bytes::from_static(b"value 2@0x10@0x30"), + Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10@0x20"), + Bytes::from_static(b"value 6@0x10@0x20"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10@0x48"), + Bytes::from_static(b"value 9@0x10@0x48"), + ]; + + let expected_result_at_lsn_40 = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20"), + Bytes::from_static(b"value 2@0x10@0x30"), + Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10@0x20"), + Bytes::from_static(b"value 6@0x10@0x20"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + + let verify_result = 
|| async { + for idx in 0..10 { + assert_eq!( + branch_tline + .get(get_key(idx as u32), Lsn(0x50), &ctx) + .await + .unwrap(), + &expected_result_at_gc_horizon[idx] + ); + assert_eq!( + branch_tline + .get(get_key(idx as u32), Lsn(0x40), &ctx) + .await + .unwrap(), + &expected_result_at_lsn_40[idx] + ); + } + }; + + verify_result().await; + + let cancel = CancellationToken::new(); + branch_tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + + verify_result().await; + + Ok(()) + } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 2b205db6e1..4db44a3a19 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -529,7 +529,6 @@ impl GetVectoredError { } } -#[derive(Debug)] pub struct MissingKeyError { key: Key, shard: ShardNumber, @@ -540,6 +539,12 @@ pub struct MissingKeyError { backtrace: Option, } +impl std::fmt::Debug for MissingKeyError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self) + } +} + impl std::fmt::Display for MissingKeyError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 7bfa8e9d35..5e9ff1c9e4 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -15,6 +15,7 @@ use super::{ }; use anyhow::{anyhow, Context}; +use bytes::Bytes; use enumset::EnumSet; use fail::fail_point; use itertools::Itertools; @@ -69,17 +70,21 @@ impl KeyHistoryRetention { self, key: Key, delta_writer: &mut Vec<(Key, Lsn, Value)>, - image_writer: &mut ImageLayerWriter, + mut image_writer: Option<&mut ImageLayerWriter>, ctx: &RequestContext, ) -> anyhow::Result<()> { let mut first_batch = true; - for (_, KeyLogAtLsn(logs)) in self.below_horizon { + for (cutoff_lsn, KeyLogAtLsn(logs)) in self.below_horizon { if first_batch { if logs.len() == 1 && logs[0].1.is_image() { let Value::Image(img) = &logs[0].1 else { unreachable!() }; - image_writer.put_image(key, img.clone(), ctx).await?; + if let Some(image_writer) = image_writer.as_mut() { + image_writer.put_image(key, img.clone(), ctx).await?; + } else { + delta_writer.push((key, cutoff_lsn, Value::Image(img.clone()))); + } } else { for (lsn, val) in logs { delta_writer.push((key, lsn, val)); @@ -1328,6 +1333,7 @@ impl Timeline { horizon: Lsn, retain_lsn_below_horizon: &[Lsn], delta_threshold_cnt: usize, + base_img_from_ancestor: Option<(Key, Lsn, Bytes)>, ) -> anyhow::Result { // Pre-checks for the invariants if cfg!(debug_assertions) { @@ -1357,6 +1363,7 @@ impl Timeline { ); } } + let has_ancestor = base_img_from_ancestor.is_some(); // Step 1: split history into len(retain_lsn_below_horizon) + 2 buckets, where the last bucket is for all deltas above the horizon, // and the second-to-last bucket is for the horizon. Each bucket contains lsn_last_bucket < deltas <= lsn_this_bucket. let (mut split_history, lsn_split_points) = { @@ -1390,6 +1397,9 @@ impl Timeline { // For example, we have delta layer key1@0x10, key1@0x20, and image layer key1@0x10, we will // keep the image for key1@0x10 and the delta for key1@0x20. key1@0x10 delta will be simply // dropped. + // + // TODO: in case we have both delta + images for a given LSN and it does not exceed the delta + // threshold, we could have kept delta instead to save space. This is an optimization for the future. 
continue; } } @@ -1407,9 +1417,13 @@ impl Timeline { "should have at least below + above horizon batches" ); let mut replay_history: Vec<(Key, Lsn, Value)> = Vec::new(); + if let Some((key, lsn, img)) = base_img_from_ancestor { + replay_history.push((key, lsn, Value::Image(img))); + } for (i, split_for_lsn) in split_history.into_iter().enumerate() { + // TODO: there could be image keys inside the splits, and we can compute records_since_last_image accordingly. records_since_last_image += split_for_lsn.len(); - let generate_image = if i == 0 { + let generate_image = if i == 0 && !has_ancestor { // We always generate images for the first batch (below horizon / lowest retain_lsn) true } else if i == batch_cnt - 1 { @@ -1532,20 +1546,25 @@ impl Timeline { retain_lsns_below_horizon.sort(); (selected_layers, gc_cutoff, retain_lsns_below_horizon) }; - let lowest_retain_lsn = retain_lsns_below_horizon - .first() - .copied() - .unwrap_or(gc_cutoff); - if cfg!(debug_assertions) { - assert_eq!( - lowest_retain_lsn, - retain_lsns_below_horizon - .iter() - .min() - .copied() - .unwrap_or(gc_cutoff) - ); - } + let lowest_retain_lsn = if self.ancestor_timeline.is_some() { + Lsn(self.ancestor_lsn.0 + 1) + } else { + let res = retain_lsns_below_horizon + .first() + .copied() + .unwrap_or(gc_cutoff); + if cfg!(debug_assertions) { + assert_eq!( + res, + retain_lsns_below_horizon + .iter() + .min() + .copied() + .unwrap_or(gc_cutoff) + ); + } + res + }; info!( "picked {} layers for compaction with gc_cutoff={} lowest_retain_lsn={}", layer_selection.len(), @@ -1586,6 +1605,7 @@ impl Timeline { let mut accumulated_values = Vec::new(); let mut last_key: Option = None; + #[allow(clippy::too_many_arguments)] async fn flush_deltas( deltas: &mut Vec<(Key, Lsn, crate::repository::Value)>, last_key: Key, @@ -1594,6 +1614,7 @@ impl Timeline { tline: &Arc, lowest_retain_lsn: Lsn, ctx: &RequestContext, + last_batch: bool, ) -> anyhow::Result> { // Check if we need to split the delta layer. We split at the original delta layer boundary to avoid // overlapping layers. @@ -1614,7 +1635,7 @@ impl Timeline { *current_delta_split_point += 1; need_split = true; } - if !need_split { + if !need_split && !last_batch { return Ok(None); } let deltas = std::mem::take(deltas); @@ -1639,15 +1660,44 @@ impl Timeline { Ok(Some(delta_layer)) } - let mut image_layer_writer = ImageLayerWriter::new( - self.conf, - self.timeline_id, - self.tenant_shard_id, - &(Key::MIN..Key::MAX), // covers the full key range - lowest_retain_lsn, - ctx, - ) - .await?; + // Only create image layers when there is no ancestor branches. TODO: create covering image layer + // when some condition meet. + let mut image_layer_writer = if self.ancestor_timeline.is_none() { + Some( + ImageLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + &(Key::MIN..Key::MAX), // covers the full key range + lowest_retain_lsn, + ctx, + ) + .await?, + ) + } else { + None + }; + + /// Returns None if there is no ancestor branch. Throw an error when the key is not found. + /// + /// Currently, we always get the ancestor image for each key in the child branch no matter whether the image + /// is needed for reconstruction. This should be fixed in the future. + /// + /// Furthermore, we should do vectored get instead of a single get, or better, use k-merge for ancestor + /// images. 
+ async fn get_ancestor_image( + tline: &Arc, + key: Key, + ctx: &RequestContext, + ) -> anyhow::Result> { + if tline.ancestor_timeline.is_none() { + return Ok(None); + }; + // This function is implemented as a get of the current timeline at ancestor LSN, therefore reusing + // as much existing code as possible. + let img = tline.get(key, tline.ancestor_lsn, ctx).await?; + Ok(Some((key, tline.ancestor_lsn, img))) + } let mut delta_values = Vec::new(); let delta_split_points = delta_split_points.into_iter().collect_vec(); @@ -1668,11 +1718,17 @@ impl Timeline { gc_cutoff, &retain_lsns_below_horizon, COMPACTION_DELTA_THRESHOLD, + get_ancestor_image(self, *last_key, ctx).await?, ) .await?; // Put the image into the image layer. Currently we have a single big layer for the compaction. retention - .pipe_to(*last_key, &mut delta_values, &mut image_layer_writer, ctx) + .pipe_to( + *last_key, + &mut delta_values, + image_layer_writer.as_mut(), + ctx, + ) .await?; delta_layers.extend( flush_deltas( @@ -1683,6 +1739,7 @@ impl Timeline { self, lowest_retain_lsn, ctx, + false, ) .await?, ); @@ -1701,11 +1758,17 @@ impl Timeline { gc_cutoff, &retain_lsns_below_horizon, COMPACTION_DELTA_THRESHOLD, + get_ancestor_image(self, last_key, ctx).await?, ) .await?; // Put the image into the image layer. Currently we have a single big layer for the compaction. retention - .pipe_to(last_key, &mut delta_values, &mut image_layer_writer, ctx) + .pipe_to( + last_key, + &mut delta_values, + image_layer_writer.as_mut(), + ctx, + ) .await?; delta_layers.extend( flush_deltas( @@ -1716,19 +1779,25 @@ impl Timeline { self, lowest_retain_lsn, ctx, + true, ) .await?, ); + assert!(delta_values.is_empty(), "unprocessed keys"); - let image_layer = image_layer_writer.finish(self, ctx).await?; + let image_layer = if let Some(writer) = image_layer_writer { + Some(writer.finish(self, ctx).await?) + } else { + None + }; info!( "produced {} delta layers and {} image layers", delta_layers.len(), - 1 + if image_layer.is_some() { 1 } else { 0 } ); let mut compact_to = Vec::new(); compact_to.extend(delta_layers); - compact_to.push(image_layer); + compact_to.extend(image_layer); // Step 3: Place back to the layer map. { let mut guard = self.layers.write().await; From 4e3b70e3081165ebd5ca1f93e90cb172bcf6a16e Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 31 Jul 2024 17:05:45 +0200 Subject: [PATCH 1312/1571] refactor(page_service): Timeline gate guard holding + cancellation + shutdown (#8339) Since the introduction of sharding, the protocol handling loop in `handle_pagerequests` cannot know anymore which concrete `Tenant`/`Timeline` object any of the incoming `PagestreamFeMessage` resolves to. In fact, one message might resolve to one `Tenant`/`Timeline` while the next one may resolve to another one. To avoid going to tenant manager, we added the `shard_timelines` which acted as an ever-growing cache that held timeline gate guards open for the lifetime of the connection. The consequence of holding the gate guards open was that we had to be sensitive to every cached `Timeline::cancel` on each interaction with the network connection, so that Timeline shutdown would not have to wait for network connection interaction. We can do better than that, meaning more efficiency & better abstraction. I proposed a sketch for it in * https://github.com/neondatabase/neon/pull/8286 and this PR implements an evolution of that sketch. The main idea is is that `mod page_service` shall be solely concerned with the following: 1. 
receiving requests by speaking the protocol / pagestream subprotocol 2. dispatching the request to a corresponding method on the correct shard/`Timeline` object 3. sending response by speaking the protocol / pagestream subprotocol. The cancellation sensitivity responsibilities are clear cut: * while in `page_service` code, sensitivity to page_service cancellation is sufficient * while in `Timeline` code, sensitivity to `Timeline::cancel` is sufficient To enforce these responsibilities, we introduce the notion of a `timeline::handle::Handle` to a `Timeline` object that is checked out from a `timeline::handle::Cache` for **each request**. The `Handle` derefs to `Timeline` and is supposed to be used for a single async method invocation on `Timeline`. See the lengthy doc comment in `mod handle` for details of the design. --- pageserver/src/bin/pageserver.rs | 43 +- pageserver/src/http/routes.rs | 5 + pageserver/src/lib.rs | 10 +- pageserver/src/page_service.rs | 766 +++++++++--------- pageserver/src/tenant.rs | 2 + pageserver/src/tenant/mgr.rs | 6 +- pageserver/src/tenant/timeline.rs | 20 + pageserver/src/tenant/timeline/handle.rs | 967 +++++++++++++++++++++++ 8 files changed, 1387 insertions(+), 432 deletions(-) create mode 100644 pageserver/src/tenant/timeline/handle.rs diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 2d00f311fb..5ebd6511ac 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -17,11 +17,9 @@ use pageserver::config::PageserverIdentity; use pageserver::control_plane_client::ControlPlaneClient; use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task}; use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING}; -use pageserver::task_mgr::WALRECEIVER_RUNTIME; +use pageserver::task_mgr::{COMPUTE_REQUEST_RUNTIME, WALRECEIVER_RUNTIME}; use pageserver::tenant::{secondary, TenantSharedResources}; -use pageserver::{ - CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, LibpqEndpointListener, -}; +use pageserver::{CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener}; use remote_storage::GenericRemoteStorage; use tokio::signal::unix::SignalKind; use tokio::time::Instant; @@ -31,11 +29,9 @@ use tracing::*; use metrics::set_build_info_metric; use pageserver::{ config::PageServerConf, - context::{DownloadBehavior, RequestContext}, deletion_queue::DeletionQueue, http, page_cache, page_service, task_mgr, - task_mgr::TaskKind, - task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME}, + task_mgr::{BACKGROUND_RUNTIME, MGMT_REQUEST_RUNTIME}, tenant::mgr, virtual_file, }; @@ -594,30 +590,13 @@ fn start_pageserver( // Spawn a task to listen for libpq connections. It will spawn further tasks // for each connection. We created the listener earlier already. - let libpq_listener = { - let cancel = CancellationToken::new(); - let libpq_ctx = RequestContext::todo_child( - TaskKind::LibpqEndpointListener, - // listener task shouldn't need to download anything. (We will - // create a separate sub-contexts for each connection, with their - // own download behavior. This context is used only to listen and - // accept connections.) 
- DownloadBehavior::Error, - ); - - let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( - "libpq listener", - page_service::libpq_listener_main( - tenant_manager.clone(), - pg_auth, - pageserver_listener, - conf.pg_auth_type, - libpq_ctx, - cancel.clone(), - ), - )); - LibpqEndpointListener(CancellableTask { task, cancel }) - }; + let page_service = page_service::spawn(conf, tenant_manager.clone(), pg_auth, { + let _entered = COMPUTE_REQUEST_RUNTIME.enter(); // TcpListener::from_std requires it + pageserver_listener + .set_nonblocking(true) + .context("set listener to nonblocking")?; + tokio::net::TcpListener::from_std(pageserver_listener).context("create tokio listener")? + }); let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard()); @@ -645,7 +624,7 @@ fn start_pageserver( shutdown_pageserver.take(); pageserver::shutdown_pageserver( http_endpoint_listener, - libpq_listener, + page_service, consumption_metrics_tasks, disk_usage_eviction_task, &tenant_manager, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 9222123ad3..117f2c5869 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -296,6 +296,11 @@ impl From for ApiError { GetActiveTenantError::WaitForActiveTimeout { .. } => { ApiError::ResourceUnavailable(format!("{}", e).into()) } + GetActiveTenantError::SwitchedTenant => { + // in our HTTP handlers, this error doesn't happen + // TODO: separate error types + ApiError::ResourceUnavailable("switched tenant".into()) + } } } } diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index d944019641..f729cad3c3 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -30,7 +30,6 @@ pub mod walingest; pub mod walrecord; pub mod walredo; -use crate::task_mgr::TaskKind; use camino::Utf8Path; use deletion_queue::DeletionQueue; use tenant::{ @@ -63,7 +62,6 @@ pub struct CancellableTask { pub cancel: CancellationToken, } pub struct HttpEndpointListener(pub CancellableTask); -pub struct LibpqEndpointListener(pub CancellableTask); pub struct ConsumptionMetricsTasks(pub CancellableTask); pub struct DiskUsageEvictionTask(pub CancellableTask); impl CancellableTask { @@ -77,7 +75,7 @@ impl CancellableTask { #[allow(clippy::too_many_arguments)] pub async fn shutdown_pageserver( http_listener: HttpEndpointListener, - libpq_listener: LibpqEndpointListener, + page_service: page_service::Listener, consumption_metrics_worker: ConsumptionMetricsTasks, disk_usage_eviction_task: Option, tenant_manager: &TenantManager, @@ -89,8 +87,8 @@ pub async fn shutdown_pageserver( use std::time::Duration; // Shut down the libpq endpoint task. This prevents new connections from // being accepted. 
- timed( - libpq_listener.0.shutdown(), + let remaining_connections = timed( + page_service.stop_accepting(), "shutdown LibpqEndpointListener", Duration::from_secs(1), ) @@ -108,7 +106,7 @@ pub async fn shutdown_pageserver( // Shut down any page service tasks: any in-progress work for particular timelines or tenants // should already have been canclled via mgr::shutdown_all_tenants timed( - task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None), + remaining_connections.shutdown(), "shutdown PageRequestHandlers", Duration::from_secs(1), ) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 6353f713e0..5344b83e0d 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -4,9 +4,8 @@ use anyhow::Context; use async_compression::tokio::write::GzipEncoder; use bytes::Buf; -use futures::stream::FuturesUnordered; -use futures::StreamExt; -use pageserver_api::key::Key; +use futures::FutureExt; +use once_cell::sync::OnceCell; use pageserver_api::models::TenantState; use pageserver_api::models::{ PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse, @@ -15,28 +14,23 @@ use pageserver_api::models::{ PagestreamGetSlruSegmentRequest, PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest, PagestreamNblocksResponse, PagestreamProtocolVersion, }; -use pageserver_api::shard::ShardIndex; -use pageserver_api::shard::ShardNumber; use pageserver_api::shard::TenantShardId; use postgres_backend::{is_expected_io_error, AuthType, PostgresBackend, QueryError}; use pq_proto::framed::ConnectionError; use pq_proto::FeStartupPacket; use pq_proto::{BeMessage, FeMessage, RowDescriptor}; use std::borrow::Cow; -use std::collections::HashMap; use std::io; -use std::net::TcpListener; use std::str; use std::str::FromStr; use std::sync::Arc; -use std::time::Duration; -use std::time::Instant; use std::time::SystemTime; +use std::time::{Duration, Instant}; use tokio::io::AsyncWriteExt; use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::sync::gate::GateGuard; use utils::{ auth::{Claims, Scope, SwappableJwtAuth}, id::{TenantId, TimelineId}, @@ -47,61 +41,130 @@ use utils::{ use crate::auth::check_permission; use crate::basebackup; use crate::basebackup::BasebackupError; +use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics; use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS}; use crate::pgdatadir_mapping::Version; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; -use crate::task_mgr; use crate::task_mgr::TaskKind; -use crate::tenant::mgr::GetActiveTenantError; -use crate::tenant::mgr::GetTenantError; -use crate::tenant::mgr::ShardResolveResult; +use crate::task_mgr::{self, COMPUTE_REQUEST_RUNTIME}; use crate::tenant::mgr::ShardSelector; use crate::tenant::mgr::TenantManager; -use crate::tenant::timeline::WaitLsnError; +use crate::tenant::mgr::{GetActiveTenantError, GetTenantError, ShardResolveResult}; +use crate::tenant::timeline::{self, WaitLsnError}; use crate::tenant::GetTimelineError; use crate::tenant::PageReconstructError; -use crate::tenant::Tenant; use crate::tenant::Timeline; use pageserver_api::key::rel_block_to_key; use pageserver_api::reltag::SlruKind; use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; use postgres_ffi::BLCKSZ; -// How long 
we may wait for a [`TenantSlot::InProgress`]` and/or a [`Tenant`] which -// is not yet in state [`TenantState::Active`]. +/// How long we may wait for a [`crate::tenant::mgr::TenantSlot::InProgress`]` and/or a [`crate::tenant::Tenant`] which +/// is not yet in state [`TenantState::Active`]. +/// +/// NB: this is a different value than [`crate::http::routes::ACTIVE_TENANT_TIMEOUT`]. const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000); /////////////////////////////////////////////////////////////////////////////// +pub struct Listener { + cancel: CancellationToken, + /// Cancel the listener task through `listen_cancel` to shut down the listener + /// and get a handle on the existing connections. + task: JoinHandle, +} + +pub struct Connections { + cancel: CancellationToken, + tasks: tokio::task::JoinSet, +} + +pub fn spawn( + conf: &'static PageServerConf, + tenant_manager: Arc, + pg_auth: Option>, + tcp_listener: tokio::net::TcpListener, +) -> Listener { + let cancel = CancellationToken::new(); + let libpq_ctx = RequestContext::todo_child( + TaskKind::LibpqEndpointListener, + // listener task shouldn't need to download anything. (We will + // create a separate sub-contexts for each connection, with their + // own download behavior. This context is used only to listen and + // accept connections.) + DownloadBehavior::Error, + ); + let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( + "libpq listener", + libpq_listener_main( + tenant_manager, + pg_auth, + tcp_listener, + conf.pg_auth_type, + libpq_ctx, + cancel.clone(), + ) + .map(anyhow::Ok), + )); + + Listener { cancel, task } +} + +impl Listener { + pub async fn stop_accepting(self) -> Connections { + self.cancel.cancel(); + self.task + .await + .expect("unreachable: we wrap the listener task in task_mgr::exit_on_panic_or_error") + } +} +impl Connections { + pub async fn shutdown(self) { + let Self { cancel, mut tasks } = self; + cancel.cancel(); + while let Some(res) = tasks.join_next().await { + // the logging done here mimics what was formerly done by task_mgr + match res { + Ok(Ok(())) => {} + Ok(Err(e)) => error!("error in page_service connection task: {:?}", e), + Err(e) => error!("page_service connection task panicked: {:?}", e), + } + } + } +} + /// /// Main loop of the page service. /// /// Listens for connections, and launches a new handler task for each. /// +/// Returns Ok(()) upon cancellation via `cancel`, returning the set of +/// open connections. +/// pub async fn libpq_listener_main( tenant_manager: Arc, auth: Option>, - listener: TcpListener, + listener: tokio::net::TcpListener, auth_type: AuthType, listener_ctx: RequestContext, - cancel: CancellationToken, -) -> anyhow::Result<()> { - listener.set_nonblocking(true)?; - let tokio_listener = tokio::net::TcpListener::from_std(listener)?; + listener_cancel: CancellationToken, +) -> Connections { + let connections_cancel = CancellationToken::new(); + let mut connection_handler_tasks = tokio::task::JoinSet::default(); // Wait for a new connection to arrive, or for server shutdown. while let Some(res) = tokio::select! { biased; - _ = cancel.cancelled() => { + _ = listener_cancel.cancelled() => { // We were requested to shut down. None } - res = tokio_listener.accept() => { + res = listener.accept() => { Some(res) } } { @@ -110,28 +173,16 @@ pub async fn libpq_listener_main( // Connection established. Spawn a new task to handle it. 
debug!("accepted connection from {}", peer_addr); let local_auth = auth.clone(); - let connection_ctx = listener_ctx .detached_child(TaskKind::PageRequestHandler, DownloadBehavior::Download); - - // PageRequestHandler tasks are not associated with any particular - // timeline in the task manager. In practice most connections will - // only deal with a particular timeline, but we don't know which one - // yet. - task_mgr::spawn( - &tokio::runtime::Handle::current(), - TaskKind::PageRequestHandler, - None, - None, - "serving compute connection task", - page_service_conn_main( - tenant_manager.clone(), - local_auth, - socket, - auth_type, - connection_ctx, - ), - ); + connection_handler_tasks.spawn(page_service_conn_main( + tenant_manager.clone(), + local_auth, + socket, + auth_type, + connection_ctx, + connections_cancel.child_token(), + )); } Err(err) => { // accept() failed. Log the error, and loop back to retry on next connection. @@ -140,11 +191,16 @@ pub async fn libpq_listener_main( } } - debug!("page_service loop terminated"); + debug!("page_service listener loop terminated"); - Ok(()) + Connections { + cancel: connections_cancel, + tasks: connection_handler_tasks, + } } +type ConnectionHandlerResult = anyhow::Result<()>; + #[instrument(skip_all, fields(peer_addr))] async fn page_service_conn_main( tenant_manager: Arc, @@ -152,7 +208,8 @@ async fn page_service_conn_main( socket: tokio::net::TcpStream, auth_type: AuthType, connection_ctx: RequestContext, -) -> anyhow::Result<()> { + cancel: CancellationToken, +) -> ConnectionHandlerResult { let _guard = LIVE_CONNECTIONS .with_label_values(&["page_service"]) .guard(); @@ -200,13 +257,11 @@ async fn page_service_conn_main( // and create a child per-query context when it invokes process_query. // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler // and create the per-query context in process_query ourselves. - let mut conn_handler = PageServerHandler::new(tenant_manager, auth, connection_ctx); + let mut conn_handler = + PageServerHandler::new(tenant_manager, auth, connection_ctx, cancel.clone()); let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; - match pgbackend - .run(&mut conn_handler, &task_mgr::shutdown_token()) - .await - { + match pgbackend.run(&mut conn_handler, &cancel).await { Ok(()) => { // we've been requested to shut down Ok(()) @@ -223,32 +278,154 @@ async fn page_service_conn_main( } } -/// While a handler holds a reference to a Timeline, it also holds a the -/// timeline's Gate open. -struct HandlerTimeline { - timeline: Arc, - _guard: GateGuard, -} - struct PageServerHandler { auth: Option>, claims: Option, - tenant_manager: Arc, - /// The context created for the lifetime of the connection /// services by this PageServerHandler. /// For each query received over the connection, /// `process_query` creates a child context from this one. connection_ctx: RequestContext, - /// See [`Self::cache_timeline`] for usage. - /// + cancel: CancellationToken, + + timeline_handles: TimelineHandles, +} + +struct TimelineHandles { + wrapper: TenantManagerWrapper, /// Note on size: the typical size of this map is 1. The largest size we expect /// to see is the number of shards divided by the number of pageservers (typically < 2), /// or the ratio used when splitting shards (i.e. how many children created from one) /// parent shard, where a "large" number might be ~8. 
- shard_timelines: HashMap, + handles: timeline::handle::Cache, +} + +impl TimelineHandles { + fn new(tenant_manager: Arc) -> Self { + Self { + wrapper: TenantManagerWrapper { + tenant_manager, + tenant_id: OnceCell::new(), + }, + handles: Default::default(), + } + } + async fn get( + &mut self, + tenant_id: TenantId, + timeline_id: TimelineId, + shard_selector: ShardSelector, + ) -> Result, GetActiveTimelineError> { + if *self.wrapper.tenant_id.get_or_init(|| tenant_id) != tenant_id { + return Err(GetActiveTimelineError::Tenant( + GetActiveTenantError::SwitchedTenant, + )); + } + self.handles + .get(timeline_id, shard_selector, &self.wrapper) + .await + .map_err(|e| match e { + timeline::handle::GetError::TenantManager(e) => e, + timeline::handle::GetError::TimelineGateClosed => { + trace!("timeline gate closed"); + GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown) + } + timeline::handle::GetError::PerTimelineStateShutDown => { + trace!("per-timeline state shut down"); + GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown) + } + }) + } +} + +pub(crate) struct TenantManagerWrapper { + tenant_manager: Arc, + // We do not support switching tenant_id on a connection at this point. + // We can can add support for this later if needed without changing + // the protocol. + tenant_id: once_cell::sync::OnceCell, +} + +#[derive(Debug)] +pub(crate) struct TenantManagerTypes; + +impl timeline::handle::Types for TenantManagerTypes { + type TenantManagerError = GetActiveTimelineError; + type TenantManager = TenantManagerWrapper; + type Timeline = Arc; +} + +impl timeline::handle::ArcTimeline for Arc { + fn gate(&self) -> &utils::sync::gate::Gate { + &self.gate + } + + fn shard_timeline_id(&self) -> timeline::handle::ShardTimelineId { + Timeline::shard_timeline_id(self) + } + + fn per_timeline_state(&self) -> &timeline::handle::PerTimelineState { + &self.handles + } + + fn get_shard_identity(&self) -> &pageserver_api::shard::ShardIdentity { + Timeline::get_shard_identity(self) + } +} + +impl timeline::handle::TenantManager for TenantManagerWrapper { + async fn resolve( + &self, + timeline_id: TimelineId, + shard_selector: ShardSelector, + ) -> Result, GetActiveTimelineError> { + let tenant_id = self.tenant_id.get().expect("we set this in get()"); + let timeout = ACTIVE_TENANT_TIMEOUT; + let wait_start = Instant::now(); + let deadline = wait_start + timeout; + let tenant_shard = loop { + let resolved = self + .tenant_manager + .resolve_attached_shard(tenant_id, shard_selector); + match resolved { + ShardResolveResult::Found(tenant_shard) => break tenant_shard, + ShardResolveResult::NotFound => { + return Err(GetActiveTimelineError::Tenant( + GetActiveTenantError::NotFound(GetTenantError::NotFound(*tenant_id)), + )); + } + ShardResolveResult::InProgress(barrier) => { + // We can't authoritatively answer right now: wait for InProgress state + // to end, then try again + tokio::select! 
{ + _ = barrier.wait() => { + // The barrier completed: proceed around the loop to try looking up again + }, + _ = tokio::time::sleep(deadline.duration_since(Instant::now())) => { + return Err(GetActiveTimelineError::Tenant(GetActiveTenantError::WaitForActiveTimeout { + latest_state: None, + wait_time: timeout, + })); + } + } + } + }; + }; + + tracing::debug!("Waiting for tenant to enter active state..."); + tenant_shard + .wait_to_become_active(deadline.duration_since(Instant::now())) + .await + .map_err(GetActiveTimelineError::Tenant)?; + + let timeline = tenant_shard + .get_timeline(timeline_id, true) + .map_err(GetActiveTimelineError::Timeline)?; + set_tracing_field_shard_id(&timeline); + Ok(timeline) + } } #[derive(thiserror::Error, Debug)] @@ -292,7 +469,11 @@ impl From for PageStreamError { impl From for PageStreamError { fn from(value: GetActiveTimelineError) -> Self { match value { - GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled) => Self::Shutdown, + GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled) + | GetActiveTimelineError::Tenant(GetActiveTenantError::WillNotBecomeActive( + TenantState::Stopping { .. }, + )) + | GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown) => Self::Shutdown, GetActiveTimelineError::Tenant(e) => Self::NotFound(format!("{e}").into()), GetActiveTimelineError::Timeline(e) => Self::NotFound(format!("{e}").into()), } @@ -324,64 +505,17 @@ impl PageServerHandler { tenant_manager: Arc, auth: Option>, connection_ctx: RequestContext, + cancel: CancellationToken, ) -> Self { PageServerHandler { - tenant_manager, auth, claims: None, connection_ctx, - shard_timelines: HashMap::new(), + timeline_handles: TimelineHandles::new(tenant_manager), + cancel, } } - /// Future that completes when we need to shut down the connection. - /// - /// We currently need to shut down when any of the following happens: - /// 1. any of the timelines we hold GateGuards for in `shard_timelines` is cancelled - /// 2. task_mgr requests shutdown of the connection - /// - /// NB on (1): the connection's lifecycle is not actually tied to any of the - /// `shard_timelines`s' lifecycles. But it's _necessary_ in the current - /// implementation to be responsive to timeline cancellation because - /// the connection holds their `GateGuards` open (sored in `shard_timelines`). - /// We currently do the easy thing and terminate the connection if any of the - /// shard_timelines gets cancelled. But really, we cuold spend more effort - /// and simply remove the cancelled timeline from the `shard_timelines`, thereby - /// dropping the guard. - /// - /// NB: keep in sync with [`Self::is_connection_cancelled`] - async fn await_connection_cancelled(&self) { - // A short wait before we expend the cycles to walk our timeline map. This avoids incurring - // that cost every time we check for cancellation. - tokio::time::sleep(Duration::from_millis(10)).await; - - // This function is never called concurrently with code that adds timelines to shard_timelines, - // which is enforced by the borrow checker (the future returned by this function carries the - // immutable &self). So it's fine to evaluate shard_timelines after the sleep, we don't risk - // missing any inserts to the map. 
- - let mut cancellation_sources = Vec::with_capacity(1 + self.shard_timelines.len()); - use futures::future::Either; - cancellation_sources.push(Either::Left(task_mgr::shutdown_watcher())); - cancellation_sources.extend( - self.shard_timelines - .values() - .map(|ht| Either::Right(ht.timeline.cancel.cancelled())), - ); - FuturesUnordered::from_iter(cancellation_sources) - .next() - .await; - } - - /// Checking variant of [`Self::await_connection_cancelled`]. - fn is_connection_cancelled(&self) -> bool { - task_mgr::is_shutdown_requested() - || self - .shard_timelines - .values() - .any(|ht| ht.timeline.cancel.is_cancelled() || ht.timeline.is_stopping()) - } - /// This function always respects cancellation of any timeline in `[Self::shard_timelines]`. Pass in /// a cancellation token at the next scope up (such as a tenant cancellation token) to ensure we respect /// cancellation if there aren't any timelines in the cache. @@ -400,15 +534,21 @@ impl PageServerHandler { flush_r = pgb.flush() => { Ok(flush_r?) }, - _ = self.await_connection_cancelled() => { - Err(QueryError::Shutdown) - } _ = cancel.cancelled() => { Err(QueryError::Shutdown) } ) } + /// Pagestream sub-protocol handler. + /// + /// It is a simple request-response protocol inside a COPYBOTH session. + /// + /// # Coding Discipline + /// + /// Coding discipline within this function: all interaction with the `pgb` connection + /// needs to be sensitive to connection shutdown, currently signalled via [`Self::cancel`]. + /// This is so that we can shutdown page_service quickly. #[instrument(skip_all)] async fn handle_pagerequests( &mut self, @@ -423,27 +563,27 @@ impl PageServerHandler { { debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); - let tenant = self - .get_active_tenant_with_timeout(tenant_id, ShardSelector::First, ACTIVE_TENANT_TIMEOUT) - .await?; - // switch client to COPYBOTH pgb.write_message_noflush(&BeMessage::CopyBothResponse)?; - self.flush_cancellable(pgb, &tenant.cancel).await?; + tokio::select! { + biased; + _ = self.cancel.cancelled() => { + return Err(QueryError::Shutdown) + } + res = pgb.flush() => { + res?; + } + } loop { + // read request bytes (it's exactly 1 PagestreamFeMessage per CopyData) let msg = tokio::select! { biased; - - _ = self.await_connection_cancelled() => { - // We were requested to shut down. - info!("shutdown request received in page handler"); + _ = self.cancel.cancelled() => { return Err(QueryError::Shutdown) } - msg = pgb.read_message() => { msg } }; - let copy_data_bytes = match msg? { Some(FeMessage::CopyData(bytes)) => bytes, Some(FeMessage::Terminate) => break, @@ -458,13 +598,12 @@ impl PageServerHandler { trace!("query: {copy_data_bytes:?}"); fail::fail_point!("ps::handle-pagerequest-message"); + // parse request let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?; - // TODO: We could create a new per-request context here, with unique ID. 
- // Currently we use the same per-timeline context for all requests - - let (response, span) = match neon_fe_msg { + // invoke handler function + let (handler_result, span) = match neon_fe_msg { PagestreamFeMessage::Exists(req) => { fail::fail_point!("ps::handle-pagerequest-message::exists"); let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn); @@ -518,31 +657,26 @@ impl PageServerHandler { } }; - match response { - Err(PageStreamError::Shutdown) => { - // If we fail to fulfil a request during shutdown, which may be _because_ of - // shutdown, then do not send the error to the client. Instead just drop the - // connection. - span.in_scope(|| info!("dropping connection due to shutdown")); - return Err(QueryError::Shutdown); - } - Err(PageStreamError::Reconnect(reason)) => { - span.in_scope(|| info!("handler requested reconnect: {reason}")); - return Err(QueryError::Reconnect); - } - Err(e) if self.is_connection_cancelled() => { - // This branch accomodates code within request handlers that returns an anyhow::Error instead of a clean - // shutdown error, this may be buried inside a PageReconstructError::Other for example. - // - // Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet, - // because wait_lsn etc will drop out - // is_stopping(): [`Timeline::flush_and_shutdown`] has entered - // is_canceled(): [`Timeline::shutdown`]` has entered - span.in_scope(|| info!("dropped error response during shutdown: {e:#}")); - return Err(QueryError::Shutdown); - } - r => { - let response_msg = r.unwrap_or_else(|e| { + // Map handler result to protocol behavior. + // Some handler errors cause exit from pagestream protocol. + // Other handler errors are sent back as an error message and we stay in pagestream protocol. + let response_msg = match handler_result { + Err(e) => match &e { + PageStreamError::Shutdown => { + // If we fail to fulfil a request during shutdown, which may be _because_ of + // shutdown, then do not send the error to the client. Instead just drop the + // connection. + span.in_scope(|| info!("dropping connection due to shutdown")); + return Err(QueryError::Shutdown); + } + PageStreamError::Reconnect(reason) => { + span.in_scope(|| info!("handler requested reconnect: {reason}")); + return Err(QueryError::Reconnect); + } + PageStreamError::Read(_) + | PageStreamError::LsnTimeout(_) + | PageStreamError::NotFound(_) + | PageStreamError::BadRequest(_) => { // print the all details to the log with {:#}, but for the client the // error message is enough. Do not log if shutting down, as the anyhow::Error // here includes cancellation which is not an error. @@ -553,10 +687,22 @@ impl PageServerHandler { PagestreamBeMessage::Error(PagestreamErrorResponse { message: e.to_string(), }) - }); + } + }, + Ok(response_msg) => response_msg, + }; - pgb.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?; - self.flush_cancellable(pgb, &tenant.cancel).await?; + // marshal & transmit response message + pgb.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?; + tokio::select! { + biased; + _ = self.cancel.cancelled() => { + // We were requested to shut down. 
+ info!("shutdown request received in page handler"); + return Err(QueryError::Shutdown) + } + res = pgb.flush() => { + res?; } } } @@ -644,7 +790,7 @@ impl PageServerHandler { #[instrument(skip_all, fields(shard_id, %lsn))] async fn handle_make_lsn_lease( - &self, + &mut self, pgb: &mut PostgresBackend, tenant_shard_id: TenantShardId, timeline_id: TimelineId, @@ -654,10 +800,16 @@ impl PageServerHandler { where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { - let shard_selector = ShardSelector::Known(tenant_shard_id.to_index()); let timeline = self - .get_active_tenant_timeline(tenant_shard_id.tenant_id, timeline_id, shard_selector) + .timeline_handles + .get( + tenant_shard_id.tenant_id, + timeline_id, + ShardSelector::Known(tenant_shard_id.to_index()), + ) .await?; + set_tracing_field_shard_id(&timeline); + let lease = timeline.make_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx)?; let valid_until = lease .valid_until @@ -683,14 +835,17 @@ impl PageServerHandler { req: &PagestreamExistsRequest, ctx: &RequestContext, ) -> Result { - let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?; + let timeline = self + .timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Zero) + .await?; let _timer = timeline .query_metrics .start_timer(metrics::SmgrQueryType::GetRelExists, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( - timeline, + &timeline, req.request_lsn, req.not_modified_since, &latest_gc_cutoff_lsn, @@ -715,7 +870,10 @@ impl PageServerHandler { req: &PagestreamNblocksRequest, ctx: &RequestContext, ) -> Result { - let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?; + let timeline = self + .timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Zero) + .await?; let _timer = timeline .query_metrics @@ -723,7 +881,7 @@ impl PageServerHandler { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( - timeline, + &timeline, req.request_lsn, req.not_modified_since, &latest_gc_cutoff_lsn, @@ -748,7 +906,10 @@ impl PageServerHandler { req: &PagestreamDbSizeRequest, ctx: &RequestContext, ) -> Result { - let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?; + let timeline = self + .timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Zero) + .await?; let _timer = timeline .query_metrics @@ -756,7 +917,7 @@ impl PageServerHandler { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( - timeline, + &timeline, req.request_lsn, req.not_modified_since, &latest_gc_cutoff_lsn, @@ -774,122 +935,6 @@ impl PageServerHandler { })) } - /// For most getpage requests, we will already have a Timeline to serve the request: this function - /// looks up such a Timeline synchronously and without touching any global state. - fn get_cached_timeline_for_page( - &mut self, - req: &PagestreamGetPageRequest, - ) -> Result<&Arc, Key> { - let key = if let Some((first_idx, first_timeline)) = self.shard_timelines.iter().next() { - // Fastest path: single sharded case - if first_idx.shard_count.count() == 1 { - return Ok(&first_timeline.timeline); - } - - let key = rel_block_to_key(req.rel, req.blkno); - let shard_num = first_timeline - .timeline - .get_shard_identity() - .get_shard_number(&key); - - // Fast path: matched the first timeline in our local handler map. This case is common if - // only one shard per tenant is attached to this pageserver. 
- if first_timeline.timeline.get_shard_identity().number == shard_num { - return Ok(&first_timeline.timeline); - } - - let shard_index = ShardIndex { - shard_number: shard_num, - shard_count: first_timeline.timeline.get_shard_identity().count, - }; - - // Fast-ish path: timeline is in the connection handler's local cache - if let Some(found) = self.shard_timelines.get(&shard_index) { - return Ok(&found.timeline); - } - - key - } else { - rel_block_to_key(req.rel, req.blkno) - }; - - Err(key) - } - - /// Having looked up the [`Timeline`] instance for a particular shard, cache it to enable - /// use in future requests without having to traverse [`crate::tenant::mgr::TenantManager`] - /// again. - /// - /// Note that all the Timelines in this cache are for the same timeline_id: they're differ - /// in which shard they belong to. When we serve a getpage@lsn request, we choose a shard - /// based on key. - /// - /// The typical size of this cache is 1, as we generally create shards to distribute work - /// across pageservers, so don't tend to have multiple shards for the same tenant on the - /// same pageserver. - fn cache_timeline( - &mut self, - timeline: Arc, - ) -> Result<&Arc, GetActiveTimelineError> { - let gate_guard = timeline - .gate - .enter() - .map_err(|_| GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled))?; - - let shard_index = timeline.tenant_shard_id.to_index(); - let entry = self - .shard_timelines - .entry(shard_index) - .or_insert(HandlerTimeline { - timeline, - _guard: gate_guard, - }); - - Ok(&entry.timeline) - } - - /// If [`Self::get_cached_timeline_for_page`] missed, then this function is used to populate the cache with - /// a Timeline to serve requests for this key, if such a Timeline is present on this pageserver. If no such - /// Timeline is found, then we will return an error (this indicates that the client is talking to the wrong node). - async fn load_timeline_for_page( - &mut self, - tenant_id: TenantId, - timeline_id: TimelineId, - key: Key, - ) -> anyhow::Result<&Arc, GetActiveTimelineError> { - // Slow path: we must call out to the TenantManager to find the timeline for this Key - let timeline = self - .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Page(key)) - .await?; - - self.cache_timeline(timeline) - } - - async fn get_timeline_shard_zero( - &mut self, - tenant_id: TenantId, - timeline_id: TimelineId, - ) -> anyhow::Result<&Arc, GetActiveTimelineError> { - // This is a borrow-checker workaround: we can't return from inside of the `if let Some` because - // that would be an immutable-borrow-self return, whereas later in the function we will use a mutable - // ref to salf. So instead, we first build a bool, and then return while not borrowing self. - let have_cached = if let Some((idx, _tl)) = self.shard_timelines.iter().next() { - idx.shard_number == ShardNumber(0) - } else { - false - }; - - if have_cached { - let entry = self.shard_timelines.iter().next().unwrap(); - Ok(&entry.1.timeline) - } else { - let timeline = self - .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero) - .await?; - Ok(self.cache_timeline(timeline)?) 
- } - } - #[instrument(skip_all, fields(shard_id))] async fn handle_get_page_at_lsn_request( &mut self, @@ -898,33 +943,30 @@ impl PageServerHandler { req: &PagestreamGetPageRequest, ctx: &RequestContext, ) -> Result { - let timeline = match self.get_cached_timeline_for_page(req) { - Ok(tl) => { - set_tracing_field_shard_id(tl); - tl - } - Err(key) => { - match self - .load_timeline_for_page(tenant_id, timeline_id, key) - .await - { - Ok(t) => t, - Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => { - // We already know this tenant exists in general, because we resolved it at - // start of connection. Getting a NotFound here indicates that the shard containing - // the requested page is not present on this node: the client's knowledge of shard->pageserver - // mapping is out of date. - // - // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via - // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration - // and talk to a different pageserver. - return Err(PageStreamError::Reconnect( - "getpage@lsn request routed to wrong shard".into(), - )); - } - Err(e) => return Err(e.into()), - } + let timeline = match self + .timeline_handles + .get( + tenant_id, + timeline_id, + ShardSelector::Page(rel_block_to_key(req.rel, req.blkno)), + ) + .await + { + Ok(tl) => tl, + Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => { + // We already know this tenant exists in general, because we resolved it at + // start of connection. Getting a NotFound here indicates that the shard containing + // the requested page is not present on this node: the client's knowledge of shard->pageserver + // mapping is out of date. + // + // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via + // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration + // and talk to a different pageserver. + return Err(PageStreamError::Reconnect( + "getpage@lsn request routed to wrong shard".into(), + )); } + Err(e) => return Err(e.into()), }; let _timer = timeline @@ -933,7 +975,7 @@ impl PageServerHandler { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( - timeline, + &timeline, req.request_lsn, req.not_modified_since, &latest_gc_cutoff_lsn, @@ -958,7 +1000,10 @@ impl PageServerHandler { req: &PagestreamGetSlruSegmentRequest, ctx: &RequestContext, ) -> Result { - let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?; + let timeline = self + .timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Zero) + .await?; let _timer = timeline .query_metrics @@ -966,7 +1011,7 @@ impl PageServerHandler { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( - timeline, + &timeline, req.request_lsn, req.not_modified_since, &latest_gc_cutoff_lsn, @@ -987,6 +1032,15 @@ impl PageServerHandler { /// Full basebackups should only be used for debugging purposes. /// Originally, it was introduced to enable breaking storage format changes, /// but that is not applicable anymore. + /// + /// # Coding Discipline + /// + /// Coding discipline within this function: all interaction with the `pgb` connection + /// needs to be sensitive to connection shutdown, currently signalled via [`Self::cancel`]. + /// This is so that we can shutdown page_service quickly. 
+ /// + /// TODO: wrap the pgb that we pass to the basebackup handler so that it's sensitive + /// to connection cancellation. #[allow(clippy::too_many_arguments)] #[instrument(skip_all, fields(shard_id, ?lsn, ?prev_lsn, %full_backup))] async fn handle_basebackup_request( @@ -1012,10 +1066,11 @@ impl PageServerHandler { let started = std::time::Instant::now(); - // check that the timeline exists let timeline = self - .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero) + .timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Zero) .await?; + let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); if let Some(lsn) = lsn { // Backup was requested at a particular LSN. Wait for it to arrive. @@ -1037,7 +1092,7 @@ impl PageServerHandler { // switch client to COPYOUT pgb.write_message_noflush(&BeMessage::CopyOutResponse) .map_err(QueryError::Disconnected)?; - self.flush_cancellable(pgb, &timeline.cancel).await?; + self.flush_cancellable(pgb, &self.cancel).await?; // Send a tarball of the latest layer on the timeline. Compress if not // fullbackup. TODO Compress in that case too (tests need to be updated) @@ -1128,77 +1183,6 @@ impl PageServerHandler { .expect("claims presence already checked"); check_permission(claims, tenant_id).map_err(|e| QueryError::Unauthorized(e.0)) } - - /// Shorthand for getting a reference to a Timeline of an Active tenant. - async fn get_active_tenant_timeline( - &self, - tenant_id: TenantId, - timeline_id: TimelineId, - selector: ShardSelector, - ) -> Result, GetActiveTimelineError> { - let tenant = self - .get_active_tenant_with_timeout(tenant_id, selector, ACTIVE_TENANT_TIMEOUT) - .await - .map_err(GetActiveTimelineError::Tenant)?; - let timeline = tenant.get_timeline(timeline_id, true)?; - set_tracing_field_shard_id(&timeline); - Ok(timeline) - } - - /// Get a shard's [`Tenant`] in its active state, if present. If we don't find the shard and some - /// slots for this tenant are `InProgress` then we will wait. - /// If we find the [`Tenant`] and it's not yet in state [`TenantState::Active`], we will wait. - /// - /// `timeout` is used as a total timeout for the whole wait operation. - async fn get_active_tenant_with_timeout( - &self, - tenant_id: TenantId, - shard_selector: ShardSelector, - timeout: Duration, - ) -> Result, GetActiveTenantError> { - let wait_start = Instant::now(); - let deadline = wait_start + timeout; - - // Resolve TenantId to TenantShardId. This is usually a quick one-shot thing, the loop is - // for handling the rare case that the slot we're accessing is InProgress. - let tenant_shard = loop { - let resolved = self - .tenant_manager - .resolve_attached_shard(&tenant_id, shard_selector); - match resolved { - ShardResolveResult::Found(tenant_shard) => break tenant_shard, - ShardResolveResult::NotFound => { - return Err(GetActiveTenantError::NotFound(GetTenantError::NotFound( - tenant_id, - ))); - } - ShardResolveResult::InProgress(barrier) => { - // We can't authoritatively answer right now: wait for InProgress state - // to end, then try again - tokio::select! 
{ - _ = self.await_connection_cancelled() => { - return Err(GetActiveTenantError::Cancelled) - }, - _ = barrier.wait() => { - // The barrier completed: proceed around the loop to try looking up again - }, - _ = tokio::time::sleep(deadline.duration_since(Instant::now())) => { - return Err(GetActiveTenantError::WaitForActiveTimeout { - latest_state: None, - wait_time: timeout, - }); - } - } - } - }; - }; - - tracing::debug!("Waiting for tenant to enter active state..."); - tenant_shard - .wait_to_become_active(deadline.duration_since(Instant::now())) - .await?; - Ok(tenant_shard) - } } #[async_trait::async_trait] @@ -1505,7 +1489,7 @@ impl From for QueryError { } #[derive(Debug, thiserror::Error)] -enum GetActiveTimelineError { +pub(crate) enum GetActiveTimelineError { #[error(transparent)] Tenant(GetActiveTenantError), #[error(transparent)] diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 48c1851a3a..5d0e963b4e 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -386,6 +386,8 @@ impl WalRedoManager { #[derive(Debug, thiserror::Error, PartialEq, Eq)] pub enum GetTimelineError { + #[error("Timeline is shutting down")] + ShuttingDown, #[error("Timeline {tenant_id}/{timeline_id} is not active, state: {state:?}")] NotActive { tenant_id: TenantShardId, diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 5e1f69f4c1..58f8990892 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -116,8 +116,6 @@ pub(crate) enum ShardSelector { /// Only return the 0th shard, if it is present. If a non-0th shard is present, /// ignore it. Zero, - /// Pick the first shard we find for the TenantId - First, /// Pick the shard that holds this key Page(Key), /// The shard ID is known: pick the given shard @@ -2088,7 +2086,6 @@ impl TenantManager { }; match selector { - ShardSelector::First => return ShardResolveResult::Found(tenant.clone()), ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => { return ShardResolveResult::Found(tenant.clone()) } @@ -2170,6 +2167,9 @@ pub(crate) enum GetActiveTenantError { /// never happen. #[error("Tenant is broken: {0}")] Broken(String), + + #[error("reconnect to switch tenant id")] + SwitchedTenant, } #[derive(Debug, thiserror::Error)] diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 4db44a3a19..ecae443079 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3,6 +3,7 @@ pub(crate) mod compaction; pub mod delete; pub(crate) mod detach_ancestor; mod eviction_task; +pub(crate) mod handle; mod init; pub mod layer_manager; pub(crate) mod logical_size; @@ -17,6 +18,7 @@ use camino::Utf8Path; use chrono::{DateTime, Utc}; use enumset::EnumSet; use fail::fail_point; +use handle::ShardTimelineId; use once_cell::sync::Lazy; use pageserver_api::{ key::{ @@ -424,6 +426,8 @@ pub struct Timeline { pub(crate) extra_test_dense_keyspace: ArcSwap, pub(crate) l0_flush_global_state: L0FlushGlobalState, + + pub(crate) handles: handle::PerTimelineState, } pub struct WalReceiverInfo { @@ -1915,6 +1919,9 @@ impl Timeline { tracing::debug!("Cancelling CancellationToken"); self.cancel.cancel(); + // Ensure Prevent new page service requests from starting. + self.handles.shutdown(); + // Transition the remote_client into a state where it's only useful for timeline deletion. // (The deletion use case is why we can't just hook up remote_client to Self::cancel).) 
self.remote_client.stop(); @@ -2440,6 +2447,8 @@ impl Timeline { extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())), l0_flush_global_state: resources.l0_flush_global_state, + + handles: Default::default(), }; result.repartition_threshold = result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE; @@ -3709,6 +3718,17 @@ impl Timeline { &self.shard_identity } + #[inline(always)] + pub(crate) fn shard_timeline_id(&self) -> ShardTimelineId { + ShardTimelineId { + shard_index: ShardIndex { + shard_number: self.shard_identity.number, + shard_count: self.shard_identity.count, + }, + timeline_id: self.timeline_id, + } + } + /// /// Get a handle to the latest layer for appending. /// diff --git a/pageserver/src/tenant/timeline/handle.rs b/pageserver/src/tenant/timeline/handle.rs new file mode 100644 index 0000000000..e82559b8b3 --- /dev/null +++ b/pageserver/src/tenant/timeline/handle.rs @@ -0,0 +1,967 @@ +//! An efficient way to keep the timeline gate open without preventing +//! timeline shutdown for longer than a single call to a timeline method. +//! +//! # Motivation +//! +//! On a single page service connection, we're typically serving a single TenantTimelineId. +//! +//! Without sharding, there is a single Timeline object to which we dispatch +//! all requests. For example, a getpage request gets dispatched to the +//! Timeline::get method of the Timeline object that represents the +//! (tenant,timeline) of that connection. +//! +//! With sharding, for each request that comes in on the connection, +//! we first have to perform shard routing based on the requested key (=~ page number). +//! The result of shard routing is a Timeline object. +//! We then dispatch the request to that Timeline object. +//! +//! Regardless of whether the tenant is sharded or not, we want to ensure that +//! we hold the Timeline gate open while we're invoking the method on the +//! Timeline object. +//! +//! However, we want to avoid the overhead of entering the gate for every +//! method invocation. +//! +//! Further, for shard routing, we want to avoid calling the tenant manager to +//! resolve the shard for every request. Instead, we want to cache the +//! routing result so we can bypass the tenant manager for all subsequent requests +//! that get routed to that shard. +//! +//! Regardless of how we accomplish the above, it should not +//! prevent the Timeline from shutting down promptly. +//! +//! # Design +//! +//! There are three user-facing data structures: +//! - `PerTimelineState`: a struct embedded into each Timeline struct. Lifetime == Timeline lifetime. +//! - `Cache`: a struct private to each connection handler; Lifetime == connection lifetime. +//! - `Handle`: a smart pointer that holds the Timeline gate open and derefs to `&Timeline`. +//! Lifetime: for a single request dispatch on the Timeline (i.e., one getpage request) +//! +//! The `Handle` is just a wrapper around an `Arc`. +//! +//! There is one long-lived `Arc`, which is stored in the `PerTimelineState`. +//! The `Cache` stores a `Weak` for each cached Timeline. +//! +//! To dispatch a request, the page service connection calls `Cache::get`. +//! +//! A cache miss means we consult the tenant manager for shard routing, +//! resulting in an `Arc`. We enter its gate _once_ and construct an +//! `Arc`. We store a `Weak` in the cache +//! and the `Arc` in the `PerTimelineState`. +//! +//! For subsequent requests, `Cache::get` will perform a "fast path" shard routing +//! and find the `Weak` in the cache. +//! 
We upgrade the `Weak` to an `Arc` and wrap it in the user-facing `Handle` type. +//! +//! The request handler dispatches the request to the right `>::$request_method`. +//! It then drops the `Handle`, which drops the `Arc`. +//! +//! # Memory Management / How The Reference Cycle Is Broken +//! +//! The attentive reader may have noticed the strong reference cycle +//! from `Arc` to `PerTimelineState` to `Arc`. +//! +//! This cycle is intentional: while it exists, the `Cache` can upgrade its +//! `Weak` to an `Arc` in a single atomic operation. +//! +//! The cycle is broken by either +//! - `PerTimelineState::shutdown` or +//! - dropping the `Cache`. +//! +//! Concurrently existing `Handle`s will extend the existence of the cycle. +//! However, since `Handle`s are short-lived and new `Handle`s are not +//! handed out after either `PerTimelineState::shutdown` or `Cache` drop, +//! that extension of the cycle is bounded. +//! +//! # Fast Path for Shard Routing +//! +//! The `Cache` has a fast path for shard routing to avoid calling into +//! the tenant manager for every request. +//! +//! The `Cache` maintains a hash map of `ShardTimelineId` to `Weak`. +//! +//! The current implementation uses the first entry in the hash map +//! to determine the `ShardParameters` and derive the correct +//! `ShardIndex` for the requested key. +//! +//! It then looks up the hash map for that `ShardTimelineId := {ShardIndex,TimelineId}`. +//! +//! If the lookup is successful and the `Weak` can be upgraded, +//! it's a hit. +//! +//! ## Cache invalidation +//! +//! The insight is that cache invalidation is sufficient and most efficiently done lazily. +//! The only reasons why an entry in the cache can become stale are: +//! 1. The `PerTimelineState` / Timeline is shutting down e.g. because the shard is +//! being detached, timeline or shard deleted, or pageserver is shutting down. +//! 2. We're doing a shard split and new traffic should be routed to the child shards. +//! +//! Regarding (1), we will eventually fail to upgrade the `Weak` once the +//! timeline has shut down, and when that happens, we remove the entry from the cache. +//! +//! Regarding (2), the insight is that it is toally fine to keep dispatching requests +//! to the parent shard during a shard split. Eventually, the shard split task will +//! shut down the parent => case (1). + +use std::collections::hash_map; +use std::collections::HashMap; +use std::sync::atomic::AtomicBool; +use std::sync::atomic::Ordering; +use std::sync::Arc; +use std::sync::Mutex; +use std::sync::Weak; + +use pageserver_api::shard::ShardIdentity; +use tracing::instrument; +use tracing::trace; +use utils::id::TimelineId; +use utils::shard::ShardIndex; +use utils::shard::ShardNumber; + +use crate::tenant::mgr::ShardSelector; + +/// The requirement for Debug is so that #[derive(Debug)] works in some places. +pub(crate) trait Types: Sized + std::fmt::Debug { + type TenantManagerError: Sized + std::fmt::Debug; + type TenantManager: TenantManager + Sized; + type Timeline: ArcTimeline + Sized; +} + +/// Uniquely identifies a [`Cache`] instance over the lifetime of the process. +/// Required so [`Cache::drop`] can take out the handles from the [`PerTimelineState`]. +/// Alternative to this would be to allocate [`Cache`] in a `Box` and identify it by the pointer. 
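// Editor's note: an illustrative sketch, not part of the patch. It shows how a
// page service connection is expected to drive this module, per the module
// comment above. `conn_cache`, `tenant_mgr`, `timeline_id` and `key` are
// hypothetical placeholders, and the concrete `T: Types` wiring is not shown.
//
//     // One `Cache` per connection, dropped when the connection closes.
//     let mut conn_cache: Cache<T> = Cache::default();
//
//     // Per request: cached shard routing, then a short-lived `Handle`.
//     let handle = conn_cache
//         .get(timeline_id, ShardSelector::Page(key), &tenant_mgr)
//         .await?;
//
//     // `*handle` is the timeline; its gate stays open only until `handle` drops.
//     drop(handle);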
+#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] +struct CacheId(u64); + +impl CacheId { + fn next() -> Self { + static NEXT_ID: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(1); + let id = NEXT_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed); + if id == 0 { + panic!("CacheId::new() returned 0, overflow"); + } + Self(id) + } +} + +/// See module-level comment. +pub(crate) struct Cache { + id: CacheId, + map: Map, +} + +type Map = HashMap>>; + +impl Default for Cache { + fn default() -> Self { + Self { + id: CacheId::next(), + map: Default::default(), + } + } +} + +#[derive(PartialEq, Eq, Debug, Hash, Clone, Copy)] +pub(crate) struct ShardTimelineId { + pub(crate) shard_index: ShardIndex, + pub(crate) timeline_id: TimelineId, +} + +/// See module-level comment. +pub(crate) struct Handle(Arc>); +struct HandleInner { + shut_down: AtomicBool, + timeline: T::Timeline, + // The timeline's gate held open. + _gate_guard: utils::sync::gate::GateGuard, +} + +/// Embedded in each [`Types::Timeline`] as the anchor for the only long-lived strong ref to `HandleInner`. +/// +/// See module-level comment for details. +pub struct PerTimelineState { + // None = shutting down + handles: Mutex>>>>, +} + +impl Default for PerTimelineState { + fn default() -> Self { + Self { + handles: Mutex::new(Some(Default::default())), + } + } +} + +/// Abstract view of [`crate::tenant::mgr`], for testability. +pub(crate) trait TenantManager { + /// Invoked by [`Cache::get`] to resolve a [`ShardTimelineId`] to a [`Types::Timeline`]. + /// Errors are returned as [`GetError::TenantManager`]. + async fn resolve( + &self, + timeline_id: TimelineId, + shard_selector: ShardSelector, + ) -> Result; +} + +/// Abstract view of an [`Arc`], for testability. +pub(crate) trait ArcTimeline: Clone { + fn gate(&self) -> &utils::sync::gate::Gate; + fn shard_timeline_id(&self) -> ShardTimelineId; + fn get_shard_identity(&self) -> &ShardIdentity; + fn per_timeline_state(&self) -> &PerTimelineState; +} + +/// Errors returned by [`Cache::get`]. +#[derive(Debug)] +pub(crate) enum GetError { + TenantManager(T::TenantManagerError), + TimelineGateClosed, + PerTimelineStateShutDown, +} + +/// Internal type used in [`Cache::get`]. +enum RoutingResult { + FastPath(Handle), + SlowPath(ShardTimelineId), + NeedConsultTenantManager, +} + +impl Cache { + /// See module-level comment for details. + /// + /// Does NOT check for the shutdown state of [`Types::Timeline`]. + /// Instead, the methods of [`Types::Timeline`] that are invoked through + /// the [`Handle`] are responsible for checking these conditions + /// and if so, return an error that causes the page service to + /// close the connection. + #[instrument(level = "trace", skip_all)] + pub(crate) async fn get( + &mut self, + timeline_id: TimelineId, + shard_selector: ShardSelector, + tenant_manager: &T::TenantManager, + ) -> Result, GetError> { + // terminates because each iteration removes an element from the map + loop { + let handle = self + .get_impl(timeline_id, shard_selector, tenant_manager) + .await?; + if handle.0.shut_down.load(Ordering::Relaxed) { + let removed = self + .map + .remove(&handle.0.timeline.shard_timeline_id()) + .expect("invariant of get_impl is that the returned handle is in the map"); + assert!( + Weak::ptr_eq(&removed, &Arc::downgrade(&handle.0)), + "shard_timeline_id() incorrect?" 
+ ); + } else { + return Ok(handle); + } + } + } + + #[instrument(level = "trace", skip_all)] + async fn get_impl( + &mut self, + timeline_id: TimelineId, + shard_selector: ShardSelector, + tenant_manager: &T::TenantManager, + ) -> Result, GetError> { + let miss: ShardSelector = { + let routing_state = self.shard_routing(timeline_id, shard_selector); + match routing_state { + RoutingResult::FastPath(handle) => return Ok(handle), + RoutingResult::SlowPath(key) => match self.map.get(&key) { + Some(cached) => match cached.upgrade() { + Some(upgraded) => return Ok(Handle(upgraded)), + None => { + trace!("handle cache stale"); + self.map.remove(&key).unwrap(); + ShardSelector::Known(key.shard_index) + } + }, + None => ShardSelector::Known(key.shard_index), + }, + RoutingResult::NeedConsultTenantManager => shard_selector, + } + }; + self.get_miss(timeline_id, miss, tenant_manager).await + } + + #[inline(always)] + fn shard_routing( + &mut self, + timeline_id: TimelineId, + shard_selector: ShardSelector, + ) -> RoutingResult { + loop { + // terminates because when every iteration we remove an element from the map + let Some((first_key, first_handle)) = self.map.iter().next() else { + return RoutingResult::NeedConsultTenantManager; + }; + let Some(first_handle) = first_handle.upgrade() else { + // TODO: dedup with get() + trace!("handle cache stale"); + let first_key_owned = *first_key; + self.map.remove(&first_key_owned).unwrap(); + continue; + }; + + let first_handle_shard_identity = first_handle.timeline.get_shard_identity(); + let make_shard_index = |shard_num: ShardNumber| ShardIndex { + shard_number: shard_num, + shard_count: first_handle_shard_identity.count, + }; + + let need_idx = match shard_selector { + ShardSelector::Page(key) => { + make_shard_index(first_handle_shard_identity.get_shard_number(&key)) + } + ShardSelector::Zero => make_shard_index(ShardNumber(0)), + ShardSelector::Known(shard_idx) => shard_idx, + }; + let need_shard_timeline_id = ShardTimelineId { + shard_index: need_idx, + timeline_id, + }; + let first_handle_shard_timeline_id = ShardTimelineId { + shard_index: first_handle_shard_identity.shard_index(), + timeline_id: first_handle.timeline.shard_timeline_id().timeline_id, + }; + + if need_shard_timeline_id == first_handle_shard_timeline_id { + return RoutingResult::FastPath(Handle(first_handle)); + } else { + return RoutingResult::SlowPath(need_shard_timeline_id); + } + } + } + + #[instrument(level = "trace", skip_all)] + #[inline(always)] + async fn get_miss( + &mut self, + timeline_id: TimelineId, + shard_selector: ShardSelector, + tenant_manager: &T::TenantManager, + ) -> Result, GetError> { + match tenant_manager.resolve(timeline_id, shard_selector).await { + Ok(timeline) => { + let key = timeline.shard_timeline_id(); + match &shard_selector { + ShardSelector::Zero => assert_eq!(key.shard_index.shard_number, ShardNumber(0)), + ShardSelector::Page(_) => (), // gotta trust tenant_manager + ShardSelector::Known(idx) => assert_eq!(idx, &key.shard_index), + } + + let gate_guard = match timeline.gate().enter() { + Ok(guard) => guard, + Err(_) => { + return Err(GetError::TimelineGateClosed); + } + }; + trace!("creating new HandleInner"); + let handle = Arc::new( + // TODO: global metric that keeps track of the number of live HandlerTimeline instances + // so we can identify reference cycle bugs. 
+ HandleInner { + shut_down: AtomicBool::new(false), + _gate_guard: gate_guard, + timeline: timeline.clone(), + }, + ); + let handle = { + let mut lock_guard = timeline + .per_timeline_state() + .handles + .lock() + .expect("mutex poisoned"); + match &mut *lock_guard { + Some(per_timeline_state) => { + let replaced = per_timeline_state.insert(self.id, Arc::clone(&handle)); + assert!(replaced.is_none(), "some earlier code left a stale handle"); + match self.map.entry(key) { + hash_map::Entry::Occupied(_o) => { + // This cannot not happen because + // 1. we're the _miss_ handle, i.e., `self.map` didn't contain an entry and + // 2. we were holding &mut self during .resolve().await above, so, no other thread can have inserted a handle + // while we were waiting for the tenant manager. + unreachable!() + } + hash_map::Entry::Vacant(v) => { + v.insert(Arc::downgrade(&handle)); + handle + } + } + } + None => { + return Err(GetError::PerTimelineStateShutDown); + } + } + }; + Ok(Handle(handle)) + } + Err(e) => Err(GetError::TenantManager(e)), + } + } +} + +impl PerTimelineState { + /// After this method returns, [`Cache::get`] will never again return a [`Handle`] + /// to the [`Types::Timeline`] that embeds this per-timeline state. + /// Even if [`TenantManager::resolve`] would still resolve to it. + /// + /// Already-alive [`Handle`]s for will remain open, usable, and keeping the [`ArcTimeline`] alive. + /// That's ok because they're short-lived. See module-level comment for details. + #[instrument(level = "trace", skip_all)] + pub(super) fn shutdown(&self) { + let handles = self + .handles + .lock() + .expect("mutex poisoned") + // NB: this .take() sets locked to None. + // That's what makes future `Cache::get` misses fail. + // Cache hits are taken care of below. + .take(); + let Some(handles) = handles else { + trace!("already shut down"); + return; + }; + for handle in handles.values() { + // Make hits fail. + handle.shut_down.store(true, Ordering::Relaxed); + } + drop(handles); + } +} + +impl std::ops::Deref for Handle { + type Target = T::Timeline; + fn deref(&self) -> &Self::Target { + &self.0.timeline + } +} + +#[cfg(test)] +impl Drop for HandleInner { + fn drop(&mut self) { + trace!("HandleInner dropped"); + } +} + +// When dropping a [`Cache`], prune its handles in the [`PerTimelineState`] to break the reference cycle. +impl Drop for Cache { + fn drop(&mut self) { + for (_, weak) in self.map.drain() { + if let Some(strong) = weak.upgrade() { + // handle is still being kept alive in PerTimelineState + let timeline = strong.timeline.per_timeline_state(); + let mut handles = timeline.handles.lock().expect("mutex poisoned"); + if let Some(handles) = &mut *handles { + let Some(removed) = handles.remove(&self.id) else { + // There could have been a shutdown inbetween us upgrading the weak and locking the mutex. 
+ continue; + }; + assert!(Arc::ptr_eq(&removed, &strong)); + } + } + } + } +} + +#[cfg(test)] +mod tests { + use pageserver_api::{ + key::{rel_block_to_key, Key, DBDIR_KEY}, + models::ShardParameters, + reltag::RelTag, + shard::ShardStripeSize, + }; + use utils::shard::ShardCount; + + use super::*; + + const FOREVER: std::time::Duration = std::time::Duration::from_secs(u64::MAX); + + #[derive(Debug)] + struct TestTypes; + impl Types for TestTypes { + type TenantManagerError = anyhow::Error; + type TenantManager = StubManager; + type Timeline = Arc; + } + + struct StubManager { + shards: Vec>, + } + + struct StubTimeline { + gate: utils::sync::gate::Gate, + id: TimelineId, + shard: ShardIdentity, + per_timeline_state: PerTimelineState, + myself: Weak, + } + + impl StubTimeline { + fn getpage(&self) { + // do nothing + } + } + + impl ArcTimeline for Arc { + fn gate(&self) -> &utils::sync::gate::Gate { + &self.gate + } + + fn shard_timeline_id(&self) -> ShardTimelineId { + ShardTimelineId { + shard_index: self.shard.shard_index(), + timeline_id: self.id, + } + } + + fn get_shard_identity(&self) -> &ShardIdentity { + &self.shard + } + + fn per_timeline_state(&self) -> &PerTimelineState { + &self.per_timeline_state + } + } + + impl TenantManager for StubManager { + async fn resolve( + &self, + timeline_id: TimelineId, + shard_selector: ShardSelector, + ) -> anyhow::Result> { + for timeline in &self.shards { + if timeline.id == timeline_id { + match &shard_selector { + ShardSelector::Zero if timeline.shard.is_shard_zero() => { + return Ok(Arc::clone(timeline)); + } + ShardSelector::Zero => continue, + ShardSelector::Page(key) if timeline.shard.is_key_local(key) => { + return Ok(Arc::clone(timeline)); + } + ShardSelector::Page(_) => continue, + ShardSelector::Known(idx) if idx == &timeline.shard.shard_index() => { + return Ok(Arc::clone(timeline)); + } + ShardSelector::Known(_) => continue, + } + } + } + anyhow::bail!("not found") + } + } + + #[tokio::test(start_paused = true)] + async fn test_timeline_shutdown() { + crate::tenant::harness::setup_logging(); + + let timeline_id = TimelineId::generate(); + let shard0 = Arc::new_cyclic(|myself| StubTimeline { + gate: Default::default(), + id: timeline_id, + shard: ShardIdentity::unsharded(), + per_timeline_state: PerTimelineState::default(), + myself: myself.clone(), + }); + let mgr = StubManager { + shards: vec![shard0.clone()], + }; + let key = DBDIR_KEY; + + let mut cache = Cache::::default(); + + // + // fill the cache + // + assert_eq!( + (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), + (2, 1), + "strong: shard0, mgr; weak: myself" + ); + + let handle: Handle<_> = cache + .get(timeline_id, ShardSelector::Page(key), &mgr) + .await + .expect("we have the timeline"); + let handle_inner_weak = Arc::downgrade(&handle.0); + assert!(Weak::ptr_eq(&handle.myself, &shard0.myself)); + assert_eq!( + ( + Weak::strong_count(&handle_inner_weak), + Weak::weak_count(&handle_inner_weak) + ), + (2, 2), + "strong: handle, per_timeline_state, weak: handle_inner_weak, cache" + ); + assert_eq!(cache.map.len(), 1); + + assert_eq!( + (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), + (3, 1), + "strong: handleinner(per_timeline_state), shard0, mgr; weak: myself" + ); + drop(handle); + assert_eq!( + (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), + (3, 1), + "strong: handleinner(per_timeline_state), shard0, mgr; weak: myself" + ); + + // + // demonstrate that Handle holds up gate closure + // but shutdown prevents new handles from being handed 
out + // + + tokio::select! { + _ = shard0.gate.close() => { + panic!("cache and per-timeline handler state keep cache open"); + } + _ = tokio::time::sleep(FOREVER) => { + // NB: first poll of close() makes it enter closing state + } + } + + let handle = cache + .get(timeline_id, ShardSelector::Page(key), &mgr) + .await + .expect("we have the timeline"); + assert!(Weak::ptr_eq(&handle.myself, &shard0.myself)); + + // SHUTDOWN + shard0.per_timeline_state.shutdown(); // keeping handle alive across shutdown + + assert_eq!( + 1, + Weak::strong_count(&handle_inner_weak), + "through local var handle" + ); + assert_eq!( + cache.map.len(), + 1, + "this is an implementation detail but worth pointing out: we can't clear the cache from shutdown(), it's cleared on first access after" + ); + assert_eq!( + (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), + (3, 1), + "strong: handleinner(via handle), shard0, mgr; weak: myself" + ); + + // this handle is perfectly usable + handle.getpage(); + + cache + .get(timeline_id, ShardSelector::Page(key), &mgr) + .await + .err() + .expect("documented behavior: can't get new handle after shutdown, even if there is an alive Handle"); + assert_eq!( + cache.map.len(), + 0, + "first access after shutdown cleans up the Weak's from the cache" + ); + + tokio::select! { + _ = shard0.gate.close() => { + panic!("handle is keeping gate open"); + } + _ = tokio::time::sleep(FOREVER) => { } + } + + drop(handle); + assert_eq!( + 0, + Weak::strong_count(&handle_inner_weak), + "the HandleInner destructor already ran" + ); + assert_eq!( + (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), + (2, 1), + "strong: shard0, mgr; weak: myself" + ); + + // closing gate succeeds after dropping handle + tokio::select! { + _ = shard0.gate.close() => { } + _ = tokio::time::sleep(FOREVER) => { + panic!("handle is dropped, no other gate holders exist") + } + } + + // map gets cleaned on next lookup + cache + .get(timeline_id, ShardSelector::Page(key), &mgr) + .await + .err() + .expect("documented behavior: can't get new handle after shutdown"); + assert_eq!(cache.map.len(), 0); + + // ensure all refs to shard0 are gone and we're not leaking anything + let myself = Weak::clone(&shard0.myself); + drop(shard0); + drop(mgr); + assert_eq!(Weak::strong_count(&myself), 0); + } + + #[tokio::test] + async fn test_multiple_timelines_and_deletion() { + crate::tenant::harness::setup_logging(); + + let timeline_a = TimelineId::generate(); + let timeline_b = TimelineId::generate(); + assert_ne!(timeline_a, timeline_b); + let timeline_a = Arc::new_cyclic(|myself| StubTimeline { + gate: Default::default(), + id: timeline_a, + shard: ShardIdentity::unsharded(), + per_timeline_state: PerTimelineState::default(), + myself: myself.clone(), + }); + let timeline_b = Arc::new_cyclic(|myself| StubTimeline { + gate: Default::default(), + id: timeline_b, + shard: ShardIdentity::unsharded(), + per_timeline_state: PerTimelineState::default(), + myself: myself.clone(), + }); + let mut mgr = StubManager { + shards: vec![timeline_a.clone(), timeline_b.clone()], + }; + let key = DBDIR_KEY; + + let mut cache = Cache::::default(); + + cache + .get(timeline_a.id, ShardSelector::Page(key), &mgr) + .await + .expect("we have it"); + cache + .get(timeline_b.id, ShardSelector::Page(key), &mgr) + .await + .expect("we have it"); + assert_eq!(cache.map.len(), 2); + + // delete timeline A + timeline_a.per_timeline_state.shutdown(); + mgr.shards.retain(|t| t.id != timeline_a.id); + assert!( + mgr.resolve(timeline_a.id, 
ShardSelector::Page(key)) + .await + .is_err(), + "broken StubManager implementation" + ); + + assert_eq!( + cache.map.len(), + 2, + "cache still has a Weak handle to Timeline A" + ); + cache + .get(timeline_a.id, ShardSelector::Page(key), &mgr) + .await + .err() + .expect("documented behavior: can't get new handle after shutdown"); + assert_eq!(cache.map.len(), 1, "next access cleans up the cache"); + + cache + .get(timeline_b.id, ShardSelector::Page(key), &mgr) + .await + .expect("we still have it"); + } + + fn make_relation_key_for_shard(shard: ShardNumber, params: &ShardParameters) -> Key { + rel_block_to_key( + RelTag { + spcnode: 1663, + dbnode: 208101, + relnode: 2620, + forknum: 0, + }, + shard.0 as u32 * params.stripe_size.0, + ) + } + + #[tokio::test(start_paused = true)] + async fn test_shard_split() { + crate::tenant::harness::setup_logging(); + let timeline_id = TimelineId::generate(); + let parent = Arc::new_cyclic(|myself| StubTimeline { + gate: Default::default(), + id: timeline_id, + shard: ShardIdentity::unsharded(), + per_timeline_state: PerTimelineState::default(), + myself: myself.clone(), + }); + let child_params = ShardParameters { + count: ShardCount(2), + stripe_size: ShardStripeSize::default(), + }; + let child0 = Arc::new_cyclic(|myself| StubTimeline { + gate: Default::default(), + id: timeline_id, + shard: ShardIdentity::from_params(ShardNumber(0), &child_params), + per_timeline_state: PerTimelineState::default(), + myself: myself.clone(), + }); + let child1 = Arc::new_cyclic(|myself| StubTimeline { + gate: Default::default(), + id: timeline_id, + shard: ShardIdentity::from_params(ShardNumber(1), &child_params), + per_timeline_state: PerTimelineState::default(), + myself: myself.clone(), + }); + let child_shards_by_shard_number = [child0.clone(), child1.clone()]; + + let mut cache = Cache::::default(); + + // fill the cache with the parent + for i in 0..2 { + let handle = cache + .get( + timeline_id, + ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)), + &StubManager { + shards: vec![parent.clone()], + }, + ) + .await + .expect("we have it"); + assert!( + Weak::ptr_eq(&handle.myself, &parent.myself), + "mgr returns parent first" + ); + drop(handle); + } + + // + // SHARD SPLIT: tenant manager changes, but the cache isn't informed + // + + // while we haven't shut down the parent, the cache will return the cached parent, even + // if the tenant manager returns the child + for i in 0..2 { + let handle = cache + .get( + timeline_id, + ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)), + &StubManager { + shards: vec![], // doesn't matter what's in here, the cache is fully loaded + }, + ) + .await + .expect("we have it"); + assert!( + Weak::ptr_eq(&handle.myself, &parent.myself), + "mgr returns parent" + ); + drop(handle); + } + + let parent_handle = cache + .get( + timeline_id, + ShardSelector::Page(make_relation_key_for_shard(ShardNumber(0), &child_params)), + &StubManager { + shards: vec![parent.clone()], + }, + ) + .await + .expect("we have it"); + assert!(Weak::ptr_eq(&parent_handle.myself, &parent.myself)); + + // invalidate the cache + parent.per_timeline_state.shutdown(); + + // the cache will now return the child, even though the parent handle still exists + for i in 0..2 { + let handle = cache + .get( + timeline_id, + ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)), + &StubManager { + shards: vec![child0.clone(), child1.clone()], // <====== this changed compared to 
previous loop + }, + ) + .await + .expect("we have it"); + assert!( + Weak::ptr_eq( + &handle.myself, + &child_shards_by_shard_number[i as usize].myself + ), + "mgr returns child" + ); + drop(handle); + } + + // all the while the parent handle kept the parent gate open + tokio::select! { + _ = parent_handle.gate.close() => { + panic!("parent handle is keeping gate open"); + } + _ = tokio::time::sleep(FOREVER) => { } + } + drop(parent_handle); + tokio::select! { + _ = parent.gate.close() => { } + _ = tokio::time::sleep(FOREVER) => { + panic!("parent handle is dropped, no other gate holders exist") + } + } + } + + #[tokio::test(start_paused = true)] + async fn test_connection_handler_exit() { + crate::tenant::harness::setup_logging(); + let timeline_id = TimelineId::generate(); + let shard0 = Arc::new_cyclic(|myself| StubTimeline { + gate: Default::default(), + id: timeline_id, + shard: ShardIdentity::unsharded(), + per_timeline_state: PerTimelineState::default(), + myself: myself.clone(), + }); + let mgr = StubManager { + shards: vec![shard0.clone()], + }; + let key = DBDIR_KEY; + + // Simulate 10 connections that's opened, used, and closed + let mut used_handles = vec![]; + for _ in 0..10 { + let mut cache = Cache::::default(); + let handle = { + let handle = cache + .get(timeline_id, ShardSelector::Page(key), &mgr) + .await + .expect("we have the timeline"); + assert!(Weak::ptr_eq(&handle.myself, &shard0.myself)); + handle + }; + handle.getpage(); + used_handles.push(Arc::downgrade(&handle.0)); + } + + // No handles exist, thus gates are closed and don't require shutdown + assert!(used_handles + .iter() + .all(|weak| Weak::strong_count(weak) == 0)); + + // ... thus the gate should close immediately, even without shutdown + tokio::select! { + _ = shard0.gate.close() => { } + _ = tokio::time::sleep(FOREVER) => { + panic!("handle is dropped, no other gate holders exist") + } + } + } +} From 5e0409de95ed1d19ffdb36c31b12792c49938635 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Thu, 25 Jul 2024 15:45:15 -0500 Subject: [PATCH 1313/1571] Fix negative replication delay metric In some cases, we can get a negative metric for replication_delay_bytes. My best guess from all the research I've done is that we evaluate pg_last_wal_receive_lsn() before pg_last_wal_replay_lsn(), and that by the time everything is said and done, the replay LSN has advanced past the receive LSN. In this case, our lag can effectively be modeled as 0 due to the speed of the WAL reception and replay. --- vm-image-spec.yaml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 2767710bad..7d005c7139 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -277,8 +277,12 @@ files: help: 'Bytes between received and replayed LSN' key_labels: values: [replication_delay_bytes] + # We use a GREATEST call here because this calculation can be negative. + # The calculation is not atomic, meaning after we've gotten the receive + # LSN, the replay LSN may have advanced past the receive LSN we + # are using for the calculation. 
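      # Editor's note, with made-up numbers for illustration: if the receive LSN
      # is read as 0/16B3748 and replay then advances so that
      # pg_last_wal_replay_lsn() returns 0/16B3750, the raw diff would be
      # -8 bytes; GREATEST(0, ...) clamps that transient negative value to 0.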
query: | - SELECT pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn()) AS replication_delay_bytes; + SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes; - metric_name: replication_delay_seconds type: gauge From ff51b565d3312ae471c9738e54e40f8617029e90 Mon Sep 17 00:00:00 2001 From: Cihan Demirci <128653800+fcdm@users.noreply.github.com> Date: Wed, 31 Jul 2024 19:42:10 +0300 Subject: [PATCH 1314/1571] cicd: change Azure storage details [2/2] (#8562) Change Azure storage configuration to point to updated variables/secrets. Also update subscription id variable. --- .github/actionlint.yml | 1 - .github/workflows/_build-and-test-locally.yml | 6 +++--- .github/workflows/build_and_test.yml | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/actionlint.yml b/.github/actionlint.yml index f086008d34..37983798b7 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -9,6 +9,5 @@ self-hosted-runner: - us-east-2 config-variables: - REMOTE_STORAGE_AZURE_CONTAINER - - REMOTE_STORAGE_AZURE_CONTAINER_NEW - REMOTE_STORAGE_AZURE_REGION - SLACK_UPCOMING_RELEASE_CHANNEL_ID diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 182e96a8ca..a0ed169024 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -223,9 +223,9 @@ jobs: # Run separate tests for real Azure Blob Storage # XXX: replace region with `eu-central-1`-like region export ENABLE_REAL_AZURE_REMOTE_STORAGE=y - export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV_NEW }}" - export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV_NEW }}" - export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER_NEW }}" + export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}" + export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}" + export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}" export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}" ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_azure)' diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index c4df98f585..50006dd3d4 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -871,7 +871,7 @@ jobs: with: client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }} tenant-id: ${{ secrets.AZURE_TENANT_ID }} - subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }} - name: Login to ACR if: github.ref_name == 'main' From 2f9ada13c42253084134a4f50a8b172122e7b569 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 31 Jul 2024 18:37:47 +0100 Subject: [PATCH 1315/1571] controller: simplify reconciler generation increment logic (#8560) ## Problem This code was confusing, untested and covered: - an impossible case, where intent state is AttacheStale (we never do this) - a rare edge case (going from AttachedMulti to Attached), which we were not testing, and in any case the pageserver internally does the same Tenant reset in this transition as it would do if we incremented generation. 
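As a rough illustration (an editor's sketch with hypothetical minimal types, not code from this PR; the real change is the `reconciler.rs` hunk below), the simplified rule reduces to:

```rust
#[derive(PartialEq)]
enum Mode {
    AttachedSingle,
    AttachedMulti,
    AttachedStale,
}

struct Conf {
    generation: u32,
    mode: Mode,
}

// Skip the generation increment only when the observed location already has the
// wanted generation and the exact same attachment mode; everything else takes
// the universally safe path of incrementing.
fn increment_generation(observed: Option<&Conf>, wanted: &Conf) -> bool {
    match observed {
        None => true,
        Some(o) => o.generation != wanted.generation || o.mode != wanted.mode,
    }
}
```

Incrementing the generation is always safe; skipping it is purely an optimization for locations that already match.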
Closes: https://github.com/neondatabase/neon/issues/8367 ## Summary of changes - Simplify the logic to only skip incrementing the generation if the location already has the expected generation and the exact same mode. --- storage_controller/src/reconciler.rs | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 12dea2c7ef..254fdb364e 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -656,11 +656,8 @@ impl Reconciler { // reconcile this location. This includes locations with different configurations, as well // as locations with unknown (None) observed state. - // The general case is to increment the generation. However, there are cases - // where this is not necessary: - // - if we are only updating the TenantConf part of the location - // - if we are only changing the attachment mode (e.g. going to attachedmulti or attachedstale) - // and the location was already in the correct generation + // Incrementing generation is the safe general case, but is inefficient for changes that only + // modify some details (e.g. the tenant's config). let increment_generation = match observed { None => true, Some(ObservedStateLocation { conf: None }) => true, @@ -669,18 +666,11 @@ impl Reconciler { }) => { let generations_match = observed.generation == wanted_conf.generation; - use LocationConfigMode::*; - let mode_transition_requires_gen_inc = - match (observed.mode, wanted_conf.mode) { - // Usually the short-lived attachment modes (multi and stale) are only used - // in the case of [`Self::live_migrate`], but it is simple to handle them correctly - // here too. Locations are allowed to go Single->Stale and Multi->Single within the same generation. - (AttachedSingle, AttachedStale) => false, - (AttachedMulti, AttachedSingle) => false, - (lhs, rhs) => lhs != rhs, - }; - - !generations_match || mode_transition_requires_gen_inc + // We may skip incrementing the generation if the location is already in the expected mode and + // generation. In principle it would also be safe to skip from certain other modes (e.g. AttachedStale), + // but such states are handled inside `live_migrate`, and if we see that state here we're cleaning up + // after a restart/crash, so fall back to the universally safe path of incrementing generation. + !generations_match || (observed.mode != wanted_conf.mode) } }; From 939d50a41c1b9819daff6b6300fdb0d76b772acb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 31 Jul 2024 20:24:42 +0200 Subject: [PATCH 1316/1571] storage_scrubber: migrate FindGarbage to remote_storage (#8548) Uses the newly added APIs from #8541 named `stream_tenants_generic` and `stream_objects_with_retries` and extends them with `list_objects_with_retries_generic` and `stream_tenant_timelines_generic` to migrate the `find-garbage` command of the scrubber to `GenericRemoteStorage`. 
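In rough outline, abridged and lightly reformatted from the `garbage.rs` hunk below (`active_tenants`, `target` and `S3_CONCURRENCY` come from the surrounding function), the garbage scan now runs entirely through `GenericRemoteStorage`:

```rust
// Build a generic remote-storage client instead of a raw S3 client.
let (remote_client, target) = init_remote_generic(bucket_config.clone(), node_kind).await?;

// Enumerate tenants, then fan out over their timelines with bounded concurrency.
let tenants = stream_tenants_generic(&remote_client, &target);
let timelines = active_tenants
    .map_ok(|t| stream_tenant_timelines_generic(&remote_client, &target, *t))
    .try_buffer_unordered(S3_CONCURRENCY)
    .try_flatten();
```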
Part of https://github.com/neondatabase/neon/issues/7547 --- libs/remote_storage/src/lib.rs | 1 + storage_scrubber/src/garbage.rs | 50 ++++++++++------------ storage_scrubber/src/lib.rs | 40 +++++++++++++++++ storage_scrubber/src/metadata_stream.rs | 57 +++++++++++++++++++++++++ 4 files changed, 121 insertions(+), 27 deletions(-) diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 794e696769..2c9e298f79 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -144,6 +144,7 @@ impl RemotePath { /// /// The WithDelimiter mode will populate `prefixes` and `keys` in the result. The /// NoDelimiter mode will only populate `keys`. +#[derive(Copy, Clone)] pub enum ListingMode { WithDelimiter, NoDelimiter, diff --git a/storage_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs index 73479c3658..d6a73bf366 100644 --- a/storage_scrubber/src/garbage.rs +++ b/storage_scrubber/src/garbage.rs @@ -19,8 +19,8 @@ use utils::id::TenantId; use crate::{ cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData}, - init_remote, init_remote_generic, list_objects_with_retries, - metadata_stream::{stream_tenant_timelines, stream_tenants}, + init_remote_generic, list_objects_with_retries_generic, + metadata_stream::{stream_tenant_timelines_generic, stream_tenants_generic}, BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth, }; @@ -153,7 +153,7 @@ async fn find_garbage_inner( node_kind: NodeKind, ) -> anyhow::Result { // Construct clients for S3 and for Console API - let (s3_client, target) = init_remote(bucket_config.clone(), node_kind).await?; + let (remote_client, target) = init_remote_generic(bucket_config.clone(), node_kind).await?; let cloud_admin_api_client = Arc::new(CloudAdminApiClient::new(console_config)); // Build a set of console-known tenants, for quickly eliminating known-active tenants without having @@ -179,7 +179,7 @@ async fn find_garbage_inner( // Enumerate Tenants in S3, and check if each one exists in Console tracing::info!("Finding all tenants in bucket {}...", bucket_config.bucket); - let tenants = stream_tenants(&s3_client, &target); + let tenants = stream_tenants_generic(&remote_client, &target); let tenants_checked = tenants.map_ok(|t| { let api_client = cloud_admin_api_client.clone(); let console_cache = console_cache.clone(); @@ -237,25 +237,26 @@ async fn find_garbage_inner( // Special case: If it's missing in console, check for known bugs that would enable us to conclusively // identify it as purge-able anyway if console_result.is_none() { - let timelines = stream_tenant_timelines(&s3_client, &target, tenant_shard_id) - .await? - .collect::>() - .await; + let timelines = + stream_tenant_timelines_generic(&remote_client, &target, tenant_shard_id) + .await? 
+ .collect::>() + .await; if timelines.is_empty() { // No timelines, but a heatmap: the deletion bug where we deleted everything but heatmaps - let tenant_objects = list_objects_with_retries( - &s3_client, + let tenant_objects = list_objects_with_retries_generic( + &remote_client, + ListingMode::WithDelimiter, &target.tenant_root(&tenant_shard_id), - None, ) .await?; - let object = tenant_objects.contents.as_ref().unwrap().first().unwrap(); - if object.key.as_ref().unwrap().ends_with("heatmap-v1.json") { + let object = tenant_objects.keys.first().unwrap(); + if object.key.get_path().as_str().ends_with("heatmap-v1.json") { tracing::info!("Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)"); garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id)); continue; } else { - tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key.as_ref().unwrap()); + tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key); } } else { // A console-unknown tenant with timelines: check if these timelines only contain initdb.tar.zst, from the initial @@ -264,24 +265,18 @@ async fn find_garbage_inner( for timeline_r in timelines { let timeline = timeline_r?; - let timeline_objects = list_objects_with_retries( - &s3_client, + let timeline_objects = list_objects_with_retries_generic( + &remote_client, + ListingMode::WithDelimiter, &target.timeline_root(&timeline), - None, ) .await?; - if timeline_objects - .common_prefixes - .as_ref() - .map(|v| v.len()) - .unwrap_or(0) - > 0 - { + if !timeline_objects.prefixes.is_empty() { // Sub-paths? Unexpected any_non_initdb = true; } else { - let object = timeline_objects.contents.as_ref().unwrap().first().unwrap(); - if object.key.as_ref().unwrap().ends_with("initdb.tar.zst") { + let object = timeline_objects.keys.first().unwrap(); + if object.key.get_path().as_str().ends_with("initdb.tar.zst") { tracing::info!("Timeline {timeline} contains only initdb.tar.zst"); } else { any_non_initdb = true; @@ -336,7 +331,8 @@ async fn find_garbage_inner( // Construct a stream of all timelines within active tenants let active_tenants = tokio_stream::iter(active_tenants.iter().map(Ok)); - let timelines = active_tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, *t)); + let timelines = + active_tenants.map_ok(|t| stream_tenant_timelines_generic(&remote_client, &target, *t)); let timelines = timelines.try_buffer_unordered(S3_CONCURRENCY); let timelines = timelines.try_flatten(); diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index e0f154def3..152319b731 100644 --- a/storage_scrubber/src/lib.rs +++ b/storage_scrubber/src/lib.rs @@ -427,6 +427,7 @@ async fn list_objects_with_retries( Err(anyhow!("unreachable unless MAX_RETRIES==0")) } +/// Listing possibly large amounts of keys in a streaming fashion. fn stream_objects_with_retries<'a>( storage_client: &'a GenericRemoteStorage, listing_mode: ListingMode, @@ -465,6 +466,45 @@ fn stream_objects_with_retries<'a>( } } +/// If you want to list a bounded amount of prefixes or keys. For larger numbers of keys/prefixes, +/// use [`stream_objects_with_retries`] instead. 
+async fn list_objects_with_retries_generic( + remote_client: &GenericRemoteStorage, + listing_mode: ListingMode, + s3_target: &S3Target, +) -> anyhow::Result { + let cancel = CancellationToken::new(); + let prefix_str = &s3_target + .prefix_in_bucket + .strip_prefix("/") + .unwrap_or(&s3_target.prefix_in_bucket); + let prefix = RemotePath::from_string(prefix_str)?; + for trial in 0..MAX_RETRIES { + match remote_client + .list(Some(&prefix), listing_mode, None, &cancel) + .await + { + Ok(response) => return Ok(response), + Err(e) => { + if trial == MAX_RETRIES - 1 { + return Err(e) + .with_context(|| format!("Failed to list objects {MAX_RETRIES} times")); + } + error!( + "list_objects_v2 query failed: bucket_name={}, prefix={}, delimiter={}, error={}", + s3_target.bucket_name, + s3_target.prefix_in_bucket, + s3_target.delimiter, + DisplayErrorContext(e), + ); + let backoff_time = 1 << trial.max(5); + tokio::time::sleep(Duration::from_secs(backoff_time)).await; + } + } + } + panic!("MAX_RETRIES is not allowed to be 0"); +} + async fn download_object_with_retries( s3_client: &Client, bucket_name: &str, diff --git a/storage_scrubber/src/metadata_stream.rs b/storage_scrubber/src/metadata_stream.rs index 91dba3c992..c702c0c312 100644 --- a/storage_scrubber/src/metadata_stream.rs +++ b/storage_scrubber/src/metadata_stream.rs @@ -189,6 +189,63 @@ pub async fn stream_tenant_timelines<'a>( }) } +/// Given a `TenantShardId`, output a stream of the timelines within that tenant, discovered +/// using a listing. The listing is done before the stream is built, so that this +/// function can be used to generate concurrency on a stream using buffer_unordered. +pub async fn stream_tenant_timelines_generic<'a>( + remote_client: &'a GenericRemoteStorage, + target: &'a RootTarget, + tenant: TenantShardId, +) -> anyhow::Result> + 'a> { + let mut timeline_ids: Vec> = Vec::new(); + let timelines_target = target.timelines_root(&tenant); + + let mut objects_stream = std::pin::pin!(stream_objects_with_retries( + remote_client, + ListingMode::WithDelimiter, + &timelines_target + )); + loop { + tracing::debug!("Listing in {tenant}"); + let fetch_response = match objects_stream.next().await { + None => break, + Some(Err(e)) => { + timeline_ids.push(Err(e)); + break; + } + Some(Ok(r)) => r, + }; + + let new_entry_ids = fetch_response + .prefixes + .iter() + .filter_map(|prefix| -> Option<&str> { + prefix + .get_path() + .as_str() + .strip_prefix(&timelines_target.prefix_in_bucket)? + .strip_suffix('/') + }) + .map(|entry_id_str| { + entry_id_str + .parse::() + .with_context(|| format!("Incorrect entry id str: {entry_id_str}")) + }); + + for i in new_entry_ids { + timeline_ids.push(i); + } + } + + tracing::debug!("Yielding for {}", tenant); + Ok(stream! 
{ + for i in timeline_ids { + let id = i?; + yield Ok(TenantShardTimelineId::new(tenant, id)); + } + }) +} + pub(crate) fn stream_listing<'a>( s3_client: &'a Client, target: &'a S3Target, From 3350daeb9af9b2abbffea0c1496c1d19ca9f8721 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 31 Jul 2024 19:47:59 +0100 Subject: [PATCH 1317/1571] CI(create-test-report): fix missing benchmark results in Allure report (#8540) ## Problem In https://github.com/neondatabase/neon/pull/8241 I've accidentally removed `create-test-report` dependency on `benchmarks` job ## Summary of changes - Run `create-test-report` after `benchmarks` job --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 50006dd3d4..c7ae2aedd4 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -309,7 +309,7 @@ jobs: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} create-test-report: - needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image ] + needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image, benchmarks ] if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }} outputs: report-url: ${{ steps.create-allure-report.outputs.report-url }} From d6c79b77dfecd638e064fd2aed938318a508b3c0 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." Date: Wed, 31 Jul 2024 17:55:19 -0400 Subject: [PATCH 1318/1571] test(pageserver): add test_gc_feedback_with_snapshots (#8474) should be working after https://github.com/neondatabase/neon/pull/8328 gets merged. Part of https://github.com/neondatabase/neon/issues/8002 adds a new perf benchmark case that ensures garbages can be collected with branches --------- Signed-off-by: Alex Chi Z --- scripts/benchmark_durations.py | 1 + test_runner/performance/test_gc_feedback.py | 54 +++++++++++++++------ 2 files changed, 40 insertions(+), 15 deletions(-) diff --git a/scripts/benchmark_durations.py b/scripts/benchmark_durations.py index 01f34a1b96..4ca433679a 100755 --- a/scripts/benchmark_durations.py +++ b/scripts/benchmark_durations.py @@ -67,6 +67,7 @@ FALLBACK_DURATION = { "test_runner/performance/test_copy.py::test_copy[neon]": 13.817, "test_runner/performance/test_copy.py::test_copy[vanilla]": 11.736, "test_runner/performance/test_gc_feedback.py::test_gc_feedback": 575.735, + "test_runner/performance/test_gc_feedback.py::test_gc_feedback_with_snapshots": 575.735, "test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 14.868, "test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 14.393, "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 20.588, diff --git a/test_runner/performance/test_gc_feedback.py b/test_runner/performance/test_gc_feedback.py index 4c326111c2..9861259c16 100644 --- a/test_runner/performance/test_gc_feedback.py +++ b/test_runner/performance/test_gc_feedback.py @@ -6,21 +6,8 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder -@pytest.mark.timeout(10000) -def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker): - """ - Test that GC is able to collect all old layers even if them are forming - "stairs" and there are not three delta layers since last image layer. 
- - Information about image layers needed to collect old layers should - be propagated by GC to compaction task which should take in in account - when make a decision which new image layers needs to be created. - - NB: this test demonstrates the problem. The source tree contained the - `gc_feedback` mechanism for about 9 months, but, there were problems - with it and it wasn't enabled at runtime. - This PR removed the code: https://github.com/neondatabase/neon/pull/6863 - """ +def gc_feedback_impl(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, mode: str): + assert mode == "normal" or mode == "with_snapshots" env = neon_env_builder.init_start() client = env.pageserver.http_client() @@ -74,6 +61,9 @@ def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma physical_size = client.timeline_detail(tenant_id, timeline_id)["current_physical_size"] log.info(f"Physical storage size {physical_size}") + if mode == "with_snapshots": + if step == n_steps / 2: + env.neon_cli.create_branch("child") max_num_of_deltas_above_image = 0 max_total_num_of_deltas = 0 @@ -149,3 +139,37 @@ def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma log.info(f"Writing layer map to {layer_map_path}") with layer_map_path.open("w") as f: f.write(json.dumps(client.timeline_layer_map_info(tenant_id, timeline_id))) + + +@pytest.mark.timeout(10000) +def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker): + """ + Test that GC is able to collect all old layers even if them are forming + "stairs" and there are not three delta layers since last image layer. + + Information about image layers needed to collect old layers should + be propagated by GC to compaction task which should take in in account + when make a decision which new image layers needs to be created. + + NB: this test demonstrates the problem. The source tree contained the + `gc_feedback` mechanism for about 9 months, but, there were problems + with it and it wasn't enabled at runtime. + This PR removed the code: https://github.com/neondatabase/neon/pull/6863 + + And the bottom-most GC-compaction epic resolves the problem. + https://github.com/neondatabase/neon/issues/8002 + """ + gc_feedback_impl(neon_env_builder, zenbenchmark, "normal") + + +@pytest.mark.timeout(10000) +def test_gc_feedback_with_snapshots( + neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker +): + """ + Compared with `test_gc_feedback`, we create a branch without written data (=snapshot) in the middle + of the benchmark, and the bottom-most compaction should collect as much garbage as possible below the GC + horizon. Ideally, there should be images (in an image layer) covering the full range at the branch point, + and images covering the full key range (in a delta layer) at the GC horizon. + """ + gc_feedback_impl(neon_env_builder, zenbenchmark, "with_snapshots") From 980d506bdaba05955e3d9316d9d385228a16f39f Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 1 Aug 2024 07:57:09 +0200 Subject: [PATCH 1319/1571] pageserver: shutdown all walredo managers 8s into shutdown (#8572) # Motivation The working theory for hung systemd during PS deploy (https://github.com/neondatabase/cloud/issues/11387) is that leftover walredo processes trigger a race condition. In https://github.com/neondatabase/neon/pull/8150 I arranged that a clean Tenant shutdown does actually kill its walredo processes. 
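
(Illustration only, not code from this PR: the invariant the clean-shutdown path relies on is that the pageserver itself kill()s and wait()s its walredo children, so no leftover process is ever handed to systemd to reap. A minimal sketch of that ownership pattern, with hypothetical names:)

```rust
use std::process::{Child, Command};

/// Illustrative owner of a redo child process: dropping the owner must reap the child.
struct RedoProcess {
    child: Child,
}

impl RedoProcess {
    fn spawn(executable: &str) -> std::io::Result<Self> {
        Ok(Self {
            child: Command::new(executable).spawn()?,
        })
    }
}

impl Drop for RedoProcess {
    fn drop(&mut self) {
        // The child may already have exited; ignore the error in that case.
        let _ = self.child.kill();
        // wait() so the kernel releases the zombie entry. Without this, the
        // process would outlive us and be left for the init system to deal with.
        let _ = self.child.wait();
    }
}
```
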
But many prod machines don't manage to shut down all their tenants before the
10s systemd timeout hits and, presumably, triggers the race condition in
systemd / the Linux kernel that leaves systemd frozen.

# Solution

This PR bolts on a rather ugly mechanism to shut down the tenants' walredo
managers out of order, 8s after we've received the SIGTERM from systemd.

# Changes

- add a global registry of `Weak<WalRedoManager>`
- add a special thread spawned during `shutdown_pageserver` that sleeps for 8s,
  then shuts down all redo managers in the registry and prevents new redo
  managers from being created
- propagate the new failure mode of tenant spawning throughout the code base
- make sure a shut-down walredo manager results in `PageReconstructError::Cancelled`
  so that if `Timeline::get` calls come in after the shutdown, they do the right
  thing

---
 pageserver/src/lib.rs             | 83 ++++++++++++++++++++++++++++++-
 pageserver/src/tenant.rs          | 82 ++++++++++++++++++++++++------
 pageserver/src/tenant/mgr.rs      | 38 ++++++++------
 pageserver/src/tenant/timeline.rs | 22 +++++---
 pageserver/src/walredo.rs         | 29 ++++++++---
 5 files changed, 206 insertions(+), 48 deletions(-)

diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index f729cad3c3..5aee13cfc6 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -12,6 +12,8 @@ pub mod disk_usage_eviction_task;
 pub mod http;
 pub mod import_datadir;
 pub mod l0_flush;
+
+use futures::{stream::FuturesUnordered, StreamExt};
 pub use pageserver_api::keyspace;
 use tokio_util::sync::CancellationToken;
 pub mod aux_file;
@@ -36,7 +38,7 @@ use tenant::{
     mgr::{BackgroundPurges, TenantManager},
     secondary,
 };
-use tracing::info;
+use tracing::{info, info_span};
 
 /// Current storage format version
 ///
@@ -85,6 +87,79 @@ pub async fn shutdown_pageserver(
     exit_code: i32,
 ) {
     use std::time::Duration;
+
+    // If the orderly shutdown below takes too long, we still want to make
+    // sure that all walredo processes are killed and wait()ed on by us, not systemd.
+    //
+    // (Leftover walredo processes are the hypothesized trigger for the systemd freezes
+    // that we keep seeing in prod => https://github.com/neondatabase/cloud/issues/11387.)
+    //
+    // We use a thread instead of a tokio task because the background runtime is likely busy
+    // with the final flushing / uploads. This activity here has priority, and due to lack
+    // of scheduling priority features in the tokio scheduler, using a separate thread is
+    // an effective priority booster.
+ let walredo_extraordinary_shutdown_thread_span = { + let span = info_span!(parent: None, "walredo_extraordinary_shutdown_thread"); + span.follows_from(tracing::Span::current()); + span + }; + let walredo_extraordinary_shutdown_thread_cancel = CancellationToken::new(); + let walredo_extraordinary_shutdown_thread = std::thread::spawn({ + let walredo_extraordinary_shutdown_thread_cancel = + walredo_extraordinary_shutdown_thread_cancel.clone(); + move || { + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + let _entered = rt.enter(); + let _entered = walredo_extraordinary_shutdown_thread_span.enter(); + if let Ok(()) = rt.block_on(tokio::time::timeout( + Duration::from_secs(8), + walredo_extraordinary_shutdown_thread_cancel.cancelled(), + )) { + info!("cancellation requested"); + return; + } + let managers = tenant::WALREDO_MANAGERS + .lock() + .unwrap() + // prevents new walredo managers from being inserted + .take() + .expect("only we take()"); + // Use FuturesUnordered to get in queue early for each manager's + // heavier_once_cell semaphore wait list. + // Also, for idle tenants that for some reason haven't + // shut down yet, it's quite likely that we're not going + // to get Poll::Pending once. + let mut futs: FuturesUnordered<_> = managers + .into_iter() + .filter_map(|(_, mgr)| mgr.upgrade()) + .map(|mgr| async move { tokio::task::unconstrained(mgr.shutdown()).await }) + .collect(); + info!(count=%futs.len(), "built FuturesUnordered"); + let mut last_log_at = std::time::Instant::now(); + #[derive(Debug, Default)] + struct Results { + initiated: u64, + already: u64, + } + let mut results = Results::default(); + while let Some(we_initiated) = rt.block_on(futs.next()) { + if we_initiated { + results.initiated += 1; + } else { + results.already += 1; + } + if last_log_at.elapsed() > Duration::from_millis(100) { + info!(remaining=%futs.len(), ?results, "progress"); + last_log_at = std::time::Instant::now(); + } + } + info!(?results, "done"); + } + }); + // Shut down the libpq endpoint task. This prevents new connections from // being accepted. 
let remaining_connections = timed( @@ -160,6 +235,12 @@ pub async fn shutdown_pageserver( Duration::from_secs(1), ) .await; + + info!("cancel & join walredo_extraordinary_shutdown_thread"); + walredo_extraordinary_shutdown_thread_cancel.cancel(); + walredo_extraordinary_shutdown_thread.join().unwrap(); + info!("walredo_extraordinary_shutdown_thread done"); + info!("Shut down successfully completed"); std::process::exit(exit_code); } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 5d0e963b4e..0f09241d22 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -33,6 +33,7 @@ use remote_storage::GenericRemoteStorage; use remote_storage::TimeoutOrCancel; use std::collections::BTreeMap; use std::fmt; +use std::sync::Weak; use std::time::SystemTime; use storage_broker::BrokerClientChannel; use tokio::io::BufReader; @@ -312,14 +313,66 @@ impl std::fmt::Debug for Tenant { } pub(crate) enum WalRedoManager { - Prod(PostgresRedoManager), + Prod(WalredoManagerId, PostgresRedoManager), #[cfg(test)] Test(harness::TestRedoManager), } -impl From for WalRedoManager { - fn from(mgr: PostgresRedoManager) -> Self { - Self::Prod(mgr) +#[derive(thiserror::Error, Debug)] +#[error("pageserver is shutting down")] +pub(crate) struct GlobalShutDown; + +impl WalRedoManager { + pub(crate) fn new(mgr: PostgresRedoManager) -> Result, GlobalShutDown> { + let id = WalredoManagerId::next(); + let arc = Arc::new(Self::Prod(id, mgr)); + let mut guard = WALREDO_MANAGERS.lock().unwrap(); + match &mut *guard { + Some(map) => { + map.insert(id, Arc::downgrade(&arc)); + Ok(arc) + } + None => Err(GlobalShutDown), + } + } +} + +impl Drop for WalRedoManager { + fn drop(&mut self) { + match self { + Self::Prod(id, _) => { + let mut guard = WALREDO_MANAGERS.lock().unwrap(); + if let Some(map) = &mut *guard { + map.remove(id).expect("new() registers, drop() unregisters"); + } + } + #[cfg(test)] + Self::Test(_) => { + // Not applicable to test redo manager + } + } + } +} + +/// Global registry of all walredo managers so that [`crate::shutdown_pageserver`] can shut down +/// the walredo processes outside of the regular order. 
+/// +/// This is necessary to work around a systemd bug where it freezes if there are +/// walredo processes left => +#[allow(clippy::type_complexity)] +pub(crate) static WALREDO_MANAGERS: once_cell::sync::Lazy< + Mutex>>>, +> = once_cell::sync::Lazy::new(|| Mutex::new(Some(HashMap::new()))); +#[derive(PartialEq, Eq, Hash, Clone, Copy, Debug)] +pub(crate) struct WalredoManagerId(u64); +impl WalredoManagerId { + pub fn next() -> Self { + static NEXT: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(1); + let id = NEXT.fetch_add(1, std::sync::atomic::Ordering::Relaxed); + if id == 0 { + panic!("WalredoManagerId::new() returned 0, indicating wraparound, risking it's no longer unique"); + } + Self(id) } } @@ -331,19 +384,20 @@ impl From for WalRedoManager { } impl WalRedoManager { - pub(crate) async fn shutdown(&self) { + pub(crate) async fn shutdown(&self) -> bool { match self { - Self::Prod(mgr) => mgr.shutdown().await, + Self::Prod(_, mgr) => mgr.shutdown().await, #[cfg(test)] Self::Test(_) => { // Not applicable to test redo manager + true } } } pub(crate) fn maybe_quiesce(&self, idle_timeout: Duration) { match self { - Self::Prod(mgr) => mgr.maybe_quiesce(idle_timeout), + Self::Prod(_, mgr) => mgr.maybe_quiesce(idle_timeout), #[cfg(test)] Self::Test(_) => { // Not applicable to test redo manager @@ -363,7 +417,7 @@ impl WalRedoManager { pg_version: u32, ) -> Result { match self { - Self::Prod(mgr) => { + Self::Prod(_, mgr) => { mgr.request_redo(key, lsn, base_img, records, pg_version) .await } @@ -377,7 +431,7 @@ impl WalRedoManager { pub(crate) fn status(&self) -> Option { match self { - WalRedoManager::Prod(m) => Some(m.status()), + WalRedoManager::Prod(_, m) => Some(m.status()), #[cfg(test)] WalRedoManager::Test(_) => None, } @@ -677,11 +731,9 @@ impl Tenant { init_order: Option, mode: SpawnMode, ctx: &RequestContext, - ) -> Arc { - let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new( - conf, - tenant_shard_id, - ))); + ) -> Result, GlobalShutDown> { + let wal_redo_manager = + WalRedoManager::new(PostgresRedoManager::new(conf, tenant_shard_id))?; let TenantSharedResources { broker_client, @@ -880,7 +932,7 @@ impl Tenant { } .instrument(tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), gen=?generation)), ); - tenant + Ok(tenant) } #[instrument(skip_all)] diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 58f8990892..b5568d37b5 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -55,7 +55,7 @@ use utils::id::{TenantId, TimelineId}; use super::remote_timeline_client::remote_tenant_path; use super::secondary::SecondaryTenant; use super::timeline::detach_ancestor::PreparedTimelineDetach; -use super::TenantSharedResources; +use super::{GlobalShutDown, TenantSharedResources}; /// For a tenant that appears in TenantsMap, it may either be /// - `Attached`: has a full Tenant object, is elegible to service @@ -665,17 +665,20 @@ pub async fn init_tenant_mgr( let tenant_dir_path = conf.tenant_path(&tenant_shard_id); let shard_identity = location_conf.shard; let slot = match location_conf.mode { - LocationMode::Attached(attached_conf) => TenantSlot::Attached(tenant_spawn( - conf, - tenant_shard_id, - &tenant_dir_path, - resources.clone(), - AttachedTenantConf::new(location_conf.tenant_conf, attached_conf), - shard_identity, - Some(init_order.clone()), - SpawnMode::Lazy, - &ctx, - )), + LocationMode::Attached(attached_conf) => 
TenantSlot::Attached( + tenant_spawn( + conf, + tenant_shard_id, + &tenant_dir_path, + resources.clone(), + AttachedTenantConf::new(location_conf.tenant_conf, attached_conf), + shard_identity, + Some(init_order.clone()), + SpawnMode::Lazy, + &ctx, + ) + .expect("global shutdown during init_tenant_mgr cannot happen"), + ), LocationMode::Secondary(secondary_conf) => { info!( tenant_id = %tenant_shard_id.tenant_id, @@ -723,7 +726,7 @@ fn tenant_spawn( init_order: Option, mode: SpawnMode, ctx: &RequestContext, -) -> Arc { +) -> Result, GlobalShutDown> { // All these conditions should have been satisfied by our caller: the tenant dir exists, is a well formed // path, and contains a configuration file. Assertions that do synchronous I/O are limited to debug mode // to avoid impacting prod runtime performance. @@ -1190,7 +1193,10 @@ impl TenantManager { None, spawn_mode, ctx, - ); + ) + .map_err(|_: GlobalShutDown| { + UpsertLocationError::Unavailable(TenantMapError::ShuttingDown) + })?; TenantSlot::Attached(tenant) } @@ -1311,7 +1317,7 @@ impl TenantManager { None, SpawnMode::Eager, ctx, - ); + )?; slot_guard.upsert(TenantSlot::Attached(tenant))?; @@ -2045,7 +2051,7 @@ impl TenantManager { None, SpawnMode::Eager, ctx, - ); + )?; slot_guard.upsert(TenantSlot::Attached(tenant))?; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index ecae443079..3a7353c138 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -76,6 +76,7 @@ use crate::{ metadata::TimelineMetadata, storage_layer::PersistentLayerDesc, }, + walredo, }; use crate::{ context::{DownloadBehavior, RequestContext}, @@ -1000,7 +1001,10 @@ impl Timeline { .for_get_kind(GetKind::Singular) .observe(elapsed.as_secs_f64()); - if cfg!(feature = "testing") && res.is_err() { + if cfg!(feature = "testing") + && res.is_err() + && !matches!(res, Err(PageReconstructError::Cancelled)) + { // it can only be walredo issue use std::fmt::Write; @@ -5466,20 +5470,22 @@ impl Timeline { } else { trace!("found {} WAL records that will init the page for {} at {}, performing WAL redo", data.records.len(), key, request_lsn); }; - - let img = match self + let res = self .walredo_mgr .as_ref() .context("timeline has no walredo manager") .map_err(PageReconstructError::WalRedo)? .request_redo(key, request_lsn, data.img, data.records, self.pg_version) - .await - .context("reconstruct a page image") - { + .await; + let img = match res { Ok(img) => img, - Err(e) => return Err(PageReconstructError::WalRedo(e)), + Err(walredo::Error::Cancelled) => return Err(PageReconstructError::Cancelled), + Err(walredo::Error::Other(e)) => { + return Err(PageReconstructError::WalRedo( + e.context("reconstruct a page image"), + )) + } }; - Ok(img) } } diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 5095beefd7..770081b3b4 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -241,6 +241,9 @@ impl PostgresRedoManager { /// Shut down the WAL redo manager. /// + /// Returns `true` if this call was the one that initiated shutdown. + /// `true` may be observed by no caller if the first caller stops polling. + /// /// After this future completes /// - no redo process is running /// - no new redo process will be spawned @@ -250,22 +253,32 @@ impl PostgresRedoManager { /// # Cancel-Safety /// /// This method is cancellation-safe. 
- pub async fn shutdown(&self) { + pub async fn shutdown(&self) -> bool { // prevent new processes from being spawned - let permit = match self.redo_process.get_or_init_detached().await { + let maybe_permit = match self.redo_process.get_or_init_detached().await { Ok(guard) => { - let (proc, permit) = guard.take_and_deinit(); - drop(proc); // this just drops the Arc, its refcount may not be zero yet - permit + if matches!(&*guard, ProcessOnceCell::ManagerShutDown) { + None + } else { + let (proc, permit) = guard.take_and_deinit(); + drop(proc); // this just drops the Arc, its refcount may not be zero yet + Some(permit) + } } - Err(permit) => permit, + Err(permit) => Some(permit), + }; + let it_was_us = if let Some(permit) = maybe_permit { + self.redo_process + .set(ProcessOnceCell::ManagerShutDown, permit); + true + } else { + false }; - self.redo_process - .set(ProcessOnceCell::ManagerShutDown, permit); // wait for ongoing requests to drain and the refcounts of all Arc that // we ever launched to drop to zero, which when it happens synchronously kill()s & wait()s // for the underlying process. self.launched_processes.close().await; + it_was_us } /// This type doesn't have its own background task to check for idleness: we From 163f2eaf7967bcbd1ed0694b50f38daafa36c96b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 1 Aug 2024 10:22:21 +0200 Subject: [PATCH 1320/1571] Reduce linux-raw-sys duplication (#8577) Before, we had four versions of linux-raw-sys in our dependency graph: ``` linux-raw-sys@0.1.4 linux-raw-sys@0.3.8 linux-raw-sys@0.4.13 linux-raw-sys@0.6.4 ``` now it's only two: ``` linux-raw-sys@0.4.13 linux-raw-sys@0.6.4 ``` The changes in this PR are minimal. In order to get to its state one only has to update procfs in Cargo.toml to 0.16 and do `cargo update -p tempfile -p is-terminal -p prometheus`. 
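
(Aside, not part of this change: `cargo tree --duplicates` lists the crates that still resolve to multiple versions, and the same check can be scripted against the resolved dependency graph. A rough sketch, assuming the `cargo_metadata` and `anyhow` crates as dependencies:)

```rust
use std::collections::HashMap;

// Sketch: print every crate that appears more than once in the resolved graph,
// together with the versions it resolves to.
fn main() -> anyhow::Result<()> {
    let metadata = cargo_metadata::MetadataCommand::new().exec()?;
    let mut versions: HashMap<String, Vec<String>> = HashMap::new();
    for pkg in &metadata.packages {
        versions
            .entry(pkg.name.to_string())
            .or_default()
            .push(pkg.version.to_string());
    }
    for (name, mut vers) in versions {
        if vers.len() > 1 {
            vers.sort();
            println!("{name}: {}", vers.join(", "));
        }
    }
    Ok(())
}
```
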
--- Cargo.lock | 128 +++++++++-------------------------------------------- Cargo.toml | 2 +- 2 files changed, 21 insertions(+), 109 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2186d55e9c..e2e9ca3ed8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2710,17 +2710,6 @@ version = "3.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" -[[package]] -name = "io-lifetimes" -version = "1.0.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2" -dependencies = [ - "hermit-abi", - "libc", - "windows-sys 0.48.0", -] - [[package]] name = "io-uring" version = "0.6.2" @@ -2739,14 +2728,13 @@ checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3" [[package]] name = "is-terminal" -version = "0.4.7" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f" +checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b" dependencies = [ "hermit-abi", - "io-lifetimes", - "rustix 0.37.25", - "windows-sys 0.48.0", + "libc", + "windows-sys 0.52.0", ] [[package]] @@ -2872,18 +2860,6 @@ version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" -[[package]] -name = "linux-raw-sys" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4" - -[[package]] -name = "linux-raw-sys" -version = "0.3.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" - [[package]] name = "linux-raw-sys" version = "0.4.13" @@ -3001,7 +2977,7 @@ checksum = "7c4b80445aeb08e832d87bf1830049a924cdc1d6b7ef40b6b9b365bff17bf8ec" dependencies = [ "libc", "measured", - "procfs 0.16.0", + "procfs", ] [[package]] @@ -3046,7 +3022,7 @@ dependencies = [ "measured", "measured-process", "once_cell", - "procfs 0.14.2", + "procfs", "prometheus", "rand 0.8.5", "rand_distr", @@ -3593,7 +3569,7 @@ dependencies = [ "postgres_connection", "postgres_ffi", "pq_proto", - "procfs 0.14.2", + "procfs", "rand 0.8.5", "regex", "remote_storage", @@ -4139,21 +4115,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "procfs" -version = "0.14.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1de8dacb0873f77e6aefc6d71e044761fcc68060290f5b1089fcdf84626bb69" -dependencies = [ - "bitflags 1.3.2", - "byteorder", - "chrono", - "flate2", - "hex", - "lazy_static", - "rustix 0.36.16", -] - [[package]] name = "procfs" version = "0.16.0" @@ -4161,10 +4122,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4" dependencies = [ "bitflags 2.4.1", + "chrono", + "flate2", "hex", "lazy_static", "procfs-core", - "rustix 0.38.28", + "rustix", ] [[package]] @@ -4174,14 +4137,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29" dependencies = [ "bitflags 2.4.1", + "chrono", "hex", ] [[package]] name = "prometheus" -version = "0.13.3" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"449811d15fbdf5ceb5c1144416066429cf82316e2ec8ce0c1f6f8a02e7bbcf8c" +checksum = "3d33c28a30771f7f96db69893f78b857f7450d7e0237e9c8fc6427a81bae7ed1" dependencies = [ "cfg-if", "fnv", @@ -4189,7 +4153,7 @@ dependencies = [ "libc", "memchr", "parking_lot 0.12.1", - "procfs 0.14.2", + "procfs", "thiserror", ] @@ -4943,34 +4907,6 @@ dependencies = [ "nom", ] -[[package]] -name = "rustix" -version = "0.36.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6da3636faa25820d8648e0e31c5d519bbb01f72fdf57131f0f5f7da5fed36eab" -dependencies = [ - "bitflags 1.3.2", - "errno", - "io-lifetimes", - "libc", - "linux-raw-sys 0.1.4", - "windows-sys 0.45.0", -] - -[[package]] -name = "rustix" -version = "0.37.25" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4eb579851244c2c03e7c24f501c3432bed80b8f720af1d6e5b0e0f01555a035" -dependencies = [ - "bitflags 1.3.2", - "errno", - "io-lifetimes", - "libc", - "linux-raw-sys 0.3.8", - "windows-sys 0.48.0", -] - [[package]] name = "rustix" version = "0.38.28" @@ -5973,15 +5909,15 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.5.0" +version = "3.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9fbec84f381d5795b08656e4912bec604d162bff9291d6189a78f4c8ab87998" +checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa" dependencies = [ "cfg-if", - "fastrand 1.9.0", - "redox_syscall 0.3.5", - "rustix 0.37.25", - "windows-sys 0.45.0", + "fastrand 2.0.0", + "redox_syscall 0.4.1", + "rustix", + "windows-sys 0.52.0", ] [[package]] @@ -7178,15 +7114,6 @@ dependencies = [ "windows_x86_64_msvc 0.42.2", ] -[[package]] -name = "windows-sys" -version = "0.45.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" -dependencies = [ - "windows-targets 0.42.2", -] - [[package]] name = "windows-sys" version = "0.48.0" @@ -7205,21 +7132,6 @@ dependencies = [ "windows-targets 0.52.4", ] -[[package]] -name = "windows-targets" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" -dependencies = [ - "windows_aarch64_gnullvm 0.42.2", - "windows_aarch64_msvc 0.42.2", - "windows_i686_gnu 0.42.2", - "windows_i686_msvc 0.42.2", - "windows_x86_64_gnu 0.42.2", - "windows_x86_64_gnullvm 0.42.2", - "windows_x86_64_msvc 0.42.2", -] - [[package]] name = "windows-targets" version = "0.48.0" diff --git a/Cargo.toml b/Cargo.toml index 7749378114..af1c1dfc82 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -126,7 +126,7 @@ parquet = { version = "51.0.0", default-features = false, features = ["zstd"] } parquet_derive = "51.0.0" pbkdf2 = { version = "0.12.1", features = ["simple", "std"] } pin-project-lite = "0.2" -procfs = "0.14" +procfs = "0.16" prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency prost = "0.11" rand = "0.8" From 1678dea20f2ace9543c6e4bc93ee2c323518ff30 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 1 Aug 2024 10:25:35 +0100 Subject: [PATCH 1321/1571] pageserver: add layer visibility calculation (#8511) ## Problem We recently added a "visibility" state to layers, but nothing initializes it. Part of: - #8398 ## Summary of changes - Add a dependency on `range-set-blaze`, which is used as a fast incrementally updated alternative to KeySpace. 
We could also use this to replace the internals of KeySpaceRandomAccum if we wanted to. Writing a type that does this kind of "BtreeMap & merge overlapping entries" thing isn't super complicated, but no reason to write this ourselves when there's a third party impl available. - Add a function to layermap to calculate visibilities for each layer - Add a function to Timeline to call into layermap and then apply these visibilities to the Layer objects. - Invoke the calculation during startup, after image layer creations, and when removing branches. Branch removal and image layer creation are the two ways that a layer can go from Visible to Covered. - Add unit test & benchmark for the visibility calculation - Expose `pageserver_visible_physical_size` metric, which should always be <= `pageserver_remote_physical_size`. - This metric will feed into the /v1/utilization endpoint later: the visible size indicates how much space we would like to use on this pageserver for this tenant. - When `pageserver_visible_physical_size` is greater than `pageserver_resident_physical_size`, this is a sign that the tenant has long-idle branches, which result in layers that are visible in principle, but not used in practice. This does not keep visibility hints up to date in all cases: particularly, when creating a child timeline, any previously covered layers will not get marked Visible until they are accessed. Updates after image layer creation could be implemented as more of a special case, but this would require more new code: the existing depth calculation code doesn't maintain+yield the list of deltas that would be covered by an image layer. ## Performance This operation is done rarely (at startup and at timeline deletion), so needs to be efficient but not ultra-fast. There is a new `visibility` bench that measures runtime for a synthetic 100k layers case (`sequential`) and a real layer map (`real_map`) with ~26k layers. The benchmark shows runtimes of single digit milliseconds (on a ryzen 7950). This confirms that the runtime shouldn't be a problem at startup (as we already incur S3-level latencies there), but that it's slow enough that we definitely shouldn't call it more often than necessary, and it may be worthwhile to optimize further later (things like: when removing a branch, only bother scanning layers below the branchpoint) ``` visibility/sequential time: [4.5087 ms 4.5894 ms 4.6775 ms] change: [+2.0826% +3.9097% +5.8995%] (p = 0.00 < 0.05) Performance has regressed. Found 24 outliers among 100 measurements (24.00%) 2 (2.00%) high mild 22 (22.00%) high severe min: 0/1696070, max: 93/1C0887F0 visibility/real_map time: [7.0796 ms 7.0832 ms 7.0871 ms] change: [+0.3900% +0.4505% +0.5164%] (p = 0.00 < 0.05) Change within noise threshold. Found 4 outliers among 100 measurements (4.00%) 3 (3.00%) high mild 1 (1.00%) high severe min: 0/1696070, max: 93/1C0887F0 visibility/real_map_many_branches time: [4.5285 ms 4.5355 ms 4.5434 ms] change: [-1.0012% -0.8004% -0.5969%] (p = 0.00 < 0.05) Change within noise threshold. 
``` --- Cargo.lock | 56 ++- pageserver/Cargo.toml | 1 + pageserver/benches/bench_layer_map.rs | 78 ++- pageserver/src/metrics.rs | 15 + pageserver/src/tenant.rs | 2 +- pageserver/src/tenant/layer_map.rs | 474 +++++++++++++++++- .../layer_map/historic_layer_coverage.rs | 4 + pageserver/src/tenant/storage_layer.rs | 41 +- pageserver/src/tenant/storage_layer/layer.rs | 53 +- pageserver/src/tenant/timeline.rs | 28 +- pageserver/src/tenant/timeline/compaction.rs | 39 ++ pageserver/src/tenant/timeline/delete.rs | 9 +- .../indices/mixed_workload/README.md | 7 + .../indices/mixed_workload/index_part.json | 1 + test_runner/fixtures/metrics.py | 1 + 15 files changed, 729 insertions(+), 80 deletions(-) create mode 100644 pageserver/test_data/indices/mixed_workload/README.md create mode 100644 pageserver/test_data/indices/mixed_workload/index_part.json diff --git a/Cargo.lock b/Cargo.lock index e2e9ca3ed8..dc4f0c7b81 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1418,7 +1418,7 @@ dependencies = [ "clap", "criterion-plot", "is-terminal", - "itertools", + "itertools 0.10.5", "num-traits", "once_cell", "oorandom", @@ -1439,7 +1439,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" dependencies = [ "cast", - "itertools", + "itertools 0.10.5", ] [[package]] @@ -2134,6 +2134,12 @@ dependencies = [ "slab", ] +[[package]] +name = "gen_ops" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "304de19db7028420975a296ab0fcbbc8e69438c4ed254a1e41e2a7f37d5f0e0a" + [[package]] name = "generic-array" version = "0.14.7" @@ -2746,6 +2752,15 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.6" @@ -3551,7 +3566,7 @@ dependencies = [ "humantime", "humantime-serde", "hyper 0.14.26", - "itertools", + "itertools 0.10.5", "leaky-bucket", "md5", "metrics", @@ -3571,6 +3586,7 @@ dependencies = [ "pq_proto", "procfs", "rand 0.8.5", + "range-set-blaze", "regex", "remote_storage", "reqwest 0.12.4", @@ -3621,7 +3637,7 @@ dependencies = [ "hex", "humantime", "humantime-serde", - "itertools", + "itertools 0.10.5", "postgres_ffi", "rand 0.8.5", "serde", @@ -3679,7 +3695,7 @@ dependencies = [ "hex-literal", "humantime", "humantime-serde", - "itertools", + "itertools 0.10.5", "metrics", "once_cell", "pageserver_api", @@ -4011,7 +4027,7 @@ name = "postgres_connection" version = "0.1.0" dependencies = [ "anyhow", - "itertools", + "itertools 0.10.5", "once_cell", "postgres", "tokio-postgres", @@ -4069,7 +4085,7 @@ version = "0.1.0" dependencies = [ "byteorder", "bytes", - "itertools", + "itertools 0.10.5", "pin-project-lite", "postgres-protocol", "rand 0.8.5", @@ -4175,7 +4191,7 @@ checksum = "119533552c9a7ffacc21e099c24a0ac8bb19c2a2a3f363de84cd9b844feab270" dependencies = [ "bytes", "heck 0.4.1", - "itertools", + "itertools 0.10.5", "lazy_static", "log", "multimap", @@ -4196,7 +4212,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5d2d8d10f3c6ded6da8b05b5fb3b8a5082514344d56c9f871412d29b4e075b4" dependencies = [ "anyhow", - "itertools", + "itertools 0.10.5", "proc-macro2", "quote", "syn 1.0.109", @@ -4253,7 +4269,7 @@ dependencies = [ "hyper-util", "indexmap 2.0.1", "ipnet", - "itertools", + "itertools 0.10.5", 
"lasso", "md5", "measured", @@ -4429,6 +4445,18 @@ dependencies = [ "rand_core 0.5.1", ] +[[package]] +name = "range-set-blaze" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8421b5d459262eabbe49048d362897ff3e3830b44eac6cfe341d6acb2f0f13d2" +dependencies = [ + "gen_ops", + "itertools 0.12.1", + "num-integer", + "num-traits", +] + [[package]] name = "rayon" version = "1.7.0" @@ -4597,7 +4625,7 @@ dependencies = [ "humantime", "humantime-serde", "hyper 0.14.26", - "itertools", + "itertools 0.10.5", "metrics", "once_cell", "pin-project-lite", @@ -5666,7 +5694,7 @@ dependencies = [ "hex", "humantime", "hyper 0.14.26", - "itertools", + "itertools 0.10.5", "lasso", "measured", "metrics", @@ -5732,7 +5760,7 @@ dependencies = [ "futures-util", "hex", "humantime", - "itertools", + "itertools 0.10.5", "once_cell", "pageserver", "pageserver_api", @@ -7361,7 +7389,7 @@ dependencies = [ "hmac", "hyper 0.14.26", "indexmap 1.9.3", - "itertools", + "itertools 0.10.5", "libc", "log", "memchr", diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 0d9343d643..43976250a4 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -49,6 +49,7 @@ postgres_backend.workspace = true postgres-protocol.workspace = true postgres-types.workspace = true rand.workspace = true +range-set-blaze = { version = "0.1.16", features = ["alloc"] } regex.workspace = true scopeguard.workspace = true serde.workspace = true diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index 1d02aa7709..1353e79f7c 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -1,3 +1,4 @@ +use criterion::measurement::WallTime; use pageserver::keyspace::{KeyPartitioning, KeySpace}; use pageserver::repository::Key; use pageserver::tenant::layer_map::LayerMap; @@ -15,7 +16,11 @@ use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; -use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use criterion::{black_box, criterion_group, criterion_main, BenchmarkGroup, Criterion}; + +fn fixture_path(relative: &str) -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(relative) +} fn build_layer_map(filename_dump: PathBuf) -> LayerMap { let mut layer_map = LayerMap::default(); @@ -109,7 +114,7 @@ fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning // between each test run. 
fn bench_from_captest_env(c: &mut Criterion) { // TODO consider compressing this file - let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt")); + let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt")); let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map); // Test with uniform query pattern @@ -139,7 +144,7 @@ fn bench_from_captest_env(c: &mut Criterion) { fn bench_from_real_project(c: &mut Criterion) { // Init layer map let now = Instant::now(); - let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt")); + let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt")); println!("Finished layer map init in {:?}", now.elapsed()); // Choose uniformly distributed queries @@ -242,7 +247,72 @@ fn bench_sequential(c: &mut Criterion) { group.finish(); } +fn bench_visibility_with_map( + group: &mut BenchmarkGroup, + layer_map: LayerMap, + read_points: Vec, + bench_name: &str, +) { + group.bench_function(bench_name, |b| { + b.iter(|| black_box(layer_map.get_visibility(read_points.clone()))); + }); +} + +// Benchmark using synthetic data. Arrange image layers on stacked diagonal lines. +fn bench_visibility(c: &mut Criterion) { + let mut group = c.benchmark_group("visibility"); + { + // Init layer map. Create 100_000 layers arranged in 1000 diagonal lines. + let now = Instant::now(); + let mut layer_map = LayerMap::default(); + let mut updates = layer_map.batch_update(); + for i in 0..100_000 { + let i32 = (i as u32) % 100; + let zero = Key::from_hex("000000000000000000000000000000000000").unwrap(); + let layer = PersistentLayerDesc::new_img( + TenantShardId::unsharded(TenantId::generate()), + TimelineId::generate(), + zero.add(10 * i32)..zero.add(10 * i32 + 1), + Lsn(i), + 0, + ); + updates.insert_historic(layer); + } + updates.flush(); + println!("Finished layer map init in {:?}", now.elapsed()); + + let mut read_points = Vec::new(); + for i in (0..100_000).step_by(1000) { + read_points.push(Lsn(i)); + } + + bench_visibility_with_map(&mut group, layer_map, read_points, "sequential"); + } + + { + let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt")); + let read_points = vec![Lsn(0x1C760FA190)]; + bench_visibility_with_map(&mut group, layer_map, read_points, "real_map"); + + let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt")); + let read_points = vec![ + Lsn(0x1C760FA190), + Lsn(0x000000931BEAD539), + Lsn(0x000000931BF63011), + Lsn(0x000000931B33AE68), + Lsn(0x00000038E67ABFA0), + Lsn(0x000000931B33AE68), + Lsn(0x000000914E3F38F0), + Lsn(0x000000931B33AE68), + ]; + bench_visibility_with_map(&mut group, layer_map, read_points, "real_map_many_branches"); + } + + group.finish(); +} + criterion_group!(group_1, bench_from_captest_env); criterion_group!(group_2, bench_from_real_project); criterion_group!(group_3, bench_sequential); -criterion_main!(group_1, group_2, group_3); +criterion_group!(group_4, bench_visibility); +criterion_main!(group_1, group_2, group_3, group_4); diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index ede6b41a75..cd2cd43f27 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -525,6 +525,15 @@ static RESIDENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static VISIBLE_PHYSICAL_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_visible_physical_size", + "The size of the layer files present in the pageserver's filesystem.", + 
&["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy = Lazy::new(|| { register_uint_gauge!( "pageserver_resident_physical_size_global", @@ -2204,6 +2213,7 @@ pub(crate) struct TimelineMetrics { pub(crate) layer_count_delta: UIntGauge, pub standby_horizon_gauge: IntGauge, pub resident_physical_size_gauge: UIntGauge, + pub visible_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size pub current_logical_size_gauge: UIntGauge, pub aux_file_size_gauge: IntGauge, @@ -2326,6 +2336,9 @@ impl TimelineMetrics { let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); + let visible_physical_size_gauge = VISIBLE_PHYSICAL_SIZE + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); // TODO: we shouldn't expose this metric let current_logical_size_gauge = CURRENT_LOGICAL_SIZE .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) @@ -2380,6 +2393,7 @@ impl TimelineMetrics { layer_count_delta, standby_horizon_gauge, resident_physical_size_gauge, + visible_physical_size_gauge, current_logical_size_gauge, aux_file_size_gauge, directory_entries_count_gauge, @@ -2431,6 +2445,7 @@ impl TimelineMetrics { RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get()); let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); } + let _ = VISIBLE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) { let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 0f09241d22..b9257dfbe8 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1634,7 +1634,7 @@ impl Tenant { self: Arc, timeline_id: TimelineId, ) -> Result<(), DeleteTimelineError> { - DeleteTimelineFlow::run(&self, timeline_id, false).await?; + DeleteTimelineFlow::run(&self, timeline_id).await?; Ok(()) } diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 6f150a2d5c..ba9c08f6e7 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -51,7 +51,8 @@ use crate::keyspace::KeyPartitioning; use crate::repository::Key; use crate::tenant::storage_layer::InMemoryLayer; use anyhow::Result; -use pageserver_api::keyspace::KeySpaceAccum; +use pageserver_api::keyspace::{KeySpace, KeySpaceAccum}; +use range_set_blaze::{CheckSortedDisjoint, RangeSetBlaze}; use std::collections::{HashMap, VecDeque}; use std::iter::Peekable; use std::ops::Range; @@ -61,7 +62,7 @@ use utils::lsn::Lsn; use historic_layer_coverage::BufferedHistoricLayerCoverage; pub use historic_layer_coverage::LayerKey; -use super::storage_layer::PersistentLayerDesc; +use super::storage_layer::{LayerVisibilityHint, PersistentLayerDesc}; /// /// LayerMap tracks what layers exist on a timeline. @@ -871,11 +872,183 @@ impl LayerMap { println!("End dump LayerMap"); Ok(()) } + + /// `read_points` represent the tip of a timeline and any branch points, i.e. the places + /// where we expect to serve reads. + /// + /// This function is O(N) and should be called infrequently. The caller is responsible for + /// looking up and updating the Layer objects for these layer descriptors. 
+ pub fn get_visibility( + &self, + mut read_points: Vec, + ) -> ( + Vec<(Arc, LayerVisibilityHint)>, + KeySpace, + ) { + // This is like a KeySpace, but this type is intended for efficient unions with image layer ranges, whereas + // KeySpace is intended to be composed statically and iterated over. + struct KeyShadow { + // Map of range start to range end + inner: RangeSetBlaze, + } + + impl KeyShadow { + fn new() -> Self { + Self { + inner: Default::default(), + } + } + + fn contains(&self, range: Range) -> bool { + let range_incl = range.start.to_i128()..=range.end.to_i128() - 1; + self.inner.is_superset(&RangeSetBlaze::from_sorted_disjoint( + CheckSortedDisjoint::from([range_incl]), + )) + } + + /// Add the input range to the keys covered by self. + /// + /// Return true if inserting this range covered some keys that were previously not covered + fn cover(&mut self, insert: Range) -> bool { + let range_incl = insert.start.to_i128()..=insert.end.to_i128() - 1; + self.inner.ranges_insert(range_incl) + } + + fn reset(&mut self) { + self.inner = Default::default(); + } + + fn to_keyspace(&self) -> KeySpace { + let mut accum = KeySpaceAccum::new(); + for range_incl in self.inner.ranges() { + let range = Range { + start: Key::from_i128(*range_incl.start()), + end: Key::from_i128(range_incl.end() + 1), + }; + accum.add_range(range) + } + + accum.to_keyspace() + } + } + + // The 'shadow' will be updated as we sweep through the layers: an image layer subtracts from the shadow, + // and a ReadPoint + read_points.sort_by_key(|rp| rp.0); + let mut shadow = KeyShadow::new(); + + // We will interleave all our read points and layers into a sorted collection + enum Item { + ReadPoint { lsn: Lsn }, + Layer(Arc), + } + + let mut items = Vec::with_capacity(self.historic.len() + read_points.len()); + items.extend(self.iter_historic_layers().map(Item::Layer)); + items.extend( + read_points + .into_iter() + .map(|rp| Item::ReadPoint { lsn: rp }), + ); + + // Ordering: we want to iterate like this: + // 1. Highest LSNs first + // 2. Consider images before deltas if they end at the same LSNs (images cover deltas) + // 3. Consider ReadPoints before image layers if they're at the same LSN (readpoints make that image visible) + items.sort_by_key(|item| { + std::cmp::Reverse(match item { + Item::Layer(layer) => { + if layer.is_delta() { + (Lsn(layer.get_lsn_range().end.0 - 1), 0) + } else { + (layer.image_layer_lsn(), 1) + } + } + Item::ReadPoint { lsn } => (*lsn, 2), + }) + }); + + let mut results = Vec::with_capacity(self.historic.len()); + + let mut maybe_covered_deltas: Vec> = Vec::new(); + + for item in items { + let (reached_lsn, is_readpoint) = match &item { + Item::ReadPoint { lsn } => (lsn, true), + Item::Layer(layer) => (&layer.lsn_range.start, false), + }; + maybe_covered_deltas.retain(|d| { + if *reached_lsn >= d.lsn_range.start && is_readpoint { + // We encountered a readpoint within the delta layer: it is visible + + results.push((d.clone(), LayerVisibilityHint::Visible)); + false + } else if *reached_lsn < d.lsn_range.start { + // We passed the layer's range without encountering a read point: it is not visible + results.push((d.clone(), LayerVisibilityHint::Covered)); + false + } else { + // We're still in the delta layer: continue iterating + true + } + }); + + match item { + Item::ReadPoint { lsn: _lsn } => { + // TODO: propagate the child timeline's shadow from their own run of this function, so that we don't have + // to assume that the whole key range is visible at the branch point. 
+ shadow.reset(); + } + Item::Layer(layer) => { + let visibility = if layer.is_delta() { + if shadow.contains(layer.get_key_range()) { + // If a layer isn't visible based on current state, we must defer deciding whether + // it is truly not visible until we have advanced past the delta's range: we might + // encounter another branch point within this delta layer's LSN range. + maybe_covered_deltas.push(layer); + continue; + } else { + LayerVisibilityHint::Visible + } + } else { + let modified = shadow.cover(layer.get_key_range()); + if modified { + // An image layer in a region which wasn't fully covered yet: this layer is visible, but layers below it will be covered + LayerVisibilityHint::Visible + } else { + // An image layer in a region that was already covered + LayerVisibilityHint::Covered + } + }; + + results.push((layer, visibility)); + } + } + } + + // Drain any remaining maybe_covered deltas + results.extend( + maybe_covered_deltas + .into_iter() + .map(|d| (d, LayerVisibilityHint::Covered)), + ); + + (results, shadow.to_keyspace()) + } } #[cfg(test)] mod tests { - use pageserver_api::keyspace::KeySpace; + use crate::tenant::{storage_layer::LayerName, IndexPart}; + use pageserver_api::{ + key::DBDIR_KEY, + keyspace::{KeySpace, KeySpaceRandomAccum}, + }; + use std::{collections::HashMap, path::PathBuf}; + use utils::{ + id::{TenantId, TimelineId}, + shard::TenantShardId, + }; use super::*; @@ -1002,4 +1175,299 @@ mod tests { } } } + + #[test] + fn layer_visibility_basic() { + // A simple synthetic input, as a smoke test. + let tenant_shard_id = TenantShardId::unsharded(TenantId::generate()); + let timeline_id = TimelineId::generate(); + let mut layer_map = LayerMap::default(); + let mut updates = layer_map.batch_update(); + + const FAKE_LAYER_SIZE: u64 = 1024; + + let inject_delta = |updates: &mut BatchedUpdates, + key_start: i128, + key_end: i128, + lsn_start: u64, + lsn_end: u64| { + let desc = PersistentLayerDesc::new_delta( + tenant_shard_id, + timeline_id, + Range { + start: Key::from_i128(key_start), + end: Key::from_i128(key_end), + }, + Range { + start: Lsn(lsn_start), + end: Lsn(lsn_end), + }, + 1024, + ); + updates.insert_historic(desc.clone()); + desc + }; + + let inject_image = + |updates: &mut BatchedUpdates, key_start: i128, key_end: i128, lsn: u64| { + let desc = PersistentLayerDesc::new_img( + tenant_shard_id, + timeline_id, + Range { + start: Key::from_i128(key_start), + end: Key::from_i128(key_end), + }, + Lsn(lsn), + FAKE_LAYER_SIZE, + ); + updates.insert_historic(desc.clone()); + desc + }; + + // + // Construct our scenario: the following lines go in backward-LSN order, constructing the various scenarios + // we expect to handle. You can follow these examples through in the same order as they would be processed + // by the function under test. 
+ // + + let mut read_points = vec![Lsn(1000)]; + + // A delta ahead of any image layer + let ahead_layer = inject_delta(&mut updates, 10, 20, 101, 110); + + // An image layer is visible and covers some layers beneath itself + let visible_covering_img = inject_image(&mut updates, 5, 25, 99); + + // A delta layer covered by the image layer: should be covered + let covered_delta = inject_delta(&mut updates, 10, 20, 90, 100); + + // A delta layer partially covered by an image layer: should be visible + let partially_covered_delta = inject_delta(&mut updates, 1, 7, 90, 100); + + // A delta layer not covered by an image layer: should be visible + let not_covered_delta = inject_delta(&mut updates, 1, 4, 90, 100); + + // An image layer covered by the image layer above: should be covered + let covered_image = inject_image(&mut updates, 10, 20, 89); + + // An image layer partially covered by an image layer: should be visible + let partially_covered_image = inject_image(&mut updates, 1, 7, 89); + + // An image layer not covered by an image layer: should be visible + let not_covered_image = inject_image(&mut updates, 1, 4, 89); + + // A read point: this will make subsequent layers below here visible, even if there are + // more recent layers covering them. + read_points.push(Lsn(80)); + + // A delta layer covered by an earlier image layer, but visible to a readpoint below that covering layer + let covered_delta_below_read_point = inject_delta(&mut updates, 10, 20, 70, 79); + + // A delta layer whose end LSN is covered, but where a read point is present partway through its LSN range: + // the read point should make it visible, even though its end LSN is covered + let covering_img_between_read_points = inject_image(&mut updates, 10, 20, 69); + let covered_delta_between_read_points = inject_delta(&mut updates, 10, 15, 67, 69); + read_points.push(Lsn(65)); + let covered_delta_intersects_read_point = inject_delta(&mut updates, 15, 20, 60, 69); + + let visible_img_after_last_read_point = inject_image(&mut updates, 10, 20, 65); + + updates.flush(); + + let (layer_visibilities, shadow) = layer_map.get_visibility(read_points); + let layer_visibilities = layer_visibilities.into_iter().collect::>(); + + assert_eq!( + layer_visibilities.get(&ahead_layer), + Some(&LayerVisibilityHint::Visible) + ); + assert_eq!( + layer_visibilities.get(&visible_covering_img), + Some(&LayerVisibilityHint::Visible) + ); + assert_eq!( + layer_visibilities.get(&covered_delta), + Some(&LayerVisibilityHint::Covered) + ); + assert_eq!( + layer_visibilities.get(&partially_covered_delta), + Some(&LayerVisibilityHint::Visible) + ); + assert_eq!( + layer_visibilities.get(¬_covered_delta), + Some(&LayerVisibilityHint::Visible) + ); + assert_eq!( + layer_visibilities.get(&covered_image), + Some(&LayerVisibilityHint::Covered) + ); + assert_eq!( + layer_visibilities.get(&partially_covered_image), + Some(&LayerVisibilityHint::Visible) + ); + assert_eq!( + layer_visibilities.get(¬_covered_image), + Some(&LayerVisibilityHint::Visible) + ); + assert_eq!( + layer_visibilities.get(&covered_delta_below_read_point), + Some(&LayerVisibilityHint::Visible) + ); + assert_eq!( + layer_visibilities.get(&covering_img_between_read_points), + Some(&LayerVisibilityHint::Visible) + ); + assert_eq!( + layer_visibilities.get(&covered_delta_between_read_points), + Some(&LayerVisibilityHint::Covered) + ); + assert_eq!( + layer_visibilities.get(&covered_delta_intersects_read_point), + Some(&LayerVisibilityHint::Visible) + ); + assert_eq!( + 
layer_visibilities.get(&visible_img_after_last_read_point), + Some(&LayerVisibilityHint::Visible) + ); + + // Shadow should include all the images below the last read point + let expected_shadow = KeySpace { + ranges: vec![Key::from_i128(10)..Key::from_i128(20)], + }; + assert_eq!(shadow, expected_shadow); + } + + fn fixture_path(relative: &str) -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(relative) + } + + #[test] + fn layer_visibility_realistic() { + // Load a large example layermap + let index_raw = std::fs::read_to_string(fixture_path( + "test_data/indices/mixed_workload/index_part.json", + )) + .unwrap(); + let index: IndexPart = serde_json::from_str::(&index_raw).unwrap(); + + let tenant_id = TenantId::generate(); + let tenant_shard_id = TenantShardId::unsharded(tenant_id); + let timeline_id = TimelineId::generate(); + + let mut layer_map = LayerMap::default(); + let mut updates = layer_map.batch_update(); + for (layer_name, layer_metadata) in index.layer_metadata { + let layer_desc = match layer_name { + LayerName::Image(layer_name) => PersistentLayerDesc { + key_range: layer_name.key_range.clone(), + lsn_range: layer_name.lsn_as_range(), + tenant_shard_id, + timeline_id, + is_delta: false, + file_size: layer_metadata.file_size, + }, + LayerName::Delta(layer_name) => PersistentLayerDesc { + key_range: layer_name.key_range, + lsn_range: layer_name.lsn_range, + tenant_shard_id, + timeline_id, + is_delta: true, + file_size: layer_metadata.file_size, + }, + }; + updates.insert_historic(layer_desc); + } + updates.flush(); + + let read_points = vec![index.metadata.disk_consistent_lsn()]; + let (layer_visibilities, shadow) = layer_map.get_visibility(read_points); + for (layer_desc, visibility) in &layer_visibilities { + tracing::info!("{layer_desc:?}: {visibility:?}"); + eprintln!("{layer_desc:?}: {visibility:?}"); + } + + // The shadow should be non-empty, since there were some image layers + assert!(!shadow.ranges.is_empty()); + + // At least some layers should be marked covered + assert!(layer_visibilities + .iter() + .any(|i| matches!(i.1, LayerVisibilityHint::Covered))); + + let layer_visibilities = layer_visibilities.into_iter().collect::>(); + + // Brute force validation: a layer should be marked covered if and only if there are image layers above it in LSN order which cover it + for (layer_desc, visible) in &layer_visibilities { + let mut coverage = KeySpaceRandomAccum::new(); + let mut covered_by = Vec::new(); + + for other_layer in layer_map.iter_historic_layers() { + if &other_layer == layer_desc { + continue; + } + if !other_layer.is_delta() + && other_layer.image_layer_lsn() >= Lsn(layer_desc.get_lsn_range().end.0 - 1) + && other_layer.key_range.start <= layer_desc.key_range.end + && layer_desc.key_range.start <= other_layer.key_range.end + { + coverage.add_range(other_layer.get_key_range()); + covered_by.push((*other_layer).clone()); + } + } + let coverage = coverage.to_keyspace(); + + let expect_visible = if coverage.ranges.len() == 1 + && coverage.contains(&layer_desc.key_range.start) + && coverage.contains(&Key::from_i128(layer_desc.key_range.end.to_i128() - 1)) + { + LayerVisibilityHint::Covered + } else { + LayerVisibilityHint::Visible + }; + + if expect_visible != *visible { + eprintln!( + "Layer {}..{} @ {}..{} (delta={}) is {visible:?}, should be {expect_visible:?}", + layer_desc.key_range.start, + layer_desc.key_range.end, + layer_desc.lsn_range.start, + layer_desc.lsn_range.end, + layer_desc.is_delta() + ); + if expect_visible == 
LayerVisibilityHint::Covered { + eprintln!("Covered by:"); + for other in covered_by { + eprintln!( + " {}..{} @ {}", + other.get_key_range().start, + other.get_key_range().end, + other.image_layer_lsn() + ); + } + if let Some(range) = coverage.ranges.first() { + eprintln!( + "Total coverage from contributing layers: {}..{}", + range.start, range.end + ); + } else { + eprintln!( + "Total coverage from contributing layers: {:?}", + coverage.ranges + ); + } + } + } + assert_eq!(expect_visible, *visible); + } + + // Sanity: the layer that holds latest data for the DBDIR key should always be visible + // (just using this key as a key that will always exist for any layermap fixture) + let dbdir_layer = layer_map + .search(DBDIR_KEY, index.metadata.disk_consistent_lsn()) + .unwrap(); + assert!(matches!( + layer_visibilities.get(&dbdir_layer.layer).unwrap(), + LayerVisibilityHint::Visible + )); + } } diff --git a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs index 347490c1ba..136f68bc36 100644 --- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs +++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs @@ -521,6 +521,10 @@ impl BufferedHistoricLayerCoverage { Ok(&self.historic_coverage) } + + pub(crate) fn len(&self) -> usize { + self.layers.len() + } } #[test] diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index f931341aca..4fd110359b 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -451,20 +451,14 @@ pub enum ValueReconstructResult { /// than an authoritative value, so that we do not have to update it synchronously when changing the visibility /// of layers (for example when creating a branch that makes some previously covered layers visible). It should /// be used for cache management but not for correctness-critical checks. -#[derive(Default, Debug, Clone, PartialEq, Eq)] -pub(crate) enum LayerVisibilityHint { +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum LayerVisibilityHint { /// A Visible layer might be read while serving a read, because there is not an image layer between it /// and a readable LSN (the tip of the branch or a child's branch point) Visible, /// A Covered layer probably won't be read right now, but _can_ be read in future if someone creates /// a branch or ephemeral endpoint at an LSN below the layer that covers this. - #[allow(unused)] Covered, - /// Calculating layer visibilty requires I/O, so until this has happened layers are loaded - /// in this state. Note that newly written layers may be called Visible immediately, this uninitialized - /// state is for when existing layers are constructed while loading a timeline. 
- #[default] - Uninitialized, } pub(crate) struct LayerAccessStats(std::sync::atomic::AtomicU64); @@ -626,23 +620,30 @@ impl LayerAccessStats { } } - pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) { - let value = match visibility { - LayerVisibilityHint::Visible => 0x1 << Self::VISIBILITY_SHIFT, - LayerVisibilityHint::Covered | LayerVisibilityHint::Uninitialized => 0x0, - }; - - self.write_bits(0x1 << Self::VISIBILITY_SHIFT, value); - } - - pub(crate) fn visibility(&self) -> LayerVisibilityHint { - let read = self.0.load(std::sync::atomic::Ordering::Relaxed); - match (read >> Self::VISIBILITY_SHIFT) & 0x1 { + /// Helper for extracting the visibility hint from the literal value of our inner u64 + fn decode_visibility(&self, bits: u64) -> LayerVisibilityHint { + match (bits >> Self::VISIBILITY_SHIFT) & 0x1 { 1 => LayerVisibilityHint::Visible, 0 => LayerVisibilityHint::Covered, _ => unreachable!(), } } + + /// Returns the old value which has been replaced + pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) -> LayerVisibilityHint { + let value = match visibility { + LayerVisibilityHint::Visible => 0x1 << Self::VISIBILITY_SHIFT, + LayerVisibilityHint::Covered => 0x0, + }; + + let old_bits = self.write_bits(0x1 << Self::VISIBILITY_SHIFT, value); + self.decode_visibility(old_bits) + } + + pub(crate) fn visibility(&self) -> LayerVisibilityHint { + let read = self.0.load(std::sync::atomic::Ordering::Relaxed); + self.decode_visibility(read) + } } /// Get a layer descriptor from a layer. diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 1075feb1d1..5732779e44 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -24,7 +24,8 @@ use super::delta_layer::{self, DeltaEntry}; use super::image_layer::{self}; use super::{ AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName, - PersistentLayerDesc, ValueReconstructResult, ValueReconstructState, ValuesReconstructState, + LayerVisibilityHint, PersistentLayerDesc, ValueReconstructResult, ValueReconstructState, + ValuesReconstructState, }; use utils::generation::Generation; @@ -246,7 +247,7 @@ impl Layer { &timeline.generation, ); - let layer = LayerInner::new( + LayerInner::new( conf, timeline, local_path, @@ -254,14 +255,7 @@ impl Layer { Some(inner), timeline.generation, timeline.get_shard_index(), - ); - - // Newly created layers are marked visible by default: the usual case is that they were created to be read. 
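
The LayerAccessStats change above folds the visibility hint into a single bit of the struct's existing AtomicU64, shares one decode helper between the getter and the setter, and makes set_visibility return the previous hint so callers can react to transitions. A minimal sketch of that bit-packing pattern, assuming bit 0 holds the flag and a simplified write_bits helper (hypothetical names; the real struct keeps other access-stats fields in the same word):

use std::sync::atomic::{AtomicU64, Ordering};

#[derive(Debug, Clone, PartialEq, Eq)]
enum VisibilityHint {
    Visible,
    Covered,
}

struct AccessStats(AtomicU64);

impl AccessStats {
    // Assumed flag position for this sketch.
    const VISIBILITY_SHIFT: u32 = 0;

    /// Replace the bits selected by `mask` with `value`, returning the old word.
    fn write_bits(&self, mask: u64, value: u64) -> u64 {
        self.0
            .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |old| {
                Some((old & !mask) | (value & mask))
            })
            .expect("the closure always returns Some")
    }

    fn decode_visibility(bits: u64) -> VisibilityHint {
        match (bits >> Self::VISIBILITY_SHIFT) & 0x1 {
            1 => VisibilityHint::Visible,
            0 => VisibilityHint::Covered,
            _ => unreachable!(),
        }
    }

    /// Returns the previous hint, so a caller can tell whether anything changed.
    fn set_visibility(&self, visibility: VisibilityHint) -> VisibilityHint {
        let value = match visibility {
            VisibilityHint::Visible => 0x1 << Self::VISIBILITY_SHIFT,
            VisibilityHint::Covered => 0x0,
        };
        let old = self.write_bits(0x1 << Self::VISIBILITY_SHIFT, value);
        Self::decode_visibility(old)
    }

    fn visibility(&self) -> VisibilityHint {
        Self::decode_visibility(self.0.load(Ordering::Relaxed))
    }
}

Returning the old value from set_visibility is what lets the Layer::set_visibility added further down in this patch adjust the visible-size gauge only on actual Visible/Covered transitions.
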
- layer - .access_stats - .set_visibility(super::LayerVisibilityHint::Visible); - - layer + ) })); let downloaded = resident.expect("just initialized"); @@ -493,6 +487,32 @@ impl Layer { } } } + + pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) { + let old_visibility = self.access_stats().set_visibility(visibility.clone()); + use LayerVisibilityHint::*; + match (old_visibility, visibility) { + (Visible, Covered) => { + // Subtract this layer's contribution to the visible size metric + if let Some(tl) = self.0.timeline.upgrade() { + tl.metrics + .visible_physical_size_gauge + .sub(self.0.desc.file_size) + } + } + (Covered, Visible) => { + // Add this layer's contribution to the visible size metric + if let Some(tl) = self.0.timeline.upgrade() { + tl.metrics + .visible_physical_size_gauge + .add(self.0.desc.file_size) + } + } + (Covered, Covered) | (Visible, Visible) => { + // no change + } + } + } } /// The download-ness ([`DownloadedLayer`]) can be either resident or wanted evicted. @@ -693,6 +713,13 @@ impl Drop for LayerInner { timeline.metrics.layer_count_image.dec(); timeline.metrics.layer_size_image.sub(self.desc.file_size); } + + if matches!(self.access_stats.visibility(), LayerVisibilityHint::Visible) { + timeline + .metrics + .visible_physical_size_gauge + .sub(self.desc.file_size); + } } if !*self.wanted_deleted.get_mut() { @@ -801,6 +828,12 @@ impl LayerInner { timeline.metrics.layer_size_image.add(desc.file_size); } + // New layers are visible by default. This metric is later updated on drop or in set_visibility + timeline + .metrics + .visible_physical_size_gauge + .add(desc.file_size); + LayerInner { conf, debug_str: { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 3a7353c138..37ebeded66 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2736,6 +2736,10 @@ impl Timeline { // Tenant::create_timeline will wait for these uploads to happen before returning, or // on retry. + // Now that we have the full layer map, we may calculate the visibility of layers within it (a global scan) + drop(guard); // drop write lock, update_layer_visibility will take a read lock. + self.update_layer_visibility().await; + info!( "loaded layer map with {} layers at {}, total physical size: {}", num_layers, disk_consistent_lsn, total_physical_size @@ -4677,27 +4681,6 @@ impl Timeline { } } - // The writer.finish() above already did the fsync of the inodes. - // We just need to fsync the directory in which these inodes are linked, - // which we know to be the timeline directory. - if !image_layers.is_empty() { - // We use fatal_err() below because the after writer.finish() returns with success, - // the in-memory state of the filesystem already has the layer file in its final place, - // and subsequent pageserver code could think it's durable while it really isn't. 
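
The metric bookkeeping above keeps visible_physical_size_gauge equal to the total size of layers currently hinted Visible: a new layer is counted when it is constructed, its contribution moves only on a Visible/Covered transition, and it is given back on drop if the layer was still Visible. A compact sketch of that accounting under stated assumptions (Gauge, Hint, and TrackedLayer are hypothetical stand-ins, not the pageserver's metrics types):

use std::sync::atomic::{AtomicU64, Ordering};

#[derive(Default)]
struct Gauge(AtomicU64);

impl Gauge {
    fn add(&self, v: u64) {
        self.0.fetch_add(v, Ordering::Relaxed);
    }
    fn sub(&self, v: u64) {
        self.0.fetch_sub(v, Ordering::Relaxed);
    }
    fn get(&self) -> u64 {
        self.0.load(Ordering::Relaxed)
    }
}

#[derive(Clone, Copy, PartialEq)]
enum Hint {
    Visible,
    Covered,
}

struct TrackedLayer<'a> {
    file_size: u64,
    hint: Hint,
    gauge: &'a Gauge,
}

impl<'a> TrackedLayer<'a> {
    /// New layers start out Visible, so they are counted immediately.
    fn new(file_size: u64, gauge: &'a Gauge) -> Self {
        gauge.add(file_size);
        TrackedLayer {
            file_size,
            hint: Hint::Visible,
            gauge,
        }
    }

    /// Only a transition moves the gauge; re-setting the same hint is a no-op.
    fn set_visibility(&mut self, new: Hint) {
        match (self.hint, new) {
            (Hint::Visible, Hint::Covered) => self.gauge.sub(self.file_size),
            (Hint::Covered, Hint::Visible) => self.gauge.add(self.file_size),
            _ => {}
        }
        self.hint = new;
    }
}

impl Drop for TrackedLayer<'_> {
    fn drop(&mut self) {
        // A layer that is still Visible when it goes away returns its contribution.
        if self.hint == Hint::Visible {
            self.gauge.sub(self.file_size);
        }
    }
}

Because the same-state arms are no-ops, the visibility pass can be re-run freely (for example after each round of image layer creation) without the gauge drifting.
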
- let timeline_dir = VirtualFile::open( - &self - .conf - .timeline_path(&self.tenant_shard_id, &self.timeline_id), - ctx, - ) - .await - .fatal_err("VirtualFile::open for timeline dir fsync"); - timeline_dir - .sync_all() - .await - .fatal_err("VirtualFile::sync_all timeline dir"); - } - let mut guard = self.layers.write().await; // FIXME: we could add the images to be uploaded *before* returning from here, but right @@ -4706,6 +4689,9 @@ impl Timeline { drop_wlock(guard); timer.stop_and_record(); + // Creating image layers may have caused some previously visible layers to be covered + self.update_layer_visibility().await; + Ok(image_layers) } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 5e9ff1c9e4..4fe9bbafab 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -443,6 +443,45 @@ impl Timeline { Ok(()) } + /// Update the LayerVisibilityHint of layers covered by image layers, based on whether there is + /// an image layer between them and the most recent readable LSN (branch point or tip of timeline). The + /// purpose of the visibility hint is to record which layers need to be available to service reads. + /// + /// The result may be used as an input to eviction and secondary downloads to de-prioritize layers + /// that we know won't be needed for reads. + pub(super) async fn update_layer_visibility(&self) { + let head_lsn = self.get_last_record_lsn(); + + // We will sweep through layers in reverse-LSN order. We only do historic layers. L0 deltas + // are implicitly left visible, because LayerVisibilityHint's default is Visible, and we never modify it here. + // Note that L0 deltas _can_ be covered by image layers, but we consider them 'visible' because we anticipate that + // they will be subject to L0->L1 compaction in the near future. + let layer_manager = self.layers.read().await; + let layer_map = layer_manager.layer_map(); + + let readable_points = { + let children = self.gc_info.read().unwrap().retain_lsns.clone(); + + let mut readable_points = Vec::with_capacity(children.len() + 1); + for (child_lsn, _child_timeline_id) in &children { + readable_points.push(*child_lsn); + } + readable_points.push(head_lsn); + readable_points + }; + + let (layer_visibility, covered) = layer_map.get_visibility(readable_points); + for (layer_desc, visibility) in layer_visibility { + // FIXME: a more efficiency bulk zip() through the layers rather than NlogN getting each one + let layer = layer_manager.get_from_desc(&layer_desc); + layer.set_visibility(visibility); + } + + // TODO: publish our covered KeySpace to our parent, so that when they update their visibility, they can + // avoid assuming that everything at a branch point is visible. + drop(covered); + } + /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as /// as Level 1 files. Returns whether the L0 layers are fully compacted. async fn compact_level0( diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index ab6a5f20ba..9b2403f899 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -206,11 +206,10 @@ impl DeleteTimelineFlow { // NB: If this fails half-way through, and is retried, the retry will go through // all the same steps again. Make sure the code here is idempotent, and don't // error out if some of the shutdown tasks have already been completed! 
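
update_layer_visibility above derives its read points from the retain_lsns recorded in gc_info (one per child branch) plus the branch head, hands them to the layer map's get_visibility, and then writes the resulting hint onto each layer; the covered keyspace it also receives is currently dropped (hence the TODO about publishing it to the parent). A rough sketch of just that control flow, where LayerMapLike, Hint, and update_visibility are illustrative stand-ins rather than the pageserver's API:

type Lsn = u64;

#[derive(Debug, Clone, PartialEq)]
enum Hint {
    Visible,
    Covered,
}

trait LayerMapLike {
    type LayerId;
    /// Returns one hint per historic layer for the given readable points.
    fn get_visibility(&self, readable_points: Vec<Lsn>) -> Vec<(Self::LayerId, Hint)>;
}

fn update_visibility<M: LayerMapLike>(
    layer_map: &M,
    child_branch_points: &[Lsn],
    head_lsn: Lsn,
    mut apply: impl FnMut(M::LayerId, Hint),
) {
    // Everything readable at a child branch point or at the tip of this
    // timeline counts as a read point for the visibility calculation.
    let mut readable_points = Vec::with_capacity(child_branch_points.len() + 1);
    readable_points.extend_from_slice(child_branch_points);
    readable_points.push(head_lsn);

    for (layer, hint) in layer_map.get_visibility(readable_points) {
        apply(layer, hint);
    }
}
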
- #[instrument(skip_all, fields(%inplace))] + #[instrument(skip_all)] pub async fn run( tenant: &Arc, timeline_id: TimelineId, - inplace: bool, ) -> Result<(), DeleteTimelineError> { super::debug_assert_current_span_has_tenant_and_timeline_id(); @@ -235,11 +234,7 @@ impl DeleteTimelineFlow { ))? }); - if inplace { - Self::background(guard, tenant.conf, tenant, &timeline).await? - } else { - Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline); - } + Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline); Ok(()) } diff --git a/pageserver/test_data/indices/mixed_workload/README.md b/pageserver/test_data/indices/mixed_workload/README.md new file mode 100644 index 0000000000..724274fcd9 --- /dev/null +++ b/pageserver/test_data/indices/mixed_workload/README.md @@ -0,0 +1,7 @@ + +# This was captured from one shard of a large tenant in staging. + +# It has a mixture of deltas and image layers, >1000 layers in total. + +# This is suitable for general smoke tests that want an index which is not +# trivially small, but doesn't contain weird/pathological cases. diff --git a/pageserver/test_data/indices/mixed_workload/index_part.json b/pageserver/test_data/indices/mixed_workload/index_part.json new file mode 100644 index 0000000000..cb4bfc4726 --- /dev/null +++ b/pageserver/test_data/indices/mixed_workload/index_part.json @@ -0,0 +1 @@ +{"version":7,"layer_metadata":{"000000067F00004005000060F300069883DB-000000067F00004005000060F300069D13FA__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300039A4000-000000067F00004005000060F300039C0000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300039FC000-000000067F00004005000060F30003A0F066__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB43000082C0F1-000000067F000040050081DB43000086E169__000000A583FBFB91-000000A9EB8C4489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000478000-000000067F00004005000060F3000047C000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000012C000-000000067F00004005000060F300001F0000__0000018624969468":{"file_size":134422528,"generation":7,"shard":"0008"},"000000067F00004005000060F700019E8000-000000067F00004005000060F700019EC000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300018E0FE6-000000067F00004005000060F3000193A10B__00000075CC373F31-00000079F2A2F311":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016E85370000004000-030000000000000000000000000000000002__0000018613F0A050":{"file_size":14172160,"generation":3,"shard":"0008"},"000000067F00004005000060F300034847BD-000000067F00004005000060F300034BD86C__000000EBC9213D59-000000EFA7EAA9E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000C80000-000000067F000040050081DB430000C84000__000000BDAFECFC00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100000CCBA0-000000067F00004005000060F20100000000__0000000D80565628":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000CA4000-000000067F00004005016EA00C0000CE0000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060FB00013BC000-000000067F00004005000060FB0001400000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008
"},"000000067F00004005016EA00C0001240000-000000067F00004005016EA00C0001244000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30004EC52E9-000000067F00004005000060F30004F1638A__000001440D3D0C69-0000014784964B91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000E10000-000000067F000040050081DB430000E14000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000007F0F-000000067F0000400500EB4A480000037E20__000000F309FCDD19-000000F6661C9241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30004FE8000-000000067F00004005000060F3000502905D__0000014784964B91-0000014B000D1821":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB43000072C000-000000067F000040050081DB430000768000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005E3B48F-000000067F00004005000060F30005EF454F__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500E3A2A100000B7E04-030000000000000000000000000000000002__000000E7C2F1B249-000000EBC9213D59":{"file_size":30146560,"generation":2,"shard":"0008"},"000000067F0000400501025D90000009029B-000000067F0000400501025D950100000000__0000011B688FEDC8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A10000-000000067F000040050081DB430000A14000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002F5105E-000000067F00004005000060F30002F9A0EB__000000D74E29AAD1-000000DBBFA87AE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000187FE22-000000067F000040050081D80C0100000000__00000075E5D2A930":{"file_size":59138048,"generation":2,"shard":"0008"},"000000067F000040050081DB4300001E8000-000000067F000040050081DB4300001EC000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000184C000-000000067F00004005000060FB000187FE22__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005A16504-000000067F00004005000060F30005A57691__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F100005C0000-000000067F00004005000060F100005C821A__000000601F43CF09-000000636DE92159":{"file_size":268451840,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-000000067F00004005000060F00300000000__000001BCB572A4E0":{"file_size":2310144,"generation":17,"shard":"0008"},"000000067F00004005000060F30002214000-000000067F00004005000060F30002264247__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500E3A2A10000110000-000000067F0000400500E3A2A10000114000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006864000-000000067F00004005000060F30006868000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500DBCED500000D0000-000000067F0000400500DBCED500000D4000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000274C000-000000067F00004005000060F30002790000__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00009274AB-030000000000000000000000000000000002__000001935283F9B9-00000196C9018F59":{"file_size":60104704,"generation":11,"shard":"0008"},"
000000067F0000400500C782E4000023D359-000000067F0000400500C782E400002A5E4B__000000D31E48D7C9-000000D74E29AAD1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001780DB7-000000067F00004005000060F700017E1391__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F000040050081DB4300004E4000-000000067F000040050081DB4300004F8000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00018C0000-000000067F00004005016EA00C00018C4000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F300056DC000-000000067F00004005000060F300056E0000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001F14230-000000067F000040050081D80C0100000000__0000018613F0A050":{"file_size":59138048,"generation":3,"shard":"0008"},"000000067F00004005010F9F120000004000-030000000000000000000000000000000002__0000012E77D3BF00":{"file_size":105775104,"generation":2,"shard":"0008"},"000000067F00004005000060F30002D80000-000000067F00004005000060F30002D84000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000122BBF-000000067F00004005000060F7000013B18E__000000114A805939-00000013FB921C81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002B10000-000000067F00004005000060F30002B88FF2__000000C462B3C2A9-000000C824C09619":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006320C60-000000067F00004005000060F30006349DA2__0000016E1FBB7B99-000001715E483C79":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000079E393-000000067F00004005016EA00C00009BF728__00000196C9018F59-0000019A2EAFE7A9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500F67839000005C000-000000067F0000400500F67839000006AEF4__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001D7F71A-030000000000000000000000000000000002__000001BA93C39481-000001BCB572A4E1":{"file_size":50880512,"generation":17,"shard":"0008"},"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000001BCB572C481-000001BCB572C5D9":{"file_size":24576,"generation":20,"shard":"0008"},"000000067F00004005000060F70001570000-000000067F00004005000060F70001574000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000042C000-000000067F00004005000060F30000478000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000001BCB572C5D9-000001BCB572DFF9":{"file_size":24576,"generation":22,"shard":"0008"},"000000067F00004005000060FB00015FCD31-030000000000000000000000000000000002__000000698F2C3A38":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F30005C841ED-000000067F00004005000060F30005C95225__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30001B4A119-000000067F00004005000060F30100000000__0000008196C976A1-0000008625CF2891":{"file_size":200990720,"generation":2,"shard":"0008"},"000000067F00004005000060F300019790A2-000000067F00004005000060F300019C2056__00000079F2A2F311-0000007E3A9BFD29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001838000-000000067F00004005000060FB000183C000__00000075E5D2A930":{"file_siz
e":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001C00FE1-000000067F00004005000060F30001C0A0A3__0000008625CF2891-00000089F4693119":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300056E0000-000000067F00004005000060F300056E4000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000BBD532-000000067F00004005000060F80100000000__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":96477184,"generation":2,"shard":"0008"},"000000067F00004005000060F30000F9B026-000000067F00004005000060F30100000000__00000047E31D98D1-0000004C49155071":{"file_size":173834240,"generation":2,"shard":"0008"},"000000067F000040050081DB430000500000-000000067F000040050081DB430000504000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004971675-000000067F00004005000060F300049B26A8__000001398B56A519-0000013C9C0E3339":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003102107-000000067F00004005000060F300031130BC__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300048A4000-000000067F00004005000060F30004900000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00004B8000-000000067F00004005016EA00C00004BC000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001A71688-000000067F00004005000060FB0001A8A1CD__0000007E3A9BFD29-0000008196C976A1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E60000-000000067F00004005000060F30000E64000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300023B0FF7-000000067F00004005000060F300024020ED__000000A9EB8C4489-000000ACA44C8E99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00003F8000-000000067F00004005016EA00C00003FC000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30004B2B250-000000067F00004005000060F30004B5431C__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000050000-000000067F00004005000060F700000885C5__000000044854EBD1-00000008B6B51879":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060FB000097168A-030000000000000000000000000000000002__00000028C365FBE1-0000002D2A8E0B81":{"file_size":120299520,"generation":2,"shard":"0008"},"000000067F00004005000060F3000625C000-000000067F00004005000060F30006270000__0000017171761D90":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001BA8000-000000067F00004005000060FB0001BC0B44__0000008625CF2891-00000089F4693119":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003344134-000000067F00004005000060F3000336D193__000000E4C63CFA21-000000E7C2F1B249":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006B10FFF-000000067F00004005000060F30006B22072__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006E34000-000000067F00004005000060F30006E70000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F6000008238C-000000067F00004005000060F60100000000__00000139CF156B58":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000A30000
-000000067F00004005000060F70100000000__0000009DF02C1241-000000A173C00489":{"file_size":269688832,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001CE16ED-000000067F000040050081D80C0100000000__0000008DDCD70B68":{"file_size":59138048,"generation":2,"shard":"0008"},"000000067F000040050081DB4300011B0000-000000067F000040050081DB4300011B4000__000000DBD29DC248":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C000010C0D1-000000067F0000400500F3A25C000011E137__000001048B25A8E9-0000010779A7F551":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000004000-000000067F00004005000060F70000029ED0__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F00004005000060F60000058F73-000000067F00004005000060F60100000000__000000E4D847F4E0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001C3F636-000000067F00004005016EA00C0001CC74D7__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500EB4A480000101089-000000067F0000400500EB4A48000012798C__000000F6661C9241-000000F901689359":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300007A8000-000000067F000040050081DB4300007AC000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F1000010043F-000000067F00004005000060F20100000000__0000000D55A212C9-000000114A805939":{"file_size":182878208,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001EAC000-000000067F00004005000060FB0001F14230__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000616F6B2-000000067F00004005000060F300061B8705__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005C9E3C4-000000067F00004005000060F30005CCF3C5__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001AA0000-000000067F00004005000060F70001AB05CB__0000015304A396B9-0000015670D6AFD9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F3000073C000-000000067F00004005000060F30000775A02__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300003AE21D-000000067F000040050081DB43000045029C__0000008DBE2855F9-000000923719A971":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001B04000-000000067F00004005000060F70001B18000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E74000-000000067F00004005000060F30000E78000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F7000182C000-000000067F00004005000060F700018871D6__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000DE8B45-000000067F00004005000060FB0000DF968A__000000417D21ACF9-00000044B4679349":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E78000-000000067F00004005000060F30000E7C000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000140C000-030000000000000000000000000000000002__000000603CA8F2F0":{"file_size":89522176,"generation":2,"shard":"0008"},"000000067F00004005000060FB00011CA1CD-000000067F00004005000060FB00011F2D11__0000005413AB3641-00000057593D8169":{"file_size":268451840,"generation":2,"shard":"0008"},"00000
0067F00004005016EA00C000144FB4E-000000067F00004005016EA00C00014B79E7__000001A931C135B1-000001AC25760149":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F700015A195C-000000067F00004005000060F80100000000__0000012E77D3BF00":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000FC0000-000000067F00004005000060F70000FC4000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A48000012798C-000000067F0000400500EB4A48000013F89B__000000F6661C9241-000000F901689359":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001CE4000-000000067F00004005016EA00C0001D18000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005000060F30005FC519A-000000067F00004005000060F30005FE621A__0000016834A3FC91-0000016B49A934C1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000370000-000000067F00004005016EA00C0000374000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0001760000-000000067F00004005016EA00C0001764000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F100003A0000-000000067F00004005000060F100003B8214__0000003D03FCCDB9-000000417D21ACF9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300006B0000-000000067F00004005000060F300006B4000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00004E1FF6-030000000000000000000000000000000002__000000174479FC18":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F3000502905D-000000067F00004005000060F300050321C0__0000014784964B91-0000014B000D1821":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001AB05CB-000000067F00004005000060F70001AB8B97__0000015304A396B9-0000015670D6AFD9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000151F7C5-000000067F00004005016EA00C000158F667__000001AC25760149-000001AFC313C819":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F70000B9C000-000000067F00004005000060F80100000000__000000AFE87558B0":{"file_size":83533824,"generation":2,"shard":"0008"},"000000067F00004005000060F7000141882A-000000067F00004005000060F80100000000__00000122E1129DA0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400500EB4A48000018F5CD-000000067F0000400500EB4A48000019F4DD__000000F6661C9241-000000F901689359":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000196C000-000000067F00004005000060F70001990000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300029C623C-000000067F00004005000060F30100000000__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":81313792,"generation":2,"shard":"0008"},"000000067F00004005000060F300027C0000-000000067F00004005000060F300027C4000__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500FB3D300000001487-000000067F0000400500FB3D300100000000__0000010FB1BE19B9-00000113456156F1":{"file_size":24428544,"generation":2,"shard":"0008"},"000000067F00004005000060F300056D8000-000000067F00004005000060F300056DC000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700003C0000-000000067F00004005000060F700003C4000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"000
8"},"000000067F00004005000060F3000664E3CA-000000067F00004005000060F30100000000__000001715E483C79-000001751A7D7589":{"file_size":288645120,"generation":2,"shard":"0008"},"000000067F000040050100D04D000004B5AD-000000067F000040050100D04D00000634BB__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500DBCED5000002C000-000000067F0000400500DBCED50000078000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000C20000-000000067F00004005016EA00C0000C24000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F70001B30000-000000067F00004005000060F70001B34000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700009C035C-000000067F00004005000060F80100000000__0000009A1ABDE921-0000009DF02C1241":{"file_size":264159232,"generation":2,"shard":"0008"},"000000067F00004005000060F30003B33945-000000067F00004005000060F30100000000__0000010FB1BE19B9-00000113456156F1":{"file_size":155344896,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000079FCFA-000000067F00004005016EA00C00007C7B9C__000001935283F9B9-00000196C9018F59":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500EB4A480000218000-000000067F0000400500EB4A48000021C000__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005D1D0DC-000000067F00004005000060F30005D76250__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000149B774-000000067F00004005000060FB00014A42B8__000000601F43CF09-000000636DE92159":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003D0B155-000000067F00004005000060F30003D14206__00000117EDA82C11-0000011B632CC319":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300020FC052-000000067F00004005000060F300021050B0__0000009DF02C1241-000000A173C00489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002268000-000000067F00004005000060F300022B9050__000000A583FBFB91-000000A9EB8C4489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300004FC000-000000067F000040050081DB430000500000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300060A93B5-000000067F00004005000060F300060C2210__0000016834A3FC91-0000016B49A934C1":{"file_size":263479296,"generation":2,"shard":"0008"},"000000067F00004005000060F3000674C000-000000067F00004005000060F30006798000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300007F913A-030000000000000000000000000000000002__000000A5A3F27398":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F0000400500DBCED500000F4000-030000000000000000000000000000000002__000000E4D847F4E0":{"file_size":103907328,"generation":2,"shard":"0008"},"000000067F00004005000060F70001348000-000000067F00004005000060F70100000000__0000011B632CC319-0000011F1A40FA69":{"file_size":270753792,"generation":2,"shard":"0008"},"000000067F00004005000060F10000030000-000000067F00004005000060F20100000000__000000021DC73119-000000044854EBD1":{"file_size":267771904,"generation":2,"shard":"0008"},"000000067F000040050107B54701FFFFFFFF-000000067F000040050107B5470300000000__0000011F1A40FA69-00000122A7BB7B29":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30006674000-000000067F00004005000060
F30006690000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050107B54701FFFFFFFF-000000067F000040050107B5470300000000__0000011B632CC319-0000011F1A40FA69":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30000298000-000000067F00004005000060F3000029C000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000F185D4-000000067F00004005000060F80100000000__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":249135104,"generation":2,"shard":"0008"},"000000067F00004005000060F300049CB712-000000067F00004005000060F30004A048A8__000001398B56A519-0000013C9C0E3339":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700004B1E77-000000067F00004005000060F80100000000__00000047F1F2B800":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30004B00000-000000067F00004005000060F30004B1111A__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006D14000-000000067F00004005000060F30006D30000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00002D77AE-030000000000000000000000000000000002__000001880F984A29-0000018C496B6DB1":{"file_size":81018880,"generation":11,"shard":"0008"},"000000067F00004005000060F300002D0000-000000067F00004005000060F30000370FD1__0000000D55A212C9-000000114A805939":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D69D790000028000-000000067F0000400500D69D79000002C000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002170000-000000067F00004005000060F30002174000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000F59017-000000067F00004005000060F30000F91FFF__00000047E31D98D1-0000004C49155071":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F6000006A37A-000000067F00004005000060F60100000000__000001180B3FF408":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F6000002F012-000000067F00004005000060F60100000000__00000081AA3C40F0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30005614000-000000067F00004005000060F30005688000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300036C8000-000000067F00004005000060F300036F91FE__000000FCCD5238B1-000000FF8B261599":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001ADF63C-030000000000000000000000000000000002__000001B3E1B95181-000001B6FFE46BC9":{"file_size":64421888,"generation":11,"shard":"0008"},"000000067F0000400500EB4A480000057D31-000000067F0000400500EB4A48000008FC41__000000F309FCDD19-000000F6661C9241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000F58000-000000067F00004005016EA00C0000F5C000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F000040050081DB430000908000-000000067F000040050081DB43000094A076__000000A9EB8C4489-000000ACA44C8E99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000471200E-000000067F00004005000060F3000474302B__000001334140FC21-00000137115BE4D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300000403DA-030000000000000000000000000000000002__00000075E5D2A930":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F6
0000079C4E-000000067F00004005000060F60100000000__0000012E77D3BF00":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400500F67839000003C000-000000067F0000400500F678390000058000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001C80000-000000067F00004005000060FB0001C84000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300047F5138-000000067F00004005000060F3000480620C__000001334140FC21-00000137115BE4D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006B5C09E-000000067F00004005000060F30006BAD108__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001410F57-000000067F00004005000060F70001429534__00000122A7BB7B29-0000012694E36301":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00006B4000-000000067F00004005016EA00C00006E0000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F700009605D8-000000067F00004005000060F80100000000__000000923719A971-00000096262826C9":{"file_size":251338752,"generation":2,"shard":"0008"},"000000067F00004005000060F70000C8CD0C-000000067F00004005000060F80100000000__000000BAC0041E18":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F700012B8000-000000067F00004005000060F80100000000__00000113456156F1-00000117EDA82C11":{"file_size":265781248,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000049C000-000000067F00004005016EA00C00004A8000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F70000C78000-000000067F00004005000060F70000C7C000__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006B4B0BB-000000067F00004005000060F30006B5C09E__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001844000-000000067F00004005000060FB0001848000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300067F0000-000000067F00004005000060F300067F4000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004C80000-000000067F00004005000060F30004C84000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002A4C000-000000067F00004005000060F30002A98000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002480000-000000067F00004005000060F30002484000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000306A02D-000000067F00004005000060F30100000000__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":191299584,"generation":2,"shard":"0008"},"000000067F00004005000060F70001510000-000000067F00004005000060F70001514000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005BDB15B-000000067F00004005000060F30005C841ED__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001E98000-000000067F00004005000060FB0001E9C000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300057942F4-000000067F00004005000060F300057DD292__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005
698000-000000067F00004005000060F3000569C000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002983166-000000067F00004005000060F3000299C28F__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000C24000-000000067F00004005016EA00C0000CA0000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F300033D7D7C-000000067F00004005000060F30003458D42__000000E7C2F1B249-000000EBC9213D59":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A1C000-000000067F000040050081DB430000A30379__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002D93639-000000067F00004005000060F50100000000__000000D037B2DBD0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000029C195-000000067F00004005016EA00C000029C196__000001BA93C39481-000001BCB572A4E1":{"file_size":32768,"generation":17,"shard":"0008"},"000000067F00004005000060F30000A5F9BB-000000067F00004005000060F60100000000__000000321AA80270":{"file_size":81657856,"generation":2,"shard":"0008"},"000000067F00004005000060F30002D84000-000000067F00004005000060F30002D93639__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005D1C000-000000067F00004005000060F30005D70000__000001684518AF20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300010C8000-000000067F000040050081DB4300010E2072__000000D01F399709-000000D31E48D7C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB43000058AF5E-000000067F000040050081DB4300005BCFD7__0000009A1ABDE921-0000009DF02C1241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000034611E-000000067F00004005000060F80100000000__000000321AA80270":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F300000C1095-000000067F00004005000060F60100000000__000000021DC73119-000000044854EBD1":{"file_size":220635136,"generation":2,"shard":"0008"},"000000067F00004005000060FB000183C000-000000067F00004005000060FB0001840000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006C8729E-000000067F00004005000060F30006C98340__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005138000-000000067F00004005000060F3000513C000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300053E30C3-000000067F00004005000060F300053F40CC__0000014EC58A4A79-0000015304A396B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB43000002C000-000000067F000040050081DB4300000403DA__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004970000-000000067F00004005000060F30004974000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003C08000-000000067F00004005000060F30003C0C000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000103AD12-000000067F00004005000060FB000104B856__0000004C49155071-0000004F31878919":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00004AC000-000000067F00004005016EA00C00004B8000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000DB7D33-0
00000067F00004005016EA00C0000E47BD2__0000019E2C5DCEE1-000001A1DD8B4481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30001F30000-000000067F00004005000060F30001F34000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050109FFA2000000C000-030000000000000000000000000000000002__000001180B3FF408":{"file_size":70516736,"generation":2,"shard":"0008"},"000000067F00004005000060F700017405D4-000000067F00004005000060F70001758B92__000001398B56A519-0000013C9C0E3339":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300030B0000-000000067F00004005000060F300030C0FE5__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005010660F501FFFFFFFF-000000067F00004005010660F50300000000__00000122A7BB7B29-0000012694E36301":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30002168000-000000067F00004005000060F3000216C000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F60000046A83-000000067F00004005000060F60100000000__000000BAC0041E18":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001368000-000000067F00004005000060FB000136C000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000184000-000000067F00004005000060F80100000000__000000174479FC18":{"file_size":93143040,"generation":2,"shard":"0008"},"000000067F00004005000060FB00012A8000-000000067F00004005000060FB0100000000__00000057593D8169-0000005C01565329":{"file_size":273711104,"generation":2,"shard":"0008"},"000000067F00004005000060F700007B0000-000000067F00004005000060F700007D05C8__00000075CC373F31-00000079F2A2F311":{"file_size":268468224,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001680B45-000000067F00004005000060FB000169968A__000000698AF6E809-0000006DDB29D589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300050CC000-000000067F00004005000060F300050E8000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-000000067F00004005000060F00300000000__0000018613F0A050":{"file_size":2310144,"generation":3,"shard":"0008"},"000000067F00004005000060F70001B1C000-000000067F00004005000060F70001B30000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000F50000-000000067F00004005000060F70000F705D6__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F000040050109CD330100000000-000000067F000040050109FFA2000000C000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800001FC000-000000067F0000400500EB4A480000200000__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000240B12A-000000067F00004005000060F300024440AE__000000A9EB8C4489-000000ACA44C8E99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000008228D-000000067F00004005000060F60100000000__000000027AF9D7D0":{"file_size":24576,"generation":1,"shard":"0008"},"000000067F00004005016EA00C000042C000-000000067F00004005016EA00C0000478000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060FB0000FF8000-000000067F00004005000060FB0001000B44__0000004C49155071-0000004F31878919":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001699
68A-000000067F00004005000060FB00016D21CF__000000698AF6E809-0000006DDB29D589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F100005F821C-000000067F00004005000060F20100000000__000000636DE92159-000000663565F8C9":{"file_size":149954560,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001D7C000-000000067F00004005016EA00C0001E03DD8__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F0000400500F678390000058000-000000067F0000400500F67839000005C000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800003A7E20-000000067F0000400500EB4A4800003BFD31__000000FCCD5238B1-000000FF8B261599":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001228000-000000067F00004005016EA00C000122C000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F000040050081DB430000F0C0E9-000000067F000040050081DB430000F4E15B__000000C462B3C2A9-000000C824C09619":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000758000-000000067F00004005000060F80100000000__0000006DDB29D589-000000722F474369":{"file_size":264781824,"generation":2,"shard":"0008"},"000000067F00004005000060F300068640AF-000000067F00004005000060F3000686D0DE__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000047C000-000000067F00004005016EA00C0000498000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30006166575-000000067F00004005000060F3000616F6B2__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001B18000-000000067F00004005000060F70001B1C000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700016EC000-000000067F00004005000060F70001708000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005CCF3C5-000000067F00004005000060F30005D184F6__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002848000-000000067F00004005000060F3000285901B__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300039C0000-000000067F00004005000060F300039C4000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002464000-000000067F00004005000060F30002480000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00011D0000-000000067F00004005016EA00C00011D4000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30003D44283-000000067F00004005000060F30003D952B0__0000011B632CC319-0000011F1A40FA69":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480100000000-000000067F0000400500EE16BC0000044000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000533205E-000000067F00004005000060F300053E30C3__0000014EC58A4A79-0000015304A396B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F6000009A255-000000067F00004005000060F60300000000__0000017CC2FD7288":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70001B00000-000000067F00004005000060F70001B04000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"
},"000000067F00004005000060F30004958000-000000067F00004005000060F3000495C000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000518000-000000067F00004005000060F80100000000__0000004C49155071-0000004F31878919":{"file_size":262373376,"generation":2,"shard":"0008"},"000000067F00004005000060F300064D8000-000000067F00004005000060F3000658113F__000001715E483C79-000001751A7D7589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FDA1F80000014000-000000067F0000400500FDA1F80000020D42__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000284000-000000067F00004005000060FB00002D4B6A__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000CDBB9C-000000067F00004005000060F80100000000__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":148865024,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001298000-000000067F00004005016EA00C000129C000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001DD8000-000000067F00004005000060FB0001DF0B43__000000923719A971-00000096262826C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001220000-000000067F00004005000060F70001224000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002908000-000000067F00004005000060F30002920FA0__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000F5C000-000000067F00004005016EA00C0000F90000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0001E03DD8-030000000000000000000000000000000002__000001BCB572A4E0":{"file_size":139264,"generation":17,"shard":"0008"},"000000067F00004005000060F30003998000-000000067F00004005000060F3000399C000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00014E75C6-030000000000000000000000000000000002__000001A931C135B1-000001AC25760149":{"file_size":51486720,"generation":11,"shard":"0008"},"000000067F00004005010660F500000F44CB-000000067F00004005010660F70100000000__000001180B3FF408":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00003FC000-000000067F00004005016EA00C0000400000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30003810000-000000067F00004005000060F30003849093__000001048B25A8E9-0000010779A7F551":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006B00000-000000067F00004005000060F30006B10FFF__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001541688-000000067F00004005000060FB000154A1CD__000000636DE92159-000000663565F8C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001098000-000000067F00004005000060FB000109C000__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700011912D4-000000067F00004005000060F80100000000__00000104BD37F348":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30002A40000-000000067F00004005000060F30002A44000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001448000-000000067F00004005000060F300014B0F7B__000000601F43CF09-000000636DE92159":{"file_size":268451840,"generation":2,"s
hard":"0008"},"000000067F00004005000060FB0001009688-000000067F00004005000060FB000102A1CE__0000004C49155071-0000004F31878919":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC00001A4000-000000067F0000400500EE16BC00001E0000__00000104BD37F348":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000B58B45-000000067F00004005000060FB0000B6168A__0000003579F03331-0000003959DA2DE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D69D7900000AC000-000000067F0000400500D69D7900000BDAF5__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000193A10B-000000067F00004005000060F30100000000__00000075CC373F31-00000079F2A2F311":{"file_size":198148096,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00005A0000-000000067F00004005016EA00C00005A4000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F700000E0000-000000067F00004005000060F80100000000__0000000D80565628":{"file_size":112009216,"generation":2,"shard":"0008"},"000000067F00004005000060F3000690F2FD-000000067F00004005000060F300069883DB__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300004C6B83-000000067F00004005000060F60100000000__000000174479FC18":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30001E18000-000000067F00004005000060F30001E50FF3__000000923719A971-00000096262826C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300043B4000-000000067F00004005000060F300043B8000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100006C0000-000000067F00004005000060F20100000000__000000722F474369-00000075CC373F31":{"file_size":267665408,"generation":2,"shard":"0008"},"000000067F00004005000060F70000A78000-000000067F00004005000060F70000A7C000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00011C1688-000000067F00004005000060FB00011CA1CD__0000005413AB3641-00000057593D8169":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00004E8000-000000067F00004005016EA00C00004EC000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000257A6F-000000067F00004005016EA00C000029F90B__000001880F984A29-0000018C496B6DB1":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001590000-000000067F00004005000060FB0001594000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000193189A-030000000000000000000000000000000002__000001B3F17FE4E0":{"file_size":139264,"generation":11,"shard":"0008"},"000000067F00004005000060F300027C4000-000000067F00004005000060F30002828000__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000B40000-000000067F00004005016EA00C0000B44000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30006694000-000000067F00004005000060F300066F0000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00015C8000-000000067F00004005000060FB00015CC000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003B84000-000000067F00004005000060F30003B90000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"00
0000067F00004005000060F30006704000-000000067F00004005000060F30006748000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000793506-030000000000000000000000000000000002__0000002427BD8BD0":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F30004F1638A-000000067F00004005000060F30100000000__000001440D3D0C69-0000014784964B91":{"file_size":93708288,"generation":2,"shard":"0008"},"000000067F00004005000060F80100000000-000000067F00004005000060FB0000014000__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F00004005000060F70000180000-000000067F00004005000060F70000184000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004A2693B-000000067F00004005000060F30004A7F98F__000001398B56A519-0000013C9C0E3339":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002C71F27-000000067F00004005000060F30002C9AFB8__000000C824C09619-000000CC13D2E549":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300038075AF-000000067F00004005000060F30100000000__000000FF8B261599-000001048B25A8E9":{"file_size":49823744,"generation":2,"shard":"0008"},"000000067F0000400500DBCED50000028000-000000067F0000400500DBCED5000002C000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004188000-000000067F00004005000060F300041D9101__0000012694E36301-0000012A3F140591":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30006868000-000000067F00004005000060F50100000000__00000178C5D5D3A8":{"file_size":116645888,"generation":2,"shard":"0008"},"000000067F00004005000060F30003A789A0-000000067F00004005000060F30003AB9907__0000010FB1BE19B9-00000113456156F1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000368000-000000067F0000400500EB4A48000036FF11__000000FCCD5238B1-000000FF8B261599":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300047EC0CA-000000067F00004005000060F300047F5138__000001334140FC21-00000137115BE4D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001AB8B97-000000067F00004005000060F70001AC115C__0000015304A396B9-0000015670D6AFD9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F70000D61283-000000067F00004005000060F70000D8985C__000000C462B3C2A9-000000C824C09619":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300011D1111-000000067F00004005000060F3000122A1D5__0000005413AB3641-00000057593D8169":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001967D34-000000067F00004005016EA00C000197FBD0__000001B3E1B95181-000001B6FFE46BC9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500FA2AD3000004D85C-000000067F0000400500FB3D300100000000__0000010D77B487A0":{"file_size":31309824,"generation":2,"shard":"0008"},"000000067F000040050081DB4300005BCFD7-000000067F000040050081DB4300005D704F__0000009A1ABDE921-0000009DF02C1241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F10000004000-000000067F00004005000060F100000260F2__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F0000400500EE16BC00000F8000-000000067F0000400500EE16BC000014158C__000000F901689359-000000FCCD5238B1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000921E8A-000000067F00004005000060F601000
00000__00000028C365FBE1-0000002D2A8E0B81":{"file_size":228564992,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001190000-000000067F00004005000060FB0001198B44__0000005413AB3641-00000057593D8169":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300067A0000-000000067F00004005000060F300067A4000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F10000200000-000000067F00004005000060F10000204000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003FF0FBB-000000067F00004005000060F3000407201D__00000122A7BB7B29-0000012694E36301":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000001C000-000000067F00004005000060F3000008228D__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F00004005016EA00C0001CD7376-030000000000000000000000000000000002__000001B6FFE46BC9-000001BA93C39481":{"file_size":70238208,"generation":11,"shard":"0008"},"000000067F00004005000060FB0000EBC000-000000067F00004005000060FB0000EC8000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000293210E-000000067F00004005000060F30002983166__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000151F271-000000067F00004005000060F30100000000__000000636DE92159-000000663565F8C9":{"file_size":41271296,"generation":2,"shard":"0008"},"000000067F00004005000060F30004880000-000000067F00004005000060F30004884000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F10000518222-000000067F00004005000060F20100000000__0000005413AB3641-00000057593D8169":{"file_size":169492480,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00003E0000-000000067F00004005016EA00C00003E4000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30000775A02-000000067F00004005000060F60100000000__0000002427BD8BD0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000197FBD0-000000067F00004005016EA00C00019C7A6A__000001B3E1B95181-000001B6FFE46BC9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F3000067114B-000000067F00004005000060F60100000000__0000001B59EEB909-0000001FFBC01501":{"file_size":232669184,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001408000-000000067F00004005000060FB000140C000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800001F8000-000000067F0000400500EB4A4800001FC000__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000290000-000000067F0000400500EB4A480000294000__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003061089-000000067F00004005000060F3000306A02D__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30001CE4000-000000067F00004005000060F30001CF0197__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000E20000-000000067F00004005000060F70000E24000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300001D0000-000000067F000040050081DB4300001D4000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005D184F6-000000067F00004
005000060F30100000000__0000016143292911-00000164DEE06671":{"file_size":200163328,"generation":2,"shard":"0008"},"000000067F00004005000060F300066F4000-000000067F00004005000060F30006700000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A38000-000000067F000040050081DB430000A4A074__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000F38000-000000067F00004005000060F30000F59017__00000047E31D98D1-0000004C49155071":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C0C000-000000067F00004005000060FB0000C18000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006D34000-000000067F00004005000060F30006D60000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005010660F501FFFFFFFF-000000067F00004005010660F50300000000__0000011F1A40FA69-00000122A7BB7B29":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F700013E85D1-000000067F00004005000060F70001410BBC__0000011F1A40FA69-00000122A7BB7B29":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000538B44-000000067F00004005000060FB0000551689__0000001737D88379-0000001B59EEB909":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001410000-000000067F00004005000060F70001414000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300032F1113-000000067F00004005000060F3000330A1C8__000000E4C63CFA21-000000E7C2F1B249":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30004974000-000000067F00004005000060F3000498DC49__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000625EB45-000000067F00004005000060F30006277C61__0000016E1FBB7B99-000001715E483C79":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700019E8E81-000000067F00004005000060F80100000000__0000014EC58A4A79-0000015304A396B9":{"file_size":246792192,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000001BCB5730259-000001BCB5732691":{"file_size":24576,"generation":187,"shard":"0008"},"000000067F000040050081DB4300001CC000-000000067F000040050081DB4300001D0000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002C00000-000000067F00004005000060F30002C18FAE__000000C824C09619-000000CC13D2E549":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000FC4000-000000067F00004005000060F70000FCD85E__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000107C39B-030000000000000000000000000000000002__0000004C49155071-0000004F31878919":{"file_size":133349376,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000F90000-000000067F00004005016EA00C0000F94000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000F98000-000000067F00004005016EA00C0000F9C000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F700019EC000-000000067F00004005000060F80100000000__0000014EDD256548":{"file_size":7421952,"generation":2,"shard":"0008"},"000000067F00004005000060F300069FA3F6-000000067F00004005000060F30006A0B44C__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shar
d":"0008"},"000000067F000040050081DB4300003AC000-000000067F000040050081DB4300003B27DA__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005A57691-000000067F00004005000060F30005B00697__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300060CB2C8-000000067F00004005000060F300060D4415__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000495C000-000000067F00004005000060F30004970000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500D69D7900000D1C5F-000000067F0000400500D69D7900000F1B5B__000000EFA7EAA9E1-000000F309FCDD19":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001358000-030000000000000000000000000000000002__000001A95031E5B8":{"file_size":21110784,"generation":11,"shard":"0008"},"000000067F00004005000060F3000430C000-000000067F00004005000060F30004370000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004904000-000000067F00004005000060F30004958000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000008000-000000067F00004005000060F30000378000__00000186146441F1-0000018624969469":{"file_size":33357824,"generation":6,"shard":"0008"},"000000067F00004005000060F700005C0000-000000067F00004005000060F700005C85CE__00000057593D8169-0000005C01565329":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000B04000-000000067F00004005016EA00C0000B40000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30002920FA0-000000067F00004005000060F3000293210E__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002058000-000000067F00004005000060F30002070F71__0000009DF02C1241-000000A173C00489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000686D0DE-000000067F00004005000060F3000689E295__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FA2AD30000004000-000000067F0000400500FA2AD30000030000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00009BF728-000000067F00004005016EA00C0000A575C7__00000196C9018F59-0000019A2EAFE7A9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30004374000-000000067F00004005000060F300043B0000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300051F0000-000000067F00004005000060F300051F4000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006B22072-000000067F00004005000060F30006B4B0BB__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000328FA4E-000000067F00004005000060F50100000000__000000E4D847F4E0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000000FEA0-000000067F00004005016EA00C000001FD3E__0000018624969469-000001880F984A29":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500EB4A48000019F4DD-030000000000000000000000000000000002__000000F6661C9241-000000F901689359":{"file_size":59498496,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00003EC000-000000067F00004005016EA00C00003F8000__000001936E7
3D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C000073C000-000000067F00004005016EA00C000074F43B__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30003542BFF-000000067F00004005000060F50100000000__000000EFDE07FFD8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70001771169-000000067F00004005000060F80100000000__000001398B56A519-0000013C9C0E3339":{"file_size":263454720,"generation":2,"shard":"0008"},"000000067F000040050081DB4300003B27DA-030000000000000000000000000000000002__0000008DDCD70B68":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F3000542AFB0-000000067F00004005000060F30005474062__0000015304A396B9-0000015670D6AFD9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000057C94F-000000067F00004005000060F80100000000__00000054161C34B8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F300055861F2-000000067F00004005000060F30100000000__0000015304A396B9-0000015670D6AFD9":{"file_size":127393792,"generation":2,"shard":"0008"},"000000067F00004005000060F30001D79136-000000067F00004005000060F30100000000__0000008DBE2855F9-000000923719A971":{"file_size":227958784,"generation":2,"shard":"0008"},"000000067F00004005000060F10000218000-000000067F00004005000060F1000021C000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001CD4000-000000067F00004005016EA00C0001CE0000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005000060F300017EC000-000000067F00004005000060F30001886B2A__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001188000-000000067F00004005000060F300011D1111__0000005413AB3641-00000057593D8169":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000ECC000-000000067F00004005000060FB0000F050F2__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300018C0000-000000067F00004005000060F300018E0FE6__00000075CC373F31-00000079F2A2F311":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00006E4000-000000067F00004005016EA00C0000738000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30002790000-000000067F00004005000060F30002794000__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C00001B850B-000000067F0000400500F56D510100000000__0000011B688FEDC8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F100001F8000-000000067F00004005000060F100001FC000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000810000-000000067F00004005000060F80100000000__00000079F2A2F311-0000007E3A9BFD29":{"file_size":263454720,"generation":2,"shard":"0008"},"000000067F00004005000060F100006CBF87-000000067F00004005000060F20100000000__000000A5A3F27398":{"file_size":15851520,"generation":2,"shard":"0008"},"000000067F0000400500F7D2DD0100000000-000000067F0000400500F8E3A50000014000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700010AABC7-000000067F00004005000060F80100000000__000000EFDE07FFD8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30003B80000-000000067F00004005000060F30003B84000__000001180B3FF408":{"file_size":134422528,"generation":2,"sh
ard":"0008"},"000000067F000040050081DB430000078000-000000067F000040050081DB4300000AA080__00000075CC373F31-00000079F2A2F311":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002618000-000000067F00004005000060F30002680F9D__000000B2B5C4E8F9-000000B768469051":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002A48000-000000067F00004005000060F30002A4C000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001994000-000000067F00004005000060F700019E8000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000B6168A-000000067F00004005000060FB0000B6A1D0__0000003579F03331-0000003959DA2DE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000147A0EC-000000067F00004005000060FB000148AC30__000000601F43CF09-000000636DE92159":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC0000060000-000000067F0000400500EE16BC0000064000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003458D42-000000067F00004005000060F30003481DDB__000000E7C2F1B249-000000EBC9213D59":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006E30000-000000067F00004005000060F30006E34000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700017F8000-000000067F00004005000060F700017FC000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004C50000-000000067F00004005000060F30004C54000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001720000-000000067F00004005000060F80100000000__00000139CF156B58":{"file_size":63463424,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A8E15E-000000067F000040050081DB430000A98000__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":265404416,"generation":2,"shard":"0008"},"000000067F00004005000060F30004BAE526-000000067F00004005000060F30004BE7584__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001ADF97B-000000067F00004005016EA00C0001B0FD2A__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F60000014000-000000067F00004005000060F60100000000__0000003D2AB09B68":{"file_size":83329024,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C1C000-000000067F00004005000060FB0000C70000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005240000-000000067F00004005000060F30005244000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB43000077C000-000000067F000040050081DB430000790000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006D60000-000000067F00004005000060F30006D64000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004C54000-000000067F00004005000060F30004C60000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000000000000000001-000000067F0000400500000A690000000002__00000186146441F1-0000018624969469":{"file_size":57344,"generation":6,"shard":"0008"},"000000067F00004005000060F30005688000-000000067F00004005000060F3000568C000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard
":"0008"},"000000067F00004005000060F30004370000-000000067F00004005000060F30004374000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300051F4000-000000067F00004005000060F30005210000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004DD8000-000000067F00004005000060F30004DDC000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500C782E400001AFD31-000000067F0000400500C782E400001B7C41__000000D01F399709-000000D31E48D7C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000BB103B-000000067F00004005000060F60000014C3A__0000003579F03331-0000003959DA2DE9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F0000400500D19D030100000000-000000067F0000400500D69D790000024000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB43000028B253-030000000000000000000000000000000002__0000008196C976A1-0000008625CF2891":{"file_size":151224320,"generation":2,"shard":"0008"},"000000067F00004005000060F30004DD8000-000000067F00004005000060F30004E40FFC__000001440D3D0C69-0000014784964B91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005010F44EB0100000000-000000067F00004005010F57CB000000C000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003BCC000-000000067F00004005000060F30003C08000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005B80000-000000067F00004005000060F30005B89170__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000135FCAD-000000067F00004005016EA00C000144FB4E__000001A931C135B1-000001AC25760149":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005010660F500000B0000-000000067F00004005010660F500000B4000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000D31030-000000067F00004005000060F30100000000__0000003D03FCCDB9-000000417D21ACF9":{"file_size":233791488,"generation":2,"shard":"0008"},"000000067F00004005000060F30002C18FAE-000000067F00004005000060F30002C71F27__000000C824C09619-000000CC13D2E549":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EB4A48000041FB53-000000067F0000400500EB4A480000447A64__000000FCCD5238B1-000000FF8B261599":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC0000048000-000000067F0000400500EE16BC000004C000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00009D0000-000000067F00004005000060FB00009D4000__000000321AA80270":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100004365FE-000000067F00004005000060F20100000000__00000047F1F2B800":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30006BAD108-000000067F00004005000060F30006C0E146__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300006B4000-000000067F00004005000060F300006E0000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000327C000-000000067F00004005000060F3000328FA4E__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003B94000-000000067F00004005000060F30003BC8000__000001180B3FF408":{"file_size":134422528,"gene
ration":2,"shard":"0008"},"000000067F00004005000060F30003CB8FCF-000000067F00004005000060F30003CCA0B9__00000117EDA82C11-0000011B632CC319":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003EA902F-000000067F00004005000060F30003F72201__0000011F1A40FA69-00000122A7BB7B29":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30004C64000-000000067F00004005000060F30004C80000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000194000-000000067F000040050081DB4300001C8000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB01FFFFFFFF-000000067F00004005000060FB0300000000__0000018613A0DEA9-00000186146441F1":{"file_size":73728,"generation":5,"shard":"0008"},"000000067F00004005000060F300038B5F5B-000000067F00004005000060F300038FF04F__0000010779A7F551-0000010A5E65DF39":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300001C8000-000000067F000040050081DB4300001CC000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500C782E40000137F10-000000067F0000400500C782E40000177E20__000000D01F399709-000000D31E48D7C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000139C000-000000067F00004005000060FB00013B8000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000447A64-000000067F0000400500EB4A480100000000__000000FCCD5238B1-000000FF8B261599":{"file_size":40550400,"generation":2,"shard":"0008"},"000000067F00004005000060F70000418000-000000067F00004005000060F700004405CF__0000003D03FCCDB9-000000417D21ACF9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F000040050081DB430000728000-000000067F000040050081DB43000072C000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300014B0F7B-000000067F00004005000060F30100000000__000000601F43CF09-000000636DE92159":{"file_size":83951616,"generation":2,"shard":"0008"},"000000067F00004005000060F30005F3303F-000000067F00004005000060F30005FA40AD__0000016834A3FC91-0000016B49A934C1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300012442A9-000000067F00004005000060F3000129D29A__00000057593D8169-0000005C01565329":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300010B14AB-000000067F000040050081DB430100000000__000000D037B2DBD0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00014CF88D-000000067F00004005016EA00C00014D7727__000001A931C135B1-000001AC25760149":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30006A0B44C-000000067F00004005000060F30006A7C566__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F1000062EE46-000000067F00004005000060F20100000000__000000698F2C3A38":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001CE0000-000000067F00004005016EA00C0001CE4000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005000060F30000250000-000000067F00004005000060F30000254000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300050E8000-000000067F00004005000060F300050EC000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000259F4A3-000000067F00004005000060F30100000000__00000
0AFD23C27B9-000000B2B5C4E8F9":{"file_size":44433408,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A640EA-000000067F000040050081DB430000A8E15E__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003050000-000000067F00004005000060F30003061089__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C0000158000-000000067F0000400500F3A25C000016A065__0000010779A7F551-0000010A5E65DF39":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300010A4000-000000067F000040050081DB4300010B14AB__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC00001E0000-000000067F0000400500EE16BC00001E4000__00000104BD37F348":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300055B8000-000000067F00004005000060F300055BC000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000CE4000-000000067F00004005016EA00C0000D30000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30003640000-000000067F00004005000060F30003644000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A48000014F7AC-000000067F0000400500EB4A4800001876BD__000000F6661C9241-000000F901689359":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001CD338E-000000067F00004005016EA00C0001CE79E0__000001BA93C39481-000001BCB572A4E1":{"file_size":268451840,"generation":17,"shard":"0008"},"000000067F00004005000060FB0001530B44-000000067F00004005000060FB0001541688__000000636DE92159-000000663565F8C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300031D516C-000000067F00004005000060F30100000000__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":137863168,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00019C7A6A-000000067F00004005016EA00C00019F7907__000001B3E1B95181-000001B6FFE46BC9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000E7F7A7-000000067F00004005016EA00C0000F3F647__0000019E2C5DCEE1-000001A1DD8B4481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F300032C0000-000000067F00004005000060F300032F1113__000000E4C63CFA21-000000E7C2F1B249":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00006E0000-000000067F00004005016EA00C00006E4000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F7000019EA78-000000067F00004005000060F80100000000__0000001737D88379-0000001B59EEB909":{"file_size":50946048,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001B4FBC9-000000067F00004005016EA00C0001BBFA66__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001660000-000000067F00004005000060FB0001680B45__000000698AF6E809-0000006DDB29D589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002BAA1DD-000000067F00004005000060F30100000000__000000C462B3C2A9-000000C824C09619":{"file_size":203554816,"generation":2,"shard":"0008"},"000000067F00004005000060F300049B26A8-000000067F00004005000060F300049CB712__000001398B56A519-0000013C9C0E3339":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000CCB5CD-000000067F00004005000060F70000CDBB9C__000000BD9A7C56D9-000000C0C9E
B88E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000EEA075-000000067F000040050081DB430000F0C0E9__000000C462B3C2A9-000000C824C09619":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300003E0000-000000067F00004005000060F300003E8FBC__000000114A805939-00000013FB921C81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006C9C000-000000067F00004005000060F30006CA0000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000C7C000-000000067F00004005000060F70000C8CD0C__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001148000-000000067F00004005000060FB000114C000__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001232ACF-000000067F00004005000060F80100000000__0000010D77B487A0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000FE8000-000000067F00004005000060F700010105DB__000000E4C63CFA21-000000E7C2F1B249":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000355928-000000067F0000400500EB4A480100000000__000000FCD84FE628":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F700003FE341-000000067F00004005000060F80100000000__0000003D2AB09B68":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3000244D189-000000067F00004005000060F30100000000__000000A9EB8C4489-000000ACA44C8E99":{"file_size":212566016,"generation":2,"shard":"0008"},"000000067F00004005000060F700003B85C7-000000067F00004005000060F80100000000__0000003579F03331-0000003959DA2DE9":{"file_size":208945152,"generation":2,"shard":"0008"},"000000067F00004005000060F100005A2B80-000000067F00004005000060F20100000000__000000603CA8F2F0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB000070C000-000000067F00004005000060FB0000718000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB01FFFFFFFF-000000067F00004005000060FB0300000000__00000186146441F1-0000018624969469":{"file_size":24576,"generation":6,"shard":"0008"},"000000067F00004005000060FB000180C000-000000067F00004005000060FB0001838000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC0000044000-000000067F0000400500EE16BC0000048000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F10100000000-000000067F00004005000060F10300000000__000000A583FBFB91-000000A9EB8C4489":{"file_size":483328,"generation":2,"shard":"0008"},"000000067F00004005000060F30004EA41A5-000000067F00004005000060F30004EC52E9__000001440D3D0C69-0000014784964B91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003AB9907-000000067F00004005000060F30003AF28CB__0000010FB1BE19B9-00000113456156F1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000974000-000000067F00004005000060FB00009D0000__000000321AA80270":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300038720A2-000000067F00004005000060F300038A3082__000001048B25A8E9-0000010779A7F551":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000452BA1-000000067F000040050081DB4300004C4C1E__000000923719A971-00000096262826C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300017AA0CE-000000067F00004005000060F30100000000
__0000006DDB29D589-000000722F474369":{"file_size":202719232,"generation":2,"shard":"0008"},"000000067F000040050081DB430000504000-000000067F000040050081DB430000560000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004B5431C-000000067F00004005000060F30004B654F6__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000C20000-000000067F00004005000060F30000C24000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300028920E4-000000067F00004005000060F30100000000__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":200351744,"generation":2,"shard":"0008"},"000000067F000040050081DB4300004C4C1E-030000000000000000000000000000000002__000000923719A971-00000096262826C9":{"file_size":192356352,"generation":2,"shard":"0008"},"000000067F000040050081DB430000190000-000000067F000040050081DB430000194000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000E88000-000000067F000040050081DB430000E8C000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000738000-000000067F00004005016EA00C000073C000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F000040050081DB430000578EE6-000000067F000040050081DB43000058AF5E__0000009A1ABDE921-0000009DF02C1241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30001C38000-000000067F00004005000060F30001C3C000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000B7C0EA-030000000000000000000000000000000002__000000B2B5C4E8F9-000000B768469051":{"file_size":133464064,"generation":2,"shard":"0008"},"000000067F00004005000060F3000625B8F0-000000067F00004005000060F30100000000__0000016B49A934C1-0000016E1FBB7B99":{"file_size":139640832,"generation":2,"shard":"0008"},"000000067F00004005000060FB000109C000-000000067F00004005000060FB0001110000__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000001BCB572DFF9-000001BCB5730259":{"file_size":24576,"generation":41,"shard":"0008"},"000000067F00004005000060FB0000AA8000-000000067F00004005000060FB0000AD0B45__0000003203FB5749-0000003579F03331":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300043F8000-000000067F00004005000060F300043FC000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800003C7C42-000000067F0000400500EB4A48000041FB53__000000FCCD5238B1-000000FF8B261599":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005BA213F-000000067F00004005000060F30005BDB15B__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300063FE10E-000000067F00004005000060F30100000000__0000016E1FBB7B99-000001715E483C79":{"file_size":111067136,"generation":2,"shard":"0008"},"000000067F00004005000060F30000F91FFF-000000067F00004005000060F30000F9B026__00000047E31D98D1-0000004C49155071":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003650000-000000067F00004005000060F30003654000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300050A412B-000000067F00004005000060F300050B5199__0000014784964B91-0000014B000D1821":{"file_size":268451840,"generation":2,"shar
d":"0008"},"000000067F00004005016EA00C0001D78000-000000067F00004005016EA00C0001D7C000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005016EA00C0001244000-000000067F00004005016EA00C0001298000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F100001FC000-000000067F00004005000060F10000200000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000CA0000-000000067F00004005016EA00C0000CA4000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F3000498DC49-000000067F00004005000060F50100000000__00000139CF156B58":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F60000036EA0-000000067F00004005000060F60100000000__0000009A24DF6768":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000928B45-000000067F00004005000060FB000097168A__00000028C365FBE1-0000002D2A8E0B81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006854000-000000067F00004005000060F30006858000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050109FFA2000000C3F5-030000000000000000000000000000000002__00000117EDA82C11-0000011B632CC319":{"file_size":226066432,"generation":2,"shard":"0008"},"000000067F00004005000060F30003A6D1B3-000000067F00004005000060F30100000000__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":117620736,"generation":2,"shard":"0008"},"000000067F00004005000060F30002D2C000-000000067F00004005000060F30002D80000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003A31FB6-000000067F00004005000060F30003A3B020__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000160723E-000000067F00004005016EA00C00016570D9__000001AC25760149-000001AFC313C819":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500FB3D310000018000-000000067F0000400500FB3D31000001C000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001708000-000000067F00004005000060F7000170C000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000283C3E7-000000067F00004005000060F50100000000__000000BAC0041E18":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB00018F0000-000000067F00004005000060FB0100000000__00000075CC373F31-00000079F2A2F311":{"file_size":268959744,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000EC8000-000000067F00004005000060FB0000ECC000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000F9C000-000000067F00004005016EA00C0000FF0000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30002680F9D-000000067F00004005000060F3000274A080__000000B2B5C4E8F9-000000B768469051":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000679C000-000000067F00004005000060F300067A0000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000428313F-000000067F00004005000060F300042CC1BD__0000012694E36301-0000012A3F140591":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00000FFFFFFFF-030000000000000000000000000000000002__00000186146441F1-0000018624969469":{"file_size":24576,"generation":6,"shard":"0008"
},"000000067F00004005000060FB00017D8000-000000067F00004005000060FB00017DC000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700017FC000-000000067F00004005000060F70001828000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002FD317C-000000067F00004005000060F30002FF427D__000000D74E29AAD1-000000DBBFA87AE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001701588-000000067F00004005000060FB00017120CE__0000006DDB29D589-000000722F474369":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500000A3000FFFFFFFF-000000067F0000400500000A690000000002__000001BA93C39481-000001BCB572A4E1":{"file_size":40960,"generation":17,"shard":"0008"},"000000067F00004005000060FB0000638B45-030000000000000000000000000000000002__0000001B59EEB909-0000001FFBC01501":{"file_size":252010496,"generation":2,"shard":"0008"},"000000067F000040050081DB430000394000-000000067F000040050081DB4300003A8000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001CF0197-000000067F00004005000060F50100000000__0000008DDCD70B68":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800000DFB51-000000067F0000400500EB4A4800000E7A62__000000F309FCDD19-000000F6661C9241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000014C000-000000067F00004005000060F70000180000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005948000-000000067F00004005000060F300059790CD__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000853115-000000067F00004005000060F60100000000__00000023FEF9F321-00000028C365FBE1":{"file_size":176136192,"generation":2,"shard":"0008"},"000000067F00004005000060F30004884000-000000067F00004005000060F30004888000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000513C000-000000067F00004005000060F30005160000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C000017C000-000000067F0000400500F3A25C00001B850B__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006382F14-000000067F00004005000060F3000638C06D__0000016E1FBB7B99-000001715E483C79":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500E3A2A10000017F02-000000067F0000400500E3A2A100000B7E04__000000E7C2F1B249-000000EBC9213D59":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001000B44-000000067F00004005000060FB0001009688__0000004C49155071-0000004F31878919":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D69D790100000000-000000067F0000400500DBCED50000024000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300010A0000-000000067F000040050081DB4300010A4000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000310000-000000067F00004005000060FB0000348B45__0000000D55A212C9-000000114A805939":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F60000060038-000000067F00004005000060F60100000000__000000F91FE84F08":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30001CE0000-000000067F00004005000060F30001CE4000__0000008DDCD70B68":{"file_size":134422528,"gen
eration":2,"shard":"0008"},"000000067F000040050081DB4300000AA080-000000067F000040050081DB4300000D40FF__00000075CC373F31-00000079F2A2F311":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000551689-030000000000000000000000000000000002__0000001737D88379-0000001B59EEB909":{"file_size":227418112,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000D90000-000000067F00004005000060FB0100000000__0000003D03FCCDB9-000000417D21ACF9":{"file_size":272769024,"generation":2,"shard":"0008"},"000000067F00004005000060F300059CC403-000000067F00004005000060F300059F53C6__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30001F2C000-000000067F00004005000060F30001F30000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000014000-000000067F00004005000060FB0000084772__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F00004005000060F30004B654F6-000000067F00004005000060F30004BAE526__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002450000-000000067F00004005000060F30002454000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003A0F066-000000067F00004005000060F50100000000__0000010D77B487A0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F60000032EBE-000000067F00004005000060F60100000000__0000008DDCD70B68":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB00001D8000-000000067F00004005000060FB00001DC000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000670000-000000067F00004005016EA00C0000674000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0001344000-000000067F00004005016EA00C0001358000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000D30000-000000067F00004005016EA00C0000D34000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C000012FE9A-000000067F00004005016EA00C00001F7D38__000001880F984A29-0000018C496B6DB1":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F70000BF0000-000000067F00004005000060F70100000000__000000B2B5C4E8F9-000000B768469051":{"file_size":273809408,"generation":2,"shard":"0008"},"000000067F00004005000060F300005A0000-000000067F00004005000060F3000067114B__0000001B59EEB909-0000001FFBC01501":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EB4A48000021C000-000000067F0000400500EB4A480000290000__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000F3C000-000000067F00004005016EA00C0000F58000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C000074F43B-030000000000000000000000000000000002__000001936E73D028":{"file_size":139264,"generation":11,"shard":"0008"},"000000067F00004005010F57CB000000C000-000000067F00004005010F99A50100000000__00000126C3C69FC0":{"file_size":22978560,"generation":2,"shard":"0008"},"000000067F00004005000060F700017E1391-000000067F00004005000060F80100000000__0000013C9C0E3339-0000013FEFA7D709":{"file_size":232677376,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001CC74D7-000000067F00004005016EA00C0001CD7376__000001B6FFE46BC9-000001BA93C39481":{"file_size":2
68451840,"generation":11,"shard":"0008"},"000000067F00004005000060F700005C85CE-000000067F00004005000060F700005E8B9D__00000057593D8169-0000005C01565329":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30003FCD352-000000067F00004005000060F30100000000__0000011F1A40FA69-00000122A7BB7B29":{"file_size":124788736,"generation":2,"shard":"0008"},"000000067F0000400500C782E400002A5E4B-000000067F0000400500C782E400002CDD5C__000000D31E48D7C9-000000D74E29AAD1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700018871D6-000000067F00004005000060F80100000000__000001444EB7FC10":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30003D252C8-000000067F00004005000060F30100000000__00000117EDA82C11-0000011B632CC319":{"file_size":205963264,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001408A62-000000067F00004005000060FB00014195A7__000000601F43CF09-000000636DE92159":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500C782E400001B7C41-000000067F0000400500C782E400001C7B51__000000D01F399709-000000D31E48D7C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000110000-000000067F00004005000060FB0100000000__000000044854EBD1-00000008B6B51879":{"file_size":272613376,"generation":2,"shard":"0008"},"000000067F00004005000060F300004E8000-000000067F00004005000060F60100000000__0000001737D88379-0000001B59EEB909":{"file_size":260579328,"generation":2,"shard":"0008"},"000000067F00004005000060F30006DF4000-000000067F00004005000060F30006E30000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000C84000-030000000000000000000000000000000002__000000BAC0041E18":{"file_size":59998208,"generation":2,"shard":"0008"},"000000067F00004005000060F30002B88FF2-000000067F00004005000060F30002BAA1DD__000000C462B3C2A9-000000C824C09619":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000434000-000000067F00004005000060FB00004A0000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004DA8000-000000067F00004005000060F30004DAC000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300004E0000-000000067F000040050081DB4300004E4000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC00001E4000-000000067F0000400500EE16BC0000201716__00000104BD37F348":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000C440EA-000000067F000040050081DB430000C5E15B__000000B768469051-000000BAB1E56C91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D69D7900000BDAF5-000000067F0000400500D69D790100000000__000000EFDE07FFD8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30002A9C000-000000067F00004005000060F30002AEED02__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004DAC000-000000067F00004005000060F30004DD8000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000B94000-000000067F00004005000060F70000B98000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002454000-000000067F00004005000060F30002460000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100001059CB-000000067F00004005000060F10000125BF2__000000114A805939-0000
0013FB921C81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000D362CA-000000067F00004005016EA00C0000DB7D33__0000019E2C5DCEE1-000001A1DD8B4481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30001C0A0A3-000000067F00004005000060F30100000000__0000008625CF2891-00000089F4693119":{"file_size":203063296,"generation":2,"shard":"0008"},"000000067F00004005000060F300066F0000-000000067F00004005000060F300066F4000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001414000-000000067F00004005000060F70001428000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300014CC16D-000000067F00004005000060F300014D5280__000000636DE92159-000000663565F8C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000172AC12-030000000000000000000000000000000002__0000006DDB29D589-000000722F474369":{"file_size":186875904,"generation":2,"shard":"0008"},"000000067F000040050081DB430000E4C000-000000067F000040050081DB430000E88000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300063A50CD-000000067F00004005000060F300063FE10E__0000016E1FBB7B99-000001715E483C79":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005419E9C-000000067F00004005000060F3000542AFB0__0000015304A396B9-0000015670D6AFD9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC000014158C-030000000000000000000000000000000002__000000F901689359-000000FCCD5238B1":{"file_size":67854336,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00015FF3A0-000000067F00004005016EA00C000160723E__000001AC25760149-000001AFC313C819":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005016EA00C00008E760F-000000067F00004005016EA00C00009274AB__000001935283F9B9-00000196C9018F59":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F70000B98000-000000067F00004005000060F70000B9C000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00004A4000-000000067F00004005000060FB00004E1FF6__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006670000-000000067F00004005000060F30006674000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000185EE9-000000067F00004005000060F7000018E4B6__0000001737D88379-0000001B59EEB909":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D19D030000067CA9-030000000000000000000000000000000002__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":29319168,"generation":2,"shard":"0008"},"000000067F0000400500FF2A51000000BFFB-030000000000000000000000000000000002__0000010D77B487A0":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F30004A048A8-000000067F00004005000060F30004A1D870__000001398B56A519-0000013C9C0E3339":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300004BC000-000000067F00004005000060F300004C6B83__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005290FC9-000000067F00004005000060F3000533205E__0000014EC58A4A79-0000015304A396B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300031130BC-000000067F00004005000060F300031C40D1__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":268451840,"generation":2,"shard":"00
08"},"000000067F0000400500D19D030000047EE2-000000067F0000400500D19D03000004FDC6__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002A44000-000000067F00004005000060F30002A48000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003DAE2DC-000000067F00004005000060F30003DD734C__0000011B632CC319-0000011F1A40FA69":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500F8E3A50000014000-000000067F0000400500F8E3A5000004A25C__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100002F03E9-000000067F00004005000060F20100000000__000000321AA80270":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70001138000-000000067F00004005000060F80100000000__000000FCCD5238B1-000000FF8B261599":{"file_size":72695808,"generation":2,"shard":"0008"},"000000067F00004005000060F300056E4000-000000067F00004005000060F50100000000__00000159B010F6C0":{"file_size":13393920,"generation":2,"shard":"0008"},"000000067F00004005000060F70000A7C000-000000067F00004005000060F70000ABD9C4__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000CC6E51-030000000000000000000000000000000002__0000003D2AB09B68":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F60000091EFF-000000067F00004005000060F60100000000__0000014EDD256548":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400500EB4A48000008FC41-000000067F0000400500EB4A4800000DFB51__000000F309FCDD19-000000F6661C9241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30001F363B4-000000067F00004005000060F30001F574A6__0000009A1ABDE921-0000009DF02C1241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001CD0000-000000067F00004005016EA00C0001CD4000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005000060F300059B324D-000000067F00004005000060F300059CC403__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002530000-000000067F00004005000060F30002534000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F6000004B633-000000067F00004005000060F60100000000__000000C483D0D6B8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F700011E0000-000000067F00004005000060F80100000000__0000010779A7F551-0000010A5E65DF39":{"file_size":262922240,"generation":2,"shard":"0008"},"000000067F00004005000060F30006690000-000000067F00004005000060F30006694000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000100E18-000000067F00004005000060F700001213F2__0000000D55A212C9-000000114A805939":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F0000400500FF2A510000004000-000000067F0000400500FF2A51000000BFFB__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000EB8000-000000067F00004005000060FB0000EBC000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000674000-000000067F00004005016EA00C00006B0000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F70000EF85D6-000000067F00004005000060F80100000000__000000D74E29AAD1-000000DBBFA87AE1":{"file_size":262897664,"generation":2,"shard":"0008"},"0000000
67F00004005000060F700005E8B9D-000000067F00004005000060F700005F9158__00000057593D8169-0000005C01565329":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30004E40FFC-000000067F00004005000060F30004E7A062__000001440D3D0C69-0000014784964B91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000037E20-000000067F0000400500EB4A480000057D31__000000F309FCDD19-000000F6661C9241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400501101C0901FFFFFFFF-030000000000000000000000000000000002__0000012E71CF31F9-000001334140FC21":{"file_size":65060864,"generation":2,"shard":"0008"},"000000067F00004005000060F70000B10000-000000067F00004005000060F70100000000__000000A583FBFB91-000000A9EB8C4489":{"file_size":272646144,"generation":2,"shard":"0008"},"000000067F00004005000060F300056E104B-000000067F00004005000060F3000570A19E__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300059790CD-000000067F00004005000060F300059AA115__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000B54000-000000067F00004005000060F70000B90000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300041D9101-000000067F00004005000060F3000424A099__0000012694E36301-0000012A3F140591":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700000E085E-000000067F00004005000060F70000100E18__0000000D55A212C9-000000114A805939":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300051B0000-000000067F00004005000060F300051B4000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000001BCB572A4E1-000001BCB572C329":{"file_size":24576,"generation":17,"shard":"0008"},"000000067F00004005000060F30006D30000-000000067F00004005000060F30006D34000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500FDA1F80000020D42-000000067F0000400500FDA1F80100000000__0000010D77B487A0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F000040050081D80C0100000000-000000067F000040050081DB430000024000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F600000235B4-000000067F00004005000060F60100000000__000000603CA8F2F0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400500C782E400000A0000-000000067F0000400500C782E400000A4000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002264247-000000067F00004005000060F50100000000__000000A5A3F27398":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3000302C2D6-000000067F00004005000060F50100000000__000000DBD29DC248":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000129C000-000000067F00004005016EA00C0001340000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F700016E8000-000000067F00004005000060F700016EC000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300023A0000-000000067F00004005000060F300023B0FF7__000000A9EB8C4489-000000ACA44C8E99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F20100000000-000000067F00004005000060F3000000C000__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard"
:"0008"},"000000067F00004005016EA00C0000374000-000000067F00004005016EA00C00003E0000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F70000368000-000000067F00004005000060F80100000000__0000003203FB5749-0000003579F03331":{"file_size":263249920,"generation":2,"shard":"0008"},"000000067F000040050081DB4300006310C9-030000000000000000000000000000000002__0000009A1ABDE921-0000009DF02C1241":{"file_size":208953344,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000DC8000-000000067F00004005000060FB0000DE8B45__000000417D21ACF9-00000044B4679349":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000530000-000000067F00004005000060FB0000538B44__0000001737D88379-0000001B59EEB909":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000024000-000000067F000040050081DB430000028000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000488C000-000000067F00004005000060F30004898000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300044D3639-000000067F00004005000060F50100000000__0000012E77D3BF00":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005010450640000000570-000000067F0000400501046F39000000BDD2__0000010FB1BE19B9-00000113456156F1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300021050B0-000000067F00004005000060F3000212E160__0000009DF02C1241-000000A173C00489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700010DD440-000000067F00004005000060F80100000000__000000F309FCDD19-000000F6661C9241":{"file_size":91758592,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000AD0B45-000000067F00004005000060FB0000AE168A__0000003203FB5749-0000003579F03331":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000013B18E-000000067F00004005000060F7000014B73D__000000114A805939-00000013FB921C81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001938000-000000067F00004005016EA00C000193FE9D__000001B3E1B95181-000001B6FFE46BC9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500C782E400000A4000-000000067F0000400500C782E4000012A71E__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001A40000-000000067F00004005000060F30001A44000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00008578D4-000000067F00004005016EA00C00008CF772__000001935283F9B9-00000196C9018F59":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30001CC0000-000000067F00004005000060F30001CC4000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004D20000-000000067F00004005000060F30004D24000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00003E8000-000000067F00004005016EA00C00003EC000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F300039C4000-000000067F00004005000060F300039F8000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005164000-000000067F00004005000060F300051B0000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300039F8000-000000067F00004005000060F300039FC000__0000010D77B487A0":{"file_siz
e":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300010F46BD-000000067F000040050081DB430100000000__000000D31E48D7C9-000000D74E29AAD1":{"file_size":113999872,"generation":2,"shard":"0008"},"000000067F00004005000060F30002E630CF-000000067F00004005000060F30100000000__000000D31E48D7C9-000000D74E29AAD1":{"file_size":171999232,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000ACF305-000000067F00004005016EA00C0000ADF1AB__00000196C9018F59-0000019A2EAFE7A9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30006748000-000000067F00004005000060F3000674C000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003810000-000000067F00004005000060F50100000000__00000104BD37F348":{"file_size":11739136,"generation":2,"shard":"0008"},"000000067F00004005000060F1000021C000-000000067F00004005000060F20100000000__0000002427BD8BD0":{"file_size":132448256,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00017EC000-000000067F00004005016EA00C00018C0000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F7000025DA3C-000000067F00004005000060F80100000000__0000002427BD8BD0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB00007F0000-000000067F00004005000060FB0000860B45__00000023FEF9F321-00000028C365FBE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003FF0000-000000067F00004005000060F30003FF4000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000E0AD15-000000067F00004005000060FB0000E1B859__000000417D21ACF9-00000044B4679349":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005010ADFA80000004000-000000067F00004005010F2BD40100000000__00000126C3C69FC0":{"file_size":13369344,"generation":2,"shard":"0008"},"000000067F00004005000060F30004898000-000000067F00004005000060F3000489C000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003D2B1B0-000000067F00004005000060F30003D44283__0000011B632CC319-0000011F1A40FA69":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000FF4000-000000067F00004005016EA00C0001188000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005010F99A50100000000-000000067F00004005010F9F120000004000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001F34000-000000067F00004005000060F30001F38F48__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700018A0000-000000067F00004005000060F700018D85CA__000001440D3D0C69-0000014784964B91":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300029A526C-000000067F00004005000060F300029C623C__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB00017DC000-000000067F00004005000060FB0001808000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500DBCED50000024000-000000067F0000400500DBCED50000028000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC0000201716-000000067F0000400500EE16C40100000000__0000012A77C1B0B0":{"file_size":32768,"generation":2,"shard":"0008"},"000000067F00004005000060F30006D10000-000000067F00004005000060F30006D14000__000001848D082B20":{"file_size":134422528,"generati
on":2,"shard":"0008"},"000000067F000040050081DB430001064000-000000067F000040050081DB4300010A0000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C01FFFFFFFF-000000067F0000400500F3A25C0300000000__0000011B632CC319-0000011F1A40FA69":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30001340000-000000067F00004005000060F30001344000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003E98000-000000067F00004005000060F30003EA902F__0000011F1A40FA69-00000122A7BB7B29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006C0E146-000000067F00004005000060F30006C8729E__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F600000166C4-000000067F00004005000060F60100000000__0000003D03FCCDB9-000000417D21ACF9":{"file_size":54165504,"generation":2,"shard":"0008"},"000000067F00004005000060F10000180000-000000067F00004005000060F1000018821A__0000001737D88379-0000001B59EEB909":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000193FE9D-000000067F00004005016EA00C0001967D34__000001B3E1B95181-000001B6FFE46BC9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F000040050081DB43000076C000-000000067F000040050081DB430000778000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300050321C0-000000067F00004005000060F30005063187__0000014784964B91-0000014B000D1821":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500DBCED500000D4000-000000067F0000400500DBCED500000F0000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300004B8000-000000067F00004005000060F300004BC000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000022C000-000000067F00004005000060FB0000280000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000DF968A-000000067F00004005000060FB0000E021D0__000000417D21ACF9-00000044B4679349":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000228000-000000067F00004005000060FB000022C000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00015D8000-000000067F00004005000060FB00015DC000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005B89170-000000067F00004005000060F30005BA213F__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300043B0000-000000067F00004005000060F300043B4000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300004F8000-000000067F000040050081DB4300004FC000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006860000-000000067F00004005000060F30006864000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000ADA0D0-000000067F00004005000060F30000B0300C__0000003203FB5749-0000003579F03331":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FF2A510000000000-000000067F000040050100D04D000004369C__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C00000BB439-030000000000000000000000000000000002__00000104BD37F348":
{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001C078FA-000000067F00004005016EA00C0001C0F79A__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F000040050081DB430000B4A075-000000067F000040050081DB430000B7C0EA__000000B2B5C4E8F9-000000B768469051":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000117C10C-000000067F00004005000060F50100000000__00000054161C34B8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000E47BD2-000000067F00004005016EA00C0000E67A6E__0000019E2C5DCEE1-000001A1DD8B4481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30005D23BB5-000000067F00004005000060F50100000000__00000164EA9EC9A8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3000336D193-000000067F00004005000060F3000337DCF3__000000E4C63CFA21-000000E7C2F1B249":{"file_size":259473408,"generation":2,"shard":"0008"},"000000067F00004005000060F300001F0000-000000067F00004005000060F300001F4000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000084772-030000000000000000000000000000000002__000000027AF9D7D0":{"file_size":147456,"generation":1,"shard":"0008"},"000000067F00004005016EA00C0001CE79E0-000000067F00004005016EA00C0001D1F87B__000001BA93C39481-000001BCB572A4E1":{"file_size":268451840,"generation":17,"shard":"0008"},"000000067F0000400500EB4A4800FFFFFFFF-000000067F0000400500EB4A480100000000__000000FF8B261599-000001048B25A8E9":{"file_size":1318912,"generation":2,"shard":"0008"},"000000067F00004005000060F70000488000-000000067F00004005000060F7000048C000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000ADF1AB-000000067F00004005016EA00C0100000000__00000196C9018F59-0000019A2EAFE7A9":{"file_size":282132480,"generation":11,"shard":"0008"},"000000067F00004005000060FB000071C000-000000067F00004005000060FB0000793506__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006850000-000000067F00004005000060F30006854000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000390000-000000067F000040050081DB430000394000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000020C000-000000067F00004005000060F30000250000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001398000-000000067F00004005000060FB000139C000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003648000-000000067F00004005000060F3000364C000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500C782E400001C7B51-000000067F0000400500C782E4000023FA62__000000D01F399709-000000D31E48D7C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001788000-000000067F00004005016EA00C000178C000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F000040050081DB430000C3A075-000000067F000040050081DB430000C440EA__000000B768469051-000000BAB1E56C91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300036FE561-000000067F00004005000060F300038075AF__000000FF8B261599-000001048B25A8E9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D19D03000004FDC6-000000067F0000400500D19D030000067CA9__000000DBBF
A87AE1-000000DE2A8E4FC9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C00000-000000067F00004005000060FB0000C04000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000282C000-000000067F00004005000060F3000283C3E7__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00006B0000-000000067F00004005016EA00C00006B4000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30001789027-000000067F00004005000060F300017AA0CE__0000006DDB29D589-000000722F474369":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30004558000-000000067F00004005000060F300045C1062__0000012E71CF31F9-000001334140FC21":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C08000-000000067F00004005000060FB0000C0C000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006DCC000-000000067F00004005000060F30006DF0000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004B221FE-000000067F00004005000060F30004B2B250__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00018C4000-000000067F00004005016EA00C00018E0000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F000040050081DB430000564000-000000067F000040050081DB430000578000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000274A080-000000067F00004005000060F30100000000__000000B2B5C4E8F9-000000B768469051":{"file_size":199057408,"generation":2,"shard":"0008"},"000000067F00004005000060F300046D0EA8-000000067F00004005000060F3000471200E__000001334140FC21-00000137115BE4D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001114000-000000067F00004005000060FB0001120000__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003FEC000-000000067F00004005000060F30003FF0000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F10000368000-000000067F00004005000060F10100000000__0000003959DA2DE9-0000003D03FCCDB9":{"file_size":269967360,"generation":2,"shard":"0008"},"000000067F0000400500C782E4000012A71E-030000000000000000000000000000000002__000000D037B2DBD0":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F30006C98000-000000067F00004005000060F30006C9C000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300055BC000-000000067F00004005000060F30005610000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000F050F2-030000000000000000000000000000000002__00000047F1F2B800":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F30002484000-000000067F00004005000060F300024D8000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003FE8000-000000067F00004005000060F30003FEC000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500DBCED500000A8000-000000067F0000400500DBCED500000AC000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700006C3D76-000000067F00004005000060F80100000000__000000663565F8C9-000000698AF6E809":{"file_size
":139821056,"generation":2,"shard":"0008"},"000000067F00004005000060F30002534000-000000067F00004005000060F3000253B7A3__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000412D27C-000000067F00004005000060F30004156457__00000122A7BB7B29-0000012694E36301":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000910000-000000067F00004005000060F700009385D4__0000008DBE2855F9-000000923719A971":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30002510000-000000067F00004005000060F30002514000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002210000-000000067F00004005000060F30002214000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003FF4000-000000067F00004005000060F30004070000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001BBFA66-000000067F00004005016EA00C0001C078FA__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F3000424A099-000000067F00004005000060F3000428313F__0000012694E36301-0000012A3F140591":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300036F91FE-000000067F00004005000060F30100000000__000000FCCD5238B1-000000FF8B261599":{"file_size":164118528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000718000-000000067F00004005000060FB000071C000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005010F44EB000000C000-000000067F00004005010F44EB0100000000__00000126C3C69FC0":{"file_size":70696960,"generation":2,"shard":"0008"},"000000067F00004005000060F30005214000-000000067F00004005000060F30005240000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000A7AF6E-030000000000000000000000000000000002__000000321AA80270":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F30005063187-000000067F00004005000060F300050A412B__0000014784964B91-0000014B000D1821":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F100005E8000-000000067F00004005000060F100005F821C__000000636DE92159-000000663565F8C9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300020830BE-000000067F00004005000060F300020FC052__0000009DF02C1241-000000A173C00489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300065BB235-000000067F00004005000060F300065F42B4__000001715E483C79-000001751A7D7589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FA2AD30000034000-000000067F0000400500FA2AD3000004D85C__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00017A8000-000000067F00004005016EA00C00017AC000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060FB00008D8000-000000067F00004005000060FB0000928B45__00000028C365FBE1-0000002D2A8E0B81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000798000-000000067F00004005000060F300007C1007__00000023FEF9F321-00000028C365FBE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D19D030000040000-000000067F0000400500D19D030000047EE2__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30001AB1583-000000067F000040
05000060F50100000000__00000081AA3C40F0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30001AD8000-000000067F00004005000060F30001B09104__0000008196C976A1-0000008625CF2891":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000E1B859-030000000000000000000000000000000002__000000417D21ACF9-00000044B4679349":{"file_size":156844032,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001E9C000-000000067F00004005000060FB0001EA8000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001374000-000000067F00004005000060FB0001398000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000155C000-000000067F00004005000060FB0001590000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C00000EA069-000000067F0000400500F3A25C000010C0D1__000001048B25A8E9-0000010779A7F551":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000568C000-000000067F00004005000060F30005698000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C74000-000000067F00004005000060FB0000C98000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700004F0000-000000067F00004005000060F80100000000__00000047E31D98D1-0000004C49155071":{"file_size":264921088,"generation":2,"shard":"0008"},"000000067F00004005000060F30005598000-000000067F00004005000060F3000559C000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001429534-000000067F00004005000060F80100000000__00000122A7BB7B29-0000012694E36301":{"file_size":231964672,"generation":2,"shard":"0008"},"000000067F00004005000060F70000780000-000000067F00004005000060F80100000000__000000722F474369-00000075CC373F31":{"file_size":263340032,"generation":2,"shard":"0008"},"000000067F00004005000060F300019F31AA-000000067F00004005000060F30100000000__00000079F2A2F311-0000007E3A9BFD29":{"file_size":168484864,"generation":2,"shard":"0008"},"000000067F000040050081DB430000822079-000000067F000040050081DB43000082C0F1__000000A583FBFB91-000000A9EB8C4489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300007AC000-000000067F000040050081DB4300007F913A__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005847319-000000067F00004005000060F300058C8000__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":261505024,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001E21687-000000067F00004005000060FB0100000000__000000923719A971-00000096262826C9":{"file_size":224403456,"generation":2,"shard":"0008"},"000000067F00004005000060F30003C98000-000000067F00004005000060F30003CB8FCF__00000117EDA82C11-0000011B632CC319":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB43000045029C-030000000000000000000000000000000002__0000008DBE2855F9-000000923719A971":{"file_size":89505792,"generation":2,"shard":"0008"},"000000067F00004005000060F3000559C000-000000067F00004005000060F300055B8000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000285901B-000000067F00004005000060F300028920E4__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E64000-000000067F00004005000060F30000E70000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shar
d":"0008"},"000000067F00004005000060F300015FB022-000000067F00004005000060F3000160410C__000000698AF6E809-0000006DDB29D589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006FDA081-000000067F00004005000060F30100000000__00000184624E5741-000001860C80A151":{"file_size":202276864,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000107973-000000067F0000400500EE16BC0100000000__000000F309FCDD19-000000F6661C9241":{"file_size":275456000,"generation":2,"shard":"0008"},"000000067F00004005000060F300031C40D1-000000067F00004005000060F300031D516C__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00001F7D38-000000067F00004005016EA00C000020FBCF__000001880F984A29-0000018C496B6DB1":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500FDA1F80100000000-000000067F0000400500FF2A510000004000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001182EC9-000000067F00004005000060F80100000000__000000FF8B261599-000001048B25A8E9":{"file_size":174284800,"generation":2,"shard":"0008"},"000000067F00004005000060F700011528FB-000000067F00004005000060F70001182EC9__000000FF8B261599-000001048B25A8E9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300024DC000-000000067F00004005000060F30002510000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00000B0000-030000000000000000000000000000000002__000000021DC73119-000000044854EBD1":{"file_size":259375104,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001DF0B43-000000067F00004005000060FB0001E21687__000000923719A971-00000096262826C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F10000088000-000000067F00004005000060F10000090000__00000008B6B51879-0000000D55A212C9":{"file_size":264142848,"generation":2,"shard":"0008"},"000000067F00004005000060F30003968000-000000067F00004005000060F3000396C000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00017AC000-000000067F00004005016EA00C00017E8000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F1000019C73D-000000067F00004005000060F20100000000__0000001B59EEB909-0000001FFBC01501":{"file_size":124698624,"generation":2,"shard":"0008"},"000000067F00004005000060F700001F8000-000000067F00004005000060F700002005D2__0000001B59EEB909-0000001FFBC01501":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001110000-000000067F00004005000060FB0001114000__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F1000019842A-000000067F00004005000060F20100000000__0000001737D88379-0000001B59EEB909":{"file_size":145137664,"generation":2,"shard":"0008"},"000000067F00004005000060F700003BC000-000000067F00004005000060F700003C0000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000280000-000000067F00004005000060FB0000284000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500DBCED5000007C000-000000067F0000400500DBCED500000A8000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000001BCB5732691-000001BCB5734CD9":{"file_size":24576,"generation":239,"shard":"0008"},"000000067F00004005010660F70100000000-000000067F00
0040050107B547000006C000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000C24000-000000067F00004005000060F30000CA0000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000569C000-000000067F00004005000060F300056D8000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00000C7A73-030000000000000000000000000000000002__0000018624969469-000001880F984A29":{"file_size":40566784,"generation":11,"shard":"0008"},"000000067F00004005000060F30001344000-000000067F00004005000060F30001358000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001F38F48-000000067F00004005000060F50100000000__0000009A24DF6768":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30001760000-000000067F00004005000060F30001789027__0000006DDB29D589-000000722F474369":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F1000018821A-000000067F00004005000060F1000019842A__0000001737D88379-0000001B59EEB909":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300059AA115-000000067F00004005000060F300059B324D__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001400000-000000067F00004005000060FB0001404000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800000E7A62-000000067F0000400500EB4A480000107973__000000F309FCDD19-000000F6661C9241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000498000-000000067F00004005000060F3000049C000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000D24000-000000067F00004005000060F70000D38000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB43000120E409-000000067F000040050081DB430300000000__0000018613F0A050":{"file_size":24576,"generation":3,"shard":"0008"},"000000067F00004005000060FB0001A8A1CD-000000067F00004005000060FB0100000000__0000007E3A9BFD29-0000008196C976A1":{"file_size":199622656,"generation":2,"shard":"0008"},"000000067F00004005000060F30006270000-000000067F00004005000060F50100000000__0000016E41E03CA0":{"file_size":71114752,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000BAAD15-030000000000000000000000000000000002__0000003579F03331-0000003959DA2DE9":{"file_size":182321152,"generation":2,"shard":"0008"},"000000067F00004005000060F700016205B5-000000067F00004005000060F80100000000__0000012E71CF31F9-000001334140FC21":{"file_size":266862592,"generation":2,"shard":"0008"},"000000067F00004005000060F300030C0FE5-000000067F00004005000060F30003102107__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00004BC000-000000067F00004005016EA00C00004E8000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F10000440000-000000067F00004005000060F1000046821B__00000047E31D98D1-0000004C49155071":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F000040050081DB4300009C8000-000000067F000040050081DB4300009CC000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F7000106C000-000000067F00004005000060F700010AABC7__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000367733F-0
00000067F00004005000060F50100000000__000000F91FE84F08":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000478000-000000067F00004005016EA00C000047C000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30002E4104A-000000067F00004005000060F30002E4A157__000000D31E48D7C9-000000D74E29AAD1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001370000-000000067F00004005000060FB0001374000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004B1111A-000000067F00004005000060F30004B221FE__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000029C000-000000067F00004005016EA00C00002D0000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005000060F30001C3C000-000000067F00004005000060F30001CC0000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000136C000-000000067F00004005000060FB0001370000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F10000488000-000000067F00004005000060F10100000000__0000004C49155071-0000004F31878919":{"file_size":268754944,"generation":2,"shard":"0008"},"000000067F00004005000060F30000B0300C-000000067F00004005000060F60100000000__0000003203FB5749-0000003579F03331":{"file_size":212885504,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001C0F79A-000000067F00004005016EA00C0001C3F636__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F3000399C000-000000067F00004005000060F300039A0000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001574000-000000067F00004005000060F700015A195C__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005B00697-000000067F00004005000060F30100000000__0000015DD1D3C809-0000016143292911":{"file_size":282025984,"generation":2,"shard":"0008"},"000000067F00004005000060F300050C8000-000000067F00004005000060F300050CC000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700000885C5-000000067F00004005000060F80100000000__000000044854EBD1-00000008B6B51879":{"file_size":253878272,"generation":2,"shard":"0008"},"000000067F00004005000060F30001407F7A-000000067F00004005000060F50100000000__000000603CA8F2F0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000B90000-000000067F00004005000060F70000B94000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000560000-000000067F000040050081DB430000564000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001720000-000000067F00004005000060F700017405D4__000001398B56A519-0000013C9C0E3339":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300043CC000-000000067F00004005000060F300043F8000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000129D29A-000000067F00004005000060F30100000000__00000057593D8169-0000005C01565329":{"file_size":110788608,"generation":2,"shard":"0008"},"000000067F00004005000060F300003F9F83-000000067F00004005000060F30000402F4A__000000114A805939-00000013FB921C81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060
F70001940000-000000067F00004005000060F700019685CE__0000014784964B91-0000014B000D1821":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300043B8000-000000067F00004005000060F300043BC000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000370FD1-000000067F00004005000060F60100000000__0000000D55A212C9-000000114A805939":{"file_size":232144896,"generation":2,"shard":"0008"},"000000067F00004005000060F30003849093-000000067F00004005000060F300038720A2__000001048B25A8E9-0000010779A7F551":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F100003C0432-000000067F00004005000060F20100000000__0000003D03FCCDB9-000000417D21ACF9":{"file_size":262701056,"generation":2,"shard":"0008"},"000000067F00004005000060F700014F85DF-000000067F00004005000060F70001510BBE__0000012694E36301-0000012A3F140591":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F3000253B7A3-000000067F00004005000060F50100000000__000000AFE87558B0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001404000-000000067F00004005000060FB0001408000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003F942CF-000000067F00004005000060F30003FCD352__0000011F1A40FA69-00000122A7BB7B29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000B38000-000000067F00004005000060FB0000B58B45__0000003579F03331-0000003959DA2DE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000B505C8-000000067F00004005000060F80100000000__000000A9EB8C4489-000000ACA44C8E99":{"file_size":226459648,"generation":2,"shard":"0008"},"000000067F00004005000060F3000612D506-000000067F00004005000060F30006166575__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700000DC000-000000067F00004005000060F700000E0000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500FB3D31000000C000-000000067F0000400500FB3D310000018000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000001BCB572C329-000001BCB572C481":{"file_size":24576,"generation":19,"shard":"0008"},"000000067F00004005000060F30002828000-000000067F00004005000060F3000282C000__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300015B0000-000000067F00004005000060F300015B4000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500DBCED50000078000-000000067F0000400500DBCED5000007C000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB43000086E169-030000000000000000000000000000000002__000000A583FBFB91-000000A9EB8C4489":{"file_size":77471744,"generation":2,"shard":"0008"},"000000067F0000400501046F39000000BDD2-000000067F00004005010660F500000161F7__0000010FB1BE19B9-00000113456156F1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FB3D3101FFFFFFFF-000000067F0000400500FB3D310300000000__00000122A7BB7B29-0000012694E36301":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC00000F28ED-030000000000000000000000000000000002__000000F91FE84F08":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F30004E9307A-000000067F00004005000060F30004EA41A5__000001440D3D0C69-0000014784964B91":{"file_
size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB00016D21CF-030000000000000000000000000000000002__000000698AF6E809-0000006DDB29D589":{"file_size":226353152,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800001876BD-000000067F0000400500EB4A48000018F5CD__000000F6661C9241-000000F901689359":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500C782E400002E5B84-030000000000000000000000000000000002__000000DBD29DC248":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F70000D8985C-000000067F00004005000060F70000DA1E38__000000C462B3C2A9-000000C824C09619":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000C28000-000000067F000040050081DB430000C3A075__000000B768469051-000000BAB1E56C91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000407201D-000000067F00004005000060F300040E319D__00000122A7BB7B29-0000012694E36301":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F6000002B3CE-000000067F00004005000060F60100000000__00000075E5D2A930":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000D60000-000000067F00004005000060F80100000000__000000C483D0D6B8":{"file_size":133947392,"generation":2,"shard":"0008"},"000000067F00004005000060F70000F705D6-000000067F00004005000060F80100000000__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":259842048,"generation":2,"shard":"0008"},"000000067F00004005000060F30004E7A062-000000067F00004005000060F30004E9307A__000001440D3D0C69-0000014784964B91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006810000-000000067F00004005000060F30006814000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700007D05C8-000000067F00004005000060F80100000000__00000075CC373F31-00000079F2A2F311":{"file_size":251740160,"generation":2,"shard":"0008"},"000000067F00004005000000000000000001-000000067F0000400500000A690000000002__0000018624969469-000001880F984A29":{"file_size":40960,"generation":11,"shard":"0008"},"000000067F00004005000060FB00014D8000-000000067F00004005000060FB0001530B44__000000636DE92159-000000663565F8C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001EA8000-000000067F00004005000060FB0001EAC000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000230A0C7-000000067F00004005000060F30100000000__000000A583FBFB91-000000A9EB8C4489":{"file_size":213680128,"generation":2,"shard":"0008"},"000000067F00004005000060F30000A98000-000000067F00004005000060F30000AC9024__0000003203FB5749-0000003579F03331":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003F72201-000000067F00004005000060F30003F7B254__0000011F1A40FA69-00000122A7BB7B29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000498000-000000067F00004005016EA00C000049C000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30004CB8000-000000067F00004005000060F30004CBC000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300042CC1BD-000000067F00004005000060F300042D51D6__0000012694E36301-0000012A3F140591":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FB3D310000028681-000000067F0000400500FB3D320100000000__0000010D77B487A0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3
000474302B-000000067F00004005000060F300047EC0CA__000001334140FC21-00000137115BE4D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003204000-000000067F00004005000060F30003278000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300024020ED-000000067F00004005000060F3000240B12A__000000A9EB8C4489-000000ACA44C8E99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000216C000-000000067F00004005000060F30002170000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F6000005DD43-000000067F00004005000060F60100000000__000000EFDE07FFD8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000348B45-000000067F00004005000060FB000037968A__0000000D55A212C9-000000114A805939":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000778000-000000067F000040050081DB43000077C000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300011B4000-000000067F000040050081DB43000120E409__000000DBD29DC248":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003CCA0B9-000000067F00004005000060F30003D0B155__00000117EDA82C11-0000011B632CC319":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB00009D4000-000000067F00004005000060FB0000A7AF6E__000000321AA80270":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700008F0000-000000067F00004005000060F80100000000__00000089F4693119-0000008DBE2855F9":{"file_size":262905856,"generation":2,"shard":"0008"},"000000067F00004005000060F30006CA0000-000000067F00004005000060F30006CA4000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000E021D0-000000067F00004005000060FB0000E0AD15__000000417D21ACF9-00000044B4679349":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003654000-000000067F00004005000060F3000367733F__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000DC0000-000000067F00004005000060F70000DE05C8__000000C824C09619-000000CC13D2E549":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F700018D85CA-000000067F00004005000060F80100000000__000001440D3D0C69-0000014784964B91":{"file_size":260775936,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000EAC000-000000067F00004005000060FB0000EB8000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E70000-000000067F00004005000060F30000E74000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005FE621A-000000067F00004005000060F30005FFF23F__0000016834A3FC91-0000016B49A934C1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000D20000-000000067F00004005000060F70000D24000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005244000-000000067F00004005000060F3000525C065__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400501025D9001FFFFFFFF-000000067F0000400501025D900300000000__0000011B632CC319-0000011F1A40FA69":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30001CD4000-000000067F00004005000060F30001CE0000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004
005016EA00C0000E77906-000000067F00004005016EA00C0000E7F7A7__0000019E2C5DCEE1-000001A1DD8B4481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F300046B41AA-000000067F00004005000060F30100000000__0000012E71CF31F9-000001334140FC21":{"file_size":199688192,"generation":2,"shard":"0008"},"000000067F000040050100D04D00000634BB-030000000000000000000000000000000002__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":173744128,"generation":2,"shard":"0008"},"000000067F00004005000060F30000CA4000-000000067F00004005000060F30000CB16B6__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004DDC000-000000067F00004005000060F30004DF086C__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005D7F2DE-000000067F00004005000060F30005DA03A8__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300048A0000-000000067F00004005000060F300048A4000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100003954D3-000000067F00004005000060F20100000000__0000003D2AB09B68":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F300043BC000-000000067F00004005000060F300043C8000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001D1C000-000000067F00004005016EA00C0001D78000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005000060F100000D8000-000000067F00004005000060F100000E021B__0000000D55A212C9-000000114A805939":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300060A0282-000000067F00004005000060F300060A93B5__0000016834A3FC91-0000016B49A934C1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F1000021D8F8-000000067F00004005000060F20100000000__00000023FEF9F321-00000028C365FBE1":{"file_size":88227840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000018000-000000067F00004005000060F3000001C000__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F000040050081DB430000E48000-000000067F000040050081DB430000E4C000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300003E8FBC-000000067F00004005000060F300003F9F83__000000114A805939-00000013FB921C81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30004868000-000000067F00004005000060F3000486C000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700013D0000-000000067F00004005000060F700013E85D1__0000011F1A40FA69-00000122A7BB7B29":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001203856-030000000000000000000000000000000002__0000005413AB3641-00000057593D8169":{"file_size":157130752,"generation":2,"shard":"0008"},"000000067F00004005000060F3000029C000-000000067F00004005000060F300002C4887__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005160000-000000067F00004005000060F30005164000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500FB3D31000001C000-000000067F0000400500FB3D310000028681__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000029F90B-000000067F00004005016EA00C00002D77AE__000001880F984A29-0000018C496B6DB1":{"file_size":268451840,"generation":1
1,"shard":"0008"},"000000067F00004005000060F30003620000-000000067F00004005000060F30100000000__000000F309FCDD19-000000F6661C9241":{"file_size":249372672,"generation":2,"shard":"0008"},"000000067F00004005000060F30003B90000-000000067F00004005000060F30003B94000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300001F4000-000000067F00004005000060F30000208000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001BB8000-000000067F00004005000060F30001C00FE1__0000008625CF2891-00000089F4693119":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005210000-000000067F00004005000060F30005214000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002070F71-000000067F00004005000060F30002079FDE__0000009DF02C1241-000000A173C00489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000B40000-000000067F00004005000060F30000BB103B__0000003579F03331-0000003959DA2DE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F10000290000-000000067F00004005000060F10000298000__00000028C365FBE1-0000002D2A8E0B81":{"file_size":264134656,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00007C7B9C-000000067F00004005016EA00C0000807A34__000001935283F9B9-00000196C9018F59":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001548000-000000067F00004005000060FB000154C000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100005FC000-000000067F00004005000060F1000062EE46__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC00001A0000-000000067F0000400500EE16BC00001A4000__00000104BD37F348":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000F94000-000000067F00004005016EA00C0000F98000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F70000290000-000000067F00004005000060F80100000000__00000023FEF9F321-00000028C365FBE1":{"file_size":265764864,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001BC0B44-000000067F00004005000060FB0001BD1689__0000008625CF2891-00000089F4693119":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000337DCF2-000000067F00004005000060F30003386D10__000000E7C2F1B249-000000EBC9213D59":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300045C1062-000000067F00004005000060F3000460202F__0000012E71CF31F9-000001334140FC21":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006814000-000000067F00004005000060F30006850000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000073DFA8-000000067F00004005016EA00C000079FCFA__000001935283F9B9-00000196C9018F59":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005016EA00C000178C000-000000067F00004005016EA00C00017A8000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F1000051D1AE-000000067F00004005000060F20100000000__00000057593D8169-0000005C01565329":{"file_size":103145472,"generation":2,"shard":"0008"},"000000067F00004005000060F300034BD86C-000000067F00004005000060F30100000000__000000EBC9213D59-000000EFA7EAA9E1":{"file_size":95617024,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000008000-000000067F000040050
16EA00C000000FEA0__0000018624969469-000001880F984A29":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F1000014C000-000000067F00004005000060F1000015F545__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500FB3D300000000EAB-000000067F0000400500FB3D300100000000__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":12976128,"generation":2,"shard":"0008"},"000000067F000040050081DB430000028000-000000067F000040050081DB43000002C000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001BD1689-000000067F00004005000060FB0100000000__0000008625CF2891-00000089F4693119":{"file_size":223690752,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000000000-000000067F0000400500EB4A480000000001__000000FF8B261599-000001048B25A8E9":{"file_size":32768,"generation":2,"shard":"0008"},"000000067F00004005000060F30003D952B0-000000067F00004005000060F30003DAE2DC__0000011B632CC319-0000011F1A40FA69":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000B30000-000000067F00004005000060F70000B505C8__000000A9EB8C4489-000000ACA44C8E99":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F3000549D0A6-000000067F00004005000060F300055861F2__0000015304A396B9-0000015670D6AFD9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F1000046821B-000000067F00004005000060F20100000000__00000047E31D98D1-0000004C49155071":{"file_size":266969088,"generation":2,"shard":"0008"},"000000067F00004005000060F300043C8000-000000067F00004005000060F300043CC000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001E720A2-000000067F00004005000060F30100000000__000000923719A971-00000096262826C9":{"file_size":141344768,"generation":2,"shard":"0008"},"000000067F000040050081DB4300003A8000-000000067F000040050081DB4300003AC000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700006AB7A6-000000067F00004005000060F700006C3D76__000000663565F8C9-000000698AF6E809":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000570A19E-000000067F00004005000060F3000573B206__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003AF28CB-000000067F00004005000060F30003B33945__0000010FB1BE19B9-00000113456156F1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB00015CC000-000000067F00004005000060FB00015D8000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500D69D7900000A9CFB-000000067F0000400500D69D7900000D1C5F__000000EFA7EAA9E1-000000F309FCDD19":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002A30000-000000067F00004005000060F30002A34000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000047C000-000000067F00004005000060F30000498000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005FFF23F-000000067F00004005000060F300060A0282__0000016834A3FC91-0000016B49A934C1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000029C194-000000067F00004005016EA00C00004EF809__0000018EC67807C9-000001935283F9B9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30006D64000-000000067F00004005000060F30006DC8000__000001848D082B20":{"file_siz
e":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001340000-000000067F00004005016EA00C0001344000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000BB0000-000000067F00004005016EA00C0000BB4000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F0000400500EB4A480000000000-000000067F0000400500EB4A480000007F0F__000000F309FCDD19-000000F6661C9241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500E3A2A10000114000-000000067F0000400500E3A2A1000016321A__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000578000-030000000000000000000000000000000002__0000009A24DF6768":{"file_size":107642880,"generation":2,"shard":"0008"},"000000067F00004005000060F30006798000-000000067F00004005000060F3000679C000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100000E021B-000000067F00004005000060F1000010043F__0000000D55A212C9-000000114A805939":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F000040050081DB430000DA8000-030000000000000000000000000000000002__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":233201664,"generation":2,"shard":"0008"},"000000067F00004005000060F100004EC079-000000067F00004005000060F20100000000__00000054161C34B8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F7000170C000-000000067F00004005000060F70001720000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000FCD85E-000000067F00004005000060F80100000000__000000E4D847F4E0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00015B74FF-000000067F00004005016EA00C00015FF3A0__000001AC25760149-000001AFC313C819":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30000AC9024-000000067F00004005000060F30000ADA0D0__0000003203FB5749-0000003579F03331":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EE16C40100000000-000000067F0000400500F3A25C000006C000__00000104BD37F348":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500D69D7900000F1B5B-000000067F0000400500D69D790100000000__000000EFA7EAA9E1-000000F309FCDD19":{"file_size":233275392,"generation":2,"shard":"0008"},"000000067F00004005000060F30003C0C000-000000067F00004005000060F30003C257AD__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E44000-000000067F00004005000060F30000E60000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F7000018E4B6-000000067F00004005000060F7000019EA78__0000001737D88379-0000001B59EEB909":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00017E8000-000000067F00004005016EA00C00017EC000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30003A4C09C-000000067F00004005000060F30003A6D1B3__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F100000260F2-000000067F00004005000060F20100000000__000000027AF9D7D0":{"file_size":24576,"generation":1,"shard":"0008"},"000000067F00004005016EA00C0000097BDA-000000067F00004005016EA00C00000C7A73__0000018624969469-000001880F984A29":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500C782E400002CDD5C-030000000000000000000000000000000002__000000D31E48D7C9-000000D74E29AAD1
":{"file_size":90923008,"generation":2,"shard":"0008"},"000000067F00004005000060F3000685C000-000000067F00004005000060F30006860000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001C84000-000000067F00004005000060FB0001CE16ED__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000CC4BC2-000000067F000040050081DB430000CD6C36__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006349DA2-000000067F00004005000060F30006382F14__0000016E1FBB7B99-000001715E483C79":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000212E160-000000067F00004005000060F30100000000__0000009DF02C1241-000000A173C00489":{"file_size":224731136,"generation":2,"shard":"0008"},"000000067F00004005000060F30001FF8691-000000067F00004005000060F30100000000__0000009A1ABDE921-0000009DF02C1241":{"file_size":256114688,"generation":2,"shard":"0008"},"000000067F00004005000060F300067F4000-000000067F00004005000060F30006810000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700015A8000-000000067F00004005000060F700016205B5__0000012E71CF31F9-000001334140FC21":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F0000400500D69D790000024000-000000067F0000400500D69D790000028000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700007AE010-000000067F00004005000060F80100000000__00000075E5D2A930":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000428000-000000067F00004005016EA00C000042C000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30001E74000-000000067F00004005000060F30001F28000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300038FF04F-000000067F00004005000060F30100000000__0000010779A7F551-0000010A5E65DF39":{"file_size":45359104,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001B0FD2A-000000067F00004005016EA00C0001B4FBC9__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30006858000-000000067F00004005000060F3000685C000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002F9A0EB-000000067F00004005000060F30002FD317C__000000D74E29AAD1-000000DBBFA87AE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000808000-000000067F000040050081DB430000822079__000000A583FBFB91-000000A9EB8C4489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB00015DC000-000000067F00004005000060FB00015F0000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F7000021C000-000000067F00004005000060F7000025DA3C__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500D69D79000007C000-000000067F0000400500D69D7900000A8000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F6000001EE3D-000000067F00004005000060F60100000000__00000054161C34B8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F000040050081DB430000F4E15B-030000000000000000000000000000000002__000000C462B3C2A9-000000C824C09619":{"file_size":73662464,"generation":2,"shard":"0008"},"000000067F00004005000060F30001F28000-000000067F00004005000060F30001F2C000__000000
9A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300001F1DA6-030000000000000000000000000000000002__00000081AA3C40F0":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F70001758B92-000000067F00004005000060F70001771169__000001398B56A519-0000013C9C0E3339":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F0000400500E3A2A10000010000-000000067F0000400500E3A2A10000017F02__000000E7C2F1B249-000000EBC9213D59":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002A98000-000000067F00004005000060F30002A9C000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000573B206-000000067F00004005000060F300057942F4__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000860B45-030000000000000000000000000000000002__00000023FEF9F321-00000028C365FBE1":{"file_size":252788736,"generation":2,"shard":"0008"},"000000067F00004005000060F7000090B929-000000067F00004005000060F80100000000__0000008DDCD70B68":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F7000014B73D-000000067F00004005000060F80100000000__000000114A805939-00000013FB921C81":{"file_size":146432000,"generation":2,"shard":"0008"},"000000067F00004005000060F70000D3C000-000000067F00004005000060F70000D60000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001514000-000000067F00004005000060F70001528000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001764000-000000067F00004005016EA00C0001788000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30001358000-000000067F00004005000060F3000135C000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001594000-000000067F00004005000060FB00015C8000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300004AC000-000000067F00004005000060F300004B8000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005610000-000000067F00004005000060F30005614000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002794000-000000067F00004005000060F300027C0000__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004C60000-000000067F00004005000060F30004C64000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700003A0000-000000067F00004005000060F700003B85C7__0000003579F03331-0000003959DA2DE9":{"file_size":268468224,"generation":2,"shard":"0008"},"000000067F0000400500DBCED500000F1034-030000000000000000000000000000000002__000000E4C63CFA21-000000E7C2F1B249":{"file_size":247480320,"generation":2,"shard":"0008"},"000000067F00004005000060F300051B4000-000000067F00004005000060F300051F0000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F6000003C77D-000000067F00004005000060F60100000000__000000A5A3F27398":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005010660F500000161F7-030000000000000000000000000000000002__0000010FB1BE19B9-00000113456156F1":{"file_size":64757760,"generation":2,"shard":"0008"},"000000067F00004005000060F30003F7B254-000000067F00004005000060F30003F942CF__0000011F1A40FA69-00000122A7BB7B2
9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30004900000-000000067F00004005000060F30004904000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006F18000-000000067F00004005000060F30006F1C000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003A21037-000000067F00004005000060F30003A31FB6__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000DB0000-000000067F00004005000060F30000E40F86__000000417D21ACF9-00000044B4679349":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001A60B43-000000067F00004005000060FB0001A71688__0000007E3A9BFD29-0000008196C976A1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006DC8000-000000067F00004005000060F30006DCC000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700006E38F6-000000067F00004005000060F80100000000__000000698F2C3A38":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3000122B1C9-000000067F00004005000060F300012442A9__00000057593D8169-0000005C01565329":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000EA8000-000000067F00004005000060FB0000EAC000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001B5A072-000000067F00004005000060F80100000000__00000159B010F6C0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000144DCA3-000000067F00004005016EA00C000151F7C5__000001AC25760149-000001AFC313C819":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F600000711FF-000000067F00004005000060F60100000000__00000122E1129DA0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F300050EC000-000000067F00004005000060F30005138000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005260000-000000067F00004005000060F30005290FC9__0000014EC58A4A79-0000015304A396B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700012DE407-000000067F00004005000060F80100000000__000001180B3FF408":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000F10000-000000067F00004005000060F70000F185D4__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000D38000-000000067F00004005000060F70000D3C000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F6000006671F-000000067F00004005000060F60100000000__0000010D77B487A0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F300059F53C6-000000067F00004005000060F30005A16504__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000B08000-000000067F000040050081DB430000B4A075__000000B2B5C4E8F9-000000B768469051":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000152C000-000000067F00004005000060F70001570000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000128000-000000067F00004005000060F3000012C000__0000018624969468":{"file_size":134422528,"generation":7,"shard":"0008"},"000000067F00004005000060F70000E24000-000000067F00004005000060F70000E387D6__000000D037B2DBD0":{"file_size":1
34422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300002791D8-000000067F000040050081DB43000028B253__0000008196C976A1-0000008625CF2891":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F600000500F7-000000067F00004005000060F60100000000__000000D037B2DBD0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000ABD9C4-000000067F00004005000060F80100000000__000000A5A3F27398":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F000040050081DB4300009CC000-000000067F000040050081DB430000A10000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700002005D2-000000067F00004005000060F80100000000__0000001B59EEB909-0000001FFBC01501":{"file_size":261169152,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001AA656E-000000067F000040050081D80C0100000000__00000081AA3C40F0":{"file_size":59138048,"generation":2,"shard":"0008"},"000000067F000040050081DB430000E14000-000000067F000040050081DB430000E48000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003DD734C-000000067F00004005000060F30003E40000__0000011B632CC319-0000011F1A40FA69":{"file_size":261046272,"generation":2,"shard":"0008"},"000000067F0000400500D19D0300FFFFFFFF-030000000000000000000000000000000002__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":5373952,"generation":2,"shard":"0008"},"000000067F00004005000060F30001588000-000000067F00004005000060F3000158C000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500DBCED500000AC000-000000067F0000400500DBCED500000D0000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A48000013F89B-000000067F0000400500EB4A48000014F7AC__000000F6661C9241-000000F901689359":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300005D704F-000000067F000040050081DB4300006310C9__0000009A1ABDE921-0000009DF02C1241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A14000-000000067F000040050081DB430000A18000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001F574A6-000000067F00004005000060F30001FF8691__0000009A1ABDE921-0000009DF02C1241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FB3D320100000000-000000067F0000400500FDA1F80000014000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001B09104-000000067F00004005000060F30001B4A119__0000008196C976A1-0000008625CF2891":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005011035750100000000-030000000000000000000000000000000002__00000159B010F6C0":{"file_size":78626816,"generation":2,"shard":"0008"},"000000067F00004005000060F1000015F545-000000067F00004005000060F20100000000__000000174479FC18":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3000638C06D-000000067F00004005000060F300063A50CD__0000016E1FBB7B99-000001715E483C79":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000299C28F-000000067F00004005000060F300029A526C__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000364C000-000000067F00004005000060F30003650000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000CE0000-000000067F00004005016EA00C0000CE4000__0000019E7001E460":{"file_size
":134422528,"generation":11,"shard":"0008"},"000000067F000040050081DB430000794000-000000067F000040050081DB4300007A8000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A18000-000000067F000040050081DB430000A1C000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000000C000-000000067F00004005000060F30000018000__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F000040050081DB4300000D40FF-030000000000000000000000000000000002__00000075CC373F31-00000079F2A2F311":{"file_size":78061568,"generation":2,"shard":"0008"},"000000067F00004005000060F60000099FD8-000000067F00004005000060F60100000000__00000159B010F6C0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3000330A1C8-000000067F00004005000060F3000332B1B6__000000E4C63CFA21-000000E7C2F1B249":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006FA900D-000000067F00004005000060F30006FDA081__00000184624E5741-000001860C80A151":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000148AC30-000000067F00004005000060FB000149B774__000000601F43CF09-000000636DE92159":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C01FFFFFFFF-000000067F0000400500F3A25C0300000000__0000011F1A40FA69-00000122A7BB7B29":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30000EF1FC3-000000067F00004005000060F50100000000__00000047F1F2B800":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30006A7C566-000000067F00004005000060F30100000000__00000178B8B10551-0000017C9F5597E1":{"file_size":173072384,"generation":2,"shard":"0008"},"000000067F00004005000060FB000104B856-000000067F00004005000060FB000107C39B__0000004C49155071-0000004F31878919":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000030000-000000067F00004005000060F80100000000__000000021DC73119-000000044854EBD1":{"file_size":261341184,"generation":2,"shard":"0008"},"000000067F00004005000060F30003580FD3-000000067F00004005000060F30100000000__000000EFA7EAA9E1-000000F309FCDD19":{"file_size":228188160,"generation":2,"shard":"0008"},"000000067F00004005000060F70001224000-000000067F00004005000060F70001232ACF__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300022B9050-000000067F00004005000060F3000230A0C7__000000A583FBFB91-000000A9EB8C4489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006654000-000000067F00004005000060F30006670000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700010D0000-000000067F00004005000060F700010D85CF__000000EFA7EAA9E1-000000F309FCDD19":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000FD8000-030000000000000000000000000000000002__000000C824C09619-000000CC13D2E549":{"file_size":237559808,"generation":2,"shard":"0008"},"000000067F00004005000060FB00015F0000-000000067F00004005000060FB00015F4000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F60100000000-000000067F00004005000060F70000004000__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F00004005000060F70000DA1E38-000000067F00004005000060F80100000000__000000C462B3C2A9-000000C824C09619":{"file_size":209821696,"generation":2,"shard":"0008"},"000000067F00004005000060F30005D76250-000000067
F00004005000060F30005D7F2DE__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F10000418000-000000067F00004005000060F10100000000__00000044B4679349-00000047E31D98D1":{"file_size":269148160,"generation":2,"shard":"0008"},"000000067F00004005000060F70001B61000-000000067F00004005000060F80100000000__0000018613F0A050":{"file_size":65150976,"generation":3,"shard":"0008"},"000000067F00004005000060F300008C8000-000000067F00004005000060F300008E0F49__00000028C365FBE1-0000002D2A8E0B81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300002D8000-030000000000000000000000000000000002__0000008625CF2891-00000089F4693119":{"file_size":231907328,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C04000-000000067F00004005000060FB0000C08000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001808000-000000067F00004005000060FB000180C000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A30379-030000000000000000000000000000000002__000000AFE87558B0":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F700010D85CF-000000067F00004005000060F80100000000__000000EFA7EAA9E1-000000F309FCDD19":{"file_size":164970496,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C70000-000000067F00004005000060FB0000C74000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001188000-000000067F00004005016EA00C000118C000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F70000CB85B3-000000067F00004005000060F70000CC8B74__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30004A1D870-000000067F00004005000060F30004A2693B__000001398B56A519-0000013C9C0E3339":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00008CF772-000000067F00004005016EA00C00008E760F__000001935283F9B9-00000196C9018F59":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000D34000-000000067F00004005016EA00C0000D5D1E9__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C00014B79E7-000000067F00004005016EA00C00014CF88D__000001A931C135B1-000001AC25760149":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F300040E319D-000000067F00004005000060F300040F41F4__00000122A7BB7B29-0000012694E36301":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002FF427D-000000067F00004005000060F30100000000__000000D74E29AAD1-000000DBBFA87AE1":{"file_size":156073984,"generation":2,"shard":"0008"},"000000067F00004005000060F30005E0A466-000000067F00004005000060F30005E3B48F__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700005F9158-000000067F00004005000060F80100000000__00000057593D8169-0000005C01565329":{"file_size":230768640,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00018E4000-000000067F00004005016EA00C000193189A__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30005F0202C-000000067F00004005000060F30005F3303F__0000016834A3FC91-0000016B49A934C1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F10000148000-000000067F00004005000060F1000014C000__000000174479FC18":{"file_size":
134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300060C0000-000000067F00004005000060F300060C4000__0000016E41E03CA0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C9C000-000000067F00004005000060FB0000CC6E51__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050107B54700000A0EB1-000000067F000040050109CD330100000000__000001180B3FF408":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00004EC000-000000067F00004005016EA00C00005A0000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000A9F465-000000067F00004005016EA00C0000ACF305__00000196C9018F59-0000019A2EAFE7A9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30000208000-000000067F00004005000060F3000020C000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C000011E137-000000067F0000400500F67839000003E09B__000001048B25A8E9-0000010779A7F551":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30000402F4A-000000067F00004005000060F60100000000__000000114A805939-00000013FB921C81":{"file_size":166469632,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00004A8000-000000067F00004005016EA00C00004AC000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F70001968000-000000067F00004005000060F7000196C000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006EF8000-000000067F00004005000060F30006EFC000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000BB4000-000000067F00004005016EA00C0000C20000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F700009C0000-000000067F00004005000060F80100000000__0000009A24DF6768":{"file_size":37371904,"generation":2,"shard":"0008"},"000000067F00004005000060F30004C84000-000000067F00004005000060F30004CB8000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002514000-000000067F00004005000060F30002530000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000DE05C8-000000067F00004005000060F80100000000__000000C824C09619-000000CC13D2E549":{"file_size":259473408,"generation":2,"shard":"0008"},"000000067F00004005000060F301FFFFFFFF-000000067F00004005000060F30300000000__00000186146441F1-0000018624969469":{"file_size":57344,"generation":6,"shard":"0008"},"000000067F00004005000060F30001886B2A-000000067F00004005000060F50100000000__00000075E5D2A930":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F700006A8000-000000067F00004005000060F80100000000__000000636DE92159-000000663565F8C9":{"file_size":117022720,"generation":2,"shard":"0008"},"000000067F00004005000060FB000154C000-000000067F00004005000060FB0001558000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300053F40CC-000000067F00004005000060F30100000000__0000014EC58A4A79-0000015304A396B9":{"file_size":223453184,"generation":2,"shard":"0008"},"000000067F00004005000060F30005C95225-000000067F00004005000060F30005C9E3C4__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000558C000-000000067F00004005000060F30005598000__00000159B010F6C0":{"file_size":134422528,"generation":2
,"shard":"0008"},"000000067F00004005000060F30003FFA699-000000067F00004005000060F50100000000__00000122E1129DA0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30006F1C000-000000067F00004005000060F50100000000__000001848D082B20":{"file_size":24117248,"generation":2,"shard":"0008"},"000000067F00004005000060F3000486C000-000000067F00004005000060F30004878000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300019C2056-000000067F00004005000060F300019F31AA__00000079F2A2F311-0000007E3A9BFD29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC000004C000-000000067F0000400500EE16BC0000060000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F7000046EAB9-000000067F00004005000060F80100000000__000000417D21ACF9-00000044B4679349":{"file_size":48717824,"generation":2,"shard":"0008"},"000000067F000040050081DB430000790000-000000067F000040050081DB430000794000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500D69D79000002C000-000000067F0000400500D69D790000078000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F60000026C90-000000067F00004005000060F60100000000__000000698F2C3A38":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30000738000-000000067F00004005000060F3000073C000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F10000204000-000000067F00004005000060F10000218000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500C782E40000177E20-000000067F0000400500C782E400001AFD31__000000D01F399709-000000D31E48D7C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000048C000-000000067F00004005000060F700004B1E77__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300015F8000-000000067F00004005000060F50100000000__000000698F2C3A38":{"file_size":131276800,"generation":2,"shard":"0008"},"000000067F00004005000060F30000428000-000000067F00004005000060F3000042C000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB43000038C000-000000067F000040050081DB430000390000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000102A1CE-000000067F00004005000060FB000103AD12__0000004C49155071-0000004F31878919":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001848000-000000067F00004005000060FB000184C000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00001DC000-000000067F00004005000060FB0000228000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00011D4000-000000067F00004005016EA00C0001228000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C000011775B-030000000000000000000000000000000002__0000018820A34650":{"file_size":139264,"generation":11,"shard":"0008"},"000000067F00004005000060F700011B8000-000000067F00004005000060F80100000000__000001048B25A8E9-0000010779A7F551":{"file_size":263897088,"generation":2,"shard":"0008"},"000000067F00004005000060F3000660D31F-000000067F00004005000060F3000664E3CA__000001715E483C79-000001751A7D7589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC0000064000-00000
0067F0000400500EE16BC00000F28ED__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000525C065-000000067F00004005000060F50100000000__0000014EDD256548":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30004A7F98F-000000067F00004005000060F30100000000__000001398B56A519-0000013C9C0E3339":{"file_size":47595520,"generation":2,"shard":"0008"},"000000067F000040050100D04D000004369C-000000067F000040050100D04D000004B5AD__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F6000001A6E2-000000067F00004005000060F60100000000__00000047F1F2B800":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F700004405CF-000000067F00004005000060F80100000000__0000003D03FCCDB9-000000417D21ACF9":{"file_size":198836224,"generation":2,"shard":"0008"},"000000067F00004005000060F30002D28000-000000067F00004005000060F30002D2C000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F56D510100000000-000000067F0000400500F67839000003C000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000E387D6-000000067F00004005000060F80100000000__000000D037B2DBD0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3000213C000-000000067F00004005000060F30002168000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300060D4415-000000067F00004005000060F3000612D506__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FB3D3100000546CB-000000067F0000400500FB3D320100000000__00000122E1129DA0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F000040050081DB430000D18CA9-030000000000000000000000000000000002__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":210288640,"generation":2,"shard":"0008"},"000000067F00004005000060F60000062E4F-000000067F00004005000060F60100000000__00000104BD37F348":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C000016A065-000000067F0000400500F3A25C000017C0CB__0000010779A7F551-0000010A5E65DF39":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001AD0000-000000067F00004005000060FB0001B28B44__0000008196C976A1-0000008625CF2891":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000254000-000000067F00004005000060F30000298000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000E8C000-000000067F000040050081DB430000EA0000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300040F41F4-000000067F00004005000060F3000412D27C__00000122A7BB7B29-0000012694E36301":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB00013B8000-000000067F00004005000060FB00013BC000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700000D8000-000000067F00004005000060F700000DC000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000958000-000000067F00004005000060F700009605D8__000000923719A971-00000096262826C9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060FB00004A0000-000000067F00004005000060FB00004A4000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700001213F2-000000067F00004005000060
F80100000000__0000000D55A212C9-000000114A805939":{"file_size":55320576,"generation":2,"shard":"0008"},"000000067F00004005000060F30004156457-000000067F00004005000060F30100000000__00000122A7BB7B29-0000012694E36301":{"file_size":96927744,"generation":2,"shard":"0008"},"000000067F00004005000060F30003278000-000000067F00004005000060F3000327C000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000158F667-000000067F00004005016EA00C00015B74FF__000001AC25760149-000001AFC313C819":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001D50000-000000067F00004005000060FB0001D88B43__0000008DBE2855F9-000000923719A971":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F60000054AE8-000000067F00004005000060F60100000000__000000DBD29DC248":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F300002C4887-000000067F00004005000060F60100000000__0000000D80565628":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70001B34000-000000067F00004005000060F70001B5A072__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F600000416A8-000000067F00004005000060F60100000000__000000AFE87558B0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F10000050000-000000067F00004005000060F10000058000__000000044854EBD1-00000008B6B51879":{"file_size":264011776,"generation":2,"shard":"0008"},"000000067F00004005000060F300043FC000-000000067F00004005000060F300044D3639__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004878000-000000067F00004005000060F3000487C000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000396C000-000000067F00004005000060F30003998000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00019F7907-000000067F00004005016EA00C0001A477A4__000001B3E1B95181-000001B6FFE46BC9":{"file_size":268443648,"generation":11,"shard":"0008"},"000000067F00004005016EA00C00014D7727-000000067F00004005016EA00C00014E75C6__000001A931C135B1-000001AC25760149":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005016EA00C00016570D9-030000000000000000000000000000000002__000001AC25760149-000001AFC313C819":{"file_size":86335488,"generation":11,"shard":"0008"},"000000067F00004005000060F70001270000-000000067F00004005000060F80100000000__0000010FB1BE19B9-00000113456156F1":{"file_size":265363456,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800003BFD31-000000067F0000400500EB4A4800003C7C42__000000FCCD5238B1-000000FF8B261599":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300014B31F8-000000067F00004005000060F300014CC16D__000000636DE92159-000000663565F8C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000D5D1E9-030000000000000000000000000000000002__0000019E7001E460":{"file_size":139264,"generation":11,"shard":"0008"},"000000067F00004005000060F100003B8214-000000067F00004005000060F100003C0432__0000003D03FCCDB9-000000417D21ACF9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001346854-000000067F00004005016EA00C000135FCAD__000001A931C135B1-000001AC25760149":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F3000160410C-000000067F00004005000060F3000165515A__000000698AF6E809-0000006DDB29D589":{"file_size":268451840,"generation":2,"sh
ard":"0008"},"000000067F00004005000060FB000118B12B-030000000000000000000000000000000002__00000054161C34B8":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F30006DF0000-000000067F00004005000060F30006DF4000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700003C4000-000000067F00004005000060F700003FE341__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000FF0000-000000067F00004005000060F30100000000__0000004C49155071-0000004F31878919":{"file_size":256286720,"generation":2,"shard":"0008"},"000000067F00004005000060FB00015F4000-000000067F00004005000060FB00015FCD31__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005816253-000000067F00004005000060F30005847319__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002460000-000000067F00004005000060F30002464000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F7000113A337-000000067F00004005000060F700011528FB__000000FF8B261599-000001048B25A8E9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000037968A-030000000000000000000000000000000002__0000000D55A212C9-000000114A805939":{"file_size":226426880,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000128000-000000067F00004005016EA00C000012FE9A__000001880F984A29-0000018C496B6DB1":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500EB4A48000036FF11-000000067F0000400500EB4A4800003A7E20__000000FCCD5238B1-000000FF8B261599":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000658113F-000000067F00004005000060F3000659A203__000001715E483C79-000001751A7D7589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001D18000-000000067F00004005016EA00C0001D1C000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005000060F30001A44000-000000067F00004005000060F30001AB1583__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F10000138000-000000067F00004005000060F1000013C000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300009BC000-000000067F00004005000060F30000A50000__000000321AA80270":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F7000110E30C-000000067F00004005000060F80100000000__000000F91FE84F08":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F50100000000-000000067F00004005000060F60000014000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006F18000-000000067F00004005000060F30006FA900D__00000184624E5741-000001860C80A151":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001D88B43-000000067F00004005000060FB0100000000__0000008DBE2855F9-000000923719A971":{"file_size":249028608,"generation":2,"shard":"0008"},"000000067F00004005000060F3000122A1D5-000000067F00004005000060F30100000000__0000005413AB3641-00000057593D8169":{"file_size":48783360,"generation":2,"shard":"0008"},"000000067F00004005000060F30006277C61-000000067F00004005000060F30006320C60__0000016E1FBB7B99-000001715E483C79":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000388000-000000067F000040050081DB43000038C000__0000008DDCD70B68":{"file_size
":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000E67A6E-000000067F00004005016EA00C0000E77906__0000019E2C5DCEE1-000001A1DD8B4481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F300009B8000-000000067F00004005000060F300009BC000__000000321AA80270":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400501025D900000068000-000000067F00004005010450640000000570__0000010FB1BE19B9-00000113456156F1":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060FB00002D4B6A-030000000000000000000000000000000002__0000000D80565628":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F30001E50FF3-000000067F00004005000060F30001E720A2__000000923719A971-00000096262826C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00005A4000-000000067F00004005016EA00C0000670000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060FB0000C18000-000000067F00004005000060FB0000C1C000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000BA4F5B-000000067F00004005000060F70000BBD532__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001AC115C-000000067F00004005000060F80100000000__0000015304A396B9-0000015670D6AFD9":{"file_size":237248512,"generation":2,"shard":"0008"},"000000067F00004005000060F30004D24000-000000067F00004005000060F30004DA8000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006CA4000-000000067F00004005000060F30006D10000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC00001433D0-030000000000000000000000000000000002__000000FCCD5238B1-000000FF8B261599":{"file_size":146407424,"generation":2,"shard":"0008"},"000000067F00004005000060F3000165515A-000000067F00004005000060F30100000000__000000698AF6E809-0000006DDB29D589":{"file_size":112680960,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000118C000-000000067F00004005016EA00C00011D0000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F000040050081DB43000094A076-030000000000000000000000000000000002__000000A9EB8C4489-000000ACA44C8E99":{"file_size":176054272,"generation":2,"shard":"0008"},"000000067F00004005000060F70001528000-000000067F00004005000060F7000152C000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000C82B50-000000067F000040050081DB430000CC4BC2__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300001EF15A-000000067F000040050081DB4300002791D8__0000008196C976A1-0000008625CF2891":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F10000125BF2-000000067F00004005000060F20100000000__000000114A805939-00000013FB921C81":{"file_size":78782464,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E40F86-000000067F00004005000060F30100000000__000000417D21ACF9-00000044B4679349":{"file_size":111108096,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000FF0000-000000067F00004005016EA00C0000FF4000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30000CB16B6-000000067F00004005000060F50100000000__0000003D2AB09B68":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70001990000-000000067F00004005
000060F70001994000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000A54000-000000067F00004005000060F30000A5F9BB__000000321AA80270":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300061B8705-000000067F00004005000060F300061D9774__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000084C000-000000067F00004005000060F70000858000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000848000-000000067F00004005000060F7000084C000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001D18000-000000067F00004005000060F30001D79136__0000008DBE2855F9-000000923719A971":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001558000-000000067F00004005000060FB000155C000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300024440AE-000000067F00004005000060F3000244D189__000000A9EB8C4489-000000ACA44C8E99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002CFC020-000000067F00004005000060F30100000000__000000C824C09619-000000CC13D2E549":{"file_size":150708224,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A4A074-000000067F000040050081DB430000A640EA__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C98000-000000067F00004005000060FB0000C9C000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001840000-000000067F00004005000060FB0001844000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000802123-000000067F00004005000060F30000853115__00000023FEF9F321-00000028C365FBE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000029ED0-000000067F00004005000060F80100000000__000000027AF9D7D0":{"file_size":24576,"generation":1,"shard":"0008"},"000000067F00004005016EA00C00003E4000-000000067F00004005016EA00C00003E8000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30004CBC000-000000067F00004005000060F30004D20000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000122C000-000000067F00004005016EA00C0001240000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30004DF086C-000000067F00004005000060F50100000000__000001444EB7FC10":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F300050B5199-000000067F00004005000060F30100000000__0000014784964B91-0000014B000D1821":{"file_size":126124032,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001A477A4-000000067F00004005016EA00C0001ADF63C__000001B3E1B95181-000001B6FFE46BC9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F70001828000-000000067F00004005000060F7000182C000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100004F0000-000000067F00004005000060F10000518222__0000005413AB3641-00000057593D8169":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30005EFD576-000000067F00004005000060F30100000000__00000164DEE06671-0000016834A3FC91":{"file_size":193077248,"generation":2,"shard":"0008"},"000000067F0000400500F8E3A50100000000-0000
00067F0000400500FA2AD30000004000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000258E3A9-000000067F00004005000060F3000259F4A3__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000C90000-000000067F00004005000060F70000CB85B3__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060FB000114C000-000000067F00004005000060FB000118B12B__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003644000-000000067F00004005000060F30003648000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001A50000-000000067F00004005000060FB0001A60B43__0000007E3A9BFD29-0000008196C976A1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003C257AD-000000067F00004005000060F50100000000__000001180B3FF408":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30002DE8000-000000067F00004005000060F30002E4104A__000000D31E48D7C9-000000D74E29AAD1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C00000C8000-000000067F0000400500F3A25C00000EA069__000001048B25A8E9-0000010779A7F551":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002174000-000000067F00004005000060F30002210000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300014D5280-000000067F00004005000060F300014E6333__000000636DE92159-000000663565F8C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000332B1B6-000000067F00004005000060F30003344134__000000E4C63CFA21-000000E7C2F1B249":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300065F42B4-000000067F00004005000060F3000660D31F__000001715E483C79-000001751A7D7589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300010E264A-000000067F000040050081DB4300010F46BD__000000D31E48D7C9-000000D74E29AAD1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300069D13FA-000000067F00004005000060F300069FA3F6__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300061D9774-000000067F00004005000060F30006222843__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F100005C821A-000000067F00004005000060F20100000000__000000601F43CF09-000000636DE92159":{"file_size":265183232,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000200000-000000067F0000400500EB4A480000204000__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001690000-000000067F00004005000060F70100000000__000001334140FC21-00000137115BE4D9":{"file_size":273965056,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000A575C7-000000067F00004005016EA00C0000A9F465__00000196C9018F59-0000019A2EAFE7A9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001E6C000-000000067F00004005000060FB0001E98000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00014195A7-000000067F00004005000060FB000147A0EC__000000601F43CF09-000000636DE92159":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000AE168A-030000000000000000000000000000000002__0000003203FB5
749-0000003579F03331":{"file_size":223379456,"generation":2,"shard":"0008"},"000000067F00004005000060F30000CA0000-000000067F00004005000060F30000CA4000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300006E4000-000000067F00004005000060F30000738000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300006E0000-000000067F00004005000060F300006E4000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001124000-000000067F00004005000060FB0001148000__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500D69D7900000A8000-000000067F0000400500D69D7900000AC000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500C782E40000130000-000000067F0000400500C782E40000137F10__000000D01F399709-000000D31E48D7C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000020FBCF-000000067F00004005016EA00C0000257A6F__000001880F984A29-0000018C496B6DB1":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001B28B44-000000067F00004005000060FB0100000000__0000008196C976A1-0000008625CF2891":{"file_size":249454592,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001120000-000000067F00004005000060FB0001124000__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005474062-000000067F00004005000060F3000549D0A6__0000015304A396B9-0000015670D6AFD9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500C782E4000023FA62-030000000000000000000000000000000002__000000D01F399709-000000D31E48D7C9":{"file_size":245366784,"generation":2,"shard":"0008"},"000000067F000040050081DB430000160484-030000000000000000000000000000000002__00000079F2A2F311-0000007E3A9BFD29":{"file_size":226582528,"generation":2,"shard":"0008"},"000000067F00004005000060F300038A4FB4-000000067F00004005000060F300038B5F5B__0000010779A7F551-0000010A5E65DF39":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300017E8000-000000067F00004005000060F300017EC000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500FB3D300100000000-000000067F0000400500FB3D31000000C000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700010105DB-000000067F00004005000060F80100000000__000000E4C63CFA21-000000E7C2F1B249":{"file_size":254935040,"generation":2,"shard":"0008"},"000000067F00004005000060F70000858570-000000067F00004005000060F80100000000__0000008196C976A1-0000008625CF2891":{"file_size":252985344,"generation":2,"shard":"0008"},"000000067F000040050081DB4300001D4000-000000067F000040050081DB4300001E8000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00005E0000-000000067F00004005000060FB0000638B45__0000001B59EEB909-0000001FFBC01501":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050107B547000006C000-000000067F000040050107B54700000A0EB1__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000430000-000000067F00004005000060FB0000434000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300014E6333-000000067F00004005000060F3000151F271__000000636DE92159-000000663565F8C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FB3D3001000000
00-000000067F0000400500FB3D300300000000__00000117EDA82C11-0000011B632CC319":{"file_size":65536,"generation":2,"shard":"0008"},"000000067F00004005000060F30004BE7584-000000067F00004005000060F30100000000__0000013C9C0E3339-0000013FEFA7D709":{"file_size":58204160,"generation":2,"shard":"0008"},"000000067F00004005000060F70001068000-000000067F00004005000060F80100000000__000000E7C2F1B249-000000EBC9213D59":{"file_size":168730624,"generation":2,"shard":"0008"},"000000067F00004005000060F1000013C000-000000067F00004005000060F10000148000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000659A203-000000067F00004005000060F300065BB235__000001715E483C79-000001751A7D7589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000EC0000-000000067F00004005000060F70000EF85D6__000000D74E29AAD1-000000DBBFA87AE1":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005010660F500000B4000-000000067F00004005010660F500000F44CB__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300067A4000-000000067F00004005000060F300067F0000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500DBCED500000F0000-000000067F0000400500DBCED500000F4000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000768000-000000067F000040050081DB43000076C000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00018E0000-000000067F00004005016EA00C00018E4000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30000A50000-000000067F00004005000060F30000A54000__000000321AA80270":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001E68000-000000067F00004005000060FB0001E6C000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001960000-000000067F00004005000060F300019790A2__00000079F2A2F311-0000007E3A9BFD29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000B6A1D0-000000067F00004005000060FB0000BAAD15__0000003579F03331-0000003959DA2DE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002E4A157-000000067F00004005000060F30002E630CF__000000D31E48D7C9-000000D74E29AAD1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006E70000-000000067F00004005000060F30006E74000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700004464DD-000000067F00004005000060F7000046EAB9__000000417D21ACF9-00000044B4679349":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000204000-000000067F0000400500EB4A480000218000__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300042D51D6-000000067F00004005000060F3000430E1E9__0000012694E36301-0000012A3F140591":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000F30000-000000067F00004005000060FB0100000000__00000047E31D98D1-0000004C49155071":{"file_size":272302080,"generation":2,"shard":"0008"},"000000067F000040050081DB4300006F8000-030000000000000000000000000000000002__0000009DF02C1241-000000A173C00489":{"file_size":235110400,"generation":2,"shard":"0008"},"000000067F000040050081DB4300001EC000-000000067F000040050081DB4300001F1DA6__00000081AA3C40F0":{"file_size":134422528,"generation":2,"s
hard":"0008"},"000000067F00004005000060F300038A3082-000000067F00004005000060F30100000000__000001048B25A8E9-0000010779A7F551":{"file_size":76644352,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000400000-000000067F00004005016EA00C0000404000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30003481DDB-000000067F00004005000060F30100000000__000000E7C2F1B249-000000EBC9213D59":{"file_size":107814912,"generation":2,"shard":"0008"},"000000067F00004005000060F3000489C000-000000067F00004005000060F300048A0000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000CD6C36-000000067F000040050081DB430000D18CA9__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30004888000-000000067F00004005000060F3000488C000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300008E0F49-000000067F00004005000060F30000921E8A__00000028C365FBE1-0000002D2A8E0B81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500C782E40000074000-000000067F0000400500C782E400000A0000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00011F2D11-000000067F00004005000060FB0001203856__0000005413AB3641-00000057593D8169":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300046330B1-000000067F00004005000060F300046B41AA__0000012E71CF31F9-000001334140FC21":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003548000-000000067F00004005000060F30003580FD3__000000EFA7EAA9E1-000000F309FCDD19":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001198B44-000000067F00004005000060FB00011C1688__0000005413AB3641-00000057593D8169":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000049C000-000000067F00004005000060F300004A8000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000B44000-000000067F00004005016EA00C0000BB0000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F700014F0000-000000067F00004005000060F700014F85DF__0000012694E36301-0000012A3F140591":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000C5E15B-000000067F000040050081DB430000C801D1__000000B768469051-000000BAB1E56C91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003A10000-000000067F00004005000060F30003A21037__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006EFC000-000000067F00004005000060F30006F18000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001D1F87B-000000067F00004005016EA00C0001D7F71A__000001BA93C39481-000001BCB572A4E1":{"file_size":268451840,"generation":17,"shard":"0008"},"000000067F00004005000060F30002A34000-000000067F00004005000060F30002A40000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000F0AA88-000000067F00004005000060F80100000000__000000DBD29DC248":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30006700000-000000067F00004005000060F30006704000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001CC4000-000000067F00004005000060F30001CD0000__0000008
DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000858000-000000067F00004005000060F80100000000__00000081AA3C40F0":{"file_size":48439296,"generation":2,"shard":"0008"},"000000067F000040050081DB4300000D6407-000000067F000040050081DB430000160484__00000079F2A2F311-0000007E3A9BFD29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300057DD292-000000067F00004005000060F30005816253__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006222843-000000067F00004005000060F3000625B8F0__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000410000-000000067F00004005000060FB0000430B46__000000114A805939-00000013FB921C81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F100006A8000-000000067F00004005000060F100006B0000__0000006DDB29D589-000000722F474369":{"file_size":264110080,"generation":2,"shard":"0008"},"000000067F00004005000060F3000460202F-000000067F00004005000060F300046330B1__0000012E71CF31F9-000001334140FC21":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006E74000-000000067F00004005000060F30006EF8000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003A3B020-000000067F00004005000060F30003A4C09C__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002535462-000000067F00004005000060F3000258E3A9__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000294000-000000067F0000400500EB4A480000355928__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016E85370000000000-030000000000000000000000000000000002__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":152190976,"generation":2,"shard":"0008"},"000000067F00004005000060F3000158C000-000000067F00004005000060F300015B0000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003386D10-000000067F00004005000060F300033D7D7C__000000E7C2F1B249-000000EBC9213D59":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E7C000-000000067F00004005000060F30000EF1FC3__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500FA2AD30000030000-000000067F0000400500FA2AD30000034000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005588000-000000067F00004005000060F3000558C000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300039A0000-000000067F00004005000060F300039A4000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F6000008A13D-000000067F00004005000060F60100000000__000001444EB7FC10":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB00017120CE-000000067F00004005000060FB000172AC12__0000006DDB29D589-000000722F474369":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003200000-000000067F00004005000060F30003204000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300007C1007-000000067F00004005000060F30000802123__00000023FEF9F321-00000028C365FBE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C000006C0
00-000000067F0000400500F3A25C00000BB439__00000104BD37F348":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300015B4000-000000067F00004005000060F300015F8000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300060C220F-000000067F00004005000060F300060CB2C8__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500F8E3A5000004A25C-000000067F0000400500F8E3A50100000000__0000010D77B487A0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30002C9AFB8-000000067F00004005000060F30002CFC020__000000C824C09619-000000CC13D2E549":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005010F2BD40100000000-000000067F00004005010F44EB000000C000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002AEED02-000000067F00004005000060F50100000000__000000C483D0D6B8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30002EB8000-000000067F00004005000060F30002F5105E__000000D74E29AAD1-000000DBBFA87AE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500E3A2A1000016321A-030000000000000000000000000000000002__000000EFDE07FFD8":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F3000135C000-000000067F00004005000060F30001407F7A__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F67839000006AEF4-000000067F0000400500F7D2DD0100000000__0000010D77B487A0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30005DA03A8-000000067F00004005000060F30005DC93F1__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300010E2072-000000067F000040050081DB430100000000__000000D01F399709-000000D31E48D7C9":{"file_size":15392768,"generation":2,"shard":"0008"},"000000067F00004005000060F300004A8000-000000067F00004005000060F300004AC000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00016E0A44-000000067F00004005000060FB0001701588__0000006DDB29D589-000000722F474369":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300024D8000-000000067F00004005000060F300024DC000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003BC8000-000000067F00004005000060F30003BCC000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F00100000000-000000067F00004005000060F10000004000__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F000040050081DB430100000000-000000067F0000400500C782E40000074000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003D14206-000000067F00004005000060F30003D252C8__00000117EDA82C11-0000011B632CC319":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700006479E7-000000067F00004005000060F80100000000__000000603CA8F2F0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000B9C988-000000067F00004005000060F70000BA4F5B__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D69D790000078000-000000067F0000400500D69D79000007C000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000CC8B74-000000067F00004005000060F8010000
0000__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":95657984,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000708000-000000067F00004005000060FB000070C000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000EA0000-000000067F000040050081DB430000EEA075__000000C462B3C2A9-000000C824C09619":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000001FD3E-000000067F00004005016EA00C0000097BDA__0000018624969469-000001880F984A29":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F3000689E295-000000067F00004005000060F3000690F2FD__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000CE0000-000000067F00004005000060F30000D31030__0000003D03FCCDB9-000000417D21ACF9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000EA0000-030000000000000000000000000000000002__000000C483D0D6B8":{"file_size":20307968,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000807A34-000000067F00004005016EA00C00008578D4__000001935283F9B9-00000196C9018F59":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F000040050081DB430001060000-000000067F000040050081DB430001064000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000480F32C-000000067F00004005000060F3000486837F__000001334140FC21-00000137115BE4D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700009385D4-000000067F00004005000060F80100000000__0000008DBE2855F9-000000923719A971":{"file_size":252207104,"generation":2,"shard":"0008"},"000000067F00004005000060F30000090000-000000067F00004005000060F300000C1095__000000021DC73119-000000044854EBD1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000480620C-000000067F00004005000060F3000480F32C__000001334140FC21-00000137115BE4D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005FA40AD-000000067F00004005000060F30005FC519A__0000016834A3FC91-0000016B49A934C1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB00014A42B8-030000000000000000000000000000000002__000000601F43CF09-000000636DE92159":{"file_size":137322496,"generation":2,"shard":"0008"},"000000067F00004005000060F30001CD0000-000000067F00004005000060F30001CD4000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000404000-000000067F00004005016EA00C0000428000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30002079FDE-000000067F00004005000060F300020830BE__0000009DF02C1241-000000A173C00489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000487C000-000000067F00004005000060F30004880000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005010A188401FFFFFFFF-000000067F00004005010A18840300000000__00000137115BE4D9-000001398B56A519":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000218000-000000067F00004005000060F7000021C000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005EF454F-000000067F00004005000060F30005EFD576__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005DC93F1-000000067F00004005000060F30005E0A466__00000164DEE06671-0000016834A3FC91":{"file
_size":268451840,"generation":2,"shard":"0008"}},"disk_consistent_lsn":"1BC/B5734CD8","metadata_bytes":{"disk_consistent_lsn":"1BC/B5734CD8","prev_record_lsn":"1BC/B5734CB0","ancestor_timeline":null,"ancestor_lsn":"0/0","latest_gc_cutoff_lsn":"1BC/B5732690","initdb_lsn":"0/14EE150","pg_version":16},"lineage":{}} diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 509f41366b..cda70be8da 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -150,6 +150,7 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( "pageserver_pitr_history_size", "pageserver_layer_bytes", "pageserver_layer_count", + "pageserver_visible_physical_size", "pageserver_storage_operations_seconds_count_total", "pageserver_storage_operations_seconds_sum_total", "pageserver_evictions_total", From 970f2923b2f81eaf04525f1c9ffb122991319d0c Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." Date: Thu, 1 Aug 2024 09:52:34 -0400 Subject: [PATCH 1322/1571] storage-scrubber: log version on start (#8571) Helps us better identify which version of storage scrubber is running. --------- Signed-off-by: Alex Chi Z --- Cargo.lock | 1 + storage_scrubber/Cargo.toml | 1 + storage_scrubber/src/main.rs | 7 +++++++ 3 files changed, 9 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index dc4f0c7b81..2677699702 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5758,6 +5758,7 @@ dependencies = [ "either", "futures", "futures-util", + "git-version", "hex", "humantime", "itertools 0.10.5", diff --git a/storage_scrubber/Cargo.toml b/storage_scrubber/Cargo.toml index 7d5b7d10b9..d19119990b 100644 --- a/storage_scrubber/Cargo.toml +++ b/storage_scrubber/Cargo.toml @@ -10,6 +10,7 @@ aws-smithy-async.workspace = true either.workspace = true tokio-rustls.workspace = true anyhow.workspace = true +git-version.workspace = true hex.workspace = true humantime.workspace = true thiserror.workspace = true diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs index 4c804c00c1..a111c31844 100644 --- a/storage_scrubber/src/main.rs +++ b/storage_scrubber/src/main.rs @@ -17,6 +17,11 @@ use storage_scrubber::{ use clap::{Parser, Subcommand}; use utils::id::TenantId; +use utils::{project_build_tag, project_git_version}; + +project_git_version!(GIT_VERSION); +project_build_tag!(BUILD_TAG); + #[derive(Parser)] #[command(author, version, about, long_about = None)] #[command(arg_required_else_help(true))] @@ -101,6 +106,8 @@ enum Command { async fn main() -> anyhow::Result<()> { let cli = Cli::parse(); + tracing::info!("version: {}, build_tag {}", GIT_VERSION, BUILD_TAG); + let bucket_config = BucketConfig::from_env()?; let command_log_name = match &cli.command { From f4a668a27de4557dbfac0b004189c37d068118d5 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." Date: Thu, 1 Aug 2024 10:00:06 -0400 Subject: [PATCH 1323/1571] fix(pageserver): skip existing layers for btm-gc-compaction (#8498) part of https://github.com/neondatabase/neon/issues/8002 Due to the limitation of the current layer map implementation, we cannot directly replace a layer. It's interpreted as an insert and a deletion, and there will be file exist error when renaming the newly-created layer to replace the old layer. We work around that by changing the end key of the image layer. A long-term fix would involve a refactor around the layer file naming. For delta layers, we simply skip layers with the same key range produced, though it is possible to add an extra key as an alternative solution. 
* The image layer range for the layers generated from gc-compaction will be Key::MIN..(Key..MAX-1), to avoid being recognized as an L0 delta layer. * Skip existing layers if it turns out that we need to generate a layer with the same persistent key in the same generation. Note that it is possible that the newly-generated layer has different content from the existing layer. For example, when the user drops a retain_lsn, the compaction could have combined or dropped some records, therefore creating a smaller layer than the existing one. We discard the "optimized" layer for now because we cannot deal with such rewrites within the same generation. --------- Signed-off-by: Alex Chi Z Co-authored-by: Christian Schwarz --- pageserver/src/tenant.rs | 47 ++- .../src/tenant/storage_layer/layer_desc.rs | 14 + pageserver/src/tenant/timeline/compaction.rs | 279 ++++++++++++++++-- .../src/tenant/timeline/layer_manager.rs | 14 +- 4 files changed, 320 insertions(+), 34 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index b9257dfbe8..84c5095610 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -6963,7 +6963,11 @@ mod tests { vec![ // Image layer at GC horizon PersistentLayerKey { - key_range: Key::MIN..Key::MAX, + key_range: { + let mut key = Key::MAX; + key.field6 -= 1; + Key::MIN..key + }, lsn_range: Lsn(0x30)..Lsn(0x31), is_delta: false }, @@ -6982,6 +6986,15 @@ mod tests { ] ); + // increase GC horizon and compact again + { + // Update GC info + let mut guard = tline.gc_info.write().unwrap(); + guard.cutoffs.time = Lsn(0x40); + guard.cutoffs.space = Lsn(0x40); + } + tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + Ok(()) } @@ -7333,6 +7346,15 @@ mod tests { ); } + // increase GC horizon and compact again + { + // Update GC info + let mut guard = tline.gc_info.write().unwrap(); + guard.cutoffs.time = Lsn(0x40); + guard.cutoffs.space = Lsn(0x40); + } + tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + Ok(()) } @@ -7837,6 +7859,10 @@ mod tests { ]; let verify_result = || async { + let gc_horizon = { + let gc_info = tline.gc_info.read().unwrap(); + gc_info.cutoffs.time + }; for idx in 0..10 { assert_eq!( tline @@ -7847,7 +7873,7 @@ mod tests { ); assert_eq!( tline - .get(get_key(idx as u32), Lsn(0x30), &ctx) + .get(get_key(idx as u32), gc_horizon, &ctx) .await .unwrap(), &expected_result_at_gc_horizon[idx] @@ -7873,7 +7899,24 @@ mod tests { let cancel = CancellationToken::new(); tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + verify_result().await; + // compact again + tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + verify_result().await; + + // increase GC horizon and compact again + { + // Update GC info + let mut guard = tline.gc_info.write().unwrap(); + guard.cutoffs.time = Lsn(0x38); + guard.cutoffs.space = Lsn(0x38); + } + tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + verify_result().await; // no wals between 0x30 and 0x38, so we should obtain the same result + + // not increasing the GC horizon and compact again + tline.compact_with_gc(&cancel, &ctx).await.unwrap(); verify_result().await; Ok(()) diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs index bd765560e4..cbd18e650f 100644 --- a/pageserver/src/tenant/storage_layer/layer_desc.rs +++ b/pageserver/src/tenant/storage_layer/layer_desc.rs @@ -41,6 +41,20 @@ pub struct PersistentLayerKey { pub is_delta: bool, } +impl std::fmt::Display for PersistentLayerKey { + fn fmt(&self, f: &mut 
std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{}..{} {}..{} is_delta={}", + self.key_range.start, + self.key_range.end, + self.lsn_range.start, + self.lsn_range.end, + self.is_delta + ) + } +} + impl PersistentLayerDesc { pub fn key(&self) -> PersistentLayerKey { PersistentLayerKey { diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 4fe9bbafab..61d662d25d 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -4,7 +4,7 @@ //! //! The old legacy algorithm is implemented directly in `timeline.rs`. -use std::collections::BinaryHeap; +use std::collections::{BinaryHeap, HashSet}; use std::ops::{Deref, Range}; use std::sync::Arc; @@ -30,7 +30,9 @@ use crate::page_cache; use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD}; use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::storage_layer::merge_iterator::MergeIterator; -use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc, ValueReconstructState}; +use crate::tenant::storage_layer::{ + AsLayerDesc, PersistentLayerDesc, PersistentLayerKey, ValueReconstructState, +}; use crate::tenant::timeline::ImageLayerCreationOutcome; use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter}; use crate::tenant::timeline::{Layer, ResidentLayer}; @@ -1368,7 +1370,7 @@ impl Timeline { pub(crate) async fn generate_key_retention( self: &Arc, key: Key, - history: &[(Key, Lsn, Value)], + full_history: &[(Key, Lsn, Value)], horizon: Lsn, retain_lsn_below_horizon: &[Lsn], delta_threshold_cnt: usize, @@ -1376,14 +1378,14 @@ impl Timeline { ) -> anyhow::Result { // Pre-checks for the invariants if cfg!(debug_assertions) { - for (log_key, _, _) in history { + for (log_key, _, _) in full_history { assert_eq!(log_key, &key, "mismatched key"); } - for i in 1..history.len() { - assert!(history[i - 1].1 <= history[i].1, "unordered LSN"); - if history[i - 1].1 == history[i].1 { + for i in 1..full_history.len() { + assert!(full_history[i - 1].1 <= full_history[i].1, "unordered LSN"); + if full_history[i - 1].1 == full_history[i].1 { assert!( - matches!(history[i - 1].2, Value::Image(_)), + matches!(full_history[i - 1].2, Value::Image(_)), "unordered delta/image, or duplicated delta" ); } @@ -1414,7 +1416,7 @@ impl Timeline { } lsn_split_points.push(horizon); let mut current_idx = 0; - for item @ (_, lsn, _) in history { + for item @ (_, lsn, _) in full_history { while current_idx < lsn_split_points.len() && *lsn > lsn_split_points[current_idx] { current_idx += 1; } @@ -1459,6 +1461,68 @@ impl Timeline { if let Some((key, lsn, img)) = base_img_from_ancestor { replay_history.push((key, lsn, Value::Image(img))); } + + /// Generate debug information for the replay history + fn generate_history_trace(replay_history: &[(Key, Lsn, Value)]) -> String { + use std::fmt::Write; + let mut output = String::new(); + if let Some((key, _, _)) = replay_history.first() { + write!(output, "key={} ", key).unwrap(); + let mut cnt = 0; + for (_, lsn, val) in replay_history { + if val.is_image() { + write!(output, "i@{} ", lsn).unwrap(); + } else if val.will_init() { + write!(output, "di@{} ", lsn).unwrap(); + } else { + write!(output, "d@{} ", lsn).unwrap(); + } + cnt += 1; + if cnt >= 128 { + write!(output, "... 
and more").unwrap(); + break; + } + } + } else { + write!(output, "").unwrap(); + } + output + } + + fn generate_debug_trace( + replay_history: Option<&[(Key, Lsn, Value)]>, + full_history: &[(Key, Lsn, Value)], + lsns: &[Lsn], + horizon: Lsn, + ) -> String { + use std::fmt::Write; + let mut output = String::new(); + if let Some(replay_history) = replay_history { + writeln!( + output, + "replay_history: {}", + generate_history_trace(replay_history) + ) + .unwrap(); + } else { + writeln!(output, "replay_history: ",).unwrap(); + } + writeln!( + output, + "full_history: {}", + generate_history_trace(full_history) + ) + .unwrap(); + writeln!( + output, + "when processing: [{}] horizon={}", + lsns.iter().map(|l| format!("{l}")).join(","), + horizon + ) + .unwrap(); + output + } + for (i, split_for_lsn) in split_history.into_iter().enumerate() { // TODO: there could be image keys inside the splits, and we can compute records_since_last_image accordingly. records_since_last_image += split_for_lsn.len(); @@ -1483,10 +1547,27 @@ impl Timeline { } } if let Some((_, _, val)) = replay_history.first() { - assert!(val.will_init(), "invalid history, no base image"); + if !val.will_init() { + return Err(anyhow::anyhow!("invalid history, no base image")).with_context( + || { + generate_debug_trace( + Some(&replay_history), + full_history, + retain_lsn_below_horizon, + horizon, + ) + }, + ); + } } if generate_image && records_since_last_image > 0 { records_since_last_image = 0; + let replay_history_for_debug = if cfg!(debug_assertions) { + Some(replay_history.clone()) + } else { + None + }; + let replay_history_for_debug_ref = replay_history_for_debug.as_deref(); let history = std::mem::take(&mut replay_history); let mut img = None; let mut records = Vec::with_capacity(history.len()); @@ -1494,14 +1575,30 @@ impl Timeline { img = Some((*lsn, val.clone())); for (_, lsn, val) in history.into_iter().skip(1) { let Value::WalRecord(rec) = val else { - panic!("invalid record") + return Err(anyhow::anyhow!( + "invalid record, first record is image, expect walrecords" + )) + .with_context(|| { + generate_debug_trace( + replay_history_for_debug_ref, + full_history, + retain_lsn_below_horizon, + horizon, + ) + }); }; records.push((lsn, rec)); } } else { for (_, lsn, val) in history.into_iter() { let Value::WalRecord(rec) = val else { - panic!("invalid record") + return Err(anyhow::anyhow!("invalid record, first record is walrecord, expect rest are walrecord")) + .with_context(|| generate_debug_trace( + replay_history_for_debug_ref, + full_history, + retain_lsn_below_horizon, + horizon, + )); }; records.push((lsn, rec)); } @@ -1513,12 +1610,11 @@ impl Timeline { replay_history.push((key, request_lsn, Value::Image(img.clone()))); retention.push(vec![(request_lsn, Value::Image(img))]); } else { - retention.push( - split_for_lsn - .iter() - .map(|(_, lsn, value)| (*lsn, value.clone())) - .collect(), - ); + let deltas = split_for_lsn + .iter() + .map(|(_, lsn, value)| (*lsn, value.clone())) + .collect_vec(); + retention.push(deltas); } } let mut result = Vec::with_capacity(retention.len()); @@ -1533,7 +1629,7 @@ impl Timeline { result.push((lsn_split_points[idx], KeyLogAtLsn(logs))); } } - unreachable!() + unreachable!("key retention is empty") } /// An experimental compaction building block that combines compaction with garbage collection. @@ -1544,11 +1640,26 @@ impl Timeline { /// and create delta layers with all deltas >= gc horizon. 
pub(crate) async fn compact_with_gc( self: &Arc, - _cancel: &CancellationToken, + cancel: &CancellationToken, ctx: &RequestContext, ) -> anyhow::Result<()> { use std::collections::BTreeSet; + // Block other compaction/GC tasks from running for now. GC-compaction could run along + // with legacy compaction tasks in the future. + + let _compaction_lock = tokio::select! { + guard = self.compaction_lock.lock() => guard, + // TODO: refactor to CompactionError to correctly pass cancelled error + _ = cancel.cancelled() => return Err(anyhow!("cancelled")), + }; + + let _gc = tokio::select! { + guard = self.gc_lock.lock() => guard, + // TODO: refactor to CompactionError to correctly pass cancelled error + _ = cancel.cancelled() => return Err(anyhow!("cancelled")), + }; + info!("running enhanced gc bottom-most compaction"); scopeguard::defer! { @@ -1644,6 +1755,13 @@ impl Timeline { let mut accumulated_values = Vec::new(); let mut last_key: Option = None; + enum FlushDeltaResult { + /// Create a new resident layer + CreateResidentLayer(ResidentLayer), + /// Keep an original delta layer + KeepLayer(PersistentLayerKey), + } + #[allow(clippy::too_many_arguments)] async fn flush_deltas( deltas: &mut Vec<(Key, Lsn, crate::repository::Value)>, @@ -1654,7 +1772,7 @@ impl Timeline { lowest_retain_lsn: Lsn, ctx: &RequestContext, last_batch: bool, - ) -> anyhow::Result> { + ) -> anyhow::Result> { // Check if we need to split the delta layer. We split at the original delta layer boundary to avoid // overlapping layers. // @@ -1677,28 +1795,78 @@ impl Timeline { if !need_split && !last_batch { return Ok(None); } - let deltas = std::mem::take(deltas); + let deltas: Vec<(Key, Lsn, Value)> = std::mem::take(deltas); if deltas.is_empty() { return Ok(None); } let end_lsn = deltas.iter().map(|(_, lsn, _)| lsn).max().copied().unwrap() + 1; + let delta_key = PersistentLayerKey { + key_range: { + let key_start = deltas.first().unwrap().0; + let key_end = deltas.last().unwrap().0.next(); + key_start..key_end + }, + lsn_range: lowest_retain_lsn..end_lsn, + is_delta: true, + }; + { + // Hack: skip delta layer if we need to produce a layer of a same key-lsn. + // + // This can happen if we have removed some deltas in "the middle" of some existing layer's key-lsn-range. + // For example, consider the case where a single delta with range [0x10,0x50) exists. + // And we have branches at LSN 0x10, 0x20, 0x30. + // Then we delete branch @ 0x20. + // Bottom-most compaction may now delete the delta [0x20,0x30). + // And that wouldnt' change the shape of the layer. + // + // Note that bottom-most-gc-compaction never _adds_ new data in that case, only removes. + // That's why it's safe to skip. + let guard = tline.layers.read().await; + + if guard.contains_key(&delta_key) { + let layer_generation = guard.get_from_key(&delta_key).metadata().generation; + drop(guard); + if layer_generation == tline.generation { + // TODO: depending on whether we design this compaction process to run along with + // other compactions, there could be layer map modifications after we drop the + // layer guard, and in case it creates duplicated layer key, we will still error + // in the end. 
+ info!( + key=%delta_key, + ?layer_generation, + "discard delta layer due to duplicated layer in the same generation" + ); + return Ok(Some(FlushDeltaResult::KeepLayer(delta_key))); + } + } + } + let mut delta_layer_writer = DeltaLayerWriter::new( tline.conf, tline.timeline_id, tline.tenant_shard_id, - deltas.first().unwrap().0, + delta_key.key_range.start, lowest_retain_lsn..end_lsn, ctx, ) .await?; - let key_end = deltas.last().unwrap().0.next(); for (key, lsn, val) in deltas { delta_layer_writer.put_value(key, lsn, val, ctx).await?; } - let delta_layer = delta_layer_writer.finish(key_end, tline, ctx).await?; - Ok(Some(delta_layer)) + let delta_layer = delta_layer_writer + .finish(delta_key.key_range.end, tline, ctx) + .await?; + Ok(Some(FlushDeltaResult::CreateResidentLayer(delta_layer))) } + // Hack the key range to be min..(max-1). Otherwise, the image layer will be + // interpreted as an L0 delta layer. + let hack_image_layer_range = { + let mut end_key = Key::MAX; + end_key.field6 -= 1; + Key::MIN..end_key + }; + // Only create image layers when there is no ancestor branches. TODO: create covering image layer // when some condition meet. let mut image_layer_writer = if self.ancestor_timeline.is_none() { @@ -1707,7 +1875,7 @@ impl Timeline { self.conf, self.timeline_id, self.tenant_shard_id, - &(Key::MIN..Key::MAX), // covers the full key range + &hack_image_layer_range, // covers the full key range lowest_retain_lsn, ctx, ) @@ -1737,6 +1905,42 @@ impl Timeline { let img = tline.get(key, tline.ancestor_lsn, ctx).await?; Ok(Some((key, tline.ancestor_lsn, img))) } + let image_layer_key = PersistentLayerKey { + key_range: hack_image_layer_range, + lsn_range: PersistentLayerDesc::image_layer_lsn_range(lowest_retain_lsn), + is_delta: false, + }; + + // Like with delta layers, it can happen that we re-produce an already existing image layer. + // This could happen when a user triggers force compaction and image generation. In this case, + // it's always safe to rewrite the layer. + let discard_image_layer = { + let guard = self.layers.read().await; + if guard.contains_key(&image_layer_key) { + let layer_generation = guard.get_from_key(&image_layer_key).metadata().generation; + drop(guard); + if layer_generation == self.generation { + // TODO: depending on whether we design this compaction process to run along with + // other compactions, there could be layer map modifications after we drop the + // layer guard, and in case it creates duplicated layer key, we will still error + // in the end. + info!( + key=%image_layer_key, + ?layer_generation, + "discard image layer due to duplicated layer key in the same generation", + ); + true + } else { + false + } + } else { + false + } + }; + + // Actually, we can decide not to write to the image layer at all at this point because + // the key and LSN range are determined. However, to keep things simple here, we still + // create this writer, and discard the writer in the end. let mut delta_values = Vec::new(); let delta_split_points = delta_split_points.into_iter().collect_vec(); @@ -1824,7 +2028,9 @@ impl Timeline { ); assert!(delta_values.is_empty(), "unprocessed keys"); - let image_layer = if let Some(writer) = image_layer_writer { + let image_layer = if discard_image_layer { + None + } else if let Some(writer) = image_layer_writer { Some(writer.finish(self, ctx).await?) 
} else { None @@ -1835,7 +2041,22 @@ impl Timeline { if image_layer.is_some() { 1 } else { 0 } ); let mut compact_to = Vec::new(); - compact_to.extend(delta_layers); + let mut keep_layers = HashSet::new(); + for action in delta_layers { + match action { + FlushDeltaResult::CreateResidentLayer(layer) => { + compact_to.push(layer); + } + FlushDeltaResult::KeepLayer(l) => { + keep_layers.insert(l); + } + } + } + if discard_image_layer { + keep_layers.insert(image_layer_key); + } + let mut layer_selection = layer_selection; + layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key())); compact_to.extend(image_layer); // Step 3: Place back to the layer map. { diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 1e4edd34ad..1bc2acbd34 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -35,6 +35,10 @@ impl LayerManager { self.layer_fmgr.get_from_desc(desc) } + pub(crate) fn get_from_key(&self, desc: &PersistentLayerKey) -> Layer { + self.layer_fmgr.get_from_key(desc) + } + /// Get an immutable reference to the layer map. /// /// We expect users only to be able to get an immutable layer map. If users want to make modifications, @@ -365,16 +369,20 @@ impl Default for LayerFileManager { } impl LayerFileManager { - fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T { + fn get_from_key(&self, key: &PersistentLayerKey) -> T { // The assumption for the `expect()` is that all code maintains the following invariant: // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor. self.0 - .get(&desc.key()) - .with_context(|| format!("get layer from desc: {}", desc.layer_name())) + .get(key) + .with_context(|| format!("get layer from key: {}", key)) .expect("not found") .clone() } + fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T { + self.get_from_key(&desc.key()) + } + fn contains_key(&self, key: &PersistentLayerKey) -> bool { self.0.contains_key(key) } From e7477855b787f3f44a8e46c550eeed009ec89f0c Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 1 Aug 2024 16:55:43 +0100 Subject: [PATCH 1324/1571] test_runner: don't create artifacts if Allure is not enabled (#8580) ## Problem `allure_attach_from_dir` method might create `tar.zst` archives even if `--alluredir` is not set (i.e. Allure results collection is disabled) ## Summary of changes - Don't run `allure_attach_from_dir` if `--alluredir` is not set --- test_runner/fixtures/neon_fixtures.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 0c33dec784..b370a92e38 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4529,6 +4529,13 @@ def test_output_dir( yield test_dir + # Allure artifacts creation might involve the creation of `.tar.zst` archives, + # which aren't going to be used if Allure results collection is not enabled + # (i.e. --alluredir is not set). + # Skip `allure_attach_from_dir` in this case + if not request.config.getoption("--alluredir"): + return + preserve_database_files = False for k, v in request.node.user_properties: # NB: the neon_env_builder fixture uses this fixture (test_output_dir). 
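A note for readers of the gc-compaction patch above (PATCH 1323/1571): the duplicate-layer handling it adds for both delta and image layers boils down to the small check sketched here. The sketch is illustrative only and is not part of any patch in this series; the helper name and the simplified &Timeline parameter are invented for the example, while the individual calls mirror the ones visible in that diff.

    // Condensed sketch of the "skip duplicated layer" rule (illustrative only, not part of the patch).
    // Returns true if a freshly produced layer should be discarded because a layer
    // with the same persistent key already exists in the same generation.
    async fn discard_duplicated_layer(
        tline: &Timeline,
        new_layer_key: &PersistentLayerKey,
    ) -> bool {
        let guard = tline.layers.read().await;
        if !guard.contains_key(new_layer_key) {
            return false; // no collision: write the new layer out as usual
        }
        let existing_generation = guard.get_from_key(new_layer_key).metadata().generation;
        drop(guard);
        // Within one generation the layer file name would collide on rename, and
        // bottom-most gc-compaction never adds data for an existing key range,
        // so keeping the already-present layer is safe.
        existing_generation == tline.generation
    }
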
From c53799044d0db98d6ddcbd0be4521910e287d371 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 2 Aug 2024 08:00:46 +0100 Subject: [PATCH 1325/1571] pageserver: refine how we delete timelines after shard split (#8436) ## Problem Previously, when we do a timeline deletion, shards will delete layers that belong to an ancestor. That is not a correctness issue, because when we delete a timeline, we're always deleting it from all shards, and destroying data for that timeline is clearly fine. However, there exists a race where one shard might start doing this deletion while another shard has not yet received the deletion request, and might try to access an ancestral layer. This creates ambiguity over the "all layers referenced by my index should always exist" invariant, which is important to detecting and reporting corruption. Now that we have a GC mode for clearing up ancestral layers, we can rely on that to clean up such layers, and avoid deleting them right away. This makes things easier to reason about: there are now no cases where a shard will delete a layer that belongs to a ShardIndex other than itself. ## Summary of changes - Modify behavior of RemoteTimelineClient::delete_all - Add `test_scrubber_physical_gc_timeline_deletion` to exercise this case - Tweak AWS SDK config in the scrubber to enable retries. Motivated by seeing the test for this feature encounter some transient "service error" S3 errors (which are probably nothing to do with the changes in this PR) --- .../src/tenant/remote_timeline_client.rs | 12 +++ storage_scrubber/src/lib.rs | 8 ++ test_runner/regress/test_storage_scrubber.py | 78 ++++++++++++++++++- 3 files changed, 97 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index fed666ca45..9e021c7e35 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1378,6 +1378,18 @@ impl RemoteTimelineClient { .dirty .layer_metadata .drain() + .filter(|(_file_name, meta)| { + // Filter out layers that belonged to an ancestor shard. Since we are deleting the whole timeline from + // all shards anyway, we _could_ delete these, but + // - it creates a potential race if other shards are still + // using the layers while this shard deletes them. + // - it means that if we rolled back the shard split, the ancestor shards would be in a state where + // these timelines are present but corrupt (their index exists but some layers don't) + // + // These layers will eventually be cleaned up by the scrubber when it does physical GC. 
+ meta.shard.shard_number == self.tenant_shard_id.shard_number + && meta.shard.shard_count == self.tenant_shard_id.shard_count + }) .map(|(file_name, meta)| { remote_layer_path( &self.tenant_shard_id.tenant_id, diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index 152319b731..1fc94cc174 100644 --- a/storage_scrubber/src/lib.rs +++ b/storage_scrubber/src/lib.rs @@ -16,6 +16,7 @@ use std::sync::Arc; use std::time::Duration; use anyhow::{anyhow, Context}; +use aws_config::retry::{RetryConfigBuilder, RetryMode}; use aws_sdk_s3::config::Region; use aws_sdk_s3::error::DisplayErrorContext; use aws_sdk_s3::Client; @@ -314,8 +315,15 @@ pub fn init_logging(file_name: &str) -> Option { } async fn init_s3_client(bucket_region: Region) -> Client { + let mut retry_config_builder = RetryConfigBuilder::new(); + + retry_config_builder + .set_max_attempts(Some(3)) + .set_mode(Some(RetryMode::Adaptive)); + let config = aws_config::defaults(aws_config::BehaviorVersion::v2024_03_28()) .region(bucket_region) + .retry_config(retry_config_builder.build()) .load() .await; Client::new(&config) diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index fadf438788..e3f627b6a6 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -13,6 +13,7 @@ from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, ) +from fixtures.pg_version import PgVersion from fixtures.remote_storage import S3Storage, s3_storage from fixtures.utils import wait_until from fixtures.workload import Workload @@ -265,10 +266,85 @@ def test_scrubber_physical_gc_ancestors( # attach it, to drop any local state, then check it's still readable. workload.stop() drop_local_state(env, tenant_id) - workload.validate() +def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder): + """ + When we delete a timeline after a shard split, the child shards do not directly delete the + layers in the ancestor shards. They rely on the scrubber to clean up. + """ + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.num_pageservers = 2 + + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant( + tenant_id, + timeline_id, + shard_count=None, + conf={ + # Small layers and low compaction thresholds, so that when we split we can expect some to + # be dropped by child shards + "checkpoint_distance": f"{1024 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{1024 * 1024}", + "image_creation_threshold": "2", + "image_layer_creation_check_threshold": "0", + # Disable background compaction, we will do it explicitly + "compaction_period": "0s", + # No PITR, so that as soon as child shards generate an image layer, it covers ancestor deltas + # and makes them GC'able + "pitr_interval": "0s", + }, + ) + + # Make sure the original shard has some layers + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(100) + + new_shard_count = 4 + shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count) + + # Create a second timeline so that when we delete the first one, child shards still have some content in S3. + # + # This is a limitation of the scrubber: if a shard isn't in S3 (because it has no timelines), then the scrubber + # doesn't know about it, and won't perceive its ancestors as ancestors. 
+ other_timeline_id = TimelineId.generate() + env.storage_controller.pageserver_api().timeline_create( + PgVersion.NOT_SET, tenant_id, other_timeline_id + ) + + # Write after split so that child shards have some indices in S3 + workload.write_rows(100, upload=False) + for shard in shards: + ps = env.get_tenant_pageserver(shard) + log.info(f"Waiting for shard {shard} on pageserver {ps.id}") + ps.http_client().timeline_checkpoint( + shard, timeline_id, compact=False, wait_until_uploaded=True + ) + + # The timeline still exists in child shards and they reference its layers, so scrubbing + # now shouldn't delete anything. + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full") + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == 0 + assert gc_summary["ancestor_layers_deleted"] == 0 + + # Delete the timeline + env.storage_controller.pageserver_api().timeline_delete(tenant_id, timeline_id) + + # Subsequently doing physical GC should clean up the ancestor layers + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full") + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == 0 + assert gc_summary["ancestor_layers_deleted"] > 0 + + def test_scrubber_physical_gc_ancestors_split(neon_env_builder: NeonEnvBuilder): """ Exercise ancestor GC while a tenant is partly split: this test ensures that if we have some child shards From 2334fed7627441afde0c7dbf6109bb35074616dd Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 2 Aug 2024 09:37:44 +0100 Subject: [PATCH 1326/1571] storage_controller: start adding chaos hooks (#7946) Chaos injection bridges the gap between automated testing (where we do lots of different things with small, short-lived tenants), and staging (where we do many fewer things, but with larger, long-lived tenants). This PR adds a first type of chaos which isn't really very chaotic: it's live migration of tenants between healthy pageservers. This nevertheless provides continuous checks that things like clean, prompt shutdown of tenants works for realistically deployed pageservers with realistically large tenants. 
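Judging from the diff below, the injector is switched on by passing the new --chaos-interval flag to the storage controller binary; the value appears to be a human-readable duration (for example something like 10s), although the exact format is not spelled out in this patch. The scheduling pattern it uses is a cancellable interval loop, sketched here for orientation. The sketch is illustrative only and not part of the patch; inject_chaos_once is a stand-in for the real ChaosInjector::inject_chaos shown below.

    use std::time::Duration;
    use tokio_util::sync::CancellationToken;

    // Tick on a fixed period, bail out promptly when the shutdown token fires,
    // and run one round of chaos per tick.
    async fn run_chaos_loop(period: Duration, cancel: CancellationToken) {
        let mut ticker = tokio::time::interval(period);
        loop {
            tokio::select! {
                _ = ticker.tick() => {}
                _ = cancel.cancelled() => return,
            }
            // One iteration: pick random tenant shards that have a secondary
            // location, swap it with the attached location, then reconcile.
            inject_chaos_once().await;
        }
    }

    async fn inject_chaos_once() { /* see ChaosInjector::inject_chaos below */ }
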
--- Cargo.lock | 1 + storage_controller/Cargo.toml | 1 + storage_controller/src/main.rs | 28 ++++++++ storage_controller/src/service.rs | 2 + .../src/service/chaos_injector.rs | 71 +++++++++++++++++++ 5 files changed, 103 insertions(+) create mode 100644 storage_controller/src/service/chaos_injector.rs diff --git a/Cargo.lock b/Cargo.lock index 2677699702..764c0fbd30 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5703,6 +5703,7 @@ dependencies = [ "pageserver_client", "postgres_connection", "r2d2", + "rand 0.8.5", "reqwest 0.12.4", "routerify", "scopeguard", diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index d14b235046..ecaac04915 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -32,6 +32,7 @@ once_cell.workspace = true pageserver_api.workspace = true pageserver_client.workspace = true postgres_connection.workspace = true +rand.workspace = true reqwest = { workspace = true, features = ["stream"] } routerify.workspace = true serde.workspace = true diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index adbf5c6496..2799f21fdc 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -9,12 +9,14 @@ use std::time::Duration; use storage_controller::http::make_router; use storage_controller::metrics::preinitialize_metrics; use storage_controller::persistence::Persistence; +use storage_controller::service::chaos_injector::ChaosInjector; use storage_controller::service::{ Config, Service, MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, }; use tokio::signal::unix::SignalKind; use tokio_util::sync::CancellationToken; +use tracing::Instrument; use utils::auth::{JwtAuth, SwappableJwtAuth}; use utils::logging::{self, LogFormat}; @@ -86,6 +88,10 @@ struct Cli { // TODO: make `cfg(feature = "testing")` #[arg(long)] neon_local_repo_dir: Option, + + /// Chaos testing + #[arg(long)] + chaos_interval: Option, } enum StrictMode { @@ -309,6 +315,22 @@ async fn async_main() -> anyhow::Result<()> { tracing::info!("Serving on {0}", args.listen); let server_task = tokio::task::spawn(server); + let chaos_task = args.chaos_interval.map(|interval| { + let service = service.clone(); + let cancel = CancellationToken::new(); + let cancel_bg = cancel.clone(); + ( + tokio::task::spawn( + async move { + let mut chaos_injector = ChaosInjector::new(service, interval.into()); + chaos_injector.run(cancel_bg).await + } + .instrument(tracing::info_span!("chaos_injector")), + ), + cancel, + ) + }); + // Wait until we receive a signal let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt())?; let mut sigquit = tokio::signal::unix::signal(SignalKind::quit())?; @@ -337,6 +359,12 @@ async fn async_main() -> anyhow::Result<()> { } } + // If we were injecting chaos, stop that so that we're not calling into Service while it shuts down + if let Some((chaos_jh, chaos_cancel)) = chaos_task { + chaos_cancel.cancel(); + chaos_jh.await.ok(); + } + service.shutdown().await; tracing::info!("Service shutdown complete"); diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index ea515f67da..6940bf2c64 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -84,6 +84,8 @@ use crate::{ }; use serde::{Deserialize, Serialize}; +pub mod chaos_injector; + // For operations that should be quick, like attaching a new tenant const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5); diff --git 
a/storage_controller/src/service/chaos_injector.rs b/storage_controller/src/service/chaos_injector.rs new file mode 100644 index 0000000000..99961d691c --- /dev/null +++ b/storage_controller/src/service/chaos_injector.rs @@ -0,0 +1,71 @@ +use std::{sync::Arc, time::Duration}; + +use rand::seq::SliceRandom; +use rand::thread_rng; +use tokio_util::sync::CancellationToken; + +use super::Service; + +pub struct ChaosInjector { + service: Arc, + interval: Duration, +} + +impl ChaosInjector { + pub fn new(service: Arc, interval: Duration) -> Self { + Self { service, interval } + } + + pub async fn run(&mut self, cancel: CancellationToken) { + let mut interval = tokio::time::interval(self.interval); + + loop { + tokio::select! { + _ = interval.tick() => {} + _ = cancel.cancelled() => { + tracing::info!("Shutting down"); + return; + } + } + + self.inject_chaos().await; + + tracing::info!("Chaos iteration..."); + } + } + + async fn inject_chaos(&mut self) { + // Pick some shards to interfere with + let batch_size = 128; + let mut inner = self.service.inner.write().unwrap(); + let (nodes, tenants, scheduler) = inner.parts_mut(); + let tenant_ids = tenants.keys().cloned().collect::>(); + let victims = tenant_ids.choose_multiple(&mut thread_rng(), batch_size); + + for victim in victims { + let shard = tenants + .get_mut(victim) + .expect("Held lock between choosing ID and this get"); + + // Pick a secondary to promote + let Some(new_location) = shard + .intent + .get_secondary() + .choose(&mut thread_rng()) + .cloned() + else { + tracing::info!("Skipping shard {victim}: no secondary location, can't migrate"); + continue; + }; + + let Some(old_location) = *shard.intent.get_attached() else { + tracing::info!("Skipping shard {victim}: currently has no attached location"); + continue; + }; + + shard.intent.demote_attached(scheduler, old_location); + shard.intent.promote_attached(scheduler, new_location); + self.service.maybe_reconcile_shard(shard, nodes); + } + } +} From 8c828c586ea473beb8a593411f54d1f677c1ddfa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 2 Aug 2024 13:07:12 +0200 Subject: [PATCH 1327/1571] Wait for completion of the upload queue in flush_frozen_layer (#8550) Makes `flush_frozen_layer` add a barrier to the upload queue and makes it wait for that barrier to be reached until it lets the flushing be completed. This gives us backpressure and ensures that writes can't build up in an unbounded fashion. Fixes #7317 --- compute_tools/Cargo.toml | 5 ++ compute_tools/src/compute.rs | 10 ++- control_plane/src/background_process.rs | 2 +- pageserver/src/tenant/timeline.rs | 20 ++++- test_runner/fixtures/neon_fixtures.py | 12 ++- test_runner/fixtures/pageserver/http.py | 2 + test_runner/regress/test_branching.py | 23 ++++-- test_runner/regress/test_remote_storage.py | 87 +++------------------- 8 files changed, 74 insertions(+), 87 deletions(-) diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 8ceb8f2ad2..8af0ed43ce 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -4,6 +4,11 @@ version = "0.1.0" edition.workspace = true license.workspace = true +[features] +default = [] +# Enables test specific features. 
+testing = [] + [dependencies] anyhow.workspace = true async-compression.workspace = true diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 91855d954d..5bd6897fe3 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -400,7 +400,15 @@ impl ComputeNode { pub fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> { let mut retry_period_ms = 500.0; let mut attempts = 0; - let max_attempts = 10; + const DEFAULT_ATTEMPTS: u16 = 10; + #[cfg(feature = "testing")] + let max_attempts = if let Ok(v) = env::var("NEON_COMPUTE_TESTING_BASEBACKUP_RETRIES") { + u16::from_str(&v).unwrap() + } else { + DEFAULT_ATTEMPTS + }; + #[cfg(not(feature = "testing"))] + let max_attempts = DEFAULT_ATTEMPTS; loop { let result = self.try_get_basebackup(compute_state, lsn); match result { diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index a272c306e7..bf8a27e550 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -289,7 +289,7 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command { fn fill_env_vars_prefixed_neon(mut cmd: &mut Command) -> &mut Command { for (var, val) in std::env::vars() { - if var.starts_with("NEON_PAGESERVER_") { + if var.starts_with("NEON_") { cmd = cmd.env(var, val); } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 37ebeded66..be72e15c19 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -143,7 +143,10 @@ use self::walreceiver::{WalReceiver, WalReceiverConf}; use super::{config::TenantConf, upload_queue::NotInitialized}; use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; -use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer}; +use super::{ + remote_timeline_client::RemoteTimelineClient, remote_timeline_client::WaitCompletionError, + storage_layer::ReadableLayer, +}; use super::{ secondary::heatmap::{HeatMapLayer, HeatMapTimeline}, GcError, @@ -4089,6 +4092,21 @@ impl Timeline { // release lock on 'layers' }; + // Backpressure mechanism: wait with continuation of the flush loop until we have uploaded all layer files. + // This makes us refuse ingest until the new layers have been persisted to the remote. + self.remote_client + .wait_completion() + .await + .map_err(|e| match e { + WaitCompletionError::UploadQueueShutDownOrStopped + | WaitCompletionError::NotInitialized( + NotInitialized::ShuttingDown | NotInitialized::Stopped, + ) => FlushLayerError::Cancelled, + WaitCompletionError::NotInitialized(NotInitialized::Uninitialized) => { + FlushLayerError::Other(anyhow!(e).into()) + } + })?; + // FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`, // a compaction can delete the file and then it won't be available for uploads any more. 
// We still schedule the upload, resulting in an error, but ideally we'd somehow avoid this diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b370a92e38..7289472de2 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1943,11 +1943,15 @@ class NeonCli(AbstractNeonCli): remote_ext_config: Optional[str] = None, pageserver_id: Optional[int] = None, allow_multiple=False, + basebackup_request_tries: Optional[int] = None, ) -> "subprocess.CompletedProcess[str]": args = [ "endpoint", "start", ] + extra_env_vars = {} + if basebackup_request_tries is not None: + extra_env_vars["NEON_COMPUTE_TESTING_BASEBACKUP_TRIES"] = str(basebackup_request_tries) if remote_ext_config is not None: args.extend(["--remote-ext-config", remote_ext_config]) @@ -1960,7 +1964,7 @@ class NeonCli(AbstractNeonCli): if allow_multiple: args.extend(["--allow-multiple"]) - res = self.raw_cli(args) + res = self.raw_cli(args, extra_env_vars) res.check_returncode() return res @@ -3812,6 +3816,7 @@ class Endpoint(PgProtocol, LogUtils): pageserver_id: Optional[int] = None, safekeepers: Optional[List[int]] = None, allow_multiple: bool = False, + basebackup_request_tries: Optional[int] = None, ) -> "Endpoint": """ Start the Postgres instance. @@ -3833,6 +3838,7 @@ class Endpoint(PgProtocol, LogUtils): remote_ext_config=remote_ext_config, pageserver_id=pageserver_id, allow_multiple=allow_multiple, + basebackup_request_tries=basebackup_request_tries, ) self._running.release(1) @@ -3979,6 +3985,7 @@ class Endpoint(PgProtocol, LogUtils): remote_ext_config: Optional[str] = None, pageserver_id: Optional[int] = None, allow_multiple=False, + basebackup_request_tries: Optional[int] = None, ) -> "Endpoint": """ Create an endpoint, apply config, and start Postgres. 
@@ -3999,6 +4006,7 @@ class Endpoint(PgProtocol, LogUtils): remote_ext_config=remote_ext_config, pageserver_id=pageserver_id, allow_multiple=allow_multiple, + basebackup_request_tries=basebackup_request_tries, ) log.info(f"Postgres startup took {time.time() - started_at} seconds") @@ -4042,6 +4050,7 @@ class EndpointFactory: config_lines: Optional[List[str]] = None, remote_ext_config: Optional[str] = None, pageserver_id: Optional[int] = None, + basebackup_request_tries: Optional[int] = None, ) -> Endpoint: ep = Endpoint( self.env, @@ -4060,6 +4069,7 @@ class EndpointFactory: lsn=lsn, remote_ext_config=remote_ext_config, pageserver_id=pageserver_id, + basebackup_request_tries=basebackup_request_tries, ) def create( diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index c6df6b5baf..192324f086 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -663,6 +663,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): force_image_layer_creation=False, wait_until_uploaded=False, compact: Optional[bool] = None, + **kwargs, ): self.is_testing_enabled_or_skip() query = {} @@ -680,6 +681,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint", params=query, + **kwargs, ) log.info(f"Got checkpoint request response code: {res.status_code}") self.verbose_error(res) diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index 190b624a54..fc74707639 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -18,7 +18,6 @@ from fixtures.pageserver.utils import wait_until_tenant_active from fixtures.utils import query_scalar from performance.test_perf_pgbench import get_scales_matrix from requests import RequestException -from requests.exceptions import RetryError # Test branch creation @@ -151,7 +150,7 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE env.pageserver.allowed_errors.extend( [ ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*", - ".*page_service_conn_main.*: query handler for 'basebackup .* is not active, state: Loading", + ".*page_service_conn_main.*: query handler for 'basebackup .* ERROR: Not found: Timeline", ] ) ps_http = env.pageserver.http_client() @@ -176,10 +175,12 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE env.neon_cli.map_branch(initial_branch, env.initial_tenant, env.initial_timeline) - with pytest.raises(RuntimeError, match="is not active, state: Loading"): - env.endpoints.create_start(initial_branch, tenant_id=env.initial_tenant) + with pytest.raises(RuntimeError, match="ERROR: Not found: Timeline"): + env.endpoints.create_start( + initial_branch, tenant_id=env.initial_tenant, basebackup_request_tries=2 + ) + ps_http.configure_failpoints(("before-upload-index-pausable", "off")) finally: - # FIXME: paused uploads bother shutdown env.pageserver.stop(immediate=True) t.join() @@ -193,8 +194,11 @@ def test_cannot_branch_from_non_uploaded_branch(neon_env_builder: NeonEnvBuilder env = neon_env_builder.init_configs() env.start() - env.pageserver.allowed_errors.append( - ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*" + env.pageserver.allowed_errors.extend( + [ + ".*request{method=POST path=/v1/tenant/.*/timeline 
request_id=.*}: request was dropped before completing.*", + ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: .*Cannot branch off the timeline that's not present in pageserver.*", + ] ) ps_http = env.pageserver.http_client() @@ -216,7 +220,10 @@ def test_cannot_branch_from_non_uploaded_branch(neon_env_builder: NeonEnvBuilder branch_id = TimelineId.generate() - with pytest.raises(RetryError, match="too many 503 error responses"): + with pytest.raises( + PageserverApiException, + match="Cannot branch off the timeline that's not present in pageserver", + ): ps_http.timeline_create( env.pg_version, env.initial_tenant, diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 09f941f582..2e5260ca78 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -12,7 +12,6 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, wait_for_last_flush_lsn, ) -from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient from fixtures.pageserver.utils import ( timeline_delete_wait_completed, @@ -313,6 +312,7 @@ def test_remote_storage_upload_queue_retries( def churn_while_failpoints_active(result): overwrite_data_and_wait_for_it_to_arrive_at_pageserver("c") + # this call will wait for the failpoints to be turned off client.timeline_checkpoint(tenant_id, timeline_id) client.timeline_compact(tenant_id, timeline_id) overwrite_data_and_wait_for_it_to_arrive_at_pageserver("d") @@ -332,8 +332,8 @@ def test_remote_storage_upload_queue_retries( # Exponential back-off in upload queue, so, gracious timeouts. wait_until(30, 1, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0)) - wait_until(30, 1, lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 2)) - wait_until(30, 1, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="delete"), 0)) + wait_until(30, 1, lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 1)) + wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0)) # unblock churn operations configure_storage_sync_failpoints("off") @@ -769,11 +769,11 @@ def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnv create_thread.join() -def test_compaction_waits_for_upload( +def test_paused_upload_stalls_checkpoint( neon_env_builder: NeonEnvBuilder, ): """ - This test forces a race between upload and compaction. + This test checks that checkpoints block on uploads to remote storage. """ neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) @@ -788,6 +788,10 @@ def test_compaction_waits_for_upload( } ) + env.pageserver.allowed_errors.append( + f".*PUT.* path=/v1/tenant/{env.initial_tenant}/timeline.* request was dropped before completing" + ) + tenant_id = env.initial_tenant timeline_id = env.initial_timeline @@ -808,76 +812,9 @@ def test_compaction_waits_for_upload( endpoint.safe_psql("CREATE TABLE foo AS SELECT x FROM generate_series(1, 10000) g(x)") wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) - client.timeline_checkpoint(tenant_id, timeline_id) - deltas_at_first = len(client.layer_map_info(tenant_id, timeline_id).delta_layers()) - assert ( - deltas_at_first == 2 - ), "are you fixing #5863? just add one more checkpoint after 'CREATE TABLE bar ...' statement." 
- - endpoint.safe_psql("CREATE TABLE bar AS SELECT x FROM generate_series(1, 10000) g(x)") - endpoint.safe_psql("UPDATE foo SET x = 0 WHERE x = 1") - wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) - - layers_before_last_checkpoint = client.layer_map_info(tenant_id, timeline_id).historic_by_name() - upload_stuck_layers = layers_before_last_checkpoint - layers_at_creation.historic_by_name() - - assert len(upload_stuck_layers) > 0 - - for name in upload_stuck_layers: - assert env.pageserver.layer_exists( - tenant_id, timeline_id, parse_layer_file_name(name) - ), "while uploads are stuck the layers should be present on disk" - - # now this will do the L0 => L1 compaction and want to remove - # upload_stuck_layers and the original initdb L0 - client.timeline_checkpoint(tenant_id, timeline_id) - - # as uploads are paused, the upload_stuck_layers should still be with us - for name in upload_stuck_layers: - assert env.pageserver.layer_exists( - tenant_id, timeline_id, parse_layer_file_name(name) - ), "uploads are stuck still over compaction" - - compacted_layers = client.layer_map_info(tenant_id, timeline_id).historic_by_name() - overlap = compacted_layers.intersection(upload_stuck_layers) - assert len(overlap) == 0, "none of the L0's should remain after L0 => L1 compaction" - assert ( - len(compacted_layers) == 1 - ), "there should be one L1 after L0 => L1 compaction (without #5863 being fixed)" - - def layer_deletes_completed(): - m = client.get_metric_value("pageserver_layer_completed_deletes_total") - if m is None: - return 0 - return int(m) - - # if initdb created an initial delta layer, it might already be gc'd - # because it was uploaded before the failpoint was enabled. however, the - # deletion is not guaranteed to be complete. - assert layer_deletes_completed() <= 1 - - client.configure_failpoints(("before-upload-layer-pausable", "off")) - - # Ensure that this actually terminates - wait_upload_queue_empty(client, tenant_id, timeline_id) - - def until_layer_deletes_completed(): - deletes = layer_deletes_completed() - log.info(f"layer_deletes: {deletes}") - # ensure that initdb delta layer AND the previously stuck are now deleted - assert deletes >= len(upload_stuck_layers) + 1 - - wait_until(10, 1, until_layer_deletes_completed) - - for name in upload_stuck_layers: - assert not env.pageserver.layer_exists( - tenant_id, timeline_id, parse_layer_file_name(name) - ), "l0 should now be removed because of L0 => L1 compaction and completed uploads" - - # We should not have hit the error handling path in uploads where a uploaded file is gone - assert not env.pageserver.log_contains( - "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more." - ) + with pytest.raises(ReadTimeout): + client.timeline_checkpoint(tenant_id, timeline_id, timeout=5) + client.configure_failpoints(("before-upload-layer-pausable", "off")) def wait_upload_queue_empty( From f3acfb2d80729ca7a2cfffdd4d924bd934101b06 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Fri, 2 Aug 2024 15:26:46 +0100 Subject: [PATCH 1328/1571] Improve safekeepers eviction rate limiting (#8456) This commit tries to fix regular load spikes on staging, caused by too many eviction and partial upload operations running at the same time. Usually it was hapenning after restart, for partial backup the load was delayed. 
- Add a semaphore for evictions (2 permits by default) - Rename `resident_since` to `evict_not_before` and smooth out the curve by using random duration - Use random duration in partial uploads as well related to https://github.com/neondatabase/neon/issues/6338 some discussion in https://neondb.slack.com/archives/C033RQ5SPDH/p1720601531744029 --- safekeeper/src/lib.rs | 2 ++ safekeeper/src/rate_limit.rs | 49 ++++++++++++++++++++++++++ safekeeper/src/timeline.rs | 3 +- safekeeper/src/timeline_eviction.rs | 6 ++-- safekeeper/src/timeline_manager.rs | 48 ++++++++++++++++++------- safekeeper/src/timelines_global_map.rs | 14 +++++--- safekeeper/src/wal_backup_partial.rs | 39 +++++++------------- 7 files changed, 112 insertions(+), 49 deletions(-) create mode 100644 safekeeper/src/rate_limit.rs diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 8f2920ada3..56d61e8287 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -21,6 +21,7 @@ pub mod json_ctrl; pub mod metrics; pub mod patch_control_file; pub mod pull_timeline; +pub mod rate_limit; pub mod receive_wal; pub mod recovery; pub mod remove_wal; @@ -53,6 +54,7 @@ pub mod defaults { pub const DEFAULT_PARTIAL_BACKUP_TIMEOUT: &str = "15m"; pub const DEFAULT_CONTROL_FILE_SAVE_INTERVAL: &str = "300s"; pub const DEFAULT_PARTIAL_BACKUP_CONCURRENCY: &str = "5"; + pub const DEFAULT_EVICTION_CONCURRENCY: usize = 2; // By default, our required residency before eviction is the same as the period that passes // before uploading a partial segment, so that in normal operation the eviction can happen diff --git a/safekeeper/src/rate_limit.rs b/safekeeper/src/rate_limit.rs new file mode 100644 index 0000000000..72373b5786 --- /dev/null +++ b/safekeeper/src/rate_limit.rs @@ -0,0 +1,49 @@ +use std::sync::Arc; + +use rand::Rng; + +use crate::metrics::MISC_OPERATION_SECONDS; + +/// Global rate limiter for background tasks. +#[derive(Clone)] +pub struct RateLimiter { + partial_backup: Arc, + eviction: Arc, +} + +impl RateLimiter { + /// Create a new rate limiter. + /// - `partial_backup_max`: maximum number of concurrent partial backups. + /// - `eviction_max`: maximum number of concurrent timeline evictions. + pub fn new(partial_backup_max: usize, eviction_max: usize) -> Self { + Self { + partial_backup: Arc::new(tokio::sync::Semaphore::new(partial_backup_max)), + eviction: Arc::new(tokio::sync::Semaphore::new(eviction_max)), + } + } + + /// Get a permit for partial backup. This will block if the maximum number of concurrent + /// partial backups is reached. + pub async fn acquire_partial_backup(&self) -> tokio::sync::OwnedSemaphorePermit { + let _timer = MISC_OPERATION_SECONDS + .with_label_values(&["partial_permit_acquire"]) + .start_timer(); + self.partial_backup + .clone() + .acquire_owned() + .await + .expect("semaphore is closed") + } + + /// Try to get a permit for timeline eviction. This will return None if the maximum number of + /// concurrent timeline evictions is reached. + pub fn try_acquire_eviction(&self) -> Option { + self.eviction.clone().try_acquire_owned().ok() + } +} + +/// Generate a random duration that is a fraction of the given duration. 
+pub fn rand_duration(duration: &std::time::Duration) -> std::time::Duration { + let randf64 = rand::thread_rng().gen_range(0.0..1.0); + duration.mul_f64(randf64) +} diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 132e5ec32f..57935d879f 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -25,6 +25,7 @@ use utils::{ use storage_broker::proto::SafekeeperTimelineInfo; use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; +use crate::rate_limit::RateLimiter; use crate::receive_wal::WalReceivers; use crate::safekeeper::{ AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, ServerInfo, Term, TermLsn, @@ -36,7 +37,7 @@ use crate::timeline_guard::ResidenceGuard; use crate::timeline_manager::{AtomicStatus, ManagerCtl}; use crate::timelines_set::TimelinesSet; use crate::wal_backup::{self}; -use crate::wal_backup_partial::{PartialRemoteSegment, RateLimiter}; +use crate::wal_backup_partial::PartialRemoteSegment; use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION}; use crate::metrics::{FullTimelineInfo, WalStorageMetrics, MISC_OPERATION_SECONDS}; diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs index 7947d83eb4..ae6f3f4b7e 100644 --- a/safekeeper/src/timeline_eviction.rs +++ b/safekeeper/src/timeline_eviction.rs @@ -5,7 +5,6 @@ use anyhow::Context; use camino::Utf8PathBuf; use remote_storage::RemotePath; -use std::time::Instant; use tokio::{ fs::File, io::{AsyncRead, AsyncWriteExt}, @@ -15,6 +14,7 @@ use utils::crashsafe::durable_rename; use crate::{ metrics::{EvictionEvent, EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED}, + rate_limit::rand_duration, timeline_manager::{Manager, StateSnapshot}, wal_backup, wal_backup_partial::{self, PartialRemoteSegment}, @@ -50,7 +50,6 @@ impl Manager { .flush_lsn .segment_number(self.wal_seg_size) == self.last_removed_segno + 1 - && self.resident_since.elapsed() >= self.conf.eviction_min_resident } /// Evict the timeline to remote storage. @@ -112,7 +111,8 @@ impl Manager { return; } - self.resident_since = Instant::now(); + self.evict_not_before = + tokio::time::Instant::now() + rand_duration(&self.conf.eviction_min_resident); info!("successfully restored evicted timeline"); } diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index debf8c824f..c224dcd398 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -23,6 +23,7 @@ use utils::lsn::Lsn; use crate::{ control_file::{FileStorage, Storage}, metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL, MISC_OPERATION_SECONDS}, + rate_limit::{rand_duration, RateLimiter}, recovery::recovery_main, remove_wal::calc_horizon_lsn, safekeeper::Term, @@ -32,7 +33,7 @@ use crate::{ timeline_guard::{AccessService, GuardId, ResidenceGuard}, timelines_set::{TimelineSetGuard, TimelinesSet}, wal_backup::{self, WalBackupTaskHandle}, - wal_backup_partial::{self, PartialRemoteSegment, RateLimiter}, + wal_backup_partial::{self, PartialRemoteSegment}, SafeKeeperConf, }; @@ -185,11 +186,11 @@ pub(crate) struct Manager { // misc pub(crate) access_service: AccessService, - pub(crate) partial_backup_rate_limiter: RateLimiter, + pub(crate) global_rate_limiter: RateLimiter, // Anti-flapping state: we evict timelines eagerly if they are inactive, but should not // evict them if they go inactive very soon after being restored. 
- pub(crate) resident_since: std::time::Instant, + pub(crate) evict_not_before: Instant, } /// This task gets spawned alongside each timeline and is responsible for managing the timeline's @@ -202,7 +203,7 @@ pub async fn main_task( broker_active_set: Arc, manager_tx: tokio::sync::mpsc::UnboundedSender, mut manager_rx: tokio::sync::mpsc::UnboundedReceiver, - partial_backup_rate_limiter: RateLimiter, + global_rate_limiter: RateLimiter, ) { tli.set_status(Status::Started); @@ -220,7 +221,7 @@ pub async fn main_task( conf, broker_active_set, manager_tx, - partial_backup_rate_limiter, + global_rate_limiter, ) .await; @@ -254,9 +255,29 @@ pub async fn main_task( mgr.set_status(Status::UpdatePartialBackup); mgr.update_partial_backup(&state_snapshot).await; - if mgr.conf.enable_offload && mgr.ready_for_eviction(&next_event, &state_snapshot) { - mgr.set_status(Status::EvictTimeline); - mgr.evict_timeline().await; + let now = Instant::now(); + if mgr.evict_not_before > now { + // we should wait until evict_not_before + update_next_event(&mut next_event, mgr.evict_not_before); + } + + if mgr.conf.enable_offload + && mgr.evict_not_before <= now + && mgr.ready_for_eviction(&next_event, &state_snapshot) + { + // check rate limiter and evict timeline if possible + match mgr.global_rate_limiter.try_acquire_eviction() { + Some(_permit) => { + mgr.set_status(Status::EvictTimeline); + mgr.evict_timeline().await; + } + None => { + // we can't evict timeline now, will try again later + mgr.evict_not_before = + Instant::now() + rand_duration(&mgr.conf.eviction_min_resident); + update_next_event(&mut next_event, mgr.evict_not_before); + } + } } } @@ -334,11 +355,10 @@ impl Manager { conf: SafeKeeperConf, broker_active_set: Arc, manager_tx: tokio::sync::mpsc::UnboundedSender, - partial_backup_rate_limiter: RateLimiter, + global_rate_limiter: RateLimiter, ) -> Manager { let (is_offloaded, partial_backup_uploaded) = tli.bootstrap_mgr().await; Manager { - conf, wal_seg_size: tli.get_wal_seg_size().await, walsenders: tli.get_walsenders().clone(), state_version_rx: tli.get_state_version_rx(), @@ -353,8 +373,10 @@ impl Manager { partial_backup_uploaded, access_service: AccessService::new(manager_tx), tli, - partial_backup_rate_limiter, - resident_since: std::time::Instant::now(), + global_rate_limiter, + // to smooth out evictions spike after restart + evict_not_before: Instant::now() + rand_duration(&conf.eviction_min_resident), + conf, } } @@ -541,7 +563,7 @@ impl Manager { self.partial_backup_task = Some(tokio::spawn(wal_backup_partial::main_task( self.wal_resident_timeline(), self.conf.clone(), - self.partial_backup_rate_limiter.clone(), + self.global_rate_limiter.clone(), ))); } diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index f57da5c7cb..6662e18817 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -2,10 +2,11 @@ //! All timelines should always be present in this map, this is done by loading them //! all from the disk on startup and keeping them in memory. 
+use crate::defaults::DEFAULT_EVICTION_CONCURRENCY; +use crate::rate_limit::RateLimiter; use crate::safekeeper::ServerInfo; use crate::timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError}; use crate::timelines_set::TimelinesSet; -use crate::wal_backup_partial::RateLimiter; use crate::SafeKeeperConf; use anyhow::{bail, Context, Result}; use camino::Utf8PathBuf; @@ -31,7 +32,7 @@ struct GlobalTimelinesState { conf: Option, broker_active_set: Arc, load_lock: Arc>, - partial_backup_rate_limiter: RateLimiter, + global_rate_limiter: RateLimiter, } // Used to prevent concurrent timeline loading. @@ -50,7 +51,7 @@ impl GlobalTimelinesState { ( self.get_conf().clone(), self.broker_active_set.clone(), - self.partial_backup_rate_limiter.clone(), + self.global_rate_limiter.clone(), ) } @@ -85,7 +86,7 @@ static TIMELINES_STATE: Lazy> = Lazy::new(|| { conf: None, broker_active_set: Arc::new(TimelinesSet::default()), load_lock: Arc::new(tokio::sync::Mutex::new(TimelineLoadLock)), - partial_backup_rate_limiter: RateLimiter::new(1), + global_rate_limiter: RateLimiter::new(1, 1), }) }); @@ -99,7 +100,10 @@ impl GlobalTimelines { // lock, so use explicit block let tenants_dir = { let mut state = TIMELINES_STATE.lock().unwrap(); - state.partial_backup_rate_limiter = RateLimiter::new(conf.partial_backup_concurrency); + state.global_rate_limiter = RateLimiter::new( + conf.partial_backup_concurrency, + DEFAULT_EVICTION_CONCURRENCY, + ); state.conf = Some(conf); // Iterate through all directories and load tenants for all directories diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index b1efa9749f..52765b0e98 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -18,8 +18,6 @@ //! This way control file stores information about all potentially existing //! remote partial segments and can clean them up after uploading a newer version. -use std::sync::Arc; - use camino::Utf8PathBuf; use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; use remote_storage::RemotePath; @@ -30,6 +28,7 @@ use utils::lsn::Lsn; use crate::{ metrics::{MISC_OPERATION_SECONDS, PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS}, + rate_limit::{rand_duration, RateLimiter}, safekeeper::Term, timeline::WalResidentTimeline, timeline_manager::StateSnapshot, @@ -37,30 +36,6 @@ use crate::{ SafeKeeperConf, }; -#[derive(Clone)] -pub struct RateLimiter { - semaphore: Arc, -} - -impl RateLimiter { - pub fn new(permits: usize) -> Self { - Self { - semaphore: Arc::new(tokio::sync::Semaphore::new(permits)), - } - } - - async fn acquire_owned(&self) -> tokio::sync::OwnedSemaphorePermit { - let _timer = MISC_OPERATION_SECONDS - .with_label_values(&["partial_permit_acquire"]) - .start_timer(); - self.semaphore - .clone() - .acquire_owned() - .await - .expect("semaphore is closed") - } -} - #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub enum UploadStatus { /// Upload is in progress. This status should be used only for garbage collection, @@ -352,6 +327,7 @@ pub async fn main_task( ) -> Option { debug!("started"); let await_duration = conf.partial_backup_timeout; + let mut first_iteration = true; let (_, persistent_state) = tli.get_state().await; let mut commit_lsn_rx = tli.get_commit_lsn_watch_rx(); @@ -419,6 +395,15 @@ pub async fn main_task( } } + // smoothing the load after restart, by sleeping for a random time. 
+ // if this is not the first iteration, we will wait for the full await_duration + let await_duration = if first_iteration { + first_iteration = false; + rand_duration(&await_duration) + } else { + await_duration + }; + // fixing the segno and waiting some time to prevent reuploading the same segment too often let pending_segno = backup.segno(flush_lsn_rx.borrow().lsn); let timeout = tokio::time::sleep(await_duration); @@ -454,7 +439,7 @@ pub async fn main_task( } // limit concurrent uploads - let _upload_permit = limiter.acquire_owned().await; + let _upload_permit = limiter.acquire_partial_backup().await; let prepared = backup.prepare_upload().await; if let Some(seg) = &uploaded_segment { From 0a667bc8ef729b23fa121914e136b304574191a1 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 2 Aug 2024 18:28:23 +0100 Subject: [PATCH 1329/1571] tests: add test_historic_storage_formats (#8423) ## Problem Currently, our backward compatibility tests only look one release back. That means, for example, that when we switch on image layer compression by default, we'll test reading of uncompressed layers for one release, and then stop doing it. When we make an index_part.json format change, we'll test against the old format for a week, then stop (unless we write separate unit tests for each old format). The reality in the field is that data in old formats will continue to exist for weeks/months/years. When we make major format changes, we should retain examples of the old format data, and continuously verify that the latest code can still read them. This test uses contents from a new path in the public S3 bucket, `compatibility-data-snapshots/`. It is populated by hand. The first important artifact is one from before we switch on compression, so that we will keep testing reads of uncompressed data. We will generate more artifacts ahead of other key changes, like when we update remote storage format for archival timelines. Closes: https://github.com/neondatabase/cloud/issues/15576 --- test_runner/regress/test_compatibility.py | 142 ++++++++++++++++++++-- 1 file changed, 135 insertions(+), 7 deletions(-) diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 411b20b2c4..137b0e931d 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -3,18 +3,15 @@ import re import shutil import subprocess import tempfile +from dataclasses import dataclass from pathlib import Path from typing import List, Optional import pytest import toml -from fixtures.common_types import Lsn +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import ( - NeonEnv, - NeonEnvBuilder, - PgBin, -) +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import ( timeline_delete_wait_completed, @@ -22,7 +19,8 @@ from fixtures.pageserver.utils import ( wait_for_upload, ) from fixtures.pg_version import PgVersion -from fixtures.remote_storage import RemoteStorageKind +from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage +from fixtures.workload import Workload # # A test suite that help to prevent unintentionally breaking backward or forward compatibility between Neon releases. 
@@ -409,3 +407,133 @@ def dump_differs( break return differs + + +@dataclass +class HistoricDataSet: + name: str + tenant_id: TenantId + pg_version: PgVersion + url: str + + def __str__(self): + return self.name + + +HISTORIC_DATA_SETS = [ + # From before we enabled image layer compression. + # - IndexPart::LATEST_VERSION 7 + # - STORAGE_FORMAT_VERSION 3 + HistoricDataSet( + "2024-07-18", + TenantId("17bf64a53509714687664b3a84e9b3ba"), + PgVersion.V16, + "https://neon-github-public-dev.s3.eu-central-1.amazonaws.com/compatibility-data-snapshots/2024-07-18-pgv16.tar.zst", + ), +] + + +@pytest.mark.parametrize("dataset", HISTORIC_DATA_SETS) +@pytest.mark.xdist_group("compatibility") +def test_historic_storage_formats( + neon_env_builder: NeonEnvBuilder, + test_output_dir: Path, + pg_version: PgVersion, + dataset: HistoricDataSet, +): + """ + This test is like test_backward_compatibility, but it looks back further to examples of our storage format from long ago. + """ + + ARTIFACT_CACHE_DIR = "./artifact_cache" + + import tarfile + from contextlib import closing + + import requests + import zstandard + + artifact_unpack_path = ARTIFACT_CACHE_DIR / Path("unpacked") / Path(dataset.name) + + # Note: we assume that when running across a matrix of PG versions, the matrix includes all the versions needed by + # HISTORIC_DATA_SETS. If we ever remove a PG version from the matrix, then historic datasets built using that version + # will no longer be covered by this test. + if pg_version != dataset.pg_version: + pytest.skip(f"Dataset {dataset} is for different PG version, skipping") + + with closing(requests.get(dataset.url, stream=True)) as r: + unzstd = zstandard.ZstdDecompressor() + with unzstd.stream_reader(r.raw) as stream: + with tarfile.open(mode="r|", fileobj=stream) as tf: + tf.extractall(artifact_unpack_path) + + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.pg_version = dataset.pg_version + env = neon_env_builder.init_configs() + env.start() + assert isinstance(env.pageserver_remote_storage, S3Storage) + + # Link artifact data into test's remote storage. We don't want the whole repo dir, just the remote storage part: we are not testing + # compat of local disk data across releases (test_backward_compat does that), we're testing really long-lived data in S3 like layer files and indices. + # + # The code generating the snapshot uses local_fs, but this test uses S3Storage, so we are copying a tree of files into a bucket. 
We use + # S3Storage so that the scrubber can run (the scrubber doesn't speak local_fs) + artifact_pageserver_path = ( + artifact_unpack_path / Path("repo") / Path("local_fs_remote_storage") / Path("pageserver") + ) + for root, _dirs, files in os.walk(artifact_pageserver_path): + for file in files: + local_path = os.path.join(root, file) + remote_key = ( + env.pageserver_remote_storage.prefix_in_bucket + + str(local_path)[len(str(artifact_pageserver_path)) :] + ) + log.info(f"Uploading {local_path} -> {remote_key}") + env.pageserver_remote_storage.client.upload_file( + local_path, env.pageserver_remote_storage.bucket_name, remote_key + ) + + # Check the scrubber handles this old data correctly (can read it and doesn't consider it corrupt) + # + # Do this _before_ importing to the pageserver, as that import may start writing immediately + metadata_summary = env.storage_scrubber.scan_metadata() + assert metadata_summary["tenant_count"] >= 1 + assert metadata_summary["timeline_count"] >= 1 + assert not metadata_summary["with_errors"] + assert not metadata_summary["with_warnings"] + + env.neon_cli.import_tenant(dataset.tenant_id) + + # Discover timelines + timelines = env.pageserver.http_client().timeline_list(dataset.tenant_id) + # All our artifacts should contain at least one timeline + assert len(timelines) > 0 + + # TODO: ensure that the snapshots we're importing contain a sensible variety of content, at the very + # least they should include a mixture of deltas and image layers. Preferably they should also + # contain some "exotic" stuff like aux files from logical replication. + + # Check we can start an endpoint and read the SQL that the artifact is meant to contain + reference_sql_dump = artifact_unpack_path / Path("dump.sql") + ep = env.endpoints.create_start("main", tenant_id=dataset.tenant_id) + pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version) + pg_bin.run_capture( + ["pg_dumpall", f"--dbname={ep.connstr()}", f"--file={test_output_dir / 'dump.sql'}"] + ) + assert not dump_differs( + reference_sql_dump, + test_output_dir / "dump.sql", + test_output_dir / "dump.filediff", + ) + ep.stop() + + # Check we can also do writes to the database + existing_timeline_id = TimelineId(timelines[0]["timeline_id"]) + workload = Workload(env, dataset.tenant_id, existing_timeline_id) + workload.init() + workload.write_rows(100) + + # Check that compaction works + env.pageserver.http_client().timeline_compact( + dataset.tenant_id, existing_timeline_id, force_image_layer_creation=True + ) From 6814bdd30b6b79500b2e3003b5a32a2c4fcf98e3 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." Date: Fri, 2 Aug 2024 19:52:04 -0400 Subject: [PATCH 1330/1571] fix(pageserver): deadlock in gc-compaction (#8590) We need both compaction and gc lock for gc-compaction. The lock order should be the same everywhere, otherwise there could be a deadlock where A waits for B and B waits for A. We also had a double-lock issue. The compaction lock gets acquired in the outer `compact` function. Note that the unit tests directly call `compact_with_gc`, and therefore not triggering the issue. ## Summary of changes Ensure all places acquire compact lock and then gc lock. Remove an extra compact lock acqusition. 
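The fix boils down to a single invariant: any code path that needs both locks must take the compaction lock first and the gc lock second. Below is a minimal, self-contained illustration of that ordering with a toy lock holder, not the pageserver's actual `Timeline` type.

```rust
// Sketch of the lock-ordering invariant. Deadlock is only possible if one
// path takes gc before compaction while another takes compaction before gc,
// so every path uses the same order.
use std::sync::Arc;
use tokio::sync::Mutex;

struct Locks {
    compaction: Mutex<()>,
    gc: Mutex<()>,
}

async fn gc_compaction(locks: Arc<Locks>) {
    // Same order everywhere: compaction, then gc. In the real code the outer
    // `compact` call already holds the compaction lock, which is why the
    // duplicate acquisition inside `compact_with_gc` was removed.
    let _compaction_guard = locks.compaction.lock().await;
    let _gc_guard = locks.gc.lock().await;
    // ... compact and update the layer map while both are held ...
    // Guards drop at end of scope, releasing gc before compaction.
}
```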
--------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline/compaction.rs | 28 ++++++++++++-------- pageserver/src/tenant/timeline/delete.rs | 20 ++++++++++---- 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 61d662d25d..421f718ad6 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1646,19 +1646,23 @@ impl Timeline { use std::collections::BTreeSet; // Block other compaction/GC tasks from running for now. GC-compaction could run along - // with legacy compaction tasks in the future. + // with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc. + // Note that we already acquired the compaction lock when the outer `compact` function gets called. - let _compaction_lock = tokio::select! { - guard = self.compaction_lock.lock() => guard, - // TODO: refactor to CompactionError to correctly pass cancelled error - _ = cancel.cancelled() => return Err(anyhow!("cancelled")), + let gc_lock = async { + tokio::select! { + guard = self.gc_lock.lock() => Ok(guard), + // TODO: refactor to CompactionError to correctly pass cancelled error + _ = cancel.cancelled() => Err(anyhow!("cancelled")), + } }; - let _gc = tokio::select! { - guard = self.gc_lock.lock() => guard, - // TODO: refactor to CompactionError to correctly pass cancelled error - _ = cancel.cancelled() => return Err(anyhow!("cancelled")), - }; + let gc_lock = crate::timed( + gc_lock, + "acquires gc lock", + std::time::Duration::from_secs(5), + ) + .await?; info!("running enhanced gc bottom-most compaction"); @@ -2063,9 +2067,11 @@ impl Timeline { let mut guard = self.layers.write().await; guard.finish_gc_compaction(&layer_selection, &compact_to, &self.metrics) }; - self.remote_client .schedule_compaction_update(&layer_selection, &compact_to)?; + + drop(gc_lock); + Ok(()) } } diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 9b2403f899..05178c38b4 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -63,10 +63,19 @@ pub(super) async fn delete_local_timeline_directory( tenant_shard_id: TenantShardId, timeline: &Timeline, ) -> anyhow::Result<()> { - let guards = async { tokio::join!(timeline.gc_lock.lock(), timeline.compaction_lock.lock()) }; - let guards = crate::timed( - guards, - "acquire gc and compaction locks", + // Always ensure the lock order is compaction -> gc. + let compaction_lock = timeline.compaction_lock.lock(); + let compaction_lock = crate::timed( + compaction_lock, + "acquires compaction lock", + std::time::Duration::from_secs(5), + ) + .await; + + let gc_lock = timeline.gc_lock.lock(); + let gc_lock = crate::timed( + gc_lock, + "acquires gc lock", std::time::Duration::from_secs(5), ) .await; @@ -107,7 +116,8 @@ pub(super) async fn delete_local_timeline_directory( .context("fsync_pre_mark_remove")?; info!("finished deleting layer files, releasing locks"); - drop(guards); + drop(gc_lock); + drop(compaction_lock); fail::fail_point!("timeline-delete-after-rm", |_| { Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))? From 1dc496a2c9a8dd8a9a7aa4f08a08555b9005e64c Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." 
Date: Mon, 5 Aug 2024 13:55:36 +0800 Subject: [PATCH 1331/1571] feat(pageserver): support auto split layers based on size (#8574) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit part of https://github.com/neondatabase/neon/issues/8002 ## Summary of changes Add a `SplitImageWriter` that automatically splits image layer based on estimated target image layer size. This does not consider compression and we might need a better metrics. --------- Signed-off-by: Alex Chi Z Co-authored-by: Arpad Müller --- pageserver/src/tenant/storage_layer.rs | 3 + .../src/tenant/storage_layer/image_layer.rs | 58 ++++- .../src/tenant/storage_layer/split_writer.rs | 244 ++++++++++++++++++ 3 files changed, 303 insertions(+), 2 deletions(-) create mode 100644 pageserver/src/tenant/storage_layer/split_writer.rs diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 4fd110359b..59d3e1ce09 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -8,6 +8,9 @@ mod layer_desc; mod layer_name; pub mod merge_iterator; +#[cfg(test)] +pub mod split_writer; + use crate::context::{AccessStatsBehavior, RequestContext}; use crate::repository::Value; use crate::walrecord::NeonWalRecord; diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 08db27514a..aa308ba3c1 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -742,8 +742,14 @@ struct ImageLayerWriterInner { // where we have chosen their compressed form uncompressed_bytes_chosen: u64, + // Number of keys in the layer. + num_keys: usize, + blob_writer: BlobWriter, tree: DiskBtreeBuilder, + + #[cfg_attr(not(feature = "testing"), allow(dead_code))] + last_written_key: Key, } impl ImageLayerWriterInner { @@ -800,6 +806,8 @@ impl ImageLayerWriterInner { uncompressed_bytes: 0, uncompressed_bytes_eligible: 0, uncompressed_bytes_chosen: 0, + num_keys: 0, + last_written_key: Key::MIN, }; Ok(writer) @@ -820,6 +828,7 @@ impl ImageLayerWriterInner { let compression = self.conf.image_compression; let uncompressed_len = img.len() as u64; self.uncompressed_bytes += uncompressed_len; + self.num_keys += 1; let (_img, res) = self .blob_writer .write_blob_maybe_compressed(img, ctx, compression) @@ -839,6 +848,11 @@ impl ImageLayerWriterInner { key.write_to_byte_slice(&mut keybuf); self.tree.append(&keybuf, off)?; + #[cfg(feature = "testing")] + { + self.last_written_key = key; + } + Ok(()) } @@ -849,6 +863,7 @@ impl ImageLayerWriterInner { self, timeline: &Arc, ctx: &RequestContext, + end_key: Option, ) -> anyhow::Result { let index_start_blk = ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; @@ -899,11 +914,23 @@ impl ImageLayerWriterInner { let desc = PersistentLayerDesc::new_img( self.tenant_shard_id, self.timeline_id, - self.key_range.clone(), + if let Some(end_key) = end_key { + self.key_range.start..end_key + } else { + self.key_range.clone() + }, self.lsn, metadata.len(), ); + #[cfg(feature = "testing")] + if let Some(end_key) = end_key { + assert!( + self.last_written_key < end_key, + "written key violates end_key range" + ); + } + // Note: Because we open the file in write-only mode, we cannot // reuse the same VirtualFile for reading later. That's why we don't // set inner.file here. The first read will have to re-open it. 
@@ -980,6 +1007,18 @@ impl ImageLayerWriter { self.inner.as_mut().unwrap().put_image(key, img, ctx).await } + #[cfg(test)] + /// Estimated size of the image layer. + pub(crate) fn estimated_size(&self) -> u64 { + let inner = self.inner.as_ref().unwrap(); + inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64 + } + + #[cfg(test)] + pub(crate) fn num_keys(&self) -> usize { + self.inner.as_ref().unwrap().num_keys + } + /// /// Finish writing the image layer. /// @@ -988,7 +1027,22 @@ impl ImageLayerWriter { timeline: &Arc, ctx: &RequestContext, ) -> anyhow::Result { - self.inner.take().unwrap().finish(timeline, ctx).await + self.inner.take().unwrap().finish(timeline, ctx, None).await + } + + #[cfg(test)] + /// Finish writing the image layer with an end key, used in [`super::split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive. + pub(super) async fn finish_with_end_key( + mut self, + timeline: &Arc, + end_key: Key, + ctx: &RequestContext, + ) -> anyhow::Result { + self.inner + .take() + .unwrap() + .finish(timeline, ctx, Some(end_key)) + .await } } diff --git a/pageserver/src/tenant/storage_layer/split_writer.rs b/pageserver/src/tenant/storage_layer/split_writer.rs new file mode 100644 index 0000000000..a4091a890c --- /dev/null +++ b/pageserver/src/tenant/storage_layer/split_writer.rs @@ -0,0 +1,244 @@ +use std::sync::Arc; + +use bytes::Bytes; +use pageserver_api::key::{Key, KEY_SIZE}; +use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId}; + +use crate::{config::PageServerConf, context::RequestContext, tenant::Timeline}; + +use super::{ImageLayerWriter, ResidentLayer}; + +/// An image writer that takes images and produces multiple image layers. The interface does not +/// guarantee atomicity (i.e., if the image layer generation fails, there might be leftover files +/// to be cleaned up) +#[must_use] +pub struct SplitImageLayerWriter { + inner: ImageLayerWriter, + target_layer_size: u64, + generated_layers: Vec, + conf: &'static PageServerConf, + timeline_id: TimelineId, + tenant_shard_id: TenantShardId, + lsn: Lsn, +} + +impl SplitImageLayerWriter { + pub async fn new( + conf: &'static PageServerConf, + timeline_id: TimelineId, + tenant_shard_id: TenantShardId, + start_key: Key, + lsn: Lsn, + target_layer_size: u64, + ctx: &RequestContext, + ) -> anyhow::Result { + Ok(Self { + target_layer_size, + inner: ImageLayerWriter::new( + conf, + timeline_id, + tenant_shard_id, + &(start_key..Key::MAX), + lsn, + ctx, + ) + .await?, + generated_layers: Vec::new(), + conf, + timeline_id, + tenant_shard_id, + lsn, + }) + } + + pub async fn put_image( + &mut self, + key: Key, + img: Bytes, + tline: &Arc, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + // The current estimation is an upper bound of the space that the key/image could take + // because we did not consider compression in this estimation. The resulting image layer + // could be smaller than the target size. 
+ let addition_size_estimation = KEY_SIZE as u64 + img.len() as u64; + if self.inner.num_keys() >= 1 + && self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size + { + let next_image_writer = ImageLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + &(key..Key::MAX), + self.lsn, + ctx, + ) + .await?; + let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer); + self.generated_layers.push( + prev_image_writer + .finish_with_end_key(tline, key, ctx) + .await?, + ); + } + self.inner.put_image(key, img, ctx).await + } + + pub(crate) async fn finish( + self, + tline: &Arc, + ctx: &RequestContext, + end_key: Key, + ) -> anyhow::Result> { + let Self { + mut generated_layers, + inner, + .. + } = self; + generated_layers.push(inner.finish_with_end_key(tline, end_key, ctx).await?); + Ok(generated_layers) + } +} + +#[cfg(test)] +mod tests { + use crate::{ + tenant::{ + harness::{TenantHarness, TIMELINE_ID}, + storage_layer::AsLayerDesc, + }, + DEFAULT_PG_VERSION, + }; + + use super::*; + + fn get_key(id: u32) -> Key { + let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + fn get_img(id: u32) -> Bytes { + format!("{id:064}").into() + } + + fn get_large_img() -> Bytes { + vec![0; 8192].into() + } + + #[tokio::test] + async fn write_one_image() { + let harness = TenantHarness::create("split_writer_write_one_image") + .await + .unwrap(); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + let mut writer = SplitImageLayerWriter::new( + tenant.conf, + tline.timeline_id, + tenant.tenant_shard_id, + get_key(0), + Lsn(0x18), + 4 * 1024 * 1024, + &ctx, + ) + .await + .unwrap(); + + writer + .put_image(get_key(0), get_img(0), &tline, &ctx) + .await + .unwrap(); + let layers = writer.finish(&tline, &ctx, get_key(10)).await.unwrap(); + assert_eq!(layers.len(), 1); + } + + #[tokio::test] + async fn write_split() { + let harness = TenantHarness::create("split_writer_write_split") + .await + .unwrap(); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + let mut writer = SplitImageLayerWriter::new( + tenant.conf, + tline.timeline_id, + tenant.tenant_shard_id, + get_key(0), + Lsn(0x18), + 4 * 1024 * 1024, + &ctx, + ) + .await + .unwrap(); + const N: usize = 2000; + for i in 0..N { + let i = i as u32; + writer + .put_image(get_key(i), get_large_img(), &tline, &ctx) + .await + .unwrap(); + } + let layers = writer + .finish(&tline, &ctx, get_key(N as u32)) + .await + .unwrap(); + assert_eq!(layers.len(), N / 512 + 1); + for idx in 0..layers.len() { + assert_ne!(layers[idx].layer_desc().key_range.start, Key::MIN); + assert_ne!(layers[idx].layer_desc().key_range.end, Key::MAX); + if idx > 0 { + assert_eq!( + layers[idx - 1].layer_desc().key_range.end, + layers[idx].layer_desc().key_range.start + ); + } + } + } + + #[tokio::test] + async fn write_large_img() { + let harness = TenantHarness::create("split_writer_write_large_img") + .await + .unwrap(); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + let mut writer = SplitImageLayerWriter::new( + tenant.conf, + tline.timeline_id, + tenant.tenant_shard_id, + get_key(0), + Lsn(0x18), + 4 * 1024, + &ctx, + ) + 
.await + .unwrap(); + + writer + .put_image(get_key(0), get_img(0), &tline, &ctx) + .await + .unwrap(); + writer + .put_image(get_key(1), get_large_img(), &tline, &ctx) + .await + .unwrap(); + let layers = writer.finish(&tline, &ctx, get_key(10)).await.unwrap(); + assert_eq!(layers.len(), 2); + } +} From 0f3dac265b7b183cb4136322036c2eec0e9dc283 Mon Sep 17 00:00:00 2001 From: dotdister Date: Mon, 5 Aug 2024 16:23:59 +0900 Subject: [PATCH 1332/1571] safekeeper: remove unused partial_backup_enabled option (#8547) ## Problem There is an unused safekeeper option `partial_backup_enabled`. `partial_backup_enabled` was implemented in #6530, but this option was always turned into enabled in #8022. If you intended to keep this option for a specific reason, I will close this PR. ## Summary of changes I removed an unused safekeeper option `partial_backup_enabled`. --- safekeeper/src/bin/safekeeper.rs | 6 ------ safekeeper/src/lib.rs | 2 -- safekeeper/src/timeline_manager.rs | 4 ++-- safekeeper/tests/walproposer_sim/safekeeper.rs | 1 - 4 files changed, 2 insertions(+), 11 deletions(-) diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 2365fd0587..41c2d3fe08 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -170,11 +170,6 @@ struct Args { /// still needed for existing replication connection. #[arg(long)] walsenders_keep_horizon: bool, - /// Enable partial backup. If disabled, safekeeper will not upload partial - /// segments to remote storage. - /// TODO: now partial backup is always enabled, remove this flag. - #[arg(long)] - partial_backup_enabled: bool, /// Controls how long backup will wait until uploading the partial segment. #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_PARTIAL_BACKUP_TIMEOUT, verbatim_doc_comment)] partial_backup_timeout: Duration, @@ -347,7 +342,6 @@ async fn main() -> anyhow::Result<()> { sk_auth_token, current_thread_runtime: args.current_thread_runtime, walsenders_keep_horizon: args.walsenders_keep_horizon, - partial_backup_enabled: true, partial_backup_timeout: args.partial_backup_timeout, disable_periodic_broker_push: args.disable_periodic_broker_push, enable_offload: args.enable_offload, diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 56d61e8287..2e11a279ca 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -93,7 +93,6 @@ pub struct SafeKeeperConf { pub sk_auth_token: Option, pub current_thread_runtime: bool, pub walsenders_keep_horizon: bool, - pub partial_backup_enabled: bool, pub partial_backup_timeout: Duration, pub disable_periodic_broker_push: bool, pub enable_offload: bool, @@ -137,7 +136,6 @@ impl SafeKeeperConf { max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES, current_thread_runtime: false, walsenders_keep_horizon: false, - partial_backup_enabled: false, partial_backup_timeout: Duration::from_secs(0), disable_periodic_broker_push: false, enable_offload: false, diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index c224dcd398..482614fac7 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -544,8 +544,8 @@ impl Manager { /// Spawns partial WAL backup task if needed. 
async fn update_partial_backup(&mut self, state: &StateSnapshot) { - // check if partial backup is enabled and should be started - if !self.conf.is_wal_backup_enabled() || !self.conf.partial_backup_enabled { + // check if WAL backup is enabled and should be started + if !self.conf.is_wal_backup_enabled() { return; } diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index 0c6d97ddfa..771d905c90 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -181,7 +181,6 @@ pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { sk_auth_token: None, current_thread_runtime: false, walsenders_keep_horizon: false, - partial_backup_enabled: false, partial_backup_timeout: Duration::from_secs(0), disable_periodic_broker_push: false, enable_offload: false, From 200fa56b045879caaecfd00ed18e9c2843758e13 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." Date: Mon, 5 Aug 2024 18:30:49 +0800 Subject: [PATCH 1333/1571] feat(pageserver): support split delta layers (#8599) part of https://github.com/neondatabase/neon/issues/8002 Similar to https://github.com/neondatabase/neon/pull/8574, we add auto-split support for delta layers. Tests are reused from image layer split writers. --------- Signed-off-by: Alex Chi Z --- .../src/tenant/storage_layer/delta_layer.rs | 18 ++ .../src/tenant/storage_layer/split_writer.rs | 243 ++++++++++++++++-- 2 files changed, 242 insertions(+), 19 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index f9becf53ff..e50fc2a266 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -384,6 +384,9 @@ struct DeltaLayerWriterInner { tree: DiskBtreeBuilder, blob_writer: BlobWriter, + + // Number of key-lsns in the layer. + num_keys: usize, } impl DeltaLayerWriterInner { @@ -425,6 +428,7 @@ impl DeltaLayerWriterInner { lsn_range, tree: tree_builder, blob_writer, + num_keys: 0, }) } @@ -475,6 +479,9 @@ impl DeltaLayerWriterInner { let delta_key = DeltaKey::from_key_lsn(&key, lsn); let res = self.tree.append(&delta_key.0, blob_ref.0); + + self.num_keys += 1; + (val, res.map_err(|e| anyhow::anyhow!(e))) } @@ -686,6 +693,17 @@ impl DeltaLayerWriter { .finish(key_end, timeline, ctx) .await } + + #[cfg(test)] + pub(crate) fn num_keys(&self) -> usize { + self.inner.as_ref().unwrap().num_keys + } + + #[cfg(test)] + pub(crate) fn estimated_size(&self) -> u64 { + let inner = self.inner.as_ref().unwrap(); + inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64 + } } impl Drop for DeltaLayerWriter { diff --git a/pageserver/src/tenant/storage_layer/split_writer.rs b/pageserver/src/tenant/storage_layer/split_writer.rs index a4091a890c..a966775f9e 100644 --- a/pageserver/src/tenant/storage_layer/split_writer.rs +++ b/pageserver/src/tenant/storage_layer/split_writer.rs @@ -1,12 +1,12 @@ -use std::sync::Arc; +use std::{ops::Range, sync::Arc}; use bytes::Bytes; use pageserver_api::key::{Key, KEY_SIZE}; use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId}; -use crate::{config::PageServerConf, context::RequestContext, tenant::Timeline}; +use crate::{config::PageServerConf, context::RequestContext, repository::Value, tenant::Timeline}; -use super::{ImageLayerWriter, ResidentLayer}; +use super::{DeltaLayerWriter, ImageLayerWriter, ResidentLayer}; /// An image writer that takes images and produces multiple image layers. 
The interface does not /// guarantee atomicity (i.e., if the image layer generation fails, there might be leftover files @@ -98,6 +98,107 @@ impl SplitImageLayerWriter { generated_layers.push(inner.finish_with_end_key(tline, end_key, ctx).await?); Ok(generated_layers) } + + /// When split writer fails, the caller should call this function and handle partially generated layers. + #[allow(dead_code)] + pub(crate) async fn take(self) -> anyhow::Result<(Vec, ImageLayerWriter)> { + Ok((self.generated_layers, self.inner)) + } +} + +/// A delta writer that takes key-lsn-values and produces multiple delta layers. The interface does not +/// guarantee atomicity (i.e., if the delta layer generation fails, there might be leftover files +/// to be cleaned up). +#[must_use] +pub struct SplitDeltaLayerWriter { + inner: DeltaLayerWriter, + target_layer_size: u64, + generated_layers: Vec, + conf: &'static PageServerConf, + timeline_id: TimelineId, + tenant_shard_id: TenantShardId, + lsn_range: Range, +} + +impl SplitDeltaLayerWriter { + pub async fn new( + conf: &'static PageServerConf, + timeline_id: TimelineId, + tenant_shard_id: TenantShardId, + start_key: Key, + lsn_range: Range, + target_layer_size: u64, + ctx: &RequestContext, + ) -> anyhow::Result { + Ok(Self { + target_layer_size, + inner: DeltaLayerWriter::new( + conf, + timeline_id, + tenant_shard_id, + start_key, + lsn_range.clone(), + ctx, + ) + .await?, + generated_layers: Vec::new(), + conf, + timeline_id, + tenant_shard_id, + lsn_range, + }) + } + + pub async fn put_value( + &mut self, + key: Key, + lsn: Lsn, + val: Value, + tline: &Arc, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + // The current estimation is key size plus LSN size plus value size estimation. This is not an accurate + // number, and therefore the final layer size could be a little bit larger or smaller than the target. + let addition_size_estimation = KEY_SIZE as u64 + 8 /* LSN u64 size */ + 80 /* value size estimation */; + if self.inner.num_keys() >= 1 + && self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size + { + let next_delta_writer = DeltaLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + key, + self.lsn_range.clone(), + ctx, + ) + .await?; + let prev_delta_writer = std::mem::replace(&mut self.inner, next_delta_writer); + self.generated_layers + .push(prev_delta_writer.finish(key, tline, ctx).await?); + } + self.inner.put_value(key, lsn, val, ctx).await + } + + pub(crate) async fn finish( + self, + tline: &Arc, + ctx: &RequestContext, + end_key: Key, + ) -> anyhow::Result> { + let Self { + mut generated_layers, + inner, + .. + } = self; + generated_layers.push(inner.finish(end_key, tline, ctx).await?); + Ok(generated_layers) + } + + /// When split writer fails, the caller should call this function and handle partially generated layers. 
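+    ///
+    /// Recovery sketch: on failure, `take()` hands back both the already finished `ResidentLayer`s
+    /// and the still-open writer for the in-progress layer, so the caller can clean them up.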
+ #[allow(dead_code)] + pub(crate) async fn take(self) -> anyhow::Result<(Vec, DeltaLayerWriter)> { + Ok((self.generated_layers, self.inner)) + } } #[cfg(test)] @@ -138,7 +239,7 @@ mod tests { .await .unwrap(); - let mut writer = SplitImageLayerWriter::new( + let mut image_writer = SplitImageLayerWriter::new( tenant.conf, tline.timeline_id, tenant.tenant_shard_id, @@ -150,11 +251,42 @@ mod tests { .await .unwrap(); - writer + let mut delta_writer = SplitDeltaLayerWriter::new( + tenant.conf, + tline.timeline_id, + tenant.tenant_shard_id, + get_key(0), + Lsn(0x18)..Lsn(0x20), + 4 * 1024 * 1024, + &ctx, + ) + .await + .unwrap(); + + image_writer .put_image(get_key(0), get_img(0), &tline, &ctx) .await .unwrap(); - let layers = writer.finish(&tline, &ctx, get_key(10)).await.unwrap(); + let layers = image_writer + .finish(&tline, &ctx, get_key(10)) + .await + .unwrap(); + assert_eq!(layers.len(), 1); + + delta_writer + .put_value( + get_key(0), + Lsn(0x18), + Value::Image(get_img(0)), + &tline, + &ctx, + ) + .await + .unwrap(); + let layers = delta_writer + .finish(&tline, &ctx, get_key(10)) + .await + .unwrap(); assert_eq!(layers.len(), 1); } @@ -170,7 +302,7 @@ mod tests { .await .unwrap(); - let mut writer = SplitImageLayerWriter::new( + let mut image_writer = SplitImageLayerWriter::new( tenant.conf, tline.timeline_id, tenant.tenant_shard_id, @@ -181,26 +313,58 @@ mod tests { ) .await .unwrap(); + let mut delta_writer = SplitDeltaLayerWriter::new( + tenant.conf, + tline.timeline_id, + tenant.tenant_shard_id, + get_key(0), + Lsn(0x18)..Lsn(0x20), + 4 * 1024 * 1024, + &ctx, + ) + .await + .unwrap(); const N: usize = 2000; for i in 0..N { let i = i as u32; - writer + image_writer .put_image(get_key(i), get_large_img(), &tline, &ctx) .await .unwrap(); + delta_writer + .put_value( + get_key(i), + Lsn(0x20), + Value::Image(get_large_img()), + &tline, + &ctx, + ) + .await + .unwrap(); } - let layers = writer + let image_layers = image_writer .finish(&tline, &ctx, get_key(N as u32)) .await .unwrap(); - assert_eq!(layers.len(), N / 512 + 1); - for idx in 0..layers.len() { - assert_ne!(layers[idx].layer_desc().key_range.start, Key::MIN); - assert_ne!(layers[idx].layer_desc().key_range.end, Key::MAX); + let delta_layers = delta_writer + .finish(&tline, &ctx, get_key(N as u32)) + .await + .unwrap(); + assert_eq!(image_layers.len(), N / 512 + 1); + assert_eq!(delta_layers.len(), N / 512 + 1); + for idx in 0..image_layers.len() { + assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN); + assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX); + assert_ne!(delta_layers[idx].layer_desc().key_range.start, Key::MIN); + assert_ne!(delta_layers[idx].layer_desc().key_range.end, Key::MAX); if idx > 0 { assert_eq!( - layers[idx - 1].layer_desc().key_range.end, - layers[idx].layer_desc().key_range.start + image_layers[idx - 1].layer_desc().key_range.end, + image_layers[idx].layer_desc().key_range.start + ); + assert_eq!( + delta_layers[idx - 1].layer_desc().key_range.end, + delta_layers[idx].layer_desc().key_range.start ); } } @@ -218,7 +382,7 @@ mod tests { .await .unwrap(); - let mut writer = SplitImageLayerWriter::new( + let mut image_writer = SplitImageLayerWriter::new( tenant.conf, tline.timeline_id, tenant.tenant_shard_id, @@ -230,15 +394,56 @@ mod tests { .await .unwrap(); - writer + let mut delta_writer = SplitDeltaLayerWriter::new( + tenant.conf, + tline.timeline_id, + tenant.tenant_shard_id, + get_key(0), + Lsn(0x18)..Lsn(0x20), + 4 * 1024, + &ctx, + ) + .await + .unwrap(); + + 
image_writer .put_image(get_key(0), get_img(0), &tline, &ctx) .await .unwrap(); - writer + image_writer .put_image(get_key(1), get_large_img(), &tline, &ctx) .await .unwrap(); - let layers = writer.finish(&tline, &ctx, get_key(10)).await.unwrap(); + let layers = image_writer + .finish(&tline, &ctx, get_key(10)) + .await + .unwrap(); + assert_eq!(layers.len(), 2); + + delta_writer + .put_value( + get_key(0), + Lsn(0x18), + Value::Image(get_img(0)), + &tline, + &ctx, + ) + .await + .unwrap(); + delta_writer + .put_value( + get_key(1), + Lsn(0x1A), + Value::Image(get_large_img()), + &tline, + &ctx, + ) + .await + .unwrap(); + let layers = delta_writer + .finish(&tline, &ctx, get_key(10)) + .await + .unwrap(); assert_eq!(layers.len(), 2); } } From f63c8e5a8ce836fa92d7ec36445389e7ef9fce2e Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 5 Aug 2024 14:24:54 +0300 Subject: [PATCH 1334/1571] Update Postgres versions to use smgrexists() instead of access() to check if Oid is used (#8597) ## Problem PR #7992 was merged without correspondent changes in Postgres submodules and this is why test_oid_overflow.py is failed now. ## Summary of changes Bump Postgres versions ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist Co-authored-by: Konstantin Knizhnik --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index dbd0e6428b..7bbe834c8c 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit dbd0e6428b9274d72a10ac29bd3e3162faf109d4 +Subproject commit 7bbe834c8c2dc37802eca8484311599bc47341f6 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 035b73a9c5..9eba7dd382 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 035b73a9c5998f9a0ef35cc8df1bae680bf770fc +Subproject commit 9eba7dd382606ffca43aca865f337ec21bcdac73 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index b39f316137..5377f5ed72 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit b39f316137fdd29e2da15d2af2fdd1cfd18163be +Subproject commit 5377f5ed7290af45b7cb6b0d98d43cbf4a4e77f3 diff --git a/vendor/revisions.json b/vendor/revisions.json index eeebd646f5..570dfc1550 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "v16": ["16.3", "b39f316137fdd29e2da15d2af2fdd1cfd18163be"], - "v15": ["15.7", "035b73a9c5998f9a0ef35cc8df1bae680bf770fc"], - "v14": ["14.12", "dbd0e6428b9274d72a10ac29bd3e3162faf109d4"] + "v16": ["16.3", "5377f5ed7290af45b7cb6b0d98d43cbf4a4e77f3"], + "v15": ["15.7", "9eba7dd382606ffca43aca865f337ec21bcdac73"], + "v14": ["14.12", "7bbe834c8c2dc37802eca8484311599bc47341f6"] } From bd845c7587ba7326a6d795c9903e18f602e738c5 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 5 Aug 2024 12:25:23 +0100 Subject: [PATCH 1335/1571] CI(trigger-e2e-tests): wait for promote-images job from the last commit (#8592) ## Problem We don't trigger e2e tests for draft PRs, but we do 
trigger them once a PR is in the "Ready for review" state. Sometimes, a PR can be marked as "Ready for review" before we finish image building. In such cases, triggering e2e tests fails. ## Summary of changes - Make `trigger-e2e-tests` job poll status of `promote-images` job from the build-and-test workflow for the last commit. And trigger only if the status is `success` - Remove explicit image checking from the workflow - Add `concurrency` for `triggere-e2e-tests` workflow to make it possible to cancel jobs in progress (if PR moves from "Draft" to "Ready for review" several times in a row) --- .github/workflows/trigger-e2e-tests.yml | 42 ++++++++++++++++++------- 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml index 77928a343e..0a615b3e37 100644 --- a/.github/workflows/trigger-e2e-tests.yml +++ b/.github/workflows/trigger-e2e-tests.yml @@ -10,11 +10,13 @@ defaults: run: shell: bash -euxo pipefail {0} +concurrency: + group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} + cancel-in-progress: true + env: # A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} jobs: cancel-previous-e2e-tests: @@ -64,19 +66,35 @@ jobs: needs: [ tag ] runs-on: ubuntu-22.04 env: + EVENT_ACTION: ${{ github.event.action }} + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} TAG: ${{ needs.tag.outputs.build-tag }} steps: - - name: check if ecr image are present - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} + - name: Wait for `promote-images` job to finish + # It's important to have a timeout here, the script in the step can run infinitely + timeout-minutes: 60 run: | - for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do - OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text) - if [ "$OUTPUT" == "" ]; then - echo "$REPO with image tag $TAG not found" >> $GITHUB_OUTPUT - exit 1 - fi + if [ "${GITHUB_EVENT_NAME}" != "pull_request" ] || [ "${EVENT_ACTION}" != "ready_for_review" ]; then + exit 0 + fi + + # For PRs we use the run id as the tag + BUILD_AND_TEST_RUN_ID=${TAG} + while true; do + conclusion=$(gh run --repo ${GITHUB_REPOSITORY} view ${BUILD_AND_TEST_RUN_ID} --json jobs --jq '.jobs[] | select(.name == "promote-images") | .conclusion') + case "$conclusion" in + success) + break + ;; + failure | cancelled | skipped) + echo "The 'promote-images' job didn't succeed: '${conclusion}'. Exiting..." + exit 1 + ;; + *) + echo "The 'promote-images' hasn't succeed yet. Waiting..." 
+ sleep 60 + ;; + esac done - name: Set e2e-platforms From 50daff96558b7a4ae6318d28ac4c93c7b53c6bd7 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 5 Aug 2024 19:47:59 +0100 Subject: [PATCH 1336/1571] CI(trigger-e2e-tests): fix deadlock with Build and Test workflow (#8606) ## Problem In some cases, a deadlock between `build-and-test` and `trigger-e2e-tests` workflows can happen: ``` Build and Test Canceling since a deadlock for concurrency group 'Build and Test-8600/merge-anysha' was detected between 'top level workflow' and 'trigger-e2e-tests' ``` I don't understand the reason completely, probably `${{ github.workflow }}` got evaluated to the same value and somehow caused the issue. We don't need to limit concurrency for `trigger-e2e-tests` workflow. See https://neondb.slack.com/archives/C059ZC138NR/p1722869486708179?thread_ts=1722869027.960029&cid=C059ZC138NR --- .github/workflows/trigger-e2e-tests.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml index 0a615b3e37..6fbe785c56 100644 --- a/.github/workflows/trigger-e2e-tests.yml +++ b/.github/workflows/trigger-e2e-tests.yml @@ -10,10 +10,6 @@ defaults: run: shell: bash -euxo pipefail {0} -concurrency: - group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} - cancel-in-progress: true - env: # A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} From c32807ac1913ad4d70bec9a0c2b25278cf8a71f1 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 5 Aug 2024 23:21:33 +0300 Subject: [PATCH 1337/1571] fix: allow awaiting logical size for root timelines (#8604) Currently if `GET /v1/tenant/x/timeline/y?force-await-initial-logical-size=true` is requested for a root timeline created within the current pageserver session, the request handler panics hitting the debug assertion. These timelines will always have an accurate (at initdb import) calculated logical size. Fix is to never attempt prioritizing timeline size calculation if we already have an exact value. Split off from #8528. 
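For reference, a minimal sketch of the guard shape this adds (names as in the `timeline.rs` hunk
below; surrounding logic elided):

```rust
// Root timelines are initialized with an exact count and never start the
// background calculation, so there is nothing to prioritize or wait for.
if self.current_logical_size.current_size().is_exact() {
    return;
}
```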
--- pageserver/src/tenant/timeline.rs | 6 ++++++ pageserver/src/tenant/timeline/logical_size.rs | 4 ++++ test_runner/regress/test_timeline_size.py | 3 +++ 3 files changed, 13 insertions(+) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index be72e15c19..8c80a54bdd 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4727,6 +4727,12 @@ impl Timeline { return; } + if self.current_logical_size.current_size().is_exact() { + // root timelines are initialized with exact count, but never start the background + // calculation + return; + } + if let Some(await_bg_cancel) = self .current_logical_size .cancel_wait_for_background_loop_concurrency_limit_semaphore diff --git a/pageserver/src/tenant/timeline/logical_size.rs b/pageserver/src/tenant/timeline/logical_size.rs index b0d6c4a27a..f4a4eea54a 100644 --- a/pageserver/src/tenant/timeline/logical_size.rs +++ b/pageserver/src/tenant/timeline/logical_size.rs @@ -122,6 +122,10 @@ impl CurrentLogicalSize { Self::Exact(_) => Accuracy::Exact, } } + + pub(crate) fn is_exact(&self) -> bool { + matches!(self, Self::Exact(_)) + } } impl LogicalSize { diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 5e9a42f6b4..1f220eec9e 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -936,6 +936,9 @@ def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder): tenant_id = env.initial_tenant timeline_id = env.initial_timeline + # just make sure this doesn't hit an assertion + client.timeline_detail(tenant_id, timeline_id, force_await_initial_logical_size=True) + # load in some data endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) endpoint.safe_psql_many( From e6e578821b9748036ad592d8f232f47bc903c904 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 5 Aug 2024 23:06:47 +0100 Subject: [PATCH 1338/1571] CI(benchmarking): set pub/sub projects for LR tests (#8483) ## Problem > Currently, long-running LR tests recreate endpoints every night. We'd like to have along-running buildup of history to exercise the pageserver in this case (instead of "unit-testing" the same behavior everynight). 
Closes #8317 ## Summary of changes - Update Postgres version for replication tests - Set `BENCHMARK_PROJECT_ID_PUB`/`BENCHMARK_PROJECT_ID_SUB` env vars to projects that were created for this purpose --------- Co-authored-by: Sasha Krassovsky --- .github/actionlint.yml | 2 ++ .github/workflows/benchmarking.yml | 9 ++++-- test_runner/fixtures/neon_api.py | 6 ++-- .../performance/test_logical_replication.py | 29 ++++++++++++++----- 4 files changed, 32 insertions(+), 14 deletions(-) diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 37983798b7..d27fa01efa 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -8,6 +8,8 @@ self-hosted-runner: - small-arm64 - us-east-2 config-variables: + - BENCHMARK_PROJECT_ID_PUB + - BENCHMARK_PROJECT_ID_SUB - REMOTE_STORAGE_AZURE_CONTAINER - REMOTE_STORAGE_AZURE_REGION - SLACK_UPCOMING_RELEASE_CHANNEL_ID diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index f7ea534fb9..0f4dac841e 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -147,7 +147,7 @@ jobs: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: 14 + DEFAULT_PG_VERSION: 16 TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} @@ -168,7 +168,7 @@ jobs: path: /tmp/neon/ prefix: latest - - name: Run benchmark + - name: Run Logical Replication benchmarks uses: ./.github/actions/run-python-test-set with: build_type: ${{ env.BUILD_TYPE }} @@ -176,12 +176,15 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 5400 + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }} + BENCHMARK_PROJECT_ID_PUB: ${{ vars.BENCHMARK_PROJECT_ID_PUB }} + BENCHMARK_PROJECT_ID_SUB: ${{ vars.BENCHMARK_PROJECT_ID_SUB }} - - name: Run benchmark + - name: Run Physical Replication benchmarks uses: ./.github/actions/run-python-test-set with: build_type: ${{ env.BUILD_TYPE }} diff --git a/test_runner/fixtures/neon_api.py b/test_runner/fixtures/neon_api.py index 658ed119a1..0636cfad06 100644 --- a/test_runner/fixtures/neon_api.py +++ b/test_runner/fixtures/neon_api.py @@ -285,9 +285,9 @@ class NeonApiEndpoint: self.project_id = project_id eps = neon_api.get_endpoints(project_id)["endpoints"] self.endpoint_id = eps[0]["id"] - self.connstr = neon_api.get_connection_uri(project_id, endpoint_id=self.endpoint_id)[ - "uri" - ] + self.connstr = neon_api.get_connection_uri( + project_id, endpoint_id=self.endpoint_id, pooled=False + )["uri"] pw = self.connstr.split("@")[0].split(":")[-1] self.pgbench_env = { "PGHOST": eps[0]["host"], diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index 53bb29a659..4b4ffc1fee 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -100,24 +100,32 @@ def test_subscriber_lag( pub_connstr = benchmark_project_pub.connstr sub_connstr = benchmark_project_sub.connstr - pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) - pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) + if 
benchmark_project_pub.is_new: + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) + if benchmark_project_sub.is_new: + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) pub_conn = psycopg2.connect(pub_connstr) sub_conn = psycopg2.connect(sub_connstr) pub_conn.autocommit = True sub_conn.autocommit = True with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: - if benchmark_project_pub.is_new: - pub_cur.execute("create publication pub1 for table pgbench_accounts, pgbench_history") + pub_cur.execute("SELECT 1 FROM pg_catalog.pg_publication WHERE pubname = 'pub1'") + pub_exists = len(pub_cur.fetchall()) != 0 - if benchmark_project_sub.is_new: + if not pub_exists: + pub_cur.execute("CREATE PUBLICATION pub1 FOR TABLE pgbench_accounts, pgbench_history") + + sub_cur.execute("SELECT 1 FROM pg_catalog.pg_subscription WHERE subname = 'sub1'") + sub_exists = len(sub_cur.fetchall()) != 0 + if not sub_exists: sub_cur.execute("truncate table pgbench_accounts") sub_cur.execute("truncate table pgbench_history") - sub_cur.execute(f"create subscription sub1 connection '{pub_connstr}' publication pub1") + sub_cur.execute(f"CREATE SUBSCRIPTION sub1 CONNECTION '{pub_connstr}' PUBLICATION pub1") initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur) + pub_conn.close() sub_conn.close() @@ -195,10 +203,15 @@ def test_publisher_restart( pub_conn.autocommit = True sub_conn.autocommit = True with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: - if benchmark_project_pub.is_new: + pub_cur.execute("SELECT 1 FROM pg_catalog.pg_publication WHERE pubname = 'pub1'") + pub_exists = len(pub_cur.fetchall()) != 0 + + if not pub_exists: pub_cur.execute("create publication pub1 for table pgbench_accounts, pgbench_history") - if benchmark_project_sub.is_new: + sub_cur.execute("SELECT 1 FROM pg_catalog.pg_subscription WHERE subname = 'sub1'") + sub_exists = len(sub_cur.fetchall()) != 0 + if not sub_exists: sub_cur.execute("truncate table pgbench_accounts") sub_cur.execute("truncate table pgbench_history") From 8f3bc5ae3527b9b0a835eb15682ab99e4ef167fe Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." Date: Tue, 6 Aug 2024 10:07:48 +0800 Subject: [PATCH 1339/1571] feat(pageserver): support dry-run for gc-compaction, add statistics (#8557) Add dry-run mode that does not produce any image layer + delta layer. I will use this code to do some experiments and see how much space we can reclaim for tenants on staging. Part of https://github.com/neondatabase/neon/issues/8002 * Add dry-run mode that runs the full compaction process without updating the layer map. (We never call finish on the writers and the files will be removed before exiting the function). * Add compaction statistics and print them at the end of compaction. 
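A minimal invocation sketch, mirroring the unit tests in this patch (`tline`, `cancel`, and `ctx`
stand for an existing timeline, cancellation token, and request context):

```rust
// Run the full gc-compaction pass without publishing any layers to the layer map.
let mut flags = EnumSet::new();
flags.insert(CompactFlags::DryRun);
tline.compact_with_gc(&cancel, flags, &ctx).await?;
// The collected statistics are logged as a single JSON line prefixed with
// "gc-compaction statistics:".
```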
--------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant.rs | 56 +++++-- .../src/tenant/storage_layer/image_layer.rs | 8 + pageserver/src/tenant/timeline.rs | 1 + pageserver/src/tenant/timeline/compaction.rs | 151 +++++++++++++++++- 4 files changed, 204 insertions(+), 12 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 84c5095610..72d3aedd05 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -6899,7 +6899,10 @@ mod tests { } let cancel = CancellationToken::new(); - tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); for (idx, expected) in expected_result.iter().enumerate() { assert_eq!( @@ -6993,7 +6996,10 @@ mod tests { guard.cutoffs.time = Lsn(0x40); guard.cutoffs.space = Lsn(0x40); } - tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); Ok(()) } @@ -7327,7 +7333,10 @@ mod tests { } let cancel = CancellationToken::new(); - tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); for idx in 0..10 { assert_eq!( @@ -7353,7 +7362,10 @@ mod tests { guard.cutoffs.time = Lsn(0x40); guard.cutoffs.space = Lsn(0x40); } - tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); Ok(()) } @@ -7898,11 +7910,28 @@ mod tests { verify_result().await; let cancel = CancellationToken::new(); - tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + let mut dryrun_flags = EnumSet::new(); + dryrun_flags.insert(CompactFlags::DryRun); + + tline + .compact_with_gc(&cancel, dryrun_flags, &ctx) + .await + .unwrap(); + // We expect layer map to be the same b/c the dry run flag, but we don't know whether there will be other background jobs + // cleaning things up, and therefore, we don't do sanity checks on the layer map during unit tests. 
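+        // Reads must be unaffected by the dry run, so the data verification below still has to pass.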
+ verify_result().await; + + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); verify_result().await; // compact again - tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); verify_result().await; // increase GC horizon and compact again @@ -7912,11 +7941,17 @@ mod tests { guard.cutoffs.time = Lsn(0x38); guard.cutoffs.space = Lsn(0x38); } - tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); verify_result().await; // no wals between 0x30 and 0x38, so we should obtain the same result // not increasing the GC horizon and compact again - tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); verify_result().await; Ok(()) @@ -8097,7 +8132,10 @@ mod tests { verify_result().await; let cancel = CancellationToken::new(); - branch_tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + branch_tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); verify_result().await; diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index aa308ba3c1..f4f48aaf16 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -753,6 +753,10 @@ struct ImageLayerWriterInner { } impl ImageLayerWriterInner { + fn size(&self) -> u64 { + self.tree.borrow_writer().size() + self.blob_writer.size() + } + /// /// Start building a new image layer. /// @@ -1044,6 +1048,10 @@ impl ImageLayerWriter { .finish(timeline, ctx, Some(end_key)) .await } + + pub(crate) fn size(&self) -> u64 { + self.inner.as_ref().unwrap().size() + } } impl Drop for ImageLayerWriter { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 8c80a54bdd..5c268bf875 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -704,6 +704,7 @@ pub(crate) enum CompactFlags { ForceRepartition, ForceImageLayerCreation, EnhancedGcBottomMostCompaction, + DryRun, } impl std::fmt::Debug for Timeline { diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 421f718ad6..1ff029a313 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -19,8 +19,10 @@ use bytes::Bytes; use enumset::EnumSet; use fail::fail_point; use itertools::Itertools; +use pageserver_api::key::KEY_SIZE; use pageserver_api::keyspace::ShardedRange; use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId}; +use serde::Serialize; use tokio_util::sync::CancellationToken; use tracing::{debug, info, info_span, trace, warn, Instrument}; use utils::id::TimelineId; @@ -41,6 +43,7 @@ use crate::virtual_file::{MaybeFatalIo, VirtualFile}; use crate::keyspace::KeySpace; use crate::repository::{Key, Value}; +use crate::walrecord::NeonWalRecord; use utils::lsn::Lsn; @@ -73,6 +76,7 @@ impl KeyHistoryRetention { key: Key, delta_writer: &mut Vec<(Key, Lsn, Value)>, mut image_writer: Option<&mut ImageLayerWriter>, + stat: &mut CompactionStatistics, ctx: &RequestContext, ) -> anyhow::Result<()> { let mut first_batch = true; @@ -82,6 +86,7 @@ impl KeyHistoryRetention { let Value::Image(img) = &logs[0].1 else { unreachable!() }; + stat.produce_image_key(img); if let Some(image_writer) = image_writer.as_mut() { 
image_writer.put_image(key, img.clone(), ctx).await?; } else { @@ -89,24 +94,111 @@ impl KeyHistoryRetention { } } else { for (lsn, val) in logs { + stat.produce_key(&val); delta_writer.push((key, lsn, val)); } } first_batch = false; } else { for (lsn, val) in logs { + stat.produce_key(&val); delta_writer.push((key, lsn, val)); } } } let KeyLogAtLsn(above_horizon_logs) = self.above_horizon; for (lsn, val) in above_horizon_logs { + stat.produce_key(&val); delta_writer.push((key, lsn, val)); } Ok(()) } } +#[derive(Debug, Serialize, Default)] +struct CompactionStatisticsNumSize { + num: u64, + size: u64, +} + +#[derive(Debug, Serialize, Default)] +pub struct CompactionStatistics { + delta_layer_visited: CompactionStatisticsNumSize, + image_layer_visited: CompactionStatisticsNumSize, + delta_layer_produced: CompactionStatisticsNumSize, + image_layer_produced: CompactionStatisticsNumSize, + num_delta_layer_discarded: usize, + num_image_layer_discarded: usize, + num_unique_keys_visited: usize, + wal_keys_visited: CompactionStatisticsNumSize, + image_keys_visited: CompactionStatisticsNumSize, + wal_produced: CompactionStatisticsNumSize, + image_produced: CompactionStatisticsNumSize, +} + +impl CompactionStatistics { + fn estimated_size_of_value(val: &Value) -> usize { + match val { + Value::Image(img) => img.len(), + Value::WalRecord(NeonWalRecord::Postgres { rec, .. }) => rec.len(), + _ => std::mem::size_of::(), + } + } + fn estimated_size_of_key() -> usize { + KEY_SIZE // TODO: distinguish image layer and delta layer (count LSN in delta layer) + } + fn visit_delta_layer(&mut self, size: u64) { + self.delta_layer_visited.num += 1; + self.delta_layer_visited.size += size; + } + fn visit_image_layer(&mut self, size: u64) { + self.image_layer_visited.num += 1; + self.image_layer_visited.size += size; + } + fn on_unique_key_visited(&mut self) { + self.num_unique_keys_visited += 1; + } + fn visit_wal_key(&mut self, val: &Value) { + self.wal_keys_visited.num += 1; + self.wal_keys_visited.size += + Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64; + } + fn visit_image_key(&mut self, val: &Value) { + self.image_keys_visited.num += 1; + self.image_keys_visited.size += + Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64; + } + fn produce_key(&mut self, val: &Value) { + match val { + Value::Image(img) => self.produce_image_key(img), + Value::WalRecord(_) => self.produce_wal_key(val), + } + } + fn produce_wal_key(&mut self, val: &Value) { + self.wal_produced.num += 1; + self.wal_produced.size += + Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64; + } + fn produce_image_key(&mut self, val: &Bytes) { + self.image_produced.num += 1; + self.image_produced.size += val.len() as u64 + Self::estimated_size_of_key() as u64; + } + fn discard_delta_layer(&mut self) { + self.num_delta_layer_discarded += 1; + } + fn discard_image_layer(&mut self) { + self.num_image_layer_discarded += 1; + } + fn produce_delta_layer(&mut self, size: u64) { + self.delta_layer_produced.num += 1; + self.delta_layer_produced.size += size; + } + fn produce_image_layer(&mut self, size: u64) { + self.image_layer_produced.num += 1; + self.image_layer_produced.size += size; + } +} + impl Timeline { /// TODO: cancellation /// @@ -118,12 +210,18 @@ impl Timeline { ctx: &RequestContext, ) -> Result { if flags.contains(CompactFlags::EnhancedGcBottomMostCompaction) { - self.compact_with_gc(cancel, ctx) + self.compact_with_gc(cancel, flags, ctx) .await 
.map_err(CompactionError::Other)?; return Ok(false); } + if flags.contains(CompactFlags::DryRun) { + return Err(CompactionError::Other(anyhow!( + "dry-run mode is not supported for legacy compaction for now" + ))); + } + // High level strategy for compaction / image creation: // // 1. First, calculate the desired "partitioning" of the @@ -1641,6 +1739,7 @@ impl Timeline { pub(crate) async fn compact_with_gc( self: &Arc, cancel: &CancellationToken, + flags: EnumSet, ctx: &RequestContext, ) -> anyhow::Result<()> { use std::collections::BTreeSet; @@ -1664,12 +1763,16 @@ impl Timeline { ) .await?; - info!("running enhanced gc bottom-most compaction"); + let dry_run = flags.contains(CompactFlags::DryRun); + + info!("running enhanced gc bottom-most compaction, dry_run={dry_run}"); scopeguard::defer! { info!("done enhanced gc bottom-most compaction"); }; + let mut stat = CompactionStatistics::default(); + // Step 0: pick all delta layers + image layers below/intersect with the GC horizon. // The layer selection has the following properties: // 1. If a layer is in the selection, all layers below it are in the selection. @@ -1740,6 +1843,9 @@ impl Timeline { let key_range = desc.get_key_range(); delta_split_points.insert(key_range.start); delta_split_points.insert(key_range.end); + stat.visit_delta_layer(desc.file_size()); + } else { + stat.visit_image_layer(desc.file_size()); } } let mut delta_layers = Vec::new(); @@ -1775,6 +1881,8 @@ impl Timeline { tline: &Arc, lowest_retain_lsn: Lsn, ctx: &RequestContext, + stats: &mut CompactionStatistics, + dry_run: bool, last_batch: bool, ) -> anyhow::Result> { // Check if we need to split the delta layer. We split at the original delta layer boundary to avoid @@ -1831,6 +1939,7 @@ impl Timeline { let layer_generation = guard.get_from_key(&delta_key).metadata().generation; drop(guard); if layer_generation == tline.generation { + stats.discard_delta_layer(); // TODO: depending on whether we design this compaction process to run along with // other compactions, there could be layer map modifications after we drop the // layer guard, and in case it creates duplicated layer key, we will still error @@ -1857,6 +1966,10 @@ impl Timeline { for (key, lsn, val) in deltas { delta_layer_writer.put_value(key, lsn, val, ctx).await?; } + stats.produce_delta_layer(delta_layer_writer.size()); + if dry_run { + return Ok(None); + } let delta_layer = delta_layer_writer .finish(delta_key.key_range.end, tline, ctx) .await?; @@ -1951,6 +2064,13 @@ impl Timeline { let mut current_delta_split_point = 0; let mut delta_layers = Vec::new(); while let Some((key, lsn, val)) = merge_iter.next().await? 
{ + if cancel.is_cancelled() { + return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass cancel error + } + match val { + Value::Image(_) => stat.visit_image_key(&val), + Value::WalRecord(_) => stat.visit_wal_key(&val), + } if last_key.is_none() || last_key.as_ref() == Some(&key) { if last_key.is_none() { last_key = Some(key); @@ -1958,6 +2078,7 @@ impl Timeline { accumulated_values.push((key, lsn, val)); } else { let last_key = last_key.as_mut().unwrap(); + stat.on_unique_key_visited(); let retention = self .generate_key_retention( *last_key, @@ -1974,6 +2095,7 @@ impl Timeline { *last_key, &mut delta_values, image_layer_writer.as_mut(), + &mut stat, ctx, ) .await?; @@ -1986,6 +2108,8 @@ impl Timeline { self, lowest_retain_lsn, ctx, + &mut stat, + dry_run, false, ) .await?, @@ -1998,6 +2122,7 @@ impl Timeline { let last_key = last_key.expect("no keys produced during compaction"); // TODO: move this part to the loop body + stat.on_unique_key_visited(); let retention = self .generate_key_retention( last_key, @@ -2014,6 +2139,7 @@ impl Timeline { last_key, &mut delta_values, image_layer_writer.as_mut(), + &mut stat, ctx, ) .await?; @@ -2026,6 +2152,8 @@ impl Timeline { self, lowest_retain_lsn, ctx, + &mut stat, + dry_run, true, ) .await?, @@ -2033,12 +2161,28 @@ impl Timeline { assert!(delta_values.is_empty(), "unprocessed keys"); let image_layer = if discard_image_layer { + stat.discard_image_layer(); None } else if let Some(writer) = image_layer_writer { - Some(writer.finish(self, ctx).await?) + stat.produce_image_layer(writer.size()); + if !dry_run { + Some(writer.finish(self, ctx).await?) + } else { + None + } } else { None }; + + info!( + "gc-compaction statistics: {}", + serde_json::to_string(&stat)? + ); + + if dry_run { + return Ok(()); + } + info!( "produced {} delta layers and {} image layers", delta_layers.len(), @@ -2062,6 +2206,7 @@ impl Timeline { let mut layer_selection = layer_selection; layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key())); compact_to.extend(image_layer); + // Step 3: Place back to the layer map. { let mut guard = self.layers.write().await; From 6a6f30e378ad224b522c93cedc15a98e6aff4109 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 6 Aug 2024 10:52:01 +0300 Subject: [PATCH 1340/1571] fix: make Timeline::set_disk_consistent_lsn use fetch_max (#8311) now it is safe to use from multiple callers, as we have two callers. --- pageserver/src/tenant/timeline.rs | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 5c268bf875..05bf4eac8b 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4123,17 +4123,11 @@ impl Timeline { /// Return true if the value changed /// - /// This function must only be used from the layer flush task, and may not be called concurrently. + /// This function must only be used from the layer flush task. fn set_disk_consistent_lsn(&self, new_value: Lsn) -> bool { - // We do a simple load/store cycle: that's why this function isn't safe for concurrent use. 
- let old_value = self.disk_consistent_lsn.load(); - if new_value != old_value { - assert!(new_value >= old_value); - self.disk_consistent_lsn.store(new_value); - true - } else { - false - } + let old_value = self.disk_consistent_lsn.fetch_max(new_value); + assert!(new_value >= old_value, "disk_consistent_lsn must be growing monotonously at runtime; current {old_value}, offered {new_value}"); + new_value != old_value } /// Update metadata file From 138f008bab8260cec05d1b3353c1f4ecba1ebb0c Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 6 Aug 2024 12:09:56 +0300 Subject: [PATCH 1341/1571] feat: persistent gc blocking (#8600) Currently, we do not have facilities to persistently block GC on a tenant for whatever reason. We could do a tenant configuration update, but that is risky for generation numbers and would also be transient. Introduce a `gc_block` facility in the tenant, which manages per timeline blocking reasons. Additionally, add HTTP endpoints for enabling/disabling manual gc blocking for a specific timeline. For debugging, individual tenant status now includes a similar string representation logged when GC is skipped. Cc: #6994 --- libs/pageserver_api/src/models.rs | 9 + pageserver/src/http/openapi_spec.yml | 39 ++++ pageserver/src/http/routes.rs | 76 +++++++ pageserver/src/tenant.rs | 30 +++ pageserver/src/tenant/gc_block.rs | 213 ++++++++++++++++++ .../src/tenant/remote_timeline_client.rs | 117 ++++++++++ .../tenant/remote_timeline_client/index.rs | 133 +++++++++++ pageserver/src/tenant/timeline.rs | 16 ++ pageserver/src/tenant/timeline/delete.rs | 2 + test_runner/fixtures/pageserver/http.py | 16 ++ .../regress/test_timeline_gc_blocking.py | 67 ++++++ 11 files changed, 718 insertions(+) create mode 100644 pageserver/src/tenant/gc_block.rs create mode 100644 test_runner/regress/test_timeline_gc_blocking.py diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 591c45d908..b541bba6a1 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -637,6 +637,13 @@ pub struct TenantInfo { pub current_physical_size: Option, // physical size is only included in `tenant_status` endpoint pub attachment_status: TenantAttachmentStatus, pub generation: u32, + + /// Opaque explanation if gc is being blocked. + /// + /// Only looked up for the individual tenant detail, not the listing. This is purely for + /// debugging, not included in openapi. 
+ #[serde(skip_serializing_if = "Option::is_none")] + pub gc_blocking: Option, } #[derive(Serialize, Deserialize, Clone)] @@ -1427,6 +1434,7 @@ mod tests { current_physical_size: Some(42), attachment_status: TenantAttachmentStatus::Attached, generation: 1, + gc_blocking: None, }; let expected_active = json!({ "id": original_active.id.to_string(), @@ -1449,6 +1457,7 @@ mod tests { current_physical_size: Some(42), attachment_status: TenantAttachmentStatus::Attached, generation: 1, + gc_blocking: None, }; let expected_broken = json!({ "id": original_broken.id.to_string(), diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 59e646d0ca..4656f2c93a 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -308,6 +308,45 @@ paths: application/json: schema: type: string + + /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/block_gc: + parameters: + - name: tenant_shard_id + in: path + required: true + schema: + type: string + - name: timeline_id + in: path + required: true + schema: + type: string + format: hex + post: + description: Persistently add a gc blocking at the tenant level because of this timeline + responses: + "200": + description: OK + + /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/unblock_gc: + parameters: + - name: tenant_shard_id + in: path + required: true + schema: + type: string + - name: timeline_id + in: path + required: true + schema: + type: string + format: hex + post: + description: Persistently remove a tenant level gc blocking for this timeline + responses: + "200": + description: OK + /v1/tenant/{tenant_shard_id}/location_config: parameters: - name: tenant_shard_id diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 117f2c5869..fdab780bfb 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -935,6 +935,7 @@ async fn tenant_list_handler( generation: (*gen) .into() .expect("Tenants are always attached with a generation"), + gc_blocking: None, }) .collect::>(); @@ -986,6 +987,7 @@ async fn tenant_status( .generation() .into() .expect("Tenants are always attached with a generation"), + gc_blocking: tenant.gc_block.summary().map(|x| format!("{x:?}")), }, walredo: tenant.wal_redo_manager_status(), timelines: tenant.list_timeline_ids(), @@ -1226,6 +1228,72 @@ async fn evict_timeline_layer_handler( } } +async fn timeline_gc_blocking_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + block_or_unblock_gc(request, true).await +} + +async fn timeline_gc_unblocking_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + block_or_unblock_gc(request, false).await +} + +/// Adding a block is `POST ../block_gc`, removing a block is `POST ../unblock_gc`. +/// +/// Both are technically unsafe because they might fire off index uploads, thus they are POST. 
+async fn block_or_unblock_gc( + request: Request, + block: bool, +) -> Result, ApiError> { + use crate::tenant::{ + remote_timeline_client::WaitCompletionError, upload_queue::NotInitialized, + }; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let state = get_state(&request); + + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + + let timeline = tenant.get_timeline(timeline_id, true)?; + + let fut = async { + if block { + timeline.block_gc(&tenant).await.map(|_| ()) + } else { + timeline.unblock_gc(&tenant).await + } + }; + + let span = tracing::info_span!( + "block_or_unblock_gc", + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug(), + timeline_id = %timeline_id, + block = block, + ); + + let res = fut.instrument(span).await; + + res.map_err(|e| { + if e.is::() || e.is::() { + ApiError::ShuttingDown + } else { + ApiError::InternalServerError(e) + } + })?; + + json_response(StatusCode::OK, ()) +} + /// Get tenant_size SVG graph along with the JSON data. fn synthetic_size_html_response( inputs: ModelInputs, @@ -2904,6 +2972,14 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name", |r| api_handler(r, evict_timeline_layer_handler), ) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/block_gc", + |r| api_handler(r, timeline_gc_blocking_handler), + ) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/unblock_gc", + |r| api_handler(r, timeline_gc_unblocking_handler), + ) .post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| { api_handler(r, secondary_upload_handler) }) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 72d3aedd05..de9b55d847 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -148,6 +148,7 @@ pub(crate) mod timeline; pub mod size; +mod gc_block; pub(crate) mod throttle; pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; @@ -303,6 +304,12 @@ pub struct Tenant { /// An ongoing timeline detach must be checked during attempts to GC or compact a timeline. ongoing_timeline_detach: std::sync::Mutex>, + /// `index_part.json` based gc blocking reason tracking. + /// + /// New gc iterations must start a new iteration by acquiring `GcBlock::start` before + /// proceeding. + pub(crate) gc_block: gc_block::GcBlock, + l0_flush_global_state: L0FlushGlobalState, } @@ -1036,6 +1043,8 @@ impl Tenant { } } + let mut gc_blocks = HashMap::new(); + // For every timeline, download the metadata file, scan the local directory, // and build a layer map that contains an entry for each remote and local // layer file. @@ -1045,6 +1054,16 @@ impl Tenant { .remove(&timeline_id) .expect("just put it in above"); + if let Some(blocking) = index_part.gc_blocking.as_ref() { + // could just filter these away, but it helps while testing + anyhow::ensure!( + !blocking.reasons.is_empty(), + "index_part for {timeline_id} is malformed: it should not have gc blocking with zero reasons" + ); + let prev = gc_blocks.insert(timeline_id, blocking.reasons); + assert!(prev.is_none()); + } + // TODO again handle early failure self.load_remote_timeline( timeline_id, @@ -1089,6 +1108,8 @@ impl Tenant { // IndexPart is the source of truth. 
self.clean_up_timelines(&existent_timelines)?; + self.gc_block.set_scanned(gc_blocks); + fail::fail_point!("attach-before-activate", |_| { anyhow::bail!("attach-before-activate"); }); @@ -1679,6 +1700,14 @@ impl Tenant { } } + let _guard = match self.gc_block.start().await { + Ok(guard) => guard, + Err(reasons) => { + info!("Skipping GC: {reasons}"); + return Ok(GcResult::default()); + } + }; + self.gc_iteration_internal(target_timeline_id, horizon, pitr, cancel, ctx) .await } @@ -2691,6 +2720,7 @@ impl Tenant { )), tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)), ongoing_timeline_detach: std::sync::Mutex::default(), + gc_block: Default::default(), l0_flush_global_state, } } diff --git a/pageserver/src/tenant/gc_block.rs b/pageserver/src/tenant/gc_block.rs new file mode 100644 index 0000000000..8b41ba1746 --- /dev/null +++ b/pageserver/src/tenant/gc_block.rs @@ -0,0 +1,213 @@ +use std::collections::HashMap; + +use utils::id::TimelineId; + +use super::remote_timeline_client::index::GcBlockingReason; + +type Storage = HashMap>; + +#[derive(Default)] +pub(crate) struct GcBlock { + /// The timelines which have current reasons to block gc. + /// + /// LOCK ORDER: this is held locked while scheduling the next index_part update. This is done + /// to keep the this field up to date with RemoteTimelineClient `upload_queue.dirty`. + reasons: std::sync::Mutex, + blocking: tokio::sync::Mutex<()>, +} + +impl GcBlock { + /// Start another gc iteration. + /// + /// Returns a guard to be held for the duration of gc iteration to allow synchronizing with + /// it's ending, or if not currently possible, a value describing the reasons why not. + /// + /// Cancellation safe. + pub(super) async fn start(&self) -> Result, BlockingReasons> { + let reasons = { + let g = self.reasons.lock().unwrap(); + + // TODO: the assumption is that this method gets called periodically. in prod, we use 1h, in + // tests, we use everything. we should warn if the gc has been consecutively blocked + // for more than 1h (within single tenant session?). + BlockingReasons::clean_and_summarize(g) + }; + + if let Some(reasons) = reasons { + Err(reasons) + } else { + Ok(Guard { + _inner: self.blocking.lock().await, + }) + } + } + + pub(crate) fn summary(&self) -> Option { + let g = self.reasons.lock().unwrap(); + + BlockingReasons::summarize(&g) + } + + /// Start blocking gc for this one timeline for the given reason. + /// + /// This is not a guard based API but instead it mimics set API. The returned future will not + /// resolve until an existing gc round has completed. + /// + /// Returns true if this block was new, false if gc was already blocked for this reason. + /// + /// Cancellation safe: cancelling after first poll will keep the reason to block gc, but will + /// keep the gc blocking reason. + pub(crate) async fn insert( + &self, + timeline: &super::Timeline, + reason: GcBlockingReason, + ) -> anyhow::Result { + let (added, uploaded) = { + let mut g = self.reasons.lock().unwrap(); + let set = g.entry(timeline.timeline_id).or_default(); + let added = set.insert(reason); + + // LOCK ORDER: intentionally hold the lock, see self.reasons. + let uploaded = timeline + .remote_client + .schedule_insert_gc_block_reason(reason)?; + + (added, uploaded) + }; + + uploaded.await?; + + // ensure that any ongoing gc iteration has completed + drop(self.blocking.lock().await); + + Ok(added) + } + + /// Remove blocking gc for this one timeline and the given reason. 
+    pub(crate) async fn remove(
+        &self,
+        timeline: &super::Timeline,
+        reason: GcBlockingReason,
+    ) -> anyhow::Result<()> {
+        use std::collections::hash_map::Entry;
+
+        super::span::debug_assert_current_span_has_tenant_and_timeline_id();
+
+        let (remaining_blocks, uploaded) = {
+            let mut g = self.reasons.lock().unwrap();
+            match g.entry(timeline.timeline_id) {
+                Entry::Occupied(mut oe) => {
+                    let set = oe.get_mut();
+                    set.remove(reason);
+                    if set.is_empty() {
+                        oe.remove();
+                    }
+                }
+                Entry::Vacant(_) => {
+                    // we must still do the index_part.json update regardless, in case we had earlier
+                    // been cancelled
+                }
+            }
+
+            let remaining_blocks = g.len();
+
+            // LOCK ORDER: intentionally hold the lock while scheduling; see self.reasons
+            let uploaded = timeline
+                .remote_client
+                .schedule_remove_gc_block_reason(reason)?;
+
+            (remaining_blocks, uploaded)
+        };
+        uploaded.await?;
+
+        // no need to synchronize with gc iteration again
+
+        if remaining_blocks > 0 {
+            tracing::info!(remaining_blocks, removed=?reason, "gc blocking removed, but gc remains blocked");
+        } else {
+            tracing::info!("gc is now unblocked for the tenant");
+        }
+
+        Ok(())
+    }
+
+    pub(crate) fn before_delete(&self, timeline: &super::Timeline) {
+        let unblocked = {
+            let mut g = self.reasons.lock().unwrap();
+            if g.is_empty() {
+                return;
+            }
+
+            g.remove(&timeline.timeline_id);
+
+            BlockingReasons::clean_and_summarize(g).is_none()
+        };
+
+        if unblocked {
+            tracing::info!("gc is now unblocked following deletion");
+        }
+    }
+
+    /// Initialize with the non-deleted timelines of this tenant.
+    pub(crate) fn set_scanned(&self, scanned: Storage) {
+        let mut g = self.reasons.lock().unwrap();
+        assert!(g.is_empty());
+        g.extend(scanned.into_iter().filter(|(_, v)| !v.is_empty()));
+
+        if let Some(reasons) = BlockingReasons::clean_and_summarize(g) {
+            tracing::info!(summary=?reasons, "initialized with gc blocked");
+        }
+    }
+}
+
+pub(super) struct Guard<'a> {
+    _inner: tokio::sync::MutexGuard<'a, ()>,
+}
+
+#[derive(Debug)]
+pub(crate) struct BlockingReasons {
+    timelines: usize,
+    reasons: enumset::EnumSet<GcBlockingReason>,
+}
+
+impl std::fmt::Display for BlockingReasons {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "{} timelines block for {:?}",
+            self.timelines, self.reasons
+        )
+    }
+}
+
+impl BlockingReasons {
+    fn clean_and_summarize(mut g: std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
+        let mut reasons = enumset::EnumSet::empty();
+        g.retain(|_key, value| {
+            reasons = reasons.union(*value);
+            !value.is_empty()
+        });
+        if !g.is_empty() {
+            Some(BlockingReasons {
+                timelines: g.len(),
+                reasons,
+            })
+        } else {
+            None
+        }
+    }
+
+    fn summarize(g: &std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
+        if g.is_empty() {
+            None
+        } else {
+            let reasons = g
+                .values()
+                .fold(enumset::EnumSet::empty(), |acc, next| acc.union(*next));
+            Some(BlockingReasons {
+                timelines: g.len(),
+                reasons,
+            })
+        }
+    }
+}
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 9e021c7e35..1344fe4192 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -800,6 +800,123 @@ impl RemoteTimelineClient {
             .context("wait completion")
     }
 
+    /// Adds a gc blocking reason for this timeline if one does not exist already.
+    ///
+    /// A retryable step of timeline detach ancestor.
+    ///
+    /// Returns a future which waits until the completion of the upload.
+    pub(crate) fn schedule_insert_gc_block_reason(
+        self: &Arc<Self>,
+        reason: index::GcBlockingReason,
+    ) -> Result<impl std::future::Future<Output = Result<(), WaitCompletionError>>, NotInitialized>
+    {
+        let maybe_barrier = {
+            let mut guard = self.upload_queue.lock().unwrap();
+            let upload_queue = guard.initialized_mut()?;
+
+            if let index::GcBlockingReason::DetachAncestor = reason {
+                if upload_queue.dirty.metadata.ancestor_timeline().is_none() {
+                    drop(guard);
+                    panic!("cannot start detach ancestor if there is nothing to detach from");
+                }
+            }
+
+            let wanted = |x: Option<&index::GcBlocking>| x.is_some_and(|x| x.blocked_by(reason));
+
+            let current = upload_queue.dirty.gc_blocking.as_ref();
+            let uploaded = upload_queue.clean.0.gc_blocking.as_ref();
+
+            match (current, uploaded) {
+                (x, y) if wanted(x) && wanted(y) => None,
+                (x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)),
+                // Usual case: !wanted(x) && !wanted(y)
+                //
+                // Unusual: !wanted(x) && wanted(y) which means we have two processes waiting to
+                // turn on and off some reason.
+                (x, y) => {
+                    if !wanted(x) && wanted(y) {
+                        // this could be avoided by having external in-memory synchronization, like
+                        // timeline detach ancestor
+                        warn!(?reason, op="insert", "unexpected: two racing processes to enable and disable a gc blocking reason");
+                    }
+
+                    // at this point, the metadata must always show that there is a parent
+                    upload_queue.dirty.gc_blocking = current
+                        .map(|x| x.with_reason(reason))
+                        .or_else(|| Some(index::GcBlocking::started_now_for(reason)));
+                    self.schedule_index_upload(upload_queue)?;
+                    Some(self.schedule_barrier0(upload_queue))
+                }
+            }
+        };
+
+        Ok(async move {
+            if let Some(barrier) = maybe_barrier {
+                Self::wait_completion0(barrier).await?;
+            }
+            Ok(())
+        })
+    }
+
+    /// Removes a gc blocking reason for this timeline if one exists.
+    ///
+    /// A retryable step of timeline detach ancestor.
+    ///
+    /// Returns a future which waits until the completion of the upload.
+    pub(crate) fn schedule_remove_gc_block_reason(
+        self: &Arc<Self>,
+        reason: index::GcBlockingReason,
+    ) -> Result<impl std::future::Future<Output = Result<(), WaitCompletionError>>, NotInitialized>
+    {
+        let maybe_barrier = {
+            let mut guard = self.upload_queue.lock().unwrap();
+            let upload_queue = guard.initialized_mut()?;
+
+            if let index::GcBlockingReason::DetachAncestor = reason {
+                if !upload_queue
+                    .clean
+                    .0
+                    .lineage
+                    .is_detached_from_original_ancestor()
+                {
+                    drop(guard);
+                    panic!("cannot complete timeline_ancestor_detach while not detached");
+                }
+            }
+
+            let wanted = |x: Option<&index::GcBlocking>| {
+                x.is_none() || x.is_some_and(|b| !b.blocked_by(reason))
+            };
+
+            let current = upload_queue.dirty.gc_blocking.as_ref();
+            let uploaded = upload_queue.clean.0.gc_blocking.as_ref();
+
+            match (current, uploaded) {
+                (x, y) if wanted(x) && wanted(y) => None,
+                (x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)),
+                (x, y) => {
+                    if !wanted(x) && wanted(y) {
+                        warn!(?reason, op="remove", "unexpected: two racing processes to enable and disable a gc blocking reason (remove)");
+                    }
+
+                    upload_queue.dirty.gc_blocking =
+                        current.as_ref().and_then(|x| x.without_reason(reason));
+                    assert!(wanted(upload_queue.dirty.gc_blocking.as_ref()));
+                    // FIXME: bogus ?
+                    self.schedule_index_upload(upload_queue)?;
+                    Some(self.schedule_barrier0(upload_queue))
+                }
+            }
+        };
+
+        Ok(async move {
+            if let Some(barrier) = maybe_barrier {
+                Self::wait_completion0(barrier).await?;
+            }
+            Ok(())
+        })
+    }
+
     /// Launch an upload operation in the background; the file is added to be included in next
     /// `index_part.json` upload.
pub(crate) fn schedule_layer_file_upload( diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 3075df022e..8e6290030d 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -60,6 +60,9 @@ pub struct IndexPart { #[serde(default)] pub(crate) lineage: Lineage, + #[serde(skip_serializing_if = "Option::is_none", default)] + pub(crate) gc_blocking: Option, + /// Describes the kind of aux files stored in the timeline. /// /// The value is modified during file ingestion when the latest wanted value communicated via tenant config is applied if it is acceptable. @@ -101,6 +104,7 @@ impl IndexPart { deleted_at: None, archived_at: None, lineage: Default::default(), + gc_blocking: None, last_aux_file_policy: None, } } @@ -251,6 +255,64 @@ impl Lineage { } } +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub(crate) struct GcBlocking { + pub(crate) started_at: NaiveDateTime, + pub(crate) reasons: enumset::EnumSet, +} + +#[derive(Debug, enumset::EnumSetType, serde::Serialize, serde::Deserialize)] +#[enumset(serialize_repr = "list")] +pub(crate) enum GcBlockingReason { + Manual, + DetachAncestor, +} + +impl GcBlocking { + pub(super) fn started_now_for(reason: GcBlockingReason) -> Self { + GcBlocking { + started_at: chrono::Utc::now().naive_utc(), + reasons: enumset::EnumSet::only(reason), + } + } + + /// Returns true if the given reason is one of the reasons why the gc is blocked. + pub(crate) fn blocked_by(&self, reason: GcBlockingReason) -> bool { + self.reasons.contains(reason) + } + + /// Returns a version of self with the given reason. + pub(super) fn with_reason(&self, reason: GcBlockingReason) -> Self { + assert!(!self.blocked_by(reason)); + let mut reasons = self.reasons; + reasons.insert(reason); + + Self { + started_at: self.started_at, + reasons, + } + } + + /// Returns a version of self without the given reason. Assumption is that if + /// there are no more reasons, we can unblock the gc by returning `None`. 
+ pub(super) fn without_reason(&self, reason: GcBlockingReason) -> Option { + assert!(self.blocked_by(reason)); + + if self.reasons.len() == 1 { + None + } else { + let mut reasons = self.reasons; + assert!(reasons.remove(reason)); + assert!(!reasons.is_empty()); + + Some(Self { + started_at: self.started_at, + reasons, + }) + } + } +} + #[cfg(test)] mod tests { use super::*; @@ -292,6 +354,7 @@ mod tests { deleted_at: None, archived_at: None, lineage: Lineage::default(), + gc_blocking: None, last_aux_file_policy: None, }; @@ -335,6 +398,7 @@ mod tests { deleted_at: None, archived_at: None, lineage: Lineage::default(), + gc_blocking: None, last_aux_file_policy: None, }; @@ -379,6 +443,7 @@ mod tests { deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), archived_at: None, lineage: Lineage::default(), + gc_blocking: None, last_aux_file_policy: None, }; @@ -426,6 +491,7 @@ mod tests { deleted_at: None, archived_at: None, lineage: Lineage::default(), + gc_blocking: None, last_aux_file_policy: None, }; @@ -468,6 +534,7 @@ mod tests { deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), archived_at: None, lineage: Lineage::default(), + gc_blocking: None, last_aux_file_policy: None, }; @@ -513,6 +580,7 @@ mod tests { reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()], original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))), }, + gc_blocking: None, last_aux_file_policy: None, }; @@ -563,6 +631,7 @@ mod tests { reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()], original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))), }, + gc_blocking: None, last_aux_file_policy: Some(AuxFilePolicy::V2), }; @@ -618,6 +687,7 @@ mod tests { deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), archived_at: None, lineage: Default::default(), + gc_blocking: None, last_aux_file_policy: Default::default(), }; @@ -674,6 +744,7 @@ mod tests { deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), archived_at: Some(parse_naive_datetime("2023-04-29T09:00:00.123000000")), lineage: Default::default(), + gc_blocking: None, last_aux_file_policy: Default::default(), }; @@ -681,6 +752,68 @@ mod tests { assert_eq!(part, expected); } + #[test] + fn v9_indexpart_is_parsed() { + let example = r#"{ + "version": 9, + "layer_metadata":{ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } + }, + "disk_consistent_lsn":"0/16960E8", + "metadata": { + "disk_consistent_lsn": "0/16960E8", + "prev_record_lsn": "0/1696070", + "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e", + "ancestor_lsn": "0/0", + "latest_gc_cutoff_lsn": "0/1696070", + "initdb_lsn": "0/1696070", + "pg_version": 14 + }, + "gc_blocking": { + "started_at": "2024-07-19T09:00:00.123", + "reasons": ["DetachAncestor"] + } + }"#; + + let expected = IndexPart { + version: 9, + layer_metadata: HashMap::from([ + 
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { + file_size: 25600000, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }), + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { + file_size: 9007199254741001, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }) + ]), + disk_consistent_lsn: "0/16960E8".parse::().unwrap(), + metadata: TimelineMetadata::new( + Lsn::from_str("0/16960E8").unwrap(), + Some(Lsn::from_str("0/1696070").unwrap()), + Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()), + Lsn::INVALID, + Lsn::from_str("0/1696070").unwrap(), + Lsn::from_str("0/1696070").unwrap(), + 14, + ).with_recalculated_checksum().unwrap(), + deleted_at: None, + lineage: Default::default(), + gc_blocking: Some(GcBlocking { + started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"), + reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]), + }), + last_aux_file_policy: Default::default(), + archived_at: None, + }; + + let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + assert_eq!(part, expected); + } + fn parse_naive_datetime(s: &str) -> NaiveDateTime { chrono::NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S.%f").unwrap() } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 05bf4eac8b..79bfd1ebb2 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -5698,6 +5698,22 @@ impl Timeline { } } + /// Persistently blocks gc for `Manual` reason. + /// + /// Returns true if no such block existed before, false otherwise. + pub(crate) async fn block_gc(&self, tenant: &super::Tenant) -> anyhow::Result { + use crate::tenant::remote_timeline_client::index::GcBlockingReason; + assert_eq!(self.tenant_shard_id, tenant.tenant_shard_id); + tenant.gc_block.insert(self, GcBlockingReason::Manual).await + } + + /// Persistently unblocks gc for `Manual` reason. + pub(crate) async fn unblock_gc(&self, tenant: &super::Tenant) -> anyhow::Result<()> { + use crate::tenant::remote_timeline_client::index::GcBlockingReason; + assert_eq!(self.tenant_shard_id, tenant.tenant_shard_id); + tenant.gc_block.remove(self, GcBlockingReason::Manual).await + } + #[cfg(test)] pub(super) fn force_advance_lsn(self: &Arc, new_lsn: Lsn) { self.last_record_lsn.advance(new_lsn); diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 05178c38b4..b03dbb092e 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -230,6 +230,8 @@ impl DeleteTimelineFlow { // Now that the Timeline is in Stopping state, request all the related tasks to shut down. 
timeline.shutdown(super::ShutdownMode::Hard).await; + tenant.gc_block.before_delete(&timeline); + fail::fail_point!("timeline-delete-before-index-deleted-at", |_| { Err(anyhow::anyhow!( "failpoint: timeline-delete-before-index-deleted-at" diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 192324f086..61e2204b23 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -556,6 +556,22 @@ class PageserverHttpClient(requests.Session, MetricsGetter): assert isinstance(res_json, dict) return res_json + def timeline_block_gc(self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId): + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/block_gc", + ) + log.info(f"Got GC request response code: {res.status_code}") + self.verbose_error(res) + + def timeline_unblock_gc( + self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId + ): + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/unblock_gc", + ) + log.info(f"Got GC request response code: {res.status_code}") + self.verbose_error(res) + def timeline_compact( self, tenant_id: Union[TenantId, TenantShardId], diff --git a/test_runner/regress/test_timeline_gc_blocking.py b/test_runner/regress/test_timeline_gc_blocking.py new file mode 100644 index 0000000000..24de894687 --- /dev/null +++ b/test_runner/regress/test_timeline_gc_blocking.py @@ -0,0 +1,67 @@ +import time + +from fixtures.neon_fixtures import ( + NeonEnvBuilder, +) +from fixtures.pageserver.utils import wait_timeline_detail_404 + + +def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start( + initial_tenant_conf={"gc_period": "1s", "lsn_lease_length": "0s"} + ) + ps = env.pageserver + http = ps.http_client() + + foo_branch = env.neon_cli.create_branch("foo", "main", env.initial_tenant) + + gc_active_line = ".* gc_loop.*: [12] timelines need GC" + gc_skipped_line = ".* gc_loop.*: Skipping GC: .*" + init_gc_skipped = ".*: initialized with gc blocked.*" + + tenant_before = http.tenant_status(env.initial_tenant) + + wait_for_another_gc_round() + _, offset = ps.assert_log_contains(gc_active_line) + + assert ps.log_contains(gc_skipped_line, offset) is None + + http.timeline_block_gc(env.initial_tenant, foo_branch) + + tenant_after = http.tenant_status(env.initial_tenant) + assert tenant_before != tenant_after + gc_blocking = tenant_after["gc_blocking"] + assert gc_blocking == "BlockingReasons { timelines: 1, reasons: EnumSet(Manual) }" + + wait_for_another_gc_round() + _, offset = ps.assert_log_contains(gc_skipped_line, offset) + + ps.restart() + ps.quiesce_tenants() + + _, offset = env.pageserver.assert_log_contains(init_gc_skipped, offset) + + wait_for_another_gc_round() + _, offset = ps.assert_log_contains(gc_skipped_line, offset) + + # deletion unblocks gc + http.timeline_delete(env.initial_tenant, foo_branch) + wait_timeline_detail_404(http, env.initial_tenant, foo_branch, 10, 1.0) + + wait_for_another_gc_round() + _, offset = ps.assert_log_contains(gc_active_line, offset) + + http.timeline_block_gc(env.initial_tenant, env.initial_timeline) + + wait_for_another_gc_round() + _, offset = ps.assert_log_contains(gc_skipped_line, offset) + + # removing the manual block also unblocks gc + http.timeline_unblock_gc(env.initial_tenant, env.initial_timeline) + + wait_for_another_gc_round() + _, offset = ps.assert_log_contains(gc_active_line, 
offset) + + +def wait_for_another_gc_round(): + time.sleep(2) From 44fedfd6c3502d6bd3ceb4a2bf9e55b1f5727327 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 6 Aug 2024 10:14:01 +0100 Subject: [PATCH 1342/1571] pageserver: remove legacy read path (#8601) ## Problem We have been maintaining two read paths (legacy and vectored) for a while now. The legacy read-path was only used for cross validation in some tests. ## Summary of changes * Tweak all tests that were using the legacy read path to use the vectored read path instead * Remove the read path dispatching based on the pageserver configs * Remove the legacy read path code We will be able to remove the single blob io code in `pageserver/src/tenant/blob_io.rs` when https://github.com/neondatabase/neon/issues/7386 is complete. Closes https://github.com/neondatabase/neon/issues/8005 --- pageserver/src/tenant.rs | 53 +- pageserver/src/tenant/storage_layer.rs | 15 - .../src/tenant/storage_layer/delta_layer.rs | 91 +-- .../src/tenant/storage_layer/image_layer.rs | 44 +- .../tenant/storage_layer/inmemory_layer.rs | 76 +-- pageserver/src/tenant/storage_layer/layer.rs | 73 +-- .../src/tenant/storage_layer/layer/tests.rs | 38 +- pageserver/src/tenant/timeline.rs | 581 ++---------------- 8 files changed, 121 insertions(+), 850 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index de9b55d847..989ed0d4eb 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -4122,7 +4122,7 @@ pub(crate) mod harness { #[cfg(test)] mod tests { - use std::collections::BTreeMap; + use std::collections::{BTreeMap, BTreeSet}; use super::*; use crate::keyspace::KeySpaceAccum; @@ -4797,7 +4797,7 @@ mod tests { lsn: Lsn, repeat: usize, key_count: usize, - ) -> anyhow::Result<()> { + ) -> anyhow::Result>> { let compact = true; bulk_insert_maybe_compact_gc(tenant, timeline, ctx, lsn, repeat, key_count, compact).await } @@ -4810,7 +4810,9 @@ mod tests { repeat: usize, key_count: usize, compact: bool, - ) -> anyhow::Result<()> { + ) -> anyhow::Result>> { + let mut inserted: HashMap> = Default::default(); + let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let mut blknum = 0; @@ -4831,6 +4833,7 @@ mod tests { ctx, ) .await?; + inserted.entry(test_key).or_default().insert(lsn); writer.finish_write(lsn); drop(writer); @@ -4855,7 +4858,7 @@ mod tests { assert_eq!(res.layers_removed, 0, "this never removes anything"); } - Ok(()) + Ok(inserted) } // @@ -4902,7 +4905,7 @@ mod tests { .await?; let lsn = Lsn(0x10); - bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?; + let inserted = bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?; let guard = tline.layers.read().await; guard.layer_map().dump(true, &ctx).await?; @@ -4963,9 +4966,39 @@ mod tests { &ctx, ) .await; - tline - .validate_get_vectored_impl(&vectored_res, read, reads_lsn, &ctx) - .await; + + let mut expected_lsns: HashMap = Default::default(); + let mut expect_missing = false; + let mut key = read.start().unwrap(); + while key != read.end().unwrap() { + if let Some(lsns) = inserted.get(&key) { + let expected_lsn = lsns.iter().rfind(|lsn| **lsn <= reads_lsn); + match expected_lsn { + Some(lsn) => { + expected_lsns.insert(key, *lsn); + } + None => { + expect_missing = true; + break; + } + } + } else { + expect_missing = true; + break; + } + + key = key.next(); + } + + if expect_missing { + assert!(matches!(vectored_res, Err(GetVectoredError::MissingKey(_)))); + } else { + for (key, image) in vectored_res? 
{ + let expected_lsn = expected_lsns.get(&key).expect("determined above"); + let expected_image = test_img(&format!("{} at {}", key.field6, expected_lsn)); + assert_eq!(image?, expected_image); + } + } } Ok(()) @@ -5015,10 +5048,6 @@ mod tests { ) .await; - child_timeline - .validate_get_vectored_impl(&vectored_res, aux_keyspace, read_lsn, &ctx) - .await; - let images = vectored_res?; assert!(images.is_empty()); Ok(()) diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 59d3e1ce09..ab32a6035e 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -435,21 +435,6 @@ impl ReadableLayer { } } -/// Return value from [`Layer::get_value_reconstruct_data`] -#[derive(Clone, Copy, Debug)] -pub enum ValueReconstructResult { - /// Got all the data needed to reconstruct the requested page - Complete, - /// This layer didn't contain all the required data, the caller should look up - /// the predecessor layer at the returned LSN and collect more data from there. - Continue, - - /// This layer didn't contain data needed to reconstruct the page version at - /// the returned LSN. This is usually considered an error, but might be OK - /// in some circumstances. - Missing, -} - /// Layers contain a hint indicating whether they are likely to be used for reads. This is a hint rather /// than an authoritative value, so that we do not have to update it synchronously when changing the visibility /// of layers (for example when creating a branch that makes some previously covered layers visible). It should diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index e50fc2a266..a17dd28547 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -36,7 +36,7 @@ use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, Fi use crate::tenant::disk_btree::{ DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection, }; -use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; +use crate::tenant::storage_layer::Layer; use crate::tenant::timeline::GetVectoredError; use crate::tenant::vectored_blob_io::{ BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, @@ -826,95 +826,6 @@ impl DeltaLayerInner { }) } - pub(super) async fn get_value_reconstruct_data( - &self, - key: Key, - lsn_range: Range, - reconstruct_state: &mut ValueReconstructState, - ctx: &RequestContext, - ) -> anyhow::Result { - let mut need_image = true; - // Scan the page versions backwards, starting from `lsn`. 
- let block_reader = FileBlockReader::new(&self.file, self.file_id); - let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( - self.index_start_blk, - self.index_root_blk, - &block_reader, - ); - let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1)); - - let mut offsets: Vec<(Lsn, u64)> = Vec::new(); - - tree_reader - .visit( - &search_key.0, - VisitDirection::Backwards, - |key, value| { - let blob_ref = BlobRef(value); - if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] { - return false; - } - let entry_lsn = DeltaKey::extract_lsn_from_buf(key); - if entry_lsn < lsn_range.start { - return false; - } - offsets.push((entry_lsn, blob_ref.pos())); - - !blob_ref.will_init() - }, - &RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::DeltaLayerBtreeNode) - .build(), - ) - .await?; - - let ctx = &RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::DeltaLayerValue) - .build(); - - // Ok, 'offsets' now contains the offsets of all the entries we need to read - let cursor = block_reader.block_cursor(); - let mut buf = Vec::new(); - for (entry_lsn, pos) in offsets { - cursor - .read_blob_into_buf(pos, &mut buf, ctx) - .await - .with_context(|| { - format!("Failed to read blob from virtual file {}", self.file.path) - })?; - let val = Value::des(&buf).with_context(|| { - format!( - "Failed to deserialize file blob from virtual file {}", - self.file.path - ) - })?; - match val { - Value::Image(img) => { - reconstruct_state.img = Some((entry_lsn, img)); - need_image = false; - break; - } - Value::WalRecord(rec) => { - let will_init = rec.will_init(); - reconstruct_state.records.push((entry_lsn, rec)); - if will_init { - // This WAL record initializes the page, so no need to go further back - need_image = false; - break; - } - } - } - } - - // If an older page image is needed to reconstruct the page, let the - // caller know. - if need_image { - Ok(ValueReconstructResult::Continue) - } else { - Ok(ValueReconstructResult::Complete) - } - } - // Look up the keys in the provided keyspace and update // the reconstruct state with whatever is found. 
// diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index f4f48aaf16..b2173455ab 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -32,9 +32,7 @@ use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader}; use crate::tenant::disk_btree::{ DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection, }; -use crate::tenant::storage_layer::{ - LayerAccessStats, ValueReconstructResult, ValueReconstructState, -}; +use crate::tenant::storage_layer::LayerAccessStats; use crate::tenant::timeline::GetVectoredError; use crate::tenant::vectored_blob_io::{ BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, @@ -429,46 +427,6 @@ impl ImageLayerInner { }) } - pub(super) async fn get_value_reconstruct_data( - &self, - key: Key, - reconstruct_state: &mut ValueReconstructState, - ctx: &RequestContext, - ) -> anyhow::Result { - let block_reader = FileBlockReader::new(&self.file, self.file_id); - let tree_reader = - DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader); - - let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; - key.write_to_byte_slice(&mut keybuf); - if let Some(offset) = tree_reader - .get( - &keybuf, - &RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::ImageLayerBtreeNode) - .build(), - ) - .await? - { - let blob = block_reader - .block_cursor() - .read_blob( - offset, - &RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::ImageLayerValue) - .build(), - ) - .await - .with_context(|| format!("failed to read value from offset {}", offset))?; - let value = Bytes::from(blob); - - reconstruct_state.img = Some((self.lsn, value)); - Ok(ValueReconstructResult::Complete) - } else { - Ok(ValueReconstructResult::Missing) - } - } - // Look up the keys in the provided keyspace and update // the reconstruct state with whatever is found. 
pub(super) async fn get_values_reconstruct_data( diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index f9010ae8a6..6abc89c2ed 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -10,11 +10,10 @@ use crate::page_cache::PAGE_SZ; use crate::repository::{Key, Value}; use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef}; use crate::tenant::ephemeral_file::EphemeralFile; -use crate::tenant::storage_layer::ValueReconstructResult; use crate::tenant::timeline::GetVectoredError; use crate::tenant::{PageReconstructError, Timeline}; use crate::{l0_flush, page_cache, walrecord}; -use anyhow::{anyhow, ensure, Result}; +use anyhow::{anyhow, Result}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; @@ -33,10 +32,7 @@ use std::sync::atomic::Ordering as AtomicOrdering; use std::sync::atomic::{AtomicU64, AtomicUsize}; use tokio::sync::{RwLock, RwLockWriteGuard}; -use super::{ - DeltaLayerWriter, ResidentLayer, ValueReconstructSituation, ValueReconstructState, - ValuesReconstructState, -}; +use super::{DeltaLayerWriter, ResidentLayer, ValueReconstructSituation, ValuesReconstructState}; #[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)] pub(crate) struct InMemoryLayerFileId(page_cache::FileId); @@ -55,9 +51,6 @@ pub struct InMemoryLayer { /// Writes are only allowed when this is `None`. pub(crate) end_lsn: OnceLock, - /// Used for traversal path. Cached representation of the in-memory layer before frozen. - local_path_str: Arc, - /// Used for traversal path. Cached representation of the in-memory layer after frozen. frozen_local_path_str: OnceLock>, @@ -248,12 +241,6 @@ impl InMemoryLayer { self.start_lsn..self.end_lsn_or_max() } - pub(crate) fn local_path_str(&self) -> &Arc { - self.frozen_local_path_str - .get() - .unwrap_or(&self.local_path_str) - } - /// debugging function to print out the contents of the layer /// /// this is likely completly unused @@ -303,60 +290,6 @@ impl InMemoryLayer { Ok(()) } - /// Look up given value in the layer. - pub(crate) async fn get_value_reconstruct_data( - &self, - key: Key, - lsn_range: Range, - reconstruct_state: &mut ValueReconstructState, - ctx: &RequestContext, - ) -> anyhow::Result { - ensure!(lsn_range.start >= self.start_lsn); - let mut need_image = true; - - let ctx = RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::InMemoryLayer) - .build(); - - let inner = self.inner.read().await; - - let reader = inner.file.block_cursor(); - - // Scan the page versions backwards, starting from `lsn`. - if let Some(vec_map) = inner.index.get(&key) { - let slice = vec_map.slice_range(lsn_range); - for (entry_lsn, pos) in slice.iter().rev() { - let buf = reader.read_blob(*pos, &ctx).await?; - let value = Value::des(&buf)?; - match value { - Value::Image(img) => { - reconstruct_state.img = Some((*entry_lsn, img)); - return Ok(ValueReconstructResult::Complete); - } - Value::WalRecord(rec) => { - let will_init = rec.will_init(); - reconstruct_state.records.push((*entry_lsn, rec)); - if will_init { - // This WAL record initializes the page, so no need to go further back - need_image = false; - break; - } - } - } - } - } - - // release lock on 'inner' - - // If an older page image is needed to reconstruct the page, let the - // caller know. 
- if need_image { - Ok(ValueReconstructResult::Continue) - } else { - Ok(ValueReconstructResult::Complete) - } - } - // Look up the keys in the provided keyspace and update // the reconstruct state with whatever is found. // @@ -458,11 +391,6 @@ impl InMemoryLayer { Ok(InMemoryLayer { file_id: key, - local_path_str: { - let mut buf = String::new(); - inmem_layer_log_display(&mut buf, timeline_id, start_lsn, Lsn::MAX).unwrap(); - buf.into() - }, frozen_local_path_str: OnceLock::new(), conf, timeline_id, diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 5732779e44..cee2fe7342 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -24,8 +24,7 @@ use super::delta_layer::{self, DeltaEntry}; use super::image_layer::{self}; use super::{ AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName, - LayerVisibilityHint, PersistentLayerDesc, ValueReconstructResult, ValueReconstructState, - ValuesReconstructState, + LayerVisibilityHint, PersistentLayerDesc, ValuesReconstructState, }; use utils::generation::Generation; @@ -301,42 +300,6 @@ impl Layer { self.0.delete_on_drop(); } - /// Return data needed to reconstruct given page at LSN. - /// - /// It is up to the caller to collect more data from the previous layer and - /// perform WAL redo, if necessary. - /// - /// # Cancellation-Safety - /// - /// This method is cancellation-safe. - pub(crate) async fn get_value_reconstruct_data( - &self, - key: Key, - lsn_range: Range, - reconstruct_data: &mut ValueReconstructState, - ctx: &RequestContext, - ) -> anyhow::Result { - use anyhow::ensure; - - let layer = self.0.get_or_maybe_download(true, Some(ctx)).await?; - self.0.access_stats.record_access(ctx); - - if self.layer_desc().is_delta { - ensure!(lsn_range.start >= self.layer_desc().lsn_range.start); - ensure!(self.layer_desc().key_range.contains(&key)); - } else { - ensure!(self.layer_desc().key_range.contains(&key)); - ensure!(lsn_range.start >= self.layer_desc().image_layer_lsn()); - ensure!(lsn_range.end >= self.layer_desc().image_layer_lsn()); - } - - layer - .get_value_reconstruct_data(key, lsn_range, reconstruct_data, &self.0, ctx) - .instrument(tracing::debug_span!("get_value_reconstruct_data", layer=%self)) - .await - .with_context(|| format!("get_value_reconstruct_data for layer {self}")) - } - pub(crate) async fn get_values_reconstruct_data( &self, keyspace: KeySpace, @@ -441,10 +404,6 @@ impl Layer { &self.0.path } - pub(crate) fn debug_str(&self) -> &Arc { - &self.0.debug_str - } - pub(crate) fn metadata(&self) -> LayerFileMetadata { self.0.metadata() } @@ -519,7 +478,7 @@ impl Layer { /// /// However when we want something evicted, we cannot evict it right away as there might be current /// reads happening on it. For example: it has been searched from [`LayerMap::search`] but not yet -/// read with [`Layer::get_value_reconstruct_data`]. +/// read with [`Layer::get_values_reconstruct_data`]. /// /// [`LayerMap::search`]: crate::tenant::layer_map::LayerMap::search #[derive(Debug)] @@ -600,9 +559,6 @@ struct LayerInner { /// Full path to the file; unclear if this should exist anymore. path: Utf8PathBuf, - /// String representation of the layer, used for traversal id. - debug_str: Arc, - desc: PersistentLayerDesc, /// Timeline access is needed for remote timeline client and metrics. 
@@ -836,9 +792,6 @@ impl LayerInner { LayerInner { conf, - debug_str: { - format!("timelines/{}/{}", timeline.timeline_id, desc.layer_name()).into() - }, path: local_path, desc, timeline: Arc::downgrade(timeline), @@ -1759,28 +1712,6 @@ impl DownloadedLayer { .map_err(|e| anyhow::anyhow!("layer load failed earlier: {e}")) } - async fn get_value_reconstruct_data( - &self, - key: Key, - lsn_range: Range, - reconstruct_data: &mut ValueReconstructState, - owner: &Arc, - ctx: &RequestContext, - ) -> anyhow::Result { - use LayerKind::*; - - match self.get(owner, ctx).await? { - Delta(d) => { - d.get_value_reconstruct_data(key, lsn_range, reconstruct_data, ctx) - .await - } - Image(i) => { - i.get_value_reconstruct_data(key, reconstruct_data, ctx) - .await - } - } - } - async fn get_values_reconstruct_data( &self, keyspace: KeySpace, diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index 423cde001c..6b0d5f09ff 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -50,13 +50,26 @@ async fn smoke_test() { // all layers created at pageserver are like `layer`, initialized with strong // Arc. + let controlfile_keyspace = KeySpace { + ranges: vec![CONTROLFILE_KEY..CONTROLFILE_KEY.next()], + }; + let img_before = { - let mut data = ValueReconstructState::default(); + let mut data = ValuesReconstructState::default(); layer - .get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx) + .get_values_reconstruct_data( + controlfile_keyspace.clone(), + Lsn(0x10)..Lsn(0x11), + &mut data, + &ctx, + ) .await .unwrap(); - data.img + data.keys + .remove(&CONTROLFILE_KEY) + .expect("must be present") + .expect("should not error") + .img .take() .expect("tenant harness writes the control file") }; @@ -74,13 +87,24 @@ async fn smoke_test() { // on accesses when the layer is evicted, it will automatically be downloaded. 
let img_after = { - let mut data = ValueReconstructState::default(); + let mut data = ValuesReconstructState::default(); layer - .get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx) + .get_values_reconstruct_data( + controlfile_keyspace.clone(), + Lsn(0x10)..Lsn(0x11), + &mut data, + &ctx, + ) .instrument(download_span.clone()) .await .unwrap(); - data.img.take().unwrap() + data.keys + .remove(&CONTROLFILE_KEY) + .expect("must be present") + .expect("should not error") + .img + .take() + .expect("tenant harness writes the control file") }; assert_eq!(img_before, img_after); @@ -830,7 +854,7 @@ async fn eviction_cancellation_on_drop() { fn layer_size() { assert_eq!(size_of::(), 8); assert_eq!(size_of::(), 104); - assert_eq!(size_of::(), 312); + assert_eq!(size_of::(), 296); // it also has the utf8 path } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 79bfd1ebb2..5a02fd4a4c 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -22,8 +22,8 @@ use handle::ShardTimelineId; use once_cell::sync::Lazy; use pageserver_api::{ key::{ - AUX_FILES_KEY, KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, - NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE, + KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE, + NON_INHERITED_SPARSE_RANGE, }, keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning}, models::{ @@ -59,10 +59,7 @@ use std::{ collections::{BTreeMap, HashMap, HashSet}, sync::atomic::AtomicU64, }; -use std::{ - cmp::{max, min}, - ops::ControlFlow, -}; +use std::{cmp::min, ops::ControlFlow}; use std::{ collections::btree_map::Entry, ops::{Deref, Range}, @@ -87,8 +84,8 @@ use crate::{ disk_usage_eviction_task::finite_f32, tenant::storage_layer::{ AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer, - LayerAccessStatsReset, LayerName, ResidentLayer, ValueReconstructResult, - ValueReconstructState, ValuesReconstructState, + LayerAccessStatsReset, LayerName, ResidentLayer, ValueReconstructState, + ValuesReconstructState, }, }; use crate::{ @@ -543,7 +540,6 @@ pub struct MissingKeyError { cont_lsn: Lsn, request_lsn: Lsn, ancestor_lsn: Option, - traversal_path: Vec, backtrace: Option, } @@ -564,18 +560,6 @@ impl std::fmt::Display for MissingKeyError { write!(f, ", ancestor {}", ancestor_lsn)?; } - if !self.traversal_path.is_empty() { - writeln!(f)?; - } - - for (r, c, l) in &self.traversal_path { - writeln!( - f, - "layer traversal: result {:?}, cont_lsn {}, layer: {}", - r, c, l, - )?; - } - if let Some(ref backtrace) = self.backtrace { write!(f, "\n{}", backtrace)?; } @@ -918,119 +902,44 @@ impl Timeline { self.timeline_get_throttle.throttle(ctx, 1).await; - match self.conf.get_impl { - GetImpl::Legacy => { - let reconstruct_state = ValueReconstructState { - records: Vec::new(), - img: None, - }; + let keyspace = KeySpace { + ranges: vec![key..key.next()], + }; - self.get_impl(key, lsn, reconstruct_state, ctx).await - } - GetImpl::Vectored => { - let keyspace = KeySpace { - ranges: vec![key..key.next()], - }; + // Initialise the reconstruct state for the key with the cache + // entry returned above. + let mut reconstruct_state = ValuesReconstructState::new(); - // Initialise the reconstruct state for the key with the cache - // entry returned above. 
- let mut reconstruct_state = ValuesReconstructState::new(); + let vectored_res = self + .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx) + .await; - let vectored_res = self - .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx) - .await; - - if self.conf.validate_vectored_get { - self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx) - .await; - } - - let key_value = vectored_res?.pop_first(); - match key_value { - Some((got_key, value)) => { - if got_key != key { - error!( - "Expected {}, but singular vectored get returned {}", - key, got_key - ); - Err(PageReconstructError::Other(anyhow!( - "Singular vectored get returned wrong key" - ))) - } else { - value - } - } - None => Err(PageReconstructError::MissingKey(MissingKeyError { - key, - shard: self.shard_identity.get_shard_number(&key), - cont_lsn: Lsn(0), - request_lsn: lsn, - ancestor_lsn: None, - traversal_path: Vec::new(), - backtrace: None, - })), + let key_value = vectored_res?.pop_first(); + match key_value { + Some((got_key, value)) => { + if got_key != key { + error!( + "Expected {}, but singular vectored get returned {}", + key, got_key + ); + Err(PageReconstructError::Other(anyhow!( + "Singular vectored get returned wrong key" + ))) + } else { + value } } + None => Err(PageReconstructError::MissingKey(MissingKeyError { + key, + shard: self.shard_identity.get_shard_number(&key), + cont_lsn: Lsn(0), + request_lsn: lsn, + ancestor_lsn: None, + backtrace: None, + })), } } - /// Not subject to [`Self::timeline_get_throttle`]. - async fn get_impl( - &self, - key: Key, - lsn: Lsn, - mut reconstruct_state: ValueReconstructState, - ctx: &RequestContext, - ) -> Result { - // XXX: structured stats collection for layer eviction here. - trace!( - "get page request for {}@{} from task kind {:?}", - key, - lsn, - ctx.task_kind() - ); - - let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME - .for_get_kind(GetKind::Singular) - .start_timer(); - let path = self - .get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx) - .await?; - timer.stop_and_record(); - - let start = Instant::now(); - let res = self.reconstruct_value(key, lsn, reconstruct_state).await; - let elapsed = start.elapsed(); - crate::metrics::RECONSTRUCT_TIME - .for_get_kind(GetKind::Singular) - .observe(elapsed.as_secs_f64()); - - if cfg!(feature = "testing") - && res.is_err() - && !matches!(res, Err(PageReconstructError::Cancelled)) - { - // it can only be walredo issue - use std::fmt::Write; - - let mut msg = String::new(); - - path.into_iter().for_each(|(res, cont_lsn, layer)| { - writeln!( - msg, - "- layer traversal: result {res:?}, cont_lsn {cont_lsn}, layer: {}", - layer, - ) - .expect("string grows") - }); - - // this is to rule out or provide evidence that we could in some cases read a duplicate - // walrecord - tracing::info!("walredo failed, path:\n{msg}"); - } - - res - } - pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32; pub(crate) const VEC_GET_LAYERS_VISITED_WARN_THRESH: f64 = 512.0; @@ -1080,28 +989,14 @@ impl Timeline { .throttle(ctx, key_count as usize) .await; - let res = match self.conf.get_vectored_impl { - GetVectoredImpl::Sequential => { - self.get_vectored_sequential_impl(keyspace, lsn, ctx).await - } - GetVectoredImpl::Vectored => { - let vectored_res = self - .get_vectored_impl( - keyspace.clone(), - lsn, - &mut ValuesReconstructState::new(), - ctx, - ) - .await; - - if self.conf.validate_vectored_get { - self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx) - .await; - } - - 
vectored_res - } - }; + let res = self + .get_vectored_impl( + keyspace.clone(), + lsn, + &mut ValuesReconstructState::new(), + ctx, + ) + .await; if let Some((metric, start)) = start { let elapsed = start.elapsed(); @@ -1190,65 +1085,6 @@ impl Timeline { vectored_res } - /// Not subject to [`Self::timeline_get_throttle`]. - pub(super) async fn get_vectored_sequential_impl( - &self, - keyspace: KeySpace, - lsn: Lsn, - ctx: &RequestContext, - ) -> Result>, GetVectoredError> { - let mut values = BTreeMap::new(); - - for range in keyspace.ranges { - let mut key = range.start; - while key != range.end { - let block = self - .get_impl(key, lsn, ValueReconstructState::default(), ctx) - .await; - - use PageReconstructError::*; - match block { - Err(Cancelled) => return Err(GetVectoredError::Cancelled), - Err(MissingKey(_)) - if NON_INHERITED_RANGE.contains(&key) - || NON_INHERITED_SPARSE_RANGE.contains(&key) => - { - // Ignore missing key error for aux key range. TODO: currently, we assume non_inherited_range == aux_key_range. - // When we add more types of keys into the page server, we should revisit this part of code and throw errors - // accordingly. - key = key.next(); - } - Err(MissingKey(err)) => { - return Err(GetVectoredError::MissingKey(err)); - } - Err(Other(err)) - if err - .to_string() - .contains("downloading evicted layer file failed") => - { - return Err(GetVectoredError::Other(err)) - } - Err(Other(err)) - if err - .chain() - .any(|cause| cause.to_string().contains("layer loading failed")) => - { - // The intent here is to achieve error parity with the vectored read path. - // When vectored read fails to load a layer it fails the whole read, hence - // we mimic this behaviour here to keep the validation happy. - return Err(GetVectoredError::Other(err)); - } - _ => { - values.insert(key, block); - key = key.next(); - } - } - } - } - - Ok(values) - } - pub(super) async fn get_vectored_impl( &self, keyspace: KeySpace, @@ -1319,113 +1155,6 @@ impl Timeline { Ok(results) } - /// Not subject to [`Self::timeline_get_throttle`]. - pub(super) async fn validate_get_vectored_impl( - &self, - vectored_res: &Result>, GetVectoredError>, - keyspace: KeySpace, - lsn: Lsn, - ctx: &RequestContext, - ) { - if keyspace.overlaps(&Key::metadata_key_range()) { - // skip validation for metadata key range - return; - } - - let sequential_res = self - .get_vectored_sequential_impl(keyspace.clone(), lsn, ctx) - .await; - - fn errors_match(lhs: &GetVectoredError, rhs: &GetVectoredError) -> bool { - use GetVectoredError::*; - match (lhs, rhs) { - (Oversized(l), Oversized(r)) => l == r, - (InvalidLsn(l), InvalidLsn(r)) => l == r, - (MissingKey(l), MissingKey(r)) => l.key == r.key, - (GetReadyAncestorError(_), GetReadyAncestorError(_)) => true, - (Other(_), Other(_)) => true, - _ => false, - } - } - - match (&sequential_res, vectored_res) { - (Err(GetVectoredError::Cancelled), _) => {}, - (_, Err(GetVectoredError::Cancelled)) => {}, - (Err(seq_err), Ok(_)) => { - panic!(concat!("Sequential get failed with {}, but vectored get did not", - " - keyspace={:?} lsn={}"), - seq_err, keyspace, lsn) }, - (Ok(_), Err(GetVectoredError::GetReadyAncestorError(GetReadyAncestorError::AncestorLsnTimeout(_)))) => { - // Sequential get runs after vectored get, so it is possible for the later - // to time out while waiting for its ancestor's Lsn to become ready and for the - // former to succeed (it essentially has a doubled wait time). 
- }, - (Ok(_), Err(vec_err)) => { - panic!(concat!("Vectored get failed with {}, but sequential get did not", - " - keyspace={:?} lsn={}"), - vec_err, keyspace, lsn) }, - (Err(seq_err), Err(vec_err)) => { - assert!(errors_match(seq_err, vec_err), - "Mismatched errors: {seq_err} != {vec_err} - keyspace={keyspace:?} lsn={lsn}")}, - (Ok(seq_values), Ok(vec_values)) => { - seq_values.iter().zip(vec_values.iter()).for_each(|((seq_key, seq_res), (vec_key, vec_res))| { - assert_eq!(seq_key, vec_key); - match (seq_res, vec_res) { - (Ok(seq_blob), Ok(vec_blob)) => { - Self::validate_key_equivalence(seq_key, &keyspace, lsn, seq_blob, vec_blob); - }, - (Err(err), Ok(_)) => { - panic!( - concat!("Sequential get failed with {} for key {}, but vectored get did not", - " - keyspace={:?} lsn={}"), - err, seq_key, keyspace, lsn) }, - (Ok(_), Err(err)) => { - panic!( - concat!("Vectored get failed with {} for key {}, but sequential get did not", - " - keyspace={:?} lsn={}"), - err, seq_key, keyspace, lsn) }, - (Err(_), Err(_)) => {} - } - }) - } - } - } - - fn validate_key_equivalence( - key: &Key, - keyspace: &KeySpace, - lsn: Lsn, - seq: &Bytes, - vec: &Bytes, - ) { - if *key == AUX_FILES_KEY { - // The value reconstruct of AUX_FILES_KEY from records is not deterministic - // since it uses a hash map under the hood. Hence, deserialise both results - // before comparing. - let seq_aux_dir_res = AuxFilesDirectory::des(seq); - let vec_aux_dir_res = AuxFilesDirectory::des(vec); - match (&seq_aux_dir_res, &vec_aux_dir_res) { - (Ok(seq_aux_dir), Ok(vec_aux_dir)) => { - assert_eq!( - seq_aux_dir, vec_aux_dir, - "Mismatch for key {} - keyspace={:?} lsn={}", - key, keyspace, lsn - ); - } - (Err(_), Err(_)) => {} - _ => { - panic!("Mismatch for {key}: {seq_aux_dir_res:?} != {vec_aux_dir_res:?}"); - } - } - } else { - // All other keys should reconstruct deterministically, so we simply compare the blobs. - assert_eq!( - seq, vec, - "Image mismatch for key {key} - keyspace={keyspace:?} lsn={lsn}" - ); - } - } - /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev. pub(crate) fn get_last_record_lsn(&self) -> Lsn { self.last_record_lsn.load().last @@ -3215,228 +2944,7 @@ impl Timeline { } } -type TraversalId = Arc; - -trait TraversalLayerExt { - fn traversal_id(&self) -> TraversalId; -} - -impl TraversalLayerExt for Layer { - fn traversal_id(&self) -> TraversalId { - Arc::clone(self.debug_str()) - } -} - -impl TraversalLayerExt for Arc { - fn traversal_id(&self) -> TraversalId { - Arc::clone(self.local_path_str()) - } -} - impl Timeline { - /// - /// Get a handle to a Layer for reading. - /// - /// The returned Layer might be from an ancestor timeline, if the - /// segment hasn't been updated on this timeline yet. - /// - /// This function takes the current timeline's locked LayerMap as an argument, - /// so callers can avoid potential race conditions. - /// - /// # Cancel-Safety - /// - /// This method is cancellation-safe. - async fn get_reconstruct_data( - &self, - key: Key, - request_lsn: Lsn, - reconstruct_state: &mut ValueReconstructState, - ctx: &RequestContext, - ) -> Result, PageReconstructError> { - // Start from the current timeline. - let mut timeline_owned; - let mut timeline = self; - - let mut read_count = scopeguard::guard(0, |cnt| { - crate::metrics::READ_NUM_LAYERS_VISITED.observe(cnt as f64) - }); - - // For debugging purposes, collect the path of layers that we traversed - // through. It's included in the error message if we fail to find the key. 
- let mut traversal_path = Vec::::new(); - - let cached_lsn = if let Some((cached_lsn, _)) = &reconstruct_state.img { - *cached_lsn - } else { - Lsn(0) - }; - - // 'prev_lsn' tracks the last LSN that we were at in our search. It's used - // to check that each iteration make some progress, to break infinite - // looping if something goes wrong. - let mut prev_lsn = None; - - let mut result = ValueReconstructResult::Continue; - let mut cont_lsn = Lsn(request_lsn.0 + 1); - - 'outer: loop { - if self.cancel.is_cancelled() { - return Err(PageReconstructError::Cancelled); - } - - // The function should have updated 'state' - //info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn); - match result { - ValueReconstructResult::Complete => return Ok(traversal_path), - ValueReconstructResult::Continue => { - // If we reached an earlier cached page image, we're done. - if cont_lsn == cached_lsn + 1 { - return Ok(traversal_path); - } - if let Some(prev) = prev_lsn { - if prev <= cont_lsn { - // Didn't make any progress in last iteration. Error out to avoid - // getting stuck in the loop. - return Err(PageReconstructError::MissingKey(MissingKeyError { - key, - shard: self.shard_identity.get_shard_number(&key), - cont_lsn: Lsn(cont_lsn.0 - 1), - request_lsn, - ancestor_lsn: Some(timeline.ancestor_lsn), - traversal_path, - backtrace: None, - })); - } - } - prev_lsn = Some(cont_lsn); - } - ValueReconstructResult::Missing => { - return Err(PageReconstructError::MissingKey(MissingKeyError { - key, - shard: self.shard_identity.get_shard_number(&key), - cont_lsn, - request_lsn, - ancestor_lsn: None, - traversal_path, - backtrace: if cfg!(test) { - Some(std::backtrace::Backtrace::force_capture()) - } else { - None - }, - })); - } - } - - // Recurse into ancestor if needed - if let Some(ancestor_timeline) = timeline.ancestor_timeline.as_ref() { - if key.is_inherited_key() && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn { - trace!( - "going into ancestor {}, cont_lsn is {}", - timeline.ancestor_lsn, - cont_lsn - ); - - timeline_owned = timeline - .get_ready_ancestor_timeline(ancestor_timeline, ctx) - .await?; - timeline = &*timeline_owned; - prev_lsn = None; - continue 'outer; - } - } - - let guard = timeline.layers.read().await; - let layers = guard.layer_map(); - - // Check the open and frozen in-memory layers first, in order from newest - // to oldest. - if let Some(open_layer) = &layers.open_layer { - let start_lsn = open_layer.get_lsn_range().start; - if cont_lsn > start_lsn { - //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.layer_name().display()); - // Get all the data needed to reconstruct the page version from this layer. - // But if we have an older cached page image, no need to go past that. 
- let lsn_floor = max(cached_lsn + 1, start_lsn); - - let open_layer = open_layer.clone(); - drop(guard); - - result = match open_layer - .get_value_reconstruct_data( - key, - lsn_floor..cont_lsn, - reconstruct_state, - ctx, - ) - .await - { - Ok(result) => result, - Err(e) => return Err(PageReconstructError::from(e)), - }; - cont_lsn = lsn_floor; - *read_count += 1; - traversal_path.push((result, cont_lsn, open_layer.traversal_id())); - continue 'outer; - } - } - for frozen_layer in layers.frozen_layers.iter().rev() { - let start_lsn = frozen_layer.get_lsn_range().start; - if cont_lsn > start_lsn { - //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.layer_name().display()); - let lsn_floor = max(cached_lsn + 1, start_lsn); - - let frozen_layer = frozen_layer.clone(); - drop(guard); - - result = match frozen_layer - .get_value_reconstruct_data( - key, - lsn_floor..cont_lsn, - reconstruct_state, - ctx, - ) - .await - { - Ok(result) => result, - Err(e) => return Err(PageReconstructError::from(e)), - }; - cont_lsn = lsn_floor; - *read_count += 1; - traversal_path.push((result, cont_lsn, frozen_layer.traversal_id())); - continue 'outer; - } - } - - if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) { - let layer = guard.get_from_desc(&layer); - drop(guard); - // Get all the data needed to reconstruct the page version from this layer. - // But if we have an older cached page image, no need to go past that. - let lsn_floor = max(cached_lsn + 1, lsn_floor); - result = match layer - .get_value_reconstruct_data(key, lsn_floor..cont_lsn, reconstruct_state, ctx) - .await - { - Ok(result) => result, - Err(e) => return Err(PageReconstructError::from(e)), - }; - cont_lsn = lsn_floor; - *read_count += 1; - traversal_path.push((result, cont_lsn, layer.traversal_id())); - continue 'outer; - } else if timeline.ancestor_timeline.is_some() { - // Nothing on this timeline. Traverse to parent - result = ValueReconstructResult::Continue; - cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1); - continue 'outer; - } else { - // Nothing found - result = ValueReconstructResult::Missing; - continue 'outer; - } - } - } - #[allow(clippy::doc_lazy_continuation)] /// Get the data needed to reconstruct all keys in the provided keyspace /// @@ -3530,7 +3038,6 @@ impl Timeline { cont_lsn, request_lsn, ancestor_lsn: Some(timeline.ancestor_lsn), - traversal_path: vec![], backtrace: None, })); } @@ -5895,8 +5402,6 @@ impl Timeline { } } -type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId); - /// Tracking writes ingestion does to a particular in-memory layer. /// /// Cleared upon freezing a layer. From dc7eb5ae5a58b0f9bc4e88a93c8cd582df3d1e1d Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 6 Aug 2024 13:45:41 +0300 Subject: [PATCH 1343/1571] chore: bump index part version (#8611) #8600 missed the hunk changing index_part.json informative version. Include it in this PR, in addition add more non-warning index_part.json versions to scrubber. --- pageserver/src/tenant/remote_timeline_client/index.rs | 5 +++-- storage_scrubber/src/checks.rs | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 8e6290030d..90453b1922 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -88,10 +88,11 @@ impl IndexPart { /// - 6: last_aux_file_policy is added. 
/// - 7: metadata_bytes is no longer written, but still read /// - 8: added `archived_at` - const LATEST_VERSION: usize = 8; + /// - 9: +gc_blocking + const LATEST_VERSION: usize = 9; // Versions we may see when reading from a bucket. - pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8]; + pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9]; pub const FILE_NAME: &'static str = "index_part.json"; diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index 5aa9e88c40..14788515dd 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -92,7 +92,7 @@ pub(crate) async fn branch_cleanup_and_check_errors( .push(format!("index_part.json version: {}", index_part.version())) } - let mut newest_versions = IndexPart::KNOWN_VERSIONS.iter().rev().take(2); + let mut newest_versions = IndexPart::KNOWN_VERSIONS.iter().rev().take(3); if !newest_versions.any(|ip| ip == &index_part.version()) { info!( "index_part.json version is not latest: {}", From a31c95cb40646302624625518f6fd7fedb6c8795 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 6 Aug 2024 12:51:39 +0200 Subject: [PATCH 1344/1571] storage_scrubber: migrate scan_safekeeper_metadata to remote_storage (#8595) Migrates the safekeeper-specific parts of `ScanMetadata` to GenericRemoteStorage, making it Azure-ready. Part of https://github.com/neondatabase/neon/issues/7547 --- storage_scrubber/src/metadata_stream.rs | 32 ++++++++++++++++++- .../src/scan_safekeeper_metadata.rs | 20 ++++++------ 2 files changed, 42 insertions(+), 10 deletions(-) diff --git a/storage_scrubber/src/metadata_stream.rs b/storage_scrubber/src/metadata_stream.rs index c702c0c312..54812ffc94 100644 --- a/storage_scrubber/src/metadata_stream.rs +++ b/storage_scrubber/src/metadata_stream.rs @@ -4,7 +4,7 @@ use anyhow::{anyhow, Context}; use async_stream::{stream, try_stream}; use aws_sdk_s3::{types::ObjectIdentifier, Client}; use futures::StreamExt; -use remote_storage::{GenericRemoteStorage, ListingMode}; +use remote_storage::{GenericRemoteStorage, ListingMode, ListingObject, RemotePath}; use tokio_stream::Stream; use crate::{ @@ -276,3 +276,33 @@ pub(crate) fn stream_listing<'a>( } } } + +pub(crate) fn stream_listing_generic<'a>( + remote_client: &'a GenericRemoteStorage, + target: &'a S3Target, +) -> impl Stream)>> + 'a { + let listing_mode = if target.delimiter.is_empty() { + ListingMode::NoDelimiter + } else { + ListingMode::WithDelimiter + }; + try_stream! 
{ + let mut objects_stream = std::pin::pin!(stream_objects_with_retries( + remote_client, + listing_mode, + target, + )); + while let Some(list) = objects_stream.next().await { + let list = list?; + if target.delimiter.is_empty() { + for key in list.keys { + yield (key.key.clone(), Some(key)); + } + } else { + for key in list.prefixes { + yield (key, None); + } + } + } + } +} diff --git a/storage_scrubber/src/scan_safekeeper_metadata.rs b/storage_scrubber/src/scan_safekeeper_metadata.rs index 553adf8f46..08a4541c5c 100644 --- a/storage_scrubber/src/scan_safekeeper_metadata.rs +++ b/storage_scrubber/src/scan_safekeeper_metadata.rs @@ -1,10 +1,10 @@ use std::{collections::HashSet, str::FromStr, sync::Arc}; -use aws_sdk_s3::Client; use futures::stream::{StreamExt, TryStreamExt}; use once_cell::sync::OnceCell; use pageserver_api::shard::TenantShardId; use postgres_ffi::{XLogFileName, PG_TLI}; +use remote_storage::GenericRemoteStorage; use serde::Serialize; use tokio_postgres::types::PgLsn; use tracing::{error, info, trace}; @@ -14,8 +14,9 @@ use utils::{ }; use crate::{ - cloud_admin_api::CloudAdminApiClient, init_remote, metadata_stream::stream_listing, - BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId, + cloud_admin_api::CloudAdminApiClient, init_remote_generic, + metadata_stream::stream_listing_generic, BucketConfig, ConsoleConfig, NodeKind, RootTarget, + TenantShardTimelineId, }; /// Generally we should ask safekeepers, but so far we use everywhere default 16MB. @@ -106,7 +107,7 @@ pub async fn scan_safekeeper_metadata( let timelines = client.query(&query, &[]).await?; info!("loaded {} timelines", timelines.len()); - let (s3_client, target) = init_remote(bucket_config, NodeKind::Safekeeper).await?; + let (remote_client, target) = init_remote_generic(bucket_config, NodeKind::Safekeeper).await?; let console_config = ConsoleConfig::from_env()?; let cloud_admin_api_client = CloudAdminApiClient::new(console_config); @@ -119,7 +120,7 @@ pub async fn scan_safekeeper_metadata( let backup_lsn: Lsn = Lsn(u64::from(backup_lsn_pg)); let ttid = TenantTimelineId::new(tenant_id, timeline_id); check_timeline( - &s3_client, + &remote_client, &target, &cloud_admin_api_client, ttid, @@ -156,7 +157,7 @@ struct TimelineCheckResult { /// errors are logged to stderr; returns Ok(true) if timeline is consistent, /// Ok(false) if not, Err if failed to check. async fn check_timeline( - s3_client: &Client, + remote_client: &GenericRemoteStorage, root: &RootTarget, api_client: &CloudAdminApiClient, ttid: TenantTimelineId, @@ -187,12 +188,13 @@ async fn check_timeline( // we need files, so unset it. timeline_dir_target.delimiter = String::new(); - let mut stream = std::pin::pin!(stream_listing(s3_client, &timeline_dir_target)); + let mut stream = std::pin::pin!(stream_listing_generic(remote_client, &timeline_dir_target)); while let Some(obj) = stream.next().await { - let obj = obj?; - let key = obj.key(); + let (key, _obj) = obj?; let seg_name = key + .get_path() + .as_str() .strip_prefix(&timeline_dir_target.prefix_in_bucket) .expect("failed to extract segment name"); expected_segfiles.remove(seg_name); From 68bcbf822799e190deedb1480379e8a390929975 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 6 Aug 2024 11:52:48 +0300 Subject: [PATCH 1345/1571] Add package-mode=false to poetry. We don't use it for packaging, and 'poetry install' will soon error otherwise. Also remove name and version fields as these are not required for non-packaging mode. 
--- pyproject.toml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 0d5782ac7c..36a1e24ca1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,8 +1,7 @@ [tool.poetry] -name = "neon" -version = "0.1.0" description = "" authors = [] +package-mode = false [tool.poetry.dependencies] python = "^3.9" From 078f941dc820c6388b6b2c6e462db7f67551770b Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Tue, 6 Aug 2024 13:08:55 +0200 Subject: [PATCH 1346/1571] Add a test using Debezium as a client for the logical replication (#8568) ## Problem We need to test the logical replication with some external consumers. ## Summary of changes A test of the logical replication with Debezium as a consumer was added. --------- Co-authored-by: Alexander Bayandin --- .github/workflows/pg-clients.yml | 26 ++- poetry.lock | 16 +- pyproject.toml | 2 + test_runner/fixtures/utils.py | 7 +- test_runner/logical_repl/README.md | 22 ++ .../clickhouse/docker-compose.yml | 9 + .../logical_repl/debezium/docker-compose.yml | 24 +++ .../{test_log_repl.py => test_clickhouse.py} | 16 +- test_runner/logical_repl/test_debezium.py | 189 ++++++++++++++++++ 9 files changed, 297 insertions(+), 14 deletions(-) create mode 100644 test_runner/logical_repl/README.md create mode 100644 test_runner/logical_repl/clickhouse/docker-compose.yml create mode 100644 test_runner/logical_repl/debezium/docker-compose.yml rename test_runner/logical_repl/{test_log_repl.py => test_clickhouse.py} (85%) create mode 100644 test_runner/logical_repl/test_debezium.py diff --git a/.github/workflows/pg-clients.yml b/.github/workflows/pg-clients.yml index 55b68ccdb5..23a2e3876c 100644 --- a/.github/workflows/pg-clients.yml +++ b/.github/workflows/pg-clients.yml @@ -66,7 +66,31 @@ jobs: ports: - 9000:9000 - 8123:8123 - + zookeeper: + image: quay.io/debezium/zookeeper:2.7 + ports: + - 2181:2181 + kafka: + image: quay.io/debezium/kafka:2.7 + env: + ZOOKEEPER_CONNECT: "zookeeper:2181" + KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092 + KAFKA_BROKER_ID: 1 + KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 + KAFKA_JMX_PORT: 9991 + ports: + - 9092:9092 + debezium: + image: quay.io/debezium/connect:2.7 + env: + BOOTSTRAP_SERVERS: kafka:9092 + GROUP_ID: 1 + CONFIG_STORAGE_TOPIC: debezium-config + OFFSET_STORAGE_TOPIC: debezium-offset + STATUS_STORAGE_TOPIC: debezium-status + DEBEZIUM_CONFIG_CONNECTOR_CLASS: io.debezium.connector.postgresql.PostgresConnector + ports: + - 8083:8083 steps: - uses: actions/checkout@v4 diff --git a/poetry.lock b/poetry.lock index d7a3dde65b..9026824558 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1514,6 +1514,20 @@ files = [ [package.dependencies] six = "*" +[[package]] +name = "kafka-python" +version = "2.0.2" +description = "Pure Python client for Apache Kafka" +optional = false +python-versions = "*" +files = [ + {file = "kafka-python-2.0.2.tar.gz", hash = "sha256:04dfe7fea2b63726cd6f3e79a2d86e709d608d74406638c5da33a01d45a9d7e3"}, + {file = "kafka_python-2.0.2-py2.py3-none-any.whl", hash = "sha256:2d92418c7cb1c298fa6c7f0fb3519b520d0d7526ac6cb7ae2a4fc65a51a94b6e"}, +] + +[package.extras] +crc32c = ["crc32c"] + [[package]] name = "lazy-object-proxy" version = "1.10.0" @@ -3357,4 +3371,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "7cee6a8c30bc7f4bfb0a87c6bad3952dfb4da127fad853d2710a93ac3eab8a00" +content-hash = "d569a3593b98baceb0a88e176bdad63cae99d6bfc2a81bf6741663a4abcafd72" diff --git 
a/pyproject.toml b/pyproject.toml index 36a1e24ca1..cfb569b2ba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,6 +41,7 @@ httpx = {extras = ["http2"], version = "^0.26.0"} pytest-repeat = "^0.9.3" websockets = "^12.0" clickhouse-connect = "^0.7.16" +kafka-python = "^2.0.2" [tool.poetry.group.dev.dependencies] mypy = "==1.3.0" @@ -74,6 +75,7 @@ module = [ "allure.*", "allure_commons.*", "allure_pytest.*", + "kafka.*", ] ignore_missing_imports = true diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 7f54eb0b0a..4dc9f7caae 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -389,7 +389,10 @@ WaitUntilRet = TypeVar("WaitUntilRet") def wait_until( - number_of_iterations: int, interval: float, func: Callable[[], WaitUntilRet] + number_of_iterations: int, + interval: float, + func: Callable[[], WaitUntilRet], + show_intermediate_error=False, ) -> WaitUntilRet: """ Wait until 'func' returns successfully, without exception. Returns the @@ -402,6 +405,8 @@ def wait_until( except Exception as e: log.info("waiting for %s iteration %s failed", func, i + 1) last_exception = e + if show_intermediate_error: + log.info(e) time.sleep(interval) continue return res diff --git a/test_runner/logical_repl/README.md b/test_runner/logical_repl/README.md new file mode 100644 index 0000000000..8eca056dda --- /dev/null +++ b/test_runner/logical_repl/README.md @@ -0,0 +1,22 @@ +# Logical replication tests + +## Clickhouse + +```bash +export BENCHMARK_CONNSTR=postgres://user:pass@ep-abc-xyz-123.us-east-2.aws.neon.build/neondb + +docker compose -f clickhouse/docker-compose.yml up -d +pytest -m remote_cluster -k test_clickhouse +docker compose -f clickhouse/docker-compose.yml down +``` + +## Debezium + +```bash +export BENCHMARK_CONNSTR=postgres://user:pass@ep-abc-xyz-123.us-east-2.aws.neon.build/neondb + +docker compose -f debezium/docker-compose.yml up -d +pytest -m remote_cluster -k test_debezium +docker compose -f debezium/docker-compose.yml down + +``` \ No newline at end of file diff --git a/test_runner/logical_repl/clickhouse/docker-compose.yml b/test_runner/logical_repl/clickhouse/docker-compose.yml new file mode 100644 index 0000000000..e00038b811 --- /dev/null +++ b/test_runner/logical_repl/clickhouse/docker-compose.yml @@ -0,0 +1,9 @@ +services: + clickhouse: + image: clickhouse/clickhouse-server + user: "101:101" + container_name: clickhouse + hostname: clickhouse + ports: + - 127.0.0.1:8123:8123 + - 127.0.0.1:9000:9000 diff --git a/test_runner/logical_repl/debezium/docker-compose.yml b/test_runner/logical_repl/debezium/docker-compose.yml new file mode 100644 index 0000000000..fee127a2fd --- /dev/null +++ b/test_runner/logical_repl/debezium/docker-compose.yml @@ -0,0 +1,24 @@ +services: + zookeeper: + image: quay.io/debezium/zookeeper:2.7 + kafka: + image: quay.io/debezium/kafka:2.7 + environment: + ZOOKEEPER_CONNECT: "zookeeper:2181" + KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092 + KAFKA_BROKER_ID: 1 + KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 + KAFKA_JMX_PORT: 9991 + ports: + - 127.0.0.1:9092:9092 + debezium: + image: quay.io/debezium/connect:2.7 + environment: + BOOTSTRAP_SERVERS: kafka:9092 + GROUP_ID: 1 + CONFIG_STORAGE_TOPIC: debezium-config + OFFSET_STORAGE_TOPIC: debezium-offset + STATUS_STORAGE_TOPIC: debezium-status + DEBEZIUM_CONFIG_CONNECTOR_CLASS: io.debezium.connector.postgresql.PostgresConnector + ports: + - 127.0.0.1:8083:8083 diff --git a/test_runner/logical_repl/test_log_repl.py 
b/test_runner/logical_repl/test_clickhouse.py similarity index 85% rename from test_runner/logical_repl/test_log_repl.py rename to test_runner/logical_repl/test_clickhouse.py index 0a1aecfe2b..c5ed9bc8af 100644 --- a/test_runner/logical_repl/test_log_repl.py +++ b/test_runner/logical_repl/test_clickhouse.py @@ -1,8 +1,9 @@ """ -Test the logical replication in Neon with the different consumers +Test the logical replication in Neon with ClickHouse as a consumer """ import hashlib +import os import time import clickhouse_connect @@ -39,22 +40,15 @@ def test_clickhouse(remote_pg: RemotePostgres): """ Test the logical replication having ClickHouse as a client """ + clickhouse_host = "clickhouse" if ("CI" in os.environ) else "127.0.0.1" conn_options = remote_pg.conn_options() - for _ in range(5): - try: - conn = psycopg2.connect(remote_pg.connstr()) - except psycopg2.OperationalError as perr: - log.debug(perr) - time.sleep(1) - else: - break - raise TimeoutError + conn = psycopg2.connect(remote_pg.connstr()) cur = conn.cursor() cur.execute("DROP TABLE IF EXISTS table1") cur.execute("CREATE TABLE table1 (id integer primary key, column1 varchar(10));") cur.execute("INSERT INTO table1 (id, column1) VALUES (1, 'abc'), (2, 'def');") conn.commit() - client = clickhouse_connect.get_client(host="clickhouse") + client = clickhouse_connect.get_client(host=clickhouse_host) client.command("SET allow_experimental_database_materialized_postgresql=1") client.command( "CREATE DATABASE db1_postgres ENGINE = " diff --git a/test_runner/logical_repl/test_debezium.py b/test_runner/logical_repl/test_debezium.py new file mode 100644 index 0000000000..700b731418 --- /dev/null +++ b/test_runner/logical_repl/test_debezium.py @@ -0,0 +1,189 @@ +""" +Test the logical replication in Neon with Debezium as a consumer +""" + +import json +import os +import time + +import psycopg2 +import pytest +import requests +from fixtures.log_helper import log +from fixtures.neon_fixtures import RemotePostgres +from fixtures.utils import wait_until +from kafka import KafkaConsumer + + +class DebeziumAPI: + """ + The class for Debezium API calls + """ + + def __init__(self): + self.__host = "debezium" if ("CI" in os.environ) else "127.0.0.1" + self.__base_url = f"http://{self.__host}:8083" + self.__connectors_url = f"{self.__base_url}/connectors" + + def __request(self, method, addurl="", **kwargs): + return requests.request( + method, + self.__connectors_url + addurl, + headers={"Accept": "application/json", "Content-type": "application/json"}, + timeout=60, + **kwargs, + ) + + def create_pg_connector(self, remote_pg: RemotePostgres, dbz_conn_name: str): + """ + Create a Postgres connector in debezium + """ + conn_options = remote_pg.conn_options() + payload = { + "name": dbz_conn_name, + "config": { + "connector.class": "io.debezium.connector.postgresql.PostgresConnector", + "tasks.max": "1", + "database.hostname": conn_options["host"], + "database.port": "5432", + "database.user": conn_options["user"], + "database.password": conn_options["password"], + "database.dbname": conn_options["dbname"], + "plugin.name": "pgoutput", + "topic.prefix": "dbserver1", + "schema.include.list": "inventory", + }, + } + return self.__request("POST", json=payload) + + def list_connectors(self): + """ + Returns a list of all connectors existent in Debezium. 
+ """ + resp = self.__request("GET") + assert resp.ok + return json.loads(resp.text) + + def del_connector(self, connector): + """ + Deletes the specified connector + """ + return self.__request("DELETE", f"/{connector}") + + +@pytest.fixture(scope="function") +def debezium(remote_pg: RemotePostgres): + """ + Prepare the Debezium API handler, connection + """ + conn = psycopg2.connect(remote_pg.connstr()) + cur = conn.cursor() + cur.execute("DROP SCHEMA IF EXISTS inventory CASCADE") + cur.execute("CREATE SCHEMA inventory") + cur.execute( + "CREATE TABLE inventory.customers (" + "id SERIAL NOT NULL PRIMARY KEY," + "first_name character varying(255) NOT NULL," + "last_name character varying(255) NOT NULL," + "email character varying(255) NOT NULL)" + ) + conn.commit() + dbz = DebeziumAPI() + assert len(dbz.list_connectors()) == 0 + dbz_conn_name = "inventory-connector" + resp = dbz.create_pg_connector(remote_pg, dbz_conn_name) + log.debug("%s %s %s", resp.status_code, resp.ok, resp.text) + assert resp.status_code == 201 + assert len(dbz.list_connectors()) == 1 + consumer = KafkaConsumer( + "dbserver1.inventory.customers", + bootstrap_servers=["kafka:9092"], + auto_offset_reset="earliest", + enable_auto_commit=False, + ) + yield conn, consumer + resp = dbz.del_connector(dbz_conn_name) + assert resp.status_code == 204 + + +def get_kafka_msg(consumer, ts_ms, before=None, after=None) -> None: + """ + Gets the message from Kafka and checks its validity + Arguments: + consumer: the consumer object + ts_ms: timestamp in milliseconds of the change of db, the corresponding message must have + the later timestamp + before: a dictionary, if not None, the before field from the kafka message must + have the same values for the same keys + after: a dictionary, if not None, the after field from the kafka message must + have the same values for the same keys + """ + msg = consumer.poll() + assert msg, "Empty message" + for val in msg.values(): + r = json.loads(val[-1].value) + log.info(r["payload"]) + assert ts_ms < r["payload"]["ts_ms"], "Incorrect timestamp" + for param, pname in ((before, "before"), (after, "after")): + if param is not None: + for k, v in param.items(): + assert r["payload"][pname][k] == v, f"{pname} mismatches" + + +@pytest.mark.remote_cluster +def test_debezium(debezium): + """ + Test the logical replication having Debezium as a subscriber + """ + conn, consumer = debezium + cur = conn.cursor() + ts_ms = time.time() * 1000 + log.info("Insert 1 ts_ms: %s", ts_ms) + cur.execute( + "insert into inventory.customers (first_name, last_name, email) " + "values ('John', 'Dow','johndow@example.com')" + ) + conn.commit() + wait_until( + 100, + 0.5, + lambda: get_kafka_msg( + consumer, + ts_ms, + after={"first_name": "John", "last_name": "Dow", "email": "johndow@example.com"}, + ), + show_intermediate_error=True, + ) + ts_ms = time.time() * 1000 + log.info("Insert 2 ts_ms: %s", ts_ms) + cur.execute( + "insert into inventory.customers (first_name, last_name, email) " + "values ('Alex', 'Row','alexrow@example.com')" + ) + conn.commit() + wait_until( + 100, + 0.5, + lambda: get_kafka_msg( + consumer, + ts_ms, + after={"first_name": "Alex", "last_name": "Row", "email": "alexrow@example.com"}, + ), + show_intermediate_error=True, + ) + ts_ms = time.time() * 1000 + log.info("Update ts_ms: %s", ts_ms) + cur.execute("update inventory.customers set first_name = 'Alexander' where id = 2") + conn.commit() + wait_until( + 100, + 0.5, + lambda: get_kafka_msg( + consumer, + ts_ms, + after={"first_name": 
"Alexander"}, + ), + show_intermediate_error=True, + ) + time.sleep(3) + cur.execute("select 1") From 16c91ff5d38bd768e884a834a605cacaa148cc89 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Tue, 6 Aug 2024 13:56:42 +0200 Subject: [PATCH 1347/1571] enable rum test (#8380) ## Problem We need to test the rum extension automatically as a path of the GitHub workflow ## Summary of changes rum test is enabled --- Dockerfile.compute-node | 6 ++++-- docker-compose/docker_compose_test.sh | 2 +- docker-compose/run-tests.sh | 10 +++++----- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 5e53a55316..054d44e0ec 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -933,7 +933,8 @@ COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src #COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src COPY --from=hypopg-pg-build /hypopg.tar.gz /ext-src COPY --from=pg-hashids-pg-build /pg_hashids.tar.gz /ext-src -#COPY --from=rum-pg-build /rum.tar.gz /ext-src +COPY --from=rum-pg-build /rum.tar.gz /ext-src +COPY patches/rum.patch /ext-src #COPY --from=pgtap-pg-build /pgtap.tar.gz /ext-src COPY --from=ip4r-pg-build /ip4r.tar.gz /ext-src COPY --from=prefix-pg-build /prefix.tar.gz /ext-src @@ -945,7 +946,7 @@ COPY patches/pg_hintplan.patch /ext-src COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src COPY patches/pg_cron.patch /ext-src #COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src -COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src +#COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src COPY --from=pg-uuidv7-pg-build /pg_uuidv7.tar.gz /ext-src COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src @@ -960,6 +961,7 @@ RUN cd /ext-src/ && for f in *.tar.gz; \ rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \ || exit 1; rm -f $f; done RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch +RUN cd /ext-src/rum-src && patch -p1 <../rum.patch # cmake is required for the h3 test RUN apt-get update && apt-get install -y cmake RUN patch -p1 < /ext-src/pg_hintplan.patch diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index a00591afd0..10805a9952 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -78,7 +78,7 @@ for pg_version in 14 15 16; do docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/ rm -rf $TMPDIR # We are running tests now - if docker exec -e SKIP=rum-src,timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \ + if docker exec -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \ $TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt then cleanup diff --git a/docker-compose/run-tests.sh b/docker-compose/run-tests.sh index c05fc159aa..58b2581197 100644 --- a/docker-compose/run-tests.sh +++ b/docker-compose/run-tests.sh @@ -1,15 +1,15 @@ #!/bin/bash set -x -cd /ext-src +cd /ext-src || exit 2 FAILED= -LIST=$((echo ${SKIP} | sed 's/,/\n/g'; ls -d *-src) | sort | uniq -u) +LIST=$( (echo "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u) for d in ${LIST} do - [ -d ${d} ] || continue + [ -d "${d}" ] || continue psql -c "select 1" >/dev/null || break - make -C ${d} 
installcheck || FAILED="${d} ${FAILED}" + USE_PGXS=1 make -C "${d}" installcheck || FAILED="${d} ${FAILED}" done [ -z "${FAILED}" ] && exit 0 -echo ${FAILED} +echo "${FAILED}" exit 1 \ No newline at end of file From b7beaa0fd7a549634af04069d97dc7ef2d1aa7d1 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 6 Aug 2024 12:58:33 +0100 Subject: [PATCH 1348/1571] tests: improve stability of `test_storage_controller_many_tenants` (#8607) ## Problem The controller scale test does random migrations. These mutate secondary locations, and therefore can cause secondary optimizations to happen in the background, violating the test's expectation that consistency_check will work as there are no reconciliations running. Example: https://neon-github-public-dev.s3.amazonaws.com/reports/main/10247161379/index.html#suites/07874de07c4a1c9effe0d92da7755ebf/6316beacd3fb3060/ ## Summary of changes - Only migrate to existing secondary locations, not randomly picked nodes, so that we can do a fast reconcile_until_idle (otherwise reconcile_until_idle is takes a long time to create new secondary locations). - Do a reconcile_until_idle before consistency_check. --- .../performance/test_storage_controller_scale.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index 281c9271e9..04785f7184 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -217,7 +217,11 @@ def test_storage_controller_many_tenants( # A reconciler operation: migrate a shard. shard_number = rng.randint(0, shard_count - 1) tenant_shard_id = TenantShardId(tenant_id, shard_number, shard_count) - dest_ps_id = rng.choice([ps.id for ps in env.pageservers]) + + # Migrate it to its secondary location + desc = env.storage_controller.tenant_describe(tenant_id) + dest_ps_id = desc["shards"][shard_number]["node_secondary"][0] + f = executor.submit( env.storage_controller.tenant_shard_migrate, tenant_shard_id, dest_ps_id ) @@ -231,7 +235,11 @@ def test_storage_controller_many_tenants( for f in futs: f.result() - # Consistency check is safe here: all the previous operations waited for reconcile before completing + # Some of the operations above (notably migrations) might leave the controller in a state where it has + # some work to do, for example optimizing shard placement after we do a random migration. Wait for the system + # to reach a quiescent state before doing following checks. + env.storage_controller.reconcile_until_idle() + env.storage_controller.consistency_check() check_memory() From 42229aacf60831443d9ec5e2342db34a143f9f1d Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 6 Aug 2024 14:47:01 +0100 Subject: [PATCH 1349/1571] pageserver: fixes for layer visibility metric (#8603) ## Problem In staging, we could see that occasionally tenants were wrapping their pageserver_visible_physical_size metric past zero to 2^64. This is harmless right now, but will matter more later when we start using visible size in things like the /utilization endpoint. ## Summary of changes - Add debug asserts that detect this case. `test_gc_of_remote_layers` works as a reproducer for this issue once the asserts are added. - Tighten up the interface around access_stats so that only Layer can mutate it. 
- In Layer, wrap calls to `record_access` in code that will update the visible size statistic if the access implicitly marks the layer visible (this was what caused the bug) - In LayerManager::rewrite_layers, use the proper set_visibility layer function instead of directly using access_stats (this is an additional path where metrics could go bad.) - Removed unused instances of LayerAccessStats in DeltaLayer and ImageLayer which I noticed while reviewing the code paths that call record_access. --- pageserver/src/tenant/storage_layer.rs | 14 +++++--- .../src/tenant/storage_layer/delta_layer.rs | 8 +---- .../src/tenant/storage_layer/image_layer.rs | 4 --- pageserver/src/tenant/storage_layer/layer.rs | 35 ++++++++++++++++--- pageserver/src/tenant/timeline.rs | 4 +-- .../src/tenant/timeline/eviction_task.rs | 2 +- .../src/tenant/timeline/layer_manager.rs | 7 ++-- 7 files changed, 46 insertions(+), 28 deletions(-) diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index ab32a6035e..04f89db401 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -539,19 +539,25 @@ impl LayerAccessStats { self.record_residence_event_at(SystemTime::now()) } - pub(crate) fn record_access_at(&self, now: SystemTime) { + fn record_access_at(&self, now: SystemTime) -> bool { let (mut mask, mut value) = Self::to_low_res_timestamp(Self::ATIME_SHIFT, now); // A layer which is accessed must be visible. mask |= 0x1 << Self::VISIBILITY_SHIFT; value |= 0x1 << Self::VISIBILITY_SHIFT; - self.write_bits(mask, value); + let old_bits = self.write_bits(mask, value); + !matches!( + self.decode_visibility(old_bits), + LayerVisibilityHint::Visible + ) } - pub(crate) fn record_access(&self, ctx: &RequestContext) { + /// Returns true if we modified the layer's visibility to set it to Visible implicitly + /// as a result of this access + pub(crate) fn record_access(&self, ctx: &RequestContext) -> bool { if ctx.access_stats_behavior() == AccessStatsBehavior::Skip { - return; + return false; } self.record_access_at(SystemTime::now()) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index a17dd28547..962faa6796 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -72,10 +72,7 @@ use utils::{ lsn::Lsn, }; -use super::{ - AsLayerDesc, LayerAccessStats, LayerName, PersistentLayerDesc, ResidentLayer, - ValuesReconstructState, -}; +use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ResidentLayer, ValuesReconstructState}; /// /// Header stored in the beginning of the file @@ -200,7 +197,6 @@ impl DeltaKey { pub struct DeltaLayer { path: Utf8PathBuf, pub desc: PersistentLayerDesc, - access_stats: LayerAccessStats, inner: OnceCell>, } @@ -299,7 +295,6 @@ impl DeltaLayer { /// not loaded already. 
/// async fn load(&self, ctx: &RequestContext) -> Result<&Arc> { - self.access_stats.record_access(ctx); // Quick exit if already loaded self.inner .get_or_try_init(|| self.load_inner(ctx)) @@ -350,7 +345,6 @@ impl DeltaLayer { summary.lsn_range, metadata.len(), ), - access_stats: Default::default(), inner: OnceCell::new(), }) } diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index b2173455ab..16ba0fda94 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -32,7 +32,6 @@ use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader}; use crate::tenant::disk_btree::{ DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection, }; -use crate::tenant::storage_layer::LayerAccessStats; use crate::tenant::timeline::GetVectoredError; use crate::tenant::vectored_blob_io::{ BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, @@ -135,7 +134,6 @@ pub struct ImageLayer { pub desc: PersistentLayerDesc, // This entry contains an image of all pages as of this LSN, should be the same as desc.lsn pub lsn: Lsn, - access_stats: LayerAccessStats, inner: OnceCell, } @@ -253,7 +251,6 @@ impl ImageLayer { /// not loaded already. /// async fn load(&self, ctx: &RequestContext) -> Result<&ImageLayerInner> { - self.access_stats.record_access(ctx); self.inner .get_or_try_init(|| self.load_inner(ctx)) .await @@ -304,7 +301,6 @@ impl ImageLayer { metadata.len(), ), // Now we assume image layer ALWAYS covers the full range. This may change in the future. lsn: summary.lsn, - access_stats: Default::default(), inner: OnceCell::new(), }) } diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index cee2fe7342..83450d24bb 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -316,7 +316,7 @@ impl Layer { other => GetVectoredError::Other(anyhow::anyhow!(other)), })?; - self.0.access_stats.record_access(ctx); + self.record_access(ctx); layer .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx) @@ -396,8 +396,12 @@ impl Layer { self.0.info(reset) } - pub(crate) fn access_stats(&self) -> &LayerAccessStats { - &self.0.access_stats + pub(crate) fn latest_activity(&self) -> SystemTime { + self.0.access_stats.latest_activity() + } + + pub(crate) fn visibility(&self) -> LayerVisibilityHint { + self.0.access_stats.visibility() } pub(crate) fn local_path(&self) -> &Utf8Path { @@ -447,13 +451,31 @@ impl Layer { } } + fn record_access(&self, ctx: &RequestContext) { + if self.0.access_stats.record_access(ctx) { + // Visibility was modified to Visible + tracing::info!( + "Layer {} became visible as a result of access", + self.0.desc.key() + ); + if let Some(tl) = self.0.timeline.upgrade() { + tl.metrics + .visible_physical_size_gauge + .add(self.0.desc.file_size) + } + } + } + pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) { - let old_visibility = self.access_stats().set_visibility(visibility.clone()); + let old_visibility = self.0.access_stats.set_visibility(visibility.clone()); use LayerVisibilityHint::*; match (old_visibility, visibility) { (Visible, Covered) => { // Subtract this layer's contribution to the visible size metric if let Some(tl) = self.0.timeline.upgrade() { + debug_assert!( + tl.metrics.visible_physical_size_gauge.get() >= self.0.desc.file_size + ); tl.metrics 
.visible_physical_size_gauge .sub(self.0.desc.file_size) @@ -671,6 +693,9 @@ impl Drop for LayerInner { } if matches!(self.access_stats.visibility(), LayerVisibilityHint::Visible) { + debug_assert!( + timeline.metrics.visible_physical_size_gauge.get() >= self.desc.file_size + ); timeline .metrics .visible_physical_size_gauge @@ -1810,7 +1835,7 @@ impl ResidentLayer { // this is valid because the DownloadedLayer::kind is a OnceCell, not a // Mutex, so we cannot go and deinitialize the value with OnceCell::take // while it's being held. - owner.access_stats.record_access(ctx); + self.owner.record_access(ctx); delta_layer::DeltaLayerInner::load_keys(d, ctx) .await diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 5a02fd4a4c..6c67fb9cb6 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2920,7 +2920,7 @@ impl Timeline { let guard = self.layers.read().await; let resident = guard.likely_resident_layers().map(|layer| { - let last_activity_ts = layer.access_stats().latest_activity(); + let last_activity_ts = layer.latest_activity(); HeatMapLayer::new( layer.layer_desc().layer_name(), @@ -5182,7 +5182,7 @@ impl Timeline { let file_size = layer.layer_desc().file_size; max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size))); - let last_activity_ts = layer.access_stats().latest_activity(); + let last_activity_ts = layer.latest_activity(); EvictionCandidate { layer: layer.into(), diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index fec66aabc1..1ba1bf9de5 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -225,7 +225,7 @@ impl Timeline { continue; } - let last_activity_ts = layer.access_stats().latest_activity(); + let last_activity_ts = layer.latest_activity(); let no_activity_for = match now.duration_since(last_activity_ts) { Ok(d) => d, diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 1bc2acbd34..e6e7bc2e77 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -259,13 +259,10 @@ impl LayerManager { new_layer.layer_desc().lsn_range ); - // Transfer visibilty hint from old to new layer, since the new layer covers the same key space. This is not guaranteed to + // Transfer visibility hint from old to new layer, since the new layer covers the same key space. This is not guaranteed to // be accurate (as the new layer may cover a different subset of the key range), but is a sensible default, and prevents // always marking rewritten layers as visible. - new_layer - .as_ref() - .access_stats() - .set_visibility(old_layer.access_stats().visibility()); + new_layer.as_ref().set_visibility(old_layer.visibility()); // Safety: we may never rewrite the same file in-place. Callers are responsible // for ensuring that they only rewrite layers after something changes the path, From 3727c6fbbedc7df73bdbef7766708819071724e3 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 6 Aug 2024 17:15:40 +0100 Subject: [PATCH 1350/1571] pageserver: use layer visibility when composing heatmap (#8616) ## Problem Sometimes, a layer is Covered by hasn't yet been evicted from local disk (e.g. shortly after image layer generation). It is not good use of resources to download these to a secondary location, as there's a good chance they will never be read. 
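In short, the heatmap should only advertise layers a secondary location might actually need. A minimal sketch of the inclusion rule, assuming the crate's `LayerVisibilityHint` enum is in scope (illustrative only; the real filtering lives in the `timeline.rs` heatmap hunk below):

```rust
// Illustrative only, not part of the patch: the rule this change applies when
// deciding whether a resident layer belongs in the heatmap.
fn include_in_heatmap(visibility: LayerVisibilityHint) -> bool {
    match visibility {
        // Visible to one or more read LSNs: worth keeping warm on secondaries.
        LayerVisibilityHint::Visible => true,
        // Covered by newer image layers: unlikely to ever be read, so a
        // secondary download would be wasted bandwidth and disk.
        LayerVisibilityHint::Covered => false,
    }
}
```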
This follows the previous change that added layer visibility: - #8511 Part of epic: - https://github.com/neondatabase/neon/issues/8398 ## Summary of changes - When generating heatmaps, only include Visible layers - Update test_secondary_downloads to filter to visible layers when listing layers from an attached location --- pageserver/src/tenant/timeline.rs | 26 ++++++---- test_runner/fixtures/pageserver/http.py | 2 + .../regress/test_pageserver_secondary.py | 52 +++++++++++++++---- 3 files changed, 60 insertions(+), 20 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 6c67fb9cb6..4ff87f20f1 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -137,7 +137,7 @@ use self::layer_manager::LayerManager; use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; -use super::{config::TenantConf, upload_queue::NotInitialized}; +use super::{config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized}; use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; use super::{ @@ -2919,14 +2919,22 @@ impl Timeline { let guard = self.layers.read().await; - let resident = guard.likely_resident_layers().map(|layer| { - let last_activity_ts = layer.latest_activity(); - - HeatMapLayer::new( - layer.layer_desc().layer_name(), - layer.metadata(), - last_activity_ts, - ) + let resident = guard.likely_resident_layers().filter_map(|layer| { + match layer.visibility() { + LayerVisibilityHint::Visible => { + // Layer is visible to one or more read LSNs: elegible for inclusion in layer map + let last_activity_ts = layer.latest_activity(); + Some(HeatMapLayer::new( + layer.layer_desc().layer_name(), + layer.metadata(), + last_activity_ts, + )) + } + LayerVisibilityHint::Covered => { + // Layer is resident but unlikely to be read: not elegible for inclusion in heatmap. 
+ None + } + } }); let layers = resident.collect(); diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 61e2204b23..5be59d3749 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -61,6 +61,7 @@ class HistoricLayerInfo: remote: bool # None for image layers, true if pageserver thinks this is an L0 delta layer l0: Optional[bool] + visible: bool @classmethod def from_json(cls, d: Dict[str, Any]) -> HistoricLayerInfo: @@ -79,6 +80,7 @@ class HistoricLayerInfo: lsn_end=d.get("lsn_end"), remote=d["remote"], l0=l0_ness, + visible=d["access_stats"]["visible"], ) diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 53f69b5b26..4b0af24480 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -2,10 +2,11 @@ import json import os import random import time -from typing import Any, Dict, Optional +from pathlib import Path +from typing import Any, Dict, Optional, Union import pytest -from fixtures.common_types import TenantId, TimelineId +from fixtures.common_types import TenantId, TenantShardId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver from fixtures.pageserver.common_types import parse_layer_file_name @@ -437,6 +438,35 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder): validate_heatmap(heatmap_second) +def list_elegible_layers( + pageserver, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId +) -> list[Path]: + """ + The subset of layer filenames that are elegible for secondary download: at time of writing this + is all resident layers which are also visible. + """ + candidates = pageserver.list_layers(tenant_id, timeline_id) + + layer_map = pageserver.http_client().layer_map_info(tenant_id, timeline_id) + + # Map of layer filenames to their visibility the "layer name" is not the same as the filename: add suffix to resolve one to the other + visible_map = dict( + (f"{layer.layer_file_name}-v1-00000001", layer.visible) + for layer in layer_map.historic_layers + ) + + def is_visible(layer_file_name): + try: + return visible_map[str(layer_file_name)] + except KeyError: + # Unexpected: tests should call this when pageservers are in a quiet state such that the layer map + # matches what's on disk. 
+ log.warn(f"Lookup {layer_file_name} from {list(visible_map.keys())}") + raise + + return list(c for c in candidates if is_visible(c)) + + def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): """ Test the overall data flow in secondary mode: @@ -491,7 +521,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ps_secondary.http_client().tenant_secondary_download(tenant_id) - assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers( + assert list_elegible_layers(ps_attached, tenant_id, timeline_id) == ps_secondary.list_layers( tenant_id, timeline_id ) @@ -509,9 +539,9 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ps_secondary.http_client().tenant_secondary_download(tenant_id) try: - assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers( - tenant_id, timeline_id - ) + assert list_elegible_layers( + ps_attached, tenant_id, timeline_id + ) == ps_secondary.list_layers(tenant_id, timeline_id) except: # Do a full listing of the secondary location on errors, to help debug of # https://github.com/neondatabase/neon/issues/6966 @@ -532,8 +562,8 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): # ================================================================== try: log.info("Evicting a layer...") - layer_to_evict = ps_attached.list_layers(tenant_id, timeline_id)[0] - some_other_layer = ps_attached.list_layers(tenant_id, timeline_id)[1] + layer_to_evict = list_elegible_layers(ps_attached, tenant_id, timeline_id)[0] + some_other_layer = list_elegible_layers(ps_attached, tenant_id, timeline_id)[1] log.info(f"Victim layer: {layer_to_evict.name}") ps_attached.http_client().evict_layer( tenant_id, timeline_id, layer_name=layer_to_evict.name @@ -551,9 +581,9 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ps_secondary.http_client().tenant_secondary_download(tenant_id) assert layer_to_evict not in ps_attached.list_layers(tenant_id, timeline_id) - assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers( - tenant_id, timeline_id - ) + assert list_elegible_layers( + ps_attached, tenant_id, timeline_id + ) == ps_secondary.list_layers(tenant_id, timeline_id) except: # On assertion failures, log some details to help with debugging heatmap = env.pageserver_remote_storage.heatmap_content(tenant_id) From ca5390a89d8ae4b485c3471ccdac5910a86079dd Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 6 Aug 2024 17:39:40 +0100 Subject: [PATCH 1351/1571] pageserver: add `bench_ingest` (#7409) ## Problem We lack a rust bench for the inmemory layer and delta layer write paths: it is useful to benchmark these components independent of postgres & WAL decoding. Related: https://github.com/neondatabase/neon/issues/8452 ## Summary of changes - Refactor DeltaLayerWriter to avoid carrying a Timeline, so that it can be cleanly tested + benched without a Tenant/Timeline test harness. It only needed the Timeline for building `Layer`, so this can be done in a separate step. - Add `bench_ingest`, which exercises a variety of workload "shapes" (big values, small values, sequential keys, random keys) - Include a small uncontroversial optimization: in `freeze`, only exhaustively walk values to assert ordering relative to end_lsn in debug mode. These benches are limited by drive performance on a lot of machines, but still useful as a local tool for iterating on CPU/memory improvements around this code path. 
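To make the first bullet concrete: `DeltaLayerWriter::finish` now stops at the file level, returning the layer descriptor plus the temporary path, and it is up to the caller to turn that pair into a `Layer` once a `Timeline` is actually available. A rough sketch of the new calling convention (the wrapper function is hypothetical; the names match the hunks below and assume the usual in-crate imports):

```rust
use std::sync::Arc;

use camino::Utf8PathBuf;

// Hypothetical helper, not part of the patch: glue the two steps back together
// for callers that do have a Timeline at hand.
async fn finish_delta_layer(
    writer: DeltaLayerWriter,
    key_end: Key,
    conf: &'static PageServerConf,
    timeline: &Arc<Timeline>,
    ctx: &RequestContext,
) -> anyhow::Result<ResidentLayer> {
    // Step 1: pure file I/O (write the index, fsync); no Timeline needed, which
    // is what lets bench_ingest exercise this path without a Tenant/Timeline harness.
    let (desc, path): (PersistentLayerDesc, Utf8PathBuf) = writer.finish(key_end, ctx).await?;
    // Step 2: register the finished file as a Layer on the timeline.
    Layer::finish_creating(conf, timeline, desc, &path)
}
```

This split is also why the bench can call `write_to_disk`, get back a `(desc, path)` pair, and simply delete the file afterwards without ever constructing a `Layer`.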
Anecdotal measurements on Hetzner AX102 (Ryzen 7950xd): ``` ingest-small-values/ingest 128MB/100b seq time: [1.1160 s 1.1230 s 1.1289 s] thrpt: [113.38 MiB/s 113.98 MiB/s 114.70 MiB/s] Found 1 outliers among 10 measurements (10.00%) 1 (10.00%) low mild Benchmarking ingest-small-values/ingest 128MB/100b rand: Warming up for 3.0000 s Warning: Unable to complete 10 samples in 10.0s. You may wish to increase target time to 18.9s. ingest-small-values/ingest 128MB/100b rand time: [1.9001 s 1.9056 s 1.9110 s] thrpt: [66.982 MiB/s 67.171 MiB/s 67.365 MiB/s] Benchmarking ingest-small-values/ingest 128MB/100b rand-1024keys: Warming up for 3.0000 s Warning: Unable to complete 10 samples in 10.0s. You may wish to increase target time to 11.0s. ingest-small-values/ingest 128MB/100b rand-1024keys time: [1.0715 s 1.0828 s 1.0937 s] thrpt: [117.04 MiB/s 118.21 MiB/s 119.46 MiB/s] ingest-small-values/ingest 128MB/100b seq, no delta time: [425.49 ms 429.07 ms 432.04 ms] thrpt: [296.27 MiB/s 298.32 MiB/s 300.83 MiB/s] Found 1 outliers among 10 measurements (10.00%) 1 (10.00%) low mild ingest-big-values/ingest 128MB/8k seq time: [373.03 ms 375.84 ms 379.17 ms] thrpt: [337.58 MiB/s 340.57 MiB/s 343.13 MiB/s] Found 1 outliers among 10 measurements (10.00%) 1 (10.00%) high mild ingest-big-values/ingest 128MB/8k seq, no delta time: [81.534 ms 82.811 ms 83.364 ms] thrpt: [1.4994 GiB/s 1.5095 GiB/s 1.5331 GiB/s] Found 1 outliers among 10 measurements (10.00%) ``` --- pageserver/Cargo.toml | 4 + pageserver/benches/bench_ingest.rs | 235 ++++++++++++++++++ pageserver/src/l0_flush.rs | 4 +- .../src/tenant/storage_layer/delta_layer.rs | 45 ++-- .../tenant/storage_layer/inmemory_layer.rs | 37 +-- .../src/tenant/storage_layer/split_writer.rs | 11 +- pageserver/src/tenant/timeline.rs | 11 +- pageserver/src/tenant/timeline/compaction.rs | 44 ++-- .../src/tenant/timeline/detach_ancestor.rs | 6 +- 9 files changed, 322 insertions(+), 75 deletions(-) create mode 100644 pageserver/benches/bench_ingest.rs diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 43976250a4..0e748ee3db 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -108,3 +108,7 @@ harness = false [[bench]] name = "bench_walredo" harness = false + +[[bench]] +name = "bench_ingest" +harness = false diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs new file mode 100644 index 0000000000..af2b6934c6 --- /dev/null +++ b/pageserver/benches/bench_ingest.rs @@ -0,0 +1,235 @@ +use std::{env, num::NonZeroUsize}; + +use bytes::Bytes; +use camino::Utf8PathBuf; +use criterion::{criterion_group, criterion_main, Criterion}; +use pageserver::{ + config::PageServerConf, + context::{DownloadBehavior, RequestContext}, + l0_flush::{L0FlushConfig, L0FlushGlobalState}, + page_cache, + repository::Value, + task_mgr::TaskKind, + tenant::storage_layer::InMemoryLayer, + virtual_file::{self, api::IoEngineKind}, +}; +use pageserver_api::{key::Key, shard::TenantShardId}; +use utils::{ + bin_ser::BeSer, + id::{TenantId, TimelineId}, +}; + +// A very cheap hash for generating non-sequential keys. 
+fn murmurhash32(mut h: u32) -> u32 { + h ^= h >> 16; + h = h.wrapping_mul(0x85ebca6b); + h ^= h >> 13; + h = h.wrapping_mul(0xc2b2ae35); + h ^= h >> 16; + h +} + +enum KeyLayout { + /// Sequential unique keys + Sequential, + /// Random unique keys + Random, + /// Random keys, but only use the bits from the mask of them + RandomReuse(u32), +} + +enum WriteDelta { + Yes, + No, +} + +async fn ingest( + conf: &'static PageServerConf, + put_size: usize, + put_count: usize, + key_layout: KeyLayout, + write_delta: WriteDelta, +) -> anyhow::Result<()> { + let mut lsn = utils::lsn::Lsn(1000); + let mut key = Key::from_i128(0x0); + + let timeline_id = TimelineId::generate(); + let tenant_id = TenantId::generate(); + let tenant_shard_id = TenantShardId::unsharded(tenant_id); + + tokio::fs::create_dir_all(conf.timeline_path(&tenant_shard_id, &timeline_id)).await?; + + let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); + + let layer = InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, &ctx).await?; + + let data = Value::Image(Bytes::from(vec![0u8; put_size])).ser()?; + let ctx = RequestContext::new( + pageserver::task_mgr::TaskKind::WalReceiverConnectionHandler, + pageserver::context::DownloadBehavior::Download, + ); + + for i in 0..put_count { + lsn += put_size as u64; + + // Generate lots of keys within a single relation, which simulates the typical bulk ingest case: people + // usually care the most about write performance when they're blasting a huge batch of data into a huge table. + match key_layout { + KeyLayout::Sequential => { + // Use sequential order to illustrate the experience a user is likely to have + // when ingesting bulk data. + key.field6 = i as u32; + } + KeyLayout::Random => { + // Use random-order keys to avoid giving a false advantage to data structures that are + // faster when inserting on the end. + key.field6 = murmurhash32(i as u32); + } + KeyLayout::RandomReuse(mask) => { + // Use low bits only, to limit cardinality + key.field6 = murmurhash32(i as u32) & mask; + } + } + + layer.put_value(key, lsn, &data, &ctx).await?; + } + layer.freeze(lsn + 1).await; + + if matches!(write_delta, WriteDelta::Yes) { + let l0_flush_state = L0FlushGlobalState::new(L0FlushConfig::Direct { + max_concurrency: NonZeroUsize::new(1).unwrap(), + }); + let (_desc, path) = layer + .write_to_disk(&ctx, None, l0_flush_state.inner()) + .await? + .unwrap(); + tokio::fs::remove_file(path).await?; + } + + Ok(()) +} + +/// Wrapper to instantiate a tokio runtime +fn ingest_main( + conf: &'static PageServerConf, + put_size: usize, + put_count: usize, + key_layout: KeyLayout, + write_delta: WriteDelta, +) { + let runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + + runtime.block_on(async move { + let r = ingest(conf, put_size, put_count, key_layout, write_delta).await; + if let Err(e) = r { + panic!("{e:?}"); + } + }); +} + +/// Declare a series of benchmarks for the Pageserver's ingest write path. +/// +/// This benchmark does not include WAL decode: it starts at InMemoryLayer::put_value, and ends either +/// at freezing the ephemeral layer, or writing the ephemeral layer out to an L0 (depending on whether WriteDelta is set). +/// +/// Genuine disk I/O is used, so expect results to differ depending on storage. However, when running on +/// a fast disk, CPU is the bottleneck at time of writing. 
+fn criterion_benchmark(c: &mut Criterion) { + let temp_dir_parent: Utf8PathBuf = env::current_dir().unwrap().try_into().unwrap(); + let temp_dir = camino_tempfile::tempdir_in(temp_dir_parent).unwrap(); + eprintln!("Data directory: {}", temp_dir.path()); + + let conf: &'static PageServerConf = Box::leak(Box::new( + pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()), + )); + virtual_file::init(16384, IoEngineKind::TokioEpollUring); + page_cache::init(conf.page_cache_size); + + { + let mut group = c.benchmark_group("ingest-small-values"); + let put_size = 100usize; + let put_count = 128 * 1024 * 1024 / put_size; + group.throughput(criterion::Throughput::Bytes((put_size * put_count) as u64)); + group.sample_size(10); + group.bench_function("ingest 128MB/100b seq", |b| { + b.iter(|| { + ingest_main( + conf, + put_size, + put_count, + KeyLayout::Sequential, + WriteDelta::Yes, + ) + }) + }); + group.bench_function("ingest 128MB/100b rand", |b| { + b.iter(|| { + ingest_main( + conf, + put_size, + put_count, + KeyLayout::Random, + WriteDelta::Yes, + ) + }) + }); + group.bench_function("ingest 128MB/100b rand-1024keys", |b| { + b.iter(|| { + ingest_main( + conf, + put_size, + put_count, + KeyLayout::RandomReuse(0x3ff), + WriteDelta::Yes, + ) + }) + }); + group.bench_function("ingest 128MB/100b seq, no delta", |b| { + b.iter(|| { + ingest_main( + conf, + put_size, + put_count, + KeyLayout::Sequential, + WriteDelta::No, + ) + }) + }); + } + + { + let mut group = c.benchmark_group("ingest-big-values"); + let put_size = 8192usize; + let put_count = 128 * 1024 * 1024 / put_size; + group.throughput(criterion::Throughput::Bytes((put_size * put_count) as u64)); + group.sample_size(10); + group.bench_function("ingest 128MB/8k seq", |b| { + b.iter(|| { + ingest_main( + conf, + put_size, + put_count, + KeyLayout::Sequential, + WriteDelta::Yes, + ) + }) + }); + group.bench_function("ingest 128MB/8k seq, no delta", |b| { + b.iter(|| { + ingest_main( + conf, + put_size, + put_count, + KeyLayout::Sequential, + WriteDelta::No, + ) + }) + }); + } +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/pageserver/src/l0_flush.rs b/pageserver/src/l0_flush.rs index 8945e5accd..10187f2ba3 100644 --- a/pageserver/src/l0_flush.rs +++ b/pageserver/src/l0_flush.rs @@ -24,7 +24,7 @@ impl Default for L0FlushConfig { #[derive(Clone)] pub struct L0FlushGlobalState(Arc); -pub(crate) enum Inner { +pub enum Inner { PageCached, Direct { semaphore: tokio::sync::Semaphore }, } @@ -40,7 +40,7 @@ impl L0FlushGlobalState { } } - pub(crate) fn inner(&self) -> &Arc { + pub fn inner(&self) -> &Arc { &self.0 } } diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 962faa6796..bff8f7cb24 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -36,13 +36,12 @@ use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, Fi use crate::tenant::disk_btree::{ DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection, }; -use crate::tenant::storage_layer::Layer; use crate::tenant::timeline::GetVectoredError; use crate::tenant::vectored_blob_io::{ BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadPlanner, }; -use crate::tenant::{PageReconstructError, Timeline}; +use crate::tenant::PageReconstructError; use crate::virtual_file::{self, VirtualFile}; use 
crate::{walrecord, TEMP_FILE_SUFFIX}; use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; @@ -72,7 +71,7 @@ use utils::{ lsn::Lsn, }; -use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ResidentLayer, ValuesReconstructState}; +use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ValuesReconstructState}; /// /// Header stored in the beginning of the file @@ -367,7 +366,6 @@ impl DeltaLayer { /// 3. Call `finish`. /// struct DeltaLayerWriterInner { - conf: &'static PageServerConf, pub path: Utf8PathBuf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, @@ -414,7 +412,6 @@ impl DeltaLayerWriterInner { let tree_builder = DiskBtreeBuilder::new(block_buf); Ok(Self { - conf, path, timeline_id, tenant_shard_id, @@ -489,11 +486,10 @@ impl DeltaLayerWriterInner { async fn finish( self, key_end: Key, - timeline: &Arc, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { let temp_path = self.path.clone(); - let result = self.finish0(key_end, timeline, ctx).await; + let result = self.finish0(key_end, ctx).await; if result.is_err() { tracing::info!(%temp_path, "cleaning up temporary file after error during writing"); if let Err(e) = std::fs::remove_file(&temp_path) { @@ -506,9 +502,8 @@ impl DeltaLayerWriterInner { async fn finish0( self, key_end: Key, - timeline: &Arc, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { let index_start_blk = ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; @@ -573,11 +568,9 @@ impl DeltaLayerWriterInner { // fsync the file file.sync_all().await?; - let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?; + trace!("created delta layer {}", self.path); - trace!("created delta layer {}", layer.local_path()); - - Ok(layer) + Ok((desc, self.path)) } } @@ -678,14 +671,9 @@ impl DeltaLayerWriter { pub(crate) async fn finish( mut self, key_end: Key, - timeline: &Arc, ctx: &RequestContext, - ) -> anyhow::Result { - self.inner - .take() - .unwrap() - .finish(key_end, timeline, ctx) - .await + ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { + self.inner.take().unwrap().finish(key_end, ctx).await } #[cfg(test)] @@ -1592,8 +1580,9 @@ pub(crate) mod test { use super::*; use crate::repository::Value; use crate::tenant::harness::TIMELINE_ID; + use crate::tenant::storage_layer::{Layer, ResidentLayer}; use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner; - use crate::tenant::Tenant; + use crate::tenant::{Tenant, Timeline}; use crate::{ context::DownloadBehavior, task_mgr::TaskKind, @@ -1887,9 +1876,8 @@ pub(crate) mod test { res?; } - let resident = writer - .finish(entries_meta.key_range.end, &timeline, &ctx) - .await?; + let (desc, path) = writer.finish(entries_meta.key_range.end, &ctx).await?; + let resident = Layer::finish_creating(harness.conf, &timeline, desc, &path)?; let inner = resident.get_as_delta(&ctx).await?; @@ -2078,7 +2066,8 @@ pub(crate) mod test { .await .unwrap(); - let copied_layer = writer.finish(Key::MAX, &branch, ctx).await.unwrap(); + let (desc, path) = writer.finish(Key::MAX, ctx).await.unwrap(); + let copied_layer = Layer::finish_creating(tenant.conf, &branch, desc, &path).unwrap(); copied_layer.get_as_delta(ctx).await.unwrap(); @@ -2206,7 +2195,9 @@ pub(crate) mod test { for (key, lsn, value) in deltas { writer.put_value(key, lsn, value, ctx).await?; } - let delta_layer = writer.finish(key_end, tline, ctx).await?; + + let (desc, path) = writer.finish(key_end, 
ctx).await?; + let delta_layer = Layer::finish_creating(tenant.conf, tline, desc, &path)?; Ok::<_, anyhow::Error>(delta_layer) } diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 6abc89c2ed..f118f3d8d8 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -11,9 +11,10 @@ use crate::repository::{Key, Value}; use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef}; use crate::tenant::ephemeral_file::EphemeralFile; use crate::tenant::timeline::GetVectoredError; -use crate::tenant::{PageReconstructError, Timeline}; +use crate::tenant::PageReconstructError; use crate::{l0_flush, page_cache, walrecord}; use anyhow::{anyhow, Result}; +use camino::Utf8PathBuf; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; @@ -32,7 +33,9 @@ use std::sync::atomic::Ordering as AtomicOrdering; use std::sync::atomic::{AtomicU64, AtomicUsize}; use tokio::sync::{RwLock, RwLockWriteGuard}; -use super::{DeltaLayerWriter, ResidentLayer, ValueReconstructSituation, ValuesReconstructState}; +use super::{ + DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState, +}; #[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)] pub(crate) struct InMemoryLayerFileId(page_cache::FileId); @@ -410,8 +413,7 @@ impl InMemoryLayer { /// Common subroutine of the public put_wal_record() and put_page_image() functions. /// Adds the page version to the in-memory tree - - pub(crate) async fn put_value( + pub async fn put_value( &self, key: Key, lsn: Lsn, @@ -476,8 +478,6 @@ impl InMemoryLayer { /// Records the end_lsn for non-dropped layers. /// `end_lsn` is exclusive pub async fn freeze(&self, end_lsn: Lsn) { - let inner = self.inner.write().await; - assert!( self.start_lsn < end_lsn, "{} >= {}", @@ -495,9 +495,13 @@ impl InMemoryLayer { }) .expect("frozen_local_path_str set only once"); - for vec_map in inner.index.values() { - for (lsn, _pos) in vec_map.as_slice() { - assert!(*lsn < end_lsn); + #[cfg(debug_assertions)] + { + let inner = self.inner.write().await; + for vec_map in inner.index.values() { + for (lsn, _pos) in vec_map.as_slice() { + assert!(*lsn < end_lsn); + } } } } @@ -507,12 +511,12 @@ impl InMemoryLayer { /// if there are no matching keys. /// /// Returns a new delta layer with all the same data as this in-memory layer - pub(crate) async fn write_to_disk( + pub async fn write_to_disk( &self, - timeline: &Arc, ctx: &RequestContext, key_range: Option>, - ) -> Result> { + l0_flush_global_state: &l0_flush::Inner, + ) -> Result> { // Grab the lock in read-mode. We hold it over the I/O, but because this // layer is not writeable anymore, no one should be trying to acquire the // write lock on it, so we shouldn't block anyone. There's one exception @@ -524,9 +528,8 @@ impl InMemoryLayer { // rare though, so we just accept the potential latency hit for now. let inner = self.inner.read().await; - let l0_flush_global_state = timeline.l0_flush_global_state.inner().clone(); use l0_flush::Inner; - let _concurrency_permit = match &*l0_flush_global_state { + let _concurrency_permit = match l0_flush_global_state { Inner::PageCached => None, Inner::Direct { semaphore, .. 
} => Some(semaphore.acquire().await), }; @@ -556,7 +559,7 @@ impl InMemoryLayer { ) .await?; - match &*l0_flush_global_state { + match l0_flush_global_state { l0_flush::Inner::PageCached => { let ctx = RequestContextBuilder::extend(ctx) .page_content_kind(PageContentKind::InMemoryLayer) @@ -621,7 +624,7 @@ impl InMemoryLayer { } // MAX is used here because we identify L0 layers by full key range - let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, ctx).await?; + let (desc, path) = delta_layer_writer.finish(Key::MAX, ctx).await?; // Hold the permit until all the IO is done, including the fsync in `delta_layer_writer.finish()``. // @@ -633,6 +636,6 @@ impl InMemoryLayer { // we dirtied when writing to the filesystem have been flushed and marked !dirty. drop(_concurrency_permit); - Ok(Some(delta_layer)) + Ok(Some((desc, path))) } } diff --git a/pageserver/src/tenant/storage_layer/split_writer.rs b/pageserver/src/tenant/storage_layer/split_writer.rs index a966775f9e..d7bfe48c60 100644 --- a/pageserver/src/tenant/storage_layer/split_writer.rs +++ b/pageserver/src/tenant/storage_layer/split_writer.rs @@ -4,6 +4,7 @@ use bytes::Bytes; use pageserver_api::key::{Key, KEY_SIZE}; use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId}; +use crate::tenant::storage_layer::Layer; use crate::{config::PageServerConf, context::RequestContext, repository::Value, tenant::Timeline}; use super::{DeltaLayerWriter, ImageLayerWriter, ResidentLayer}; @@ -173,8 +174,9 @@ impl SplitDeltaLayerWriter { ) .await?; let prev_delta_writer = std::mem::replace(&mut self.inner, next_delta_writer); - self.generated_layers - .push(prev_delta_writer.finish(key, tline, ctx).await?); + let (desc, path) = prev_delta_writer.finish(key, ctx).await?; + let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?; + self.generated_layers.push(delta_layer); } self.inner.put_value(key, lsn, val, ctx).await } @@ -190,7 +192,10 @@ impl SplitDeltaLayerWriter { inner, .. } = self; - generated_layers.push(inner.finish(end_key, tline, ctx).await?); + + let (desc, path) = inner.finish(end_key, ctx).await?; + let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?; + generated_layers.push(delta_layer); Ok(generated_layers) } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 4ff87f20f1..a05e4e0712 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3709,12 +3709,14 @@ impl Timeline { let frozen_layer = Arc::clone(frozen_layer); let ctx = ctx.attached_child(); let work = async move { - let Some(new_delta) = frozen_layer - .write_to_disk(&self_clone, &ctx, key_range) + let Some((desc, path)) = frozen_layer + .write_to_disk(&ctx, key_range, self_clone.l0_flush_global_state.inner()) .await? else { return Ok(None); }; + let new_delta = Layer::finish_creating(self_clone.conf, &self_clone, desc, &path)?; + // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes. // We just need to fsync the directory in which these inodes are linked, // which we know to be the timeline directory. 
@@ -5347,9 +5349,8 @@ impl Timeline { for (key, lsn, val) in deltas.data { delta_layer_writer.put_value(key, lsn, val, ctx).await?; } - let delta_layer = delta_layer_writer - .finish(deltas.key_range.end, self, ctx) - .await?; + let (desc, path) = delta_layer_writer.finish(deltas.key_range.end, ctx).await?; + let delta_layer = Layer::finish_creating(self.conf, self, desc, &path)?; { let mut guard = self.layers.write().await; diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 1ff029a313..276d7b4967 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1104,14 +1104,16 @@ impl Timeline { || contains_hole { // ... if so, flush previous layer and prepare to write new one - new_layers.push( - writer - .take() - .unwrap() - .finish(prev_key.unwrap().next(), self, ctx) - .await - .map_err(CompactionError::Other)?, - ); + let (desc, path) = writer + .take() + .unwrap() + .finish(prev_key.unwrap().next(), ctx) + .await + .map_err(CompactionError::Other)?; + let new_delta = Layer::finish_creating(self.conf, self, desc, &path) + .map_err(CompactionError::Other)?; + + new_layers.push(new_delta); writer = None; if contains_hole { @@ -1174,12 +1176,13 @@ impl Timeline { prev_key = Some(key); } if let Some(writer) = writer { - new_layers.push( - writer - .finish(prev_key.unwrap().next(), self, ctx) - .await - .map_err(CompactionError::Other)?, - ); + let (desc, path) = writer + .finish(prev_key.unwrap().next(), ctx) + .await + .map_err(CompactionError::Other)?; + let new_delta = Layer::finish_creating(self.conf, self, desc, &path) + .map_err(CompactionError::Other)?; + new_layers.push(new_delta); } // Sync layers @@ -1966,13 +1969,16 @@ impl Timeline { for (key, lsn, val) in deltas { delta_layer_writer.put_value(key, lsn, val, ctx).await?; } + stats.produce_delta_layer(delta_layer_writer.size()); if dry_run { return Ok(None); } - let delta_layer = delta_layer_writer - .finish(delta_key.key_range.end, tline, ctx) + + let (desc, path) = delta_layer_writer + .finish(delta_key.key_range.end, ctx) .await?; + let delta_layer = Layer::finish_creating(tline.conf, tline, desc, &path)?; Ok(Some(FlushDeltaResult::CreateResidentLayer(delta_layer))) } @@ -2413,9 +2419,9 @@ impl CompactionJobExecutor for TimelineAdaptor { )) }); - let new_delta_layer = writer - .finish(prev.unwrap().0.next(), &self.timeline, ctx) - .await?; + let (desc, path) = writer.finish(prev.unwrap().0.next(), ctx).await?; + let new_delta_layer = + Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)?; self.new_deltas.push(new_delta_layer); Ok(()) diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index ee5f8cd52a..645b5ad2bf 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -488,10 +488,12 @@ async fn copy_lsn_prefix( // reuse the key instead of adding more holes between layers by using the real // highest key in the layer. 
let reused_highest_key = layer.layer_desc().key_range.end; - let copied = writer - .finish(reused_highest_key, target_timeline, ctx) + let (desc, path) = writer + .finish(reused_highest_key, ctx) .await .map_err(CopyDeltaPrefix)?; + let copied = Layer::finish_creating(target_timeline.conf, target_timeline, desc, &path) + .map_err(CopyDeltaPrefix)?; tracing::debug!(%layer, %copied, "new layer produced"); From ed5724d79d01599e14a724501e83d4fd53bef21b Mon Sep 17 00:00:00 2001 From: Yuchen Liang <70461588+yliang412@users.noreply.github.com> Date: Tue, 6 Aug 2024 13:55:42 -0400 Subject: [PATCH 1352/1571] scrubber: clean up `scan_metadata` before prod (#8565) Part of #8128. ## Problem Currently, scrubber `scan_metadata` command will return with an error code if the metadata on remote storage is corrupted with fatal errors. To safely deploy this command in a cronjob, we want to differentiate between failures while running scrubber command and the erroneous metadata. At the same time, we also want our regression tests to catch corrupted metadata using the scrubber command. ## Summary of changes - Return with error code only when the scrubber command fails - Uses explicit checks on errors and warnings to determine metadata health in regression tests. **Resolve conflict with `tenant-snapshot` command (after shard split):** [`test_scrubber_tenant_snapshot`](https://github.com/neondatabase/neon/blob/yuchen/scrubber-scan-cleanup-before-prod/test_runner/regress/test_storage_scrubber.py#L23) failed before applying 422a8443ddb7f1c7a26907a96c4aed0c5d554e67 - When taking a snapshot, the old `index_part.json` in the unsharded tenant directory is not kept. - The current `list_timeline_blobs` implementation consider no `index_part.json` as a parse error. - During the scan, we are only analyzing shards with highest shard count, so we will not get a parse error. but we do need to add the layers to tenant object listing, otherwise we will get index is referencing a layer that is not in remote storage error. - **Action:** Add s3_layers from `list_timeline_blobs` regardless of parsing error Signed-off-by: Yuchen Liang --- storage_scrubber/src/checks.rs | 14 ++++++++---- storage_scrubber/src/main.rs | 10 ++++----- .../src/pageserver_physical_gc.rs | 14 ++++++++---- .../src/scan_pageserver_metadata.rs | 22 +++++++++++++------ storage_scrubber/src/tenant_snapshot.rs | 2 +- test_runner/fixtures/neon_fixtures.py | 14 +++++++++--- test_runner/regress/test_compatibility.py | 5 ++--- .../regress/test_pageserver_generations.py | 5 ++--- .../regress/test_pageserver_secondary.py | 3 ++- test_runner/regress/test_sharding.py | 3 ++- test_runner/regress/test_storage_scrubber.py | 11 +++++----- test_runner/regress/test_tenant_delete.py | 8 +++---- 12 files changed, 70 insertions(+), 41 deletions(-) diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index 14788515dd..35ec69fd50 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -172,8 +172,11 @@ pub(crate) async fn branch_cleanup_and_check_errors( } } BlobDataParseResult::Relic => {} - BlobDataParseResult::Incorrect(parse_errors) => result.errors.extend( - parse_errors + BlobDataParseResult::Incorrect { + errors, + s3_layers: _, + } => result.errors.extend( + errors .into_iter() .map(|error| format!("parse error: {error}")), ), @@ -300,7 +303,10 @@ pub(crate) enum BlobDataParseResult { }, /// The remains of a deleted Timeline (i.e. 
an initdb archive only) Relic, - Incorrect(Vec), + Incorrect { + errors: Vec, + s3_layers: HashSet<(LayerName, Generation)>, + }, } pub(crate) fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generation), String> { @@ -443,7 +449,7 @@ pub(crate) async fn list_timeline_blobs( } Ok(S3TimelineBlobData { - blob_data: BlobDataParseResult::Incorrect(errors), + blob_data: BlobDataParseResult::Incorrect { errors, s3_layers }, unused_index_keys: index_part_keys, unknown_keys, }) diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs index a111c31844..cbc836755a 100644 --- a/storage_scrubber/src/main.rs +++ b/storage_scrubber/src/main.rs @@ -208,21 +208,21 @@ async fn main() -> anyhow::Result<()> { } if summary.is_fatal() { - Err(anyhow::anyhow!("Fatal scrub errors detected")) + tracing::error!("Fatal scrub errors detected"); } else if summary.is_empty() { // Strictly speaking an empty bucket is a valid bucket, but if someone ran the // scrubber they were likely expecting to scan something, and if we see no timelines // at all then it's likely due to some configuration issues like a bad prefix - Err(anyhow::anyhow!( + tracing::error!( "No timelines found in bucket {} prefix {}", bucket_config.bucket, bucket_config .prefix_in_bucket .unwrap_or("".to_string()) - )) - } else { - Ok(()) + ); } + + Ok(()) } } } diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index 69896caa82..ff230feae3 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -389,10 +389,13 @@ async fn gc_ancestor( // Post-deletion tenant location: don't try and GC it. continue; } - BlobDataParseResult::Incorrect(reasons) => { + BlobDataParseResult::Incorrect { + errors, + s3_layers: _, // TODO(yuchen): could still check references to these s3 layers? + } => { // Our primary purpose isn't to report on bad data, but log this rather than skipping silently tracing::warn!( - "Skipping ancestor GC for timeline {ttid}, bad metadata: {reasons:?}" + "Skipping ancestor GC for timeline {ttid}, bad metadata: {errors:?}" ); continue; } @@ -518,9 +521,12 @@ pub async fn pageserver_physical_gc( // Post-deletion tenant location: don't try and GC it. 
return Ok(summary); } - BlobDataParseResult::Incorrect(reasons) => { + BlobDataParseResult::Incorrect { + errors, + s3_layers: _, + } => { // Our primary purpose isn't to report on bad data, but log this rather than skipping silently - tracing::warn!("Skipping timeline {ttid}, bad metadata: {reasons:?}"); + tracing::warn!("Skipping timeline {ttid}, bad metadata: {errors:?}"); return Ok(summary); } }; diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs index dc410bde41..b9630056e1 100644 --- a/storage_scrubber/src/scan_pageserver_metadata.rs +++ b/storage_scrubber/src/scan_pageserver_metadata.rs @@ -290,13 +290,21 @@ pub async fn scan_metadata( } } - if let BlobDataParseResult::Parsed { - index_part: _index_part, - index_part_generation: _index_part_generation, - s3_layers, - } = &data.blob_data - { - tenant_objects.push(ttid, s3_layers.clone()); + match &data.blob_data { + BlobDataParseResult::Parsed { + index_part: _index_part, + index_part_generation: _index_part_generation, + s3_layers, + } => { + tenant_objects.push(ttid, s3_layers.clone()); + } + BlobDataParseResult::Relic => (), + BlobDataParseResult::Incorrect { + errors: _, + s3_layers, + } => { + tenant_objects.push(ttid, s3_layers.clone()); + } } tenant_timeline_results.push((ttid, data)); } diff --git a/storage_scrubber/src/tenant_snapshot.rs b/storage_scrubber/src/tenant_snapshot.rs index 5a75f8d40e..1866e6ec80 100644 --- a/storage_scrubber/src/tenant_snapshot.rs +++ b/storage_scrubber/src/tenant_snapshot.rs @@ -269,7 +269,7 @@ impl SnapshotDownloader { .context("Downloading timeline")?; } BlobDataParseResult::Relic => {} - BlobDataParseResult::Incorrect(_) => { + BlobDataParseResult::Incorrect { .. } => { tracing::error!("Bad metadata in timeline {ttid}"); } }; diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 7289472de2..c6f4404784 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -978,7 +978,10 @@ class NeonEnvBuilder: and self.enable_scrub_on_exit ): try: - self.env.storage_scrubber.scan_metadata() + healthy, _ = self.env.storage_scrubber.scan_metadata() + if not healthy: + e = Exception("Remote storage metadata corrupted") + cleanup_error = e except Exception as e: log.error(f"Error during remote storage scrub: {e}") cleanup_error = e @@ -4411,14 +4414,19 @@ class StorageScrubber: assert stdout is not None return stdout - def scan_metadata(self, post_to_storage_controller: bool = False) -> Any: + def scan_metadata(self, post_to_storage_controller: bool = False) -> Tuple[bool, Any]: + """ + Returns the health status and the metadata summary. + """ args = ["scan-metadata", "--node-kind", "pageserver", "--json"] if post_to_storage_controller: args.append("--post") stdout = self.scrubber_cli(args, timeout=30) try: - return json.loads(stdout) + summary = json.loads(stdout) + healthy = not summary["with_errors"] and not summary["with_warnings"] + return healthy, summary except: log.error("Failed to decode JSON output from `scan-metadata`. 
Dumping stdout:") log.error(stdout) diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 137b0e931d..afa5f6873c 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -496,11 +496,10 @@ def test_historic_storage_formats( # Check the scrubber handles this old data correctly (can read it and doesn't consider it corrupt) # # Do this _before_ importing to the pageserver, as that import may start writing immediately - metadata_summary = env.storage_scrubber.scan_metadata() + healthy, metadata_summary = env.storage_scrubber.scan_metadata() + assert healthy assert metadata_summary["tenant_count"] >= 1 assert metadata_summary["timeline_count"] >= 1 - assert not metadata_summary["with_errors"] - assert not metadata_summary["with_warnings"] env.neon_cli.import_tenant(dataset.tenant_id) diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 8941ddd281..73af7950f1 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -214,12 +214,11 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): # Having written a mixture of generation-aware and legacy index_part.json, # ensure the scrubber handles the situation as expected. - metadata_summary = env.storage_scrubber.scan_metadata() + healthy, metadata_summary = env.storage_scrubber.scan_metadata() assert metadata_summary["tenant_count"] == 1 # Scrubber should have seen our timeline assert metadata_summary["timeline_count"] == 1 assert metadata_summary["timeline_shard_count"] == 1 - assert not metadata_summary["with_errors"] - assert not metadata_summary["with_warnings"] + assert healthy def test_deferred_deletion(neon_env_builder: NeonEnvBuilder): diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 4b0af24480..8746b88a75 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -593,7 +593,8 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): # Scrub the remote storage # ======================== # This confirms that the scrubber isn't upset by the presence of the heatmap - env.storage_scrubber.scan_metadata() + healthy, _ = env.storage_scrubber.scan_metadata() + assert healthy # Detach secondary and delete tenant # =================================== diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 7f30b2d7a7..1011a6fd22 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -124,7 +124,8 @@ def test_sharding_smoke( # Check the scrubber isn't confused by sharded content, then disable # it during teardown because we'll have deleted by then - env.storage_scrubber.scan_metadata() + healthy, _ = env.storage_scrubber.scan_metadata() + assert healthy env.storage_controller.pageserver_api().tenant_delete(tenant_id) assert_prefix_empty( diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index e3f627b6a6..388f6a9e92 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -516,9 +516,8 @@ def test_scrubber_scan_pageserver_metadata( assert len(index.layer_metadata) > 0 it = iter(index.layer_metadata.items()) - scan_summary = 
env.storage_scrubber.scan_metadata(post_to_storage_controller=True) - assert not scan_summary["with_warnings"] - assert not scan_summary["with_errors"] + healthy, scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True) + assert healthy assert env.storage_controller.metadata_health_is_healthy() @@ -532,16 +531,18 @@ def test_scrubber_scan_pageserver_metadata( log.info(f"delete response: {delete_response}") # Check scan summary without posting to storage controller. Expect it to be a L0 layer so only emit warnings. - scan_summary = env.storage_scrubber.scan_metadata() + _, scan_summary = env.storage_scrubber.scan_metadata() log.info(f"{pprint.pformat(scan_summary)}") assert len(scan_summary["with_warnings"]) > 0 assert env.storage_controller.metadata_health_is_healthy() # Now post to storage controller, expect seeing one unhealthy health record - scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True) + _, scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True) log.info(f"{pprint.pformat(scan_summary)}") assert len(scan_summary["with_warnings"]) > 0 unhealthy = env.storage_controller.metadata_health_list_unhealthy()["unhealthy_tenant_shards"] assert len(unhealthy) == 1 and unhealthy[0] == str(tenant_shard_id) + + neon_env_builder.disable_scrub_on_exit() diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index c343b349cf..c01b3a2e89 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -341,13 +341,13 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder) wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn) env.stop() - result = env.storage_scrubber.scan_metadata() - assert result["with_warnings"] == [] + healthy, _ = env.storage_scrubber.scan_metadata() + assert healthy env.start() ps_http = env.pageserver.http_client() ps_http.tenant_delete(tenant_id) env.stop() - env.storage_scrubber.scan_metadata() - assert result["with_warnings"] == [] + healthy, _ = env.storage_scrubber.scan_metadata() + assert healthy From c3f2240fbd32dae823ce6b2d47a308afe42719f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 7 Aug 2024 09:14:26 +0200 Subject: [PATCH 1353/1571] storage broker: only print one line for version and build tag in init (#8624) This makes it more consistent with pageserver and safekeeper. Also, it is easier to collect the two values into one data point. --- storage_broker/src/bin/storage_broker.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 0a4af543ab..15acd0e49c 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -642,8 +642,7 @@ async fn main() -> Result<(), Box> { logging::replace_panic_hook_with_tracing_panic_hook().forget(); // initialize sentry if SENTRY_DSN is provided let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); - info!("version: {GIT_VERSION}"); - info!("build_tag: {BUILD_TAG}"); + info!("version: {GIT_VERSION} build_tag: {BUILD_TAG}"); metrics::set_build_info_metric(GIT_VERSION, BUILD_TAG); // On any shutdown signal, log receival and exit. 
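For illustration only (not part of the patch above): a minimal, self-contained sketch of the single-line startup logging pattern the storage broker adopts in the preceding commit, matching pageserver and safekeeper. The constants and the `main` wrapper are placeholders assumed for the sketch; in the real binaries `GIT_VERSION` and `BUILD_TAG` come from build-time metadata.

```rust
use tracing::info;

// Placeholder values; the real binaries fill these in at build time.
const GIT_VERSION: &str = "git-env:0000000";
const BUILD_TAG: &str = "unknown";

fn main() {
    // Hypothetical subscriber setup so the example runs standalone
    // (requires the `tracing` and `tracing-subscriber` crates).
    tracing_subscriber::fmt::init();

    // One log line instead of two, so a log collector can capture both
    // values as a single data point.
    info!("version: {GIT_VERSION} build_tag: {BUILD_TAG}");
}
```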
From 00c981576a0b552c73115cba66cf3ca5907fdd90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 7 Aug 2024 09:29:52 +0200 Subject: [PATCH 1354/1571] Lower level for timeline cancellations during gc (#8626) Timeline cancellation running in parallel with gc yields error log lines like: ``` Gc failed 1 times, retrying in 2s: TimelineCancelled ``` They are completely harmless though and normal to occur. Therefore, only print those messages at an info level. Still print them at all so that we know what is going on if we focus on a single timeline. --- pageserver/src/tenant/tasks.rs | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 230362d81a..b4706ea59d 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -407,9 +407,16 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { error_run_count += 1; let wait_duration = Duration::from_secs_f64(wait_duration); - error!( - "Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}", - ); + if matches!(e, crate::tenant::GcError::TimelineCancelled) { + // Timeline was cancelled during gc. We might either be in an event + // that affects the entire tenant (tenant deletion, pageserver shutdown), + // or in one that affects the timeline only (timeline deletion). + // Therefore, don't exit the loop. + info!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}"); + } else { + error!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}"); + } + wait_duration } } From 4d7c0dac936aedb31bc4fbd6dca80708c701c17d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 7 Aug 2024 14:53:52 +0200 Subject: [PATCH 1355/1571] Add missing colon to ArchivalConfigRequest specification (#8627) Add a missing colon to the API specification of `ArchivalConfigRequest`. The `state` field is required. Pointed out by Gleb. --- pageserver/src/http/openapi_spec.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 4656f2c93a..42086dc2e6 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -932,7 +932,7 @@ components: description: Whether to poll remote storage for layers to download. If false, secondary locations don't download anything. ArchivalConfigRequest: type: object - required + required: - state properties: state: From ad0988f27856f8b80f86f808ad2dd4ec90aadac0 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 7 Aug 2024 14:37:03 +0100 Subject: [PATCH 1356/1571] proxy: random changes (#8602) ## Problem 1. Hard to correlate startup parameters with the endpoint that provided them. 2. Some configurations are not needed in the `ProxyConfig` struct. ## Summary of changes Because of some borrow checker fun, I needed to switch to an interior-mutability implementation of our `RequestMonitoring` context system. Using https://docs.rs/try-lock/latest/try_lock/ as a cheap lock for such a use-case (needed to be thread safe). Removed the lock of each startup message, instead just logging only the startup params in a successful handshake. Also removed from values from `ProxyConfig` and kept as arguments. 
(needed for local-proxy config) --- Cargo.lock | 5 +- Cargo.toml | 1 + proxy/Cargo.toml | 1 + proxy/src/auth/backend.rs | 40 ++--- proxy/src/auth/backend/classic.rs | 4 +- proxy/src/auth/backend/hacks.rs | 8 +- proxy/src/auth/backend/link.rs | 2 +- proxy/src/auth/credentials.rs | 60 ++++--- proxy/src/auth/flow.rs | 10 +- proxy/src/bin/pg_sni_router.rs | 6 +- proxy/src/bin/proxy.rs | 20 +-- proxy/src/cache/endpoints.rs | 2 +- proxy/src/compute.rs | 10 +- proxy/src/config.rs | 4 - proxy/src/console/provider.rs | 14 +- proxy/src/console/provider/mock.rs | 6 +- proxy/src/console/provider/neon.rs | 22 +-- proxy/src/context.rs | 241 +++++++++++++++++++------- proxy/src/context/parquet.rs | 6 +- proxy/src/metrics.rs | 31 +--- proxy/src/proxy.rs | 18 +- proxy/src/proxy/connect_compute.rs | 16 +- proxy/src/proxy/handshake.rs | 25 ++- proxy/src/proxy/tests.rs | 41 ++--- proxy/src/proxy/tests/mitm.rs | 11 +- proxy/src/proxy/wake_compute.rs | 6 +- proxy/src/serverless.rs | 4 +- proxy/src/serverless/backend.rs | 12 +- proxy/src/serverless/conn_pool.rs | 15 +- proxy/src/serverless/sql_over_http.rs | 17 +- proxy/src/serverless/websocket.rs | 4 +- 31 files changed, 386 insertions(+), 276 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 764c0fbd30..f565119dbd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4324,6 +4324,7 @@ dependencies = [ "tracing-opentelemetry", "tracing-subscriber", "tracing-utils", + "try-lock", "typed-json", "url", "urlencoding", @@ -6563,9 +6564,9 @@ dependencies = [ [[package]] name = "try-lock" -version = "0.2.4" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3528ecfd12c466c6f163363caf2d02a71161dd5e1cc6ae7b34207ea2d42d81ed" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" [[package]] name = "tungstenite" diff --git a/Cargo.toml b/Cargo.toml index af1c1dfc82..963841e340 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -184,6 +184,7 @@ tracing = "0.1" tracing-error = "0.2.0" tracing-opentelemetry = "0.21.0" tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] } +try-lock = "0.2.5" twox-hash = { version = "1.6.3", default-features = false } typed-json = "0.1" url = "2.2" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 2f18b5fbc6..b316c53034 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -92,6 +92,7 @@ tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true tracing-utils.workspace = true tracing.workspace = true +try-lock.workspace = true typed-json.workspace = true url.workspace = true urlencoding.workspace = true diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 67c4dd019e..90dea01bf3 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -218,7 +218,7 @@ impl RateBucketInfo { impl AuthenticationConfig { pub fn check_rate_limit( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, config: &AuthenticationConfig, secret: AuthSecret, endpoint: &EndpointId, @@ -243,7 +243,7 @@ impl AuthenticationConfig { let limit_not_exceeded = self.rate_limiter.check( ( endpoint_int, - MaskedIp::new(ctx.peer_addr, config.rate_limit_ip_subnet), + MaskedIp::new(ctx.peer_addr(), config.rate_limit_ip_subnet), ), password_weight, ); @@ -274,7 +274,7 @@ impl AuthenticationConfig { /// /// All authentication flows will emit an AuthenticationOk message if successful. 
async fn auth_quirks( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, api: &impl console::Api, user_info: ComputeUserInfoMaybeEndpoint, client: &mut stream::PqStream>, @@ -303,8 +303,8 @@ async fn auth_quirks( let (allowed_ips, maybe_secret) = api.get_allowed_ips_and_secret(ctx, &info).await?; // check allowed list - if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) { - return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr)); + if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) { + return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr())); } if !endpoint_rate_limiter.check(info.endpoint.clone().into(), 1) { @@ -356,7 +356,7 @@ async fn auth_quirks( } async fn authenticate_with_secret( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, secret: AuthSecret, info: ComputeUserInfo, client: &mut stream::PqStream>, @@ -421,7 +421,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { #[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)] pub async fn authenticate( self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, client: &mut stream::PqStream>, allow_cleartext: bool, config: &'static AuthenticationConfig, @@ -467,7 +467,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { impl BackendType<'_, ComputeUserInfo, &()> { pub async fn get_role_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, ) -> Result { use BackendType::*; match self { @@ -478,7 +478,7 @@ impl BackendType<'_, ComputeUserInfo, &()> { pub async fn get_allowed_ips_and_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { use BackendType::*; match self { @@ -492,7 +492,7 @@ impl BackendType<'_, ComputeUserInfo, &()> { impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> { async fn wake_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, ) -> Result { use BackendType::*; @@ -514,7 +514,7 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> { impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> { async fn wake_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, ) -> Result { use BackendType::*; @@ -571,7 +571,7 @@ mod tests { impl console::Api for Auth { async fn get_role_secret( &self, - _ctx: &mut RequestMonitoring, + _ctx: &RequestMonitoring, _user_info: &super::ComputeUserInfo, ) -> Result { Ok(CachedRoleSecret::new_uncached(Some(self.secret.clone()))) @@ -579,7 +579,7 @@ mod tests { async fn get_allowed_ips_and_secret( &self, - _ctx: &mut RequestMonitoring, + _ctx: &RequestMonitoring, _user_info: &super::ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), console::errors::GetAuthInfoError> { @@ -591,7 +591,7 @@ mod tests { async fn wake_compute( &self, - _ctx: &mut RequestMonitoring, + _ctx: &RequestMonitoring, _user_info: &super::ComputeUserInfo, ) -> Result { unimplemented!() @@ -665,7 +665,7 @@ mod tests { let (mut client, server) = tokio::io::duplex(1024); let mut stream = PqStream::new(Stream::from_raw(server)); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let api = Auth { ips: vec![], secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), @@ -723,7 +723,7 @@ mod tests { )); let _creds = auth_quirks( - &mut ctx, + &ctx, &api, user_info, &mut stream, @@ -742,7 +742,7 @@ mod tests { let (mut client, server) = 
tokio::io::duplex(1024); let mut stream = PqStream::new(Stream::from_raw(server)); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let api = Auth { ips: vec![], secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), @@ -775,7 +775,7 @@ mod tests { )); let _creds = auth_quirks( - &mut ctx, + &ctx, &api, user_info, &mut stream, @@ -794,7 +794,7 @@ mod tests { let (mut client, server) = tokio::io::duplex(1024); let mut stream = PqStream::new(Stream::from_raw(server)); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let api = Auth { ips: vec![], secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), @@ -828,7 +828,7 @@ mod tests { )); let creds = auth_quirks( - &mut ctx, + &ctx, &api, user_info, &mut stream, diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index b98fa63120..285fa29428 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -12,7 +12,7 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, warn}; pub(super) async fn authenticate( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, creds: ComputeUserInfo, client: &mut PqStream>, config: &'static AuthenticationConfig, @@ -27,7 +27,7 @@ pub(super) async fn authenticate( } AuthSecret::Scram(secret) => { info!("auth endpoint chooses SCRAM"); - let scram = auth::Scram(&secret, &mut *ctx); + let scram = auth::Scram(&secret, ctx); let auth_outcome = tokio::time::timeout( config.scram_protocol_timeout, diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs index 6b0f5e1726..56921dd949 100644 --- a/proxy/src/auth/backend/hacks.rs +++ b/proxy/src/auth/backend/hacks.rs @@ -18,7 +18,7 @@ use tracing::{info, warn}; /// These properties are benefical for serverless JS workers, so we /// use this mechanism for websocket connections. pub async fn authenticate_cleartext( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, info: ComputeUserInfo, client: &mut stream::PqStream>, secret: AuthSecret, @@ -28,7 +28,7 @@ pub async fn authenticate_cleartext( ctx.set_auth_method(crate::context::AuthMethod::Cleartext); // pause the timer while we communicate with the client - let paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client); + let paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client); let ep = EndpointIdInt::from(&info.endpoint); @@ -60,7 +60,7 @@ pub async fn authenticate_cleartext( /// Similar to [`authenticate_cleartext`], but there's a specific password format, /// and passwords are not yet validated (we don't know how to validate them!) 
pub async fn password_hack_no_authentication( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, info: ComputeUserInfoNoEndpoint, client: &mut stream::PqStream>, ) -> auth::Result { @@ -68,7 +68,7 @@ pub async fn password_hack_no_authentication( ctx.set_auth_method(crate::context::AuthMethod::Cleartext); // pause the timer while we communicate with the client - let _paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client); + let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client); let payload = AuthFlow::new(client) .begin(auth::PasswordHack) diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index 5932e1337c..95f4614736 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -57,7 +57,7 @@ pub fn new_psql_session_id() -> String { } pub(super) async fn authenticate( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, link_uri: &reqwest::Url, client: &mut PqStream, ) -> auth::Result { diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index d06f5614f1..8f4a392131 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -84,7 +84,7 @@ pub fn endpoint_sni( impl ComputeUserInfoMaybeEndpoint { pub fn parse( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, params: &StartupMessageParams, sni: Option<&str>, common_names: Option<&HashSet>, @@ -249,8 +249,8 @@ mod tests { fn parse_bare_minimum() -> anyhow::Result<()> { // According to postgresql, only `user` should be required. let options = StartupMessageParams::new([("user", "john_doe")]); - let mut ctx = RequestMonitoring::test(); - let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + let ctx = RequestMonitoring::test(); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id, None); @@ -264,8 +264,8 @@ mod tests { ("database", "world"), // should be ignored ("foo", "bar"), // should be ignored ]); - let mut ctx = RequestMonitoring::test(); - let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + let ctx = RequestMonitoring::test(); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id, None); @@ -279,9 +279,9 @@ mod tests { let sni = Some("foo.localhost"); let common_names = Some(["localhost".into()].into()); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let user_info = - ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; + ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id.as_deref(), Some("foo")); assert_eq!(user_info.options.get_cache_key("foo"), "foo"); @@ -296,8 +296,8 @@ mod tests { ("options", "-ckey=1 project=bar -c geqo=off"), ]); - let mut ctx = RequestMonitoring::test(); - let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + let ctx = RequestMonitoring::test(); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id.as_deref(), Some("bar")); @@ -311,8 +311,8 @@ mod tests { ("options", "-ckey=1 endpoint=bar -c geqo=off"), ]); - let mut ctx = RequestMonitoring::test(); - let user_info = 
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + let ctx = RequestMonitoring::test(); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id.as_deref(), Some("bar")); @@ -329,8 +329,8 @@ mod tests { ), ]); - let mut ctx = RequestMonitoring::test(); - let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + let ctx = RequestMonitoring::test(); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert!(user_info.endpoint_id.is_none()); @@ -344,8 +344,8 @@ mod tests { ("options", "-ckey=1 endpoint=bar project=foo -c geqo=off"), ]); - let mut ctx = RequestMonitoring::test(); - let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + let ctx = RequestMonitoring::test(); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert!(user_info.endpoint_id.is_none()); @@ -359,9 +359,9 @@ mod tests { let sni = Some("baz.localhost"); let common_names = Some(["localhost".into()].into()); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let user_info = - ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; + ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id.as_deref(), Some("baz")); @@ -374,16 +374,16 @@ mod tests { let common_names = Some(["a.com".into(), "b.com".into()].into()); let sni = Some("p1.a.com"); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let user_info = - ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; + ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.endpoint_id.as_deref(), Some("p1")); let common_names = Some(["a.com".into(), "b.com".into()].into()); let sni = Some("p1.b.com"); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let user_info = - ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; + ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.endpoint_id.as_deref(), Some("p1")); Ok(()) @@ -397,10 +397,9 @@ mod tests { let sni = Some("second.localhost"); let common_names = Some(["localhost".into()].into()); - let mut ctx = RequestMonitoring::test(); - let err = - ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref()) - .expect_err("should fail"); + let ctx = RequestMonitoring::test(); + let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref()) + .expect_err("should fail"); match err { InconsistentProjectNames { domain, option } => { assert_eq!(option, "first"); @@ -417,10 +416,9 @@ mod tests { let sni = Some("project.localhost"); let common_names = Some(["example.com".into()].into()); - let mut ctx = RequestMonitoring::test(); - let err = - ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref()) - .expect_err("should fail"); + let ctx = RequestMonitoring::test(); + let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref()) + .expect_err("should fail"); match err { UnknownCommonName { cn } => { assert_eq!(cn, "localhost"); @@ 
-438,9 +436,9 @@ mod tests { let sni = Some("project.localhost"); let common_names = Some(["localhost".into()].into()); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let user_info = - ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; + ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.endpoint_id.as_deref(), Some("project")); assert_eq!( user_info.options.get_cache_key("project"), diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 59d1ac17f4..acf7b4f6b6 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -27,7 +27,7 @@ pub trait AuthMethod { pub struct Begin; /// Use [SCRAM](crate::scram)-based auth in [`AuthFlow`]. -pub struct Scram<'a>(pub &'a scram::ServerSecret, pub &'a mut RequestMonitoring); +pub struct Scram<'a>(pub &'a scram::ServerSecret, pub &'a RequestMonitoring); impl AuthMethod for Scram<'_> { #[inline(always)] @@ -155,7 +155,7 @@ impl AuthFlow<'_, S, Scram<'_>> { let Scram(secret, ctx) = self.state; // pause the timer while we communicate with the client - let _paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client); + let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client); // Initial client message contains the chosen auth method's name. let msg = self.stream.read_password_message().await?; @@ -168,10 +168,8 @@ impl AuthFlow<'_, S, Scram<'_>> { } match sasl.method { - SCRAM_SHA_256 => ctx.auth_method = Some(crate::context::AuthMethod::ScramSha256), - SCRAM_SHA_256_PLUS => { - ctx.auth_method = Some(crate::context::AuthMethod::ScramSha256Plus) - } + SCRAM_SHA_256 => ctx.set_auth_method(crate::context::AuthMethod::ScramSha256), + SCRAM_SHA_256_PLUS => ctx.set_auth_method(crate::context::AuthMethod::ScramSha256Plus), _ => {} } info!("client chooses {}", sasl.method); diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index d7a3eb9a4d..1038fa5116 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -205,7 +205,7 @@ async fn task_main( const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; async fn ssl_handshake( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, raw_stream: S, tls_config: Arc, tls_server_end_point: TlsServerEndPoint, @@ -256,13 +256,13 @@ async fn ssl_handshake( } async fn handle_client( - mut ctx: RequestMonitoring, + ctx: RequestMonitoring, dest_suffix: Arc, tls_config: Arc, tls_server_end_point: TlsServerEndPoint, stream: impl AsyncRead + AsyncWrite + Unpin, ) -> anyhow::Result<()> { - let mut tls_stream = ssl_handshake(&mut ctx, stream, tls_config, tls_server_end_point).await?; + let mut tls_stream = ssl_handshake(&ctx, stream, tls_config, tls_server_end_point).await?; // Cut off first part of the SNI domain // We receive required destination details in the format of diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index c1fd6dfd80..b44e0ddd2f 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -5,6 +5,7 @@ use aws_config::meta::region::RegionProviderChain; use aws_config::profile::ProfileFileCredentialsProvider; use aws_config::provider_config::ProviderConfig; use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider; +use aws_config::Region; use futures::future::Either; use proxy::auth; use proxy::auth::backend::AuthRateLimiter; @@ -290,9 +291,10 @@ async fn main() -> anyhow::Result<()> { let config = build_config(&args)?; 
info!("Authentication backend: {}", config.auth_backend); - info!("Using region: {}", config.aws_region); + info!("Using region: {}", args.aws_region); - let region_provider = RegionProviderChain::default_provider().or_else(&*config.aws_region); // Replace with your Redis region if needed + let region_provider = + RegionProviderChain::default_provider().or_else(Region::new(args.aws_region.clone())); let provider_conf = ProviderConfig::without_region().with_region(region_provider.region().await); let aws_credentials_provider = { @@ -318,7 +320,7 @@ async fn main() -> anyhow::Result<()> { }; let elasticache_credentials_provider = Arc::new(elasticache::CredentialsProvider::new( elasticache::AWSIRSAConfig::new( - config.aws_region.clone(), + args.aws_region.clone(), args.redis_cluster_name, args.redis_user_id, ), @@ -376,11 +378,14 @@ async fn main() -> anyhow::Result<()> { let cancel_map = CancelMap::default(); + let redis_rps_limit = Vec::leak(args.redis_rps_limit.clone()); + RateBucketInfo::validate(redis_rps_limit)?; + let redis_publisher = match ®ional_redis_client { Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new( redis_publisher.clone(), args.region.clone(), - &config.redis_rps_limit, + redis_rps_limit, )?))), None => None, }; @@ -656,7 +661,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { )?; let http_config = HttpConfig { - request_timeout: args.sql_over_http.sql_over_http_timeout, pool_options: GlobalConnPoolOptions { max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint, gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch, @@ -676,9 +680,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet, }; - let mut redis_rps_limit = args.redis_rps_limit.clone(); - RateBucketInfo::validate(&mut redis_rps_limit)?; - let config = Box::leak(Box::new(ProxyConfig { tls_config, auth_backend, @@ -687,11 +688,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { http_config, authentication_config, require_client_ip: args.require_client_ip, - disable_ip_check_for_http: args.disable_ip_check_for_http, - redis_rps_limit, handshake_timeout: args.handshake_timeout, region: args.region.clone(), - aws_region: args.aws_region.clone(), wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?, connect_compute_locks, connect_to_compute_retry_config: config::RetryConfig::parse( diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index 4bc10a6020..8c851790c2 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -68,7 +68,7 @@ impl EndpointsCache { ready: AtomicBool::new(false), } } - pub async fn is_valid(&self, ctx: &mut RequestMonitoring, endpoint: &EndpointId) -> bool { + pub async fn is_valid(&self, ctx: &RequestMonitoring, endpoint: &EndpointId) -> bool { if !self.ready.load(Ordering::Acquire) { return true; } diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index f91693c704..21687160ea 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -288,12 +288,12 @@ impl ConnCfg { /// Connect to a corresponding compute node. 
pub async fn connect( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, allow_self_signed_compute: bool, aux: MetricsAuxInfo, timeout: Duration, ) -> Result { - let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let (socket_addr, stream, host) = self.connect_raw(timeout).await?; drop(pause); @@ -316,14 +316,14 @@ impl ConnCfg { )?; // connect_raw() will not use TLS if sslmode is "disable" - let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let (client, connection) = self.0.connect_raw(stream, tls).await?; drop(pause); tracing::Span::current().record("pid", tracing::field::display(client.get_process_id())); let stream = connection.stream.into_inner(); info!( - cold_start_info = ctx.cold_start_info.as_str(), + cold_start_info = ctx.cold_start_info().as_str(), "connected to compute node at {host} ({socket_addr}) sslmode={:?}", self.0.get_ssl_mode() ); @@ -342,7 +342,7 @@ impl ConnCfg { params, cancel_closure, aux, - _guage: Metrics::get().proxy.db_connections.guard(ctx.protocol), + _guage: Metrics::get().proxy.db_connections.guard(ctx.protocol()), }; Ok(connection) diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 6504919760..1412095505 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -31,11 +31,8 @@ pub struct ProxyConfig { pub http_config: HttpConfig, pub authentication_config: AuthenticationConfig, pub require_client_ip: bool, - pub disable_ip_check_for_http: bool, - pub redis_rps_limit: Vec, pub region: String, pub handshake_timeout: Duration, - pub aws_region: String, pub wake_compute_retry_config: RetryConfig, pub connect_compute_locks: ApiLocks, pub connect_to_compute_retry_config: RetryConfig, @@ -55,7 +52,6 @@ pub struct TlsConfig { } pub struct HttpConfig { - pub request_timeout: tokio::time::Duration, pub pool_options: GlobalConnPoolOptions, pub cancel_set: CancelSet, pub client_conn_threshold: u64, diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 7a9637066f..15fc0134b3 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -292,7 +292,7 @@ pub struct NodeInfo { impl NodeInfo { pub async fn connect( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, timeout: Duration, ) -> Result { self.config @@ -330,20 +330,20 @@ pub(crate) trait Api { /// We still have to mock the scram to avoid leaking information that user doesn't exist. async fn get_role_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result; async fn get_allowed_ips_and_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError>; /// Wake up the compute node and return the corresponding connection info. 
async fn wake_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result; } @@ -363,7 +363,7 @@ pub enum ConsoleBackend { impl Api for ConsoleBackend { async fn get_role_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { use ConsoleBackend::*; @@ -378,7 +378,7 @@ impl Api for ConsoleBackend { async fn get_allowed_ips_and_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError> { use ConsoleBackend::*; @@ -393,7 +393,7 @@ impl Api for ConsoleBackend { async fn wake_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { use ConsoleBackend::*; diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs index cfe491f2aa..2093da7562 100644 --- a/proxy/src/console/provider/mock.rs +++ b/proxy/src/console/provider/mock.rs @@ -158,7 +158,7 @@ impl super::Api for Api { #[tracing::instrument(skip_all)] async fn get_role_secret( &self, - _ctx: &mut RequestMonitoring, + _ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { Ok(CachedRoleSecret::new_uncached( @@ -168,7 +168,7 @@ impl super::Api for Api { async fn get_allowed_ips_and_secret( &self, - _ctx: &mut RequestMonitoring, + _ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { Ok(( @@ -182,7 +182,7 @@ impl super::Api for Api { #[tracing::instrument(skip_all)] async fn wake_compute( &self, - _ctx: &mut RequestMonitoring, + _ctx: &RequestMonitoring, _user_info: &ComputeUserInfo, ) -> Result { self.do_wake_compute().map_ok(Cached::new_uncached).await diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 768cd2fdfa..7eda238b66 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -57,7 +57,7 @@ impl Api { async fn do_get_auth_info( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { if !self @@ -69,7 +69,7 @@ impl Api { info!("endpoint is not valid, skipping the request"); return Ok(AuthInfo::default()); } - let request_id = ctx.session_id.to_string(); + let request_id = ctx.session_id().to_string(); let application_name = ctx.console_application_name(); async { let request = self @@ -77,7 +77,7 @@ impl Api { .get("proxy_get_role_secret") .header("X-Request-ID", &request_id) .header("Authorization", format!("Bearer {}", &self.jwt)) - .query(&[("session_id", ctx.session_id)]) + .query(&[("session_id", ctx.session_id())]) .query(&[ ("application_name", application_name.as_str()), ("project", user_info.endpoint.as_str()), @@ -87,7 +87,7 @@ impl Api { info!(url = request.url().as_str(), "sending http request"); let start = Instant::now(); - let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Cplane); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); let response = self.endpoint.execute(request).await?; drop(pause); info!(duration = ?start.elapsed(), "received http response"); @@ -130,10 +130,10 @@ impl Api { async fn do_wake_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { - let request_id = ctx.session_id.to_string(); + let request_id = ctx.session_id().to_string(); let application_name = ctx.console_application_name(); async { let mut 
request_builder = self @@ -141,7 +141,7 @@ impl Api { .get("proxy_wake_compute") .header("X-Request-ID", &request_id) .header("Authorization", format!("Bearer {}", &self.jwt)) - .query(&[("session_id", ctx.session_id)]) + .query(&[("session_id", ctx.session_id())]) .query(&[ ("application_name", application_name.as_str()), ("project", user_info.endpoint.as_str()), @@ -156,7 +156,7 @@ impl Api { info!(url = request.url().as_str(), "sending http request"); let start = Instant::now(); - let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Cplane); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); let response = self.endpoint.execute(request).await?; drop(pause); info!(duration = ?start.elapsed(), "received http response"); @@ -192,7 +192,7 @@ impl super::Api for Api { #[tracing::instrument(skip_all)] async fn get_role_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { let normalized_ep = &user_info.endpoint.normalize(); @@ -226,7 +226,7 @@ impl super::Api for Api { async fn get_allowed_ips_and_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { let normalized_ep = &user_info.endpoint.normalize(); @@ -268,7 +268,7 @@ impl super::Api for Api { #[tracing::instrument(skip_all)] async fn wake_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { let key = user_info.endpoint_cache_key(); diff --git a/proxy/src/context.rs b/proxy/src/context.rs index ff79ba8275..e925f67233 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -7,13 +7,14 @@ use smol_str::SmolStr; use std::net::IpAddr; use tokio::sync::mpsc; use tracing::{field::display, info, info_span, Span}; +use try_lock::TryLock; use uuid::Uuid; use crate::{ console::messages::{ColdStartInfo, MetricsAuxInfo}, error::ErrorKind, intern::{BranchIdInt, ProjectIdInt}, - metrics::{ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol}, + metrics::{ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol, Waiting}, DbName, EndpointId, RoleName, }; @@ -28,7 +29,15 @@ pub static LOG_CHAN_DISCONNECT: OnceCell> /// /// This data should **not** be used for connection logic, only for observability and limiting purposes. /// All connection logic should instead use strongly typed state machines, not a bunch of Options. -pub struct RequestMonitoring { +pub struct RequestMonitoring( + /// To allow easier use of the ctx object, we have interior mutability. + /// I would typically use a RefCell but that would break the `Send` requirements + /// so we need something with thread-safety. `TryLock` is a cheap alternative + /// that offers similar semantics to a `RefCell` but with synchronisation. 
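Editor's note: this doc comment is the crux of the proxy change. `RequestMonitoring` stops being passed as `&mut` and instead wraps its mutable state in `try_lock::TryLock`, which, unlike `RefCell`, is `Sync`, so a shared `&RequestMonitoring` can cross await points and threads while each accessor takes the lock only for the duration of its body. A minimal sketch of the pattern, assuming the `try-lock` crate; `Ctx`/`Inner` and their fields are illustrative:

```rust
use try_lock::TryLock;

struct Inner {
    rejected: Option<bool>,
    success: bool,
}

// Callers hold `&Ctx`; mutation goes through a short-lived TryLock guard.
struct Ctx(TryLock<Inner>);

impl Ctx {
    fn new() -> Self {
        Ctx(TryLock::new(Inner {
            rejected: None,
            success: false,
        }))
    }

    // `&self` instead of `&mut self`: the lock is taken only inside the
    // accessor, so a guard is never held across an `.await` point.
    fn set_rejected(&self, rejected: bool) {
        let mut this = self.0.try_lock().expect("should not deadlock");
        this.rejected = Some(rejected);
    }

    fn set_success(&self) {
        self.0.try_lock().expect("should not deadlock").success = true;
    }

    fn success(&self) -> bool {
        self.0.try_lock().expect("should not deadlock").success
    }
}

fn main() {
    let ctx = Ctx::new(); // shared, not `mut`
    ctx.set_rejected(false);
    ctx.set_success();
    assert!(ctx.success());
}
```

The `expect("should not deadlock")` encodes the invariant that no guard is ever held across an `.await`; if that invariant is ever broken, the failure is loud rather than a silent stall.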
+ TryLock, +); + +struct RequestMonitoringInner { pub peer_addr: IpAddr, pub session_id: Uuid, pub protocol: Protocol, @@ -85,7 +94,7 @@ impl RequestMonitoring { role = tracing::field::Empty, ); - Self { + let inner = RequestMonitoringInner { peer_addr, session_id, protocol, @@ -110,7 +119,9 @@ impl RequestMonitoring { disconnect_sender: LOG_CHAN_DISCONNECT.get().and_then(|tx| tx.upgrade()), latency_timer: LatencyTimer::new(protocol), disconnect_timestamp: None, - } + }; + + Self(TryLock::new(inner)) } #[cfg(test)] @@ -119,48 +130,177 @@ impl RequestMonitoring { } pub fn console_application_name(&self) -> String { + let this = self.0.try_lock().expect("should not deadlock"); format!( "{}/{}", - self.application.as_deref().unwrap_or_default(), - self.protocol + this.application.as_deref().unwrap_or_default(), + this.protocol ) } - pub fn set_rejected(&mut self, rejected: bool) { - self.rejected = Some(rejected); + pub fn set_rejected(&self, rejected: bool) { + let mut this = self.0.try_lock().expect("should not deadlock"); + this.rejected = Some(rejected); } - pub fn set_cold_start_info(&mut self, info: ColdStartInfo) { + pub fn set_cold_start_info(&self, info: ColdStartInfo) { + self.0 + .try_lock() + .expect("should not deadlock") + .set_cold_start_info(info); + } + + pub fn set_db_options(&self, options: StartupMessageParams) { + let mut this = self.0.try_lock().expect("should not deadlock"); + this.set_application(options.get("application_name").map(SmolStr::from)); + if let Some(user) = options.get("user") { + this.set_user(user.into()); + } + if let Some(dbname) = options.get("database") { + this.set_dbname(dbname.into()); + } + + this.pg_options = Some(options); + } + + pub fn set_project(&self, x: MetricsAuxInfo) { + let mut this = self.0.try_lock().expect("should not deadlock"); + if this.endpoint_id.is_none() { + this.set_endpoint_id(x.endpoint_id.as_str().into()) + } + this.branch = Some(x.branch_id); + this.project = Some(x.project_id); + this.set_cold_start_info(x.cold_start_info); + } + + pub fn set_project_id(&self, project_id: ProjectIdInt) { + let mut this = self.0.try_lock().expect("should not deadlock"); + this.project = Some(project_id); + } + + pub fn set_endpoint_id(&self, endpoint_id: EndpointId) { + self.0 + .try_lock() + .expect("should not deadlock") + .set_endpoint_id(endpoint_id); + } + + pub fn set_dbname(&self, dbname: DbName) { + self.0 + .try_lock() + .expect("should not deadlock") + .set_dbname(dbname); + } + + pub fn set_user(&self, user: RoleName) { + self.0 + .try_lock() + .expect("should not deadlock") + .set_user(user); + } + + pub fn set_auth_method(&self, auth_method: AuthMethod) { + let mut this = self.0.try_lock().expect("should not deadlock"); + this.auth_method = Some(auth_method); + } + + pub fn has_private_peer_addr(&self) -> bool { + self.0 + .try_lock() + .expect("should not deadlock") + .has_private_peer_addr() + } + + pub fn set_error_kind(&self, kind: ErrorKind) { + let mut this = self.0.try_lock().expect("should not deadlock"); + // Do not record errors from the private address to metrics. 
+ if !this.has_private_peer_addr() { + Metrics::get().proxy.errors_total.inc(kind); + } + if let Some(ep) = &this.endpoint_id { + let metric = &Metrics::get().proxy.endpoints_affected_by_errors; + let label = metric.with_labels(kind); + metric.get_metric(label).measure(ep); + } + this.error_kind = Some(kind); + } + + pub fn set_success(&self) { + let mut this = self.0.try_lock().expect("should not deadlock"); + this.success = true; + } + + pub fn log_connect(&self) { + self.0 + .try_lock() + .expect("should not deadlock") + .log_connect(); + } + + pub fn protocol(&self) -> Protocol { + self.0.try_lock().expect("should not deadlock").protocol + } + + pub fn span(&self) -> Span { + self.0.try_lock().expect("should not deadlock").span.clone() + } + + pub fn session_id(&self) -> Uuid { + self.0.try_lock().expect("should not deadlock").session_id + } + + pub fn peer_addr(&self) -> IpAddr { + self.0.try_lock().expect("should not deadlock").peer_addr + } + + pub fn cold_start_info(&self) -> ColdStartInfo { + self.0 + .try_lock() + .expect("should not deadlock") + .cold_start_info + } + + pub fn latency_timer_pause(&self, waiting_for: Waiting) -> LatencyTimerPause { + LatencyTimerPause { + ctx: self, + start: tokio::time::Instant::now(), + waiting_for, + } + } + + pub fn success(&self) { + self.0 + .try_lock() + .expect("should not deadlock") + .latency_timer + .success() + } +} + +pub struct LatencyTimerPause<'a> { + ctx: &'a RequestMonitoring, + start: tokio::time::Instant, + waiting_for: Waiting, +} + +impl Drop for LatencyTimerPause<'_> { + fn drop(&mut self) { + self.ctx + .0 + .try_lock() + .expect("should not deadlock") + .latency_timer + .unpause(self.start, self.waiting_for); + } +} + +impl RequestMonitoringInner { + fn set_cold_start_info(&mut self, info: ColdStartInfo) { self.cold_start_info = info; self.latency_timer.cold_start_info(info); } - pub fn set_db_options(&mut self, options: StartupMessageParams) { - self.set_application(options.get("application_name").map(SmolStr::from)); - if let Some(user) = options.get("user") { - self.set_user(user.into()); - } - if let Some(dbname) = options.get("database") { - self.set_dbname(dbname.into()); - } - - self.pg_options = Some(options); - } - - pub fn set_project(&mut self, x: MetricsAuxInfo) { - if self.endpoint_id.is_none() { - self.set_endpoint_id(x.endpoint_id.as_str().into()) - } - self.branch = Some(x.branch_id); - self.project = Some(x.project_id); - self.set_cold_start_info(x.cold_start_info); - } - - pub fn set_project_id(&mut self, project_id: ProjectIdInt) { - self.project = Some(project_id); - } - - pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) { + fn set_endpoint_id(&mut self, endpoint_id: EndpointId) { if self.endpoint_id.is_none() { self.span.record("ep", display(&endpoint_id)); let metric = &Metrics::get().proxy.connecting_endpoints; @@ -176,44 +316,23 @@ impl RequestMonitoring { } } - pub fn set_dbname(&mut self, dbname: DbName) { + fn set_dbname(&mut self, dbname: DbName) { self.dbname = Some(dbname); } - pub fn set_user(&mut self, user: RoleName) { + fn set_user(&mut self, user: RoleName) { self.span.record("role", display(&user)); self.user = Some(user); } - pub fn set_auth_method(&mut self, auth_method: AuthMethod) { - self.auth_method = Some(auth_method); - } - - pub fn has_private_peer_addr(&self) -> bool { + fn has_private_peer_addr(&self) -> bool { match self.peer_addr { IpAddr::V4(ip) => ip.is_private(), _ => false, } } - pub fn set_error_kind(&mut self, kind: ErrorKind) { - // Do not record errors 
from the private address to metrics. - if !self.has_private_peer_addr() { - Metrics::get().proxy.errors_total.inc(kind); - } - if let Some(ep) = &self.endpoint_id { - let metric = &Metrics::get().proxy.endpoints_affected_by_errors; - let label = metric.with_labels(kind); - metric.get_metric(label).measure(ep); - } - self.error_kind = Some(kind); - } - - pub fn set_success(&mut self) { - self.success = true; - } - - pub fn log_connect(&mut self) { + fn log_connect(&mut self) { let outcome = if self.success { ConnectOutcome::Success } else { @@ -256,7 +375,7 @@ impl RequestMonitoring { } } -impl Drop for RequestMonitoring { +impl Drop for RequestMonitoringInner { fn drop(&mut self) { if self.sender.is_some() { self.log_connect(); diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 543a458274..bb02a476fc 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -23,7 +23,7 @@ use utils::backoff; use crate::{config::remote_storage_from_toml, context::LOG_CHAN_DISCONNECT}; -use super::{RequestMonitoring, LOG_CHAN}; +use super::{RequestMonitoringInner, LOG_CHAN}; #[derive(clap::Args, Clone, Debug)] pub struct ParquetUploadArgs { @@ -118,8 +118,8 @@ impl<'a> serde::Serialize for Options<'a> { } } -impl From<&RequestMonitoring> for RequestData { - fn from(value: &RequestMonitoring) -> Self { +impl From<&RequestMonitoringInner> for RequestData { + fn from(value: &RequestMonitoringInner) -> Self { Self { session_id: value.session_id, peer_addr: value.peer_addr.to_string(), diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index db25ac0311..0167553e30 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -370,6 +370,7 @@ pub struct CancellationRequest { pub kind: CancellationOutcome, } +#[derive(Clone, Copy)] pub enum Waiting { Cplane, Client, @@ -398,12 +399,6 @@ pub struct LatencyTimer { outcome: ConnectOutcome, } -pub struct LatencyTimerPause<'a> { - timer: &'a mut LatencyTimer, - start: time::Instant, - waiting_for: Waiting, -} - impl LatencyTimer { pub fn new(protocol: Protocol) -> Self { Self { @@ -417,11 +412,13 @@ impl LatencyTimer { } } - pub fn pause(&mut self, waiting_for: Waiting) -> LatencyTimerPause<'_> { - LatencyTimerPause { - timer: self, - start: Instant::now(), - waiting_for, + pub fn unpause(&mut self, start: Instant, waiting_for: Waiting) { + let dur = start.elapsed(); + match waiting_for { + Waiting::Cplane => self.accumulated.cplane += dur, + Waiting::Client => self.accumulated.client += dur, + Waiting::Compute => self.accumulated.compute += dur, + Waiting::RetryTimeout => self.accumulated.retry += dur, } } @@ -438,18 +435,6 @@ impl LatencyTimer { } } -impl Drop for LatencyTimerPause<'_> { - fn drop(&mut self) { - let dur = self.start.elapsed(); - match self.waiting_for { - Waiting::Cplane => self.timer.accumulated.cplane += dur, - Waiting::Client => self.timer.accumulated.client += dur, - Waiting::Compute => self.timer.accumulated.compute += dur, - Waiting::RetryTimeout => self.timer.accumulated.retry += dur, - } - } -} - #[derive(FixedCardinalityLabel, Clone, Copy, Debug)] pub enum ConnectOutcome { Success, diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 3edefcf21a..2182f38fe7 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -113,18 +113,18 @@ pub async fn task_main( } }; - let mut ctx = RequestMonitoring::new( + let ctx = RequestMonitoring::new( session_id, peer_addr, crate::metrics::Protocol::Tcp, &config.region, ); - let span = ctx.span.clone(); + let span = ctx.span(); let startup = 
Box::pin( handle_client( config, - &mut ctx, + &ctx, cancellation_handler, socket, ClientMode::Tcp, @@ -240,7 +240,7 @@ impl ReportableError for ClientRequestError { pub async fn handle_client( config: &'static ProxyConfig, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, cancellation_handler: Arc, stream: S, mode: ClientMode, @@ -248,25 +248,25 @@ pub async fn handle_client( conn_gauge: NumClientConnectionsGuard<'static>, ) -> Result>, ClientRequestError> { info!( - protocol = %ctx.protocol, + protocol = %ctx.protocol(), "handling interactive connection from client" ); let metrics = &Metrics::get().proxy; - let proto = ctx.protocol; + let proto = ctx.protocol(); let _request_gauge = metrics.connection_requests.guard(proto); let tls = config.tls_config.as_ref(); let record_handshake_error = !ctx.has_private_peer_addr(); - let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Client); - let do_handshake = handshake(stream, mode.handshake_tls(tls), record_handshake_error); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client); + let do_handshake = handshake(ctx, stream, mode.handshake_tls(tls), record_handshake_error); let (mut stream, params) = match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? { HandshakeData::Startup(stream, params) => (stream, params), HandshakeData::Cancel(cancel_key_data) => { return Ok(cancellation_handler - .cancel_session(cancel_key_data, ctx.session_id) + .cancel_session(cancel_key_data, ctx.session_id()) .await .map(|()| None)?) } diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 82180aaee3..f38e43ba5a 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -46,7 +46,7 @@ pub trait ConnectMechanism { type Error: From; async fn connect_once( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, node_info: &console::CachedNodeInfo, timeout: time::Duration, ) -> Result; @@ -58,7 +58,7 @@ pub trait ConnectMechanism { pub trait ComputeConnectBackend { async fn wake_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, ) -> Result; fn get_keys(&self) -> Option<&ComputeCredentialKeys>; @@ -81,7 +81,7 @@ impl ConnectMechanism for TcpMechanism<'_> { #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] async fn connect_once( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, node_info: &console::CachedNodeInfo, timeout: time::Duration, ) -> Result { @@ -98,7 +98,7 @@ impl ConnectMechanism for TcpMechanism<'_> { /// Try to connect to the compute node, retrying if necessary. 
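Editor's note: the `ctx.latency_timer_pause(...)` calls introduced above return a guard whose `Drop` impl books the time spent waiting (on the client, compute, control plane, or retry sleeps) into the matching accumulator via `unpause`. A condensed, self-contained sketch of that accounting; it keeps the pre-patch `&mut LatencyTimer` shape for brevity, whereas the patch routes the same bookkeeping through the `RequestMonitoring` lock:

```rust
use std::time::{Duration, Instant};

#[derive(Clone, Copy)]
enum Waiting {
    Compute,
    RetryTimeout,
}

#[derive(Default)]
struct Accumulated {
    compute: Duration,
    retry: Duration,
}

struct LatencyTimer {
    accumulated: Accumulated,
}

impl LatencyTimer {
    fn pause(&mut self, waiting_for: Waiting) -> LatencyTimerPause<'_> {
        LatencyTimerPause {
            timer: self,
            start: Instant::now(),
            waiting_for,
        }
    }

    fn unpause(&mut self, start: Instant, waiting_for: Waiting) {
        let dur = start.elapsed();
        match waiting_for {
            Waiting::Compute => self.accumulated.compute += dur,
            Waiting::RetryTimeout => self.accumulated.retry += dur,
        }
    }
}

struct LatencyTimerPause<'a> {
    timer: &'a mut LatencyTimer,
    start: Instant,
    waiting_for: Waiting,
}

impl Drop for LatencyTimerPause<'_> {
    fn drop(&mut self) {
        // Record the paused span when the guard goes out of scope.
        let (start, waiting_for) = (self.start, self.waiting_for);
        self.timer.unpause(start, waiting_for);
    }
}

fn main() {
    let mut timer = LatencyTimer {
        accumulated: Accumulated::default(),
    };
    {
        let _pause = timer.pause(Waiting::RetryTimeout);
        std::thread::sleep(Duration::from_millis(10));
    } // guard dropped: ~10ms booked against the retry bucket
    assert!(timer.accumulated.retry >= Duration::from_millis(10));
    assert_eq!(timer.accumulated.compute, Duration::ZERO);
}
```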
#[tracing::instrument(skip_all)] pub async fn connect_to_compute( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, mechanism: &M, user_info: &B, allow_self_signed_compute: bool, @@ -126,7 +126,7 @@ where .await { Ok(res) => { - ctx.latency_timer.success(); + ctx.success(); Metrics::get().proxy.retries_metric.observe( RetriesMetricGroup { outcome: ConnectOutcome::Success, @@ -178,7 +178,7 @@ where .await { Ok(res) => { - ctx.latency_timer.success(); + ctx.success(); Metrics::get().proxy.retries_metric.observe( RetriesMetricGroup { outcome: ConnectOutcome::Success, @@ -209,9 +209,7 @@ where let wait_duration = retry_after(num_retries, connect_to_compute_retry_config); num_retries += 1; - let pause = ctx - .latency_timer - .pause(crate::metrics::Waiting::RetryTimeout); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::RetryTimeout); time::sleep(wait_duration).await; drop(pause); } diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs index d488aea927..c65a5558d9 100644 --- a/proxy/src/proxy/handshake.rs +++ b/proxy/src/proxy/handshake.rs @@ -10,6 +10,7 @@ use tracing::{info, warn}; use crate::{ auth::endpoint_sni, config::{TlsConfig, PG_ALPN_PROTOCOL}, + context::RequestMonitoring, error::ReportableError, metrics::Metrics, proxy::ERR_INSECURE_CONNECTION, @@ -67,6 +68,7 @@ pub enum HandshakeData { /// we also take an extra care of propagating only the select handshake errors to client. #[tracing::instrument(skip_all)] pub async fn handshake( + ctx: &RequestMonitoring, stream: S, mut tls: Option<&TlsConfig>, record_handshake_error: bool, @@ -80,8 +82,6 @@ pub async fn handshake( let mut stream = PqStream::new(Stream::from_raw(stream)); loop { let msg = stream.read_startup_packet().await?; - info!("received {msg:?}"); - use FeStartupPacket::*; match msg { SslRequest { direct } => match stream.get_ref() { @@ -145,16 +145,20 @@ pub async fn handshake( let conn_info = tls_stream.get_ref().1; + // try parse endpoint + let ep = conn_info + .server_name() + .and_then(|sni| endpoint_sni(sni, &tls.common_names).ok().flatten()); + if let Some(ep) = ep { + ctx.set_endpoint_id(ep); + } + // check the ALPN, if exists, as required. match conn_info.alpn_protocol() { None | Some(PG_ALPN_PROTOCOL) => {} Some(other) => { - // try parse ep for better error - let ep = conn_info.server_name().and_then(|sni| { - endpoint_sni(sni, &tls.common_names).ok().flatten() - }); let alpn = String::from_utf8_lossy(other); - warn!(?ep, %alpn, "unexpected ALPN"); + warn!(%alpn, "unexpected ALPN"); return Err(HandshakeError::ProtocolViolation); } } @@ -198,7 +202,12 @@ pub async fn handshake( .await?; } - info!(?version, session_type = "normal", "successful handshake"); + info!( + ?version, + ?params, + session_type = "normal", + "successful handshake" + ); break Ok(HandshakeData::Startup(stream, params)); } // downgrade protocol version diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 5186a9e1b0..d8308c4f2a 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -155,7 +155,7 @@ impl TestAuth for Scram { stream: &mut PqStream>, ) -> anyhow::Result<()> { let outcome = auth::AuthFlow::new(stream) - .begin(auth::Scram(&self.0, &mut RequestMonitoring::test())) + .begin(auth::Scram(&self.0, &RequestMonitoring::test())) .await? 
.authenticate() .await?; @@ -175,10 +175,11 @@ async fn dummy_proxy( auth: impl TestAuth + Send, ) -> anyhow::Result<()> { let (client, _) = read_proxy_protocol(client).await?; - let mut stream = match handshake(client, tls.as_ref(), false).await? { - HandshakeData::Startup(stream, _) => stream, - HandshakeData::Cancel(_) => bail!("cancellation not supported"), - }; + let mut stream = + match handshake(&RequestMonitoring::test(), client, tls.as_ref(), false).await? { + HandshakeData::Startup(stream, _) => stream, + HandshakeData::Cancel(_) => bail!("cancellation not supported"), + }; auth.authenticate(&mut stream).await?; @@ -457,7 +458,7 @@ impl ConnectMechanism for TestConnectMechanism { async fn connect_once( &self, - _ctx: &mut RequestMonitoring, + _ctx: &RequestMonitoring, _node_info: &console::CachedNodeInfo, _timeout: std::time::Duration, ) -> Result { @@ -565,7 +566,7 @@ fn helper_create_connect_info( async fn connect_to_compute_success() { let _ = env_logger::try_init(); use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); let config = RetryConfig { @@ -573,7 +574,7 @@ async fn connect_to_compute_success() { max_retries: 5, backoff_factor: 2.0, }; - connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) + connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ -583,7 +584,7 @@ async fn connect_to_compute_success() { async fn connect_to_compute_retry() { let _ = env_logger::try_init(); use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); let config = RetryConfig { @@ -591,7 +592,7 @@ async fn connect_to_compute_retry() { max_retries: 5, backoff_factor: 2.0, }; - connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) + connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ -602,7 +603,7 @@ async fn connect_to_compute_retry() { async fn connect_to_compute_non_retry_1() { let _ = env_logger::try_init(); use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Fail]); let user_info = helper_create_connect_info(&mechanism); let config = RetryConfig { @@ -610,7 +611,7 @@ async fn connect_to_compute_non_retry_1() { max_retries: 5, backoff_factor: 2.0, }; - connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) + connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) .await .unwrap_err(); mechanism.verify(); @@ -621,7 +622,7 @@ async fn connect_to_compute_non_retry_1() { async fn connect_to_compute_non_retry_2() { let _ = env_logger::try_init(); use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Fail, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); let config = RetryConfig { @@ -629,7 +630,7 @@ async fn connect_to_compute_non_retry_2() { max_retries: 5, backoff_factor: 2.0, }; - connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) + connect_to_compute(&ctx, &mechanism, &user_info, false, 
config, config) .await .unwrap(); mechanism.verify(); @@ -641,7 +642,7 @@ async fn connect_to_compute_non_retry_3() { let _ = env_logger::try_init(); tokio::time::pause(); use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Retry, Retry, Retry, Retry, Retry]); let user_info = helper_create_connect_info(&mechanism); @@ -656,7 +657,7 @@ async fn connect_to_compute_non_retry_3() { backoff_factor: 2.0, }; connect_to_compute( - &mut ctx, + &ctx, &mechanism, &user_info, false, @@ -673,7 +674,7 @@ async fn connect_to_compute_non_retry_3() { async fn wake_retry() { let _ = env_logger::try_init(); use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![WakeRetry, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); let config = RetryConfig { @@ -681,7 +682,7 @@ async fn wake_retry() { max_retries: 5, backoff_factor: 2.0, }; - connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) + connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ -692,7 +693,7 @@ async fn wake_retry() { async fn wake_non_retry() { let _ = env_logger::try_init(); use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![WakeRetry, WakeFail]); let user_info = helper_create_connect_info(&mechanism); let config = RetryConfig { @@ -700,7 +701,7 @@ async fn wake_non_retry() { max_retries: 5, backoff_factor: 2.0, }; - connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) + connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) .await .unwrap_err(); mechanism.verify(); diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index d96dd0947b..c8ec2b2db6 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -34,9 +34,14 @@ async fn proxy_mitm( tokio::spawn(async move { // begin handshake with end_server let end_server = connect_tls(server2, client_config2.make_tls_connect().unwrap()).await; - let (end_client, startup) = match handshake(client1, Some(&server_config1), false) - .await - .unwrap() + let (end_client, startup) = match handshake( + &RequestMonitoring::test(), + client1, + Some(&server_config1), + false, + ) + .await + .unwrap() { HandshakeData::Startup(stream, params) => (stream, params), HandshakeData::Cancel(_) => panic!("cancellation not supported"), diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index fef349aac0..5b06e8f054 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -14,7 +14,7 @@ use super::connect_compute::ComputeConnectBackend; pub async fn wake_compute( num_retries: &mut u32, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, api: &B, config: RetryConfig, ) -> Result { @@ -52,9 +52,7 @@ pub async fn wake_compute( let wait_duration = retry_after(*num_retries, config); *num_retries += 1; - let pause = ctx - .latency_timer - .pause(crate::metrics::Waiting::RetryTimeout); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::RetryTimeout); tokio::time::sleep(wait_duration).await; drop(pause); } diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index efa999ed7d..115bef7375 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs 
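Editor's note: the retry tests above drive `connect_to_compute`/`wake_compute` with a `RetryConfig { max_retries: 5, backoff_factor: 2.0, .. }` and rely on `tokio::time::pause()` so the backoff sleeps complete instantly. A hypothetical sketch of that combination; the real `retry_after` lives elsewhere in the proxy crate, so the exponential formula and the `base_delay` field here are assumptions, and the test requires tokio's `test-util` feature:

```rust
use std::time::Duration;

#[derive(Clone, Copy)]
struct RetryConfig {
    base_delay: Duration,
    max_retries: u32,
    backoff_factor: f64,
}

// Assumed schedule: first wait = base_delay, each later wait multiplied by
// backoff_factor. This is an illustration, not the proxy's actual formula.
fn retry_after(num_retries: u32, config: RetryConfig) -> Duration {
    config
        .base_delay
        .mul_f64(config.backoff_factor.powi(num_retries.saturating_sub(1) as i32))
}

#[tokio::test]
async fn backoff_is_instant_under_paused_time() {
    let config = RetryConfig {
        base_delay: Duration::from_secs(1),
        max_retries: 5,
        backoff_factor: 2.0,
    };

    // With the clock paused, sleeps auto-advance: the waits below are virtual.
    tokio::time::pause();
    let started = tokio::time::Instant::now();
    for attempt in 1..=config.max_retries {
        tokio::time::sleep(retry_after(attempt, config)).await;
    }
    // 1 + 2 + 4 + 8 + 16 virtual seconds, near-zero wall-clock time.
    assert!(started.elapsed() >= Duration::from_secs(31));
}
```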
@@ -334,7 +334,7 @@ async fn request_handler( &config.region, ); - let span = ctx.span.clone(); + let span = ctx.span(); info!(parent: &span, "performing websocket upgrade"); let (response, websocket) = framed_websockets::upgrade::upgrade(&mut request) @@ -367,7 +367,7 @@ async fn request_handler( crate::metrics::Protocol::Http, &config.region, ); - let span = ctx.span.clone(); + let span = ctx.span(); sql_over_http::handle(config, ctx, request, backend, http_cancellation_token) .instrument(span) diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 3b86c1838c..80d46c67eb 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -35,15 +35,15 @@ pub struct PoolingBackend { impl PoolingBackend { pub async fn authenticate( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, config: &AuthenticationConfig, conn_info: &ConnInfo, ) -> Result { let user_info = conn_info.user_info.clone(); let backend = self.config.auth_backend.as_ref().map(|_| user_info.clone()); let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?; - if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) { - return Err(AuthError::ip_address_not_allowed(ctx.peer_addr)); + if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) { + return Err(AuthError::ip_address_not_allowed(ctx.peer_addr())); } if !self .endpoint_rate_limiter @@ -100,7 +100,7 @@ impl PoolingBackend { #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] pub async fn connect_to_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, conn_info: ConnInfo, keys: ComputeCredentials, force_new: bool, @@ -222,7 +222,7 @@ impl ConnectMechanism for TokioMechanism { async fn connect_once( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, node_info: &CachedNodeInfo, timeout: Duration, ) -> Result { @@ -240,7 +240,7 @@ impl ConnectMechanism for TokioMechanism { .param("client_encoding", "UTF8") .expect("client encoding UTF8 is always valid"); - let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let res = config.connect(tokio_postgres::NoTls).await; drop(pause); let (client, connection) = permit.release_result(res)?; diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index dbc58d48ec..e1dc44dc1c 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -377,7 +377,7 @@ impl GlobalConnPool { pub fn get( self: &Arc, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, conn_info: &ConnInfo, ) -> Result>, HttpConnError> { let mut client: Option> = None; @@ -409,9 +409,9 @@ impl GlobalConnPool { cold_start_info = ColdStartInfo::HttpPoolHit.as_str(), "pool: reusing connection '{conn_info}'" ); - client.session.send(ctx.session_id)?; + client.session.send(ctx.session_id())?; ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit); - ctx.latency_timer.success(); + ctx.success(); return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool))); } } @@ -465,19 +465,19 @@ impl GlobalConnPool { pub fn poll_client( global_pool: Arc>, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, conn_info: ConnInfo, client: C, mut connection: tokio_postgres::Connection, conn_id: uuid::Uuid, aux: MetricsAuxInfo, ) -> Client { - let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol); - let mut session_id = ctx.session_id; + let conn_gauge = 
Metrics::get().proxy.db_connections.guard(ctx.protocol()); + let mut session_id = ctx.session_id(); let (tx, mut rx) = tokio::sync::watch::channel(session_id); let span = info_span!(parent: None, "connection", %conn_id); - let cold_start_info = ctx.cold_start_info; + let cold_start_info = ctx.cold_start_info(); span.in_scope(|| { info!(cold_start_info = cold_start_info.as_str(), %conn_info, %session_id, "new connection"); }); @@ -766,7 +766,6 @@ mod tests { opt_in: false, max_total_conns: 3, }, - request_timeout: Duration::from_secs(1), cancel_set: CancelSet::new(0), client_conn_threshold: u64::MAX, })); diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 6400e4ac7b..77ec6b1c73 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -144,7 +144,7 @@ impl UserFacingError for ConnInfoError { } fn get_conn_info( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, headers: &HeaderMap, tls: &TlsConfig, ) -> Result { @@ -224,12 +224,12 @@ fn get_conn_info( // TODO: return different http error codes pub async fn handle( config: &'static ProxyConfig, - mut ctx: RequestMonitoring, + ctx: RequestMonitoring, request: Request, backend: Arc, cancel: CancellationToken, ) -> Result>, ApiError> { - let result = handle_inner(cancel, config, &mut ctx, request, backend).await; + let result = handle_inner(cancel, config, &ctx, request, backend).await; let mut response = match result { Ok(r) => { @@ -482,13 +482,16 @@ fn map_isolation_level_to_headers(level: IsolationLevel) -> Option async fn handle_inner( cancel: CancellationToken, config: &'static ProxyConfig, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, request: Request, backend: Arc, ) -> Result>, SqlOverHttpError> { - let _requeset_gauge = Metrics::get().proxy.connection_requests.guard(ctx.protocol); + let _requeset_gauge = Metrics::get() + .proxy + .connection_requests + .guard(ctx.protocol()); info!( - protocol = %ctx.protocol, + protocol = %ctx.protocol(), "handling interactive connection from client" ); @@ -544,7 +547,7 @@ async fn handle_inner( .await?; // not strictly necessary to mark success here, // but it's just insurance for if we forget it somewhere else - ctx.latency_timer.success(); + ctx.success(); Ok::<_, HttpConnError>(client) } .map_err(SqlOverHttpError::from), diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index 0d5b88f07b..4fba4d141c 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -129,7 +129,7 @@ impl AsyncBufRead for WebSocketRw { pub async fn serve_websocket( config: &'static ProxyConfig, - mut ctx: RequestMonitoring, + ctx: RequestMonitoring, websocket: OnUpgrade, cancellation_handler: Arc, endpoint_rate_limiter: Arc, @@ -145,7 +145,7 @@ pub async fn serve_websocket( let res = Box::pin(handle_client( config, - &mut ctx, + &ctx, cancellation_handler, WebSocketRw::new(websocket), ClientMode::Websockets { hostname }, From fc78774f39084c86b160b24765893030a8eaf25c Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 7 Aug 2024 17:50:09 +0300 Subject: [PATCH 1357/1571] fix: EphemeralFiles can outlive their Timeline via `enum LayerManager` (#8229) Ephemeral files cleanup on drop but did not delay shutdown, leading to problems with restarting the tenant. 
The solution is as proposed: - make ephemeral files carry the gate guard to delay `Timeline::gate` closing - flush in-memory layers and strong references to those on `Timeline::shutdown` The above are realized by making LayerManager an `enum` with `Open` and `Closed` variants, and fail requests to modify `LayerMap`. Additionally: - fix too eager anyhow conversions in compaction - unify how we freeze layers and handle errors - optimize likely_resident_layers to read LayerFileManager hashmap values instead of bouncing through LayerMap Fixes: #7830 --- libs/utils/src/sync/gate.rs | 3 +- pageserver/benches/bench_ingest.rs | 6 +- pageserver/src/http/routes.rs | 5 +- pageserver/src/tenant.rs | 38 +- pageserver/src/tenant/ephemeral_file.rs | 45 ++- .../src/tenant/ephemeral_file/page_caching.rs | 10 +- pageserver/src/tenant/layer_map.rs | 4 +- pageserver/src/tenant/mgr.rs | 9 +- .../src/tenant/storage_layer/delta_layer.rs | 4 +- .../tenant/storage_layer/inmemory_layer.rs | 4 +- .../src/tenant/storage_layer/layer/tests.rs | 20 +- pageserver/src/tenant/timeline.rs | 331 +++++++++++------- pageserver/src/tenant/timeline/compaction.rs | 35 +- .../src/tenant/timeline/detach_ancestor.rs | 17 +- .../src/tenant/timeline/eviction_task.rs | 66 ++-- .../src/tenant/timeline/layer_manager.rs | 214 ++++++----- 16 files changed, 505 insertions(+), 306 deletions(-) diff --git a/libs/utils/src/sync/gate.rs b/libs/utils/src/sync/gate.rs index 156b99a010..16ec563fa7 100644 --- a/libs/utils/src/sync/gate.rs +++ b/libs/utils/src/sync/gate.rs @@ -78,8 +78,9 @@ impl Drop for GateGuard { } } -#[derive(Debug)] +#[derive(Debug, thiserror::Error)] pub enum GateError { + #[error("gate is closed")] GateClosed, } diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index af2b6934c6..459394449a 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -61,7 +61,11 @@ async fn ingest( let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); - let layer = InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, &ctx).await?; + let gate = utils::sync::gate::Gate::default(); + let entered = gate.enter().unwrap(); + + let layer = + InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, entered, &ctx).await?; let data = Value::Image(Bytes::from(vec![0u8; put_size])).ser()?; let ctx = RequestContext::new( diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index fdab780bfb..a983d8c4c2 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1162,7 +1162,10 @@ async fn layer_map_info_handler( let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; - let layer_map_info = timeline.layer_map_info(reset).await; + let layer_map_info = timeline + .layer_map_info(reset) + .await + .map_err(|_shutdown| ApiError::ShuttingDown)?; json_response(StatusCode::OK, layer_map_info) } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 989ed0d4eb..2422ab4cf2 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -601,6 +601,12 @@ impl From for GcError { } } +impl From for GcError { + fn from(_: timeline::layer_manager::Shutdown) -> Self { + GcError::TimelineCancelled + } +} + #[derive(thiserror::Error, Debug)] pub(crate) enum LoadConfigError { #[error("TOML deserialization error: '{0}'")] @@ -710,6 +716,7 @@ impl Tenant { .read() .await .layer_map() + .expect("currently loading, layer manager cannot be shutdown 
already") .iter_historic_layers() .next() .is_some(), @@ -4674,10 +4681,10 @@ mod tests { let layer_map = tline.layers.read().await; let level0_deltas = layer_map - .layer_map() - .get_level0_deltas() - .into_iter() - .map(|desc| layer_map.get_from_desc(&desc)) + .layer_map()? + .level0_deltas() + .iter() + .map(|desc| layer_map.get_from_desc(desc)) .collect::>(); assert!(!level0_deltas.is_empty()); @@ -4908,11 +4915,13 @@ mod tests { let inserted = bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?; let guard = tline.layers.read().await; - guard.layer_map().dump(true, &ctx).await?; + let lm = guard.layer_map()?; + + lm.dump(true, &ctx).await?; let mut reads = Vec::new(); let mut prev = None; - guard.layer_map().iter_historic_layers().for_each(|desc| { + lm.iter_historic_layers().for_each(|desc| { if !desc.is_delta() { prev = Some(desc.clone()); return; @@ -5918,23 +5927,12 @@ mod tests { tline.freeze_and_flush().await?; // force create a delta layer } - let before_num_l0_delta_files = tline - .layers - .read() - .await - .layer_map() - .get_level0_deltas() - .len(); + let before_num_l0_delta_files = + tline.layers.read().await.layer_map()?.level0_deltas().len(); tline.compact(&cancel, EnumSet::empty(), &ctx).await?; - let after_num_l0_delta_files = tline - .layers - .read() - .await - .layer_map() - .get_level0_deltas() - .len(); + let after_num_l0_delta_files = tline.layers.read().await.layer_map()?.level0_deltas().len(); assert!(after_num_l0_delta_files < before_num_l0_delta_files, "after_num_l0_delta_files={after_num_l0_delta_files}, before_num_l0_delta_files={before_num_l0_delta_files}"); diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index bb65ae24fc..770f3ca5f0 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -29,6 +29,7 @@ impl EphemeralFile { conf: &PageServerConf, tenant_shard_id: TenantShardId, timeline_id: TimelineId, + gate_guard: utils::sync::gate::GateGuard, ctx: &RequestContext, ) -> Result { static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1); @@ -51,10 +52,12 @@ impl EphemeralFile { ) .await?; + let prewarm = conf.l0_flush.prewarm_on_write(); + Ok(EphemeralFile { _tenant_shard_id: tenant_shard_id, _timeline_id: timeline_id, - rw: page_caching::RW::new(file, conf.l0_flush.prewarm_on_write()), + rw: page_caching::RW::new(file, prewarm, gate_guard), }) } @@ -161,7 +164,11 @@ mod tests { async fn test_ephemeral_blobs() -> Result<(), io::Error> { let (conf, tenant_id, timeline_id, ctx) = harness("ephemeral_blobs")?; - let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &ctx).await?; + let gate = utils::sync::gate::Gate::default(); + + let entered = gate.enter().unwrap(); + + let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, entered, &ctx).await?; let pos_foo = file.write_blob(b"foo", &ctx).await?; assert_eq!( @@ -215,4 +222,38 @@ mod tests { Ok(()) } + + #[tokio::test] + async fn ephemeral_file_holds_gate_open() { + const FOREVER: std::time::Duration = std::time::Duration::from_secs(5); + + let (conf, tenant_id, timeline_id, ctx) = + harness("ephemeral_file_holds_gate_open").unwrap(); + + let gate = utils::sync::gate::Gate::default(); + + let file = EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx) + .await + .unwrap(); + + let mut closing = tokio::task::spawn(async move { + gate.close().await; + }); + + // gate is entered until the ephemeral file is dropped + // do not start paused 
tokio-epoll-uring has a sleep loop + tokio::time::pause(); + tokio::time::timeout(FOREVER, &mut closing) + .await + .expect_err("closing cannot complete before dropping"); + + // this is a requirement of the reset_tenant functionality: we have to be able to restart a + // tenant fast, and for that, we need all tenant_dir operations be guarded by entering a gate + drop(file); + + tokio::time::timeout(FOREVER, &mut closing) + .await + .expect("closing completes right away") + .expect("closing does not panic"); + } } diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs index 43b9fff28d..0a12b64a7c 100644 --- a/pageserver/src/tenant/ephemeral_file/page_caching.rs +++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs @@ -18,6 +18,8 @@ use super::zero_padded_read_write; pub struct RW { page_cache_file_id: page_cache::FileId, rw: super::zero_padded_read_write::RW, + /// Gate guard is held on as long as we need to do operations in the path (delete on drop). + _gate_guard: utils::sync::gate::GateGuard, } /// When we flush a block to the underlying [`crate::virtual_file::VirtualFile`], @@ -29,7 +31,11 @@ pub enum PrewarmOnWrite { } impl RW { - pub fn new(file: VirtualFile, prewarm_on_write: PrewarmOnWrite) -> Self { + pub fn new( + file: VirtualFile, + prewarm_on_write: PrewarmOnWrite, + _gate_guard: utils::sync::gate::GateGuard, + ) -> Self { let page_cache_file_id = page_cache::next_file_id(); Self { page_cache_file_id, @@ -38,6 +44,7 @@ impl RW { file, prewarm_on_write, )), + _gate_guard, } } @@ -145,6 +152,7 @@ impl Drop for RW { // We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed. // unlink the file + // we are clear to do this, because we have entered a gate let res = std::fs::remove_file(&self.rw.as_writer().file.path); if let Err(e) = res { if e.kind() != std::io::ErrorKind::NotFound { diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index ba9c08f6e7..844f117ea2 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -846,8 +846,8 @@ impl LayerMap { } /// Return all L0 delta layers - pub fn get_level0_deltas(&self) -> Vec> { - self.l0_delta_layers.to_vec() + pub fn level0_deltas(&self) -> &Vec> { + &self.l0_delta_layers } /// debugging function to print out the contents of the layer map diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index b5568d37b5..7901fc3554 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1767,14 +1767,9 @@ impl TenantManager { let parent_timelines = timelines.keys().cloned().collect::>(); for timeline in timelines.values() { tracing::info!(timeline_id=%timeline.timeline_id, "Loading list of layers to hardlink"); - let timeline_layers = timeline - .layers - .read() - .await - .likely_resident_layers() - .collect::>(); + let layers = timeline.layers.read().await; - for layer in timeline_layers { + for layer in layers.likely_resident_layers() { let relative_path = layer .local_path() .strip_prefix(&parent_path) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index bff8f7cb24..f4e965b99a 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -1957,6 +1957,7 @@ pub(crate) mod test { .await .likely_resident_layers() .next() + .cloned() .unwrap(); { @@ -2031,7 +2032,8 @@ pub(crate) mod test { 
.read() .await .likely_resident_layers() - .find(|x| x != &initdb_layer) + .find(|&x| x != &initdb_layer) + .cloned() .unwrap(); // create a copy for the timeline, so we don't overwrite the file diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index f118f3d8d8..57d93feaaf 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -385,11 +385,13 @@ impl InMemoryLayer { timeline_id: TimelineId, tenant_shard_id: TenantShardId, start_lsn: Lsn, + gate_guard: utils::sync::gate::GateGuard, ctx: &RequestContext, ) -> Result { trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"); - let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id, ctx).await?; + let file = + EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate_guard, ctx).await?; let key = InMemoryLayerFileId(file.page_cache_file_id()); Ok(InMemoryLayer { diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index 6b0d5f09ff..bffd2db800 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -39,7 +39,7 @@ async fn smoke_test() { let layer = { let mut layers = { let layers = timeline.layers.read().await; - layers.likely_resident_layers().collect::>() + layers.likely_resident_layers().cloned().collect::>() }; assert_eq!(layers.len(), 1); @@ -176,7 +176,7 @@ async fn smoke_test() { { let layers = &[layer]; let mut g = timeline.layers.write().await; - g.finish_gc_timeline(layers); + g.open_mut().unwrap().finish_gc_timeline(layers); // this just updates the remote_physical_size for demonstration purposes rtc.schedule_gc_update(layers).unwrap(); } @@ -216,7 +216,7 @@ async fn evict_and_wait_on_wanted_deleted() { let layer = { let mut layers = { let layers = timeline.layers.read().await; - layers.likely_resident_layers().collect::>() + layers.likely_resident_layers().cloned().collect::>() }; assert_eq!(layers.len(), 1); @@ -260,7 +260,7 @@ async fn evict_and_wait_on_wanted_deleted() { // the deletion of the layer in remote_storage happens. 
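Editor's note: the `gate_guard` parameter threaded through `EphemeralFile::create` and `InMemoryLayer::create` above, and the `ephemeral_file_holds_gate_open` test, all lean on the gate pattern: holders keep a guard, and `Timeline::shutdown` cannot pass the gate until every guard (and thus every delete-on-drop ephemeral file) is gone. Below is a simplified stand-in built on a tokio semaphore, not the real `utils::sync::gate` implementation; it also needs tokio's `test-util` feature for the paused-clock assertion:

```rust
use std::sync::Arc;
use std::time::Duration;
use tokio::sync::{OwnedSemaphorePermit, Semaphore};

const MAX_GUARDS: u32 = 1 << 20; // comfortably below Semaphore::MAX_PERMITS

struct Gate {
    sem: Arc<Semaphore>,
}

struct GateGuard {
    _permit: OwnedSemaphorePermit,
}

impl Gate {
    fn new() -> Self {
        Gate {
            sem: Arc::new(Semaphore::new(MAX_GUARDS as usize)),
        }
    }

    /// Fails once the gate has been closed.
    fn enter(&self) -> Result<GateGuard, &'static str> {
        let permit = self
            .sem
            .clone()
            .try_acquire_owned()
            .map_err(|_| "gate closed")?;
        Ok(GateGuard { _permit: permit })
    }

    /// Resolves only after every outstanding guard has been dropped.
    async fn close(&self) {
        let _all = self
            .sem
            .acquire_many(MAX_GUARDS)
            .await
            .expect("close called once");
        self.sem.close(); // later enter() calls now fail
    }
}

#[tokio::test]
async fn gate_waits_for_guards() {
    let gate = Gate::new();
    let guard = gate.enter().unwrap(); // stands in for a live EphemeralFile

    tokio::time::pause();
    let close = gate.close();
    tokio::pin!(close);

    // Closing cannot complete while the guard is alive...
    assert!(tokio::time::timeout(Duration::from_secs(5), &mut close)
        .await
        .is_err());

    // ...and completes as soon as the guard is dropped.
    drop(guard);
    close.await;
    assert!(gate.enter().is_err());
}
```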
{ let mut layers = timeline.layers.write().await; - layers.finish_gc_timeline(&[layer]); + layers.open_mut().unwrap().finish_gc_timeline(&[layer]); } SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await; @@ -301,7 +301,7 @@ fn read_wins_pending_eviction() { let layer = { let mut layers = { let layers = timeline.layers.read().await; - layers.likely_resident_layers().collect::>() + layers.likely_resident_layers().cloned().collect::>() }; assert_eq!(layers.len(), 1); @@ -433,7 +433,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) { let layer = { let mut layers = { let layers = timeline.layers.read().await; - layers.likely_resident_layers().collect::>() + layers.likely_resident_layers().cloned().collect::>() }; assert_eq!(layers.len(), 1); @@ -602,7 +602,7 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() { let layer = { let mut layers = { let layers = timeline.layers.read().await; - layers.likely_resident_layers().collect::>() + layers.likely_resident_layers().cloned().collect::>() }; assert_eq!(layers.len(), 1); @@ -682,7 +682,7 @@ async fn evict_and_wait_does_not_wait_for_download() { let layer = { let mut layers = { let layers = timeline.layers.read().await; - layers.likely_resident_layers().collect::>() + layers.likely_resident_layers().cloned().collect::>() }; assert_eq!(layers.len(), 1); @@ -801,9 +801,9 @@ async fn eviction_cancellation_on_drop() { let (evicted_layer, not_evicted) = { let mut layers = { let mut guard = timeline.layers.write().await; - let layers = guard.likely_resident_layers().collect::>(); + let layers = guard.likely_resident_layers().cloned().collect::>(); // remove the layers from layermap - guard.finish_gc_timeline(&layers); + guard.open_mut().unwrap().finish_gc_timeline(&layers); layers }; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index a05e4e0712..8f9ff78fd8 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -527,6 +527,12 @@ pub(crate) enum PageReconstructError { MissingKey(MissingKeyError), } +impl From for PageReconstructError { + fn from(_: layer_manager::Shutdown) -> Self { + PageReconstructError::Cancelled + } +} + impl GetVectoredError { #[cfg(test)] pub(crate) fn is_missing_key_error(&self) -> bool { @@ -534,6 +540,12 @@ impl GetVectoredError { } } +impl From for GetVectoredError { + fn from(_: layer_manager::Shutdown) -> Self { + GetVectoredError::Cancelled + } +} + pub struct MissingKeyError { key: Key, shard: ShardNumber, @@ -597,6 +609,12 @@ pub(crate) enum CreateImageLayersError { Other(#[from] anyhow::Error), } +impl From for CreateImageLayersError { + fn from(_: layer_manager::Shutdown) -> Self { + CreateImageLayersError::Cancelled + } +} + #[derive(thiserror::Error, Debug, Clone)] pub(crate) enum FlushLayerError { /// Timeline cancellation token was cancelled @@ -634,6 +652,12 @@ impl FlushLayerError { } } +impl From for FlushLayerError { + fn from(_: layer_manager::Shutdown) -> Self { + FlushLayerError::Cancelled + } +} + #[derive(thiserror::Error, Debug)] pub(crate) enum GetVectoredError { #[error("timeline shutting down")] @@ -1198,12 +1222,7 @@ impl Timeline { /// Hence, the result **does not represent local filesystem usage**. 
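Editor's note: the batch of `From<layer_manager::Shutdown>` impls above exists so call sites can stay terse: `guard.layer_map()?` and `guard.open_mut()?` convert the shutdown marker straight into each caller's own Cancelled/ShuttingDown variant. A trimmed-down illustration of the shape, assuming the `thiserror` crate; the `LayerManager` here is a toy, not the pageserver type:

```rust
/// Zero-data marker returned by the layer manager once the timeline is gone.
#[derive(Debug)]
struct Shutdown;

#[derive(Debug, thiserror::Error)]
enum PageReconstructError {
    #[error("request was cancelled")]
    Cancelled,
}

impl From<Shutdown> for PageReconstructError {
    fn from(_: Shutdown) -> Self {
        PageReconstructError::Cancelled
    }
}

struct LayerManager {
    open: bool,
}

impl LayerManager {
    fn layer_map(&self) -> Result<&'static str, Shutdown> {
        if self.open {
            Ok("layer map")
        } else {
            Err(Shutdown)
        }
    }
}

fn read_something(lm: &LayerManager) -> Result<(), PageReconstructError> {
    let _map = lm.layer_map()?; // `?` goes through the From impl above
    Ok(())
}

fn main() {
    let closed = LayerManager { open: false };
    assert!(matches!(
        read_something(&closed),
        Err(PageReconstructError::Cancelled)
    ));
}
```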
pub(crate) async fn layer_size_sum(&self) -> u64 { let guard = self.layers.read().await; - let layer_map = guard.layer_map(); - let mut size = 0; - for l in layer_map.iter_historic_layers() { - size += l.file_size; - } - size + guard.layer_size_sum() } pub(crate) fn resident_physical_size(&self) -> u64 { @@ -1370,16 +1389,15 @@ impl Timeline { // This exists to provide a non-span creating version of `freeze_and_flush` we can call without // polluting the span hierarchy. pub(crate) async fn freeze_and_flush0(&self) -> Result<(), FlushLayerError> { - let to_lsn = { + let token = { // Freeze the current open in-memory layer. It will be written to disk on next // iteration. let mut g = self.write_lock.lock().await; let to_lsn = self.get_last_record_lsn(); - self.freeze_inmem_layer_at(to_lsn, &mut g).await; - to_lsn + self.freeze_inmem_layer_at(to_lsn, &mut g).await? }; - self.flush_frozen_layers_and_wait(to_lsn).await + self.wait_flush_completion(token).await } // Check if an open ephemeral layer should be closed: this provides @@ -1393,12 +1411,20 @@ impl Timeline { return; }; + // FIXME: why not early exit? because before #7927 the state would had been cleared every + // time, and this was missed. + // if write_guard.is_none() { return; } + let Ok(layers_guard) = self.layers.try_read() else { // Don't block if the layer lock is busy return; }; - let Some(open_layer) = &layers_guard.layer_map().open_layer else { + let Ok(lm) = layers_guard.layer_map() else { + return; + }; + + let Some(open_layer) = &lm.open_layer else { // If there is no open layer, we have no layer freezing to do. However, we might need to generate // some updates to disk_consistent_lsn and remote_consistent_lsn, in case we ingested some WAL regions // that didn't result in writes to this shard. @@ -1424,9 +1450,16 @@ impl Timeline { ); // The flush loop will update remote consistent LSN as well as disk consistent LSN. - self.flush_frozen_layers_and_wait(last_record_lsn) - .await - .ok(); + // We know there is no open layer, so we can request freezing without actually + // freezing anything. This is true even if we have dropped the layers_guard, we + // still hold the write_guard. + let _ = async { + let token = self + .freeze_inmem_layer_at(last_record_lsn, &mut write_guard) + .await?; + self.wait_flush_completion(token).await + } + .await; } } @@ -1464,33 +1497,26 @@ impl Timeline { self.last_freeze_at.load(), open_layer.get_opened_at(), ) { - let at_lsn = match open_layer.info() { + match open_layer.info() { InMemoryLayerInfo::Frozen { lsn_start, lsn_end } => { // We may reach this point if the layer was already frozen by not yet flushed: flushing // happens asynchronously in the background. tracing::debug!( "Not freezing open layer, it's already frozen ({lsn_start}..{lsn_end})" ); - None } InMemoryLayerInfo::Open { .. 
} => { // Upgrade to a write lock and freeze the layer drop(layers_guard); - let mut layers_guard = self.layers.write().await; - let froze = layers_guard - .try_freeze_in_memory_layer( - current_lsn, - &self.last_freeze_at, - &mut write_guard, - ) + let res = self + .freeze_inmem_layer_at(current_lsn, &mut write_guard) .await; - Some(current_lsn).filter(|_| froze) - } - }; - if let Some(lsn) = at_lsn { - let res: Result = self.flush_frozen_layers(lsn); - if let Err(e) = res { - tracing::info!("failed to flush frozen layer after background freeze: {e:#}"); + + if let Err(e) = res { + tracing::info!( + "failed to flush frozen layer after background freeze: {e:#}" + ); + } } } } @@ -1644,6 +1670,11 @@ impl Timeline { // about corner cases like s3 suddenly hanging up? self.remote_client.shutdown().await; } + Err(FlushLayerError::Cancelled) => { + // this is likely the second shutdown, ignore silently. + // TODO: this can be removed once https://github.com/neondatabase/neon/issues/5080 + debug_assert!(self.cancel.is_cancelled()); + } Err(e) => { // Non-fatal. Shutdown is infallible. Failures to flush just mean that // we have some extra WAL replay to do next time the timeline starts. @@ -1662,6 +1693,7 @@ impl Timeline { // Transition the remote_client into a state where it's only useful for timeline deletion. // (The deletion use case is why we can't just hook up remote_client to Self::cancel).) self.remote_client.stop(); + // As documented in remote_client.stop()'s doc comment, it's our responsibility // to shut down the upload queue tasks. // TODO: fix that, task management should be encapsulated inside remote_client. @@ -1672,10 +1704,17 @@ impl Timeline { ) .await; - // TODO: work toward making this a no-op. See this funciton's doc comment for more context. + // TODO: work toward making this a no-op. See this function's doc comment for more context. tracing::debug!("Waiting for tasks..."); task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), Some(self.timeline_id)).await; + { + // Allow any remaining in-memory layers to do cleanup -- until that, they hold the gate + // open. + let mut write_guard = self.write_lock.lock().await; + self.layers.write().await.shutdown(&mut write_guard); + } + // Finally wait until any gate-holders are complete. 
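Editor's note: `Timeline::shutdown` above now takes the write lock and calls `layers.shutdown(&mut write_guard)`, flipping the layer manager into its closed state so that remaining in-memory layers (and the gate guards they carry) are released and later mutation attempts fail instead of touching a half-dismantled timeline. A toy model of that `Open`/`Closed` enum and the `open_mut()` accessor used throughout the rest of the patch; the real `LayerManager` keeps different data in its closed variant:

```rust
#[derive(Debug)]
struct Shutdown;

struct OpenLayerManager {
    layers: Vec<String>,
}

enum LayerManager {
    Open(OpenLayerManager),
    Closed { last_count: usize },
}

impl LayerManager {
    fn open_mut(&mut self) -> Result<&mut OpenLayerManager, Shutdown> {
        match self {
            LayerManager::Open(open) => Ok(open),
            LayerManager::Closed { .. } => Err(Shutdown),
        }
    }

    /// Called once from shutdown: drop the in-memory layers (and whatever
    /// guards they hold), then refuse all further modification.
    fn shutdown(&mut self) {
        if let LayerManager::Open(open) = self {
            let last_count = open.layers.len();
            *self = LayerManager::Closed { last_count };
        }
    }
}

fn main() {
    let mut lm = LayerManager::Open(OpenLayerManager {
        layers: vec!["delta-L0".to_string()],
    });

    // While open, writers reach the inner state through open_mut().
    lm.open_mut().unwrap().layers.push("delta-L1".to_string());

    lm.shutdown();

    // After shutdown every mutation attempt surfaces the Shutdown marker,
    // which callers convert into their own Cancelled/ShuttingDown errors.
    assert!(lm.open_mut().is_err());
}
```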
// // TODO: once above shutdown_tasks is a no-op, we can close the gate before calling shutdown_tasks @@ -1769,9 +1808,12 @@ impl Timeline { } } - pub(crate) async fn layer_map_info(&self, reset: LayerAccessStatsReset) -> LayerMapInfo { + pub(crate) async fn layer_map_info( + &self, + reset: LayerAccessStatsReset, + ) -> Result { let guard = self.layers.read().await; - let layer_map = guard.layer_map(); + let layer_map = guard.layer_map()?; let mut in_memory_layers = Vec::with_capacity(layer_map.frozen_layers.len() + 1); if let Some(open_layer) = &layer_map.open_layer { in_memory_layers.push(open_layer.info()); @@ -1780,16 +1822,15 @@ impl Timeline { in_memory_layers.push(frozen_layer.info()); } - let mut historic_layers = Vec::new(); - for historic_layer in layer_map.iter_historic_layers() { - let historic_layer = guard.get_from_desc(&historic_layer); - historic_layers.push(historic_layer.info(reset)); - } + let historic_layers = layer_map + .iter_historic_layers() + .map(|desc| guard.get_from_desc(&desc).info(reset)) + .collect(); - LayerMapInfo { + Ok(LayerMapInfo { in_memory_layers, historic_layers, - } + }) } #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))] @@ -1797,7 +1838,7 @@ impl Timeline { &self, layer_file_name: &LayerName, ) -> anyhow::Result> { - let Some(layer) = self.find_layer(layer_file_name).await else { + let Some(layer) = self.find_layer(layer_file_name).await? else { return Ok(None); }; @@ -1818,7 +1859,7 @@ impl Timeline { .enter() .map_err(|_| anyhow::anyhow!("Shutting down"))?; - let Some(local_layer) = self.find_layer(layer_file_name).await else { + let Some(local_layer) = self.find_layer(layer_file_name).await? else { return Ok(None); }; @@ -2304,7 +2345,10 @@ impl Timeline { let mut layers = self.layers.try_write().expect( "in the context where we call this function, no other task has access to the object", ); - layers.initialize_empty(Lsn(start_lsn.0)); + layers + .open_mut() + .expect("in this context the LayerManager must still be open") + .initialize_empty(Lsn(start_lsn.0)); } /// Scan the timeline directory, cleanup, populate the layer map, and schedule uploads for local-only @@ -2436,7 +2480,10 @@ impl Timeline { let num_layers = loaded_layers.len(); - guard.initialize_local_layers(loaded_layers, disk_consistent_lsn + 1); + guard + .open_mut() + .expect("layermanager must be open during init") + .initialize_local_layers(loaded_layers, disk_consistent_lsn + 1); self.remote_client .schedule_layer_file_deletion(&needs_cleanup)?; @@ -2471,7 +2518,7 @@ impl Timeline { // Now that we have the full layer map, we may calculate the visibility of layers within it (a global scan) drop(guard); // drop write lock, update_layer_visibility will take a read lock. - self.update_layer_visibility().await; + self.update_layer_visibility().await?; info!( "loaded layer map with {} layers at {}, total physical size: {}", @@ -2893,16 +2940,17 @@ impl Timeline { } } - async fn find_layer(&self, layer_name: &LayerName) -> Option { + async fn find_layer( + &self, + layer_name: &LayerName, + ) -> Result, layer_manager::Shutdown> { let guard = self.layers.read().await; - for historic_layer in guard.layer_map().iter_historic_layers() { - let historic_layer_name = historic_layer.layer_name(); - if layer_name == &historic_layer_name { - return Some(guard.get_from_desc(&historic_layer)); - } - } - - None + let layer = guard + .layer_map()? 
+ .iter_historic_layers() + .find(|l| &l.layer_name() == layer_name) + .map(|found| guard.get_from_desc(&found)); + Ok(layer) } /// The timeline heatmap is a hint to secondary locations from the primary location, @@ -2953,6 +3001,7 @@ impl Timeline { } impl Timeline { + #[allow(unknown_lints)] // doc_lazy_continuation is still a new lint #[allow(clippy::doc_lazy_continuation)] /// Get the data needed to reconstruct all keys in the provided keyspace /// @@ -3104,7 +3153,7 @@ impl Timeline { // which turns out to be a perf bottleneck in some cases. if !unmapped_keyspace.is_empty() { let guard = timeline.layers.read().await; - let layers = guard.layer_map(); + let layers = guard.layer_map()?; let in_memory_layer = layers.find_in_memory_layer(|l| { let start_lsn = l.get_lsn_range().start; @@ -3256,22 +3305,35 @@ impl Timeline { } } + /// Returns a non-frozen open in-memory layer for ingestion. /// - /// Get a handle to the latest layer for appending. - /// + /// Takes a witness of timeline writer state lock being held, because it makes no sense to call + /// this function without holding the mutex. async fn get_layer_for_write( &self, lsn: Lsn, + _guard: &tokio::sync::MutexGuard<'_, Option>, ctx: &RequestContext, ) -> anyhow::Result> { let mut guard = self.layers.write().await; + let gate_guard = self.gate.enter().context("enter gate for inmem layer")?; + + let last_record_lsn = self.get_last_record_lsn(); + ensure!( + lsn > last_record_lsn, + "cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})", + lsn, + last_record_lsn, + ); + let layer = guard + .open_mut()? .get_layer_for_write( lsn, - self.get_last_record_lsn(), self.conf, self.timeline_id, self.tenant_shard_id, + gate_guard, ctx, ) .await?; @@ -3285,21 +3347,48 @@ impl Timeline { self.last_record_lsn.advance(new_lsn); } + /// Freeze any existing open in-memory layer and unconditionally notify the flush loop. + /// + /// Unconditional flush loop notification is given because in sharded cases we will want to + /// leave an Lsn gap. Unsharded tenants do not have Lsn gaps. async fn freeze_inmem_layer_at( &self, at: Lsn, write_lock: &mut tokio::sync::MutexGuard<'_, Option>, - ) { + ) -> Result { let frozen = { let mut guard = self.layers.write().await; guard + .open_mut()? .try_freeze_in_memory_layer(at, &self.last_freeze_at, write_lock) .await }; + if frozen { let now = Instant::now(); *(self.last_freeze_ts.write().unwrap()) = now; } + + // Increment the flush cycle counter and wake up the flush task. + // Remember the new value, so that when we listen for the flush + // to finish, we know when the flush that we initiated has + // finished, instead of some other flush that was started earlier. + let mut my_flush_request = 0; + + let flush_loop_state = { *self.flush_loop_state.lock().unwrap() }; + if !matches!(flush_loop_state, FlushLoopState::Running { .. }) { + return Err(FlushLayerError::NotRunning(flush_loop_state)); + } + + self.layer_flush_start_tx.send_modify(|(counter, lsn)| { + my_flush_request = *counter + 1; + *counter = my_flush_request; + *lsn = std::cmp::max(at, *lsn); + }); + + assert_ne!(my_flush_request, 0); + + Ok(my_flush_request) } /// Layer flusher task's main loop. 
@@ -3336,7 +3425,11 @@ impl Timeline { let layer_to_flush = { let guard = self.layers.read().await; - guard.layer_map().frozen_layers.front().cloned() + let Ok(lm) = guard.layer_map() else { + info!("dropping out of flush loop for timeline shutdown"); + return; + }; + lm.frozen_layers.front().cloned() // drop 'layers' lock to allow concurrent reads and writes }; let Some(layer_to_flush) = layer_to_flush else { @@ -3393,34 +3486,7 @@ impl Timeline { } } - /// Request the flush loop to write out all frozen layers up to `at_lsn` as Delta L0 files to disk. - /// The caller is responsible for the freezing, e.g., [`Self::freeze_inmem_layer_at`]. - /// - /// `at_lsn` may be higher than the highest LSN of a frozen layer: if this is the - /// case, it means no data will be written between the top of the highest frozen layer and - /// to_lsn, e.g. because this tenant shard has ingested up to to_lsn and not written any data - /// locally for that part of the WAL. - fn flush_frozen_layers(&self, at_lsn: Lsn) -> Result { - // Increment the flush cycle counter and wake up the flush task. - // Remember the new value, so that when we listen for the flush - // to finish, we know when the flush that we initiated has - // finished, instead of some other flush that was started earlier. - let mut my_flush_request = 0; - - let flush_loop_state = { *self.flush_loop_state.lock().unwrap() }; - if !matches!(flush_loop_state, FlushLoopState::Running { .. }) { - return Err(FlushLayerError::NotRunning(flush_loop_state)); - } - - self.layer_flush_start_tx.send_modify(|(counter, lsn)| { - my_flush_request = *counter + 1; - *counter = my_flush_request; - *lsn = std::cmp::max(at_lsn, *lsn); - }); - - Ok(my_flush_request) - } - + /// Waits any flush request created by [`Self::freeze_inmem_layer_at`] to complete. async fn wait_flush_completion(&self, request: u64) -> Result<(), FlushLayerError> { let mut rx = self.layer_flush_done_tx.subscribe(); loop { @@ -3453,11 +3519,6 @@ impl Timeline { } } - async fn flush_frozen_layers_and_wait(&self, at_lsn: Lsn) -> Result<(), FlushLayerError> { - let token = self.flush_frozen_layers(at_lsn)?; - self.wait_flush_completion(token).await - } - /// Flush one frozen in-memory layer to disk, as a new delta layer. /// /// Return value is the last lsn (inclusive) of the layer that was frozen. @@ -3594,11 +3655,11 @@ impl Timeline { { let mut guard = self.layers.write().await; - if self.cancel.is_cancelled() { - return Err(FlushLayerError::Cancelled); - } - - guard.finish_flush_l0_layer(delta_layer_to_add.as_ref(), &frozen_layer, &self.metrics); + guard.open_mut()?.finish_flush_l0_layer( + delta_layer_to_add.as_ref(), + &frozen_layer, + &self.metrics, + ); if self.set_disk_consistent_lsn(disk_consistent_lsn) { // Schedule remote uploads that will reflect our new disk_consistent_lsn @@ -3806,7 +3867,9 @@ impl Timeline { let threshold = self.get_image_creation_threshold(); let guard = self.layers.read().await; - let layers = guard.layer_map(); + let Ok(layers) = guard.layer_map() else { + return false; + }; let mut max_deltas = 0; for part_range in &partition.ranges { @@ -4214,13 +4277,16 @@ impl Timeline { let mut guard = self.layers.write().await; // FIXME: we could add the images to be uploaded *before* returning from here, but right - // now they are being scheduled outside of write lock - guard.track_new_image_layers(&image_layers, &self.metrics); + // now they are being scheduled outside of write lock; current way is inconsistent with + // compaction lock order. + guard + .open_mut()? 
+ .track_new_image_layers(&image_layers, &self.metrics); drop_wlock(guard); timer.stop_and_record(); // Creating image layers may have caused some previously visible layers to be covered - self.update_layer_visibility().await; + self.update_layer_visibility().await?; Ok(image_layers) } @@ -4379,6 +4445,12 @@ impl CompactionError { } } +impl From for CompactionError { + fn from(_: layer_manager::Shutdown) -> Self { + CompactionError::ShuttingDown + } +} + #[serde_as] #[derive(serde::Serialize)] struct RecordedDuration(#[serde_as(as = "serde_with::DurationMicroSeconds")] Duration); @@ -4484,11 +4556,14 @@ impl Timeline { .collect(); if !new_images.is_empty() { - guard.track_new_image_layers(new_images, &self.metrics); + guard + .open_mut()? + .track_new_image_layers(new_images, &self.metrics); } - // deletion will happen later, the layer file manager calls garbage_collect_on_drop - guard.finish_compact_l0(&remove_layers, &insert_layers, &self.metrics); + guard + .open_mut()? + .finish_compact_l0(&remove_layers, &insert_layers, &self.metrics); self.remote_client .schedule_compaction_update(&remove_layers, new_deltas)?; @@ -4502,7 +4577,7 @@ impl Timeline { self: &Arc, mut replace_layers: Vec<(Layer, ResidentLayer)>, mut drop_layers: Vec, - ) -> Result<(), super::upload_queue::NotInitialized> { + ) -> Result<(), CompactionError> { let mut guard = self.layers.write().await; // Trim our lists in case our caller (compaction) raced with someone else (GC) removing layers: we want @@ -4510,7 +4585,9 @@ impl Timeline { replace_layers.retain(|(l, _)| guard.contains(l)); drop_layers.retain(|l| guard.contains(l)); - guard.rewrite_layers(&replace_layers, &drop_layers, &self.metrics); + guard + .open_mut()? + .rewrite_layers(&replace_layers, &drop_layers, &self.metrics); let upload_layers: Vec<_> = replace_layers.into_iter().map(|r| r.1).collect(); @@ -4799,7 +4876,7 @@ impl Timeline { // // TODO holding a write lock is too agressive and avoidable let mut guard = self.layers.write().await; - let layers = guard.layer_map(); + let layers = guard.layer_map()?; 'outer: for l in layers.iter_historic_layers() { result.layers_total += 1; @@ -4927,7 +5004,7 @@ impl Timeline { } })?; - guard.finish_gc_timeline(&gc_layers); + guard.open_mut()?.finish_gc_timeline(&gc_layers); #[cfg(feature = "testing")] { @@ -5083,9 +5160,13 @@ impl Timeline { let remaining = { let guard = self.layers.read().await; - guard - .layer_map() - .iter_historic_layers() + let Ok(lm) = guard.layer_map() else { + // technically here we could look into iterating accessible layers, but downloading + // all layers of a shutdown timeline makes no sense regardless. 
+ tracing::info!("attempted to download all layers of shutdown timeline"); + return; + }; + lm.iter_historic_layers() .map(|desc| guard.get_from_desc(&desc)) .collect::>() }; @@ -5195,7 +5276,7 @@ impl Timeline { let last_activity_ts = layer.latest_activity(); EvictionCandidate { - layer: layer.into(), + layer: layer.to_owned().into(), last_activity_ts, relative_last_activity: finite_f32::FiniteF32::ZERO, } @@ -5280,7 +5361,7 @@ impl Timeline { { let mut guard = self.layers.write().await; - guard.force_insert_layer(image_layer); + guard.open_mut().unwrap().force_insert_layer(image_layer); } Ok(()) @@ -5324,7 +5405,7 @@ impl Timeline { } let guard = self.layers.read().await; - for layer in guard.layer_map().iter_historic_layers() { + for layer in guard.layer_map()?.iter_historic_layers() { if layer.is_delta() && overlaps_with(&layer.lsn_range, &deltas.lsn_range) && layer.lsn_range != deltas.lsn_range @@ -5354,7 +5435,7 @@ impl Timeline { { let mut guard = self.layers.write().await; - guard.force_insert_layer(delta_layer); + guard.open_mut().unwrap().force_insert_layer(delta_layer); } Ok(()) @@ -5369,7 +5450,7 @@ impl Timeline { ) -> anyhow::Result> { let mut all_data = Vec::new(); let guard = self.layers.read().await; - for layer in guard.layer_map().iter_historic_layers() { + for layer in guard.layer_map()?.iter_historic_layers() { if !layer.is_delta() && layer.image_layer_lsn() == lsn { let layer = guard.get_from_desc(&layer); let mut reconstruct_data = ValuesReconstructState::default(); @@ -5397,7 +5478,7 @@ impl Timeline { ) -> anyhow::Result> { let mut layers = Vec::new(); let guard = self.layers.read().await; - for layer in guard.layer_map().iter_historic_layers() { + for layer in guard.layer_map()?.iter_historic_layers() { layers.push(layer.key()); } Ok(layers) @@ -5414,7 +5495,7 @@ impl Timeline { /// Tracking writes ingestion does to a particular in-memory layer. /// /// Cleared upon freezing a layer. 
-struct TimelineWriterState { +pub(crate) struct TimelineWriterState { open_layer: Arc, current_size: u64, // Previous Lsn which passed through @@ -5522,7 +5603,10 @@ impl<'a> TimelineWriter<'a> { } async fn open_layer(&mut self, at: Lsn, ctx: &RequestContext) -> anyhow::Result<()> { - let layer = self.tl.get_layer_for_write(at, ctx).await?; + let layer = self + .tl + .get_layer_for_write(at, &self.write_guard, ctx) + .await?; let initial_size = layer.size().await?; let last_freeze_at = self.last_freeze_at.load(); @@ -5535,15 +5619,15 @@ impl<'a> TimelineWriter<'a> { Ok(()) } - async fn roll_layer(&mut self, freeze_at: Lsn) -> anyhow::Result<()> { + async fn roll_layer(&mut self, freeze_at: Lsn) -> Result<(), FlushLayerError> { let current_size = self.write_guard.as_ref().unwrap().current_size; // self.write_guard will be taken by the freezing self.tl .freeze_inmem_layer_at(freeze_at, &mut self.write_guard) - .await; + .await?; - self.tl.flush_frozen_layers(freeze_at)?; + assert!(self.write_guard.is_none()); if current_size >= self.get_checkpoint_distance() * 2 { warn!("Flushed oversized open layer with size {}", current_size) @@ -5708,6 +5792,7 @@ mod tests { let layers = timeline.layers.read().await; let desc = layers .layer_map() + .unwrap() .iter_historic_layers() .next() .expect("must find one layer to evict"); diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 276d7b4967..87ec46c0b5 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -371,7 +371,7 @@ impl Timeline { ); let layers = self.layers.read().await; - for layer_desc in layers.layer_map().iter_historic_layers() { + for layer_desc in layers.layer_map()?.iter_historic_layers() { let layer = layers.get_from_desc(&layer_desc); if layer.metadata().shard.shard_count == self.shard_identity.count { // This layer does not belong to a historic ancestor, no need to re-image it. @@ -549,7 +549,9 @@ impl Timeline { /// /// The result may be used as an input to eviction and secondary downloads to de-prioritize layers /// that we know won't be needed for reads. - pub(super) async fn update_layer_visibility(&self) { + pub(super) async fn update_layer_visibility( + &self, + ) -> Result<(), super::layer_manager::Shutdown> { let head_lsn = self.get_last_record_lsn(); // We will sweep through layers in reverse-LSN order. We only do historic layers. L0 deltas @@ -557,7 +559,7 @@ impl Timeline { // Note that L0 deltas _can_ be covered by image layers, but we consider them 'visible' because we anticipate that // they will be subject to L0->L1 compaction in the near future. let layer_manager = self.layers.read().await; - let layer_map = layer_manager.layer_map(); + let layer_map = layer_manager.layer_map()?; let readable_points = { let children = self.gc_info.read().unwrap().retain_lsns.clone(); @@ -580,6 +582,7 @@ impl Timeline { // TODO: publish our covered KeySpace to our parent, so that when they update their visibility, they can // avoid assuming that everything at a branch point is visible. 
drop(covered); + Ok(()) } /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as @@ -633,12 +636,8 @@ impl Timeline { ) -> Result { stats.read_lock_held_spawn_blocking_startup_micros = stats.read_lock_acquisition_micros.till_now(); // set by caller - let layers = guard.layer_map(); - let level0_deltas = layers.get_level0_deltas(); - let mut level0_deltas = level0_deltas - .into_iter() - .map(|x| guard.get_from_desc(&x)) - .collect_vec(); + let layers = guard.layer_map()?; + let level0_deltas = layers.level0_deltas(); stats.level0_deltas_count = Some(level0_deltas.len()); // Only compact if enough layers have accumulated. @@ -651,6 +650,11 @@ impl Timeline { return Ok(CompactLevel0Phase1Result::default()); } + let mut level0_deltas = level0_deltas + .iter() + .map(|x| guard.get_from_desc(x)) + .collect::>(); + // Gather the files to compact in this iteration. // // Start with the oldest Level 0 delta file, and collect any other @@ -1407,10 +1411,9 @@ impl Timeline { // Find the top of the historical layers let end_lsn = { let guard = self.layers.read().await; - let layers = guard.layer_map(); + let layers = guard.layer_map()?; - let l0_deltas = layers.get_level0_deltas(); - drop(guard); + let l0_deltas = layers.level0_deltas(); // As an optimization, if we find that there are too few L0 layers, // bail out early. We know that the compaction algorithm would do @@ -1782,7 +1785,7 @@ impl Timeline { // 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection. let (layer_selection, gc_cutoff, retain_lsns_below_horizon) = { let guard = self.layers.read().await; - let layers = guard.layer_map(); + let layers = guard.layer_map()?; let gc_info = self.gc_info.read().unwrap(); let mut retain_lsns_below_horizon = Vec::new(); let gc_cutoff = gc_info.cutoffs.select_min(); @@ -2216,7 +2219,9 @@ impl Timeline { // Step 3: Place back to the layer map. { let mut guard = self.layers.write().await; - guard.finish_gc_compaction(&layer_selection, &compact_to, &self.metrics) + guard + .open_mut()? + .finish_gc_compaction(&layer_selection, &compact_to, &self.metrics) }; self.remote_client .schedule_compaction_update(&layer_selection, &compact_to)?; @@ -2296,7 +2301,7 @@ impl CompactionJobExecutor for TimelineAdaptor { self.flush_updates().await?; let guard = self.timeline.layers.read().await; - let layer_map = guard.layer_map(); + let layer_map = guard.layer_map()?; let result = layer_map .iter_historic_layers() diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 645b5ad2bf..7f63b53e86 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -74,6 +74,11 @@ impl From for Error { Error::ShuttingDown } } +impl From for Error { + fn from(_: super::layer_manager::Shutdown) -> Self { + Error::ShuttingDown + } +} impl From for Error { fn from(value: FlushLayerError) -> Self { @@ -277,7 +282,7 @@ pub(super) async fn prepare( // between retries, these can change if compaction or gc ran in between. this will mean // we have to redo work. - partition_work(ancestor_lsn, &layers) + partition_work(ancestor_lsn, &layers)? 
}; // TODO: layers are already sorted by something: use that to determine how much of remote @@ -383,14 +388,14 @@ pub(super) async fn prepare( fn partition_work( ancestor_lsn: Lsn, - source_layermap: &LayerManager, -) -> (usize, Vec, Vec) { + source: &LayerManager, +) -> Result<(usize, Vec, Vec), Error> { let mut straddling_branchpoint = vec![]; let mut rest_of_historic = vec![]; let mut later_by_lsn = 0; - for desc in source_layermap.layer_map().iter_historic_layers() { + for desc in source.layer_map()?.iter_historic_layers() { // off by one chances here: // - start is inclusive // - end is exclusive @@ -409,10 +414,10 @@ fn partition_work( &mut rest_of_historic }; - target.push(source_layermap.get_from_desc(&desc)); + target.push(source.get_from_desc(&desc)); } - (later_by_lsn, straddling_branchpoint, rest_of_historic) + Ok((later_by_lsn, straddling_branchpoint, rest_of_historic)) } async fn upload_rewritten_layer( diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 1ba1bf9de5..07d860eb80 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -213,51 +213,45 @@ impl Timeline { let mut js = tokio::task::JoinSet::new(); { let guard = self.layers.read().await; - let layers = guard.layer_map(); - for layer in layers.iter_historic_layers() { - let layer = guard.get_from_desc(&layer); - // guard against eviction while we inspect it; it might be that eviction_task and - // disk_usage_eviction_task both select the same layers to be evicted, and - // seemingly free up double the space. both succeeding is of no consequence. + guard + .likely_resident_layers() + .filter(|layer| { + let last_activity_ts = layer.latest_activity(); - if !layer.is_likely_resident() { - continue; - } + let no_activity_for = match now.duration_since(last_activity_ts) { + Ok(d) => d, + Err(_e) => { + // We reach here if `now` < `last_activity_ts`, which can legitimately + // happen if there is an access between us getting `now`, and us getting + // the access stats from the layer. + // + // The other reason why it can happen is system clock skew because + // SystemTime::now() is not monotonic, so, even if there is no access + // to the layer after we get `now` at the beginning of this function, + // it could be that `now` < `last_activity_ts`. + // + // To distinguish the cases, we would need to record `Instant`s in the + // access stats (i.e., monotonic timestamps), but then, the timestamps + // values in the access stats would need to be `Instant`'s, and hence + // they would be meaningless outside of the pageserver process. + // At the time of writing, the trade-off is that access stats are more + // valuable than detecting clock skew. + return false; + } + }; - let last_activity_ts = layer.latest_activity(); - - let no_activity_for = match now.duration_since(last_activity_ts) { - Ok(d) => d, - Err(_e) => { - // We reach here if `now` < `last_activity_ts`, which can legitimately - // happen if there is an access between us getting `now`, and us getting - // the access stats from the layer. - // - // The other reason why it can happen is system clock skew because - // SystemTime::now() is not monotonic, so, even if there is no access - // to the layer after we get `now` at the beginning of this function, - // it could be that `now` < `last_activity_ts`. 
- // - // To distinguish the cases, we would need to record `Instant`s in the - // access stats (i.e., monotonic timestamps), but then, the timestamps - // values in the access stats would need to be `Instant`'s, and hence - // they would be meaningless outside of the pageserver process. - // At the time of writing, the trade-off is that access stats are more - // valuable than detecting clock skew. - continue; - } - }; - - if no_activity_for > p.threshold { + no_activity_for > p.threshold + }) + .cloned() + .for_each(|layer| { js.spawn(async move { layer .evict_and_wait(std::time::Duration::from_secs(5)) .await }); stats.candidates += 1; - } - } + }); }; let join_all = async move { diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index e6e7bc2e77..8f20d84401 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -1,4 +1,4 @@ -use anyhow::{bail, ensure, Context, Result}; +use anyhow::{bail, ensure, Context}; use itertools::Itertools; use pageserver_api::shard::TenantShardId; use std::{collections::HashMap, sync::Arc}; @@ -24,39 +24,142 @@ use crate::{ use super::TimelineWriterState; /// Provides semantic APIs to manipulate the layer map. -#[derive(Default)] -pub(crate) struct LayerManager { - layer_map: LayerMap, - layer_fmgr: LayerFileManager, +pub(crate) enum LayerManager { + /// Open as in not shutdown layer manager; we still have in-memory layers and we can manipulate + /// the layers. + Open(OpenLayerManager), + /// Shutdown layer manager where there are no more in-memory layers and persistent layers are + /// read-only. + Closed { + layers: HashMap, + }, +} + +impl Default for LayerManager { + fn default() -> Self { + LayerManager::Open(OpenLayerManager::default()) + } } impl LayerManager { - pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer { - self.layer_fmgr.get_from_desc(desc) + pub(crate) fn get_from_key(&self, key: &PersistentLayerKey) -> Layer { + // The assumption for the `expect()` is that all code maintains the following invariant: + // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor. + self.layers() + .get(key) + .with_context(|| format!("get layer from key: {key}")) + .expect("not found") + .clone() } - pub(crate) fn get_from_key(&self, desc: &PersistentLayerKey) -> Layer { - self.layer_fmgr.get_from_key(desc) + pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer { + self.get_from_key(&desc.key()) } /// Get an immutable reference to the layer map. /// /// We expect users only to be able to get an immutable layer map. If users want to make modifications, /// they should use the below semantic APIs. This design makes us step closer to immutable storage state. - pub(crate) fn layer_map(&self) -> &LayerMap { - &self.layer_map + pub(crate) fn layer_map(&self) -> Result<&LayerMap, Shutdown> { + use LayerManager::*; + match self { + Open(OpenLayerManager { layer_map, .. }) => Ok(layer_map), + Closed { .. } => Err(Shutdown), + } } + pub(crate) fn open_mut(&mut self) -> Result<&mut OpenLayerManager, Shutdown> { + use LayerManager::*; + + match self { + Open(open) => Ok(open), + Closed { .. } => Err(Shutdown), + } + } + + /// LayerManager shutdown. The in-memory layers do cleanup on drop, so we must drop them in + /// order to allow shutdown to complete. + /// + /// If there was a want to flush in-memory layers, it must have happened earlier. 
+ pub(crate) fn shutdown(&mut self, writer_state: &mut Option) { + use LayerManager::*; + match self { + Open(OpenLayerManager { + layer_map, + layer_fmgr: LayerFileManager(hashmap), + }) => { + let open = layer_map.open_layer.take(); + let frozen = layer_map.frozen_layers.len(); + let taken_writer_state = writer_state.take(); + tracing::info!(open = open.is_some(), frozen, "dropped inmemory layers"); + let layers = std::mem::take(hashmap); + *self = Closed { layers }; + assert_eq!(open.is_some(), taken_writer_state.is_some()); + } + Closed { .. } => { + tracing::debug!("ignoring multiple shutdowns on layer manager") + } + } + } + + /// Sum up the historic layer sizes + pub(crate) fn layer_size_sum(&self) -> u64 { + self.layers() + .values() + .map(|l| l.layer_desc().file_size) + .sum() + } + + pub(crate) fn likely_resident_layers(&self) -> impl Iterator + '_ { + self.layers().values().filter(|l| l.is_likely_resident()) + } + + pub(crate) fn contains(&self, layer: &Layer) -> bool { + self.contains_key(&layer.layer_desc().key()) + } + + pub(crate) fn contains_key(&self, key: &PersistentLayerKey) -> bool { + self.layers().contains_key(key) + } + + pub(crate) fn all_persistent_layers(&self) -> Vec { + self.layers().keys().cloned().collect_vec() + } + + fn layers(&self) -> &HashMap { + use LayerManager::*; + match self { + Open(OpenLayerManager { layer_fmgr, .. }) => &layer_fmgr.0, + Closed { layers } => layers, + } + } +} + +#[derive(Default)] +pub(crate) struct OpenLayerManager { + layer_map: LayerMap, + layer_fmgr: LayerFileManager, +} + +impl std::fmt::Debug for OpenLayerManager { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("OpenLayerManager") + .field("layer_count", &self.layer_fmgr.0.len()) + .finish() + } +} + +#[derive(Debug, thiserror::Error)] +#[error("layer manager has been shutdown")] +pub(crate) struct Shutdown; + +impl OpenLayerManager { /// Called from `load_layer_map`. Initialize the layer manager with: /// 1. all on-disk layers /// 2. next open layer (with disk disk_consistent_lsn LSN) - pub(crate) fn initialize_local_layers( - &mut self, - on_disk_layers: Vec, - next_open_layer_at: Lsn, - ) { + pub(crate) fn initialize_local_layers(&mut self, layers: Vec, next_open_layer_at: Lsn) { let mut updates = self.layer_map.batch_update(); - for layer in on_disk_layers { + for layer in layers { Self::insert_historic_layer(layer, &mut updates, &mut self.layer_fmgr); } updates.flush(); @@ -68,26 +171,19 @@ impl LayerManager { self.layer_map.next_open_layer_at = Some(next_open_layer_at); } - /// Open a new writable layer to append data if there is no open layer, otherwise return the current open layer, - /// called within `get_layer_for_write`. + /// Open a new writable layer to append data if there is no open layer, otherwise return the + /// current open layer, called within `get_layer_for_write`. pub(crate) async fn get_layer_for_write( &mut self, lsn: Lsn, - last_record_lsn: Lsn, conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, + gate_guard: utils::sync::gate::GateGuard, ctx: &RequestContext, - ) -> Result> { + ) -> anyhow::Result> { ensure!(lsn.is_aligned()); - ensure!( - lsn > last_record_lsn, - "cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})", - lsn, - last_record_lsn, - ); - // Do we have a layer open for writing already? 
let layer = if let Some(open_layer) = &self.layer_map.open_layer { if open_layer.get_lsn_range().start > lsn { @@ -113,8 +209,15 @@ impl LayerManager { lsn ); - let new_layer = - InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn, ctx).await?; + let new_layer = InMemoryLayer::create( + conf, + timeline_id, + tenant_shard_id, + start_lsn, + gate_guard, + ctx, + ) + .await?; let layer = Arc::new(new_layer); self.layer_map.open_layer = Some(layer.clone()); @@ -168,7 +271,7 @@ impl LayerManager { froze } - /// Add image layers to the layer map, called from `create_image_layers`. + /// Add image layers to the layer map, called from [`super::Timeline::create_image_layers`]. pub(crate) fn track_new_image_layers( &mut self, image_layers: &[ResidentLayer], @@ -241,7 +344,7 @@ impl LayerManager { self.finish_compact_l0(compact_from, compact_to, metrics) } - /// Called when compaction is completed. + /// Called post-compaction when some previous generation image layers were trimmed. pub(crate) fn rewrite_layers( &mut self, rewrite_layers: &[(Layer, ResidentLayer)], @@ -330,31 +433,6 @@ impl LayerManager { mapping.remove(layer); layer.delete_on_drop(); } - - pub(crate) fn likely_resident_layers(&self) -> impl Iterator + '_ { - // for small layer maps, we most likely have all resident, but for larger more are likely - // to be evicted assuming lots of layers correlated with longer lifespan. - - self.layer_map().iter_historic_layers().filter_map(|desc| { - self.layer_fmgr - .0 - .get(&desc.key()) - .filter(|l| l.is_likely_resident()) - .cloned() - }) - } - - pub(crate) fn contains(&self, layer: &Layer) -> bool { - self.layer_fmgr.contains(layer) - } - - pub(crate) fn contains_key(&self, key: &PersistentLayerKey) -> bool { - self.layer_fmgr.contains_key(key) - } - - pub(crate) fn all_persistent_layers(&self) -> Vec { - self.layer_fmgr.0.keys().cloned().collect_vec() - } } pub(crate) struct LayerFileManager(HashMap); @@ -366,24 +444,6 @@ impl Default for LayerFileManager { } impl LayerFileManager { - fn get_from_key(&self, key: &PersistentLayerKey) -> T { - // The assumption for the `expect()` is that all code maintains the following invariant: - // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor. - self.0 - .get(key) - .with_context(|| format!("get layer from key: {}", key)) - .expect("not found") - .clone() - } - - fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T { - self.get_from_key(&desc.key()) - } - - fn contains_key(&self, key: &PersistentLayerKey) -> bool { - self.0.contains_key(key) - } - pub(crate) fn insert(&mut self, layer: T) { let present = self.0.insert(layer.layer_desc().key(), layer.clone()); if present.is_some() && cfg!(debug_assertions) { @@ -391,10 +451,6 @@ impl LayerFileManager { } } - pub(crate) fn contains(&self, layer: &T) -> bool { - self.0.contains_key(&layer.layer_desc().key()) - } - pub(crate) fn remove(&mut self, layer: &T) { let present = self.0.remove(&layer.layer_desc().key()); if present.is_none() && cfg!(debug_assertions) { From b3eea45277bd54f1437fdc313277bff1afa35673 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." Date: Wed, 7 Aug 2024 23:37:46 +0800 Subject: [PATCH 1358/1571] fix(pageserver): dump the key when it's invalid (#8633) We see an assertion error in staging. Dump the key to guess where it was from, and then we can fix it. 
Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/key.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 0acd83753e..3af3f74e9c 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -107,7 +107,10 @@ impl Key { /// As long as Neon does not support tablespace (because of lack of access to local file system), /// we can assume that only some predefined namespace OIDs are used which can fit in u16 pub fn to_i128(&self) -> i128 { - assert!(self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222); + assert!( + self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222, + "invalid key: {self}", + ); (((self.field1 & 0x7F) as i128) << 120) | (((self.field2 & 0xFFFF) as i128) << 104) | ((self.field3 as i128) << 72) From a81fab48261aecbd386f57989a5e0af0f58b2030 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 7 Aug 2024 19:19:00 +0300 Subject: [PATCH 1359/1571] refactor(timeline_detach_ancestor): replace ordered reparented with a hashset (#8629) Earlier I was thinking we'd need a (ancestor_lsn, timeline_id) ordered list of reparented. Turns out we did not need it at all. Replace it with an unordered hashset. Additionally refactor the reparented direct children query out, it will later be used from more places. Split off from #8430. Cc: #6994 --- .../src/models/detach_ancestor.rs | 4 +- pageserver/src/tenant/mgr.rs | 5 +- pageserver/src/tenant/timeline.rs | 6 +- .../src/tenant/timeline/detach_ancestor.rs | 121 ++++++++++-------- storage_controller/src/service.rs | 2 +- test_runner/fixtures/pageserver/http.py | 4 +- .../regress/test_timeline_detach_ancestor.py | 6 +- 7 files changed, 78 insertions(+), 70 deletions(-) diff --git a/libs/pageserver_api/src/models/detach_ancestor.rs b/libs/pageserver_api/src/models/detach_ancestor.rs index ae5a21bab9..ad74d343ae 100644 --- a/libs/pageserver_api/src/models/detach_ancestor.rs +++ b/libs/pageserver_api/src/models/detach_ancestor.rs @@ -1,6 +1,8 @@ +use std::collections::HashSet; + use utils::id::TimelineId; #[derive(Debug, Default, PartialEq, serde::Serialize, serde::Deserialize)] pub struct AncestorDetached { - pub reparented_timelines: Vec, + pub reparented_timelines: HashSet, } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 7901fc3554..3f592f167e 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -13,7 +13,7 @@ use pageserver_api::upcall_api::ReAttachResponseTenant; use rand::{distributions::Alphanumeric, Rng}; use std::borrow::Cow; use std::cmp::Ordering; -use std::collections::{BTreeMap, HashMap}; +use std::collections::{BTreeMap, HashMap, HashSet}; use std::ops::Deref; use std::sync::Arc; use std::time::Duration; @@ -1966,7 +1966,8 @@ impl TenantManager { timeline_id: TimelineId, prepared: PreparedTimelineDetach, ctx: &RequestContext, - ) -> Result, anyhow::Error> { + ) -> Result, anyhow::Error> { + // FIXME: this is unnecessary, slotguard already has these semantics struct RevertOnDropSlot(Option); impl Drop for RevertOnDropSlot { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 8f9ff78fd8..76dcb5645f 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3286,10 +3286,6 @@ impl Timeline { Ok(ancestor.clone()) } - pub(crate) fn get_ancestor_timeline(&self) -> Option> { - self.ancestor_timeline.clone() - } - pub(crate) fn 
get_shard_identity(&self) -> &ShardIdentity { &self.shard_identity } @@ -4366,7 +4362,7 @@ impl Timeline { tenant: &crate::tenant::Tenant, prepared: detach_ancestor::PreparedTimelineDetach, ctx: &RequestContext, - ) -> Result, anyhow::Error> { + ) -> Result, anyhow::Error> { detach_ancestor::complete(self, tenant, prepared, ctx).await } diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 7f63b53e86..3b52adc77b 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -1,4 +1,4 @@ -use std::sync::Arc; +use std::{collections::HashSet, sync::Arc}; use super::{layer_manager::LayerManager, FlushLayerError, Timeline}; use crate::{ @@ -146,50 +146,9 @@ pub(super) async fn prepare( } } - // detached has previously been detached; let's inspect each of the current timelines and - // report back the timelines which have been reparented by our detach - let mut all_direct_children = tenant - .timelines - .lock() - .unwrap() - .values() - .filter(|tl| matches!(tl.ancestor_timeline.as_ref(), Some(ancestor) if Arc::ptr_eq(ancestor, detached))) - .map(|tl| (tl.ancestor_lsn, tl.clone())) - .collect::>(); - - let mut any_shutdown = false; - - all_direct_children.retain( - |(_, tl)| match tl.remote_client.initialized_upload_queue() { - Ok(accessor) => accessor - .latest_uploaded_index_part() - .lineage - .is_reparented(), - Err(_shutdownalike) => { - // not 100% a shutdown, but let's bail early not to give inconsistent results in - // sharded enviroment. - any_shutdown = true; - true - } - }, - ); - - if any_shutdown { - // it could be one or many being deleted; have client retry - return Err(Error::ShuttingDown); - } - - let mut reparented = all_direct_children; - // why this instead of hashset? there is a reason, but I've forgotten it many times. - // - // maybe if this was a hashset we would not be able to distinguish some race condition. - reparented.sort_unstable_by_key(|(lsn, tl)| (*lsn, tl.timeline_id)); - + let reparented_timelines = reparented_direct_children(detached, tenant)?; return Ok(Progress::Done(AncestorDetached { - reparented_timelines: reparented - .into_iter() - .map(|(_, tl)| tl.timeline_id) - .collect(), + reparented_timelines, })); }; @@ -386,6 +345,57 @@ pub(super) async fn prepare( Ok(Progress::Prepared(guard, prepared)) } +fn reparented_direct_children( + detached: &Arc, + tenant: &Tenant, +) -> Result, Error> { + let mut all_direct_children = tenant + .timelines + .lock() + .unwrap() + .values() + .filter_map(|tl| { + let is_direct_child = matches!(tl.ancestor_timeline.as_ref(), Some(ancestor) if Arc::ptr_eq(ancestor, detached)); + + if is_direct_child { + Some(tl.clone()) + } else { + if let Some(timeline) = tl.ancestor_timeline.as_ref() { + assert_ne!(timeline.timeline_id, detached.timeline_id, "we cannot have two timelines with the same timeline_id live"); + } + None + } + }) + // Collect to avoid lock taking order problem with Tenant::timelines and + // Timeline::remote_client + .collect::>(); + + let mut any_shutdown = false; + + all_direct_children.retain(|tl| match tl.remote_client.initialized_upload_queue() { + Ok(accessor) => accessor + .latest_uploaded_index_part() + .lineage + .is_reparented(), + Err(_shutdownalike) => { + // not 100% a shutdown, but let's bail early not to give inconsistent results in + // sharded enviroment. 
+ any_shutdown = true; + true + } + }); + + if any_shutdown { + // it could be one or many being deleted; have client retry + return Err(Error::ShuttingDown); + } + + Ok(all_direct_children + .into_iter() + .map(|tl| tl.timeline_id) + .collect()) +} + fn partition_work( ancestor_lsn: Lsn, source: &LayerManager, @@ -544,11 +554,12 @@ pub(super) async fn complete( tenant: &Tenant, prepared: PreparedTimelineDetach, _ctx: &RequestContext, -) -> Result, anyhow::Error> { +) -> Result, anyhow::Error> { let PreparedTimelineDetach { layers } = prepared; let ancestor = detached - .get_ancestor_timeline() + .ancestor_timeline + .as_ref() .expect("must still have a ancestor"); let ancestor_lsn = detached.get_ancestor_lsn(); @@ -588,7 +599,7 @@ pub(super) async fn complete( } let tl_ancestor = tl.ancestor_timeline.as_ref()?; - let is_same = Arc::ptr_eq(&ancestor, tl_ancestor); + let is_same = Arc::ptr_eq(ancestor, tl_ancestor); let is_earlier = tl.get_ancestor_lsn() <= ancestor_lsn; let is_deleting = tl @@ -629,13 +640,18 @@ pub(super) async fn complete( }); let reparenting_candidates = tasks.len(); - let mut reparented = Vec::with_capacity(tasks.len()); + let mut reparented = HashSet::with_capacity(tasks.len()); while let Some(res) = tasks.join_next().await { match res { Ok(Some(timeline)) => { tracing::info!(reparented=%timeline.timeline_id, "reparenting done"); - reparented.push((timeline.ancestor_lsn, timeline.timeline_id)); + + assert!( + reparented.insert(timeline.timeline_id), + "duplicate reparenting? timeline_id={}", + timeline.timeline_id + ); } Ok(None) => { // lets just ignore this for now. one or all reparented timelines could had @@ -657,12 +673,5 @@ pub(super) async fn complete( tracing::info!("failed to reparent some candidates"); } - reparented.sort_unstable(); - - let reparented = reparented - .into_iter() - .map(|(_, timeline_id)| timeline_id) - .collect(); - Ok(reparented) } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 6940bf2c64..e391ce65e6 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -2954,7 +2954,6 @@ impl Service { } // no shard needs to go first/last; the operation should be idempotent - // TODO: it would be great to ensure that all shards return the same error let mut results = self .tenant_for_shards(targets, |tenant_shard_id, node| { futures::FutureExt::boxed(detach_one( @@ -2973,6 +2972,7 @@ impl Service { .filter(|(_, res)| res != &any.1) .collect::>(); if !mismatching.is_empty() { + // this can be hit by races which should not happen because operation lock on cplane let matching = results.len() - mismatching.len(); tracing::error!( matching, diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 5be59d3749..65d6ff5d62 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -857,7 +857,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): timeline_id: TimelineId, batch_size: int | None = None, **kwargs, - ) -> List[TimelineId]: + ) -> Set[TimelineId]: params = {} if batch_size is not None: params["batch_size"] = batch_size @@ -868,7 +868,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): ) self.verbose_error(res) json = res.json() - return list(map(TimelineId, json["reparented_timelines"])) + return set(map(TimelineId, json["reparented_timelines"])) def evict_layer( self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str diff --git 
a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 38f8dfa885..b3767a2766 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -165,7 +165,7 @@ def test_ancestor_detach_branched_from( ) all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id) - assert all_reparented == [] + assert all_reparented == set() if restart_after: env.pageserver.stop() @@ -534,7 +534,7 @@ def test_compaction_induced_by_detaches_in_history( for _, timeline_id in skip_main: reparented = client.detach_ancestor(env.initial_tenant, timeline_id) - assert reparented == [], "we have no earlier branches at any level" + assert reparented == set(), "we have no earlier branches at any level" post_detach_l0s = list(filter(lambda x: x.l0, delta_layers(branch_timeline_id))) assert len(post_detach_l0s) == 5, "should had inherited 4 L0s, have 5 in total" @@ -774,7 +774,7 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder): else: break - assert reparented == [], "too many retries (None) or unexpected reparentings" + assert reparented == set(), "too many retries (None) or unexpected reparentings" for shard_info in shards: node_id = int(shard_info["node_id"]) From 8468d51a14d35af6232467e79c88e23a2c0bb507 Mon Sep 17 00:00:00 2001 From: Cihan Demirci <128653800+fcdm@users.noreply.github.com> Date: Wed, 7 Aug 2024 19:53:47 +0300 Subject: [PATCH 1360/1571] cicd: push build-tools image to ACR as well (#8638) https://github.com/neondatabase/cloud/issues/15899 --- .github/workflows/pin-build-tools-image.yml | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml index 024594532f..cf10910b0b 100644 --- a/.github/workflows/pin-build-tools-image.yml +++ b/.github/workflows/pin-build-tools-image.yml @@ -66,8 +66,22 @@ jobs: username: ${{ secrets.AWS_ACCESS_KEY_DEV }} password: ${{ secrets.AWS_SECRET_KEY_DEV }} - - name: Tag build-tools with `${{ env.TO_TAG }}` in ECR + - name: Azure login + if: steps.check-manifests.outputs.skip == 'false' + uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1 + with: + client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }} + tenant-id: ${{ secrets.AZURE_TENANT_ID }} + subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }} + + - name: Login to ACR + if: steps.check-manifests.outputs.skip == 'false' + run: | + az acr login --name=neoneastus2 + + - name: Tag build-tools with `${{ env.TO_TAG }}` in ECR and ACR if: steps.check-manifests.outputs.skip == 'false' run: | docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG} \ + -t neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG} \ neondatabase/build-tools:${FROM_TAG} From 05dd1ae9e038589c98168f8e817d8a31e027d12f Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 7 Aug 2024 20:14:45 +0300 Subject: [PATCH 1361/1571] fix: drain completed page_service connections (#8632) We've noticed increased memory usage with the latest release. Drain the joinset of `page_service` connection handlers to avoid leaking them until shutdown. An alternative would be to use a TaskTracker. TaskTracker was not discussed in original PR #8339 review, so not hot fixing it in here either. 
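For context, a minimal standalone sketch of why the drain matters (this is not the actual `page_service` code, and it assumes a current tokio with `JoinSet`): a `JoinSet` buffers the result of every finished task until it is reaped with `join_next()`, so an accept loop that only ever spawns will hold those results until shutdown.

```rust
use std::time::Duration;
use tokio::task::JoinSet;

#[tokio::main]
async fn main() {
    let mut tasks: JoinSet<Result<(), String>> = JoinSet::new();

    for i in 0u64..3 {
        tasks.spawn(async move {
            tokio::time::sleep(Duration::from_millis(10 * i)).await;
            Ok(())
        });
    }

    // Reap completed tasks as they finish; without this, the JoinSet keeps
    // every finished task's output (and panic payload) buffered until the
    // whole set is dropped at shutdown.
    while let Some(res) = tasks.join_next().await {
        match res {
            Ok(Ok(())) => {}
            Ok(Err(e)) => eprintln!("error in connection task: {e}"),
            Err(join_error) => eprintln!("connection task panicked: {join_error}"),
        }
    }
}
```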
--- pageserver/src/page_service.rs | 42 +++++++++++----------- test_runner/regress/test_bad_connection.py | 11 +++++- 2 files changed, 32 insertions(+), 21 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 5344b83e0d..81294291a9 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -122,16 +122,19 @@ impl Listener { } } impl Connections { - pub async fn shutdown(self) { + pub(crate) async fn shutdown(self) { let Self { cancel, mut tasks } = self; cancel.cancel(); while let Some(res) = tasks.join_next().await { - // the logging done here mimics what was formerly done by task_mgr - match res { - Ok(Ok(())) => {} - Ok(Err(e)) => error!("error in page_service connection task: {:?}", e), - Err(e) => error!("page_service connection task panicked: {:?}", e), - } + Self::handle_connection_completion(res); + } + } + + fn handle_connection_completion(res: Result, tokio::task::JoinError>) { + match res { + Ok(Ok(())) => {} + Ok(Err(e)) => error!("error in page_service connection task: {:?}", e), + Err(e) => error!("page_service connection task panicked: {:?}", e), } } } @@ -155,20 +158,19 @@ pub async fn libpq_listener_main( let connections_cancel = CancellationToken::new(); let mut connection_handler_tasks = tokio::task::JoinSet::default(); - // Wait for a new connection to arrive, or for server shutdown. - while let Some(res) = tokio::select! { - biased; + loop { + let accepted = tokio::select! { + biased; + _ = listener_cancel.cancelled() => break, + next = connection_handler_tasks.join_next(), if !connection_handler_tasks.is_empty() => { + let res = next.expect("we dont poll while empty"); + Connections::handle_connection_completion(res); + continue; + } + accepted = listener.accept() => accepted, + }; - _ = listener_cancel.cancelled() => { - // We were requested to shut down. - None - } - - res = listener.accept() => { - Some(res) - } - } { - match res { + match accepted { Ok((socket, peer_addr)) => { // Connection established. Spawn a new task to handle it. debug!("accepted connection from {}", peer_addr); diff --git a/test_runner/regress/test_bad_connection.py b/test_runner/regress/test_bad_connection.py index 82a3a05c2b..392b73c1f7 100644 --- a/test_runner/regress/test_bad_connection.py +++ b/test_runner/regress/test_bad_connection.py @@ -10,7 +10,12 @@ from fixtures.neon_fixtures import NeonEnvBuilder @pytest.mark.timeout(600) def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() - env.pageserver.allowed_errors.append(".*simulated connection error.*") + env.pageserver.allowed_errors.append(".*simulated connection error.*") # this is never hit + + # the real reason (Simulated Connection Error) is on the next line, and we cannot filter this out. 
+ env.pageserver.allowed_errors.append( + ".*ERROR error in page_service connection task: Postgres query error" + ) # Enable failpoint before starting everything else up so that we exercise the retry # on fetching basebackup @@ -69,3 +74,7 @@ def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder): cur.fetchall() times_executed += 1 log.info(f"Workload executed {times_executed} times") + + # do a graceful shutdown which would had caught the allowed_errors before + # https://github.com/neondatabase/neon/pull/8632 + env.pageserver.stop() From 542385e36493325ea4a1b781fc288ff4caa3922a Mon Sep 17 00:00:00 2001 From: Yuchen Liang <70461588+yliang412@users.noreply.github.com> Date: Wed, 7 Aug 2024 16:04:19 -0400 Subject: [PATCH 1362/1571] feat(pageserver): add direct io pageserver config (#8622) Part of #8130, [RFC: Direct IO For Pageserver](https://github.com/neondatabase/neon/blob/problame/direct-io-rfc/docs/rfcs/034-direct-io-for-pageserver.md) ## Description Add pageserver config for evaluating/enabling direct I/O. - Disabled: current default, uses buffered io as is. - Evaluate: still uses buffered io, but could do alignment checking and perf simulation (pad latency by direct io RW to a fake file). - Enabled: uses direct io, behavior on alignment error is configurable. Signed-off-by: Yuchen Liang --- libs/pageserver_api/src/models.rs | 49 +++++++++++++++++++++++++++++++ pageserver/src/bin/pageserver.rs | 1 + pageserver/src/config.rs | 17 +++++++++++ pageserver/src/virtual_file.rs | 1 + 4 files changed, 68 insertions(+) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index b541bba6a1..ab4adfbebe 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -947,6 +947,8 @@ pub struct TopTenantShardsResponse { } pub mod virtual_file { + use std::path::PathBuf; + #[derive( Copy, Clone, @@ -965,6 +967,53 @@ pub mod virtual_file { #[cfg(target_os = "linux")] TokioEpollUring, } + + /// Direct IO modes for a pageserver. + #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)] + #[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] + pub enum DirectIoMode { + /// Direct IO disabled (uses usual buffered IO). + #[default] + Disabled, + /// Direct IO disabled (performs checks and perf simulations). + Evaluate { + /// Alignment check level + alignment_check: DirectIoAlignmentCheckLevel, + /// Latency padded for performance simulation. + latency_padding: DirectIoLatencyPadding, + }, + /// Direct IO enabled. + Enabled { + /// Actions to perform on alignment error. + on_alignment_error: DirectIoOnAlignmentErrorAction, + }, + } + + #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)] + #[serde(rename_all = "kebab-case")] + pub enum DirectIoAlignmentCheckLevel { + #[default] + Error, + Log, + None, + } + + #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)] + #[serde(rename_all = "kebab-case")] + pub enum DirectIoOnAlignmentErrorAction { + Error, + #[default] + FallbackToBuffered, + } + + #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)] + #[serde(tag = "type", rename_all = "kebab-case")] + pub enum DirectIoLatencyPadding { + /// Pad virtual file operations with IO to a fake file. 
+ FakeFileRW { path: PathBuf }, + #[default] + None, + } } // Wrapped in libpq CopyData diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 5ebd6511ac..932918410c 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -123,6 +123,7 @@ fn main() -> anyhow::Result<()> { // after setting up logging, log the effective IO engine choice and read path implementations info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); + info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings"); info!(?conf.get_impl, "starting with get page implementation"); info!(?conf.get_vectored_impl, "starting with vectored get page implementation"); info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access"); diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 41c2fe0af3..f4c367bd4d 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -300,6 +300,9 @@ pub struct PageServerConf { /// This flag is temporary and will be removed after gradual rollout. /// See . pub compact_level0_phase1_value_access: CompactL0Phase1ValueAccess, + + /// Direct IO settings + pub virtual_file_direct_io: virtual_file::DirectIoMode, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -408,6 +411,8 @@ struct PageServerConfigBuilder { l0_flush: BuilderValue, compact_level0_phase1_value_access: BuilderValue, + + virtual_file_direct_io: BuilderValue, } impl PageServerConfigBuilder { @@ -498,6 +503,7 @@ impl PageServerConfigBuilder { ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), l0_flush: Set(L0FlushConfig::default()), compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()), + virtual_file_direct_io: Set(virtual_file::DirectIoMode::default()), } } } @@ -685,6 +691,10 @@ impl PageServerConfigBuilder { self.compact_level0_phase1_value_access = BuilderValue::Set(value); } + pub fn virtual_file_direct_io(&mut self, value: virtual_file::DirectIoMode) { + self.virtual_file_direct_io = BuilderValue::Set(value); + } + pub fn build(self, id: NodeId) -> anyhow::Result { let default = Self::default_values(); @@ -743,6 +753,7 @@ impl PageServerConfigBuilder { ephemeral_bytes_per_memory_kb, l0_flush, compact_level0_phase1_value_access, + virtual_file_direct_io, } CUSTOM LOGIC { @@ -1018,6 +1029,9 @@ impl PageServerConf { "compact_level0_phase1_value_access" => { builder.compact_level0_phase1_value_access(utils::toml_edit_ext::deserialize_item(item).context("compact_level0_phase1_value_access")?) } + "virtual_file_direct_io" => { + builder.virtual_file_direct_io(utils::toml_edit_ext::deserialize_item(item).context("virtual_file_direct_io")?) 
+ } _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -1103,6 +1117,7 @@ impl PageServerConf { ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, l0_flush: L0FlushConfig::default(), compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), + virtual_file_direct_io: virtual_file::DirectIoMode::default(), } } } @@ -1345,6 +1360,7 @@ background_task_maximum_delay = '334 s' ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, l0_flush: L0FlushConfig::default(), compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), + virtual_file_direct_io: virtual_file::DirectIoMode::default(), }, "Correct defaults should be used when no config values are provided" ); @@ -1420,6 +1436,7 @@ background_task_maximum_delay = '334 s' ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, l0_flush: L0FlushConfig::default(), compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), + virtual_file_direct_io: virtual_file::DirectIoMode::default(), }, "Should be able to parse all basic config values correctly" ); diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 51b0c420c3..8de646469e 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -34,6 +34,7 @@ pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult; mod metadata; mod open_options; use self::owned_buffers_io::write::OwnedAsyncWriter; +pub(crate) use api::DirectIoMode; pub(crate) use io_engine::IoEngineKind; pub(crate) use metadata::Metadata; pub(crate) use open_options::*; From cf3eac785b30d686f8c9bebe521a1b63a61a9ec5 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 7 Aug 2024 21:17:08 +0100 Subject: [PATCH 1363/1571] pageserver: make bench_ingest build (but panic) on macOS (#8641) ## Problem Some developers build on MacOS, which doesn't have io_uring. ## Summary of changes - Add `io_engine_for_bench`, which on linux will give io_uring or panic if it's unavailable, and on MacOS will always panic. We do not want to run such benchmarks with StdFs: the results aren't interesting, and will actively waste the time of any developers who start investigating performance before they realize they're using a known-slow I/O backend. Why not just conditionally compile this benchmark on linux only? Because even on linux, I still want it to refuse to run if it can't get io_uring. 
---
 pageserver/benches/bench_ingest.rs       |  4 ++--
 pageserver/src/virtual_file.rs           |  1 +
 pageserver/src/virtual_file/io_engine.rs | 26 ++++++++++++++++++++++++
 3 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs
index 459394449a..9bab02e46c 100644
--- a/pageserver/benches/bench_ingest.rs
+++ b/pageserver/benches/bench_ingest.rs
@@ -11,7 +11,7 @@ use pageserver::{
     repository::Value,
     task_mgr::TaskKind,
     tenant::storage_layer::InMemoryLayer,
-    virtual_file::{self, api::IoEngineKind},
+    virtual_file,
 };
 use pageserver_api::{key::Key, shard::TenantShardId};
 use utils::{
@@ -149,7 +149,7 @@ fn criterion_benchmark(c: &mut Criterion) {
     let conf: &'static PageServerConf = Box::leak(Box::new(
         pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()),
     ));
-    virtual_file::init(16384, IoEngineKind::TokioEpollUring);
+    virtual_file::init(16384, virtual_file::io_engine_for_bench());
     page_cache::init(conf.page_cache_size);

     {
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index 8de646469e..27f6fe90a4 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -30,6 +30,7 @@ use tokio::time::Instant;
 pub use pageserver_api::models::virtual_file as api;
 pub(crate) mod io_engine;
 pub use io_engine::feature_test as io_engine_feature_test;
+pub use io_engine::io_engine_for_bench;
 pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult;
 mod metadata;
 mod open_options;
diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs
index 2820cea097..0ffcd9fa05 100644
--- a/pageserver/src/virtual_file/io_engine.rs
+++ b/pageserver/src/virtual_file/io_engine.rs
@@ -328,3 +328,29 @@ pub fn feature_test() -> anyhow::Result<FeatureTestResult> {
     .join()
     .unwrap()
 }
+
+/// For use in benchmark binaries only.
+///
+/// Benchmarks which initialize `virtual_file` need to know what engine to use, but we also
+/// don't want to silently fall back to slower I/O engines in a benchmark: this could waste
+/// developer time trying to figure out why it's slow.
+///
+/// In practice, this method will either return IoEngineKind::TokioEpollUring, or panic.
+pub fn io_engine_for_bench() -> IoEngineKind {
+    #[cfg(not(target_os = "linux"))]
+    {
+        panic!("This benchmark does I/O and can only give a representative result on Linux");
+    }
+    #[cfg(target_os = "linux")]
+    {
+        match feature_test().unwrap() {
+            FeatureTestResult::PlatformPreferred(engine) => engine,
+            FeatureTestResult::Worse {
+                engine: _engine,
+                remark,
+            } => {
+                panic!("This benchmark does I/O and requires the preferred I/O engine: {remark}");
+            }
+        }
+    }
+}

From cbe8c77997aea576a96a7f8d31147cb7a11d6a6b Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik
Date: Thu, 8 Aug 2024 10:23:57 +0300
Subject: [PATCH 1364/1571] Use synchronous commit for logical replication
 worker (#8645)

## Problem

See https://neondb.slack.com/archives/C03QLRH7PPD/p1723038557449239?thread_ts=1722868375.476789&cid=C03QLRH7PPD

Logical replication subscriptions use `synchronous_commit=off` by default, which causes problems with the safekeeper.

## Summary of changes

Set `synchronous_commit=on` for the logical replication subscription in test_subscriber_restart.py

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik Co-authored-by: Heikki Linnakangas --- test_runner/regress/test_subscriber_restart.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test_runner/regress/test_subscriber_restart.py b/test_runner/regress/test_subscriber_restart.py index 91caad7220..4581008022 100644 --- a/test_runner/regress/test_subscriber_restart.py +++ b/test_runner/regress/test_subscriber_restart.py @@ -37,7 +37,9 @@ def test_subscriber_restart(neon_simple_env: NeonEnv): scur.execute("CREATE TABLE t (pk integer primary key, sk integer)") # scur.execute("CREATE INDEX on t(sk)") # slowdown applying WAL at replica pub_conn = f"host=localhost port={pub.pg_port} dbname=postgres user=cloud_admin" - query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_conn}' PUBLICATION pub" + # synchronous_commit=on to test a hypothesis for why this test has been flaky. + # XXX: Add link to the issue + query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_conn}' PUBLICATION pub with (synchronous_commit=on)" scur.execute(query) time.sleep(2) # let initial table sync complete From 21638ee96cdba59899f15d5d446ade3a03c32930 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 8 Aug 2024 12:34:47 +0300 Subject: [PATCH 1365/1571] fix(test): do not fail test for filesystem race (#8643) evidence: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8632/10287641784/index.html#suites/0e58fb04d9998963e98e45fe1880af7d/c7a46335515142b/ --- pageserver/src/statvfs.rs | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/pageserver/src/statvfs.rs b/pageserver/src/statvfs.rs index 45a516566f..ede1791afa 100644 --- a/pageserver/src/statvfs.rs +++ b/pageserver/src/statvfs.rs @@ -56,7 +56,6 @@ impl Statvfs { } pub mod mock { - use anyhow::Context; use camino::Utf8Path; use regex::Regex; use tracing::log::info; @@ -135,14 +134,30 @@ pub mod mock { { continue; } - total += entry - .metadata() - .with_context(|| format!("get metadata of {:?}", entry.path()))? - .len(); + let m = match entry.metadata() { + Ok(m) => m, + Err(e) if is_not_found(&e) => { + // some temp file which got removed right as we are walking + continue; + } + Err(e) => { + return Err(anyhow::Error::new(e) + .context(format!("get metadata of {:?}", entry.path()))) + } + }; + total += m.len(); } Ok(total) } + fn is_not_found(e: &walkdir::Error) -> bool { + let Some(io_error) = e.io_error() else { + return false; + }; + let kind = io_error.kind(); + matches!(kind, std::io::ErrorKind::NotFound) + } + pub struct Statvfs { pub blocks: u64, pub blocks_available: u64, From 8561b2c628683fb9845672eca8b66578434100e2 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 8 Aug 2024 14:02:53 +0300 Subject: [PATCH 1366/1571] fix: stop leaking BackgroundPurges (#8650) avoid "leaking" the completions of BackgroundPurges by: 1. switching it to TaskTracker for provided close+wait 2. stop using tokio::fs::remove_dir_all which will consume two units of memory instead of one blocking task Additionally, use more graceful shutdown in tests which do actually some background cleanup. 
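To make the new mechanism easier to follow, here is a minimal sketch of the pattern the diff below adopts (not the committed code): every purge holds a `TaskTracker` token for its whole lifetime, so `close()` followed by `wait()` at shutdown cannot miss an in-flight purge, and the filesystem work runs as one `spawn_blocking` job using `std::fs`. The explicit runtime-handle parameter is only there to keep the sketch self-contained; the real code spawns onto its background runtime.

```
use std::path::PathBuf;
use tokio_util::task::TaskTracker;

#[derive(Clone, Default)]
struct BackgroundPurges(TaskTracker);

impl BackgroundPurges {
    fn spawn(&self, tmp_path: PathBuf, rt: &tokio::runtime::Handle) {
        // Take the token before checking for shutdown: if close() races with us,
        // wait() still blocks until this purge (if spawned) has finished.
        let token = self.0.token();
        if self.0.is_closed() {
            return; // shutting down, ignore new purge requests
        }
        rt.spawn_blocking(move || {
            let _token = token; // released once the purge is done
            // A single blocking task; tokio::fs::remove_dir_all would keep an
            // extra async task alive on top of the blocking one it spawns.
            let _ = std::fs::remove_dir_all(&tmp_path);
        });
    }

    async fn shutdown(&self) {
        self.0.close(); // forbid new purges; safe to call more than once
        self.0.wait().await; // resolves once the tracker is closed and empty
    }
}
```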
--- pageserver/src/tenant/mgr.rs | 96 +++++++---------------- test_runner/regress/test_tenant_delete.py | 11 ++- 2 files changed, 36 insertions(+), 71 deletions(-) diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 3f592f167e..3316627540 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -224,21 +224,8 @@ async fn safe_rename_tenant_dir(path: impl AsRef) -> std::io::Result>); -enum BackgroundPurgesInner { - Open(tokio::task::JoinSet<()>), - // we use the async mutex for coalescing - ShuttingDown(Arc>>), -} - -impl Default for BackgroundPurges { - fn default() -> Self { - Self(Arc::new(std::sync::Mutex::new( - BackgroundPurgesInner::Open(JoinSet::new()), - ))) - } -} +#[derive(Clone, Default)] +pub struct BackgroundPurges(tokio_util::task::TaskTracker); impl BackgroundPurges { /// When we have moved a tenant's content to a temporary directory, we may delete it lazily in @@ -247,24 +234,32 @@ impl BackgroundPurges { /// Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory. /// Thus the [`BackgroundPurges`] type to keep track of these tasks. pub fn spawn(&self, tmp_path: Utf8PathBuf) { - let mut guard = self.0.lock().unwrap(); - let jset = match &mut *guard { - BackgroundPurgesInner::Open(ref mut jset) => jset, - BackgroundPurgesInner::ShuttingDown(_) => { - warn!("trying to spawn background purge during shutdown, ignoring"); - return; + // because on shutdown we close and wait, we are misusing TaskTracker a bit. + // + // so first acquire a token, then check if the tracker has been closed. the tracker might get closed + // right after, but at least the shutdown will wait for what we are spawning next. + let token = self.0.token(); + + if self.0.is_closed() { + warn!( + %tmp_path, + "trying to spawn background purge during shutdown, ignoring" + ); + return; + } + + let span = info_span!(parent: None, "background_purge", %tmp_path); + + let task = move || { + let _token = token; + let _entered = span.entered(); + if let Err(error) = std::fs::remove_dir_all(tmp_path.as_path()) { + // should we fatal_io_error here? + warn!(%error, "failed to purge tenant directory"); } }; - jset.spawn_on( - async move { - if let Err(error) = fs::remove_dir_all(tmp_path.as_path()).await { - // should we fatal_io_error here? - warn!(%error, path=%tmp_path, "failed to purge tenant directory"); - } - } - .instrument(info_span!(parent: None, "background_purge")), - BACKGROUND_RUNTIME.handle(), - ); + + BACKGROUND_RUNTIME.spawn_blocking(task); } /// When this future completes, all background purges have completed. @@ -278,42 +273,9 @@ impl BackgroundPurges { /// instances of this future will continue to be correct. 
#[instrument(skip_all)] pub async fn shutdown(&self) { - let jset = { - let mut guard = self.0.lock().unwrap(); - match &mut *guard { - BackgroundPurgesInner::Open(jset) => { - *guard = BackgroundPurgesInner::ShuttingDown(Arc::new(tokio::sync::Mutex::new( - std::mem::take(jset), - ))) - } - BackgroundPurgesInner::ShuttingDown(_) => { - // calling shutdown multiple times is most likely a bug in pageserver shutdown code - warn!("already shutting down"); - } - }; - match &mut *guard { - BackgroundPurgesInner::ShuttingDown(ref mut jset) => jset.clone(), - BackgroundPurgesInner::Open(_) => { - unreachable!("above code transitions into shut down state"); - } - } - }; - let mut jset = jset.lock().await; // concurrent callers coalesce here - while let Some(res) = jset.join_next().await { - match res { - Ok(()) => {} - Err(e) if e.is_panic() => { - // If it panicked, the error is already logged by the panic hook. - } - Err(e) if e.is_cancelled() => { - unreachable!("we don't cancel the joinset or runtime") - } - Err(e) => { - // No idea when this can happen, but let's log it. - warn!(%e, "background purge task failed or panicked"); - } - } - } + // forbid new tasks (can be called many times) + self.0.close(); + self.0.wait().await; } } diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index c01b3a2e89..dadf5ca672 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -128,6 +128,8 @@ def test_tenant_delete_smoke( assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "inprogress"}) == 0 + env.pageserver.stop() + def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonEnvBuilder): """Reproduction of 2023-11-23 stuck tenants investigation""" @@ -200,11 +202,10 @@ def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonE if deletion is not None: deletion.join() + env.pageserver.stop() -def test_tenant_delete_races_timeline_creation( - neon_env_builder: NeonEnvBuilder, - pg_bin: PgBin, -): + +def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder): """ Validate that timeline creation executed in parallel with deletion works correctly. @@ -318,6 +319,8 @@ def test_tenant_delete_races_timeline_creation( # We deleted our only tenant, and the scrubber fails if it detects nothing neon_env_builder.disable_scrub_on_exit() + env.pageserver.stop() + def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder): """ From 953b7d4f7ec774be5765f762eb46334be069b3bc Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 8 Aug 2024 12:57:48 +0100 Subject: [PATCH 1367/1571] pageserver: remove paranoia double-calculation of retain_lsns (#8617) ## Problem This code was to mitigate risk in https://github.com/neondatabase/neon/pull/8427 As expected, we did not hit this code path - the new continuous updates of gc_info are working fine, we can remove this code now. ## Summary of changes - Remove block that double-checks retain_lsns --- pageserver/src/tenant.rs | 48 ---------------------------------------- 1 file changed, 48 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 2422ab4cf2..90c0e28bc4 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3012,54 +3012,6 @@ impl Tenant { // because that will stall branch creation. 
let gc_cs = self.gc_cs.lock().await; - // Paranoia check: it is critical that GcInfo's list of child timelines is correct, to avoid incorrectly GC'ing data they - // depend on. So although GcInfo is updated continuously by Timeline::new and Timeline::drop, we also calculate it here - // and fail out if it's inaccurate. - // (this can be removed later, it's a risk mitigation for https://github.com/neondatabase/neon/pull/8427) - { - let mut all_branchpoints: BTreeMap> = - BTreeMap::new(); - timelines.iter().for_each(|timeline| { - if let Some(ancestor_timeline_id) = &timeline.get_ancestor_timeline_id() { - let ancestor_children = - all_branchpoints.entry(*ancestor_timeline_id).or_default(); - ancestor_children.push((timeline.get_ancestor_lsn(), timeline.timeline_id)); - } - }); - - for timeline in &timelines { - let mut branchpoints: Vec<(Lsn, TimelineId)> = all_branchpoints - .remove(&timeline.timeline_id) - .unwrap_or_default(); - - branchpoints.sort_by_key(|b| b.0); - - let target = timeline.gc_info.read().unwrap(); - - // We require that retain_lsns contains everything in `branchpoints`, but not that - // they are exactly equal: timeline deletions can race with us, so retain_lsns - // may contain some extra stuff. It is safe to have extra timelines in there, because it - // just means that we retain slightly more data than we otherwise might. - let have_branchpoints = target.retain_lsns.iter().copied().collect::>(); - for b in &branchpoints { - if !have_branchpoints.contains(b) { - tracing::error!( - "Bug: `retain_lsns` is set incorrectly. Expected be {:?}, but found {:?}", - branchpoints, - target.retain_lsns - ); - debug_assert!(false); - // Do not GC based on bad information! - // (ab-use an existing GcError type rather than adding a new one, since this is a - // "should never happen" check that will be removed soon). - return Err(GcError::Remote(anyhow::anyhow!( - "retain_lsns failed validation!" - ))); - } - } - } - } - // Ok, we now know all the branch points. // Update the GC information for each timeline. let mut gc_timelines = Vec::with_capacity(timelines.len()); From 4431688dc69765f6cb5d0a3f0258fa3ef30e6f5b Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 8 Aug 2024 19:24:21 +0100 Subject: [PATCH 1368/1571] tests: don't require kafka client for regular tests (#8662) ## Problem We're adding more third party dependencies to support more diverse + realistic test cases in `test_runner/logical_repl`. I :heart: these tests, they are a good thing. The slight glitch is that python packaging is hard, and some third party python packages have issues. For example the current kafka dependency doesn't work on latest python. We can mitigate that by only importing these more specialized dependencies in the tests that use them. ## Summary of changes - Move the `kafka` import into a test body, so that folks running the regular `test_runner/regress` tests don't have to have a working kafka client package. 
--- test_runner/logical_repl/test_debezium.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test_runner/logical_repl/test_debezium.py b/test_runner/logical_repl/test_debezium.py index 700b731418..5426a06ca1 100644 --- a/test_runner/logical_repl/test_debezium.py +++ b/test_runner/logical_repl/test_debezium.py @@ -12,7 +12,6 @@ import requests from fixtures.log_helper import log from fixtures.neon_fixtures import RemotePostgres from fixtures.utils import wait_until -from kafka import KafkaConsumer class DebeziumAPI: @@ -95,6 +94,8 @@ def debezium(remote_pg: RemotePostgres): log.debug("%s %s %s", resp.status_code, resp.ok, resp.text) assert resp.status_code == 201 assert len(dbz.list_connectors()) == 1 + from kafka import KafkaConsumer + consumer = KafkaConsumer( "dbserver1.inventory.customers", bootstrap_servers=["kafka:9092"], From d28a6f2576739b5a0464cea01075d469431cd743 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 9 Aug 2024 07:17:16 +0100 Subject: [PATCH 1369/1571] CI(build-tools): update Rust, Python, Mold (#8667) ## Problem - Rust 1.80.1 has been released: https://blog.rust-lang.org/2024/08/08/Rust-1.80.1.html - Python 3.9.19 has been released: https://www.python.org/downloads/release/python-3919/ - Mold 2.33.0 has been released: https://github.com/rui314/mold/releases/tag/v2.33.0 - Unpinned `cargo-deny` in `build-tools` got updated to the latest version and doesn't work anymore with the current config file ## Summary of changes - Bump Rust to 1.80.1 - Bump Python to 3.9.19 - Bump Mold to 2.33.0 - Pin `cargo-deny`, `cargo-hack`, `cargo-hakari`, `cargo-nextest`, `rustfilt` versions - Update `deny.toml` to the latest format, see https://github.com/EmbarkStudios/cargo-deny/pull/611 --- Dockerfile.build-tools | 21 +++++++++++++-------- deny.toml | 10 ++-------- rust-toolchain.toml | 2 +- 3 files changed, 16 insertions(+), 17 deletions(-) diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index dfaab1cb2e..a72092e8e2 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -99,7 +99,7 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "aws && rm awscliv2.zip # Mold: A Modern Linker -ENV MOLD_VERSION v2.31.0 +ENV MOLD_VERSION=v2.33.0 RUN set -e \ && git clone https://github.com/rui314/mold.git \ && mkdir mold/build \ @@ -168,7 +168,7 @@ USER nonroot:nonroot WORKDIR /home/nonroot # Python -ENV PYTHON_VERSION=3.9.18 \ +ENV PYTHON_VERSION=3.9.19 \ PYENV_ROOT=/home/nonroot/.pyenv \ PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH RUN set -e \ @@ -192,9 +192,14 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.80.0 +ENV RUSTC_VERSION=1.80.1 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" +ARG RUSTFILT_VERSION=0.2.1 +ARG CARGO_HAKARI_VERSION=0.9.30 +ARG CARGO_DENY_VERSION=0.16.1 +ARG CARGO_HACK_VERSION=0.6.31 +ARG CARGO_NEXTEST_VERSION=0.9.72 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \ chmod +x rustup-init && \ ./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \ @@ -204,11 +209,11 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux cargo --version && rustup --version && \ rustup component add llvm-tools-preview rustfmt clippy && \ cargo install --git https://github.com/paritytech/cachepot && \ - cargo install 
rustfilt && \ - cargo install cargo-hakari && \ - cargo install cargo-deny --locked && \ - cargo install cargo-hack && \ - cargo install cargo-nextest && \ + cargo install rustfilt --version ${RUSTFILT_VERSION} && \ + cargo install cargo-hakari --version ${CARGO_HAKARI_VERSION} && \ + cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \ + cargo install cargo-hack --version ${CARGO_HACK_VERSION} && \ + cargo install cargo-nextest --version ${CARGO_NEXTEST_VERSION} && \ rm -rf /home/nonroot/.cargo/registry && \ rm -rf /home/nonroot/.cargo/git ENV RUSTC_WRAPPER=cachepot diff --git a/deny.toml b/deny.toml index 469609c496..dc985138e6 100644 --- a/deny.toml +++ b/deny.toml @@ -4,6 +4,7 @@ # to your expectations and requirements. # Root options +[graph] targets = [ { triple = "x86_64-unknown-linux-gnu" }, { triple = "aarch64-unknown-linux-gnu" }, @@ -12,6 +13,7 @@ targets = [ ] all-features = false no-default-features = false +[output] feature-depth = 1 # This section is considered when running `cargo deny check advisories` @@ -19,17 +21,13 @@ feature-depth = 1 # https://embarkstudios.github.io/cargo-deny/checks/advisories/cfg.html [advisories] db-urls = ["https://github.com/rustsec/advisory-db"] -vulnerability = "deny" -unmaintained = "warn" yanked = "warn" -notice = "warn" ignore = [] # This section is considered when running `cargo deny check licenses` # More documentation for the licenses section can be found here: # https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html [licenses] -unlicensed = "deny" allow = [ "Apache-2.0", "Artistic-2.0", @@ -42,10 +40,6 @@ allow = [ "OpenSSL", "Unicode-DFS-2016", ] -deny = [] -copyleft = "warn" -allow-osi-fsf-free = "neither" -default = "deny" confidence-threshold = 0.8 exceptions = [ # Zlib license has some restrictions if we decide to change sth diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 3510359591..368b8d300a 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.80.0" +channel = "1.80.1" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. 
# https://rust-lang.github.io/rustup/concepts/profiles.html From 8acce009530f308a1987ecc68858821a3a8230ad Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 9 Aug 2024 07:54:54 +0100 Subject: [PATCH 1370/1571] Dockerfiles: fix LegacyKeyValueFormat & JSONArgsRecommended (#8664) ## Problem CI complains in all PRs: ``` "ENV key=value" should be used instead of legacy "ENV key value" format ``` https://docs.docker.com/reference/build-checks/legacy-key-value-format/ See - https://github.com/neondatabase/neon/pull/8644/files ("Unchanged files with check annotations" section) - https://github.com/neondatabase/neon/actions/runs/10304090562?pr=8644 ("Annotations" section) ## Summary of changes - Use `ENV key=value` instead of `ENV key value` in all Dockerfiles --- Dockerfile | 6 +++--- Dockerfile.build-tools | 2 +- Dockerfile.compute-node | 28 ++++++++++++++-------------- vm-image-spec.yaml | 4 ++-- 4 files changed, 20 insertions(+), 20 deletions(-) diff --git a/Dockerfile b/Dockerfile index ace112cccf..6ed57a84a3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -17,7 +17,7 @@ COPY --chown=nonroot pgxn pgxn COPY --chown=nonroot Makefile Makefile COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh -ENV BUILD_TYPE release +ENV BUILD_TYPE=release RUN set -e \ && mold -run make -j $(nproc) -s neon-pg-ext \ && rm -rf pg_install/build \ @@ -104,7 +104,7 @@ RUN mkdir -p /data/.neon/ && \ # When running a binary that links with libpq, default to using our most recent postgres version. Binaries # that want a particular postgres version will select it explicitly: this is just a default. -ENV LD_LIBRARY_PATH /usr/local/v16/lib +ENV LD_LIBRARY_PATH=/usr/local/v16/lib VOLUME ["/data"] @@ -112,5 +112,5 @@ USER neon EXPOSE 6400 EXPOSE 9898 -CMD /usr/local/bin/pageserver -D /data/.neon +CMD ["/usr/local/bin/pageserver", "-D", "/data/.neon"] diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index a72092e8e2..d39d36e1b6 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -58,7 +58,7 @@ RUN set -e \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* # protobuf-compiler (protoc) -ENV PROTOC_VERSION 25.1 +ENV PROTOC_VERSION=25.1 RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \ && unzip -q protoc.zip -d protoc \ && mv protoc/bin/protoc /usr/local/bin/protoc \ diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 054d44e0ec..7acaf2f2fd 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -94,7 +94,7 @@ RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \ make clean && cp -R /sfcgal/* / -ENV PATH "/usr/local/pgsql/bin:$PATH" +ENV PATH="/usr/local/pgsql/bin:$PATH" RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \ echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \ @@ -411,7 +411,7 @@ FROM build-deps AS timescaledb-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION -ENV PATH "/usr/local/pgsql/bin:$PATH" +ENV PATH="/usr/local/pgsql/bin:$PATH" RUN case "${PG_VERSION}" in \ "v14" | "v15") \ @@ -444,7 +444,7 @@ FROM build-deps AS pg-hint-plan-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION -ENV PATH "/usr/local/pgsql/bin:$PATH" +ENV 
PATH="/usr/local/pgsql/bin:$PATH" RUN case "${PG_VERSION}" in \ "v14") \ @@ -480,7 +480,7 @@ RUN case "${PG_VERSION}" in \ FROM build-deps AS pg-cron-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \ echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \ mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \ @@ -506,7 +506,7 @@ RUN apt-get update && \ libboost-system1.74-dev \ libeigen3-dev -ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \ echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \ mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . && \ @@ -546,7 +546,7 @@ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar. FROM build-deps AS pg-uuidv7-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \ echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \ mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \ @@ -563,7 +563,7 @@ RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz FROM build-deps AS pg-roaringbitmap-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \ echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \ mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \ @@ -580,7 +580,7 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4 FROM build-deps AS pg-semver-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \ echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \ mkdir pg_semver-src && cd pg_semver-src && tar xzf ../pg_semver.tar.gz --strip-components=1 -C . 
&& \ @@ -598,7 +598,7 @@ FROM build-deps AS pg-embedding-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in \ "v14" | "v15") \ export PG_EMBEDDING_VERSION=0.3.5 \ @@ -622,7 +622,7 @@ RUN case "${PG_VERSION}" in \ FROM build-deps AS pg-anon-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \ echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \ mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \ @@ -750,7 +750,7 @@ RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz - FROM build-deps AS wal2json-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \ echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \ mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \ @@ -766,7 +766,7 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar. FROM build-deps AS pg-ivm-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \ echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \ mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \ @@ -783,7 +783,7 @@ RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_iv FROM build-deps AS pg-partman-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \ echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \ mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \ @@ -1034,6 +1034,6 @@ RUN apt update && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 -ENV LANG en_US.utf8 +ENV LANG=en_US.utf8 USER postgres ENTRYPOINT ["/usr/local/bin/compute_ctl"] diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 7d005c7139..41d6e11725 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -416,7 +416,7 @@ build: | # libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-monitor # requires cgroup v2, so we'll build cgroup-tools ourselves. 
FROM debian:bullseye-slim as libcgroup-builder - ENV LIBCGROUP_VERSION v2.0.3 + ENV LIBCGROUP_VERSION=v2.0.3 RUN set -exu \ && apt update \ @@ -460,7 +460,7 @@ build: | pkg-config # Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc) - ENV PGBOUNCER_TAG pgbouncer_1_22_1 + ENV PGBOUNCER_TAG=pgbouncer_1_22_1 RUN set -e \ && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \ && cd pgbouncer \ From 2ca5ff26d787a89ff9d3176a6e63ea2a2e4716b2 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Fri, 9 Aug 2024 09:36:29 +0200 Subject: [PATCH 1371/1571] Run a subset of benchmarking job steps on GitHub action runners in Azure - closer to the system under test (#8651) ## Problem Latency from one cloud provider to another one is higher than within the same cloud provider. Some of our benchmarks are latency sensitive - we run a pgbench or psql in the github action runner and the system under test is running in Neon (database project). For realistic perf tps and latency results we need to compare apples to apples and run the database client in the same "latency distance" for all tests. ## Summary of changes Move job steps that test Neon databases deployed on Azure into Azure action runners. - bench strategy variant using azure database - pgvector strategy variant using azure database - pgbench-compare strategy variants using azure database ## Test run https://github.com/neondatabase/neon/actions/runs/10314848502 --- .github/actionlint.yml | 1 + .github/workflows/benchmarking.yml | 88 +++++++++++++++++++++++------- 2 files changed, 69 insertions(+), 20 deletions(-) diff --git a/.github/actionlint.yml b/.github/actionlint.yml index d27fa01efa..a5282876d0 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -13,3 +13,4 @@ config-variables: - REMOTE_STORAGE_AZURE_CONTAINER - REMOTE_STORAGE_AZURE_REGION - SLACK_UPCOMING_RELEASE_CHANNEL_ID + - DEV_AWS_OIDC_ROLE_ARN diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 0f4dac841e..6f80d6e431 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -56,6 +56,10 @@ concurrency: jobs: bench: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} + permissions: + contents: write + statuses: write + id-token: write # Required for OIDC authentication in azure runners strategy: fail-fast: false matrix: @@ -63,9 +67,13 @@ jobs: - DEFAULT_PG_VERSION: 16 PLATFORM: "neon-staging" region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} + RUNNER: [ self-hosted, us-east-2, x64 ] + IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned - DEFAULT_PG_VERSION: 16 PLATFORM: "azure-staging" region_id: 'azure-eastus2' + RUNNER: [ self-hosted, eastus2, x64 ] + IMAGE: neondatabase/build-tools:pinned env: TEST_PG_BENCH_DURATIONS_MATRIX: "300" TEST_PG_BENCH_SCALES_MATRIX: "10,100" @@ -76,14 +84,21 @@ jobs: SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} PLATFORM: ${{ matrix.PLATFORM }} - runs-on: [ self-hosted, us-east-2, x64 ] + runs-on: ${{ matrix.RUNNER }} container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned + image: ${{ matrix.IMAGE }} options: --init steps: - uses: actions/checkout@v4 + - name: Configure AWS credentials # necessary on Azure runners + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: 
eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours + - name: Download Neon artifact uses: ./.github/actions/download with: @@ -161,6 +176,7 @@ jobs: steps: - uses: actions/checkout@v4 + - name: Download Neon artifact uses: ./.github/actions/download with: @@ -237,6 +253,9 @@ jobs: id: pgbench-compare-matrix run: | region_id_default=${{ env.DEFAULT_REGION_ID }} + runner_default='["self-hosted", "us-east-2", "x64"]' + runner_azure='["self-hosted", "eastus2", "x64"]' + image_default="369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned" matrix='{ "pg_version" : [ 16 @@ -250,16 +269,19 @@ jobs: "neonvm-captest-new" ], "db_size": [ "10gb" ], - "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" }, - { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb" }, - { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" }, - { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb" }, - { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb" }, - { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }] + "runner": ['"$runner_default"'], + "image": [ "'"$image_default"'" ], + "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }] }' if [ "$(date +%A)" = "Saturday" ]; then - matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb"}]') + matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT @@ -302,6 +324,10 @@ jobs: pgbench-compare: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} needs: [ generate-matrices ] + permissions: + contents: write + statuses: write + id-token: write 
# Required for OIDC authentication in azure runners strategy: fail-fast: false @@ -317,9 +343,9 @@ jobs: SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} PLATFORM: ${{ matrix.platform }} - runs-on: [ self-hosted, us-east-2, x64 ] + runs-on: ${{ matrix.runner }} container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned + image: ${{ matrix.image }} options: --init # Increase timeout to 8h, default timeout is 6h @@ -328,6 +354,13 @@ jobs: steps: - uses: actions/checkout@v4 + - name: Configure AWS credentials # necessary on Azure runners + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours + - name: Download Neon artifact uses: ./.github/actions/download with: @@ -435,12 +468,20 @@ jobs: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} pgbench-pgvector: + permissions: + contents: write + statuses: write + id-token: write # Required for OIDC authentication in azure runners strategy: fail-fast: false matrix: include: - PLATFORM: "neonvm-captest-pgvector" + RUNNER: [ self-hosted, us-east-2, x64 ] + IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned - PLATFORM: "azure-captest-pgvector" + RUNNER: [ self-hosted, eastus2, x64 ] + IMAGE: neondatabase/build-tools:pinned env: TEST_PG_BENCH_DURATIONS_MATRIX: "15m" @@ -453,9 +494,9 @@ jobs: SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} PLATFORM: ${{ matrix.PLATFORM }} - runs-on: [ self-hosted, us-east-2, x64 ] + runs-on: ${{ matrix.RUNNER }} container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned + image: ${{ matrix.IMAGE }} options: --init steps: @@ -466,12 +507,12 @@ jobs: - name: Install postgresql-16 where pytest expects it run: | cd /home/nonroot - wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/libpq5_16.3-1.pgdg110%2B1_amd64.deb - wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.3-1.pgdg110%2B1_amd64.deb - wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.3-1.pgdg110%2B1_amd64.deb - dpkg -x libpq5_16.3-1.pgdg110+1_amd64.deb pg - dpkg -x postgresql-client-16_16.3-1.pgdg110+1_amd64.deb pg - dpkg -x postgresql-16_16.3-1.pgdg110+1_amd64.deb pg + wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/libpq5_16.4-1.pgdg110%2B1_amd64.deb + wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.4-1.pgdg110%2B1_amd64.deb + wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.4-1.pgdg110%2B1_amd64.deb + dpkg -x libpq5_16.4-1.pgdg110+1_amd64.deb pg + dpkg -x postgresql-client-16_16.4-1.pgdg110+1_amd64.deb pg + dpkg -x postgresql-16_16.4-1.pgdg110+1_amd64.deb pg mkdir -p /tmp/neon/pg_install/v16/bin ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql @@ -496,6 +537,13 @@ jobs: esac echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT + + - name: Configure AWS credentials # necessary on Azure runners to read/write from/to S3 + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours - name: Benchmark pgvector hnsw 
indexing uses: ./.github/actions/run-python-test-set @@ -524,7 +572,7 @@ jobs: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" - + - name: Create Allure report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate From 7e08fbd1b97f7f35b4ff4f40a42cf6e579e81c23 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 9 Aug 2024 09:09:29 +0100 Subject: [PATCH 1372/1571] Revert "proxy: update tokio-postgres to allow arbitrary config params (#8076)" (#8654) This reverts #8076 - which was already reverted from the release branch since forever (it would have been a breaking change to release for all users who currently set TimeZone options). It's causing conflicts now so we should revert it here as well. --- Cargo.lock | 8 +- libs/postgres_connection/src/lib.rs | 50 +++++----- proxy/src/compute.rs | 129 ++++++++++++-------------- proxy/src/serverless/backend.rs | 4 - proxy/src/serverless/sql_over_http.rs | 1 - test_runner/regress/test_proxy.py | 19 ---- 6 files changed, 92 insertions(+), 119 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f565119dbd..031fae0f37 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3960,7 +3960,7 @@ dependencies = [ [[package]] name = "postgres" version = "0.19.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "bytes", "fallible-iterator", @@ -3973,7 +3973,7 @@ dependencies = [ [[package]] name = "postgres-protocol" version = "0.6.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "base64 0.20.0", "byteorder", @@ -3992,7 +3992,7 @@ dependencies = [ [[package]] name = "postgres-types" version = "0.2.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "bytes", "fallible-iterator", @@ -6187,7 +6187,7 @@ dependencies = [ [[package]] name = "tokio-postgres" version = "0.7.7" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "async-trait", "byteorder", diff --git a/libs/postgres_connection/src/lib.rs b/libs/postgres_connection/src/lib.rs index fdabcbacb2..9f57f3d507 100644 --- a/libs/postgres_connection/src/lib.rs +++ b/libs/postgres_connection/src/lib.rs @@ -144,7 +144,20 @@ impl PgConnectionConfig { // implement and this function is hardly a bottleneck. The function is only called around // establishing a new connection. 
#[allow(unstable_name_collisions)] - config.options(&encode_options(&self.options)); + config.options( + &self + .options + .iter() + .map(|s| { + if s.contains(['\\', ' ']) { + Cow::Owned(s.replace('\\', "\\\\").replace(' ', "\\ ")) + } else { + Cow::Borrowed(s.as_str()) + } + }) + .intersperse(Cow::Borrowed(" ")) // TODO: use impl from std once it's stabilized + .collect::(), + ); } config } @@ -165,21 +178,6 @@ impl PgConnectionConfig { } } -#[allow(unstable_name_collisions)] -fn encode_options(options: &[String]) -> String { - options - .iter() - .map(|s| { - if s.contains(['\\', ' ']) { - Cow::Owned(s.replace('\\', "\\\\").replace(' ', "\\ ")) - } else { - Cow::Borrowed(s.as_str()) - } - }) - .intersperse(Cow::Borrowed(" ")) // TODO: use impl from std once it's stabilized - .collect::() -} - impl fmt::Display for PgConnectionConfig { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { // The password is intentionally hidden and not part of this display string. @@ -208,7 +206,7 @@ impl fmt::Debug for PgConnectionConfig { #[cfg(test)] mod tests_pg_connection_config { - use crate::{encode_options, PgConnectionConfig}; + use crate::PgConnectionConfig; use once_cell::sync::Lazy; use url::Host; @@ -257,12 +255,18 @@ mod tests_pg_connection_config { #[test] fn test_with_options() { - let options = encode_options(&[ - "hello".to_owned(), - "world".to_owned(), - "with space".to_owned(), - "and \\ backslashes".to_owned(), + let cfg = PgConnectionConfig::new_host_port(STUB_HOST.clone(), 123).extend_options([ + "hello", + "world", + "with space", + "and \\ backslashes", ]); - assert_eq!(options, "hello world with\\ space and\\ \\\\\\ backslashes"); + assert_eq!(cfg.host(), &*STUB_HOST); + assert_eq!(cfg.port(), 123); + assert_eq!(cfg.raw_address(), "stub.host.example:123"); + assert_eq!( + cfg.to_tokio_postgres_config().get_options(), + Some("hello world with\\ space and\\ \\\\\\ backslashes") + ); } } diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 21687160ea..18c82fe379 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -103,8 +103,12 @@ impl ConnCfg { /// Reuse password or auth keys from the other config. pub fn reuse_password(&mut self, other: Self) { - if let Some(password) = other.get_auth() { - self.auth(password); + if let Some(password) = other.get_password() { + self.password(password); + } + + if let Some(keys) = other.get_auth_keys() { + self.auth_keys(keys); } } @@ -120,64 +124,48 @@ impl ConnCfg { /// Apply startup message params to the connection config. pub fn set_startup_params(&mut self, params: &StartupMessageParams) { - let mut client_encoding = false; - for (k, v) in params.iter() { - match k { - "user" => { - // Only set `user` if it's not present in the config. - // Link auth flow takes username from the console's response. - if self.get_user().is_none() { - self.user(v); - } + // Only set `user` if it's not present in the config. + // Link auth flow takes username from the console's response. + if let (None, Some(user)) = (self.get_user(), params.get("user")) { + self.user(user); + } + + // Only set `dbname` if it's not present in the config. + // Link auth flow takes dbname from the console's response. + if let (None, Some(dbname)) = (self.get_dbname(), params.get("database")) { + self.dbname(dbname); + } + + // Don't add `options` if they were only used for specifying a project. + // Connection pools don't support `options`, because they affect backend startup. 
+ if let Some(options) = filtered_options(params) { + self.options(&options); + } + + if let Some(app_name) = params.get("application_name") { + self.application_name(app_name); + } + + // TODO: This is especially ugly... + if let Some(replication) = params.get("replication") { + use tokio_postgres::config::ReplicationMode; + match replication { + "true" | "on" | "yes" | "1" => { + self.replication_mode(ReplicationMode::Physical); } "database" => { - // Only set `dbname` if it's not present in the config. - // Link auth flow takes dbname from the console's response. - if self.get_dbname().is_none() { - self.dbname(v); - } - } - "options" => { - // Don't add `options` if they were only used for specifying a project. - // Connection pools don't support `options`, because they affect backend startup. - if let Some(options) = filtered_options(v) { - self.options(&options); - } - } - - // the special ones in tokio-postgres that we don't want being set by the user - "dbname" => {} - "password" => {} - "sslmode" => {} - "host" => {} - "port" => {} - "connect_timeout" => {} - "keepalives" => {} - "keepalives_idle" => {} - "keepalives_interval" => {} - "keepalives_retries" => {} - "target_session_attrs" => {} - "channel_binding" => {} - "max_backend_message_size" => {} - - "client_encoding" => { - client_encoding = true; - // only error should be from bad null bytes, - // but we've already checked for those. - _ = self.param("client_encoding", v); - } - - _ => { - // only error should be from bad null bytes, - // but we've already checked for those. - _ = self.param(k, v); + self.replication_mode(ReplicationMode::Logical); } + _other => {} } } - if !client_encoding { - // for compatibility since we removed it from tokio-postgres - self.param("client_encoding", "UTF8").unwrap(); - } + + // TODO: extend the list of the forwarded startup parameters. + // Currently, tokio-postgres doesn't allow us to pass + // arbitrary parameters, but the ones above are a good start. + // + // This and the reverse params problem can be better addressed + // in a bespoke connection machinery (a new library for that sake). } } @@ -350,9 +338,10 @@ impl ConnCfg { } /// Retrieve `options` from a startup message, dropping all proxy-secific flags. -fn filtered_options(options: &str) -> Option { +fn filtered_options(params: &StartupMessageParams) -> Option { #[allow(unstable_name_collisions)] - let options: String = StartupMessageParams::parse_options_raw(options) + let options: String = params + .options_raw()? .filter(|opt| parse_endpoint_param(opt).is_none() && neon_option(opt).is_none()) .intersperse(" ") // TODO: use impl from std once it's stabilized .collect(); @@ -424,23 +413,27 @@ mod tests { #[test] fn test_filtered_options() { // Empty options is unlikely to be useful anyway. - assert_eq!(filtered_options(""), None); + let params = StartupMessageParams::new([("options", "")]); + assert_eq!(filtered_options(¶ms), None); // It's likely that clients will only use options to specify endpoint/project. - let params = "project=foo"; - assert_eq!(filtered_options(params), None); + let params = StartupMessageParams::new([("options", "project=foo")]); + assert_eq!(filtered_options(¶ms), None); // Same, because unescaped whitespaces are no-op. 
- let params = " project=foo "; - assert_eq!(filtered_options(params), None); + let params = StartupMessageParams::new([("options", " project=foo ")]); + assert_eq!(filtered_options(¶ms).as_deref(), None); - let params = r"\ project=foo \ "; - assert_eq!(filtered_options(params).as_deref(), Some(r"\ \ ")); + let params = StartupMessageParams::new([("options", r"\ project=foo \ ")]); + assert_eq!(filtered_options(¶ms).as_deref(), Some(r"\ \ ")); - let params = "project = foo"; - assert_eq!(filtered_options(params).as_deref(), Some("project = foo")); + let params = StartupMessageParams::new([("options", "project = foo")]); + assert_eq!(filtered_options(¶ms).as_deref(), Some("project = foo")); - let params = "project = foo neon_endpoint_type:read_write neon_lsn:0/2"; - assert_eq!(filtered_options(params).as_deref(), Some("project = foo")); + let params = StartupMessageParams::new([( + "options", + "project = foo neon_endpoint_type:read_write neon_lsn:0/2", + )]); + assert_eq!(filtered_options(¶ms).as_deref(), Some("project = foo")); } } diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 80d46c67eb..295ea1a1c7 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -236,10 +236,6 @@ impl ConnectMechanism for TokioMechanism { .dbname(&self.conn_info.dbname) .connect_timeout(timeout); - config - .param("client_encoding", "UTF8") - .expect("client encoding UTF8 is always valid"); - let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let res = config.connect(tokio_postgres::NoTls).await; drop(pause); diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 77ec6b1c73..e5b6536328 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -203,7 +203,6 @@ fn get_conn_info( options = Some(NeonOptions::parse_options_raw(&value)); } } - ctx.set_db_options(params.freeze()); let user_info = ComputeUserInfo { endpoint, diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index 8ed44b1094..f446f4f200 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -53,25 +53,6 @@ def test_proxy_select_1(static_proxy: NeonProxy): assert out[0][0] == 42 -def test_proxy_server_params(static_proxy: NeonProxy): - """ - Test that server params are passing through to postgres - """ - - out = static_proxy.safe_psql( - "select to_json('0 seconds'::interval)", options="-c intervalstyle=iso_8601" - ) - assert out[0][0] == "PT0S" - out = static_proxy.safe_psql( - "select to_json('0 seconds'::interval)", options="-c intervalstyle=sql_standard" - ) - assert out[0][0] == "0" - out = static_proxy.safe_psql( - "select to_json('0 seconds'::interval)", options="-c intervalstyle=postgres" - ) - assert out[0][0] == "00:00:00" - - def test_password_hack(static_proxy: NeonProxy): """ Check the PasswordHack auth flow: an alternative to SCRAM auth for From a155914c1c8d46372cb5363f07c7bb89021becf4 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." Date: Fri, 9 Aug 2024 16:18:55 +0800 Subject: [PATCH 1373/1571] fix(neon): disable create tablespace stmt (#8657) part of https://github.com/neondatabase/neon/issues/8653 Disable create tablespace stmt. It turns out it requires much less effort to do the regress test mode flag than patching the test cases, and given that we might need to support tablespaces in the future, I decided to add a new flag `regress_test_mode` to change the behavior of create tablespace. 
Tested manually that without setting regress_test_mode, create tablespace will be rejected. --------- Signed-off-by: Alex Chi Z Co-authored-by: Heikki Linnakangas --- pgxn/neon/control_plane_connector.c | 21 +++++++++++++++++++++ test_runner/regress/test_pg_regress.py | 25 ++++++++++++++++++++++--- 2 files changed, 43 insertions(+), 3 deletions(-) diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c index 93252e6b29..de023da5c4 100644 --- a/pgxn/neon/control_plane_connector.c +++ b/pgxn/neon/control_plane_connector.c @@ -45,6 +45,7 @@ static const char *jwt_token = NULL; /* GUCs */ static char *ConsoleURL = NULL; static bool ForwardDDL = true; +static bool RegressTestMode = false; /* * CURL docs say that this buffer must exist until we call curl_easy_cleanup @@ -802,6 +803,14 @@ NeonProcessUtility( case T_DropRoleStmt: HandleDropRole(castNode(DropRoleStmt, parseTree)); break; + case T_CreateTableSpaceStmt: + if (!RegressTestMode) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("CREATE TABLESPACE is not supported on Neon"))); + } + break; default: break; } @@ -864,6 +873,18 @@ InitControlPlaneConnector() NULL, NULL); + DefineCustomBoolVariable( + "neon.regress_test_mode", + "Controls whether we are running in the regression test mode", + NULL, + &RegressTestMode, + false, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + jwt_token = getenv("NEON_CONTROL_PLANE_TOKEN"); if (!jwt_token) { diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 6f7ea0092a..45ce5b1c5b 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -144,7 +144,13 @@ def test_pg_regress( ) # Connect to postgres and create a database called "regression". - endpoint = env.endpoints.create_start("main") + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + # Enable the test mode, so that we don't need to patch the test cases. + "neon.regress_test_mode = true", + ], + ) endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") # Create some local directories for pg_regress to run in. @@ -207,7 +213,14 @@ def test_isolation( # Connect to postgres and create a database called "regression". # isolation tests use prepared transactions, so enable them - endpoint = env.endpoints.create_start("main", config_lines=["max_prepared_transactions=100"]) + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + "max_prepared_transactions=100", + # Enable the test mode, so that we don't need to patch the test cases. + "neon.regress_test_mode = true", + ], + ) endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") # Create some local directories for pg_isolation_regress to run in. @@ -268,7 +281,13 @@ def test_sql_regress( ) # Connect to postgres and create a database called "regression". - endpoint = env.endpoints.create_start("main") + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + # Enable the test mode, so that we don't need to patch the test cases. + "neon.regress_test_mode = true", + ], + ) endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") # Create some local directories for pg_regress to run in. From 201f56baf7c38f0626fc59a299ed6f164722a850 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 9 Aug 2024 12:05:43 +0100 Subject: [PATCH 1374/1571] CI(pin-build-tools-image): fix permissions for Azure login (#8671) ## Problem Azure login fails in `pin-build-tools-image` workflow because the job doesn't have the required permissions. 
``` Error: Please make sure to give write permissions to id-token in the workflow. Error: Login failed with Error: Error message: Unable to get ACTIONS_ID_TOKEN_REQUEST_URL env variable. Double check if the 'auth-type' is correct. Refer to https://github.com/Azure/login#readme for more information. ``` ## Summary of changes - Add `id-token: write` permission to `pin-build-tools-image` - Add an input to force image tagging - Unify pushing to Docker Hub with other registries - Split the job into two to have less if's --- .github/workflows/pin-build-tools-image.yml | 50 +++++++++++++-------- 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml index cf10910b0b..2e79498fc4 100644 --- a/.github/workflows/pin-build-tools-image.yml +++ b/.github/workflows/pin-build-tools-image.yml @@ -7,12 +7,20 @@ on: description: 'Source tag' required: true type: string + force: + description: 'Force the image to be pinned' + default: false + type: boolean workflow_call: inputs: from-tag: description: 'Source tag' required: true type: string + force: + description: 'Force the image to be pinned' + default: false + type: boolean defaults: run: @@ -22,15 +30,18 @@ concurrency: group: pin-build-tools-image-${{ inputs.from-tag }} cancel-in-progress: false +# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. permissions: {} -jobs: - tag-image: - runs-on: ubuntu-22.04 +env: + FROM_TAG: ${{ inputs.from-tag }} + TO_TAG: pinned - env: - FROM_TAG: ${{ inputs.from-tag }} - TO_TAG: pinned +jobs: + check-manifests: + runs-on: ubuntu-22.04 + outputs: + skip: ${{ steps.check-manifests.outputs.skip }} steps: - name: Check if we really need to pin the image @@ -47,27 +58,31 @@ jobs: echo "skip=${skip}" | tee -a $GITHUB_OUTPUT + tag-image: + needs: check-manifests + + # use format(..) 
to catch both inputs.force = true AND inputs.force = 'true' + if: needs.check-manifests.outputs.skip == 'false' || format('{0}', inputs.force) == 'true' + + runs-on: ubuntu-22.04 + + permissions: + id-token: write # for `azure/login` + + steps: - uses: docker/login-action@v3 - if: steps.check-manifests.outputs.skip == 'false' + with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub - if: steps.check-manifests.outputs.skip == 'false' - run: | - docker buildx imagetools create -t neondatabase/build-tools:${TO_TAG} \ - neondatabase/build-tools:${FROM_TAG} - - uses: docker/login-action@v3 - if: steps.check-manifests.outputs.skip == 'false' with: registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com username: ${{ secrets.AWS_ACCESS_KEY_DEV }} password: ${{ secrets.AWS_SECRET_KEY_DEV }} - name: Azure login - if: steps.check-manifests.outputs.skip == 'false' uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1 with: client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }} @@ -75,13 +90,12 @@ jobs: subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }} - name: Login to ACR - if: steps.check-manifests.outputs.skip == 'false' run: | az acr login --name=neoneastus2 - - name: Tag build-tools with `${{ env.TO_TAG }}` in ECR and ACR - if: steps.check-manifests.outputs.skip == 'false' + - name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub, ECR, and ACR run: | docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG} \ -t neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG} \ + -t neondatabase/build-tools:${TO_TAG} \ neondatabase/build-tools:${FROM_TAG} From e6770d79fd9c375ffa5385b1d276c3bd71f6618c Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 9 Aug 2024 14:01:56 +0100 Subject: [PATCH 1375/1571] pageserver: don't treat NotInitialized::Stopped as unexpected (#8675) ## Problem This type of error can happen during shutdown & was triggering a circuit breaker alert. ## Summary of changes - Map NotIntialized::Stopped to CompactionError::ShuttingDown, so that we may handle it cleanly --- pageserver/src/tenant/timeline.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 76dcb5645f..f810df5a56 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4412,11 +4412,11 @@ impl From for CompactionError { impl From for CompactionError { fn from(value: super::upload_queue::NotInitialized) -> Self { match value { - super::upload_queue::NotInitialized::Uninitialized - | super::upload_queue::NotInitialized::Stopped => { + super::upload_queue::NotInitialized::Uninitialized => { CompactionError::Other(anyhow::anyhow!(value)) } - super::upload_queue::NotInitialized::ShuttingDown => CompactionError::ShuttingDown, + super::upload_queue::NotInitialized::ShuttingDown + | super::upload_queue::NotInitialized::Stopped => CompactionError::ShuttingDown, } } } From f5cef7bf7f321421a459f51f2a07289e145bc159 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 9 Aug 2024 15:45:07 +0100 Subject: [PATCH 1376/1571] storcon: skip draining shard if it's secondary is lagging too much (#8644) ## Problem Migrations of tenant shards with cold secondaries are holding up drains in during production deployments. 
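For context on the fix summarized below: before cutting a shard over during a drain, the controller now spot-checks how far the secondary's downloads are behind its heatmap. A rough sketch of that decision in Python, using the same fields of the pageserver's `/v1/tenant/{tenant_id}/secondary/status` response that the new regression test reads (the helper name here is just for illustration; the real logic lives in the storage controller's `drain_node`/`secondary_lag`):

```python
DEFAULT_MAX_SECONDARY_LAG_BYTES = 256 * 1024 * 1024  # overridable via --max-secondary-lag-bytes


def secondary_ok_to_cut_over(status: dict, max_lag_bytes: int = DEFAULT_MAX_SECONDARY_LAG_BYTES) -> bool:
    """`status` is the secondary location's status: bytes_total, bytes_downloaded, heatmap_mtime."""
    if status.get("heatmap_mtime") is None:
        # The secondary has not seen a heatmap yet, so its lag is unknown:
        # leave the shard attached where it is and move on.
        return False
    lag = status["bytes_total"] - status["bytes_downloaded"]
    return lag <= max_lag_bytes
```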
## Summary of changes If a secondary locations is lagging by more than 256MiB (configurable, but that's the default), then skip cutting it over to the secondary as part of the node drain. --- control_plane/src/local_env.rs | 3 + control_plane/src/storage_controller.rs | 4 + pageserver/src/tenant/secondary/downloader.rs | 6 +- storage_controller/src/drain_utils.rs | 225 ++++++++++++++++++ storage_controller/src/lib.rs | 1 + storage_controller/src/main.rs | 6 + storage_controller/src/reconciler.rs | 74 +++++- storage_controller/src/service.rs | 222 ++++++++++++----- storage_controller/src/tenant_shard.rs | 4 +- test_runner/fixtures/neon_fixtures.py | 64 +++++ test_runner/fixtures/pageserver/http.py | 6 + .../test_storage_controller_scale.py | 48 +--- .../regress/test_storage_controller.py | 113 +++++++++ 13 files changed, 666 insertions(+), 110 deletions(-) create mode 100644 storage_controller/src/drain_utils.rs diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 505d157efd..15bbac702f 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -158,6 +158,8 @@ pub struct NeonStorageControllerConf { /// Threshold for auto-splitting a tenant into shards pub split_threshold: Option, + + pub max_secondary_lag_bytes: Option, } impl NeonStorageControllerConf { @@ -173,6 +175,7 @@ impl Default for NeonStorageControllerConf { max_offline: Self::DEFAULT_MAX_OFFLINE_INTERVAL, max_warming_up: Self::DEFAULT_MAX_WARMING_UP_INTERVAL, split_threshold: None, + max_secondary_lag_bytes: None, } } } diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index e054e9ee57..f180e922e8 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -383,6 +383,10 @@ impl StorageController { args.push(format!("--split-threshold={split_threshold}")) } + if let Some(lag) = self.config.max_secondary_lag_bytes.as_ref() { + args.push(format!("--max-secondary-lag-bytes={lag}")) + } + args.push(format!( "--neon-local-repo-dir={}", self.env.base_data_dir.display() diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 27439d4f03..135e73b57f 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -55,7 +55,7 @@ use tokio_util::sync::CancellationToken; use tracing::{info_span, instrument, warn, Instrument}; use utils::{ backoff, completion::Barrier, crashsafe::path_with_suffix_extension, failpoint_support, fs_ext, - id::TimelineId, serde_system_time, + id::TimelineId, pausable_failpoint, serde_system_time, }; use super::{ @@ -1146,12 +1146,14 @@ impl<'a> TenantDownloader<'a> { layer: HeatMapLayer, ctx: &RequestContext, ) -> Result, UpdateError> { - // Failpoint for simulating slow remote storage + // Failpoints for simulating slow remote storage failpoint_support::sleep_millis_async!( "secondary-layer-download-sleep", &self.secondary_state.cancel ); + pausable_failpoint!("secondary-layer-download-pausable"); + let local_path = local_layer_path( self.conf, tenant_shard_id, diff --git a/storage_controller/src/drain_utils.rs b/storage_controller/src/drain_utils.rs new file mode 100644 index 0000000000..dea1f04649 --- /dev/null +++ b/storage_controller/src/drain_utils.rs @@ -0,0 +1,225 @@ +use std::{ + collections::{BTreeMap, HashMap}, + sync::Arc, +}; + +use pageserver_api::controller_api::NodeSchedulingPolicy; +use utils::{id::NodeId, shard::TenantShardId}; + +use crate::{ + 
background_node_operations::OperationError, node::Node, scheduler::Scheduler, + tenant_shard::TenantShard, +}; + +pub(crate) struct TenantShardIterator { + tenants_accessor: F, + inspected_all_shards: bool, + last_inspected_shard: Option, +} + +/// A simple iterator which can be used in tandem with [`crate::service::Service`] +/// to iterate over all known tenant shard ids without holding the lock on the +/// service state at all times. +impl TenantShardIterator +where + F: Fn(Option) -> Option, +{ + pub(crate) fn new(tenants_accessor: F) -> Self { + Self { + tenants_accessor, + inspected_all_shards: false, + last_inspected_shard: None, + } + } + + /// Returns the next tenant shard id if one exists + pub(crate) fn next(&mut self) -> Option { + if self.inspected_all_shards { + return None; + } + + match (self.tenants_accessor)(self.last_inspected_shard) { + Some(tid) => { + self.last_inspected_shard = Some(tid); + Some(tid) + } + None => { + self.inspected_all_shards = true; + None + } + } + } + + /// Returns true when the end of the iterator is reached and false otherwise + pub(crate) fn finished(&self) -> bool { + self.inspected_all_shards + } +} + +/// Check that the state of the node being drained is as expected: +/// node is present in memory and scheduling policy is set to [`NodeSchedulingPolicy::Draining`] +pub(crate) fn validate_node_state( + node_id: &NodeId, + nodes: Arc>, +) -> Result<(), OperationError> { + let node = nodes.get(node_id).ok_or(OperationError::NodeStateChanged( + format!("node {} was removed", node_id).into(), + ))?; + + let current_policy = node.get_scheduling(); + if !matches!(current_policy, NodeSchedulingPolicy::Draining) { + // TODO(vlad): maybe cancel pending reconciles before erroring out. need to think + // about it + return Err(OperationError::NodeStateChanged( + format!("node {} changed state to {:?}", node_id, current_policy).into(), + )); + } + + Ok(()) +} + +/// Struct that houses a few utility methods for draining pageserver nodes +pub(crate) struct TenantShardDrain { + pub(crate) drained_node: NodeId, + pub(crate) tenant_shard_id: TenantShardId, +} + +impl TenantShardDrain { + /// Check if the tenant shard under question is eligible for drainining: + /// it's primary attachment is on the node being drained + pub(crate) fn tenant_shard_eligible_for_drain( + &self, + tenants: &BTreeMap, + scheduler: &Scheduler, + ) -> Option { + let tenant_shard = tenants.get(&self.tenant_shard_id)?; + + if *tenant_shard.intent.get_attached() != Some(self.drained_node) { + return None; + } + + match scheduler.node_preferred(tenant_shard.intent.get_secondary()) { + Some(node) => Some(node), + None => { + tracing::warn!( + tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), + "No eligible secondary while draining {}", self.drained_node + ); + + None + } + } + } + + /// Attempt to reschedule the tenant shard under question to one of its secondary locations + /// Returns an Err when the operation should be aborted and Ok(None) when the tenant shard + /// should be skipped. + pub(crate) fn reschedule_to_secondary<'a>( + &self, + destination: NodeId, + tenants: &'a mut BTreeMap, + scheduler: &mut Scheduler, + nodes: &Arc>, + ) -> Result, OperationError> { + let tenant_shard = match tenants.get_mut(&self.tenant_shard_id) { + Some(some) => some, + None => { + // Tenant shard was removed in the meantime. 
+ // Skip to the next one, but don't fail the overall operation + return Ok(None); + } + }; + + if !nodes.contains_key(&destination) { + return Err(OperationError::NodeStateChanged( + format!("node {} was removed", destination).into(), + )); + } + + if !tenant_shard.intent.get_secondary().contains(&destination) { + tracing::info!( + tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), + "Secondary moved away from {destination} during drain" + ); + + return Ok(None); + } + + match tenant_shard.reschedule_to_secondary(Some(destination), scheduler) { + Err(e) => { + tracing::warn!( + tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), + "Scheduling error when draining pageserver {} : {}", self.drained_node, e + ); + + Ok(None) + } + Ok(()) => { + let scheduled_to = tenant_shard.intent.get_attached(); + tracing::info!( + tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), + "Rescheduled shard while draining node {}: {} -> {:?}", + self.drained_node, + self.drained_node, + scheduled_to + ); + + Ok(Some(tenant_shard)) + } + } + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use utils::{ + id::TenantId, + shard::{ShardCount, ShardNumber, TenantShardId}, + }; + + use super::TenantShardIterator; + + #[test] + fn test_tenant_shard_iterator() { + let tenant_id = TenantId::generate(); + let shard_count = ShardCount(8); + + let mut tenant_shards = Vec::default(); + for i in 0..shard_count.0 { + tenant_shards.push(( + TenantShardId { + tenant_id, + shard_number: ShardNumber(i), + shard_count, + }, + (), + )) + } + + let tenant_shards = Arc::new(tenant_shards); + + let mut tid_iter = TenantShardIterator::new({ + let tenants = tenant_shards.clone(); + move |last_inspected_shard: Option| { + let entry = match last_inspected_shard { + Some(skip_past) => { + let mut cursor = tenants.iter().skip_while(|(tid, _)| *tid != skip_past); + cursor.nth(1) + } + None => tenants.first(), + }; + + entry.map(|(tid, _)| tid).copied() + } + }); + + let mut iterated_over = Vec::default(); + while let Some(tid) = tid_iter.next() { + iterated_over.push((tid, ())); + } + + assert_eq!(iterated_over, *tenant_shards); + } +} diff --git a/storage_controller/src/lib.rs b/storage_controller/src/lib.rs index 8caf638904..26c258c466 100644 --- a/storage_controller/src/lib.rs +++ b/storage_controller/src/lib.rs @@ -4,6 +4,7 @@ use utils::seqwait::MonotonicCounter; mod auth; mod background_node_operations; mod compute_hook; +mod drain_utils; mod heartbeater; pub mod http; mod id_lock_map; diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 2799f21fdc..a66e9128bc 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -92,6 +92,11 @@ struct Cli { /// Chaos testing #[arg(long)] chaos_interval: Option, + + // Maximum acceptable lag for the secondary location while draining + // a pageserver + #[arg(long)] + max_secondary_lag_bytes: Option, } enum StrictMode { @@ -279,6 +284,7 @@ async fn async_main() -> anyhow::Result<()> { .unwrap_or(RECONCILER_CONCURRENCY_DEFAULT), split_threshold: args.split_threshold, neon_local_repo_dir: args.neon_local_repo_dir, + max_secondary_lag_bytes: args.max_secondary_lag_bytes, }; // After loading secrets & config, but before starting anything else, apply database migrations diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 254fdb364e..94db879ade 100644 --- 
a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -39,6 +39,9 @@ pub(super) struct Reconciler { /// to detach this tenant shard. pub(crate) detach: Vec, + /// Configuration specific to this reconciler + pub(crate) reconciler_config: ReconcilerConfig, + pub(crate) config: TenantConfig, pub(crate) observed: ObservedState, @@ -73,6 +76,65 @@ pub(super) struct Reconciler { pub(crate) persistence: Arc, } +pub(crate) struct ReconcilerConfigBuilder { + config: ReconcilerConfig, +} + +impl ReconcilerConfigBuilder { + pub(crate) fn new() -> Self { + Self { + config: ReconcilerConfig::default(), + } + } + + pub(crate) fn secondary_warmup_timeout(self, value: Duration) -> Self { + Self { + config: ReconcilerConfig { + secondary_warmup_timeout: Some(value), + ..self.config + }, + } + } + + pub(crate) fn secondary_download_request_timeout(self, value: Duration) -> Self { + Self { + config: ReconcilerConfig { + secondary_download_request_timeout: Some(value), + ..self.config + }, + } + } + + pub(crate) fn build(self) -> ReconcilerConfig { + self.config + } +} + +#[derive(Default, Debug, Copy, Clone)] +pub(crate) struct ReconcilerConfig { + // During live migration give up on warming-up the secondary + // after this timeout. + secondary_warmup_timeout: Option, + + // During live migrations this is the amount of time that + // the pagserver will hold our poll. + secondary_download_request_timeout: Option, +} + +impl ReconcilerConfig { + pub(crate) fn get_secondary_warmup_timeout(&self) -> Duration { + const SECONDARY_WARMUP_TIMEOUT_DEFAULT: Duration = Duration::from_secs(300); + self.secondary_warmup_timeout + .unwrap_or(SECONDARY_WARMUP_TIMEOUT_DEFAULT) + } + + pub(crate) fn get_secondary_download_request_timeout(&self) -> Duration { + const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT_DEFAULT: Duration = Duration::from_secs(20); + self.secondary_download_request_timeout + .unwrap_or(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT_DEFAULT) + } +} + /// RAII resource units granted to a Reconciler, which it should keep alive until it finishes doing I/O pub(crate) struct ReconcileUnits { _sem_units: tokio::sync::OwnedSemaphorePermit, @@ -300,11 +362,13 @@ impl Reconciler { ) -> Result<(), ReconcileError> { // This is not the timeout for a request, but the total amount of time we're willing to wait // for a secondary location to get up to date before - const TOTAL_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(300); + let total_download_timeout = self.reconciler_config.get_secondary_warmup_timeout(); // This the long-polling interval for the secondary download requests we send to destination pageserver // during a migration. - const REQUEST_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(20); + let request_download_timeout = self + .reconciler_config + .get_secondary_download_request_timeout(); let started_at = Instant::now(); @@ -315,14 +379,14 @@ impl Reconciler { client .tenant_secondary_download( tenant_shard_id, - Some(REQUEST_DOWNLOAD_TIMEOUT), + Some(request_download_timeout), ) .await }, &self.service_config.jwt_token, 1, 3, - REQUEST_DOWNLOAD_TIMEOUT * 2, + request_download_timeout * 2, &self.cancel, ) .await @@ -350,7 +414,7 @@ impl Reconciler { return Ok(()); } else if status == StatusCode::ACCEPTED { let total_runtime = started_at.elapsed(); - if total_runtime > TOTAL_DOWNLOAD_TIMEOUT { + if total_runtime > total_download_timeout { tracing::warn!("Timed out after {}ms downloading layers to {node}. 
Progress so far: {}/{} layers, {}/{} bytes", total_runtime.as_millis(), progress.layers_downloaded, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index e391ce65e6..31b2d0c3f5 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -14,10 +14,11 @@ use crate::{ Drain, Fill, Operation, OperationError, OperationHandler, MAX_RECONCILES_PER_OPERATION, }, compute_hook::NotifyError, + drain_utils::{self, TenantShardDrain, TenantShardIterator}, id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard}, metrics::LeadershipStatusGroup, persistence::{AbortShardSplitStatus, MetadataHealthPersistence, TenantFilter}, - reconciler::{ReconcileError, ReconcileUnits}, + reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder}, scheduler::{MaySchedule, ScheduleContext, ScheduleMode}, tenant_shard::{ MigrateAttachment, ReconcileNeeded, ReconcilerStatus, ScheduleOptimization, @@ -325,6 +326,12 @@ pub struct Config { // TODO: make this cfg(feature = "testing") pub neon_local_repo_dir: Option, + + // Maximum acceptable download lag for the secondary location + // while draining a node. If the secondary location is lagging + // by more than the configured amount, then the secondary is not + // upgraded to primary. + pub max_secondary_lag_bytes: Option, } impl From for ApiError { @@ -5187,11 +5194,22 @@ impl Service { Ok(()) } - /// Wrap [`TenantShard`] reconciliation methods with acquisition of [`Gate`] and [`ReconcileUnits`], + /// Like [`Self::maybe_configured_reconcile_shard`], but uses the default reconciler + /// configuration fn maybe_reconcile_shard( &self, shard: &mut TenantShard, nodes: &Arc>, + ) -> Option { + self.maybe_configured_reconcile_shard(shard, nodes, ReconcilerConfig::default()) + } + + /// Wrap [`TenantShard`] reconciliation methods with acquisition of [`Gate`] and [`ReconcileUnits`], + fn maybe_configured_reconcile_shard( + &self, + shard: &mut TenantShard, + nodes: &Arc>, + reconciler_config: ReconcilerConfig, ) -> Option { let reconcile_needed = shard.get_reconcile_needed(nodes); @@ -5241,6 +5259,7 @@ impl Service { &self.result_tx, nodes, &self.compute_hook, + reconciler_config, &self.config, &self.persistence, units, @@ -5715,18 +5734,92 @@ impl Service { self.gate.close().await; } + /// Spot check the download lag for a secondary location of a shard. + /// Should be used as a heuristic, since it's not always precise: the + /// secondary might have not downloaded the new heat map yet and, hence, + /// is not aware of the lag. + /// + /// Returns: + /// * Ok(None) if the lag could not be determined from the status, + /// * Ok(Some(_)) if the lag could be determind + /// * Err on failures to query the pageserver. 
+ async fn secondary_lag( + &self, + secondary: &NodeId, + tenant_shard_id: TenantShardId, + ) -> Result, mgmt_api::Error> { + let nodes = self.inner.read().unwrap().nodes.clone(); + let node = nodes.get(secondary).ok_or(mgmt_api::Error::ApiError( + StatusCode::NOT_FOUND, + format!("Node with id {} not found", secondary), + ))?; + + match node + .with_client_retries( + |client| async move { client.tenant_secondary_status(tenant_shard_id).await }, + &self.config.jwt_token, + 1, + 3, + Duration::from_millis(250), + &self.cancel, + ) + .await + { + Some(Ok(status)) => match status.heatmap_mtime { + Some(_) => Ok(Some(status.bytes_total - status.bytes_downloaded)), + None => Ok(None), + }, + Some(Err(e)) => Err(e), + None => Err(mgmt_api::Error::Cancelled), + } + } + /// Drain a node by moving the shards attached to it as primaries. /// This is a long running operation and it should run as a separate Tokio task. pub(crate) async fn drain_node( - &self, + self: &Arc, node_id: NodeId, cancel: CancellationToken, ) -> Result<(), OperationError> { - let mut last_inspected_shard: Option = None; - let mut inspected_all_shards = false; + const MAX_SECONDARY_LAG_BYTES_DEFAULT: u64 = 256 * 1024 * 1024; + let max_secondary_lag_bytes = self + .config + .max_secondary_lag_bytes + .unwrap_or(MAX_SECONDARY_LAG_BYTES_DEFAULT); + + // By default, live migrations are generous about the wait time for getting + // the secondary location up to speed. When draining, give up earlier in order + // to not stall the operation when a cold secondary is encountered. + const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(20); + const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT: Duration = Duration::from_secs(5); + let reconciler_config = ReconcilerConfigBuilder::new() + .secondary_warmup_timeout(SECONDARY_WARMUP_TIMEOUT) + .secondary_download_request_timeout(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT) + .build(); + let mut waiters = Vec::new(); - while !inspected_all_shards { + let mut tid_iter = TenantShardIterator::new({ + let service = self.clone(); + move |last_inspected_shard: Option| { + let locked = &service.inner.read().unwrap(); + let tenants = &locked.tenants; + let entry = match last_inspected_shard { + Some(skip_past) => { + // Skip to the last seen tenant shard id + let mut cursor = tenants.iter().skip_while(|(tid, _)| **tid != skip_past); + + // Skip past the last seen + cursor.nth(1) + } + None => tenants.first_key_value(), + }; + + entry.map(|(tid, _)| tid).copied() + } + }); + + while !tid_iter.finished() { if cancel.is_cancelled() { match self .node_configure(node_id, None, Some(NodeSchedulingPolicy::Active)) @@ -5745,71 +5838,82 @@ impl Service { } } - { - let mut locked = self.inner.write().unwrap(); - let (nodes, tenants, scheduler) = locked.parts_mut(); + drain_utils::validate_node_state(&node_id, self.inner.read().unwrap().nodes.clone())?; - let node = nodes.get(&node_id).ok_or(OperationError::NodeStateChanged( - format!("node {node_id} was removed").into(), - ))?; - - let current_policy = node.get_scheduling(); - if !matches!(current_policy, NodeSchedulingPolicy::Draining) { - // TODO(vlad): maybe cancel pending reconciles before erroring out. 
need to think - // about it - return Err(OperationError::NodeStateChanged( - format!("node {node_id} changed state to {current_policy:?}").into(), - )); - } - - let mut cursor = tenants.iter_mut().skip_while({ - let skip_past = last_inspected_shard; - move |(tid, _)| match skip_past { - Some(last) => **tid != last, - None => false, + while waiters.len() < MAX_RECONCILES_PER_OPERATION { + let tid = match tid_iter.next() { + Some(tid) => tid, + None => { + break; } - }); + }; - while waiters.len() < MAX_RECONCILES_PER_OPERATION { - let (tid, tenant_shard) = match cursor.next() { - Some(some) => some, + let tid_drain = TenantShardDrain { + drained_node: node_id, + tenant_shard_id: tid, + }; + + let dest_node_id = { + let locked = self.inner.read().unwrap(); + + match tid_drain + .tenant_shard_eligible_for_drain(&locked.tenants, &locked.scheduler) + { + Some(node_id) => node_id, None => { - inspected_all_shards = true; - break; + continue; } - }; + } + }; - // If the shard is not attached to the node being drained, skip it. - if *tenant_shard.intent.get_attached() != Some(node_id) { - last_inspected_shard = Some(*tid); + match self.secondary_lag(&dest_node_id, tid).await { + Ok(Some(lag)) if lag <= max_secondary_lag_bytes => { + // The secondary is reasonably up to date. + // Migrate to it + } + Ok(Some(lag)) => { + tracing::info!( + tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(), + "Secondary on node {dest_node_id} is lagging by {lag}. Skipping reconcile." + ); continue; } + Ok(None) => { + tracing::info!( + tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(), + "Could not determine lag for secondary on node {dest_node_id}. Skipping reconcile." + ); + continue; + } + Err(err) => { + tracing::warn!( + tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(), + "Failed to get secondary lag from node {dest_node_id}. 
Skipping reconcile: {err}" + ); + continue; + } + } - match tenant_shard.reschedule_to_secondary(None, scheduler) { - Err(e) => { - tracing::warn!( - tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(), - "Scheduling error when draining pageserver {} : {e}", node_id - ); - } - Ok(()) => { - let scheduled_to = tenant_shard.intent.get_attached(); - tracing::info!( - tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(), - "Rescheduled shard while draining node {}: {} -> {:?}", - node_id, - node_id, - scheduled_to - ); + { + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + let rescheduled = tid_drain.reschedule_to_secondary( + dest_node_id, + tenants, + scheduler, + nodes, + )?; - let waiter = self.maybe_reconcile_shard(tenant_shard, nodes); - if let Some(some) = waiter { - waiters.push(some); - } + if let Some(tenant_shard) = rescheduled { + let waiter = self.maybe_configured_reconcile_shard( + tenant_shard, + nodes, + reconciler_config, + ); + if let Some(some) = waiter { + waiters.push(some); } } - - last_inspected_shard = Some(*tid); } } diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index e250f29f98..1fcc3c8547 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -7,7 +7,7 @@ use std::{ use crate::{ metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome}, persistence::TenantShardPersistence, - reconciler::ReconcileUnits, + reconciler::{ReconcileUnits, ReconcilerConfig}, scheduler::{AffinityScore, MaySchedule, RefCountUpdate, ScheduleContext}, service::ReconcileResultRequest, }; @@ -1063,6 +1063,7 @@ impl TenantShard { result_tx: &tokio::sync::mpsc::UnboundedSender, pageservers: &Arc>, compute_hook: &Arc, + reconciler_config: ReconcilerConfig, service_config: &service::Config, persistence: &Arc, units: ReconcileUnits, @@ -1101,6 +1102,7 @@ impl TenantShard { generation: self.generation, intent: reconciler_intent, detach, + reconciler_config, config: self.config.clone(), observed: self.observed.clone(), compute_hook: compute_hook.clone(), diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index c6f4404784..844a23d327 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -14,6 +14,7 @@ import textwrap import threading import time import uuid +from collections import defaultdict from contextlib import closing, contextmanager from dataclasses import dataclass from datetime import datetime @@ -2667,6 +2668,69 @@ class NeonStorageController(MetricsGetter, LogUtils): log.info(f"Got failpoints request response code {res.status_code}") res.raise_for_status() + def get_tenants_placement(self) -> defaultdict[str, Dict[str, Any]]: + """ + Get the intent and observed placements of all tenants known to the storage controller. 
+ """ + tenants = self.tenant_list() + + tenant_placement: defaultdict[str, Dict[str, Any]] = defaultdict( + lambda: { + "observed": {"attached": None, "secondary": []}, + "intent": {"attached": None, "secondary": []}, + } + ) + + for t in tenants: + for node_id, loc_state in t["observed"]["locations"].items(): + if ( + loc_state is not None + and "conf" in loc_state + and loc_state["conf"] is not None + and loc_state["conf"]["mode"] + in set(["AttachedSingle", "AttachedMulti", "AttachedStale"]) + ): + tenant_placement[t["tenant_shard_id"]]["observed"]["attached"] = int(node_id) + + if ( + loc_state is not None + and "conf" in loc_state + and loc_state["conf"] is not None + and loc_state["conf"]["mode"] == "Secondary" + ): + tenant_placement[t["tenant_shard_id"]]["observed"]["secondary"].append( + int(node_id) + ) + + if "attached" in t["intent"]: + tenant_placement[t["tenant_shard_id"]]["intent"]["attached"] = t["intent"][ + "attached" + ] + + if "secondary" in t["intent"]: + tenant_placement[t["tenant_shard_id"]]["intent"]["secondary"] += t["intent"][ + "secondary" + ] + + return tenant_placement + + def warm_up_all_secondaries(self): + log.info("Warming up all secondary locations") + + tenant_placement = self.get_tenants_placement() + for tid, placement in tenant_placement.items(): + assert placement["observed"]["attached"] is not None + primary_id = placement["observed"]["attached"] + + assert len(placement["observed"]["secondary"]) == 1 + secondary_id = placement["observed"]["secondary"][0] + + parsed_tid = TenantShardId.parse(tid) + self.env.get_pageserver(primary_id).http_client().tenant_heatmap_upload(parsed_tid) + self.env.get_pageserver(secondary_id).http_client().tenant_secondary_download( + parsed_tid, wait_ms=250 + ) + @property def workdir(self) -> Path: return self.env.repo_dir diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 65d6ff5d62..cd4261f1b8 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -361,6 +361,12 @@ class PageserverHttpClient(requests.Session, MetricsGetter): self.verbose_error(res) return (res.status_code, res.json()) + def tenant_secondary_status(self, tenant_id: Union[TenantId, TenantShardId]): + url = f"http://localhost:{self.port}/v1/tenant/{tenant_id}/secondary/status" + res = self.get(url) + self.verbose_error(res) + return res.json() + def set_tenant_config(self, tenant_id: Union[TenantId, TenantShardId], config: dict[str, Any]): assert "tenant_id" not in config.keys() res = self.put( diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index 04785f7184..297aedfbed 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -2,7 +2,6 @@ import concurrent.futures import random import time from collections import defaultdict -from typing import Any, Dict import pytest from fixtures.common_types import TenantId, TenantShardId, TimelineId @@ -24,51 +23,14 @@ def get_consistent_node_shard_counts(env: NeonEnv, total_shards) -> defaultdict[ This function takes into account the intersection of the intent and the observed state. If they do not match, it asserts out. 
""" - tenants = env.storage_controller.tenant_list() - - intent = dict() - observed = dict() - - tenant_placement: defaultdict[str, Dict[str, Any]] = defaultdict( - lambda: { - "observed": {"attached": None, "secondary": []}, - "intent": {"attached": None, "secondary": []}, - } - ) - - for t in tenants: - for node_id, loc_state in t["observed"]["locations"].items(): - if ( - loc_state is not None - and "conf" in loc_state - and loc_state["conf"] is not None - and loc_state["conf"]["mode"] - in set(["AttachedSingle", "AttachedMulti", "AttachedStale"]) - ): - observed[t["tenant_shard_id"]] = int(node_id) - tenant_placement[t["tenant_shard_id"]]["observed"]["attached"] = int(node_id) - - if ( - loc_state is not None - and "conf" in loc_state - and loc_state["conf"] is not None - and loc_state["conf"]["mode"] == "Secondary" - ): - tenant_placement[t["tenant_shard_id"]]["observed"]["secondary"].append(int(node_id)) - - if "attached" in t["intent"]: - intent[t["tenant_shard_id"]] = t["intent"]["attached"] - tenant_placement[t["tenant_shard_id"]]["intent"]["attached"] = t["intent"]["attached"] - - if "secondary" in t["intent"]: - tenant_placement[t["tenant_shard_id"]]["intent"]["secondary"] += t["intent"][ - "secondary" - ] - + tenant_placement = env.storage_controller.get_tenants_placement() log.info(f"{tenant_placement=}") matching = { - tid: intent[tid] for tid in observed if tid in intent and intent[tid] == observed[tid] + tid: tenant_placement[tid]["intent"]["attached"] + for tid in tenant_placement + if tenant_placement[tid]["intent"]["attached"] + == tenant_placement[tid]["observed"]["attached"] } assert len(matching) == total_shards diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index eb2cdccdb9..9b2557a165 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -17,6 +17,7 @@ from fixtures.neon_fixtures import ( PgBin, StorageControllerApiException, TokenScope, + last_flush_lsn_upload, ) from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import ( @@ -1597,6 +1598,8 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): # Perform a graceful rolling restart for ps in env.pageservers: + env.storage_controller.warm_up_all_secondaries() + env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_drain(ps_id), ps.id, max_attempts=3, backoff=2 ) @@ -1645,6 +1648,115 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): assert_shard_counts_balanced(env, shard_counts, total_shards) +def test_skip_drain_on_secondary_lag(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): + """ + Artificially make a tenant shard's secondary location lag behind the primary + and check that storage controller driven node drains skip the lagging tenant shard. + Finally, validate that the tenant shard is migrated when a new drain request comes + in and it's no longer lagging. + """ + neon_env_builder.num_pageservers = 2 + neon_env_builder.storage_controller_config = { + "max_secondary_lag_bytes": 1 * 1024 * 1024, + } + + env = neon_env_builder.init_configs() + env.start() + + tid, timeline_id = env.neon_cli.create_tenant(placement_policy='{"Attached":1}') + + # Give things a chance to settle. 
+ env.storage_controller.reconcile_until_idle(timeout_secs=30) + + locations = env.storage_controller.locate(tid) + assert len(locations) == 1 + primary: int = locations[0]["node_id"] + not_primary = [ps.id for ps in env.pageservers if ps.id != primary] + assert len(not_primary) == 1 + secondary = not_primary[0] + + log.info(f"Paused secondary downloads on {secondary}") + env.get_pageserver(secondary).http_client().configure_failpoints( + ("secondary-layer-download-pausable", "pause") + ) + + log.info(f"Ingesting some data for {tid}") + + with env.endpoints.create_start("main", tenant_id=tid) as endpoint: + run_pg_bench_small(pg_bin, endpoint.connstr()) + endpoint.safe_psql("CREATE TABLE created_foo(id integer);") + last_flush_lsn_upload(env, endpoint, tid, timeline_id) + + log.info(f"Uploading heatmap from {primary} and requesting download from {secondary}") + + env.get_pageserver(primary).http_client().tenant_heatmap_upload(tid) + env.get_pageserver(secondary).http_client().tenant_secondary_download(tid, wait_ms=100) + + def secondary_is_lagging(): + resp = env.get_pageserver(secondary).http_client().tenant_secondary_status(tid) + lag = resp["bytes_total"] - resp["bytes_downloaded"] + + if lag <= 1 * 1024 * 1024: + raise Exception(f"Secondary lag not big enough: {lag}") + + log.info(f"Looking for lag to develop on the secondary {secondary}") + wait_until(10, 1, secondary_is_lagging) + + log.info(f"Starting drain of primary {primary} with laggy secondary {secondary}") + env.storage_controller.retryable_node_operation( + lambda ps_id: env.storage_controller.node_drain(ps_id), primary, max_attempts=3, backoff=2 + ) + + env.storage_controller.poll_node_status( + primary, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.PAUSE_FOR_RESTART, + max_attempts=6, + backoff=5, + ) + + locations = env.storage_controller.locate(tid) + assert len(locations) == 1 + assert locations[0]["node_id"] == primary + + log.info(f"Unpausing secondary downloads on {secondary}") + env.get_pageserver(secondary).http_client().configure_failpoints( + ("secondary-layer-download-pausable", "off") + ) + env.get_pageserver(secondary).http_client().tenant_secondary_download(tid, wait_ms=100) + + log.info(f"Waiting for lag to reduce on {secondary}") + + def lag_is_acceptable(): + resp = env.get_pageserver(secondary).http_client().tenant_secondary_status(tid) + lag = resp["bytes_total"] - resp["bytes_downloaded"] + + if lag > 1 * 1024 * 1024: + raise Exception(f"Secondary lag not big enough: {lag}") + + wait_until(10, 1, lag_is_acceptable) + + env.storage_controller.node_configure(primary, {"scheduling": "Active"}) + + log.info(f"Starting drain of primary {primary} with non-laggy secondary {secondary}") + + env.storage_controller.retryable_node_operation( + lambda ps_id: env.storage_controller.node_drain(ps_id), primary, max_attempts=3, backoff=2 + ) + + env.storage_controller.poll_node_status( + primary, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.PAUSE_FOR_RESTART, + max_attempts=6, + backoff=5, + ) + + locations = env.storage_controller.locate(tid) + assert len(locations) == 1 + assert locations[0]["node_id"] == secondary + + def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_pageservers = 2 env = neon_env_builder.init_configs() @@ -1671,6 +1783,7 @@ def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder): ps_id_to_drain = env.pageservers[0].id + env.storage_controller.warm_up_all_secondaries() 
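+    # The warm-up above matters because the drain below skips any shard whose
+    # secondary is lagging by more than max_secondary_lag_bytes.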
env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_drain(ps_id), ps_id_to_drain, From 4a53cd0fc31d3f1782d3c002d67f1ecda1fc6fc5 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 9 Aug 2024 15:48:16 +0100 Subject: [PATCH 1377/1571] Dockerfiles: remove cachepot (#8666) ## Problem We install and try to use `cachepot`. But it is not configured correctly and doesn't work (after https://github.com/neondatabase/neon/pull/2290) ## Summary of changes - Remove `cachepot` --- .github/workflows/neon_extra_builds.yml | 2 -- Dockerfile | 15 +-------------- Dockerfile.build-tools | 2 -- libs/utils/src/lib.rs | 2 +- 4 files changed, 2 insertions(+), 19 deletions(-) diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index d4870e16ad..2ee66cfdc1 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -149,8 +149,6 @@ jobs: env: BUILD_TYPE: release - # remove the cachepot wrapper and build without crate caches - RUSTC_WRAPPER: "" # build with incremental compilation produce partial results # so do not attempt to cache this build, also disable the incremental compilation CARGO_INCREMENTAL: 0 diff --git a/Dockerfile b/Dockerfile index 6ed57a84a3..ceb1c7cb55 100644 --- a/Dockerfile +++ b/Dockerfile @@ -29,24 +29,12 @@ WORKDIR /home/nonroot ARG GIT_VERSION=local ARG BUILD_TAG -# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds. -# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations. -# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build -ARG RUSTC_WRAPPER=cachepot -ENV AWS_REGION=eu-central-1 -ENV CACHEPOT_S3_KEY_PREFIX=cachepot -ARG CACHEPOT_BUCKET=neon-github-dev -#ARG AWS_ACCESS_KEY_ID -#ARG AWS_SECRET_ACCESS_KEY - COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v16/lib pg_install/v16/lib COPY --chown=nonroot . . -# Show build caching stats to check if it was used in the end. -# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats. RUN set -e \ && PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \ --bin pg_sni_router \ @@ -58,8 +46,7 @@ RUN set -e \ --bin proxy \ --bin neon_local \ --bin storage_scrubber \ - --locked --release \ - && cachepot -s + --locked --release # Build final image # diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index d39d36e1b6..d6beb61369 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -208,7 +208,6 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux . 
"$HOME/.cargo/env" && \ cargo --version && rustup --version && \ rustup component add llvm-tools-preview rustfmt clippy && \ - cargo install --git https://github.com/paritytech/cachepot && \ cargo install rustfilt --version ${RUSTFILT_VERSION} && \ cargo install cargo-hakari --version ${CARGO_HAKARI_VERSION} && \ cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \ @@ -216,7 +215,6 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux cargo install cargo-nextest --version ${CARGO_NEXTEST_VERSION} && \ rm -rf /home/nonroot/.cargo/registry && \ rm -rf /home/nonroot/.cargo/git -ENV RUSTC_WRAPPER=cachepot # Show versions RUN whoami \ diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index a46d68ef33..f4fc0ba57b 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -128,7 +128,7 @@ pub mod circuit_breaker; /// /// ############################################################################################# /// TODO this macro is not the way the library is intended to be used, see for details. -/// We use `cachepot` to reduce our current CI build times: +/// We used `cachepot` to reduce our current CI build times: /// Yet, it seems to ignore the GIT_VERSION env variable, passed to Docker build, even with build.rs that contains /// `println!("cargo:rerun-if-env-changed=GIT_VERSION");` code for cachepot cache invalidation. /// The problem needs further investigation and regular `const` declaration instead of a macro. From 401dcd3551ced013ca4782e0ae55816eafeee00f Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 9 Aug 2024 18:30:15 +0100 Subject: [PATCH 1378/1571] Update docs/SUMMARY.md (#8665) ## Problem This page had many dead links, and was confusing for folks looking for documentation about our product. Closes: https://github.com/neondatabase/neon/issues/8535 ## Summary of changes - Add a link to the product docs up top - Remove dead/placeholder links --- docs/SUMMARY.md | 59 ++++++++++--------------------------------------- 1 file changed, 12 insertions(+), 47 deletions(-) diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md index b275349168..5fd4080c28 100644 --- a/docs/SUMMARY.md +++ b/docs/SUMMARY.md @@ -1,13 +1,18 @@ # Summary +# Looking for `neon.tech` docs? + +This page linkes to a selection of technical content about the open source code in this repository. + +Please visit https://neon.tech/docs for documentation about using the Neon service, which is based on the code +in this repository. + +# Architecture + [Introduction]() - [Separation of Compute and Storage](./separation-compute-storage.md) -# Architecture - - [Compute]() - - [WAL proposer]() - - [WAL Backpressure]() - [Postgres changes](./core_changes.md) - [Pageserver](./pageserver.md) @@ -16,33 +21,15 @@ - [WAL Redo](./pageserver-walredo.md) - [Page cache](./pageserver-pagecache.md) - [Storage](./pageserver-storage.md) - - [Datadir mapping]() - - [Layer files]() - - [Branching]() - - [Garbage collection]() - - [Cloud Storage]() - [Processing a GetPage request](./pageserver-processing-getpage.md) - [Processing WAL](./pageserver-processing-wal.md) - - [Management API]() - - [Tenant Rebalancing]() - [WAL Service](walservice.md) - [Consensus protocol](safekeeper-protocol.md) - - [Management API]() - - [Rebalancing]() - -- [Control Plane]() - -- [Proxy]() - [Source view](./sourcetree.md) - [docker.md](./docker.md) — Docker images and building pipeline. 
- [Error handling and logging](./error-handling.md) - - [Testing]() - - [Unit testing]() - - [Integration testing]() - - [Benchmarks]() - - [Glossary](./glossary.md) @@ -58,28 +45,6 @@ # RFCs -- [RFCs](./rfcs/README.md) - -- [002-storage](rfcs/002-storage.md) -- [003-laptop-cli](rfcs/003-laptop-cli.md) -- [004-durability](rfcs/004-durability.md) -- [005-zenith_local](rfcs/005-zenith_local.md) -- [006-laptop-cli-v2-CLI](rfcs/006-laptop-cli-v2-CLI.md) -- [006-laptop-cli-v2-repository-structure](rfcs/006-laptop-cli-v2-repository-structure.md) -- [007-serverless-on-laptop](rfcs/007-serverless-on-laptop.md) -- [008-push-pull](rfcs/008-push-pull.md) -- [009-snapshot-first-storage-cli](rfcs/009-snapshot-first-storage-cli.md) -- [009-snapshot-first-storage](rfcs/009-snapshot-first-storage.md) -- [009-snapshot-first-storage-pitr](rfcs/009-snapshot-first-storage-pitr.md) -- [010-storage_details](rfcs/010-storage_details.md) -- [011-retention-policy](rfcs/011-retention-policy.md) -- [012-background-tasks](rfcs/012-background-tasks.md) -- [013-term-history](rfcs/013-term-history.md) -- [014-safekeepers-gossip](rfcs/014-safekeepers-gossip.md) -- [014-storage-lsm](rfcs/014-storage-lsm.md) -- [015-storage-messaging](rfcs/015-storage-messaging.md) -- [016-connection-routing](rfcs/016-connection-routing.md) -- [017-timeline-data-management](rfcs/017-timeline-data-management.md) -- [018-storage-messaging-2](rfcs/018-storage-messaging-2.md) -- [019-tenant-timeline-lifecycles](rfcs/019-tenant-timeline-lifecycles.md) -- [cluster-size-limits](rfcs/cluster-size-limits.md) +Major changes are documented in RFCS: +- See [RFCs](./rfcs/README.md) for more information +- view the RFCs at https://github.com/neondatabase/neon/tree/main/docs/rfcs From 507f1a5bdd4a168e589550e7c1bb5ac6de41643f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Sat, 10 Aug 2024 14:04:47 +0200 Subject: [PATCH 1379/1571] Also pass HOME env var in access_env_vars (#8685) Noticed this while debugging a test failure in #8673 which only occurs with real S3 instead of mock S3: if you authenticate to S3 via `AWS_PROFILE`, then it requires the `HOME` env var to be set so that it can read inside the `~/.aws` directory. The scrubber abstraction `StorageScrubber::scrubber_cli` in `neon_fixtures.py` would otherwise not work. My earlier PR #6556 has done similar things for the `neon_local` wrapper. You can try: ``` aws sso login --profile dev export ENABLE_REAL_S3_REMOTE_STORAGE=y REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests REMOTE_STORAGE_S3_REGION=eu-central-1 AWS_PROFILE=dev RUST_BACKTRACE=1 BUILD_TYPE=debug DEFAULT_PG_VERSION=16 ./scripts/pytest -vv --tb=short -k test_scrubber_tenant_snapshot ``` before and after this patch: this patch fixes it. 
--- test_runner/fixtures/remote_storage.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index 0f2a997b1e..1b6c3c23ba 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -177,9 +177,14 @@ class S3Storage: def access_env_vars(self) -> Dict[str, str]: if self.aws_profile is not None: - return { + env = { "AWS_PROFILE": self.aws_profile, } + # Pass through HOME env var because AWS_PROFILE needs it in order to work + home = os.getenv("HOME") + if home is not None: + env["HOME"] = home + return env if self.access_key is not None and self.secret_key is not None: return { "AWS_ACCESS_KEY_ID": self.access_key, From f7a3380aeccdffbb7371c33d5f2391f28426b6c8 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 11 Aug 2024 12:21:32 +0100 Subject: [PATCH 1380/1571] chore(deps): bump aiohttp from 3.9.4 to 3.10.2 (#8684) --- poetry.lock | 170 ++++++++++++++++++++++++++----------------------- pyproject.toml | 2 +- 2 files changed, 92 insertions(+), 80 deletions(-) diff --git a/poetry.lock b/poetry.lock index 9026824558..7db91e51f7 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,91 +1,103 @@ # This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +[[package]] +name = "aiohappyeyeballs" +version = "2.3.5" +description = "Happy Eyeballs for asyncio" +optional = false +python-versions = ">=3.8" +files = [ + {file = "aiohappyeyeballs-2.3.5-py3-none-any.whl", hash = "sha256:4d6dea59215537dbc746e93e779caea8178c866856a721c9c660d7a5a7b8be03"}, + {file = "aiohappyeyeballs-2.3.5.tar.gz", hash = "sha256:6fa48b9f1317254f122a07a131a86b71ca6946ca989ce6326fff54a99a920105"}, +] + [[package]] name = "aiohttp" -version = "3.9.4" +version = "3.10.2" description = "Async http client/server framework (asyncio)" optional = false python-versions = ">=3.8" files = [ - {file = "aiohttp-3.9.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:76d32588ef7e4a3f3adff1956a0ba96faabbdee58f2407c122dd45aa6e34f372"}, - {file = "aiohttp-3.9.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:56181093c10dbc6ceb8a29dfeea1e815e1dfdc020169203d87fd8d37616f73f9"}, - {file = "aiohttp-3.9.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c7a5b676d3c65e88b3aca41816bf72831898fcd73f0cbb2680e9d88e819d1e4d"}, - {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d1df528a85fb404899d4207a8d9934cfd6be626e30e5d3a5544a83dbae6d8a7e"}, - {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f595db1bceabd71c82e92df212dd9525a8a2c6947d39e3c994c4f27d2fe15b11"}, - {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c0b09d76e5a4caac3d27752027fbd43dc987b95f3748fad2b924a03fe8632ad"}, - {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:689eb4356649ec9535b3686200b231876fb4cab4aca54e3bece71d37f50c1d13"}, - {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a3666cf4182efdb44d73602379a66f5fdfd5da0db5e4520f0ac0dcca644a3497"}, - {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b65b0f8747b013570eea2f75726046fa54fa8e0c5db60f3b98dd5d161052004a"}, - {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_i686.whl", hash = 
"sha256:a1885d2470955f70dfdd33a02e1749613c5a9c5ab855f6db38e0b9389453dce7"}, - {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:0593822dcdb9483d41f12041ff7c90d4d1033ec0e880bcfaf102919b715f47f1"}, - {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:47f6eb74e1ecb5e19a78f4a4228aa24df7fbab3b62d4a625d3f41194a08bd54f"}, - {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c8b04a3dbd54de6ccb7604242fe3ad67f2f3ca558f2d33fe19d4b08d90701a89"}, - {file = "aiohttp-3.9.4-cp310-cp310-win32.whl", hash = "sha256:8a78dfb198a328bfb38e4308ca8167028920fb747ddcf086ce706fbdd23b2926"}, - {file = "aiohttp-3.9.4-cp310-cp310-win_amd64.whl", hash = "sha256:e78da6b55275987cbc89141a1d8e75f5070e577c482dd48bd9123a76a96f0bbb"}, - {file = "aiohttp-3.9.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c111b3c69060d2bafc446917534150fd049e7aedd6cbf21ba526a5a97b4402a5"}, - {file = "aiohttp-3.9.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:efbdd51872cf170093998c87ccdf3cb5993add3559341a8e5708bcb311934c94"}, - {file = "aiohttp-3.9.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7bfdb41dc6e85d8535b00d73947548a748e9534e8e4fddd2638109ff3fb081df"}, - {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2bd9d334412961125e9f68d5b73c1d0ab9ea3f74a58a475e6b119f5293eee7ba"}, - {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:35d78076736f4a668d57ade00c65d30a8ce28719d8a42471b2a06ccd1a2e3063"}, - {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:824dff4f9f4d0f59d0fa3577932ee9a20e09edec8a2f813e1d6b9f89ced8293f"}, - {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:52b8b4e06fc15519019e128abedaeb56412b106ab88b3c452188ca47a25c4093"}, - {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eae569fb1e7559d4f3919965617bb39f9e753967fae55ce13454bec2d1c54f09"}, - {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:69b97aa5792428f321f72aeb2f118e56893371f27e0b7d05750bcad06fc42ca1"}, - {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4d79aad0ad4b980663316f26d9a492e8fab2af77c69c0f33780a56843ad2f89e"}, - {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:d6577140cd7db19e430661e4b2653680194ea8c22c994bc65b7a19d8ec834403"}, - {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:9860d455847cd98eb67897f5957b7cd69fbcb436dd3f06099230f16a66e66f79"}, - {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:69ff36d3f8f5652994e08bd22f093e11cfd0444cea310f92e01b45a4e46b624e"}, - {file = "aiohttp-3.9.4-cp311-cp311-win32.whl", hash = "sha256:e27d3b5ed2c2013bce66ad67ee57cbf614288bda8cdf426c8d8fe548316f1b5f"}, - {file = "aiohttp-3.9.4-cp311-cp311-win_amd64.whl", hash = "sha256:d6a67e26daa686a6fbdb600a9af8619c80a332556245fa8e86c747d226ab1a1e"}, - {file = "aiohttp-3.9.4-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:c5ff8ff44825736a4065d8544b43b43ee4c6dd1530f3a08e6c0578a813b0aa35"}, - {file = "aiohttp-3.9.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d12a244627eba4e9dc52cbf924edef905ddd6cafc6513849b4876076a6f38b0e"}, - {file = "aiohttp-3.9.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:dcad56c8d8348e7e468899d2fb3b309b9bc59d94e6db08710555f7436156097f"}, - {file = 
"aiohttp-3.9.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f7e69a7fd4b5ce419238388e55abd220336bd32212c673ceabc57ccf3d05b55"}, - {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4870cb049f10d7680c239b55428916d84158798eb8f353e74fa2c98980dcc0b"}, - {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b2feaf1b7031ede1bc0880cec4b0776fd347259a723d625357bb4b82f62687b"}, - {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:939393e8c3f0a5bcd33ef7ace67680c318dc2ae406f15e381c0054dd658397de"}, - {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d2334e387b2adcc944680bebcf412743f2caf4eeebd550f67249c1c3696be04"}, - {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e0198ea897680e480845ec0ffc5a14e8b694e25b3f104f63676d55bf76a82f1a"}, - {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:e40d2cd22914d67c84824045861a5bb0fb46586b15dfe4f046c7495bf08306b2"}, - {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:aba80e77c227f4234aa34a5ff2b6ff30c5d6a827a91d22ff6b999de9175d71bd"}, - {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:fb68dc73bc8ac322d2e392a59a9e396c4f35cb6fdbdd749e139d1d6c985f2527"}, - {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:f3460a92638dce7e47062cf088d6e7663adb135e936cb117be88d5e6c48c9d53"}, - {file = "aiohttp-3.9.4-cp312-cp312-win32.whl", hash = "sha256:32dc814ddbb254f6170bca198fe307920f6c1308a5492f049f7f63554b88ef36"}, - {file = "aiohttp-3.9.4-cp312-cp312-win_amd64.whl", hash = "sha256:63f41a909d182d2b78fe3abef557fcc14da50c7852f70ae3be60e83ff64edba5"}, - {file = "aiohttp-3.9.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:c3770365675f6be220032f6609a8fbad994d6dcf3ef7dbcf295c7ee70884c9af"}, - {file = "aiohttp-3.9.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:305edae1dea368ce09bcb858cf5a63a064f3bff4767dec6fa60a0cc0e805a1d3"}, - {file = "aiohttp-3.9.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6f121900131d116e4a93b55ab0d12ad72573f967b100e49086e496a9b24523ea"}, - {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b71e614c1ae35c3d62a293b19eface83d5e4d194e3eb2fabb10059d33e6e8cbf"}, - {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:419f009fa4cfde4d16a7fc070d64f36d70a8d35a90d71aa27670bba2be4fd039"}, - {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7b39476ee69cfe64061fd77a73bf692c40021f8547cda617a3466530ef63f947"}, - {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b33f34c9c7decdb2ab99c74be6443942b730b56d9c5ee48fb7df2c86492f293c"}, - {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c78700130ce2dcebb1a8103202ae795be2fa8c9351d0dd22338fe3dac74847d9"}, - {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:268ba22d917655d1259af2d5659072b7dc11b4e1dc2cb9662fdd867d75afc6a4"}, - {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:17e7c051f53a0d2ebf33013a9cbf020bb4e098c4bc5bce6f7b0c962108d97eab"}, - {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = 
"sha256:7be99f4abb008cb38e144f85f515598f4c2c8932bf11b65add0ff59c9c876d99"}, - {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:d58a54d6ff08d2547656356eea8572b224e6f9bbc0cf55fa9966bcaac4ddfb10"}, - {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:7673a76772bda15d0d10d1aa881b7911d0580c980dbd16e59d7ba1422b2d83cd"}, - {file = "aiohttp-3.9.4-cp38-cp38-win32.whl", hash = "sha256:e4370dda04dc8951012f30e1ce7956a0a226ac0714a7b6c389fb2f43f22a250e"}, - {file = "aiohttp-3.9.4-cp38-cp38-win_amd64.whl", hash = "sha256:eb30c4510a691bb87081192a394fb661860e75ca3896c01c6d186febe7c88530"}, - {file = "aiohttp-3.9.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:84e90494db7df3be5e056f91412f9fa9e611fbe8ce4aaef70647297f5943b276"}, - {file = "aiohttp-3.9.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7d4845f8501ab28ebfdbeab980a50a273b415cf69e96e4e674d43d86a464df9d"}, - {file = "aiohttp-3.9.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:69046cd9a2a17245c4ce3c1f1a4ff8c70c7701ef222fce3d1d8435f09042bba1"}, - {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b73a06bafc8dcc508420db43b4dd5850e41e69de99009d0351c4f3007960019"}, - {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:418bb0038dfafeac923823c2e63226179976c76f981a2aaad0ad5d51f2229bca"}, - {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:71a8f241456b6c2668374d5d28398f8e8cdae4cce568aaea54e0f39359cd928d"}, - {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:935c369bf8acc2dc26f6eeb5222768aa7c62917c3554f7215f2ead7386b33748"}, - {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74e4e48c8752d14ecfb36d2ebb3d76d614320570e14de0a3aa7a726ff150a03c"}, - {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:916b0417aeddf2c8c61291238ce25286f391a6acb6f28005dd9ce282bd6311b6"}, - {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9b6787b6d0b3518b2ee4cbeadd24a507756ee703adbac1ab6dc7c4434b8c572a"}, - {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:221204dbda5ef350e8db6287937621cf75e85778b296c9c52260b522231940ed"}, - {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:10afd99b8251022ddf81eaed1d90f5a988e349ee7d779eb429fb07b670751e8c"}, - {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:2506d9f7a9b91033201be9ffe7d89c6a54150b0578803cce5cb84a943d075bc3"}, - {file = "aiohttp-3.9.4-cp39-cp39-win32.whl", hash = "sha256:e571fdd9efd65e86c6af2f332e0e95dad259bfe6beb5d15b3c3eca3a6eb5d87b"}, - {file = "aiohttp-3.9.4-cp39-cp39-win_amd64.whl", hash = "sha256:7d29dd5319d20aa3b7749719ac9685fbd926f71ac8c77b2477272725f882072d"}, - {file = "aiohttp-3.9.4.tar.gz", hash = "sha256:6ff71ede6d9a5a58cfb7b6fffc83ab5d4a63138276c771ac91ceaaddf5459644"}, + {file = "aiohttp-3.10.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:95213b3d79c7e387144e9cb7b9d2809092d6ff2c044cb59033aedc612f38fb6d"}, + {file = "aiohttp-3.10.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1aa005f060aff7124cfadaa2493f00a4e28ed41b232add5869e129a2e395935a"}, + {file = "aiohttp-3.10.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:eabe6bf4c199687592f5de4ccd383945f485779c7ffb62a9b9f1f8a3f9756df8"}, + {file = 
"aiohttp-3.10.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:96e010736fc16d21125c7e2dc5c350cd43c528b85085c04bf73a77be328fe944"}, + {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:99f81f9c1529fd8e03be4a7bd7df32d14b4f856e90ef6e9cbad3415dbfa9166c"}, + {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d611d1a01c25277bcdea06879afbc11472e33ce842322496b211319aa95441bb"}, + {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e00191d38156e09e8c81ef3d75c0d70d4f209b8381e71622165f22ef7da6f101"}, + {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74c091a5ded6cb81785de2d7a8ab703731f26de910dbe0f3934eabef4ae417cc"}, + {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:18186a80ec5a701816adbf1d779926e1069392cf18504528d6e52e14b5920525"}, + {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:5a7ceb2a0d2280f23a02c64cd0afdc922079bb950400c3dd13a1ab2988428aac"}, + {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:8bd7be6ff6c162a60cb8fce65ee879a684fbb63d5466aba3fa5b9288eb04aefa"}, + {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:fae962b62944eaebff4f4fddcf1a69de919e7b967136a318533d82d93c3c6bd1"}, + {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:a0fde16d284efcacbe15fb0c1013f0967b6c3e379649239d783868230bf1db42"}, + {file = "aiohttp-3.10.2-cp310-cp310-win32.whl", hash = "sha256:f81cd85a0e76ec7b8e2b6636fe02952d35befda4196b8c88f3cec5b4fb512839"}, + {file = "aiohttp-3.10.2-cp310-cp310-win_amd64.whl", hash = "sha256:54ba10eb5a3481c28282eb6afb5f709aedf53cf9c3a31875ffbdc9fc719ffd67"}, + {file = "aiohttp-3.10.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:87fab7f948e407444c2f57088286e00e2ed0003ceaf3d8f8cc0f60544ba61d91"}, + {file = "aiohttp-3.10.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ec6ad66ed660d46503243cbec7b2b3d8ddfa020f984209b3b8ef7d98ce69c3f2"}, + {file = "aiohttp-3.10.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a4be88807283bd96ae7b8e401abde4ca0bab597ba73b5e9a2d98f36d451e9aac"}, + {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:01c98041f90927c2cbd72c22a164bb816fa3010a047d264969cf82e1d4bcf8d1"}, + {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:54e36c67e1a9273ecafab18d6693da0fb5ac48fd48417e4548ac24a918c20998"}, + {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7de3ddb6f424af54535424082a1b5d1ae8caf8256ebd445be68c31c662354720"}, + {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7dd9c7db94b4692b827ce51dcee597d61a0e4f4661162424faf65106775b40e7"}, + {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e57e21e1167705f8482ca29cc5d02702208d8bf4aff58f766d94bcd6ead838cd"}, + {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:a1a50e59b720060c29e2951fd9f13c01e1ea9492e5a527b92cfe04dd64453c16"}, + {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:686c87782481fda5ee6ba572d912a5c26d9f98cc5c243ebd03f95222af3f1b0f"}, + {file = 
"aiohttp-3.10.2-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:dafb4abb257c0ed56dc36f4e928a7341b34b1379bd87e5a15ce5d883c2c90574"}, + {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:494a6f77560e02bd7d1ab579fdf8192390567fc96a603f21370f6e63690b7f3d"}, + {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6fe8503b1b917508cc68bf44dae28823ac05e9f091021e0c41f806ebbb23f92f"}, + {file = "aiohttp-3.10.2-cp311-cp311-win32.whl", hash = "sha256:4ddb43d06ce786221c0dfd3c91b4892c318eaa36b903f7c4278e7e2fa0dd5102"}, + {file = "aiohttp-3.10.2-cp311-cp311-win_amd64.whl", hash = "sha256:ca2f5abcb0a9a47e56bac173c01e9f6c6e7f27534d91451c5f22e6a35a5a2093"}, + {file = "aiohttp-3.10.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:14eb6b17f6246959fb0b035d4f4ae52caa870c4edfb6170aad14c0de5bfbf478"}, + {file = "aiohttp-3.10.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:465e445ec348d4e4bd349edd8b22db75f025da9d7b6dc1369c48e7935b85581e"}, + {file = "aiohttp-3.10.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:341f8ece0276a828d95b70cd265d20e257f5132b46bf77d759d7f4e0443f2906"}, + {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c01fbb87b5426381cd9418b3ddcf4fc107e296fa2d3446c18ce6c76642f340a3"}, + {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2c474af073e1a6763e1c5522bbb2d85ff8318197e4c6c919b8d7886e16213345"}, + {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d9076810a5621236e29b2204e67a68e1fe317c8727ee4c9abbfbb1083b442c38"}, + {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8f515d6859e673940e08de3922b9c4a2249653b0ac181169313bd6e4b1978ac"}, + {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:655e583afc639bef06f3b2446972c1726007a21003cd0ef57116a123e44601bc"}, + {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8da9449a575133828cc99985536552ea2dcd690e848f9d41b48d8853a149a959"}, + {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:19073d57d0feb1865d12361e2a1f5a49cb764bf81a4024a3b608ab521568093a"}, + {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:c8e98e1845805f184d91fda6f9ab93d7c7b0dddf1c07e0255924bfdb151a8d05"}, + {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:377220a5efde6f9497c5b74649b8c261d3cce8a84cb661be2ed8099a2196400a"}, + {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:92f7f4a4dc9cdb5980973a74d43cdbb16286dacf8d1896b6c3023b8ba8436f8e"}, + {file = "aiohttp-3.10.2-cp312-cp312-win32.whl", hash = "sha256:9bb2834a6f11d65374ce97d366d6311a9155ef92c4f0cee543b2155d06dc921f"}, + {file = "aiohttp-3.10.2-cp312-cp312-win_amd64.whl", hash = "sha256:518dc3cb37365255708283d1c1c54485bbacccd84f0a0fb87ed8917ba45eda5b"}, + {file = "aiohttp-3.10.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:7f98e70bbbf693086efe4b86d381efad8edac040b8ad02821453083d15ec315f"}, + {file = "aiohttp-3.10.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9f6f0b252a009e98fe84028a4ec48396a948e7a65b8be06ccfc6ef68cf1f614d"}, + {file = "aiohttp-3.10.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:9360e3ffc7b23565600e729e8c639c3c50d5520e05fdf94aa2bd859eef12c407"}, + {file = 
"aiohttp-3.10.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3988044d1635c7821dd44f0edfbe47e9875427464e59d548aece447f8c22800a"}, + {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:30a9d59da1543a6f1478c3436fd49ec59be3868bca561a33778b4391005e499d"}, + {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f9f49bdb94809ac56e09a310a62f33e5f22973d6fd351aac72a39cd551e98194"}, + {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddfd2dca3f11c365d6857a07e7d12985afc59798458a2fdb2ffa4a0332a3fd43"}, + {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:685c1508ec97b2cd3e120bfe309a4ff8e852e8a7460f1ef1de00c2c0ed01e33c"}, + {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:49904f38667c44c041a0b44c474b3ae36948d16a0398a8f8cd84e2bb3c42a069"}, + {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:352f3a4e5f11f3241a49b6a48bc5b935fabc35d1165fa0d87f3ca99c1fcca98b"}, + {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:fc61f39b534c5d5903490478a0dd349df397d2284a939aa3cbaa2fb7a19b8397"}, + {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:ad2274e707be37420d0b6c3d26a8115295fe9d8e6e530fa6a42487a8ca3ad052"}, + {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:c836bf3c7512100219fe1123743fd8dd9a2b50dd7cfb0c3bb10d041309acab4b"}, + {file = "aiohttp-3.10.2-cp38-cp38-win32.whl", hash = "sha256:53e8898adda402be03ff164b0878abe2d884e3ea03a4701e6ad55399d84b92dc"}, + {file = "aiohttp-3.10.2-cp38-cp38-win_amd64.whl", hash = "sha256:7cc8f65f5b22304693de05a245b6736b14cb5bc9c8a03da6e2ae9ef15f8b458f"}, + {file = "aiohttp-3.10.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:9dfc906d656e14004c5bc672399c1cccc10db38df2b62a13fb2b6e165a81c316"}, + {file = "aiohttp-3.10.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:91b10208b222ddf655c3a3d5b727879d7163db12b634492df41a9182a76edaae"}, + {file = "aiohttp-3.10.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9fd16b5e1a7bdd14668cd6bde60a2a29b49147a535c74f50d8177d11b38433a7"}, + {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b2bfdda4971bd79201f59adbad24ec2728875237e1c83bba5221284dbbf57bda"}, + {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:69d73f869cf29e8a373127fc378014e2b17bcfbe8d89134bc6fb06a2f67f3cb3"}, + {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:df59f8486507c421c0620a2c3dce81fbf1d54018dc20ff4fecdb2c106d6e6abc"}, + {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0df930015db36b460aa9badbf35eccbc383f00d52d4b6f3de2ccb57d064a6ade"}, + {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:562b1153ab7f766ee6b8b357ec777a302770ad017cf18505d34f1c088fccc448"}, + {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:d984db6d855de58e0fde1ef908d48fe9a634cadb3cf715962722b4da1c40619d"}, + {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:14dc3fcb0d877911d775d511eb617a486a8c48afca0a887276e63db04d3ee920"}, + {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = 
"sha256:b52a27a5c97275e254704e1049f4b96a81e67d6205f52fa37a4777d55b0e98ef"}, + {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:cd33d9de8cfd006a0d0fe85f49b4183c57e91d18ffb7e9004ce855e81928f704"}, + {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1238fc979160bc03a92fff9ad021375ff1c8799c6aacb0d8ea1b357ea40932bb"}, + {file = "aiohttp-3.10.2-cp39-cp39-win32.whl", hash = "sha256:e2f43d238eae4f0b04f58d4c0df4615697d4ca3e9f9b1963d49555a94f0f5a04"}, + {file = "aiohttp-3.10.2-cp39-cp39-win_amd64.whl", hash = "sha256:947847f07a8f81d7b39b2d0202fd73e61962ebe17ac2d8566f260679e467da7b"}, + {file = "aiohttp-3.10.2.tar.gz", hash = "sha256:4d1f694b5d6e459352e5e925a42e05bac66655bfde44d81c59992463d2897014"}, ] [package.dependencies] +aiohappyeyeballs = ">=2.3.0" aiosignal = ">=1.1.2" async-timeout = {version = ">=4.0,<5.0", markers = "python_version < \"3.11\""} attrs = ">=17.3.0" @@ -94,7 +106,7 @@ multidict = ">=4.5,<7.0" yarl = ">=1.0,<2.0" [package.extras] -speedups = ["Brotli", "aiodns", "brotlicffi"] +speedups = ["Brotli", "aiodns (>=3.2.0)", "brotlicffi"] [[package]] name = "aiopg" @@ -3371,4 +3383,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "d569a3593b98baceb0a88e176bdad63cae99d6bfc2a81bf6741663a4abcafd72" +content-hash = "c09bcb333ab550958b33dbf4fec968c500d8e701fd4c96402cddbd9bb8048055" diff --git a/pyproject.toml b/pyproject.toml index cfb569b2ba..ad3961ef55 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ psutil = "^5.9.4" types-psutil = "^5.9.5.12" types-toml = "^0.10.8.6" pytest-httpserver = "^1.0.8" -aiohttp = "3.9.4" +aiohttp = "3.10.2" pytest-rerunfailures = "^13.0" types-pytest-lazy-fixture = "^0.6.3.3" pytest-split = "^0.8.1" From 4476caf670c9e52480e93e39b570e3fc70c83d34 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 12 Aug 2024 09:17:31 +0100 Subject: [PATCH 1381/1571] CI: add `actions/set-docker-config-dir` to set DOCKER_CONFIG (#8676) ## Problem In several workflows, we have repeating code which is separated into two steps: ```bash mkdir -p $(pwd)/.docker-custom echo DOCKER_CONFIG=/tmp/.docker-custom >> $GITHUB_ENV ... rm -rf $(pwd)/.docker-custom ``` Such copy-paste is prone to errors; for example, in one case, instead of `$(pwd)/.docker-custom`, we use `/tmp/.docker-custom`, which is shared between workflows. 
## Summary of changes - Create a new action `actions/set-docker-config-dir`, which sets `DOCKER_CONFIG` and deletes it in a Post action part --- .../actions/set-docker-config-dir/action.yml | 36 +++++++++++++ .github/workflows/build-build-tools-image.yml | 13 +---- .github/workflows/build_and_test.yml | 50 ++----------------- 3 files changed, 41 insertions(+), 58 deletions(-) create mode 100644 .github/actions/set-docker-config-dir/action.yml diff --git a/.github/actions/set-docker-config-dir/action.yml b/.github/actions/set-docker-config-dir/action.yml new file mode 100644 index 0000000000..3ee8bec8c6 --- /dev/null +++ b/.github/actions/set-docker-config-dir/action.yml @@ -0,0 +1,36 @@ +name: "Set custom docker config directory" +description: "Create a directory for docker config and set DOCKER_CONFIG" + +# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings +runs: + using: "composite" + steps: + - name: Show warning on GitHub-hosted runners + if: runner.environment == 'github-hosted' + shell: bash -euo pipefail {0} + run: | + # Using the following environment variables to find a path to the workflow file + # ${GITHUB_WORKFLOW_REF} - octocat/hello-world/.github/workflows/my-workflow.yml@refs/heads/my_branch + # ${GITHUB_REPOSITORY} - octocat/hello-world + # ${GITHUB_REF} - refs/heads/my_branch + # From https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/variables + + filename_with_ref=${GITHUB_WORKFLOW_REF#"$GITHUB_REPOSITORY/"} + filename=${filename_with_ref%"@$GITHUB_REF"} + + # https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/workflow-commands-for-github-actions#setting-a-warning-message + title='Unnecessary usage of `.github/actions/set-docker-config-dir`' + message='No need to use `.github/actions/set-docker-config-dir` action on GitHub-hosted runners' + echo "::warning file=${filename},title=${title}::${message}" + + - uses: pyTooling/Actions/with-post-step@74afc5a42a17a046c90c68cb5cfa627e5c6c5b6b # v1.0.7 + env: + DOCKER_CONFIG: .docker-custom-${{ github.run_id }}-${{ github.run_attempt }} + with: + main: | + mkdir -p "${DOCKER_CONFIG}" + echo DOCKER_CONFIG=${DOCKER_CONFIG} | tee -a $GITHUB_ENV + post: | + if [ -d "${DOCKER_CONFIG}" ]; then + rm -r "${DOCKER_CONFIG}" + fi diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index 76fc58151a..f4f6e6971f 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -56,13 +56,7 @@ jobs: - uses: actions/checkout@v4 - # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings - # The default value is ~/.docker - - name: Set custom docker config directory - run: | - mkdir -p /tmp/.docker-custom - echo DOCKER_CONFIG=/tmp/.docker-custom >> $GITHUB_ENV - + - uses: ./.github/actions/set-docker-config-dir - uses: docker/setup-buildx-action@v3 with: cache-binary: false @@ -89,11 +83,6 @@ jobs: cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/build-tools:cache-{0},mode=max', matrix.arch) || '' }} tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }} - - name: Remove custom docker config directory - if: always() - run: | - rm -rf /tmp/.docker-custom - merge-images: needs: [ build-image ] runs-on: ubuntu-22.04 diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index c7ae2aedd4..78f9f11a65 100644 --- 
a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -484,12 +484,7 @@ jobs: submodules: true fetch-depth: 0 - # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings - # The default value is ~/.docker - - name: Set custom docker config directory - run: | - mkdir -p .docker-custom - echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV + - uses: ./.github/actions/set-docker-config-dir - uses: docker/setup-buildx-action@v3 with: cache-binary: false @@ -521,11 +516,6 @@ jobs: tags: | neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} - - name: Remove custom docker config directory - if: always() - run: | - rm -rf .docker-custom - neon-image: needs: [ neon-image-arch, tag ] runs-on: ubuntu-22.04 @@ -570,12 +560,7 @@ jobs: submodules: true fetch-depth: 0 - # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings - # The default value is ~/.docker - - name: Set custom docker config directory - run: | - mkdir -p .docker-custom - echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV + - uses: ./.github/actions/set-docker-config-dir - uses: docker/setup-buildx-action@v3 with: cache-binary: false @@ -658,11 +643,6 @@ jobs: tags: | neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} - - name: Remove custom docker config directory - if: always() - run: | - rm -rf .docker-custom - compute-node-image: needs: [ compute-node-image-arch, tag ] runs-on: ubuntu-22.04 @@ -735,13 +715,7 @@ jobs: curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder chmod +x vm-builder - # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings - # The default value is ~/.docker - - name: Set custom docker config directory - run: | - mkdir -p .docker-custom - echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV - + - uses: ./.github/actions/set-docker-config-dir - uses: docker/login-action@v3 with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} @@ -764,11 +738,6 @@ jobs: run: | docker push neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} - - name: Remove custom docker config directory - if: always() - run: | - rm -rf .docker-custom - test-images: needs: [ check-permissions, tag, neon-image, compute-node-image ] strategy: @@ -784,13 +753,7 @@ jobs: with: fetch-depth: 0 - # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings - # The default value is ~/.docker - - name: Set custom docker config directory - run: | - mkdir -p .docker-custom - echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV - + - uses: ./.github/actions/set-docker-config-dir - uses: docker/login-action@v3 with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} @@ -830,11 +793,6 @@ jobs: docker compose -f ./docker-compose/docker-compose.yml logs || 0 docker compose -f ./docker-compose/docker-compose.yml down - - name: Remove custom docker config directory - if: always() - run: | - rm -rf .docker-custom - promote-images: permissions: contents: read # This is required for actions/checkout From a4eea5025c70d646f5a94481590177af3dfe7491 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 8 Aug 2024 20:01:55 +0300 Subject: [PATCH 1382/1571] Fix logical apply worker reporting of flush_lsn wrt sync replication. It should take syncrep flush_lsn into account because WAL before it on endpoint restart is lost, which makes replication miss some data if slot had already been advanced too far. 
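Conceptually, the rule being fixed is that the subscriber's apply worker may only acknowledge an LSN as flushed once it is also durable at the safekeeper quorum, because WAL past the quorum-acknowledged position can disappear when the endpoint restarts, letting the publisher advance the slot too far. A minimal sketch of that reporting rule, in illustrative Rust with hypothetical names (not the actual vendored Postgres code):

```rust
// Illustrative sketch of the reporting rule (hypothetical names, not the actual
// Postgres/neon code): the apply worker may only report WAL as flushed up to the
// position that is both locally flushed and acknowledged by the safekeeper quorum.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
struct Lsn(u64);

fn reportable_flush_lsn(local_flush_lsn: Lsn, syncrep_flush_lsn: Lsn) -> Lsn {
    // WAL past `syncrep_flush_lsn` can be lost on endpoint restart, so acking it
    // would let the publisher advance the slot's confirmed_flush_lsn too far.
    std::cmp::min(local_flush_lsn, syncrep_flush_lsn)
}

fn main() {
    // Locally flushed ahead of what the safekeeper quorum has acknowledged:
    // only the quorum position may be reported back to the publisher.
    let local = Lsn(0x1_0000);
    let quorum = Lsn(0x8000);
    assert_eq!(reportable_flush_lsn(local, quorum), Lsn(0x8000));
}
```

The new `test_subscriber_synchronous_commit` test added below exercises exactly this scenario: safekeepers are stopped, the publisher keeps writing, and the test checks that the subscriber does not over-acknowledge and still catches up after a restart.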
This commit adds test reproducing the issue and bumps vendor/postgres to commit with the actual fix. --- control_plane/src/endpoint.rs | 11 +-- pgxn/neon/walsender_hooks.c | 8 ++ test_runner/fixtures/neon_fixtures.py | 2 +- .../regress/test_logical_replication.py | 89 +++++++++++++++++++ vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 15 +++- 8 files changed, 119 insertions(+), 12 deletions(-) diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index f9bb2da7e7..9f879c4b08 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -824,11 +824,12 @@ impl Endpoint { // cleanup work to do after postgres stops, like syncing safekeepers, // etc. // - // If destroying, send it SIGTERM before waiting. Sometimes we do *not* - // want this cleanup: tests intentionally do stop when majority of - // safekeepers is down, so sync-safekeepers would hang otherwise. This - // could be a separate flag though. - self.wait_for_compute_ctl_to_exit(destroy)?; + // If destroying or stop mode is immediate, send it SIGTERM before + // waiting. Sometimes we do *not* want this cleanup: tests intentionally + // do stop when majority of safekeepers is down, so sync-safekeepers + // would hang otherwise. This could be a separate flag though. + let send_sigterm = destroy || mode == "immediate"; + self.wait_for_compute_ctl_to_exit(send_sigterm)?; if destroy { println!( "Destroying postgres data directory '{}'", diff --git a/pgxn/neon/walsender_hooks.c b/pgxn/neon/walsender_hooks.c index 8f8d1dfc01..bd3856e9d9 100644 --- a/pgxn/neon/walsender_hooks.c +++ b/pgxn/neon/walsender_hooks.c @@ -20,6 +20,7 @@ #include "utils/guc.h" #include "postmaster/interrupt.h" +#include "neon.h" #include "neon_walreader.h" #include "walproposer.h" @@ -181,6 +182,13 @@ NeonWALReadSegmentClose(XLogReaderState *xlogreader) void NeonOnDemandXLogReaderRoutines(XLogReaderRoutine *xlr) { + /* + * If safekeepers are not configured, assume we don't need neon_walreader, + * i.e. running neon fork locally. 
+ */ + if (wal_acceptors_list[0] == '\0') + return; + if (!wal_reader) { XLogRecPtr epochStartLsn = pg_atomic_read_u64(&GetWalpropShmemState()->propEpochStartLsn); diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 844a23d327..4374e74a41 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4893,7 +4893,7 @@ def check_restored_datadir_content( assert (mismatch, error) == ([], []) -def logical_replication_sync(subscriber: VanillaPostgres, publisher: Endpoint) -> Lsn: +def logical_replication_sync(subscriber: PgProtocol, publisher: PgProtocol) -> Lsn: """Wait logical replication subscriber to sync with publisher.""" publisher_lsn = Lsn(publisher.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) while True: diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index 66afe9ddfd..5a5d369a11 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -4,11 +4,13 @@ from random import choice from string import ascii_lowercase import pytest +from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import ( AuxFileStore, NeonEnv, NeonEnvBuilder, + PgProtocol, logical_replication_sync, wait_for_last_flush_lsn, ) @@ -524,3 +526,90 @@ def test_replication_shutdown(neon_simple_env: NeonEnv): assert [r[0] for r in res] == [10, 20, 30, 40] wait_until(10, 0.5, check_that_changes_propagated) + + +def logical_replication_wait_flush_lsn_sync(publisher: PgProtocol) -> Lsn: + """ + Wait for logical replication subscriber reported flush_lsn to reach + pg_current_wal_flush_lsn on publisher. Note that this is somewhat unreliable + because for some WAL records like vacuum subscriber won't get any data at + all. + """ + publisher_flush_lsn = Lsn(publisher.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + + def check_caughtup(): + res = publisher.safe_psql( + """ +select sent_lsn, flush_lsn, pg_current_wal_flush_lsn() from pg_stat_replication sr, pg_replication_slots s + where s.active_pid = sr.pid and s.slot_type = 'logical'; + """ + )[0] + sent_lsn, flush_lsn, curr_publisher_flush_lsn = Lsn(res[0]), Lsn(res[1]), Lsn(res[2]) + log.info( + f"sent_lsn={sent_lsn}, flush_lsn={flush_lsn}, publisher_flush_lsn={curr_publisher_flush_lsn}, waiting flush_lsn to reach {publisher_flush_lsn}" + ) + assert flush_lsn >= publisher_flush_lsn + + wait_until(30, 0.5, check_caughtup) + return publisher_flush_lsn + + +# Test that subscriber takes into account quorum committed flush_lsn in +# flush_lsn reporting to publisher. Without this, it may ack too far, losing +# data on restart because publisher advances START_REPLICATION position to the +# confirmed_flush_lsn of the slot. +def test_subscriber_synchronous_commit(neon_simple_env: NeonEnv, vanilla_pg): + env = neon_simple_env + # use vanilla as publisher to allow writes on it when safekeeper is down + vanilla_pg.configure( + [ + "wal_level = 'logical'", + # neon fork uses custom WAL records which won't work without extension installed with obscure + # ERROR: resource manager with ID 134 not registered + # error. 
+ "shared_preload_libraries = 'neon'", + ] + ) + vanilla_pg.start() + vanilla_pg.safe_psql("create extension neon;") + + env.neon_cli.create_branch("subscriber") + sub = env.endpoints.create("subscriber") + sub.start() + + with vanilla_pg.cursor() as pcur: + with sub.cursor() as scur: + pcur.execute("CREATE TABLE t (pk integer primary key, sk integer)") + pcur.execute("CREATE PUBLICATION pub FOR TABLE t") + scur.execute("CREATE TABLE t (pk integer primary key, sk integer)") + + pub_connstr = vanilla_pg.connstr().replace("'", "''") + log.info(f"pub connstr is {pub_connstr}, subscriber connstr {sub.connstr()}") + query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_connstr}' PUBLICATION pub with (synchronous_commit=off)" + scur.execute(query) + time.sleep(2) # let initial table sync complete + + # stop safekeeper so it won't get any data + for sk in env.safekeepers: + sk.stop() + # and insert to publisher + with vanilla_pg.cursor() as pcur: + for i in range(0, 1000): + pcur.execute("INSERT into t values (%s, random()*100000)", (i,)) + # wait until sub receives all data + logical_replication_sync(sub, vanilla_pg) + # Update confirmed_flush_lsn of the slot. If subscriber ack'ed recevied data + # as flushed we'll now lose it if subscriber restars. That's why + # logical_replication_wait_flush_lsn_sync is expected to hang while + # safekeeper is down. + vanilla_pg.safe_psql("checkpoint;") + assert sub.safe_psql_scalar("SELECT count(*) FROM t") == 1000 + + # restart subscriber and ensure it can catch up lost tail again + sub.stop(mode="immediate") + for sk in env.safekeepers: + sk.start() + sub.start() + log.info("waiting for sync after restart") + logical_replication_wait_flush_lsn_sync(vanilla_pg) + assert sub.safe_psql_scalar("SELECT count(*) FROM t") == 1000 diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 7bbe834c8c..ae07734e0f 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 7bbe834c8c2dc37802eca8484311599bc47341f6 +Subproject commit ae07734e0ff72759ab425fc8f625d4c1ecb15a50 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 9eba7dd382..47c8d462d1 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 9eba7dd382606ffca43aca865f337ec21bcdac73 +Subproject commit 47c8d462d169367c8979ce628a523be2d94b46be diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 5377f5ed72..6434b1499b 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 5377f5ed7290af45b7cb6b0d98d43cbf4a4e77f3 +Subproject commit 6434b1499b11ed97dccea5618a055034b83b8e2f diff --git a/vendor/revisions.json b/vendor/revisions.json index 570dfc1550..ab8b3b3c4f 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,14 @@ { - "v16": ["16.3", "5377f5ed7290af45b7cb6b0d98d43cbf4a4e77f3"], - "v15": ["15.7", "9eba7dd382606ffca43aca865f337ec21bcdac73"], - "v14": ["14.12", "7bbe834c8c2dc37802eca8484311599bc47341f6"] + "v16": [ + "16.3", + "6434b1499b11ed97dccea5618a055034b83b8e2f" + ], + "v15": [ + "15.7", + "47c8d462d169367c8979ce628a523be2d94b46be" + ], + "v14": [ + "14.12", + "ae07734e0ff72759ab425fc8f625d4c1ecb15a50" + ] } From 162424ad774505bf38fcda31af81efd4f22de9a2 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 18 Jul 2024 13:51:07 +0300 Subject: [PATCH 1383/1571] wip --- ...35-safekeeper-dynamic-membership-change.md | 329 ++++++++++++++++++ 1 file changed, 329 insertions(+) create mode 100644 docs/rfcs/035-safekeeper-dynamic-membership-change.md diff --git 
a/docs/rfcs/035-safekeeper-dynamic-membership-change.md b/docs/rfcs/035-safekeeper-dynamic-membership-change.md new file mode 100644 index 0000000000..4872fbaf89 --- /dev/null +++ b/docs/rfcs/035-safekeeper-dynamic-membership-change.md @@ -0,0 +1,329 @@ +# Safekeeper dynamic membership change + +To quickly recover from safekeeper node failures and do rebalancing we need to +be able to change the set of safekeepers the timeline resides on. The procedure must +be safe (not lose committed log) regardless of safekeeper and compute state. It +should be able to progress if any majority of the old safekeeper set, any majority +of the new safekeeper set and compute are up and connected. This is known as a +consensus membership change. It always involves two phases: 1) switch the old +majority to the old + new configuration, preventing commits without acknowledgement from +the new set 2) bootstrap the new set by ensuring a majority of the new set has all +data which could ever have been committed before the first phase completed; +after that the switch is safe to finish. Without two phases, switching directly to a new set +whose quorum might not intersect with the quorum of the old set is unsafe (and the typical case of +an ABC -> ABD switch is an example of that, because quorums AC and BD don't +intersect). Furthermore, the procedure is typically carried out by the consensus +leader, and so the enumeration of configurations which establishes order between +them is done through the consensus log. + +In our case the consensus leader is compute (walproposer), and we don't want to wake +up all computes for the change. Neither do we want to fully reimplement the leader +logic a second time outside compute. Because of that the proposed algorithm relies +for issuing configurations on an external fault tolerant (distributed) strongly +consistent storage with a simple API: CAS (compare-and-swap) on a single key. +Properly configured postgres suits this. + +In the system consensus is implemented at the timeline level, so the algorithm below +applies to a single timeline. + +## Algorithm + +### Definitions + +A SafekeeperId is +``` +struct SafekeeperId { + node_id: NodeId, + // Not strictly required for this RFC but useful for asserts and potentially other purposes in the future + hostname: String, +} +``` + +A configuration is + +``` +struct Configuration { + generation: Generation, // a number uniquely identifying configuration + sk_set: Vec, // current safekeeper set + new_sk_set: Optional>, +} +``` + +A configuration with `new_sk_set` present is used for the intermediate step during +the change and is called a joint configuration. Generations establish the order of +configurations: we say `c1` is higher than `c2` if `c1.generation` > +`c2.generation`. + +### Persistently stored data changes + +Safekeeper starts storing its current configuration in the control file. Update +of it is atomic, so the in-memory value always matches the persistent one. + +The external CAS-providing storage (let's call it configuration storage here) also +stores the configuration for each timeline. It is initialized with generation 1 and +the initial set of safekeepers during timeline creation. An executed CAS on it must +never be lost. + +### Compute <-> safekeeper protocol changes + +`ProposerGreeting` message carries walproposer's configuration if it is already +established (see below), else null. `AcceptorGreeting` message carries +safekeeper's current `Configuration`.
All further messages (`VoteRequest`, +`VoteResponse`, `ProposerElected`, `AppendRequest`, `AppendResponse`) carry +the generation number, of walproposer in case of wp->sk message or of safekeeper in +case of sk->wp message. + +### Safekeeper changes + +Basic rule: once safekeeper observes configuration higher than his own it +immediately switches to it. + +Safekeeper sends its current configuration in its first message to walproposer +`AcceptorGreeting`. It refuses all other walproposer messages if the +configuration generation in them is less than its current one. Namely, it +refuses to vote, to truncate WAL in `handle_elected` and to accept WAL. In +response it sends its current configuration generation to let walproposer know. + +Safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configuration` +accepting `Configuration`. Safekeeper switches to the given conf if it is higher than its +current one and ignores it otherwise. In any case it replies with +``` +struct ConfigurationSwitchResponse { + conf: Configuration, + last_log_term: Term, + flush_lsn: Lsn, + term: Term, // not used by this RFC, but might be useful for observability +} +``` + +### Compute (walproposer) changes + +The basic rule is that a joint configuration requires votes from majorities in +both the `sk_set` and the `new_sk_set`. + +Compute receives the list of safekeepers to connect to from the control plane as +currently and tries to communicate with all of them. However, the list does not +define consensus members. Instead, on start walproposer tracks the highest +configuration it receives from `AcceptorGreeting`s. Once it assembles greetings +from a majority of `sk_set` and a majority of `new_sk_set` (if it is present), it +establishes this configuration as its own and moves to voting. + +It should stop talking to safekeepers not listed in the configuration at this +point, though it is not unsafe to continue doing so. + +To be elected it must receive votes from both majorities if `new_sk_set` is present. +Similarly, to commit WAL it must receive flush acknowledgement from both majorities. + +If walproposer hears from a safekeeper a configuration higher than its own (i.e. +refusal to accept due to configuration change) it simply restarts. + +### Change algorithm + +The following algorithm can be executed anywhere having access to configuration +storage and safekeepers. It is safe to interrupt / restart it and run multiple +instances of it concurrently, though likely one of them won't make +progress then. It accepts `desired_set: Vec` as input. + +Algorithm will refuse to make the change if it encounters previous interrupted +change attempt, but in this case it will try to finish it. + +It will eventually converge if the old majority, the new majority and the configuration +storage are reachable. + +1) Fetch the current timeline configuration from the configuration storage. +2) If it is already a joint one and `new_sk_set` is different from `desired_set`, + refuse to change. However, assign the joint conf to the (in memory) var + `joint_conf` and proceed to step 4 to finish the ongoing change. +3) Else, create a joint `joint_conf: Configuration`: increment the current conf number + `n` and put `desired_set` into `new_sk_set`. Persist it in the configuration + storage by doing a CAS on the current generation: the change happens only if the + current configuration number is still `n`. Apart from guaranteeing uniqueness + of configurations, CAS linearizes them, ensuring that a new configuration is + created only following the previous one when we know that the transition is + safe.
Failed CAS aborts the procedure. +4) Call `PUT` `configuration` on safekeepers from the current set, + delivering them `joint_conf`. Collecting responses from majority is required + to proceed. If any response returned generation higher than + `joint_conf.generation`, abort (another switch raced us). Otherwise, choose + max `` among responses and establish it as + (in memory) `sync_position`. We can't finish switch until majority + of the new set catches up to this position because data before it + could be committed without ack from the new set. +4) Initialize timeline on safekeeper(s) from `new_sk_set` where it + doesn't exist yet by doing `pull_timeline` from current set. Doing + that on majority of `new_sk_set` is enough to proceed, but it is + reasonable to ensure that all `new_sk_set` members are initialized + -- if some of them are down why are we migrating there? +5) Call `PUT` `configuration` on safekeepers from the new set, + delivering them `joint_conf` and collecting their positions. This will + switch them to the `joint_conf` which generally won't be needed + because `pull_timeline` already includes it and plus additionally would be + broadcast by compute. More importantly, we may proceed to the next step + only when `` on the majority of the new set reached + `sync_position`. Similarly, on the happy path this is not needed because + `pull_timeline` already includes it. However, it is better to double + check to be safe. For example, timeline could have been created earlier e.g. + manually or after try-to-migrate, abort, try-to-migrate-again sequence. +6) Create `new_conf: Configuration` incrementing `join_conf` generation and having new + safekeeper set as `sk_set` and None `new_sk_set`. Write it to configuration + storage under one more CAS. +7) Call `PUT` `configuration` on safekeepers from the new set, + delivering them `new_conf`. It is enough to deliver it to the majority + of the new set; the rest can be updated by compute. + +I haven't put huge effort to make the description above very precise, because it +is natural language prone to interpretations anyway. Instead I'd like to make TLA+ +spec of it. + +Description above focuses on safety. To make the flow practical and live, here a few more +considerations. +1) It makes sense to ping new set to ensure it we are migrating to live node(s) before + step 3. +2) If e.g. accidentally wrong new sk set has been specified, before CAS in step `6` is completed we + can rollback to the old conf with one more CAS. +3) On step 4 timeline might be already created on members of the new set for various reasons; + the simplest is the procedure restart. There are more complicated scenarious like mentioned + in step 5. Deleting and re-doing `pull_timeline` is generally unsafe without involving + generations, so seems simpler to treat existing timeline as success. However, this also + has a disadvantage: you might imagine an surpassingly unlikely schedule where condition in + the step 5 is never reached until compute is (re)awaken up to synchronize new member(s). + I don't think we'll observe this in practice, but can add waking up compute if needed. +4) To do step 7 in case of failure immediately after completion of CAS in step 6, + configuration storage should also have `delivered_to_majority` flag for non join configurations. + +## Implementation + +The procedure ought to be driven from somewhere. Obvious candidates are control +plane and storage_controller; and as each of them already has db we don't want +yet another storage. 
I propose to manage safekeepers in storage_controller +because 1) since it is in rust it simplifies simulation testing (more on this +below) 2) it already manages pageservers. + +This assumes that migration will be fully usable only after we migrate all +tenants/timelines to storage_controller. It is discussible whether we want also +to manage pageserver attachments for all of these, but likely we do. + +This requires us to define + +### storage_controller <-> control plane interface + +First of all, control plane should +[change](https://neondb.slack.com/archives/C03438W3FLZ/p1719226543199829) +storing safekeepers per timeline instead of per tenant because we can't migrate +tenants atomically. + +The important question is how updated configuration is delivered from +storage_controller to control plane to provide it to computes. As always, there +are two options, pull and push. Let's do it the same push as with pageserver +`/notify-attach` because 1) it keeps storage_controller out of critical compute +start path 2) provides easier upgrade: there won't be such a thing as 'timeline +managed by control plane / storcon', cplane just takes the value out of its db +when needed 3) uniformity. It makes storage_controller responsible for retrying notifying +control plane until it succeeds. + +So, cplane `/notify-safekeepers` for the timeline accepts `Configuration` and +updates it in the db if the provided conf generation is higher (the cplane db +should also store generations for this). Similarly to [`/notify-attach`](https://www.notion.so/neondatabase/Storage-Controller-Control-Plane-interface-6de56dd310a043bfa5c2f5564fa98365), it +should update db which makes the call successful, and then try to schedule +`apply_config` if possible, it is ok if not. storage_controller +should rate limit calling the endpoint, but likely this won't be needed, as migration +throughput is limited by `pull_timeline`. + +Timeline (branch) creation in cplane should call storage_controller POST +`tenant/:tenant_id/timeline` like it currently does for sharded tenants. +Response should be augmented with `safekeeper_conf: Configuration`. The call +should be retried until succeeds. + +Timeline deletion and tenant deletion in cplane should call appropriate +storage_controller endpoints like it currently does for sharded tenants. The +calls should be retried until they succeed. + +### storage_controller implementation + +Current 'load everything on startup and keep in memory' easy design is fine. +Single timeline shouldn't take more than 100 bytes (it's 16 byte tenant_id, 16 +byte timeline_id, int generation, vec of ~3 safekeeper ids plus some flags), so +10^6 of timelines shouldn't take more than 100MB. + +Similar to pageserver attachment Intents storage_controller would have in-memory +`MigrationRequest` (or its absense) for each timeline and pool of tasks trying +to make these request reality; this ensures one instance of storage_controller +won't do several migrations on the same timeline concurrently. In the first +version it is simpler to have more manual control and no retries, i.e. migration +failure removes the request. Later we can build retries and automatic +scheduling/migration. + +#### Schema + +`safekeepers` table mirroring current `nodes` should be added, except that for +`scheduling_policy` field (maybe better name it `status`?) it is enough to have +at least in the beginning only 3 fields: 1) `active` 2) `scheduling_disabled` 3) +`decomissioned`. + +`timelines` table: +``` +table! 
{ + timelines { + timeline_id -> Varchar, + tenant_id -> Varchar, + generation -> Int4, + sk_set -> Jsonb, // list of safekeeper ids + new_sk_set -> Nullable, // list of safekeeper ids, null if not join conf + delivered_to_majority -> Nullable, // null if joint conf + cplane_notified_generation -> Int4, + } +} +``` + +#### API + + + +#### Dealing with multiple instances of storage_controller + +neon_local, pytest + +## Testing + +## Integration with evicted timeline + +## Order of implementation and rollout + +note that +- core can be developed ignoring cplane integration (neon_local will use storcon, but prod not) +- there is a lot of infra work and it woud be great to separate its rollout from the core +- wp could ignore joint consensus for some time + +rough order: +- add sk infra, but not enforce confs +- change proto +- add wp proto, but not enforce confs +- implement storconn. It will be used and tested by neon_local. +- implement cplane/storcon integration. Route branch creation/deletion + through storcon. Then we can test migration of these branches, hm. + In principle sk choice from cplane can be removed at this point. + However, that would be bad because before import 1) + storconn doesn't know about existing project so can't colocate tenants + 2) neither it knows about capacity. So we could instead allow to set sk + set in the branch creation request. + These cplane -> storconn calls should be under feature flag; + rollback is safe. +- finally import existing branches. Then we can drop cplane + sk selection code. + also only at this point wp will always use generations and + so we can drop 'tli creation on connect'. + + +## Possible optimizations + +`AcceptorRefusal` separate message + +Preserving connections (not neede) + +multiple joint consensus (not neede) + +## Misc + +We should use Compute <-> safekeeper protocol change to include other (long +yearned) modifications: + From 1e789fb9631ecb394b18c9f051d3775f2234272f Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 19 Jul 2024 18:06:10 +0300 Subject: [PATCH 1384/1571] wipwip --- ...35-safekeeper-dynamic-membership-change.md | 171 +++++++++++++++--- 1 file changed, 144 insertions(+), 27 deletions(-) diff --git a/docs/rfcs/035-safekeeper-dynamic-membership-change.md b/docs/rfcs/035-safekeeper-dynamic-membership-change.md index 4872fbaf89..2fc3f2066b 100644 --- a/docs/rfcs/035-safekeeper-dynamic-membership-change.md +++ b/docs/rfcs/035-safekeeper-dynamic-membership-change.md @@ -30,21 +30,12 @@ applies to the single timeline. ### Definitions -A SafekeeperId is -``` -struct SafekeeperId { - node_id: NodeId, - // Not strictly required for this RFC but useful for asserts and potentially other purposes in the future - hostname: String, -} -``` - A configuration is ``` struct Configuration { generation: Generation, // a number uniquely identifying configuration - sk_set: Vec, // current safekeeper set + sk_set: Vec, // current safekeeper set new_sk_set: Optional>, } ``` @@ -76,7 +67,13 @@ case of sk->wp message. ### Safekeeper changes Basic rule: once safekeeper observes configuration higher than his own it -immediately switches to it. +immediately switches to it. It must refuse all messages with lower generation +that his. It also refuses messages if it is not member of the current +generation, though it is likely not unsafe to process them (walproposer should +ignore them anyway). + +If there is non null configuration in `ProposerGreeting` and it is higher than +current safekeeper one, safekeeper switches to it. 
Safekeeper sends its current configuration in its first message to walproposer `AcceptorGreeting`. It refuses all other walproposer messages if the @@ -122,7 +119,7 @@ refusal to accept due to configuration change) it simply restarts. The following algorithm can be executed anywhere having access to configuration storage and safekeepers. It is safe to interrupt / restart it and run multiple instances of it concurrently, though likely one of them won't make -progress then. It accepts `desired_set: Vec` as input. +progress then. It accepts `desired_set: Vec` as input. Algorithm will refuse to make the change if it encounters previous interrupted change attempt, but in this case it will try to finish it. @@ -150,10 +147,10 @@ storage are reachable. of the new set catches up to this position because data before it could be committed without ack from the new set. 4) Initialize timeline on safekeeper(s) from `new_sk_set` where it - doesn't exist yet by doing `pull_timeline` from current set. Doing - that on majority of `new_sk_set` is enough to proceed, but it is - reasonable to ensure that all `new_sk_set` members are initialized - -- if some of them are down why are we migrating there? + doesn't exist yet by doing `pull_timeline` from the majority of the + current set. Doing that on majority of `new_sk_set` is enough to + proceed, but it is reasonable to ensure that all `new_sk_set` members + are initialized -- if some of them are down why are we migrating there? 5) Call `PUT` `configuration` on safekeepers from the new set, delivering them `joint_conf` and collecting their positions. This will switch them to the `joint_conf` which generally won't be needed @@ -179,8 +176,8 @@ Description above focuses on safety. To make the flow practical and live, here a considerations. 1) It makes sense to ping new set to ensure it we are migrating to live node(s) before step 3. -2) If e.g. accidentally wrong new sk set has been specified, before CAS in step `6` is completed we - can rollback to the old conf with one more CAS. +2) If e.g. accidentally wrong new sk set has been specified, before CAS in step `6` is completed + it is safe to rollback to the old conf with one more CAS. 3) On step 4 timeline might be already created on members of the new set for various reasons; the simplest is the procedure restart. There are more complicated scenarious like mentioned in step 5. Deleting and re-doing `pull_timeline` is generally unsafe without involving @@ -188,8 +185,11 @@ considerations. has a disadvantage: you might imagine an surpassingly unlikely schedule where condition in the step 5 is never reached until compute is (re)awaken up to synchronize new member(s). I don't think we'll observe this in practice, but can add waking up compute if needed. -4) To do step 7 in case of failure immediately after completion of CAS in step 6, - configuration storage should also have `delivered_to_majority` flag for non join configurations. +4) In the end timeline should be locally deleted on the safekeeper(s) which are + in the old set but not in the new one, unless they are unreachable. To be + safe this also should be done under generation number. +5) If current conf fetched on step 1 is already not joint and members equal to `desired_set`, + jump to step 7, using it as `new_conf`. ## Implementation @@ -251,25 +251,37 @@ to make these request reality; this ensures one instance of storage_controller won't do several migrations on the same timeline concurrently. 
In the first version it is simpler to have more manual control and no retries, i.e. migration failure removes the request. Later we can build retries and automatic -scheduling/migration. +scheduling/migration. `MigrationRequest` is +``` +enum MigrationRequest { + To(Vec), + FinishPending, +} +``` + +`FinishPending` requests to run the procedure to ensure state is clean: current +configuration is not joint and majority of safekeepers are aware of it, but do +not attempt to migrate anywhere. If current configuration fetched on step 1 is +not joint it jumps to step 7. It should be run at startup for all timelines (but +similarly, in the first version it is ok to trigger it manually). #### Schema `safekeepers` table mirroring current `nodes` should be added, except that for -`scheduling_policy` field (maybe better name it `status`?) it is enough to have -at least in the beginning only 3 fields: 1) `active` 2) `scheduling_disabled` 3) +`scheduling_policy` field (seems like `status` is a better name for it): it is enough +to have at least in the beginning only 3 fields: 1) `active` 2) `unavailable` 3) `decomissioned`. `timelines` table: ``` table! { - timelines { + // timeline_id is primary key + timelines (timeline_id) { timeline_id -> Varchar, tenant_id -> Varchar, generation -> Int4, sk_set -> Jsonb, // list of safekeeper ids new_sk_set -> Nullable, // list of safekeeper ids, null if not join conf - delivered_to_majority -> Nullable, // null if joint conf cplane_notified_generation -> Int4, } } @@ -277,15 +289,117 @@ table! { #### API +Node management is similar to pageserver: +1) POST `/control/v1/safekeepers` upserts safekeeper. +2) GET `/control/v1/safekeepers` lists safekeepers. +3) GET `/control/v1/safekeepers/:node_id` gets safekeeper. +4) PUT `/control/v1/safekepers/:node_id/status` changes status to e.g. + `unavailable` or `decomissioned`. Initially it is simpler not to schedule any + migrations here. +Safekeeper deploy scripts should register safekeeper at storage_contorller as +they currently do with cplane, under the same id. + +Timeline creation/deletion: already existing POST `tenant/:tenant_id/timeline` +would 1) choose initial set of safekeepers; 2) write to the db initial +`Configuration` with `INSERT ON CONFLICT DO NOTHING` returning existing row in +case of conflict; 3) create timeline on the majority of safekeepers (already +created is ok). + +We don't want to block timeline creation when one safekeeper is down. Currently +this is solved by compute implicitly creating timeline on any safekeeper it is +connected to. This creates ugly timeline state on safekeeper when timeline is +created, but start LSN is not defined yet. It would be nice to remove this; to +do that, controller can in the background retry to create timeline on +safekeeper(s) which missed that during initial creation call. It can do that +through `pull_timeline` from majority so it doesn't need to remember +`parent_lsn` in its db. + +Timeline deletion removes the row from the db and forwards deletion to the +current configuration members. Without additional actions deletions might leak, +see below on this; initially let's ignore these, reporting to cplane success if +at least one safekeeper deleted the timeline (this will remove s3 data). + +Tenant deletion repeats timeline deletion for all timelines. + +Migration API: the first version is the simplest and the most imperative: +1) PUT `/control/v1/safekeepers/migrate` schedules `MigrationRequest`s to move +all timelines from one safekeeper to another. 
It accepts json +``` +{ + "src_sk": u32, + "dst_sk": u32, + "limit": Optional, +} +``` + +Returns list of scheduled requests. + +2) PUT `/control/v1/tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate` schedules `MigrationRequest` + to move single timeline to given set of safekeepers: +{ + "desired_set": Vec, +} + +Returns scheduled request. + +Similar call should be added for the tenant. + +It would be great to have some way of subscribing to the results (appart from +looking at logs/metrics). + +Migration is executed as described above. One subtlety is that (local) deletion on +source safekeeper might fail, which is not a problem if we are going to +decomission the node but leaves garbage otherwise. I'd propose in the first version +1) Don't attempt deletion at all if node status is `unavailable`. +2) If it failed, just issue warning. +And add PUT `/control/v1/safekeepers/:node_id/scrub` endpoint which would find and +remove garbage timelines for manual use. It will 1) list all timelines on the +safekeeper 2) compare each one against configuration storage: if timeline +doesn't exist at all (had been deleted), it can be deleted. Otherwise, it can +be deleted under generation number if node is not member of current generation. + +Automating this is untrivial; we'd need to register all potential missing +deletions in the same transaction +which switches configurations. Similarly when timeline is fully deleted to +prevent cplane operation from blocking when some safekeeper is not available +deletion should be also registered. + +3) GET `/control/v1/tenant/:tenant_id/timeline/:timeline_id/` should return + current in memory state of the timeline and pending `MigrationRequest`, + if any. + +4) PUT `/control/v1/tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate_abort` tries to abort the + migration by switching configuration from the joint to the one with (previous) `sk_set` under CAS + (incrementing generation as always). #### Dealing with multiple instances of storage_controller -neon_local, pytest +Operations described above executed concurrently might create some errors but do +not prevent progress, so while we normally don't want to run multiple instances +of storage_controller it is fine to have it temporarily, e.g. during redeploy. + +Any interactions with db update in-memory controller state, e.g. if migration +request failed because different one is in progress, controller remembers that +and tries to finish it. ## Testing -## Integration with evicted timeline +`neon_local` should be switched to use storage_controller, playing role of +control plane. + +There should be following layers of tests: +1) Model checked TLA+ spec specifies the algorithm and verifies its basic safety. + +2) To cover real code and at the same time test many schedules we should have + simulation tests. For that, configuration storage, storage_controller <-> + safekeeper communication and pull_timeline need to be mocked and main switch + procedure wrapped to as a node (thread) in simulation tests, using these + mocks. Test would inject migrations like it currently injects + safekeeper/walproposer restars. Main assert is the same -- committed WAL must + not be lost. + +3) Additionally it would be good to have basic tests covering the whole system. 
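The assertion shared by the simulation and whole-system tests is "committed WAL must not be lost". Below is a deliberately tiny, self-contained model of that invariant, with toy types that are not the real test harness: a record counts as committed once a majority of the old set has it, the new member is seeded via `pull_timeline` from a donor in the old majority, and afterwards every committed record must still be present on a majority of the new set.

```
use std::collections::{BTreeSet, HashMap};

type NodeId = u32;
type Lsn = u64;

#[derive(Default)]
struct Model {
    // WAL records currently present on each safekeeper.
    wal: HashMap<NodeId, BTreeSet<Lsn>>,
}

impl Model {
    /// A record is committed once a majority of the current set acknowledged it.
    fn commit(&mut self, set: &[NodeId], lsn: Lsn) {
        let majority = set.len() / 2 + 1;
        for node in set.iter().take(majority) {
            self.wal.entry(*node).or_default().insert(lsn);
        }
    }

    /// `pull_timeline`: a new member copies everything a donor has.
    fn pull_timeline(&mut self, donor: NodeId, target: NodeId) {
        let copied = self.wal.get(&donor).cloned().unwrap_or_default();
        self.wal.entry(target).or_default().extend(copied);
    }

    /// The record survives if a majority of the given set holds it, so that any
    /// quorum of that set intersects the nodes which have it.
    fn survives(&self, set: &[NodeId], lsn: Lsn) -> bool {
        let mut copies = 0;
        for node in set {
            if self.wal.get(node).map_or(false, |w| w.contains(&lsn)) {
                copies += 1;
            }
        }
        copies >= set.len() / 2 + 1
    }
}

fn main() {
    let old_set = [1, 2, 3];
    let new_set = [1, 2, 4];
    let mut model = Model::default();

    // Commit WAL under the old configuration.
    for lsn in 1..=100 {
        model.commit(&old_set, lsn);
    }
    // Seed the new member from a donor that took part in the old majority.
    model.pull_timeline(1, 4);

    // The invariant each test layer asserts in its own way.
    for lsn in 1..=100 {
        assert!(model.survives(&new_set, lsn), "committed WAL lost at {lsn}");
    }
    println!("ok: no committed WAL lost");
}
```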
## Order of implementation and rollout @@ -294,6 +408,8 @@ note that - there is a lot of infra work and it woud be great to separate its rollout from the core - wp could ignore joint consensus for some time +TimelineCreateRequest should get optional safekeepers field with safekeepers chosen by cplane. + rough order: - add sk infra, but not enforce confs - change proto @@ -313,6 +429,7 @@ rough order: also only at this point wp will always use generations and so we can drop 'tli creation on connect'. +## Integration with evicted timelines ## Possible optimizations From 7b50c1a4576fbc3283e06403586915824e5c3ee6 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 22 Jul 2024 16:25:06 +0300 Subject: [PATCH 1385/1571] more wip ref https://github.com/neondatabase/cloud/issues/14668 --- ...35-safekeeper-dynamic-membership-change.md | 25 +++++++++++++------ 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/docs/rfcs/035-safekeeper-dynamic-membership-change.md b/docs/rfcs/035-safekeeper-dynamic-membership-change.md index 2fc3f2066b..e9183c9829 100644 --- a/docs/rfcs/035-safekeeper-dynamic-membership-change.md +++ b/docs/rfcs/035-safekeeper-dynamic-membership-change.md @@ -36,7 +36,7 @@ A configuration is struct Configuration { generation: Generation, // a number uniquely identifying configuration sk_set: Vec, // current safekeeper set - new_sk_set: Optional>, + new_sk_set: Optional>, } ``` @@ -337,9 +337,11 @@ Returns list of scheduled requests. 2) PUT `/control/v1/tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate` schedules `MigrationRequest` to move single timeline to given set of safekeepers: +``` { "desired_set": Vec, } +``` Returns scheduled request. @@ -399,14 +401,22 @@ There should be following layers of tests: safekeeper/walproposer restars. Main assert is the same -- committed WAL must not be lost. -3) Additionally it would be good to have basic tests covering the whole system. +3) Since simulation testing injects at relatively high level points (not + syscalls), it omits some code, in particular `pull_timeline`. Thus it + is better to have basic tests covering whole system. Extended + version of `test_restarts_under_load` would do. TBD + +4) Basic e2e test should ensure that full flow including cplane notification works. ## Order of implementation and rollout note that -- core can be developed ignoring cplane integration (neon_local will use storcon, but prod not) -- there is a lot of infra work and it woud be great to separate its rollout from the core -- wp could ignore joint consensus for some time +- Control plane parts and integration with it is fully independent from everything else + (tests would use simulation and neon_local). +- There is a lot of infra work making storage_controller aware of timelines and safekeepers + and its impl/rollout should be separate from migration itself. +- Initially walproposer can just stop working while it observers joint configuration. + Such window would be typically very short anyway. TimelineCreateRequest should get optional safekeepers field with safekeepers chosen by cplane. 
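For illustration only, the request bodies mentioned above could be modelled roughly as follows. This is a sketch assuming the `serde` crate with the derive feature; the real `TimelineCreateRequest` lives in the pageserver API crate and is not defined like this — the point is just the optional `safekeepers` field and the `desired_set` body of the per-timeline migration call.

```
use serde::{Deserialize, Serialize};

type NodeId = u64; // safekeeper id

/// Body of PUT .../safekeeper_migrate, matching the JSON shown above.
#[derive(Serialize, Deserialize)]
struct TimelineSafekeeperMigrateRequest {
    desired_set: Vec<NodeId>,
}

/// Hypothetical shape of the extended creation request.
#[derive(Serialize, Deserialize)]
struct TimelineCreateRequestSketch {
    tenant_id: String,
    timeline_id: String,
    // During the transition period the control plane may still pick the
    // safekeepers; `None` lets the storage controller schedule them itself.
    #[serde(skip_serializing_if = "Option::is_none")]
    safekeepers: Option<Vec<NodeId>>,
}
```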
@@ -435,12 +445,13 @@ rough order: `AcceptorRefusal` separate message -Preserving connections (not neede) +Preserving connections (not needed) -multiple joint consensus (not neede) +multiple joint consensus (not needed) ## Misc We should use Compute <-> safekeeper protocol change to include other (long yearned) modifications: +- network order From 4d1cf2dc6f6406f51333af0a495146fe3dbb9153 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 23 Jul 2024 17:58:32 +0300 Subject: [PATCH 1386/1571] tests, rollout --- ...35-safekeeper-dynamic-membership-change.md | 94 ++++++++++++------- 1 file changed, 62 insertions(+), 32 deletions(-) diff --git a/docs/rfcs/035-safekeeper-dynamic-membership-change.md b/docs/rfcs/035-safekeeper-dynamic-membership-change.md index e9183c9829..88087270d6 100644 --- a/docs/rfcs/035-safekeeper-dynamic-membership-change.md +++ b/docs/rfcs/035-safekeeper-dynamic-membership-change.md @@ -347,7 +347,7 @@ Returns scheduled request. Similar call should be added for the tenant. -It would be great to have some way of subscribing to the results (appart from +It would be great to have some way of subscribing to the results (apart from looking at logs/metrics). Migration is executed as described above. One subtlety is that (local) deletion on @@ -367,6 +367,9 @@ which switches configurations. Similarly when timeline is fully deleted to prevent cplane operation from blocking when some safekeeper is not available deletion should be also registered. +One more task pool should infinitely retry notifying control plane about changed +safekeeper sets. + 3) GET `/control/v1/tenant/:tenant_id/timeline/:timeline_id/` should return current in memory state of the timeline and pending `MigrationRequest`, if any. @@ -402,15 +405,19 @@ There should be following layers of tests: not be lost. 3) Since simulation testing injects at relatively high level points (not - syscalls), it omits some code, in particular `pull_timeline`. Thus it - is better to have basic tests covering whole system. Extended - version of `test_restarts_under_load` would do. TBD + syscalls), it omits some code, in particular `pull_timeline`. Thus it is + better to have basic tests covering whole system as well. Extended version of + `test_restarts_under_load` would do: start background load and do migration + under it, then restart endpoint and check that no reported commits + had been lost. I'd also add one more creating classic network split scenario, with + one compute talking to AC and another to BD while migration from nodes ABC to ABD + happens. -4) Basic e2e test should ensure that full flow including cplane notification works. +4) Simple e2e test should ensure that full flow including cplane notification works. ## Order of implementation and rollout -note that +Note that - Control plane parts and integration with it is fully independent from everything else (tests would use simulation and neon_local). - There is a lot of infra work making storage_controller aware of timelines and safekeepers @@ -418,40 +425,63 @@ note that - Initially walproposer can just stop working while it observers joint configuration. Such window would be typically very short anyway. -TimelineCreateRequest should get optional safekeepers field with safekeepers chosen by cplane. +To rollout smoothly, both walproposer and safekeeper should have flag +`configurations_enabled`; when set to false, they would work as currently, i.e. +walproposer is able to commit on whatever safekeeper set it is provided. 
Until +all timelines are managed by storcon we'd need to use current script to migrate +and update/drop entries in the storage_controller database if it has any. -rough order: -- add sk infra, but not enforce confs -- change proto -- add wp proto, but not enforce confs -- implement storconn. It will be used and tested by neon_local. -- implement cplane/storcon integration. Route branch creation/deletion - through storcon. Then we can test migration of these branches, hm. - In principle sk choice from cplane can be removed at this point. - However, that would be bad because before import 1) - storconn doesn't know about existing project so can't colocate tenants - 2) neither it knows about capacity. So we could instead allow to set sk - set in the branch creation request. - These cplane -> storconn calls should be under feature flag; - rollback is safe. -- finally import existing branches. Then we can drop cplane - sk selection code. - also only at this point wp will always use generations and - so we can drop 'tli creation on connect'. +Safekeepers would need to be able to talk both current and new protocol version +with compute to reduce number of computes restarted in prod once v2 protocol is +deployed (though before completely switching we'd need to force this). + +Let's have the following rollout order: +- storage_controller becomes aware of safekeepers; +- storage_controller gets timeline creation for new timelines and deletion requests, but + doesn't manage all timelines yet. Migration can be tested on these new timelines. + To keep control plane and storage_controller databases in sync while control + plane still chooses the safekeepers initially (until all timelines are imported + it can choose better), `TimelineCreateRequest` can get optional safekeepers + field with safekeepers chosen by cplane. +- Then we can import all existing timelines from control plane to + storage_controller and gradually enable configurations region by region. + + +Very rough implementation order: +- Add concept of configurations to safekeepers (including control file), + implement v3 protocol. +- Implement walproposer changes, including protocol. +- Implement storconn part. Use it in neon_local (and pytest). +- Make cplane store safekeepers per timeline instead of per tenant. +- Implement cplane/storcon integration. Route branch creation/deletion + through storcon. Then we can test migration of new branches. +- Finally import existing branches. Then we can drop cplane + safekeeper selection code. Gradually enable configurations at + computes and safekeepers. Before that, all computes must talk only + v3 protocol version. ## Integration with evicted timelines +Currently, `pull_timeline` doesn't work correctly with evicted timelines because +copy would point to original partial file. To fix let's just do s3 copy of the +file. It is a bit stupid as generally unnecessary work, but it makes sense to +implement proper migration before doing smarter timeline archival. + ## Possible optimizations -`AcceptorRefusal` separate message -Preserving connections (not needed) +Algorithm suggested above forces walproposer re-election (technically restart) +and thus reconnection to safekeepers; essentially we treat generation as part of +term and don't allow leader to survive configuration change. It is possible to +optimize this, but this is untrivial and I don't think needed. Reconnection is +very fast and it is much more important to avoid compute restart than +millisecond order of write stall. 
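One way to read "generation as part of term" is as a lexicographic ordering of election identifiers. The sketch below uses hypothetical types (this is not safekeeper code): an election held under a newer generation supersedes any leader from an older one regardless of term, which is exactly why the leader cannot survive a configuration change in this scheme.

```
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
struct ElectionId {
    generation: u32, // configuration generation, compared first
    term: u64,       // consensus term within that generation
}

/// Acceptance check in this model: reject anything older than what we already hold.
fn accepts(ours: ElectionId, incoming: ElectionId) -> bool {
    incoming >= ours
}

fn main() {
    let old_leader = ElectionId { generation: 3, term: 7 };
    // A leader elected after the configuration switch, even with a smaller term:
    let new_leader = ElectionId { generation: 4, term: 1 };

    assert!(accepts(old_leader, new_leader)); // newer generation wins
    assert!(!accepts(new_leader, old_leader)); // the old leader can no longer commit
    println!("generation dominates term: {:?} < {:?}", old_leader, new_leader);
}
```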
-multiple joint consensus (not needed) +Multiple joint consensus: algorithm above rejects attempt to change membership +while another attempt is in progress. It is possible to overlay them and AFAIK +Aurora does this but similarly I don't think this is needed. ## Misc -We should use Compute <-> safekeeper protocol change to include other (long -yearned) modifications: -- network order - +We should use Compute <-> safekeeper protocol change to include another long +yearned modifications: send data in network order to make arm work. From c9d2b6119576d8bd5f98460ac249db468b51bc7a Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 2 Aug 2024 12:28:11 +0300 Subject: [PATCH 1387/1571] fix term uniqueness --- ...35-safekeeper-dynamic-membership-change.md | 51 ++++++++++--------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/docs/rfcs/035-safekeeper-dynamic-membership-change.md b/docs/rfcs/035-safekeeper-dynamic-membership-change.md index 88087270d6..ed831f1492 100644 --- a/docs/rfcs/035-safekeeper-dynamic-membership-change.md +++ b/docs/rfcs/035-safekeeper-dynamic-membership-change.md @@ -68,9 +68,9 @@ case of sk->wp message. Basic rule: once safekeeper observes configuration higher than his own it immediately switches to it. It must refuse all messages with lower generation -that his. It also refuses messages if it is not member of the current -generation, though it is likely not unsafe to process them (walproposer should -ignore them anyway). +that his. It also refuses messages if it is not member of the current generation +(that is, of either `sk_set` of `sk_new_set`), though it is likely not unsafe to +process them (walproposer should ignore result anyway). If there is non null configuration in `ProposerGreeting` and it is higher than current safekeeper one, safekeeper switches to it. @@ -87,9 +87,9 @@ current one and ignores it otherwise. In any case it replies with ``` struct ConfigurationSwitchResponse { conf: Configuration, + term: Term, last_log_term: Term, flush_lsn: Lsn, - term: Term, // not used by this RFC, but might be useful for observability } ``` @@ -142,29 +142,33 @@ storage are reachable. delivering them `joint_conf`. Collecting responses from majority is required to proceed. If any response returned generation higher than `joint_conf.generation`, abort (another switch raced us). Otherwise, choose - max `` among responses and establish it as - (in memory) `sync_position`. We can't finish switch until majority - of the new set catches up to this position because data before it - could be committed without ack from the new set. -4) Initialize timeline on safekeeper(s) from `new_sk_set` where it + max `` among responses and establish it as + (in memory) `sync_position`. Also choose max `term` and establish it as (in + memory) `sync_term`. We can't finish the switch until majority of the new set + catches up to this `sync_position` because data before it could be committed + without ack from the new set. Similarly, we'll bump term on new majority + to `sync_term` so that two computes with the same term are never elected. +4) Initialize timeline on safekeeper(s) from `new_sk_set` where it doesn't exist yet by doing `pull_timeline` from the majority of the - current set. Doing that on majority of `new_sk_set` is enough to + current set. Doing that on majority of `new_sk_set` is enough to proceed, but it is reasonable to ensure that all `new_sk_set` members are initialized -- if some of them are down why are we migrating there? 
-5) Call `PUT` `configuration` on safekeepers from the new set, +5) Call `POST` `bump_term(sync_term)` on safekeepers from the new set. + Success on majority is enough. +6) Repeatedly call `PUT` `configuration` on safekeepers from the new set, delivering them `joint_conf` and collecting their positions. This will switch them to the `joint_conf` which generally won't be needed because `pull_timeline` already includes it and plus additionally would be - broadcast by compute. More importantly, we may proceed to the next step + broadcast by compute. More importantly, we may proceed to the next step only when `` on the majority of the new set reached - `sync_position`. Similarly, on the happy path this is not needed because - `pull_timeline` already includes it. However, it is better to double + `sync_position`. Similarly, on the happy path no waiting is not needed because + `pull_timeline` already includes it. However, we should double check to be safe. For example, timeline could have been created earlier e.g. - manually or after try-to-migrate, abort, try-to-migrate-again sequence. -6) Create `new_conf: Configuration` incrementing `join_conf` generation and having new + manually or after try-to-migrate, abort, try-to-migrate-again sequence. +7) Create `new_conf: Configuration` incrementing `join_conf` generation and having new safekeeper set as `sk_set` and None `new_sk_set`. Write it to configuration storage under one more CAS. -7) Call `PUT` `configuration` on safekeepers from the new set, +8) Call `PUT` `configuration` on safekeepers from the new set, delivering them `new_conf`. It is enough to deliver it to the majority of the new set; the rest can be updated by compute. @@ -469,13 +473,12 @@ implement proper migration before doing smarter timeline archival. ## Possible optimizations - -Algorithm suggested above forces walproposer re-election (technically restart) -and thus reconnection to safekeepers; essentially we treat generation as part of -term and don't allow leader to survive configuration change. It is possible to -optimize this, but this is untrivial and I don't think needed. Reconnection is -very fast and it is much more important to avoid compute restart than -millisecond order of write stall. +Steps above suggest walproposer restart (with re-election) and thus reconnection +to safekeepers. Since by bumping term on new majority we ensure that leader +terms are unique even across generation switches it is possible to preserve +connections. However, it is more complicated, reconnection is very fast and it +is much more important to avoid compute restart than millisecond order of write +stall. Multiple joint consensus: algorithm above rejects attempt to change membership while another attempt is in progress. It is possible to overlay them and AFAIK From 28ef1522d63b31e8735fa84e45cfd6d336972dfc Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 2 Aug 2024 13:46:32 +0300 Subject: [PATCH 1388/1571] cosmetic fixes --- docs/rfcs/035-safekeeper-dynamic-membership-change.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/rfcs/035-safekeeper-dynamic-membership-change.md b/docs/rfcs/035-safekeeper-dynamic-membership-change.md index ed831f1492..0d7396cf93 100644 --- a/docs/rfcs/035-safekeeper-dynamic-membership-change.md +++ b/docs/rfcs/035-safekeeper-dynamic-membership-change.md @@ -191,7 +191,8 @@ considerations. I don't think we'll observe this in practice, but can add waking up compute if needed. 
4) In the end timeline should be locally deleted on the safekeeper(s) which are in the old set but not in the new one, unless they are unreachable. To be - safe this also should be done under generation number. + safe this also should be done under generation number (deletion proceeds only if + current configuration is <= than one in request and safekeeper is not memeber of it). 5) If current conf fetched on step 1 is already not joint and members equal to `desired_set`, jump to step 7, using it as `new_conf`. @@ -207,7 +208,7 @@ This assumes that migration will be fully usable only after we migrate all tenants/timelines to storage_controller. It is discussible whether we want also to manage pageserver attachments for all of these, but likely we do. -This requires us to define +This requires us to define storcon <-> cplane interface. ### storage_controller <-> control plane interface @@ -273,14 +274,14 @@ similarly, in the first version it is ok to trigger it manually). `safekeepers` table mirroring current `nodes` should be added, except that for `scheduling_policy` field (seems like `status` is a better name for it): it is enough -to have at least in the beginning only 3 fields: 1) `active` 2) `unavailable` 3) +to have at least in the beginning only 3 fields: 1) `active` 2) `offline` 3) `decomissioned`. `timelines` table: ``` table! { // timeline_id is primary key - timelines (timeline_id) { + timelines (tenant_id, timeline_id) { timeline_id -> Varchar, tenant_id -> Varchar, generation -> Int4, From 930763cad2278a65b64cabaa231ea9b356f479ca Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 6 Aug 2024 17:25:49 +0300 Subject: [PATCH 1389/1571] s/jsonb/array --- docs/rfcs/035-safekeeper-dynamic-membership-change.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/rfcs/035-safekeeper-dynamic-membership-change.md b/docs/rfcs/035-safekeeper-dynamic-membership-change.md index 0d7396cf93..307606da38 100644 --- a/docs/rfcs/035-safekeeper-dynamic-membership-change.md +++ b/docs/rfcs/035-safekeeper-dynamic-membership-change.md @@ -285,8 +285,8 @@ table! { timeline_id -> Varchar, tenant_id -> Varchar, generation -> Int4, - sk_set -> Jsonb, // list of safekeeper ids - new_sk_set -> Nullable, // list of safekeeper ids, null if not join conf + sk_set -> Array, // list of safekeeper ids + new_sk_set -> Nullable>, // list of safekeeper ids, null if not joint conf cplane_notified_generation -> Int4, } } @@ -299,7 +299,7 @@ Node management is similar to pageserver: 2) GET `/control/v1/safekeepers` lists safekeepers. 3) GET `/control/v1/safekeepers/:node_id` gets safekeeper. 4) PUT `/control/v1/safekepers/:node_id/status` changes status to e.g. - `unavailable` or `decomissioned`. Initially it is simpler not to schedule any + `offline` or `decomissioned`. Initially it is simpler not to schedule any migrations here. Safekeeper deploy scripts should register safekeeper at storage_contorller as @@ -358,7 +358,7 @@ looking at logs/metrics). Migration is executed as described above. One subtlety is that (local) deletion on source safekeeper might fail, which is not a problem if we are going to decomission the node but leaves garbage otherwise. I'd propose in the first version -1) Don't attempt deletion at all if node status is `unavailable`. +1) Don't attempt deletion at all if node status is `offline`. 2) If it failed, just issue warning. And add PUT `/control/v1/safekeepers/:node_id/scrub` endpoint which would find and remove garbage timelines for manual use. 
It will 1) list all timelines on the @@ -470,7 +470,7 @@ Very rough implementation order: Currently, `pull_timeline` doesn't work correctly with evicted timelines because copy would point to original partial file. To fix let's just do s3 copy of the file. It is a bit stupid as generally unnecessary work, but it makes sense to -implement proper migration before doing smarter timeline archival. +implement proper migration before doing smarter timeline archival. [Issue](https://github.com/neondatabase/neon/issues/8542) ## Possible optimizations From 06df6ca52e3b245727fbd76a21050ec98c8e83e1 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 6 Aug 2024 17:37:59 +0300 Subject: [PATCH 1390/1571] proto changes --- docs/rfcs/035-safekeeper-dynamic-membership-change.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/rfcs/035-safekeeper-dynamic-membership-change.md b/docs/rfcs/035-safekeeper-dynamic-membership-change.md index 307606da38..239ec58186 100644 --- a/docs/rfcs/035-safekeeper-dynamic-membership-change.md +++ b/docs/rfcs/035-safekeeper-dynamic-membership-change.md @@ -487,5 +487,9 @@ Aurora does this but similarly I don't think this is needed. ## Misc -We should use Compute <-> safekeeper protocol change to include another long -yearned modifications: send data in network order to make arm work. +We should use Compute <-> safekeeper protocol change to include other (long +yearned) modifications: +- send data in network order to make arm work. +- remove term_start_lsn from AppendRequest +- add horizon to TermHistory +- add to ProposerGreeting number of connection from this wp to sk From 41b5ee491edc75d3135a4e2b6b8a045244c3d6f7 Mon Sep 17 00:00:00 2001 From: Shinya Kato <37682778+shinyaaa@users.noreply.github.com> Date: Mon, 12 Aug 2024 21:24:25 +0900 Subject: [PATCH 1391/1571] Fix a comment in walproposer_pg.c (#8583) ## Problem Perhaps there is an error in the source code comment. ## Summary of changes Fix "walsender" to "walproposer" --- pgxn/neon/walproposer_pg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 944b316344..f3ddc64061 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -512,7 +512,7 @@ replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRe } /* - * Start walsender streaming replication + * Start walproposer streaming replication */ static void walprop_pg_start_streaming(WalProposer *wp, XLogRecPtr startpos) From 1b9a27d6e30f086f3ce8a41a617ae551ea0a4b0a Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 12 Aug 2024 13:33:09 +0100 Subject: [PATCH 1392/1571] tests: reinstate test_bulk_insert (#8683) ## Problem This test was disabled. ## Summary of changes - Remove the skip marker. 
- Explicitly avoid doing compaction & gc during checkpoints (the default scale doesn't do anything here, but when experimeting with larger scales it messes things up) - Set a data size that gives a ~20s runtime on a Hetzner dev machine, previous one gave very noisy results because it was so small For reference on a Hetzner AX102: ``` ------------------------------ Benchmark results ------------------------------- test_bulk_insert[neon-release-pg16].insert: 25.664 s test_bulk_insert[neon-release-pg16].pageserver_writes: 5,428 MB test_bulk_insert[neon-release-pg16].peak_mem: 577 MB test_bulk_insert[neon-release-pg16].size: 0 MB test_bulk_insert[neon-release-pg16].data_uploaded: 1,922 MB test_bulk_insert[neon-release-pg16].num_files_uploaded: 8 test_bulk_insert[neon-release-pg16].wal_written: 1,382 MB test_bulk_insert[neon-release-pg16].wal_recovery: 25.373 s test_bulk_insert[neon-release-pg16].compaction: 0.035 s ``` --- test_runner/fixtures/compare_fixtures.py | 25 ++++++++++++++++----- test_runner/performance/test_bulk_insert.py | 14 +++++++----- 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 08215438e1..5fe544b3bd 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -42,7 +42,11 @@ class PgCompare(ABC): pass @abstractmethod - def flush(self): + def flush(self, compact: bool = False, gc: bool = False): + pass + + @abstractmethod + def compact(self): pass @abstractmethod @@ -129,13 +133,16 @@ class NeonCompare(PgCompare): def pg_bin(self) -> PgBin: return self._pg_bin - def flush(self): + def flush(self, compact: bool = True, gc: bool = True): wait_for_last_flush_lsn(self.env, self._pg, self.tenant, self.timeline) - self.pageserver_http_client.timeline_checkpoint(self.tenant, self.timeline) - self.pageserver_http_client.timeline_gc(self.tenant, self.timeline, 0) + self.pageserver_http_client.timeline_checkpoint(self.tenant, self.timeline, compact=compact) + if gc: + self.pageserver_http_client.timeline_gc(self.tenant, self.timeline, 0) def compact(self): - self.pageserver_http_client.timeline_compact(self.tenant, self.timeline) + self.pageserver_http_client.timeline_compact( + self.tenant, self.timeline, wait_until_uploaded=True + ) def report_peak_memory_use(self): self.zenbenchmark.record( @@ -215,9 +222,12 @@ class VanillaCompare(PgCompare): def pg_bin(self) -> PgBin: return self._pg.pg_bin - def flush(self): + def flush(self, compact: bool = False, gc: bool = False): self.cur.execute("checkpoint") + def compact(self): + pass + def report_peak_memory_use(self): pass # TODO find something @@ -266,6 +276,9 @@ class RemoteCompare(PgCompare): # TODO: flush the remote pageserver pass + def compact(self): + pass + def report_peak_memory_use(self): # TODO: get memory usage from remote pageserver pass diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py index 3dad348976..69df7974b9 100644 --- a/test_runner/performance/test_bulk_insert.py +++ b/test_runner/performance/test_bulk_insert.py @@ -1,9 +1,9 @@ from contextlib import closing -import pytest from fixtures.benchmark_fixture import MetricReport from fixtures.common_types import Lsn from fixtures.compare_fixtures import NeonCompare, PgCompare +from fixtures.log_helper import log from fixtures.pg_version import PgVersion @@ -17,7 +17,6 @@ from fixtures.pg_version import PgVersion # 3. Disk space used # 4. 
Peak memory usage # -@pytest.mark.skip("See https://github.com/neondatabase/neon/issues/7124") def test_bulk_insert(neon_with_baseline: PgCompare): env = neon_with_baseline @@ -30,8 +29,8 @@ def test_bulk_insert(neon_with_baseline: PgCompare): # Run INSERT, recording the time and I/O it takes with env.record_pageserver_writes("pageserver_writes"): with env.record_duration("insert"): - cur.execute("insert into huge values (generate_series(1, 5000000), 0);") - env.flush() + cur.execute("insert into huge values (generate_series(1, 20000000), 0);") + env.flush(compact=False, gc=False) env.report_peak_memory_use() env.report_size() @@ -49,6 +48,9 @@ def test_bulk_insert(neon_with_baseline: PgCompare): if isinstance(env, NeonCompare): measure_recovery_time(env) + with env.record_duration("compaction"): + env.compact() + def measure_recovery_time(env: NeonCompare): client = env.env.pageserver.http_client() @@ -71,7 +73,9 @@ def measure_recovery_time(env: NeonCompare): # Measure recovery time with env.record_duration("wal_recovery"): + log.info("Entering recovery...") client.timeline_create(pg_version, env.tenant, env.timeline) # Flush, which will also wait for lsn to catch up - env.flush() + env.flush(compact=False, gc=False) + log.info("Finished recovery.") From 9dc9a9b2e950638b4fc018e1254879cbb430ba6a Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 12 Aug 2024 15:37:15 +0300 Subject: [PATCH 1393/1571] test: do graceful shutdown by default (#8655) It should give us all possible allowed_errors more consistently. While getting the workflows to pass on https://github.com/neondatabase/neon/pull/8632 it was noticed that allowed_errors are rarely hit (1/4). This made me realize that we always do an immediate stop by default. Doing a graceful shutdown would had made the draining more apparent and likely we would not have needed the #8632 hotfix. Downside of doing this is that we will see more timeouts if tests are randomly leaving pause failpoints which fail the shutdown. The net outcome should however be positive, we could even detect too slow shutdowns caused by a bug or deadlock. 
--- pageserver/src/tenant.rs | 10 +++++ .../src/tenant/remote_timeline_client.rs | 5 ++- .../src/tenant/storage_layer/image_layer.rs | 3 -- pageserver/src/tenant/storage_layer/layer.rs | 4 +- pageserver/src/tenant/tasks.rs | 13 +++--- pageserver/src/tenant/timeline.rs | 40 ++++++++----------- pageserver/src/tenant/timeline/compaction.rs | 22 ++++------ .../walreceiver/walreceiver_connection.rs | 3 ++ test_runner/fixtures/neon_fixtures.py | 2 +- test_runner/regress/test_ancestor_branch.py | 6 ++- test_runner/regress/test_timeline_size.py | 7 ++++ 11 files changed, 63 insertions(+), 52 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 90c0e28bc4..cfdb32f755 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -41,6 +41,7 @@ use tokio::sync::watch; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::*; +use upload_queue::NotInitialized; use utils::backoff; use utils::circuit_breaker::CircuitBreaker; use utils::completion; @@ -601,6 +602,15 @@ impl From for GcError { } } +impl From for GcError { + fn from(value: NotInitialized) -> Self { + match value { + NotInitialized::Uninitialized => GcError::Remote(value.into()), + NotInitialized::Stopped | NotInitialized::ShuttingDown => GcError::TimelineCancelled, + } + } +} + impl From for GcError { fn from(_: timeline::layer_manager::Shutdown) -> Self { GcError::TimelineCancelled diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 1344fe4192..8a76d7532f 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -985,7 +985,10 @@ impl RemoteTimelineClient { /// /// The files will be leaked in remote storage unless [`Self::schedule_deletion_of_unlinked`] /// is invoked on them. - pub(crate) fn schedule_gc_update(self: &Arc, gc_layers: &[Layer]) -> anyhow::Result<()> { + pub(crate) fn schedule_gc_update( + self: &Arc, + gc_layers: &[Layer], + ) -> Result<(), NotInitialized> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 16ba0fda94..f9d3fdf186 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -369,9 +369,6 @@ impl ImageLayerInner { self.lsn } - /// Returns nested result following Result, Critical>: - /// - inner has the success or transient failure - /// - outer has the permanent failure pub(super) async fn load( path: &Utf8Path, lsn: Lsn, diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 83450d24bb..0175f32268 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1848,8 +1848,8 @@ impl ResidentLayer { /// Read all they keys in this layer which match the ShardIdentity, and write them all to /// the provided writer. Return the number of keys written. 
#[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(layer=%self))] - pub(crate) async fn filter<'a>( - &'a self, + pub(crate) async fn filter( + &self, shard_identity: &ShardIdentity, writer: &mut ImageLayerWriter, ctx: &RequestContext, diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index b4706ea59d..713845e9ac 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -211,6 +211,11 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { } else { // Run compaction match tenant.compaction_iteration(&cancel, &ctx).await { + Ok(has_pending_task) => { + error_run_count = 0; + // schedule the next compaction immediately in case there is a pending compaction task + if has_pending_task { Duration::ZERO } else { period } + } Err(e) => { let wait_duration = backoff::exponential_backoff_duration_seconds( error_run_count + 1, @@ -227,11 +232,6 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { ); wait_duration } - Ok(has_pending_task) => { - error_run_count = 0; - // schedule the next compaction immediately in case there is a pending compaction task - if has_pending_task { Duration::from_secs(0) } else { period } - } } }; @@ -265,7 +265,8 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { count_throttled, sum_throttled_usecs, allowed_rps=%format_args!("{allowed_rps:.0}"), - "shard was throttled in the last n_seconds") + "shard was throttled in the last n_seconds" + ); }); // Sleep diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f810df5a56..b003834adf 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4421,22 +4421,24 @@ impl From for CompactionError { } } -impl CompactionError { - /// We cannot do compaction because we could not download a layer that is input to the compaction. - pub(crate) fn input_layer_download_failed( - e: super::storage_layer::layer::DownloadError, - ) -> Self { +impl From for CompactionError { + fn from(e: super::storage_layer::layer::DownloadError) -> Self { match e { - super::storage_layer::layer::DownloadError::TimelineShutdown | - /* TODO DownloadCancelled correct here? 
*/ - super::storage_layer::layer::DownloadError::DownloadCancelled => CompactionError::ShuttingDown, - super::storage_layer::layer::DownloadError::ContextAndConfigReallyDeniesDownloads | - super::storage_layer::layer::DownloadError::DownloadRequired | - super::storage_layer::layer::DownloadError::NotFile(_) | - super::storage_layer::layer::DownloadError::DownloadFailed | - super::storage_layer::layer::DownloadError::PreStatFailed(_)=>CompactionError::Other(anyhow::anyhow!(e)), + super::storage_layer::layer::DownloadError::TimelineShutdown + | super::storage_layer::layer::DownloadError::DownloadCancelled => { + CompactionError::ShuttingDown + } + super::storage_layer::layer::DownloadError::ContextAndConfigReallyDeniesDownloads + | super::storage_layer::layer::DownloadError::DownloadRequired + | super::storage_layer::layer::DownloadError::NotFile(_) + | super::storage_layer::layer::DownloadError::DownloadFailed + | super::storage_layer::layer::DownloadError::PreStatFailed(_) => { + CompactionError::Other(anyhow::anyhow!(e)) + } #[cfg(test)] - super::storage_layer::layer::DownloadError::Failpoint(_) => CompactionError::Other(anyhow::anyhow!(e)), + super::storage_layer::layer::DownloadError::Failpoint(_) => { + CompactionError::Other(anyhow::anyhow!(e)) + } } } } @@ -4990,15 +4992,7 @@ impl Timeline { result.layers_removed = gc_layers.len() as u64; - self.remote_client - .schedule_gc_update(&gc_layers) - .map_err(|e| { - if self.cancel.is_cancelled() { - GcError::TimelineCancelled - } else { - GcError::Remote(e) - } - })?; + self.remote_client.schedule_gc_update(&gc_layers)?; guard.open_mut()?.finish_gc_timeline(&gc_layers); diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 87ec46c0b5..8390cb839c 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -489,10 +489,7 @@ impl Timeline { // - We do not run concurrently with other kinds of compaction, so the only layer map writes we race with are: // - GC, which at worst witnesses us "undelete" a layer that they just deleted. // - ingestion, which only inserts layers, therefore cannot collide with us. 
- let resident = layer - .download_and_keep_resident() - .await - .map_err(CompactionError::input_layer_download_failed)?; + let resident = layer.download_and_keep_resident().await?; let keys_written = resident .filter(&self.shard_identity, &mut image_layer_writer, ctx) @@ -693,23 +690,14 @@ impl Timeline { let mut fully_compacted = true; - deltas_to_compact.push( - first_level0_delta - .download_and_keep_resident() - .await - .map_err(CompactionError::input_layer_download_failed)?, - ); + deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?); for l in level0_deltas_iter { let lsn_range = &l.layer_desc().lsn_range; if lsn_range.start != prev_lsn_end { break; } - deltas_to_compact.push( - l.download_and_keep_resident() - .await - .map_err(CompactionError::input_layer_download_failed)?, - ); + deltas_to_compact.push(l.download_and_keep_resident().await?); deltas_to_compact_bytes += l.metadata().file_size; prev_lsn_end = lsn_range.end; @@ -1137,6 +1125,10 @@ impl Timeline { if !self.shard_identity.is_key_disposable(&key) { if writer.is_none() { + if self.cancel.is_cancelled() { + // to be somewhat responsive to cancellation, check for each new layer + return Err(CompactionError::ShuttingDown); + } // Create writer if not initiaized yet writer = Some( DeltaLayerWriter::new( diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index a66900522a..b5c577af72 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -335,6 +335,9 @@ pub(super) async fn handle_walreceiver_connection( filtered_records += 1; } + // FIXME: this cannot be made pausable_failpoint without fixing the + // failpoint library; in tests, the added amount of debugging will cause us + // to timeout the tests. 
fail_point!("walreceiver-after-ingest"); last_rec_lsn = lsn; diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 4374e74a41..561e8bce04 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -963,7 +963,7 @@ class NeonEnvBuilder: if self.env: log.info("Cleaning up all storage and compute nodes") self.env.stop( - immediate=True, + immediate=False, # if the test threw an exception, don't check for errors # as a failing assertion would cause the cleanup below to fail ps_assert_metric_no_errors=(exc_type is None), diff --git a/test_runner/regress/test_ancestor_branch.py b/test_runner/regress/test_ancestor_branch.py index 7e40081aa2..f83b44a7ad 100644 --- a/test_runner/regress/test_ancestor_branch.py +++ b/test_runner/regress/test_ancestor_branch.py @@ -20,7 +20,9 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): } ) - pageserver_http.configure_failpoints(("flush-frozen-pausable", "sleep(10000)")) + failpoint = "flush-frozen-pausable" + + pageserver_http.configure_failpoints((failpoint, "sleep(10000)")) endpoint_branch0 = env.endpoints.create_start("main", tenant_id=tenant) branch0_cur = endpoint_branch0.connect().cursor() @@ -96,3 +98,5 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): assert query_scalar(branch1_cur, "SELECT count(*) FROM foo") == 200000 assert query_scalar(branch2_cur, "SELECT count(*) FROM foo") == 300000 + + pageserver_http.configure_failpoints((failpoint, "off")) diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 1f220eec9e..642b9e449b 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -1137,3 +1137,10 @@ def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_met delete_lazy_activating(lazy_tenant, env.pageserver, expect_attaching=True) else: raise RuntimeError(activation_method) + + client.configure_failpoints( + [ + ("timeline-calculate-logical-size-pause", "off"), + ("walreceiver-after-ingest", "off"), + ] + ) From ae527ef088ef1654854c0cbd9b4cc9ab3878619e Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Mon, 12 Aug 2024 13:58:46 +0100 Subject: [PATCH 1394/1571] storcon: implement graceful leadership transfer (#8588) ## Problem Storage controller restarts cause temporary unavailability from the control plane POV. See RFC for more details. ## Summary of changes * A couple of small refactors of the storage controller start-up sequence to make extending it easier. * A leader table is added to track the storage controller instance that's currently the leader (if any) * A peer client is added such that storage controllers can send `step_down` requests to each other (implemented in https://github.com/neondatabase/neon/pull/8512). * Implement the leader cut-over as described in the RFC * Add `start-as-candidate` flag to the storage controller to gate the rolling restart behaviour. When the flag is `false` (the default), the only change from the current start-up sequence is persisting the leader entry to the database. 
--- .../2024-07-26-140924_create_leader/down.sql | 1 + .../2024-07-26-140924_create_leader/up.sql | 5 + storage_controller/src/lib.rs | 1 + storage_controller/src/main.rs | 11 + storage_controller/src/metrics.rs | 16 ++ storage_controller/src/peer_client.rs | 106 +++++++ storage_controller/src/persistence.rs | 74 +++++ storage_controller/src/schema.rs | 9 +- storage_controller/src/service.rs | 261 +++++++++++++----- 9 files changed, 407 insertions(+), 77 deletions(-) create mode 100644 storage_controller/migrations/2024-07-26-140924_create_leader/down.sql create mode 100644 storage_controller/migrations/2024-07-26-140924_create_leader/up.sql create mode 100644 storage_controller/src/peer_client.rs diff --git a/storage_controller/migrations/2024-07-26-140924_create_leader/down.sql b/storage_controller/migrations/2024-07-26-140924_create_leader/down.sql new file mode 100644 index 0000000000..53222c614e --- /dev/null +++ b/storage_controller/migrations/2024-07-26-140924_create_leader/down.sql @@ -0,0 +1 @@ +DROP TABLE controllers; diff --git a/storage_controller/migrations/2024-07-26-140924_create_leader/up.sql b/storage_controller/migrations/2024-07-26-140924_create_leader/up.sql new file mode 100644 index 0000000000..90546948cb --- /dev/null +++ b/storage_controller/migrations/2024-07-26-140924_create_leader/up.sql @@ -0,0 +1,5 @@ +CREATE TABLE controllers ( + address VARCHAR NOT NULL, + started_at TIMESTAMPTZ NOT NULL, + PRIMARY KEY(address, started_at) +); diff --git a/storage_controller/src/lib.rs b/storage_controller/src/lib.rs index 26c258c466..2034addbe1 100644 --- a/storage_controller/src/lib.rs +++ b/storage_controller/src/lib.rs @@ -11,6 +11,7 @@ mod id_lock_map; pub mod metrics; mod node; mod pageserver_client; +mod peer_client; pub mod persistence; mod reconciler; mod scheduler; diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index a66e9128bc..5a68799141 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -1,6 +1,7 @@ use anyhow::{anyhow, Context}; use clap::Parser; use diesel::Connection; +use hyper::Uri; use metrics::launch_timestamp::LaunchTimestamp; use metrics::BuildInfo; use std::path::PathBuf; @@ -83,6 +84,13 @@ struct Cli { #[arg(long, default_value = "5s")] db_connect_timeout: humantime::Duration, + #[arg(long, default_value = "false")] + start_as_candidate: bool, + + // TODO: make this mandatory once the helm chart gets updated + #[arg(long)] + address_for_peers: Option, + /// `neon_local` sets this to the path of the neon_local repo dir. /// Only relevant for testing. 
// TODO: make `cfg(feature = "testing")` @@ -285,6 +293,9 @@ async fn async_main() -> anyhow::Result<()> { split_threshold: args.split_threshold, neon_local_repo_dir: args.neon_local_repo_dir, max_secondary_lag_bytes: args.max_secondary_lag_bytes, + address_for_peers: args.address_for_peers, + start_as_candidate: args.start_as_candidate, + http_service_port: args.listen.port() as i32, }; // After loading secrets & config, but before starting anything else, apply database migrations diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index a1a4b8543d..c2303e7a7f 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -12,6 +12,7 @@ use measured::{label::LabelValue, metric::histogram, FixedCardinalityLabel, Metr use metrics::NeonMetrics; use once_cell::sync::Lazy; use std::sync::Mutex; +use strum::IntoEnumIterator; use crate::{ persistence::{DatabaseError, DatabaseOperation}, @@ -241,3 +242,18 @@ impl DatabaseError { } } } + +/// Update the leadership status metric gauges to reflect the requested status +pub(crate) fn update_leadership_status(status: LeadershipStatus) { + let status_metric = &METRICS_REGISTRY + .metrics_group + .storage_controller_leadership_status; + + for s in LeadershipStatus::iter() { + if s == status { + status_metric.set(LeadershipStatusGroup { status: s }, 1); + } else { + status_metric.set(LeadershipStatusGroup { status: s }, 0); + } + } +} diff --git a/storage_controller/src/peer_client.rs b/storage_controller/src/peer_client.rs new file mode 100644 index 0000000000..ebb59a1720 --- /dev/null +++ b/storage_controller/src/peer_client.rs @@ -0,0 +1,106 @@ +use crate::tenant_shard::ObservedState; +use pageserver_api::shard::TenantShardId; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; +use tokio_util::sync::CancellationToken; + +use hyper::Uri; +use reqwest::{StatusCode, Url}; +use utils::{backoff, http::error::HttpErrorBody}; + +#[derive(Debug, Clone)] +pub(crate) struct PeerClient { + uri: Uri, + jwt: Option, + client: reqwest::Client, +} + +#[derive(thiserror::Error, Debug)] +pub(crate) enum StorageControllerPeerError { + #[error("failed to deserialize error response with status code {0} at {1}: {2}")] + DeserializationError(StatusCode, Url, reqwest::Error), + #[error("storage controller peer API error ({0}): {1}")] + ApiError(StatusCode, String), + #[error("failed to send HTTP request: {0}")] + SendError(reqwest::Error), + #[error("Cancelled")] + Cancelled, +} + +pub(crate) type Result = std::result::Result; + +pub(crate) trait ResponseErrorMessageExt: Sized { + fn error_from_body(self) -> impl std::future::Future> + Send; +} + +impl ResponseErrorMessageExt for reqwest::Response { + async fn error_from_body(self) -> Result { + let status = self.status(); + if !(status.is_client_error() || status.is_server_error()) { + return Ok(self); + } + + let url = self.url().to_owned(); + Err(match self.json::().await { + Ok(HttpErrorBody { msg }) => StorageControllerPeerError::ApiError(status, msg), + Err(err) => StorageControllerPeerError::DeserializationError(status, url, err), + }) + } +} + +#[derive(Serialize, Deserialize, Debug, Default)] +pub(crate) struct GlobalObservedState(pub(crate) HashMap); + +impl PeerClient { + pub(crate) fn new(uri: Uri, jwt: Option) -> Self { + Self { + uri, + jwt, + client: reqwest::Client::new(), + } + } + + async fn request_step_down(&self) -> Result { + let step_down_path = format!("{}control/v1/step_down", self.uri); + let req = 
self.client.put(step_down_path); + let req = if let Some(jwt) = &self.jwt { + req.header(reqwest::header::AUTHORIZATION, format!("Bearer {jwt}")) + } else { + req + }; + + let res = req + .send() + .await + .map_err(StorageControllerPeerError::SendError)?; + let response = res.error_from_body().await?; + + let status = response.status(); + let url = response.url().to_owned(); + + response + .json() + .await + .map_err(|err| StorageControllerPeerError::DeserializationError(status, url, err)) + } + + /// Request the peer to step down and return its current observed state + /// All errors are retried with exponential backoff for a maximum of 4 attempts. + /// Assuming all retries are performed, the function times out after roughly 4 seconds. + pub(crate) async fn step_down( + &self, + cancel: &CancellationToken, + ) -> Result { + backoff::retry( + || self.request_step_down(), + |_e| false, + 2, + 4, + "Send step down request", + cancel, + ) + .await + .ok_or_else(|| StorageControllerPeerError::Cancelled) + .and_then(|x| x) + } +} diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 64a3e597ce..aebbdec0d1 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -95,6 +95,8 @@ pub(crate) enum DatabaseOperation { ListMetadataHealth, ListMetadataHealthUnhealthy, ListMetadataHealthOutdated, + GetLeader, + UpdateLeader, } #[must_use] @@ -785,6 +787,69 @@ impl Persistence { ) .await } + + /// Get the current entry from the `leader` table if one exists. + /// It is an error for the table to contain more than one entry. + pub(crate) async fn get_leader(&self) -> DatabaseResult> { + let mut leader: Vec = self + .with_measured_conn( + DatabaseOperation::GetLeader, + move |conn| -> DatabaseResult<_> { + Ok(crate::schema::controllers::table.load::(conn)?) + }, + ) + .await?; + + if leader.len() > 1 { + return Err(DatabaseError::Logical(format!( + "More than one entry present in the leader table: {leader:?}" + ))); + } + + Ok(leader.pop()) + } + + /// Update the new leader with compare-exchange semantics. If `prev` does not + /// match the current leader entry, then the update is treated as a failure. + /// When `prev` is not specified, the update is forced. 
+ pub(crate) async fn update_leader( + &self, + prev: Option, + new: ControllerPersistence, + ) -> DatabaseResult<()> { + use crate::schema::controllers::dsl::*; + + let updated = self + .with_measured_conn( + DatabaseOperation::UpdateLeader, + move |conn| -> DatabaseResult { + let updated = match &prev { + Some(prev) => diesel::update(controllers) + .filter(address.eq(prev.address.clone())) + .filter(started_at.eq(prev.started_at)) + .set(( + address.eq(new.address.clone()), + started_at.eq(new.started_at), + )) + .execute(conn)?, + None => diesel::insert_into(controllers) + .values(new.clone()) + .execute(conn)?, + }; + + Ok(updated) + }, + ) + .await?; + + if updated == 0 { + return Err(DatabaseError::Logical( + "Leader table update failed".to_string(), + )); + } + + Ok(()) + } } /// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably @@ -910,3 +975,12 @@ impl From for MetadataHealthRecord { } } } + +#[derive( + Serialize, Deserialize, Queryable, Selectable, Insertable, Eq, PartialEq, Debug, Clone, +)] +#[diesel(table_name = crate::schema::controllers)] +pub(crate) struct ControllerPersistence { + pub(crate) address: String, + pub(crate) started_at: chrono::DateTime, +} diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs index cb5ba3f38b..77ba47e114 100644 --- a/storage_controller/src/schema.rs +++ b/storage_controller/src/schema.rs @@ -1,5 +1,12 @@ // @generated automatically by Diesel CLI. +diesel::table! { + controllers (address, started_at) { + address -> Varchar, + started_at -> Timestamptz, + } +} + diesel::table! { metadata_health (tenant_id, shard_number, shard_count) { tenant_id -> Varchar, @@ -36,4 +43,4 @@ diesel::table! { } } -diesel::allow_tables_to_appear_in_same_query!(metadata_health, nodes, tenant_shards,); +diesel::allow_tables_to_appear_in_same_query!(controllers, metadata_health, nodes, tenant_shards,); diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 31b2d0c3f5..fe582cf0e2 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1,3 +1,4 @@ +use hyper::Uri; use std::{ borrow::Cow, cmp::Ordering, @@ -16,8 +17,11 @@ use crate::{ compute_hook::NotifyError, drain_utils::{self, TenantShardDrain, TenantShardIterator}, id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard}, - metrics::LeadershipStatusGroup, - persistence::{AbortShardSplitStatus, MetadataHealthPersistence, TenantFilter}, + metrics, + peer_client::{GlobalObservedState, PeerClient}, + persistence::{ + AbortShardSplitStatus, ControllerPersistence, MetadataHealthPersistence, TenantFilter, + }, reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder}, scheduler::{MaySchedule, ScheduleContext, ScheduleMode}, tenant_shard::{ @@ -83,7 +87,6 @@ use crate::{ ReconcilerWaiter, TenantShard, }, }; -use serde::{Deserialize, Serialize}; pub mod chaos_injector; @@ -140,7 +143,15 @@ enum NodeOperations { /// Allowed transitions are: /// 1. Leader -> SteppedDown /// 2. 
Candidate -> Leader -#[derive(Copy, Clone, strum_macros::Display, measured::FixedCardinalityLabel)] +#[derive( + Eq, + PartialEq, + Copy, + Clone, + strum_macros::Display, + strum_macros::EnumIter, + measured::FixedCardinalityLabel, +)] #[strum(serialize_all = "snake_case")] pub(crate) enum LeadershipStatus { /// This is the steady state where the storage controller can produce @@ -226,22 +237,12 @@ impl ServiceState { tenants: BTreeMap, scheduler: Scheduler, delayed_reconcile_rx: tokio::sync::mpsc::Receiver, + initial_leadership_status: LeadershipStatus, ) -> Self { - let status = &crate::metrics::METRICS_REGISTRY - .metrics_group - .storage_controller_leadership_status; - - status.set( - LeadershipStatusGroup { - status: LeadershipStatus::Leader, - }, - 1, - ); + metrics::update_leadership_status(initial_leadership_status); Self { - // TODO: Starting up as Leader is a transient state. Once we enable rolling - // upgrades on the k8s side, we should start up as Candidate. - leadership_status: LeadershipStatus::Leader, + leadership_status: initial_leadership_status, tenants, nodes: Arc::new(nodes), scheduler, @@ -266,29 +267,12 @@ impl ServiceState { fn step_down(&mut self) { self.leadership_status = LeadershipStatus::SteppedDown; + metrics::update_leadership_status(self.leadership_status); + } - let status = &crate::metrics::METRICS_REGISTRY - .metrics_group - .storage_controller_leadership_status; - - status.set( - LeadershipStatusGroup { - status: LeadershipStatus::SteppedDown, - }, - 1, - ); - status.set( - LeadershipStatusGroup { - status: LeadershipStatus::Leader, - }, - 0, - ); - status.set( - LeadershipStatusGroup { - status: LeadershipStatus::Candidate, - }, - 0, - ); + fn become_leader(&mut self) { + self.leadership_status = LeadershipStatus::Leader; + metrics::update_leadership_status(self.leadership_status); } } @@ -332,6 +316,12 @@ pub struct Config { // by more than the configured amount, then the secondary is not // upgraded to primary. pub max_secondary_lag_bytes: Option, + + pub address_for_peers: Option, + + pub start_as_candidate: bool, + + pub http_service_port: i32, } impl From for ApiError { @@ -499,9 +489,10 @@ pub(crate) enum ReconcileResultRequest { Stop, } -// TODO: move this into the storcon peer client when that gets added -#[derive(Serialize, Deserialize, Debug, Default)] -pub(crate) struct GlobalObservedState(HashMap); +struct LeaderStepDownState { + observed: GlobalObservedState, + leader: ControllerPersistence, +} impl Service { pub fn get_config(&self) -> &Config { @@ -513,15 +504,11 @@ impl Service { #[instrument(skip_all)] async fn startup_reconcile( self: &Arc, + leader_step_down_state: Option, bg_compute_notify_result_tx: tokio::sync::mpsc::Sender< Result<(), (TenantShardId, NotifyError)>, >, ) { - // For all tenant shards, a vector of observed states on nodes (where None means - // indeterminate, same as in [`ObservedStateLocation`]) - let mut observed: HashMap)>> = - HashMap::new(); - // Startup reconciliation does I/O to other services: whether they // are responsive or not, we should aim to finish within our deadline, because: // - If we don't, a k8s readiness hook watching /ready will kill us. 
@@ -535,26 +522,28 @@ impl Service { .checked_add(STARTUP_RECONCILE_TIMEOUT / 2) .expect("Reconcile timeout is a modest constant"); + let (observed, current_leader) = if let Some(state) = leader_step_down_state { + tracing::info!( + "Using observed state received from leader at {}", + state.leader.address, + ); + (state.observed, Some(state.leader)) + } else { + ( + self.build_global_observed_state(node_scan_deadline).await, + None, + ) + }; + // Accumulate a list of any tenant locations that ought to be detached let mut cleanup = Vec::new(); - let node_listings = self.scan_node_locations(node_scan_deadline).await; - // Send initial heartbeat requests to nodes that replied to the location listing above. - let nodes_online = self.initial_heartbeat_round(node_listings.keys()).await; - - for (node_id, list_response) in node_listings { - let tenant_shards = list_response.tenant_shards; - tracing::info!( - "Received {} shard statuses from pageserver {}, setting it to Active", - tenant_shards.len(), - node_id - ); - - for (tenant_shard_id, conf_opt) in tenant_shards { - let shard_observations = observed.entry(tenant_shard_id).or_default(); - shard_observations.push((node_id, conf_opt)); - } - } + // Send initial heartbeat requests to all nodes loaded from the database + let all_nodes = { + let locked = self.inner.read().unwrap(); + locked.nodes.clone() + }; + let nodes_online = self.initial_heartbeat_round(all_nodes.keys()).await; // List of tenants for which we will attempt to notify compute of their location at startup let mut compute_notifications = Vec::new(); @@ -577,17 +566,16 @@ impl Service { } *nodes = Arc::new(new_nodes); - for (tenant_shard_id, shard_observations) in observed { - for (node_id, observed_loc) in shard_observations { - let Some(tenant_shard) = tenants.get_mut(&tenant_shard_id) else { - cleanup.push((tenant_shard_id, node_id)); - continue; - }; - tenant_shard - .observed - .locations - .insert(node_id, ObservedStateLocation { conf: observed_loc }); - } + for (tenant_shard_id, observed_state) in observed.0 { + let Some(tenant_shard) = tenants.get_mut(&tenant_shard_id) else { + for node_id in observed_state.locations.keys() { + cleanup.push((tenant_shard_id, *node_id)); + } + + continue; + }; + + tenant_shard.observed = observed_state; } // Populate each tenant's intent state @@ -621,6 +609,28 @@ impl Service { tenants.len() }; + // Before making any observable changes to the cluster, persist self + // as leader in database and memory. + if let Some(address_for_peers) = &self.config.address_for_peers { + // TODO: `address-for-peers` can become a mandatory cli arg + // after we update the k8s setup + let proposed_leader = ControllerPersistence { + address: address_for_peers.to_string(), + started_at: chrono::Utc::now(), + }; + + if let Err(err) = self + .persistence + .update_leader(current_leader, proposed_leader) + .await + { + tracing::error!("Failed to persist self as leader: {err}. Aborting start-up ..."); + std::process::exit(1); + } + } + + self.inner.write().unwrap().become_leader(); + // TODO: if any tenant's intent now differs from its loaded generation_pageserver, we should clear that // generation_pageserver in the database.
@@ -786,6 +796,31 @@ impl Service { node_results } + async fn build_global_observed_state(&self, deadline: Instant) -> GlobalObservedState { + let node_listings = self.scan_node_locations(deadline).await; + let mut observed = GlobalObservedState::default(); + + for (node_id, location_confs) in node_listings { + tracing::info!( + "Received {} shard statuses from pageserver {}", + location_confs.tenant_shards.len(), + node_id + ); + + for (tid, location_conf) in location_confs.tenant_shards { + let entry = observed.0.entry(tid).or_default(); + entry.locations.insert( + node_id, + ObservedStateLocation { + conf: location_conf, + }, + ); + } + } + + observed + } + /// Used during [`Self::startup_reconcile`]: detach a list of unknown-to-us tenants from pageservers. /// /// This is safe to run in the background, because if we don't have this TenantShardId in our map of @@ -1264,12 +1299,20 @@ impl Service { config.max_warming_up_interval, cancel.clone(), ); + + let initial_leadership_status = if config.start_as_candidate { + LeadershipStatus::Candidate + } else { + LeadershipStatus::Leader + }; + let this = Arc::new(Self { inner: Arc::new(std::sync::RwLock::new(ServiceState::new( nodes, tenants, scheduler, delayed_reconcile_rx, + initial_leadership_status, ))), config: config.clone(), persistence, @@ -1338,7 +1381,16 @@ impl Service { return; }; - this.startup_reconcile(bg_compute_notify_result_tx).await; + let leadership_status = this.inner.read().unwrap().get_leadership_status(); + let peer_observed_state = match leadership_status { + LeadershipStatus::Candidate => this.request_step_down().await, + LeadershipStatus::Leader => None, + LeadershipStatus::SteppedDown => unreachable!(), + }; + + this.startup_reconcile(peer_observed_state, bg_compute_notify_result_tx) + .await; + drop(startup_completion); } }); @@ -6285,4 +6337,61 @@ impl Service { global_observed } + + /// Request step down from the currently registered leader in the database + /// + /// If such an entry is persisted, the success path returns the observed + /// state and details of the leader. Otherwise, None is returned indicating + /// there is no leader currently. + /// + /// On failures to query the database or step down error responses the process is killed + /// and we rely on k8s to retry. + async fn request_step_down(&self) -> Option { + let leader = match self.persistence.get_leader().await { + Ok(leader) => leader, + Err(err) => { + tracing::error!( + "Failed to query database for current leader: {err}. Aborting start-up ..." + ); + std::process::exit(1); + } + }; + + match leader { + Some(leader) => { + tracing::info!("Sending step down request to {leader:?}"); + + // TODO: jwt token + let client = PeerClient::new( + Uri::try_from(leader.address.as_str()).expect("Failed to build leader URI"), + self.config.jwt_token.clone(), + ); + let state = client.step_down(&self.cancel).await; + match state { + Ok(state) => Some(LeaderStepDownState { + observed: state, + leader: leader.clone(), + }), + Err(err) => { + // TODO: Make leaders periodically update a timestamp field in the + // database and, if the leader is not reachable from the current instance, + // but inferred as alive from the timestamp, abort start-up. This avoids + // a potential scenario in which we have two controllers acting as leaders. + tracing::error!( + "Leader ({}) did not respond to step-down request: {}", + leader.address, + err + ); + None + } + } + } + None => { + tracing::info!( + "No leader found to request step down from. Will build observed state." 
+ ); + None + } + } + } } From ce0d0a204ce9f77d6f7fe23b2bd2f393a75c7b6b Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 12 Aug 2024 19:15:48 +0200 Subject: [PATCH 1395/1571] fix(walredo): shutdown can complete too early (#8701) Problem ------- The following race is possible today: ``` walredo_extraordinary_shutdown_thread: shutdown gets until Poll::Pending of self.launched_processes.close().await call other thread: drops the last Arc = 1. drop(_launched_processes_guard) runs, this ... walredo_extraordinary_shutdown_thread: ... wakes self.launched_processes.close().await walredo_extraordinary_shutdown_thread: logs `done` other thread: = 2. drop(process): this kill & waits ``` Solution -------- Change drop order so that `process` gets dropped first. Context ------- https://neondb.slack.com/archives/C06Q661FA4C/p1723478188785719?thread_ts=1723456706.465789&cid=C06Q661FA4C refs https://github.com/neondatabase/neon/pull/8572 refs https://github.com/neondatabase/cloud/issues/11387 --- pageserver/src/walredo.rs | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 770081b3b4..82585f9ed8 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -107,8 +107,10 @@ enum ProcessOnceCell { } struct Process { - _launched_processes_guard: utils::sync::gate::GateGuard, process: process::WalRedoProcess, + /// This field is last in this struct so the guard gets dropped _after_ [`Self::process`]. + /// (Reminder: dropping [`Self::process`] synchronously sends SIGKILL and then `wait()`s for it to exit). + _launched_processes_guard: utils::sync::gate::GateGuard, } impl std::ops::Deref for Process { @@ -327,20 +329,23 @@ impl PostgresRedoManager { }, Err(permit) => { let start = Instant::now(); - let proc = Arc::new(Process { - _launched_processes_guard: match self.launched_processes.enter() { + // acquire guard before spawning process, so that we don't spawn new processes + // if the gate is already closed. + let _launched_processes_guard = match self.launched_processes.enter() { Ok(guard) => guard, Err(GateError::GateClosed) => unreachable!( "shutdown sets the once cell to `ManagerShutDown` state before closing the gate" ), - }, - process: process::WalRedoProcess::launch( - self.conf, - self.tenant_shard_id, - pg_version, - ) - .context("launch walredo process")?, - }); + }; + let proc = Arc::new(Process { + process: process::WalRedoProcess::launch( + self.conf, + self.tenant_shard_id, + pg_version, + ) + .context("launch walredo process")?, + _launched_processes_guard, + }); let duration = start.elapsed(); WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64()); info!( From f57c2fe8fbb19da0fa4c9d21cf73e000981c3bad Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Mon, 12 Aug 2024 21:46:35 +0200 Subject: [PATCH 1396/1571] Automatically prepare/restore Aurora and RDS databases from pg_dump in benchmarking workflow (#8682) ## Problem We use infrastructure as code (TF) to deploy AWS Aurora and AWS RDS Postgres database clusters. Whenever we have a change in TF (e.g. **every year** to upgrade to a higher Postgres version or when we change the cluster configuration) TF will apply the change and create a new AWS database cluster. However our benchmarking testcase also expects databases in these clusters and tables loaded with data. So we add auto-detection - if the AWS RDS instances are "empty" we create the necessary databases and restore a pg_dump. 
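The auto-detection boils down to a couple of `psql` probes against the cluster. Below is a minimal sketch of the idea (variable names are illustrative; it assumes the `benchmark_restore_status` bookkeeping table already exists and it ignores connection-string query parameters; the authoritative steps live in the `_benchmarking_preparation.yml` workflow added below):

```
# Hedged sketch: $CONNSTR points at the RDS/Aurora cluster, $DB is e.g. "tpch".
# Create the target database if it is missing.
if [ "$(psql "$CONNSTR" -tAc "SELECT 1 FROM pg_database WHERE datname='$DB'")" != "1" ]; then
  psql "$CONNSTR" -c "CREATE DATABASE \"$DB\";"
fi
# Restore the dump only if it has not been restored before, then record success.
if ! psql "$CONNSTR" -tAc "SELECT 1 FROM benchmark_restore_status WHERE databasename='$DB' AND restore_done" | grep -q 1; then
  pg_restore --clean --if-exists --no-owner -d "${CONNSTR%/*}/$DB" "/tmp/dumps/$DB.pg_dump" || true
  psql "$CONNSTR" -c "INSERT INTO benchmark_restore_status (databasename, restore_done) VALUES ('$DB', true) ON CONFLICT (databasename) DO UPDATE SET restore_done = true;"
fi
```

The real workflow additionally serializes creation of the status table across the parallel platform/database matrix jobs with a Postgres advisory lock.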
**Important Notes:** - These steps are NOT run in each benchmarking run, but only after a new RDS instance has been deployed. - the benchmarking workflows use GitHub secrets to find the connection string for the database. These secrets still need to be (manually or programmatically using git cli) updated if some part of the connection string (e.g. user, password or hostname) changes. ## Summary of changes In each benchmarking run check if - database has already been created - if not create it - database has already been restored - if not restore it Supported databases - tpch - clickbench - user example Supported platforms: - AWS RDS Postgres - AWS Aurora serverless Postgres Sample workflow run - but this one uses Neon database to test the restore step and not real AWS databases https://github.com/neondatabase/neon/actions/runs/10321441086/job/28574350581 Sample workflow run - with real AWS database clusters https://github.com/neondatabase/neon/actions/runs/10346816389/job/28635997653 Verification in second run - with real AWS database clusters - that second time the restore is skipped https://github.com/neondatabase/neon/actions/runs/10348469517/job/28640778223 --- .../workflows/_benchmarking_preparation.yml | 149 ++++++++++++++++++ .github/workflows/benchmarking.yml | 27 ++-- 2 files changed, 166 insertions(+), 10 deletions(-) create mode 100644 .github/workflows/_benchmarking_preparation.yml diff --git a/.github/workflows/_benchmarking_preparation.yml b/.github/workflows/_benchmarking_preparation.yml new file mode 100644 index 0000000000..0f540afab7 --- /dev/null +++ b/.github/workflows/_benchmarking_preparation.yml @@ -0,0 +1,149 @@ +name: Prepare benchmarking databases by restoring dumps + +on: + workflow_call: + # no inputs needed + +defaults: + run: + shell: bash -euxo pipefail {0} + +jobs: + setup-databases: + strategy: + fail-fast: false + matrix: + platform: [ aws-rds-postgres, aws-aurora-serverless-v2-postgres ] + database: [ clickbench, tpch, userexample ] + + env: + LD_LIBRARY_PATH: /tmp/neon/pg_install/v16/lib + PLATFORM: ${{ matrix.platform }} + PG_BINARIES: /tmp/neon/pg_install/v16/bin + + runs-on: [ self-hosted, us-east-2, x64 ] + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned + options: --init + + steps: + - name: Set up Connection String + id: set-up-prep-connstr + run: | + case "${PLATFORM}" in + aws-rds-postgres) + CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }} + ;; + aws-aurora-serverless-v2-postgres) + CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CONNSTR }} + ;; + *) + echo >&2 "Unknown PLATFORM=${PLATFORM}" + exit 1 + ;; + esac + + echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact + path: /tmp/neon/ + prefix: latest + + # we create a table that has one row for each database that we want to restore with the status whether the restore is done + - name: Create benchmark_restore_status table if it does not exist + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }} + DATABASE_NAME: ${{ matrix.database }} + # to avoid a race condition of multiple jobs trying to create the table at the same time, + # we use an advisory lock + run: | + ${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -c " + SELECT pg_advisory_lock(4711); + CREATE TABLE IF NOT EXISTS benchmark_restore_status ( + databasename text primary key, + restore_done boolean + ); + SELECT pg_advisory_unlock(4711); +
" + + - name: Check if restore is already done + id: check-restore-done + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }} + DATABASE_NAME: ${{ matrix.database }} + run: | + skip=false + if ${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -tAc "SELECT 1 FROM benchmark_restore_status WHERE databasename='${{ env.DATABASE_NAME }}' AND restore_done=true;" | grep -q 1; then + echo "Restore already done for database ${{ env.DATABASE_NAME }} on platform ${{ env.PLATFORM }}. Skipping this database." + skip=true + fi + echo "skip=${skip}" | tee -a $GITHUB_OUTPUT + + - name: Check and create database if it does not exist + if: steps.check-restore-done.outputs.skip != 'true' + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }} + DATABASE_NAME: ${{ matrix.database }} + run: | + DB_EXISTS=$(${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -tAc "SELECT 1 FROM pg_database WHERE datname='${{ env.DATABASE_NAME }}'") + if [ "$DB_EXISTS" != "1" ]; then + echo "Database ${{ env.DATABASE_NAME }} does not exist. Creating it..." + ${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -c "CREATE DATABASE \"${{ env.DATABASE_NAME }}\";" + else + echo "Database ${{ env.DATABASE_NAME }} already exists." + fi + + - name: Download dump from S3 to /tmp/dumps + if: steps.check-restore-done.outputs.skip != 'true' + env: + DATABASE_NAME: ${{ matrix.database }} + run: | + mkdir -p /tmp/dumps + aws s3 cp s3://neon-github-dev/performance/pgdumps/$DATABASE_NAME/$DATABASE_NAME.pg_dump /tmp/dumps/ + + - name: Replace database name in connection string + if: steps.check-restore-done.outputs.skip != 'true' + id: replace-dbname + env: + DATABASE_NAME: ${{ matrix.database }} + BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }} + run: | + # Extract the part before the database name + base_connstr="${BENCHMARK_CONNSTR%/*}" + # Extract the query parameters (if any) after the database name + query_params="${BENCHMARK_CONNSTR#*\?}" + # Reconstruct the new connection string + if [ "$query_params" != "$BENCHMARK_CONNSTR" ]; then + new_connstr="${base_connstr}/${DATABASE_NAME}?${query_params}" + else + new_connstr="${base_connstr}/${DATABASE_NAME}" + fi + echo "database_connstr=${new_connstr}" >> $GITHUB_OUTPUT + + - name: Restore dump + if: steps.check-restore-done.outputs.skip != 'true' + env: + DATABASE_NAME: ${{ matrix.database }} + DATABASE_CONNSTR: ${{ steps.replace-dbname.outputs.database_connstr }} + # the following works only with larger computes: + # PGOPTIONS: "-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7" + # we add the || true because: + # the dumps were created with Neon and contain neon extensions that are not + # available in RDS, so we will always report an error, but we can ignore it + run: | + ${PG_BINARIES}/pg_restore --clean --if-exists --no-owner --jobs=4 \ + -d "${DATABASE_CONNSTR}" /tmp/dumps/${DATABASE_NAME}.pg_dump || true + + - name: Update benchmark_restore_status table + if: steps.check-restore-done.outputs.skip != 'true' + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }} + DATABASE_NAME: ${{ matrix.database }} + run: | + ${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -c " + INSERT INTO benchmark_restore_status (databasename, restore_done) VALUES ('${{ env.DATABASE_NAME }}', true) + ON CONFLICT (databasename) DO UPDATE SET restore_done = true; + " diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 6f80d6e431..106c3e3138 100644 --- 
a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -280,8 +280,9 @@ jobs: { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }] }' - if [ "$(date +%A)" = "Saturday" ]; then - matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]') + if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then + matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "rds-aurora", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT @@ -321,9 +322,13 @@ jobs: echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT + prepare_AWS_RDS_databases: + uses: ./.github/workflows/_benchmarking_preparation.yml + secrets: inherit + pgbench-compare: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} - needs: [ generate-matrices ] + needs: [ generate-matrices, prepare_AWS_RDS_databases ] permissions: contents: write statuses: write @@ -595,7 +600,7 @@ jobs: # *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows # *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} - needs: [ generate-matrices, pgbench-compare ] + needs: [ generate-matrices, pgbench-compare, prepare_AWS_RDS_databases ] strategy: fail-fast: false @@ -603,7 +608,7 @@ jobs: env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: 14 + DEFAULT_PG_VERSION: 16 TEST_OUTPUT: /tmp/test_output TEST_OLAP_COLLECT_EXPLAIN: ${{ github.event.inputs.collect_olap_explain }} TEST_OLAP_COLLECT_PG_STAT_STATEMENTS: ${{ github.event.inputs.collect_pg_stat_statements }} @@ -655,6 +660,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_clickbench + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -684,7 +690,7 @@ jobs: # # *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB) if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} - needs: [ generate-matrices, clickbench-compare ] + needs: [ generate-matrices, clickbench-compare, prepare_AWS_RDS_databases ] strategy: fail-fast: false @@ -692,7 +698,7 @@ jobs: env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: 14 + DEFAULT_PG_VERSION: 16 TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} @@ -724,7 +730,7 @@ jobs: ENV_PLATFORM=RDS_AURORA_TPCH ;; rds-postgres) - ENV_PLATFORM=RDS_AURORA_TPCH + ENV_PLATFORM=RDS_POSTGRES_TPCH ;; *) echo >&2 "Unknown PLATFORM=${PLATFORM}. 
Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'" @@ -750,6 +756,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_tpch + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -771,7 +778,7 @@ jobs: user-examples-compare: if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} - needs: [ generate-matrices, tpch-compare ] + needs: [ generate-matrices, tpch-compare, prepare_AWS_RDS_databases ] strategy: fail-fast: false @@ -779,7 +786,7 @@ jobs: env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: 14 + DEFAULT_PG_VERSION: 16 TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} From 32aa1fc68151a7393801447e2f33688d78b07ea1 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Mon, 12 Aug 2024 21:54:42 -0700 Subject: [PATCH 1397/1571] Add on-demand WAL download to slot funcs (#8705) ## Problem Currently we can have an issue where if someone does `pg_logical_slot_advance`, it could fail because it doesn't have the WAL locally. ## Summary of changes Adds on-demand WAL download and a test to these slot funcs. Before adding these, the test fails with ``` requested WAL segment pg_wal/000000010000000000000001 has already been removed ``` After the changes, the test passes Relies on: - https://github.com/neondatabase/postgres/pull/466 - https://github.com/neondatabase/postgres/pull/467 - https://github.com/neondatabase/postgres/pull/468 --- pgxn/neon/neon.c | 1 + test_runner/regress/test_logical_replication.py | 15 +++++++++++++++ vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 6 +++--- 6 files changed, 22 insertions(+), 6 deletions(-) diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index d107cdc1c2..9cdbf4a126 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -599,6 +599,7 @@ _PG_init(void) pg_init_walproposer(); WalSender_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; LogicalFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; + SlotFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; InitLogicalReplicationMonitor(); diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index 5a5d369a11..0d18aa43b7 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -255,6 +255,21 @@ FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of cur.execute( "SELECT * FROM pg_logical_slot_peek_binary_changes('slotty_mcslotface', NULL, NULL, 'include-xids', '0')" ) + cur.execute( + """ +INSERT INTO wal_generator (data) +SELECT repeat('A', 1024) -- Generates a kilobyte of data per row +FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of data +""" + ) + + endpoint.stop_and_destroy() + endpoint = env.endpoints.create_start("init") + with endpoint.connect().cursor() as cur: + log.info("advance slot") + cur.execute( + "SELECT * from pg_replication_slot_advance('slotty_mcslotface', pg_current_wal_lsn())" + ) # Tests that walsender correctly blocks until WAL is downloaded from safekeepers diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 
index ae07734e0f..a48faca1d9 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit ae07734e0ff72759ab425fc8f625d4c1ecb15a50 +Subproject commit a48faca1d9aef59649dd1bf34bc1b6303fa3489e diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 47c8d462d1..39c51c33b3 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 47c8d462d169367c8979ce628a523be2d94b46be +Subproject commit 39c51c33b383239c78b86afe561679f980e44842 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 6434b1499b..5ea106b258 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 6434b1499b11ed97dccea5618a055034b83b8e2f +Subproject commit 5ea106b2583285849784e774b39d62eb2615bd5d diff --git a/vendor/revisions.json b/vendor/revisions.json index ab8b3b3c4f..f983407268 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,14 +1,14 @@ { "v16": [ "16.3", - "6434b1499b11ed97dccea5618a055034b83b8e2f" + "5ea106b2583285849784e774b39d62eb2615bd5d" ], "v15": [ "15.7", - "47c8d462d169367c8979ce628a523be2d94b46be" + "39c51c33b383239c78b86afe561679f980e44842" ], "v14": [ "14.12", - "ae07734e0ff72759ab425fc8f625d4c1ecb15a50" + "a48faca1d9aef59649dd1bf34bc1b6303fa3489e" ] } From d24f1b6c044150013a020b9856186b9dba5f6c28 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 12 Aug 2024 19:28:03 +0300 Subject: [PATCH 1398/1571] Allow logical_replication_max_snap_files = -1 which disables the mechanism. --- pgxn/neon/neon.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 9cdbf4a126..6afca61fae 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -68,10 +68,10 @@ InitLogicalReplicationMonitor(void) DefineCustomIntVariable( "neon.logical_replication_max_snap_files", - "Maximum allowed logical replication .snap files", + "Maximum allowed logical replication .snap files. When exceeded, slots are dropped until the limit is met. -1 disables the limit.", NULL, &logical_replication_max_snap_files, - 300, 0, INT_MAX, + 300, -1, INT_MAX, PGC_SIGHUP, 0, NULL, NULL, NULL); From 3379cbcaa451905eac32f18d3bb7a8f0d2e74fbd Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 13 Aug 2024 11:48:23 +0100 Subject: [PATCH 1399/1571] pageserver: add CompactKey, use it in InMemoryLayer (#8652) ## Problem This follows a PR that insists all input keys are representable in 16 bytes: - https://github.com/neondatabase/neon/pull/8648 & a PR that prevents postgres from sending us keys that use the high bits of field2: - https://github.com/neondatabase/neon/pull/8657 Motivation for this change: 1. Ingest is bottlenecked on CPU 2. InMemoryLayer can create huge (~1M value) BTreeMap for its index. 3. Maps over i128 are much faster than maps over an arbitrary 18 byte struct. It may still be worthwhile to make the index two-tier to optimize for the case where only the last 4 bytes (blkno) of the key vary frequently, but simply using the i128 representation of keys has a big impact for very little effort. Related: #8452 ## Summary of changes - Introduce `CompactKey` type which contains an i128 - Use this instead of Key in InMemoryLayer's index, converting back and forth as needed. ## Performance All the small-value `bench_ingest` cases show improved throughput. 
The one that exercises this index most directly shows a 35% throughput increase: ``` ingest-small-values/ingest 128MB/100b seq, no delta time: [374.29 ms 378.56 ms 383.38 ms] thrpt: [333.88 MiB/s 338.13 MiB/s 341.98 MiB/s] change: time: [-26.993% -26.117% -25.111%] (p = 0.00 < 0.05) thrpt: [+33.531% +35.349% +36.974%] Performance has improved. ``` --- libs/pageserver_api/src/key.rs | 20 +++++++++++++ pageserver/benches/bench_ingest.rs | 2 +- .../tenant/storage_layer/inmemory_layer.rs | 29 +++++++++++-------- pageserver/src/tenant/timeline.rs | 2 +- 4 files changed, 39 insertions(+), 14 deletions(-) diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 3af3f74e9c..2fdd7de38f 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -22,6 +22,11 @@ pub struct Key { pub field6: u32, } +/// When working with large numbers of Keys in-memory, it is more efficient to handle them as i128 than as +/// a struct of fields. +#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd)] +pub struct CompactKey(i128); + /// The storage key size. pub const KEY_SIZE: usize = 18; @@ -130,6 +135,14 @@ impl Key { } } + pub fn to_compact(&self) -> CompactKey { + CompactKey(self.to_i128()) + } + + pub fn from_compact(k: CompactKey) -> Self { + Self::from_i128(k.0) + } + pub const fn next(&self) -> Key { self.add(1) } @@ -199,6 +212,13 @@ impl fmt::Display for Key { } } +impl fmt::Display for CompactKey { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let k = Key::from_compact(*self); + k.fmt(f) + } +} + impl Key { pub const MIN: Key = Key { field1: u8::MIN, diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index 9bab02e46c..0336302de0 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -95,7 +95,7 @@ async fn ingest( } } - layer.put_value(key, lsn, &data, &ctx).await?; + layer.put_value(key.to_compact(), lsn, &data, &ctx).await?; } layer.freeze(lsn + 1).await; diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 57d93feaaf..fb15ddfba9 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -15,6 +15,7 @@ use crate::tenant::PageReconstructError; use crate::{l0_flush, page_cache, walrecord}; use anyhow::{anyhow, Result}; use camino::Utf8PathBuf; +use pageserver_api::key::CompactKey; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; @@ -78,7 +79,7 @@ pub struct InMemoryLayerInner { /// All versions of all pages in the layer are kept here. Indexed /// by block number and LSN. The value is an offset into the /// ephemeral file where the page version is stored. - index: BTreeMap>, + index: BTreeMap>, /// The values are stored in a serialized format in this file. /// Each serialized Value is preceded by a 'u32' length field. 
@@ -312,8 +313,12 @@ impl InMemoryLayer { let reader = inner.file.block_cursor(); for range in keyspace.ranges.iter() { - for (key, vec_map) in inner.index.range(range.start..range.end) { - let lsn_range = match reconstruct_state.get_cached_lsn(key) { + for (key, vec_map) in inner + .index + .range(range.start.to_compact()..range.end.to_compact()) + { + let key = Key::from_compact(*key); + let lsn_range = match reconstruct_state.get_cached_lsn(&key) { Some(cached_lsn) => (cached_lsn + 1)..end_lsn, None => self.start_lsn..end_lsn, }; @@ -324,20 +329,18 @@ impl InMemoryLayer { // TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183 let buf = reader.read_blob(*pos, &ctx).await; if let Err(e) = buf { - reconstruct_state - .on_key_error(*key, PageReconstructError::from(anyhow!(e))); + reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e))); break; } let value = Value::des(&buf.unwrap()); if let Err(e) = value { - reconstruct_state - .on_key_error(*key, PageReconstructError::from(anyhow!(e))); + reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e))); break; } let key_situation = - reconstruct_state.update_key(key, *entry_lsn, value.unwrap()); + reconstruct_state.update_key(&key, *entry_lsn, value.unwrap()); if key_situation == ValueReconstructSituation::Complete { break; } @@ -417,7 +420,7 @@ impl InMemoryLayer { /// Adds the page version to the in-memory tree pub async fn put_value( &self, - key: Key, + key: CompactKey, lsn: Lsn, buf: &[u8], ctx: &RequestContext, @@ -430,7 +433,7 @@ impl InMemoryLayer { async fn put_value_locked( &self, locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>, - key: Key, + key: CompactKey, lsn: Lsn, buf: &[u8], ctx: &RequestContext, @@ -539,6 +542,8 @@ impl InMemoryLayer { let end_lsn = *self.end_lsn.get().unwrap(); let key_count = if let Some(key_range) = key_range { + let key_range = key_range.start.to_compact()..key_range.end.to_compact(); + inner .index .iter() @@ -578,7 +583,7 @@ impl InMemoryLayer { let will_init = Value::des(&buf)?.will_init(); let res; (buf, res) = delta_layer_writer - .put_value_bytes(*key, *lsn, buf, will_init, &ctx) + .put_value_bytes(Key::from_compact(*key), *lsn, buf, will_init, &ctx) .await; res?; } @@ -617,7 +622,7 @@ impl InMemoryLayer { let will_init = Value::des(&buf)?.will_init(); let res; (buf, res) = delta_layer_writer - .put_value_bytes(*key, *lsn, buf, will_init, ctx) + .put_value_bytes(Key::from_compact(*key), *lsn, buf, will_init, ctx) .await; res?; } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b003834adf..9bce9c1fac 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -5553,7 +5553,7 @@ impl<'a> TimelineWriter<'a> { let action = self.get_open_layer_action(lsn, buf_size); let layer = self.handle_open_layer_action(lsn, action, ctx).await?; - let res = layer.put_value(key, lsn, &buf, ctx).await; + let res = layer.put_value(key.to_compact(), lsn, &buf, ctx).await; if res.is_ok() { // Update the current size only when the entire write was ok. From b9d2c7bdd555e5c99e1e8ab7f418be6647407a57 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 13 Aug 2024 12:45:54 +0100 Subject: [PATCH 1400/1571] pageserver: remove vectored get related configs (#8695) ## Problem Pageserver exposes some vectored get related configs which are not in use. ## Summary of changes Remove the following pageserver configs: get_impl, get_vectored_impl, and `validate_get_vectored`. 
They are not used in the pageserver since https://github.com/neondatabase/neon/pull/8601. Manual overrides have been removed from the aws repo in https://github.com/neondatabase/aws/pull/1664. --- pageserver/src/bin/pageserver.rs | 2 - pageserver/src/config.rs | 58 +------------------ pageserver/src/tenant/timeline.rs | 37 +----------- .../pagebench/test_large_slru_basebackup.py | 3 +- 4 files changed, 4 insertions(+), 96 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 932918410c..da0c11d9bf 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -124,8 +124,6 @@ fn main() -> anyhow::Result<()> { // after setting up logging, log the effective IO engine choice and read path implementations info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings"); - info!(?conf.get_impl, "starting with get page implementation"); - info!(?conf.get_vectored_impl, "starting with vectored get page implementation"); info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access"); let tenants_path = conf.tenants_path(); diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index f4c367bd4d..3ac5ac539f 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -29,12 +29,12 @@ use utils::{ logging::LogFormat, }; +use crate::l0_flush::L0FlushConfig; +use crate::tenant::config::TenantConfOpt; use crate::tenant::timeline::compaction::CompactL0Phase1ValueAccess; use crate::tenant::vectored_blob_io::MaxVectoredReadBytes; -use crate::tenant::{config::TenantConfOpt, timeline::GetImpl}; use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine}; -use crate::{l0_flush::L0FlushConfig, tenant::timeline::GetVectoredImpl}; use crate::{tenant::config::TenantConf, virtual_file}; use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX}; @@ -133,14 +133,8 @@ pub mod defaults { #virtual_file_io_engine = '{DEFAULT_VIRTUAL_FILE_IO_ENGINE}' -#get_vectored_impl = '{DEFAULT_GET_VECTORED_IMPL}' - -#get_impl = '{DEFAULT_GET_IMPL}' - #max_vectored_read_bytes = '{DEFAULT_MAX_VECTORED_READ_BYTES}' -#validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}' - [tenant_config] #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} @@ -278,14 +272,8 @@ pub struct PageServerConf { pub virtual_file_io_engine: virtual_file::IoEngineKind, - pub get_vectored_impl: GetVectoredImpl, - - pub get_impl: GetImpl, - pub max_vectored_read_bytes: MaxVectoredReadBytes, - pub validate_vectored_get: bool, - pub image_compression: ImageCompressionAlgorithm, /// How many bytes of ephemeral layer content will we allow per kilobyte of RAM. 
When this @@ -396,14 +384,8 @@ struct PageServerConfigBuilder { virtual_file_io_engine: BuilderValue, - get_vectored_impl: BuilderValue, - - get_impl: BuilderValue, - max_vectored_read_bytes: BuilderValue, - validate_vectored_get: BuilderValue, - image_compression: BuilderValue, ephemeral_bytes_per_memory_kb: BuilderValue, @@ -493,13 +475,10 @@ impl PageServerConfigBuilder { virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()), - get_vectored_impl: Set(DEFAULT_GET_VECTORED_IMPL.parse().unwrap()), - get_impl: Set(DEFAULT_GET_IMPL.parse().unwrap()), max_vectored_read_bytes: Set(MaxVectoredReadBytes( NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(), )), image_compression: Set(DEFAULT_IMAGE_COMPRESSION), - validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET), ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), l0_flush: Set(L0FlushConfig::default()), compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()), @@ -659,22 +638,10 @@ impl PageServerConfigBuilder { self.virtual_file_io_engine = BuilderValue::Set(value); } - pub fn get_vectored_impl(&mut self, value: GetVectoredImpl) { - self.get_vectored_impl = BuilderValue::Set(value); - } - - pub fn get_impl(&mut self, value: GetImpl) { - self.get_impl = BuilderValue::Set(value); - } - pub fn get_max_vectored_read_bytes(&mut self, value: MaxVectoredReadBytes) { self.max_vectored_read_bytes = BuilderValue::Set(value); } - pub fn get_validate_vectored_get(&mut self, value: bool) { - self.validate_vectored_get = BuilderValue::Set(value); - } - pub fn get_image_compression(&mut self, value: ImageCompressionAlgorithm) { self.image_compression = BuilderValue::Set(value); } @@ -745,10 +712,7 @@ impl PageServerConfigBuilder { heatmap_upload_concurrency, secondary_download_concurrency, ingest_batch_size, - get_vectored_impl, - get_impl, max_vectored_read_bytes, - validate_vectored_get, image_compression, ephemeral_bytes_per_memory_kb, l0_flush, @@ -1002,21 +966,12 @@ impl PageServerConf { "virtual_file_io_engine" => { builder.virtual_file_io_engine(parse_toml_from_str("virtual_file_io_engine", item)?) } - "get_vectored_impl" => { - builder.get_vectored_impl(parse_toml_from_str("get_vectored_impl", item)?) - } - "get_impl" => { - builder.get_impl(parse_toml_from_str("get_impl", item)?) - } "max_vectored_read_bytes" => { let bytes = parse_toml_u64("max_vectored_read_bytes", item)? as usize; builder.get_max_vectored_read_bytes( MaxVectoredReadBytes( NonZeroUsize::new(bytes).expect("Max byte size of vectored read must be greater than 0"))) } - "validate_vectored_get" => { - builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?) - } "image_compression" => { builder.get_image_compression(parse_toml_from_str("image_compression", item)?) 
} @@ -1106,14 +1061,11 @@ impl PageServerConf { secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), - get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(), - get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(), max_vectored_read_bytes: MaxVectoredReadBytes( NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) .expect("Invalid default constant"), ), image_compression: defaults::DEFAULT_IMAGE_COMPRESSION, - validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, l0_flush: L0FlushConfig::default(), compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), @@ -1349,13 +1301,10 @@ background_task_maximum_delay = '334 s' secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), - get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(), - get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(), max_vectored_read_bytes: MaxVectoredReadBytes( NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) .expect("Invalid default constant") ), - validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, image_compression: defaults::DEFAULT_IMAGE_COMPRESSION, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, l0_flush: L0FlushConfig::default(), @@ -1425,13 +1374,10 @@ background_task_maximum_delay = '334 s' secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, ingest_batch_size: 100, virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), - get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(), - get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(), max_vectored_read_bytes: MaxVectoredReadBytes( NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) .expect("Invalid default constant") ), - validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, image_compression: defaults::DEFAULT_IMAGE_COMPRESSION, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, l0_flush: L0FlushConfig::default(), diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 9bce9c1fac..abe3f56e45 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -802,40 +802,6 @@ impl From for PageReconstructError { } } -#[derive( - Eq, - PartialEq, - Debug, - Copy, - Clone, - strum_macros::EnumString, - strum_macros::Display, - serde_with::DeserializeFromStr, - serde_with::SerializeDisplay, -)] -#[strum(serialize_all = "kebab-case")] -pub enum GetVectoredImpl { - Sequential, - Vectored, -} - -#[derive( - Eq, - PartialEq, - Debug, - Copy, - Clone, - strum_macros::EnumString, - strum_macros::Display, - serde_with::DeserializeFromStr, - serde_with::SerializeDisplay, -)] -#[strum(serialize_all = "kebab-case")] -pub enum GetImpl { - Legacy, - Vectored, -} - pub(crate) enum WaitLsnWaiter<'a> { Timeline(&'a Timeline), Tenant, @@ -995,11 +961,10 @@ impl Timeline { } trace!( - "get vectored request for {:?}@{} from task kind {:?} will use {} implementation", + "get vectored request for {:?}@{} from task kind {:?}", keyspace, lsn, ctx.task_kind(), - self.conf.get_vectored_impl ); let start = 
crate::metrics::GET_VECTORED_LATENCY diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py index 3258d4dcfa..8b934057e4 100644 --- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py +++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py @@ -44,8 +44,7 @@ def test_basebackup_with_high_slru_count( page_cache_size = 16384 max_file_descriptors = 500000 neon_env_builder.pageserver_config_override = ( - f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}; " - f"get_vectored_impl='vectored'; validate_vectored_get=false" + f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}" ) params.update( { From afb68b0e7eda8c86ca0d2994c2499f25a46655a0 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 13 Aug 2024 15:07:24 +0300 Subject: [PATCH 1401/1571] Report search_path to make it possible to use it in pgbouncer track_extra_parameters (#8303) ## Problem When pooled connections are used, session semantics are not preserved, including GUC settings. Many customers have a particular problem with setting search_path. But pgbouncer 1.20 has a `track_extra_parameters` setting which allows tracking parameters included in the startup packet that are reported by Postgres. Postgres has [an official list of parameters that it reports to the client](https://www.postgresql.org/docs/15/protocol-flow.html#PROTOCOL-ASYNC). This PR makes Postgres also report `search_path` and so allows including it in `track_extra_parameters`. ## Summary of changes Set GUC_REPORT flag for `search_path`. ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik --- pgxn/neon/neon.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 6afca61fae..784d0f1da3 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -32,6 +32,7 @@ #include "utils/builtins.h" #include "utils/pg_lsn.h" #include "utils/guc.h" +#include "utils/guc_tables.h" #include "utils/wait_event.h" #include "extension_server.h" @@ -584,6 +585,40 @@ RestoreRunningXactsFromClog(CheckPoint *checkpoint, TransactionId **xids, int *n return false; } + +/* + * pgbouncer is able to track GUCs reported by Postgres. + * But most parameters cannot be tracked this way. The only parameters that can be tracked are ones + * that Postgres reports to the client. Unfortunately `search_path` is not reported by Postgres: + * https://www.postgresql.org/message-id/flat/CAGECzQQ6xFcgrg%2Be0p9mCumtK362TiA6vTiiZKoYbS8OXggwuQ%40mail.gmail.com#be4bfd7a9cf1f0633bdb2d1790a0a1be + * This code sets GUC_REPORT flag for `search_path`, making it possible to include it in + * pgbouncer's `track_extra_parameters` list.
+ * + * This code is inspired by how the Citus extension does this, see + * https://github.com/citusdata/citus/blob/2a263fe69a707d16ef24378f7650742386b0968f/src/backend/distributed/shared_library_init.c#L2694 + */ +static void +ReportSearchPath(void) +{ +#if PG_VERSION_NUM >= 160000 + int nGucs = 0; + struct config_generic **gucs = get_guc_variables(&nGucs); +#else + struct config_generic **gucs = get_guc_variables(); + int nGucs = GetNumConfigOptions(); +#endif + + for (int i = 0; i < nGucs; i++) + { + struct config_generic *guc = (struct config_generic *) gucs[i]; + + if (strcmp(guc->name, "search_path") == 0) + { + guc->flags |= GUC_REPORT; + } + } +} + void _PG_init(void) { @@ -627,6 +662,8 @@ _PG_init(void) * extension was loaded will be removed. */ EmitWarningsOnPlaceholders("neon"); + + ReportSearchPath(); } PG_FUNCTION_INFO_V1(pg_cluster_size); From ecb01834d645392d508a50d40fa75e746aeef276 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 13 Aug 2024 15:15:55 +0100 Subject: [PATCH 1402/1571] pageserver: implement utilization score (#8703) ## Problem When the utilization API was added, it was just a stub with disk space information. Disk space information isn't a very good metric for assigning tenants to pageservers, because pageservers making full use of their disks would always just have 85% utilization, irrespective of how much pressure they had for disk space. ## Summary of changes - Use the new layer visibility metric to calculate a "wanted size" per tenant, and sum these to get a total local disk space wanted per pageserver. This acts as the primary signal for utilization. - Also use the shard count to calculate a utilization score, and take the max of this and the disk-driven utilization. The shard count limit is currently set as a constant 20,000, which matches contemporary operational practices when loading pageservers. The shard count limit means that for tiny/empty tenants, on a machine with 3.84TB disk, each tiny tenant influences the utilization score as if it had size 160MB. --- libs/pageserver_api/src/models/utilization.rs | 90 +++++++++++++++++-- pageserver/src/http/routes.rs | 5 +- pageserver/src/tenant.rs | 13 +++ pageserver/src/tenant/mgr.rs | 51 +++++++++++ pageserver/src/utilization.rs | 34 +++++-- 5 files changed, 176 insertions(+), 17 deletions(-) diff --git a/libs/pageserver_api/src/models/utilization.rs b/libs/pageserver_api/src/models/utilization.rs index e88cab5d6a..0fec221276 100644 --- a/libs/pageserver_api/src/models/utilization.rs +++ b/libs/pageserver_api/src/models/utilization.rs @@ -1,4 +1,5 @@ -use utils::serde_system_time::SystemTime; +use std::time::SystemTime; +use utils::{serde_percent::Percent, serde_system_time}; /// Pageserver current utilization and scoring for how good candidate the pageserver would be for /// the next tenant. @@ -9,19 +10,88 @@ use utils::serde_system_time::SystemTime; /// not handle full u64 values properly. #[derive(serde::Serialize, serde::Deserialize, Debug, Clone)] pub struct PageserverUtilization { - /// Used disk space + /// Used disk space (physical, ground truth from statfs()) #[serde(serialize_with = "ser_saturating_u63")] pub disk_usage_bytes: u64, /// Free disk space #[serde(serialize_with = "ser_saturating_u63")] pub free_space_bytes: u64, - /// Lower is better score for how good candidate for a next tenant would this pageserver be.
- #[serde(serialize_with = "ser_saturating_u63")] + + /// Wanted disk space, based on the tenant shards currently present on this pageserver: this + /// is like disk_usage_bytes, but it is stable and does not change with the cache state of + /// tenants, whereas disk_usage_bytes may reach the disk eviction `max_usage_pct` and stay + /// there, or may be unrealistically low if the pageserver has attached tenants which haven't + /// downloaded layers yet. + #[serde(serialize_with = "ser_saturating_u63", default)] + pub disk_wanted_bytes: u64, + + // What proportion of total disk space will this pageserver use before it starts evicting data? + #[serde(default = "unity_percent")] + pub disk_usable_pct: Percent, + + // How many shards are currently on this node? + #[serde(default)] + pub shard_count: u32, + + // How many shards should this node be able to handle at most? + #[serde(default)] + pub max_shard_count: u32, + + /// Cached result of [`Self::score`] pub utilization_score: u64, + /// When was this snapshot captured, pageserver local time. /// /// Use millis to give confidence that the value is regenerated often enough. - pub captured_at: SystemTime, + pub captured_at: serde_system_time::SystemTime, +} + +fn unity_percent() -> Percent { + Percent::new(0).unwrap() +} + +impl PageserverUtilization { + const UTILIZATION_FULL: u64 = 1000000; + + /// Calculate a utilization score. The result is to be interpreted as a fraction of + /// Self::UTILIZATION_FULL. + /// + /// Lower values are more affine to scheduling more work on this node. + /// - UTILIZATION_FULL represents an ideal node which is fully utilized but should not receive any more work. + /// - 0.0 represents an empty node. + /// - Negative values are forbidden + /// - Values over UTILIZATION_FULL indicate an overloaded node, which may show degraded performance due to + /// layer eviction. + pub fn score(&self) -> u64 { + let disk_usable_capacity = ((self.disk_usage_bytes + self.free_space_bytes) + * self.disk_usable_pct.get() as u64) + / 100; + let disk_utilization_score = + self.disk_wanted_bytes * Self::UTILIZATION_FULL / disk_usable_capacity; + + let shard_utilization_score = + self.shard_count as u64 * Self::UTILIZATION_FULL / self.max_shard_count as u64; + std::cmp::max(disk_utilization_score, shard_utilization_score) + } + + pub fn refresh_score(&mut self) { + self.utilization_score = self.score(); + } + + /// A utilization structure that has a full utilization score: use this as a placeholder when + /// you need a utilization but don't have real values yet. + pub fn full() -> Self { + Self { + disk_usage_bytes: 1, + free_space_bytes: 0, + disk_wanted_bytes: 1, + disk_usable_pct: Percent::new(100).unwrap(), + shard_count: 1, + max_shard_count: 1, + utilization_score: Self::UTILIZATION_FULL, + captured_at: serde_system_time::SystemTime(SystemTime::now()), + } + } +} /// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients.
@@ -49,15 +119,19 @@ mod tests { let doc = PageserverUtilization { disk_usage_bytes: u64::MAX, free_space_bytes: 0, - utilization_score: u64::MAX, - captured_at: SystemTime( + disk_wanted_bytes: u64::MAX, + utilization_score: 13, + disk_usable_pct: Percent::new(90).unwrap(), + shard_count: 100, + max_shard_count: 200, + captured_at: serde_system_time::SystemTime( std::time::SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779), ), }; let s = serde_json::to_string(&doc).unwrap(); - let expected = r#"{"disk_usage_bytes":9223372036854775807,"free_space_bytes":0,"utilization_score":9223372036854775807,"captured_at":"2024-02-21T10:02:59.000Z"}"#; + let expected = "{\"disk_usage_bytes\":9223372036854775807,\"free_space_bytes\":0,\"disk_wanted_bytes\":9223372036854775807,\"disk_usable_pct\":90,\"shard_count\":100,\"max_shard_count\":200,\"utilization_score\":13,\"captured_at\":\"2024-02-21T10:02:59.000Z\"}"; assert_eq!(s, expected); } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index a983d8c4c2..2b0156079e 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2357,8 +2357,9 @@ async fn get_utilization( // regenerate at most 1Hz to allow polling at any rate. if !still_valid { let path = state.conf.tenants_path(); - let doc = crate::utilization::regenerate(path.as_std_path()) - .map_err(ApiError::InternalServerError)?; + let doc = + crate::utilization::regenerate(state.conf, path.as_std_path(), &state.tenant_manager) + .map_err(ApiError::InternalServerError)?; let mut buf = Vec::new(); serde_json::to_writer(&mut buf, &doc) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index cfdb32f755..a238004aad 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3732,6 +3732,19 @@ impl Tenant { pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt { self.tenant_conf.load().tenant_conf.clone() } + + /// How much local storage would this tenant like to have? It can cope with + /// less than this (via eviction and on-demand downloads), but this function enables + /// the Tenant to advertise how much storage it would prefer to have to provide fast I/O + /// by keeping important things on local disk. + pub(crate) fn local_storage_wanted(&self) -> u64 { + let mut wanted = 0; + let timelines = self.timelines.lock().unwrap(); + for timeline in timelines.values() { + wanted += timeline.metrics.visible_physical_size_gauge.get(); + } + wanted + } } /// Create the cluster temporarily in 'initdbpath' directory inside the repository diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 3316627540..c8a11e88cc 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -2085,6 +2085,57 @@ impl TenantManager { } } } + + /// Calculate the tenant shards' contributions to this pageserver's utilization metrics. The + /// returned values are: + /// - the number of bytes of local disk space this pageserver's shards are requesting, i.e. + /// how much space they would use if not impacted by disk usage eviction. + /// - the number of tenant shards currently on this pageserver, including attached + /// and secondary. + /// + /// This function is quite expensive: callers are expected to cache the result and + /// limit how often they call it. 
+ pub(crate) fn calculate_utilization(&self) -> Result<(u64, u32), TenantMapListError> { + let tenants = self.tenants.read().unwrap(); + let m = match &*tenants { + TenantsMap::Initializing => return Err(TenantMapListError::Initializing), + TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m, + }; + let shard_count = m.len(); + let mut wanted_bytes = 0; + + for tenant_slot in m.values() { + match tenant_slot { + TenantSlot::InProgress(_barrier) => { + // While a slot is being changed, we can't know how much storage it wants. This + // means this function's output can fluctuate if a lot of changes are going on + // (such as transitions from secondary to attached). + // + // We could wait for the barrier and retry, but it's important that the utilization + // API is responsive, and the data quality impact is not very significant. + continue; + } + TenantSlot::Attached(tenant) => { + wanted_bytes += tenant.local_storage_wanted(); + } + TenantSlot::Secondary(secondary) => { + let progress = secondary.progress.lock().unwrap(); + wanted_bytes += if progress.heatmap_mtime.is_some() { + // If we have heatmap info, then we will 'want' the sum + // of the size of layers in the heatmap: this is how much space + // we would use if not doing any eviction. + progress.bytes_total + } else { + // In the absence of heatmap info, assume that the secondary location simply + // needs as much space as it is currently using. + secondary.resident_size_metric.get() + } + } + } + } + + Ok((wanted_bytes, shard_count as u32)) + } } #[derive(Debug, thiserror::Error)] diff --git a/pageserver/src/utilization.rs b/pageserver/src/utilization.rs index e6c835aa75..3c48c84598 100644 --- a/pageserver/src/utilization.rs +++ b/pageserver/src/utilization.rs @@ -5,12 +5,17 @@ use anyhow::Context; use std::path::Path; +use utils::serde_percent::Percent; use pageserver_api::models::PageserverUtilization; -pub(crate) fn regenerate(tenants_path: &Path) -> anyhow::Result { - // TODO: currently the http api ratelimits this to 1Hz at most, which is probably good enough +use crate::{config::PageServerConf, tenant::mgr::TenantManager}; +pub(crate) fn regenerate( + conf: &PageServerConf, + tenants_path: &Path, + tenant_manager: &TenantManager, +) -> anyhow::Result { let statvfs = nix::sys::statvfs::statvfs(tenants_path) .map_err(std::io::Error::from) .context("statvfs tenants directory")?; @@ -34,16 +39,31 @@ pub(crate) fn regenerate(tenants_path: &Path) -> anyhow::Result e.max_usage_pct, + None => Percent::new(100).unwrap(), + }; + + // Express a static value for how many shards we may schedule on one node + const MAX_SHARDS: u32 = 20000; + + let mut doc = PageserverUtilization { disk_usage_bytes: used, free_space_bytes: free, - // lower is better; start with a constant - // - // note that u64::MAX will be output as i64::MAX as u64, but that should not matter - utilization_score: u64::MAX, + disk_wanted_bytes, + disk_usable_pct, + shard_count, + max_shard_count: MAX_SHARDS, + utilization_score: 0, captured_at: utils::serde_system_time::SystemTime(captured_at), }; + doc.refresh_score(); + // TODO: make utilization_score into a metric Ok(doc) From 852a6a7a5aab76b29f54ae41032fdf46d7826903 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 13 Aug 2024 15:28:26 +0100 Subject: [PATCH 1403/1571] CI: mark PRs and issues create by external users (#8694) ## Problem We want to mark new PRs and issues created by external users ## Summary of changes - Add a new workflow which adds `external` label for issues and PRs created by external 
users

---
 .../workflows/label-for-external-users.yml    | 35 +++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100644 .github/workflows/label-for-external-users.yml

diff --git a/.github/workflows/label-for-external-users.yml b/.github/workflows/label-for-external-users.yml
new file mode 100644
index 0000000000..2f19a746e0
--- /dev/null
+++ b/.github/workflows/label-for-external-users.yml
@@ -0,0 +1,35 @@
+name: Add `external` label to issues and PRs created by external users
+
+on:
+  issues:
+    types:
+      - opened
+  pull_request:
+    types:
+      - opened
+
+# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
+permissions: {}
+
+env:
+  LABEL: external
+
+jobs:
+  add-label:
+    # This workflow uses `author_association` for PRs and issues to determine if the user is an external user.
+    # Possible values for `author_association`: https://docs.github.com/en/graphql/reference/enums#commentauthorassociation
+    if: ${{ !contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event[github.event_name == 'pull_request' && 'pull_request' || 'issue'].author_association) }}
+
+    runs-on: ubuntu-22.04
+    permissions:
+      pull-requests: write
+      issues: write
+
+    steps:
+      - name: Label new ${{ github.event_name }}
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          ITEM_NUMBER: ${{ github.event[github.event_name == 'pull_request' && 'pull_request' || 'issue'].number }}
+          GH_CLI_COMMAND: ${{ github.event_name == 'pull_request' && 'pr' || 'issue' }}
+        run: |
+          gh ${GH_CLI_COMMAND} --repo ${GITHUB_REPOSITORY} edit --add-label=${LABEL} ${ITEM_NUMBER}

From e0946e334a7a2812e157411eba243d66cfb43394 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen
Date: Tue, 13 Aug 2024 19:07:51 +0300
Subject: [PATCH 1404/1571] bench: stop immediately in some benches (#8713)

It seems that some benchmarks are failing because they simply do not stop
ingesting WAL on shutdown. It might mean that the tests were never run on a
stable pageserver situation and WAL has always been left to be ingested on
safekeepers, but let's see if this silences the failures and "stops the
bleeding".
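To make the intent concrete, here is a rough sketch (not part of this patch) of what the affected benchmarks now do at the end; the fixtures (`NeonEnvBuilder`, `zenbenchmark`, `Endpoint.safe_psql`) are the ones used elsewhere in `test_runner`, while the benchmark body itself is hypothetical:

```python
# Hypothetical ingest-heavy benchmark; only the final stop call is the point.
def test_some_ingest_bench(neon_env_builder, zenbenchmark):
    env = neon_env_builder.init_start()
    endpoint = env.endpoints.create_start("main")

    with zenbenchmark.record_duration("ingest"):
        endpoint.safe_psql("CREATE TABLE t AS SELECT generate_series(1, 1000000)")

    # The default, graceful env.stop() waits for the pageserver to finish
    # ingesting whatever WAL is still queued on the safekeepers, which can take
    # a long time right after heavy ingest. An immediate stop skips that wait.
    # see https://github.com/neondatabase/neon/issues/8712
    env.stop(immediate=True)
```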
Cc: https://github.com/neondatabase/neon/issues/8712 --- .../pageserver/pagebench/test_ondemand_download_churn.py | 3 +++ test_runner/performance/test_layer_map.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py b/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py index 644c1f559b..0348b08f04 100644 --- a/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py +++ b/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py @@ -62,6 +62,9 @@ def test_download_churn( run_benchmark(env, pg_bin, record, io_engine, concurrency_per_target, duration) + # see https://github.com/neondatabase/neon/issues/8712 + env.stop(immediate=True) + def setup_env(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): remote_storage_kind = s3_storage() diff --git a/test_runner/performance/test_layer_map.py b/test_runner/performance/test_layer_map.py index 9b20954d45..890b70b9fc 100644 --- a/test_runner/performance/test_layer_map.py +++ b/test_runner/performance/test_layer_map.py @@ -36,3 +36,6 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark): with zenbenchmark.record_duration("test_query"): cur.execute("SELECT count(*) from t") assert cur.fetchone() == (n_iters * n_records,) + + # see https://github.com/neondatabase/neon/issues/8712 + env.stop(immediate=True) From 8f170c51059087b26ac098219ca97072f053f513 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 13 Aug 2024 20:00:54 +0300 Subject: [PATCH 1405/1571] fix: make compaction more sensitive to cancellation (#8706) A few of the benchmarks have started failing after #8655 where they are waiting for compactor task. Reads done by image layer creation should already be cancellation sensitive because vectored get does a check each time, but try sprinkling additional cancellation points to: - each partition - after each vectored read batch --- pageserver/src/tenant/timeline.rs | 11 +++++++++++ pageserver/src/tenant/timeline/compaction.rs | 8 ++++++++ test_runner/fixtures/neon_fixtures.py | 2 ++ test_runner/regress/test_pageserver_restart.py | 10 ++++++++++ 4 files changed, 31 insertions(+) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index abe3f56e45..767f5969fc 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3917,6 +3917,10 @@ impl Timeline { .get_vectored(key_request_accum.consume_keyspace(), lsn, ctx) .await?; + if self.cancel.is_cancelled() { + return Err(CreateImageLayersError::Cancelled); + } + for (img_key, img) in results { let img = match img { Ok(img) => img, @@ -4024,6 +4028,9 @@ impl Timeline { next_start_key: img_range.end, }); } + if self.cancel.is_cancelled() { + return Err(CreateImageLayersError::Cancelled); + } let mut wrote_any_image = false; for (k, v) in data { if v.is_empty() { @@ -4138,6 +4145,10 @@ impl Timeline { let check_for_image_layers = self.should_check_if_image_layers_required(lsn); for partition in partitioning.parts.iter() { + if self.cancel.is_cancelled() { + return Err(CreateImageLayersError::Cancelled); + } + let img_range = start..partition.ranges.last().unwrap().end; let compact_metadata = partition.overlaps(&Key::metadata_key_range()); if compact_metadata { diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 8390cb839c..9ac0086cde 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ 
b/pageserver/src/tenant/timeline/compaction.rs @@ -748,6 +748,9 @@ impl Timeline { let all_keys = { let mut all_keys = Vec::new(); for l in deltas_to_compact.iter() { + if self.cancel.is_cancelled() { + return Err(CompactionError::ShuttingDown); + } all_keys.extend(l.load_keys(ctx).await.map_err(CompactionError::Other)?); } // The current stdlib sorting implementation is designed in a way where it is @@ -830,6 +833,11 @@ impl Timeline { }; stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now(); drop_rlock(guard); + + if self.cancel.is_cancelled() { + return Err(CompactionError::ShuttingDown); + } + stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now(); // This iterator walks through all key-value pairs from all the layers diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 561e8bce04..daa4c8b97f 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1251,6 +1251,8 @@ class NeonEnv: def stop(self, immediate=False, ps_assert_metric_no_errors=False, fail_on_endpoint_errors=True): """ After this method returns, there should be no child processes running. + + Unless of course, some stopping failed, in that case, all remaining child processes are leaked. """ self.endpoints.stop_all(fail_on_endpoint_errors) diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index 68a45f957c..bbf82fea4c 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -159,6 +159,8 @@ def test_pageserver_chaos( if build_type == "debug": pytest.skip("times out in debug builds") + # same rationale as with the immediate stop; we might leave orphan layers behind. + neon_env_builder.disable_scrub_on_exit() neon_env_builder.enable_pageserver_remote_storage(s3_storage()) if shard_count is not None: neon_env_builder.num_pageservers = shard_count @@ -220,3 +222,11 @@ def test_pageserver_chaos( # Check that all the updates are visible num_updates = endpoint.safe_psql("SELECT sum(updates) FROM foo")[0][0] assert num_updates == i * 100000 + + # currently pageserver cannot tolerate the fact that "s3" goes away, and if + # we succeeded in a compaction before shutdown, there might be a lot of + # uploads pending, certainly more than what we can ingest with MOCK_S3 + # + # so instead, do a fast shutdown for this one test. + # See https://github.com/neondatabase/neon/issues/8709 + env.stop(immediate=True) From ae6e27274cecaf60b0a1388cb9344bc8987a4d3f Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 13 Aug 2024 20:14:42 +0300 Subject: [PATCH 1406/1571] refactor(test): unify how we clear shared buffers (#8634) so that we can easily plug in LFC clearing as well. 
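For illustration, the call pattern after this refactor looks roughly like the sketch below (not part of the patch; it assumes the `Endpoint` fixture from `test_runner/fixtures/neon_fixtures.py` and a hypothetical table name):

```python
# Sketch: go through the shared helper instead of hand-writing
# "select clear_buffer_cache()" in every test.
def force_getpage_requests(endpoint, table="t"):
    # Best-effort eviction of (unpinned) pages from shared buffers; the helper
    # may later also clear the LFC.
    endpoint.clear_shared_buffers()
    # The next read cannot be served from the buffer cache, so the compute has
    # to issue GetPage requests to the pageserver again.
    return endpoint.safe_psql(f"SELECT count(*) FROM {table}")[0][0]

# When a connection is already open, an existing cursor can be reused:
#   endpoint.clear_shared_buffers(cursor=cur)
```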
Private discussion reference: --- test_runner/fixtures/neon_fixtures.py | 11 +++++++++++ test_runner/fixtures/workload.py | 12 +++--------- test_runner/regress/test_hot_standby.py | 3 ++- test_runner/regress/test_oid_overflow.py | 2 +- test_runner/regress/test_read_validation.py | 2 +- test_runner/regress/test_timeline_detach_ancestor.py | 2 +- test_runner/regress/test_vm_bits.py | 2 +- test_runner/regress/test_wal_acceptor.py | 2 +- 8 files changed, 21 insertions(+), 15 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index daa4c8b97f..6600b44759 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4100,6 +4100,17 @@ class Endpoint(PgProtocol, LogUtils): assert self.pgdata_dir is not None # please mypy return get_dir_size(os.path.join(self.pgdata_dir, "pg_wal")) / 1024 / 1024 + def clear_shared_buffers(self, cursor: Optional[Any] = None): + """ + Best-effort way to clear postgres buffers. Pinned buffers will not be 'cleared.' + + Might also clear LFC. + """ + if cursor is not None: + cursor.execute("select clear_buffer_cache()") + else: + self.safe_psql("select clear_buffer_cache()") + class EndpointFactory: """An object representing multiple compute endpoints.""" diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py index dfd9caba3e..cc93762175 100644 --- a/test_runner/fixtures/workload.py +++ b/test_runner/fixtures/workload.py @@ -182,14 +182,8 @@ class Workload: def validate(self, pageserver_id: Optional[int] = None): endpoint = self.endpoint(pageserver_id) - result = endpoint.safe_psql_many( - [ - "select clear_buffer_cache()", - f""" - SELECT COUNT(*) FROM {self.table} - """, - ] - ) + endpoint.clear_shared_buffers() + result = endpoint.safe_psql(f"SELECT COUNT(*) FROM {self.table}") log.info(f"validate({self.expect_rows}): {result}") - assert result == [[("",)], [(self.expect_rows,)]] + assert result == [(self.expect_rows,)] diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index 8edc8c554c..ae63136abb 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -168,7 +168,7 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool): # re-execute the query, it will make GetPage # requests. This does not clear the last-written LSN cache # so we still remember the LSNs of the pages. - s_cur.execute("SELECT clear_buffer_cache()") + secondary.clear_shared_buffers(cursor=s_cur) if pause_apply: s_cur.execute("SELECT pg_wal_replay_pause()") @@ -332,6 +332,7 @@ def test_replica_query_race(neon_simple_env: NeonEnv): log.info(f"read {reads}: counter {readcounter}, last update {writecounter}") reads += 1 + # FIXME: what about LFC clearing? 
await conn.execute("SELECT clear_buffer_cache()") async def both(): diff --git a/test_runner/regress/test_oid_overflow.py b/test_runner/regress/test_oid_overflow.py index a94ae99ed9..e8eefc2414 100644 --- a/test_runner/regress/test_oid_overflow.py +++ b/test_runner/regress/test_oid_overflow.py @@ -37,7 +37,7 @@ def test_oid_overflow(neon_env_builder: NeonEnvBuilder): oid = cur.fetchall()[0][0] log.info(f"t2.relfilenode={oid}") - cur.execute("SELECT clear_buffer_cache()") + endpoint.clear_shared_buffers(cursor=cur) cur.execute("SELECT x from t1") assert cur.fetchone() == (1,) diff --git a/test_runner/regress/test_read_validation.py b/test_runner/regress/test_read_validation.py index 2437c8f806..d128c60a99 100644 --- a/test_runner/regress/test_read_validation.py +++ b/test_runner/regress/test_read_validation.py @@ -61,7 +61,7 @@ def test_read_validation(neon_simple_env: NeonEnv): log.info("Clear buffer cache to ensure no stale pages are brought into the cache") - c.execute("select clear_buffer_cache()") + endpoint.clear_shared_buffers(cursor=c) cache_entries = query_scalar( c, f"select count(*) from pg_buffercache where relfilenode = {relfilenode}" diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index b3767a2766..4e409eeb17 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -411,7 +411,7 @@ def test_detached_receives_flushes_while_being_detached(neon_env_builder: NeonEn assert client.timeline_detail(env.initial_tenant, timeline_id)["ancestor_timeline_id"] is None - assert ep.safe_psql("SELECT clear_buffer_cache();") + ep.clear_shared_buffers() assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows assert ep.safe_psql("SELECT SUM(LENGTH(aux)) FROM foo")[0][0] != 0 ep.stop() diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py index 225b952e73..7272979c4a 100644 --- a/test_runner/regress/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -62,7 +62,7 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv): # Clear the buffer cache, to force the VM page to be re-fetched from # the page server - cur.execute("SELECT clear_buffer_cache()") + endpoint.clear_shared_buffers(cursor=cur) # Check that an index-only scan doesn't see the deleted row. If the # clearing of the VM bit was not replayed correctly, this would incorrectly diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index f02f19c588..bf7829fc84 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -2159,7 +2159,7 @@ def test_broker_discovery(neon_env_builder: NeonEnvBuilder): # generate some data to commit WAL on safekeepers endpoint.safe_psql("insert into t select generate_series(1,100), 'action'") # clear the buffers - endpoint.safe_psql("select clear_buffer_cache()") + endpoint.clear_shared_buffers() # read data to fetch pages from pageserver endpoint.safe_psql("select sum(i) from t") From 9d2276323d8f375c7394a92f71fd8c728d61ab96 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Tue, 13 Aug 2024 19:36:39 +0200 Subject: [PATCH 1407/1571] Benchmarking tests: automatically restore Neon reuse databases, too and migrate to pg16 (#8707) ## Problem We use a set of **Neon** reuse databases in benchmarking.yml which are still using pg14. 
Because we want to compare apples to apples and have migrated the AWS reuse clusters to pg16 we should also use pg16 for Neon. ## Summary of changes - Automatically restore the test databases for Neon project --- .github/workflows/_benchmarking_preparation.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/_benchmarking_preparation.yml b/.github/workflows/_benchmarking_preparation.yml index 0f540afab7..7229776cd6 100644 --- a/.github/workflows/_benchmarking_preparation.yml +++ b/.github/workflows/_benchmarking_preparation.yml @@ -13,7 +13,7 @@ jobs: strategy: fail-fast: false matrix: - platform: [ aws-rds-postgres, aws-aurora-serverless-v2-postgres ] + platform: [ aws-rds-postgres, aws-aurora-serverless-v2-postgres, neon ] database: [ clickbench, tpch, userexample ] env: @@ -31,6 +31,9 @@ jobs: id: set-up-prep-connstr run: | case "${PLATFORM}" in + neon) + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} + ;; aws-rds-postgres) CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }} ;; From 87a5d7db9e0f47eab8b48cebc6ab47045a8c9b1b Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 13 Aug 2024 20:49:50 +0300 Subject: [PATCH 1408/1571] test: do better job of shutting everything down (#8714) After #8655 we've had a few issues (mostly tracked on #8708) with the graceful shutdown. In order to shutdown more of the processes and catch more errors, for example, from all pageservers, do an immediate shutdown for those nodes which fail the initial (possibly graceful) shutdown. Cc: #6485 --- test_runner/fixtures/neon_fixtures.py | 40 +++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 6600b44759..961dbde95c 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1254,20 +1254,54 @@ class NeonEnv: Unless of course, some stopping failed, in that case, all remaining child processes are leaked. """ - self.endpoints.stop_all(fail_on_endpoint_errors) + + # the commonly failing components have special try-except behavior, + # trying to get us to actually shutdown all processes over easier error + # reporting. 
+ + raise_later = None + try: + self.endpoints.stop_all(fail_on_endpoint_errors) + except Exception as e: + raise_later = e # Stop storage controller before pageservers: we don't want it to spuriously # detect a pageserver "failure" during test teardown self.storage_controller.stop(immediate=immediate) + stop_later = [] + metric_errors = [] + for sk in self.safekeepers: sk.stop(immediate=immediate) for pageserver in self.pageservers: if ps_assert_metric_no_errors: - pageserver.assert_no_metric_errors() - pageserver.stop(immediate=immediate) + try: + pageserver.assert_no_metric_errors() + except Exception as e: + metric_errors.append(e) + log.error(f"metric validation failed on {pageserver.id}: {e}") + try: + pageserver.stop(immediate=immediate) + except RuntimeError: + stop_later.append(pageserver) self.broker.stop(immediate=immediate) + # TODO: for nice logging we need python 3.11 ExceptionGroup + for ps in stop_later: + ps.stop(immediate=True) + + if raise_later is not None: + raise raise_later + + for error in metric_errors: + raise error + + if len(stop_later) > 0: + raise RuntimeError( + f"{len(stop_later)} out of {len(self.pageservers)} pageservers failed to stop gracefully" + ) + @property def pageserver(self) -> NeonPageserver: """ From 6d6e2c6a395d365852df478b37bbf50af1e48e6b Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 13 Aug 2024 20:51:51 +0300 Subject: [PATCH 1409/1571] feat(detach_ancestor): better retries with persistent gc blocking (#8430) With the persistent gc blocking, we can now retry reparenting timelines which had failed for whatever reason on the previous attempt(s). Restructure the detach_ancestor into three phases: - prepare (insert persistent gc blocking, copy lsn prefix, layers) - detach and reparent - reparenting can fail, so we might need to retry this portion - complete (remove persistent gc blocking) Cc: #6994 --- libs/utils/src/completion.rs | 31 +- pageserver/src/http/routes.rs | 4 +- pageserver/src/tenant.rs | 12 +- pageserver/src/tenant/metadata.rs | 3 + pageserver/src/tenant/mgr.rs | 119 +++- .../src/tenant/remote_timeline_client.rs | 70 ++- .../tenant/remote_timeline_client/index.rs | 57 +- pageserver/src/tenant/tasks.rs | 14 +- pageserver/src/tenant/timeline.rs | 30 +- .../src/tenant/timeline/detach_ancestor.rs | 528 ++++++++++++++---- storage_controller/src/service.rs | 4 +- .../regress/test_timeline_detach_ancestor.py | 327 ++++++++++- 12 files changed, 960 insertions(+), 239 deletions(-) diff --git a/libs/utils/src/completion.rs b/libs/utils/src/completion.rs index 2fef8d35df..f65c080ad4 100644 --- a/libs/utils/src/completion.rs +++ b/libs/utils/src/completion.rs @@ -5,13 +5,40 @@ use tokio_util::task::{task_tracker::TaskTrackerToken, TaskTracker}; /// Can be cloned, moved and kept around in futures as "guard objects". #[derive(Clone)] pub struct Completion { - _token: TaskTrackerToken, + token: TaskTrackerToken, +} + +impl std::fmt::Debug for Completion { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Completion") + .field("siblings", &self.token.task_tracker().len()) + .finish() + } +} + +impl Completion { + /// Returns true if this completion is associated with the given barrier. + pub fn blocks(&self, barrier: &Barrier) -> bool { + TaskTracker::ptr_eq(self.token.task_tracker(), &barrier.0) + } + + pub fn barrier(&self) -> Barrier { + Barrier(self.token.task_tracker().clone()) + } } /// Barrier will wait until all clones of [`Completion`] have been dropped. 
#[derive(Clone)] pub struct Barrier(TaskTracker); +impl std::fmt::Debug for Barrier { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Barrier") + .field("remaining", &self.0.len()) + .finish() + } +} + impl Default for Barrier { fn default() -> Self { let (_, rx) = channel(); @@ -51,5 +78,5 @@ pub fn channel() -> (Completion, Barrier) { tracker.close(); let token = tracker.token(); - (Completion { _token: token }, Barrier(tracker)) + (Completion { token }, Barrier(tracker)) } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 2b0156079e..6f7480cc6c 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1887,7 +1887,7 @@ async fn timeline_detach_ancestor_handler( // drop(tenant); let resp = match progress { - detach_ancestor::Progress::Prepared(_guard, prepared) => { + detach_ancestor::Progress::Prepared(attempt, prepared) => { // it would be great to tag the guard on to the tenant activation future let reparented_timelines = state .tenant_manager @@ -1895,10 +1895,10 @@ async fn timeline_detach_ancestor_handler( tenant_shard_id, timeline_id, prepared, + attempt, ctx, ) .await - .context("timeline detach ancestor completion") .map_err(ApiError::InternalServerError)?; AncestorDetached { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index a238004aad..b065f58382 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -302,7 +302,11 @@ pub struct Tenant { pub(crate) timeline_get_throttle: Arc>, - /// An ongoing timeline detach must be checked during attempts to GC or compact a timeline. + /// An ongoing timeline detach concurrency limiter. + /// + /// As a tenant will likely be restarted as part of timeline detach ancestor it makes no sense + /// to have two running at the same time. A different one can be started if an earlier one + /// has failed for whatever reason. ongoing_timeline_detach: std::sync::Mutex>, /// `index_part.json` based gc blocking reason tracking. @@ -833,9 +837,9 @@ impl Tenant { // The Stopping case is for when we have passed control on to DeleteTenantFlow: // if it errors, we will call make_broken when tenant is already in Stopping. assert!( - matches!(*state, TenantState::Attaching | TenantState::Stopping { .. }), - "the attach task owns the tenant state until activation is complete" - ); + matches!(*state, TenantState::Attaching | TenantState::Stopping { .. }), + "the attach task owns the tenant state until activation is complete" + ); *state = TenantState::broken_from_reason(err.to_string()); }); diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index bbc070a81b..6073abc8c3 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -285,12 +285,15 @@ impl TimelineMetadata { } /// When reparenting, the `ancestor_lsn` does not change. + /// + /// Returns true if anything was changed. 
pub fn reparent(&mut self, timeline: &TimelineId) { assert!(self.body.ancestor_timeline.is_some()); // no assertion for redoing this: it's fine, we may have to repeat this multiple times over self.body.ancestor_timeline = Some(*timeline); } + /// Returns true if anything was changed pub fn detach_from_ancestor(&mut self, branchpoint: &(TimelineId, Lsn)) { if let Some(ancestor) = self.body.ancestor_timeline { assert_eq!(ancestor, branchpoint.0); diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index c8a11e88cc..5f2539d426 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -54,7 +54,7 @@ use utils::id::{TenantId, TimelineId}; use super::remote_timeline_client::remote_tenant_path; use super::secondary::SecondaryTenant; -use super::timeline::detach_ancestor::PreparedTimelineDetach; +use super::timeline::detach_ancestor::{self, PreparedTimelineDetach}; use super::{GlobalShutDown, TenantSharedResources}; /// For a tenant that appears in TenantsMap, it may either be @@ -1927,8 +1927,10 @@ impl TenantManager { tenant_shard_id: TenantShardId, timeline_id: TimelineId, prepared: PreparedTimelineDetach, + mut attempt: detach_ancestor::Attempt, ctx: &RequestContext, ) -> Result, anyhow::Error> { + use crate::tenant::timeline::detach_ancestor::Error; // FIXME: this is unnecessary, slotguard already has these semantics struct RevertOnDropSlot(Option); @@ -1977,43 +1979,98 @@ impl TenantManager { let timeline = tenant.get_timeline(timeline_id, true)?; - let reparented = timeline - .complete_detaching_timeline_ancestor(&tenant, prepared, ctx) + let resp = timeline + .detach_from_ancestor_and_reparent(&tenant, prepared, ctx) .await?; let mut slot_guard = slot_guard.into_inner(); - let (_guard, progress) = utils::completion::channel(); - match tenant.shutdown(progress, ShutdownMode::Hard).await { - Ok(()) => { - slot_guard.drop_old_value()?; + let tenant = if resp.reset_tenant_required() { + attempt.before_reset_tenant(); + + let (_guard, progress) = utils::completion::channel(); + match tenant.shutdown(progress, ShutdownMode::Hard).await { + Ok(()) => { + slot_guard.drop_old_value()?; + } + Err(_barrier) => { + slot_guard.revert(); + // this really should not happen, at all, unless shutdown was already going? + anyhow::bail!("Cannot restart Tenant, already shutting down"); + } } - Err(_barrier) => { - slot_guard.revert(); - // this really should not happen, at all, unless shutdown was already going? - anyhow::bail!("Cannot restart Tenant, already shutting down"); + + let tenant_path = self.conf.tenant_path(&tenant_shard_id); + let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?; + + let shard_identity = config.shard; + let tenant = tenant_spawn( + self.conf, + tenant_shard_id, + &tenant_path, + self.resources.clone(), + AttachedTenantConf::try_from(config)?, + shard_identity, + None, + SpawnMode::Eager, + ctx, + )?; + + { + let mut g = tenant.ongoing_timeline_detach.lock().unwrap(); + assert!( + g.is_none(), + "there cannot be any new timeline detach ancestor on newly created tenant" + ); + *g = Some((attempt.timeline_id, attempt.new_barrier())); } + + slot_guard.upsert(TenantSlot::Attached(tenant.clone()))?; + tenant + } else { + tracing::info!("skipping tenant_reset as no changes made required it"); + tenant + }; + + if let Some(reparented) = resp.completed() { + // finally ask the restarted tenant to complete the detach + // + // rationale for 9999s: we don't really have a timetable here; if retried, the caller + // will get an 503. 
+ tenant + .wait_to_become_active(std::time::Duration::from_secs(9999)) + .await + .map_err(|e| { + use pageserver_api::models::TenantState; + use GetActiveTenantError::{Cancelled, WillNotBecomeActive}; + match e { + Cancelled | WillNotBecomeActive(TenantState::Stopping { .. }) => { + Error::ShuttingDown + } + other => Error::Unexpected(other.into()), + } + })?; + + utils::pausable_failpoint!( + "timeline-detach-ancestor::after_activating_before_finding-pausable" + ); + + let timeline = tenant + .get_timeline(attempt.timeline_id, true) + .map_err(|_| Error::DetachedNotFoundAfterRestart)?; + + timeline + .complete_detaching_timeline_ancestor(&tenant, attempt, ctx) + .await + .map(|()| reparented) + .map_err(|e| e.into()) + } else { + // at least the latest versions have now been downloaded and refreshed; be ready to + // retry another time. + Err(anyhow::anyhow!( + "failed to reparent all candidate timelines, please retry" + )) } - - let tenant_path = self.conf.tenant_path(&tenant_shard_id); - let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?; - - let shard_identity = config.shard; - let tenant = tenant_spawn( - self.conf, - tenant_shard_id, - &tenant_path, - self.resources.clone(), - AttachedTenantConf::try_from(config)?, - shard_identity, - None, - SpawnMode::Eager, - ctx, - )?; - - slot_guard.upsert(TenantSlot::Attached(tenant))?; - - Ok(reparented) } /// A page service client sends a TenantId, and to look up the correct Tenant we must diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 8a76d7532f..b4d7ad1e97 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -736,12 +736,13 @@ impl RemoteTimelineClient { Ok(()) } + /// Reparent this timeline to a new parent. + /// + /// A retryable step of timeline ancestor detach. 
pub(crate) async fn schedule_reparenting_and_wait( self: &Arc, new_parent: &TimelineId, ) -> anyhow::Result<()> { - // FIXME: because of how Timeline::schedule_uploads works when called from layer flushing - // and reads the in-memory part we cannot do the detaching like this let receiver = { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; @@ -752,17 +753,25 @@ impl RemoteTimelineClient { )); }; - upload_queue.dirty.metadata.reparent(new_parent); - upload_queue.dirty.lineage.record_previous_ancestor(&prev); + let uploaded = &upload_queue.clean.0.metadata; - self.schedule_index_upload(upload_queue)?; + if uploaded.ancestor_timeline().is_none() && !uploaded.ancestor_lsn().is_valid() { + // nothing to do + None + } else { + upload_queue.dirty.metadata.reparent(new_parent); + upload_queue.dirty.lineage.record_previous_ancestor(&prev); - self.schedule_barrier0(upload_queue) + self.schedule_index_upload(upload_queue)?; + + Some(self.schedule_barrier0(upload_queue)) + } }; - Self::wait_completion0(receiver) - .await - .context("wait completion") + if let Some(receiver) = receiver { + Self::wait_completion0(receiver).await?; + } + Ok(()) } /// Schedules uploading a new version of `index_part.json` with the given layers added, @@ -778,26 +787,30 @@ impl RemoteTimelineClient { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; - upload_queue.dirty.metadata.detach_from_ancestor(&adopted); - upload_queue.dirty.lineage.record_detaching(&adopted); + if upload_queue.clean.0.lineage.detached_previous_ancestor() == Some(adopted) { + None + } else { + upload_queue.dirty.metadata.detach_from_ancestor(&adopted); + upload_queue.dirty.lineage.record_detaching(&adopted); - for layer in layers { - upload_queue - .dirty - .layer_metadata - .insert(layer.layer_desc().layer_name(), layer.metadata()); + for layer in layers { + let prev = upload_queue + .dirty + .layer_metadata + .insert(layer.layer_desc().layer_name(), layer.metadata()); + assert!(prev.is_none(), "copied layer existed already {layer}"); + } + + self.schedule_index_upload(upload_queue)?; + + Some(self.schedule_barrier0(upload_queue)) } - - self.schedule_index_upload(upload_queue)?; - - let barrier = self.schedule_barrier0(upload_queue); - self.launch_queued_tasks(upload_queue); - barrier }; - Self::wait_completion0(barrier) - .await - .context("wait completion") + if let Some(barrier) = barrier { + Self::wait_completion0(barrier).await?; + } + Ok(()) } /// Adds a gc blocking reason for this timeline if one does not exist already. 
@@ -873,12 +886,7 @@ impl RemoteTimelineClient { let upload_queue = guard.initialized_mut()?; if let index::GcBlockingReason::DetachAncestor = reason { - if !upload_queue - .clean - .0 - .lineage - .is_detached_from_original_ancestor() - { + if !upload_queue.clean.0.lineage.is_detached_from_ancestor() { drop(guard); panic!("cannot complete timeline_ancestor_detach while not detached"); } diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 90453b1922..757fb9d032 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -216,26 +216,47 @@ fn is_false(b: &bool) -> bool { impl Lineage { const REMEMBER_AT_MOST: usize = 100; - pub(crate) fn record_previous_ancestor(&mut self, old_ancestor: &TimelineId) { + pub(crate) fn record_previous_ancestor(&mut self, old_ancestor: &TimelineId) -> bool { if self.reparenting_history.last() == Some(old_ancestor) { // do not re-record it - return; - } + false + } else { + #[cfg(feature = "testing")] + { + let existing = self + .reparenting_history + .iter() + .position(|x| x == old_ancestor); + assert_eq!( + existing, None, + "we cannot reparent onto and off and onto the same timeline twice" + ); + } + let drop_oldest = self.reparenting_history.len() + 1 >= Self::REMEMBER_AT_MOST; - let drop_oldest = self.reparenting_history.len() + 1 >= Self::REMEMBER_AT_MOST; - - self.reparenting_history_truncated |= drop_oldest; - if drop_oldest { - self.reparenting_history.remove(0); + self.reparenting_history_truncated |= drop_oldest; + if drop_oldest { + self.reparenting_history.remove(0); + } + self.reparenting_history.push(*old_ancestor); + true } - self.reparenting_history.push(*old_ancestor); } - pub(crate) fn record_detaching(&mut self, branchpoint: &(TimelineId, Lsn)) { - assert!(self.original_ancestor.is_none()); - - self.original_ancestor = - Some((branchpoint.0, branchpoint.1, chrono::Utc::now().naive_utc())); + /// Returns true if anything changed. + pub(crate) fn record_detaching(&mut self, branchpoint: &(TimelineId, Lsn)) -> bool { + if let Some((id, lsn, _)) = self.original_ancestor { + assert_eq!( + &(id, lsn), + branchpoint, + "detaching attempt has to be for the same ancestor we are already detached from" + ); + false + } else { + self.original_ancestor = + Some((branchpoint.0, branchpoint.1, chrono::Utc::now().naive_utc())); + true + } } /// The queried lsn is most likely the basebackup lsn, and this answers question "is it allowed @@ -247,10 +268,16 @@ impl Lineage { .is_some_and(|(_, ancestor_lsn, _)| ancestor_lsn == lsn) } - pub(crate) fn is_detached_from_original_ancestor(&self) -> bool { + /// Returns true if the timeline originally had an ancestor, and no longer has one. + pub(crate) fn is_detached_from_ancestor(&self) -> bool { self.original_ancestor.is_some() } + /// Returns original ancestor timeline id and lsn that this timeline has been detached from. 
+ pub(crate) fn detached_previous_ancestor(&self) -> Option<(TimelineId, Lsn)> { + self.original_ancestor.map(|(id, lsn, _)| (id, lsn)) + } + pub(crate) fn is_reparented(&self) -> bool { !self.reparenting_history.is_empty() } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 713845e9ac..dbcd704b4e 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -366,14 +366,13 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { if first { first = false; - if delay_by_lease_length(tenant.get_lsn_lease_length(), &cancel) - .await - .is_err() - { - break; - } + let delays = async { + delay_by_lease_length(tenant.get_lsn_lease_length(), &cancel).await?; + random_init_delay(period, &cancel).await?; + Ok::<_, Cancelled>(()) + }; - if random_init_delay(period, &cancel).await.is_err() { + if delays.await.is_err() { break; } } @@ -425,7 +424,6 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc); - // Sleep if tokio::time::timeout(sleep_duration, cancel.cancelled()) .await .is_ok() diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 767f5969fc..f1587951c6 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4328,18 +4328,34 @@ impl Timeline { detach_ancestor::prepare(self, tenant, options, ctx).await } - /// Completes the ancestor detach. This method is to be called while holding the - /// TenantManager's tenant slot, so during this method we cannot be deleted nor can any - /// timeline be deleted. After this method returns successfully, tenant must be reloaded. + /// Second step of detach from ancestor; detaches the `self` from it's current ancestor and + /// reparents any reparentable children of previous ancestor. /// - /// Pageserver receiving a SIGKILL during this operation is not supported (yet). - pub(crate) async fn complete_detaching_timeline_ancestor( + /// This method is to be called while holding the TenantManager's tenant slot, so during this + /// method we cannot be deleted nor can any timeline be deleted. After this method returns + /// successfully, tenant must be reloaded. + /// + /// Final step will be to [`Self::complete_detaching_timeline_ancestor`] after optionally + /// resetting the tenant. + pub(crate) async fn detach_from_ancestor_and_reparent( self: &Arc, tenant: &crate::tenant::Tenant, prepared: detach_ancestor::PreparedTimelineDetach, ctx: &RequestContext, - ) -> Result, anyhow::Error> { - detach_ancestor::complete(self, tenant, prepared, ctx).await + ) -> Result { + detach_ancestor::detach_and_reparent(self, tenant, prepared, ctx).await + } + + /// Final step which unblocks the GC. + /// + /// The tenant must've been reset if ancestry was modified previously (in tenant manager). + pub(crate) async fn complete_detaching_timeline_ancestor( + self: &Arc, + tenant: &crate::tenant::Tenant, + attempt: detach_ancestor::Attempt, + ctx: &RequestContext, + ) -> Result<(), detach_ancestor::Error> { + detach_ancestor::complete(self, tenant, attempt, ctx).await } /// Switch aux file policy and schedule upload to the index part. 
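From a caller's point of view, the retry behaviour these two steps enable can be exercised roughly as in the following sketch (editorial illustration, not part of the patch): `detach_ancestor` and `PageserverApiException` are the test helpers used in `test_timeline_detach_ancestor.py` further below, while the import path, return value and retry policy are assumptions.

```python
import time

from fixtures.pageserver.http import PageserverApiException  # assumed import path


# Illustrative retry loop: because gc stays persistently blocked, a failed or
# interrupted attempt can simply be retried; layer copies and reparentings that
# already succeeded in earlier attempts are not repeated.
def detach_with_retries(client, tenant_id, timeline_id, attempts=10):
    for attempt in range(attempts):
        try:
            # assumed to return the reparented timelines once the whole
            # operation (detach, reparent, complete) has finished
            return client.detach_ancestor(tenant_id, timeline_id)
        except PageserverApiException as e:
            # 503 signals a transient condition (shutdown, tenant restarting,
            # another attempt in progress); anything else is a real error.
            if e.status_code != 503 or attempt == attempts - 1:
                raise
            time.sleep(1)
```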
diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 3b52adc77b..969da2662b 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -5,12 +5,16 @@ use crate::{ context::{DownloadBehavior, RequestContext}, task_mgr::TaskKind, tenant::{ + mgr::GetActiveTenantError, + remote_timeline_client::index::GcBlockingReason::DetachAncestor, storage_layer::{AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer}, Tenant, }, virtual_file::{MaybeFatalIo, VirtualFile}, }; +use anyhow::Context; use pageserver_api::models::detach_ancestor::AncestorDetached; +use tokio::sync::Semaphore; use tokio_util::sync::CancellationToken; use tracing::Instrument; use utils::{completion, generation::Generation, http::error::ApiError, id::TimelineId, lsn::Lsn}; @@ -38,6 +42,12 @@ pub(crate) enum Error { #[error("remote copying layer failed")] CopyFailed(#[source] anyhow::Error), + #[error("wait for tenant to activate after restarting")] + WaitToActivate(#[source] GetActiveTenantError), + + #[error("detached timeline was not found after restart")] + DetachedNotFoundAfterRestart, + #[error("unexpected error")] Unexpected(#[source] anyhow::Error), @@ -55,6 +65,10 @@ impl From for ApiError { Error::OtherTimelineDetachOngoing(_) => { ApiError::ResourceUnavailable("other timeline detach is already ongoing".into()) } + e @ Error::WaitToActivate(_) => { + let s = utils::error::report_compact_sources(&e).to_string(); + ApiError::ResourceUnavailable(s.into()) + } // All of these contain shutdown errors, in fact, it's the most common e @ Error::FlushAncestor(_) | e @ Error::RewrittenDeltaDownloadFailed(_) @@ -63,6 +77,7 @@ impl From for ApiError { | e @ Error::CopyFailed(_) | e @ Error::Unexpected(_) | e @ Error::Failpoint(_) => ApiError::InternalServerError(e.into()), + Error::DetachedNotFoundAfterRestart => ApiError::NotFound(value.into()), } } } @@ -96,8 +111,25 @@ impl From for Error { } } +impl From for Error { + fn from(value: GetActiveTenantError) -> Self { + use pageserver_api::models::TenantState; + use GetActiveTenantError::*; + + match value { + Cancelled | WillNotBecomeActive(TenantState::Stopping { .. }) | SwitchedTenant => { + Error::ShuttingDown + } + WaitForActiveTimeout { .. } | NotFound(_) | Broken(_) | WillNotBecomeActive(_) => { + // NotFound seems out-of-place + Error::WaitToActivate(value) + } + } + } +} + pub(crate) enum Progress { - Prepared(completion::Completion, PreparedTimelineDetach), + Prepared(Attempt, PreparedTimelineDetach), Done(AncestorDetached), } @@ -121,6 +153,26 @@ impl Default for Options { } } +/// Represents an across tenant reset exclusive single attempt to detach ancestor. 
+#[derive(Debug)] +pub(crate) struct Attempt { + pub(crate) timeline_id: TimelineId, + + _guard: completion::Completion, + gate_entered: Option, +} + +impl Attempt { + pub(crate) fn before_reset_tenant(&mut self) { + let taken = self.gate_entered.take(); + assert!(taken.is_some()); + } + + pub(crate) fn new_barrier(&self) -> completion::Barrier { + self._guard.barrier() + } +} + /// See [`Timeline::prepare_to_detach_from_ancestor`] pub(super) async fn prepare( detached: &Arc, @@ -135,15 +187,33 @@ pub(super) async fn prepare( .as_ref() .map(|tl| (tl.clone(), detached.ancestor_lsn)) else { - { + let still_in_progress = { let accessor = detached.remote_client.initialized_upload_queue()?; // we are safe to inspect the latest uploaded, because we can only witness this after // restart is complete and ancestor is no more. let latest = accessor.latest_uploaded_index_part(); - if !latest.lineage.is_detached_from_original_ancestor() { + if latest.lineage.detached_previous_ancestor().is_none() { return Err(NoAncestor); - } + }; + + latest + .gc_blocking + .as_ref() + .is_some_and(|b| b.blocked_by(DetachAncestor)) + }; + + if still_in_progress { + // gc is still blocked, we can still reparent and complete. + // we are safe to reparent remaining, because they were locked in in the beginning. + let attempt = continue_with_blocked_gc(detached, tenant).await?; + + // because the ancestor of detached is already set to none, we have published all + // of the layers, so we are still "prepared." + return Ok(Progress::Prepared( + attempt, + PreparedTimelineDetach { layers: Vec::new() }, + )); } let reparented_timelines = reparented_direct_children(detached, tenant)?; @@ -164,22 +234,7 @@ pub(super) async fn prepare( return Err(TooManyAncestors); } - // before we acquire the gate, we must mark the ancestor as having a detach operation - // ongoing which will block other concurrent detach operations so we don't get to ackward - // situations where there would be two branches trying to reparent earlier branches. - let (guard, barrier) = completion::channel(); - - { - let mut guard = tenant.ongoing_timeline_detach.lock().unwrap(); - if let Some((tl, other)) = guard.as_ref() { - if !other.is_ready() { - return Err(OtherTimelineDetachOngoing(*tl)); - } - } - *guard = Some((detached.timeline_id, barrier)); - } - - let _gate_entered = detached.gate.enter().map_err(|_| ShuttingDown)?; + let attempt = start_new_attempt(detached, tenant).await?; utils::pausable_failpoint!("timeline-detach-ancestor::before_starting_after_locking_pausable"); @@ -245,7 +300,8 @@ pub(super) async fn prepare( }; // TODO: layers are already sorted by something: use that to determine how much of remote - // copies are already done. + // copies are already done -- gc is blocked, but a compaction could had happened on ancestor, + // which is something to keep in mind if copy skipping is implemented. 
tracing::info!(filtered=%filtered_layers, to_rewrite = straddling_branchpoint.len(), historic=%rest_of_historic.len(), "collected layers"); // TODO: copying and lsn prefix copying could be done at the same time with a single fsync after @@ -259,29 +315,33 @@ pub(super) async fn prepare( let mut wrote_any = false; - let limiter = Arc::new(tokio::sync::Semaphore::new( - options.rewrite_concurrency.get(), - )); + let limiter = Arc::new(Semaphore::new(options.rewrite_concurrency.get())); for layer in straddling_branchpoint { let limiter = limiter.clone(); let timeline = detached.clone(); let ctx = ctx.detached_child(TaskKind::DetachAncestor, DownloadBehavior::Download); - tasks.spawn(async move { - let _permit = limiter.acquire().await; - let copied = - upload_rewritten_layer(end_lsn, &layer, &timeline, &timeline.cancel, &ctx) - .await?; - Ok(copied) - }); + let span = tracing::info_span!("upload_rewritten_layer", %layer); + tasks.spawn( + async move { + let _permit = limiter.acquire().await; + let copied = + upload_rewritten_layer(end_lsn, &layer, &timeline, &timeline.cancel, &ctx) + .await?; + if let Some(copied) = copied.as_ref() { + tracing::info!(%copied, "rewrote and uploaded"); + } + Ok(copied) + } + .instrument(span), + ); } while let Some(res) = tasks.join_next().await { match res { Ok(Ok(Some(copied))) => { wrote_any = true; - tracing::info!(layer=%copied, "rewrote and uploaded"); new_layers.push(copied); } Ok(Ok(None)) => {} @@ -308,7 +368,7 @@ pub(super) async fn prepare( } let mut tasks = tokio::task::JoinSet::new(); - let limiter = Arc::new(tokio::sync::Semaphore::new(options.copy_concurrency.get())); + let limiter = Arc::new(Semaphore::new(options.copy_concurrency.get())); for adopted in rest_of_historic { let limiter = limiter.clone(); @@ -342,7 +402,56 @@ pub(super) async fn prepare( let prepared = PreparedTimelineDetach { layers: new_layers }; - Ok(Progress::Prepared(guard, prepared)) + Ok(Progress::Prepared(attempt, prepared)) +} + +async fn start_new_attempt(detached: &Timeline, tenant: &Tenant) -> Result { + let attempt = obtain_exclusive_attempt(detached, tenant)?; + + // insert the block in the index_part.json, if not already there. + let _dont_care = tenant + .gc_block + .insert( + detached, + crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor, + ) + .await + // FIXME: better error + .map_err(Error::Unexpected)?; + + Ok(attempt) +} + +async fn continue_with_blocked_gc(detached: &Timeline, tenant: &Tenant) -> Result { + // FIXME: it would be nice to confirm that there is an in-memory version, since we've just + // verified there is a persistent one? 
+ obtain_exclusive_attempt(detached, tenant) +} + +fn obtain_exclusive_attempt(detached: &Timeline, tenant: &Tenant) -> Result { + use Error::{OtherTimelineDetachOngoing, ShuttingDown}; + + // ensure we are the only active attempt for this tenant + let (guard, barrier) = completion::channel(); + { + let mut guard = tenant.ongoing_timeline_detach.lock().unwrap(); + if let Some((tl, other)) = guard.as_ref() { + if !other.is_ready() { + return Err(OtherTimelineDetachOngoing(*tl)); + } + // FIXME: no test enters here + } + *guard = Some((detached.timeline_id, barrier)); + } + + // ensure the gate is still open + let _gate_entered = detached.gate.enter().map_err(|_| ShuttingDown)?; + + Ok(Attempt { + timeline_id: detached.timeline_id, + _guard: guard, + gate_entered: Some(_gate_entered), + }) } fn reparented_direct_children( @@ -548,96 +657,207 @@ async fn remote_copy( .map_err(CopyFailed) } -/// See [`Timeline::complete_detaching_timeline_ancestor`]. -pub(super) async fn complete( +pub(crate) enum DetachingAndReparenting { + /// All of the following timeline ids were reparented and the timeline ancestor detach must be + /// marked as completed. + Reparented(HashSet), + + /// Some of the reparentings failed. The timeline ancestor detach must **not** be marked as + /// completed. + /// + /// Nested `must_reset_tenant` is set to true when any restart requiring changes were made. + SomeReparentingFailed { must_reset_tenant: bool }, + + /// Detaching and reparentings were completed in a previous attempt. Timeline ancestor detach + /// must be marked as completed. + AlreadyDone(HashSet), +} + +impl DetachingAndReparenting { + pub(crate) fn reset_tenant_required(&self) -> bool { + use DetachingAndReparenting::*; + match self { + Reparented(_) => true, + SomeReparentingFailed { must_reset_tenant } => *must_reset_tenant, + AlreadyDone(_) => false, + } + } + + pub(crate) fn completed(self) -> Option> { + use DetachingAndReparenting::*; + match self { + Reparented(x) | AlreadyDone(x) => Some(x), + SomeReparentingFailed { .. } => None, + } + } +} + +/// See [`Timeline::detach_from_ancestor_and_reparent`]. +pub(super) async fn detach_and_reparent( detached: &Arc, tenant: &Tenant, prepared: PreparedTimelineDetach, _ctx: &RequestContext, -) -> Result, anyhow::Error> { +) -> Result { let PreparedTimelineDetach { layers } = prepared; - let ancestor = detached - .ancestor_timeline - .as_ref() - .expect("must still have a ancestor"); - let ancestor_lsn = detached.get_ancestor_lsn(); + #[derive(Debug)] + enum Ancestor { + NotDetached(Arc, Lsn), + Detached(Arc, Lsn), + } + + let (recorded_branchpoint, still_ongoing) = { + let access = detached.remote_client.initialized_upload_queue()?; + let latest = access.latest_uploaded_index_part(); + + ( + latest.lineage.detached_previous_ancestor(), + latest + .gc_blocking + .as_ref() + .is_some_and(|b| b.blocked_by(DetachAncestor)), + ) + }; + assert!( + still_ongoing, + "cannot (detach? reparent)? 
complete if the operation is not still ongoing" + ); + + let ancestor = match (detached.ancestor_timeline.as_ref(), recorded_branchpoint) { + (Some(ancestor), None) => { + assert!( + !layers.is_empty(), + "there should always be at least one layer to inherit" + ); + Ancestor::NotDetached(ancestor.clone(), detached.ancestor_lsn) + } + (Some(_), Some(_)) => { + panic!( + "it should be impossible to get to here without having gone through the tenant reset; if the tenant was reset, then the ancestor_timeline would be None" + ); + } + (None, Some((ancestor_id, ancestor_lsn))) => { + // it has been either: + // - detached but still exists => we can try reparenting + // - detached and deleted + // + // either way, we must complete + assert!( + layers.is_empty(), + "no layers should had been copied as detach is done" + ); + + let existing = tenant.timelines.lock().unwrap().get(&ancestor_id).cloned(); + + if let Some(ancestor) = existing { + Ancestor::Detached(ancestor, ancestor_lsn) + } else { + let direct_children = reparented_direct_children(detached, tenant)?; + return Ok(DetachingAndReparenting::AlreadyDone(direct_children)); + } + } + (None, None) => { + // TODO: make sure there are no `?` before tenant_reset from after a questionmark from + // here. + panic!( + "bug: detach_and_reparent called on a timeline which has not been detached or which has no live ancestor" + ); + } + }; // publish the prepared layers before we reparent any of the timelines, so that on restart // reparented timelines find layers. also do the actual detaching. // - // if we crash after this operation, we will at least come up having detached a timeline, but - // we cannot go back and reparent the timelines which would had been reparented in normal - // execution. - // - // this is not perfect, but it avoids us a retry happening after a compaction or gc on restart - // which could give us a completely wrong layer combination. - detached - .remote_client - .schedule_adding_existing_layers_to_index_detach_and_wait( - &layers, - (ancestor.timeline_id, ancestor_lsn), - ) - .await?; + // if we crash after this operation, a retry will allow reparenting the remaining timelines as + // gc is blocked. + + let (ancestor, ancestor_lsn, was_detached) = match ancestor { + Ancestor::NotDetached(ancestor, ancestor_lsn) => { + // this has to complete before any reparentings because otherwise they would not have + // layers on the new parent. + detached + .remote_client + .schedule_adding_existing_layers_to_index_detach_and_wait( + &layers, + (ancestor.timeline_id, ancestor_lsn), + ) + .await + .context("publish layers and detach ancestor")?; + + tracing::info!( + ancestor=%ancestor.timeline_id, + %ancestor_lsn, + inherited_layers=%layers.len(), + "detached from ancestor" + ); + (ancestor, ancestor_lsn, true) + } + Ancestor::Detached(ancestor, ancestor_lsn) => (ancestor, ancestor_lsn, false), + }; let mut tasks = tokio::task::JoinSet::new(); + // Returns a single permit semaphore which will be used to make one reparenting succeed, + // others will fail as if those timelines had been stopped for whatever reason. + #[cfg(feature = "testing")] + let failpoint_sem = || -> Option> { + fail::fail_point!("timeline-detach-ancestor::allow_one_reparented", |_| Some( + Arc::new(Semaphore::new(1)) + )); + None + }(); + // because we are now keeping the slot in progress, it is unlikely that there will be any // timeline deletions during this time. if we raced one, then we'll just ignore it. 
- tenant - .timelines - .lock() - .unwrap() - .values() - .filter_map(|tl| { - if Arc::ptr_eq(tl, detached) { - return None; - } + { + let g = tenant.timelines.lock().unwrap(); + reparentable_timelines(g.values(), detached, &ancestor, ancestor_lsn) + .cloned() + .for_each(|timeline| { + // important in this scope: we are holding the Tenant::timelines lock + let span = tracing::info_span!("reparent", reparented=%timeline.timeline_id); + let new_parent = detached.timeline_id; + #[cfg(feature = "testing")] + let failpoint_sem = failpoint_sem.clone(); - if !tl.is_active() { - return None; - } + tasks.spawn( + async move { + let res = async { + #[cfg(feature = "testing")] + if let Some(failpoint_sem) = failpoint_sem { + let _permit = failpoint_sem.acquire().await.map_err(|_| { + anyhow::anyhow!( + "failpoint: timeline-detach-ancestor::allow_one_reparented", + ) + })?; + failpoint_sem.close(); + } - let tl_ancestor = tl.ancestor_timeline.as_ref()?; - let is_same = Arc::ptr_eq(ancestor, tl_ancestor); - let is_earlier = tl.get_ancestor_lsn() <= ancestor_lsn; - - let is_deleting = tl - .delete_progress - .try_lock() - .map(|flow| !flow.is_not_started()) - .unwrap_or(true); - - if is_same && is_earlier && !is_deleting { - Some(tl.clone()) - } else { - None - } - }) - .for_each(|timeline| { - // important in this scope: we are holding the Tenant::timelines lock - let span = tracing::info_span!("reparent", reparented=%timeline.timeline_id); - let new_parent = detached.timeline_id; - - tasks.spawn( - async move { - let res = timeline - .remote_client - .schedule_reparenting_and_wait(&new_parent) + timeline + .remote_client + .schedule_reparenting_and_wait(&new_parent) + .await + } .await; - match res { - Ok(()) => Some(timeline), - Err(e) => { - // with the use of tenant slot, we no longer expect these. - tracing::warn!("reparenting failed: {e:#}"); - None + match res { + Ok(()) => { + tracing::info!("reparented"); + Some(timeline) + } + Err(e) => { + // with the use of tenant slot, raced timeline deletion is the most + // likely reason. + tracing::warn!("reparenting failed: {e:#}"); + None + } } } - } - .instrument(span), - ); - }); + .instrument(span), + ); + }); + } let reparenting_candidates = tasks.len(); let mut reparented = HashSet::with_capacity(tasks.len()); @@ -645,33 +865,103 @@ pub(super) async fn complete( while let Some(res) = tasks.join_next().await { match res { Ok(Some(timeline)) => { - tracing::info!(reparented=%timeline.timeline_id, "reparenting done"); - assert!( reparented.insert(timeline.timeline_id), "duplicate reparenting? timeline_id={}", timeline.timeline_id ); } - Ok(None) => { - // lets just ignore this for now. one or all reparented timelines could had - // started deletion, and that is fine. - } Err(je) if je.is_cancelled() => unreachable!("not used"), - Err(je) if je.is_panic() => { - // ignore; it's better to continue with a single reparenting failing (or even - // all of them) in order to get to the goal state. - // - // these timelines will never be reparentable, but they can be always detached as - // separate tree roots. 
- } + // just ignore failures now, we can retry + Ok(None) => {} + Err(je) if je.is_panic() => {} Err(je) => tracing::error!("unexpected join error: {je:?}"), } } - if reparenting_candidates != reparented.len() { - tracing::info!("failed to reparent some candidates"); + let reparented_all = reparenting_candidates == reparented.len(); + + if reparented_all { + Ok(DetachingAndReparenting::Reparented(reparented)) + } else { + tracing::info!( + reparented = reparented.len(), + candidates = reparenting_candidates, + "failed to reparent all candidates; they can be retried after the tenant_reset", + ); + + let must_reset_tenant = !reparented.is_empty() || was_detached; + Ok(DetachingAndReparenting::SomeReparentingFailed { must_reset_tenant }) + } +} + +pub(super) async fn complete( + detached: &Arc, + tenant: &Tenant, + mut attempt: Attempt, + _ctx: &RequestContext, +) -> Result<(), Error> { + assert_eq!(detached.timeline_id, attempt.timeline_id); + + if attempt.gate_entered.is_none() { + let entered = detached.gate.enter().map_err(|_| Error::ShuttingDown)?; + attempt.gate_entered = Some(entered); + } else { + // Some(gate_entered) means the tenant was not restarted, as is not required } - Ok(reparented) + assert!(detached.ancestor_timeline.is_none()); + + // this should be an 503 at least...? + fail::fail_point!( + "timeline-detach-ancestor::complete_before_uploading", + |_| Err(Error::Failpoint( + "timeline-detach-ancestor::complete_before_uploading" + )) + ); + + tenant + .gc_block + .remove( + detached, + crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor, + ) + .await + // FIXME: better error + .map_err(Error::Unexpected)?; + + Ok(()) +} + +/// Query against a locked `Tenant::timelines`. +fn reparentable_timelines<'a, I>( + timelines: I, + detached: &'a Arc, + ancestor: &'a Arc, + ancestor_lsn: Lsn, +) -> impl Iterator> + 'a +where + I: Iterator> + 'a, +{ + timelines.filter_map(move |tl| { + if Arc::ptr_eq(tl, detached) { + return None; + } + + let tl_ancestor = tl.ancestor_timeline.as_ref()?; + let is_same = Arc::ptr_eq(ancestor, tl_ancestor); + let is_earlier = tl.get_ancestor_lsn() <= ancestor_lsn; + + let is_deleting = tl + .delete_progress + .try_lock() + .map(|flow| !flow.is_not_started()) + .unwrap_or(true); + + if is_same && is_earlier && !is_deleting { + Some(tl) + } else { + None + } + }) } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index fe582cf0e2..ee8e9ac5a1 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -2989,6 +2989,7 @@ impl Service { ); let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); + client .timeline_detach_ancestor(tenant_shard_id, timeline_id) .await @@ -3005,7 +3006,8 @@ impl Service { Error::ApiError(StatusCode::BAD_REQUEST, msg) => { ApiError::BadRequest(anyhow::anyhow!("{node}: {msg}")) } - // rest can be mapped + // rest can be mapped as usual + // FIXME: this converts some 500 to 409 which is not per openapi other => passthrough_api_error(&node, other), } }) diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 4e409eeb17..902457c2ac 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -5,7 +5,7 @@ import time from concurrent.futures import ThreadPoolExecutor from queue import Empty, Queue from threading import Barrier -from typing import List, Tuple +from typing import List, 
Set, Tuple import pytest from fixtures.common_types import Lsn, TimelineId @@ -807,22 +807,24 @@ def test_timeline_detach_ancestor_interrupted_by_deletion( What remains not tested by this: - shutdown winning over complete - - Shutdown winning over complete needs gc blocking and reparenting any left-overs on retry. """ if sharded and mode == "delete_tenant": # the shared/exclusive lock for tenant is blocking this: # timeline detach ancestor takes shared, delete tenant takes exclusive - pytest.skip( - "tenant deletion while timeline ancestor detach is underway is not supported yet" - ) + pytest.skip("tenant deletion while timeline ancestor detach is underway cannot happen") shard_count = 2 if sharded else 1 neon_env_builder.num_pageservers = shard_count - env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count if sharded else None) + env = neon_env_builder.init_start( + initial_tenant_shard_count=shard_count if sharded else None, + initial_tenant_conf={ + "gc_period": "1s", + "lsn_lease_length": "0s", + }, + ) for ps in env.pageservers: ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) @@ -831,7 +833,7 @@ def test_timeline_detach_ancestor_interrupted_by_deletion( detached_timeline = env.neon_cli.create_branch("detached soon", "main") - failpoint = "timeline-detach-ancestor::before_starting_after_locking_pausable" + pausepoint = "timeline-detach-ancestor::before_starting_after_locking_pausable" env.storage_controller.reconcile_until_idle() shards = env.storage_controller.locate(env.initial_tenant) @@ -843,13 +845,20 @@ def test_timeline_detach_ancestor_interrupted_by_deletion( victim = pageservers[int(shards[-1]["node_id"])] victim_http = victim.http_client() - victim_http.configure_failpoints((failpoint, "pause")) + victim_http.configure_failpoints((pausepoint, "pause")) def detach_ancestor(): target.detach_ancestor(env.initial_tenant, detached_timeline) - def at_failpoint() -> Tuple[str, LogCursor]: - return victim.assert_log_contains(f"at failpoint {failpoint}") + def at_failpoint() -> LogCursor: + msg, offset = victim.assert_log_contains(f"at failpoint {pausepoint}") + log.info(f"found {msg}") + msg, offset = victim.assert_log_contains( + ".* gc_loop.*: Skipping GC: .*", + offset, + ) + log.info(f"found {msg}") + return offset def start_delete(): if mode == "delete_timeline": @@ -882,23 +891,44 @@ def test_timeline_detach_ancestor_interrupted_by_deletion( with ThreadPoolExecutor(max_workers=2) as pool: try: fut = pool.submit(detach_ancestor) - _, offset = wait_until(10, 1.0, at_failpoint) + offset = wait_until(10, 1.0, at_failpoint) delete = pool.submit(start_delete) - wait_until(10, 1.0, lambda: at_waiting_on_gate_close(offset)) + offset = wait_until(10, 1.0, lambda: at_waiting_on_gate_close(offset)) - victim_http.configure_failpoints((failpoint, "off")) + victim_http.configure_failpoints((pausepoint, "off")) delete.result() assert wait_until(10, 1.0, is_deleted), f"unimplemented mode {mode}" + # TODO: match the error with pytest.raises(PageserverApiException) as exc: fut.result() + log.info(f"TODO: match this error: {exc.value}") assert exc.value.status_code == 503 finally: - victim_http.configure_failpoints((failpoint, "off")) + victim_http.configure_failpoints((pausepoint, "off")) + + if mode != "delete_timeline": + return + + # make sure the gc is unblocked + time.sleep(2) + victim.assert_log_contains(".* gc_loop.*: 1 timelines need GC", offset) + + if not sharded: + # we have the other node only while sharded + return + + other = 
pageservers[int(shards[0]["node_id"])] + log.info(f"other is {other.id}") + _, offset = other.assert_log_contains( + ".*INFO request\\{method=PUT path=/v1/tenant/\\S+/timeline/\\S+/detach_ancestor .*\\}: Request handled, status: 200 OK", + ) + # this might be a lot earlier than the victims line, but that is okay. + _, offset = other.assert_log_contains(".* gc_loop.*: 1 timelines need GC", offset) @pytest.mark.parametrize("mode", ["delete_reparentable_timeline"]) @@ -915,7 +945,9 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv assert ( mode == "delete_reparentable_timeline" - ), "only one now, but we could have the create just as well, need gc blocking" + ), "only one now, but creating reparentable timelines cannot be supported even with gc blocking" + # perhaps it could be supported by always doing this for the shard0 first, and after that for others. + # when we run shard0 to completion, we can use it's timelines to restrict which can be reparented. shard_count = 2 neon_env_builder.num_pageservers = shard_count @@ -1048,10 +1080,267 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv victim_http.configure_failpoints((pausepoint, "off")) +def test_retried_detach_ancestor_after_failed_reparenting(neon_env_builder: NeonEnvBuilder): + """ + Using a failpoint, force the completion step of timeline ancestor detach to + fail after reparenting a single timeline. + + Retrying should try reparenting until all reparentings are done, all the + time blocking gc even across restarts (first round). + + A completion failpoint is used to inhibit completion on second to last + round. + + On last round, the completion uses a path where no reparentings can happen + because original ancestor is deleted, and there is a completion to unblock + gc without restart. 
+ """ + + # to get the remote storage metrics + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3) + env = neon_env_builder.init_start( + initial_tenant_conf={ + "gc_period": "1s", + "lsn_lease_length": "0s", + } + ) + + env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + env.pageserver.allowed_errors.extend( + [ + ".* reparenting failed: failpoint: timeline-detach-ancestor::allow_one_reparented", + ".* Error processing HTTP request: InternalServerError\\(failed to reparent all candidate timelines, please retry", + ".* Error processing HTTP request: InternalServerError\\(failpoint: timeline-detach-ancestor::complete_before_uploading", + ] + ) + + http = env.pageserver.http_client() + + def remote_storage_copy_requests(): + return http.get_metric_value( + "remote_storage_s3_request_seconds_count", + {"request_type": "copy_object", "result": "ok"}, + ) + + def reparenting_progress(timelines: List[TimelineId]) -> Tuple[int, Set[TimelineId]]: + reparented = 0 + not_reparented = set() + for timeline in timelines: + detail = http.timeline_detail(env.initial_tenant, timeline) + ancestor = TimelineId(detail["ancestor_timeline_id"]) + if ancestor == detached: + reparented += 1 + else: + not_reparented.add(timeline) + return (reparented, not_reparented) + + # main ------A-----B-----C-----D-----E> lsn + timelines = [] + with env.endpoints.create_start("main") as ep: + for counter in range(5): + ep.safe_psql( + f"create table foo_{counter} as select i::bigint from generate_series(1, 10000) t(i)" + ) + branch_lsn = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + http.timeline_checkpoint(env.initial_tenant, env.initial_timeline) + branch = env.neon_cli.create_branch( + f"branch_{counter}", "main", ancestor_start_lsn=branch_lsn + ) + timelines.append(branch) + + flush_ep_to_pageserver(env, ep, env.initial_tenant, env.initial_timeline) + + # detach "E" which has most reparentable timelines under it + detached = timelines.pop() + assert len(timelines) == 4 + + http = http.without_status_retrying() + + http.configure_failpoints(("timeline-detach-ancestor::allow_one_reparented", "return")) + + not_reparented: Set[TimelineId] = set() + # tracked offset in the pageserver log which is at least at the most recent activation + offset = None + + def try_detach(): + with pytest.raises( + PageserverApiException, + match=".*failed to reparent all candidate timelines, please retry", + ) as exc: + http.detach_ancestor(env.initial_tenant, detached) + assert exc.value.status_code == 500 + + # first round -- do more checking to make sure the gc gets paused + try_detach() + + assert ( + http.timeline_detail(env.initial_tenant, detached)["ancestor_timeline_id"] is None + ), "first round should had detached 'detached'" + + reparented, not_reparented = reparenting_progress(timelines) + assert reparented == 1 + + time.sleep(2) + _, offset = env.pageserver.assert_log_contains( + ".*INFO request\\{method=PUT path=/v1/tenant/[0-9a-f]{32}/timeline/[0-9a-f]{32}/detach_ancestor .*\\}: Handling request", + offset, + ) + _, offset = env.pageserver.assert_log_contains(".*: attach finished, activating", offset) + _, offset = env.pageserver.assert_log_contains( + ".* gc_loop.*: Skipping GC: .*", + offset, + ) + metric = remote_storage_copy_requests() + assert metric != 0 + # make sure the gc blocking is persistent over a restart + env.pageserver.restart() + env.pageserver.quiesce_tenants() + time.sleep(2) + _, offset = env.pageserver.assert_log_contains(".*: attach 
finished, activating", offset) + assert env.pageserver.log_contains(".* gc_loop.*: [0-9] timelines need GC", offset) is None + _, offset = env.pageserver.assert_log_contains( + ".* gc_loop.*: Skipping GC: .*", + offset, + ) + # restore failpoint for the next reparented + http.configure_failpoints(("timeline-detach-ancestor::allow_one_reparented", "return")) + + reparented_before = reparented + + # do two more rounds + for _ in range(2): + try_detach() + + assert ( + http.timeline_detail(env.initial_tenant, detached)["ancestor_timeline_id"] is None + ), "first round should had detached 'detached'" + + reparented, not_reparented = reparenting_progress(timelines) + assert reparented == reparented_before + 1 + reparented_before = reparented + + _, offset = env.pageserver.assert_log_contains(".*: attach finished, activating", offset) + metric = remote_storage_copy_requests() + assert metric == 0, "copies happen in the first round" + + assert offset is not None + assert len(not_reparented) == 1 + + http.configure_failpoints(("timeline-detach-ancestor::complete_before_uploading", "return")) + + # almost final round, the failpoint is hit no longer as there is only one reparented and one always gets to succeed. + # the tenant is restarted once more, but we fail during completing. + with pytest.raises( + PageserverApiException, match=".* timeline-detach-ancestor::complete_before_uploading" + ) as exc: + http.detach_ancestor(env.initial_tenant, detached) + assert exc.value.status_code == 500 + _, offset = env.pageserver.assert_log_contains(".*: attach finished, activating", offset) + + # delete the previous ancestor to take a different path to completion. all + # other tests take the "detach? reparent complete", but this only hits + # "complete". + http.timeline_delete(env.initial_tenant, env.initial_timeline) + wait_timeline_detail_404(http, env.initial_tenant, env.initial_timeline, 20) + + http.configure_failpoints(("timeline-detach-ancestor::complete_before_uploading", "off")) + + reparented_resp = http.detach_ancestor(env.initial_tenant, detached) + assert reparented_resp == set(timelines) + # no need to quiesce_tenants anymore, because completion does that + + reparented, not_reparented = reparenting_progress(timelines) + assert reparented == len(timelines) + + time.sleep(2) + assert ( + env.pageserver.log_contains(".*: attach finished, activating", offset) is None + ), "there should be no restart with the final detach_ancestor as it only completed" + + # gc is unblocked + env.pageserver.assert_log_contains(".* gc_loop.*: 5 timelines need GC", offset) + + metric = remote_storage_copy_requests() + assert metric == 0 + + +def test_timeline_is_deleted_before_timeline_detach_ancestor_completes( + neon_env_builder: NeonEnvBuilder, +): + """ + Make sure that a timeline deleted after restart will unpause gc blocking. 
+ """ + env = neon_env_builder.init_start( + initial_tenant_conf={ + "gc_period": "1s", + "lsn_lease_length": "0s", + } + ) + + env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + http = env.pageserver.http_client() + + detached = env.neon_cli.create_branch("detached") + + failpoint = "timeline-detach-ancestor::after_activating_before_finding-pausable" + + http.configure_failpoints((failpoint, "pause")) + + def detach_and_get_stuck(): + return http.detach_ancestor(env.initial_tenant, detached) + + def request_processing_noted_in_log(): + _, offset = env.pageserver.assert_log_contains( + ".*INFO request\\{method=PUT path=/v1/tenant/[0-9a-f]{32}/timeline/[0-9a-f]{32}/detach_ancestor .*\\}: Handling request", + ) + return offset + + def delete_detached(): + return http.timeline_delete(env.initial_tenant, detached) + + try: + with ThreadPoolExecutor(max_workers=1) as pool: + detach = pool.submit(detach_and_get_stuck) + + offset = wait_until(10, 1.0, request_processing_noted_in_log) + + # make this named fn tor more clear failure test output logging + def pausepoint_hit_with_gc_paused() -> LogCursor: + env.pageserver.assert_log_contains(f"at failpoint {failpoint}") + _, at = env.pageserver.assert_log_contains( + ".* gc_loop.*: Skipping GC: .*", + offset, + ) + return at + + offset = wait_until(10, 1.0, pausepoint_hit_with_gc_paused) + + delete_detached() + + wait_timeline_detail_404(http, env.initial_tenant, detached, 10, 1.0) + + http.configure_failpoints((failpoint, "off")) + + with pytest.raises(PageserverApiException) as exc: + detach.result() + + # FIXME: this should be 404 but because there is another Anyhow conversion it is 500 + assert exc.value.status_code == 500 + env.pageserver.allowed_errors.append( + ".*Error processing HTTP request: InternalServerError\\(detached timeline was not found after restart" + ) + finally: + http.configure_failpoints((failpoint, "off")) + + # make sure gc has been unblocked + time.sleep(2) + + env.pageserver.assert_log_contains(".* gc_loop.*: 1 timelines need GC", offset) + + # TODO: -# - after starting the operation, pageserver is shutdown, restarted -# - after starting the operation, bottom-most timeline is deleted, pageserver is restarted, gc is inhibited -# - deletion of reparented while reparenting should fail once, then succeed (?) # - branch near existing L1 boundary, image layers? # - investigate: why are layers started at uneven lsn? not just after branching, but in general. # From 0f43b7c51b622e59d7485e52ac572378dcb78afc Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 13 Aug 2024 15:31:55 -0500 Subject: [PATCH 1410/1571] Loosen type on PgProtocol::safe_psql(queries:) Using Iterable allows us to also use tuples, among other things. 
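As a quick, self-contained illustration (not part of the patch; the stub body below stands in for the real fixture method), the loosened `Iterable[str]` bound accepts tuples and lists alike:

```python
from typing import Any, Iterable, List, Tuple

def safe_psql_many(queries: Iterable[str]) -> List[List[Tuple[Any, ...]]]:
    # Stub: the real PgProtocol method executes each query and returns its rows.
    return [[(q,)] for q in queries]

# Both forms now satisfy the signature; previously only List[str] was accepted.
safe_psql_many(["select 1", "select 2"])
safe_psql_many(("select 1", "select 2"))
```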
Signed-off-by: Tristan Partin --- test_runner/fixtures/neon_fixtures.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 961dbde95c..aaa1f21997 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -24,7 +24,7 @@ from functools import cached_property, partial from itertools import chain, product from pathlib import Path from types import TracebackType -from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Type, Union, cast +from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union, cast from urllib.parse import quote, urlparse import asyncpg @@ -388,7 +388,7 @@ class PgProtocol: return self.safe_psql_many([query], **kwargs)[0] def safe_psql_many( - self, queries: List[str], log_query=True, **kwargs: Any + self, queries: Iterable[str], log_query=True, **kwargs: Any ) -> List[List[Tuple[Any, ...]]]: """ Execute queries against the node and return all rows. From c624317b0e2ff73c6c01904e4d883c256c078f22 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 13 Aug 2024 15:34:10 -0500 Subject: [PATCH 1411/1571] Decode the database name in SQL/HTTP connections A url::Url does not hand you back a URL decoded value for path values, so we must decode them ourselves. Link: https://docs.rs/url/2.5.2/url/struct.Url.html#method.path Link: https://docs.rs/url/2.5.2/url/struct.Url.html#method.path_segments Signed-off-by: Tristan Partin --- proxy/src/serverless/sql_over_http.rs | 4 +++- test_runner/regress/test_proxy.py | 26 ++++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index e5b6536328..c41df07a4d 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -34,6 +34,7 @@ use tracing::error; use tracing::info; use typed_json::json; use url::Url; +use urlencoding; use utils::http::error::ApiError; use crate::auth::backend::ComputeUserInfo; @@ -168,7 +169,8 @@ fn get_conn_info( .path_segments() .ok_or(ConnInfoError::MissingDbName)?; - let dbname: DbName = url_path.next().ok_or(ConnInfoError::InvalidDbName)?.into(); + let dbname: DbName = + urlencoding::decode(url_path.next().ok_or(ConnInfoError::InvalidDbName)?)?.into(); ctx.set_dbname(dbname.clone()); let username = RoleName::from(urlencoding::decode(connection_url.username())?); diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index f446f4f200..d2b8c2ed8b 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -2,6 +2,7 @@ import asyncio import json import subprocess import time +import urllib.parse from typing import Any, List, Optional, Tuple import psycopg2 @@ -275,6 +276,31 @@ def test_sql_over_http(static_proxy: NeonProxy): assert res["rowCount"] is None +def test_sql_over_http_db_name_with_space(static_proxy: NeonProxy): + db = "db with spaces" + static_proxy.safe_psql_many( + ( + f'create database "{db}"', + "create role http with login password 'http' superuser", + ) + ) + + def q(sql: str, params: Optional[List[Any]] = None) -> Any: + params = params or [] + connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/{urllib.parse.quote(db)}" + response = requests.post( + f"https://{static_proxy.domain}:{static_proxy.external_http_port}/sql", + data=json.dumps({"query": sql, "params": params}), + 
headers={"Content-Type": "application/sql", "Neon-Connection-String": connstr}, + verify=str(static_proxy.test_output_dir / "proxy.crt"), + ) + assert response.status_code == 200, response.text + return response.json() + + rows = q("select 42 as answer")["rows"] + assert rows == [{"answer": 42}] + + def test_sql_over_http_output_options(static_proxy: NeonProxy): static_proxy.safe_psql("create role http2 with login password 'http2' superuser") From 7a1736ddcf0ed0e60bbb8b5a030c8e5349041c37 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 14 Aug 2024 08:13:20 +0300 Subject: [PATCH 1412/1571] Preserve HEAP_COMBOCID when restoring t_cid from WAL (#8503) ## Problem See https://github.com/neondatabase/neon/issues/8499 ## Summary of changes Save HEAP_COMBOCID flag in WAL and do not clear it in redo handlers. Related Postgres PRs: https://github.com/neondatabase/postgres/pull/457 https://github.com/neondatabase/postgres/pull/458 https://github.com/neondatabase/postgres/pull/459 ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik Co-authored-by: Heikki Linnakangas --- pgxn/neon_rmgr/neon_rmgr.c | 16 +-- test_runner/regress/test_combocid.py | 139 +++++++++++++++++++++++++++ vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 6 +- 6 files changed, 154 insertions(+), 13 deletions(-) create mode 100644 test_runner/regress/test_combocid.py diff --git a/pgxn/neon_rmgr/neon_rmgr.c b/pgxn/neon_rmgr/neon_rmgr.c index 496ca08c08..c3f726db84 100644 --- a/pgxn/neon_rmgr/neon_rmgr.c +++ b/pgxn/neon_rmgr/neon_rmgr.c @@ -186,7 +186,7 @@ static void fix_infomask_from_infobits(uint8 infobits, uint16 *infomask, uint16 *infomask2) { *infomask &= ~(HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY | - HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_EXCL_LOCK); + HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_EXCL_LOCK | HEAP_COMBOCID); *infomask2 &= ~HEAP_KEYS_UPDATED; if (infobits & XLHL_XMAX_IS_MULTI) @@ -195,6 +195,8 @@ fix_infomask_from_infobits(uint8 infobits, uint16 *infomask, uint16 *infomask2) *infomask |= HEAP_XMAX_LOCK_ONLY; if (infobits & XLHL_XMAX_EXCL_LOCK) *infomask |= HEAP_XMAX_EXCL_LOCK; + if (infobits & XLHL_COMBOCID) + *infomask |= HEAP_COMBOCID; /* note HEAP_XMAX_SHR_LOCK isn't considered here */ if (infobits & XLHL_XMAX_KEYSHR_LOCK) *infomask |= HEAP_XMAX_KEYSHR_LOCK; @@ -284,7 +286,7 @@ redo_neon_heap_insert(XLogReaderState *record) htup->t_infomask = xlhdr.t_infomask; htup->t_hoff = xlhdr.t_hoff; HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); - HeapTupleHeaderSetCmin(htup, xlhdr.t_cid); + htup->t_choice.t_heap.t_field3.t_cid = xlhdr.t_cid; htup->t_ctid = target_tid; if (PageAddItem(page, (Item) htup, newlen, xlrec->offnum, @@ -373,7 +375,7 @@ redo_neon_heap_delete(XLogReaderState *record) HeapTupleHeaderSetXmax(htup, xlrec->xmax); else HeapTupleHeaderSetXmin(htup, InvalidTransactionId); - HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false); + htup->t_choice.t_heap.t_field3.t_cid = xlrec->t_cid; /* Mark the page as a candidate for pruning */ PageSetPrunable(page, XLogRecGetXid(record)); @@ -490,7 
+492,7 @@ redo_neon_heap_update(XLogReaderState *record, bool hot_update) fix_infomask_from_infobits(xlrec->old_infobits_set, &htup->t_infomask, &htup->t_infomask2); HeapTupleHeaderSetXmax(htup, xlrec->old_xmax); - HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false); + htup->t_choice.t_heap.t_field3.t_cid = xlrec->t_cid; /* Set forward chain link in t_ctid */ htup->t_ctid = newtid; @@ -623,7 +625,7 @@ redo_neon_heap_update(XLogReaderState *record, bool hot_update) htup->t_hoff = xlhdr.t_hoff; HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); - HeapTupleHeaderSetCmin(htup, xlhdr.t_cid); + htup->t_choice.t_heap.t_field3.t_cid = xlhdr.t_cid; HeapTupleHeaderSetXmax(htup, xlrec->new_xmax); /* Make sure there is no forward chain link in t_ctid */ htup->t_ctid = newtid; @@ -728,7 +730,7 @@ redo_neon_heap_lock(XLogReaderState *record) offnum); } HeapTupleHeaderSetXmax(htup, xlrec->xmax); - HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false); + htup->t_choice.t_heap.t_field3.t_cid = xlrec->t_cid; PageSetLSN(page, lsn); MarkBufferDirty(buffer); } @@ -840,7 +842,7 @@ redo_neon_heap_multi_insert(XLogReaderState *record) htup->t_infomask = xlhdr->t_infomask; htup->t_hoff = xlhdr->t_hoff; HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); - HeapTupleHeaderSetCmin(htup, xlrec->t_cid); + htup->t_choice.t_heap.t_field3.t_cid = xlrec->t_cid; ItemPointerSetBlockNumber(&htup->t_ctid, blkno); ItemPointerSetOffsetNumber(&htup->t_ctid, offnum); diff --git a/test_runner/regress/test_combocid.py b/test_runner/regress/test_combocid.py new file mode 100644 index 0000000000..6d2567b7ee --- /dev/null +++ b/test_runner/regress/test_combocid.py @@ -0,0 +1,139 @@ +from fixtures.neon_fixtures import NeonEnvBuilder + + +def do_combocid_op(neon_env_builder: NeonEnvBuilder, op): + env = neon_env_builder.init_start() + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + "shared_buffers='1MB'", + ], + ) + + conn = endpoint.connect() + cur = conn.cursor() + n_records = 1000 + + cur.execute("CREATE EXTENSION neon_test_utils") + + cur.execute("create table t(id integer, val integer)") + + cur.execute("begin") + cur.execute("insert into t values (1, 0)") + cur.execute("insert into t values (2, 0)") + cur.execute(f"insert into t select g, 0 from generate_series(3,{n_records}) g") + + # Open a cursor that scroll it halfway through + cur.execute("DECLARE c1 NO SCROLL CURSOR WITHOUT HOLD FOR SELECT * FROM t") + cur.execute("fetch 500 from c1") + rows = cur.fetchall() + assert len(rows) == 500 + + # Perform specified operation + cur.execute(op) + + # Clear the cache, so that we exercise reconstructing the pages + # from WAL + cur.execute("SELECT clear_buffer_cache()") + + # Check that the cursor opened earlier still works. If the + # combocids are not restored correctly, it won't. 
+ cur.execute("fetch all from c1") + rows = cur.fetchall() + assert len(rows) == 500 + + cur.execute("rollback") + + +def test_combocid_delete(neon_env_builder: NeonEnvBuilder): + do_combocid_op(neon_env_builder, "delete from t") + + +def test_combocid_update(neon_env_builder: NeonEnvBuilder): + do_combocid_op(neon_env_builder, "update t set val=val+1") + + +def test_combocid_lock(neon_env_builder: NeonEnvBuilder): + do_combocid_op(neon_env_builder, "select * from t for update") + + +def test_combocid_multi_insert(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + "shared_buffers='1MB'", + ], + ) + + conn = endpoint.connect() + cur = conn.cursor() + n_records = 1000 + + cur.execute("CREATE EXTENSION neon_test_utils") + + cur.execute("create table t(id integer, val integer)") + file_path = f"{endpoint.pg_data_dir_path()}/t.csv" + cur.execute(f"insert into t select g, 0 from generate_series(1,{n_records}) g") + cur.execute(f"copy t to '{file_path}'") + cur.execute("truncate table t") + + cur.execute("begin") + cur.execute(f"copy t from '{file_path}'") + + # Open a cursor that scroll it halfway through + cur.execute("DECLARE c1 NO SCROLL CURSOR WITHOUT HOLD FOR SELECT * FROM t") + cur.execute("fetch 500 from c1") + rows = cur.fetchall() + assert len(rows) == 500 + + # Delete all the rows. Because all of the rows were inserted earlier in the + # same transaction, all the rows will get a combocid. + cur.execute("delete from t") + # Clear the cache, so that we exercise reconstructing the pages + # from WAL + cur.execute("SELECT clear_buffer_cache()") + + # Check that the cursor opened earlier still works. If the + # combocids are not restored correctly, it won't. + cur.execute("fetch all from c1") + rows = cur.fetchall() + assert len(rows) == 500 + + cur.execute("rollback") + + +def test_combocid(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + endpoint = env.endpoints.create_start("main") + + conn = endpoint.connect() + cur = conn.cursor() + n_records = 100000 + + cur.execute("create table t(id integer, val integer)") + cur.execute(f"insert into t values (generate_series(1,{n_records}), 0)") + + cur.execute("begin") + + cur.execute("update t set val=val+1") + assert cur.rowcount == n_records + cur.execute("update t set val=val+1") + assert cur.rowcount == n_records + cur.execute("update t set val=val+1") + assert cur.rowcount == n_records + + cur.execute("delete from t") + assert cur.rowcount == n_records + cur.execute("delete from t") + assert cur.rowcount == 0 + + cur.execute(f"insert into t values (generate_series(1,{n_records}), 0)") + cur.execute("update t set val=val+1") + assert cur.rowcount == n_records + cur.execute("update t set val=val+1") + assert cur.rowcount == n_records + cur.execute("update t set val=val+1") + assert cur.rowcount == n_records + + cur.execute("rollback") diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index a48faca1d9..3fd7a45f8a 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit a48faca1d9aef59649dd1bf34bc1b6303fa3489e +Subproject commit 3fd7a45f8aae85c080df6329e3c85887b7f3a737 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 39c51c33b3..46b4b235f3 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 39c51c33b383239c78b86afe561679f980e44842 +Subproject commit 46b4b235f38413ab5974bb22c022f9b829257674 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 
index 5ea106b258..47a9122a5a 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 5ea106b2583285849784e774b39d62eb2615bd5d +Subproject commit 47a9122a5a150a3217fafd3f3d4fe8e020ea718a diff --git a/vendor/revisions.json b/vendor/revisions.json index f983407268..6e3e489b5d 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,14 +1,14 @@ { "v16": [ "16.3", - "5ea106b2583285849784e774b39d62eb2615bd5d" + "47a9122a5a150a3217fafd3f3d4fe8e020ea718a" ], "v15": [ "15.7", - "39c51c33b383239c78b86afe561679f980e44842" + "46b4b235f38413ab5974bb22c022f9b829257674" ], "v14": [ "14.12", - "a48faca1d9aef59649dd1bf34bc1b6303fa3489e" + "3fd7a45f8aae85c080df6329e3c85887b7f3a737" ] } From 4049d2b7e1a73ed1dfbadbe759fd3b1f4247606b Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 14 Aug 2024 09:29:06 +0100 Subject: [PATCH 1413/1571] scrubber: fix spurious "Missed some shards" errors (#8661) ## Problem The storage scrubber was reporting warnings for lots of timelines like: ``` WARN Missed some shards at count ShardCount(0) tenant_id=25eb7a83d9a2f90ac0b765b6ca84cf4c ``` These were spurious: these tenants are fine. There was a bug in accumulating the ShardIndex for each tenant, whereby multiple timelines would lead us to add the same ShardIndex more than one. Closes: #8646 ## Summary of changes - Accumulate ShardIndex in a BTreeSet instead of a Vec - Extend the test to reproduce the issue --- pageserver/src/http/routes.rs | 2 ++ .../src/pageserver_physical_gc.rs | 13 ++++---- test_runner/regress/test_storage_scrubber.py | 30 ++++++++++++------- 3 files changed, 29 insertions(+), 16 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 6f7480cc6c..fd4ead9d47 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1787,9 +1787,11 @@ async fn timeline_checkpoint_handler( } if wait_until_uploaded { + tracing::info!("Waiting for uploads to complete..."); timeline.remote_client.wait_completion().await // XXX map to correct ApiError for the cases where it's due to shutdown .context("wait completion").map_err(ApiError::InternalServerError)?; + tracing::info!("Uploads completed up to {}", timeline.get_remote_consistent_lsn_projected().unwrap_or(Lsn(0))); } json_response(StatusCode::OK, ()) diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index ff230feae3..c8b1ed49f4 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -1,4 +1,4 @@ -use std::collections::{BTreeMap, HashMap}; +use std::collections::{BTreeMap, BTreeSet, HashMap}; use std::sync::Arc; use std::time::{Duration, SystemTime}; @@ -117,7 +117,7 @@ use refs::AncestorRefs; // - Are there any refs to ancestor shards' layers? 
#[derive(Default)] struct TenantRefAccumulator { - shards_seen: HashMap>, + shards_seen: HashMap>, // For each shard that has refs to an ancestor's layers, the set of ancestor layers referred to ancestor_ref_shards: AncestorRefs, @@ -130,7 +130,7 @@ impl TenantRefAccumulator { .shards_seen .entry(ttid.tenant_shard_id.tenant_id) .or_default()) - .push(this_shard_idx); + .insert(this_shard_idx); let mut ancestor_refs = Vec::new(); for (layer_name, layer_metadata) in &index_part.layer_metadata { @@ -154,7 +154,7 @@ impl TenantRefAccumulator { summary: &mut GcSummary, ) -> (Vec, AncestorRefs) { let mut ancestors_to_gc = Vec::new(); - for (tenant_id, mut shard_indices) in self.shards_seen { + for (tenant_id, shard_indices) in self.shards_seen { // Find the highest shard count let latest_count = shard_indices .iter() @@ -162,6 +162,7 @@ impl TenantRefAccumulator { .max() .expect("Always at least one shard"); + let mut shard_indices = shard_indices.iter().collect::>(); let (mut latest_shards, ancestor_shards) = { let at = itertools::partition(&mut shard_indices, |i| i.shard_count == latest_count); @@ -174,7 +175,7 @@ impl TenantRefAccumulator { // to scan the S3 bucket halfway through a shard split. if latest_shards.len() != latest_count.count() as usize { // This should be extremely rare, so we warn on it. - tracing::warn!(%tenant_id, "Missed some shards at count {:?}", latest_count); + tracing::warn!(%tenant_id, "Missed some shards at count {:?}: {latest_shards:?}", latest_count); continue; } @@ -212,7 +213,7 @@ impl TenantRefAccumulator { .iter() .map(|s| s.tenant_shard_id.to_index()) .collect(); - if controller_indices != latest_shards { + if !controller_indices.iter().eq(latest_shards.iter().copied()) { tracing::info!(%tenant_id, "Latest shards seen in S3 ({latest_shards:?}) don't match controller state ({controller_indices:?})"); continue; } diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 388f6a9e92..2844d1b1d2 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -204,6 +204,11 @@ def test_scrubber_physical_gc_ancestors( }, ) + # Create an extra timeline, to ensure the scrubber isn't confused by multiple timelines + env.storage_controller.pageserver_api().timeline_create( + env.pg_version, tenant_id=tenant_id, new_timeline_id=TimelineId.generate() + ) + # Make sure the original shard has some layers workload = Workload(env, tenant_id, timeline_id) workload.init() @@ -214,6 +219,11 @@ def test_scrubber_physical_gc_ancestors( shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count) env.storage_controller.reconcile_until_idle() # Move shards to their final locations immediately + # Create a timeline after split, to ensure scrubber can handle timelines that exist in child shards but not ancestors + env.storage_controller.pageserver_api().timeline_create( + env.pg_version, tenant_id=tenant_id, new_timeline_id=TimelineId.generate() + ) + # Make sure child shards have some layers. Do not force upload, because the test helper calls checkpoint, which # compacts, and we only want to do tha explicitly later in the test. 
workload.write_rows(100, upload=False) @@ -305,10 +315,19 @@ def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder # Make sure the original shard has some layers workload = Workload(env, tenant_id, timeline_id) workload.init() - workload.write_rows(100) + workload.write_rows(100, upload=False) + workload.stop() new_shard_count = 4 shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count) + for shard in shards: + ps = env.get_tenant_pageserver(shard) + log.info(f"Waiting for shard {shard} on pageserver {ps.id}") + ps.http_client().timeline_checkpoint( + shard, timeline_id, compact=False, wait_until_uploaded=True + ) + + ps.http_client().deletion_queue_flush(execute=True) # Create a second timeline so that when we delete the first one, child shards still have some content in S3. # @@ -319,15 +338,6 @@ def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder PgVersion.NOT_SET, tenant_id, other_timeline_id ) - # Write after split so that child shards have some indices in S3 - workload.write_rows(100, upload=False) - for shard in shards: - ps = env.get_tenant_pageserver(shard) - log.info(f"Waiting for shard {shard} on pageserver {ps.id}") - ps.http_client().timeline_checkpoint( - shard, timeline_id, compact=False, wait_until_uploaded=True - ) - # The timeline still exists in child shards and they reference its layers, so scrubbing # now shouldn't delete anything. gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full") From 485d76ac622dcb8d847ecce9eef2ca714768e7df Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 14 Aug 2024 12:16:18 +0300 Subject: [PATCH 1414/1571] timeline_detach_ancestor: adjust error handling (#8528) With additional phases from #8430 the `detach_ancestor::Error` became untenable. Split it up into phases, and introduce laundering for remaining `anyhow::Error` to propagate them as most often `Error::ShuttingDown`. Additionally, complete FIXMEs. 
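For orientation, a hedged sketch (not part of the patch) of how a caller is expected to drive the new retry semantics: a 500 carrying "failed to reparent all candidate timelines, please retry" is retryable, and a retry picks up the remaining reparenting candidates. The `detach_ancestor` method and the exception's `status_code` attribute mirror the test fixtures used elsewhere in this series; the concrete client and exception types are assumptions here.

```python
import time

def detach_ancestor_with_retries(http, tenant_id, timeline_id, attempts=10, backoff=1.0):
    # Sketch only: `http` is assumed to be a pageserver HTTP client like the one used
    # by the regression tests; on full success it returns the set of reparented timelines.
    for _ in range(attempts):
        try:
            return http.detach_ancestor(tenant_id, timeline_id)
        except Exception as e:
            retryable = getattr(e, "status_code", None) == 500 and "please retry" in str(e)
            if not retryable:
                raise
            time.sleep(backoff)
    raise RuntimeError("detach_ancestor did not complete within the retry budget")
```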
Cc: #6994 --- libs/remote_storage/src/error.rs | 4 + pageserver/src/http/routes.rs | 3 +- pageserver/src/tenant/mgr.rs | 100 +++---- pageserver/src/tenant/storage_layer/layer.rs | 6 + pageserver/src/tenant/timeline.rs | 2 +- .../src/tenant/timeline/detach_ancestor.rs | 181 ++++++------- storage_controller/src/service.rs | 9 +- .../regress/test_timeline_detach_ancestor.py | 246 ++++++++++++++---- 8 files changed, 347 insertions(+), 204 deletions(-) diff --git a/libs/remote_storage/src/error.rs b/libs/remote_storage/src/error.rs index 66422853e1..5fd0eaabc7 100644 --- a/libs/remote_storage/src/error.rs +++ b/libs/remote_storage/src/error.rs @@ -42,6 +42,10 @@ impl DownloadError { Timeout | Other(_) => false, } } + + pub fn is_cancelled(&self) -> bool { + matches!(self, DownloadError::Cancelled) + } } impl From for DownloadError { diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index fd4ead9d47..d209f4eced 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1900,8 +1900,7 @@ async fn timeline_detach_ancestor_handler( attempt, ctx, ) - .await - .map_err(ApiError::InternalServerError)?; + .await?; AncestorDetached { reparented_timelines, diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 5f2539d426..4e6ea0c8f9 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1929,61 +1929,51 @@ impl TenantManager { prepared: PreparedTimelineDetach, mut attempt: detach_ancestor::Attempt, ctx: &RequestContext, - ) -> Result, anyhow::Error> { - use crate::tenant::timeline::detach_ancestor::Error; - // FIXME: this is unnecessary, slotguard already has these semantics - struct RevertOnDropSlot(Option); + ) -> Result, detach_ancestor::Error> { + use detach_ancestor::Error; - impl Drop for RevertOnDropSlot { - fn drop(&mut self) { - if let Some(taken) = self.0.take() { - taken.revert(); - } - } - } + let slot_guard = + tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist).map_err( + |e| { + use TenantSlotError::*; - impl RevertOnDropSlot { - fn into_inner(mut self) -> SlotGuard { - self.0.take().unwrap() - } - } - - impl std::ops::Deref for RevertOnDropSlot { - type Target = SlotGuard; - - fn deref(&self) -> &Self::Target { - self.0.as_ref().unwrap() - } - } - - let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?; - let slot_guard = RevertOnDropSlot(Some(slot_guard)); + match e { + MapState(TenantMapError::ShuttingDown) => Error::ShuttingDown, + NotFound(_) | InProgress | MapState(_) => Error::DetachReparent(e.into()), + } + }, + )?; let tenant = { - let Some(old_slot) = slot_guard.get_old_value() else { - anyhow::bail!( - "Tenant not found when trying to complete detaching timeline ancestor" - ); - }; + let old_slot = slot_guard + .get_old_value() + .as_ref() + .expect("requested MustExist"); let Some(tenant) = old_slot.get_attached() else { - anyhow::bail!("Tenant is not in attached state"); + return Err(Error::DetachReparent(anyhow::anyhow!( + "Tenant is not in attached state" + ))); }; if !tenant.is_active() { - anyhow::bail!("Tenant is not active"); + return Err(Error::DetachReparent(anyhow::anyhow!( + "Tenant is not active" + ))); } tenant.clone() }; - let timeline = tenant.get_timeline(timeline_id, true)?; + let timeline = tenant + .get_timeline(timeline_id, true) + .map_err(Error::NotFound)?; let resp = timeline .detach_from_ancestor_and_reparent(&tenant, prepared, ctx) .await?; - let mut slot_guard = slot_guard.into_inner(); + let mut 
slot_guard = slot_guard; let tenant = if resp.reset_tenant_required() { attempt.before_reset_tenant(); @@ -1991,17 +1981,20 @@ impl TenantManager { let (_guard, progress) = utils::completion::channel(); match tenant.shutdown(progress, ShutdownMode::Hard).await { Ok(()) => { - slot_guard.drop_old_value()?; + slot_guard.drop_old_value().expect("it was just shutdown"); } Err(_barrier) => { slot_guard.revert(); - // this really should not happen, at all, unless shutdown was already going? - anyhow::bail!("Cannot restart Tenant, already shutting down"); + // this really should not happen, at all, unless a shutdown without acquiring + // tenant slot was already going? regardless, on restart the attempt tracking + // will reset to retryable. + return Err(Error::ShuttingDown); } } let tenant_path = self.conf.tenant_path(&tenant_shard_id); - let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?; + let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id) + .map_err(|e| Error::DetachReparent(e.into()))?; let shard_identity = config.shard; let tenant = tenant_spawn( @@ -2009,12 +2002,13 @@ impl TenantManager { tenant_shard_id, &tenant_path, self.resources.clone(), - AttachedTenantConf::try_from(config)?, + AttachedTenantConf::try_from(config).map_err(Error::DetachReparent)?, shard_identity, None, SpawnMode::Eager, ctx, - )?; + ) + .map_err(|_| Error::ShuttingDown)?; { let mut g = tenant.ongoing_timeline_detach.lock().unwrap(); @@ -2025,7 +2019,15 @@ impl TenantManager { *g = Some((attempt.timeline_id, attempt.new_barrier())); } - slot_guard.upsert(TenantSlot::Attached(tenant.clone()))?; + // if we bail out here, we will not allow a new attempt, which should be fine. + // pageserver should be shutting down regardless? tenant_reset would help, unless it + // runs into the same problem. + slot_guard + .upsert(TenantSlot::Attached(tenant.clone())) + .map_err(|e| match e { + TenantSlotUpsertError::ShuttingDown(_) => Error::ShuttingDown, + other => Error::DetachReparent(other.into()), + })?; tenant } else { tracing::info!("skipping tenant_reset as no changes made required it"); @@ -2047,7 +2049,7 @@ impl TenantManager { Cancelled | WillNotBecomeActive(TenantState::Stopping { .. }) => { Error::ShuttingDown } - other => Error::Unexpected(other.into()), + other => Error::Complete(other.into()), } })?; @@ -2057,19 +2059,16 @@ impl TenantManager { let timeline = tenant .get_timeline(attempt.timeline_id, true) - .map_err(|_| Error::DetachedNotFoundAfterRestart)?; + .map_err(Error::NotFound)?; timeline .complete_detaching_timeline_ancestor(&tenant, attempt, ctx) .await .map(|()| reparented) - .map_err(|e| e.into()) } else { // at least the latest versions have now been downloaded and refreshed; be ready to // retry another time. - Err(anyhow::anyhow!( - "failed to reparent all candidate timelines, please retry" - )) + Err(Error::FailedToReparentAll) } } @@ -2392,6 +2391,9 @@ impl SlotGuard { /// Get any value that was present in the slot before we acquired ownership /// of it: in state transitions, this will be the old state. 
+ /// + // FIXME: get_ prefix + // FIXME: this should be .as_ref() -- unsure why no clippy fn get_old_value(&self) -> &Option { &self.old_value } diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 0175f32268..9c31d5dc3f 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1612,6 +1612,12 @@ pub(crate) enum DownloadError { Failpoint(failpoints::FailpointKind), } +impl DownloadError { + pub(crate) fn is_cancelled(&self) -> bool { + matches!(self, DownloadError::DownloadCancelled) + } +} + #[derive(Debug, PartialEq)] pub(crate) enum NeedsDownload { NotFound, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f1587951c6..c45d7431ec 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4342,7 +4342,7 @@ impl Timeline { tenant: &crate::tenant::Tenant, prepared: detach_ancestor::PreparedTimelineDetach, ctx: &RequestContext, - ) -> Result { + ) -> Result { detach_ancestor::detach_and_reparent(self, tenant, prepared, ctx).await } diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 969da2662b..641faada25 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -5,7 +5,6 @@ use crate::{ context::{DownloadBehavior, RequestContext}, task_mgr::TaskKind, tenant::{ - mgr::GetActiveTenantError, remote_timeline_client::index::GcBlockingReason::DetachAncestor, storage_layer::{AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer}, Tenant, @@ -23,61 +22,74 @@ use utils::{completion, generation::Generation, http::error::ApiError, id::Timel pub(crate) enum Error { #[error("no ancestors")] NoAncestor, + #[error("too many ancestors")] TooManyAncestors, + #[error("shutting down, please retry later")] ShuttingDown, - #[error("flushing failed")] - FlushAncestor(#[source] FlushLayerError), - #[error("layer download failed")] - RewrittenDeltaDownloadFailed(#[source] crate::tenant::storage_layer::layer::DownloadError), - #[error("copying LSN prefix locally failed")] - CopyDeltaPrefix(#[source] anyhow::Error), - #[error("upload rewritten layer")] - UploadRewritten(#[source] anyhow::Error), + + #[error(transparent)] + NotFound(crate::tenant::GetTimelineError), + + #[error("failed to reparent all candidate timelines, please retry")] + FailedToReparentAll, #[error("ancestor is already being detached by: {}", .0)] OtherTimelineDetachOngoing(TimelineId), - #[error("remote copying layer failed")] - CopyFailed(#[source] anyhow::Error), + #[error("preparing to timeline ancestor detach failed")] + Prepare(#[source] anyhow::Error), - #[error("wait for tenant to activate after restarting")] - WaitToActivate(#[source] GetActiveTenantError), + #[error("detaching and reparenting failed")] + DetachReparent(#[source] anyhow::Error), - #[error("detached timeline was not found after restart")] - DetachedNotFoundAfterRestart, - - #[error("unexpected error")] - Unexpected(#[source] anyhow::Error), + #[error("completing ancestor detach failed")] + Complete(#[source] anyhow::Error), #[error("failpoint: {}", .0)] Failpoint(&'static str), } +impl Error { + /// Try to catch cancellation from within the `anyhow::Error`, or wrap the anyhow as the given + /// variant or fancier `or_else`. 
+ fn launder(e: anyhow::Error, or_else: F) -> Error + where + F: Fn(anyhow::Error) -> Error, + { + use crate::tenant::remote_timeline_client::WaitCompletionError; + use crate::tenant::upload_queue::NotInitialized; + use remote_storage::TimeoutOrCancel; + + if e.is::() + || TimeoutOrCancel::caused_by_cancel(&e) + || e.downcast_ref::() + .is_some_and(|e| e.is_cancelled()) + || e.is::() + { + Error::ShuttingDown + } else { + or_else(e) + } + } +} + impl From for ApiError { fn from(value: Error) -> Self { match value { - e @ Error::NoAncestor => ApiError::Conflict(e.to_string()), - // TODO: ApiError converts the anyhow using debug formatting ... just stop using ApiError? - e @ Error::TooManyAncestors => ApiError::BadRequest(anyhow::anyhow!("{}", e)), + Error::NoAncestor => ApiError::Conflict(value.to_string()), + Error::TooManyAncestors => ApiError::BadRequest(anyhow::anyhow!("{}", value)), Error::ShuttingDown => ApiError::ShuttingDown, - Error::OtherTimelineDetachOngoing(_) => { - ApiError::ResourceUnavailable("other timeline detach is already ongoing".into()) + Error::OtherTimelineDetachOngoing(_) | Error::FailedToReparentAll => { + ApiError::ResourceUnavailable(value.to_string().into()) } - e @ Error::WaitToActivate(_) => { - let s = utils::error::report_compact_sources(&e).to_string(); - ApiError::ResourceUnavailable(s.into()) - } - // All of these contain shutdown errors, in fact, it's the most common - e @ Error::FlushAncestor(_) - | e @ Error::RewrittenDeltaDownloadFailed(_) - | e @ Error::CopyDeltaPrefix(_) - | e @ Error::UploadRewritten(_) - | e @ Error::CopyFailed(_) - | e @ Error::Unexpected(_) - | e @ Error::Failpoint(_) => ApiError::InternalServerError(e.into()), - Error::DetachedNotFoundAfterRestart => ApiError::NotFound(value.into()), + Error::NotFound(e) => ApiError::from(e), + // these variants should have no cancellation errors because of Error::launder + Error::Prepare(_) + | Error::DetachReparent(_) + | Error::Complete(_) + | Error::Failpoint(_) => ApiError::InternalServerError(value.into()), } } } @@ -95,39 +107,6 @@ impl From for Error { } } -impl From for Error { - fn from(value: FlushLayerError) -> Self { - match value { - FlushLayerError::Cancelled => Error::ShuttingDown, - FlushLayerError::NotRunning(_) => { - // FIXME(#6424): technically statically unreachable right now, given how we never - // drop the sender - Error::ShuttingDown - } - FlushLayerError::CreateImageLayersError(_) | FlushLayerError::Other(_) => { - Error::FlushAncestor(value) - } - } - } -} - -impl From for Error { - fn from(value: GetActiveTenantError) -> Self { - use pageserver_api::models::TenantState; - use GetActiveTenantError::*; - - match value { - Cancelled | WillNotBecomeActive(TenantState::Stopping { .. }) | SwitchedTenant => { - Error::ShuttingDown - } - WaitForActiveTimeout { .. 
} | NotFound(_) | Broken(_) | WillNotBecomeActive(_) => { - // NotFound seems out-of-place - Error::WaitToActivate(value) - } - } - } -} - pub(crate) enum Progress { Prepared(Attempt, PreparedTimelineDetach), Done(AncestorDetached), @@ -236,7 +215,7 @@ pub(super) async fn prepare( let attempt = start_new_attempt(detached, tenant).await?; - utils::pausable_failpoint!("timeline-detach-ancestor::before_starting_after_locking_pausable"); + utils::pausable_failpoint!("timeline-detach-ancestor::before_starting_after_locking-pausable"); fail::fail_point!( "timeline-detach-ancestor::before_starting_after_locking", @@ -265,7 +244,17 @@ pub(super) async fn prepare( } }; - res?; + res.map_err(|e| { + use FlushLayerError::*; + match e { + Cancelled | NotRunning(_) => { + // FIXME(#6424): technically statically unreachable right now, given how we never + // drop the sender + Error::ShuttingDown + } + CreateImageLayersError(_) | Other(_) => Error::Prepare(e.into()), + } + })?; // we do not need to wait for uploads to complete but we do need `struct Layer`, // copying delta prefix is unsupported currently for `InMemoryLayer`. @@ -346,7 +335,7 @@ pub(super) async fn prepare( } Ok(Ok(None)) => {} Ok(Err(e)) => return Err(e), - Err(je) => return Err(Unexpected(je.into())), + Err(je) => return Err(Error::Prepare(je.into())), } } @@ -394,7 +383,7 @@ pub(super) async fn prepare( Ok(Err(failed)) => { return Err(failed); } - Err(je) => return Err(Unexpected(je.into())), + Err(je) => return Err(Error::Prepare(je.into())), } } @@ -416,8 +405,7 @@ async fn start_new_attempt(detached: &Timeline, tenant: &Tenant) -> Result Result, Error> { - use Error::UploadRewritten; let copied = copy_lsn_prefix(end_lsn, layer, target, ctx).await?; let Some(copied) = copied else { return Ok(None); }; - // FIXME: better shuttingdown error target .remote_client .upload_layer_file(&copied, cancel) .await - .map_err(UploadRewritten)?; + .map_err(|e| Error::launder(e, Error::Prepare))?; Ok(Some(copied.into())) } @@ -569,10 +555,8 @@ async fn copy_lsn_prefix( target_timeline: &Arc, ctx: &RequestContext, ) -> Result, Error> { - use Error::{CopyDeltaPrefix, RewrittenDeltaDownloadFailed, ShuttingDown}; - if target_timeline.cancel.is_cancelled() { - return Err(ShuttingDown); + return Err(Error::ShuttingDown); } tracing::debug!(%layer, %end_lsn, "copying lsn prefix"); @@ -586,18 +570,22 @@ async fn copy_lsn_prefix( ctx, ) .await - .map_err(CopyDeltaPrefix)?; + .with_context(|| format!("prepare to copy lsn prefix of ancestors {layer}")) + .map_err(Error::Prepare)?; - let resident = layer - .download_and_keep_resident() - .await - // likely shutdown - .map_err(RewrittenDeltaDownloadFailed)?; + let resident = layer.download_and_keep_resident().await.map_err(|e| { + if e.is_cancelled() { + Error::ShuttingDown + } else { + Error::Prepare(e.into()) + } + })?; let records = resident .copy_delta_prefix(&mut writer, end_lsn, ctx) .await - .map_err(CopyDeltaPrefix)?; + .with_context(|| format!("copy lsn prefix of ancestors {layer}")) + .map_err(Error::Prepare)?; drop(resident); @@ -615,9 +603,9 @@ async fn copy_lsn_prefix( let (desc, path) = writer .finish(reused_highest_key, ctx) .await - .map_err(CopyDeltaPrefix)?; + .map_err(Error::Prepare)?; let copied = Layer::finish_creating(target_timeline.conf, target_timeline, desc, &path) - .map_err(CopyDeltaPrefix)?; + .map_err(Error::Prepare)?; tracing::debug!(%layer, %copied, "new layer produced"); @@ -633,8 +621,6 @@ async fn remote_copy( generation: Generation, cancel: &CancellationToken, ) -> Result { - 
use Error::CopyFailed; - // depending if Layer::keep_resident we could hardlink let mut metadata = adopted.metadata(); @@ -648,13 +634,12 @@ async fn remote_copy( metadata, ); - // FIXME: better shuttingdown error adoptee .remote_client .copy_timeline_layer(adopted, &owned, cancel) .await .map(move |()| owned) - .map_err(CopyFailed) + .map_err(|e| Error::launder(e, Error::Prepare)) } pub(crate) enum DetachingAndReparenting { @@ -698,7 +683,7 @@ pub(super) async fn detach_and_reparent( tenant: &Tenant, prepared: PreparedTimelineDetach, _ctx: &RequestContext, -) -> Result { +) -> Result { let PreparedTimelineDetach { layers } = prepared; #[derive(Debug)] @@ -783,7 +768,8 @@ pub(super) async fn detach_and_reparent( (ancestor.timeline_id, ancestor_lsn), ) .await - .context("publish layers and detach ancestor")?; + .context("publish layers and detach ancestor") + .map_err(|e| Error::launder(e, Error::DetachReparent))?; tracing::info!( ancestor=%ancestor.timeline_id, @@ -927,8 +913,7 @@ pub(super) async fn complete( crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor, ) .await - // FIXME: better error - .map_err(Error::Unexpected)?; + .map_err(|e| Error::launder(e, Error::Complete))?; Ok(()) } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index ee8e9ac5a1..ef4cd91efd 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -3006,8 +3006,13 @@ impl Service { Error::ApiError(StatusCode::BAD_REQUEST, msg) => { ApiError::BadRequest(anyhow::anyhow!("{node}: {msg}")) } + Error::ApiError(StatusCode::INTERNAL_SERVER_ERROR, msg) => { + // avoid turning these into conflicts to remain compatible with + // pageservers, 500 errors are sadly retryable with timeline ancestor + // detach + ApiError::InternalServerError(anyhow::anyhow!("{node}: {msg}")) + } // rest can be mapped as usual - // FIXME: this converts some 500 to 409 which is not per openapi other => passthrough_api_error(&node, other), } }) @@ -3041,6 +3046,8 @@ impl Service { ?mismatching, "shards returned different results" ); + + return Err(ApiError::InternalServerError(anyhow::anyhow!("pageservers returned mixed results for ancestor detach; manual intervention is required."))); } Ok(any.1) diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 902457c2ac..82fc26126d 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -97,7 +97,7 @@ def test_ancestor_detach_branched_from( client.timeline_checkpoint(env.initial_tenant, env.initial_timeline) ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(8192, 16383) g(i);") - wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + flush_ep_to_pageserver(env, ep, env.initial_tenant, env.initial_timeline) deltas = client.layer_map_info(env.initial_tenant, env.initial_timeline).delta_layers() # there is also the in-mem layer, but ignore it for now @@ -452,6 +452,9 @@ def test_compaction_induced_by_detaches_in_history( } ) env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + env.pageserver.allowed_errors.append( + ".*await_initial_logical_size: can't get semaphore cancel token, skipping" + ) client = env.pageserver.http_client() def delta_layers(timeline_id: TimelineId): @@ -524,6 +527,7 @@ def test_compaction_induced_by_detaches_in_history( assert len([filter(lambda x: x.l0, delta_layers(branch_timeline_id))]) == 1 skip_main 
= branches[1:] + branch_lsn = client.timeline_detail(env.initial_tenant, branch_timeline_id)["ancestor_lsn"] # take the fullbackup before and after inheriting the new L0s @@ -532,6 +536,13 @@ def test_compaction_induced_by_detaches_in_history( env.pageserver, env.initial_tenant, branch_timeline_id, branch_lsn, fullbackup_before ) + # force initial logical sizes, so we can evict all layers from all + # timelines and exercise on-demand download for copy lsn prefix + client.timeline_detail( + env.initial_tenant, env.initial_timeline, force_await_initial_logical_size=True + ) + client.evict_all_layers(env.initial_tenant, env.initial_timeline) + for _, timeline_id in skip_main: reparented = client.detach_ancestor(env.initial_tenant, timeline_id) assert reparented == set(), "we have no earlier branches at any level" @@ -705,7 +716,7 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder): log.info(f"stuck pageserver is id={stuck.id}") stuck_http = stuck.http_client() stuck_http.configure_failpoints( - ("timeline-detach-ancestor::before_starting_after_locking_pausable", "pause") + ("timeline-detach-ancestor::before_starting_after_locking-pausable", "pause") ) restarted = pageservers[int(shards[1]["node_id"])] @@ -716,7 +727,7 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder): restarted_http = restarted.http_client() restarted_http.configure_failpoints( [ - ("timeline-detach-ancestor::before_starting_after_locking_pausable", "pause"), + ("timeline-detach-ancestor::before_starting_after_locking-pausable", "pause"), ] ) @@ -734,7 +745,7 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder): target.detach_ancestor(env.initial_tenant, branch_timeline_id, timeout=1) stuck_http.configure_failpoints( - ("timeline-detach-ancestor::before_starting_after_locking_pausable", "off") + ("timeline-detach-ancestor::before_starting_after_locking-pausable", "off") ) barrier = threading.Barrier(2) @@ -753,7 +764,7 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder): # we have 10s, lets use 1/2 of that to help the shutdown start time.sleep(5) restarted_http.configure_failpoints( - ("timeline-detach-ancestor::before_starting_after_locking_pausable", "off") + ("timeline-detach-ancestor::before_starting_after_locking-pausable", "off") ) fut.result() @@ -806,7 +817,7 @@ def test_timeline_detach_ancestor_interrupted_by_deletion( after starting the detach. 
What remains not tested by this: - - shutdown winning over complete + - shutdown winning over complete, see test_timeline_is_deleted_before_timeline_detach_ancestor_completes """ if sharded and mode == "delete_tenant": @@ -833,7 +844,7 @@ def test_timeline_detach_ancestor_interrupted_by_deletion( detached_timeline = env.neon_cli.create_branch("detached soon", "main") - pausepoint = "timeline-detach-ancestor::before_starting_after_locking_pausable" + pausepoint = "timeline-detach-ancestor::before_starting_after_locking-pausable" env.storage_controller.reconcile_until_idle() shards = env.storage_controller.locate(env.initial_tenant) @@ -931,7 +942,7 @@ def test_timeline_detach_ancestor_interrupted_by_deletion( _, offset = other.assert_log_contains(".* gc_loop.*: 1 timelines need GC", offset) -@pytest.mark.parametrize("mode", ["delete_reparentable_timeline"]) +@pytest.mark.parametrize("mode", ["delete_reparentable_timeline", "create_reparentable_timeline"]) def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnvBuilder, mode: str): """ Technically possible storage controller concurrent interleaving timeline @@ -943,12 +954,6 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv must be detached. """ - assert ( - mode == "delete_reparentable_timeline" - ), "only one now, but creating reparentable timelines cannot be supported even with gc blocking" - # perhaps it could be supported by always doing this for the shard0 first, and after that for others. - # when we run shard0 to completion, we can use it's timelines to restrict which can be reparented. - shard_count = 2 neon_env_builder.num_pageservers = shard_count env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) @@ -980,14 +985,21 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv for ps, shard_id in [(pageservers[int(x["node_id"])], x["shard_id"]) for x in shards]: ps.http_client().timeline_checkpoint(shard_id, env.initial_timeline) - first_branch = env.neon_cli.create_branch( - "first_branch", ancestor_branch_name="main", ancestor_start_lsn=first_branch_lsn - ) + def create_reparentable_timeline() -> TimelineId: + return env.neon_cli.create_branch( + "first_branch", ancestor_branch_name="main", ancestor_start_lsn=first_branch_lsn + ) + + if mode == "delete_reparentable_timeline": + first_branch = create_reparentable_timeline() + else: + first_branch = None + detached_branch = env.neon_cli.create_branch( "detached_branch", ancestor_branch_name="main", ancestor_start_lsn=detached_branch_lsn ) - pausepoint = "timeline-detach-ancestor::before_starting_after_locking_pausable" + pausepoint = "timeline-detach-ancestor::before_starting_after_locking-pausable" stuck = pageservers[int(shards[0]["node_id"])] stuck_http = stuck.http_client().without_status_retrying() @@ -999,12 +1011,6 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv (pausepoint, "pause"), ) - # noticed a surprising 409 if the other one would fail instead - # victim_http.configure_failpoints([ - # (pausepoint, "pause"), - # ("timeline-detach-ancestor::before_starting_after_locking", "return"), - # ]) - # interleaving a create_timeline which could be reparented will produce two # permanently different reparentings: one node has reparented, other has # not @@ -1023,6 +1029,7 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv assert detail.get("ancestor_lsn") is None def first_branch_gone(): + assert first_branch 
is not None
         try:
             env.storage_controller.pageserver_api().timeline_detail(
                 env.initial_tenant, first_branch
@@ -1043,42 +1050,178 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv
             stuck_http.configure_failpoints((pausepoint, "off"))
             wait_until(10, 1.0, first_completed)
 
-            # if we would let victim fail, for some reason there'd be a 409 response instead of 500
-            # victim_http.configure_failpoints((pausepoint, "off"))
-            # with pytest.raises(PageserverApiException, match=".* 500 Internal Server Error failpoint: timeline-detach-ancestor::before_starting_after_locking") as exc:
-            #     fut.result()
-            # assert exc.value.status_code == 409
-
-            env.storage_controller.pageserver_api().timeline_delete(
-                env.initial_tenant, first_branch
-            )
-            victim_http.configure_failpoints((pausepoint, "off"))
-            wait_until(10, 1.0, first_branch_gone)
+            if mode == "delete_reparentable_timeline":
+                assert first_branch is not None
+                env.storage_controller.pageserver_api().timeline_delete(
+                    env.initial_tenant, first_branch
+                )
+                victim_http.configure_failpoints((pausepoint, "off"))
+                wait_until(10, 1.0, first_branch_gone)
+            elif mode == "create_reparentable_timeline":
+                first_branch = create_reparentable_timeline()
+                victim_http.configure_failpoints((pausepoint, "off"))
+            else:
+                raise RuntimeError(f"{mode}")
 
             # it now passes, and we should get an error message about mixed reparenting as the stuck still had something to reparent
-            fut.result()
+            mixed_results = "pageservers returned mixed results for ancestor detach; manual intervention is required."
+            with pytest.raises(PageserverApiException, match=mixed_results):
+                fut.result()
 
             msg, offset = env.storage_controller.assert_log_contains(
                 ".*/timeline/\\S+/detach_ancestor.*: shards returned different results matching=0 .*"
             )
-            log.info(f"expected error message: {msg}")
-            env.storage_controller.allowed_errors.append(
-                ".*: shards returned different results matching=0 .*"
+            log.info(f"expected error message: {msg.rstrip()}")
+            env.storage_controller.allowed_errors.extend(
+                [
+                    ".*: shards returned different results matching=0 .*",
+                    f".*: InternalServerError\\({mixed_results}",
+                ]
             )
 
-            detach_timeline()
+            if mode == "create_reparentable_timeline":
+                with pytest.raises(PageserverApiException, match=mixed_results):
+                    detach_timeline()
+            else:
+                # it is a bit of a shame to flag it and then it succeeds, but most
+                # likely there would be a retry loop which would take care of
+                # this in cplane
+                detach_timeline()
 
-            # FIXME: perhaps the above should be automatically retried, if we get mixed results?
- not_found = env.storage_controller.log_contains( + retried = env.storage_controller.log_contains( ".*/timeline/\\S+/detach_ancestor.*: shards returned different results matching=0 .*", - offset=offset, + offset, ) - - assert not_found is None + if mode == "delete_reparentable_timeline": + assert ( + retried is None + ), "detaching should had converged after both nodes saw the deletion" + elif mode == "create_reparentable_timeline": + assert retried is not None, "detaching should not have converged" + _, offset = retried finally: stuck_http.configure_failpoints((pausepoint, "off")) victim_http.configure_failpoints((pausepoint, "off")) + if mode == "create_reparentable_timeline": + assert first_branch is not None + # now we have mixed ancestry + assert ( + TimelineId( + stuck_http.timeline_detail(shards[0]["shard_id"], first_branch)[ + "ancestor_timeline_id" + ] + ) + == env.initial_timeline + ) + assert ( + TimelineId( + victim_http.timeline_detail(shards[-1]["shard_id"], first_branch)[ + "ancestor_timeline_id" + ] + ) + == detached_branch + ) + + # make sure we are still able to repair this by detaching the ancestor on the storage controller in case it ever happens + # if the ancestor would be deleted, we would partially fail, making deletion stuck. + env.storage_controller.pageserver_api().detach_ancestor(env.initial_tenant, first_branch) + + # and we should now have good results + not_found = env.storage_controller.log_contains( + ".*/timeline/\\S+/detach_ancestor.*: shards returned different results matching=0 .*", + offset, + ) + + assert not_found is None + assert ( + stuck_http.timeline_detail(shards[0]["shard_id"], first_branch)["ancestor_timeline_id"] + is None + ) + assert ( + victim_http.timeline_detail(shards[-1]["shard_id"], first_branch)[ + "ancestor_timeline_id" + ] + is None + ) + + +def test_retryable_500_hit_through_storcon_during_timeline_detach_ancestor( + neon_env_builder: NeonEnvBuilder, +): + shard_count = 2 + neon_env_builder.num_pageservers = shard_count + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + + for ps in env.pageservers: + ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + pageservers = dict((int(p.id), p) for p in env.pageservers) + + env.storage_controller.reconcile_until_idle() + shards = env.storage_controller.locate(env.initial_tenant) + assert len(set(x["node_id"] for x in shards)) == shard_count + + detached_branch = env.neon_cli.create_branch("detached_branch", ancestor_branch_name="main") + + pausepoint = "timeline-detach-ancestor::before_starting_after_locking-pausable" + failpoint = "timeline-detach-ancestor::before_starting_after_locking" + + stuck = pageservers[int(shards[0]["node_id"])] + stuck_http = stuck.http_client().without_status_retrying() + stuck_http.configure_failpoints( + (pausepoint, "pause"), + ) + + env.storage_controller.allowed_errors.append( + f".*Error processing HTTP request: .* failpoint: {failpoint}" + ) + http = env.storage_controller.pageserver_api() + + victim = pageservers[int(shards[-1]["node_id"])] + victim.allowed_errors.append( + f".*Error processing HTTP request: InternalServerError\\(failpoint: {failpoint}" + ) + victim_http = victim.http_client().without_status_retrying() + victim_http.configure_failpoints([(pausepoint, "pause"), (failpoint, "return")]) + + def detach_timeline(): + http.detach_ancestor(env.initial_tenant, detached_branch) + + def paused_at_failpoint(): + stuck.assert_log_contains(f"at failpoint {pausepoint}") + victim.assert_log_contains(f"at failpoint 
{pausepoint}") + + def first_completed(): + detail = stuck_http.timeline_detail(shards[0]["shard_id"], detached_branch) + log.info(detail) + assert detail.get("ancestor_lsn") is None + + with ThreadPoolExecutor(max_workers=1) as pool: + try: + fut = pool.submit(detach_timeline) + wait_until(10, 1.0, paused_at_failpoint) + + # let stuck complete + stuck_http.configure_failpoints((pausepoint, "off")) + wait_until(10, 1.0, first_completed) + + victim_http.configure_failpoints((pausepoint, "off")) + + with pytest.raises( + PageserverApiException, + match=f".*failpoint: {failpoint}", + ) as exc: + fut.result() + assert exc.value.status_code == 500 + + finally: + stuck_http.configure_failpoints((pausepoint, "off")) + victim_http.configure_failpoints((pausepoint, "off")) + + victim_http.configure_failpoints((failpoint, "off")) + detach_timeline() + def test_retried_detach_ancestor_after_failed_reparenting(neon_env_builder: NeonEnvBuilder): """ @@ -1169,7 +1312,7 @@ def test_retried_detach_ancestor_after_failed_reparenting(neon_env_builder: Neon match=".*failed to reparent all candidate timelines, please retry", ) as exc: http.detach_ancestor(env.initial_tenant, detached) - assert exc.value.status_code == 500 + assert exc.value.status_code == 503 # first round -- do more checking to make sure the gc gets paused try_detach() @@ -1323,14 +1466,11 @@ def test_timeline_is_deleted_before_timeline_detach_ancestor_completes( http.configure_failpoints((failpoint, "off")) - with pytest.raises(PageserverApiException) as exc: + with pytest.raises( + PageserverApiException, match="NotFound: Timeline .* was not found" + ) as exc: detach.result() - - # FIXME: this should be 404 but because there is another Anyhow conversion it is 500 - assert exc.value.status_code == 500 - env.pageserver.allowed_errors.append( - ".*Error processing HTTP request: InternalServerError\\(detached timeline was not found after restart" - ) + assert exc.value.status_code == 404 finally: http.configure_failpoints((failpoint, "off")) From 19d69d515cacc7287517371e3b39fe7f874b306f Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 14 Aug 2024 12:10:15 +0100 Subject: [PATCH 1415/1571] pageserver: evict covered layers earlier (#8679) ## Problem When pageservers do compaction, they frequently create image layers that make earlier layers un-needed for reads, but then keep those earlier layers around for 24 hours waiting for time-based eviction to expire them. Now that we track layer visibility, we can use it as an input to eviction, and avoid the 24 hour "disk bump" that happens around pageserver restarts. ## Summary of changes - During time-based eviction, if a layer is marked Covered, use the eviction period as the threshold: i.e. these layers get to remain resident for at least one iteration of the eviction loop, but then get evicted. With current settings this means they get evicted after 1h instead of 24h. - During disk usage eviction, prioritized evicting covered layers above all other layers. Caveats: - Using the period as the threshold for time based eviction in this case is a bit of a hack, but it avoids adding yet another configuration property, and in any case the value of a new property would be somewhat arbitrary: there's no "right" length of time to keep covered layers around just in case. 
- We had previously planned on removing time-based eviction: this change would motivate us to keep it around, but we can still simplify the code later to just do the eviction of covered layers, rather than applying a TTL policy to all layers. --- pageserver/src/disk_usage_eviction_task.rs | 54 +++++++++++++------ pageserver/src/tenant/secondary/downloader.rs | 5 +- pageserver/src/tenant/timeline.rs | 1 + .../src/tenant/timeline/eviction_task.rs | 20 ++++++- 4 files changed, 62 insertions(+), 18 deletions(-) diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 92dcf6ee61..5e4a49bc56 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -64,7 +64,7 @@ use crate::{ mgr::TenantManager, remote_timeline_client::LayerFileMetadata, secondary::SecondaryTenant, - storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName}, + storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName, LayerVisibilityHint}, }, CancellableTask, DiskUsageEvictionTask, }; @@ -114,7 +114,7 @@ fn default_highest_layer_count_loses_first() -> bool { } impl EvictionOrder { - fn sort(&self, candidates: &mut [(MinResidentSizePartition, EvictionCandidate)]) { + fn sort(&self, candidates: &mut [(EvictionPartition, EvictionCandidate)]) { use EvictionOrder::*; match self { @@ -644,6 +644,7 @@ pub(crate) struct EvictionCandidate { pub(crate) layer: EvictionLayer, pub(crate) last_activity_ts: SystemTime, pub(crate) relative_last_activity: finite_f32::FiniteF32, + pub(crate) visibility: LayerVisibilityHint, } impl std::fmt::Display for EvictionLayer { @@ -685,14 +686,22 @@ impl std::fmt::Debug for EvictionCandidate { } #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] -enum MinResidentSizePartition { +enum EvictionPartition { + // A layer that is un-wanted by the tenant: evict all these first, before considering + // any other layers + EvictNow, + + // Above the minimum size threshold: this layer is a candidate for eviction. Above, + + // Below the minimum size threshold: this layer should only be evicted if all the + // tenants' layers above the minimum size threshold have already been considered. Below, } enum EvictionCandidates { Cancelled, - Finished(Vec<(MinResidentSizePartition, EvictionCandidate)>), + Finished(Vec<(EvictionPartition, EvictionCandidate)>), } /// Gather the eviction candidates. @@ -890,8 +899,10 @@ async fn collect_eviction_candidates( max_layer_size }; - // Sort layers most-recently-used first, then partition by - // cumsum above/below min_resident_size. 
+ // Sort layers most-recently-used first, then calculate [`EvictionPartition`] for each layer, + // where the inputs are: + // - whether the layer is visible + // - whether the layer is above/below the min_resident_size cutline tenant_candidates .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts)); let mut cumsum: i128 = 0; @@ -908,12 +919,23 @@ async fn collect_eviction_candidates( candidate.relative_last_activity = eviction_order.relative_last_activity(total, i); - let partition = if cumsum > min_resident_size as i128 { - MinResidentSizePartition::Above - } else { - MinResidentSizePartition::Below + let partition = match candidate.visibility { + LayerVisibilityHint::Covered => { + // Covered layers are evicted first + EvictionPartition::EvictNow + } + LayerVisibilityHint::Visible => { + cumsum += i128::from(candidate.layer.get_file_size()); + + if cumsum > min_resident_size as i128 { + EvictionPartition::Above + } else { + // The most recent layers below the min_resident_size threshold + // are the last to be evicted. + EvictionPartition::Below + } + } }; - cumsum += i128::from(candidate.layer.get_file_size()); (partition, candidate) }); @@ -981,7 +1003,7 @@ async fn collect_eviction_candidates( // Secondary locations' layers are always considered above the min resident size, // i.e. secondary locations are permitted to be trimmed to zero layers if all // the layers have sufficiently old access times. - MinResidentSizePartition::Above, + EvictionPartition::Above, candidate, ) }); @@ -1009,7 +1031,9 @@ async fn collect_eviction_candidates( } } - debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below, + debug_assert!(EvictionPartition::Above < EvictionPartition::Below, + "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first"); + debug_assert!(EvictionPartition::EvictNow < EvictionPartition::Above, "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first"); eviction_order.sort(&mut candidates); @@ -1022,7 +1046,7 @@ async fn collect_eviction_candidates( /// /// Returns the amount of candidates selected, with the planned usage. 
fn select_victims( - candidates: &[(MinResidentSizePartition, EvictionCandidate)], + candidates: &[(EvictionPartition, EvictionCandidate)], usage_pre: U, ) -> VictimSelection { let mut usage_when_switched = None; @@ -1034,7 +1058,7 @@ fn select_victims( break; } - if partition == &MinResidentSizePartition::Below && usage_when_switched.is_none() { + if partition == &EvictionPartition::Below && usage_when_switched.is_none() { usage_when_switched = Some((usage_planned, i)); } diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 135e73b57f..8cff1d2864 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -22,7 +22,7 @@ use crate::{ FAILED_REMOTE_OP_RETRIES, }, span::debug_assert_current_span_has_tenant_id, - storage_layer::{layer::local_layer_path, LayerName}, + storage_layer::{layer::local_layer_path, LayerName, LayerVisibilityHint}, tasks::{warn_when_period_overrun, BackgroundLoopKind}, }, virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}, @@ -296,6 +296,9 @@ impl SecondaryDetail { }), last_activity_ts: ods.access_time, relative_last_activity: finite_f32::FiniteF32::ZERO, + // Secondary location layers are presumed visible, because Covered layers + // are excluded from the heatmap + visibility: LayerVisibilityHint::Visible, } })); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index c45d7431ec..a799ce764a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -5261,6 +5261,7 @@ impl Timeline { layer: layer.to_owned().into(), last_activity_ts, relative_last_activity: finite_f32::FiniteF32::ZERO, + visibility: layer.visibility(), } }) .collect(); diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 07d860eb80..eaa9c0ff62 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -30,7 +30,8 @@ use crate::{ pgdatadir_mapping::CollectKeySpaceError, task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, tenant::{ - tasks::BackgroundLoopKind, timeline::EvictionError, LogicalSizeCalculationCause, Tenant, + storage_layer::LayerVisibilityHint, tasks::BackgroundLoopKind, timeline::EvictionError, + LogicalSizeCalculationCause, Tenant, }, }; @@ -241,7 +242,22 @@ impl Timeline { } }; - no_activity_for > p.threshold + match layer.visibility() { + LayerVisibilityHint::Visible => { + // Usual case: a visible layer might be read any time, and we will keep it + // resident until it hits our configured TTL threshold. + no_activity_for > p.threshold + } + LayerVisibilityHint::Covered => { + // Covered layers: this is probably a layer that was recently covered by + // an image layer during compaction. We don't evict it immediately, but + // it doesn't stay resident for the full `threshold`: we just keep it + // for a shorter time in case + // - it is used for Timestamp->LSN lookups + // - a new branch is created in recent history which will read this layer + no_activity_for > p.period + } + } }) .cloned() .for_each(|layer| { From fc3d372f3ab52ee3d4b9df5fc047c1ab3b5e26b1 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 14 Aug 2024 12:27:52 +0100 Subject: [PATCH 1416/1571] CI(label-for-external-users): check membership using GitHub API (#8724) ## Problem `author_association` doesn't properly work if a GitHub user decides not to show affiliation with the org in their profile (i.e. 
if it's private)

## Summary of changes
- Call `/orgs/ORG/members/USERNAME` API to check whether a PR/issue
author is a member of the org
---
 .../workflows/label-for-external-users.yml    | 29 +++++++++++++++----
 1 file changed, 24 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/label-for-external-users.yml b/.github/workflows/label-for-external-users.yml
index 2f19a746e0..7cf5ee254c 100644
--- a/.github/workflows/label-for-external-users.yml
+++ b/.github/workflows/label-for-external-users.yml
@@ -15,15 +15,34 @@ env:
   LABEL: external
 
 jobs:
+  check-user:
+    runs-on: ubuntu-22.04
+
+    outputs:
+      is-member: ${{ steps.check-user.outputs.is-member }}
+
+    steps:
+      - name: Check whether `${{ github.actor }}` is a member of `${{ github.repository_owner }}`
+        id: check-user
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          if gh api -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" "/orgs/${GITHUB_REPOSITORY_OWNER}/members/${GITHUB_ACTOR}"; then
+            is_member=true
+          else
+            is_member=false
+          fi
+
+          echo "is-member=${is_member}" | tee -a ${GITHUB_OUTPUT}
+
   add-label:
-    # This workflow uses `author_association` for PRs and issues to determine if the user is an external user.
-    # Possible values for `author_association`: https://docs.github.com/en/graphql/reference/enums#commentauthorassociation
-    if: ${{ !contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event[github.event_name == 'pull_request' && 'pull_request' || 'issue'].author_association) }}
+    if: needs.check-user.outputs.is-member == 'false'
+    needs: [ check-user ]
 
     runs-on: ubuntu-22.04
     permissions:
-      pull-requests: write
-      issues: write
+      pull-requests: write # for `gh pr edit`
+      issues: write # for `gh issue edit`
 
     steps:
       - name: Label new ${{ github.event_name }}

From 6c9e3c95518306c45b5290c10dec5c0a53aaab2d Mon Sep 17 00:00:00 2001
From: Joonas Koivunen
Date: Wed, 14 Aug 2024 14:45:56 +0300
Subject: [PATCH 1417/1571] refactor: error/anyhow::Error wrapping (#8697)

We can get CompactionError::Other(Cancelled) via the error handling in a few
ways.
[evidence](https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8655/10301613380/index.html#suites/cae012a1e6acdd9fdd8b81541972b6ce/653a33de17802bb1/).
Hopefully fix it by:

1. replacing the `map_err` which hid `GetReadyAncestorError::Cancelled` with a
   `From<GetReadyAncestorError> for GetVectoredError` conversion
2. simplifying the code in pgdatadir_mapping to eliminate the token anyhow
   wrapping for deserialization errors
3. no longer wrapping GetVectoredError as anyhow errors
4. no longer wrapping PageReconstructError as anyhow errors

Additionally, produce warnings if we treat any other error (as was legal
before this PR) as a missing key.

Cc: #8708.
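To illustrate the conversion pattern, here is a minimal sketch with hypothetical
`LowLevelError`/`HighLevelError` enums standing in for the pageserver types (not
the actual code in this patch): the cancellation case keeps its own variant
instead of being wrapped into `anyhow::Error`, so callers can still match on it.

```rust
use thiserror::Error;

#[derive(Debug, Error)]
enum LowLevelError {
    #[error("operation was cancelled")]
    Cancelled,
    #[error("io failure")]
    Io(#[source] std::io::Error),
}

#[derive(Debug, Error)]
enum HighLevelError {
    #[error("cancelled")]
    Cancelled,
    #[error(transparent)]
    Other(#[from] anyhow::Error),
}

// Map the typed cancellation variant across explicitly; only the remaining
// variants get wrapped into `anyhow::Error`.
impl From<LowLevelError> for HighLevelError {
    fn from(value: LowLevelError) -> Self {
        match value {
            LowLevelError::Cancelled => HighLevelError::Cancelled,
            other => HighLevelError::Other(anyhow::Error::new(other)),
        }
    }
}

fn low_level_call() -> Result<(), LowLevelError> {
    Err(LowLevelError::Cancelled)
}

fn high_level_call() -> Result<(), HighLevelError> {
    // `?` uses the `From` impl above, so a cancelled low-level call surfaces
    // as `HighLevelError::Cancelled` rather than `Other(..)`.
    low_level_call()?;
    Ok(())
}
```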
--- libs/postgres_ffi/src/lib.rs | 4 +- pageserver/src/http/routes.rs | 6 +- pageserver/src/pgdatadir_mapping.rs | 90 +++++++------------ pageserver/src/tenant.rs | 11 ++- .../src/tenant/storage_layer/delta_layer.rs | 4 +- pageserver/src/tenant/storage_layer/layer.rs | 4 +- pageserver/src/tenant/timeline.rs | 71 ++++++++++----- pageserver/src/tenant/timeline/compaction.rs | 2 +- pageserver/src/walingest.rs | 4 +- pageserver/src/walrecord.rs | 2 +- 10 files changed, 101 insertions(+), 97 deletions(-) diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 729f57f829..0940ad207f 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -143,8 +143,8 @@ pub use v14::xlog_utils::XLogFileName; pub use v14::bindings::DBState_DB_SHUTDOWNED; -pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> anyhow::Result { - dispatch_pgversion!(version, Ok(pgv::bindings::bkpimg_is_compressed(bimg_info))) +pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> bool { + dispatch_pgversion!(version, pgv::bindings::bkpimg_is_compressed(bimg_info)) } pub fn generate_wal_segment( diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index d209f4eced..a4da8506d6 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -178,10 +178,8 @@ fn check_permission(request: &Request, tenant_id: Option) -> Res impl From for ApiError { fn from(pre: PageReconstructError) -> ApiError { match pre { - PageReconstructError::Other(pre) => ApiError::InternalServerError(pre), - PageReconstructError::MissingKey(e) => { - ApiError::InternalServerError(anyhow::anyhow!("{e}")) - } + PageReconstructError::Other(other) => ApiError::InternalServerError(other), + PageReconstructError::MissingKey(e) => ApiError::InternalServerError(e.into()), PageReconstructError::Cancelled => ApiError::Cancelled, PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()), PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre), diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 85f3a6e0fb..4f7eb1a00c 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -287,10 +287,7 @@ impl Timeline { // then check if the database was already initialized. // get_rel_exists can be called before dbdir is created. let buf = version.get(self, DBDIR_KEY, ctx).await?; - let dbdirs = match DbDirectory::des(&buf).context("deserialization failure") { - Ok(dir) => Ok(dir.dbdirs), - Err(e) => Err(PageReconstructError::from(e)), - }?; + let dbdirs = DbDirectory::des(&buf)?.dbdirs; if !dbdirs.contains_key(&(tag.spcnode, tag.dbnode)) { return Ok(false); } @@ -298,13 +295,8 @@ impl Timeline { let key = rel_dir_to_key(tag.spcnode, tag.dbnode); let buf = version.get(self, key, ctx).await?; - match RelDirectory::des(&buf).context("deserialization failure") { - Ok(dir) => { - let exists = dir.rels.contains(&(tag.relnode, tag.forknum)); - Ok(exists) - } - Err(e) => Err(PageReconstructError::from(e)), - } + let dir = RelDirectory::des(&buf)?; + Ok(dir.rels.contains(&(tag.relnode, tag.forknum))) } /// Get a list of all existing relations in given tablespace and database. 
@@ -323,20 +315,16 @@ impl Timeline { let key = rel_dir_to_key(spcnode, dbnode); let buf = version.get(self, key, ctx).await?; - match RelDirectory::des(&buf).context("deserialization failure") { - Ok(dir) => { - let rels: HashSet = - HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag { - spcnode, - dbnode, - relnode: *relnode, - forknum: *forknum, - })); + let dir = RelDirectory::des(&buf)?; + let rels: HashSet = + HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag { + spcnode, + dbnode, + relnode: *relnode, + forknum: *forknum, + })); - Ok(rels) - } - Err(e) => Err(PageReconstructError::from(e)), - } + Ok(rels) } /// Get the whole SLRU segment @@ -398,13 +386,8 @@ impl Timeline { let key = slru_dir_to_key(kind); let buf = version.get(self, key, ctx).await?; - match SlruSegmentDirectory::des(&buf).context("deserialization failure") { - Ok(dir) => { - let exists = dir.segments.contains(&segno); - Ok(exists) - } - Err(e) => Err(PageReconstructError::from(e)), - } + let dir = SlruSegmentDirectory::des(&buf)?; + Ok(dir.segments.contains(&segno)) } /// Locate LSN, such that all transactions that committed before @@ -620,10 +603,7 @@ impl Timeline { let key = slru_dir_to_key(kind); let buf = version.get(self, key, ctx).await?; - match SlruSegmentDirectory::des(&buf).context("deserialization failure") { - Ok(dir) => Ok(dir.segments), - Err(e) => Err(PageReconstructError::from(e)), - } + Ok(SlruSegmentDirectory::des(&buf)?.segments) } pub(crate) async fn get_relmap_file( @@ -647,10 +627,7 @@ impl Timeline { // fetch directory entry let buf = self.get(DBDIR_KEY, lsn, ctx).await?; - match DbDirectory::des(&buf).context("deserialization failure") { - Ok(dir) => Ok(dir.dbdirs), - Err(e) => Err(PageReconstructError::from(e)), - } + Ok(DbDirectory::des(&buf)?.dbdirs) } pub(crate) async fn get_twophase_file( @@ -672,10 +649,7 @@ impl Timeline { // fetch directory entry let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?; - match TwoPhaseDirectory::des(&buf).context("deserialization failure") { - Ok(dir) => Ok(dir.xids), - Err(e) => Err(PageReconstructError::from(e)), - } + Ok(TwoPhaseDirectory::des(&buf)?.xids) } pub(crate) async fn get_control_file( @@ -700,10 +674,7 @@ impl Timeline { ctx: &RequestContext, ) -> Result, PageReconstructError> { match self.get(AUX_FILES_KEY, lsn, ctx).await { - Ok(buf) => match AuxFilesDirectory::des(&buf).context("deserialization failure") { - Ok(dir) => Ok(dir.files), - Err(e) => Err(PageReconstructError::from(e)), - }, + Ok(buf) => Ok(AuxFilesDirectory::des(&buf)?.files), Err(e) => { // This is expected: historical databases do not have the key. 
debug!("Failed to get info about AUX files: {}", e); @@ -719,13 +690,14 @@ impl Timeline { ) -> Result, PageReconstructError> { let kv = self .scan(KeySpace::single(Key::metadata_aux_key_range()), lsn, ctx) - .await - .context("scan")?; + .await?; let mut result = HashMap::new(); let mut sz = 0; for (_, v) in kv { - let v = v.context("get value")?; - let v = aux_file::decode_file_value_bytes(&v).context("value decode")?; + let v = v?; + let v = aux_file::decode_file_value_bytes(&v) + .context("value decode") + .map_err(PageReconstructError::Other)?; for (fname, content) in v { sz += fname.len(); sz += content.len(); @@ -793,11 +765,10 @@ impl Timeline { ) -> Result, PageReconstructError> { let kv = self .scan(KeySpace::single(repl_origin_key_range()), lsn, ctx) - .await - .context("scan")?; + .await?; let mut result = HashMap::new(); for (k, v) in kv { - let v = v.context("get value")?; + let v = v?; let origin_id = k.field6 as RepOriginId; let origin_lsn = Lsn::des(&v).unwrap(); if origin_lsn != Lsn::INVALID { @@ -1733,12 +1704,17 @@ impl<'a> DatadirModification<'a> { // the original code assumes all other errors are missing keys. Therefore, we keep the code path // the same for now, though in theory, we should only match the `MissingKey` variant. Err( - PageReconstructError::Other(_) + e @ (PageReconstructError::Other(_) | PageReconstructError::WalRedo(_) - | PageReconstructError::MissingKey { .. }, + | PageReconstructError::MissingKey(_)), ) => { // Key is missing, we must insert an image as the basis for subsequent deltas. + if !matches!(e, PageReconstructError::MissingKey(_)) { + let e = utils::error::report_compact_sources(&e); + tracing::warn!("treating error as if it was a missing key: {}", e); + } + let mut dir = AuxFilesDirectory { files: HashMap::new(), }; @@ -1893,7 +1869,7 @@ impl<'a> DatadirModification<'a> { // work directly with Images, and we never need to read actual // data pages. We could handle this if we had to, by calling // the walredo manager, but let's keep it simple for now. - Err(PageReconstructError::from(anyhow::anyhow!( + Err(PageReconstructError::Other(anyhow::anyhow!( "unexpected pending WAL record" ))) }; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index b065f58382..8ab8d08ce1 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -4491,10 +4491,13 @@ mod tests { // This needs to traverse to the parent, and fails. 
let err = newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await.unwrap_err(); - assert!(err.to_string().starts_with(&format!( - "Bad state on timeline {}: Broken", - tline.timeline_id - ))); + assert!( + err.to_string().starts_with(&format!( + "bad state on timeline {}: Broken", + tline.timeline_id + )), + "{err}" + ); Ok(()) } diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index f4e965b99a..0ed2f72c3f 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -1020,7 +1020,7 @@ impl DeltaLayerInner { for (_, blob_meta) in read.blobs_at.as_slice() { reconstruct_state.on_key_error( blob_meta.key, - PageReconstructError::from(anyhow!( + PageReconstructError::Other(anyhow!( "Failed to read blobs from virtual file {}: {}", self.file.path, kind @@ -1047,7 +1047,7 @@ impl DeltaLayerInner { Err(e) => { reconstruct_state.on_key_error( meta.meta.key, - PageReconstructError::from(anyhow!(e).context(format!( + PageReconstructError::Other(anyhow!(e).context(format!( "Failed to deserialize blob from virtual file {}", self.file.path, ))), diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 9c31d5dc3f..774f97e1d9 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -312,7 +312,9 @@ impl Layer { .get_or_maybe_download(true, Some(ctx)) .await .map_err(|err| match err { - DownloadError::DownloadCancelled => GetVectoredError::Cancelled, + DownloadError::TimelineShutdown | DownloadError::DownloadCancelled => { + GetVectoredError::Cancelled + } other => GetVectoredError::Other(anyhow::anyhow!(other)), })?; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index a799ce764a..d437724673 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -511,7 +511,7 @@ pub(crate) struct TimelineVisitOutcome { #[derive(thiserror::Error, Debug)] pub(crate) enum PageReconstructError { #[error(transparent)] - Other(#[from] anyhow::Error), + Other(anyhow::Error), #[error("Ancestor LSN wait error: {0}")] AncestorLsnTimeout(WaitLsnError), @@ -527,6 +527,22 @@ pub(crate) enum PageReconstructError { MissingKey(MissingKeyError), } +impl From for PageReconstructError { + fn from(value: anyhow::Error) -> Self { + // with walingest.rs many PageReconstructError are wrapped in as anyhow::Error + match value.downcast::() { + Ok(pre) => pre, + Err(other) => PageReconstructError::Other(other), + } + } +} + +impl From for PageReconstructError { + fn from(value: utils::bin_ser::DeserializeError) -> Self { + PageReconstructError::Other(anyhow::Error::new(value).context("deserialization failure")) + } +} + impl From for PageReconstructError { fn from(_: layer_manager::Shutdown) -> Self { PageReconstructError::Cancelled @@ -546,6 +562,7 @@ impl From for GetVectoredError { } } +#[derive(thiserror::Error)] pub struct MissingKeyError { key: Key, shard: ShardNumber, @@ -585,11 +602,8 @@ impl PageReconstructError { pub(crate) fn is_stopping(&self) -> bool { use PageReconstructError::*; match self { - Other(_) => false, - AncestorLsnTimeout(_) => false, Cancelled => true, - WalRedo(_) => false, - MissingKey { .. 
} => false, + Other(_) | AncestorLsnTimeout(_) | WalRedo(_) | MissingKey(_) => false, } } } @@ -599,11 +613,11 @@ pub(crate) enum CreateImageLayersError { #[error("timeline shutting down")] Cancelled, - #[error(transparent)] - GetVectoredError(GetVectoredError), + #[error("read failed")] + GetVectoredError(#[source] GetVectoredError), - #[error(transparent)] - PageReconstructError(PageReconstructError), + #[error("reconstruction failed")] + PageReconstructError(#[source] PageReconstructError), #[error(transparent)] Other(#[from] anyhow::Error), @@ -627,10 +641,10 @@ pub(crate) enum FlushLayerError { // Arc<> the following non-clonable error types: we must be Clone-able because the flush error is propagated from the flush // loop via a watch channel, where we can only borrow it. - #[error(transparent)] + #[error("create image layers (shared)")] CreateImageLayersError(Arc), - #[error(transparent)] + #[error("other (shared)")] Other(#[from] Arc), } @@ -663,34 +677,46 @@ pub(crate) enum GetVectoredError { #[error("timeline shutting down")] Cancelled, - #[error("Requested too many keys: {0} > {}", Timeline::MAX_GET_VECTORED_KEYS)] + #[error("requested too many keys: {0} > {}", Timeline::MAX_GET_VECTORED_KEYS)] Oversized(u64), - #[error("Requested at invalid LSN: {0}")] + #[error("requested at invalid LSN: {0}")] InvalidLsn(Lsn), - #[error("Requested key not found: {0}")] + #[error("requested key not found: {0}")] MissingKey(MissingKeyError), - #[error(transparent)] - GetReadyAncestorError(GetReadyAncestorError), + #[error("ancestry walk")] + GetReadyAncestorError(#[source] GetReadyAncestorError), #[error(transparent)] Other(#[from] anyhow::Error), } +impl From for GetVectoredError { + fn from(value: GetReadyAncestorError) -> Self { + use GetReadyAncestorError::*; + match value { + Cancelled => GetVectoredError::Cancelled, + AncestorLsnTimeout(_) | BadState { .. 
} => { + GetVectoredError::GetReadyAncestorError(value) + } + } + } +} + #[derive(thiserror::Error, Debug)] pub(crate) enum GetReadyAncestorError { - #[error("Ancestor LSN wait error: {0}")] + #[error("ancestor LSN wait error")] AncestorLsnTimeout(#[from] WaitLsnError), - #[error("Bad state on timeline {timeline_id}: {state:?}")] + #[error("bad state on timeline {timeline_id}: {state:?}")] BadState { timeline_id: TimelineId, state: TimelineState, }, - #[error("Cancelled")] + #[error("cancelled")] Cancelled, } @@ -3046,8 +3072,7 @@ impl Timeline { cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1)); timeline_owned = timeline .get_ready_ancestor_timeline(ancestor_timeline, ctx) - .await - .map_err(GetVectoredError::GetReadyAncestorError)?; + .await?; timeline = &*timeline_owned; }; @@ -3944,7 +3969,7 @@ impl Timeline { warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}"); ZERO_PAGE.clone() } else { - return Err(CreateImageLayersError::PageReconstructError(err)); + return Err(CreateImageLayersError::from(err)); } } }; @@ -4004,7 +4029,7 @@ impl Timeline { let mut total_kb_retrieved = 0; let mut total_keys_retrieved = 0; for (k, v) in data { - let v = v.map_err(CreateImageLayersError::PageReconstructError)?; + let v = v?; total_kb_retrieved += KEY_SIZE + v.len(); total_keys_retrieved += 1; new_data.insert(k, v); diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 9ac0086cde..e24459e7b9 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -2325,7 +2325,7 @@ impl CompactionJobExecutor for TimelineAdaptor { key_range, )) } else { - // The current compaction implementatin only ever requests the key space + // The current compaction implementation only ever requests the key space // at the compaction end LSN. anyhow::bail!("keyspace not available for requested lsn"); } diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 804c7fca97..8425528740 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -515,7 +515,7 @@ impl WalIngest { && (decoded.xl_info == pg_constants::XLOG_FPI || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT) // compression of WAL is not yet supported: fall back to storing the original WAL record - && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, modification.tline.pg_version)? 
+ && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, modification.tline.pg_version) // do not materialize null pages because them most likely be soon replaced with real data && blk.bimg_len != 0 { @@ -1702,7 +1702,7 @@ async fn get_relsize( modification: &DatadirModification<'_>, rel: RelTag, ctx: &RequestContext, -) -> anyhow::Result { +) -> Result { let nblocks = if !modification .tline .get_rel_exists(rel, Version::Modified(modification), ctx) diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index 62a3a91b0b..edddcefbe1 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -1018,7 +1018,7 @@ pub fn decode_wal_record( ); let blk_img_is_compressed = - postgres_ffi::bkpimage_is_compressed(blk.bimg_info, pg_version)?; + postgres_ffi::bkpimage_is_compressed(blk.bimg_info, pg_version); if blk_img_is_compressed { debug!("compressed block image , pg_version = {}", pg_version); From a7028d92b7560228e6cf63a3cf102c076bad3aa6 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 14 Aug 2024 13:35:29 +0100 Subject: [PATCH 1418/1571] proxy: start of jwk cache (#8690) basic JWT implementation that caches JWKs and verifies signatures. this code is currently not reachable from proxy, I just wanted to get something merged in. --- Cargo.lock | 273 ++++++++++++++++- deny.toml | 5 +- proxy/Cargo.toml | 11 +- proxy/src/auth/backend.rs | 1 + proxy/src/auth/backend/jwt.rs | 556 ++++++++++++++++++++++++++++++++++ proxy/src/http.rs | 33 ++ workspace_hack/Cargo.toml | 12 +- 7 files changed, 872 insertions(+), 19 deletions(-) create mode 100644 proxy/src/auth/backend/jwt.rs diff --git a/Cargo.lock b/Cargo.lock index 031fae0f37..dee15b6aa7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -484,7 +484,7 @@ dependencies = [ "http 0.2.9", "http 1.1.0", "once_cell", - "p256", + "p256 0.11.1", "percent-encoding", "ring 0.17.6", "sha2", @@ -848,6 +848,12 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "349a06037c7bf932dd7e7d1f653678b2038b9ad46a74102f1fc7bd7872678cce" +[[package]] +name = "base16ct" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c7f02d4ea65f2c1853089ffd8d2787bdbc63de2f0d29dedbcf8ccdfa0ccd4cf" + [[package]] name = "base64" version = "0.13.1" @@ -971,9 +977,9 @@ checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1" [[package]] name = "bytemuck" -version = "1.16.0" +version = "1.16.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78834c15cb5d5efe3452d58b1e8ba890dd62d21907f867f383358198e56ebca5" +checksum = "102087e286b4677862ea56cf8fc58bb2cdfa8725c40ffb80fe3a008eb7f2fc83" [[package]] name = "byteorder" @@ -1526,8 +1532,10 @@ version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0dc92fb57ca44df6db8059111ab3af99a63d5d0f8375d9972e319a379c6bab76" dependencies = [ + "generic-array", "rand_core 0.6.4", "subtle", + "zeroize", ] [[package]] @@ -1621,6 +1629,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fffa369a668c8af7dbf8b5e56c9f744fbd399949ed171606040001947de40b1c" dependencies = [ "const-oid", + "pem-rfc7468", "zeroize", ] @@ -1720,6 +1729,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ "block-buffer", + "const-oid", "crypto-common", "subtle", ] @@ -1771,11 +1781,25 @@ source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "413301934810f597c1d19ca71c8710e99a3f1ba28a0d2ebc01551a2daeea3c5c" dependencies = [ "der 0.6.1", - "elliptic-curve", - "rfc6979", + "elliptic-curve 0.12.3", + "rfc6979 0.3.1", "signature 1.6.4", ] +[[package]] +name = "ecdsa" +version = "0.16.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee27f32b5c5292967d2d4a9d7f1e0b0aed2c15daded5a60300e4abb9d8020bca" +dependencies = [ + "der 0.7.8", + "digest", + "elliptic-curve 0.13.8", + "rfc6979 0.4.0", + "signature 2.2.0", + "spki 0.7.3", +] + [[package]] name = "either" version = "1.8.1" @@ -1788,16 +1812,36 @@ version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e7bb888ab5300a19b8e5bceef25ac745ad065f3c9f7efc6de1b91958110891d3" dependencies = [ - "base16ct", + "base16ct 0.1.1", "crypto-bigint 0.4.9", "der 0.6.1", "digest", - "ff", + "ff 0.12.1", "generic-array", - "group", - "pkcs8", + "group 0.12.1", + "pkcs8 0.9.0", "rand_core 0.6.4", - "sec1", + "sec1 0.3.0", + "subtle", + "zeroize", +] + +[[package]] +name = "elliptic-curve" +version = "0.13.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5e6043086bf7973472e0c7dff2142ea0b680d30e18d9cc40f267efbf222bd47" +dependencies = [ + "base16ct 0.2.0", + "crypto-bigint 0.5.5", + "digest", + "ff 0.13.0", + "generic-array", + "group 0.13.0", + "pem-rfc7468", + "pkcs8 0.10.2", + "rand_core 0.6.4", + "sec1 0.7.3", "subtle", "zeroize", ] @@ -1951,6 +1995,16 @@ dependencies = [ "subtle", ] +[[package]] +name = "ff" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ded41244b729663b1e574f1b4fb731469f69f79c17667b5d776b16cda0479449" +dependencies = [ + "rand_core 0.6.4", + "subtle", +] + [[package]] name = "filetime" version = "0.2.22" @@ -2148,6 +2202,7 @@ checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" dependencies = [ "typenum", "version_check", + "zeroize", ] [[package]] @@ -2214,7 +2269,18 @@ version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5dfbfb3a6cfbd390d5c9564ab283a0349b9b9fcd46a706c1eb10e0db70bfbac7" dependencies = [ - "ff", + "ff 0.12.1", + "rand_core 0.6.4", + "subtle", +] + +[[package]] +name = "group" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0f9ef7462f7c099f518d754361858f86d8a07af53ba9af0fe635bbccb151a63" +dependencies = [ + "ff 0.13.0", "rand_core 0.6.4", "subtle", ] @@ -2776,6 +2842,42 @@ dependencies = [ "libc", ] +[[package]] +name = "jose-b64" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bec69375368709666b21c76965ce67549f2d2db7605f1f8707d17c9656801b56" +dependencies = [ + "base64ct", + "serde", + "subtle", + "zeroize", +] + +[[package]] +name = "jose-jwa" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ab78e053fe886a351d67cf0d194c000f9d0dcb92906eb34d853d7e758a4b3a7" +dependencies = [ + "serde", +] + +[[package]] +name = "jose-jwk" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "280fa263807fe0782ecb6f2baadc28dffc04e00558a58e33bfdb801d11fd58e7" +dependencies = [ + "jose-b64", + "jose-jwa", + "p256 0.13.2", + "p384", + "rsa", + "serde", + "zeroize", +] + [[package]] name = "js-sys" version = "0.3.69" @@ -2835,6 +2937,9 @@ name = "lazy_static" version = "1.4.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +dependencies = [ + "spin 0.5.2", +] [[package]] name = "lazycell" @@ -3204,6 +3309,23 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-bigint-dig" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc84195820f291c7697304f3cbdadd1cb7199c0efc917ff5eafd71225c136151" +dependencies = [ + "byteorder", + "lazy_static", + "libm", + "num-integer", + "num-iter", + "num-traits", + "rand 0.8.5", + "smallvec", + "zeroize", +] + [[package]] name = "num-complex" version = "0.4.4" @@ -3481,11 +3603,33 @@ version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "51f44edd08f51e2ade572f141051021c5af22677e42b7dd28a88155151c33594" dependencies = [ - "ecdsa", - "elliptic-curve", + "ecdsa 0.14.8", + "elliptic-curve 0.12.3", "sha2", ] +[[package]] +name = "p256" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c9863ad85fa8f4460f9c48cb909d38a0d689dba1f6f6988a5e3e0d31071bcd4b" +dependencies = [ + "ecdsa 0.16.9", + "elliptic-curve 0.13.8", + "primeorder", + "sha2", +] + +[[package]] +name = "p384" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70786f51bcc69f6a4c0360e063a4cac5419ef7c5cd5b3c99ad70f3be5ba79209" +dependencies = [ + "elliptic-curve 0.13.8", + "primeorder", +] + [[package]] name = "pagebench" version = "0.1.0" @@ -3847,6 +3991,15 @@ dependencies = [ "serde", ] +[[package]] +name = "pem-rfc7468" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88b39c9bfcfc231068454382784bb460aae594343fb030d46e9f50a645418412" +dependencies = [ + "base64ct", +] + [[package]] name = "percent-encoding" version = "2.2.0" @@ -3913,6 +4066,17 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "pkcs1" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8ffb9f10fa047879315e6625af03c164b16962a5368d724ed16323b68ace47f" +dependencies = [ + "der 0.7.8", + "pkcs8 0.10.2", + "spki 0.7.3", +] + [[package]] name = "pkcs8" version = "0.9.0" @@ -3923,6 +4087,16 @@ dependencies = [ "spki 0.6.0", ] +[[package]] +name = "pkcs8" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7" +dependencies = [ + "der 0.7.8", + "spki 0.7.3", +] + [[package]] name = "pkg-config" version = "0.3.27" @@ -4116,6 +4290,15 @@ dependencies = [ "syn 2.0.52", ] +[[package]] +name = "primeorder" +version = "0.13.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "353e1ca18966c16d9deb1c69278edbc5f194139612772bd9537af60ac231e1e6" +dependencies = [ + "elliptic-curve 0.13.8", +] + [[package]] name = "proc-macro-hack" version = "0.5.20+deprecated" @@ -4233,6 +4416,7 @@ version = "0.1.0" dependencies = [ "ahash", "anyhow", + "arc-swap", "async-compression", "async-trait", "atomic-take", @@ -4250,6 +4434,7 @@ dependencies = [ "consumption_metrics", "crossbeam-deque", "dashmap", + "ecdsa 0.16.9", "env_logger", "fallible-iterator", "framed-websockets", @@ -4270,12 +4455,15 @@ dependencies = [ "indexmap 2.0.1", "ipnet", "itertools 0.10.5", + "jose-jwa", + "jose-jwk", "lasso", "md5", 
"measured", "metrics", "once_cell", "opentelemetry", + "p256 0.13.2", "parking_lot 0.12.1", "parquet", "parquet_derive", @@ -4296,6 +4484,7 @@ dependencies = [ "reqwest-retry", "reqwest-tracing", "routerify", + "rsa", "rstest", "rustc-hash", "rustls 0.22.4", @@ -4305,6 +4494,7 @@ dependencies = [ "serde", "serde_json", "sha2", + "signature 2.2.0", "smallvec", "smol_str", "socket2 0.5.5", @@ -4807,6 +4997,16 @@ dependencies = [ "zeroize", ] +[[package]] +name = "rfc6979" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dd2a808d456c4a54e300a23e9f5a67e122c3024119acbfd73e3bf664491cb2" +dependencies = [ + "hmac", + "subtle", +] + [[package]] name = "ring" version = "0.16.20" @@ -4867,6 +5067,26 @@ dependencies = [ "archery", ] +[[package]] +name = "rsa" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d0e5124fcb30e76a7e79bfee683a2746db83784b86289f6251b54b7950a0dfc" +dependencies = [ + "const-oid", + "digest", + "num-bigint-dig", + "num-integer", + "num-traits", + "pkcs1", + "pkcs8 0.10.2", + "rand_core 0.6.4", + "signature 2.2.0", + "spki 0.7.3", + "subtle", + "zeroize", +] + [[package]] name = "rstest" version = "0.18.2" @@ -5195,10 +5415,24 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3be24c1842290c45df0a7bf069e0c268a747ad05a192f2fd7dcfdbc1cba40928" dependencies = [ - "base16ct", + "base16ct 0.1.1", "der 0.6.1", "generic-array", - "pkcs8", + "pkcs8 0.9.0", + "subtle", + "zeroize", +] + +[[package]] +name = "sec1" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3e97a565f76233a6003f9f5c54be1d9c5bdfa3eccfb189469f11ec4901c47dc" +dependencies = [ + "base16ct 0.2.0", + "der 0.7.8", + "generic-array", + "pkcs8 0.10.2", "subtle", "zeroize", ] @@ -5545,6 +5779,7 @@ version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de" dependencies = [ + "digest", "rand_core 0.6.4", ] @@ -7379,13 +7614,17 @@ dependencies = [ "clap", "clap_builder", "crossbeam-utils", + "crypto-bigint 0.5.5", + "der 0.7.8", "deranged", + "digest", "either", "fail", "futures-channel", "futures-executor", "futures-io", "futures-util", + "generic-array", "getrandom 0.2.11", "hashbrown 0.14.5", "hex", @@ -7393,6 +7632,7 @@ dependencies = [ "hyper 0.14.26", "indexmap 1.9.3", "itertools 0.10.5", + "lazy_static", "libc", "log", "memchr", @@ -7416,7 +7656,9 @@ dependencies = [ "serde", "serde_json", "sha2", + "signature 2.2.0", "smallvec", + "spki 0.7.3", "subtle", "syn 1.0.109", "syn 2.0.52", @@ -7527,6 +7769,7 @@ version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "525b4ec142c6b68a2d10f01f7bbf6755599ca3f81ea53b8431b7dd348f5fdb2d" dependencies = [ + "serde", "zeroize_derive", ] diff --git a/deny.toml b/deny.toml index dc985138e6..327ac58db7 100644 --- a/deny.toml +++ b/deny.toml @@ -22,7 +22,10 @@ feature-depth = 1 [advisories] db-urls = ["https://github.com/rustsec/advisory-db"] yanked = "warn" -ignore = [] + +[[advisories.ignore]] +id = "RUSTSEC-2023-0071" +reason = "the marvin attack only affects private key decryption, not public key signature verification" # This section is considered when running `cargo deny check licenses` # More documentation for the licenses section can be found here: diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index b316c53034..21d92abb20 100644 --- 
a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -11,6 +11,7 @@ testing = [] [dependencies] ahash.workspace = true anyhow.workspace = true +arc-swap.workspace = true async-compression.workspace = true async-trait.workspace = true atomic-take.workspace = true @@ -73,7 +74,7 @@ rustls.workspace = true scopeguard.workspace = true serde.workspace = true serde_json.workspace = true -sha2 = { workspace = true, features = ["asm"] } +sha2 = { workspace = true, features = ["asm", "oid"] } smol_str.workspace = true smallvec.workspace = true socket2.workspace = true @@ -103,6 +104,14 @@ x509-parser.workspace = true postgres-protocol.workspace = true redis.workspace = true +# jwt stuff +jose-jwa = "0.1.2" +jose-jwk = { version = "0.1.2", features = ["p256", "p384", "rsa"] } +signature = "2" +ecdsa = "0.16" +p256 = "0.13" +rsa = "0.9" + workspace_hack.workspace = true [dev-dependencies] diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 90dea01bf3..c6a0b2af5a 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -1,5 +1,6 @@ mod classic; mod hacks; +pub mod jwt; mod link; use std::net::IpAddr; diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs new file mode 100644 index 0000000000..0c2ca8fb97 --- /dev/null +++ b/proxy/src/auth/backend/jwt.rs @@ -0,0 +1,556 @@ +use std::{future::Future, sync::Arc, time::Duration}; + +use anyhow::{bail, ensure, Context}; +use arc_swap::ArcSwapOption; +use dashmap::DashMap; +use jose_jwk::crypto::KeyInfo; +use signature::Verifier; +use tokio::time::Instant; + +use crate::{http::parse_json_body_with_limit, intern::EndpointIdInt}; + +// TODO(conrad): make these configurable. +const MIN_RENEW: Duration = Duration::from_secs(30); +const AUTO_RENEW: Duration = Duration::from_secs(300); +const MAX_RENEW: Duration = Duration::from_secs(3600); +const MAX_JWK_BODY_SIZE: usize = 64 * 1024; + +/// How to get the JWT auth rules +pub trait FetchAuthRules: Clone + Send + Sync + 'static { + fn fetch_auth_rules(&self) -> impl Future> + Send; +} + +#[derive(Clone)] +struct FetchAuthRulesFromCplane { + #[allow(dead_code)] + endpoint: EndpointIdInt, +} + +impl FetchAuthRules for FetchAuthRulesFromCplane { + async fn fetch_auth_rules(&self) -> anyhow::Result { + Err(anyhow::anyhow!("not yet implemented")) + } +} + +pub struct AuthRules { + jwks_urls: Vec, +} + +#[derive(Default)] +pub struct JwkCache { + client: reqwest::Client, + + map: DashMap>, +} + +pub struct JwkCacheEntryLock { + cached: ArcSwapOption, + lookup: tokio::sync::Semaphore, +} + +impl Default for JwkCacheEntryLock { + fn default() -> Self { + JwkCacheEntryLock { + cached: ArcSwapOption::empty(), + lookup: tokio::sync::Semaphore::new(1), + } + } +} + +pub struct JwkCacheEntry { + /// Should refetch at least every hour to verify when old keys have been removed. + /// Should refetch when new key IDs are seen only every 5 minutes or so + last_retrieved: Instant, + + /// cplane will return multiple JWKs urls that we need to scrape. + key_sets: ahash::HashMap, +} + +impl JwkCacheEntryLock { + async fn acquire_permit<'a>(self: &'a Arc) -> JwkRenewalPermit<'a> { + JwkRenewalPermit::acquire_permit(self).await + } + + fn try_acquire_permit<'a>(self: &'a Arc) -> Option> { + JwkRenewalPermit::try_acquire_permit(self) + } + + async fn renew_jwks( + &self, + _permit: JwkRenewalPermit<'_>, + client: &reqwest::Client, + auth_rules: &F, + ) -> anyhow::Result> { + // double check that no one beat us to updating the cache. 
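+        // The renewal permit only guarantees that a single task refreshes at a time;
+        // another task may have completed a refresh between our cache miss and the
+        // permit acquisition, in which case the recently cached entry (less than five
+        // minutes old) is returned here instead of re-fetching the JWKs.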
+ let now = Instant::now(); + let guard = self.cached.load_full(); + if let Some(cached) = guard { + let last_update = now.duration_since(cached.last_retrieved); + if last_update < Duration::from_secs(300) { + return Ok(cached); + } + } + + let rules = auth_rules.fetch_auth_rules().await?; + let mut key_sets = ahash::HashMap::with_capacity_and_hasher( + rules.jwks_urls.len(), + ahash::RandomState::new(), + ); + // TODO(conrad): run concurrently + // TODO(conrad): strip the JWKs urls (should be checked by cplane as well - cloud#16284) + for url in rules.jwks_urls { + let req = client.get(url.clone()); + // TODO(conrad): eventually switch to using reqwest_middleware/`new_client_with_timeout`. + match req.send().await.and_then(|r| r.error_for_status()) { + // todo: should we re-insert JWKs if we want to keep this JWKs URL? + // I expect these failures would be quite sparse. + Err(e) => tracing::warn!(?url, error=?e, "could not fetch JWKs"), + Ok(r) => { + let resp: http::Response = r.into(); + match parse_json_body_with_limit::( + resp.into_body(), + MAX_JWK_BODY_SIZE, + ) + .await + { + Err(e) => tracing::warn!(?url, error=?e, "could not decode JWKs"), + Ok(jwks) => { + key_sets.insert(url, jwks); + } + } + } + } + } + + let entry = Arc::new(JwkCacheEntry { + last_retrieved: now, + key_sets, + }); + self.cached.swap(Some(Arc::clone(&entry))); + + Ok(entry) + } + + async fn get_or_update_jwk_cache( + self: &Arc, + client: &reqwest::Client, + fetch: &F, + ) -> Result, anyhow::Error> { + let now = Instant::now(); + let guard = self.cached.load_full(); + + // if we have no cached JWKs, try and get some + let Some(cached) = guard else { + let permit = self.acquire_permit().await; + return self.renew_jwks(permit, client, fetch).await; + }; + + let last_update = now.duration_since(cached.last_retrieved); + + // check if the cached JWKs need updating. + if last_update > MAX_RENEW { + let permit = self.acquire_permit().await; + + // it's been too long since we checked the keys. wait for them to update. + return self.renew_jwks(permit, client, fetch).await; + } + + // every 5 minutes we should spawn a job to eagerly update the token. + if last_update > AUTO_RENEW { + if let Some(permit) = self.try_acquire_permit() { + tracing::debug!("JWKs should be renewed. Renewal permit acquired"); + let permit = permit.into_owned(); + let entry = self.clone(); + let client = client.clone(); + let fetch = fetch.clone(); + tokio::spawn(async move { + if let Err(e) = entry.renew_jwks(permit, &client, &fetch).await { + tracing::warn!(error=?e, "could not fetch JWKs in background job"); + } + }); + } else { + tracing::debug!("JWKs should be renewed. Renewal permit already taken, skipping"); + } + } + + Ok(cached) + } + + async fn check_jwt( + self: &Arc, + jwt: String, + client: &reqwest::Client, + fetch: &F, + ) -> Result<(), anyhow::Error> { + // JWT compact form is defined to be + // || . || || . || + // where Signature = alg( || . 
|| ); + + let (header_payload, signature) = jwt + .rsplit_once(".") + .context("Provided authentication token is not a valid JWT encoding")?; + let (header, _payload) = header_payload + .split_once(".") + .context("Provided authentication token is not a valid JWT encoding")?; + + let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD) + .context("Provided authentication token is not a valid JWT encoding")?; + let header = serde_json::from_slice::(&header) + .context("Provided authentication token is not a valid JWT encoding")?; + + let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD) + .context("Provided authentication token is not a valid JWT encoding")?; + + ensure!(header.typ == "JWT"); + let kid = header.kid.context("missing key id")?; + + let mut guard = self.get_or_update_jwk_cache(client, fetch).await?; + + // get the key from the JWKs if possible. If not, wait for the keys to update. + let jwk = loop { + let jwk = guard + .key_sets + .values() + .flat_map(|jwks| &jwks.keys) + .find(|jwk| jwk.prm.kid.as_deref() == Some(kid)); + + match jwk { + Some(jwk) => break jwk, + None if guard.last_retrieved.elapsed() > MIN_RENEW => { + let permit = self.acquire_permit().await; + guard = self.renew_jwks(permit, client, fetch).await?; + } + _ => { + bail!("jwk not found"); + } + } + }; + + ensure!( + jwk.is_supported(&header.alg), + "signature algorithm not supported" + ); + + match &jwk.key { + jose_jwk::Key::Ec(key) => { + verify_ec_signature(header_payload.as_bytes(), &sig, key)?; + } + jose_jwk::Key::Rsa(key) => { + verify_rsa_signature(header_payload.as_bytes(), &sig, key, &jwk.prm.alg)?; + } + key => bail!("unsupported key type {key:?}"), + }; + + // TODO(conrad): verify iss, exp, nbf, etc... + + Ok(()) + } +} + +impl JwkCache { + pub async fn check_jwt( + &self, + endpoint: EndpointIdInt, + jwt: String, + ) -> Result<(), anyhow::Error> { + // try with just a read lock first + let entry = self.map.get(&endpoint).as_deref().map(Arc::clone); + let entry = match entry { + Some(entry) => entry, + None => { + // acquire a write lock after to insert. 
+ let entry = self.map.entry(endpoint).or_default(); + Arc::clone(&*entry) + } + }; + + let fetch = FetchAuthRulesFromCplane { endpoint }; + entry.check_jwt(jwt, &self.client, &fetch).await + } +} + +fn verify_ec_signature(data: &[u8], sig: &[u8], key: &jose_jwk::Ec) -> anyhow::Result<()> { + use ecdsa::Signature; + use signature::Verifier; + + match key.crv { + jose_jwk::EcCurves::P256 => { + let pk = + p256::PublicKey::try_from(key).map_err(|_| anyhow::anyhow!("invalid P256 key"))?; + let key = p256::ecdsa::VerifyingKey::from(&pk); + let sig = Signature::from_slice(sig)?; + key.verify(data, &sig)?; + } + key => bail!("unsupported ec key type {key:?}"), + } + + Ok(()) +} + +fn verify_rsa_signature( + data: &[u8], + sig: &[u8], + key: &jose_jwk::Rsa, + alg: &Option, +) -> anyhow::Result<()> { + use jose_jwa::{Algorithm, Signing}; + use rsa::{ + pkcs1v15::{Signature, VerifyingKey}, + RsaPublicKey, + }; + + let key = RsaPublicKey::try_from(key).map_err(|_| anyhow::anyhow!("invalid RSA key"))?; + + match alg { + Some(Algorithm::Signing(Signing::Rs256)) => { + let key = VerifyingKey::::new(key); + let sig = Signature::try_from(sig)?; + key.verify(data, &sig)?; + } + _ => bail!("invalid RSA signing algorithm"), + }; + + Ok(()) +} + +/// +#[derive(serde::Deserialize, serde::Serialize)] +struct JWTHeader<'a> { + /// must be "JWT" + typ: &'a str, + /// must be a supported alg + alg: jose_jwa::Algorithm, + /// key id, must be provided for our usecase + kid: Option<&'a str>, +} + +struct JwkRenewalPermit<'a> { + inner: Option>, +} + +enum JwkRenewalPermitInner<'a> { + Owned(Arc), + Borrowed(&'a Arc), +} + +impl JwkRenewalPermit<'_> { + fn into_owned(mut self) -> JwkRenewalPermit<'static> { + JwkRenewalPermit { + inner: self.inner.take().map(JwkRenewalPermitInner::into_owned), + } + } + + async fn acquire_permit(from: &Arc) -> JwkRenewalPermit { + match from.lookup.acquire().await { + Ok(permit) => { + permit.forget(); + JwkRenewalPermit { + inner: Some(JwkRenewalPermitInner::Borrowed(from)), + } + } + Err(_) => panic!("semaphore should not be closed"), + } + } + + fn try_acquire_permit(from: &Arc) -> Option { + match from.lookup.try_acquire() { + Ok(permit) => { + permit.forget(); + Some(JwkRenewalPermit { + inner: Some(JwkRenewalPermitInner::Borrowed(from)), + }) + } + Err(tokio::sync::TryAcquireError::NoPermits) => None, + Err(tokio::sync::TryAcquireError::Closed) => panic!("semaphore should not be closed"), + } + } +} + +impl JwkRenewalPermitInner<'_> { + fn into_owned(self) -> JwkRenewalPermitInner<'static> { + match self { + JwkRenewalPermitInner::Owned(p) => JwkRenewalPermitInner::Owned(p), + JwkRenewalPermitInner::Borrowed(p) => JwkRenewalPermitInner::Owned(Arc::clone(p)), + } + } +} + +impl Drop for JwkRenewalPermit<'_> { + fn drop(&mut self) { + let entry = match &self.inner { + None => return, + Some(JwkRenewalPermitInner::Owned(p)) => p, + Some(JwkRenewalPermitInner::Borrowed(p)) => *p, + }; + entry.lookup.add_permits(1); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use std::{future::IntoFuture, net::SocketAddr, time::SystemTime}; + + use base64::URL_SAFE_NO_PAD; + use bytes::Bytes; + use http::Response; + use http_body_util::Full; + use hyper1::service::service_fn; + use hyper_util::rt::TokioIo; + use rand::rngs::OsRng; + use signature::Signer; + use tokio::net::TcpListener; + + fn new_ec_jwk(kid: String) -> (p256::SecretKey, jose_jwk::Jwk) { + let sk = p256::SecretKey::random(&mut OsRng); + let pk = sk.public_key().into(); + let jwk = jose_jwk::Jwk { + key: 
jose_jwk::Key::Ec(pk), + prm: jose_jwk::Parameters { + kid: Some(kid), + alg: Some(jose_jwa::Algorithm::Signing(jose_jwa::Signing::Es256)), + ..Default::default() + }, + }; + (sk, jwk) + } + + fn new_rsa_jwk(kid: String) -> (rsa::RsaPrivateKey, jose_jwk::Jwk) { + let sk = rsa::RsaPrivateKey::new(&mut OsRng, 2048).unwrap(); + let pk = sk.to_public_key().into(); + let jwk = jose_jwk::Jwk { + key: jose_jwk::Key::Rsa(pk), + prm: jose_jwk::Parameters { + kid: Some(kid), + alg: Some(jose_jwa::Algorithm::Signing(jose_jwa::Signing::Rs256)), + ..Default::default() + }, + }; + (sk, jwk) + } + + fn build_jwt_payload(kid: String, sig: jose_jwa::Signing) -> String { + let header = JWTHeader { + typ: "JWT", + alg: jose_jwa::Algorithm::Signing(sig), + kid: Some(&kid), + }; + let body = typed_json::json! {{ + "exp": SystemTime::now().duration_since(SystemTime::UNIX_EPOCH).unwrap().as_secs() + 3600, + }}; + + let header = + base64::encode_config(serde_json::to_string(&header).unwrap(), URL_SAFE_NO_PAD); + let body = base64::encode_config(body.to_string(), URL_SAFE_NO_PAD); + + format!("{header}.{body}") + } + + fn new_ec_jwt(kid: String, key: p256::SecretKey) -> String { + use p256::ecdsa::{Signature, SigningKey}; + + let payload = build_jwt_payload(kid, jose_jwa::Signing::Es256); + let sig: Signature = SigningKey::from(key).sign(payload.as_bytes()); + let sig = base64::encode_config(sig.to_bytes(), URL_SAFE_NO_PAD); + + format!("{payload}.{sig}") + } + + fn new_rsa_jwt(kid: String, key: rsa::RsaPrivateKey) -> String { + use rsa::pkcs1v15::SigningKey; + use rsa::signature::SignatureEncoding; + + let payload = build_jwt_payload(kid, jose_jwa::Signing::Rs256); + let sig = SigningKey::::new(key).sign(payload.as_bytes()); + let sig = base64::encode_config(sig.to_bytes(), URL_SAFE_NO_PAD); + + format!("{payload}.{sig}") + } + + #[tokio::test] + async fn renew() { + let (rs1, jwk1) = new_rsa_jwk("1".into()); + let (rs2, jwk2) = new_rsa_jwk("2".into()); + let (ec1, jwk3) = new_ec_jwk("3".into()); + let (ec2, jwk4) = new_ec_jwk("4".into()); + + let jwt1 = new_rsa_jwt("1".into(), rs1); + let jwt2 = new_rsa_jwt("2".into(), rs2); + let jwt3 = new_ec_jwt("3".into(), ec1); + let jwt4 = new_ec_jwt("4".into(), ec2); + + let foo_jwks = jose_jwk::JwkSet { + keys: vec![jwk1, jwk3], + }; + let bar_jwks = jose_jwk::JwkSet { + keys: vec![jwk2, jwk4], + }; + + let service = service_fn(move |req| { + let foo_jwks = foo_jwks.clone(); + let bar_jwks = bar_jwks.clone(); + async move { + let jwks = match req.uri().path() { + "/foo" => &foo_jwks, + "/bar" => &bar_jwks, + _ => { + return Response::builder() + .status(404) + .body(Full::new(Bytes::new())); + } + }; + let body = serde_json::to_vec(jwks).unwrap(); + Response::builder() + .status(200) + .body(Full::new(Bytes::from(body))) + } + }); + + let listener = TcpListener::bind("0.0.0.0:0").await.unwrap(); + let server = hyper1::server::conn::http1::Builder::new(); + let addr = listener.local_addr().unwrap(); + tokio::spawn(async move { + loop { + let (s, _) = listener.accept().await.unwrap(); + let serve = server.serve_connection(TokioIo::new(s), service.clone()); + tokio::spawn(serve.into_future()); + } + }); + + let client = reqwest::Client::new(); + + #[derive(Clone)] + struct Fetch(SocketAddr); + + impl FetchAuthRules for Fetch { + async fn fetch_auth_rules(&self) -> anyhow::Result { + Ok(AuthRules { + jwks_urls: vec![ + format!("http://{}/foo", self.0).parse().unwrap(), + format!("http://{}/bar", self.0).parse().unwrap(), + ], + }) + } + } + + let jwk_cache = 
Arc::new(JwkCacheEntryLock::default()); + + jwk_cache + .check_jwt(jwt1, &client, &Fetch(addr)) + .await + .unwrap(); + jwk_cache + .check_jwt(jwt2, &client, &Fetch(addr)) + .await + .unwrap(); + jwk_cache + .check_jwt(jwt3, &client, &Fetch(addr)) + .await + .unwrap(); + jwk_cache + .check_jwt(jwt4, &client, &Fetch(addr)) + .await + .unwrap(); + } +} diff --git a/proxy/src/http.rs b/proxy/src/http.rs index dd7164181d..1f1dd8c415 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -6,6 +6,12 @@ pub mod health_server; use std::time::Duration; +use anyhow::bail; +use bytes::Bytes; +use http_body_util::BodyExt; +use hyper1::body::Body; +use serde::de::DeserializeOwned; + pub use reqwest::{Request, Response, StatusCode}; pub use reqwest_middleware::{ClientWithMiddleware, Error}; pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; @@ -96,6 +102,33 @@ impl Endpoint { } } +pub async fn parse_json_body_with_limit( + mut b: impl Body + Unpin, + limit: usize, +) -> anyhow::Result { + // We could use `b.limited().collect().await.to_bytes()` here + // but this ends up being slightly more efficient as far as I can tell. + + // check the lower bound of the size hint. + // in reqwest, this value is influenced by the Content-Length header. + let lower_bound = match usize::try_from(b.size_hint().lower()) { + Ok(bound) if bound <= limit => bound, + _ => bail!("Content length exceeds limit of {limit} bytes"), + }; + let mut bytes = Vec::with_capacity(lower_bound); + + while let Some(frame) = b.frame().await.transpose()? { + if let Ok(data) = frame.into_data() { + if bytes.len() + data.len() > limit { + bail!("Content length exceeds limit of {limit} bytes") + } + bytes.extend_from_slice(&data); + } + } + + Ok(serde_json::from_slice::(&bytes)?) 
+} + #[cfg(test)] mod tests { use super::*; diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 832fe06bf6..2d9b372654 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -30,13 +30,17 @@ chrono = { version = "0.4", default-features = false, features = ["clock", "serd clap = { version = "4", features = ["derive", "string"] } clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] } crossbeam-utils = { version = "0.8" } +crypto-bigint = { version = "0.5", features = ["generic-array", "zeroize"] } +der = { version = "0.7", default-features = false, features = ["oid", "pem", "std"] } deranged = { version = "0.3", default-features = false, features = ["powerfmt", "serde", "std"] } +digest = { version = "0.10", features = ["mac", "oid", "std"] } either = { version = "1" } fail = { version = "0.5", default-features = false, features = ["failpoints"] } futures-channel = { version = "0.3", features = ["sink"] } futures-executor = { version = "0.3" } futures-io = { version = "0.3" } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } +generic-array = { version = "0.14", default-features = false, features = ["more_lengths", "zeroize"] } getrandom = { version = "0.2", default-features = false, features = ["std"] } hashbrown = { version = "0.14", features = ["raw"] } hex = { version = "0.4", features = ["serde"] } @@ -44,6 +48,7 @@ hmac = { version = "0.12", default-features = false, features = ["reset"] } hyper = { version = "0.14", features = ["full"] } indexmap = { version = "1", default-features = false, features = ["std"] } itertools = { version = "0.10" } +lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] } libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } @@ -64,8 +69,10 @@ rustls = { version = "0.21", features = ["dangerous_configuration"] } scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } serde_json = { version = "1", features = ["raw_value"] } -sha2 = { version = "0.10", features = ["asm"] } +sha2 = { version = "0.10", features = ["asm", "oid"] } +signature = { version = "2", default-features = false, features = ["digest", "rand_core", "std"] } smallvec = { version = "1", default-features = false, features = ["const_new", "write"] } +spki = { version = "0.7", default-features = false, features = ["pem", "std"] } subtle = { version = "2" } sync_wrapper = { version = "0.1", default-features = false, features = ["futures"] } tikv-jemalloc-sys = { version = "0.5" } @@ -81,7 +88,7 @@ tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } url = { version = "2", features = ["serde"] } uuid = { version = "1", features = ["serde", "v4", "v7"] } -zeroize = { version = "1", features = ["derive"] } +zeroize = { version = "1", features = ["derive", "serde"] } zstd = { version = "0.13" } zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] } zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] } @@ -97,6 +104,7 @@ getrandom = { version = "0.2", default-features = false, features = ["std"] } hashbrown = { version = "0.14", features = ["raw"] } indexmap = { version = "1", default-features = false, features = ["std"] } itertools = { version = "0.10" } 
+lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] } libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } From abb53ba36d0cc5da7ead626c3de91d41a255fc68 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 14 Aug 2024 13:37:03 +0100 Subject: [PATCH 1419/1571] storcon_cli: don't clobber heatmap interval when setting eviction (#8722) ## Problem This command is kind of a hack, used when we're migrating large tenants and want to get their resident size down. It sets the tenant config to a fixed value, which omitted heatmap_period, so caused secondaries to get out of date. ## Summary of changes - Set heatmap period to the same 300s default that we use elsewhere when updating eviction settings This is not as elegant as some general purpose partial modification of the config, but it practically makes the command safer to use. --- control_plane/storcon_cli/src/main.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 5c1add070a..e27491c1c8 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -622,6 +622,7 @@ async fn main() -> anyhow::Result<()> { threshold: threshold.into(), }, )), + heatmap_period: Some("300s".to_string()), ..Default::default() }, }) From 36c1719a07a8480f9681dccc6ec6f4b192e3ebbe Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 14 Aug 2024 13:38:25 +0100 Subject: [PATCH 1420/1571] CI(build-neon): fix accidental neon rebuild on `cargo test` (#8721) ## Problem During `Run rust tests` step (for debug builds), we accidentally rebuild neon twice (by `cargo test --doc` and by `cargo nextest run`). It happens because we don't set `cov_prefix` for the `cargo test --doc` command, which triggers rebuilding with different build flags, and one more rebuild by `cargo nextest run`. ## Summary of changes - Set `cov_prefix` for `cargo test --doc` to prevent unneeded rebuilds --- .github/workflows/_build-and-test-locally.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index a0ed169024..5e5908860e 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -208,7 +208,7 @@ jobs: export LD_LIBRARY_PATH #nextest does not yet support running doctests - cargo test --doc $CARGO_FLAGS $CARGO_FEATURES + ${cov_prefix} cargo test --doc $CARGO_FLAGS $CARGO_FEATURES for io_engine in std-fs tokio-epoll-uring ; do NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES From 60fc1e8cc8a906e8b37ee795fe5fca666703fbec Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 14 Aug 2024 16:48:15 +0300 Subject: [PATCH 1421/1571] chore: even more responsive compaction cancellation (#8725) Some benchmarks and tests might still fail because of #8655 (tracked in #8708) because we are not fast enough to shut down ([one evidence]). Partially this is explained by the current validation mode of streaming k-merge, but otherwise because that is where we use a lot of time in compaction. Outside of L0 => L1 compaction, the image layer generation is already guarded by vectored reads doing cancellation checks. 
32768 is a wild guess based on looking how many keys we put in each layer in a bench (1-2 million), but I assume it will be good enough divisor. Doing checks more often will start showing up as contention which we cannot currently measure. Doing checks less often might be reasonable. [one evidence]: https://neon-github-public-dev.s3.amazonaws.com/reports/main/10384136483/index.html#suites/9681106e61a1222669b9d22ab136d07b/96e6d53af234924/ Earlier PR: #8706. --- pageserver/src/tenant/timeline.rs | 7 ++++++- pageserver/src/tenant/timeline/compaction.rs | 13 +++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d437724673..b4d908b130 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4540,7 +4540,12 @@ impl Timeline { new_images: &[ResidentLayer], layers_to_remove: &[Layer], ) -> Result<(), CompactionError> { - let mut guard = self.layers.write().await; + let mut guard = tokio::select! { + guard = self.layers.write() => guard, + _ = self.cancel.cancelled() => { + return Err(CompactionError::ShuttingDown); + } + }; let mut duplicated_layers = HashSet::new(); diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index e24459e7b9..7370ec1386 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1048,11 +1048,22 @@ impl Timeline { let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key let mut next_hole = 0; // index of next hole in holes vector + let mut keys = 0; + while let Some((key, lsn, value)) = all_values_iter .next(ctx) .await .map_err(CompactionError::Other)? { + keys += 1; + + if keys % 32_768 == 0 && self.cancel.is_cancelled() { + // avoid hitting the cancellation token on every key. in benches, we end up + // shuffling an order of million keys per layer, this means we'll check it + // around tens of times per layer. + return Err(CompactionError::ShuttingDown); + } + let same_key = prev_key.map_or(false, |prev_key| prev_key == key); // We need to check key boundaries once we reach next key or end of layer with the same key if !same_key || lsn == dup_end_lsn { @@ -1157,6 +1168,8 @@ impl Timeline { .await .map_err(CompactionError::Other)?, ); + + keys = 0; } writer From 70b18ff4817658160a34305c7a3f3fa1a21d164e Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 14 Aug 2024 17:03:21 +0100 Subject: [PATCH 1422/1571] CI(neon-image): add ARM-specific RUSTFLAGS (#8566) ## Problem It's recommended that a couple of additional RUSTFLAGS be set up to improve the performance of Rust applications on AWS Graviton. 
See https://github.com/aws/aws-graviton-getting-started/blob/57dc813626d0266f1cc12ef83474745bb1f31fb4/rust.md Note: Apple Silicon is compatible with neoverse-n1: ``` $ clang --version Apple clang version 15.0.0 (clang-1500.3.9.4) Target: arm64-apple-darwin23.6.0 Thread model: posix InstalledDir: /Applications/Xcode_15.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin $ $ clang --print-supported-cpus 2>&1 | grep neoverse- neoverse-512tvb neoverse-e1 neoverse-n1 neoverse-n2 neoverse-v1 neoverse-v2 ``` ## Summary of changes - Add `-Ctarget-feature=+lse -Ctarget-cpu=neoverse-n1` to RUSTFLAGS for ARM images --- .github/workflows/build_and_test.yml | 3 +++ Dockerfile | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 78f9f11a65..a591cb73f2 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -503,7 +503,10 @@ jobs: - uses: docker/build-push-action@v6 with: context: . + # ARM-specific flags are recommended for Graviton ≥ 2, these flags are also supported by Ampere Altra (Azure) + # https://github.com/aws/aws-graviton-getting-started/blob/57dc813626d0266f1cc12ef83474745bb1f31fb4/rust.md build-args: | + ADDITIONAL_RUSTFLAGS=${{ matrix.arch == 'arm64' && '-Ctarget-feature=+lse -Ctarget-cpu=neoverse-n1' || '' }} GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} BUILD_TAG=${{ needs.tag.outputs.build-tag }} TAG=${{ needs.build-build-tools-image.outputs.image-tag }} diff --git a/Dockerfile b/Dockerfile index ceb1c7cb55..d3d12330c6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -35,8 +35,9 @@ COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_i COPY --from=pg-build /home/nonroot/pg_install/v16/lib pg_install/v16/lib COPY --chown=nonroot . . 
+ARG ADDITIONAL_RUSTFLAGS RUN set -e \ - && PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \ + && PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment ${ADDITIONAL_RUSTFLAGS}" cargo build \ --bin pg_sni_router \ --bin pageserver \ --bin pagectl \ From aa2e16f307c4a55f5ae1ece22a344d3cddc1dccc Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 14 Aug 2024 17:56:59 +0100 Subject: [PATCH 1423/1571] CI: misc cleanup & fixes (#8559) ## Problem A bunch of small fixes and improvements for CI, that are too small to have a separate PR for them ## Summary of changes - CI(build-and-test): fix parenthesis - CI(actionlint): fix path to workflow file - CI: remove default args from actions/checkout - CI: remove `gen3` label, using a couple `self-hosted` + `small{,-arm64}`/`large{,-arm64}` is enough - CI: prettify Slack messages, hide links behind text messages - C(build-and-test): add more dependencies to `conclusion` job --- .github/actionlint.yml | 1 - .../actions/run-python-test-set/action.yml | 1 - .github/workflows/_build-and-test-locally.yml | 2 - .github/workflows/actionlint.yml | 2 +- .github/workflows/benchmarking.yml | 48 ++++++++++++++----- .github/workflows/build-build-tools-image.yml | 2 +- .github/workflows/build_and_test.yml | 40 +++++++--------- .github/workflows/neon_extra_builds.yml | 2 - .github/workflows/periodic_pagebench.yml | 2 +- 9 files changed, 58 insertions(+), 42 deletions(-) diff --git a/.github/actionlint.yml b/.github/actionlint.yml index a5282876d0..4ad8a7b460 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -1,7 +1,6 @@ self-hosted-runner: labels: - arm64 - - gen3 - large - large-arm64 - small diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 9d39ab6ad7..4ccf190c6a 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -83,7 +83,6 @@ runs: uses: actions/checkout@v4 with: submodules: true - fetch-depth: 1 - name: Cache poetry deps uses: actions/cache@v4 diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 5e5908860e..af76e51ebc 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -70,7 +70,6 @@ jobs: - uses: actions/checkout@v4 with: submodules: true - fetch-depth: 1 - name: Set pg 14 revision for caching id: pg_v14_rev @@ -263,7 +262,6 @@ jobs: - uses: actions/checkout@v4 with: submodules: true - fetch-depth: 1 - name: Pytest regression tests uses: ./.github/actions/run-python-test-set diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml index 34fd8b1d15..85cfe7446e 100644 --- a/.github/workflows/actionlint.yml +++ b/.github/workflows/actionlint.yml @@ -44,7 +44,7 @@ jobs: grep -ERl $PAT .github/workflows |\ while read -r f do - l=$(grep -nE $PAT .github/workflows/release.yml | awk -F: '{print $1}' | head -1) + l=$(grep -nE $PAT $f | awk -F: '{print $1}' | head -1) echo "::error file=$f,line=$l::Please use 'ubuntu-22.04' instead of 'ubuntu-latest'" done exit 1 diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 106c3e3138..f99a037489 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -96,7 +96,7 @@ jobs: uses: aws-actions/configure-aws-credentials@v4 
with: aws-region: eu-central-1 - role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} role-duration-seconds: 18000 # 5 hours - name: Download Neon artifact @@ -146,6 +146,7 @@ jobs: api_key: ${{ secrets.NEON_STAGING_API_KEY }} - name: Create Allure report + id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate @@ -154,7 +155,10 @@ jobs: uses: slackapi/slack-github-action@v1 with: channel-id: "C033QLM5P7D" # dev-staging-stream - slack-message: "Periodic perf testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + slack-message: | + Periodic perf testing: ${{ job.status }} + <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> + <${{ steps.create-allure-report.outputs.report-url }}|Allure report> env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} @@ -176,7 +180,7 @@ jobs: steps: - uses: actions/checkout@v4 - + - name: Download Neon artifact uses: ./.github/actions/download with: @@ -215,6 +219,7 @@ jobs: NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }} - name: Create Allure report + id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate @@ -365,7 +370,7 @@ jobs: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} role-duration-seconds: 18000 # 5 hours - + - name: Download Neon artifact uses: ./.github/actions/download with: @@ -460,6 +465,7 @@ jobs: api_key: ${{ secrets.NEON_STAGING_API_KEY }} - name: Create Allure report + id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate @@ -468,7 +474,10 @@ jobs: uses: slackapi/slack-github-action@v1 with: channel-id: "C033QLM5P7D" # dev-staging-stream - slack-message: "Periodic perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + slack-message: | + Periodic perf testing on ${{ matrix.platform }}: ${{ job.status }} + <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> + <${{ steps.create-allure-report.outputs.report-url }}|Allure report> env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} @@ -542,7 +551,7 @@ jobs: esac echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - + - name: Configure AWS credentials # necessary on Azure runners to read/write from/to S3 uses: aws-actions/configure-aws-credentials@v4 with: @@ -577,8 +586,9 @@ jobs: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" - + - name: Create Allure report + id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate @@ -587,7 +597,10 @@ jobs: uses: slackapi/slack-github-action@v1 with: channel-id: "C033QLM5P7D" # dev-staging-stream - slack-message: "Periodic perf testing ${PLATFORM}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + slack-message: | + Periodic perf testing on ${{ env.PLATFORM }}: ${{ job.status }} + <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> + <${{ steps.create-allure-report.outputs.report-url }}|Allure report> env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} @@ -670,6 +683,7 @@ jobs: TEST_OLAP_SCALE: 10 - name: Create Allure report + id: create-allure-report if: 
${{ !cancelled() }} uses: ./.github/actions/allure-report-generate @@ -678,7 +692,10 @@ jobs: uses: slackapi/slack-github-action@v1 with: channel-id: "C033QLM5P7D" # dev-staging-stream - slack-message: "Periodic OLAP perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + slack-message: | + Periodic OLAP perf testing on ${{ matrix.platform }}: ${{ job.status }} + <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> + <${{ steps.create-allure-report.outputs.report-url }}|Allure report> env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} @@ -764,6 +781,7 @@ jobs: TEST_OLAP_SCALE: ${{ matrix.scale }} - name: Create Allure report + id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate @@ -772,7 +790,10 @@ jobs: uses: slackapi/slack-github-action@v1 with: channel-id: "C033QLM5P7D" # dev-staging-stream - slack-message: "Periodic TPC-H perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + slack-message: | + Periodic TPC-H perf testing on ${{ matrix.platform }}: ${{ job.status }} + <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> + <${{ steps.create-allure-report.outputs.report-url }}|Allure report> env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} @@ -843,6 +864,7 @@ jobs: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} - name: Create Allure report + id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate @@ -851,6 +873,10 @@ jobs: uses: slackapi/slack-github-action@v1 with: channel-id: "C033QLM5P7D" # dev-staging-stream - slack-message: "Periodic User example perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + slack-message: | + Periodic TPC-H perf testing on ${{ matrix.platform }}: ${{ job.status }} + <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> + <${{ steps.create-allure-report.outputs.report-url }}|Allure report> + env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index f4f6e6971f..ca5ff573e1 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -38,7 +38,7 @@ jobs: matrix: arch: [ x64, arm64 ] - runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} + runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} env: IMAGE_TAG: ${{ inputs.image-tag }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index a591cb73f2..ee6d3ba005 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -48,7 +48,7 @@ jobs: tag: needs: [ check-permissions ] - runs-on: [ self-hosted, gen3, small ] + runs-on: [ self-hosted, small ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned outputs: build-tag: ${{steps.build-tag.outputs.tag}} @@ -90,7 +90,7 @@ jobs: check-codestyle-python: needs: [ check-permissions, build-build-tools-image ] - runs-on: [ self-hosted, gen3, small ] + runs-on: [ self-hosted, small ] container: image: ${{ 
needs.build-build-tools-image.outputs.image }} credentials: @@ -101,9 +101,6 @@ jobs: steps: - name: Checkout uses: actions/checkout@v4 - with: - submodules: false - fetch-depth: 1 - name: Cache poetry deps uses: actions/cache@v4 @@ -142,7 +139,6 @@ jobs: uses: actions/checkout@v4 with: submodules: true - fetch-depth: 1 # Disabled for now # - name: Restore cargo deps cache @@ -204,7 +200,7 @@ jobs: matrix: arch: [ x64 ] # Do not build or run tests in debug for release branches - build-type: ${{ fromJson((startsWith(github.ref_name, 'release' && github.event_name == 'push')) && '["release"]' || '["debug", "release"]') }} + build-type: ${{ fromJson((startsWith(github.ref_name, 'release') && github.event_name == 'push') && '["release"]' || '["debug", "release"]') }} include: - build-type: release arch: arm64 @@ -224,7 +220,7 @@ jobs: outputs: json: ${{ steps.get-benchmark-durations.outputs.json }} needs: [ check-permissions, build-build-tools-image ] - runs-on: [ self-hosted, gen3, small ] + runs-on: [ self-hosted, small ] container: image: ${{ needs.build-build-tools-image.outputs.image }} credentials: @@ -257,7 +253,7 @@ jobs: benchmarks: if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') needs: [ check-permissions, build-and-test-locally, build-build-tools-image, get-benchmarks-durations ] - runs-on: [ self-hosted, gen3, small ] + runs-on: [ self-hosted, small ] container: image: ${{ needs.build-build-tools-image.outputs.image }} credentials: @@ -302,9 +298,8 @@ jobs: with: channel-id: C060CNA47S9 # on-call-staging-storage-stream slack-message: | - Benchmarks failed on main: ${{ github.event.head_commit.url }} - - Allure report: ${{ needs.create-test-report.outputs.report-url }} + Benchmarks failed on main <${{ github.event.head_commit.url }}|${{ github.sha }}> + <${{ needs.create-test-report.outputs.report-url }}|Allure report> env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} @@ -314,7 +309,7 @@ jobs: outputs: report-url: ${{ steps.create-allure-report.outputs.report-url }} - runs-on: [ self-hosted, gen3, small ] + runs-on: [ self-hosted, small ] container: image: ${{ needs.build-build-tools-image.outputs.image }} credentials: @@ -361,7 +356,7 @@ jobs: coverage-report: needs: [ check-permissions, build-build-tools-image, build-and-test-locally ] - runs-on: [ self-hosted, gen3, small ] + runs-on: [ self-hosted, small ] container: image: ${{ needs.build-build-tools-image.outputs.image }} credentials: @@ -475,7 +470,7 @@ jobs: matrix: arch: [ x64, arm64 ] - runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} + runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} steps: - name: Checkout @@ -554,7 +549,7 @@ jobs: version: [ v14, v15, v16 ] arch: [ x64, arm64 ] - runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} + runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} steps: - name: Checkout @@ -699,7 +694,7 @@ jobs: vm-compute-node-image: needs: [ check-permissions, tag, compute-node-image ] - runs-on: [ self-hosted, gen3, large ] + runs-on: [ self-hosted, large ] strategy: fail-fast: false matrix: @@ -748,7 +743,7 @@ jobs: matrix: arch: [ x64, arm64 ] - runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} + runs-on: ${{ 
fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} steps: - name: Checkout @@ -963,7 +958,7 @@ jobs: needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait ] if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy' - runs-on: [ self-hosted, gen3, small ] + runs-on: [ self-hosted, small ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest steps: - name: Fix git ownership @@ -983,7 +978,6 @@ jobs: - name: Checkout uses: actions/checkout@v4 with: - submodules: false fetch-depth: 0 - name: Trigger deploy workflow @@ -1064,7 +1058,7 @@ jobs: needs: [ check-permissions, promote-images, tag, build-and-test-locally ] if: github.ref_name == 'release' - runs-on: [ self-hosted, gen3, small ] + runs-on: [ self-hosted, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned options: --init @@ -1120,10 +1114,12 @@ jobs: # Format `needs` differently to make the list more readable. # Usually we do `needs: [...]` needs: + - build-and-test-locally - check-codestyle-python - check-codestyle-rust - - build-and-test-locally + - promote-images - test-images + - trigger-custom-extensions-build-and-wait runs-on: ubuntu-22.04 steps: # The list of possible results: diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 2ee66cfdc1..7fecdbde8c 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -56,7 +56,6 @@ jobs: uses: actions/checkout@v4 with: submodules: true - fetch-depth: 1 - name: Install macOS postgres dependencies run: brew install flex bison openssl protobuf icu4c pkg-config @@ -158,7 +157,6 @@ jobs: uses: actions/checkout@v4 with: submodules: true - fetch-depth: 1 # Some of our rust modules use FFI and need those to be checked - name: Get postgres headers diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml index ed4e6be712..615937b5a1 100644 --- a/.github/workflows/periodic_pagebench.yml +++ b/.github/workflows/periodic_pagebench.yml @@ -27,7 +27,7 @@ concurrency: jobs: trigger_bench_on_ec2_machine_in_eu_central_1: - runs-on: [ self-hosted, gen3, small ] + runs-on: [ self-hosted, small ] container: image: neondatabase/build-tools:pinned credentials: From 168913bdf0aa9665099b4bba2cf891ce8d48f691 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 14 Aug 2024 21:57:17 +0200 Subject: [PATCH 1424/1571] refactor(write path): newtype to enforce use of fully initialized slices (#8717) The `tokio_epoll_uring::Slice` / `tokio_uring::Slice` type is weird. The new `FullSlice` newtype is better. See the doc comment for details. The naming is not ideal, but we'll clean that up in a future refactoring where we move the `FullSlice` into `tokio_epoll_uring`. Then, we'll do the following: * tokio_epoll_uring::Slice is removed * `FullSlice` becomes `tokio_epoll_uring::IoBufView` * new type `tokio_epoll_uring::IoBufMutView` for the current `tokio_epoll_uring::Slice` Context ------- I did this work in preparation for https://github.com/neondatabase/neon/pull/8537. There, I'm changing the type that the `inmemory_layer.rs` passes to `DeltaLayerWriter::put_value_bytes` and thus it seemed like a good opportunity to make this cleanup first. 
--- pageserver/src/tenant/blob_io.rs | 93 ++++++++------ .../src/tenant/ephemeral_file/page_caching.rs | 31 ++--- .../zero_padded_read_write/zero_padded.rs | 6 +- .../tenant/remote_timeline_client/download.rs | 5 +- .../src/tenant/storage_layer/delta_layer.rs | 40 ++++-- .../src/tenant/storage_layer/image_layer.rs | 9 +- .../tenant/storage_layer/inmemory_layer.rs | 25 +++- pageserver/src/virtual_file.rs | 120 +++++++++--------- pageserver/src/virtual_file/io_engine.rs | 19 ++- .../owned_buffers_io/io_buf_ext.rs | 78 ++++++++++++ .../virtual_file/owned_buffers_io/slice.rs | 4 +- .../util/size_tracking_writer.rs | 13 +- .../virtual_file/owned_buffers_io/write.rs | 71 +++++------ 13 files changed, 310 insertions(+), 204 deletions(-) create mode 100644 pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 8e9d349ca8..a245c99a88 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -24,6 +24,7 @@ use tracing::warn; use crate::context::RequestContext; use crate::page_cache::PAGE_SZ; use crate::tenant::block_io::BlockCursor; +use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt}; use crate::virtual_file::VirtualFile; use std::cmp::min; use std::io::{Error, ErrorKind}; @@ -186,11 +187,11 @@ impl BlobWriter { /// You need to make sure that the internal buffer is empty, otherwise /// data will be written in wrong order. #[inline(always)] - async fn write_all_unbuffered, Buf: IoBuf + Send>( + async fn write_all_unbuffered( &mut self, - src_buf: B, + src_buf: FullSlice, ctx: &RequestContext, - ) -> (B::Buf, Result<(), Error>) { + ) -> (FullSlice, Result<(), Error>) { let (src_buf, res) = self.inner.write_all(src_buf, ctx).await; let nbytes = match res { Ok(nbytes) => nbytes, @@ -204,8 +205,9 @@ impl BlobWriter { /// Flushes the internal buffer to the underlying `VirtualFile`. 
pub async fn flush_buffer(&mut self, ctx: &RequestContext) -> Result<(), Error> { let buf = std::mem::take(&mut self.buf); - let (mut buf, res) = self.inner.write_all(buf, ctx).await; + let (slice, res) = self.inner.write_all(buf.slice_len(), ctx).await; res?; + let mut buf = slice.into_raw_slice().into_inner(); buf.clear(); self.buf = buf; Ok(()) @@ -222,19 +224,30 @@ impl BlobWriter { } /// Internal, possibly buffered, write function - async fn write_all, Buf: IoBuf + Send>( + async fn write_all( &mut self, - src_buf: B, + src_buf: FullSlice, ctx: &RequestContext, - ) -> (B::Buf, Result<(), Error>) { + ) -> (FullSlice, Result<(), Error>) { + let src_buf = src_buf.into_raw_slice(); + let src_buf_bounds = src_buf.bounds(); + let restore = move |src_buf_slice: Slice<_>| { + FullSlice::must_new(Slice::from_buf_bounds( + src_buf_slice.into_inner(), + src_buf_bounds, + )) + }; + if !BUFFERED { assert!(self.buf.is_empty()); - return self.write_all_unbuffered(src_buf, ctx).await; + return self + .write_all_unbuffered(FullSlice::must_new(src_buf), ctx) + .await; } let remaining = Self::CAPACITY - self.buf.len(); let src_buf_len = src_buf.bytes_init(); if src_buf_len == 0 { - return (Slice::into_inner(src_buf.slice_full()), Ok(())); + return (restore(src_buf), Ok(())); } let mut src_buf = src_buf.slice(0..src_buf_len); // First try to copy as much as we can into the buffer @@ -245,7 +258,7 @@ impl BlobWriter { // Then, if the buffer is full, flush it out if self.buf.len() == Self::CAPACITY { if let Err(e) = self.flush_buffer(ctx).await { - return (Slice::into_inner(src_buf), Err(e)); + return (restore(src_buf), Err(e)); } } // Finally, write the tail of src_buf: @@ -258,27 +271,29 @@ impl BlobWriter { let copied = self.write_into_buffer(&src_buf); // We just verified above that src_buf fits into our internal buffer. assert_eq!(copied, src_buf.len()); - Slice::into_inner(src_buf) + restore(src_buf) } else { - let (src_buf, res) = self.write_all_unbuffered(src_buf, ctx).await; + let (src_buf, res) = self + .write_all_unbuffered(FullSlice::must_new(src_buf), ctx) + .await; if let Err(e) = res { return (src_buf, Err(e)); } src_buf } } else { - Slice::into_inner(src_buf) + restore(src_buf) }; (src_buf, Ok(())) } /// Write a blob of data. Returns the offset that it was written to, /// which can be used to retrieve the data later. - pub async fn write_blob, Buf: IoBuf + Send>( + pub async fn write_blob( &mut self, - srcbuf: B, + srcbuf: FullSlice, ctx: &RequestContext, - ) -> (B::Buf, Result) { + ) -> (FullSlice, Result) { let (buf, res) = self .write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled) .await; @@ -287,43 +302,40 @@ impl BlobWriter { /// Write a blob of data. Returns the offset that it was written to, /// which can be used to retrieve the data later. 
- pub async fn write_blob_maybe_compressed, Buf: IoBuf + Send>( + pub(crate) async fn write_blob_maybe_compressed( &mut self, - srcbuf: B, + srcbuf: FullSlice, ctx: &RequestContext, algorithm: ImageCompressionAlgorithm, - ) -> (B::Buf, Result<(u64, CompressionInfo), Error>) { + ) -> (FullSlice, Result<(u64, CompressionInfo), Error>) { let offset = self.offset; let mut compression_info = CompressionInfo { written_compressed: false, compressed_size: None, }; - let len = srcbuf.bytes_init(); + let len = srcbuf.len(); let mut io_buf = self.io_buf.take().expect("we always put it back below"); io_buf.clear(); let mut compressed_buf = None; - let ((io_buf, hdr_res), srcbuf) = async { + let ((io_buf_slice, hdr_res), srcbuf) = async { if len < 128 { // Short blob. Write a 1-byte length header io_buf.put_u8(len as u8); - ( - self.write_all(io_buf, ctx).await, - srcbuf.slice_full().into_inner(), - ) + (self.write_all(io_buf.slice_len(), ctx).await, srcbuf) } else { // Write a 4-byte length header if len > MAX_SUPPORTED_LEN { return ( ( - io_buf, + io_buf.slice_len(), Err(Error::new( ErrorKind::Other, format!("blob too large ({len} bytes)"), )), ), - srcbuf.slice_full().into_inner(), + srcbuf, ); } let (high_bit_mask, len_written, srcbuf) = match algorithm { @@ -336,8 +348,7 @@ impl BlobWriter { } else { async_compression::tokio::write::ZstdEncoder::new(Vec::new()) }; - let slice = srcbuf.slice_full(); - encoder.write_all(&slice[..]).await.unwrap(); + encoder.write_all(&srcbuf[..]).await.unwrap(); encoder.shutdown().await.unwrap(); let compressed = encoder.into_inner(); compression_info.compressed_size = Some(compressed.len()); @@ -345,31 +356,29 @@ impl BlobWriter { compression_info.written_compressed = true; let compressed_len = compressed.len(); compressed_buf = Some(compressed); - (BYTE_ZSTD, compressed_len, slice.into_inner()) + (BYTE_ZSTD, compressed_len, srcbuf) } else { - (BYTE_UNCOMPRESSED, len, slice.into_inner()) + (BYTE_UNCOMPRESSED, len, srcbuf) } } - ImageCompressionAlgorithm::Disabled => { - (BYTE_UNCOMPRESSED, len, srcbuf.slice_full().into_inner()) - } + ImageCompressionAlgorithm::Disabled => (BYTE_UNCOMPRESSED, len, srcbuf), }; let mut len_buf = (len_written as u32).to_be_bytes(); assert_eq!(len_buf[0] & 0xf0, 0); len_buf[0] |= high_bit_mask; io_buf.extend_from_slice(&len_buf[..]); - (self.write_all(io_buf, ctx).await, srcbuf) + (self.write_all(io_buf.slice_len(), ctx).await, srcbuf) } } .await; - self.io_buf = Some(io_buf); + self.io_buf = Some(io_buf_slice.into_raw_slice().into_inner()); match hdr_res { Ok(_) => (), - Err(e) => return (Slice::into_inner(srcbuf.slice(..)), Err(e)), + Err(e) => return (srcbuf, Err(e)), } let (srcbuf, res) = if let Some(compressed_buf) = compressed_buf { - let (_buf, res) = self.write_all(compressed_buf, ctx).await; - (Slice::into_inner(srcbuf.slice(..)), res) + let (_buf, res) = self.write_all(compressed_buf.slice_len(), ctx).await; + (srcbuf, res) } else { self.write_all(srcbuf, ctx).await }; @@ -432,21 +441,21 @@ pub(crate) mod tests { let (_, res) = if compression { let res = wtr .write_blob_maybe_compressed( - blob.clone(), + blob.clone().slice_len(), ctx, ImageCompressionAlgorithm::Zstd { level: Some(1) }, ) .await; (res.0, res.1.map(|(off, _)| off)) } else { - wtr.write_blob(blob.clone(), ctx).await + wtr.write_blob(blob.clone().slice_len(), ctx).await }; let offs = res?; offsets.push(offs); } // Write out one page worth of zeros so that we can // read again with read_blk - let (_, res) = wtr.write_blob(vec![0; PAGE_SZ], ctx).await; + let (_, 
res) = wtr.write_blob(vec![0; PAGE_SZ].slice_len(), ctx).await; let offs = res?; println!("Writing final blob at offs={offs}"); wtr.flush_buffer(ctx).await?; diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs index 0a12b64a7c..7355b3b5a3 100644 --- a/pageserver/src/tenant/ephemeral_file/page_caching.rs +++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs @@ -4,6 +4,7 @@ use crate::context::RequestContext; use crate::page_cache::{self, PAGE_SZ}; use crate::tenant::block_io::BlockLease; +use crate::virtual_file::owned_buffers_io::io_buf_ext::FullSlice; use crate::virtual_file::VirtualFile; use once_cell::sync::Lazy; @@ -208,21 +209,11 @@ impl PreWarmingWriter { } impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmingWriter { - async fn write_all< - B: tokio_epoll_uring::BoundedBuf, - Buf: tokio_epoll_uring::IoBuf + Send, - >( + async fn write_all( &mut self, - buf: B, + buf: FullSlice, ctx: &RequestContext, - ) -> std::io::Result<(usize, B::Buf)> { - let buf = buf.slice(..); - let saved_bounds = buf.bounds(); // save for reconstructing the Slice from iobuf after the IO is done - let check_bounds_stuff_works = if cfg!(test) && cfg!(debug_assertions) { - Some(buf.to_vec()) - } else { - None - }; + ) -> std::io::Result<(usize, FullSlice)> { let buflen = buf.len(); assert_eq!( buflen % PAGE_SZ, @@ -231,10 +222,10 @@ impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmi ); // Do the IO. - let iobuf = match self.file.write_all(buf, ctx).await { - (iobuf, Ok(nwritten)) => { + let buf = match self.file.write_all(buf, ctx).await { + (buf, Ok(nwritten)) => { assert_eq!(nwritten, buflen); - iobuf + buf } (_, Err(e)) => { return Err(std::io::Error::new( @@ -248,12 +239,6 @@ impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmi } }; - // Reconstruct the Slice (the write path consumed the Slice and returned us the underlying IoBuf) - let buf = tokio_epoll_uring::Slice::from_buf_bounds(iobuf, saved_bounds); - if let Some(check_bounds_stuff_works) = check_bounds_stuff_works { - assert_eq!(&check_bounds_stuff_works, &*buf); - } - let nblocks = buflen / PAGE_SZ; let nblocks32 = u32::try_from(nblocks).unwrap(); @@ -300,6 +285,6 @@ impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmi } self.nwritten_blocks = self.nwritten_blocks.checked_add(nblocks32).unwrap(); - Ok((buflen, buf.into_inner())) + Ok((buflen, buf)) } } diff --git a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs index f90291bbf8..2dc0277638 100644 --- a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs +++ b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs @@ -5,6 +5,8 @@ use std::mem::MaybeUninit; +use crate::virtual_file::owned_buffers_io::io_buf_ext::FullSlice; + /// See module-level comment. 
pub struct Buffer { allocation: Box<[u8; N]>, @@ -60,10 +62,10 @@ impl crate::virtual_file::owned_buffers_io::write::Buffer for Bu self.written } - fn flush(self) -> tokio_epoll_uring::Slice { + fn flush(self) -> FullSlice { self.invariants(); let written = self.written; - tokio_epoll_uring::BoundedBuf::slice(self, 0..written) + FullSlice::must_new(tokio_epoll_uring::BoundedBuf::slice(self, 0..written)) } fn reuse_after_flush(iobuf: Self::IoBuf) -> Self { diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index a17b32c983..8199218c3c 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -23,6 +23,7 @@ use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; use crate::tenant::storage_layer::LayerName; use crate::tenant::Generation; +use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}; use crate::TEMP_FILE_SUFFIX; use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode, RemotePath}; @@ -219,9 +220,7 @@ async fn download_object<'a>( Ok(chunk) => chunk, Err(e) => return Err(e), }; - buffered - .write_buffered(tokio_epoll_uring::BoundedBuf::slice_full(chunk), ctx) - .await?; + buffered.write_buffered(chunk.slice_len(), ctx).await?; } let size_tracking = buffered.flush_and_into_inner(ctx).await?; Ok(size_tracking.into_inner()) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 0ed2f72c3f..6c2391d72d 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -42,6 +42,7 @@ use crate::tenant::vectored_blob_io::{ VectoredReadPlanner, }; use crate::tenant::PageReconstructError; +use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt}; use crate::virtual_file::{self, VirtualFile}; use crate::{walrecord, TEMP_FILE_SUFFIX}; use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; @@ -63,6 +64,7 @@ use std::os::unix::fs::FileExt; use std::str::FromStr; use std::sync::Arc; use tokio::sync::OnceCell; +use tokio_epoll_uring::IoBufMut; use tracing::*; use utils::{ @@ -436,19 +438,28 @@ impl DeltaLayerWriterInner { ctx: &RequestContext, ) -> anyhow::Result<()> { let (_, res) = self - .put_value_bytes(key, lsn, Value::ser(&val)?, val.will_init(), ctx) + .put_value_bytes( + key, + lsn, + Value::ser(&val)?.slice_len(), + val.will_init(), + ctx, + ) .await; res } - async fn put_value_bytes( + async fn put_value_bytes( &mut self, key: Key, lsn: Lsn, - val: Vec, + val: FullSlice, will_init: bool, ctx: &RequestContext, - ) -> (Vec, anyhow::Result<()>) { + ) -> (FullSlice, anyhow::Result<()>) + where + Buf: IoBufMut + Send, + { assert!( self.lsn_range.start <= lsn, "lsn_start={}, lsn={}", @@ -514,7 +525,7 @@ impl DeltaLayerWriterInner { file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64)) .await?; for buf in block_buf.blocks { - let (_buf, res) = file.write_all(buf, ctx).await; + let (_buf, res) = file.write_all(buf.slice_len(), ctx).await; res?; } assert!(self.lsn_range.start < self.lsn_range.end); @@ -534,7 +545,7 @@ impl DeltaLayerWriterInner { // TODO: could use smallvec here but it's a pain with Slice Summary::ser_into(&summary, &mut buf)?; file.seek(SeekFrom::Start(0)).await?; - let 
(_buf, res) = file.write_all(buf, ctx).await; + let (_buf, res) = file.write_all(buf.slice_len(), ctx).await; res?; let metadata = file @@ -646,14 +657,17 @@ impl DeltaLayerWriter { .await } - pub async fn put_value_bytes( + pub async fn put_value_bytes( &mut self, key: Key, lsn: Lsn, - val: Vec, + val: FullSlice, will_init: bool, ctx: &RequestContext, - ) -> (Vec, anyhow::Result<()>) { + ) -> (FullSlice, anyhow::Result<()>) + where + Buf: IoBufMut + Send, + { self.inner .as_mut() .unwrap() @@ -743,7 +757,7 @@ impl DeltaLayer { // TODO: could use smallvec here, but it's a pain with Slice Summary::ser_into(&new_summary, &mut buf).context("serialize")?; file.seek(SeekFrom::Start(0)).await?; - let (_buf, res) = file.write_all(buf, ctx).await; + let (_buf, res) = file.write_all(buf.slice_len(), ctx).await; res?; Ok(()) } @@ -1291,12 +1305,12 @@ impl DeltaLayerInner { .put_value_bytes( key, lsn, - std::mem::take(&mut per_blob_copy), + std::mem::take(&mut per_blob_copy).slice_len(), will_init, ctx, ) .await; - per_blob_copy = tmp; + per_blob_copy = tmp.into_raw_slice().into_inner(); res?; @@ -1871,7 +1885,7 @@ pub(crate) mod test { for entry in entries { let (_, res) = writer - .put_value_bytes(entry.key, entry.lsn, entry.value, false, &ctx) + .put_value_bytes(entry.key, entry.lsn, entry.value.slice_len(), false, &ctx) .await; res?; } diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index f9d3fdf186..9a19e4e2c7 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -38,6 +38,7 @@ use crate::tenant::vectored_blob_io::{ VectoredReadPlanner, }; use crate::tenant::{PageReconstructError, Timeline}; +use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; use crate::virtual_file::{self, VirtualFile}; use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; use anyhow::{anyhow, bail, ensure, Context, Result}; @@ -354,7 +355,7 @@ impl ImageLayer { // TODO: could use smallvec here but it's a pain with Slice Summary::ser_into(&new_summary, &mut buf).context("serialize")?; file.seek(SeekFrom::Start(0)).await?; - let (_buf, res) = file.write_all(buf, ctx).await; + let (_buf, res) = file.write_all(buf.slice_len(), ctx).await; res?; Ok(()) } @@ -786,7 +787,7 @@ impl ImageLayerWriterInner { self.num_keys += 1; let (_img, res) = self .blob_writer - .write_blob_maybe_compressed(img, ctx, compression) + .write_blob_maybe_compressed(img.slice_len(), ctx, compression) .await; // TODO: re-use the buffer for `img` further upstack let (off, compression_info) = res?; @@ -838,7 +839,7 @@ impl ImageLayerWriterInner { .await?; let (index_root_blk, block_buf) = self.tree.finish()?; for buf in block_buf.blocks { - let (_buf, res) = file.write_all(buf, ctx).await; + let (_buf, res) = file.write_all(buf.slice_len(), ctx).await; res?; } @@ -858,7 +859,7 @@ impl ImageLayerWriterInner { // TODO: could use smallvec here but it's a pain with Slice Summary::ser_into(&summary, &mut buf)?; file.seek(SeekFrom::Start(0)).await?; - let (_buf, res) = file.write_all(buf, ctx).await; + let (_buf, res) = file.write_all(buf.slice_len(), ctx).await; res?; let metadata = file diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index fb15ddfba9..748d79c149 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -12,6 +12,7 @@ use 
crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef}; use crate::tenant::ephemeral_file::EphemeralFile; use crate::tenant::timeline::GetVectoredError; use crate::tenant::PageReconstructError; +use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; use crate::{l0_flush, page_cache, walrecord}; use anyhow::{anyhow, Result}; use camino::Utf8PathBuf; @@ -581,11 +582,17 @@ impl InMemoryLayer { for (lsn, pos) in vec_map.as_slice() { cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?; let will_init = Value::des(&buf)?.will_init(); - let res; - (buf, res) = delta_layer_writer - .put_value_bytes(Key::from_compact(*key), *lsn, buf, will_init, &ctx) + let (tmp, res) = delta_layer_writer + .put_value_bytes( + Key::from_compact(*key), + *lsn, + buf.slice_len(), + will_init, + &ctx, + ) .await; res?; + buf = tmp.into_raw_slice().into_inner(); } } } @@ -620,11 +627,17 @@ impl InMemoryLayer { // => https://github.com/neondatabase/neon/issues/8183 cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?; let will_init = Value::des(&buf)?.will_init(); - let res; - (buf, res) = delta_layer_writer - .put_value_bytes(Key::from_compact(*key), *lsn, buf, will_init, ctx) + let (tmp, res) = delta_layer_writer + .put_value_bytes( + Key::from_compact(*key), + *lsn, + buf.slice_len(), + will_init, + ctx, + ) .await; res?; + buf = tmp.into_raw_slice().into_inner(); } } } diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 27f6fe90a4..b4695e5f40 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -17,6 +17,7 @@ use crate::page_cache::{PageWriteGuard, PAGE_SZ}; use crate::tenant::TENANTS_SEGMENT_NAME; use camino::{Utf8Path, Utf8PathBuf}; use once_cell::sync::OnceCell; +use owned_buffers_io::io_buf_ext::FullSlice; use pageserver_api::shard::TenantShardId; use std::fs::File; use std::io::{Error, ErrorKind, Seek, SeekFrom}; @@ -50,6 +51,7 @@ pub(crate) mod owned_buffers_io { //! but for the time being we're proving out the primitives in the neon.git repo //! for faster iteration. + pub(crate) mod io_buf_ext; pub(crate) mod slice; pub(crate) mod write; pub(crate) mod util { @@ -637,24 +639,24 @@ impl VirtualFile { } // Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#219-235 - pub async fn write_all_at, Buf: IoBuf + Send>( + pub async fn write_all_at( &self, - buf: B, + buf: FullSlice, mut offset: u64, ctx: &RequestContext, - ) -> (B::Buf, Result<(), Error>) { - let buf_len = buf.bytes_init(); - if buf_len == 0 { - return (Slice::into_inner(buf.slice_full()), Ok(())); - } - let mut buf = buf.slice(0..buf_len); + ) -> (FullSlice, Result<(), Error>) { + let buf = buf.into_raw_slice(); + let bounds = buf.bounds(); + let restore = + |buf: Slice<_>| FullSlice::must_new(Slice::from_buf_bounds(buf.into_inner(), bounds)); + let mut buf = buf; while !buf.is_empty() { - let res; - (buf, res) = self.write_at(buf, offset, ctx).await; + let (tmp, res) = self.write_at(FullSlice::must_new(buf), offset, ctx).await; + buf = tmp.into_raw_slice(); match res { Ok(0) => { return ( - Slice::into_inner(buf), + restore(buf), Err(Error::new( std::io::ErrorKind::WriteZero, "failed to write whole buffer", @@ -666,33 +668,33 @@ impl VirtualFile { offset += n as u64; } Err(e) if e.kind() == std::io::ErrorKind::Interrupted => {} - Err(e) => return (Slice::into_inner(buf), Err(e)), + Err(e) => return (restore(buf), Err(e)), } } - (Slice::into_inner(buf), Ok(())) + (restore(buf), Ok(())) } - /// Writes `buf.slice(0..buf.bytes_init())`. 
- /// Returns the IoBuf that is underlying the BoundedBuf `buf`. - /// I.e., the returned value's `bytes_init()` method returns something different than the `bytes_init()` that was passed in. - /// It's quite brittle and easy to mis-use, so, we return the size in the Ok() variant. - pub async fn write_all, Buf: IoBuf + Send>( + /// Writes `buf` to the file at the current offset. + /// + /// Panics if there is an uninitialized range in `buf`, as that is most likely a bug in the caller. + pub async fn write_all( &mut self, - buf: B, + buf: FullSlice, ctx: &RequestContext, - ) -> (B::Buf, Result) { - let nbytes = buf.bytes_init(); - if nbytes == 0 { - return (Slice::into_inner(buf.slice_full()), Ok(0)); - } - let mut buf = buf.slice(0..nbytes); + ) -> (FullSlice, Result) { + let buf = buf.into_raw_slice(); + let bounds = buf.bounds(); + let restore = + |buf: Slice<_>| FullSlice::must_new(Slice::from_buf_bounds(buf.into_inner(), bounds)); + let nbytes = buf.len(); + let mut buf = buf; while !buf.is_empty() { - let res; - (buf, res) = self.write(buf, ctx).await; + let (tmp, res) = self.write(FullSlice::must_new(buf), ctx).await; + buf = tmp.into_raw_slice(); match res { Ok(0) => { return ( - Slice::into_inner(buf), + restore(buf), Err(Error::new( std::io::ErrorKind::WriteZero, "failed to write whole buffer", @@ -703,17 +705,17 @@ impl VirtualFile { buf = buf.slice(n..); } Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} - Err(e) => return (Slice::into_inner(buf), Err(e)), + Err(e) => return (restore(buf), Err(e)), } } - (Slice::into_inner(buf), Ok(nbytes)) + (restore(buf), Ok(nbytes)) } async fn write( &mut self, - buf: Slice, + buf: FullSlice, ctx: &RequestContext, - ) -> (Slice, Result) { + ) -> (FullSlice, Result) { let pos = self.pos; let (buf, res) = self.write_at(buf, pos, ctx).await; let n = match res { @@ -756,10 +758,10 @@ impl VirtualFile { async fn write_at( &self, - buf: Slice, + buf: FullSlice, offset: u64, _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */ - ) -> (Slice, Result) { + ) -> (FullSlice, Result) { let file_guard = match self.lock_file().await { Ok(file_guard) => file_guard, Err(e) => return (buf, Err(e)), @@ -1093,11 +1095,11 @@ impl Drop for VirtualFile { impl OwnedAsyncWriter for VirtualFile { #[inline(always)] - async fn write_all, Buf: IoBuf + Send>( + async fn write_all( &mut self, - buf: B, + buf: FullSlice, ctx: &RequestContext, - ) -> std::io::Result<(usize, B::Buf)> { + ) -> std::io::Result<(usize, FullSlice)> { let (buf, res) = VirtualFile::write_all(self, buf, ctx).await; res.map(move |v| (v, buf)) } @@ -1159,7 +1161,8 @@ mod tests { use crate::task_mgr::TaskKind; use super::*; - use owned_buffers_io::slice::SliceExt; + use owned_buffers_io::io_buf_ext::IoBufExt; + use owned_buffers_io::slice::SliceMutExt; use rand::seq::SliceRandom; use rand::thread_rng; use rand::Rng; @@ -1193,9 +1196,9 @@ mod tests { } } } - async fn write_all_at, Buf: IoBuf + Send>( + async fn write_all_at( &self, - buf: B, + buf: FullSlice, offset: u64, ctx: &RequestContext, ) -> Result<(), Error> { @@ -1204,13 +1207,7 @@ mod tests { let (_buf, res) = file.write_all_at(buf, offset, ctx).await; res } - MaybeVirtualFile::File(file) => { - let buf_len = buf.bytes_init(); - if buf_len == 0 { - return Ok(()); - } - file.write_all_at(&buf.slice(0..buf_len), offset) - } + MaybeVirtualFile::File(file) => file.write_all_at(&buf[..], offset), } } async fn seek(&mut self, pos: SeekFrom) -> Result { @@ -1219,9 +1216,9 @@ mod tests { 
MaybeVirtualFile::File(file) => file.seek(pos), } } - async fn write_all, Buf: IoBuf + Send>( + async fn write_all( &mut self, - buf: B, + buf: FullSlice, ctx: &RequestContext, ) -> Result<(), Error> { match self { @@ -1229,13 +1226,7 @@ mod tests { let (_buf, res) = file.write_all(buf, ctx).await; res.map(|_| ()) } - MaybeVirtualFile::File(file) => { - let buf_len = buf.bytes_init(); - if buf_len == 0 { - return Ok(()); - } - file.write_all(&buf.slice(0..buf_len)) - } + MaybeVirtualFile::File(file) => file.write_all(&buf[..]), } } @@ -1347,7 +1338,9 @@ mod tests { &ctx, ) .await?; - file_a.write_all(b"foobar".to_vec(), &ctx).await?; + file_a + .write_all(b"foobar".to_vec().slice_len(), &ctx) + .await?; // cannot read from a file opened in write-only mode let _ = file_a.read_string(&ctx).await.unwrap_err(); @@ -1356,7 +1349,10 @@ mod tests { let mut file_a = A::open(path_a, OpenOptions::new().read(true).to_owned(), &ctx).await?; // cannot write to a file opened in read-only mode - let _ = file_a.write_all(b"bar".to_vec(), &ctx).await.unwrap_err(); + let _ = file_a + .write_all(b"bar".to_vec().slice_len(), &ctx) + .await + .unwrap_err(); // Try simple read assert_eq!("foobar", file_a.read_string(&ctx).await?); @@ -1399,8 +1395,12 @@ mod tests { &ctx, ) .await?; - file_b.write_all_at(b"BAR".to_vec(), 3, &ctx).await?; - file_b.write_all_at(b"FOO".to_vec(), 0, &ctx).await?; + file_b + .write_all_at(b"BAR".to_vec().slice_len(), 3, &ctx) + .await?; + file_b + .write_all_at(b"FOO".to_vec().slice_len(), 0, &ctx) + .await?; assert_eq!(file_b.read_string_at(2, 3, &ctx).await?, "OBA"); diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index 0ffcd9fa05..faef1ba9ff 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -12,7 +12,7 @@ #[cfg(target_os = "linux")] pub(super) mod tokio_epoll_uring_ext; -use tokio_epoll_uring::{IoBuf, Slice}; +use tokio_epoll_uring::IoBuf; use tracing::Instrument; pub(crate) use super::api::IoEngineKind; @@ -107,7 +107,10 @@ use std::{ sync::atomic::{AtomicU8, Ordering}, }; -use super::{owned_buffers_io::slice::SliceExt, FileGuard, Metadata}; +use super::{ + owned_buffers_io::{io_buf_ext::FullSlice, slice::SliceMutExt}, + FileGuard, Metadata, +}; #[cfg(target_os = "linux")] fn epoll_uring_error_to_std(e: tokio_epoll_uring::Error) -> std::io::Error { @@ -206,8 +209,8 @@ impl IoEngine { &self, file_guard: FileGuard, offset: u64, - buf: Slice, - ) -> ((FileGuard, Slice), std::io::Result) { + buf: FullSlice, + ) -> ((FileGuard, FullSlice), std::io::Result) { match self { IoEngine::NotSet => panic!("not initialized"), IoEngine::StdFs => { @@ -217,8 +220,12 @@ impl IoEngine { #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { let system = tokio_epoll_uring_ext::thread_local_system().await; - let (resources, res) = system.write(file_guard, offset, buf).await; - (resources, res.map_err(epoll_uring_error_to_std)) + let ((file_guard, slice), res) = + system.write(file_guard, offset, buf.into_raw_slice()).await; + ( + (file_guard, FullSlice::must_new(slice)), + res.map_err(epoll_uring_error_to_std), + ) } } } diff --git a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs new file mode 100644 index 0000000000..7c773b6b21 --- /dev/null +++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs @@ -0,0 +1,78 @@ +//! See [`FullSlice`]. 
+ +use bytes::{Bytes, BytesMut}; +use std::ops::{Deref, Range}; +use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; + +/// The true owned equivalent for Rust [`slice`]. Use this for the write path. +/// +/// Unlike [`tokio_epoll_uring::Slice`], which we unfortunately inherited from `tokio-uring`, +/// [`FullSlice`] is guaranteed to have all its bytes initialized. This means that +/// [`>::len`] is equal to [`Slice::bytes_init`] and [`Slice::bytes_total`]. +/// +pub struct FullSlice { + slice: Slice, +} + +impl FullSlice +where + B: IoBuf, +{ + pub(crate) fn must_new(slice: Slice) -> Self { + assert_eq!(slice.bytes_init(), slice.bytes_total()); + FullSlice { slice } + } + pub(crate) fn into_raw_slice(self) -> Slice { + let FullSlice { slice: s } = self; + s + } +} + +impl Deref for FullSlice +where + B: IoBuf, +{ + type Target = [u8]; + + fn deref(&self) -> &[u8] { + let rust_slice = &self.slice[..]; + assert_eq!(rust_slice.len(), self.slice.bytes_init()); + assert_eq!(rust_slice.len(), self.slice.bytes_total()); + rust_slice + } +} + +pub(crate) trait IoBufExt { + /// Get a [`FullSlice`] for the entire buffer, i.e., `self[..]` or `self[0..self.len()]`. + fn slice_len(self) -> FullSlice + where + Self: Sized; +} + +macro_rules! impl_io_buf_ext { + ($T:ty) => { + impl IoBufExt for $T { + #[inline(always)] + fn slice_len(self) -> FullSlice { + let len = self.len(); + let s = if len == 0 { + // `BoundedBuf::slice(0..len)` or `BoundedBuf::slice(..)` has an incorrect assertion, + // causing a panic if len == 0. + // The Slice::from_buf_bounds has the correct assertion (<= instead of <). + // => https://github.com/neondatabase/tokio-epoll-uring/issues/46 + let slice = self.slice_full(); + let mut bounds: Range<_> = slice.bounds(); + bounds.end = bounds.start; + Slice::from_buf_bounds(slice.into_inner(), bounds) + } else { + self.slice(0..len) + }; + FullSlice::must_new(s) + } + } + }; +} + +impl_io_buf_ext!(Bytes); +impl_io_buf_ext!(BytesMut); +impl_io_buf_ext!(Vec); diff --git a/pageserver/src/virtual_file/owned_buffers_io/slice.rs b/pageserver/src/virtual_file/owned_buffers_io/slice.rs index d19e5ddffe..6100593663 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/slice.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/slice.rs @@ -3,14 +3,14 @@ use tokio_epoll_uring::BoundedBufMut; use tokio_epoll_uring::IoBufMut; use tokio_epoll_uring::Slice; -pub(crate) trait SliceExt { +pub(crate) trait SliceMutExt { /// Get a `&mut[0..self.bytes_total()`] slice, for when you need to do borrow-based IO. 
/// /// See the test case `test_slice_full_zeroed` for the difference to just doing `&slice[..]` fn as_mut_rust_slice_full_zeroed(&mut self) -> &mut [u8]; } -impl SliceExt for Slice +impl SliceMutExt for Slice where B: IoBufMut, { diff --git a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs index 55b1d0b46b..efcb61ba65 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs @@ -1,5 +1,8 @@ -use crate::{context::RequestContext, virtual_file::owned_buffers_io::write::OwnedAsyncWriter}; -use tokio_epoll_uring::{BoundedBuf, IoBuf}; +use crate::{ + context::RequestContext, + virtual_file::owned_buffers_io::{io_buf_ext::FullSlice, write::OwnedAsyncWriter}, +}; +use tokio_epoll_uring::IoBuf; pub struct Writer { dst: W, @@ -35,11 +38,11 @@ where W: OwnedAsyncWriter, { #[inline(always)] - async fn write_all, Buf: IoBuf + Send>( + async fn write_all( &mut self, - buf: B, + buf: FullSlice, ctx: &RequestContext, - ) -> std::io::Result<(usize, B::Buf)> { + ) -> std::io::Result<(usize, FullSlice)> { let (nwritten, buf) = self.dst.write_all(buf, ctx).await?; self.bytes_amount += u64::try_from(nwritten).unwrap(); Ok((nwritten, buf)) diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index 8599d95cdf..f8f37b17e3 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -1,16 +1,18 @@ use bytes::BytesMut; -use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; +use tokio_epoll_uring::IoBuf; use crate::context::RequestContext; +use super::io_buf_ext::{FullSlice, IoBufExt}; + /// A trait for doing owned-buffer write IO. /// Think [`tokio::io::AsyncWrite`] but with owned buffers. pub trait OwnedAsyncWriter { - async fn write_all, Buf: IoBuf + Send>( + async fn write_all( &mut self, - buf: B, + buf: FullSlice, ctx: &RequestContext, - ) -> std::io::Result<(usize, B::Buf)>; + ) -> std::io::Result<(usize, FullSlice)>; } /// A wrapper aorund an [`OwnedAsyncWriter`] that uses a [`Buffer`] to batch @@ -79,9 +81,11 @@ where #[cfg_attr(target_os = "macos", allow(dead_code))] pub async fn write_buffered( &mut self, - chunk: Slice, + chunk: FullSlice, ctx: &RequestContext, - ) -> std::io::Result<(usize, S)> { + ) -> std::io::Result<(usize, FullSlice)> { + let chunk = chunk.into_raw_slice(); + let chunk_len = chunk.len(); // avoid memcpy for the middle of the chunk if chunk.len() >= self.buf().cap() { @@ -94,7 +98,10 @@ where .pending(), 0 ); - let (nwritten, chunk) = self.writer.write_all(chunk, ctx).await?; + let (nwritten, chunk) = self + .writer + .write_all(FullSlice::must_new(chunk), ctx) + .await?; assert_eq!(nwritten, chunk_len); return Ok((nwritten, chunk)); } @@ -114,7 +121,7 @@ where } } assert!(slice.is_empty(), "by now we should have drained the chunk"); - Ok((chunk_len, chunk.into_inner())) + Ok((chunk_len, FullSlice::must_new(chunk))) } /// Strictly less performant variant of [`Self::write_buffered`] that allows writing borrowed data. 
@@ -150,9 +157,12 @@ where self.buf = Some(buf); return Ok(()); } - let (nwritten, io_buf) = self.writer.write_all(buf.flush(), ctx).await?; + let slice = buf.flush(); + let (nwritten, slice) = self.writer.write_all(slice, ctx).await?; assert_eq!(nwritten, buf_len); - self.buf = Some(Buffer::reuse_after_flush(io_buf)); + self.buf = Some(Buffer::reuse_after_flush( + slice.into_raw_slice().into_inner(), + )); Ok(()) } } @@ -172,9 +182,9 @@ pub trait Buffer { /// Number of bytes in the buffer. fn pending(&self) -> usize; - /// Turns `self` into a [`tokio_epoll_uring::Slice`] of the pending data + /// Turns `self` into a [`FullSlice`] of the pending data /// so we can use [`tokio_epoll_uring`] to write it to disk. - fn flush(self) -> Slice; + fn flush(self) -> FullSlice; /// After the write to disk is done and we have gotten back the slice, /// [`BufferedWriter`] uses this method to re-use the io buffer. @@ -198,12 +208,8 @@ impl Buffer for BytesMut { self.len() } - fn flush(self) -> Slice { - if self.is_empty() { - return self.slice_full(); - } - let len = self.len(); - self.slice(0..len) + fn flush(self) -> FullSlice { + self.slice_len() } fn reuse_after_flush(mut iobuf: BytesMut) -> Self { @@ -213,18 +219,13 @@ impl Buffer for BytesMut { } impl OwnedAsyncWriter for Vec { - async fn write_all, Buf: IoBuf + Send>( + async fn write_all( &mut self, - buf: B, + buf: FullSlice, _: &RequestContext, - ) -> std::io::Result<(usize, B::Buf)> { - let nbytes = buf.bytes_init(); - if nbytes == 0 { - return Ok((0, Slice::into_inner(buf.slice_full()))); - } - let buf = buf.slice(0..nbytes); + ) -> std::io::Result<(usize, FullSlice)> { self.extend_from_slice(&buf[..]); - Ok((buf.len(), Slice::into_inner(buf))) + Ok((buf.len(), buf)) } } @@ -241,19 +242,13 @@ mod tests { writes: Vec>, } impl OwnedAsyncWriter for RecorderWriter { - async fn write_all, Buf: IoBuf + Send>( + async fn write_all( &mut self, - buf: B, + buf: FullSlice, _: &RequestContext, - ) -> std::io::Result<(usize, B::Buf)> { - let nbytes = buf.bytes_init(); - if nbytes == 0 { - self.writes.push(vec![]); - return Ok((0, Slice::into_inner(buf.slice_full()))); - } - let buf = buf.slice(0..nbytes); + ) -> std::io::Result<(usize, FullSlice)> { self.writes.push(Vec::from(&buf[..])); - Ok((buf.len(), Slice::into_inner(buf))) + Ok((buf.len(), buf)) } } @@ -264,7 +259,7 @@ mod tests { macro_rules! write { ($writer:ident, $data:literal) => {{ $writer - .write_buffered(::bytes::Bytes::from_static($data).slice_full(), &test_ctx()) + .write_buffered(::bytes::Bytes::from_static($data).slice_len(), &test_ctx()) .await?; }}; } From fef77b0cc981f71238e1117d392ea55ec867e61f Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 15 Aug 2024 09:02:33 +0100 Subject: [PATCH 1425/1571] safekeeper: consider partial uploads when pulling timeline (#8628) ## Problem The control file contains the id of the safekeeper that uploaded it. Previously, when sending a snapshot of the control file to another sk, it would eventually be gc-ed by the receiving sk. This is incorrect because the original sk might still need it later. ## Summary of Changes When sending a snapshot and the control file contains an uploaded segment: * Create a copy of the segment in s3 with the destination sk in the object name * Tweak the streamed control file to point to the object create in the previous step Note that the snapshot endpoint now has to know the id of the requestor, so the api has been extended to include the node if of the destination sk. 
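For illustration only, a minimal sketch of the renaming step described above: the helper name `rename_partial_for_destination` and the exact object-name layout are assumptions made for this example, not code from this patch; the real logic is `State::replace_uploaded_segment` in `wal_backup_partial.rs` further below.

```rust
// Illustration only: rename an uploaded partial-segment object so the copy
// belongs to the destination safekeeper. The object-name layout
// ("..._sk{node_id}.partial") is an assumption for this sketch.
fn rename_partial_for_destination(name: &str, source: u64, destination: u64) -> Option<String> {
    // Only rewrite names that actually belong to the source safekeeper.
    let suffix = format!("sk{source}.partial");
    if !name.ends_with(suffix.as_str()) {
        return None;
    }
    Some(name.replace(&format!("_sk{source}"), &format!("_sk{destination}")))
}

fn main() {
    // The copy is uploaded under the new name and the streamed control file is
    // patched to reference it; the original object remains owned by the source.
    let renamed = rename_partial_for_destination("000000010000000000000002_1_sk1.partial", 1, 3);
    assert_eq!(
        renamed.as_deref(),
        Some("000000010000000000000002_1_sk3.partial")
    );
}
```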
Closes https://github.com/neondatabase/neon/issues/8542 --- safekeeper/src/control_file.rs | 42 +++--- safekeeper/src/http/client.rs | 7 +- safekeeper/src/http/routes.rs | 11 +- safekeeper/src/pull_timeline.rs | 64 +++++++--- safekeeper/src/wal_backup.rs | 10 ++ safekeeper/src/wal_backup_partial.rs | 57 ++++++++- test_runner/fixtures/neon_fixtures.py | 23 +++- test_runner/regress/test_wal_acceptor.py | 155 ++++++++++++++++++++++- 8 files changed, 327 insertions(+), 42 deletions(-) diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index d574bb438f..c551cd3122 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -164,6 +164,30 @@ impl Deref for FileStorage { } } +impl TimelinePersistentState { + pub(crate) fn write_to_buf(&self) -> Result> { + let mut buf: Vec = Vec::new(); + WriteBytesExt::write_u32::(&mut buf, SK_MAGIC)?; + + if self.eviction_state == EvictionState::Present { + // temp hack for forward compatibility + const PREV_FORMAT_VERSION: u32 = 8; + let prev = downgrade_v9_to_v8(self); + WriteBytesExt::write_u32::(&mut buf, PREV_FORMAT_VERSION)?; + prev.ser_into(&mut buf)?; + } else { + // otherwise, we write the current format version + WriteBytesExt::write_u32::(&mut buf, SK_FORMAT_VERSION)?; + self.ser_into(&mut buf)?; + } + + // calculate checksum before resize + let checksum = crc32c::crc32c(&buf); + buf.extend_from_slice(&checksum.to_le_bytes()); + Ok(buf) + } +} + #[async_trait::async_trait] impl Storage for FileStorage { /// Persists state durably to the underlying storage. @@ -180,24 +204,8 @@ impl Storage for FileStorage { &control_partial_path ) })?; - let mut buf: Vec = Vec::new(); - WriteBytesExt::write_u32::(&mut buf, SK_MAGIC)?; - if s.eviction_state == EvictionState::Present { - // temp hack for forward compatibility - const PREV_FORMAT_VERSION: u32 = 8; - let prev = downgrade_v9_to_v8(s); - WriteBytesExt::write_u32::(&mut buf, PREV_FORMAT_VERSION)?; - prev.ser_into(&mut buf)?; - } else { - // otherwise, we write the current format version - WriteBytesExt::write_u32::(&mut buf, SK_FORMAT_VERSION)?; - s.ser_into(&mut buf)?; - } - - // calculate checksum before resize - let checksum = crc32c::crc32c(&buf); - buf.extend_from_slice(&checksum.to_le_bytes()); + let buf: Vec = s.write_to_buf()?; control_partial.write_all(&buf).await.with_context(|| { format!( diff --git a/safekeeper/src/http/client.rs b/safekeeper/src/http/client.rs index 0bb31c200d..c56f7880d4 100644 --- a/safekeeper/src/http/client.rs +++ b/safekeeper/src/http/client.rs @@ -10,7 +10,7 @@ use reqwest::{IntoUrl, Method, StatusCode}; use utils::{ http::error::HttpErrorBody, - id::{TenantId, TimelineId}, + id::{NodeId, TenantId, TimelineId}, logging::SecretString, }; @@ -97,10 +97,11 @@ impl Client { &self, tenant_id: TenantId, timeline_id: TimelineId, + stream_to: NodeId, ) -> Result { let uri = format!( - "{}/v1/tenant/{}/timeline/{}/snapshot", - self.mgmt_api_endpoint, tenant_id, timeline_id + "{}/v1/tenant/{}/timeline/{}/snapshot/{}", + self.mgmt_api_endpoint, tenant_id, timeline_id, stream_to.0 ); self.get(&uri).await } diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index fe6d325cee..c9defb0bcf 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -205,6 +205,7 @@ async fn timeline_pull_handler(mut request: Request) -> Result) -> Result, ApiError> { + let destination = parse_request_param(&request, "destination_id")?; let ttid = TenantTimelineId::new( parse_request_param(&request, 
"tenant_id")?, parse_request_param(&request, "timeline_id")?, @@ -225,7 +226,13 @@ async fn timeline_snapshot_handler(request: Request) -> Result RouterBuilder request_span(r, tenant_delete_handler) }) .get( - "/v1/tenant/:tenant_id/timeline/:timeline_id/snapshot", + "/v1/tenant/:tenant_id/timeline/:timeline_id/snapshot/:destination_id", |r| request_span(r, timeline_snapshot_handler), ) .post("/v1/pull_timeline", |r| { diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index 618c6b278f..1eacec9981 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -11,13 +11,8 @@ use std::{ io::{self, ErrorKind}, sync::Arc, }; -use tokio::{ - fs::{File, OpenOptions}, - io::AsyncWrite, - sync::mpsc, - task, -}; -use tokio_tar::{Archive, Builder}; +use tokio::{fs::OpenOptions, io::AsyncWrite, sync::mpsc, task}; +use tokio_tar::{Archive, Builder, Header}; use tokio_util::{ io::{CopyToBytes, SinkWriter}, sync::PollSender, @@ -32,13 +27,15 @@ use crate::{ routes::TimelineStatus, }, safekeeper::Term, + state::TimelinePersistentState, timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError, WalResidentTimeline}, + wal_backup, wal_storage::{self, open_wal_file, Storage}, GlobalTimelines, SafeKeeperConf, }; use utils::{ crashsafe::{durable_rename, fsync_async_opt}, - id::{TenantId, TenantTimelineId, TimelineId}, + id::{NodeId, TenantId, TenantTimelineId, TimelineId}, logging::SecretString, lsn::Lsn, pausable_failpoint, @@ -46,8 +43,13 @@ use utils::{ /// Stream tar archive of timeline to tx. #[instrument(name = "snapshot", skip_all, fields(ttid = %tli.ttid))] -pub async fn stream_snapshot(tli: WalResidentTimeline, tx: mpsc::Sender>) { - if let Err(e) = stream_snapshot_guts(tli, tx.clone()).await { +pub async fn stream_snapshot( + tli: WalResidentTimeline, + source: NodeId, + destination: NodeId, + tx: mpsc::Sender>, +) { + if let Err(e) = stream_snapshot_guts(tli, source, destination, tx.clone()).await { // Error type/contents don't matter as they won't can't reach the client // (hyper likely doesn't do anything with it), but http stream will be // prematurely terminated. It would be nice to try to send the error in @@ -81,6 +83,8 @@ impl Drop for SnapshotContext { pub async fn stream_snapshot_guts( tli: WalResidentTimeline, + source: NodeId, + destination: NodeId, tx: mpsc::Sender>, ) -> Result<()> { // tokio-tar wants Write implementor, but we have mpsc tx >; @@ -104,7 +108,7 @@ pub async fn stream_snapshot_guts( // which is also likely suboptimal. let mut ar = Builder::new_non_terminated(pinned_writer); - let bctx = tli.start_snapshot(&mut ar).await?; + let bctx = tli.start_snapshot(&mut ar, source, destination).await?; pausable_failpoint!("sk-snapshot-after-list-pausable"); let tli_dir = tli.get_timeline_dir(); @@ -158,13 +162,43 @@ impl WalResidentTimeline { async fn start_snapshot( &self, ar: &mut tokio_tar::Builder, + source: NodeId, + destination: NodeId, ) -> Result { let mut shared_state = self.write_shared_state().await; let wal_seg_size = shared_state.get_wal_seg_size(); - let cf_path = self.get_timeline_dir().join(CONTROL_FILE_NAME); - let mut cf = File::open(cf_path).await?; - ar.append_file(CONTROL_FILE_NAME, &mut cf).await?; + let mut control_store = TimelinePersistentState::clone(shared_state.sk.state()); + // Modify the partial segment of the in-memory copy for the control file to + // point to the destination safekeeper. 
+ let replace = control_store + .partial_backup + .replace_uploaded_segment(source, destination)?; + + if let Some(replace) = replace { + // The deserialized control file has an uploaded partial. We upload a copy + // of it to object storage for the destination safekeeper and send an updated + // control file in the snapshot. + tracing::info!( + "Replacing uploaded partial segment in in-mem control file: {replace:?}" + ); + + let remote_timeline_path = wal_backup::remote_timeline_path(&self.tli.ttid)?; + wal_backup::copy_partial_segment( + &replace.previous.remote_path(&remote_timeline_path), + &replace.current.remote_path(&remote_timeline_path), + ) + .await?; + } + + let buf = control_store + .write_to_buf() + .with_context(|| "failed to serialize control store")?; + let mut header = Header::new_gnu(); + header.set_size(buf.len().try_into().expect("never breaches u64")); + ar.append_data(&mut header, CONTROL_FILE_NAME, buf.as_slice()) + .await + .with_context(|| "failed to append to archive")?; // We need to stream since the oldest segment someone (s3 or pageserver) // still needs. This duplicates calc_horizon_lsn logic. @@ -342,7 +376,7 @@ async fn pull_timeline( let client = Client::new(host.clone(), sk_auth_token.clone()); // Request stream with basebackup archive. let bb_resp = client - .snapshot(status.tenant_id, status.timeline_id) + .snapshot(status.tenant_id, status.timeline_id, conf.my_id) .await?; // Make Stream of Bytes from it... diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 234273e133..aa1a6696a1 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -483,6 +483,16 @@ pub(crate) async fn backup_partial_segment( .await } +pub(crate) async fn copy_partial_segment( + source: &RemotePath, + destination: &RemotePath, +) -> Result<()> { + let storage = get_configured_remote_storage(); + let cancel = CancellationToken::new(); + + storage.copy_object(source, destination, &cancel).await +} + pub async fn read_object( file_path: &RemotePath, offset: u64, diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index 52765b0e98..675a051887 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -17,14 +17,13 @@ //! file. Code updates state in the control file before doing any S3 operations. //! This way control file stores information about all potentially existing //! remote partial segments and can clean them up after uploading a newer version. - use camino::Utf8PathBuf; use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; use remote_storage::RemotePath; use serde::{Deserialize, Serialize}; use tracing::{debug, error, info, instrument, warn}; -use utils::lsn::Lsn; +use utils::{id::NodeId, lsn::Lsn}; use crate::{ metrics::{MISC_OPERATION_SECONDS, PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS}, @@ -82,6 +81,12 @@ pub struct State { pub segments: Vec, } +#[derive(Debug)] +pub(crate) struct ReplaceUploadedSegment { + pub(crate) previous: PartialRemoteSegment, + pub(crate) current: PartialRemoteSegment, +} + impl State { /// Find an Uploaded segment. There should be only one Uploaded segment at a time. pub(crate) fn uploaded_segment(&self) -> Option { @@ -90,6 +95,54 @@ impl State { .find(|seg| seg.status == UploadStatus::Uploaded) .cloned() } + + /// Replace the name of the Uploaded segment (if one exists) in order to match + /// it with `destination` safekeeper. Returns a description of the change or None + /// wrapped in anyhow::Result. 
+ pub(crate) fn replace_uploaded_segment( + &mut self, + source: NodeId, + destination: NodeId, + ) -> anyhow::Result> { + let current = self + .segments + .iter_mut() + .find(|seg| seg.status == UploadStatus::Uploaded); + + let current = match current { + Some(some) => some, + None => { + return anyhow::Ok(None); + } + }; + + // Sanity check that the partial segment we are replacing is belongs + // to the `source` SK. + if !current + .name + .ends_with(format!("sk{}.partial", source.0).as_str()) + { + anyhow::bail!( + "Partial segment name ({}) doesn't match self node id ({})", + current.name, + source + ); + } + + let previous = current.clone(); + + let new_name = current.name.replace( + format!("_sk{}", source.0).as_str(), + format!("_sk{}", destination.0).as_str(), + ); + + current.name = new_name; + + anyhow::Ok(Some(ReplaceUploadedSegment { + previous, + current: current.clone(), + })) + } } struct PartialBackup { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index aaa1f21997..b76432127d 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -67,6 +67,7 @@ from fixtures.pageserver.utils import ( from fixtures.pg_version import PgVersion from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import ( + LocalFsStorage, MockS3Server, RemoteStorage, RemoteStorageKind, @@ -4425,14 +4426,32 @@ class Safekeeper(LogUtils): def timeline_dir(self, tenant_id, timeline_id) -> Path: return self.data_dir / str(tenant_id) / str(timeline_id) + def list_uploaded_segments(self, tenant_id: TenantId, timeline_id: TimelineId): + tline_path = ( + self.env.repo_dir + / "local_fs_remote_storage" + / "safekeeper" + / str(tenant_id) + / str(timeline_id) + ) + assert isinstance(self.env.safekeepers_remote_storage, LocalFsStorage) + return self._list_segments_in_dir( + tline_path, lambda name: ".metadata" not in name and ".___temp" not in name + ) + def list_segments(self, tenant_id, timeline_id) -> List[str]: """ Get list of segment names of the given timeline. 
""" tli_dir = self.timeline_dir(tenant_id, timeline_id) + return self._list_segments_in_dir( + tli_dir, lambda name: not name.startswith("safekeeper.control") + ) + + def _list_segments_in_dir(self, path: Path, keep_filter: Callable[[str], bool]) -> list[str]: segments = [] - for _, _, filenames in os.walk(tli_dir): - segments.extend([f for f in filenames if not f.startswith("safekeeper.control")]) + for _, _, filenames in os.walk(path): + segments.extend([f for f in filenames if keep_filter(f)]) segments.sort() return segments diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index bf7829fc84..5d3b263936 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -49,7 +49,13 @@ from fixtures.remote_storage import ( ) from fixtures.safekeeper.http import SafekeeperHttpClient from fixtures.safekeeper.utils import are_walreceivers_absent -from fixtures.utils import PropagatingThread, get_dir_size, query_scalar, start_in_background +from fixtures.utils import ( + PropagatingThread, + get_dir_size, + query_scalar, + start_in_background, + wait_until, +) def wait_lsn_force_checkpoint( @@ -63,6 +69,18 @@ def wait_lsn_force_checkpoint( lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) log.info(f"pg_current_wal_flush_lsn is {lsn}, waiting for it on pageserver") + wait_lsn_force_checkpoint_at(lsn, tenant_id, timeline_id, ps, pageserver_conn_options) + + +def wait_lsn_force_checkpoint_at( + lsn: Lsn, + tenant_id: TenantId, + timeline_id: TimelineId, + ps: NeonPageserver, + pageserver_conn_options=None, +): + pageserver_conn_options = pageserver_conn_options or {} + auth_token = None if "password" in pageserver_conn_options: auth_token = pageserver_conn_options["password"] @@ -2304,3 +2322,138 @@ def test_s3_eviction( ) assert event_metrics_seen + + +def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilder): + """ + Verify that pulling timeline from a SK with an uploaded partial segment + does not lead to consistency issues: + 1. Start 3 SKs - only use two + 2. Ingest a bit of WAL + 3. Wait for partial to be uploaded + 4. Pull timeline to the third SK + 6. Replace source with destination SK and start compute + 5. Wait for source SK to evict timeline + 6. 
Go back to initial compute SK config and validate that + source SK can unevict the timeline (S3 state is consistent) + """ + neon_env_builder.auth_enabled = True + neon_env_builder.num_safekeepers = 3 + neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage()) + + neon_env_builder.safekeeper_extra_opts = [ + "--enable-offload", + "--delete-offloaded-wal", + "--partial-backup-timeout", + "500ms", + "--control-file-save-interval", + "500ms", + "--eviction-min-resident=500ms", + ] + + env = neon_env_builder.init_start(initial_tenant_conf={"checkpoint_timeout": "100ms"}) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + (src_sk, dst_sk) = (env.safekeepers[0], env.safekeepers[2]) + + log.info("use only first 2 safekeepers, 3rd will be seeded") + endpoint = env.endpoints.create("main") + endpoint.active_safekeepers = [1, 2] + endpoint.start() + endpoint.safe_psql("create table t(key int, value text)") + endpoint.safe_psql("insert into t select generate_series(1, 180000), 'papaya'") + + endpoint.stop() + + def source_partial_segment_uploaded(): + first_segment_name = "000000010000000000000001" + segs = src_sk.list_uploaded_segments(tenant_id, timeline_id) + + candidate_seg = None + for seg in segs: + if "partial" in seg and "sk1" in seg and not seg.startswith(first_segment_name): + candidate_seg = seg + + if candidate_seg is not None: + # The term might change, causing the segment to be gc-ed shortly after, + # so give it a bit of time to make sure it's stable. + time.sleep(2) + + segs = src_sk.list_uploaded_segments(tenant_id, timeline_id) + assert candidate_seg in segs + return candidate_seg + + raise Exception("Partial segment not uploaded yet") + + source_partial_segment = wait_until(15, 1, source_partial_segment_uploaded) + log.info( + f"Uploaded segments before pull are {src_sk.list_uploaded_segments(tenant_id, timeline_id)}" + ) + log.info(f"Tracking source partial segment: {source_partial_segment}") + + src_flush_lsn = src_sk.get_flush_lsn(tenant_id, timeline_id) + log.info(f"flush_lsn on src before pull_timeline: {src_flush_lsn}") + + pageserver_conn_options = {"password": env.auth_keys.generate_tenant_token(tenant_id)} + wait_lsn_force_checkpoint_at( + src_flush_lsn, tenant_id, timeline_id, env.pageserver, pageserver_conn_options + ) + + dst_sk.pull_timeline([src_sk], tenant_id, timeline_id) + + def evicted(): + evictions = src_sk.http_client().get_metric_value( + "safekeeper_eviction_events_completed_total", {"kind": "evict"} + ) + + if evictions is None or evictions == 0: + raise Exception("Eviction did not happen on source safekeeper yet") + + wait_until(30, 1, evicted) + + endpoint.start(safekeepers=[2, 3]) + + def new_partial_segment_uploaded(): + segs = src_sk.list_uploaded_segments(tenant_id, timeline_id) + for seg in segs: + if "partial" in seg and "sk3" in seg: + return seg + + raise Exception("Partial segment not uploaded yet") + + log.info( + f"Uploaded segments before post-pull ingest are {src_sk.list_uploaded_segments(tenant_id, timeline_id)}" + ) + + endpoint.safe_psql("insert into t select generate_series(1, 1000), 'pear'") + wait_until(15, 1, new_partial_segment_uploaded) + + log.info( + f"Uploaded segments after post-pull ingest are {src_sk.list_uploaded_segments(tenant_id, timeline_id)}" + ) + + # Allow for some gc iterations to happen and assert that the original + # uploaded partial segment remains in place. 
+ time.sleep(5) + segs = src_sk.list_uploaded_segments(tenant_id, timeline_id) + assert source_partial_segment in segs + + log.info( + f"Uploaded segments at the end are {src_sk.list_uploaded_segments(tenant_id, timeline_id)}" + ) + + # Restart the endpoint in order to check that the source safekeeper + # can unevict the timeline + endpoint.stop() + endpoint.start(safekeepers=[1, 2]) + + def unevicted(): + unevictions = src_sk.http_client().get_metric_value( + "safekeeper_eviction_events_completed_total", {"kind": "restore"} + ) + + if unevictions is None or unevictions == 0: + raise Exception("Uneviction did not happen on source safekeeper yet") + + wait_until(10, 1, unevicted) From a9c28be7d02226032f153edf6c7b527aec9fa5db Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 15 Aug 2024 10:06:28 +0100 Subject: [PATCH 1426/1571] fix(pageserver): allow unused_imports in download.rs on macOS (#8733) ## Problem On macOS, clippy fails with the following error: ``` error: unused import: `crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt` --> pageserver/src/tenant/remote_timeline_client/download.rs:26:5 | 26 | use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | = note: `-D unused-imports` implied by `-D warnings` = help: to override `-D warnings` add `#[allow(unused_imports)]` ``` Introduced in https://github.com/neondatabase/neon/pull/8717 ## Summary of changes - allow `unused_imports` for `crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt` on macOS in download.rs --- pageserver/src/tenant/remote_timeline_client/download.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 8199218c3c..d9725ad756 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -23,6 +23,7 @@ use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; use crate::tenant::storage_layer::LayerName; use crate::tenant::Generation; +#[cfg_attr(target_os = "macos", allow(unused_imports))] use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}; use crate::TEMP_FILE_SUFFIX; From d9a57aeed9ca9b0e2134e7183355d52fb6a089d1 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 15 Aug 2024 12:54:05 +0300 Subject: [PATCH 1427/1571] storcon: deny external node configuration if an operation is ongoing (#8727) Per #8674, disallow node configuration while drain/fill are ongoing. Implement it by adding a only-http wrapper `Service::external_node_configure` which checks for operation existing before configuring. 
Additionally: - allow cancelling drain/fill after a pageserver has restarted and transitioned to WarmingUp Fixes: #8674 --- libs/pageserver_api/src/controller_api.rs | 3 -- storage_controller/src/http.rs | 2 +- storage_controller/src/service.rs | 42 +++++++++++------- .../regress/test_storage_controller.py | 44 +++++++++++++++++++ 4 files changed, 70 insertions(+), 21 deletions(-) diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index a5b452da83..a50707a1b8 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -313,20 +313,17 @@ pub struct MetadataHealthUpdateRequest { pub struct MetadataHealthUpdateResponse {} #[derive(Serialize, Deserialize, Debug)] - pub struct MetadataHealthListUnhealthyResponse { pub unhealthy_tenant_shards: Vec, } #[derive(Serialize, Deserialize, Debug)] - pub struct MetadataHealthListOutdatedRequest { #[serde(with = "humantime_serde")] pub not_scrubbed_for: Duration, } #[derive(Serialize, Deserialize, Debug)] - pub struct MetadataHealthListOutdatedResponse { pub health_records: Vec, } diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index e8513b31eb..e755aaed19 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -500,7 +500,7 @@ async fn handle_node_configure(mut req: Request) -> Result, StatusCode::OK, state .service - .node_configure( + .external_node_configure( config_req.node_id, config_req.availability.map(NodeAvailability::from), config_req.scheduling, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index ef4cd91efd..d717924ae6 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -4912,6 +4912,26 @@ impl Service { Ok(()) } + /// Wrapper around [`Self::node_configure`] which only allows changes while there is no ongoing + /// operation for HTTP api. 
+ pub(crate) async fn external_node_configure( + &self, + node_id: NodeId, + availability: Option, + scheduling: Option, + ) -> Result<(), ApiError> { + { + let locked = self.inner.read().unwrap(); + if let Some(op) = locked.ongoing_operation.as_ref().map(|op| op.operation) { + return Err(ApiError::PreconditionFailed( + format!("Ongoing background operation forbids configuring: {op}").into(), + )); + } + } + + self.node_configure(node_id, availability, scheduling).await + } + pub(crate) async fn start_node_drain( self: &Arc, node_id: NodeId, @@ -5017,14 +5037,14 @@ impl Service { } pub(crate) async fn cancel_node_drain(&self, node_id: NodeId) -> Result<(), ApiError> { - let (node_available, node_policy) = { + let node_available = { let locked = self.inner.read().unwrap(); let nodes = &locked.nodes; let node = nodes.get(&node_id).ok_or(ApiError::NotFound( anyhow::anyhow!("Node {} not registered", node_id).into(), ))?; - (node.is_available(), node.get_scheduling()) + node.is_available() }; if !node_available { @@ -5033,12 +5053,6 @@ impl Service { )); } - if !matches!(node_policy, NodeSchedulingPolicy::Draining) { - return Err(ApiError::PreconditionFailed( - format!("Node {node_id} has no drain in progress").into(), - )); - } - if let Some(op_handler) = self.inner.read().unwrap().ongoing_operation.as_ref() { if let Operation::Drain(drain) = op_handler.operation { if drain.node_id == node_id { @@ -5152,14 +5166,14 @@ impl Service { } pub(crate) async fn cancel_node_fill(&self, node_id: NodeId) -> Result<(), ApiError> { - let (node_available, node_policy) = { + let node_available = { let locked = self.inner.read().unwrap(); let nodes = &locked.nodes; let node = nodes.get(&node_id).ok_or(ApiError::NotFound( anyhow::anyhow!("Node {} not registered", node_id).into(), ))?; - (node.is_available(), node.get_scheduling()) + node.is_available() }; if !node_available { @@ -5168,12 +5182,6 @@ impl Service { )); } - if !matches!(node_policy, NodeSchedulingPolicy::Filling) { - return Err(ApiError::PreconditionFailed( - format!("Node {node_id} has no fill in progress").into(), - )); - } - if let Some(op_handler) = self.inner.read().unwrap().ongoing_operation.as_ref() { if let Operation::Fill(fill) = op_handler.operation { if fill.node_id == node_id { @@ -5982,7 +5990,7 @@ impl Service { .await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT) .await; - failpoint_support::sleep_millis_async!("sleepy-drain-loop"); + failpoint_support::sleep_millis_async!("sleepy-drain-loop", &cancel); } while !waiters.is_empty() { diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 9b2557a165..7d98ff2923 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -2091,3 +2091,47 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder): ) == 0 ) + + +def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvBuilder): + # single unsharded tenant, two locations + neon_env_builder.num_pageservers = 2 + + env = neon_env_builder.init_start() + + env.storage_controller.tenant_policy_update(env.initial_tenant, {"placement": {"Attached": 1}}) + env.storage_controller.reconcile_until_idle() + + attached_id = int(env.storage_controller.locate(env.initial_tenant)[0]["node_id"]) + attached = next((ps for ps in env.pageservers if ps.id == attached_id)) + + def attached_is_draining(): + details = env.storage_controller.node_status(attached.id) + assert details["scheduling"] == 
"Draining" + + env.storage_controller.configure_failpoints(("sleepy-drain-loop", "return(10000)")) + env.storage_controller.node_drain(attached.id) + + wait_until(10, 0.5, attached_is_draining) + + attached.restart() + + # we are unable to reconfigure node while the operation is still ongoing + with pytest.raises( + StorageControllerApiException, + match="Precondition failed: Ongoing background operation forbids configuring: drain.*", + ): + env.storage_controller.node_configure(attached.id, {"scheduling": "Pause"}) + with pytest.raises( + StorageControllerApiException, + match="Precondition failed: Ongoing background operation forbids configuring: drain.*", + ): + env.storage_controller.node_configure(attached.id, {"availability": "Offline"}) + + env.storage_controller.cancel_node_drain(attached.id) + + def reconfigure_node_again(): + env.storage_controller.node_configure(attached.id, {"scheduling": "Pause"}) + + # allow for small delay between actually having cancelled and being able reconfigure again + wait_until(4, 0.5, reconfigure_node_again) From 52641eb8533ec0bdd70523f2595a0265c9208dc7 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 15 Aug 2024 15:30:04 +0300 Subject: [PATCH 1428/1571] storcon: add spans to drain/fill ops (#8735) this way we do not need to repeat the %node_id everywhere, and we get no stray messages in logs from within the op. --- storage_controller/src/service.rs | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index d717924ae6..84db088a42 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -4989,6 +4989,8 @@ impl Service { cancel: cancel.clone(), }); + let span = tracing::info_span!(parent: None, "drain_node", %node_id); + tokio::task::spawn({ let service = self.clone(); let cancel = cancel.clone(); @@ -5005,21 +5007,21 @@ impl Service { } } - tracing::info!(%node_id, "Drain background operation starting"); + tracing::info!("Drain background operation starting"); let res = service.drain_node(node_id, cancel).await; match res { Ok(()) => { - tracing::info!(%node_id, "Drain background operation completed successfully"); + tracing::info!("Drain background operation completed successfully"); } Err(OperationError::Cancelled) => { - tracing::info!(%node_id, "Drain background operation was cancelled"); + tracing::info!("Drain background operation was cancelled"); } Err(err) => { - tracing::error!(%node_id, "Drain background operation encountered: {err}") + tracing::error!("Drain background operation encountered: {err}") } } } - }); + }.instrument(span)); } NodeSchedulingPolicy::Draining => { return Err(ApiError::Conflict(format!( @@ -5118,6 +5120,8 @@ impl Service { cancel: cancel.clone(), }); + let span = tracing::info_span!(parent: None, "fill_node", %node_id); + tokio::task::spawn({ let service = self.clone(); let cancel = cancel.clone(); @@ -5134,21 +5138,21 @@ impl Service { } } - tracing::info!(%node_id, "Fill background operation starting"); + tracing::info!("Fill background operation starting"); let res = service.fill_node(node_id, cancel).await; match res { Ok(()) => { - tracing::info!(%node_id, "Fill background operation completed successfully"); + tracing::info!("Fill background operation completed successfully"); } Err(OperationError::Cancelled) => { - tracing::info!(%node_id, "Fill background operation was cancelled"); + tracing::info!("Fill background operation was cancelled"); } Err(err) => { - 
tracing::error!(%node_id, "Fill background operation encountered: {err}") + tracing::error!("Fill background operation encountered: {err}") } } } - }); + }.instrument(span)); } NodeSchedulingPolicy::Filling => { return Err(ApiError::Conflict(format!( From 24d347f50b15bb8ba44f0b25589e180e6482e1a8 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 15 Aug 2024 16:27:07 +0300 Subject: [PATCH 1429/1571] storcon: use tracing for logging panics (#8734) this gives spans for panics, and does not globber loki output by writing to stderr while all of the other logging is to stdout. See: #3475 --- storage_controller/src/main.rs | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 5a68799141..7387d36690 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -196,14 +196,26 @@ async fn migration_run(database_url: &str) -> anyhow::Result<()> { } fn main() -> anyhow::Result<()> { - let default_panic = std::panic::take_hook(); - std::panic::set_hook(Box::new(move |info| { - default_panic(info); - std::process::exit(1); - })); + logging::init( + LogFormat::Plain, + logging::TracingErrorLayerEnablement::Disabled, + logging::Output::Stdout, + )?; + + // log using tracing so we don't get confused output by default hook writing to stderr + utils::logging::replace_panic_hook_with_tracing_panic_hook().forget(); let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); + let hook = std::panic::take_hook(); + std::panic::set_hook(Box::new(move |info| { + // let sentry send a message (and flush) + // and trace the error + hook(info); + + std::process::exit(1); + })); + tokio::runtime::Builder::new_current_thread() // We use spawn_blocking for database operations, so require approximately // as many blocking threads as we will open database connections. @@ -217,12 +229,6 @@ fn main() -> anyhow::Result<()> { async fn async_main() -> anyhow::Result<()> { let launch_ts = Box::leak(Box::new(LaunchTimestamp::generate())); - logging::init( - LogFormat::Plain, - logging::TracingErrorLayerEnablement::Disabled, - logging::Output::Stdout, - )?; - preinitialize_metrics(); let args = Cli::parse(); From f087423a0111d4fb5ac1e12007447c56b2a1c2a6 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 15 Aug 2024 16:28:25 +0300 Subject: [PATCH 1430/1571] Handle reload config file request in LR monitor (#8732) ## Problem Logical replication BGW checking replication lag is not reloading config ## Summary of changes Add handling of reload config request ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist Co-authored-by: Konstantin Knizhnik --- pgxn/neon/neon.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 784d0f1da3..fe8e276d1c 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -192,6 +192,13 @@ LogicalSlotsMonitorMain(Datum main_arg) { XLogRecPtr cutoff_lsn; + /* In case of a SIGHUP, just reload the configuration. 
*/ + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + } + /* * If there are too many .snap files, just drop all logical slots to * prevent aux files bloat. From 4e58fd93216c5274e49488de161dc9ce12abd82d Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 15 Aug 2024 18:37:15 +0100 Subject: [PATCH 1431/1571] CI(label-for-external-users): use CI_ACCESS_TOKEN (#8738) ## Problem `secrets.GITHUB_TOKEN` (with any permissions) is not enough to get a user's membership info if they decide to hide it. ## Summary of changes - Use `secrets.CI_ACCESS_TOKEN` for `gh api` call - Use `pull_request_target` instead of `pull_request` event to get access to secrets --- .github/workflows/label-for-external-users.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/label-for-external-users.yml b/.github/workflows/label-for-external-users.yml index 7cf5ee254c..585d118dfb 100644 --- a/.github/workflows/label-for-external-users.yml +++ b/.github/workflows/label-for-external-users.yml @@ -4,7 +4,7 @@ on: issues: types: - opened - pull_request: + pull_request_target: types: - opened @@ -25,7 +25,7 @@ jobs: - name: Check whether `${{ github.actor }}` is a member of `${{ github.repository_owner }}` id: check-user env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} run: | if gh api -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" "/orgs/${GITHUB_REPOSITORY_OWNER}/members/${GITHUB_ACTOR}"; then is_member=true @@ -45,10 +45,10 @@ jobs: issues: write # for `gh issue edit` steps: - - name: Label new ${{ github.event_name }} + - name: Add `${{ env.LABEL }}` label env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - ITEM_NUMBER: ${{ github.event[github.event_name == 'pull_request' && 'pull_request' || 'issue'].number }} - GH_CLI_COMMAND: ${{ github.event_name == 'pull_request' && 'pr' || 'issue' }} + ITEM_NUMBER: ${{ github.event[github.event_name == 'pull_request_target' && 'pull_request' || 'issue'].number }} + GH_CLI_COMMAND: ${{ github.event_name == 'pull_request_target' && 'pr' || 'issue' }} run: | gh ${GH_CLI_COMMAND} --repo ${GITHUB_REPOSITORY} edit --add-label=${LABEL} ${ITEM_NUMBER} From 69cb1ee479ecdc99dd117fe4149b59dd54676fea Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 15 Aug 2024 22:41:58 +0100 Subject: [PATCH 1432/1571] CI(replication-tests): store test results & change notification channel (#8687) ## Problem We want to store Nightly Replication test results in the database and notify the relevant Slack channel about failures ## Summary of changes - Store test results in the database - Notify `on-call-compute-staging-stream` about failures --- .github/workflows/benchmarking.yml | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index f99a037489..a4a597acde 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -222,13 +222,20 @@ jobs: id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate + with: + store-test-results-into-db: true + env: + REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} uses: slackapi/slack-github-action@v1 with: - channel-id: "C033QLM5P7D" # dev-staging-stream - slack-message: "Periodic replication testing: ${{ job.status }}\n${{ 
github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + channel-id: "C06T9AMNDQQ" # on-call-compute-staging-stream + slack-message: | + Periodic replication testing: ${{ job.status }} + <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> + <${{ steps.create-allure-report.outputs.report-url }}|Allure report> env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} @@ -330,7 +337,7 @@ jobs: prepare_AWS_RDS_databases: uses: ./.github/workflows/_benchmarking_preparation.yml secrets: inherit - + pgbench-compare: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} needs: [ generate-matrices, prepare_AWS_RDS_databases ] From df086cd139ee5ecc82bf096fc3fc6ee4397ac983 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Thu, 15 Aug 2024 15:34:45 -0700 Subject: [PATCH 1433/1571] Add logical replication test to exercise snapfiles (#8364) --- .../performance/test_logical_replication.py | 82 +++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index 4b4ffc1fee..c4e42a7834 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -262,3 +262,85 @@ def test_publisher_restart( sub_workload.terminate() finally: pub_workload.terminate() + + +@pytest.mark.remote_cluster +@pytest.mark.timeout(2 * 60 * 60) +def test_snap_files( + pg_bin: PgBin, + benchmark_project_pub: NeonApiEndpoint, + zenbenchmark: NeonBenchmarker, +): + """ + Creates a node with a replication slot. Generates pgbench into the replication slot, + then runs pgbench inserts while generating large numbers of snapfiles. Then restarts + the node and tries to peek the replication changes. 
+ """ + test_duration_min = 60 + test_interval_min = 5 + pgbench_duration = f"-T{test_duration_min * 60 * 2}" + + env = benchmark_project_pub.pgbench_env + connstr = benchmark_project_pub.connstr + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=env) + + with psycopg2.connect(connstr) as conn: + conn.autocommit = True + with conn.cursor() as cur: + cur.execute("SELECT rolsuper FROM pg_roles WHERE rolname = 'neondb_owner'") + is_super = cur.fetchall()[0] + assert is_super, "This benchmark won't work if we don't have superuser" + + conn = psycopg2.connect(connstr) + conn.autocommit = True + cur = conn.cursor() + cur.execute("ALTER SYSTEM SET neon.logical_replication_max_snap_files = -1") + + with psycopg2.connect(connstr) as conn: + conn.autocommit = True + with conn.cursor() as cur: + cur.execute("SELECT pg_reload_conf()") + + with psycopg2.connect(connstr) as conn: + conn.autocommit = True + with conn.cursor() as cur: + cur.execute( + """ + DO $$ + BEGIN + IF EXISTS ( + SELECT 1 + FROM pg_replication_slots + WHERE slot_name = 'slotter' + ) THEN + PERFORM pg_drop_replication_slot('slotter'); + END IF; + END $$; + """ + ) + cur.execute("SELECT pg_create_logical_replication_slot('slotter', 'test_decoding')") + + workload = pg_bin.run_nonblocking(["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=env) + try: + start = time.time() + prev_measurement = time.time() + while time.time() - start < test_duration_min * 60: + with psycopg2.connect(connstr) as conn: + with conn.cursor() as cur: + cur.execute( + "SELECT count(*) FROM (SELECT pg_log_standby_snapshot() FROM generate_series(1, 10000) g) s" + ) + check_pgbench_still_running(workload) + cur.execute( + "SELECT pg_replication_slot_advance('slotter', pg_current_wal_lsn())" + ) + + # Measure storage + if time.time() - prev_measurement > test_interval_min * 60: + storage = benchmark_project_pub.get_synthetic_storage_size() + zenbenchmark.record("storage", storage, "B", MetricReport.LOWER_IS_BETTER) + prev_measurement = time.time() + time.sleep(test_interval_min * 60 / 3) + + finally: + workload.terminate() From 4763a960d103a27250eadd6892368ae77a3d66c4 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 16 Aug 2024 08:10:05 +0300 Subject: [PATCH 1434/1571] chore: log if we have an open layer or any frozen on shutdown (#8740) Some benchmarks are failing with a "long" flushing, which might be because there is a queue of in-memory layers, or something else. Add logging to narrow it down. 
Private slack DM ref: https://neondb.slack.com/archives/D049K7HJ9JM/p1723727305238099 --- pageserver/src/tenant/timeline.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b4d908b130..01e77fa1b1 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1645,6 +1645,20 @@ impl Timeline { self.last_record_lsn.shutdown(); if try_freeze_and_flush { + if let Some((open, frozen)) = self + .layers + .read() + .await + .layer_map() + .map(|lm| (lm.open_layer.is_some(), lm.frozen_layers.len())) + .ok() + .filter(|(open, frozen)| *open || *frozen > 0) + { + tracing::info!(?open, frozen, "flushing and freezing on shutdown"); + } else { + // this is double-shutdown, ignore it + } + // we shut down walreceiver above, so, we won't add anything more // to the InMemoryLayer; freeze it and wait for all frozen layers // to reach the disk & upload queue, then shut the upload queue and From 7fdc3ea16296ae7ac6f74ed2843ecee454391276 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 16 Aug 2024 13:30:53 +0300 Subject: [PATCH 1435/1571] Add retroactive RFC about physical replication (#8546) We've had physical replication support for a long time, but we never created an RFC for the feature. This RFC does that after the fact. Even though we've already implemented the feature, let's have a design discussion as if it hadn't done that. It can still be quite insightful. This is written from a pretty compute-centric viewpoint, not much on how it works in the control plane. --- docs/rfcs/036-physical-replication.md | 265 ++++++++++++++++++++++++++ 1 file changed, 265 insertions(+) create mode 100644 docs/rfcs/036-physical-replication.md diff --git a/docs/rfcs/036-physical-replication.md b/docs/rfcs/036-physical-replication.md new file mode 100644 index 0000000000..41aced0545 --- /dev/null +++ b/docs/rfcs/036-physical-replication.md @@ -0,0 +1,265 @@ +# Physical Replication + +This RFC is a bit special in that we have already implemented physical +replication a long time ago. However, we never properly wrote down all +the decisions and assumptions, and in the last months when more users +have started to use the feature, numerous issues have surfaced. + +This RFC documents the design decisions that have been made. + +## Summary + +PostgreSQL has a feature called streaming replication, where a replica +streams WAL from the primary and continuously applies it. It is also +known as "physical replication", to distinguish it from logical +replication. In PostgreSQL, a replica is initialized by taking a +physical backup of the primary. In Neon, the replica is initialized +from a slim "base backup" from the pageserver, just like a primary, +and the primary and the replicas connect to the same pageserver, +sharing the storage. + +There are two kinds of read-only replicas in Neon: +- replicas that follow the primary, and +- "static" replicas that are pinned at a particular LSN. + +A static replica is useful e.g. for performing time-travel queries and +running one-off slow queries without affecting the primary. A replica +that follows the primary can be used e.g. to scale out read-only +workloads. + +## Motivation + +Read-only replicas allow offloading read-only queries. 
It's useful for +isolation, if you want to make sure that read-only queries don't +affect the primary, and it's also an easy way to provide guaranteed +read-only access to an application, without having to mess with access +controls. + +## Non Goals (if relevant) + +This RFC is all about WAL-based *physical* replication. Logical +replication is a different feature. + +Neon also has the capability to launch "static" read-only nodes which +do not follow the primary, but are pinned to a particular LSN. They +can be used for long-running one-off queries, or for Point-in-time +queries. They work similarly to read replicas that follow the primary, +but some things are simpler: there are no concerns about cache +invalidation when the data changes on the primary, or worrying about +transactions that are in-progress on the primary. + +## Impacted components (e.g. pageserver, safekeeper, console, etc) + +- Control plane launches the replica +- Replica Postgres instance connects to the safekeepers, to stream the WAL +- The primary does not know about the standby, except for the hot standby feedback +- The primary and replicas all connect to the same pageservers + + +# Context + +Some useful things to know about hot standby and replicas in +PostgreSQL. + +## PostgreSQL startup sequence + +"Running" and "start up" terms are little imprecise. PostgreSQL +replica startup goes through several stages: + +1. First, the process is started up, and various initialization steps + are performed, like initializing shared memory. If you try to + connect to the server in this stage, you get an error: ERROR: the + database system is starting up. This stage happens very quickly, no + +2. Then the server reads the checpoint record from the WAL and starts + the WAL replay starting from the checkpoint. This works differently + in Neon: we start the WAL replay at the basebackup LSN, not from a + checkpoint! If you connect to the server in this state, you get an + error: ERROR: the database system is not yet accepting + connections. We proceed to the next stage, when the WAL replay sees + a running-xacts record. Or in Neon, the "CLOG scanning" mechanism + can allow us to move directly to next stage, with all the caveats + listed in this RFC. + +3. When the running-xacts information is established, the server + starts to accept connections normally. + +From PostgreSQL's point of view, the server is already running in +stage 2, even though it's not accepting connections yet. Our +`compute_ctl` does not consider it as running until stage 3. If the +transition from stage 2 to 3 doesn't happen fast enough, the control +plane will mark the start operation as failed. + + +## Decisions, Issues + +### Cache invalidation in replica + +When a read replica follows the primary in PostgreSQL, it needs to +stream all the WAL from the primary and apply all the records, to keep +the local copy of the data consistent with the primary. In Neon, the +replica can fetch the updated page versions from the pageserver, so +it's not necessary to apply all the WAL. However, it needs to ensure +that any pages that are currently in the Postgres buffer cache, or the +Local File Cache, are either updated, or thrown away so that the next +read of the page will fetch the latest version. + +We choose to apply the WAL records for pages that are already in the +buffer cache, and skip records for other pages. Somewhat arbitrarily, +we also apply records affecting catalog relations, fetching the old +page version from the pageserver if necessary first. 
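To make the replay rule above concrete, here is a minimal sketch of the decision, with purely illustrative helper names; the real logic lives in the pageserver C extension and operates on buffer tags and WAL record descriptors rather than booleans:

```python
# Illustrative sketch only (not the extension's actual API).
# A replica replays a WAL record for a page only if that page is already cached
# locally, or if the page belongs to a catalog relation; everything else is
# skipped, because the latest version can be re-fetched from the pageserver on
# the next read.
def should_apply_wal_record(page_is_cached: bool, is_catalog_relation: bool) -> bool:
    if page_is_cached:
        return True   # keep the cached copy consistent with the primary
    if is_catalog_relation:
        return True   # catalog pages are replayed too (old version fetched first if needed)
    return False      # skip: a later read fetches the new version from the pageserver
```
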
See +`neon_redo_read_buffer_filter()` function. + +The replica wouldn't necessarily need to see all the WAL records, only +the records that apply to cached pages. For simplicity, we do stream +all the WAL to the replica, and the replica simply ignores WAL records +that require no action. + +Like in PostgreSQL, the read replica maintains a "replay LSN", which +is the LSN up to which the replica has received and replayed the +WAL. The replica can lag behind the primary, if it cannot quite keep +up with the primary, or if a long-running query conflicts with changes +that are about to be applied, or even intentionally if the user wishes +to see delayed data (see recovery_min_apply_delay). It's important +that the replica sees a consistent view of the whole cluster at the +replay LSN, when it's lagging behind. + +In Neon, the replica connects to a safekeeper to get the WAL +stream. That means that the safekeepers must be able to regurgitate +the original WAL as far back as the replay LSN of any running read +replica. (A static read-only node that does not follow the primary +does not require a WAL stream however). The primary does not need to +be running, and when it is, the replicas don't incur any extra +overhead to the primary (see hot standby feedback though). + +### In-progress transactions + +In PostgreSQL, when a hot standby server starts up, it cannot +immediately open up for queries (see [PostgreSQL startup +sequence]). It first needs to establish a complete list of in-progress +transactions, including subtransactions, that are running at the +primary, at the current replay LSN. Normally that happens quickly, +when the replica sees a "running-xacts" WAL record, because the +primary writes a running-xacts WAL record at every checkpoint, and in +PostgreSQL the replica always starts the WAL replay from a checkpoint +REDO point. (A shutdown checkpoint WAL record also implies that all +the non-prepared transactions have ended.) If there are a lot of +subtransactions in progress, however, the standby might need to wait +for old transactions to complete before it can open up for queries. + +In Neon that problem is worse: a replica can start at any LSN, so +there's no guarantee that it will see a running-xacts record any time +soon. In particular, if the primary is not running when the replica is +started, it might never see a running-xacts record. + +To make things worse, we initially missed this issue, and always +started accepting queries at replica startup, even if it didn't have +the transaction information. That could lead to incorrect query +results and data corruption later. However, as we fixed that, we +introduced a new problem compared to what we had before: previously +the replica would always start up, but after fixing that bug, it might +not. In a superficial way, the old behavior was better (but could lead +to serious issues later!). That made fixing that bug was very hard, +because as we fixed it, we made things (superficially) worse for +others. + +See https://github.com/neondatabase/neon/pull/7288 which fixed the +bug, and follow-up PRs https://github.com/neondatabase/neon/pull/8323 +and https://github.com/neondatabase/neon/pull/8484 to try to claw back +the cases that started to cause trouble as fixing it. As of this +writing, there are still cases where a replica might not immediately +start up, causing the control plane operation to fail, the remaining +issues are tracked in https://github.com/neondatabase/neon/issues/6211. 
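For context, the following is a rough client-side sketch of the startup stages described earlier: connection attempts during stages 1-2 are rejected with "the database system is starting up" or "the database system is not yet accepting connections", and only in stage 3 do queries succeed. The function name, connection string handling, and timeout are assumptions for illustration, not code from this repository; the timeout corresponds loosely to the point at which the control plane would mark the start operation as failed.

```python
import time

import psycopg2


def wait_for_replica_ready(connstr: str, timeout_s: float = 30.0) -> None:
    """Poll until the hot-standby compute reaches stage 3 and accepts queries."""
    deadline = time.monotonic() + timeout_s
    while True:
        try:
            conn = psycopg2.connect(connstr)
            try:
                with conn.cursor() as cur:
                    cur.execute("SELECT pg_is_in_recovery()")
                    assert cur.fetchone()[0] is True  # it really is a replica
                return
            finally:
                conn.close()
        except psycopg2.OperationalError as err:
            # Raised while the replica is still in stages 1-2 and refuses connections.
            if time.monotonic() > deadline:
                raise TimeoutError(f"replica did not open for queries: {err}") from err
            time.sleep(0.5)
```
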
+ +One long-term fix for this is to switch to using so-called CSN +snapshots in read replica. That would make it unnecessary to have the +full in-progress transaction list in the replica at startup time. See +https://commitfest.postgresql.org/48/4912/ for a work-in-progress +patch to upstream to implement that. + +Another thing we could do is to teach the control plane about that +distinction between "starting up" and "running but haven't received +running-xacts information yet", so that we could keep the replica +waiting longer in that stage, and also give any client connections the +same `ERROR: the database system is not yet accepting connections` +error that you get in standalone PostgreSQL in that state. + + +### Recovery conflicts and Hot standby feedback + +It's possible that a tuple version is vacuumed away in the primary, +even though it is still needed by a running transactions in the +replica. This is called a "recovery conflict", and PostgreSQL provides +various options for dealing with it. By default, the WAL replay will +wait up to 30 s for the conflicting query to finish. After that, it +will kill the running query, so that the WAL replay can proceed. + +Another way to avoid the situation is to enable the +[`hot_standby_feedback`](https://www.postgresql.org/docs/current/runtime-config-replication.html#GUC-HOT-STANDBY-FEEDBACK) +option. When it is enabled, the primary will refrain from vacuuming +tuples that are still needed in the primary. That means potentially +bloating the primary, which violates the usual rule that read replicas +don't affect the operations on the primary, which is why it's off by +default. We leave it to users to decide if they want to turn it on, +same as PostgreSQL. + +Neon supports `hot_standby_feedback` by passing the feedback messages +from the replica to the safekeepers, and from safekeepers to the +primary. + +### Relationship of settings between primary and replica + +In order to enter hot standby mode, some configuration options need to +be set to the same or larger values in the standby, compared to the +primary. See [explanation in the PostgreSQL +docs](https://www.postgresql.org/docs/current/hot-standby.html#HOT-STANDBY-ADMIN) + +In Neon, we have this problem too. To prevent customers from hitting +it, the control plane automatically adjusts the settings of a replica, +so that they match or exceed the primary's settings (see +https://github.com/neondatabase/cloud/issues/14903). However, you +can still hit the issue if the primary is restarted with larger +settings, while the replica is running. + + +### Interaction with Pageserver GC + +The read replica can lag behind the primary. If there are recovery +conflicts or the replica cannot keep up for some reason, the lag can +in principle grow indefinitely. The replica will issue all GetPage +requests to the pageservers at the current replay LSN, and needs to +see the old page versions. + +If the retention period in the pageserver is set to be small, it may +have already garbage collected away the old page versions. That will +cause read errors in the compute, and can mean that the replica cannot +make progress with the replication anymore. + +There is a mechanism for replica to pass information about its replay +LSN to the pageserver, so that the pageserver refrains from GC'ing +data that is still needed by the standby. It's called +'standby_horizon' in the pageserver code, see +https://github.com/neondatabase/neon/pull/7368. 
A separate "lease" +mechanism also is in the works, where the replica could hold a lease +on the old LSN, preventing the pageserver from advancing the GC +horizon past that point. The difference is that the standby_horizon +mechanism relies on a feedback message from replica to safekeeper, +while the least API is exposed directly from the pageserver. A static +read-only node is not connected to safekeepers, so it cannot use the +standby_horizon mechanism. + + +### Synchronous replication + +We haven't put any effort into synchronous replication yet. + +PostgreSQL provides multiple levels of synchronicity. In the weaker +levels, a transaction is not acknowledged as committed to the client +in the primary until the WAL has been streamed to a replica or flushed +to disk there. Those modes don't make senses in Neon, because the +safekeepers handle durability. + +`synchronous_commit=remote_apply` mode would make sense. In that mode, +the commit is not acknowledged to the client until it has been +replayed in the replica. That ensures that after commit, you can see +the commit in the replica too (aka. read-your-write consistency). From 3f91ea28d997a23b899ef0c3ce237e7ae85f2916 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 16 Aug 2024 13:05:04 +0100 Subject: [PATCH 1436/1571] tests: add infra and test for storcon leadership transfer (#8587) ## Problem https://github.com/neondatabase/neon/pull/8588 implemented the mechanism for storage controller leadership transfers. However, there's no tests that exercise the behaviour. ## Summary of changes 1. Teach `neon_local` how to handle multiple storage controller instances. Each storage controller instance gets its own subdirectory (`storage_controller_1, ...`). `storage_controller start|stop` subcommands have also been extended to optionally accept an instance id. 2. Add a storage controller proxy test fixture. It's a basic HTTP server that forwards requests from pageserver and test env to the currently configured storage controller. 3. Add a test which exercises storage controller leadership transfer. 4. 
Finally fix a couple bugs that the test surfaced --- control_plane/src/background_process.rs | 2 +- control_plane/src/bin/neon_local.rs | 86 +++- control_plane/src/local_env.rs | 37 ++ control_plane/src/storage_controller.rs | 396 ++++++++++++------ storage_controller/src/http.rs | 16 + storage_controller/src/peer_client.rs | 4 +- storage_controller/src/service.rs | 114 ++--- test_runner/conftest.py | 1 + test_runner/fixtures/neon_fixtures.py | 232 +++++++--- .../fixtures/storage_controller_proxy.py | 73 ++++ test_runner/fixtures/utils.py | 2 +- .../regress/test_storage_controller.py | 129 ++++++ 12 files changed, 841 insertions(+), 251 deletions(-) create mode 100644 test_runner/fixtures/storage_controller_proxy.py diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index bf8a27e550..619c5bce3e 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -379,7 +379,7 @@ where } } -fn process_has_stopped(pid: Pid) -> anyhow::Result { +pub(crate) fn process_has_stopped(pid: Pid) -> anyhow::Result { match kill(pid, None) { // Process exists, keep waiting Ok(_) => Ok(false), diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 51e9a51a57..edd88dc71c 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -15,7 +15,9 @@ use control_plane::local_env::{ }; use control_plane::pageserver::PageServerNode; use control_plane::safekeeper::SafekeeperNode; -use control_plane::storage_controller::StorageController; +use control_plane::storage_controller::{ + NeonStorageControllerStartArgs, NeonStorageControllerStopArgs, StorageController, +}; use control_plane::{broker, local_env}; use pageserver_api::config::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT, @@ -1052,6 +1054,36 @@ fn get_start_timeout(args: &ArgMatches) -> &Duration { humantime_duration.as_ref() } +fn storage_controller_start_args(args: &ArgMatches) -> NeonStorageControllerStartArgs { + let maybe_instance_id = args.get_one::("instance-id"); + + let base_port = args.get_one::("base-port"); + + if maybe_instance_id.is_some() && base_port.is_none() { + panic!("storage-controller start specificied instance-id but did not provide base-port"); + } + + let start_timeout = args + .get_one::("start-timeout") + .expect("invalid value for start-timeout"); + + NeonStorageControllerStartArgs { + instance_id: maybe_instance_id.copied().unwrap_or(1), + base_port: base_port.copied(), + start_timeout: *start_timeout, + } +} + +fn storage_controller_stop_args(args: &ArgMatches) -> NeonStorageControllerStopArgs { + let maybe_instance_id = args.get_one::("instance-id"); + let immediate = args.get_one::("stop-mode").map(|s| s.as_str()) == Some("immediate"); + + NeonStorageControllerStopArgs { + instance_id: maybe_instance_id.copied().unwrap_or(1), + immediate, + } +} + async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { match sub_match.subcommand() { Some(("start", subcommand_args)) => { @@ -1113,19 +1145,14 @@ async fn handle_storage_controller( let svc = StorageController::from_env(env); match sub_match.subcommand() { Some(("start", start_match)) => { - if let Err(e) = svc.start(get_start_timeout(start_match)).await { + if let Err(e) = svc.start(storage_controller_start_args(start_match)).await { eprintln!("start failed: {e}"); exit(1); } } Some(("stop", stop_match)) => { - let immediate = stop_match - .get_one::("stop-mode") - .map(|s| 
s.as_str()) - == Some("immediate"); - - if let Err(e) = svc.stop(immediate).await { + if let Err(e) = svc.stop(storage_controller_stop_args(stop_match)).await { eprintln!("stop failed: {}", e); exit(1); } @@ -1228,7 +1255,12 @@ async fn handle_start_all( // Only start the storage controller if the pageserver is configured to need it if env.control_plane_api.is_some() { let storage_controller = StorageController::from_env(env); - if let Err(e) = storage_controller.start(retry_timeout).await { + if let Err(e) = storage_controller + .start(NeonStorageControllerStartArgs::with_default_instance_id( + (*retry_timeout).into(), + )) + .await + { eprintln!("storage_controller start failed: {:#}", e); try_stop_all(env, true).await; exit(1); @@ -1358,10 +1390,21 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { eprintln!("neon broker stop failed: {e:#}"); } - if env.control_plane_api.is_some() { + // Stop all storage controller instances. In the most common case there's only one, + // but iterate though the base data directory in order to discover the instances. + let storcon_instances = env + .storage_controller_instances() + .await + .expect("Must inspect data dir"); + for (instance_id, _instance_dir_path) in storcon_instances { let storage_controller = StorageController::from_env(env); - if let Err(e) = storage_controller.stop(immediate).await { - eprintln!("storage controller stop failed: {e:#}"); + let stop_args = NeonStorageControllerStopArgs { + instance_id, + immediate, + }; + + if let Err(e) = storage_controller.stop(stop_args).await { + eprintln!("Storage controller instance {instance_id} stop failed: {e:#}"); } } } @@ -1501,6 +1544,18 @@ fn cli() -> Command { .action(ArgAction::SetTrue) .required(false); + let instance_id = Arg::new("instance-id") + .long("instance-id") + .help("Identifier used to distinguish storage controller instances (default 1)") + .value_parser(value_parser!(u8)) + .required(false); + + let base_port = Arg::new("base-port") + .long("base-port") + .help("Base port for the storage controller instance idenfified by instance-id (defaults to pagserver cplane api)") + .value_parser(value_parser!(u16)) + .required(false); + Command::new("Neon CLI") .arg_required_else_help(true) .version(GIT_VERSION) @@ -1609,9 +1664,12 @@ fn cli() -> Command { .arg_required_else_help(true) .about("Manage storage_controller") .subcommand(Command::new("start").about("Start storage controller") - .arg(timeout_arg.clone())) + .arg(timeout_arg.clone()) + .arg(instance_id.clone()) + .arg(base_port)) .subcommand(Command::new("stop").about("Stop storage controller") - .arg(stop_mode_arg.clone())) + .arg(stop_mode_arg.clone()) + .arg(instance_id)) ) .subcommand( Command::new("safekeeper") diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 15bbac702f..807519c88d 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -156,6 +156,11 @@ pub struct NeonStorageControllerConf { #[serde(with = "humantime_serde")] pub max_warming_up: Duration, + pub start_as_candidate: bool, + + /// Database url used when running multiple storage controller instances + pub database_url: Option, + /// Threshold for auto-splitting a tenant into shards pub split_threshold: Option, @@ -174,6 +179,8 @@ impl Default for NeonStorageControllerConf { Self { max_offline: Self::DEFAULT_MAX_OFFLINE_INTERVAL, max_warming_up: Self::DEFAULT_MAX_WARMING_UP_INTERVAL, + start_as_candidate: false, + database_url: None, split_threshold: None, 
max_secondary_lag_bytes: None, } @@ -392,6 +399,36 @@ impl LocalEnv { } } + /// Inspect the base data directory and extract the instance id and instance directory path + /// for all storage controller instances + pub async fn storage_controller_instances(&self) -> std::io::Result> { + let mut instances = Vec::default(); + + let dir = std::fs::read_dir(self.base_data_dir.clone())?; + for dentry in dir { + let dentry = dentry?; + let is_dir = dentry.metadata()?.is_dir(); + let filename = dentry.file_name().into_string().unwrap(); + let parsed_instance_id = match filename.strip_prefix("storage_controller_") { + Some(suffix) => suffix.parse::().ok(), + None => None, + }; + + let is_instance_dir = is_dir && parsed_instance_id.is_some(); + + if !is_instance_dir { + continue; + } + + instances.push(( + parsed_instance_id.expect("Checked previously"), + dentry.path(), + )); + } + + Ok(instances) + } + pub fn register_branch_mapping( &mut self, branch_name: String, diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index f180e922e8..2c077595a1 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -3,6 +3,8 @@ use crate::{ local_env::{LocalEnv, NeonStorageControllerConf}, }; use camino::{Utf8Path, Utf8PathBuf}; +use hyper::Uri; +use nix::unistd::Pid; use pageserver_api::{ controller_api::{ NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest, @@ -18,7 +20,7 @@ use pageserver_client::mgmt_api::ResponseErrorMessageExt; use postgres_backend::AuthType; use reqwest::Method; use serde::{de::DeserializeOwned, Deserialize, Serialize}; -use std::{fs, str::FromStr, time::Duration}; +use std::{fs, net::SocketAddr, path::PathBuf, str::FromStr, sync::OnceLock}; use tokio::process::Command; use tracing::instrument; use url::Url; @@ -29,12 +31,14 @@ use utils::{ pub struct StorageController { env: LocalEnv, - listen: String, private_key: Option>, public_key: Option, - postgres_port: u16, client: reqwest::Client, config: NeonStorageControllerConf, + + // The listen addresses is learned when starting the storage controller, + // hence the use of OnceLock to init it at the right time. 
+ listen: OnceLock, } const COMMAND: &str = "storage_controller"; @@ -43,6 +47,36 @@ const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16; const DB_NAME: &str = "storage_controller"; +pub struct NeonStorageControllerStartArgs { + pub instance_id: u8, + pub base_port: Option, + pub start_timeout: humantime::Duration, +} + +impl NeonStorageControllerStartArgs { + pub fn with_default_instance_id(start_timeout: humantime::Duration) -> Self { + Self { + instance_id: 1, + base_port: None, + start_timeout, + } + } +} + +pub struct NeonStorageControllerStopArgs { + pub instance_id: u8, + pub immediate: bool, +} + +impl NeonStorageControllerStopArgs { + pub fn with_default_instance_id(immediate: bool) -> Self { + Self { + instance_id: 1, + immediate, + } + } +} + #[derive(Serialize, Deserialize)] pub struct AttachHookRequest { pub tenant_shard_id: TenantShardId, @@ -67,23 +101,6 @@ pub struct InspectResponse { impl StorageController { pub fn from_env(env: &LocalEnv) -> Self { - // Makes no sense to construct this if pageservers aren't going to use it: assume - // pageservers have control plane API set - let listen_url = env.control_plane_api.clone().unwrap(); - - let listen = format!( - "{}:{}", - listen_url.host_str().unwrap(), - listen_url.port().unwrap() - ); - - // Convention: NeonEnv in python tests reserves the next port after the control_plane_api - // port, for use by our captive postgres. - let postgres_port = listen_url - .port() - .expect("Control plane API setting should always have a port") - + 1; - // Assume all pageservers have symmetric auth configuration: this service // expects to use one JWT token to talk to all of them. let ps_conf = env @@ -126,20 +143,28 @@ impl StorageController { Self { env: env.clone(), - listen, private_key, public_key, - postgres_port, client: reqwest::ClientBuilder::new() .build() .expect("Failed to construct http client"), config: env.storage_controller.clone(), + listen: OnceLock::default(), } } - fn pid_file(&self) -> Utf8PathBuf { - Utf8PathBuf::from_path_buf(self.env.base_data_dir.join("storage_controller.pid")) - .expect("non-Unicode path") + fn storage_controller_instance_dir(&self, instance_id: u8) -> PathBuf { + self.env + .base_data_dir + .join(format!("storage_controller_{}", instance_id)) + } + + fn pid_file(&self, instance_id: u8) -> Utf8PathBuf { + Utf8PathBuf::from_path_buf( + self.storage_controller_instance_dir(instance_id) + .join("storage_controller.pid"), + ) + .expect("non-Unicode path") } /// PIDFile for the postgres instance used to store storage controller state @@ -184,9 +209,9 @@ impl StorageController { } /// Readiness check for our postgres process - async fn pg_isready(&self, pg_bin_dir: &Utf8Path) -> anyhow::Result { + async fn pg_isready(&self, pg_bin_dir: &Utf8Path, postgres_port: u16) -> anyhow::Result { let bin_path = pg_bin_dir.join("pg_isready"); - let args = ["-h", "localhost", "-p", &format!("{}", self.postgres_port)]; + let args = ["-h", "localhost", "-p", &format!("{}", postgres_port)]; let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?; Ok(exitcode.success()) @@ -199,8 +224,8 @@ impl StorageController { /// who just want to run `cargo neon_local` without knowing about diesel. 
/// /// Returns the database url - pub async fn setup_database(&self) -> anyhow::Result { - let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port); + pub async fn setup_database(&self, postgres_port: u16) -> anyhow::Result { + let database_url = format!("postgresql://localhost:{}/{DB_NAME}", postgres_port); let pg_bin_dir = self.get_pg_bin_dir().await?; let createdb_path = pg_bin_dir.join("createdb"); @@ -209,7 +234,7 @@ impl StorageController { "-h", "localhost", "-p", - &format!("{}", self.postgres_port), + &format!("{}", postgres_port), DB_NAME, ]) .output() @@ -230,13 +255,14 @@ impl StorageController { pub async fn connect_to_database( &self, + postgres_port: u16, ) -> anyhow::Result<( tokio_postgres::Client, tokio_postgres::Connection, )> { tokio_postgres::Config::new() .host("localhost") - .port(self.postgres_port) + .port(postgres_port) // The user is the ambient operating system user name. // That is an impurity which we want to fix in => TODO https://github.com/neondatabase/neon/issues/8400 // @@ -252,72 +278,115 @@ impl StorageController { .map_err(anyhow::Error::new) } - pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> { - // Start a vanilla Postgres process used by the storage controller for persistence. - let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone()) - .unwrap() - .join("storage_controller_db"); - let pg_bin_dir = self.get_pg_bin_dir().await?; - let pg_lib_dir = self.get_pg_lib_dir().await?; - let pg_log_path = pg_data_path.join("postgres.log"); + pub async fn start(&self, start_args: NeonStorageControllerStartArgs) -> anyhow::Result<()> { + let instance_dir = self.storage_controller_instance_dir(start_args.instance_id); + if let Err(err) = tokio::fs::create_dir(&instance_dir).await { + if err.kind() != std::io::ErrorKind::AlreadyExists { + panic!("Failed to create instance dir {instance_dir:?}"); + } + } - if !tokio::fs::try_exists(&pg_data_path).await? { - // Initialize empty database - let initdb_path = pg_bin_dir.join("initdb"); - let mut child = Command::new(&initdb_path) - .envs(vec![ - ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), - ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), - ]) - .args(["-D", pg_data_path.as_ref()]) - .spawn() - .expect("Failed to spawn initdb"); - let status = child.wait().await?; - if !status.success() { - anyhow::bail!("initdb failed with status {status}"); + let (listen, postgres_port) = { + if let Some(base_port) = start_args.base_port { + ( + format!("127.0.0.1:{base_port}"), + self.config + .database_url + .expect("--base-port requires NeonStorageControllerConf::database_url") + .port(), + ) + } else { + let listen_url = self.env.control_plane_api.clone().unwrap(); + + let listen = format!( + "{}:{}", + listen_url.host_str().unwrap(), + listen_url.port().unwrap() + ); + + (listen, listen_url.port().unwrap() + 1) } }; - // Write a minimal config file: - // - Specify the port, since this is chosen dynamically - // - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing - // the storage controller we don't want a slow local disk to interfere with that. - // - // NB: it's important that we rewrite this file on each start command so we propagate changes - // from `LocalEnv`'s config file (`.neon/config`). 
- tokio::fs::write( - &pg_data_path.join("postgresql.conf"), - format!("port = {}\nfsync=off\n", self.postgres_port), - ) - .await?; + let socket_addr = listen + .parse() + .expect("listen address is a valid socket address"); + self.listen + .set(socket_addr) + .expect("StorageController::listen is only set here"); - println!("Starting storage controller database..."); - let db_start_args = [ - "-w", - "-D", - pg_data_path.as_ref(), - "-l", - pg_log_path.as_ref(), - "start", - ]; + // Do we remove the pid file on stop? + let pg_started = self.is_postgres_running().await?; + let pg_lib_dir = self.get_pg_lib_dir().await?; - background_process::start_process( - "storage_controller_db", - &self.env.base_data_dir, - pg_bin_dir.join("pg_ctl").as_std_path(), - db_start_args, - vec![ - ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), - ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), - ], - background_process::InitialPidFile::Create(self.postgres_pid_file()), - retry_timeout, - || self.pg_isready(&pg_bin_dir), - ) - .await?; + if !pg_started { + // Start a vanilla Postgres process used by the storage controller for persistence. + let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone()) + .unwrap() + .join("storage_controller_db"); + let pg_bin_dir = self.get_pg_bin_dir().await?; + let pg_log_path = pg_data_path.join("postgres.log"); - // Run migrations on every startup, in case something changed. - let database_url = self.setup_database().await?; + if !tokio::fs::try_exists(&pg_data_path).await? { + // Initialize empty database + let initdb_path = pg_bin_dir.join("initdb"); + let mut child = Command::new(&initdb_path) + .envs(vec![ + ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ]) + .args(["-D", pg_data_path.as_ref()]) + .spawn() + .expect("Failed to spawn initdb"); + let status = child.wait().await?; + if !status.success() { + anyhow::bail!("initdb failed with status {status}"); + } + }; + + // Write a minimal config file: + // - Specify the port, since this is chosen dynamically + // - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing + // the storage controller we don't want a slow local disk to interfere with that. + // + // NB: it's important that we rewrite this file on each start command so we propagate changes + // from `LocalEnv`'s config file (`.neon/config`). + tokio::fs::write( + &pg_data_path.join("postgresql.conf"), + format!("port = {}\nfsync=off\n", postgres_port), + ) + .await?; + + println!("Starting storage controller database..."); + let db_start_args = [ + "-w", + "-D", + pg_data_path.as_ref(), + "-l", + pg_log_path.as_ref(), + "start", + ]; + + background_process::start_process( + "storage_controller_db", + &self.env.base_data_dir, + pg_bin_dir.join("pg_ctl").as_std_path(), + db_start_args, + vec![ + ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ], + background_process::InitialPidFile::Create(self.postgres_pid_file()), + &start_args.start_timeout, + || self.pg_isready(&pg_bin_dir, postgres_port), + ) + .await?; + + // Run migrations on every startup, in case something changed. + self.setup_database(postgres_port).await?; + } + + let database_url = format!("postgresql://localhost:{}/{DB_NAME}", postgres_port); // We support running a startup SQL script to fiddle with the database before we launch storcon. // This is used by the test suite. 
@@ -339,7 +408,7 @@ impl StorageController { } } }; - let (mut client, conn) = self.connect_to_database().await?; + let (mut client, conn) = self.connect_to_database(postgres_port).await?; let conn = tokio::spawn(conn); let tx = client.build_transaction(); let tx = tx.start().await?; @@ -348,9 +417,20 @@ impl StorageController { drop(client); conn.await??; + let listen = self + .listen + .get() + .expect("cell is set earlier in this function"); + let address_for_peers = Uri::builder() + .scheme("http") + .authority(format!("{}:{}", listen.ip(), listen.port())) + .path_and_query("") + .build() + .unwrap(); + let mut args = vec![ "-l", - &self.listen, + &listen.to_string(), "--dev", "--database-url", &database_url, @@ -358,10 +438,17 @@ impl StorageController { &humantime::Duration::from(self.config.max_offline).to_string(), "--max-warming-up-interval", &humantime::Duration::from(self.config.max_warming_up).to_string(), + "--address-for-peers", + &address_for_peers.to_string(), ] .into_iter() .map(|s| s.to_string()) .collect::>(); + + if self.config.start_as_candidate { + args.push("--start-as-candidate".to_string()); + } + if let Some(private_key) = &self.private_key { let claims = Claims::new(None, Scope::PageServerApi); let jwt_token = @@ -394,15 +481,15 @@ impl StorageController { background_process::start_process( COMMAND, - &self.env.base_data_dir, + &instance_dir, &self.env.storage_controller_bin(), args, vec![ ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), ], - background_process::InitialPidFile::Create(self.pid_file()), - retry_timeout, + background_process::InitialPidFile::Create(self.pid_file(start_args.instance_id)), + &start_args.start_timeout, || async { match self.ready().await { Ok(_) => Ok(true), @@ -415,8 +502,35 @@ impl StorageController { Ok(()) } - pub async fn stop(&self, immediate: bool) -> anyhow::Result<()> { - background_process::stop_process(immediate, COMMAND, &self.pid_file())?; + pub async fn stop(&self, stop_args: NeonStorageControllerStopArgs) -> anyhow::Result<()> { + background_process::stop_process( + stop_args.immediate, + COMMAND, + &self.pid_file(stop_args.instance_id), + )?; + + let storcon_instances = self.env.storage_controller_instances().await?; + for (instance_id, instanced_dir_path) in storcon_instances { + if instance_id == stop_args.instance_id { + continue; + } + + let pid_file = instanced_dir_path.join("storage_controller.pid"); + let pid = tokio::fs::read_to_string(&pid_file) + .await + .map_err(|err| { + anyhow::anyhow!("Failed to read storcon pid file at {pid_file:?}: {err}") + })? + .parse::() + .expect("pid is valid i32"); + + let other_proc_alive = !background_process::process_has_stopped(Pid::from_raw(pid))?; + if other_proc_alive { + // There is another storage controller instance running, so we return + // and leave the database running. + return Ok(()); + } + } let pg_data_path = self.env.base_data_dir.join("storage_controller_db"); let pg_bin_dir = self.get_pg_bin_dir().await?; @@ -429,27 +543,51 @@ impl StorageController { .wait() .await?; if !stop_status.success() { - let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"]; - let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl")) - .args(pg_status_args) - .spawn()? - .wait() - .await?; - - // pg_ctl status returns this exit code if postgres is not running: in this case it is - // fine that stop failed. Otherwise it is an error that stop failed. 
- const PG_STATUS_NOT_RUNNING: i32 = 3; - if Some(PG_STATUS_NOT_RUNNING) == status_exitcode.code() { - println!("Storage controller database is already stopped"); - return Ok(()); - } else { - anyhow::bail!("Failed to stop storage controller database: {stop_status}") + match self.is_postgres_running().await { + Ok(false) => { + println!("Storage controller database is already stopped"); + return Ok(()); + } + Ok(true) => { + anyhow::bail!("Failed to stop storage controller database"); + } + Err(err) => { + anyhow::bail!("Failed to stop storage controller database: {err}"); + } } } Ok(()) } + async fn is_postgres_running(&self) -> anyhow::Result { + let pg_data_path = self.env.base_data_dir.join("storage_controller_db"); + let pg_bin_dir = self.get_pg_bin_dir().await?; + + let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"]; + let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl")) + .args(pg_status_args) + .spawn()? + .wait() + .await?; + + // pg_ctl status returns this exit code if postgres is not running: in this case it is + // fine that stop failed. Otherwise it is an error that stop failed. + const PG_STATUS_NOT_RUNNING: i32 = 3; + const PG_NO_DATA_DIR: i32 = 4; + const PG_STATUS_RUNNING: i32 = 0; + match status_exitcode.code() { + Some(PG_STATUS_NOT_RUNNING) => Ok(false), + Some(PG_NO_DATA_DIR) => Ok(false), + Some(PG_STATUS_RUNNING) => Ok(true), + Some(code) => Err(anyhow::anyhow!( + "pg_ctl status returned unexpected status code: {:?}", + code + )), + None => Err(anyhow::anyhow!("pg_ctl status returned no status code")), + } + } + fn get_claims_for_path(path: &str) -> anyhow::Result> { let category = match path.find('/') { Some(idx) => &path[..idx], @@ -475,15 +613,31 @@ impl StorageController { RQ: Serialize + Sized, RS: DeserializeOwned + Sized, { - // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out - // for general purpose API access. - let listen_url = self.env.control_plane_api.clone().unwrap(); - let url = Url::from_str(&format!( - "http://{}:{}/{path}", - listen_url.host_str().unwrap(), - listen_url.port().unwrap() - )) - .unwrap(); + // In the special case of the `storage_controller start` subcommand, we wish + // to use the API endpoint of the newly started storage controller in order + // to pass the readiness check. In this scenario [`Self::listen`] will be set + // (see [`Self::start`]). + // + // Otherwise, we infer the storage controller api endpoint from the configured + // control plane API. + let url = if let Some(socket_addr) = self.listen.get() { + Url::from_str(&format!( + "http://{}:{}/{path}", + socket_addr.ip().to_canonical(), + socket_addr.port() + )) + .unwrap() + } else { + // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out + // for general purpose API access. 
+ let listen_url = self.env.control_plane_api.clone().unwrap(); + Url::from_str(&format!( + "http://{}:{}/{path}", + listen_url.host_str().unwrap(), + listen_url.port().unwrap() + )) + .unwrap() + }; let mut builder = self.client.request(method, url); if let Some(body) = body { diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index e755aaed19..7bbd1541cf 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -520,6 +520,19 @@ async fn handle_node_status(req: Request) -> Result, ApiErr json_response(StatusCode::OK, node_status) } +async fn handle_get_leader(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + let leader = state.service.get_leader().await.map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!( + "Failed to read leader from database: {err}" + )) + })?; + + json_response(StatusCode::OK, leader) +} + async fn handle_node_drain(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; @@ -1016,6 +1029,9 @@ pub fn make_router( .get("/control/v1/node/:node_id", |r| { named_request_span(r, handle_node_status, RequestName("control_v1_node_status")) }) + .get("/control/v1/leader", |r| { + named_request_span(r, handle_get_leader, RequestName("control_v1_get_leader")) + }) .put("/control/v1/node/:node_id/drain", |r| { named_request_span(r, handle_node_drain, RequestName("control_v1_node_drain")) }) diff --git a/storage_controller/src/peer_client.rs b/storage_controller/src/peer_client.rs index ebb59a1720..3f8520fe55 100644 --- a/storage_controller/src/peer_client.rs +++ b/storage_controller/src/peer_client.rs @@ -1,7 +1,7 @@ use crate::tenant_shard::ObservedState; use pageserver_api::shard::TenantShardId; use serde::{Deserialize, Serialize}; -use std::collections::HashMap; +use std::{collections::HashMap, time::Duration}; use tokio_util::sync::CancellationToken; use hyper::Uri; @@ -69,6 +69,8 @@ impl PeerClient { req }; + let req = req.timeout(Duration::from_secs(2)); + let res = req .send() .await diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 84db088a42..3459b44774 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -20,7 +20,8 @@ use crate::{ metrics, peer_client::{GlobalObservedState, PeerClient}, persistence::{ - AbortShardSplitStatus, ControllerPersistence, MetadataHealthPersistence, TenantFilter, + AbortShardSplitStatus, ControllerPersistence, DatabaseResult, MetadataHealthPersistence, + TenantFilter, }, reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder}, scheduler::{MaySchedule, ScheduleContext, ScheduleMode}, @@ -489,11 +490,6 @@ pub(crate) enum ReconcileResultRequest { Stop, } -struct LeaderStepDownState { - observed: GlobalObservedState, - leader: ControllerPersistence, -} - impl Service { pub fn get_config(&self) -> &Config { &self.config @@ -504,7 +500,8 @@ impl Service { #[instrument(skip_all)] async fn startup_reconcile( self: &Arc, - leader_step_down_state: Option, + current_leader: Option, + leader_step_down_state: Option, bg_compute_notify_result_tx: tokio::sync::mpsc::Sender< Result<(), (TenantShardId, NotifyError)>, >, @@ -522,17 +519,15 @@ impl Service { .checked_add(STARTUP_RECONCILE_TIMEOUT / 2) .expect("Reconcile timeout is a modest constant"); - let (observed, current_leader) = if let Some(state) = leader_step_down_state { + let observed = if let Some(state) = leader_step_down_state { tracing::info!( 
"Using observed state received from leader at {}", - state.leader.address, + current_leader.as_ref().unwrap().address ); - (state.observed, Some(state.leader)) + + state } else { - ( - self.build_global_observed_state(node_scan_deadline).await, - None, - ) + self.build_global_observed_state(node_scan_deadline).await }; // Accumulate a list of any tenant locations that ought to be detached @@ -1382,13 +1377,32 @@ impl Service { }; let leadership_status = this.inner.read().unwrap().get_leadership_status(); - let peer_observed_state = match leadership_status { - LeadershipStatus::Candidate => this.request_step_down().await, + let leader = match this.get_leader().await { + Ok(ok) => ok, + Err(err) => { + tracing::error!( + "Failed to query database for current leader: {err}. Aborting start-up ..." + ); + std::process::exit(1); + } + }; + + let leader_step_down_state = match leadership_status { + LeadershipStatus::Candidate => { + if let Some(ref leader) = leader { + this.request_step_down(leader).await + } else { + tracing::info!( + "No leader found to request step down from. Will build observed state." + ); + None + } + } LeadershipStatus::Leader => None, LeadershipStatus::SteppedDown => unreachable!(), }; - this.startup_reconcile(peer_observed_state, bg_compute_notify_result_tx) + this.startup_reconcile(leader, leader_step_down_state, bg_compute_notify_result_tx) .await; drop(startup_completion); @@ -4650,6 +4664,10 @@ impl Service { )) } + pub(crate) async fn get_leader(&self) -> DatabaseResult> { + self.persistence.get_leader().await + } + pub(crate) async fn node_register( &self, register_req: NodeRegisterRequest, @@ -6342,6 +6360,7 @@ impl Service { pub(crate) async fn step_down(&self) -> GlobalObservedState { tracing::info!("Received step down request from peer"); + failpoint_support::sleep_millis_async!("sleep-on-step-down-handling"); self.inner.write().unwrap().step_down(); // TODO: would it make sense to have a time-out for this? @@ -6367,50 +6386,31 @@ impl Service { /// /// On failures to query the database or step down error responses the process is killed /// and we rely on k8s to retry. - async fn request_step_down(&self) -> Option { - let leader = match self.persistence.get_leader().await { - Ok(leader) => leader, + async fn request_step_down( + &self, + leader: &ControllerPersistence, + ) -> Option { + tracing::info!("Sending step down request to {leader:?}"); + + // TODO: jwt token + let client = PeerClient::new( + Uri::try_from(leader.address.as_str()).expect("Failed to build leader URI"), + self.config.jwt_token.clone(), + ); + let state = client.step_down(&self.cancel).await; + match state { + Ok(state) => Some(state), Err(err) => { + // TODO: Make leaders periodically update a timestamp field in the + // database and, if the leader is not reachable from the current instance, + // but inferred as alive from the timestamp, abort start-up. This avoids + // a potential scenario in which we have two controllers acting as leaders. tracing::error!( - "Failed to query database for current leader: {err}. Aborting start-up ..." 
+ "Leader ({}) did not respond to step-down request: {}", + leader.address, + err ); - std::process::exit(1); - } - }; - match leader { - Some(leader) => { - tracing::info!("Sending step down request to {leader:?}"); - - // TODO: jwt token - let client = PeerClient::new( - Uri::try_from(leader.address.as_str()).expect("Failed to build leader URI"), - self.config.jwt_token.clone(), - ); - let state = client.step_down(&self.cancel).await; - match state { - Ok(state) => Some(LeaderStepDownState { - observed: state, - leader: leader.clone(), - }), - Err(err) => { - // TODO: Make leaders periodically update a timestamp field in the - // database and, if the leader is not reachable from the current instance, - // but inferred as alive from the timestamp, abort start-up. This avoids - // a potential scenario in which we have two controllers acting as leaders. - tracing::error!( - "Leader ({}) did not respond to step-down request: {}", - leader.address, - err - ); - None - } - } - } - None => { - tracing::info!( - "No leader found to request step down from. Will build observed state." - ); None } } diff --git a/test_runner/conftest.py b/test_runner/conftest.py index 4b0c9ac71d..996ca4d652 100644 --- a/test_runner/conftest.py +++ b/test_runner/conftest.py @@ -3,6 +3,7 @@ pytest_plugins = ( "fixtures.parametrize", "fixtures.httpserver", "fixtures.compute_reconfigure", + "fixtures.storage_controller_proxy", "fixtures.neon_fixtures", "fixtures.benchmark_fixture", "fixtures.pg_stats", diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b76432127d..ec5a83601e 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -497,6 +497,7 @@ class NeonEnvBuilder: pageserver_aux_file_policy: Optional[AuxFileStore] = None, pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]] = None, safekeeper_extra_opts: Optional[list[str]] = None, + storage_controller_port_override: Optional[int] = None, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override @@ -549,6 +550,8 @@ class NeonEnvBuilder: self.safekeeper_extra_opts = safekeeper_extra_opts + self.storage_controller_port_override = storage_controller_port_override + assert test_name.startswith( "test_" ), "Unexpectedly instantiated from outside a test function" @@ -1054,6 +1057,7 @@ class NeonEnv: """ BASE_PAGESERVER_ID = 1 + storage_controller: NeonStorageController | NeonProxiedStorageController def __init__(self, config: NeonEnvBuilder): self.repo_dir = config.repo_dir @@ -1084,27 +1088,41 @@ class NeonEnv: self.initial_tenant = config.initial_tenant self.initial_timeline = config.initial_timeline - # Find two adjacent ports for storage controller and its postgres DB. This - # loop would eventually throw from get_port() if we run out of ports (extremely - # unlikely): usually we find two adjacent free ports on the first iteration. 
- while True: - self.storage_controller_port = self.port_distributor.get_port() - storage_controller_pg_port = self.port_distributor.get_port() - if storage_controller_pg_port == self.storage_controller_port + 1: - break - # The URL for the pageserver to use as its control_plane_api config - self.control_plane_api: str = f"http://127.0.0.1:{self.storage_controller_port}/upcall/v1" - # The base URL of the storage controller - self.storage_controller_api: str = f"http://127.0.0.1:{self.storage_controller_port}" + if config.storage_controller_port_override is not None: + log.info( + f"Using storage controller api override {config.storage_controller_port_override}" + ) + + self.storage_controller_port = config.storage_controller_port_override + self.storage_controller = NeonProxiedStorageController( + self, config.storage_controller_port_override, config.auth_enabled + ) + else: + # Find two adjacent ports for storage controller and its postgres DB. This + # loop would eventually throw from get_port() if we run out of ports (extremely + # unlikely): usually we find two adjacent free ports on the first iteration. + while True: + storage_controller_port = self.port_distributor.get_port() + storage_controller_pg_port = self.port_distributor.get_port() + if storage_controller_pg_port == storage_controller_port + 1: + break + + self.storage_controller_port = storage_controller_port + self.storage_controller = NeonStorageController( + self, storage_controller_port, config.auth_enabled + ) + + log.info( + f"Using generated control_plane_api: {self.storage_controller.upcall_api_endpoint()}" + ) + + self.storage_controller_api: str = self.storage_controller.api_root() + self.control_plane_api: str = self.storage_controller.upcall_api_endpoint() # For testing this with a fake HTTP server, enable passing through a URL from config self.control_plane_compute_hook_api = config.control_plane_compute_hook_api - self.storage_controller: NeonStorageController = NeonStorageController( - self, config.auth_enabled - ) - self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine self.pageserver_aux_file_policy = config.pageserver_aux_file_policy @@ -1869,16 +1887,24 @@ class NeonCli(AbstractNeonCli): def storage_controller_start( self, timeout_in_seconds: Optional[int] = None, + instance_id: Optional[int] = None, + base_port: Optional[int] = None, ): cmd = ["storage_controller", "start"] if timeout_in_seconds is not None: cmd.append(f"--start-timeout={timeout_in_seconds}s") + if instance_id is not None: + cmd.append(f"--instance-id={instance_id}") + if base_port is not None: + cmd.append(f"--base-port={base_port}") return self.raw_cli(cmd) - def storage_controller_stop(self, immediate: bool): + def storage_controller_stop(self, immediate: bool, instance_id: Optional[int] = None): cmd = ["storage_controller", "stop"] if immediate: cmd.extend(["-m", "immediate"]) + if instance_id is not None: + cmd.append(f"--instance-id={instance_id}") return self.raw_cli(cmd) def pageserver_start( @@ -2189,17 +2215,30 @@ class PageserverSchedulingPolicy(str, Enum): PAUSE_FOR_RESTART = "PauseForRestart" +class StorageControllerLeadershipStatus(str, Enum): + LEADER = "leader" + STEPPED_DOWN = "stepped_down" + CANDIDATE = "candidate" + + class NeonStorageController(MetricsGetter, LogUtils): - def __init__(self, env: NeonEnv, auth_enabled: bool): + def __init__(self, env: NeonEnv, port: int, auth_enabled: bool): self.env = env + self.port: int = port + self.api: str = f"http://127.0.0.1:{port}" self.running = 
False self.auth_enabled = auth_enabled self.allowed_errors: list[str] = DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS - self.logfile = self.workdir / "storage_controller.log" + self.logfile = self.env.repo_dir / "storage_controller_1" / "storage_controller.log" - def start(self, timeout_in_seconds: Optional[int] = None): + def start( + self, + timeout_in_seconds: Optional[int] = None, + instance_id: Optional[int] = None, + base_port: Optional[int] = None, + ): assert not self.running - self.env.neon_cli.storage_controller_start(timeout_in_seconds) + self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port) self.running = True return self @@ -2209,6 +2248,12 @@ class NeonStorageController(MetricsGetter, LogUtils): self.running = False return self + def upcall_api_endpoint(self) -> str: + return f"{self.api}/upcall/v1" + + def api_root(self) -> str: + return self.api + @staticmethod def retryable_node_operation(op, ps_id, max_attempts, backoff): while max_attempts > 0: @@ -2237,7 +2282,9 @@ class NeonStorageController(MetricsGetter, LogUtils): def assert_no_errors(self): assert_no_errors( - self.env.repo_dir / "storage_controller.log", "storage_controller", self.allowed_errors + self.logfile, + "storage_controller", + self.allowed_errors, ) def pageserver_api(self) -> PageserverHttpClient: @@ -2249,7 +2296,7 @@ class NeonStorageController(MetricsGetter, LogUtils): auth_token = None if self.auth_enabled: auth_token = self.env.auth_keys.generate_token(scope=TokenScope.PAGE_SERVER_API) - return PageserverHttpClient(self.env.storage_controller_port, lambda: True, auth_token) + return PageserverHttpClient(self.port, lambda: True, auth_token) def request(self, method, *args, **kwargs) -> requests.Response: resp = requests.request(method, *args, **kwargs) @@ -2266,13 +2313,13 @@ class NeonStorageController(MetricsGetter, LogUtils): return headers def get_metrics(self) -> Metrics: - res = self.request("GET", f"{self.env.storage_controller_api}/metrics") + res = self.request("GET", f"{self.api}/metrics") return parse_metrics(res.text) def ready(self) -> bool: status = None try: - resp = self.request("GET", f"{self.env.storage_controller_api}/ready") + resp = self.request("GET", f"{self.api}/ready") status = resp.status_code except StorageControllerApiException as e: status = e.status_code @@ -2305,7 +2352,7 @@ class NeonStorageController(MetricsGetter, LogUtils): response = self.request( "POST", - f"{self.env.storage_controller_api}/debug/v1/attach-hook", + f"{self.api}/debug/v1/attach-hook", json=body, headers=self.headers(TokenScope.ADMIN), ) @@ -2316,7 +2363,7 @@ class NeonStorageController(MetricsGetter, LogUtils): def attach_hook_drop(self, tenant_shard_id: Union[TenantId, TenantShardId]): self.request( "POST", - f"{self.env.storage_controller_api}/debug/v1/attach-hook", + f"{self.api}/debug/v1/attach-hook", json={"tenant_shard_id": str(tenant_shard_id), "node_id": None}, headers=self.headers(TokenScope.ADMIN), ) @@ -2327,7 +2374,7 @@ class NeonStorageController(MetricsGetter, LogUtils): """ response = self.request( "POST", - f"{self.env.storage_controller_api}/debug/v1/inspect", + f"{self.api}/debug/v1/inspect", json={"tenant_shard_id": str(tenant_shard_id)}, headers=self.headers(TokenScope.ADMIN), ) @@ -2350,7 +2397,7 @@ class NeonStorageController(MetricsGetter, LogUtils): log.info(f"node_register({body})") self.request( "POST", - f"{self.env.storage_controller_api}/control/v1/node", + f"{self.api}/control/v1/node", json=body, headers=self.headers(TokenScope.ADMIN), ) 
@@ -2359,7 +2406,7 @@ class NeonStorageController(MetricsGetter, LogUtils): log.info(f"node_delete({node_id})") self.request( "DELETE", - f"{self.env.storage_controller_api}/control/v1/node/{node_id}", + f"{self.api}/control/v1/node/{node_id}", headers=self.headers(TokenScope.ADMIN), ) @@ -2367,7 +2414,7 @@ class NeonStorageController(MetricsGetter, LogUtils): log.info(f"node_drain({node_id})") self.request( "PUT", - f"{self.env.storage_controller_api}/control/v1/node/{node_id}/drain", + f"{self.api}/control/v1/node/{node_id}/drain", headers=self.headers(TokenScope.ADMIN), ) @@ -2375,7 +2422,7 @@ class NeonStorageController(MetricsGetter, LogUtils): log.info(f"cancel_node_drain({node_id})") self.request( "DELETE", - f"{self.env.storage_controller_api}/control/v1/node/{node_id}/drain", + f"{self.api}/control/v1/node/{node_id}/drain", headers=self.headers(TokenScope.ADMIN), ) @@ -2383,7 +2430,7 @@ class NeonStorageController(MetricsGetter, LogUtils): log.info(f"node_fill({node_id})") self.request( "PUT", - f"{self.env.storage_controller_api}/control/v1/node/{node_id}/fill", + f"{self.api}/control/v1/node/{node_id}/fill", headers=self.headers(TokenScope.ADMIN), ) @@ -2391,14 +2438,22 @@ class NeonStorageController(MetricsGetter, LogUtils): log.info(f"cancel_node_fill({node_id})") self.request( "DELETE", - f"{self.env.storage_controller_api}/control/v1/node/{node_id}/fill", + f"{self.api}/control/v1/node/{node_id}/fill", headers=self.headers(TokenScope.ADMIN), ) def node_status(self, node_id): response = self.request( "GET", - f"{self.env.storage_controller_api}/control/v1/node/{node_id}", + f"{self.api}/control/v1/node/{node_id}", + headers=self.headers(TokenScope.ADMIN), + ) + return response.json() + + def get_leader(self): + response = self.request( + "GET", + f"{self.api}/control/v1/leader", headers=self.headers(TokenScope.ADMIN), ) return response.json() @@ -2406,7 +2461,7 @@ class NeonStorageController(MetricsGetter, LogUtils): def node_list(self): response = self.request( "GET", - f"{self.env.storage_controller_api}/control/v1/node", + f"{self.api}/control/v1/node", headers=self.headers(TokenScope.ADMIN), ) return response.json() @@ -2414,7 +2469,7 @@ class NeonStorageController(MetricsGetter, LogUtils): def tenant_list(self): response = self.request( "GET", - f"{self.env.storage_controller_api}/debug/v1/tenant", + f"{self.api}/debug/v1/tenant", headers=self.headers(TokenScope.ADMIN), ) return response.json() @@ -2424,7 +2479,7 @@ class NeonStorageController(MetricsGetter, LogUtils): body["node_id"] = node_id self.request( "PUT", - f"{self.env.storage_controller_api}/control/v1/node/{node_id}/config", + f"{self.api}/control/v1/node/{node_id}/config", json=body, headers=self.headers(TokenScope.ADMIN), ) @@ -2459,7 +2514,7 @@ class NeonStorageController(MetricsGetter, LogUtils): response = self.request( "POST", - f"{self.env.storage_controller_api}/v1/tenant", + f"{self.api}/v1/tenant", json=body, headers=self.headers(TokenScope.PAGE_SERVER_API), ) @@ -2472,7 +2527,7 @@ class NeonStorageController(MetricsGetter, LogUtils): """ response = self.request( "GET", - f"{self.env.storage_controller_api}/debug/v1/tenant/{tenant_id}/locate", + f"{self.api}/debug/v1/tenant/{tenant_id}/locate", headers=self.headers(TokenScope.ADMIN), ) body = response.json() @@ -2485,7 +2540,7 @@ class NeonStorageController(MetricsGetter, LogUtils): """ response = self.request( "GET", - f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}", + f"{self.api}/control/v1/tenant/{tenant_id}", 
headers=self.headers(TokenScope.ADMIN), ) response.raise_for_status() @@ -2496,7 +2551,7 @@ class NeonStorageController(MetricsGetter, LogUtils): ) -> list[TenantShardId]: response = self.request( "PUT", - f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}/shard_split", + f"{self.api}/control/v1/tenant/{tenant_id}/shard_split", json={"new_shard_count": shard_count, "new_stripe_size": shard_stripe_size}, headers=self.headers(TokenScope.ADMIN), ) @@ -2508,7 +2563,7 @@ class NeonStorageController(MetricsGetter, LogUtils): def tenant_shard_migrate(self, tenant_shard_id: TenantShardId, dest_ps_id: int): self.request( "PUT", - f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_shard_id}/migrate", + f"{self.api}/control/v1/tenant/{tenant_shard_id}/migrate", json={"tenant_shard_id": str(tenant_shard_id), "node_id": dest_ps_id}, headers=self.headers(TokenScope.ADMIN), ) @@ -2519,7 +2574,7 @@ class NeonStorageController(MetricsGetter, LogUtils): log.info(f"tenant_policy_update({tenant_id}, {body})") self.request( "PUT", - f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}/policy", + f"{self.api}/control/v1/tenant/{tenant_id}/policy", json=body, headers=self.headers(TokenScope.ADMIN), ) @@ -2527,14 +2582,14 @@ class NeonStorageController(MetricsGetter, LogUtils): def tenant_import(self, tenant_id: TenantId): self.request( "POST", - f"{self.env.storage_controller_api}/debug/v1/tenant/{tenant_id}/import", + f"{self.api}/debug/v1/tenant/{tenant_id}/import", headers=self.headers(TokenScope.ADMIN), ) def reconcile_all(self): r = self.request( "POST", - f"{self.env.storage_controller_api}/debug/v1/reconcile_all", + f"{self.api}/debug/v1/reconcile_all", headers=self.headers(TokenScope.ADMIN), ) r.raise_for_status() @@ -2567,7 +2622,7 @@ class NeonStorageController(MetricsGetter, LogUtils): """ self.request( "POST", - f"{self.env.storage_controller_api}/debug/v1/consistency_check", + f"{self.api}/debug/v1/consistency_check", headers=self.headers(TokenScope.ADMIN), ) log.info("storage controller passed consistency check") @@ -2640,7 +2695,7 @@ class NeonStorageController(MetricsGetter, LogUtils): self.request( "POST", - f"{self.env.storage_controller_api}/control/v1/metadata_health/update", + f"{self.api}/control/v1/metadata_health/update", json=body, headers=self.headers(TokenScope.SCRUBBER), ) @@ -2648,7 +2703,7 @@ class NeonStorageController(MetricsGetter, LogUtils): def metadata_health_list_unhealthy(self): response = self.request( "GET", - f"{self.env.storage_controller_api}/control/v1/metadata_health/unhealthy", + f"{self.api}/control/v1/metadata_health/unhealthy", headers=self.headers(TokenScope.ADMIN), ) return response.json() @@ -2658,7 +2713,7 @@ class NeonStorageController(MetricsGetter, LogUtils): response = self.request( "POST", - f"{self.env.storage_controller_api}/control/v1/metadata_health/outdated", + f"{self.api}/control/v1/metadata_health/outdated", json=body, headers=self.headers(TokenScope.ADMIN), ) @@ -2681,7 +2736,7 @@ class NeonStorageController(MetricsGetter, LogUtils): log.info("Asking storage controller to step down") response = self.request( "PUT", - f"{self.env.storage_controller_api}/control/v1/step_down", + f"{self.api}/control/v1/step_down", headers=self.headers(TokenScope.ADMIN), ) @@ -2698,7 +2753,7 @@ class NeonStorageController(MetricsGetter, LogUtils): res = self.request( "PUT", - f"{self.env.storage_controller_api}/debug/v1/failpoints", + f"{self.api}/debug/v1/failpoints", json=[{"name": name, "actions": actions} for name, actions 
in pairs], headers=self.headers(TokenScope.ADMIN), ) @@ -2768,9 +2823,21 @@ class NeonStorageController(MetricsGetter, LogUtils): parsed_tid, wait_ms=250 ) - @property - def workdir(self) -> Path: - return self.env.repo_dir + def get_leadership_status(self) -> StorageControllerLeadershipStatus: + metric_values = {} + for status in StorageControllerLeadershipStatus: + metric_value = self.get_metric_value( + "storage_controller_leadership_status", filter={"status": status} + ) + metric_values[status] = metric_value + + assert list(metric_values.values()).count(1) == 1 + + for status, metric_value in metric_values.items(): + if metric_value == 1: + return status + + raise AssertionError("unreachable") def __enter__(self) -> "NeonStorageController": return self @@ -2784,6 +2851,59 @@ class NeonStorageController(MetricsGetter, LogUtils): self.stop(immediate=True) +class NeonProxiedStorageController(NeonStorageController): + def __init__(self, env: NeonEnv, proxy_port: int, auth_enabled: bool): + super(NeonProxiedStorageController, self).__init__(env, proxy_port, auth_enabled) + self.instances: dict[int, dict[str, Any]] = {} + + def start( + self, + timeout_in_seconds: Optional[int] = None, + instance_id: Optional[int] = None, + base_port: Optional[int] = None, + ): + assert instance_id is not None and base_port is not None + + self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port) + self.instances[instance_id] = {"running": True} + + self.running = True + return self + + def stop_instance( + self, immediate: bool = False, instance_id: Optional[int] = None + ) -> "NeonStorageController": + assert instance_id in self.instances + if self.instances[instance_id]["running"]: + self.env.neon_cli.storage_controller_stop(immediate, instance_id) + self.instances[instance_id]["running"] = False + + self.running = any(meta["running"] for meta in self.instances.values()) + return self + + def stop(self, immediate: bool = False) -> "NeonStorageController": + for iid, details in self.instances.items(): + if details["running"]: + self.env.neon_cli.storage_controller_stop(immediate, iid) + self.instances[iid]["running"] = False + + self.running = False + return self + + def assert_no_errors(self): + for instance_id in self.instances.keys(): + assert_no_errors( + self.env.repo_dir / f"storage_controller_{instance_id}" / "storage_controller.log", + "storage_controller", + self.allowed_errors, + ) + + def log_contains( + self, pattern: str, offset: None | LogCursor = None + ) -> Optional[Tuple[str, LogCursor]]: + raise NotImplementedError() + + @dataclass class LogCursor: _line_no: int @@ -4520,7 +4640,7 @@ class StorageScrubber: base_args = [ str(self.env.neon_binpath / "storage_scrubber"), - f"--controller-api={self.env.storage_controller_api}", + f"--controller-api={self.env.storage_controller.api_root()}", ] args = base_args + args diff --git a/test_runner/fixtures/storage_controller_proxy.py b/test_runner/fixtures/storage_controller_proxy.py new file mode 100644 index 0000000000..3477f8b1f2 --- /dev/null +++ b/test_runner/fixtures/storage_controller_proxy.py @@ -0,0 +1,73 @@ +import re +from typing import Any, Optional + +import pytest +import requests +from pytest_httpserver import HTTPServer +from werkzeug.datastructures import Headers +from werkzeug.wrappers.request import Request +from werkzeug.wrappers.response import Response + +from fixtures.log_helper import log + + +class StorageControllerProxy: + def __init__(self, server: HTTPServer): + self.server: HTTPServer = 
server + self.listen: str = f"http://{server.host}:{server.port}" + self.routing_to: Optional[str] = None + + def route_to(self, storage_controller_api: str): + self.routing_to = storage_controller_api + + def port(self) -> int: + return self.server.port + + def upcall_api_endpoint(self) -> str: + return f"{self.listen}/upcall/v1" + + +def proxy_request(method: str, url: str, **kwargs) -> requests.Response: + return requests.request(method, url, **kwargs) + + +@pytest.fixture(scope="function") +def storage_controller_proxy(make_httpserver): + """ + Proxies requests into the storage controller to the currently + selected storage controller instance via `StorageControllerProxy.route_to`. + + This fixture is intended for tests that need to run multiple instances + of the storage controller at the same time. + """ + server = make_httpserver + + self = StorageControllerProxy(server) + + log.info(f"Storage controller proxy listening on {self.listen}") + + def handler(request: Request): + if self.route_to is None: + log.info(f"Storage controller proxy has no routing configured for {request.url}") + return Response("Routing not configured", status=503) + + route_to_url = f"{self.routing_to}{request.path}" + + log.info(f"Routing {request.url} to {route_to_url}") + + args: dict[str, Any] = {"headers": request.headers} + if request.is_json: + args["json"] = request.json + + response = proxy_request(request.method, route_to_url, **args) + + headers = Headers() + for key, value in response.headers.items(): + headers.add(key, value) + + return Response(response.content, headers=headers, status=response.status_code) + + self.server.expect_request(re.compile(".*")).respond_with_handler(handler) + + yield self + server.clear() diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 4dc9f7caae..80f1c9e4e3 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -403,7 +403,7 @@ def wait_until( try: res = func() except Exception as e: - log.info("waiting for %s iteration %s failed", func, i + 1) + log.info("waiting for %s iteration %s failed: %s", func, i + 1, e) last_exception = e if show_intermediate_error: log.info(e) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 7d98ff2923..95c35e9641 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -1,3 +1,4 @@ +import concurrent.futures import json import threading import time @@ -16,6 +17,7 @@ from fixtures.neon_fixtures import ( PageserverSchedulingPolicy, PgBin, StorageControllerApiException, + StorageControllerLeadershipStatus, TokenScope, last_flush_lsn_upload, ) @@ -30,7 +32,9 @@ from fixtures.pageserver.utils import ( timeline_delete_wait_completed, ) from fixtures.pg_version import PgVersion +from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import RemoteStorageKind, s3_storage +from fixtures.storage_controller_proxy import StorageControllerProxy from fixtures.utils import run_pg_bench_small, subprocess_capture, wait_until from fixtures.workload import Workload from mypy_boto3_s3.type_defs import ( @@ -2093,6 +2097,131 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder): ) +# This is a copy of NeonEnv.start which injects the instance id and port +# into the call to NeonStorageController.start +def start_env(env: NeonEnv, storage_controller_port: int): + timeout_in_seconds = 30 + + # Storage controller starts first, so that 
pageserver /re-attach calls don't + # bounce through retries on startup + env.storage_controller.start(timeout_in_seconds, 1, storage_controller_port) + + # Wait for storage controller readiness to prevent unnecessary post start-up + # reconcile. + env.storage_controller.wait_until_ready() + + # Start up broker, pageserver and all safekeepers + futs = [] + with concurrent.futures.ThreadPoolExecutor( + max_workers=2 + len(env.pageservers) + len(env.safekeepers) + ) as executor: + futs.append( + executor.submit(lambda: env.broker.try_start() or None) + ) # The `or None` is for the linter + + for pageserver in env.pageservers: + futs.append( + executor.submit( + lambda ps=pageserver: ps.start(timeout_in_seconds=timeout_in_seconds) + ) + ) + + for safekeeper in env.safekeepers: + futs.append( + executor.submit( + lambda sk=safekeeper: sk.start(timeout_in_seconds=timeout_in_seconds) + ) + ) + + for f in futs: + f.result() + + +@pytest.mark.parametrize("step_down_times_out", [False, True]) +def test_storage_controller_leadership_transfer( + neon_env_builder: NeonEnvBuilder, + storage_controller_proxy: StorageControllerProxy, + port_distributor: PortDistributor, + step_down_times_out: bool, +): + neon_env_builder.num_pageservers = 3 + + neon_env_builder.storage_controller_config = { + "database_url": f"127.0.0.1:{port_distributor.get_port()}", + "start_as_candidate": True, + } + + neon_env_builder.storage_controller_port_override = storage_controller_proxy.port() + + storage_controller_1_port = port_distributor.get_port() + storage_controller_2_port = port_distributor.get_port() + + storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}") + + env = neon_env_builder.init_configs() + start_env(env, storage_controller_1_port) + + assert ( + env.storage_controller.get_leadership_status() == StorageControllerLeadershipStatus.LEADER + ) + leader = env.storage_controller.get_leader() + assert leader["address"] == f"http://127.0.0.1:{storage_controller_1_port}/" + + if step_down_times_out: + env.storage_controller.configure_failpoints( + ("sleep-on-step-down-handling", "return(10000)") + ) + env.storage_controller.allowed_errors.append(".*request was dropped before completing.*") + + tenant_count = 2 + shard_count = 4 + tenants = set(TenantId.generate() for _ in range(0, tenant_count)) + + for tid in tenants: + env.storage_controller.tenant_create( + tid, shard_count=shard_count, placement_policy={"Attached": 1} + ) + env.storage_controller.reconcile_until_idle() + + env.storage_controller.start( + timeout_in_seconds=30, instance_id=2, base_port=storage_controller_2_port + ) + + if not step_down_times_out: + + def previous_stepped_down(): + assert ( + env.storage_controller.get_leadership_status() + == StorageControllerLeadershipStatus.STEPPED_DOWN + ) + + wait_until(5, 1, previous_stepped_down) + + storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_2_port}") + + def new_becomes_leader(): + assert ( + env.storage_controller.get_leadership_status() + == StorageControllerLeadershipStatus.LEADER + ) + + wait_until(15, 1, new_becomes_leader) + leader = env.storage_controller.get_leader() + assert leader["address"] == f"http://127.0.0.1:{storage_controller_2_port}/" + + env.storage_controller.wait_until_ready() + env.storage_controller.consistency_check() + + if step_down_times_out: + env.storage_controller.allowed_errors.extend( + [ + ".*Leader.*did not respond to step-down request.*", + ".*Send step down request failed.*", + ".*Send step down request still 
failed.*", + ] + ) + + def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvBuilder): # single unsharded tenant, two locations neon_env_builder.num_pageservers = 2 From 25e7d321f474e5cbc5ac53ed42de697a48db50db Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 6 Aug 2024 12:51:05 +0300 Subject: [PATCH 1437/1571] safekeeper: cross check divergence point in ProposerElected handling. Previously, we protected from multiple ProposerElected messages from the same walproposer with the following condition: msg.term == self.get_last_log_term() && self.flush_lsn() > msg.start_streaming_at It is not exhaustive, i.e. we could still proceed to truncating WAL even though safekeeper inserted something since the divergence point has been calculated. While it was most likely safe because walproposer can't use safekeeper position to commit WAL until last_log_term reaches the current walproposer term, let's be more careful and properly calculate the divergence point like walproposer does. --- safekeeper/src/safekeeper.rs | 62 +++++++++++++++++++++++++----------- 1 file changed, 43 insertions(+), 19 deletions(-) diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 33ec39b852..0814d9ba67 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -92,7 +92,7 @@ impl TermHistory { } /// Find point of divergence between leader (walproposer) term history and - /// safekeeper. Arguments are not symmetrics as proposer history ends at + /// safekeeper. Arguments are not symmetric as proposer history ends at /// +infinity while safekeeper at flush_lsn. /// C version is at walproposer SendProposerElected. pub fn find_highest_common_point( @@ -701,7 +701,13 @@ where .with_label_values(&["handle_elected"]) .start_timer(); - info!("received ProposerElected {:?}", msg); + info!( + "received ProposerElected {:?}, term={}, last_log_term={}, flush_lsn={}", + msg, + self.state.acceptor_state.term, + self.get_last_log_term(), + self.flush_lsn() + ); if self.state.acceptor_state.term < msg.term { let mut state = self.state.start_change(); state.acceptor_state.term = msg.term; @@ -713,22 +719,43 @@ where return Ok(None); } - // This might happen in a rare race when another (old) connection from - // the same walproposer writes + flushes WAL after this connection - // already sent flush_lsn in VoteRequest. It is generally safe to - // proceed, but to prevent commit_lsn surprisingly going down we should - // either refuse the session (simpler) or skip the part we already have - // from the stream (can be implemented). - if msg.term == self.get_last_log_term() && self.flush_lsn() > msg.start_streaming_at { - bail!("refusing ProposerElected which is going to overwrite correct WAL: term={}, flush_lsn={}, start_streaming_at={}; restarting the handshake should help", - msg.term, self.flush_lsn(), msg.start_streaming_at) + // Before truncating WAL check-cross the check divergence point received + // from the walproposer. + let sk_th = self.get_term_history(); + let last_common_point = match TermHistory::find_highest_common_point( + &msg.term_history, + &sk_th, + self.flush_lsn(), + ) { + // No common point. Expect streaming from the beginning of the + // history like walproposer while we don't have proper init. 
+ None => *msg.term_history.0.first().ok_or(anyhow::anyhow!( + "empty walproposer term history {:?}", + msg.term_history + ))?, + Some(lcp) => lcp, + }; + // This is expected to happen in a rare race when another connection + // from the same walproposer writes + flushes WAL after this connection + // sent flush_lsn in VoteRequest; for instance, very late + // ProposerElected message delivery after another connection was + // established and wrote WAL. In such cases error is transient; + // reconnection makes safekeeper send newest term history and flush_lsn + // and walproposer recalculates the streaming point. OTOH repeating + // error indicates a serious bug. + if last_common_point.lsn != msg.start_streaming_at { + bail!("refusing ProposerElected with unexpected truncation point: lcp={:?} start_streaming_at={}, term={}, sk_th={:?} flush_lsn={}, wp_th={:?}", + last_common_point, msg.start_streaming_at, + self.state.acceptor_state.term, sk_th, self.flush_lsn(), msg.term_history, + ); } - // Otherwise we must never attempt to truncate committed data. + + // We are also expected to never attempt to truncate committed data. assert!( msg.start_streaming_at >= self.state.inmem.commit_lsn, - "attempt to truncate committed data: start_streaming_at={}, commit_lsn={}", - msg.start_streaming_at, - self.state.inmem.commit_lsn + "attempt to truncate committed data: start_streaming_at={}, commit_lsn={}, term={}, sk_th={:?} flush_lsn={}, wp_th={:?}", + msg.start_streaming_at, self.state.inmem.commit_lsn, + self.state.acceptor_state.term, sk_th, self.flush_lsn(), msg.term_history, ); // Before first WAL write initialize its segment. It makes first segment @@ -743,9 +770,6 @@ where .await?; } - // TODO: cross check divergence point, check if msg.start_streaming_at corresponds to - // intersection of our history and history from msg - // truncate wal, update the LSNs self.wal_store.truncate_wal(msg.start_streaming_at).await?; @@ -1069,7 +1093,7 @@ mod tests { let pem = ProposerElected { term: 1, - start_streaming_at: Lsn(1), + start_streaming_at: Lsn(3), term_history: TermHistory(vec![TermLsn { term: 1, lsn: Lsn(3), From e2d89f7991bc9cea88661e50722a02346b7b6485 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 16 Aug 2024 13:35:02 +0100 Subject: [PATCH 1438/1571] pageserver: prioritize secondary downloads to get most recent layers first, except l0s (#8729) ## Problem When a secondary location is trying to catch up while a tenant is receiving new writes, it can become quite wasteful: - Downloading L0s which are soon destroyed by compaction to L1s - Downloading older layer files which are soon made irrelevant when covered by image layers. ## Summary of changes Sort the layer files in the heatmap: - L0 layers are the lowest priority - Other layers are sorted to download the highest LSNs first. 
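The ordering reduces to a single composite sort key. A minimal, self-contained sketch of the idea (using a simplified `LayerDesc` stand-in rather than the real pageserver types; the authoritative change is the `sort_by_key` call in the `timeline.rs` hunk below):

```rust
use std::cmp::Reverse;

/// Simplified stand-in for a layer description, for illustration only.
struct LayerDesc {
    is_l0: bool,
    lsn_end: u64,
}

/// Order layers for secondary download: non-L0 layers first, highest end LSN
/// first among them; all L0 layers sink to the back of the list.
fn sort_for_download(layers: &mut [LayerDesc]) {
    // `Reverse` flips the ascending sort: `!is_l0` is `true` for non-L0
    // layers and `true > false`, so non-L0 layers come before L0s; within
    // each group, larger `lsn_end` values sort earlier.
    layers.sort_by_key(|l| Reverse((!l.is_l0, l.lsn_end)));
}
```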
--- pageserver/src/tenant/secondary/heatmap.rs | 8 +- pageserver/src/tenant/timeline.rs | 134 +++++++++++++++++++-- 2 files changed, 130 insertions(+), 12 deletions(-) diff --git a/pageserver/src/tenant/secondary/heatmap.rs b/pageserver/src/tenant/secondary/heatmap.rs index 166483ba5d..4a8e66d38a 100644 --- a/pageserver/src/tenant/secondary/heatmap.rs +++ b/pageserver/src/tenant/secondary/heatmap.rs @@ -29,16 +29,16 @@ pub(super) struct HeatMapTenant { #[derive(Serialize, Deserialize)] pub(crate) struct HeatMapTimeline { #[serde_as(as = "DisplayFromStr")] - pub(super) timeline_id: TimelineId, + pub(crate) timeline_id: TimelineId, - pub(super) layers: Vec, + pub(crate) layers: Vec, } #[serde_as] #[derive(Serialize, Deserialize)] pub(crate) struct HeatMapLayer { - pub(super) name: LayerName, - pub(super) metadata: LayerFileMetadata, + pub(crate) name: LayerName, + pub(crate) metadata: LayerFileMetadata, #[serde_as(as = "TimestampSeconds")] pub(super) access_time: SystemTime, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 01e77fa1b1..26dc87c373 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2977,11 +2977,7 @@ impl Timeline { LayerVisibilityHint::Visible => { // Layer is visible to one or more read LSNs: elegible for inclusion in layer map let last_activity_ts = layer.latest_activity(); - Some(HeatMapLayer::new( - layer.layer_desc().layer_name(), - layer.metadata(), - last_activity_ts, - )) + Some((layer.layer_desc(), layer.metadata(), last_activity_ts)) } LayerVisibilityHint::Covered => { // Layer is resident but unlikely to be read: not elegible for inclusion in heatmap. @@ -2990,7 +2986,23 @@ impl Timeline { } }); - let layers = resident.collect(); + let mut layers = resident.collect::>(); + + // Sort layers in order of which to download first. For a large set of layers to download, we + // want to prioritize those layers which are most likely to still be in the resident many minutes + // or hours later: + // - Download L0s last, because they churn the fastest: L0s on a fast-writing tenant might + // only exist for a few minutes before being compacted into L1s. + // - For L1 & image layers, download most recent LSNs first: the older the LSN, the sooner + // the layer is likely to be covered by an image layer during compaction. + layers.sort_by_key(|(desc, _meta, _atime)| { + std::cmp::Reverse((!LayerMap::is_l0(&desc.key_range), desc.lsn_range.end)) + }); + + let layers = layers + .into_iter() + .map(|(desc, meta, atime)| HeatMapLayer::new(desc.layer_name(), meta, atime)) + .collect(); Some(HeatMapTimeline::new(self.timeline_id, layers)) } @@ -4516,6 +4528,7 @@ impl DurationRecorder { /// the layer descriptor requires the user to provide the ranges, which should cover all /// keys specified in the `data` field. 
#[cfg(test)] +#[derive(Clone)] pub struct DeltaLayerTestDesc { pub lsn_range: Range, pub key_range: Range, @@ -4545,6 +4558,13 @@ impl DeltaLayerTestDesc { data, } } + + pub(crate) fn layer_name(&self) -> LayerName { + LayerName::Delta(super::storage_layer::DeltaLayerName { + key_range: self.key_range.clone(), + lsn_range: self.lsn_range.clone(), + }) + } } impl Timeline { @@ -5768,12 +5788,110 @@ fn is_send() { #[cfg(test)] mod tests { + use pageserver_api::key::Key; use utils::{id::TimelineId, lsn::Lsn}; - use crate::tenant::{ - harness::TenantHarness, storage_layer::Layer, timeline::EvictionError, Timeline, + use crate::{ + repository::Value, + tenant::{ + harness::{test_img, TenantHarness}, + layer_map::LayerMap, + storage_layer::{Layer, LayerName}, + timeline::{DeltaLayerTestDesc, EvictionError}, + Timeline, + }, }; + #[tokio::test] + async fn test_heatmap_generation() { + let harness = TenantHarness::create("heatmap_generation").await.unwrap(); + + let covered_delta = DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x10)..Lsn(0x20), + vec![( + Key::from_hex("620000000033333333444444445500000000").unwrap(), + Lsn(0x11), + Value::Image(test_img("foo")), + )], + ); + let visible_delta = DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x10)..Lsn(0x20), + vec![( + Key::from_hex("720000000033333333444444445500000000").unwrap(), + Lsn(0x11), + Value::Image(test_img("foo")), + )], + ); + let l0_delta = DeltaLayerTestDesc::new( + Lsn(0x20)..Lsn(0x30), + Key::from_hex("000000000000000000000000000000000000").unwrap() + ..Key::from_hex("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF").unwrap(), + vec![( + Key::from_hex("720000000033333333444444445500000000").unwrap(), + Lsn(0x25), + Value::Image(test_img("foo")), + )], + ); + let delta_layers = vec![ + covered_delta.clone(), + visible_delta.clone(), + l0_delta.clone(), + ]; + + let image_layer = ( + Lsn(0x40), + vec![( + Key::from_hex("620000000033333333444444445500000000").unwrap(), + test_img("bar"), + )], + ); + let image_layers = vec![image_layer]; + + let (tenant, ctx) = harness.load().await; + let timeline = tenant + .create_test_timeline_with_layers( + TimelineId::generate(), + Lsn(0x10), + 14, + &ctx, + delta_layers, + image_layers, + Lsn(0x100), + ) + .await + .unwrap(); + + // Layer visibility is an input to heatmap generation, so refresh it first + timeline.update_layer_visibility().await.unwrap(); + + let heatmap = timeline + .generate_heatmap() + .await + .expect("Infallible while timeline is not shut down"); + + assert_eq!(heatmap.timeline_id, timeline.timeline_id); + + // L0 should come last + assert_eq!(heatmap.layers.last().unwrap().name, l0_delta.layer_name()); + + let mut last_lsn = Lsn::MAX; + for layer in heatmap.layers { + // Covered layer should be omitted + assert!(layer.name != covered_delta.layer_name()); + + let layer_lsn = match &layer.name { + LayerName::Delta(d) => d.lsn_range.end, + LayerName::Image(i) => i.lsn, + }; + + // Apart from L0s, newest Layers should come first + if !LayerMap::is_l0(layer.name.key_range()) { + assert!(layer_lsn <= last_lsn); + last_lsn = layer_lsn; + } + } + } + #[tokio::test] async fn two_layer_eviction_attempts_at_the_same_time() { let harness = TenantHarness::create("two_layer_eviction_attempts_at_the_same_time") From c6b6b7700a31dc945276ccd091d33373548f518c Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Fri, 16 Aug 2024 11:13:18 -0700 Subject: [PATCH 1439/1571] Fix superuser check in test_snap_files (#8749) ## Problem Current superuser check always passes because it returns 
a tuple like `(False,)`, and then the `if not superuser` passes. ## Summary of changes Fixes the issue by unwrapping the tuple. Verified that it works against a project where I don't have superuser. --- test_runner/performance/test_logical_replication.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index c4e42a7834..077f73ac06 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -282,15 +282,16 @@ def test_snap_files( env = benchmark_project_pub.pgbench_env connstr = benchmark_project_pub.connstr - pg_bin.run_capture(["pgbench", "-i", "-s100"], env=env) with psycopg2.connect(connstr) as conn: conn.autocommit = True with conn.cursor() as cur: cur.execute("SELECT rolsuper FROM pg_roles WHERE rolname = 'neondb_owner'") - is_super = cur.fetchall()[0] + is_super = cur.fetchall()[0][0] assert is_super, "This benchmark won't work if we don't have superuser" + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=env) + conn = psycopg2.connect(connstr) conn.autocommit = True cur = conn.cursor() From 2be69af6c3a595c90f747dabe44fe898b59375c9 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 16 Aug 2024 22:19:44 +0300 Subject: [PATCH 1440/1571] Track holes to be able to reuse them once LFC limit is increased (#8575) ## Problem Multiple increase/decrease LFC limit may cause unlimited growth of LFC file because punched holes while LFC shrinking are not reused when LFC is extended. ## Summary of changes Keep track of holes and reused them when LFC size is increased. ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik --- pgxn/neon/file_cache.c | 117 ++++++++++++++++++++----- pgxn/neon/neon_pgversioncompat.h | 4 + test_runner/regress/test_lfc_resize.py | 28 ++++-- 3 files changed, 119 insertions(+), 30 deletions(-) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 1894e8c72a..479209a537 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -41,6 +41,8 @@ #include "hll.h" +#define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "Assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0) + /* * Local file cache is used to temporary store relations pages in local file system. * All blocks of all relations are stored inside one file and addressed using shared hash map. @@ -51,19 +53,43 @@ * * Cache is always reconstructed at node startup, so we do not need to save mapping somewhere and worry about * its consistency. + + * + * ## Holes + * + * The LFC can be resized on the fly, up to a maximum size that's determined + * at server startup (neon.max_file_cache_size). After server startup, we + * expand the underlying file when needed, until it reaches the soft limit + * (neon.file_cache_size_limit). If the soft limit is later reduced, we shrink + * the LFC by punching holes in the underlying file with a + * fallocate(FALLOC_FL_PUNCH_HOLE) call. 
The nominal size of the file doesn't + * shrink, but the disk space it uses does. + * + * Each hole is tracked by a dummy FileCacheEntry, which are kept in the + * 'holes' linked list. They are entered into the chunk hash table, with a + * special key where the blockNumber is used to store the 'offset' of the + * hole, and all other fields are zero. Holes are never looked up in the hash + * table, we only enter them there to have a FileCacheEntry that we can keep + * in the linked list. If the soft limit is raised again, we reuse the holes + * before extending the nominal size of the file. */ /* Local file storage allocation chunk. - * Should be power of two and not less than 32. Using larger than page chunks can + * Should be power of two. Using larger than page chunks can * 1. Reduce hash-map memory footprint: 8TB database contains billion pages * and size of hash entry is 40 bytes, so we need 40Gb just for hash map. * 1Mb chunks can reduce hash map size to 320Mb. * 2. Improve access locality, subsequent pages will be allocated together improving seqscan speed */ #define BLOCKS_PER_CHUNK 128 /* 1Mb chunk */ +/* + * Smaller chunk seems to be better for OLTP workload + */ +// #define BLOCKS_PER_CHUNK 8 /* 64kb chunk */ #define MB ((uint64)1024*1024) #define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK)) +#define CHUNK_BITMAP_SIZE ((BLOCKS_PER_CHUNK + 31) / 32) typedef struct FileCacheEntry { @@ -71,8 +97,8 @@ typedef struct FileCacheEntry uint32 hash; uint32 offset; uint32 access_count; - uint32 bitmap[BLOCKS_PER_CHUNK / 32]; - dlist_node lru_node; /* LRU list node */ + uint32 bitmap[CHUNK_BITMAP_SIZE]; + dlist_node list_node; /* LRU/holes list node */ } FileCacheEntry; typedef struct FileCacheControl @@ -87,6 +113,7 @@ typedef struct FileCacheControl uint64 writes; dlist_head lru; /* double linked list for LRU replacement * algorithm */ + dlist_head holes; /* double linked list of punched holes */ HyperLogLogState wss_estimation; /* estimation of working set size */ } FileCacheControl; @@ -135,6 +162,7 @@ lfc_disable(char const *op) lfc_ctl->used = 0; lfc_ctl->limit = 0; dlist_init(&lfc_ctl->lru); + dlist_init(&lfc_ctl->holes); if (lfc_desc > 0) { @@ -214,18 +242,18 @@ lfc_shmem_startup(void) if (!found) { int fd; - uint32 lfc_size = SIZE_MB_TO_CHUNKS(lfc_max_size); + uint32 n_chunks = SIZE_MB_TO_CHUNKS(lfc_max_size); lfc_lock = (LWLockId) GetNamedLWLockTranche("lfc_lock"); info.keysize = sizeof(BufferTag); info.entrysize = sizeof(FileCacheEntry); /* - * lfc_size+1 because we add new element to hash table before eviction + * n_chunks+1 because we add new element to hash table before eviction * of victim */ lfc_hash = ShmemInitHash("lfc_hash", - lfc_size + 1, lfc_size + 1, + n_chunks + 1, n_chunks + 1, &info, HASH_ELEM | HASH_BLOBS); lfc_ctl->generation = 0; @@ -235,6 +263,7 @@ lfc_shmem_startup(void) lfc_ctl->misses = 0; lfc_ctl->writes = 0; dlist_init(&lfc_ctl->lru); + dlist_init(&lfc_ctl->holes); /* Initialize hyper-log-log structure for estimating working set size */ initSHLL(&lfc_ctl->wss_estimation); @@ -310,14 +339,31 @@ lfc_change_limit_hook(int newval, void *extra) * Shrink cache by throwing away least recently accessed chunks and * returning their space to file system */ - FileCacheEntry *victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru)); + FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru)); + FileCacheEntry *hole; + uint32 offset = victim->offset; + uint32 hash; 
+ bool found; + BufferTag holetag; - Assert(victim->access_count == 0); + CriticalAssert(victim->access_count == 0); #ifdef FALLOC_FL_PUNCH_HOLE if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, (off_t) victim->offset * BLOCKS_PER_CHUNK * BLCKSZ, BLOCKS_PER_CHUNK * BLCKSZ) < 0) neon_log(LOG, "Failed to punch hole in file: %m"); #endif + /* We remove the old entry, and re-enter a hole to the hash table */ hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL); + + memset(&holetag, 0, sizeof(holetag)); + holetag.blockNum = offset; + hash = get_hash_value(lfc_hash, &holetag); + hole = hash_search_with_hash_value(lfc_hash, &holetag, hash, HASH_ENTER, &found); + hole->hash = hash; + hole->offset = offset; + hole->access_count = 0; + CriticalAssert(!found); + dlist_push_tail(&lfc_ctl->holes, &hole->list_node); + lfc_ctl->used -= 1; } lfc_ctl->limit = new_size; @@ -409,6 +455,8 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) CopyNRelFileInfoToBufTag(tag, rinfo); tag.forkNum = forkNum; tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); + + CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); hash = get_hash_value(lfc_hash, &tag); LWLockAcquire(lfc_lock, LW_SHARED); @@ -440,6 +488,7 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) tag.forkNum = forkNum; tag.blockNum = (blkno & ~(BLOCKS_PER_CHUNK - 1)); + CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); hash = get_hash_value(lfc_hash, &tag); LWLockAcquire(lfc_lock, LW_EXCLUSIVE); @@ -470,7 +519,7 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) { bool has_remaining_pages; - for (int i = 0; i < (BLOCKS_PER_CHUNK / 32); i++) + for (int i = 0; i < CHUNK_BITMAP_SIZE; i++) { if (entry->bitmap[i] != 0) { @@ -485,8 +534,8 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) */ if (!has_remaining_pages) { - dlist_delete(&entry->lru_node); - dlist_push_head(&lfc_ctl->lru, &entry->lru_node); + dlist_delete(&entry->list_node); + dlist_push_head(&lfc_ctl->lru, &entry->list_node); } } @@ -525,6 +574,8 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, CopyNRelFileInfoToBufTag(tag, rinfo); tag.forkNum = forkNum; tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); + + CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); hash = get_hash_value(lfc_hash, &tag); LWLockAcquire(lfc_lock, LW_EXCLUSIVE); @@ -551,7 +602,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, } /* Unlink entry from LRU list to pin it for the duration of IO operation */ if (entry->access_count++ == 0) - dlist_delete(&entry->lru_node); + dlist_delete(&entry->list_node); generation = lfc_ctl->generation; entry_offset = entry->offset; @@ -569,12 +620,12 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, if (lfc_ctl->generation == generation) { - Assert(LFC_ENABLED()); + CriticalAssert(LFC_ENABLED()); lfc_ctl->hits += 1; pgBufferUsage.file_cache.hits += 1; - Assert(entry->access_count > 0); + CriticalAssert(entry->access_count > 0); if (--entry->access_count == 0) - dlist_push_tail(&lfc_ctl->lru, &entry->lru_node); + dlist_push_tail(&lfc_ctl->lru, &entry->list_node); } else result = false; @@ -613,6 +664,8 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void tag.forkNum = forkNum; tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); CopyNRelFileInfoToBufTag(tag, rinfo); + + CriticalAssert(BufTagGetRelNumber(&tag) != 
InvalidRelFileNumber); hash = get_hash_value(lfc_hash, &tag); LWLockAcquire(lfc_lock, LW_EXCLUSIVE); @@ -632,7 +685,7 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void * operation */ if (entry->access_count++ == 0) - dlist_delete(&entry->lru_node); + dlist_delete(&entry->list_node); } else { @@ -655,13 +708,26 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void if (lfc_ctl->used >= lfc_ctl->limit && !dlist_is_empty(&lfc_ctl->lru)) { /* Cache overflow: evict least recently used chunk */ - FileCacheEntry *victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru)); + FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru)); - Assert(victim->access_count == 0); + CriticalAssert(victim->access_count == 0); entry->offset = victim->offset; /* grab victim's chunk */ hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL); neon_log(DEBUG2, "Swap file cache page"); } + else if (!dlist_is_empty(&lfc_ctl->holes)) + { + /* We can reuse a hole that was left behind when the LFC was shrunk previously */ + FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->holes)); + uint32 offset = hole->offset; + bool found; + + hash_search_with_hash_value(lfc_hash, &hole->key, hole->hash, HASH_REMOVE, &found); + CriticalAssert(found); + + lfc_ctl->used += 1; + entry->offset = offset; /* reuse the hole */ + } else { lfc_ctl->used += 1; @@ -689,11 +755,11 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void if (lfc_ctl->generation == generation) { - Assert(LFC_ENABLED()); + CriticalAssert(LFC_ENABLED()); /* Place entry to the head of LRU list */ - Assert(entry->access_count > 0); + CriticalAssert(entry->access_count > 0); if (--entry->access_count == 0) - dlist_push_tail(&lfc_ctl->lru, &entry->lru_node); + dlist_push_tail(&lfc_ctl->lru, &entry->list_node); entry->bitmap[chunk_offs >> 5] |= (1 << (chunk_offs & 31)); } @@ -708,7 +774,6 @@ typedef struct } NeonGetStatsCtx; #define NUM_NEON_GET_STATS_COLS 2 -#define NUM_NEON_GET_STATS_ROWS 3 PG_FUNCTION_INFO_V1(neon_get_lfc_stats); Datum @@ -744,7 +809,6 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS) INT8OID, -1, 0); fctx->tupdesc = BlessTupleDesc(tupledesc); - funcctx->max_calls = NUM_NEON_GET_STATS_ROWS; funcctx->user_fctx = fctx; /* Return to original context when allocating transient memory */ @@ -778,6 +842,11 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS) if (lfc_ctl) value = lfc_ctl->writes; break; + case 4: + key = "file_cache_size"; + if (lfc_ctl) + value = lfc_ctl->size; + break; default: SRF_RETURN_DONE(funcctx); } @@ -901,7 +970,7 @@ local_cache_pages(PG_FUNCTION_ARGS) hash_seq_init(&status, lfc_hash); while ((entry = hash_seq_search(&status)) != NULL) { - for (int i = 0; i < BLOCKS_PER_CHUNK / 32; i++) + for (int i = 0; i < CHUNK_BITMAP_SIZE; i++) n_pages += pg_popcount32(entry->bitmap[i]); } } diff --git a/pgxn/neon/neon_pgversioncompat.h b/pgxn/neon/neon_pgversioncompat.h index f19732cbbb..addb6ccce6 100644 --- a/pgxn/neon/neon_pgversioncompat.h +++ b/pgxn/neon/neon_pgversioncompat.h @@ -54,6 +54,10 @@ #define BufTagGetNRelFileInfo(tag) tag.rnode +#define BufTagGetRelNumber(tagp) ((tagp)->rnode.relNode) + +#define InvalidRelFileNumber InvalidOid + #define SMgrRelGetRelInfo(reln) \ (reln->smgr_rnode.node) diff --git a/test_runner/regress/test_lfc_resize.py b/test_runner/regress/test_lfc_resize.py index 2a3442448a..1b2c7f808f 100644 --- 
a/test_runner/regress/test_lfc_resize.py +++ b/test_runner/regress/test_lfc_resize.py @@ -1,3 +1,7 @@ +import os +import random +import re +import subprocess import threading import time @@ -17,17 +21,17 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin): "test_lfc_resize", config_lines=[ "neon.file_cache_path='file.cache'", - "neon.max_file_cache_size=1GB", - "neon.file_cache_size_limit=1GB", + "neon.max_file_cache_size=512MB", + "neon.file_cache_size_limit=512MB", ], ) n_resize = 10 - scale = 10 + scale = 100 def run_pgbench(connstr: str): log.info(f"Start a pgbench workload on pg {connstr}") pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr]) - pg_bin.run_capture(["pgbench", "-c4", f"-T{n_resize}", "-Mprepared", connstr]) + pg_bin.run_capture(["pgbench", "-c10", f"-T{n_resize}", "-Mprepared", "-S", connstr]) thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True) thread.start() @@ -35,9 +39,21 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin): conn = endpoint.connect() cur = conn.cursor() - for i in range(n_resize): - cur.execute(f"alter system set neon.file_cache_size_limit='{i*10}MB'") + for _ in range(n_resize): + size = random.randint(1, 512) + cur.execute(f"alter system set neon.file_cache_size_limit='{size}MB'") cur.execute("select pg_reload_conf()") time.sleep(1) + cur.execute("alter system set neon.file_cache_size_limit='100MB'") + cur.execute("select pg_reload_conf()") + thread.join() + + lfc_file_path = f"{endpoint.pg_data_dir_path()}/file.cache" + lfc_file_size = os.path.getsize(lfc_file_path) + res = subprocess.run(["ls", "-sk", lfc_file_path], check=True, text=True, capture_output=True) + lfc_file_blocks = re.findall("([0-9A-F]+)", res.stdout)[0] + log.info(f"Size of LFC file {lfc_file_size}, blocks {lfc_file_blocks}") + assert lfc_file_size <= 512 * 1024 * 1024 + assert int(lfc_file_blocks) <= 128 * 1024 From 7131ac4730f7268a8624a9c7345c23938cc8b6a6 Mon Sep 17 00:00:00 2001 From: Yuchen Liang <70461588+yliang412@users.noreply.github.com> Date: Sat, 17 Aug 2024 11:48:53 -0400 Subject: [PATCH 1441/1571] refactor(scrubber): add unified command suitable for cron job (#8635) Part of #8128. ## Description This PR creates a unified command to run both physical gc and metadata health check as a cron job. This also enables us to add additional tasks to the cron job in the future. 
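In outline, the new `cron-job` subcommand simply chains the two existing operations in a fixed order. A condensed sketch of that flow (argument parsing, logging and error context trimmed; the full implementation is `run_cron_job` in `storage_scrubber/src/main.rs` in the diff below):

```rust
// Condensed restatement of the cron-job flow added by this patch.
async fn cron_job_sketch(
    bucket_config: BucketConfig,
    controller_client: Option<&control_api::Client>,
    gc_min_age: humantime::Duration,
    gc_mode: GcMode,
    post_to_storcon: bool,
) -> anyhow::Result<()> {
    // 1. Physical GC first: remove stale index files (and, in Full mode,
    //    ancestor layers) before inspecting metadata health.
    pageserver_physical_gc_cmd(&bucket_config, controller_client, Vec::new(), gc_min_age, gc_mode)
        .await?;
    // 2. Then scan pageserver metadata; with `--post`, the health report is
    //    sent to the storage controller.
    scan_pageserver_metadata_cmd(bucket_config, controller_client, Vec::new(), true, post_to_storcon)
        .await
}
```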
Signed-off-by: Yuchen Liang --- storage_scrubber/src/main.rs | 240 ++++++++++++------ .../src/pageserver_physical_gc.rs | 16 +- .../src/scan_pageserver_metadata.rs | 2 +- 3 files changed, 175 insertions(+), 83 deletions(-) diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs index cbc836755a..3935e513e3 100644 --- a/storage_scrubber/src/main.rs +++ b/storage_scrubber/src/main.rs @@ -3,9 +3,10 @@ use camino::Utf8PathBuf; use pageserver_api::controller_api::{MetadataHealthUpdateRequest, MetadataHealthUpdateResponse}; use pageserver_api::shard::TenantShardId; use reqwest::{Method, Url}; +use storage_controller_client::control_api; use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; use storage_scrubber::pageserver_physical_gc::GcMode; -use storage_scrubber::scan_pageserver_metadata::scan_metadata; +use storage_scrubber::scan_pageserver_metadata::scan_pageserver_metadata; use storage_scrubber::tenant_snapshot::SnapshotDownloader; use storage_scrubber::{find_large_objects, ControllerClientConfig}; use storage_scrubber::{ @@ -68,7 +69,7 @@ enum Command { #[arg(long = "tenant-id", num_args = 0..)] tenant_ids: Vec, #[arg(long = "post", default_value_t = false)] - post_to_storage_controller: bool, + post_to_storcon: bool, #[arg(long, default_value = None)] /// For safekeeper node_kind only, points to db with debug dump dump_db_connstr: Option, @@ -100,6 +101,16 @@ enum Command { #[arg(long = "concurrency", short = 'j', default_value_t = 64)] concurrency: usize, }, + CronJob { + // PageserverPhysicalGc + #[arg(long = "min-age")] + gc_min_age: humantime::Duration, + #[arg(short, long, default_value_t = GcMode::IndicesOnly)] + gc_mode: GcMode, + // ScanMetadata + #[arg(long = "post", default_value_t = false)] + post_to_storcon: bool, + }, } #[tokio::main] @@ -117,6 +128,7 @@ async fn main() -> anyhow::Result<()> { Command::TenantSnapshot { .. } => "tenant-snapshot", Command::PageserverPhysicalGc { .. } => "pageserver-physical-gc", Command::FindLargeObjects { .. } => "find-large-objects", + Command::CronJob { .. 
} => "cron-job", }; let _guard = init_logging(&format!( "{}_{}_{}_{}.log", @@ -126,12 +138,13 @@ async fn main() -> anyhow::Result<()> { chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S") )); - let controller_client_conf = cli.controller_api.map(|controller_api| { + let controller_client = cli.controller_api.map(|controller_api| { ControllerClientConfig { controller_api, // Default to no key: this is a convenience when working in a development environment controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()), } + .build_client() }); match cli.command { @@ -139,7 +152,7 @@ async fn main() -> anyhow::Result<()> { json, tenant_ids, node_kind, - post_to_storage_controller, + post_to_storcon, dump_db_connstr, dump_db_table, } => { @@ -178,53 +191,14 @@ async fn main() -> anyhow::Result<()> { } Ok(()) } else { - if controller_client_conf.is_none() && post_to_storage_controller { - return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run")); - } - match scan_metadata(bucket_config.clone(), tenant_ids).await { - Err(e) => { - tracing::error!("Failed: {e}"); - Err(e) - } - Ok(summary) => { - if json { - println!("{}", serde_json::to_string(&summary).unwrap()) - } else { - println!("{}", summary.summary_string()); - } - - if post_to_storage_controller { - if let Some(conf) = controller_client_conf { - let controller_client = conf.build_client(); - let body = summary.build_health_update_request(); - controller_client - .dispatch::( - Method::POST, - "control/v1/metadata_health/update".to_string(), - Some(body), - ) - .await?; - } - } - - if summary.is_fatal() { - tracing::error!("Fatal scrub errors detected"); - } else if summary.is_empty() { - // Strictly speaking an empty bucket is a valid bucket, but if someone ran the - // scrubber they were likely expecting to scan something, and if we see no timelines - // at all then it's likely due to some configuration issues like a bad prefix - tracing::error!( - "No timelines found in bucket {} prefix {}", - bucket_config.bucket, - bucket_config - .prefix_in_bucket - .unwrap_or("".to_string()) - ); - } - - Ok(()) - } - } + scan_pageserver_metadata_cmd( + bucket_config, + controller_client.as_ref(), + tenant_ids, + json, + post_to_storcon, + ) + .await } } Command::FindGarbage { @@ -254,31 +228,14 @@ async fn main() -> anyhow::Result<()> { min_age, mode, } => { - match (&controller_client_conf, mode) { - (Some(_), _) => { - // Any mode may run when controller API is set - } - (None, GcMode::Full) => { - // The part of physical GC where we erase ancestor layers cannot be done safely without - // confirming the most recent complete shard split with the controller. Refuse to run, rather - // than doing it unsafely. - return Err(anyhow!("Full physical GC requires `--controller-api` and `--controller-jwt` to run")); - } - (None, GcMode::DryRun | GcMode::IndicesOnly) => { - // These GcModes do not require the controller to run. 
- } - } - - let summary = pageserver_physical_gc( - bucket_config, - controller_client_conf, + pageserver_physical_gc_cmd( + &bucket_config, + controller_client.as_ref(), tenant_ids, - min_age.into(), + min_age, mode, ) - .await?; - println!("{}", serde_json::to_string(&summary).unwrap()); - Ok(()) + .await } Command::FindLargeObjects { min_size, @@ -295,5 +252,142 @@ async fn main() -> anyhow::Result<()> { println!("{}", serde_json::to_string(&summary).unwrap()); Ok(()) } + Command::CronJob { + gc_min_age, + gc_mode, + post_to_storcon, + } => { + run_cron_job( + bucket_config, + controller_client.as_ref(), + gc_min_age, + gc_mode, + post_to_storcon, + ) + .await + } + } +} + +/// Runs the scrubber cron job. +/// 1. Do pageserver physical gc +/// 2. Scan pageserver metadata +pub async fn run_cron_job( + bucket_config: BucketConfig, + controller_client: Option<&control_api::Client>, + gc_min_age: humantime::Duration, + gc_mode: GcMode, + post_to_storcon: bool, +) -> anyhow::Result<()> { + tracing::info!(%gc_min_age, %gc_mode, "Running pageserver-physical-gc"); + pageserver_physical_gc_cmd( + &bucket_config, + controller_client, + Vec::new(), + gc_min_age, + gc_mode, + ) + .await?; + tracing::info!(%post_to_storcon, node_kind = %NodeKind::Pageserver, "Running scan-metadata"); + scan_pageserver_metadata_cmd( + bucket_config, + controller_client, + Vec::new(), + true, + post_to_storcon, + ) + .await?; + + Ok(()) +} + +pub async fn pageserver_physical_gc_cmd( + bucket_config: &BucketConfig, + controller_client: Option<&control_api::Client>, + tenant_shard_ids: Vec, + min_age: humantime::Duration, + mode: GcMode, +) -> anyhow::Result<()> { + match (controller_client, mode) { + (Some(_), _) => { + // Any mode may run when controller API is set + } + (None, GcMode::Full) => { + // The part of physical GC where we erase ancestor layers cannot be done safely without + // confirming the most recent complete shard split with the controller. Refuse to run, rather + // than doing it unsafely. + return Err(anyhow!( + "Full physical GC requires `--controller-api` and `--controller-jwt` to run" + )); + } + (None, GcMode::DryRun | GcMode::IndicesOnly) => { + // These GcModes do not require the controller to run. 
+ } + } + + let summary = pageserver_physical_gc( + bucket_config, + controller_client, + tenant_shard_ids, + min_age.into(), + mode, + ) + .await?; + println!("{}", serde_json::to_string(&summary).unwrap()); + Ok(()) +} + +pub async fn scan_pageserver_metadata_cmd( + bucket_config: BucketConfig, + controller_client: Option<&control_api::Client>, + tenant_shard_ids: Vec, + json: bool, + post_to_storcon: bool, +) -> anyhow::Result<()> { + if controller_client.is_none() && post_to_storcon { + return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run")); + } + match scan_pageserver_metadata(bucket_config.clone(), tenant_shard_ids).await { + Err(e) => { + tracing::error!("Failed: {e}"); + Err(e) + } + Ok(summary) => { + if json { + println!("{}", serde_json::to_string(&summary).unwrap()) + } else { + println!("{}", summary.summary_string()); + } + + if post_to_storcon { + if let Some(client) = controller_client { + let body = summary.build_health_update_request(); + client + .dispatch::( + Method::POST, + "control/v1/metadata_health/update".to_string(), + Some(body), + ) + .await?; + } + } + + if summary.is_fatal() { + tracing::error!("Fatal scrub errors detected"); + } else if summary.is_empty() { + // Strictly speaking an empty bucket is a valid bucket, but if someone ran the + // scrubber they were likely expecting to scan something, and if we see no timelines + // at all then it's likely due to some configuration issues like a bad prefix + tracing::error!( + "No timelines found in bucket {} prefix {}", + bucket_config.bucket, + bucket_config + .prefix_in_bucket + .unwrap_or("".to_string()) + ); + } + + Ok(()) + } } } diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index c8b1ed49f4..20d9bd6dd4 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -4,9 +4,7 @@ use std::time::{Duration, SystemTime}; use crate::checks::{list_timeline_blobs, BlobDataParseResult}; use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; -use crate::{ - init_remote, BucketConfig, ControllerClientConfig, NodeKind, RootTarget, TenantShardTimelineId, -}; +use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId}; use aws_sdk_s3::Client; use futures_util::{StreamExt, TryStreamExt}; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; @@ -473,8 +471,8 @@ async fn gc_ancestor( /// This type of GC is not necessary for correctness: rather it serves to reduce wasted storage capacity, and /// make sure that object listings don't get slowed down by large numbers of garbage objects. 
pub async fn pageserver_physical_gc( - bucket_config: BucketConfig, - controller_client_conf: Option, + bucket_config: &BucketConfig, + controller_client: Option<&control_api::Client>, tenant_shard_ids: Vec, min_age: Duration, mode: GcMode, @@ -558,7 +556,7 @@ pub async fn pageserver_physical_gc( let timelines = timelines.map_ok(|ttid| { gc_timeline( &s3_client, - &bucket_config, + bucket_config, &min_age, &target, mode, @@ -574,7 +572,7 @@ pub async fn pageserver_physical_gc( } // Execute cross-shard GC, using the accumulator's full view of all the shards built in the per-shard GC - let Some(controller_client) = controller_client_conf.map(|c| c.build_client()) else { + let Some(client) = controller_client else { tracing::info!("Skipping ancestor layer GC, because no `--controller-api` was specified"); return Ok(summary); }; @@ -583,13 +581,13 @@ pub async fn pageserver_physical_gc( .unwrap() .into_inner() .unwrap() - .into_gc_ancestors(&controller_client, &mut summary) + .into_gc_ancestors(client, &mut summary) .await; for ancestor_shard in ancestor_shards { gc_ancestor( &s3_client, - &bucket_config, + bucket_config, &target, &min_age, ancestor_shard, diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs index b9630056e1..2409b7b132 100644 --- a/storage_scrubber/src/scan_pageserver_metadata.rs +++ b/storage_scrubber/src/scan_pageserver_metadata.rs @@ -116,7 +116,7 @@ Index versions: {version_summary} } /// Scan the pageserver metadata in an S3 bucket, reporting errors and statistics. -pub async fn scan_metadata( +pub async fn scan_pageserver_metadata( bucket_config: BucketConfig, tenant_ids: Vec, ) -> anyhow::Result { From 188bde7f0776636310260cbf636922d1029add7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Sun, 18 Aug 2024 19:32:10 +0200 Subject: [PATCH 1442/1571] Default image compression to zstd at level 1 (#8677) After the rollout has succeeded, we now set the default image compression to be enabled. We also remove its explicit mention from `neon_fixtures.py` added in #8368 as it is now the default (and we switch to `zstd(1)` which is a bit nicer on CPU time). 
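For reference, the new default is now expressed as the string `"zstd(1)"` and parsed into `ImageCompressionAlgorithm` when the config is built; a minimal illustrative sketch of what that means (the helper function name below is ours, not part of the patch):

```rust
use pageserver_api::models::ImageCompressionAlgorithm;

// "zstd(1)" means zstd at compression level 1; `Disabled` is no longer the default.
// The same `.parse()` path is what the defaults above use.
fn default_image_compression() -> ImageCompressionAlgorithm {
    "zstd(1)"
        .parse()
        .expect("the built-in default must always parse")
}
```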
Part of https://github.com/neondatabase/neon/issues/5431 --- pageserver/src/config.rs | 12 +++++------- pageserver/src/tenant/storage_layer/split_writer.rs | 7 ++++++- test_runner/fixtures/neon_fixtures.py | 1 - 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 3ac5ac539f..0ebaf78840 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -50,7 +50,6 @@ pub mod defaults { DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR, DEFAULT_PG_LISTEN_PORT, }; - use pageserver_api::models::ImageCompressionAlgorithm; pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT; pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s"; @@ -90,8 +89,7 @@ pub mod defaults { pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB - pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm = - ImageCompressionAlgorithm::Disabled; + pub const DEFAULT_IMAGE_COMPRESSION: &str = "zstd(1)"; pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false; @@ -478,7 +476,7 @@ impl PageServerConfigBuilder { max_vectored_read_bytes: Set(MaxVectoredReadBytes( NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(), )), - image_compression: Set(DEFAULT_IMAGE_COMPRESSION), + image_compression: Set(DEFAULT_IMAGE_COMPRESSION.parse().unwrap()), ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), l0_flush: Set(L0FlushConfig::default()), compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()), @@ -1065,7 +1063,7 @@ impl PageServerConf { NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) .expect("Invalid default constant"), ), - image_compression: defaults::DEFAULT_IMAGE_COMPRESSION, + image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(), ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, l0_flush: L0FlushConfig::default(), compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), @@ -1305,7 +1303,7 @@ background_task_maximum_delay = '334 s' NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) .expect("Invalid default constant") ), - image_compression: defaults::DEFAULT_IMAGE_COMPRESSION, + image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(), ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, l0_flush: L0FlushConfig::default(), compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), @@ -1378,7 +1376,7 @@ background_task_maximum_delay = '334 s' NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) .expect("Invalid default constant") ), - image_compression: defaults::DEFAULT_IMAGE_COMPRESSION, + image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(), ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, l0_flush: L0FlushConfig::default(), compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), diff --git a/pageserver/src/tenant/storage_layer/split_writer.rs b/pageserver/src/tenant/storage_layer/split_writer.rs index d7bfe48c60..e12e29cd45 100644 --- a/pageserver/src/tenant/storage_layer/split_writer.rs +++ b/pageserver/src/tenant/storage_layer/split_writer.rs @@ -208,6 +208,8 @@ impl SplitDeltaLayerWriter { #[cfg(test)] mod tests { + use rand::{RngCore, SeedableRng}; + use crate::{ tenant::{ harness::{TenantHarness, TIMELINE_ID}, @@ -229,7 +231,10 @@ mod tests { } fn get_large_img() -> Bytes { - vec![0; 8192].into() + let mut rng = 
rand::rngs::SmallRng::seed_from_u64(42); + let mut data = vec![0; 8192]; + rng.fill_bytes(&mut data); + data.into() } #[tokio::test] diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index ec5a83601e..ba6fbc003a 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1162,7 +1162,6 @@ class NeonEnv: "listen_http_addr": f"localhost:{pageserver_port.http}", "pg_auth_type": pg_auth_type, "http_auth_type": http_auth_type, - "image_compression": "zstd", } if self.pageserver_virtual_file_io_engine is not None: ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine From f246aa3ca7f19993e0582dfd8069375c09c5158c Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Mon, 19 Aug 2024 10:33:46 +0200 Subject: [PATCH 1443/1571] proxy: Fix some warnings by extended clippy checks (#8748) * Missing blank lifetimes which is now deprecated. * Matching off unqualified enum variants that could act like variable. * Missing semicolons. --- proxy/src/auth.rs | 46 +++-- proxy/src/auth/backend.rs | 72 +++----- proxy/src/auth/backend/jwt.rs | 6 +- proxy/src/auth/credentials.rs | 17 +- proxy/src/cache/common.rs | 2 +- proxy/src/cache/timed_lru.rs | 2 +- proxy/src/compute.rs | 23 ++- proxy/src/config.rs | 6 +- proxy/src/console/messages.rs | 22 ++- proxy/src/console/mgmt.rs | 3 +- proxy/src/console/provider.rs | 170 +++++++++--------- proxy/src/context.rs | 8 +- proxy/src/context/parquet.rs | 2 +- proxy/src/intern.rs | 2 +- proxy/src/metrics.rs | 2 +- proxy/src/proxy/copy_bidirectional.rs | 2 +- proxy/src/proxy/handshake.rs | 15 +- proxy/src/proxy/tests/mitm.rs | 4 +- proxy/src/rate_limiter/limit_algorithm.rs | 4 +- .../src/rate_limiter/limit_algorithm/aimd.rs | 5 +- .../connection_with_credentials_provider.rs | 2 +- proxy/src/redis/notifications.rs | 23 ++- proxy/src/sasl.rs | 5 +- proxy/src/sasl/channel_binding.rs | 21 +-- proxy/src/sasl/messages.rs | 5 +- proxy/src/scram.rs | 4 +- proxy/src/scram/countmin.rs | 2 - proxy/src/scram/exchange.rs | 22 +-- proxy/src/scram/messages.rs | 6 +- proxy/src/scram/pbkdf2.rs | 2 +- proxy/src/scram/threadpool.rs | 4 +- proxy/src/serverless.rs | 4 +- proxy/src/serverless/conn_pool.rs | 6 +- proxy/src/stream.rs | 2 +- proxy/src/url.rs | 2 +- proxy/src/waiters.rs | 2 +- 36 files changed, 246 insertions(+), 279 deletions(-) diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index 8c44823c98..3b3c571129 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -113,38 +113,36 @@ impl> From for AuthError { impl UserFacingError for AuthError { fn to_string_client(&self) -> String { - use AuthErrorImpl::*; match self.0.as_ref() { - Link(e) => e.to_string_client(), - GetAuthInfo(e) => e.to_string_client(), - Sasl(e) => e.to_string_client(), - AuthFailed(_) => self.to_string(), - BadAuthMethod(_) => self.to_string(), - MalformedPassword(_) => self.to_string(), - MissingEndpointName => self.to_string(), - Io(_) => "Internal error".to_string(), - IpAddressNotAllowed(_) => self.to_string(), - TooManyConnections => self.to_string(), - UserTimeout(_) => self.to_string(), + AuthErrorImpl::Link(e) => e.to_string_client(), + AuthErrorImpl::GetAuthInfo(e) => e.to_string_client(), + AuthErrorImpl::Sasl(e) => e.to_string_client(), + AuthErrorImpl::AuthFailed(_) => self.to_string(), + AuthErrorImpl::BadAuthMethod(_) => self.to_string(), + AuthErrorImpl::MalformedPassword(_) => self.to_string(), + AuthErrorImpl::MissingEndpointName => self.to_string(), + AuthErrorImpl::Io(_) => "Internal 
error".to_string(), + AuthErrorImpl::IpAddressNotAllowed(_) => self.to_string(), + AuthErrorImpl::TooManyConnections => self.to_string(), + AuthErrorImpl::UserTimeout(_) => self.to_string(), } } } impl ReportableError for AuthError { fn get_error_kind(&self) -> crate::error::ErrorKind { - use AuthErrorImpl::*; match self.0.as_ref() { - Link(e) => e.get_error_kind(), - GetAuthInfo(e) => e.get_error_kind(), - Sasl(e) => e.get_error_kind(), - AuthFailed(_) => crate::error::ErrorKind::User, - BadAuthMethod(_) => crate::error::ErrorKind::User, - MalformedPassword(_) => crate::error::ErrorKind::User, - MissingEndpointName => crate::error::ErrorKind::User, - Io(_) => crate::error::ErrorKind::ClientDisconnect, - IpAddressNotAllowed(_) => crate::error::ErrorKind::User, - TooManyConnections => crate::error::ErrorKind::RateLimit, - UserTimeout(_) => crate::error::ErrorKind::User, + AuthErrorImpl::Link(e) => e.get_error_kind(), + AuthErrorImpl::GetAuthInfo(e) => e.get_error_kind(), + AuthErrorImpl::Sasl(e) => e.get_error_kind(), + AuthErrorImpl::AuthFailed(_) => crate::error::ErrorKind::User, + AuthErrorImpl::BadAuthMethod(_) => crate::error::ErrorKind::User, + AuthErrorImpl::MalformedPassword(_) => crate::error::ErrorKind::User, + AuthErrorImpl::MissingEndpointName => crate::error::ErrorKind::User, + AuthErrorImpl::Io(_) => crate::error::ErrorKind::ClientDisconnect, + AuthErrorImpl::IpAddressNotAllowed(_) => crate::error::ErrorKind::User, + AuthErrorImpl::TooManyConnections => crate::error::ErrorKind::RateLimit, + AuthErrorImpl::UserTimeout(_) => crate::error::ErrorKind::User, } } } diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index c6a0b2af5a..7592d076ec 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -80,9 +80,8 @@ pub trait TestBackend: Send + Sync + 'static { impl std::fmt::Display for BackendType<'_, (), ()> { fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - use BackendType::*; match self { - Console(api, _) => match &**api { + Self::Console(api, _) => match &**api { ConsoleBackend::Console(endpoint) => { fmt.debug_tuple("Console").field(&endpoint.url()).finish() } @@ -93,7 +92,7 @@ impl std::fmt::Display for BackendType<'_, (), ()> { #[cfg(test)] ConsoleBackend::Test(_) => fmt.debug_tuple("Test").finish(), }, - Link(url, _) => fmt.debug_tuple("Link").field(&url.as_str()).finish(), + Self::Link(url, _) => fmt.debug_tuple("Link").field(&url.as_str()).finish(), } } } @@ -102,10 +101,9 @@ impl BackendType<'_, T, D> { /// Very similar to [`std::option::Option::as_ref`]. /// This helps us pass structured config to async tasks. pub fn as_ref(&self) -> BackendType<'_, &T, &D> { - use BackendType::*; match self { - Console(c, x) => Console(MaybeOwned::Borrowed(c), x), - Link(c, x) => Link(MaybeOwned::Borrowed(c), x), + Self::Console(c, x) => BackendType::Console(MaybeOwned::Borrowed(c), x), + Self::Link(c, x) => BackendType::Link(MaybeOwned::Borrowed(c), x), } } } @@ -115,10 +113,9 @@ impl<'a, T, D> BackendType<'a, T, D> { /// Maps [`BackendType`] to [`BackendType`] by applying /// a function to a contained value. pub fn map(self, f: impl FnOnce(T) -> R) -> BackendType<'a, R, D> { - use BackendType::*; match self { - Console(c, x) => Console(c, f(x)), - Link(c, x) => Link(c, x), + Self::Console(c, x) => BackendType::Console(c, f(x)), + Self::Link(c, x) => BackendType::Link(c, x), } } } @@ -126,10 +123,9 @@ impl<'a, T, D, E> BackendType<'a, Result, D> { /// Very similar to [`std::option::Option::transpose`]. 
/// This is most useful for error handling. pub fn transpose(self) -> Result, E> { - use BackendType::*; match self { - Console(c, x) => x.map(|x| Console(c, x)), - Link(c, x) => Ok(Link(c, x)), + Self::Console(c, x) => x.map(|x| BackendType::Console(c, x)), + Self::Link(c, x) => Ok(BackendType::Link(c, x)), } } } @@ -293,7 +289,9 @@ async fn auth_quirks( ctx.set_endpoint_id(res.info.endpoint.clone()); let password = match res.keys { ComputeCredentialKeys::Password(p) => p, - _ => unreachable!("password hack should return a password"), + ComputeCredentialKeys::AuthKeys(_) => { + unreachable!("password hack should return a password") + } }; (res.info, Some(password)) } @@ -400,21 +398,17 @@ async fn authenticate_with_secret( impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { /// Get compute endpoint name from the credentials. pub fn get_endpoint(&self) -> Option { - use BackendType::*; - match self { - Console(_, user_info) => user_info.endpoint_id.clone(), - Link(_, _) => Some("link".into()), + Self::Console(_, user_info) => user_info.endpoint_id.clone(), + Self::Link(_, _) => Some("link".into()), } } /// Get username from the credentials. pub fn get_user(&self) -> &str { - use BackendType::*; - match self { - Console(_, user_info) => &user_info.user, - Link(_, _) => "link", + Self::Console(_, user_info) => &user_info.user, + Self::Link(_, _) => "link", } } @@ -428,10 +422,8 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { config: &'static AuthenticationConfig, endpoint_rate_limiter: Arc, ) -> auth::Result> { - use BackendType::*; - let res = match self { - Console(api, user_info) => { + Self::Console(api, user_info) => { info!( user = &*user_info.user, project = user_info.endpoint(), @@ -451,7 +443,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { BackendType::Console(api, credentials) } // NOTE: this auth backend doesn't use client credentials. 
- Link(url, _) => { + Self::Link(url, _) => { info!("performing link authentication"); let info = link::authenticate(ctx, &url, client).await?; @@ -470,10 +462,9 @@ impl BackendType<'_, ComputeUserInfo, &()> { &self, ctx: &RequestMonitoring, ) -> Result { - use BackendType::*; match self { - Console(api, user_info) => api.get_role_secret(ctx, user_info).await, - Link(_, _) => Ok(Cached::new_uncached(None)), + Self::Console(api, user_info) => api.get_role_secret(ctx, user_info).await, + Self::Link(_, _) => Ok(Cached::new_uncached(None)), } } @@ -481,10 +472,9 @@ impl BackendType<'_, ComputeUserInfo, &()> { &self, ctx: &RequestMonitoring, ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { - use BackendType::*; match self { - Console(api, user_info) => api.get_allowed_ips_and_secret(ctx, user_info).await, - Link(_, _) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), + Self::Console(api, user_info) => api.get_allowed_ips_and_secret(ctx, user_info).await, + Self::Link(_, _) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), } } } @@ -495,18 +485,16 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> { &self, ctx: &RequestMonitoring, ) -> Result { - use BackendType::*; - match self { - Console(api, creds) => api.wake_compute(ctx, &creds.info).await, - Link(_, info) => Ok(Cached::new_uncached(info.clone())), + Self::Console(api, creds) => api.wake_compute(ctx, &creds.info).await, + Self::Link(_, info) => Ok(Cached::new_uncached(info.clone())), } } fn get_keys(&self) -> Option<&ComputeCredentialKeys> { match self { - BackendType::Console(_, creds) => Some(&creds.keys), - BackendType::Link(_, _) => None, + Self::Console(_, creds) => Some(&creds.keys), + Self::Link(_, _) => None, } } } @@ -517,18 +505,16 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> { &self, ctx: &RequestMonitoring, ) -> Result { - use BackendType::*; - match self { - Console(api, creds) => api.wake_compute(ctx, &creds.info).await, - Link(_, _) => unreachable!("link auth flow doesn't support waking the compute"), + Self::Console(api, creds) => api.wake_compute(ctx, &creds.info).await, + Self::Link(_, _) => unreachable!("link auth flow doesn't support waking the compute"), } } fn get_keys(&self) -> Option<&ComputeCredentialKeys> { match self { - BackendType::Console(_, creds) => Some(&creds.keys), - BackendType::Link(_, _) => None, + Self::Console(_, creds) => Some(&creds.keys), + Self::Link(_, _) => None, } } } diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index 0c2ca8fb97..e021a7e23f 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -195,7 +195,7 @@ impl JwkCacheEntryLock { let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD) .context("Provided authentication token is not a valid JWT encoding")?; - let header = serde_json::from_slice::(&header) + let header = serde_json::from_slice::>(&header) .context("Provided authentication token is not a valid JWT encoding")?; let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD) @@ -340,7 +340,7 @@ impl JwkRenewalPermit<'_> { } } - async fn acquire_permit(from: &Arc) -> JwkRenewalPermit { + async fn acquire_permit(from: &Arc) -> JwkRenewalPermit<'_> { match from.lookup.acquire().await { Ok(permit) => { permit.forget(); @@ -352,7 +352,7 @@ impl JwkRenewalPermit<'_> { } } - fn try_acquire_permit(from: &Arc) -> Option { + fn try_acquire_permit(from: &Arc) -> Option> { match from.lookup.try_acquire() { Ok(permit) => { permit.forget(); 
diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 8f4a392131..849e7d65e8 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -89,10 +89,12 @@ impl ComputeUserInfoMaybeEndpoint { sni: Option<&str>, common_names: Option<&HashSet>, ) -> Result { - use ComputeUserInfoParseError::*; - // Some parameters are stored in the startup message. - let get_param = |key| params.get(key).ok_or(MissingKey(key)); + let get_param = |key| { + params + .get(key) + .ok_or(ComputeUserInfoParseError::MissingKey(key)) + }; let user: RoleName = get_param("user")?.into(); // Project name might be passed via PG's command-line options. @@ -122,11 +124,14 @@ impl ComputeUserInfoMaybeEndpoint { let endpoint = match (endpoint_option, endpoint_from_domain) { // Invariant: if we have both project name variants, they should match. (Some(option), Some(domain)) if option != domain => { - Some(Err(InconsistentProjectNames { domain, option })) + Some(Err(ComputeUserInfoParseError::InconsistentProjectNames { + domain, + option, + })) } // Invariant: project name may not contain certain characters. (a, b) => a.or(b).map(|name| match project_name_valid(name.as_ref()) { - false => Err(MalformedProjectName(name)), + false => Err(ComputeUserInfoParseError::MalformedProjectName(name)), true => Ok(name), }), } @@ -186,7 +191,7 @@ impl<'de> serde::de::Deserialize<'de> for IpPattern { impl<'de> serde::de::Visitor<'de> for StrVisitor { type Value = IpPattern; - fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(formatter, "comma separated list with ip address, ip address range, or ip address subnet mask") } diff --git a/proxy/src/cache/common.rs b/proxy/src/cache/common.rs index 4e393fddb2..82c78e3eb2 100644 --- a/proxy/src/cache/common.rs +++ b/proxy/src/cache/common.rs @@ -24,7 +24,7 @@ impl Cache for &C { type LookupInfo = C::LookupInfo; fn invalidate(&self, info: &Self::LookupInfo) { - C::invalidate(self, info) + C::invalidate(self, info); } } diff --git a/proxy/src/cache/timed_lru.rs b/proxy/src/cache/timed_lru.rs index c5c4f6a1ed..07fad56643 100644 --- a/proxy/src/cache/timed_lru.rs +++ b/proxy/src/cache/timed_lru.rs @@ -58,7 +58,7 @@ impl Cache for TimedLru { type LookupInfo = LookupInfo; fn invalidate(&self, info: &Self::LookupInfo) { - self.invalidate_raw(info) + self.invalidate_raw(info); } } diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 18c82fe379..c071a59d58 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -44,11 +44,10 @@ pub enum ConnectionError { impl UserFacingError for ConnectionError { fn to_string_client(&self) -> String { - use ConnectionError::*; match self { // This helps us drop irrelevant library-specific prefixes. // TODO: propagate severity level and other parameters. - Postgres(err) => match err.as_db_error() { + ConnectionError::Postgres(err) => match err.as_db_error() { Some(err) => { let msg = err.message(); @@ -62,8 +61,8 @@ impl UserFacingError for ConnectionError { } None => err.to_string(), }, - WakeComputeError(err) => err.to_string_client(), - TooManyConnectionAttempts(_) => { + ConnectionError::WakeComputeError(err) => err.to_string_client(), + ConnectionError::TooManyConnectionAttempts(_) => { "Failed to acquire permit to connect to the database. 
Too many database connection attempts are currently ongoing.".to_owned() } _ => COULD_NOT_CONNECT.to_owned(), @@ -366,16 +365,16 @@ static TLS_ROOTS: OnceCell> = OnceCell::new(); struct AcceptEverythingVerifier; impl ServerCertVerifier for AcceptEverythingVerifier { fn supported_verify_schemes(&self) -> Vec { - use rustls::SignatureScheme::*; + use rustls::SignatureScheme; // The schemes for which `SignatureScheme::supported_in_tls13` returns true. vec![ - ECDSA_NISTP521_SHA512, - ECDSA_NISTP384_SHA384, - ECDSA_NISTP256_SHA256, - RSA_PSS_SHA512, - RSA_PSS_SHA384, - RSA_PSS_SHA256, - ED25519, + SignatureScheme::ECDSA_NISTP521_SHA512, + SignatureScheme::ECDSA_NISTP384_SHA384, + SignatureScheme::ECDSA_NISTP256_SHA256, + SignatureScheme::RSA_PSS_SHA512, + SignatureScheme::RSA_PSS_SHA384, + SignatureScheme::RSA_PSS_SHA256, + SignatureScheme::ED25519, ] } fn verify_server_cert( diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 1412095505..36d04924f2 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -155,7 +155,7 @@ pub enum TlsServerEndPoint { } impl TlsServerEndPoint { - pub fn new(cert: &CertificateDer) -> anyhow::Result { + pub fn new(cert: &CertificateDer<'_>) -> anyhow::Result { let sha256_oids = [ // I'm explicitly not adding MD5 or SHA1 here... They're bad. oid_registry::OID_SIG_ECDSA_WITH_SHA256, @@ -278,7 +278,7 @@ impl CertResolver { impl rustls::server::ResolvesServerCert for CertResolver { fn resolve( &self, - client_hello: rustls::server::ClientHello, + client_hello: rustls::server::ClientHello<'_>, ) -> Option> { self.resolve(client_hello.server_name()).map(|x| x.0) } @@ -559,7 +559,7 @@ impl RetryConfig { match key { "num_retries" => num_retries = Some(value.parse()?), "base_retry_wait_duration" => { - base_retry_wait_duration = Some(humantime::parse_duration(value)?) + base_retry_wait_duration = Some(humantime::parse_duration(value)?); } "retry_wait_exponent_base" => retry_wait_exponent_base = Some(value.parse()?), unknown => bail!("unknown key: {unknown}"), diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs index 9abf24ab7f..ac66e116d0 100644 --- a/proxy/src/console/messages.rs +++ b/proxy/src/console/messages.rs @@ -22,16 +22,15 @@ impl ConsoleError { self.status .as_ref() .and_then(|s| s.details.error_info.as_ref()) - .map(|e| e.reason) - .unwrap_or(Reason::Unknown) + .map_or(Reason::Unknown, |e| e.reason) } + pub fn get_user_facing_message(&self) -> String { use super::provider::errors::REQUEST_FAILED; self.status .as_ref() .and_then(|s| s.details.user_facing_message.as_ref()) - .map(|m| m.message.clone().into()) - .unwrap_or_else(|| { + .map_or_else(|| { // Ask @neondatabase/control-plane for review before adding more. match self.http_status_code { http::StatusCode::NOT_FOUND => { @@ -48,19 +47,18 @@ impl ConsoleError { } _ => REQUEST_FAILED.to_owned(), } - }) + }, |m| m.message.clone().into()) } } impl Display for ConsoleError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - let msg = self + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let msg: &str = self .status .as_ref() .and_then(|s| s.details.user_facing_message.as_ref()) - .map(|m| m.message.as_ref()) - .unwrap_or_else(|| &self.error); - write!(f, "{}", msg) + .map_or_else(|| self.error.as_ref(), |m| m.message.as_ref()); + write!(f, "{msg}") } } @@ -286,7 +284,7 @@ pub struct DatabaseInfo { // Manually implement debug to omit sensitive info. 
impl fmt::Debug for DatabaseInfo { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("DatabaseInfo") .field("host", &self.host) .field("port", &self.port) @@ -373,7 +371,7 @@ mod tests { } } }); - let _: KickSession = serde_json::from_str(&json.to_string())?; + let _: KickSession<'_> = serde_json::from_str(&json.to_string())?; Ok(()) } diff --git a/proxy/src/console/mgmt.rs b/proxy/src/console/mgmt.rs index befe7d7510..82d5033aab 100644 --- a/proxy/src/console/mgmt.rs +++ b/proxy/src/console/mgmt.rs @@ -93,7 +93,8 @@ impl postgres_backend::Handler for MgmtHandler { } fn try_process_query(pgb: &mut PostgresBackendTCP, query: &str) -> Result<(), QueryError> { - let resp: KickSession = serde_json::from_str(query).context("Failed to parse query as json")?; + let resp: KickSession<'_> = + serde_json::from_str(query).context("Failed to parse query as json")?; let span = info_span!("event", session_id = resp.session_id); let _enter = span.enter(); diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 15fc0134b3..cc2ee10062 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -26,7 +26,7 @@ use tracing::info; pub mod errors { use crate::{ console::messages::{self, ConsoleError, Reason}, - error::{io_error, ReportableError, UserFacingError}, + error::{io_error, ErrorKind, ReportableError, UserFacingError}, proxy::retry::CouldRetry, }; use thiserror::Error; @@ -51,21 +51,19 @@ pub mod errors { impl ApiError { /// Returns HTTP status code if it's the reason for failure. pub fn get_reason(&self) -> messages::Reason { - use ApiError::*; match self { - Console(e) => e.get_reason(), - _ => messages::Reason::Unknown, + ApiError::Console(e) => e.get_reason(), + ApiError::Transport(_) => messages::Reason::Unknown, } } } impl UserFacingError for ApiError { fn to_string_client(&self) -> String { - use ApiError::*; match self { // To minimize risks, only select errors are forwarded to users. - Console(c) => c.get_user_facing_message(), - _ => REQUEST_FAILED.to_owned(), + ApiError::Console(c) => c.get_user_facing_message(), + ApiError::Transport(_) => REQUEST_FAILED.to_owned(), } } } @@ -73,57 +71,53 @@ pub mod errors { impl ReportableError for ApiError { fn get_error_kind(&self) -> crate::error::ErrorKind { match self { - ApiError::Console(e) => { - use crate::error::ErrorKind::*; - match e.get_reason() { - Reason::RoleProtected => User, - Reason::ResourceNotFound => User, - Reason::ProjectNotFound => User, - Reason::EndpointNotFound => User, - Reason::BranchNotFound => User, - Reason::RateLimitExceeded => ServiceRateLimit, - Reason::NonDefaultBranchComputeTimeExceeded => User, - Reason::ActiveTimeQuotaExceeded => User, - Reason::ComputeTimeQuotaExceeded => User, - Reason::WrittenDataQuotaExceeded => User, - Reason::DataTransferQuotaExceeded => User, - Reason::LogicalSizeQuotaExceeded => User, - Reason::ConcurrencyLimitReached => ControlPlane, - Reason::LockAlreadyTaken => ControlPlane, - Reason::RunningOperations => ControlPlane, - Reason::Unknown => match &e { - ConsoleError { - http_status_code: - http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE, - .. - } => crate::error::ErrorKind::User, - ConsoleError { - http_status_code: http::StatusCode::UNPROCESSABLE_ENTITY, - error, - .. 
- } if error.contains( - "compute time quota of non-primary branches is exceeded", - ) => - { - crate::error::ErrorKind::User - } - ConsoleError { - http_status_code: http::StatusCode::LOCKED, - error, - .. - } if error.contains("quota exceeded") - || error.contains("the limit for current plan reached") => - { - crate::error::ErrorKind::User - } - ConsoleError { - http_status_code: http::StatusCode::TOO_MANY_REQUESTS, - .. - } => crate::error::ErrorKind::ServiceRateLimit, - ConsoleError { .. } => crate::error::ErrorKind::ControlPlane, - }, - } - } + ApiError::Console(e) => match e.get_reason() { + Reason::RoleProtected => ErrorKind::User, + Reason::ResourceNotFound => ErrorKind::User, + Reason::ProjectNotFound => ErrorKind::User, + Reason::EndpointNotFound => ErrorKind::User, + Reason::BranchNotFound => ErrorKind::User, + Reason::RateLimitExceeded => ErrorKind::ServiceRateLimit, + Reason::NonDefaultBranchComputeTimeExceeded => ErrorKind::User, + Reason::ActiveTimeQuotaExceeded => ErrorKind::User, + Reason::ComputeTimeQuotaExceeded => ErrorKind::User, + Reason::WrittenDataQuotaExceeded => ErrorKind::User, + Reason::DataTransferQuotaExceeded => ErrorKind::User, + Reason::LogicalSizeQuotaExceeded => ErrorKind::User, + Reason::ConcurrencyLimitReached => ErrorKind::ControlPlane, + Reason::LockAlreadyTaken => ErrorKind::ControlPlane, + Reason::RunningOperations => ErrorKind::ControlPlane, + Reason::Unknown => match &e { + ConsoleError { + http_status_code: + http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE, + .. + } => crate::error::ErrorKind::User, + ConsoleError { + http_status_code: http::StatusCode::UNPROCESSABLE_ENTITY, + error, + .. + } if error + .contains("compute time quota of non-primary branches is exceeded") => + { + crate::error::ErrorKind::User + } + ConsoleError { + http_status_code: http::StatusCode::LOCKED, + error, + .. + } if error.contains("quota exceeded") + || error.contains("the limit for current plan reached") => + { + crate::error::ErrorKind::User + } + ConsoleError { + http_status_code: http::StatusCode::TOO_MANY_REQUESTS, + .. + } => crate::error::ErrorKind::ServiceRateLimit, + ConsoleError { .. } => crate::error::ErrorKind::ControlPlane, + }, + }, ApiError::Transport(_) => crate::error::ErrorKind::ControlPlane, } } @@ -170,12 +164,11 @@ pub mod errors { impl UserFacingError for GetAuthInfoError { fn to_string_client(&self) -> String { - use GetAuthInfoError::*; match self { // We absolutely should not leak any secrets! - BadSecret => REQUEST_FAILED.to_owned(), + Self::BadSecret => REQUEST_FAILED.to_owned(), // However, API might return a meaningful error. - ApiError(e) => e.to_string_client(), + Self::ApiError(e) => e.to_string_client(), } } } @@ -183,8 +176,8 @@ pub mod errors { impl ReportableError for GetAuthInfoError { fn get_error_kind(&self) -> crate::error::ErrorKind { match self { - GetAuthInfoError::BadSecret => crate::error::ErrorKind::ControlPlane, - GetAuthInfoError::ApiError(_) => crate::error::ErrorKind::ControlPlane, + Self::BadSecret => crate::error::ErrorKind::ControlPlane, + Self::ApiError(_) => crate::error::ErrorKind::ControlPlane, } } } @@ -213,17 +206,16 @@ pub mod errors { impl UserFacingError for WakeComputeError { fn to_string_client(&self) -> String { - use WakeComputeError::*; match self { // We shouldn't show user the address even if it's broken. // Besides, user is unlikely to care about this detail. 
- BadComputeAddress(_) => REQUEST_FAILED.to_owned(), + Self::BadComputeAddress(_) => REQUEST_FAILED.to_owned(), // However, API might return a meaningful error. - ApiError(e) => e.to_string_client(), + Self::ApiError(e) => e.to_string_client(), - TooManyConnections => self.to_string(), + Self::TooManyConnections => self.to_string(), - TooManyConnectionAttempts(_) => { + Self::TooManyConnectionAttempts(_) => { "Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned() } } @@ -233,10 +225,10 @@ pub mod errors { impl ReportableError for WakeComputeError { fn get_error_kind(&self) -> crate::error::ErrorKind { match self { - WakeComputeError::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane, - WakeComputeError::ApiError(e) => e.get_error_kind(), - WakeComputeError::TooManyConnections => crate::error::ErrorKind::RateLimit, - WakeComputeError::TooManyConnectionAttempts(e) => e.get_error_kind(), + Self::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane, + Self::ApiError(e) => e.get_error_kind(), + Self::TooManyConnections => crate::error::ErrorKind::RateLimit, + Self::TooManyConnectionAttempts(e) => e.get_error_kind(), } } } @@ -244,10 +236,10 @@ pub mod errors { impl CouldRetry for WakeComputeError { fn could_retry(&self) -> bool { match self { - WakeComputeError::BadComputeAddress(_) => false, - WakeComputeError::ApiError(e) => e.could_retry(), - WakeComputeError::TooManyConnections => false, - WakeComputeError::TooManyConnectionAttempts(_) => false, + Self::BadComputeAddress(_) => false, + Self::ApiError(e) => e.could_retry(), + Self::TooManyConnections => false, + Self::TooManyConnectionAttempts(_) => false, } } } @@ -366,13 +358,14 @@ impl Api for ConsoleBackend { ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { - use ConsoleBackend::*; match self { - Console(api) => api.get_role_secret(ctx, user_info).await, + Self::Console(api) => api.get_role_secret(ctx, user_info).await, #[cfg(any(test, feature = "testing"))] - Postgres(api) => api.get_role_secret(ctx, user_info).await, + Self::Postgres(api) => api.get_role_secret(ctx, user_info).await, #[cfg(test)] - Test(_) => unreachable!("this function should never be called in the test backend"), + Self::Test(_) => { + unreachable!("this function should never be called in the test backend") + } } } @@ -381,13 +374,12 @@ impl Api for ConsoleBackend { ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError> { - use ConsoleBackend::*; match self { - Console(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, + Self::Console(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, #[cfg(any(test, feature = "testing"))] - Postgres(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, + Self::Postgres(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, #[cfg(test)] - Test(api) => api.get_allowed_ips_and_secret(), + Self::Test(api) => api.get_allowed_ips_and_secret(), } } @@ -396,14 +388,12 @@ impl Api for ConsoleBackend { ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { - use ConsoleBackend::*; - match self { - Console(api) => api.wake_compute(ctx, user_info).await, + Self::Console(api) => api.wake_compute(ctx, user_info).await, #[cfg(any(test, feature = "testing"))] - Postgres(api) => api.wake_compute(ctx, user_info).await, + Self::Postgres(api) => api.wake_compute(ctx, user_info).await, #[cfg(test)] - Test(api) => 
api.wake_compute(), + Self::Test(api) => api.wake_compute(), } } } @@ -549,7 +539,7 @@ impl WakeComputePermit { !self.permit.is_disabled() } pub fn release(self, outcome: Outcome) { - self.permit.release(outcome) + self.permit.release(outcome); } pub fn release_result(self, res: Result) -> Result { match res { diff --git a/proxy/src/context.rs b/proxy/src/context.rs index e925f67233..cafbdedc15 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -166,7 +166,7 @@ impl RequestMonitoring { pub fn set_project(&self, x: MetricsAuxInfo) { let mut this = self.0.try_lock().expect("should not deadlock"); if this.endpoint_id.is_none() { - this.set_endpoint_id(x.endpoint_id.as_str().into()) + this.set_endpoint_id(x.endpoint_id.as_str().into()); } this.branch = Some(x.branch_id); this.project = Some(x.project_id); @@ -260,7 +260,7 @@ impl RequestMonitoring { .cold_start_info } - pub fn latency_timer_pause(&self, waiting_for: Waiting) -> LatencyTimerPause { + pub fn latency_timer_pause(&self, waiting_for: Waiting) -> LatencyTimerPause<'_> { LatencyTimerPause { ctx: self, start: tokio::time::Instant::now(), @@ -273,7 +273,7 @@ impl RequestMonitoring { .try_lock() .expect("should not deadlock") .latency_timer - .success() + .success(); } } @@ -328,7 +328,7 @@ impl RequestMonitoringInner { fn has_private_peer_addr(&self) -> bool { match self.peer_addr { IpAddr::V4(ip) => ip.is_private(), - _ => false, + IpAddr::V6(_) => false, } } diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index bb02a476fc..e5962b35fa 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -736,7 +736,7 @@ mod tests { while let Some(r) = s.next().await { tx.send(r).unwrap(); } - time::sleep(time::Duration::from_secs(70)).await + time::sleep(time::Duration::from_secs(70)).await; } }); diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs index e38135dd22..d418caa511 100644 --- a/proxy/src/intern.rs +++ b/proxy/src/intern.rs @@ -56,7 +56,7 @@ impl<'de, Id: InternId> serde::de::Deserialize<'de> for InternedString { impl<'de, Id: InternId> serde::de::Visitor<'de> for Visitor { type Value = InternedString; - fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { formatter.write_str("a string") } diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 0167553e30..ccef88231b 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -252,7 +252,7 @@ impl Drop for HttpEndpointPoolsGuard<'_> { } impl HttpEndpointPools { - pub fn guard(&self) -> HttpEndpointPoolsGuard { + pub fn guard(&self) -> HttpEndpointPoolsGuard<'_> { self.http_pool_endpoints_registered_total.inc(); HttpEndpointPoolsGuard { dec: &self.http_pool_endpoints_unregistered_total, diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs index 3c45fff969..048523f69c 100644 --- a/proxy/src/proxy/copy_bidirectional.rs +++ b/proxy/src/proxy/copy_bidirectional.rs @@ -184,7 +184,7 @@ impl CopyBuffer { } Poll::Pending } - res => res.map_err(ErrorDirection::Write), + res @ Poll::Ready(_) => res.map_err(ErrorDirection::Write), } } diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs index c65a5558d9..27a72f8072 100644 --- a/proxy/src/proxy/handshake.rs +++ b/proxy/src/proxy/handshake.rs @@ -82,9 +82,8 @@ pub async fn handshake( let mut stream = PqStream::new(Stream::from_raw(stream)); loop { let msg = stream.read_startup_packet().await?; - use 
FeStartupPacket::*; match msg { - SslRequest { direct } => match stream.get_ref() { + FeStartupPacket::SslRequest { direct } => match stream.get_ref() { Stream::Raw { .. } if !tried_ssl => { tried_ssl = true; @@ -139,7 +138,7 @@ pub async fn handshake( let tls_stream = accept.await.inspect_err(|_| { if record_handshake_error { - Metrics::get().proxy.tls_handshake_failures.inc() + Metrics::get().proxy.tls_handshake_failures.inc(); } })?; @@ -182,7 +181,7 @@ pub async fn handshake( } _ => return Err(HandshakeError::ProtocolViolation), }, - GssEncRequest => match stream.get_ref() { + FeStartupPacket::GssEncRequest => match stream.get_ref() { Stream::Raw { .. } if !tried_gss => { tried_gss = true; @@ -191,7 +190,7 @@ pub async fn handshake( } _ => return Err(HandshakeError::ProtocolViolation), }, - StartupMessage { params, version } + FeStartupPacket::StartupMessage { params, version } if PG_PROTOCOL_EARLIEST <= version && version <= PG_PROTOCOL_LATEST => { // Check that the config has been consumed during upgrade @@ -211,7 +210,7 @@ pub async fn handshake( break Ok(HandshakeData::Startup(stream, params)); } // downgrade protocol version - StartupMessage { params, version } + FeStartupPacket::StartupMessage { params, version } if version.major() == 3 && version > PG_PROTOCOL_LATEST => { warn!(?version, "unsupported minor version"); @@ -241,7 +240,7 @@ pub async fn handshake( ); break Ok(HandshakeData::Startup(stream, params)); } - StartupMessage { version, .. } => { + FeStartupPacket::StartupMessage { version, .. } => { warn!( ?version, session_type = "normal", @@ -249,7 +248,7 @@ pub async fn handshake( ); return Err(HandshakeError::ProtocolViolation); } - CancelRequest(cancel_key_data) => { + FeStartupPacket::CancelRequest(cancel_key_data) => { info!(session_type = "cancellation", "successful handshake"); break Ok(HandshakeData::Cancel(cancel_key_data)); } diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index c8ec2b2db6..2d752b9183 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -68,7 +68,7 @@ async fn proxy_mitm( end_client.send(Bytes::from_static(b"R\0\0\0\x17\0\0\0\x0aSCRAM-SHA-256\0\0")).await.unwrap(); continue; } - end_client.send(message).await.unwrap() + end_client.send(message).await.unwrap(); } _ => break, } @@ -88,7 +88,7 @@ async fn proxy_mitm( end_server.send(buf.freeze()).await.unwrap(); continue; } - end_server.send(message).await.unwrap() + end_server.send(message).await.unwrap(); } _ => break, } diff --git a/proxy/src/rate_limiter/limit_algorithm.rs b/proxy/src/rate_limiter/limit_algorithm.rs index 3842ce269e..80a62b2a76 100644 --- a/proxy/src/rate_limiter/limit_algorithm.rs +++ b/proxy/src/rate_limiter/limit_algorithm.rs @@ -237,7 +237,7 @@ impl Token { } pub fn release(mut self, outcome: Outcome) { - self.release_mut(Some(outcome)) + self.release_mut(Some(outcome)); } pub fn release_mut(&mut self, outcome: Option) { @@ -249,7 +249,7 @@ impl Token { impl Drop for Token { fn drop(&mut self) { - self.release_mut(None) + self.release_mut(None); } } diff --git a/proxy/src/rate_limiter/limit_algorithm/aimd.rs b/proxy/src/rate_limiter/limit_algorithm/aimd.rs index b39740bb21..d669492fa6 100644 --- a/proxy/src/rate_limiter/limit_algorithm/aimd.rs +++ b/proxy/src/rate_limiter/limit_algorithm/aimd.rs @@ -25,9 +25,8 @@ pub struct Aimd { impl LimitAlgorithm for Aimd { fn update(&self, old_limit: usize, sample: Sample) -> usize { - use Outcome::*; match sample.outcome { - Success => { + Outcome::Success => { let utilisation = 
sample.in_flight as f32 / old_limit as f32; if utilisation > self.utilisation { @@ -42,7 +41,7 @@ impl LimitAlgorithm for Aimd { old_limit } } - Overload => { + Outcome::Overload => { let limit = old_limit as f32 * self.dec; // Floor instead of round, so the limit reduces even with small numbers. diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs index b02ce472c0..c78ee166f1 100644 --- a/proxy/src/redis/connection_with_credentials_provider.rs +++ b/proxy/src/redis/connection_with_credentials_provider.rs @@ -98,7 +98,7 @@ impl ConnectionWithCredentialsProvider { info!("Establishing a new connection..."); self.con = None; if let Some(f) = self.refresh_token_task.take() { - f.abort() + f.abort(); } let mut con = self .get_client() diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index efd7437d5d..ad69246443 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -108,7 +108,6 @@ impl MessageHandler { } #[tracing::instrument(skip(self, msg), fields(session_id = tracing::field::Empty))] async fn handle_message(&self, msg: redis::Msg) -> anyhow::Result<()> { - use Notification::*; let payload: String = msg.get_payload()?; tracing::debug!(?payload, "received a message payload"); @@ -124,7 +123,7 @@ impl MessageHandler { }; tracing::debug!(?msg, "received a message"); match msg { - Cancel(cancel_session) => { + Notification::Cancel(cancel_session) => { tracing::Span::current().record( "session_id", tracing::field::display(cancel_session.session_id), @@ -153,12 +152,12 @@ impl MessageHandler { } _ => { invalidate_cache(self.cache.clone(), msg.clone()); - if matches!(msg, AllowedIpsUpdate { .. }) { + if matches!(msg, Notification::AllowedIpsUpdate { .. }) { Metrics::get() .proxy .redis_events_count .inc(RedisEventsCount::AllowedIpsUpdate); - } else if matches!(msg, PasswordUpdate { .. }) { + } else if matches!(msg, Notification::PasswordUpdate { .. 
}) { Metrics::get() .proxy .redis_events_count @@ -180,16 +179,16 @@ impl MessageHandler { } fn invalidate_cache(cache: Arc, msg: Notification) { - use Notification::*; match msg { - AllowedIpsUpdate { allowed_ips_update } => { - cache.invalidate_allowed_ips_for_project(allowed_ips_update.project_id) + Notification::AllowedIpsUpdate { allowed_ips_update } => { + cache.invalidate_allowed_ips_for_project(allowed_ips_update.project_id); } - PasswordUpdate { password_update } => cache.invalidate_role_secret_for_project( - password_update.project_id, - password_update.role_name, - ), - Cancel(_) => unreachable!("cancel message should be handled separately"), + Notification::PasswordUpdate { password_update } => cache + .invalidate_role_secret_for_project( + password_update.project_id, + password_update.role_name, + ), + Notification::Cancel(_) => unreachable!("cancel message should be handled separately"), } } diff --git a/proxy/src/sasl.rs b/proxy/src/sasl.rs index 0811416ca2..60207fc824 100644 --- a/proxy/src/sasl.rs +++ b/proxy/src/sasl.rs @@ -42,10 +42,9 @@ pub enum Error { impl UserFacingError for Error { fn to_string_client(&self) -> String { - use Error::*; match self { - ChannelBindingFailed(m) => m.to_string(), - ChannelBindingBadMethod(m) => format!("unsupported channel binding method {m}"), + Self::ChannelBindingFailed(m) => (*m).to_string(), + Self::ChannelBindingBadMethod(m) => format!("unsupported channel binding method {m}"), _ => "authentication protocol violation".to_string(), } } diff --git a/proxy/src/sasl/channel_binding.rs b/proxy/src/sasl/channel_binding.rs index 13d681de6d..6e2d3057ce 100644 --- a/proxy/src/sasl/channel_binding.rs +++ b/proxy/src/sasl/channel_binding.rs @@ -13,11 +13,10 @@ pub enum ChannelBinding { impl ChannelBinding { pub fn and_then(self, f: impl FnOnce(T) -> Result) -> Result, E> { - use ChannelBinding::*; Ok(match self { - NotSupportedClient => NotSupportedClient, - NotSupportedServer => NotSupportedServer, - Required(x) => Required(f(x)?), + Self::NotSupportedClient => ChannelBinding::NotSupportedClient, + Self::NotSupportedServer => ChannelBinding::NotSupportedServer, + Self::Required(x) => ChannelBinding::Required(f(x)?), }) } } @@ -25,11 +24,10 @@ impl ChannelBinding { impl<'a> ChannelBinding<&'a str> { // NB: FromStr doesn't work with lifetimes pub fn parse(input: &'a str) -> Option { - use ChannelBinding::*; Some(match input { - "n" => NotSupportedClient, - "y" => NotSupportedServer, - other => Required(other.strip_prefix("p=")?), + "n" => Self::NotSupportedClient, + "y" => Self::NotSupportedServer, + other => Self::Required(other.strip_prefix("p=")?), }) } } @@ -40,17 +38,16 @@ impl ChannelBinding { &self, get_cbind_data: impl FnOnce(&T) -> Result<&'a [u8], E>, ) -> Result, E> { - use ChannelBinding::*; Ok(match self { - NotSupportedClient => { + Self::NotSupportedClient => { // base64::encode("n,,") "biws".into() } - NotSupportedServer => { + Self::NotSupportedServer => { // base64::encode("y,,") "eSws".into() } - Required(mode) => { + Self::Required(mode) => { use std::io::Write; let mut cbind_input = vec![]; write!(&mut cbind_input, "p={mode},,",).unwrap(); diff --git a/proxy/src/sasl/messages.rs b/proxy/src/sasl/messages.rs index b9208f6f1f..2b5ae1785d 100644 --- a/proxy/src/sasl/messages.rs +++ b/proxy/src/sasl/messages.rs @@ -42,10 +42,9 @@ pub(super) enum ServerMessage { impl<'a> ServerMessage<&'a str> { pub(super) fn to_reply(&self) -> BeMessage<'a> { - use BeAuthenticationSaslMessage::*; BeMessage::AuthenticationSasl(match self { - 
ServerMessage::Continue(s) => Continue(s.as_bytes()), - ServerMessage::Final(s) => Final(s.as_bytes()), + ServerMessage::Continue(s) => BeAuthenticationSaslMessage::Continue(s.as_bytes()), + ServerMessage::Final(s) => BeAuthenticationSaslMessage::Final(s.as_bytes()), }) } } diff --git a/proxy/src/scram.rs b/proxy/src/scram.rs index 862facb4e5..145e727a74 100644 --- a/proxy/src/scram.rs +++ b/proxy/src/scram.rs @@ -137,12 +137,12 @@ mod tests { #[tokio::test] async fn round_trip() { - run_round_trip_test("pencil", "pencil").await + run_round_trip_test("pencil", "pencil").await; } #[tokio::test] #[should_panic(expected = "password doesn't match")] async fn failure() { - run_round_trip_test("pencil", "eraser").await + run_round_trip_test("pencil", "eraser").await; } } diff --git a/proxy/src/scram/countmin.rs b/proxy/src/scram/countmin.rs index e8e7ef5c86..944bb3c83e 100644 --- a/proxy/src/scram/countmin.rs +++ b/proxy/src/scram/countmin.rs @@ -98,8 +98,6 @@ mod tests { // q% of counts will be within p of the actual value let mut sketch = CountMinSketch::with_params(p / N as f64, 1.0 - q); - dbg!(sketch.buckets.len()); - // insert a bunch of entries in a random order let mut ids2 = ids.clone(); while !ids2.is_empty() { diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index d0adbc780e..f2494379a5 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -210,23 +210,23 @@ impl sasl::Mechanism for Exchange<'_> { type Output = super::ScramKey; fn exchange(mut self, input: &str) -> sasl::Result> { - use {sasl::Step::*, ExchangeState::*}; + use {sasl::Step, ExchangeState}; match &self.state { - Initial(init) => { + ExchangeState::Initial(init) => { match init.transition(self.secret, &self.tls_server_end_point, input)? { - Continue(sent, msg) => { - self.state = SaltSent(sent); - Ok(Continue(self, msg)) + Step::Continue(sent, msg) => { + self.state = ExchangeState::SaltSent(sent); + Ok(Step::Continue(self, msg)) } - Success(x, _) => match x {}, - Failure(msg) => Ok(Failure(msg)), + Step::Success(x, _) => match x {}, + Step::Failure(msg) => Ok(Step::Failure(msg)), } } - SaltSent(sent) => { + ExchangeState::SaltSent(sent) => { match sent.transition(self.secret, &self.tls_server_end_point, input)? { - Success(keys, msg) => Ok(Success(keys, msg)), - Continue(x, _) => match x {}, - Failure(msg) => Ok(Failure(msg)), + Step::Success(keys, msg) => Ok(Step::Success(keys, msg)), + Step::Continue(x, _) => match x {}, + Step::Failure(msg) => Ok(Step::Failure(msg)), } } } diff --git a/proxy/src/scram/messages.rs b/proxy/src/scram/messages.rs index cf677a3334..5ecbbf7004 100644 --- a/proxy/src/scram/messages.rs +++ b/proxy/src/scram/messages.rs @@ -59,7 +59,7 @@ impl<'a> ClientFirstMessage<'a> { // https://github.com/postgres/postgres/blob/f83908798f78c4cafda217ca875602c88ea2ae28/src/backend/libpq/auth-scram.c#L13-L14 if !username.is_empty() { - tracing::warn!(username, "scram username provided, but is not expected") + tracing::warn!(username, "scram username provided, but is not expected"); // TODO(conrad): // return None; } @@ -137,7 +137,7 @@ impl<'a> ClientFinalMessage<'a> { /// Build a response to [`ClientFinalMessage`]. 
pub fn build_server_final_message( &self, - signature_builder: SignatureBuilder, + signature_builder: SignatureBuilder<'_>, server_key: &ScramKey, ) -> String { let mut buf = String::from("v="); @@ -212,7 +212,7 @@ mod tests { #[test] fn parse_client_first_message_with_invalid_gs2_authz() { - assert!(ClientFirstMessage::parse("n,authzid,n=,r=nonce").is_none()) + assert!(ClientFirstMessage::parse("n,authzid,n=,r=nonce").is_none()); } #[test] diff --git a/proxy/src/scram/pbkdf2.rs b/proxy/src/scram/pbkdf2.rs index a803ba7e1b..f690cc7738 100644 --- a/proxy/src/scram/pbkdf2.rs +++ b/proxy/src/scram/pbkdf2.rs @@ -84,6 +84,6 @@ mod tests { }; let expected = pbkdf2_hmac_array::(pass, salt, 600000); - assert_eq!(hash, expected) + assert_eq!(hash, expected); } } diff --git a/proxy/src/scram/threadpool.rs b/proxy/src/scram/threadpool.rs index 7701b869a3..fa3d3ccca2 100644 --- a/proxy/src/scram/threadpool.rs +++ b/proxy/src/scram/threadpool.rs @@ -270,7 +270,7 @@ fn thread_rt(pool: Arc, worker: Worker, index: usize) { .inc(ThreadPoolWorkerId(index)); // skip for now - worker.push(job) + worker.push(job); } } @@ -316,6 +316,6 @@ mod tests { 10, 114, 73, 188, 140, 222, 196, 156, 214, 184, 79, 157, 119, 242, 16, 31, 53, 242, 178, 43, 95, 8, 225, 182, 122, 40, 219, 21, 89, 147, 64, 140, ]; - assert_eq!(actual, expected) + assert_eq!(actual, expected); } } diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index 115bef7375..5416d63b5b 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -120,7 +120,7 @@ pub async fn task_main( tracing::trace!("attempting to cancel a random connection"); if let Some(token) = config.http_config.cancel_set.take() { tracing::debug!("cancelling a random connection"); - token.cancel() + token.cancel(); } } @@ -198,7 +198,7 @@ async fn connection_startup( let peer_addr = peer.unwrap_or(peer_addr).ip(); let has_private_peer_addr = match peer_addr { IpAddr::V4(ip) => ip.is_private(), - _ => false, + IpAddr::V6(_) => false, }; info!(?session_id, %peer_addr, "accepted new TCP connection"); diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index e1dc44dc1c..9ede659cc4 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -390,7 +390,7 @@ impl GlobalConnPool { .write() .get_conn_entry(conn_info.db_and_user()) { - client = Some(entry.conn) + client = Some(entry.conn); } let endpoint_pool = Arc::downgrade(&endpoint_pool); @@ -662,13 +662,13 @@ impl Discard<'_, C> { pub fn check_idle(&mut self, status: ReadyForQueryStatus) { let conn_info = &self.conn_info; if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 { - info!("pool: throwing away connection '{conn_info}' because connection is not idle") + info!("pool: throwing away connection '{conn_info}' because connection is not idle"); } } pub fn discard(&mut self) { let conn_info = &self.conn_info; if std::mem::take(self.pool).strong_count() > 0 { - info!("pool: throwing away connection '{conn_info}' because connection is potentially in a broken state") + info!("pool: throwing away connection '{conn_info}' because connection is potentially in a broken state"); } } } diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index 690e92ffb1..7809d2e574 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -234,7 +234,7 @@ impl Stream { .await .inspect_err(|_| { if record_handshake_error { - Metrics::get().proxy.tls_handshake_failures.inc() + Metrics::get().proxy.tls_handshake_failures.inc(); } })?), 
Stream::Tls { .. } => Err(StreamUpgradeError::AlreadyTls), diff --git a/proxy/src/url.rs b/proxy/src/url.rs index 92c64bb8ad..202fe8de1f 100644 --- a/proxy/src/url.rs +++ b/proxy/src/url.rs @@ -12,7 +12,7 @@ impl ApiUrl { } /// See [`url::Url::path_segments_mut`]. - pub fn path_segments_mut(&mut self) -> url::PathSegmentsMut { + pub fn path_segments_mut(&mut self) -> url::PathSegmentsMut<'_> { // We've already verified that it works during construction. self.0.path_segments_mut().expect("bad API url") } diff --git a/proxy/src/waiters.rs b/proxy/src/waiters.rs index 888ad38048..3bd8f4c8ef 100644 --- a/proxy/src/waiters.rs +++ b/proxy/src/waiters.rs @@ -36,7 +36,7 @@ impl Default for Waiters { } impl Waiters { - pub fn register(&self, key: String) -> Result, RegisterError> { + pub fn register(&self, key: String) -> Result, RegisterError> { let (tx, rx) = oneshot::channel(); self.0 From eb7241c798d445cd7bcb52d14fbf6c59f4a54d32 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 19 Aug 2024 16:35:34 +0200 Subject: [PATCH 1444/1571] l0_flush: remove support for mode `page-cached` (#8739) It's been rolled out everywhere, no configs are referencing it. All code that's made dead by the removal of the config option is removed as part of this PR. The `page_caching::PreWarmingWriter` in `::No` mode is equivalent to a `size_tracking_writer`, so, use that. part of https://github.com/neondatabase/neon/issues/7418 --- pageserver/src/l0_flush.rs | 19 +- pageserver/src/tenant/ephemeral_file.rs | 5 +- .../src/tenant/ephemeral_file/page_caching.rs | 169 ++---------------- .../tenant/storage_layer/inmemory_layer.rs | 68 +------ 4 files changed, 20 insertions(+), 241 deletions(-) diff --git a/pageserver/src/l0_flush.rs b/pageserver/src/l0_flush.rs index 10187f2ba3..313a7961a6 100644 --- a/pageserver/src/l0_flush.rs +++ b/pageserver/src/l0_flush.rs @@ -1,15 +1,10 @@ use std::{num::NonZeroUsize, sync::Arc}; -use crate::tenant::ephemeral_file; - #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize)] #[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] pub enum L0FlushConfig { - PageCached, #[serde(rename_all = "snake_case")] - Direct { - max_concurrency: NonZeroUsize, - }, + Direct { max_concurrency: NonZeroUsize }, } impl Default for L0FlushConfig { @@ -25,14 +20,12 @@ impl Default for L0FlushConfig { pub struct L0FlushGlobalState(Arc); pub enum Inner { - PageCached, Direct { semaphore: tokio::sync::Semaphore }, } impl L0FlushGlobalState { pub fn new(config: L0FlushConfig) -> Self { match config { - L0FlushConfig::PageCached => Self(Arc::new(Inner::PageCached)), L0FlushConfig::Direct { max_concurrency } => { let semaphore = tokio::sync::Semaphore::new(max_concurrency.get()); Self(Arc::new(Inner::Direct { semaphore })) @@ -44,13 +37,3 @@ impl L0FlushGlobalState { &self.0 } } - -impl L0FlushConfig { - pub(crate) fn prewarm_on_write(&self) -> ephemeral_file::PrewarmPageCacheOnWrite { - use L0FlushConfig::*; - match self { - PageCached => ephemeral_file::PrewarmPageCacheOnWrite::Yes, - Direct { .. 
} => ephemeral_file::PrewarmPageCacheOnWrite::No, - } - } -} diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 770f3ca5f0..3eb8384d05 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -21,7 +21,6 @@ pub struct EphemeralFile { } mod page_caching; -pub(crate) use page_caching::PrewarmOnWrite as PrewarmPageCacheOnWrite; mod zero_padded_read_write; impl EphemeralFile { @@ -52,12 +51,10 @@ impl EphemeralFile { ) .await?; - let prewarm = conf.l0_flush.prewarm_on_write(); - Ok(EphemeralFile { _tenant_shard_id: tenant_shard_id, _timeline_id: timeline_id, - rw: page_caching::RW::new(file, prewarm, gate_guard), + rw: page_caching::RW::new(file, gate_guard), }) } diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs index 7355b3b5a3..48926354f1 100644 --- a/pageserver/src/tenant/ephemeral_file/page_caching.rs +++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs @@ -1,15 +1,15 @@ //! Wrapper around [`super::zero_padded_read_write::RW`] that uses the //! [`crate::page_cache`] to serve reads that need to go to the underlying [`VirtualFile`]. +//! +//! Subject to removal in use crate::context::RequestContext; use crate::page_cache::{self, PAGE_SZ}; use crate::tenant::block_io::BlockLease; -use crate::virtual_file::owned_buffers_io::io_buf_ext::FullSlice; +use crate::virtual_file::owned_buffers_io::util::size_tracking_writer; use crate::virtual_file::VirtualFile; -use once_cell::sync::Lazy; -use std::io::{self, ErrorKind}; -use std::ops::{Deref, Range}; +use std::io::{self}; use tokio_epoll_uring::BoundedBuf; use tracing::*; @@ -18,33 +18,17 @@ use super::zero_padded_read_write; /// See module-level comment. pub struct RW { page_cache_file_id: page_cache::FileId, - rw: super::zero_padded_read_write::RW, + rw: super::zero_padded_read_write::RW>, /// Gate guard is held on as long as we need to do operations in the path (delete on drop). _gate_guard: utils::sync::gate::GateGuard, } -/// When we flush a block to the underlying [`crate::virtual_file::VirtualFile`], -/// should we pre-warm the [`crate::page_cache`] with the contents? 
-#[derive(Clone, Copy)] -pub enum PrewarmOnWrite { - Yes, - No, -} - impl RW { - pub fn new( - file: VirtualFile, - prewarm_on_write: PrewarmOnWrite, - _gate_guard: utils::sync::gate::GateGuard, - ) -> Self { + pub fn new(file: VirtualFile, _gate_guard: utils::sync::gate::GateGuard) -> Self { let page_cache_file_id = page_cache::next_file_id(); Self { page_cache_file_id, - rw: super::zero_padded_read_write::RW::new(PreWarmingWriter::new( - page_cache_file_id, - file, - prewarm_on_write, - )), + rw: super::zero_padded_read_write::RW::new(size_tracking_writer::Writer::new(file)), _gate_guard, } } @@ -84,10 +68,10 @@ impl RW { let vec = Vec::with_capacity(size); // read from disk what we've already flushed - let writer = self.rw.as_writer(); - let flushed_range = writer.written_range(); - let mut vec = writer - .file + let file_size_tracking_writer = self.rw.as_writer(); + let flushed_range = 0..usize::try_from(file_size_tracking_writer.bytes_written()).unwrap(); + let mut vec = file_size_tracking_writer + .as_inner() .read_exact_at( vec.slice(0..(flushed_range.end - flushed_range.start)), u64::try_from(flushed_range.start).unwrap(), @@ -122,7 +106,7 @@ impl RW { format!( "ephemeral file: read immutable page #{}: {}: {:#}", blknum, - self.rw.as_writer().file.path, + self.rw.as_writer().as_inner().path, e, ), ) @@ -132,7 +116,7 @@ impl RW { } page_cache::ReadBufResult::NotFound(write_guard) => { let write_guard = writer - .file + .as_inner() .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64, ctx) .await?; let read_guard = write_guard.mark_valid(); @@ -154,137 +138,16 @@ impl Drop for RW { // unlink the file // we are clear to do this, because we have entered a gate - let res = std::fs::remove_file(&self.rw.as_writer().file.path); + let path = &self.rw.as_writer().as_inner().path; + let res = std::fs::remove_file(path); if let Err(e) = res { if e.kind() != std::io::ErrorKind::NotFound { // just never log the not found errors, we cannot do anything for them; on detach // the tenant directory is already gone. // // not found files might also be related to https://github.com/neondatabase/neon/issues/2442 - error!( - "could not remove ephemeral file '{}': {}", - self.rw.as_writer().file.path, - e - ); + error!("could not remove ephemeral file '{path}': {e}"); } } } } - -struct PreWarmingWriter { - prewarm_on_write: PrewarmOnWrite, - nwritten_blocks: u32, - page_cache_file_id: page_cache::FileId, - file: VirtualFile, -} - -impl PreWarmingWriter { - fn new( - page_cache_file_id: page_cache::FileId, - file: VirtualFile, - prewarm_on_write: PrewarmOnWrite, - ) -> Self { - Self { - prewarm_on_write, - nwritten_blocks: 0, - page_cache_file_id, - file, - } - } - - /// Return the byte range within `file` that has been written though `write_all`. - /// - /// The returned range would be invalidated by another `write_all`. To prevent that, we capture `&_`. 
- fn written_range(&self) -> (impl Deref> + '_) { - let nwritten_blocks = usize::try_from(self.nwritten_blocks).unwrap(); - struct Wrapper(Range); - impl Deref for Wrapper { - type Target = Range; - fn deref(&self) -> &Range { - &self.0 - } - } - Wrapper(0..nwritten_blocks * PAGE_SZ) - } -} - -impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmingWriter { - async fn write_all( - &mut self, - buf: FullSlice, - ctx: &RequestContext, - ) -> std::io::Result<(usize, FullSlice)> { - let buflen = buf.len(); - assert_eq!( - buflen % PAGE_SZ, - 0, - "{buflen} ; we know TAIL_SZ is a PAGE_SZ multiple, and write_buffered_borrowed is used" - ); - - // Do the IO. - let buf = match self.file.write_all(buf, ctx).await { - (buf, Ok(nwritten)) => { - assert_eq!(nwritten, buflen); - buf - } - (_, Err(e)) => { - return Err(std::io::Error::new( - ErrorKind::Other, - // order error before path because path is long and error is short - format!( - "ephemeral_file: write_blob: write-back tail self.nwritten_blocks={}, buflen={}, {:#}: {}", - self.nwritten_blocks, buflen, e, self.file.path, - ), - )); - } - }; - - let nblocks = buflen / PAGE_SZ; - let nblocks32 = u32::try_from(nblocks).unwrap(); - - if matches!(self.prewarm_on_write, PrewarmOnWrite::Yes) { - // Pre-warm page cache with the contents. - // At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming - // benefits the code that writes InMemoryLayer=>L0 layers. - - let cache = page_cache::get(); - static CTX: Lazy = Lazy::new(|| { - RequestContext::new( - crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache, - crate::context::DownloadBehavior::Error, - ) - }); - for blknum_in_buffer in 0..nblocks { - let blk_in_buffer = - &buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ]; - let blknum = self - .nwritten_blocks - .checked_add(blknum_in_buffer as u32) - .unwrap(); - match cache - .read_immutable_buf(self.page_cache_file_id, blknum, &CTX) - .await - { - Err(e) => { - error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}"); - // fail gracefully, it's not the end of the world if we can't pre-warm the cache here - } - Ok(v) => match v { - page_cache::ReadBufResult::Found(_guard) => { - // This function takes &mut self, so, it shouldn't be possible to reach this point. 
- unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \ - and this function takes &mut self, so, no concurrent read_blk is possible"); - } - page_cache::ReadBufResult::NotFound(mut write_guard) => { - write_guard.copy_from_slice(blk_in_buffer); - let _ = write_guard.mark_valid(); - } - }, - } - } - } - - self.nwritten_blocks = self.nwritten_blocks.checked_add(nblocks32).unwrap(); - Ok((buflen, buf)) - } -} diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 748d79c149..130d1002a0 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -13,7 +13,7 @@ use crate::tenant::ephemeral_file::EphemeralFile; use crate::tenant::timeline::GetVectoredError; use crate::tenant::PageReconstructError; use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; -use crate::{l0_flush, page_cache, walrecord}; +use crate::{l0_flush, page_cache}; use anyhow::{anyhow, Result}; use camino::Utf8PathBuf; use pageserver_api::key::CompactKey; @@ -249,9 +249,7 @@ impl InMemoryLayer { /// debugging function to print out the contents of the layer /// /// this is likely completly unused - pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> { - let inner = self.inner.read().await; - + pub async fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> { let end_str = self.end_lsn_or_max(); println!( @@ -259,39 +257,6 @@ impl InMemoryLayer { self.timeline_id, self.start_lsn, end_str, ); - if !verbose { - return Ok(()); - } - - let cursor = inner.file.block_cursor(); - let mut buf = Vec::new(); - for (key, vec_map) in inner.index.iter() { - for (lsn, pos) in vec_map.as_slice() { - let mut desc = String::new(); - cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?; - let val = Value::des(&buf); - match val { - Ok(Value::Image(img)) => { - write!(&mut desc, " img {} bytes", img.len())?; - } - Ok(Value::WalRecord(rec)) => { - let wal_desc = walrecord::describe_wal_record(&rec).unwrap(); - write!( - &mut desc, - " rec {} bytes will_init: {} {}", - buf.len(), - rec.will_init(), - wal_desc - )?; - } - Err(err) => { - write!(&mut desc, " DESERIALIZATION ERROR: {}", err)?; - } - } - println!(" key {} at {}: {}", key, lsn, desc); - } - } - Ok(()) } @@ -536,7 +501,6 @@ impl InMemoryLayer { use l0_flush::Inner; let _concurrency_permit = match l0_flush_global_state { - Inner::PageCached => None, Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await), }; @@ -568,34 +532,6 @@ impl InMemoryLayer { .await?; match l0_flush_global_state { - l0_flush::Inner::PageCached => { - let ctx = RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::InMemoryLayer) - .build(); - - let mut buf = Vec::new(); - - let cursor = inner.file.block_cursor(); - - for (key, vec_map) in inner.index.iter() { - // Write all page versions - for (lsn, pos) in vec_map.as_slice() { - cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?; - let will_init = Value::des(&buf)?.will_init(); - let (tmp, res) = delta_layer_writer - .put_value_bytes( - Key::from_compact(*key), - *lsn, - buf.slice_len(), - will_init, - &ctx, - ) - .await; - res?; - buf = tmp.into_raw_slice().into_inner(); - } - } - } l0_flush::Inner::Direct { .. 
} => { let file_contents: Vec = inner.file.load_to_vec(ctx).await?; assert_eq!( From 3b8ca477ab6852143f8acb5b8217e5f24e9e8605 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 19 Aug 2024 16:39:44 +0200 Subject: [PATCH 1445/1571] Migrate physical GC and scan_metadata to remote_storage (#8673) Migrates most of the remaining parts of the scrubber to remote_storage: * `pageserver_physical_gc` * `scan_metadata` for pageservers (safekeepers were done in #8595) * `download()` in `tenant_snapshot`. The main `tenant_snapshot` is not migrated as it uses version history to be able to work in the face of ongoing changes. Part of #7547 --- libs/remote_storage/src/azure_blob.rs | 42 +++++ libs/remote_storage/src/lib.rs | 24 ++- libs/remote_storage/src/local_fs.rs | 14 ++ libs/remote_storage/src/metrics.rs | 9 +- libs/remote_storage/src/s3_bucket.rs | 74 +++++++- libs/remote_storage/src/simulate_failures.rs | 11 ++ storage_scrubber/src/checks.rs | 85 ++++----- storage_scrubber/src/lib.rs | 69 ++++--- storage_scrubber/src/metadata_stream.rs | 170 ++++-------------- .../src/pageserver_physical_gc.rs | 134 +++++--------- .../src/scan_pageserver_metadata.rs | 40 ++--- .../src/scan_safekeeper_metadata.rs | 7 +- storage_scrubber/src/tenant_snapshot.rs | 27 +-- test_runner/fixtures/neon_fixtures.py | 1 + 14 files changed, 366 insertions(+), 341 deletions(-) diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 3c77d5a227..cb7479f6cd 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -383,6 +383,48 @@ impl RemoteStorage for AzureBlobStorage { } } + async fn head_object( + &self, + key: &RemotePath, + cancel: &CancellationToken, + ) -> Result { + let kind = RequestKind::Head; + let _permit = self.permit(kind, cancel).await?; + + let started_at = start_measuring_requests(kind); + + let blob_client = self.client.blob_client(self.relative_path_to_name(key)); + let properties_future = blob_client.get_properties().into_future(); + + let properties_future = tokio::time::timeout(self.timeout, properties_future); + + let res = tokio::select! { + res = properties_future => res, + _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()), + }; + + if let Ok(inner) = &res { + // do not incl. timeouts as errors in metrics but cancellations + let started_at = ScopeGuard::into_inner(started_at); + crate::metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, inner, started_at); + } + + let data = match res { + Ok(Ok(data)) => Ok(data), + Ok(Err(sdk)) => Err(to_download_error(sdk)), + Err(_timeout) => Err(DownloadError::Timeout), + }?; + + let properties = data.blob.properties; + Ok(ListingObject { + key: key.to_owned(), + last_modified: SystemTime::from(properties.last_modified), + size: properties.content_length, + }) + } + async fn upload( &self, from: impl Stream> + Send + Sync + 'static, diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 2c9e298f79..cc1d3e0ae4 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -150,7 +150,7 @@ pub enum ListingMode { NoDelimiter, } -#[derive(PartialEq, Eq, Debug)] +#[derive(PartialEq, Eq, Debug, Clone)] pub struct ListingObject { pub key: RemotePath, pub last_modified: SystemTime, @@ -215,6 +215,13 @@ pub trait RemoteStorage: Send + Sync + 'static { Ok(combined) } + /// Obtain metadata information about an object. 
+ async fn head_object( + &self, + key: &RemotePath, + cancel: &CancellationToken, + ) -> Result; + /// Streams the local file contents into remote into the remote storage entry. /// /// If the operation fails because of timeout or cancellation, the root cause of the error will be @@ -363,6 +370,20 @@ impl GenericRemoteStorage> { } } + // See [`RemoteStorage::head_object`]. + pub async fn head_object( + &self, + key: &RemotePath, + cancel: &CancellationToken, + ) -> Result { + match self { + Self::LocalFs(s) => s.head_object(key, cancel).await, + Self::AwsS3(s) => s.head_object(key, cancel).await, + Self::AzureBlob(s) => s.head_object(key, cancel).await, + Self::Unreliable(s) => s.head_object(key, cancel).await, + } + } + /// See [`RemoteStorage::upload`] pub async fn upload( &self, @@ -598,6 +619,7 @@ impl ConcurrencyLimiter { RequestKind::Delete => &self.write, RequestKind::Copy => &self.write, RequestKind::TimeTravel => &self.write, + RequestKind::Head => &self.read, } } diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 99b4aa4061..c3ef18cab1 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -445,6 +445,20 @@ impl RemoteStorage for LocalFs { } } + async fn head_object( + &self, + key: &RemotePath, + _cancel: &CancellationToken, + ) -> Result { + let target_file_path = key.with_base(&self.storage_root); + let metadata = file_metadata(&target_file_path).await?; + Ok(ListingObject { + key: key.clone(), + last_modified: metadata.modified()?, + size: metadata.len(), + }) + } + async fn upload( &self, data: impl Stream> + Send + Sync, diff --git a/libs/remote_storage/src/metrics.rs b/libs/remote_storage/src/metrics.rs index bbb51590f3..f1aa4c433b 100644 --- a/libs/remote_storage/src/metrics.rs +++ b/libs/remote_storage/src/metrics.rs @@ -13,6 +13,7 @@ pub(crate) enum RequestKind { List = 3, Copy = 4, TimeTravel = 5, + Head = 6, } use scopeguard::ScopeGuard; @@ -27,6 +28,7 @@ impl RequestKind { List => "list_objects", Copy => "copy_object", TimeTravel => "time_travel_recover", + Head => "head_object", } } const fn as_index(&self) -> usize { @@ -34,7 +36,8 @@ impl RequestKind { } } -pub(crate) struct RequestTyped([C; 6]); +const REQUEST_KIND_COUNT: usize = 7; +pub(crate) struct RequestTyped([C; REQUEST_KIND_COUNT]); impl RequestTyped { pub(crate) fn get(&self, kind: RequestKind) -> &C { @@ -43,8 +46,8 @@ impl RequestTyped { fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self { use RequestKind::*; - let mut it = [Get, Put, Delete, List, Copy, TimeTravel].into_iter(); - let arr = std::array::from_fn::(|index| { + let mut it = [Get, Put, Delete, List, Copy, TimeTravel, Head].into_iter(); + let arr = std::array::from_fn::(|index| { let next = it.next().unwrap(); assert_eq!(index, next.as_index()); f(next) diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 1f25da813d..11f6598cbf 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -23,7 +23,7 @@ use aws_config::{ use aws_sdk_s3::{ config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep}, error::SdkError, - operation::get_object::GetObjectError, + operation::{get_object::GetObjectError, head_object::HeadObjectError}, types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass}, Client, }; @@ -604,6 +604,78 @@ impl RemoteStorage for S3Bucket { } } + async fn head_object( + &self, + key: &RemotePath, + cancel: &CancellationToken, + ) -> Result { + 
let kind = RequestKind::Head; + let _permit = self.permit(kind, cancel).await?; + + let started_at = start_measuring_requests(kind); + + let head_future = self + .client + .head_object() + .bucket(self.bucket_name()) + .key(self.relative_path_to_s3_object(key)) + .send(); + + let head_future = tokio::time::timeout(self.timeout, head_future); + + let res = tokio::select! { + res = head_future => res, + _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()), + }; + + let res = res.map_err(|_e| DownloadError::Timeout)?; + + // do not incl. timeouts as errors in metrics but cancellations + let started_at = ScopeGuard::into_inner(started_at); + crate::metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, &res, started_at); + + let data = match res { + Ok(object_output) => object_output, + Err(SdkError::ServiceError(e)) if matches!(e.err(), HeadObjectError::NotFound(_)) => { + // Count this in the AttemptOutcome::Ok bucket, because 404 is not + // an error: we expect to sometimes fetch an object and find it missing, + // e.g. when probing for timeline indices. + crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed( + kind, + AttemptOutcome::Ok, + started_at, + ); + return Err(DownloadError::NotFound); + } + Err(e) => { + crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed( + kind, + AttemptOutcome::Err, + started_at, + ); + + return Err(DownloadError::Other( + anyhow::Error::new(e).context("s3 head object"), + )); + } + }; + + let (Some(last_modified), Some(size)) = (data.last_modified, data.content_length) else { + return Err(DownloadError::Other(anyhow!( + "head_object doesn't contain last_modified or content_length" + )))?; + }; + Ok(ListingObject { + key: key.to_owned(), + last_modified: SystemTime::try_from(last_modified).map_err(|e| { + DownloadError::Other(anyhow!("can't convert time '{last_modified}': {e}")) + })?, + size: size as u64, + }) + } + async fn upload( &self, from: impl Stream> + Send + Sync + 'static, diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index 13f873dcdb..c7eb634af3 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -30,6 +30,7 @@ pub struct UnreliableWrapper { #[derive(Debug, Hash, Eq, PartialEq)] enum RemoteOp { ListPrefixes(Option), + HeadObject(RemotePath), Upload(RemotePath), Download(RemotePath), Delete(RemotePath), @@ -137,6 +138,16 @@ impl RemoteStorage for UnreliableWrapper { self.inner.list(prefix, mode, max_keys, cancel).await } + async fn head_object( + &self, + key: &RemotePath, + cancel: &CancellationToken, + ) -> Result { + self.attempt(RemoteOp::HeadObject(key.clone())) + .map_err(DownloadError::Other)?; + self.inner.head_object(key, cancel).await + } + async fn upload( &self, data: impl Stream> + Send + Sync + 'static, diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index 35ec69fd50..9063b3c197 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -1,22 +1,22 @@ use std::collections::{HashMap, HashSet}; use anyhow::Context; -use aws_sdk_s3::Client; use pageserver::tenant::layer_map::LayerMap; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use pageserver_api::shard::ShardIndex; +use tokio_util::sync::CancellationToken; use tracing::{error, info, warn}; use utils::generation::Generation; use utils::id::TimelineId; use crate::cloud_admin_api::BranchData; -use crate::metadata_stream::stream_listing; -use 
crate::{download_object_with_retries, RootTarget, TenantShardTimelineId}; +use crate::metadata_stream::stream_listing_generic; +use crate::{download_object_with_retries_generic, RootTarget, TenantShardTimelineId}; use futures_util::StreamExt; use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path}; use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::IndexPart; -use remote_storage::RemotePath; +use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath}; pub(crate) struct TimelineAnalysis { /// Anomalies detected @@ -48,13 +48,12 @@ impl TimelineAnalysis { } pub(crate) async fn branch_cleanup_and_check_errors( - s3_client: &Client, - target: &RootTarget, + remote_client: &GenericRemoteStorage, id: &TenantShardTimelineId, tenant_objects: &mut TenantObjectListing, s3_active_branch: Option<&BranchData>, console_branch: Option, - s3_data: Option, + s3_data: Option, ) -> TimelineAnalysis { let mut result = TimelineAnalysis::new(); @@ -78,7 +77,9 @@ pub(crate) async fn branch_cleanup_and_check_errors( match s3_data { Some(s3_data) => { - result.garbage_keys.extend(s3_data.unknown_keys); + result + .garbage_keys + .extend(s3_data.unknown_keys.into_iter().map(|k| k.key.to_string())); match s3_data.blob_data { BlobDataParseResult::Parsed { @@ -143,11 +144,8 @@ pub(crate) async fn branch_cleanup_and_check_errors( // HEAD request used here to address a race condition when an index was uploaded concurrently // with our scan. We check if the object is uploaded to S3 after taking the listing snapshot. - let response = s3_client - .head_object() - .bucket(target.bucket_name()) - .key(path.get_path().as_str()) - .send() + let response = remote_client + .head_object(&path, &CancellationToken::new()) .await; if response.is_err() { @@ -284,14 +282,14 @@ impl TenantObjectListing { } #[derive(Debug)] -pub(crate) struct S3TimelineBlobData { +pub(crate) struct RemoteTimelineBlobData { pub(crate) blob_data: BlobDataParseResult, // Index objects that were not used when loading `blob_data`, e.g. those from old generations - pub(crate) unused_index_keys: Vec, + pub(crate) unused_index_keys: Vec, // Objects whose keys were not recognized at all, i.e. 
not layer files, not indices - pub(crate) unknown_keys: Vec, + pub(crate) unknown_keys: Vec, } #[derive(Debug)] @@ -322,11 +320,11 @@ pub(crate) fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generati } } -pub(crate) async fn list_timeline_blobs( - s3_client: &Client, +pub(crate) async fn list_timeline_blobs_generic( + remote_client: &GenericRemoteStorage, id: TenantShardTimelineId, s3_root: &RootTarget, -) -> anyhow::Result { +) -> anyhow::Result { let mut s3_layers = HashSet::new(); let mut errors = Vec::new(); @@ -335,19 +333,25 @@ pub(crate) async fn list_timeline_blobs( let mut timeline_dir_target = s3_root.timeline_root(&id); timeline_dir_target.delimiter = String::new(); - let mut index_part_keys: Vec = Vec::new(); + let mut index_part_keys: Vec = Vec::new(); let mut initdb_archive: bool = false; - let mut stream = std::pin::pin!(stream_listing(s3_client, &timeline_dir_target)); - while let Some(obj) = stream.next().await { - let obj = obj?; - let key = obj.key(); + let prefix_str = &timeline_dir_target + .prefix_in_bucket + .strip_prefix("/") + .unwrap_or(&timeline_dir_target.prefix_in_bucket); - let blob_name = key.strip_prefix(&timeline_dir_target.prefix_in_bucket); + let mut stream = std::pin::pin!(stream_listing_generic(remote_client, &timeline_dir_target)); + while let Some(obj) = stream.next().await { + let (key, Some(obj)) = obj? else { + panic!("ListingObject not specified"); + }; + + let blob_name = key.get_path().as_str().strip_prefix(prefix_str); match blob_name { Some(name) if name.starts_with("index_part.json") => { tracing::debug!("Index key {key}"); - index_part_keys.push(key.to_owned()) + index_part_keys.push(obj) } Some("initdb.tar.zst") => { tracing::debug!("initdb archive {key}"); @@ -358,7 +362,7 @@ pub(crate) async fn list_timeline_blobs( } Some(maybe_layer_name) => match parse_layer_object_name(maybe_layer_name) { Ok((new_layer, gen)) => { - tracing::debug!("Parsed layer key: {} {:?}", new_layer, gen); + tracing::debug!("Parsed layer key: {new_layer} {gen:?}"); s3_layers.insert((new_layer, gen)); } Err(e) => { @@ -366,13 +370,13 @@ pub(crate) async fn list_timeline_blobs( errors.push( format!("S3 list response got an object with key {key} that is not a layer name: {e}"), ); - unknown_keys.push(key.to_string()); + unknown_keys.push(obj); } }, None => { - tracing::warn!("Unknown key {}", key); + tracing::warn!("Unknown key {key}"); errors.push(format!("S3 list response got an object with odd key {key}")); - unknown_keys.push(key.to_string()); + unknown_keys.push(obj); } } } @@ -381,7 +385,7 @@ pub(crate) async fn list_timeline_blobs( tracing::debug!( "Timeline is empty apart from initdb archive: expected post-deletion state." ); - return Ok(S3TimelineBlobData { + return Ok(RemoteTimelineBlobData { blob_data: BlobDataParseResult::Relic, unused_index_keys: index_part_keys, unknown_keys: Vec::new(), @@ -395,13 +399,13 @@ pub(crate) async fn list_timeline_blobs( // Stripping the index key to the last part, because RemotePath doesn't // like absolute paths, and depending on prefix_in_bucket it's possible // for the keys we read back to start with a slash. 
- let basename = key.rsplit_once('/').unwrap().1; + let basename = key.key.get_path().as_str().rsplit_once('/').unwrap().1; parse_remote_index_path(RemotePath::from_string(basename).unwrap()).map(|g| (key, g)) }) .max_by_key(|i| i.1) .map(|(k, g)| (k.clone(), g)) { - Some((key, gen)) => (Some(key), gen), + Some((key, gen)) => (Some::(key.to_owned()), gen), None => { // Legacy/missing case: one or zero index parts, which did not have a generation (index_part_keys.pop(), Generation::none()) @@ -416,17 +420,14 @@ pub(crate) async fn list_timeline_blobs( } if let Some(index_part_object_key) = index_part_object.as_ref() { - let index_part_bytes = download_object_with_retries( - s3_client, - &timeline_dir_target.bucket_name, - index_part_object_key, - ) - .await - .context("index_part.json download")?; + let index_part_bytes = + download_object_with_retries_generic(remote_client, &index_part_object_key.key) + .await + .context("index_part.json download")?; match serde_json::from_slice(&index_part_bytes) { Ok(index_part) => { - return Ok(S3TimelineBlobData { + return Ok(RemoteTimelineBlobData { blob_data: BlobDataParseResult::Parsed { index_part: Box::new(index_part), index_part_generation, @@ -448,7 +449,7 @@ pub(crate) async fn list_timeline_blobs( ); } - Ok(S3TimelineBlobData { + Ok(RemoteTimelineBlobData { blob_data: BlobDataParseResult::Incorrect { errors, s3_layers }, unused_index_keys: index_part_keys, unknown_keys, diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index 1fc94cc174..3183bc3c64 100644 --- a/storage_scrubber/src/lib.rs +++ b/storage_scrubber/src/lib.rs @@ -452,23 +452,26 @@ fn stream_objects_with_retries<'a>( let mut list_stream = storage_client.list_streaming(Some(&prefix), listing_mode, None, &cancel); while let Some(res) = list_stream.next().await { - if let Err(err) = res { - let yield_err = if err.is_permanent() { - true - } else { - let backoff_time = 1 << trial.max(5); - tokio::time::sleep(Duration::from_secs(backoff_time)).await; - trial += 1; - trial == MAX_RETRIES - 1 - }; - if yield_err { - yield Err(err) - .with_context(|| format!("Failed to list objects {MAX_RETRIES} times")); - break; + match res { + Err(err) => { + let yield_err = if err.is_permanent() { + true + } else { + let backoff_time = 1 << trial.max(5); + tokio::time::sleep(Duration::from_secs(backoff_time)).await; + trial += 1; + trial == MAX_RETRIES - 1 + }; + if yield_err { + yield Err(err) + .with_context(|| format!("Failed to list objects {MAX_RETRIES} times")); + break; + } + } + Ok(res) => { + trial = 0; + yield Ok(res); } - } else { - trial = 0; - yield res.map_err(anyhow::Error::from); } } } @@ -513,41 +516,35 @@ async fn list_objects_with_retries_generic( panic!("MAX_RETRIES is not allowed to be 0"); } -async fn download_object_with_retries( - s3_client: &Client, - bucket_name: &str, - key: &str, +async fn download_object_with_retries_generic( + remote_client: &GenericRemoteStorage, + key: &RemotePath, ) -> anyhow::Result> { - for _ in 0..MAX_RETRIES { - let mut body_buf = Vec::new(); - let response_stream = match s3_client - .get_object() - .bucket(bucket_name) - .key(key) - .send() - .await - { + let cancel = CancellationToken::new(); + for trial in 0..MAX_RETRIES { + let mut buf = Vec::new(); + let download = match remote_client.download(key, &cancel).await { Ok(response) => response, Err(e) => { error!("Failed to download object for key {key}: {e}"); - tokio::time::sleep(Duration::from_secs(1)).await; + let backoff_time = 1 << trial.max(5); + 
tokio::time::sleep(Duration::from_secs(backoff_time)).await; continue; } }; - match response_stream - .body - .into_async_read() - .read_to_end(&mut body_buf) + match tokio_util::io::StreamReader::new(download.download_stream) + .read_to_end(&mut buf) .await { Ok(bytes_read) => { tracing::debug!("Downloaded {bytes_read} bytes for object {key}"); - return Ok(body_buf); + return Ok(buf); } Err(e) => { error!("Failed to stream object body for key {key}: {e}"); - tokio::time::sleep(Duration::from_secs(1)).await; + let backoff_time = 1 << trial.max(5); + tokio::time::sleep(Duration::from_secs(backoff_time)).await; } } } diff --git a/storage_scrubber/src/metadata_stream.rs b/storage_scrubber/src/metadata_stream.rs index 54812ffc94..eca774413a 100644 --- a/storage_scrubber/src/metadata_stream.rs +++ b/storage_scrubber/src/metadata_stream.rs @@ -2,14 +2,14 @@ use std::str::FromStr; use anyhow::{anyhow, Context}; use async_stream::{stream, try_stream}; -use aws_sdk_s3::{types::ObjectIdentifier, Client}; +use aws_sdk_s3::Client; use futures::StreamExt; use remote_storage::{GenericRemoteStorage, ListingMode, ListingObject, RemotePath}; use tokio_stream::Stream; use crate::{ - list_objects_with_retries, stream_objects_with_retries, RootTarget, S3Target, - TenantShardTimelineId, + list_objects_with_retries, list_objects_with_retries_generic, stream_objects_with_retries, + RootTarget, S3Target, TenantShardTimelineId, }; use pageserver_api::shard::TenantShardId; use utils::id::{TenantId, TimelineId}; @@ -75,53 +75,38 @@ pub fn stream_tenants<'a>( } pub async fn stream_tenant_shards<'a>( - s3_client: &'a Client, + remote_client: &'a GenericRemoteStorage, target: &'a RootTarget, tenant_id: TenantId, ) -> anyhow::Result> + 'a> { - let mut tenant_shard_ids: Vec> = Vec::new(); - let mut continuation_token = None; let shards_target = target.tenant_shards_prefix(&tenant_id); - loop { - tracing::info!("Listing in {}", shards_target.prefix_in_bucket); - let fetch_response = - list_objects_with_retries(s3_client, &shards_target, continuation_token.clone()).await; - let fetch_response = match fetch_response { - Err(e) => { - tenant_shard_ids.push(Err(e)); - break; - } - Ok(r) => r, - }; + let strip_prefix = target.tenants_root().prefix_in_bucket; + let prefix_str = &strip_prefix.strip_prefix("/").unwrap_or(&strip_prefix); - let new_entry_ids = fetch_response - .common_prefixes() - .iter() - .filter_map(|prefix| prefix.prefix()) - .filter_map(|prefix| -> Option<&str> { - prefix - .strip_prefix(&target.tenants_root().prefix_in_bucket)? 
- .strip_suffix('/') - }) - .map(|entry_id_str| { - let first_part = entry_id_str.split('/').next().unwrap(); + tracing::info!("Listing shards in {}", shards_target.prefix_in_bucket); + let listing = list_objects_with_retries_generic( + remote_client, + ListingMode::WithDelimiter, + &shards_target, + ) + .await?; - first_part - .parse::() - .with_context(|| format!("Incorrect entry id str: {first_part}")) - }); + let tenant_shard_ids = listing + .prefixes + .iter() + .map(|prefix| prefix.get_path().as_str()) + .filter_map(|prefix| -> Option<&str> { prefix.strip_prefix(prefix_str) }) + .map(|entry_id_str| { + let first_part = entry_id_str.split('/').next().unwrap(); - for i in new_entry_ids { - tenant_shard_ids.push(i); - } - - match fetch_response.next_continuation_token { - Some(new_token) => continuation_token = Some(new_token), - None => break, - } - } + first_part + .parse::() + .with_context(|| format!("Incorrect entry id str: {first_part}")) + }) + .collect::>(); + tracing::debug!("Yielding {} shards for {tenant_id}", tenant_shard_ids.len()); Ok(stream! { for i in tenant_shard_ids { let id = i?; @@ -130,65 +115,6 @@ pub async fn stream_tenant_shards<'a>( }) } -/// Given a TenantShardId, output a stream of the timelines within that tenant, discovered -/// using ListObjectsv2. The listing is done before the stream is built, so that this -/// function can be used to generate concurrency on a stream using buffer_unordered. -pub async fn stream_tenant_timelines<'a>( - s3_client: &'a Client, - target: &'a RootTarget, - tenant: TenantShardId, -) -> anyhow::Result> + 'a> { - let mut timeline_ids: Vec> = Vec::new(); - let mut continuation_token = None; - let timelines_target = target.timelines_root(&tenant); - - loop { - tracing::debug!("Listing in {}", tenant); - let fetch_response = - list_objects_with_retries(s3_client, &timelines_target, continuation_token.clone()) - .await; - let fetch_response = match fetch_response { - Err(e) => { - timeline_ids.push(Err(e)); - break; - } - Ok(r) => r, - }; - - let new_entry_ids = fetch_response - .common_prefixes() - .iter() - .filter_map(|prefix| prefix.prefix()) - .filter_map(|prefix| -> Option<&str> { - prefix - .strip_prefix(&timelines_target.prefix_in_bucket)? - .strip_suffix('/') - }) - .map(|entry_id_str| { - entry_id_str - .parse::() - .with_context(|| format!("Incorrect entry id str: {entry_id_str}")) - }); - - for i in new_entry_ids { - timeline_ids.push(i); - } - - match fetch_response.next_continuation_token { - Some(new_token) => continuation_token = Some(new_token), - None => break, - } - } - - tracing::debug!("Yielding for {}", tenant); - Ok(stream! { - for i in timeline_ids { - let id = i?; - yield Ok(TenantShardTimelineId::new(tenant, id)); - } - }) -} - /// Given a `TenantShardId`, output a stream of the timelines within that tenant, discovered /// using a listing. The listing is done before the stream is built, so that this /// function can be used to generate concurrency on a stream using buffer_unordered. 
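The hunk below applies the same prefix-normalization pattern as the shard listing above: drop any leading '/' from the configured `prefix_in_bucket`, then strip that normalized prefix (and the trailing delimiter) from each listed path to recover the entry id. A minimal standalone sketch of the idea, using only std Rust; `entry_id_from_listed_prefix` and the example key layout are made up for illustration and are not the scrubber's actual code:

fn entry_id_from_listed_prefix(prefix_in_bucket: &str, listed: &str) -> Option<String> {
    // Drop a leading '/' from the configured prefix, if any, so it lines up
    // with the paths returned by the generic listing API.
    let prefix = prefix_in_bucket.strip_prefix('/').unwrap_or(prefix_in_bucket);
    // Strip the normalized prefix and any trailing delimiter from the listed entry.
    let rest = listed.strip_prefix(prefix)?;
    let rest = rest.strip_suffix('/').unwrap_or(rest);
    // The first remaining path segment is the entry id (tenant shard or timeline).
    rest.split('/').next().map(|s| s.to_string())
}

fn main() {
    // Hypothetical bucket layout, for illustration only.
    let id = entry_id_from_listed_prefix(
        "/pageserver/v1/tenants/1234/timelines/",
        "pageserver/v1/tenants/1234/timelines/abcd/",
    );
    assert_eq!(id.as_deref(), Some("abcd"));
}
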
@@ -200,6 +126,11 @@ pub async fn stream_tenant_timelines_generic<'a>( let mut timeline_ids: Vec> = Vec::new(); let timelines_target = target.timelines_root(&tenant); + let prefix_str = &timelines_target + .prefix_in_bucket + .strip_prefix("/") + .unwrap_or(&timelines_target.prefix_in_bucket); + let mut objects_stream = std::pin::pin!(stream_objects_with_retries( remote_client, ListingMode::WithDelimiter, @@ -220,11 +151,7 @@ pub async fn stream_tenant_timelines_generic<'a>( .prefixes .iter() .filter_map(|prefix| -> Option<&str> { - prefix - .get_path() - .as_str() - .strip_prefix(&timelines_target.prefix_in_bucket)? - .strip_suffix('/') + prefix.get_path().as_str().strip_prefix(prefix_str) }) .map(|entry_id_str| { entry_id_str @@ -237,7 +164,7 @@ pub async fn stream_tenant_timelines_generic<'a>( } } - tracing::debug!("Yielding for {}", tenant); + tracing::debug!("Yielding {} timelines for {}", timeline_ids.len(), tenant); Ok(stream! { for i in timeline_ids { let id = i?; @@ -246,37 +173,6 @@ pub async fn stream_tenant_timelines_generic<'a>( }) } -pub(crate) fn stream_listing<'a>( - s3_client: &'a Client, - target: &'a S3Target, -) -> impl Stream> + 'a { - try_stream! { - let mut continuation_token = None; - loop { - let fetch_response = - list_objects_with_retries(s3_client, target, continuation_token.clone()).await?; - - if target.delimiter.is_empty() { - for object_key in fetch_response.contents().iter().filter_map(|object| object.key()) - { - let object_id = ObjectIdentifier::builder().key(object_key).build()?; - yield object_id; - } - } else { - for prefix in fetch_response.common_prefixes().iter().filter_map(|p| p.prefix()) { - let object_id = ObjectIdentifier::builder().key(prefix).build()?; - yield object_id; - } - } - - match fetch_response.next_continuation_token { - Some(new_token) => continuation_token = Some(new_token), - None => break, - } - } - } -} - pub(crate) fn stream_listing_generic<'a>( remote_client: &'a GenericRemoteStorage, target: &'a S3Target, diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index 20d9bd6dd4..6828081128 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -1,11 +1,10 @@ use std::collections::{BTreeMap, BTreeSet, HashMap}; use std::sync::Arc; -use std::time::{Duration, SystemTime}; +use std::time::Duration; -use crate::checks::{list_timeline_blobs, BlobDataParseResult}; -use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; -use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId}; -use aws_sdk_s3::Client; +use crate::checks::{list_timeline_blobs_generic, BlobDataParseResult}; +use crate::metadata_stream::{stream_tenant_timelines_generic, stream_tenants_generic}; +use crate::{init_remote_generic, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId}; use futures_util::{StreamExt, TryStreamExt}; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path}; @@ -13,10 +12,11 @@ use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::IndexPart; use pageserver_api::controller_api::TenantDescribeResponse; use pageserver_api::shard::{ShardIndex, TenantShardId}; -use remote_storage::RemotePath; +use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath}; use reqwest::Method; use serde::Serialize; use storage_controller_client::control_api; +use 
tokio_util::sync::CancellationToken; use tracing::{info_span, Instrument}; use utils::generation::Generation; use utils::id::{TenantId, TenantTimelineId}; @@ -240,38 +240,13 @@ impl TenantRefAccumulator { } } -async fn is_old_enough( - s3_client: &Client, - bucket_config: &BucketConfig, - min_age: &Duration, - key: &str, - summary: &mut GcSummary, -) -> bool { +fn is_old_enough(min_age: &Duration, key: &ListingObject, summary: &mut GcSummary) -> bool { // Validation: we will only GC indices & layers after a time threshold (e.g. one week) so that during an incident // it is easier to read old data for analysis, and easier to roll back shard splits without having to un-delete any objects. - let age: Duration = match s3_client - .head_object() - .bucket(&bucket_config.bucket) - .key(key) - .send() - .await - { - Ok(response) => match response.last_modified { - None => { - tracing::warn!("Missing last_modified"); - summary.remote_storage_errors += 1; - return false; - } - Some(last_modified) => match SystemTime::try_from(last_modified).map(|t| t.elapsed()) { - Ok(Ok(e)) => e, - Err(_) | Ok(Err(_)) => { - tracing::warn!("Bad last_modified time: {last_modified:?}"); - return false; - } - }, - }, - Err(e) => { - tracing::warn!("Failed to HEAD {key}: {e}"); + let age = match key.last_modified.elapsed() { + Ok(e) => e, + Err(_) => { + tracing::warn!("Bad last_modified time: {:?}", key.last_modified); summary.remote_storage_errors += 1; return false; } @@ -289,17 +264,30 @@ async fn is_old_enough( old_enough } +/// Same as [`is_old_enough`], but doesn't require a [`ListingObject`] passed to it. +async fn check_is_old_enough( + remote_client: &GenericRemoteStorage, + key: &RemotePath, + min_age: &Duration, + summary: &mut GcSummary, +) -> Option { + let listing_object = remote_client + .head_object(key, &CancellationToken::new()) + .await + .ok()?; + Some(is_old_enough(min_age, &listing_object, summary)) +} + async fn maybe_delete_index( - s3_client: &Client, - bucket_config: &BucketConfig, + remote_client: &GenericRemoteStorage, min_age: &Duration, latest_gen: Generation, - key: &str, + obj: &ListingObject, mode: GcMode, summary: &mut GcSummary, ) { // Validation: we will only delete things that parse cleanly - let basename = key.rsplit_once('/').unwrap().1; + let basename = obj.key.get_path().file_name().unwrap(); let candidate_generation = match parse_remote_index_path(RemotePath::from_string(basename).unwrap()) { Some(g) => g, @@ -328,7 +316,7 @@ async fn maybe_delete_index( return; } - if !is_old_enough(s3_client, bucket_config, min_age, key, summary).await { + if !is_old_enough(min_age, obj, summary) { return; } @@ -338,11 +326,8 @@ async fn maybe_delete_index( } // All validations passed: erase the object - match s3_client - .delete_object() - .bucket(&bucket_config.bucket) - .key(key) - .send() + match remote_client + .delete(&obj.key, &CancellationToken::new()) .await { Ok(_) => { @@ -358,8 +343,7 @@ async fn maybe_delete_index( #[allow(clippy::too_many_arguments)] async fn gc_ancestor( - s3_client: &Client, - bucket_config: &BucketConfig, + remote_client: &GenericRemoteStorage, root_target: &RootTarget, min_age: &Duration, ancestor: TenantShardId, @@ -368,7 +352,7 @@ async fn gc_ancestor( summary: &mut GcSummary, ) -> anyhow::Result<()> { // Scan timelines in the ancestor - let timelines = stream_tenant_timelines(s3_client, root_target, ancestor).await?; + let timelines = stream_tenant_timelines_generic(remote_client, root_target, ancestor).await?; let mut timelines = 
std::pin::pin!(timelines); // Build a list of keys to retain @@ -376,7 +360,7 @@ async fn gc_ancestor( while let Some(ttid) = timelines.next().await { let ttid = ttid?; - let data = list_timeline_blobs(s3_client, ttid, root_target).await?; + let data = list_timeline_blobs_generic(remote_client, ttid, root_target).await?; let s3_layers = match data.blob_data { BlobDataParseResult::Parsed { @@ -427,7 +411,8 @@ async fn gc_ancestor( // We apply a time threshold to GCing objects that are un-referenced: this preserves our ability // to roll back a shard split if we have to, by avoiding deleting ancestor layers right away - if !is_old_enough(s3_client, bucket_config, min_age, &key, summary).await { + let path = RemotePath::from_string(key.strip_prefix("/").unwrap_or(&key)).unwrap(); + if check_is_old_enough(remote_client, &path, min_age, summary).await != Some(true) { continue; } @@ -437,13 +422,7 @@ async fn gc_ancestor( } // All validations passed: erase the object - match s3_client - .delete_object() - .bucket(&bucket_config.bucket) - .key(&key) - .send() - .await - { + match remote_client.delete(&path, &CancellationToken::new()).await { Ok(_) => { tracing::info!("Successfully deleted unreferenced ancestor layer {key}"); summary.ancestor_layers_deleted += 1; @@ -477,10 +456,11 @@ pub async fn pageserver_physical_gc( min_age: Duration, mode: GcMode, ) -> anyhow::Result { - let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?; + let (remote_client, target) = + init_remote_generic(bucket_config.clone(), NodeKind::Pageserver).await?; let tenants = if tenant_shard_ids.is_empty() { - futures::future::Either::Left(stream_tenants(&s3_client, &target)) + futures::future::Either::Left(stream_tenants_generic(&remote_client, &target)) } else { futures::future::Either::Right(futures::stream::iter(tenant_shard_ids.into_iter().map(Ok))) }; @@ -493,14 +473,13 @@ pub async fn pageserver_physical_gc( let accumulator = Arc::new(std::sync::Mutex::new(TenantRefAccumulator::default())); // Generate a stream of TenantTimelineId - let timelines = tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, t)); + let timelines = tenants.map_ok(|t| stream_tenant_timelines_generic(&remote_client, &target, t)); let timelines = timelines.try_buffered(CONCURRENCY); let timelines = timelines.try_flatten(); // Generate a stream of S3TimelineBlobData async fn gc_timeline( - s3_client: &Client, - bucket_config: &BucketConfig, + remote_client: &GenericRemoteStorage, min_age: &Duration, target: &RootTarget, mode: GcMode, @@ -508,7 +487,7 @@ pub async fn pageserver_physical_gc( accumulator: &Arc>, ) -> anyhow::Result { let mut summary = GcSummary::default(); - let data = list_timeline_blobs(s3_client, ttid, target).await?; + let data = list_timeline_blobs_generic(remote_client, ttid, target).await?; let (index_part, latest_gen, candidates) = match &data.blob_data { BlobDataParseResult::Parsed { @@ -533,17 +512,9 @@ pub async fn pageserver_physical_gc( accumulator.lock().unwrap().update(ttid, index_part); for key in candidates { - maybe_delete_index( - s3_client, - bucket_config, - min_age, - latest_gen, - &key, - mode, - &mut summary, - ) - .instrument(info_span!("maybe_delete_index", %ttid, ?latest_gen, key)) - .await; + maybe_delete_index(remote_client, min_age, latest_gen, &key, mode, &mut summary) + .instrument(info_span!("maybe_delete_index", %ttid, ?latest_gen, %key.key)) + .await; } Ok(summary) @@ -554,15 +525,7 @@ pub async fn pageserver_physical_gc( // Drain futures for per-shard 
GC, populating accumulator as a side effect { let timelines = timelines.map_ok(|ttid| { - gc_timeline( - &s3_client, - bucket_config, - &min_age, - &target, - mode, - ttid, - &accumulator, - ) + gc_timeline(&remote_client, &min_age, &target, mode, ttid, &accumulator) }); let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY)); @@ -586,8 +549,7 @@ pub async fn pageserver_physical_gc( for ancestor_shard in ancestor_shards { gc_ancestor( - &s3_client, - bucket_config, + &remote_client, &target, &min_age, ancestor_shard, diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs index 2409b7b132..e89e97ccb6 100644 --- a/storage_scrubber/src/scan_pageserver_metadata.rs +++ b/storage_scrubber/src/scan_pageserver_metadata.rs @@ -1,16 +1,16 @@ use std::collections::{HashMap, HashSet}; use crate::checks::{ - branch_cleanup_and_check_errors, list_timeline_blobs, BlobDataParseResult, S3TimelineBlobData, - TenantObjectListing, TimelineAnalysis, + branch_cleanup_and_check_errors, list_timeline_blobs_generic, BlobDataParseResult, + RemoteTimelineBlobData, TenantObjectListing, TimelineAnalysis, }; -use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; -use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId}; -use aws_sdk_s3::Client; +use crate::metadata_stream::{stream_tenant_timelines_generic, stream_tenants_generic}; +use crate::{init_remote_generic, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId}; use futures_util::{StreamExt, TryStreamExt}; use pageserver::tenant::remote_timeline_client::remote_layer_path; use pageserver_api::controller_api::MetadataHealthUpdateRequest; use pageserver_api::shard::TenantShardId; +use remote_storage::GenericRemoteStorage; use serde::Serialize; use utils::id::TenantId; use utils::shard::ShardCount; @@ -36,7 +36,7 @@ impl MetadataSummary { Self::default() } - fn update_data(&mut self, data: &S3TimelineBlobData) { + fn update_data(&mut self, data: &RemoteTimelineBlobData) { self.timeline_shard_count += 1; if let BlobDataParseResult::Parsed { index_part, @@ -120,10 +120,10 @@ pub async fn scan_pageserver_metadata( bucket_config: BucketConfig, tenant_ids: Vec, ) -> anyhow::Result { - let (s3_client, target) = init_remote(bucket_config, NodeKind::Pageserver).await?; + let (remote_client, target) = init_remote_generic(bucket_config, NodeKind::Pageserver).await?; let tenants = if tenant_ids.is_empty() { - futures::future::Either::Left(stream_tenants(&s3_client, &target)) + futures::future::Either::Left(stream_tenants_generic(&remote_client, &target)) } else { futures::future::Either::Right(futures::stream::iter(tenant_ids.into_iter().map(Ok))) }; @@ -133,20 +133,20 @@ pub async fn scan_pageserver_metadata( const CONCURRENCY: usize = 32; // Generate a stream of TenantTimelineId - let timelines = tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, t)); + let timelines = tenants.map_ok(|t| stream_tenant_timelines_generic(&remote_client, &target, t)); let timelines = timelines.try_buffered(CONCURRENCY); let timelines = timelines.try_flatten(); // Generate a stream of S3TimelineBlobData async fn report_on_timeline( - s3_client: &Client, + remote_client: &GenericRemoteStorage, target: &RootTarget, ttid: TenantShardTimelineId, - ) -> anyhow::Result<(TenantShardTimelineId, S3TimelineBlobData)> { - let data = list_timeline_blobs(s3_client, ttid, target).await?; + ) -> anyhow::Result<(TenantShardTimelineId, RemoteTimelineBlobData)> { + let data = 
list_timeline_blobs_generic(remote_client, ttid, target).await?; Ok((ttid, data)) } - let timelines = timelines.map_ok(|ttid| report_on_timeline(&s3_client, &target, ttid)); + let timelines = timelines.map_ok(|ttid| report_on_timeline(&remote_client, &target, ttid)); let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY)); // We must gather all the TenantShardTimelineId->S3TimelineBlobData for each tenant, because different @@ -157,12 +157,11 @@ pub async fn scan_pageserver_metadata( let mut tenant_timeline_results = Vec::new(); async fn analyze_tenant( - s3_client: &Client, - target: &RootTarget, + remote_client: &GenericRemoteStorage, tenant_id: TenantId, summary: &mut MetadataSummary, mut tenant_objects: TenantObjectListing, - timelines: Vec<(TenantShardTimelineId, S3TimelineBlobData)>, + timelines: Vec<(TenantShardTimelineId, RemoteTimelineBlobData)>, highest_shard_count: ShardCount, ) { summary.tenant_count += 1; @@ -191,8 +190,7 @@ pub async fn scan_pageserver_metadata( // Apply checks to this timeline shard's metadata, and in the process update `tenant_objects` // reference counts for layers across the tenant. let analysis = branch_cleanup_and_check_errors( - s3_client, - target, + remote_client, &ttid, &mut tenant_objects, None, @@ -273,8 +271,7 @@ pub async fn scan_pageserver_metadata( let tenant_objects = std::mem::take(&mut tenant_objects); let timelines = std::mem::take(&mut tenant_timeline_results); analyze_tenant( - &s3_client, - &target, + &remote_client, prev_tenant_id, &mut summary, tenant_objects, @@ -311,8 +308,7 @@ pub async fn scan_pageserver_metadata( if !tenant_timeline_results.is_empty() { analyze_tenant( - &s3_client, - &target, + &remote_client, tenant_id.expect("Must be set if results are present"), &mut summary, tenant_objects, diff --git a/storage_scrubber/src/scan_safekeeper_metadata.rs b/storage_scrubber/src/scan_safekeeper_metadata.rs index 08a4541c5c..f20fa27d13 100644 --- a/storage_scrubber/src/scan_safekeeper_metadata.rs +++ b/storage_scrubber/src/scan_safekeeper_metadata.rs @@ -188,6 +188,11 @@ async fn check_timeline( // we need files, so unset it. 
timeline_dir_target.delimiter = String::new(); + let prefix_str = &timeline_dir_target + .prefix_in_bucket + .strip_prefix("/") + .unwrap_or(&timeline_dir_target.prefix_in_bucket); + let mut stream = std::pin::pin!(stream_listing_generic(remote_client, &timeline_dir_target)); while let Some(obj) = stream.next().await { let (key, _obj) = obj?; @@ -195,7 +200,7 @@ async fn check_timeline( let seg_name = key .get_path() .as_str() - .strip_prefix(&timeline_dir_target.prefix_in_bucket) + .strip_prefix(prefix_str) .expect("failed to extract segment name"); expected_segfiles.remove(seg_name); } diff --git a/storage_scrubber/src/tenant_snapshot.rs b/storage_scrubber/src/tenant_snapshot.rs index 1866e6ec80..fc3a973922 100644 --- a/storage_scrubber/src/tenant_snapshot.rs +++ b/storage_scrubber/src/tenant_snapshot.rs @@ -1,10 +1,11 @@ use std::collections::HashMap; use std::sync::Arc; -use crate::checks::{list_timeline_blobs, BlobDataParseResult, S3TimelineBlobData}; -use crate::metadata_stream::{stream_tenant_shards, stream_tenant_timelines}; +use crate::checks::{list_timeline_blobs_generic, BlobDataParseResult, RemoteTimelineBlobData}; +use crate::metadata_stream::{stream_tenant_shards, stream_tenant_timelines_generic}; use crate::{ - download_object_to_file, init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId, + download_object_to_file, init_remote, init_remote_generic, BucketConfig, NodeKind, RootTarget, + TenantShardTimelineId, }; use anyhow::Context; use async_stream::stream; @@ -15,6 +16,7 @@ use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::IndexPart; use pageserver_api::shard::TenantShardId; +use remote_storage::GenericRemoteStorage; use utils::generation::Generation; use utils::id::TenantId; @@ -215,11 +217,11 @@ impl SnapshotDownloader { } pub async fn download(&self) -> anyhow::Result<()> { - let (s3_client, target) = - init_remote(self.bucket_config.clone(), NodeKind::Pageserver).await?; + let (remote_client, target) = + init_remote_generic(self.bucket_config.clone(), NodeKind::Pageserver).await?; // Generate a stream of TenantShardId - let shards = stream_tenant_shards(&s3_client, &target, self.tenant_id).await?; + let shards = stream_tenant_shards(&remote_client, &target, self.tenant_id).await?; let shards: Vec = shards.try_collect().await?; // Only read from shards that have the highest count: avoids redundantly downloading @@ -237,18 +239,19 @@ impl SnapshotDownloader { for shard in shards.into_iter().filter(|s| s.shard_count == shard_count) { // Generate a stream of TenantTimelineId - let timelines = stream_tenant_timelines(&s3_client, &self.s3_root, shard).await?; + let timelines = stream_tenant_timelines_generic(&remote_client, &target, shard).await?; // Generate a stream of S3TimelineBlobData async fn load_timeline_index( - s3_client: &Client, + remote_client: &GenericRemoteStorage, target: &RootTarget, ttid: TenantShardTimelineId, - ) -> anyhow::Result<(TenantShardTimelineId, S3TimelineBlobData)> { - let data = list_timeline_blobs(s3_client, ttid, target).await?; + ) -> anyhow::Result<(TenantShardTimelineId, RemoteTimelineBlobData)> { + let data = list_timeline_blobs_generic(remote_client, ttid, target).await?; Ok((ttid, data)) } - let timelines = timelines.map_ok(|ttid| load_timeline_index(&s3_client, &target, ttid)); + let timelines = + timelines.map_ok(|ttid| load_timeline_index(&remote_client, &target, ttid)); let mut timelines = 
std::pin::pin!(timelines.try_buffered(8)); while let Some(i) = timelines.next().await { @@ -278,7 +281,7 @@ impl SnapshotDownloader { for (ttid, layers) in ancestor_layers.into_iter() { tracing::info!( - "Downloading {} layers from ancvestor timeline {ttid}...", + "Downloading {} layers from ancestor timeline {ttid}...", layers.len() ); diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index ba6fbc003a..9aa275d343 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4643,6 +4643,7 @@ class StorageScrubber: ] args = base_args + args + log.info(f"Invoking scrubber command {args} with env: {env}") (output_path, stdout, status_code) = subprocess_capture( self.log_dir, args, From 6949b45e1795816507f5025a474e15d718e07456 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 19 Aug 2024 17:44:10 +0200 Subject: [PATCH 1446/1571] Update aws -> infra for repo rename (#8755) See slack thread: https://neondb.slack.com/archives/C039YKBRZB4/p1722501766006179 --- .github/workflows/build_and_test.yml | 10 +++++----- docs/rfcs/033-storage-controller-drain-and-fill.md | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index ee6d3ba005..92fff4ffbc 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -985,10 +985,10 @@ jobs: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} run: | if [[ "$GITHUB_REF_NAME" == "main" ]]; then - gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false + gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false gh workflow --repo neondatabase/azure run deploy.yml -f dockerTag=${{needs.tag.outputs.build-tag}} elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \ + gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \ -f deployPgSniRouter=false \ -f deployProxy=false \ -f deployStorage=true \ @@ -998,14 +998,14 @@ jobs: -f dockerTag=${{needs.tag.outputs.build-tag}} \ -f deployPreprodRegion=true - gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \ + gh workflow --repo neondatabase/infra run deploy-prod.yml --ref main \ -f deployStorage=true \ -f deployStorageBroker=true \ -f deployStorageController=true \ -f branch=main \ -f dockerTag=${{needs.tag.outputs.build-tag}} elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then - gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \ + gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \ -f deployPgSniRouter=true \ -f deployProxy=true \ -f deployStorage=false \ @@ -1015,7 +1015,7 @@ jobs: -f dockerTag=${{needs.tag.outputs.build-tag}} \ -f deployPreprodRegion=true - gh workflow --repo neondatabase/aws run deploy-proxy-prod.yml --ref main \ + gh workflow --repo neondatabase/infra run deploy-proxy-prod.yml --ref main \ -f deployPgSniRouter=true \ -f deployProxy=true \ -f branch=main \ diff --git a/docs/rfcs/033-storage-controller-drain-and-fill.md b/docs/rfcs/033-storage-controller-drain-and-fill.md index 77c84cd2a5..733f7c0bd8 100644 --- a/docs/rfcs/033-storage-controller-drain-and-fill.md +++ b/docs/rfcs/033-storage-controller-drain-and-fill.md @@ -14,7 +14,7 @@ picked tenant (which 
requested on-demand activation) for around 30 seconds during the restart at 2024-04-03 16:37 UTC. Note that lots of shutdowns on loaded pageservers do not finish within the -[10 second systemd enforced timeout](https://github.com/neondatabase/aws/blob/0a5280b383e43c063d43cbf87fa026543f6d6ad4/.github/ansible/systemd/pageserver.service#L16). This means we are shutting down without flushing ephemeral layers +[10 second systemd enforced timeout](https://github.com/neondatabase/infra/blob/0a5280b383e43c063d43cbf87fa026543f6d6ad4/.github/ansible/systemd/pageserver.service#L16). This means we are shutting down without flushing ephemeral layers and have to reingest data in order to serve requests after restarting, potentially making first request latencies worse. This problem is not yet very acutely felt in storage controller managed pageservers since From 4b26783c94b582dad20efb49ca2ca842c6f944b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 19 Aug 2024 23:58:47 +0200 Subject: [PATCH 1447/1571] scrubber: remove _generic postfix and two unused functions (#8761) Removes the `_generic` postfix from the `GenericRemoteStorage` using APIs, as `remote_storage` is the "default" now, and add a `_s3` postfix to the remaining APIs using the S3 SDK (only in tenant snapshot). Also, remove two unused functions: `list_objects_with_retries` and `stream_tenants functions`. Part of https://github.com/neondatabase/neon/issues/7547 --- storage_scrubber/src/checks.rs | 14 ++--- storage_scrubber/src/find_large_objects.rs | 7 +-- storage_scrubber/src/garbage.rs | 26 ++++----- storage_scrubber/src/lib.rs | 51 +++------------- storage_scrubber/src/metadata_stream.rs | 58 +++---------------- .../src/pageserver_physical_gc.rs | 19 +++--- .../src/scan_pageserver_metadata.rs | 14 ++--- .../src/scan_safekeeper_metadata.rs | 9 ++- storage_scrubber/src/tenant_snapshot.rs | 17 +++--- 9 files changed, 67 insertions(+), 148 deletions(-) diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index 9063b3c197..b35838bcf7 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -10,8 +10,8 @@ use utils::generation::Generation; use utils::id::TimelineId; use crate::cloud_admin_api::BranchData; -use crate::metadata_stream::stream_listing_generic; -use crate::{download_object_with_retries_generic, RootTarget, TenantShardTimelineId}; +use crate::metadata_stream::stream_listing; +use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId}; use futures_util::StreamExt; use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path}; use pageserver::tenant::storage_layer::LayerName; @@ -320,17 +320,17 @@ pub(crate) fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generati } } -pub(crate) async fn list_timeline_blobs_generic( +pub(crate) async fn list_timeline_blobs( remote_client: &GenericRemoteStorage, id: TenantShardTimelineId, - s3_root: &RootTarget, + root_target: &RootTarget, ) -> anyhow::Result { let mut s3_layers = HashSet::new(); let mut errors = Vec::new(); let mut unknown_keys = Vec::new(); - let mut timeline_dir_target = s3_root.timeline_root(&id); + let mut timeline_dir_target = root_target.timeline_root(&id); timeline_dir_target.delimiter = String::new(); let mut index_part_keys: Vec = Vec::new(); @@ -341,7 +341,7 @@ pub(crate) async fn list_timeline_blobs_generic( .strip_prefix("/") .unwrap_or(&timeline_dir_target.prefix_in_bucket); - let mut stream = 
std::pin::pin!(stream_listing_generic(remote_client, &timeline_dir_target)); + let mut stream = std::pin::pin!(stream_listing(remote_client, &timeline_dir_target)); while let Some(obj) = stream.next().await { let (key, Some(obj)) = obj? else { panic!("ListingObject not specified"); @@ -421,7 +421,7 @@ pub(crate) async fn list_timeline_blobs_generic( if let Some(index_part_object_key) = index_part_object.as_ref() { let index_part_bytes = - download_object_with_retries_generic(remote_client, &index_part_object_key.key) + download_object_with_retries(remote_client, &index_part_object_key.key) .await .context("index_part.json download")?; diff --git a/storage_scrubber/src/find_large_objects.rs b/storage_scrubber/src/find_large_objects.rs index f5bb7e088a..88e36af560 100644 --- a/storage_scrubber/src/find_large_objects.rs +++ b/storage_scrubber/src/find_large_objects.rs @@ -6,7 +6,7 @@ use remote_storage::ListingMode; use serde::{Deserialize, Serialize}; use crate::{ - checks::parse_layer_object_name, init_remote_generic, metadata_stream::stream_tenants_generic, + checks::parse_layer_object_name, init_remote, metadata_stream::stream_tenants, stream_objects_with_retries, BucketConfig, NodeKind, }; @@ -50,9 +50,8 @@ pub async fn find_large_objects( ignore_deltas: bool, concurrency: usize, ) -> anyhow::Result { - let (remote_client, target) = - init_remote_generic(bucket_config.clone(), NodeKind::Pageserver).await?; - let tenants = pin!(stream_tenants_generic(&remote_client, &target)); + let (remote_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?; + let tenants = pin!(stream_tenants(&remote_client, &target)); let objects_stream = tenants.map_ok(|tenant_shard_id| { let mut tenant_root = target.tenant_root(&tenant_shard_id); diff --git a/storage_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs index d6a73bf366..3e22960f8d 100644 --- a/storage_scrubber/src/garbage.rs +++ b/storage_scrubber/src/garbage.rs @@ -19,8 +19,8 @@ use utils::id::TenantId; use crate::{ cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData}, - init_remote_generic, list_objects_with_retries_generic, - metadata_stream::{stream_tenant_timelines_generic, stream_tenants_generic}, + init_remote, list_objects_with_retries, + metadata_stream::{stream_tenant_timelines, stream_tenants}, BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth, }; @@ -153,7 +153,7 @@ async fn find_garbage_inner( node_kind: NodeKind, ) -> anyhow::Result { // Construct clients for S3 and for Console API - let (remote_client, target) = init_remote_generic(bucket_config.clone(), node_kind).await?; + let (remote_client, target) = init_remote(bucket_config.clone(), node_kind).await?; let cloud_admin_api_client = Arc::new(CloudAdminApiClient::new(console_config)); // Build a set of console-known tenants, for quickly eliminating known-active tenants without having @@ -179,7 +179,7 @@ async fn find_garbage_inner( // Enumerate Tenants in S3, and check if each one exists in Console tracing::info!("Finding all tenants in bucket {}...", bucket_config.bucket); - let tenants = stream_tenants_generic(&remote_client, &target); + let tenants = stream_tenants(&remote_client, &target); let tenants_checked = tenants.map_ok(|t| { let api_client = cloud_admin_api_client.clone(); let console_cache = console_cache.clone(); @@ -237,14 +237,13 @@ async fn find_garbage_inner( // Special case: If it's missing in console, check for known bugs that would enable us to conclusively // identify it as purge-able 
anyway if console_result.is_none() { - let timelines = - stream_tenant_timelines_generic(&remote_client, &target, tenant_shard_id) - .await? - .collect::>() - .await; + let timelines = stream_tenant_timelines(&remote_client, &target, tenant_shard_id) + .await? + .collect::>() + .await; if timelines.is_empty() { // No timelines, but a heatmap: the deletion bug where we deleted everything but heatmaps - let tenant_objects = list_objects_with_retries_generic( + let tenant_objects = list_objects_with_retries( &remote_client, ListingMode::WithDelimiter, &target.tenant_root(&tenant_shard_id), @@ -265,7 +264,7 @@ async fn find_garbage_inner( for timeline_r in timelines { let timeline = timeline_r?; - let timeline_objects = list_objects_with_retries_generic( + let timeline_objects = list_objects_with_retries( &remote_client, ListingMode::WithDelimiter, &target.timeline_root(&timeline), @@ -331,8 +330,7 @@ async fn find_garbage_inner( // Construct a stream of all timelines within active tenants let active_tenants = tokio_stream::iter(active_tenants.iter().map(Ok)); - let timelines = - active_tenants.map_ok(|t| stream_tenant_timelines_generic(&remote_client, &target, *t)); + let timelines = active_tenants.map_ok(|t| stream_tenant_timelines(&remote_client, &target, *t)); let timelines = timelines.try_buffer_unordered(S3_CONCURRENCY); let timelines = timelines.try_flatten(); @@ -507,7 +505,7 @@ pub async fn purge_garbage( ); let (remote_client, _target) = - init_remote_generic(garbage_list.bucket_config.clone(), garbage_list.node_kind).await?; + init_remote(garbage_list.bucket_config.clone(), garbage_list.node_kind).await?; assert_eq!( &garbage_list.bucket_config.bucket, diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index 3183bc3c64..112f052e07 100644 --- a/storage_scrubber/src/lib.rs +++ b/storage_scrubber/src/lib.rs @@ -15,7 +15,7 @@ use std::fmt::Display; use std::sync::Arc; use std::time::Duration; -use anyhow::{anyhow, Context}; +use anyhow::Context; use aws_config::retry::{RetryConfigBuilder, RetryMode}; use aws_sdk_s3::config::Region; use aws_sdk_s3::error::DisplayErrorContext; @@ -352,7 +352,7 @@ fn make_root_target( } } -async fn init_remote( +async fn init_remote_s3( bucket_config: BucketConfig, node_kind: NodeKind, ) -> anyhow::Result<(Arc, RootTarget)> { @@ -369,7 +369,7 @@ async fn init_remote( Ok((s3_client, s3_root)) } -async fn init_remote_generic( +async fn init_remote( bucket_config: BucketConfig, node_kind: NodeKind, ) -> anyhow::Result<(GenericRemoteStorage, RootTarget)> { @@ -394,45 +394,10 @@ async fn init_remote_generic( // We already pass the prefix to the remote client above let prefix_in_root_target = String::new(); - let s3_root = make_root_target(bucket_config.bucket, prefix_in_root_target, node_kind); + let root_target = make_root_target(bucket_config.bucket, prefix_in_root_target, node_kind); let client = GenericRemoteStorage::from_config(&storage_config).await?; - Ok((client, s3_root)) -} - -async fn list_objects_with_retries( - s3_client: &Client, - s3_target: &S3Target, - continuation_token: Option, -) -> anyhow::Result { - for trial in 0..MAX_RETRIES { - match s3_client - .list_objects_v2() - .bucket(&s3_target.bucket_name) - .prefix(&s3_target.prefix_in_bucket) - .delimiter(&s3_target.delimiter) - .set_continuation_token(continuation_token.clone()) - .send() - .await - { - Ok(response) => return Ok(response), - Err(e) => { - if trial == MAX_RETRIES - 1 { - return Err(e) - .with_context(|| format!("Failed to list objects {MAX_RETRIES} 
times")); - } - error!( - "list_objects_v2 query failed: bucket_name={}, prefix={}, delimiter={}, error={}", - s3_target.bucket_name, - s3_target.prefix_in_bucket, - s3_target.delimiter, - DisplayErrorContext(e), - ); - tokio::time::sleep(Duration::from_secs(1)).await; - } - } - } - Err(anyhow!("unreachable unless MAX_RETRIES==0")) + Ok((client, root_target)) } /// Listing possibly large amounts of keys in a streaming fashion. @@ -479,7 +444,7 @@ fn stream_objects_with_retries<'a>( /// If you want to list a bounded amount of prefixes or keys. For larger numbers of keys/prefixes, /// use [`stream_objects_with_retries`] instead. -async fn list_objects_with_retries_generic( +async fn list_objects_with_retries( remote_client: &GenericRemoteStorage, listing_mode: ListingMode, s3_target: &S3Target, @@ -516,7 +481,7 @@ async fn list_objects_with_retries_generic( panic!("MAX_RETRIES is not allowed to be 0"); } -async fn download_object_with_retries_generic( +async fn download_object_with_retries( remote_client: &GenericRemoteStorage, key: &RemotePath, ) -> anyhow::Result> { @@ -552,7 +517,7 @@ async fn download_object_with_retries_generic( anyhow::bail!("Failed to download objects with key {key} {MAX_RETRIES} times") } -async fn download_object_to_file( +async fn download_object_to_file_s3( s3_client: &Client, bucket_name: &str, key: &str, diff --git a/storage_scrubber/src/metadata_stream.rs b/storage_scrubber/src/metadata_stream.rs index eca774413a..10d77937f1 100644 --- a/storage_scrubber/src/metadata_stream.rs +++ b/storage_scrubber/src/metadata_stream.rs @@ -2,20 +2,19 @@ use std::str::FromStr; use anyhow::{anyhow, Context}; use async_stream::{stream, try_stream}; -use aws_sdk_s3::Client; use futures::StreamExt; use remote_storage::{GenericRemoteStorage, ListingMode, ListingObject, RemotePath}; use tokio_stream::Stream; use crate::{ - list_objects_with_retries, list_objects_with_retries_generic, stream_objects_with_retries, - RootTarget, S3Target, TenantShardTimelineId, + list_objects_with_retries, stream_objects_with_retries, RootTarget, S3Target, + TenantShardTimelineId, }; use pageserver_api::shard::TenantShardId; use utils::id::{TenantId, TimelineId}; /// Given a remote storage and a target, output a stream of TenantIds discovered via listing prefixes -pub fn stream_tenants_generic<'a>( +pub fn stream_tenants<'a>( remote_client: &'a GenericRemoteStorage, target: &'a RootTarget, ) -> impl Stream> + 'a { @@ -36,44 +35,6 @@ pub fn stream_tenants_generic<'a>( } } -/// Given an S3 bucket, output a stream of TenantIds discovered via ListObjectsv2 -pub fn stream_tenants<'a>( - s3_client: &'a Client, - target: &'a RootTarget, -) -> impl Stream> + 'a { - try_stream! { - let mut continuation_token = None; - let tenants_target = target.tenants_root(); - loop { - let fetch_response = - list_objects_with_retries(s3_client, &tenants_target, continuation_token.clone()).await?; - - let new_entry_ids = fetch_response - .common_prefixes() - .iter() - .filter_map(|prefix| prefix.prefix()) - .filter_map(|prefix| -> Option<&str> { - prefix - .strip_prefix(&tenants_target.prefix_in_bucket)? 
- .strip_suffix('/') - }).map(|entry_id_str| { - entry_id_str - .parse() - .with_context(|| format!("Incorrect entry id str: {entry_id_str}")) - }); - - for i in new_entry_ids { - yield i?; - } - - match fetch_response.next_continuation_token { - Some(new_token) => continuation_token = Some(new_token), - None => break, - } - } - } -} - pub async fn stream_tenant_shards<'a>( remote_client: &'a GenericRemoteStorage, target: &'a RootTarget, @@ -85,12 +46,9 @@ pub async fn stream_tenant_shards<'a>( let prefix_str = &strip_prefix.strip_prefix("/").unwrap_or(&strip_prefix); tracing::info!("Listing shards in {}", shards_target.prefix_in_bucket); - let listing = list_objects_with_retries_generic( - remote_client, - ListingMode::WithDelimiter, - &shards_target, - ) - .await?; + let listing = + list_objects_with_retries(remote_client, ListingMode::WithDelimiter, &shards_target) + .await?; let tenant_shard_ids = listing .prefixes @@ -118,7 +76,7 @@ pub async fn stream_tenant_shards<'a>( /// Given a `TenantShardId`, output a stream of the timelines within that tenant, discovered /// using a listing. The listing is done before the stream is built, so that this /// function can be used to generate concurrency on a stream using buffer_unordered. -pub async fn stream_tenant_timelines_generic<'a>( +pub async fn stream_tenant_timelines<'a>( remote_client: &'a GenericRemoteStorage, target: &'a RootTarget, tenant: TenantShardId, @@ -173,7 +131,7 @@ pub async fn stream_tenant_timelines_generic<'a>( }) } -pub(crate) fn stream_listing_generic<'a>( +pub(crate) fn stream_listing<'a>( remote_client: &'a GenericRemoteStorage, target: &'a S3Target, ) -> impl Stream)>> + 'a { diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index 6828081128..88681e38c2 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -2,9 +2,9 @@ use std::collections::{BTreeMap, BTreeSet, HashMap}; use std::sync::Arc; use std::time::Duration; -use crate::checks::{list_timeline_blobs_generic, BlobDataParseResult}; -use crate::metadata_stream::{stream_tenant_timelines_generic, stream_tenants_generic}; -use crate::{init_remote_generic, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId}; +use crate::checks::{list_timeline_blobs, BlobDataParseResult}; +use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; +use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId}; use futures_util::{StreamExt, TryStreamExt}; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path}; @@ -352,7 +352,7 @@ async fn gc_ancestor( summary: &mut GcSummary, ) -> anyhow::Result<()> { // Scan timelines in the ancestor - let timelines = stream_tenant_timelines_generic(remote_client, root_target, ancestor).await?; + let timelines = stream_tenant_timelines(remote_client, root_target, ancestor).await?; let mut timelines = std::pin::pin!(timelines); // Build a list of keys to retain @@ -360,7 +360,7 @@ async fn gc_ancestor( while let Some(ttid) = timelines.next().await { let ttid = ttid?; - let data = list_timeline_blobs_generic(remote_client, ttid, root_target).await?; + let data = list_timeline_blobs(remote_client, ttid, root_target).await?; let s3_layers = match data.blob_data { BlobDataParseResult::Parsed { @@ -456,11 +456,10 @@ pub async fn pageserver_physical_gc( min_age: Duration, mode: GcMode, ) -> 
anyhow::Result { - let (remote_client, target) = - init_remote_generic(bucket_config.clone(), NodeKind::Pageserver).await?; + let (remote_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?; let tenants = if tenant_shard_ids.is_empty() { - futures::future::Either::Left(stream_tenants_generic(&remote_client, &target)) + futures::future::Either::Left(stream_tenants(&remote_client, &target)) } else { futures::future::Either::Right(futures::stream::iter(tenant_shard_ids.into_iter().map(Ok))) }; @@ -473,7 +472,7 @@ pub async fn pageserver_physical_gc( let accumulator = Arc::new(std::sync::Mutex::new(TenantRefAccumulator::default())); // Generate a stream of TenantTimelineId - let timelines = tenants.map_ok(|t| stream_tenant_timelines_generic(&remote_client, &target, t)); + let timelines = tenants.map_ok(|t| stream_tenant_timelines(&remote_client, &target, t)); let timelines = timelines.try_buffered(CONCURRENCY); let timelines = timelines.try_flatten(); @@ -487,7 +486,7 @@ pub async fn pageserver_physical_gc( accumulator: &Arc>, ) -> anyhow::Result { let mut summary = GcSummary::default(); - let data = list_timeline_blobs_generic(remote_client, ttid, target).await?; + let data = list_timeline_blobs(remote_client, ttid, target).await?; let (index_part, latest_gen, candidates) = match &data.blob_data { BlobDataParseResult::Parsed { diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs index e89e97ccb6..151ef27672 100644 --- a/storage_scrubber/src/scan_pageserver_metadata.rs +++ b/storage_scrubber/src/scan_pageserver_metadata.rs @@ -1,11 +1,11 @@ use std::collections::{HashMap, HashSet}; use crate::checks::{ - branch_cleanup_and_check_errors, list_timeline_blobs_generic, BlobDataParseResult, + branch_cleanup_and_check_errors, list_timeline_blobs, BlobDataParseResult, RemoteTimelineBlobData, TenantObjectListing, TimelineAnalysis, }; -use crate::metadata_stream::{stream_tenant_timelines_generic, stream_tenants_generic}; -use crate::{init_remote_generic, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId}; +use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; +use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId}; use futures_util::{StreamExt, TryStreamExt}; use pageserver::tenant::remote_timeline_client::remote_layer_path; use pageserver_api::controller_api::MetadataHealthUpdateRequest; @@ -120,10 +120,10 @@ pub async fn scan_pageserver_metadata( bucket_config: BucketConfig, tenant_ids: Vec, ) -> anyhow::Result { - let (remote_client, target) = init_remote_generic(bucket_config, NodeKind::Pageserver).await?; + let (remote_client, target) = init_remote(bucket_config, NodeKind::Pageserver).await?; let tenants = if tenant_ids.is_empty() { - futures::future::Either::Left(stream_tenants_generic(&remote_client, &target)) + futures::future::Either::Left(stream_tenants(&remote_client, &target)) } else { futures::future::Either::Right(futures::stream::iter(tenant_ids.into_iter().map(Ok))) }; @@ -133,7 +133,7 @@ pub async fn scan_pageserver_metadata( const CONCURRENCY: usize = 32; // Generate a stream of TenantTimelineId - let timelines = tenants.map_ok(|t| stream_tenant_timelines_generic(&remote_client, &target, t)); + let timelines = tenants.map_ok(|t| stream_tenant_timelines(&remote_client, &target, t)); let timelines = timelines.try_buffered(CONCURRENCY); let timelines = timelines.try_flatten(); @@ -143,7 +143,7 @@ pub async fn scan_pageserver_metadata( target: 
&RootTarget, ttid: TenantShardTimelineId, ) -> anyhow::Result<(TenantShardTimelineId, RemoteTimelineBlobData)> { - let data = list_timeline_blobs_generic(remote_client, ttid, target).await?; + let data = list_timeline_blobs(remote_client, ttid, target).await?; Ok((ttid, data)) } let timelines = timelines.map_ok(|ttid| report_on_timeline(&remote_client, &target, ttid)); diff --git a/storage_scrubber/src/scan_safekeeper_metadata.rs b/storage_scrubber/src/scan_safekeeper_metadata.rs index f20fa27d13..1a9f3d0ef5 100644 --- a/storage_scrubber/src/scan_safekeeper_metadata.rs +++ b/storage_scrubber/src/scan_safekeeper_metadata.rs @@ -14,9 +14,8 @@ use utils::{ }; use crate::{ - cloud_admin_api::CloudAdminApiClient, init_remote_generic, - metadata_stream::stream_listing_generic, BucketConfig, ConsoleConfig, NodeKind, RootTarget, - TenantShardTimelineId, + cloud_admin_api::CloudAdminApiClient, init_remote, metadata_stream::stream_listing, + BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId, }; /// Generally we should ask safekeepers, but so far we use everywhere default 16MB. @@ -107,7 +106,7 @@ pub async fn scan_safekeeper_metadata( let timelines = client.query(&query, &[]).await?; info!("loaded {} timelines", timelines.len()); - let (remote_client, target) = init_remote_generic(bucket_config, NodeKind::Safekeeper).await?; + let (remote_client, target) = init_remote(bucket_config, NodeKind::Safekeeper).await?; let console_config = ConsoleConfig::from_env()?; let cloud_admin_api_client = CloudAdminApiClient::new(console_config); @@ -193,7 +192,7 @@ async fn check_timeline( .strip_prefix("/") .unwrap_or(&timeline_dir_target.prefix_in_bucket); - let mut stream = std::pin::pin!(stream_listing_generic(remote_client, &timeline_dir_target)); + let mut stream = std::pin::pin!(stream_listing(remote_client, &timeline_dir_target)); while let Some(obj) = stream.next().await { let (key, _obj) = obj?; diff --git a/storage_scrubber/src/tenant_snapshot.rs b/storage_scrubber/src/tenant_snapshot.rs index fc3a973922..bb4079b5f4 100644 --- a/storage_scrubber/src/tenant_snapshot.rs +++ b/storage_scrubber/src/tenant_snapshot.rs @@ -1,10 +1,10 @@ use std::collections::HashMap; use std::sync::Arc; -use crate::checks::{list_timeline_blobs_generic, BlobDataParseResult, RemoteTimelineBlobData}; -use crate::metadata_stream::{stream_tenant_shards, stream_tenant_timelines_generic}; +use crate::checks::{list_timeline_blobs, BlobDataParseResult, RemoteTimelineBlobData}; +use crate::metadata_stream::{stream_tenant_shards, stream_tenant_timelines}; use crate::{ - download_object_to_file, init_remote, init_remote_generic, BucketConfig, NodeKind, RootTarget, + download_object_to_file_s3, init_remote, init_remote_s3, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId, }; use anyhow::Context; @@ -36,7 +36,8 @@ impl SnapshotDownloader { output_path: Utf8PathBuf, concurrency: usize, ) -> anyhow::Result { - let (s3_client, s3_root) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?; + let (s3_client, s3_root) = + init_remote_s3(bucket_config.clone(), NodeKind::Pageserver).await?; Ok(Self { s3_client, s3_root, @@ -93,7 +94,7 @@ impl SnapshotDownloader { let Some(version) = versions.versions.as_ref().and_then(|v| v.first()) else { return Err(anyhow::anyhow!("No versions found for {remote_layer_path}")); }; - download_object_to_file( + download_object_to_file_s3( &self.s3_client, &self.bucket_config.bucket, &remote_layer_path, @@ -218,7 +219,7 @@ impl SnapshotDownloader { pub async fn 
download(&self) -> anyhow::Result<()> { let (remote_client, target) = - init_remote_generic(self.bucket_config.clone(), NodeKind::Pageserver).await?; + init_remote(self.bucket_config.clone(), NodeKind::Pageserver).await?; // Generate a stream of TenantShardId let shards = stream_tenant_shards(&remote_client, &target, self.tenant_id).await?; @@ -239,7 +240,7 @@ impl SnapshotDownloader { for shard in shards.into_iter().filter(|s| s.shard_count == shard_count) { // Generate a stream of TenantTimelineId - let timelines = stream_tenant_timelines_generic(&remote_client, &target, shard).await?; + let timelines = stream_tenant_timelines(&remote_client, &target, shard).await?; // Generate a stream of S3TimelineBlobData async fn load_timeline_index( @@ -247,7 +248,7 @@ impl SnapshotDownloader { target: &RootTarget, ttid: TenantShardTimelineId, ) -> anyhow::Result<(TenantShardTimelineId, RemoteTimelineBlobData)> { - let data = list_timeline_blobs_generic(remote_client, ttid, target).await?; + let data = list_timeline_blobs(remote_client, ttid, target).await?; Ok((ttid, data)) } let timelines = From 4c5a0fdc75c16b007ed9c042d41ec37bae1c0f75 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 7 Aug 2024 19:26:06 +0300 Subject: [PATCH 1448/1571] safekeeper: check for non-consecutive writes in safekeeper.rs wal_storage.rs already checks this, but since this is a quite legit scenario check it at safekeeper.rs (consensus level) as well. ref https://github.com/neondatabase/neon/issues/8212 --- safekeeper/src/safekeeper.rs | 27 +++++++++++++++++++ safekeeper/src/wal_storage.rs | 6 +++++ .../tests/walproposer_sim/safekeeper_disk.rs | 4 +++ 3 files changed, 37 insertions(+) diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 0814d9ba67..9d103887ae 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -875,6 +875,29 @@ where return Ok(Some(AcceptorProposerMessage::AppendResponse(resp))); } + // Disallow any non-sequential writes, which can result in gaps or + // overwrites. If we need to move the pointer, ProposerElected message + // should have truncated WAL first accordingly. Note that the first + // condition (WAL rewrite) is quite expected in real world; it happens + // when walproposer reconnects to safekeeper and writes some more data + // while first connection still gets some packets later. It might be + // better to not log this as error! above. + let write_lsn = self.wal_store.write_lsn(); + if write_lsn > msg.h.begin_lsn { + bail!( + "append request rewrites WAL written before, write_lsn={}, msg lsn={}", + write_lsn, + msg.h.begin_lsn + ); + } + if write_lsn < msg.h.begin_lsn && write_lsn != Lsn(0) { + bail!( + "append request creates gap in written WAL, write_lsn={}, msg lsn={}", + write_lsn, + msg.h.begin_lsn, + ); + } + // Now we know that we are in the same term as the proposer, // processing the message. @@ -1005,6 +1028,10 @@ mod tests { #[async_trait::async_trait] impl wal_storage::Storage for DummyWalStore { + fn write_lsn(&self) -> Lsn { + self.lsn + } + fn flush_lsn(&self) -> Lsn { self.lsn } diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index ded8571a3e..5136bdb9d8 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -37,6 +37,8 @@ use utils::{id::TenantTimelineId, lsn::Lsn}; #[async_trait::async_trait] pub trait Storage { + // Last written LSN. + fn write_lsn(&self) -> Lsn; /// LSN of last durably stored WAL record. 
fn flush_lsn(&self) -> Lsn; @@ -327,6 +329,10 @@ impl PhysicalStorage { #[async_trait::async_trait] impl Storage for PhysicalStorage { + // Last written LSN. + fn write_lsn(&self) -> Lsn { + self.write_lsn + } /// flush_lsn returns LSN of last durably stored WAL record. fn flush_lsn(&self) -> Lsn { self.flush_record_lsn diff --git a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs index c2db9de78a..be56e86562 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs @@ -177,6 +177,10 @@ impl DiskWALStorage { #[async_trait::async_trait] impl wal_storage::Storage for DiskWALStorage { + // Last written LSN. + fn write_lsn(&self) -> Lsn { + self.write_lsn + } /// LSN of last durably stored WAL record. fn flush_lsn(&self) -> Lsn { self.flush_record_lsn From ef57e73fbf4ab4972d07e598d0b1ab3139a5abbf Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 20 Aug 2024 10:26:44 +0200 Subject: [PATCH 1449/1571] task_mgr::spawn: require a `TenantId` (#8462) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit … to dis-incentivize global tasks via task_mgr in the future (As of https://github.com/neondatabase/neon/pull/8339 all remaining task_mgr usage is tenant or timeline scoped.) --- pageserver/src/task_mgr.rs | 15 +++++---------- pageserver/src/tenant.rs | 2 +- pageserver/src/tenant/remote_timeline_client.rs | 2 +- pageserver/src/tenant/tasks.rs | 6 +++--- pageserver/src/tenant/timeline.rs | 8 ++++---- pageserver/src/tenant/timeline/delete.rs | 2 +- pageserver/src/tenant/timeline/eviction_task.rs | 2 +- 7 files changed, 16 insertions(+), 21 deletions(-) diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 5cd78874c1..ed9e001fd2 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -393,7 +393,7 @@ struct PageServerTask { /// Tasks may optionally be launched for a particular tenant/timeline, enabling /// later cancelling tasks for that tenant/timeline in [`shutdown_tasks`] - tenant_shard_id: Option, + tenant_shard_id: TenantShardId, timeline_id: Option, mutable: Mutex, @@ -405,7 +405,7 @@ struct PageServerTask { pub fn spawn( runtime: &tokio::runtime::Handle, kind: TaskKind, - tenant_shard_id: Option, + tenant_shard_id: TenantShardId, timeline_id: Option, name: &str, future: F, @@ -550,7 +550,7 @@ pub async fn shutdown_tasks( let tasks = TASKS.lock().unwrap(); for task in tasks.values() { if (kind.is_none() || Some(task.kind) == kind) - && (tenant_shard_id.is_none() || task.tenant_shard_id == tenant_shard_id) + && (tenant_shard_id.is_none() || Some(task.tenant_shard_id) == tenant_shard_id) && (timeline_id.is_none() || task.timeline_id == timeline_id) { task.cancel.cancel(); @@ -573,13 +573,8 @@ pub async fn shutdown_tasks( }; if let Some(mut join_handle) = join_handle { if log_all { - if tenant_shard_id.is_none() { - // there are quite few of these - info!(name = task.name, kind = ?task_kind, "stopping global task"); - } else { - // warn to catch these in tests; there shouldn't be any - warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over"); - } + // warn to catch these in tests; there shouldn't be any + warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over"); } if tokio::time::timeout(std::time::Duration::from_secs(1), &mut join_handle) .await diff 
--git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 8ab8d08ce1..65a7504b74 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -798,7 +798,7 @@ impl Tenant { task_mgr::spawn( &tokio::runtime::Handle::current(), TaskKind::Attach, - Some(tenant_shard_id), + tenant_shard_id, None, "attach tenant", async move { diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index b4d7ad1e97..71b766e4c7 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1728,7 +1728,7 @@ impl RemoteTimelineClient { task_mgr::spawn( &self.runtime, TaskKind::RemoteUploadTask, - Some(self.tenant_shard_id), + self.tenant_shard_id, Some(self.timeline_id), "remote upload", async move { diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index dbcd704b4e..3972685a8e 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -98,7 +98,7 @@ pub fn start_background_loops( task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::Compaction, - Some(tenant_shard_id), + tenant_shard_id, None, &format!("compactor for tenant {tenant_shard_id}"), { @@ -121,7 +121,7 @@ pub fn start_background_loops( task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::GarbageCollector, - Some(tenant_shard_id), + tenant_shard_id, None, &format!("garbage collector for tenant {tenant_shard_id}"), { @@ -144,7 +144,7 @@ pub fn start_background_loops( task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::IngestHousekeeping, - Some(tenant_shard_id), + tenant_shard_id, None, &format!("ingest housekeeping for tenant {tenant_shard_id}"), { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 26dc87c373..9732cf8b50 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2281,7 +2281,7 @@ impl Timeline { task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::LayerFlushTask, - Some(self.tenant_shard_id), + self.tenant_shard_id, Some(self.timeline_id), "layer flush task", async move { @@ -2635,7 +2635,7 @@ impl Timeline { task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::InitialLogicalSizeCalculation, - Some(self.tenant_shard_id), + self.tenant_shard_id, Some(self.timeline_id), "initial size calculation", // NB: don't log errors here, task_mgr will do that. 
@@ -2803,7 +2803,7 @@ impl Timeline { task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::OndemandLogicalSizeCalculation, - Some(self.tenant_shard_id), + self.tenant_shard_id, Some(self.timeline_id), "ondemand logical size calculation", async move { @@ -5162,7 +5162,7 @@ impl Timeline { let task_id = task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::DownloadAllRemoteLayers, - Some(self.tenant_shard_id), + self.tenant_shard_id, Some(self.timeline_id), "download all remote layers task", async move { diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index b03dbb092e..dc4118bb4a 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -395,7 +395,7 @@ impl DeleteTimelineFlow { task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), TaskKind::TimelineDeletionWorker, - Some(tenant_shard_id), + tenant_shard_id, Some(timeline_id), "timeline_delete", async move { diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index eaa9c0ff62..2f6cb4d73a 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -60,7 +60,7 @@ impl Timeline { task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::Eviction, - Some(self.tenant_shard_id), + self.tenant_shard_id, Some(self.timeline_id), &format!( "layer eviction for {}/{}", From c96593b473a22e76ce6dae912177128c3ec21867 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 20 Aug 2024 10:46:58 +0100 Subject: [PATCH 1450/1571] Make Postgres 16 default version (#8745) ## Problem The default Postgres version is set to 15 in code, while we use 16 in most of the other places (and Postgres 17 is coming) ## Summary of changes - Run `benchmarks` job with Postgres 16 (instead of Postgres 14) - Set `DEFAULT_PG_VERSION` to 16 in all places - Remove deprecated `--pg-version` pytest argument - Update `test_metadata_bincode_serde_ensure_roundtrip` for Postgres 16 --- .github/actions/run-python-test-set/action.yml | 2 +- .github/workflows/build_and_test.yml | 1 + README.md | 2 +- control_plane/src/bin/neon_local.rs | 2 +- control_plane/src/local_env.rs | 2 +- pageserver/src/lib.rs | 2 +- pageserver/src/tenant/metadata.rs | 4 ++-- scripts/ps_ec2_setup_instance_store | 2 +- test_runner/README.md | 3 +-- test_runner/fixtures/pg_version.py | 18 +----------------- test_runner/performance/README.md | 2 +- test_runner/performance/pageserver/README.md | 2 +- .../interactive/test_many_small_tenants.py | 2 +- test_runner/regress/test_compatibility.py | 4 ++-- 14 files changed, 16 insertions(+), 32 deletions(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 4ccf190c6a..814067fb8e 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -43,7 +43,7 @@ inputs: pg_version: description: 'Postgres version to use for tests' required: false - default: 'v14' + default: 'v16' benchmark_durations: description: 'benchmark durations JSON' required: false diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 92fff4ffbc..715f1af656 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -280,6 +280,7 @@ jobs: save_perf_report: ${{ github.ref_name == 'main' }} extra_params: --splits 5 --group ${{ matrix.pytest_split_group }} 
benchmark_durations: ${{ needs.get-benchmarks-durations.outputs.json }} + pg_version: v16 env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" diff --git a/README.md b/README.md index ea0a289502..f01442da5d 100644 --- a/README.md +++ b/README.md @@ -262,7 +262,7 @@ By default, this runs both debug and release modes, and all supported postgres v testing locally, it is convenient to run just one set of permutations, like this: ```sh -DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest +DEFAULT_PG_VERSION=16 BUILD_TYPE=release ./scripts/pytest ``` ## Flamegraphs diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index edd88dc71c..1d66532d49 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -54,7 +54,7 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1); const DEFAULT_BRANCH_NAME: &str = "main"; project_git_version!(GIT_VERSION); -const DEFAULT_PG_VERSION: &str = "15"; +const DEFAULT_PG_VERSION: &str = "16"; const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/"; diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 807519c88d..74caba2b56 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -27,7 +27,7 @@ use crate::pageserver::PageServerNode; use crate::pageserver::PAGESERVER_REMOTE_STORAGE_DIR; use crate::safekeeper::SafekeeperNode; -pub const DEFAULT_PG_VERSION: u32 = 15; +pub const DEFAULT_PG_VERSION: u32 = 16; // // This data structures represents neon_local CLI config diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 5aee13cfc6..5829a1c188 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -49,7 +49,7 @@ use tracing::{info, info_span}; /// backwards-compatible changes to the metadata format. 
pub const STORAGE_FORMAT_VERSION: u16 = 3; -pub const DEFAULT_PG_VERSION: u32 = 15; +pub const DEFAULT_PG_VERSION: u32 = 16; // Magic constants used to identify different kinds of files pub const IMAGE_FILE_MAGIC: u16 = 0x5A60; diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 6073abc8c3..190316df42 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -565,7 +565,7 @@ mod tests { ); let expected_bytes = vec![ /* TimelineMetadataHeader */ - 4, 37, 101, 34, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2) + 74, 104, 158, 105, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2) /* TimelineMetadataBodyV2 */ 0, 0, 0, 0, 0, 0, 2, 0, // disk_consistent_lsn (8 bytes) 1, 0, 0, 0, 0, 0, 0, 1, 0, // prev_record_lsn (9 bytes) @@ -574,7 +574,7 @@ mod tests { 0, 0, 0, 0, 0, 0, 0, 0, // ancestor_lsn (8 bytes) 0, 0, 0, 0, 0, 0, 0, 0, // latest_gc_cutoff_lsn (8 bytes) 0, 0, 0, 0, 0, 0, 0, 0, // initdb_lsn (8 bytes) - 0, 0, 0, 15, // pg_version (4 bytes) + 0, 0, 0, 16, // pg_version (4 bytes) /* padding bytes */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, diff --git a/scripts/ps_ec2_setup_instance_store b/scripts/ps_ec2_setup_instance_store index 1f88f252eb..7c383e322f 100755 --- a/scripts/ps_ec2_setup_instance_store +++ b/scripts/ps_ec2_setup_instance_store @@ -44,7 +44,7 @@ run the following commands from the top of the neon.git checkout # test suite run export TEST_OUTPUT="$TEST_OUTPUT" - DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest test_runner/performance/test_latency.py + DEFAULT_PG_VERSION=16 BUILD_TYPE=release ./scripts/pytest test_runner/performance/test_latency.py # for interactive use export NEON_REPO_DIR="$NEON_REPO_DIR" diff --git a/test_runner/README.md b/test_runner/README.md index e2f26a19ce..73aa29d4bb 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -71,8 +71,7 @@ a subdirectory for each version with naming convention `v{PG_VERSION}/`. Inside that dir, a `bin/postgres` binary should be present. `DEFAULT_PG_VERSION`: The version of Postgres to use, This is used to construct full path to the postgres binaries. -Format is 2-digit major version nubmer, i.e. `DEFAULT_PG_VERSION="14"`. Alternatively, -you can use `--pg-version` argument. +Format is 2-digit major version nubmer, i.e. `DEFAULT_PG_VERSION=16` `TEST_OUTPUT`: Set the directory where test state and test output files should go. `TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests. diff --git a/test_runner/fixtures/pg_version.py b/test_runner/fixtures/pg_version.py index 941889a2f5..e12c8e5f4a 100644 --- a/test_runner/fixtures/pg_version.py +++ b/test_runner/fixtures/pg_version.py @@ -3,8 +3,6 @@ import os from typing import Optional import pytest -from _pytest.config import Config -from _pytest.config.argparsing import Parser """ This fixture is used to determine which version of Postgres to use for tests. 
@@ -52,7 +50,7 @@ class PgVersion(str, enum.Enum): return None -DEFAULT_VERSION: PgVersion = PgVersion.V15 +DEFAULT_VERSION: PgVersion = PgVersion.V16 def skip_on_postgres(version: PgVersion, reason: str): @@ -69,22 +67,8 @@ def xfail_on_postgres(version: PgVersion, reason: str): ) -def pytest_addoption(parser: Parser): - parser.addoption( - "--pg-version", - action="store", - type=PgVersion, - help="DEPRECATED: Postgres version to use for tests", - ) - - def run_only_on_default_postgres(reason: str): return pytest.mark.skipif( PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION)) is not DEFAULT_VERSION, reason=reason, ) - - -def pytest_configure(config: Config): - if config.getoption("--pg-version"): - raise Exception("--pg-version is deprecated, use DEFAULT_PG_VERSION env var instead") diff --git a/test_runner/performance/README.md b/test_runner/performance/README.md index 7ad65821d4..70d75a6dcf 100644 --- a/test_runner/performance/README.md +++ b/test_runner/performance/README.md @@ -7,7 +7,7 @@ easier to see if you have compile errors without scrolling up. You may also need to run `./scripts/pysync`. Then run the tests -`DEFAULT_PG_VERSION=15 NEON_BIN=./target/release poetry run pytest test_runner/performance` +`DEFAULT_PG_VERSION=16 NEON_BIN=./target/release poetry run pytest test_runner/performance` Some handy pytest flags for local development: - `-x` tells pytest to stop on first error diff --git a/test_runner/performance/pageserver/README.md b/test_runner/performance/pageserver/README.md index fdd09cd946..56ffad9963 100644 --- a/test_runner/performance/pageserver/README.md +++ b/test_runner/performance/pageserver/README.md @@ -11,6 +11,6 @@ It supports mounting snapshots using overlayfs, which improves iteration time. Here's a full command line. 
``` -RUST_BACKTRACE=1 NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS=1 DEFAULT_PG_VERSION=15 BUILD_TYPE=release \ +RUST_BACKTRACE=1 NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS=1 DEFAULT_PG_VERSION=16 BUILD_TYPE=release \ ./scripts/pytest test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py ```` diff --git a/test_runner/performance/pageserver/interactive/test_many_small_tenants.py b/test_runner/performance/pageserver/interactive/test_many_small_tenants.py index 33848b06d3..8d781c1609 100644 --- a/test_runner/performance/pageserver/interactive/test_many_small_tenants.py +++ b/test_runner/performance/pageserver/interactive/test_many_small_tenants.py @@ -14,7 +14,7 @@ from performance.pageserver.util import ensure_pageserver_ready_for_benchmarking """ Usage: -DEFAULT_PG_VERSION=15 BUILD_TYPE=debug NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS=1 INTERACTIVE=true \ +DEFAULT_PG_VERSION=16 BUILD_TYPE=debug NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS=1 INTERACTIVE=true \ ./scripts/pytest --timeout 0 test_runner/performance/pageserver/interactive/test_many_small_tenants.py """ diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index afa5f6873c..30ff40b7df 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -39,7 +39,7 @@ from fixtures.workload import Workload # # How to run `test_backward_compatibility` locally: # -# export DEFAULT_PG_VERSION=15 +# export DEFAULT_PG_VERSION=16 # export BUILD_TYPE=release # export CHECK_ONDISK_DATA_COMPATIBILITY=true # export COMPATIBILITY_SNAPSHOT_DIR=test_output/compatibility_snapshot_pgv${DEFAULT_PG_VERSION} @@ -61,7 +61,7 @@ from fixtures.workload import Workload # # How to run `test_forward_compatibility` locally: # -# export DEFAULT_PG_VERSION=15 +# export DEFAULT_PG_VERSION=16 # export BUILD_TYPE=release # export CHECK_ONDISK_DATA_COMPATIBILITY=true # export COMPATIBILITY_NEON_BIN=neon_previous/target/${BUILD_TYPE} From 02a28c01ca1083e4fb2fc28b2db761220c161ff7 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 20 Aug 2024 12:34:53 +0100 Subject: [PATCH 1451/1571] Revert "safekeeper: check for non-consecutive writes in safekeeper.rs" (#8771) Reverts neondatabase/neon#8640 This broke `test_last_log_term_switch` via a merge race of some kind. --- safekeeper/src/safekeeper.rs | 27 ------------------- safekeeper/src/wal_storage.rs | 6 ----- .../tests/walproposer_sim/safekeeper_disk.rs | 4 --- 3 files changed, 37 deletions(-) diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 9d103887ae..0814d9ba67 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -875,29 +875,6 @@ where return Ok(Some(AcceptorProposerMessage::AppendResponse(resp))); } - // Disallow any non-sequential writes, which can result in gaps or - // overwrites. If we need to move the pointer, ProposerElected message - // should have truncated WAL first accordingly. Note that the first - // condition (WAL rewrite) is quite expected in real world; it happens - // when walproposer reconnects to safekeeper and writes some more data - // while first connection still gets some packets later. It might be - // better to not log this as error! above. 
- let write_lsn = self.wal_store.write_lsn(); - if write_lsn > msg.h.begin_lsn { - bail!( - "append request rewrites WAL written before, write_lsn={}, msg lsn={}", - write_lsn, - msg.h.begin_lsn - ); - } - if write_lsn < msg.h.begin_lsn && write_lsn != Lsn(0) { - bail!( - "append request creates gap in written WAL, write_lsn={}, msg lsn={}", - write_lsn, - msg.h.begin_lsn, - ); - } - // Now we know that we are in the same term as the proposer, // processing the message. @@ -1028,10 +1005,6 @@ mod tests { #[async_trait::async_trait] impl wal_storage::Storage for DummyWalStore { - fn write_lsn(&self) -> Lsn { - self.lsn - } - fn flush_lsn(&self) -> Lsn { self.lsn } diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 5136bdb9d8..ded8571a3e 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -37,8 +37,6 @@ use utils::{id::TenantTimelineId, lsn::Lsn}; #[async_trait::async_trait] pub trait Storage { - // Last written LSN. - fn write_lsn(&self) -> Lsn; /// LSN of last durably stored WAL record. fn flush_lsn(&self) -> Lsn; @@ -329,10 +327,6 @@ impl PhysicalStorage { #[async_trait::async_trait] impl Storage for PhysicalStorage { - // Last written LSN. - fn write_lsn(&self) -> Lsn { - self.write_lsn - } /// flush_lsn returns LSN of last durably stored WAL record. fn flush_lsn(&self) -> Lsn { self.flush_record_lsn diff --git a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs index be56e86562..c2db9de78a 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs @@ -177,10 +177,6 @@ impl DiskWALStorage { #[async_trait::async_trait] impl wal_storage::Storage for DiskWALStorage { - // Last written LSN. - fn write_lsn(&self) -> Lsn { - self.write_lsn - } /// LSN of last durably stored WAL record. fn flush_lsn(&self) -> Lsn { self.flush_record_lsn From 1c96957e85700eaa0333fb0230f5135e7a982d1e Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 20 Aug 2024 14:00:36 +0100 Subject: [PATCH 1452/1571] storcon: run db migrations after step down sequence (#8756) ## Problem Previously, we would run db migrations before doing the step-down sequence. This meant that the current leader would have to deal with the schema changes and that's generally not safe. ## Summary of changes Push the step-down procedure earlier in start-up and do db migrations right after it (but before we load-up the in-memory state from the db). Epic: https://github.com/neondatabase/cloud/issues/14701 --- control_plane/src/storage_controller.rs | 3 +- storage_controller/src/leadership.rs | 136 ++++++++++++++++++++++++ storage_controller/src/lib.rs | 1 + storage_controller/src/main.rs | 24 +---- storage_controller/src/metrics.rs | 2 + storage_controller/src/persistence.rs | 18 ++++ storage_controller/src/service.rs | 102 ++++-------------- 7 files changed, 180 insertions(+), 106 deletions(-) create mode 100644 storage_controller/src/leadership.rs diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 2c077595a1..f6539ad5b0 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -217,7 +217,7 @@ impl StorageController { Ok(exitcode.success()) } - /// Create our database if it doesn't exist, and run migrations. + /// Create our database if it doesn't exist /// /// This function is equivalent to the `diesel setup` command in the diesel CLI. 
We implement /// the same steps by hand to avoid imposing a dependency on installing diesel-cli for developers @@ -382,7 +382,6 @@ impl StorageController { ) .await?; - // Run migrations on every startup, in case something changed. self.setup_database(postgres_port).await?; } diff --git a/storage_controller/src/leadership.rs b/storage_controller/src/leadership.rs new file mode 100644 index 0000000000..a171bab451 --- /dev/null +++ b/storage_controller/src/leadership.rs @@ -0,0 +1,136 @@ +use std::sync::Arc; + +use hyper::Uri; +use tokio_util::sync::CancellationToken; + +use crate::{ + peer_client::{GlobalObservedState, PeerClient}, + persistence::{ControllerPersistence, DatabaseError, DatabaseResult, Persistence}, + service::Config, +}; + +/// Helper for storage controller leadership acquisition +pub(crate) struct Leadership { + persistence: Arc, + config: Config, + cancel: CancellationToken, +} + +#[derive(thiserror::Error, Debug)] +pub(crate) enum Error { + #[error(transparent)] + Database(#[from] DatabaseError), +} + +pub(crate) type Result = std::result::Result; + +impl Leadership { + pub(crate) fn new( + persistence: Arc, + config: Config, + cancel: CancellationToken, + ) -> Self { + Self { + persistence, + config, + cancel, + } + } + + /// Find the current leader in the database and request it to step down if required. + /// Should be called early on in within the start-up sequence. + /// + /// Returns a tuple of two optionals: the current leader and its observed state + pub(crate) async fn step_down_current_leader( + &self, + ) -> Result<(Option, Option)> { + let leader = self.current_leader().await?; + let leader_step_down_state = if let Some(ref leader) = leader { + if self.config.start_as_candidate { + self.request_step_down(leader).await + } else { + None + } + } else { + tracing::info!("No leader found to request step down from. Will build observed state."); + None + }; + + Ok((leader, leader_step_down_state)) + } + + /// Mark the current storage controller instance as the leader in the database + pub(crate) async fn become_leader( + &self, + current_leader: Option, + ) -> Result<()> { + if let Some(address_for_peers) = &self.config.address_for_peers { + // TODO: `address-for-peers` can become a mandatory cli arg + // after we update the k8s setup + let proposed_leader = ControllerPersistence { + address: address_for_peers.to_string(), + started_at: chrono::Utc::now(), + }; + + self.persistence + .update_leader(current_leader, proposed_leader) + .await + .map_err(Error::Database) + } else { + tracing::info!("No address-for-peers provided. Skipping leader persistence."); + Ok(()) + } + } + + async fn current_leader(&self) -> DatabaseResult> { + let res = self.persistence.get_leader().await; + if let Err(DatabaseError::Query(diesel::result::Error::DatabaseError(_kind, ref err))) = res + { + const REL_NOT_FOUND_MSG: &str = "relation \"controllers\" does not exist"; + if err.message().trim() == REL_NOT_FOUND_MSG { + // Special case: if this is a brand new storage controller, migrations will not + // have run at this point yet, and, hence, the controllers table does not exist. + // Detect this case via the error string (diesel doesn't type it) and allow it. + tracing::info!("Detected first storage controller start-up. 
Allowing missing controllers table ..."); + return Ok(None); + } + } + + res + } + + /// Request step down from the currently registered leader in the database + /// + /// If such an entry is persisted, the success path returns the observed + /// state and details of the leader. Otherwise, None is returned indicating + /// there is no leader currently. + async fn request_step_down( + &self, + leader: &ControllerPersistence, + ) -> Option { + tracing::info!("Sending step down request to {leader:?}"); + + // TODO: jwt token + let client = PeerClient::new( + Uri::try_from(leader.address.as_str()).expect("Failed to build leader URI"), + self.config.jwt_token.clone(), + ); + let state = client.step_down(&self.cancel).await; + match state { + Ok(state) => Some(state), + Err(err) => { + // TODO: Make leaders periodically update a timestamp field in the + // database and, if the leader is not reachable from the current instance, + // but inferred as alive from the timestamp, abort start-up. This avoids + // a potential scenario in which we have two controllers acting as leaders. + tracing::error!( + "Leader ({}) did not respond to step-down request: {}", + leader.address, + err + ); + + None + } + } + } +} diff --git a/storage_controller/src/lib.rs b/storage_controller/src/lib.rs index 2034addbe1..60e613bb5c 100644 --- a/storage_controller/src/lib.rs +++ b/storage_controller/src/lib.rs @@ -8,6 +8,7 @@ mod drain_utils; mod heartbeater; pub mod http; mod id_lock_map; +mod leadership; pub mod metrics; mod node; mod pageserver_client; diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 7387d36690..17685b1140 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -1,6 +1,5 @@ use anyhow::{anyhow, Context}; use clap::Parser; -use diesel::Connection; use hyper::Uri; use metrics::launch_timestamp::LaunchTimestamp; use metrics::BuildInfo; @@ -27,9 +26,6 @@ use utils::{project_build_tag, project_git_version, tcp_listener}; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); -use diesel_migrations::{embed_migrations, EmbeddedMigrations}; -pub const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations"); - #[derive(Parser)] #[command(author, version, about, long_about = None)] #[command(arg_required_else_help(true))] @@ -181,20 +177,6 @@ impl Secrets { } } -/// Execute the diesel migrations that are built into this binary -async fn migration_run(database_url: &str) -> anyhow::Result<()> { - use diesel::PgConnection; - use diesel_migrations::{HarnessWithOutput, MigrationHarness}; - let mut conn = PgConnection::establish(database_url)?; - - HarnessWithOutput::write_to_stdout(&mut conn) - .run_pending_migrations(MIGRATIONS) - .map(|_| ()) - .map_err(|e| anyhow::anyhow!(e))?; - - Ok(()) -} - fn main() -> anyhow::Result<()> { logging::init( LogFormat::Plain, @@ -304,13 +286,9 @@ async fn async_main() -> anyhow::Result<()> { http_service_port: args.listen.port() as i32, }; - // After loading secrets & config, but before starting anything else, apply database migrations + // Validate that we can connect to the database Persistence::await_connection(&secrets.database_url, args.db_connect_timeout.into()).await?; - migration_run(&secrets.database_url) - .await - .context("Running database migrations")?; - let persistence = Arc::new(Persistence::new(secrets.database_url)); let service = Service::spawn(config, persistence.clone()).await?; diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index 
c2303e7a7f..5cfcfb4b1f 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -230,6 +230,7 @@ pub(crate) enum DatabaseErrorLabel { Connection, ConnectionPool, Logical, + Migration, } impl DatabaseError { @@ -239,6 +240,7 @@ impl DatabaseError { Self::Connection(_) => DatabaseErrorLabel::Connection, Self::ConnectionPool(_) => DatabaseErrorLabel::ConnectionPool, Self::Logical(_) => DatabaseErrorLabel::Logical, + Self::Migration(_) => DatabaseErrorLabel::Migration, } } } diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index aebbdec0d1..16df19026c 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -25,6 +25,9 @@ use crate::metrics::{ }; use crate::node::Node; +use diesel_migrations::{embed_migrations, EmbeddedMigrations}; +const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations"); + /// ## What do we store? /// /// The storage controller service does not store most of its state durably. @@ -72,6 +75,8 @@ pub(crate) enum DatabaseError { ConnectionPool(#[from] r2d2::Error), #[error("Logical error: {0}")] Logical(String), + #[error("Migration error: {0}")] + Migration(String), } #[derive(measured::FixedCardinalityLabel, Copy, Clone)] @@ -167,6 +172,19 @@ impl Persistence { } } + /// Execute the diesel migrations that are built into this binary + pub(crate) async fn migration_run(&self) -> DatabaseResult<()> { + use diesel_migrations::{HarnessWithOutput, MigrationHarness}; + + self.with_conn(move |conn| -> DatabaseResult<()> { + HarnessWithOutput::write_to_stdout(conn) + .run_pending_migrations(MIGRATIONS) + .map(|_| ()) + .map_err(|e| DatabaseError::Migration(e.to_string())) + }) + .await + } + /// Wraps `with_conn` in order to collect latency and error metrics async fn with_measured_conn(&self, op: DatabaseOperation, func: F) -> DatabaseResult where diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 3459b44774..780f4a7ee5 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -17,8 +17,9 @@ use crate::{ compute_hook::NotifyError, drain_utils::{self, TenantShardDrain, TenantShardIterator}, id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard}, + leadership::Leadership, metrics, - peer_client::{GlobalObservedState, PeerClient}, + peer_client::GlobalObservedState, persistence::{ AbortShardSplitStatus, ControllerPersistence, DatabaseResult, MetadataHealthPersistence, TenantFilter, @@ -333,7 +334,7 @@ impl From for ApiError { DatabaseError::Connection(_) | DatabaseError::ConnectionPool(_) => { ApiError::ShuttingDown } - DatabaseError::Logical(reason) => { + DatabaseError::Logical(reason) | DatabaseError::Migration(reason) => { ApiError::InternalServerError(anyhow::anyhow!(reason)) } } @@ -606,22 +607,15 @@ impl Service { // Before making any obeservable changes to the cluster, persist self // as leader in database and memory. 
- if let Some(address_for_peers) = &self.config.address_for_peers { - // TODO: `address-for-peers` can become a mandatory cli arg - // after we update the k8s setup - let proposed_leader = ControllerPersistence { - address: address_for_peers.to_string(), - started_at: chrono::Utc::now(), - }; + let leadership = Leadership::new( + self.persistence.clone(), + self.config.clone(), + self.cancel.child_token(), + ); - if let Err(err) = self - .persistence - .update_leader(current_leader, proposed_leader) - .await - { - tracing::error!("Failed to persist self as leader: {err}. Aborting start-up ..."); - std::process::exit(1); - } + if let Err(e) = leadership.become_leader(current_leader).await { + tracing::error!("Failed to persist self as leader: {e}. Aborting start-up ..."); + std::process::exit(1); } self.inner.write().unwrap().become_leader(); @@ -1159,6 +1153,16 @@ impl Service { let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel(); let (abort_tx, abort_rx) = tokio::sync::mpsc::unbounded_channel(); + let leadership_cancel = CancellationToken::new(); + let leadership = Leadership::new(persistence.clone(), config.clone(), leadership_cancel); + let (leader, leader_step_down_state) = leadership.step_down_current_leader().await?; + + // Apply the migrations **after** the current leader has stepped down + // (or we've given up waiting for it), but **before** reading from the + // database. The only exception is reading the current leader before + // migrating. + persistence.migration_run().await?; + tracing::info!("Loading nodes from database..."); let nodes = persistence .list_nodes() @@ -1376,32 +1380,6 @@ impl Service { return; }; - let leadership_status = this.inner.read().unwrap().get_leadership_status(); - let leader = match this.get_leader().await { - Ok(ok) => ok, - Err(err) => { - tracing::error!( - "Failed to query database for current leader: {err}. Aborting start-up ..." - ); - std::process::exit(1); - } - }; - - let leader_step_down_state = match leadership_status { - LeadershipStatus::Candidate => { - if let Some(ref leader) = leader { - this.request_step_down(leader).await - } else { - tracing::info!( - "No leader found to request step down from. Will build observed state." - ); - None - } - } - LeadershipStatus::Leader => None, - LeadershipStatus::SteppedDown => unreachable!(), - }; - this.startup_reconcile(leader, leader_step_down_state, bg_compute_notify_result_tx) .await; @@ -6377,42 +6355,4 @@ impl Service { global_observed } - - /// Request step down from the currently registered leader in the database - /// - /// If such an entry is persisted, the success path returns the observed - /// state and details of the leader. Otherwise, None is returned indicating - /// there is no leader currently. - /// - /// On failures to query the database or step down error responses the process is killed - /// and we rely on k8s to retry. - async fn request_step_down( - &self, - leader: &ControllerPersistence, - ) -> Option { - tracing::info!("Sending step down request to {leader:?}"); - - // TODO: jwt token - let client = PeerClient::new( - Uri::try_from(leader.address.as_str()).expect("Failed to build leader URI"), - self.config.jwt_token.clone(), - ); - let state = client.step_down(&self.cancel).await; - match state { - Ok(state) => Some(state), - Err(err) => { - // TODO: Make leaders periodically update a timestamp field in the - // database and, if the leader is not reachable from the current instance, - // but inferred as alive from the timestamp, abort start-up. 
This avoids
- // a potential scenario in which we have two controllers acting as leaders.
- tracing::error!(
- "Leader ({}) did not respond to step-down request: {}",
- leader.address,
- err
- );
-
- None
- }
- }
- }
 }

From 0170611a97fc233f4e3bcc56648a77fb3cf33a2c Mon Sep 17 00:00:00 2001
From: Conrad Ludgate
Date: Tue, 20 Aug 2024 14:16:27 +0100
Subject: [PATCH 1453/1571] proxy: small changes (#8752)

## Problem
#8736 is getting too big. Splitting off some simple changes here.

## Summary of changes
Local proxy won't always be using TLS, so make it optional. Local proxy won't be using WebSockets for now, so make it optional. Remove a dead config var.
---
 proxy/src/bin/proxy.rs | 4 +-
 proxy/src/config.rs | 1 +
 proxy/src/serverless.rs | 59 ++++++++++++++++++++-------
 proxy/src/serverless/conn_pool.rs | 1 +
 proxy/src/serverless/sql_over_http.rs | 26 ++++++++----
 5 files changed, 65 insertions(+), 26 deletions(-)

diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs
index b44e0ddd2f..d83a1f3bcf 100644
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -173,9 +173,6 @@ struct ProxyCliArgs {
 /// cache for `role_secret` (use `size=0` to disable)
 #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
 role_secret_cache: String,
- /// disable ip check for http requests. If it is too time consuming, it could be turned off.
- #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
- disable_ip_check_for_http: bool,
 /// redis url for notifications (if empty, redis_host:port will be used for both notifications and streaming connections)
 #[clap(long)]
 redis_notifications: Option,
@@ -661,6 +658,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
 )?;
 let http_config = HttpConfig {
+ accept_websockets: true,
 pool_options: GlobalConnPoolOptions {
 max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint,
 gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch,
diff --git a/proxy/src/config.rs b/proxy/src/config.rs
index 36d04924f2..a280aa88ce 100644
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -52,6 +52,7 @@ pub struct TlsConfig {
 }
 pub struct HttpConfig {
+ pub accept_websockets: bool,
 pub pool_options: GlobalConnPoolOptions,
 pub cancel_set: CancelSet,
 pub client_conn_threshold: u64,
diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs
index 5416d63b5b..b2bf93dc6d 100644
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -10,6 +10,7 @@ mod json;
 mod sql_over_http;
 mod websocket;
+use async_trait::async_trait;
 use atomic_take::AtomicTake;
 use bytes::Bytes;
 pub use conn_pool::GlobalConnPoolOptions;
@@ -26,8 +27,9 @@ use rand::rngs::StdRng;
 use rand::SeedableRng;
 pub use reqwest_middleware::{ClientWithMiddleware, Error};
 pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
+use tokio::io::{AsyncRead, AsyncWrite};
 use tokio::time::timeout;
-use tokio_rustls::{server::TlsStream, TlsAcceptor};
+use tokio_rustls::TlsAcceptor;
 use tokio_util::task::TaskTracker;
 use crate::cancellation::CancellationHandlerMain;
@@ -41,7 +43,7 @@ use crate::serverless::backend::PoolingBackend;
 use crate::serverless::http_util::{api_error_into_response, json_response};
 use std::net::{IpAddr, SocketAddr};
-use std::pin::pin;
+use std::pin::{pin, Pin};
 use std::sync::Arc;
 use tokio::net::{TcpListener, TcpStream};
 use tokio_util::sync::CancellationToken;
@@ -86,18 +88,18 @@ pub async fn task_main(
 config,
endpoint_rate_limiter: Arc::clone(&endpoint_rate_limiter), }); - - let tls_config = match config.tls_config.as_ref() { - Some(config) => config, + let tls_acceptor: Arc = match config.tls_config.as_ref() { + Some(config) => { + let mut tls_server_config = rustls::ServerConfig::clone(&config.to_server_config()); + // prefer http2, but support http/1.1 + tls_server_config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()]; + Arc::new(tls_server_config) as Arc<_> + } None => { - warn!("TLS config is missing, WebSocket Secure server will not be started"); - return Ok(()); + warn!("TLS config is missing"); + Arc::new(NoTls) as Arc<_> } }; - let mut tls_server_config = rustls::ServerConfig::clone(&tls_config.to_server_config()); - // prefer http2, but support http/1.1 - tls_server_config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()]; - let tls_acceptor: tokio_rustls::TlsAcceptor = Arc::new(tls_server_config).into(); let connections = tokio_util::task::task_tracker::TaskTracker::new(); connections.close(); // allows `connections.wait to complete` @@ -176,16 +178,41 @@ pub async fn task_main( Ok(()) } +pub trait AsyncReadWrite: AsyncRead + AsyncWrite + Send + 'static {} +impl AsyncReadWrite for T {} +pub type AsyncRW = Pin>; + +#[async_trait] +trait MaybeTlsAcceptor: Send + Sync + 'static { + async fn accept(self: Arc, conn: ChainRW) -> std::io::Result; +} + +#[async_trait] +impl MaybeTlsAcceptor for rustls::ServerConfig { + async fn accept(self: Arc, conn: ChainRW) -> std::io::Result { + Ok(Box::pin(TlsAcceptor::from(self).accept(conn).await?)) + } +} + +struct NoTls; + +#[async_trait] +impl MaybeTlsAcceptor for NoTls { + async fn accept(self: Arc, conn: ChainRW) -> std::io::Result { + Ok(Box::pin(conn)) + } +} + /// Handles the TCP startup lifecycle. /// 1. Parses PROXY protocol V2 /// 2. Handles TLS handshake async fn connection_startup( config: &ProxyConfig, - tls_acceptor: TlsAcceptor, + tls_acceptor: Arc, session_id: uuid::Uuid, conn: TcpStream, peer_addr: SocketAddr, -) -> Option<(TlsStream>, IpAddr)> { +) -> Option<(AsyncRW, IpAddr)> { // handle PROXY protocol let (conn, peer) = match read_proxy_protocol(conn).await { Ok(c) => c, @@ -241,7 +268,7 @@ async fn connection_handler( cancellation_handler: Arc, endpoint_rate_limiter: Arc, cancellation_token: CancellationToken, - conn: TlsStream>, + conn: AsyncRW, peer_addr: IpAddr, session_id: uuid::Uuid, ) { @@ -326,7 +353,9 @@ async fn request_handler( .map(|s| s.to_string()); // Check if the request is a websocket upgrade request. 
- if framed_websockets::upgrade::is_upgrade_request(&request) { + if config.http_config.accept_websockets + && framed_websockets::upgrade::is_upgrade_request(&request) + { let ctx = RequestMonitoring::new( session_id, peer_addr, diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 9ede659cc4..3478787995 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -758,6 +758,7 @@ mod tests { async fn test_pool() { let _ = env_logger::try_init(); let config = Box::leak(Box::new(crate::config::HttpConfig { + accept_websockets: false, pool_options: GlobalConnPoolOptions { max_conns_per_endpoint: 2, gc_epoch: Duration::from_secs(1), diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index c41df07a4d..bbfed90f39 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -147,7 +147,7 @@ impl UserFacingError for ConnInfoError { fn get_conn_info( ctx: &RequestMonitoring, headers: &HeaderMap, - tls: &TlsConfig, + tls: Option<&TlsConfig>, ) -> Result { // HTTP only uses cleartext (for now and likely always) ctx.set_auth_method(crate::context::AuthMethod::Cleartext); @@ -184,12 +184,22 @@ fn get_conn_info( .ok_or(ConnInfoError::MissingPassword)?; let password = urlencoding::decode_binary(password.as_bytes()); - let hostname = connection_url - .host_str() - .ok_or(ConnInfoError::MissingHostname)?; - - let endpoint = - endpoint_sni(hostname, &tls.common_names)?.ok_or(ConnInfoError::MalformedEndpoint)?; + let endpoint = match connection_url.host() { + Some(url::Host::Domain(hostname)) => { + if let Some(tls) = tls { + endpoint_sni(hostname, &tls.common_names)? + .ok_or(ConnInfoError::MalformedEndpoint)? + } else { + hostname + .split_once(".") + .map_or(hostname, |(prefix, _)| prefix) + .into() + } + } + Some(url::Host::Ipv4(_)) | Some(url::Host::Ipv6(_)) | None => { + return Err(ConnInfoError::MissingHostname) + } + }; ctx.set_endpoint_id(endpoint.clone()); let pairs = connection_url.query_pairs(); @@ -502,7 +512,7 @@ async fn handle_inner( let headers = request.headers(); // TLS config should be there. - let conn_info = get_conn_info(ctx, headers, config.tls_config.as_ref().unwrap())?; + let conn_info = get_conn_info(ctx, headers, config.tls_config.as_ref())?; info!(user = conn_info.user_info.user.as_str(), "credentials"); // Allow connection pooling only if explicitly requested From fa0750a37e01cee2e909d91be9b556ee2f128406 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 20 Aug 2024 15:25:21 +0100 Subject: [PATCH 1454/1571] storcon: add peer jwt token (#8764) ## Problem Storage controllers did not have the right token to speak to their peers for leadership transitions. ## Summary of changes Accept a peer jwt token for the storage controller. 
Epic: https://github.com/neondatabase/cloud/issues/14701 --- control_plane/src/storage_controller.rs | 5 +++++ storage_controller/src/leadership.rs | 3 +-- storage_controller/src/main.rs | 20 +++++++++++-------- storage_controller/src/service.rs | 3 +++ .../regress/test_storage_controller.py | 2 ++ 5 files changed, 23 insertions(+), 10 deletions(-) diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index f6539ad5b0..27d8e2de0c 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -453,6 +453,11 @@ impl StorageController { let jwt_token = encode_from_key_file(&claims, private_key).expect("failed to generate jwt token"); args.push(format!("--jwt-token={jwt_token}")); + + let peer_claims = Claims::new(None, Scope::Admin); + let peer_jwt_token = encode_from_key_file(&peer_claims, private_key) + .expect("failed to generate jwt token"); + args.push(format!("--peer-jwt-token={peer_jwt_token}")); } if let Some(public_key) = &self.public_key { diff --git a/storage_controller/src/leadership.rs b/storage_controller/src/leadership.rs index a171bab451..5fae8991ec 100644 --- a/storage_controller/src/leadership.rs +++ b/storage_controller/src/leadership.rs @@ -110,10 +110,9 @@ impl Leadership { ) -> Option { tracing::info!("Sending step down request to {leader:?}"); - // TODO: jwt token let client = PeerClient::new( Uri::try_from(leader.address.as_str()).expect("Failed to build leader URI"), - self.config.jwt_token.clone(), + self.config.peer_jwt_token.clone(), ); let state = client.step_down(&self.cancel).await; match state { diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 17685b1140..e3f29b84e7 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -47,6 +47,9 @@ struct Cli { #[arg(long)] control_plane_jwt_token: Option, + #[arg(long)] + peer_jwt_token: Option, + /// URL to control plane compute notification endpoint #[arg(long)] compute_hook_url: Option, @@ -126,28 +129,28 @@ struct Secrets { public_key: Option, jwt_token: Option, control_plane_jwt_token: Option, + peer_jwt_token: Option, } impl Secrets { const DATABASE_URL_ENV: &'static str = "DATABASE_URL"; const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN"; const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN"; + const PEER_JWT_TOKEN_ENV: &'static str = "PEER_JWT_TOKEN"; const PUBLIC_KEY_ENV: &'static str = "PUBLIC_KEY"; /// Load secrets from, in order of preference: /// - CLI args if database URL is provided on the CLI /// - Environment variables if DATABASE_URL is set. 
- /// - AWS Secrets Manager secrets async fn load(args: &Cli) -> anyhow::Result { - let Some(database_url) = - Self::load_secret(&args.database_url, Self::DATABASE_URL_ENV).await + let Some(database_url) = Self::load_secret(&args.database_url, Self::DATABASE_URL_ENV) else { anyhow::bail!( "Database URL is not set (set `--database-url`, or `DATABASE_URL` environment)" ) }; - let public_key = match Self::load_secret(&args.public_key, Self::PUBLIC_KEY_ENV).await { + let public_key = match Self::load_secret(&args.public_key, Self::PUBLIC_KEY_ENV) { Some(v) => Some(JwtAuth::from_key(v).context("Loading public key")?), None => None, }; @@ -155,18 +158,18 @@ impl Secrets { let this = Self { database_url, public_key, - jwt_token: Self::load_secret(&args.jwt_token, Self::PAGESERVER_JWT_TOKEN_ENV).await, + jwt_token: Self::load_secret(&args.jwt_token, Self::PAGESERVER_JWT_TOKEN_ENV), control_plane_jwt_token: Self::load_secret( &args.control_plane_jwt_token, Self::CONTROL_PLANE_JWT_TOKEN_ENV, - ) - .await, + ), + peer_jwt_token: Self::load_secret(&args.peer_jwt_token, Self::PEER_JWT_TOKEN_ENV), }; Ok(this) } - async fn load_secret(cli: &Option, env_name: &str) -> Option { + fn load_secret(cli: &Option, env_name: &str) -> Option { if let Some(v) = cli { Some(v.clone()) } else if let Ok(v) = std::env::var(env_name) { @@ -266,6 +269,7 @@ async fn async_main() -> anyhow::Result<()> { let config = Config { jwt_token: secrets.jwt_token, control_plane_jwt_token: secrets.control_plane_jwt_token, + peer_jwt_token: secrets.peer_jwt_token, compute_hook_url: args.compute_hook_url, max_offline_interval: args .max_offline_interval diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 780f4a7ee5..453e96bad3 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -288,6 +288,9 @@ pub struct Config { // This JWT token will be used to authenticate this service to the control plane. pub control_plane_jwt_token: Option, + // This JWT token will be used to authenticate with other storage controller instances + pub peer_jwt_token: Option, + /// Where the compute hook should send notifications of pageserver attachment locations /// (this URL points to the control plane in prod). If this is None, the compute hook will /// assume it is running in a test environment and try to update neon_local. diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 95c35e9641..94d71a7677 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -2144,6 +2144,8 @@ def test_storage_controller_leadership_transfer( port_distributor: PortDistributor, step_down_times_out: bool, ): + neon_env_builder.auth_enabled = True + neon_env_builder.num_pageservers = 3 neon_env_builder.storage_controller_config = { From beefc7a8108e5af333bc1e453749acf872f18fdd Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 20 Aug 2024 19:47:42 +0100 Subject: [PATCH 1455/1571] pageserver: add metric pageserver_secondary_heatmap_total_size (#8768) ## Problem We don't have a convenient way for a human to ask "how far are secondary downloads along for this tenant". This is useful when driving migrations of tenants to the storage controller, as we first create a secondary location and want to see it warm up before we cut over. That can already be done via storcon_cli, but we would like a way that doesn't require direct API access. 
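As a rough illustration of how a human or script might consume this without direct API access, here is a hypothetical sketch (not part of this patch): it estimates "warmth" from the two gauges, assuming the pageserver exposes them in Prometheus text format at `/metrics`. The metric names and the `tenant_id`/`shard_id` labels come from the change below; the URL, port and parsing are assumptions.

```python
# Hypothetical consumer (not part of this patch): estimate secondary "warmth" per
# (tenant_id, shard_id) as resident bytes divided by heatmap bytes.
import re
import urllib.request

WANTED = {
    "pageserver_secondary_resident_physical_size",
    "pageserver_secondary_heatmap_total_size",
}
LINE_RE = re.compile(r'^(?P<name>\w+)\{(?P<labels>[^}]*)\}\s+(?P<value>\S+)$')

def scrape(url: str) -> dict:
    """Return {(metric, tenant_id, shard_id): value} for the two gauges above."""
    samples = {}
    with urllib.request.urlopen(url) as resp:
        for line in resp.read().decode().splitlines():
            m = LINE_RE.match(line)
            if not m or m.group("name") not in WANTED:
                continue
            labels = dict(
                kv.split("=", 1) for kv in m.group("labels").split(",") if "=" in kv
            )
            key = (
                m.group("name"),
                labels.get("tenant_id", "").strip('"'),
                labels.get("shard_id", "").strip('"'),
            )
            samples[key] = float(m.group("value"))
    return samples

def warmth(samples: dict, tenant_id: str, shard_id: str) -> float:
    resident = samples.get(("pageserver_secondary_resident_physical_size", tenant_id, shard_id), 0.0)
    heatmap = samples.get(("pageserver_secondary_heatmap_total_size", tenant_id, shard_id), 0.0)
    return resident / heatmap if heatmap else 0.0

# Example (placeholder host/port and ids):
# warmth(scrape("http://pageserver:9898/metrics"), "<tenant>", "<shard>")
```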
## Summary of changes
Add a metric that reports the total size of layers in the heatmap: this may be used in conjunction with the existing `pageserver_secondary_resident_physical_size` to estimate "warmth" of the secondary location.
---
 pageserver/src/metrics.rs | 9 +++++++++
 pageserver/src/tenant/secondary.rs | 10 ++++++++++
 pageserver/src/tenant/secondary/downloader.rs | 6 ++++++
 3 files changed, 25 insertions(+)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index cd2cd43f27..1bc9352256 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1803,6 +1803,15 @@ pub(crate) static SECONDARY_RESIDENT_PHYSICAL_SIZE: Lazy = Lazy::n
 .expect("failed to define a metric")
 });
+pub(crate) static SECONDARY_HEATMAP_TOTAL_SIZE: Lazy = Lazy::new(|| {
+ register_uint_gauge_vec!(
+ "pageserver_secondary_heatmap_total_size",
+ "The total size in bytes of all layers in the most recently downloaded heatmap.",
+ &["tenant_id", "shard_id"]
+ )
+ .expect("failed to define a metric")
+});
+
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum RemoteOpKind {
 Upload,
diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs
index 3132a28b12..1331c07d05 100644
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -8,6 +8,7 @@ use std::{sync::Arc, time::SystemTime};
 use crate::{
 context::RequestContext,
 disk_usage_eviction_task::DiskUsageEvictionInfo,
+ metrics::SECONDARY_HEATMAP_TOTAL_SIZE,
 task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
 };
@@ -105,6 +106,9 @@ pub(crate) struct SecondaryTenant {
 // Sum of layer sizes on local disk
 pub(super) resident_size_metric: UIntGauge,
+
+ // Sum of layer sizes in the most recently downloaded heatmap
+ pub(super) heatmap_total_size_metric: UIntGauge,
 }
 impl Drop for SecondaryTenant {
@@ -112,6 +116,7 @@ impl Drop for SecondaryTenant {
 let tenant_id = self.tenant_shard_id.tenant_id.to_string();
 let shard_id = format!("{}", self.tenant_shard_id.shard_slug());
 let _ = SECONDARY_RESIDENT_PHYSICAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
+ let _ = SECONDARY_HEATMAP_TOTAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
 }
 }
@@ -128,6 +133,10 @@ impl SecondaryTenant {
 .get_metric_with_label_values(&[&tenant_id, &shard_id])
 .unwrap();
+ let heatmap_total_size_metric = SECONDARY_HEATMAP_TOTAL_SIZE
+ .get_metric_with_label_values(&[&tenant_id, &shard_id])
+ .unwrap();
+
 Arc::new(Self {
 tenant_shard_id,
 // todo: shall we make this a descendent of the
@@ -145,6 +154,7 @@ impl SecondaryTenant {
 progress: std::sync::Mutex::default(),
 resident_size_metric,
+ heatmap_total_size_metric,
 })
 }
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index 8cff1d2864..90e1c01dbd 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -829,6 +829,12 @@ impl<'a> TenantDownloader<'a> {
 layers_downloaded: 0,
 bytes_downloaded: 0,
 };
+
+ // Also expose heatmap bytes_total as a metric
+ self.secondary_state
+ .heatmap_total_size_metric
+ .set(heatmap_stats.bytes);
+
 // Accumulate list of things to delete while holding the detail lock, for execution after dropping the lock
 let mut delete_layers = Vec::new();
 let mut delete_timelines = Vec::new();

From c8b9116a97e047a5f349e69fda1fe96790797820 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z."
<4198311+skyzh@users.noreply.github.com>
Date: Tue, 20 Aug 2024 15:05:33 -0400
Subject: [PATCH 1456/1571] impr(pageserver): abort on fatal I/O writer error (#8777)

part of https://github.com/neondatabase/neon/issues/8140
The blob writer path now uses `maybe_fatal_err`

Signed-off-by: Alex Chi Z
---
 pageserver/src/virtual_file.rs | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index b4695e5f40..c0017280fd 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -756,11 +756,23 @@ impl VirtualFile {
 })
 }
+ /// The function aborts the process if the error is fatal.
 async fn write_at(
 &self,
 buf: FullSlice,
 offset: u64,
 _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
+ ) -> (FullSlice, Result) {
+ let (slice, result) = self.write_at_inner(buf, offset, _ctx).await;
+ let result = result.maybe_fatal_err("write_at");
+ (slice, result)
+ }
+
+ async fn write_at_inner(
+ &self,
+ buf: FullSlice,
+ offset: u64,
+ _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
 ) -> (FullSlice, Result) {
 let file_guard = match self.lock_file().await {
 Ok(file_guard) => file_guard,

From 6d8572ded607e6cb583ff6b9a1690ceecce5a407 Mon Sep 17 00:00:00 2001
From: Peter Bendel
Date: Wed, 21 Aug 2024 09:08:49 +0200
Subject: [PATCH 1457/1571] Benchmarking: need to checkout actions to download Neon artifacts (#8770)

## Problem
Database preparation workflow needs Neon artifacts but does not check out the necessary download action. We were lucky in a few runs like this one https://github.com/neondatabase/neon/actions/runs/10413970941/job/28870668020 but this is flaky: a race condition which failed here https://github.com/neondatabase/neon/actions/runs/10446395644/job/28923749772#step:4:1

## Summary of changes
Check out code (including actions) before invoking the download action. Successful test run: https://github.com/neondatabase/neon/actions/runs/10469356296/job/28992200694
---
 .github/workflows/_benchmarking_preparation.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/_benchmarking_preparation.yml b/.github/workflows/_benchmarking_preparation.yml
index 7229776cd6..a52e43b4da 100644
--- a/.github/workflows/_benchmarking_preparation.yml
+++ b/.github/workflows/_benchmarking_preparation.yml
@@ -48,6 +48,8 @@ jobs:
 echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
+ - uses: actions/checkout@v4
+
 - name: Download Neon artifact
 uses: ./.github/actions/download
 with:

From 21b684718e1e3e18e687d095d79322c5db9a3992 Mon Sep 17 00:00:00 2001
From: Christian Schwarz
Date: Wed, 21 Aug 2024 12:55:01 +0200
Subject: [PATCH 1458/1571] pageserver: add counter for wait time on background loop semaphore (#8769)

## Problem
Compaction jobs and other background loops are concurrency-limited through a global semaphore. The current counters allow quantifying how _many_ tasks are waiting. But there is no way to tell how _much_ delay is added by the semaphore. So, add a counter that aggregates the wall clock time (in seconds) spent acquiring the semaphore.
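For intuition, here is a minimal standalone sketch of the accumulation pattern (Python for brevity; the patch itself implements this in Rust with a `Drop` guard, as shown in the diff below):

```python
# Minimal sketch (not the patch's actual code): measure how long a semaphore
# acquisition blocks and add that wall clock time to a running counter.
import threading
import time

wait_seconds_total = 0.0  # analogous to ..._wait_duration_seconds (sum of wait time)
wait_finish_count = 0     # analogous to ..._wait_finish_count
_metrics_lock = threading.Lock()

def acquire_measured(semaphore: threading.Semaphore) -> None:
    """Block on the semaphore; account the time spent blocked."""
    global wait_seconds_total, wait_finish_count
    start = time.monotonic()
    semaphore.acquire()
    elapsed = time.monotonic() - start
    with _metrics_lock:
        wait_seconds_total += elapsed  # the new "sum of wall clock wait" counter
        wait_finish_count += 1         # the pre-existing "acquire finished" counter
    # Average wait over a window: delta(wait_seconds_total) / delta(wait_finish_count).
```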
The metrics can be used as follows: * retroactively calculate average acquisition time in a given time range * compare the degree of background loop backlog among pageservers The metric is insufficient to calculate * run-up of ongoing acquisitions that haven't finished acquiring yet * Not easily feasible because ["Cancelling a call to acquire makes you lose your place in the queue"](https://docs.rs/tokio/latest/tokio/sync/struct.Semaphore.html#method.acquire) ## Summary of changes * Refactor the metrics to follow the current best practice for typed metrics in `metrics.rs`. * Add the new counter. --- pageserver/src/metrics.rs | 69 +++++++++++++++++++++++++++++----- pageserver/src/tenant/tasks.rs | 11 +----- 2 files changed, 60 insertions(+), 20 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 1bc9352256..0a1a22b6e8 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1862,16 +1862,64 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy = Lazy::new(|| { .expect("Failed to register tenant_task_events metric") }); -pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE: Lazy = Lazy::new(|| { - register_int_counter_pair_vec!( - "pageserver_background_loop_semaphore_wait_start_count", - "Counter for background loop concurrency-limiting semaphore acquire calls started", - "pageserver_background_loop_semaphore_wait_finish_count", - "Counter for background loop concurrency-limiting semaphore acquire calls finished", - &["task"], - ) - .unwrap() -}); +pub struct BackgroundLoopSemaphoreMetrics { + counters: EnumMap, + durations: EnumMap, +} + +pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy = Lazy::new( + || { + let counters = register_int_counter_pair_vec!( + "pageserver_background_loop_semaphore_wait_start_count", + "Counter for background loop concurrency-limiting semaphore acquire calls started", + "pageserver_background_loop_semaphore_wait_finish_count", + "Counter for background loop concurrency-limiting semaphore acquire calls finished", + &["task"], + ) + .unwrap(); + + let durations = register_counter_vec!( + "pageserver_background_loop_semaphore_wait_duration_seconds", + "Sum of wall clock time spent waiting on the background loop concurrency-limiting semaphore acquire calls", + &["task"], + ) + .unwrap(); + + BackgroundLoopSemaphoreMetrics { + counters: enum_map::EnumMap::from_array(std::array::from_fn(|i| { + let kind = ::from_usize(i); + counters.with_label_values(&[kind.into()]) + })), + durations: enum_map::EnumMap::from_array(std::array::from_fn(|i| { + let kind = ::from_usize(i); + durations.with_label_values(&[kind.into()]) + })), + } + }, +); + +impl BackgroundLoopSemaphoreMetrics { + pub(crate) fn measure_acquisition(&self, task: BackgroundLoopKind) -> impl Drop + '_ { + struct Record<'a> { + metrics: &'a BackgroundLoopSemaphoreMetrics, + task: BackgroundLoopKind, + _counter_guard: metrics::IntCounterPairGuard, + start: Instant, + } + impl Drop for Record<'_> { + fn drop(&mut self) { + let elapsed = self.start.elapsed().as_secs_f64(); + self.metrics.durations[self.task].inc_by(elapsed); + } + } + Record { + metrics: self, + task, + _counter_guard: self.counters[task].guard(), + start: Instant::now(), + } + } +} pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy = Lazy::new(|| { register_int_counter_vec!( @@ -2553,6 +2601,7 @@ use std::time::{Duration, Instant}; use crate::context::{PageContentKind, RequestContext}; use crate::task_mgr::TaskKind; use crate::tenant::mgr::TenantSlot; +use 
crate::tenant::tasks::BackgroundLoopKind; /// Maintain a per timeline gauge in addition to the global gauge. pub(crate) struct PerTimelineRemotePhysicalSizeGauge { diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 3972685a8e..12f080f3c1 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -61,21 +61,12 @@ impl BackgroundLoopKind { } } -static PERMIT_GAUGES: once_cell::sync::Lazy< - enum_map::EnumMap, -> = once_cell::sync::Lazy::new(|| { - enum_map::EnumMap::from_array(std::array::from_fn(|i| { - let kind = ::from_usize(i); - crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE.with_label_values(&[kind.into()]) - })) -}); - /// Cancellation safe. pub(crate) async fn concurrent_background_tasks_rate_limit_permit( loop_kind: BackgroundLoopKind, _ctx: &RequestContext, ) -> tokio::sync::SemaphorePermit<'static> { - let _guard = PERMIT_GAUGES[loop_kind].guard(); + let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE.measure_acquisition(loop_kind); pausable_failpoint!( "initial-size-calculation-permit-pause", From 477246f42cf984015d654521174fff763f9e1263 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 21 Aug 2024 14:28:27 +0300 Subject: [PATCH 1459/1571] storcon: handle heartbeater shutdown gracefully (#8767) if a heartbeat happens during shutdown, then the task is already cancelled and will not be sending responses. Fixes: #8766 --- storage_controller/src/heartbeater.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/storage_controller/src/heartbeater.rs b/storage_controller/src/heartbeater.rs index 1bb9c17f30..c0e27bafdb 100644 --- a/storage_controller/src/heartbeater.rs +++ b/storage_controller/src/heartbeater.rs @@ -87,9 +87,12 @@ impl Heartbeater { pageservers, reply: sender, }) - .unwrap(); + .map_err(|_| HeartbeaterError::Cancel)?; - receiver.await.unwrap() + receiver + .await + .map_err(|_| HeartbeaterError::Cancel) + .and_then(|x| x) } } From 3b8016488efc4cecee1a956285b3365162706894 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 21 Aug 2024 14:51:08 +0300 Subject: [PATCH 1460/1571] test: test_timeline_ancestor_detach_errors rare allowed_error (#8782) Add another allowed_error for this rarity. 
Fixes: #8773 --- test_runner/regress/test_timeline_detach_ancestor.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 82fc26126d..d152d0f41f 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -639,8 +639,12 @@ def test_timeline_ancestor_detach_errors(neon_env_builder: NeonEnvBuilder, shard for ps in pageservers.values(): ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) - ps.allowed_errors.append( - ".* WARN .* path=/v1/tenant/.*/timeline/.*/detach_ancestor request_id=.*: request was dropped before completing" + ps.allowed_errors.extend( + [ + ".* WARN .* path=/v1/tenant/.*/timeline/.*/detach_ancestor request_id=.*: request was dropped before completing", + # rare error logging, which is hard to reproduce without instrumenting responding with random sleep + '.* ERROR .* path=/v1/tenant/.*/timeline/.*/detach_ancestor request_id=.*: Cancelled request finished with an error: Conflict\\("no ancestors"\\)', + ] ) client = ( From 75175f3628bc88053e13555a3ada8082639b2db6 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 21 Aug 2024 14:29:11 +0100 Subject: [PATCH 1461/1571] CI(build-and-test): run regression tests on arm (#8552) ## Problem We want to run our regression test suite on ARM. ## Summary of changes - run regression tests on release ARM builds - run `build-neon` (including rust tests) on debug ARM builds - add `arch` parameter to test to distinguish them in the allure report and in a database --- .../actions/run-python-test-set/action.yml | 4 +--- .github/workflows/_build-and-test-locally.yml | 15 +++++++++---- .github/workflows/build_and_test.yml | 2 +- .../ingest_regress_test_result-new-format.py | 14 ++++++++++++- test_runner/fixtures/parametrize.py | 21 +++++++++++++++++++ 5 files changed, 47 insertions(+), 9 deletions(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 814067fb8e..6c2cee0971 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -169,10 +169,8 @@ runs: EXTRA_PARAMS="--durations-path $TEST_OUTPUT/benchmark_durations.json $EXTRA_PARAMS" fi - if [[ "${{ inputs.build_type }}" == "debug" ]]; then + if [[ $BUILD_TYPE == "debug" && $RUNNER_ARCH == 'X64' ]]; then cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run) - elif [[ "${{ inputs.build_type }}" == "release" ]]; then - cov_prefix=() else cov_prefix=() fi diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index af76e51ebc..5e9fff0e6a 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -94,11 +94,16 @@ jobs: # We run tests with addtional features, that are turned off by default (e.g. in release builds), see # corresponding Cargo.toml files for their descriptions. 
- name: Set env variables + env: + ARCH: ${{ inputs.arch }} run: | CARGO_FEATURES="--features testing" - if [[ $BUILD_TYPE == "debug" ]]; then + if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run" CARGO_FLAGS="--locked" + elif [[ $BUILD_TYPE == "debug" ]]; then + cov_prefix="" + CARGO_FLAGS="--locked" elif [[ $BUILD_TYPE == "release" ]]; then cov_prefix="" CARGO_FLAGS="--locked --release" @@ -158,6 +163,8 @@ jobs: # Do install *before* running rust tests because they might recompile the # binaries with different features/flags. - name: Install rust binaries + env: + ARCH: ${{ inputs.arch }} run: | # Install target binaries mkdir -p /tmp/neon/bin/ @@ -172,7 +179,7 @@ jobs: done # Install test executables and write list of all binaries (for code coverage) - if [[ $BUILD_TYPE == "debug" ]]; then + if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then # Keep bloated coverage data files away from the rest of the artifact mkdir -p /tmp/coverage/ @@ -243,8 +250,8 @@ jobs: uses: ./.github/actions/save-coverage-data regress-tests: - # Run test on x64 only - if: inputs.arch == 'x64' + # Don't run regression tests on debug arm64 builds + if: inputs.build-type != 'debug' || inputs.arch != 'arm64' needs: [ build-neon ] runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }} container: diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 715f1af656..1e7f3598c2 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -198,7 +198,7 @@ jobs: strategy: fail-fast: false matrix: - arch: [ x64 ] + arch: [ x64, arm64 ] # Do not build or run tests in debug for release branches build-type: ${{ fromJson((startsWith(github.ref_name, 'release') && github.event_name == 'push') && '["release"]' || '["debug", "release"]') }} include: diff --git a/scripts/ingest_regress_test_result-new-format.py b/scripts/ingest_regress_test_result-new-format.py index cff1d9875f..40d7254e00 100644 --- a/scripts/ingest_regress_test_result-new-format.py +++ b/scripts/ingest_regress_test_result-new-format.py @@ -18,6 +18,7 @@ import psycopg2 from psycopg2.extras import execute_values CREATE_TABLE = """ +CREATE TYPE arch AS ENUM ('ARM64', 'X64', 'UNKNOWN'); CREATE TABLE IF NOT EXISTS results ( id BIGSERIAL PRIMARY KEY, parent_suite TEXT NOT NULL, @@ -28,6 +29,7 @@ CREATE TABLE IF NOT EXISTS results ( stopped_at TIMESTAMPTZ NOT NULL, duration INT NOT NULL, flaky BOOLEAN NOT NULL, + arch arch DEFAULT 'X64', build_type TEXT NOT NULL, pg_version INT NOT NULL, run_id BIGINT NOT NULL, @@ -35,7 +37,7 @@ CREATE TABLE IF NOT EXISTS results ( reference TEXT NOT NULL, revision CHAR(40) NOT NULL, raw JSONB COMPRESSION lz4 NOT NULL, - UNIQUE (parent_suite, suite, name, build_type, pg_version, started_at, stopped_at, run_id) + UNIQUE (parent_suite, suite, name, arch, build_type, pg_version, started_at, stopped_at, run_id) ); """ @@ -50,6 +52,7 @@ class Row: stopped_at: datetime duration: int flaky: bool + arch: str build_type: str pg_version: int run_id: int @@ -121,6 +124,14 @@ def ingest_test_result( raw.pop("labels") raw.pop("extra") + # All allure parameters are prefixed with "__", see test_runner/fixtures/parametrize.py + parameters = { + p["name"].removeprefix("__"): p["value"] + for p in test["parameters"] + if p["name"].startswith("__") + } + arch = parameters.get("arch", "UNKNOWN").strip("'") + build_type, pg_version, 
unparametrized_name = parse_test_name(test["name"]) labels = {label["name"]: label["value"] for label in test["labels"]} row = Row( @@ -132,6 +143,7 @@ def ingest_test_result( stopped_at=datetime.fromtimestamp(test["time"]["stop"] / 1000, tz=timezone.utc), duration=test["time"]["duration"], flaky=test["flaky"] or test["retriesStatusChange"], + arch=arch, build_type=build_type, pg_version=pg_version, run_id=run_id, diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py index 0227285822..92c98763e3 100644 --- a/test_runner/fixtures/parametrize.py +++ b/test_runner/fixtures/parametrize.py @@ -1,6 +1,7 @@ import os from typing import Any, Dict, Optional +import allure import pytest import toml from _pytest.python import Metafunc @@ -91,3 +92,23 @@ def pytest_generate_tests(metafunc: Metafunc): and (platform := os.getenv("PLATFORM")) is not None ): metafunc.parametrize("platform", [platform.lower()]) + + +@pytest.hookimpl(hookwrapper=True, tryfirst=True) +def pytest_runtest_makereport(*args, **kwargs): + # Add test parameters to Allue report to distinguish the same tests with different parameters. + # Names has `__` prefix to avoid conflicts with `pytest.mark.parametrize` parameters + + # A mapping between `uname -m` and `RUNNER_ARCH` values. + # `RUNNER_ARCH` environment variable is set on GitHub Runners, + # possible values are X86, X64, ARM, or ARM64. + # See https://docs.github.com/en/actions/learn-github-actions/variables#default-environment-variables + uname_m = { + "aarch64": "ARM64", + "arm64": "ARM64", + "x86_64": "X64", + }.get(os.uname().machine, "UNKNOWN") + arch = os.getenv("RUNNER_ARCH", uname_m) + allure.dynamic.parameter("__arch", arch) + + yield From 428b105dde089d402b1de035a8cb43ebea930583 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 21 Aug 2024 14:45:32 +0100 Subject: [PATCH 1462/1571] remove workspace hack from libs (#8780) This removes workspace hack from all libs, not from any binaries. This does not change the behaviour of the hack. Running ``` cargo clean cargo build --release --bin proxy ``` Before this change took 5m16s. After this change took 3m3s. This is because this allows the build to be parallelisable much more. --- .config/hakari.toml | 28 ++++++++++++++++++++++---- Cargo.lock | 18 ----------------- libs/compute_api/Cargo.toml | 2 -- libs/consumption_metrics/Cargo.toml | 4 +--- libs/desim/Cargo.toml | 2 -- libs/metrics/Cargo.toml | 2 -- libs/pageserver_api/Cargo.toml | 4 +--- libs/postgres_backend/Cargo.toml | 1 - libs/postgres_connection/Cargo.toml | 2 -- libs/postgres_ffi/Cargo.toml | 2 -- libs/postgres_ffi/wal_craft/Cargo.toml | 2 -- libs/pq_proto/Cargo.toml | 4 +--- libs/remote_storage/Cargo.toml | 3 ++- libs/safekeeper_api/Cargo.toml | 2 -- libs/tenant_size_model/Cargo.toml | 2 -- libs/tracing-utils/Cargo.toml | 2 -- libs/utils/Cargo.toml | 4 ++-- libs/walproposer/Cargo.toml | 2 -- workspace_hack/Cargo.toml | 3 --- 19 files changed, 31 insertions(+), 58 deletions(-) diff --git a/.config/hakari.toml b/.config/hakari.toml index 9913ecc9c0..b5990d090e 100644 --- a/.config/hakari.toml +++ b/.config/hakari.toml @@ -23,10 +23,30 @@ platforms = [ ] [final-excludes] -# vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but -# it is built primarly in separate repo neondatabase/autoscaling and thus is excluded -# from depending on workspace-hack because most of the dependencies are not used. 
-workspace-members = ["vm_monitor"] +workspace-members = [ + # vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but + # it is built primarly in separate repo neondatabase/autoscaling and thus is excluded + # from depending on workspace-hack because most of the dependencies are not used. + "vm_monitor", + # All of these exist in libs and are not usually built independently. + # Putting workspace hack there adds a bottleneck for cargo builds. + "compute_api", + "consumption_metrics", + "desim", + "metrics", + "pageserver_api", + "postgres_backend", + "postgres_connection", + "postgres_ffi", + "pq_proto", + "remote_storage", + "safekeeper_api", + "tenant_size_model", + "tracing-utils", + "utils", + "wal_craft", + "walproposer", +] # Write out exact versions rather than a semver range. (Defaults to false.) # exact-versions = true diff --git a/Cargo.lock b/Cargo.lock index dee15b6aa7..a506da8c02 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1208,7 +1208,6 @@ dependencies = [ "serde_json", "serde_with", "utils", - "workspace_hack", ] [[package]] @@ -1321,7 +1320,6 @@ dependencies = [ "serde", "serde_with", "utils", - "workspace_hack", ] [[package]] @@ -1670,7 +1668,6 @@ dependencies = [ "smallvec", "tracing", "utils", - "workspace_hack", ] [[package]] @@ -3147,7 +3144,6 @@ dependencies = [ "rand 0.8.5", "rand_distr", "twox-hash", - "workspace_hack", ] [[package]] @@ -3791,7 +3787,6 @@ dependencies = [ "strum_macros", "thiserror", "utils", - "workspace_hack", ] [[package]] @@ -4193,7 +4188,6 @@ dependencies = [ "tokio-rustls 0.25.0", "tokio-util", "tracing", - "workspace_hack", ] [[package]] @@ -4206,7 +4200,6 @@ dependencies = [ "postgres", "tokio-postgres", "url", - "workspace_hack", ] [[package]] @@ -4229,7 +4222,6 @@ dependencies = [ "serde", "thiserror", "utils", - "workspace_hack", ] [[package]] @@ -4267,7 +4259,6 @@ dependencies = [ "thiserror", "tokio", "tracing", - "workspace_hack", ] [[package]] @@ -4832,7 +4823,6 @@ dependencies = [ "toml_edit 0.19.10", "tracing", "utils", - "workspace_hack", ] [[package]] @@ -5357,7 +5347,6 @@ dependencies = [ "serde", "serde_with", "utils", - "workspace_hack", ] [[package]] @@ -6193,7 +6182,6 @@ dependencies = [ "anyhow", "serde", "serde_json", - "workspace_hack", ] [[package]] @@ -6794,7 +6782,6 @@ dependencies = [ "tracing", "tracing-opentelemetry", "tracing-subscriber", - "workspace_hack", ] [[package]] @@ -7012,7 +6999,6 @@ dependencies = [ "url", "uuid", "walkdir", - "workspace_hack", ] [[package]] @@ -7091,7 +7077,6 @@ dependencies = [ "postgres_ffi", "regex", "utils", - "workspace_hack", ] [[package]] @@ -7112,7 +7097,6 @@ dependencies = [ "bindgen", "postgres_ffi", "utils", - "workspace_hack", ] [[package]] @@ -7669,8 +7653,6 @@ dependencies = [ "tokio", "tokio-rustls 0.24.0", "tokio-util", - "toml_datetime", - "toml_edit 0.19.10", "tonic", "tower", "tracing", diff --git a/libs/compute_api/Cargo.toml b/libs/compute_api/Cargo.toml index b377bd2cce..8aaa481f8c 100644 --- a/libs/compute_api/Cargo.toml +++ b/libs/compute_api/Cargo.toml @@ -14,5 +14,3 @@ regex.workspace = true utils = { path = "../utils" } remote_storage = { version = "0.1", path = "../remote_storage/" } - -workspace_hack.workspace = true diff --git a/libs/consumption_metrics/Cargo.toml b/libs/consumption_metrics/Cargo.toml index 3f290821c2..a40b74b952 100644 --- a/libs/consumption_metrics/Cargo.toml +++ b/libs/consumption_metrics/Cargo.toml @@ -6,10 +6,8 @@ license = "Apache-2.0" [dependencies] anyhow.workspace = true -chrono.workspace = true +chrono = { 
workspace = true, features = ["serde"] } rand.workspace = true serde.workspace = true serde_with.workspace = true utils.workspace = true - -workspace_hack.workspace = true diff --git a/libs/desim/Cargo.toml b/libs/desim/Cargo.toml index 6f442d8243..0c4be90267 100644 --- a/libs/desim/Cargo.toml +++ b/libs/desim/Cargo.toml @@ -14,5 +14,3 @@ parking_lot.workspace = true hex.workspace = true scopeguard.workspace = true smallvec = { workspace = true, features = ["write"] } - -workspace_hack.workspace = true diff --git a/libs/metrics/Cargo.toml b/libs/metrics/Cargo.toml index 0bd804051c..f87e7b8e3a 100644 --- a/libs/metrics/Cargo.toml +++ b/libs/metrics/Cargo.toml @@ -12,8 +12,6 @@ chrono.workspace = true twox-hash.workspace = true measured.workspace = true -workspace_hack.workspace = true - [target.'cfg(target_os = "linux")'.dependencies] procfs.workspace = true measured-process.workspace = true diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index 3bba89c76d..cb28359ac3 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -21,11 +21,9 @@ hex.workspace = true humantime.workspace = true thiserror.workspace = true humantime-serde.workspace = true -chrono.workspace = true +chrono = { workspace = true, features = ["serde"] } itertools.workspace = true -workspace_hack.workspace = true - [dev-dependencies] bincode.workspace = true rand.workspace = true diff --git a/libs/postgres_backend/Cargo.toml b/libs/postgres_backend/Cargo.toml index c7611b9f21..f6854328fc 100644 --- a/libs/postgres_backend/Cargo.toml +++ b/libs/postgres_backend/Cargo.toml @@ -18,7 +18,6 @@ tokio-rustls.workspace = true tracing.workspace = true pq_proto.workspace = true -workspace_hack.workspace = true [dev-dependencies] once_cell.workspace = true diff --git a/libs/postgres_connection/Cargo.toml b/libs/postgres_connection/Cargo.toml index fbfea80ae2..19027d13ff 100644 --- a/libs/postgres_connection/Cargo.toml +++ b/libs/postgres_connection/Cargo.toml @@ -11,7 +11,5 @@ postgres.workspace = true tokio-postgres.workspace = true url.workspace = true -workspace_hack.workspace = true - [dev-dependencies] once_cell.workspace = true diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml index 86e72f6bdd..ee69878f69 100644 --- a/libs/postgres_ffi/Cargo.toml +++ b/libs/postgres_ffi/Cargo.toml @@ -19,8 +19,6 @@ thiserror.workspace = true serde.workspace = true utils.workspace = true -workspace_hack.workspace = true - [dev-dependencies] env_logger.workspace = true postgres.workspace = true diff --git a/libs/postgres_ffi/wal_craft/Cargo.toml b/libs/postgres_ffi/wal_craft/Cargo.toml index 0edc642402..29dd01a936 100644 --- a/libs/postgres_ffi/wal_craft/Cargo.toml +++ b/libs/postgres_ffi/wal_craft/Cargo.toml @@ -14,8 +14,6 @@ postgres.workspace = true postgres_ffi.workspace = true camino-tempfile.workspace = true -workspace_hack.workspace = true - [dev-dependencies] regex.workspace = true utils.workspace = true diff --git a/libs/pq_proto/Cargo.toml b/libs/pq_proto/Cargo.toml index 8afabe670e..66bbe03ebc 100644 --- a/libs/pq_proto/Cargo.toml +++ b/libs/pq_proto/Cargo.toml @@ -11,9 +11,7 @@ itertools.workspace = true pin-project-lite.workspace = true postgres-protocol.workspace = true rand.workspace = true -tokio.workspace = true +tokio = { workspace = true, features = ["io-util"] } tracing.workspace = true thiserror.workspace = true serde.workspace = true - -workspace_hack.workspace = true diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml 
index 414bce1b26..02adee058f 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -32,7 +32,7 @@ scopeguard.workspace = true metrics.workspace = true utils.workspace = true pin-project-lite.workspace = true -workspace_hack.workspace = true + azure_core.workspace = true azure_identity.workspace = true azure_storage.workspace = true @@ -46,3 +46,4 @@ sync_wrapper = { workspace = true, features = ["futures"] } camino-tempfile.workspace = true test-context.workspace = true rand.workspace = true +tokio = { workspace = true, features = ["test-util"] } diff --git a/libs/safekeeper_api/Cargo.toml b/libs/safekeeper_api/Cargo.toml index 327d98ee77..e1f4bcca46 100644 --- a/libs/safekeeper_api/Cargo.toml +++ b/libs/safekeeper_api/Cargo.toml @@ -9,5 +9,3 @@ serde.workspace = true serde_with.workspace = true const_format.workspace = true utils.workspace = true - -workspace_hack.workspace = true diff --git a/libs/tenant_size_model/Cargo.toml b/libs/tenant_size_model/Cargo.toml index 15e78932a8..8aa3c54f62 100644 --- a/libs/tenant_size_model/Cargo.toml +++ b/libs/tenant_size_model/Cargo.toml @@ -9,5 +9,3 @@ license.workspace = true anyhow.workspace = true serde.workspace = true serde_json.workspace = true - -workspace_hack.workspace = true diff --git a/libs/tracing-utils/Cargo.toml b/libs/tracing-utils/Cargo.toml index 512a748124..5ea8db6b42 100644 --- a/libs/tracing-utils/Cargo.toml +++ b/libs/tracing-utils/Cargo.toml @@ -14,5 +14,3 @@ tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tracing.workspace = true tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true - -workspace_hack.workspace = true diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index ec05f849cf..6e593eeac1 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -39,7 +39,7 @@ thiserror.workspace = true tokio.workspace = true tokio-tar.workspace = true tokio-util.workspace = true -toml_edit.workspace = true +toml_edit = { workspace = true, features = ["serde"] } tracing.workspace = true tracing-error.workspace = true tracing-subscriber = { workspace = true, features = ["json", "registry"] } @@ -54,7 +54,6 @@ walkdir.workspace = true pq_proto.workspace = true postgres_connection.workspace = true metrics.workspace = true -workspace_hack.workspace = true const_format.workspace = true @@ -71,6 +70,7 @@ criterion.workspace = true hex-literal.workspace = true camino-tempfile.workspace = true serde_assert.workspace = true +tokio = { workspace = true, features = ["test-util"] } [[bench]] name = "benchmarks" diff --git a/libs/walproposer/Cargo.toml b/libs/walproposer/Cargo.toml index 73aa073c44..2d442dc429 100644 --- a/libs/walproposer/Cargo.toml +++ b/libs/walproposer/Cargo.toml @@ -9,8 +9,6 @@ anyhow.workspace = true utils.workspace = true postgres_ffi.workspace = true -workspace_hack.workspace = true - [build-dependencies] anyhow.workspace = true bindgen.workspace = true diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 2d9b372654..20693ad63d 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -80,8 +80,6 @@ time = { version = "0.3", features = ["macros", "serde-well-known"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] } tokio-rustls = { version = "0.24" } tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] } -toml_datetime = { version = "0.6", default-features = false, features = ["serde"] } 
-toml_edit = { version = "0.19", features = ["serde"] } tonic = { version = "0.9", features = ["tls-roots"] } tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "log", "timeout", "util"] } tracing = { version = "0.1", features = ["log"] } @@ -124,7 +122,6 @@ serde = { version = "1", features = ["alloc", "derive"] } syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full", "visit"] } syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } time-macros = { version = "0.2", default-features = false, features = ["formatting", "parsing", "serde"] } -toml_datetime = { version = "0.6", default-features = false, features = ["serde"] } zstd = { version = "0.13" } zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] } zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] } From f4b3c317f394cb7f82c8c52754b290903957e85d Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 19 Aug 2024 16:34:04 -0500 Subject: [PATCH 1463/1571] Add compute_logical_snapshot_files metric Track the number of logical snapshot files on an endpoint over time. Signed-off-by: Tristan Partin --- vm-image-spec.yaml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 41d6e11725..8c1c4512b4 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -312,6 +312,22 @@ files: query: | SELECT checkpoints_timed FROM pg_stat_bgwriter; + - metric_name: compute_logical_snapshot_files + type: guage + help: 'Number of snapshot files in pg_logical/snapshot' + key_labels: + - tenant_id + - timeline_id + values: [num_logical_snapshot_files] + query: | + SELECT + (SELECT setting FROM pg_settings WHERE name = 'neon.tenant_id') AS tenant_id, + (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id, + -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. These + -- temporary snapshot files are renamed to the actual snapshot files after they are + -- completely built. We only WAL-log the completely built snapshot files. + (SELECT COUNT(*) FROM pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS num_logical_snapshot_files; + # In all the below metrics, we cast LSNs to floats because Prometheus only supports floats. # It's probably fine because float64 can store integers from -2^53 to +2^53 exactly. From d919770c55b2a70fd0b19c888d3673b6fef2f889 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 21 Aug 2024 17:30:42 +0300 Subject: [PATCH 1464/1571] safekeeper: add listing timelines Adds endpoint GET /tenant/timeline listing all not deleted timelines. --- safekeeper/src/http/routes.rs | 13 +++++++++++++ test_runner/fixtures/common_types.py | 15 ++++++++++++++- test_runner/fixtures/safekeeper/http.py | 8 +++++++- test_runner/regress/test_wal_acceptor.py | 4 ++++ 4 files changed, 38 insertions(+), 2 deletions(-) diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index c9defb0bcf..d11815f6ef 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -114,6 +114,16 @@ fn check_permission(request: &Request, tenant_id: Option) -> Res }) } +/// List all (not deleted) timelines. 
+async fn timeline_list_handler(request: Request) -> Result, ApiError> { + check_permission(&request, None)?; + let res: Vec = GlobalTimelines::get_all() + .iter() + .map(|tli| tli.ttid) + .collect(); + json_response(StatusCode::OK, res) +} + /// Report info about timeline. async fn timeline_status_handler(request: Request) -> Result, ApiError> { let ttid = TenantTimelineId::new( @@ -562,6 +572,9 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder .post("/v1/tenant/timeline", |r| { request_span(r, timeline_create_handler) }) + .get("/v1/tenant/timeline", |r| { + request_span(r, timeline_list_handler) + }) .get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| { request_span(r, timeline_status_handler) }) diff --git a/test_runner/fixtures/common_types.py b/test_runner/fixtures/common_types.py index b63dfd4e47..7cadcbb4c2 100644 --- a/test_runner/fixtures/common_types.py +++ b/test_runner/fixtures/common_types.py @@ -1,7 +1,7 @@ import random from dataclasses import dataclass from functools import total_ordering -from typing import Any, Type, TypeVar, Union +from typing import Any, Dict, Type, TypeVar, Union T = TypeVar("T", bound="Id") @@ -147,6 +147,19 @@ class TimelineId(Id): return self.id.hex() +@dataclass +class TenantTimelineId: + tenant_id: TenantId + timeline_id: TimelineId + + @classmethod + def from_json(cls, d: Dict[str, Any]) -> "TenantTimelineId": + return TenantTimelineId( + tenant_id=TenantId(d["tenant_id"]), + timeline_id=TimelineId(d["timeline_id"]), + ) + + # Workaround for compat with python 3.9, which does not have `typing.Self` TTenantShardId = TypeVar("TTenantShardId", bound="TenantShardId") diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index a51b89744b..dd3a0a3d54 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -5,7 +5,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union import pytest import requests -from fixtures.common_types import Lsn, TenantId, TimelineId +from fixtures.common_types import Lsn, TenantId, TenantTimelineId, TimelineId from fixtures.log_helper import log from fixtures.metrics import Metrics, MetricsGetter, parse_metrics @@ -144,6 +144,12 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): assert isinstance(res_json, dict) return res_json + def timeline_list(self) -> List[TenantTimelineId]: + res = self.get(f"http://localhost:{self.port}/v1/tenant/timeline") + res.raise_for_status() + resj = res.json() + return [TenantTimelineId.from_json(ttidj) for ttidj in resj] + def timeline_create( self, tenant_id: TenantId, diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 5d3b263936..bb3b16f3e1 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -254,6 +254,10 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): assert max(init_m[2].flush_lsns) <= min(final_m[2].flush_lsns) < middle_lsn assert max(init_m[2].commit_lsns) <= min(final_m[2].commit_lsns) < middle_lsn + # Test timeline_list endpoint. 
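For reference, the listing endpoint added above is exposed to tests through the `SafekeeperHttpClient.timeline_list()` helper introduced in this patch; the handler iterates `GlobalTimelines::get_all()`, so the result spans every tenant on the safekeeper. A minimal sketch of using it, assuming the standard Neon test fixtures (the test name and the assertion are illustrative, not part of the patch):

```python
from fixtures.neon_fixtures import NeonEnvBuilder


def test_timeline_list_sketch(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()

    # Every safekeeper should report the initial timeline created by init_start().
    for sk in env.safekeepers:
        ttids = sk.http_client().timeline_list()
        assert any(
            ttid.tenant_id == env.initial_tenant and ttid.timeline_id == env.initial_timeline
            for ttid in ttids
        )
```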
+ http_cli = env.safekeepers[0].http_client() + assert len(http_cli.timeline_list()) == 3 + # Check that dead minority doesn't prevent the commits: execute insert n_inserts # times, with fault_probability chance of getting a wal acceptor down or up From b83d722369f1cb1d9a55ab8d39c36f30b0886ea4 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 21 Aug 2024 19:22:47 +0300 Subject: [PATCH 1465/1571] test: fix more flaky due to graceful shutdown (#8787) Going through the list of recent flaky tests, trying to fix those related to graceful shutdown. - test_forward_compatibility: flush and wait for uploads to avoid graceful shutdown - test_layer_bloating: in the end the endpoint and vanilla are still up => immediate shutdown - test_lagging_sk: pageserver shutdown is not related to the test => immediate shutdown - test_lsn_lease_size: pageserver flushing is not needed => immediate shutdown Additionally: - remove `wait_for_upload` usage from workload fixture Cc: #8708 Fixes: #8710 --- test_runner/fixtures/neon_fixtures.py | 14 +++--------- test_runner/fixtures/workload.py | 7 +++--- test_runner/regress/test_compatibility.py | 12 ++++------ test_runner/regress/test_import.py | 5 ++--- test_runner/regress/test_layer_bloating.py | 26 +++++++++++++--------- test_runner/regress/test_tenant_size.py | 3 +++ test_runner/regress/test_wal_acceptor.py | 2 ++ 7 files changed, 34 insertions(+), 35 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 9aa275d343..2bb698f175 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -61,8 +61,6 @@ from fixtures.pageserver.common_types import IndexPartDump, LayerName, parse_lay from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import ( wait_for_last_record_lsn, - wait_for_upload, - wait_for_upload_queue_empty, ) from fixtures.pg_version import PgVersion from fixtures.port_distributor import PortDistributor @@ -5347,9 +5345,7 @@ def last_flush_lsn_upload( for tenant_shard_id, pageserver in shards: ps_http = pageserver.http_client(auth_token=auth_token) wait_for_last_record_lsn(ps_http, tenant_shard_id, timeline_id, last_flush_lsn) - # force a checkpoint to trigger upload - ps_http.timeline_checkpoint(tenant_shard_id, timeline_id) - wait_for_upload(ps_http, tenant_shard_id, timeline_id, last_flush_lsn) + ps_http.timeline_checkpoint(tenant_shard_id, timeline_id, wait_until_uploaded=True) return last_flush_lsn @@ -5434,9 +5430,5 @@ def generate_uploads_and_deletions( # ensures that the pageserver is in a fully idle state: there will be no more # background ingest, no more uploads pending, and therefore no non-determinism # in subsequent actions like pageserver restarts. 
- final_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id, pageserver.id) - ps_http.timeline_checkpoint(tenant_id, timeline_id) - # Finish uploads - wait_for_upload(ps_http, tenant_id, timeline_id, final_lsn) - # Finish all remote writes (including deletions) - wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id) + flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id, pageserver.id) + ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True) diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py index cc93762175..065a78bf9b 100644 --- a/test_runner/fixtures/workload.py +++ b/test_runner/fixtures/workload.py @@ -10,7 +10,7 @@ from fixtures.neon_fixtures import ( tenant_get_shards, wait_for_last_flush_lsn, ) -from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload +from fixtures.pageserver.utils import wait_for_last_record_lsn # neon_local doesn't handle creating/modifying endpoints concurrently, so we use a mutex # to ensure we don't do that: this enables running lots of Workloads in parallel safely. @@ -174,8 +174,9 @@ class Workload: if upload: # Wait for written data to be uploaded to S3 (force a checkpoint to trigger upload) - ps_http.timeline_checkpoint(tenant_shard_id, self.timeline_id) - wait_for_upload(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn) + ps_http.timeline_checkpoint( + tenant_shard_id, self.timeline_id, wait_until_uploaded=True + ) log.info(f"Churn: waiting for remote LSN {last_flush_lsn}") else: log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}") diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 30ff40b7df..de27191945 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -9,14 +9,12 @@ from typing import List, Optional import pytest import toml -from fixtures.common_types import Lsn, TenantId, TimelineId +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin, flush_ep_to_pageserver from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import ( timeline_delete_wait_completed, - wait_for_last_record_lsn, - wait_for_upload, ) from fixtures.pg_version import PgVersion from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage @@ -122,11 +120,9 @@ def test_create_snapshot( timeline_id = dict(snapshot_config["branch_name_mappings"]["main"])[tenant_id] pageserver_http = env.pageserver.http_client() - lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) - wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, lsn) - pageserver_http.timeline_checkpoint(tenant_id, timeline_id) - wait_for_upload(pageserver_http, tenant_id, timeline_id, lsn) + flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id) + pageserver_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True) env.endpoints.stop_all() for sk in env.safekeepers: diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index 4dae9176b8..4385cfca76 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -18,7 +18,6 @@ from fixtures.neon_fixtures import ( from fixtures.pageserver.utils import ( timeline_delete_wait_completed, wait_for_last_record_lsn, - 
wait_for_upload, ) from fixtures.remote_storage import RemoteStorageKind from fixtures.utils import assert_pageserver_backups_equal, subprocess_capture @@ -144,7 +143,7 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build # Wait for data to land in s3 wait_for_last_record_lsn(client, tenant, timeline, Lsn(end_lsn)) - wait_for_upload(client, tenant, timeline, Lsn(end_lsn)) + client.timeline_checkpoint(tenant, timeline, compact=False, wait_until_uploaded=True) # Check it worked endpoint = env.endpoints.create_start(branch_name, tenant_id=tenant) @@ -290,7 +289,7 @@ def _import( # Wait for data to land in s3 wait_for_last_record_lsn(client, tenant, timeline, lsn) - wait_for_upload(client, tenant, timeline, lsn) + client.timeline_checkpoint(tenant, timeline, compact=False, wait_until_uploaded=True) # Check it worked endpoint = env.endpoints.create_start(branch_name, tenant_id=tenant, lsn=lsn) diff --git a/test_runner/regress/test_layer_bloating.py b/test_runner/regress/test_layer_bloating.py index 77dc8a35b5..b8126395fd 100644 --- a/test_runner/regress/test_layer_bloating.py +++ b/test_runner/regress/test_layer_bloating.py @@ -1,27 +1,31 @@ import os -import time import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( - NeonEnv, + NeonEnvBuilder, logical_replication_sync, wait_for_last_flush_lsn, ) from fixtures.pg_version import PgVersion -def test_layer_bloating(neon_simple_env: NeonEnv, vanilla_pg): - env = neon_simple_env - - if env.pg_version != PgVersion.V16: +def test_layer_bloating(neon_env_builder: NeonEnvBuilder, vanilla_pg): + if neon_env_builder.pg_version != PgVersion.V16: pytest.skip("pg_log_standby_snapshot() function is available only in PG16") - timeline = env.neon_cli.create_branch("test_logical_replication", "empty") - endpoint = env.endpoints.create_start( - "test_logical_replication", config_lines=["log_statement=all"] + env = neon_env_builder.init_start( + initial_tenant_conf={ + "gc_period": "0s", + "compaction_period": "0s", + "compaction_threshold": 99999, + "image_creation_threshold": 99999, + } ) + timeline = env.initial_timeline + endpoint = env.endpoints.create_start("main", config_lines=["log_statement=all"]) + pg_conn = endpoint.connect() cur = pg_conn.cursor() @@ -54,7 +58,7 @@ def test_layer_bloating(neon_simple_env: NeonEnv, vanilla_pg): # Wait logical replication to sync logical_replication_sync(vanilla_pg, endpoint) wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, timeline) - time.sleep(10) + env.pageserver.http_client().timeline_checkpoint(env.initial_tenant, timeline, compact=False) # Check layer file sizes timeline_path = f"{env.pageserver.workdir}/tenants/{env.initial_tenant}/timelines/{timeline}/" @@ -63,3 +67,5 @@ def test_layer_bloating(neon_simple_env: NeonEnv, vanilla_pg): if filename.startswith("00000"): log.info(f"layer {filename} size is {os.path.getsize(timeline_path + filename)}") assert os.path.getsize(timeline_path + filename) < 512_000_000 + + env.stop(immediate=True) diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index b1ade77a14..f872116a1c 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -757,6 +757,9 @@ def test_lsn_lease_size(neon_env_builder: NeonEnvBuilder, test_output_dir: Path, assert_size_approx_equal_for_lease_test(lease_res, ro_branch_res) + # we are writing a lot, and flushing all of that to disk is not important for this test + env.stop(immediate=True) + def 
insert_with_action( env: NeonEnv, diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index bb3b16f3e1..19df834b81 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1300,6 +1300,8 @@ def test_lagging_sk(neon_env_builder: NeonEnvBuilder): # Check that WALs are the same. cmp_sk_wal([sk1, sk2, sk3], tenant_id, timeline_id) + env.stop(immediate=True) + # Smaller version of test_one_sk_down testing peer recovery in isolation: that # it works without compute at all. From 99c19cad24b5bb5974403a1e2541fe28ac4c0d53 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Fri, 16 Aug 2024 12:44:12 -0500 Subject: [PATCH 1466/1571] Add compute_receive_lsn metric Useful for dashboarding the replication metrics of a single endpoint. Signed-off-by: Tristan Partin --- vm-image-spec.yaml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 8c1c4512b4..d1cfbda15d 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -272,6 +272,19 @@ files: else (pg_current_wal_lsn() - '0/0')::FLOAT8 end as lsn; + - metric_name: compute_receive_lsn + type: gauge + help: 'Returns the last write-ahead log location that has been received and synced to disk by streaming replication' + key_labels: + values: [lsn] + query: | + SELECT + CASE + WHEN pg_catalog.pg_is_in_recovery() + THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8 + ELSE 0 + END AS lsn; + - metric_name: replication_delay_bytes type: gauge help: 'Bytes between received and replayed LSN' From 04752dfa757472062cb70f0fa1fa2e5ccff89225 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 21 Aug 2024 11:15:18 -0500 Subject: [PATCH 1467/1571] Prefix current_lsn with compute_ --- vm-image-spec.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index d1cfbda15d..622004b931 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -259,7 +259,7 @@ files: from (values ('5m'),('15m'),('1h')) as t (x); - - metric_name: current_lsn + - metric_name: compute_current_lsn type: gauge help: 'Current LSN of the database' key_labels: From 07b7c63975fbfaf60f28176b275c4d57e28a8e04 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 21 Aug 2024 21:26:27 +0300 Subject: [PATCH 1468/1571] test: avoid some too long shutdowns by flushing before shutdown (#8772) After #8655, we needed to mark some tests to shut down immediately. To aid these tests, try the new pattern of `flush_ep_to_pageserver` followed by a non-compacting checkpoint. This moves the general graceful shutdown problem of having too much to flush at shutdown into the test. Also, add logging for how long the graceful shutdown took, if we got to complete it for faster log eyeballing. 
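The `flush_ep_to_pageserver` plus non-compacting checkpoint pattern mentioned above looks roughly like this inside a test, sketched against the Neon test fixtures (the table and row count are illustrative):

```python
from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver


def test_flush_before_shutdown_sketch(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()
    endpoint = env.endpoints.create_start("main")

    endpoint.safe_psql("CREATE TABLE t AS SELECT generate_series(1, 100000) AS i")

    # Push all WAL produced by the endpoint through the pageserver, then do a
    # non-compacting checkpoint that also waits for the remote uploads to finish.
    flush_ep_to_pageserver(env, endpoint, env.initial_tenant, env.initial_timeline)
    env.pageserver.http_client().timeline_checkpoint(
        env.initial_tenant, env.initial_timeline, compact=False, wait_until_uploaded=True
    )
```

With the data already flushed and uploaded, the pageserver's graceful shutdown at the end of the test has nothing left to do, which is what removes the need to fall back to `env.stop(immediate=True)` in most tests.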
Fixes: #8712 Cc: #8715, #8708 --- pageserver/src/lib.rs | 7 +++++- .../pagebench/test_ondemand_download_churn.py | 17 +++++++------- test_runner/performance/test_layer_map.py | 23 +++++++++++-------- test_runner/regress/test_combocid.py | 20 +++++++++++++--- 4 files changed, 45 insertions(+), 22 deletions(-) diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 5829a1c188..dbfc9f3544 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -88,6 +88,8 @@ pub async fn shutdown_pageserver( ) { use std::time::Duration; + let started_at = std::time::Instant::now(); + // If the orderly shutdown below takes too long, we still want to make // sure that all walredo processes are killed and wait()ed on by us, not systemd. // @@ -241,7 +243,10 @@ pub async fn shutdown_pageserver( walredo_extraordinary_shutdown_thread.join().unwrap(); info!("walredo_extraordinary_shutdown_thread done"); - info!("Shut down successfully completed"); + info!( + elapsed_ms = started_at.elapsed().as_millis(), + "Shut down successfully completed" + ); std::process::exit(exit_code); } diff --git a/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py b/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py index 0348b08f04..9ad6e7907c 100644 --- a/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py +++ b/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py @@ -5,8 +5,12 @@ from typing import Any, Dict, Tuple import pytest from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin, wait_for_last_flush_lsn -from fixtures.pageserver.utils import wait_for_upload_queue_empty +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + PgBin, + flush_ep_to_pageserver, +) from fixtures.remote_storage import s3_storage from fixtures.utils import humantime_to_ms @@ -62,9 +66,6 @@ def test_download_churn( run_benchmark(env, pg_bin, record, io_engine, concurrency_per_target, duration) - # see https://github.com/neondatabase/neon/issues/8712 - env.stop(immediate=True) - def setup_env(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): remote_storage_kind = s3_storage() @@ -98,9 +99,9 @@ def setup_env(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): f"INSERT INTO data SELECT lpad(i::text, {bytes_per_row}, '0') FROM generate_series(1, {int(nrows)}) as i", options="-c statement_timeout=0", ) - wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id) - # TODO: this is a bit imprecise, there could be frozen layers being written out that we don't observe here - wait_for_upload_queue_empty(client, tenant_id, timeline_id) + flush_ep_to_pageserver(env, ep, tenant_id, timeline_id) + + client.timeline_checkpoint(tenant_id, timeline_id, compact=False, wait_until_uploaded=True) return env diff --git a/test_runner/performance/test_layer_map.py b/test_runner/performance/test_layer_map.py index 890b70b9fc..bc6d9de346 100644 --- a/test_runner/performance/test_layer_map.py +++ b/test_runner/performance/test_layer_map.py @@ -1,20 +1,21 @@ import time -from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver -# -# Benchmark searching the layer map, when there are a lot of small layer files. 
-# def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark): - env = neon_env_builder.init_start() + """Benchmark searching the layer map, when there are a lot of small layer files.""" + + env = neon_env_builder.init_configs() n_iters = 10 n_records = 100000 + env.start() + # We want to have a lot of lot of layer files to exercise the layer map. Disable # GC, and make checkpoint_distance very small, so that we get a lot of small layer # files. - tenant, _ = env.neon_cli.create_tenant( + tenant, timeline = env.neon_cli.create_tenant( conf={ "gc_period": "0s", "checkpoint_distance": "16384", @@ -24,8 +25,7 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark): } ) - env.neon_cli.create_timeline("test_layer_map", tenant_id=tenant) - endpoint = env.endpoints.create_start("test_layer_map", tenant_id=tenant) + endpoint = env.endpoints.create_start("main", tenant_id=tenant) cur = endpoint.connect().cursor() cur.execute("create table t(x integer)") for _ in range(n_iters): @@ -33,9 +33,12 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark): time.sleep(1) cur.execute("vacuum t") + with zenbenchmark.record_duration("test_query"): cur.execute("SELECT count(*) from t") assert cur.fetchone() == (n_iters * n_records,) - # see https://github.com/neondatabase/neon/issues/8712 - env.stop(immediate=True) + flush_ep_to_pageserver(env, endpoint, tenant, timeline) + env.pageserver.http_client().timeline_checkpoint( + tenant, timeline, compact=False, wait_until_uploaded=True + ) diff --git a/test_runner/regress/test_combocid.py b/test_runner/regress/test_combocid.py index 6d2567b7ee..41907b1f20 100644 --- a/test_runner/regress/test_combocid.py +++ b/test_runner/regress/test_combocid.py @@ -1,4 +1,4 @@ -from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver def do_combocid_op(neon_env_builder: NeonEnvBuilder, op): @@ -34,7 +34,7 @@ def do_combocid_op(neon_env_builder: NeonEnvBuilder, op): # Clear the cache, so that we exercise reconstructing the pages # from WAL - cur.execute("SELECT clear_buffer_cache()") + endpoint.clear_shared_buffers() # Check that the cursor opened earlier still works. If the # combocids are not restored correctly, it won't. @@ -43,6 +43,10 @@ def do_combocid_op(neon_env_builder: NeonEnvBuilder, op): assert len(rows) == 500 cur.execute("rollback") + flush_ep_to_pageserver(env, endpoint, env.initial_tenant, env.initial_timeline) + env.pageserver.http_client().timeline_checkpoint( + env.initial_tenant, env.initial_timeline, compact=False, wait_until_uploaded=True + ) def test_combocid_delete(neon_env_builder: NeonEnvBuilder): @@ -92,7 +96,7 @@ def test_combocid_multi_insert(neon_env_builder: NeonEnvBuilder): cur.execute("delete from t") # Clear the cache, so that we exercise reconstructing the pages # from WAL - cur.execute("SELECT clear_buffer_cache()") + endpoint.clear_shared_buffers() # Check that the cursor opened earlier still works. If the # combocids are not restored correctly, it won't. 
@@ -102,6 +106,11 @@ def test_combocid_multi_insert(neon_env_builder: NeonEnvBuilder): cur.execute("rollback") + flush_ep_to_pageserver(env, endpoint, env.initial_tenant, env.initial_timeline) + env.pageserver.http_client().timeline_checkpoint( + env.initial_tenant, env.initial_timeline, compact=False, wait_until_uploaded=True + ) + def test_combocid(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() @@ -137,3 +146,8 @@ def test_combocid(neon_env_builder: NeonEnvBuilder): assert cur.rowcount == n_records cur.execute("rollback") + + flush_ep_to_pageserver(env, endpoint, env.initial_tenant, env.initial_timeline) + env.pageserver.http_client().timeline_checkpoint( + env.initial_tenant, env.initial_timeline, compact=False, wait_until_uploaded=True + ) From a968554a8c36c2accf17c5a1f2f23c2bc2f2ec47 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 21 Aug 2024 16:25:21 -0400 Subject: [PATCH 1469/1571] fix(pageserver): unify initdb optimization for sparse keyspaces; fix force img generation (#8776) close https://github.com/neondatabase/neon/issues/8558 * Directly generate image layers for sparse keyspaces during initdb optimization. * Support force image layer generation for sparse keyspaces. * Fix a bug of incorrect image layer key range in case of duplicated keys. (The added line: `start = img_range.end;`) This can cause overlapping image layers and keys to disappear. --------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline.rs | 64 ++++++++++++------------------- 1 file changed, 24 insertions(+), 40 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 9732cf8b50..80e3843021 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3589,34 +3589,6 @@ impl Timeline { return Err(FlushLayerError::Cancelled); } - // FIXME(auxfilesv2): support multiple metadata key partitions might need initdb support as well? - // This code path will not be hit during regression tests. After #7099 we have a single partition - // with two key ranges. If someone wants to fix initdb optimization in the future, this might need - // to be fixed. - - // For metadata, always create delta layers. - let delta_layer = if !metadata_partition.parts.is_empty() { - assert_eq!( - metadata_partition.parts.len(), - 1, - "currently sparse keyspace should only contain a single metadata keyspace" - ); - let metadata_keyspace = &metadata_partition.parts[0]; - self.create_delta_layer( - &frozen_layer, - Some( - metadata_keyspace.0.ranges.first().unwrap().start - ..metadata_keyspace.0.ranges.last().unwrap().end, - ), - ctx, - ) - .await - .map_err(|e| FlushLayerError::from_anyhow(self, e))? - } else { - None - }; - - // For image layers, we add them immediately into the layer map. 
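The `start = img_range.end;` fix called out in the commit message above restores a simple loop invariant in `create_image_layers`: each iteration covers the key range `[start, partition_end)` and must advance `start` past that range on every path, including the paths that skip layer creation. A sketch of the invariant (function and parameter names are illustrative, not the real API):

```python
def image_ranges(first_key, partition_ends, should_skip):
    """Yield non-overlapping [start, end) key ranges for image layers."""
    ranges = []
    start = first_key
    for end in partition_ends:
        img_range = (start, end)
        if not should_skip(img_range):
            ranges.append(img_range)
        # Advance on every path. The bug was a skip path ("layer already exists")
        # that issued `continue` without doing this, so the next iteration
        # re-covered the same keys and produced overlapping image layers.
        start = end
    return ranges
```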
let mut layers_to_upload = Vec::new(); layers_to_upload.extend( self.create_image_layers( @@ -3627,13 +3599,27 @@ impl Timeline { ) .await?, ); - - if let Some(delta_layer) = delta_layer { - layers_to_upload.push(delta_layer.clone()); - (layers_to_upload, Some(delta_layer)) - } else { - (layers_to_upload, None) + if !metadata_partition.parts.is_empty() { + assert_eq!( + metadata_partition.parts.len(), + 1, + "currently sparse keyspace should only contain a single metadata keyspace" + ); + layers_to_upload.extend( + self.create_image_layers( + // Safety: create_image_layers treat sparse keyspaces differently that it does not scan + // every single key within the keyspace, and therefore, it's safe to force converting it + // into a dense keyspace before calling this function. + &metadata_partition.into_dense(), + self.initdb_lsn, + ImageLayerCreationMode::Initial, + ctx, + ) + .await?, + ); } + + (layers_to_upload, None) } else { // Normal case, write out a L0 delta layer file. // `create_delta_layer` will not modify the layer map. @@ -4043,8 +4029,6 @@ impl Timeline { mode: ImageLayerCreationMode, start: Key, ) -> Result { - assert!(!matches!(mode, ImageLayerCreationMode::Initial)); - // Metadata keys image layer creation. let mut reconstruct_state = ValuesReconstructState::default(); let data = self @@ -4210,15 +4194,13 @@ impl Timeline { "metadata keys must be partitioned separately" ); } - if mode == ImageLayerCreationMode::Initial { - return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers"))); - } if mode == ImageLayerCreationMode::Try && !check_for_image_layers { // Skip compaction if there are not enough updates. Metadata compaction will do a scan and // might mess up with evictions. start = img_range.end; continue; } + // For initial and force modes, we always generate image layers for metadata keys. } else if let ImageLayerCreationMode::Try = mode { // check_for_image_layers = false -> skip // check_for_image_layers = true -> check time_for_new_image_layer -> skip/generate @@ -4226,7 +4208,8 @@ impl Timeline { start = img_range.end; continue; } - } else if let ImageLayerCreationMode::Force = mode { + } + if let ImageLayerCreationMode::Force = mode { // When forced to create image layers, we might try and create them where they already // exist. This mode is only used in tests/debug. let layers = self.layers.read().await; @@ -4240,6 +4223,7 @@ impl Timeline { img_range.start, img_range.end ); + start = img_range.end; continue; } } From 7c74112b2a6e23c07bfd9cc62c240cd6bbdd3bd9 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 22 Aug 2024 11:04:42 +0100 Subject: [PATCH 1470/1571] pageserver: batch InMemoryLayer `put`s, remove need to sort items by LSN during ingest (#8591) ## Problem/Solution TimelineWriter::put_batch is simply a loop over individual puts. Each put acquires and releases locks, and checks for potentially starting a new layer. Batching these is more efficient, but more importantly unlocks future changes where we can pre-build serialized buffers much earlier in the ingest process, potentially even on the safekeeper (imagine a future model where some variant of DatadirModification lives on the safekeeper). Ensuring that the values in put_batch are written to one layer also enables a simplification upstream, where we no longer need to write values in LSN-order. 
This saves us a sort, but also simplifies follow-on refactors to DatadirModification: we can store metadata keys and data keys separately at that level without needing to zip them together in LSN order later. ## Why? In this PR, these changes are simplify optimizations, but they are motivated by evolving the ingest path in the direction of disentangling extracting DatadirModification from Timeline. It may not obvious how right now, but the general idea is that we'll end up with three phases of ingest: - A) Decode walrecords and build a datadirmodification with all the simple data contents already in a big serialized buffer ready to write to an ephemeral layer **<-- this part can be pipelined and parallelized, and done on a safekeeper!** - B) Let that datadirmodification see a Timeline, so that it can also generate all the metadata updates that require a read-modify-write of existing pages - C) Dump the results of B into an ephemeral layer. Related: https://github.com/neondatabase/neon/issues/8452 ## Caveats Doing a big monolithic buffer of values to write to disk is ordinarily an anti-pattern: we prefer nice streaming I/O. However: - In future, when we do this first decode stage on the safekeeper, it would be inefficient to serialize a Vec of Value, and then later deserialize it just to add blob size headers while writing into the ephemeral layer format. The idea is that for bulk write data, we will serialize exactly once. - The monolithic buffer is a stepping stone to pipelining more of this: by seriailizing earlier (rather than at the final put_value), we will be able to parallelize the wal decoding and bulk serialization of data page writes. - The ephemeral layer's buffered writer already stalls writes while it waits to flush: so while yes we'll stall for a couple milliseconds to write a couple megabytes, we already have stalls like this, just distributed across smaller writes. ## Benchmarks This PR is primarily a stepping stone to safekeeper ingest filtering, but also provides a modest efficiency improvement to the `wal_recovery` part of `test_bulk_ingest`. test_bulk_ingest: ``` test_bulk_insert[neon-release-pg16].insert: 23.659 s test_bulk_insert[neon-release-pg16].pageserver_writes: 5,428 MB test_bulk_insert[neon-release-pg16].peak_mem: 626 MB test_bulk_insert[neon-release-pg16].size: 0 MB test_bulk_insert[neon-release-pg16].data_uploaded: 1,922 MB test_bulk_insert[neon-release-pg16].num_files_uploaded: 8 test_bulk_insert[neon-release-pg16].wal_written: 1,382 MB test_bulk_insert[neon-release-pg16].wal_recovery: 18.981 s test_bulk_insert[neon-release-pg16].compaction: 0.055 s vs. 
tip of main: test_bulk_insert[neon-release-pg16].insert: 24.001 s test_bulk_insert[neon-release-pg16].pageserver_writes: 5,428 MB test_bulk_insert[neon-release-pg16].peak_mem: 604 MB test_bulk_insert[neon-release-pg16].size: 0 MB test_bulk_insert[neon-release-pg16].data_uploaded: 1,922 MB test_bulk_insert[neon-release-pg16].num_files_uploaded: 8 test_bulk_insert[neon-release-pg16].wal_written: 1,382 MB test_bulk_insert[neon-release-pg16].wal_recovery: 23.586 s test_bulk_insert[neon-release-pg16].compaction: 0.054 s ``` --- pageserver/benches/bench_ingest.rs | 19 ++- pageserver/src/pgdatadir_mapping.rs | 70 +++++++--- pageserver/src/tenant/ephemeral_file.rs | 35 +++-- pageserver/src/tenant/storage_layer.rs | 2 +- .../tenant/storage_layer/inmemory_layer.rs | 131 +++++++++++++----- pageserver/src/tenant/timeline.rs | 101 +++++++------- .../walreceiver/walreceiver_connection.rs | 9 +- 7 files changed, 247 insertions(+), 120 deletions(-) diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index 0336302de0..bd99f5289d 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -10,6 +10,7 @@ use pageserver::{ page_cache, repository::Value, task_mgr::TaskKind, + tenant::storage_layer::inmemory_layer::SerializedBatch, tenant::storage_layer::InMemoryLayer, virtual_file, }; @@ -67,12 +68,16 @@ async fn ingest( let layer = InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, entered, &ctx).await?; - let data = Value::Image(Bytes::from(vec![0u8; put_size])).ser()?; + let data = Value::Image(Bytes::from(vec![0u8; put_size])); + let data_ser_size = data.serialized_size().unwrap() as usize; let ctx = RequestContext::new( pageserver::task_mgr::TaskKind::WalReceiverConnectionHandler, pageserver::context::DownloadBehavior::Download, ); + const BATCH_SIZE: usize = 16; + let mut batch = Vec::new(); + for i in 0..put_count { lsn += put_size as u64; @@ -95,7 +100,17 @@ async fn ingest( } } - layer.put_value(key.to_compact(), lsn, &data, &ctx).await?; + batch.push((key.to_compact(), lsn, data_ser_size, data.clone())); + if batch.len() >= BATCH_SIZE { + let this_batch = std::mem::take(&mut batch); + let serialized = SerializedBatch::from_values(this_batch); + layer.put_batch(serialized, &ctx).await?; + } + } + if !batch.is_empty() { + let this_batch = std::mem::take(&mut batch); + let serialized = SerializedBatch::from_values(this_batch); + layer.put_batch(serialized, &ctx).await?; } layer.freeze(lsn + 1).await; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 4f7eb1a00c..d6e0b82e1d 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -15,12 +15,11 @@ use crate::{aux_file, repository::*}; use anyhow::{ensure, Context}; use bytes::{Buf, Bytes, BytesMut}; use enum_map::Enum; -use itertools::Itertools; use pageserver_api::key::{ dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key, relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key, slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range, - AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY, + CompactKey, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY, }; use pageserver_api::keyspace::SparseKeySpace; use pageserver_api::models::AuxFilePolicy; @@ -37,7 +36,6 @@ use tokio_util::sync::CancellationToken; use tracing::{debug, info, trace, warn}; use 
utils::bin_ser::DeserializeError; use utils::pausable_failpoint; -use utils::vec_map::{VecMap, VecMapOrdering}; use utils::{bin_ser::BeSer, lsn::Lsn}; /// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached. @@ -174,6 +172,7 @@ impl Timeline { pending_deletions: Vec::new(), pending_nblocks: 0, pending_directory_entries: Vec::new(), + pending_bytes: 0, lsn, } } @@ -1022,21 +1021,33 @@ pub struct DatadirModification<'a> { // The put-functions add the modifications here, and they are flushed to the // underlying key-value store by the 'finish' function. pending_lsns: Vec, - pending_updates: HashMap>, + pending_updates: HashMap>, pending_deletions: Vec<(Range, Lsn)>, pending_nblocks: i64, /// For special "directory" keys that store key-value maps, track the size of the map /// if it was updated in this modification. pending_directory_entries: Vec<(DirectoryKind, usize)>, + + /// An **approximation** of how large our EphemeralFile write will be when committed. + pending_bytes: usize, } impl<'a> DatadirModification<'a> { + // When a DatadirModification is committed, we do a monolithic serialization of all its contents. WAL records can + // contain multiple pages, so the pageserver's record-based batch size isn't sufficient to bound this allocation: we + // additionally specify a limit on how much payload a DatadirModification may contain before it should be committed. + pub(crate) const MAX_PENDING_BYTES: usize = 8 * 1024 * 1024; + /// Get the current lsn pub(crate) fn get_lsn(&self) -> Lsn { self.lsn } + pub(crate) fn approx_pending_bytes(&self) -> usize { + self.pending_bytes + } + /// Set the current lsn pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> { ensure!( @@ -1769,21 +1780,25 @@ impl<'a> DatadirModification<'a> { // Flush relation and SLRU data blocks, keep metadata. let mut retained_pending_updates = HashMap::<_, Vec<_>>::new(); for (key, values) in self.pending_updates.drain() { - for (lsn, value) in values { + let mut write_batch = Vec::new(); + for (lsn, value_ser_size, value) in values { if key.is_rel_block_key() || key.is_slru_block_key() { // This bails out on first error without modifying pending_updates. // That's Ok, cf this function's doc comment. - writer.put(key, lsn, &value, ctx).await?; + write_batch.push((key.to_compact(), lsn, value_ser_size, value)); } else { - retained_pending_updates - .entry(key) - .or_default() - .push((lsn, value)); + retained_pending_updates.entry(key).or_default().push(( + lsn, + value_ser_size, + value, + )); } } + writer.put_batch(write_batch, ctx).await?; } self.pending_updates = retained_pending_updates; + self.pending_bytes = 0; if pending_nblocks != 0 { writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ)); @@ -1809,17 +1824,20 @@ impl<'a> DatadirModification<'a> { self.pending_nblocks = 0; if !self.pending_updates.is_empty() { - // The put_batch call below expects expects the inputs to be sorted by Lsn, - // so we do that first. - let lsn_ordered_batch: VecMap = VecMap::from_iter( - self.pending_updates - .drain() - .map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (lsn, (key, val)))) - .kmerge_by(|lhs, rhs| lhs.0 < rhs.0), - VecMapOrdering::GreaterOrEqual, - ); + // Ordering: the items in this batch do not need to be in any global order, but values for + // a particular Key must be in Lsn order relative to one another. InMemoryLayer relies on + // this to do efficient updates to its index. 
+ let batch: Vec<(CompactKey, Lsn, usize, Value)> = self + .pending_updates + .drain() + .flat_map(|(key, values)| { + values.into_iter().map(move |(lsn, val_ser_size, value)| { + (key.to_compact(), lsn, val_ser_size, value) + }) + }) + .collect::>(); - writer.put_batch(lsn_ordered_batch, ctx).await?; + writer.put_batch(batch, ctx).await?; } if !self.pending_deletions.is_empty() { @@ -1844,6 +1862,8 @@ impl<'a> DatadirModification<'a> { writer.update_directory_entries_count(kind, count as u64); } + self.pending_bytes = 0; + Ok(()) } @@ -1860,7 +1880,7 @@ impl<'a> DatadirModification<'a> { // Note: we don't check pending_deletions. It is an error to request a // value that has been removed, deletion only avoids leaking storage. if let Some(values) = self.pending_updates.get(&key) { - if let Some((_, value)) = values.last() { + if let Some((_, _, value)) = values.last() { return if let Value::Image(img) = value { Ok(img.clone()) } else { @@ -1888,13 +1908,17 @@ impl<'a> DatadirModification<'a> { fn put(&mut self, key: Key, val: Value) { let values = self.pending_updates.entry(key).or_default(); // Replace the previous value if it exists at the same lsn - if let Some((last_lsn, last_value)) = values.last_mut() { + if let Some((last_lsn, last_value_ser_size, last_value)) = values.last_mut() { if *last_lsn == self.lsn { + *last_value_ser_size = val.serialized_size().unwrap() as usize; *last_value = val; return; } } - values.push((self.lsn, val)); + + let val_serialized_size = val.serialized_size().unwrap() as usize; + self.pending_bytes += val_serialized_size; + values.push((self.lsn, val_serialized_size, val)); } fn delete(&mut self, key_range: Range) { diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 3eb8384d05..44f0fc7ab1 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -79,6 +79,8 @@ impl EphemeralFile { self.rw.read_blk(blknum, ctx).await } + #[cfg(test)] + // This is a test helper: outside of tests, we are always written to via a pre-serialized batch. pub(crate) async fn write_blob( &mut self, srcbuf: &[u8], @@ -86,17 +88,30 @@ impl EphemeralFile { ) -> Result { let pos = self.rw.bytes_written(); - // Write the length field - if srcbuf.len() < 0x80 { - // short one-byte length header - let len_buf = [srcbuf.len() as u8]; + let mut len_bytes = std::io::Cursor::new(Vec::new()); + crate::tenant::storage_layer::inmemory_layer::SerializedBatch::write_blob_length( + srcbuf.len(), + &mut len_bytes, + ); + let len_bytes = len_bytes.into_inner(); - self.rw.write_all_borrowed(&len_buf, ctx).await?; - } else { - let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32); - len_buf[0] |= 0x80; - self.rw.write_all_borrowed(&len_buf, ctx).await?; - } + // Write the length field + self.rw.write_all_borrowed(&len_bytes, ctx).await?; + + // Write the payload + self.rw.write_all_borrowed(srcbuf, ctx).await?; + + Ok(pos) + } + + /// Returns the offset at which the first byte of the input was written, for use + /// in constructing indices over the written value. 
+ pub(crate) async fn write_raw( + &mut self, + srcbuf: &[u8], + ctx: &RequestContext, + ) -> Result { + let pos = self.rw.bytes_written(); // Write the payload self.rw.write_all_borrowed(srcbuf, ctx).await?; diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 04f89db401..133b34b8b5 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -2,7 +2,7 @@ pub mod delta_layer; pub mod image_layer; -pub(crate) mod inmemory_layer; +pub mod inmemory_layer; pub(crate) mod layer; mod layer_desc; mod layer_name; diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 130d1002a0..a71b4dd83b 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -33,7 +33,7 @@ use std::fmt::Write; use std::ops::Range; use std::sync::atomic::Ordering as AtomicOrdering; use std::sync::atomic::{AtomicU64, AtomicUsize}; -use tokio::sync::{RwLock, RwLockWriteGuard}; +use tokio::sync::RwLock; use super::{ DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState, @@ -320,6 +320,82 @@ impl InMemoryLayer { } } +/// Offset of a particular Value within a serialized batch. +struct SerializedBatchOffset { + key: CompactKey, + lsn: Lsn, + /// offset in bytes from the start of the batch's buffer to the Value's serialized size header. + offset: u64, +} + +pub struct SerializedBatch { + /// Blobs serialized in EphemeralFile's native format, ready for passing to [`EphemeralFile::write_raw`]. + pub(crate) raw: Vec, + + /// Index of values in [`Self::raw`], using offsets relative to the start of the buffer. + offsets: Vec, + + /// The highest LSN of any value in the batch + pub(crate) max_lsn: Lsn, +} + +impl SerializedBatch { + /// Write a blob length in the internal format of the EphemeralFile + pub(crate) fn write_blob_length(len: usize, cursor: &mut std::io::Cursor>) { + use std::io::Write; + + if len < 0x80 { + // short one-byte length header + let len_buf = [len as u8]; + + cursor + .write_all(&len_buf) + .expect("Writing to Vec is infallible"); + } else { + let mut len_buf = u32::to_be_bytes(len as u32); + len_buf[0] |= 0x80; + cursor + .write_all(&len_buf) + .expect("Writing to Vec is infallible"); + } + } + + pub fn from_values(batch: Vec<(CompactKey, Lsn, usize, Value)>) -> Self { + // Pre-allocate a big flat buffer to write into. This should be large but not huge: it is soft-limited in practice by + // [`crate::pgdatadir_mapping::DatadirModification::MAX_PENDING_BYTES`] + let buffer_size = batch.iter().map(|i| i.2).sum::() + 4 * batch.len(); + let mut cursor = std::io::Cursor::new(Vec::::with_capacity(buffer_size)); + + let mut offsets: Vec = Vec::with_capacity(batch.len()); + let mut max_lsn: Lsn = Lsn(0); + for (key, lsn, val_ser_size, val) in batch { + let relative_off = cursor.position(); + + Self::write_blob_length(val_ser_size, &mut cursor); + val.ser_into(&mut cursor) + .expect("Writing into in-memory buffer is infallible"); + + offsets.push(SerializedBatchOffset { + key, + lsn, + offset: relative_off, + }); + max_lsn = std::cmp::max(max_lsn, lsn); + } + + let buffer = cursor.into_inner(); + + // Assert that we didn't do any extra allocations while building buffer. 
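`write_blob_length` (and the `write_blob` test helper above that now delegates to it) writes a two-case length header in front of each value: lengths below 0x80 take a single byte, anything larger takes a 4-byte big-endian word with the top bit set. A standalone sketch of that encoding, written in Python for brevity (not project code; it assumes lengths stay below 2^31):

```python
def encode_blob_len(n: int) -> bytes:
    if n < 0x80:
        return bytes([n])                        # short one-byte header
    assert n < 0x8000_0000                       # must fit in 31 bits
    return (0x8000_0000 | n).to_bytes(4, "big")  # 4-byte header, MSB set


def decode_blob_len(buf: bytes, off: int) -> tuple[int, int]:
    """Return (payload_length, payload_offset) for a header starting at `off`."""
    if buf[off] < 0x80:
        return buf[off], off + 1
    return int.from_bytes(buf[off:off + 4], "big") & 0x7FFF_FFFF, off + 4


assert decode_blob_len(encode_blob_len(0x7F), 0) == (0x7F, 1)
assert decode_blob_len(encode_blob_len(300), 0) == (300, 4)
```

`from_values` sizes its buffer as the sum of the serialized value sizes plus four bytes per value, an upper bound on this header scheme, which is why the `debug_assert!` that follows can check that no reallocation happened while building the batch.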
+ debug_assert!(buffer.len() <= buffer_size); + + Self { + raw: buffer, + offsets, + max_lsn, + } + } +} + fn inmem_layer_display(mut f: impl Write, start_lsn: Lsn, end_lsn: Lsn) -> std::fmt::Result { write!(f, "inmem-{:016X}-{:016X}", start_lsn.0, end_lsn.0) } @@ -380,37 +456,20 @@ impl InMemoryLayer { }) } - // Write operations - - /// Common subroutine of the public put_wal_record() and put_page_image() functions. - /// Adds the page version to the in-memory tree - pub async fn put_value( + // Write path. + pub async fn put_batch( &self, - key: CompactKey, - lsn: Lsn, - buf: &[u8], + serialized_batch: SerializedBatch, ctx: &RequestContext, ) -> Result<()> { let mut inner = self.inner.write().await; self.assert_writable(); - self.put_value_locked(&mut inner, key, lsn, buf, ctx).await - } - async fn put_value_locked( - &self, - locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>, - key: CompactKey, - lsn: Lsn, - buf: &[u8], - ctx: &RequestContext, - ) -> Result<()> { - trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn); - - let off = { - locked_inner + let base_off = { + inner .file - .write_blob( - buf, + .write_raw( + &serialized_batch.raw, &RequestContextBuilder::extend(ctx) .page_content_kind(PageContentKind::InMemoryLayer) .build(), @@ -418,15 +477,23 @@ impl InMemoryLayer { .await? }; - let vec_map = locked_inner.index.entry(key).or_default(); - let old = vec_map.append_or_update_last(lsn, off).unwrap().0; - if old.is_some() { - // We already had an entry for this LSN. That's odd.. - warn!("Key {} at {} already exists", key, lsn); + for SerializedBatchOffset { + key, + lsn, + offset: relative_off, + } in serialized_batch.offsets + { + let off = base_off + relative_off; + let vec_map = inner.index.entry(key).or_default(); + let old = vec_map.append_or_update_last(lsn, off).unwrap().0; + if old.is_some() { + // We already had an entry for this LSN. That's odd.. 
+ warn!("Key {} at {} already exists", key, lsn); + } } - let size = locked_inner.file.len(); - locked_inner.resource_units.maybe_publish_size(size); + let size = inner.file.len(); + inner.resource_units.maybe_publish_size(size); Ok(()) } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 80e3843021..e90f65942f 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -22,8 +22,8 @@ use handle::ShardTimelineId; use once_cell::sync::Lazy; use pageserver_api::{ key::{ - KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE, - NON_INHERITED_SPARSE_RANGE, + CompactKey, KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, + NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE, }, keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning}, models::{ @@ -44,10 +44,8 @@ use tokio::{ use tokio_util::sync::CancellationToken; use tracing::*; use utils::{ - bin_ser::BeSer, fs_ext, pausable_failpoint, sync::gate::{Gate, GateGuard}, - vec_map::VecMap, }; use std::pin::pin; @@ -137,7 +135,10 @@ use self::layer_manager::LayerManager; use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; -use super::{config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized}; +use super::{ + config::TenantConf, storage_layer::inmemory_layer, storage_layer::LayerVisibilityHint, + upload_queue::NotInitialized, +}; use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; use super::{ @@ -5574,44 +5575,6 @@ enum OpenLayerAction { } impl<'a> TimelineWriter<'a> { - /// Put a new page version that can be constructed from a WAL record - /// - /// This will implicitly extend the relation, if the page is beyond the - /// current end-of-file. - pub(crate) async fn put( - &mut self, - key: Key, - lsn: Lsn, - value: &Value, - ctx: &RequestContext, - ) -> anyhow::Result<()> { - // Avoid doing allocations for "small" values. - // In the regression test suite, the limit of 256 avoided allocations in 95% of cases: - // https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061 - let mut buf = smallvec::SmallVec::<[u8; 256]>::new(); - value.ser_into(&mut buf)?; - let buf_size: u64 = buf.len().try_into().expect("oversized value buf"); - - let action = self.get_open_layer_action(lsn, buf_size); - let layer = self.handle_open_layer_action(lsn, action, ctx).await?; - let res = layer.put_value(key.to_compact(), lsn, &buf, ctx).await; - - if res.is_ok() { - // Update the current size only when the entire write was ok. - // In case of failures, we may have had partial writes which - // render the size tracking out of sync. That's ok because - // the checkpoint distance should be significantly smaller - // than the S3 single shot upload limit of 5GiB. - let state = self.write_guard.as_mut().unwrap(); - - state.current_size += buf_size; - state.prev_lsn = Some(lsn); - state.max_lsn = std::cmp::max(state.max_lsn, Some(lsn)); - } - - res - } - async fn handle_open_layer_action( &mut self, at: Lsn, @@ -5717,18 +5680,58 @@ impl<'a> TimelineWriter<'a> { } /// Put a batch of keys at the specified Lsns. - /// - /// The batch is sorted by Lsn (enforced by usage of [`utils::vec_map::VecMap`]. 
pub(crate) async fn put_batch( &mut self, - batch: VecMap, + batch: Vec<(CompactKey, Lsn, usize, Value)>, ctx: &RequestContext, ) -> anyhow::Result<()> { - for (lsn, (key, val)) in batch { - self.put(key, lsn, &val, ctx).await? + if batch.is_empty() { + return Ok(()); } - Ok(()) + let serialized_batch = inmemory_layer::SerializedBatch::from_values(batch); + let batch_max_lsn = serialized_batch.max_lsn; + let buf_size: u64 = serialized_batch.raw.len() as u64; + + let action = self.get_open_layer_action(batch_max_lsn, buf_size); + let layer = self + .handle_open_layer_action(batch_max_lsn, action, ctx) + .await?; + + let res = layer.put_batch(serialized_batch, ctx).await; + + if res.is_ok() { + // Update the current size only when the entire write was ok. + // In case of failures, we may have had partial writes which + // render the size tracking out of sync. That's ok because + // the checkpoint distance should be significantly smaller + // than the S3 single shot upload limit of 5GiB. + let state = self.write_guard.as_mut().unwrap(); + + state.current_size += buf_size; + state.prev_lsn = Some(batch_max_lsn); + state.max_lsn = std::cmp::max(state.max_lsn, Some(batch_max_lsn)); + } + + res + } + + #[cfg(test)] + /// Test helper, for tests that would like to poke individual values without composing a batch + pub(crate) async fn put( + &mut self, + key: Key, + lsn: Lsn, + value: &Value, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + use utils::bin_ser::BeSer; + let val_ser_size = value.serialized_size().unwrap() as usize; + self.put_batch( + vec![(key.to_compact(), lsn, val_ser_size, value.clone())], + ctx, + ) + .await } pub(crate) async fn delete_batch( diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index b5c577af72..0114473eda 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -27,8 +27,8 @@ use super::TaskStateUpdate; use crate::{ context::RequestContext, metrics::{LIVE_CONNECTIONS, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST}, - task_mgr::TaskKind, - task_mgr::WALRECEIVER_RUNTIME, + pgdatadir_mapping::DatadirModification, + task_mgr::{TaskKind, WALRECEIVER_RUNTIME}, tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo}, walingest::WalIngest, walrecord::DecodedWALRecord, @@ -345,7 +345,10 @@ pub(super) async fn handle_walreceiver_connection( // Commit every ingest_batch_size records. Even if we filtered out // all records, we still need to call commit to advance the LSN. 
uncommitted_records += 1; - if uncommitted_records >= ingest_batch_size { + if uncommitted_records >= ingest_batch_size + || modification.approx_pending_bytes() + > DatadirModification::MAX_PENDING_BYTES + { WAL_INGEST .records_committed .inc_by(uncommitted_records - filtered_records); From d645645fab662df28ffb41dde18ca1963c237532 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 22 Aug 2024 12:45:29 +0200 Subject: [PATCH 1471/1571] Sleep in test_scrubber_physical_gc (#8798) This copies a piece of code from `test_scrubber_physical_gc_ancestors` to fix a source of flakiness: later on we rely on stuff being older than a second, but the test can run faster under optimal conditions (as happened to me locally, but also obvservable in [this](https://neon-github-public-dev.s3.amazonaws.com/reports/main/10470762360/index.html#testresult/f713b02657db4b4c/retries) allure report): ``` test_runner/regress/test_storage_scrubber.py:169: in test_scrubber_physical_gc assert gc_summary["remote_storage_errors"] == 0 E assert 1 == 0 ``` --- test_runner/regress/test_storage_scrubber.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 2844d1b1d2..292a9a1010 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -152,6 +152,9 @@ def test_scrubber_physical_gc(neon_env_builder: NeonEnvBuilder, shard_count: Opt # This write includes remote upload, will generate an index in this generation workload.write_rows(1) + # We will use a min_age_secs=1 threshold for deletion, let it pass + time.sleep(2) + # With a high min_age, the scrubber should decline to delete anything gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=3600) assert gc_summary["remote_storage_errors"] == 0 From 0e6c0d47a5d29e151d1a8013e627998df8772f6f Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Thu, 22 Aug 2024 12:52:36 +0200 Subject: [PATCH 1472/1571] Revert "Use sycnhronous commit for logical replicaiton worker (#8645)" (#8792) This reverts commit cbe8c77997aea576a96a7f8d31147cb7a11d6a6b. This change was originally made to test a hypothesis, but after that, the proper fix #8669 was merged, so now it's not needed. Moreover, the test is still flaky, so probably this bug was not a reason of the flakiness. Related to #8097 --- test_runner/regress/test_subscriber_restart.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/test_runner/regress/test_subscriber_restart.py b/test_runner/regress/test_subscriber_restart.py index 4581008022..91caad7220 100644 --- a/test_runner/regress/test_subscriber_restart.py +++ b/test_runner/regress/test_subscriber_restart.py @@ -37,9 +37,7 @@ def test_subscriber_restart(neon_simple_env: NeonEnv): scur.execute("CREATE TABLE t (pk integer primary key, sk integer)") # scur.execute("CREATE INDEX on t(sk)") # slowdown applying WAL at replica pub_conn = f"host=localhost port={pub.pg_port} dbname=postgres user=cloud_admin" - # synchronous_commit=on to test a hypothesis for why this test has been flaky. 
- # XXX: Add link to the issue - query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_conn}' PUBLICATION pub with (synchronous_commit=on)" + query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_conn}' PUBLICATION pub" scur.execute(query) time.sleep(2) # let initial table sync complete From 1a9d559be8a77e7d8375c10238e4e4c0e76a40f7 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Thu, 22 Aug 2024 13:29:05 +0200 Subject: [PATCH 1473/1571] proxy: Enable stricter/pedantic clippy checks (#8775) Create a list of currently allowed exceptions that should be reduced over time. --- proxy/src/lib.rs | 90 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 89 insertions(+), 1 deletion(-) diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index ea92eaaa55..b7d497ebcc 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -1,4 +1,92 @@ -#![deny(clippy::undocumented_unsafe_blocks)] +// rustc lints/lint groups +// https://doc.rust-lang.org/rustc/lints/groups.html +#![deny( + deprecated, + future_incompatible, + // TODO: consider let_underscore + nonstandard_style, + rust_2024_compatibility +)] +#![warn(clippy::all, clippy::pedantic, clippy::cargo)] +// List of denied lints from the clippy::restriction group. +// https://rust-lang.github.io/rust-clippy/master/index.html#?groups=restriction +#![warn( + clippy::undocumented_unsafe_blocks, + clippy::dbg_macro, + clippy::empty_enum_variants_with_brackets, + clippy::exit, + clippy::float_cmp_const, + clippy::lossy_float_literal, + clippy::macro_use_imports, + clippy::manual_ok_or, + // TODO: consider clippy::map_err_ignore + // TODO: consider clippy::mem_forget + clippy::rc_mutex, + clippy::rest_pat_in_fully_bound_structs, + clippy::string_add, + clippy::string_to_string, + clippy::todo, + // TODO: consider clippy::unimplemented + // TODO: consider clippy::unwrap_used +)] +// List of permanently allowed lints. +#![allow( + // It's ok to cast u8 to bool, etc. + clippy::cast_lossless, +)] +// List of temporarily allowed lints. +// TODO: Switch to except() once stable with 1.81. +// TODO: fix code and reduce list or move to permanent list above. 
+#![allow( + clippy::cargo_common_metadata, + clippy::cast_possible_truncation, + clippy::cast_possible_wrap, + clippy::cast_precision_loss, + clippy::cast_sign_loss, + clippy::default_trait_access, + clippy::doc_markdown, + clippy::explicit_iter_loop, + clippy::float_cmp, + clippy::if_not_else, + clippy::ignored_unit_patterns, + clippy::implicit_hasher, + clippy::inconsistent_struct_constructor, + clippy::inline_always, + clippy::items_after_statements, + clippy::manual_assert, + clippy::manual_let_else, + clippy::manual_string_new, + clippy::match_bool, + clippy::match_same_arms, + clippy::match_wild_err_arm, + clippy::missing_errors_doc, + clippy::missing_panics_doc, + clippy::module_name_repetitions, + clippy::multiple_crate_versions, + clippy::must_use_candidate, + clippy::needless_for_each, + clippy::needless_pass_by_value, + clippy::needless_raw_string_hashes, + clippy::option_as_ref_cloned, + clippy::redundant_closure_for_method_calls, + clippy::redundant_else, + clippy::return_self_not_must_use, + clippy::similar_names, + clippy::single_char_pattern, + clippy::single_match_else, + clippy::struct_excessive_bools, + clippy::struct_field_names, + clippy::too_many_lines, + clippy::uninlined_format_args, + clippy::unnested_or_patterns, + clippy::unreadable_literal, + clippy::unused_async, + clippy::unused_self, + clippy::used_underscore_binding, + clippy::wildcard_imports +)] +// List of temporarily allowed lints to unblock beta/nightly. +#![allow(unknown_lints, clippy::manual_inspect)] use std::convert::Infallible; From b1c457898b7af111cd59d3a8c2d3bde5bae5085e Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 22 Aug 2024 18:38:03 +0300 Subject: [PATCH 1474/1571] test_compatibility: flush in the end (#8804) `test_forward_compatibility` is still often failing at graceful shutdown. Fix this by explicit flush before shutdown. 
Example: https://neon-github-public-dev.s3.amazonaws.com/reports/main/10506613738/index.html#testresult/5e7111907f7ecfb2/ Cc: #8655 and #8708 Previous attempt: #8787 --- test_runner/regress/test_compatibility.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index de27191945..c361efe90a 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -11,7 +11,12 @@ import pytest import toml from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin, flush_ep_to_pageserver +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + PgBin, + flush_ep_to_pageserver, +) from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import ( timeline_delete_wait_completed, @@ -296,7 +301,7 @@ def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, r pg_version = env.pg_version # Stop endpoint while we recreate timeline - ep.stop() + flush_ep_to_pageserver(env, ep, tenant_id, timeline_id) try: pageserver_http.timeline_preserve_initdb_archive(tenant_id, timeline_id) @@ -344,6 +349,11 @@ def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, r assert not dump_from_wal_differs, "dump from WAL differs" assert not initial_dump_differs, "initial dump differs" + flush_ep_to_pageserver(env, ep, tenant_id, timeline_id) + pageserver_http.timeline_checkpoint( + tenant_id, timeline_id, compact=False, wait_until_uploaded=True + ) + def dump_differs( first: Path, second: Path, output: Path, allowed_diffs: Optional[List[str]] = None From 7a485b599bd27ba135e3327bfb5710c495c99df6 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 22 Aug 2024 23:53:37 +0300 Subject: [PATCH 1475/1571] Fix race condition in LRU list update in get_cached_relsize (#8807) ## Problem See https://neondb.slack.com/archives/C07J14D8GTX/p1724347552023709 Manipulations with LRU list in relation size cache are performed under shared lock ## Summary of changes Take exclusive lock ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist Co-authored-by: Konstantin Knizhnik --- pgxn/neon/relsize_cache.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pgxn/neon/relsize_cache.c b/pgxn/neon/relsize_cache.c index cc7ac2c394..2a4c2dc799 100644 --- a/pgxn/neon/relsize_cache.c +++ b/pgxn/neon/relsize_cache.c @@ -110,7 +110,8 @@ get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size) tag.rinfo = rinfo; tag.forknum = forknum; - LWLockAcquire(relsize_lock, LW_SHARED); + /* We need exclusive lock here because of LRU list manipulation */ + LWLockAcquire(relsize_lock, LW_EXCLUSIVE); entry = hash_search(relsize_hash, &tag, HASH_FIND, NULL); if (entry != NULL) { From 6eb638f4b390270fa004cdea45e00ca63c21f773 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." 
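The `relsize_cache.c` fix above is a classic LRU pitfall: a cache lookup also moves the entry to the head of the recency list, so even the read path mutates shared state and a shared lock is not sufficient. A small sketch of the same idea in Rust, assuming a toy `LruCache` type rather than the shared-memory hash table touched by the patch:

```rust
// Sketch only: a toy in-process LRU, not the shared-memory relsize cache.
use std::collections::VecDeque;
use std::sync::Mutex;

struct LruCache {
    // front = most recently used
    inner: Mutex<VecDeque<(String, u64)>>,
}

impl LruCache {
    fn get(&self, key: &str) -> Option<u64> {
        // Exclusive access even for a lookup: bumping the entry to the front
        // of the recency list is a write to shared state.
        let mut list = self.inner.lock().unwrap();
        let pos = list.iter().position(|(k, _)| k == key)?;
        let entry = list.remove(pos).unwrap();
        let value = entry.1;
        list.push_front(entry);
        Some(value)
    }
}

fn main() {
    let cache = LruCache {
        inner: Mutex::new(VecDeque::from([("rel_1663_16384".to_string(), 128)])),
    };
    assert_eq!(cache.get("rel_1663_16384"), Some(128));
}
```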
<4198311+skyzh@users.noreply.github.com> Date: Thu, 22 Aug 2024 17:31:38 -0400 Subject: [PATCH 1476/1571] feat(pageserver): warn on aux v1 tenants + default to v2 (#8625) part of https://github.com/neondatabase/neon/issues/8623 We want to discover potential aux v1 customers that we might have missed from the migrations. ## Summary of changes Log warnings on basebackup, load timeline, and the first put_file. --------- Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/models.rs | 2 +- pageserver/src/pgdatadir_mapping.rs | 15 +++++++++++-- pageserver/src/tenant.rs | 14 ++++++------ pageserver/src/tenant/timeline.rs | 5 +++++ .../regress/test_logical_replication.py | 22 +++++-------------- 5 files changed, 32 insertions(+), 26 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index ab4adfbebe..d55c06b685 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -348,7 +348,7 @@ impl AuxFilePolicy { /// If a tenant writes aux files without setting `switch_aux_policy`, this value will be used. pub fn default_tenant_config() -> Self { - Self::V1 + Self::V2 } } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index d6e0b82e1d..b7110d69b6 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -726,7 +726,17 @@ impl Timeline { ) -> Result, PageReconstructError> { let current_policy = self.last_aux_file_policy.load(); match current_policy { - Some(AuxFilePolicy::V1) | None => self.list_aux_files_v1(lsn, ctx).await, + Some(AuxFilePolicy::V1) => { + warn!("this timeline is using deprecated aux file policy V1 (policy=V1)"); + self.list_aux_files_v1(lsn, ctx).await + } + None => { + let res = self.list_aux_files_v1(lsn, ctx).await?; + if !res.is_empty() { + warn!("this timeline is using deprecated aux file policy V1 (policy=None)"); + } + Ok(res) + } Some(AuxFilePolicy::V2) => self.list_aux_files_v2(lsn, ctx).await, Some(AuxFilePolicy::CrossValidation) => { let v1_result = self.list_aux_files_v1(lsn, ctx).await; @@ -1587,6 +1597,7 @@ impl<'a> DatadirModification<'a> { if aux_files_key_v1.is_empty() { None } else { + warn!("this timeline is using deprecated aux file policy V1"); self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?; Some(AuxFilePolicy::V1) } @@ -2048,7 +2059,7 @@ mod tests { let (tenant, ctx) = harness.load().await; let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx) + .create_empty_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; let tline = tline.raw_timeline().unwrap(); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 65a7504b74..2e19a46ac8 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -5932,10 +5932,10 @@ mod tests { .await .unwrap(); - // the default aux file policy to switch is v1 if not set by the admins + // the default aux file policy to switch is v2 if not set by the admins assert_eq!( harness.tenant_conf.switch_aux_file_policy, - AuxFilePolicy::V1 + AuxFilePolicy::default_tenant_config() ); let (tenant, ctx) = harness.load().await; @@ -5979,8 +5979,8 @@ mod tests { ); assert_eq!( tline.last_aux_file_policy.load(), - Some(AuxFilePolicy::V1), - "aux file is written with switch_aux_file_policy unset (which is v1), so we should keep v1" + Some(AuxFilePolicy::V2), + "aux file is written with switch_aux_file_policy unset (which is v2), so we should use v2 there" ); // we can read everything from the storage @@ -6002,8 +6002,8 
@@ mod tests { assert_eq!( tline.last_aux_file_policy.load(), - Some(AuxFilePolicy::V1), - "keep v1 storage format when new files are written" + Some(AuxFilePolicy::V2), + "keep v2 storage format when new files are written" ); let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); @@ -6019,7 +6019,7 @@ mod tests { // child copies the last flag even if that is not on remote storage yet assert_eq!(child.get_switch_aux_file_policy(), AuxFilePolicy::V2); - assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V1)); + assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V2)); let files = child.list_aux_files(lsn, &ctx).await.unwrap(); assert_eq!(files.get("pg_logical/mappings/test1"), None); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index e90f65942f..dc9cddea43 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2234,6 +2234,11 @@ impl Timeline { handles: Default::default(), }; + + if aux_file_policy == Some(AuxFilePolicy::V1) { + warn!("this timeline is using deprecated aux file policy V1"); + } + result.repartition_threshold = result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE; diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index 0d18aa43b7..f83a833dda 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -22,7 +22,7 @@ def random_string(n: int): @pytest.mark.parametrize( - "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.V2, AuxFileStore.CrossValidation] + "pageserver_aux_file_policy", [AuxFileStore.V2, AuxFileStore.CrossValidation] ) def test_aux_file_v2_flag(neon_simple_env: NeonEnv, pageserver_aux_file_policy: AuxFileStore): env = neon_simple_env @@ -31,9 +31,7 @@ def test_aux_file_v2_flag(neon_simple_env: NeonEnv, pageserver_aux_file_policy: assert pageserver_aux_file_policy == tenant_config["switch_aux_file_policy"] -@pytest.mark.parametrize( - "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation] -) +@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation]) def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env @@ -175,9 +173,7 @@ COMMIT; # Test that neon.logical_replication_max_snap_files works -@pytest.mark.parametrize( - "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation] -) +@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation]) def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg): def slot_removed(ep): assert ( @@ -355,9 +351,7 @@ FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of # # Most pages start with a contrecord, so we don't do anything special # to ensure that. -@pytest.mark.parametrize( - "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation] -) +@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation]) def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env @@ -402,9 +396,7 @@ def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg): # logical replication bug as such, but without logical replication, # records passed ot the WAL redo process are never large enough to hit # the bug. 
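The aux-file change above flips the default policy to V2 and logs a warning whenever a timeline still resolves to the deprecated V1 format, so remaining V1 users show up in the logs. A compact sketch of that warn-on-deprecated idea, assuming a hypothetical `AuxFilePolicy` enum and `eprintln!` in place of the tracing `warn!`; the real read path additionally probes for existing V1 entries when no policy has been recorded:

```rust
// Sketch only: hypothetical enum and logging, mirroring the spirit of the patch.
#[derive(Clone, Copy, Debug, PartialEq)]
enum AuxFilePolicy {
    V1,
    V2,
}

impl AuxFilePolicy {
    /// Used when a tenant writes aux files without an explicit policy.
    fn default_tenant_config() -> Self {
        AuxFilePolicy::V2
    }
}

fn effective_policy(stored: Option<AuxFilePolicy>) -> AuxFilePolicy {
    match stored {
        Some(AuxFilePolicy::V1) => {
            eprintln!("warning: this timeline is using deprecated aux file policy V1");
            AuxFilePolicy::V1
        }
        Some(other) => other,
        None => AuxFilePolicy::default_tenant_config(),
    }
}

fn main() {
    assert_eq!(effective_policy(None), AuxFilePolicy::V2);
    assert_eq!(effective_policy(Some(AuxFilePolicy::V1)), AuxFilePolicy::V1);
}
```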
-@pytest.mark.parametrize( - "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation] -) +@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation]) def test_large_records(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env @@ -476,9 +468,7 @@ def test_slots_and_branching(neon_simple_env: NeonEnv): ws_cur.execute("select pg_create_logical_replication_slot('my_slot', 'pgoutput')") -@pytest.mark.parametrize( - "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation] -) +@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation]) def test_replication_shutdown(neon_simple_env: NeonEnv): # Ensure Postgres can exit without stuck when a replication job is active + neon extension installed env = neon_simple_env From ae63ac74887b9658c7a80f369b43247c1db51165 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 12 Aug 2024 14:57:50 -0500 Subject: [PATCH 1477/1571] Write messages field by field instead of bytes sheet in test_simple_sync_safekeepers Co-authored-by: Arseny Sher --- libs/walproposer/build.rs | 1 + libs/walproposer/src/walproposer.rs | 102 ++++++++++++++++++++-------- 2 files changed, 73 insertions(+), 30 deletions(-) diff --git a/libs/walproposer/build.rs b/libs/walproposer/build.rs index 3126b170a4..7bb077062b 100644 --- a/libs/walproposer/build.rs +++ b/libs/walproposer/build.rs @@ -95,6 +95,7 @@ fn main() -> anyhow::Result<()> { .allowlist_var("ERROR") .allowlist_var("FATAL") .allowlist_var("PANIC") + .allowlist_var("PG_VERSION_NUM") .allowlist_var("WPEVENT") .allowlist_var("WL_LATCH_SET") .allowlist_var("WL_SOCKET_READABLE") diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs index 37b1e0fa87..ba75171db2 100644 --- a/libs/walproposer/src/walproposer.rs +++ b/libs/walproposer/src/walproposer.rs @@ -282,7 +282,11 @@ mod tests { use std::cell::UnsafeCell; use utils::id::TenantTimelineId; - use crate::{api_bindings::Level, bindings::NeonWALReadResult, walproposer::Wrapper}; + use crate::{ + api_bindings::Level, + bindings::{NeonWALReadResult, PG_VERSION_NUM}, + walproposer::Wrapper, + }; use super::ApiImpl; @@ -489,41 +493,79 @@ mod tests { let (sender, receiver) = sync_channel(1); + // Messages definitions are at walproposer.h + // xxx: it would be better to extract them from safekeeper crate and + // use serialization/deserialization here. 
+ let greeting_tag = (b'g' as u64).to_ne_bytes(); + let proto_version = 2_u32.to_ne_bytes(); + let pg_version: [u8; 4] = PG_VERSION_NUM.to_ne_bytes(); + let proposer_id = [0; 16]; + let system_id = 0_u64.to_ne_bytes(); + let tenant_id = ttid.tenant_id.as_arr(); + let timeline_id = ttid.timeline_id.as_arr(); + let pg_tli = 1_u32.to_ne_bytes(); + let wal_seg_size = 16777216_u32.to_ne_bytes(); + let proposer_greeting = [ + greeting_tag.as_slice(), + proto_version.as_slice(), + pg_version.as_slice(), + proposer_id.as_slice(), + system_id.as_slice(), + tenant_id.as_slice(), + timeline_id.as_slice(), + pg_tli.as_slice(), + wal_seg_size.as_slice(), + ] + .concat(); + + let voting_tag = (b'v' as u64).to_ne_bytes(); + let vote_request_term = 3_u64.to_ne_bytes(); + let proposer_id = [0; 16]; + let vote_request = [ + voting_tag.as_slice(), + vote_request_term.as_slice(), + proposer_id.as_slice(), + ] + .concat(); + + let acceptor_greeting_term = 2_u64.to_ne_bytes(); + let acceptor_greeting_node_id = 1_u64.to_ne_bytes(); + let acceptor_greeting = [ + greeting_tag.as_slice(), + acceptor_greeting_term.as_slice(), + acceptor_greeting_node_id.as_slice(), + ] + .concat(); + + let vote_response_term = 3_u64.to_ne_bytes(); + let vote_given = 1_u64.to_ne_bytes(); + let flush_lsn = 0x539_u64.to_ne_bytes(); + let truncate_lsn = 0x539_u64.to_ne_bytes(); + let th_len = 1_u32.to_ne_bytes(); + let th_term = 2_u64.to_ne_bytes(); + let th_lsn = 0x539_u64.to_ne_bytes(); + let timeline_start_lsn = 0x539_u64.to_ne_bytes(); + let vote_response = [ + voting_tag.as_slice(), + vote_response_term.as_slice(), + vote_given.as_slice(), + flush_lsn.as_slice(), + truncate_lsn.as_slice(), + th_len.as_slice(), + th_term.as_slice(), + th_lsn.as_slice(), + timeline_start_lsn.as_slice(), + ] + .concat(); + let my_impl: Box = Box::new(MockImpl { wait_events: Cell::new(WaitEventsData { sk: std::ptr::null_mut(), event_mask: 0, }), - expected_messages: vec![ - // TODO: When updating Postgres versions, this test will cause - // problems. Postgres version in message needs updating. 
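The rewritten walproposer test above assembles each expected protocol message by concatenating native-endian encodings of its fields, instead of keeping a frozen byte dump that silently breaks whenever a single field such as `PG_VERSION_NUM` changes. A minimal standalone sketch of that construction style, mirroring the vote-request layout shown above (tag widened to `u64`, term, zeroed proposer id); treat it as an illustration of the pattern, not as the authoritative wire format:

```rust
// Sketch only: builds one message the same way the test above does.
fn encode_vote_request(term: u64) -> Vec<u8> {
    let tag = (b'v' as u64).to_ne_bytes();
    let term = term.to_ne_bytes();
    let proposer_id = [0u8; 16];
    // Concatenating per-field encodings keeps the expectation readable and
    // lets one field change without re-deriving an entire byte sheet.
    [tag.as_slice(), term.as_slice(), proposer_id.as_slice()].concat()
}

fn main() {
    let msg = encode_vote_request(3);
    assert_eq!(msg.len(), 8 + 8 + 16);
    assert_eq!(msg[0], b'v');
}
```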
- // - // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160003, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 }) - vec![ - 103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110, - 147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147, - 188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1, - ], - // VoteRequest(VoteRequest { term: 3 }) - vec![ - 118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, - ], - ], + expected_messages: vec![proposer_greeting, vote_request], expected_ptr: AtomicUsize::new(0), - safekeeper_replies: vec![ - // Greeting(AcceptorGreeting { term: 2, node_id: NodeId(1) }) - vec![ - 103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, - ], - // VoteResponse(VoteResponse { term: 3, vote_given: 1, flush_lsn: 0/539, truncate_lsn: 0/539, term_history: [(2, 0/539)], timeline_start_lsn: 0/539 }) - vec![ - 118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 57, - 5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, - 0, 57, 5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0, - ], - ], + safekeeper_replies: vec![acceptor_greeting, vote_response], replies_ptr: AtomicUsize::new(0), sync_channel: sender, shmem: UnsafeCell::new(crate::api_bindings::empty_shmem()), From 6744ed19d8cc8cd09c6ccbbf66953e6ebb7a480d Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 13 Aug 2024 12:20:11 -0500 Subject: [PATCH 1478/1571] Update Postgres 14 to 14.13 Signed-off-by: Tristan Partin --- vendor/postgres-v14 | 2 +- vendor/revisions.json | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 3fd7a45f8a..b6910406e2 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 3fd7a45f8aae85c080df6329e3c85887b7f3a737 +Subproject commit b6910406e2d05a2c94baa2e530ec882733047759 diff --git a/vendor/revisions.json b/vendor/revisions.json index 6e3e489b5d..c2b5fb8915 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -8,7 +8,7 @@ "46b4b235f38413ab5974bb22c022f9b829257674" ], "v14": [ - "14.12", - "3fd7a45f8aae85c080df6329e3c85887b7f3a737" + "14.13", + "b6910406e2d05a2c94baa2e530ec882733047759" ] } From 66db381dc9b9238618165c7ef36fa29a0577806c Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 13 Aug 2024 12:27:05 -0500 Subject: [PATCH 1479/1571] Update Postgres 15 to 15.8 Signed-off-by: Tristan Partin --- vendor/postgres-v15 | 2 +- vendor/revisions.json | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 46b4b235f3..76063bff63 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 46b4b235f38413ab5974bb22c022f9b829257674 +Subproject commit 76063bff638ccce7afa99fc9037ac51338b9823d diff --git a/vendor/revisions.json b/vendor/revisions.json index c2b5fb8915..2921372c24 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -4,8 +4,8 @@ "47a9122a5a150a3217fafd3f3d4fe8e020ea718a" ], "v15": [ - "15.7", - "46b4b235f38413ab5974bb22c022f9b829257674" + "15.8", + "76063bff638ccce7afa99fc9037ac51338b9823d" ], "v14": [ "14.13", From 2f8d548a125c490b29eb4a6ab4d79ce358300e74 Mon Sep 17 00:00:00 
2001 From: Tristan Partin Date: Tue, 13 Aug 2024 12:44:01 -0500 Subject: [PATCH 1480/1571] Update Postgres 16 to 16.4 Signed-off-by: Tristan Partin --- vendor/postgres-v16 | 2 +- vendor/revisions.json | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 47a9122a5a..8efa089aa7 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 47a9122a5a150a3217fafd3f3d4fe8e020ea718a +Subproject commit 8efa089aa7786381543a4f9efc69b92d43eab8c0 diff --git a/vendor/revisions.json b/vendor/revisions.json index 2921372c24..50cc99c2f1 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,7 +1,7 @@ { "v16": [ - "16.3", - "47a9122a5a150a3217fafd3f3d4fe8e020ea718a" + "16.4", + "8efa089aa7786381543a4f9efc69b92d43eab8c0" ], "v15": [ "15.8", From f7ab3ffcb781c14bf35da8260518456d00cea04d Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 13 Aug 2024 14:05:06 -0500 Subject: [PATCH 1481/1571] Check that TERM != dumb before using colors in pre-commit.py Signed-off-by: Tristan Partin --- pre-commit.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pre-commit.py b/pre-commit.py index c5ed63ac44..ae432e8225 100755 --- a/pre-commit.py +++ b/pre-commit.py @@ -2,6 +2,7 @@ import argparse import enum +import os import subprocess import sys from typing import List @@ -93,7 +94,7 @@ if __name__ == "__main__": "--no-color", action="store_true", help="disable colored output", - default=not sys.stdout.isatty(), + default=not sys.stdout.isatty() or os.getenv("TERM") == "dumb", ) args = parser.parse_args() From dbdb8a1187d28cf98c93c9cc39c348db6d7e98f1 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 23 Aug 2024 09:15:55 +0300 Subject: [PATCH 1482/1571] Document how to use "git merge" for PostgreSQL minor version upgrades. (#8692) Our new policy is to use the "rebase" method and slice all the Neon commits into a nice patch set when doing a new major version, and use "merge" method on minor version upgrades on the release branches. "git merge" preserves the git history of Neon commits on the Postgres branches. While it's nice to rebase all the Neon changes to a logical patch set against upstream, having to do it between every minor release is a fair amount work, and it loses the history, and is more error-prone. --- docs/updating-postgres.md | 33 +++++++++++---------------------- 1 file changed, 11 insertions(+), 22 deletions(-) diff --git a/docs/updating-postgres.md b/docs/updating-postgres.md index 1868bbf5f7..7913b0a9e2 100644 --- a/docs/updating-postgres.md +++ b/docs/updating-postgres.md @@ -21,30 +21,21 @@ _Example: 15.4 is the new minor version to upgrade to from 15.3._ 1. Create a new branch based on the stable branch you are updating. ```shell - git checkout -b my-branch REL_15_STABLE_neon + git checkout -b my-branch-15 REL_15_STABLE_neon ``` -1. Tag the last commit on the stable branch you are updating. +1. Find the upstream release tags you're looking for. They are of the form `REL_X_Y`. - ```shell - git tag REL_15_3_neon - ``` - -1. Push the new tag to the Neon Postgres repository. - - ```shell - git push origin REL_15_3_neon - ``` - -1. Find the release tags you're looking for. They are of the form `REL_X_Y`. - -1. Rebase the branch you created on the tag and resolve any conflicts. +1. Merge the upstream tag into the branch you created on the tag and resolve any conflicts. 
```shell git fetch upstream REL_15_4 - git rebase REL_15_4 + git merge REL_15_4 ``` + In the commit message of the merge commit, mention if there were + any non-trivial conflicts or other issues. + 1. Run the Postgres test suite to make sure our commits have not affected Postgres in a negative way. @@ -57,7 +48,7 @@ Postgres in a negative way. 1. Push your branch to the Neon Postgres repository. ```shell - git push origin my-branch + git push origin my-branch-15 ``` 1. Clone the Neon repository if you have not done so already. @@ -74,7 +65,7 @@ branch. 1. Update the Git submodule. ```shell - git submodule set-branch --branch my-branch vendor/postgres-v15 + git submodule set-branch --branch my-branch-15 vendor/postgres-v15 git submodule update --remote vendor/postgres-v15 ``` @@ -89,14 +80,12 @@ minor Postgres release. 1. Create a pull request, and wait for CI to go green. -1. Force push the rebased Postgres branches into the Neon Postgres repository. +1. Push the Postgres branches with the merge commits into the Neon Postgres repository. ```shell - git push --force origin my-branch:REL_15_STABLE_neon + git push origin my-branch-15:REL_15_STABLE_neon ``` - It may require disabling various branch protections. - 1. Update your Neon PR to point at the branches. ```shell From d8ca495eae816ddfd5a06fed4e1e668fe1edad91 Mon Sep 17 00:00:00 2001 From: MMeent Date: Fri, 23 Aug 2024 12:48:26 +0200 Subject: [PATCH 1483/1571] Require poetry >=1.8 (#8812) This was already a requirement for installing the python packages after https://github.com/neondatabase/neon/pull/8609 got merged, so this updates the documentation to reflect that. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f01442da5d..735edef0fc 100644 --- a/README.md +++ b/README.md @@ -126,7 +126,7 @@ make -j`sysctl -n hw.logicalcpu` -s To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `pg_install/bin` and `pg_install/lib`, respectively. To run the integration tests or Python scripts (not required to use the code), install -Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.3](https://python-poetry.org/)) in the project directory. +Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.8](https://python-poetry.org/)) in the project directory. #### Running neon database From e80ab8fd6a99bf46463695986b9f19e2cb06c8d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 23 Aug 2024 13:14:14 +0200 Subject: [PATCH 1484/1571] Update serde_json to 1.0.125 (#8813) Updates `serde_json` to `1.0.125`, rolling out speedups added by a serde_json contributor. Release [link](https://github.com/serde-rs/json/releases/tag/1.0.125). Blog post [link](https://purplesyringa.moe/blog/i-sped-up-serde-json-strings-by-20-percent/). 
--- Cargo.lock | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a506da8c02..250427da2b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5590,11 +5590,12 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.96" +version = "1.0.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "057d394a50403bcac12672b2b18fb387ab6d289d957dab67dd201875391e52f1" +checksum = "83c8e735a073ccf5be70aa8066aa984eaf2fa000db6c8d0100ae605b366d31ed" dependencies = [ "itoa", + "memchr", "ryu", "serde", ] From e62cd9e121928eca4f1f6b3ded4f5deb7e0a6110 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 23 Aug 2024 14:29:11 +0100 Subject: [PATCH 1485/1571] CI(autocomment): add arch to build type (#8809) ## Problem Failed / flaky tests for different arches don't have any difference in GitHub Autocomment ## Summary of changes - Add arch to build type for GitHub autocomment --- scripts/comment-test-report.js | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/scripts/comment-test-report.js b/scripts/comment-test-report.js index f42262cf48..e8e0b3c23a 100755 --- a/scripts/comment-test-report.js +++ b/scripts/comment-test-report.js @@ -68,16 +68,29 @@ const parseReportJson = async ({ reportJsonUrl, fetch }) => { console.info(`Cannot get BUILD_TYPE and Postgres Version from test name: "${test.name}", defaulting to "release" and "14"`) buildType = "release" - pgVersion = "14" + pgVersion = "16" } pgVersions.add(pgVersion) + // We use `arch` as it is returned by GitHub Actions + // (RUNNER_ARCH env var): X86, X64, ARM, or ARM64 + // Ref https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/store-information-in-variables#default-environment-variables + let arch = "" + if (test.parameters.includes("'X64'")) { + arch = "x86-64" + } else if (test.parameters.includes("'ARM64'")) { + arch = "arm64" + } else { + arch = "unknown" + } + // Removing build type and PostgreSQL version from the test name to make it shorter const testName = test.name.replace(new RegExp(`${buildType}-pg${pgVersion}-?`), "").replace("[]", "") test.pytestName = `${parentSuite.name.replace(".", "/")}/${suite.name}.py::${testName}` test.pgVersion = pgVersion test.buildType = buildType + test.arch = arch if (test.status === "passed") { passedTests[pgVersion][testName].push(test) @@ -144,7 +157,7 @@ const reportSummary = async (params) => { const links = [] for (const test of tests) { const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}` - links.push(`[${test.buildType}](${allureLink})`) + links.push(`[${test.buildType}-${test.arch}](${allureLink})`) } summary += `- \`${testName}\`: ${links.join(", ")}\n` } @@ -175,7 +188,7 @@ const reportSummary = async (params) => { const links = [] for (const test of tests) { const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}/retries` - links.push(`[${test.buildType}](${allureLink})`) + links.push(`[${test.buildType}-${test.arch}](${allureLink})`) } summary += `- \`${testName}\`: ${links.join(", ")}\n` } From 6a74bcadecd0ce4f088b5a22c6183ff980559d87 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Fri, 23 Aug 2024 09:32:00 -0400 Subject: [PATCH 1486/1571] feat(pageserver): remove features=testing restriction for compact (#8815) A small PR to make it possible to run force compaction in staging for btm-gc compaction testing. 
Part of https://github.com/neondatabase/neon/issues/8002 Signed-off-by: Alex Chi Z --- pageserver/src/http/routes.rs | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index a4da8506d6..4635e76ea9 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1706,11 +1706,6 @@ async fn timeline_compact_handler( flags |= CompactFlags::ForceImageLayerCreation; } if Some(true) == parse_query_param::<_, bool>(&request, "enhanced_gc_bottom_most_compaction")? { - if !cfg!(feature = "testing") { - return Err(ApiError::InternalServerError(anyhow!( - "enhanced_gc_bottom_most_compaction is only available in testing mode" - ))); - } flags |= CompactFlags::EnhancedGcBottomMostCompaction; } let wait_until_uploaded = @@ -2942,7 +2937,7 @@ pub fn make_router( ) .put( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact", - |r| testing_api_handler("run timeline compaction", r, timeline_compact_handler), + |r| api_handler(r, timeline_compact_handler), ) .put( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/checkpoint", From bc8cfe1b5567715995b884231bc2785a32307ce8 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Fri, 23 Aug 2024 09:42:45 -0400 Subject: [PATCH 1487/1571] fix(pageserver): l0 check criteria (#8797) close https://github.com/neondatabase/neon/issues/8579 ## Summary of changes The `is_l0` check now takes both layer key range and the layer type. This allows us to have image layers covering the full key range in btm-most compaction (upcoming PR). However, we still don't allow delta layers to cover the full key range, and I will make btm-most compaction to generate delta layers with the key range of the keys existing in the layer instead of `Key::MIN..Key::HACK_MAX` (upcoming PR). Signed-off-by: Alex Chi Z --- pageserver/src/tenant/layer_map.rs | 10 +++++----- pageserver/src/tenant/storage_layer/layer.rs | 5 ++++- pageserver/src/tenant/storage_layer/layer_name.rs | 4 ++++ pageserver/src/tenant/timeline.rs | 9 ++++++--- storage_scrubber/src/checks.rs | 2 +- 5 files changed, 20 insertions(+), 10 deletions(-) diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 844f117ea2..707233b003 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -464,7 +464,7 @@ impl LayerMap { pub(self) fn insert_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) { // TODO: See #3869, resulting #4088, attempted fix and repro #4094 - if Self::is_l0(&layer_desc.key_range) { + if Self::is_l0(&layer_desc.key_range, layer_desc.is_delta) { self.l0_delta_layers.push(layer_desc.clone().into()); } @@ -483,7 +483,7 @@ impl LayerMap { self.historic .remove(historic_layer_coverage::LayerKey::from(layer_desc)); let layer_key = layer_desc.key(); - if Self::is_l0(&layer_desc.key_range) { + if Self::is_l0(&layer_desc.key_range, layer_desc.is_delta) { let len_before = self.l0_delta_layers.len(); let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers); l0_delta_layers.retain(|other| other.key() != layer_key); @@ -600,8 +600,8 @@ impl LayerMap { } /// Check if the key range resembles that of an L0 layer. 
- pub fn is_l0(key_range: &Range) -> bool { - key_range == &(Key::MIN..Key::MAX) + pub fn is_l0(key_range: &Range, is_delta_layer: bool) -> bool { + is_delta_layer && key_range == &(Key::MIN..Key::MAX) } /// This function determines which layers are counted in `count_deltas`: @@ -628,7 +628,7 @@ impl LayerMap { /// than just the current partition_range. pub fn is_reimage_worthy(layer: &PersistentLayerDesc, partition_range: &Range) -> bool { // Case 1 - if !Self::is_l0(&layer.key_range) { + if !Self::is_l0(&layer.key_range, layer.is_delta) { return true; } diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 774f97e1d9..2607b574e7 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1296,7 +1296,10 @@ impl LayerInner { lsn_end: lsn_range.end, remote: !resident, access_stats, - l0: crate::tenant::layer_map::LayerMap::is_l0(&self.layer_desc().key_range), + l0: crate::tenant::layer_map::LayerMap::is_l0( + &self.layer_desc().key_range, + self.layer_desc().is_delta, + ), } } else { let lsn = self.desc.image_layer_lsn(); diff --git a/pageserver/src/tenant/storage_layer/layer_name.rs b/pageserver/src/tenant/storage_layer/layer_name.rs index f33ca076ab..47ae556279 100644 --- a/pageserver/src/tenant/storage_layer/layer_name.rs +++ b/pageserver/src/tenant/storage_layer/layer_name.rs @@ -256,6 +256,10 @@ impl LayerName { LayerName::Delta(layer) => &layer.key_range, } } + + pub fn is_delta(&self) -> bool { + matches!(self, LayerName::Delta(_)) + } } impl fmt::Display for LayerName { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index dc9cddea43..b33e436fce 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3002,7 +3002,10 @@ impl Timeline { // - For L1 & image layers, download most recent LSNs first: the older the LSN, the sooner // the layer is likely to be covered by an image layer during compaction. layers.sort_by_key(|(desc, _meta, _atime)| { - std::cmp::Reverse((!LayerMap::is_l0(&desc.key_range), desc.lsn_range.end)) + std::cmp::Reverse(( + !LayerMap::is_l0(&desc.key_range, desc.is_delta), + desc.lsn_range.end, + )) }); let layers = layers @@ -4585,7 +4588,7 @@ impl Timeline { // for compact_level0_phase1 creating an L0, which does not happen in practice // because we have not implemented L0 => L0 compaction. duplicated_layers.insert(l.layer_desc().key()); - } else if LayerMap::is_l0(&l.layer_desc().key_range) { + } else if LayerMap::is_l0(&l.layer_desc().key_range, l.layer_desc().is_delta) { return Err(CompactionError::Other(anyhow::anyhow!("compaction generates a L0 layer file as output, which will cause infinite compaction."))); } else { insert_layers.push(l.clone()); @@ -5877,7 +5880,7 @@ mod tests { }; // Apart from L0s, newest Layers should come first - if !LayerMap::is_l0(layer.name.key_range()) { + if !LayerMap::is_l0(layer.name.key_range(), layer.name.is_delta()) { assert!(layer_lsn <= last_lsn); last_lsn = layer_lsn; } diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index b35838bcf7..08b0f06ebf 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -150,7 +150,7 @@ pub(crate) async fn branch_cleanup_and_check_errors( if response.is_err() { // Object is not present. 
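The tightened `is_l0` check above classifies a layer as L0 only if it is a delta layer *and* covers the full key range, so a full-range image layer (as produced by bottom-most GC compaction) is no longer miscounted as L0. A small sketch of the predicate with a plain integer standing in for the pageserver's `Key` type:

```rust
// Sketch only: `Key` is a simplified integer stand-in.
use std::ops::Range;

type Key = u128;

fn is_l0(key_range: &Range<Key>, is_delta_layer: bool) -> bool {
    is_delta_layer && *key_range == (Key::MIN..Key::MAX)
}

fn main() {
    assert!(is_l0(&(Key::MIN..Key::MAX), true));
    assert!(!is_l0(&(Key::MIN..Key::MAX), false)); // full-range image layer: not L0
    assert!(!is_l0(&(0..42), true)); // partial-range delta layer: not L0
}
```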
- let is_l0 = LayerMap::is_l0(layer.key_range()); + let is_l0 = LayerMap::is_l0(layer.key_range(), layer.is_delta()); let msg = format!( "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {})", From 73286e6b9f8a0ba4fb00dd4b44e613963b62cb21 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 23 Aug 2024 16:43:08 +0300 Subject: [PATCH 1488/1571] test: copy dict to avoid error on retry (#8811) there is no "const" in python, so when we modify the global dict, it will remain that way on the retry. fix to not have it influence other tests which might be run on the same runner. evidence: --- test_runner/fixtures/pageserver/utils.py | 17 +++++++++++------ test_runner/regress/test_s3_restore.py | 6 +++--- test_runner/regress/test_storage_controller.py | 4 ++-- test_runner/regress/test_tenant_delete.py | 8 ++++---- test_runner/regress/test_timeline_delete.py | 4 ++-- 5 files changed, 22 insertions(+), 17 deletions(-) diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index b75a480a63..a74fef6a60 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -430,12 +430,17 @@ def enable_remote_storage_versioning( return response -MANY_SMALL_LAYERS_TENANT_CONFIG = { - "gc_period": "0s", - "compaction_period": "0s", - "checkpoint_distance": 1024**2, - "image_creation_threshold": 100, -} +def many_small_layers_tenant_config() -> Dict[str, Any]: + """ + Create a new dict to avoid issues with deleting from the global value. + In python, the global is mutable. + """ + return { + "gc_period": "0s", + "compaction_period": "0s", + "checkpoint_distance": 1024**2, + "image_creation_threshold": 100, + } def poll_for_remote_storage_iterations(remote_storage_kind: RemoteStorageKind) -> int: diff --git a/test_runner/regress/test_s3_restore.py b/test_runner/regress/test_s3_restore.py index 9992647e56..c1a80a54bc 100644 --- a/test_runner/regress/test_s3_restore.py +++ b/test_runner/regress/test_s3_restore.py @@ -8,9 +8,9 @@ from fixtures.neon_fixtures import ( PgBin, ) from fixtures.pageserver.utils import ( - MANY_SMALL_LAYERS_TENANT_CONFIG, assert_prefix_empty, enable_remote_storage_versioning, + many_small_layers_tenant_config, wait_for_upload, ) from fixtures.remote_storage import RemoteStorageKind, s3_storage @@ -33,7 +33,7 @@ def test_tenant_s3_restore( # change it back after initdb, recovery doesn't work if the two # index_part.json uploads happen at same second or too close to each other. 
- initial_tenant_conf = MANY_SMALL_LAYERS_TENANT_CONFIG + initial_tenant_conf = many_small_layers_tenant_config() del initial_tenant_conf["checkpoint_distance"] env = neon_env_builder.init_start(initial_tenant_conf) @@ -50,7 +50,7 @@ def test_tenant_s3_restore( tenant_id = env.initial_tenant # now lets create the small layers - ps_http.set_tenant_config(tenant_id, MANY_SMALL_LAYERS_TENANT_CONFIG) + ps_http.set_tenant_config(tenant_id, many_small_layers_tenant_config()) # Default tenant and the one we created assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 94d71a7677..b3464b0c91 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -23,11 +23,11 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import ( - MANY_SMALL_LAYERS_TENANT_CONFIG, assert_prefix_empty, assert_prefix_not_empty, enable_remote_storage_versioning, list_prefix, + many_small_layers_tenant_config, remote_storage_delete_key, timeline_delete_wait_completed, ) @@ -654,7 +654,7 @@ def test_storage_controller_s3_time_travel_recovery( tenant_id, shard_count=2, shard_stripe_size=8192, - tenant_config=MANY_SMALL_LAYERS_TENANT_CONFIG, + tenant_config=many_small_layers_tenant_config(), ) # Check that the consistency check passes diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index dadf5ca672..448a28dc31 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -9,9 +9,9 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import ( - MANY_SMALL_LAYERS_TENANT_CONFIG, assert_prefix_empty, assert_prefix_not_empty, + many_small_layers_tenant_config, wait_for_upload, ) from fixtures.remote_storage import RemoteStorageKind, s3_storage @@ -76,7 +76,7 @@ def test_tenant_delete_smoke( env.neon_cli.create_tenant( tenant_id=tenant_id, - conf=MANY_SMALL_LAYERS_TENANT_CONFIG, + conf=many_small_layers_tenant_config(), ) # Default tenant and the one we created @@ -215,7 +215,7 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder) # (and there is no way to reconstruct the used remote storage kind) remote_storage_kind = RemoteStorageKind.MOCK_S3 neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) - env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) + env = neon_env_builder.init_start(initial_tenant_conf=many_small_layers_tenant_config()) ps_http = env.pageserver.http_client() tenant_id = env.initial_tenant @@ -330,7 +330,7 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder) remote_storage_kind = RemoteStorageKind.MOCK_S3 neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) - env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) + env = neon_env_builder.init_start(initial_tenant_conf=many_small_layers_tenant_config()) ps_http = env.pageserver.http_client() # create a tenant separate from the main tenant so that we have one remaining diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 6d96dda391..328131cd08 100644 --- 
a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -16,9 +16,9 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import ( - MANY_SMALL_LAYERS_TENANT_CONFIG, assert_prefix_empty, assert_prefix_not_empty, + many_small_layers_tenant_config, poll_for_remote_storage_iterations, timeline_delete_wait_completed, wait_for_last_record_lsn, @@ -782,7 +782,7 @@ def test_timeline_delete_resumed_on_attach( remote_storage_kind = s3_storage() neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) - env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) + env = neon_env_builder.init_start(initial_tenant_conf=many_small_layers_tenant_config()) tenant_id = env.initial_tenant From bcc68a7866c633d74a482266bfe34053a093b9d8 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 23 Aug 2024 14:48:06 +0100 Subject: [PATCH 1489/1571] storcon_cli: add support for drain and fill operations (#8791) ## Problem We have been naughty and curl-ed storcon to fix-up drains and fills. ## Summary of changes Add support for starting/cancelling drain/fill operations via `storcon_cli`. --- control_plane/storcon_cli/src/main.rs | 135 ++++++++++++++++++++++++-- storage_controller/src/http.rs | 1 - 2 files changed, 126 insertions(+), 10 deletions(-) diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index e27491c1c8..35510ccbca 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -147,9 +147,9 @@ enum Command { #[arg(long)] threshold: humantime::Duration, }, - // Drain a set of specified pageservers by moving the primary attachments to pageservers + // Migrate away from a set of specified pageservers by moving the primary attachments to pageservers // outside of the specified set. - Drain { + BulkMigrate { // Set of pageserver node ids to drain. #[arg(long)] nodes: Vec, @@ -163,6 +163,34 @@ enum Command { #[arg(long)] dry_run: Option, }, + /// Start draining the specified pageserver. + /// The drain is complete when the schedulling policy returns to active. + StartDrain { + #[arg(long)] + node_id: NodeId, + }, + /// Cancel draining the specified pageserver and wait for `timeout` + /// for the operation to be canceled. May be retried. + CancelDrain { + #[arg(long)] + node_id: NodeId, + #[arg(long)] + timeout: humantime::Duration, + }, + /// Start filling the specified pageserver. + /// The drain is complete when the schedulling policy returns to active. + StartFill { + #[arg(long)] + node_id: NodeId, + }, + /// Cancel filling the specified pageserver and wait for `timeout` + /// for the operation to be canceled. May be retried. + CancelFill { + #[arg(long)] + node_id: NodeId, + #[arg(long)] + timeout: humantime::Duration, + }, } #[derive(Parser)] @@ -249,6 +277,34 @@ impl FromStr for NodeAvailabilityArg { } } +async fn wait_for_scheduling_policy( + client: Client, + node_id: NodeId, + timeout: Duration, + f: F, +) -> anyhow::Result +where + F: Fn(NodeSchedulingPolicy) -> bool, +{ + let waiter = tokio::time::timeout(timeout, async move { + loop { + let node = client + .dispatch::<(), NodeDescribeResponse>( + Method::GET, + format!("control/v1/node/{node_id}"), + None, + ) + .await?; + + if f(node.scheduling) { + return Ok::(node.scheduling); + } + } + }); + + Ok(waiter.await??) 
+} + #[tokio::main] async fn main() -> anyhow::Result<()> { let cli = Cli::parse(); @@ -628,7 +684,7 @@ async fn main() -> anyhow::Result<()> { }) .await?; } - Command::Drain { + Command::BulkMigrate { nodes, concurrency, max_shards, @@ -657,7 +713,7 @@ async fn main() -> anyhow::Result<()> { } if nodes.len() != node_to_drain_descs.len() { - anyhow::bail!("Drain requested for node which doesn't exist.") + anyhow::bail!("Bulk migration requested away from node which doesn't exist.") } node_to_fill_descs.retain(|desc| { @@ -669,7 +725,7 @@ async fn main() -> anyhow::Result<()> { }); if node_to_fill_descs.is_empty() { - anyhow::bail!("There are no nodes to drain to") + anyhow::bail!("There are no nodes to migrate to") } // Set the node scheduling policy to draining for the nodes which @@ -690,7 +746,7 @@ async fn main() -> anyhow::Result<()> { .await?; } - // Perform the drain: move each tenant shard scheduled on a node to + // Perform the migration: move each tenant shard scheduled on a node to // be drained to a node which is being filled. A simple round robin // strategy is used to pick the new node. let tenants = storcon_client @@ -703,13 +759,13 @@ async fn main() -> anyhow::Result<()> { let mut selected_node_idx = 0; - struct DrainMove { + struct MigrationMove { tenant_shard_id: TenantShardId, from: NodeId, to: NodeId, } - let mut moves: Vec = Vec::new(); + let mut moves: Vec = Vec::new(); let shards = tenants .into_iter() @@ -739,7 +795,7 @@ async fn main() -> anyhow::Result<()> { continue; } - moves.push(DrainMove { + moves.push(MigrationMove { tenant_shard_id: shard.tenant_shard_id, from: shard .node_attached @@ -816,6 +872,67 @@ async fn main() -> anyhow::Result<()> { failure ); } + Command::StartDrain { node_id } => { + storcon_client + .dispatch::<(), ()>( + Method::PUT, + format!("control/v1/node/{node_id}/drain"), + None, + ) + .await?; + println!("Drain started for {node_id}"); + } + Command::CancelDrain { node_id, timeout } => { + storcon_client + .dispatch::<(), ()>( + Method::DELETE, + format!("control/v1/node/{node_id}/drain"), + None, + ) + .await?; + + println!("Waiting for node {node_id} to quiesce on scheduling policy ..."); + + let final_policy = + wait_for_scheduling_policy(storcon_client, node_id, *timeout, |sched| { + use NodeSchedulingPolicy::*; + matches!(sched, Active | PauseForRestart) + }) + .await?; + + println!( + "Drain was cancelled for node {node_id}. Schedulling policy is now {final_policy:?}" + ); + } + Command::StartFill { node_id } => { + storcon_client + .dispatch::<(), ()>(Method::PUT, format!("control/v1/node/{node_id}/fill"), None) + .await?; + + println!("Fill started for {node_id}"); + } + Command::CancelFill { node_id, timeout } => { + storcon_client + .dispatch::<(), ()>( + Method::DELETE, + format!("control/v1/node/{node_id}/fill"), + None, + ) + .await?; + + println!("Waiting for node {node_id} to quiesce on scheduling policy ..."); + + let final_policy = + wait_for_scheduling_policy(storcon_client, node_id, *timeout, |sched| { + use NodeSchedulingPolicy::*; + matches!(sched, Active) + }) + .await?; + + println!( + "Fill was cancelled for node {node_id}. 
Schedulling policy is now {final_policy:?}" + ); + } } Ok(()) diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 7bbd1541cf..207bd5a1e6 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -1074,7 +1074,6 @@ pub fn make_router( RequestName("control_v1_metadata_health_list_outdated"), ) }) - // TODO(vlad): endpoint for cancelling drain and fill // Tenant Shard operations .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| { tenant_service_handler( From 612b643315fbda4b489ae512b14d9bd66a4fbacb Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 23 Aug 2024 16:28:22 +0100 Subject: [PATCH 1490/1571] update diesel (#8816) https://rustsec.org/advisories/RUSTSEC-2024-0365 --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 250427da2b..441ca1ff86 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1672,9 +1672,9 @@ dependencies = [ [[package]] name = "diesel" -version = "2.2.1" +version = "2.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62d6dcd069e7b5fe49a302411f759d4cf1cf2c27fe798ef46fb8baefc053dd2b" +checksum = "65e13bab2796f412722112327f3e575601a3e9cdcbe426f0d30dbf43f3f5dc71" dependencies = [ "bitflags 2.4.1", "byteorder", From f4cac1f30f096ceb8c1fa4a3281319883d10be6e Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Sat, 24 Aug 2024 00:38:42 +0800 Subject: [PATCH 1491/1571] impr(pageserver): error if keys are unordered in merge iter (#8818) In case of corrupted delta layers, we can detect the corruption and bail out the compaction. ## Summary of changes * Detect wrong delta desc of key range * Detect unordered deltas Signed-off-by: Alex Chi Z --- .../src/tenant/storage_layer/delta_layer.rs | 16 +++++++++++ .../src/tenant/storage_layer/image_layer.rs | 15 +++++++++++ .../tenant/storage_layer/merge_iterator.rs | 27 ++++++++++++++++++- 3 files changed, 57 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 6c2391d72d..b1b5217f7f 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -232,6 +232,18 @@ pub struct DeltaLayerInner { max_vectored_read_bytes: Option, } +impl DeltaLayerInner { + pub(crate) fn layer_dbg_info(&self) -> String { + format!( + "delta {}..{} {}..{}", + self.key_range().start, + self.key_range().end, + self.lsn_range().start, + self.lsn_range().end + ) + } +} + impl std::fmt::Debug for DeltaLayerInner { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("DeltaLayerInner") @@ -1527,6 +1539,10 @@ pub struct DeltaLayerIterator<'a> { } impl<'a> DeltaLayerIterator<'a> { + pub(crate) fn layer_dbg_info(&self) -> String { + self.delta_layer.layer_dbg_info() + } + /// Retrieve a batch of key-value pairs into the iterator buffer. 
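The `wait_for_scheduling_policy` helper added to `storcon_cli` above wraps a polling loop in `tokio::time::timeout`, so the cancel-drain and cancel-fill commands either observe the node settling on an acceptable scheduling policy or give up at the user-supplied deadline. A stripped-down sketch of that shape, with a hypothetical async `fetch_policy` standing in for the storage controller API call and a short sleep added between polls for illustration:

```rust
// Sketch only: `Policy` and `fetch_policy` are stand-ins for
// NodeSchedulingPolicy and the storage controller client.
use std::time::Duration;

#[derive(Clone, Copy, Debug, PartialEq)]
enum Policy {
    Draining,
    Active,
}

async fn fetch_policy() -> anyhow::Result<Policy> {
    Ok(Policy::Active)
}

async fn wait_for_policy<F>(timeout: Duration, f: F) -> anyhow::Result<Policy>
where
    F: Fn(Policy) -> bool,
{
    let waiter = tokio::time::timeout(timeout, async move {
        loop {
            let policy = fetch_policy().await?;
            if f(policy) {
                return Ok::<Policy, anyhow::Error>(policy);
            }
            tokio::time::sleep(Duration::from_millis(500)).await;
        }
    });
    // Outer `?` is the timeout elapsing, inner `?` is a request error.
    Ok(waiter.await??)
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let settled = wait_for_policy(Duration::from_secs(5), |p| p == Policy::Active).await?;
    println!("node settled on {settled:?}");
    Ok(())
}
```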
async fn next_batch(&mut self) -> anyhow::Result<()> { assert!(self.key_values_batch.is_empty()); diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 9a19e4e2c7..94120a4e3e 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -167,6 +167,17 @@ pub struct ImageLayerInner { max_vectored_read_bytes: Option, } +impl ImageLayerInner { + pub(crate) fn layer_dbg_info(&self) -> String { + format!( + "image {}..{} {}", + self.key_range().start, + self.key_range().end, + self.lsn() + ) + } +} + impl std::fmt::Debug for ImageLayerInner { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("ImageLayerInner") @@ -1024,6 +1035,10 @@ pub struct ImageLayerIterator<'a> { } impl<'a> ImageLayerIterator<'a> { + pub(crate) fn layer_dbg_info(&self) -> String { + self.image_layer.layer_dbg_info() + } + /// Retrieve a batch of key-value pairs into the iterator buffer. async fn next_batch(&mut self) -> anyhow::Result<()> { assert!(self.key_values_batch.is_empty()); diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs index b4bd976033..d2c341e5ce 100644 --- a/pageserver/src/tenant/storage_layer/merge_iterator.rs +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -3,6 +3,7 @@ use std::{ collections::{binary_heap, BinaryHeap}, }; +use anyhow::bail; use pageserver_api::key::Key; use utils::lsn::Lsn; @@ -26,6 +27,13 @@ impl<'a> LayerRef<'a> { Self::Delta(x) => LayerIterRef::Delta(x.iter(ctx)), } } + + fn layer_dbg_info(&self) -> String { + match self { + Self::Image(x) => x.layer_dbg_info(), + Self::Delta(x) => x.layer_dbg_info(), + } + } } enum LayerIterRef<'a> { @@ -40,6 +48,13 @@ impl LayerIterRef<'_> { Self::Image(x) => x.next().await, } } + + fn layer_dbg_info(&self) -> String { + match self { + Self::Image(x) => x.layer_dbg_info(), + Self::Delta(x) => x.layer_dbg_info(), + } + } } /// This type plays several roles at once @@ -75,6 +90,11 @@ impl<'a> PeekableLayerIterRef<'a> { async fn next(&mut self) -> anyhow::Result> { let result = self.peeked.take(); self.peeked = self.iter.next().await?; + if let (Some((k1, l1, _)), Some((k2, l2, _))) = (&self.peeked, &result) { + if (k1, l1) < (k2, l2) { + bail!("iterator is not ordered: {}", self.iter.layer_dbg_info()); + } + } Ok(result) } } @@ -178,7 +198,12 @@ impl<'a> IteratorWrapper<'a> { let iter = PeekableLayerIterRef::create(iter).await?; if let Some((k1, l1, _)) = iter.peek() { let (k2, l2) = first_key_lower_bound; - debug_assert!((k1, l1) >= (k2, l2)); + if (k1, l1) < (k2, l2) { + bail!( + "layer key range did not include the first key in the layer: {}", + layer.layer_dbg_info() + ); + } } *self = Self::Loaded { iter }; Ok(()) From c1cb7a0fa0d0bb6b58aa0f3e0979905476a19225 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 23 Aug 2024 18:01:02 +0100 Subject: [PATCH 1492/1571] proxy: flesh out JWT verification code (#8805) This change adds in the necessary verification steps for the JWT payload, and adds per-role querying of JWKs as needed for #8736 --- proxy/src/auth/backend/jwt.rs | 295 +++++++++++++++++++++++----------- 1 file changed, 203 insertions(+), 92 deletions(-) diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index e021a7e23f..49d5de16c3 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -1,15 +1,21 @@ -use std::{future::Future, sync::Arc, 
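The merge-iterator hardening above promotes what used to be a `debug_assert!` into a hard error and additionally checks that successive items from each layer iterator never go backwards, reporting the offending layer's debug info instead of letting compaction consume corrupted input. A minimal sketch of that ordering guard around an arbitrary `(key, lsn, value)` iterator, with hypothetical names:

```rust
// Sketch only: a generic ordering guard, not the pageserver merge iterator.
use anyhow::bail;

struct OrderedIter<I> {
    inner: I,
    last: Option<(u64, u64)>, // (key, lsn) of the previously returned item
    layer_dbg_info: String,
}

impl<I: Iterator<Item = (u64, u64, Vec<u8>)>> OrderedIter<I> {
    fn next(&mut self) -> anyhow::Result<Option<(u64, u64, Vec<u8>)>> {
        let Some((key, lsn, value)) = self.inner.next() else {
            return Ok(None);
        };
        if let Some(prev) = self.last {
            if (key, lsn) < prev {
                bail!("iterator is not ordered: {}", self.layer_dbg_info);
            }
        }
        self.last = Some((key, lsn));
        Ok(Some((key, lsn, value)))
    }
}

fn main() {
    let data: Vec<(u64, u64, Vec<u8>)> =
        vec![(1, 10, vec![]), (2, 5, vec![]), (1, 20, vec![])];
    let mut it = OrderedIter {
        inner: data.into_iter(),
        last: None,
        layer_dbg_info: "delta 1..3 10..20".to_string(),
    };
    assert!(it.next().is_ok()); // (1, 10)
    assert!(it.next().is_ok()); // (2, 5): key advanced, fine
    assert!(it.next().is_err()); // (1, 20) goes backwards: corruption detected
}
```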
time::Duration}; +use std::{ + future::Future, + sync::Arc, + time::{Duration, SystemTime}, +}; use anyhow::{bail, ensure, Context}; use arc_swap::ArcSwapOption; use dashmap::DashMap; use jose_jwk::crypto::KeyInfo; +use serde::{Deserialize, Deserializer}; use signature::Verifier; use tokio::time::Instant; -use crate::{http::parse_json_body_with_limit, intern::EndpointIdInt}; +use crate::{context::RequestMonitoring, http::parse_json_body_with_limit, EndpointId, RoleName}; // TODO(conrad): make these configurable. +const CLOCK_SKEW_LEEWAY: Duration = Duration::from_secs(30); const MIN_RENEW: Duration = Duration::from_secs(30); const AUTO_RENEW: Duration = Duration::from_secs(300); const MAX_RENEW: Duration = Duration::from_secs(3600); @@ -17,30 +23,56 @@ const MAX_JWK_BODY_SIZE: usize = 64 * 1024; /// How to get the JWT auth rules pub trait FetchAuthRules: Clone + Send + Sync + 'static { - fn fetch_auth_rules(&self) -> impl Future> + Send; + fn fetch_auth_rules( + &self, + role_name: RoleName, + ) -> impl Future>> + Send; } -#[derive(Clone)] -struct FetchAuthRulesFromCplane { - #[allow(dead_code)] - endpoint: EndpointIdInt, -} - -impl FetchAuthRules for FetchAuthRulesFromCplane { - async fn fetch_auth_rules(&self) -> anyhow::Result { - Err(anyhow::anyhow!("not yet implemented")) - } -} - -pub struct AuthRules { - jwks_urls: Vec, +pub struct AuthRule { + pub id: String, + pub jwks_url: url::Url, + pub audience: Option, } #[derive(Default)] pub struct JwkCache { client: reqwest::Client, - map: DashMap>, + map: DashMap<(EndpointId, RoleName), Arc>, +} + +pub struct JwkCacheEntry { + /// Should refetch at least every hour to verify when old keys have been removed. + /// Should refetch when new key IDs are seen only every 5 minutes or so + last_retrieved: Instant, + + /// cplane will return multiple JWKs urls that we need to scrape. + key_sets: ahash::HashMap, +} + +impl JwkCacheEntry { + fn find_jwk_and_audience(&self, key_id: &str) -> Option<(&jose_jwk::Jwk, Option<&str>)> { + self.key_sets.values().find_map(|key_set| { + key_set + .find_key(key_id) + .map(|jwk| (jwk, key_set.audience.as_deref())) + }) + } +} + +struct KeySet { + jwks: jose_jwk::JwkSet, + audience: Option, +} + +impl KeySet { + fn find_key(&self, key_id: &str) -> Option<&jose_jwk::Jwk> { + self.jwks + .keys + .iter() + .find(|jwk| jwk.prm.kid.as_deref() == Some(key_id)) + } } pub struct JwkCacheEntryLock { @@ -57,15 +89,6 @@ impl Default for JwkCacheEntryLock { } } -pub struct JwkCacheEntry { - /// Should refetch at least every hour to verify when old keys have been removed. - /// Should refetch when new key IDs are seen only every 5 minutes or so - last_retrieved: Instant, - - /// cplane will return multiple JWKs urls that we need to scrape. - key_sets: ahash::HashMap, -} - impl JwkCacheEntryLock { async fn acquire_permit<'a>(self: &'a Arc) -> JwkRenewalPermit<'a> { JwkRenewalPermit::acquire_permit(self).await @@ -79,6 +102,7 @@ impl JwkCacheEntryLock { &self, _permit: JwkRenewalPermit<'_>, client: &reqwest::Client, + role_name: RoleName, auth_rules: &F, ) -> anyhow::Result> { // double check that no one beat us to updating the cache. 
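The constants introduced above include CLOCK_SKEW_LEEWAY, which the payload checks added later in this patch use to widen the `exp` and `nbf` comparisons so that small clock differences between the token issuer and the proxy do not reject otherwise valid tokens. A rough, self-contained sketch of that behaviour follows; the helper names below are hypothetical and not part of the proxy crate.

    use std::time::{Duration, SystemTime};

    const CLOCK_SKEW_LEEWAY: Duration = Duration::from_secs(30);

    fn check_token_times(
        now: SystemTime,
        expiration: Option<SystemTime>,
        not_before: Option<SystemTime>,
    ) -> Result<(), &'static str> {
        if let Some(exp) = expiration {
            // The expiry check is widened by the leeway: a token that expired a few
            // seconds ago is still accepted, tolerating issuer/proxy clock skew.
            if now >= exp + CLOCK_SKEW_LEEWAY {
                return Err("token expired");
            }
        }
        if let Some(nbf) = not_before {
            // The not-before check is widened the same way: a token that becomes
            // valid a few seconds from now is already accepted.
            if nbf >= now + CLOCK_SKEW_LEEWAY {
                return Err("token not yet valid");
            }
        }
        Ok(())
    }

    fn main() {
        let now = SystemTime::now();
        assert!(check_token_times(now, Some(now + Duration::from_secs(3600)), None).is_ok());
        assert!(check_token_times(now, Some(now - Duration::from_secs(3600)), None).is_err());
        assert!(check_token_times(now, None, Some(now + Duration::from_secs(5))).is_ok());
    }
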
@@ -91,20 +115,19 @@ impl JwkCacheEntryLock { } } - let rules = auth_rules.fetch_auth_rules().await?; - let mut key_sets = ahash::HashMap::with_capacity_and_hasher( - rules.jwks_urls.len(), - ahash::RandomState::new(), - ); + let rules = auth_rules.fetch_auth_rules(role_name).await?; + let mut key_sets = + ahash::HashMap::with_capacity_and_hasher(rules.len(), ahash::RandomState::new()); // TODO(conrad): run concurrently // TODO(conrad): strip the JWKs urls (should be checked by cplane as well - cloud#16284) - for url in rules.jwks_urls { - let req = client.get(url.clone()); + for rule in rules { + let req = client.get(rule.jwks_url.clone()); // TODO(conrad): eventually switch to using reqwest_middleware/`new_client_with_timeout`. + // TODO(conrad): We need to filter out URLs that point to local resources. Public internet only. match req.send().await.and_then(|r| r.error_for_status()) { // todo: should we re-insert JWKs if we want to keep this JWKs URL? // I expect these failures would be quite sparse. - Err(e) => tracing::warn!(?url, error=?e, "could not fetch JWKs"), + Err(e) => tracing::warn!(url=?rule.jwks_url, error=?e, "could not fetch JWKs"), Ok(r) => { let resp: http::Response = r.into(); match parse_json_body_with_limit::( @@ -113,9 +136,17 @@ impl JwkCacheEntryLock { ) .await { - Err(e) => tracing::warn!(?url, error=?e, "could not decode JWKs"), + Err(e) => { + tracing::warn!(url=?rule.jwks_url, error=?e, "could not decode JWKs"); + } Ok(jwks) => { - key_sets.insert(url, jwks); + key_sets.insert( + rule.id, + KeySet { + jwks, + audience: rule.audience, + }, + ); } } } @@ -133,7 +164,9 @@ impl JwkCacheEntryLock { async fn get_or_update_jwk_cache( self: &Arc, + ctx: &RequestMonitoring, client: &reqwest::Client, + role_name: RoleName, fetch: &F, ) -> Result, anyhow::Error> { let now = Instant::now(); @@ -141,18 +174,20 @@ impl JwkCacheEntryLock { // if we have no cached JWKs, try and get some let Some(cached) = guard else { + let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let permit = self.acquire_permit().await; - return self.renew_jwks(permit, client, fetch).await; + return self.renew_jwks(permit, client, role_name, fetch).await; }; let last_update = now.duration_since(cached.last_retrieved); // check if the cached JWKs need updating. if last_update > MAX_RENEW { + let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let permit = self.acquire_permit().await; // it's been too long since we checked the keys. wait for them to update. - return self.renew_jwks(permit, client, fetch).await; + return self.renew_jwks(permit, client, role_name, fetch).await; } // every 5 minutes we should spawn a job to eagerly update the token. 
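The cache-refresh policy implemented by get_or_update_jwk_cache comes down to three tiers driven by the constants defined earlier in this patch: fetch synchronously when nothing is cached or the cache is older than MAX_RENEW, refresh in a background task once it is older than AUTO_RENEW, and otherwise serve straight from the cache, with MIN_RENEW rate-limiting refetches triggered by unknown key ids. A minimal sketch of that decision, using hypothetical names rather than the crate's actual API:

    use std::time::Duration;

    const MIN_RENEW: Duration = Duration::from_secs(30);
    const AUTO_RENEW: Duration = Duration::from_secs(300);
    const MAX_RENEW: Duration = Duration::from_secs(3600);

    #[derive(Debug, PartialEq)]
    enum CacheAction {
        /// Nothing cached (or far too stale): block the request on a fetch.
        BlockingFetch,
        /// Usable but ageing: serve from cache and refresh in a background task.
        BackgroundRefresh,
        /// Fresh enough: serve from cache.
        UseCache,
    }

    fn cache_action(age: Option<Duration>) -> CacheAction {
        match age {
            None => CacheAction::BlockingFetch,
            Some(age) if age > MAX_RENEW => CacheAction::BlockingFetch,
            Some(age) if age > AUTO_RENEW => CacheAction::BackgroundRefresh,
            Some(_) => CacheAction::UseCache,
        }
    }

    /// Separately, when a token references a key id that is not in the cached key
    /// sets, a refetch is only attempted if the cache is older than MIN_RENEW;
    /// otherwise the token is rejected outright.
    fn may_refetch_for_unknown_kid(age: Duration) -> bool {
        age > MIN_RENEW
    }

    fn main() {
        assert_eq!(cache_action(None), CacheAction::BlockingFetch);
        assert_eq!(cache_action(Some(Duration::from_secs(7200))), CacheAction::BlockingFetch);
        assert_eq!(cache_action(Some(Duration::from_secs(600))), CacheAction::BackgroundRefresh);
        assert_eq!(cache_action(Some(Duration::from_secs(60))), CacheAction::UseCache);
        assert!(!may_refetch_for_unknown_kid(Duration::from_secs(10)));
    }
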
@@ -164,7 +199,7 @@ impl JwkCacheEntryLock { let client = client.clone(); let fetch = fetch.clone(); tokio::spawn(async move { - if let Err(e) = entry.renew_jwks(permit, &client, &fetch).await { + if let Err(e) = entry.renew_jwks(permit, &client, role_name, &fetch).await { tracing::warn!(error=?e, "could not fetch JWKs in background job"); } }); @@ -178,8 +213,10 @@ impl JwkCacheEntryLock { async fn check_jwt( self: &Arc, - jwt: String, + ctx: &RequestMonitoring, + jwt: &str, client: &reqwest::Client, + role_name: RoleName, fetch: &F, ) -> Result<(), anyhow::Error> { // JWT compact form is defined to be @@ -189,36 +226,36 @@ impl JwkCacheEntryLock { let (header_payload, signature) = jwt .rsplit_once(".") .context("Provided authentication token is not a valid JWT encoding")?; - let (header, _payload) = header_payload + let (header, payload) = header_payload .split_once(".") .context("Provided authentication token is not a valid JWT encoding")?; let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD) .context("Provided authentication token is not a valid JWT encoding")?; - let header = serde_json::from_slice::>(&header) + let header = serde_json::from_slice::>(&header) .context("Provided authentication token is not a valid JWT encoding")?; let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD) .context("Provided authentication token is not a valid JWT encoding")?; ensure!(header.typ == "JWT"); - let kid = header.kid.context("missing key id")?; + let kid = header.key_id.context("missing key id")?; - let mut guard = self.get_or_update_jwk_cache(client, fetch).await?; + let mut guard = self + .get_or_update_jwk_cache(ctx, client, role_name.clone(), fetch) + .await?; // get the key from the JWKs if possible. If not, wait for the keys to update. - let jwk = loop { - let jwk = guard - .key_sets - .values() - .flat_map(|jwks| &jwks.keys) - .find(|jwk| jwk.prm.kid.as_deref() == Some(kid)); - - match jwk { + let (jwk, expected_audience) = loop { + match guard.find_jwk_and_audience(kid) { Some(jwk) => break jwk, None if guard.last_retrieved.elapsed() > MIN_RENEW => { + let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); + let permit = self.acquire_permit().await; - guard = self.renew_jwks(permit, client, fetch).await?; + guard = self + .renew_jwks(permit, client, role_name.clone(), fetch) + .await?; } _ => { bail!("jwk not found"); @@ -227,7 +264,7 @@ impl JwkCacheEntryLock { }; ensure!( - jwk.is_supported(&header.alg), + jwk.is_supported(&header.algorithm), "signature algorithm not supported" ); @@ -241,31 +278,60 @@ impl JwkCacheEntryLock { key => bail!("unsupported key type {key:?}"), }; - // TODO(conrad): verify iss, exp, nbf, etc... 
+ let payload = base64::decode_config(payload, base64::URL_SAFE_NO_PAD) + .context("Provided authentication token is not a valid JWT encoding")?; + let payload = serde_json::from_slice::>(&payload) + .context("Provided authentication token is not a valid JWT encoding")?; + + tracing::debug!(?payload, "JWT signature valid with claims"); + + match (expected_audience, payload.audience) { + // check the audience matches + (Some(aud1), Some(aud2)) => ensure!(aud1 == aud2, "invalid JWT token audience"), + // the audience is expected but is missing + (Some(_), None) => bail!("invalid JWT token audience"), + // we don't care for the audience field + (None, _) => {} + } + + let now = SystemTime::now(); + + if let Some(exp) = payload.expiration { + ensure!(now < exp + CLOCK_SKEW_LEEWAY); + } + + if let Some(nbf) = payload.not_before { + ensure!(nbf < now + CLOCK_SKEW_LEEWAY); + } Ok(()) } } impl JwkCache { - pub async fn check_jwt( + pub async fn check_jwt( &self, - endpoint: EndpointIdInt, - jwt: String, + ctx: &RequestMonitoring, + endpoint: EndpointId, + role_name: RoleName, + fetch: &F, + jwt: &str, ) -> Result<(), anyhow::Error> { // try with just a read lock first - let entry = self.map.get(&endpoint).as_deref().map(Arc::clone); + let key = (endpoint, role_name.clone()); + let entry = self.map.get(&key).as_deref().map(Arc::clone); let entry = match entry { Some(entry) => entry, None => { // acquire a write lock after to insert. - let entry = self.map.entry(endpoint).or_default(); + let entry = self.map.entry(key).or_default(); Arc::clone(&*entry) } }; - let fetch = FetchAuthRulesFromCplane { endpoint }; - entry.check_jwt(jwt, &self.client, &fetch).await + entry + .check_jwt(ctx, jwt, &self.client, role_name, fetch) + .await } } @@ -315,13 +381,49 @@ fn verify_rsa_signature( /// #[derive(serde::Deserialize, serde::Serialize)] -struct JWTHeader<'a> { +struct JwtHeader<'a> { /// must be "JWT" + #[serde(rename = "typ")] typ: &'a str, /// must be a supported alg - alg: jose_jwa::Algorithm, + #[serde(rename = "alg")] + algorithm: jose_jwa::Algorithm, /// key id, must be provided for our usecase - kid: Option<&'a str>, + #[serde(rename = "kid")] + key_id: Option<&'a str>, +} + +/// +#[derive(serde::Deserialize, serde::Serialize, Debug)] +struct JwtPayload<'a> { + /// Audience - Recipient for which the JWT is intended + #[serde(rename = "aud")] + audience: Option<&'a str>, + /// Expiration - Time after which the JWT expires + #[serde(deserialize_with = "numeric_date_opt", rename = "exp", default)] + expiration: Option, + /// Not before - Time after which the JWT expires + #[serde(deserialize_with = "numeric_date_opt", rename = "nbf", default)] + not_before: Option, + + // the following entries are only extracted for the sake of debug logging. 
+ /// Issuer of the JWT + #[serde(rename = "iss")] + issuer: Option<&'a str>, + /// Subject of the JWT (the user) + #[serde(rename = "sub")] + subject: Option<&'a str>, + /// Unique token identifier + #[serde(rename = "jti")] + jwt_id: Option<&'a str>, + /// Unique session identifier + #[serde(rename = "sid")] + session_id: Option<&'a str>, +} + +fn numeric_date_opt<'de, D: Deserializer<'de>>(d: D) -> Result, D::Error> { + let d = >::deserialize(d)?; + Ok(d.map(|n| SystemTime::UNIX_EPOCH + Duration::from_secs(n))) } struct JwkRenewalPermit<'a> { @@ -388,6 +490,8 @@ impl Drop for JwkRenewalPermit<'_> { #[cfg(test)] mod tests { + use crate::RoleName; + use super::*; use std::{future::IntoFuture, net::SocketAddr, time::SystemTime}; @@ -431,10 +535,10 @@ mod tests { } fn build_jwt_payload(kid: String, sig: jose_jwa::Signing) -> String { - let header = JWTHeader { + let header = JwtHeader { typ: "JWT", - alg: jose_jwa::Algorithm::Signing(sig), - kid: Some(&kid), + algorithm: jose_jwa::Algorithm::Signing(sig), + key_id: Some(&kid), }; let body = typed_json::json! {{ "exp": SystemTime::now().duration_since(SystemTime::UNIX_EPOCH).unwrap().as_secs() + 3600, @@ -524,33 +628,40 @@ mod tests { struct Fetch(SocketAddr); impl FetchAuthRules for Fetch { - async fn fetch_auth_rules(&self) -> anyhow::Result { - Ok(AuthRules { - jwks_urls: vec![ - format!("http://{}/foo", self.0).parse().unwrap(), - format!("http://{}/bar", self.0).parse().unwrap(), - ], - }) + async fn fetch_auth_rules( + &self, + _role_name: RoleName, + ) -> anyhow::Result> { + Ok(vec![ + AuthRule { + id: "foo".to_owned(), + jwks_url: format!("http://{}/foo", self.0).parse().unwrap(), + audience: None, + }, + AuthRule { + id: "bar".to_owned(), + jwks_url: format!("http://{}/bar", self.0).parse().unwrap(), + audience: None, + }, + ]) } } + let role_name = RoleName::from("user"); + let jwk_cache = Arc::new(JwkCacheEntryLock::default()); - jwk_cache - .check_jwt(jwt1, &client, &Fetch(addr)) - .await - .unwrap(); - jwk_cache - .check_jwt(jwt2, &client, &Fetch(addr)) - .await - .unwrap(); - jwk_cache - .check_jwt(jwt3, &client, &Fetch(addr)) - .await - .unwrap(); - jwk_cache - .check_jwt(jwt4, &client, &Fetch(addr)) - .await - .unwrap(); + for token in [jwt1, jwt2, jwt3, jwt4] { + jwk_cache + .check_jwt( + &RequestMonitoring::test(), + &token, + &client, + role_name.clone(), + &Fetch(addr), + ) + .await + .unwrap(); + } } } From b65a95f12ef958a509e30f0d650a820b4e2e8c58 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 23 Aug 2024 18:32:56 +0100 Subject: [PATCH 1493/1571] controller: use PageserverUtilization for scheduling (#8711) ## Problem Previously, the controller only used the shard counts for scheduling. This works well when hosting only many-sharded tenants, but works much less well when hosting single-sharded tenants that have a greater deviation in size-per-shard. Closes: https://github.com/neondatabase/neon/issues/7798 ## Summary of changes - Instead of UtilizationScore, carry the full PageserverUtilization through into the Scheduler. - Use the PageserverUtilization::score() instead of shard count when ordering nodes in scheduling. Q: Why did test_sharding_split_smoke need updating in this PR? A: There's an interesting side effect during shard splits: because we do not decrement the shard count in the utilization when we de-schedule the shards from before the split, the controller will now prefer to pick _different_ nodes for shards compared with which ones held secondaries before the split. 
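In other words, the scheduler now ranks candidate nodes by (affinity score, utilization score, attached shard count, node id) and takes the lowest, so a stale, not-yet-decremented shard count in the utilization is enough to push a node down the order. A minimal sketch of that ranking, with hypothetical tuple-typed candidates standing in for the real scheduler state (lower is better for every component):

    /// Candidate node: (node id, affinity score, utilization score, attached shard count).
    type Candidate = (u64, u64, u64, usize);

    fn pick_node(mut candidates: Vec<Candidate>) -> Option<u64> {
        candidates.sort_by_key(|&(id, affinity, utilization, attached)| {
            // Node id is last purely to make the choice deterministic.
            (affinity, utilization, attached, id)
        });
        candidates.first().map(|&(id, ..)| id)
    }

    fn main() {
        // Equal affinity: the node with the lower utilization score wins, even though
        // it currently has more shards attached.
        assert_eq!(pick_node(vec![(1, 0, 500_000, 1), (2, 0, 100_000, 3)]), Some(2));
        // Affinity still dominates: a less-loaded node is skipped if it already hosts
        // shards of the same tenant (higher affinity score).
        assert_eq!(pick_node(vec![(1, 100, 10_000, 0), (2, 0, 400_000, 2)]), Some(2));
    }
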
We could use our knowledge of splitting to fix up the utilizations more actively in this situation, but I'm leaning toward leaving the code simpler, as in practical systems the impact of one shard on the utilization of a node should be fairly low (single digit %). --- libs/pageserver_api/src/controller_api.rs | 21 +- libs/pageserver_api/src/models/utilization.rs | 67 +++++- pageserver/src/metrics.rs | 8 + pageserver/src/tenant.rs | 18 +- pageserver/src/utilization.rs | 10 +- storage_controller/src/heartbeater.rs | 10 +- storage_controller/src/node.rs | 24 +- storage_controller/src/scheduler.rs | 225 +++++++++++++++--- storage_controller/src/service.rs | 39 +-- storage_controller/src/tenant_shard.rs | 4 +- test_runner/regress/test_sharding.py | 15 +- 11 files changed, 340 insertions(+), 101 deletions(-) diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index a50707a1b8..a9a57d77ce 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -8,6 +8,7 @@ use std::time::{Duration, Instant}; use serde::{Deserialize, Serialize}; use utils::id::{NodeId, TenantId}; +use crate::models::PageserverUtilization; use crate::{ models::{ShardParameters, TenantConfig}, shard::{ShardStripeSize, TenantShardId}, @@ -140,23 +141,11 @@ pub struct TenantShardMigrateRequest { pub node_id: NodeId, } -/// Utilisation score indicating how good a candidate a pageserver -/// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`]. -/// Lower values are better. -#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Debug)] -pub struct UtilizationScore(pub u64); - -impl UtilizationScore { - pub fn worst() -> Self { - UtilizationScore(u64::MAX) - } -} - -#[derive(Serialize, Clone, Copy, Debug)] +#[derive(Serialize, Clone, Debug)] #[serde(into = "NodeAvailabilityWrapper")] pub enum NodeAvailability { // Normal, happy state - Active(UtilizationScore), + Active(PageserverUtilization), // Node is warming up, but we expect it to become available soon. Covers // the time span between the re-attach response being composed on the storage controller // and the first successful heartbeat after the processing of the re-attach response @@ -195,7 +184,9 @@ impl From for NodeAvailability { match val { // Assume the worst utilisation score to begin with. It will later be updated by // the heartbeats. - NodeAvailabilityWrapper::Active => NodeAvailability::Active(UtilizationScore::worst()), + NodeAvailabilityWrapper::Active => { + NodeAvailability::Active(PageserverUtilization::full()) + } NodeAvailabilityWrapper::WarmingUp => NodeAvailability::WarmingUp(Instant::now()), NodeAvailabilityWrapper::Offline => NodeAvailability::Offline, } diff --git a/libs/pageserver_api/src/models/utilization.rs b/libs/pageserver_api/src/models/utilization.rs index 0fec221276..844a0cda5d 100644 --- a/libs/pageserver_api/src/models/utilization.rs +++ b/libs/pageserver_api/src/models/utilization.rs @@ -38,7 +38,7 @@ pub struct PageserverUtilization { pub max_shard_count: u32, /// Cached result of [`Self::score`] - pub utilization_score: u64, + pub utilization_score: Option, /// When was this snapshot captured, pageserver local time. 
/// @@ -50,6 +50,8 @@ fn unity_percent() -> Percent { Percent::new(0).unwrap() } +pub type RawScore = u64; + impl PageserverUtilization { const UTILIZATION_FULL: u64 = 1000000; @@ -62,7 +64,7 @@ impl PageserverUtilization { /// - Negative values are forbidden /// - Values over UTILIZATION_FULL indicate an overloaded node, which may show degraded performance due to /// layer eviction. - pub fn score(&self) -> u64 { + pub fn score(&self) -> RawScore { let disk_usable_capacity = ((self.disk_usage_bytes + self.free_space_bytes) * self.disk_usable_pct.get() as u64) / 100; @@ -74,8 +76,30 @@ impl PageserverUtilization { std::cmp::max(disk_utilization_score, shard_utilization_score) } - pub fn refresh_score(&mut self) { - self.utilization_score = self.score(); + pub fn cached_score(&mut self) -> RawScore { + match self.utilization_score { + None => { + let s = self.score(); + self.utilization_score = Some(s); + s + } + Some(s) => s, + } + } + + /// If a node is currently hosting more work than it can comfortably handle. This does not indicate that + /// it will fail, but it is a strong signal that more work should not be added unless there is no alternative. + pub fn is_overloaded(score: RawScore) -> bool { + score >= Self::UTILIZATION_FULL + } + + pub fn adjust_shard_count_max(&mut self, shard_count: u32) { + if self.shard_count < shard_count { + self.shard_count = shard_count; + + // Dirty cache: this will be calculated next time someone retrives the score + self.utilization_score = None; + } } /// A utilization structure that has a full utilization score: use this as a placeholder when @@ -88,7 +112,38 @@ impl PageserverUtilization { disk_usable_pct: Percent::new(100).unwrap(), shard_count: 1, max_shard_count: 1, - utilization_score: Self::UTILIZATION_FULL, + utilization_score: Some(Self::UTILIZATION_FULL), + captured_at: serde_system_time::SystemTime(SystemTime::now()), + } + } +} + +/// Test helper +pub mod test_utilization { + use super::PageserverUtilization; + use std::time::SystemTime; + use utils::{ + serde_percent::Percent, + serde_system_time::{self}, + }; + + // Parameters of the imaginary node used for test utilization instances + const TEST_DISK_SIZE: u64 = 1024 * 1024 * 1024 * 1024; + const TEST_SHARDS_MAX: u32 = 1000; + + /// Unit test helper. Unconditionally compiled because cfg(test) doesn't carry across crates. Do + /// not abuse this function from non-test code. + /// + /// Emulates a node with a 1000 shard limit and a 1TB disk. 
+ pub fn simple(shard_count: u32, disk_wanted_bytes: u64) -> PageserverUtilization { + PageserverUtilization { + disk_usage_bytes: disk_wanted_bytes, + free_space_bytes: TEST_DISK_SIZE - std::cmp::min(disk_wanted_bytes, TEST_DISK_SIZE), + disk_wanted_bytes, + disk_usable_pct: Percent::new(100).unwrap(), + shard_count, + max_shard_count: TEST_SHARDS_MAX, + utilization_score: None, captured_at: serde_system_time::SystemTime(SystemTime::now()), } } @@ -120,7 +175,7 @@ mod tests { disk_usage_bytes: u64::MAX, free_space_bytes: 0, disk_wanted_bytes: u64::MAX, - utilization_score: 13, + utilization_score: Some(13), disk_usable_pct: Percent::new(90).unwrap(), shard_count: 100, max_shard_count: 200, diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 0a1a22b6e8..1f8634df93 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1803,6 +1803,14 @@ pub(crate) static SECONDARY_RESIDENT_PHYSICAL_SIZE: Lazy = Lazy::n .expect("failed to define a metric") }); +pub(crate) static NODE_UTILIZATION_SCORE: Lazy = Lazy::new(|| { + register_uint_gauge!( + "pageserver_utilization_score", + "The utilization score we report to the storage controller for scheduling, where 0 is empty, 1000000 is full, and anything above is considered overloaded", + ) + .expect("failed to define a metric") +}); + pub(crate) static SECONDARY_HEATMAP_TOTAL_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_secondary_heatmap_total_size", diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 2e19a46ac8..3a7afff211 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3741,13 +3741,21 @@ impl Tenant { /// less than this (via eviction and on-demand downloads), but this function enables /// the Tenant to advertise how much storage it would prefer to have to provide fast I/O /// by keeping important things on local disk. + /// + /// This is a heuristic, not a guarantee: tenants that are long-idle will actually use less + /// than they report here, due to layer eviction. Tenants with many active branches may + /// actually use more than they report here. pub(crate) fn local_storage_wanted(&self) -> u64 { - let mut wanted = 0; let timelines = self.timelines.lock().unwrap(); - for timeline in timelines.values() { - wanted += timeline.metrics.visible_physical_size_gauge.get(); - } - wanted + + // Heuristic: we use the max() of the timelines' visible sizes, rather than the sum. This + // reflects the observation that on tenants with multiple large branches, typically only one + // of them is used actively enough to occupy space on disk. 
+ timelines + .values() + .map(|t| t.metrics.visible_physical_size_gauge.get()) + .max() + .unwrap_or(0) } } diff --git a/pageserver/src/utilization.rs b/pageserver/src/utilization.rs index 3c48c84598..a0223f3bce 100644 --- a/pageserver/src/utilization.rs +++ b/pageserver/src/utilization.rs @@ -9,7 +9,7 @@ use utils::serde_percent::Percent; use pageserver_api::models::PageserverUtilization; -use crate::{config::PageServerConf, tenant::mgr::TenantManager}; +use crate::{config::PageServerConf, metrics::NODE_UTILIZATION_SCORE, tenant::mgr::TenantManager}; pub(crate) fn regenerate( conf: &PageServerConf, @@ -58,13 +58,13 @@ pub(crate) fn regenerate( disk_usable_pct, shard_count, max_shard_count: MAX_SHARDS, - utilization_score: 0, + utilization_score: None, captured_at: utils::serde_system_time::SystemTime(captured_at), }; - doc.refresh_score(); - - // TODO: make utilization_score into a metric + // Initialize `PageserverUtilization::utilization_score` + let score = doc.cached_score(); + NODE_UTILIZATION_SCORE.set(score); Ok(doc) } diff --git a/storage_controller/src/heartbeater.rs b/storage_controller/src/heartbeater.rs index c0e27bafdb..b7e66d33eb 100644 --- a/storage_controller/src/heartbeater.rs +++ b/storage_controller/src/heartbeater.rs @@ -6,10 +6,7 @@ use std::{ }; use tokio_util::sync::CancellationToken; -use pageserver_api::{ - controller_api::{NodeAvailability, UtilizationScore}, - models::PageserverUtilization, -}; +use pageserver_api::{controller_api::NodeAvailability, models::PageserverUtilization}; use thiserror::Error; use utils::id::NodeId; @@ -147,7 +144,8 @@ impl HeartbeaterTask { // goes through to the pageserver even when the node is marked offline. // This doesn't impact the availability observed by [`crate::service::Service`]. let mut node_clone = node.clone(); - node_clone.set_availability(NodeAvailability::Active(UtilizationScore::worst())); + node_clone + .set_availability(NodeAvailability::Active(PageserverUtilization::full())); async move { let response = node_clone @@ -179,7 +177,7 @@ impl HeartbeaterTask { node.get_availability() { PageserverState::WarmingUp { - started_at: last_seen_at, + started_at: *last_seen_at, } } else { PageserverState::Offline diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index ea765ca123..61a44daca9 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -92,15 +92,15 @@ impl Node { } } - pub(crate) fn get_availability(&self) -> NodeAvailability { - self.availability + pub(crate) fn get_availability(&self) -> &NodeAvailability { + &self.availability } pub(crate) fn set_availability(&mut self, availability: NodeAvailability) { use AvailabilityTransition::*; use NodeAvailability::WarmingUp; - match self.get_availability_transition(availability) { + match self.get_availability_transition(&availability) { ToActive => { // Give the node a new cancellation token, effectively resetting it to un-cancelled. Any // users of previously-cloned copies of the node will still see the old cancellation @@ -115,8 +115,8 @@ impl Node { Unchanged | ToWarmingUpFromOffline => {} } - if let (WarmingUp(crnt), WarmingUp(proposed)) = (self.availability, availability) { - self.availability = WarmingUp(std::cmp::max(crnt, proposed)); + if let (WarmingUp(crnt), WarmingUp(proposed)) = (&self.availability, &availability) { + self.availability = WarmingUp(std::cmp::max(*crnt, *proposed)); } else { self.availability = availability; } @@ -126,12 +126,12 @@ impl Node { /// into a description of the transition. 
pub(crate) fn get_availability_transition( &self, - availability: NodeAvailability, + availability: &NodeAvailability, ) -> AvailabilityTransition { use AvailabilityTransition::*; use NodeAvailability::*; - match (self.availability, availability) { + match (&self.availability, availability) { (Offline, Active(_)) => ToActive, (Active(_), Offline) => ToOffline, (Active(_), WarmingUp(_)) => ToWarmingUpFromActive, @@ -153,15 +153,15 @@ impl Node { /// Is this node elegible to have work scheduled onto it? pub(crate) fn may_schedule(&self) -> MaySchedule { - let score = match self.availability { - NodeAvailability::Active(score) => score, + let utilization = match &self.availability { + NodeAvailability::Active(u) => u.clone(), NodeAvailability::Offline | NodeAvailability::WarmingUp(_) => return MaySchedule::No, }; match self.scheduling { - NodeSchedulingPolicy::Active => MaySchedule::Yes(score), + NodeSchedulingPolicy::Active => MaySchedule::Yes(utilization), NodeSchedulingPolicy::Draining => MaySchedule::No, - NodeSchedulingPolicy::Filling => MaySchedule::Yes(score), + NodeSchedulingPolicy::Filling => MaySchedule::Yes(utilization), NodeSchedulingPolicy::Pause => MaySchedule::No, NodeSchedulingPolicy::PauseForRestart => MaySchedule::No, } @@ -285,7 +285,7 @@ impl Node { pub(crate) fn describe(&self) -> NodeDescribeResponse { NodeDescribeResponse { id: self.id, - availability: self.availability.into(), + availability: self.availability.clone().into(), scheduling: self.scheduling, listen_http_addr: self.listen_http_addr.clone(), listen_http_port: self.listen_http_port, diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index 843159010d..060e3cc6ca 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -1,6 +1,6 @@ use crate::{node::Node, tenant_shard::TenantShard}; use itertools::Itertools; -use pageserver_api::controller_api::UtilizationScore; +use pageserver_api::models::PageserverUtilization; use serde::Serialize; use std::collections::HashMap; use utils::{http::error::ApiError, id::NodeId}; @@ -20,9 +20,9 @@ impl From for ApiError { } } -#[derive(Serialize, Eq, PartialEq)] +#[derive(Serialize)] pub enum MaySchedule { - Yes(UtilizationScore), + Yes(PageserverUtilization), No, } @@ -282,6 +282,28 @@ impl Scheduler { node.shard_count -= 1; } } + + // Maybe update PageserverUtilization + match update { + RefCountUpdate::AddSecondary | RefCountUpdate::Attach => { + // Referencing the node: if this takes our shard_count above the utilzation structure's + // shard count, then artifically bump it: this ensures that the scheduler immediately + // recognizes that this node has more work on it, without waiting for the next heartbeat + // to update the utilization. + if let MaySchedule::Yes(utilization) = &mut node.may_schedule { + utilization.adjust_shard_count_max(node.shard_count as u32); + } + } + RefCountUpdate::PromoteSecondary + | RefCountUpdate::Detach + | RefCountUpdate::RemoveSecondary + | RefCountUpdate::DemoteAttached => { + // De-referencing the node: leave the utilization's shard_count at a stale higher + // value until some future heartbeat after we have physically removed this shard + // from the node: this prevents the scheduler over-optimistically trying to schedule + // more work onto the node before earlier detaches are done. 
+ } + } } // Check if the number of shards attached to a given node is lagging below @@ -326,7 +348,18 @@ impl Scheduler { use std::collections::hash_map::Entry::*; match self.nodes.entry(node.get_id()) { Occupied(mut entry) => { - entry.get_mut().may_schedule = node.may_schedule(); + // Updates to MaySchedule are how we receive updated PageserverUtilization: adjust these values + // to account for any shards scheduled on the controller but not yet visible to the pageserver. + let mut may_schedule = node.may_schedule(); + match &mut may_schedule { + MaySchedule::Yes(utilization) => { + utilization.adjust_shard_count_max(entry.get().shard_count as u32); + } + MaySchedule::No => { // Nothing to tweak + } + } + + entry.get_mut().may_schedule = may_schedule; } Vacant(entry) => { entry.insert(SchedulerNode { @@ -363,7 +396,7 @@ impl Scheduler { let may_schedule = self .nodes .get(node_id) - .map(|n| n.may_schedule != MaySchedule::No) + .map(|n| !matches!(n.may_schedule, MaySchedule::No)) .unwrap_or(false); (*node_id, may_schedule) }) @@ -383,7 +416,7 @@ impl Scheduler { /// the same tenant on the same node. This is a soft constraint: the context will never /// cause us to fail to schedule a shard. pub(crate) fn schedule_shard( - &self, + &mut self, hard_exclude: &[NodeId], context: &ScheduleContext, ) -> Result { @@ -391,31 +424,41 @@ impl Scheduler { return Err(ScheduleError::NoPageservers); } - let mut scores: Vec<(NodeId, AffinityScore, usize, usize)> = self + let mut scores: Vec<(NodeId, AffinityScore, u64, usize)> = self .nodes - .iter() - .filter_map(|(k, v)| { - if hard_exclude.contains(k) || v.may_schedule == MaySchedule::No { - None - } else { - Some(( - *k, - context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE), - v.shard_count, - v.attached_shard_count, - )) - } + .iter_mut() + .filter_map(|(k, v)| match &mut v.may_schedule { + MaySchedule::No => None, + MaySchedule::Yes(_) if hard_exclude.contains(k) => None, + MaySchedule::Yes(utilization) => Some(( + *k, + context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE), + utilization.cached_score(), + v.attached_shard_count, + )), }) .collect(); + // Exclude nodes whose utilization is critically high, if there are alternatives available. This will + // cause us to violate affinity rules if it is necessary to avoid critically overloading nodes: for example + // we may place shards in the same tenant together on the same pageserver if all other pageservers are + // overloaded. + let non_overloaded_scores = scores + .iter() + .filter(|i| !PageserverUtilization::is_overloaded(i.2)) + .copied() + .collect::>(); + if !non_overloaded_scores.is_empty() { + scores = non_overloaded_scores; + } + // Sort by, in order of precedence: // 1st: Affinity score. We should never pick a higher-score node if a lower-score node is available - // 2nd: Attached shard count. Within nodes with the same affinity, we always pick the node with - // the least number of attached shards. - // 3rd: Total shard count. Within nodes with the same affinity and attached shard count, use nodes - // with the lower total shard count. + // 2nd: Utilization score (this combines shard count and disk utilization) + // 3rd: Attached shard count. When nodes have identical utilization (e.g. when populating some + // empty nodes), this acts as an anti-affinity between attached shards. // 4th: Node ID. This is a convenience to make selection deterministic in tests and empty systems. 
- scores.sort_by_key(|i| (i.1, i.3, i.2, i.0)); + scores.sort_by_key(|i| (i.1, i.2, i.3, i.0)); if scores.is_empty() { // After applying constraints, no pageservers were left. @@ -429,7 +472,7 @@ impl Scheduler { for (node_id, node) in &self.nodes { tracing::info!( "Node {node_id}: may_schedule={} shards={}", - node.may_schedule != MaySchedule::No, + !matches!(node.may_schedule, MaySchedule::No), node.shard_count ); } @@ -469,7 +512,7 @@ impl Scheduler { pub(crate) mod test_utils { use crate::node::Node; - use pageserver_api::controller_api::{NodeAvailability, UtilizationScore}; + use pageserver_api::{controller_api::NodeAvailability, models::utilization::test_utilization}; use std::collections::HashMap; use utils::id::NodeId; /// Test helper: synthesize the requested number of nodes, all in active state. @@ -486,7 +529,7 @@ pub(crate) mod test_utils { format!("pghost-{i}"), 5432 + i as u16, ); - node.set_availability(NodeAvailability::Active(UtilizationScore::worst())); + node.set_availability(NodeAvailability::Active(test_utilization::simple(0, 0))); assert!(node.is_available()); node }) @@ -497,6 +540,8 @@ pub(crate) mod test_utils { #[cfg(test)] mod tests { + use pageserver_api::{controller_api::NodeAvailability, models::utilization::test_utilization}; + use super::*; use crate::tenant_shard::IntentState; @@ -557,4 +602,130 @@ mod tests { Ok(()) } + + #[test] + /// Test the PageserverUtilization's contribution to scheduling algorithm + fn scheduler_utilization() { + let mut nodes = test_utils::make_test_nodes(3); + let mut scheduler = Scheduler::new(nodes.values()); + + // Need to keep these alive because they contribute to shard counts via RAII + let mut scheduled_intents = Vec::new(); + + let empty_context = ScheduleContext::default(); + + fn assert_scheduler_chooses( + expect_node: NodeId, + scheduled_intents: &mut Vec, + scheduler: &mut Scheduler, + context: &ScheduleContext, + ) { + let scheduled = scheduler.schedule_shard(&[], context).unwrap(); + let mut intent = IntentState::new(); + intent.set_attached(scheduler, Some(scheduled)); + scheduled_intents.push(intent); + assert_eq!(scheduled, expect_node); + } + + // Independent schedule calls onto empty nodes should round-robin, because each node's + // utilization's shard count is updated inline. The order is determinsitic because when all other factors are + // equal, we order by node ID. + assert_scheduler_chooses( + NodeId(1), + &mut scheduled_intents, + &mut scheduler, + &empty_context, + ); + assert_scheduler_chooses( + NodeId(2), + &mut scheduled_intents, + &mut scheduler, + &empty_context, + ); + assert_scheduler_chooses( + NodeId(3), + &mut scheduled_intents, + &mut scheduler, + &empty_context, + ); + + // Manually setting utilization higher should cause schedule calls to round-robin the other nodes + // which have equal utilization. 
+ nodes + .get_mut(&NodeId(1)) + .unwrap() + .set_availability(NodeAvailability::Active(test_utilization::simple( + 10, + 1024 * 1024 * 1024, + ))); + scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap()); + + assert_scheduler_chooses( + NodeId(2), + &mut scheduled_intents, + &mut scheduler, + &empty_context, + ); + assert_scheduler_chooses( + NodeId(3), + &mut scheduled_intents, + &mut scheduler, + &empty_context, + ); + assert_scheduler_chooses( + NodeId(2), + &mut scheduled_intents, + &mut scheduler, + &empty_context, + ); + assert_scheduler_chooses( + NodeId(3), + &mut scheduled_intents, + &mut scheduler, + &empty_context, + ); + + // The scheduler should prefer nodes with lower affinity score, + // even if they have higher utilization (as long as they aren't utilized at >100%) + let mut context_prefer_node1 = ScheduleContext::default(); + context_prefer_node1.avoid(&[NodeId(2), NodeId(3)]); + assert_scheduler_chooses( + NodeId(1), + &mut scheduled_intents, + &mut scheduler, + &context_prefer_node1, + ); + assert_scheduler_chooses( + NodeId(1), + &mut scheduled_intents, + &mut scheduler, + &context_prefer_node1, + ); + + // If a node is over-utilized, it will not be used even if affinity scores prefer it + nodes + .get_mut(&NodeId(1)) + .unwrap() + .set_availability(NodeAvailability::Active(test_utilization::simple( + 20000, + 1024 * 1024 * 1024, + ))); + scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap()); + assert_scheduler_chooses( + NodeId(2), + &mut scheduled_intents, + &mut scheduler, + &context_prefer_node1, + ); + assert_scheduler_chooses( + NodeId(3), + &mut scheduled_intents, + &mut scheduler, + &context_prefer_node1, + ); + + for mut intent in scheduled_intents { + intent.clear(&mut scheduler); + } + } } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 453e96bad3..4b0c556824 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -44,7 +44,7 @@ use pageserver_api::{ NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, TenantCreateRequest, TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest, - TenantShardMigrateRequest, TenantShardMigrateResponse, UtilizationScore, + TenantShardMigrateRequest, TenantShardMigrateResponse, }, models::{SecondaryProgress, TenantConfigRequest, TopTenantShardsRequest}, }; @@ -542,7 +542,7 @@ impl Service { let locked = self.inner.read().unwrap(); locked.nodes.clone() }; - let nodes_online = self.initial_heartbeat_round(all_nodes.keys()).await; + let mut nodes_online = self.initial_heartbeat_round(all_nodes.keys()).await; // List of tenants for which we will attempt to notify compute of their location at startup let mut compute_notifications = Vec::new(); @@ -556,10 +556,8 @@ impl Service { // Mark nodes online if they responded to us: nodes are offline by default after a restart. let mut new_nodes = (**nodes).clone(); for (node_id, node) in new_nodes.iter_mut() { - if let Some(utilization) = nodes_online.get(node_id) { - node.set_availability(NodeAvailability::Active(UtilizationScore( - utilization.utilization_score, - ))); + if let Some(utilization) = nodes_online.remove(node_id) { + node.set_availability(NodeAvailability::Active(utilization)); scheduler.node_upsert(node); } } @@ -925,9 +923,9 @@ impl Service { if let Ok(deltas) = res { for (node_id, state) in deltas.0 { let new_availability = match state { - PageserverState::Available { utilization, .. 
} => NodeAvailability::Active( - UtilizationScore(utilization.utilization_score), - ), + PageserverState::Available { utilization, .. } => { + NodeAvailability::Active(utilization) + } PageserverState::WarmingUp { started_at } => { NodeAvailability::WarmingUp(started_at) } @@ -936,14 +934,17 @@ impl Service { // while the heartbeat round was on-going. Hence, filter out // offline transitions for WarmingUp nodes that are still within // their grace period. - if let Ok(NodeAvailability::WarmingUp(started_at)) = - self.get_node(node_id).await.map(|n| n.get_availability()) + if let Ok(NodeAvailability::WarmingUp(started_at)) = self + .get_node(node_id) + .await + .as_ref() + .map(|n| n.get_availability()) { let now = Instant::now(); - if now - started_at >= self.config.max_warming_up_interval { + if now - *started_at >= self.config.max_warming_up_interval { NodeAvailability::Offline } else { - NodeAvailability::WarmingUp(started_at) + NodeAvailability::WarmingUp(*started_at) } } else { NodeAvailability::Offline @@ -1625,7 +1626,7 @@ impl Service { // This Node is a mutable local copy: we will set it active so that we can use its // API client to reconcile with the node. The Node in [`Self::nodes`] will get updated // later. - node.set_availability(NodeAvailability::Active(UtilizationScore::worst())); + node.set_availability(NodeAvailability::Active(PageserverUtilization::full())); let configs = match node .with_client_retries( @@ -2473,7 +2474,7 @@ impl Service { .await; let node = { - let locked = self.inner.read().unwrap(); + let mut locked = self.inner.write().unwrap(); // Just a sanity check to prevent misuse: the API expects that the tenant is fully // detached everywhere, and nothing writes to S3 storage. Here, we verify that, // but only at the start of the process, so it's really just to prevent operator @@ -2500,7 +2501,7 @@ impl Service { return Err(ApiError::InternalServerError(anyhow::anyhow!("We observed attached={mode:?} tenant in node_id={node_id} shard with tenant_shard_id={shard_id}"))); } } - let scheduler = &locked.scheduler; + let scheduler = &mut locked.scheduler; // Right now we only perform the operation on a single node without parallelization // TODO fan out the operation to multiple nodes for better performance let node_id = scheduler.schedule_shard(&[], &ScheduleContext::default())?; @@ -4761,7 +4762,7 @@ impl Service { // // The transition we calculate here remains valid later in the function because we hold the op lock on the node: // nothing else can mutate its availability while we run. 
- let availability_transition = if let Some(input_availability) = availability { + let availability_transition = if let Some(input_availability) = availability.as_ref() { let (activate_node, availability_transition) = { let locked = self.inner.read().unwrap(); let Some(node) = locked.nodes.get(&node_id) else { @@ -4797,8 +4798,8 @@ impl Service { )); }; - if let Some(availability) = &availability { - node.set_availability(*availability); + if let Some(availability) = availability.as_ref() { + node.set_availability(availability.clone()); } if let Some(scheduling) = scheduling { diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 1fcc3c8547..30723a3b36 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -779,7 +779,7 @@ impl TenantShard { #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] pub(crate) fn optimize_secondary( &self, - scheduler: &Scheduler, + scheduler: &mut Scheduler, schedule_context: &ScheduleContext, ) -> Option { if self.intent.secondary.is_empty() { @@ -1595,7 +1595,7 @@ pub(crate) mod tests { schedule_context.avoid(&shard_b.intent.all_pageservers()); schedule_context.push_attached(shard_b.intent.get_attached().unwrap()); - let optimization_a = shard_a.optimize_secondary(&scheduler, &schedule_context); + let optimization_a = shard_a.optimize_secondary(&mut scheduler, &schedule_context); // Since there is a node with no locations available, the node with two locations for the // same tenant should generate an optimization to move one away diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 1011a6fd22..bfd82242e9 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -394,6 +394,7 @@ def test_sharding_split_smoke( # Note which pageservers initially hold a shard after tenant creation pre_split_pageserver_ids = [loc["node_id"] for loc in env.storage_controller.locate(tenant_id)] + log.info("Pre-split pageservers: {pre_split_pageserver_ids}") # For pageservers holding a shard, validate their ingest statistics # reflect a proper splitting of the WAL. @@ -555,9 +556,9 @@ def test_sharding_split_smoke( assert sum(total.values()) == split_shard_count * 2 check_effective_tenant_config() - # More specific check: that we are fully balanced. This is deterministic because - # the order in which we consider shards for optimization is deterministic, and the - # order of preference of nodes is also deterministic (lower node IDs win). + # More specific check: that we are fully balanced. It is deterministic that we will get exactly + # one shard on each pageserver, because for these small shards the utilization metric is + # dominated by shard count. log.info(f"total: {total}") assert total == { 1: 1, @@ -577,8 +578,14 @@ def test_sharding_split_smoke( 15: 1, 16: 1, } + + # The controller is not required to lay out the attached locations in any particular way, but + # all the pageservers that originally held an attached shard should still hold one, otherwise + # it would indicate that we had done some unnecessary migration. log.info(f"attached: {attached}") - assert attached == {1: 1, 2: 1, 3: 1, 5: 1, 6: 1, 7: 1, 9: 1, 11: 1} + for ps_id in pre_split_pageserver_ids: + log.info("Pre-split pageserver {ps_id} should still hold an attached location") + assert ps_id in attached # Ensure post-split pageserver locations survive a restart (i.e. 
the child shards # correctly wrote config to disk, and the storage controller responds correctly From 0aa14509368d81acf253f406ffafd740bf13b01a Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 23 Aug 2024 18:56:05 +0100 Subject: [PATCH 1494/1571] storage controller: enable timeline CRUD operations to run concurrently with reconciliation & make them safer (#8783) ## Problem - If a reconciler was waiting to be able to notify computes about a change, but the control plane was waiting for the controller to finish a timeline creation/deletion, the overall system can deadlock. - If a tenant shard was migrated concurrently with a timeline creation/deletion, there was a risk that the timeline operation could be applied to a non-latest-generation location, and thereby not really be persistent. This has never happened in practice, but would eventually happen at scale. Closes: #8743 ## Summary of changes - Introduce `Service::tenant_remote_mutation` helper, which looks up shards & generations and passes them into an inner function that may do remote I/O to pageservers. Before returning success, this helper checks that generations haven't incremented, to guarantee that changes are persistent. - Convert tenant_timeline_create, tenant_timeline_delete, and tenant_timeline_detach_ancestor to use this helper. - These functions no longer block on ensure_attached unless the tenant was never attached at all, so they should make progress even if we can't complete compute notifications. This increases the database load from timeline/create operations, but only with cheap read transactions. --- .../down.sql | 2 + .../2024-08-23-170149_tenant_id_index/up.sql | 2 + storage_controller/src/persistence.rs | 38 ++ storage_controller/src/service.rs | 480 +++++++++--------- test_runner/fixtures/neon_fixtures.py | 4 +- .../regress/test_storage_controller.py | 66 ++- 6 files changed, 360 insertions(+), 232 deletions(-) create mode 100644 storage_controller/migrations/2024-08-23-170149_tenant_id_index/down.sql create mode 100644 storage_controller/migrations/2024-08-23-170149_tenant_id_index/up.sql diff --git a/storage_controller/migrations/2024-08-23-170149_tenant_id_index/down.sql b/storage_controller/migrations/2024-08-23-170149_tenant_id_index/down.sql new file mode 100644 index 0000000000..518c747100 --- /dev/null +++ b/storage_controller/migrations/2024-08-23-170149_tenant_id_index/down.sql @@ -0,0 +1,2 @@ +-- This file should undo anything in `up.sql` +DROP INDEX tenant_shards_tenant_id; \ No newline at end of file diff --git a/storage_controller/migrations/2024-08-23-170149_tenant_id_index/up.sql b/storage_controller/migrations/2024-08-23-170149_tenant_id_index/up.sql new file mode 100644 index 0000000000..dd6b37781a --- /dev/null +++ b/storage_controller/migrations/2024-08-23-170149_tenant_id_index/up.sql @@ -0,0 +1,2 @@ +-- Your SQL goes here +CREATE INDEX tenant_shards_tenant_id ON tenant_shards (tenant_id); \ No newline at end of file diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 16df19026c..1a905753a1 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -91,6 +91,7 @@ pub(crate) enum DatabaseOperation { Detach, ReAttach, IncrementGeneration, + PeekGenerations, ListTenantShards, InsertTenantShards, UpdateTenantShard, @@ -502,6 +503,43 @@ impl Persistence { Ok(Generation::new(g as u32)) } + /// When we want to call out to the running shards for a tenant, e.g. 
during timeline CRUD operations, + /// we need to know where the shard is attached, _and_ the generation, so that we can re-check the generation + /// afterwards to confirm that our timeline CRUD operation is truly persistent (it must have happened in the + /// latest generation) + /// + /// If the tenant doesn't exist, an empty vector is returned. + /// + /// Output is sorted by shard number + pub(crate) async fn peek_generations( + &self, + filter_tenant_id: TenantId, + ) -> Result, Option)>, DatabaseError> { + use crate::schema::tenant_shards::dsl::*; + let rows = self + .with_measured_conn(DatabaseOperation::PeekGenerations, move |conn| { + let result = tenant_shards + .filter(tenant_id.eq(filter_tenant_id.to_string())) + .select(TenantShardPersistence::as_select()) + .order(shard_number) + .load(conn)?; + Ok(result) + }) + .await?; + + Ok(rows + .into_iter() + .map(|p| { + ( + p.get_tenant_shard_id() + .expect("Corrupt tenant shard id in database"), + p.generation.map(|g| Generation::new(g as u32)), + p.generation_pageserver.map(|n| NodeId(n as u64)), + ) + }) + .collect()) + } + #[allow(non_local_definitions)] /// For use when updating a persistent property of a tenant, such as its config or placement_policy. /// diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 4b0c556824..7daa1e4f5f 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -2854,82 +2854,67 @@ impl Service { .await; failpoint_support::sleep_millis_async!("tenant-create-timeline-shared-lock"); - self.ensure_attached_wait(tenant_id).await?; + self.tenant_remote_mutation(tenant_id, move |mut targets| async move { + if targets.is_empty() { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant not found").into(), + )); + }; + let shard_zero = targets.remove(0); - let mut targets = { - let locked = self.inner.read().unwrap(); - let mut targets = Vec::new(); + async fn create_one( + tenant_shard_id: TenantShardId, + node: Node, + jwt: Option, + create_req: TimelineCreateRequest, + ) -> Result { + tracing::info!( + "Creating timeline on shard {}/{}, attached to node {node}", + tenant_shard_id, + create_req.new_timeline_id, + ); + let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); - for (tenant_shard_id, shard) in - locked.tenants.range(TenantShardId::tenant_range(tenant_id)) - { - let node_id = shard.intent.get_attached().ok_or_else(|| { - ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled")) - })?; - let node = locked - .nodes - .get(&node_id) - .expect("Pageservers may not be deleted while referenced"); - - targets.push((*tenant_shard_id, node.clone())); + client + .timeline_create(tenant_shard_id, &create_req) + .await + .map_err(|e| passthrough_api_error(&node, e)) } - targets - }; - if targets.is_empty() { - return Err(ApiError::NotFound( - anyhow::anyhow!("Tenant not found").into(), - )); - }; - let shard_zero = targets.remove(0); - - async fn create_one( - tenant_shard_id: TenantShardId, - node: Node, - jwt: Option, - create_req: TimelineCreateRequest, - ) -> Result { - tracing::info!( - "Creating timeline on shard {}/{}, attached to node {node}", - tenant_shard_id, - create_req.new_timeline_id, - ); - let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); - - client - .timeline_create(tenant_shard_id, &create_req) - .await - .map_err(|e| passthrough_api_error(&node, e)) - } - - // Because the caller might not provide an explicit LSN, we must do the creation first 
on a single shard, and then - // use whatever LSN that shard picked when creating on subsequent shards. We arbitrarily use shard zero as the shard - // that will get the first creation request, and propagate the LSN to all the >0 shards. - let timeline_info = create_one( - shard_zero.0, - shard_zero.1, - self.config.jwt_token.clone(), - create_req.clone(), - ) - .await?; - - // Propagate the LSN that shard zero picked, if caller didn't provide one - if create_req.ancestor_timeline_id.is_some() && create_req.ancestor_start_lsn.is_none() { - create_req.ancestor_start_lsn = timeline_info.ancestor_lsn; - } - - // Create timeline on remaining shards with number >0 - if !targets.is_empty() { - // If we had multiple shards, issue requests for the remainder now. - let jwt = &self.config.jwt_token; - self.tenant_for_shards(targets, |tenant_shard_id: TenantShardId, node: Node| { - let create_req = create_req.clone(); - Box::pin(create_one(tenant_shard_id, node, jwt.clone(), create_req)) - }) + // Because the caller might not provide an explicit LSN, we must do the creation first on a single shard, and then + // use whatever LSN that shard picked when creating on subsequent shards. We arbitrarily use shard zero as the shard + // that will get the first creation request, and propagate the LSN to all the >0 shards. + let timeline_info = create_one( + shard_zero.0, + shard_zero.1, + self.config.jwt_token.clone(), + create_req.clone(), + ) .await?; - } - Ok(timeline_info) + // Propagate the LSN that shard zero picked, if caller didn't provide one + if create_req.ancestor_timeline_id.is_some() && create_req.ancestor_start_lsn.is_none() + { + create_req.ancestor_start_lsn = timeline_info.ancestor_lsn; + } + + // Create timeline on remaining shards with number >0 + if !targets.is_empty() { + // If we had multiple shards, issue requests for the remainder now. + let jwt = &self.config.jwt_token; + self.tenant_for_shards( + targets.iter().map(|t| (t.0, t.1.clone())).collect(), + |tenant_shard_id: TenantShardId, node: Node| { + let create_req = create_req.clone(); + Box::pin(create_one(tenant_shard_id, node, jwt.clone(), create_req)) + }, + ) + .await?; + } + + Ok(timeline_info) + }) + .await? 
} pub(crate) async fn tenant_timeline_detach_ancestor( @@ -2946,107 +2931,87 @@ impl Service { ) .await; - self.ensure_attached_wait(tenant_id).await?; - - let targets = { - let locked = self.inner.read().unwrap(); - let mut targets = Vec::new(); - - for (tenant_shard_id, shard) in - locked.tenants.range(TenantShardId::tenant_range(tenant_id)) - { - let node_id = shard.intent.get_attached().ok_or_else(|| { - ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled")) - })?; - let node = locked - .nodes - .get(&node_id) - .expect("Pageservers may not be deleted while referenced"); - - targets.push((*tenant_shard_id, node.clone())); + self.tenant_remote_mutation(tenant_id, move |targets| async move { + if targets.is_empty() { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant not found").into(), + )); } - targets - }; - if targets.is_empty() { - return Err(ApiError::NotFound( - anyhow::anyhow!("Tenant not found").into(), - )); - } + async fn detach_one( + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + node: Node, + jwt: Option, + ) -> Result<(ShardNumber, models::detach_ancestor::AncestorDetached), ApiError> { + tracing::info!( + "Detaching timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}", + ); - async fn detach_one( - tenant_shard_id: TenantShardId, - timeline_id: TimelineId, - node: Node, - jwt: Option, - ) -> Result<(ShardNumber, models::detach_ancestor::AncestorDetached), ApiError> { - tracing::info!( - "Detaching timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}", - ); + let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); - let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); + client + .timeline_detach_ancestor(tenant_shard_id, timeline_id) + .await + .map_err(|e| { + use mgmt_api::Error; - client - .timeline_detach_ancestor(tenant_shard_id, timeline_id) - .await - .map_err(|e| { - use mgmt_api::Error; - - match e { - // no ancestor (ever) - Error::ApiError(StatusCode::CONFLICT, msg) => ApiError::Conflict(format!( - "{node}: {}", - msg.strip_prefix("Conflict: ").unwrap_or(&msg) - )), - // too many ancestors - Error::ApiError(StatusCode::BAD_REQUEST, msg) => { - ApiError::BadRequest(anyhow::anyhow!("{node}: {msg}")) + match e { + // no ancestor (ever) + Error::ApiError(StatusCode::CONFLICT, msg) => ApiError::Conflict(format!( + "{node}: {}", + msg.strip_prefix("Conflict: ").unwrap_or(&msg) + )), + // too many ancestors + Error::ApiError(StatusCode::BAD_REQUEST, msg) => { + ApiError::BadRequest(anyhow::anyhow!("{node}: {msg}")) + } + Error::ApiError(StatusCode::INTERNAL_SERVER_ERROR, msg) => { + // avoid turning these into conflicts to remain compatible with + // pageservers, 500 errors are sadly retryable with timeline ancestor + // detach + ApiError::InternalServerError(anyhow::anyhow!("{node}: {msg}")) + } + // rest can be mapped as usual + other => passthrough_api_error(&node, other), } - Error::ApiError(StatusCode::INTERNAL_SERVER_ERROR, msg) => { - // avoid turning these into conflicts to remain compatible with - // pageservers, 500 errors are sadly retryable with timeline ancestor - // detach - ApiError::InternalServerError(anyhow::anyhow!("{node}: {msg}")) - } - // rest can be mapped as usual - other => passthrough_api_error(&node, other), - } + }) + .map(|res| (tenant_shard_id.shard_number, res)) + } + + // no shard needs to go first/last; the operation should be idempotent + let mut results = self + .tenant_for_shards(targets, 
|tenant_shard_id, node| { + futures::FutureExt::boxed(detach_one( + tenant_shard_id, + timeline_id, + node, + self.config.jwt_token.clone(), + )) }) - .map(|res| (tenant_shard_id.shard_number, res)) - } + .await?; - // no shard needs to go first/last; the operation should be idempotent - let mut results = self - .tenant_for_shards(targets, |tenant_shard_id, node| { - futures::FutureExt::boxed(detach_one( - tenant_shard_id, - timeline_id, - node, - self.config.jwt_token.clone(), - )) - }) - .await?; + let any = results.pop().expect("we must have at least one response"); - let any = results.pop().expect("we must have at least one response"); + let mismatching = results + .iter() + .filter(|(_, res)| res != &any.1) + .collect::>(); + if !mismatching.is_empty() { + // this can be hit by races which should not happen because operation lock on cplane + let matching = results.len() - mismatching.len(); + tracing::error!( + matching, + compared_against=?any, + ?mismatching, + "shards returned different results" + ); - let mismatching = results - .iter() - .filter(|(_, res)| res != &any.1) - .collect::>(); - if !mismatching.is_empty() { - // this can be hit by races which should not happen because operation lock on cplane - let matching = results.len() - mismatching.len(); - tracing::error!( - matching, - compared_against=?any, - ?mismatching, - "shards returned different results" - ); + return Err(ApiError::InternalServerError(anyhow::anyhow!("pageservers returned mixed results for ancestor detach; manual intervention is required."))); + } - return Err(ApiError::InternalServerError(anyhow::anyhow!("pageservers returned mixed results for ancestor detach; manual intervention is required."))); - } - - Ok(any.1) + Ok(any.1) + }).await? } /// Helper for concurrently calling a pageserver API on a number of shards, such as timeline creation. @@ -3117,6 +3082,84 @@ impl Service { results } + /// Helper for safely working with the shards in a tenant remotely on pageservers, for example + /// when creating and deleting timelines: + /// - Makes sure shards are attached somewhere if they weren't already + /// - Looks up the shards and the nodes where they were most recently attached + /// - Guarantees that after the inner function returns, the shards' generations haven't moved on: this + /// ensures that the remote operation acted on the most recent generation, and is therefore durable. + async fn tenant_remote_mutation( + &self, + tenant_id: TenantId, + op: O, + ) -> Result + where + O: FnOnce(Vec<(TenantShardId, Node)>) -> F, + F: std::future::Future, + { + let target_gens = { + let mut targets = Vec::new(); + + // Load the currently attached pageservers for the latest generation of each shard. This can + // run concurrently with reconciliations, and it is not guaranteed that the node we find here + // will still be the latest when we're done: we will check generations again at the end of + // this function to handle that. + let generations = self.persistence.peek_generations(tenant_id).await?; + let generations = if generations.iter().any(|i| i.1.is_none()) { + // One or more shards is not attached to anything: maybe this is a new tenant? Wait for + // it to reconcile. + self.ensure_attached_wait(tenant_id).await?; + self.persistence.peek_generations(tenant_id).await? 
+ } else { + generations + }; + + let locked = self.inner.read().unwrap(); + for (tenant_shard_id, generation, generation_pageserver) in generations { + let node_id = generation_pageserver.ok_or(ApiError::Conflict( + "Tenant not currently attached".to_string(), + ))?; + let node = locked + .nodes + .get(&node_id) + .ok_or(ApiError::Conflict(format!( + "Raced with removal of node {node_id}" + )))?; + targets.push((tenant_shard_id, node.clone(), generation)); + } + + targets + }; + + let targets = target_gens.iter().map(|t| (t.0, t.1.clone())).collect(); + let result = op(targets).await; + + // Post-check: are all the generations of all the shards the same as they were initially? This proves that + // our remote operation executed on the latest generation and is therefore persistent. + { + let latest_generations = self.persistence.peek_generations(tenant_id).await?; + if latest_generations + .into_iter() + .map(|g| (g.0, g.1)) + .collect::>() + != target_gens + .into_iter() + .map(|i| (i.0, i.2)) + .collect::>() + { + // We raced with something that incremented the generation, and therefore cannot be + // confident that our actions are persistent (they might have hit an old generation). + // + // This is safe but requires a retry: ask the client to do that by giving them a 503 response. + return Err(ApiError::ResourceUnavailable( + "Tenant attachment changed, please retry".into(), + )); + } + } + + Ok(result) + } + pub(crate) async fn tenant_timeline_delete( &self, tenant_id: TenantId, @@ -3130,83 +3173,62 @@ impl Service { ) .await; - self.ensure_attached_wait(tenant_id).await?; - - let mut targets = { - let locked = self.inner.read().unwrap(); - let mut targets = Vec::new(); - - for (tenant_shard_id, shard) in - locked.tenants.range(TenantShardId::tenant_range(tenant_id)) - { - let node_id = shard.intent.get_attached().ok_or_else(|| { - ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled")) - })?; - let node = locked - .nodes - .get(&node_id) - .expect("Pageservers may not be deleted while referenced"); - - targets.push((*tenant_shard_id, node.clone())); + self.tenant_remote_mutation(tenant_id, move |mut targets| async move { + if targets.is_empty() { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant not found").into(), + )); } - targets - }; + let shard_zero = targets.remove(0); - if targets.is_empty() { - return Err(ApiError::NotFound( - anyhow::anyhow!("Tenant not found").into(), - )); - } - let shard_zero = targets.remove(0); + async fn delete_one( + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + node: Node, + jwt: Option, + ) -> Result { + tracing::info!( + "Deleting timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}", + ); - async fn delete_one( - tenant_shard_id: TenantShardId, - timeline_id: TimelineId, - node: Node, - jwt: Option, - ) -> Result { - tracing::info!( - "Deleting timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}", - ); + let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); + client + .timeline_delete(tenant_shard_id, timeline_id) + .await + .map_err(|e| { + ApiError::InternalServerError(anyhow::anyhow!( + "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}", + )) + }) + } - let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); - client - .timeline_delete(tenant_shard_id, timeline_id) - .await - .map_err(|e| { - ApiError::InternalServerError(anyhow::anyhow!( - "Error deleting timeline 
{timeline_id} on {tenant_shard_id} on node {node}: {e}", + let statuses = self + .tenant_for_shards(targets, |tenant_shard_id: TenantShardId, node: Node| { + Box::pin(delete_one( + tenant_shard_id, + timeline_id, + node, + self.config.jwt_token.clone(), )) }) - } + .await?; - let statuses = self - .tenant_for_shards(targets, |tenant_shard_id: TenantShardId, node: Node| { - Box::pin(delete_one( - tenant_shard_id, - timeline_id, - node, - self.config.jwt_token.clone(), - )) - }) + // If any shards >0 haven't finished deletion yet, don't start deletion on shard zero + if statuses.iter().any(|s| s != &StatusCode::NOT_FOUND) { + return Ok(StatusCode::ACCEPTED); + } + + // Delete shard zero last: this is not strictly necessary, but since a caller's GET on a timeline will be routed + // to shard zero, it gives a more obvious behavior that a GET returns 404 once the deletion is done. + let shard_zero_status = delete_one( + shard_zero.0, + timeline_id, + shard_zero.1, + self.config.jwt_token.clone(), + ) .await?; - - // If any shards >0 haven't finished deletion yet, don't start deletion on shard zero - if statuses.iter().any(|s| s != &StatusCode::NOT_FOUND) { - return Ok(StatusCode::ACCEPTED); - } - - // Delete shard zero last: this is not strictly necessary, but since a caller's GET on a timeline will be routed - // to shard zero, it gives a more obvious behavior that a GET returns 404 once the deletion is done. - let shard_zero_status = delete_one( - shard_zero.0, - timeline_id, - shard_zero.1, - self.config.jwt_token.clone(), - ) - .await?; - - Ok(shard_zero_status) + Ok(shard_zero_status) + }).await? } /// When you need to send an HTTP request to the pageserver that holds shard0 of a tenant, this diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 2bb698f175..92febfec9b 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2284,7 +2284,7 @@ class NeonStorageController(MetricsGetter, LogUtils): self.allowed_errors, ) - def pageserver_api(self) -> PageserverHttpClient: + def pageserver_api(self, *args, **kwargs) -> PageserverHttpClient: """ The storage controller implements a subset of the pageserver REST API, for mapping per-tenant actions into per-shard actions (e.g. timeline creation). 
Tests should invoke those @@ -2293,7 +2293,7 @@ class NeonStorageController(MetricsGetter, LogUtils): auth_token = None if self.auth_enabled: auth_token = self.env.auth_keys.generate_token(scope=TokenScope.PAGE_SERVER_API) - return PageserverHttpClient(self.port, lambda: True, auth_token) + return PageserverHttpClient(self.port, lambda: True, auth_token, *args, **kwargs) def request(self, method, *args, **kwargs) -> requests.Response: resp = requests.request(method, *args, **kwargs) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index b3464b0c91..03eb7628be 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -21,7 +21,7 @@ from fixtures.neon_fixtures import ( TokenScope, last_flush_lsn_upload, ) -from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient from fixtures.pageserver.utils import ( assert_prefix_empty, assert_prefix_not_empty, @@ -41,6 +41,7 @@ from mypy_boto3_s3.type_defs import ( ObjectTypeDef, ) from pytest_httpserver import HTTPServer +from urllib3 import Retry from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response @@ -2266,3 +2267,66 @@ def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvB # allow for small delay between actually having cancelled and being able reconfigure again wait_until(4, 0.5, reconfigure_node_again) + + +def test_storage_controller_timeline_crud_race(neon_env_builder: NeonEnvBuilder): + """ + The storage controller is meant to handle the case where a timeline CRUD operation races + with a generation-incrementing change to the tenant: this should trigger a retry so that + the operation lands on the highest-generation'd tenant location. + """ + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + env.start() + tenant_id = TenantId.generate() + env.storage_controller.tenant_create(tenant_id) + + # Set up a failpoint so that a timeline creation will be very slow + failpoint = "timeline-creation-after-uninit" + for ps in env.pageservers: + ps.http_client().configure_failpoints((failpoint, "sleep(10000)")) + + # Start a timeline creation in the background + create_timeline_id = TimelineId.generate() + futs = [] + with concurrent.futures.ThreadPoolExecutor( + max_workers=2 + len(env.pageservers) + len(env.safekeepers) + ) as executor: + futs.append( + executor.submit( + env.storage_controller.pageserver_api( + retries=Retry( + status=0, + connect=0, # Disable retries: we want to see the 503 + ) + ).timeline_create, + PgVersion.NOT_SET, + tenant_id, + create_timeline_id, + ) + ) + + def has_hit_failpoint(): + assert any( + ps.log_contains(f"at failpoint {failpoint}") is not None for ps in env.pageservers + ) + + wait_until(10, 1, has_hit_failpoint) + + # Migrate the tenant while the timeline creation is in progress: this migration will complete once it + # can detach from the old pageserver, which will happen once the failpoint completes. 
+ env.storage_controller.tenant_shard_migrate( + TenantShardId(tenant_id, 0, 0), env.pageservers[1].id + ) + + with pytest.raises(PageserverApiException, match="Tenant attachment changed, please retry"): + futs[0].result(timeout=20) + + # Timeline creation should work when there isn't a concurrent migration, even though it's + # slow (our failpoint is still enabled) + env.storage_controller.pageserver_api( + retries=Retry( + status=0, + connect=0, # Disable retries: we want to see the 503 + ) + ).timeline_create(PgVersion.NOT_SET, tenant_id, create_timeline_id) From 701cb61b572eb0ef3cc29697f86aab36aafbba70 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 23 Aug 2024 19:48:06 +0100 Subject: [PATCH 1495/1571] proxy: local auth backend (#8806) Adds a Local authentication backend. Updates http to extract JWT bearer tokens and passes them to the local backend to validate. --- proxy/src/auth/backend.rs | 34 +++++++++--- proxy/src/auth/backend/local.rs | 79 +++++++++++++++++++++++++++ proxy/src/console/messages.rs | 22 ++++++++ proxy/src/console/provider.rs | 1 + proxy/src/proxy/connect_compute.rs | 7 +-- proxy/src/serverless.rs | 2 +- proxy/src/serverless/backend.rs | 71 ++++++++++++++++++------ proxy/src/serverless/conn_pool.rs | 12 +++- proxy/src/serverless/sql_over_http.rs | 58 +++++++++++++++----- 9 files changed, 240 insertions(+), 46 deletions(-) create mode 100644 proxy/src/auth/backend/local.rs diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 7592d076ec..ae72bc6de3 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -2,6 +2,7 @@ mod classic; mod hacks; pub mod jwt; mod link; +pub mod local; use std::net::IpAddr; use std::sync::Arc; @@ -9,6 +10,7 @@ use std::time::Duration; use ipnet::{Ipv4Net, Ipv6Net}; pub use link::LinkAuthError; +use local::LocalBackend; use tokio::io::{AsyncRead, AsyncWrite}; use tokio_postgres::config::AuthKeys; use tracing::{info, warn}; @@ -68,6 +70,8 @@ pub enum BackendType<'a, T, D> { Console(MaybeOwned<'a, ConsoleBackend>, T), /// Authentication via a web browser. 
Link(MaybeOwned<'a, url::ApiUrl>, D), + /// Local proxy uses configured auth credentials and does not wake compute + Local(MaybeOwned<'a, LocalBackend>), } pub trait TestBackend: Send + Sync + 'static { @@ -93,6 +97,7 @@ impl std::fmt::Display for BackendType<'_, (), ()> { ConsoleBackend::Test(_) => fmt.debug_tuple("Test").finish(), }, Self::Link(url, _) => fmt.debug_tuple("Link").field(&url.as_str()).finish(), + Self::Local(_) => fmt.debug_tuple("Local").finish(), } } } @@ -104,6 +109,7 @@ impl BackendType<'_, T, D> { match self { Self::Console(c, x) => BackendType::Console(MaybeOwned::Borrowed(c), x), Self::Link(c, x) => BackendType::Link(MaybeOwned::Borrowed(c), x), + Self::Local(l) => BackendType::Local(MaybeOwned::Borrowed(l)), } } } @@ -116,6 +122,7 @@ impl<'a, T, D> BackendType<'a, T, D> { match self { Self::Console(c, x) => BackendType::Console(c, f(x)), Self::Link(c, x) => BackendType::Link(c, x), + Self::Local(l) => BackendType::Local(l), } } } @@ -126,6 +133,7 @@ impl<'a, T, D, E> BackendType<'a, Result, D> { match self { Self::Console(c, x) => x.map(|x| BackendType::Console(c, x)), Self::Link(c, x) => Ok(BackendType::Link(c, x)), + Self::Local(l) => Ok(BackendType::Local(l)), } } } @@ -157,6 +165,7 @@ impl ComputeUserInfo { pub enum ComputeCredentialKeys { Password(Vec), AuthKeys(AuthKeys), + None, } impl TryFrom for ComputeUserInfo { @@ -289,7 +298,7 @@ async fn auth_quirks( ctx.set_endpoint_id(res.info.endpoint.clone()); let password = match res.keys { ComputeCredentialKeys::Password(p) => p, - ComputeCredentialKeys::AuthKeys(_) => { + ComputeCredentialKeys::AuthKeys(_) | ComputeCredentialKeys::None => { unreachable!("password hack should return a password") } }; @@ -401,6 +410,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { match self { Self::Console(_, user_info) => user_info.endpoint_id.clone(), Self::Link(_, _) => Some("link".into()), + Self::Local(_) => Some("local".into()), } } @@ -409,6 +419,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { match self { Self::Console(_, user_info) => &user_info.user, Self::Link(_, _) => "link", + Self::Local(_) => "local", } } @@ -450,6 +461,9 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { BackendType::Link(url, info) } + Self::Local(_) => { + return Err(auth::AuthError::bad_auth_method("invalid for local proxy")) + } }; info!("user successfully authenticated"); @@ -465,6 +479,7 @@ impl BackendType<'_, ComputeUserInfo, &()> { match self { Self::Console(api, user_info) => api.get_role_secret(ctx, user_info).await, Self::Link(_, _) => Ok(Cached::new_uncached(None)), + Self::Local(_) => Ok(Cached::new_uncached(None)), } } @@ -475,6 +490,7 @@ impl BackendType<'_, ComputeUserInfo, &()> { match self { Self::Console(api, user_info) => api.get_allowed_ips_and_secret(ctx, user_info).await, Self::Link(_, _) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), + Self::Local(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), } } } @@ -488,13 +504,15 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> { match self { Self::Console(api, creds) => api.wake_compute(ctx, &creds.info).await, Self::Link(_, info) => Ok(Cached::new_uncached(info.clone())), + Self::Local(local) => Ok(Cached::new_uncached(local.node_info.clone())), } } - fn get_keys(&self) -> Option<&ComputeCredentialKeys> { + fn get_keys(&self) -> &ComputeCredentialKeys { match self { - Self::Console(_, creds) => Some(&creds.keys), - Self::Link(_, _) => None, + Self::Console(_, creds) => &creds.keys, + 
Self::Link(_, _) => &ComputeCredentialKeys::None, + Self::Local(_) => &ComputeCredentialKeys::None, } } } @@ -508,13 +526,15 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> { match self { Self::Console(api, creds) => api.wake_compute(ctx, &creds.info).await, Self::Link(_, _) => unreachable!("link auth flow doesn't support waking the compute"), + Self::Local(local) => Ok(Cached::new_uncached(local.node_info.clone())), } } - fn get_keys(&self) -> Option<&ComputeCredentialKeys> { + fn get_keys(&self) -> &ComputeCredentialKeys { match self { - Self::Console(_, creds) => Some(&creds.keys), - Self::Link(_, _) => None, + Self::Console(_, creds) => &creds.keys, + Self::Link(_, _) => &ComputeCredentialKeys::None, + Self::Local(_) => &ComputeCredentialKeys::None, } } } diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs new file mode 100644 index 0000000000..6d18564dd6 --- /dev/null +++ b/proxy/src/auth/backend/local.rs @@ -0,0 +1,79 @@ +use std::{collections::HashMap, net::SocketAddr}; + +use anyhow::Context; +use arc_swap::ArcSwapOption; + +use crate::{ + compute::ConnCfg, + console::{ + messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo}, + NodeInfo, + }, + intern::{BranchIdInt, BranchIdTag, EndpointIdTag, InternId, ProjectIdInt, ProjectIdTag}, + RoleName, +}; + +use super::jwt::{AuthRule, FetchAuthRules, JwkCache}; + +pub struct LocalBackend { + pub jwks_cache: JwkCache, + pub postgres_addr: SocketAddr, + pub node_info: NodeInfo, +} + +impl LocalBackend { + pub fn new(postgres_addr: SocketAddr) -> Self { + LocalBackend { + jwks_cache: JwkCache::default(), + postgres_addr, + node_info: NodeInfo { + config: { + let mut cfg = ConnCfg::new(); + cfg.host(&postgres_addr.ip().to_string()); + cfg.port(postgres_addr.port()); + cfg + }, + // TODO(conrad): make this better reflect compute info rather than endpoint info. 
+ aux: MetricsAuxInfo { + endpoint_id: EndpointIdTag::get_interner().get_or_intern("local"), + project_id: ProjectIdTag::get_interner().get_or_intern("local"), + branch_id: BranchIdTag::get_interner().get_or_intern("local"), + cold_start_info: ColdStartInfo::WarmCached, + }, + allow_self_signed_compute: false, + }, + } + } +} + +#[derive(Clone, Copy)] +pub struct StaticAuthRules; + +pub static JWKS_ROLE_MAP: ArcSwapOption = ArcSwapOption::const_empty(); + +#[derive(Debug, Clone)] +pub struct JwksRoleSettings { + pub roles: HashMap, + pub project_id: ProjectIdInt, + pub branch_id: BranchIdInt, +} + +impl FetchAuthRules for StaticAuthRules { + async fn fetch_auth_rules(&self, role_name: RoleName) -> anyhow::Result> { + let mappings = JWKS_ROLE_MAP.load(); + let role_mappings = mappings + .as_deref() + .and_then(|m| m.roles.get(&role_name)) + .context("JWKs settings for this role were not configured")?; + let mut rules = vec![]; + for setting in &role_mappings.jwks { + rules.push(AuthRule { + id: setting.id.clone(), + jwks_url: setting.jwks_url.clone(), + audience: setting.jwt_audience.clone(), + }); + } + + Ok(rules) + } +} diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs index ac66e116d0..a7ccf076b0 100644 --- a/proxy/src/console/messages.rs +++ b/proxy/src/console/messages.rs @@ -1,11 +1,13 @@ use measured::FixedCardinalityLabel; use serde::{Deserialize, Serialize}; +use std::collections::HashMap; use std::fmt::{self, Display}; use crate::auth::IpPattern; use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}; use crate::proxy::retry::CouldRetry; +use crate::RoleName; /// Generic error response with human-readable description. /// Note that we can't always present it to user as is. @@ -341,6 +343,26 @@ impl ColdStartInfo { } } +#[derive(Debug, Deserialize, Clone)] +pub struct JwksRoleMapping { + pub roles: HashMap, +} + +#[derive(Debug, Deserialize, Clone)] +pub struct EndpointJwksResponse { + pub jwks: Vec, +} + +#[derive(Debug, Deserialize, Clone)] +pub struct JwksSettings { + pub id: String, + pub project_id: ProjectIdInt, + pub branch_id: BranchIdInt, + pub jwks_url: url::Url, + pub provider_name: String, + pub jwt_audience: Option, +} + #[cfg(test)] mod tests { use super::*; diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index cc2ee10062..4794527410 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -305,6 +305,7 @@ impl NodeInfo { match keys { ComputeCredentialKeys::Password(password) => self.config.password(password), ComputeCredentialKeys::AuthKeys(auth_keys) => self.config.auth_keys(*auth_keys), + ComputeCredentialKeys::None => &mut self.config, }; } } diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index f38e43ba5a..e1a54a9c98 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -61,7 +61,7 @@ pub trait ComputeConnectBackend { ctx: &RequestMonitoring, ) -> Result; - fn get_keys(&self) -> Option<&ComputeCredentialKeys>; + fn get_keys(&self) -> &ComputeCredentialKeys; } pub struct TcpMechanism<'a> { @@ -112,9 +112,8 @@ where let mut num_retries = 0; let mut node_info = wake_compute(&mut num_retries, ctx, user_info, wake_compute_retry_config).await?; - if let Some(keys) = user_info.get_keys() { - node_info.set_keys(keys); - } + + node_info.set_keys(user_info.get_keys()); node_info.allow_self_signed_compute = allow_self_signed_compute; // let mut node_info = credentials.get_node_info(ctx, user_info).await?; 
mechanism.update_connect_config(&mut node_info.config); diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index b2bf93dc6d..ea65867293 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -407,7 +407,7 @@ async fn request_handler( .header("Access-Control-Allow-Origin", "*") .header( "Access-Control-Allow-Headers", - "Neon-Connection-String, Neon-Raw-Text-Output, Neon-Array-Mode, Neon-Pool-Opt-In, Neon-Batch-Read-Only, Neon-Batch-Isolation-Level", + "Authorization, Neon-Connection-String, Neon-Raw-Text-Output, Neon-Array-Mode, Neon-Pool-Opt-In, Neon-Batch-Read-Only, Neon-Batch-Isolation-Level", ) .header("Access-Control-Max-Age", "86400" /* 24 hours */) .status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 295ea1a1c7..b44ecb76e3 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -4,7 +4,10 @@ use async_trait::async_trait; use tracing::{field::display, info}; use crate::{ - auth::{backend::ComputeCredentials, check_peer_addr_is_in_list, AuthError}, + auth::{ + backend::{local::StaticAuthRules, ComputeCredentials, ComputeUserInfo}, + check_peer_addr_is_in_list, AuthError, + }, compute, config::{AuthenticationConfig, ProxyConfig}, console::{ @@ -24,7 +27,7 @@ use crate::{ Host, }; -use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool}; +use super::conn_pool::{poll_client, AuthData, Client, ConnInfo, GlobalConnPool}; pub struct PoolingBackend { pub pool: Arc>, @@ -33,13 +36,14 @@ pub struct PoolingBackend { } impl PoolingBackend { - pub async fn authenticate( + pub async fn authenticate_with_password( &self, ctx: &RequestMonitoring, config: &AuthenticationConfig, - conn_info: &ConnInfo, + user_info: &ComputeUserInfo, + password: &[u8], ) -> Result { - let user_info = conn_info.user_info.clone(); + let user_info = user_info.clone(); let backend = self.config.auth_backend.as_ref().map(|_| user_info.clone()); let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?; if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) { @@ -47,7 +51,7 @@ impl PoolingBackend { } if !self .endpoint_rate_limiter - .check(conn_info.user_info.endpoint.clone().into(), 1) + .check(user_info.endpoint.clone().into(), 1) { return Err(AuthError::too_many_connections()); } @@ -70,14 +74,10 @@ impl PoolingBackend { return Err(AuthError::auth_failed(&*user_info.user)); } }; - let ep = EndpointIdInt::from(&conn_info.user_info.endpoint); - let auth_outcome = crate::auth::validate_password_and_exchange( - &config.thread_pool, - ep, - &conn_info.password, - secret, - ) - .await?; + let ep = EndpointIdInt::from(&user_info.endpoint); + let auth_outcome = + crate::auth::validate_password_and_exchange(&config.thread_pool, ep, password, secret) + .await?; let res = match auth_outcome { crate::sasl::Outcome::Success(key) => { info!("user successfully authenticated"); @@ -85,7 +85,7 @@ impl PoolingBackend { } crate::sasl::Outcome::Failure(reason) => { info!("auth backend failed with an error: {reason}"); - Err(AuthError::auth_failed(&*conn_info.user_info.user)) + Err(AuthError::auth_failed(&*user_info.user)) } }; res.map(|key| ComputeCredentials { @@ -94,6 +94,39 @@ impl PoolingBackend { }) } + pub async fn authenticate_with_jwt( + &self, + ctx: &RequestMonitoring, + user_info: &ComputeUserInfo, + jwt: &str, + ) -> Result { + match &self.config.auth_backend { + 
crate::auth::BackendType::Console(_, _) => { + Err(AuthError::auth_failed("JWT login is not yet supported")) + } + crate::auth::BackendType::Link(_, _) => Err(AuthError::auth_failed( + "JWT login over link proxy is not supported", + )), + crate::auth::BackendType::Local(cache) => { + cache + .jwks_cache + .check_jwt( + ctx, + user_info.endpoint.clone(), + user_info.user.clone(), + &StaticAuthRules, + jwt, + ) + .await + .map_err(|e| AuthError::auth_failed(e.to_string()))?; + Ok(ComputeCredentials { + info: user_info.clone(), + keys: crate::auth::backend::ComputeCredentialKeys::None, + }) + } + } + } + // Wake up the destination if needed. Code here is a bit involved because // we reuse the code from the usual proxy and we need to prepare few structures // that this code expects. @@ -232,10 +265,16 @@ impl ConnectMechanism for TokioMechanism { let mut config = (*node_info.config).clone(); let config = config .user(&self.conn_info.user_info.user) - .password(&*self.conn_info.password) .dbname(&self.conn_info.dbname) .connect_timeout(timeout); + match &self.conn_info.auth { + AuthData::Jwt(_) => {} + AuthData::Password(pw) => { + config.password(pw); + } + } + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let res = config.connect(tokio_postgres::NoTls).await; drop(pause); diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 3478787995..6ed694af58 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -33,7 +33,13 @@ use super::backend::HttpConnError; pub struct ConnInfo { pub user_info: ComputeUserInfo, pub dbname: DbName, - pub password: SmallVec<[u8; 16]>, + pub auth: AuthData, +} + +#[derive(Debug, Clone)] +pub enum AuthData { + Password(SmallVec<[u8; 16]>), + Jwt(String), } impl ConnInfo { @@ -778,7 +784,7 @@ mod tests { options: Default::default(), }, dbname: "dbname".into(), - password: "password".as_bytes().into(), + auth: AuthData::Password("password".as_bytes().into()), }; let ep_pool = Arc::downgrade( &pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key().unwrap()), @@ -836,7 +842,7 @@ mod tests { options: Default::default(), }, dbname: "dbname".into(), - password: "password".as_bytes().into(), + auth: AuthData::Password("password".as_bytes().into()), }; let ep_pool = Arc::downgrade( &pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key().unwrap()), diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index bbfed90f39..79baef45f6 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -7,6 +7,7 @@ use futures::future::try_join; use futures::future::Either; use futures::StreamExt; use futures::TryFutureExt; +use http::header::AUTHORIZATION; use http_body_util::BodyExt; use http_body_util::Full; use hyper1::body::Body; @@ -56,6 +57,7 @@ use crate::DbName; use crate::RoleName; use super::backend::PoolingBackend; +use super::conn_pool::AuthData; use super::conn_pool::Client; use super::conn_pool::ConnInfo; use super::http_util::json_response; @@ -88,6 +90,7 @@ enum Payload { const MAX_RESPONSE_SIZE: usize = 10 * 1024 * 1024; // 10 MiB const MAX_REQUEST_SIZE: u64 = 10 * 1024 * 1024; // 10 MiB +static CONN_STRING: HeaderName = HeaderName::from_static("neon-connection-string"); static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output"); static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode"); static ALLOW_POOL: HeaderName = 
HeaderName::from_static("neon-pool-opt-in"); @@ -109,7 +112,7 @@ where #[derive(Debug, thiserror::Error)] pub enum ConnInfoError { #[error("invalid header: {0}")] - InvalidHeader(&'static str), + InvalidHeader(&'static HeaderName), #[error("invalid connection string: {0}")] UrlParseError(#[from] url::ParseError), #[error("incorrect scheme")] @@ -153,10 +156,10 @@ fn get_conn_info( ctx.set_auth_method(crate::context::AuthMethod::Cleartext); let connection_string = headers - .get("Neon-Connection-String") - .ok_or(ConnInfoError::InvalidHeader("Neon-Connection-String"))? + .get(&CONN_STRING) + .ok_or(ConnInfoError::InvalidHeader(&CONN_STRING))? .to_str() - .map_err(|_| ConnInfoError::InvalidHeader("Neon-Connection-String"))?; + .map_err(|_| ConnInfoError::InvalidHeader(&CONN_STRING))?; let connection_url = Url::parse(connection_string)?; @@ -179,10 +182,23 @@ fn get_conn_info( } ctx.set_user(username.clone()); - let password = connection_url - .password() - .ok_or(ConnInfoError::MissingPassword)?; - let password = urlencoding::decode_binary(password.as_bytes()); + let auth = if let Some(auth) = headers.get(&AUTHORIZATION) { + let auth = auth + .to_str() + .map_err(|_| ConnInfoError::InvalidHeader(&AUTHORIZATION))?; + AuthData::Jwt( + auth.strip_prefix("Bearer ") + .ok_or(ConnInfoError::MissingPassword)? + .into(), + ) + } else if let Some(pass) = connection_url.password() { + AuthData::Password(match urlencoding::decode_binary(pass.as_bytes()) { + std::borrow::Cow::Borrowed(b) => b.into(), + std::borrow::Cow::Owned(b) => b.into(), + }) + } else { + return Err(ConnInfoError::MissingPassword); + }; let endpoint = match connection_url.host() { Some(url::Host::Domain(hostname)) => { @@ -225,10 +241,7 @@ fn get_conn_info( Ok(ConnInfo { user_info, dbname, - password: match password { - std::borrow::Cow::Borrowed(b) => b.into(), - std::borrow::Cow::Owned(b) => b.into(), - }, + auth, }) } @@ -550,9 +563,24 @@ async fn handle_inner( let authenticate_and_connect = Box::pin( async { - let keys = backend - .authenticate(ctx, &config.authentication_config, &conn_info) - .await?; + let keys = match &conn_info.auth { + AuthData::Password(pw) => { + backend + .authenticate_with_password( + ctx, + &config.authentication_config, + &conn_info.user_info, + pw, + ) + .await? + } + AuthData::Jwt(jwt) => { + backend + .authenticate_with_jwt(ctx, &conn_info.user_info, jwt) + .await? + } + }; + let client = backend .connect_to_compute(ctx, conn_info, keys, !allow_pool) .await?; From 06795c6b9a6b4664dadd4c75ccf9f75087b05614 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 23 Aug 2024 22:32:10 +0100 Subject: [PATCH 1496/1571] proxy: new local-proxy application (#8736) Add binary for local-proxy that uses the local auth backend. 
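Combined with the bearer-token support from the previous commit, a client can reach the local proxy's SQL-over-HTTP endpoint with a JWT instead of a password. A rough sketch of such a request; the listen address, role name, and the reqwest/tokio/serde_json dependencies are assumptions for illustration, not part of this patch:

```rust
use serde_json::json;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // A token signed by one of the JWKS configured for this role in localproxy.json.
    let jwt = std::env::var("NEON_JWT")?;

    // reqwest needs its "json" feature for .json(); tokio needs "rt" + "macros".
    let resp = reqwest::Client::new()
        // Assumed --http listen address of local-proxy; adjust to your setup.
        .post("http://127.0.0.1:7432/sql")
        .header("Authorization", format!("Bearer {jwt}"))
        // No password in the connection string: auth comes from the bearer token.
        .header(
            "Neon-Connection-String",
            "postgres://authenticated_role@localhost/postgres",
        )
        .json(&json!({ "query": "select 1 as one", "params": [] }))
        .send()
        .await?;

    let status = resp.status();
    let body = resp.text().await?;
    println!("{status} {body}");
    Ok(())
}
```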
Runs only the http serverless driver support and offers config reload based on a config file and SIGHUP --- proxy/src/bin/local_proxy.rs | 316 +++++++++++++++++++++++++++++++++ proxy/src/bin/pg_sni_router.rs | 4 +- proxy/src/bin/proxy.rs | 7 +- proxy/src/lib.rs | 14 +- 4 files changed, 335 insertions(+), 6 deletions(-) create mode 100644 proxy/src/bin/local_proxy.rs diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs new file mode 100644 index 0000000000..8acba33bac --- /dev/null +++ b/proxy/src/bin/local_proxy.rs @@ -0,0 +1,316 @@ +use std::{ + net::SocketAddr, + path::{Path, PathBuf}, + pin::pin, + sync::Arc, + time::Duration, +}; + +use anyhow::{bail, ensure}; +use dashmap::DashMap; +use futures::{future::Either, FutureExt}; +use proxy::{ + auth::backend::local::{JwksRoleSettings, LocalBackend, JWKS_ROLE_MAP}, + cancellation::CancellationHandlerMain, + config::{self, AuthenticationConfig, HttpConfig, ProxyConfig, RetryConfig}, + console::{locks::ApiLocks, messages::JwksRoleMapping}, + http::health_server::AppMetrics, + metrics::{Metrics, ThreadPoolMetrics}, + rate_limiter::{BucketRateLimiter, EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo}, + scram::threadpool::ThreadPool, + serverless::{self, cancel_set::CancelSet, GlobalConnPoolOptions}, +}; + +project_git_version!(GIT_VERSION); +project_build_tag!(BUILD_TAG); + +use clap::Parser; +use tokio::{net::TcpListener, task::JoinSet}; +use tokio_util::sync::CancellationToken; +use tracing::{error, info, warn}; +use utils::{project_build_tag, project_git_version, sentry_init::init_sentry}; + +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + +/// Neon proxy/router +#[derive(Parser)] +#[command(version = GIT_VERSION, about)] +struct LocalProxyCliArgs { + /// listen for incoming metrics connections on ip:port + #[clap(long, default_value = "127.0.0.1:7001")] + metrics: String, + /// listen for incoming http connections on ip:port + #[clap(long)] + http: String, + /// timeout for the TLS handshake + #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] + handshake_timeout: tokio::time::Duration, + /// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). + #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)] + connect_compute_lock: String, + #[clap(flatten)] + sql_over_http: SqlOverHttpArgs, + /// User rate limiter max number of requests per second. + /// + /// Provided in the form `@`. + /// Can be given multiple times for different bucket sizes. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)] + user_rps_limit: Vec, + /// Whether the auth rate limiter actually takes effect (for testing) + #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + auth_rate_limit_enabled: bool, + /// Authentication rate limiter max number of hashes per second. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)] + auth_rate_limit: Vec, + /// The IP subnet to use when considering whether two IP addresses are considered the same. 
+ #[clap(long, default_value_t = 64)] + auth_rate_limit_ip_subnet: u8, + /// Whether to retry the connection to the compute node + #[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)] + connect_to_compute_retry: String, + /// Address of the postgres server + #[clap(long, default_value = "127.0.0.1:5432")] + compute: SocketAddr, + /// File address of the local proxy config file + #[clap(long, default_value = "./localproxy.json")] + config_path: PathBuf, +} + +#[derive(clap::Args, Clone, Copy, Debug)] +struct SqlOverHttpArgs { + /// How many connections to pool for each endpoint. Excess connections are discarded + #[clap(long, default_value_t = 200)] + sql_over_http_pool_max_total_conns: usize, + + /// How long pooled connections should remain idle for before closing + #[clap(long, default_value = "5m", value_parser = humantime::parse_duration)] + sql_over_http_idle_timeout: tokio::time::Duration, + + #[clap(long, default_value_t = 100)] + sql_over_http_client_conn_threshold: u64, + + #[clap(long, default_value_t = 16)] + sql_over_http_cancel_set_shards: usize, +} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + let _logging_guard = proxy::logging::init().await?; + let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); + let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); + + Metrics::install(Arc::new(ThreadPoolMetrics::new(0))); + + info!("Version: {GIT_VERSION}"); + info!("Build_tag: {BUILD_TAG}"); + let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo { + revision: GIT_VERSION, + build_tag: BUILD_TAG, + }); + + let jemalloc = match proxy::jemalloc::MetricRecorder::new() { + Ok(t) => Some(t), + Err(e) => { + tracing::error!(error = ?e, "could not start jemalloc metrics loop"); + None + } + }; + + let args = LocalProxyCliArgs::parse(); + let config = build_config(&args)?; + + let metrics_listener = TcpListener::bind(args.metrics).await?.into_std()?; + let http_listener = TcpListener::bind(args.http).await?; + let shutdown = CancellationToken::new(); + + // todo: should scale with CU + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( + LeakyBucketConfig { + rps: 10.0, + max: 100.0, + }, + 16, + )); + + refresh_config(args.config_path.clone()).await; + + let mut maintenance_tasks = JoinSet::new(); + maintenance_tasks.spawn(proxy::handle_signals(shutdown.clone(), move || { + refresh_config(args.config_path.clone()).map(Ok) + })); + maintenance_tasks.spawn(proxy::http::health_server::task_main( + metrics_listener, + AppMetrics { + jemalloc, + neon_metrics, + proxy: proxy::metrics::Metrics::get(), + }, + )); + + let task = serverless::task_main( + config, + http_listener, + shutdown.clone(), + Arc::new(CancellationHandlerMain::new( + Arc::new(DashMap::new()), + None, + proxy::metrics::CancellationSource::Local, + )), + endpoint_rate_limiter, + ); + + match futures::future::select(pin!(maintenance_tasks.join_next()), pin!(task)).await { + // exit immediately on maintenance task completion + Either::Left((Some(res), _)) => match proxy::flatten_err(res)? {}, + // exit with error immediately if all maintenance tasks have ceased (should be caught by branch above) + Either::Left((None, _)) => bail!("no maintenance tasks running. invalid state"), + // exit immediately on client task error + Either::Right((res, _)) => res?, + } + + Ok(()) +} + +/// ProxyConfig is created at proxy startup, and lives forever. 
+fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { + let config::ConcurrencyLockOptions { + shards, + limiter, + epoch, + timeout, + } = args.connect_compute_lock.parse()?; + info!( + ?limiter, + shards, + ?epoch, + "Using NodeLocks (connect_compute)" + ); + let connect_compute_locks = ApiLocks::new( + "connect_compute_lock", + limiter, + shards, + timeout, + epoch, + &Metrics::get().proxy.connect_compute_lock, + )?; + + let http_config = HttpConfig { + accept_websockets: false, + pool_options: GlobalConnPoolOptions { + gc_epoch: Duration::from_secs(60), + pool_shards: 2, + idle_timeout: args.sql_over_http.sql_over_http_idle_timeout, + opt_in: false, + + max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_total_conns, + max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns, + }, + cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards), + client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold, + }; + + Ok(Box::leak(Box::new(ProxyConfig { + tls_config: None, + auth_backend: proxy::auth::BackendType::Local(proxy::auth::backend::MaybeOwned::Owned( + LocalBackend::new(args.compute), + )), + metric_collection: None, + allow_self_signed_compute: false, + http_config, + authentication_config: AuthenticationConfig { + thread_pool: ThreadPool::new(0), + scram_protocol_timeout: Duration::from_secs(10), + rate_limiter_enabled: false, + rate_limiter: BucketRateLimiter::new(vec![]), + rate_limit_ip_subnet: 64, + }, + require_client_ip: false, + handshake_timeout: Duration::from_secs(10), + region: "local".into(), + wake_compute_retry_config: RetryConfig::parse(RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)?, + connect_compute_locks, + connect_to_compute_retry_config: RetryConfig::parse( + RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES, + )?, + }))) +} + +async fn refresh_config(path: PathBuf) { + match refresh_config_inner(&path).await { + Ok(()) => {} + Err(e) => { + error!(error=?e, ?path, "could not read config file"); + } + } +} + +async fn refresh_config_inner(path: &Path) -> anyhow::Result<()> { + let bytes = tokio::fs::read(&path).await?; + let mut data: JwksRoleMapping = serde_json::from_slice(&bytes)?; + + let mut settings = None; + + for mapping in data.roles.values_mut() { + for jwks in &mut mapping.jwks { + ensure!( + jwks.jwks_url.has_authority() + && (jwks.jwks_url.scheme() == "http" || jwks.jwks_url.scheme() == "https"), + "Invalid JWKS url. Must be HTTP", + ); + + ensure!( + jwks.jwks_url + .host() + .is_some_and(|h| h != url::Host::Domain("")), + "Invalid JWKS url. No domain listed", + ); + + // clear username, password and ports + jwks.jwks_url.set_username("").expect( + "url can be a base and has a valid host and is not a file. should not error", + ); + jwks.jwks_url.set_password(None).expect( + "url can be a base and has a valid host and is not a file. should not error", + ); + // local testing is hard if we need to have a specific restricted port + if cfg!(not(feature = "testing")) { + jwks.jwks_url.set_port(None).expect( + "url can be a base and has a valid host and is not a file. should not error", + ); + } + + // clear query params + jwks.jwks_url.set_fragment(None); + jwks.jwks_url.query_pairs_mut().clear().finish(); + + if jwks.jwks_url.scheme() != "https" { + // local testing is hard if we need to set up https support. 
+ if cfg!(not(feature = "testing")) { + jwks.jwks_url + .set_scheme("https") + .expect("should not error to set the scheme to https if it was http"); + } else { + warn!(scheme = jwks.jwks_url.scheme(), "JWKS url is not HTTPS"); + } + } + + let (pr, br) = settings.get_or_insert((jwks.project_id, jwks.branch_id)); + ensure!( + *pr == jwks.project_id, + "inconsistent project IDs configured" + ); + ensure!(*br == jwks.branch_id, "inconsistent branch IDs configured"); + } + } + + if let Some((project_id, branch_id)) = settings { + JWKS_ROLE_MAP.store(Some(Arc::new(JwksRoleSettings { + roles: data.roles, + project_id, + branch_id, + }))); + } + + Ok(()) +} diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 1038fa5116..20d2d3df9a 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -133,7 +133,9 @@ async fn main() -> anyhow::Result<()> { proxy_listener, cancellation_token.clone(), )); - let signals_task = tokio::spawn(proxy::handle_signals(cancellation_token)); + let signals_task = tokio::spawn(proxy::handle_signals(cancellation_token, || async { + Ok(()) + })); // the signal task cant ever succeed. // the main task can error, or can succeed on cancellation. diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index d83a1f3bcf..1f45a33ed5 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -148,7 +148,7 @@ struct ProxyCliArgs { disable_dynamic_rate_limiter: bool, /// Endpoint rate limiter max number of requests per second. /// - /// Provided in the form '@'. + /// Provided in the form `@`. /// Can be given multiple times for different bucket sizes. #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)] endpoint_rps_limit: Vec, @@ -447,7 +447,10 @@ async fn main() -> anyhow::Result<()> { // maintenance tasks. these never return unless there's an error let mut maintenance_tasks = JoinSet::new(); - maintenance_tasks.spawn(proxy::handle_signals(cancellation_token.clone())); + maintenance_tasks.spawn(proxy::handle_signals( + cancellation_token.clone(), + || async { Ok(()) }, + )); maintenance_tasks.spawn(http::health_server::task_main( http_listener, AppMetrics { diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index b7d497ebcc..8e1a4e4fa2 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -88,7 +88,7 @@ // List of temporarily allowed lints to unblock beta/nightly. #![allow(unknown_lints, clippy::manual_inspect)] -use std::convert::Infallible; +use std::{convert::Infallible, future::Future}; use anyhow::{bail, Context}; use intern::{EndpointIdInt, EndpointIdTag, InternId}; @@ -123,7 +123,14 @@ pub mod usage_metrics; pub mod waiters; /// Handle unix signals appropriately. -pub async fn handle_signals(token: CancellationToken) -> anyhow::Result { +pub async fn handle_signals( + token: CancellationToken, + mut refresh_config: F, +) -> anyhow::Result +where + F: FnMut() -> Fut, + Fut: Future>, +{ use tokio::signal::unix::{signal, SignalKind}; let mut hangup = signal(SignalKind::hangup())?; @@ -134,7 +141,8 @@ pub async fn handle_signals(token: CancellationToken) -> anyhow::Result { - warn!("received SIGHUP; config reload is not supported"); + warn!("received SIGHUP"); + refresh_config().await?; } // Shut down the whole application. 
_ = interrupt.recv() => { From cdfdcd3e5d665c6f8093623e7323cef3d58aa308 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Ko=C5=82odziejczak?= <31549762+mrl5@users.noreply.github.com> Date: Sun, 25 Aug 2024 17:33:45 +0200 Subject: [PATCH 1497/1571] chore: improve markdown formatting (#8825) fixes: ![Screenshot_2024-08-25_16-25-30](https://github.com/user-attachments/assets/c993309b-6c2d-4938-9fd0-ce0953fc63ff) fixes: ![Screenshot_2024-08-25_16-26-29](https://github.com/user-attachments/assets/cf497f4a-d9e3-45a6-a1a5-7e215d96d022) --- proxy/README.md | 5 +++-- storage_scrubber/README.md | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/proxy/README.md b/proxy/README.md index d1f2e3f27b..afc8b77db8 100644 --- a/proxy/README.md +++ b/proxy/README.md @@ -36,7 +36,7 @@ To play with it locally one may start proxy over a local postgres installation ``` If both postgres and proxy are running you may send a SQL query: -```json +```console curl -k -X POST 'https://proxy.localtest.me:4444/sql' \ -H 'Neon-Connection-String: postgres://stas:pass@proxy.localtest.me:4444/postgres' \ -H 'Content-Type: application/json' \ @@ -44,7 +44,8 @@ curl -k -X POST 'https://proxy.localtest.me:4444/sql' \ "query":"SELECT $1::int[] as arr, $2::jsonb as obj, 42 as num", "params":[ "{{1,2},{\"3\",4}}", {"key":"val", "ikey":4242}] }' | jq - +``` +```json { "command": "SELECT", "fields": [ diff --git a/storage_scrubber/README.md b/storage_scrubber/README.md index 9fbd92feef..5be8541419 100644 --- a/storage_scrubber/README.md +++ b/storage_scrubber/README.md @@ -98,7 +98,7 @@ to list timelines and find their backup and start LSNs. If S3 state is altered first manually, pageserver in-memory state will contain wrong data about S3 state, and tenants/timelines may get recreated on S3 (due to any layer upload due to compaction, pageserver restart, etc.). So before proceeding, for tenants/timelines which are already deleted in the console, we must remove these from pageservers. -First, we need to group pageservers by buckets, `https:///admin/pageservers`` can be used for all env nodes, then `cat /storage/pageserver/data/pageserver.toml` on every node will show the bucket names and regions needed. +First, we need to group pageservers by buckets, `https:///admin/pageservers` can be used for all env nodes, then `cat /storage/pageserver/data/pageserver.toml` on every node will show the bucket names and regions needed. Per bucket, for every pageserver id related, find deleted tenants: From d48229f50f9253026267083f9b31513754d03365 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Mon, 26 Aug 2024 14:34:18 +0200 Subject: [PATCH 1498/1571] feat(compute): Introduce new compute_subscriptions_count metric (#8796) ## Problem We need some metric to sneak peek into how many people use inbound logical replication (Neon is a subscriber). ## Summary of changes This commit adds a new metric `compute_subscriptions_count`, which is number of subscriptions grouped by enabled/disabled state. 
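The gauge is driven by a plain `pg_subscription` query; a small sketch of running the same query by hand to see what the collector will report (the connection string is illustrative, and the tokio-postgres usage mirrors how this repo already talks to Postgres):

```rust
use tokio_postgres::NoTls;

#[tokio::main]
async fn main() -> Result<(), tokio_postgres::Error> {
    // Illustrative connection string; point it at the compute you want to inspect.
    let (client, connection) = tokio_postgres::connect(
        "host=127.0.0.1 port=5432 user=postgres dbname=postgres",
        NoTls,
    )
    .await?;
    tokio::spawn(connection);

    // Same query the new compute_subscriptions_count metric runs.
    let rows = client
        .query(
            "select subenabled::text as enabled, count(*) as subscriptions_count \
             from pg_subscription group by subenabled",
            &[],
        )
        .await?;

    for row in rows {
        let enabled: String = row.get("enabled");
        let count: i64 = row.get("subscriptions_count");
        println!("enabled={enabled} subscriptions_count={count}");
    }
    Ok(())
}
```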
Resolves: neondatabase/cloud#16146 --- vm-image-spec.yaml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 622004b931..0bacb63509 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -356,6 +356,17 @@ files: from pg_replication_slots where slot_type = 'logical'; + - metric_name: compute_subscriptions_count + type: gauge + help: 'Number of logical replication subscriptions grouped by enabled/disabled' + key_labels: + - enabled + values: [subscriptions_count] + query: | + select subenabled::text as enabled, count(*) as subscriptions_count + from pg_subscription + group by subenabled; + - metric_name: retained_wal type: gauge help: 'Retained WAL in inactive replication slots' From d6eede515a18283b3110ad98ab5dd50f029a38e2 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Mon, 26 Aug 2024 15:16:54 +0200 Subject: [PATCH 1499/1571] proxy: clippy lints: handle some low hanging fruit (#8829) Should be mostly uncontroversial ones. --- proxy/src/auth/backend.rs | 37 ++++++++++---------- proxy/src/auth/backend/jwt.rs | 17 ++++------ proxy/src/auth/credentials.rs | 9 +++-- proxy/src/cache/project_info.rs | 8 ++--- proxy/src/cancellation.rs | 3 +- proxy/src/compute.rs | 2 +- proxy/src/config.rs | 2 +- proxy/src/console/provider/mock.rs | 22 +++++------- proxy/src/console/provider/neon.rs | 10 +++--- proxy/src/lib.rs | 31 ++++++----------- proxy/src/proxy.rs | 6 ++-- proxy/src/proxy/connect_compute.rs | 7 ++-- proxy/src/proxy/copy_bidirectional.rs | 7 ++-- proxy/src/proxy/tests.rs | 6 ++-- proxy/src/proxy/tests/mitm.rs | 4 +-- proxy/src/rate_limiter/leaky_bucket.rs | 1 + proxy/src/rate_limiter/limit_algorithm.rs | 3 +- proxy/src/redis/notifications.rs | 2 +- proxy/src/scram/messages.rs | 2 +- proxy/src/scram/secret.rs | 8 +---- proxy/src/scram/threadpool.rs | 7 ++-- proxy/src/serverless.rs | 4 +-- proxy/src/serverless/backend.rs | 20 ++++++----- proxy/src/serverless/conn_pool.rs | 41 ++++++++++++----------- proxy/src/serverless/json.rs | 2 +- proxy/src/serverless/sql_over_http.rs | 4 +-- proxy/src/stream.rs | 2 +- proxy/src/usage_metrics.rs | 9 ++--- proxy/src/waiters.rs | 2 +- 29 files changed, 128 insertions(+), 150 deletions(-) diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index ae72bc6de3..bb9a0ddffc 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -85,7 +85,7 @@ pub trait TestBackend: Send + Sync + 'static { impl std::fmt::Display for BackendType<'_, (), ()> { fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - Self::Console(api, _) => match &**api { + Self::Console(api, ()) => match &**api { ConsoleBackend::Console(endpoint) => { fmt.debug_tuple("Console").field(&endpoint.url()).finish() } @@ -96,7 +96,7 @@ impl std::fmt::Display for BackendType<'_, (), ()> { #[cfg(test)] ConsoleBackend::Test(_) => fmt.debug_tuple("Test").finish(), }, - Self::Link(url, _) => fmt.debug_tuple("Link").field(&url.as_str()).finish(), + Self::Link(url, ()) => fmt.debug_tuple("Link").field(&url.as_str()).finish(), Self::Local(_) => fmt.debug_tuple("Local").finish(), } } @@ -324,21 +324,20 @@ async fn auth_quirks( }; let (cached_entry, secret) = cached_secret.take_value(); - let secret = match secret { - Some(secret) => config.check_rate_limit( + let secret = if let Some(secret) = secret { + config.check_rate_limit( ctx, config, secret, &info.endpoint, unauthenticated_password.is_some() || allow_cleartext, - )?, - None => { - // If we don't have an authentication secret, we 
mock one to - // prevent malicious probing (possible due to missing protocol steps). - // This mocked secret will never lead to successful authentication. - info!("authentication info not found, mocking it"); - AuthSecret::Scram(scram::ServerSecret::mock(rand::random())) - } + )? + } else { + // If we don't have an authentication secret, we mock one to + // prevent malicious probing (possible due to missing protocol steps). + // This mocked secret will never lead to successful authentication. + info!("authentication info not found, mocking it"); + AuthSecret::Scram(scram::ServerSecret::mock(rand::random())) }; match authenticate_with_secret( @@ -409,7 +408,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { pub fn get_endpoint(&self) -> Option { match self { Self::Console(_, user_info) => user_info.endpoint_id.clone(), - Self::Link(_, _) => Some("link".into()), + Self::Link(_, ()) => Some("link".into()), Self::Local(_) => Some("local".into()), } } @@ -418,7 +417,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { pub fn get_user(&self) -> &str { match self { Self::Console(_, user_info) => &user_info.user, - Self::Link(_, _) => "link", + Self::Link(_, ()) => "link", Self::Local(_) => "local", } } @@ -454,7 +453,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { BackendType::Console(api, credentials) } // NOTE: this auth backend doesn't use client credentials. - Self::Link(url, _) => { + Self::Link(url, ()) => { info!("performing link authentication"); let info = link::authenticate(ctx, &url, client).await?; @@ -478,7 +477,7 @@ impl BackendType<'_, ComputeUserInfo, &()> { ) -> Result { match self { Self::Console(api, user_info) => api.get_role_secret(ctx, user_info).await, - Self::Link(_, _) => Ok(Cached::new_uncached(None)), + Self::Link(_, ()) => Ok(Cached::new_uncached(None)), Self::Local(_) => Ok(Cached::new_uncached(None)), } } @@ -489,7 +488,7 @@ impl BackendType<'_, ComputeUserInfo, &()> { ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { match self { Self::Console(api, user_info) => api.get_allowed_ips_and_secret(ctx, user_info).await, - Self::Link(_, _) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), + Self::Link(_, ()) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), Self::Local(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), } } @@ -525,7 +524,7 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> { ) -> Result { match self { Self::Console(api, creds) => api.wake_compute(ctx, &creds.info).await, - Self::Link(_, _) => unreachable!("link auth flow doesn't support waking the compute"), + Self::Link(_, ()) => unreachable!("link auth flow doesn't support waking the compute"), Self::Local(local) => Ok(Cached::new_uncached(local.node_info.clone())), } } @@ -533,7 +532,7 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> { fn get_keys(&self) -> &ComputeCredentialKeys { match self { Self::Console(_, creds) => &creds.keys, - Self::Link(_, _) => &ComputeCredentialKeys::None, + Self::Link(_, ()) => &ComputeCredentialKeys::None, Self::Local(_) => &ComputeCredentialKeys::None, } } diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index 49d5de16c3..61833e19ed 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -224,10 +224,10 @@ impl JwkCacheEntryLock { // where Signature = alg( || . 
|| ); let (header_payload, signature) = jwt - .rsplit_once(".") + .rsplit_once('.') .context("Provided authentication token is not a valid JWT encoding")?; let (header, payload) = header_payload - .split_once(".") + .split_once('.') .context("Provided authentication token is not a valid JWT encoding")?; let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD) @@ -320,14 +320,11 @@ impl JwkCache { // try with just a read lock first let key = (endpoint, role_name.clone()); let entry = self.map.get(&key).as_deref().map(Arc::clone); - let entry = match entry { - Some(entry) => entry, - None => { - // acquire a write lock after to insert. - let entry = self.map.entry(key).or_default(); - Arc::clone(&*entry) - } - }; + let entry = entry.unwrap_or_else(|| { + // acquire a write lock after to insert. + let entry = self.map.entry(key).or_default(); + Arc::clone(&*entry) + }); entry .check_jwt(ctx, jwt, &self.client, role_name, fetch) diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 849e7d65e8..cb06fcaf55 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -130,9 +130,12 @@ impl ComputeUserInfoMaybeEndpoint { })) } // Invariant: project name may not contain certain characters. - (a, b) => a.or(b).map(|name| match project_name_valid(name.as_ref()) { - false => Err(ComputeUserInfoParseError::MalformedProjectName(name)), - true => Ok(name), + (a, b) => a.or(b).map(|name| { + if project_name_valid(name.as_ref()) { + Ok(name) + } else { + Err(ComputeUserInfoParseError::MalformedProjectName(name)) + } }), } .transpose()?; diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index 10cc4ceee1..eda886a7af 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -274,13 +274,13 @@ impl ProjectInfoCacheImpl { let ttl_disabled_since_us = self .ttl_disabled_since_us .load(std::sync::atomic::Ordering::Relaxed); - let ignore_cache_since = if ttl_disabled_since_us != u64::MAX { + let ignore_cache_since = if ttl_disabled_since_us == u64::MAX { + None + } else { let ignore_cache_since = self.start_time + Duration::from_micros(ttl_disabled_since_us); // We are fine if entry is not older than ttl or was added before we are getting notifications. 
valid_since = valid_since.min(ignore_cache_since); Some(ignore_cache_since) - } else { - None }; (valid_since, ignore_cache_since) } @@ -306,7 +306,7 @@ impl ProjectInfoCacheImpl { let mut removed = 0; let shard = self.project2ep.shards()[shard].write(); for (_, endpoints) in shard.iter() { - for endpoint in endpoints.get().iter() { + for endpoint in endpoints.get() { self.cache.remove(endpoint); removed += 1; } diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 34512e9f5b..ea8f7b4070 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -220,7 +220,8 @@ mod tests { #[tokio::test] async fn cancel_session_noop_regression() { - let handler = CancellationHandler::<()>::new(Default::default(), CancellationSource::Local); + let handler = + CancellationHandler::<()>::new(CancelMap::default(), CancellationSource::Local); handler .cancel_session( CancelKeyData { diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index c071a59d58..b6659f5dd0 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -286,7 +286,7 @@ impl ConnCfg { let client_config = if allow_self_signed_compute { // Allow all certificates for creating the connection - let verifier = Arc::new(AcceptEverythingVerifier) as Arc; + let verifier = Arc::new(AcceptEverythingVerifier); rustls::ClientConfig::builder() .dangerous() .with_custom_certificate_verifier(verifier) diff --git a/proxy/src/config.rs b/proxy/src/config.rs index a280aa88ce..6c42fb8d19 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -318,7 +318,7 @@ impl CertResolver { // a) Instead of multi-cert approach use single cert with extra // domains listed in Subject Alternative Name (SAN). // b) Deploy separate proxy instances for extra domains. - self.default.as_ref().cloned() + self.default.clone() } } } diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs index 2093da7562..4e8b7a9365 100644 --- a/proxy/src/console/provider/mock.rs +++ b/proxy/src/console/provider/mock.rs @@ -64,7 +64,7 @@ impl Api { tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls).await?; tokio::spawn(connection); - let secret = match get_execute_postgres_query( + let secret = if let Some(entry) = get_execute_postgres_query( &client, "select rolpassword from pg_catalog.pg_authid where rolname = $1", &[&&*user_info.user], @@ -72,15 +72,12 @@ impl Api { ) .await? { - Some(entry) => { - info!("got a secret: {entry}"); // safe since it's not a prod scenario - let secret = scram::ServerSecret::parse(&entry).map(AuthSecret::Scram); - secret.or_else(|| parse_md5(&entry).map(AuthSecret::Md5)) - } - None => { - warn!("user '{}' does not exist", user_info.user); - None - } + info!("got a secret: {entry}"); // safe since it's not a prod scenario + let secret = scram::ServerSecret::parse(&entry).map(AuthSecret::Scram); + secret.or_else(|| parse_md5(&entry).map(AuthSecret::Md5)) + } else { + warn!("user '{}' does not exist", user_info.user); + None }; let allowed_ips = match get_execute_postgres_query( &client, @@ -142,12 +139,11 @@ async fn get_execute_postgres_query( let rows = client.query(query, params).await?; // We can get at most one row, because `rolname` is unique. - let row = match rows.first() { - Some(row) => row, + let Some(row) = rows.first() else { // This means that the user doesn't exist, so there can be no secret. // However, this is still a *valid* outcome which is very similar // to getting `404 Not found` from the Neon console. 
- None => return Ok(None), + return Ok(None); }; let entry = row.try_get(idx).map_err(MockApiError::PasswordNotSet)?; diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 7eda238b66..a6c0e233fc 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -38,9 +38,9 @@ impl Api { locks: &'static ApiLocks, wake_compute_endpoint_rate_limiter: Arc, ) -> Self { - let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") { + let jwt = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") { Ok(v) => v, - Err(_) => "".to_string(), + Err(_) => String::new(), }; Self { endpoint, @@ -96,10 +96,10 @@ impl Api { // Error 404 is special: it's ok not to have a secret. // TODO(anna): retry Err(e) => { - if e.get_reason().is_not_found() { - return Ok(AuthInfo::default()); + return if e.get_reason().is_not_found() { + Ok(AuthInfo::default()) } else { - return Err(e.into()); + Err(e.into()) } } }; diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index 8e1a4e4fa2..1e14ca59ec 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -12,6 +12,8 @@ // https://rust-lang.github.io/rust-clippy/master/index.html#?groups=restriction #![warn( clippy::undocumented_unsafe_blocks, + // TODO: Enable once all individual checks are enabled. + //clippy::as_conversions, clippy::dbg_macro, clippy::empty_enum_variants_with_brackets, clippy::exit, @@ -31,8 +33,15 @@ )] // List of permanently allowed lints. #![allow( - // It's ok to cast u8 to bool, etc. + // It's ok to cast bool to u8, etc. clippy::cast_lossless, + // Seems unavoidable. + clippy::multiple_crate_versions, + // While #[must_use] is a great feature this check is too noisy. + clippy::must_use_candidate, + // Inline consts, structs, fns, imports, etc. are ok if they're used by + // the following statement(s). + clippy::items_after_statements, )] // List of temporarily allowed lints. // TODO: Switch to except() once stable with 1.81. @@ -43,46 +52,26 @@ clippy::cast_possible_wrap, clippy::cast_precision_loss, clippy::cast_sign_loss, - clippy::default_trait_access, clippy::doc_markdown, - clippy::explicit_iter_loop, - clippy::float_cmp, - clippy::if_not_else, - clippy::ignored_unit_patterns, clippy::implicit_hasher, - clippy::inconsistent_struct_constructor, clippy::inline_always, - clippy::items_after_statements, - clippy::manual_assert, - clippy::manual_let_else, - clippy::manual_string_new, - clippy::match_bool, clippy::match_same_arms, clippy::match_wild_err_arm, clippy::missing_errors_doc, clippy::missing_panics_doc, clippy::module_name_repetitions, - clippy::multiple_crate_versions, - clippy::must_use_candidate, - clippy::needless_for_each, clippy::needless_pass_by_value, clippy::needless_raw_string_hashes, - clippy::option_as_ref_cloned, clippy::redundant_closure_for_method_calls, - clippy::redundant_else, clippy::return_self_not_must_use, clippy::similar_names, - clippy::single_char_pattern, clippy::single_match_else, clippy::struct_excessive_bools, clippy::struct_field_names, clippy::too_many_lines, - clippy::uninlined_format_args, - clippy::unnested_or_patterns, clippy::unreadable_literal, clippy::unused_async, clippy::unused_self, - clippy::used_underscore_binding, clippy::wildcard_imports )] // List of temporarily allowed lints to unblock beta/nightly. 
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 2182f38fe7..aa1025a29f 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -254,7 +254,7 @@ pub async fn handle_client( let metrics = &Metrics::get().proxy; let proto = ctx.protocol(); - let _request_gauge = metrics.connection_requests.guard(proto); + let request_gauge = metrics.connection_requests.guard(proto); let tls = config.tls_config.as_ref(); @@ -283,7 +283,7 @@ pub async fn handle_client( let result = config .auth_backend .as_ref() - .map(|_| auth::ComputeUserInfoMaybeEndpoint::parse(ctx, ¶ms, hostname, common_names)) + .map(|()| auth::ComputeUserInfoMaybeEndpoint::parse(ctx, ¶ms, hostname, common_names)) .transpose(); let user_info = match result { @@ -340,7 +340,7 @@ pub async fn handle_client( client: stream, aux: node.aux.clone(), compute: node, - req: _request_gauge, + req: request_gauge, conn: conn_gauge, cancel: session, })) diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index e1a54a9c98..6305dc204e 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -30,9 +30,10 @@ pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> NodeInfo { if is_cached { warn!("invalidating stalled compute node info cache entry"); } - let label = match is_cached { - true => ConnectionFailureKind::ComputeCached, - false => ConnectionFailureKind::ComputeUncached, + let label = if is_cached { + ConnectionFailureKind::ComputeCached + } else { + ConnectionFailureKind::ComputeUncached }; Metrics::get().proxy.connection_failures_total.inc(label); diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs index 048523f69c..f8c8e8bc4b 100644 --- a/proxy/src/proxy/copy_bidirectional.rs +++ b/proxy/src/proxy/copy_bidirectional.rs @@ -230,11 +230,10 @@ impl CopyBuffer { io::ErrorKind::WriteZero, "write zero byte into writer", )))); - } else { - self.pos += i; - self.amt += i as u64; - self.need_flush = true; } + self.pos += i; + self.amt += i as u64; + self.need_flush = true; } // If pos larger than cap, this loop will never stop. 
diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index d8308c4f2a..21c0641a7f 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -433,7 +433,7 @@ impl ReportableError for TestConnectError { impl std::fmt::Display for TestConnectError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{:?}", self) + write!(f, "{self:?}") } } @@ -475,7 +475,7 @@ impl ConnectMechanism for TestConnectMechanism { retryable: false, kind: ErrorKind::Compute, }), - x => panic!("expecting action {:?}, connect is called instead", x), + x => panic!("expecting action {x:?}, connect is called instead"), } } @@ -515,7 +515,7 @@ impl TestBackend for TestConnectMechanism { assert!(err.could_retry()); Err(console::errors::WakeComputeError::ApiError(err)) } - x => panic!("expecting action {:?}, wake_compute is called instead", x), + x => panic!("expecting action {x:?}, wake_compute is called instead"), } } diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index 2d752b9183..71f07f4682 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -115,9 +115,7 @@ where let mut buf = [0]; stream.read_exact(&mut buf).await.unwrap(); - if buf[0] != b'S' { - panic!("ssl not supported by server"); - } + assert!(buf[0] == b'S', "ssl not supported by server"); tls.connect(stream).await.unwrap() } diff --git a/proxy/src/rate_limiter/leaky_bucket.rs b/proxy/src/rate_limiter/leaky_bucket.rs index 2d5e056540..f184e18f4c 100644 --- a/proxy/src/rate_limiter/leaky_bucket.rs +++ b/proxy/src/rate_limiter/leaky_bucket.rs @@ -119,6 +119,7 @@ impl Default for LeakyBucketState { } #[cfg(test)] +#[allow(clippy::float_cmp)] mod tests { use std::time::Duration; diff --git a/proxy/src/rate_limiter/limit_algorithm.rs b/proxy/src/rate_limiter/limit_algorithm.rs index 80a62b2a76..bc16837f65 100644 --- a/proxy/src/rate_limiter/limit_algorithm.rs +++ b/proxy/src/rate_limiter/limit_algorithm.rs @@ -174,9 +174,8 @@ impl DynamicLimiter { let mut inner = self.inner.lock(); if inner.take(&self.ready).is_some() { break Ok(Token::new(self.clone())); - } else { - notified.set(self.ready.notified()); } + notified.set(self.ready.notified()); } notified.as_mut().await; ready = true; diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index ad69246443..31c0e62c2c 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -150,7 +150,7 @@ impl MessageHandler { } } } - _ => { + Notification::AllowedIpsUpdate { .. } | Notification::PasswordUpdate { .. } => { invalidate_cache(self.cache.clone(), msg.clone()); if matches!(msg, Notification::AllowedIpsUpdate { .. 
}) { Metrics::get() diff --git a/proxy/src/scram/messages.rs b/proxy/src/scram/messages.rs index 5ecbbf7004..54157e450d 100644 --- a/proxy/src/scram/messages.rs +++ b/proxy/src/scram/messages.rs @@ -89,7 +89,7 @@ impl<'a> ClientFirstMessage<'a> { write!(&mut message, "r={}", self.nonce).unwrap(); base64::encode_config_buf(nonce, base64::STANDARD, &mut message); let combined_nonce = 2..message.len(); - write!(&mut message, ",s={},i={}", salt_base64, iterations).unwrap(); + write!(&mut message, ",s={salt_base64},i={iterations}").unwrap(); // This design guarantees that it's impossible to create a // server-first-message without receiving a client-first-message diff --git a/proxy/src/scram/secret.rs b/proxy/src/scram/secret.rs index 44c4f9e44a..a08cb943c3 100644 --- a/proxy/src/scram/secret.rs +++ b/proxy/src/scram/secret.rs @@ -82,13 +82,7 @@ mod tests { let stored_key = "D5h6KTMBlUvDJk2Y8ELfC1Sjtc6k9YHjRyuRZyBNJns="; let server_key = "Pi3QHbcluX//NDfVkKlFl88GGzlJ5LkyPwcdlN/QBvI="; - let secret = format!( - "SCRAM-SHA-256${iterations}:{salt}${stored_key}:{server_key}", - iterations = iterations, - salt = salt, - stored_key = stored_key, - server_key = server_key, - ); + let secret = format!("SCRAM-SHA-256${iterations}:{salt}${stored_key}:{server_key}"); let parsed = ServerSecret::parse(&secret).unwrap(); assert_eq!(parsed.iterations, iterations); diff --git a/proxy/src/scram/threadpool.rs b/proxy/src/scram/threadpool.rs index fa3d3ccca2..8fbaecf93d 100644 --- a/proxy/src/scram/threadpool.rs +++ b/proxy/src/scram/threadpool.rs @@ -222,12 +222,11 @@ fn thread_rt(pool: Arc, worker: Worker, index: usize) { } for i in 0.. { - let mut job = match worker + let Some(mut job) = worker .pop() .or_else(|| pool.steal(&mut rng, index, &worker)) - { - Some(job) => job, - None => continue 'wait, + else { + continue 'wait; }; pool.metrics diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index ea65867293..d9a9019746 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -93,11 +93,11 @@ pub async fn task_main( let mut tls_server_config = rustls::ServerConfig::clone(&config.to_server_config()); // prefer http2, but support http/1.1 tls_server_config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()]; - Arc::new(tls_server_config) as Arc<_> + Arc::new(tls_server_config) } None => { warn!("TLS config is missing"); - Arc::new(NoTls) as Arc<_> + Arc::new(NoTls) } }; diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index b44ecb76e3..9cc271c588 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -44,7 +44,11 @@ impl PoolingBackend { password: &[u8], ) -> Result { let user_info = user_info.clone(); - let backend = self.config.auth_backend.as_ref().map(|_| user_info.clone()); + let backend = self + .config + .auth_backend + .as_ref() + .map(|()| user_info.clone()); let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?; if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) { return Err(AuthError::ip_address_not_allowed(ctx.peer_addr())); @@ -101,10 +105,10 @@ impl PoolingBackend { jwt: &str, ) -> Result { match &self.config.auth_backend { - crate::auth::BackendType::Console(_, _) => { + crate::auth::BackendType::Console(_, ()) => { Err(AuthError::auth_failed("JWT login is not yet supported")) } - crate::auth::BackendType::Link(_, _) => Err(AuthError::auth_failed( + crate::auth::BackendType::Link(_, ()) => Err(AuthError::auth_failed( "JWT login over link proxy is not supported", )), 
crate::auth::BackendType::Local(cache) => { @@ -138,12 +142,12 @@ impl PoolingBackend { keys: ComputeCredentials, force_new: bool, ) -> Result, HttpConnError> { - let maybe_client = if !force_new { - info!("pool: looking for an existing connection"); - self.pool.get(ctx, &conn_info)? - } else { + let maybe_client = if force_new { info!("pool: pool is disabled"); None + } else { + info!("pool: looking for an existing connection"); + self.pool.get(ctx, &conn_info)? }; if let Some(client) = maybe_client { @@ -152,7 +156,7 @@ impl PoolingBackend { let conn_id = uuid::Uuid::new_v4(); tracing::Span::current().record("conn_id", display(conn_id)); info!(%conn_id, "pool: opening a new connection '{conn_info}'"); - let backend = self.config.auth_backend.as_ref().map(|_| keys); + let backend = self.config.auth_backend.as_ref().map(|()| keys); crate::proxy::connect_compute::connect_to_compute( ctx, &TokioMechanism { diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 6ed694af58..476083d71e 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -339,9 +339,9 @@ impl GlobalConnPool { } = pool.get_mut(); // ensure that closed clients are removed - pools.iter_mut().for_each(|(_, db_pool)| { + for db_pool in pools.values_mut() { clients_removed += db_pool.clear_closed_clients(total_conns); - }); + } // we only remove this pool if it has no active connections if *total_conns == 0 { @@ -405,21 +405,20 @@ impl GlobalConnPool { if client.is_closed() { info!("pool: cached connection '{conn_info}' is closed, opening a new one"); return Ok(None); - } else { - tracing::Span::current().record("conn_id", tracing::field::display(client.conn_id)); - tracing::Span::current().record( - "pid", - tracing::field::display(client.inner.get_process_id()), - ); - info!( - cold_start_info = ColdStartInfo::HttpPoolHit.as_str(), - "pool: reusing connection '{conn_info}'" - ); - client.session.send(ctx.session_id())?; - ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit); - ctx.success(); - return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool))); } + tracing::Span::current().record("conn_id", tracing::field::display(client.conn_id)); + tracing::Span::current().record( + "pid", + tracing::field::display(client.inner.get_process_id()), + ); + info!( + cold_start_info = ColdStartInfo::HttpPoolHit.as_str(), + "pool: reusing connection '{conn_info}'" + ); + client.session.send(ctx.session_id())?; + ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit); + ctx.success(); + return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool))); } Ok(None) } @@ -660,7 +659,7 @@ impl Client { span: _, } = self; let inner = inner.as_mut().expect("client inner should not be removed"); - (&mut inner.inner, Discard { pool, conn_info }) + (&mut inner.inner, Discard { conn_info, pool }) } } @@ -722,7 +721,9 @@ impl Drop for Client { mod tests { use std::{mem, sync::atomic::AtomicBool}; - use crate::{serverless::cancel_set::CancelSet, BranchId, EndpointId, ProjectId}; + use crate::{ + proxy::NeonOptions, serverless::cancel_set::CancelSet, BranchId, EndpointId, ProjectId, + }; use super::*; @@ -781,7 +782,7 @@ mod tests { user_info: ComputeUserInfo { user: "user".into(), endpoint: "endpoint".into(), - options: Default::default(), + options: NeonOptions::default(), }, dbname: "dbname".into(), auth: AuthData::Password("password".as_bytes().into()), @@ -839,7 +840,7 @@ mod tests { user_info: ComputeUserInfo { user: "user".into(), endpoint: "endpoint-2".into(), - 
options: Default::default(), + options: NeonOptions::default(), }, dbname: "dbname".into(), auth: AuthData::Password("password".as_bytes().into()), diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs index c22c63e85b..3776971fa1 100644 --- a/proxy/src/serverless/json.rs +++ b/proxy/src/serverless/json.rs @@ -55,7 +55,7 @@ fn json_array_to_pg_array(value: &Value) -> Option { .collect::>() .join(","); - Some(format!("{{{}}}", vals)) + Some(format!("{{{vals}}}")) } } } diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 79baef45f6..9143469eea 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -207,12 +207,12 @@ fn get_conn_info( .ok_or(ConnInfoError::MalformedEndpoint)? } else { hostname - .split_once(".") + .split_once('.') .map_or(hostname, |(prefix, _)| prefix) .into() } } - Some(url::Host::Ipv4(_)) | Some(url::Host::Ipv6(_)) | None => { + Some(url::Host::Ipv4(_) | url::Host::Ipv6(_)) | None => { return Err(ConnInfoError::MissingHostname) } }; diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index 7809d2e574..ef13f5fc1a 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -67,7 +67,7 @@ impl PqStream { FeMessage::PasswordMessage(msg) => Ok(msg), bad => Err(io::Error::new( io::ErrorKind::InvalidData, - format!("unexpected message type: {:?}", bad), + format!("unexpected message type: {bad:?}"), )), } } diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index a8735fe0bb..4cf6da7e2d 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -450,12 +450,9 @@ async fn upload_events_chunk( remote_path: &RemotePath, cancel: &CancellationToken, ) -> anyhow::Result<()> { - let storage = match storage { - Some(storage) => storage, - None => { - error!("no remote storage configured"); - return Ok(()); - } + let Some(storage) = storage else { + error!("no remote storage configured"); + return Ok(()); }; let data = serde_json::to_vec(&chunk).context("serialize metrics")?; let mut encoder = GzipEncoder::new(Vec::new()); diff --git a/proxy/src/waiters.rs b/proxy/src/waiters.rs index 3bd8f4c8ef..9f78242ed3 100644 --- a/proxy/src/waiters.rs +++ b/proxy/src/waiters.rs @@ -31,7 +31,7 @@ pub struct Waiters(pub(self) Mutex>>); impl Default for Waiters { fn default() -> Self { - Waiters(Default::default()) + Waiters(Mutex::default()) } } From 2dd53e7ae0adf7c8a5856bb86a287eddd591718d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 26 Aug 2024 17:30:19 +0200 Subject: [PATCH 1500/1571] Timeline archival test (#8824) This PR: * Implements the rule that archived timelines require all of their children to be archived as well, as specified in the RFC. There is no fancy locking mechanism though, so the precondition can still be broken. As a TODO for later, we still allow unarchiving timelines with archived parents. 
* Adds an `is_archived` flag to `TimelineInfo` * Adds timeline_archival_config to `PageserverHttpClient` * Adds a new `test_timeline_archive` test, loosely based on `test_timeline_delete` Part of #8088 --- libs/pageserver_api/src/models.rs | 1 + pageserver/src/http/routes.rs | 25 ++++- pageserver/src/tenant.rs | 70 ++++++++++++-- test_runner/fixtures/common_types.py | 7 ++ test_runner/fixtures/pageserver/http.py | 18 +++- test_runner/regress/test_timeline_archive.py | 96 ++++++++++++++++++++ 6 files changed, 207 insertions(+), 10 deletions(-) create mode 100644 test_runner/regress/test_timeline_archive.py diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index d55c06b685..4cab56771b 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -718,6 +718,7 @@ pub struct TimelineInfo { pub pg_version: u32, pub state: TimelineState, + pub is_archived: bool, pub walreceiver_status: String, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 4635e76ea9..cbcc162b32 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -318,6 +318,24 @@ impl From for ApiError { } } +impl From for ApiError { + fn from(value: crate::tenant::TimelineArchivalError) -> Self { + use crate::tenant::TimelineArchivalError::*; + match value { + NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found").into()), + Timeout => ApiError::Timeout("hit pageserver internal timeout".into()), + HasUnarchivedChildren(children) => ApiError::PreconditionFailed( + format!( + "Cannot archive timeline which has non-archived child timelines: {children:?}" + ) + .into_boxed_str(), + ), + a @ AlreadyInProgress => ApiError::Conflict(a.to_string()), + Other(e) => ApiError::InternalServerError(e), + } + } +} + impl From for ApiError { fn from(value: crate::tenant::mgr::DeleteTimelineError) -> Self { use crate::tenant::mgr::DeleteTimelineError::*; @@ -405,6 +423,8 @@ async fn build_timeline_info_common( let current_logical_size = timeline.get_current_logical_size(logical_size_task_priority, ctx); let current_physical_size = Some(timeline.layer_size_sum().await); let state = timeline.current_state(); + // Report is_archived = false if the timeline is still loading + let is_archived = timeline.is_archived().unwrap_or(false); let remote_consistent_lsn_projected = timeline .get_remote_consistent_lsn_projected() .unwrap_or(Lsn(0)); @@ -445,6 +465,7 @@ async fn build_timeline_info_common( pg_version: timeline.pg_version, state, + is_archived, walreceiver_status, @@ -686,9 +707,7 @@ async fn timeline_archival_config_handler( tenant .apply_timeline_archival_config(timeline_id, request_data.state) - .await - .context("applying archival config") - .map_err(ApiError::InternalServerError)?; + .await?; Ok::<_, ApiError>(()) } .instrument(info_span!("timeline_archival_config", diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 3a7afff211..d3589a12c8 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -501,6 +501,38 @@ impl Debug for DeleteTimelineError { } } +#[derive(thiserror::Error)] +pub enum TimelineArchivalError { + #[error("NotFound")] + NotFound, + + #[error("Timeout")] + Timeout, + + #[error("HasUnarchivedChildren")] + HasUnarchivedChildren(Vec), + + #[error("Timeline archival is already in progress")] + AlreadyInProgress, + + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +impl Debug for TimelineArchivalError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> 
std::fmt::Result { + match self { + Self::NotFound => write!(f, "NotFound"), + Self::Timeout => write!(f, "Timeout"), + Self::HasUnarchivedChildren(c) => { + f.debug_tuple("HasUnarchivedChildren").field(c).finish() + } + Self::AlreadyInProgress => f.debug_tuple("AlreadyInProgress").finish(), + Self::Other(e) => f.debug_tuple("Other").field(e).finish(), + } + } +} + pub enum SetStoppingError { AlreadyStopping(completion::Barrier), Broken, @@ -1326,24 +1358,50 @@ impl Tenant { &self, timeline_id: TimelineId, state: TimelineArchivalState, - ) -> anyhow::Result<()> { - let timeline = self - .get_timeline(timeline_id, false) - .context("Cannot apply timeline archival config to inexistent timeline")?; + ) -> Result<(), TimelineArchivalError> { + info!("setting timeline archival config"); + let timeline = { + let timelines = self.timelines.lock().unwrap(); + + let timeline = match timelines.get(&timeline_id) { + Some(t) => t, + None => return Err(TimelineArchivalError::NotFound), + }; + + // Ensure that there are no non-archived child timelines + let children: Vec = timelines + .iter() + .filter_map(|(id, entry)| { + if entry.get_ancestor_timeline_id() != Some(timeline_id) { + return None; + } + if entry.is_archived() == Some(true) { + return None; + } + Some(*id) + }) + .collect(); + + if !children.is_empty() && state == TimelineArchivalState::Archived { + return Err(TimelineArchivalError::HasUnarchivedChildren(children)); + } + Arc::clone(timeline) + }; let upload_needed = timeline .remote_client .schedule_index_upload_for_timeline_archival_state(state)?; if upload_needed { + info!("Uploading new state"); const MAX_WAIT: Duration = Duration::from_secs(10); let Ok(v) = tokio::time::timeout(MAX_WAIT, timeline.remote_client.wait_completion()).await else { tracing::warn!("reached timeout for waiting on upload queue"); - bail!("reached timeout for upload queue flush"); + return Err(TimelineArchivalError::Timeout); }; - v?; + v.map_err(|e| TimelineArchivalError::Other(anyhow::anyhow!(e)))?; } Ok(()) } diff --git a/test_runner/fixtures/common_types.py b/test_runner/fixtures/common_types.py index 7cadcbb4c2..8eda19d1e2 100644 --- a/test_runner/fixtures/common_types.py +++ b/test_runner/fixtures/common_types.py @@ -1,5 +1,6 @@ import random from dataclasses import dataclass +from enum import Enum from functools import total_ordering from typing import Any, Dict, Type, TypeVar, Union @@ -213,3 +214,9 @@ class TenantShardId: def __hash__(self) -> int: return hash(self._tuple()) + + +# TODO: Replace with `StrEnum` when we upgrade to python 3.11 +class TimelineArchivalState(str, Enum): + ARCHIVED = "Archived" + UNARCHIVED = "Unarchived" diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index cd4261f1b8..582f9c0264 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -10,7 +10,7 @@ import requests from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry -from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId +from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineArchivalState, TimelineId from fixtures.log_helper import log from fixtures.metrics import Metrics, MetricsGetter, parse_metrics from fixtures.pg_version import PgVersion @@ -621,6 +621,22 @@ class PageserverHttpClient(requests.Session, MetricsGetter): ) self.verbose_error(res) + def timeline_archival_config( + self, + tenant_id: Union[TenantId, TenantShardId], + timeline_id: TimelineId, + state: 
TimelineArchivalState, + ): + config = {"state": state.value} + log.info( + f"requesting timeline archival config {config} for tenant {tenant_id} and timeline {timeline_id}" + ) + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/archival_config", + json=config, + ) + self.verbose_error(res) + def timeline_get_lsn_by_timestamp( self, tenant_id: Union[TenantId, TenantShardId], diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py new file mode 100644 index 0000000000..b774c7c9fe --- /dev/null +++ b/test_runner/regress/test_timeline_archive.py @@ -0,0 +1,96 @@ +import pytest +from fixtures.common_types import TenantId, TimelineArchivalState, TimelineId +from fixtures.neon_fixtures import ( + NeonEnv, +) +from fixtures.pageserver.http import PageserverApiException + + +def test_timeline_archive(neon_simple_env: NeonEnv): + env = neon_simple_env + + env.pageserver.allowed_errors.extend( + [ + ".*Timeline .* was not found.*", + ".*timeline not found.*", + ".*Cannot archive timeline which has unarchived child timelines.*", + ".*Precondition failed: Requested tenant is missing.*", + ] + ) + + ps_http = env.pageserver.http_client() + + # first try to archive non existing timeline + # for existing tenant: + invalid_timeline_id = TimelineId.generate() + with pytest.raises(PageserverApiException, match="timeline not found") as exc: + ps_http.timeline_archival_config( + tenant_id=env.initial_tenant, + timeline_id=invalid_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + + assert exc.value.status_code == 404 + + # for non existing tenant: + invalid_tenant_id = TenantId.generate() + with pytest.raises( + PageserverApiException, + match=f"NotFound: tenant {invalid_tenant_id}", + ) as exc: + ps_http.timeline_archival_config( + tenant_id=invalid_tenant_id, + timeline_id=invalid_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + + assert exc.value.status_code == 404 + + # construct pair of branches to validate that pageserver prohibits + # archival of ancestor timelines when they have non-archived child branches + parent_timeline_id = env.neon_cli.create_branch("test_ancestor_branch_archive_parent", "empty") + + leaf_timeline_id = env.neon_cli.create_branch( + "test_ancestor_branch_archive_branch1", "test_ancestor_branch_archive_parent" + ) + + timeline_path = env.pageserver.timeline_dir(env.initial_tenant, parent_timeline_id) + + with pytest.raises( + PageserverApiException, + match="Cannot archive timeline which has non-archived child timelines", + ) as exc: + assert timeline_path.exists() + + ps_http.timeline_archival_config( + tenant_id=env.initial_tenant, + timeline_id=parent_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + + assert exc.value.status_code == 412 + + # Test timeline_detail + leaf_detail = ps_http.timeline_detail( + tenant_id=env.initial_tenant, + timeline_id=leaf_timeline_id, + ) + assert leaf_detail["is_archived"] is False + + # Test that archiving the leaf timeline and then the parent works + ps_http.timeline_archival_config( + tenant_id=env.initial_tenant, + timeline_id=leaf_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + leaf_detail = ps_http.timeline_detail( + tenant_id=env.initial_tenant, + timeline_id=leaf_timeline_id, + ) + assert leaf_detail["is_archived"] is True + + ps_http.timeline_archival_config( + tenant_id=env.initial_tenant, + timeline_id=parent_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) From 
97241776aa6ca9612d580b2ef87caab8bd292c4e Mon Sep 17 00:00:00 2001
From: Christian Schwarz
Date: Mon, 26 Aug 2024 18:07:55 +0200
Subject: [PATCH 1501/1571] pageserver: startup: ensure local disk state is durable (#8835)

refs https://github.com/neondatabase/neon/issues/6989

Problem
-------

After unclean shutdown, we get restarted, start reading the local filesystem, and make decisions based on those reads. However, some of the data might not have been fsynced yet when the unclean shutdown completed.

Durability matters even though Pageservers are conceptually just a cache of state in S3. For example:
- the cloud control plane is not a control loop => pageserver responses to tenant attachment, etc., need to be durable.
- the storage controller does not rely on this (as much?)
- we don't have layer file checksumming, so downloaded+renamed but not fsynced layer files are technically not to be trusted
  - https://github.com/neondatabase/neon/issues/2683

Solution
--------

`syncfs` the tenants directory during startup, before we start reading from it.

This is a bit overkill because we do remove some temp files (InMemoryLayer!) later during startup. Further, these temp files are particularly likely to be dirty in the kernel page cache. However, we don't want to refactor that cleanup code right now, and the dirty data on pageservers is generally not that high.

Last, with [direct IO](https://github.com/neondatabase/neon/issues/8130) we're going to have near-zero kernel page cache anyway quite soon.

---
 Cargo.toml                       |  2 +-
 pageserver/src/bin/pageserver.rs | 52 ++++++++++++++++++++++++++++++--
 2 files changed, 50 insertions(+), 4 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 963841e340..e038c0b4ff 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -113,7 +113,7 @@ md5 = "0.7.0"
 measured = { version = "0.0.22", features=["lasso"] }
 measured-process = { version = "0.0.22" }
 memoffset = "0.8"
-nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
+nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] }
 notify = "6.0.0"
 num_cpus = "1.15"
 num-traits = "0.2.15"
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index da0c11d9bf..7d404e50a5 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -126,10 +126,56 @@ fn main() -> anyhow::Result<()> {
     info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings");
     info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access");

+    // The tenants directory contains all the pageserver local disk state.
+    // Create if not exists and make sure all the contents are durable before proceeding.
+    // Ensuring durability eliminates a whole bug class where we come up after an unclean shutdown.
+    // After unclean shutdown, we don't know if all the filesystem content we can read via syscalls is actually durable or not.
+    // Examples for that: OOM kill, systemd killing us during shutdown, self abort due to unrecoverable IO error.
     let tenants_path = conf.tenants_path();
-    if !tenants_path.exists() {
-        utils::crashsafe::create_dir_all(conf.tenants_path())
-            .with_context(|| format!("Failed to create tenants root dir at '{tenants_path}'"))?;
+    {
+        let open = || {
+            nix::dir::Dir::open(
+                tenants_path.as_std_path(),
+                nix::fcntl::OFlag::O_DIRECTORY | nix::fcntl::OFlag::O_RDONLY,
+                nix::sys::stat::Mode::empty(),
+            )
+        };
+        let dirfd = match open() {
+            Ok(dirfd) => dirfd,
+            Err(e) => match e {
+                nix::errno::Errno::ENOENT => {
+                    utils::crashsafe::create_dir_all(&tenants_path).with_context(|| {
+                        format!("Failed to create tenants root dir at '{tenants_path}'")
+                    })?;
+                    open().context("open tenants dir after creating it")?
+                }
+                e => anyhow::bail!(e),
+            },
+        };
+
+        let started = Instant::now();
+        // Linux guarantees durability for syncfs.
+        // POSIX doesn't have syncfs, and further does not actually guarantee durability of sync().
+        #[cfg(target_os = "linux")]
+        {
+            use std::os::fd::AsRawFd;
+            nix::unistd::syncfs(dirfd.as_raw_fd()).context("syncfs")?;
+        }
+        #[cfg(target_os = "macos")]
+        {
+            // macOS is not a production platform for Neon, don't even bother.
+            drop(dirfd);
+        }
+        #[cfg(not(any(target_os = "linux", target_os = "macos")))]
+        {
+            compile_error!("Unsupported OS");
+        }
+
+        let elapsed = started.elapsed();
+        info!(
+            elapsed_ms = elapsed.as_millis(),
+            "made tenant directory contents durable"
+        );
     }

     // Initialize up failpoints support

From 0f6568426342f80c6faba81af9827cc4ad9fa2d6 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Tue, 27 Aug 2024 02:19:47 +0800
Subject: [PATCH 1502/1571] feat(pageserver): use split layer writer in gc-compaction (#8608)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Part of #8002, the final big PR in the batch.

## Summary of changes

This pull request uses the new split layer writer in the gc-compaction.

* It changes how layers are split. Previously, we split layers based on the original split point, but this creates too many layers (test_gc_feedback has one key per layer).
* Therefore, we first verify if the layer map can be processed by the current algorithm (See https://github.com/neondatabase/neon/pull/8191, it's basically the same check)
* Based on that check, we proceed with the compaction. This way, it creates a large enough layer close to the target layer size.
* Added a new set of `with_discard` functions in the split layer writer. This helps us skip layers if we are going to produce the same persistent key (see the sketch after this list).
* The delta writer will keep the updates of the same key in a single file. This might create a super large layer, but we can optimize it later.
* The split layer writer is used in the gc-compaction algorithm, and it will split layers based on size.
* Fix the image layer summary block encoding the wrong key range.
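To make the `with_discard` idea above concrete, here is a minimal, self-contained Rust sketch of the discard-callback pattern. It is only an illustration: `LayerKey` and `SplitResult` are hypothetical simplifications of the real `PersistentLayerKey` and `SplitWriterResult`, and the predicate is a plain closure, whereas the actual `*_with_discard_fn` methods take an async callback. The sketch models just the decision to skip writing a layer whose persistent key already exists.

```rust
use std::ops::Range;

/// Hypothetical, simplified stand-in for the pageserver's PersistentLayerKey.
#[derive(Debug, Clone, PartialEq)]
struct LayerKey {
    key_range: Range<u32>,
    lsn: u64,
}

/// Simplified stand-in for SplitWriterResult: either a layer was written,
/// or the write was skipped because an identical layer already exists.
#[derive(Debug)]
enum SplitResult {
    Produced(LayerKey),
    Discarded(LayerKey),
}

/// When the writer decides to cut a layer, it first asks the caller-supplied
/// `discard` predicate whether a layer with this persistent key should be skipped.
fn finish_layer(key: LayerKey, mut discard: impl FnMut(&LayerKey) -> bool) -> SplitResult {
    if discard(&key) {
        // Reuse the existing layer; write nothing.
        SplitResult::Discarded(key)
    } else {
        // ... a real writer would flush and register the layer file here ...
        SplitResult::Produced(key)
    }
}

fn main() {
    // Pretend compaction would regenerate a layer that is already on disk.
    let existing = LayerKey { key_range: 0..100, lsn: 0x10 };

    let fresh = finish_layer(
        LayerKey { key_range: 100..200, lsn: 0x10 },
        |k| *k == existing,
    );
    let skipped = finish_layer(existing.clone(), |k| *k == existing);

    println!("{fresh:?}");   // Produced(..)
    println!("{skipped:?}"); // Discarded(..)
}
```

Keeping the skip decision in a caller-supplied callback means the writer itself never has to know about the layer map; the compaction job decides which persistent keys already exist, and the writer simply reports them as discarded.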
--------- Signed-off-by: Alex Chi Z Co-authored-by: Arpad Müller Co-authored-by: Christian Schwarz --- libs/pageserver_api/src/key.rs | 9 + pageserver/src/tenant.rs | 220 ++++++++- pageserver/src/tenant/storage_layer.rs | 1 - .../src/tenant/storage_layer/delta_layer.rs | 4 +- .../src/tenant/storage_layer/image_layer.rs | 25 +- pageserver/src/tenant/storage_layer/layer.rs | 2 + .../src/tenant/storage_layer/split_writer.rs | 369 ++++++++++++--- pageserver/src/tenant/timeline.rs | 17 +- pageserver/src/tenant/timeline/compaction.rs | 447 ++++++++---------- 9 files changed, 751 insertions(+), 343 deletions(-) diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 2fdd7de38f..77da58d63e 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -236,6 +236,15 @@ impl Key { field5: u8::MAX, field6: u32::MAX, }; + /// A key slightly smaller than [`Key::MAX`] for use in layer key ranges to avoid them to be confused with L0 layers + pub const NON_L0_MAX: Key = Key { + field1: u8::MAX, + field2: u32::MAX, + field3: u32::MAX, + field4: u32::MAX, + field5: u8::MAX, + field6: u32::MAX - 1, + }; pub fn from_hex(s: &str) -> Result { if s.len() != 36 { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index d3589a12c8..0364d521b6 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -7071,18 +7071,14 @@ mod tests { vec![ // Image layer at GC horizon PersistentLayerKey { - key_range: { - let mut key = Key::MAX; - key.field6 -= 1; - Key::MIN..key - }, + key_range: Key::MIN..Key::NON_L0_MAX, lsn_range: Lsn(0x30)..Lsn(0x31), is_delta: false }, - // The delta layer that is cut in the middle + // The delta layer covers the full range (with the layer key hack to avoid being recognized as L0) PersistentLayerKey { - key_range: get_key(3)..get_key(4), - lsn_range: Lsn(0x30)..Lsn(0x41), + key_range: Key::MIN..Key::NON_L0_MAX, + lsn_range: Lsn(0x30)..Lsn(0x48), is_delta: true }, // The delta3 layer that should not be picked for the compaction @@ -8062,6 +8058,214 @@ mod tests { Ok(()) } + #[tokio::test] + async fn test_simple_bottom_most_compaction_with_retain_lsns_single_key() -> anyhow::Result<()> + { + let harness = + TenantHarness::create("test_simple_bottom_most_compaction_with_retain_lsns_single_key") + .await?; + let (tenant, ctx) = harness.load().await; + + fn get_key(id: u32) -> Key { + // using aux key here b/c they are guaranteed to be inside `collect_keyspace`. 
+ let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + let img_layer = (0..10) + .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10")))) + .collect_vec(); + + let delta1 = vec![ + ( + get_key(1), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ( + get_key(1), + Lsn(0x28), + Value::WalRecord(NeonWalRecord::wal_append("@0x28")), + ), + ]; + let delta2 = vec![ + ( + get_key(1), + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append("@0x30")), + ), + ( + get_key(1), + Lsn(0x38), + Value::WalRecord(NeonWalRecord::wal_append("@0x38")), + ), + ]; + let delta3 = vec![ + ( + get_key(8), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), + ), + ( + get_key(9), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), + ), + ]; + + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + vec![ + // delta1 and delta 2 only contain a single key but multiple updates + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x30), delta1), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x30)..Lsn(0x50), delta2), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x50), delta3), + ], // delta layers + vec![(Lsn(0x10), img_layer)], // image layers + Lsn(0x50), + ) + .await?; + { + // Update GC info + let mut guard = tline.gc_info.write().unwrap(); + *guard = GcInfo { + retain_lsns: vec![ + (Lsn(0x10), tline.timeline_id), + (Lsn(0x20), tline.timeline_id), + ], + cutoffs: GcCutoffs { + time: Lsn(0x30), + space: Lsn(0x30), + }, + leases: Default::default(), + within_ancestor_pitr: false, + }; + } + + let expected_result = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30@0x38"), + Bytes::from_static(b"value 2@0x10"), + Bytes::from_static(b"value 3@0x10"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10"), + Bytes::from_static(b"value 6@0x10"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10@0x48"), + Bytes::from_static(b"value 9@0x10@0x48"), + ]; + + let expected_result_at_gc_horizon = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30"), + Bytes::from_static(b"value 2@0x10"), + Bytes::from_static(b"value 3@0x10"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10"), + Bytes::from_static(b"value 6@0x10"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + + let expected_result_at_lsn_20 = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20"), + Bytes::from_static(b"value 2@0x10"), + Bytes::from_static(b"value 3@0x10"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10"), + Bytes::from_static(b"value 6@0x10"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + + let expected_result_at_lsn_10 = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10"), + Bytes::from_static(b"value 2@0x10"), + Bytes::from_static(b"value 3@0x10"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10"), + Bytes::from_static(b"value 6@0x10"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + + let verify_result = || async { + let gc_horizon = { + let 
gc_info = tline.gc_info.read().unwrap(); + gc_info.cutoffs.time + }; + for idx in 0..10 { + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x50), &ctx) + .await + .unwrap(), + &expected_result[idx] + ); + assert_eq!( + tline + .get(get_key(idx as u32), gc_horizon, &ctx) + .await + .unwrap(), + &expected_result_at_gc_horizon[idx] + ); + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x20), &ctx) + .await + .unwrap(), + &expected_result_at_lsn_20[idx] + ); + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x10), &ctx) + .await + .unwrap(), + &expected_result_at_lsn_10[idx] + ); + } + }; + + verify_result().await; + + let cancel = CancellationToken::new(); + let mut dryrun_flags = EnumSet::new(); + dryrun_flags.insert(CompactFlags::DryRun); + + tline + .compact_with_gc(&cancel, dryrun_flags, &ctx) + .await + .unwrap(); + // We expect layer map to be the same b/c the dry run flag, but we don't know whether there will be other background jobs + // cleaning things up, and therefore, we don't do sanity checks on the layer map during unit tests. + verify_result().await; + + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); + verify_result().await; + + // compact again + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); + verify_result().await; + + Ok(()) + } + #[tokio::test] async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> { let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?; diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 133b34b8b5..a1202ad507 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -8,7 +8,6 @@ mod layer_desc; mod layer_name; pub mod merge_iterator; -#[cfg(test)] pub mod split_writer; use crate::context::{AccessStatsBehavior, RequestContext}; diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index b1b5217f7f..f4a2957972 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -36,6 +36,7 @@ use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, Fi use crate::tenant::disk_btree::{ DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection, }; +use crate::tenant::storage_layer::layer::S3_UPLOAD_LIMIT; use crate::tenant::timeline::GetVectoredError; use crate::tenant::vectored_blob_io::{ BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, @@ -568,7 +569,6 @@ impl DeltaLayerWriterInner { // 5GB limit for objects without multipart upload (which we don't want to use) // Make it a little bit below to account for differing GB units // https://docs.aws.amazon.com/AmazonS3/latest/userguide/upload-objects.html - const S3_UPLOAD_LIMIT: u64 = 4_500_000_000; ensure!( metadata.len() <= S3_UPLOAD_LIMIT, "Created delta layer file at {} of size {} above limit {S3_UPLOAD_LIMIT}!", @@ -702,12 +702,10 @@ impl DeltaLayerWriter { self.inner.take().unwrap().finish(key_end, ctx).await } - #[cfg(test)] pub(crate) fn num_keys(&self) -> usize { self.inner.as_ref().unwrap().num_keys } - #[cfg(test)] pub(crate) fn estimated_size(&self) -> u64 { let inner = self.inner.as_ref().unwrap(); inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64 diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs 
b/pageserver/src/tenant/storage_layer/image_layer.rs index 94120a4e3e..3cb2b1c83a 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -716,10 +716,6 @@ struct ImageLayerWriterInner { } impl ImageLayerWriterInner { - fn size(&self) -> u64 { - self.tree.borrow_writer().size() + self.blob_writer.size() - } - /// /// Start building a new image layer. /// @@ -854,13 +850,19 @@ impl ImageLayerWriterInner { res?; } + let final_key_range = if let Some(end_key) = end_key { + self.key_range.start..end_key + } else { + self.key_range.clone() + }; + // Fill in the summary on blk 0 let summary = Summary { magic: IMAGE_FILE_MAGIC, format_version: STORAGE_FORMAT_VERSION, tenant_id: self.tenant_shard_id.tenant_id, timeline_id: self.timeline_id, - key_range: self.key_range.clone(), + key_range: final_key_range.clone(), lsn: self.lsn, index_start_blk, index_root_blk, @@ -881,11 +883,7 @@ impl ImageLayerWriterInner { let desc = PersistentLayerDesc::new_img( self.tenant_shard_id, self.timeline_id, - if let Some(end_key) = end_key { - self.key_range.start..end_key - } else { - self.key_range.clone() - }, + final_key_range, self.lsn, metadata.len(), ); @@ -974,14 +972,12 @@ impl ImageLayerWriter { self.inner.as_mut().unwrap().put_image(key, img, ctx).await } - #[cfg(test)] /// Estimated size of the image layer. pub(crate) fn estimated_size(&self) -> u64 { let inner = self.inner.as_ref().unwrap(); inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64 } - #[cfg(test)] pub(crate) fn num_keys(&self) -> usize { self.inner.as_ref().unwrap().num_keys } @@ -997,7 +993,6 @@ impl ImageLayerWriter { self.inner.take().unwrap().finish(timeline, ctx, None).await } - #[cfg(test)] /// Finish writing the image layer with an end key, used in [`super::split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive. pub(super) async fn finish_with_end_key( mut self, @@ -1011,10 +1006,6 @@ impl ImageLayerWriter { .finish(timeline, ctx, Some(end_key)) .await } - - pub(crate) fn size(&self) -> u64 { - self.inner.as_ref().unwrap().size() - } } impl Drop for ImageLayerWriter { diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 2607b574e7..53bb66b95e 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -35,6 +35,8 @@ mod tests; #[cfg(test)] mod failpoints; +pub const S3_UPLOAD_LIMIT: u64 = 4_500_000_000; + /// A Layer contains all data in a "rectangle" consisting of a range of keys and /// range of LSNs. 
/// diff --git a/pageserver/src/tenant/storage_layer/split_writer.rs b/pageserver/src/tenant/storage_layer/split_writer.rs index e12e29cd45..df910b5ad9 100644 --- a/pageserver/src/tenant/storage_layer/split_writer.rs +++ b/pageserver/src/tenant/storage_layer/split_writer.rs @@ -1,4 +1,4 @@ -use std::{ops::Range, sync::Arc}; +use std::{future::Future, ops::Range, sync::Arc}; use bytes::Bytes; use pageserver_api::key::{Key, KEY_SIZE}; @@ -7,7 +7,32 @@ use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId}; use crate::tenant::storage_layer::Layer; use crate::{config::PageServerConf, context::RequestContext, repository::Value, tenant::Timeline}; -use super::{DeltaLayerWriter, ImageLayerWriter, ResidentLayer}; +use super::layer::S3_UPLOAD_LIMIT; +use super::{ + DeltaLayerWriter, ImageLayerWriter, PersistentLayerDesc, PersistentLayerKey, ResidentLayer, +}; + +pub(crate) enum SplitWriterResult { + Produced(ResidentLayer), + Discarded(PersistentLayerKey), +} + +#[cfg(test)] +impl SplitWriterResult { + fn into_resident_layer(self) -> ResidentLayer { + match self { + SplitWriterResult::Produced(layer) => layer, + SplitWriterResult::Discarded(_) => panic!("unexpected discarded layer"), + } + } + + fn into_discarded_layer(self) -> PersistentLayerKey { + match self { + SplitWriterResult::Produced(_) => panic!("unexpected produced layer"), + SplitWriterResult::Discarded(layer) => layer, + } + } +} /// An image writer that takes images and produces multiple image layers. The interface does not /// guarantee atomicity (i.e., if the image layer generation fails, there might be leftover files @@ -16,11 +41,12 @@ use super::{DeltaLayerWriter, ImageLayerWriter, ResidentLayer}; pub struct SplitImageLayerWriter { inner: ImageLayerWriter, target_layer_size: u64, - generated_layers: Vec, + generated_layers: Vec, conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, lsn: Lsn, + start_key: Key, } impl SplitImageLayerWriter { @@ -49,16 +75,22 @@ impl SplitImageLayerWriter { timeline_id, tenant_shard_id, lsn, + start_key, }) } - pub async fn put_image( + pub async fn put_image_with_discard_fn( &mut self, key: Key, img: Bytes, tline: &Arc, ctx: &RequestContext, - ) -> anyhow::Result<()> { + discard: D, + ) -> anyhow::Result<()> + where + D: FnOnce(&PersistentLayerKey) -> F, + F: Future, + { // The current estimation is an upper bound of the space that the key/image could take // because we did not consider compression in this estimation. The resulting image layer // could be smaller than the target size. 
@@ -76,33 +108,87 @@ impl SplitImageLayerWriter { ) .await?; let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer); - self.generated_layers.push( - prev_image_writer - .finish_with_end_key(tline, key, ctx) - .await?, - ); + let layer_key = PersistentLayerKey { + key_range: self.start_key..key, + lsn_range: PersistentLayerDesc::image_layer_lsn_range(self.lsn), + is_delta: false, + }; + self.start_key = key; + + if discard(&layer_key).await { + drop(prev_image_writer); + self.generated_layers + .push(SplitWriterResult::Discarded(layer_key)); + } else { + self.generated_layers.push(SplitWriterResult::Produced( + prev_image_writer + .finish_with_end_key(tline, key, ctx) + .await?, + )); + } } self.inner.put_image(key, img, ctx).await } - pub(crate) async fn finish( + #[cfg(test)] + pub async fn put_image( + &mut self, + key: Key, + img: Bytes, + tline: &Arc, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + self.put_image_with_discard_fn(key, img, tline, ctx, |_| async { false }) + .await + } + + pub(crate) async fn finish_with_discard_fn( self, tline: &Arc, ctx: &RequestContext, end_key: Key, - ) -> anyhow::Result> { + discard: D, + ) -> anyhow::Result> + where + D: FnOnce(&PersistentLayerKey) -> F, + F: Future, + { let Self { mut generated_layers, inner, .. } = self; - generated_layers.push(inner.finish_with_end_key(tline, end_key, ctx).await?); + if inner.num_keys() == 0 { + return Ok(generated_layers); + } + let layer_key = PersistentLayerKey { + key_range: self.start_key..end_key, + lsn_range: PersistentLayerDesc::image_layer_lsn_range(self.lsn), + is_delta: false, + }; + if discard(&layer_key).await { + generated_layers.push(SplitWriterResult::Discarded(layer_key)); + } else { + generated_layers.push(SplitWriterResult::Produced( + inner.finish_with_end_key(tline, end_key, ctx).await?, + )); + } Ok(generated_layers) } + #[cfg(test)] + pub(crate) async fn finish( + self, + tline: &Arc, + ctx: &RequestContext, + end_key: Key, + ) -> anyhow::Result> { + self.finish_with_discard_fn(tline, ctx, end_key, |_| async { false }) + .await + } + /// When split writer fails, the caller should call this function and handle partially generated layers. - #[allow(dead_code)] - pub(crate) async fn take(self) -> anyhow::Result<(Vec, ImageLayerWriter)> { + pub(crate) fn take(self) -> anyhow::Result<(Vec, ImageLayerWriter)> { Ok((self.generated_layers, self.inner)) } } @@ -110,15 +196,21 @@ impl SplitImageLayerWriter { /// A delta writer that takes key-lsn-values and produces multiple delta layers. The interface does not /// guarantee atomicity (i.e., if the delta layer generation fails, there might be leftover files /// to be cleaned up). +/// +/// Note that if updates of a single key exceed the target size limit, all of the updates will be batched +/// into a single file. This behavior might change in the future. For reference, the legacy compaction algorithm +/// will split them into multiple files based on size. #[must_use] pub struct SplitDeltaLayerWriter { inner: DeltaLayerWriter, target_layer_size: u64, - generated_layers: Vec, + generated_layers: Vec, conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, lsn_range: Range, + last_key_written: Key, + start_key: Key, } impl SplitDeltaLayerWriter { @@ -147,9 +239,74 @@ impl SplitDeltaLayerWriter { timeline_id, tenant_shard_id, lsn_range, + last_key_written: Key::MIN, + start_key, }) } + /// Put value into the layer writer. 
In the case the writer decides to produce a layer, and the discard fn returns true, no layer will be written in the end. + pub async fn put_value_with_discard_fn( + &mut self, + key: Key, + lsn: Lsn, + val: Value, + tline: &Arc, + ctx: &RequestContext, + discard: D, + ) -> anyhow::Result<()> + where + D: FnOnce(&PersistentLayerKey) -> F, + F: Future, + { + // The current estimation is key size plus LSN size plus value size estimation. This is not an accurate + // number, and therefore the final layer size could be a little bit larger or smaller than the target. + // + // Also, keep all updates of a single key in a single file. TODO: split them using the legacy compaction + // strategy. https://github.com/neondatabase/neon/issues/8837 + let addition_size_estimation = KEY_SIZE as u64 + 8 /* LSN u64 size */ + 80 /* value size estimation */; + if self.inner.num_keys() >= 1 + && self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size + { + if key != self.last_key_written { + let next_delta_writer = DeltaLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + key, + self.lsn_range.clone(), + ctx, + ) + .await?; + let prev_delta_writer = std::mem::replace(&mut self.inner, next_delta_writer); + let layer_key = PersistentLayerKey { + key_range: self.start_key..key, + lsn_range: self.lsn_range.clone(), + is_delta: true, + }; + self.start_key = key; + if discard(&layer_key).await { + drop(prev_delta_writer); + self.generated_layers + .push(SplitWriterResult::Discarded(layer_key)); + } else { + let (desc, path) = prev_delta_writer.finish(key, ctx).await?; + let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?; + self.generated_layers + .push(SplitWriterResult::Produced(delta_layer)); + } + } else if self.inner.estimated_size() >= S3_UPLOAD_LIMIT { + // We have to produce a very large file b/c a key is updated too often. + anyhow::bail!( + "a single key is updated too often: key={}, estimated_size={}, and the layer file cannot be produced", + key, + self.inner.estimated_size() + ); + } + } + self.last_key_written = key; + self.inner.put_value(key, lsn, val, ctx).await + } + pub async fn put_value( &mut self, key: Key, @@ -158,56 +315,64 @@ impl SplitDeltaLayerWriter { tline: &Arc, ctx: &RequestContext, ) -> anyhow::Result<()> { - // The current estimation is key size plus LSN size plus value size estimation. This is not an accurate - // number, and therefore the final layer size could be a little bit larger or smaller than the target. 
- let addition_size_estimation = KEY_SIZE as u64 + 8 /* LSN u64 size */ + 80 /* value size estimation */; - if self.inner.num_keys() >= 1 - && self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size - { - let next_delta_writer = DeltaLayerWriter::new( - self.conf, - self.timeline_id, - self.tenant_shard_id, - key, - self.lsn_range.clone(), - ctx, - ) - .await?; - let prev_delta_writer = std::mem::replace(&mut self.inner, next_delta_writer); - let (desc, path) = prev_delta_writer.finish(key, ctx).await?; - let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?; - self.generated_layers.push(delta_layer); - } - self.inner.put_value(key, lsn, val, ctx).await + self.put_value_with_discard_fn(key, lsn, val, tline, ctx, |_| async { false }) + .await } - pub(crate) async fn finish( + pub(crate) async fn finish_with_discard_fn( self, tline: &Arc, ctx: &RequestContext, end_key: Key, - ) -> anyhow::Result> { + discard: D, + ) -> anyhow::Result> + where + D: FnOnce(&PersistentLayerKey) -> F, + F: Future, + { let Self { mut generated_layers, inner, .. } = self; - - let (desc, path) = inner.finish(end_key, ctx).await?; - let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?; - generated_layers.push(delta_layer); + if inner.num_keys() == 0 { + return Ok(generated_layers); + } + let layer_key = PersistentLayerKey { + key_range: self.start_key..end_key, + lsn_range: self.lsn_range.clone(), + is_delta: true, + }; + if discard(&layer_key).await { + generated_layers.push(SplitWriterResult::Discarded(layer_key)); + } else { + let (desc, path) = inner.finish(end_key, ctx).await?; + let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?; + generated_layers.push(SplitWriterResult::Produced(delta_layer)); + } Ok(generated_layers) } - /// When split writer fails, the caller should call this function and handle partially generated layers. #[allow(dead_code)] - pub(crate) async fn take(self) -> anyhow::Result<(Vec, DeltaLayerWriter)> { + pub(crate) async fn finish( + self, + tline: &Arc, + ctx: &RequestContext, + end_key: Key, + ) -> anyhow::Result> { + self.finish_with_discard_fn(tline, ctx, end_key, |_| async { false }) + .await + } + + /// When split writer fails, the caller should call this function and handle partially generated layers. 
+ pub(crate) fn take(self) -> anyhow::Result<(Vec, DeltaLayerWriter)> { Ok((self.generated_layers, self.inner)) } } #[cfg(test)] mod tests { + use itertools::Itertools; use rand::{RngCore, SeedableRng}; use crate::{ @@ -302,9 +467,16 @@ mod tests { #[tokio::test] async fn write_split() { - let harness = TenantHarness::create("split_writer_write_split") - .await - .unwrap(); + write_split_helper("split_writer_write_split", false).await; + } + + #[tokio::test] + async fn write_split_discard() { + write_split_helper("split_writer_write_split_discard", false).await; + } + + async fn write_split_helper(harness_name: &'static str, discard: bool) { + let harness = TenantHarness::create(harness_name).await.unwrap(); let (tenant, ctx) = harness.load().await; let tline = tenant @@ -338,16 +510,19 @@ mod tests { for i in 0..N { let i = i as u32; image_writer - .put_image(get_key(i), get_large_img(), &tline, &ctx) + .put_image_with_discard_fn(get_key(i), get_large_img(), &tline, &ctx, |_| async { + discard + }) .await .unwrap(); delta_writer - .put_value( + .put_value_with_discard_fn( get_key(i), Lsn(0x20), Value::Image(get_large_img()), &tline, &ctx, + |_| async { discard }, ) .await .unwrap(); @@ -360,22 +535,39 @@ mod tests { .finish(&tline, &ctx, get_key(N as u32)) .await .unwrap(); - assert_eq!(image_layers.len(), N / 512 + 1); - assert_eq!(delta_layers.len(), N / 512 + 1); - for idx in 0..image_layers.len() { - assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN); - assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX); - assert_ne!(delta_layers[idx].layer_desc().key_range.start, Key::MIN); - assert_ne!(delta_layers[idx].layer_desc().key_range.end, Key::MAX); - if idx > 0 { - assert_eq!( - image_layers[idx - 1].layer_desc().key_range.end, - image_layers[idx].layer_desc().key_range.start - ); - assert_eq!( - delta_layers[idx - 1].layer_desc().key_range.end, - delta_layers[idx].layer_desc().key_range.start - ); + if discard { + for layer in image_layers { + layer.into_discarded_layer(); + } + for layer in delta_layers { + layer.into_discarded_layer(); + } + } else { + let image_layers = image_layers + .into_iter() + .map(|x| x.into_resident_layer()) + .collect_vec(); + let delta_layers = delta_layers + .into_iter() + .map(|x| x.into_resident_layer()) + .collect_vec(); + assert_eq!(image_layers.len(), N / 512 + 1); + assert_eq!(delta_layers.len(), N / 512 + 1); + for idx in 0..image_layers.len() { + assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN); + assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX); + assert_ne!(delta_layers[idx].layer_desc().key_range.start, Key::MIN); + assert_ne!(delta_layers[idx].layer_desc().key_range.end, Key::MAX); + if idx > 0 { + assert_eq!( + image_layers[idx - 1].layer_desc().key_range.end, + image_layers[idx].layer_desc().key_range.start + ); + assert_eq!( + delta_layers[idx - 1].layer_desc().key_range.end, + delta_layers[idx].layer_desc().key_range.start + ); + } } } } @@ -456,4 +648,49 @@ mod tests { .unwrap(); assert_eq!(layers.len(), 2); } + + #[tokio::test] + async fn write_split_single_key() { + let harness = TenantHarness::create("split_writer_write_split_single_key") + .await + .unwrap(); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + const N: usize = 2000; + let mut delta_writer = SplitDeltaLayerWriter::new( + tenant.conf, + tline.timeline_id, + tenant.tenant_shard_id, + 
get_key(0), + Lsn(0x10)..Lsn(N as u64 * 16 + 0x10), + 4 * 1024 * 1024, + &ctx, + ) + .await + .unwrap(); + + for i in 0..N { + let i = i as u32; + delta_writer + .put_value( + get_key(0), + Lsn(i as u64 * 16 + 0x10), + Value::Image(get_large_img()), + &tline, + &ctx, + ) + .await + .unwrap(); + } + let delta_layers = delta_writer + .finish(&tline, &ctx, get_key(N as u32)) + .await + .unwrap(); + assert_eq!(delta_layers.len(), 1); + } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b33e436fce..098c196ee8 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -5444,12 +5444,17 @@ impl Timeline { !(a.end <= b.start || b.end <= a.start) } - let guard = self.layers.read().await; - for layer in guard.layer_map()?.iter_historic_layers() { - if layer.is_delta() - && overlaps_with(&layer.lsn_range, &deltas.lsn_range) - && layer.lsn_range != deltas.lsn_range - { + if deltas.key_range.start.next() != deltas.key_range.end { + let guard = self.layers.read().await; + let mut invalid_layers = + guard.layer_map()?.iter_historic_layers().filter(|layer| { + layer.is_delta() + && overlaps_with(&layer.lsn_range, &deltas.lsn_range) + && layer.lsn_range != deltas.lsn_range + // skip single-key layer files + && layer.key_range.start.next() != layer.key_range.end + }); + if let Some(layer) = invalid_layers.next() { // If a delta layer overlaps with another delta layer AND their LSN range is not the same, panic panic!( "inserted layer violates delta layer LSN invariant: current_lsn_range={}..{}, conflict_lsn_range={}..{}", diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 7370ec1386..aad75ac59c 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -14,7 +14,7 @@ use super::{ RecordedDuration, Timeline, }; -use anyhow::{anyhow, Context}; +use anyhow::{anyhow, bail, Context}; use bytes::Bytes; use enumset::EnumSet; use fail::fail_point; @@ -32,6 +32,9 @@ use crate::page_cache; use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD}; use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::storage_layer::merge_iterator::MergeIterator; +use crate::tenant::storage_layer::split_writer::{ + SplitDeltaLayerWriter, SplitImageLayerWriter, SplitWriterResult, +}; use crate::tenant::storage_layer::{ AsLayerDesc, PersistentLayerDesc, PersistentLayerKey, ValueReconstructState, }; @@ -71,15 +74,60 @@ pub(crate) struct KeyHistoryRetention { } impl KeyHistoryRetention { + /// Hack: skip delta layer if we need to produce a layer of a same key-lsn. + /// + /// This can happen if we have removed some deltas in "the middle" of some existing layer's key-lsn-range. + /// For example, consider the case where a single delta with range [0x10,0x50) exists. + /// And we have branches at LSN 0x10, 0x20, 0x30. + /// Then we delete branch @ 0x20. + /// Bottom-most compaction may now delete the delta [0x20,0x30). + /// And that wouldnt' change the shape of the layer. + /// + /// Note that bottom-most-gc-compaction never _adds_ new data in that case, only removes. + /// + /// `discard_key` will only be called when the writer reaches its target (instead of for every key), so it's fine to grab a lock inside. 
+ async fn discard_key(key: &PersistentLayerKey, tline: &Arc, dry_run: bool) -> bool { + if dry_run { + return true; + } + let guard = tline.layers.read().await; + if !guard.contains_key(key) { + return false; + } + let layer_generation = guard.get_from_key(key).metadata().generation; + drop(guard); + if layer_generation == tline.generation { + info!( + key=%key, + ?layer_generation, + "discard layer due to duplicated layer key in the same generation", + ); + true + } else { + false + } + } + + /// Pipe a history of a single key to the writers. + /// + /// If `image_writer` is none, the images will be placed into the delta layers. + /// The delta writer will contain all images and deltas (below and above the horizon) except the bottom-most images. + #[allow(clippy::too_many_arguments)] async fn pipe_to( self, key: Key, - delta_writer: &mut Vec<(Key, Lsn, Value)>, - mut image_writer: Option<&mut ImageLayerWriter>, + tline: &Arc, + delta_writer: &mut SplitDeltaLayerWriter, + mut image_writer: Option<&mut SplitImageLayerWriter>, stat: &mut CompactionStatistics, + dry_run: bool, ctx: &RequestContext, ) -> anyhow::Result<()> { let mut first_batch = true; + let discard = |key: &PersistentLayerKey| { + let key = key.clone(); + async move { Self::discard_key(&key, tline, dry_run).await } + }; for (cutoff_lsn, KeyLogAtLsn(logs)) in self.below_horizon { if first_batch { if logs.len() == 1 && logs[0].1.is_image() { @@ -88,28 +136,45 @@ impl KeyHistoryRetention { }; stat.produce_image_key(img); if let Some(image_writer) = image_writer.as_mut() { - image_writer.put_image(key, img.clone(), ctx).await?; + image_writer + .put_image_with_discard_fn(key, img.clone(), tline, ctx, discard) + .await?; } else { - delta_writer.push((key, cutoff_lsn, Value::Image(img.clone()))); + delta_writer + .put_value_with_discard_fn( + key, + cutoff_lsn, + Value::Image(img.clone()), + tline, + ctx, + discard, + ) + .await?; } } else { for (lsn, val) in logs { stat.produce_key(&val); - delta_writer.push((key, lsn, val)); + delta_writer + .put_value_with_discard_fn(key, lsn, val, tline, ctx, discard) + .await?; } } first_batch = false; } else { for (lsn, val) in logs { stat.produce_key(&val); - delta_writer.push((key, lsn, val)); + delta_writer + .put_value_with_discard_fn(key, lsn, val, tline, ctx, discard) + .await?; } } } let KeyLogAtLsn(above_horizon_logs) = self.above_horizon; for (lsn, val) in above_horizon_logs { stat.produce_key(&val); - delta_writer.push((key, lsn, val)); + delta_writer + .put_value_with_discard_fn(key, lsn, val, tline, ctx, discard) + .await?; } Ok(()) } @@ -1814,11 +1879,27 @@ impl Timeline { } let mut selected_layers = Vec::new(); drop(gc_info); + // Pick all the layers intersect or below the gc_cutoff, get the largest LSN in the selected layers. + let Some(max_layer_lsn) = layers + .iter_historic_layers() + .filter(|desc| desc.get_lsn_range().start <= gc_cutoff) + .map(|desc| desc.get_lsn_range().end) + .max() + else { + info!("no layers to compact with gc"); + return Ok(()); + }; + // Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key + // layers to compact. 
for desc in layers.iter_historic_layers() { - if desc.get_lsn_range().start <= gc_cutoff { + if desc.get_lsn_range().end <= max_layer_lsn { selected_layers.push(guard.get_from_desc(&desc)); } } + if selected_layers.is_empty() { + info!("no layers to compact with gc"); + return Ok(()); + } retain_lsns_below_horizon.sort(); (selected_layers, gc_cutoff, retain_lsns_below_horizon) }; @@ -1848,27 +1929,53 @@ impl Timeline { lowest_retain_lsn ); // Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs. - // Also, collect the layer information to decide when to split the new delta layers. - let mut downloaded_layers = Vec::new(); - let mut delta_split_points = BTreeSet::new(); + // Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point. + let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?) for layer in &layer_selection { - let resident_layer = layer.download_and_keep_resident().await?; - downloaded_layers.push(resident_layer); - let desc = layer.layer_desc(); if desc.is_delta() { - // TODO: is it correct to only record split points for deltas intersecting with the GC horizon? (exclude those below/above the horizon) - // so that we can avoid having too many small delta layers. - let key_range = desc.get_key_range(); - delta_split_points.insert(key_range.start); - delta_split_points.insert(key_range.end); + // ignore single-key layer files + if desc.key_range.start.next() != desc.key_range.end { + let lsn_range = &desc.lsn_range; + lsn_split_point.insert(lsn_range.start); + lsn_split_point.insert(lsn_range.end); + } stat.visit_delta_layer(desc.file_size()); } else { stat.visit_image_layer(desc.file_size()); } } + for layer in &layer_selection { + let desc = layer.layer_desc(); + let key_range = &desc.key_range; + if desc.is_delta() && key_range.start.next() != key_range.end { + let lsn_range = desc.lsn_range.clone(); + let intersects = lsn_split_point.range(lsn_range).collect_vec(); + if intersects.len() > 1 { + bail!( + "cannot run gc-compaction because it violates the layer map LSN split assumption: layer {} intersects with LSN [{}]", + desc.key(), + intersects.into_iter().map(|lsn| lsn.to_string()).join(", ") + ); + } + } + } + // The maximum LSN we are processing in this compaction loop + let end_lsn = layer_selection + .iter() + .map(|l| l.layer_desc().lsn_range.end) + .max() + .unwrap(); + // We don't want any of the produced layers to cover the full key range (i.e., MIN..MAX) b/c it will then be recognized + // as an L0 layer. 
+ let hack_end_key = Key::NON_L0_MAX; let mut delta_layers = Vec::new(); let mut image_layers = Vec::new(); + let mut downloaded_layers = Vec::new(); + for layer in &layer_selection { + let resident_layer = layer.download_and_keep_resident().await?; + downloaded_layers.push(resident_layer); + } for resident_layer in &downloaded_layers { if resident_layer.layer_desc().is_delta() { let layer = resident_layer.get_as_delta(ctx).await?; @@ -1884,138 +1991,17 @@ impl Timeline { let mut accumulated_values = Vec::new(); let mut last_key: Option = None; - enum FlushDeltaResult { - /// Create a new resident layer - CreateResidentLayer(ResidentLayer), - /// Keep an original delta layer - KeepLayer(PersistentLayerKey), - } - - #[allow(clippy::too_many_arguments)] - async fn flush_deltas( - deltas: &mut Vec<(Key, Lsn, crate::repository::Value)>, - last_key: Key, - delta_split_points: &[Key], - current_delta_split_point: &mut usize, - tline: &Arc, - lowest_retain_lsn: Lsn, - ctx: &RequestContext, - stats: &mut CompactionStatistics, - dry_run: bool, - last_batch: bool, - ) -> anyhow::Result> { - // Check if we need to split the delta layer. We split at the original delta layer boundary to avoid - // overlapping layers. - // - // If we have a structure like this: - // - // | Delta 1 | | Delta 4 | - // |---------| Delta 2 |---------| - // | Delta 3 | | Delta 5 | - // - // And we choose to compact delta 2+3+5. We will get an overlapping delta layer with delta 1+4. - // A simple solution here is to split the delta layers using the original boundary, while this - // might produce a lot of small layers. This should be improved and fixed in the future. - let mut need_split = false; - while *current_delta_split_point < delta_split_points.len() - && last_key >= delta_split_points[*current_delta_split_point] - { - *current_delta_split_point += 1; - need_split = true; - } - if !need_split && !last_batch { - return Ok(None); - } - let deltas: Vec<(Key, Lsn, Value)> = std::mem::take(deltas); - if deltas.is_empty() { - return Ok(None); - } - let end_lsn = deltas.iter().map(|(_, lsn, _)| lsn).max().copied().unwrap() + 1; - let delta_key = PersistentLayerKey { - key_range: { - let key_start = deltas.first().unwrap().0; - let key_end = deltas.last().unwrap().0.next(); - key_start..key_end - }, - lsn_range: lowest_retain_lsn..end_lsn, - is_delta: true, - }; - { - // Hack: skip delta layer if we need to produce a layer of a same key-lsn. - // - // This can happen if we have removed some deltas in "the middle" of some existing layer's key-lsn-range. - // For example, consider the case where a single delta with range [0x10,0x50) exists. - // And we have branches at LSN 0x10, 0x20, 0x30. - // Then we delete branch @ 0x20. - // Bottom-most compaction may now delete the delta [0x20,0x30). - // And that wouldnt' change the shape of the layer. - // - // Note that bottom-most-gc-compaction never _adds_ new data in that case, only removes. - // That's why it's safe to skip. - let guard = tline.layers.read().await; - - if guard.contains_key(&delta_key) { - let layer_generation = guard.get_from_key(&delta_key).metadata().generation; - drop(guard); - if layer_generation == tline.generation { - stats.discard_delta_layer(); - // TODO: depending on whether we design this compaction process to run along with - // other compactions, there could be layer map modifications after we drop the - // layer guard, and in case it creates duplicated layer key, we will still error - // in the end. 
- info!( - key=%delta_key, - ?layer_generation, - "discard delta layer due to duplicated layer in the same generation" - ); - return Ok(Some(FlushDeltaResult::KeepLayer(delta_key))); - } - } - } - - let mut delta_layer_writer = DeltaLayerWriter::new( - tline.conf, - tline.timeline_id, - tline.tenant_shard_id, - delta_key.key_range.start, - lowest_retain_lsn..end_lsn, - ctx, - ) - .await?; - for (key, lsn, val) in deltas { - delta_layer_writer.put_value(key, lsn, val, ctx).await?; - } - - stats.produce_delta_layer(delta_layer_writer.size()); - if dry_run { - return Ok(None); - } - - let (desc, path) = delta_layer_writer - .finish(delta_key.key_range.end, ctx) - .await?; - let delta_layer = Layer::finish_creating(tline.conf, tline, desc, &path)?; - Ok(Some(FlushDeltaResult::CreateResidentLayer(delta_layer))) - } - - // Hack the key range to be min..(max-1). Otherwise, the image layer will be - // interpreted as an L0 delta layer. - let hack_image_layer_range = { - let mut end_key = Key::MAX; - end_key.field6 -= 1; - Key::MIN..end_key - }; - // Only create image layers when there is no ancestor branches. TODO: create covering image layer // when some condition meet. let mut image_layer_writer = if self.ancestor_timeline.is_none() { Some( - ImageLayerWriter::new( + SplitImageLayerWriter::new( self.conf, self.timeline_id, self.tenant_shard_id, - &hack_image_layer_range, // covers the full key range + Key::MIN, lowest_retain_lsn, + self.get_compaction_target_size(), ctx, ) .await?, @@ -2024,6 +2010,17 @@ impl Timeline { None }; + let mut delta_layer_writer = SplitDeltaLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + Key::MIN, + lowest_retain_lsn..end_lsn, + self.get_compaction_target_size(), + ctx, + ) + .await?; + /// Returns None if there is no ancestor branch. Throw an error when the key is not found. /// /// Currently, we always get the ancestor image for each key in the child branch no matter whether the image @@ -2044,47 +2041,11 @@ impl Timeline { let img = tline.get(key, tline.ancestor_lsn, ctx).await?; Ok(Some((key, tline.ancestor_lsn, img))) } - let image_layer_key = PersistentLayerKey { - key_range: hack_image_layer_range, - lsn_range: PersistentLayerDesc::image_layer_lsn_range(lowest_retain_lsn), - is_delta: false, - }; - - // Like with delta layers, it can happen that we re-produce an already existing image layer. - // This could happen when a user triggers force compaction and image generation. In this case, - // it's always safe to rewrite the layer. - let discard_image_layer = { - let guard = self.layers.read().await; - if guard.contains_key(&image_layer_key) { - let layer_generation = guard.get_from_key(&image_layer_key).metadata().generation; - drop(guard); - if layer_generation == self.generation { - // TODO: depending on whether we design this compaction process to run along with - // other compactions, there could be layer map modifications after we drop the - // layer guard, and in case it creates duplicated layer key, we will still error - // in the end. - info!( - key=%image_layer_key, - ?layer_generation, - "discard image layer due to duplicated layer key in the same generation", - ); - true - } else { - false - } - } else { - false - } - }; // Actually, we can decide not to write to the image layer at all at this point because // the key and LSN range are determined. However, to keep things simple here, we still // create this writer, and discard the writer in the end. 
- let mut delta_values = Vec::new(); - let delta_split_points = delta_split_points.into_iter().collect_vec(); - let mut current_delta_split_point = 0; - let mut delta_layers = Vec::new(); while let Some((key, lsn, val)) = merge_iter.next().await? { if cancel.is_cancelled() { return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass cancel error @@ -2115,27 +2076,14 @@ impl Timeline { retention .pipe_to( *last_key, - &mut delta_values, + self, + &mut delta_layer_writer, image_layer_writer.as_mut(), &mut stat, + dry_run, ctx, ) .await?; - delta_layers.extend( - flush_deltas( - &mut delta_values, - *last_key, - &delta_split_points, - &mut current_delta_split_point, - self, - lowest_retain_lsn, - ctx, - &mut stat, - dry_run, - false, - ) - .await?, - ); accumulated_values.clear(); *last_key = key; accumulated_values.push((key, lsn, val)); @@ -2159,43 +2107,75 @@ impl Timeline { retention .pipe_to( last_key, - &mut delta_values, + self, + &mut delta_layer_writer, image_layer_writer.as_mut(), &mut stat, + dry_run, ctx, ) .await?; - delta_layers.extend( - flush_deltas( - &mut delta_values, - last_key, - &delta_split_points, - &mut current_delta_split_point, - self, - lowest_retain_lsn, - ctx, - &mut stat, - dry_run, - true, - ) - .await?, - ); - assert!(delta_values.is_empty(), "unprocessed keys"); - let image_layer = if discard_image_layer { - stat.discard_image_layer(); - None - } else if let Some(writer) = image_layer_writer { - stat.produce_image_layer(writer.size()); + let discard = |key: &PersistentLayerKey| { + let key = key.clone(); + async move { KeyHistoryRetention::discard_key(&key, self, dry_run).await } + }; + + let produced_image_layers = if let Some(writer) = image_layer_writer { if !dry_run { - Some(writer.finish(self, ctx).await?) + writer + .finish_with_discard_fn(self, ctx, hack_end_key, discard) + .await? } else { - None + let (layers, _) = writer.take()?; + assert!(layers.is_empty(), "image layers produced in dry run mode?"); + Vec::new() } } else { - None + Vec::new() }; + let produced_delta_layers = if !dry_run { + delta_layer_writer + .finish_with_discard_fn(self, ctx, hack_end_key, discard) + .await? + } else { + let (layers, _) = delta_layer_writer.take()?; + assert!(layers.is_empty(), "delta layers produced in dry run mode?"); + Vec::new() + }; + + let mut compact_to = Vec::new(); + let mut keep_layers = HashSet::new(); + let produced_delta_layers_len = produced_delta_layers.len(); + let produced_image_layers_len = produced_image_layers.len(); + for action in produced_delta_layers { + match action { + SplitWriterResult::Produced(layer) => { + stat.produce_delta_layer(layer.layer_desc().file_size()); + compact_to.push(layer); + } + SplitWriterResult::Discarded(l) => { + keep_layers.insert(l); + stat.discard_delta_layer(); + } + } + } + for action in produced_image_layers { + match action { + SplitWriterResult::Produced(layer) => { + stat.produce_image_layer(layer.layer_desc().file_size()); + compact_to.push(layer); + } + SplitWriterResult::Discarded(l) => { + keep_layers.insert(l); + stat.discard_image_layer(); + } + } + } + let mut layer_selection = layer_selection; + layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key())); + info!( "gc-compaction statistics: {}", serde_json::to_string(&stat)? 
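The discard-callback plumbing above is easier to follow outside of the diff. The following self-contained Rust sketch is not the pageserver code — `SplitWriter`, `LayerKey`, and `WriterResult` are illustrative stand-ins, and it assumes the tokio crate for the async runtime — but it models the same control flow: the writer decides when a layer boundary has been reached, asks the caller-supplied async `discard` callback whether that layer can be skipped, and records either a `Produced` or a `Discarded` result for the caller to later sort into `compact_to` and `keep_layers`.

// Standalone sketch of the discard-callback pattern (illustrative names only).
use std::future::Future;

#[derive(Debug)]
struct LayerKey {
    key_range: std::ops::Range<u32>,
}

#[derive(Debug)]
enum WriterResult {
    /// A layer file was actually written out.
    Produced(String),
    /// The caller said an identical layer already exists, so nothing was written.
    Discarded(LayerKey),
}

struct SplitWriter {
    start_key: u32,
    keys_in_current_layer: usize,
    target_keys_per_layer: usize,
    results: Vec<WriterResult>,
}

impl SplitWriter {
    fn new(start_key: u32, target_keys_per_layer: usize) -> Self {
        Self {
            start_key,
            keys_in_current_layer: 0,
            target_keys_per_layer,
            results: Vec::new(),
        }
    }

    /// Buffer one key. When the current layer reaches the target size, close it
    /// and ask the caller-supplied `discard` future whether writing it out can
    /// be skipped.
    async fn put_with_discard_fn<D, F>(&mut self, key: u32, discard: D)
    where
        D: FnOnce(&LayerKey) -> F,
        F: Future<Output = bool>,
    {
        if self.keys_in_current_layer >= self.target_keys_per_layer {
            let layer_key = LayerKey {
                key_range: self.start_key..key,
            };
            self.start_key = key;
            self.keys_in_current_layer = 0;
            let result = if discard(&layer_key).await {
                WriterResult::Discarded(layer_key)
            } else {
                WriterResult::Produced(format!("layer {:?}", layer_key.key_range))
            };
            self.results.push(result);
        }
        self.keys_in_current_layer += 1;
    }
}

#[tokio::main]
async fn main() {
    let mut writer = SplitWriter::new(0, 2);
    for key in 1..=6u32 {
        // Pretend layers starting at an even key already exist and can be discarded.
        writer
            .put_with_discard_fn(key, |k| {
                let already_exists = k.key_range.start % 2 == 0;
                async move { already_exists }
            })
            .await;
    }
    // Produced layers would go into the new layer map; discarded keys are kept as-is.
    for result in &writer.results {
        println!("{result:?}");
    }
}

Note the bounds `D: FnOnce(&LayerKey) -> F, F: Future<Output = bool>`, mirroring the patch: because `F` is an independent type parameter, the returned future cannot borrow the key reference, which is why the compaction code clones the `PersistentLayerKey` inside its closure before moving it into the `async move` block.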
@@ -2206,28 +2186,11 @@ impl Timeline { } info!( - "produced {} delta layers and {} image layers", - delta_layers.len(), - if image_layer.is_some() { 1 } else { 0 } + "produced {} delta layers and {} image layers, {} layers are kept", + produced_delta_layers_len, + produced_image_layers_len, + layer_selection.len() ); - let mut compact_to = Vec::new(); - let mut keep_layers = HashSet::new(); - for action in delta_layers { - match action { - FlushDeltaResult::CreateResidentLayer(layer) => { - compact_to.push(layer); - } - FlushDeltaResult::KeepLayer(l) => { - keep_layers.insert(l); - } - } - } - if discard_image_layer { - keep_layers.insert(image_layer_key); - } - let mut layer_selection = layer_selection; - layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key())); - compact_to.extend(image_layer); // Step 3: Place back to the layer map. { From bf03713fa1d0d02a931d00d4625c6f0a2bb85645 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Mon, 26 Aug 2024 23:17:07 +0200 Subject: [PATCH 1503/1571] fix(sql-exporter): Fix typo in gauge In f4b3c317f there was a typo and I missed that on review --- vm-image-spec.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 0bacb63509..55a7cc5a9f 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -326,7 +326,7 @@ files: SELECT checkpoints_timed FROM pg_stat_bgwriter; - metric_name: compute_logical_snapshot_files - type: guage + type: gauge help: 'Number of snapshot files in pg_logical/snapshot' key_labels: - tenant_id From 7820c572e73160b0dfa4628edb1723c8527d6c7e Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Mon, 26 Aug 2024 23:25:57 +0200 Subject: [PATCH 1504/1571] fix(sql-exporter): Remove tenant_id from compute_logical_snapshot_files It appeared to be that it's already auto-added to all metrics [1] [1]: https://github.com/neondatabase/flux-fleet/blob/3a907c317c1e4515190b04a8726e00a9f976214a/apps/base/ext-vmagent/vmagent.yaml#L43 --- vm-image-spec.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 55a7cc5a9f..c94f95f447 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -329,12 +329,10 @@ files: type: gauge help: 'Number of snapshot files in pg_logical/snapshot' key_labels: - - tenant_id - timeline_id values: [num_logical_snapshot_files] query: | SELECT - (SELECT setting FROM pg_settings WHERE name = 'neon.tenant_id') AS tenant_id, (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id, -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. These -- temporary snapshot files are renamed to the actual snapshot files after they are From 09362b63635d46cecef77cb04bd1e406dabd2026 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 26 Aug 2024 16:12:51 +0300 Subject: [PATCH 1505/1571] safekeeper: reorder routes and their handlers. Routes and their handlers were in a bit different order in 1) routes list 2) their implementation 3) python client 4) openapi spec, making addition of new ones intimidating. Make it the same everywhere, roughly lexicographically but preserving some of existing logic. No functional changes. 
--- safekeeper/src/http/openapi_spec.yaml | 70 ++++---- safekeeper/src/http/routes.rs | 216 ++++++++++++------------ test_runner/fixtures/safekeeper/http.py | 148 ++++++++-------- 3 files changed, 216 insertions(+), 218 deletions(-) diff --git a/safekeeper/src/http/openapi_spec.yaml b/safekeeper/src/http/openapi_spec.yaml index a617e0310c..70999853c2 100644 --- a/safekeeper/src/http/openapi_spec.yaml +++ b/safekeeper/src/http/openapi_spec.yaml @@ -86,42 +86,6 @@ paths: default: $ref: "#/components/responses/GenericError" - /v1/tenant/{tenant_id}/timeline/{source_timeline_id}/copy: - parameters: - - name: tenant_id - in: path - required: true - schema: - type: string - format: hex - - name: source_timeline_id - in: path - required: true - schema: - type: string - format: hex - - post: - tags: - - "Timeline" - summary: Register new timeline as copy of existing timeline - description: "" - operationId: v1CopyTenantTimeline - requestBody: - content: - application/json: - schema: - $ref: "#/components/schemas/TimelineCopyRequest" - responses: - "201": - description: Timeline created - # TODO: return timeline info? - "403": - $ref: "#/components/responses/ForbiddenError" - default: - $ref: "#/components/responses/GenericError" - - /v1/tenant/{tenant_id}/timeline/{timeline_id}: parameters: - name: tenant_id @@ -179,6 +143,40 @@ paths: default: $ref: "#/components/responses/GenericError" + /v1/tenant/{tenant_id}/timeline/{source_timeline_id}/copy: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + - name: source_timeline_id + in: path + required: true + schema: + type: string + format: hex + + post: + tags: + - "Timeline" + summary: Register new timeline as copy of existing timeline + description: "" + operationId: v1CopyTenantTimeline + requestBody: + content: + application/json: + schema: + $ref: "#/components/schemas/TimelineCopyRequest" + responses: + "201": + description: Timeline created + # TODO: return timeline info? + "403": + $ref: "#/components/responses/ForbiddenError" + default: + $ref: "#/components/responses/GenericError" /v1/record_safekeeper_info/{tenant_id}/{timeline_id}: parameters: diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index d11815f6ef..91ffa95c21 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -114,7 +114,55 @@ fn check_permission(request: &Request, tenant_id: Option) -> Res }) } +/// Deactivates all timelines for the tenant and removes its data directory. +/// See `timeline_delete_handler`. 
+async fn tenant_delete_handler(mut request: Request) -> Result, ApiError> { + let tenant_id = parse_request_param(&request, "tenant_id")?; + let only_local = parse_query_param(&request, "only_local")?.unwrap_or(false); + check_permission(&request, Some(tenant_id))?; + ensure_no_body(&mut request).await?; + // FIXME: `delete_force_all_for_tenant` can return an error for multiple different reasons; + // Using an `InternalServerError` should be fixed when the types support it + let delete_info = GlobalTimelines::delete_force_all_for_tenant(&tenant_id, only_local) + .await + .map_err(ApiError::InternalServerError)?; + json_response( + StatusCode::OK, + delete_info + .iter() + .map(|(ttid, resp)| (format!("{}", ttid.timeline_id), *resp)) + .collect::>(), + ) +} + +async fn timeline_create_handler(mut request: Request) -> Result, ApiError> { + let request_data: TimelineCreateRequest = json_request(&mut request).await?; + + let ttid = TenantTimelineId { + tenant_id: request_data.tenant_id, + timeline_id: request_data.timeline_id, + }; + check_permission(&request, Some(ttid.tenant_id))?; + + let server_info = ServerInfo { + pg_version: request_data.pg_version, + system_id: request_data.system_id.unwrap_or(0), + wal_seg_size: request_data.wal_seg_size.unwrap_or(WAL_SEGMENT_SIZE as u32), + }; + let local_start_lsn = request_data.local_start_lsn.unwrap_or_else(|| { + request_data + .commit_lsn + .segment_lsn(server_info.wal_seg_size as usize) + }); + GlobalTimelines::create(ttid, server_info, request_data.commit_lsn, local_start_lsn) + .await + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, ()) +} + /// List all (not deleted) timelines. +/// Note: it is possible to do the same with debug_dump. async fn timeline_list_handler(request: Request) -> Result, ApiError> { check_permission(&request, None)?; let res: Vec = GlobalTimelines::get_all() @@ -174,30 +222,21 @@ async fn timeline_status_handler(request: Request) -> Result) -> Result, ApiError> { - let request_data: TimelineCreateRequest = json_request(&mut request).await?; - - let ttid = TenantTimelineId { - tenant_id: request_data.tenant_id, - timeline_id: request_data.timeline_id, - }; +/// Deactivates the timeline and removes its data directory. +async fn timeline_delete_handler(mut request: Request) -> Result, ApiError> { + let ttid = TenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "timeline_id")?, + ); + let only_local = parse_query_param(&request, "only_local")?.unwrap_or(false); check_permission(&request, Some(ttid.tenant_id))?; - - let server_info = ServerInfo { - pg_version: request_data.pg_version, - system_id: request_data.system_id.unwrap_or(0), - wal_seg_size: request_data.wal_seg_size.unwrap_or(WAL_SEGMENT_SIZE as u32), - }; - let local_start_lsn = request_data.local_start_lsn.unwrap_or_else(|| { - request_data - .commit_lsn - .segment_lsn(server_info.wal_seg_size as usize) - }); - GlobalTimelines::create(ttid, server_info, request_data.commit_lsn, local_start_lsn) + ensure_no_body(&mut request).await?; + // FIXME: `delete_force` can fail from both internal errors and bad requests. Add better + // error handling here when we're able to. + let resp = GlobalTimelines::delete(&ttid, only_local) .await .map_err(ApiError::InternalServerError)?; - - json_response(StatusCode::OK, ()) + json_response(StatusCode::OK, resp) } /// Pull timeline from peer safekeeper instances. 
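For reference, this is roughly how the deletion endpoints handled above are driven from a client. The snippet below is hypothetical and not part of this patch or of the test fixtures: it assumes the reqwest (with the json feature), serde_json, and tokio crates, and the base URL, tenant ID, and timeline ID are placeholders. The path shape and the optional `only_local` query parameter come from the routes and handlers shown here.

// Hypothetical client-side call for the timeline deletion endpoint shown above.
use serde_json::Value;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Placeholders; in the test fixtures these come from the test environment.
    let base_url = "http://localhost:7676";
    let tenant_id = "0000000000000000000000000000000a";
    let timeline_id = "0000000000000000000000000000000b";

    let client = reqwest::Client::new();

    // DELETE /v1/tenant/:tenant_id/timeline/:timeline_id?only_local=true
    // `only_local` is optional; the handler defaults it to false.
    let resp: Value = client
        .delete(format!(
            "{base_url}/v1/tenant/{tenant_id}/timeline/{timeline_id}"
        ))
        .query(&[("only_local", "true")])
        .send()
        .await?
        .error_for_status()?
        .json()
        .await?;
    println!("timeline delete response: {resp}");

    Ok(())
}

The Python test fixture further down in this patch performs the equivalent calls through `requests`.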
@@ -279,6 +318,46 @@ async fn timeline_copy_handler(mut request: Request) -> Result, +) -> Result, ApiError> { + check_permission(&request, None)?; + + let ttid = TenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "timeline_id")?, + ); + + let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?; + + let patch_request: patch_control_file::Request = json_request(&mut request).await?; + let response = patch_control_file::handle_request(tli, patch_request) + .await + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, response) +} + +/// Force persist control file. +async fn timeline_checkpoint_handler(request: Request) -> Result, ApiError> { + check_permission(&request, None)?; + + let ttid = TenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "timeline_id")?, + ); + + let tli = GlobalTimelines::get(ttid)?; + tli.write_shared_state() + .await + .sk + .state_mut() + .flush() + .await + .map_err(ApiError::InternalServerError)?; + json_response(StatusCode::OK, ()) +} + async fn timeline_digest_handler(request: Request) -> Result, ApiError> { let ttid = TenantTimelineId::new( parse_request_param(&request, "tenant_id")?, @@ -310,64 +389,6 @@ async fn timeline_digest_handler(request: Request) -> Result) -> Result, ApiError> { - check_permission(&request, None)?; - - let ttid = TenantTimelineId::new( - parse_request_param(&request, "tenant_id")?, - parse_request_param(&request, "timeline_id")?, - ); - - let tli = GlobalTimelines::get(ttid)?; - tli.write_shared_state() - .await - .sk - .state_mut() - .flush() - .await - .map_err(ApiError::InternalServerError)?; - json_response(StatusCode::OK, ()) -} - -/// Deactivates the timeline and removes its data directory. -async fn timeline_delete_handler(mut request: Request) -> Result, ApiError> { - let ttid = TenantTimelineId::new( - parse_request_param(&request, "tenant_id")?, - parse_request_param(&request, "timeline_id")?, - ); - let only_local = parse_query_param(&request, "only_local")?.unwrap_or(false); - check_permission(&request, Some(ttid.tenant_id))?; - ensure_no_body(&mut request).await?; - // FIXME: `delete_force` can fail from both internal errors and bad requests. Add better - // error handling here when we're able to. - let resp = GlobalTimelines::delete(&ttid, only_local) - .await - .map_err(ApiError::InternalServerError)?; - json_response(StatusCode::OK, resp) -} - -/// Deactivates all timelines for the tenant and removes its data directory. -/// See `timeline_delete_handler`. -async fn tenant_delete_handler(mut request: Request) -> Result, ApiError> { - let tenant_id = parse_request_param(&request, "tenant_id")?; - let only_local = parse_query_param(&request, "only_local")?.unwrap_or(false); - check_permission(&request, Some(tenant_id))?; - ensure_no_body(&mut request).await?; - // FIXME: `delete_force_all_for_tenant` can return an error for multiple different reasons; - // Using an `InternalServerError` should be fixed when the types support it - let delete_info = GlobalTimelines::delete_force_all_for_tenant(&tenant_id, only_local) - .await - .map_err(ApiError::InternalServerError)?; - json_response( - StatusCode::OK, - delete_info - .iter() - .map(|(ttid, resp)| (format!("{}", ttid.timeline_id), *resp)) - .collect::>(), - ) -} - /// Used only in tests to hand craft required data. 
async fn record_safekeeper_info(mut request: Request) -> Result, ApiError> { let ttid = TenantTimelineId::new( @@ -509,26 +530,6 @@ async fn dump_debug_handler(mut request: Request) -> Result Ok(response) } -async fn patch_control_file_handler( - mut request: Request, -) -> Result, ApiError> { - check_permission(&request, None)?; - - let ttid = TenantTimelineId::new( - parse_request_param(&request, "tenant_id")?, - parse_request_param(&request, "timeline_id")?, - ); - - let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?; - - let patch_request: patch_control_file::Request = json_request(&mut request).await?; - let response = patch_control_file::handle_request(tli, patch_request) - .await - .map_err(ApiError::InternalServerError)?; - - json_response(StatusCode::OK, response) -} - /// Safekeeper http router. pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder { let mut router = endpoint::make_router(); @@ -568,6 +569,9 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder failpoints_handler(r, cancel).await }) }) + .delete("/v1/tenant/:tenant_id", |r| { + request_span(r, tenant_delete_handler) + }) // Will be used in the future instead of implicit timeline creation .post("/v1/tenant/timeline", |r| { request_span(r, timeline_create_handler) @@ -581,16 +585,13 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| { request_span(r, timeline_delete_handler) }) - .delete("/v1/tenant/:tenant_id", |r| { - request_span(r, tenant_delete_handler) + .post("/v1/pull_timeline", |r| { + request_span(r, timeline_pull_handler) }) .get( "/v1/tenant/:tenant_id/timeline/:timeline_id/snapshot/:destination_id", |r| request_span(r, timeline_snapshot_handler), ) - .post("/v1/pull_timeline", |r| { - request_span(r, timeline_pull_handler) - }) .post( "/v1/tenant/:tenant_id/timeline/:source_timeline_id/copy", |r| request_span(r, timeline_copy_handler), @@ -603,14 +604,13 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder "/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint", |r| request_span(r, timeline_checkpoint_handler), ) - // for tests + .get("/v1/tenant/:tenant_id/timeline/:timeline_id/digest", |r| { + request_span(r, timeline_digest_handler) + }) .post("/v1/record_safekeeper_info/:tenant_id/:timeline_id", |r| { request_span(r, record_safekeeper_info) }) .get("/v1/debug_dump", |r| request_span(r, dump_debug_handler)) - .get("/v1/tenant/:tenant_id/timeline/:timeline_id/digest", |r| { - request_span(r, timeline_digest_handler) - }) } #[cfg(test)] diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index dd3a0a3d54..05b43cfb72 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -65,6 +65,16 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): def check_status(self): self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() + def get_metrics_str(self) -> str: + """You probably want to use get_metrics() instead.""" + request_result = self.get(f"http://localhost:{self.port}/metrics") + request_result.raise_for_status() + return request_result.text + + def get_metrics(self) -> SafekeeperMetrics: + res = self.get_metrics_str() + return SafekeeperMetrics(parse_metrics(res)) + def is_testing_enabled_or_skip(self): if not self.is_testing_enabled: pytest.skip("safekeeper was built without 'testing' feature") @@ -89,56 +99,8 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): assert res_json is 
None return res_json - def debug_dump(self, params: Optional[Dict[str, str]] = None) -> Dict[str, Any]: - params = params or {} - res = self.get(f"http://localhost:{self.port}/v1/debug_dump", params=params) - res.raise_for_status() - res_json = json.loads(res.text) - assert isinstance(res_json, dict) - return res_json - - def patch_control_file( - self, - tenant_id: TenantId, - timeline_id: TimelineId, - patch: Dict[str, Any], - ) -> Dict[str, Any]: - res = self.patch( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/control_file", - json={ - "updates": patch, - "apply_fields": list(patch.keys()), - }, - ) - res.raise_for_status() - res_json = res.json() - assert isinstance(res_json, dict) - return res_json - - def pull_timeline(self, body: Dict[str, Any]) -> Dict[str, Any]: - res = self.post(f"http://localhost:{self.port}/v1/pull_timeline", json=body) - res.raise_for_status() - res_json = res.json() - assert isinstance(res_json, dict) - return res_json - - def copy_timeline(self, tenant_id: TenantId, timeline_id: TimelineId, body: Dict[str, Any]): - res = self.post( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/copy", - json=body, - ) - res.raise_for_status() - - def timeline_digest( - self, tenant_id: TenantId, timeline_id: TimelineId, from_lsn: Lsn, until_lsn: Lsn - ) -> Dict[str, Any]: - res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/digest", - params={ - "from_lsn": str(from_lsn), - "until_lsn": str(until_lsn), - }, - ) + def tenant_delete_force(self, tenant_id: TenantId) -> Dict[Any, Any]: + res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") res.raise_for_status() res_json = res.json() assert isinstance(res_json, dict) @@ -189,20 +151,6 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): def get_commit_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn: return self.timeline_status(tenant_id, timeline_id).commit_lsn - def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body): - res = self.post( - f"http://localhost:{self.port}/v1/record_safekeeper_info/{tenant_id}/{timeline_id}", - json=body, - ) - res.raise_for_status() - - def checkpoint(self, tenant_id: TenantId, timeline_id: TimelineId): - res = self.post( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint", - json={}, - ) - res.raise_for_status() - # only_local doesn't remove segments in the remote storage. 
def timeline_delete( self, tenant_id: TenantId, timeline_id: TimelineId, only_local: bool = False @@ -218,19 +166,71 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): assert isinstance(res_json, dict) return res_json - def tenant_delete_force(self, tenant_id: TenantId) -> Dict[Any, Any]: - res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") + def debug_dump(self, params: Optional[Dict[str, str]] = None) -> Dict[str, Any]: + params = params or {} + res = self.get(f"http://localhost:{self.port}/v1/debug_dump", params=params) + res.raise_for_status() + res_json = json.loads(res.text) + assert isinstance(res_json, dict) + return res_json + + def pull_timeline(self, body: Dict[str, Any]) -> Dict[str, Any]: + res = self.post(f"http://localhost:{self.port}/v1/pull_timeline", json=body) res.raise_for_status() res_json = res.json() assert isinstance(res_json, dict) return res_json - def get_metrics_str(self) -> str: - """You probably want to use get_metrics() instead.""" - request_result = self.get(f"http://localhost:{self.port}/metrics") - request_result.raise_for_status() - return request_result.text + def copy_timeline(self, tenant_id: TenantId, timeline_id: TimelineId, body: Dict[str, Any]): + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/copy", + json=body, + ) + res.raise_for_status() - def get_metrics(self) -> SafekeeperMetrics: - res = self.get_metrics_str() - return SafekeeperMetrics(parse_metrics(res)) + def patch_control_file( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + patch: Dict[str, Any], + ) -> Dict[str, Any]: + res = self.patch( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/control_file", + json={ + "updates": patch, + "apply_fields": list(patch.keys()), + }, + ) + res.raise_for_status() + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + + def checkpoint(self, tenant_id: TenantId, timeline_id: TimelineId): + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint", + json={}, + ) + res.raise_for_status() + + def timeline_digest( + self, tenant_id: TenantId, timeline_id: TimelineId, from_lsn: Lsn, until_lsn: Lsn + ) -> Dict[str, Any]: + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/digest", + params={ + "from_lsn": str(from_lsn), + "until_lsn": str(until_lsn), + }, + ) + res.raise_for_status() + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + + def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body): + res = self.post( + f"http://localhost:{self.port}/v1/record_safekeeper_info/{tenant_id}/{timeline_id}", + json=body, + ) + res.raise_for_status() From 5d527133a322a940d18f8613eb12078d2254fa07 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Tue, 27 Aug 2024 12:39:42 +0200 Subject: [PATCH 1506/1571] Fix the pg_hintplan flakyness (#8834) ## Problem pg_hintplan test seems to be flaky, sometimes it fails, while usually it passes ## Summary of changes The regression test is changed to filter out the Neon service queries. The expected file is changed as well. ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? 
- [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --- Dockerfile.compute-node | 4 +- docker-compose/run-tests.sh | 2 +- .../{pg_hintplan.patch => pg_hint_plan.patch} | 40 +++++++++++++------ 3 files changed, 30 insertions(+), 16 deletions(-) rename patches/{pg_hintplan.patch => pg_hint_plan.patch} (55%) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 7acaf2f2fd..b6c89cd71f 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -942,7 +942,7 @@ COPY --from=hll-pg-build /hll.tar.gz /ext-src COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src #COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src -COPY patches/pg_hintplan.patch /ext-src +COPY patches/pg_hint_plan.patch /ext-src COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src COPY patches/pg_cron.patch /ext-src #COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src @@ -964,7 +964,7 @@ RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch RUN cd /ext-src/rum-src && patch -p1 <../rum.patch # cmake is required for the h3 test RUN apt-get update && apt-get install -y cmake -RUN patch -p1 < /ext-src/pg_hintplan.patch +RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan.patch COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh RUN patch -p1 master) -Author: Alexey Masterov -Date: Thu Jun 6 08:02:42 2024 +0000 - - Patch expected files to consider Neon's log messages - -diff --git a/ext-src/pg_hint_plan-src/expected/ut-A.out b/ext-src/pg_hint_plan-src/expected/ut-A.out -index da723b8..f8d0102 100644 ---- a/ext-src/pg_hint_plan-src/expected/ut-A.out -+++ b/ext-src/pg_hint_plan-src/expected/ut-A.out +diff --git a/expected/ut-A.out b/expected/ut-A.out +index da723b8..5328114 100644 +--- a/expected/ut-A.out ++++ b/expected/ut-A.out @@ -9,13 +9,16 @@ SET search_path TO public; ---- -- No.A-1-1-3 @@ -25,10 +19,18 @@ index da723b8..f8d0102 100644 DROP SCHEMA other_schema; ---- ---- No. 
A-5-1 comment pattern -diff --git a/ext-src/pg_hint_plan-src/expected/ut-fdw.out b/ext-src/pg_hint_plan-src/expected/ut-fdw.out +@@ -3175,6 +3178,7 @@ SELECT s.query, s.calls + FROM public.pg_stat_statements s + JOIN pg_catalog.pg_database d + ON (s.dbid = d.oid) ++ WHERE s.query LIKE 'SELECT * FROM s1.t1%' OR s.query LIKE '%pg_stat_statements_reset%' + ORDER BY 1; + query | calls + --------------------------------------+------- +diff --git a/expected/ut-fdw.out b/expected/ut-fdw.out index d372459..6282afe 100644 ---- a/ext-src/pg_hint_plan-src/expected/ut-fdw.out -+++ b/ext-src/pg_hint_plan-src/expected/ut-fdw.out +--- a/expected/ut-fdw.out ++++ b/expected/ut-fdw.out @@ -7,6 +7,7 @@ SET pg_hint_plan.debug_print TO on; SET client_min_messages TO LOG; SET pg_hint_plan.enable_hint TO on; @@ -37,3 +39,15 @@ index d372459..6282afe 100644 CREATE SERVER file_server FOREIGN DATA WRAPPER file_fdw; CREATE USER MAPPING FOR PUBLIC SERVER file_server; CREATE FOREIGN TABLE ft1 (id int, val int) SERVER file_server OPTIONS (format 'csv', filename :'filename'); +diff --git a/sql/ut-A.sql b/sql/ut-A.sql +index 7c7d58a..4fd1a07 100644 +--- a/sql/ut-A.sql ++++ b/sql/ut-A.sql +@@ -963,6 +963,7 @@ SELECT s.query, s.calls + FROM public.pg_stat_statements s + JOIN pg_catalog.pg_database d + ON (s.dbid = d.oid) ++ WHERE s.query LIKE 'SELECT * FROM s1.t1%' OR s.query LIKE '%pg_stat_statements_reset%' + ORDER BY 1; + + ---- From 12850dd5e972ed2be27ce05cf36eff3f846d9aab Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 27 Aug 2024 12:00:35 +0100 Subject: [PATCH 1507/1571] proxy: remove dead code (#8847) By marking everything possible as pub(crate), we find a few dead code candidates. --- proxy/src/auth.rs | 24 ++--- proxy/src/auth/backend.rs | 55 +++++------ proxy/src/auth/backend/hacks.rs | 4 +- proxy/src/auth/backend/jwt.rs | 18 ++-- proxy/src/auth/backend/link.rs | 4 +- proxy/src/auth/backend/local.rs | 8 +- proxy/src/auth/credentials.rs | 20 ++-- proxy/src/auth/flow.rs | 31 +++--- proxy/src/auth/password_hack.rs | 10 +- proxy/src/cache.rs | 10 +- proxy/src/cache/common.rs | 18 ++-- proxy/src/cache/endpoints.rs | 6 +- proxy/src/cache/project_info.rs | 24 ++--- proxy/src/cache/timed_lru.rs | 45 ++------- proxy/src/cancellation.rs | 16 ++-- proxy/src/compute.rs | 26 ++--- proxy/src/console.rs | 2 +- proxy/src/console/messages.rs | 94 ++++++++++--------- proxy/src/console/mgmt.rs | 6 +- proxy/src/console/provider.rs | 59 ++++++------ proxy/src/console/provider/mock.rs | 2 +- proxy/src/console/provider/neon.rs | 6 +- proxy/src/context.rs | 57 +++++------ proxy/src/context/parquet.rs | 6 +- proxy/src/error.rs | 10 +- proxy/src/http.rs | 16 ++-- proxy/src/intern.rs | 34 ++++--- proxy/src/lib.rs | 17 ++-- proxy/src/parse.rs | 4 +- proxy/src/protocol2.rs | 6 +- proxy/src/proxy.rs | 40 ++++---- proxy/src/proxy/connect_compute.rs | 14 +-- proxy/src/proxy/copy_bidirectional.rs | 2 +- proxy/src/proxy/handshake.rs | 6 +- proxy/src/proxy/passthrough.rs | 18 ++-- proxy/src/proxy/retry.rs | 8 +- proxy/src/proxy/tests.rs | 13 +-- proxy/src/proxy/tests/mitm.rs | 2 +- proxy/src/proxy/wake_compute.rs | 2 +- proxy/src/rate_limiter.rs | 14 ++- proxy/src/rate_limiter/leaky_bucket.rs | 9 +- proxy/src/rate_limiter/limit_algorithm.rs | 58 ++++++------ .../src/rate_limiter/limit_algorithm/aimd.rs | 12 +-- proxy/src/rate_limiter/limiter.rs | 14 +-- proxy/src/redis/cancellation_publisher.rs | 2 +- .../connection_with_credentials_provider.rs | 11 ++- proxy/src/redis/elasticache.rs | 2 +- proxy/src/redis/notifications.rs | 12 +-- 
proxy/src/sasl.rs | 14 +-- proxy/src/sasl/channel_binding.rs | 11 ++- proxy/src/sasl/messages.rs | 8 +- proxy/src/sasl/stream.rs | 8 +- proxy/src/scram.rs | 10 +- proxy/src/scram/countmin.rs | 8 +- proxy/src/scram/exchange.rs | 6 +- proxy/src/scram/key.rs | 8 +- proxy/src/scram/messages.rs | 34 +++---- proxy/src/scram/pbkdf2.rs | 8 +- proxy/src/scram/secret.rs | 20 ++-- proxy/src/scram/signature.rs | 14 +-- proxy/src/scram/threadpool.rs | 2 +- proxy/src/serverless.rs | 8 +- proxy/src/serverless/backend.rs | 16 ++-- proxy/src/serverless/cancel_set.rs | 10 +- proxy/src/serverless/conn_pool.rs | 50 +++++----- proxy/src/serverless/http_util.rs | 6 +- proxy/src/serverless/json.rs | 6 +- proxy/src/serverless/sql_over_http.rs | 10 +- proxy/src/serverless/websocket.rs | 6 +- proxy/src/stream.rs | 15 +-- proxy/src/url.rs | 4 +- proxy/src/usage_metrics.rs | 16 ++-- proxy/src/waiters.rs | 14 +-- 73 files changed, 580 insertions(+), 609 deletions(-) diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index 3b3c571129..f3ecedb839 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -4,17 +4,17 @@ pub mod backend; pub use backend::BackendType; mod credentials; -pub use credentials::{ +pub(crate) use credentials::{ check_peer_addr_is_in_list, endpoint_sni, ComputeUserInfoMaybeEndpoint, ComputeUserInfoParseError, IpPattern, }; mod password_hack; -pub use password_hack::parse_endpoint_param; +pub(crate) use password_hack::parse_endpoint_param; use password_hack::PasswordHackPayload; mod flow; -pub use flow::*; +pub(crate) use flow::*; use tokio::time::error::Elapsed; use crate::{ @@ -25,11 +25,11 @@ use std::{io, net::IpAddr}; use thiserror::Error; /// Convenience wrapper for the authentication error. -pub type Result = std::result::Result; +pub(crate) type Result = std::result::Result; /// Common authentication error. 
#[derive(Debug, Error)] -pub enum AuthErrorImpl { +pub(crate) enum AuthErrorImpl { #[error(transparent)] Link(#[from] backend::LinkAuthError), @@ -77,30 +77,30 @@ pub enum AuthErrorImpl { #[derive(Debug, Error)] #[error(transparent)] -pub struct AuthError(Box); +pub(crate) struct AuthError(Box); impl AuthError { - pub fn bad_auth_method(name: impl Into>) -> Self { + pub(crate) fn bad_auth_method(name: impl Into>) -> Self { AuthErrorImpl::BadAuthMethod(name.into()).into() } - pub fn auth_failed(user: impl Into>) -> Self { + pub(crate) fn auth_failed(user: impl Into>) -> Self { AuthErrorImpl::AuthFailed(user.into()).into() } - pub fn ip_address_not_allowed(ip: IpAddr) -> Self { + pub(crate) fn ip_address_not_allowed(ip: IpAddr) -> Self { AuthErrorImpl::IpAddressNotAllowed(ip).into() } - pub fn too_many_connections() -> Self { + pub(crate) fn too_many_connections() -> Self { AuthErrorImpl::TooManyConnections.into() } - pub fn is_auth_failed(&self) -> bool { + pub(crate) fn is_auth_failed(&self) -> bool { matches!(self.0.as_ref(), AuthErrorImpl::AuthFailed(_)) } - pub fn user_timeout(elapsed: Elapsed) -> Self { + pub(crate) fn user_timeout(elapsed: Elapsed) -> Self { AuthErrorImpl::UserTimeout(elapsed).into() } } diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index bb9a0ddffc..77dea39fdc 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -9,7 +9,7 @@ use std::sync::Arc; use std::time::Duration; use ipnet::{Ipv4Net, Ipv6Net}; -pub use link::LinkAuthError; +pub(crate) use link::LinkAuthError; use local::LocalBackend; use tokio::io::{AsyncRead, AsyncWrite}; use tokio_postgres::config::AuthKeys; @@ -74,12 +74,12 @@ pub enum BackendType<'a, T, D> { Local(MaybeOwned<'a, LocalBackend>), } -pub trait TestBackend: Send + Sync + 'static { +#[cfg(test)] +pub(crate) trait TestBackend: Send + Sync + 'static { fn wake_compute(&self) -> Result; fn get_allowed_ips_and_secret( &self, ) -> Result<(CachedAllowedIps, Option), console::errors::GetAuthInfoError>; - fn get_role_secret(&self) -> Result; } impl std::fmt::Display for BackendType<'_, (), ()> { @@ -105,7 +105,7 @@ impl std::fmt::Display for BackendType<'_, (), ()> { impl BackendType<'_, T, D> { /// Very similar to [`std::option::Option::as_ref`]. /// This helps us pass structured config to async tasks. - pub fn as_ref(&self) -> BackendType<'_, &T, &D> { + pub(crate) fn as_ref(&self) -> BackendType<'_, &T, &D> { match self { Self::Console(c, x) => BackendType::Console(MaybeOwned::Borrowed(c), x), Self::Link(c, x) => BackendType::Link(MaybeOwned::Borrowed(c), x), @@ -118,7 +118,7 @@ impl<'a, T, D> BackendType<'a, T, D> { /// Very similar to [`std::option::Option::map`]. /// Maps [`BackendType`] to [`BackendType`] by applying /// a function to a contained value. - pub fn map(self, f: impl FnOnce(T) -> R) -> BackendType<'a, R, D> { + pub(crate) fn map(self, f: impl FnOnce(T) -> R) -> BackendType<'a, R, D> { match self { Self::Console(c, x) => BackendType::Console(c, f(x)), Self::Link(c, x) => BackendType::Link(c, x), @@ -129,7 +129,7 @@ impl<'a, T, D> BackendType<'a, T, D> { impl<'a, T, D, E> BackendType<'a, Result, D> { /// Very similar to [`std::option::Option::transpose`]. /// This is most useful for error handling. 
- pub fn transpose(self) -> Result, E> { + pub(crate) fn transpose(self) -> Result, E> { match self { Self::Console(c, x) => x.map(|x| BackendType::Console(c, x)), Self::Link(c, x) => Ok(BackendType::Link(c, x)), @@ -138,31 +138,31 @@ impl<'a, T, D, E> BackendType<'a, Result, D> { } } -pub struct ComputeCredentials { - pub info: ComputeUserInfo, - pub keys: ComputeCredentialKeys, +pub(crate) struct ComputeCredentials { + pub(crate) info: ComputeUserInfo, + pub(crate) keys: ComputeCredentialKeys, } #[derive(Debug, Clone)] -pub struct ComputeUserInfoNoEndpoint { - pub user: RoleName, - pub options: NeonOptions, +pub(crate) struct ComputeUserInfoNoEndpoint { + pub(crate) user: RoleName, + pub(crate) options: NeonOptions, } #[derive(Debug, Clone)] -pub struct ComputeUserInfo { - pub endpoint: EndpointId, - pub user: RoleName, - pub options: NeonOptions, +pub(crate) struct ComputeUserInfo { + pub(crate) endpoint: EndpointId, + pub(crate) user: RoleName, + pub(crate) options: NeonOptions, } impl ComputeUserInfo { - pub fn endpoint_cache_key(&self) -> EndpointCacheKey { + pub(crate) fn endpoint_cache_key(&self) -> EndpointCacheKey { self.options.get_cache_key(&self.endpoint) } } -pub enum ComputeCredentialKeys { +pub(crate) enum ComputeCredentialKeys { Password(Vec), AuthKeys(AuthKeys), None, @@ -222,7 +222,7 @@ impl RateBucketInfo { } impl AuthenticationConfig { - pub fn check_rate_limit( + pub(crate) fn check_rate_limit( &self, ctx: &RequestMonitoring, config: &AuthenticationConfig, @@ -404,17 +404,8 @@ async fn authenticate_with_secret( } impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { - /// Get compute endpoint name from the credentials. - pub fn get_endpoint(&self) -> Option { - match self { - Self::Console(_, user_info) => user_info.endpoint_id.clone(), - Self::Link(_, ()) => Some("link".into()), - Self::Local(_) => Some("local".into()), - } - } - /// Get username from the credentials. - pub fn get_user(&self) -> &str { + pub(crate) fn get_user(&self) -> &str { match self { Self::Console(_, user_info) => &user_info.user, Self::Link(_, ()) => "link", @@ -424,7 +415,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { /// Authenticate the client via the requested backend, possibly using credentials. #[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)] - pub async fn authenticate( + pub(crate) async fn authenticate( self, ctx: &RequestMonitoring, client: &mut stream::PqStream>, @@ -471,7 +462,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { } impl BackendType<'_, ComputeUserInfo, &()> { - pub async fn get_role_secret( + pub(crate) async fn get_role_secret( &self, ctx: &RequestMonitoring, ) -> Result { @@ -482,7 +473,7 @@ impl BackendType<'_, ComputeUserInfo, &()> { } } - pub async fn get_allowed_ips_and_secret( + pub(crate) async fn get_allowed_ips_and_secret( &self, ctx: &RequestMonitoring, ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs index 56921dd949..e9019ce2cf 100644 --- a/proxy/src/auth/backend/hacks.rs +++ b/proxy/src/auth/backend/hacks.rs @@ -17,7 +17,7 @@ use tracing::{info, warn}; /// one round trip and *expensive* computations (>= 4096 HMAC iterations). /// These properties are benefical for serverless JS workers, so we /// use this mechanism for websocket connections. 
-pub async fn authenticate_cleartext( +pub(crate) async fn authenticate_cleartext( ctx: &RequestMonitoring, info: ComputeUserInfo, client: &mut stream::PqStream>, @@ -59,7 +59,7 @@ pub async fn authenticate_cleartext( /// Workaround for clients which don't provide an endpoint (project) name. /// Similar to [`authenticate_cleartext`], but there's a specific password format, /// and passwords are not yet validated (we don't know how to validate them!) -pub async fn password_hack_no_authentication( +pub(crate) async fn password_hack_no_authentication( ctx: &RequestMonitoring, info: ComputeUserInfoNoEndpoint, client: &mut stream::PqStream>, diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index 61833e19ed..e98da82053 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -22,27 +22,27 @@ const MAX_RENEW: Duration = Duration::from_secs(3600); const MAX_JWK_BODY_SIZE: usize = 64 * 1024; /// How to get the JWT auth rules -pub trait FetchAuthRules: Clone + Send + Sync + 'static { +pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static { fn fetch_auth_rules( &self, role_name: RoleName, ) -> impl Future>> + Send; } -pub struct AuthRule { - pub id: String, - pub jwks_url: url::Url, - pub audience: Option, +pub(crate) struct AuthRule { + pub(crate) id: String, + pub(crate) jwks_url: url::Url, + pub(crate) audience: Option, } #[derive(Default)] -pub struct JwkCache { +pub(crate) struct JwkCache { client: reqwest::Client, map: DashMap<(EndpointId, RoleName), Arc>, } -pub struct JwkCacheEntry { +pub(crate) struct JwkCacheEntry { /// Should refetch at least every hour to verify when old keys have been removed. /// Should refetch when new key IDs are seen only every 5 minutes or so last_retrieved: Instant, @@ -75,7 +75,7 @@ impl KeySet { } } -pub struct JwkCacheEntryLock { +pub(crate) struct JwkCacheEntryLock { cached: ArcSwapOption, lookup: tokio::sync::Semaphore, } @@ -309,7 +309,7 @@ impl JwkCacheEntryLock { } impl JwkCache { - pub async fn check_jwt( + pub(crate) async fn check_jwt( &self, ctx: &RequestMonitoring, endpoint: EndpointId, diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index 95f4614736..19515f95a8 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -13,7 +13,7 @@ use tokio_postgres::config::SslMode; use tracing::{info, info_span}; #[derive(Debug, Error)] -pub enum LinkAuthError { +pub(crate) enum LinkAuthError { #[error(transparent)] WaiterRegister(#[from] waiters::RegisterError), @@ -52,7 +52,7 @@ fn hello_message(redirect_uri: &reqwest::Url, session_id: &str) -> String { ) } -pub fn new_psql_session_id() -> String { +pub(crate) fn new_psql_session_id() -> String { hex::encode(rand::random::<[u8; 8]>()) } diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs index 6d18564dd6..8124f568cf 100644 --- a/proxy/src/auth/backend/local.rs +++ b/proxy/src/auth/backend/local.rs @@ -16,16 +16,14 @@ use crate::{ use super::jwt::{AuthRule, FetchAuthRules, JwkCache}; pub struct LocalBackend { - pub jwks_cache: JwkCache, - pub postgres_addr: SocketAddr, - pub node_info: NodeInfo, + pub(crate) jwks_cache: JwkCache, + pub(crate) node_info: NodeInfo, } impl LocalBackend { pub fn new(postgres_addr: SocketAddr) -> Self { LocalBackend { jwks_cache: JwkCache::default(), - postgres_addr, node_info: NodeInfo { config: { let mut cfg = ConnCfg::new(); @@ -47,7 +45,7 @@ impl LocalBackend { } #[derive(Clone, Copy)] -pub struct StaticAuthRules; +pub(crate) struct 
StaticAuthRules; pub static JWKS_ROLE_MAP: ArcSwapOption = ArcSwapOption::const_empty(); diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index cb06fcaf55..0e91ae570a 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -16,7 +16,7 @@ use thiserror::Error; use tracing::{info, warn}; #[derive(Debug, Error, PartialEq, Eq, Clone)] -pub enum ComputeUserInfoParseError { +pub(crate) enum ComputeUserInfoParseError { #[error("Parameter '{0}' is missing in startup packet.")] MissingKey(&'static str), @@ -51,20 +51,20 @@ impl ReportableError for ComputeUserInfoParseError { /// Various client credentials which we use for authentication. /// Note that we don't store any kind of client key or password here. #[derive(Debug, Clone, PartialEq, Eq)] -pub struct ComputeUserInfoMaybeEndpoint { - pub user: RoleName, - pub endpoint_id: Option, - pub options: NeonOptions, +pub(crate) struct ComputeUserInfoMaybeEndpoint { + pub(crate) user: RoleName, + pub(crate) endpoint_id: Option, + pub(crate) options: NeonOptions, } impl ComputeUserInfoMaybeEndpoint { #[inline] - pub fn endpoint(&self) -> Option<&str> { + pub(crate) fn endpoint(&self) -> Option<&str> { self.endpoint_id.as_deref() } } -pub fn endpoint_sni( +pub(crate) fn endpoint_sni( sni: &str, common_names: &HashSet, ) -> Result, ComputeUserInfoParseError> { @@ -83,7 +83,7 @@ pub fn endpoint_sni( } impl ComputeUserInfoMaybeEndpoint { - pub fn parse( + pub(crate) fn parse( ctx: &RequestMonitoring, params: &StartupMessageParams, sni: Option<&str>, @@ -173,12 +173,12 @@ impl ComputeUserInfoMaybeEndpoint { } } -pub fn check_peer_addr_is_in_list(peer_addr: &IpAddr, ip_list: &[IpPattern]) -> bool { +pub(crate) fn check_peer_addr_is_in_list(peer_addr: &IpAddr, ip_list: &[IpPattern]) -> bool { ip_list.is_empty() || ip_list.iter().any(|pattern| check_ip(peer_addr, pattern)) } #[derive(Debug, Clone, Eq, PartialEq)] -pub enum IpPattern { +pub(crate) enum IpPattern { Subnet(ipnet::IpNet), Range(IpAddr, IpAddr), Single(IpAddr), diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index acf7b4f6b6..f7e2b5296e 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -17,17 +17,20 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; /// Every authentication selector is supposed to implement this trait. -pub trait AuthMethod { +pub(crate) trait AuthMethod { /// Any authentication selector should provide initial backend message /// containing auth method name and parameters, e.g. md5 salt. fn first_message(&self, channel_binding: bool) -> BeMessage<'_>; } /// Initial state of [`AuthFlow`]. -pub struct Begin; +pub(crate) struct Begin; /// Use [SCRAM](crate::scram)-based auth in [`AuthFlow`]. -pub struct Scram<'a>(pub &'a scram::ServerSecret, pub &'a RequestMonitoring); +pub(crate) struct Scram<'a>( + pub(crate) &'a scram::ServerSecret, + pub(crate) &'a RequestMonitoring, +); impl AuthMethod for Scram<'_> { #[inline(always)] @@ -44,7 +47,7 @@ impl AuthMethod for Scram<'_> { /// Use an ad hoc auth flow (for clients which don't support SNI) proposed in /// . 
-pub struct PasswordHack; +pub(crate) struct PasswordHack; impl AuthMethod for PasswordHack { #[inline(always)] @@ -55,10 +58,10 @@ impl AuthMethod for PasswordHack { /// Use clear-text password auth called `password` in docs /// -pub struct CleartextPassword { - pub pool: Arc, - pub endpoint: EndpointIdInt, - pub secret: AuthSecret, +pub(crate) struct CleartextPassword { + pub(crate) pool: Arc, + pub(crate) endpoint: EndpointIdInt, + pub(crate) secret: AuthSecret, } impl AuthMethod for CleartextPassword { @@ -70,7 +73,7 @@ impl AuthMethod for CleartextPassword { /// This wrapper for [`PqStream`] performs client authentication. #[must_use] -pub struct AuthFlow<'a, S, State> { +pub(crate) struct AuthFlow<'a, S, State> { /// The underlying stream which implements libpq's protocol. stream: &'a mut PqStream>, /// State might contain ancillary data (see [`Self::begin`]). @@ -81,7 +84,7 @@ pub struct AuthFlow<'a, S, State> { /// Initial state of the stream wrapper. impl<'a, S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'a, S, Begin> { /// Create a new wrapper for client authentication. - pub fn new(stream: &'a mut PqStream>) -> Self { + pub(crate) fn new(stream: &'a mut PqStream>) -> Self { let tls_server_end_point = stream.get_ref().tls_server_end_point(); Self { @@ -92,7 +95,7 @@ impl<'a, S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'a, S, Begin> { } /// Move to the next step by sending auth method's name & params to client. - pub async fn begin(self, method: M) -> io::Result> { + pub(crate) async fn begin(self, method: M) -> io::Result> { self.stream .write_message(&method.first_message(self.tls_server_end_point.supported())) .await?; @@ -107,7 +110,7 @@ impl<'a, S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'a, S, Begin> { impl AuthFlow<'_, S, PasswordHack> { /// Perform user authentication. Raise an error in case authentication failed. - pub async fn get_password(self) -> super::Result { + pub(crate) async fn get_password(self) -> super::Result { let msg = self.stream.read_password_message().await?; let password = msg .strip_suffix(&[0]) @@ -126,7 +129,7 @@ impl AuthFlow<'_, S, PasswordHack> { impl AuthFlow<'_, S, CleartextPassword> { /// Perform user authentication. Raise an error in case authentication failed. - pub async fn authenticate(self) -> super::Result> { + pub(crate) async fn authenticate(self) -> super::Result> { let msg = self.stream.read_password_message().await?; let password = msg .strip_suffix(&[0]) @@ -151,7 +154,7 @@ impl AuthFlow<'_, S, CleartextPassword> { /// Stream wrapper for handling [SCRAM](crate::scram) auth. impl AuthFlow<'_, S, Scram<'_>> { /// Perform user authentication. Raise an error in case authentication failed. - pub async fn authenticate(self) -> super::Result> { + pub(crate) async fn authenticate(self) -> super::Result> { let Scram(secret, ctx) = self.state; // pause the timer while we communicate with the client diff --git a/proxy/src/auth/password_hack.rs b/proxy/src/auth/password_hack.rs index 2ddf46fe25..3f5d006f7b 100644 --- a/proxy/src/auth/password_hack.rs +++ b/proxy/src/auth/password_hack.rs @@ -7,13 +7,13 @@ use bstr::ByteSlice; use crate::EndpointId; -pub struct PasswordHackPayload { - pub endpoint: EndpointId, - pub password: Vec, +pub(crate) struct PasswordHackPayload { + pub(crate) endpoint: EndpointId, + pub(crate) password: Vec, } impl PasswordHackPayload { - pub fn parse(bytes: &[u8]) -> Option { + pub(crate) fn parse(bytes: &[u8]) -> Option { // The format is `project=;` or `project=$`. 
let separators = [";", "$"]; for sep in separators { @@ -30,7 +30,7 @@ impl PasswordHackPayload { } } -pub fn parse_endpoint_param(bytes: &str) -> Option<&str> { +pub(crate) fn parse_endpoint_param(bytes: &str) -> Option<&str> { bytes .strip_prefix("project=") .or_else(|| bytes.strip_prefix("endpoint=")) diff --git a/proxy/src/cache.rs b/proxy/src/cache.rs index d1d4087241..6c168144a7 100644 --- a/proxy/src/cache.rs +++ b/proxy/src/cache.rs @@ -1,7 +1,7 @@ -pub mod common; -pub mod endpoints; -pub mod project_info; +pub(crate) mod common; +pub(crate) mod endpoints; +pub(crate) mod project_info; mod timed_lru; -pub use common::{Cache, Cached}; -pub use timed_lru::TimedLru; +pub(crate) use common::{Cache, Cached}; +pub(crate) use timed_lru::TimedLru; diff --git a/proxy/src/cache/common.rs b/proxy/src/cache/common.rs index 82c78e3eb2..b5caf94788 100644 --- a/proxy/src/cache/common.rs +++ b/proxy/src/cache/common.rs @@ -3,7 +3,7 @@ use std::ops::{Deref, DerefMut}; /// A generic trait which exposes types of cache's key and value, /// as well as the notion of cache entry invalidation. /// This is useful for [`Cached`]. -pub trait Cache { +pub(crate) trait Cache { /// Entry's key. type Key; @@ -29,21 +29,21 @@ impl Cache for &C { } /// Wrapper for convenient entry invalidation. -pub struct Cached::Value> { +pub(crate) struct Cached::Value> { /// Cache + lookup info. - pub token: Option<(C, C::LookupInfo)>, + pub(crate) token: Option<(C, C::LookupInfo)>, /// The value itself. - pub value: V, + pub(crate) value: V, } impl Cached { /// Place any entry into this wrapper; invalidation will be a no-op. - pub fn new_uncached(value: V) -> Self { + pub(crate) fn new_uncached(value: V) -> Self { Self { token: None, value } } - pub fn take_value(self) -> (Cached, V) { + pub(crate) fn take_value(self) -> (Cached, V) { ( Cached { token: self.token, @@ -53,7 +53,7 @@ impl Cached { ) } - pub fn map(self, f: impl FnOnce(V) -> U) -> Cached { + pub(crate) fn map(self, f: impl FnOnce(V) -> U) -> Cached { Cached { token: self.token, value: f(self.value), @@ -61,7 +61,7 @@ impl Cached { } /// Drop this entry from a cache if it's still there. - pub fn invalidate(self) -> V { + pub(crate) fn invalidate(self) -> V { if let Some((cache, info)) = &self.token { cache.invalidate(info); } @@ -69,7 +69,7 @@ impl Cached { } /// Tell if this entry is actually cached. 
- pub fn cached(&self) -> bool { + pub(crate) fn cached(&self) -> bool { self.token.is_some() } } diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index 8c851790c2..f4762232d8 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -28,7 +28,7 @@ use crate::{ }; #[derive(Deserialize, Debug, Clone)] -pub struct ControlPlaneEventKey { +pub(crate) struct ControlPlaneEventKey { endpoint_created: Option, branch_created: Option, project_created: Option, @@ -56,7 +56,7 @@ pub struct EndpointsCache { } impl EndpointsCache { - pub fn new(config: EndpointCacheConfig) -> Self { + pub(crate) fn new(config: EndpointCacheConfig) -> Self { Self { limiter: Arc::new(Mutex::new(GlobalRateLimiter::new( config.limiter_info.clone(), @@ -68,7 +68,7 @@ impl EndpointsCache { ready: AtomicBool::new(false), } } - pub async fn is_valid(&self, ctx: &RequestMonitoring, endpoint: &EndpointId) -> bool { + pub(crate) async fn is_valid(&self, ctx: &RequestMonitoring, endpoint: &EndpointId) -> bool { if !self.ready.load(Ordering::Acquire) { return true; } diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index eda886a7af..ceae74a9a0 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -24,7 +24,7 @@ use crate::{ use super::{Cache, Cached}; #[async_trait] -pub trait ProjectInfoCache { +pub(crate) trait ProjectInfoCache { fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt); fn invalidate_role_secret_for_project(&self, project_id: ProjectIdInt, role_name: RoleNameInt); async fn decrement_active_listeners(&self); @@ -37,7 +37,7 @@ struct Entry { } impl Entry { - pub fn new(value: T) -> Self { + pub(crate) fn new(value: T) -> Self { Self { created_at: Instant::now(), value, @@ -64,7 +64,7 @@ impl EndpointInfo { Some(t) => t < created_at, } } - pub fn get_role_secret( + pub(crate) fn get_role_secret( &self, role_name: RoleNameInt, valid_since: Instant, @@ -81,7 +81,7 @@ impl EndpointInfo { None } - pub fn get_allowed_ips( + pub(crate) fn get_allowed_ips( &self, valid_since: Instant, ignore_cache_since: Option, @@ -96,10 +96,10 @@ impl EndpointInfo { } None } - pub fn invalidate_allowed_ips(&mut self) { + pub(crate) fn invalidate_allowed_ips(&mut self) { self.allowed_ips = None; } - pub fn invalidate_role_secret(&mut self, role_name: RoleNameInt) { + pub(crate) fn invalidate_role_secret(&mut self, role_name: RoleNameInt) { self.secret.remove(&role_name); } } @@ -178,7 +178,7 @@ impl ProjectInfoCache for ProjectInfoCacheImpl { } impl ProjectInfoCacheImpl { - pub fn new(config: ProjectInfoCacheOptions) -> Self { + pub(crate) fn new(config: ProjectInfoCacheOptions) -> Self { Self { cache: DashMap::new(), project2ep: DashMap::new(), @@ -189,7 +189,7 @@ impl ProjectInfoCacheImpl { } } - pub fn get_role_secret( + pub(crate) fn get_role_secret( &self, endpoint_id: &EndpointId, role_name: &RoleName, @@ -212,7 +212,7 @@ impl ProjectInfoCacheImpl { } Some(Cached::new_uncached(value)) } - pub fn get_allowed_ips( + pub(crate) fn get_allowed_ips( &self, endpoint_id: &EndpointId, ) -> Option>>> { @@ -230,7 +230,7 @@ impl ProjectInfoCacheImpl { } Some(Cached::new_uncached(value)) } - pub fn insert_role_secret( + pub(crate) fn insert_role_secret( &self, project_id: ProjectIdInt, endpoint_id: EndpointIdInt, @@ -247,7 +247,7 @@ impl ProjectInfoCacheImpl { entry.secret.insert(role_name, secret.into()); } } - pub fn insert_allowed_ips( + pub(crate) fn insert_allowed_ips( &self, project_id: ProjectIdInt, endpoint_id: 
EndpointIdInt, @@ -319,7 +319,7 @@ impl ProjectInfoCacheImpl { /// Lookup info for project info cache. /// This is used to invalidate cache entries. -pub struct CachedLookupInfo { +pub(crate) struct CachedLookupInfo { /// Search by this key. endpoint_id: EndpointIdInt, lookup_type: LookupType, diff --git a/proxy/src/cache/timed_lru.rs b/proxy/src/cache/timed_lru.rs index 07fad56643..8bb482f7c6 100644 --- a/proxy/src/cache/timed_lru.rs +++ b/proxy/src/cache/timed_lru.rs @@ -39,7 +39,7 @@ use super::{common::Cached, *}; /// /// * It's possible for an entry that has not yet expired entry to be evicted /// before expired items. That's a bit wasteful, but probably fine in practice. -pub struct TimedLru { +pub(crate) struct TimedLru { /// Cache's name for tracing. name: &'static str, @@ -72,7 +72,7 @@ struct Entry { impl TimedLru { /// Construct a new LRU cache with timed entries. - pub fn new( + pub(crate) fn new( name: &'static str, capacity: usize, ttl: Duration, @@ -207,11 +207,11 @@ impl TimedLru { } impl TimedLru { - pub fn insert_ttl(&self, key: K, value: V, ttl: Duration) { + pub(crate) fn insert_ttl(&self, key: K, value: V, ttl: Duration) { self.insert_raw_ttl(key, value, ttl, false); } - pub fn insert_unit(&self, key: K, value: V) -> (Option, Cached<&Self, ()>) { + pub(crate) fn insert_unit(&self, key: K, value: V) -> (Option, Cached<&Self, ()>) { let (created_at, old) = self.insert_raw(key.clone(), value); let cached = Cached { @@ -221,22 +221,11 @@ impl TimedLru { (old, cached) } - - pub fn insert(&self, key: K, value: V) -> (Option, Cached<&Self>) { - let (created_at, old) = self.insert_raw(key.clone(), value.clone()); - - let cached = Cached { - token: Some((self, LookupInfo { created_at, key })), - value, - }; - - (old, cached) - } } impl TimedLru { /// Retrieve a cached entry in convenient wrapper. - pub fn get(&self, key: &Q) -> Option> + pub(crate) fn get(&self, key: &Q) -> Option> where K: Borrow + Clone, Q: Hash + Eq + ?Sized, @@ -253,32 +242,10 @@ impl TimedLru { } }) } - - /// Retrieve a cached entry in convenient wrapper, ignoring its TTL. - pub fn get_ignoring_ttl(&self, key: &Q) -> Option> - where - K: Borrow, - Q: Hash + Eq + ?Sized, - { - let mut cache = self.cache.lock(); - cache - .get(key) - .map(|entry| Cached::new_uncached(entry.value.clone())) - } - - /// Remove an entry from the cache. - pub fn remove(&self, key: &Q) -> Option - where - K: Borrow + Clone, - Q: Hash + Eq + ?Sized, - { - let mut cache = self.cache.lock(); - cache.remove(key).map(|entry| entry.value) - } } /// Lookup information for key invalidation. -pub struct LookupInfo { +pub(crate) struct LookupInfo { /// Time of creation of a cache [`Entry`]. /// We use this during invalidation lookups to prevent eviction of a newer /// entry sharing the same key (it might've been inserted by a different diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index ea8f7b4070..71a2a16af8 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -18,7 +18,7 @@ use crate::{ pub type CancelMap = Arc>>; pub type CancellationHandlerMain = CancellationHandler>>>; -pub type CancellationHandlerMainInternal = Option>>; +pub(crate) type CancellationHandlerMainInternal = Option>>; /// Enables serving `CancelRequest`s. /// @@ -32,7 +32,7 @@ pub struct CancellationHandler
<P>
{ } #[derive(Debug, Error)] -pub enum CancelError { +pub(crate) enum CancelError { #[error("{0}")] IO(#[from] std::io::Error), #[error("{0}")] @@ -53,7 +53,7 @@ impl ReportableError for CancelError { impl CancellationHandler
<P>
{ /// Run async action within an ephemeral session identified by [`CancelKeyData`]. - pub fn get_session(self: Arc) -> Session
<P>
{ + pub(crate) fn get_session(self: Arc) -> Session
<P>
{ // HACK: We'd rather get the real backend_pid but tokio_postgres doesn't // expose it and we don't want to do another roundtrip to query // for it. The client will be able to notice that this is not the @@ -81,7 +81,7 @@ impl CancellationHandler
<P>
{ } /// Try to cancel a running query for the corresponding connection. /// If the cancellation key is not found, it will be published to Redis. - pub async fn cancel_session( + pub(crate) async fn cancel_session( &self, key: CancelKeyData, session_id: Uuid, @@ -155,14 +155,14 @@ pub struct CancelClosure { } impl CancelClosure { - pub fn new(socket_addr: SocketAddr, cancel_token: CancelToken) -> Self { + pub(crate) fn new(socket_addr: SocketAddr, cancel_token: CancelToken) -> Self { Self { socket_addr, cancel_token, } } /// Cancels the query running on user's compute node. - pub async fn try_cancel_query(self) -> Result<(), CancelError> { + pub(crate) async fn try_cancel_query(self) -> Result<(), CancelError> { let socket = TcpStream::connect(self.socket_addr).await?; self.cancel_token.cancel_query_raw(socket, NoTls).await?; info!("query was cancelled"); @@ -171,7 +171,7 @@ impl CancelClosure { } /// Helper for registering query cancellation tokens. -pub struct Session
<P>
{ +pub(crate) struct Session
<P>
{ /// The user-facing key identifying this session. key: CancelKeyData, /// The [`CancelMap`] this session belongs to. @@ -181,7 +181,7 @@ pub struct Session
<P>
{ impl<P> Session<P>
{ /// Store the cancel token for the given session. /// This enables query cancellation in `crate::proxy::prepare_client_connection`. - pub fn enable_query_cancellation(&self, cancel_closure: CancelClosure) -> CancelKeyData { + pub(crate) fn enable_query_cancellation(&self, cancel_closure: CancelClosure) -> CancelKeyData { info!("enabling query cancellation for this session"); self.cancellation_handler .map diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index b6659f5dd0..246501a21e 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -23,7 +23,7 @@ use tracing::{error, info, warn}; const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node"; #[derive(Debug, Error)] -pub enum ConnectionError { +pub(crate) enum ConnectionError { /// This error doesn't seem to reveal any secrets; for instance, /// `tokio_postgres::error::Kind` doesn't contain ip addresses and such. #[error("{COULD_NOT_CONNECT}: {0}")] @@ -86,22 +86,22 @@ impl ReportableError for ConnectionError { } /// A pair of `ClientKey` & `ServerKey` for `SCRAM-SHA-256`. -pub type ScramKeys = tokio_postgres::config::ScramKeys<32>; +pub(crate) type ScramKeys = tokio_postgres::config::ScramKeys<32>; /// A config for establishing a connection to compute node. /// Eventually, `tokio_postgres` will be replaced with something better. /// Newtype allows us to implement methods on top of it. #[derive(Clone, Default)] -pub struct ConnCfg(Box); +pub(crate) struct ConnCfg(Box); /// Creation and initialization routines. impl ConnCfg { - pub fn new() -> Self { + pub(crate) fn new() -> Self { Self::default() } /// Reuse password or auth keys from the other config. - pub fn reuse_password(&mut self, other: Self) { + pub(crate) fn reuse_password(&mut self, other: Self) { if let Some(password) = other.get_password() { self.password(password); } @@ -111,7 +111,7 @@ impl ConnCfg { } } - pub fn get_host(&self) -> Result { + pub(crate) fn get_host(&self) -> Result { match self.0.get_hosts() { [tokio_postgres::config::Host::Tcp(s)] => Ok(s.into()), // we should not have multiple address or unix addresses. @@ -122,7 +122,7 @@ impl ConnCfg { } /// Apply startup message params to the connection config. - pub fn set_startup_params(&mut self, params: &StartupMessageParams) { + pub(crate) fn set_startup_params(&mut self, params: &StartupMessageParams) { // Only set `user` if it's not present in the config. // Link auth flow takes username from the console's response. if let (None, Some(user)) = (self.get_user(), params.get("user")) { @@ -255,25 +255,25 @@ impl ConnCfg { } } -pub struct PostgresConnection { +pub(crate) struct PostgresConnection { /// Socket connected to a compute node. - pub stream: tokio_postgres::maybe_tls_stream::MaybeTlsStream< + pub(crate) stream: tokio_postgres::maybe_tls_stream::MaybeTlsStream< tokio::net::TcpStream, tokio_postgres_rustls::RustlsStream, >, /// PostgreSQL connection parameters. - pub params: std::collections::HashMap, + pub(crate) params: std::collections::HashMap, /// Query cancellation token. - pub cancel_closure: CancelClosure, + pub(crate) cancel_closure: CancelClosure, /// Labels for proxy's metrics. - pub aux: MetricsAuxInfo, + pub(crate) aux: MetricsAuxInfo, _guage: NumDbConnectionsGuard<'static>, } impl ConnCfg { /// Connect to a corresponding compute node. 
- pub async fn connect( + pub(crate) async fn connect( &self, ctx: &RequestMonitoring, allow_self_signed_compute: bool, diff --git a/proxy/src/console.rs b/proxy/src/console.rs index ea95e83437..87d8e781aa 100644 --- a/proxy/src/console.rs +++ b/proxy/src/console.rs @@ -10,7 +10,7 @@ pub(crate) use provider::{errors, Api, AuthSecret, CachedNodeInfo, NodeInfo}; /// Various cache-related types. pub mod caches { - pub use super::provider::{ApiCaches, NodeInfoCache}; + pub use super::provider::ApiCaches; } /// Various cache-related types. diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs index a7ccf076b0..0df1a450ac 100644 --- a/proxy/src/console/messages.rs +++ b/proxy/src/console/messages.rs @@ -12,22 +12,22 @@ use crate::RoleName; /// Generic error response with human-readable description. /// Note that we can't always present it to user as is. #[derive(Debug, Deserialize, Clone)] -pub struct ConsoleError { - pub error: Box, +pub(crate) struct ConsoleError { + pub(crate) error: Box, #[serde(skip)] - pub http_status_code: http::StatusCode, - pub status: Option, + pub(crate) http_status_code: http::StatusCode, + pub(crate) status: Option, } impl ConsoleError { - pub fn get_reason(&self) -> Reason { + pub(crate) fn get_reason(&self) -> Reason { self.status .as_ref() .and_then(|s| s.details.error_info.as_ref()) .map_or(Reason::Unknown, |e| e.reason) } - pub fn get_user_facing_message(&self) -> String { + pub(crate) fn get_user_facing_message(&self) -> String { use super::provider::errors::REQUEST_FAILED; self.status .as_ref() @@ -88,27 +88,28 @@ impl CouldRetry for ConsoleError { } #[derive(Debug, Deserialize, Clone)] -pub struct Status { - pub code: Box, - pub message: Box, - pub details: Details, +#[allow(dead_code)] +pub(crate) struct Status { + pub(crate) code: Box, + pub(crate) message: Box, + pub(crate) details: Details, } #[derive(Debug, Deserialize, Clone)] -pub struct Details { - pub error_info: Option, - pub retry_info: Option, - pub user_facing_message: Option, +pub(crate) struct Details { + pub(crate) error_info: Option, + pub(crate) retry_info: Option, + pub(crate) user_facing_message: Option, } #[derive(Copy, Clone, Debug, Deserialize)] -pub struct ErrorInfo { - pub reason: Reason, +pub(crate) struct ErrorInfo { + pub(crate) reason: Reason, // Schema could also have `metadata` field, but it's not structured. Skip it for now. } #[derive(Clone, Copy, Debug, Deserialize, Default)] -pub enum Reason { +pub(crate) enum Reason { /// RoleProtected indicates that the role is protected and the attempted operation is not permitted on protected roles. #[serde(rename = "ROLE_PROTECTED")] RoleProtected, @@ -168,7 +169,7 @@ pub enum Reason { } impl Reason { - pub fn is_not_found(&self) -> bool { + pub(crate) fn is_not_found(self) -> bool { matches!( self, Reason::ResourceNotFound @@ -178,7 +179,7 @@ impl Reason { ) } - pub fn can_retry(&self) -> bool { + pub(crate) fn can_retry(self) -> bool { match self { // do not retry role protected errors // not a transitive error @@ -208,22 +209,23 @@ impl Reason { } #[derive(Copy, Clone, Debug, Deserialize)] -pub struct RetryInfo { - pub retry_delay_ms: u64, +#[allow(dead_code)] +pub(crate) struct RetryInfo { + pub(crate) retry_delay_ms: u64, } #[derive(Debug, Deserialize, Clone)] -pub struct UserFacingMessage { - pub message: Box, +pub(crate) struct UserFacingMessage { + pub(crate) message: Box, } /// Response which holds client's auth secret, e.g. [`crate::scram::ServerSecret`]. 
/// Returned by the `/proxy_get_role_secret` API method. #[derive(Deserialize)] -pub struct GetRoleSecret { - pub role_secret: Box, - pub allowed_ips: Option>, - pub project_id: Option, +pub(crate) struct GetRoleSecret { + pub(crate) role_secret: Box, + pub(crate) allowed_ips: Option>, + pub(crate) project_id: Option, } // Manually implement debug to omit sensitive info. @@ -236,21 +238,21 @@ impl fmt::Debug for GetRoleSecret { /// Response which holds compute node's `host:port` pair. /// Returned by the `/proxy_wake_compute` API method. #[derive(Debug, Deserialize)] -pub struct WakeCompute { - pub address: Box, - pub aux: MetricsAuxInfo, +pub(crate) struct WakeCompute { + pub(crate) address: Box, + pub(crate) aux: MetricsAuxInfo, } /// Async response which concludes the link auth flow. /// Also known as `kickResponse` in the console. #[derive(Debug, Deserialize)] -pub struct KickSession<'a> { +pub(crate) struct KickSession<'a> { /// Session ID is assigned by the proxy. - pub session_id: &'a str, + pub(crate) session_id: &'a str, /// Compute node connection params. #[serde(deserialize_with = "KickSession::parse_db_info")] - pub result: DatabaseInfo, + pub(crate) result: DatabaseInfo, } impl KickSession<'_> { @@ -273,15 +275,15 @@ impl KickSession<'_> { /// Compute node connection params. #[derive(Deserialize)] -pub struct DatabaseInfo { - pub host: Box, - pub port: u16, - pub dbname: Box, - pub user: Box, +pub(crate) struct DatabaseInfo { + pub(crate) host: Box, + pub(crate) port: u16, + pub(crate) dbname: Box, + pub(crate) user: Box, /// Console always provides a password, but it might /// be inconvenient for debug with local PG instance. - pub password: Option>, - pub aux: MetricsAuxInfo, + pub(crate) password: Option>, + pub(crate) aux: MetricsAuxInfo, } // Manually implement debug to omit sensitive info. @@ -299,12 +301,12 @@ impl fmt::Debug for DatabaseInfo { /// Various labels for prometheus metrics. /// Also known as `ProxyMetricsAuxInfo` in the console. #[derive(Debug, Deserialize, Clone)] -pub struct MetricsAuxInfo { - pub endpoint_id: EndpointIdInt, - pub project_id: ProjectIdInt, - pub branch_id: BranchIdInt, +pub(crate) struct MetricsAuxInfo { + pub(crate) endpoint_id: EndpointIdInt, + pub(crate) project_id: ProjectIdInt, + pub(crate) branch_id: BranchIdInt, #[serde(default)] - pub cold_start_info: ColdStartInfo, + pub(crate) cold_start_info: ColdStartInfo, } #[derive(Debug, Default, Serialize, Deserialize, Clone, Copy, FixedCardinalityLabel)] @@ -331,7 +333,7 @@ pub enum ColdStartInfo { } impl ColdStartInfo { - pub fn as_str(&self) -> &'static str { + pub(crate) fn as_str(self) -> &'static str { match self { ColdStartInfo::Unknown => "unknown", ColdStartInfo::Warm => "warm", diff --git a/proxy/src/console/mgmt.rs b/proxy/src/console/mgmt.rs index 82d5033aab..f318ac529b 100644 --- a/proxy/src/console/mgmt.rs +++ b/proxy/src/console/mgmt.rs @@ -14,13 +14,13 @@ use tracing::{error, info, info_span, Instrument}; static CPLANE_WAITERS: Lazy> = Lazy::new(Default::default); /// Give caller an opportunity to wait for the cloud's reply. 
-pub fn get_waiter( +pub(crate) fn get_waiter( psql_session_id: impl Into, ) -> Result, waiters::RegisterError> { CPLANE_WAITERS.register(psql_session_id.into()) } -pub fn notify(psql_session_id: &str, msg: ComputeReady) -> Result<(), waiters::NotifyError> { +pub(crate) fn notify(psql_session_id: &str, msg: ComputeReady) -> Result<(), waiters::NotifyError> { CPLANE_WAITERS.notify(psql_session_id, msg) } @@ -74,7 +74,7 @@ async fn handle_connection(socket: TcpStream) -> Result<(), QueryError> { } /// A message received by `mgmt` when a compute node is ready. -pub type ComputeReady = DatabaseInfo; +pub(crate) type ComputeReady = DatabaseInfo; // TODO: replace with an http-based protocol. struct MgmtHandler; diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 4794527410..12a6e2f12a 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -23,7 +23,7 @@ use std::{hash::Hash, sync::Arc, time::Duration}; use tokio::time::Instant; use tracing::info; -pub mod errors { +pub(crate) mod errors { use crate::{ console::messages::{self, ConsoleError, Reason}, error::{io_error, ErrorKind, ReportableError, UserFacingError}, @@ -34,11 +34,11 @@ pub mod errors { use super::ApiLockError; /// A go-to error message which doesn't leak any detail. - pub const REQUEST_FAILED: &str = "Console request failed"; + pub(crate) const REQUEST_FAILED: &str = "Console request failed"; /// Common console API error. #[derive(Debug, Error)] - pub enum ApiError { + pub(crate) enum ApiError { /// Error returned by the console itself. #[error("{REQUEST_FAILED} with {0}")] Console(ConsoleError), @@ -50,7 +50,7 @@ pub mod errors { impl ApiError { /// Returns HTTP status code if it's the reason for failure. - pub fn get_reason(&self) -> messages::Reason { + pub(crate) fn get_reason(&self) -> messages::Reason { match self { ApiError::Console(e) => e.get_reason(), ApiError::Transport(_) => messages::Reason::Unknown, @@ -146,7 +146,7 @@ pub mod errors { } #[derive(Debug, Error)] - pub enum GetAuthInfoError { + pub(crate) enum GetAuthInfoError { // We shouldn't include the actual secret here. #[error("Console responded with a malformed auth secret")] BadSecret, @@ -183,7 +183,7 @@ pub mod errors { } #[derive(Debug, Error)] - pub enum WakeComputeError { + pub(crate) enum WakeComputeError { #[error("Console responded with a malformed compute address: {0}")] BadComputeAddress(Box), @@ -247,7 +247,7 @@ pub mod errors { /// Auth secret which is managed by the cloud. #[derive(Clone, Eq, PartialEq, Debug)] -pub enum AuthSecret { +pub(crate) enum AuthSecret { #[cfg(any(test, feature = "testing"))] /// Md5 hash of user's password. Md5([u8; 16]), @@ -257,32 +257,32 @@ pub enum AuthSecret { } #[derive(Default)] -pub struct AuthInfo { - pub secret: Option, +pub(crate) struct AuthInfo { + pub(crate) secret: Option, /// List of IP addresses allowed for the autorization. - pub allowed_ips: Vec, + pub(crate) allowed_ips: Vec, /// Project ID. This is used for cache invalidation. - pub project_id: Option, + pub(crate) project_id: Option, } /// Info for establishing a connection to a compute node. /// This is what we get after auth succeeded, but not before! #[derive(Clone)] -pub struct NodeInfo { +pub(crate) struct NodeInfo { /// Compute node connection params. /// It's sad that we have to clone this, but this will improve /// once we migrate to a bespoke connection logic. - pub config: compute::ConnCfg, + pub(crate) config: compute::ConnCfg, /// Labels for proxy's metrics. 
- pub aux: MetricsAuxInfo, + pub(crate) aux: MetricsAuxInfo, /// Whether we should accept self-signed certificates (for testing) - pub allow_self_signed_compute: bool, + pub(crate) allow_self_signed_compute: bool, } impl NodeInfo { - pub async fn connect( + pub(crate) async fn connect( &self, ctx: &RequestMonitoring, timeout: Duration, @@ -296,12 +296,12 @@ impl NodeInfo { ) .await } - pub fn reuse_settings(&mut self, other: Self) { + pub(crate) fn reuse_settings(&mut self, other: Self) { self.allow_self_signed_compute = other.allow_self_signed_compute; self.config.reuse_password(other.config); } - pub fn set_keys(&mut self, keys: &ComputeCredentialKeys) { + pub(crate) fn set_keys(&mut self, keys: &ComputeCredentialKeys) { match keys { ComputeCredentialKeys::Password(password) => self.config.password(password), ComputeCredentialKeys::AuthKeys(auth_keys) => self.config.auth_keys(*auth_keys), @@ -310,10 +310,10 @@ impl NodeInfo { } } -pub type NodeInfoCache = TimedLru>>; -pub type CachedNodeInfo = Cached<&'static NodeInfoCache, NodeInfo>; -pub type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option>; -pub type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc>>; +pub(crate) type NodeInfoCache = TimedLru>>; +pub(crate) type CachedNodeInfo = Cached<&'static NodeInfoCache, NodeInfo>; +pub(crate) type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option>; +pub(crate) type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc>>; /// This will allocate per each call, but the http requests alone /// already require a few allocations, so it should be fine. @@ -350,6 +350,7 @@ pub enum ConsoleBackend { Postgres(mock::Api), /// Internal testing #[cfg(test)] + #[allow(private_interfaces)] Test(Box), } @@ -402,7 +403,7 @@ impl Api for ConsoleBackend { /// Various caches for [`console`](super). pub struct ApiCaches { /// Cache for the `wake_compute` API method. - pub node_info: NodeInfoCache, + pub(crate) node_info: NodeInfoCache, /// Cache which stores project_id -> endpoint_ids mapping. pub project_info: Arc, /// List of all valid endpoints. 
@@ -439,7 +440,7 @@ pub struct ApiLocks { } #[derive(Debug, thiserror::Error)] -pub enum ApiLockError { +pub(crate) enum ApiLockError { #[error("timeout acquiring resource permit")] TimeoutError(#[from] tokio::time::error::Elapsed), } @@ -471,7 +472,7 @@ impl ApiLocks { }) } - pub async fn get_permit(&self, key: &K) -> Result { + pub(crate) async fn get_permit(&self, key: &K) -> Result { if self.config.initial_limit == 0 { return Ok(WakeComputePermit { permit: Token::disabled(), @@ -531,18 +532,18 @@ impl ApiLocks { } } -pub struct WakeComputePermit { +pub(crate) struct WakeComputePermit { permit: Token, } impl WakeComputePermit { - pub fn should_check_cache(&self) -> bool { + pub(crate) fn should_check_cache(&self) -> bool { !self.permit.is_disabled() } - pub fn release(self, outcome: Outcome) { + pub(crate) fn release(self, outcome: Outcome) { self.permit.release(outcome); } - pub fn release_result(self, res: Result) -> Result { + pub(crate) fn release_result(self, res: Result) -> Result { match res { Ok(_) => self.release(Outcome::Success), Err(_) => self.release(Outcome::Overload), diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs index 4e8b7a9365..08b87cd87a 100644 --- a/proxy/src/console/provider/mock.rs +++ b/proxy/src/console/provider/mock.rs @@ -48,7 +48,7 @@ impl Api { Self { endpoint } } - pub fn url(&self) -> &str { + pub(crate) fn url(&self) -> &str { self.endpoint.as_str() } diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index a6c0e233fc..33eda72e65 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -25,8 +25,8 @@ use tracing::{debug, error, info, info_span, warn, Instrument}; pub struct Api { endpoint: http::Endpoint, pub caches: &'static ApiCaches, - pub locks: &'static ApiLocks, - pub wake_compute_endpoint_rate_limiter: Arc, + pub(crate) locks: &'static ApiLocks, + pub(crate) wake_compute_endpoint_rate_limiter: Arc, jwt: String, } @@ -51,7 +51,7 @@ impl Api { } } - pub fn url(&self) -> &str { + pub(crate) fn url(&self) -> &str { self.endpoint.url().as_str() } diff --git a/proxy/src/context.rs b/proxy/src/context.rs index cafbdedc15..9edba543fe 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -22,8 +22,9 @@ use self::parquet::RequestData; pub mod parquet; -pub static LOG_CHAN: OnceCell> = OnceCell::new(); -pub static LOG_CHAN_DISCONNECT: OnceCell> = OnceCell::new(); +pub(crate) static LOG_CHAN: OnceCell> = OnceCell::new(); +pub(crate) static LOG_CHAN_DISCONNECT: OnceCell> = + OnceCell::new(); /// Context data for a single request to connect to a database. /// @@ -38,12 +39,12 @@ pub struct RequestMonitoring( ); struct RequestMonitoringInner { - pub peer_addr: IpAddr, - pub session_id: Uuid, - pub protocol: Protocol, + pub(crate) peer_addr: IpAddr, + pub(crate) session_id: Uuid, + pub(crate) protocol: Protocol, first_packet: chrono::DateTime, region: &'static str, - pub span: Span, + pub(crate) span: Span, // filled in as they are discovered project: Option, @@ -63,14 +64,14 @@ struct RequestMonitoringInner { sender: Option>, // This sender is only used to log the length of session in case of success. disconnect_sender: Option>, - pub latency_timer: LatencyTimer, + pub(crate) latency_timer: LatencyTimer, // Whether proxy decided that it's not a valid endpoint end rejected it before going to cplane. 
rejected: Option, disconnect_timestamp: Option>, } #[derive(Clone, Debug)] -pub enum AuthMethod { +pub(crate) enum AuthMethod { // aka link aka passwordless Web, ScramSha256, @@ -125,11 +126,11 @@ impl RequestMonitoring { } #[cfg(test)] - pub fn test() -> Self { + pub(crate) fn test() -> Self { RequestMonitoring::new(Uuid::now_v7(), [127, 0, 0, 1].into(), Protocol::Tcp, "test") } - pub fn console_application_name(&self) -> String { + pub(crate) fn console_application_name(&self) -> String { let this = self.0.try_lock().expect("should not deadlock"); format!( "{}/{}", @@ -138,19 +139,19 @@ impl RequestMonitoring { ) } - pub fn set_rejected(&self, rejected: bool) { + pub(crate) fn set_rejected(&self, rejected: bool) { let mut this = self.0.try_lock().expect("should not deadlock"); this.rejected = Some(rejected); } - pub fn set_cold_start_info(&self, info: ColdStartInfo) { + pub(crate) fn set_cold_start_info(&self, info: ColdStartInfo) { self.0 .try_lock() .expect("should not deadlock") .set_cold_start_info(info); } - pub fn set_db_options(&self, options: StartupMessageParams) { + pub(crate) fn set_db_options(&self, options: StartupMessageParams) { let mut this = self.0.try_lock().expect("should not deadlock"); this.set_application(options.get("application_name").map(SmolStr::from)); if let Some(user) = options.get("user") { @@ -163,7 +164,7 @@ impl RequestMonitoring { this.pg_options = Some(options); } - pub fn set_project(&self, x: MetricsAuxInfo) { + pub(crate) fn set_project(&self, x: MetricsAuxInfo) { let mut this = self.0.try_lock().expect("should not deadlock"); if this.endpoint_id.is_none() { this.set_endpoint_id(x.endpoint_id.as_str().into()); @@ -173,33 +174,33 @@ impl RequestMonitoring { this.set_cold_start_info(x.cold_start_info); } - pub fn set_project_id(&self, project_id: ProjectIdInt) { + pub(crate) fn set_project_id(&self, project_id: ProjectIdInt) { let mut this = self.0.try_lock().expect("should not deadlock"); this.project = Some(project_id); } - pub fn set_endpoint_id(&self, endpoint_id: EndpointId) { + pub(crate) fn set_endpoint_id(&self, endpoint_id: EndpointId) { self.0 .try_lock() .expect("should not deadlock") .set_endpoint_id(endpoint_id); } - pub fn set_dbname(&self, dbname: DbName) { + pub(crate) fn set_dbname(&self, dbname: DbName) { self.0 .try_lock() .expect("should not deadlock") .set_dbname(dbname); } - pub fn set_user(&self, user: RoleName) { + pub(crate) fn set_user(&self, user: RoleName) { self.0 .try_lock() .expect("should not deadlock") .set_user(user); } - pub fn set_auth_method(&self, auth_method: AuthMethod) { + pub(crate) fn set_auth_method(&self, auth_method: AuthMethod) { let mut this = self.0.try_lock().expect("should not deadlock"); this.auth_method = Some(auth_method); } @@ -211,7 +212,7 @@ impl RequestMonitoring { .has_private_peer_addr() } - pub fn set_error_kind(&self, kind: ErrorKind) { + pub(crate) fn set_error_kind(&self, kind: ErrorKind) { let mut this = self.0.try_lock().expect("should not deadlock"); // Do not record errors from the private address to metrics. 
if !this.has_private_peer_addr() { @@ -237,30 +238,30 @@ impl RequestMonitoring { .log_connect(); } - pub fn protocol(&self) -> Protocol { + pub(crate) fn protocol(&self) -> Protocol { self.0.try_lock().expect("should not deadlock").protocol } - pub fn span(&self) -> Span { + pub(crate) fn span(&self) -> Span { self.0.try_lock().expect("should not deadlock").span.clone() } - pub fn session_id(&self) -> Uuid { + pub(crate) fn session_id(&self) -> Uuid { self.0.try_lock().expect("should not deadlock").session_id } - pub fn peer_addr(&self) -> IpAddr { + pub(crate) fn peer_addr(&self) -> IpAddr { self.0.try_lock().expect("should not deadlock").peer_addr } - pub fn cold_start_info(&self) -> ColdStartInfo { + pub(crate) fn cold_start_info(&self) -> ColdStartInfo { self.0 .try_lock() .expect("should not deadlock") .cold_start_info } - pub fn latency_timer_pause(&self, waiting_for: Waiting) -> LatencyTimerPause<'_> { + pub(crate) fn latency_timer_pause(&self, waiting_for: Waiting) -> LatencyTimerPause<'_> { LatencyTimerPause { ctx: self, start: tokio::time::Instant::now(), @@ -268,7 +269,7 @@ impl RequestMonitoring { } } - pub fn success(&self) { + pub(crate) fn success(&self) { self.0 .try_lock() .expect("should not deadlock") @@ -277,7 +278,7 @@ impl RequestMonitoring { } } -pub struct LatencyTimerPause<'a> { +pub(crate) struct LatencyTimerPause<'a> { ctx: &'a RequestMonitoring, start: tokio::time::Instant, waiting_for: Waiting, diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index e5962b35fa..88caa9a316 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -62,8 +62,8 @@ pub struct ParquetUploadArgs { // But after FAILED_UPLOAD_WARN_THRESHOLD retries, we start to log it at WARN // level instead, as repeated failures can mean a more serious problem. If it // fails more than FAILED_UPLOAD_RETRIES times, we give up -pub const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3; -pub const FAILED_UPLOAD_MAX_RETRIES: u32 = 10; +pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3; +pub(crate) const FAILED_UPLOAD_MAX_RETRIES: u32 = 10; // the parquet crate leaves a lot to be desired... // what follows is an attempt to write parquet files with minimal allocs. @@ -73,7 +73,7 @@ pub const FAILED_UPLOAD_MAX_RETRIES: u32 = 10; // * after each rowgroup write, we check the length of the file and upload to s3 if large enough #[derive(parquet_derive::ParquetRecordWriter)] -pub struct RequestData { +pub(crate) struct RequestData { region: &'static str, protocol: &'static str, /// Must be UTC. The derive macro doesn't like the timezones diff --git a/proxy/src/error.rs b/proxy/src/error.rs index fdfe50a494..53f9f75c5b 100644 --- a/proxy/src/error.rs +++ b/proxy/src/error.rs @@ -3,12 +3,12 @@ use std::{error::Error as StdError, fmt, io}; use measured::FixedCardinalityLabel; /// Upcast (almost) any error into an opaque [`io::Error`]. -pub fn io_error(e: impl Into>) -> io::Error { +pub(crate) fn io_error(e: impl Into>) -> io::Error { io::Error::new(io::ErrorKind::Other, e) } /// A small combinator for pluggable error logging. -pub fn log_error(e: E) -> E { +pub(crate) fn log_error(e: E) -> E { tracing::error!("{e}"); e } @@ -19,7 +19,7 @@ pub fn log_error(e: E) -> E { /// NOTE: This trait should not be implemented for [`anyhow::Error`], since it /// is way too convenient and tends to proliferate all across the codebase, /// ultimately leading to accidental leaks of sensitive data. 
-pub trait UserFacingError: ReportableError { +pub(crate) trait UserFacingError: ReportableError { /// Format the error for client, stripping all sensitive info. /// /// Although this might be a no-op for many types, it's highly @@ -64,7 +64,7 @@ pub enum ErrorKind { } impl ErrorKind { - pub fn to_metric_label(&self) -> &'static str { + pub(crate) fn to_metric_label(self) -> &'static str { match self { ErrorKind::User => "user", ErrorKind::ClientDisconnect => "clientdisconnect", @@ -78,7 +78,7 @@ impl ErrorKind { } } -pub trait ReportableError: fmt::Display + Send + 'static { +pub(crate) trait ReportableError: fmt::Display + Send + 'static { fn get_error_kind(&self) -> ErrorKind; } diff --git a/proxy/src/http.rs b/proxy/src/http.rs index 1f1dd8c415..fee634f67f 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -12,9 +12,9 @@ use http_body_util::BodyExt; use hyper1::body::Body; use serde::de::DeserializeOwned; -pub use reqwest::{Request, Response, StatusCode}; -pub use reqwest_middleware::{ClientWithMiddleware, Error}; -pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; +pub(crate) use reqwest::{Request, Response}; +pub(crate) use reqwest_middleware::{ClientWithMiddleware, Error}; +pub(crate) use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; use crate::{ metrics::{ConsoleRequest, Metrics}, @@ -35,7 +35,7 @@ pub fn new_client() -> ClientWithMiddleware { .build() } -pub fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware { +pub(crate) fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware { let timeout_client = reqwest::ClientBuilder::new() .timeout(default_timout) .build() @@ -77,20 +77,20 @@ impl Endpoint { } #[inline(always)] - pub fn url(&self) -> &ApiUrl { + pub(crate) fn url(&self) -> &ApiUrl { &self.endpoint } /// Return a [builder](RequestBuilder) for a `GET` request, /// appending a single `path` segment to the base endpoint URL. - pub fn get(&self, path: &str) -> RequestBuilder { + pub(crate) fn get(&self, path: &str) -> RequestBuilder { let mut url = self.endpoint.clone(); url.path_segments_mut().push(path); self.client.get(url.into_inner()) } /// Execute a [request](reqwest::Request). 
- pub async fn execute(&self, request: Request) -> Result { + pub(crate) async fn execute(&self, request: Request) -> Result { let _timer = Metrics::get() .proxy .console_request_latency @@ -102,7 +102,7 @@ impl Endpoint { } } -pub async fn parse_json_body_with_limit( +pub(crate) async fn parse_json_body_with_limit( mut b: impl Body + Unpin, limit: usize, ) -> anyhow::Result { diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs index d418caa511..e5144cfe2e 100644 --- a/proxy/src/intern.rs +++ b/proxy/src/intern.rs @@ -29,10 +29,10 @@ impl std::fmt::Display for InternedString { } impl InternedString { - pub fn as_str(&self) -> &'static str { + pub(crate) fn as_str(&self) -> &'static str { Id::get_interner().inner.resolve(&self.inner) } - pub fn get(s: &str) -> Option { + pub(crate) fn get(s: &str) -> Option { Id::get_interner().get(s) } } @@ -78,7 +78,7 @@ impl serde::Serialize for InternedString { } impl StringInterner { - pub fn new() -> Self { + pub(crate) fn new() -> Self { StringInterner { inner: ThreadedRodeo::with_capacity_memory_limits_and_hasher( Capacity::new(2500, NonZeroUsize::new(1 << 16).unwrap()), @@ -90,26 +90,24 @@ impl StringInterner { } } - pub fn is_empty(&self) -> bool { - self.inner.is_empty() - } - - pub fn len(&self) -> usize { + #[cfg(test)] + fn len(&self) -> usize { self.inner.len() } - pub fn current_memory_usage(&self) -> usize { + #[cfg(test)] + fn current_memory_usage(&self) -> usize { self.inner.current_memory_usage() } - pub fn get_or_intern(&self, s: &str) -> InternedString { + pub(crate) fn get_or_intern(&self, s: &str) -> InternedString { InternedString { inner: self.inner.get_or_intern(s), _id: PhantomData, } } - pub fn get(&self, s: &str) -> Option> { + pub(crate) fn get(&self, s: &str) -> Option> { Some(InternedString { inner: self.inner.get(s)?, _id: PhantomData, @@ -132,14 +130,14 @@ impl Default for StringInterner { } #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] -pub struct RoleNameTag; +pub(crate) struct RoleNameTag; impl InternId for RoleNameTag { fn get_interner() -> &'static StringInterner { - pub static ROLE_NAMES: OnceLock> = OnceLock::new(); + static ROLE_NAMES: OnceLock> = OnceLock::new(); ROLE_NAMES.get_or_init(Default::default) } } -pub type RoleNameInt = InternedString; +pub(crate) type RoleNameInt = InternedString; impl From<&RoleName> for RoleNameInt { fn from(value: &RoleName) -> Self { RoleNameTag::get_interner().get_or_intern(value) @@ -150,7 +148,7 @@ impl From<&RoleName> for RoleNameInt { pub struct EndpointIdTag; impl InternId for EndpointIdTag { fn get_interner() -> &'static StringInterner { - pub static ROLE_NAMES: OnceLock> = OnceLock::new(); + static ROLE_NAMES: OnceLock> = OnceLock::new(); ROLE_NAMES.get_or_init(Default::default) } } @@ -170,7 +168,7 @@ impl From for EndpointIdInt { pub struct BranchIdTag; impl InternId for BranchIdTag { fn get_interner() -> &'static StringInterner { - pub static ROLE_NAMES: OnceLock> = OnceLock::new(); + static ROLE_NAMES: OnceLock> = OnceLock::new(); ROLE_NAMES.get_or_init(Default::default) } } @@ -190,7 +188,7 @@ impl From for BranchIdInt { pub struct ProjectIdTag; impl InternId for ProjectIdTag { fn get_interner() -> &'static StringInterner { - pub static ROLE_NAMES: OnceLock> = OnceLock::new(); + static ROLE_NAMES: OnceLock> = OnceLock::new(); ROLE_NAMES.get_or_init(Default::default) } } @@ -217,7 +215,7 @@ mod tests { struct MyId; impl InternId for MyId { fn get_interner() -> &'static StringInterner { - pub static ROLE_NAMES: OnceLock> = OnceLock::new(); + pub(crate) static 
ROLE_NAMES: OnceLock> = OnceLock::new(); ROLE_NAMES.get_or_init(Default::default) } } diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index 1e14ca59ec..8d7e586b3d 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -157,7 +157,8 @@ macro_rules! smol_str_wrapper { pub struct $name(smol_str::SmolStr); impl $name { - pub fn as_str(&self) -> &str { + #[allow(unused)] + pub(crate) fn as_str(&self) -> &str { self.0.as_str() } } @@ -252,19 +253,19 @@ smol_str_wrapper!(Host); // Endpoints are a bit tricky. Rare they might be branches or projects. impl EndpointId { - pub fn is_endpoint(&self) -> bool { + pub(crate) fn is_endpoint(&self) -> bool { self.0.starts_with("ep-") } - pub fn is_branch(&self) -> bool { + pub(crate) fn is_branch(&self) -> bool { self.0.starts_with("br-") } - pub fn is_project(&self) -> bool { - !self.is_endpoint() && !self.is_branch() - } - pub fn as_branch(&self) -> BranchId { + // pub(crate) fn is_project(&self) -> bool { + // !self.is_endpoint() && !self.is_branch() + // } + pub(crate) fn as_branch(&self) -> BranchId { BranchId(self.0.clone()) } - pub fn as_project(&self) -> ProjectId { + pub(crate) fn as_project(&self) -> ProjectId { ProjectId(self.0.clone()) } } diff --git a/proxy/src/parse.rs b/proxy/src/parse.rs index 0d03574901..8c0f251066 100644 --- a/proxy/src/parse.rs +++ b/proxy/src/parse.rs @@ -2,14 +2,14 @@ use std::ffi::CStr; -pub fn split_cstr(bytes: &[u8]) -> Option<(&CStr, &[u8])> { +pub(crate) fn split_cstr(bytes: &[u8]) -> Option<(&CStr, &[u8])> { let cstr = CStr::from_bytes_until_nul(bytes).ok()?; let (_, other) = bytes.split_at(cstr.to_bytes_with_nul().len()); Some((cstr, other)) } /// See . -pub fn split_at_const(bytes: &[u8]) -> Option<(&[u8; N], &[u8])> { +pub(crate) fn split_at_const(bytes: &[u8]) -> Option<(&[u8; N], &[u8])> { (bytes.len() >= N).then(|| { let (head, tail) = bytes.split_at(N); (head.try_into().unwrap(), tail) diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index 1dd4563514..17764f78d1 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -13,9 +13,9 @@ use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf}; pin_project! 
{ /// A chained [`AsyncRead`] with [`AsyncWrite`] passthrough - pub struct ChainRW { + pub(crate) struct ChainRW { #[pin] - pub inner: T, + pub(crate) inner: T, buf: BytesMut, } } @@ -60,7 +60,7 @@ const HEADER: [u8; 12] = [ 0x0D, 0x0A, 0x0D, 0x0A, 0x00, 0x0D, 0x0A, 0x51, 0x55, 0x49, 0x54, 0x0A, ]; -pub async fn read_proxy_protocol( +pub(crate) async fn read_proxy_protocol( mut read: T, ) -> std::io::Result<(ChainRW, Option)> { let mut buf = BytesMut::with_capacity(128); diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index aa1025a29f..ff199ac701 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -1,12 +1,12 @@ #[cfg(test)] mod tests; -pub mod connect_compute; +pub(crate) mod connect_compute; mod copy_bidirectional; -pub mod handshake; -pub mod passthrough; -pub mod retry; -pub mod wake_compute; +pub(crate) mod handshake; +pub(crate) mod passthrough; +pub(crate) mod retry; +pub(crate) mod wake_compute; pub use copy_bidirectional::copy_bidirectional_client_compute; pub use copy_bidirectional::ErrorSource; @@ -170,21 +170,21 @@ pub async fn task_main( Ok(()) } -pub enum ClientMode { +pub(crate) enum ClientMode { Tcp, Websockets { hostname: Option }, } /// Abstracts the logic of handling TCP vs WS clients impl ClientMode { - pub fn allow_cleartext(&self) -> bool { + pub(crate) fn allow_cleartext(&self) -> bool { match self { ClientMode::Tcp => false, ClientMode::Websockets { .. } => true, } } - pub fn allow_self_signed_compute(&self, config: &ProxyConfig) -> bool { + pub(crate) fn allow_self_signed_compute(&self, config: &ProxyConfig) -> bool { match self { ClientMode::Tcp => config.allow_self_signed_compute, ClientMode::Websockets { .. } => false, @@ -213,7 +213,7 @@ impl ClientMode { // 2. Handshake: handshake reports errors if it can, otherwise if the handshake fails due to protocol violation, // we cannot be sure the client even understands our error message // 3. PrepareClient: The client disconnected, so we can't tell them anyway... -pub enum ClientRequestError { +pub(crate) enum ClientRequestError { #[error("{0}")] Cancellation(#[from] cancellation::CancelError), #[error("{0}")] @@ -238,7 +238,7 @@ impl ReportableError for ClientRequestError { } } -pub async fn handle_client( +pub(crate) async fn handle_client( config: &'static ProxyConfig, ctx: &RequestMonitoring, cancellation_handler: Arc, @@ -340,9 +340,9 @@ pub async fn handle_client( client: stream, aux: node.aux.clone(), compute: node, - req: request_gauge, - conn: conn_gauge, - cancel: session, + _req: request_gauge, + _conn: conn_gauge, + _cancel: session, })) } @@ -377,20 +377,20 @@ async fn prepare_client_connection
( } #[derive(Debug, Clone, PartialEq, Eq, Default)] -pub struct NeonOptions(Vec<(SmolStr, SmolStr)>); +pub(crate) struct NeonOptions(Vec<(SmolStr, SmolStr)>); impl NeonOptions { - pub fn parse_params(params: &StartupMessageParams) -> Self { + pub(crate) fn parse_params(params: &StartupMessageParams) -> Self { params .options_raw() .map(Self::parse_from_iter) .unwrap_or_default() } - pub fn parse_options_raw(options: &str) -> Self { + pub(crate) fn parse_options_raw(options: &str) -> Self { Self::parse_from_iter(StartupMessageParams::parse_options_raw(options)) } - pub fn is_ephemeral(&self) -> bool { + pub(crate) fn is_ephemeral(&self) -> bool { // Currently, neon endpoint options are all reserved for ephemeral endpoints. !self.0.is_empty() } @@ -404,7 +404,7 @@ impl NeonOptions { Self(options) } - pub fn get_cache_key(&self, prefix: &str) -> EndpointCacheKey { + pub(crate) fn get_cache_key(&self, prefix: &str) -> EndpointCacheKey { // prefix + format!(" {k}:{v}") // kinda jank because SmolStr is immutable std::iter::once(prefix) @@ -415,7 +415,7 @@ impl NeonOptions { /// DeepObject format /// `paramName[prop1]=value1¶mName[prop2]=value2&...` - pub fn to_deep_object(&self) -> Vec<(SmolStr, SmolStr)> { + pub(crate) fn to_deep_object(&self) -> Vec<(SmolStr, SmolStr)> { self.0 .iter() .map(|(k, v)| (format_smolstr!("options[{}]", k), v.clone())) @@ -423,7 +423,7 @@ impl NeonOptions { } } -pub fn neon_option(bytes: &str) -> Option<(&str, &str)> { +pub(crate) fn neon_option(bytes: &str) -> Option<(&str, &str)> { static RE: OnceCell = OnceCell::new(); let re = RE.get_or_init(|| Regex::new(r"^neon_(\w+):(.+)").unwrap()); diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 6305dc204e..613548d4a0 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -25,7 +25,7 @@ const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2); /// (e.g. the compute node's address might've changed at the wrong time). /// Invalidate the cache entry (if any) to prevent subsequent errors. #[tracing::instrument(name = "invalidate_cache", skip_all)] -pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> NodeInfo { +pub(crate) fn invalidate_cache(node_info: console::CachedNodeInfo) -> NodeInfo { let is_cached = node_info.cached(); if is_cached { warn!("invalidating stalled compute node info cache entry"); @@ -41,7 +41,7 @@ pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> NodeInfo { } #[async_trait] -pub trait ConnectMechanism { +pub(crate) trait ConnectMechanism { type Connection; type ConnectError: ReportableError; type Error: From; @@ -56,7 +56,7 @@ pub trait ConnectMechanism { } #[async_trait] -pub trait ComputeConnectBackend { +pub(crate) trait ComputeConnectBackend { async fn wake_compute( &self, ctx: &RequestMonitoring, @@ -65,12 +65,12 @@ pub trait ComputeConnectBackend { fn get_keys(&self) -> &ComputeCredentialKeys; } -pub struct TcpMechanism<'a> { +pub(crate) struct TcpMechanism<'a> { /// KV-dictionary with PostgreSQL connection params. - pub params: &'a StartupMessageParams, + pub(crate) params: &'a StartupMessageParams, /// connect_to_compute concurrency lock - pub locks: &'static ApiLocks, + pub(crate) locks: &'static ApiLocks, } #[async_trait] @@ -98,7 +98,7 @@ impl ConnectMechanism for TcpMechanism<'_> { /// Try to connect to the compute node, retrying if necessary. 
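
The NeonOptions hunk above collects `neon_*` key/value pairs from the startup `options` parameter and later re-encodes them in DeepObject form (`options[key]=value`). Below is a minimal std-only sketch of that round trip; it substitutes plain `String` for `SmolStr` and a hand-rolled check for the `^neon_(\w+):(.+)` regex, so it illustrates the shape of the logic rather than the proxy's exact behaviour.

    /// Extract ("key", "value") from an option shaped like "neon_key:value",
    /// mirroring the `^neon_(\w+):(.+)` regex used in the hunk above.
    fn neon_option(opt: &str) -> Option<(&str, &str)> {
        let rest = opt.strip_prefix("neon_")?;
        let (key, value) = rest.split_once(':')?;
        if key.is_empty()
            || value.is_empty()
            || !key.chars().all(|c| c.is_alphanumeric() || c == '_')
        {
            return None;
        }
        Some((key, value))
    }

    /// Re-encode collected options in "DeepObject" form: options[key]=value.
    fn to_deep_object(options: &[(&str, &str)]) -> Vec<(String, String)> {
        options
            .iter()
            .map(|(k, v)| (format!("options[{k}]"), v.to_string()))
            .collect()
    }

    fn main() {
        let raw = ["neon_endpoint_type:read_write", "statement_timeout=0"];
        let parsed: Vec<_> = raw.iter().filter_map(|o| neon_option(o)).collect();
        assert_eq!(parsed, vec![("endpoint_type", "read_write")]);
        assert_eq!(
            to_deep_object(&parsed),
            vec![("options[endpoint_type]".to_string(), "read_write".to_string())]
        );
    }

Non-`neon_` options (like `statement_timeout` above) are simply ignored by this filter, which matches the intent that only neon-specific options participate in cache keys and wake requests.
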
#[tracing::instrument(skip_all)] -pub async fn connect_to_compute( +pub(crate) async fn connect_to_compute( ctx: &RequestMonitoring, mechanism: &M, user_info: &B, diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs index f8c8e8bc4b..4ebda013ac 100644 --- a/proxy/src/proxy/copy_bidirectional.rs +++ b/proxy/src/proxy/copy_bidirectional.rs @@ -14,7 +14,7 @@ enum TransferState { } #[derive(Debug)] -pub enum ErrorDirection { +pub(crate) enum ErrorDirection { Read(io::Error), Write(io::Error), } diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs index 27a72f8072..5996b11c11 100644 --- a/proxy/src/proxy/handshake.rs +++ b/proxy/src/proxy/handshake.rs @@ -18,7 +18,7 @@ use crate::{ }; #[derive(Error, Debug)] -pub enum HandshakeError { +pub(crate) enum HandshakeError { #[error("data is sent before server replied with EncryptionResponse")] EarlyData, @@ -57,7 +57,7 @@ impl ReportableError for HandshakeError { } } -pub enum HandshakeData { +pub(crate) enum HandshakeData { Startup(PqStream>, StartupMessageParams), Cancel(CancelKeyData), } @@ -67,7 +67,7 @@ pub enum HandshakeData { /// It's easier to work with owned `stream` here as we need to upgrade it to TLS; /// we also take an extra care of propagating only the select handshake errors to client. #[tracing::instrument(skip_all)] -pub async fn handshake( +pub(crate) async fn handshake( ctx: &RequestMonitoring, stream: S, mut tls: Option<&TlsConfig>, diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index 9942fac383..c17108de0a 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -14,7 +14,7 @@ use super::copy_bidirectional::ErrorSource; /// Forward bytes in both directions (client <-> compute). #[tracing::instrument(skip_all)] -pub async fn proxy_pass( +pub(crate) async fn proxy_pass( client: impl AsyncRead + AsyncWrite + Unpin, compute: impl AsyncRead + AsyncWrite + Unpin, aux: MetricsAuxInfo, @@ -57,18 +57,18 @@ pub async fn proxy_pass( Ok(()) } -pub struct ProxyPassthrough { - pub client: Stream, - pub compute: PostgresConnection, - pub aux: MetricsAuxInfo, +pub(crate) struct ProxyPassthrough { + pub(crate) client: Stream, + pub(crate) compute: PostgresConnection, + pub(crate) aux: MetricsAuxInfo, - pub req: NumConnectionRequestsGuard<'static>, - pub conn: NumClientConnectionsGuard<'static>, - pub cancel: cancellation::Session
, + pub(crate) _req: NumConnectionRequestsGuard<'static>, + pub(crate) _conn: NumClientConnectionsGuard<'static>, + pub(crate) _cancel: cancellation::Session
, } impl ProxyPassthrough { - pub async fn proxy_pass(self) -> Result<(), ErrorSource> { + pub(crate) async fn proxy_pass(self) -> Result<(), ErrorSource> { let res = proxy_pass(self.client, self.compute.stream, self.aux).await; if let Err(err) = self.compute.cancel_closure.try_cancel_query().await { tracing::error!(?err, "could not cancel the query in the database"); diff --git a/proxy/src/proxy/retry.rs b/proxy/src/proxy/retry.rs index 644b183a91..15895d37e6 100644 --- a/proxy/src/proxy/retry.rs +++ b/proxy/src/proxy/retry.rs @@ -2,18 +2,18 @@ use crate::{compute, config::RetryConfig}; use std::{error::Error, io}; use tokio::time; -pub trait CouldRetry { +pub(crate) trait CouldRetry { /// Returns true if the error could be retried fn could_retry(&self) -> bool; } -pub trait ShouldRetryWakeCompute { +pub(crate) trait ShouldRetryWakeCompute { /// Returns true if we need to invalidate the cache for this node. /// If false, we can continue retrying with the current node cache. fn should_retry_wake_compute(&self) -> bool; } -pub fn should_retry(err: &impl CouldRetry, num_retries: u32, config: RetryConfig) -> bool { +pub(crate) fn should_retry(err: &impl CouldRetry, num_retries: u32, config: RetryConfig) -> bool { num_retries < config.max_retries && err.could_retry() } @@ -101,7 +101,7 @@ impl ShouldRetryWakeCompute for compute::ConnectionError { } } -pub fn retry_after(num_retries: u32, config: RetryConfig) -> time::Duration { +pub(crate) fn retry_after(num_retries: u32, config: RetryConfig) -> time::Duration { config .base_delay .mul_f64(config.backoff_factor.powi((num_retries as i32) - 1)) diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 21c0641a7f..b3b284ef27 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -11,14 +11,14 @@ use crate::auth::backend::{ ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, MaybeOwned, TestBackend, }; use crate::config::{CertResolver, RetryConfig}; -use crate::console::caches::NodeInfoCache; use crate::console::messages::{ConsoleError, Details, MetricsAuxInfo, Status}; -use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend}; +use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend, NodeInfoCache}; use crate::console::{self, CachedNodeInfo, NodeInfo}; use crate::error::ErrorKind; -use crate::{http, sasl, scram, BranchId, EndpointId, ProjectId}; +use crate::{sasl, scram, BranchId, EndpointId, ProjectId}; use anyhow::{bail, Context}; use async_trait::async_trait; +use http::StatusCode; use retry::{retry_after, ShouldRetryWakeCompute}; use rstest::rstest; use rustls::pki_types; @@ -491,7 +491,7 @@ impl TestBackend for TestConnectMechanism { ConnectAction::Wake => Ok(helper_create_cached_node_info(self.cache)), ConnectAction::WakeFail => { let err = console::errors::ApiError::Console(ConsoleError { - http_status_code: http::StatusCode::BAD_REQUEST, + http_status_code: StatusCode::BAD_REQUEST, error: "TEST".into(), status: None, }); @@ -500,7 +500,7 @@ impl TestBackend for TestConnectMechanism { } ConnectAction::WakeRetry => { let err = console::errors::ApiError::Console(ConsoleError { - http_status_code: http::StatusCode::BAD_REQUEST, + http_status_code: StatusCode::BAD_REQUEST, error: "TEST".into(), status: Some(Status { code: "error".into(), @@ -525,9 +525,6 @@ impl TestBackend for TestConnectMechanism { { unimplemented!("not used in tests") } - fn get_role_secret(&self) -> Result { - unimplemented!("not used in tests") - } } fn 
helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeInfo { diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index 71f07f4682..33a2162bc7 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -102,7 +102,7 @@ async fn proxy_mitm( } /// taken from tokio-postgres -pub async fn connect_tls(mut stream: S, tls: T) -> T::Stream +pub(crate) async fn connect_tls(mut stream: S, tls: T) -> T::Stream where S: AsyncRead + AsyncWrite + Unpin, T: TlsConnect, diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index 5b06e8f054..9b8ac6d29d 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -12,7 +12,7 @@ use tracing::{error, info, warn}; use super::connect_compute::ComputeConnectBackend; -pub async fn wake_compute( +pub(crate) async fn wake_compute( num_retries: &mut u32, ctx: &RequestMonitoring, api: &B, diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs index 222cd431d2..e5f5867998 100644 --- a/proxy/src/rate_limiter.rs +++ b/proxy/src/rate_limiter.rs @@ -1,10 +1,16 @@ +mod leaky_bucket; mod limit_algorithm; mod limiter; -pub use limit_algorithm::{ - aimd::Aimd, DynamicLimiter, Outcome, RateLimitAlgorithm, RateLimiterConfig, Token, + +#[cfg(test)] +pub(crate) use limit_algorithm::aimd::Aimd; + +pub(crate) use limit_algorithm::{ + DynamicLimiter, Outcome, RateLimitAlgorithm, RateLimiterConfig, Token, }; -pub use limiter::{BucketRateLimiter, GlobalRateLimiter, RateBucketInfo, WakeComputeRateLimiter}; -mod leaky_bucket; +pub(crate) use limiter::GlobalRateLimiter; + pub use leaky_bucket::{ EndpointRateLimiter, LeakyBucketConfig, LeakyBucketRateLimiter, LeakyBucketState, }; +pub use limiter::{BucketRateLimiter, RateBucketInfo, WakeComputeRateLimiter}; diff --git a/proxy/src/rate_limiter/leaky_bucket.rs b/proxy/src/rate_limiter/leaky_bucket.rs index f184e18f4c..fa8cb75256 100644 --- a/proxy/src/rate_limiter/leaky_bucket.rs +++ b/proxy/src/rate_limiter/leaky_bucket.rs @@ -35,7 +35,7 @@ impl LeakyBucketRateLimiter { } /// Check that number of connections to the endpoint is below `max_rps` rps. - pub fn check(&self, key: K, n: u32) -> bool { + pub(crate) fn check(&self, key: K, n: u32) -> bool { let now = Instant::now(); if self.access_count.fetch_add(1, Ordering::AcqRel) % 2048 == 0 { @@ -73,8 +73,9 @@ pub struct LeakyBucketState { time: Instant, } +#[cfg(test)] impl LeakyBucketConfig { - pub fn new(rps: f64, max: f64) -> Self { + pub(crate) fn new(rps: f64, max: f64) -> Self { assert!(rps > 0.0, "rps must be positive"); assert!(max > 0.0, "max must be positive"); Self { rps, max } @@ -82,7 +83,7 @@ impl LeakyBucketConfig { } impl LeakyBucketState { - pub fn new() -> Self { + pub(crate) fn new() -> Self { Self { filled: 0.0, time: Instant::now(), @@ -100,7 +101,7 @@ impl LeakyBucketState { self.filled == 0.0 } - pub fn check(&mut self, info: &LeakyBucketConfig, now: Instant, n: f64) -> bool { + pub(crate) fn check(&mut self, info: &LeakyBucketConfig, now: Instant, n: f64) -> bool { self.update(info, now); if self.filled + n > info.max { diff --git a/proxy/src/rate_limiter/limit_algorithm.rs b/proxy/src/rate_limiter/limit_algorithm.rs index bc16837f65..25607b7e10 100644 --- a/proxy/src/rate_limiter/limit_algorithm.rs +++ b/proxy/src/rate_limiter/limit_algorithm.rs @@ -8,13 +8,13 @@ use tokio::{ use self::aimd::Aimd; -pub mod aimd; +pub(crate) mod aimd; /// Whether a job succeeded or failed as a result of congestion/overload. 
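
The leaky-bucket limiter in the hunk above admits a request only if the bucket, after draining at `rps` tokens per second since the last observation, still has room under `max`. A self-contained sketch of that state transition follows; it is simplified from the `LeakyBucketState::check` shown above (the real limiter additionally shards state per endpoint key and garbage-collects idle buckets).

    use std::time::{Duration, Instant};

    /// Config: the bucket drains at `rps` tokens per second and holds at most `max`.
    struct LeakyBucketConfig { rps: f64, max: f64 }

    /// Leaky bucket: requests fill it, elapsed time drains it.
    struct LeakyBucketState { filled: f64, last: Instant }

    impl LeakyBucketState {
        fn new() -> Self { Self { filled: 0.0, last: Instant::now() } }

        /// Returns true if `n` more requests fit right now.
        fn check(&mut self, config: &LeakyBucketConfig, now: Instant, n: f64) -> bool {
            // Drain whatever has leaked out since the last observation.
            let drained = now.duration_since(self.last).as_secs_f64() * config.rps;
            self.filled = (self.filled - drained).max(0.0);
            self.last = now;

            if self.filled + n > config.max {
                return false; // over the burst budget: reject
            }
            self.filled += n;
            true
        }
    }

    fn main() {
        let config = LeakyBucketConfig { rps: 10.0, max: 20.0 };
        let mut state = LeakyBucketState::new();
        let t0 = Instant::now();

        // A burst of 20 is allowed, the 21st request is not...
        assert!((0..20).all(|_| state.check(&config, t0, 1.0)));
        assert!(!state.check(&config, t0, 1.0));

        // ...until enough time has passed for the bucket to drain.
        assert!(state.check(&config, t0 + Duration::from_secs(1), 1.0));
    }
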
/// /// Errors not considered to be caused by overload should be ignored. #[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum Outcome { +pub(crate) enum Outcome { /// The job succeeded, or failed in a way unrelated to overload. Success, /// The job failed because of overload, e.g. it timed out or an explicit backpressure signal @@ -23,14 +23,14 @@ pub enum Outcome { } /// An algorithm for controlling a concurrency limit. -pub trait LimitAlgorithm: Send + Sync + 'static { +pub(crate) trait LimitAlgorithm: Send + Sync + 'static { /// Update the concurrency limit in response to a new job completion. fn update(&self, old_limit: usize, sample: Sample) -> usize; } /// The result of a job (or jobs), including the [`Outcome`] (loss) and latency (delay). #[derive(Debug, Clone, PartialEq, Eq, Copy)] -pub struct Sample { +pub(crate) struct Sample { pub(crate) latency: Duration, /// Jobs in flight when the sample was taken. pub(crate) in_flight: usize, @@ -39,7 +39,7 @@ pub struct Sample { #[derive(Clone, Copy, Debug, Default, serde::Deserialize, PartialEq)] #[serde(rename_all = "snake_case")] -pub enum RateLimitAlgorithm { +pub(crate) enum RateLimitAlgorithm { #[default] Fixed, Aimd { @@ -48,7 +48,7 @@ pub enum RateLimitAlgorithm { }, } -pub struct Fixed; +pub(crate) struct Fixed; impl LimitAlgorithm for Fixed { fn update(&self, old_limit: usize, _sample: Sample) -> usize { @@ -59,12 +59,12 @@ impl LimitAlgorithm for Fixed { #[derive(Clone, Copy, Debug, serde::Deserialize, PartialEq)] pub struct RateLimiterConfig { #[serde(flatten)] - pub algorithm: RateLimitAlgorithm, - pub initial_limit: usize, + pub(crate) algorithm: RateLimitAlgorithm, + pub(crate) initial_limit: usize, } impl RateLimiterConfig { - pub fn create_rate_limit_algorithm(self) -> Box { + pub(crate) fn create_rate_limit_algorithm(self) -> Box { match self.algorithm { RateLimitAlgorithm::Fixed => Box::new(Fixed), RateLimitAlgorithm::Aimd { conf } => Box::new(conf), @@ -72,7 +72,7 @@ impl RateLimiterConfig { } } -pub struct LimiterInner { +pub(crate) struct LimiterInner { alg: Box, available: usize, limit: usize, @@ -114,7 +114,7 @@ impl LimiterInner { /// /// The limit will be automatically adjusted based on observed latency (delay) and/or failures /// caused by overload (loss). -pub struct DynamicLimiter { +pub(crate) struct DynamicLimiter { config: RateLimiterConfig, inner: Mutex, // to notify when a token is available @@ -124,7 +124,7 @@ pub struct DynamicLimiter { /// A concurrency token, required to run a job. /// /// Release the token back to the [`DynamicLimiter`] after the job is complete. -pub struct Token { +pub(crate) struct Token { start: Instant, limiter: Option>, } @@ -133,14 +133,14 @@ pub struct Token { /// /// Not guaranteed to be consistent under high concurrency. #[derive(Debug, Clone, Copy)] -pub struct LimiterState { +#[cfg(test)] +struct LimiterState { limit: usize, - in_flight: usize, } impl DynamicLimiter { /// Create a limiter with a given limit control algorithm. - pub fn new(config: RateLimiterConfig) -> Arc { + pub(crate) fn new(config: RateLimiterConfig) -> Arc { let ready = Notify::new(); ready.notify_one(); @@ -157,7 +157,10 @@ impl DynamicLimiter { } /// Try to acquire a concurrency [Token], waiting for `duration` if there are none available. - pub async fn acquire_timeout(self: &Arc, duration: Duration) -> Result { + pub(crate) async fn acquire_timeout( + self: &Arc, + duration: Duration, + ) -> Result { tokio::time::timeout(duration, self.acquire()).await? 
} @@ -208,12 +211,10 @@ impl DynamicLimiter { } /// The current state of the limiter. - pub fn state(&self) -> LimiterState { + #[cfg(test)] + fn state(&self) -> LimiterState { let inner = self.inner.lock(); - LimiterState { - limit: inner.limit, - in_flight: inner.in_flight, - } + LimiterState { limit: inner.limit } } } @@ -224,22 +225,22 @@ impl Token { limiter: Some(limiter), } } - pub fn disabled() -> Self { + pub(crate) fn disabled() -> Self { Self { start: Instant::now(), limiter: None, } } - pub fn is_disabled(&self) -> bool { + pub(crate) fn is_disabled(&self) -> bool { self.limiter.is_none() } - pub fn release(mut self, outcome: Outcome) { + pub(crate) fn release(mut self, outcome: Outcome) { self.release_mut(Some(outcome)); } - pub fn release_mut(&mut self, outcome: Option) { + pub(crate) fn release_mut(&mut self, outcome: Option) { if let Some(limiter) = self.limiter.take() { limiter.release_inner(self.start, outcome); } @@ -252,13 +253,10 @@ impl Drop for Token { } } +#[cfg(test)] impl LimiterState { /// The current concurrency limit. - pub fn limit(&self) -> usize { + fn limit(self) -> usize { self.limit } - /// The number of jobs in flight. - pub fn in_flight(&self) -> usize { - self.in_flight - } } diff --git a/proxy/src/rate_limiter/limit_algorithm/aimd.rs b/proxy/src/rate_limiter/limit_algorithm/aimd.rs index d669492fa6..86b56e38fb 100644 --- a/proxy/src/rate_limiter/limit_algorithm/aimd.rs +++ b/proxy/src/rate_limiter/limit_algorithm/aimd.rs @@ -10,17 +10,17 @@ use super::{LimitAlgorithm, Outcome, Sample}; /// /// Reduces available concurrency by a factor when load-based errors are detected. #[derive(Clone, Copy, Debug, serde::Deserialize, PartialEq)] -pub struct Aimd { +pub(crate) struct Aimd { /// Minimum limit for AIMD algorithm. - pub min: usize, + pub(crate) min: usize, /// Maximum limit for AIMD algorithm. - pub max: usize, + pub(crate) max: usize, /// Decrease AIMD decrease by value in case of error. - pub dec: f32, + pub(crate) dec: f32, /// Increase AIMD increase by value in case of success. - pub inc: usize, + pub(crate) inc: usize, /// A threshold below which the limit won't be increased. - pub utilisation: f32, + pub(crate) utilisation: f32, } impl LimitAlgorithm for Aimd { diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index 5db4efed37..be529f174d 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -17,13 +17,13 @@ use tracing::info; use crate::intern::EndpointIdInt; -pub struct GlobalRateLimiter { +pub(crate) struct GlobalRateLimiter { data: Vec, info: Vec, } impl GlobalRateLimiter { - pub fn new(info: Vec) -> Self { + pub(crate) fn new(info: Vec) -> Self { Self { data: vec![ RateBucket { @@ -37,7 +37,7 @@ impl GlobalRateLimiter { } /// Check that number of connections is below `max_rps` rps. - pub fn check(&mut self) -> bool { + pub(crate) fn check(&mut self) -> bool { let now = Instant::now(); let should_allow_request = self @@ -96,9 +96,9 @@ impl RateBucket { #[derive(Clone, Copy, PartialEq)] pub struct RateBucketInfo { - pub interval: Duration, + pub(crate) interval: Duration, // requests per interval - pub max_rpi: u32, + pub(crate) max_rpi: u32, } impl std::fmt::Display for RateBucketInfo { @@ -192,7 +192,7 @@ impl BucketRateLimiter { } /// Check that number of connections to the endpoint is below `max_rps` rps. - pub fn check(&self, key: K, n: u32) -> bool { + pub(crate) fn check(&self, key: K, n: u32) -> bool { // do a partial GC every 2k requests. 
This cleans up ~ 1/64th of the map. // worst case memory usage is about: // = 2 * 2048 * 64 * (48B + 72B) @@ -228,7 +228,7 @@ impl BucketRateLimiter { /// Clean the map. Simple strategy: remove all entries in a random shard. /// At worst, we'll double the effective max_rps during the cleanup. /// But that way deletion does not aquire mutex on each entry access. - pub fn do_gc(&self) { + pub(crate) fn do_gc(&self) { info!( "cleaning up bucket rate limiter, current size = {}", self.map.len() diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs index c9a946fa4a..95bdfc0965 100644 --- a/proxy/src/redis/cancellation_publisher.rs +++ b/proxy/src/redis/cancellation_publisher.rs @@ -109,7 +109,7 @@ impl RedisPublisherClient { let _: () = self.client.publish(PROXY_CHANNEL_NAME, payload).await?; Ok(()) } - pub async fn try_connect(&mut self) -> anyhow::Result<()> { + pub(crate) async fn try_connect(&mut self) -> anyhow::Result<()> { match self.client.connect().await { Ok(()) => {} Err(e) => { diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs index c78ee166f1..7d222e2dec 100644 --- a/proxy/src/redis/connection_with_credentials_provider.rs +++ b/proxy/src/redis/connection_with_credentials_provider.rs @@ -81,7 +81,7 @@ impl ConnectionWithCredentialsProvider { redis::cmd("PING").query_async(con).await } - pub async fn connect(&mut self) -> anyhow::Result<()> { + pub(crate) async fn connect(&mut self) -> anyhow::Result<()> { let _guard = self.mutex.lock().await; if let Some(con) = self.con.as_mut() { match Self::ping(con).await { @@ -149,7 +149,7 @@ impl ConnectionWithCredentialsProvider { // PubSub does not support credentials refresh. // Requires manual reconnection every 12h. - pub async fn get_async_pubsub(&self) -> anyhow::Result { + pub(crate) async fn get_async_pubsub(&self) -> anyhow::Result { Ok(self.get_client().await?.get_async_pubsub().await?) } @@ -187,7 +187,10 @@ impl ConnectionWithCredentialsProvider { } /// Sends an already encoded (packed) command into the TCP socket and /// reads the single response from it. - pub async fn send_packed_command(&mut self, cmd: &redis::Cmd) -> RedisResult { + pub(crate) async fn send_packed_command( + &mut self, + cmd: &redis::Cmd, + ) -> RedisResult { // Clone connection to avoid having to lock the ArcSwap in write mode let con = self.con.as_mut().ok_or(redis::RedisError::from(( redis::ErrorKind::IoError, @@ -199,7 +202,7 @@ impl ConnectionWithCredentialsProvider { /// Sends multiple already encoded (packed) command into the TCP socket /// and reads `count` responses from it. This is used to implement /// pipelining. 
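
The comment in the `BucketRateLimiter` hunk above spells out the GC strategy: roughly every 2k requests, wipe one randomly chosen shard of the per-endpoint map rather than tracking per-entry expiry. The toy types below (hypothetical names, plain std `HashMap` shards instead of the concurrent sharded map the proxy uses) illustrate the trade-off: GC costs one shard's worth of work and no per-entry locking, at the price of briefly forgetting that shard's counters.

    use std::collections::HashMap;

    // Hypothetical sharded counter map. GC clears one shard at a time: at worst the
    // counters in that shard restart from zero (briefly doubling the effective
    // budget for those keys), but there is no per-entry expiry bookkeeping.
    struct ShardedCounters {
        shards: Vec<HashMap<String, u32>>,
    }

    impl ShardedCounters {
        fn new(num_shards: usize) -> Self {
            Self { shards: (0..num_shards).map(|_| HashMap::new()).collect() }
        }

        fn shard_of(&self, key: &str) -> usize {
            // toy hash; the real code uses the concurrent map's own hasher
            key.bytes().map(usize::from).sum::<usize>() % self.shards.len()
        }

        fn bump(&mut self, key: &str) -> u32 {
            let shard = self.shard_of(key);
            let counter = self.shards[shard].entry(key.to_owned()).or_insert(0);
            *counter += 1;
            *counter
        }

        /// Partial GC: clear one shard, e.g. a pseudo-random one every ~2k requests.
        fn gc_one_shard(&mut self, shard: usize) {
            self.shards[shard % self.shards.len()].clear();
        }
    }

    fn main() {
        let mut counters = ShardedCounters::new(64);
        for _ in 0..5 {
            counters.bump("ep-busy-endpoint");
        }
        assert_eq!(counters.bump("ep-busy-endpoint"), 6);

        let shard = counters.shard_of("ep-busy-endpoint");
        counters.gc_one_shard(shard);
        assert_eq!(counters.bump("ep-busy-endpoint"), 1); // counter was reset by GC
    }
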
- pub async fn send_packed_commands( + pub(crate) async fn send_packed_commands( &mut self, cmd: &redis::Pipeline, offset: usize, diff --git a/proxy/src/redis/elasticache.rs b/proxy/src/redis/elasticache.rs index eded8250af..d118c8f412 100644 --- a/proxy/src/redis/elasticache.rs +++ b/proxy/src/redis/elasticache.rs @@ -51,7 +51,7 @@ impl CredentialsProvider { credentials_provider, } } - pub async fn provide_credentials(&self) -> anyhow::Result<(String, String)> { + pub(crate) async fn provide_credentials(&self) -> anyhow::Result<(String, String)> { let aws_credentials = self .credentials_provider .provide_credentials() diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 31c0e62c2c..36a3443603 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -58,9 +58,9 @@ pub(crate) struct PasswordUpdate { } #[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] pub(crate) struct CancelSession { - pub region_id: Option, - pub cancel_key_data: CancelKeyData, - pub session_id: Uuid, + pub(crate) region_id: Option, + pub(crate) cancel_key_data: CancelKeyData, + pub(crate) session_id: Uuid, } fn deserialize_json_string<'de, D, T>(deserializer: D) -> Result @@ -89,7 +89,7 @@ impl Clone for MessageHandler { } impl MessageHandler { - pub fn new( + pub(crate) fn new( cache: Arc, cancellation_handler: Arc>, region_id: String, @@ -100,10 +100,10 @@ impl MessageHandler { region_id, } } - pub async fn increment_active_listeners(&self) { + pub(crate) async fn increment_active_listeners(&self) { self.cache.increment_active_listeners().await; } - pub async fn decrement_active_listeners(&self) { + pub(crate) async fn decrement_active_listeners(&self) { self.cache.decrement_active_listeners().await; } #[tracing::instrument(skip(self, msg), fields(session_id = tracing::field::Empty))] diff --git a/proxy/src/sasl.rs b/proxy/src/sasl.rs index 60207fc824..0a36694359 100644 --- a/proxy/src/sasl.rs +++ b/proxy/src/sasl.rs @@ -14,13 +14,13 @@ use crate::error::{ReportableError, UserFacingError}; use std::io; use thiserror::Error; -pub use channel_binding::ChannelBinding; -pub use messages::FirstMessage; -pub use stream::{Outcome, SaslStream}; +pub(crate) use channel_binding::ChannelBinding; +pub(crate) use messages::FirstMessage; +pub(crate) use stream::{Outcome, SaslStream}; /// Fine-grained auth errors help in writing tests. #[derive(Error, Debug)] -pub enum Error { +pub(crate) enum Error { #[error("Channel binding failed: {0}")] ChannelBindingFailed(&'static str), @@ -64,11 +64,11 @@ impl ReportableError for Error { } /// A convenient result type for SASL exchange. -pub type Result = std::result::Result; +pub(crate) type Result = std::result::Result; /// A result of one SASL exchange. #[must_use] -pub enum Step { +pub(crate) enum Step { /// We should continue exchanging messages. Continue(T, String), /// The client has been authenticated successfully. @@ -78,7 +78,7 @@ pub enum Step { } /// Every SASL mechanism (e.g. [SCRAM](crate::scram)) is expected to implement this trait. -pub trait Mechanism: Sized { +pub(crate) trait Mechanism: Sized { /// What's produced as a result of successful authentication. type Output; diff --git a/proxy/src/sasl/channel_binding.rs b/proxy/src/sasl/channel_binding.rs index 6e2d3057ce..fdd011448e 100644 --- a/proxy/src/sasl/channel_binding.rs +++ b/proxy/src/sasl/channel_binding.rs @@ -2,7 +2,7 @@ /// Channel binding flag (possibly with params). 
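
The sasl.rs hunk above models every authentication mechanism as a small state machine: each exchange step either continues with a new state plus a challenge, succeeds with an output, or fails. The real `Mechanism` trait's method signature is not fully visible in this rendering, so the following is an illustrative reconstruction of the pattern with a toy mechanism, not the proxy's SCRAM implementation.

    /// Result of one SASL round-trip (mirrors the `Step` enum above).
    enum Step<T, R> {
        Continue(T, String), // next state + challenge to send to the client
        Success(R, String),  // output + final server message
        Failure(&'static str),
    }

    /// A mechanism is a little state machine fed client messages one by one.
    trait Mechanism: Sized {
        type Output;
        fn exchange(self, input: &str) -> Step<Self, Self::Output>;
    }

    /// Toy single-step mechanism: "PASSWORD <secret>" authenticates immediately.
    struct Toy { secret: &'static str }

    impl Mechanism for Toy {
        type Output = ();
        fn exchange(self, input: &str) -> Step<Self, ()> {
            match input.strip_prefix("PASSWORD ") {
                Some(p) if p == self.secret => Step::Success((), "ok".to_owned()),
                Some(_) => Step::Failure("wrong password"),
                None => Step::Continue(self, "send: PASSWORD <secret>".to_owned()),
            }
        }
    }

    /// Drive a mechanism to completion over a sequence of client messages.
    fn authenticate<M: Mechanism>(mut m: M, msgs: &[&str]) -> Result<M::Output, &'static str> {
        for &msg in msgs {
            match m.exchange(msg) {
                Step::Continue(next, _challenge) => m = next,
                Step::Success(out, _final) => return Ok(out),
                Step::Failure(reason) => return Err(reason),
            }
        }
        Err("client stopped responding")
    }

    fn main() {
        let mech = Toy { secret: "hunter2" };
        assert!(authenticate(mech, &["hello", "PASSWORD hunter2"]).is_ok());
    }
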
#[derive(Debug, PartialEq, Eq)] -pub enum ChannelBinding { +pub(crate) enum ChannelBinding { /// Client doesn't support channel binding. NotSupportedClient, /// Client thinks server doesn't support channel binding. @@ -12,7 +12,10 @@ pub enum ChannelBinding { } impl ChannelBinding { - pub fn and_then(self, f: impl FnOnce(T) -> Result) -> Result, E> { + pub(crate) fn and_then( + self, + f: impl FnOnce(T) -> Result, + ) -> Result, E> { Ok(match self { Self::NotSupportedClient => ChannelBinding::NotSupportedClient, Self::NotSupportedServer => ChannelBinding::NotSupportedServer, @@ -23,7 +26,7 @@ impl ChannelBinding { impl<'a> ChannelBinding<&'a str> { // NB: FromStr doesn't work with lifetimes - pub fn parse(input: &'a str) -> Option { + pub(crate) fn parse(input: &'a str) -> Option { Some(match input { "n" => Self::NotSupportedClient, "y" => Self::NotSupportedServer, @@ -34,7 +37,7 @@ impl<'a> ChannelBinding<&'a str> { impl ChannelBinding { /// Encode channel binding data as base64 for subsequent checks. - pub fn encode<'a, E>( + pub(crate) fn encode<'a, E>( &self, get_cbind_data: impl FnOnce(&T) -> Result<&'a [u8], E>, ) -> Result, E> { diff --git a/proxy/src/sasl/messages.rs b/proxy/src/sasl/messages.rs index 2b5ae1785d..6c9a42b2db 100644 --- a/proxy/src/sasl/messages.rs +++ b/proxy/src/sasl/messages.rs @@ -5,16 +5,16 @@ use pq_proto::{BeAuthenticationSaslMessage, BeMessage}; /// SASL-specific payload of [`PasswordMessage`](pq_proto::FeMessage::PasswordMessage). #[derive(Debug)] -pub struct FirstMessage<'a> { +pub(crate) struct FirstMessage<'a> { /// Authentication method, e.g. `"SCRAM-SHA-256"`. - pub method: &'a str, + pub(crate) method: &'a str, /// Initial client message. - pub message: &'a str, + pub(crate) message: &'a str, } impl<'a> FirstMessage<'a> { // NB: FromStr doesn't work with lifetimes - pub fn parse(bytes: &'a [u8]) -> Option { + pub(crate) fn parse(bytes: &'a [u8]) -> Option { let (method_cstr, tail) = split_cstr(bytes)?; let method = method_cstr.to_str().ok()?; diff --git a/proxy/src/sasl/stream.rs b/proxy/src/sasl/stream.rs index 9115b0f61a..b6becd28e1 100644 --- a/proxy/src/sasl/stream.rs +++ b/proxy/src/sasl/stream.rs @@ -7,7 +7,7 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; /// Abstracts away all peculiarities of the libpq's protocol. -pub struct SaslStream<'a, S> { +pub(crate) struct SaslStream<'a, S> { /// The underlying stream. stream: &'a mut PqStream, /// Current password message we received from client. @@ -17,7 +17,7 @@ pub struct SaslStream<'a, S> { } impl<'a, S> SaslStream<'a, S> { - pub fn new(stream: &'a mut PqStream, first: &'a str) -> Self { + pub(crate) fn new(stream: &'a mut PqStream, first: &'a str) -> Self { Self { stream, current: bytes::Bytes::new(), @@ -53,7 +53,7 @@ impl SaslStream<'_, S> { /// It's much easier to match on those two variants /// than to peek into a noisy protocol error type. #[must_use = "caller must explicitly check for success"] -pub enum Outcome { +pub(crate) enum Outcome { /// Authentication succeeded and produced some value. Success(R), /// Authentication failed (reason attached). @@ -63,7 +63,7 @@ pub enum Outcome { impl SaslStream<'_, S> { /// Perform SASL message exchange according to the underlying algorithm /// until user is either authenticated or denied access. 
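
The channel-binding flag parsed in the hunk above is the GS2 prefix of a SCRAM client-first-message: `n` (client does not support binding), `y` (client assumes the server does not), or `p=<type>` (binding required). The tail of the `parse` match is cut off in this rendering, so the last arm below is a plausible reconstruction rather than a verbatim copy.

    /// GS2 channel-binding flag from the start of a SCRAM client-first-message.
    #[derive(Debug, PartialEq)]
    enum ChannelBinding<T> {
        NotSupportedClient, // "n"
        NotSupportedServer, // "y"
        Required(T),        // "p=<cb-name>"
    }

    impl<'a> ChannelBinding<&'a str> {
        fn parse(input: &'a str) -> Option<Self> {
            Some(match input {
                "n" => Self::NotSupportedClient,
                "y" => Self::NotSupportedServer,
                other => Self::Required(other.strip_prefix("p=")?),
            })
        }
    }

    fn main() {
        // First comma-separated field of "p=tls-server-end-point,,n=user,r=nonce"
        let mut fields = "p=tls-server-end-point,,n=user,r=nonce".split(',');
        assert_eq!(
            ChannelBinding::parse(fields.next().unwrap()),
            Some(ChannelBinding::Required("tls-server-end-point"))
        );
        assert_eq!(ChannelBinding::parse("n"), Some(ChannelBinding::NotSupportedClient));
        assert_eq!(ChannelBinding::parse("p?"), None); // malformed flag
    }
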
- pub async fn authenticate( + pub(crate) async fn authenticate( mut self, mut mechanism: M, ) -> super::Result> { diff --git a/proxy/src/scram.rs b/proxy/src/scram.rs index 145e727a74..d058f1c3f8 100644 --- a/proxy/src/scram.rs +++ b/proxy/src/scram.rs @@ -15,9 +15,9 @@ mod secret; mod signature; pub mod threadpool; -pub use exchange::{exchange, Exchange}; -pub use key::ScramKey; -pub use secret::ServerSecret; +pub(crate) use exchange::{exchange, Exchange}; +pub(crate) use key::ScramKey; +pub(crate) use secret::ServerSecret; use hmac::{Hmac, Mac}; use sha2::{Digest, Sha256}; @@ -26,8 +26,8 @@ const SCRAM_SHA_256: &str = "SCRAM-SHA-256"; const SCRAM_SHA_256_PLUS: &str = "SCRAM-SHA-256-PLUS"; /// A list of supported SCRAM methods. -pub const METHODS: &[&str] = &[SCRAM_SHA_256_PLUS, SCRAM_SHA_256]; -pub const METHODS_WITHOUT_PLUS: &[&str] = &[SCRAM_SHA_256]; +pub(crate) const METHODS: &[&str] = &[SCRAM_SHA_256_PLUS, SCRAM_SHA_256]; +pub(crate) const METHODS_WITHOUT_PLUS: &[&str] = &[SCRAM_SHA_256]; /// Decode base64 into array without any heap allocations fn base64_decode_array(input: impl AsRef<[u8]>) -> Option<[u8; N]> { diff --git a/proxy/src/scram/countmin.rs b/proxy/src/scram/countmin.rs index 944bb3c83e..255694b33e 100644 --- a/proxy/src/scram/countmin.rs +++ b/proxy/src/scram/countmin.rs @@ -2,7 +2,7 @@ use std::hash::Hash; /// estimator of hash jobs per second. /// -pub struct CountMinSketch { +pub(crate) struct CountMinSketch { // one for each depth hashers: Vec, width: usize, @@ -20,7 +20,7 @@ impl CountMinSketch { /// actual <= estimate /// estimate <= actual + ε * N with probability 1 - δ /// where N is the cardinality of the stream - pub fn with_params(epsilon: f64, delta: f64) -> Self { + pub(crate) fn with_params(epsilon: f64, delta: f64) -> Self { CountMinSketch::new( (std::f64::consts::E / epsilon).ceil() as usize, (1.0_f64 / delta).ln().ceil() as usize, @@ -49,7 +49,7 @@ impl CountMinSketch { } } - pub fn inc_and_return(&mut self, t: &T, x: u32) -> u32 { + pub(crate) fn inc_and_return(&mut self, t: &T, x: u32) -> u32 { let mut min = u32::MAX; for row in 0..self.depth { let col = (self.hashers[row].hash_one(t) as usize) % self.width; @@ -61,7 +61,7 @@ impl CountMinSketch { min } - pub fn reset(&mut self) { + pub(crate) fn reset(&mut self) { self.buckets.clear(); self.buckets.resize(self.width * self.depth, 0); } diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index f2494379a5..7fdadc7038 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -56,14 +56,14 @@ enum ExchangeState { } /// Server's side of SCRAM auth algorithm. -pub struct Exchange<'a> { +pub(crate) struct Exchange<'a> { state: ExchangeState, secret: &'a ServerSecret, tls_server_end_point: config::TlsServerEndPoint, } impl<'a> Exchange<'a> { - pub fn new( + pub(crate) fn new( secret: &'a ServerSecret, nonce: fn() -> [u8; SCRAM_RAW_NONCE_LEN], tls_server_end_point: config::TlsServerEndPoint, @@ -101,7 +101,7 @@ async fn derive_client_key( make_key(b"Client Key").into() } -pub async fn exchange( +pub(crate) async fn exchange( pool: &ThreadPool, endpoint: EndpointIdInt, secret: &ServerSecret, diff --git a/proxy/src/scram/key.rs b/proxy/src/scram/key.rs index 32a3dbd203..fe55ff493b 100644 --- a/proxy/src/scram/key.rs +++ b/proxy/src/scram/key.rs @@ -3,14 +3,14 @@ use subtle::ConstantTimeEq; /// Faithfully taken from PostgreSQL. -pub const SCRAM_KEY_LEN: usize = 32; +pub(crate) const SCRAM_KEY_LEN: usize = 32; /// One of the keys derived from the user's password. 
/// We use the same structure for all keys, i.e. /// `ClientKey`, `StoredKey`, and `ServerKey`. #[derive(Clone, Default, Eq, Debug)] #[repr(transparent)] -pub struct ScramKey { +pub(crate) struct ScramKey { bytes: [u8; SCRAM_KEY_LEN], } @@ -27,11 +27,11 @@ impl ConstantTimeEq for ScramKey { } impl ScramKey { - pub fn sha256(&self) -> Self { + pub(crate) fn sha256(&self) -> Self { super::sha256([self.as_ref()]).into() } - pub fn as_bytes(&self) -> [u8; SCRAM_KEY_LEN] { + pub(crate) fn as_bytes(&self) -> [u8; SCRAM_KEY_LEN] { self.bytes } } diff --git a/proxy/src/scram/messages.rs b/proxy/src/scram/messages.rs index 54157e450d..fd9e77764c 100644 --- a/proxy/src/scram/messages.rs +++ b/proxy/src/scram/messages.rs @@ -8,7 +8,7 @@ use std::fmt; use std::ops::Range; /// Faithfully taken from PostgreSQL. -pub const SCRAM_RAW_NONCE_LEN: usize = 18; +pub(crate) const SCRAM_RAW_NONCE_LEN: usize = 18; /// Although we ignore all extensions, we still have to validate the message. fn validate_sasl_extensions<'a>(parts: impl Iterator) -> Option<()> { @@ -27,18 +27,18 @@ fn validate_sasl_extensions<'a>(parts: impl Iterator) -> Option< } #[derive(Debug)] -pub struct ClientFirstMessage<'a> { +pub(crate) struct ClientFirstMessage<'a> { /// `client-first-message-bare`. - pub bare: &'a str, + pub(crate) bare: &'a str, /// Channel binding mode. - pub cbind_flag: ChannelBinding<&'a str>, + pub(crate) cbind_flag: ChannelBinding<&'a str>, /// Client nonce. - pub nonce: &'a str, + pub(crate) nonce: &'a str, } impl<'a> ClientFirstMessage<'a> { // NB: FromStr doesn't work with lifetimes - pub fn parse(input: &'a str) -> Option { + pub(crate) fn parse(input: &'a str) -> Option { let mut parts = input.split(','); let cbind_flag = ChannelBinding::parse(parts.next()?)?; @@ -77,7 +77,7 @@ impl<'a> ClientFirstMessage<'a> { } /// Build a response to [`ClientFirstMessage`]. - pub fn build_server_first_message( + pub(crate) fn build_server_first_message( &self, nonce: &[u8; SCRAM_RAW_NONCE_LEN], salt_base64: &str, @@ -101,20 +101,20 @@ impl<'a> ClientFirstMessage<'a> { } #[derive(Debug)] -pub struct ClientFinalMessage<'a> { +pub(crate) struct ClientFinalMessage<'a> { /// `client-final-message-without-proof`. - pub without_proof: &'a str, + pub(crate) without_proof: &'a str, /// Channel binding data (base64). - pub channel_binding: &'a str, + pub(crate) channel_binding: &'a str, /// Combined client & server nonce. - pub nonce: &'a str, + pub(crate) nonce: &'a str, /// Client auth proof. - pub proof: [u8; SCRAM_KEY_LEN], + pub(crate) proof: [u8; SCRAM_KEY_LEN], } impl<'a> ClientFinalMessage<'a> { // NB: FromStr doesn't work with lifetimes - pub fn parse(input: &'a str) -> Option { + pub(crate) fn parse(input: &'a str) -> Option { let (without_proof, proof) = input.rsplit_once(',')?; let mut parts = without_proof.split(','); @@ -135,7 +135,7 @@ impl<'a> ClientFinalMessage<'a> { } /// Build a response to [`ClientFinalMessage`]. - pub fn build_server_final_message( + pub(crate) fn build_server_final_message( &self, signature_builder: SignatureBuilder<'_>, server_key: &ScramKey, @@ -153,7 +153,7 @@ impl<'a> ClientFinalMessage<'a> { /// We need to keep a convenient representation of this /// message for the next authentication step. -pub struct OwnedServerFirstMessage { +pub(crate) struct OwnedServerFirstMessage { /// Owned `server-first-message`. message: String, /// Slice into `message`. @@ -163,13 +163,13 @@ pub struct OwnedServerFirstMessage { impl OwnedServerFirstMessage { /// Extract combined nonce from the message. 
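
For the scram/countmin.rs hunk a little earlier: a count-min sketch estimates per-key counts in fixed memory and never underestimates. `with_params(epsilon, delta)` sizes it as width = ceil(e/epsilon) and depth = ceil(ln(1/delta)), so the estimate exceeds the true count by at most epsilon*N (N = total increments) with probability 1 - delta. A compact std-only version of the same idea follows, using `RandomState` hashers where the real code's hasher choice may differ.

    use std::collections::hash_map::RandomState;
    use std::hash::{BuildHasher, Hash};

    struct CountMinSketch {
        hashers: Vec<RandomState>, // one independent hasher per row
        width: usize,
        depth: usize,
        buckets: Vec<u32>, // row-major: `depth` rows of `width` counters
    }

    impl CountMinSketch {
        /// estimate <= actual + epsilon * N with probability 1 - delta,
        /// where N is the total number of increments seen so far.
        fn with_params(epsilon: f64, delta: f64) -> Self {
            let width = (std::f64::consts::E / epsilon).ceil() as usize;
            let depth = (1.0_f64 / delta).ln().ceil() as usize;
            CountMinSketch {
                hashers: (0..depth).map(|_| RandomState::new()).collect(),
                width,
                depth,
                buckets: vec![0; width * depth],
            }
        }

        /// Add `x` to the key's counters and return the (over-)estimate of its count.
        fn inc_and_return<T: Hash>(&mut self, key: &T, x: u32) -> u32 {
            let mut min = u32::MAX;
            for row in 0..self.depth {
                let col = (self.hashers[row].hash_one(key) as usize) % self.width;
                let bucket = &mut self.buckets[row * self.width + col];
                *bucket += x;
                min = min.min(*bucket);
            }
            min
        }
    }

    fn main() {
        // ~2% overestimate with 99% probability, in a few KiB of counters.
        let mut sketch = CountMinSketch::with_params(0.02, 0.01);
        for _ in 0..100 {
            sketch.inc_and_return(&"ep-busy-endpoint", 1);
        }
        let estimate = sketch.inc_and_return(&"ep-busy-endpoint", 1);
        assert!(estimate >= 101); // a count-min sketch never underestimates
    }
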
#[inline(always)] - pub fn nonce(&self) -> &str { + pub(crate) fn nonce(&self) -> &str { &self.message[self.nonce.clone()] } /// Get reference to a text representation of the message. #[inline(always)] - pub fn as_str(&self) -> &str { + pub(crate) fn as_str(&self) -> &str { &self.message } } diff --git a/proxy/src/scram/pbkdf2.rs b/proxy/src/scram/pbkdf2.rs index f690cc7738..d5ed9002ad 100644 --- a/proxy/src/scram/pbkdf2.rs +++ b/proxy/src/scram/pbkdf2.rs @@ -4,7 +4,7 @@ use hmac::{ }; use sha2::Sha256; -pub struct Pbkdf2 { +pub(crate) struct Pbkdf2 { hmac: Hmac, prev: GenericArray, hi: GenericArray, @@ -13,7 +13,7 @@ pub struct Pbkdf2 { // inspired from impl Pbkdf2 { - pub fn start(str: &[u8], salt: &[u8], iterations: u32) -> Self { + pub(crate) fn start(str: &[u8], salt: &[u8], iterations: u32) -> Self { let hmac = Hmac::::new_from_slice(str).expect("HMAC is able to accept all key sizes"); @@ -33,11 +33,11 @@ impl Pbkdf2 { } } - pub fn cost(&self) -> u32 { + pub(crate) fn cost(&self) -> u32 { (self.iterations).clamp(0, 4096) } - pub fn turn(&mut self) -> std::task::Poll<[u8; 32]> { + pub(crate) fn turn(&mut self) -> std::task::Poll<[u8; 32]> { let Self { hmac, prev, diff --git a/proxy/src/scram/secret.rs b/proxy/src/scram/secret.rs index a08cb943c3..8c6a08d432 100644 --- a/proxy/src/scram/secret.rs +++ b/proxy/src/scram/secret.rs @@ -8,22 +8,22 @@ use super::key::ScramKey; /// Server secret is produced from user's password, /// and is used throughout the authentication process. #[derive(Clone, Eq, PartialEq, Debug)] -pub struct ServerSecret { +pub(crate) struct ServerSecret { /// Number of iterations for `PBKDF2` function. - pub iterations: u32, + pub(crate) iterations: u32, /// Salt used to hash user's password. - pub salt_base64: String, + pub(crate) salt_base64: String, /// Hashed `ClientKey`. - pub stored_key: ScramKey, + pub(crate) stored_key: ScramKey, /// Used by client to verify server's signature. - pub server_key: ScramKey, + pub(crate) server_key: ScramKey, /// Should auth fail no matter what? /// This is exactly the case for mocked secrets. - pub doomed: bool, + pub(crate) doomed: bool, } impl ServerSecret { - pub fn parse(input: &str) -> Option { + pub(crate) fn parse(input: &str) -> Option { // SCRAM-SHA-256$:$: let s = input.strip_prefix("SCRAM-SHA-256$")?; let (params, keys) = s.split_once('$')?; @@ -42,7 +42,7 @@ impl ServerSecret { Some(secret) } - pub fn is_password_invalid(&self, client_key: &ScramKey) -> Choice { + pub(crate) fn is_password_invalid(&self, client_key: &ScramKey) -> Choice { // constant time to not leak partial key match client_key.sha256().ct_ne(&self.stored_key) | Choice::from(self.doomed as u8) } @@ -50,7 +50,7 @@ impl ServerSecret { /// To avoid revealing information to an attacker, we use a /// mocked server secret even if the user doesn't exist. /// See `auth-scram.c : mock_scram_secret` for details. - pub fn mock(nonce: [u8; 32]) -> Self { + pub(crate) fn mock(nonce: [u8; 32]) -> Self { Self { // this doesn't reveal much information as we're going to use // iteration count 1 for our generated passwords going forward. @@ -66,7 +66,7 @@ impl ServerSecret { /// Build a new server secret from the prerequisites. /// XXX: We only use this function in tests. 
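
The Pbkdf2 hunk above is shaped so the expensive key derivation can run in slices: `start` captures the state, and each `turn` performs a bounded number of iterations (`cost`, clamped to 4096) and reports `Poll::Pending` until the full count is done, which is what lets the scram thread pool interleave many logins instead of letting one expensive PBKDF2 job hog a worker. The stand-in below keeps only that slicing pattern and replaces HMAC-SHA-256 with a trivial mixing step so it stays std-only and runnable; it is not a usable KDF.

    use std::task::Poll;

    /// Incremental stand-in for PBKDF2: the real code advances an HMAC chain,
    /// here we just fold bytes into a u64 so the example is self-contained.
    struct ChunkedKdf {
        state: u64,
        remaining: u32,
    }

    impl ChunkedKdf {
        fn start(password: &[u8], salt: &[u8], iterations: u32) -> Self {
            let seed = password
                .iter()
                .chain(salt)
                .fold(0xcbf2_9ce4_8422_2325u64, |acc, &b| {
                    (acc ^ u64::from(b)).wrapping_mul(0x100_0000_01b3)
                });
            ChunkedKdf { state: seed, remaining: iterations }
        }

        /// How many iterations one `turn` is allowed to run.
        fn cost(&self) -> u32 {
            self.remaining.clamp(0, 4096)
        }

        /// Run one bounded slice of work; Ready only when all iterations are done.
        fn turn(&mut self) -> Poll<u64> {
            for _ in 0..self.cost() {
                self.state = self.state.rotate_left(5) ^ self.state.wrapping_mul(0x9e37_79b9);
                self.remaining -= 1;
            }
            if self.remaining == 0 {
                Poll::Ready(self.state)
            } else {
                Poll::Pending
            }
        }
    }

    fn main() {
        let mut job = ChunkedKdf::start(b"hunter2", b"salt", 10_000);
        let mut turns = 0;
        let key = loop {
            turns += 1;
            if let Poll::Ready(key) = job.turn() {
                break key;
            }
            // a real thread pool would requeue the job here and run other work
        };
        assert_eq!(turns, 3); // 10_000 iterations split into 4096 + 4096 + 1808
        println!("derived key material: {key:016x}");
    }
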
#[cfg(test)] - pub async fn build(password: &str) -> Option { + pub(crate) async fn build(password: &str) -> Option { Self::parse(&postgres_protocol::password::scram_sha_256(password.as_bytes()).await) } } diff --git a/proxy/src/scram/signature.rs b/proxy/src/scram/signature.rs index 1c2811d757..d3255cf2ca 100644 --- a/proxy/src/scram/signature.rs +++ b/proxy/src/scram/signature.rs @@ -4,14 +4,14 @@ use super::key::{ScramKey, SCRAM_KEY_LEN}; /// A collection of message parts needed to derive the client's signature. #[derive(Debug)] -pub struct SignatureBuilder<'a> { - pub client_first_message_bare: &'a str, - pub server_first_message: &'a str, - pub client_final_message_without_proof: &'a str, +pub(crate) struct SignatureBuilder<'a> { + pub(crate) client_first_message_bare: &'a str, + pub(crate) server_first_message: &'a str, + pub(crate) client_final_message_without_proof: &'a str, } impl SignatureBuilder<'_> { - pub fn build(&self, key: &ScramKey) -> Signature { + pub(crate) fn build(&self, key: &ScramKey) -> Signature { let parts = [ self.client_first_message_bare.as_bytes(), b",", @@ -28,13 +28,13 @@ impl SignatureBuilder<'_> { /// produces `ClientKey` that we need for authentication. #[derive(Debug)] #[repr(transparent)] -pub struct Signature { +pub(crate) struct Signature { bytes: [u8; SCRAM_KEY_LEN], } impl Signature { /// Derive `ClientKey` from client's signature and proof. - pub fn derive_client_key(&self, proof: &[u8; SCRAM_KEY_LEN]) -> ScramKey { + pub(crate) fn derive_client_key(&self, proof: &[u8; SCRAM_KEY_LEN]) -> ScramKey { // This is how the proof is calculated: // // 1. sha256(ClientKey) -> StoredKey diff --git a/proxy/src/scram/threadpool.rs b/proxy/src/scram/threadpool.rs index 8fbaecf93d..262c6d146e 100644 --- a/proxy/src/scram/threadpool.rs +++ b/proxy/src/scram/threadpool.rs @@ -68,7 +68,7 @@ impl ThreadPool { pool } - pub fn spawn_job( + pub(crate) fn spawn_job( &self, endpoint: EndpointIdInt, pbkdf2: Pbkdf2, diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index d9a9019746..84f98cb8ad 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -25,8 +25,6 @@ use hyper_util::rt::TokioExecutor; use hyper_util::server::conn::auto::Builder; use rand::rngs::StdRng; use rand::SeedableRng; -pub use reqwest_middleware::{ClientWithMiddleware, Error}; -pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::time::timeout; use tokio_rustls::TlsAcceptor; @@ -50,7 +48,7 @@ use tokio_util::sync::CancellationToken; use tracing::{error, info, warn, Instrument}; use utils::http::error::ApiError; -pub const SERVERLESS_DRIVER_SNI: &str = "api"; +pub(crate) const SERVERLESS_DRIVER_SNI: &str = "api"; pub async fn task_main( config: &'static ProxyConfig, @@ -178,9 +176,9 @@ pub async fn task_main( Ok(()) } -pub trait AsyncReadWrite: AsyncRead + AsyncWrite + Send + 'static {} +pub(crate) trait AsyncReadWrite: AsyncRead + AsyncWrite + Send + 'static {} impl AsyncReadWrite for T {} -pub type AsyncRW = Pin>; +pub(crate) type AsyncRW = Pin>; #[async_trait] trait MaybeTlsAcceptor: Send + Sync + 'static { diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 9cc271c588..2699411c28 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -29,14 +29,14 @@ use crate::{ use super::conn_pool::{poll_client, AuthData, Client, ConnInfo, GlobalConnPool}; -pub struct PoolingBackend { - pub pool: Arc>, - pub config: &'static ProxyConfig, - pub 
endpoint_rate_limiter: Arc, +pub(crate) struct PoolingBackend { + pub(crate) pool: Arc>, + pub(crate) config: &'static ProxyConfig, + pub(crate) endpoint_rate_limiter: Arc, } impl PoolingBackend { - pub async fn authenticate_with_password( + pub(crate) async fn authenticate_with_password( &self, ctx: &RequestMonitoring, config: &AuthenticationConfig, @@ -98,7 +98,7 @@ impl PoolingBackend { }) } - pub async fn authenticate_with_jwt( + pub(crate) async fn authenticate_with_jwt( &self, ctx: &RequestMonitoring, user_info: &ComputeUserInfo, @@ -135,7 +135,7 @@ impl PoolingBackend { // we reuse the code from the usual proxy and we need to prepare few structures // that this code expects. #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] - pub async fn connect_to_compute( + pub(crate) async fn connect_to_compute( &self, ctx: &RequestMonitoring, conn_info: ConnInfo, @@ -175,7 +175,7 @@ impl PoolingBackend { } #[derive(Debug, thiserror::Error)] -pub enum HttpConnError { +pub(crate) enum HttpConnError { #[error("pooled connection closed at inconsistent state")] ConnectionClosedAbruptly(#[from] tokio::sync::watch::error::SendError), #[error("could not connection to compute")] diff --git a/proxy/src/serverless/cancel_set.rs b/proxy/src/serverless/cancel_set.rs index 390df7f4f7..7659745473 100644 --- a/proxy/src/serverless/cancel_set.rs +++ b/proxy/src/serverless/cancel_set.rs @@ -22,7 +22,7 @@ pub struct CancelSet { hasher: Hasher, } -pub struct CancelShard { +pub(crate) struct CancelShard { tokens: IndexMap, } @@ -40,7 +40,7 @@ impl CancelSet { } } - pub fn take(&self) -> Option { + pub(crate) fn take(&self) -> Option { for _ in 0..4 { if let Some(token) = self.take_raw(thread_rng().gen()) { return Some(token); @@ -50,12 +50,12 @@ impl CancelSet { None } - pub fn take_raw(&self, rng: usize) -> Option { + pub(crate) fn take_raw(&self, rng: usize) -> Option { NonZeroUsize::new(self.shards.len()) .and_then(|len| self.shards[rng % len].lock().take(rng / len)) } - pub fn insert(&self, id: uuid::Uuid, token: CancellationToken) -> CancelGuard<'_> { + pub(crate) fn insert(&self, id: uuid::Uuid, token: CancellationToken) -> CancelGuard<'_> { let shard = NonZeroUsize::new(self.shards.len()).map(|len| { let hash = self.hasher.hash_one(id) as usize; let shard = &self.shards[hash % len]; @@ -88,7 +88,7 @@ impl CancelShard { } } -pub struct CancelGuard<'a> { +pub(crate) struct CancelGuard<'a> { shard: Option<&'a Mutex>, id: Uuid, } diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 476083d71e..bea599e9b9 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -30,25 +30,25 @@ use tracing::{info, info_span, Instrument}; use super::backend::HttpConnError; #[derive(Debug, Clone)] -pub struct ConnInfo { - pub user_info: ComputeUserInfo, - pub dbname: DbName, - pub auth: AuthData, +pub(crate) struct ConnInfo { + pub(crate) user_info: ComputeUserInfo, + pub(crate) dbname: DbName, + pub(crate) auth: AuthData, } #[derive(Debug, Clone)] -pub enum AuthData { +pub(crate) enum AuthData { Password(SmallVec<[u8; 16]>), Jwt(String), } impl ConnInfo { // hm, change to hasher to avoid cloning? - pub fn db_and_user(&self) -> (DbName, RoleName) { + pub(crate) fn db_and_user(&self) -> (DbName, RoleName) { (self.dbname.clone(), self.user_info.user.clone()) } - pub fn endpoint_cache_key(&self) -> Option { + pub(crate) fn endpoint_cache_key(&self) -> Option { // We don't want to cache http connections for ephemeral endpoints. 
if self.user_info.options.is_ephemeral() { None @@ -79,7 +79,7 @@ struct ConnPoolEntry { // Per-endpoint connection pool, (dbname, username) -> DbUserConnPool // Number of open connections is limited by the `max_conns_per_endpoint`. -pub struct EndpointConnPool { +pub(crate) struct EndpointConnPool { pools: HashMap<(DbName, RoleName), DbUserConnPool>, total_conns: usize, max_conns: usize, @@ -198,7 +198,7 @@ impl Drop for EndpointConnPool { } } -pub struct DbUserConnPool { +pub(crate) struct DbUserConnPool { conns: Vec>, } @@ -241,7 +241,7 @@ impl DbUserConnPool { } } -pub struct GlobalConnPool { +pub(crate) struct GlobalConnPool { // endpoint -> per-endpoint connection pool // // That should be a fairly conteded map, so return reference to the per-endpoint @@ -282,7 +282,7 @@ pub struct GlobalConnPoolOptions { } impl GlobalConnPool { - pub fn new(config: &'static crate::config::HttpConfig) -> Arc { + pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc { let shards = config.pool_options.pool_shards; Arc::new(Self { global_pool: DashMap::with_shard_amount(shards), @@ -293,21 +293,21 @@ impl GlobalConnPool { } #[cfg(test)] - pub fn get_global_connections_count(&self) -> usize { + pub(crate) fn get_global_connections_count(&self) -> usize { self.global_connections_count .load(atomic::Ordering::Relaxed) } - pub fn get_idle_timeout(&self) -> Duration { + pub(crate) fn get_idle_timeout(&self) -> Duration { self.config.pool_options.idle_timeout } - pub fn shutdown(&self) { + pub(crate) fn shutdown(&self) { // drops all strong references to endpoint-pools self.global_pool.clear(); } - pub async fn gc_worker(&self, mut rng: impl Rng) { + pub(crate) async fn gc_worker(&self, mut rng: impl Rng) { let epoch = self.config.pool_options.gc_epoch; let mut interval = tokio::time::interval(epoch / (self.global_pool.shards().len()) as u32); loop { @@ -381,7 +381,7 @@ impl GlobalConnPool { } } - pub fn get( + pub(crate) fn get( self: &Arc, ctx: &RequestMonitoring, conn_info: &ConnInfo, @@ -468,7 +468,7 @@ impl GlobalConnPool { } } -pub fn poll_client( +pub(crate) fn poll_client( global_pool: Arc>, ctx: &RequestMonitoring, conn_info: ConnInfo, @@ -596,7 +596,7 @@ impl Drop for ClientInner { } } -pub trait ClientInnerExt: Sync + Send + 'static { +pub(crate) trait ClientInnerExt: Sync + Send + 'static { fn is_closed(&self) -> bool; fn get_process_id(&self) -> i32; } @@ -611,13 +611,13 @@ impl ClientInnerExt for tokio_postgres::Client { } impl ClientInner { - pub fn is_closed(&self) -> bool { + pub(crate) fn is_closed(&self) -> bool { self.inner.is_closed() } } impl Client { - pub fn metrics(&self) -> Arc { + pub(crate) fn metrics(&self) -> Arc { let aux = &self.inner.as_ref().unwrap().aux; USAGE_METRICS.register(Ids { endpoint_id: aux.endpoint_id, @@ -626,14 +626,14 @@ impl Client { } } -pub struct Client { +pub(crate) struct Client { span: Span, inner: Option>, conn_info: ConnInfo, pool: Weak>>, } -pub struct Discard<'a, C: ClientInnerExt> { +pub(crate) struct Discard<'a, C: ClientInnerExt> { conn_info: &'a ConnInfo, pool: &'a mut Weak>>, } @@ -651,7 +651,7 @@ impl Client { pool, } } - pub fn inner(&mut self) -> (&mut C, Discard<'_, C>) { + pub(crate) fn inner(&mut self) -> (&mut C, Discard<'_, C>) { let Self { inner, pool, @@ -664,13 +664,13 @@ impl Client { } impl Discard<'_, C> { - pub fn check_idle(&mut self, status: ReadyForQueryStatus) { + pub(crate) fn check_idle(&mut self, status: ReadyForQueryStatus) { let conn_info = &self.conn_info; if status != ReadyForQueryStatus::Idle && 
std::mem::take(self.pool).strong_count() > 0 { info!("pool: throwing away connection '{conn_info}' because connection is not idle"); } } - pub fn discard(&mut self) { + pub(crate) fn discard(&mut self) { let conn_info = &self.conn_info; if std::mem::take(self.pool).strong_count() > 0 { info!("pool: throwing away connection '{conn_info}' because connection is potentially in a broken state"); diff --git a/proxy/src/serverless/http_util.rs b/proxy/src/serverless/http_util.rs index 701ab58f63..abf0ffe290 100644 --- a/proxy/src/serverless/http_util.rs +++ b/proxy/src/serverless/http_util.rs @@ -11,7 +11,7 @@ use serde::Serialize; use utils::http::error::ApiError; /// Like [`ApiError::into_response`] -pub fn api_error_into_response(this: ApiError) -> Response> { +pub(crate) fn api_error_into_response(this: ApiError) -> Response> { match this { ApiError::BadRequest(err) => HttpErrorBody::response_from_msg_and_status( format!("{err:#?}"), // use debug printing so that we give the cause @@ -59,7 +59,7 @@ pub fn api_error_into_response(this: ApiError) -> Response> { /// Same as [`utils::http::error::HttpErrorBody`] #[derive(Serialize)] struct HttpErrorBody { - pub msg: String, + pub(crate) msg: String, } impl HttpErrorBody { @@ -80,7 +80,7 @@ impl HttpErrorBody { } /// Same as [`utils::http::json::json_response`] -pub fn json_response( +pub(crate) fn json_response( status: StatusCode, data: T, ) -> Result>, ApiError> { diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs index 3776971fa1..9f328a0e1d 100644 --- a/proxy/src/serverless/json.rs +++ b/proxy/src/serverless/json.rs @@ -8,7 +8,7 @@ use tokio_postgres::Row; // Convert json non-string types to strings, so that they can be passed to Postgres // as parameters. // -pub fn json_to_pg_text(json: Vec) -> Vec> { +pub(crate) fn json_to_pg_text(json: Vec) -> Vec> { json.iter().map(json_value_to_pg_text).collect() } @@ -61,7 +61,7 @@ fn json_array_to_pg_array(value: &Value) -> Option { } #[derive(Debug, thiserror::Error)] -pub enum JsonConversionError { +pub(crate) enum JsonConversionError { #[error("internal error compute returned invalid data: {0}")] AsTextError(tokio_postgres::Error), #[error("parse int error: {0}")] @@ -77,7 +77,7 @@ pub enum JsonConversionError { // // Convert postgres row with text-encoded values to JSON object // -pub fn pg_text_row_to_json( +pub(crate) fn pg_text_row_to_json( row: &Row, columns: &[Type], raw_output: bool, diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 9143469eea..5b36f5e91d 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -110,7 +110,7 @@ where } #[derive(Debug, thiserror::Error)] -pub enum ConnInfoError { +pub(crate) enum ConnInfoError { #[error("invalid header: {0}")] InvalidHeader(&'static HeaderName), #[error("invalid connection string: {0}")] @@ -246,7 +246,7 @@ fn get_conn_info( } // TODO: return different http error codes -pub async fn handle( +pub(crate) async fn handle( config: &'static ProxyConfig, ctx: RequestMonitoring, request: Request, @@ -359,7 +359,7 @@ pub async fn handle( } #[derive(Debug, thiserror::Error)] -pub enum SqlOverHttpError { +pub(crate) enum SqlOverHttpError { #[error("{0}")] ReadPayload(#[from] ReadPayloadError), #[error("{0}")] @@ -413,7 +413,7 @@ impl UserFacingError for SqlOverHttpError { } #[derive(Debug, thiserror::Error)] -pub enum ReadPayloadError { +pub(crate) enum ReadPayloadError { #[error("could not read the HTTP request body: {0}")] Read(#[from] 
hyper1::Error), #[error("could not parse the HTTP request body: {0}")] @@ -430,7 +430,7 @@ impl ReportableError for ReadPayloadError { } #[derive(Debug, thiserror::Error)] -pub enum SqlOverHttpCancel { +pub(crate) enum SqlOverHttpCancel { #[error("query was cancelled")] Postgres, #[error("query was cancelled while stuck trying to connect to the database")] diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index 4fba4d141c..3d257223b8 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -27,7 +27,7 @@ use tracing::warn; pin_project! { /// This is a wrapper around a [`WebSocketStream`] that /// implements [`AsyncRead`] and [`AsyncWrite`]. - pub struct WebSocketRw { + pub(crate) struct WebSocketRw { #[pin] stream: WebSocketServer, recv: Bytes, @@ -36,7 +36,7 @@ pin_project! { } impl WebSocketRw { - pub fn new(stream: WebSocketServer) -> Self { + pub(crate) fn new(stream: WebSocketServer) -> Self { Self { stream, recv: Bytes::new(), @@ -127,7 +127,7 @@ impl AsyncBufRead for WebSocketRw { } } -pub async fn serve_websocket( +pub(crate) async fn serve_websocket( config: &'static ProxyConfig, ctx: RequestMonitoring, websocket: OnUpgrade, diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index ef13f5fc1a..332dc27787 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -35,7 +35,7 @@ impl PqStream { } /// Get a shared reference to the underlying stream. - pub fn get_ref(&self) -> &S { + pub(crate) fn get_ref(&self) -> &S { self.framed.get_ref() } } @@ -62,7 +62,7 @@ impl PqStream { .ok_or_else(err_connection) } - pub async fn read_password_message(&mut self) -> io::Result { + pub(crate) async fn read_password_message(&mut self) -> io::Result { match self.read_message().await? { FeMessage::PasswordMessage(msg) => Ok(msg), bad => Err(io::Error::new( @@ -99,7 +99,10 @@ impl ReportableError for ReportedError { impl PqStream { /// Write the message into an internal buffer, but don't flush the underlying stream. - pub fn write_message_noflush(&mut self, message: &BeMessage<'_>) -> io::Result<&mut Self> { + pub(crate) fn write_message_noflush( + &mut self, + message: &BeMessage<'_>, + ) -> io::Result<&mut Self> { self.framed .write_message(message) .map_err(ProtocolError::into_io_error)?; @@ -114,7 +117,7 @@ impl PqStream { } /// Flush the output buffer into the underlying stream. - pub async fn flush(&mut self) -> io::Result<&mut Self> { + pub(crate) async fn flush(&mut self) -> io::Result<&mut Self> { self.framed.flush().await?; Ok(self) } @@ -146,7 +149,7 @@ impl PqStream { /// Write the error message using [`Self::write_message`], then re-throw it. /// Trait [`UserFacingError`] acts as an allowlist for error types. - pub async fn throw_error(&mut self, error: E) -> Result + pub(crate) async fn throw_error(&mut self, error: E) -> Result where E: UserFacingError + Into, { @@ -200,7 +203,7 @@ impl Stream { } } - pub fn tls_server_end_point(&self) -> TlsServerEndPoint { + pub(crate) fn tls_server_end_point(&self) -> TlsServerEndPoint { match self { Stream::Raw { .. } => TlsServerEndPoint::Undefined, Stream::Tls { diff --git a/proxy/src/url.rs b/proxy/src/url.rs index 202fe8de1f..28ac7efdfc 100644 --- a/proxy/src/url.rs +++ b/proxy/src/url.rs @@ -7,12 +7,12 @@ pub struct ApiUrl(url::Url); impl ApiUrl { /// Consume the wrapper and return inner [url](url::Url). - pub fn into_inner(self) -> url::Url { + pub(crate) fn into_inner(self) -> url::Url { self.0 } /// See [`url::Url::path_segments_mut`]. 
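
For the serverless/json.rs hunk a bit further up: SQL-over-HTTP parameters arrive as JSON, and non-string values have to be rendered as Postgres text-format strings before being bound. The sketch below shows a simplified version of that mapping using serde_json; real array and string escaping is more involved than this, so treat it as an outline of the conversion rather than the proxy's exact rules.

    use serde_json::{json, Value};

    /// Render one JSON parameter in Postgres text format (None => SQL NULL).
    fn json_value_to_pg_text(value: &Value) -> Option<String> {
        match value {
            Value::Null => None,
            Value::Bool(b) => Some(b.to_string()),
            Value::Number(n) => Some(n.to_string()),
            // strings are passed through as-is
            Value::String(s) => Some(s.clone()),
            // arrays become Postgres array literals: {1,2,NULL}
            Value::Array(items) => {
                let rendered: Vec<String> = items
                    .iter()
                    .map(|v| json_value_to_pg_text(v).unwrap_or_else(|| "NULL".to_string()))
                    .collect();
                Some(format!("{{{}}}", rendered.join(",")))
            }
            // objects are sent as their JSON text (e.g. for json/jsonb columns)
            Value::Object(_) => Some(value.to_string()),
        }
    }

    fn main() {
        let params = vec![json!("hello"), json!(42), json!(true), json!([1, 2, null])];
        let rendered: Vec<Option<String>> = params.iter().map(json_value_to_pg_text).collect();
        assert_eq!(rendered[1].as_deref(), Some("42"));
        assert_eq!(rendered[3].as_deref(), Some("{1,2,NULL}"));
    }
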
- pub fn path_segments_mut(&mut self) -> url::PathSegmentsMut<'_> { + pub(crate) fn path_segments_mut(&mut self) -> url::PathSegmentsMut<'_> { // We've already verified that it works during construction. self.0.path_segments_mut().expect("bad API url") } diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index 4cf6da7e2d..aa8c7ba319 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -43,12 +43,12 @@ const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60); /// so while the project-id is unique across regions the whole pipeline will work correctly /// because we enrich the event with project_id in the control-plane endpoint. #[derive(Eq, Hash, PartialEq, Serialize, Deserialize, Debug, Clone)] -pub struct Ids { - pub endpoint_id: EndpointIdInt, - pub branch_id: BranchIdInt, +pub(crate) struct Ids { + pub(crate) endpoint_id: EndpointIdInt, + pub(crate) branch_id: BranchIdInt, } -pub trait MetricCounterRecorder { +pub(crate) trait MetricCounterRecorder { /// Record that some bytes were sent from the proxy to the client fn record_egress(&self, bytes: u64); /// Record that some connections were opened @@ -92,7 +92,7 @@ impl MetricCounterReporter for MetricBackupCounter { } #[derive(Debug)] -pub struct MetricCounter { +pub(crate) struct MetricCounter { transmitted: AtomicU64, opened_connections: AtomicUsize, backup: Arc, @@ -173,14 +173,14 @@ impl Clearable for C { type FastHasher = std::hash::BuildHasherDefault; #[derive(Default)] -pub struct Metrics { +pub(crate) struct Metrics { endpoints: DashMap, FastHasher>, backup_endpoints: DashMap, FastHasher>, } impl Metrics { /// Register a new byte metrics counter for this endpoint - pub fn register(&self, ids: Ids) -> Arc { + pub(crate) fn register(&self, ids: Ids) -> Arc { let backup = if let Some(entry) = self.backup_endpoints.get(&ids) { entry.clone() } else { @@ -215,7 +215,7 @@ impl Metrics { } } -pub static USAGE_METRICS: Lazy = Lazy::new(Metrics::default); +pub(crate) static USAGE_METRICS: Lazy = Lazy::new(Metrics::default); pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result { info!("metrics collector config: {config:?}"); diff --git a/proxy/src/waiters.rs b/proxy/src/waiters.rs index 9f78242ed3..86d0f9e8b2 100644 --- a/proxy/src/waiters.rs +++ b/proxy/src/waiters.rs @@ -7,13 +7,13 @@ use thiserror::Error; use tokio::sync::oneshot; #[derive(Debug, Error)] -pub enum RegisterError { +pub(crate) enum RegisterError { #[error("Waiter `{0}` already registered")] Occupied(String), } #[derive(Debug, Error)] -pub enum NotifyError { +pub(crate) enum NotifyError { #[error("Notify failed: waiter `{0}` not registered")] NotFound(String), @@ -22,12 +22,12 @@ pub enum NotifyError { } #[derive(Debug, Error)] -pub enum WaitError { +pub(crate) enum WaitError { #[error("Wait failed: channel hangup")] Hangup, } -pub struct Waiters(pub(self) Mutex>>); +pub(crate) struct Waiters(pub(self) Mutex>>); impl Default for Waiters { fn default() -> Self { @@ -36,7 +36,7 @@ impl Default for Waiters { } impl Waiters { - pub fn register(&self, key: String) -> Result, RegisterError> { + pub(crate) fn register(&self, key: String) -> Result, RegisterError> { let (tx, rx) = oneshot::channel(); self.0 @@ -53,7 +53,7 @@ impl Waiters { }) } - pub fn notify(&self, key: &str, value: T) -> Result<(), NotifyError> + pub(crate) fn notify(&self, key: &str, value: T) -> Result<(), NotifyError> where T: Send + Sync, { @@ -79,7 +79,7 @@ impl<'a, T> Drop for DropKey<'a, T> { } pin_project! 
{ - pub struct Waiter<'a, T> { + pub(crate) struct Waiter<'a, T> { #[pin] receiver: oneshot::Receiver, guard: DropKey<'a, T>, From 52cb33770b1f5e0215305e67582843f055e0b435 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Tue, 27 Aug 2024 14:12:42 +0200 Subject: [PATCH 1508/1571] proxy: Rename backend types and variants as prep for refactor (#8845) * AuthBackend enum to AuthBackendType * BackendType enum to Backend * Link variants to Web * Adjust messages, comments, etc. --- proxy/README.md | 2 +- proxy/src/auth.rs | 8 +-- proxy/src/auth/backend.rs | 80 +++++++++++----------- proxy/src/auth/backend/{link.rs => web.rs} | 16 ++--- proxy/src/auth/password_hack.rs | 2 +- proxy/src/bin/local_proxy.rs | 2 +- proxy/src/bin/proxy.rs | 27 ++++---- proxy/src/compute.rs | 4 +- proxy/src/config.rs | 4 +- proxy/src/console/messages.rs | 2 +- proxy/src/console/mgmt.rs | 2 +- proxy/src/context.rs | 2 +- proxy/src/proxy/tests.rs | 4 +- proxy/src/serverless/backend.rs | 8 +-- 14 files changed, 84 insertions(+), 79 deletions(-) rename proxy/src/auth/backend/{link.rs => web.rs} (87%) diff --git a/proxy/README.md b/proxy/README.md index afc8b77db8..8d850737be 100644 --- a/proxy/README.md +++ b/proxy/README.md @@ -6,7 +6,7 @@ Proxy binary accepts `--auth-backend` CLI option, which determines auth scheme a new SCRAM-based console API; uses SNI info to select the destination project (endpoint soon) * postgres uses postgres to select auth secrets of existing roles. Useful for local testing -* link +* web (or link) sends login link for all usernames Also proxy can expose following services to the external world: diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index f3ecedb839..7c408f817c 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -1,7 +1,7 @@ //! Client authentication mechanisms. 
pub mod backend; -pub use backend::BackendType; +pub use backend::Backend; mod credentials; pub(crate) use credentials::{ @@ -31,7 +31,7 @@ pub(crate) type Result = std::result::Result; #[derive(Debug, Error)] pub(crate) enum AuthErrorImpl { #[error(transparent)] - Link(#[from] backend::LinkAuthError), + Web(#[from] backend::WebAuthError), #[error(transparent)] GetAuthInfo(#[from] console::errors::GetAuthInfoError), @@ -114,7 +114,7 @@ impl> From for AuthError { impl UserFacingError for AuthError { fn to_string_client(&self) -> String { match self.0.as_ref() { - AuthErrorImpl::Link(e) => e.to_string_client(), + AuthErrorImpl::Web(e) => e.to_string_client(), AuthErrorImpl::GetAuthInfo(e) => e.to_string_client(), AuthErrorImpl::Sasl(e) => e.to_string_client(), AuthErrorImpl::AuthFailed(_) => self.to_string(), @@ -132,7 +132,7 @@ impl UserFacingError for AuthError { impl ReportableError for AuthError { fn get_error_kind(&self) -> crate::error::ErrorKind { match self.0.as_ref() { - AuthErrorImpl::Link(e) => e.get_error_kind(), + AuthErrorImpl::Web(e) => e.get_error_kind(), AuthErrorImpl::GetAuthInfo(e) => e.get_error_kind(), AuthErrorImpl::Sasl(e) => e.get_error_kind(), AuthErrorImpl::AuthFailed(_) => crate::error::ErrorKind::User, diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 77dea39fdc..1d28c6df31 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -1,19 +1,19 @@ mod classic; mod hacks; pub mod jwt; -mod link; pub mod local; +mod web; use std::net::IpAddr; use std::sync::Arc; use std::time::Duration; use ipnet::{Ipv4Net, Ipv6Net}; -pub(crate) use link::LinkAuthError; use local::LocalBackend; use tokio::io::{AsyncRead, AsyncWrite}; use tokio_postgres::config::AuthKeys; use tracing::{info, warn}; +pub(crate) use web::WebAuthError; use crate::auth::credentials::check_peer_addr_is_in_list; use crate::auth::{validate_password_and_exchange, AuthError}; @@ -65,11 +65,11 @@ impl std::ops::Deref for MaybeOwned<'_, T> { /// * However, when we substitute `T` with [`ComputeUserInfoMaybeEndpoint`], /// this helps us provide the credentials only to those auth /// backends which require them for the authentication process. -pub enum BackendType<'a, T, D> { +pub enum Backend<'a, T, D> { /// Cloud API (V2). Console(MaybeOwned<'a, ConsoleBackend>, T), /// Authentication via a web browser. - Link(MaybeOwned<'a, url::ApiUrl>, D), + Web(MaybeOwned<'a, url::ApiUrl>, D), /// Local proxy uses configured auth credentials and does not wake compute Local(MaybeOwned<'a, LocalBackend>), } @@ -82,7 +82,7 @@ pub(crate) trait TestBackend: Send + Sync + 'static { ) -> Result<(CachedAllowedIps, Option), console::errors::GetAuthInfoError>; } -impl std::fmt::Display for BackendType<'_, (), ()> { +impl std::fmt::Display for Backend<'_, (), ()> { fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::Console(api, ()) => match &**api { @@ -96,44 +96,44 @@ impl std::fmt::Display for BackendType<'_, (), ()> { #[cfg(test)] ConsoleBackend::Test(_) => fmt.debug_tuple("Test").finish(), }, - Self::Link(url, ()) => fmt.debug_tuple("Link").field(&url.as_str()).finish(), + Self::Web(url, ()) => fmt.debug_tuple("Web").field(&url.as_str()).finish(), Self::Local(_) => fmt.debug_tuple("Local").finish(), } } } -impl BackendType<'_, T, D> { +impl Backend<'_, T, D> { /// Very similar to [`std::option::Option::as_ref`]. /// This helps us pass structured config to async tasks. 
- pub(crate) fn as_ref(&self) -> BackendType<'_, &T, &D> { + pub(crate) fn as_ref(&self) -> Backend<'_, &T, &D> { match self { - Self::Console(c, x) => BackendType::Console(MaybeOwned::Borrowed(c), x), - Self::Link(c, x) => BackendType::Link(MaybeOwned::Borrowed(c), x), - Self::Local(l) => BackendType::Local(MaybeOwned::Borrowed(l)), + Self::Console(c, x) => Backend::Console(MaybeOwned::Borrowed(c), x), + Self::Web(c, x) => Backend::Web(MaybeOwned::Borrowed(c), x), + Self::Local(l) => Backend::Local(MaybeOwned::Borrowed(l)), } } } -impl<'a, T, D> BackendType<'a, T, D> { +impl<'a, T, D> Backend<'a, T, D> { /// Very similar to [`std::option::Option::map`]. - /// Maps [`BackendType`] to [`BackendType`] by applying + /// Maps [`Backend`] to [`Backend`] by applying /// a function to a contained value. - pub(crate) fn map(self, f: impl FnOnce(T) -> R) -> BackendType<'a, R, D> { + pub(crate) fn map(self, f: impl FnOnce(T) -> R) -> Backend<'a, R, D> { match self { - Self::Console(c, x) => BackendType::Console(c, f(x)), - Self::Link(c, x) => BackendType::Link(c, x), - Self::Local(l) => BackendType::Local(l), + Self::Console(c, x) => Backend::Console(c, f(x)), + Self::Web(c, x) => Backend::Web(c, x), + Self::Local(l) => Backend::Local(l), } } } -impl<'a, T, D, E> BackendType<'a, Result, D> { +impl<'a, T, D, E> Backend<'a, Result, D> { /// Very similar to [`std::option::Option::transpose`]. /// This is most useful for error handling. - pub(crate) fn transpose(self) -> Result, E> { + pub(crate) fn transpose(self) -> Result, E> { match self { - Self::Console(c, x) => x.map(|x| BackendType::Console(c, x)), - Self::Link(c, x) => Ok(BackendType::Link(c, x)), - Self::Local(l) => Ok(BackendType::Local(l)), + Self::Console(c, x) => x.map(|x| Backend::Console(c, x)), + Self::Web(c, x) => Ok(Backend::Web(c, x)), + Self::Local(l) => Ok(Backend::Local(l)), } } } @@ -403,12 +403,12 @@ async fn authenticate_with_secret( classic::authenticate(ctx, info, client, config, secret).await } -impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { +impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint, &()> { /// Get username from the credentials. pub(crate) fn get_user(&self) -> &str { match self { Self::Console(_, user_info) => &user_info.user, - Self::Link(_, ()) => "link", + Self::Web(_, ()) => "web", Self::Local(_) => "local", } } @@ -422,7 +422,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { allow_cleartext: bool, config: &'static AuthenticationConfig, endpoint_rate_limiter: Arc, - ) -> auth::Result> { + ) -> auth::Result> { let res = match self { Self::Console(api, user_info) => { info!( @@ -441,15 +441,15 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { endpoint_rate_limiter, ) .await?; - BackendType::Console(api, credentials) + Backend::Console(api, credentials) } // NOTE: this auth backend doesn't use client credentials. 
- Self::Link(url, ()) => { - info!("performing link authentication"); + Self::Web(url, ()) => { + info!("performing web authentication"); - let info = link::authenticate(ctx, &url, client).await?; + let info = web::authenticate(ctx, &url, client).await?; - BackendType::Link(url, info) + Backend::Web(url, info) } Self::Local(_) => { return Err(auth::AuthError::bad_auth_method("invalid for local proxy")) @@ -461,14 +461,14 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { } } -impl BackendType<'_, ComputeUserInfo, &()> { +impl Backend<'_, ComputeUserInfo, &()> { pub(crate) async fn get_role_secret( &self, ctx: &RequestMonitoring, ) -> Result { match self { Self::Console(api, user_info) => api.get_role_secret(ctx, user_info).await, - Self::Link(_, ()) => Ok(Cached::new_uncached(None)), + Self::Web(_, ()) => Ok(Cached::new_uncached(None)), Self::Local(_) => Ok(Cached::new_uncached(None)), } } @@ -479,21 +479,21 @@ impl BackendType<'_, ComputeUserInfo, &()> { ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { match self { Self::Console(api, user_info) => api.get_allowed_ips_and_secret(ctx, user_info).await, - Self::Link(_, ()) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), + Self::Web(_, ()) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), Self::Local(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), } } } #[async_trait::async_trait] -impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> { +impl ComputeConnectBackend for Backend<'_, ComputeCredentials, NodeInfo> { async fn wake_compute( &self, ctx: &RequestMonitoring, ) -> Result { match self { Self::Console(api, creds) => api.wake_compute(ctx, &creds.info).await, - Self::Link(_, info) => Ok(Cached::new_uncached(info.clone())), + Self::Web(_, info) => Ok(Cached::new_uncached(info.clone())), Self::Local(local) => Ok(Cached::new_uncached(local.node_info.clone())), } } @@ -501,21 +501,23 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> { fn get_keys(&self) -> &ComputeCredentialKeys { match self { Self::Console(_, creds) => &creds.keys, - Self::Link(_, _) => &ComputeCredentialKeys::None, + Self::Web(_, _) => &ComputeCredentialKeys::None, Self::Local(_) => &ComputeCredentialKeys::None, } } } #[async_trait::async_trait] -impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> { +impl ComputeConnectBackend for Backend<'_, ComputeCredentials, &()> { async fn wake_compute( &self, ctx: &RequestMonitoring, ) -> Result { match self { Self::Console(api, creds) => api.wake_compute(ctx, &creds.info).await, - Self::Link(_, ()) => unreachable!("link auth flow doesn't support waking the compute"), + Self::Web(_, ()) => { + unreachable!("web auth flow doesn't support waking the compute") + } Self::Local(local) => Ok(Cached::new_uncached(local.node_info.clone())), } } @@ -523,7 +525,7 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> { fn get_keys(&self) -> &ComputeCredentialKeys { match self { Self::Console(_, creds) => &creds.keys, - Self::Link(_, ()) => &ComputeCredentialKeys::None, + Self::Web(_, ()) => &ComputeCredentialKeys::None, Self::Local(_) => &ComputeCredentialKeys::None, } } diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/web.rs similarity index 87% rename from proxy/src/auth/backend/link.rs rename to proxy/src/auth/backend/web.rs index 19515f95a8..58a4bef62e 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/web.rs @@ -13,7 +13,7 @@ use tokio_postgres::config::SslMode; 
use tracing::{info, info_span}; #[derive(Debug, Error)] -pub(crate) enum LinkAuthError { +pub(crate) enum WebAuthError { #[error(transparent)] WaiterRegister(#[from] waiters::RegisterError), @@ -24,18 +24,18 @@ pub(crate) enum LinkAuthError { Io(#[from] std::io::Error), } -impl UserFacingError for LinkAuthError { +impl UserFacingError for WebAuthError { fn to_string_client(&self) -> String { "Internal error".to_string() } } -impl ReportableError for LinkAuthError { +impl ReportableError for WebAuthError { fn get_error_kind(&self) -> crate::error::ErrorKind { match self { - LinkAuthError::WaiterRegister(_) => crate::error::ErrorKind::Service, - LinkAuthError::WaiterWait(_) => crate::error::ErrorKind::Service, - LinkAuthError::Io(_) => crate::error::ErrorKind::ClientDisconnect, + Self::WaiterRegister(_) => crate::error::ErrorKind::Service, + Self::WaiterWait(_) => crate::error::ErrorKind::Service, + Self::Io(_) => crate::error::ErrorKind::ClientDisconnect, } } } @@ -74,7 +74,7 @@ pub(super) async fn authenticate( } }; - let span = info_span!("link", psql_session_id = &psql_session_id); + let span = info_span!("web", psql_session_id = &psql_session_id); let greeting = hello_message(link_uri, &psql_session_id); // Give user a URL to spawn a new database. @@ -87,7 +87,7 @@ pub(super) async fn authenticate( // Wait for web console response (see `mgmt`). info!(parent: &span, "waiting for console's reply..."); - let db_info = waiter.await.map_err(LinkAuthError::from)?; + let db_info = waiter.await.map_err(WebAuthError::from)?; client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?; diff --git a/proxy/src/auth/password_hack.rs b/proxy/src/auth/password_hack.rs index 3f5d006f7b..8585b8ff48 100644 --- a/proxy/src/auth/password_hack.rs +++ b/proxy/src/auth/password_hack.rs @@ -1,5 +1,5 @@ //! Payload for ad hoc authentication method for clients that don't support SNI. -//! See the `impl` for [`super::backend::BackendType`]. +//! See the `impl` for [`super::backend::Backend`]. //! Read more: . //! UPDATE (Mon Aug 8 13:20:34 UTC 2022): the payload format has been simplified. diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs index 8acba33bac..08effeff99 100644 --- a/proxy/src/bin/local_proxy.rs +++ b/proxy/src/bin/local_proxy.rs @@ -212,7 +212,7 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig Ok(Box::leak(Box::new(ProxyConfig { tls_config: None, - auth_backend: proxy::auth::BackendType::Local(proxy::auth::backend::MaybeOwned::Owned( + auth_backend: proxy::auth::Backend::Local(proxy::auth::backend::MaybeOwned::Owned( LocalBackend::new(args.compute), )), metric_collection: None, diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 1f45a33ed5..7706a1f7cd 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -60,11 +60,14 @@ use clap::{Parser, ValueEnum}; static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; #[derive(Clone, Debug, ValueEnum)] -enum AuthBackend { +enum AuthBackendType { Console, #[cfg(feature = "testing")] Postgres, - Link, + // clap only shows the name, not the alias, in usage text. 
+ // TODO: swap name/alias and deprecate "link" + #[value(name("link"), alias("web"))] + Web, } /// Neon proxy/router @@ -77,8 +80,8 @@ struct ProxyCliArgs { /// listen for incoming client connections on ip:port #[clap(short, long, default_value = "127.0.0.1:4432")] proxy: String, - #[clap(value_enum, long, default_value_t = AuthBackend::Link)] - auth_backend: AuthBackend, + #[clap(value_enum, long, default_value_t = AuthBackendType::Web)] + auth_backend: AuthBackendType, /// listen for management callback connection on ip:port #[clap(short, long, default_value = "127.0.0.1:7000")] mgmt: String, @@ -88,7 +91,7 @@ struct ProxyCliArgs { /// listen for incoming wss connections on ip:port #[clap(long)] wss: Option, - /// redirect unauthenticated users to the given uri in case of link auth + /// redirect unauthenticated users to the given uri in case of web auth #[clap(short, long, default_value = "http://localhost:3000/psql_session/")] uri: String, /// cloud API endpoint for authenticating users @@ -470,7 +473,7 @@ async fn main() -> anyhow::Result<()> { )); } - if let auth::BackendType::Console(api, _) = &config.auth_backend { + if let auth::Backend::Console(api, _) = &config.auth_backend { if let proxy::console::provider::ConsoleBackend::Console(api) = &**api { match (redis_notifications_client, regional_redis_client.clone()) { (None, None) => {} @@ -575,7 +578,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { } let auth_backend = match &args.auth_backend { - AuthBackend::Console => { + AuthBackendType::Console => { let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; let project_info_cache_config: ProjectInfoCacheOptions = args.project_info_cache.parse()?; @@ -624,18 +627,18 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { wake_compute_endpoint_rate_limiter, ); let api = console::provider::ConsoleBackend::Console(api); - auth::BackendType::Console(MaybeOwned::Owned(api), ()) + auth::Backend::Console(MaybeOwned::Owned(api), ()) } #[cfg(feature = "testing")] - AuthBackend::Postgres => { + AuthBackendType::Postgres => { let url = args.auth_endpoint.parse()?; let api = console::provider::mock::Api::new(url); let api = console::provider::ConsoleBackend::Postgres(api); - auth::BackendType::Console(MaybeOwned::Owned(api), ()) + auth::Backend::Console(MaybeOwned::Owned(api), ()) } - AuthBackend::Link => { + AuthBackendType::Web => { let url = args.uri.parse()?; - auth::BackendType::Link(MaybeOwned::Owned(url), ()) + auth::Backend::Web(MaybeOwned::Owned(url), ()) } }; diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 246501a21e..8d3cb8ee3c 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -124,13 +124,13 @@ impl ConnCfg { /// Apply startup message params to the connection config. pub(crate) fn set_startup_params(&mut self, params: &StartupMessageParams) { // Only set `user` if it's not present in the config. - // Link auth flow takes username from the console's response. + // Web auth flow takes username from the console's response. if let (None, Some(user)) = (self.get_user(), params.get("user")) { self.user(user); } // Only set `dbname` if it's not present in the config. - // Link auth flow takes dbname from the console's response. + // Web auth flow takes dbname from the console's response. 
if let (None, Some(dbname)) = (self.get_dbname(), params.get("database")) { self.dbname(dbname); } diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 6c42fb8d19..d7fc6eee22 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -25,7 +25,7 @@ use x509_parser::oid_registry; pub struct ProxyConfig { pub tls_config: Option, - pub auth_backend: auth::BackendType<'static, (), ()>, + pub auth_backend: auth::Backend<'static, (), ()>, pub metric_collection: Option, pub allow_self_signed_compute: bool, pub http_config: HttpConfig, @@ -247,7 +247,7 @@ impl CertResolver { let common_name = pem.subject().to_string(); - // We only use non-wildcard certificates in link proxy so it seems okay to treat them the same as + // We only use non-wildcard certificates in web auth proxy so it seems okay to treat them the same as // wildcard ones as we don't use SNI there. That treatment only affects certificate selection, so // verify-full will still check wildcard match. Old coding here just ignored non-wildcard common names // and passed None instead, which blows up number of cases downstream code should handle. Proper coding diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs index 0df1a450ac..a48c7316f6 100644 --- a/proxy/src/console/messages.rs +++ b/proxy/src/console/messages.rs @@ -243,7 +243,7 @@ pub(crate) struct WakeCompute { pub(crate) aux: MetricsAuxInfo, } -/// Async response which concludes the link auth flow. +/// Async response which concludes the web auth flow. /// Also known as `kickResponse` in the console. #[derive(Debug, Deserialize)] pub(crate) struct KickSession<'a> { diff --git a/proxy/src/console/mgmt.rs b/proxy/src/console/mgmt.rs index f318ac529b..2ed4f5f206 100644 --- a/proxy/src/console/mgmt.rs +++ b/proxy/src/console/mgmt.rs @@ -25,7 +25,7 @@ pub(crate) fn notify(psql_session_id: &str, msg: ComputeReady) -> Result<(), wai } /// Console management API listener task. -/// It spawns console response handlers needed for the link auth. +/// It spawns console response handlers needed for the web auth. pub async fn task_main(listener: TcpListener) -> anyhow::Result { scopeguard::defer! 
{ info!("mgmt has shut down"); diff --git a/proxy/src/context.rs b/proxy/src/context.rs index 9edba543fe..72e1fa1cee 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -72,7 +72,7 @@ struct RequestMonitoringInner { #[derive(Clone, Debug)] pub(crate) enum AuthMethod { - // aka link aka passwordless + // aka passwordless, fka link Web, ScramSha256, ScramSha256Plus, diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index b3b284ef27..4264dbae0f 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -544,8 +544,8 @@ fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeIn fn helper_create_connect_info( mechanism: &TestConnectMechanism, -) -> auth::BackendType<'static, ComputeCredentials, &()> { - let user_info = auth::BackendType::Console( +) -> auth::Backend<'static, ComputeCredentials, &()> { + let user_info = auth::Backend::Console( MaybeOwned::Owned(ConsoleBackend::Test(Box::new(mechanism.clone()))), ComputeCredentials { info: ComputeUserInfo { diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 2699411c28..f24e0478be 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -105,13 +105,13 @@ impl PoolingBackend { jwt: &str, ) -> Result { match &self.config.auth_backend { - crate::auth::BackendType::Console(_, ()) => { + crate::auth::Backend::Console(_, ()) => { Err(AuthError::auth_failed("JWT login is not yet supported")) } - crate::auth::BackendType::Link(_, ()) => Err(AuthError::auth_failed( - "JWT login over link proxy is not supported", + crate::auth::Backend::Web(_, ()) => Err(AuthError::auth_failed( + "JWT login over web auth proxy is not supported", )), - crate::auth::BackendType::Local(cache) => { + crate::auth::Backend::Local(cache) => { cache .jwks_cache .check_jwt( From 9b9f90c562d8b78628b66e1441fea7036ed66fc8 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Tue, 27 Aug 2024 15:49:47 +0200 Subject: [PATCH 1509/1571] fix(walproposer): Do not restart on safekeepers reordering (#8840) ## Problem Currently, we compare `neon.safekeepers` values as is, so we unnecessarily restart walproposer even if safekeepers set didn't change. This leads to errors like: ```log FATAL: [WP] restarting walproposer to change safekeeper list from safekeeper-8.us-east-2.aws.neon.tech:6401,safekeeper-11.us-east-2.aws.neon.tech:6401,safekeeper-10.us-east-2.aws.neon.tech:6401 to safekeeper-11.us-east-2.aws.neon.tech:6401,safekeeper-8.us-east-2.aws.neon.tech:6401,safekeeper-10.us-east-2.aws.neon.tech:6401 ``` ## Summary of changes Split the GUC into the list of individual safekeepers and properly compare. We could've done that somewhere on the upper level, e.g., control plane, but I think it's still better when the actual config consumer is smarter and doesn't rely on upper levels. 
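For illustration only, the order-insensitive comparison that the patch implements in C (`split_safekeepers_list` / `safekeepers_cmp` in the diff below) boils down to treating both GUC values as sets of safekeeper addresses. A minimal Rust sketch of the same idea, with made-up host names:

```rust
// Illustrative sketch only: the same "compare as a set" logic as the C code below,
// so that a pure reordering of `neon.safekeepers` is not treated as a change.
fn safekeepers_equal(old: &str, new: &str) -> bool {
    let mut old: Vec<&str> = old.split(',').filter(|s| !s.is_empty()).collect();
    let mut new: Vec<&str> = new.split(',').filter(|s| !s.is_empty()).collect();
    old.sort_unstable();
    new.sort_unstable();
    old == new
}

fn main() {
    // Reordered list: no walproposer restart needed.
    assert!(safekeepers_equal("sk-b:6401,sk-a:6401", "sk-a:6401,sk-b:6401"));
    // Genuinely different membership: restart required.
    assert!(!safekeepers_equal("sk-a:6401,sk-b:6401", "sk-a:6401,sk-c:6401"));
}
```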
--- pgxn/neon/walproposer_pg.c | 69 +++++++++++++++++++++++++++++- test_runner/regress/test_config.py | 45 ++++++++++++++++++- 2 files changed, 111 insertions(+), 3 deletions(-) diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index f3ddc64061..65ef588ba5 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -220,6 +220,64 @@ nwp_register_gucs(void) NULL, NULL, NULL); } + +static int +split_safekeepers_list(char *safekeepers_list, char *safekeepers[]) +{ + int n_safekeepers = 0; + char *curr_sk = safekeepers_list; + + for (char *coma = safekeepers_list; coma != NULL && *coma != '\0'; curr_sk = coma) + { + if (++n_safekeepers >= MAX_SAFEKEEPERS) { + wpg_log(FATAL, "too many safekeepers"); + } + + coma = strchr(coma, ','); + safekeepers[n_safekeepers-1] = curr_sk; + + if (coma != NULL) { + *coma++ = '\0'; + } + } + + return n_safekeepers; +} + +/* + * Accept two coma-separated strings with list of safekeeper host:port addresses. + * Split them into arrays and return false if two sets do not match, ignoring the order. + */ +static bool +safekeepers_cmp(char *old, char *new) +{ + char *safekeepers_old[MAX_SAFEKEEPERS]; + char *safekeepers_new[MAX_SAFEKEEPERS]; + int len_old = 0; + int len_new = 0; + + len_old = split_safekeepers_list(old, safekeepers_old); + len_new = split_safekeepers_list(new, safekeepers_new); + + if (len_old != len_new) + { + return false; + } + + qsort(&safekeepers_old, len_old, sizeof(char *), pg_qsort_strcmp); + qsort(&safekeepers_new, len_new, sizeof(char *), pg_qsort_strcmp); + + for (int i = 0; i < len_new; i++) + { + if (strcmp(safekeepers_old[i], safekeepers_new[i]) != 0) + { + return false; + } + } + + return true; +} + /* * GUC assign_hook for neon.safekeepers. Restarts walproposer through FATAL if * the list changed. @@ -235,19 +293,26 @@ assign_neon_safekeepers(const char *newval, void *extra) wpg_log(FATAL, "neon.safekeepers is empty"); } + /* Copy values because we will modify them in split_safekeepers_list() */ + char *newval_copy = pstrdup(newval); + char *oldval = pstrdup(wal_acceptors_list); + /* * TODO: restarting through FATAL is stupid and introduces 1s delay before * next bgw start. We should refactor walproposer to allow graceful exit and * thus remove this delay. + * XXX: If you change anything here, sync with test_safekeepers_reconfigure_reorder. */ - if (strcmp(wal_acceptors_list, newval) != 0) + if (!safekeepers_cmp(oldval, newval_copy)) { wpg_log(FATAL, "restarting walproposer to change safekeeper list from %s to %s", wal_acceptors_list, newval); } + pfree(newval_copy); + pfree(oldval); } -/* Check if we need to suspend inserts because of lagging replication. */ +/* Check if we need to suspend inserts because of lagging replication. 
*/ static uint64 backpressure_lag_impl(void) { diff --git a/test_runner/regress/test_config.py b/test_runner/regress/test_config.py index 4bb7df1e6a..2ef28eb94b 100644 --- a/test_runner/regress/test_config.py +++ b/test_runner/regress/test_config.py @@ -1,6 +1,7 @@ +import os from contextlib import closing -from fixtures.neon_fixtures import NeonEnv +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder # @@ -28,3 +29,45 @@ def test_config(neon_simple_env: NeonEnv): # check that config change was applied assert cur.fetchone() == ("debug1",) + + +# +# Test that reordering of safekeepers does not restart walproposer +# +def test_safekeepers_reconfigure_reorder( + neon_env_builder: NeonEnvBuilder, +): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + env.neon_cli.create_branch("test_safekeepers_reconfigure_reorder") + + endpoint = env.endpoints.create_start("test_safekeepers_reconfigure_reorder") + + old_sks = "" + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + cur.execute("SHOW neon.safekeepers") + res = cur.fetchone() + assert res is not None, "neon.safekeepers GUC is set" + old_sks = res[0] + + # Reorder safekeepers + safekeepers = endpoint.active_safekeepers + safekeepers = safekeepers[1:] + safekeepers[:1] + + endpoint.reconfigure(safekeepers=safekeepers) + + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + cur.execute("SHOW neon.safekeepers") + res = cur.fetchone() + assert res is not None, "neon.safekeepers GUC is set" + new_sks = res[0] + + assert new_sks != old_sks, "GUC changes were applied" + + log_path = os.path.join(endpoint.endpoint_path(), "compute.log") + with open(log_path, "r") as log_file: + logs = log_file.read() + # Check that walproposer was not restarted + assert "restarting walproposer" not in logs From 2d10306f7a596d1f2ad8c70d01b427600a29c622 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 27 Aug 2024 18:36:33 +0300 Subject: [PATCH 1510/1571] Remove support for pageserver <-> compute protocol version 1 (#8774) Protocol version 2 has been the default for a while now, and we no longer have any computes running in production that used protocol version 1. This completes the migration by removing support for v1 in both the pageserver and the compute. See issue #6211. --- libs/pageserver_api/src/models.rs | 41 ++++-------------- pageserver/src/metrics.rs | 1 - pageserver/src/page_service.rs | 34 +-------------- pgxn/neon/libpagestore.c | 5 +-- pgxn/neon/pagestore_client.h | 5 +-- pgxn/neon/pagestore_smgr.c | 47 ++------------------- test_runner/regress/test_auth.py | 2 +- test_runner/regress/test_read_validation.py | 10 ----- 8 files changed, 17 insertions(+), 128 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 4cab56771b..d39ac75707 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -1063,7 +1063,7 @@ impl TryFrom for PagestreamBeMessageTag { } } -// In the V2 protocol version, a GetPage request contains two LSN values: +// A GetPage request contains two LSN values: // // request_lsn: Get the page version at this point in time. Lsn::Max is a special value that means // "get the latest version present". It's used by the primary server, which knows that no one else @@ -1076,7 +1076,7 @@ impl TryFrom for PagestreamBeMessageTag { // passing an earlier LSN can speed up the request, by allowing the pageserver to process the // request without waiting for 'request_lsn' to arrive. 
// -// The legacy V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was +// The now-defunct V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was // sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since' value, and // 'latest' was set to true. The V2 interface was added because there was no correct way for a // standby to request a page at a particular non-latest LSN, and also include the @@ -1084,15 +1084,11 @@ impl TryFrom for PagestreamBeMessageTag { // request, if the standby knows that the page hasn't been modified since, and risk getting an error // if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which could // require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The new V2 -// interface allows sending both LSNs, and let the pageserver do the right thing. There is no +// interface allows sending both LSNs, and let the pageserver do the right thing. There was no // difference in the responses between V1 and V2. // -// The Request structs below reflect the V2 interface. If V1 is used, the parse function -// maps the old format requests to the new format. -// #[derive(Clone, Copy)] pub enum PagestreamProtocolVersion { - V1, V2, } @@ -1231,36 +1227,17 @@ impl PagestreamFeMessage { bytes.into() } - pub fn parse( - body: &mut R, - protocol_version: PagestreamProtocolVersion, - ) -> anyhow::Result { + pub fn parse(body: &mut R) -> anyhow::Result { // these correspond to the NeonMessageTag enum in pagestore_client.h // // TODO: consider using protobuf or serde bincode for less error prone // serialization. let msg_tag = body.read_u8()?; - let (request_lsn, not_modified_since) = match protocol_version { - PagestreamProtocolVersion::V2 => ( - Lsn::from(body.read_u64::()?), - Lsn::from(body.read_u64::()?), - ), - PagestreamProtocolVersion::V1 => { - // In the old protocol, each message starts with a boolean 'latest' flag, - // followed by 'lsn'. Convert that to the two LSNs, 'request_lsn' and - // 'not_modified_since', used in the new protocol version. - let latest = body.read_u8()? 
!= 0; - let request_lsn = Lsn::from(body.read_u64::()?); - if latest { - (Lsn::MAX, request_lsn) // get latest version - } else { - (request_lsn, request_lsn) // get version at specified LSN - } - } - }; + // these two fields are the same for every request type + let request_lsn = Lsn::from(body.read_u64::()?); + let not_modified_since = Lsn::from(body.read_u64::()?); - // The rest of the messages are the same between V1 and V2 match msg_tag { 0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest { request_lsn, @@ -1468,9 +1445,7 @@ mod tests { ]; for msg in messages { let bytes = msg.serialize(); - let reconstructed = - PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V2) - .unwrap(); + let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap(); assert!(msg == reconstructed); } } diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 1f8634df93..c4011d593c 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1552,7 +1552,6 @@ pub(crate) static LIVE_CONNECTIONS: Lazy = Lazy::new(|| { #[derive(Clone, Copy, enum_map::Enum, IntoStaticStr)] pub(crate) enum ComputeCommandKind { PageStreamV2, - PageStream, Basebackup, Fullbackup, LeaseLsn, diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 81294291a9..cb1ab70147 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -557,7 +557,7 @@ impl PageServerHandler { pgb: &mut PostgresBackend, tenant_id: TenantId, timeline_id: TimelineId, - protocol_version: PagestreamProtocolVersion, + _protocol_version: PagestreamProtocolVersion, ctx: RequestContext, ) -> Result<(), QueryError> where @@ -601,8 +601,7 @@ impl PageServerHandler { fail::fail_point!("ps::handle-pagerequest-message"); // parse request - let neon_fe_msg = - PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?; + let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?; // invoke handler function let (handler_result, span) = match neon_fe_msg { @@ -1275,35 +1274,6 @@ where ctx, ) .await?; - } else if let Some(params) = parts.strip_prefix(&["pagestream"]) { - if params.len() != 2 { - return Err(QueryError::Other(anyhow::anyhow!( - "invalid param number for pagestream command" - ))); - } - let tenant_id = TenantId::from_str(params[0]) - .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; - let timeline_id = TimelineId::from_str(params[1]) - .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; - - tracing::Span::current() - .record("tenant_id", field::display(tenant_id)) - .record("timeline_id", field::display(timeline_id)); - - self.check_permission(Some(tenant_id))?; - - COMPUTE_COMMANDS_COUNTERS - .for_command(ComputeCommandKind::PageStream) - .inc(); - - self.handle_pagerequests( - pgb, - tenant_id, - timeline_id, - PagestreamProtocolVersion::V1, - ctx, - ) - .await?; } else if let Some(params) = parts.strip_prefix(&["basebackup"]) { if params.len() < 2 { return Err(QueryError::Other(anyhow::anyhow!( diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 73a001b6ba..5126c26c5d 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -550,9 +550,6 @@ pageserver_connect(shardno_t shard_no, int elevel) case 2: pagestream_query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline); break; - case 1: - pagestream_query = psprintf("pagestream %s %s", neon_tenant, neon_timeline); - break; default: elog(ERROR, "unexpected 
neon_protocol_version %d", neon_protocol_version); } @@ -1063,7 +1060,7 @@ pg_init_libpagestore(void) NULL, &neon_protocol_version, 2, /* use protocol version 2 */ - 1, /* min */ + 2, /* min */ 2, /* max */ PGC_SU_BACKEND, 0, /* no flags required */ diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 8951e6607b..1f196d016c 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -87,9 +87,8 @@ typedef enum { * can skip traversing through recent layers which we know to not contain any * versions for the requested page. * - * These structs describe the V2 of these requests. The old V1 protocol contained - * just one LSN and a boolean 'latest' flag. If the neon_protocol_version GUC is - * set to 1, we will convert these to the V1 requests before sending. + * These structs describe the V2 of these requests. (The old now-defunct V1 + * protocol contained just one LSN and a boolean 'latest' flag.) */ typedef struct { diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 8edaf65639..7f39c7d026 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -1001,51 +1001,10 @@ nm_pack_request(NeonRequest *msg) initStringInfo(&s); - if (neon_protocol_version >= 2) - { - pq_sendbyte(&s, msg->tag); - pq_sendint64(&s, msg->lsn); - pq_sendint64(&s, msg->not_modified_since); - } - else - { - bool latest; - XLogRecPtr lsn; + pq_sendbyte(&s, msg->tag); + pq_sendint64(&s, msg->lsn); + pq_sendint64(&s, msg->not_modified_since); - /* - * In primary, we always request the latest page version. - */ - if (!RecoveryInProgress()) - { - latest = true; - lsn = msg->not_modified_since; - } - else - { - /* - * In the protocol V1, we cannot represent that we want to read - * page at LSN X, and we know that it hasn't been modified since - * Y. We can either use 'not_modified_lsn' as the request LSN, and - * risk getting an error if that LSN is too old and has already - * fallen out of the pageserver's GC horizon, or we can send - * 'request_lsn', causing the pageserver to possibly wait for the - * recent WAL to arrive unnecessarily. Or something in between. We - * choose to use the old LSN and risk GC errors, because that's - * what we've done historically. 
- */ - latest = false; - lsn = msg->not_modified_since; - } - - pq_sendbyte(&s, msg->tag); - pq_sendbyte(&s, latest); - pq_sendint64(&s, lsn); - } - - /* - * The rest of the request messages are the same between protocol V1 and - * V2 - */ switch (messageTag(msg)) { /* pagestore_client -> pagestore */ diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index 7cb85e3dd1..780c0e1602 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -211,7 +211,7 @@ def test_auth_failures(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): def check_pageserver(expect_success: bool, **conn_kwargs): check_connection( env.pageserver, - f"pagestream {env.initial_tenant} {env.initial_timeline}", + f"pagestream_v2 {env.initial_tenant} {env.initial_timeline}", expect_success, **conn_kwargs, ) diff --git a/test_runner/regress/test_read_validation.py b/test_runner/regress/test_read_validation.py index d128c60a99..1ac881553f 100644 --- a/test_runner/regress/test_read_validation.py +++ b/test_runner/regress/test_read_validation.py @@ -19,11 +19,6 @@ def test_read_validation(neon_simple_env: NeonEnv): endpoint = env.endpoints.create_start( "test_read_validation", - # Use protocol version 2, because the code that constructs the V1 messages - # assumes that a primary always wants to read the latest version of a page, - # and therefore doesn't work with the test functions below to read an older - # page version. - config_lines=["neon.protocol_version=2"], ) with closing(endpoint.connect()) as con: @@ -142,11 +137,6 @@ def test_read_validation_neg(neon_simple_env: NeonEnv): endpoint = env.endpoints.create_start( "test_read_validation_neg", - # Use protocol version 2, because the code that constructs the V1 messages - # assumes that a primary always wants to read the latest version of a page, - # and therefore doesn't work with the test functions below to read an older - # page version. - config_lines=["neon.protocol_version=2"], ) with closing(endpoint.connect()) as con: From c5ef779801f58b7faac96db123a1c4e3b6388678 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 27 Aug 2024 19:47:05 +0300 Subject: [PATCH 1511/1571] tests: Remove unnecessary entries from list of allowed errors (#8199) The "manual_gc" context was removed in commit be0c73f8e7. The code that generated the other error was removed in commit 9a6c0be823. --- test_runner/fixtures/pageserver/allowed_errors.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index dff002bd4b..f8d9a51c91 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -52,9 +52,6 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( ".*Error processing HTTP request: Forbidden", # intentional failpoints ".*failpoint ", - # FIXME: These need investigation - ".*manual_gc.*is_shutdown_requested\\(\\) called in an unexpected task or thread.*", - ".*tenant_list: timeline is not found in remote index while it is present in the tenants registry.*", # Tenant::delete_timeline() can cause any of the four following errors. 
# FIXME: we shouldn't be considering it an error: https://github.com/neondatabase/neon/issues/2946 ".*could not flush frozen layer.*queue is in state Stopped", # when schedule layer upload fails because queued got closed before compaction got killed From 992a951b5e989b27d2aee118c9180c1df0b7483d Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 28 Aug 2024 09:22:19 +0100 Subject: [PATCH 1512/1571] .github: direct feature requests to the feedback form (#8849) ## Problem When folks open github issues for feature requests, they don't have a clear recipient: engineers usually see them during bug triage, but that doesn't necessarily get the work prioritized. ## Summary of changes Give end users a clearer path to submitting feedback to Neon --- .github/ISSUE_TEMPLATE/config.yml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/config.yml diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000000..c8fd1209de --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,6 @@ + +blank_issues_enabled: true +contact_links: + - name: Feature request + url: https://console.neon.tech/app/projects?modal=feedback + about: For feature requests in the Neon product, please submit via the feedback form on `https://console.neon.tech` From c0ba18a112668438e5f2de5ae04d369c48976200 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 28 Aug 2024 12:20:43 +0300 Subject: [PATCH 1513/1571] bench: flush before shutting down (#8844) while driving by: - remove the extra tenant - remove the extra timelines implement this by turning the pg_compare to a yielding fixture. evidence: https://neon-github-public-dev.s3.amazonaws.com/reports/main/10571779162/index.html#suites/9681106e61a1222669b9d22ab136d07b/3bbe9f007b3ffae1/ --- test_runner/fixtures/compare_fixtures.py | 16 ++++------------ .../performance/test_wal_backpressure.py | 19 ++++++++++++------- 2 files changed, 16 insertions(+), 19 deletions(-) diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 5fe544b3bd..98a9dd7184 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -102,7 +102,6 @@ class NeonCompare(PgCompare): zenbenchmark: NeonBenchmarker, neon_simple_env: NeonEnv, pg_bin: PgBin, - branch_name: str, ): self.env = neon_simple_env self._zenbenchmark = zenbenchmark @@ -110,16 +109,11 @@ class NeonCompare(PgCompare): self.pageserver_http_client = self.env.pageserver.http_client() # note that neon_simple_env now uses LOCAL_FS remote storage - - # Create tenant - tenant_conf: Dict[str, str] = {} - self.tenant, _ = self.env.neon_cli.create_tenant(conf=tenant_conf) - - # Create timeline - self.timeline = self.env.neon_cli.create_timeline(branch_name, tenant_id=self.tenant) + self.tenant = self.env.initial_tenant + self.timeline = self.env.initial_timeline # Start pg - self._pg = self.env.endpoints.create_start(branch_name, "main", self.tenant) + self._pg = self.env.endpoints.create_start("main", "main", self.tenant) @property def pg(self) -> PgProtocol: @@ -297,13 +291,11 @@ class RemoteCompare(PgCompare): @pytest.fixture(scope="function") def neon_compare( - request: FixtureRequest, zenbenchmark: NeonBenchmarker, pg_bin: PgBin, neon_simple_env: NeonEnv, ) -> NeonCompare: - branch_name = request.node.name - return NeonCompare(zenbenchmark, neon_simple_env, pg_bin, branch_name) + return NeonCompare(zenbenchmark, neon_simple_env, pg_bin) 
@pytest.fixture(scope="function") diff --git a/test_runner/performance/test_wal_backpressure.py b/test_runner/performance/test_wal_backpressure.py index 513ebc74c3..c824e60c29 100644 --- a/test_runner/performance/test_wal_backpressure.py +++ b/test_runner/performance/test_wal_backpressure.py @@ -2,14 +2,14 @@ import statistics import threading import time import timeit -from typing import Any, Callable, List +from typing import Any, Callable, Generator, List import pytest from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker from fixtures.common_types import Lsn from fixtures.compare_fixtures import NeonCompare, PgCompare, VanillaCompare from fixtures.log_helper import log -from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnvBuilder, PgBin +from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, flush_ep_to_pageserver from performance.test_perf_pgbench import get_durations_matrix, get_scales_matrix @@ -20,7 +20,7 @@ from performance.test_perf_pgbench import get_durations_matrix, get_scales_matri # For example, to build a `NeonCompare` interface, the corresponding fixture's param should have # a format of `neon_{safekeepers_enable_fsync}`. # Note that, here "_" is used to separate builder parameters. -def pg_compare(request) -> PgCompare: +def pg_compare(request) -> Generator[PgCompare, None, None]: x = request.param.split("_") if x[0] == "vanilla": @@ -28,7 +28,7 @@ def pg_compare(request) -> PgCompare: fixture = request.getfixturevalue("vanilla_compare") assert isinstance(fixture, VanillaCompare) - return fixture + yield fixture else: assert ( len(x) == 2 @@ -47,10 +47,15 @@ def pg_compare(request) -> PgCompare: neon_env_builder.safekeepers_enable_fsync = x[1] == "on" env = neon_env_builder.init_start() - env.neon_cli.create_branch("empty", ancestor_branch_name=DEFAULT_BRANCH_NAME) - branch_name = request.node.name - return NeonCompare(zenbenchmark, env, pg_bin, branch_name) + cmp = NeonCompare(zenbenchmark, env, pg_bin) + + yield cmp + + flush_ep_to_pageserver(env, cmp._pg, cmp.tenant, cmp.timeline) + env.pageserver.http_client().timeline_checkpoint( + cmp.tenant, cmp.timeline, compact=False, wait_until_uploaded=True + ) def start_heavy_write_workload(env: PgCompare, n_tables: int, scale: int, num_iters: int): From 5eb7322d08e93912653dd6ba02a4507e80c50aec Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 28 Aug 2024 14:56:14 +0100 Subject: [PATCH 1514/1571] docs: rolling storage controller restarts RFC (#8310) ## Problem Storage controller upgrades (restarts, more generally) can cause multi-second availability gaps. While the storage controller does not sit on the main data path, it's generally not acceptable to block management requests for extended periods of time (e.g. https://github.com/neondatabase/neon/issues/8034). ## Summary of changes This RFC describes the issues around the current storage controller restart procedure and describes an implementation which reduces downtime to a few milliseconds on the happy path. 
Related https://github.com/neondatabase/neon/issues/7797
---
 docs/rfcs/037-storage-controller-restarts.md | 259 +++++++++++++++++++
 1 file changed, 259 insertions(+)
 create mode 100644 docs/rfcs/037-storage-controller-restarts.md

diff --git a/docs/rfcs/037-storage-controller-restarts.md b/docs/rfcs/037-storage-controller-restarts.md
new file mode 100644
index 0000000000..bad422344f
--- /dev/null
+++ b/docs/rfcs/037-storage-controller-restarts.md
@@ -0,0 +1,259 @@
+# Rolling Storage Controller Restarts
+
+## Summary
+
+This RFC describes the issues around the current storage controller restart procedure
+and describes an implementation which reduces downtime to a few milliseconds on the happy path.
+
+## Motivation
+
+Storage controller upgrades (restarts, more generally) can cause multi-second availability gaps.
+While the storage controller does not sit on the main data path, it's generally not acceptable
+to block management requests for extended periods of time (e.g. https://github.com/neondatabase/neon/issues/8034).
+
+### Current Implementation
+
+The storage controller runs in a Kubernetes Deployment configured for one replica and strategy set to [Recreate](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#recreate-deployment).
+In non-Kubernetes terms, during an upgrade, the currently running storage controller is stopped and, only after,
+a new instance is created.
+
+At start-up, the storage controller calls into all the pageservers it manages (retrieved from the DB) to learn the
+latest locations of all tenant shards present on them. This is usually fast, but can push into tens of seconds
+under unfavourable circumstances: pageservers are heavily loaded or unavailable.
+
+## Prior Art
+
+There are probably as many ways of handling restarts gracefully as there are distributed systems. Some examples include:
+* Active/Standby architectures: Two or more instances of the same service run, but traffic is only routed to one of them.
+For fail-over, traffic is routed to one of the standbys (which becomes active).
+* Consensus Algorithms (Raft, Paxos and friends): The part of consensus we care about here is leader election: peers communicate with each other
+and use a voting scheme that ensures the existence of a single leader (e.g. Raft epochs).
+
+## Requirements
+
+* Reduce storage controller unavailability during upgrades to milliseconds
+* Minimize the interval in which it's possible for more than one storage controller
+to issue reconciles.
+* Have one uniform implementation for restarts and upgrades
+* Fit in with the current Kubernetes deployment scheme
+
+## Non Goals
+
+* Implement our own consensus algorithm from scratch
+* Completely eliminate storage controller downtime. Instead we aim to reduce it to the point where it looks
+like a transient error to the control plane
+
+## Impacted Components
+
+* storage controller
+* deployment orchestration (i.e. Ansible)
+* helm charts
+
+## Terminology
+
+* Observed State: in-memory mapping between tenant shards and their current pageserver locations - currently built up
+at start-up by querying pageservers
+* Deployment: Kubernetes [primitive](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/) that models
+a set of replicas
+
+## Implementation
+
+### High Level Flow
+
+At a very high level, the proposed idea is to start a new storage controller instance while
+the previous one is still running and cut over to it when it becomes ready.
+The new instance should coordinate with the existing one and transition responsibility gracefully. While the controller
+has built-in safety against split-brain situations (via generation numbers), we'd like to avoid such
+scenarios since they can lead to availability issues for tenants that underwent changes while two controllers
+were operating at the same time and require operator intervention to remedy.
+
+### Kubernetes Deployment Configuration
+
+On the Kubernetes configuration side, the proposal is to update the storage controller `Deployment`
+to use `spec.strategy.type = RollingUpdate`, `spec.strategy.rollingUpdate.maxSurge=1` and `spec.strategy.maxUnavailable=0`.
+Under the hood, Kubernetes creates a new replica set and adds one pod to it (`maxSurge=1`). The old replica set does not
+scale down until the new replica set has one replica in the ready state (`maxUnavailable=0`).
+
+The various possible failure scenarios are investigated in the [Handling Failures](#handling-failures) section.
+
+### Storage Controller Start-Up
+
+This section describes the primitives required on the storage controller side and the flow of the happy path.
+
+#### Database Table For Leader Synchronization
+
+A new table should be added to the storage controller database for leader synchronization during startup.
+This table will always contain at most one row. The proposed name for the table is `leader` and the schema
+contains two elements:
+* `hostname`: represents the hostname for the current storage controller leader - should be addressable
+from other pods in the deployment
+* `start_timestamp`: holds the start timestamp for the current storage controller leader (UTC timezone) - only required
+for failure case handling: see [Previous Leader Crashes Before New Leader Readiness](#previous-leader-crashes-before-new-leader-readiness)
+
+Storage controllers will read the leader row at start-up and then update it to mark themselves as the leader
+at the end of the start-up sequence. We want compare-and-exchange semantics for the update: avoid the
+situation where two concurrent updates succeed and overwrite each other. The default Postgres isolation
+level is `READ COMMITTED`, which isn't strict enough here. This update transaction should use at least `REPEATABLE
+READ` isolation level in order to [prevent lost updates](https://www.interdb.jp/pg/pgsql05/08.html). Currently,
+the storage controller uses the stricter `SERIALIZABLE` isolation level for all transactions. This more than suits
+our needs here.
+
+```
+START TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+UPDATE leader SET hostname = <new hostname>, start_timestamp = <new start timestamp>
+WHERE hostname = <previous hostname> AND start_timestamp = <previous start timestamp>;
+```
+
+If the transaction fails or if no rows have been updated, then the compare-and-exchange is regarded as a failure.
+
+#### Step Down API
+
+A new HTTP endpoint should be added to the storage controller: `POST /control/v1/step_down`. Upon receiving this
+request the leader cancels any pending reconciles and goes into a mode where it replies with 503 to all other APIs
+and does not issue any location configurations to its pageservers. The successful HTTP response will return a serialized
+snapshot of the observed state.
+
+If other step down requests come in after the initial one, the request is handled and the observed state is returned (required
+for failure scenario handling - see [Handling Failures](#handling-failures)).
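To make the step-down semantics above concrete, here is a minimal, self-contained sketch. All type and field names are invented for illustration; the real storage controller service is considerably more involved:

```rust
// Minimal sketch of the step-down semantics: flip a flag, (conceptually) cancel
// reconciles, and hand back a snapshot of the observed state. Repeated calls are
// idempotent; every other API handler checks the flag and answers 503 once set.
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Mutex;

/// Snapshot of tenant shard -> pageserver location mappings (schematic).
#[derive(Clone, Default)]
struct ObservedState {
    entries: Vec<(String, String)>,
}

#[derive(Default)]
struct Service {
    stepped_down: AtomicBool,
    observed: Mutex<ObservedState>,
}

impl Service {
    /// Body of a hypothetical `POST /control/v1/step_down` handler.
    fn step_down(&self) -> ObservedState {
        self.stepped_down.store(true, Ordering::SeqCst);
        // A real implementation would also cancel in-flight reconciles here.
        self.observed.lock().unwrap().clone()
    }

    /// Checked by every other handler before doing any work.
    fn is_stepped_down(&self) -> bool {
        self.stepped_down.load(Ordering::SeqCst)
    }
}

fn main() {
    let svc = Service::default();
    let snapshot = svc.step_down();
    assert!(svc.is_stepped_down());
    assert_eq!(snapshot.entries.len(), 0);
}
```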
+
+#### Graceful Restart Happy Path
+
+At start-up, the first thing the storage controller does is retrieve the sole row from the new
+`leader` table. If such an entry exists, send a `/step_down` API call to the current leader.
+This should be retried a few times with a short backoff (see [1]). The aspiring leader loads the
+observed state into memory and the start-up sequence proceeds as usual, but *without* querying the
+pageservers in order to build up the observed state.
+
+Before doing any reconciliations or persistence changes, update the `leader` database table as described in the [Database Table For Leader Synchronization](#database-table-for-leader-synchronization)
+section. If this step fails, the storage controller process exits.
+
+Note that no row will exist in the `leader` table for the first graceful restart. In that case, force update the `leader` table
+(without the WHERE clause) and proceed with the pre-existing start-up procedure (i.e. build observed state by querying pageservers).
+
+Summary of proposed new start-up sequence:
+1. Call `/step_down`
+2. Perform any pending database migrations
+3. Load state from database
+4. Load observed state returned in step (1) into memory
+5. Do initial heartbeat round (may be moved after step (6))
+6. Mark self as leader by updating the database
+7. Reschedule and reconcile everything
+
+Some things to note from the steps above:
+* The storage controller makes no changes to the cluster state before step (5) (i.e. no location config
+calls to the pageserver and no compute notifications)
+* Ask the current leader to step down before loading state from database so we don't get a lost update
+if the transactions overlap.
+* Before loading the observed state at step (3), cross-validate against the database. If validation fails,
+fall back to asking the pageservers about their current locations.
+* Database migrations should only run **after** the previous instance steps down (or the step down times out).
+
+
+[1] The API call might fail because there's no storage controller running (i.e. [restart](#storage-controller-crash-or-restart)),
+so we don't want to extend the unavailability period by much. We still want to retry since that's not the common case.
+
+### Handling Failures
+
+#### Storage Controller Crash Or Restart
+
+The storage controller may crash or be restarted outside of roll-outs. When a new pod is created, its call to
+`/step_down` will fail since the previous leader is no longer reachable. In this case, perform the pre-existing
+start-up procedure and update the leader table (with the WHERE clause). If the update fails, the storage controller
+exits and consistency is maintained.
+
+#### Previous Leader Crashes Before New Leader Readiness
+
+When the previous leader (P1) crashes before the new leader (P2) passes the readiness check, Kubernetes will
+reconcile the old replica set and create a new pod for it (P1'). The `/step_down` API call will fail for P1'
+(see [2]).
+
+Now we have two cases to consider:
+* P2 updates the `leader` table first: The database update from P1' will fail and P1' will exit, or be terminated
+by Kubernetes depending on timings.
+* P1' updates the `leader` table first: The `hostname` field of the `leader` row stays the same, but the `start_timestamp` field changes.
+The database update from P2 will fail (since `start_timestamp` does not match). P2 will exit and Kubernetes will
+create a new replacement pod for it (P2'). Now the entire dance starts again, but with P1' as the leader and P2' as the new aspiring leader.
+
+[2] P1 and P1' may (more likely than not) be the same pod and have the same hostname. The implementation
+should avoid this self-reference and fail the API call at the client if the persisted hostname matches
+the current one.
+
+#### Previous Leader Crashes After New Leader Readiness
+
+The deployment's replica sets already satisfy the deployment's replica count requirements and the
+Kubernetes deployment rollout will just clean up the dead pod.
+
+#### New Leader Crashes Before Passing Readiness Check
+
+The deployment controller scales up the new replica set by creating a new pod. The entire procedure is repeated
+with the new pod.
+
+#### Network Partition Between New Pod and Previous Leader
+
+This feels very unlikely, but should be considered in any case. P2 (the new aspiring leader) fails the `/step_down`
+API call into P1 (the current leader). P2 proceeds with the pre-existing startup procedure and updates the `leader` table.
+Kubernetes will terminate P1, but there may be a brief period where both storage controllers can drive reconciles.
+
+### Dealing With Split Brain Scenarios
+
+As we've seen in the previous section, we can end up with two storage controllers running at the same time. The split-brain
+duration is not bounded since the Kubernetes controller might become partitioned from the pods (unlikely though). While these
+scenarios are not fatal, they can cause tenant unavailability, so we'd like to reduce the chances of this happening.
+The rest of this section sketches some safety measures. It's likely overkill to implement all of them, however.
+
+### Ensure Leadership Before Producing Side Effects
+
+The storage controller has two types of side effects: location config requests into pageservers and compute notifications into the control plane.
+Before issuing either, the storage controller could check that it is indeed still the leader by querying the database. Side effects might still be
+applied if they race with the database update, but the situation will eventually be detected. The storage controller process should terminate in these cases.
+
+### Leadership Lease
+
+Up until now, the leadership defined by this RFC is static. In order to bound the length of the split-brain scenario, we could require the leadership
+to be renewed periodically. Two new columns would be added to the `leader` table:
+1. `last_renewed` - timestamp indicating when the lease was last renewed
+2. `lease_duration` - duration indicating the amount of time after which the lease expires
+
+The leader periodically attempts to renew the lease by checking that it is in fact still the legitimate leader and updating `last_renewed` in the
+same transaction. If the update fails, the process exits. New storage controller instances wishing to become leaders must wait for the current lease
+to expire before acquiring leadership if they have not successfully received a response to the `/step_down` request.
+
+### Notify Pageserver Of Storage Controller Term
+
+Each time that leadership changes, we can bump a `term` integer column in the `leader` table. This term uniquely identifies a leader.
+Location config requests and re-attach responses can include this term. On the pageserver side, keep the latest term in memory and refuse
+anything which contains a stale term (i.e. smaller than the current one).
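The pageserver-side term check just described could be as simple as an atomic max over the highest term seen so far. The following is a minimal illustrative sketch; the names, error type, and entry point are invented for the example and are not part of the proposed implementation.

```rust
use std::sync::atomic::{AtomicU64, Ordering};

/// Highest storage controller term observed by this pageserver (illustrative).
static LATEST_CONTROLLER_TERM: AtomicU64 = AtomicU64::new(0);

/// Accept a request only if its term is not older than the highest term seen so far.
fn admit_controller_request(request_term: u64) -> Result<(), String> {
    // fetch_max atomically raises the stored term and returns the previous value.
    let prev = LATEST_CONTROLLER_TERM.fetch_max(request_term, Ordering::SeqCst);
    if request_term < prev {
        Err(format!(
            "stale storage controller term {request_term}, latest seen is {prev}"
        ))
    } else {
        Ok(())
    }
}

fn main() {
    assert!(admit_controller_request(3).is_ok());
    assert!(admit_controller_request(5).is_ok());
    // A request carrying an older term is now refused.
    assert!(admit_controller_request(4).is_err());
}
```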
+
+### Observability
+
+* The storage controller should expose a metric which describes its state (`Active | WarmingUp | SteppedDown`).
+Per-region alerts should be added on this metric which trigger when:
+  + no storage controller has been in the `Active` state for an extended period of time
+  + more than one storage controller is in the `Active` state
+
+* An alert that periodically verifies that the `leader` table is in sync with the metric above would be very useful.
+We'd have to expose the storage controller read-only database to Grafana (perhaps it is already done).
+
+## Alternatives
+
+### Kubernetes Leases
+
+Kubernetes has a [lease primitive](https://kubernetes.io/docs/concepts/architecture/leases/) which can be used to implement leader election.
+Only one instance may hold a lease at any given time. This lease needs to be periodically renewed and has an expiration period.
+
+In our case, it would work something like this:
+* `/step_down` deletes the lease or stops it from renewing
+* lease acquisition becomes part of the start-up procedure
+
+The kubert crate implements a [lightweight lease API](https://docs.rs/kubert/latest/kubert/lease/struct.LeaseManager.html), but it's still
+not exactly trivial to implement.
+
+This approach has the benefit of baked-in observability (`kubectl describe lease`), but:
+* We offload the responsibility to Kubernetes, which makes it harder to debug when things go wrong.
+* More code surface than the simple "row in database" approach. Also, most of this code would be in
+a dependency not subject to code review, etc.
+* Hard to test. Our testing infra does not run the storage controller in Kubernetes and changing it to do
+so is not simple and complicates the test set-up.
+
+To my mind, the "row in database" approach is straightforward enough that we don't have to offload this
+to something external.

From a889a49e06101a91d548eb66d3ba1c0d89d7fb53 Mon Sep 17 00:00:00 2001
From: Yuchen Liang <70461588+yliang412@users.noreply.github.com>
Date: Wed, 28 Aug 2024 10:54:42 -0400
Subject: [PATCH 1515/1571] pageserver: do vectored read on each dio-aligned section once (#8763)

Part of #8130, closes #8719.

## Problem

Currently, vectored blob io only coalesces blocks if they are immediately
adjacent to each other. When we switch to Direct IO, we need a way to coalesce
blobs that are within the dio-aligned boundary but have gaps between them.

## Summary of changes

- Introduces a `VectoredReadCoalesceMode` for `VectoredReadPlanner` and
  `StreamingVectoredReadPlanner` which has two modes:
  - `AdjacentOnly` (current implementation)
  - `Chunked()`
- New `ChunkedVectoredReadBuilder` that considers batching `dio-align`-sized
  reads; the start and end of the vectored read will respect
  `stx_dio_offset_align` / `stx_dio_mem_align` (`vectored_read.start` and
  `vectored_read.blobs_at.first().start_offset` will be two different values).
  See the sketch after this list.
- Since we break the assumption that blobs within a single `VectoredRead` are
  next to each other (implicit end offset), we start to store blob end offsets
  in the `VectoredRead`.
- Adapted existing tests to run in both `VectoredReadCoalesceMode`s.
- The io alignment can also be live configured at runtime.
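The chunk arithmetic that drives this coalescing can be pictured with a tiny standalone sketch. This is illustrative only; the patch's real logic lives in `ChunkedVectoredReadBuilder` and the `div_round_up` helper in the diff that follows.

```rust
/// Round-up division, mirroring the `div_round_up` helper added in this patch.
fn div_round_up(x: usize, d: usize) -> usize {
    (x + (d - 1)) / d
}

/// Map a blob's byte range to the half-open range of chunks that must be read,
/// so that the resulting read starts and ends on chunk (alignment) boundaries.
fn chunk_range(start: usize, end: usize, chunk_size: usize) -> (usize, usize) {
    (start / chunk_size, div_round_up(end, chunk_size))
}

fn main() {
    // With 512-byte chunks, a blob spanning bytes 700..1300 needs chunks 1..3,
    // i.e. the aligned read covers bytes 512..1536.
    assert_eq!(chunk_range(700, 1300, 512), (1, 3));
}
```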
Signed-off-by: Yuchen Liang --- .github/workflows/_build-and-test-locally.yml | 4 +- pageserver/benches/bench_ingest.rs | 8 +- pageserver/client/src/mgmt_api.rs | 10 + pageserver/ctl/src/layer_map_analyzer.rs | 7 +- pageserver/ctl/src/layers.rs | 9 +- pageserver/ctl/src/main.rs | 7 +- .../pagebench/src/cmd/getpage_latest_lsn.rs | 9 + pageserver/src/bin/pageserver.rs | 7 +- pageserver/src/config.rs | 18 + pageserver/src/http/routes.rs | 17 + .../src/tenant/storage_layer/delta_layer.rs | 6 +- .../src/tenant/storage_layer/image_layer.rs | 2 +- pageserver/src/tenant/vectored_blob_io.rs | 352 ++++++++++++++++-- pageserver/src/virtual_file.rs | 61 ++- test_runner/fixtures/neon_fixtures.py | 10 + test_runner/fixtures/parametrize.py | 5 + 16 files changed, 480 insertions(+), 52 deletions(-) diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 5e9fff0e6a..a8526fc6b1 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -217,7 +217,9 @@ jobs: ${cov_prefix} cargo test --doc $CARGO_FLAGS $CARGO_FEATURES for io_engine in std-fs tokio-epoll-uring ; do - NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES + for io_buffer_alignment in 0 1 512 ; do + NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT=$io_buffer_alignment ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES + done done # Run separate tests for real S3 diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index bd99f5289d..f450f46efa 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -4,7 +4,7 @@ use bytes::Bytes; use camino::Utf8PathBuf; use criterion::{criterion_group, criterion_main, Criterion}; use pageserver::{ - config::PageServerConf, + config::{defaults::DEFAULT_IO_BUFFER_ALIGNMENT, PageServerConf}, context::{DownloadBehavior, RequestContext}, l0_flush::{L0FlushConfig, L0FlushGlobalState}, page_cache, @@ -164,7 +164,11 @@ fn criterion_benchmark(c: &mut Criterion) { let conf: &'static PageServerConf = Box::leak(Box::new( pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()), )); - virtual_file::init(16384, virtual_file::io_engine_for_bench()); + virtual_file::init( + 16384, + virtual_file::io_engine_for_bench(), + DEFAULT_IO_BUFFER_ALIGNMENT, + ); page_cache::init(conf.page_cache_size); { diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index ac3ff1bb89..71d36f3113 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -506,6 +506,16 @@ impl Client { .map_err(Error::ReceiveBody) } + /// Configs io buffer alignment at runtime. + pub async fn put_io_alignment(&self, align: usize) -> Result<()> { + let uri = format!("{}/v1/io_alignment", self.mgmt_api_endpoint); + self.request(Method::PUT, uri, align) + .await? 
+ .json() + .await + .map_err(Error::ReceiveBody) + } + pub async fn get_utilization(&self) -> Result { let uri = format!("{}/v1/utilization", self.mgmt_api_endpoint); self.get(uri) diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs index b4bb239f44..8092c203c3 100644 --- a/pageserver/ctl/src/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -4,6 +4,7 @@ use anyhow::Result; use camino::{Utf8Path, Utf8PathBuf}; +use pageserver::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT; use pageserver::context::{DownloadBehavior, RequestContext}; use pageserver::task_mgr::TaskKind; use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; @@ -144,7 +145,11 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> { let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); // Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree. - pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); + pageserver::virtual_file::init( + 10, + virtual_file::api::IoEngineKind::StdFs, + DEFAULT_IO_BUFFER_ALIGNMENT, + ); pageserver::page_cache::init(100); let mut total_delta_layers = 0usize; diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index 3611b0baab..a183a3968d 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -3,6 +3,7 @@ use std::path::{Path, PathBuf}; use anyhow::Result; use camino::{Utf8Path, Utf8PathBuf}; use clap::Subcommand; +use pageserver::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT; use pageserver::context::{DownloadBehavior, RequestContext}; use pageserver::task_mgr::TaskKind; use pageserver::tenant::block_io::BlockCursor; @@ -59,7 +60,7 @@ pub(crate) enum LayerCmd { async fn read_delta_file(path: impl AsRef, ctx: &RequestContext) -> Result<()> { let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path"); - virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); + virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs, 1); page_cache::init(100); let file = VirtualFile::open(path, ctx).await?; let file_id = page_cache::next_file_id(); @@ -189,7 +190,11 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { new_tenant_id, new_timeline_id, } => { - pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); + pageserver::virtual_file::init( + 10, + virtual_file::api::IoEngineKind::StdFs, + DEFAULT_IO_BUFFER_ALIGNMENT, + ); pageserver::page_cache::init(100); let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index 3fabf62987..7a6c7675bb 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -20,6 +20,7 @@ use clap::{Parser, Subcommand}; use index_part::IndexPartCmd; use layers::LayerCmd; use pageserver::{ + config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT, context::{DownloadBehavior, RequestContext}, page_cache, task_mgr::TaskKind, @@ -205,7 +206,11 @@ fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> { async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> { // Basic initialization of things that don't change after startup - virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); + virtual_file::init( + 10, + virtual_file::api::IoEngineKind::StdFs, + DEFAULT_IO_BUFFER_ALIGNMENT, + ); page_cache::init(100); let ctx = RequestContext::new(TaskKind::DebugTool, 
DownloadBehavior::Error); dump_layerfile_from_path(path, true, &ctx).await diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index 4992f37465..ac4a732377 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -58,6 +58,11 @@ pub(crate) struct Args { /// [`pageserver_api::models::virtual_file::IoEngineKind`]. #[clap(long)] set_io_engine: Option, + + /// Before starting the benchmark, live-reconfigure the pageserver to use specified alignment for io buffers. + #[clap(long)] + set_io_alignment: Option, + targets: Option>, } @@ -124,6 +129,10 @@ async fn main_impl( mgmt_api_client.put_io_engine(engine_str).await?; } + if let Some(align) = args.set_io_alignment { + mgmt_api_client.put_io_alignment(align).await?; + } + // discover targets let timelines: Vec = crate::util::cli::targets::discover( &mgmt_api_client, diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 7d404e50a5..850bd87b95 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -125,6 +125,7 @@ fn main() -> anyhow::Result<()> { info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings"); info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access"); + info!(?conf.io_buffer_alignment, "starting with setting for IO buffer alignment"); // The tenants directory contains all the pageserver local disk state. // Create if not exists and make sure all the contents are durable before proceeding. @@ -182,7 +183,11 @@ fn main() -> anyhow::Result<()> { let scenario = failpoint_support::init(); // Basic initialization of things that don't change after startup - virtual_file::init(conf.max_file_descriptors, conf.virtual_file_io_engine); + virtual_file::init( + conf.max_file_descriptors, + conf.virtual_file_io_engine, + conf.io_buffer_alignment, + ); page_cache::init(conf.page_cache_size); start_pageserver(launch_ts, conf).context("Failed to start pageserver")?; diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 0ebaf78840..ae473bcc5f 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -95,6 +95,8 @@ pub mod defaults { pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0; + pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 0; + /// /// Default built-in configuration file. 
/// @@ -289,6 +291,8 @@ pub struct PageServerConf { /// Direct IO settings pub virtual_file_direct_io: virtual_file::DirectIoMode, + + pub io_buffer_alignment: usize, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -393,6 +397,8 @@ struct PageServerConfigBuilder { compact_level0_phase1_value_access: BuilderValue, virtual_file_direct_io: BuilderValue, + + io_buffer_alignment: BuilderValue, } impl PageServerConfigBuilder { @@ -481,6 +487,7 @@ impl PageServerConfigBuilder { l0_flush: Set(L0FlushConfig::default()), compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()), virtual_file_direct_io: Set(virtual_file::DirectIoMode::default()), + io_buffer_alignment: Set(DEFAULT_IO_BUFFER_ALIGNMENT), } } } @@ -660,6 +667,10 @@ impl PageServerConfigBuilder { self.virtual_file_direct_io = BuilderValue::Set(value); } + pub fn io_buffer_alignment(&mut self, value: usize) { + self.io_buffer_alignment = BuilderValue::Set(value); + } + pub fn build(self, id: NodeId) -> anyhow::Result { let default = Self::default_values(); @@ -716,6 +727,7 @@ impl PageServerConfigBuilder { l0_flush, compact_level0_phase1_value_access, virtual_file_direct_io, + io_buffer_alignment, } CUSTOM LOGIC { @@ -985,6 +997,9 @@ impl PageServerConf { "virtual_file_direct_io" => { builder.virtual_file_direct_io(utils::toml_edit_ext::deserialize_item(item).context("virtual_file_direct_io")?) } + "io_buffer_alignment" => { + builder.io_buffer_alignment(parse_toml_u64("io_buffer_alignment", item)? as usize) + } _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -1068,6 +1083,7 @@ impl PageServerConf { l0_flush: L0FlushConfig::default(), compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), virtual_file_direct_io: virtual_file::DirectIoMode::default(), + io_buffer_alignment: defaults::DEFAULT_IO_BUFFER_ALIGNMENT, } } } @@ -1308,6 +1324,7 @@ background_task_maximum_delay = '334 s' l0_flush: L0FlushConfig::default(), compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), virtual_file_direct_io: virtual_file::DirectIoMode::default(), + io_buffer_alignment: defaults::DEFAULT_IO_BUFFER_ALIGNMENT, }, "Correct defaults should be used when no config values are provided" ); @@ -1381,6 +1398,7 @@ background_task_maximum_delay = '334 s' l0_flush: L0FlushConfig::default(), compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), virtual_file_direct_io: virtual_file::DirectIoMode::default(), + io_buffer_alignment: defaults::DEFAULT_IO_BUFFER_ALIGNMENT, }, "Should be able to parse all basic config values correctly" ); diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index cbcc162b32..a126136d20 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2344,6 +2344,20 @@ async fn put_io_engine_handler( json_response(StatusCode::OK, ()) } +async fn put_io_alignment_handler( + mut r: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + check_permission(&r, None)?; + let align: usize = json_request(&mut r).await?; + crate::virtual_file::set_io_buffer_alignment(align).map_err(|align| { + ApiError::PreconditionFailed( + format!("Requested io alignment ({align}) is not a power of two").into(), + ) + })?; + json_response(StatusCode::OK, ()) +} + /// Polled by control plane. /// /// See [`crate::utilization`]. 
@@ -3031,6 +3045,9 @@ pub fn make_router( |r| api_handler(r, timeline_collect_keyspace), ) .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler)) + .put("/v1/io_alignment", |r| { + api_handler(r, put_io_alignment_handler) + }) .put( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch", |r| api_handler(r, force_aux_policy_switch_handler), diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index f4a2957972..c0508e13c0 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -40,7 +40,7 @@ use crate::tenant::storage_layer::layer::S3_UPLOAD_LIMIT; use crate::tenant::timeline::GetVectoredError; use crate::tenant::vectored_blob_io::{ BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, - VectoredReadPlanner, + VectoredReadCoalesceMode, VectoredReadPlanner, }; use crate::tenant::PageReconstructError; use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt}; @@ -1205,6 +1205,7 @@ impl DeltaLayerInner { let mut prev: Option<(Key, Lsn, BlobRef)> = None; let mut read_builder: Option = None; + let read_mode = VectoredReadCoalesceMode::get(); let max_read_size = self .max_vectored_read_bytes @@ -1253,6 +1254,7 @@ impl DeltaLayerInner { offsets.end.pos(), meta, max_read_size, + read_mode, )) } } else { @@ -2281,7 +2283,7 @@ pub(crate) mod test { .await .unwrap(); let delta_layer = resident_layer.get_as_delta(&ctx).await.unwrap(); - for max_read_size in [1, 1024] { + for max_read_size in [1, 2048] { for batch_size in [1, 2, 4, 8, 3, 7, 13] { println!("running with batch_size={batch_size} max_read_size={max_read_size}"); // Test if the batch size is correctly determined diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 3cb2b1c83a..38411e9d9e 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -1367,7 +1367,7 @@ mod test { .await .unwrap(); let img_layer = resident_layer.get_as_image(&ctx).await.unwrap(); - for max_read_size in [1, 1024] { + for max_read_size in [1, 2048] { for batch_size in [1, 2, 4, 8, 3, 7, 13] { println!("running with batch_size={batch_size} max_read_size={max_read_size}"); // Test if the batch size is correctly determined diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 54a3ad789b..80bc56092d 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -25,9 +25,10 @@ use tokio_epoll_uring::BoundedBuf; use utils::lsn::Lsn; use utils::vec_map::VecMap; +use crate::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT; use crate::context::RequestContext; use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, LEN_COMPRESSION_BIT_MASK}; -use crate::virtual_file::VirtualFile; +use crate::virtual_file::{self, VirtualFile}; #[derive(Copy, Clone, Debug, PartialEq, Eq)] pub struct MaxVectoredReadBytes(pub NonZeroUsize); @@ -60,7 +61,7 @@ pub struct VectoredBlobsBuf { pub struct VectoredRead { pub start: u64, pub end: u64, - /// Starting offsets and metadata for each blob in this read + /// Start offset and metadata for each blob in this read pub blobs_at: VecMap, } @@ -76,14 +77,109 @@ pub(crate) enum VectoredReadExtended { No, } -pub(crate) struct VectoredReadBuilder { +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub enum 
VectoredReadCoalesceMode { + /// Only coalesce exactly adjacent reads. + AdjacentOnly, + /// In addition to adjacent reads, also consider reads whose corresponding + /// `end` and `start` offsets reside at the same chunk. + Chunked(usize), +} + +impl VectoredReadCoalesceMode { + /// [`AdjacentVectoredReadBuilder`] is used if alignment requirement is 0, + /// whereas [`ChunkedVectoredReadBuilder`] is used for alignment requirement 1 and higher. + pub(crate) fn get() -> Self { + let align = virtual_file::get_io_buffer_alignment_raw(); + if align == DEFAULT_IO_BUFFER_ALIGNMENT { + VectoredReadCoalesceMode::AdjacentOnly + } else { + VectoredReadCoalesceMode::Chunked(align) + } + } +} + +pub(crate) enum VectoredReadBuilder { + Adjacent(AdjacentVectoredReadBuilder), + Chunked(ChunkedVectoredReadBuilder), +} + +impl VectoredReadBuilder { + fn new_impl( + start_offset: u64, + end_offset: u64, + meta: BlobMeta, + max_read_size: Option, + mode: VectoredReadCoalesceMode, + ) -> Self { + match mode { + VectoredReadCoalesceMode::AdjacentOnly => Self::Adjacent( + AdjacentVectoredReadBuilder::new(start_offset, end_offset, meta, max_read_size), + ), + VectoredReadCoalesceMode::Chunked(chunk_size) => { + Self::Chunked(ChunkedVectoredReadBuilder::new( + start_offset, + end_offset, + meta, + max_read_size, + chunk_size, + )) + } + } + } + + pub(crate) fn new( + start_offset: u64, + end_offset: u64, + meta: BlobMeta, + max_read_size: usize, + mode: VectoredReadCoalesceMode, + ) -> Self { + Self::new_impl(start_offset, end_offset, meta, Some(max_read_size), mode) + } + + pub(crate) fn new_streaming( + start_offset: u64, + end_offset: u64, + meta: BlobMeta, + mode: VectoredReadCoalesceMode, + ) -> Self { + Self::new_impl(start_offset, end_offset, meta, None, mode) + } + + pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended { + match self { + VectoredReadBuilder::Adjacent(builder) => builder.extend(start, end, meta), + VectoredReadBuilder::Chunked(builder) => builder.extend(start, end, meta), + } + } + + pub(crate) fn build(self) -> VectoredRead { + match self { + VectoredReadBuilder::Adjacent(builder) => builder.build(), + VectoredReadBuilder::Chunked(builder) => builder.build(), + } + } + + pub(crate) fn size(&self) -> usize { + match self { + VectoredReadBuilder::Adjacent(builder) => builder.size(), + VectoredReadBuilder::Chunked(builder) => builder.size(), + } + } +} + +pub(crate) struct AdjacentVectoredReadBuilder { + /// Start offset of the read. start: u64, + // End offset of the read. end: u64, + /// Start offset and metadata for each blob in this read blobs_at: VecMap, max_read_size: Option, } -impl VectoredReadBuilder { +impl AdjacentVectoredReadBuilder { /// Start building a new vectored read. 
/// /// Note that by design, this does not check against reading more than `max_read_size` to @@ -93,7 +189,7 @@ impl VectoredReadBuilder { start_offset: u64, end_offset: u64, meta: BlobMeta, - max_read_size: usize, + max_read_size: Option, ) -> Self { let mut blobs_at = VecMap::default(); blobs_at @@ -104,7 +200,7 @@ impl VectoredReadBuilder { start: start_offset, end: end_offset, blobs_at, - max_read_size: Some(max_read_size), + max_read_size, } } /// Attempt to extend the current read with a new blob if the start @@ -113,13 +209,15 @@ impl VectoredReadBuilder { pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended { tracing::trace!(start, end, "trying to extend"); let size = (end - start) as usize; - if self.end == start && { + let not_limited_by_max_read_size = { if let Some(max_read_size) = self.max_read_size { self.size() + size <= max_read_size } else { true } - } { + }; + + if self.end == start && not_limited_by_max_read_size { self.end = end; self.blobs_at .append(start, meta) @@ -144,6 +242,107 @@ impl VectoredReadBuilder { } } +pub(crate) struct ChunkedVectoredReadBuilder { + /// Start block number + start_blk_no: usize, + /// End block number (exclusive). + end_blk_no: usize, + /// Start offset and metadata for each blob in this read + blobs_at: VecMap, + max_read_size: Option, + /// Chunk size reads are coalesced into. + chunk_size: usize, +} + +/// Computes x / d rounded up. +fn div_round_up(x: usize, d: usize) -> usize { + (x + (d - 1)) / d +} + +impl ChunkedVectoredReadBuilder { + /// Start building a new vectored read. + /// + /// Note that by design, this does not check against reading more than `max_read_size` to + /// support reading larger blobs than the configuration value. The builder will be single use + /// however after that. + pub(crate) fn new( + start_offset: u64, + end_offset: u64, + meta: BlobMeta, + max_read_size: Option, + chunk_size: usize, + ) -> Self { + let mut blobs_at = VecMap::default(); + blobs_at + .append(start_offset, meta) + .expect("First insertion always succeeds"); + + let start_blk_no = start_offset as usize / chunk_size; + let end_blk_no = div_round_up(end_offset as usize, chunk_size); + Self { + start_blk_no, + end_blk_no, + blobs_at, + max_read_size, + chunk_size, + } + } + + /// Attempts to extend the current read with a new blob if the new blob resides in the same or the immediate next chunk. + /// + /// The resulting size also must be below the max read size. + pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended { + tracing::trace!(start, end, "trying to extend"); + let start_blk_no = start as usize / self.chunk_size; + let end_blk_no = div_round_up(end as usize, self.chunk_size); + + let not_limited_by_max_read_size = { + if let Some(max_read_size) = self.max_read_size { + let coalesced_size = (end_blk_no - self.start_blk_no) * self.chunk_size; + coalesced_size <= max_read_size + } else { + true + } + }; + + // True if the second block starts in the same block or the immediate next block where the first block ended. + // + // Note: This automatically handles the case where two blocks are adjacent to each other, + // whether they starts on chunk size boundary or not. + let is_adjacent_chunk_read = { + // 1. first.end & second.start are in the same block + self.end_blk_no == start_blk_no + 1 || + // 2. 
first.end ends one block before second.start + self.end_blk_no == start_blk_no + }; + + if is_adjacent_chunk_read && not_limited_by_max_read_size { + self.end_blk_no = end_blk_no; + self.blobs_at + .append(start, meta) + .expect("LSNs are ordered within vectored reads"); + + return VectoredReadExtended::Yes; + } + + VectoredReadExtended::No + } + + pub(crate) fn size(&self) -> usize { + (self.end_blk_no - self.start_blk_no) * self.chunk_size + } + + pub(crate) fn build(self) -> VectoredRead { + let start = (self.start_blk_no * self.chunk_size) as u64; + let end = (self.end_blk_no * self.chunk_size) as u64; + VectoredRead { + start, + end, + blobs_at: self.blobs_at, + } + } +} + #[derive(Copy, Clone, Debug)] pub enum BlobFlag { None, @@ -166,14 +365,18 @@ pub struct VectoredReadPlanner { prev: Option<(Key, Lsn, u64, BlobFlag)>, max_read_size: usize, + + mode: VectoredReadCoalesceMode, } impl VectoredReadPlanner { pub fn new(max_read_size: usize) -> Self { + let mode = VectoredReadCoalesceMode::get(); Self { blobs: BTreeMap::new(), prev: None, max_read_size, + mode, } } @@ -252,6 +455,7 @@ impl VectoredReadPlanner { end_offset, BlobMeta { key, lsn }, self.max_read_size, + self.mode, ); let prev_read_builder = current_read_builder.replace(next_read_builder); @@ -303,6 +507,18 @@ impl<'a> VectoredBlobReader<'a> { read.size(), buf.capacity() ); + + if cfg!(debug_assertions) { + let align = virtual_file::get_io_buffer_alignment() as u64; + debug_assert_eq!( + read.start % align, + 0, + "Read start at {} does not satisfy the required io buffer alignment ({} bytes)", + read.start, + align + ); + } + let mut buf = self .file .read_exact_at(buf.slice(0..read.size()), read.start, ctx) @@ -310,27 +526,20 @@ impl<'a> VectoredBlobReader<'a> { .into_inner(); let blobs_at = read.blobs_at.as_slice(); - let start_offset = blobs_at.first().expect("VectoredRead is never empty").0; + + let start_offset = read.start; let mut metas = Vec::with_capacity(blobs_at.len()); - // Blobs in `read` only provide their starting offset. The end offset // of a blob is implicit: the start of the next blob if one exists // or the end of the read. - let pairs = blobs_at.iter().zip( - blobs_at - .iter() - .map(Some) - .skip(1) - .chain(std::iter::once(None)), - ); // Some scratch space, put here for reusing the allocation let mut decompressed_vec = Vec::new(); - for ((offset, meta), next) in pairs { - let offset_in_buf = offset - start_offset; - let first_len_byte = buf[offset_in_buf as usize]; + for (blob_start, meta) in blobs_at { + let blob_start_in_buf = blob_start - start_offset; + let first_len_byte = buf[blob_start_in_buf as usize]; // Each blob is prefixed by a header containing its size and compression information. // Extract the size and skip that header to find the start of the data. 
@@ -340,7 +549,7 @@ impl<'a> VectoredBlobReader<'a> { (1, first_len_byte as u64, BYTE_UNCOMPRESSED) } else { let mut blob_size_buf = [0u8; 4]; - let offset_in_buf = offset_in_buf as usize; + let offset_in_buf = blob_start_in_buf as usize; blob_size_buf.copy_from_slice(&buf[offset_in_buf..offset_in_buf + 4]); blob_size_buf[0] &= !LEN_COMPRESSION_BIT_MASK; @@ -353,12 +562,8 @@ impl<'a> VectoredBlobReader<'a> { ) }; - let start_raw = offset_in_buf + size_length; - let end_raw = match next { - Some((next_blob_start_offset, _)) => next_blob_start_offset - start_offset, - None => start_raw + blob_size, - }; - assert_eq!(end_raw - start_raw, blob_size); + let start_raw = blob_start_in_buf + size_length; + let end_raw = start_raw + blob_size; let (start, end); if compression_bits == BYTE_UNCOMPRESSED { start = start_raw as usize; @@ -407,18 +612,22 @@ pub struct StreamingVectoredReadPlanner { max_cnt: usize, /// Size of the current batch cnt: usize, + + mode: VectoredReadCoalesceMode, } impl StreamingVectoredReadPlanner { pub fn new(max_read_size: u64, max_cnt: usize) -> Self { assert!(max_cnt > 0); assert!(max_read_size > 0); + let mode = VectoredReadCoalesceMode::get(); Self { read_builder: None, prev: None, max_cnt, max_read_size, cnt: 0, + mode, } } @@ -467,17 +676,12 @@ impl StreamingVectoredReadPlanner { } None => { self.read_builder = { - let mut blobs_at = VecMap::default(); - blobs_at - .append(start_offset, BlobMeta { key, lsn }) - .expect("First insertion always succeeds"); - - Some(VectoredReadBuilder { - start: start_offset, - end: end_offset, - blobs_at, - max_read_size: None, - }) + Some(VectoredReadBuilder::new_streaming( + start_offset, + end_offset, + BlobMeta { key, lsn }, + self.mode, + )) }; } } @@ -511,7 +715,9 @@ mod tests { use super::*; fn validate_read(read: &VectoredRead, offset_range: &[(Key, Lsn, u64, BlobFlag)]) { - assert_eq!(read.start, offset_range.first().unwrap().2); + let align = virtual_file::get_io_buffer_alignment() as u64; + assert_eq!(read.start % align, 0); + assert_eq!(read.start / align, offset_range.first().unwrap().2 / align); let expected_offsets_in_read: Vec<_> = offset_range.iter().map(|o| o.2).collect(); @@ -525,6 +731,63 @@ mod tests { assert_eq!(expected_offsets_in_read, offsets_in_read); } + #[test] + fn planner_chunked_coalesce_all_test() { + use crate::virtual_file; + + const CHUNK_SIZE: u64 = 512; + virtual_file::set_io_buffer_alignment(CHUNK_SIZE as usize).unwrap(); + let max_read_size = CHUNK_SIZE as usize * 8; + let key = Key::MIN; + let lsn = Lsn(0); + + let blob_descriptions = [ + (key, lsn, CHUNK_SIZE / 8, BlobFlag::None), // Read 1 BEGIN + (key, lsn, CHUNK_SIZE / 4, BlobFlag::Ignore), // Gap + (key, lsn, CHUNK_SIZE / 2, BlobFlag::None), + (key, lsn, CHUNK_SIZE - 2, BlobFlag::Ignore), // Gap + (key, lsn, CHUNK_SIZE, BlobFlag::None), + (key, lsn, CHUNK_SIZE * 2 - 1, BlobFlag::None), + (key, lsn, CHUNK_SIZE * 2 + 1, BlobFlag::Ignore), // Gap + (key, lsn, CHUNK_SIZE * 3 + 1, BlobFlag::None), + (key, lsn, CHUNK_SIZE * 5 + 1, BlobFlag::None), + (key, lsn, CHUNK_SIZE * 6 + 1, BlobFlag::Ignore), // skipped chunk size, but not a chunk: should coalesce. 
+ (key, lsn, CHUNK_SIZE * 7 + 1, BlobFlag::None), + (key, lsn, CHUNK_SIZE * 8, BlobFlag::None), // Read 2 BEGIN (b/c max_read_size) + (key, lsn, CHUNK_SIZE * 9, BlobFlag::Ignore), // ==== skipped a chunk + (key, lsn, CHUNK_SIZE * 10, BlobFlag::None), // Read 3 BEGIN (cannot coalesce) + ]; + + let ranges = [ + &[ + blob_descriptions[0], + blob_descriptions[2], + blob_descriptions[4], + blob_descriptions[5], + blob_descriptions[7], + blob_descriptions[8], + blob_descriptions[10], + ], + &blob_descriptions[11..12], + &blob_descriptions[13..], + ]; + + let mut planner = VectoredReadPlanner::new(max_read_size); + for (key, lsn, offset, flag) in blob_descriptions { + planner.handle(key, lsn, offset, flag); + } + + planner.handle_range_end(652 * 1024); + + let reads = planner.finish(); + + assert_eq!(reads.len(), ranges.len()); + + for (idx, read) in reads.iter().enumerate() { + validate_read(read, ranges[idx]); + } + } + #[test] fn planner_max_read_size_test() { let max_read_size = 128 * 1024; @@ -737,6 +1000,7 @@ mod tests { let reserved_bytes = blobs.iter().map(|bl| bl.len()).max().unwrap() * 2 + 16; let mut buf = BytesMut::with_capacity(reserved_bytes); + let mode = VectoredReadCoalesceMode::get(); let vectored_blob_reader = VectoredBlobReader::new(&file); let meta = BlobMeta { key: Key::MIN, @@ -748,7 +1012,7 @@ mod tests { if idx + 1 == offsets.len() { continue; } - let read_builder = VectoredReadBuilder::new(*offset, *end, meta, 16 * 4096); + let read_builder = VectoredReadBuilder::new(*offset, *end, meta, 16 * 4096, mode); let read = read_builder.build(); let result = vectored_blob_reader.read_blobs(&read, buf, &ctx).await?; assert_eq!(result.blobs.len(), 1); @@ -784,4 +1048,12 @@ mod tests { round_trip_test_compressed(&blobs, true).await?; Ok(()) } + + #[test] + fn test_div_round_up() { + const CHUNK_SIZE: usize = 512; + assert_eq!(1, div_round_up(200, CHUNK_SIZE)); + assert_eq!(1, div_round_up(CHUNK_SIZE, CHUNK_SIZE)); + assert_eq!(2, div_round_up(CHUNK_SIZE + 1, CHUNK_SIZE)); + } } diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index c0017280fd..4b11dc1a94 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -10,6 +10,7 @@ //! This is similar to PostgreSQL's virtual file descriptor facility in //! src/backend/storage/file/fd.c //! +use crate::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT; use crate::context::RequestContext; use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC}; @@ -1140,10 +1141,13 @@ impl OpenFiles { /// server startup. /// #[cfg(not(test))] -pub fn init(num_slots: usize, engine: IoEngineKind) { +pub fn init(num_slots: usize, engine: IoEngineKind, io_buffer_alignment: usize) { if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() { panic!("virtual_file::init called twice"); } + if set_io_buffer_alignment(io_buffer_alignment).is_err() { + panic!("IO buffer alignment ({io_buffer_alignment}) is not a power of two"); + } io_engine::init(engine); crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64); } @@ -1167,6 +1171,61 @@ fn get_open_files() -> &'static OpenFiles { } } +static IO_BUFFER_ALIGNMENT: AtomicUsize = AtomicUsize::new(DEFAULT_IO_BUFFER_ALIGNMENT); + +/// Returns true if `x` is zero or a power of two. 
+fn is_zero_or_power_of_two(x: usize) -> bool { + (x == 0) || ((x & (x - 1)) == 0) +} + +#[allow(unused)] +pub(crate) fn set_io_buffer_alignment(align: usize) -> Result<(), usize> { + if is_zero_or_power_of_two(align) { + IO_BUFFER_ALIGNMENT.store(align, std::sync::atomic::Ordering::Relaxed); + Ok(()) + } else { + Err(align) + } +} + +/// Gets the io buffer alignment requirement. Returns 0 if there is no requirement specified. +/// +/// This function should be used to check the raw config value. +pub(crate) fn get_io_buffer_alignment_raw() -> usize { + let align = IO_BUFFER_ALIGNMENT.load(std::sync::atomic::Ordering::Relaxed); + + if cfg!(test) { + let env_var_name = "NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT"; + if align == DEFAULT_IO_BUFFER_ALIGNMENT { + if let Some(test_align) = utils::env::var(env_var_name) { + if is_zero_or_power_of_two(test_align) { + test_align + } else { + panic!("IO buffer alignment ({test_align}) is not a power of two"); + } + } else { + crate::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT + } + } else { + align + } + } else { + align + } +} + +/// Gets the io buffer alignment requirement. Returns 1 if the alignment config is set to zero. +/// +/// This function should be used for getting the actual alignment value to use. +pub(crate) fn get_io_buffer_alignment() -> usize { + let align = get_io_buffer_alignment_raw(); + if align == DEFAULT_IO_BUFFER_ALIGNMENT { + 1 + } else { + align + } +} + #[cfg(test)] mod tests { use crate::context::DownloadBehavior; diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 92febfec9b..69a4234617 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -496,6 +496,7 @@ class NeonEnvBuilder: pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]] = None, safekeeper_extra_opts: Optional[list[str]] = None, storage_controller_port_override: Optional[int] = None, + pageserver_io_buffer_alignment: Optional[int] = None, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override @@ -550,6 +551,8 @@ class NeonEnvBuilder: self.storage_controller_port_override = storage_controller_port_override + self.pageserver_io_buffer_alignment = pageserver_io_buffer_alignment + assert test_name.startswith( "test_" ), "Unexpectedly instantiated from outside a test function" @@ -1123,6 +1126,7 @@ class NeonEnv: self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine self.pageserver_aux_file_policy = config.pageserver_aux_file_policy + self.pageserver_io_buffer_alignment = config.pageserver_io_buffer_alignment # Create the neon_local's `NeonLocalInitConf` cfg: Dict[str, Any] = { @@ -1184,6 +1188,8 @@ class NeonEnv: for key, value in override.items(): ps_cfg[key] = value + ps_cfg["io_buffer_alignment"] = self.pageserver_io_buffer_alignment + # Create a corresponding NeonPageserver object self.pageservers.append( NeonPageserver( @@ -1425,6 +1431,7 @@ def _shared_simple_env( pageserver_virtual_file_io_engine: str, pageserver_aux_file_policy: Optional[AuxFileStore], pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]], + pageserver_io_buffer_alignment: Optional[int], ) -> Iterator[NeonEnv]: """ # Internal fixture backing the `neon_simple_env` fixture. 
If TEST_SHARED_FIXTURES @@ -1457,6 +1464,7 @@ def _shared_simple_env( pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine, pageserver_aux_file_policy=pageserver_aux_file_policy, pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm, + pageserver_io_buffer_alignment=pageserver_io_buffer_alignment, ) as builder: env = builder.init_start() @@ -1499,6 +1507,7 @@ def neon_env_builder( pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]], pageserver_aux_file_policy: Optional[AuxFileStore], record_property: Callable[[str, object], None], + pageserver_io_buffer_alignment: Optional[int], ) -> Iterator[NeonEnvBuilder]: """ Fixture to create a Neon environment for test. @@ -1534,6 +1543,7 @@ def neon_env_builder( test_overlay_dir=test_overlay_dir, pageserver_aux_file_policy=pageserver_aux_file_policy, pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm, + pageserver_io_buffer_alignment=pageserver_io_buffer_alignment, ) as builder: yield builder # Propogate `preserve_database_files` to make it possible to use in other fixtures, diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py index 92c98763e3..e2dd51802c 100644 --- a/test_runner/fixtures/parametrize.py +++ b/test_runner/fixtures/parametrize.py @@ -34,6 +34,11 @@ def pageserver_virtual_file_io_engine() -> Optional[str]: return os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE") +@pytest.fixture(scope="function", autouse=True) +def pageserver_io_buffer_alignment() -> Optional[int]: + return None + + @pytest.fixture(scope="function", autouse=True) def pageserver_aux_file_policy() -> Optional[AuxFileStore]: return None From 793b5061ecb2ab20250c762248afd24e23ed1e16 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 28 Aug 2024 18:23:55 +0100 Subject: [PATCH 1516/1571] storcon: track pageserver availability zone (#8852) ## Problem In order to build AZ aware scheduling, the storage controller needs to know what AZ pageservers are in. Related https://github.com/neondatabase/neon/issues/8848 ## Summary of changes This patch set adds a new nullable column to the `nodes` table: `availability_zone_id`. The node registration request is extended to include the AZ id (pageservers already have this in their `metadata.json` file). If the node is already registered, then we update the persistent and in-memory state with the provided AZ. Otherwise, we add the node with the AZ to begin with. A couple assumptions are made here: 1. Pageserver AZ ids are stable 2. AZ ids do not change over time Once all pageservers have a configured AZ, we can remove the optionals in the code and make the database column not nullable. 
--- control_plane/storcon_cli/src/main.rs | 4 + libs/pageserver_api/src/controller_api.rs | 2 + pageserver/src/control_plane_client.rs | 6 ++ .../2024-08-27-184400_pageserver_az/down.sql | 1 + .../2024-08-27-184400_pageserver_az/up.sql | 1 + storage_controller/src/node.rs | 23 ++++- storage_controller/src/persistence.rs | 27 ++++++ storage_controller/src/scheduler.rs | 1 + storage_controller/src/schema.rs | 1 + storage_controller/src/service.rs | 93 +++++++++++++++---- .../fixtures/pageserver/allowed_errors.py | 3 + 11 files changed, 143 insertions(+), 19 deletions(-) create mode 100644 storage_controller/migrations/2024-08-27-184400_pageserver_az/down.sql create mode 100644 storage_controller/migrations/2024-08-27-184400_pageserver_az/up.sql diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 35510ccbca..5cce6cf3ae 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -41,6 +41,8 @@ enum Command { listen_http_addr: String, #[arg(long)] listen_http_port: u16, + #[arg(long)] + availability_zone_id: String, }, /// Modify a node's configuration in the storage controller @@ -322,6 +324,7 @@ async fn main() -> anyhow::Result<()> { listen_pg_port, listen_http_addr, listen_http_port, + availability_zone_id, } => { storcon_client .dispatch::<_, ()>( @@ -333,6 +336,7 @@ async fn main() -> anyhow::Result<()> { listen_pg_port, listen_http_addr, listen_http_port, + availability_zone_id: Some(availability_zone_id), }), ) .await?; diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index a9a57d77ce..345abd69b6 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -56,6 +56,8 @@ pub struct NodeRegisterRequest { pub listen_http_addr: String, pub listen_http_port: u16, + + pub availability_zone_id: Option, } #[derive(Serialize, Deserialize)] diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs index b5d9267d79..56a536c387 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/control_plane_client.rs @@ -141,12 +141,18 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient { m.other ); + let az_id = m + .other + .get("availability_zone_id") + .and_then(|jv| jv.as_str().map(|str| str.to_owned())); + Some(NodeRegisterRequest { node_id: conf.id, listen_pg_addr: m.postgres_host, listen_pg_port: m.postgres_port, listen_http_addr: m.http_host, listen_http_port: m.http_port, + availability_zone_id: az_id, }) } Err(e) => { diff --git a/storage_controller/migrations/2024-08-27-184400_pageserver_az/down.sql b/storage_controller/migrations/2024-08-27-184400_pageserver_az/down.sql new file mode 100644 index 0000000000..22df81c83c --- /dev/null +++ b/storage_controller/migrations/2024-08-27-184400_pageserver_az/down.sql @@ -0,0 +1 @@ +ALTER TABLE nodes DROP availability_zone_id; diff --git a/storage_controller/migrations/2024-08-27-184400_pageserver_az/up.sql b/storage_controller/migrations/2024-08-27-184400_pageserver_az/up.sql new file mode 100644 index 0000000000..7112f92bf2 --- /dev/null +++ b/storage_controller/migrations/2024-08-27-184400_pageserver_az/up.sql @@ -0,0 +1 @@ +ALTER TABLE nodes ADD availability_zone_id VARCHAR; diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index 61a44daca9..73cecc491d 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -36,6 +36,8 @@ pub(crate) struct Node { listen_pg_addr: 
String, listen_pg_port: u16, + availability_zone_id: Option, + // This cancellation token means "stop any RPCs in flight to this node, and don't start // any more". It is not related to process shutdown. #[serde(skip)] @@ -61,6 +63,10 @@ impl Node { self.id } + pub(crate) fn get_availability_zone_id(&self) -> Option<&str> { + self.availability_zone_id.as_deref() + } + pub(crate) fn get_scheduling(&self) -> NodeSchedulingPolicy { self.scheduling } @@ -72,7 +78,18 @@ impl Node { /// Does this registration request match `self`? This is used when deciding whether a registration /// request should be allowed to update an existing record with the same node ID. pub(crate) fn registration_match(&self, register_req: &NodeRegisterRequest) -> bool { - self.id == register_req.node_id + let az_ids_match = { + match ( + self.availability_zone_id.as_deref(), + register_req.availability_zone_id.as_deref(), + ) { + (Some(current_az), Some(register_req_az)) => current_az == register_req_az, + _ => true, + } + }; + + az_ids_match + && self.id == register_req.node_id && self.listen_http_addr == register_req.listen_http_addr && self.listen_http_port == register_req.listen_http_port && self.listen_pg_addr == register_req.listen_pg_addr @@ -173,6 +190,7 @@ impl Node { listen_http_port: u16, listen_pg_addr: String, listen_pg_port: u16, + availability_zone_id: Option, ) -> Self { Self { id, @@ -182,6 +200,7 @@ impl Node { listen_pg_port, scheduling: NodeSchedulingPolicy::Active, availability: NodeAvailability::Offline, + availability_zone_id, cancel: CancellationToken::new(), } } @@ -194,6 +213,7 @@ impl Node { listen_http_port: self.listen_http_port as i32, listen_pg_addr: self.listen_pg_addr.clone(), listen_pg_port: self.listen_pg_port as i32, + availability_zone_id: self.availability_zone_id.clone(), } } @@ -208,6 +228,7 @@ impl Node { listen_http_port: np.listen_http_port as u16, listen_pg_addr: np.listen_pg_addr, listen_pg_port: np.listen_pg_port as u16, + availability_zone_id: np.availability_zone_id, cancel: CancellationToken::new(), } } diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 1a905753a1..a842079ce7 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -103,6 +103,7 @@ pub(crate) enum DatabaseOperation { ListMetadataHealthOutdated, GetLeader, UpdateLeader, + SetNodeAzId, } #[must_use] @@ -315,6 +316,31 @@ impl Persistence { } } + pub(crate) async fn set_node_availability_zone_id( + &self, + input_node_id: NodeId, + input_az_id: String, + ) -> DatabaseResult<()> { + use crate::schema::nodes::dsl::*; + let updated = self + .with_measured_conn(DatabaseOperation::SetNodeAzId, move |conn| { + let updated = diesel::update(nodes) + .filter(node_id.eq(input_node_id.0 as i64)) + .set((availability_zone_id.eq(input_az_id.clone()),)) + .execute(conn)?; + Ok(updated) + }) + .await?; + + if updated != 1 { + Err(DatabaseError::Logical(format!( + "Node {node_id:?} not found for setting az id", + ))) + } else { + Ok(()) + } + } + /// At startup, load the high level state for shards, such as their config + policy. This will /// be enriched at runtime with state discovered on pageservers. pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult> { @@ -974,6 +1000,7 @@ pub(crate) struct NodePersistence { pub(crate) listen_http_port: i32, pub(crate) listen_pg_addr: String, pub(crate) listen_pg_port: i32, + pub(crate) availability_zone_id: Option, } /// Tenant metadata health status that are stored durably. 
diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index 060e3cc6ca..ef4da6861c 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -528,6 +528,7 @@ pub(crate) mod test_utils { 80 + i as u16, format!("pghost-{i}"), 5432 + i as u16, + None, ); node.set_availability(NodeAvailability::Active(test_utilization::simple(0, 0))); assert!(node.is_available()); diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs index 77ba47e114..1e8379500c 100644 --- a/storage_controller/src/schema.rs +++ b/storage_controller/src/schema.rs @@ -25,6 +25,7 @@ diesel::table! { listen_http_port -> Int4, listen_pg_addr -> Varchar, listen_pg_port -> Int4, + availability_zone_id -> Nullable, } } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 7daa1e4f5f..1f221a9b45 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1257,6 +1257,7 @@ impl Service { 123, "".to_string(), 123, + None, ); scheduler.node_upsert(&node); @@ -4683,29 +4684,84 @@ impl Service { ) .await; - { + if register_req.availability_zone_id.is_none() { + tracing::warn!( + "Node {} registering without specific availability zone id", + register_req.node_id + ); + } + + enum RegistrationStatus { + Matched(Node), + Mismatched, + New, + } + + let registration_status = { let locked = self.inner.read().unwrap(); if let Some(node) = locked.nodes.get(®ister_req.node_id) { - // Note that we do not do a total equality of the struct, because we don't require - // the availability/scheduling states to agree for a POST to be idempotent. if node.registration_match(®ister_req) { - tracing::info!( - "Node {} re-registered with matching address", - register_req.node_id - ); - return Ok(()); + RegistrationStatus::Matched(node.clone()) } else { - // TODO: decide if we want to allow modifying node addresses without removing and re-adding - // the node. Safest/simplest thing is to refuse it, and usually we deploy with - // a fixed address through the lifetime of a node. - tracing::warn!( - "Node {} tried to register with different address", - register_req.node_id - ); - return Err(ApiError::Conflict( - "Node is already registered with different address".to_string(), - )); + RegistrationStatus::Mismatched } + } else { + RegistrationStatus::New + } + }; + + match registration_status { + RegistrationStatus::Matched(node) => { + tracing::info!( + "Node {} re-registered with matching address", + register_req.node_id + ); + + if node.get_availability_zone_id().is_none() { + if let Some(az_id) = register_req.availability_zone_id.clone() { + tracing::info!("Extracting availability zone id from registration request for node {}: {}", + register_req.node_id, az_id); + + // Persist to the database and update in memory state. See comment below + // on ordering. 
+ self.persistence + .set_node_availability_zone_id(register_req.node_id, az_id) + .await?; + let node_with_az = Node::new( + register_req.node_id, + register_req.listen_http_addr, + register_req.listen_http_port, + register_req.listen_pg_addr, + register_req.listen_pg_port, + register_req.availability_zone_id, + ); + + let mut locked = self.inner.write().unwrap(); + let mut new_nodes = (*locked.nodes).clone(); + + locked.scheduler.node_upsert(&node_with_az); + new_nodes.insert(register_req.node_id, node_with_az); + + locked.nodes = Arc::new(new_nodes); + } + } + + return Ok(()); + } + RegistrationStatus::Mismatched => { + // TODO: decide if we want to allow modifying node addresses without removing and re-adding + // the node. Safest/simplest thing is to refuse it, and usually we deploy with + // a fixed address through the lifetime of a node. + tracing::warn!( + "Node {} tried to register with different address", + register_req.node_id + ); + return Err(ApiError::Conflict( + "Node is already registered with different address".to_string(), + )); + } + RegistrationStatus::New => { + // fallthrough } } @@ -4742,6 +4798,7 @@ impl Service { register_req.listen_http_port, register_req.listen_pg_addr, register_req.listen_pg_port, + register_req.availability_zone_id, ); // TODO: idempotency if the node already exists in the database diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index f8d9a51c91..70f2676245 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -109,6 +109,9 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [ # controller's attempts to notify the endpoint). ".*reconciler.*neon_local notification hook failed.*", ".*reconciler.*neon_local error.*", + # Neon local does not provide pageserver with an AZ + # TODO: remove this once neon local does so + ".*registering without specific availability zone id.*", ] From 63a0d0d0397218ed9e830a35d8939da28ad5b6ee Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 29 Aug 2024 01:39:21 +0800 Subject: [PATCH 1517/1571] fix(storage-scrubber): make retry error into warnings (#8851) We get many HTTP connect timeout errors from scrubber logs, and it turned out that the scrubber is retrying, and this is not an actual error. 
In the future, we should revisit all places where we log errors in the storage scrubber, and only error when necessary (i.e., errors that might need manual fixing).

Signed-off-by: Alex Chi Z
---
 storage_scrubber/src/lib.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs
index 112f052e07..3c21d2f8cf 100644
--- a/storage_scrubber/src/lib.rs
+++ b/storage_scrubber/src/lib.rs
@@ -36,7 +36,7 @@ use serde::{Deserialize, Serialize}; use storage_controller_client::control_api; use tokio::io::AsyncReadExt; use tokio_util::sync::CancellationToken; -use tracing::error; +use tracing::{error, warn}; use tracing_appender::non_blocking::WorkerGuard; use tracing_subscriber::{fmt, prelude::*, EnvFilter}; use utils::fs_ext;
@@ -466,7 +466,7 @@ async fn list_objects_with_retries( return Err(e) .with_context(|| format!("Failed to list objects {MAX_RETRIES} times")); } - error!( + warn!( "list_objects_v2 query failed: bucket_name={}, prefix={}, delimiter={}, error={}", s3_target.bucket_name, s3_target.prefix_in_bucket,

From 9627747d35dd9a5b7ceec099eb8f9604a95408dc Mon Sep 17 00:00:00 2001
From: Christian Schwarz
Date: Wed, 28 Aug 2024 20:31:41 +0200
Subject: [PATCH 1518/1571] bypass `PageCache` for `InMemoryLayer` + avoid `Value::deser` on L0 flush (#8537)

Part of [Epic: Bypass PageCache for user data blocks](https://github.com/neondatabase/neon/issues/7386).

# Problem

`InMemoryLayer` still uses the `PageCache` for all data stored in the `VirtualFile` that underlies the `EphemeralFile`.

# Background

Before this PR, `EphemeralFile` is a fancy (and code-bloated) buffered writer around a `VirtualFile` that supports `blob_io`. The `InMemoryLayerInner::index` stores offsets into the `EphemeralFile`. At those offsets, we find a varint length followed by the serialized `Value`. Vectored reads (`get_values_reconstruct_data`) are not in fact vectored - each `Value` that needs to be read is read sequentially. The `will_init` bit of information, which we use to early-exit `get_values_reconstruct_data` for a given key, is stored in the serialized `Value`, meaning we have to read & deserialize the `Value` from the `EphemeralFile`. The L0 flushing **also** needs to re-determine the `will_init` bit of information, by deserializing each value during L0 flush.

# Changes

1. Store the value length and `will_init` information in the `InMemoryLayer::index`. The `EphemeralFile` thus only needs to store the values.
2. For `get_values_reconstruct_data`:
   - Use the in-memory `index` to figure out which values need to be read. Having the `will_init` stored in the index enables us to do that.
   - View the EphemeralFile as a byte array of "DIO chunks", each 512 bytes in size (adjustable constant). A "DIO chunk" is the minimal unit that we can read under direct IO.
   - Figure out which chunks need to be read to retrieve the serialized bytes for the values we need to read.
   - Coalesce chunk reads such that each DIO chunk is only read once to serve all value reads that need data from that chunk.
   - Merge adjacent chunk reads into larger `EphemeralFile::read_exact_at_eof_ok` calls of up to 128k (adjustable constant). (A standalone sketch of this read-planning step follows after this list.)
3. The new `EphemeralFile::read_exact_at_eof_ok` fills the IO buffer from the underlying VirtualFile and/or its in-memory buffer.
4. The L0 flush code is changed to use the `index` directly instead of going through `blob_io`.
5. We can remove the `ephemeral_file::page_caching` construct now.
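To make the chunking-and-coalescing step in change 2 concrete, here is a standalone sketch of the read-planning idea. It is an illustration only, not the actual `vectored_dio_read` implementation; the constants mirror the "512 bytes" and "128k" adjustable constants mentioned above, and all names are made up:

```rust
/// Illustration only: plan physical reads for a set of logical value reads.
const DIO_CHUNK_SIZE: u64 = 512; // minimal unit readable under direct IO
const MAX_PHYSICAL_READ: u64 = 128 * 1024; // cap for a merged read

#[derive(Debug, PartialEq)]
struct PhysicalRead {
    offset: u64, // chunk-aligned start offset in the ephemeral file
    len: u64,    // multiple of DIO_CHUNK_SIZE
}

/// `logical` is a list of `(offset, len)` value reads taken from the index.
fn plan_reads(logical: &[(u64, u64)]) -> Vec<PhysicalRead> {
    // 1. Collect the set of DIO chunks touched by any logical read.
    let mut chunks: Vec<u64> = logical
        .iter()
        .filter(|(_, len)| *len > 0)
        .flat_map(|&(off, len)| (off / DIO_CHUNK_SIZE)..=((off + len - 1) / DIO_CHUNK_SIZE))
        .collect();
    chunks.sort_unstable();
    chunks.dedup(); // each chunk is read at most once

    // 2. Merge runs of adjacent chunks into larger reads, up to MAX_PHYSICAL_READ.
    let mut reads: Vec<PhysicalRead> = Vec::new();
    for chunk in chunks {
        let offset = chunk * DIO_CHUNK_SIZE;
        let can_merge = reads.last().map_or(false, |prev| {
            prev.offset + prev.len == offset && prev.len + DIO_CHUNK_SIZE <= MAX_PHYSICAL_READ
        });
        if can_merge {
            reads.last_mut().unwrap().len += DIO_CHUNK_SIZE;
        } else {
            reads.push(PhysicalRead { offset, len: DIO_CHUNK_SIZE });
        }
    }
    reads
}

fn main() {
    // Two small values in chunk 0 plus one value spanning the chunk 0/1 boundary
    // collapse into a single physical read of two adjacent chunks.
    let reads = plan_reads(&[(10, 100), (200, 50), (500, 40)]);
    assert_eq!(reads, vec![PhysicalRead { offset: 0, len: 1024 }]);
    println!("{reads:?}");
}
```

Each planned read is executed once, and the bytes for every logical read that falls inside it are copied out of the shared buffer; the real code additionally splits each read against the not-yet-flushed tail that still lives in the buffered writer's memory (change 3).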
The `get_values_reconstruct_data` changes may seem like a bit of overkill, but they are necessary so that we issue an equivalent number of read system calls as before this PR. Before this PR, it was highly likely that even if the first PageCache access was a miss, the remaining reads within the same `get_values_reconstruct_data` call from the same `EphemeralFile` page were a hit.

The "DIO chunk" stuff is truly unnecessary for page cache bypass, but, since we're working on [direct IO](https://github.com/neondatabase/neon/issues/8130) and https://github.com/neondatabase/neon/issues/8719 specifically, we need to do _something_ like this anyways in the near future.

# Alternative Design

The original plan was to use the `vectored_blob_io` code, but it relies on the invariant of Delta&Image layers that `index order == values order`. Further, the `vectored_blob_io` code's strategy for merging IOs is limited to adjacent reads. However, with direct IO, there is another level of merging that should be done: specifically, if multiple reads map to the same "DIO chunk" (= alignment-requirement-sized and -aligned region of the file), then it's "free" to read the chunk into an IO buffer and serve the two reads from that buffer. => https://github.com/neondatabase/neon/issues/8719

# Testing / Performance

Correctness of the IO merging code is ensured by unit tests. Additionally, minimal tests are added for the `EphemeralFile` implementation and the bit-packed `InMemoryLayerIndexValue`.

Performance testing results are presented below. All perf testing was done on my M2 MacBook Pro, running a Linux VM. It's a release build without `--features testing`. We see a definitive improvement in the ingest performance microbenchmark and in an ad-hoc microbenchmark for getpage against `InMemoryLayer`.

```
baseline: commit 7c74112b2a6e23c07bfd9cc62c240cd6bbdd3bd9 origin/main
HEAD: ef1c55c52e0c313be4d302794d29534591f9cdc5
```

```
cargo bench --bench bench_ingest -- 'ingest 128MB/100b seq, no delta'

baseline

ingest-small-values/ingest 128MB/100b seq, no delta
                        time:   [483.50 ms 498.73 ms 522.53 ms]
                        thrpt:  [244.96 MiB/s 256.65 MiB/s 264.73 MiB/s]

HEAD

ingest-small-values/ingest 128MB/100b seq, no delta
                        time:   [479.22 ms 482.92 ms 487.35 ms]
                        thrpt:  [262.64 MiB/s 265.06 MiB/s 267.10 MiB/s]
```
We don't have a micro-benchmark for `InMemoryLayer`, and it's quite cumbersome to add one, so I did manual testing in `neon_local`.
``` ./target/release/neon_local stop rm -rf .neon ./target/release/neon_local init ./target/release/neon_local start ./target/release/neon_local tenant create --set-default ./target/release/neon_local endpoint create foo ./target/release/neon_local endpoint start foo psql 'postgresql://cloud_admin@127.0.0.1:55432/postgres' psql (13.16 (Debian 13.16-0+deb11u1), server 15.7) CREATE TABLE wal_test ( id SERIAL PRIMARY KEY, data TEXT ); DO $$ DECLARE i INTEGER := 1; BEGIN WHILE i <= 500000 LOOP INSERT INTO wal_test (data) VALUES ('data'); i := i + 1; END LOOP; END $$; -- => result is one L0 from initdb and one 137M-sized ephemeral-2 DO $$ DECLARE i INTEGER := 1; random_id INTEGER; random_record wal_test%ROWTYPE; start_time TIMESTAMP := clock_timestamp(); selects_completed INTEGER := 0; min_id INTEGER := 1; -- Minimum ID value max_id INTEGER := 100000; -- Maximum ID value, based on your insert range iters INTEGER := 100000000; -- Number of iterations to run BEGIN WHILE i <= iters LOOP -- Generate a random ID within the known range random_id := min_id + floor(random() * (max_id - min_id + 1))::int; -- Select the row with the generated random ID SELECT * INTO random_record FROM wal_test WHERE id = random_id; -- Increment the select counter selects_completed := selects_completed + 1; -- Check if a second has passed IF EXTRACT(EPOCH FROM clock_timestamp() - start_time) >= 1 THEN -- Print the number of selects completed in the last second RAISE NOTICE 'Selects completed in last second: %', selects_completed; -- Reset counters for the next second selects_completed := 0; start_time := clock_timestamp(); END IF; -- Increment the loop counter i := i + 1; END LOOP; END $$; ./target/release/neon_local stop baseline: commit 7c74112b2a6e23c07bfd9cc62c240cd6bbdd3bd9 origin/main NOTICE: Selects completed in last second: 1864 NOTICE: Selects completed in last second: 1850 NOTICE: Selects completed in last second: 1851 NOTICE: Selects completed in last second: 1918 NOTICE: Selects completed in last second: 1911 NOTICE: Selects completed in last second: 1879 NOTICE: Selects completed in last second: 1858 NOTICE: Selects completed in last second: 1827 NOTICE: Selects completed in last second: 1933 ours NOTICE: Selects completed in last second: 1915 NOTICE: Selects completed in last second: 1928 NOTICE: Selects completed in last second: 1913 NOTICE: Selects completed in last second: 1932 NOTICE: Selects completed in last second: 1846 NOTICE: Selects completed in last second: 1955 NOTICE: Selects completed in last second: 1991 NOTICE: Selects completed in last second: 1973 ``` NB: the ephemeral file sizes differ by ca 1MiB, ours being 1MiB smaller.
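As an aside to the testing notes above: the bit-packed `InMemoryLayerIndexValue` (called `IndexEntry` in the code later in this patch) packs `will_init`, the value length, and the file offset into a single `u64` so the in-memory index stays small. Below is a minimal sketch of such a packing scheme, using plain shifts instead of the `bit_field` crate, with field widths derived from the `0x0fff_ffff` blob-length limit (1 + 28 + 35 bits); it is an illustration, not the exact production layout:

```rust
const LEN_BITS: u32 = 28; // enough for the 0x0fff_ffff max blob length
const POS_BITS: u32 = 64 - 1 - LEN_BITS; // 35 bits for the file offset

#[derive(Debug, Clone, Copy, PartialEq)]
struct PackedEntry(u64);

impl PackedEntry {
    fn new(will_init: bool, len: u64, pos: u64) -> Option<Self> {
        // Reject values that don't fit; the real code turns these into errors
        // and additionally validates the checkpoint distance up front.
        if len >= (1 << LEN_BITS) || pos >= (1 << POS_BITS) {
            return None;
        }
        Some(Self((will_init as u64) | (len << 1) | (pos << (1 + LEN_BITS))))
    }

    fn will_init(self) -> bool {
        self.0 & 1 != 0
    }
    fn len(self) -> u64 {
        (self.0 >> 1) & ((1 << LEN_BITS) - 1)
    }
    fn pos(self) -> u64 {
        self.0 >> (1 + LEN_BITS)
    }
}

fn main() {
    let e = PackedEntry::new(true, 8192, 123_456_789).unwrap();
    assert!(e.will_init());
    assert_eq!(e.len(), 8192);
    assert_eq!(e.pos(), 123_456_789);
    // Offsets beyond what the pos field can hold are not representable, which
    // is why the checkpoint distance is validated against this limit at startup.
    assert!(PackedEntry::new(false, 10, 1 << POS_BITS).is_none());
}
```

The real `IndexEntry::new` in the diff below additionally checks that `checkpoint_distance` plus the maximum blob length stays within the representable offset range (`validate_checkpoint_distance`), so a too-large checkpoint distance is rejected at startup rather than at write time.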
# Rollout This PR changes the code in-place and is not gated by a feature flag. --- Cargo.lock | 14 + Cargo.toml | 2 + pageserver/Cargo.toml | 2 + pageserver/benches/bench_ingest.rs | 4 +- pageserver/src/assert_u64_eq_usize.rs | 39 + pageserver/src/config.rs | 10 + pageserver/src/lib.rs | 1 + pageserver/src/tenant.rs | 6 + pageserver/src/tenant/blob_io.rs | 4 +- pageserver/src/tenant/block_io.rs | 23 - pageserver/src/tenant/ephemeral_file.rs | 430 +++++--- .../src/tenant/ephemeral_file/page_caching.rs | 153 --- .../ephemeral_file/zero_padded_read_write.rs | 145 --- .../zero_padded_read_write/zero_padded.rs | 110 -- .../src/tenant/storage_layer/delta_layer.rs | 6 +- .../tenant/storage_layer/inmemory_layer.rs | 509 ++++++++-- .../inmemory_layer/vectored_dio_read.rs | 937 ++++++++++++++++++ pageserver/src/tenant/timeline.rs | 6 +- .../virtual_file/owned_buffers_io/write.rs | 1 + .../regress/test_pageserver_layer_rolling.py | 9 +- 20 files changed, 1757 insertions(+), 654 deletions(-) create mode 100644 pageserver/src/assert_u64_eq_usize.rs delete mode 100644 pageserver/src/tenant/ephemeral_file/page_caching.rs delete mode 100644 pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs delete mode 100644 pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs create mode 100644 pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs diff --git a/Cargo.lock b/Cargo.lock index 441ca1ff86..c514625518 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -936,6 +936,12 @@ dependencies = [ "which", ] +[[package]] +name = "bit_field" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc827186963e592360843fb5ba4b973e145841266c1357f7180c43526f2e5b61" + [[package]] name = "bitflags" version = "1.3.2" @@ -3683,6 +3689,7 @@ dependencies = [ "async-compression", "async-stream", "async-trait", + "bit_field", "byteorder", "bytes", "camino", @@ -3732,6 +3739,7 @@ dependencies = [ "reqwest 0.12.4", "rpds", "scopeguard", + "send-future", "serde", "serde_json", "serde_path_to_error", @@ -5455,6 +5463,12 @@ version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed" +[[package]] +name = "send-future" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "224e328af6e080cddbab3c770b1cf50f0351ba0577091ef2410c3951d835ff87" + [[package]] name = "sentry" version = "0.32.3" diff --git a/Cargo.toml b/Cargo.toml index e038c0b4ff..7bd9a26394 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -65,6 +65,7 @@ axum = { version = "0.6.20", features = ["ws"] } base64 = "0.13.0" bincode = "1.3" bindgen = "0.65" +bit_field = "0.10.2" bstr = "1.0" byteorder = "1.4" bytes = "1.0" @@ -145,6 +146,7 @@ rustls-split = "0.3" scopeguard = "1.1" sysinfo = "0.29.2" sd-notify = "0.4.1" +send-future = "0.1.0" sentry = { version = "0.32", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] } serde = { version = "1.0", features = ["derive"] } serde_json = "1" diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 0e748ee3db..85c5e24afc 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -16,6 +16,7 @@ arc-swap.workspace = true async-compression.workspace = true async-stream.workspace = true async-trait.workspace = true +bit_field.workspace = true byteorder.workspace = true bytes.workspace = true camino.workspace = true @@ -52,6 +53,7 @@ rand.workspace = true 
range-set-blaze = { version = "0.1.16", features = ["alloc"] } regex.workspace = true scopeguard.workspace = true +send-future.workspace = true serde.workspace = true serde_json = { workspace = true, features = ["raw_value"] } serde_path_to_error.workspace = true diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index f450f46efa..1be4391d81 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -103,13 +103,13 @@ async fn ingest( batch.push((key.to_compact(), lsn, data_ser_size, data.clone())); if batch.len() >= BATCH_SIZE { let this_batch = std::mem::take(&mut batch); - let serialized = SerializedBatch::from_values(this_batch); + let serialized = SerializedBatch::from_values(this_batch).unwrap(); layer.put_batch(serialized, &ctx).await?; } } if !batch.is_empty() { let this_batch = std::mem::take(&mut batch); - let serialized = SerializedBatch::from_values(this_batch); + let serialized = SerializedBatch::from_values(this_batch).unwrap(); layer.put_batch(serialized, &ctx).await?; } layer.freeze(lsn + 1).await; diff --git a/pageserver/src/assert_u64_eq_usize.rs b/pageserver/src/assert_u64_eq_usize.rs new file mode 100644 index 0000000000..66ca7fd057 --- /dev/null +++ b/pageserver/src/assert_u64_eq_usize.rs @@ -0,0 +1,39 @@ +//! `u64`` and `usize`` aren't guaranteed to be identical in Rust, but life is much simpler if that's the case. + +pub(crate) const _ASSERT_U64_EQ_USIZE: () = { + if std::mem::size_of::() != std::mem::size_of::() { + panic!("the traits defined in this module assume that usize and u64 can be converted to each other without loss of information"); + } +}; + +pub(crate) trait U64IsUsize { + fn into_usize(self) -> usize; +} + +impl U64IsUsize for u64 { + #[inline(always)] + fn into_usize(self) -> usize { + #[allow(clippy::let_unit_value)] + let _ = _ASSERT_U64_EQ_USIZE; + self as usize + } +} + +pub(crate) trait UsizeIsU64 { + fn into_u64(self) -> u64; +} + +impl UsizeIsU64 for usize { + #[inline(always)] + fn into_u64(self) -> u64 { + #[allow(clippy::let_unit_value)] + let _ = _ASSERT_U64_EQ_USIZE; + self as u64 + } +} + +pub const fn u64_to_usize(x: u64) -> usize { + #[allow(clippy::let_unit_value)] + let _ = _ASSERT_U64_EQ_USIZE; + x as usize +} diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index ae473bcc5f..994075bef6 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -31,6 +31,7 @@ use utils::{ use crate::l0_flush::L0FlushConfig; use crate::tenant::config::TenantConfOpt; +use crate::tenant::storage_layer::inmemory_layer::IndexEntry; use crate::tenant::timeline::compaction::CompactL0Phase1ValueAccess; use crate::tenant::vectored_blob_io::MaxVectoredReadBytes; use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; @@ -1020,6 +1021,15 @@ impl PageServerConf { conf.default_tenant_conf = t_conf.merge(TenantConf::default()); + IndexEntry::validate_checkpoint_distance(conf.default_tenant_conf.checkpoint_distance) + .map_err(|msg| anyhow::anyhow!("{msg}")) + .with_context(|| { + format!( + "effective checkpoint distance is unsupported: {}", + conf.default_tenant_conf.checkpoint_distance + ) + })?; + Ok(conf) } diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index dbfc9f3544..7a9cf495c7 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -16,6 +16,7 @@ pub mod l0_flush; use futures::{stream::FuturesUnordered, StreamExt}; pub use pageserver_api::keyspace; use tokio_util::sync::CancellationToken; +mod assert_u64_eq_usize; pub mod 
aux_file; pub mod metrics; pub mod page_cache; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 0364d521b6..60ab242ffc 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -877,6 +877,12 @@ impl Tenant { }); }; + // TODO: should also be rejecting tenant conf changes that violate this check. + if let Err(e) = crate::tenant::storage_layer::inmemory_layer::IndexEntry::validate_checkpoint_distance(tenant_clone.get_checkpoint_distance()) { + make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error); + return Ok(()); + } + let mut init_order = init_order; // take the completion because initial tenant loading will complete when all of // these tasks complete. diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index a245c99a88..dd70f6bbff 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -148,7 +148,7 @@ pub(super) const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0; /// The maximum size of blobs we support. The highest few bits /// are reserved for compression and other further uses. -const MAX_SUPPORTED_LEN: usize = 0x0fff_ffff; +pub(crate) const MAX_SUPPORTED_BLOB_LEN: usize = 0x0fff_ffff; pub(super) const BYTE_UNCOMPRESSED: u8 = 0x80; pub(super) const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10; @@ -326,7 +326,7 @@ impl BlobWriter { (self.write_all(io_buf.slice_len(), ctx).await, srcbuf) } else { // Write a 4-byte length header - if len > MAX_SUPPORTED_LEN { + if len > MAX_SUPPORTED_BLOB_LEN { return ( ( io_buf.slice_len(), diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index 601b095155..3afa3a86b9 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -2,7 +2,6 @@ //! Low-level Block-oriented I/O functions //! -use super::ephemeral_file::EphemeralFile; use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner}; use crate::context::RequestContext; use crate::page_cache::{self, FileId, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ}; @@ -81,9 +80,7 @@ impl<'a> Deref for BlockLease<'a> { /// Unlike traits, we also support the read function to be async though. 
pub(crate) enum BlockReaderRef<'a> { FileBlockReader(&'a FileBlockReader<'a>), - EphemeralFile(&'a EphemeralFile), Adapter(Adapter<&'a DeltaLayerInner>), - Slice(&'a [u8]), #[cfg(test)] TestDisk(&'a super::disk_btree::tests::TestDisk), #[cfg(test)] @@ -100,9 +97,7 @@ impl<'a> BlockReaderRef<'a> { use BlockReaderRef::*; match self { FileBlockReader(r) => r.read_blk(blknum, ctx).await, - EphemeralFile(r) => r.read_blk(blknum, ctx).await, Adapter(r) => r.read_blk(blknum, ctx).await, - Slice(s) => Self::read_blk_slice(s, blknum), #[cfg(test)] TestDisk(r) => r.read_blk(blknum), #[cfg(test)] @@ -111,24 +106,6 @@ impl<'a> BlockReaderRef<'a> { } } -impl<'a> BlockReaderRef<'a> { - fn read_blk_slice(slice: &[u8], blknum: u32) -> std::io::Result { - let start = (blknum as usize).checked_mul(PAGE_SZ).unwrap(); - let end = start.checked_add(PAGE_SZ).unwrap(); - if end > slice.len() { - return Err(std::io::Error::new( - std::io::ErrorKind::UnexpectedEof, - format!("slice too short, len={} end={}", slice.len(), end), - )); - } - let slice = &slice[start..end]; - let page_sized: &[u8; PAGE_SZ] = slice - .try_into() - .expect("we add PAGE_SZ to start, so the slice must have PAGE_SZ"); - Ok(BlockLease::Slice(page_sized)) - } -} - /// /// A "cursor" for efficiently reading multiple pages from a BlockReader /// diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 44f0fc7ab1..5324e1807d 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -1,13 +1,21 @@ //! Implementation of append-only file data structure //! used to keep in-memory layers spilled on disk. +use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64}; use crate::config::PageServerConf; use crate::context::RequestContext; use crate::page_cache; -use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader}; -use crate::virtual_file::{self, VirtualFile}; +use crate::tenant::storage_layer::inmemory_layer::vectored_dio_read::File; +use crate::virtual_file::owned_buffers_io::slice::SliceMutExt; +use crate::virtual_file::owned_buffers_io::util::size_tracking_writer; +use crate::virtual_file::owned_buffers_io::write::Buffer; +use crate::virtual_file::{self, owned_buffers_io, VirtualFile}; +use bytes::BytesMut; use camino::Utf8PathBuf; +use num_traits::Num; use pageserver_api::shard::TenantShardId; +use tokio_epoll_uring::{BoundedBuf, Slice}; +use tracing::error; use std::io; use std::sync::atomic::AtomicU64; @@ -16,12 +24,17 @@ use utils::id::TimelineId; pub struct EphemeralFile { _tenant_shard_id: TenantShardId, _timeline_id: TimelineId, - - rw: page_caching::RW, + page_cache_file_id: page_cache::FileId, + bytes_written: u64, + buffered_writer: owned_buffers_io::write::BufferedWriter< + BytesMut, + size_tracking_writer::Writer, + >, + /// Gate guard is held on as long as we need to do operations in the path (delete on drop) + _gate_guard: utils::sync::gate::GateGuard, } -mod page_caching; -mod zero_padded_read_write; +const TAIL_SZ: usize = 64 * 1024; impl EphemeralFile { pub async fn create( @@ -51,75 +64,178 @@ impl EphemeralFile { ) .await?; + let page_cache_file_id = page_cache::next_file_id(); // XXX get rid, we're not page-caching anymore + Ok(EphemeralFile { _tenant_shard_id: tenant_shard_id, _timeline_id: timeline_id, - rw: page_caching::RW::new(file, gate_guard), + page_cache_file_id, + bytes_written: 0, + buffered_writer: owned_buffers_io::write::BufferedWriter::new( + size_tracking_writer::Writer::new(file), + 
BytesMut::with_capacity(TAIL_SZ), + ), + _gate_guard: gate_guard, }) } +} +impl Drop for EphemeralFile { + fn drop(&mut self) { + // unlink the file + // we are clear to do this, because we have entered a gate + let path = &self.buffered_writer.as_inner().as_inner().path; + let res = std::fs::remove_file(path); + if let Err(e) = res { + if e.kind() != std::io::ErrorKind::NotFound { + // just never log the not found errors, we cannot do anything for them; on detach + // the tenant directory is already gone. + // + // not found files might also be related to https://github.com/neondatabase/neon/issues/2442 + error!("could not remove ephemeral file '{path}': {e}"); + } + } + } +} + +impl EphemeralFile { pub(crate) fn len(&self) -> u64 { - self.rw.bytes_written() + self.bytes_written } pub(crate) fn page_cache_file_id(&self) -> page_cache::FileId { - self.rw.page_cache_file_id() + self.page_cache_file_id } - /// See [`self::page_caching::RW::load_to_vec`]. pub(crate) async fn load_to_vec(&self, ctx: &RequestContext) -> Result, io::Error> { - self.rw.load_to_vec(ctx).await - } - - pub(crate) async fn read_blk( - &self, - blknum: u32, - ctx: &RequestContext, - ) -> Result { - self.rw.read_blk(blknum, ctx).await - } - - #[cfg(test)] - // This is a test helper: outside of tests, we are always written to via a pre-serialized batch. - pub(crate) async fn write_blob( - &mut self, - srcbuf: &[u8], - ctx: &RequestContext, - ) -> Result { - let pos = self.rw.bytes_written(); - - let mut len_bytes = std::io::Cursor::new(Vec::new()); - crate::tenant::storage_layer::inmemory_layer::SerializedBatch::write_blob_length( - srcbuf.len(), - &mut len_bytes, - ); - let len_bytes = len_bytes.into_inner(); - - // Write the length field - self.rw.write_all_borrowed(&len_bytes, ctx).await?; - - // Write the payload - self.rw.write_all_borrowed(srcbuf, ctx).await?; - - Ok(pos) + let size = self.len().into_usize(); + let vec = Vec::with_capacity(size); + let (slice, nread) = self.read_exact_at_eof_ok(0, vec.slice_full(), ctx).await?; + assert_eq!(nread, size); + let vec = slice.into_inner(); + assert_eq!(vec.len(), nread); + assert_eq!(vec.capacity(), size, "we shouldn't be reallocating"); + Ok(vec) } /// Returns the offset at which the first byte of the input was written, for use /// in constructing indices over the written value. + /// + /// Panics if the write is short because there's no way we can recover from that. + /// TODO: make upstack handle this as an error. 
pub(crate) async fn write_raw( &mut self, srcbuf: &[u8], ctx: &RequestContext, - ) -> Result { - let pos = self.rw.bytes_written(); + ) -> std::io::Result { + let pos = self.bytes_written; + + let new_bytes_written = pos.checked_add(srcbuf.len().into_u64()).ok_or_else(|| { + std::io::Error::new( + std::io::ErrorKind::Other, + format!( + "write would grow EphemeralFile beyond u64::MAX: len={pos} writen={srcbuf_len}", + srcbuf_len = srcbuf.len(), + ), + ) + })?; // Write the payload - self.rw.write_all_borrowed(srcbuf, ctx).await?; + let nwritten = self + .buffered_writer + .write_buffered_borrowed(srcbuf, ctx) + .await?; + assert_eq!( + nwritten, + srcbuf.len(), + "buffered writer has no short writes" + ); + + self.bytes_written = new_bytes_written; Ok(pos) } } +impl super::storage_layer::inmemory_layer::vectored_dio_read::File for EphemeralFile { + async fn read_exact_at_eof_ok<'a, 'b, B: tokio_epoll_uring::IoBufMut + Send>( + &'b self, + start: u64, + dst: tokio_epoll_uring::Slice, + ctx: &'a RequestContext, + ) -> std::io::Result<(tokio_epoll_uring::Slice, usize)> { + let file_size_tracking_writer = self.buffered_writer.as_inner(); + let flushed_offset = file_size_tracking_writer.bytes_written(); + + let buffer = self.buffered_writer.inspect_buffer(); + let buffered = &buffer[0..buffer.pending()]; + + let dst_cap = dst.bytes_total().into_u64(); + let end = { + // saturating_add is correct here because the max file size is u64::MAX, so, + // if start + dst.len() > u64::MAX, then we know it will be a short read + let mut end: u64 = start.saturating_add(dst_cap); + if end > self.bytes_written { + end = self.bytes_written; + } + end + }; + + // inclusive, exclusive + #[derive(Debug)] + struct Range(N, N); + impl Range { + fn len(&self) -> N { + if self.0 > self.1 { + N::zero() + } else { + self.1 - self.0 + } + } + } + let written_range = Range(start, std::cmp::min(end, flushed_offset)); + let buffered_range = Range(std::cmp::max(start, flushed_offset), end); + + let dst = if written_range.len() > 0 { + let file: &VirtualFile = file_size_tracking_writer.as_inner(); + let bounds = dst.bounds(); + let slice = file + .read_exact_at(dst.slice(0..written_range.len().into_usize()), start, ctx) + .await?; + Slice::from_buf_bounds(Slice::into_inner(slice), bounds) + } else { + dst + }; + + let dst = if buffered_range.len() > 0 { + let offset_in_buffer = buffered_range + .0 + .checked_sub(flushed_offset) + .unwrap() + .into_usize(); + let to_copy = + &buffered[offset_in_buffer..(offset_in_buffer + buffered_range.len().into_usize())]; + let bounds = dst.bounds(); + let mut view = dst.slice({ + let start = written_range.len().into_usize(); + let end = start + .checked_add(buffered_range.len().into_usize()) + .unwrap(); + start..end + }); + view.as_mut_rust_slice_full_zeroed() + .copy_from_slice(to_copy); + Slice::from_buf_bounds(Slice::into_inner(view), bounds) + } else { + dst + }; + + // TODO: in debug mode, randomize the remaining bytes in `dst` to catch bugs + + Ok((dst, (end - start).into_usize())) + } +} + /// Does the given filename look like an ephemeral file? 
pub fn is_ephemeral_file(filename: &str) -> bool { if let Some(rest) = filename.strip_prefix("ephemeral-") { @@ -129,19 +245,13 @@ pub fn is_ephemeral_file(filename: &str) -> bool { } } -impl BlockReader for EphemeralFile { - fn block_cursor(&self) -> super::block_io::BlockCursor<'_> { - BlockCursor::new(super::block_io::BlockReaderRef::EphemeralFile(self)) - } -} - #[cfg(test)] mod tests { + use rand::Rng; + use super::*; use crate::context::DownloadBehavior; use crate::task_mgr::TaskKind; - use crate::tenant::block_io::BlockReaderRef; - use rand::{thread_rng, RngCore}; use std::fs; use std::str::FromStr; @@ -172,69 +282,6 @@ mod tests { Ok((conf, tenant_shard_id, timeline_id, ctx)) } - #[tokio::test] - async fn test_ephemeral_blobs() -> Result<(), io::Error> { - let (conf, tenant_id, timeline_id, ctx) = harness("ephemeral_blobs")?; - - let gate = utils::sync::gate::Gate::default(); - - let entered = gate.enter().unwrap(); - - let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, entered, &ctx).await?; - - let pos_foo = file.write_blob(b"foo", &ctx).await?; - assert_eq!( - b"foo", - file.block_cursor() - .read_blob(pos_foo, &ctx) - .await? - .as_slice() - ); - let pos_bar = file.write_blob(b"bar", &ctx).await?; - assert_eq!( - b"foo", - file.block_cursor() - .read_blob(pos_foo, &ctx) - .await? - .as_slice() - ); - assert_eq!( - b"bar", - file.block_cursor() - .read_blob(pos_bar, &ctx) - .await? - .as_slice() - ); - - let mut blobs = Vec::new(); - for i in 0..10000 { - let data = Vec::from(format!("blob{}", i).as_bytes()); - let pos = file.write_blob(&data, &ctx).await?; - blobs.push((pos, data)); - } - // also test with a large blobs - for i in 0..100 { - let data = format!("blob{}", i).as_bytes().repeat(100); - let pos = file.write_blob(&data, &ctx).await?; - blobs.push((pos, data)); - } - - let cursor = BlockCursor::new(BlockReaderRef::EphemeralFile(&file)); - for (pos, expected) in blobs { - let actual = cursor.read_blob(pos, &ctx).await?; - assert_eq!(actual, expected); - } - - // Test a large blob that spans multiple pages - let mut large_data = vec![0; 20000]; - thread_rng().fill_bytes(&mut large_data); - let pos_large = file.write_blob(&large_data, &ctx).await?; - let result = file.block_cursor().read_blob(pos_large, &ctx).await?; - assert_eq!(result, large_data); - - Ok(()) - } - #[tokio::test] async fn ephemeral_file_holds_gate_open() { const FOREVER: std::time::Duration = std::time::Duration::from_secs(5); @@ -268,4 +315,151 @@ mod tests { .expect("closing completes right away") .expect("closing does not panic"); } + + #[tokio::test] + async fn test_ephemeral_file_basics() { + let (conf, tenant_id, timeline_id, ctx) = harness("test_ephemeral_file_basics").unwrap(); + + let gate = utils::sync::gate::Gate::default(); + + let mut file = + EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx) + .await + .unwrap(); + + let cap = file.buffered_writer.inspect_buffer().capacity(); + + let write_nbytes = cap + cap / 2; + + let content: Vec = rand::thread_rng() + .sample_iter(rand::distributions::Standard) + .take(write_nbytes) + .collect(); + + let mut value_offsets = Vec::new(); + for i in 0..write_nbytes { + let off = file.write_raw(&content[i..i + 1], &ctx).await.unwrap(); + value_offsets.push(off); + } + + assert!(file.len() as usize == write_nbytes); + for i in 0..write_nbytes { + assert_eq!(value_offsets[i], i.into_u64()); + let buf = Vec::with_capacity(1); + let (buf_slice, nread) = file + .read_exact_at_eof_ok(i.into_u64(), 
buf.slice_full(), &ctx) + .await + .unwrap(); + let buf = buf_slice.into_inner(); + assert_eq!(nread, 1); + assert_eq!(&buf, &content[i..i + 1]); + } + + let file_contents = + std::fs::read(&file.buffered_writer.as_inner().as_inner().path).unwrap(); + assert_eq!(file_contents, &content[0..cap]); + + let buffer_contents = file.buffered_writer.inspect_buffer(); + assert_eq!(buffer_contents, &content[cap..write_nbytes]); + } + + #[tokio::test] + async fn test_flushes_do_happen() { + let (conf, tenant_id, timeline_id, ctx) = harness("test_flushes_do_happen").unwrap(); + + let gate = utils::sync::gate::Gate::default(); + + let mut file = + EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx) + .await + .unwrap(); + + let cap = file.buffered_writer.inspect_buffer().capacity(); + + let content: Vec = rand::thread_rng() + .sample_iter(rand::distributions::Standard) + .take(cap + cap / 2) + .collect(); + + file.write_raw(&content, &ctx).await.unwrap(); + + // assert the state is as this test expects it to be + assert_eq!( + &file.load_to_vec(&ctx).await.unwrap(), + &content[0..cap + cap / 2] + ); + let md = file + .buffered_writer + .as_inner() + .as_inner() + .path + .metadata() + .unwrap(); + assert_eq!( + md.len(), + cap.into_u64(), + "buffered writer does one write if we write 1.5x buffer capacity" + ); + assert_eq!( + &file.buffered_writer.inspect_buffer()[0..cap / 2], + &content[cap..cap + cap / 2] + ); + } + + #[tokio::test] + async fn test_read_split_across_file_and_buffer() { + // This test exercises the logic on the read path that splits the logical read + // into a read from the flushed part (= the file) and a copy from the buffered writer's buffer. + // + // This test build on the assertions in test_flushes_do_happen + + let (conf, tenant_id, timeline_id, ctx) = + harness("test_read_split_across_file_and_buffer").unwrap(); + + let gate = utils::sync::gate::Gate::default(); + + let mut file = + EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx) + .await + .unwrap(); + + let cap = file.buffered_writer.inspect_buffer().capacity(); + + let content: Vec = rand::thread_rng() + .sample_iter(rand::distributions::Standard) + .take(cap + cap / 2) + .collect(); + + file.write_raw(&content, &ctx).await.unwrap(); + + let test_read = |start: usize, len: usize| { + let file = &file; + let ctx = &ctx; + let content = &content; + async move { + let (buf, nread) = file + .read_exact_at_eof_ok( + start.into_u64(), + Vec::with_capacity(len).slice_full(), + ctx, + ) + .await + .unwrap(); + assert_eq!(nread, len); + assert_eq!(&buf.into_inner(), &content[start..(start + len)]); + } + }; + + // completely within the file range + assert!(20 < cap, "test assumption"); + test_read(10, 10).await; + // border onto edge of file + test_read(cap - 10, 10).await; + // read across file and buffer + test_read(cap - 10, 20).await; + // stay from start of buffer + test_read(cap, 10).await; + // completely within buffer + test_read(cap + 10, 10).await; + } } diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs deleted file mode 100644 index 48926354f1..0000000000 --- a/pageserver/src/tenant/ephemeral_file/page_caching.rs +++ /dev/null @@ -1,153 +0,0 @@ -//! Wrapper around [`super::zero_padded_read_write::RW`] that uses the -//! [`crate::page_cache`] to serve reads that need to go to the underlying [`VirtualFile`]. -//! -//! 
Subject to removal in - -use crate::context::RequestContext; -use crate::page_cache::{self, PAGE_SZ}; -use crate::tenant::block_io::BlockLease; -use crate::virtual_file::owned_buffers_io::util::size_tracking_writer; -use crate::virtual_file::VirtualFile; - -use std::io::{self}; -use tokio_epoll_uring::BoundedBuf; -use tracing::*; - -use super::zero_padded_read_write; - -/// See module-level comment. -pub struct RW { - page_cache_file_id: page_cache::FileId, - rw: super::zero_padded_read_write::RW>, - /// Gate guard is held on as long as we need to do operations in the path (delete on drop). - _gate_guard: utils::sync::gate::GateGuard, -} - -impl RW { - pub fn new(file: VirtualFile, _gate_guard: utils::sync::gate::GateGuard) -> Self { - let page_cache_file_id = page_cache::next_file_id(); - Self { - page_cache_file_id, - rw: super::zero_padded_read_write::RW::new(size_tracking_writer::Writer::new(file)), - _gate_guard, - } - } - - pub fn page_cache_file_id(&self) -> page_cache::FileId { - self.page_cache_file_id - } - - pub(crate) async fn write_all_borrowed( - &mut self, - srcbuf: &[u8], - ctx: &RequestContext, - ) -> Result { - // It doesn't make sense to proactively fill the page cache on the Pageserver write path - // because Compute is unlikely to access recently written data. - self.rw.write_all_borrowed(srcbuf, ctx).await - } - - pub(crate) fn bytes_written(&self) -> u64 { - self.rw.bytes_written() - } - - /// Load all blocks that can be read via [`Self::read_blk`] into a contiguous memory buffer. - /// - /// This includes the blocks that aren't yet flushed to disk by the internal buffered writer. - /// The last block is zero-padded to [`PAGE_SZ`], so, the returned buffer is always a multiple of [`PAGE_SZ`]. - pub(super) async fn load_to_vec(&self, ctx: &RequestContext) -> Result, io::Error> { - // round up to the next PAGE_SZ multiple, required by blob_io - let size = { - let s = usize::try_from(self.bytes_written()).unwrap(); - if s % PAGE_SZ == 0 { - s - } else { - s.checked_add(PAGE_SZ - (s % PAGE_SZ)).unwrap() - } - }; - let vec = Vec::with_capacity(size); - - // read from disk what we've already flushed - let file_size_tracking_writer = self.rw.as_writer(); - let flushed_range = 0..usize::try_from(file_size_tracking_writer.bytes_written()).unwrap(); - let mut vec = file_size_tracking_writer - .as_inner() - .read_exact_at( - vec.slice(0..(flushed_range.end - flushed_range.start)), - u64::try_from(flushed_range.start).unwrap(), - ctx, - ) - .await? - .into_inner(); - - // copy from in-memory buffer what we haven't flushed yet but would return when accessed via read_blk - let buffered = self.rw.get_tail_zero_padded(); - vec.extend_from_slice(buffered); - assert_eq!(vec.len(), size); - assert_eq!(vec.len() % PAGE_SZ, 0); - Ok(vec) - } - - pub(crate) async fn read_blk( - &self, - blknum: u32, - ctx: &RequestContext, - ) -> Result { - match self.rw.read_blk(blknum).await? { - zero_padded_read_write::ReadResult::NeedsReadFromWriter { writer } => { - let cache = page_cache::get(); - match cache - .read_immutable_buf(self.page_cache_file_id, blknum, ctx) - .await - .map_err(|e| { - std::io::Error::new( - std::io::ErrorKind::Other, - // order path before error because error is anyhow::Error => might have many contexts - format!( - "ephemeral file: read immutable page #{}: {}: {:#}", - blknum, - self.rw.as_writer().as_inner().path, - e, - ), - ) - })? 
{ - page_cache::ReadBufResult::Found(guard) => { - return Ok(BlockLease::PageReadGuard(guard)) - } - page_cache::ReadBufResult::NotFound(write_guard) => { - let write_guard = writer - .as_inner() - .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64, ctx) - .await?; - let read_guard = write_guard.mark_valid(); - return Ok(BlockLease::PageReadGuard(read_guard)); - } - } - } - zero_padded_read_write::ReadResult::ServedFromZeroPaddedMutableTail { buffer } => { - Ok(BlockLease::EphemeralFileMutableTail(buffer)) - } - } - } -} - -impl Drop for RW { - fn drop(&mut self) { - // There might still be pages in the [`crate::page_cache`] for this file. - // We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed. - - // unlink the file - // we are clear to do this, because we have entered a gate - let path = &self.rw.as_writer().as_inner().path; - let res = std::fs::remove_file(path); - if let Err(e) = res { - if e.kind() != std::io::ErrorKind::NotFound { - // just never log the not found errors, we cannot do anything for them; on detach - // the tenant directory is already gone. - // - // not found files might also be related to https://github.com/neondatabase/neon/issues/2442 - error!("could not remove ephemeral file '{path}': {e}"); - } - } - } -} diff --git a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs deleted file mode 100644 index fe310acab8..0000000000 --- a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs +++ /dev/null @@ -1,145 +0,0 @@ -//! The heart of how [`super::EphemeralFile`] does its reads and writes. -//! -//! # Writes -//! -//! [`super::EphemeralFile`] writes small, borrowed buffers using [`RW::write_all_borrowed`]. -//! The [`RW`] batches these into [`TAIL_SZ`] bigger writes, using [`owned_buffers_io::write::BufferedWriter`]. -//! -//! # Reads -//! -//! [`super::EphemeralFile`] always reads full [`PAGE_SZ`]ed blocks using [`RW::read_blk`]. -//! -//! The [`RW`] serves these reads either from the buffered writer's in-memory buffer -//! or redirects the caller to read from the underlying [`OwnedAsyncWriter`] -//! if the read is for the prefix that has already been flushed. -//! -//! # Current Usage -//! -//! The current user of this module is [`super::page_caching::RW`]. - -mod zero_padded; - -use crate::{ - context::RequestContext, - page_cache::PAGE_SZ, - virtual_file::owned_buffers_io::{ - self, - write::{Buffer, OwnedAsyncWriter}, - }, -}; - -const TAIL_SZ: usize = 64 * 1024; - -/// See module-level comment. 
-pub struct RW { - buffered_writer: owned_buffers_io::write::BufferedWriter< - zero_padded::Buffer, - owned_buffers_io::util::size_tracking_writer::Writer, - >, -} - -pub enum ReadResult<'a, W> { - NeedsReadFromWriter { writer: &'a W }, - ServedFromZeroPaddedMutableTail { buffer: &'a [u8; PAGE_SZ] }, -} - -impl RW -where - W: OwnedAsyncWriter, -{ - pub fn new(writer: W) -> Self { - let bytes_flushed_tracker = - owned_buffers_io::util::size_tracking_writer::Writer::new(writer); - let buffered_writer = owned_buffers_io::write::BufferedWriter::new( - bytes_flushed_tracker, - zero_padded::Buffer::default(), - ); - Self { buffered_writer } - } - - pub(crate) fn as_writer(&self) -> &W { - self.buffered_writer.as_inner().as_inner() - } - - pub async fn write_all_borrowed( - &mut self, - buf: &[u8], - ctx: &RequestContext, - ) -> std::io::Result { - self.buffered_writer.write_buffered_borrowed(buf, ctx).await - } - - pub fn bytes_written(&self) -> u64 { - let flushed_offset = self.buffered_writer.as_inner().bytes_written(); - let buffer: &zero_padded::Buffer = self.buffered_writer.inspect_buffer(); - flushed_offset + u64::try_from(buffer.pending()).unwrap() - } - - /// Get a slice of all blocks that [`Self::read_blk`] would return as [`ReadResult::ServedFromZeroPaddedMutableTail`]. - pub fn get_tail_zero_padded(&self) -> &[u8] { - let buffer: &zero_padded::Buffer = self.buffered_writer.inspect_buffer(); - let buffer_written_up_to = buffer.pending(); - // pad to next page boundary - let read_up_to = if buffer_written_up_to % PAGE_SZ == 0 { - buffer_written_up_to - } else { - buffer_written_up_to - .checked_add(PAGE_SZ - (buffer_written_up_to % PAGE_SZ)) - .unwrap() - }; - &buffer.as_zero_padded_slice()[0..read_up_to] - } - - pub(crate) async fn read_blk(&self, blknum: u32) -> Result, std::io::Error> { - let flushed_offset = self.buffered_writer.as_inner().bytes_written(); - let buffer: &zero_padded::Buffer = self.buffered_writer.inspect_buffer(); - let buffered_offset = flushed_offset + u64::try_from(buffer.pending()).unwrap(); - let read_offset = (blknum as u64) * (PAGE_SZ as u64); - - // The trailing page ("block") might only be partially filled, - // yet the blob_io code relies on us to return a full PAGE_SZed slice anyway. - // Moreover, it has to be zero-padded, because when we still had - // a write-back page cache, it provided pre-zeroed pages, and blob_io came to rely on it. - // DeltaLayer probably has the same issue, not sure why it needs no special treatment. 
- // => check here that the read doesn't go beyond this potentially trailing - // => the zero-padding is done in the `else` branch below - let blocks_written = if buffered_offset % (PAGE_SZ as u64) == 0 { - buffered_offset / (PAGE_SZ as u64) - } else { - (buffered_offset / (PAGE_SZ as u64)) + 1 - }; - if (blknum as u64) >= blocks_written { - return Err(std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!("read past end of ephemeral_file: read=0x{read_offset:x} buffered=0x{buffered_offset:x} flushed=0x{flushed_offset}"))); - } - - // assertions for the `if-else` below - assert_eq!( - flushed_offset % (TAIL_SZ as u64), 0, - "we only use write_buffered_borrowed to write to the buffered writer, so it's guaranteed that flushes happen buffer.cap()-sized chunks" - ); - assert_eq!( - flushed_offset % (PAGE_SZ as u64), - 0, - "the logic below can't handle if the page is spread across the flushed part and the buffer" - ); - - if read_offset < flushed_offset { - assert!(read_offset + (PAGE_SZ as u64) <= flushed_offset); - Ok(ReadResult::NeedsReadFromWriter { - writer: self.as_writer(), - }) - } else { - let read_offset_in_buffer = read_offset - .checked_sub(flushed_offset) - .expect("would have taken `if` branch instead of this one"); - let read_offset_in_buffer = usize::try_from(read_offset_in_buffer).unwrap(); - let zero_padded_slice = buffer.as_zero_padded_slice(); - let page = &zero_padded_slice[read_offset_in_buffer..(read_offset_in_buffer + PAGE_SZ)]; - Ok(ReadResult::ServedFromZeroPaddedMutableTail { - buffer: page - .try_into() - .expect("the slice above got it as page-size slice"), - }) - } - } -} diff --git a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs deleted file mode 100644 index 2dc0277638..0000000000 --- a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs +++ /dev/null @@ -1,110 +0,0 @@ -//! A [`crate::virtual_file::owned_buffers_io::write::Buffer`] whose -//! unwritten range is guaranteed to be zero-initialized. -//! This is used by [`crate::tenant::ephemeral_file::zero_padded_read_write::RW::read_blk`] -//! to serve page-sized reads of the trailing page when the trailing page has only been partially filled. - -use std::mem::MaybeUninit; - -use crate::virtual_file::owned_buffers_io::io_buf_ext::FullSlice; - -/// See module-level comment. 
-pub struct Buffer { - allocation: Box<[u8; N]>, - written: usize, -} - -impl Default for Buffer { - fn default() -> Self { - Self { - allocation: Box::new( - // SAFETY: zeroed memory is a valid [u8; N] - unsafe { MaybeUninit::zeroed().assume_init() }, - ), - written: 0, - } - } -} - -impl Buffer { - #[inline(always)] - fn invariants(&self) { - // don't check by default, unoptimized is too expensive even for debug mode - if false { - debug_assert!(self.written <= N, "{}", self.written); - debug_assert!(self.allocation[self.written..N].iter().all(|v| *v == 0)); - } - } - - pub fn as_zero_padded_slice(&self) -> &[u8; N] { - &self.allocation - } -} - -impl crate::virtual_file::owned_buffers_io::write::Buffer for Buffer { - type IoBuf = Self; - - fn cap(&self) -> usize { - self.allocation.len() - } - - fn extend_from_slice(&mut self, other: &[u8]) { - self.invariants(); - let remaining = self.allocation.len() - self.written; - if other.len() > remaining { - panic!("calling extend_from_slice() with insufficient remaining capacity"); - } - self.allocation[self.written..(self.written + other.len())].copy_from_slice(other); - self.written += other.len(); - self.invariants(); - } - - fn pending(&self) -> usize { - self.written - } - - fn flush(self) -> FullSlice { - self.invariants(); - let written = self.written; - FullSlice::must_new(tokio_epoll_uring::BoundedBuf::slice(self, 0..written)) - } - - fn reuse_after_flush(iobuf: Self::IoBuf) -> Self { - let Self { - mut allocation, - written, - } = iobuf; - allocation[0..written].fill(0); - let new = Self { - allocation, - written: 0, - }; - new.invariants(); - new - } -} - -/// We have this trait impl so that the `flush` method in the `Buffer` impl above can produce a -/// [`tokio_epoll_uring::BoundedBuf::slice`] of the [`Self::written`] range of the data. -/// -/// Remember that bytes_init is generally _not_ a tracker of the amount -/// of valid data in the io buffer; we use `Slice` for that. -/// The `IoBuf` is _only_ for keeping track of uninitialized memory, a bit like MaybeUninit. -/// -/// SAFETY: -/// -/// The [`Self::allocation`] is stable becauses boxes are stable. -/// The memory is zero-initialized, so, bytes_init is always N. -unsafe impl tokio_epoll_uring::IoBuf for Buffer { - fn stable_ptr(&self) -> *const u8 { - self.allocation.as_ptr() - } - - fn bytes_init(&self) -> usize { - // Yes, N, not self.written; Read the full comment of this impl block! 
- N - } - - fn bytes_total(&self) -> usize { - N - } -} diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index c0508e13c0..00ef5b0afd 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -65,7 +65,7 @@ use std::os::unix::fs::FileExt; use std::str::FromStr; use std::sync::Arc; use tokio::sync::OnceCell; -use tokio_epoll_uring::IoBufMut; +use tokio_epoll_uring::IoBuf; use tracing::*; use utils::{ @@ -471,7 +471,7 @@ impl DeltaLayerWriterInner { ctx: &RequestContext, ) -> (FullSlice, anyhow::Result<()>) where - Buf: IoBufMut + Send, + Buf: IoBuf + Send, { assert!( self.lsn_range.start <= lsn, @@ -678,7 +678,7 @@ impl DeltaLayerWriter { ctx: &RequestContext, ) -> (FullSlice, anyhow::Result<()>) where - Buf: IoBufMut + Send, + Buf: IoBuf + Send, { self.inner .as_mut() diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index a71b4dd83b..f31ab4b1e8 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -4,23 +4,23 @@ //! held in an ephemeral file, not in memory. The metadata for each page version, i.e. //! its position in the file, is kept in memory, though. //! +use crate::assert_u64_eq_usize::{u64_to_usize, U64IsUsize, UsizeIsU64}; use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; -use crate::page_cache::PAGE_SZ; use crate::repository::{Key, Value}; -use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef}; use crate::tenant::ephemeral_file::EphemeralFile; use crate::tenant::timeline::GetVectoredError; use crate::tenant::PageReconstructError; use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; use crate::{l0_flush, page_cache}; -use anyhow::{anyhow, Result}; +use anyhow::{anyhow, Context, Result}; +use bytes::Bytes; use camino::Utf8PathBuf; use pageserver_api::key::CompactKey; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; -use std::collections::BTreeMap; +use std::collections::{BTreeMap, HashMap}; use std::sync::{Arc, OnceLock}; use std::time::Instant; use tracing::*; @@ -39,6 +39,8 @@ use super::{ DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState, }; +pub(crate) mod vectored_dio_read; + #[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)] pub(crate) struct InMemoryLayerFileId(page_cache::FileId); @@ -78,9 +80,9 @@ impl std::fmt::Debug for InMemoryLayer { pub struct InMemoryLayerInner { /// All versions of all pages in the layer are kept here. Indexed - /// by block number and LSN. The value is an offset into the + /// by block number and LSN. The [`IndexEntry`] is an offset into the /// ephemeral file where the page version is stored. - index: BTreeMap>, + index: BTreeMap>, /// The values are stored in a serialized format in this file. /// Each serialized Value is preceded by a 'u32' length field. @@ -90,6 +92,154 @@ pub struct InMemoryLayerInner { resource_units: GlobalResourceUnits, } +/// Support the same max blob length as blob_io, because ultimately +/// all the InMemoryLayer contents end up being written into a delta layer, +/// using the [`crate::tenant::blob_io`]. 
+const MAX_SUPPORTED_BLOB_LEN: usize = crate::tenant::blob_io::MAX_SUPPORTED_BLOB_LEN; +const MAX_SUPPORTED_BLOB_LEN_BITS: usize = { + let trailing_ones = MAX_SUPPORTED_BLOB_LEN.trailing_ones() as usize; + let leading_zeroes = MAX_SUPPORTED_BLOB_LEN.leading_zeros() as usize; + assert!(trailing_ones + leading_zeroes == std::mem::size_of::() * 8); + trailing_ones +}; + +/// See [`InMemoryLayerInner::index`]. +/// +/// For memory efficiency, the data is packed into a u64. +/// +/// Layout: +/// - 1 bit: `will_init` +/// - [`MAX_SUPPORTED_BLOB_LEN_BITS`]: `len` +/// - [`MAX_SUPPORTED_POS_BITS`]: `pos` +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct IndexEntry(u64); + +impl IndexEntry { + /// See [`Self::MAX_SUPPORTED_POS`]. + const MAX_SUPPORTED_POS_BITS: usize = { + let remainder = 64 - 1 - MAX_SUPPORTED_BLOB_LEN_BITS; + if remainder < 32 { + panic!("pos can be u32 as per type system, support that"); + } + remainder + }; + /// The maximum supported blob offset that can be represented by [`Self`]. + /// See also [`Self::validate_checkpoint_distance`]. + const MAX_SUPPORTED_POS: usize = (1 << Self::MAX_SUPPORTED_POS_BITS) - 1; + + // Layout + const WILL_INIT_RANGE: Range = 0..1; + const LEN_RANGE: Range = + Self::WILL_INIT_RANGE.end..Self::WILL_INIT_RANGE.end + MAX_SUPPORTED_BLOB_LEN_BITS; + const POS_RANGE: Range = + Self::LEN_RANGE.end..Self::LEN_RANGE.end + Self::MAX_SUPPORTED_POS_BITS; + const _ASSERT: () = { + if Self::POS_RANGE.end != 64 { + panic!("we don't want undefined bits for our own sanity") + } + }; + + /// Fails if and only if the offset or length encoded in `arg` is too large to be represented by [`Self`]. + /// + /// The only reason why that can happen in the system is if the [`InMemoryLayer`] grows too long. + /// The [`InMemoryLayer`] size is determined by the checkpoint distance, enforced by [`crate::tenant::Timeline::should_roll`]. + /// + /// Thus, to avoid failure of this function, whenever we start up and/or change checkpoint distance, + /// call [`Self::validate_checkpoint_distance`] with the new checkpoint distance value. + /// + /// TODO: this check should happen ideally at config parsing time (and in the request handler when a change to checkpoint distance is requested) + /// When cleaning this up, also look into the s3 max file size check that is performed in delta layer writer. 
+ #[inline(always)] + fn new(arg: IndexEntryNewArgs) -> anyhow::Result { + let IndexEntryNewArgs { + base_offset, + batch_offset, + len, + will_init, + } = arg; + + let pos = base_offset + .checked_add(batch_offset) + .ok_or_else(|| anyhow::anyhow!("base_offset + batch_offset overflows u64: base_offset={base_offset} batch_offset={batch_offset}"))?; + + if pos.into_usize() > Self::MAX_SUPPORTED_POS { + anyhow::bail!( + "base_offset+batch_offset exceeds the maximum supported value: base_offset={base_offset} batch_offset={batch_offset} (+)={pos} max={max}", + max = Self::MAX_SUPPORTED_POS + ); + } + + if len > MAX_SUPPORTED_BLOB_LEN { + anyhow::bail!( + "len exceeds the maximum supported length: len={len} max={MAX_SUPPORTED_BLOB_LEN}", + ); + } + + let mut data: u64 = 0; + use bit_field::BitField; + data.set_bits(Self::WILL_INIT_RANGE, if will_init { 1 } else { 0 }); + data.set_bits(Self::LEN_RANGE, len.into_u64()); + data.set_bits(Self::POS_RANGE, pos); + + Ok(Self(data)) + } + + #[inline(always)] + fn unpack(&self) -> IndexEntryUnpacked { + use bit_field::BitField; + IndexEntryUnpacked { + will_init: self.0.get_bits(Self::WILL_INIT_RANGE) != 0, + len: self.0.get_bits(Self::LEN_RANGE), + pos: self.0.get_bits(Self::POS_RANGE), + } + } + + /// See [`Self::new`]. + pub(crate) const fn validate_checkpoint_distance( + checkpoint_distance: u64, + ) -> Result<(), &'static str> { + if checkpoint_distance > Self::MAX_SUPPORTED_POS as u64 { + return Err("exceeds the maximum supported value"); + } + let res = u64_to_usize(checkpoint_distance).checked_add(MAX_SUPPORTED_BLOB_LEN); + if res.is_none() { + return Err( + "checkpoint distance + max supported blob len overflows in-memory addition", + ); + } + + // NB: it is ok for the result of the addition to be larger than MAX_SUPPORTED_POS + + Ok(()) + } + + const _ASSERT_DEFAULT_CHECKPOINT_DISTANCE_IS_VALID: () = { + let res = Self::validate_checkpoint_distance( + crate::tenant::config::defaults::DEFAULT_CHECKPOINT_DISTANCE, + ); + if res.is_err() { + panic!("default checkpoint distance is valid") + } + }; +} + +/// Args to [`IndexEntry::new`]. +#[derive(Clone, Copy)] +struct IndexEntryNewArgs { + base_offset: u64, + batch_offset: u64, + len: usize, + will_init: bool, +} + +/// Unpacked representation of the bitfielded [`IndexEntry`]. 
+#[derive(Clone, Copy, PartialEq, Eq, Debug)] +struct IndexEntryUnpacked { + will_init: bool, + len: u64, + pos: u64, +} + impl std::fmt::Debug for InMemoryLayerInner { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("InMemoryLayerInner").finish() @@ -276,7 +426,12 @@ impl InMemoryLayer { .build(); let inner = self.inner.read().await; - let reader = inner.file.block_cursor(); + + struct ValueRead { + entry_lsn: Lsn, + read: vectored_dio_read::LogicalRead>, + } + let mut reads: HashMap> = HashMap::new(); for range in keyspace.ranges.iter() { for (key, vec_map) in inner @@ -291,24 +446,62 @@ impl InMemoryLayer { let slice = vec_map.slice_range(lsn_range); - for (entry_lsn, pos) in slice.iter().rev() { - // TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183 - let buf = reader.read_blob(*pos, &ctx).await; - if let Err(e) = buf { - reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e))); + for (entry_lsn, index_entry) in slice.iter().rev() { + let IndexEntryUnpacked { + pos, + len, + will_init, + } = index_entry.unpack(); + reads.entry(key).or_default().push(ValueRead { + entry_lsn: *entry_lsn, + read: vectored_dio_read::LogicalRead::new( + pos, + Vec::with_capacity(len as usize), + ), + }); + if will_init { break; } + } + } + } - let value = Value::des(&buf.unwrap()); - if let Err(e) = value { + // Execute the reads. + + let f = vectored_dio_read::execute( + &inner.file, + reads + .iter() + .flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)), + &ctx, + ); + send_future::SendFuture::send(f) // https://github.com/rust-lang/rust/issues/96865 + .await; + + // Process results into the reconstruct state + 'next_key: for (key, value_reads) in reads { + for ValueRead { entry_lsn, read } in value_reads { + match read.into_result().expect("we run execute() above") { + Err(e) => { reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e))); - break; + continue 'next_key; } + Ok(value_buf) => { + let value = Value::des(&value_buf); + if let Err(e) = value { + reconstruct_state + .on_key_error(key, PageReconstructError::from(anyhow!(e))); + continue 'next_key; + } - let key_situation = - reconstruct_state.update_key(&key, *entry_lsn, value.unwrap()); - if key_situation == ValueReconstructSituation::Complete { - break; + let key_situation = + reconstruct_state.update_key(&key, entry_lsn, value.unwrap()); + if key_situation == ValueReconstructSituation::Complete { + // TODO: metric to see if we fetched more values than necessary + continue 'next_key; + } + + // process the next value in the next iteration of the loop } } } @@ -324,8 +517,9 @@ impl InMemoryLayer { struct SerializedBatchOffset { key: CompactKey, lsn: Lsn, - /// offset in bytes from the start of the batch's buffer to the Value's serialized size header. - offset: u64, + // TODO: separate type when we start serde-serializing this value, to avoid coupling + // in-memory representation to serialization format. 
+ index_entry: IndexEntry, } pub struct SerializedBatch { @@ -340,30 +534,10 @@ pub struct SerializedBatch { } impl SerializedBatch { - /// Write a blob length in the internal format of the EphemeralFile - pub(crate) fn write_blob_length(len: usize, cursor: &mut std::io::Cursor>) { - use std::io::Write; - - if len < 0x80 { - // short one-byte length header - let len_buf = [len as u8]; - - cursor - .write_all(&len_buf) - .expect("Writing to Vec is infallible"); - } else { - let mut len_buf = u32::to_be_bytes(len as u32); - len_buf[0] |= 0x80; - cursor - .write_all(&len_buf) - .expect("Writing to Vec is infallible"); - } - } - - pub fn from_values(batch: Vec<(CompactKey, Lsn, usize, Value)>) -> Self { + pub fn from_values(batch: Vec<(CompactKey, Lsn, usize, Value)>) -> anyhow::Result { // Pre-allocate a big flat buffer to write into. This should be large but not huge: it is soft-limited in practice by // [`crate::pgdatadir_mapping::DatadirModification::MAX_PENDING_BYTES`] - let buffer_size = batch.iter().map(|i| i.2).sum::() + 4 * batch.len(); + let buffer_size = batch.iter().map(|i| i.2).sum::(); let mut cursor = std::io::Cursor::new(Vec::::with_capacity(buffer_size)); let mut offsets: Vec = Vec::with_capacity(batch.len()); @@ -371,14 +545,19 @@ impl SerializedBatch { for (key, lsn, val_ser_size, val) in batch { let relative_off = cursor.position(); - Self::write_blob_length(val_ser_size, &mut cursor); val.ser_into(&mut cursor) .expect("Writing into in-memory buffer is infallible"); offsets.push(SerializedBatchOffset { key, lsn, - offset: relative_off, + index_entry: IndexEntry::new(IndexEntryNewArgs { + base_offset: 0, + batch_offset: relative_off, + len: val_ser_size, + will_init: val.will_init(), + }) + .context("higher-level code ensures that values are within supported ranges")?, }); max_lsn = std::cmp::max(max_lsn, lsn); } @@ -388,11 +567,11 @@ impl SerializedBatch { // Assert that we didn't do any extra allocations while building buffer. debug_assert!(buffer.len() <= buffer_size); - Self { + Ok(Self { raw: buffer, offsets, max_lsn, - } + }) } } @@ -456,44 +635,69 @@ impl InMemoryLayer { }) } - // Write path. + /// Write path. + /// + /// Errors are not retryable, the [`InMemoryLayer`] must be discarded, and not be read from. + /// The reason why it's not retryable is that the [`EphemeralFile`] writes are not retryable. + /// TODO: it can be made retryable if we aborted the process on EphemeralFile write errors. pub async fn put_batch( &self, serialized_batch: SerializedBatch, ctx: &RequestContext, - ) -> Result<()> { + ) -> anyhow::Result<()> { let mut inner = self.inner.write().await; self.assert_writable(); - let base_off = { - inner - .file - .write_raw( - &serialized_batch.raw, - &RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::InMemoryLayer) - .build(), - ) - .await? - }; + let base_offset = inner.file.len(); + let SerializedBatch { + raw, + mut offsets, + max_lsn: _, + } = serialized_batch; + + // Add the base_offset to the batch's index entries which are relative to the batch start. 
+ for offset in &mut offsets { + let IndexEntryUnpacked { + will_init, + len, + pos, + } = offset.index_entry.unpack(); + offset.index_entry = IndexEntry::new(IndexEntryNewArgs { + base_offset, + batch_offset: pos, + len: len.into_usize(), + will_init, + })?; + } + + // Write the batch to the file + inner.file.write_raw(&raw, ctx).await?; + let new_size = inner.file.len(); + let expected_new_len = base_offset + .checked_add(raw.len().into_u64()) + // write_raw would error if we were to overflow u64. + // also IndexEntry and higher levels in + //the code don't allow the file to grow that large + .unwrap(); + assert_eq!(new_size, expected_new_len); + + // Update the index with the new entries for SerializedBatchOffset { key, lsn, - offset: relative_off, - } in serialized_batch.offsets + index_entry, + } in offsets { - let off = base_off + relative_off; let vec_map = inner.index.entry(key).or_default(); - let old = vec_map.append_or_update_last(lsn, off).unwrap().0; + let old = vec_map.append_or_update_last(lsn, index_entry).unwrap().0; if old.is_some() { // We already had an entry for this LSN. That's odd.. warn!("Key {} at {} already exists", key, lsn); } } - let size = inner.file.len(); - inner.resource_units.maybe_publish_size(size); + inner.resource_units.maybe_publish_size(new_size); Ok(()) } @@ -537,7 +741,7 @@ impl InMemoryLayer { { let inner = self.inner.write().await; for vec_map in inner.index.values() { - for (lsn, _pos) in vec_map.as_slice() { + for (lsn, _) in vec_map.as_slice() { assert!(*lsn < end_lsn); } } @@ -601,36 +805,23 @@ impl InMemoryLayer { match l0_flush_global_state { l0_flush::Inner::Direct { .. } => { let file_contents: Vec = inner.file.load_to_vec(ctx).await?; - assert_eq!( - file_contents.len() % PAGE_SZ, - 0, - "needed by BlockReaderRef::Slice" - ); - assert_eq!(file_contents.len(), { - let written = usize::try_from(inner.file.len()).unwrap(); - if written % PAGE_SZ == 0 { - written - } else { - written.checked_add(PAGE_SZ - (written % PAGE_SZ)).unwrap() - } - }); - let cursor = BlockCursor::new(BlockReaderRef::Slice(&file_contents)); - - let mut buf = Vec::new(); + let file_contents = Bytes::from(file_contents); for (key, vec_map) in inner.index.iter() { // Write all page versions - for (lsn, pos) in vec_map.as_slice() { - // TODO: once we have blob lengths in the in-memory index, we can - // 1. get rid of the blob_io / BlockReaderRef::Slice business and - // 2. load the file contents into a Bytes and - // 3. the use `Bytes::slice` to get the `buf` that is our blob - // 4. 
pass that `buf` into `put_value_bytes` - // => https://github.com/neondatabase/neon/issues/8183 - cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?; - let will_init = Value::des(&buf)?.will_init(); - let (tmp, res) = delta_layer_writer + for (lsn, entry) in vec_map + .as_slice() + .iter() + .map(|(lsn, entry)| (lsn, entry.unpack())) + { + let IndexEntryUnpacked { + pos, + len, + will_init, + } = entry; + let buf = Bytes::slice(&file_contents, pos as usize..(pos + len) as usize); + let (_buf, res) = delta_layer_writer .put_value_bytes( Key::from_compact(*key), *lsn, @@ -640,7 +831,6 @@ impl InMemoryLayer { ) .await; res?; - buf = tmp.into_raw_slice().into_inner(); } } } @@ -662,3 +852,134 @@ impl InMemoryLayer { Ok(Some((desc, path))) } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_index_entry() { + const MAX_SUPPORTED_POS: usize = IndexEntry::MAX_SUPPORTED_POS; + use IndexEntryNewArgs as Args; + use IndexEntryUnpacked as Unpacked; + + let roundtrip = |args, expect: Unpacked| { + let res = IndexEntry::new(args).expect("this tests expects no errors"); + let IndexEntryUnpacked { + will_init, + len, + pos, + } = res.unpack(); + assert_eq!(will_init, expect.will_init); + assert_eq!(len, expect.len); + assert_eq!(pos, expect.pos); + }; + + // basic roundtrip + for pos in [0, MAX_SUPPORTED_POS] { + for len in [0, MAX_SUPPORTED_BLOB_LEN] { + for will_init in [true, false] { + let expect = Unpacked { + will_init, + len: len.into_u64(), + pos: pos.into_u64(), + }; + roundtrip( + Args { + will_init, + base_offset: pos.into_u64(), + batch_offset: 0, + len, + }, + expect, + ); + roundtrip( + Args { + will_init, + base_offset: 0, + batch_offset: pos.into_u64(), + len, + }, + expect, + ); + } + } + } + + // too-large len + let too_large = Args { + will_init: false, + len: MAX_SUPPORTED_BLOB_LEN + 1, + base_offset: 0, + batch_offset: 0, + }; + assert!(IndexEntry::new(too_large).is_err()); + + // too-large pos + { + let too_large = Args { + will_init: false, + len: 0, + base_offset: MAX_SUPPORTED_POS.into_u64() + 1, + batch_offset: 0, + }; + assert!(IndexEntry::new(too_large).is_err()); + let too_large = Args { + will_init: false, + len: 0, + base_offset: 0, + batch_offset: MAX_SUPPORTED_POS.into_u64() + 1, + }; + assert!(IndexEntry::new(too_large).is_err()); + } + + // too large (base_offset + batch_offset) + { + let too_large = Args { + will_init: false, + len: 0, + base_offset: MAX_SUPPORTED_POS.into_u64(), + batch_offset: 1, + }; + assert!(IndexEntry::new(too_large).is_err()); + let too_large = Args { + will_init: false, + len: 0, + base_offset: MAX_SUPPORTED_POS.into_u64() - 1, + batch_offset: MAX_SUPPORTED_POS.into_u64() - 1, + }; + assert!(IndexEntry::new(too_large).is_err()); + } + + // valid special cases + // - area past the max supported pos that is accessible by len + for len in [1, MAX_SUPPORTED_BLOB_LEN] { + roundtrip( + Args { + will_init: false, + len, + base_offset: MAX_SUPPORTED_POS.into_u64(), + batch_offset: 0, + }, + Unpacked { + will_init: false, + len: len as u64, + pos: MAX_SUPPORTED_POS.into_u64(), + }, + ); + roundtrip( + Args { + will_init: false, + len, + base_offset: 0, + batch_offset: MAX_SUPPORTED_POS.into_u64(), + }, + Unpacked { + will_init: false, + len: len as u64, + pos: MAX_SUPPORTED_POS.into_u64(), + }, + ); + } + } +} diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs new file mode 100644 index 0000000000..0683e15659 --- /dev/null +++ 
b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs @@ -0,0 +1,937 @@ +use std::{ + collections::BTreeMap, + sync::{Arc, RwLock}, +}; + +use itertools::Itertools; +use tokio_epoll_uring::{BoundedBuf, IoBufMut, Slice}; + +use crate::{ + assert_u64_eq_usize::{U64IsUsize, UsizeIsU64}, + context::RequestContext, +}; + +/// The file interface we require. At runtime, this is a [`crate::tenant::ephemeral_file::EphemeralFile`]. +pub trait File: Send { + /// Attempt to read the bytes in `self` in range `[start,start+dst.bytes_total())` + /// and return the number of bytes read (let's call it `nread`). + /// The bytes read are placed in `dst`, i.e., `&dst[..nread]` will contain the read bytes. + /// + /// The only reason why the read may be short (i.e., `nread != dst.bytes_total()`) + /// is if the file is shorter than `start+dst.len()`. + /// + /// This is unlike [`std::os::unix::fs::FileExt::read_exact_at`] which returns an + /// [`std::io::ErrorKind::UnexpectedEof`] error if the file is shorter than `start+dst.len()`. + /// + /// No guarantees are made about the remaining bytes in `dst` in case of a short read. + async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>( + &'b self, + start: u64, + dst: Slice, + ctx: &'a RequestContext, + ) -> std::io::Result<(Slice, usize)>; +} + +/// A logical read from [`File`]. See [`Self::new`]. +pub struct LogicalRead { + pos: u64, + state: RwLockRefCell>, +} + +enum LogicalReadState { + NotStarted(B), + Ongoing(B), + Ok(B), + Error(Arc), + Undefined, +} + +impl LogicalRead { + /// Create a new [`LogicalRead`] from [`File`] of the data in the file in range `[ pos, pos + buf.cap() )`. + pub fn new(pos: u64, buf: B) -> Self { + Self { + pos, + state: RwLockRefCell::new(LogicalReadState::NotStarted(buf)), + } + } + pub fn into_result(self) -> Option>> { + match self.state.into_inner() { + LogicalReadState::Ok(buf) => Some(Ok(buf)), + LogicalReadState::Error(e) => Some(Err(e)), + LogicalReadState::NotStarted(_) | LogicalReadState::Ongoing(_) => None, + LogicalReadState::Undefined => unreachable!(), + } + } +} + +/// The buffer into which a [`LogicalRead`] result is placed. +pub trait Buffer: std::ops::Deref { + /// Immutable. + fn cap(&self) -> usize; + /// Changes only through [`Self::extend_from_slice`]. + fn len(&self) -> usize; + /// Panics if the total length would exceed the initialized capacity. + fn extend_from_slice(&mut self, src: &[u8]); +} + +/// The minimum alignment and size requirement for disk offsets and memory buffer size for direct IO. +const DIO_CHUNK_SIZE: usize = 512; + +/// If multiple chunks need to be read, merge adjacent chunk reads into batches of max size `MAX_CHUNK_BATCH_SIZE`. +/// (The unit is the number of chunks.) +const MAX_CHUNK_BATCH_SIZE: usize = { + let desired = 128 * 1024; // 128k + if desired % DIO_CHUNK_SIZE != 0 { + panic!("MAX_CHUNK_BATCH_SIZE must be a multiple of DIO_CHUNK_SIZE") + // compile-time error + } + desired / DIO_CHUNK_SIZE +}; + +/// Execute the given logical `reads` against `file`. +/// The results are placed in the buffers of the [`LogicalRead`]s. +/// Retrieve the results by calling [`LogicalRead::into_result`] on each [`LogicalRead`]. +/// +/// The [`LogicalRead`]s must be freshly created using [`LogicalRead::new`] when calling this function. +/// Otherwise, this function panics. 
+pub async fn execute<'a, I, F, B>(file: &F, reads: I, ctx: &RequestContext) +where + I: IntoIterator>, + F: File, + B: Buffer + IoBufMut + Send, +{ + // Terminology: + // logical read = a request to read an arbitrary range of bytes from `file`; byte-level granularity + // chunk = we conceptually divide up the byte range of `file` into DIO_CHUNK_SIZEs ranges + // interest = a range within a chunk that a logical read is interested in; one logical read gets turned into many interests + // physical read = the read request we're going to issue to the OS; covers a range of chunks; chunk-level granularity + + // Preserve a copy of the logical reads for debug assertions at the end + #[cfg(debug_assertions)] + let (reads, assert_logical_reads) = { + let (reads, assert) = reads.into_iter().tee(); + (reads, Some(Vec::from_iter(assert))) + }; + #[cfg(not(debug_assertions))] + let (reads, assert_logical_reads): (_, Option>>) = (reads, None); + + // Plan which parts of which chunks need to be appended to which buffer + let mut by_chunk: BTreeMap>> = BTreeMap::new(); + struct Interest<'a, B: Buffer> { + logical_read: &'a LogicalRead, + offset_in_chunk: u64, + len: u64, + } + for logical_read in reads { + let LogicalRead { pos, state } = logical_read; + let mut state = state.borrow_mut(); + + // transition from NotStarted to Ongoing + let cur = std::mem::replace(&mut *state, LogicalReadState::Undefined); + let req_len = match cur { + LogicalReadState::NotStarted(buf) => { + if buf.len() != 0 { + panic!("The `LogicalRead`s that are passed in must be freshly created using `LogicalRead::new`"); + } + // buf.cap() == 0 is ok + + // transition into Ongoing state + let req_len = buf.cap(); + *state = LogicalReadState::Ongoing(buf); + req_len + } + x => panic!("must only call with fresh LogicalReads, got another state, leaving Undefined state behind state={x:?}"), + }; + + // plan which chunks we need to read from + let mut remaining = req_len; + let mut chunk_no = *pos / (DIO_CHUNK_SIZE.into_u64()); + let mut offset_in_chunk = pos.into_usize() % DIO_CHUNK_SIZE; + while remaining > 0 { + let remaining_in_chunk = std::cmp::min(remaining, DIO_CHUNK_SIZE - offset_in_chunk); + by_chunk.entry(chunk_no).or_default().push(Interest { + logical_read, + offset_in_chunk: offset_in_chunk.into_u64(), + len: remaining_in_chunk.into_u64(), + }); + offset_in_chunk = 0; + chunk_no += 1; + remaining -= remaining_in_chunk; + } + } + + // At this point, we could iterate over by_chunk, in chunk order, + // read each chunk from disk, and fill the buffers. + // However, we can merge adjacent chunks into batches of MAX_CHUNK_BATCH_SIZE + // so we issue fewer IOs = fewer roundtrips = lower overall latency. 
+ struct PhysicalRead<'a, B: Buffer> { + start_chunk_no: u64, + nchunks: usize, + dsts: Vec>, + } + struct PhysicalInterest<'a, B: Buffer> { + logical_read: &'a LogicalRead, + offset_in_physical_read: u64, + len: u64, + } + let mut physical_reads: Vec> = Vec::new(); + let mut by_chunk = by_chunk.into_iter().peekable(); + loop { + let mut last_chunk_no = None; + let to_merge: Vec<(u64, Vec>)> = by_chunk + .peeking_take_while(|(chunk_no, _)| { + if let Some(last_chunk_no) = last_chunk_no { + if *chunk_no != last_chunk_no + 1 { + return false; + } + } + last_chunk_no = Some(*chunk_no); + true + }) + .take(MAX_CHUNK_BATCH_SIZE) + .collect(); // TODO: avoid this .collect() + let Some(start_chunk_no) = to_merge.first().map(|(chunk_no, _)| *chunk_no) else { + break; + }; + let nchunks = to_merge.len(); + let dsts = to_merge + .into_iter() + .enumerate() + .flat_map(|(i, (_, dsts))| { + dsts.into_iter().map( + move |Interest { + logical_read, + offset_in_chunk, + len, + }| { + PhysicalInterest { + logical_read, + offset_in_physical_read: i + .checked_mul(DIO_CHUNK_SIZE) + .unwrap() + .into_u64() + + offset_in_chunk, + len, + } + }, + ) + }) + .collect(); + physical_reads.push(PhysicalRead { + start_chunk_no, + nchunks, + dsts, + }); + } + drop(by_chunk); + + // Execute physical reads and fill the logical read buffers + // TODO: pipelined reads; prefetch; + let get_io_buffer = |nchunks| Vec::with_capacity(nchunks * DIO_CHUNK_SIZE); + for PhysicalRead { + start_chunk_no, + nchunks, + dsts, + } in physical_reads + { + let all_done = dsts + .iter() + .all(|PhysicalInterest { logical_read, .. }| logical_read.state.borrow().is_terminal()); + if all_done { + continue; + } + let read_offset = start_chunk_no + .checked_mul(DIO_CHUNK_SIZE.into_u64()) + .expect("we produce chunk_nos by dividing by DIO_CHUNK_SIZE earlier"); + let io_buf = get_io_buffer(nchunks).slice_full(); + let req_len = io_buf.len(); + let (io_buf_slice, nread) = match file.read_exact_at_eof_ok(read_offset, io_buf, ctx).await + { + Ok(t) => t, + Err(e) => { + let e = Arc::new(e); + for PhysicalInterest { logical_read, .. 
} in dsts { + *logical_read.state.borrow_mut() = LogicalReadState::Error(Arc::clone(&e)); + // this will make later reads for the given LogicalRead short-circuit, see top of loop body + } + continue; + } + }; + let io_buf = io_buf_slice.into_inner(); + assert!( + nread <= io_buf.len(), + "the last chunk in the file can be a short read, so, no ==" + ); + let io_buf = &io_buf[..nread]; + for PhysicalInterest { + logical_read, + offset_in_physical_read, + len, + } in dsts + { + let mut logical_read_state_borrow = logical_read.state.borrow_mut(); + let logical_read_buf = match &mut *logical_read_state_borrow { + LogicalReadState::NotStarted(_) => { + unreachable!("we transition it into Ongoing at function entry") + } + LogicalReadState::Ongoing(buf) => buf, + LogicalReadState::Ok(_) | LogicalReadState::Error(_) => { + continue; + } + LogicalReadState::Undefined => unreachable!(), + }; + let range_in_io_buf = std::ops::Range { + start: offset_in_physical_read as usize, + end: offset_in_physical_read as usize + len as usize, + }; + assert!(range_in_io_buf.end >= range_in_io_buf.start); + if range_in_io_buf.end > nread { + let msg = format!( + "physical read returned EOF where this logical read expected more data in the file: offset=0x{read_offset:x} req_len=0x{req_len:x} nread=0x{nread:x} {:?}", + &*logical_read_state_borrow + ); + logical_read_state_borrow.transition_to_terminal(Err(std::io::Error::new( + std::io::ErrorKind::UnexpectedEof, + msg, + ))); + continue; + } + let data = &io_buf[range_in_io_buf]; + + // Copy data from io buffer into the logical read buffer. + // (And in debug mode, validate that the buffer impl adheres to the Buffer trait spec.) + let pre = if cfg!(debug_assertions) { + Some((logical_read_buf.len(), logical_read_buf.cap())) + } else { + None + }; + logical_read_buf.extend_from_slice(data); + let post = if cfg!(debug_assertions) { + Some((logical_read_buf.len(), logical_read_buf.cap())) + } else { + None + }; + match (pre, post) { + (None, None) => {} + (Some(_), None) | (None, Some(_)) => unreachable!(), + (Some((pre_len, pre_cap)), Some((post_len, post_cap))) => { + assert_eq!(pre_len + len as usize, post_len); + assert_eq!(pre_cap, post_cap); + } + } + + if logical_read_buf.len() == logical_read_buf.cap() { + logical_read_state_borrow.transition_to_terminal(Ok(())); + } + } + } + + if let Some(assert_logical_reads) = assert_logical_reads { + for logical_read in assert_logical_reads { + assert!(logical_read.state.borrow().is_terminal()); + } + } +} + +impl LogicalReadState { + fn is_terminal(&self) -> bool { + match self { + LogicalReadState::NotStarted(_) | LogicalReadState::Ongoing(_) => false, + LogicalReadState::Ok(_) | LogicalReadState::Error(_) => true, + LogicalReadState::Undefined => unreachable!(), + } + } + fn transition_to_terminal(&mut self, err: std::io::Result<()>) { + let cur = std::mem::replace(self, LogicalReadState::Undefined); + let buf = match cur { + LogicalReadState::Ongoing(buf) => buf, + x => panic!("must only call in state Ongoing, got {x:?}"), + }; + *self = match err { + Ok(()) => LogicalReadState::Ok(buf), + Err(e) => LogicalReadState::Error(Arc::new(e)), + }; + } +} + +impl std::fmt::Debug for LogicalReadState { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + #[derive(Debug)] + #[allow(unused)] + struct BufferDebug { + len: usize, + cap: usize, + } + impl<'a> From<&'a dyn Buffer> for BufferDebug { + fn from(buf: &'a dyn Buffer) -> Self { + Self { + len: buf.len(), + cap: buf.cap(), + } + } + } + match self { + 
LogicalReadState::NotStarted(b) => { + write!(f, "NotStarted({:?})", BufferDebug::from(b as &dyn Buffer)) + } + LogicalReadState::Ongoing(b) => { + write!(f, "Ongoing({:?})", BufferDebug::from(b as &dyn Buffer)) + } + LogicalReadState::Ok(b) => write!(f, "Ok({:?})", BufferDebug::from(b as &dyn Buffer)), + LogicalReadState::Error(e) => write!(f, "Error({:?})", e), + LogicalReadState::Undefined => write!(f, "Undefined"), + } + } +} + +#[derive(Debug)] +struct RwLockRefCell(RwLock); +impl RwLockRefCell { + fn new(value: T) -> Self { + Self(RwLock::new(value)) + } + fn borrow(&self) -> impl std::ops::Deref + '_ { + self.0.try_read().unwrap() + } + fn borrow_mut(&self) -> impl std::ops::DerefMut + '_ { + self.0.try_write().unwrap() + } + fn into_inner(self) -> T { + self.0.into_inner().unwrap() + } +} + +impl Buffer for Vec { + fn cap(&self) -> usize { + self.capacity() + } + + fn len(&self) -> usize { + self.len() + } + + fn extend_from_slice(&mut self, src: &[u8]) { + if self.len() + src.len() > self.cap() { + panic!("Buffer capacity exceeded"); + } + Vec::extend_from_slice(self, src); + } +} + +#[cfg(test)] +#[allow(clippy::assertions_on_constants)] +mod tests { + use rand::Rng; + + use crate::{ + context::DownloadBehavior, task_mgr::TaskKind, + virtual_file::owned_buffers_io::slice::SliceMutExt, + }; + + use super::*; + use std::{cell::RefCell, collections::VecDeque}; + + struct InMemoryFile { + content: Vec, + } + + impl InMemoryFile { + fn new_random(len: usize) -> Self { + Self { + content: rand::thread_rng() + .sample_iter(rand::distributions::Standard) + .take(len) + .collect(), + } + } + fn test_logical_read(&self, pos: u64, len: usize) -> TestLogicalRead { + let expected_result = if pos as usize + len > self.content.len() { + Err("InMemoryFile short read".to_string()) + } else { + Ok(self.content[pos as usize..pos as usize + len].to_vec()) + }; + TestLogicalRead::new(pos, len, expected_result) + } + } + + #[test] + fn test_in_memory_file() { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + let file = InMemoryFile::new_random(10); + let test_read = |pos, len| { + let buf = vec![0; len]; + let fut = file.read_exact_at_eof_ok(pos, buf.slice_full(), &ctx); + use futures::FutureExt; + let (slice, nread) = fut + .now_or_never() + .expect("impl never awaits") + .expect("impl never errors"); + let mut buf = slice.into_inner(); + buf.truncate(nread); + buf + }; + assert_eq!(test_read(0, 1), &file.content[0..1]); + assert_eq!(test_read(1, 2), &file.content[1..3]); + assert_eq!(test_read(9, 2), &file.content[9..]); + assert!(test_read(10, 2).is_empty()); + assert!(test_read(11, 2).is_empty()); + } + + impl File for InMemoryFile { + async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>( + &'b self, + start: u64, + mut dst: Slice, + _ctx: &'a RequestContext, + ) -> std::io::Result<(Slice, usize)> { + let dst_slice: &mut [u8] = dst.as_mut_rust_slice_full_zeroed(); + let nread = { + let req_len = dst_slice.len(); + let len = std::cmp::min(req_len, self.content.len().saturating_sub(start as usize)); + if start as usize >= self.content.len() { + 0 + } else { + dst_slice[..len] + .copy_from_slice(&self.content[start as usize..start as usize + len]); + len + } + }; + rand::Rng::fill(&mut rand::thread_rng(), &mut dst_slice[nread..]); // to discover bugs + Ok((dst, nread)) + } + } + + #[derive(Clone)] + struct TestLogicalRead { + pos: u64, + len: usize, + expected_result: Result, String>, + } + + impl TestLogicalRead { + fn new(pos: u64, len: usize, expected_result: 
Result, String>) -> Self { + Self { + pos, + len, + expected_result, + } + } + fn make_logical_read(&self) -> LogicalRead> { + LogicalRead::new(self.pos, Vec::with_capacity(self.len)) + } + } + + async fn execute_and_validate_test_logical_reads( + file: &F, + test_logical_reads: I, + ctx: &RequestContext, + ) where + I: IntoIterator, + F: File, + { + let (tmp, test_logical_reads) = test_logical_reads.into_iter().tee(); + let logical_reads = tmp.map(|tr| tr.make_logical_read()).collect::>(); + execute(file, logical_reads.iter(), ctx).await; + for (logical_read, test_logical_read) in logical_reads.into_iter().zip(test_logical_reads) { + let actual = logical_read.into_result().expect("we call execute()"); + match (actual, test_logical_read.expected_result) { + (Ok(actual), Ok(expected)) if actual == expected => {} + (Err(actual), Err(expected)) => { + assert_eq!(actual.to_string(), expected); + } + (actual, expected) => panic!("expected {expected:?}\nactual {actual:?}"), + } + } + } + + #[tokio::test] + async fn test_blackbox() { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + let cs = DIO_CHUNK_SIZE; + let cs_u64 = cs.into_u64(); + + let file = InMemoryFile::new_random(10 * cs); + + let test_logical_reads = vec![ + file.test_logical_read(0, 1), + // adjacent to logical_read0 + file.test_logical_read(1, 2), + // gap + // spans adjacent chunks + file.test_logical_read(cs_u64 - 1, 2), + // gap + // tail of chunk 3, all of chunk 4, and 2 bytes of chunk 5 + file.test_logical_read(3 * cs_u64 - 1, cs + 2), + // gap + file.test_logical_read(5 * cs_u64, 1), + ]; + let num_test_logical_reads = test_logical_reads.len(); + let test_logical_reads_perms = test_logical_reads + .into_iter() + .permutations(num_test_logical_reads); + + // test all orderings of LogicalReads, the order shouldn't matter for the results + for test_logical_reads in test_logical_reads_perms { + execute_and_validate_test_logical_reads(&file, test_logical_reads, &ctx).await; + } + } + + #[tokio::test] + #[should_panic] + async fn test_reusing_logical_reads_panics() { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + let file = InMemoryFile::new_random(DIO_CHUNK_SIZE); + let a = file.test_logical_read(23, 10); + let logical_reads = vec![a.make_logical_read()]; + execute(&file, &logical_reads, &ctx).await; + // reuse pancis + execute(&file, &logical_reads, &ctx).await; + } + + struct RecorderFile<'a> { + recorded: RefCell>, + file: &'a InMemoryFile, + } + + struct RecordedRead { + pos: u64, + req_len: usize, + res: Vec, + } + + impl<'a> RecorderFile<'a> { + fn new(file: &'a InMemoryFile) -> RecorderFile<'a> { + Self { + recorded: Default::default(), + file, + } + } + } + + impl<'x> File for RecorderFile<'x> { + async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>( + &'b self, + start: u64, + dst: Slice, + ctx: &'a RequestContext, + ) -> std::io::Result<(Slice, usize)> { + let (dst, nread) = self.file.read_exact_at_eof_ok(start, dst, ctx).await?; + self.recorded.borrow_mut().push(RecordedRead { + pos: start, + req_len: dst.bytes_total(), + res: Vec::from(&dst[..nread]), + }); + Ok((dst, nread)) + } + } + + #[tokio::test] + async fn test_logical_reads_to_same_chunk_are_merged_into_one_chunk_read() { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + + let file = InMemoryFile::new_random(2 * DIO_CHUNK_SIZE); + + let a = file.test_logical_read(DIO_CHUNK_SIZE.into_u64(), 10); + let b = file.test_logical_read(DIO_CHUNK_SIZE.into_u64() + 30, 20); 
+ + let recorder = RecorderFile::new(&file); + + execute_and_validate_test_logical_reads(&recorder, vec![a, b], &ctx).await; + + let recorded = recorder.recorded.borrow(); + assert_eq!(recorded.len(), 1); + let RecordedRead { pos, req_len, .. } = &recorded[0]; + assert_eq!(*pos, DIO_CHUNK_SIZE.into_u64()); + assert_eq!(*req_len, DIO_CHUNK_SIZE); + } + + #[tokio::test] + async fn test_max_chunk_batch_size_is_respected() { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + + let file = InMemoryFile::new_random(4 * MAX_CHUNK_BATCH_SIZE * DIO_CHUNK_SIZE); + + // read the 10th byte of each chunk 3 .. 3+2*MAX_CHUNK_BATCH_SIZE + assert!(3 < MAX_CHUNK_BATCH_SIZE, "test assumption"); + assert!(10 < DIO_CHUNK_SIZE, "test assumption"); + let mut test_logical_reads = Vec::new(); + for i in 3..3 + MAX_CHUNK_BATCH_SIZE + MAX_CHUNK_BATCH_SIZE / 2 { + test_logical_reads + .push(file.test_logical_read(i.into_u64() * DIO_CHUNK_SIZE.into_u64() + 10, 1)); + } + + let recorder = RecorderFile::new(&file); + + execute_and_validate_test_logical_reads(&recorder, test_logical_reads, &ctx).await; + + let recorded = recorder.recorded.borrow(); + assert_eq!(recorded.len(), 2); + { + let RecordedRead { pos, req_len, .. } = &recorded[0]; + assert_eq!(*pos as usize, 3 * DIO_CHUNK_SIZE); + assert_eq!(*req_len, MAX_CHUNK_BATCH_SIZE * DIO_CHUNK_SIZE); + } + { + let RecordedRead { pos, req_len, .. } = &recorded[1]; + assert_eq!(*pos as usize, (3 + MAX_CHUNK_BATCH_SIZE) * DIO_CHUNK_SIZE); + assert_eq!(*req_len, MAX_CHUNK_BATCH_SIZE / 2 * DIO_CHUNK_SIZE); + } + } + + #[tokio::test] + async fn test_batch_breaks_if_chunk_is_not_interesting() { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + + assert!(MAX_CHUNK_BATCH_SIZE > 10, "test assumption"); + let file = InMemoryFile::new_random(3 * DIO_CHUNK_SIZE); + + let a = file.test_logical_read(0, 1); // chunk 0 + let b = file.test_logical_read(2 * DIO_CHUNK_SIZE.into_u64(), 1); // chunk 2 + + let recorder = RecorderFile::new(&file); + + execute_and_validate_test_logical_reads(&recorder, vec![a, b], &ctx).await; + + let recorded = recorder.recorded.borrow(); + + assert_eq!(recorded.len(), 2); + { + let RecordedRead { pos, req_len, .. } = &recorded[0]; + assert_eq!(*pos, 0); + assert_eq!(*req_len, DIO_CHUNK_SIZE); + } + { + let RecordedRead { pos, req_len, .. } = &recorded[1]; + assert_eq!(*pos, 2 * DIO_CHUNK_SIZE.into_u64()); + assert_eq!(*req_len, DIO_CHUNK_SIZE); + } + } + + struct ExpectedRead { + expect_pos: u64, + expect_len: usize, + respond: Result, String>, + } + + struct MockFile { + expected: RefCell>, + } + + impl Drop for MockFile { + fn drop(&mut self) { + assert!( + self.expected.borrow().is_empty(), + "expected reads not satisfied" + ); + } + } + + macro_rules! mock_file { + ($($pos:expr , $len:expr => $respond:expr),* $(,)?) 
=> {{ + MockFile { + expected: RefCell::new(VecDeque::from(vec![$(ExpectedRead { + expect_pos: $pos, + expect_len: $len, + respond: $respond, + }),*])), + } + }}; + } + + impl File for MockFile { + async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>( + &'b self, + start: u64, + mut dst: Slice, + _ctx: &'a RequestContext, + ) -> std::io::Result<(Slice, usize)> { + let ExpectedRead { + expect_pos, + expect_len, + respond, + } = self + .expected + .borrow_mut() + .pop_front() + .expect("unexpected read"); + assert_eq!(start, expect_pos); + assert_eq!(dst.bytes_total(), expect_len); + match respond { + Ok(mocked_bytes) => { + let len = std::cmp::min(dst.bytes_total(), mocked_bytes.len()); + let dst_slice: &mut [u8] = dst.as_mut_rust_slice_full_zeroed(); + dst_slice[..len].copy_from_slice(&mocked_bytes[..len]); + rand::Rng::fill(&mut rand::thread_rng(), &mut dst_slice[len..]); // to discover bugs + Ok((dst, len)) + } + Err(e) => Err(std::io::Error::new(std::io::ErrorKind::Other, e)), + } + } + } + + #[tokio::test] + async fn test_mock_file() { + // Self-test to ensure the relevant features of mock file work as expected. + + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + + let mock_file = mock_file! { + 0 , 512 => Ok(vec![0; 512]), + 512 , 512 => Ok(vec![1; 512]), + 1024 , 512 => Ok(vec![2; 10]), + 2048, 1024 => Err("foo".to_owned()), + }; + + let buf = Vec::with_capacity(512); + let (buf, nread) = mock_file + .read_exact_at_eof_ok(0, buf.slice_full(), &ctx) + .await + .unwrap(); + assert_eq!(nread, 512); + assert_eq!(&buf.into_inner()[..nread], &[0; 512]); + + let buf = Vec::with_capacity(512); + let (buf, nread) = mock_file + .read_exact_at_eof_ok(512, buf.slice_full(), &ctx) + .await + .unwrap(); + assert_eq!(nread, 512); + assert_eq!(&buf.into_inner()[..nread], &[1; 512]); + + let buf = Vec::with_capacity(512); + let (buf, nread) = mock_file + .read_exact_at_eof_ok(1024, buf.slice_full(), &ctx) + .await + .unwrap(); + assert_eq!(nread, 10); + assert_eq!(&buf.into_inner()[..nread], &[2; 10]); + + let buf = Vec::with_capacity(1024); + let err = mock_file + .read_exact_at_eof_ok(2048, buf.slice_full(), &ctx) + .await + .err() + .unwrap(); + assert_eq!(err.to_string(), "foo"); + } + + #[tokio::test] + async fn test_error_on_one_chunk_read_fails_only_dependent_logical_reads() { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + + let test_logical_reads = vec![ + // read spanning two batches + TestLogicalRead::new( + DIO_CHUNK_SIZE.into_u64() / 2, + MAX_CHUNK_BATCH_SIZE * DIO_CHUNK_SIZE, + Err("foo".to_owned()), + ), + // second read in failing chunk + TestLogicalRead::new( + (MAX_CHUNK_BATCH_SIZE * DIO_CHUNK_SIZE).into_u64() + DIO_CHUNK_SIZE.into_u64() - 10, + 5, + Err("foo".to_owned()), + ), + // read unaffected + TestLogicalRead::new( + (MAX_CHUNK_BATCH_SIZE * DIO_CHUNK_SIZE).into_u64() + + 2 * DIO_CHUNK_SIZE.into_u64() + + 10, + 5, + Ok(vec![1; 5]), + ), + ]; + let (tmp, test_logical_reads) = test_logical_reads.into_iter().tee(); + let test_logical_read_perms = tmp.permutations(test_logical_reads.len()); + + for test_logical_reads in test_logical_read_perms { + let file = mock_file!( + 0, MAX_CHUNK_BATCH_SIZE*DIO_CHUNK_SIZE => Ok(vec![0; MAX_CHUNK_BATCH_SIZE*DIO_CHUNK_SIZE]), + (MAX_CHUNK_BATCH_SIZE*DIO_CHUNK_SIZE).into_u64(), DIO_CHUNK_SIZE => Err("foo".to_owned()), + (MAX_CHUNK_BATCH_SIZE*DIO_CHUNK_SIZE + 2*DIO_CHUNK_SIZE).into_u64(), DIO_CHUNK_SIZE => Ok(vec![1; DIO_CHUNK_SIZE]), + ); + 
execute_and_validate_test_logical_reads(&file, test_logical_reads, &ctx).await; + } + } + + struct TestShortReadsSetup { + ctx: RequestContext, + file: InMemoryFile, + written: u64, + } + fn setup_short_chunk_read_tests() -> TestShortReadsSetup { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + assert!(DIO_CHUNK_SIZE > 20, "test assumption"); + let written = (2 * DIO_CHUNK_SIZE - 10).into_u64(); + let file = InMemoryFile::new_random(written as usize); + TestShortReadsSetup { ctx, file, written } + } + + #[tokio::test] + async fn test_short_chunk_read_from_written_range() { + // Test what happens if there are logical reads + // that start within the last chunk, and + // the last chunk is not the full chunk length. + // + // The read should succeed despite the short chunk length. + let TestShortReadsSetup { ctx, file, written } = setup_short_chunk_read_tests(); + + let a = file.test_logical_read(written - 10, 5); + let recorder = RecorderFile::new(&file); + + execute_and_validate_test_logical_reads(&recorder, vec![a], &ctx).await; + + let recorded = recorder.recorded.borrow(); + assert_eq!(recorded.len(), 1); + let RecordedRead { pos, req_len, res } = &recorded[0]; + assert_eq!(*pos, DIO_CHUNK_SIZE.into_u64()); + assert_eq!(*req_len, DIO_CHUNK_SIZE); + assert_eq!(res, &file.content[DIO_CHUNK_SIZE..(written as usize)]); + } + + #[tokio::test] + async fn test_short_chunk_read_and_logical_read_from_unwritten_range() { + // Test what happens if there are logical reads + // that start within the last chunk, and + // the last chunk is not the full chunk length, and + // the logical reads end in the unwritten range. + // + // All should fail with UnexpectedEof and have the same IO pattern. + async fn the_impl(offset_delta: i64) { + let TestShortReadsSetup { ctx, file, written } = setup_short_chunk_read_tests(); + + let offset = u64::try_from( + i64::try_from(written) + .unwrap() + .checked_add(offset_delta) + .unwrap(), + ) + .unwrap(); + let a = file.test_logical_read(offset, 5); + let recorder = RecorderFile::new(&file); + let a_vr = a.make_logical_read(); + execute(&recorder, vec![&a_vr], &ctx).await; + + // validate the LogicalRead result + let a_res = a_vr.into_result().unwrap(); + let a_err = a_res.unwrap_err(); + assert_eq!(a_err.kind(), std::io::ErrorKind::UnexpectedEof); + + // validate the IO pattern + let recorded = recorder.recorded.borrow(); + assert_eq!(recorded.len(), 1); + let RecordedRead { pos, req_len, res } = &recorded[0]; + assert_eq!(*pos, DIO_CHUNK_SIZE.into_u64()); + assert_eq!(*req_len, DIO_CHUNK_SIZE); + assert_eq!(res, &file.content[DIO_CHUNK_SIZE..(written as usize)]); + } + + the_impl(-1).await; // start == length - 1 + the_impl(0).await; // start == length + the_impl(1).await; // start == length + 1 + } + + // TODO: mixed: some valid, some UnexpectedEof + + // TODO: same tests but with merges +} diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 098c196ee8..e1dd80fbf2 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -69,7 +69,7 @@ use crate::{ config::defaults::DEFAULT_PITR_INTERVAL, layer_map::{LayerMap, SearchResult}, metadata::TimelineMetadata, - storage_layer::PersistentLayerDesc, + storage_layer::{inmemory_layer::IndexEntry, PersistentLayerDesc}, }, walredo, }; @@ -1907,6 +1907,8 @@ impl Timeline { true } else if projected_layer_size >= checkpoint_distance { + // NB: this check is relied upon by: + let _ = IndexEntry::validate_checkpoint_distance; info!( 
"Will roll layer at {} with layer size {} due to layer size ({})", projected_lsn, layer_size, projected_layer_size @@ -5702,7 +5704,7 @@ impl<'a> TimelineWriter<'a> { return Ok(()); } - let serialized_batch = inmemory_layer::SerializedBatch::from_values(batch); + let serialized_batch = inmemory_layer::SerializedBatch::from_values(batch)?; let batch_max_lsn = serialized_batch.max_lsn; let buf_size: u64 = serialized_batch.raw.len() as u64; diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index f8f37b17e3..568cf62e56 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -78,6 +78,7 @@ where .expect("must not use after we returned an error") } + /// Guarantees that if Ok() is returned, all bytes in `chunk` have been accepted. #[cfg_attr(target_os = "macos", allow(dead_code))] pub async fn write_buffered( &mut self, diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py index 66b6185aaa..f6404d68ac 100644 --- a/test_runner/regress/test_pageserver_layer_rolling.py +++ b/test_runner/regress/test_pageserver_layer_rolling.py @@ -247,9 +247,10 @@ def test_total_size_limit(neon_env_builder: NeonEnvBuilder): compaction_period_s = 10 + checkpoint_distance = 1024**3 tenant_conf = { # Large space + time thresholds: effectively disable these limits - "checkpoint_distance": f"{1024 ** 4}", + "checkpoint_distance": f"{checkpoint_distance}", "checkpoint_timeout": "3600s", "compaction_period": f"{compaction_period_s}s", } @@ -269,7 +270,11 @@ def test_total_size_limit(neon_env_builder: NeonEnvBuilder): for tenant, timeline, last_flush_lsn in last_flush_lsns: http_client = env.pageserver.http_client() initdb_lsn = Lsn(http_client.timeline_detail(tenant, timeline)["initdb_lsn"]) - total_bytes_ingested += last_flush_lsn - initdb_lsn + this_timeline_ingested = last_flush_lsn - initdb_lsn + assert ( + this_timeline_ingested < checkpoint_distance * 0.8 + ), "this test is supposed to fill InMemoryLayer" + total_bytes_ingested += this_timeline_ingested log.info(f"Ingested {total_bytes_ingested} bytes since initdb (vs max dirty {max_dirty_data})") assert total_bytes_ingested > max_dirty_data From acc075071dbb5f365f809fcf5372216e17adae6f Mon Sep 17 00:00:00 2001 From: Andrew Rudenko Date: Wed, 28 Aug 2024 21:09:26 +0200 Subject: [PATCH 1519/1571] feat(compute_ctl): add periodic `lease lsn` requests for static computes (#7994) Part of #7497 ## Problem Static computes pinned at some fix LSN could be created initially within PITR interval but eventually go out it. To make sure that Static computes are not affected by GC, we need to start using the LSN lease API (introduced in #8084) in compute_ctl. ## Summary of changes **compute_ctl** - Spawn a thread for when a static compute starts to periodically ping pageserver(s) to make LSN lease requests. - Add `test_readonly_node_gc` to test if static compute can read all pages without error. - (test will fail on main without the code change here) **page_service** - `wait_or_get_last_lsn` will now allow `request_lsn` less than `latest_gc_cutoff_lsn` to proceed if there is a lease on `request_lsn`. 
Signed-off-by: Yuchen Liang Co-authored-by: Alexey Kondratov --- compute_tools/src/bin/compute_ctl.rs | 3 + compute_tools/src/lib.rs | 1 + compute_tools/src/lsn_lease.rs | 186 ++++++++++++++++++++++ pageserver/src/page_service.rs | 45 ++++-- test_runner/regress/test_readonly_node.py | 114 ++++++++++++- 5 files changed, 331 insertions(+), 18 deletions(-) create mode 100644 compute_tools/src/lsn_lease.rs diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 0ba2c1aeb4..9499a7186e 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -44,6 +44,7 @@ use std::{thread, time::Duration}; use anyhow::{Context, Result}; use chrono::Utc; use clap::Arg; +use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static; use signal_hook::consts::{SIGQUIT, SIGTERM}; use signal_hook::{consts::SIGINT, iterator::Signals}; use tracing::{error, info, warn}; @@ -366,6 +367,8 @@ fn wait_spec( state.start_time = now; } + launch_lsn_lease_bg_task_for_static(&compute); + Ok(WaitSpecResult { compute, http_port, diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index 543d4462ed..c402d63305 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -11,6 +11,7 @@ pub mod logger; pub mod catalog; pub mod compute; pub mod extension_server; +pub mod lsn_lease; mod migration; pub mod monitor; pub mod params; diff --git a/compute_tools/src/lsn_lease.rs b/compute_tools/src/lsn_lease.rs new file mode 100644 index 0000000000..7e5917c55f --- /dev/null +++ b/compute_tools/src/lsn_lease.rs @@ -0,0 +1,186 @@ +use anyhow::bail; +use anyhow::Result; +use postgres::{NoTls, SimpleQueryMessage}; +use std::time::SystemTime; +use std::{str::FromStr, sync::Arc, thread, time::Duration}; +use utils::id::TenantId; +use utils::id::TimelineId; + +use compute_api::spec::ComputeMode; +use tracing::{info, warn}; +use utils::{ + lsn::Lsn, + shard::{ShardCount, ShardNumber, TenantShardId}, +}; + +use crate::compute::ComputeNode; + +/// Spawns a background thread to periodically renew LSN leases for static compute. +/// Do nothing if the compute is not in static mode. +pub fn launch_lsn_lease_bg_task_for_static(compute: &Arc) { + let (tenant_id, timeline_id, lsn) = { + let state = compute.state.lock().unwrap(); + let spec = state.pspec.as_ref().expect("Spec must be set"); + match spec.spec.mode { + ComputeMode::Static(lsn) => (spec.tenant_id, spec.timeline_id, lsn), + _ => return, + } + }; + let compute = compute.clone(); + + let span = tracing::info_span!("lsn_lease_bg_task", %tenant_id, %timeline_id, %lsn); + thread::spawn(move || { + let _entered = span.entered(); + if let Err(e) = lsn_lease_bg_task(compute, tenant_id, timeline_id, lsn) { + // TODO: might need stronger error feedback than logging an warning. + warn!("Exited with error: {e}"); + } + }); +} + +/// Renews lsn lease periodically so static compute are not affected by GC. +fn lsn_lease_bg_task( + compute: Arc, + tenant_id: TenantId, + timeline_id: TimelineId, + lsn: Lsn, +) -> Result<()> { + loop { + let valid_until = acquire_lsn_lease_with_retry(&compute, tenant_id, timeline_id, lsn)?; + let valid_duration = valid_until + .duration_since(SystemTime::now()) + .unwrap_or(Duration::ZERO); + + // Sleep for 60 seconds less than the valid duration but no more than half of the valid duration. 
+ let sleep_duration = valid_duration + .saturating_sub(Duration::from_secs(60)) + .max(valid_duration / 2); + + info!( + "Succeeded, sleeping for {} seconds", + sleep_duration.as_secs() + ); + thread::sleep(sleep_duration); + } +} + +/// Acquires lsn lease in a retry loop. Returns the expiration time if a lease is granted. +/// Returns an error if a lease is explicitly not granted. Otherwise, we keep sending requests. +fn acquire_lsn_lease_with_retry( + compute: &Arc, + tenant_id: TenantId, + timeline_id: TimelineId, + lsn: Lsn, +) -> Result { + let mut attempts = 0usize; + let mut retry_period_ms: f64 = 500.0; + const MAX_RETRY_PERIOD_MS: f64 = 60.0 * 1000.0; + + loop { + // Note: List of pageservers is dynamic, need to re-read configs before each attempt. + let configs = { + let state = compute.state.lock().unwrap(); + + let spec = state.pspec.as_ref().expect("spec must be set"); + + let conn_strings = spec.pageserver_connstr.split(','); + + conn_strings + .map(|connstr| { + let mut config = postgres::Config::from_str(connstr).expect("Invalid connstr"); + if let Some(storage_auth_token) = &spec.storage_auth_token { + info!("Got storage auth token from spec file"); + config.password(storage_auth_token.clone()); + } else { + info!("Storage auth token not set"); + } + config + }) + .collect::>() + }; + + let result = try_acquire_lsn_lease(tenant_id, timeline_id, lsn, &configs); + match result { + Ok(Some(res)) => { + return Ok(res); + } + Ok(None) => { + bail!("Permanent error: lease could not be obtained, LSN is behind the GC cutoff"); + } + Err(e) => { + warn!("Failed to acquire lsn lease: {e} (attempt {attempts}"); + + thread::sleep(Duration::from_millis(retry_period_ms as u64)); + retry_period_ms *= 1.5; + retry_period_ms = retry_period_ms.min(MAX_RETRY_PERIOD_MS); + } + } + attempts += 1; + } +} + +/// Tries to acquire an LSN lease through PS page_service API. +fn try_acquire_lsn_lease( + tenant_id: TenantId, + timeline_id: TimelineId, + lsn: Lsn, + configs: &[postgres::Config], +) -> Result> { + fn get_valid_until( + config: &postgres::Config, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + lsn: Lsn, + ) -> Result> { + let mut client = config.connect(NoTls)?; + let cmd = format!("lease lsn {} {} {} ", tenant_shard_id, timeline_id, lsn); + let res = client.simple_query(&cmd)?; + let msg = match res.first() { + Some(msg) => msg, + None => bail!("empty response"), + }; + let row = match msg { + SimpleQueryMessage::Row(row) => row, + _ => bail!("error parsing lsn lease response"), + }; + + // Note: this will be None if a lease is explicitly not granted. + let valid_until_str = row.get("valid_until"); + + let valid_until = valid_until_str.map(|s| { + SystemTime::UNIX_EPOCH + .checked_add(Duration::from_millis(u128::from_str(s).unwrap() as u64)) + .expect("Time larger than max SystemTime could handle") + }); + Ok(valid_until) + } + + let shard_count = configs.len(); + + let valid_until = if shard_count > 1 { + configs + .iter() + .enumerate() + .map(|(shard_number, config)| { + let tenant_shard_id = TenantShardId { + tenant_id, + shard_count: ShardCount::new(shard_count as u8), + shard_number: ShardNumber(shard_number as u8), + }; + get_valid_until(config, tenant_shard_id, timeline_id, lsn) + }) + .collect::>>>()? + .into_iter() + .min() + .unwrap() + } else { + get_valid_until( + &configs[0], + TenantShardId::unsharded(tenant_id), + timeline_id, + lsn, + )? 
+ }; + + Ok(valid_until) +} diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index cb1ab70147..39c6a6fb74 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -753,16 +753,21 @@ impl PageServerHandler { } if request_lsn < **latest_gc_cutoff_lsn { - // Check explicitly for INVALID just to get a less scary error message if the - // request is obviously bogus - return Err(if request_lsn == Lsn::INVALID { - PageStreamError::BadRequest("invalid LSN(0) in request".into()) - } else { - PageStreamError::BadRequest(format!( + let gc_info = &timeline.gc_info.read().unwrap(); + if !gc_info.leases.contains_key(&request_lsn) { + // The requested LSN is below gc cutoff and is not guarded by a lease. + + // Check explicitly for INVALID just to get a less scary error message if the + // request is obviously bogus + return Err(if request_lsn == Lsn::INVALID { + PageStreamError::BadRequest("invalid LSN(0) in request".into()) + } else { + PageStreamError::BadRequest(format!( "tried to request a page version that was garbage collected. requested at {} gc cutoff {}", request_lsn, **latest_gc_cutoff_lsn ).into()) - }); + }); + } } // Wait for WAL up to 'not_modified_since' to arrive, if necessary @@ -789,6 +794,8 @@ impl PageServerHandler { } } + /// Handles the lsn lease request. + /// If a lease cannot be obtained, the client will receive NULL. #[instrument(skip_all, fields(shard_id, %lsn))] async fn handle_make_lsn_lease( &mut self, @@ -811,19 +818,25 @@ impl PageServerHandler { .await?; set_tracing_field_shard_id(&timeline); - let lease = timeline.make_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx)?; - let valid_until = lease - .valid_until - .duration_since(SystemTime::UNIX_EPOCH) - .map_err(|e| QueryError::Other(e.into()))?; + let lease = timeline + .make_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx) + .inspect_err(|e| { + warn!("{e}"); + }) + .ok(); + let valid_until_str = lease.map(|l| { + l.valid_until + .duration_since(SystemTime::UNIX_EPOCH) + .expect("valid_until is earlier than UNIX_EPOCH") + .as_millis() + .to_string() + }); + let bytes = valid_until_str.as_ref().map(|x| x.as_bytes()); pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col( b"valid_until", )]))? - .write_message_noflush(&BeMessage::DataRow(&[Some( - &valid_until.as_millis().to_be_bytes(), - )]))? 
- .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + .write_message_noflush(&BeMessage::DataRow(&[bytes]))?; Ok(()) } diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index ba8b91e84d..368f60127e 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -1,7 +1,15 @@ +import time + import pytest from fixtures.common_types import Lsn from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv +from fixtures.neon_fixtures import ( + Endpoint, + NeonEnv, + NeonEnvBuilder, + last_flush_lsn_upload, + tenant_get_shards, +) from fixtures.pageserver.utils import wait_for_last_record_lsn from fixtures.utils import query_scalar @@ -17,7 +25,12 @@ def test_readonly_node(neon_simple_env: NeonEnv): env.neon_cli.create_branch("test_readonly_node", "empty") endpoint_main = env.endpoints.create_start("test_readonly_node") - env.pageserver.allowed_errors.append(".*basebackup .* failed: invalid basebackup lsn.*") + env.pageserver.allowed_errors.extend( + [ + ".*basebackup .* failed: invalid basebackup lsn.*", + ".*page_service.*handle_make_lsn_lease.*.*tried to request a page version that was garbage collected", + ] + ) main_pg_conn = endpoint_main.connect() main_cur = main_pg_conn.cursor() @@ -105,6 +118,103 @@ def test_readonly_node(neon_simple_env: NeonEnv): ) +def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): + """ + Test static endpoint is protected from GC by acquiring and renewing lsn leases. + """ + + neon_env_builder.num_pageservers = 2 + # GC is manual triggered. + env = neon_env_builder.init_start( + initial_tenant_conf={ + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": f"{128 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{128 * 1024}", + # no PITR horizon, we specify the horizon when we request on-demand GC + "pitr_interval": "0s", + # disable background compaction and GC. We invoke it manually when we want it to happen. + "gc_period": "0s", + "compaction_period": "0s", + # create image layers eagerly, so that GC can remove some layers + "image_creation_threshold": "1", + "image_layer_creation_check_threshold": "0", + # Short lease length to fit test. + "lsn_lease_length": "3s", + }, + initial_tenant_shard_count=2, + ) + + ROW_COUNT = 500 + + def generate_updates_on_main( + env: NeonEnv, + ep_main: Endpoint, + data: int, + start=1, + end=ROW_COUNT, + ) -> Lsn: + """ + Generates some load on main branch that results in some uploads. 
+ """ + with ep_main.cursor() as cur: + cur.execute( + f"INSERT INTO t0 (v0, v1) SELECT g, '{data}' FROM generate_series({start}, {end}) g ON CONFLICT (v0) DO UPDATE SET v1 = EXCLUDED.v1" + ) + cur.execute("VACUUM t0") + last_flush_lsn = last_flush_lsn_upload( + env, ep_main, env.initial_tenant, env.initial_timeline + ) + return last_flush_lsn + + # Insert some records on main branch + with env.endpoints.create_start("main") as ep_main: + with ep_main.cursor() as cur: + cur.execute("CREATE TABLE t0(v0 int primary key, v1 text)") + lsn = None + for i in range(2): + lsn = generate_updates_on_main(env, ep_main, i) + + with env.endpoints.create_start( + branch_name="main", + endpoint_id="static", + lsn=lsn, + ) as ep_static: + with ep_static.cursor() as cur: + cur.execute("SELECT count(*) FROM t0") + assert cur.fetchone() == (ROW_COUNT,) + + time.sleep(3) + + generate_updates_on_main(env, ep_main, i, end=100) + + # Trigger GC + for shard, ps in tenant_get_shards(env, env.initial_tenant): + client = ps.http_client() + gc_result = client.timeline_gc(shard, env.initial_timeline, 0) + log.info(f"{gc_result=}") + + assert ( + gc_result["layers_removed"] == 0 + ), "No layers should be removed, old layers are guarded by leases." + + with ep_static.cursor() as cur: + cur.execute("SELECT count(*) FROM t0") + assert cur.fetchone() == (ROW_COUNT,) + + # Do some update so we can increment latest_gc_cutoff + generate_updates_on_main(env, ep_main, i, end=100) + + # Now trigger GC again, layers should be removed. + time.sleep(4) + for shard, ps in tenant_get_shards(env, env.initial_tenant): + client = ps.http_client() + gc_result = client.timeline_gc(shard, env.initial_timeline, 0) + log.info(f"{gc_result=}") + + assert gc_result["layers_removed"] > 0, "Old layers should be removed after leases expired." + + # Similar test, but with more data, and we force checkpoints def test_timetravel(neon_simple_env: NeonEnv): env = neon_simple_env From cfa45ff5eee33a46f54ab2571fddf5e47925f363 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 29 Aug 2024 07:45:33 +0300 Subject: [PATCH 1520/1571] Undo walloging replorgin file on checkpoint (#8794) ## Problem See #8620 ## Summary of changes Remove walloping of replorigin file because it is reconstructed by PS ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index b6910406e2..48388a5b59 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit b6910406e2d05a2c94baa2e530ec882733047759 +Subproject commit 48388a5b597c81c09e28c016650a7156b48717a1 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 76063bff63..8aa1ded772 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 76063bff638ccce7afa99fc9037ac51338b9823d +Subproject commit 8aa1ded7726d416ac8e02600aad387a353478fc7 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 8efa089aa7..95132feffe 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 8efa089aa7786381543a4f9efc69b92d43eab8c0 +Subproject commit 95132feffe277ce84309d93a42e9aadfd2cb0437 diff --git a/vendor/revisions.json b/vendor/revisions.json index 50cc99c2f1..319e648488 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,14 +1,14 @@ { "v16": [ "16.4", - "8efa089aa7786381543a4f9efc69b92d43eab8c0" + "95132feffe277ce84309d93a42e9aadfd2cb0437" ], "v15": [ "15.8", - "76063bff638ccce7afa99fc9037ac51338b9823d" + "8aa1ded7726d416ac8e02600aad387a353478fc7" ], "v14": [ "14.13", - "b6910406e2d05a2c94baa2e530ec882733047759" + "48388a5b597c81c09e28c016650a7156b48717a1" ] } From c2f8fdccd79b89e14dcef072d6169691f8d49f5a Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 29 Aug 2024 13:06:00 +0200 Subject: [PATCH 1521/1571] ingest: rate-limited warning if WAL commit timestamps lags for > wait_lsn_timeout (#8839) refs https://github.com/neondatabase/cloud/issues/13750 The logging in this commit will make it easier to detect lagging ingest. We're trusting compute timestamps --- ideally we'd use SK timestmaps instead. But trusting the compute timestamp is ok for now. 
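
For orientation, a minimal, self-contained sketch of the dropped-call accounting this patch adds to `RateLimit` (simplified, `std`-only; the names mirror but are not the exact API in `libs/utils/src/rate_limit.rs`):

```rust
// Sketch only: simplified stand-in for the `call2` + dropped-call counter
// added to the rate limiter; structure is illustrative, not the exact code.
use std::time::{Duration, Instant};

struct RateLimit {
    last: Option<Instant>,
    interval: Duration,
    dropped: u64, // calls suppressed since the last emitted message
}

impl RateLimit {
    fn new(interval: Duration) -> Self {
        Self { last: None, interval, dropped: 0 }
    }

    /// Invoke `f` at most once per `interval`, telling it how many
    /// invocations were dropped since it last ran.
    fn call2(&mut self, f: impl FnOnce(u64)) {
        let now = Instant::now();
        match self.last {
            Some(last) if now - last <= self.interval => self.dropped += 1,
            _ => {
                self.last = Some(now);
                let dropped = std::mem::take(&mut self.dropped);
                f(dropped);
            }
        }
    }
}

fn main() {
    let mut limit = RateLimit::new(Duration::from_secs(10));
    for _ in 0..3 {
        limit.call2(|dropped| {
            eprintln!("ingest lagging behind wait_lsn_timeout ({dropped} dropped calls)");
        });
    }
    // prints once; the two follow-up calls are counted as dropped
}
```

The warning path in `walingest.rs` uses this same pattern via `call2`, so a sustained lag produces one log line per interval together with the number of suppressed calls.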
--- libs/postgres_ffi/src/lib.rs | 2 +- libs/postgres_ffi/src/xlog_utils.rs | 14 ++--- libs/utils/src/rate_limit.rs | 18 ++++++- pageserver/src/http/routes.rs | 5 +- pageserver/src/tenant/timeline.rs | 2 +- pageserver/src/walingest.rs | 66 +++++++++++++++++++++++ test_runner/regress/test_compatibility.py | 8 +++ test_runner/regress/test_wal_receiver.py | 6 +++ 8 files changed, 111 insertions(+), 10 deletions(-) diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 0940ad207f..9acb105e9b 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -136,9 +136,9 @@ pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16; // Export some version independent functions that are used outside of this mod pub use v14::xlog_utils::encode_logical_message; -pub use v14::xlog_utils::from_pg_timestamp; pub use v14::xlog_utils::get_current_timestamp; pub use v14::xlog_utils::to_pg_timestamp; +pub use v14::xlog_utils::try_from_pg_timestamp; pub use v14::xlog_utils::XLogFileName; pub use v14::bindings::DBState_DB_SHUTDOWNED; diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 9fe7e8198b..0cfd56962e 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -135,6 +135,8 @@ pub fn get_current_timestamp() -> TimestampTz { mod timestamp_conversions { use std::time::Duration; + use anyhow::Context; + use super::*; const UNIX_EPOCH_JDATE: u64 = 2440588; // == date2j(1970, 1, 1) @@ -154,18 +156,18 @@ mod timestamp_conversions { } } - pub fn from_pg_timestamp(time: TimestampTz) -> SystemTime { + pub fn try_from_pg_timestamp(time: TimestampTz) -> anyhow::Result { let time: u64 = time .try_into() - .expect("timestamp before millenium (postgres epoch)"); + .context("timestamp before millenium (postgres epoch)")?; let since_unix_epoch = time + SECS_DIFF_UNIX_TO_POSTGRES_EPOCH * USECS_PER_SEC; SystemTime::UNIX_EPOCH .checked_add(Duration::from_micros(since_unix_epoch)) - .expect("SystemTime overflow") + .context("SystemTime overflow") } } -pub use timestamp_conversions::{from_pg_timestamp, to_pg_timestamp}; +pub use timestamp_conversions::{to_pg_timestamp, try_from_pg_timestamp}; // Returns (aligned) end_lsn of the last record in data_dir with WAL segments. 
// start_lsn must point to some previously known record boundary (beginning of @@ -545,14 +547,14 @@ mod tests { #[test] fn test_ts_conversion() { let now = SystemTime::now(); - let round_trip = from_pg_timestamp(to_pg_timestamp(now)); + let round_trip = try_from_pg_timestamp(to_pg_timestamp(now)).unwrap(); let now_since = now.duration_since(SystemTime::UNIX_EPOCH).unwrap(); let round_trip_since = round_trip.duration_since(SystemTime::UNIX_EPOCH).unwrap(); assert_eq!(now_since.as_micros(), round_trip_since.as_micros()); let now_pg = get_current_timestamp(); - let round_trip_pg = to_pg_timestamp(from_pg_timestamp(now_pg)); + let round_trip_pg = to_pg_timestamp(try_from_pg_timestamp(now_pg).unwrap()); assert_eq!(now_pg, round_trip_pg); } diff --git a/libs/utils/src/rate_limit.rs b/libs/utils/src/rate_limit.rs index 557955bb88..f3f8f219e3 100644 --- a/libs/utils/src/rate_limit.rs +++ b/libs/utils/src/rate_limit.rs @@ -5,6 +5,15 @@ use std::time::{Duration, Instant}; pub struct RateLimit { last: Option, interval: Duration, + dropped: u64, +} + +pub struct RateLimitStats(u64); + +impl std::fmt::Display for RateLimitStats { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "{} dropped calls", self.0) + } } impl RateLimit { @@ -12,20 +21,27 @@ impl RateLimit { Self { last: None, interval, + dropped: 0, } } /// Call `f` if the rate limit allows. /// Don't call it otherwise. pub fn call(&mut self, f: F) { + self.call2(|_| f()) + } + + pub fn call2(&mut self, f: F) { let now = Instant::now(); match self.last { Some(last) if now - last <= self.interval => { // ratelimit + self.dropped += 1; } _ => { self.last = Some(now); - f(); + f(RateLimitStats(self.dropped)); + self.dropped = 0; } } } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index a126136d20..cb7c2b60ef 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -871,7 +871,10 @@ async fn get_timestamp_of_lsn_handler( match result { Some(time) => { - let time = format_rfc3339(postgres_ffi::from_pg_timestamp(time)).to_string(); + let time = format_rfc3339( + postgres_ffi::try_from_pg_timestamp(time).map_err(ApiError::InternalServerError)?, + ) + .to_string(); json_response(StatusCode::OK, time) } None => Err(ApiError::NotFound( diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index e1dd80fbf2..8096a0d18c 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -218,7 +218,7 @@ pub(crate) struct RelSizeCache { } pub struct Timeline { - conf: &'static PageServerConf, + pub(crate) conf: &'static PageServerConf, tenant_conf: Arc>, myself: Weak, diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 8425528740..8ccd20adb1 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -21,19 +21,25 @@ //! redo Postgres process, but some records it can handle directly with //! bespoken Rust code. 
+use std::time::Duration; +use std::time::SystemTime; + use pageserver_api::shard::ShardIdentity; use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes; use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment; +use postgres_ffi::TimestampTz; use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn}; use anyhow::{bail, Context, Result}; use bytes::{Buf, Bytes, BytesMut}; use tracing::*; use utils::failpoint_support; +use utils::rate_limit::RateLimit; use crate::context::RequestContext; use crate::metrics::WAL_INGEST; use crate::pgdatadir_mapping::{DatadirModification, Version}; +use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::PageReconstructError; use crate::tenant::Timeline; use crate::walrecord::*; @@ -53,6 +59,13 @@ pub struct WalIngest { shard: ShardIdentity, checkpoint: CheckPoint, checkpoint_modified: bool, + warn_ingest_lag: WarnIngestLag, +} + +struct WarnIngestLag { + lag_msg_ratelimit: RateLimit, + future_lsn_msg_ratelimit: RateLimit, + timestamp_invalid_msg_ratelimit: RateLimit, } impl WalIngest { @@ -71,6 +84,11 @@ impl WalIngest { shard: *timeline.get_shard_identity(), checkpoint, checkpoint_modified: false, + warn_ingest_lag: WarnIngestLag { + lag_msg_ratelimit: RateLimit::new(std::time::Duration::from_secs(10)), + future_lsn_msg_ratelimit: RateLimit::new(std::time::Duration::from_secs(10)), + timestamp_invalid_msg_ratelimit: RateLimit::new(std::time::Duration::from_secs(10)), + }, }) } @@ -1212,6 +1230,48 @@ impl WalIngest { Ok(()) } + fn warn_on_ingest_lag( + &mut self, + conf: &crate::config::PageServerConf, + wal_timestmap: TimestampTz, + ) { + debug_assert_current_span_has_tenant_and_timeline_id(); + let now = SystemTime::now(); + let rate_limits = &mut self.warn_ingest_lag; + match try_from_pg_timestamp(wal_timestmap) { + Ok(ts) => { + match now.duration_since(ts) { + Ok(lag) => { + if lag > conf.wait_lsn_timeout { + rate_limits.lag_msg_ratelimit.call2(|rate_limit_stats| { + let lag = humantime::format_duration(lag); + warn!(%rate_limit_stats, %lag, "ingesting record with timestamp lagging more than wait_lsn_timeout"); + }) + } + }, + Err(e) => { + let delta_t = e.duration(); + // determined by prod victoriametrics query: 1000 * (timestamp(node_time_seconds{neon_service="pageserver"}) - node_time_seconds) + // => https://www.robustperception.io/time-metric-from-the-node-exporter/ + const IGNORED_DRIFT: Duration = Duration::from_millis(100); + if delta_t > IGNORED_DRIFT { + let delta_t = humantime::format_duration(delta_t); + rate_limits.future_lsn_msg_ratelimit.call2(|rate_limit_stats| { + warn!(%rate_limit_stats, %delta_t, "ingesting record with timestamp from future"); + }) + } + } + }; + + } + Err(error) => { + rate_limits.timestamp_invalid_msg_ratelimit.call2(|rate_limit_stats| { + warn!(%rate_limit_stats, %error, "ingesting record with invalid timestamp, cannot calculate lag and will fail find-lsn-for-timestamp type queries"); + }) + } + } + } + /// Subroutine of ingest_record(), to handle an XLOG_XACT_* records. 
/// async fn ingest_xact_record( @@ -1228,6 +1288,8 @@ impl WalIngest { let mut rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; let mut page_xids: Vec = vec![parsed.xid]; + self.warn_on_ingest_lag(modification.tline.conf, parsed.xact_time); + for subxact in &parsed.subxacts { let subxact_pageno = subxact / pg_constants::CLOG_XACTS_PER_PAGE; if subxact_pageno != pageno { @@ -2303,6 +2365,9 @@ mod tests { let _endpoint = Lsn::from_hex("1FFFF98").unwrap(); let harness = TenantHarness::create("test_ingest_real_wal").await.unwrap(); + let span = harness + .span() + .in_scope(|| info_span!("timeline_span", timeline_id=%TIMELINE_ID)); let (tenant, ctx) = harness.load().await; let remote_initdb_path = @@ -2354,6 +2419,7 @@ mod tests { while let Some((lsn, recdata)) = decoder.poll_decode().unwrap() { walingest .ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx) + .instrument(span.clone()) .await .unwrap(); } diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index c361efe90a..cd3f405a86 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -173,6 +173,11 @@ def test_backward_compatibility( try: neon_env_builder.num_safekeepers = 3 env = neon_env_builder.from_repo_dir(compatibility_snapshot_dir / "repo") + # check_neon_works does recovery from WAL => the compatibility snapshot's WAL is old => will log this warning + ingest_lag_log_line = ( + ".*ingesting record with timestamp lagging more than wait_lsn_timeout.*" + ) + env.pageserver.allowed_errors.append(ingest_lag_log_line) neon_env_builder.start() check_neon_works( @@ -181,6 +186,9 @@ def test_backward_compatibility( sql_dump_path=compatibility_snapshot_dir / "dump.sql", repo_dir=env.repo_dir, ) + + env.pageserver.assert_log_contains(ingest_lag_log_line) + except Exception: if breaking_changes_allowed: pytest.xfail( diff --git a/test_runner/regress/test_wal_receiver.py b/test_runner/regress/test_wal_receiver.py index 6582b34218..229d3efd8e 100644 --- a/test_runner/regress/test_wal_receiver.py +++ b/test_runner/regress/test_wal_receiver.py @@ -62,6 +62,12 @@ def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuil elements_to_insert = 1_000_000 expected_timeout_error = f"Timed out while waiting for WAL record at LSN {future_lsn} to arrive" env.pageserver.allowed_errors.append(f".*{expected_timeout_error}.*") + # we configure wait_lsn_timeout to a shorter value than the lagging_wal_timeout / walreceiver_connect_timeout + # => after we run into a timeout and reconnect to a different SK, more time than wait_lsn_timeout has passed + # ==> we log this error + env.pageserver.allowed_errors.append( + ".*ingesting record with timestamp lagging more than wait_lsn_timeout.*" + ) insert_test_elements(env, tenant_id, start=0, count=elements_to_insert) From a644f01b6af2d414f877a78bddb928f0b033762d Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 29 Aug 2024 12:26:52 +0100 Subject: [PATCH 1522/1571] proxy+pageserver: shared leaky bucket impl (#8539) In proxy I switched to a leaky-bucket impl using the GCRA algorithm. I figured I could share the code with pageserver and remove the leaky_bucket crate dependency with some very basic tokio timers and queues for fairness. The underlying algorithm should be fairly clear how it works from the comments I have left in the code. 
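
For readers skimming the diff, here is a condensed, `std`-only sketch of the GCRA idea: the only state is a single `empty_at` timestamp, and the bucket drains implicitly as wall-clock time advances, so no background job is needed. This mirrors the comments in `libs/utils/src/leaky_bucket.rs` but is not the exact code.

```rust
// Condensed GCRA / leaky-bucket sketch (illustrative).
use std::time::{Duration, Instant};

struct LeakyBucket {
    cost: Duration,         // time cost of one token (inverse of steady rps)
    bucket_width: Duration, // burst capacity expressed as time
    empty_at: Instant,      // the moment the bucket would be empty
}

impl LeakyBucket {
    fn new(rps: f64, bucket_size: f64) -> Self {
        let cost = Duration::from_secs_f64(rps.recip());
        Self {
            cost,
            bucket_width: cost.mul_f64(bucket_size),
            empty_at: Instant::now(),
        }
    }

    /// Try to add `n` tokens. On success the bucket "fills" by advancing
    /// `empty_at`; on failure, return when space will be available again.
    fn add_tokens(&mut self, n: f64) -> Result<(), Instant> {
        let now = Instant::now();
        let empty_at = self.empty_at.max(now); // never track negative fill
        let new_empty_at = empty_at + self.cost.mul_f64(n);
        match new_empty_at.checked_sub(self.bucket_width) {
            Some(allow_at) if now < allow_at => Err(allow_at), // bucket full
            _ => {
                self.empty_at = new_empty_at;
                Ok(())
            }
        }
    }
}

fn main() {
    // 100 requests/second steady rate, bursts of up to 100 requests.
    let mut bucket = LeakyBucket::new(100.0, 100.0);
    assert!(bucket.add_tokens(1.0).is_ok());
}
```

On the pageserver side, the existing `ThrottleConfig` is mapped onto this model by deriving the steady rate as `refill_amount / refill_interval` and using `max` as the bucket size (see the `tenant/throttle.rs` hunk below).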
--- In benchmarking pageserver, @problame found that the new implementation fixes a getpage throughput discontinuity in pageserver under the `pagebench get-page-latest-lsn` benchmark with the clickbench dataset (`test_perf_olap.py`). The discontinuity is that for any of `--num-clients={2,3,4}`, getpage throughput remains 10k. With `--num-clients=5` and greater, getpage throughput then jumps to the configured 20k rate limit. With the changes in this PR, the discontinuity is gone, and we scale throughput linearly to `--num-clients` until the configured rate limit. More context in https://github.com/neondatabase/cloud/issues/16886#issuecomment-2315257641. closes https://github.com/neondatabase/cloud/issues/16886 --------- Co-authored-by: Joonas Koivunen Co-authored-by: Christian Schwarz --- Cargo.lock | 13 - Cargo.toml | 1 - libs/pageserver_api/src/models.rs | 12 +- libs/utils/Cargo.toml | 1 - libs/utils/src/leaky_bucket.rs | 280 ++++++++++++++++++ libs/utils/src/lib.rs | 1 + pageserver/Cargo.toml | 1 - pageserver/src/tenant/throttle.rs | 47 ++- proxy/src/rate_limiter.rs | 4 +- proxy/src/rate_limiter/leaky_bucket.rs | 92 ++---- .../regress/test_attach_tenant_config.py | 1 - .../test_pageserver_getpage_throttle.py | 56 ++++ 12 files changed, 395 insertions(+), 114 deletions(-) create mode 100644 libs/utils/src/leaky_bucket.rs diff --git a/Cargo.lock b/Cargo.lock index c514625518..0c246bd258 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2950,17 +2950,6 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" -[[package]] -name = "leaky-bucket" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8eb491abd89e9794d50f93c8db610a29509123e3fbbc9c8c67a528e9391cd853" -dependencies = [ - "parking_lot 0.12.1", - "tokio", - "tracing", -] - [[package]] name = "libc" version = "0.2.150" @@ -3714,7 +3703,6 @@ dependencies = [ "humantime-serde", "hyper 0.14.26", "itertools 0.10.5", - "leaky-bucket", "md5", "metrics", "nix 0.27.1", @@ -6983,7 +6971,6 @@ dependencies = [ "humantime", "hyper 0.14.26", "jsonwebtoken", - "leaky-bucket", "metrics", "nix 0.27.1", "once_cell", diff --git a/Cargo.toml b/Cargo.toml index 7bd9a26394..fa949f9757 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -108,7 +108,6 @@ ipnet = "2.9.0" itertools = "0.10" jsonwebtoken = "9" lasso = "0.7" -leaky-bucket = "1.0.1" libc = "0.2" md5 = "0.7.0" measured = { version = "0.0.22", features=["lasso"] } diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index d39ac75707..1d896863df 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -7,7 +7,7 @@ pub use utilization::PageserverUtilization; use std::{ collections::HashMap, io::{BufRead, Read}, - num::{NonZeroU64, NonZeroUsize}, + num::{NonZeroU32, NonZeroU64, NonZeroUsize}, str::FromStr, sync::atomic::AtomicUsize, time::{Duration, SystemTime}, @@ -486,12 +486,11 @@ pub struct EvictionPolicyLayerAccessThreshold { #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] pub struct ThrottleConfig { pub task_kinds: Vec, // TaskKind - pub initial: usize, + pub initial: u32, #[serde(with = "humantime_serde")] pub refill_interval: Duration, - pub refill_amount: NonZeroUsize, - pub max: usize, - pub fair: bool, + pub refill_amount: NonZeroU32, + pub max: u32, } impl ThrottleConfig { @@ -501,9 +500,8 @@ impl ThrottleConfig { // other values don't matter with emtpy `task_kinds`. 
initial: 0, refill_interval: Duration::from_millis(1), - refill_amount: NonZeroUsize::new(1).unwrap(), + refill_amount: NonZeroU32::new(1).unwrap(), max: 1, - fair: true, } } /// The requests per second allowed by the given config. diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 6e593eeac1..777fb95ece 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -26,7 +26,6 @@ hyper = { workspace = true, features = ["full"] } fail.workspace = true futures = { workspace = true} jsonwebtoken.workspace = true -leaky-bucket.workspace = true nix.workspace = true once_cell.workspace = true pin-project-lite.workspace = true diff --git a/libs/utils/src/leaky_bucket.rs b/libs/utils/src/leaky_bucket.rs new file mode 100644 index 0000000000..a120dc0ac5 --- /dev/null +++ b/libs/utils/src/leaky_bucket.rs @@ -0,0 +1,280 @@ +//! This module implements the Generic Cell Rate Algorithm for a simplified +//! version of the Leaky Bucket rate limiting system. +//! +//! # Leaky Bucket +//! +//! If the bucket is full, no new requests are allowed and are throttled/errored. +//! If the bucket is partially full/empty, new requests are added to the bucket in +//! terms of "tokens". +//! +//! Over time, tokens are removed from the bucket, naturally allowing new requests at a steady rate. +//! +//! The bucket size tunes the burst support. The drain rate tunes the steady-rate requests per second. +//! +//! # [GCRA](https://en.wikipedia.org/wiki/Generic_cell_rate_algorithm) +//! +//! GCRA is a continuous rate leaky-bucket impl that stores minimal state and requires +//! no background jobs to drain tokens, as the design utilises timestamps to drain automatically over time. +//! +//! We store an "empty_at" timestamp as the only state. As time progresses, we will naturally approach +//! the empty state. The full-bucket state is calculated from `empty_at - config.bucket_width`. +//! +//! Another explaination can be found here: + +use std::{sync::Mutex, time::Duration}; + +use tokio::{sync::Notify, time::Instant}; + +pub struct LeakyBucketConfig { + /// This is the "time cost" of a single request unit. + /// Should loosely represent how long it takes to handle a request unit in active resource time. + /// Loosely speaking this is the inverse of the steady-rate requests-per-second + pub cost: Duration, + + /// total size of the bucket + pub bucket_width: Duration, +} + +impl LeakyBucketConfig { + pub fn new(rps: f64, bucket_size: f64) -> Self { + let cost = Duration::from_secs_f64(rps.recip()); + let bucket_width = cost.mul_f64(bucket_size); + Self { cost, bucket_width } + } +} + +pub struct LeakyBucketState { + /// Bucket is represented by `allow_at..empty_at` where `allow_at = empty_at - config.bucket_width`. + /// + /// At any given time, `empty_at - now` represents the number of tokens in the bucket, multiplied by the "time_cost". + /// Adding `n` tokens to the bucket is done by moving `empty_at` forward by `n * config.time_cost`. + /// If `now < allow_at`, the bucket is considered filled and cannot accept any more tokens. + /// Draining the bucket will happen naturally as `now` moves forward. + /// + /// Let `n` be some "time cost" for the request, + /// If now is after empty_at, the bucket is empty and the empty_at is reset to now, + /// If now is within the `bucket window + n`, we are within time budget. + /// If now is before the `bucket window + n`, we have run out of budget. + /// + /// This is inspired by the generic cell rate algorithm (GCRA) and works + /// exactly the same as a leaky-bucket. 
+ pub empty_at: Instant, +} + +impl LeakyBucketState { + pub fn with_initial_tokens(config: &LeakyBucketConfig, initial_tokens: f64) -> Self { + LeakyBucketState { + empty_at: Instant::now() + config.cost.mul_f64(initial_tokens), + } + } + + pub fn bucket_is_empty(&self, now: Instant) -> bool { + // if self.end is after now, the bucket is not empty + self.empty_at <= now + } + + /// Immediately adds tokens to the bucket, if there is space. + /// + /// In a scenario where you are waiting for available rate, + /// rather than just erroring immediately, `started` corresponds to when this waiting started. + /// + /// `n` is the number of tokens that will be filled in the bucket. + /// + /// # Errors + /// + /// If there is not enough space, no tokens are added. Instead, an error is returned with the time when + /// there will be space again. + pub fn add_tokens( + &mut self, + config: &LeakyBucketConfig, + started: Instant, + n: f64, + ) -> Result<(), Instant> { + let now = Instant::now(); + + // invariant: started <= now + debug_assert!(started <= now); + + // If the bucket was empty when we started our search, + // we should update the `empty_at` value accordingly. + // this prevents us from having negative tokens in the bucket. + let mut empty_at = self.empty_at; + if empty_at < started { + empty_at = started; + } + + let n = config.cost.mul_f64(n); + let new_empty_at = empty_at + n; + let allow_at = new_empty_at.checked_sub(config.bucket_width); + + // empty_at + // allow_at | new_empty_at + // / | / + // -------o-[---------o-|--]--------- + // now1 ^ now2 ^ + // + // at now1, the bucket would be completely filled if we add n tokens. + // at now2, the bucket would be partially filled if we add n tokens. + + match allow_at { + Some(allow_at) if now < allow_at => Err(allow_at), + _ => { + self.empty_at = new_empty_at; + Ok(()) + } + } + } +} + +pub struct RateLimiter { + pub config: LeakyBucketConfig, + pub state: Mutex, + /// a queue to provide this fair ordering. + pub queue: Notify, +} + +struct Requeue<'a>(&'a Notify); + +impl Drop for Requeue<'_> { + fn drop(&mut self) { + self.0.notify_one(); + } +} + +impl RateLimiter { + pub fn with_initial_tokens(config: LeakyBucketConfig, initial_tokens: f64) -> Self { + RateLimiter { + state: Mutex::new(LeakyBucketState::with_initial_tokens( + &config, + initial_tokens, + )), + config, + queue: { + let queue = Notify::new(); + queue.notify_one(); + queue + }, + } + } + + pub fn steady_rps(&self) -> f64 { + self.config.cost.as_secs_f64().recip() + } + + /// returns true if we did throttle + pub async fn acquire(&self, count: usize) -> bool { + let mut throttled = false; + + let start = tokio::time::Instant::now(); + + // wait until we are the first in the queue + let mut notified = std::pin::pin!(self.queue.notified()); + if !notified.as_mut().enable() { + throttled = true; + notified.await; + } + + // notify the next waiter in the queue when we are done. 
+ let _guard = Requeue(&self.queue); + + loop { + let res = self + .state + .lock() + .unwrap() + .add_tokens(&self.config, start, count as f64); + match res { + Ok(()) => return throttled, + Err(ready_at) => { + throttled = true; + tokio::time::sleep_until(ready_at).await; + } + } + } + } +} + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use tokio::time::Instant; + + use super::{LeakyBucketConfig, LeakyBucketState}; + + #[tokio::test(start_paused = true)] + async fn check() { + let config = LeakyBucketConfig { + // average 100rps + cost: Duration::from_millis(10), + // burst up to 100 requests + bucket_width: Duration::from_millis(1000), + }; + + let mut state = LeakyBucketState { + empty_at: Instant::now(), + }; + + // supports burst + { + // should work for 100 requests this instant + for _ in 0..100 { + state.add_tokens(&config, Instant::now(), 1.0).unwrap(); + } + let ready = state.add_tokens(&config, Instant::now(), 1.0).unwrap_err(); + assert_eq!(ready - Instant::now(), Duration::from_millis(10)); + } + + // doesn't overfill + { + // after 1s we should have an empty bucket again. + tokio::time::advance(Duration::from_secs(1)).await; + assert!(state.bucket_is_empty(Instant::now())); + + // after 1s more, we should not over count the tokens and allow more than 200 requests. + tokio::time::advance(Duration::from_secs(1)).await; + for _ in 0..100 { + state.add_tokens(&config, Instant::now(), 1.0).unwrap(); + } + let ready = state.add_tokens(&config, Instant::now(), 1.0).unwrap_err(); + assert_eq!(ready - Instant::now(), Duration::from_millis(10)); + } + + // supports sustained rate over a long period + { + tokio::time::advance(Duration::from_secs(1)).await; + + // should sustain 100rps + for _ in 0..2000 { + tokio::time::advance(Duration::from_millis(10)).await; + state.add_tokens(&config, Instant::now(), 1.0).unwrap(); + } + } + + // supports requesting more tokens than can be stored in the bucket + // we just wait a little bit longer upfront. + { + // start the bucket completely empty + tokio::time::advance(Duration::from_secs(5)).await; + assert!(state.bucket_is_empty(Instant::now())); + + // requesting 200 tokens of space should take 200*cost = 2s + // but we already have 1s available, so we wait 1s from start. + let start = Instant::now(); + + let ready = state.add_tokens(&config, start, 200.0).unwrap_err(); + assert_eq!(ready - Instant::now(), Duration::from_secs(1)); + + tokio::time::advance(Duration::from_millis(500)).await; + let ready = state.add_tokens(&config, start, 200.0).unwrap_err(); + assert_eq!(ready - Instant::now(), Duration::from_millis(500)); + + tokio::time::advance(Duration::from_millis(500)).await; + state.add_tokens(&config, start, 200.0).unwrap(); + + // bucket should be completely full now + let ready = state.add_tokens(&config, Instant::now(), 1.0).unwrap_err(); + assert_eq!(ready - Instant::now(), Duration::from_millis(10)); + } + } +} diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index f4fc0ba57b..218dd468b1 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -71,6 +71,7 @@ pub mod postgres_client; pub mod tracing_span_assert; +pub mod leaky_bucket; pub mod rate_limit; /// Simple once-barrier and a guard which keeps barrier awaiting. 
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 85c5e24afc..9c02ce3fbc 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -37,7 +37,6 @@ humantime.workspace = true humantime-serde.workspace = true hyper.workspace = true itertools.workspace = true -leaky-bucket.workspace = true md5.workspace = true nix.workspace = true # hack to get the number of worker threads tokio uses diff --git a/pageserver/src/tenant/throttle.rs b/pageserver/src/tenant/throttle.rs index f3f3d5e3ae..f222e708e1 100644 --- a/pageserver/src/tenant/throttle.rs +++ b/pageserver/src/tenant/throttle.rs @@ -10,6 +10,7 @@ use std::{ use arc_swap::ArcSwap; use enumset::EnumSet; use tracing::{error, warn}; +use utils::leaky_bucket::{LeakyBucketConfig, RateLimiter}; use crate::{context::RequestContext, task_mgr::TaskKind}; @@ -33,8 +34,7 @@ pub struct Throttle { pub struct Inner { task_kinds: EnumSet, - rate_limiter: Arc, - config: Config, + rate_limiter: Arc, } pub type Config = pageserver_api::models::ThrottleConfig; @@ -77,8 +77,7 @@ where refill_interval, refill_amount, max, - fair, - } = &config; + } = config; let task_kinds: EnumSet = task_kinds .iter() .filter_map(|s| match TaskKind::from_str(s) { @@ -93,18 +92,21 @@ where } }) .collect(); + + // steady rate, we expect `refill_amount` requests per `refill_interval`. + // dividing gives us the rps. + let rps = f64::from(refill_amount.get()) / refill_interval.as_secs_f64(); + let config = LeakyBucketConfig::new(rps, f64::from(max)); + + // initial tracks how many tokens are available to put in the bucket + // we want how many tokens are currently in the bucket + let initial_tokens = max - initial; + + let rate_limiter = RateLimiter::with_initial_tokens(config, f64::from(initial_tokens)); + Inner { task_kinds, - rate_limiter: Arc::new( - leaky_bucket::RateLimiter::builder() - .initial(*initial) - .interval(*refill_interval) - .refill(refill_amount.get()) - .max(*max) - .fair(*fair) - .build(), - ), - config, + rate_limiter: Arc::new(rate_limiter), } } pub fn reconfigure(&self, config: Config) { @@ -127,7 +129,7 @@ where /// See [`Config::steady_rps`]. 
pub fn steady_rps(&self) -> f64 { - self.inner.load().config.steady_rps() + self.inner.load().rate_limiter.steady_rps() } pub async fn throttle(&self, ctx: &RequestContext, key_count: usize) -> Option { @@ -136,18 +138,9 @@ where return None; }; let start = std::time::Instant::now(); - let mut did_throttle = false; - let acquire = inner.rate_limiter.acquire(key_count); - // turn off runtime-induced preemption (aka coop) so our `did_throttle` is accurate - let acquire = tokio::task::unconstrained(acquire); - let mut acquire = std::pin::pin!(acquire); - std::future::poll_fn(|cx| { - use std::future::Future; - let poll = acquire.as_mut().poll(cx); - did_throttle = did_throttle || poll.is_pending(); - poll - }) - .await; + + let did_throttle = inner.rate_limiter.acquire(key_count).await; + self.count_accounted.fetch_add(1, Ordering::Relaxed); if did_throttle { self.count_throttled.fetch_add(1, Ordering::Relaxed); diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs index e5f5867998..6e38f89458 100644 --- a/proxy/src/rate_limiter.rs +++ b/proxy/src/rate_limiter.rs @@ -10,7 +10,5 @@ pub(crate) use limit_algorithm::{ }; pub(crate) use limiter::GlobalRateLimiter; -pub use leaky_bucket::{ - EndpointRateLimiter, LeakyBucketConfig, LeakyBucketRateLimiter, LeakyBucketState, -}; +pub use leaky_bucket::{EndpointRateLimiter, LeakyBucketConfig, LeakyBucketRateLimiter}; pub use limiter::{BucketRateLimiter, RateBucketInfo, WakeComputeRateLimiter}; diff --git a/proxy/src/rate_limiter/leaky_bucket.rs b/proxy/src/rate_limiter/leaky_bucket.rs index fa8cb75256..bf4d85f2e4 100644 --- a/proxy/src/rate_limiter/leaky_bucket.rs +++ b/proxy/src/rate_limiter/leaky_bucket.rs @@ -8,6 +8,7 @@ use dashmap::DashMap; use rand::{thread_rng, Rng}; use tokio::time::Instant; use tracing::info; +use utils::leaky_bucket::LeakyBucketState; use crate::intern::EndpointIdInt; @@ -16,7 +17,7 @@ pub type EndpointRateLimiter = LeakyBucketRateLimiter; pub struct LeakyBucketRateLimiter { map: DashMap, - config: LeakyBucketConfig, + config: utils::leaky_bucket::LeakyBucketConfig, access_count: AtomicUsize, } @@ -29,7 +30,7 @@ impl LeakyBucketRateLimiter { pub fn new_with_shards(config: LeakyBucketConfig, shards: usize) -> Self { Self { map: DashMap::with_hasher_and_shard_amount(RandomState::new(), shards), - config, + config: config.into(), access_count: AtomicUsize::new(0), } } @@ -42,12 +43,12 @@ impl LeakyBucketRateLimiter { self.do_gc(now); } - let mut entry = self.map.entry(key).or_insert_with(|| LeakyBucketState { - time: now, - filled: 0.0, - }); + let mut entry = self + .map + .entry(key) + .or_insert_with(|| LeakyBucketState { empty_at: now }); - entry.check(&self.config, now, n as f64) + entry.add_tokens(&self.config, now, n as f64).is_ok() } fn do_gc(&self, now: Instant) { @@ -59,7 +60,7 @@ impl LeakyBucketRateLimiter { let shard = thread_rng().gen_range(0..n); self.map.shards()[shard] .write() - .retain(|_, value| !value.get_mut().update(&self.config, now)); + .retain(|_, value| !value.get().bucket_is_empty(now)); } } @@ -68,11 +69,6 @@ pub struct LeakyBucketConfig { pub max: f64, } -pub struct LeakyBucketState { - filled: f64, - time: Instant, -} - #[cfg(test)] impl LeakyBucketConfig { pub(crate) fn new(rps: f64, max: f64) -> Self { @@ -82,40 +78,9 @@ impl LeakyBucketConfig { } } -impl LeakyBucketState { - pub(crate) fn new() -> Self { - Self { - filled: 0.0, - time: Instant::now(), - } - } - - /// updates the timer and returns true if the bucket is empty - fn update(&mut self, info: &LeakyBucketConfig, now: 
Instant) -> bool { - let drain = now.duration_since(self.time); - let drain = drain.as_secs_f64() * info.rps; - - self.filled = (self.filled - drain).clamp(0.0, info.max); - self.time = now; - - self.filled == 0.0 - } - - pub(crate) fn check(&mut self, info: &LeakyBucketConfig, now: Instant, n: f64) -> bool { - self.update(info, now); - - if self.filled + n > info.max { - return false; - } - self.filled += n; - - true - } -} - -impl Default for LeakyBucketState { - fn default() -> Self { - Self::new() +impl From for utils::leaky_bucket::LeakyBucketConfig { + fn from(config: LeakyBucketConfig) -> Self { + utils::leaky_bucket::LeakyBucketConfig::new(config.rps, config.max) } } @@ -125,48 +90,55 @@ mod tests { use std::time::Duration; use tokio::time::Instant; + use utils::leaky_bucket::LeakyBucketState; - use super::{LeakyBucketConfig, LeakyBucketState}; + use super::LeakyBucketConfig; #[tokio::test(start_paused = true)] async fn check() { - let info = LeakyBucketConfig::new(500.0, 2000.0); - let mut bucket = LeakyBucketState::new(); + let config: utils::leaky_bucket::LeakyBucketConfig = + LeakyBucketConfig::new(500.0, 2000.0).into(); + assert_eq!(config.cost, Duration::from_millis(2)); + assert_eq!(config.bucket_width, Duration::from_secs(4)); + + let mut bucket = LeakyBucketState { + empty_at: Instant::now(), + }; // should work for 2000 requests this second for _ in 0..2000 { - assert!(bucket.check(&info, Instant::now(), 1.0)); + bucket.add_tokens(&config, Instant::now(), 1.0).unwrap(); } - assert!(!bucket.check(&info, Instant::now(), 1.0)); - assert_eq!(bucket.filled, 2000.0); + bucket.add_tokens(&config, Instant::now(), 1.0).unwrap_err(); + assert_eq!(bucket.empty_at - Instant::now(), config.bucket_width); // in 1ms we should drain 0.5 tokens. // make sure we don't lose any tokens tokio::time::advance(Duration::from_millis(1)).await; - assert!(!bucket.check(&info, Instant::now(), 1.0)); + bucket.add_tokens(&config, Instant::now(), 1.0).unwrap_err(); tokio::time::advance(Duration::from_millis(1)).await; - assert!(bucket.check(&info, Instant::now(), 1.0)); + bucket.add_tokens(&config, Instant::now(), 1.0).unwrap(); // in 10ms we should drain 5 tokens tokio::time::advance(Duration::from_millis(10)).await; for _ in 0..5 { - assert!(bucket.check(&info, Instant::now(), 1.0)); + bucket.add_tokens(&config, Instant::now(), 1.0).unwrap(); } - assert!(!bucket.check(&info, Instant::now(), 1.0)); + bucket.add_tokens(&config, Instant::now(), 1.0).unwrap_err(); // in 10s we should drain 5000 tokens // but cap is only 2000 tokio::time::advance(Duration::from_secs(10)).await; for _ in 0..2000 { - assert!(bucket.check(&info, Instant::now(), 1.0)); + bucket.add_tokens(&config, Instant::now(), 1.0).unwrap(); } - assert!(!bucket.check(&info, Instant::now(), 1.0)); + bucket.add_tokens(&config, Instant::now(), 1.0).unwrap_err(); // should sustain 500rps for _ in 0..2000 { tokio::time::advance(Duration::from_millis(10)).await; for _ in 0..5 { - assert!(bucket.check(&info, Instant::now(), 1.0)); + bucket.add_tokens(&config, Instant::now(), 1.0).unwrap(); } } } diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index a7eda73d4c..bb337d9cc1 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -162,7 +162,6 @@ def test_fully_custom_config(positive_env: NeonEnv): "min_resident_size_override": 23, "timeline_get_throttle": { "task_kinds": ["PageRequestHandler"], - "fair": True, "initial": 
0, "refill_interval": "1s", "refill_amount": 1000, diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py index 111285b40c..4c9eac5cd7 100644 --- a/test_runner/regress/test_pageserver_getpage_throttle.py +++ b/test_runner/regress/test_pageserver_getpage_throttle.py @@ -1,3 +1,4 @@ +import copy import json import uuid @@ -116,3 +117,58 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P assert ( duration_secs >= 10 * actual_smgr_query_seconds ), "smgr metrics should not include throttle wait time" + + +throttle_config_with_field_fair_set = { + "task_kinds": ["PageRequestHandler"], + "fair": True, + "initial": 27, + "refill_interval": "43s", + "refill_amount": 23, + "max": 42, +} + + +def assert_throttle_config_with_field_fair_set(conf): + """ + Field `fair` is ignored, so, responses don't contain it + """ + without_fair = copy.deepcopy(throttle_config_with_field_fair_set) + without_fair.pop("fair") + + assert conf == without_fair + + +def test_throttle_fair_config_is_settable_but_ignored_in_mgmt_api(neon_env_builder: NeonEnvBuilder): + """ + To be removed after https://github.com/neondatabase/neon/pull/8539 is rolled out. + """ + env = neon_env_builder.init_start() + ps_http = env.pageserver.http_client() + # with_fair config should still be settable + ps_http.set_tenant_config( + env.initial_tenant, + {"timeline_get_throttle": throttle_config_with_field_fair_set}, + ) + conf = ps_http.tenant_config(env.initial_tenant) + assert_throttle_config_with_field_fair_set(conf.effective_config["timeline_get_throttle"]) + assert_throttle_config_with_field_fair_set( + conf.tenant_specific_overrides["timeline_get_throttle"] + ) + + +def test_throttle_fair_config_is_settable_but_ignored_in_config_toml( + neon_env_builder: NeonEnvBuilder, +): + """ + To be removed after https://github.com/neondatabase/neon/pull/8539 is rolled out. + """ + + def set_tenant_config(ps_cfg): + ps_cfg["tenant_config"] = {"timeline_get_throttle": throttle_config_with_field_fair_set} + + neon_env_builder.pageserver_config_override = set_tenant_config + env = neon_env_builder.init_start() + ps_http = env.pageserver.http_client() + conf = ps_http.tenant_config(env.initial_tenant) + assert_throttle_config_with_field_fair_set(conf.effective_config["timeline_get_throttle"]) From c7481402a0654f919faeb633d8c07ba17607d2f5 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 29 Aug 2024 14:02:27 +0200 Subject: [PATCH 1523/1571] pageserver: default to 4MiB stack size and add env var to control it (#8862) # Motivation In https://github.com/neondatabase/neon/pull/8832 I get tokio runtime worker stack overflow errors in debug builds. In a similar vein, I had tokio runtimer worker stack overflow when trying to eliminate `async_trait` (https://github.com/neondatabase/neon/pull/8296). The 2MiB default is kind of arbitrary - so this PR bumps it to 4MiB. It also adds an env var to control it. # Risk Assessment With our 4 runtimes, the worst case stack memory usage is `4 (runtimes) * ($num_cpus (executor threads) + 512 (blocking pool threads)) * 4MiB`. On i3en.3xlarge, that's `8384 MiB`. On im4gn.2xlarge, that's `8320 MiB`. Before this change, it was half that. Looking at production metrics, we _do_ have the headroom to accomodate this worst case case. # Alternatives The problems only occur with debug builds, so technically we could only raise the stack size for debug builds. 
However, it would be another configuration where `debug != release`. # Future Work If we ever enable single runtime mode in prod (=> https://github.com/neondatabase/neon/issues/7312 ) then the worst case will drop to 25% of its current value. Eliminating the use of `tokio::spawn_blocking` / `tokio::fs` in favor of `tokio-epoll-uring` (=> https://github.com/neondatabase/neon/issues/7370 ) would reduce the worst case to `4 (runtimes) * $num_cpus (executor threads) * 4 MiB`. --- pageserver/src/task_mgr.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index ed9e001fd2..6a4e90dd55 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -146,6 +146,12 @@ impl FromStr for TokioRuntimeMode { } } +static TOKIO_THREAD_STACK_SIZE: Lazy = Lazy::new(|| { + env::var("NEON_PAGESERVER_TOKIO_THREAD_STACK_SIZE") + // the default 2MiB are insufficent, especially in debug mode + .unwrap_or_else(|| NonZeroUsize::new(4 * 1024 * 1024).unwrap()) +}); + static ONE_RUNTIME: Lazy> = Lazy::new(|| { let thread_name = "pageserver-tokio"; let Some(mode) = env::var("NEON_PAGESERVER_USE_ONE_RUNTIME") else { @@ -164,6 +170,7 @@ static ONE_RUNTIME: Lazy> = Lazy::new(|| { tokio::runtime::Builder::new_current_thread() .thread_name(thread_name) .enable_all() + .thread_stack_size(TOKIO_THREAD_STACK_SIZE.get()) .build() .expect("failed to create one single runtime") } @@ -173,6 +180,7 @@ static ONE_RUNTIME: Lazy> = Lazy::new(|| { .thread_name(thread_name) .enable_all() .worker_threads(num_workers.get()) + .thread_stack_size(TOKIO_THREAD_STACK_SIZE.get()) .build() .expect("failed to create one multi-threaded runtime") } @@ -199,6 +207,7 @@ macro_rules! pageserver_runtime { .thread_name($name) .worker_threads(TOKIO_WORKER_THREADS.get()) .enable_all() + .thread_stack_size(TOKIO_THREAD_STACK_SIZE.get()) .build() .expect(std::concat!("Failed to create runtime ", $name)) }); From 96b5c4d33dc76583d1d52fd254a36ee47f6b312a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 29 Aug 2024 14:54:02 +0200 Subject: [PATCH 1524/1571] Don't unarchive a timeline if its ancestor is archived (#8853) If a timeline unarchival request comes in, give an error if the parent timeline is archived. This prevents us from the situation of having an archived timeline with children that are not archived. 
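
Spelled out, the invariant the archival endpoint now enforces looks roughly like this (an illustrative sketch, not the pageserver's actual types):

```rust
// Illustrative only: a timeline may be ARCHIVED only if all of its children
// are archived, and UNARCHIVED only if its ancestor (if any) is not archived.
enum TimelineArchivalState {
    Archived,
    Unarchived,
}

fn check_archival_change(
    requested: TimelineArchivalState,
    ancestor_is_archived: Option<bool>, // None if the timeline has no ancestor
    unarchived_children: &[&str],       // ids of children that are not archived
) -> Result<(), String> {
    match requested {
        TimelineArchivalState::Unarchived if ancestor_is_archived == Some(true) => {
            Err("ancestor is archived".to_string())
        }
        TimelineArchivalState::Archived if !unarchived_children.is_empty() => {
            Err(format!("unarchived children: {unarchived_children:?}"))
        }
        _ => Ok(()),
    }
}
```

Both violations surface to HTTP clients as precondition-failed errors, which is what the extended regress test checks before unarchiving the parent and then the leaf.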
Follow up of #8824 Part of #8088 --------- Co-authored-by: Joonas Koivunen --- pageserver/src/http/routes.rs | 3 +++ pageserver/src/tenant.rs | 19 +++++++++++--- pageserver/src/tenant/timeline.rs | 5 ++++ test_runner/regress/test_timeline_archive.py | 26 ++++++++++++++++++++ 4 files changed, 50 insertions(+), 3 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index cb7c2b60ef..f18f0b730c 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -324,6 +324,9 @@ impl From for ApiError { match value { NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found").into()), Timeout => ApiError::Timeout("hit pageserver internal timeout".into()), + e @ HasArchivedParent(_) => { + ApiError::PreconditionFailed(e.to_string().into_boxed_str()) + } HasUnarchivedChildren(children) => ApiError::PreconditionFailed( format!( "Cannot archive timeline which has non-archived child timelines: {children:?}" diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 60ab242ffc..fb30857ddf 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -509,6 +509,9 @@ pub enum TimelineArchivalError { #[error("Timeout")] Timeout, + #[error("ancestor is archived: {}", .0)] + HasArchivedParent(TimelineId), + #[error("HasUnarchivedChildren")] HasUnarchivedChildren(Vec), @@ -524,6 +527,7 @@ impl Debug for TimelineArchivalError { match self { Self::NotFound => write!(f, "NotFound"), Self::Timeout => write!(f, "Timeout"), + Self::HasArchivedParent(p) => f.debug_tuple("HasArchivedParent").field(p).finish(), Self::HasUnarchivedChildren(c) => { f.debug_tuple("HasUnarchivedChildren").field(c).finish() } @@ -1369,11 +1373,20 @@ impl Tenant { let timeline = { let timelines = self.timelines.lock().unwrap(); - let timeline = match timelines.get(&timeline_id) { - Some(t) => t, - None => return Err(TimelineArchivalError::NotFound), + let Some(timeline) = timelines.get(&timeline_id) else { + return Err(TimelineArchivalError::NotFound); }; + if state == TimelineArchivalState::Unarchived { + if let Some(ancestor_timeline) = timeline.ancestor_timeline() { + if ancestor_timeline.is_archived() == Some(true) { + return Err(TimelineArchivalError::HasArchivedParent( + ancestor_timeline.timeline_id, + )); + } + } + } + // Ensure that there are no non-archived child timelines let children: Vec = timelines .iter() diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 8096a0d18c..63d59e06a5 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -867,6 +867,11 @@ impl Timeline { .map(|ancestor| ancestor.timeline_id) } + /// Get the ancestor timeline + pub(crate) fn ancestor_timeline(&self) -> Option<&Arc> { + self.ancestor_timeline.as_ref() + } + /// Get the bytes written since the PITR cutoff on this branch, and /// whether this branch's ancestor_lsn is within its parent's PITR. 
pub(crate) fn get_pitr_history_stats(&self) -> (u64, bool) { diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index b774c7c9fe..7f158ad251 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -94,3 +94,29 @@ def test_timeline_archive(neon_simple_env: NeonEnv): timeline_id=parent_timeline_id, state=TimelineArchivalState.ARCHIVED, ) + + # Test that the leaf can't be unarchived + with pytest.raises( + PageserverApiException, + match="ancestor is archived", + ) as exc: + assert timeline_path.exists() + + ps_http.timeline_archival_config( + tenant_id=env.initial_tenant, + timeline_id=leaf_timeline_id, + state=TimelineArchivalState.UNARCHIVED, + ) + + # Unarchive works for the leaf if the parent gets unarchived first + ps_http.timeline_archival_config( + tenant_id=env.initial_tenant, + timeline_id=parent_timeline_id, + state=TimelineArchivalState.UNARCHIVED, + ) + + ps_http.timeline_archival_config( + tenant_id=env.initial_tenant, + timeline_id=leaf_timeline_id, + state=TimelineArchivalState.UNARCHIVED, + ) From a8fbc63be2a628297102fe1d85557f3423308117 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 29 Aug 2024 15:06:13 +0200 Subject: [PATCH 1525/1571] tenant background loops: periodic log message if long-running iteration (#8832) refs https://github.com/neondatabase/neon/issues/7524 Problem ------- When browsing Pageserver logs, background loop iterations that take a long time are hard to spot / easy to miss because they tend to not produce any log messages unless: - they overrun their period, but that's only one message when the iteration completes late - they do something that produces logs (e.g., create image layers) Further, a slow iteration that is still running does will not log nor bump the metrics of `warn_when_period_overrun`until _after_ it has finished. Again, that makes a still-running iteration hard to spot. Solution -------- This PR adds a wrapper around the per-tenant background loops that, while a slow iteration is ongoing, emit a log message every $period. --- pageserver/src/tenant/tasks.rs | 112 ++++++++++++++++++++++++--------- 1 file changed, 83 insertions(+), 29 deletions(-) diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 12f080f3c1..f5680ced90 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -192,20 +192,28 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { } } - let started_at = Instant::now(); - let sleep_duration = if period == Duration::ZERO { + + let sleep_duration; + if period == Duration::ZERO { #[cfg(not(feature = "testing"))] info!("automatic compaction is disabled"); // check again in 10 seconds, in case it's been enabled again. 
- Duration::from_secs(10) + sleep_duration = Duration::from_secs(10) } else { + let iteration = Iteration { + started_at: Instant::now(), + period, + kind: BackgroundLoopKind::Compaction, + }; + // Run compaction - match tenant.compaction_iteration(&cancel, &ctx).await { + let IterationResult { output, elapsed } = iteration.run(tenant.compaction_iteration(&cancel, &ctx)).await; + match output { Ok(has_pending_task) => { error_run_count = 0; // schedule the next compaction immediately in case there is a pending compaction task - if has_pending_task { Duration::ZERO } else { period } + sleep_duration = if has_pending_task { Duration::ZERO } else { period }; } Err(e) => { let wait_duration = backoff::exponential_backoff_duration_seconds( @@ -221,16 +229,14 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { &wait_duration, cancel.is_cancelled(), ); - wait_duration + sleep_duration = wait_duration; } } + + // the duration is recorded by performance tests by enabling debug in this function + tracing::debug!(elapsed_ms=elapsed.as_millis(), "compaction iteration complete"); }; - let elapsed = started_at.elapsed(); - warn_when_period_overrun(elapsed, period, BackgroundLoopKind::Compaction); - - // the duration is recorded by performance tests by enabling debug in this function - tracing::debug!(elapsed_ms=elapsed.as_millis(), "compaction iteration complete"); // Perhaps we did no work and the walredo process has been idle for some time: // give it a chance to shut down to avoid leaving walredo process running indefinitely. @@ -368,23 +374,27 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { } } - let started_at = Instant::now(); - let gc_horizon = tenant.get_gc_horizon(); - let sleep_duration = if period == Duration::ZERO || gc_horizon == 0 { + let sleep_duration; + if period == Duration::ZERO || gc_horizon == 0 { #[cfg(not(feature = "testing"))] info!("automatic GC is disabled"); // check again in 10 seconds, in case it's been enabled again. 
- Duration::from_secs(10) + sleep_duration = Duration::from_secs(10); } else { + let iteration = Iteration { + started_at: Instant::now(), + period, + kind: BackgroundLoopKind::Gc, + }; // Run gc - let res = tenant - .gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &cancel, &ctx) + let IterationResult { output, elapsed: _ } = + iteration.run(tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &cancel, &ctx)) .await; - match res { + match output { Ok(_) => { error_run_count = 0; - period + sleep_duration = period; } Err(crate::tenant::GcError::TenantCancelled) => { return; @@ -408,13 +418,11 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { error!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}"); } - wait_duration + sleep_duration = wait_duration; } } }; - warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc); - if tokio::time::timeout(sleep_duration, cancel.cancelled()) .await .is_ok() @@ -468,14 +476,12 @@ async fn ingest_housekeeping_loop(tenant: Arc, cancel: CancellationToken break; } - let started_at = Instant::now(); - tenant.ingest_housekeeping().await; - - warn_when_period_overrun( - started_at.elapsed(), + let iteration = Iteration { + started_at: Instant::now(), period, - BackgroundLoopKind::IngestHouseKeeping, - ); + kind: BackgroundLoopKind::IngestHouseKeeping, + }; + iteration.run(tenant.ingest_housekeeping()).await; } } .await; @@ -553,6 +559,54 @@ pub(crate) async fn delay_by_lease_length( } } +struct Iteration { + started_at: Instant, + period: Duration, + kind: BackgroundLoopKind, +} + +struct IterationResult { + output: O, + elapsed: Duration, +} + +impl Iteration { + #[instrument(skip_all)] + pub(crate) async fn run(self, fut: Fut) -> IterationResult + where + Fut: std::future::Future, + { + let Self { + started_at, + period, + kind, + } = self; + + let mut fut = std::pin::pin!(fut); + + // Wrap `fut` into a future that logs a message every `period` so that we get a + // very obvious breadcrumb in the logs _while_ a slow iteration is happening. + let liveness_logger = async move { + loop { + match tokio::time::timeout(period, &mut fut).await { + Ok(x) => return x, + Err(_) => { + // info level as per the same rationale why warn_when_period_overrun is info + // => https://github.com/neondatabase/neon/pull/5724 + info!("still running"); + } + } + } + }; + + let output = liveness_logger.await; + + let elapsed = started_at.elapsed(); + warn_when_period_overrun(elapsed, period, kind); + + IterationResult { output, elapsed } + } +} /// Attention: the `task` and `period` beocme labels of a pageserver-wide prometheus metric. pub(crate) fn warn_when_period_overrun( elapsed: Duration, From 7ce49fe6e312d0bbfcf27fe3f41b8ad70d8725b0 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 29 Aug 2024 14:20:15 +0100 Subject: [PATCH 1526/1571] proxy: improve test performance (#8863) Some tests were very slow and some tests occasionally stalled. This PR improves some test performance and replaces the custom threadpool in order to fix the stalling of tests. 
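
The biggest single win in the jwt tests is no longer generating fresh RSA keys on every run; a rough illustration of that change, using the `rsa` crate APIs that already appear in the diff:

```rust
use rsa::pkcs8::DecodePrivateKey;
use rsa::RsaPrivateKey;

// Illustrative only. Generating a 2048-bit key with
// `RsaPrivateKey::new(&mut OsRng, 2048)` has to search for large primes and
// is slow (worse in debug builds); parsing a fixed PKCS#8 test key is cheap.
fn test_key(pem: &str) -> RsaPrivateKey {
    RsaPrivateKey::from_pkcs8_pem(pem).expect("hard-coded test key must parse")
}
```

The pbkdf2 unit test is similarly cut from 600k to 60k iterations, and the bespoke crossbeam work-stealing pool is replaced by a dedicated multi-threaded tokio runtime that keeps per-worker state in thread-locals.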
--- proxy/src/auth/backend/jwt.rs | 69 ++++++- proxy/src/context/parquet.rs | 34 ---- proxy/src/metrics.rs | 8 +- proxy/src/scram/countmin.rs | 26 +-- proxy/src/scram/pbkdf2.rs | 4 +- proxy/src/scram/threadpool.rs | 363 +++++++++++----------------------- 6 files changed, 199 insertions(+), 305 deletions(-) diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index e98da82053..1f44e4af5d 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -500,6 +500,7 @@ mod tests { use hyper1::service::service_fn; use hyper_util::rt::TokioIo; use rand::rngs::OsRng; + use rsa::pkcs8::DecodePrivateKey; use signature::Signer; use tokio::net::TcpListener; @@ -517,8 +518,8 @@ mod tests { (sk, jwk) } - fn new_rsa_jwk(kid: String) -> (rsa::RsaPrivateKey, jose_jwk::Jwk) { - let sk = rsa::RsaPrivateKey::new(&mut OsRng, 2048).unwrap(); + fn new_rsa_jwk(key: &str, kid: String) -> (rsa::RsaPrivateKey, jose_jwk::Jwk) { + let sk = rsa::RsaPrivateKey::from_pkcs8_pem(key).unwrap(); let pk = sk.to_public_key().into(); let jwk = jose_jwk::Jwk { key: jose_jwk::Key::Rsa(pk), @@ -569,10 +570,70 @@ mod tests { format!("{payload}.{sig}") } + // RSA key gen is slow.... + const RS1: &str = "-----BEGIN PRIVATE KEY----- +MIIEvwIBADANBgkqhkiG9w0BAQEFAASCBKkwggSlAgEAAoIBAQDNuWBIWTlo+54Y +aifpGInIrpv6LlsbI/2/2CC81Arlx4RsABORklgA9XSGwaCbHTshHsfd1S916JwA +SpjyPQYWfqo6iAV8a4MhjIeJIkRr74prDCSzOGZvIc6VaGeCIb9clf3HSrPHm3hA +cfLMB8/p5MgoxERPDOIn3XYoS9SEEuP7l0LkmEZMerg6W6lDjQRDny0Lb50Jky9X +mDqnYXBhs99ranbwL5vjy0ba6OIeCWFJme5u+rv5C/P0BOYrJfGxIcEoKa8Ukw5s +PlM+qrz9ope1eOuXMNNdyFDReNBUyaM1AwBAayU5rz57crer7K/UIofaJ42T4cMM +nx/SWfBNAgMBAAECggEACqdpBxYn1PoC6/zDaFzu9celKEWyTiuE/qRwvZa1ocS9 +ZOJ0IPvVNud/S2NHsADJiSOQ8joSJScQvSsf1Ju4bv3MTw+wSQtAVUJz2nQ92uEi +5/xPAkEPfP3hNvebNLAOuvrBk8qYmOPCTIQaMNrOt6wzeXkAmJ9wLuRXNCsJLHW+ +KLpf2WdgTYxqK06ZiJERFgJ2r1MsC2IgTydzjOAdEIrtMarerTLqqCpwFrk/l0cz +1O2OAb17ZxmhuzMhjNMin81c8F2fZAGMeOjn92Jl5kUsYw/pG+0S8QKlbveR/fdP +We2tJsgXw2zD0q7OJpp8NXS2yddrZGyysYsof983wQKBgQD2McqNJqo+eWL5zony +UbL19loYw0M15EjhzIuzW1Jk0rPj65yQyzpJ6pqicRuWr34MvzCx+ZHM2b3jSiNu +GES2fnC7xLIKyeRxfqsXF71xz+6UStEGRQX27r1YWEtyQVuBhvlqB+AGWP3PYAC+ +HecZecnZ+vcihJ2K3+l5O3paVQKBgQDV6vKH5h2SY9vgO8obx0P7XSS+djHhmPuU +f8C/Fq6AuRbIA1g04pzuLU2WS9T26eIjgM173uVNg2TuqJveWzz+CAAp6nCR6l24 +DBg49lMGCWrMo4FqPG46QkUqvK8uSj42GkX/e5Rut1Gyu0209emeM6h2d2K15SvY +9563tYSmGQKBgQDwcH5WTi20KA7e07TroJi8GKWzS3gneNUpGQBS4VxdtV4UuXXF +/4TkzafJ/9cm2iurvUmMd6XKP9lw0mY5zp/E70WgTCBp4vUlVsU3H2tYbO+filYL +3ntNx6nKTykX4/a/UJfj0t8as+zli+gNxNx/h+734V9dKdFG4Rl+2fTLpQKBgQCE +qJkTEe+Q0wCOBEYICADupwqcWqwAXWDW7IrZdfVtulqYWwqecVIkmk+dPxWosc4d +ekjz4nyNH0i+gC15LVebqdaAJ/T7aD4KXuW+nXNLMRfcJCGjgipRUruWD0EMEdqW +rqBuGXMpXeH6VxGPgVkJVLvKC6tZZe9VM+pnvteuMQKBgQC8GaL+Lz+al4biyZBf +JE8ekWrIotq/gfUBLP7x70+PB9bNtXtlgmTvjgYg4jiu3KR/ZIYYQ8vfVgkb6tDI +rWGZw86Pzuoi1ppg/pYhKk9qrmCIT4HPEXbHl7ATahu2BOCIU3hybjTh2lB6LbX9 +8LMFlz1QPqSZYN/A/kOcLBfa3A== +-----END PRIVATE KEY----- +"; + const RS2: &str = "-----BEGIN PRIVATE KEY----- +MIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQDipm6FIKSRab3J +HwmK18t7hp+pohllxIDUSPi7S5mIhN/JG2Plq2Lp746E/fuT8dcBF2R4sJlG2L0J +zmxOvBU/i/sQF9s1i4CEfg05k2//gKENIEsF3pMMmrH+mcZi0TTD6rezHpdVxPHk +qWxSyOCtIJV29X+wxPwAB59kQFHzy2ooPB1isZcpE8tO0KthAM+oZ3KuCwE0++cO +IWLeq9aPwyKhtip/xjTMxd1kzdKh592mGSyzr9D0QSWOYFGvgJXANDdiPdhSSOLt +ECWPNPlm2FQvGGvYYBafUqz7VumKHE6x8J6lKdYa2J0ZdDzCIo2IHzlxe+RZNgwy +uAD2jhVxAgMBAAECggEAbsZHWBu3MzcKQiVARbLoygvnN0J5xUqAaMDtiKUPejDv +K1yOu67DXnDuKEP2VL2rhuYG/hHaKE1AP227c9PrUq6424m9YvM2sgrlrdFIuQkG 
+LeMtp8W7+zoUasp/ssZrUqICfLIj5xCl5UuFHQT/Ar7dLlIYwa3VOLKBDb9+Dnfe +QH5/So4uMXG6vw34JN9jf+eAc8Yt0PeIz62ycvRwdpTJQ0MxZN9ZKpCAQp+VTuXT +zlzNvDMilabEdqUvAyGyz8lBLNl0wdaVrqPqAEWM5U45QXsdFZknWammP7/tijeX +0z+Bi0J0uSEU5X502zm7GArj/NNIiWMcjmDjwUUhwQKBgQD9C2GoqxOxuVPYqwYR ++Jz7f2qMjlSP8adA5Lzuh8UKXDp8JCEQC8ryweLzaOKS9C5MAw+W4W2wd4nJoQI1 +P1dgGvBlfvEeRHMgqWtq7FuTsjSe7e0uSEkC4ngDb4sc0QOpv15cMuEz+4+aFLPL +x29EcHWAaBX+rkid3zpQHFU4eQKBgQDlTCEqRuXwwa3V+Sq+mNWzD9QIGtD87TH/ +FPO/Ij/cK2+GISgFDqhetiGTH4qrvPL0psPT+iH5zGFYcoFmTtwLdWQJdxhxz0bg +iX/AceyX5e1Bm+ThT36sU83NrxKPkrdk6jNmr2iUF1OTzTwUKOYdHOPZqdMPfF4M +4XAaWVT2uQKBgQD4nKcNdU+7LE9Rr+4d1/o8Klp/0BMK/ayK2HE7lc8kt6qKb2DA +iCWUTqPw7Fq3cQrPia5WWhNP7pJEtFkcAaiR9sW7onW5fBz0uR+dhK0QtmR2xWJj +N4fsOp8ZGQ0/eae0rh1CTobucLkM9EwV6VLLlgYL67e4anlUCo8bSEr+WQKBgQCB +uf6RgqcY/RqyklPCnYlZ0zyskS9nyXKd1GbK3j+u+swP4LZZlh9f5j88k33LCA2U +qLzmMwAB6cWxWqcnELqhqPq9+ClWSmTZKDGk2U936NfAZMirSGRsbsVi9wfTPriP +WYlXMSpDjqb0WgsBhNob4npubQxCGKTFOM5Jufy90QKBgB0Lte1jX144uaXx6dtB +rjXNuWNir0Jy31wHnQuCA+XnfUgPcrKmRLm8taMbXgZwxkNvgFkpUWU8aPEK08Ne +X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL +5JiconnI5aLek0QVPoFaVXFa +-----END PRIVATE KEY----- +"; + #[tokio::test] async fn renew() { - let (rs1, jwk1) = new_rsa_jwk("1".into()); - let (rs2, jwk2) = new_rsa_jwk("2".into()); + let (rs1, jwk1) = new_rsa_jwk(RS1, "1".into()); + let (rs2, jwk2) = new_rsa_jwk(RS2, "2".into()); let (ec1, jwk3) = new_ec_jwk("3".into()); let (ec2, jwk4) = new_ec_jwk("4".into()); diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 88caa9a316..c6f83fd069 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -613,40 +613,6 @@ mod tests { tmpdir.close().unwrap(); } - #[tokio::test] - async fn verify_parquet_min_compression() { - let tmpdir = camino_tempfile::tempdir().unwrap(); - - let config = ParquetConfig { - propeties: Arc::new( - WriterProperties::builder() - .set_compression(parquet::basic::Compression::ZSTD(ZstdLevel::default())) - .build(), - ), - rows_per_group: 2_000, - file_size: 1_000_000, - max_duration: time::Duration::from_secs(20 * 60), - test_remote_failures: 0, - }; - - let rx = random_stream(50_000); - let file_stats = run_test(tmpdir.path(), config, rx).await; - - // with compression, there are fewer files with more rows per file - assert_eq!( - file_stats, - [ - (1223214, 5, 10000), - (1229364, 5, 10000), - (1231158, 5, 10000), - (1230520, 5, 10000), - (1221798, 5, 10000) - ] - ); - - tmpdir.close().unwrap(); - } - #[tokio::test] async fn verify_parquet_strong_compression() { let tmpdir = camino_tempfile::tempdir().unwrap(); diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index ccef88231b..2da7eac580 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -4,8 +4,8 @@ use lasso::ThreadedRodeo; use measured::{ label::{FixedCardinalitySet, LabelGroupSet, LabelName, LabelSet, LabelValue, StaticLabelSet}, metric::{histogram::Thresholds, name::MetricName}, - Counter, CounterVec, FixedCardinalityLabel, Gauge, GaugeVec, Histogram, HistogramVec, - LabelGroup, MetricGroup, + Counter, CounterVec, FixedCardinalityLabel, Gauge, Histogram, HistogramVec, LabelGroup, + MetricGroup, }; use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec}; @@ -548,6 +548,7 @@ pub enum RedisEventsCount { } pub struct ThreadPoolWorkers(usize); +#[derive(Copy, Clone)] pub struct ThreadPoolWorkerId(pub usize); impl LabelValue for ThreadPoolWorkerId { @@ -613,9 +614,6 @@ impl FixedCardinalitySet for ThreadPoolWorkers { 
#[derive(MetricGroup)] #[metric(new(workers: usize))] pub struct ThreadPoolMetrics { - pub injector_queue_depth: Gauge, - #[metric(init = GaugeVec::with_label_set(ThreadPoolWorkers(workers)))] - pub worker_queue_depth: GaugeVec, #[metric(init = CounterVec::with_label_set(ThreadPoolWorkers(workers)))] pub worker_task_turns_total: CounterVec, #[metric(init = CounterVec::with_label_set(ThreadPoolWorkers(workers)))] diff --git a/proxy/src/scram/countmin.rs b/proxy/src/scram/countmin.rs index 255694b33e..64ee0135e1 100644 --- a/proxy/src/scram/countmin.rs +++ b/proxy/src/scram/countmin.rs @@ -83,10 +83,10 @@ mod tests { let mut ids = vec![]; for _ in 0..n { - // number of insert operations - let n = rng.gen_range(1..100); // number to insert at once - let m = rng.gen_range(1..4096); + let n = rng.gen_range(1..4096); + // number of insert operations + let m = rng.gen_range(1..100); let id = uuid::Builder::from_random_bytes(rng.gen()).into_uuid(); ids.push((id, n, m)); @@ -102,17 +102,11 @@ mod tests { let mut ids2 = ids.clone(); while !ids2.is_empty() { ids2.shuffle(&mut rng); - - let mut i = 0; - while i < ids2.len() { - sketch.inc_and_return(&ids2[i].0, ids2[i].1); - ids2[i].2 -= 1; - if ids2[i].2 == 0 { - ids2.remove(i); - } else { - i += 1; - } - } + ids2.retain_mut(|id| { + sketch.inc_and_return(&id.0, id.1); + id.2 -= 1; + id.2 > 0 + }); } let mut within_p = 0; @@ -144,8 +138,8 @@ mod tests { // probably numbers are too small to truly represent the probabilities. assert_eq!(eval_precision(100, 4096.0, 0.90), 100); assert_eq!(eval_precision(1000, 4096.0, 0.90), 1000); - assert_eq!(eval_precision(100, 4096.0, 0.1), 98); - assert_eq!(eval_precision(1000, 4096.0, 0.1), 991); + assert_eq!(eval_precision(100, 4096.0, 0.1), 96); + assert_eq!(eval_precision(1000, 4096.0, 0.1), 988); } // returns memory usage in bytes, and the time complexity per insert. diff --git a/proxy/src/scram/pbkdf2.rs b/proxy/src/scram/pbkdf2.rs index d5ed9002ad..4cf76c8452 100644 --- a/proxy/src/scram/pbkdf2.rs +++ b/proxy/src/scram/pbkdf2.rs @@ -75,7 +75,7 @@ mod tests { let salt = b"sodium chloride"; let pass = b"Ne0n_!5_50_C007"; - let mut job = Pbkdf2::start(pass, salt, 600000); + let mut job = Pbkdf2::start(pass, salt, 60000); let hash = loop { let std::task::Poll::Ready(hash) = job.turn() else { continue; @@ -83,7 +83,7 @@ mod tests { break hash; }; - let expected = pbkdf2_hmac_array::(pass, salt, 600000); + let expected = pbkdf2_hmac_array::(pass, salt, 60000); assert_eq!(hash, expected); } } diff --git a/proxy/src/scram/threadpool.rs b/proxy/src/scram/threadpool.rs index 262c6d146e..d73a927995 100644 --- a/proxy/src/scram/threadpool.rs +++ b/proxy/src/scram/threadpool.rs @@ -4,17 +4,19 @@ //! 1. Fairness per endpoint. //! 2. Yield support for high iteration counts. -use std::sync::{ - atomic::{AtomicU64, Ordering}, - Arc, +use std::{ + cell::RefCell, + future::Future, + pin::Pin, + sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, Weak, + }, + task::{Context, Poll}, }; -use crossbeam_deque::{Injector, Stealer, Worker}; -use itertools::Itertools; -use parking_lot::{Condvar, Mutex}; use rand::Rng; use rand::{rngs::SmallRng, SeedableRng}; -use tokio::sync::oneshot; use crate::{ intern::EndpointIdInt, @@ -25,273 +27,146 @@ use crate::{ use super::pbkdf2::Pbkdf2; pub struct ThreadPool { - queue: Injector, - stealers: Vec>, - parkers: Vec<(Condvar, Mutex)>, - /// bitpacked representation. 
- /// lower 8 bits = number of sleeping threads - /// next 8 bits = number of idle threads (searching for work) - counters: AtomicU64, - + runtime: Option, pub metrics: Arc, } -#[derive(PartialEq)] -enum ThreadState { - Parked, - Active, +/// How often to reset the sketch values +const SKETCH_RESET_INTERVAL: u64 = 1021; + +thread_local! { + static STATE: RefCell> = const { RefCell::new(None) }; } impl ThreadPool { pub fn new(n_workers: u8) -> Arc { - let workers = (0..n_workers).map(|_| Worker::new_fifo()).collect_vec(); - let stealers = workers.iter().map(|w| w.stealer()).collect_vec(); + // rayon would be nice here, but yielding in rayon does not work well afaict. - let parkers = (0..n_workers) - .map(|_| (Condvar::new(), Mutex::new(ThreadState::Active))) - .collect_vec(); + Arc::new_cyclic(|pool| { + let pool = pool.clone(); + let worker_id = AtomicUsize::new(0); - let pool = Arc::new(Self { - queue: Injector::new(), - stealers, - parkers, - // threads start searching for work - counters: AtomicU64::new((n_workers as u64) << 8), - metrics: Arc::new(ThreadPoolMetrics::new(n_workers as usize)), - }); + let runtime = tokio::runtime::Builder::new_multi_thread() + .worker_threads(n_workers as usize) + .on_thread_start(move || { + STATE.with_borrow_mut(|state| { + *state = Some(ThreadRt { + pool: pool.clone(), + id: ThreadPoolWorkerId(worker_id.fetch_add(1, Ordering::Relaxed)), + rng: SmallRng::from_entropy(), + // used to determine whether we should temporarily skip tasks for fairness. + // 99% of estimates will overcount by no more than 4096 samples + countmin: CountMinSketch::with_params( + 1.0 / (SKETCH_RESET_INTERVAL as f64), + 0.01, + ), + tick: 0, + }); + }); + }) + .build() + .unwrap(); - for (i, worker) in workers.into_iter().enumerate() { - let pool = Arc::clone(&pool); - std::thread::spawn(move || thread_rt(pool, worker, i)); - } - - pool + Self { + runtime: Some(runtime), + metrics: Arc::new(ThreadPoolMetrics::new(n_workers as usize)), + } + }) } pub(crate) fn spawn_job( &self, endpoint: EndpointIdInt, pbkdf2: Pbkdf2, - ) -> oneshot::Receiver<[u8; 32]> { - let (tx, rx) = oneshot::channel(); - - let queue_was_empty = self.queue.is_empty(); - - self.metrics.injector_queue_depth.inc(); - self.queue.push(JobSpec { - response: tx, - pbkdf2, - endpoint, - }); - - // inspired from - let counts = self.counters.load(Ordering::SeqCst); - let num_awake_but_idle = (counts >> 8) & 0xff; - let num_sleepers = counts & 0xff; - - // If the queue is non-empty, then we always wake up a worker - // -- clearly the existing idle jobs aren't enough. Otherwise, - // check to see if we have enough idle workers. - if !queue_was_empty || num_awake_but_idle == 0 { - let num_to_wake = Ord::min(1, num_sleepers); - self.wake_any_threads(num_to_wake); - } - - rx - } - - #[cold] - fn wake_any_threads(&self, mut num_to_wake: u64) { - if num_to_wake > 0 { - for i in 0..self.parkers.len() { - if self.wake_specific_thread(i) { - num_to_wake -= 1; - if num_to_wake == 0 { - return; - } - } - } - } - } - - fn wake_specific_thread(&self, index: usize) -> bool { - let (condvar, lock) = &self.parkers[index]; - - let mut state = lock.lock(); - if *state == ThreadState::Parked { - condvar.notify_one(); - - // When the thread went to sleep, it will have incremented - // this value. When we wake it, its our job to decrement - // it. We could have the thread do it, but that would - // introduce a delay between when the thread was - // *notified* and when this counter was decremented. 
That - // might mislead people with new work into thinking that - // there are sleeping threads that they should try to - // wake, when in fact there is nothing left for them to - // do. - self.counters.fetch_sub(1, Ordering::SeqCst); - *state = ThreadState::Active; - - true - } else { - false - } - } - - fn steal(&self, rng: &mut impl Rng, skip: usize, worker: &Worker) -> Option { - // announce thread as idle - self.counters.fetch_add(256, Ordering::SeqCst); - - // try steal from the global queue - loop { - match self.queue.steal_batch_and_pop(worker) { - crossbeam_deque::Steal::Success(job) => { - self.metrics - .injector_queue_depth - .set(self.queue.len() as i64); - // no longer idle - self.counters.fetch_sub(256, Ordering::SeqCst); - return Some(job); - } - crossbeam_deque::Steal::Retry => continue, - crossbeam_deque::Steal::Empty => break, - } - } - - // try steal from our neighbours - loop { - let mut retry = false; - let start = rng.gen_range(0..self.stealers.len()); - let job = (start..self.stealers.len()) - .chain(0..start) - .filter(|i| *i != skip) - .find_map( - |victim| match self.stealers[victim].steal_batch_and_pop(worker) { - crossbeam_deque::Steal::Success(job) => Some(job), - crossbeam_deque::Steal::Empty => None, - crossbeam_deque::Steal::Retry => { - retry = true; - None - } - }, - ); - if job.is_some() { - // no longer idle - self.counters.fetch_sub(256, Ordering::SeqCst); - return job; - } - if !retry { - return None; - } - } + ) -> tokio::task::JoinHandle<[u8; 32]> { + self.runtime + .as_ref() + .unwrap() + .spawn(JobSpec { pbkdf2, endpoint }) } } -fn thread_rt(pool: Arc, worker: Worker, index: usize) { - /// interval when we should steal from the global queue - /// so that tail latencies are managed appropriately - const STEAL_INTERVAL: usize = 61; +impl Drop for ThreadPool { + fn drop(&mut self) { + self.runtime.take().unwrap().shutdown_background(); + } +} - /// How often to reset the sketch values - const SKETCH_RESET_INTERVAL: usize = 1021; +struct ThreadRt { + pool: Weak, + id: ThreadPoolWorkerId, + rng: SmallRng, + countmin: CountMinSketch, + tick: u64, +} - let mut rng = SmallRng::from_entropy(); +impl ThreadRt { + fn should_run(&mut self, job: &JobSpec) -> bool { + let rate = self + .countmin + .inc_and_return(&job.endpoint, job.pbkdf2.cost()); - // used to determine whether we should temporarily skip tasks for fairness. - // 99% of estimates will overcount by no more than 4096 samples - let mut sketch = CountMinSketch::with_params(1.0 / (SKETCH_RESET_INTERVAL as f64), 0.01); - - let (condvar, lock) = &pool.parkers[index]; - - 'wait: loop { - // wait for notification of work - { - let mut lock = lock.lock(); - - // queue is empty - pool.metrics - .worker_queue_depth - .set(ThreadPoolWorkerId(index), 0); - - // subtract 1 from idle count, add 1 to sleeping count. - pool.counters.fetch_sub(255, Ordering::SeqCst); - - *lock = ThreadState::Parked; - condvar.wait(&mut lock); - } - - for i in 0.. { - let Some(mut job) = worker - .pop() - .or_else(|| pool.steal(&mut rng, index, &worker)) - else { - continue 'wait; - }; - - pool.metrics - .worker_queue_depth - .set(ThreadPoolWorkerId(index), worker.len() as i64); - - // receiver is closed, cancel the task - if !job.response.is_closed() { - let rate = sketch.inc_and_return(&job.endpoint, job.pbkdf2.cost()); - - const P: f64 = 2000.0; - // probability decreases as rate increases. 
- // lower probability, higher chance of being skipped - // - // estimates (rate in terms of 4096 rounds): - // rate = 0 => probability = 100% - // rate = 10 => probability = 71.3% - // rate = 50 => probability = 62.1% - // rate = 500 => probability = 52.3% - // rate = 1021 => probability = 49.8% - // - // My expectation is that the pool queue will only begin backing up at ~1000rps - // in which case the SKETCH_RESET_INTERVAL represents 1 second. Thus, the rates above - // are in requests per second. - let probability = P.ln() / (P + rate as f64).ln(); - if pool.queue.len() > 32 || rng.gen_bool(probability) { - pool.metrics - .worker_task_turns_total - .inc(ThreadPoolWorkerId(index)); - - match job.pbkdf2.turn() { - std::task::Poll::Ready(result) => { - let _ = job.response.send(result); - } - std::task::Poll::Pending => worker.push(job), - } - } else { - pool.metrics - .worker_task_skips_total - .inc(ThreadPoolWorkerId(index)); - - // skip for now - worker.push(job); - } - } - - // if we get stuck with a few long lived jobs in the queue - // it's better to try and steal from the queue too for fairness - if i % STEAL_INTERVAL == 0 { - let _ = pool.queue.steal_batch(&worker); - } - - if i % SKETCH_RESET_INTERVAL == 0 { - sketch.reset(); - } - } + const P: f64 = 2000.0; + // probability decreases as rate increases. + // lower probability, higher chance of being skipped + // + // estimates (rate in terms of 4096 rounds): + // rate = 0 => probability = 100% + // rate = 10 => probability = 71.3% + // rate = 50 => probability = 62.1% + // rate = 500 => probability = 52.3% + // rate = 1021 => probability = 49.8% + // + // My expectation is that the pool queue will only begin backing up at ~1000rps + // in which case the SKETCH_RESET_INTERVAL represents 1 second. Thus, the rates above + // are in requests per second. + let probability = P.ln() / (P + rate as f64).ln(); + self.rng.gen_bool(probability) } } struct JobSpec { - response: oneshot::Sender<[u8; 32]>, pbkdf2: Pbkdf2, endpoint: EndpointIdInt, } +impl Future for JobSpec { + type Output = [u8; 32]; + + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + STATE.with_borrow_mut(|state| { + let state = state.as_mut().expect("should be set on thread startup"); + + state.tick = state.tick.wrapping_add(1); + if state.tick % SKETCH_RESET_INTERVAL == 0 { + state.countmin.reset(); + } + + if state.should_run(&self) { + if let Some(pool) = state.pool.upgrade() { + pool.metrics.worker_task_turns_total.inc(state.id); + } + + match self.pbkdf2.turn() { + Poll::Ready(result) => Poll::Ready(result), + // more to do, we shall requeue + Poll::Pending => { + cx.waker().wake_by_ref(); + Poll::Pending + } + } + } else { + if let Some(pool) = state.pool.upgrade() { + pool.metrics.worker_task_skips_total.inc(state.id); + } + + cx.waker().wake_by_ref(); + Poll::Pending + } + }) + } +} + #[cfg(test)] mod tests { use crate::EndpointId; From 18bfc43fa706fc6e550d29c539f30c7e5deb1d2b Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." 
<4198311+skyzh@users.noreply.github.com> Date: Thu, 29 Aug 2024 22:01:54 +0800 Subject: [PATCH 1527/1571] fix(pageserver): add dry-run to force compact API (#8859) Add `dry-run` flag to the compact API Signed-off-by: Alex Chi Z --- pageserver/src/http/routes.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index f18f0b730c..8cf2c99c09 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1733,6 +1733,10 @@ async fn timeline_compact_handler( if Some(true) == parse_query_param::<_, bool>(&request, "enhanced_gc_bottom_most_compaction")? { flags |= CompactFlags::EnhancedGcBottomMostCompaction; } + if Some(true) == parse_query_param::<_, bool>(&request, "dry_run")? { + flags |= CompactFlags::DryRun; + } + let wait_until_uploaded = parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false); From 653a6532a229038683256b08bc6ab5c1b270f52a Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 29 Aug 2024 22:07:05 +0800 Subject: [PATCH 1528/1571] fix(pageserver): reject non-i128 key on the write path (#8648) It's better to reject invalid keys on the write path than storing it and panic-ing the pageserver. https://github.com/neondatabase/neon/issues/8636 ## Summary of changes If a key cannot be represented using i128, we don't allow writing that key into the pageserver. There are two versions of the check valid function: the normal one that simply rejects i128 keys, and the stronger one that rejects all keys that we don't support. The current behavior when a key gets rejected is that safekeeper will keep retrying streaming that key to the pageserver. And once such key gets written, no new computes can be started. Therefore, there could be a large amount of pageserver warnings if a key cannot be ingested. To validate this behavior by yourself, the reviewer can (1) use the stronger version of the valid check (2) run the following SQL. ``` set neon.regress_test_mode = true; CREATE TABLESPACE regress_tblspace LOCATION '/Users/skyzh/Work/neon-test/tablespace'; CREATE SCHEMA testschema; CREATE TABLE testschema.foo (i int) TABLESPACE regress_tblspace; insert into testschema.foo values (1), (2), (3); ``` For now, I'd like to merge the patch with only rejecting non-i128 keys. It's still unknown whether the stronger version covers all the cases that basebackup doesn't support. Furthermore, the behavior of rejecting a key will produce large amounts of warnings due to safekeeper retry. Therefore, I'd like to reject the minimum set of keys that we don't support (i128 ones) for now. (well, erroring out is better than panic on `to_compact_key`) The next step is to fix the safekeeper behavior (i.e., on such key rejections, stop streaming WAL), so that we can properly stop writing. An alternative solution is to simply drop these keys on the write path. --------- Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/key.rs | 35 +++++++++++++++++++++++++---- pageserver/ctl/src/layers.rs | 1 + pageserver/src/pgdatadir_mapping.rs | 14 +++++++++--- pageserver/src/tenant/timeline.rs | 6 +++++ 4 files changed, 49 insertions(+), 7 deletions(-) diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 77da58d63e..77d744e4da 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -108,14 +108,41 @@ impl Key { } } + /// This function checks more extensively what keys we can take on the write path. 
+ /// If a key beginning with 00 does not have a global/default tablespace OID, it + /// will be rejected on the write path. + #[allow(dead_code)] + pub fn is_valid_key_on_write_path_strong(&self) -> bool { + use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; + if !self.is_i128_representable() { + return false; + } + if self.field1 == 0 + && !(self.field2 == GLOBALTABLESPACE_OID + || self.field2 == DEFAULTTABLESPACE_OID + || self.field2 == 0) + { + return false; // User defined tablespaces are not supported + } + true + } + + /// This is a weaker version of `is_valid_key_on_write_path_strong` that simply + /// checks if the key is i128 representable. Note that some keys can be successfully + /// ingested into the pageserver, but will cause errors on generating basebackup. + pub fn is_valid_key_on_write_path(&self) -> bool { + self.is_i128_representable() + } + + pub fn is_i128_representable(&self) -> bool { + self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222 + } + /// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish. /// As long as Neon does not support tablespace (because of lack of access to local file system), /// we can assume that only some predefined namespace OIDs are used which can fit in u16 pub fn to_i128(&self) -> i128 { - assert!( - self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222, - "invalid key: {self}", - ); + assert!(self.is_i128_representable(), "invalid key: {self}"); (((self.field1 & 0x7F) as i128) << 120) | (((self.field2 & 0xFFFF) as i128) << 104) | ((self.field3 as i128) << 72) diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index a183a3968d..e0f978eaa2 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -90,6 +90,7 @@ async fn read_delta_file(path: impl AsRef, ctx: &RequestContext) -> Result for (k, v) in all { let value = cursor.read_blob(v.pos(), ctx).await?; println!("key:{} value_len:{}", k, value.len()); + assert!(k.is_i128_representable(), "invalid key: "); } // TODO(chi): special handling for last key? Ok(()) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index b7110d69b6..edcbac970b 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -12,7 +12,7 @@ use crate::keyspace::{KeySpace, KeySpaceAccum}; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; use crate::walrecord::NeonWalRecord; use crate::{aux_file, repository::*}; -use anyhow::{ensure, Context}; +use anyhow::{bail, ensure, Context}; use bytes::{Buf, Bytes, BytesMut}; use enum_map::Enum; use pageserver_api::key::{ @@ -1791,6 +1791,11 @@ impl<'a> DatadirModification<'a> { // Flush relation and SLRU data blocks, keep metadata. 
let mut retained_pending_updates = HashMap::<_, Vec<_>>::new(); for (key, values) in self.pending_updates.drain() { + if !key.is_valid_key_on_write_path() { + bail!( + "the request contains data not supported by pageserver at TimelineWriter::put: {}", key + ); + } let mut write_batch = Vec::new(); for (lsn, value_ser_size, value) in values { if key.is_rel_block_key() || key.is_slru_block_key() { @@ -1843,10 +1848,13 @@ impl<'a> DatadirModification<'a> { .drain() .flat_map(|(key, values)| { values.into_iter().map(move |(lsn, val_ser_size, value)| { - (key.to_compact(), lsn, val_ser_size, value) + if !key.is_valid_key_on_write_path() { + bail!("the request contains data not supported by pageserver at TimelineWriter::put: {}", key); + } + Ok((key.to_compact(), lsn, val_ser_size, value)) }) }) - .collect::>(); + .collect::>>()?; writer.put_batch(batch, ctx).await?; } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 63d59e06a5..35e0825bac 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -5746,6 +5746,12 @@ impl<'a> TimelineWriter<'a> { ctx: &RequestContext, ) -> anyhow::Result<()> { use utils::bin_ser::BeSer; + if !key.is_valid_key_on_write_path() { + bail!( + "the request contains data not supported by pageserver at TimelineWriter::put: {}", + key + ); + } let val_ser_size = value.serialized_size().unwrap() as usize; self.put_batch( vec![(key.to_compact(), lsn, val_ser_size, value.clone())], From 8eaa8ad3582b28b67a927f9d40ddab74feb13713 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 29 Aug 2024 18:24:25 +0200 Subject: [PATCH 1529/1571] Remove async_trait usages from safekeeper and neon_local (#8864) Removes additional async_trait usages from safekeeper and neon_local. Also removes now redundant dependencies of the `async_trait` crate. 
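For reference, the pattern applied throughout this change looks roughly like the sketch below (the types are simplified placeholders; the real traits are `control_file::Storage` and `wal_storage::Storage` in the diff): the `#[async_trait]` attribute is dropped, the trait declares the method as returning `impl Future<Output = ...> + Send`, and implementations keep using plain `async fn`.

```rust
use std::future::Future;

// Simplified illustration only: `State`, `Storage`, and `InMemoryState` here
// are stand-ins, not the actual safekeeper types.
pub struct State;

pub trait Storage {
    /// Persist state durably; callers only rely on the returned future being `Send`.
    fn persist(&mut self, s: &State) -> impl Future<Output = anyhow::Result<()>> + Send;
}

pub struct InMemoryState;

impl Storage for InMemoryState {
    // A plain `async fn` satisfies the `impl Future + Send` signature above,
    // so no boxed futures (and no `async_trait`) are needed.
    async fn persist(&mut self, _s: &State) -> anyhow::Result<()> {
        // the real implementations write the state to disk and fsync here
        Ok(())
    }
}
```

Call sites are unchanged: they still `.await` the future returned by `persist`.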
cc earlier work: #6305, #6464, #7303, #7342, #7212, #8296 --- Cargo.lock | 4 ---- control_plane/Cargo.toml | 1 - control_plane/src/safekeeper.rs | 7 +++---- libs/utils/Cargo.toml | 1 - pageserver/client/Cargo.toml | 1 - safekeeper/src/control_file.rs | 5 ++--- safekeeper/src/safekeeper.rs | 2 -- safekeeper/src/wal_storage.rs | 14 ++++++++------ .../tests/walproposer_sim/safekeeper_disk.rs | 2 -- storage_controller/client/Cargo.toml | 1 - 10 files changed, 13 insertions(+), 25 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0c246bd258..5af3ef3804 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1333,7 +1333,6 @@ name = "control_plane" version = "0.1.0" dependencies = [ "anyhow", - "async-trait", "camino", "clap", "comfy-table", @@ -3790,7 +3789,6 @@ name = "pageserver_client" version = "0.1.0" dependencies = [ "anyhow", - "async-trait", "bytes", "futures", "pageserver_api", @@ -5952,7 +5950,6 @@ name = "storage_controller_client" version = "0.1.0" dependencies = [ "anyhow", - "async-trait", "bytes", "futures", "pageserver_api", @@ -6955,7 +6952,6 @@ dependencies = [ "anyhow", "arc-swap", "async-compression", - "async-trait", "bincode", "byteorder", "bytes", diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 487ac8f047..6fca59b368 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -6,7 +6,6 @@ license.workspace = true [dependencies] anyhow.workspace = true -async-trait.workspace = true camino.workspace = true clap.workspace = true comfy-table.workspace = true diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index a0a73f5609..573f1688d5 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -5,6 +5,7 @@ //! ```text //! .neon/safekeepers/ //! ``` +use std::future::Future; use std::io::Write; use std::path::PathBuf; use std::time::Duration; @@ -34,12 +35,10 @@ pub enum SafekeeperHttpError { type Result = result::Result; -#[async_trait::async_trait] -pub trait ResponseErrorMessageExt: Sized { - async fn error_from_body(self) -> Result; +pub(crate) trait ResponseErrorMessageExt: Sized { + fn error_from_body(self) -> impl Future> + Send; } -#[async_trait::async_trait] impl ResponseErrorMessageExt for reqwest::Response { async fn error_from_body(self) -> Result { let status = self.status(); diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 777fb95ece..19deaab63f 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -14,7 +14,6 @@ testing = ["fail/failpoints"] arc-swap.workspace = true sentry.workspace = true async-compression.workspace = true -async-trait.workspace = true anyhow.workspace = true bincode.workspace = true bytes.workspace = true diff --git a/pageserver/client/Cargo.toml b/pageserver/client/Cargo.toml index a938367334..d9b36bf3d4 100644 --- a/pageserver/client/Cargo.toml +++ b/pageserver/client/Cargo.toml @@ -7,7 +7,6 @@ license.workspace = true [dependencies] pageserver_api.workspace = true thiserror.workspace = true -async-trait.workspace = true reqwest = { workspace = true, features = [ "stream" ] } utils.workspace = true serde.workspace = true diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index c551cd3122..8b252b4ab4 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -7,6 +7,7 @@ use tokio::fs::File; use tokio::io::AsyncWriteExt; use utils::crashsafe::durable_rename; +use std::future::Future; use std::io::Read; use std::ops::Deref; use std::path::Path; @@ -31,10 +32,9 @@ pub const 
CHECKSUM_SIZE: usize = size_of::(); /// Storage should keep actual state inside of it. It should implement Deref /// trait to access state fields and have persist method for updating that state. -#[async_trait::async_trait] pub trait Storage: Deref { /// Persist safekeeper state on disk and update internal state. - async fn persist(&mut self, s: &TimelinePersistentState) -> Result<()>; + fn persist(&mut self, s: &TimelinePersistentState) -> impl Future> + Send; /// Timestamp of last persist. fn last_persist_at(&self) -> Instant; @@ -188,7 +188,6 @@ impl TimelinePersistentState { } } -#[async_trait::async_trait] impl Storage for FileStorage { /// Persists state durably to the underlying storage. /// diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 0814d9ba67..486954c7b9 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -971,7 +971,6 @@ mod tests { persisted_state: TimelinePersistentState, } - #[async_trait::async_trait] impl control_file::Storage for InMemoryState { async fn persist(&mut self, s: &TimelinePersistentState) -> Result<()> { self.persisted_state = s.clone(); @@ -1003,7 +1002,6 @@ mod tests { lsn: Lsn, } - #[async_trait::async_trait] impl wal_storage::Storage for DummyWalStore { fn flush_lsn(&self) -> Lsn { self.lsn diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index ded8571a3e..6fd7c91a68 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -15,6 +15,7 @@ use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName, XLogF use postgres_ffi::{dispatch_pgversion, XLogSegNo, PG_TLI}; use remote_storage::RemotePath; use std::cmp::{max, min}; +use std::future::Future; use std::io::{self, SeekFrom}; use std::pin::Pin; use tokio::fs::{self, remove_file, File, OpenOptions}; @@ -35,7 +36,6 @@ use postgres_ffi::XLOG_BLCKSZ; use pq_proto::SystemId; use utils::{id::TenantTimelineId, lsn::Lsn}; -#[async_trait::async_trait] pub trait Storage { /// LSN of last durably stored WAL record. fn flush_lsn(&self) -> Lsn; @@ -44,16 +44,19 @@ pub trait Storage { /// the segment and short header at the page of given LSN. This is only used /// for timeline initialization because compute will stream data only since /// init_lsn. Other segment headers are included in compute stream. - async fn initialize_first_segment(&mut self, init_lsn: Lsn) -> Result<()>; + fn initialize_first_segment( + &mut self, + init_lsn: Lsn, + ) -> impl Future> + Send; /// Write piece of WAL from buf to disk, but not necessarily sync it. - async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()>; + fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> impl Future> + Send; /// Truncate WAL at specified LSN, which must be the end of WAL record. - async fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()>; + fn truncate_wal(&mut self, end_pos: Lsn) -> impl Future> + Send; /// Durably store WAL on disk, up to the last written WAL record. - async fn flush_wal(&mut self) -> Result<()>; + fn flush_wal(&mut self) -> impl Future> + Send; /// Remove all segments <= given segno. Returns function doing that as we /// want to perform it without timeline lock. @@ -325,7 +328,6 @@ impl PhysicalStorage { } } -#[async_trait::async_trait] impl Storage for PhysicalStorage { /// flush_lsn returns LSN of last durably stored WAL record. 
fn flush_lsn(&self) -> Lsn { diff --git a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs index c2db9de78a..6b31edb1f2 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs @@ -83,7 +83,6 @@ impl DiskStateStorage { } } -#[async_trait::async_trait] impl control_file::Storage for DiskStateStorage { /// Persist safekeeper state on disk and update internal state. async fn persist(&mut self, s: &TimelinePersistentState) -> Result<()> { @@ -175,7 +174,6 @@ impl DiskWALStorage { } } -#[async_trait::async_trait] impl wal_storage::Storage for DiskWALStorage { /// LSN of last durably stored WAL record. fn flush_lsn(&self) -> Lsn { diff --git a/storage_controller/client/Cargo.toml b/storage_controller/client/Cargo.toml index c3bfe2bfd2..e7a4264fd0 100644 --- a/storage_controller/client/Cargo.toml +++ b/storage_controller/client/Cargo.toml @@ -8,7 +8,6 @@ license.workspace = true pageserver_api.workspace = true pageserver_client.workspace = true thiserror.workspace = true -async-trait.workspace = true reqwest.workspace = true utils.workspace = true serde.workspace = true From 022fad65eba4a89e5356096aebf4517e46d9416c Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 29 Aug 2024 20:16:44 +0100 Subject: [PATCH 1530/1571] proxy: fix password hash cancellation (#8868) In #8863 I replaced the threadpool with tokio tasks, but there was a behaviour I missed regarding cancellation. Adding the JoinHandle wrapper that triggers abort on drop should fix this. Another change, any panics that occur in password hashing will be propagated through the resume_unwind functionality. --- proxy/src/scram/exchange.rs | 3 +-- proxy/src/scram/threadpool.rs | 40 +++++++++++++++++++++++++---------- 2 files changed, 30 insertions(+), 13 deletions(-) diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index 7fdadc7038..786cbcaa19 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -86,8 +86,7 @@ async fn derive_client_key( ) -> ScramKey { let salted_password = pool .spawn_job(endpoint, Pbkdf2::start(password, salt, iterations)) - .await - .expect("job should not be cancelled"); + .await; let make_key = |name| { let key = Hmac::::new_from_slice(&salted_password) diff --git a/proxy/src/scram/threadpool.rs b/proxy/src/scram/threadpool.rs index d73a927995..2702aeebfe 100644 --- a/proxy/src/scram/threadpool.rs +++ b/proxy/src/scram/threadpool.rs @@ -15,6 +15,7 @@ use std::{ task::{Context, Poll}, }; +use futures::FutureExt; use rand::Rng; use rand::{rngs::SmallRng, SeedableRng}; @@ -74,15 +75,13 @@ impl ThreadPool { }) } - pub(crate) fn spawn_job( - &self, - endpoint: EndpointIdInt, - pbkdf2: Pbkdf2, - ) -> tokio::task::JoinHandle<[u8; 32]> { - self.runtime - .as_ref() - .unwrap() - .spawn(JobSpec { pbkdf2, endpoint }) + pub(crate) fn spawn_job(&self, endpoint: EndpointIdInt, pbkdf2: Pbkdf2) -> JobHandle { + JobHandle( + self.runtime + .as_ref() + .unwrap() + .spawn(JobSpec { pbkdf2, endpoint }), + ) } } @@ -167,6 +166,26 @@ impl Future for JobSpec { } } +pub(crate) struct JobHandle(tokio::task::JoinHandle<[u8; 32]>); + +impl Future for JobHandle { + type Output = [u8; 32]; + + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + match self.0.poll_unpin(cx) { + Poll::Ready(Ok(ok)) => Poll::Ready(ok), + Poll::Ready(Err(err)) => std::panic::resume_unwind(err.into_panic()), + Poll::Pending => Poll::Pending, + } + } +} + +impl Drop for JobHandle { + 
fn drop(&mut self) { + self.0.abort(); + } +} + #[cfg(test)] mod tests { use crate::EndpointId; @@ -183,8 +202,7 @@ mod tests { let salt = [0x55; 32]; let actual = pool .spawn_job(ep, Pbkdf2::start(b"password", &salt, 4096)) - .await - .unwrap(); + .await; let expected = [ 10, 114, 73, 188, 140, 222, 196, 156, 214, 184, 79, 157, 119, 242, 16, 31, 53, 242, From 72aa6b02dab6a8d0748fa79eac59f10f1d4dc4f1 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 30 Aug 2024 11:34:23 +0100 Subject: [PATCH 1531/1571] chore: speed up testing (#8874) `safekeeper::random_test test_random_schedules` debug test takes over 2 minutes to run on our arm runners. Running it 6 times with pageserver settings seems redundant. --- .github/workflows/_build-and-test-locally.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index a8526fc6b1..e18e6a1201 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -216,9 +216,13 @@ jobs: #nextest does not yet support running doctests ${cov_prefix} cargo test --doc $CARGO_FLAGS $CARGO_FEATURES + # run all non-pageserver tests + ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E '!package(pageserver)' + + # run pageserver tests with different settings for io_engine in std-fs tokio-epoll-uring ; do for io_buffer_alignment in 0 1 512 ; do - NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT=$io_buffer_alignment ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES + NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT=$io_buffer_alignment ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)' done done From 20f82f91698fc64265b18e12cd7482b141e0832c Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 30 Aug 2024 11:44:13 +0100 Subject: [PATCH 1532/1571] storage controller: sleep between compute notify retries (#8869) ## Problem Live migration retries when it fails to notify the compute of the new location. It should sleep between attempts. Closes: https://github.com/neondatabase/neon/issues/8820 ## Summary of changes - Do an `exponential_backoff` in the retry loop for compute notifications --- storage_controller/src/reconciler.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 94db879ade..102a3124d2 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -12,6 +12,7 @@ use std::collections::HashMap; use std::sync::Arc; use std::time::{Duration, Instant}; use tokio_util::sync::CancellationToken; +use utils::backoff::exponential_backoff; use utils::failpoint_support; use utils::generation::Generation; use utils::id::{NodeId, TimelineId}; @@ -568,6 +569,7 @@ impl Reconciler { // During a live migration it is unhelpful to proceed if we couldn't notify compute: if we detach // the origin without notifying compute, we will render the tenant unavailable. 
+ let mut notify_attempts = 0; while let Err(e) = self.compute_notify().await { match e { NotifyError::Fatal(_) => return Err(ReconcileError::Notify(e)), @@ -578,6 +580,17 @@ impl Reconciler { ); } } + + exponential_backoff( + notify_attempts, + // Generous waits: control plane operations which might be blocking us usually complete on the order + // of hundreds to thousands of milliseconds, so no point busy polling. + 1.0, + 10.0, + &self.cancel, + ) + .await; + notify_attempts += 1; } // Downgrade the origin to secondary. If the tenant's policy is PlacementPolicy::Attached(0), then From e58e045ebb80940f8fa05c8c75fdb118978fa14a Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 30 Aug 2024 13:18:30 +0100 Subject: [PATCH 1533/1571] CI(promote-compatibility-data): fix job (#8871) ## Problem `promote-compatibility-data` job got broken and slightly outdated after - https://github.com/neondatabase/neon/pull/8552 -- we don't upload artifacts for ARM64 - https://github.com/neondatabase/neon/pull/8561 -- we don't prepare `debug` artifacts in the release branch anymore ## Summary of changes - Promote artifacts from release PRs to the latest version (but do it from `release` branch) - Upload artifacts for both X64 and ARM64 --- .../actions/run-python-test-set/action.yml | 8 +- .github/workflows/build_and_test.yml | 97 ++++++++++++++----- 2 files changed, 75 insertions(+), 30 deletions(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 6c2cee0971..4008cd0d36 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -71,7 +71,7 @@ runs: if: inputs.build_type != 'remote' uses: ./.github/actions/download with: - name: compatibility-snapshot-${{ inputs.build_type }}-pg${{ inputs.pg_version }} + name: compatibility-snapshot-${{ runner.arch }}-${{ inputs.build_type }}-pg${{ inputs.pg_version }} path: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }} prefix: latest # The lack of compatibility snapshot (for example, for the new Postgres version) @@ -211,13 +211,13 @@ runs: fi - name: Upload compatibility snapshot - if: github.ref_name == 'release' + # Note, that we use `github.base_ref` which is a target branch for a PR + if: github.event_name == 'pull_request' && github.base_ref == 'release' uses: ./.github/actions/upload with: - name: compatibility-snapshot-${{ inputs.build_type }}-pg${{ inputs.pg_version }}-${{ github.run_id }} + name: compatibility-snapshot-${{ runner.arch }}-${{ inputs.build_type }}-pg${{ inputs.pg_version }} # Directory is created by test_compatibility.py::test_create_snapshot, keep the path in sync with the test path: /tmp/test_output/compatibility_snapshot_pg${{ inputs.pg_version }}/ - prefix: latest - name: Upload test results if: ${{ !cancelled() }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 1e7f3598c2..53d33b420f 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1055,43 +1055,88 @@ jobs: generate_release_notes: true, }) + # The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory promote-compatibility-data: - needs: [ check-permissions, promote-images, tag, build-and-test-locally ] + needs: [ deploy ] if: github.ref_name == 'release' - runs-on: [ self-hosted, small ] - container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned - options: --init + 
runs-on: ubuntu-22.04 steps: - - name: Promote compatibility snapshot for the release + - name: Fetch GITHUB_RUN_ID and COMMIT_SHA for the last merged release PR + id: fetch-last-release-pr-info + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + branch_name_and_pr_number=$(gh pr list \ + --repo "${GITHUB_REPOSITORY}" \ + --base release \ + --state merged \ + --limit 10 \ + --json mergeCommit,headRefName,number \ + --jq ".[] | select(.mergeCommit.oid==\"${GITHUB_SHA}\") | { branch_name: .headRefName, pr_number: .number }") + branch_name=$(echo "${branch_name_and_pr_number}" | jq -r '.branch_name') + pr_number=$(echo "${branch_name_and_pr_number}" | jq -r '.pr_number') + + run_id=$(gh run list \ + --repo "${GITHUB_REPOSITORY}" \ + --workflow build_and_test.yml \ + --branch "${branch_name}" \ + --json databaseId \ + --limit 1 \ + --jq '.[].databaseId') + + last_commit_sha=$(gh pr view "${pr_number}" \ + --repo "${GITHUB_REPOSITORY}" \ + --json commits \ + --jq '.commits[-1].oid') + + echo "run-id=${run_id}" | tee -a ${GITHUB_OUTPUT} + echo "commit-sha=${last_commit_sha}" | tee -a ${GITHUB_OUTPUT} + + - name: Promote compatibility snapshot and Neon artifact env: BUCKET: neon-github-public-dev - PREFIX: artifacts/latest - COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} + AWS_REGION: eu-central-1 + COMMIT_SHA: ${{ steps.fetch-last-release-pr-info.outputs.commit-sha }} + RUN_ID: ${{ steps.fetch-last-release-pr-info.outputs.run-id }} run: | - # Update compatibility snapshot for the release - for pg_version in v14 v15 v16; do - for build_type in debug release; do - OLD_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}-${GITHUB_RUN_ID}.tar.zst - NEW_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}.tar.zst + old_prefix="artifacts/${COMMIT_SHA}/${RUN_ID}" + new_prefix="artifacts/latest" - time aws s3 mv --only-show-errors s3://${BUCKET}/${PREFIX}/${OLD_FILENAME} s3://${BUCKET}/${PREFIX}/${NEW_FILENAME} + files_to_promote=() + files_on_s3=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${old_prefix} | jq -r '.Contents[]?.Key' || true) + + for arch in X64 ARM64; do + for build_type in debug release; do + neon_artifact_filename="neon-Linux-${arch}-${build_type}-artifact.tar.zst" + s3_key=$(echo "${files_on_s3}" | grep ${neon_artifact_filename} | sort --version-sort | tail -1 || true) + if [ -z "${s3_key}" ]; then + echo >&2 "Neither s3://${BUCKET}/${old_prefix}/${neon_artifact_filename} nor its version from previous attempts exist" + exit 1 + fi + + files_to_promote+=("s3://${BUCKET}/${s3_key}") + + for pg_version in v14 v15 v16; do + # We run less tests for debug builds, so we don't need to promote them + if [ "${build_type}" == "debug" ] && { [ "${arch}" == "ARM64" ] || [ "${pg_version}" != "v16" ] ; }; then + continue + fi + + compatibility_data_filename="compatibility-snapshot-${arch}-${build_type}-pg${pg_version}.tar.zst" + s3_key=$(echo "${files_on_s3}" | grep ${compatibility_data_filename} | sort --version-sort | tail -1 || true) + if [ -z "${s3_key}" ]; then + echo >&2 "Neither s3://${BUCKET}/${old_prefix}/${compatibility_data_filename} nor its version from previous attempts exist" + exit 1 + fi + + files_to_promote+=("s3://${BUCKET}/${s3_key}") + done done done - # Update Neon artifact for the release (reuse already uploaded artifact) - for build_type in debug release; do - OLD_PREFIX=artifacts/${COMMIT_SHA}/${GITHUB_RUN_ID} - FILENAME=neon-${{ runner.os }}-${{ runner.arch }}-${build_type}-artifact.tar.zst - - S3_KEY=$(aws s3api 
list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true) - if [ -z "${S3_KEY}" ]; then - echo >&2 "Neither s3://${BUCKET}/${OLD_PREFIX}/${FILENAME} nor its version from previous attempts exist" - exit 1 - fi - - time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/${PREFIX}/${FILENAME} + for f in "${files_to_promote[@]}"; do + time aws s3 cp --only-show-errors ${f} s3://${BUCKET}/${new_prefix}/ done pin-build-tools-image: From df971f995c3d1ab864426190382e297654938500 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Sat, 31 Aug 2024 02:12:39 +0800 Subject: [PATCH 1534/1571] feat(storage-scrubber): check layer map validity (#8867) When implementing bottom-most gc-compaction, we analyzed the structure of layer maps that the current compaction algorithm could produce, and decided to only support structures without delta layer overlaps and LSN intersections with the exception of single key layers. ## Summary of changes This patch adds the layer map valid check in the storage scrubber. --------- Signed-off-by: Alex Chi Z --- storage_scrubber/src/checks.rs | 59 +++++++++++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index 08b0f06ebf..15dfb101b5 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -1,6 +1,7 @@ -use std::collections::{HashMap, HashSet}; +use std::collections::{BTreeSet, HashMap, HashSet}; use anyhow::Context; +use itertools::Itertools; use pageserver::tenant::layer_map::LayerMap; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use pageserver_api::shard::ShardIndex; @@ -47,6 +48,56 @@ impl TimelineAnalysis { } } +/// Checks whether a layer map is valid (i.e., is a valid result of the current compaction algorithm if nothing goes wrong). +/// The function checks if we can split the LSN range of a delta layer only at the LSNs of the delta layers. For example, +/// +/// ```plain +/// | | | | +/// | 1 | | 2 | | 3 | +/// | | | | | | +/// ``` +/// +/// This is not a valid layer map because the LSN range of layer 1 intersects with the LSN range of layer 2. 1 and 2 should have +/// the same LSN range. +/// +/// The exception is that when layer 2 only contains a single key, it could be split over the LSN range. For example, +/// +/// ```plain +/// | | | 2 | | | +/// | 1 | |-------| | 3 | +/// | | | 4 | | | +/// +/// If layer 2 and 4 contain the same single key, this is also a valid layer map. +fn check_valid_layermap(metadata: &HashMap) -> Option { + let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?) 
+ let mut all_delta_layers = Vec::new(); + for (name, _) in metadata.iter() { + if let LayerName::Delta(layer) = name { + if layer.key_range.start.next() != layer.key_range.end { + all_delta_layers.push(layer.clone()); + } + } + } + for layer in &all_delta_layers { + let lsn_range = &layer.lsn_range; + lsn_split_point.insert(lsn_range.start); + lsn_split_point.insert(lsn_range.end); + } + for layer in &all_delta_layers { + let lsn_range = layer.lsn_range.clone(); + let intersects = lsn_split_point.range(lsn_range).collect_vec(); + if intersects.len() > 1 { + let err = format!( + "layer violates the layer map LSN split assumption: layer {} intersects with LSN [{}]", + layer, + intersects.into_iter().map(|lsn| lsn.to_string()).join(", ") + ); + return Some(err); + } + } + None +} + pub(crate) async fn branch_cleanup_and_check_errors( remote_client: &GenericRemoteStorage, id: &TenantShardTimelineId, @@ -126,6 +177,12 @@ pub(crate) async fn branch_cleanup_and_check_errors( } } + if let Some(err) = check_valid_layermap(&index_part.layer_metadata) { + result.errors.push(format!( + "index_part.json contains invalid layer map structure: {err}" + )); + } + for (layer, metadata) in index_part.layer_metadata { if metadata.file_size == 0 { result.errors.push(format!( From cacb1ae3331873f2b34c56b03596caabad830f14 Mon Sep 17 00:00:00 2001 From: Yuchen Liang <70461588+yliang412@users.noreply.github.com> Date: Fri, 30 Aug 2024 14:53:52 -0400 Subject: [PATCH 1535/1571] pageserver: set default io_buffer_alignment to 512 bytes (#8878) ## Summary of changes - Setting default io_buffer_alignment to 512 bytes. - Fix places that assumed `DEFAULT_IO_BUFFER_ALIGNMENT=0` - Adapt unit tests to handle merge with `chunk size <= 4096`. ## Testing and Performance We have done sufficient performance de-risking. Enabling it by default completes our correctness de-risking before the next release. Context: https://neondb.slack.com/archives/C07BZ38E6SD/p1725026845455259 Signed-off-by: Yuchen Liang Co-authored-by: Christian Schwarz --- pageserver/src/config.rs | 2 +- .../src/tenant/storage_layer/delta_layer.rs | 4 +- .../src/tenant/storage_layer/image_layer.rs | 4 +- pageserver/src/tenant/vectored_blob_io.rs | 59 ++++++++++--------- pageserver/src/virtual_file.rs | 18 ++---- 5 files changed, 42 insertions(+), 45 deletions(-) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 994075bef6..9e4530ba3c 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -96,7 +96,7 @@ pub mod defaults { pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0; - pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 0; + pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512; /// /// Default built-in configuration file. 
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 00ef5b0afd..885eb13b29 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -2283,7 +2283,7 @@ pub(crate) mod test { .await .unwrap(); let delta_layer = resident_layer.get_as_delta(&ctx).await.unwrap(); - for max_read_size in [1, 2048] { + for max_read_size in [1, 1024] { for batch_size in [1, 2, 4, 8, 3, 7, 13] { println!("running with batch_size={batch_size} max_read_size={max_read_size}"); // Test if the batch size is correctly determined @@ -2297,7 +2297,7 @@ pub(crate) mod test { // every key should be a batch b/c the value is larger than max_read_size assert_eq!(iter.key_values_batch.len(), 1); } else { - assert_eq!(iter.key_values_batch.len(), batch_size); + assert!(iter.key_values_batch.len() <= batch_size); } if num_items >= N { break; diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 38411e9d9e..4c22541e02 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -1367,7 +1367,7 @@ mod test { .await .unwrap(); let img_layer = resident_layer.get_as_image(&ctx).await.unwrap(); - for max_read_size in [1, 2048] { + for max_read_size in [1, 1024] { for batch_size in [1, 2, 4, 8, 3, 7, 13] { println!("running with batch_size={batch_size} max_read_size={max_read_size}"); // Test if the batch size is correctly determined @@ -1381,7 +1381,7 @@ mod test { // every key should be a batch b/c the value is larger than max_read_size assert_eq!(iter.key_values_batch.len(), 1); } else { - assert_eq!(iter.key_values_batch.len(), batch_size); + assert!(iter.key_values_batch.len() <= batch_size); } if num_items >= N { break; diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 80bc56092d..146bcf0e35 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -25,7 +25,6 @@ use tokio_epoll_uring::BoundedBuf; use utils::lsn::Lsn; use utils::vec_map::VecMap; -use crate::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT; use crate::context::RequestContext; use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, LEN_COMPRESSION_BIT_MASK}; use crate::virtual_file::{self, VirtualFile}; @@ -91,7 +90,7 @@ impl VectoredReadCoalesceMode { /// whereas [`ChunkedVectoredReadBuilder`] is used for alignment requirement 1 and higher. 
pub(crate) fn get() -> Self { let align = virtual_file::get_io_buffer_alignment_raw(); - if align == DEFAULT_IO_BUFFER_ALIGNMENT { + if align == 0 { VectoredReadCoalesceMode::AdjacentOnly } else { VectoredReadCoalesceMode::Chunked(align) @@ -735,27 +734,32 @@ mod tests { fn planner_chunked_coalesce_all_test() { use crate::virtual_file; - const CHUNK_SIZE: u64 = 512; - virtual_file::set_io_buffer_alignment(CHUNK_SIZE as usize).unwrap(); - let max_read_size = CHUNK_SIZE as usize * 8; + let chunk_size = virtual_file::get_io_buffer_alignment() as u64; + + // The test explicitly does not check chunk size < 512 + if chunk_size < 512 { + return; + } + + let max_read_size = chunk_size as usize * 8; let key = Key::MIN; let lsn = Lsn(0); let blob_descriptions = [ - (key, lsn, CHUNK_SIZE / 8, BlobFlag::None), // Read 1 BEGIN - (key, lsn, CHUNK_SIZE / 4, BlobFlag::Ignore), // Gap - (key, lsn, CHUNK_SIZE / 2, BlobFlag::None), - (key, lsn, CHUNK_SIZE - 2, BlobFlag::Ignore), // Gap - (key, lsn, CHUNK_SIZE, BlobFlag::None), - (key, lsn, CHUNK_SIZE * 2 - 1, BlobFlag::None), - (key, lsn, CHUNK_SIZE * 2 + 1, BlobFlag::Ignore), // Gap - (key, lsn, CHUNK_SIZE * 3 + 1, BlobFlag::None), - (key, lsn, CHUNK_SIZE * 5 + 1, BlobFlag::None), - (key, lsn, CHUNK_SIZE * 6 + 1, BlobFlag::Ignore), // skipped chunk size, but not a chunk: should coalesce. - (key, lsn, CHUNK_SIZE * 7 + 1, BlobFlag::None), - (key, lsn, CHUNK_SIZE * 8, BlobFlag::None), // Read 2 BEGIN (b/c max_read_size) - (key, lsn, CHUNK_SIZE * 9, BlobFlag::Ignore), // ==== skipped a chunk - (key, lsn, CHUNK_SIZE * 10, BlobFlag::None), // Read 3 BEGIN (cannot coalesce) + (key, lsn, chunk_size / 8, BlobFlag::None), // Read 1 BEGIN + (key, lsn, chunk_size / 4, BlobFlag::Ignore), // Gap + (key, lsn, chunk_size / 2, BlobFlag::None), + (key, lsn, chunk_size - 2, BlobFlag::Ignore), // Gap + (key, lsn, chunk_size, BlobFlag::None), + (key, lsn, chunk_size * 2 - 1, BlobFlag::None), + (key, lsn, chunk_size * 2 + 1, BlobFlag::Ignore), // Gap + (key, lsn, chunk_size * 3 + 1, BlobFlag::None), + (key, lsn, chunk_size * 5 + 1, BlobFlag::None), + (key, lsn, chunk_size * 6 + 1, BlobFlag::Ignore), // skipped chunk size, but not a chunk: should coalesce. 
+ (key, lsn, chunk_size * 7 + 1, BlobFlag::None), + (key, lsn, chunk_size * 8, BlobFlag::None), // Read 2 BEGIN (b/c max_read_size) + (key, lsn, chunk_size * 9, BlobFlag::Ignore), // ==== skipped a chunk + (key, lsn, chunk_size * 10, BlobFlag::None), // Read 3 BEGIN (cannot coalesce) ]; let ranges = [ @@ -834,18 +838,19 @@ mod tests { #[test] fn planner_replacement_test() { - let max_read_size = 128 * 1024; + let chunk_size = virtual_file::get_io_buffer_alignment() as u64; + let max_read_size = 128 * chunk_size as usize; let first_key = Key::MIN; let second_key = first_key.next(); let lsn = Lsn(0); let blob_descriptions = vec![ - (first_key, lsn, 0, BlobFlag::None), // First in read 1 - (first_key, lsn, 1024, BlobFlag::None), // Last in read 1 - (second_key, lsn, 2 * 1024, BlobFlag::ReplaceAll), - (second_key, lsn, 3 * 1024, BlobFlag::None), - (second_key, lsn, 4 * 1024, BlobFlag::ReplaceAll), // First in read 2 - (second_key, lsn, 5 * 1024, BlobFlag::None), // Last in read 2 + (first_key, lsn, 0, BlobFlag::None), // First in read 1 + (first_key, lsn, chunk_size, BlobFlag::None), // Last in read 1 + (second_key, lsn, 2 * chunk_size, BlobFlag::ReplaceAll), + (second_key, lsn, 3 * chunk_size, BlobFlag::None), + (second_key, lsn, 4 * chunk_size, BlobFlag::ReplaceAll), // First in read 2 + (second_key, lsn, 5 * chunk_size, BlobFlag::None), // Last in read 2 ]; let ranges = [&blob_descriptions[0..2], &blob_descriptions[4..]]; @@ -855,7 +860,7 @@ mod tests { planner.handle(key, lsn, offset, flag); } - planner.handle_range_end(6 * 1024); + planner.handle_range_end(6 * chunk_size); let reads = planner.finish(); assert_eq!(reads.len(), 2); diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 4b11dc1a94..97d966e2da 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -1196,15 +1196,11 @@ pub(crate) fn get_io_buffer_alignment_raw() -> usize { if cfg!(test) { let env_var_name = "NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT"; - if align == DEFAULT_IO_BUFFER_ALIGNMENT { - if let Some(test_align) = utils::env::var(env_var_name) { - if is_zero_or_power_of_two(test_align) { - test_align - } else { - panic!("IO buffer alignment ({test_align}) is not a power of two"); - } + if let Some(test_align) = utils::env::var(env_var_name) { + if is_zero_or_power_of_two(test_align) { + test_align } else { - crate::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT + panic!("IO buffer alignment ({test_align}) is not a power of two"); } } else { align @@ -1219,11 +1215,7 @@ pub(crate) fn get_io_buffer_alignment_raw() -> usize { /// This function should be used for getting the actual alignment value to use. pub(crate) fn get_io_buffer_alignment() -> usize { let align = get_io_buffer_alignment_raw(); - if align == DEFAULT_IO_BUFFER_ALIGNMENT { - 1 - } else { - align - } + align.max(1) } #[cfg(test)] From 05caaab8504093f708c81fd01454c8da45a4901d Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Sat, 31 Aug 2024 05:22:26 +0800 Subject: [PATCH 1536/1571] fix(pageserver): fire layer eviction alert only when it's visible (#8882) The pull request https://github.com/neondatabase/neon/pull/8679 explicitly mentioned that it will evict layers earlier than before. Given that the eviction metrics is solely based on eviction threshold (which is 86400s now), we should consider the early eviction and do not fire alert if it's a covered layer. ## Summary of changes Record eviction timer only when the layer is visible + accessed. 
Signed-off-by: Alex Chi Z --- pageserver/src/tenant/storage_layer/layer.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 53bb66b95e..86a200ce28 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1494,8 +1494,9 @@ impl LayerInner { let duration = SystemTime::now().duration_since(local_layer_mtime); match duration { Ok(elapsed) => { - let accessed = self.access_stats.accessed(); - if accessed { + let accessed_and_visible = self.access_stats.accessed() + && self.access_stats.visibility() == LayerVisibilityHint::Visible; + if accessed_and_visible { // Only layers used for reads contribute to our "low residence" metric that is used // to detect thrashing. Layers promoted for other reasons (e.g. compaction) are allowed // to be rapidly evicted without contributing to this metric. @@ -1509,7 +1510,7 @@ impl LayerInner { tracing::info!( residence_millis = elapsed.as_millis(), - accessed, + accessed_and_visible, "evicted layer after known residence period" ); } From 3ec785f30d248739daba93d10353187ca733da0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Sat, 31 Aug 2024 01:12:25 +0200 Subject: [PATCH 1537/1571] Add safekeeper scrubber test (#8785) The test is very rudimentary, it only checks that before and after tenant deletion, we can run `scan_metadata` for the safekeeper node kind. Also, we don't actually expect any uploaded data, for that we don't have enough WAL (needs to create at least one S3-uploaded file, the scrubber doesn't recognize partial files yet). The `scan_metadata` scrubber subcommand is extended to support either specifying a database connection string, which was previously the only way, and required a database to be present, or specifying the timeline information manually via json. This is ideal for testing scenarios because in those, the number of timelines is usually limited, but it is involved to spin up a database just to write the timeline information. 
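For illustration, a rough sketch of the JSON that `--timeline-lsns` consumes, with field names taken from the `TimelineLsnData` struct below. The IDs and LSNs are made-up placeholders, the flag and subcommand spellings are assumed from the clap derive defaults, and the string LSN form assumes `Lsn`'s human-readable serde encoding:

```rust
// Sketch: build the value passed as
//   storage_scrubber scan-metadata --node-kind safekeeper --timeline-lsns '<json>'
// as an alternative to --dump-db-connstr / --dump-db-table.
let timeline_lsns = serde_json::json!([
    {
        // hex tenant/timeline IDs (placeholders)
        "tenant_id": "0123456789abcdef0123456789abcdef",
        "timeline_id": "abcdef0123456789abcdef0123456789",
        "timeline_start_lsn": "0/16B5A50",
        "backup_lsn": "0/16B5A50"
    }
]);
let arg = serde_json::to_string(&timeline_lsns).unwrap();
```

This is essentially what the new test does instead of spinning up a dump database.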
--- storage_scrubber/src/main.rs | 32 ++-- .../src/scan_safekeeper_metadata.rs | 156 ++++++++++++------ test_runner/fixtures/neon_fixtures.py | 46 +++++- test_runner/regress/test_tenant_delete.py | 52 +++++- 4 files changed, 213 insertions(+), 73 deletions(-) diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs index 3935e513e3..c5961753c5 100644 --- a/storage_scrubber/src/main.rs +++ b/storage_scrubber/src/main.rs @@ -1,4 +1,4 @@ -use anyhow::{anyhow, bail}; +use anyhow::{anyhow, bail, Context}; use camino::Utf8PathBuf; use pageserver_api::controller_api::{MetadataHealthUpdateRequest, MetadataHealthUpdateResponse}; use pageserver_api::shard::TenantShardId; @@ -7,6 +7,7 @@ use storage_controller_client::control_api; use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; use storage_scrubber::pageserver_physical_gc::GcMode; use storage_scrubber::scan_pageserver_metadata::scan_pageserver_metadata; +use storage_scrubber::scan_safekeeper_metadata::DatabaseOrList; use storage_scrubber::tenant_snapshot::SnapshotDownloader; use storage_scrubber::{find_large_objects, ControllerClientConfig}; use storage_scrubber::{ @@ -76,6 +77,9 @@ enum Command { /// For safekeeper node_kind only, table in the db with debug dump #[arg(long, default_value = None)] dump_db_table: Option, + /// For safekeeper node_kind only, json list of timelines and their lsn info + #[arg(long, default_value = None)] + timeline_lsns: Option, }, TenantSnapshot { #[arg(long = "tenant-id")] @@ -155,20 +159,22 @@ async fn main() -> anyhow::Result<()> { post_to_storcon, dump_db_connstr, dump_db_table, + timeline_lsns, } => { if let NodeKind::Safekeeper = node_kind { - let dump_db_connstr = - dump_db_connstr.ok_or(anyhow::anyhow!("dump_db_connstr not specified"))?; - let dump_db_table = - dump_db_table.ok_or(anyhow::anyhow!("dump_db_table not specified"))?; - - let summary = scan_safekeeper_metadata( - bucket_config.clone(), - tenant_ids.iter().map(|tshid| tshid.tenant_id).collect(), - dump_db_connstr, - dump_db_table, - ) - .await?; + let db_or_list = match (timeline_lsns, dump_db_connstr) { + (Some(timeline_lsns), _) => { + let timeline_lsns = serde_json::from_str(&timeline_lsns).context("parsing timeline_lsns")?; + DatabaseOrList::List(timeline_lsns) + } + (None, Some(dump_db_connstr)) => { + let dump_db_table = dump_db_table.ok_or_else(|| anyhow::anyhow!("dump_db_table not specified"))?; + let tenant_ids = tenant_ids.iter().map(|tshid| tshid.tenant_id).collect(); + DatabaseOrList::Database { tenant_ids, connstr: dump_db_connstr, table: dump_db_table } + } + (None, None) => anyhow::bail!("neither `timeline_lsns` specified, nor `dump_db_connstr` and `dump_db_table`"), + }; + let summary = scan_safekeeper_metadata(bucket_config.clone(), db_or_list).await?; if json { println!("{}", serde_json::to_string(&summary).unwrap()) } else { diff --git a/storage_scrubber/src/scan_safekeeper_metadata.rs b/storage_scrubber/src/scan_safekeeper_metadata.rs index 1a9f3d0ef5..15f3665fac 100644 --- a/storage_scrubber/src/scan_safekeeper_metadata.rs +++ b/storage_scrubber/src/scan_safekeeper_metadata.rs @@ -7,7 +7,7 @@ use postgres_ffi::{XLogFileName, PG_TLI}; use remote_storage::GenericRemoteStorage; use serde::Serialize; use tokio_postgres::types::PgLsn; -use tracing::{error, info, trace}; +use tracing::{debug, error, info}; use utils::{ id::{TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, @@ -54,6 +54,23 @@ impl MetadataSummary { } } +#[derive(serde::Deserialize)] +pub struct TimelineLsnData { + tenant_id: 
String, + timeline_id: String, + timeline_start_lsn: Lsn, + backup_lsn: Lsn, +} + +pub enum DatabaseOrList { + Database { + tenant_ids: Vec, + connstr: String, + table: String, + }, + List(Vec), +} + /// Scan the safekeeper metadata in an S3 bucket, reporting errors and /// statistics. /// @@ -63,68 +80,39 @@ impl MetadataSummary { /// the project wasn't deleted in the meanwhile. pub async fn scan_safekeeper_metadata( bucket_config: BucketConfig, - tenant_ids: Vec, - dump_db_connstr: String, - dump_db_table: String, + db_or_list: DatabaseOrList, ) -> anyhow::Result { info!( - "checking bucket {}, region {}, dump_db_table {}", - bucket_config.bucket, bucket_config.region, dump_db_table + "checking bucket {}, region {}", + bucket_config.bucket, bucket_config.region ); - // Use rustls (Neon requires TLS) - let root_store = TLS_ROOTS.get_or_try_init(load_certs)?.clone(); - let client_config = rustls::ClientConfig::builder() - .with_root_certificates(root_store) - .with_no_client_auth(); - let tls_connector = tokio_postgres_rustls::MakeRustlsConnect::new(client_config); - let (client, connection) = tokio_postgres::connect(&dump_db_connstr, tls_connector).await?; - // The connection object performs the actual communication with the database, - // so spawn it off to run on its own. - tokio::spawn(async move { - if let Err(e) = connection.await { - eprintln!("connection error: {}", e); - } - }); - - let tenant_filter_clause = if !tenant_ids.is_empty() { - format!( - "and tenant_id in ({})", - tenant_ids - .iter() - .map(|t| format!("'{}'", t)) - .collect::>() - .join(", ") - ) - } else { - "".to_owned() - }; - let query = format!( - "select tenant_id, timeline_id, min(timeline_start_lsn), max(backup_lsn) from \"{}\" where not is_cancelled {} group by tenant_id, timeline_id;", - dump_db_table, tenant_filter_clause, - ); - info!("query is {}", query); - let timelines = client.query(&query, &[]).await?; - info!("loaded {} timelines", timelines.len()); let (remote_client, target) = init_remote(bucket_config, NodeKind::Safekeeper).await?; let console_config = ConsoleConfig::from_env()?; let cloud_admin_api_client = CloudAdminApiClient::new(console_config); - let checks = futures::stream::iter(timelines.iter().map(Ok)).map_ok(|row| { - let tenant_id = TenantId::from_str(row.get(0)).expect("failed to parse tenant_id"); - let timeline_id = TimelineId::from_str(row.get(1)).expect("failed to parse tenant_id"); - let timeline_start_lsn_pg: PgLsn = row.get(2); - let timeline_start_lsn: Lsn = Lsn(u64::from(timeline_start_lsn_pg)); - let backup_lsn_pg: PgLsn = row.get(3); - let backup_lsn: Lsn = Lsn(u64::from(backup_lsn_pg)); + let timelines = match db_or_list { + DatabaseOrList::Database { + tenant_ids, + connstr, + table, + } => load_timelines_from_db(tenant_ids, connstr, table).await?, + DatabaseOrList::List(list) => list, + }; + info!("loaded {} timelines", timelines.len()); + + let checks = futures::stream::iter(timelines.into_iter().map(Ok)).map_ok(|timeline| { + let tenant_id = TenantId::from_str(&timeline.tenant_id).expect("failed to parse tenant_id"); + let timeline_id = + TimelineId::from_str(&timeline.timeline_id).expect("failed to parse tenant_id"); let ttid = TenantTimelineId::new(tenant_id, timeline_id); check_timeline( &remote_client, &target, &cloud_admin_api_client, ttid, - timeline_start_lsn, - backup_lsn, + timeline.timeline_start_lsn, + timeline.backup_lsn, ) }); // Run multiple check_timeline's concurrently. 
@@ -163,11 +151,9 @@ async fn check_timeline( timeline_start_lsn: Lsn, backup_lsn: Lsn, ) -> anyhow::Result { - trace!( + debug!( "checking ttid {}, should contain WAL [{}-{}]", - ttid, - timeline_start_lsn, - backup_lsn + ttid, timeline_start_lsn, backup_lsn ); // calculate expected segfiles let expected_first_segno = timeline_start_lsn.segment_number(WAL_SEGSIZE); @@ -177,7 +163,7 @@ async fn check_timeline( .map(|segno| XLogFileName(PG_TLI, segno, WAL_SEGSIZE)), ); let expected_files_num = expected_segfiles.len(); - trace!("expecting {} files", expected_segfiles.len(),); + debug!("expecting {} files", expected_segfiles.len(),); // now list s3 and check if it misses something let ttshid = @@ -252,3 +238,65 @@ fn load_certs() -> Result, std::io::Error> { Ok(Arc::new(store)) } static TLS_ROOTS: OnceCell> = OnceCell::new(); + +async fn load_timelines_from_db( + tenant_ids: Vec, + dump_db_connstr: String, + dump_db_table: String, +) -> anyhow::Result> { + info!("loading from table {dump_db_table}"); + + // Use rustls (Neon requires TLS) + let root_store = TLS_ROOTS.get_or_try_init(load_certs)?.clone(); + let client_config = rustls::ClientConfig::builder() + .with_root_certificates(root_store) + .with_no_client_auth(); + let tls_connector = tokio_postgres_rustls::MakeRustlsConnect::new(client_config); + let (client, connection) = tokio_postgres::connect(&dump_db_connstr, tls_connector).await?; + // The connection object performs the actual communication with the database, + // so spawn it off to run on its own. + tokio::spawn(async move { + if let Err(e) = connection.await { + eprintln!("connection error: {}", e); + } + }); + + let tenant_filter_clause = if !tenant_ids.is_empty() { + format!( + "and tenant_id in ({})", + tenant_ids + .iter() + .map(|t| format!("'{}'", t)) + .collect::>() + .join(", ") + ) + } else { + "".to_owned() + }; + let query = format!( + "select tenant_id, timeline_id, min(timeline_start_lsn), max(backup_lsn) \ + from \"{dump_db_table}\" \ + where not is_cancelled {tenant_filter_clause} \ + group by tenant_id, timeline_id;" + ); + info!("query is {}", query); + let timelines = client.query(&query, &[]).await?; + + let timelines = timelines + .into_iter() + .map(|row| { + let tenant_id = row.get(0); + let timeline_id = row.get(1); + let timeline_start_lsn_pg: PgLsn = row.get(2); + let backup_lsn_pg: PgLsn = row.get(3); + + TimelineLsnData { + tenant_id, + timeline_id, + timeline_start_lsn: Lsn(u64::from(timeline_start_lsn_pg)), + backup_lsn: Lsn(u64::from(backup_lsn_pg)), + } + }) + .collect::>(); + Ok(timelines) +} diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 69a4234617..800ae03d13 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4625,12 +4625,20 @@ class Safekeeper(LogUtils): wait_until(20, 0.5, paused) +# TODO: Replace with `StrEnum` when we upgrade to python 3.11 +class NodeKind(str, Enum): + PAGESERVER = "pageserver" + SAFEKEEPER = "safekeeper" + + class StorageScrubber: def __init__(self, env: NeonEnv, log_dir: Path): self.env = env self.log_dir = log_dir - def scrubber_cli(self, args: list[str], timeout) -> str: + def scrubber_cli( + self, args: list[str], timeout, extra_env: Optional[Dict[str, str]] = None + ) -> str: assert isinstance(self.env.pageserver_remote_storage, S3Storage) s3_storage = self.env.pageserver_remote_storage @@ -4645,6 +4653,9 @@ class StorageScrubber: if s3_storage.endpoint is not None: env.update({"AWS_ENDPOINT_URL": s3_storage.endpoint}) 
+ if extra_env is not None: + env.update(extra_env) + base_args = [ str(self.env.neon_binpath / "storage_scrubber"), f"--controller-api={self.env.storage_controller.api_root()}", @@ -4672,18 +4683,43 @@ class StorageScrubber: assert stdout is not None return stdout - def scan_metadata(self, post_to_storage_controller: bool = False) -> Tuple[bool, Any]: + def scan_metadata_safekeeper( + self, + timeline_lsns: List[Dict[str, Any]], + cloud_admin_api_url: str, + cloud_admin_api_token: str, + ) -> Tuple[bool, Any]: + extra_env = { + "CLOUD_ADMIN_API_URL": cloud_admin_api_url, + "CLOUD_ADMIN_API_TOKEN": cloud_admin_api_token, + } + return self.scan_metadata( + node_kind=NodeKind.SAFEKEEPER, timeline_lsns=timeline_lsns, extra_env=extra_env + ) + + def scan_metadata( + self, + post_to_storage_controller: bool = False, + node_kind: NodeKind = NodeKind.PAGESERVER, + timeline_lsns: Optional[List[Dict[str, Any]]] = None, + extra_env: Optional[Dict[str, str]] = None, + ) -> Tuple[bool, Any]: """ Returns the health status and the metadata summary. """ - args = ["scan-metadata", "--node-kind", "pageserver", "--json"] + args = ["scan-metadata", "--node-kind", node_kind.value, "--json"] if post_to_storage_controller: args.append("--post") - stdout = self.scrubber_cli(args, timeout=30) + if timeline_lsns is not None: + args.append("--timeline-lsns") + args.append(json.dumps(timeline_lsns)) + stdout = self.scrubber_cli(args, timeout=30, extra_env=extra_env) try: summary = json.loads(stdout) - healthy = not summary["with_errors"] and not summary["with_warnings"] + # summary does not contain "with_warnings" if node_kind is the safekeeper + no_warnings = "with_warnings" not in summary or not summary["with_warnings"] + healthy = not summary["with_errors"] and no_warnings return healthy, summary except: log.error("Failed to decode JSON output from `scan-metadata`. Dumping stdout:") diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 448a28dc31..7ee949e8d3 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -1,7 +1,9 @@ +import json from threading import Thread import pytest from fixtures.common_types import Lsn, TenantId, TimelineId +from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, PgBin, @@ -17,6 +19,8 @@ from fixtures.pageserver.utils import ( from fixtures.remote_storage import RemoteStorageKind, s3_storage from fixtures.utils import run_pg_bench_small, wait_until from requests.exceptions import ReadTimeout +from werkzeug.wrappers.request import Request +from werkzeug.wrappers.response import Response def error_tolerant_delete(ps_http, tenant_id): @@ -322,7 +326,7 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder) env.pageserver.stop() -def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder): +def test_tenant_delete_scrubber(pg_bin: PgBin, make_httpserver, neon_env_builder: NeonEnvBuilder): """ Validate that creating and then deleting the tenant both survives the scrubber, and that one can run the scrubber without problems. 
@@ -347,6 +351,45 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder) healthy, _ = env.storage_scrubber.scan_metadata() assert healthy + timeline_lsns = { + "tenant_id": f"{tenant_id}", + "timeline_id": f"{timeline_id}", + "timeline_start_lsn": f"{last_flush_lsn}", + "backup_lsn": f"{last_flush_lsn}", + } + + cloud_admin_url = f"http://{make_httpserver.host}:{make_httpserver.port}/" + cloud_admin_token = "" + + def get_branches(request: Request): + # Compare definition with `BranchData` struct + dummy_data = { + "id": "test-branch-id", + "created_at": "", # TODO + "updated_at": "", # TODO + "name": "testbranchname", + "project_id": "test-project-id", + "timeline_id": f"{timeline_id}", + "default": False, + "deleted": False, + "logical_size": 42000, + "physical_size": 42000, + "written_size": 42000, + } + # This test does all its own compute configuration (by passing explicit pageserver ID to Workload functions), + # so we send controller notifications to /dev/null to prevent it fighting the test for control of the compute. + log.info(f"got get_branches request: {request.json}") + return Response(json.dumps(dummy_data), content_type="application/json", status=200) + + make_httpserver.expect_request("/branches", method="GET").respond_with_handler(get_branches) + + healthy, _ = env.storage_scrubber.scan_metadata_safekeeper( + timeline_lsns=[timeline_lsns], + cloud_admin_api_url=cloud_admin_url, + cloud_admin_api_token=cloud_admin_token, + ) + assert healthy + env.start() ps_http = env.pageserver.http_client() ps_http.tenant_delete(tenant_id) @@ -354,3 +397,10 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder) healthy, _ = env.storage_scrubber.scan_metadata() assert healthy + + healthy, _ = env.storage_scrubber.scan_metadata_safekeeper( + timeline_lsns=[timeline_lsns], + cloud_admin_api_url=cloud_admin_url, + cloud_admin_api_token=cloud_admin_token, + ) + assert healthy From 516ac0591e762142ca0ce85f212192c5af59a097 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 2 Sep 2024 12:36:57 +0100 Subject: [PATCH 1538/1571] storage controller: eliminate ensure_attached (#8875) ## Problem This is a followup to #8783 - The old blocking ensure_attached function had been retained to handle the case where a shard had a None generation_pageserver, but this wasn't really necessary. - There was a subtle `.1` in the code where a struct would have been clearer Closes #8819 ## Summary of changes - Add ShardGenerationState to represent the results of peek_generation - Instead of calling ensure_attached when a tenant has a non-attached shard, check the shard's policy and return 409 if it isn't Attached, else return 503 if the shard's policy is attached but it hasn't been reconciled yet (i.e. 
has a None generation_pageserver) --- storage_controller/src/persistence.rs | 22 +++-- storage_controller/src/service.rs | 124 +++++++++----------------- 2 files changed, 58 insertions(+), 88 deletions(-) diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index a842079ce7..6e1c2016ff 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -122,6 +122,13 @@ pub(crate) enum TenantFilter { Shard(TenantShardId), } +/// Represents the results of looking up generation+pageserver for the shards of a tenant +pub(crate) struct ShardGenerationState { + pub(crate) tenant_shard_id: TenantShardId, + pub(crate) generation: Option, + pub(crate) generation_pageserver: Option, +} + impl Persistence { // The default postgres connection limit is 100. We use up to 99, to leave one free for a human admin under // normal circumstances. This assumes we have exclusive use of the database cluster to which we connect. @@ -540,7 +547,7 @@ impl Persistence { pub(crate) async fn peek_generations( &self, filter_tenant_id: TenantId, - ) -> Result, Option)>, DatabaseError> { + ) -> Result, DatabaseError> { use crate::schema::tenant_shards::dsl::*; let rows = self .with_measured_conn(DatabaseOperation::PeekGenerations, move |conn| { @@ -555,13 +562,12 @@ impl Persistence { Ok(rows .into_iter() - .map(|p| { - ( - p.get_tenant_shard_id() - .expect("Corrupt tenant shard id in database"), - p.generation.map(|g| Generation::new(g as u32)), - p.generation_pageserver.map(|n| NodeId(n as u64)), - ) + .map(|p| ShardGenerationState { + tenant_shard_id: p + .get_tenant_shard_id() + .expect("Corrupt tenant shard id in database"), + generation: p.generation.map(|g| Generation::new(g as u32)), + generation_pageserver: p.generation_pageserver.map(|n| NodeId(n as u64)), }) .collect()) } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 1f221a9b45..78627953d0 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -22,7 +22,7 @@ use crate::{ peer_client::GlobalObservedState, persistence::{ AbortShardSplitStatus, ControllerPersistence, DatabaseResult, MetadataHealthPersistence, - TenantFilter, + ShardGenerationState, TenantFilter, }, reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder}, scheduler::{MaySchedule, ScheduleContext, ScheduleMode}, @@ -3106,20 +3106,44 @@ impl Service { // will still be the latest when we're done: we will check generations again at the end of // this function to handle that. let generations = self.persistence.peek_generations(tenant_id).await?; - let generations = if generations.iter().any(|i| i.1.is_none()) { - // One or more shards is not attached to anything: maybe this is a new tenant? Wait for - // it to reconcile. - self.ensure_attached_wait(tenant_id).await?; - self.persistence.peek_generations(tenant_id).await? - } else { - generations - }; + + if generations + .iter() + .any(|i| i.generation.is_none() || i.generation_pageserver.is_none()) + { + // One or more shards has not been attached to a pageserver. 
Check if this is because it's configured + // to be detached (409: caller should give up), or because it's meant to be attached but isn't yet (503: caller should retry) + let locked = self.inner.read().unwrap(); + for (shard_id, shard) in + locked.tenants.range(TenantShardId::tenant_range(tenant_id)) + { + match shard.policy { + PlacementPolicy::Attached(_) => { + // This shard is meant to be attached: the caller is not wrong to try and + // use this function, but we can't service the request right now. + } + PlacementPolicy::Secondary | PlacementPolicy::Detached => { + return Err(ApiError::Conflict(format!( + "Shard {shard_id} tenant has policy {:?}", + shard.policy + ))); + } + } + } + + return Err(ApiError::ResourceUnavailable( + "One or more shards in tenant is not yet attached".into(), + )); + } let locked = self.inner.read().unwrap(); - for (tenant_shard_id, generation, generation_pageserver) in generations { - let node_id = generation_pageserver.ok_or(ApiError::Conflict( - "Tenant not currently attached".to_string(), - ))?; + for ShardGenerationState { + tenant_shard_id, + generation, + generation_pageserver, + } in generations + { + let node_id = generation_pageserver.expect("We checked for None above"); let node = locked .nodes .get(&node_id) @@ -3141,7 +3165,13 @@ impl Service { let latest_generations = self.persistence.peek_generations(tenant_id).await?; if latest_generations .into_iter() - .map(|g| (g.0, g.1)) + .map( + |ShardGenerationState { + tenant_shard_id, + generation, + generation_pageserver: _, + }| (tenant_shard_id, generation), + ) .collect::>() != target_gens .into_iter() @@ -5280,72 +5310,6 @@ impl Service { )) } - /// Helper for methods that will try and call pageserver APIs for - /// a tenant, such as timeline CRUD: they cannot proceed unless the tenant - /// is attached somewhere. - fn ensure_attached_schedule( - &self, - mut locked: std::sync::RwLockWriteGuard<'_, ServiceState>, - tenant_id: TenantId, - ) -> Result, anyhow::Error> { - let mut waiters = Vec::new(); - let (nodes, tenants, scheduler) = locked.parts_mut(); - - let mut schedule_context = ScheduleContext::default(); - for (tenant_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { - shard.schedule(scheduler, &mut schedule_context)?; - - // The shard's policies may not result in an attached location being scheduled: this - // is an error because our caller needs it attached somewhere. - if shard.intent.get_attached().is_none() { - return Err(anyhow::anyhow!( - "Tenant {tenant_id} not scheduled to be attached" - )); - }; - - if shard.stably_attached().is_some() { - // We do not require the shard to be totally up to date on reconciliation: we just require - // that it has been attached on the intended node. Other dirty state such as unattached secondary - // locations, or compute hook notifications can be ignored. - continue; - } - - if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) { - tracing::info!("Waiting for shard {tenant_shard_id} to reconcile, in order to ensure it is attached"); - waiters.push(waiter); - } - } - Ok(waiters) - } - - async fn ensure_attached_wait(&self, tenant_id: TenantId) -> Result<(), ApiError> { - let ensure_waiters = { - let locked = self.inner.write().unwrap(); - - // Check if the tenant is splitting: in this case, even if it is attached, - // we must act as if it is not: this blocks e.g. timeline creation/deletion - // operations during the split. 
- for (_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id)) { - if !matches!(shard.splitting, SplitState::Idle) { - return Err(ApiError::ResourceUnavailable( - "Tenant shards are currently splitting".into(), - )); - } - } - - self.ensure_attached_schedule(locked, tenant_id) - .map_err(ApiError::InternalServerError)? - }; - - let deadline = Instant::now().checked_add(Duration::from_secs(5)).unwrap(); - for waiter in ensure_waiters { - let timeout = deadline.duration_since(Instant::now()); - waiter.wait_timeout(timeout).await?; - } - - Ok(()) - } - /// Like [`Self::maybe_configured_reconcile_shard`], but uses the default reconciler /// configuration fn maybe_reconcile_shard( From 9746b6ea312a15b2d607d5a124ca3899ec953d06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 2 Sep 2024 13:51:45 +0200 Subject: [PATCH 1539/1571] Implement archival_config timeline endpoint in the storage controller (#8680) Implement the timeline specific `archival_config` endpoint also in the storage controller. It's mostly a copy-paste of the detach handler: the task is the same: do the same operation on all shards. Part of #8088. --- pageserver/client/src/mgmt_api.rs | 18 +++++ storage_controller/src/http.rs | 30 ++++++- storage_controller/src/pageserver_client.rs | 20 ++++- storage_controller/src/service.rs | 73 ++++++++++++++++- test_runner/regress/test_timeline_archive.py | 83 +++++++++----------- 5 files changed, 174 insertions(+), 50 deletions(-) diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 71d36f3113..737cb00835 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -419,6 +419,24 @@ impl Client { } } + pub async fn timeline_archival_config( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + req: &TimelineArchivalConfigRequest, + ) -> Result<()> { + let uri = format!( + "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/archival_config", + self.mgmt_api_endpoint + ); + + self.request(Method::POST, &uri, req) + .await? 
+ .json() + .await + .map_err(Error::ReceiveBody) + } + pub async fn timeline_detach_ancestor( &self, tenant_shard_id: TenantShardId, diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 207bd5a1e6..d3eb081be4 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -17,7 +17,7 @@ use pageserver_api::controller_api::{ }; use pageserver_api::models::{ TenantConfigRequest, TenantLocationConfigRequest, TenantShardSplitRequest, - TenantTimeTravelRequest, TimelineCreateRequest, + TenantTimeTravelRequest, TimelineArchivalConfigRequest, TimelineCreateRequest, }; use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api; @@ -334,6 +334,24 @@ async fn handle_tenant_timeline_delete( .await } +async fn handle_tenant_timeline_archival_config( + service: Arc, + mut req: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + + let create_req = json_request::(&mut req).await?; + + service + .tenant_timeline_archival_config(tenant_id, timeline_id, create_req) + .await?; + + json_response(StatusCode::OK, ()) +} + async fn handle_tenant_timeline_detach_ancestor( service: Arc, req: Request, @@ -1160,6 +1178,16 @@ pub fn make_router( RequestName("v1_tenant_timeline"), ) }) + .post( + "/v1/tenant/:tenant_id/timeline/:timeline_id/archival_config", + |r| { + tenant_service_handler( + r, + handle_tenant_timeline_archival_config, + RequestName("v1_tenant_timeline_archival_config"), + ) + }, + ) .put( "/v1/tenant/:tenant_id/timeline/:timeline_id/detach_ancestor", |r| { diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index 8d64201cd9..20770ed703 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -2,8 +2,8 @@ use pageserver_api::{ models::{ detach_ancestor::AncestorDetached, LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress, TenantScanRemoteStorageResponse, - TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo, - TopTenantShardsRequest, TopTenantShardsResponse, + TenantShardSplitRequest, TenantShardSplitResponse, TimelineArchivalConfigRequest, + TimelineCreateRequest, TimelineInfo, TopTenantShardsRequest, TopTenantShardsResponse, }, shard::TenantShardId, }; @@ -227,6 +227,22 @@ impl PageserverClient { ) } + pub(crate) async fn timeline_archival_config( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + req: &TimelineArchivalConfigRequest, + ) -> Result<()> { + measured_request!( + "timeline_archival_config", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner + .timeline_archival_config(tenant_shard_id, timeline_id, req) + .await + ) + } + pub(crate) async fn timeline_detach_ancestor( &self, tenant_shard_id: TenantShardId, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 78627953d0..95821827e2 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -46,7 +46,10 @@ use pageserver_api::{ TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse, }, - models::{SecondaryProgress, TenantConfigRequest, TopTenantShardsRequest}, + models::{ + SecondaryProgress, TenantConfigRequest, TimelineArchivalConfigRequest, + 
TopTenantShardsRequest, + }, }; use reqwest::StatusCode; use tracing::{instrument, Instrument}; @@ -131,6 +134,7 @@ enum TenantOperations { TimelineCreate, TimelineDelete, AttachHook, + TimelineArchivalConfig, TimelineDetachAncestor, } @@ -2918,6 +2922,73 @@ impl Service { .await? } + pub(crate) async fn tenant_timeline_archival_config( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + req: TimelineArchivalConfigRequest, + ) -> Result<(), ApiError> { + tracing::info!( + "Setting archival config of timeline {tenant_id}/{timeline_id} to '{:?}'", + req.state + ); + + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::TimelineArchivalConfig, + ) + .await; + + self.tenant_remote_mutation(tenant_id, move |targets| async move { + if targets.is_empty() { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant not found").into(), + )); + } + async fn config_one( + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + node: Node, + jwt: Option, + req: TimelineArchivalConfigRequest, + ) -> Result<(), ApiError> { + tracing::info!( + "Setting archival config of timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}", + ); + + let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); + + client + .timeline_archival_config(tenant_shard_id, timeline_id, &req) + .await + .map_err(|e| match e { + mgmt_api::Error::ApiError(StatusCode::PRECONDITION_FAILED, msg) => { + ApiError::PreconditionFailed(msg.into_boxed_str()) + } + _ => passthrough_api_error(&node, e), + }) + } + + // no shard needs to go first/last; the operation should be idempotent + // TODO: it would be great to ensure that all shards return the same error + let results = self + .tenant_for_shards(targets, |tenant_shard_id, node| { + futures::FutureExt::boxed(config_one( + tenant_shard_id, + timeline_id, + node, + self.config.jwt_token.clone(), + req.clone(), + )) + }) + .await?; + assert!(!results.is_empty(), "must have at least one result"); + + Ok(()) + }).await? 
+ } + pub(crate) async fn tenant_timeline_detach_ancestor( &self, tenant_id: TenantId, diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index 7f158ad251..de43e51c9e 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -1,97 +1,90 @@ import pytest from fixtures.common_types import TenantId, TimelineArchivalState, TimelineId from fixtures.neon_fixtures import ( - NeonEnv, + NeonEnvBuilder, ) from fixtures.pageserver.http import PageserverApiException -def test_timeline_archive(neon_simple_env: NeonEnv): - env = neon_simple_env +@pytest.mark.parametrize("shard_count", [0, 4]) +def test_timeline_archive(neon_env_builder: NeonEnvBuilder, shard_count: int): + unsharded = shard_count == 0 + if unsharded: + env = neon_env_builder.init_start() + # If we run the unsharded version, talk to the pageserver directly + ps_http = env.pageserver.http_client() + else: + neon_env_builder.num_pageservers = shard_count + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + # If we run the unsharded version, talk to the storage controller + ps_http = env.storage_controller.pageserver_api() - env.pageserver.allowed_errors.extend( - [ - ".*Timeline .* was not found.*", - ".*timeline not found.*", - ".*Cannot archive timeline which has unarchived child timelines.*", - ".*Precondition failed: Requested tenant is missing.*", - ] - ) - - ps_http = env.pageserver.http_client() - - # first try to archive non existing timeline - # for existing tenant: + # first try to archive a non existing timeline for an existing tenant: invalid_timeline_id = TimelineId.generate() with pytest.raises(PageserverApiException, match="timeline not found") as exc: ps_http.timeline_archival_config( - tenant_id=env.initial_tenant, - timeline_id=invalid_timeline_id, + env.initial_tenant, + invalid_timeline_id, state=TimelineArchivalState.ARCHIVED, ) assert exc.value.status_code == 404 - # for non existing tenant: + # for a non existing tenant: invalid_tenant_id = TenantId.generate() with pytest.raises( PageserverApiException, - match=f"NotFound: tenant {invalid_tenant_id}", + match="NotFound: [tT]enant", ) as exc: ps_http.timeline_archival_config( - tenant_id=invalid_tenant_id, - timeline_id=invalid_timeline_id, + invalid_tenant_id, + invalid_timeline_id, state=TimelineArchivalState.ARCHIVED, ) assert exc.value.status_code == 404 - # construct pair of branches to validate that pageserver prohibits + # construct a pair of branches to validate that pageserver prohibits # archival of ancestor timelines when they have non-archived child branches - parent_timeline_id = env.neon_cli.create_branch("test_ancestor_branch_archive_parent", "empty") + parent_timeline_id = env.neon_cli.create_branch("test_ancestor_branch_archive_parent") leaf_timeline_id = env.neon_cli.create_branch( "test_ancestor_branch_archive_branch1", "test_ancestor_branch_archive_parent" ) - timeline_path = env.pageserver.timeline_dir(env.initial_tenant, parent_timeline_id) - with pytest.raises( PageserverApiException, match="Cannot archive timeline which has non-archived child timelines", ) as exc: - assert timeline_path.exists() - ps_http.timeline_archival_config( - tenant_id=env.initial_tenant, - timeline_id=parent_timeline_id, + env.initial_tenant, + parent_timeline_id, state=TimelineArchivalState.ARCHIVED, ) assert exc.value.status_code == 412 - # Test timeline_detail leaf_detail = ps_http.timeline_detail( - tenant_id=env.initial_tenant, + 
env.initial_tenant, timeline_id=leaf_timeline_id, ) assert leaf_detail["is_archived"] is False # Test that archiving the leaf timeline and then the parent works ps_http.timeline_archival_config( - tenant_id=env.initial_tenant, - timeline_id=leaf_timeline_id, + env.initial_tenant, + leaf_timeline_id, state=TimelineArchivalState.ARCHIVED, ) leaf_detail = ps_http.timeline_detail( - tenant_id=env.initial_tenant, - timeline_id=leaf_timeline_id, + env.initial_tenant, + leaf_timeline_id, ) assert leaf_detail["is_archived"] is True ps_http.timeline_archival_config( - tenant_id=env.initial_tenant, - timeline_id=parent_timeline_id, + env.initial_tenant, + parent_timeline_id, state=TimelineArchivalState.ARCHIVED, ) @@ -100,23 +93,21 @@ def test_timeline_archive(neon_simple_env: NeonEnv): PageserverApiException, match="ancestor is archived", ) as exc: - assert timeline_path.exists() - ps_http.timeline_archival_config( - tenant_id=env.initial_tenant, - timeline_id=leaf_timeline_id, + env.initial_tenant, + leaf_timeline_id, state=TimelineArchivalState.UNARCHIVED, ) # Unarchive works for the leaf if the parent gets unarchived first ps_http.timeline_archival_config( - tenant_id=env.initial_tenant, - timeline_id=parent_timeline_id, + env.initial_tenant, + parent_timeline_id, state=TimelineArchivalState.UNARCHIVED, ) ps_http.timeline_archival_config( - tenant_id=env.initial_tenant, - timeline_id=leaf_timeline_id, + env.initial_tenant, + leaf_timeline_id, state=TimelineArchivalState.UNARCHIVED, ) From 15e90cc427aad8f9ded4e0c13a283217631cbd07 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 2 Sep 2024 15:45:17 +0200 Subject: [PATCH 1540/1571] bottommost-compaction: remove dead code / rectify cfg!()s (#8884) part of https://github.com/neondatabase/neon/issues/8002 --- .../src/tenant/storage_layer/delta_layer.rs | 40 ------------------- .../src/tenant/storage_layer/image_layer.rs | 32 ++------------- pageserver/src/tenant/storage_layer/layer.rs | 31 -------------- .../src/tenant/storage_layer/layer/tests.rs | 2 +- .../src/tenant/storage_layer/split_writer.rs | 2 +- pageserver/src/tenant/timeline.rs | 1 - 6 files changed, 5 insertions(+), 103 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 885eb13b29..b8e9a98149 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -225,9 +225,7 @@ pub struct DeltaLayerInner { file: VirtualFile, file_id: FileId, - #[allow(dead_code)] layer_key_range: Range, - #[allow(dead_code)] layer_lsn_range: Range, max_vectored_read_bytes: Option, @@ -882,44 +880,6 @@ impl DeltaLayerInner { Ok(()) } - /// Load all key-values in the delta layer, should be replaced by an iterator-based interface in the future. 
- pub(super) async fn load_key_values( - &self, - ctx: &RequestContext, - ) -> anyhow::Result> { - let block_reader = FileBlockReader::new(&self.file, self.file_id); - let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( - self.index_start_blk, - self.index_root_blk, - block_reader, - ); - let mut result = Vec::new(); - let mut stream = - Box::pin(self.stream_index_forwards(index_reader, &[0; DELTA_KEY_SIZE], ctx)); - let block_reader = FileBlockReader::new(&self.file, self.file_id); - let cursor = block_reader.block_cursor(); - let mut buf = Vec::new(); - while let Some(item) = stream.next().await { - let (key, lsn, pos) = item?; - // TODO: dedup code with get_reconstruct_value - // TODO: ctx handling and sharding - cursor - .read_blob_into_buf(pos.pos(), &mut buf, ctx) - .await - .with_context(|| { - format!("Failed to read blob from virtual file {}", self.file.path) - })?; - let val = Value::des(&buf).with_context(|| { - format!( - "Failed to deserialize file blob from virtual file {}", - self.file.path - ) - })?; - result.push((key, lsn, val)); - } - Ok(result) - } - async fn plan_reads( keyspace: &KeySpace, lsn_range: Range, diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 4c22541e02..4a095c564d 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -28,7 +28,7 @@ use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; use crate::page_cache::{self, FileId, PAGE_SZ}; use crate::repository::{Key, Value, KEY_SIZE}; use crate::tenant::blob_io::BlobWriter; -use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader}; +use crate::tenant::block_io::{BlockBuf, FileBlockReader}; use crate::tenant::disk_btree::{ DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection, }; @@ -453,33 +453,6 @@ impl ImageLayerInner { Ok(()) } - /// Load all key-values in the delta layer, should be replaced by an iterator-based interface in the future. - pub(super) async fn load_key_values( - &self, - ctx: &RequestContext, - ) -> anyhow::Result> { - let block_reader = FileBlockReader::new(&self.file, self.file_id); - let tree_reader = - DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader); - let mut result = Vec::new(); - let mut stream = Box::pin(tree_reader.into_stream(&[0; KEY_SIZE], ctx)); - let block_reader = FileBlockReader::new(&self.file, self.file_id); - let cursor = block_reader.block_cursor(); - while let Some(item) = stream.next().await { - // TODO: dedup code with get_reconstruct_value - let (raw_key, offset) = item?; - let key = Key::from_slice(&raw_key[..KEY_SIZE]); - // TODO: ctx handling and sharding - let blob = cursor - .read_blob(offset, ctx) - .await - .with_context(|| format!("failed to read value from offset {}", offset))?; - let value = Bytes::from(blob); - result.push((key, self.lsn, Value::Image(value))); - } - Ok(result) - } - /// Traverse the layer's index to build read operations on the overlap of the input keyspace /// and the keys in this layer. 
/// @@ -711,7 +684,7 @@ struct ImageLayerWriterInner { blob_writer: BlobWriter, tree: DiskBtreeBuilder, - #[cfg_attr(not(feature = "testing"), allow(dead_code))] + #[cfg(feature = "testing")] last_written_key: Key, } @@ -770,6 +743,7 @@ impl ImageLayerWriterInner { uncompressed_bytes_eligible: 0, uncompressed_bytes_chosen: 0, num_keys: 0, + #[cfg(feature = "testing")] last_written_key: Key::MIN, }; diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 86a200ce28..56f5cc556d 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -14,7 +14,6 @@ use utils::sync::{gate, heavier_once_cell}; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; -use crate::repository::Key; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::task_mgr::TaskKind; use crate::tenant::timeline::{CompactionError, GetVectoredError}; @@ -334,23 +333,6 @@ impl Layer { }) } - /// Get all key/values in the layer. Should be replaced with an iterator-based API in the future. - #[allow(dead_code)] - pub(crate) async fn load_key_values( - &self, - ctx: &RequestContext, - ) -> anyhow::Result> { - let layer = self - .0 - .get_or_maybe_download(true, Some(ctx)) - .await - .map_err(|err| match err { - DownloadError::DownloadCancelled => GetVectoredError::Cancelled, - other => GetVectoredError::Other(anyhow::anyhow!(other)), - })?; - layer.load_key_values(&self.0, ctx).await - } - /// Download the layer if evicted. /// /// Will not error when the layer is already downloaded. @@ -1777,19 +1759,6 @@ impl DownloadedLayer { } } - async fn load_key_values( - &self, - owner: &Arc, - ctx: &RequestContext, - ) -> anyhow::Result> { - use LayerKind::*; - - match self.get(owner, ctx).await? { - Delta(d) => d.load_key_values(ctx).await, - Image(i) => i.load_key_values(ctx).await, - } - } - async fn dump(&self, owner: &Arc, ctx: &RequestContext) -> anyhow::Result<()> { use LayerKind::*; match self.get(owner, ctx).await? 
{ diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index bffd2db800..0b9bde4f57 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -782,7 +782,7 @@ async fn eviction_cancellation_on_drop() { let mut writer = timeline.writer().await; writer .put( - Key::from_i128(5), + crate::repository::Key::from_i128(5), Lsn(0x20), &Value::Image(Bytes::from_static(b"this does not matter either")), &ctx, diff --git a/pageserver/src/tenant/storage_layer/split_writer.rs b/pageserver/src/tenant/storage_layer/split_writer.rs index df910b5ad9..e8deb0a1e5 100644 --- a/pageserver/src/tenant/storage_layer/split_writer.rs +++ b/pageserver/src/tenant/storage_layer/split_writer.rs @@ -353,7 +353,7 @@ impl SplitDeltaLayerWriter { Ok(generated_layers) } - #[allow(dead_code)] + #[cfg(test)] pub(crate) async fn finish( self, tline: &Arc, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 35e0825bac..6eadf9a564 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4537,7 +4537,6 @@ pub struct DeltaLayerTestDesc { #[cfg(test)] impl DeltaLayerTestDesc { - #[allow(dead_code)] pub fn new(lsn_range: Range, key_range: Range, data: Vec<(Key, Lsn, Value)>) -> Self { Self { lsn_range, From bf0531d10703e1f6cd92e29ca69a9bb68503121e Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 2 Sep 2024 16:10:10 +0200 Subject: [PATCH 1541/1571] fixup(#8839): `test_forward_compatibility` needs to allow lag warning as well (#8891) Found in https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8885/10665614629/index.html#suites/0fbaeb107ef328d03993d44a1fb15690/ea10ba1c140fba1d --- test_runner/regress/test_compatibility.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index cd3f405a86..467e5b1734 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -149,6 +149,10 @@ def test_create_snapshot( ) +# check_neon_works does recovery from WAL => the compatibility snapshot's WAL is old => will log this warning +ingest_lag_log_line = ".*ingesting record with timestamp lagging more than wait_lsn_timeout.*" + + @check_ondisk_data_compatibility_if_enabled @pytest.mark.xdist_group("compatibility") @pytest.mark.order(after="test_create_snapshot") @@ -173,10 +177,6 @@ def test_backward_compatibility( try: neon_env_builder.num_safekeepers = 3 env = neon_env_builder.from_repo_dir(compatibility_snapshot_dir / "repo") - # check_neon_works does recovery from WAL => the compatibility snapshot's WAL is old => will log this warning - ingest_lag_log_line = ( - ".*ingesting record with timestamp lagging more than wait_lsn_timeout.*" - ) env.pageserver.allowed_errors.append(ingest_lag_log_line) neon_env_builder.start() @@ -246,6 +246,8 @@ def test_forward_compatibility( env = neon_env_builder.from_repo_dir( compatibility_snapshot_dir / "repo", ) + # there may be an arbitrary number of unrelated tests run between create_snapshot and here + env.pageserver.allowed_errors.append(ingest_lag_log_line) # not using env.pageserver.version because it was initialized before prev_pageserver_version_str = env.get_binary_version("pageserver") From 3b317cae071a7eb84247e616e15541868e292bd3 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 2 Sep 2024 17:09:26 +0200 Subject: [PATCH 
1542/1571] page_cache/layer load: correctly classify layer summary block reads (#8885) Before this PR, we would classify layer summary block reads as "Unknown" content kind. image --- pageserver/src/context.rs | 2 ++ pageserver/src/tenant/storage_layer/layer.rs | 12 +++++++++--- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs index 0b07e07524..012cb8d96f 100644 --- a/pageserver/src/context.rs +++ b/pageserver/src/context.rs @@ -105,8 +105,10 @@ pub struct RequestContext { #[derive(Clone, Copy, PartialEq, Eq, Debug, enum_map::Enum, strum_macros::IntoStaticStr)] pub enum PageContentKind { Unknown, + DeltaLayerSummary, DeltaLayerBtreeNode, DeltaLayerValue, + ImageLayerSummary, ImageLayerBtreeNode, ImageLayerValue, InMemoryLayer, diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 56f5cc556d..b15cd4da39 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -13,7 +13,7 @@ use utils::lsn::Lsn; use utils::sync::{gate, heavier_once_cell}; use crate::config::PageServerConf; -use crate::context::{DownloadBehavior, RequestContext}; +use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder}; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::task_mgr::TaskKind; use crate::tenant::timeline::{CompactionError, GetVectoredError}; @@ -1678,6 +1678,9 @@ impl DownloadedLayer { ); let res = if owner.desc.is_delta { + let ctx = RequestContextBuilder::extend(ctx) + .page_content_kind(crate::context::PageContentKind::DeltaLayerSummary) + .build(); let summary = Some(delta_layer::Summary::expected( owner.desc.tenant_shard_id.tenant_id, owner.desc.timeline_id, @@ -1688,11 +1691,14 @@ impl DownloadedLayer { &owner.path, summary, Some(owner.conf.max_vectored_read_bytes), - ctx, + &ctx, ) .await .map(LayerKind::Delta) } else { + let ctx = RequestContextBuilder::extend(ctx) + .page_content_kind(crate::context::PageContentKind::ImageLayerSummary) + .build(); let lsn = owner.desc.image_layer_lsn(); let summary = Some(image_layer::Summary::expected( owner.desc.tenant_shard_id.tenant_id, @@ -1705,7 +1711,7 @@ impl DownloadedLayer { lsn, summary, Some(owner.conf.max_vectored_read_bytes), - ctx, + &ctx, ) .await .map(LayerKind::Image) From b37da32c6f56f31f39661c9364a7a662df59dbbc Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 3 Sep 2024 10:05:24 +0200 Subject: [PATCH 1543/1571] pageserver: reuse idempotency keys across metrics sinks (#8876) ## Problem Metrics event idempotency keys differ across S3 and Vector. The events should be identical. Resolves #8605. ## Summary of changes Pre-generate the idempotency keys and pass the same set into both metrics sinks. Co-authored-by: John Spray --- pageserver/src/consumption_metrics.rs | 24 +++++++-- pageserver/src/consumption_metrics/upload.rs | 52 +++++++++++--------- 2 files changed, 47 insertions(+), 29 deletions(-) diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index f94d945d46..64a267e0e4 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -1,6 +1,8 @@ //! Periodically collect consumption metrics for all active tenants //! and push them to a HTTP endpoint. 
use crate::config::PageServerConf; +use crate::consumption_metrics::metrics::MetricsKey; +use crate::consumption_metrics::upload::KeyGen as _; use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; use crate::tenant::size::CalculateSyntheticSizeError; @@ -8,6 +10,7 @@ use crate::tenant::tasks::BackgroundLoopKind; use crate::tenant::{mgr::TenantManager, LogicalSizeCalculationCause, Tenant}; use camino::Utf8PathBuf; use consumption_metrics::EventType; +use itertools::Itertools as _; use pageserver_api::models::TenantState; use remote_storage::{GenericRemoteStorage, RemoteStorageConfig}; use reqwest::Url; @@ -19,9 +22,8 @@ use tokio_util::sync::CancellationToken; use tracing::*; use utils::id::NodeId; -mod metrics; -use crate::consumption_metrics::metrics::MetricsKey; mod disk_cache; +mod metrics; mod upload; const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60); @@ -143,6 +145,12 @@ async fn collect_metrics( // these are point in time, with variable "now" let metrics = metrics::collect_all_metrics(&tenant_manager, &cached_metrics, &ctx).await; + // Pre-generate event idempotency keys, to reuse them across the bucket + // and HTTP sinks. + let idempotency_keys = std::iter::repeat_with(|| node_id.as_str().generate()) + .take(metrics.len()) + .collect_vec(); + let metrics = Arc::new(metrics); // why not race cancellation here? because we are one of the last tasks, and if we are @@ -161,8 +169,14 @@ async fn collect_metrics( } if let Some(bucket_client) = &bucket_client { - let res = - upload::upload_metrics_bucket(bucket_client, &cancel, &node_id, &metrics).await; + let res = upload::upload_metrics_bucket( + bucket_client, + &cancel, + &node_id, + &metrics, + &idempotency_keys, + ) + .await; if let Err(e) = res { tracing::error!("failed to upload to S3: {e:#}"); } @@ -174,9 +188,9 @@ async fn collect_metrics( &client, metric_collection_endpoint, &cancel, - &node_id, &metrics, &mut cached_metrics, + &idempotency_keys, ) .await; if let Err(e) = res { diff --git a/pageserver/src/consumption_metrics/upload.rs b/pageserver/src/consumption_metrics/upload.rs index 4e8283c3e4..0325ee403a 100644 --- a/pageserver/src/consumption_metrics/upload.rs +++ b/pageserver/src/consumption_metrics/upload.rs @@ -24,16 +24,16 @@ pub(super) async fn upload_metrics_http( client: &reqwest::Client, metric_collection_endpoint: &reqwest::Url, cancel: &CancellationToken, - node_id: &str, metrics: &[RawMetric], cached_metrics: &mut Cache, + idempotency_keys: &[IdempotencyKey<'_>], ) -> anyhow::Result<()> { let mut uploaded = 0; let mut failed = 0; let started_at = std::time::Instant::now(); - let mut iter = serialize_in_chunks(CHUNK_SIZE, metrics, node_id); + let mut iter = serialize_in_chunks(CHUNK_SIZE, metrics, idempotency_keys); while let Some(res) = iter.next() { let (chunk, body) = res?; @@ -87,6 +87,7 @@ pub(super) async fn upload_metrics_bucket( cancel: &CancellationToken, node_id: &str, metrics: &[RawMetric], + idempotency_keys: &[IdempotencyKey<'_>], ) -> anyhow::Result<()> { if metrics.is_empty() { // Skip uploads if we have no metrics, so that readers don't have to handle the edge case @@ -106,7 +107,7 @@ pub(super) async fn upload_metrics_bucket( // Serialize and write into compressed buffer let started_at = std::time::Instant::now(); - for res in serialize_in_chunks(CHUNK_SIZE, metrics, node_id) { + for res in serialize_in_chunks(CHUNK_SIZE, metrics, idempotency_keys) { let (_chunk, body) = res?; gzip_writer.write_all(&body).await?; 
} @@ -134,29 +135,31 @@ pub(super) async fn upload_metrics_bucket( Ok(()) } -// The return type is quite ugly, but we gain testability in isolation -fn serialize_in_chunks<'a, F>( +/// Serializes the input metrics as JSON in chunks of chunk_size. The provided +/// idempotency keys are injected into the corresponding metric events (reused +/// across different metrics sinks), and must have the same length as input. +fn serialize_in_chunks<'a>( chunk_size: usize, input: &'a [RawMetric], - factory: F, + idempotency_keys: &'a [IdempotencyKey<'a>], ) -> impl ExactSizeIterator> + 'a -where - F: KeyGen<'a> + 'a, { use bytes::BufMut; - struct Iter<'a, F> { + assert_eq!(input.len(), idempotency_keys.len()); + + struct Iter<'a> { inner: std::slice::Chunks<'a, RawMetric>, + idempotency_keys: std::slice::Iter<'a, IdempotencyKey<'a>>, chunk_size: usize, // write to a BytesMut so that we can cheaply clone the frozen Bytes for retries buffer: bytes::BytesMut, // chunk amount of events are reused to produce the serialized document scratch: Vec>, - factory: F, } - impl<'a, F: KeyGen<'a>> Iterator for Iter<'a, F> { + impl<'a> Iterator for Iter<'a> { type Item = Result<(&'a [RawMetric], bytes::Bytes), serde_json::Error>; fn next(&mut self) -> Option { @@ -167,17 +170,14 @@ where self.scratch.extend( chunk .iter() - .map(|raw_metric| raw_metric.as_event(&self.factory.generate())), + .zip(&mut self.idempotency_keys) + .map(|(raw_metric, key)| raw_metric.as_event(key)), ); } else { // next rounds: update_in_place to reuse allocations assert_eq!(self.scratch.len(), self.chunk_size); - self.scratch - .iter_mut() - .zip(chunk.iter()) - .for_each(|(slot, raw_metric)| { - raw_metric.update_in_place(slot, &self.factory.generate()) - }); + itertools::izip!(self.scratch.iter_mut(), chunk, &mut self.idempotency_keys) + .for_each(|(slot, raw_metric, key)| raw_metric.update_in_place(slot, key)); } let res = serde_json::to_writer( @@ -198,18 +198,19 @@ where } } - impl<'a, F: KeyGen<'a>> ExactSizeIterator for Iter<'a, F> {} + impl<'a> ExactSizeIterator for Iter<'a> {} let buffer = bytes::BytesMut::new(); let inner = input.chunks(chunk_size); + let idempotency_keys = idempotency_keys.iter(); let scratch = Vec::new(); Iter { inner, + idempotency_keys, chunk_size, buffer, scratch, - factory, } } @@ -268,7 +269,7 @@ impl RawMetricExt for RawMetric { } } -trait KeyGen<'a>: Copy { +pub(crate) trait KeyGen<'a> { fn generate(&self) -> IdempotencyKey<'a>; } @@ -389,7 +390,10 @@ mod tests { let examples = metric_samples(); assert!(examples.len() > 1); - let factory = FixedGen::new(Utc::now(), "1", 42); + let now = Utc::now(); + let idempotency_keys = (0..examples.len()) + .map(|i| FixedGen::new(now, "1", i as u16).generate()) + .collect::>(); // need to use Event here because serde_json::Value uses default hashmap, not linked // hashmap @@ -398,13 +402,13 @@ mod tests { events: Vec>, } - let correct = serialize_in_chunks(examples.len(), &examples, factory) + let correct = serialize_in_chunks(examples.len(), &examples, &idempotency_keys) .map(|res| res.unwrap().1) .flat_map(|body| serde_json::from_slice::(&body).unwrap().events) .collect::>(); for chunk_size in 1..examples.len() { - let actual = serialize_in_chunks(chunk_size, &examples, factory) + let actual = serialize_in_chunks(chunk_size, &examples, &idempotency_keys) .map(|res| res.unwrap().1) .flat_map(|body| serde_json::from_slice::(&body).unwrap().events) .collect::>(); From c43e664ff577d4568722e4e7a2b2c6267b609607 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 3 Sep 
2024 15:11:30 +0100 Subject: [PATCH 1544/1571] storcon: provide an az id in metadata.json from neon local (#8897) ## Problem Neon local set-up does not inject an az id in `metadata.json`. See real change in https://github.com/neondatabase/neon/pull/8852. ## Summary of changes We piggyback on the existing `availability_zone` pageserver configuration in order to avoid making neon local even more complex. --- control_plane/src/pageserver.rs | 23 ++++++++++++++++++- test_runner/fixtures/neon_fixtures.py | 12 +++++----- .../fixtures/pageserver/allowed_errors.py | 3 --- 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 399b1c2653..31777eb7a5 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -181,6 +181,23 @@ impl PageServerNode { ); io::stdout().flush()?; + // If the config file we got as a CLI argument includes the `availability_zone` + // config, then use that to populate the `metadata.json` file for the pageserver. + // In production the deployment orchestrator does this for us. + let az_id = conf + .other + .get("availability_zone") + .map(|toml| { + let az_str = toml.to_string(); + // Trim the (") chars from the toml representation + if az_str.starts_with('"') && az_str.ends_with('"') { + az_str[1..az_str.len() - 1].to_string() + } else { + az_str + } + }) + .unwrap_or("local".to_string()); + let config = self .pageserver_init_make_toml(conf) .context("make pageserver toml")?; @@ -216,6 +233,7 @@ impl PageServerNode { let (_http_host, http_port) = parse_host_port(&self.conf.listen_http_addr).expect("Unable to parse listen_http_addr"); let http_port = http_port.unwrap_or(9898); + // Intentionally hand-craft JSON: this acts as an implicit format compat test // in case the pageserver-side structure is edited, and reflects the real life // situation: the metadata is written by some other script. 
@@ -226,7 +244,10 @@ impl PageServerNode { postgres_port: self.pg_connection_config.port(), http_host: "localhost".to_string(), http_port, - other: HashMap::new(), + other: HashMap::from([( + "availability_zone_id".to_string(), + serde_json::json!(az_id), + )]), }) .unwrap(), ) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 800ae03d13..0cbab71cc3 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1164,6 +1164,8 @@ class NeonEnv: "listen_http_addr": f"localhost:{pageserver_port.http}", "pg_auth_type": pg_auth_type, "http_auth_type": http_auth_type, + # Default which can be overriden with `NeonEnvBuilder.pageserver_config_override` + "availability_zone": "us-east-2a", } if self.pageserver_virtual_file_io_engine is not None: ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine @@ -1192,11 +1194,7 @@ class NeonEnv: # Create a corresponding NeonPageserver object self.pageservers.append( - NeonPageserver( - self, - ps_id, - port=pageserver_port, - ) + NeonPageserver(self, ps_id, port=pageserver_port, az_id=ps_cfg["availability_zone"]) ) cfg["pageservers"].append(ps_cfg) @@ -2400,6 +2398,7 @@ class NeonStorageController(MetricsGetter, LogUtils): "listen_http_port": node.service_port.http, "listen_pg_addr": "localhost", "listen_pg_port": node.service_port.pg, + "availability_zone_id": node.az_id, } log.info(f"node_register({body})") self.request( @@ -2923,10 +2922,11 @@ class NeonPageserver(PgProtocol, LogUtils): TEMP_FILE_SUFFIX = "___temp" - def __init__(self, env: NeonEnv, id: int, port: PageserverPort): + def __init__(self, env: NeonEnv, id: int, port: PageserverPort, az_id: str): super().__init__(host="localhost", port=port.pg, user="cloud_admin") self.env = env self.id = id + self.az_id = az_id self.running = False self.service_port = port self.version = env.get_binary_version("pageserver") diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index 70f2676245..f8d9a51c91 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -109,9 +109,6 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [ # controller's attempts to notify the endpoint). ".*reconciler.*neon_local notification hook failed.*", ".*reconciler.*neon_local error.*", - # Neon local does not provide pageserver with an AZ - # TODO: remove this once neon local does so - ".*registering without specific availability zone id.*", ] From 3916810f203cb086d4d6f6db760a39e5cffb0223 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 28 Aug 2024 17:39:13 +0300 Subject: [PATCH 1545/1571] safekeeper: add remote_path to Timeline It is used in many places, let's reduce number of ? on construction results. 
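Roughly the shape of the change, as a hedged sketch with stand-in types (not the actual safekeeper `Timeline`/`RemotePath` definitions): compute the remote prefix once in the constructor, then let call sites borrow it infallibly instead of repeating the fallible helper.

```rust
use anyhow::Result;

// Stand-ins for TenantTimelineId / RemotePath, for illustration only.
struct TtidSketch(String);
struct RemotePathSketch(String);

// The previously repeated fallible helper.
fn remote_timeline_path_sketch(ttid: &TtidSketch) -> Result<RemotePathSketch> {
    Ok(RemotePathSketch(format!("tenants/{}", ttid.0)))
}

struct TimelineSketch {
    ttid: TtidSketch,
    // Computed once at construction time.
    remote_path: RemotePathSketch,
}

impl TimelineSketch {
    fn new(ttid: TtidSketch) -> Result<Self> {
        let remote_path = remote_timeline_path_sketch(&ttid)?; // the only `?` left
        Ok(Self { ttid, remote_path })
    }

    // Call sites just borrow the cached value; no error propagation needed.
    fn segment_path(&self, seg_name: &str) -> String {
        format!("{}/{}", self.remote_path.0, seg_name)
    }
}
```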
--- safekeeper/src/pull_timeline.rs | 6 +++--- safekeeper/src/timeline.rs | 8 +++++++- safekeeper/src/timeline_eviction.rs | 12 ++++-------- safekeeper/src/wal_backup.rs | 2 +- safekeeper/src/wal_backup_partial.rs | 10 ++-------- 5 files changed, 17 insertions(+), 21 deletions(-) diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index 1eacec9981..600a6bd8f0 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -183,10 +183,10 @@ impl WalResidentTimeline { "Replacing uploaded partial segment in in-mem control file: {replace:?}" ); - let remote_timeline_path = wal_backup::remote_timeline_path(&self.tli.ttid)?; + let remote_timeline_path = &self.tli.remote_path; wal_backup::copy_partial_segment( - &replace.previous.remote_path(&remote_timeline_path), - &replace.current.remote_path(&remote_timeline_path), + &replace.previous.remote_path(remote_timeline_path), + &replace.current.remote_path(remote_timeline_path), ) .await?; } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 57935d879f..f7c96d4f02 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -3,6 +3,7 @@ use anyhow::{anyhow, bail, Result}; use camino::Utf8PathBuf; +use remote_storage::RemotePath; use serde::{Deserialize, Serialize}; use tokio::fs::{self}; use tokio_util::sync::CancellationToken; @@ -36,7 +37,7 @@ use crate::state::{EvictionState, TimelineMemState, TimelinePersistentState, Tim use crate::timeline_guard::ResidenceGuard; use crate::timeline_manager::{AtomicStatus, ManagerCtl}; use crate::timelines_set::TimelinesSet; -use crate::wal_backup::{self}; +use crate::wal_backup::{self, remote_timeline_path}; use crate::wal_backup_partial::PartialRemoteSegment; use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION}; @@ -469,6 +470,7 @@ impl From for ApiError { /// It also holds SharedState and provides mutually exclusive access to it. pub struct Timeline { pub ttid: TenantTimelineId, + pub remote_path: RemotePath, /// Used to broadcast commit_lsn updates to all background jobs. 
commit_lsn_watch_tx: watch::Sender, @@ -519,8 +521,10 @@ impl Timeline { let (shared_state_version_tx, shared_state_version_rx) = watch::channel(0); let walreceivers = WalReceivers::new(); + let remote_path = remote_timeline_path(&ttid)?; Ok(Timeline { ttid, + remote_path, commit_lsn_watch_tx, commit_lsn_watch_rx, term_flush_lsn_watch_tx, @@ -557,8 +561,10 @@ impl Timeline { TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn); let walreceivers = WalReceivers::new(); + let remote_path = remote_timeline_path(&ttid)?; Ok(Timeline { ttid, + remote_path, commit_lsn_watch_tx, commit_lsn_watch_rx, term_flush_lsn_watch_tx, diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs index ae6f3f4b7e..2ccb058720 100644 --- a/safekeeper/src/timeline_eviction.rs +++ b/safekeeper/src/timeline_eviction.rs @@ -167,7 +167,7 @@ async fn redownload_partial_segment( partial: &PartialRemoteSegment, ) -> anyhow::Result<()> { let tmp_file = mgr.tli.timeline_dir().join("remote_partial.tmp"); - let remote_segfile = remote_segment_path(mgr, partial)?; + let remote_segfile = remote_segment_path(mgr, partial); debug!( "redownloading partial segment: {} -> {}", @@ -252,7 +252,7 @@ async fn do_validation( ); } - let remote_segfile = remote_segment_path(mgr, partial)?; + let remote_segfile = remote_segment_path(mgr, partial); let mut remote_reader: std::pin::Pin> = wal_backup::read_object(&remote_segfile, 0).await?; @@ -279,12 +279,8 @@ fn local_segment_path(mgr: &Manager, partial: &PartialRemoteSegment) -> Utf8Path local_partial_segfile } -fn remote_segment_path( - mgr: &Manager, - partial: &PartialRemoteSegment, -) -> anyhow::Result { - let remote_timeline_path = wal_backup::remote_timeline_path(&mgr.tli.ttid)?; - Ok(partial.remote_path(&remote_timeline_path)) +fn remote_segment_path(mgr: &Manager, partial: &PartialRemoteSegment) -> RemotePath { + partial.remote_path(&mgr.tli.remote_path) } /// Compare first `n` bytes of two readers. If the bytes differ, return an error. 
diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index aa1a6696a1..1c9ec5c007 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -315,7 +315,7 @@ async fn backup_lsn_range( anyhow::bail!("parallel_jobs must be >= 1"); } - let remote_timeline_path = remote_timeline_path(&timeline.ttid)?; + let remote_timeline_path = &timeline.remote_path; let start_lsn = *backup_lsn; let segments = get_segments(start_lsn, end_lsn, wal_seg_size); diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index 675a051887..4022c9409b 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -31,7 +31,7 @@ use crate::{ safekeeper::Term, timeline::WalResidentTimeline, timeline_manager::StateSnapshot, - wal_backup::{self, remote_timeline_path}, + wal_backup::{self}, SafeKeeperConf, }; @@ -388,13 +388,7 @@ pub async fn main_task( let wal_seg_size = tli.get_wal_seg_size().await; let local_prefix = tli.get_timeline_dir(); - let remote_timeline_path = match remote_timeline_path(&tli.ttid) { - Ok(path) => path, - Err(e) => { - error!("failed to create remote path: {:?}", e); - return None; - } - }; + let remote_timeline_path = tli.remote_path.clone(); let mut backup = PartialBackup { wal_seg_size, From 80512e2779f40af7602fe3221ccc7eaa0499e61e Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 30 Aug 2024 12:35:41 +0300 Subject: [PATCH 1546/1571] safekeeper: add endpoint resetting uploaded partial segment state. Endpoint implementation sends msg to manager requesting to do the reset. Manager stops current partial backup upload task if it exists and performs the reset. Also slightly tweak eviction condition: all full segments before flush_lsn must be uploaded (and committed) and there must be only one segment left on disk (partial). This allows to evict timelines which started not on the first segment and didn't fill the whole segment (previous condition wasn't good because last_removed_segno was 0). ref https://github.com/neondatabase/neon/issues/8759 --- safekeeper/src/http/routes.rs | 23 ++++++ safekeeper/src/timeline.rs | 4 + safekeeper/src/timeline_eviction.rs | 21 +++-- safekeeper/src/timeline_manager.rs | 88 ++++++++++++++++++-- safekeeper/src/wal_backup.rs | 6 +- safekeeper/src/wal_backup_partial.rs | 101 +++++++++++++++++------ test_runner/fixtures/neon_fixtures.py | 6 +- test_runner/fixtures/safekeeper/http.py | 24 ++++++ test_runner/regress/test_wal_acceptor.py | 99 +++++++++++++++++++++- 9 files changed, 325 insertions(+), 47 deletions(-) diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 91ffa95c21..9b7424a818 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -389,6 +389,25 @@ async fn timeline_digest_handler(request: Request) -> Result) -> Result, ApiError> { + let ttid = TenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "timeline_id")?, + ); + check_permission(&request, Some(ttid.tenant_id))?; + + let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?; + + let response = tli + .backup_partial_reset() + .await + .map_err(ApiError::InternalServerError)?; + json_response(StatusCode::OK, response) +} + /// Used only in tests to hand craft required data. 
async fn record_safekeeper_info(mut request: Request) -> Result, ApiError> { let ttid = TenantTimelineId::new( @@ -607,6 +626,10 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder .get("/v1/tenant/:tenant_id/timeline/:timeline_id/digest", |r| { request_span(r, timeline_digest_handler) }) + .post( + "/v1/tenant/:tenant_id/timeline/:timeline_id/backup_partial_reset", + |r| request_span(r, timeline_backup_partial_reset), + ) .post("/v1/record_safekeeper_info/:tenant_id/:timeline_id", |r| { request_span(r, record_safekeeper_info) }) diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index f7c96d4f02..95ee925e1a 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -908,6 +908,10 @@ impl Timeline { Ok(WalResidentTimeline::new(self.clone(), guard)) } + + pub async fn backup_partial_reset(self: &Arc) -> Result> { + self.manager_ctl.backup_partial_reset().await + } } /// This is a guard that allows to read/write disk timeline state. diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs index 2ccb058720..5d0567575c 100644 --- a/safekeeper/src/timeline_eviction.rs +++ b/safekeeper/src/timeline_eviction.rs @@ -28,28 +28,38 @@ impl Manager { /// - control file is flushed (no next event scheduled) /// - no WAL residence guards /// - no pushes to the broker - /// - partial WAL backup is uploaded + /// - last partial WAL segment is uploaded + /// - all local segments before the uploaded partial are committed and uploaded pub(crate) fn ready_for_eviction( &self, next_event: &Option, state: &StateSnapshot, ) -> bool { - self.backup_task.is_none() + let ready = self.backup_task.is_none() && self.recovery_task.is_none() && self.wal_removal_task.is_none() && self.partial_backup_task.is_none() - && self.partial_backup_uploaded.is_some() && next_event.is_none() && self.access_service.is_empty() && !self.tli_broker_active.get() + // Partial segment of current flush_lsn is uploaded up to this flush_lsn. && !wal_backup_partial::needs_uploading(state, &self.partial_backup_uploaded) + // And it is the next one after the last removed. Given that local + // WAL is removed only after it is uploaded to s3 (and pageserver + // advancing remote_consistent_lsn) which happens only after WAL is + // committed, true means all this is done. + // + // This also works for the first segment despite last_removed_segno + // being 0 on init because this 0 triggers run of wal_removal_task + // on success of which manager updates the horizon. && self .partial_backup_uploaded .as_ref() .unwrap() .flush_lsn .segment_number(self.wal_seg_size) - == self.last_removed_segno + 1 + == self.last_removed_segno + 1; + ready } /// Evict the timeline to remote storage. @@ -83,7 +93,8 @@ impl Manager { info!("successfully evicted timeline"); } - /// Restore evicted timeline from remote storage. + /// Attempt to restore evicted timeline from remote storage; it must be + /// offloaded. 
#[instrument(name = "unevict_timeline", skip_all)] pub(crate) async fn unevict_timeline(&mut self) { assert!(self.is_offloaded); diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index 482614fac7..f997f48454 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -11,12 +11,14 @@ use std::{ time::Duration, }; +use futures::channel::oneshot; use postgres_ffi::XLogSegNo; use serde::{Deserialize, Serialize}; use tokio::{ task::{JoinError, JoinHandle}, time::Instant, }; +use tokio_util::sync::CancellationToken; use tracing::{debug, info, info_span, instrument, warn, Instrument}; use utils::lsn::Lsn; @@ -33,7 +35,7 @@ use crate::{ timeline_guard::{AccessService, GuardId, ResidenceGuard}, timelines_set::{TimelineSetGuard, TimelinesSet}, wal_backup::{self, WalBackupTaskHandle}, - wal_backup_partial::{self, PartialRemoteSegment}, + wal_backup_partial::{self, PartialBackup, PartialRemoteSegment}, SafeKeeperConf, }; @@ -96,6 +98,8 @@ pub enum ManagerCtlMessage { GuardRequest(tokio::sync::oneshot::Sender>), /// Request to drop the guard. GuardDrop(GuardId), + /// Request to reset uploaded partial backup state. + BackupPartialReset(oneshot::Sender>>), } impl std::fmt::Debug for ManagerCtlMessage { @@ -103,6 +107,7 @@ impl std::fmt::Debug for ManagerCtlMessage { match self { ManagerCtlMessage::GuardRequest(_) => write!(f, "GuardRequest"), ManagerCtlMessage::GuardDrop(id) => write!(f, "GuardDrop({:?})", id), + ManagerCtlMessage::BackupPartialReset(_) => write!(f, "BackupPartialReset"), } } } @@ -143,6 +148,19 @@ impl ManagerCtl { .and_then(std::convert::identity) } + /// Request timeline manager to reset uploaded partial segment state and + /// wait for the result. + pub async fn backup_partial_reset(&self) -> anyhow::Result> { + let (tx, rx) = oneshot::channel(); + self.manager_tx + .send(ManagerCtlMessage::BackupPartialReset(tx)) + .expect("manager task is not running"); + match rx.await { + Ok(res) => res, + Err(_) => anyhow::bail!("timeline manager is gone"), + } + } + /// Must be called exactly once to bootstrap the manager. pub fn bootstrap_manager( &self, @@ -181,7 +199,8 @@ pub(crate) struct Manager { pub(crate) wal_removal_task: Option>>, // partial backup - pub(crate) partial_backup_task: Option>>, + pub(crate) partial_backup_task: + Option<(JoinHandle>, CancellationToken)>, pub(crate) partial_backup_uploaded: Option, // misc @@ -302,12 +321,12 @@ pub async fn main_task( _ = sleep_until(&next_event) => { // we were waiting for some event (e.g. cfile save) } - res = await_task_finish(&mut mgr.wal_removal_task) => { + res = await_task_finish(mgr.wal_removal_task.as_mut()) => { // WAL removal task finished mgr.wal_removal_task = None; mgr.update_wal_removal_end(res); } - res = await_task_finish(&mut mgr.partial_backup_task) => { + res = await_task_finish(mgr.partial_backup_task.as_mut().map(|(handle, _)| handle)) => { // partial backup task finished mgr.partial_backup_task = None; mgr.update_partial_backup_end(res); @@ -335,8 +354,9 @@ pub async fn main_task( } } - if let Some(partial_backup_task) = &mut mgr.partial_backup_task { - if let Err(e) = partial_backup_task.await { + if let Some((handle, cancel)) = &mut mgr.partial_backup_task { + cancel.cancel(); + if let Err(e) = handle.await { warn!("partial backup task failed: {:?}", e); } } @@ -560,11 +580,14 @@ impl Manager { } // Get WalResidentTimeline and start partial backup task. 
- self.partial_backup_task = Some(tokio::spawn(wal_backup_partial::main_task( + let cancel = CancellationToken::new(); + let handle = tokio::spawn(wal_backup_partial::main_task( self.wal_resident_timeline(), self.conf.clone(), self.global_rate_limiter.clone(), - ))); + cancel.clone(), + )); + self.partial_backup_task = Some((handle, cancel)); } /// Update the state after partial WAL backup task finished. @@ -579,6 +602,39 @@ impl Manager { } } + /// Reset partial backup state and remove its remote storage data. Since it + /// might concurrently uploading something, cancel the task first. + async fn backup_partial_reset(&mut self) -> anyhow::Result> { + info!("resetting partial backup state"); + // Force unevict timeline if it is evicted before erasing partial backup + // state. The intended use of this function is to drop corrupted remote + // state; we haven't enabled local files deletion yet anywhere, + // so direct switch is safe. + if self.is_offloaded { + self.tli.switch_to_present().await?; + // switch manager state as soon as possible + self.is_offloaded = false; + } + + if let Some((handle, cancel)) = &mut self.partial_backup_task { + cancel.cancel(); + info!("cancelled partial backup task, awaiting it"); + // we're going to reset .partial_backup_uploaded to None anyway, so ignore the result + handle.await.ok(); + self.partial_backup_task = None; + } + + let tli = self.wal_resident_timeline(); + let mut partial_backup = PartialBackup::new(tli, self.conf.clone()).await; + // Reset might fail e.g. when cfile is already reset but s3 removal + // failed, so set manager state to None beforehand. In any case caller + // is expected to retry until success. + self.partial_backup_uploaded = None; + let res = partial_backup.reset().await?; + info!("reset is done"); + Ok(res) + } + /// Handle message arrived from ManagerCtl. async fn handle_message(&mut self, msg: Option) { debug!("received manager message: {:?}", msg); @@ -602,6 +658,16 @@ impl Manager { Some(ManagerCtlMessage::GuardDrop(guard_id)) => { self.access_service.drop_guard(guard_id); } + Some(ManagerCtlMessage::BackupPartialReset(tx)) => { + info!("resetting uploaded partial backup state"); + let res = self.backup_partial_reset().await; + if let Err(ref e) = res { + warn!("failed to reset partial backup state: {:?}", e); + } + if tx.send(res).is_err() { + warn!("failed to send partial backup reset result, receiver dropped"); + } + } None => { // can't happen, we're holding the sender unreachable!(); @@ -619,7 +685,11 @@ async fn sleep_until(option: &Option) { } } -async fn await_task_finish(option: &mut Option>) -> Result { +/// Future that resolves when the task is finished or never if the task is None. +/// +/// Note: it accepts Option<&mut> instead of &mut Option<> because mapping the +/// option to get the latter is hard. 
+async fn await_task_finish(option: Option<&mut JoinHandle>) -> Result { if let Some(task) = option { task.await } else { diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 1c9ec5c007..95012bb004 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -328,11 +328,7 @@ async fn backup_lsn_range( loop { let added_task = match iter.next() { Some(s) => { - uploads.push_back(backup_single_segment( - s, - timeline_dir, - &remote_timeline_path, - )); + uploads.push_back(backup_single_segment(s, timeline_dir, remote_timeline_path)); true } None => false, diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index 4022c9409b..4f320f43f8 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -22,6 +22,7 @@ use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; use remote_storage::RemotePath; use serde::{Deserialize, Serialize}; +use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, instrument, warn}; use utils::{id::NodeId, lsn::Lsn}; @@ -145,7 +146,7 @@ impl State { } } -struct PartialBackup { +pub struct PartialBackup { wal_seg_size: usize, tli: WalResidentTimeline, conf: SafeKeeperConf, @@ -155,8 +156,25 @@ struct PartialBackup { state: State, } -// Read-only methods for getting segment names impl PartialBackup { + pub async fn new(tli: WalResidentTimeline, conf: SafeKeeperConf) -> PartialBackup { + let (_, persistent_state) = tli.get_state().await; + let wal_seg_size = tli.get_wal_seg_size().await; + + let local_prefix = tli.get_timeline_dir(); + let remote_timeline_path = tli.remote_path.clone(); + + PartialBackup { + wal_seg_size, + tli, + state: persistent_state.partial_backup, + conf, + local_prefix, + remote_timeline_path, + } + } + + // Read-only methods for getting segment names fn segno(&self, lsn: Lsn) -> XLogSegNo { lsn.segment_number(self.wal_seg_size) } @@ -297,6 +315,18 @@ impl PartialBackup { Ok(()) } + // Prepend to the given segments remote prefix and delete them from the + // remote storage. + async fn delete_segments(&self, segments_to_delete: &Vec) -> anyhow::Result<()> { + info!("deleting objects: {:?}", segments_to_delete); + let mut objects_to_delete = vec![]; + for seg in segments_to_delete.iter() { + let remote_path = self.remote_timeline_path.join(seg); + objects_to_delete.push(remote_path); + } + wal_backup::delete_objects(&objects_to_delete).await + } + /// Delete all non-Uploaded segments from the remote storage. There should be only one /// Uploaded segment at a time. #[instrument(name = "gc", skip_all)] @@ -329,15 +359,8 @@ impl PartialBackup { ); } - info!("deleting objects: {:?}", segments_to_delete); - let mut objects_to_delete = vec![]; - for seg in segments_to_delete.iter() { - let remote_path = self.remote_timeline_path.join(seg); - objects_to_delete.push(remote_path); - } - - // removing segments from remote storage - wal_backup::delete_objects(&objects_to_delete).await?; + // execute the deletion + self.delete_segments(&segments_to_delete).await?; // now we can update the state on disk let new_state = { @@ -349,6 +372,27 @@ impl PartialBackup { Ok(()) } + + /// Remove uploaded segment(s) from the state and remote storage. Aimed for + /// manual intervention, not normally needed. + /// Returns list of segments which potentially existed in the remote storage. 
+ pub async fn reset(&mut self) -> anyhow::Result> { + let segments_to_delete = self + .state + .segments + .iter() + .map(|seg| seg.name.clone()) + .collect(); + + // First reset cfile state, and only then objects themselves. If the + // later fails we might leave some garbage behind; that's ok for this + // single time usage. + let new_state = State { segments: vec![] }; + self.commit_state(new_state).await?; + + self.delete_segments(&segments_to_delete).await?; + Ok(segments_to_delete) + } } /// Check if everything is uploaded and partial backup task doesn't need to run. @@ -377,27 +421,16 @@ pub async fn main_task( tli: WalResidentTimeline, conf: SafeKeeperConf, limiter: RateLimiter, + cancel: CancellationToken, ) -> Option { debug!("started"); let await_duration = conf.partial_backup_timeout; let mut first_iteration = true; - let (_, persistent_state) = tli.get_state().await; let mut commit_lsn_rx = tli.get_commit_lsn_watch_rx(); let mut flush_lsn_rx = tli.get_term_flush_lsn_watch_rx(); - let wal_seg_size = tli.get_wal_seg_size().await; - let local_prefix = tli.get_timeline_dir(); - let remote_timeline_path = tli.remote_path.clone(); - - let mut backup = PartialBackup { - wal_seg_size, - tli, - state: persistent_state.partial_backup, - conf, - local_prefix, - remote_timeline_path, - }; + let mut backup = PartialBackup::new(tli, conf).await; debug!("state: {:?}", backup.state); @@ -427,6 +460,10 @@ pub async fn main_task( && flush_lsn_rx.borrow().term == seg.term { // we have nothing to do, the last segment is already uploaded + debug!( + "exiting, uploaded up to term={} flush_lsn={} commit_lsn={}", + seg.term, seg.flush_lsn, seg.commit_lsn + ); return Some(seg.clone()); } } @@ -438,6 +475,10 @@ pub async fn main_task( info!("timeline canceled"); return None; } + _ = cancel.cancelled() => { + info!("task canceled"); + return None; + } _ = flush_lsn_rx.changed() => {} } } @@ -464,6 +505,10 @@ pub async fn main_task( info!("timeline canceled"); return None; } + _ = cancel.cancelled() => { + info!("task canceled"); + return None; + } _ = commit_lsn_rx.changed() => {} _ = flush_lsn_rx.changed() => { let segno = backup.segno(flush_lsn_rx.borrow().lsn); @@ -486,7 +531,13 @@ pub async fn main_task( } // limit concurrent uploads - let _upload_permit = limiter.acquire_partial_backup().await; + let _upload_permit = tokio::select! { + acq = limiter.acquire_partial_backup() => acq, + _ = cancel.cancelled() => { + info!("task canceled"); + return None; + } + }; let prepared = backup.prepare_upload().await; if let Some(seg) = &uploaded_segment { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 0cbab71cc3..8c99408cfb 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4553,6 +4553,8 @@ class Safekeeper(LogUtils): def timeline_dir(self, tenant_id, timeline_id) -> Path: return self.data_dir / str(tenant_id) / str(timeline_id) + # List partial uploaded segments of this safekeeper. Works only for + # RemoteStorageKind.LOCAL_FS. 
def list_uploaded_segments(self, tenant_id: TenantId, timeline_id: TimelineId): tline_path = ( self.env.repo_dir @@ -4562,9 +4564,11 @@ class Safekeeper(LogUtils): / str(timeline_id) ) assert isinstance(self.env.safekeepers_remote_storage, LocalFsStorage) - return self._list_segments_in_dir( + segs = self._list_segments_in_dir( tline_path, lambda name: ".metadata" not in name and ".___temp" not in name ) + mysegs = [s for s in segs if f"sk{self.id}" in s] + return mysegs def list_segments(self, tenant_id, timeline_id) -> List[str]: """ diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index 05b43cfb72..9bf03554e7 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -174,6 +174,22 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): assert isinstance(res_json, dict) return res_json + def debug_dump_timeline( + self, timeline_id: TimelineId, params: Optional[Dict[str, str]] = None + ) -> Any: + params = params or {} + params["timeline_id"] = str(timeline_id) + dump = self.debug_dump(params) + return dump["timelines"][0] + + def get_partial_backup(self, timeline_id: TimelineId) -> Any: + dump = self.debug_dump_timeline(timeline_id, {"dump_control_file": "true"}) + return dump["control_file"]["partial_backup"] + + def get_eviction_state(self, timeline_id: TimelineId) -> Any: + dump = self.debug_dump_timeline(timeline_id, {"dump_control_file": "true"}) + return dump["control_file"]["eviction_state"] + def pull_timeline(self, body: Dict[str, Any]) -> Dict[str, Any]: res = self.post(f"http://localhost:{self.port}/v1/pull_timeline", json=body) res.raise_for_status() @@ -228,6 +244,14 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): assert isinstance(res_json, dict) return res_json + def backup_partial_reset(self, tenant_id: TenantId, timeline_id: TimelineId): + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/backup_partial_reset", + json={}, + ) + res.raise_for_status() + return res.json() + def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body): res = self.post( f"http://localhost:{self.port}/v1/record_safekeeper_info/{tenant_id}/{timeline_id}", diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 19df834b81..3785651aed 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -72,6 +72,17 @@ def wait_lsn_force_checkpoint( wait_lsn_force_checkpoint_at(lsn, tenant_id, timeline_id, ps, pageserver_conn_options) +def wait_lsn_force_checkpoint_at_sk( + safekeeper: Safekeeper, + tenant_id: TenantId, + timeline_id: TimelineId, + ps: NeonPageserver, + pageserver_conn_options=None, +): + sk_flush_lsn = safekeeper.get_flush_lsn(tenant_id, timeline_id) + wait_lsn_force_checkpoint_at(sk_flush_lsn, tenant_id, timeline_id, ps, pageserver_conn_options) + + def wait_lsn_force_checkpoint_at( lsn: Lsn, tenant_id: TenantId, @@ -79,6 +90,10 @@ def wait_lsn_force_checkpoint_at( ps: NeonPageserver, pageserver_conn_options=None, ): + """ + Wait until pageserver receives given lsn, force checkpoint and wait for + upload, i.e. remote_consistent_lsn advancement. + """ pageserver_conn_options = pageserver_conn_options or {} auth_token = None @@ -2330,6 +2345,77 @@ def test_s3_eviction( assert event_metrics_seen +# Test resetting uploaded partial segment state. 
+def test_backup_partial_reset(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 1 + neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage()) + # We want to upload/evict quickly, but not too quickly to check that s3 is + # empty before next round of upload happens. + # Note: this test fails with --delete-offloaded-wal, this is expected. + neon_env_builder.safekeeper_extra_opts = [ + "--enable-offload", + "--partial-backup-timeout", + "1s", + "--control-file-save-interval", + "1s", + "--eviction-min-resident=1s", + ] + # XXX: pageserver currently connects to safekeeper as long as connection + # manager doesn't remove its entry (default lagging_wal_timeout is 10s), + # causing uneviction. It should be fixed to not reconnect if last + # remote_consistent_lsn is communicated and there is nothing to fetch. Make + # value lower to speed up the test. + initial_tenant_conf = { + "lagging_wal_timeout": "1s", + } + env = neon_env_builder.init_start(initial_tenant_conf) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + endpoint = env.endpoints.create("main") + endpoint.start() + endpoint.safe_psql("create table t(key int, value text)") + endpoint.stop() + sk = env.safekeepers[0] + # eviction won't happen until remote_consistent_lsn catches up. + wait_lsn_force_checkpoint_at_sk(sk, tenant_id, timeline_id, env.pageserver) + + http_cli = env.safekeepers[0].http_client() + + # wait until eviction happens + def evicted(): + eviction_state = http_cli.get_eviction_state(timeline_id) + log.info(f"eviction_state: {eviction_state}") + if isinstance(eviction_state, str) and eviction_state == "Present": + raise Exception("eviction didn't happen yet") + + wait_until(30, 1, evicted) + # it must have uploaded something + uploaded_segs = sk.list_uploaded_segments(tenant_id, timeline_id) + log.info(f"uploaded segments before reset: {uploaded_segs}") + assert len(uploaded_segs) > 0 + + reset_res = http_cli.backup_partial_reset(tenant_id, timeline_id) + log.info(f"reset res: {reset_res}") + + # Backup_partial_reset must have reset the state and dropped s3 segment. + # + # Note: if listing takes more than --partial-backup-timeout test becomes + # flaky because file might be reuploaded. With local fs it shouldn't be an + # issue, but can add retry if this appears. + uploaded_segs = sk.list_uploaded_segments(tenant_id, timeline_id) + log.info(f"uploaded segments after reset: {uploaded_segs}") + assert len(uploaded_segs) == 0 + + # calling second time should be ok + http_cli.backup_partial_reset(tenant_id, timeline_id) + + # inserting data should be ok + endpoint.start() + endpoint.safe_psql("insert into t values(1, 'hehe')") + + def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilder): """ Verify that pulling timeline from a SK with an uploaded partial segment @@ -2357,7 +2443,16 @@ def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilde "--eviction-min-resident=500ms", ] - env = neon_env_builder.init_start(initial_tenant_conf={"checkpoint_timeout": "100ms"}) + # XXX: pageserver currently connects to safekeeper as long as connection + # manager doesn't remove its entry (default lagging_wal_timeout is 10s), + # causing uneviction. It should be fixed to not reconnect if last + # remote_consistent_lsn is communicated and there is nothing to fetch. Until + # this is fixed make value lower to speed up the test. 
+ initial_tenant_conf = { + "lagging_wal_timeout": "1s", + "checkpoint_timeout": "100ms", + } + env = neon_env_builder.init_start(initial_tenant_conf=initial_tenant_conf) tenant_id = env.initial_tenant timeline_id = env.initial_timeline @@ -2421,7 +2516,7 @@ def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilde endpoint.start(safekeepers=[2, 3]) def new_partial_segment_uploaded(): - segs = src_sk.list_uploaded_segments(tenant_id, timeline_id) + segs = dst_sk.list_uploaded_segments(tenant_id, timeline_id) for seg in segs: if "partial" in seg and "sk3" in seg: return seg From 83dd7f559c16aa0ed546b9fa6d78e04d32a01de1 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 3 Sep 2024 15:35:59 +0300 Subject: [PATCH 1547/1571] safekeeper: more consistent task naming. Make all them snake case. --- safekeeper/src/broker.rs | 2 +- safekeeper/src/recovery.rs | 2 +- safekeeper/src/wal_backup.rs | 2 +- safekeeper/src/wal_backup_partial.rs | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index 7cc2142291..485816408f 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -86,7 +86,7 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { } /// Subscribe and fetch all the interesting data from the broker. -#[instrument(name = "broker pull", skip_all)] +#[instrument(name = "broker_pull", skip_all)] async fn pull_loop(conf: SafeKeeperConf, stats: Arc) -> Result<()> { let mut client = storage_broker::connect(conf.broker_endpoint, conf.broker_keepalive_interval)?; diff --git a/safekeeper/src/recovery.rs b/safekeeper/src/recovery.rs index a59ff07b96..9c4149d8f1 100644 --- a/safekeeper/src/recovery.rs +++ b/safekeeper/src/recovery.rs @@ -35,7 +35,7 @@ use crate::{ /// Entrypoint for per timeline task which always runs, checking whether /// recovery for this safekeeper is needed and starting it if so. -#[instrument(name = "recovery task", skip_all, fields(ttid = %tli.ttid))] +#[instrument(name = "recovery", skip_all, fields(ttid = %tli.ttid))] pub async fn recovery_main(tli: WalResidentTimeline, conf: SafeKeeperConf) { info!("started"); diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 95012bb004..ef26ac99c5 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -203,7 +203,7 @@ struct WalBackupTask { } /// Offload single timeline. -#[instrument(name = "WAL backup", skip_all, fields(ttid = %tli.ttid))] +#[instrument(name = "wal_backup", skip_all, fields(ttid = %tli.ttid))] async fn backup_task_main( tli: WalResidentTimeline, parallel_jobs: usize, diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index 4f320f43f8..4050a82fff 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -416,7 +416,7 @@ pub(crate) fn needs_uploading( /// /// When there is nothing more to do and the last segment was successfully uploaded, the task /// returns PartialRemoteSegment, to signal readiness for offloading the timeline. 
-#[instrument(name = "Partial backup", skip_all, fields(ttid = %tli.ttid))] +#[instrument(name = "partial_backup", skip_all, fields(ttid = %tli.ttid))] pub async fn main_task( tli: WalResidentTimeline, conf: SafeKeeperConf, From c7187be8a11a43a0bc74d8745912df4a6c5c1db7 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 7 Aug 2024 19:26:06 +0300 Subject: [PATCH 1548/1571] safekeeper: check for non-consecutive writes in safekeeper.rs wal_storage.rs already checks this, but since this is a quite legit scenario check it at safekeeper.rs (consensus level) as well. ref https://github.com/neondatabase/neon/issues/8212 This is a take 2; previous PR #8640 had been reverted because interplay with another change broke test_last_log_term_switch. --- safekeeper/src/safekeeper.rs | 126 ++++++++++++++---- safekeeper/src/wal_storage.rs | 6 + .../tests/walproposer_sim/safekeeper_disk.rs | 4 + 3 files changed, 113 insertions(+), 23 deletions(-) diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 486954c7b9..dbe0034de2 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -875,6 +875,29 @@ where return Ok(Some(AcceptorProposerMessage::AppendResponse(resp))); } + // Disallow any non-sequential writes, which can result in gaps or + // overwrites. If we need to move the pointer, ProposerElected message + // should have truncated WAL first accordingly. Note that the first + // condition (WAL rewrite) is quite expected in real world; it happens + // when walproposer reconnects to safekeeper and writes some more data + // while first connection still gets some packets later. It might be + // better to not log this as error! above. + let write_lsn = self.wal_store.write_lsn(); + if write_lsn > msg.h.begin_lsn { + bail!( + "append request rewrites WAL written before, write_lsn={}, msg lsn={}", + write_lsn, + msg.h.begin_lsn + ); + } + if write_lsn < msg.h.begin_lsn && write_lsn != Lsn(0) { + bail!( + "append request creates gap in written WAL, write_lsn={}, msg lsn={}", + write_lsn, + msg.h.begin_lsn, + ); + } + // Now we know that we are in the same term as the proposer, // processing the message. @@ -960,10 +983,7 @@ mod tests { use postgres_ffi::{XLogSegNo, WAL_SEGMENT_SIZE}; use super::*; - use crate::{ - state::{EvictionState, PersistedPeers, TimelinePersistentState}, - wal_storage::Storage, - }; + use crate::state::{EvictionState, PersistedPeers, TimelinePersistentState}; use std::{ops::Deref, str::FromStr, time::Instant}; // fake storage for tests @@ -1003,6 +1023,10 @@ mod tests { } impl wal_storage::Storage for DummyWalStore { + fn write_lsn(&self) -> Lsn { + self.lsn + } + fn flush_lsn(&self) -> Lsn { self.lsn } @@ -1076,7 +1100,7 @@ mod tests { let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap(); let mut ar_hdr = AppendRequestHeader { - term: 1, + term: 2, term_start_lsn: Lsn(3), begin_lsn: Lsn(1), end_lsn: Lsn(2), @@ -1090,24 +1114,29 @@ mod tests { }; let pem = ProposerElected { - term: 1, - start_streaming_at: Lsn(3), - term_history: TermHistory(vec![TermLsn { - term: 1, - lsn: Lsn(3), - }]), - timeline_start_lsn: Lsn(0), + term: 2, + start_streaming_at: Lsn(1), + term_history: TermHistory(vec![ + TermLsn { + term: 1, + lsn: Lsn(1), + }, + TermLsn { + term: 2, + lsn: Lsn(3), + }, + ]), + timeline_start_lsn: Lsn(1), }; sk.process_msg(&ProposerAcceptorMessage::Elected(pem)) .await .unwrap(); // check that AppendRequest before term_start_lsn doesn't switch last_log_term. 
- let resp = sk - .process_msg(&ProposerAcceptorMessage::AppendRequest(append_request)) - .await; - assert!(resp.is_ok()); - assert_eq!(sk.get_last_log_term(), 0); + sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request)) + .await + .unwrap(); + assert_eq!(sk.get_last_log_term(), 1); // but record at term_start_lsn does the switch ar_hdr.begin_lsn = Lsn(2); @@ -1116,12 +1145,63 @@ mod tests { h: ar_hdr, wal_data: Bytes::from_static(b"b"), }; - let resp = sk - .process_msg(&ProposerAcceptorMessage::AppendRequest(append_request)) - .await; - assert!(resp.is_ok()); - sk.wal_store.truncate_wal(Lsn(3)).await.unwrap(); // imitate the complete record at 3 %) - assert_eq!(sk.get_last_log_term(), 1); + sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request)) + .await + .unwrap(); + assert_eq!(sk.get_last_log_term(), 2); + } + + #[tokio::test] + async fn test_non_consecutive_write() { + let storage = InMemoryState { + persisted_state: test_sk_state(), + }; + let wal_store = DummyWalStore { lsn: Lsn(0) }; + + let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap(); + + let pem = ProposerElected { + term: 1, + start_streaming_at: Lsn(1), + term_history: TermHistory(vec![TermLsn { + term: 1, + lsn: Lsn(1), + }]), + timeline_start_lsn: Lsn(1), + }; + sk.process_msg(&ProposerAcceptorMessage::Elected(pem)) + .await + .unwrap(); + + let ar_hdr = AppendRequestHeader { + term: 1, + term_start_lsn: Lsn(3), + begin_lsn: Lsn(1), + end_lsn: Lsn(2), + commit_lsn: Lsn(0), + truncate_lsn: Lsn(0), + proposer_uuid: [0; 16], + }; + let append_request = AppendRequest { + h: ar_hdr.clone(), + wal_data: Bytes::from_static(b"b"), + }; + + // do write ending at 2, it should be ok + sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request)) + .await + .unwrap(); + let mut ar_hrd2 = ar_hdr.clone(); + ar_hrd2.begin_lsn = Lsn(4); + ar_hrd2.end_lsn = Lsn(5); + let append_request = AppendRequest { + h: ar_hdr, + wal_data: Bytes::from_static(b"b"), + }; + // and now starting at 4, it must fail + sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request)) + .await + .unwrap_err(); } #[test] diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 6fd7c91a68..89c2e98a94 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -37,6 +37,8 @@ use pq_proto::SystemId; use utils::{id::TenantTimelineId, lsn::Lsn}; pub trait Storage { + // Last written LSN. + fn write_lsn(&self) -> Lsn; /// LSN of last durably stored WAL record. fn flush_lsn(&self) -> Lsn; @@ -329,6 +331,10 @@ impl PhysicalStorage { } impl Storage for PhysicalStorage { + // Last written LSN. + fn write_lsn(&self) -> Lsn { + self.write_lsn + } /// flush_lsn returns LSN of last durably stored WAL record. fn flush_lsn(&self) -> Lsn { self.flush_record_lsn diff --git a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs index 6b31edb1f2..b854754ecf 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs @@ -175,6 +175,10 @@ impl DiskWALStorage { } impl wal_storage::Storage for DiskWALStorage { + // Last written LSN. + fn write_lsn(&self) -> Lsn { + self.write_lsn + } /// LSN of last durably stored WAL record. 
fn flush_lsn(&self) -> Lsn { self.flush_record_lsn From c4fe6641c1695b1d7c450358af2cec6018fb2359 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 3 Sep 2024 18:16:49 +0100 Subject: [PATCH 1549/1571] pageserver: separate metadata and data pages in DatadirModification (#8621) ## Problem Currently, DatadirModification keeps a key-indexed map of all pending writes, even though we (almost) never need to read back dirty pages for anything other than metadata pages (e.g. relation sizes). Related: https://github.com/neondatabase/neon/issues/6345 ## Summary of changes - commit() modifications before ingesting database creation wal records, so that they are guaranteed to be able to get() everything they need directly from the underlying Timeline. - Split dirty pages in DatadirModification into pending_metadata_pages and pending_data_pages. The data ones don't need to be in a key-addressable format, so they just go in a Vec instead. - Special case handling of zero-page writes in DatadirModification, putting them in a map which is flushed on the end of a WAL record. This handles the case where during ingest, we might first write a zero page, and then ingest a postgres write to that page. We used to do this via the key-indexed map of writes, but in this PR we change the data page write path to not bother indexing these by key. My least favorite thing about this PR is that I needed to change the DatadirModification interface to add the on_record_end call. This is not very invasive because there's really only one place we use it, but it changes the object's behaviour from being clearly an aggregation of many records to having some per-record state. I could avoid this by implicitly doing the work when someone calls set_lsn or commit -- I'm open to opinions on whether that's cleaner or dirtier. ## Performance There may be some efficiency improvement here, but the primary motivation is to enable an earlier stage of ingest to operate without access to a Timeline. The `pending_data_pages` part is the "fast path" bulk write data that can in principle be generated without a Timeline, in parallel with other ingest batches, and ultimately on the safekeeper. 
`test_bulk_insert` on AX102 shows approximately the same results as in the previous PR #8591: ``` ------------------------------ Benchmark results ------------------------------- test_bulk_insert[neon-release-pg16].insert: 23.577 s test_bulk_insert[neon-release-pg16].pageserver_writes: 5,428 MB test_bulk_insert[neon-release-pg16].peak_mem: 637 MB test_bulk_insert[neon-release-pg16].size: 0 MB test_bulk_insert[neon-release-pg16].data_uploaded: 1,922 MB test_bulk_insert[neon-release-pg16].num_files_uploaded: 8 test_bulk_insert[neon-release-pg16].wal_written: 1,382 MB test_bulk_insert[neon-release-pg16].wal_recovery: 18.264 s test_bulk_insert[neon-release-pg16].compaction: 0.052 s ``` --- pageserver/src/import_datadir.rs | 12 +- pageserver/src/pgdatadir_mapping.rs | 228 +++++++++++++----- .../tenant/storage_layer/inmemory_layer.rs | 9 +- .../walreceiver/walreceiver_connection.rs | 64 ++++- pageserver/src/walingest.rs | 42 +++- pageserver/src/walrecord.rs | 24 ++ 6 files changed, 281 insertions(+), 98 deletions(-) diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index ed409d3130..5a0894cd1b 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -19,6 +19,7 @@ use crate::metrics::WAL_INGEST; use crate::pgdatadir_mapping::*; use crate::tenant::Timeline; use crate::walingest::WalIngest; +use crate::walrecord::decode_wal_record; use crate::walrecord::DecodedWALRecord; use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants; @@ -310,11 +311,13 @@ async fn import_wal( let mut nrecords = 0; let mut modification = tline.begin_modification(last_lsn); - let mut decoded = DecodedWALRecord::default(); while last_lsn <= endpoint { if let Some((lsn, recdata)) = waldecoder.poll_decode()? { + let mut decoded = DecodedWALRecord::default(); + decode_wal_record(recdata, &mut decoded, tline.pg_version)?; + walingest - .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx) + .ingest_record(decoded, lsn, &mut modification, ctx) .await?; WAL_INGEST.records_committed.inc(); @@ -449,11 +452,12 @@ pub async fn import_wal_from_tar( waldecoder.feed_bytes(&bytes[offset..]); let mut modification = tline.begin_modification(last_lsn); - let mut decoded = DecodedWALRecord::default(); while last_lsn <= end_lsn { if let Some((lsn, recdata)) = waldecoder.poll_decode()? 
{ + let mut decoded = DecodedWALRecord::default(); + decode_wal_record(recdata, &mut decoded, tline.pg_version)?; walingest - .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx) + .ingest_record(decoded, lsn, &mut modification, ctx) .await?; modification.commit(ctx).await?; last_lsn = lsn; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index edcbac970b..c26abca1f7 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -12,7 +12,7 @@ use crate::keyspace::{KeySpace, KeySpaceAccum}; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; use crate::walrecord::NeonWalRecord; use crate::{aux_file, repository::*}; -use anyhow::{bail, ensure, Context}; +use anyhow::{ensure, Context}; use bytes::{Buf, Bytes, BytesMut}; use enum_map::Enum; use pageserver_api::key::{ @@ -168,7 +168,9 @@ impl Timeline { DatadirModification { tline: self, pending_lsns: Vec::new(), - pending_updates: HashMap::new(), + pending_metadata_pages: HashMap::new(), + pending_data_pages: Vec::new(), + pending_zero_data_pages: Default::default(), pending_deletions: Vec::new(), pending_nblocks: 0, pending_directory_entries: Vec::new(), @@ -1031,10 +1033,24 @@ pub struct DatadirModification<'a> { // The put-functions add the modifications here, and they are flushed to the // underlying key-value store by the 'finish' function. pending_lsns: Vec, - pending_updates: HashMap>, pending_deletions: Vec<(Range, Lsn)>, pending_nblocks: i64, + /// Metadata writes, indexed by key so that they can be read from not-yet-committed modifications + /// while ingesting subsequent records. See [`Self::is_data_key`] for the definition of 'metadata'. + pending_metadata_pages: HashMap>, + + /// Data writes, ready to be flushed into an ephemeral layer. See [`Self::is_data_key`] for + /// which keys are stored here. + pending_data_pages: Vec<(CompactKey, Lsn, usize, Value)>, + + // Sometimes during ingest, for example when extending a relation, we would like to write a zero page. However, + // if we encounter a write from postgres in the same wal record, we will drop this entry. + // + // Unlike other 'pending' fields, this does not last until the next call to commit(): it is flushed + // at the end of each wal record, and all these writes implicitly are at lsn Self::lsn + pending_zero_data_pages: HashSet, + /// For special "directory" keys that store key-value maps, track the size of the map /// if it was updated in this modification. pending_directory_entries: Vec<(DirectoryKind, usize)>, @@ -1058,6 +1074,10 @@ impl<'a> DatadirModification<'a> { self.pending_bytes } + pub(crate) fn has_dirty_data_pages(&self) -> bool { + (!self.pending_data_pages.is_empty()) || (!self.pending_zero_data_pages.is_empty()) + } + /// Set the current lsn pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> { ensure!( @@ -1066,6 +1086,10 @@ impl<'a> DatadirModification<'a> { lsn, self.lsn ); + + // If we are advancing LSN, then state from previous wal record should have been flushed. + assert!(self.pending_zero_data_pages.is_empty()); + if lsn > self.lsn { self.pending_lsns.push(self.lsn); self.lsn = lsn; @@ -1073,6 +1097,17 @@ impl<'a> DatadirModification<'a> { Ok(()) } + /// In this context, 'metadata' means keys that are only read by the pageserver internally, and 'data' means + /// keys that represent literal blocks that postgres can read. 
So data includes relation blocks and + /// SLRU blocks, which are read directly by postgres, and everything else is considered metadata. + /// + /// The distinction is important because data keys are handled on a fast path where dirty writes are + /// not readable until this modification is committed, whereas metadata keys are visible for read + /// via [`Self::get`] as soon as their record has been ingested. + fn is_data_key(key: &Key) -> bool { + key.is_rel_block_key() || key.is_slru_block_key() + } + /// Initialize a completely new repository. /// /// This inserts the directory metadata entries that are assumed to @@ -1180,6 +1215,31 @@ impl<'a> DatadirModification<'a> { Ok(()) } + pub(crate) fn put_rel_page_image_zero(&mut self, rel: RelTag, blknum: BlockNumber) { + self.pending_zero_data_pages + .insert(rel_block_to_key(rel, blknum).to_compact()); + self.pending_bytes += ZERO_PAGE.len(); + } + + pub(crate) fn put_slru_page_image_zero( + &mut self, + kind: SlruKind, + segno: u32, + blknum: BlockNumber, + ) { + self.pending_zero_data_pages + .insert(slru_block_to_key(kind, segno, blknum).to_compact()); + self.pending_bytes += ZERO_PAGE.len(); + } + + /// Call this at the end of each WAL record. + pub(crate) fn on_record_end(&mut self) { + let pending_zero_data_pages = std::mem::take(&mut self.pending_zero_data_pages); + for key in pending_zero_data_pages { + self.put_data(key, Value::Image(ZERO_PAGE.clone())); + } + } + /// Store a relmapper file (pg_filenode.map) in the repository pub async fn put_relmap_file( &mut self, @@ -1778,7 +1838,7 @@ impl<'a> DatadirModification<'a> { /// retains all the metadata, but data pages are flushed. That's again OK /// for bulk import, where you are just loading data pages and won't try to /// modify the same pages twice. - pub async fn flush(&mut self, ctx: &RequestContext) -> anyhow::Result<()> { + pub(crate) async fn flush(&mut self, ctx: &RequestContext) -> anyhow::Result<()> { // Unless we have accumulated a decent amount of changes, it's not worth it // to scan through the pending_updates list. let pending_nblocks = self.pending_nblocks; @@ -1789,31 +1849,11 @@ impl<'a> DatadirModification<'a> { let mut writer = self.tline.writer().await; // Flush relation and SLRU data blocks, keep metadata. - let mut retained_pending_updates = HashMap::<_, Vec<_>>::new(); - for (key, values) in self.pending_updates.drain() { - if !key.is_valid_key_on_write_path() { - bail!( - "the request contains data not supported by pageserver at TimelineWriter::put: {}", key - ); - } - let mut write_batch = Vec::new(); - for (lsn, value_ser_size, value) in values { - if key.is_rel_block_key() || key.is_slru_block_key() { - // This bails out on first error without modifying pending_updates. - // That's Ok, cf this function's doc comment. - write_batch.push((key.to_compact(), lsn, value_ser_size, value)); - } else { - retained_pending_updates.entry(key).or_default().push(( - lsn, - value_ser_size, - value, - )); - } - } - writer.put_batch(write_batch, ctx).await?; - } + let pending_data_pages = std::mem::take(&mut self.pending_data_pages); - self.pending_updates = retained_pending_updates; + // This bails out on first error without modifying pending_updates. + // That's Ok, cf this function's doc comment. + writer.put_batch(pending_data_pages, ctx).await?; self.pending_bytes = 0; if pending_nblocks != 0 { @@ -1834,29 +1874,31 @@ impl<'a> DatadirModification<'a> { /// All the modifications in this atomic update are stamped by the specified LSN. 
/// pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> { + // Commit should never be called mid-wal-record + assert!(self.pending_zero_data_pages.is_empty()); + let mut writer = self.tline.writer().await; let pending_nblocks = self.pending_nblocks; self.pending_nblocks = 0; - if !self.pending_updates.is_empty() { - // Ordering: the items in this batch do not need to be in any global order, but values for - // a particular Key must be in Lsn order relative to one another. InMemoryLayer relies on - // this to do efficient updates to its index. - let batch: Vec<(CompactKey, Lsn, usize, Value)> = self - .pending_updates + // Ordering: the items in this batch do not need to be in any global order, but values for + // a particular Key must be in Lsn order relative to one another. InMemoryLayer relies on + // this to do efficient updates to its index. + let mut write_batch = std::mem::take(&mut self.pending_data_pages); + + write_batch.extend( + self.pending_metadata_pages .drain() .flat_map(|(key, values)| { - values.into_iter().map(move |(lsn, val_ser_size, value)| { - if !key.is_valid_key_on_write_path() { - bail!("the request contains data not supported by pageserver at TimelineWriter::put: {}", key); - } - Ok((key.to_compact(), lsn, val_ser_size, value)) - }) - }) - .collect::>>()?; + values + .into_iter() + .map(move |(lsn, value_size, value)| (key, lsn, value_size, value)) + }), + ); - writer.put_batch(batch, ctx).await?; + if !write_batch.is_empty() { + writer.put_batch(write_batch, ctx).await?; } if !self.pending_deletions.is_empty() { @@ -1887,33 +1929,58 @@ impl<'a> DatadirModification<'a> { } pub(crate) fn len(&self) -> usize { - self.pending_updates.len() + self.pending_deletions.len() + self.pending_metadata_pages.len() + + self.pending_data_pages.len() + + self.pending_deletions.len() } - // Internal helper functions to batch the modifications - + /// Read a page from the Timeline we are writing to. For metadata pages, this passes through + /// a cache in Self, which makes writes earlier in this modification visible to WAL records later + /// in the modification. + /// + /// For data pages, reads pass directly to the owning Timeline: any ingest code which reads a data + /// page must ensure that the pages they read are already committed in Timeline, for example + /// DB create operations are always preceded by a call to commit(). This is special cased because + /// it's rare: all the 'normal' WAL operations will only read metadata pages such as relation sizes, + /// and not data pages. async fn get(&self, key: Key, ctx: &RequestContext) -> Result { - // Have we already updated the same key? Read the latest pending updated - // version in that case. - // - // Note: we don't check pending_deletions. It is an error to request a - // value that has been removed, deletion only avoids leaking storage. - if let Some(values) = self.pending_updates.get(&key) { - if let Some((_, _, value)) = values.last() { - return if let Value::Image(img) = value { - Ok(img.clone()) - } else { - // Currently, we never need to read back a WAL record that we - // inserted in the same "transaction". All the metadata updates - // work directly with Images, and we never need to read actual - // data pages. We could handle this if we had to, by calling - // the walredo manager, but let's keep it simple for now. - Err(PageReconstructError::Other(anyhow::anyhow!( - "unexpected pending WAL record" - ))) - }; + if !Self::is_data_key(&key) { + // Have we already updated the same key? 
Read the latest pending updated + // version in that case. + // + // Note: we don't check pending_deletions. It is an error to request a + // value that has been removed, deletion only avoids leaking storage. + if let Some(values) = self.pending_metadata_pages.get(&key.to_compact()) { + if let Some((_, _, value)) = values.last() { + return if let Value::Image(img) = value { + Ok(img.clone()) + } else { + // Currently, we never need to read back a WAL record that we + // inserted in the same "transaction". All the metadata updates + // work directly with Images, and we never need to read actual + // data pages. We could handle this if we had to, by calling + // the walredo manager, but let's keep it simple for now. + Err(PageReconstructError::Other(anyhow::anyhow!( + "unexpected pending WAL record" + ))) + }; + } + } + } else { + // This is an expensive check, so we only do it in debug mode. If reading a data key, + // this key should never be present in pending_data_pages. We ensure this by committing + // modifications before ingesting DB create operations, which are the only kind that reads + // data pages during ingest. + if cfg!(debug_assertions) { + for (dirty_key, _, _, _) in &self.pending_data_pages { + debug_assert!(&key.to_compact() != dirty_key); + } + + debug_assert!(!self.pending_zero_data_pages.contains(&key.to_compact())) } } + + // Metadata page cache miss, or we're reading a data page. let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn); self.tline.get(key, lsn, ctx).await } @@ -1925,11 +1992,40 @@ impl<'a> DatadirModification<'a> { } fn put(&mut self, key: Key, val: Value) { - let values = self.pending_updates.entry(key).or_default(); + if Self::is_data_key(&key) { + self.put_data(key.to_compact(), val) + } else { + self.put_metadata(key.to_compact(), val) + } + } + + fn put_data(&mut self, key: CompactKey, val: Value) { + let val_serialized_size = val.serialized_size().unwrap() as usize; + + // If this page was previously zero'd in the same WalRecord, then drop the previous zero page write. This + // is an optimization that avoids persisting both the zero page generated by us (e.g. during a relation extend), + // and the subsequent postgres-originating write + if self.pending_zero_data_pages.remove(&key) { + self.pending_bytes -= ZERO_PAGE.len(); + } + + self.pending_bytes += val_serialized_size; + self.pending_data_pages + .push((key, self.lsn, val_serialized_size, val)) + } + + fn put_metadata(&mut self, key: CompactKey, val: Value) { + let values = self.pending_metadata_pages.entry(key).or_default(); // Replace the previous value if it exists at the same lsn if let Some((last_lsn, last_value_ser_size, last_value)) = values.last_mut() { if *last_lsn == self.lsn { + // Update the pending_bytes contribution from this entry, and update the serialized size in place + self.pending_bytes -= *last_value_ser_size; *last_value_ser_size = val.serialized_size().unwrap() as usize; + self.pending_bytes += *last_value_ser_size; + + // Use the latest value, this replaces any earlier write to the same (key,lsn), such as much + // have been generated by synthesized zero page writes prior to the first real write to a page. 
*last_value = val; return; } diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index f31ab4b1e8..2c19e5b19f 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -692,8 +692,13 @@ impl InMemoryLayer { let vec_map = inner.index.entry(key).or_default(); let old = vec_map.append_or_update_last(lsn, index_entry).unwrap().0; if old.is_some() { - // We already had an entry for this LSN. That's odd.. - warn!("Key {} at {} already exists", key, lsn); + // This should not break anything, but is unexpected: ingestion code aims to filter out + // multiple writes to the same key at the same LSN. This happens in cases where our + // ingenstion code generates some write like an empty page, and we see a write from postgres + // to the same key in the same wal record. If one such write makes it through, we + // index the most recent write, implicitly ignoring the earlier write. We log a warning + // because this case is unexpected, and we would like tests to fail if this happens. + warn!("Key {} at {} written twice at same LSN", key, lsn); } } diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 0114473eda..cee259e2e0 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -31,7 +31,7 @@ use crate::{ task_mgr::{TaskKind, WALRECEIVER_RUNTIME}, tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo}, walingest::WalIngest, - walrecord::DecodedWALRecord, + walrecord::{decode_wal_record, DecodedWALRecord}, }; use postgres_backend::is_expected_io_error; use postgres_connection::PgConnectionConfig; @@ -312,10 +312,25 @@ pub(super) async fn handle_walreceiver_connection( waldecoder.feed_bytes(data); { - let mut decoded = DecodedWALRecord::default(); let mut modification = timeline.begin_modification(startlsn); let mut uncommitted_records = 0; let mut filtered_records = 0; + + async fn commit( + modification: &mut DatadirModification<'_>, + uncommitted: &mut u64, + filtered: &mut u64, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + WAL_INGEST + .records_committed + .inc_by(*uncommitted - *filtered); + modification.commit(ctx).await?; + *uncommitted = 0; + *filtered = 0; + Ok(()) + } + while let Some((lsn, recdata)) = waldecoder.poll_decode()? { // It is important to deal with the aligned records as lsn in getPage@LSN is // aligned and can be several bytes bigger. Without this alignment we are @@ -324,9 +339,28 @@ pub(super) async fn handle_walreceiver_connection( return Err(WalReceiverError::Other(anyhow!("LSN not aligned"))); } + // Deserialize WAL record + let mut decoded = DecodedWALRecord::default(); + decode_wal_record(recdata, &mut decoded, modification.tline.pg_version)?; + + if decoded.is_dbase_create_copy(timeline.pg_version) + && uncommitted_records > 0 + { + // Special case: legacy PG database creations operate by reading pages from a 'template' database: + // these are the only kinds of WAL record that require reading data blocks while ingesting. Ensure + // all earlier writes of data blocks are visible by committing any modification in flight. 
+ commit( + &mut modification, + &mut uncommitted_records, + &mut filtered_records, + &ctx, + ) + .await?; + } + // Ingest the records without immediately committing them. let ingested = walingest - .ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx) + .ingest_record(decoded, lsn, &mut modification, &ctx) .await .with_context(|| format!("could not ingest record at {lsn}"))?; if !ingested { @@ -349,21 +383,25 @@ pub(super) async fn handle_walreceiver_connection( || modification.approx_pending_bytes() > DatadirModification::MAX_PENDING_BYTES { - WAL_INGEST - .records_committed - .inc_by(uncommitted_records - filtered_records); - modification.commit(&ctx).await?; - uncommitted_records = 0; - filtered_records = 0; + commit( + &mut modification, + &mut uncommitted_records, + &mut filtered_records, + &ctx, + ) + .await?; } } // Commit the remaining records. if uncommitted_records > 0 { - WAL_INGEST - .records_committed - .inc_by(uncommitted_records - filtered_records); - modification.commit(&ctx).await?; + commit( + &mut modification, + &mut uncommitted_records, + &mut filtered_records, + &ctx, + ) + .await?; } } diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 8ccd20adb1..2d3841881b 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -57,6 +57,7 @@ use utils::lsn::Lsn; pub struct WalIngest { shard: ShardIdentity, + pg_version: u32, checkpoint: CheckPoint, checkpoint_modified: bool, warn_ingest_lag: WarnIngestLag, @@ -82,6 +83,7 @@ impl WalIngest { Ok(WalIngest { shard: *timeline.get_shard_identity(), + pg_version: timeline.pg_version, checkpoint, checkpoint_modified: false, warn_ingest_lag: WarnIngestLag { @@ -104,10 +106,9 @@ impl WalIngest { /// pub async fn ingest_record( &mut self, - recdata: Bytes, + decoded: DecodedWALRecord, lsn: Lsn, modification: &mut DatadirModification<'_>, - decoded: &mut DecodedWALRecord, ctx: &RequestContext, ) -> anyhow::Result { WAL_INGEST.records_received.inc(); @@ -115,7 +116,12 @@ impl WalIngest { let prev_len = modification.len(); modification.set_lsn(lsn)?; - decode_wal_record(recdata, decoded, pg_version)?; + + if decoded.is_dbase_create_copy(self.pg_version) { + // Records of this type should always be preceded by a commit(), as they + // rely on reading data pages back from the Timeline. + assert!(!modification.has_dirty_data_pages()); + } let mut buf = decoded.record.clone(); buf.advance(decoded.main_data_offset); @@ -133,11 +139,11 @@ impl WalIngest { pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => { // Heap AM records need some special handling, because they modify VM pages // without registering them with the standard mechanism. 
- self.ingest_heapam_record(&mut buf, modification, decoded, ctx) + self.ingest_heapam_record(&mut buf, modification, &decoded, ctx) .await?; } pg_constants::RM_NEON_ID => { - self.ingest_neonrmgr_record(&mut buf, modification, decoded, ctx) + self.ingest_neonrmgr_record(&mut buf, modification, &decoded, ctx) .await?; } // Handle other special record types @@ -325,7 +331,7 @@ impl WalIngest { } pg_constants::RM_RELMAP_ID => { let xlrec = XlRelmapUpdate::decode(&mut buf); - self.ingest_relmap_page(modification, &xlrec, decoded, ctx) + self.ingest_relmap_page(modification, &xlrec, &decoded, ctx) .await?; } pg_constants::RM_XLOG_ID => { @@ -470,7 +476,7 @@ impl WalIngest { continue; } - self.ingest_decoded_block(modification, lsn, decoded, blk, ctx) + self.ingest_decoded_block(modification, lsn, &decoded, blk, ctx) .await?; } @@ -486,6 +492,8 @@ impl WalIngest { // until commit() is called to flush the data into the repository and update // the latest LSN. + modification.on_record_end(); + Ok(modification.len() > prev_len) } @@ -557,6 +565,7 @@ impl WalIngest { page_set_lsn(&mut image, lsn) } assert_eq!(image.len(), BLCKSZ as usize); + self.put_rel_page_image(modification, rel, blk.blkno, image.freeze(), ctx) .await?; } else { @@ -1195,7 +1204,7 @@ impl WalIngest { if rec.blkno % pg_constants::SLOTS_PER_FSM_PAGE != 0 { // Tail of last remaining FSM page has to be zeroed. // We are not precise here and instead of digging in FSM bitmap format just clear the whole page. - modification.put_rel_page_image(rel, fsm_physical_page_no, ZERO_PAGE.clone())?; + modification.put_rel_page_image_zero(rel, fsm_physical_page_no); fsm_physical_page_no += 1; } let nblocks = get_relsize(modification, rel, ctx).await?; @@ -1217,7 +1226,7 @@ impl WalIngest { if rec.blkno % pg_constants::VM_HEAPBLOCKS_PER_PAGE != 0 { // Tail of last remaining vm page has to be zeroed. // We are not precise here and instead of digging in VM bitmap format just clear the whole page. 
- modification.put_rel_page_image(rel, vm_page_no, ZERO_PAGE.clone())?; + modification.put_rel_page_image_zero(rel, vm_page_no); vm_page_no += 1; } let nblocks = get_relsize(modification, rel, ctx).await?; @@ -1687,7 +1696,7 @@ impl WalIngest { continue; } - modification.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone())?; + modification.put_rel_page_image_zero(rel, gap_blknum); } } Ok(()) @@ -1753,7 +1762,7 @@ impl WalIngest { // fill the gap with zeros for gap_blknum in old_nblocks..blknum { - modification.put_slru_page_image(kind, segno, gap_blknum, ZERO_PAGE.clone())?; + modification.put_slru_page_image_zero(kind, segno, gap_blknum); } } Ok(()) @@ -1827,21 +1836,25 @@ mod tests { walingest .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 2"), &ctx) .await?; + m.on_record_end(); m.commit(&ctx).await?; let mut m = tline.begin_modification(Lsn(0x30)); walingest .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 3"), &ctx) .await?; + m.on_record_end(); m.commit(&ctx).await?; let mut m = tline.begin_modification(Lsn(0x40)); walingest .put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1 at 4"), &ctx) .await?; + m.on_record_end(); m.commit(&ctx).await?; let mut m = tline.begin_modification(Lsn(0x50)); walingest .put_rel_page_image(&mut m, TESTREL_A, 2, test_img("foo blk 2 at 5"), &ctx) .await?; + m.on_record_end(); m.commit(&ctx).await?; assert_current_logical_size(&tline, Lsn(0x50)); @@ -1983,6 +1996,7 @@ mod tests { walingest .put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1"), &ctx) .await?; + m.on_record_end(); m.commit(&ctx).await?; assert_eq!( tline @@ -2008,6 +2022,7 @@ mod tests { walingest .put_rel_page_image(&mut m, TESTREL_A, 1500, test_img("foo blk 1500"), &ctx) .await?; + m.on_record_end(); m.commit(&ctx).await?; assert_eq!( tline @@ -2409,7 +2424,6 @@ mod tests { .await .unwrap(); let mut modification = tline.begin_modification(startpoint); - let mut decoded = DecodedWALRecord::default(); println!("decoding {} bytes", bytes.len() - xlogoff); // Decode and ingest wal. We process the wal in chunks because @@ -2417,8 +2431,10 @@ mod tests { for chunk in bytes[xlogoff..].chunks(50) { decoder.feed_bytes(chunk); while let Some((lsn, recdata)) = decoder.poll_decode().unwrap() { + let mut decoded = DecodedWALRecord::default(); + decode_wal_record(recdata, &mut decoded, modification.tline.pg_version).unwrap(); walingest - .ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx) + .ingest_record(decoded, lsn, &mut modification, &ctx) .instrument(span.clone()) .await .unwrap(); diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index edddcefbe1..0c4d575de8 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -160,6 +160,30 @@ pub struct DecodedWALRecord { pub origin_id: u16, } +impl DecodedWALRecord { + /// Check if this WAL record represents a legacy "copy" database creation, which populates new relations + /// by reading other existing relations' data blocks. This is more complex to apply than new-style database + /// creations which simply include all the desired blocks in the WAL, so we need a helper function to detect this case. 
+ pub(crate) fn is_dbase_create_copy(&self, pg_version: u32) -> bool { + if self.xl_rmid == pg_constants::RM_DBASE_ID { + let info = self.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + match pg_version { + 14 => { + // Postgres 14 database creations are always the legacy kind + info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE + } + 15 => info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY, + 16 => info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY, + _ => { + panic!("Unsupported postgres version {pg_version}") + } + } + } else { + false + } + } +} + #[repr(C)] #[derive(Debug, Clone, Copy)] pub struct RelFileNode { From 1a874a3e863ac613f52eb0bbfe5e8d83bcfaba55 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 3 Sep 2024 17:31:42 +0000 Subject: [PATCH 1550/1571] build(deps): bump flask-cors from 4.0.1 to 5.0.0 (#8899) --- poetry.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/poetry.lock b/poetry.lock index 7db91e51f7..b8ef08b02d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1110,13 +1110,13 @@ dotenv = ["python-dotenv"] [[package]] name = "flask-cors" -version = "4.0.1" +version = "5.0.0" description = "A Flask extension adding a decorator for CORS support" optional = false python-versions = "*" files = [ - {file = "Flask_Cors-4.0.1-py2.py3-none-any.whl", hash = "sha256:f2a704e4458665580c074b714c4627dd5a306b333deb9074d0b1794dfa2fb677"}, - {file = "flask_cors-4.0.1.tar.gz", hash = "sha256:eeb69b342142fdbf4766ad99357a7f3876a2ceb77689dc10ff912aac06c389e4"}, + {file = "Flask_Cors-5.0.0-py2.py3-none-any.whl", hash = "sha256:b9e307d082a9261c100d8fb0ba909eec6a228ed1b60a8315fd85f783d61910bc"}, + {file = "flask_cors-5.0.0.tar.gz", hash = "sha256:5aadb4b950c4e93745034594d9f3ea6591f734bb3662e16e255ffbf5e89c88ef"}, ] [package.dependencies] From 3d9001d83ff54e8bd6a297c3328408323c4e21ff Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 4 Sep 2024 02:05:06 +0800 Subject: [PATCH 1551/1571] fix(pageserver): is_archived should be optional (#8902) Set the field to optional, otherwise there will be decode errors when newer version of the storage controller receives the JSON from older version of the pageservers. Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/models.rs | 7 ++++++- pageserver/src/http/routes.rs | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 1d896863df..87e8f8305a 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -716,12 +716,17 @@ pub struct TimelineInfo { pub pg_version: u32, pub state: TimelineState, - pub is_archived: bool, pub walreceiver_status: String, + // ALWAYS add new fields at the end of the struct with `Option` to ensure forward/backward compatibility. + // Backward compatibility: you will get a JSON not containing the newly-added field. + // Forward compatibility: a previous version of the pageserver will receive a JSON. serde::Deserialize does + // not deny unknown fields by default so it's safe to set the field to some value, though it won't be + // read. 
/// The last aux file policy being used on this timeline pub last_aux_file_policy: Option, + pub is_archived: Option, } #[derive(Debug, Clone, Serialize, Deserialize)] diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 8cf2c99c09..90ae6c5557 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -468,7 +468,7 @@ async fn build_timeline_info_common( pg_version: timeline.pg_version, state, - is_archived, + is_archived: Some(is_archived), walreceiver_status, From ecfa3d9de9eec824800db55f5e9592fe0502c96e Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 4 Sep 2024 05:39:56 +0800 Subject: [PATCH 1552/1571] fix(storage-scrubber): wrong trial condition (#8905) ref https://github.com/neondatabase/neon/issues/8872 ## Summary of changes We saw stuck storage scrubber in staging caused by infinite retries. I believe here we should use `min` instead of `max` to avoid getting minutes or hours of retry backoff. Signed-off-by: Alex Chi Z --- storage_scrubber/src/lib.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index 3c21d2f8cf..3f08cddf50 100644 --- a/storage_scrubber/src/lib.rs +++ b/storage_scrubber/src/lib.rs @@ -422,7 +422,7 @@ fn stream_objects_with_retries<'a>( let yield_err = if err.is_permanent() { true } else { - let backoff_time = 1 << trial.max(5); + let backoff_time = 1 << trial.min(5); tokio::time::sleep(Duration::from_secs(backoff_time)).await; trial += 1; trial == MAX_RETRIES - 1 @@ -473,7 +473,7 @@ async fn list_objects_with_retries( s3_target.delimiter, DisplayErrorContext(e), ); - let backoff_time = 1 << trial.max(5); + let backoff_time = 1 << trial.min(5); tokio::time::sleep(Duration::from_secs(backoff_time)).await; } } @@ -492,7 +492,7 @@ async fn download_object_with_retries( Ok(response) => response, Err(e) => { error!("Failed to download object for key {key}: {e}"); - let backoff_time = 1 << trial.max(5); + let backoff_time = 1 << trial.min(5); tokio::time::sleep(Duration::from_secs(backoff_time)).await; continue; } @@ -508,7 +508,7 @@ async fn download_object_with_retries( } Err(e) => { error!("Failed to stream object body for key {key}: {e}"); - let backoff_time = 1 << trial.max(5); + let backoff_time = 1 << trial.min(5); tokio::time::sleep(Duration::from_secs(backoff_time)).await; } } From 75310fe441b87d399213e365f1364aa9f08aa40d Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 4 Sep 2024 10:09:41 +0100 Subject: [PATCH 1553/1571] storcon: make hb interval an argument and speed up tests (#8880) ## Problem Each test might wait for up to 5s in order to HB the pageserver. 
## Summary of changes Make the heartbeat interval configurable and use a really tight one for neon local => startup quicker --- control_plane/src/local_env.rs | 7 +++++++ control_plane/src/storage_controller.rs | 2 ++ storage_controller/src/main.rs | 12 ++++++++++-- storage_controller/src/service.rs | 9 ++++++--- test_runner/regress/test_tenants.py | 4 +++- 5 files changed, 28 insertions(+), 6 deletions(-) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 74caba2b56..5dbc3bcbbc 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -165,6 +165,9 @@ pub struct NeonStorageControllerConf { pub split_threshold: Option, pub max_secondary_lag_bytes: Option, + + #[serde(with = "humantime_serde")] + pub heartbeat_interval: Duration, } impl NeonStorageControllerConf { @@ -172,6 +175,9 @@ impl NeonStorageControllerConf { const DEFAULT_MAX_OFFLINE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10); const DEFAULT_MAX_WARMING_UP_INTERVAL: std::time::Duration = std::time::Duration::from_secs(30); + + // Very tight heartbeat interval to speed up tests + const DEFAULT_HEARTBEAT_INTERVAL: std::time::Duration = std::time::Duration::from_millis(100); } impl Default for NeonStorageControllerConf { @@ -183,6 +189,7 @@ impl Default for NeonStorageControllerConf { database_url: None, split_threshold: None, max_secondary_lag_bytes: None, + heartbeat_interval: Self::DEFAULT_HEARTBEAT_INTERVAL, } } } diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 27d8e2de0c..c715d6b789 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -437,6 +437,8 @@ impl StorageController { &humantime::Duration::from(self.config.max_offline).to_string(), "--max-warming-up-interval", &humantime::Duration::from(self.config.max_warming_up).to_string(), + "--heartbeat-interval", + &humantime::Duration::from(self.config.heartbeat_interval).to_string(), "--address-for-peers", &address_for_peers.to_string(), ] diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index e3f29b84e7..00e90f4467 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -11,8 +11,8 @@ use storage_controller::metrics::preinitialize_metrics; use storage_controller::persistence::Persistence; use storage_controller::service::chaos_injector::ChaosInjector; use storage_controller::service::{ - Config, Service, MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT, - RECONCILER_CONCURRENCY_DEFAULT, + Config, Service, HEARTBEAT_INTERVAL_DEFAULT, MAX_OFFLINE_INTERVAL_DEFAULT, + MAX_WARMING_UP_INTERVAL_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, }; use tokio::signal::unix::SignalKind; use tokio_util::sync::CancellationToken; @@ -104,6 +104,10 @@ struct Cli { // a pageserver #[arg(long)] max_secondary_lag_bytes: Option, + + // Period with which to send heartbeats to registered nodes + #[arg(long)] + heartbeat_interval: Option, } enum StrictMode { @@ -285,6 +289,10 @@ async fn async_main() -> anyhow::Result<()> { split_threshold: args.split_threshold, neon_local_repo_dir: args.neon_local_repo_dir, max_secondary_lag_bytes: args.max_secondary_lag_bytes, + heartbeat_interval: args + .heartbeat_interval + .map(humantime::Duration::into) + .unwrap_or(HEARTBEAT_INTERVAL_DEFAULT), address_for_peers: args.address_for_peers, start_as_candidate: args.start_as_candidate, http_service_port: args.listen.port() as i32, diff --git a/storage_controller/src/service.rs 
b/storage_controller/src/service.rs index 95821827e2..49253cb4e0 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -121,6 +121,9 @@ pub const MAX_OFFLINE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30); /// being handled on the pageserver side. pub const MAX_WARMING_UP_INTERVAL_DEFAULT: Duration = Duration::from_secs(300); +/// How often to send heartbeats to registered nodes? +pub const HEARTBEAT_INTERVAL_DEFAULT: Duration = Duration::from_secs(5); + #[derive(Clone, strum_macros::Display)] enum TenantOperations { Create, @@ -326,6 +329,8 @@ pub struct Config { // upgraded to primary. pub max_secondary_lag_bytes: Option, + pub heartbeat_interval: Duration, + pub address_for_peers: Option, pub start_as_candidate: bool, @@ -909,9 +914,7 @@ impl Service { async fn spawn_heartbeat_driver(&self) { self.startup_complete.clone().wait().await; - const HEARTBEAT_INTERVAL: Duration = Duration::from_secs(5); - - let mut interval = tokio::time::interval(HEARTBEAT_INTERVAL); + let mut interval = tokio::time::interval(self.config.heartbeat_interval); while !self.cancel.is_cancelled() { tokio::select! { _ = interval.tick() => { } diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 0ebf714de0..b63ff7f6bd 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -372,8 +372,10 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder): tenant_id: TenantId = env.initial_tenant timeline_id = env.initial_timeline - # Multiple creation requests which race will generate this error + # Multiple creation requests which race will generate this error on the pageserver + # and storage controller respectively env.pageserver.allowed_errors.append(".*Conflict: Tenant is already being modified.*") + env.storage_controller.allowed_errors.append(".*Conflict: Tenant is already being modified.*") # Tenant creation requests which arrive out of order will generate complaints about # generation nubmers out of order. From 7a1397cf376cc4169385f6f19c371179396ada5f Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 4 Sep 2024 13:10:05 +0300 Subject: [PATCH 1554/1571] storcon: boilerplate to upsert safekeeper records on deploy (#8879) We currently do not record safekeepers in the storage controller database. We want to migrate timelines across safekeepers eventually, so start recording the safekeepers on deploy. 
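
For illustration, a deploy script could drive the new endpoint roughly as in the sketch below. The path, admin-scope requirement and payload shape follow the handler and test added in this patch; the controller URL, token and concrete field values are placeholders, not part of the change.

```python
import requests

# Placeholder values: a real deploy script would take the controller URL, an
# admin-scoped token and the record contents from its own environment and the
# control plane database (which assigns the id first).
STORCON_API = "http://127.0.0.1:1234"
ADMIN_TOKEN = "<admin-scoped JWT>"


def upsert_safekeeper(sk_id: int, record: dict) -> None:
    # The id is carried both in the URL and in the body; the handler rejects a
    # mismatch with 400 and answers 204 No Content on success.
    assert record["id"] == sk_id
    resp = requests.post(
        f"{STORCON_API}/control/v1/safekeeper/{sk_id}",
        headers={"Authorization": f"Bearer {ADMIN_TOKEN}"},
        json=record,
    )
    resp.raise_for_status()


upsert_safekeeper(
    42,
    {
        "id": 42,
        "region_id": "aws-us-east-2",
        "version": 5957,
        "host": "safekeeper-333.us-east-2.aws.neon.build",
        "port": 6401,
        "http_port": 7676,
        "active": True,
        "availability_zone_id": "us-east-2b",
    },
)
```

Repeating the upsert with a newer `version` simply updates the existing row, which is the common redeploy case exercised by the test below.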
Cc: #8698 --- .../2024-08-23-102952_safekeepers/down.sql | 2 + .../2024-08-23-102952_safekeepers/up.sql | 15 ++++ storage_controller/src/http.rs | 57 ++++++++++++ storage_controller/src/persistence.rs | 86 +++++++++++++++++++ storage_controller/src/schema.rs | 14 +++ storage_controller/src/service.rs | 14 +++ test_runner/fixtures/neon_fixtures.py | 23 +++++ .../regress/test_storage_controller.py | 68 ++++++++++++++- 8 files changed, 278 insertions(+), 1 deletion(-) create mode 100644 storage_controller/migrations/2024-08-23-102952_safekeepers/down.sql create mode 100644 storage_controller/migrations/2024-08-23-102952_safekeepers/up.sql diff --git a/storage_controller/migrations/2024-08-23-102952_safekeepers/down.sql b/storage_controller/migrations/2024-08-23-102952_safekeepers/down.sql new file mode 100644 index 0000000000..9dfc750586 --- /dev/null +++ b/storage_controller/migrations/2024-08-23-102952_safekeepers/down.sql @@ -0,0 +1,2 @@ +-- This file should undo anything in `up.sql` +DROP TABLE safekeepers; diff --git a/storage_controller/migrations/2024-08-23-102952_safekeepers/up.sql b/storage_controller/migrations/2024-08-23-102952_safekeepers/up.sql new file mode 100644 index 0000000000..c78716660f --- /dev/null +++ b/storage_controller/migrations/2024-08-23-102952_safekeepers/up.sql @@ -0,0 +1,15 @@ +-- started out as a copy of cplane schema, removed the unnecessary columns. +CREATE TABLE safekeepers ( + -- the surrogate identifier defined by control plane database sequence + id BIGINT PRIMARY KEY, + region_id TEXT NOT NULL, + version BIGINT NOT NULL, + -- the natural id on whatever cloud platform, not needed in storage controller + -- instance_id TEXT UNIQUE NOT NULL, + host TEXT NOT NULL, + port INTEGER NOT NULL, + active BOOLEAN NOT NULL DEFAULT false, + -- projects_count INTEGER NOT NULL DEFAULT 0, + http_port INTEGER NOT NULL, + availability_zone_id TEXT NOT NULL +); diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index d3eb081be4..0fa4f4fd0e 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -2,6 +2,7 @@ use crate::metrics::{ HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, PageserverRequestLabelGroup, METRICS_REGISTRY, }; +use crate::persistence::SafekeeperPersistence; use crate::reconciler::ReconcileError; use crate::service::{LeadershipStatus, Service, STARTUP_RECONCILE_TIMEOUT}; use anyhow::Context; @@ -767,6 +768,55 @@ impl From for ApiError { } } +/// Return the safekeeper record by instance id, or 404. +/// +/// Not used by anything except manual testing. +async fn handle_get_safekeeper(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let id = parse_request_param::(&req, "id")?; + + let state = get_state(&req); + + let res = state.service.get_safekeeper(id).await; + + match res { + Ok(b) => json_response(StatusCode::OK, b), + Err(crate::persistence::DatabaseError::Query(diesel::result::Error::NotFound)) => { + Err(ApiError::NotFound("unknown instance_id".into())) + } + Err(other) => Err(other.into()), + } +} + +/// Used as part of deployment scripts. +/// +/// Assumes information is only relayed to storage controller after first selecting an unique id on +/// control plane database, which means we have an id field in the request and payload. 
+async fn handle_upsert_safekeeper(mut req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let body = json_request::(&mut req).await?; + let id = parse_request_param::(&req, "id")?; + + if id != body.id { + // it should be repeated + return Err(ApiError::BadRequest(anyhow::anyhow!( + "id mismatch: url={id:?}, body={:?}", + body.id + ))); + } + + let state = get_state(&req); + + state.service.upsert_safekeeper(body).await?; + + Ok(Response::builder() + .status(StatusCode::NO_CONTENT) + .body(Body::empty()) + .unwrap()) +} + /// Common wrapper for request handlers that call into Service and will operate on tenants: they must only /// be allowed to run if Service has finished its initial reconciliation. async fn tenant_service_handler( @@ -1127,6 +1177,13 @@ pub fn make_router( .put("/control/v1/step_down", |r| { named_request_span(r, handle_step_down, RequestName("control_v1_step_down")) }) + .get("/control/v1/safekeeper/:id", |r| { + named_request_span(r, handle_get_safekeeper, RequestName("v1_safekeeper")) + }) + .post("/control/v1/safekeeper/:id", |r| { + // id is in the body + named_request_span(r, handle_upsert_safekeeper, RequestName("v1_safekeeper")) + }) // Tenant operations // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity. diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 6e1c2016ff..d03eb87242 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -938,6 +938,48 @@ impl Persistence { Ok(()) } + + pub(crate) async fn safekeeper_get( + &self, + id: i64, + ) -> Result { + use crate::schema::safekeepers::dsl::{id as id_column, safekeepers}; + self.with_conn(move |conn| -> DatabaseResult { + Ok(safekeepers + .filter(id_column.eq(&id)) + .select(SafekeeperPersistence::as_select()) + .get_result(conn)?) + }) + .await + } + + pub(crate) async fn safekeeper_upsert( + &self, + record: SafekeeperPersistence, + ) -> Result<(), DatabaseError> { + use crate::schema::safekeepers::dsl::*; + + self.with_conn(move |conn| -> DatabaseResult<()> { + let bind = record.as_insert_or_update(); + + let inserted_updated = diesel::insert_into(safekeepers) + .values(&bind) + .on_conflict(id) + .do_update() + .set(&bind) + .execute(conn)?; + + if inserted_updated != 1 { + return Err(DatabaseError::Logical(format!( + "unexpected number of rows ({})", + inserted_updated + ))); + } + + Ok(()) + }) + .await + } } /// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably @@ -1073,3 +1115,47 @@ pub(crate) struct ControllerPersistence { pub(crate) address: String, pub(crate) started_at: chrono::DateTime, } + +#[derive(Serialize, Deserialize, Queryable, Selectable, Eq, PartialEq, Debug, Clone)] +#[diesel(table_name = crate::schema::safekeepers)] +pub(crate) struct SafekeeperPersistence { + pub(crate) id: i64, + pub(crate) region_id: String, + /// 1 is special, it means just created (not currently posted to storcon). + /// Zero or negative is not really expected. + /// Otherwise the number from `release-$(number_of_commits_on_branch)` tag. 
+ pub(crate) version: i64, + pub(crate) host: String, + pub(crate) port: i32, + pub(crate) active: bool, + pub(crate) http_port: i32, + pub(crate) availability_zone_id: String, +} + +impl SafekeeperPersistence { + fn as_insert_or_update(&self) -> InsertUpdateSafekeeper<'_> { + InsertUpdateSafekeeper { + id: self.id, + region_id: &self.region_id, + version: self.version, + host: &self.host, + port: self.port, + active: self.active, + http_port: self.http_port, + availability_zone_id: &self.availability_zone_id, + } + } +} + +#[derive(Insertable, AsChangeset)] +#[diesel(table_name = crate::schema::safekeepers)] +struct InsertUpdateSafekeeper<'a> { + id: i64, + region_id: &'a str, + version: i64, + host: &'a str, + port: i32, + active: bool, + http_port: i32, + availability_zone_id: &'a str, +} diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs index 1e8379500c..e0f515daea 100644 --- a/storage_controller/src/schema.rs +++ b/storage_controller/src/schema.rs @@ -45,3 +45,17 @@ diesel::table! { } diesel::allow_tables_to_appear_in_same_query!(controllers, metadata_health, nodes, tenant_shards,); + +diesel::table! { + safekeepers { + id -> Int8, + region_id -> Text, + version -> Int8, + instance_id -> Text, + host -> Text, + port -> Int4, + active -> Bool, + http_port -> Int4, + availability_zone_id -> Text, + } +} diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 49253cb4e0..4ccc5c951c 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -6476,4 +6476,18 @@ impl Service { global_observed } + + pub(crate) async fn get_safekeeper( + &self, + id: i64, + ) -> Result { + self.persistence.safekeeper_get(id).await + } + + pub(crate) async fn upsert_safekeeper( + &self, + record: crate::persistence::SafekeeperPersistence, + ) -> Result<(), DatabaseError> { + self.persistence.safekeeper_upsert(record).await + } } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 8c99408cfb..890538b86a 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2845,6 +2845,29 @@ class NeonStorageController(MetricsGetter, LogUtils): raise AssertionError("unreachable") + def on_safekeeper_deploy(self, id: int, body: dict[str, Any]): + self.request( + "POST", + f"{self.api}/control/v1/safekeeper/{id}", + headers=self.headers(TokenScope.ADMIN), + json=body, + ) + + def get_safekeeper(self, id: int) -> Optional[dict[str, Any]]: + try: + response = self.request( + "GET", + f"{self.api}/control/v1/safekeeper/{id}", + headers=self.headers(TokenScope.ADMIN), + ) + json = response.json() + assert isinstance(json, dict) + return json + except StorageControllerApiException as e: + if e.status_code == 404: + return None + raise e + def __enter__(self) -> "NeonStorageController": return self diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 03eb7628be..13f5ec1b4f 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -31,7 +31,7 @@ from fixtures.pageserver.utils import ( remote_storage_delete_key, timeline_delete_wait_completed, ) -from fixtures.pg_version import PgVersion +from fixtures.pg_version import PgVersion, run_only_on_default_postgres from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import RemoteStorageKind, s3_storage from fixtures.storage_controller_proxy import 
StorageControllerProxy @@ -2330,3 +2330,69 @@ def test_storage_controller_timeline_crud_race(neon_env_builder: NeonEnvBuilder) connect=0, # Disable retries: we want to see the 503 ) ).timeline_create(PgVersion.NOT_SET, tenant_id, create_timeline_id) + + +@run_only_on_default_postgres("this is like a 'unit test' against storcon db") +def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_configs() + env.start() + + fake_id = 5 + + target = env.storage_controller + + assert target.get_safekeeper(fake_id) is None + + body = { + "active": True, + "id": fake_id, + "created_at": "2023-10-25T09:11:25Z", + "updated_at": "2024-08-28T11:32:43Z", + "region_id": "aws-us-east-2", + "host": "safekeeper-333.us-east-2.aws.neon.build", + "port": 6401, + "http_port": 7676, + "version": 5957, + "availability_zone_id": "us-east-2b", + } + + target.on_safekeeper_deploy(fake_id, body) + + inserted = target.get_safekeeper(fake_id) + assert inserted is not None + assert eq_safekeeper_records(body, inserted) + + # error out if pk is changed (unexpected) + with pytest.raises(StorageControllerApiException) as exc: + different_pk = dict(body) + different_pk["id"] = 4 + assert different_pk["id"] != body["id"] + target.on_safekeeper_deploy(fake_id, different_pk) + assert exc.value.status_code == 400 + + inserted_again = target.get_safekeeper(fake_id) + assert inserted_again is not None + assert eq_safekeeper_records(inserted, inserted_again) + + # the most common case, version goes up: + assert isinstance(body["version"], int) + body["version"] += 1 + target.on_safekeeper_deploy(fake_id, body) + inserted_now = target.get_safekeeper(fake_id) + assert inserted_now is not None + + assert eq_safekeeper_records(body, inserted_now) + + +def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool: + compared = [dict(a), dict(b)] + + masked_keys = ["created_at", "updated_at"] + + for d in compared: + # keep deleting these in case we are comparing the body as it will be uploaded by real scripts + for key in masked_keys: + if key in d: + del d[key] + + return compared[0] == compared[1] From a046717a2409b5291ad341c1f4d26cb1df1a55bd Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 4 Sep 2024 14:41:51 +0300 Subject: [PATCH 1555/1571] Fix submodule refs to point to the correct REL_X_STABLE_neon branches (#8910) Commit cfa45ff5ee (PR #8860) updated the vendor/postgres submodules, but didn't use the same commit SHAs that were pushed as the corresponding REL_*_STABLE_neon branches in the postgres repository. The contents were the same, but the REL_*_STABLE_neon branches pointed to squashed versions of the commits, whereas the SHAs used in the submodules referred to the pre-squash revisions. Note: The vendor/postgres-v14 submodule still doesn't match with the tip of REL_14_STABLE_neon branch, because there has been one more commit on that branch since then. That's another confusion which we should fix, but let's do that separately. This commit doesn't change the code that gets built in any way, only changes the submodule references to point to the correct SHAs in the REL_*_STABLE_neon branch histories, rather than some detached commits. 
--- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 48388a5b59..7602e907ab 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 48388a5b597c81c09e28c016650a7156b48717a1 +Subproject commit 7602e907ab30f16188bebfd66b8f297c2889d339 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 8aa1ded772..49d5e576a5 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 8aa1ded7726d416ac8e02600aad387a353478fc7 +Subproject commit 49d5e576a56e4cc59cd6a6a0791b2324b9fa675e diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 95132feffe..6e9a4ff624 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 95132feffe277ce84309d93a42e9aadfd2cb0437 +Subproject commit 6e9a4ff6249ac02b8175054b7b3f7dfb198be48b diff --git a/vendor/revisions.json b/vendor/revisions.json index 319e648488..751b9e8679 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,14 +1,14 @@ { "v16": [ "16.4", - "95132feffe277ce84309d93a42e9aadfd2cb0437" + "6e9a4ff6249ac02b8175054b7b3f7dfb198be48b" ], "v15": [ "15.8", - "8aa1ded7726d416ac8e02600aad387a353478fc7" + "49d5e576a56e4cc59cd6a6a0791b2324b9fa675e" ], "v14": [ "14.13", - "48388a5b597c81c09e28c016650a7156b48717a1" + "7602e907ab30f16188bebfd66b8f297c2889d339" ] } From 3f43823a9b333140ccf21a55ff1316c351bacd58 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 4 Sep 2024 13:41:10 +0100 Subject: [PATCH 1556/1571] build(deps): bump cryptography from 42.0.4 to 43.0.1 (#8908) --- poetry.lock | 63 ++++++++++++++++++++++++----------------------------- 1 file changed, 29 insertions(+), 34 deletions(-) diff --git a/poetry.lock b/poetry.lock index b8ef08b02d..48943a73e9 100644 --- a/poetry.lock +++ b/poetry.lock @@ -985,43 +985,38 @@ files = [ [[package]] name = "cryptography" -version = "42.0.4" +version = "43.0.1" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." 
optional = false python-versions = ">=3.7" files = [ - {file = "cryptography-42.0.4-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:ffc73996c4fca3d2b6c1c8c12bfd3ad00def8621da24f547626bf06441400449"}, - {file = "cryptography-42.0.4-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:db4b65b02f59035037fde0998974d84244a64c3265bdef32a827ab9b63d61b18"}, - {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dad9c385ba8ee025bb0d856714f71d7840020fe176ae0229de618f14dae7a6e2"}, - {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:69b22ab6506a3fe483d67d1ed878e1602bdd5912a134e6202c1ec672233241c1"}, - {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:e09469a2cec88fb7b078e16d4adec594414397e8879a4341c6ace96013463d5b"}, - {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:3e970a2119507d0b104f0a8e281521ad28fc26f2820687b3436b8c9a5fcf20d1"}, - {file = "cryptography-42.0.4-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:e53dc41cda40b248ebc40b83b31516487f7db95ab8ceac1f042626bc43a2f992"}, - {file = "cryptography-42.0.4-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:c3a5cbc620e1e17009f30dd34cb0d85c987afd21c41a74352d1719be33380885"}, - {file = "cryptography-42.0.4-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:6bfadd884e7280df24d26f2186e4e07556a05d37393b0f220a840b083dc6a824"}, - {file = "cryptography-42.0.4-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:01911714117642a3f1792c7f376db572aadadbafcd8d75bb527166009c9f1d1b"}, - {file = "cryptography-42.0.4-cp37-abi3-win32.whl", hash = "sha256:fb0cef872d8193e487fc6bdb08559c3aa41b659a7d9be48b2e10747f47863925"}, - {file = "cryptography-42.0.4-cp37-abi3-win_amd64.whl", hash = "sha256:c1f25b252d2c87088abc8bbc4f1ecbf7c919e05508a7e8628e6875c40bc70923"}, - {file = "cryptography-42.0.4-cp39-abi3-macosx_10_12_universal2.whl", hash = "sha256:15a1fb843c48b4a604663fa30af60818cd28f895572386e5f9b8a665874c26e7"}, - {file = "cryptography-42.0.4-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1327f280c824ff7885bdeef8578f74690e9079267c1c8bd7dc5cc5aa065ae52"}, - {file = "cryptography-42.0.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ffb03d419edcab93b4b19c22ee80c007fb2d708429cecebf1dd3258956a563a"}, - {file = "cryptography-42.0.4-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:1df6fcbf60560d2113b5ed90f072dc0b108d64750d4cbd46a21ec882c7aefce9"}, - {file = "cryptography-42.0.4-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:44a64043f743485925d3bcac548d05df0f9bb445c5fcca6681889c7c3ab12764"}, - {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:3c6048f217533d89f2f8f4f0fe3044bf0b2090453b7b73d0b77db47b80af8dff"}, - {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:6d0fbe73728c44ca3a241eff9aefe6496ab2656d6e7a4ea2459865f2e8613257"}, - {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:887623fe0d70f48ab3f5e4dbf234986b1329a64c066d719432d0698522749929"}, - {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:ce8613beaffc7c14f091497346ef117c1798c202b01153a8cc7b8e2ebaaf41c0"}, - {file = "cryptography-42.0.4-cp39-abi3-win32.whl", hash = "sha256:810bcf151caefc03e51a3d61e53335cd5c7316c0a105cc695f0959f2c638b129"}, - {file = "cryptography-42.0.4-cp39-abi3-win_amd64.whl", hash = 
"sha256:a0298bdc6e98ca21382afe914c642620370ce0470a01e1bef6dd9b5354c36854"}, - {file = "cryptography-42.0.4-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5f8907fcf57392cd917892ae83708761c6ff3c37a8e835d7246ff0ad251d9298"}, - {file = "cryptography-42.0.4-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:12d341bd42cdb7d4937b0cabbdf2a94f949413ac4504904d0cdbdce4a22cbf88"}, - {file = "cryptography-42.0.4-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:1cdcdbd117681c88d717437ada72bdd5be9de117f96e3f4d50dab3f59fd9ab20"}, - {file = "cryptography-42.0.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:0e89f7b84f421c56e7ff69f11c441ebda73b8a8e6488d322ef71746224c20fce"}, - {file = "cryptography-42.0.4-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f1e85a178384bf19e36779d91ff35c7617c885da487d689b05c1366f9933ad74"}, - {file = "cryptography-42.0.4-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:d2a27aca5597c8a71abbe10209184e1a8e91c1fd470b5070a2ea60cafec35bcd"}, - {file = "cryptography-42.0.4-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4e36685cb634af55e0677d435d425043967ac2f3790ec652b2b88ad03b85c27b"}, - {file = "cryptography-42.0.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:f47be41843200f7faec0683ad751e5ef11b9a56a220d57f300376cd8aba81660"}, - {file = "cryptography-42.0.4.tar.gz", hash = "sha256:831a4b37accef30cccd34fcb916a5d7b5be3cbbe27268a02832c3e450aea39cb"}, + {file = "cryptography-43.0.1-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:8385d98f6a3bf8bb2d65a73e17ed87a3ba84f6991c155691c51112075f9ffc5d"}, + {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:27e613d7077ac613e399270253259d9d53872aaf657471473ebfc9a52935c062"}, + {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:68aaecc4178e90719e95298515979814bda0cbada1256a4485414860bd7ab962"}, + {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:de41fd81a41e53267cb020bb3a7212861da53a7d39f863585d13ea11049cf277"}, + {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:f98bf604c82c416bc829e490c700ca1553eafdf2912a91e23a79d97d9801372a"}, + {file = "cryptography-43.0.1-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:61ec41068b7b74268fa86e3e9e12b9f0c21fcf65434571dbb13d954bceb08042"}, + {file = "cryptography-43.0.1-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:014f58110f53237ace6a408b5beb6c427b64e084eb451ef25a28308270086494"}, + {file = "cryptography-43.0.1-cp37-abi3-win32.whl", hash = "sha256:2bd51274dcd59f09dd952afb696bf9c61a7a49dfc764c04dd33ef7a6b502a1e2"}, + {file = "cryptography-43.0.1-cp37-abi3-win_amd64.whl", hash = "sha256:666ae11966643886c2987b3b721899d250855718d6d9ce41b521252a17985f4d"}, + {file = "cryptography-43.0.1-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:ac119bb76b9faa00f48128b7f5679e1d8d437365c5d26f1c2c3f0da4ce1b553d"}, + {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1bbcce1a551e262dfbafb6e6252f1ae36a248e615ca44ba302df077a846a8806"}, + {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58d4e9129985185a06d849aa6df265bdd5a74ca6e1b736a77959b498e0505b85"}, + {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:d03a475165f3134f773d1388aeb19c2d25ba88b6a9733c5c590b9ff7bbfa2e0c"}, + {file = 
"cryptography-43.0.1-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:511f4273808ab590912a93ddb4e3914dfd8a388fed883361b02dea3791f292e1"}, + {file = "cryptography-43.0.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:80eda8b3e173f0f247f711eef62be51b599b5d425c429b5d4ca6a05e9e856baa"}, + {file = "cryptography-43.0.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:38926c50cff6f533f8a2dae3d7f19541432610d114a70808f0926d5aaa7121e4"}, + {file = "cryptography-43.0.1-cp39-abi3-win32.whl", hash = "sha256:a575913fb06e05e6b4b814d7f7468c2c660e8bb16d8d5a1faf9b33ccc569dd47"}, + {file = "cryptography-43.0.1-cp39-abi3-win_amd64.whl", hash = "sha256:d75601ad10b059ec832e78823b348bfa1a59f6b8d545db3a24fd44362a1564cb"}, + {file = "cryptography-43.0.1-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ea25acb556320250756e53f9e20a4177515f012c9eaea17eb7587a8c4d8ae034"}, + {file = "cryptography-43.0.1-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c1332724be35d23a854994ff0b66530119500b6053d0bd3363265f7e5e77288d"}, + {file = "cryptography-43.0.1-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:fba1007b3ef89946dbbb515aeeb41e30203b004f0b4b00e5e16078b518563289"}, + {file = "cryptography-43.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:5b43d1ea6b378b54a1dc99dd8a2b5be47658fe9a7ce0a58ff0b55f4b43ef2b84"}, + {file = "cryptography-43.0.1-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:88cce104c36870d70c49c7c8fd22885875d950d9ee6ab54df2745f83ba0dc365"}, + {file = "cryptography-43.0.1-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:9d3cdb25fa98afdd3d0892d132b8d7139e2c087da1712041f6b762e4f807cc96"}, + {file = "cryptography-43.0.1-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:e710bf40870f4db63c3d7d929aa9e09e4e7ee219e703f949ec4073b4294f6172"}, + {file = "cryptography-43.0.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7c05650fe8023c5ed0d46793d4b7d7e6cd9c04e68eabe5b0aeea836e37bdcec2"}, + {file = "cryptography-43.0.1.tar.gz", hash = "sha256:203e92a75716d8cfb491dc47c79e17d0d9207ccffcbcb35f598fbe463ae3444d"}, ] [package.dependencies] @@ -1034,7 +1029,7 @@ nox = ["nox"] pep8test = ["check-sdist", "click", "mypy", "ruff"] sdist = ["build"] ssh = ["bcrypt (>=3.1.5)"] -test = ["certifi", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"] +test = ["certifi", "cryptography-vectors (==43.0.1)", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"] test-randomorder = ["pytest-randomly"] [[package]] From 1a9b54f1d99fb373eddc7f3ff57174031d34c7b6 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 4 Sep 2024 15:00:40 +0100 Subject: [PATCH 1557/1571] storage controller: read from database in validate API (#8784) ## Problem The initial implementation of the validate API treats the in-memory generations as authoritative. - This is true when only one storage controller is running, but if a rogue controller was running that hadn't been shut down properly, and some pageserver requests were routed to that bad controller, it could incorrectly return valid=true for stale generations. - The generation in the main in-memory map gets out of date while a live migration is in flight, and if the origin location for the migration tries to do some deletions even though it is in AttachedStale (for example because it had already started compaction), these might be wrongly validated + executed. 
## Summary of changes - Continue to do the in-memory check: if this returns valid=false it is sufficient to reject requests. - When valid=true, do an additional read from the database to confirm the generation is fresh. - Revise behavior for validation on missing shards: this used to always return valid=true as a convenience for deletions and shard splits, so that pageservers weren't prevented from completing any enqueued deletions for these shards after they're gone. However, this becomes unsafe when we consider split brain scenarios. We could reinstate this in future if we wanted to store some tombstones for deleted shards. - Update test_scrubber_physical_gc to cope with the behavioral change: they must now explicitly flush the deletion queue before splits, to avoid tripping up on deletions that are enqueued at the time of the split (these tests assert "scrubber deletes nothing", which check fails if the split leaves behind some remote objects that are legitimately GC'able) - Add `test_storage_controller_validate_during_migration`, which uses failpoints to create a situation where incorrect generation validation during a live migration could result in a corruption The rate of validate calls for tenants is pretty low: it happens as a consequence deletions from GC and compaction, which are both concurrency-limited on the pageserver side. --- storage_controller/src/http.rs | 2 +- storage_controller/src/persistence.rs | 70 ++++++++++- storage_controller/src/reconciler.rs | 3 + storage_controller/src/service.rs | 91 ++++++++++---- .../regress/test_storage_controller.py | 116 ++++++++++++++++++ test_runner/regress/test_storage_scrubber.py | 11 ++ 6 files changed, 261 insertions(+), 32 deletions(-) diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 0fa4f4fd0e..32882c201a 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -102,7 +102,7 @@ async fn handle_validate(mut req: Request) -> Result, ApiEr let validate_req = json_request::(&mut req).await?; let state = get_state(&req); - json_response(StatusCode::OK, state.service.validate(validate_req)) + json_response(StatusCode::OK, state.service.validate(validate_req).await?) } /// Call into this before attaching a tenant to a pageserver, to acquire a generation number diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index d03eb87242..e801289752 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -8,6 +8,7 @@ use self::split_state::SplitState; use diesel::pg::PgConnection; use diesel::prelude::*; use diesel::Connection; +use itertools::Itertools; use pageserver_api::controller_api::MetadataHealthRecord; use pageserver_api::controller_api::ShardSchedulingPolicy; use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy}; @@ -91,7 +92,8 @@ pub(crate) enum DatabaseOperation { Detach, ReAttach, IncrementGeneration, - PeekGenerations, + TenantGenerations, + ShardGenerations, ListTenantShards, InsertTenantShards, UpdateTenantShard, @@ -544,13 +546,13 @@ impl Persistence { /// If the tenant doesn't exist, an empty vector is returned. 
/// /// Output is sorted by shard number - pub(crate) async fn peek_generations( + pub(crate) async fn tenant_generations( &self, filter_tenant_id: TenantId, ) -> Result, DatabaseError> { use crate::schema::tenant_shards::dsl::*; let rows = self - .with_measured_conn(DatabaseOperation::PeekGenerations, move |conn| { + .with_measured_conn(DatabaseOperation::TenantGenerations, move |conn| { let result = tenant_shards .filter(tenant_id.eq(filter_tenant_id.to_string())) .select(TenantShardPersistence::as_select()) @@ -572,6 +574,64 @@ impl Persistence { .collect()) } + /// Read the generation number of specific tenant shards + /// + /// Output is unsorted. Output may not include values for all inputs, if they are missing in the database. + pub(crate) async fn shard_generations( + &self, + mut tenant_shard_ids: impl Iterator, + ) -> Result)>, DatabaseError> { + let mut rows = Vec::with_capacity(tenant_shard_ids.size_hint().0); + + // We will chunk our input to avoid composing arbitrarily long `IN` clauses. Typically we are + // called with a single digit number of IDs, but in principle we could be called with tens + // of thousands (all the shards on one pageserver) from the generation validation API. + loop { + // A modest hardcoded chunk size to handle typical cases in a single query but never generate particularly + // large query strings. + let chunk_ids = tenant_shard_ids.by_ref().take(32); + + // Compose a comma separated list of tuples for matching on (tenant_id, shard_number, shard_count) + let in_clause = chunk_ids + .map(|tsid| { + format!( + "('{}', {}, {})", + tsid.tenant_id, tsid.shard_number.0, tsid.shard_count.0 + ) + }) + .join(","); + + // We are done when our iterator gives us nothing to filter on + if in_clause.is_empty() { + break; + } + + let chunk_rows = self + .with_measured_conn(DatabaseOperation::ShardGenerations, move |conn| { + // diesel doesn't support multi-column IN queries, so we compose raw SQL. No escaping is required because + // the inputs are strongly typed and cannot carry any user-supplied raw string content. + let result : Vec = diesel::sql_query( + format!("SELECT * from tenant_shards where (tenant_id, shard_number, shard_count) in ({in_clause});").as_str() + ).load(conn)?; + + Ok(result) + }) + .await?; + rows.extend(chunk_rows.into_iter()) + } + + Ok(rows + .into_iter() + .map(|tsp| { + ( + tsp.get_tenant_shard_id() + .expect("Bad tenant ID in database"), + tsp.generation.map(|g| Generation::new(g as u32)), + ) + }) + .collect()) + } + #[allow(non_local_definitions)] /// For use when updating a persistent property of a tenant, such as its config or placement_policy. 
/// @@ -983,7 +1043,9 @@ impl Persistence { } /// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably -#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)] +#[derive( + QueryableByName, Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq, +)] #[diesel(table_name = crate::schema::tenant_shards)] pub(crate) struct TenantShardPersistence { #[serde(default)] diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 102a3124d2..83b7b2b4f2 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -17,6 +17,7 @@ use utils::failpoint_support; use utils::generation::Generation; use utils::id::{NodeId, TimelineId}; use utils::lsn::Lsn; +use utils::pausable_failpoint; use utils::sync::gate::GateGuard; use crate::compute_hook::{ComputeHook, NotifyError}; @@ -593,6 +594,8 @@ impl Reconciler { notify_attempts += 1; } + pausable_failpoint!("reconciler-live-migrate-post-notify"); + // Downgrade the origin to secondary. If the tenant's policy is PlacementPolicy::Attached(0), then // this location will be deleted in the general case reconciliation that runs after this. let origin_secondary_conf = build_location_config( diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 4ccc5c951c..90334d10a7 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1854,37 +1854,74 @@ impl Service { Ok(response) } - pub(crate) fn validate(&self, validate_req: ValidateRequest) -> ValidateResponse { - let locked = self.inner.read().unwrap(); + pub(crate) async fn validate( + &self, + validate_req: ValidateRequest, + ) -> Result { + // Fast in-memory check: we may reject validation on anything that doesn't match our + // in-memory generation for a shard + let in_memory_result = { + let mut in_memory_result = Vec::new(); + let locked = self.inner.read().unwrap(); + for req_tenant in validate_req.tenants { + if let Some(tenant_shard) = locked.tenants.get(&req_tenant.id) { + let valid = tenant_shard.generation == Some(Generation::new(req_tenant.gen)); + tracing::info!( + "handle_validate: {}(gen {}): valid={valid} (latest {:?})", + req_tenant.id, + req_tenant.gen, + tenant_shard.generation + ); + + in_memory_result.push((req_tenant.id, Generation::new(req_tenant.gen), valid)); + } else { + // This is legal: for example during a shard split the pageserver may still + // have deletions in its queue from the old pre-split shard, or after deletion + // of a tenant that was busy with compaction/gc while being deleted. + tracing::info!( + "Refusing deletion validation for missing shard {}", + req_tenant.id + ); + } + } + + in_memory_result + }; + + // Database calls to confirm validity for anything that passed the in-memory check. We must do this + // in case of controller split-brain, where some other controller process might have incremented the generation. 
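To make the two-phase flow concrete, here is a minimal sketch of the decision logic only (simplified stand-in types, not the controller's real API): shards that fail the in-memory comparison are rejected immediately, and only the survivors are re-checked against the durable store, whose generation may have been advanced by another controller instance.

```rust
use std::collections::HashMap;

// Hypothetical stand-ins for TenantShardId and Generation, for illustration only.
type ShardId = u64;
type Gen = u32;

/// Phase 1: compare against in-memory state. Phase 2: confirm survivors against
/// the durable store. Anything missing or mismatched anywhere is invalid.
fn validate(
    requests: &[(ShardId, Gen)],
    in_memory: &HashMap<ShardId, Gen>,
    database: &HashMap<ShardId, Gen>,
) -> Vec<(ShardId, bool)> {
    requests
        .iter()
        .map(|&(id, generation)| {
            let passes_in_memory = in_memory.get(&id) == Some(&generation);
            // Only shards that pass the cheap check incur a database lookup.
            let valid = passes_in_memory && database.get(&id) == Some(&generation);
            (id, valid)
        })
        .collect()
}

fn main() {
    let in_memory: HashMap<ShardId, Gen> = HashMap::from([(1, 5), (2, 7)]);
    // Shard 2 was advanced to generation 8 by another controller: its stale request must fail.
    let database: HashMap<ShardId, Gen> = HashMap::from([(1, 5), (2, 8)]);
    let requests = vec![(1, 5), (2, 7), (3, 1)];
    assert_eq!(
        validate(&requests, &in_memory, &database),
        vec![(1, true), (2, false), (3, false)]
    );
}
```

The real implementation batches the database reads for all surviving shards into a single `shard_generations` call rather than looking them up one at a time.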
+ let db_generations = self + .persistence + .shard_generations(in_memory_result.iter().filter_map(|i| { + if i.2 { + Some(&i.0) + } else { + None + } + })) + .await?; + let db_generations = db_generations.into_iter().collect::>(); let mut response = ValidateResponse { tenants: Vec::new(), }; - - for req_tenant in validate_req.tenants { - if let Some(tenant_shard) = locked.tenants.get(&req_tenant.id) { - let valid = tenant_shard.generation == Some(Generation::new(req_tenant.gen)); - tracing::info!( - "handle_validate: {}(gen {}): valid={valid} (latest {:?})", - req_tenant.id, - req_tenant.gen, - tenant_shard.generation - ); - response.tenants.push(ValidateResponseTenant { - id: req_tenant.id, - valid, - }); + for (tenant_shard_id, validate_generation, valid) in in_memory_result.into_iter() { + let valid = if valid { + let db_generation = db_generations.get(&tenant_shard_id); + db_generation == Some(&Some(validate_generation)) } else { - // After tenant deletion, we may approve any validation. This avoids - // spurious warnings on the pageserver if it has pending LSN updates - // at the point a deletion happens. - response.tenants.push(ValidateResponseTenant { - id: req_tenant.id, - valid: true, - }); - } + // If in-memory state says it's invalid, trust that. It's always safe to fail a validation, at worst + // this prevents a pageserver from cleaning up an object in S3. + false + }; + + response.tenants.push(ValidateResponseTenant { + id: tenant_shard_id, + valid, + }) } - response + + Ok(response) } pub(crate) async fn tenant_create( @@ -3179,7 +3216,7 @@ impl Service { // run concurrently with reconciliations, and it is not guaranteed that the node we find here // will still be the latest when we're done: we will check generations again at the end of // this function to handle that. - let generations = self.persistence.peek_generations(tenant_id).await?; + let generations = self.persistence.tenant_generations(tenant_id).await?; if generations .iter() @@ -3236,7 +3273,7 @@ impl Service { // Post-check: are all the generations of all the shards the same as they were initially? This proves that // our remote operation executed on the latest generation and is therefore persistent. { - let latest_generations = self.persistence.peek_generations(tenant_id).await?; + let latest_generations = self.persistence.tenant_generations(tenant_id).await?; if latest_generations .into_iter() .map( diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 13f5ec1b4f..8da42294b0 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -2332,6 +2332,122 @@ def test_storage_controller_timeline_crud_race(neon_env_builder: NeonEnvBuilder) ).timeline_create(PgVersion.NOT_SET, tenant_id, create_timeline_id) +def test_storage_controller_validate_during_migration(neon_env_builder: NeonEnvBuilder): + """ + A correctness edge case: while we are live migrating and a shard's generation is + visible to the Reconciler but not to the central Service, the generation validation + API should still prevent stale generations from doing deletions. 
+ """ + neon_env_builder.num_pageservers = 2 + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + env = neon_env_builder.init_configs() + env.start() + + TENANT_CONF = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": 128 * 1024, + "compaction_threshold": 1, + "compaction_target_size": 128 * 1024, + # disable background compaction and GC. We invoke it manually when we want it to happen. + "gc_period": "0s", + "compaction_period": "0s", + } + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + env.neon_cli.create_tenant(tenant_id, timeline_id) + env.storage_controller.pageserver_api().set_tenant_config(tenant_id, TENANT_CONF) + + # Write enough data that a compaction would do some work (deleting some L0s) + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(64) + for _i in range(0, 2): + workload.churn_rows(64, upload=False) + + # Upload but don't compact + origin_pageserver = env.get_tenant_pageserver(tenant_id) + dest_ps_id = [p.id for p in env.pageservers if p.id != origin_pageserver.id][0] + origin_pageserver.http_client().timeline_checkpoint( + tenant_id, timeline_id, wait_until_uploaded=True, compact=False + ) + + # Start a compaction that will pause on a failpoint. + compaction_failpoint = "before-upload-index-pausable" + origin_pageserver.http_client().configure_failpoints((compaction_failpoint, "pause")) + + # This failpoint can also cause migration code to time out trying to politely flush + # during migrations + origin_pageserver.allowed_errors.append(".*Timed out waiting for flush to remote storage.*") + + try: + with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: + compact_fut = executor.submit( + origin_pageserver.http_client().timeline_compact, + tenant_id, + timeline_id, + wait_until_uploaded=True, + ) + + # Let the compaction start and then get stuck uploading an index: when we live migrate, the new generation's + # index will be initialized from the pre-compaction index, referencing layers that the compaction will try to delete + def has_hit_compaction_failpoint(): + assert origin_pageserver.log_contains(f"at failpoint {compaction_failpoint}") + + wait_until(10, 1, has_hit_compaction_failpoint) + + # While the compaction is running, start a live migration which will pause long enough for the compaction to sleep, + # after incrementing generation and attaching the new location + migration_failpoint = "reconciler-live-migrate-post-notify" + env.storage_controller.configure_failpoints((migration_failpoint, "pause")) + migrate_fut = executor.submit( + env.storage_controller.tenant_shard_migrate, + TenantShardId(tenant_id, 0, 0), + dest_ps_id, + ) + + def has_hit_migration_failpoint(): + assert env.storage_controller.log_contains(f"at failpoint {migration_failpoint}") + + # Long wait because the migration will have to time out during transition to AttachedStale + # before it reaches this point. The timeout is because the AttachedStale transition includes + # a flush of remote storage, and if the compaction already enqueued an index upload this cannot + # make progress. + wait_until(60, 1, has_hit_migration_failpoint) + + # Origin pageserver has succeeded with compaction before the migration completed. 
It has done all the writes it wanted to do in its own (stale) generation + origin_pageserver.http_client().configure_failpoints((compaction_failpoint, "off")) + compact_fut.result() + origin_pageserver.http_client().deletion_queue_flush(execute=True) + + # Eventually migration completes + env.storage_controller.configure_failpoints((migration_failpoint, "off")) + migrate_fut.result() + except: + # Always disable 'pause' failpoints, even on failure, to avoid hanging in shutdown + env.storage_controller.configure_failpoints((migration_failpoint, "off")) + origin_pageserver.http_client().configure_failpoints((compaction_failpoint, "off")) + raise + + # Ensure the destination of the migration writes an index, so that, if it has corrupt state, that is + # visible to the scrubber. + workload.write_rows(1, upload=False) + env.get_pageserver(dest_ps_id).http_client().timeline_checkpoint( + tenant_id, timeline_id, wait_until_uploaded=True, compact=False + ) + + # The destination of the live migration would now have a corrupt index (referencing deleted L0s) if + # the controller had not properly applied validation rules. + healthy, _summary = env.storage_scrubber.scan_metadata() + try: + log.info(f"scrubbed, healthy={healthy}") + assert healthy + except: + # On failures, we want to report them FAIL during the test, not as ERROR during teardown + neon_env_builder.enable_scrub_on_exit = False + raise + + @run_only_on_default_postgres("this is like a 'unit test' against storcon db") def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_configs() diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 292a9a1010..848e214c5e 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -217,6 +217,13 @@ def test_scrubber_physical_gc_ancestors( workload.init() workload.write_rows(100) + # Issue a deletion queue flush so that the parent shard can't leave behind layers + # that will look like unexpected garbage to the scrubber + for pre_split_shard in env.storage_controller.locate(tenant_id): + env.get_pageserver(pre_split_shard["node_id"]).http_client().deletion_queue_flush( + execute=True + ) + new_shard_count = 4 assert shard_count is None or new_shard_count > shard_count shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count) @@ -321,6 +328,10 @@ def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder workload.write_rows(100, upload=False) workload.stop() + # Issue a deletion queue flush so that the parent shard can't leave behind layers + # that will look like unexpected garbage to the scrubber + env.get_tenant_pageserver(tenant_id).http_client().deletion_queue_flush(execute=True) + new_shard_count = 4 shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count) for shard in shards: From 0205ce184967f4510b6034bf2051a495bf464b44 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 4 Sep 2024 17:41:51 +0300 Subject: [PATCH 1558/1571] Update submodule reference for vendor/postgres-v14 (#8913) There was confusion on the REL_14_STABLE_neon branch. PR https://github.com/neondatabase/postgres/pull/471 was merged to the branch, but the corresponding PRs on the other REL_15_STABLE_neon and REL_16_STABLE_neon branches were not merged.
Also, the submodule reference in the neon repository was never updated, so even though the REL_14_STABLE_neon branch contained the commit, it was never used. That PR https://github.com/neondatabase/postgres/pull/471 was a few bricks shy of a load (no tests, some differences between the different branches), so to get us to a good state, revert that change from the REL_14_STABLE_neon branch. This PR in the neon repository updates the submodule reference past two commits on the REL_14_STABLE_neon branch: first the commit from PR https://github.com/neondatabase/postgres/pull/471, and immediately after that the revert of the same commit. This brings us back to square one, but now the submodule reference matches the tip of the REL_14_STABLE_neon branch again. --- vendor/postgres-v14 | 2 +- vendor/revisions.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 7602e907ab..a317b9b5b9 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 7602e907ab30f16188bebfd66b8f297c2889d339 +Subproject commit a317b9b5b96978b49e78986697f3dd80d06f99a7 diff --git a/vendor/revisions.json b/vendor/revisions.json index 751b9e8679..e52576e61f 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -9,6 +9,6 @@ ], "v14": [ "14.13", - "7602e907ab30f16188bebfd66b8f297c2889d339" + "a317b9b5b96978b49e78986697f3dd80d06f99a7" ] } From 99fa1c36004d710c65a47ffefaf66b4b5c6b4ce1 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 5 Sep 2024 04:45:04 +0800 Subject: [PATCH 1559/1571] fix(pageserver): more information on aux v1 warnings (#8906) Part of https://github.com/neondatabase/neon/issues/8623 ## Summary of changes It seems that we have tenants with the aux policy set to v1 that don't have any aux files in storage. It is still safe to force-migrate them without notifying the customers. This patch adds more details to the warning to identify the cases where we have to reach out to the users before retiring aux v1.
--------- Signed-off-by: Alex Chi Z --- pageserver/src/pgdatadir_mapping.rs | 10 +++++++--- pageserver/src/tenant/timeline.rs | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index c26abca1f7..d28a214265 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -729,8 +729,12 @@ impl Timeline { let current_policy = self.last_aux_file_policy.load(); match current_policy { Some(AuxFilePolicy::V1) => { - warn!("this timeline is using deprecated aux file policy V1 (policy=V1)"); - self.list_aux_files_v1(lsn, ctx).await + let res = self.list_aux_files_v1(lsn, ctx).await?; + let empty_str = if res.is_empty() { ", empty" } else { "" }; + warn!( + "this timeline is using deprecated aux file policy V1 (policy=v1{empty_str})" + ); + Ok(res) } None => { let res = self.list_aux_files_v1(lsn, ctx).await?; @@ -1657,7 +1661,7 @@ impl<'a> DatadirModification<'a> { if aux_files_key_v1.is_empty() { None } else { - warn!("this timeline is using deprecated aux file policy V1"); + warn!("this timeline is using deprecated aux file policy V1 (detected existing v1 files)"); self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?; Some(AuxFilePolicy::V1) } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 6eadf9a564..3b8f19a6c0 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2243,7 +2243,7 @@ impl Timeline { }; if aux_file_policy == Some(AuxFilePolicy::V1) { - warn!("this timeline is using deprecated aux file policy V1"); + warn!("this timeline is using deprecated aux file policy V1 (when loading the timeline)"); } result.repartition_threshold = From 708322ce3c0d55bcee5ee9e3632ecfb8c37415f5 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 5 Sep 2024 09:56:26 +0100 Subject: [PATCH 1560/1571] storcon: handle fills including high tput tenants more gracefully (#8865) ## Problem A tenant may ingest a lot of data between being drained for node restart and being moved back in the fill phase. This is expensive and causes the fill to stall. ## Summary of changes We make a tactical change to reduce secondary warm-up time for migrations in fills. --- storage_controller/src/service.rs | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 90334d10a7..ca416095bb 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -6297,9 +6297,13 @@ impl Service { node_id: NodeId, cancel: CancellationToken, ) -> Result<(), OperationError> { - // TODO(vlad): Currently this operates on the assumption that all - // secondaries are warm. This is not always true (e.g. we just migrated the - // tenant). Take that into consideration by checking the secondary status. 
+ const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(20); + const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT: Duration = Duration::from_secs(5); + let reconciler_config = ReconcilerConfigBuilder::new() + .secondary_warmup_timeout(SECONDARY_WARMUP_TIMEOUT) + .secondary_download_request_timeout(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT) + .build(); + let mut tids_to_promote = self.fill_node_plan(node_id); let mut waiters = Vec::new(); @@ -6367,9 +6371,11 @@ impl Service { node_id ); - if let Some(waiter) = - self.maybe_reconcile_shard(tenant_shard, nodes) - { + if let Some(waiter) = self.maybe_configured_reconcile_shard( + tenant_shard, + nodes, + reconciler_config, + ) { waiters.push(waiter); } } From 6dfbf49128c4392464d6832ccc2e6bdc390b0b37 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Thu, 5 Sep 2024 13:34:27 +0200 Subject: [PATCH 1561/1571] proxy: don't let one timeout eat entire retry budget (#8924) This reduces the per-request timeout to 10sec while keeping the total retry duration at 1min. Relates: neondatabase/cloud#15944 --- proxy/src/http.rs | 9 ++++++--- proxy/src/usage_metrics.rs | 8 ++++++-- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/proxy/src/http.rs b/proxy/src/http.rs index fee634f67f..c77d95f47d 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -35,14 +35,17 @@ pub fn new_client() -> ClientWithMiddleware { .build() } -pub(crate) fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware { +pub(crate) fn new_client_with_timeout( + request_timeout: Duration, + total_retry_duration: Duration, +) -> ClientWithMiddleware { let timeout_client = reqwest::ClientBuilder::new() - .timeout(default_timout) + .timeout(request_timeout) .build() .expect("Failed to create http client with timeout"); let retry_policy = - ExponentialBackoff::builder().build_with_total_retry_duration(default_timout); + ExponentialBackoff::builder().build_with_total_retry_duration(total_retry_duration); reqwest_middleware::ClientBuilder::new(timeout_client) .with(reqwest_tracing::TracingMiddleware::default()) diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index aa8c7ba319..fd8599bcb3 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -33,7 +33,8 @@ use uuid::{NoContext, Timestamp}; const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client"; -const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60); +const HTTP_REPORTING_REQUEST_TIMEOUT: Duration = Duration::from_secs(10); +const HTTP_REPORTING_RETRY_DURATION: Duration = Duration::from_secs(60); /// Key that uniquely identifies the object, this metric describes. 
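The split introduced above, a short per-request timeout inside a larger total retry budget, is a general pattern. A minimal synchronous sketch of it (a stand-in `attempt` closure instead of an actual HTTP client, and illustrative constants):

```rust
use std::time::{Duration, Instant};

/// Retry until `attempt` succeeds or the total budget is spent. Each attempt is
/// handed its own short timeout so a single slow request cannot consume the whole budget.
fn retry_with_budget<T>(
    per_request_timeout: Duration,
    total_retry_budget: Duration,
    mut attempt: impl FnMut(Duration) -> Option<T>,
) -> Option<T> {
    let deadline = Instant::now() + total_retry_budget;
    let mut backoff = Duration::from_millis(100);
    while Instant::now() < deadline {
        if let Some(value) = attempt(per_request_timeout) {
            return Some(value);
        }
        // Exponential backoff between attempts, clipped to the remaining budget.
        std::thread::sleep(backoff.min(deadline.saturating_duration_since(Instant::now())));
        backoff = backoff * 2;
    }
    None
}

fn main() {
    let mut calls = 0;
    let result = retry_with_budget(Duration::from_millis(10), Duration::from_secs(2), |_timeout| {
        calls += 1;
        (calls >= 3).then_some("ok") // fail twice, then succeed
    });
    assert_eq!(result, Some("ok"));
    assert_eq!(calls, 3);
}
```

With a single knob, one stalled request could block for the full minute and leave no time for retries; splitting the knobs keeps individual attempts short while preserving the overall budget.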
/// Currently, endpoint_id is enough, but this may change later, @@ -223,7 +224,10 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result Date: Thu, 5 Sep 2024 14:59:49 +0200 Subject: [PATCH 1562/1571] refactor(pageserver): rely on serde derive for toml deserialization (#7656) This PR simplifies the pageserver configuration parsing as follows: * introduce the `pageserver_api::config::ConfigToml` type * implement `Default` for `ConfigToml` * use serde derive to do the brain-dead leg-work of processing the toml document * use `serde(default)` to fill in default values * in `pageserver` crate: * use `toml_edit` to deserialize the pageserver.toml string into a `ConfigToml` * `PageServerConfig::parse_and_validate` then * consumes the `ConfigToml` * destructures it exhaustively into its constituent fields * constructs the `PageServerConfig` The rules are: * in `ConfigToml`, use `deny_unknown_fields` everywhere * static default values go in `pageserver_api` * if there cannot be a static default value (e.g. which default IO engine to use, because it depends on the runtime), make the field in `ConfigToml` an `Option` * if runtime-augmentation of a value is needed, do that in `parse_and_validate` * a good example is `virtual_file_io_engine` or `l0_flush`, both of which need to execute code to determine the effective value in `PageServerConf` The benefits: * massive amount of brain-dead repetitive code can be deleted * "unused variable" compile-time errors when removing a config value, due to the exhaustive destructuring in `parse_and_validate` * compile-time errors guide you when adding a new config field Drawbacks: * serde derive is sometimes a bit too magical * `deny_unknown_fields` is easy to miss Future Work / Benefits: * make `neon_local` use `pageserver_api` to construct `ConfigToml` and write it to `pageserver.toml` * This provides more type safety / coompile-time errors than the current approach. 
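The rules above are easiest to see on a toy struct. A minimal sketch (illustrative field names; it uses the `serde`, `toml`, and `humantime-serde` crates for brevity, whereas the PR itself wires up `toml_edit` plus `serde_path_to_error`):

```rust
use serde::Deserialize;
use std::time::Duration;

/// Not the real ConfigToml, just the shape of the pattern: every field falls back
/// to `Default` when missing, unknown keys are rejected, and a value whose default
/// can only be decided at runtime is modelled as an `Option`.
#[derive(Debug, Deserialize)]
#[serde(default, deny_unknown_fields)]
struct ExampleConfig {
    listen_http_addr: String,
    #[serde(with = "humantime_serde")]
    wait_lsn_timeout: Duration,
    /// `None` means "let the consuming crate decide", e.g. per platform.
    virtual_file_io_engine: Option<String>,
}

impl Default for ExampleConfig {
    fn default() -> Self {
        Self {
            listen_http_addr: "127.0.0.1:9898".to_string(),
            wait_lsn_timeout: Duration::from_secs(300),
            virtual_file_io_engine: None,
        }
    }
}

fn main() {
    // Only one key is given; the rest come from Default. An unknown key would error.
    let cfg: ExampleConfig = toml::from_str("wait_lsn_timeout = '10 s'").unwrap();
    assert_eq!(cfg.listen_http_addr, "127.0.0.1:9898");
    assert_eq!(cfg.wait_lsn_timeout, Duration::from_secs(10));
    assert!(cfg.virtual_file_io_engine.is_none());
    assert!(toml::from_str::<ExampleConfig>("no_such_field = 1").is_err());
}
```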
### Refs Fixes #3682 ### Future Work * `remote_storage` deser doesn't reject unknown fields https://github.com/neondatabase/neon/issues/8915 * clean up `libs/pageserver_api/src/config.rs` further * break up into multiple files, at least for tenant config * move `models` as appropriate / refine distinction between config and API models / be explicit about when it's the same * use `pub(crate)` visibility on `mod defaults` to detect stale values --- Cargo.lock | 13 + Cargo.toml | 1 + libs/pageserver_api/Cargo.toml | 10 + libs/pageserver_api/src/config.rs | 527 +++++- libs/pageserver_api/src/models.rs | 71 +- libs/remote_storage/src/config.rs | 25 + libs/utils/src/logging.rs | 12 +- pageserver/Cargo.toml | 3 +- pageserver/benches/bench_ingest.rs | 4 +- pageserver/ctl/src/layer_map_analyzer.rs | 3 +- pageserver/ctl/src/layers.rs | 3 +- pageserver/ctl/src/main.rs | 3 +- pageserver/src/bin/pageserver.rs | 31 +- pageserver/src/config.rs | 1539 +++-------------- pageserver/src/disk_usage_eviction_task.rs | 48 +- pageserver/src/http/routes.rs | 4 +- pageserver/src/l0_flush.rs | 14 +- pageserver/src/statvfs.rs | 28 +- pageserver/src/tenant/config.rs | 196 +-- .../src/tenant/storage_layer/delta_layer.rs | 3 +- .../src/tenant/storage_layer/image_layer.rs | 4 +- .../tenant/storage_layer/inmemory_layer.rs | 2 +- pageserver/src/tenant/tasks.rs | 9 +- pageserver/src/tenant/timeline.rs | 2 +- pageserver/src/tenant/timeline/compaction.rs | 42 +- pageserver/src/tenant/vectored_blob_io.rs | 4 - pageserver/src/virtual_file.rs | 2 +- pageserver/src/virtual_file/io_engine.rs | 11 +- test_runner/fixtures/neon_fixtures.py | 22 +- .../regress/test_pageserver_generations.py | 15 +- test_runner/regress/test_timeline_size.py | 6 +- 31 files changed, 1001 insertions(+), 1656 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5af3ef3804..91917d5351 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2727,6 +2727,12 @@ dependencies = [ "hashbrown 0.14.5", ] +[[package]] +name = "indoc" +version = "2.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5" + [[package]] name = "infer" version = "0.2.3" @@ -3701,6 +3707,7 @@ dependencies = [ "humantime", "humantime-serde", "hyper 0.14.26", + "indoc", "itertools 0.10.5", "md5", "metrics", @@ -3766,6 +3773,7 @@ dependencies = [ "bincode", "byteorder", "bytes", + "camino", "chrono", "const_format", "enum-map", @@ -3773,11 +3781,16 @@ dependencies = [ "humantime", "humantime-serde", "itertools 0.10.5", + "nix 0.27.1", + "postgres_backend", "postgres_ffi", "rand 0.8.5", + "remote_storage", + "reqwest 0.12.4", "serde", "serde_json", "serde_with", + "storage_broker", "strum", "strum_macros", "thiserror", diff --git a/Cargo.toml b/Cargo.toml index fa949f9757..4fea3e8d80 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -103,6 +103,7 @@ humantime-serde = "1.1.1" hyper = "0.14" tokio-tungstenite = "0.20.0" indexmap = "2" +indoc = "2" inotify = "0.10.2" ipnet = "2.9.0" itertools = "0.10" diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index cb28359ac3..8710904cec 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -4,6 +4,10 @@ version = "0.1.0" edition.workspace = true license.workspace = true +[features] +# See pageserver/Cargo.toml +testing = ["dep:nix"] + [dependencies] serde.workspace = true serde_with.workspace = true @@ -23,6 +27,12 @@ thiserror.workspace = true humantime-serde.workspace = true chrono = { workspace = true, features = 
["serde"] } itertools.workspace = true +storage_broker.workspace = true +camino = {workspace = true, features = ["serde1"]} +remote_storage.workspace = true +postgres_backend.workspace = true +nix = {workspace = true, optional = true} +reqwest.workspace = true [dev-dependencies] bincode.workspace = true diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index d996a62349..b2662c562a 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -1,15 +1,28 @@ -use std::collections::HashMap; - -use const_format::formatcp; +use camino::Utf8PathBuf; #[cfg(test)] mod tests; +use const_format::formatcp; pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000; pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898; pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); +use postgres_backend::AuthType; +use remote_storage::RemoteStorageConfig; +use serde_with::serde_as; +use std::{ + collections::HashMap, + num::{NonZeroU64, NonZeroUsize}, + str::FromStr, + time::Duration, +}; +use utils::logging::LogFormat; + +use crate::models::ImageCompressionAlgorithm; +use crate::models::LsnLease; + // Certain metadata (e.g. externally-addressable name, AZ) is delivered // as a separate structure. This information is not neeed by the pageserver // itself, it is only used for registering the pageserver with the control @@ -29,3 +42,511 @@ pub struct NodeMetadata { #[serde(flatten)] pub other: HashMap, } + +/// `pageserver.toml` +/// +/// We use serde derive with `#[serde(default)]` to generate a deserializer +/// that fills in the default values for each config field. +/// +/// If there cannot be a static default value because we need to make runtime +/// checks to determine the default, make it an `Option` (which defaults to None). +/// The runtime check should be done in the consuming crate, i.e., `pageserver`. 
+#[serde_as] +#[derive(Clone, Debug, serde::Deserialize, serde::Serialize)] +#[serde(default, deny_unknown_fields)] +pub struct ConfigToml { + // types mapped 1:1 into the runtime PageServerConfig type + pub listen_pg_addr: String, + pub listen_http_addr: String, + pub availability_zone: Option, + #[serde(with = "humantime_serde")] + pub wait_lsn_timeout: Duration, + #[serde(with = "humantime_serde")] + pub wal_redo_timeout: Duration, + pub superuser: String, + pub page_cache_size: usize, + pub max_file_descriptors: usize, + pub pg_distrib_dir: Option, + #[serde_as(as = "serde_with::DisplayFromStr")] + pub http_auth_type: AuthType, + #[serde_as(as = "serde_with::DisplayFromStr")] + pub pg_auth_type: AuthType, + pub auth_validation_public_key_path: Option, + pub remote_storage: Option, + pub tenant_config: TenantConfigToml, + #[serde_as(as = "serde_with::DisplayFromStr")] + pub broker_endpoint: storage_broker::Uri, + #[serde(with = "humantime_serde")] + pub broker_keepalive_interval: Duration, + #[serde_as(as = "serde_with::DisplayFromStr")] + pub log_format: LogFormat, + pub concurrent_tenant_warmup: NonZeroUsize, + pub concurrent_tenant_size_logical_size_queries: NonZeroUsize, + #[serde(with = "humantime_serde")] + pub metric_collection_interval: Duration, + pub metric_collection_endpoint: Option, + pub metric_collection_bucket: Option, + #[serde(with = "humantime_serde")] + pub synthetic_size_calculation_interval: Duration, + pub disk_usage_based_eviction: Option, + pub test_remote_failures: u64, + pub ondemand_download_behavior_treat_error_as_warn: bool, + #[serde(with = "humantime_serde")] + pub background_task_maximum_delay: Duration, + pub control_plane_api: Option, + pub control_plane_api_token: Option, + pub control_plane_emergency_mode: bool, + pub heatmap_upload_concurrency: usize, + pub secondary_download_concurrency: usize, + pub virtual_file_io_engine: Option, + pub ingest_batch_size: u64, + pub max_vectored_read_bytes: MaxVectoredReadBytes, + pub image_compression: ImageCompressionAlgorithm, + pub ephemeral_bytes_per_memory_kb: usize, + pub l0_flush: Option, + pub compact_level0_phase1_value_access: CompactL0Phase1ValueAccess, + pub virtual_file_direct_io: crate::models::virtual_file::DirectIoMode, + pub io_buffer_alignment: usize, +} + +#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(deny_unknown_fields)] +pub struct DiskUsageEvictionTaskConfig { + pub max_usage_pct: utils::serde_percent::Percent, + pub min_avail_bytes: u64, + #[serde(with = "humantime_serde")] + pub period: Duration, + #[cfg(feature = "testing")] + pub mock_statvfs: Option, + /// Select sorting for evicted layers + #[serde(default)] + pub eviction_order: EvictionOrder, +} + +pub mod statvfs { + pub mod mock { + #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] + #[serde(tag = "type")] + pub enum Behavior { + Success { + blocksize: u64, + total_blocks: u64, + name_filter: Option, + }, + #[cfg(feature = "testing")] + Failure { mocked_error: MockedError }, + } + + #[cfg(feature = "testing")] + #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] + #[allow(clippy::upper_case_acronyms)] + pub enum MockedError { + EIO, + } + + #[cfg(feature = "testing")] + impl From for nix::Error { + fn from(e: MockedError) -> Self { + match e { + MockedError::EIO => nix::Error::EIO, + } + } + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(tag = "type", content = "args")] +pub 
enum EvictionOrder { + RelativeAccessed { + highest_layer_count_loses_first: bool, + }, +} + +impl Default for EvictionOrder { + fn default() -> Self { + Self::RelativeAccessed { + highest_layer_count_loses_first: true, + } + } +} + +#[derive( + Eq, + PartialEq, + Debug, + Copy, + Clone, + strum_macros::EnumString, + strum_macros::Display, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, +)] +#[strum(serialize_all = "kebab-case")] +pub enum GetVectoredImpl { + Sequential, + Vectored, +} + +#[derive( + Eq, + PartialEq, + Debug, + Copy, + Clone, + strum_macros::EnumString, + strum_macros::Display, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, +)] +#[strum(serialize_all = "kebab-case")] +pub enum GetImpl { + Legacy, + Vectored, +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(transparent)] +pub struct MaxVectoredReadBytes(pub NonZeroUsize); + +#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)] +#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] +pub enum CompactL0Phase1ValueAccess { + /// The old way. + PageCachedBlobIo, + /// The new way. + StreamingKmerge { + /// If set, we run both the old way and the new way, validate that + /// they are identical (=> [`CompactL0BypassPageCacheValidation`]), + /// and if the validation fails, + /// - in tests: fail them with a panic or + /// - in prod, log a rate-limited warning and use the old way's results. + /// + /// If not set, we only run the new way and trust its results. + validate: Option, + }, +} + +/// See [`CompactL0Phase1ValueAccess::StreamingKmerge`]. +#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)] +#[serde(rename_all = "kebab-case")] +pub enum CompactL0BypassPageCacheValidation { + /// Validate that the series of (key, lsn) pairs are the same. + KeyLsn, + /// Validate that the entire output of old and new way is identical. + KeyLsnValue, +} + +impl Default for CompactL0Phase1ValueAccess { + fn default() -> Self { + CompactL0Phase1ValueAccess::StreamingKmerge { + // TODO(https://github.com/neondatabase/neon/issues/8184): change to None once confident + validate: Some(CompactL0BypassPageCacheValidation::KeyLsnValue), + } + } +} + +/// A tenant's calcuated configuration, which is the result of merging a +/// tenant's TenantConfOpt with the global TenantConf from PageServerConf. +/// +/// For storing and transmitting individual tenant's configuration, see +/// TenantConfOpt. +#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(deny_unknown_fields, default)] +pub struct TenantConfigToml { + // Flush out an inmemory layer, if it's holding WAL older than this + // This puts a backstop on how much WAL needs to be re-digested if the + // page server crashes. + // This parameter actually determines L0 layer file size. + pub checkpoint_distance: u64, + // Inmemory layer is also flushed at least once in checkpoint_timeout to + // eventually upload WAL after activity is stopped. + #[serde(with = "humantime_serde")] + pub checkpoint_timeout: Duration, + // Target file size, when creating image and delta layers. + // This parameter determines L1 layer file size. + pub compaction_target_size: u64, + // How often to check if there's compaction work to be done. + // Duration::ZERO means automatic compaction is disabled. + #[serde(with = "humantime_serde")] + pub compaction_period: Duration, + // Level0 delta layer threshold for compaction. 
+ pub compaction_threshold: usize, + pub compaction_algorithm: crate::models::CompactionAlgorithmSettings, + // Determines how much history is retained, to allow + // branching and read replicas at an older point in time. + // The unit is #of bytes of WAL. + // Page versions older than this are garbage collected away. + pub gc_horizon: u64, + // Interval at which garbage collection is triggered. + // Duration::ZERO means automatic GC is disabled + #[serde(with = "humantime_serde")] + pub gc_period: Duration, + // Delta layer churn threshold to create L1 image layers. + pub image_creation_threshold: usize, + // Determines how much history is retained, to allow + // branching and read replicas at an older point in time. + // The unit is time. + // Page versions older than this are garbage collected away. + #[serde(with = "humantime_serde")] + pub pitr_interval: Duration, + /// Maximum amount of time to wait while opening a connection to receive wal, before erroring. + #[serde(with = "humantime_serde")] + pub walreceiver_connect_timeout: Duration, + /// Considers safekeepers stalled after no WAL updates were received longer than this threshold. + /// A stalled safekeeper will be changed to a newer one when it appears. + #[serde(with = "humantime_serde")] + pub lagging_wal_timeout: Duration, + /// Considers safekeepers lagging when their WAL is behind another safekeeper for more than this threshold. + /// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update, + /// to avoid eager reconnects. + pub max_lsn_wal_lag: NonZeroU64, + pub eviction_policy: crate::models::EvictionPolicy, + pub min_resident_size_override: Option, + // See the corresponding metric's help string. + #[serde(with = "humantime_serde")] + pub evictions_low_residence_duration_metric_threshold: Duration, + + /// If non-zero, the period between uploads of a heatmap from attached tenants. This + /// may be disabled if a Tenant will not have secondary locations: only secondary + /// locations will use the heatmap uploaded by attached locations. + #[serde(with = "humantime_serde")] + pub heatmap_period: Duration, + + /// If true then SLRU segments are dowloaded on demand, if false SLRU segments are included in basebackup + pub lazy_slru_download: bool, + + pub timeline_get_throttle: crate::models::ThrottleConfig, + + // How much WAL must be ingested before checking again whether a new image layer is required. + // Expresed in multiples of checkpoint distance. + pub image_layer_creation_check_threshold: u8, + + /// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into + /// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions. + /// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux + /// file is written. + pub switch_aux_file_policy: crate::models::AuxFilePolicy, + + /// The length for an explicit LSN lease request. + /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval. + #[serde(with = "humantime_serde")] + pub lsn_lease_length: Duration, + + /// The length for an implicit LSN lease granted as part of `get_lsn_by_timestamp` request. + /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval. 
+ #[serde(with = "humantime_serde")] + pub lsn_lease_length_for_ts: Duration, +} + +pub mod defaults { + use crate::models::ImageCompressionAlgorithm; + + pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT; + + pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s"; + pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s"; + + pub const DEFAULT_SUPERUSER: &str = "cloud_admin"; + + pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192; + pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100; + + pub const DEFAULT_LOG_FORMAT: &str = "plain"; + + pub const DEFAULT_CONCURRENT_TENANT_WARMUP: usize = 8; + + pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize = 1; + + pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min"; + pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option = None; + pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min"; + pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s"; + + pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8; + pub const DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY: usize = 1; + + pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; + + pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB + + pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm = + ImageCompressionAlgorithm::Zstd { level: Some(1) }; + + pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false; + + pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0; + + pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512; +} + +impl Default for ConfigToml { + fn default() -> Self { + use defaults::*; + + Self { + listen_pg_addr: (DEFAULT_PG_LISTEN_ADDR.to_string()), + listen_http_addr: (DEFAULT_HTTP_LISTEN_ADDR.to_string()), + availability_zone: (None), + wait_lsn_timeout: (humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT) + .expect("cannot parse default wait lsn timeout")), + wal_redo_timeout: (humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT) + .expect("cannot parse default wal redo timeout")), + superuser: (DEFAULT_SUPERUSER.to_string()), + page_cache_size: (DEFAULT_PAGE_CACHE_SIZE), + max_file_descriptors: (DEFAULT_MAX_FILE_DESCRIPTORS), + pg_distrib_dir: None, // Utf8PathBuf::from("./pg_install"), // TODO: formely, this was std::env::current_dir() + http_auth_type: (AuthType::Trust), + pg_auth_type: (AuthType::Trust), + auth_validation_public_key_path: (None), + remote_storage: None, + broker_endpoint: (storage_broker::DEFAULT_ENDPOINT + .parse() + .expect("failed to parse default broker endpoint")), + broker_keepalive_interval: (humantime::parse_duration( + storage_broker::DEFAULT_KEEPALIVE_INTERVAL, + ) + .expect("cannot parse default keepalive interval")), + log_format: (LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()), + + concurrent_tenant_warmup: (NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP) + .expect("Invalid default constant")), + concurrent_tenant_size_logical_size_queries: NonZeroUsize::new(1).unwrap(), + metric_collection_interval: (humantime::parse_duration( + DEFAULT_METRIC_COLLECTION_INTERVAL, + ) + .expect("cannot parse default metric collection interval")), + synthetic_size_calculation_interval: (humantime::parse_duration( + DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL, + ) + .expect("cannot parse default synthetic size calculation interval")), + metric_collection_endpoint: (DEFAULT_METRIC_COLLECTION_ENDPOINT), + + metric_collection_bucket: (None), + + disk_usage_based_eviction: (None), + + test_remote_failures: (0), + + ondemand_download_behavior_treat_error_as_warn: (false), + + 
background_task_maximum_delay: (humantime::parse_duration( + DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY, + ) + .unwrap()), + + control_plane_api: (None), + control_plane_api_token: (None), + control_plane_emergency_mode: (false), + + heatmap_upload_concurrency: (DEFAULT_HEATMAP_UPLOAD_CONCURRENCY), + secondary_download_concurrency: (DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY), + + ingest_batch_size: (DEFAULT_INGEST_BATCH_SIZE), + + virtual_file_io_engine: None, + + max_vectored_read_bytes: (MaxVectoredReadBytes( + NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(), + )), + image_compression: (DEFAULT_IMAGE_COMPRESSION), + ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), + l0_flush: None, + compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), + virtual_file_direct_io: crate::models::virtual_file::DirectIoMode::default(), + + io_buffer_alignment: DEFAULT_IO_BUFFER_ALIGNMENT, + + tenant_config: TenantConfigToml::default(), + } + } +} + +pub mod tenant_conf_defaults { + + // FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB + // would be more appropriate. But a low value forces the code to be exercised more, + // which is good for now to trigger bugs. + // This parameter actually determines L0 layer file size. + pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024; + pub const DEFAULT_CHECKPOINT_TIMEOUT: &str = "10 m"; + + // FIXME the below configs are only used by legacy algorithm. The new algorithm + // has different parameters. + + // Target file size, when creating image and delta layers. + // This parameter determines L1 layer file size. + pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024; + + pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s"; + pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10; + pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm = + crate::models::CompactionAlgorithm::Legacy; + + pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024; + + // Large DEFAULT_GC_PERIOD is fine as long as PITR_INTERVAL is larger. + // If there's a need to decrease this value, first make sure that GC + // doesn't hold a layer map write lock for non-trivial operations. + // Relevant: https://github.com/neondatabase/neon/issues/3394 + pub const DEFAULT_GC_PERIOD: &str = "1 hr"; + pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3; + pub const DEFAULT_PITR_INTERVAL: &str = "7 days"; + pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds"; + pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds"; + // The default limit on WAL lag should be set to avoid causing disconnects under high throughput + // scenarios: since the broker stats are updated ~1/s, a value of 1GiB should be sufficient for + // throughputs up to 1GiB/s per timeline. + pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024; + pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour"; + // By default ingest enough WAL for two new L0 layers before checking if new image + // image layers should be created. 
+ pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2; + + pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; +} + +impl Default for TenantConfigToml { + fn default() -> Self { + use tenant_conf_defaults::*; + Self { + checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE, + checkpoint_timeout: humantime::parse_duration(DEFAULT_CHECKPOINT_TIMEOUT) + .expect("cannot parse default checkpoint timeout"), + compaction_target_size: DEFAULT_COMPACTION_TARGET_SIZE, + compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD) + .expect("cannot parse default compaction period"), + compaction_threshold: DEFAULT_COMPACTION_THRESHOLD, + compaction_algorithm: crate::models::CompactionAlgorithmSettings { + kind: DEFAULT_COMPACTION_ALGORITHM, + }, + gc_horizon: DEFAULT_GC_HORIZON, + gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD) + .expect("cannot parse default gc period"), + image_creation_threshold: DEFAULT_IMAGE_CREATION_THRESHOLD, + pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL) + .expect("cannot parse default PITR interval"), + walreceiver_connect_timeout: humantime::parse_duration( + DEFAULT_WALRECEIVER_CONNECT_TIMEOUT, + ) + .expect("cannot parse default walreceiver connect timeout"), + lagging_wal_timeout: humantime::parse_duration(DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT) + .expect("cannot parse default walreceiver lagging wal timeout"), + max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG) + .expect("cannot parse default max walreceiver Lsn wal lag"), + eviction_policy: crate::models::EvictionPolicy::NoEviction, + min_resident_size_override: None, + evictions_low_residence_duration_metric_threshold: humantime::parse_duration( + DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD, + ) + .expect("cannot parse default evictions_low_residence_duration_metric_threshold"), + heatmap_period: Duration::ZERO, + lazy_slru_download: false, + timeline_get_throttle: crate::models::ThrottleConfig::disabled(), + image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD, + switch_aux_file_policy: crate::models::AuxFilePolicy::default_tenant_config(), + lsn_lease_length: LsnLease::DEFAULT_LENGTH, + lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS, + } + } +} diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 87e8f8305a..d13d04eb1b 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -6,6 +6,7 @@ pub use utilization::PageserverUtilization; use std::{ collections::HashMap, + fmt::Display, io::{BufRead, Read}, num::{NonZeroU32, NonZeroU64, NonZeroUsize}, str::FromStr, @@ -435,7 +436,9 @@ pub enum CompactionAlgorithm { Tiered, } -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[derive( + Debug, Clone, Copy, PartialEq, Eq, serde_with::DeserializeFromStr, serde_with::SerializeDisplay, +)] pub enum ImageCompressionAlgorithm { // Disabled for writes, support decompressing during read path Disabled, @@ -470,11 +473,33 @@ impl FromStr for ImageCompressionAlgorithm { } } +impl Display for ImageCompressionAlgorithm { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + ImageCompressionAlgorithm::Disabled => write!(f, "disabled"), + ImageCompressionAlgorithm::Zstd { level } => { + if let Some(level) = level { + write!(f, "zstd({})", level) + } else { + write!(f, "zstd") + } + } + } + } +} + #[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)] pub struct CompactionAlgorithmSettings { pub 
kind: CompactionAlgorithm, } +#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)] +#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] +pub enum L0FlushConfig { + #[serde(rename_all = "snake_case")] + Direct { max_concurrency: NonZeroUsize }, +} + #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] pub struct EvictionPolicyLayerAccessThreshold { #[serde(with = "humantime_serde")] @@ -1656,21 +1681,33 @@ mod tests { #[test] fn test_image_compression_algorithm_parsing() { use ImageCompressionAlgorithm::*; - assert_eq!( - ImageCompressionAlgorithm::from_str("disabled").unwrap(), - Disabled - ); - assert_eq!( - ImageCompressionAlgorithm::from_str("zstd").unwrap(), - Zstd { level: None } - ); - assert_eq!( - ImageCompressionAlgorithm::from_str("zstd(18)").unwrap(), - Zstd { level: Some(18) } - ); - assert_eq!( - ImageCompressionAlgorithm::from_str("zstd(-3)").unwrap(), - Zstd { level: Some(-3) } - ); + let cases = [ + ("disabled", Disabled), + ("zstd", Zstd { level: None }), + ("zstd(18)", Zstd { level: Some(18) }), + ("zstd(-3)", Zstd { level: Some(-3) }), + ]; + + for (display, expected) in cases { + assert_eq!( + ImageCompressionAlgorithm::from_str(display).unwrap(), + expected, + "parsing works" + ); + assert_eq!(format!("{expected}"), display, "Display FromStr roundtrip"); + + let ser = serde_json::to_string(&expected).expect("serialization"); + assert_eq!( + serde_json::from_str::(&ser).unwrap(), + expected, + "serde roundtrip" + ); + + assert_eq!( + serde_json::Value::String(display.to_string()), + serde_json::to_value(expected).unwrap(), + "Display is the serde serialization" + ); + } } } diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs index fa3f2cba58..f819a1572a 100644 --- a/libs/remote_storage/src/config.rs +++ b/libs/remote_storage/src/config.rs @@ -235,6 +235,31 @@ timeout = '5s'"; ); } + #[test] + fn test_storage_class_serde_roundtrip() { + let classes = [ + None, + Some(StorageClass::Standard), + Some(StorageClass::IntelligentTiering), + ]; + for class in classes { + #[derive(Serialize, Deserialize)] + struct Wrapper { + #[serde( + deserialize_with = "deserialize_storage_class", + serialize_with = "serialize_storage_class" + )] + class: Option, + } + let wrapped = Wrapper { + class: class.clone(), + }; + let serialized = serde_json::to_string(&wrapped).unwrap(); + let deserialized: Wrapper = serde_json::from_str(&serialized).unwrap(); + assert_eq!(class, deserialized.class); + } + } + #[test] fn test_azure_parsing() { let toml = "\ diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index f7b73dc984..71af43a4da 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -5,7 +5,9 @@ use metrics::{IntCounter, IntCounterVec}; use once_cell::sync::Lazy; use strum_macros::{EnumString, EnumVariantNames}; -#[derive(EnumString, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy)] +#[derive( + EnumString, strum_macros::Display, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy, +)] #[strum(serialize_all = "snake_case")] pub enum LogFormat { Plain, @@ -274,6 +276,14 @@ impl From for SecretString { } } +impl FromStr for SecretString { + type Err = std::convert::Infallible; + + fn from_str(s: &str) -> Result { + Ok(Self(s.to_string())) + } +} + impl std::fmt::Debug for SecretString { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "[SECRET]") diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 9c02ce3fbc..24373afca3 
100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -8,7 +8,7 @@ license.workspace = true default = [] # Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro, # which adds some runtime cost to run tests on outage conditions -testing = ["fail/failpoints"] +testing = ["fail/failpoints", "pageserver_api/testing" ] [dependencies] anyhow.workspace = true @@ -101,6 +101,7 @@ procfs.workspace = true criterion.workspace = true hex-literal.workspace = true tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] } +indoc.workspace = true [[bench]] name = "bench_layer_map" diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index 1be4391d81..72cbb6beab 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -4,7 +4,7 @@ use bytes::Bytes; use camino::Utf8PathBuf; use criterion::{criterion_group, criterion_main, Criterion}; use pageserver::{ - config::{defaults::DEFAULT_IO_BUFFER_ALIGNMENT, PageServerConf}, + config::PageServerConf, context::{DownloadBehavior, RequestContext}, l0_flush::{L0FlushConfig, L0FlushGlobalState}, page_cache, @@ -167,7 +167,7 @@ fn criterion_benchmark(c: &mut Criterion) { virtual_file::init( 16384, virtual_file::io_engine_for_bench(), - DEFAULT_IO_BUFFER_ALIGNMENT, + pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT, ); page_cache::init(conf.page_cache_size); diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs index 8092c203c3..a07107753e 100644 --- a/pageserver/ctl/src/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -4,7 +4,6 @@ use anyhow::Result; use camino::{Utf8Path, Utf8PathBuf}; -use pageserver::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT; use pageserver::context::{DownloadBehavior, RequestContext}; use pageserver::task_mgr::TaskKind; use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; @@ -148,7 +147,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> { pageserver::virtual_file::init( 10, virtual_file::api::IoEngineKind::StdFs, - DEFAULT_IO_BUFFER_ALIGNMENT, + pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT, ); pageserver::page_cache::init(100); diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index e0f978eaa2..dd753398e2 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -3,7 +3,6 @@ use std::path::{Path, PathBuf}; use anyhow::Result; use camino::{Utf8Path, Utf8PathBuf}; use clap::Subcommand; -use pageserver::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT; use pageserver::context::{DownloadBehavior, RequestContext}; use pageserver::task_mgr::TaskKind; use pageserver::tenant::block_io::BlockCursor; @@ -194,7 +193,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { pageserver::virtual_file::init( 10, virtual_file::api::IoEngineKind::StdFs, - DEFAULT_IO_BUFFER_ALIGNMENT, + pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT, ); pageserver::page_cache::init(100); diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index 7a6c7675bb..3b66b0c4aa 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -20,14 +20,13 @@ use clap::{Parser, Subcommand}; use index_part::IndexPartCmd; use layers::LayerCmd; use pageserver::{ - config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT, context::{DownloadBehavior, RequestContext}, page_cache, task_mgr::TaskKind, 
tenant::{dump_layerfile_from_path, metadata::TimelineMetadata}, virtual_file, }; -use pageserver_api::shard::TenantShardId; +use pageserver_api::{config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT, shard::TenantShardId}; use postgres_ffi::ControlFileData; use remote_storage::{RemotePath, RemoteStorageConfig}; use tokio_util::sync::CancellationToken; diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 850bd87b95..2c60e8d7d1 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -5,6 +5,7 @@ use std::env; use std::env::{var, VarError}; use std::io::Read; +use std::str::FromStr; use std::sync::Arc; use std::time::Duration; @@ -223,27 +224,15 @@ fn initialize_config( } }; - let config: toml_edit::Document = match std::fs::File::open(cfg_file_path) { - Ok(mut f) => { - let md = f.metadata().context("stat config file")?; - if md.is_file() { - let mut s = String::new(); - f.read_to_string(&mut s).context("read config file")?; - s.parse().context("parse config file toml")? - } else { - anyhow::bail!("directory entry exists but is not a file: {cfg_file_path}"); - } - } - Err(e) => { - anyhow::bail!("open pageserver config: {e}: {cfg_file_path}"); - } - }; - - debug!("Using pageserver toml: {config}"); - - // Construct the runtime representation - let conf = PageServerConf::parse_and_validate(identity.id, &config, workdir) - .context("Failed to parse pageserver configuration")?; + let config_file_contents = + std::fs::read_to_string(cfg_file_path).context("read config file from filesystem")?; + let config_toml = serde_path_to_error::deserialize( + toml_edit::de::Deserializer::from_str(&config_file_contents) + .context("build toml deserializer")?, + ) + .context("deserialize config toml")?; + let conf = PageServerConf::parse_and_validate(identity.id, config_toml, workdir) + .context("runtime-validation of config toml")?; Ok(Box::leak(Box::new(conf))) } diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 9e4530ba3c..c159b66905 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -4,11 +4,13 @@ //! file, or on the command line. //! See also `settings.md` for better description on every parameter. 
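A small sketch of the error-reporting benefit of the new `initialize_config` path above (illustrative struct; assumed crates: `serde` with derive, `toml_edit` with its serde support, and `serde_path_to_error`): when a value fails to deserialize, the error is attributed to a concrete key path rather than being a bare type error.

```rust
#[derive(Debug, serde::Deserialize)]
#[serde(deny_unknown_fields)]
struct Sketch {
    page_cache_size: usize,
}

fn main() {
    let bad = "page_cache_size = 'not a number'";
    // Build a toml_edit deserializer, then route it through serde_path_to_error,
    // mirroring the order of operations in initialize_config.
    let de = toml_edit::de::Deserializer::from_str(bad).expect("syntactically valid TOML");
    let result: Result<Sketch, _> = serde_path_to_error::deserialize(de);
    let err = result.unwrap_err();
    println!("offending key: {}", err.path()); // e.g. `page_cache_size`
    println!("full error: {err}");
}
```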
-use anyhow::{anyhow, bail, ensure, Context, Result}; -use pageserver_api::{models::ImageCompressionAlgorithm, shard::TenantShardId}; +use anyhow::{bail, ensure, Context}; +use pageserver_api::models::ImageCompressionAlgorithm; +use pageserver_api::{ + config::{DiskUsageEvictionTaskConfig, MaxVectoredReadBytes}, + shard::TenantShardId, +}; use remote_storage::{RemotePath, RemoteStorageConfig}; -use serde::de::IntoDeserializer; -use serde::{self, Deserialize}; use std::env; use storage_broker::Uri; use utils::crashsafe::path_with_suffix_extension; @@ -17,10 +19,8 @@ use utils::logging::SecretString; use once_cell::sync::OnceCell; use reqwest::Url; use std::num::NonZeroUsize; -use std::str::FromStr; use std::sync::Arc; use std::time::Duration; -use toml_edit::{Document, Item}; use camino::{Utf8Path, Utf8PathBuf}; use postgres_backend::AuthType; @@ -29,139 +29,27 @@ use utils::{ logging::LogFormat, }; -use crate::l0_flush::L0FlushConfig; -use crate::tenant::config::TenantConfOpt; use crate::tenant::storage_layer::inmemory_layer::IndexEntry; -use crate::tenant::timeline::compaction::CompactL0Phase1ValueAccess; -use crate::tenant::vectored_blob_io::MaxVectoredReadBytes; use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; -use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine}; -use crate::{tenant::config::TenantConf, virtual_file}; +use crate::virtual_file; +use crate::virtual_file::io_engine; use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX}; -use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP; - -use self::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE; - -pub mod defaults { - use crate::tenant::config::defaults::*; - use const_format::formatcp; - - pub use pageserver_api::config::{ - DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR, - DEFAULT_PG_LISTEN_PORT, - }; - pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT; - - pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s"; - pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s"; - - pub const DEFAULT_SUPERUSER: &str = "cloud_admin"; - - pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192; - pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100; - - pub const DEFAULT_LOG_FORMAT: &str = "plain"; - - pub const DEFAULT_CONCURRENT_TENANT_WARMUP: usize = 8; - - pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize = - super::ConfigurableSemaphore::DEFAULT_INITIAL.get(); - - pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min"; - pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option = None; - pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min"; - pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s"; - - pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8; - pub const DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY: usize = 1; - - pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; - - #[cfg(target_os = "linux")] - pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "tokio-epoll-uring"; - - #[cfg(not(target_os = "linux"))] - pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs"; - - pub const DEFAULT_GET_VECTORED_IMPL: &str = "vectored"; - - pub const DEFAULT_GET_IMPL: &str = "vectored"; - - pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB - - pub const DEFAULT_IMAGE_COMPRESSION: &str = "zstd(1)"; - - pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false; - - pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0; - - pub const DEFAULT_IO_BUFFER_ALIGNMENT: 
usize = 512; - - /// - /// Default built-in configuration file. - /// - pub const DEFAULT_CONFIG_FILE: &str = formatcp!( - r#" -# Initial configuration file created by 'pageserver --init' -#listen_pg_addr = '{DEFAULT_PG_LISTEN_ADDR}' -#listen_http_addr = '{DEFAULT_HTTP_LISTEN_ADDR}' - -#wait_lsn_timeout = '{DEFAULT_WAIT_LSN_TIMEOUT}' -#wal_redo_timeout = '{DEFAULT_WAL_REDO_TIMEOUT}' - -#page_cache_size = {DEFAULT_PAGE_CACHE_SIZE} -#max_file_descriptors = {DEFAULT_MAX_FILE_DESCRIPTORS} - -# initial superuser role name to use when creating a new tenant -#initial_superuser_name = '{DEFAULT_SUPERUSER}' - -#broker_endpoint = '{BROKER_DEFAULT_ENDPOINT}' - -#log_format = '{DEFAULT_LOG_FORMAT}' - -#concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}' -#concurrent_tenant_warmup = '{DEFAULT_CONCURRENT_TENANT_WARMUP}' - -#metric_collection_interval = '{DEFAULT_METRIC_COLLECTION_INTERVAL}' -#synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}' - -#disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}} - -#background_task_maximum_delay = '{DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY}' - -#ingest_batch_size = {DEFAULT_INGEST_BATCH_SIZE} - -#virtual_file_io_engine = '{DEFAULT_VIRTUAL_FILE_IO_ENGINE}' - -#max_vectored_read_bytes = '{DEFAULT_MAX_VECTORED_READ_BYTES}' - -[tenant_config] -#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes -#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} -#compaction_target_size = {DEFAULT_COMPACTION_TARGET_SIZE} # in bytes -#compaction_period = '{DEFAULT_COMPACTION_PERIOD}' -#compaction_threshold = {DEFAULT_COMPACTION_THRESHOLD} - -#gc_period = '{DEFAULT_GC_PERIOD}' -#gc_horizon = {DEFAULT_GC_HORIZON} -#image_creation_threshold = {DEFAULT_IMAGE_CREATION_THRESHOLD} -#pitr_interval = '{DEFAULT_PITR_INTERVAL}' - -#min_resident_size_override = .. # in bytes -#evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}' - -#heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY} -#secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY} - -#ephemeral_bytes_per_memory_kb = {DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB} - -#[remote_storage] - -"# - ); -} - +/// Global state of pageserver. +/// +/// It's mostly immutable configuration, but some semaphores and the +/// like crept in over time and the name stuck. +/// +/// Instantiated by deserializing `pageserver.toml` into [`pageserver_api::config::ConfigToml`] +/// and passing that to [`PageServerConf::parse_and_validate`]. +/// +/// # Adding a New Field +/// +/// 1. Add the field to `pageserver_api::config::ConfigToml`. +/// 2. Fix compiler errors (exhaustive destructuring will guide you). +/// +/// For fields that require additional validation or filling in of defaults at runtime, +/// check for examples in the [`PageServerConf::parse_and_validate`] method. #[derive(Debug, Clone, PartialEq, Eq)] pub struct PageServerConf { // Identifier of that particular pageserver so e g safekeepers @@ -207,7 +95,7 @@ pub struct PageServerConf { pub remote_storage_config: Option, - pub default_tenant_conf: TenantConf, + pub default_tenant_conf: crate::tenant::config::TenantConf, /// Storage broker endpoints to connect to. pub broker_endpoint: Uri, @@ -284,11 +172,11 @@ pub struct PageServerConf { /// Setting this to zero disables limits on total ephemeral layer size. 
pub ephemeral_bytes_per_memory_kb: usize, - pub l0_flush: L0FlushConfig, + pub l0_flush: crate::l0_flush::L0FlushConfig, /// This flag is temporary and will be removed after gradual rollout. /// See . - pub compact_level0_phase1_value_access: CompactL0Phase1ValueAccess, + pub compact_level0_phase1_value_access: pageserver_api::config::CompactL0Phase1ValueAccess, /// Direct IO settings pub virtual_file_direct_io: virtual_file::DirectIoMode, @@ -304,472 +192,6 @@ pub struct PageServerConf { /// startup code to the connection code through a dozen layers. pub static SAFEKEEPER_AUTH_TOKEN: OnceCell> = OnceCell::new(); -// use dedicated enum for builder to better indicate the intention -// and avoid possible confusion with nested options -#[derive(Clone, Default)] -pub enum BuilderValue { - Set(T), - #[default] - NotSet, -} - -impl BuilderValue { - pub fn ok_or(&self, field_name: &'static str, default: BuilderValue) -> anyhow::Result { - match self { - Self::Set(v) => Ok(v.clone()), - Self::NotSet => match default { - BuilderValue::Set(v) => Ok(v.clone()), - BuilderValue::NotSet => { - anyhow::bail!("missing config value {field_name:?}") - } - }, - } - } -} - -// needed to simplify config construction -#[derive(Default)] -struct PageServerConfigBuilder { - listen_pg_addr: BuilderValue, - - listen_http_addr: BuilderValue, - - availability_zone: BuilderValue>, - - wait_lsn_timeout: BuilderValue, - wal_redo_timeout: BuilderValue, - - superuser: BuilderValue, - - page_cache_size: BuilderValue, - max_file_descriptors: BuilderValue, - - workdir: BuilderValue, - - pg_distrib_dir: BuilderValue, - - http_auth_type: BuilderValue, - pg_auth_type: BuilderValue, - - // - auth_validation_public_key_path: BuilderValue>, - remote_storage_config: BuilderValue>, - - broker_endpoint: BuilderValue, - broker_keepalive_interval: BuilderValue, - - log_format: BuilderValue, - - concurrent_tenant_warmup: BuilderValue, - concurrent_tenant_size_logical_size_queries: BuilderValue, - - metric_collection_interval: BuilderValue, - metric_collection_endpoint: BuilderValue>, - synthetic_size_calculation_interval: BuilderValue, - metric_collection_bucket: BuilderValue>, - - disk_usage_based_eviction: BuilderValue>, - - test_remote_failures: BuilderValue, - - ondemand_download_behavior_treat_error_as_warn: BuilderValue, - - background_task_maximum_delay: BuilderValue, - - control_plane_api: BuilderValue>, - control_plane_api_token: BuilderValue>, - control_plane_emergency_mode: BuilderValue, - - heatmap_upload_concurrency: BuilderValue, - secondary_download_concurrency: BuilderValue, - - ingest_batch_size: BuilderValue, - - virtual_file_io_engine: BuilderValue, - - max_vectored_read_bytes: BuilderValue, - - image_compression: BuilderValue, - - ephemeral_bytes_per_memory_kb: BuilderValue, - - l0_flush: BuilderValue, - - compact_level0_phase1_value_access: BuilderValue, - - virtual_file_direct_io: BuilderValue, - - io_buffer_alignment: BuilderValue, -} - -impl PageServerConfigBuilder { - fn new() -> Self { - Self::default() - } - - #[inline(always)] - fn default_values() -> Self { - use self::BuilderValue::*; - use defaults::*; - Self { - listen_pg_addr: Set(DEFAULT_PG_LISTEN_ADDR.to_string()), - listen_http_addr: Set(DEFAULT_HTTP_LISTEN_ADDR.to_string()), - availability_zone: Set(None), - wait_lsn_timeout: Set(humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT) - .expect("cannot parse default wait lsn timeout")), - wal_redo_timeout: Set(humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT) - .expect("cannot parse default wal redo 
timeout")), - superuser: Set(DEFAULT_SUPERUSER.to_string()), - page_cache_size: Set(DEFAULT_PAGE_CACHE_SIZE), - max_file_descriptors: Set(DEFAULT_MAX_FILE_DESCRIPTORS), - workdir: Set(Utf8PathBuf::new()), - pg_distrib_dir: Set(Utf8PathBuf::from_path_buf( - env::current_dir().expect("cannot access current directory"), - ) - .expect("non-Unicode path") - .join("pg_install")), - http_auth_type: Set(AuthType::Trust), - pg_auth_type: Set(AuthType::Trust), - auth_validation_public_key_path: Set(None), - remote_storage_config: Set(None), - broker_endpoint: Set(storage_broker::DEFAULT_ENDPOINT - .parse() - .expect("failed to parse default broker endpoint")), - broker_keepalive_interval: Set(humantime::parse_duration( - storage_broker::DEFAULT_KEEPALIVE_INTERVAL, - ) - .expect("cannot parse default keepalive interval")), - log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()), - - concurrent_tenant_warmup: Set(NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP) - .expect("Invalid default constant")), - concurrent_tenant_size_logical_size_queries: Set( - ConfigurableSemaphore::DEFAULT_INITIAL, - ), - metric_collection_interval: Set(humantime::parse_duration( - DEFAULT_METRIC_COLLECTION_INTERVAL, - ) - .expect("cannot parse default metric collection interval")), - synthetic_size_calculation_interval: Set(humantime::parse_duration( - DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL, - ) - .expect("cannot parse default synthetic size calculation interval")), - metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT), - - metric_collection_bucket: Set(None), - - disk_usage_based_eviction: Set(None), - - test_remote_failures: Set(0), - - ondemand_download_behavior_treat_error_as_warn: Set(false), - - background_task_maximum_delay: Set(humantime::parse_duration( - DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY, - ) - .unwrap()), - - control_plane_api: Set(None), - control_plane_api_token: Set(None), - control_plane_emergency_mode: Set(false), - - heatmap_upload_concurrency: Set(DEFAULT_HEATMAP_UPLOAD_CONCURRENCY), - secondary_download_concurrency: Set(DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY), - - ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE), - - virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()), - - max_vectored_read_bytes: Set(MaxVectoredReadBytes( - NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(), - )), - image_compression: Set(DEFAULT_IMAGE_COMPRESSION.parse().unwrap()), - ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), - l0_flush: Set(L0FlushConfig::default()), - compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()), - virtual_file_direct_io: Set(virtual_file::DirectIoMode::default()), - io_buffer_alignment: Set(DEFAULT_IO_BUFFER_ALIGNMENT), - } - } -} - -impl PageServerConfigBuilder { - pub fn listen_pg_addr(&mut self, listen_pg_addr: String) { - self.listen_pg_addr = BuilderValue::Set(listen_pg_addr) - } - - pub fn listen_http_addr(&mut self, listen_http_addr: String) { - self.listen_http_addr = BuilderValue::Set(listen_http_addr) - } - - pub fn availability_zone(&mut self, availability_zone: Option) { - self.availability_zone = BuilderValue::Set(availability_zone) - } - - pub fn wait_lsn_timeout(&mut self, wait_lsn_timeout: Duration) { - self.wait_lsn_timeout = BuilderValue::Set(wait_lsn_timeout) - } - - pub fn wal_redo_timeout(&mut self, wal_redo_timeout: Duration) { - self.wal_redo_timeout = BuilderValue::Set(wal_redo_timeout) - } - - pub fn superuser(&mut self, superuser: String) { - 
self.superuser = BuilderValue::Set(superuser) - } - - pub fn page_cache_size(&mut self, page_cache_size: usize) { - self.page_cache_size = BuilderValue::Set(page_cache_size) - } - - pub fn max_file_descriptors(&mut self, max_file_descriptors: usize) { - self.max_file_descriptors = BuilderValue::Set(max_file_descriptors) - } - - pub fn workdir(&mut self, workdir: Utf8PathBuf) { - self.workdir = BuilderValue::Set(workdir) - } - - pub fn pg_distrib_dir(&mut self, pg_distrib_dir: Utf8PathBuf) { - self.pg_distrib_dir = BuilderValue::Set(pg_distrib_dir) - } - - pub fn http_auth_type(&mut self, auth_type: AuthType) { - self.http_auth_type = BuilderValue::Set(auth_type) - } - - pub fn pg_auth_type(&mut self, auth_type: AuthType) { - self.pg_auth_type = BuilderValue::Set(auth_type) - } - - pub fn auth_validation_public_key_path( - &mut self, - auth_validation_public_key_path: Option, - ) { - self.auth_validation_public_key_path = BuilderValue::Set(auth_validation_public_key_path) - } - - pub fn remote_storage_config(&mut self, remote_storage_config: Option) { - self.remote_storage_config = BuilderValue::Set(remote_storage_config) - } - - pub fn broker_endpoint(&mut self, broker_endpoint: Uri) { - self.broker_endpoint = BuilderValue::Set(broker_endpoint) - } - - pub fn broker_keepalive_interval(&mut self, broker_keepalive_interval: Duration) { - self.broker_keepalive_interval = BuilderValue::Set(broker_keepalive_interval) - } - - pub fn log_format(&mut self, log_format: LogFormat) { - self.log_format = BuilderValue::Set(log_format) - } - - pub fn concurrent_tenant_warmup(&mut self, u: NonZeroUsize) { - self.concurrent_tenant_warmup = BuilderValue::Set(u); - } - - pub fn concurrent_tenant_size_logical_size_queries(&mut self, u: NonZeroUsize) { - self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u); - } - - pub fn metric_collection_interval(&mut self, metric_collection_interval: Duration) { - self.metric_collection_interval = BuilderValue::Set(metric_collection_interval) - } - - pub fn metric_collection_endpoint(&mut self, metric_collection_endpoint: Option) { - self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint) - } - - pub fn metric_collection_bucket( - &mut self, - metric_collection_bucket: Option, - ) { - self.metric_collection_bucket = BuilderValue::Set(metric_collection_bucket) - } - - pub fn synthetic_size_calculation_interval( - &mut self, - synthetic_size_calculation_interval: Duration, - ) { - self.synthetic_size_calculation_interval = - BuilderValue::Set(synthetic_size_calculation_interval) - } - - pub fn test_remote_failures(&mut self, fail_first: u64) { - self.test_remote_failures = BuilderValue::Set(fail_first); - } - - pub fn disk_usage_based_eviction(&mut self, value: Option) { - self.disk_usage_based_eviction = BuilderValue::Set(value); - } - - pub fn ondemand_download_behavior_treat_error_as_warn( - &mut self, - ondemand_download_behavior_treat_error_as_warn: bool, - ) { - self.ondemand_download_behavior_treat_error_as_warn = - BuilderValue::Set(ondemand_download_behavior_treat_error_as_warn); - } - - pub fn background_task_maximum_delay(&mut self, delay: Duration) { - self.background_task_maximum_delay = BuilderValue::Set(delay); - } - - pub fn control_plane_api(&mut self, api: Option) { - self.control_plane_api = BuilderValue::Set(api) - } - - pub fn control_plane_api_token(&mut self, token: Option) { - self.control_plane_api_token = BuilderValue::Set(token) - } - - pub fn control_plane_emergency_mode(&mut self, enabled: bool) { - 
self.control_plane_emergency_mode = BuilderValue::Set(enabled) - } - - pub fn heatmap_upload_concurrency(&mut self, value: usize) { - self.heatmap_upload_concurrency = BuilderValue::Set(value) - } - - pub fn secondary_download_concurrency(&mut self, value: usize) { - self.secondary_download_concurrency = BuilderValue::Set(value) - } - - pub fn ingest_batch_size(&mut self, ingest_batch_size: u64) { - self.ingest_batch_size = BuilderValue::Set(ingest_batch_size) - } - - pub fn virtual_file_io_engine(&mut self, value: virtual_file::IoEngineKind) { - self.virtual_file_io_engine = BuilderValue::Set(value); - } - - pub fn get_max_vectored_read_bytes(&mut self, value: MaxVectoredReadBytes) { - self.max_vectored_read_bytes = BuilderValue::Set(value); - } - - pub fn get_image_compression(&mut self, value: ImageCompressionAlgorithm) { - self.image_compression = BuilderValue::Set(value); - } - - pub fn get_ephemeral_bytes_per_memory_kb(&mut self, value: usize) { - self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value); - } - - pub fn l0_flush(&mut self, value: L0FlushConfig) { - self.l0_flush = BuilderValue::Set(value); - } - - pub fn compact_level0_phase1_value_access(&mut self, value: CompactL0Phase1ValueAccess) { - self.compact_level0_phase1_value_access = BuilderValue::Set(value); - } - - pub fn virtual_file_direct_io(&mut self, value: virtual_file::DirectIoMode) { - self.virtual_file_direct_io = BuilderValue::Set(value); - } - - pub fn io_buffer_alignment(&mut self, value: usize) { - self.io_buffer_alignment = BuilderValue::Set(value); - } - - pub fn build(self, id: NodeId) -> anyhow::Result { - let default = Self::default_values(); - - macro_rules! conf { - (USING DEFAULT { $($field:ident,)* } CUSTOM LOGIC { $($custom_field:ident : $custom_value:expr,)* } ) => { - PageServerConf { - $( - $field: self.$field.ok_or(stringify!($field), default.$field)?, - )* - $( - $custom_field: $custom_value, - )* - } - }; - } - - Ok(conf!( - USING DEFAULT - { - listen_pg_addr, - listen_http_addr, - availability_zone, - wait_lsn_timeout, - wal_redo_timeout, - superuser, - page_cache_size, - max_file_descriptors, - workdir, - pg_distrib_dir, - http_auth_type, - pg_auth_type, - auth_validation_public_key_path, - remote_storage_config, - broker_endpoint, - broker_keepalive_interval, - log_format, - metric_collection_interval, - metric_collection_endpoint, - metric_collection_bucket, - synthetic_size_calculation_interval, - disk_usage_based_eviction, - test_remote_failures, - ondemand_download_behavior_treat_error_as_warn, - background_task_maximum_delay, - control_plane_api, - control_plane_api_token, - control_plane_emergency_mode, - heatmap_upload_concurrency, - secondary_download_concurrency, - ingest_batch_size, - max_vectored_read_bytes, - image_compression, - ephemeral_bytes_per_memory_kb, - l0_flush, - compact_level0_phase1_value_access, - virtual_file_direct_io, - io_buffer_alignment, - } - CUSTOM LOGIC - { - id: id, - // TenantConf is handled separately - default_tenant_conf: TenantConf::default(), - concurrent_tenant_warmup: ConfigurableSemaphore::new({ - self - .concurrent_tenant_warmup - .ok_or("concurrent_tenant_warmpup", - default.concurrent_tenant_warmup)? - }), - concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new( - self - .concurrent_tenant_size_logical_size_queries - .ok_or("concurrent_tenant_size_logical_size_queries", - default.concurrent_tenant_size_logical_size_queries.clone())? 
- ), - eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new( - // re-use `concurrent_tenant_size_logical_size_queries` - self - .concurrent_tenant_size_logical_size_queries - .ok_or("eviction_task_immitated_concurrent_logical_size_queries", - default.concurrent_tenant_size_logical_size_queries.clone())?, - ), - virtual_file_io_engine: match self.virtual_file_io_engine { - BuilderValue::Set(v) => v, - BuilderValue::NotSet => match crate::virtual_file::io_engine_feature_test().context("auto-detect virtual_file_io_engine")? { - io_engine::FeatureTestResult::PlatformPreferred(v) => v, // make no noise - io_engine::FeatureTestResult::Worse { engine, remark } => { - // TODO: bubble this up to the caller so we can tracing::warn! it. - eprintln!("auto-detected IO engine is not platform-preferred: engine={engine:?} remark={remark:?}"); - engine - } - }, - }, - } - )) - } -} - impl PageServerConf { // // Repository paths, relative to workdir. @@ -878,134 +300,135 @@ impl PageServerConf { /// /// This leaves any options not present in the file in the built-in defaults. pub fn parse_and_validate( - node_id: NodeId, - toml: &Document, + id: NodeId, + config_toml: pageserver_api::config::ConfigToml, workdir: &Utf8Path, ) -> anyhow::Result { - let mut builder = PageServerConfigBuilder::new(); - builder.workdir(workdir.to_owned()); + let pageserver_api::config::ConfigToml { + listen_pg_addr, + listen_http_addr, + availability_zone, + wait_lsn_timeout, + wal_redo_timeout, + superuser, + page_cache_size, + max_file_descriptors, + pg_distrib_dir, + http_auth_type, + pg_auth_type, + auth_validation_public_key_path, + remote_storage, + broker_endpoint, + broker_keepalive_interval, + log_format, + metric_collection_interval, + metric_collection_endpoint, + metric_collection_bucket, + synthetic_size_calculation_interval, + disk_usage_based_eviction, + test_remote_failures, + ondemand_download_behavior_treat_error_as_warn, + background_task_maximum_delay, + control_plane_api, + control_plane_api_token, + control_plane_emergency_mode, + heatmap_upload_concurrency, + secondary_download_concurrency, + ingest_batch_size, + max_vectored_read_bytes, + image_compression, + ephemeral_bytes_per_memory_kb, + compact_level0_phase1_value_access, + l0_flush, + virtual_file_direct_io, + concurrent_tenant_warmup, + concurrent_tenant_size_logical_size_queries, + virtual_file_io_engine, + io_buffer_alignment, + tenant_config, + } = config_toml; - let mut t_conf = TenantConfOpt::default(); + let mut conf = PageServerConf { + // ------------------------------------------------------------ + // fields that are already fully validated by the ConfigToml Deserialize impl + // ------------------------------------------------------------ + listen_pg_addr, + listen_http_addr, + availability_zone, + wait_lsn_timeout, + wal_redo_timeout, + superuser, + page_cache_size, + max_file_descriptors, + http_auth_type, + pg_auth_type, + auth_validation_public_key_path, + remote_storage_config: remote_storage, + broker_endpoint, + broker_keepalive_interval, + log_format, + metric_collection_interval, + metric_collection_endpoint, + metric_collection_bucket, + synthetic_size_calculation_interval, + disk_usage_based_eviction, + test_remote_failures, + ondemand_download_behavior_treat_error_as_warn, + background_task_maximum_delay, + control_plane_api, + control_plane_emergency_mode, + heatmap_upload_concurrency, + secondary_download_concurrency, + ingest_batch_size, + max_vectored_read_bytes, + image_compression, + 
ephemeral_bytes_per_memory_kb, + compact_level0_phase1_value_access, + virtual_file_direct_io, + io_buffer_alignment, - for (key, item) in toml.iter() { - match key { - "listen_pg_addr" => builder.listen_pg_addr(parse_toml_string(key, item)?), - "listen_http_addr" => builder.listen_http_addr(parse_toml_string(key, item)?), - "availability_zone" => builder.availability_zone(Some(parse_toml_string(key, item)?)), - "wait_lsn_timeout" => builder.wait_lsn_timeout(parse_toml_duration(key, item)?), - "wal_redo_timeout" => builder.wal_redo_timeout(parse_toml_duration(key, item)?), - "initial_superuser_name" => builder.superuser(parse_toml_string(key, item)?), - "page_cache_size" => builder.page_cache_size(parse_toml_u64(key, item)? as usize), - "max_file_descriptors" => { - builder.max_file_descriptors(parse_toml_u64(key, item)? as usize) - } - "pg_distrib_dir" => { - builder.pg_distrib_dir(Utf8PathBuf::from(parse_toml_string(key, item)?)) - } - "auth_validation_public_key_path" => builder.auth_validation_public_key_path(Some( - Utf8PathBuf::from(parse_toml_string(key, item)?), - )), - "http_auth_type" => builder.http_auth_type(parse_toml_from_str(key, item)?), - "pg_auth_type" => builder.pg_auth_type(parse_toml_from_str(key, item)?), - "remote_storage" => { - builder.remote_storage_config(Some(RemoteStorageConfig::from_toml(item).context("remote_storage")?)) - } - "tenant_config" => { - t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?; - } - "broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?), - "broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?), - "log_format" => builder.log_format( - LogFormat::from_config(&parse_toml_string(key, item)?)? - ), - "concurrent_tenant_warmup" => builder.concurrent_tenant_warmup({ - let input = parse_toml_string(key, item)?; - let permits = input.parse::().context("expected a number of initial permits, not {s:?}")?; - NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")? - }), - "concurrent_tenant_size_logical_size_queries" => builder.concurrent_tenant_size_logical_size_queries({ - let input = parse_toml_string(key, item)?; - let permits = input.parse::().context("expected a number of initial permits, not {s:?}")?; - NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")? - }), - "metric_collection_interval" => builder.metric_collection_interval(parse_toml_duration(key, item)?), - "metric_collection_endpoint" => { - let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?; - builder.metric_collection_endpoint(Some(endpoint)); - }, - "metric_collection_bucket" => { - builder.metric_collection_bucket(Some(RemoteStorageConfig::from_toml(item)?)) - } - "synthetic_size_calculation_interval" => - builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?), - "test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?), - "disk_usage_based_eviction" => { - tracing::info!("disk_usage_based_eviction: {:#?}", &item); - builder.disk_usage_based_eviction( - deserialize_from_item("disk_usage_based_eviction", item) - .context("parse disk_usage_based_eviction")? 
- ) - }, - "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?), - "background_task_maximum_delay" => builder.background_task_maximum_delay(parse_toml_duration(key, item)?), - "control_plane_api" => { - let parsed = parse_toml_string(key, item)?; - if parsed.is_empty() { - builder.control_plane_api(None) - } else { - builder.control_plane_api(Some(parsed.parse().context("failed to parse control plane URL")?)) + // ------------------------------------------------------------ + // fields that require additional validation or custom handling + // ------------------------------------------------------------ + workdir: workdir.to_owned(), + pg_distrib_dir: pg_distrib_dir.unwrap_or_else(|| { + std::env::current_dir() + .expect("current_dir() failed") + .try_into() + .expect("current_dir() is not a valid Utf8Path") + }), + control_plane_api_token: control_plane_api_token.map(SecretString::from), + id, + default_tenant_conf: tenant_config, + concurrent_tenant_warmup: ConfigurableSemaphore::new(concurrent_tenant_warmup), + concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new( + concurrent_tenant_size_logical_size_queries, + ), + eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new( + // re-use `concurrent_tenant_size_logical_size_queries` + concurrent_tenant_size_logical_size_queries, + ), + virtual_file_io_engine: match virtual_file_io_engine { + Some(v) => v, + None => match crate::virtual_file::io_engine_feature_test() + .context("auto-detect virtual_file_io_engine")? + { + io_engine::FeatureTestResult::PlatformPreferred(v) => v, // make no noise + io_engine::FeatureTestResult::Worse { engine, remark } => { + // TODO: bubble this up to the caller so we can tracing::warn! it. + eprintln!("auto-detected IO engine is not platform-preferred: engine={engine:?} remark={remark:?}"); + engine } }, - "control_plane_api_token" => { - let parsed = parse_toml_string(key, item)?; - if parsed.is_empty() { - builder.control_plane_api_token(None) - } else { - builder.control_plane_api_token(Some(parsed.into())) - } - }, - "control_plane_emergency_mode" => { - builder.control_plane_emergency_mode(parse_toml_bool(key, item)?) - }, - "heatmap_upload_concurrency" => { - builder.heatmap_upload_concurrency(parse_toml_u64(key, item)? as usize) - }, - "secondary_download_concurrency" => { - builder.secondary_download_concurrency(parse_toml_u64(key, item)? as usize) - }, - "ingest_batch_size" => builder.ingest_batch_size(parse_toml_u64(key, item)?), - "virtual_file_io_engine" => { - builder.virtual_file_io_engine(parse_toml_from_str("virtual_file_io_engine", item)?) - } - "max_vectored_read_bytes" => { - let bytes = parse_toml_u64("max_vectored_read_bytes", item)? as usize; - builder.get_max_vectored_read_bytes( - MaxVectoredReadBytes( - NonZeroUsize::new(bytes).expect("Max byte size of vectored read must be greater than 0"))) - } - "image_compression" => { - builder.get_image_compression(parse_toml_from_str("image_compression", item)?) - } - "ephemeral_bytes_per_memory_kb" => { - builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize) - } - "l0_flush" => { - builder.l0_flush(utils::toml_edit_ext::deserialize_item(item).context("l0_flush")?) - } - "compact_level0_phase1_value_access" => { - builder.compact_level0_phase1_value_access(utils::toml_edit_ext::deserialize_item(item).context("compact_level0_phase1_value_access")?) 
- } - "virtual_file_direct_io" => { - builder.virtual_file_direct_io(utils::toml_edit_ext::deserialize_item(item).context("virtual_file_direct_io")?) - } - "io_buffer_alignment" => { - builder.io_buffer_alignment(parse_toml_u64("io_buffer_alignment", item)? as usize) - } - _ => bail!("unrecognized pageserver option '{key}'"), - } - } + }, + l0_flush: l0_flush + .map(crate::l0_flush::L0FlushConfig::from) + .unwrap_or_default(), + }; - let mut conf = builder.build(node_id).context("invalid config")?; + // ------------------------------------------------------------ + // custom validation code that covers more than one field in isolation + // ------------------------------------------------------------ if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT { let auth_validation_public_key_path = conf @@ -1019,10 +442,8 @@ impl PageServerConf { ); } - conf.default_tenant_conf = t_conf.merge(TenantConf::default()); - IndexEntry::validate_checkpoint_distance(conf.default_tenant_conf.checkpoint_distance) - .map_err(|msg| anyhow::anyhow!("{msg}")) + .map_err(anyhow::Error::msg) .with_context(|| { format!( "effective checkpoint distance is unsupported: {}", @@ -1042,130 +463,25 @@ impl PageServerConf { pub fn dummy_conf(repo_dir: Utf8PathBuf) -> Self { let pg_distrib_dir = Utf8PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../pg_install"); - PageServerConf { - id: NodeId(0), + let config_toml = pageserver_api::config::ConfigToml { wait_lsn_timeout: Duration::from_secs(60), wal_redo_timeout: Duration::from_secs(60), - page_cache_size: defaults::DEFAULT_PAGE_CACHE_SIZE, - max_file_descriptors: defaults::DEFAULT_MAX_FILE_DESCRIPTORS, - listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), - listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), - availability_zone: None, - superuser: "cloud_admin".to_string(), - workdir: repo_dir, - pg_distrib_dir, - http_auth_type: AuthType::Trust, - pg_auth_type: AuthType::Trust, - auth_validation_public_key_path: None, - remote_storage_config: None, - default_tenant_conf: TenantConf::default(), - broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(), - broker_keepalive_interval: Duration::from_secs(5000), - log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(), - concurrent_tenant_warmup: ConfigurableSemaphore::new( - NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP) - .expect("Invalid default constant"), - ), - concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), - eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default( - ), + pg_distrib_dir: Some(pg_distrib_dir), metric_collection_interval: Duration::from_secs(60), - metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, - metric_collection_bucket: None, synthetic_size_calculation_interval: Duration::from_secs(60), - disk_usage_based_eviction: None, - test_remote_failures: 0, - ondemand_download_behavior_treat_error_as_warn: false, background_task_maximum_delay: Duration::ZERO, - control_plane_api: None, - control_plane_api_token: None, - control_plane_emergency_mode: false, - heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY, - secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, - ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, - virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), - max_vectored_read_bytes: MaxVectoredReadBytes( - 
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) - .expect("Invalid default constant"), - ), - image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(), - ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, - l0_flush: L0FlushConfig::default(), - compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), - virtual_file_direct_io: virtual_file::DirectIoMode::default(), - io_buffer_alignment: defaults::DEFAULT_IO_BUFFER_ALIGNMENT, - } + ..Default::default() + }; + PageServerConf::parse_and_validate(NodeId(0), config_toml, &repo_dir).unwrap() } } -#[derive(Deserialize)] +#[derive(serde::Deserialize, serde::Serialize)] #[serde(deny_unknown_fields)] pub struct PageserverIdentity { pub id: NodeId, } -// Helper functions to parse a toml Item - -fn parse_toml_string(name: &str, item: &Item) -> Result { - let s = item - .as_str() - .with_context(|| format!("configure option {name} is not a string"))?; - Ok(s.to_string()) -} - -fn parse_toml_u64(name: &str, item: &Item) -> Result { - // A toml integer is signed, so it cannot represent the full range of an u64. That's OK - // for our use, though. - let i: i64 = item - .as_integer() - .with_context(|| format!("configure option {name} is not an integer"))?; - if i < 0 { - bail!("configure option {name} cannot be negative"); - } - Ok(i as u64) -} - -fn parse_toml_bool(name: &str, item: &Item) -> Result { - item.as_bool() - .with_context(|| format!("configure option {name} is not a bool")) -} - -fn parse_toml_duration(name: &str, item: &Item) -> Result { - let s = item - .as_str() - .with_context(|| format!("configure option {name} is not a string"))?; - - Ok(humantime::parse_duration(s)?) -} - -fn parse_toml_from_str(name: &str, item: &Item) -> anyhow::Result -where - T: FromStr, - ::Err: std::fmt::Display, -{ - let v = item - .as_str() - .with_context(|| format!("configure option {name} is not a string"))?; - T::from_str(v).map_err(|e| { - anyhow!( - "Failed to parse string as {parse_type} for configure option {name}: {e}", - parse_type = stringify!(T) - ) - }) -} - -fn deserialize_from_item(name: &str, item: &Item) -> anyhow::Result -where - T: serde::de::DeserializeOwned, -{ - // ValueDeserializer::new is not public, so use the ValueDeserializer's documented way - let deserializer = match item.clone().into_value() { - Ok(value) => value.into_deserializer(), - Err(item) => anyhow::bail!("toml_edit::Item '{item}' is not a toml_edit::Value"), - }; - T::deserialize(deserializer).with_context(|| format!("deserializing item for node {name}")) -} - /// Configurable semaphore permits setting. 
/// /// Does not allow semaphore permits to be zero, because at runtime initially zero permits and empty @@ -1227,469 +543,108 @@ impl ConfigurableSemaphore { #[cfg(test)] mod tests { - use std::{fs, num::NonZeroU32}; - use camino_tempfile::{tempdir, Utf8TempDir}; - use pageserver_api::models::EvictionPolicy; - use remote_storage::{RemoteStorageKind, S3Config}; - use utils::serde_percent::Percent; + use camino::Utf8PathBuf; + use utils::id::NodeId; - use super::*; - use crate::DEFAULT_PG_VERSION; - - const ALL_BASE_VALUES_TOML: &str = r#" -# Initial configuration file created by 'pageserver --init' - -listen_pg_addr = '127.0.0.1:64000' -listen_http_addr = '127.0.0.1:9898' - -wait_lsn_timeout = '111 s' -wal_redo_timeout = '111 s' - -page_cache_size = 444 -max_file_descriptors = 333 - -# initial superuser role name to use when creating a new tenant -initial_superuser_name = 'zzzz' - -metric_collection_interval = '222 s' -metric_collection_endpoint = 'http://localhost:80/metrics' -synthetic_size_calculation_interval = '333 s' - -log_format = 'json' -background_task_maximum_delay = '334 s' - -"#; + use super::PageServerConf; #[test] - fn parse_defaults() -> anyhow::Result<()> { - let tempdir = tempdir()?; - let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; - let broker_endpoint = storage_broker::DEFAULT_ENDPOINT; - // we have to create dummy values to overcome the validation errors - let config_string = - format!("pg_distrib_dir='{pg_distrib_dir}'\nbroker_endpoint = '{broker_endpoint}'",); - let toml = config_string.parse()?; - - let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir) - .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}")); - - assert_eq!( - parsed_config, - PageServerConf { - id: NodeId(10), - listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), - listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), - availability_zone: None, - wait_lsn_timeout: humantime::parse_duration(defaults::DEFAULT_WAIT_LSN_TIMEOUT)?, - wal_redo_timeout: humantime::parse_duration(defaults::DEFAULT_WAL_REDO_TIMEOUT)?, - superuser: defaults::DEFAULT_SUPERUSER.to_string(), - page_cache_size: defaults::DEFAULT_PAGE_CACHE_SIZE, - max_file_descriptors: defaults::DEFAULT_MAX_FILE_DESCRIPTORS, - workdir, - pg_distrib_dir, - http_auth_type: AuthType::Trust, - pg_auth_type: AuthType::Trust, - auth_validation_public_key_path: None, - remote_storage_config: None, - default_tenant_conf: TenantConf::default(), - broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(), - broker_keepalive_interval: humantime::parse_duration( - storage_broker::DEFAULT_KEEPALIVE_INTERVAL - )?, - log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(), - concurrent_tenant_warmup: ConfigurableSemaphore::new( - NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP).unwrap() - ), - concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), - eviction_task_immitated_concurrent_logical_size_queries: - ConfigurableSemaphore::default(), - metric_collection_interval: humantime::parse_duration( - defaults::DEFAULT_METRIC_COLLECTION_INTERVAL - )?, - metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, - metric_collection_bucket: None, - synthetic_size_calculation_interval: humantime::parse_duration( - defaults::DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL - )?, - disk_usage_based_eviction: None, - test_remote_failures: 0, - ondemand_download_behavior_treat_error_as_warn: false, - 
background_task_maximum_delay: humantime::parse_duration( - defaults::DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY - )?, - control_plane_api: None, - control_plane_api_token: None, - control_plane_emergency_mode: false, - heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY, - secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, - ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, - virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), - max_vectored_read_bytes: MaxVectoredReadBytes( - NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) - .expect("Invalid default constant") - ), - image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(), - ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, - l0_flush: L0FlushConfig::default(), - compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), - virtual_file_direct_io: virtual_file::DirectIoMode::default(), - io_buffer_alignment: defaults::DEFAULT_IO_BUFFER_ALIGNMENT, - }, - "Correct defaults should be used when no config values are provided" - ); - - Ok(()) - } - - #[test] - fn parse_basic_config() -> anyhow::Result<()> { - let tempdir = tempdir()?; - let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; - let broker_endpoint = storage_broker::DEFAULT_ENDPOINT; - - let config_string = format!( - "{ALL_BASE_VALUES_TOML}pg_distrib_dir='{pg_distrib_dir}'\nbroker_endpoint = '{broker_endpoint}'", - ); - let toml = config_string.parse()?; - - let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir) - .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}")); - - assert_eq!( - parsed_config, - PageServerConf { - id: NodeId(10), - listen_pg_addr: "127.0.0.1:64000".to_string(), - listen_http_addr: "127.0.0.1:9898".to_string(), - availability_zone: None, - wait_lsn_timeout: Duration::from_secs(111), - wal_redo_timeout: Duration::from_secs(111), - superuser: "zzzz".to_string(), - page_cache_size: 444, - max_file_descriptors: 333, - workdir, - pg_distrib_dir, - http_auth_type: AuthType::Trust, - pg_auth_type: AuthType::Trust, - auth_validation_public_key_path: None, - remote_storage_config: None, - default_tenant_conf: TenantConf::default(), - broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(), - broker_keepalive_interval: Duration::from_secs(5), - log_format: LogFormat::Json, - concurrent_tenant_warmup: ConfigurableSemaphore::new( - NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP).unwrap() - ), - concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), - eviction_task_immitated_concurrent_logical_size_queries: - ConfigurableSemaphore::default(), - metric_collection_interval: Duration::from_secs(222), - metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?), - metric_collection_bucket: None, - synthetic_size_calculation_interval: Duration::from_secs(333), - disk_usage_based_eviction: None, - test_remote_failures: 0, - ondemand_download_behavior_treat_error_as_warn: false, - background_task_maximum_delay: Duration::from_secs(334), - control_plane_api: None, - control_plane_api_token: None, - control_plane_emergency_mode: false, - heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY, - secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, - ingest_batch_size: 100, - virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), - 
max_vectored_read_bytes: MaxVectoredReadBytes( - NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) - .expect("Invalid default constant") - ), - image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(), - ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, - l0_flush: L0FlushConfig::default(), - compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), - virtual_file_direct_io: virtual_file::DirectIoMode::default(), - io_buffer_alignment: defaults::DEFAULT_IO_BUFFER_ALIGNMENT, - }, - "Should be able to parse all basic config values correctly" - ); - - Ok(()) - } - - #[test] - fn parse_remote_fs_storage_config() -> anyhow::Result<()> { - let tempdir = tempdir()?; - let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; - let broker_endpoint = "http://127.0.0.1:7777"; - - let local_storage_path = tempdir.path().join("local_remote_storage"); - - let identical_toml_declarations = &[ - format!( - r#"[remote_storage] -local_path = '{local_storage_path}'"#, - ), - format!("remote_storage={{local_path='{local_storage_path}'}}"), - ]; - - for remote_storage_config_str in identical_toml_declarations { - let config_string = format!( - r#"{ALL_BASE_VALUES_TOML} -pg_distrib_dir='{pg_distrib_dir}' -broker_endpoint = '{broker_endpoint}' - -{remote_storage_config_str}"#, - ); - - let toml = config_string.parse()?; - - let parsed_remote_storage_config = - PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir) - .unwrap_or_else(|e| { - panic!("Failed to parse config '{config_string}', reason: {e:?}") - }) - .remote_storage_config - .expect("Should have remote storage config for the local FS"); - - assert_eq!( - parsed_remote_storage_config, - RemoteStorageConfig { - storage: RemoteStorageKind::LocalFs { local_path: local_storage_path.clone() }, - timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, - }, - "Remote storage config should correctly parse the local FS config and fill other storage defaults" - ); - } - Ok(()) - } - - #[test] - fn parse_remote_s3_storage_config() -> anyhow::Result<()> { - let tempdir = tempdir()?; - let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; - - let bucket_name = "some-sample-bucket".to_string(); - let bucket_region = "eu-north-1".to_string(); - let prefix_in_bucket = "test_prefix".to_string(); - let endpoint = "http://localhost:5000".to_string(); - let max_concurrent_syncs = NonZeroUsize::new(111).unwrap(); - let max_sync_errors = NonZeroU32::new(222).unwrap(); - let s3_concurrency_limit = NonZeroUsize::new(333).unwrap(); - let broker_endpoint = "http://127.0.0.1:7777"; - - let identical_toml_declarations = &[ - format!( - r#"[remote_storage] -max_concurrent_syncs = {max_concurrent_syncs} -max_sync_errors = {max_sync_errors} -bucket_name = '{bucket_name}' -bucket_region = '{bucket_region}' -prefix_in_bucket = '{prefix_in_bucket}' -endpoint = '{endpoint}' -concurrency_limit = {s3_concurrency_limit}"# - ), - format!( - "remote_storage={{max_concurrent_syncs={max_concurrent_syncs}, max_sync_errors={max_sync_errors}, bucket_name='{bucket_name}',\ - bucket_region='{bucket_region}', prefix_in_bucket='{prefix_in_bucket}', endpoint='{endpoint}', concurrency_limit={s3_concurrency_limit}}}", - ), - ]; - - for remote_storage_config_str in identical_toml_declarations { - let config_string = format!( - r#"{ALL_BASE_VALUES_TOML} -pg_distrib_dir='{pg_distrib_dir}' -broker_endpoint = '{broker_endpoint}' - -{remote_storage_config_str}"#, - ); - - let toml = config_string.parse()?; - - let 
parsed_remote_storage_config = - PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir) - .unwrap_or_else(|e| { - panic!("Failed to parse config '{config_string}', reason: {e:?}") - }) - .remote_storage_config - .expect("Should have remote storage config for S3"); - - assert_eq!( - parsed_remote_storage_config, - RemoteStorageConfig { - storage: RemoteStorageKind::AwsS3(S3Config { - bucket_name: bucket_name.clone(), - bucket_region: bucket_region.clone(), - prefix_in_bucket: Some(prefix_in_bucket.clone()), - endpoint: Some(endpoint.clone()), - concurrency_limit: s3_concurrency_limit, - max_keys_per_list_response: None, - upload_storage_class: None, - }), - timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, - }, - "Remote storage config should correctly parse the S3 config" - ); - } - Ok(()) - } - - #[test] - fn parse_incorrect_tenant_config() -> anyhow::Result<()> { - let config_string = r#" - [tenant_config] - checkpoint_distance = -1 # supposed to be an u64 - "# - .to_string(); - - let toml: Document = config_string.parse()?; - let item = toml.get("tenant_config").unwrap(); - let error = TenantConfOpt::try_from(item.to_owned()).unwrap_err(); - - let expected_error_str = "checkpoint_distance: invalid value: integer `-1`, expected u64"; - assert_eq!(error.to_string(), expected_error_str); - - Ok(()) - } - - #[test] - fn parse_override_tenant_config() -> anyhow::Result<()> { - let config_string = r#"tenant_config={ min_resident_size_override = 400 }"#.to_string(); - - let toml: Document = config_string.parse()?; - let item = toml.get("tenant_config").unwrap(); - let conf = TenantConfOpt::try_from(item.to_owned()).unwrap(); - - assert_eq!(conf.min_resident_size_override, Some(400)); - - Ok(()) - } - - #[test] - fn eviction_pageserver_config_parse() -> anyhow::Result<()> { - let tempdir = tempdir()?; - let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; - - let pageserver_conf_toml = format!( - r#"pg_distrib_dir = "{pg_distrib_dir}" -metric_collection_endpoint = "http://sample.url" -metric_collection_interval = "10min" - -[disk_usage_based_eviction] -max_usage_pct = 80 -min_avail_bytes = 0 -period = "10s" - -[tenant_config] -evictions_low_residence_duration_metric_threshold = "20m" - -[tenant_config.eviction_policy] -kind = "LayerAccessThreshold" -period = "20m" -threshold = "20m" -"#, - ); - let toml: Document = pageserver_conf_toml.parse()?; - let conf = PageServerConf::parse_and_validate(NodeId(333), &toml, &workdir)?; - - assert_eq!(conf.pg_distrib_dir, pg_distrib_dir); - assert_eq!( - conf.metric_collection_endpoint, - Some("http://sample.url".parse().unwrap()) - ); - assert_eq!( - conf.metric_collection_interval, - Duration::from_secs(10 * 60) - ); - assert_eq!( - conf.default_tenant_conf - .evictions_low_residence_duration_metric_threshold, - Duration::from_secs(20 * 60) - ); - - // Assert that the node id provided by the indentity file (threaded - // through the call to [`PageServerConf::parse_and_validate`] is - // used. 
- assert_eq!(conf.id, NodeId(333)); - assert_eq!( - conf.disk_usage_based_eviction, - Some(DiskUsageEvictionTaskConfig { - max_usage_pct: Percent::new(80).unwrap(), - min_avail_bytes: 0, - period: Duration::from_secs(10), - #[cfg(feature = "testing")] - mock_statvfs: None, - eviction_order: Default::default(), - }) - ); - - match &conf.default_tenant_conf.eviction_policy { - EvictionPolicy::LayerAccessThreshold(eviction_threshold) => { - assert_eq!(eviction_threshold.period, Duration::from_secs(20 * 60)); - assert_eq!(eviction_threshold.threshold, Duration::from_secs(20 * 60)); - } - other => unreachable!("Unexpected eviction policy tenant settings: {other:?}"), - } - - Ok(()) - } - - #[test] - fn parse_imitation_only_pageserver_config() { - let tempdir = tempdir().unwrap(); - let (workdir, pg_distrib_dir) = prepare_fs(&tempdir).unwrap(); - - let pageserver_conf_toml = format!( - r#"pg_distrib_dir = "{pg_distrib_dir}" -metric_collection_endpoint = "http://sample.url" -metric_collection_interval = "10min" - -[tenant_config] -evictions_low_residence_duration_metric_threshold = "20m" - -[tenant_config.eviction_policy] -kind = "OnlyImitiate" -period = "20m" -threshold = "20m" -"#, - ); - let toml: Document = pageserver_conf_toml.parse().unwrap(); - let conf = PageServerConf::parse_and_validate(NodeId(222), &toml, &workdir).unwrap(); - - match &conf.default_tenant_conf.eviction_policy { - EvictionPolicy::OnlyImitiate(t) => { - assert_eq!(t.period, Duration::from_secs(20 * 60)); - assert_eq!(t.threshold, Duration::from_secs(20 * 60)); - } - other => unreachable!("Unexpected eviction policy tenant settings: {other:?}"), - } - } - - #[test] - fn empty_remote_storage_is_error() { - let tempdir = tempdir().unwrap(); - let (workdir, _) = prepare_fs(&tempdir).unwrap(); + fn test_empty_config_toml_is_valid() { + // we use Default impl of everything in this situation let input = r#" -remote_storage = {} "#; - let doc = toml_edit::Document::from_str(input).unwrap(); - let err = PageServerConf::parse_and_validate(NodeId(222), &doc, &workdir) - .expect_err("empty remote_storage field should fail, don't specify it if you want no remote_storage"); - assert!(format!("{err}").contains("remote_storage"), "{err}"); + let config_toml = toml_edit::de::from_str::(input) + .expect("empty config is valid"); + let workdir = Utf8PathBuf::from("/nonexistent"); + PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir) + .expect("parse_and_validate"); } - fn prepare_fs(tempdir: &Utf8TempDir) -> anyhow::Result<(Utf8PathBuf, Utf8PathBuf)> { - let tempdir_path = tempdir.path(); + /// If there's a typo in the pageserver config, we'd rather catch that typo + /// and fail pageserver startup than silently ignoring the typo, leaving whoever + /// made it in the believe that their config change is effective. + /// + /// The default in serde is to allow unknown fields, so, we rely + /// on developer+review discipline to add `deny_unknown_fields` when adding + /// new structs to the config, and these tests here as a regression test. + /// + /// The alternative to all of this would be to allow unknown fields in the config. + /// To catch them, we could have a config check tool or mgmt API endpoint that + /// compares the effective config with the TOML on disk and makes sure that + /// the on-disk TOML is a strict subset of the effective config. + mod unknown_fields_handling { + macro_rules! 
test { + ($short_name:ident, $input:expr) => { + #[test] + fn $short_name() { + let input = $input; + let err = toml_edit::de::from_str::(&input) + .expect_err("some_invalid_field is an invalid field"); + dbg!(&err); + assert!(err.to_string().contains("some_invalid_field")); + } + }; + } + use indoc::indoc; - let workdir = tempdir_path.join("workdir"); - fs::create_dir_all(&workdir)?; + test!( + toplevel, + indoc! {r#" + some_invalid_field = 23 + "#} + ); - let pg_distrib_dir = tempdir_path.join("pg_distrib"); - let pg_distrib_dir_versioned = pg_distrib_dir.join(format!("v{DEFAULT_PG_VERSION}")); - fs::create_dir_all(&pg_distrib_dir_versioned)?; - let postgres_bin_dir = pg_distrib_dir_versioned.join("bin"); - fs::create_dir_all(&postgres_bin_dir)?; - fs::write(postgres_bin_dir.join("postgres"), "I'm postgres, trust me")?; + test!( + toplevel_nested, + indoc! {r#" + [some_invalid_field] + foo = 23 + "#} + ); - Ok((workdir, pg_distrib_dir)) + test!( + disk_usage_based_eviction, + indoc! {r#" + [disk_usage_based_eviction] + some_invalid_field = 23 + "#} + ); + + test!( + tenant_config, + indoc! {r#" + [tenant_config] + some_invalid_field = 23 + "#} + ); + + test!( + l0_flush, + indoc! {r#" + [l0_flush] + mode = "direct" + some_invalid_field = 23 + "#} + ); + + // TODO: fix this => https://github.com/neondatabase/neon/issues/8915 + // test!( + // remote_storage_config, + // indoc! {r#" + // [remote_storage_config] + // local_path = "/nonexistent" + // some_invalid_field = 23 + // "#} + // ); + + test!( + compact_level0_phase1_value_access, + indoc! {r#" + [compact_level0_phase1_value_access] + mode = "streaming-kmerge" + some_invalid_field = 23 + "#} + ); } } diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 5e4a49bc56..a58fa2c0b1 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -41,19 +41,15 @@ // - The `#[allow(dead_code)]` above various structs are to suppress warnings about only the Debug impl // reading these fields. We use the Debug impl for semi-structured logging, though. -use std::{ - sync::Arc, - time::{Duration, SystemTime}, -}; +use std::{sync::Arc, time::SystemTime}; use anyhow::Context; -use pageserver_api::shard::TenantShardId; +use pageserver_api::{config::DiskUsageEvictionTaskConfig, shard::TenantShardId}; use remote_storage::GenericRemoteStorage; -use serde::{Deserialize, Serialize}; +use serde::Serialize; use tokio::time::Instant; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, instrument, warn, Instrument}; -use utils::serde_percent::Percent; use utils::{completion, id::TimelineId}; use crate::{ @@ -69,23 +65,9 @@ use crate::{ CancellableTask, DiskUsageEvictionTask, }; -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct DiskUsageEvictionTaskConfig { - pub max_usage_pct: Percent, - pub min_avail_bytes: u64, - #[serde(with = "humantime_serde")] - pub period: Duration, - #[cfg(feature = "testing")] - pub mock_statvfs: Option, - /// Select sorting for evicted layers - #[serde(default)] - pub eviction_order: EvictionOrder, -} - /// Selects the sort order for eviction candidates *after* per tenant `min_resident_size` /// partitioning. 
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] -#[serde(tag = "type", content = "args")] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum EvictionOrder { /// Order the layers to be evicted by how recently they have been accessed relatively within /// the set of resident layers of a tenant. @@ -96,23 +78,22 @@ pub enum EvictionOrder { /// we read tenants is deterministic. If we find the need to use this as `false`, we need /// to ensure nondeterminism by adding in a random number to break the /// `relative_last_activity==0.0` ties. - #[serde(default = "default_highest_layer_count_loses_first")] highest_layer_count_loses_first: bool, }, } -impl Default for EvictionOrder { - fn default() -> Self { - Self::RelativeAccessed { - highest_layer_count_loses_first: true, +impl From for EvictionOrder { + fn from(value: pageserver_api::config::EvictionOrder) -> Self { + match value { + pageserver_api::config::EvictionOrder::RelativeAccessed { + highest_layer_count_loses_first, + } => Self::RelativeAccessed { + highest_layer_count_loses_first, + }, } } } -fn default_highest_layer_count_loses_first() -> bool { - true -} - impl EvictionOrder { fn sort(&self, candidates: &mut [(EvictionPartition, EvictionCandidate)]) { use EvictionOrder::*; @@ -295,7 +276,7 @@ async fn disk_usage_eviction_task_iteration( storage, usage_pre, tenant_manager, - task_config.eviction_order, + task_config.eviction_order.into(), cancel, ) .await; @@ -1257,7 +1238,6 @@ mod filesystem_level_usage { #[test] fn max_usage_pct_pressure() { - use super::EvictionOrder; use super::Usage as _; use std::time::Duration; use utils::serde_percent::Percent; @@ -1269,7 +1249,7 @@ mod filesystem_level_usage { period: Duration::MAX, #[cfg(feature = "testing")] mock_statvfs: None, - eviction_order: EvictionOrder::default(), + eviction_order: pageserver_api::config::EvictionOrder::default(), }, total_bytes: 100_000, avail_bytes: 0, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 90ae6c5557..d645f3b7b6 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2076,7 +2076,7 @@ async fn disk_usage_eviction_run( evict_bytes: u64, #[serde(default)] - eviction_order: crate::disk_usage_eviction_task::EvictionOrder, + eviction_order: pageserver_api::config::EvictionOrder, } #[derive(Debug, Clone, Copy, serde::Serialize)] @@ -2112,7 +2112,7 @@ async fn disk_usage_eviction_run( &state.remote_storage, usage, &state.tenant_manager, - config.eviction_order, + config.eviction_order.into(), &cancel, ) .await; diff --git a/pageserver/src/l0_flush.rs b/pageserver/src/l0_flush.rs index 313a7961a6..491c9fb96c 100644 --- a/pageserver/src/l0_flush.rs +++ b/pageserver/src/l0_flush.rs @@ -1,9 +1,7 @@ use std::{num::NonZeroUsize, sync::Arc}; -#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize)] -#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] +#[derive(Debug, PartialEq, Eq, Clone)] pub enum L0FlushConfig { - #[serde(rename_all = "snake_case")] Direct { max_concurrency: NonZeroUsize }, } @@ -16,6 +14,16 @@ impl Default for L0FlushConfig { } } +impl From for L0FlushConfig { + fn from(config: pageserver_api::models::L0FlushConfig) -> Self { + match config { + pageserver_api::models::L0FlushConfig::Direct { max_concurrency } => { + Self::Direct { max_concurrency } + } + } + } +} + #[derive(Clone)] pub struct L0FlushGlobalState(Arc); diff --git a/pageserver/src/statvfs.rs b/pageserver/src/statvfs.rs index ede1791afa..5a6f6e5176 100644 --- 
a/pageserver/src/statvfs.rs +++ b/pageserver/src/statvfs.rs @@ -60,32 +60,7 @@ pub mod mock { use regex::Regex; use tracing::log::info; - #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] - #[serde(tag = "type")] - pub enum Behavior { - Success { - blocksize: u64, - total_blocks: u64, - name_filter: Option, - }, - Failure { - mocked_error: MockedError, - }, - } - - #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] - #[allow(clippy::upper_case_acronyms)] - pub enum MockedError { - EIO, - } - - impl From for nix::Error { - fn from(e: MockedError) -> Self { - match e { - MockedError::EIO => nix::Error::EIO, - } - } - } + pub use pageserver_api::config::statvfs::mock::Behavior; pub fn get(tenants_dir: &Utf8Path, behavior: &Behavior) -> nix::Result { info!("running mocked statvfs"); @@ -116,6 +91,7 @@ pub mod mock { block_size: *blocksize, }) } + #[cfg(feature = "testing")] Behavior::Failure { mocked_error } => Err((*mocked_error).into()), } } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 48ff17db94..7e0344666b 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -9,11 +9,10 @@ //! may lead to a data loss. //! use anyhow::bail; +pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf; use pageserver_api::models::AuxFilePolicy; -use pageserver_api::models::CompactionAlgorithm; use pageserver_api::models::CompactionAlgorithmSettings; use pageserver_api::models::EvictionPolicy; -use pageserver_api::models::LsnLease; use pageserver_api::models::{self, ThrottleConfig}; use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}; use serde::de::IntoDeserializer; @@ -23,50 +22,6 @@ use std::num::NonZeroU64; use std::time::Duration; use utils::generation::Generation; -pub mod defaults { - - // FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB - // would be more appropriate. But a low value forces the code to be exercised more, - // which is good for now to trigger bugs. - // This parameter actually determines L0 layer file size. - pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024; - pub const DEFAULT_CHECKPOINT_TIMEOUT: &str = "10 m"; - - // FIXME the below configs are only used by legacy algorithm. The new algorithm - // has different parameters. - - // Target file size, when creating image and delta layers. - // This parameter determines L1 layer file size. - pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024; - - pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s"; - pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10; - pub const DEFAULT_COMPACTION_ALGORITHM: super::CompactionAlgorithm = - super::CompactionAlgorithm::Legacy; - - pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024; - - // Large DEFAULT_GC_PERIOD is fine as long as PITR_INTERVAL is larger. - // If there's a need to decrease this value, first make sure that GC - // doesn't hold a layer map write lock for non-trivial operations. 
- // Relevant: https://github.com/neondatabase/neon/issues/3394 - pub const DEFAULT_GC_PERIOD: &str = "1 hr"; - pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3; - pub const DEFAULT_PITR_INTERVAL: &str = "7 days"; - pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds"; - pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds"; - // The default limit on WAL lag should be set to avoid causing disconnects under high throughput - // scenarios: since the broker stats are updated ~1/s, a value of 1GiB should be sufficient for - // throughputs up to 1GiB/s per timeline. - pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024; - pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour"; - // By default ingest enough WAL for two new L0 layers before checking if new image - // image layers should be created. - pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2; - - pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; -} - #[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)] pub(crate) enum AttachmentMode { /// Our generation is current as far as we know, and as far as we know we are the only attached @@ -281,96 +236,20 @@ impl LocationConf { } } -/// A tenant's calcuated configuration, which is the result of merging a -/// tenant's TenantConfOpt with the global TenantConf from PageServerConf. -/// -/// For storing and transmitting individual tenant's configuration, see -/// TenantConfOpt. -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct TenantConf { - // Flush out an inmemory layer, if it's holding WAL older than this - // This puts a backstop on how much WAL needs to be re-digested if the - // page server crashes. - // This parameter actually determines L0 layer file size. - pub checkpoint_distance: u64, - // Inmemory layer is also flushed at least once in checkpoint_timeout to - // eventually upload WAL after activity is stopped. - #[serde(with = "humantime_serde")] - pub checkpoint_timeout: Duration, - // Target file size, when creating image and delta layers. - // This parameter determines L1 layer file size. - pub compaction_target_size: u64, - // How often to check if there's compaction work to be done. - // Duration::ZERO means automatic compaction is disabled. - #[serde(with = "humantime_serde")] - pub compaction_period: Duration, - // Level0 delta layer threshold for compaction. - pub compaction_threshold: usize, - pub compaction_algorithm: CompactionAlgorithmSettings, - // Determines how much history is retained, to allow - // branching and read replicas at an older point in time. - // The unit is #of bytes of WAL. - // Page versions older than this are garbage collected away. - pub gc_horizon: u64, - // Interval at which garbage collection is triggered. - // Duration::ZERO means automatic GC is disabled - #[serde(with = "humantime_serde")] - pub gc_period: Duration, - // Delta layer churn threshold to create L1 image layers. - pub image_creation_threshold: usize, - // Determines how much history is retained, to allow - // branching and read replicas at an older point in time. - // The unit is time. - // Page versions older than this are garbage collected away. - #[serde(with = "humantime_serde")] - pub pitr_interval: Duration, - /// Maximum amount of time to wait while opening a connection to receive wal, before erroring. 
- #[serde(with = "humantime_serde")] - pub walreceiver_connect_timeout: Duration, - /// Considers safekeepers stalled after no WAL updates were received longer than this threshold. - /// A stalled safekeeper will be changed to a newer one when it appears. - #[serde(with = "humantime_serde")] - pub lagging_wal_timeout: Duration, - /// Considers safekeepers lagging when their WAL is behind another safekeeper for more than this threshold. - /// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update, - /// to avoid eager reconnects. - pub max_lsn_wal_lag: NonZeroU64, - pub eviction_policy: EvictionPolicy, - pub min_resident_size_override: Option, - // See the corresponding metric's help string. - #[serde(with = "humantime_serde")] - pub evictions_low_residence_duration_metric_threshold: Duration, - - /// If non-zero, the period between uploads of a heatmap from attached tenants. This - /// may be disabled if a Tenant will not have secondary locations: only secondary - /// locations will use the heatmap uploaded by attached locations. - #[serde(with = "humantime_serde")] - pub heatmap_period: Duration, - - /// If true then SLRU segments are dowloaded on demand, if false SLRU segments are included in basebackup - pub lazy_slru_download: bool, - - pub timeline_get_throttle: pageserver_api::models::ThrottleConfig, - - // How much WAL must be ingested before checking again whether a new image layer is required. - // Expresed in multiples of checkpoint distance. - pub image_layer_creation_check_threshold: u8, - - /// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into - /// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions. - /// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux - /// file is written. - pub switch_aux_file_policy: AuxFilePolicy, - - /// The length for an explicit LSN lease request. - /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval. - #[serde(with = "humantime_serde")] - pub lsn_lease_length: Duration, - - /// The length for an implicit LSN lease granted as part of `get_lsn_by_timestamp` request. - /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval. - #[serde(with = "humantime_serde")] - pub lsn_lease_length_for_ts: Duration, +impl Default for LocationConf { + // TODO: this should be removed once tenant loading can guarantee that we are never + // loading from a directory without a configuration. 
+ // => tech debt since https://github.com/neondatabase/neon/issues/1555 + fn default() -> Self { + Self { + mode: LocationMode::Attached(AttachedLocationConfig { + generation: Generation::none(), + attach_mode: AttachmentMode::Single, + }), + tenant_conf: TenantConfOpt::default(), + shard: ShardIdentity::unsharded(), + } + } } /// Same as TenantConf, but this struct preserves the information about @@ -545,51 +424,6 @@ impl TenantConfOpt { } } -impl Default for TenantConf { - fn default() -> Self { - use defaults::*; - Self { - checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE, - checkpoint_timeout: humantime::parse_duration(DEFAULT_CHECKPOINT_TIMEOUT) - .expect("cannot parse default checkpoint timeout"), - compaction_target_size: DEFAULT_COMPACTION_TARGET_SIZE, - compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD) - .expect("cannot parse default compaction period"), - compaction_threshold: DEFAULT_COMPACTION_THRESHOLD, - compaction_algorithm: CompactionAlgorithmSettings { - kind: DEFAULT_COMPACTION_ALGORITHM, - }, - gc_horizon: DEFAULT_GC_HORIZON, - gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD) - .expect("cannot parse default gc period"), - image_creation_threshold: DEFAULT_IMAGE_CREATION_THRESHOLD, - pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL) - .expect("cannot parse default PITR interval"), - walreceiver_connect_timeout: humantime::parse_duration( - DEFAULT_WALRECEIVER_CONNECT_TIMEOUT, - ) - .expect("cannot parse default walreceiver connect timeout"), - lagging_wal_timeout: humantime::parse_duration(DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT) - .expect("cannot parse default walreceiver lagging wal timeout"), - max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG) - .expect("cannot parse default max walreceiver Lsn wal lag"), - eviction_policy: EvictionPolicy::NoEviction, - min_resident_size_override: None, - evictions_low_residence_duration_metric_threshold: humantime::parse_duration( - DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD, - ) - .expect("cannot parse default evictions_low_residence_duration_metric_threshold"), - heatmap_period: Duration::ZERO, - lazy_slru_download: false, - timeline_get_throttle: crate::tenant::throttle::Config::disabled(), - image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD, - switch_aux_file_policy: AuxFilePolicy::default_tenant_config(), - lsn_lease_length: LsnLease::DEFAULT_LENGTH, - lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS, - } - } -} - impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt { type Error = anyhow::Error; diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index b8e9a98149..6a2cd94232 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -39,7 +39,7 @@ use crate::tenant::disk_btree::{ use crate::tenant::storage_layer::layer::S3_UPLOAD_LIMIT; use crate::tenant::timeline::GetVectoredError; use crate::tenant::vectored_blob_io::{ - BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, + BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadCoalesceMode, VectoredReadPlanner, }; use crate::tenant::PageReconstructError; @@ -52,6 +52,7 @@ use bytes::BytesMut; use camino::{Utf8Path, Utf8PathBuf}; use futures::StreamExt; use itertools::Itertools; +use pageserver_api::config::MaxVectoredReadBytes; use 
pageserver_api::keyspace::KeySpace; use pageserver_api::models::ImageCompressionAlgorithm; use pageserver_api::shard::TenantShardId; diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 4a095c564d..77ce1ae670 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -34,8 +34,7 @@ use crate::tenant::disk_btree::{ }; use crate::tenant::timeline::GetVectoredError; use crate::tenant::vectored_blob_io::{ - BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, - VectoredReadPlanner, + BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadPlanner, }; use crate::tenant::{PageReconstructError, Timeline}; use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; @@ -46,6 +45,7 @@ use bytes::{Bytes, BytesMut}; use camino::{Utf8Path, Utf8PathBuf}; use hex; use itertools::Itertools; +use pageserver_api::config::MaxVectoredReadBytes; use pageserver_api::keyspace::KeySpace; use pageserver_api::shard::{ShardIdentity, TenantShardId}; use rand::{distributions::Alphanumeric, Rng}; diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 2c19e5b19f..e487bee1f2 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -215,7 +215,7 @@ impl IndexEntry { const _ASSERT_DEFAULT_CHECKPOINT_DISTANCE_IS_VALID: () = { let res = Self::validate_checkpoint_distance( - crate::tenant::config::defaults::DEFAULT_CHECKPOINT_DISTANCE, + pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE, ); if res.is_err() { panic!("default checkpoint distance is valid") diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index f5680ced90..478e9bb4f0 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -10,7 +10,6 @@ use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics::TENANT_TASK_EVENTS; use crate::task_mgr; use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; -use crate::tenant::config::defaults::DEFAULT_COMPACTION_PERIOD; use crate::tenant::throttle::Stats; use crate::tenant::timeline::CompactionError; use crate::tenant::{Tenant, TenantState}; @@ -456,9 +455,11 @@ async fn ingest_housekeeping_loop(tenant: Arc, cancel: CancellationToken // If compaction period is set to zero (to disable it), then we will use a reasonable default let period = if period == Duration::ZERO { - humantime::Duration::from_str(DEFAULT_COMPACTION_PERIOD) - .unwrap() - .into() + humantime::Duration::from_str( + pageserver_api::config::tenant_conf_defaults::DEFAULT_COMPACTION_PERIOD, + ) + .unwrap() + .into() } else { period }; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 3b8f19a6c0..262dccac7d 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -66,7 +66,6 @@ use std::{ use crate::{ aux_file::AuxFileSizeEstimator, tenant::{ - config::defaults::DEFAULT_PITR_INTERVAL, layer_map::{LayerMap, SearchResult}, metadata::TimelineMetadata, storage_layer::{inmemory_layer::IndexEntry, PersistentLayerDesc}, @@ -102,6 +101,7 @@ use crate::{ pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind}, virtual_file::{MaybeFatalIo, VirtualFile}, }; +use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL; use crate::config::PageServerConf; 
use crate::keyspace::{KeyPartitioning, KeySpace}; diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index aad75ac59c..6b9c8386f7 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -19,6 +19,7 @@ use bytes::Bytes; use enumset::EnumSet; use fail::fail_point; use itertools::Itertools; +use pageserver_api::config::{CompactL0BypassPageCacheValidation, CompactL0Phase1ValueAccess}; use pageserver_api::key::KEY_SIZE; use pageserver_api::keyspace::ShardedRange; use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId}; @@ -29,7 +30,6 @@ use utils::id::TimelineId; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; -use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD}; use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::storage_layer::merge_iterator::MergeIterator; use crate::tenant::storage_layer::split_writer::{ @@ -43,6 +43,9 @@ use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter}; use crate::tenant::timeline::{Layer, ResidentLayer}; use crate::tenant::DeltaLayer; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; +use pageserver_api::config::tenant_conf_defaults::{ + DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD, +}; use crate::keyspace::KeySpace; use crate::repository::{Key, Value}; @@ -1433,43 +1436,6 @@ impl TryFrom for CompactLevel0Phase1Stats { } } -#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)] -#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] -pub enum CompactL0Phase1ValueAccess { - /// The old way. - PageCachedBlobIo, - /// The new way. - StreamingKmerge { - /// If set, we run both the old way and the new way, validate that - /// they are identical (=> [`CompactL0BypassPageCacheValidation`]), - /// and if the validation fails, - /// - in tests: fail them with a panic or - /// - in prod, log a rate-limited warning and use the old way's results. - /// - /// If not set, we only run the new way and trust its results. - validate: Option, - }, -} - -/// See [`CompactL0Phase1ValueAccess::StreamingKmerge`]. -#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)] -#[serde(rename_all = "kebab-case")] -pub enum CompactL0BypassPageCacheValidation { - /// Validate that the series of (key, lsn) pairs are the same. - KeyLsn, - /// Validate that the entire output of old and new way is identical. - KeyLsnValue, -} - -impl Default for CompactL0Phase1ValueAccess { - fn default() -> Self { - CompactL0Phase1ValueAccess::StreamingKmerge { - // TODO(https://github.com/neondatabase/neon/issues/8184): change to None once confident - validate: Some(CompactL0BypassPageCacheValidation::KeyLsnValue), - } - } -} - impl Timeline { /// Entry point for new tiered compaction algorithm. /// diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 146bcf0e35..4d51dc442d 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -16,7 +16,6 @@ //! Note that the vectored blob api does *not* go through the page cache. 
use std::collections::BTreeMap; -use std::num::NonZeroUsize; use bytes::BytesMut; use pageserver_api::key::Key; @@ -29,9 +28,6 @@ use crate::context::RequestContext; use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, LEN_COMPRESSION_BIT_MASK}; use crate::virtual_file::{self, VirtualFile}; -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -pub struct MaxVectoredReadBytes(pub NonZeroUsize); - /// Metadata bundled with the start and end offset of a blob. #[derive(Copy, Clone, Debug)] pub struct BlobMeta { diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 97d966e2da..ed6ff86c10 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -10,7 +10,6 @@ //! This is similar to PostgreSQL's virtual file descriptor facility in //! src/backend/storage/file/fd.c //! -use crate::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT; use crate::context::RequestContext; use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC}; @@ -19,6 +18,7 @@ use crate::tenant::TENANTS_SEGMENT_NAME; use camino::{Utf8Path, Utf8PathBuf}; use once_cell::sync::OnceCell; use owned_buffers_io::io_buf_ext::FullSlice; +use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT; use pageserver_api::shard::TenantShardId; use std::fs::File; use std::io::{Error, ErrorKind, Seek, SeekFrom}; diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index faef1ba9ff..ccde90ee1a 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -84,9 +84,14 @@ pub(crate) fn get() -> IoEngine { } }, Err(std::env::VarError::NotPresent) => { - crate::config::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE - .parse() - .unwrap() + #[cfg(target_os = "linux")] + { + IoEngineKind::TokioEpollUring + } + #[cfg(not(target_os = "linux"))] + { + IoEngineKind::StdFs + } } Err(std::env::VarError::NotUnicode(_)) => { panic!("env var {env_var_name} is not unicode"); diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 890538b86a..2df45a7e0e 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -24,7 +24,20 @@ from functools import cached_property, partial from itertools import chain, product from pathlib import Path from types import TracebackType -from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union, cast +from typing import ( + Any, + Callable, + Dict, + Iterable, + Iterator, + List, + Optional, + Tuple, + Type, + TypeVar, + Union, + cast, +) from urllib.parse import quote, urlparse import asyncpg @@ -90,6 +103,8 @@ from fixtures.utils import AuxFileStore as AuxFileStore # reexport from .neon_api import NeonAPI, NeonApiEndpoint +T = TypeVar("T") + """ This file contains pytest fixtures. A fixture is a test resource that can be summoned by placing its name in the test's arguments. @@ -2986,16 +3001,17 @@ class NeonPageserver(PgProtocol, LogUtils): def config_toml_path(self) -> Path: return self.workdir / "pageserver.toml" - def edit_config_toml(self, edit_fn: Callable[[Dict[str, Any]], None]): + def edit_config_toml(self, edit_fn: Callable[[Dict[str, Any]], T]) -> T: """ Edit the pageserver's config toml file in place. 
""" path = self.config_toml_path with open(path, "r") as f: config = toml.load(f) - edit_fn(config) + res = edit_fn(config) with open(path, "w") as f: toml.dump(config, f) + return res def patch_config_toml_nonrecursive(self, patch: Dict[str, Any]) -> Dict[str, Any]: """ diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 73af7950f1..ebf58d2bd1 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -142,11 +142,10 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): # We will start a pageserver with no control_plane_api set, so it won't be able to self-register env.storage_controller.node_register(env.pageserver) - replaced_config = env.pageserver.patch_config_toml_nonrecursive( - { - "control_plane_api": "", - } - ) + def remove_control_plane_api_field(config): + return config.pop("control_plane_api") + + control_plane_api = env.pageserver.edit_config_toml(remove_control_plane_api_field) env.pageserver.start() env.storage_controller.node_configure(env.pageserver.id, {"availability": "Active"}) @@ -179,7 +178,11 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): env.pageserver.stop() # Starting without the override that disabled control_plane_api - env.pageserver.patch_config_toml_nonrecursive(replaced_config) + env.pageserver.patch_config_toml_nonrecursive( + { + "control_plane_api": control_plane_api, + } + ) env.pageserver.start() generate_uploads_and_deletions(env, pageserver=env.pageserver, init=False) diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 642b9e449b..9bf5f8680b 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -733,7 +733,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): # We will run with the limit set to 1, so that once we have one tenant stuck # in a pausable failpoint, the rest are prevented from proceeding through warmup. - neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'" + neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = 1" env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() @@ -984,7 +984,7 @@ def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder): def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder): - neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'" + neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = 1" env = neon_env_builder.init_start() @@ -1062,7 +1062,7 @@ def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder): @pytest.mark.parametrize("activation_method", ["endpoint", "branch", "delete"]) def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_method: str): # env.initial_tenant will take up this permit when attaching with lazy because of a failpoint activated after restart - neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'" + neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = 1" env = neon_env_builder.init_start() From efe03d5a1ccce8e0f53e733d61fd0e3d0dd904f8 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 5 Sep 2024 16:29:48 +0300 Subject: [PATCH 1563/1571] build: sync between benchies (#8919) Sometimes, the benchmarks fail to start up pageserver in 10s without any obvious reason. 
Benchmarks run sequentially on otherwise idle runners. Try running `sync(2)` after each bench to force a cleaner slate. Implement this via: - SYNC_AFTER_EACH_TEST environment variable enabled autouse fixture - autouse fixture seems to be outermost fixture, so it works as expected - set SYNC_AFTER_EACH_TEST=true for benchmarks in build_and_test workflow Evidence: https://neon-github-public-dev.s3.amazonaws.com/reports/main/10678984691/index.html#suites/5008d72a1ba3c0d618a030a938fc035c/1210266507534c0f/ --------- Co-authored-by: Alexander Bayandin --- .github/workflows/build_and_test.yml | 1 + test_runner/fixtures/compare_fixtures.py | 26 ++++++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 53d33b420f..ee5fd1b0c6 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -286,6 +286,7 @@ jobs: PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}" PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring + SYNC_AFTER_EACH_TEST: true # XXX: no coverage data handling here, since benchmarks are run on release builds, # while coverage is currently collected for the debug ones diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 98a9dd7184..7c4a8db36f 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -1,3 +1,5 @@ +import os +import time from abc import ABC, abstractmethod from contextlib import _GeneratorContextManager, contextmanager @@ -8,6 +10,7 @@ import pytest from _pytest.fixtures import FixtureRequest from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, PgBin, @@ -333,3 +336,26 @@ def neon_with_baseline(request: FixtureRequest) -> PgCompare: fixture = request.getfixturevalue(request.param) assert isinstance(fixture, PgCompare), f"test error: fixture {fixture} is not PgCompare" return fixture + + +@pytest.fixture(scope="function", autouse=True) +def sync_after_each_test(): + # The fixture calls `sync(2)` after each test if `SYNC_AFTER_EACH_TEST` env var is `true` + # + # In CI, `SYNC_AFTER_EACH_TEST` is set to `true` only for benchmarks (`test_runner/performance`) + # that are run on self-hosted runners because some of these tests are pretty write-heavy + # and create issues to start the processes within 10s + key = "SYNC_AFTER_EACH_TEST" + enabled = os.environ.get(key) == "true" + + yield + + if not enabled: + # regress test, or running locally + return + + start = time.time() + # we only run benches on unices, the method might not exist on windows + os.sync() + elapsed = time.time() - start + log.info(f"called sync after test {elapsed=}") From ebddda5b7f85587998df00dbf7dc88679459b494 Mon Sep 17 00:00:00 2001 From: vladov Date: Thu, 5 Sep 2024 08:06:57 -0700 Subject: [PATCH 1564/1571] Fix precedence issue causing yielding loop to never yield. (#8922) There is a bug in `yielding_loop` that causes it to never yield. ## Summary of changes Fixed the bug. `i + 1 % interval == 0` will always evaluate to `i + 1 == 0` which is false ([Playground](https://play.rust-lang.org/?version=stable&mode=debug&edition=2021&gist=68e6ca393a02113cb7720115c2842e75)). 
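For illustration only (not taken from the patch): a minimal sketch of the precedence difference. In Rust, `%` binds tighter than `+`, so the unparenthesized condition can never be true for an unsigned counter.

```rust
fn main() {
    let interval: usize = 1000;
    for i in 0..2_001usize {
        // Old condition: parses as `i + (1 % interval) == 0`, i.e. `i + 1 == 0`,
        // which never holds for an unsigned `i`.
        let old = i + 1 % interval == 0;
        // Fixed condition: group the addition first, so it holds once per `interval` items.
        let new = (i + 1) % interval == 0;
        if old || new {
            // Prints only for i = 999 and i = 1999, and only because of `new`.
            println!("i={i}: old={old}, new={new}");
        }
    }
}
```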
This function is called in 2 places [here](https://github.com/neondatabase/neon/blob/99fa1c36004d710c65a47ffefaf66b4b5c6b4ce1/pageserver/src/tenant/secondary/scheduler.rs#L389) and [here](https://github.com/neondatabase/neon/blob/99fa1c36004d710c65a47ffefaf66b4b5c6b4ce1/pageserver/src/tenant/secondary/heatmap_uploader.rs#L152) with `interval == 1000` in both cases. This may change the performance of the system since now we are yielding to tokio. Also, this may expose undefined behavior since it is now possible for tasks to be moved between threads/whatever tokio does to tasks. However, this was the intention of the author of the code. --- libs/utils/src/yielding_loop.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/utils/src/yielding_loop.rs b/libs/utils/src/yielding_loop.rs index 963279eb4c..41c4cee45d 100644 --- a/libs/utils/src/yielding_loop.rs +++ b/libs/utils/src/yielding_loop.rs @@ -23,7 +23,7 @@ where for (i, item) in iter.enumerate() { visitor(item); - if i + 1 % interval == 0 { + if (i + 1) % interval == 0 { tokio::task::yield_now().await; if cancel.is_cancelled() { return Err(YieldingLoopError::Cancelled); From fd12dd942f61a0a22016fa219f4b3a87c81dc0b0 Mon Sep 17 00:00:00 2001 From: Stefan Radig Date: Thu, 5 Sep 2024 17:48:51 +0200 Subject: [PATCH 1565/1571] Add installation instructions for m4 on mac (#8929) ## Problem Building on MacOS failed due to missing m4. Although a window was popping up claiming to install m4, this was not helping. ## Summary of changes Add instructions to install m4 using brew and link it (thanks to Folke for helping). --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 735edef0fc..b54956f773 100644 --- a/README.md +++ b/README.md @@ -64,6 +64,12 @@ brew install protobuf openssl flex bison icu4c pkg-config echo 'export PATH="$(brew --prefix openssl)/bin:$PATH"' >> ~/.zshrc ``` +If you get errors about missing `m4` you may have to install it manually: +``` +brew install m4 +brew link --force m4 +``` + 2. 
[Install Rust](https://www.rust-lang.org/tools/install) ``` # recommended approach from https://www.rust-lang.org/tools/install From 04f99a87bfee4da41df2bd5724e73b3646c2bf3e Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 5 Sep 2024 19:14:21 +0100 Subject: [PATCH 1566/1571] storcon: make pageserver AZ id mandatory (#8856) ## Problem https://github.com/neondatabase/neon/pull/8852 introduced a new nullable column for the `nodes` table: `availability_zone_id` ## Summary of changes * Make neon local and the test suite always provide an az id * Make the az id field in the ps registration request mandatory * Migrate the column to non-nullable and adjust in memory state accordingly * Remove the code that was used to populate the az id for pre-existing nodes --- Dockerfile | 1 + control_plane/storcon_cli/src/main.rs | 2 +- libs/pageserver_api/src/controller_api.rs | 2 +- pageserver/src/control_plane_client.rs | 24 +++++++--- .../down.sql | 1 + .../up.sql | 1 + storage_controller/src/node.rs | 23 +++------- storage_controller/src/persistence.rs | 28 +----------- storage_controller/src/scheduler.rs | 2 +- storage_controller/src/schema.rs | 2 +- storage_controller/src/service.rs | 44 ++----------------- test_runner/fixtures/neon_fixtures.py | 3 ++ 12 files changed, 41 insertions(+), 92 deletions(-) create mode 100644 storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/down.sql create mode 100644 storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/up.sql diff --git a/Dockerfile b/Dockerfile index d3d12330c6..1efedfa9bc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -87,6 +87,7 @@ RUN mkdir -p /data/.neon/ && \ "pg_distrib_dir='/usr/local/'\n" \ "listen_pg_addr='0.0.0.0:6400'\n" \ "listen_http_addr='0.0.0.0:9898'\n" \ + "availability_zone='local'\n" \ > /data/.neon/pageserver.toml && \ chown -R neon:neon /data/.neon diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 5cce6cf3ae..2a81a3d825 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -336,7 +336,7 @@ async fn main() -> anyhow::Result<()> { listen_pg_port, listen_http_addr, listen_http_port, - availability_zone_id: Some(availability_zone_id), + availability_zone_id, }), ) .await?; diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 345abd69b6..6fb5a9a139 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -57,7 +57,7 @@ pub struct NodeRegisterRequest { pub listen_http_addr: String, pub listen_http_port: u16, - pub availability_zone_id: Option, + pub availability_zone_id: String, } #[derive(Serialize, Deserialize)] diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs index 56a536c387..f6d1c35a8c 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/control_plane_client.rs @@ -141,10 +141,24 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient { m.other ); - let az_id = m - .other - .get("availability_zone_id") - .and_then(|jv| jv.as_str().map(|str| str.to_owned())); + let az_id = { + let az_id_from_metadata = m + .other + .get("availability_zone_id") + .and_then(|jv| jv.as_str().map(|str| str.to_owned())); + + match az_id_from_metadata { + Some(az_id) => Some(az_id), + None => { + tracing::warn!("metadata.json does not contain an 'availability_zone_id' field"); + conf.availability_zone.clone() + } + } + }; + + if az_id.is_none() { + 
panic!("Availablity zone id could not be inferred from metadata.json or pageserver config"); + } Some(NodeRegisterRequest { node_id: conf.id, @@ -152,7 +166,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient { listen_pg_port: m.postgres_port, listen_http_addr: m.http_host, listen_http_port: m.http_port, - availability_zone_id: az_id, + availability_zone_id: az_id.expect("Checked above"), }) } Err(e) => { diff --git a/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/down.sql b/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/down.sql new file mode 100644 index 0000000000..4fcb928533 --- /dev/null +++ b/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/down.sql @@ -0,0 +1 @@ +ALTER TABLE nodes ALTER availability_zone_id DROP NOT NULL; diff --git a/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/up.sql b/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/up.sql new file mode 100644 index 0000000000..c5b4534087 --- /dev/null +++ b/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/up.sql @@ -0,0 +1 @@ +ALTER TABLE nodes ALTER availability_zone_id SET NOT NULL; diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index 73cecc491d..cb9ce10d23 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -36,7 +36,7 @@ pub(crate) struct Node { listen_pg_addr: String, listen_pg_port: u16, - availability_zone_id: Option, + availability_zone_id: String, // This cancellation token means "stop any RPCs in flight to this node, and don't start // any more". It is not related to process shutdown. @@ -63,8 +63,9 @@ impl Node { self.id } - pub(crate) fn get_availability_zone_id(&self) -> Option<&str> { - self.availability_zone_id.as_deref() + #[allow(unused)] + pub(crate) fn get_availability_zone_id(&self) -> &str { + self.availability_zone_id.as_str() } pub(crate) fn get_scheduling(&self) -> NodeSchedulingPolicy { @@ -78,22 +79,12 @@ impl Node { /// Does this registration request match `self`? This is used when deciding whether a registration /// request should be allowed to update an existing record with the same node ID. 
pub(crate) fn registration_match(&self, register_req: &NodeRegisterRequest) -> bool { - let az_ids_match = { - match ( - self.availability_zone_id.as_deref(), - register_req.availability_zone_id.as_deref(), - ) { - (Some(current_az), Some(register_req_az)) => current_az == register_req_az, - _ => true, - } - }; - - az_ids_match - && self.id == register_req.node_id + self.id == register_req.node_id && self.listen_http_addr == register_req.listen_http_addr && self.listen_http_port == register_req.listen_http_port && self.listen_pg_addr == register_req.listen_pg_addr && self.listen_pg_port == register_req.listen_pg_port + && self.availability_zone_id == register_req.availability_zone_id } /// For a shard located on this node, populate a response object @@ -190,7 +181,7 @@ impl Node { listen_http_port: u16, listen_pg_addr: String, listen_pg_port: u16, - availability_zone_id: Option, + availability_zone_id: String, ) -> Self { Self { id, diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index e801289752..6df05ebd13 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -105,7 +105,6 @@ pub(crate) enum DatabaseOperation { ListMetadataHealthOutdated, GetLeader, UpdateLeader, - SetNodeAzId, } #[must_use] @@ -325,31 +324,6 @@ impl Persistence { } } - pub(crate) async fn set_node_availability_zone_id( - &self, - input_node_id: NodeId, - input_az_id: String, - ) -> DatabaseResult<()> { - use crate::schema::nodes::dsl::*; - let updated = self - .with_measured_conn(DatabaseOperation::SetNodeAzId, move |conn| { - let updated = diesel::update(nodes) - .filter(node_id.eq(input_node_id.0 as i64)) - .set((availability_zone_id.eq(input_az_id.clone()),)) - .execute(conn)?; - Ok(updated) - }) - .await?; - - if updated != 1 { - Err(DatabaseError::Logical(format!( - "Node {node_id:?} not found for setting az id", - ))) - } else { - Ok(()) - } - } - /// At startup, load the high level state for shards, such as their config + policy. This will /// be enriched at runtime with state discovered on pageservers. pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult> { @@ -1110,7 +1084,7 @@ pub(crate) struct NodePersistence { pub(crate) listen_http_port: i32, pub(crate) listen_pg_addr: String, pub(crate) listen_pg_port: i32, - pub(crate) availability_zone_id: Option, + pub(crate) availability_zone_id: String, } /// Tenant metadata health status that are stored durably. diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index ef4da6861c..deb5f27226 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -528,7 +528,7 @@ pub(crate) mod test_utils { 80 + i as u16, format!("pghost-{i}"), 5432 + i as u16, - None, + "test-az".to_string(), ); node.set_availability(NodeAvailability::Active(test_utilization::simple(0, 0))); assert!(node.is_available()); diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs index e0f515daea..93ab774b5f 100644 --- a/storage_controller/src/schema.rs +++ b/storage_controller/src/schema.rs @@ -25,7 +25,7 @@ diesel::table! 
{ listen_http_port -> Int4, listen_pg_addr -> Varchar, listen_pg_port -> Int4, - availability_zone_id -> Nullable, + availability_zone_id -> Varchar, } } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index ca416095bb..2911cd5ac4 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1264,7 +1264,7 @@ impl Service { 123, "".to_string(), 123, - None, + "test_az".to_string(), ); scheduler.node_upsert(&node); @@ -4825,15 +4825,8 @@ impl Service { ) .await; - if register_req.availability_zone_id.is_none() { - tracing::warn!( - "Node {} registering without specific availability zone id", - register_req.node_id - ); - } - enum RegistrationStatus { - Matched(Node), + Matched, Mismatched, New, } @@ -4842,7 +4835,7 @@ impl Service { let locked = self.inner.read().unwrap(); if let Some(node) = locked.nodes.get(®ister_req.node_id) { if node.registration_match(®ister_req) { - RegistrationStatus::Matched(node.clone()) + RegistrationStatus::Matched } else { RegistrationStatus::Mismatched } @@ -4852,41 +4845,12 @@ impl Service { }; match registration_status { - RegistrationStatus::Matched(node) => { + RegistrationStatus::Matched => { tracing::info!( "Node {} re-registered with matching address", register_req.node_id ); - if node.get_availability_zone_id().is_none() { - if let Some(az_id) = register_req.availability_zone_id.clone() { - tracing::info!("Extracting availability zone id from registration request for node {}: {}", - register_req.node_id, az_id); - - // Persist to the database and update in memory state. See comment below - // on ordering. - self.persistence - .set_node_availability_zone_id(register_req.node_id, az_id) - .await?; - let node_with_az = Node::new( - register_req.node_id, - register_req.listen_http_addr, - register_req.listen_http_port, - register_req.listen_pg_addr, - register_req.listen_pg_port, - register_req.availability_zone_id, - ); - - let mut locked = self.inner.write().unwrap(); - let mut new_nodes = (*locked.nodes).clone(); - - locked.scheduler.node_upsert(&node_with_az); - new_nodes.insert(register_req.node_id, node_with_az); - - locked.nodes = Arc::new(new_nodes); - } - } - return Ok(()); } RegistrationStatus::Mismatched => { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 2df45a7e0e..0c692ceb69 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -758,6 +758,9 @@ class NeonEnvBuilder: patch_script = "" for ps in self.env.pageservers: patch_script += f"UPDATE nodes SET listen_http_port={ps.service_port.http}, listen_pg_port={ps.service_port.pg} WHERE node_id = '{ps.id}';" + # This is a temporary to get the backward compat test happy + # since the compat snapshot was generated with an older version of neon local + patch_script += f"UPDATE nodes SET availability_zone_id='{ps.az_id}' WHERE node_id = '{ps.id}' AND availability_zone_id IS NULL;" patch_script_path.write_text(patch_script) # Update the config with info about tenants and timelines From cf11c8ab6aa234b59354425116da98d58fa1826d Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 6 Sep 2024 10:52:29 +0200 Subject: [PATCH 1567/1571] update svg_fmt to 0.4.3 (#8930) Audited ``` diff -r -u ~/.cargo/registry/src/index.crates.io-6f17d22bba15001f/svg_fmt-0.4.{2,3} ``` fixes https://github.com/neondatabase/neon/issues/7763 --- Cargo.lock | 5 +++-- Cargo.toml | 3 +-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock 
index 91917d5351..3f2787f15b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6094,8 +6094,9 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" [[package]] name = "svg_fmt" -version = "0.4.2" -source = "git+https://github.com/nical/rust_debug?rev=28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4#28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20e16a0f46cf5fd675563ef54f26e83e20f2366bcf027bcb3cc3ed2b98aaf2ca" [[package]] name = "syn" diff --git a/Cargo.toml b/Cargo.toml index 4fea3e8d80..2415337110 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -161,8 +161,7 @@ socket2 = "0.5" strum = "0.24" strum_macros = "0.24" "subtle" = "2.5.0" -# Our PR https://github.com/nical/rust_debug/pull/4 has been merged but no new version released yet -svg_fmt = { git = "https://github.com/nical/rust_debug", rev = "28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4" } +svg_fmt = "0.4.3" sync_wrapper = "0.1.2" tar = "0.4" task-local-extensions = "0.1.4" From 06e840b884c242550e2a5ad0e72bfa762bce1709 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 6 Sep 2024 10:58:48 +0200 Subject: [PATCH 1568/1571] compact_level0_phase1: ignore access mode config, always do streaming-kmerge without validation (#8934) refs https://github.com/neondatabase/neon/issues/8184 PR https://github.com/neondatabase/infra/pull/1905 enabled streaming-kmerge without validation everywhere. It rolls into prod sooner or in the same release as this PR. --- libs/pageserver_api/src/config.rs | 43 +----- pageserver/src/bin/pageserver.rs | 1 - pageserver/src/config.rs | 26 ++-- pageserver/src/tenant/timeline/compaction.rs | 139 +------------------ 4 files changed, 22 insertions(+), 187 deletions(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index b2662c562a..1194ee93ef 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -104,7 +104,9 @@ pub struct ConfigToml { pub image_compression: ImageCompressionAlgorithm, pub ephemeral_bytes_per_memory_kb: usize, pub l0_flush: Option, - pub compact_level0_phase1_value_access: CompactL0Phase1ValueAccess, + #[serde(skip_serializing)] + // TODO(https://github.com/neondatabase/neon/issues/8184): remove after this field is removed from all pageserver.toml's + pub compact_level0_phase1_value_access: serde::de::IgnoredAny, pub virtual_file_direct_io: crate::models::virtual_file::DirectIoMode, pub io_buffer_alignment: usize, } @@ -209,43 +211,6 @@ pub enum GetImpl { #[serde(transparent)] pub struct MaxVectoredReadBytes(pub NonZeroUsize); -#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)] -#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] -pub enum CompactL0Phase1ValueAccess { - /// The old way. - PageCachedBlobIo, - /// The new way. - StreamingKmerge { - /// If set, we run both the old way and the new way, validate that - /// they are identical (=> [`CompactL0BypassPageCacheValidation`]), - /// and if the validation fails, - /// - in tests: fail them with a panic or - /// - in prod, log a rate-limited warning and use the old way's results. - /// - /// If not set, we only run the new way and trust its results. - validate: Option, - }, -} - -/// See [`CompactL0Phase1ValueAccess::StreamingKmerge`]. 
-#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)] -#[serde(rename_all = "kebab-case")] -pub enum CompactL0BypassPageCacheValidation { - /// Validate that the series of (key, lsn) pairs are the same. - KeyLsn, - /// Validate that the entire output of old and new way is identical. - KeyLsnValue, -} - -impl Default for CompactL0Phase1ValueAccess { - fn default() -> Self { - CompactL0Phase1ValueAccess::StreamingKmerge { - // TODO(https://github.com/neondatabase/neon/issues/8184): change to None once confident - validate: Some(CompactL0BypassPageCacheValidation::KeyLsnValue), - } - } -} - /// A tenant's calcuated configuration, which is the result of merging a /// tenant's TenantConfOpt with the global TenantConf from PageServerConf. /// @@ -452,7 +417,7 @@ impl Default for ConfigToml { image_compression: (DEFAULT_IMAGE_COMPRESSION), ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), l0_flush: None, - compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), + compact_level0_phase1_value_access: Default::default(), virtual_file_direct_io: crate::models::virtual_file::DirectIoMode::default(), io_buffer_alignment: DEFAULT_IO_BUFFER_ALIGNMENT, diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 2c60e8d7d1..59194ab4bd 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -125,7 +125,6 @@ fn main() -> anyhow::Result<()> { // after setting up logging, log the effective IO engine choice and read path implementations info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings"); - info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access"); info!(?conf.io_buffer_alignment, "starting with setting for IO buffer alignment"); // The tenants directory contains all the pageserver local disk state. diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index c159b66905..4e68e276d3 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -174,10 +174,6 @@ pub struct PageServerConf { pub l0_flush: crate::l0_flush::L0FlushConfig, - /// This flag is temporary and will be removed after gradual rollout. - /// See . - pub compact_level0_phase1_value_access: pageserver_api::config::CompactL0Phase1ValueAccess, - /// Direct IO settings pub virtual_file_direct_io: virtual_file::DirectIoMode, @@ -338,7 +334,7 @@ impl PageServerConf { max_vectored_read_bytes, image_compression, ephemeral_bytes_per_memory_kb, - compact_level0_phase1_value_access, + compact_level0_phase1_value_access: _, l0_flush, virtual_file_direct_io, concurrent_tenant_warmup, @@ -383,7 +379,6 @@ impl PageServerConf { max_vectored_read_bytes, image_compression, ephemeral_bytes_per_memory_kb, - compact_level0_phase1_value_access, virtual_file_direct_io, io_buffer_alignment, @@ -561,6 +556,16 @@ mod tests { .expect("parse_and_validate"); } + #[test] + fn test_compactl0_phase1_access_mode_is_ignored_silently() { + let input = indoc::indoc! {r#" + [compact_level0_phase1_value_access] + mode = "streaming-kmerge" + validate = "key-lsn-value" + "#}; + toml_edit::de::from_str::(input).unwrap(); + } + /// If there's a typo in the pageserver config, we'd rather catch that typo /// and fail pageserver startup than silently ignoring the typo, leaving whoever /// made it in the believe that their config change is effective. 
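The accept-and-ignore behaviour exercised by `test_compactl0_phase1_access_mode_is_ignored_silently` above comes from serde's `IgnoredAny`: the retired key remains a known field, but whatever value it carries is deserialized and discarded. A minimal standalone sketch of that pattern (field names are made up for the example; it assumes the `serde` crate with the `derive` feature plus the `toml` crate):

```rust
use serde::Deserialize;

#[derive(Debug, Deserialize)]
#[serde(deny_unknown_fields)]
struct Conf {
    listen_addr: String,
    // Retired knob: the key is still accepted, but its value (table, string,
    // number, ...) is parsed and thrown away rather than stored.
    retired_knob: serde::de::IgnoredAny,
}

fn main() {
    let conf: Conf = toml::from_str(
        r#"
            listen_addr = "0.0.0.0:9898"

            [retired_knob]
            mode = "streaming-kmerge"
            validate = "key-lsn-value"
        "#,
    )
    .unwrap();
    println!("{conf:?}");
}
```

A genuinely unknown top-level key would still be rejected by `deny_unknown_fields`; only the explicitly retired field is ignored silently.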
@@ -637,14 +642,5 @@ mod tests { // some_invalid_field = 23 // "#} // ); - - test!( - compact_level0_phase1_value_access, - indoc! {r#" - [compact_level0_phase1_value_access] - mode = "streaming-kmerge" - some_invalid_field = 23 - "#} - ); } } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 6b9c8386f7..a87b502cd6 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -19,7 +19,6 @@ use bytes::Bytes; use enumset::EnumSet; use fail::fail_point; use itertools::Itertools; -use pageserver_api::config::{CompactL0BypassPageCacheValidation, CompactL0Phase1ValueAccess}; use pageserver_api::key::KEY_SIZE; use pageserver_api::keyspace::ShardedRange; use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId}; @@ -912,137 +911,13 @@ impl Timeline { // we're compacting, in key, LSN order. // If there's both a Value::Image and Value::WalRecord for the same (key,lsn), // then the Value::Image is ordered before Value::WalRecord. - // - // TODO(https://github.com/neondatabase/neon/issues/8184): remove the page cached blob_io - // option and validation code once we've reached confidence. - enum AllValuesIter<'a> { - PageCachedBlobIo { - all_keys_iter: VecIter<'a>, - }, - StreamingKmergeBypassingPageCache { - merge_iter: MergeIterator<'a>, - }, - ValidatingStreamingKmergeBypassingPageCache { - mode: CompactL0BypassPageCacheValidation, - merge_iter: MergeIterator<'a>, - all_keys_iter: VecIter<'a>, - }, - } - type VecIter<'a> = std::slice::Iter<'a, DeltaEntry<'a>>; // TODO: distinguished lifetimes - impl AllValuesIter<'_> { - async fn next_all_keys_iter( - iter: &mut VecIter<'_>, - ctx: &RequestContext, - ) -> anyhow::Result> { - let Some(DeltaEntry { - key, - lsn, - val: value_ref, - .. - }) = iter.next() - else { - return Ok(None); - }; - let value = value_ref.load(ctx).await?; - Ok(Some((*key, *lsn, value))) - } - async fn next( - &mut self, - ctx: &RequestContext, - ) -> anyhow::Result> { - match self { - AllValuesIter::PageCachedBlobIo { all_keys_iter: iter } => { - Self::next_all_keys_iter(iter, ctx).await - } - AllValuesIter::StreamingKmergeBypassingPageCache { merge_iter } => merge_iter.next().await, - AllValuesIter::ValidatingStreamingKmergeBypassingPageCache { mode, merge_iter, all_keys_iter } => async { - // advance both iterators - let all_keys_iter_item = Self::next_all_keys_iter(all_keys_iter, ctx).await; - let merge_iter_item = merge_iter.next().await; - // compare results & log warnings as needed - macro_rules! 
rate_limited_warn { - ($($arg:tt)*) => {{ - if cfg!(debug_assertions) || cfg!(feature = "testing") { - warn!($($arg)*); - panic!("CompactL0BypassPageCacheValidation failure, check logs"); - } - use once_cell::sync::Lazy; - use utils::rate_limit::RateLimit; - use std::sync::Mutex; - use std::time::Duration; - static LOGGED: Lazy> = - Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); - let mut rate_limit = LOGGED.lock().unwrap(); - rate_limit.call(|| { - warn!($($arg)*); - }); - }} - } - match (&all_keys_iter_item, &merge_iter_item) { - (Err(_), Err(_)) => { - // don't bother asserting equivality of the errors - } - (Err(all_keys), Ok(merge)) => { - rate_limited_warn!(?merge, "all_keys_iter returned an error where merge did not: {all_keys:?}"); - }, - (Ok(all_keys), Err(merge)) => { - rate_limited_warn!(?all_keys, "merge returned an error where all_keys_iter did not: {merge:?}"); - }, - (Ok(None), Ok(None)) => { } - (Ok(Some(all_keys)), Ok(None)) => { - rate_limited_warn!(?all_keys, "merge returned None where all_keys_iter returned Some"); - } - (Ok(None), Ok(Some(merge))) => { - rate_limited_warn!(?merge, "all_keys_iter returned None where merge returned Some"); - } - (Ok(Some((all_keys_key, all_keys_lsn, all_keys_value))), Ok(Some((merge_key, merge_lsn, merge_value)))) => { - match mode { - // TODO: in this mode, we still load the value from disk for both iterators, even though we only need the all_keys_iter one - CompactL0BypassPageCacheValidation::KeyLsn => { - let all_keys = (all_keys_key, all_keys_lsn); - let merge = (merge_key, merge_lsn); - if all_keys != merge { - rate_limited_warn!(?all_keys, ?merge, "merge returned a different (Key,LSN) than all_keys_iter"); - } - } - CompactL0BypassPageCacheValidation::KeyLsnValue => { - let all_keys = (all_keys_key, all_keys_lsn, all_keys_value); - let merge = (merge_key, merge_lsn, merge_value); - if all_keys != merge { - rate_limited_warn!(?all_keys, ?merge, "merge returned a different (Key,LSN,Value) than all_keys_iter"); - } - } - } - } - } - // in case of mismatch, trust the legacy all_keys_iter_item - all_keys_iter_item - }.instrument(info_span!("next")).await - } - } - } - let mut all_values_iter = match &self.conf.compact_level0_phase1_value_access { - CompactL0Phase1ValueAccess::PageCachedBlobIo => AllValuesIter::PageCachedBlobIo { - all_keys_iter: all_keys.iter(), - }, - CompactL0Phase1ValueAccess::StreamingKmerge { validate } => { - let merge_iter = { - let mut deltas = Vec::with_capacity(deltas_to_compact.len()); - for l in deltas_to_compact.iter() { - let l = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?; - deltas.push(l); - } - MergeIterator::create(&deltas, &[], ctx) - }; - match validate { - None => AllValuesIter::StreamingKmergeBypassingPageCache { merge_iter }, - Some(validate) => AllValuesIter::ValidatingStreamingKmergeBypassingPageCache { - mode: validate.clone(), - merge_iter, - all_keys_iter: all_keys.iter(), - }, - } + let mut all_values_iter = { + let mut deltas = Vec::with_capacity(deltas_to_compact.len()); + for l in deltas_to_compact.iter() { + let l = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?; + deltas.push(l); } + MergeIterator::create(&deltas, &[], ctx) }; // This iterator walks through all keys and is needed to calculate size used by each key @@ -1119,7 +994,7 @@ impl Timeline { let mut keys = 0; while let Some((key, lsn, value)) = all_values_iter - .next(ctx) + .next() .await .map_err(CompactionError::Other)? 
{ From a1323231bc65539f55eb1bfd341fb65d06d0ed22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 6 Sep 2024 12:40:19 +0200 Subject: [PATCH 1569/1571] Update Rust to 1.81.0 (#8939) We keep the practice of keeping the compiler up to date, pointing to the latest release. This is done by many other projects in the Rust ecosystem as well. [Release notes](https://github.com/rust-lang/rust/blob/master/RELEASES.md#version-1810-2024-09-05). Prior update was in #8667 and #8518 --- Cargo.lock | 30 +++++-------------- Cargo.toml | 2 +- Dockerfile.build-tools | 2 +- libs/postgres_ffi/build.rs | 2 +- libs/walproposer/build.rs | 21 ++++++++++--- libs/walproposer/src/api_bindings.rs | 10 +++---- .../tenant/remote_timeline_client/download.rs | 3 +- proxy/src/console/provider/neon.rs | 5 +--- rust-toolchain.toml | 2 +- safekeeper/src/send_wal.rs | 5 ++-- workspace_hack/Cargo.toml | 6 ++-- 11 files changed, 42 insertions(+), 46 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3f2787f15b..634af67198 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -915,25 +915,22 @@ dependencies = [ [[package]] name = "bindgen" -version = "0.65.1" +version = "0.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfdf7b466f9a4903edc73f95d6d2bcd5baf8ae620638762244d3f60143643cc5" +checksum = "f49d8fed880d473ea71efb9bf597651e77201bdd4893efe54c9e5d65ae04ce6f" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.4.1", "cexpr", "clang-sys", - "lazy_static", - "lazycell", + "itertools 0.12.1", "log", - "peeking_take_while", - "prettyplease 0.2.6", + "prettyplease 0.2.17", "proc-macro2", "quote", "regex", "rustc-hash", "shlex", "syn 2.0.52", - "which", ] [[package]] @@ -2949,12 +2946,6 @@ dependencies = [ "spin 0.5.2", ] -[[package]] -name = "lazycell" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" - [[package]] name = "libc" version = "0.2.150" @@ -3977,12 +3968,6 @@ dependencies = [ "sha2", ] -[[package]] -name = "peeking_take_while" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" - [[package]] name = "pem" version = "3.0.3" @@ -4280,9 +4265,9 @@ dependencies = [ [[package]] name = "prettyplease" -version = "0.2.6" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b69d39aab54d069e7f2fe8cb970493e7834601ca2d8c65fd7bbd183578080d1" +checksum = "8d3928fb5db768cb86f891ff014f0144589297e3c6a1aba6ed7cecfdace270c7" dependencies = [ "proc-macro2", "syn 2.0.52", @@ -7628,6 +7613,7 @@ dependencies = [ "hyper 0.14.26", "indexmap 1.9.3", "itertools 0.10.5", + "itertools 0.12.1", "lazy_static", "libc", "log", diff --git a/Cargo.toml b/Cargo.toml index 2415337110..5045ee0d4d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -64,7 +64,7 @@ aws-types = "1.2.0" axum = { version = "0.6.20", features = ["ws"] } base64 = "0.13.0" bincode = "1.3" -bindgen = "0.65" +bindgen = "0.70" bit_field = "0.10.2" bstr = "1.0" byteorder = "1.4" diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index d6beb61369..a9cbed85fb 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -192,7 +192,7 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.80.1 +ENV RUSTC_VERSION=1.81.0 ENV 
RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" ARG RUSTFILT_VERSION=0.2.1 diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs index 370d9e9a6f..d3e3ce648f 100644 --- a/libs/postgres_ffi/build.rs +++ b/libs/postgres_ffi/build.rs @@ -14,7 +14,7 @@ impl ParseCallbacks for PostgresFfiCallbacks { fn include_file(&self, filename: &str) { // This does the equivalent of passing bindgen::CargoCallbacks // to the builder .parse_callbacks() method. - let cargo_callbacks = bindgen::CargoCallbacks; + let cargo_callbacks = bindgen::CargoCallbacks::new(); cargo_callbacks.include_file(filename) } diff --git a/libs/walproposer/build.rs b/libs/walproposer/build.rs index 7bb077062b..28547f52bf 100644 --- a/libs/walproposer/build.rs +++ b/libs/walproposer/build.rs @@ -4,7 +4,6 @@ use std::{env, path::PathBuf, process::Command}; use anyhow::{anyhow, Context}; -use bindgen::CargoCallbacks; fn main() -> anyhow::Result<()> { // Tell cargo to invalidate the built crate whenever the wrapper changes @@ -64,16 +63,25 @@ fn main() -> anyhow::Result<()> { .map_err(|s| anyhow!("Bad postgres server path {s:?}"))? }; + let unwind_abi_functions = [ + "log_internal", + "recovery_download", + "start_streaming", + "finish_sync_safekeepers", + "wait_event_set", + "WalProposerStart", + ]; + // The bindgen::Builder is the main entry point // to bindgen, and lets you build up options for // the resulting bindings. - let bindings = bindgen::Builder::default() + let mut builder = bindgen::Builder::default() // The input header we would like to generate // bindings for. .header("bindgen_deps.h") // Tell cargo to invalidate the built crate whenever any of the // included header files changed. - .parse_callbacks(Box::new(CargoCallbacks)) + .parse_callbacks(Box::new(bindgen::CargoCallbacks::new())) .allowlist_type("WalProposer") .allowlist_type("WalProposerConfig") .allowlist_type("walproposer_api") @@ -105,7 +113,12 @@ fn main() -> anyhow::Result<()> { .allowlist_var("WL_SOCKET_MASK") .clang_arg("-DWALPROPOSER_LIB") .clang_arg(format!("-I{pgxn_neon}")) - .clang_arg(format!("-I{inc_server_path}")) + .clang_arg(format!("-I{inc_server_path}")); + + for name in unwind_abi_functions { + builder = builder.override_abi(bindgen::Abi::CUnwind, name); + } + let bindings = builder // Finish the builder and generate the bindings. .generate() // Unwrap the Result and panic on failure. 
diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs index bbc3663402..2fbea3fe45 100644 --- a/libs/walproposer/src/api_bindings.rs +++ b/libs/walproposer/src/api_bindings.rs @@ -33,7 +33,7 @@ extern "C" fn get_shmem_state(wp: *mut WalProposer) -> *mut WalproposerShmemStat } } -extern "C" fn start_streaming(wp: *mut WalProposer, startpos: XLogRecPtr) { +extern "C-unwind" fn start_streaming(wp: *mut WalProposer, startpos: XLogRecPtr) { unsafe { let callback_data = (*(*wp).config).callback_data; let api = callback_data as *mut Box; @@ -187,7 +187,7 @@ extern "C" fn conn_blocking_write( } } -extern "C" fn recovery_download(wp: *mut WalProposer, sk: *mut Safekeeper) -> bool { +extern "C-unwind" fn recovery_download(wp: *mut WalProposer, sk: *mut Safekeeper) -> bool { unsafe { let callback_data = (*(*(*sk).wp).config).callback_data; let api = callback_data as *mut Box; @@ -272,7 +272,7 @@ extern "C" fn rm_safekeeper_event_set(sk: *mut Safekeeper) { } } -extern "C" fn wait_event_set( +extern "C-unwind" fn wait_event_set( wp: *mut WalProposer, timeout: ::std::os::raw::c_long, event_sk: *mut *mut Safekeeper, @@ -324,7 +324,7 @@ extern "C" fn get_redo_start_lsn(wp: *mut WalProposer) -> XLogRecPtr { } } -extern "C" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) { +extern "C-unwind" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) { unsafe { let callback_data = (*(*wp).config).callback_data; let api = callback_data as *mut Box; @@ -340,7 +340,7 @@ extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, sk: *mut Safekee } } -extern "C" fn log_internal( +extern "C-unwind" fn log_internal( wp: *mut WalProposer, level: ::std::os::raw::c_int, line: *const ::std::os::raw::c_char, diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index d9725ad756..9fbe2f0da5 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -548,7 +548,7 @@ pub(crate) async fn download_initdb_tar_zst( cancel, ) .await - .map_err(|e| { + .inspect_err(|_e| { // Do a best-effort attempt at deleting the temporary file upon encountering an error. // We don't have async here nor do we want to pile on any extra errors. if let Err(e) = std::fs::remove_file(&temp_path) { @@ -556,7 +556,6 @@ pub(crate) async fn download_initdb_tar_zst( warn!("error deleting temporary file {temp_path}: {e}"); } } - e })?; Ok((temp_path, file)) diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 33eda72e65..b004bf4ecf 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -38,10 +38,7 @@ impl Api { locks: &'static ApiLocks, wake_compute_endpoint_rate_limiter: Arc, ) -> Self { - let jwt = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") { - Ok(v) => v, - Err(_) => String::new(), - }; + let jwt = std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN").unwrap_or_default(); Self { endpoint, caches, diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 368b8d300a..e78c4d6790 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.80.1" +channel = "1.81.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. 
# https://rust-lang.github.io/rustup/concepts/profiles.html diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 90b1604adb..6d677f405a 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -758,9 +758,8 @@ impl ReplyReader { // pq_sendint32(&reply_message, xmin); // pq_sendint32(&reply_message, xmin_epoch); // So it is two big endian 32-bit words in low endian order! - hs_feedback.xmin = (hs_feedback.xmin >> 32) | (hs_feedback.xmin << 32); - hs_feedback.catalog_xmin = - (hs_feedback.catalog_xmin >> 32) | (hs_feedback.catalog_xmin << 32); + hs_feedback.xmin = hs_feedback.xmin.rotate_left(32); + hs_feedback.catalog_xmin = hs_feedback.catalog_xmin.rotate_left(32); self.ws_guard .walsenders .record_hs_feedback(self.ws_guard.id, &hs_feedback); diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 20693ad63d..3d2fa8c214 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -47,7 +47,8 @@ hex = { version = "0.4", features = ["serde"] } hmac = { version = "0.12", default-features = false, features = ["reset"] } hyper = { version = "0.14", features = ["full"] } indexmap = { version = "1", default-features = false, features = ["std"] } -itertools = { version = "0.10" } +itertools-5ef9efb8ec2df382 = { package = "itertools", version = "0.12", default-features = false, features = ["use_std"] } +itertools-93f6ce9d446188ac = { package = "itertools", version = "0.10" } lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] } libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } @@ -101,7 +102,8 @@ either = { version = "1" } getrandom = { version = "0.2", default-features = false, features = ["std"] } hashbrown = { version = "0.14", features = ["raw"] } indexmap = { version = "1", default-features = false, features = ["std"] } -itertools = { version = "0.10" } +itertools-5ef9efb8ec2df382 = { package = "itertools", version = "0.12", default-features = false, features = ["use_std"] } +itertools-93f6ce9d446188ac = { package = "itertools", version = "0.10" } lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] } libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } From e86fef05ddbc276170ec29d035d86d03e3ad4ec2 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 6 Sep 2024 13:11:17 +0100 Subject: [PATCH 1570/1571] storcon: track preferred AZ for each tenant shard (#8937) ## Problem We want to do AZ aware scheduling, but don't have enough metadata. ## Summary of changes Introduce a `preferred_az_id` concept for each managed tenant shard. In a future PR, the scheduler will use this as a soft preference. The idea is to try and keep the shard attachments within the same AZ. Under the assumption that the compute was placed in the correct AZ, this reduces the chances of cross AZ trafic from between compute and PS. In terms of code changes we: 1. Add a new nullable `preferred_az_id` column to the `tenant_shards` table. Also include an in-memory counterpart. 2. Populate the preferred az on tenant creation and shard splits. 3. Add an endpoint which allows to bulk-set preferred AZs. (3) gives us the migration path. I'll write a script which queries the cplane db in the region and sets the preferred az of all shards with an active compute to the AZ of said compute. 
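(As a rough illustration of what such a script sends: the new endpoint takes a flat map of tenant shard id to availability zone and answers with the shards it actually updated. The sketch below assumes the `reqwest` crate with the "blocking" and "json" features plus `serde_json`, and uses placeholder URL, token, and shard id; none of it is part of this patch.)

use std::collections::HashMap;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Flat map of tenant shard id -> preferred AZ, matching the flattened
    // ShardsPreferredAzsRequest body. The key here is a placeholder; use the
    // shard id string as reported by the tenant describe API.
    let body = HashMap::from([("tenant-shard-id-goes-here", "eu-central-1a")]);

    let resp: serde_json::Value = reqwest::blocking::Client::new()
        .put("http://127.0.0.1:1234/control/v1/preferred_azs")
        .bearer_auth("admin-scope-token") // the endpoint requires admin scope
        .json(&body)
        .send()?
        .error_for_status()?
        .json()?;

    // The response lists the shards that were updated both in the DB and in memory.
    println!("updated: {}", resp["updated"]);
    Ok(())
}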
For shards without an active compute, I'll use the AZ of the currently attached pageserver since this is what cplane uses now to schedule computes. --- libs/pageserver_api/src/controller_api.rs | 15 +- .../down.sql | 1 + .../up.sql | 1 + storage_controller/src/http.rs | 21 +- storage_controller/src/persistence.rs | 33 ++ storage_controller/src/schema.rs | 1 + storage_controller/src/service.rs | 327 +++++++++++++----- storage_controller/src/tenant_shard.rs | 15 + test_runner/fixtures/neon_fixtures.py | 13 +- .../regress/test_storage_controller.py | 52 +++ 10 files changed, 384 insertions(+), 95 deletions(-) create mode 100644 storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/down.sql create mode 100644 storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/up.sql diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 6fb5a9a139..94104af002 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -1,4 +1,4 @@ -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use std::str::FromStr; use std::time::{Duration, Instant}; @@ -74,6 +74,17 @@ pub struct TenantPolicyRequest { pub scheduling: Option, } +#[derive(Serialize, Deserialize)] +pub struct ShardsPreferredAzsRequest { + #[serde(flatten)] + pub preferred_az_ids: HashMap, +} + +#[derive(Serialize, Deserialize)] +pub struct ShardsPreferredAzsResponse { + pub updated: Vec, +} + #[derive(Serialize, Deserialize, Debug)] pub struct TenantLocateResponseShard { pub shard_id: TenantShardId, @@ -132,6 +143,8 @@ pub struct TenantDescribeResponseShard { pub is_splitting: bool, pub scheduling_policy: ShardSchedulingPolicy, + + pub preferred_az_id: Option, } /// Explicitly migrating a particular shard is a low level operation diff --git a/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/down.sql b/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/down.sql new file mode 100644 index 0000000000..127972a2e4 --- /dev/null +++ b/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/down.sql @@ -0,0 +1 @@ +ALTER TABLE tenant_shards DROP preferred_az_id; diff --git a/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/up.sql b/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/up.sql new file mode 100644 index 0000000000..641a54feb2 --- /dev/null +++ b/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/up.sql @@ -0,0 +1 @@ +ALTER TABLE tenant_shards ADD preferred_az_id VARCHAR; diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 32882c201a..5d4d0460be 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -14,7 +14,7 @@ use metrics::{BuildInfo, NeonMetrics}; use pageserver_api::controller_api::{ MetadataHealthListOutdatedRequest, MetadataHealthListOutdatedResponse, MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse, - TenantCreateRequest, + ShardsPreferredAzsRequest, TenantCreateRequest, }; use pageserver_api::models::{ TenantConfigRequest, TenantLocationConfigRequest, TenantShardSplitRequest, @@ -688,6 +688,18 @@ async fn handle_tenant_update_policy(mut req: Request) -> Result) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let azs_req = json_request::(&mut req).await?; + let state = get_state(&req); + + json_response( + StatusCode::OK, + 
state.service.update_shards_preferred_azs(azs_req).await?, + ) +} + async fn handle_step_down(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; @@ -1174,6 +1186,13 @@ pub fn make_router( RequestName("control_v1_tenant_policy"), ) }) + .put("/control/v1/preferred_azs", |r| { + named_request_span( + r, + handle_update_preferred_azs, + RequestName("control_v1_preferred_azs"), + ) + }) .put("/control/v1/step_down", |r| { named_request_span(r, handle_step_down, RequestName("control_v1_step_down")) }) diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 6df05ebd13..1dc1040d96 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -105,6 +105,7 @@ pub(crate) enum DatabaseOperation { ListMetadataHealthOutdated, GetLeader, UpdateLeader, + SetPreferredAzs, } #[must_use] @@ -664,6 +665,33 @@ impl Persistence { Ok(()) } + pub(crate) async fn set_tenant_shard_preferred_azs( + &self, + preferred_azs: Vec<(TenantShardId, String)>, + ) -> DatabaseResult> { + use crate::schema::tenant_shards::dsl::*; + + self.with_measured_conn(DatabaseOperation::SetPreferredAzs, move |conn| { + let mut shards_updated = Vec::default(); + + for (tenant_shard_id, preferred_az) in preferred_azs.iter() { + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .set(preferred_az_id.eq(preferred_az)) + .execute(conn)?; + + if updated == 1 { + shards_updated.push((*tenant_shard_id, preferred_az.clone())); + } + } + + Ok(shards_updated) + }) + .await + } + pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> { use crate::schema::tenant_shards::dsl::*; self.with_measured_conn(DatabaseOperation::Detach, move |conn| { @@ -1050,6 +1078,11 @@ pub(crate) struct TenantShardPersistence { pub(crate) config: String, #[serde(default)] pub(crate) scheduling_policy: String, + + // Hint that we should attempt to schedule this tenant shard the given + // availability zone in order to minimise the chances of cross-AZ communication + // with compute. + pub(crate) preferred_az_id: Option, } impl TenantShardPersistence { diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs index 93ab774b5f..1717a9369d 100644 --- a/storage_controller/src/schema.rs +++ b/storage_controller/src/schema.rs @@ -41,6 +41,7 @@ diesel::table! 
{ splitting -> Int2, config -> Text, scheduling_policy -> Varchar, + preferred_az_id -> Nullable, } } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 2911cd5ac4..324f864291 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -25,7 +25,7 @@ use crate::{ ShardGenerationState, TenantFilter, }, reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder}, - scheduler::{MaySchedule, ScheduleContext, ScheduleMode}, + scheduler::{MaySchedule, ScheduleContext, ScheduleError, ScheduleMode}, tenant_shard::{ MigrateAttachment, ReconcileNeeded, ReconcilerStatus, ScheduleOptimization, ScheduleOptimizationAction, @@ -41,10 +41,11 @@ use itertools::Itertools; use pageserver_api::{ controller_api::{ MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, NodeRegisterRequest, - NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, TenantCreateRequest, - TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse, - TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest, - TenantShardMigrateRequest, TenantShardMigrateResponse, + NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, ShardsPreferredAzsRequest, + ShardsPreferredAzsResponse, TenantCreateRequest, TenantCreateResponse, + TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard, + TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest, + TenantShardMigrateResponse, }, models::{ SecondaryProgress, TenantConfigRequest, TimelineArchivalConfigRequest, @@ -353,6 +354,12 @@ impl From for ApiError { } } +enum InitialShardScheduleOutcome { + Scheduled(TenantCreateResponseShard), + NotScheduled, + ShardScheduleError(ScheduleError), +} + pub struct Service { inner: Arc>, config: Config, @@ -1452,6 +1459,7 @@ impl Service { splitting: SplitState::default(), scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default()) .unwrap(), + preferred_az_id: None, }; match self.persistence.insert_tenant_shards(vec![tsp]).await { @@ -2023,6 +2031,7 @@ impl Service { splitting: SplitState::default(), scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default()) .unwrap(), + preferred_az_id: None, }) .collect(); @@ -2046,99 +2055,87 @@ impl Service { }; let mut schedule_context = ScheduleContext::default(); + let mut schedule_error = None; + let mut response_shards = Vec::new(); + for tenant_shard_id in create_ids { + tracing::info!("Creating shard {tenant_shard_id}..."); - let (waiters, response_shards) = { + let outcome = self + .do_initial_shard_scheduling( + tenant_shard_id, + initial_generation, + &create_req.shard_parameters, + create_req.config.clone(), + placement_policy.clone(), + &mut schedule_context, + ) + .await; + + match outcome { + InitialShardScheduleOutcome::Scheduled(resp) => response_shards.push(resp), + InitialShardScheduleOutcome::NotScheduled => {} + InitialShardScheduleOutcome::ShardScheduleError(err) => { + schedule_error = Some(err); + } + } + } + + let preferred_azs = { + let locked = self.inner.read().unwrap(); + response_shards + .iter() + .filter_map(|resp| { + let az_id = locked + .nodes + .get(&resp.node_id) + .map(|n| n.get_availability_zone_id().to_string())?; + + Some((resp.shard_id, az_id)) + }) + .collect::>() + }; + + // Note that we persist the preferred AZ for the new shards separately. 
+ // In theory, we could "peek" the scheduler to determine where the shard will + // land, but the subsequent "real" call into the scheduler might select a different + // node. Hence, we do this awkward update to keep things consistent. + let updated = self + .persistence + .set_tenant_shard_preferred_azs(preferred_azs) + .await + .map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!( + "Failed to persist preferred az ids: {err}" + )) + })?; + + { let mut locked = self.inner.write().unwrap(); - let (nodes, tenants, scheduler) = locked.parts_mut(); - - let mut response_shards = Vec::new(); - let mut schcedule_error = None; - - for tenant_shard_id in create_ids { - tracing::info!("Creating shard {tenant_shard_id}..."); - - use std::collections::btree_map::Entry; - match tenants.entry(tenant_shard_id) { - Entry::Occupied(mut entry) => { - tracing::info!( - "Tenant shard {tenant_shard_id} already exists while creating" - ); - - // TODO: schedule() should take an anti-affinity expression that pushes - // attached and secondary locations (independently) away frorm those - // pageservers also holding a shard for this tenant. - - entry - .get_mut() - .schedule(scheduler, &mut schedule_context) - .map_err(|e| { - ApiError::Conflict(format!( - "Failed to schedule shard {tenant_shard_id}: {e}" - )) - })?; - - if let Some(node_id) = entry.get().intent.get_attached() { - let generation = entry - .get() - .generation - .expect("Generation is set when in attached mode"); - response_shards.push(TenantCreateResponseShard { - shard_id: tenant_shard_id, - node_id: *node_id, - generation: generation.into().unwrap(), - }); - } - - continue; - } - Entry::Vacant(entry) => { - let state = entry.insert(TenantShard::new( - tenant_shard_id, - ShardIdentity::from_params( - tenant_shard_id.shard_number, - &create_req.shard_parameters, - ), - placement_policy.clone(), - )); - - state.generation = initial_generation; - state.config = create_req.config.clone(); - if let Err(e) = state.schedule(scheduler, &mut schedule_context) { - schcedule_error = Some(e); - } - - // Only include shards in result if we are attaching: the purpose - // of the response is to tell the caller where the shards are attached. - if let Some(node_id) = state.intent.get_attached() { - let generation = state - .generation - .expect("Generation is set when in attached mode"); - response_shards.push(TenantCreateResponseShard { - shard_id: tenant_shard_id, - node_id: *node_id, - generation: generation.into().unwrap(), - }); - } - } - }; + for (tid, az_id) in updated { + if let Some(shard) = locked.tenants.get_mut(&tid) { + shard.set_preferred_az(az_id); + } } + } - // If we failed to schedule shards, then they are still created in the controller, - // but we return an error to the requester to avoid a silent failure when someone - // tries to e.g. create a tenant whose placement policy requires more nodes than - // are present in the system. We do this here rather than in the above loop, to - // avoid situations where we only create a subset of shards in the tenant. - if let Some(e) = schcedule_error { - return Err(ApiError::Conflict(format!( - "Failed to schedule shard(s): {e}" - ))); - } + // If we failed to schedule shards, then they are still created in the controller, + // but we return an error to the requester to avoid a silent failure when someone + // tries to e.g. create a tenant whose placement policy requires more nodes than + // are present in the system. 
We do this here rather than in the above loop, to + // avoid situations where we only create a subset of shards in the tenant. + if let Some(e) = schedule_error { + return Err(ApiError::Conflict(format!( + "Failed to schedule shard(s): {e}" + ))); + } - let waiters = tenants + let waiters = { + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, _scheduler) = locked.parts_mut(); + tenants .range_mut(TenantShardId::tenant_range(tenant_id)) .filter_map(|(_shard_id, shard)| self.maybe_reconcile_shard(shard, nodes)) - .collect::>(); - (waiters, response_shards) + .collect::>() }; Ok(( @@ -2149,6 +2146,78 @@ impl Service { )) } + /// Helper for tenant creation that does the scheduling for an individual shard. Covers both the + /// case of a new tenant and a pre-existing one. + async fn do_initial_shard_scheduling( + &self, + tenant_shard_id: TenantShardId, + initial_generation: Option, + shard_params: &ShardParameters, + config: TenantConfig, + placement_policy: PlacementPolicy, + schedule_context: &mut ScheduleContext, + ) -> InitialShardScheduleOutcome { + let mut locked = self.inner.write().unwrap(); + let (_nodes, tenants, scheduler) = locked.parts_mut(); + + use std::collections::btree_map::Entry; + match tenants.entry(tenant_shard_id) { + Entry::Occupied(mut entry) => { + tracing::info!("Tenant shard {tenant_shard_id} already exists while creating"); + + // TODO: schedule() should take an anti-affinity expression that pushes + // attached and secondary locations (independently) away frorm those + // pageservers also holding a shard for this tenant. + + if let Err(err) = entry.get_mut().schedule(scheduler, schedule_context) { + return InitialShardScheduleOutcome::ShardScheduleError(err); + } + + if let Some(node_id) = entry.get().intent.get_attached() { + let generation = entry + .get() + .generation + .expect("Generation is set when in attached mode"); + InitialShardScheduleOutcome::Scheduled(TenantCreateResponseShard { + shard_id: tenant_shard_id, + node_id: *node_id, + generation: generation.into().unwrap(), + }) + } else { + InitialShardScheduleOutcome::NotScheduled + } + } + Entry::Vacant(entry) => { + let state = entry.insert(TenantShard::new( + tenant_shard_id, + ShardIdentity::from_params(tenant_shard_id.shard_number, shard_params), + placement_policy, + )); + + state.generation = initial_generation; + state.config = config; + if let Err(e) = state.schedule(scheduler, schedule_context) { + return InitialShardScheduleOutcome::ShardScheduleError(e); + } + + // Only include shards in result if we are attaching: the purpose + // of the response is to tell the caller where the shards are attached. + if let Some(node_id) = state.intent.get_attached() { + let generation = state + .generation + .expect("Generation is set when in attached mode"); + InitialShardScheduleOutcome::Scheduled(TenantCreateResponseShard { + shard_id: tenant_shard_id, + node_id: *node_id, + generation: generation.into().unwrap(), + }) + } else { + InitialShardScheduleOutcome::NotScheduled + } + } + } + } + /// Helper for functions that reconcile a number of shards, and would like to do a timeout-bounded /// wait for reconciliation to complete before responding. 
async fn await_waiters( @@ -3511,6 +3580,7 @@ impl Service { is_pending_compute_notification: shard.pending_compute_notification, is_splitting: matches!(shard.splitting, SplitState::Splitting), scheduling_policy: *shard.get_scheduling_policy(), + preferred_az_id: shard.preferred_az().map(ToString::to_string), }) } @@ -4214,9 +4284,10 @@ impl Service { config: serde_json::to_string(&config).unwrap(), splitting: SplitState::Splitting, - // Scheduling policies do not carry through to children + // Scheduling policies and preferred AZ do not carry through to children scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default()) .unwrap(), + preferred_az_id: None, }); } @@ -4336,6 +4407,47 @@ impl Service { let (response, child_locations, waiters) = self.tenant_shard_split_commit_inmem(tenant_id, new_shard_count, new_stripe_size); + // Now that we have scheduled the child shards, attempt to set their preferred AZ + // to that of the pageserver they've been attached on. + let preferred_azs = { + let locked = self.inner.read().unwrap(); + child_locations + .iter() + .filter_map(|(tid, node_id, _stripe_size)| { + let az_id = locked + .nodes + .get(node_id) + .map(|n| n.get_availability_zone_id().to_string())?; + + Some((*tid, az_id)) + }) + .collect::>() + }; + + let updated = self + .persistence + .set_tenant_shard_preferred_azs(preferred_azs) + .await + .map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!( + "Failed to persist preferred az ids: {err}" + )) + }); + + match updated { + Ok(updated) => { + let mut locked = self.inner.write().unwrap(); + for (tid, az_id) in updated { + if let Some(shard) = locked.tenants.get_mut(&tid) { + shard.set_preferred_az(az_id); + } + } + } + Err(err) => { + tracing::warn!("Failed to persist preferred AZs after split: {err}"); + } + } + // Send compute notifications for all the new shards let mut failed_notifications = Vec::new(); for (child_id, child_ps, stripe_size) in child_locations { @@ -6497,4 +6609,35 @@ impl Service { ) -> Result<(), DatabaseError> { self.persistence.safekeeper_upsert(record).await } + + pub(crate) async fn update_shards_preferred_azs( + &self, + req: ShardsPreferredAzsRequest, + ) -> Result { + let preferred_azs = req.preferred_az_ids.into_iter().collect::>(); + let updated = self + .persistence + .set_tenant_shard_preferred_azs(preferred_azs) + .await + .map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!( + "Failed to persist preferred AZs: {err}" + )) + })?; + + let mut updated_in_mem_and_db = Vec::default(); + + let mut locked = self.inner.write().unwrap(); + for (tid, az_id) in updated { + let shard = locked.tenants.get_mut(&tid); + if let Some(shard) = shard { + shard.set_preferred_az(az_id); + updated_in_mem_and_db.push(tid); + } + } + + Ok(ShardsPreferredAzsResponse { + updated: updated_in_mem_and_db, + }) + } } diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 30723a3b36..cdb0633e2b 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -140,6 +140,10 @@ pub(crate) struct TenantShard { // Support/debug tool: if something is going wrong or flapping with scheduling, this may // be set to a non-active state to avoid making changes while the issue is fixed. scheduling_policy: ShardSchedulingPolicy, + + // We should attempt to schedule this shard in the provided AZ to + // decrease chances of cross-AZ compute. 
+ preferred_az_id: Option, } #[derive(Default, Clone, Debug, Serialize)] @@ -463,6 +467,7 @@ impl TenantShard { last_error: Arc::default(), pending_compute_notification: false, scheduling_policy: ShardSchedulingPolicy::default(), + preferred_az_id: None, } } @@ -1297,6 +1302,7 @@ impl TenantShard { pending_compute_notification: false, delayed_reconcile: false, scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(), + preferred_az_id: tsp.preferred_az_id, }) } @@ -1312,8 +1318,17 @@ impl TenantShard { config: serde_json::to_string(&self.config).unwrap(), splitting: SplitState::default(), scheduling_policy: serde_json::to_string(&self.scheduling_policy).unwrap(), + preferred_az_id: self.preferred_az_id.clone(), } } + + pub(crate) fn preferred_az(&self) -> Option<&str> { + self.preferred_az_id.as_deref() + } + + pub(crate) fn set_preferred_az(&mut self, preferred_az_id: String) { + self.preferred_az_id = Some(preferred_az_id); + } } #[cfg(test)] diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 0c692ceb69..18fbbde637 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2560,7 +2560,7 @@ class NeonStorageController(MetricsGetter, LogUtils): def tenant_describe(self, tenant_id: TenantId): """ - :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr: str, "listen_http_port: int} + :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr: str, "listen_http_port: int, preferred_az_id: str} """ response = self.request( "GET", @@ -2886,6 +2886,17 @@ class NeonStorageController(MetricsGetter, LogUtils): return None raise e + def set_preferred_azs(self, preferred_azs: dict[TenantShardId, str]) -> list[TenantShardId]: + response = self.request( + "PUT", + f"{self.api}/control/v1/preferred_azs", + headers=self.headers(TokenScope.ADMIN), + json={str(tid): az for tid, az in preferred_azs.items()}, + ) + + response.raise_for_status() + return [TenantShardId.parse(tid) for tid in response.json()["updated"]] + def __enter__(self) -> "NeonStorageController": return self diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 8da42294b0..92cd74eba5 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -2512,3 +2512,55 @@ def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool: del d[key] return compared[0] == compared[1] + + +@run_only_on_default_postgres("this is like a 'unit test' against storcon db") +def test_shard_preferred_azs(neon_env_builder: NeonEnvBuilder): + def assign_az(ps_cfg): + az = f"az-{ps_cfg['id']}" + ps_cfg["availability_zone"] = az + + neon_env_builder.pageserver_config_override = assign_az + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + env.start() + + tids = [TenantId.generate() for _ in range(0, 3)] + for tid in tids: + env.storage_controller.tenant_create(tid) + + shards = env.storage_controller.tenant_describe(tid)["shards"] + assert len(shards) == 1 + attached_to = shards[0]["node_attached"] + expected_az = env.get_pageserver(attached_to).az_id + + assert shards[0]["preferred_az_id"] == expected_az + + updated = env.storage_controller.set_preferred_azs( + {TenantShardId(tid, 0, 0): "foo" for tid in tids} + ) + + assert set(updated) == set([TenantShardId(tid, 0, 0) for tid in tids]) + + for tid 
in tids: + shards = env.storage_controller.tenant_describe(tid)["shards"] + assert len(shards) == 1 + assert shards[0]["preferred_az_id"] == "foo" + + # Generate a layer to avoid shard split handling on ps from tripping + # up on debug assert. + timeline_id = TimelineId.generate() + env.neon_cli.create_timeline("bar", tids[0], timeline_id) + + workload = Workload(env, tids[0], timeline_id, branch_name="bar") + workload.init() + workload.write_rows(256) + workload.validate() + + env.storage_controller.tenant_shard_split(tids[0], shard_count=2) + shards = env.storage_controller.tenant_describe(tids[0])["shards"] + assert len(shards) == 2 + for shard in shards: + attached_to = shard["node_attached"] + expected_az = env.get_pageserver(attached_to).az_id + assert shard["preferred_az_id"] == expected_az From cbcd4058edb7a2c2bb3bfe1a6fc1ffb0d820b870 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 6 Sep 2024 14:33:52 +0200 Subject: [PATCH 1571/1571] Fix 1.82 clippy lint too_long_first_doc_paragraph (#8941) Addresses the 1.82 beta clippy lint `too_long_first_doc_paragraph` by adding newlines to the first sentence if it is short enough, and making a short first sentence if there is the need. --- compute_tools/src/pg_helpers.rs | 7 ++++--- libs/metrics/src/lib.rs | 1 + libs/pageserver_api/src/controller_api.rs | 2 ++ libs/pageserver_api/src/models.rs | 10 +++++++--- libs/postgres_backend/src/lib.rs | 6 ++++-- libs/postgres_connection/src/lib.rs | 1 + libs/remote_storage/src/lib.rs | 6 +++++- libs/tenant_size_model/src/lib.rs | 7 ++++--- libs/utils/src/circuit_breaker.rs | 6 ++++-- libs/utils/src/id.rs | 6 ++++-- libs/utils/src/lock_file.rs | 4 +++- libs/utils/src/pageserver_feedback.rs | 1 + libs/utils/src/poison.rs | 2 ++ libs/utils/src/shard.rs | 9 +++++---- libs/utils/src/simple_rcu.rs | 7 +++---- libs/utils/src/sync/heavier_once_cell.rs | 4 +++- libs/utils/src/vec_map.rs | 1 + libs/utils/src/yielding_loop.rs | 7 ++++--- pageserver/src/config.rs | 2 ++ pageserver/src/context.rs | 10 ++++++---- pageserver/src/pgdatadir_mapping.rs | 8 +++++--- pageserver/src/tenant.rs | 9 +++++---- pageserver/src/tenant/metadata.rs | 9 +++++---- pageserver/src/tenant/mgr.rs | 12 +++++++----- pageserver/src/tenant/remote_timeline_client.rs | 2 ++ .../src/tenant/remote_timeline_client/index.rs | 1 + pageserver/src/tenant/storage_layer.rs | 9 +++++---- pageserver/src/tenant/storage_layer/delta_layer.rs | 9 +++++---- pageserver/src/tenant/storage_layer/image_layer.rs | 8 +++++--- pageserver/src/tenant/storage_layer/layer_desc.rs | 6 ++++-- pageserver/src/tenant/storage_layer/layer_name.rs | 5 +++-- .../src/tenant/storage_layer/merge_iterator.rs | 8 +++++--- .../src/tenant/storage_layer/split_writer.rs | 14 ++++++++------ pageserver/src/tenant/vectored_blob_io.rs | 6 ++++-- pageserver/src/virtual_file.rs | 5 +++-- pageserver/src/walredo.rs | 11 +++++------ proxy/src/stream.rs | 1 + safekeeper/src/pull_timeline.rs | 1 + safekeeper/src/receive_wal.rs | 6 ++++-- safekeeper/src/state.rs | 8 +++++--- safekeeper/src/timeline.rs | 1 + safekeeper/src/timeline_eviction.rs | 8 +++++--- safekeeper/src/timeline_guard.rs | 4 +++- safekeeper/src/timeline_manager.rs | 1 + safekeeper/src/timelines_set.rs | 3 ++- safekeeper/src/wal_backup_partial.rs | 6 ++++-- safekeeper/src/wal_service.rs | 1 + storage_controller/src/service.rs | 4 +++- storage_scrubber/src/garbage.rs | 7 ++++--- storage_scrubber/src/metadata_stream.rs | 4 +++- storage_scrubber/src/pageserver_physical_gc.rs | 7 ++++--- 51 files changed, 180 
insertions(+), 103 deletions(-) diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 863fa9468f..b2dc265864 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -22,9 +22,10 @@ use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role}; const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds -/// Escape a string for including it in a SQL literal. Wrapping the result -/// with `E'{}'` or `'{}'` is not required, as it returns a ready-to-use -/// SQL string literal, e.g. `'db'''` or `E'db\\'`. +/// Escape a string for including it in a SQL literal. +/// +/// Wrapping the result with `E'{}'` or `'{}'` is not required, +/// as it returns a ready-to-use SQL string literal, e.g. `'db'''` or `E'db\\'`. /// See /// for the original implementation. pub fn escape_literal(s: &str) -> String { diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index df000cd0fb..cd4526c089 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -68,6 +68,7 @@ macro_rules! register_uint_gauge { static INTERNAL_REGISTRY: Lazy = Lazy::new(Registry::new); /// Register a collector in the internal registry. MUST be called before the first call to `gather()`. +/// /// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector /// while holding the lock. pub fn register_internal(c: Box) -> prometheus::Result<()> { diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 94104af002..5c8dcbf571 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -147,6 +147,8 @@ pub struct TenantDescribeResponseShard { pub preferred_az_id: Option, } +/// Migration request for a given tenant shard to a given node. +/// /// Explicitly migrating a particular shard is a low level operation /// TODO: higher level "Reschedule tenant" operation where the request /// specifies some constraints, e.g. asking it to get off particular node(s) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index d13d04eb1b..ffe79c8350 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -305,8 +305,10 @@ pub struct TenantConfig { pub lsn_lease_length_for_ts: Option, } -/// The policy for the aux file storage. It can be switched through `switch_aux_file_policy` -/// tenant config. When the first aux file written, the policy will be persisted in the +/// The policy for the aux file storage. +/// +/// It can be switched through `switch_aux_file_policy` tenant config. +/// When the first aux file written, the policy will be persisted in the /// `index_part.json` file and has a limited migration path. /// /// Currently, we only allow the following migration path: @@ -896,7 +898,9 @@ pub struct WalRedoManagerStatus { pub process: Option, } -/// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating +/// The progress of a secondary tenant. +/// +/// It is mostly useful when doing a long running download: e.g. initiating /// a download job, timing out while waiting for it to run, and then inspecting this status to understand /// what's happening. 
#[derive(Default, Debug, Serialize, Deserialize, Clone)] diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 7c7c6535b3..600f1d728c 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -69,8 +69,10 @@ impl QueryError { } /// Returns true if the given error is a normal consequence of a network issue, -/// or the client closing the connection. These errors can happen during normal -/// operations, and don't indicate a bug in our code. +/// or the client closing the connection. +/// +/// These errors can happen during normal operations, +/// and don't indicate a bug in our code. pub fn is_expected_io_error(e: &io::Error) -> bool { use io::ErrorKind::*; matches!( diff --git a/libs/postgres_connection/src/lib.rs b/libs/postgres_connection/src/lib.rs index 9f57f3d507..ddf9f7b610 100644 --- a/libs/postgres_connection/src/lib.rs +++ b/libs/postgres_connection/src/lib.rs @@ -7,6 +7,7 @@ use std::fmt; use url::Host; /// Parses a string of format either `host:port` or `host` into a corresponding pair. +/// /// The `host` part should be a correct `url::Host`, while `port` (if present) should be /// a valid decimal u16 of digits only. pub fn parse_host_port>(host_port: S) -> Result<(Host, Option), anyhow::Error> { diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index cc1d3e0ae4..b5b69c9faf 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -45,6 +45,8 @@ pub use azure_core::Etag; pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel}; +/// Default concurrency limit for S3 operations +/// /// Currently, sync happens with AWS S3, that has two limits on requests per second: /// ~200 RPS for IAM services /// @@ -300,7 +302,9 @@ pub trait RemoteStorage: Send + Sync + 'static { ) -> Result<(), TimeTravelError>; } -/// DownloadStream is sensitive to the timeout and cancellation used with the original +/// Data part of an ongoing [`Download`]. +/// +/// `DownloadStream` is sensitive to the timeout and cancellation used with the original /// [`RemoteStorage::download`] request. The type yields `std::io::Result` to be compatible /// with `tokio::io::copy_buf`. // This has 'static because safekeepers do not use cancellation tokens (yet) diff --git a/libs/tenant_size_model/src/lib.rs b/libs/tenant_size_model/src/lib.rs index a3e12cf0e3..974a498404 100644 --- a/libs/tenant_size_model/src/lib.rs +++ b/libs/tenant_size_model/src/lib.rs @@ -5,9 +5,10 @@ mod calculation; pub mod svg; -/// StorageModel is the input to the synthetic size calculation. It represents -/// a tree of timelines, with just the information that's needed for the -/// calculation. This doesn't track timeline names or where each timeline +/// StorageModel is the input to the synthetic size calculation. +/// +/// It represents a tree of timelines, with just the information that's needed +/// for the calculation. This doesn't track timeline names or where each timeline /// begins and ends, for example. Instead, it consists of "points of interest" /// on the timelines. 
A point of interest could be the timeline start or end point, /// the oldest point on a timeline that needs to be retained because of PITR diff --git a/libs/utils/src/circuit_breaker.rs b/libs/utils/src/circuit_breaker.rs index 720ea39d4f..e1ddfd8650 100644 --- a/libs/utils/src/circuit_breaker.rs +++ b/libs/utils/src/circuit_breaker.rs @@ -5,8 +5,10 @@ use std::{ use metrics::IntCounter; -/// Circuit breakers are for operations that are expensive and fallible: if they fail repeatedly, -/// we will stop attempting them for some period of time, to avoid denial-of-service from retries, and +/// Circuit breakers are for operations that are expensive and fallible. +/// +/// If a circuit breaker fails repeatedly, we will stop attempting it for some +/// period of time, to avoid denial-of-service from retries, and /// to mitigate the log spam from repeated failures. pub struct CircuitBreaker { /// An identifier that enables us to log useful errors when a circuit is broken diff --git a/libs/utils/src/id.rs b/libs/utils/src/id.rs index db468e3054..2cda899b15 100644 --- a/libs/utils/src/id.rs +++ b/libs/utils/src/id.rs @@ -249,8 +249,10 @@ macro_rules! id_newtype { }; } -/// Neon timeline IDs are different from PostgreSQL timeline -/// IDs. They serve a similar purpose though: they differentiate +/// Neon timeline ID. +/// +/// They are different from PostgreSQL timeline +/// IDs, but serve a similar purpose: they differentiate /// between different "histories" of the same cluster. However, /// PostgreSQL timeline IDs are a bit cumbersome, because they are only /// 32-bits wide, and they must be in ascending order in any given diff --git a/libs/utils/src/lock_file.rs b/libs/utils/src/lock_file.rs index 59c66ca757..3a2ed3e830 100644 --- a/libs/utils/src/lock_file.rs +++ b/libs/utils/src/lock_file.rs @@ -100,7 +100,9 @@ pub enum LockFileRead { } /// Open & try to lock the lock file at the given `path`, returning a [handle][`LockFileRead`] to -/// inspect its content. It is not an `Err(...)` if the file does not exist or is already locked. +/// inspect its content. +/// +/// It is not an `Err(...)` if the file does not exist or is already locked. /// Check the [`LockFileRead`] variants for details. pub fn read_and_hold_lock_file(path: &Utf8Path) -> anyhow::Result { let res = fs::OpenOptions::new().read(true).open(path); diff --git a/libs/utils/src/pageserver_feedback.rs b/libs/utils/src/pageserver_feedback.rs index 3ddfa44f41..dede65e699 100644 --- a/libs/utils/src/pageserver_feedback.rs +++ b/libs/utils/src/pageserver_feedback.rs @@ -8,6 +8,7 @@ use tracing::{trace, warn}; use crate::lsn::Lsn; /// Feedback pageserver sends to safekeeper and safekeeper resends to compute. +/// /// Serialized in custom flexible key/value format. In replication protocol, it /// is marked with NEON_STATUS_UPDATE_TAG_BYTE to differentiate from postgres /// Standby status update / Hot standby feedback messages. diff --git a/libs/utils/src/poison.rs b/libs/utils/src/poison.rs index 27378c69fc..c3e2fba20c 100644 --- a/libs/utils/src/poison.rs +++ b/libs/utils/src/poison.rs @@ -65,6 +65,8 @@ impl Poison { } } +/// Armed pointer to a [`Poison`]. +/// /// Use [`Self::data`] and [`Self::data_mut`] to access the wrapped state. /// Once modifications are done, use [`Self::disarm`]. 
/// If [`Guard`] gets dropped instead of calling [`Self::disarm`], the state is poisoned diff --git a/libs/utils/src/shard.rs b/libs/utils/src/shard.rs index f6b430657e..d146010b41 100644 --- a/libs/utils/src/shard.rs +++ b/libs/utils/src/shard.rs @@ -13,10 +13,11 @@ pub struct ShardNumber(pub u8); #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] pub struct ShardCount(pub u8); -/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant, -/// when we need to know which shard we're dealing with, but do not need to know the full -/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know -/// the fully qualified TenantShardId. +/// Combination of ShardNumber and ShardCount. +/// +/// For use within the context of a particular tenant, when we need to know which shard we're +/// dealing with, but do not need to know the full ShardIdentity (because we won't be doing +/// any page->shard mapping), and do not need to know the fully qualified TenantShardId. #[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] pub struct ShardIndex { pub shard_number: ShardNumber, diff --git a/libs/utils/src/simple_rcu.rs b/libs/utils/src/simple_rcu.rs index ecc5353be3..01750b2aef 100644 --- a/libs/utils/src/simple_rcu.rs +++ b/libs/utils/src/simple_rcu.rs @@ -49,12 +49,11 @@ use std::sync::{RwLock, RwLockWriteGuard}; use tokio::sync::watch; -/// /// Rcu allows multiple readers to read and hold onto a value without blocking -/// (for very long). Storing to the Rcu updates the value, making new readers -/// immediately see the new value, but it also waits for all current readers to -/// finish. +/// (for very long). /// +/// Storing to the Rcu updates the value, making new readers immediately see +/// the new value, but it also waits for all current readers to finish. pub struct Rcu { inner: RwLock>, } diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs index 1abd3d9861..dc711fb028 100644 --- a/libs/utils/src/sync/heavier_once_cell.rs +++ b/libs/utils/src/sync/heavier_once_cell.rs @@ -5,7 +5,9 @@ use std::sync::{ use tokio::sync::Semaphore; /// Custom design like [`tokio::sync::OnceCell`] but using [`OwnedSemaphorePermit`] instead of -/// `SemaphorePermit`, allowing use of `take` which does not require holding an outer mutex guard +/// `SemaphorePermit`. +/// +/// Allows use of `take` which does not require holding an outer mutex guard /// for the duration of initialization. /// /// Has no unsafe, builds upon [`tokio::sync::Semaphore`] and [`std::sync::Mutex`]. diff --git a/libs/utils/src/vec_map.rs b/libs/utils/src/vec_map.rs index 18b2af14f1..5f0028bacd 100644 --- a/libs/utils/src/vec_map.rs +++ b/libs/utils/src/vec_map.rs @@ -7,6 +7,7 @@ pub enum VecMapOrdering { } /// Ordered map datastructure implemented in a Vec. +/// /// Append only - can only add keys that are larger than the /// current max key. /// Ordering can be adjusted using [`VecMapOrdering`] diff --git a/libs/utils/src/yielding_loop.rs b/libs/utils/src/yielding_loop.rs index 41c4cee45d..68274f0631 100644 --- a/libs/utils/src/yielding_loop.rs +++ b/libs/utils/src/yielding_loop.rs @@ -6,9 +6,10 @@ pub enum YieldingLoopError { Cancelled, } -/// Helper for long synchronous loops, e.g. over all tenants in the system. Periodically -/// yields to avoid blocking the executor, and after resuming checks the provided -/// cancellation token to drop out promptly on shutdown. 
+/// Helper for long synchronous loops, e.g. over all tenants in the system. +/// +/// Periodically yields to avoid blocking the executor, and after resuming +/// checks the provided cancellation token to drop out promptly on shutdown. #[inline(always)] pub async fn yielding_loop( interval: usize, diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 4e68e276d3..29a98855d3 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -180,6 +180,8 @@ pub struct PageServerConf { pub io_buffer_alignment: usize, } +/// Token for authentication to safekeepers +/// /// We do not want to store this in a PageServerConf because the latter may be logged /// and/or serialized at a whim, while the token is secret. Currently this token is the /// same for accessing all tenants/timelines, but may become per-tenant/per-timeline in diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs index 012cb8d96f..7afcf52cf2 100644 --- a/pageserver/src/context.rs +++ b/pageserver/src/context.rs @@ -1,7 +1,9 @@ -//! This module defines `RequestContext`, a structure that we use throughout -//! the pageserver to propagate high-level context from places -//! that _originate_ activity down to the shared code paths at the -//! heart of the pageserver. It's inspired by Golang's `context.Context`. +//! Defines [`RequestContext`]. +//! +//! It is a structure that we use throughout the pageserver to propagate +//! high-level context from places that _originate_ activity down to the +//! shared code paths at the heart of the pageserver. It's inspired by +//! Golang's `context.Context`. //! //! For example, in `Timeline::get(page_nr, lsn)` we need to answer the following questions: //! 1. What high-level activity ([`TaskKind`]) needs this page? diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index d28a214265..808d4b666e 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1021,9 +1021,10 @@ impl Timeline { } /// DatadirModification represents an operation to ingest an atomic set of -/// updates to the repository. It is created by the 'begin_record' -/// function. It is called for each WAL record, so that all the modifications -/// by a one WAL record appear atomic. +/// updates to the repository. +/// +/// It is created by the 'begin_record' function. It is called for each WAL +/// record, so that all the modifications by a one WAL record appear atomic. pub struct DatadirModification<'a> { /// The timeline this modification applies to. You can access this to /// read the state, but note that any pending updates are *not* reflected @@ -2048,6 +2049,7 @@ impl<'a> DatadirModification<'a> { /// This struct facilitates accessing either a committed key from the timeline at a /// specific LSN, or the latest uncommitted key from a pending modification. +/// /// During WAL ingestion, the records from multiple LSNs may be batched in the same /// modification before being flushed to the timeline. Hence, the routines in WalIngest /// need to look up the keys in the modification first before looking them up in the diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index fb30857ddf..fd2520a42e 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1,8 +1,9 @@ +//! Timeline repository implementation that keeps old data in layer files, and +//! the recent changes in ephemeral files. //! -//! Timeline repository implementation that keeps old data in files on disk, and -//! 
the recent changes in memory. See tenant/*_layer.rs files. -//! The functions here are responsible for locating the correct layer for the -//! get/put call, walking back the timeline branching history as needed. +//! See tenant/*_layer.rs files. The functions here are responsible for locating +//! the correct layer for the get/put call, walking back the timeline branching +//! history as needed. //! //! The files are stored in the .neon/tenants//timelines/ //! directory. See docs/pageserver-storage.md for how the files are managed. diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 190316df42..24440d4b35 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -1,7 +1,8 @@ -//! Describes the legacy now hopefully no longer modified per-timeline metadata stored in -//! `index_part.json` managed by [`remote_timeline_client`]. For many tenants and their timelines, -//! this struct and it's original serialization format is still needed because they were written a -//! long time ago. +//! Describes the legacy now hopefully no longer modified per-timeline metadata. +//! +//! It is stored in `index_part.json` managed by [`remote_timeline_client`]. For many tenants and +//! their timelines, this struct and its original serialization format is still needed because +//! they were written a long time ago. //! //! Instead of changing and adding versioning to this, just change [`IndexPart`] with soft json //! versioning. diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 4e6ea0c8f9..2104f41531 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -282,9 +282,10 @@ impl BackgroundPurges { static TENANTS: Lazy> = Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing)); -/// The TenantManager is responsible for storing and mutating the collection of all tenants -/// that this pageserver process has state for. Every Tenant and SecondaryTenant instance -/// lives inside the TenantManager. +/// Responsible for storing and mutating the collection of all tenants +/// that this pageserver has state for. +/// +/// Every Tenant and SecondaryTenant instance lives inside the TenantManager. /// /// The most important role of the TenantManager is to prevent conflicts: e.g. trying to attach /// the same tenant twice concurrently, or trying to configure the same tenant into secondary @@ -2346,8 +2347,9 @@ pub enum TenantMapError { ShuttingDown, } -/// Guards a particular tenant_id's content in the TenantsMap. While this -/// structure exists, the TenantsMap will contain a [`TenantSlot::InProgress`] +/// Guards a particular tenant_id's content in the TenantsMap. +/// +/// While this structure exists, the TenantsMap will contain a [`TenantSlot::InProgress`] /// for this tenant, which acts as a marker for any operations targeting /// this tenant to retry later, or wait for the InProgress state to end. 
/// diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 71b766e4c7..1f9ae40af5 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -2184,6 +2184,8 @@ pub fn remote_timeline_path( remote_timelines_path(tenant_shard_id).join(Utf8Path::new(&timeline_id.to_string())) } +/// Obtains the path of the given Layer in the remote +/// /// Note that the shard component of a remote layer path is _not_ always the same /// as in the TenantShardId of the caller: tenants may reference layers from a different /// ShardIndex. Use the ShardIndex from the layer's metadata. diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 757fb9d032..c51ff54919 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -1,4 +1,5 @@ //! In-memory index to track the tenant files on the remote storage. +//! //! Able to restore itself from the storage index parts, that are located in every timeline's remote directory and contain all data about //! remote timeline layers and its metadata. diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index a1202ad507..dac6b2f893 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -434,10 +434,11 @@ impl ReadableLayer { } } -/// Layers contain a hint indicating whether they are likely to be used for reads. This is a hint rather -/// than an authoritative value, so that we do not have to update it synchronously when changing the visibility -/// of layers (for example when creating a branch that makes some previously covered layers visible). It should -/// be used for cache management but not for correctness-critical checks. +/// Layers contain a hint indicating whether they are likely to be used for reads. +/// +/// This is a hint rather than an authoritative value, so that we do not have to update it synchronously +/// when changing the visibility of layers (for example when creating a branch that makes some previously +/// covered layers visible). It should be used for cache management but not for correctness-critical checks. #[derive(Debug, Clone, PartialEq, Eq)] pub enum LayerVisibilityHint { /// A Visible layer might be read while serving a read, because there is not an image layer between it diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 6a2cd94232..34f1b15138 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -136,10 +136,11 @@ impl Summary { // Flag indicating that this version initialize the page const WILL_INIT: u64 = 1; -/// Struct representing reference to BLOB in layers. Reference contains BLOB -/// offset, and for WAL records it also contains `will_init` flag. The flag -/// helps to determine the range of records that needs to be applied, without -/// reading/deserializing records themselves. +/// Struct representing reference to BLOB in layers. +/// +/// Reference contains BLOB offset, and for WAL records it also contains +/// `will_init` flag. The flag helps to determine the range of records +/// that needs to be applied, without reading/deserializing records themselves. 
#[derive(Debug, Serialize, Deserialize, Copy, Clone)] pub struct BlobRef(pub u64); diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 77ce1ae670..875e223c9c 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -1,7 +1,9 @@ //! An ImageLayer represents an image or a snapshot of a key-range at -//! one particular LSN. It contains an image of all key-value pairs -//! in its key-range. Any key that falls into the image layer's range -//! but does not exist in the layer, does not exist. +//! one particular LSN. +//! +//! It contains an image of all key-value pairs in its key-range. Any key +//! that falls into the image layer's range but does not exist in the layer, +//! does not exist. //! //! An image layer is stored in a file on disk. The file is stored in //! timelines/ directory. Currently, there are no diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs index cbd18e650f..e90ff3c4b2 100644 --- a/pageserver/src/tenant/storage_layer/layer_desc.rs +++ b/pageserver/src/tenant/storage_layer/layer_desc.rs @@ -12,8 +12,10 @@ use serde::{Deserialize, Serialize}; #[cfg(test)] use utils::id::TenantId; -/// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the -/// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides +/// A unique identifier of a persistent layer. +/// +/// This is different from `LayerDescriptor`, which is only used in the benchmarks. +/// This struct contains all necessary information to find the image / delta layer. It also provides /// a unified way to generate layer information like file name. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Hash)] pub struct PersistentLayerDesc { diff --git a/pageserver/src/tenant/storage_layer/layer_name.rs b/pageserver/src/tenant/storage_layer/layer_name.rs index 47ae556279..ffe7ca5f3e 100644 --- a/pageserver/src/tenant/storage_layer/layer_name.rs +++ b/pageserver/src/tenant/storage_layer/layer_name.rs @@ -217,8 +217,9 @@ impl fmt::Display for ImageLayerName { } } -/// LayerName is the logical identity of a layer within a LayerMap at a moment in time. The -/// LayerName is not a unique filename, as the same LayerName may have multiple physical incarnations +/// LayerName is the logical identity of a layer within a LayerMap at a moment in time. +/// +/// The LayerName is not a unique filename, as the same LayerName may have multiple physical incarnations /// over time (e.g. across shard splits or compression). The physical filenames of layers in local /// storage and object names in remote storage consist of the LayerName plus some extra qualifiers /// that uniquely identify the physical incarnation of a layer (see [crate::tenant::remote_timeline_client::remote_layer_path]) diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs index d2c341e5ce..0831fd9530 100644 --- a/pageserver/src/tenant/storage_layer/merge_iterator.rs +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -226,9 +226,11 @@ impl<'a> IteratorWrapper<'a> { } } -/// A merge iterator over delta/image layer iterators. When duplicated records are -/// found, the iterator will not perform any deduplication, and the caller should handle -/// these situation. 
By saying duplicated records, there are many possibilities: +/// A merge iterator over delta/image layer iterators. +/// +/// When duplicated records are found, the iterator will not perform any +/// deduplication, and the caller should handle these situation. By saying +/// duplicated records, there are many possibilities: /// /// * Two same delta at the same LSN. /// * Two same image at the same LSN. diff --git a/pageserver/src/tenant/storage_layer/split_writer.rs b/pageserver/src/tenant/storage_layer/split_writer.rs index e8deb0a1e5..7c1ac863bf 100644 --- a/pageserver/src/tenant/storage_layer/split_writer.rs +++ b/pageserver/src/tenant/storage_layer/split_writer.rs @@ -34,9 +34,10 @@ impl SplitWriterResult { } } -/// An image writer that takes images and produces multiple image layers. The interface does not -/// guarantee atomicity (i.e., if the image layer generation fails, there might be leftover files -/// to be cleaned up) +/// An image writer that takes images and produces multiple image layers. +/// +/// The interface does not guarantee atomicity (i.e., if the image layer generation +/// fails, there might be leftover files to be cleaned up) #[must_use] pub struct SplitImageLayerWriter { inner: ImageLayerWriter, @@ -193,9 +194,10 @@ impl SplitImageLayerWriter { } } -/// A delta writer that takes key-lsn-values and produces multiple delta layers. The interface does not -/// guarantee atomicity (i.e., if the delta layer generation fails, there might be leftover files -/// to be cleaned up). +/// A delta writer that takes key-lsn-values and produces multiple delta layers. +/// +/// The interface does not guarantee atomicity (i.e., if the delta layer generation fails, +/// there might be leftover files to be cleaned up). /// /// Note that if updates of a single key exceed the target size limit, all of the updates will be batched /// into a single file. This behavior might change in the future. For reference, the legacy compaction algorithm diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 4d51dc442d..553edf6d8b 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -593,8 +593,10 @@ impl<'a> VectoredBlobReader<'a> { } } -/// Read planner used in [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`]. It provides a streaming API for -/// getting read blobs. It returns a batch when `handle` gets called and when the current key would just exceed the read_size and +/// Read planner used in [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`]. +/// +/// It provides a streaming API for getting read blobs. It returns a batch when +/// `handle` gets called and when the current key would just exceed the read_size and /// max_cnt constraints. pub struct StreamingVectoredReadPlanner { read_builder: Option, diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index ed6ff86c10..57856eea80 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -1,6 +1,7 @@ -//! //! VirtualFile is like a normal File, but it's not bound directly to -//! a file descriptor. Instead, the file is opened when it's read from, +//! a file descriptor. +//! +//! Instead, the file is opened when it's read from, //! and if too many files are open globally in the system, least-recently //! used ones are closed. //! 
diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 82585f9ed8..a36955fa21 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -43,13 +43,12 @@ use utils::lsn::Lsn; use utils::sync::gate::GateError; use utils::sync::heavier_once_cell; +/// The real implementation that uses a Postgres process to +/// perform WAL replay. /// -/// This is the real implementation that uses a Postgres process to -/// perform WAL replay. Only one thread can use the process at a time, -/// that is controlled by the Mutex. In the future, we might want to -/// launch a pool of processes to allow concurrent replay of multiple -/// records. -/// +/// Only one thread can use the process at a time, that is controlled by the +/// Mutex. In the future, we might want to launch a pool of processes to allow +/// concurrent replay of multiple records. pub struct PostgresRedoManager { tenant_shard_id: TenantShardId, conf: &'static PageServerConf, diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index 332dc27787..c14dd18afe 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -14,6 +14,7 @@ use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; use tokio_rustls::server::TlsStream; /// Stream wrapper which implements libpq's protocol. +/// /// NOTE: This object deliberately doesn't implement [`AsyncRead`] /// or [`AsyncWrite`] to prevent subtle errors (e.g. trying /// to pass random malformed bytes through the connection). diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index 600a6bd8f0..64585f5edc 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -484,6 +484,7 @@ pub async fn validate_temp_timeline( } /// Move timeline from a temp directory to the main storage, and load it to the global map. +/// /// This operation is done under a lock to prevent bugs if several concurrent requests are /// trying to load the same timeline. Note that it doesn't guard against creating the /// timeline with the same ttid, but no one should be doing this anyway. diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index ab8c76dc17..e35f806e90 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -448,8 +448,10 @@ async fn network_write( const KEEPALIVE_INTERVAL: Duration = Duration::from_secs(1); /// Encapsulates a task which takes messages from msg_rx, processes and pushes -/// replies to reply_tx; reading from socket and writing to disk in parallel is -/// beneficial for performance, this struct provides writing to disk part. +/// replies to reply_tx. +/// +/// Reading from socket and writing to disk in parallel is beneficial for +/// performance, this struct provides the writing to disk part. pub struct WalAcceptor { tli: WalResidentTimeline, msg_rx: Receiver, diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs index dca6414082..97eeae3638 100644 --- a/safekeeper/src/state.rs +++ b/safekeeper/src/state.rs @@ -147,9 +147,11 @@ pub struct TimelineMemState { pub proposer_uuid: PgUuid, } -/// Safekeeper persistent state plus in memory layer, to avoid frequent fsyncs -/// when we update fields like commit_lsn which don't need immediate -/// persistence. Provides transactional like API to atomically update the state. +/// Safekeeper persistent state plus in memory layer. +/// +/// Allows us to avoid frequent fsyncs when we update fields like commit_lsn +/// which don't need immediate persistence. Provides transactional like API +/// to atomically update the state. 
/// /// Implements Deref into *persistent* part. pub struct TimelineState { diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 95ee925e1a..6fd5de0ad6 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -169,6 +169,7 @@ impl<'a> Drop for WriteGuardSharedState<'a> { } /// This structure is stored in shared state and represents the state of the timeline. +/// /// Usually it holds SafeKeeper, but it also supports offloaded timeline state. In this /// case, SafeKeeper is not available (because WAL is not present on disk) and all /// operations can be done only with control file. diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs index 5d0567575c..5aa4921a92 100644 --- a/safekeeper/src/timeline_eviction.rs +++ b/safekeeper/src/timeline_eviction.rs @@ -1,6 +1,8 @@ -//! Code related to evicting WAL files to remote storage. The actual upload is done by the -//! partial WAL backup code. This file has code to delete and re-download WAL files, -//! cross-validate with partial WAL backup if local file is still present. +//! Code related to evicting WAL files to remote storage. +//! +//! The actual upload is done by the partial WAL backup code. This file has +//! code to delete and re-download WAL files, cross-validate with partial WAL +//! backup if local file is still present. use anyhow::Context; use camino::Utf8PathBuf; diff --git a/safekeeper/src/timeline_guard.rs b/safekeeper/src/timeline_guard.rs index dbdf46412d..1ddac573d2 100644 --- a/safekeeper/src/timeline_guard.rs +++ b/safekeeper/src/timeline_guard.rs @@ -1,4 +1,6 @@ -//! Timeline residence guard is needed to ensure that WAL segments are present on disk, +//! Timeline residence guard +//! +//! It is needed to ensure that WAL segments are present on disk, //! as long as the code is holding the guard. This file implements guard logic, to issue //! and drop guards, and to notify the manager when the guard is dropped. diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index f997f48454..6be75479db 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -1,4 +1,5 @@ //! The timeline manager task is responsible for managing the timeline's background tasks. +//! //! It is spawned alongside each timeline and exits when the timeline is deleted. //! It watches for changes in the timeline state and decides when to spawn or kill background tasks. //! It also can manage some reactive state, like should the timeline be active for broker pushes or not. diff --git a/safekeeper/src/timelines_set.rs b/safekeeper/src/timelines_set.rs index d6eea79f82..096e348295 100644 --- a/safekeeper/src/timelines_set.rs +++ b/safekeeper/src/timelines_set.rs @@ -60,7 +60,8 @@ impl TimelinesSet { } } -/// Guard is used to add or remove timeline from the set. +/// Guard is used to add or remove timelines from the set. +/// /// If the timeline present in set, it will be removed from it on drop. /// Note: do not use more than one guard for the same timeline, it caches the presence state. /// It is designed to be used in the manager task only. diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index 4050a82fff..bddfca50e4 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -1,6 +1,8 @@ //! Safekeeper timeline has a background task which is subscribed to `commit_lsn` -//! and `flush_lsn` updates. 
After the partial segment was updated (`flush_lsn` -//! was changed), the segment will be uploaded to S3 in about 15 minutes. +//! and `flush_lsn` updates. +//! +//! After the partial segment was updated (`flush_lsn` was changed), the segment +//! will be uploaded to S3 within the configured `partial_backup_timeout`. //! //! The filename format for partial segments is //! `Segment_Term_Flush_Commit_skNN.partial`, where: diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index 16f7748eb4..1ab54d4cce 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -17,6 +17,7 @@ use crate::SafeKeeperConf; use postgres_backend::{AuthType, PostgresBackend}; /// Accept incoming TCP connections and spawn them into a background thread. +/// /// allowed_auth_scope is either SafekeeperData (wide JWT tokens giving access /// to any tenant are allowed) or Tenant (only tokens giving access to specific /// tenant are allowed). Doesn't matter if auth is disabled in conf. diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 324f864291..e7eae647df 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -117,7 +117,9 @@ pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); pub const MAX_OFFLINE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30); /// How long a node may be unresponsive to heartbeats during start up before we declare it -/// offline. This is much more lenient than [`MAX_OFFLINE_INTERVAL_DEFAULT`] since the pageserver's +/// offline. +/// +/// This is much more lenient than [`MAX_OFFLINE_INTERVAL_DEFAULT`] since the pageserver's /// handling of the re-attach response may take a long time and blocks heartbeats from /// being handled on the pageserver side. pub const MAX_WARMING_UP_INTERVAL_DEFAULT: Duration = Duration::from_secs(300); diff --git a/storage_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs index 3e22960f8d..d53611ed6e 100644 --- a/storage_scrubber/src/garbage.rs +++ b/storage_scrubber/src/garbage.rs @@ -1,6 +1,7 @@ -//! Functionality for finding and purging garbage, as in "garbage collection". Garbage means -//! S3 objects which are either not referenced by any metadata, or are referenced by a -//! control plane tenant/timeline in a deleted state. +//! Functionality for finding and purging garbage, as in "garbage collection". +//! +//! Garbage means S3 objects which are either not referenced by any metadata, +//! or are referenced by a control plane tenant/timeline in a deleted state. use std::{ collections::{HashMap, HashSet}, diff --git a/storage_scrubber/src/metadata_stream.rs b/storage_scrubber/src/metadata_stream.rs index 10d77937f1..f896cff2d5 100644 --- a/storage_scrubber/src/metadata_stream.rs +++ b/storage_scrubber/src/metadata_stream.rs @@ -74,7 +74,9 @@ pub async fn stream_tenant_shards<'a>( } /// Given a `TenantShardId`, output a stream of the timelines within that tenant, discovered -/// using a listing. The listing is done before the stream is built, so that this +/// using a listing. +/// +/// The listing is done before the stream is built, so that this /// function can be used to generate concurrency on a stream using buffer_unordered. 
pub async fn stream_tenant_timelines<'a>( remote_client: &'a GenericRemoteStorage, diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index 88681e38c2..c96d9cad3b 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -440,9 +440,10 @@ async fn gc_ancestor( Ok(()) } -/// Physical garbage collection: removing unused S3 objects. This is distinct from the garbage collection -/// done inside the pageserver, which operates at a higher level (keys, layers). This type of garbage collection -/// is about removing: +/// Physical garbage collection: removing unused S3 objects. +/// +/// This is distinct from the garbage collection done inside the pageserver, which operates at a higher level +/// (keys, layers). This type of garbage collection is about removing: /// - Objects that were uploaded but never referenced in the remote index (e.g. because of a shutdown between /// uploading a layer and uploading an index) /// - Index objects from historic generations
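The `stream_tenant_timelines` doc comment above hinges on one ordering decision: the remote listing runs to completion first, and only then is a stream built over the results, so callers can layer `buffer_unordered` on top for concurrency. A minimal sketch of that pattern follows; it assumes `tokio`, `futures`, and `anyhow`, and the `TimelineId` alias plus the `list_timelines`/`inspect` helpers are illustrative stand-ins, not the scrubber's actual API.

```rust
use anyhow::Result;
use futures::stream::{self, StreamExt, TryStreamExt};

// Illustrative stand-in for a remote timeline id; not the scrubber's real type.
type TimelineId = String;

// Step 1: the listing runs to completion up front (in the real code this is a
// remote-storage listing), so enumeration is finished before any fan-out.
async fn list_timelines() -> Result<Vec<TimelineId>> {
    Ok(vec!["tl-1".into(), "tl-2".into(), "tl-3".into()])
}

// Step 2: per-timeline work that we want to run concurrently.
async fn inspect(timeline: TimelineId) -> Result<usize> {
    Ok(timeline.len())
}

#[tokio::main]
async fn main() -> Result<()> {
    let listed = list_timelines().await?;

    // Step 3: build a stream over the already-complete listing and process up
    // to 8 timelines at a time, in whatever order they happen to finish.
    let sizes: Vec<usize> = stream::iter(listed)
        .map(inspect)
        .buffer_unordered(8)
        .try_collect()
        .await?;

    println!("inspected {} timelines", sizes.len());
    Ok(())
}
```

Collecting through `try_collect` surfaces the first error while still letting up to eight `inspect` calls run at once, which is the same trade-off the doc comment describes for generating concurrency over a pre-built listing.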

{ self.get_metric_with_label_values(vals).unwrap() } - pub fn remove_label_values(&self, res: &mut [Result<()>; 2], vals: &[&str]) { + pub fn remove_label_values(&self, res: &mut [prometheus::Result<()>; 2], vals: &[&str]) { res[0] = self.inc.remove_label_values(vals); res[1] = self.dec.remove_label_values(vals); } diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index c59bcaa174..2e83bbc5ed 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -8,6 +8,7 @@ use futures::Future; use hyper::header::CONTENT_TYPE; use hyper::{Body, Request, Response}; use hyper::{StatusCode, Uri}; +use metrics::{BuildInfo, NeonMetrics}; use pageserver_api::models::{ TenantConfigRequest, TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest, TenantTimeTravelRequest, TimelineCreateRequest, @@ -44,15 +45,19 @@ use control_plane::storage_controller::{AttachHookRequest, InspectRequest}; use routerify::Middleware; /// State available to HTTP request handlers -#[derive(Clone)] pub struct HttpState { service: Arc, auth: Option>, + neon_metrics: NeonMetrics, allowlist_routes: Vec, } impl HttpState { - pub fn new(service: Arc, auth: Option>) -> Self { + pub fn new( + service: Arc, + auth: Option>, + build_info: BuildInfo, + ) -> Self { let allowlist_routes = ["/status", "/ready", "/metrics"] .iter() .map(|v| v.parse().unwrap()) @@ -60,6 +65,7 @@ impl HttpState { Self { service, auth, + neon_metrics: NeonMetrics::new(build_info), allowlist_routes, } } @@ -672,10 +678,11 @@ fn epilogue_metrics_middleware }) } -pub async fn measured_metrics_handler(_req: Request) -> Result, ApiError> { +pub async fn measured_metrics_handler(req: Request) -> Result, ApiError> { pub const TEXT_FORMAT: &str = "text/plain; version=0.0.4"; - let payload = crate::metrics::METRICS_REGISTRY.encode(); + let state = get_state(&req); + let payload = crate::metrics::METRICS_REGISTRY.encode(&state.neon_metrics); let response = Response::builder() .status(200) .header(CONTENT_TYPE, TEXT_FORMAT) @@ -704,6 +711,7 @@ where pub fn make_router( service: Arc, auth: Option>, + build_info: BuildInfo, ) -> RouterBuilder { let mut router = endpoint::make_router() .middleware(prologue_metrics_middleware()) @@ -720,7 +728,7 @@ pub fn make_router( } router - .data(Arc::new(HttpState::new(service, auth))) + .data(Arc::new(HttpState::new(service, auth, build_info))) .get("/metrics", |r| { named_request_span(r, measured_metrics_handler, RequestName("metrics")) }) diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 3c03d6efe8..6466b9f7a3 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -3,6 +3,7 @@ use camino::Utf8PathBuf; use clap::Parser; use diesel::Connection; use metrics::launch_timestamp::LaunchTimestamp; +use metrics::BuildInfo; use std::sync::Arc; use storage_controller::http::make_router; use storage_controller::metrics::preinitialize_metrics; @@ -192,6 +193,11 @@ async fn async_main() -> anyhow::Result<()> { args.listen ); + let build_info = BuildInfo { + revision: GIT_VERSION, + build_tag: BUILD_TAG, + }; + let strict_mode = if args.dev { StrictMode::Dev } else { @@ -253,7 +259,7 @@ async fn async_main() -> anyhow::Result<()> { let auth = secrets .public_key .map(|jwt_auth| Arc::new(SwappableJwtAuth::new(jwt_auth))); - let router = make_router(service.clone(), auth) + let router = make_router(service.clone(), auth, build_info) .build() .map_err(|err| anyhow!(err))?; let router_service = 
utils::http::RouterService::new(router).unwrap(); diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index cabf416b9f..ac9f22c739 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -8,10 +8,8 @@ //! The rest of the code defines label group types and deals with converting outer types to labels. //! use bytes::Bytes; -use measured::{ - label::{LabelValue, StaticLabelSet}, - FixedCardinalityLabel, MetricGroup, -}; +use measured::{label::LabelValue, metric::histogram, FixedCardinalityLabel, MetricGroup}; +use metrics::NeonMetrics; use once_cell::sync::Lazy; use std::sync::Mutex; @@ -26,13 +24,15 @@ pub fn preinitialize_metrics() { pub(crate) struct StorageControllerMetrics { pub(crate) metrics_group: StorageControllerMetricGroup, - encoder: Mutex, + encoder: Mutex, } #[derive(measured::MetricGroup)] +#[metric(new())] pub(crate) struct StorageControllerMetricGroup { /// Count of how many times we spawn a reconcile task pub(crate) storage_controller_reconcile_spawn: measured::Counter, + /// Reconciler tasks completed, broken down by success/failure/cancelled pub(crate) storage_controller_reconcile_complete: measured::CounterVec, @@ -43,7 +43,9 @@ pub(crate) struct StorageControllerMetricGroup { /// HTTP request status counters for handled requests pub(crate) storage_controller_http_request_status: measured::CounterVec, + /// HTTP request handler latency across all status codes + #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] pub(crate) storage_controller_http_request_latency: measured::HistogramVec, @@ -55,6 +57,7 @@ pub(crate) struct StorageControllerMetricGroup { /// Latency of HTTP requests to the pageserver, broken down by pageserver /// node id, request name and method. This include both successful and unsuccessful /// requests. + #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] pub(crate) storage_controller_pageserver_request_latency: measured::HistogramVec, @@ -66,6 +69,7 @@ pub(crate) struct StorageControllerMetricGroup { /// Latency of pass-through HTTP requests to the pageserver, broken down by pageserver /// node id, request name and method. This include both successful and unsuccessful /// requests. + #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] pub(crate) storage_controller_passthrough_request_latency: measured::HistogramVec, @@ -74,76 +78,34 @@ pub(crate) struct StorageControllerMetricGroup { measured::CounterVec, /// Latency of database queries, broken down by operation. 
+ #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] pub(crate) storage_controller_database_query_latency: measured::HistogramVec, } impl StorageControllerMetrics { - pub(crate) fn encode(&self) -> Bytes { + pub(crate) fn encode(&self, neon_metrics: &NeonMetrics) -> Bytes { let mut encoder = self.encoder.lock().unwrap(); - self.metrics_group.collect_into(&mut *encoder); + neon_metrics + .collect_group_into(&mut *encoder) + .unwrap_or_else(|infallible| match infallible {}); + self.metrics_group + .collect_group_into(&mut *encoder) + .unwrap_or_else(|infallible| match infallible {}); encoder.finish() } } impl Default for StorageControllerMetrics { fn default() -> Self { - Self { - metrics_group: StorageControllerMetricGroup::new(), - encoder: Mutex::new(measured::text::TextEncoder::new()), - } - } -} + let mut metrics_group = StorageControllerMetricGroup::new(); + metrics_group + .storage_controller_reconcile_complete + .init_all_dense(); -impl StorageControllerMetricGroup { - pub(crate) fn new() -> Self { Self { - storage_controller_reconcile_spawn: measured::Counter::new(), - storage_controller_reconcile_complete: measured::CounterVec::new( - ReconcileCompleteLabelGroupSet { - status: StaticLabelSet::new(), - }, - ), - storage_controller_schedule_optimization: measured::Counter::new(), - storage_controller_http_request_status: measured::CounterVec::new( - HttpRequestStatusLabelGroupSet { - path: lasso::ThreadedRodeo::new(), - method: StaticLabelSet::new(), - status: StaticLabelSet::new(), - }, - ), - storage_controller_http_request_latency: measured::HistogramVec::new( - measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0), - ), - storage_controller_pageserver_request_error: measured::CounterVec::new( - PageserverRequestLabelGroupSet { - pageserver_id: lasso::ThreadedRodeo::new(), - path: lasso::ThreadedRodeo::new(), - method: StaticLabelSet::new(), - }, - ), - storage_controller_pageserver_request_latency: measured::HistogramVec::new( - measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0), - ), - storage_controller_passthrough_request_error: measured::CounterVec::new( - PageserverRequestLabelGroupSet { - pageserver_id: lasso::ThreadedRodeo::new(), - path: lasso::ThreadedRodeo::new(), - method: StaticLabelSet::new(), - }, - ), - storage_controller_passthrough_request_latency: measured::HistogramVec::new( - measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0), - ), - storage_controller_database_query_error: measured::CounterVec::new( - DatabaseQueryErrorLabelGroupSet { - operation: StaticLabelSet::new(), - error_type: StaticLabelSet::new(), - }, - ), - storage_controller_database_query_latency: measured::HistogramVec::new( - measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0), - ), + metrics_group, + encoder: Mutex::new(measured::text::BufferedTextEncoder::new()), } } } @@ -157,7 +119,7 @@ pub(crate) struct ReconcileCompleteLabelGroup { #[derive(measured::LabelGroup)] #[label(set = HttpRequestStatusLabelGroupSet)] pub(crate) struct HttpRequestStatusLabelGroup<'a> { - #[label(dynamic_with = lasso::ThreadedRodeo)] + #[label(dynamic_with = lasso::ThreadedRodeo, default)] pub(crate) path: &'a str, pub(crate) method: Method, pub(crate) status: StatusCode, @@ -166,40 +128,21 @@ pub(crate) struct HttpRequestStatusLabelGroup<'a> { #[derive(measured::LabelGroup)] #[label(set = HttpRequestLatencyLabelGroupSet)] pub(crate) struct HttpRequestLatencyLabelGroup<'a> { - #[label(dynamic_with = 
lasso::ThreadedRodeo)] + #[label(dynamic_with = lasso::ThreadedRodeo, default)] pub(crate) path: &'a str, pub(crate) method: Method, } -impl Default for HttpRequestLatencyLabelGroupSet { - fn default() -> Self { - Self { - path: lasso::ThreadedRodeo::new(), - method: StaticLabelSet::new(), - } - } -} - #[derive(measured::LabelGroup, Clone)] #[label(set = PageserverRequestLabelGroupSet)] pub(crate) struct PageserverRequestLabelGroup<'a> { - #[label(dynamic_with = lasso::ThreadedRodeo)] + #[label(dynamic_with = lasso::ThreadedRodeo, default)] pub(crate) pageserver_id: &'a str, - #[label(dynamic_with = lasso::ThreadedRodeo)] + #[label(dynamic_with = lasso::ThreadedRodeo, default)] pub(crate) path: &'a str, pub(crate) method: Method, } -impl Default for PageserverRequestLabelGroupSet { - fn default() -> Self { - Self { - pageserver_id: lasso::ThreadedRodeo::new(), - path: lasso::ThreadedRodeo::new(), - method: StaticLabelSet::new(), - } - } -} - #[derive(measured::LabelGroup)] #[label(set = DatabaseQueryErrorLabelGroupSet)] pub(crate) struct DatabaseQueryErrorLabelGroup { @@ -213,7 +156,7 @@ pub(crate) struct DatabaseQueryLatencyLabelGroup { pub(crate) operation: DatabaseOperation, } -#[derive(FixedCardinalityLabel)] +#[derive(FixedCardinalityLabel, Clone, Copy)] pub(crate) enum ReconcileOutcome { #[label(rename = "ok")] Success, @@ -221,7 +164,7 @@ pub(crate) enum ReconcileOutcome { Cancel, } -#[derive(FixedCardinalityLabel, Clone)] +#[derive(FixedCardinalityLabel, Copy, Clone)] pub(crate) enum Method { Get, Put, @@ -246,11 +189,12 @@ impl From for Method { } } +#[derive(Clone, Copy)] pub(crate) struct StatusCode(pub(crate) hyper::http::StatusCode); impl LabelValue for StatusCode { fn visit(&self, v: V) -> V::Output { - v.write_int(self.0.as_u16() as u64) + v.write_int(self.0.as_u16() as i64) } } @@ -268,7 +212,7 @@ impl FixedCardinalityLabel for StatusCode { } } -#[derive(FixedCardinalityLabel)] +#[derive(FixedCardinalityLabel, Clone, Copy)] pub(crate) enum DatabaseErrorLabel { Query, Connection, diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 55fbfd10bc..5312e1e218 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -79,7 +79,7 @@ pub(crate) enum DatabaseError { Logical(String), } -#[derive(measured::FixedCardinalityLabel, Clone)] +#[derive(measured::FixedCardinalityLabel, Copy, Clone)] pub(crate) enum DatabaseOperation { InsertNode, UpdateNode, @@ -153,9 +153,7 @@ impl Persistence { let latency = &METRICS_REGISTRY .metrics_group .storage_controller_database_query_latency; - let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { - operation: op.clone(), - }); + let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { operation: op }); let res = self.with_conn(func).await; diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 7b8228a082..bcbd4daa7e 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -37,8 +37,7 @@ futures-io = { version = "0.3" } futures-sink = { version = "0.3" } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } getrandom = { version = "0.2", default-features = false, features = ["std"] } -hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", features = ["raw"] } -hashbrown-594e8ee84c453af0 = { package = "hashbrown", version = "0.13", features = ["raw"] } +hashbrown = { version = "0.14", features = ["raw"] } hex = { version = "0.4", features = ["serde"] } hmac = { version = 
"0.12", default-features = false, features = ["reset"] } hyper = { version = "0.14", features = ["full"] } @@ -91,7 +90,7 @@ cc = { version = "1", default-features = false, features = ["parallel"] } chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] } either = { version = "1" } getrandom = { version = "0.2", default-features = false, features = ["std"] } -hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", features = ["raw"] } +hashbrown = { version = "0.14", features = ["raw"] } indexmap = { version = "1", default-features = false, features = ["std"] } itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits", "use_std"] } From bcab344490fbb68daf75c98900cdd8e20f6417d6 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 9 Apr 2024 10:50:43 +0100 Subject: [PATCH 0543/1571] CI(flaky-tests): remove outdated restriction (#7345) ## Problem After switching the default pageserver io-engine to `tokio-epoll-uring` on CI, we tuned a query that finds flaky tests (in https://github.com/neondatabase/neon/pull/7077). It has been almost a month since then, additional query tuning is not required anymore. ## Summary of changes - Remove extra condition from flaky tests query - Also return back parameterisation to the query --- scripts/flaky_tests.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/flaky_tests.py b/scripts/flaky_tests.py index 853c67d218..878840fcee 100755 --- a/scripts/flaky_tests.py +++ b/scripts/flaky_tests.py @@ -15,8 +15,7 @@ FLAKY_TESTS_QUERY = """ DISTINCT parent_suite, suite, name FROM results WHERE - started_at > CURRENT_DATE - INTERVAL '10' day - AND started_at > '2024-03-11 14:50:11.845+00' -- we switched the default PAGESERVER_VIRTUAL_FILE_IO_ENGINE to `tokio-epoll-uring` from `std-fs` on this date, we want to ignore the flaky tests for `std-fs` + started_at > CURRENT_DATE - INTERVAL '%s' day AND ( (status IN ('failed', 'broken') AND reference = 'refs/heads/main') OR flaky From 4f4f787119c2a353da0a0691714256bec1f82b11 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 9 Apr 2024 12:03:46 +0100 Subject: [PATCH 0544/1571] Update staging hostname (#7347) ## Problem ``` Could not resolve host: console.stage.neon.tech ``` ## Summary of changes - replace `console.stage.neon.tech` with `console-stage.neon.build` --- .github/actions/neon-branch-create/action.yml | 2 +- .github/actions/neon-branch-delete/action.yml | 2 +- .github/actions/neon-project-create/action.yml | 2 +- .github/actions/neon-project-delete/action.yml | 2 +- scripts/sk_cleanup_tenants/script.py | 2 +- scripts/sk_collect_dumps/readme.md | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/actions/neon-branch-create/action.yml b/.github/actions/neon-branch-create/action.yml index f1eea34ab9..dea3fc2357 100644 --- a/.github/actions/neon-branch-create/action.yml +++ b/.github/actions/neon-branch-create/action.yml @@ -10,7 +10,7 @@ inputs: required: true api_host: desctiption: 'Neon API host' - default: console.stage.neon.tech + default: console-stage.neon.build outputs: dsn: description: 'Created Branch DSN (for main database)' diff --git a/.github/actions/neon-branch-delete/action.yml b/.github/actions/neon-branch-delete/action.yml index f8cd351dd9..8acba7ad00 100644 --- a/.github/actions/neon-branch-delete/action.yml +++ b/.github/actions/neon-branch-delete/action.yml @@ -13,7 +13,7 @@ inputs: required: true api_host: desctiption: 'Neon API host' - default: 
console.stage.neon.tech + default: console-stage.neon.build runs: using: "composite" diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index ae6464990e..7f0e599b97 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -13,7 +13,7 @@ inputs: default: 15 api_host: desctiption: 'Neon API host' - default: console.stage.neon.tech + default: console-stage.neon.build provisioner: desctiption: 'k8s-pod or k8s-neonvm' default: 'k8s-pod' diff --git a/.github/actions/neon-project-delete/action.yml b/.github/actions/neon-project-delete/action.yml index adc8510a34..b8ec6cac70 100644 --- a/.github/actions/neon-project-delete/action.yml +++ b/.github/actions/neon-project-delete/action.yml @@ -10,7 +10,7 @@ inputs: required: true api_host: desctiption: 'Neon API host' - default: console.stage.neon.tech + default: console-stage.neon.build runs: using: "composite" diff --git a/scripts/sk_cleanup_tenants/script.py b/scripts/sk_cleanup_tenants/script.py index fa22433614..c20a4bb830 100644 --- a/scripts/sk_cleanup_tenants/script.py +++ b/scripts/sk_cleanup_tenants/script.py @@ -22,7 +22,7 @@ parser.add_argument("--safekeeper-host", required=True, type=str) args = parser.parse_args() access_key = os.getenv("CONSOLE_API_TOKEN") -endpoint: str = "https://console.stage.neon.tech/api" +endpoint: str = "https://console-stage.neon.build/api" trash_dir: Path = args.trash_dir dry_run: bool = args.dry_run diff --git a/scripts/sk_collect_dumps/readme.md b/scripts/sk_collect_dumps/readme.md index 7494a6cb78..5ae55e058b 100644 --- a/scripts/sk_collect_dumps/readme.md +++ b/scripts/sk_collect_dumps/readme.md @@ -3,7 +3,7 @@ 3. Issue admin token (add/remove .stage from url for staging/prod and setting proper API key): ``` # staging: -AUTH_TOKEN=$(curl https://console.stage.neon.tech/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_STAGING_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt') +AUTH_TOKEN=$(curl https://console-stage.neon.build/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_STAGING_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt') # prod: AUTH_TOKEN=$(curl https://console.neon.tech/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_PROD_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt') # check From dbac2d2c473f3648251f0a64e36d066f444dfe00 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Wed, 10 Apr 2024 02:40:14 +0200 Subject: [PATCH 0545/1571] Proxy read ids from redis (#7205) ## Problem Proxy doesn't know about existing endpoints. ## Summary of changes * Added caching of all available endpoints. * On the high load, use it before going to cplane. * Report metrics for the outcome. 
* For rate limiter and credentials caching don't distinguish between `-pooled` and not TODOs: * Make metrics more meaningful * Consider integrating it with the endpoint rate limiter * Test it together with cplane in preview --- proxy/src/auth/backend.rs | 4 +- proxy/src/bin/proxy.rs | 15 +- proxy/src/cache.rs | 1 + proxy/src/cache/endpoints.rs | 191 ++++++++++++++++++ proxy/src/config.rs | 69 +++++++ proxy/src/console/provider.rs | 22 +- proxy/src/console/provider/neon.rs | 20 +- proxy/src/context.rs | 15 +- proxy/src/intern.rs | 15 ++ proxy/src/lib.rs | 37 ++++ proxy/src/metrics.rs | 12 ++ proxy/src/proxy.rs | 4 +- proxy/src/rate_limiter.rs | 2 +- proxy/src/rate_limiter/limiter.rs | 10 +- proxy/src/redis/cancellation_publisher.rs | 6 +- .../regress/test_proxy_rate_limiter.py | 84 -------- 16 files changed, 393 insertions(+), 114 deletions(-) create mode 100644 proxy/src/cache/endpoints.rs delete mode 100644 test_runner/regress/test_proxy_rate_limiter.py diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index e421798067..71e9da18bc 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -27,7 +27,7 @@ use crate::{ }, stream, url, }; -use crate::{scram, EndpointCacheKey, EndpointId, RoleName}; +use crate::{scram, EndpointCacheKey, EndpointId, Normalize, RoleName}; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, warn}; @@ -186,7 +186,7 @@ impl AuthenticationConfig { is_cleartext: bool, ) -> auth::Result { // we have validated the endpoint exists, so let's intern it. - let endpoint_int = EndpointIdInt::from(endpoint); + let endpoint_int = EndpointIdInt::from(endpoint.normalize()); // only count the full hash count if password hack or websocket flow. // in other words, if proxy needs to run the hashing diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 56a3ef79cd..9302b31d5c 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -189,7 +189,9 @@ struct ProxyCliArgs { /// cache for `project_info` (use `size=0` to disable) #[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)] project_info_cache: String, - + /// cache for all valid endpoints + #[clap(long, default_value = config::EndpointCacheConfig::CACHE_DEFAULT_OPTIONS)] + endpoint_cache_config: String, #[clap(flatten)] parquet_upload: ParquetUploadArgs, @@ -401,6 +403,7 @@ async fn main() -> anyhow::Result<()> { if let auth::BackendType::Console(api, _) = &config.auth_backend { if let proxy::console::provider::ConsoleBackend::Console(api) = &**api { + maintenance_tasks.spawn(api.locks.garbage_collect_worker()); if let Some(redis_notifications_client) = redis_notifications_client { let cache = api.caches.project_info.clone(); maintenance_tasks.spawn(notifications::task_main( @@ -410,6 +413,9 @@ async fn main() -> anyhow::Result<()> { args.region.clone(), )); maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); + let cache = api.caches.endpoints_cache.clone(); + let con = redis_notifications_client.clone(); + maintenance_tasks.spawn(async move { cache.do_read(con).await }); } } } @@ -489,14 +495,18 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; let project_info_cache_config: ProjectInfoCacheOptions = args.project_info_cache.parse()?; + let endpoint_cache_config: config::EndpointCacheConfig = + args.endpoint_cache_config.parse()?; info!("Using NodeInfoCache (wake_compute) with 
options={wake_compute_cache_config:?}"); info!( "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" ); + info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); let caches = Box::leak(Box::new(console::caches::ApiCaches::new( wake_compute_cache_config, project_info_cache_config, + endpoint_cache_config, ))); let config::WakeComputeLockOptions { @@ -507,10 +517,9 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { } = args.wake_compute_lock.parse()?; info!(permits, shards, ?epoch, "Using NodeLocks (wake_compute)"); let locks = Box::leak(Box::new( - console::locks::ApiLocks::new("wake_compute_lock", permits, shards, timeout) + console::locks::ApiLocks::new("wake_compute_lock", permits, shards, timeout, epoch) .unwrap(), )); - tokio::spawn(locks.garbage_collect_worker(epoch)); let url = args.auth_endpoint.parse()?; let endpoint = http::Endpoint::new(url, http::new_client(rate_limiter_config)); diff --git a/proxy/src/cache.rs b/proxy/src/cache.rs index fc5f416395..d1d4087241 100644 --- a/proxy/src/cache.rs +++ b/proxy/src/cache.rs @@ -1,4 +1,5 @@ pub mod common; +pub mod endpoints; pub mod project_info; mod timed_lru; diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs new file mode 100644 index 0000000000..9bc019c2d8 --- /dev/null +++ b/proxy/src/cache/endpoints.rs @@ -0,0 +1,191 @@ +use std::{ + convert::Infallible, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, +}; + +use dashmap::DashSet; +use redis::{ + streams::{StreamReadOptions, StreamReadReply}, + AsyncCommands, FromRedisValue, Value, +}; +use serde::Deserialize; +use tokio::sync::Mutex; + +use crate::{ + config::EndpointCacheConfig, + context::RequestMonitoring, + intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}, + metrics::REDIS_BROKEN_MESSAGES, + rate_limiter::GlobalRateLimiter, + redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider, + EndpointId, Normalize, +}; + +#[derive(Deserialize, Debug, Clone)] +#[serde(rename_all(deserialize = "snake_case"))] +pub enum ControlPlaneEventKey { + EndpointCreated, + BranchCreated, + ProjectCreated, +} + +pub struct EndpointsCache { + config: EndpointCacheConfig, + endpoints: DashSet, + branches: DashSet, + projects: DashSet, + ready: AtomicBool, + limiter: Arc>, +} + +impl EndpointsCache { + pub fn new(config: EndpointCacheConfig) -> Self { + Self { + limiter: Arc::new(Mutex::new(GlobalRateLimiter::new( + config.limiter_info.clone(), + ))), + config, + endpoints: DashSet::new(), + branches: DashSet::new(), + projects: DashSet::new(), + ready: AtomicBool::new(false), + } + } + pub async fn is_valid(&self, ctx: &mut RequestMonitoring, endpoint: &EndpointId) -> bool { + if !self.ready.load(Ordering::Acquire) { + return true; + } + // If cache is disabled, just collect the metrics and return. + if self.config.disable_cache { + ctx.set_rejected(self.should_reject(endpoint)); + return true; + } + // If the limiter allows, we don't need to check the cache. 
+ if self.limiter.lock().await.check() { + return true; + } + let rejected = self.should_reject(endpoint); + ctx.set_rejected(rejected); + !rejected + } + fn should_reject(&self, endpoint: &EndpointId) -> bool { + let endpoint = endpoint.normalize(); + if endpoint.is_endpoint() { + !self.endpoints.contains(&EndpointIdInt::from(&endpoint)) + } else if endpoint.is_branch() { + !self + .branches + .contains(&BranchIdInt::from(&endpoint.as_branch())) + } else { + !self + .projects + .contains(&ProjectIdInt::from(&endpoint.as_project())) + } + } + fn insert_event(&self, key: ControlPlaneEventKey, value: String) { + // Do not do normalization here, we expect the events to be normalized. + match key { + ControlPlaneEventKey::EndpointCreated => { + self.endpoints.insert(EndpointIdInt::from(&value.into())); + } + ControlPlaneEventKey::BranchCreated => { + self.branches.insert(BranchIdInt::from(&value.into())); + } + ControlPlaneEventKey::ProjectCreated => { + self.projects.insert(ProjectIdInt::from(&value.into())); + } + } + } + pub async fn do_read( + &self, + mut con: ConnectionWithCredentialsProvider, + ) -> anyhow::Result { + let mut last_id = "0-0".to_string(); + loop { + self.ready.store(false, Ordering::Release); + if let Err(e) = con.connect().await { + tracing::error!("error connecting to redis: {:?}", e); + continue; + } + if let Err(e) = self.read_from_stream(&mut con, &mut last_id).await { + tracing::error!("error reading from redis: {:?}", e); + } + } + } + async fn read_from_stream( + &self, + con: &mut ConnectionWithCredentialsProvider, + last_id: &mut String, + ) -> anyhow::Result<()> { + tracing::info!("reading endpoints/branches/projects from redis"); + self.batch_read( + con, + StreamReadOptions::default().count(self.config.initial_batch_size), + last_id, + true, + ) + .await?; + tracing::info!("ready to filter user requests"); + self.ready.store(true, Ordering::Release); + self.batch_read( + con, + StreamReadOptions::default() + .count(self.config.initial_batch_size) + .block(self.config.xread_timeout.as_millis() as usize), + last_id, + false, + ) + .await + } + fn parse_key_value(key: &str, value: &Value) -> anyhow::Result<(ControlPlaneEventKey, String)> { + Ok((serde_json::from_str(key)?, String::from_redis_value(value)?)) + } + async fn batch_read( + &self, + conn: &mut ConnectionWithCredentialsProvider, + opts: StreamReadOptions, + last_id: &mut String, + return_when_finish: bool, + ) -> anyhow::Result<()> { + let mut total: usize = 0; + loop { + let mut res: StreamReadReply = conn + .xread_options(&[&self.config.stream_name], &[last_id.as_str()], &opts) + .await?; + if res.keys.len() != 1 { + anyhow::bail!("Cannot read from redis stream {}", self.config.stream_name); + } + + let res = res.keys.pop().expect("Checked length above"); + + if return_when_finish && res.ids.len() <= self.config.default_batch_size { + break; + } + for x in res.ids { + total += 1; + for (k, v) in x.map { + let (key, value) = match Self::parse_key_value(&k, &v) { + Ok(x) => x, + Err(e) => { + REDIS_BROKEN_MESSAGES + .with_label_values(&[&self.config.stream_name]) + .inc(); + tracing::error!("error parsing key-value {k}-{v:?}: {e:?}"); + continue; + } + }; + self.insert_event(key, value); + } + if total.is_power_of_two() { + tracing::debug!("endpoints read {}", total); + } + *last_id = x.id; + } + } + tracing::info!("read {} endpoints/branches/projects from redis", total); + Ok(()) + } +} diff --git a/proxy/src/config.rs b/proxy/src/config.rs index fc490c7348..3bdfb3cfad 100644 --- 
a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -313,6 +313,75 @@ impl CertResolver { } } +#[derive(Debug)] +pub struct EndpointCacheConfig { + /// Batch size to receive all endpoints on the startup. + pub initial_batch_size: usize, + /// Batch size to receive endpoints. + pub default_batch_size: usize, + /// Timeouts for the stream read operation. + pub xread_timeout: Duration, + /// Stream name to read from. + pub stream_name: String, + /// Limiter info (to distinguish when to enable cache). + pub limiter_info: Vec, + /// Disable cache. + /// If true, cache is ignored, but reports all statistics. + pub disable_cache: bool, +} + +impl EndpointCacheConfig { + /// Default options for [`crate::console::provider::NodeInfoCache`]. + /// Notice that by default the limiter is empty, which means that cache is disabled. + pub const CACHE_DEFAULT_OPTIONS: &'static str = + "initial_batch_size=1000,default_batch_size=10,xread_timeout=5m,stream_name=controlPlane,disable_cache=true,limiter_info=1000@1s"; + + /// Parse cache options passed via cmdline. + /// Example: [`Self::CACHE_DEFAULT_OPTIONS`]. + fn parse(options: &str) -> anyhow::Result { + let mut initial_batch_size = None; + let mut default_batch_size = None; + let mut xread_timeout = None; + let mut stream_name = None; + let mut limiter_info = vec![]; + let mut disable_cache = false; + + for option in options.split(',') { + let (key, value) = option + .split_once('=') + .with_context(|| format!("bad key-value pair: {option}"))?; + + match key { + "initial_batch_size" => initial_batch_size = Some(value.parse()?), + "default_batch_size" => default_batch_size = Some(value.parse()?), + "xread_timeout" => xread_timeout = Some(humantime::parse_duration(value)?), + "stream_name" => stream_name = Some(value.to_string()), + "limiter_info" => limiter_info.push(RateBucketInfo::from_str(value)?), + "disable_cache" => disable_cache = value.parse()?, + unknown => bail!("unknown key: {unknown}"), + } + } + RateBucketInfo::validate(&mut limiter_info)?; + + Ok(Self { + initial_batch_size: initial_batch_size.context("missing `initial_batch_size`")?, + default_batch_size: default_batch_size.context("missing `default_batch_size`")?, + xread_timeout: xread_timeout.context("missing `xread_timeout`")?, + stream_name: stream_name.context("missing `stream_name`")?, + disable_cache, + limiter_info, + }) + } +} + +impl FromStr for EndpointCacheConfig { + type Err = anyhow::Error; + + fn from_str(options: &str) -> Result { + let error = || format!("failed to parse endpoint cache options '{options}'"); + Self::parse(options).with_context(error) + } +} #[derive(Debug)] pub struct MetricBackupCollectionConfig { pub interval: Duration, diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index f7d621fb12..ee2bc866ab 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -8,15 +8,15 @@ use crate::{ backend::{ComputeCredentialKeys, ComputeUserInfo}, IpPattern, }, - cache::{project_info::ProjectInfoCacheImpl, Cached, TimedLru}, + cache::{endpoints::EndpointsCache, project_info::ProjectInfoCacheImpl, Cached, TimedLru}, compute, - config::{CacheOptions, ProjectInfoCacheOptions}, + config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}, context::RequestMonitoring, intern::ProjectIdInt, scram, EndpointCacheKey, }; use dashmap::DashMap; -use std::{sync::Arc, time::Duration}; +use std::{convert::Infallible, sync::Arc, time::Duration}; use tokio::sync::{OwnedSemaphorePermit, Semaphore}; use tokio::time::Instant; use 
tracing::info; @@ -416,12 +416,15 @@ pub struct ApiCaches { pub node_info: NodeInfoCache, /// Cache which stores project_id -> endpoint_ids mapping. pub project_info: Arc, + /// List of all valid endpoints. + pub endpoints_cache: Arc, } impl ApiCaches { pub fn new( wake_compute_cache_config: CacheOptions, project_info_cache_config: ProjectInfoCacheOptions, + endpoint_cache_config: EndpointCacheConfig, ) -> Self { Self { node_info: NodeInfoCache::new( @@ -431,6 +434,7 @@ impl ApiCaches { true, ), project_info: Arc::new(ProjectInfoCacheImpl::new(project_info_cache_config)), + endpoints_cache: Arc::new(EndpointsCache::new(endpoint_cache_config)), } } } @@ -441,6 +445,7 @@ pub struct ApiLocks { node_locks: DashMap>, permits: usize, timeout: Duration, + epoch: std::time::Duration, registered: prometheus::IntCounter, unregistered: prometheus::IntCounter, reclamation_lag: prometheus::Histogram, @@ -453,6 +458,7 @@ impl ApiLocks { permits: usize, shards: usize, timeout: Duration, + epoch: std::time::Duration, ) -> prometheus::Result { let registered = prometheus::IntCounter::with_opts( prometheus::Opts::new( @@ -497,6 +503,7 @@ impl ApiLocks { node_locks: DashMap::with_shard_amount(shards), permits, timeout, + epoch, lock_acquire_lag, registered, unregistered, @@ -536,12 +543,9 @@ impl ApiLocks { }) } - pub async fn garbage_collect_worker(&self, epoch: std::time::Duration) { - if self.permits == 0 { - return; - } - - let mut interval = tokio::time::interval(epoch / (self.node_locks.shards().len()) as u32); + pub async fn garbage_collect_worker(&self) -> anyhow::Result { + let mut interval = + tokio::time::interval(self.epoch / (self.node_locks.shards().len()) as u32); loop { for (i, shard) in self.node_locks.shards().iter().enumerate() { interval.tick().await; diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 1a3e2ca795..3a0e5609d8 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -8,6 +8,7 @@ use super::{ }; use crate::{ auth::backend::ComputeUserInfo, compute, console::messages::ColdStartInfo, http, scram, + Normalize, }; use crate::{ cache::Cached, @@ -23,7 +24,7 @@ use tracing::{error, info, info_span, warn, Instrument}; pub struct Api { endpoint: http::Endpoint, pub caches: &'static ApiCaches, - locks: &'static ApiLocks, + pub locks: &'static ApiLocks, jwt: String, } @@ -55,6 +56,15 @@ impl Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { + if !self + .caches + .endpoints_cache + .is_valid(ctx, &user_info.endpoint) + .await + { + info!("endpoint is not valid, skipping the request"); + return Ok(AuthInfo::default()); + } let request_id = ctx.session_id.to_string(); let application_name = ctx.console_application_name(); async { @@ -81,7 +91,9 @@ impl Api { Ok(body) => body, // Error 404 is special: it's ok not to have a secret. 
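// A NOT_FOUND from the control plane is therefore mapped to an empty `AuthInfo` below, not to an error.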
Err(e) => match e.http_status_code() { - Some(http::StatusCode::NOT_FOUND) => return Ok(AuthInfo::default()), + Some(http::StatusCode::NOT_FOUND) => { + return Ok(AuthInfo::default()); + } _otherwise => return Err(e.into()), }, }; @@ -181,7 +193,7 @@ impl super::Api for Api { } let auth_info = self.do_get_auth_info(ctx, user_info).await?; if let Some(project_id) = auth_info.project_id { - let ep_int = ep.into(); + let ep_int = ep.normalize().into(); self.caches.project_info.insert_role_secret( project_id, ep_int, @@ -218,7 +230,7 @@ impl super::Api for Api { let allowed_ips = Arc::new(auth_info.allowed_ips); let user = &user_info.user; if let Some(project_id) = auth_info.project_id { - let ep_int = ep.into(); + let ep_int = ep.normalize().into(); self.caches.project_info.insert_role_secret( project_id, ep_int, diff --git a/proxy/src/context.rs b/proxy/src/context.rs index fec95f4722..85544f1d65 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -12,7 +12,9 @@ use crate::{ console::messages::{ColdStartInfo, MetricsAuxInfo}, error::ErrorKind, intern::{BranchIdInt, ProjectIdInt}, - metrics::{LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND}, + metrics::{ + bool_to_str, LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND, NUM_INVALID_ENDPOINTS, + }, DbName, EndpointId, RoleName, }; @@ -50,6 +52,8 @@ pub struct RequestMonitoring { // This sender is here to keep the request monitoring channel open while requests are taking place. sender: Option>, pub latency_timer: LatencyTimer, + // Whether proxy decided that it's not a valid endpoint end rejected it before going to cplane. + rejected: bool, } #[derive(Clone, Debug)] @@ -93,6 +97,7 @@ impl RequestMonitoring { error_kind: None, auth_method: None, success: false, + rejected: false, cold_start_info: ColdStartInfo::Unknown, sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()), @@ -113,6 +118,10 @@ impl RequestMonitoring { ) } + pub fn set_rejected(&mut self, rejected: bool) { + self.rejected = rejected; + } + pub fn set_cold_start_info(&mut self, info: ColdStartInfo) { self.cold_start_info = info; self.latency_timer.cold_start_info(info); @@ -178,6 +187,10 @@ impl RequestMonitoring { impl Drop for RequestMonitoring { fn drop(&mut self) { + let outcome = if self.success { "success" } else { "failure" }; + NUM_INVALID_ENDPOINTS + .with_label_values(&[self.protocol, bool_to_str(self.rejected), outcome]) + .inc(); if let Some(tx) = self.sender.take() { let _: Result<(), _> = tx.send(RequestData::from(&*self)); } diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs index a6519bdff9..e38135dd22 100644 --- a/proxy/src/intern.rs +++ b/proxy/src/intern.rs @@ -160,6 +160,11 @@ impl From<&EndpointId> for EndpointIdInt { EndpointIdTag::get_interner().get_or_intern(value) } } +impl From for EndpointIdInt { + fn from(value: EndpointId) -> Self { + EndpointIdTag::get_interner().get_or_intern(&value) + } +} #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub struct BranchIdTag; @@ -175,6 +180,11 @@ impl From<&BranchId> for BranchIdInt { BranchIdTag::get_interner().get_or_intern(value) } } +impl From for BranchIdInt { + fn from(value: BranchId) -> Self { + BranchIdTag::get_interner().get_or_intern(&value) + } +} #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub struct ProjectIdTag; @@ -190,6 +200,11 @@ impl From<&ProjectId> for ProjectIdInt { ProjectIdTag::get_interner().get_or_intern(value) } } +impl From for ProjectIdInt { + fn from(value: ProjectId) -> Self { + ProjectIdTag::get_interner().get_or_intern(&value) + } +} #[cfg(test)] mod 
tests { diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index da7c7f3ed2..3f6d985fe8 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -127,6 +127,24 @@ macro_rules! smol_str_wrapper { }; } +const POOLER_SUFFIX: &str = "-pooler"; + +pub trait Normalize { + fn normalize(&self) -> Self; +} + +impl + From> Normalize for S { + fn normalize(&self) -> Self { + if self.as_ref().ends_with(POOLER_SUFFIX) { + let mut s = self.as_ref().to_string(); + s.truncate(s.len() - POOLER_SUFFIX.len()); + s.into() + } else { + self.clone() + } + } +} + // 90% of role name strings are 20 characters or less. smol_str_wrapper!(RoleName); // 50% of endpoint strings are 23 characters or less. @@ -140,3 +158,22 @@ smol_str_wrapper!(ProjectId); smol_str_wrapper!(EndpointCacheKey); smol_str_wrapper!(DbName); + +// Endpoints are a bit tricky. Rare they might be branches or projects. +impl EndpointId { + pub fn is_endpoint(&self) -> bool { + self.0.starts_with("ep-") + } + pub fn is_branch(&self) -> bool { + self.0.starts_with("br-") + } + pub fn is_project(&self) -> bool { + !self.is_endpoint() && !self.is_branch() + } + pub fn as_branch(&self) -> BranchId { + BranchId(self.0.clone()) + } + pub fn as_project(&self) -> ProjectId { + ProjectId(self.0.clone()) + } +} diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 59ee899c08..f299313e0a 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -169,6 +169,18 @@ pub static NUM_CANCELLATION_REQUESTS: Lazy = Lazy::new(|| { .unwrap() }); +pub static NUM_INVALID_ENDPOINTS: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "proxy_invalid_endpoints_total", + "Number of invalid endpoints (per protocol, per rejected).", + // http/ws/tcp, true/false, success/failure + // TODO(anna): the last dimension is just a proxy to what we actually want to measure. + // We need to measure whether the endpoint was found by cplane or not. 
+ &["protocol", "rejected", "outcome"], + ) + .unwrap() +}); + pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT: &str = "from_client"; pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS: &str = "from_redis"; diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 6051c0a812..166e761a4e 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -20,7 +20,7 @@ use crate::{ proxy::handshake::{handshake, HandshakeData}, rate_limiter::EndpointRateLimiter, stream::{PqStream, Stream}, - EndpointCacheKey, + EndpointCacheKey, Normalize, }; use futures::TryFutureExt; use itertools::Itertools; @@ -280,7 +280,7 @@ pub async fn handle_client( // check rate limit if let Some(ep) = user_info.get_endpoint() { - if !endpoint_rate_limiter.check(ep, 1) { + if !endpoint_rate_limiter.check(ep.normalize(), 1) { return stream .throw_error(auth::AuthError::too_many_connections()) .await?; diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs index 13dffffca0..a3b83e5e50 100644 --- a/proxy/src/rate_limiter.rs +++ b/proxy/src/rate_limiter.rs @@ -4,4 +4,4 @@ mod limiter; pub use aimd::Aimd; pub use limit_algorithm::{AimdConfig, Fixed, RateLimitAlgorithm, RateLimiterConfig}; pub use limiter::Limiter; -pub use limiter::{AuthRateLimiter, EndpointRateLimiter, RateBucketInfo, RedisRateLimiter}; +pub use limiter::{AuthRateLimiter, EndpointRateLimiter, GlobalRateLimiter, RateBucketInfo}; diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index f590896dd9..0503deb311 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -24,13 +24,13 @@ use super::{ RateLimiterConfig, }; -pub struct RedisRateLimiter { +pub struct GlobalRateLimiter { data: Vec, - info: &'static [RateBucketInfo], + info: Vec, } -impl RedisRateLimiter { - pub fn new(info: &'static [RateBucketInfo]) -> Self { +impl GlobalRateLimiter { + pub fn new(info: Vec) -> Self { Self { data: vec![ RateBucket { @@ -50,7 +50,7 @@ impl RedisRateLimiter { let should_allow_request = self .data .iter_mut() - .zip(self.info) + .zip(&self.info) .all(|(bucket, info)| bucket.should_allow_request(info, now, 1)); if should_allow_request { diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs index 422789813c..7baf104374 100644 --- a/proxy/src/redis/cancellation_publisher.rs +++ b/proxy/src/redis/cancellation_publisher.rs @@ -5,7 +5,7 @@ use redis::AsyncCommands; use tokio::sync::Mutex; use uuid::Uuid; -use crate::rate_limiter::{RateBucketInfo, RedisRateLimiter}; +use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo}; use super::{ connection_with_credentials_provider::ConnectionWithCredentialsProvider, @@ -80,7 +80,7 @@ impl CancellationPublisher for Arc> { pub struct RedisPublisherClient { client: ConnectionWithCredentialsProvider, region_id: String, - limiter: RedisRateLimiter, + limiter: GlobalRateLimiter, } impl RedisPublisherClient { @@ -92,7 +92,7 @@ impl RedisPublisherClient { Ok(Self { client, region_id, - limiter: RedisRateLimiter::new(info), + limiter: GlobalRateLimiter::new(info.into()), }) } diff --git a/test_runner/regress/test_proxy_rate_limiter.py b/test_runner/regress/test_proxy_rate_limiter.py deleted file mode 100644 index f39f0cad07..0000000000 --- a/test_runner/regress/test_proxy_rate_limiter.py +++ /dev/null @@ -1,84 +0,0 @@ -import asyncio -import time -from pathlib import Path -from typing import Iterator - -import pytest -from fixtures.neon_fixtures import ( - PSQL, - NeonProxy, -) -from fixtures.port_distributor 
import PortDistributor -from pytest_httpserver import HTTPServer -from werkzeug.wrappers.response import Response - - -def waiting_handler(status_code: int) -> Response: - # wait more than timeout to make sure that both (two) connections are open. - # It would be better to use a barrier here, but I don't know how to do that together with pytest-httpserver. - time.sleep(2) - return Response(status=status_code) - - -@pytest.fixture(scope="function") -def proxy_with_rate_limit( - port_distributor: PortDistributor, - neon_binpath: Path, - httpserver_listen_address, - test_output_dir: Path, -) -> Iterator[NeonProxy]: - """Neon proxy that routes directly to vanilla postgres.""" - - proxy_port = port_distributor.get_port() - mgmt_port = port_distributor.get_port() - http_port = port_distributor.get_port() - external_http_port = port_distributor.get_port() - (host, port) = httpserver_listen_address - endpoint = f"http://{host}:{port}/billing/api/v1/usage_events" - - with NeonProxy( - neon_binpath=neon_binpath, - test_output_dir=test_output_dir, - proxy_port=proxy_port, - http_port=http_port, - mgmt_port=mgmt_port, - external_http_port=external_http_port, - auth_backend=NeonProxy.Console(endpoint, fixed_rate_limit=5), - ) as proxy: - proxy.start() - yield proxy - - -@pytest.mark.asyncio -async def test_proxy_rate_limit( - httpserver: HTTPServer, - proxy_with_rate_limit: NeonProxy, -): - uri = "/billing/api/v1/usage_events/proxy_get_role_secret" - # mock control plane service - httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( - lambda _: Response(status=200) - ) - httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( - lambda _: waiting_handler(429) - ) - httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( - lambda _: waiting_handler(500) - ) - - psql = PSQL(host=proxy_with_rate_limit.host, port=proxy_with_rate_limit.proxy_port) - f = await psql.run("select 42;") - await proxy_with_rate_limit.find_auth_link(uri, f) - # Limit should be 2. - - # Run two queries in parallel. - f1, f2 = await asyncio.gather(psql.run("select 42;"), psql.run("select 42;")) - await proxy_with_rate_limit.find_auth_link(uri, f1) - await proxy_with_rate_limit.find_auth_link(uri, f2) - - # Now limit should be 0. - f = await psql.run("select 42;") - await proxy_with_rate_limit.find_auth_link(uri, f) - - # There last query shouldn't reach the http-server. - assert httpserver.assertions == [] From 221414de4b0260056e0961528d46c5141825a0a0 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 10 Apr 2024 06:31:28 +0100 Subject: [PATCH 0546/1571] pageserver: time based rolling based on the first write timestamp (#7346) Problem Currently, we base our time based layer rolling decision on the last time we froze a layer. This means that if we roll a layer and then go idle for longer than the checkpoint timeout the next layer will be rolled after the first write. This is of course not desirable. Summary of changes Record the timepoint of the first write to an open layer and use that for time based layer rolling decisions. Note that I had to keep `Timeline::last_freeze_ts` for the sharded tenant disk consistent lsn skip hack. 
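As a reading aid, here is a minimal sketch of the time-based half of the roll decision after this change (simplified: the real check in `Timeline` also rolls on layer size and LSN distance, and works with `Lsn` values rather than a plain byte count; names here are illustrative, not the actual function):

```rust
use std::time::{Duration, Instant};

/// Roll the open in-memory layer only if something was written to it *and* the
/// layer has been open for longer than the checkpoint timeout.
fn should_roll_for_time(
    wal_since_last_freeze: u64, // projected LSN minus the LSN of the last freeze
    opened_at: Instant,         // recorded when the open layer is created, i.e. at its first write
    checkpoint_timeout: Duration,
) -> bool {
    wal_since_last_freeze > 0 && opened_at.elapsed() >= checkpoint_timeout
}
```

Previously the same branch compared against `last_freeze_ts`, which is why a single write after an idle period longer than the checkpoint timeout would immediately roll the freshly opened layer.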
Fixes #7241 --- .../tenant/storage_layer/inmemory_layer.rs | 8 +++ pageserver/src/tenant/timeline.rs | 29 ++++------- .../regress/test_pageserver_layer_rolling.py | 50 ++++++++++++++++--- 3 files changed, 62 insertions(+), 25 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 43942ba2db..29751641b4 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -19,6 +19,7 @@ use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; use std::collections::{BinaryHeap, HashMap, HashSet}; use std::sync::{Arc, OnceLock}; +use std::time::Instant; use tracing::*; use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap}; // avoid binding to Write (conflicts with std::io::Write) @@ -53,6 +54,8 @@ pub struct InMemoryLayer { /// Writes are only allowed when this is `None`. end_lsn: OnceLock, + opened_at: Instant, + /// The above fields never change, except for `end_lsn`, which is only set once. /// All other changing parts are in `inner`, and protected by a mutex. inner: RwLock, @@ -460,6 +463,7 @@ impl InMemoryLayer { tenant_shard_id, start_lsn, end_lsn: OnceLock::new(), + opened_at: Instant::now(), inner: RwLock::new(InMemoryLayerInner { index: HashMap::new(), file, @@ -520,6 +524,10 @@ impl InMemoryLayer { Ok(()) } + pub(crate) fn get_opened_at(&self) -> Instant { + self.opened_at + } + pub(crate) async fn tick(&self) -> Option { let mut inner = self.inner.write().await; let size = inner.file.len(); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d3c8c5f66c..d046a60af4 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1257,7 +1257,7 @@ impl Timeline { checkpoint_distance, self.get_last_record_lsn(), self.last_freeze_at.load(), - *self.last_freeze_ts.read().unwrap(), + open_layer.get_opened_at(), ) { match open_layer.info() { InMemoryLayerInfo::Frozen { lsn_start, lsn_end } => { @@ -1622,7 +1622,7 @@ impl Timeline { checkpoint_distance: u64, projected_lsn: Lsn, last_freeze_at: Lsn, - last_freeze_ts: Instant, + opened_at: Instant, ) -> bool { let distance = projected_lsn.widening_sub(last_freeze_at); @@ -1648,13 +1648,13 @@ impl Timeline { ); true - } else if distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() { + } else if distance > 0 && opened_at.elapsed() >= self.get_checkpoint_timeout() { info!( - "Will roll layer at {} with layer size {} due to time since last flush ({:?})", - projected_lsn, - layer_size, - last_freeze_ts.elapsed() - ); + "Will roll layer at {} with layer size {} due to time since first write to the layer ({:?})", + projected_lsn, + layer_size, + opened_at.elapsed() + ); true } else { @@ -4703,23 +4703,16 @@ struct TimelineWriterState { max_lsn: Option, // Cached details of the last freeze. Avoids going trough the atomic/lock on every put. 
cached_last_freeze_at: Lsn, - cached_last_freeze_ts: Instant, } impl TimelineWriterState { - fn new( - open_layer: Arc, - current_size: u64, - last_freeze_at: Lsn, - last_freeze_ts: Instant, - ) -> Self { + fn new(open_layer: Arc, current_size: u64, last_freeze_at: Lsn) -> Self { Self { open_layer, current_size, prev_lsn: None, max_lsn: None, cached_last_freeze_at: last_freeze_at, - cached_last_freeze_ts: last_freeze_ts, } } } @@ -4818,12 +4811,10 @@ impl<'a> TimelineWriter<'a> { let initial_size = layer.size().await?; let last_freeze_at = self.last_freeze_at.load(); - let last_freeze_ts = *self.last_freeze_ts.read().unwrap(); self.write_guard.replace(TimelineWriterState::new( layer, initial_size, last_freeze_at, - last_freeze_ts, )); Ok(()) @@ -4870,7 +4861,7 @@ impl<'a> TimelineWriter<'a> { self.get_checkpoint_distance(), lsn, state.cached_last_freeze_at, - state.cached_last_freeze_ts, + state.open_layer.get_opened_at(), ) { OpenLayerAction::Roll } else { diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py index c7e1e88468..c5dc0f2919 100644 --- a/test_runner/regress/test_pageserver_layer_rolling.py +++ b/test_runner/regress/test_pageserver_layer_rolling.py @@ -1,6 +1,7 @@ import asyncio import os -from typing import Tuple +import time +from typing import Optional, Tuple import psutil import pytest @@ -20,20 +21,30 @@ ENTRIES_PER_TIMELINE = 10_000 CHECKPOINT_TIMEOUT_SECONDS = 60 -async def run_worker(env: NeonEnv, tenant_conf, entries: int) -> Tuple[TenantId, TimelineId, Lsn]: - tenant, timeline = env.neon_cli.create_tenant(conf=tenant_conf) +async def run_worker_for_tenant( + env: NeonEnv, entries: int, tenant: TenantId, offset: Optional[int] = None +) -> Lsn: + if offset is None: + offset = 0 + with env.endpoints.create_start("main", tenant_id=tenant) as ep: conn = await ep.connect_async() try: await conn.execute("CREATE TABLE IF NOT EXISTS t(key serial primary key, value text)") await conn.execute( - f"INSERT INTO t SELECT i, CONCAT('payload_', i) FROM generate_series(0,{entries}) as i" + f"INSERT INTO t SELECT i, CONCAT('payload_', i) FROM generate_series({offset},{entries}) as i" ) finally: await conn.close(timeout=10) last_flush_lsn = Lsn(ep.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) - return tenant, timeline, last_flush_lsn + return last_flush_lsn + + +async def run_worker(env: NeonEnv, tenant_conf, entries: int) -> Tuple[TenantId, TimelineId, Lsn]: + tenant, timeline = env.neon_cli.create_tenant(conf=tenant_conf) + last_flush_lsn = await run_worker_for_tenant(env, entries, tenant) + return tenant, timeline, last_flush_lsn async def workload( @@ -89,7 +100,9 @@ def assert_dirty_bytes(env, v): def assert_dirty_bytes_nonzero(env): - assert get_dirty_bytes(env) > 0 + dirty_bytes = get_dirty_bytes(env) + assert dirty_bytes > 0 + return dirty_bytes @pytest.mark.parametrize("immediate_shutdown", [True, False]) @@ -182,6 +195,31 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder): log.info("Waiting for background checkpoints...") wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0)) # type: ignore + # The code below verifies that we do not flush on the first write + # after an idle period longer than the checkpoint timeout. 
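+    # Rolling is now keyed off the first write into the open layer rather than the
+    # last freeze, so being idle for longer than the checkpoint timeout on its own
+    # must not cause the next write to be flushed immediately.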
+ + # Sit quietly for longer than the checkpoint timeout + time.sleep(CHECKPOINT_TIMEOUT_SECONDS + CHECKPOINT_TIMEOUT_SECONDS / 2) + + # Restart the safekeepers and write a bit of extra data into one tenant + for sk in env.safekeepers: + sk.start() + + tenant_with_extra_writes = last_flush_lsns[0][0] + asyncio.run( + run_worker_for_tenant(env, 5, tenant_with_extra_writes, offset=ENTRIES_PER_TIMELINE) + ) + + dirty_after_write = wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) # type: ignore + + # We shouldn't flush since we've just opened a new layer + waited_for = 0 + while waited_for < CHECKPOINT_TIMEOUT_SECONDS // 4: + time.sleep(5) + waited_for += 5 + + assert get_dirty_bytes(env) >= dirty_after_write + @pytest.mark.skipif( # We have to use at least ~100MB of data to hit the lowest limit we can configure, which is From fd88d4608c3e8a8cb8579786a7b507a436033efc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 10 Apr 2024 09:12:07 +0200 Subject: [PATCH 0547/1571] Add command to time travel recover prefixes (#7322) Adds another tool to the DR toolbox: ability in pagectl to recover arbitrary prefixes in remote storage. Requires remote storage config, the prefix, and the travel-to timestamp parameter to be specified as cli args. The done-if-after parameter is also supported. Example invocation (after `aws login --profile dev`): ``` RUST_LOG=remote_storage=debug AWS_PROFILE=dev cargo run -p pagectl time-travel-remote-prefix 'remote_storage = { bucket_name = "neon-test-bucket-name", bucket_region = "us-east-2" }' wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/ 2024-04-05T17:00:00Z ``` This has been written to resolve a customer recovery case: https://neondb.slack.com/archives/C033RQ5SPDH/p1712256888468009 There is validation of the prefix to prevent accidentially specifying too generic prefixes, which can cause corruption and data loss if used wrongly. Still, the validation is not perfect and it is important that the command is used with caution. If possible, `time_travel_remote_storage` should be used instead which has additional checks in place. --- Cargo.lock | 5 ++ pageserver/ctl/Cargo.toml | 5 ++ pageserver/ctl/src/main.rs | 166 ++++++++++++++++++++++++++++++++++++- 3 files changed, 175 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index a7e29b1de3..4c2bcf250e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3477,12 +3477,17 @@ dependencies = [ "camino", "clap", "git-version", + "humantime", "pageserver", + "pageserver_api", "postgres_ffi", + "remote_storage", "serde", "serde_json", "svg_fmt", "tokio", + "tokio-util", + "toml_edit", "utils", "workspace_hack", ] diff --git a/pageserver/ctl/Cargo.toml b/pageserver/ctl/Cargo.toml index c5cd451e8d..843f5dd862 100644 --- a/pageserver/ctl/Cargo.toml +++ b/pageserver/ctl/Cargo.toml @@ -12,9 +12,14 @@ bytes.workspace = true camino.workspace = true clap = { workspace = true, features = ["string"] } git-version.workspace = true +humantime.workspace = true pageserver = { path = ".." 
} +pageserver_api.workspace = true +remote_storage = { path = "../../libs/remote_storage" } postgres_ffi.workspace = true tokio.workspace = true +tokio-util.workspace = true +toml_edit.workspace = true utils.workspace = true svg_fmt.workspace = true workspace_hack.workspace = true diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index e73d961e36..1fb75584fc 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -9,6 +9,11 @@ mod index_part; mod layer_map_analyzer; mod layers; +use std::{ + str::FromStr, + time::{Duration, SystemTime}, +}; + use camino::{Utf8Path, Utf8PathBuf}; use clap::{Parser, Subcommand}; use index_part::IndexPartCmd; @@ -20,8 +25,16 @@ use pageserver::{ tenant::{dump_layerfile_from_path, metadata::TimelineMetadata}, virtual_file, }; +use pageserver_api::shard::TenantShardId; use postgres_ffi::ControlFileData; -use utils::{lsn::Lsn, project_git_version}; +use remote_storage::{RemotePath, RemoteStorageConfig}; +use tokio_util::sync::CancellationToken; +use utils::{ + id::TimelineId, + logging::{self, LogFormat, TracingErrorLayerEnablement}, + lsn::Lsn, + project_git_version, +}; project_git_version!(GIT_VERSION); @@ -43,6 +56,7 @@ enum Commands { #[command(subcommand)] IndexPart(IndexPartCmd), PrintLayerFile(PrintLayerFileCmd), + TimeTravelRemotePrefix(TimeTravelRemotePrefixCmd), DrawTimeline {}, AnalyzeLayerMap(AnalyzeLayerMapCmd), #[command(subcommand)] @@ -68,6 +82,26 @@ struct PrintLayerFileCmd { path: Utf8PathBuf, } +/// Roll back the time for the specified prefix using S3 history. +/// +/// The command is fairly low level and powerful. Validation is only very light, +/// so it is more powerful, and thus potentially more dangerous. +#[derive(Parser)] +struct TimeTravelRemotePrefixCmd { + /// A configuration string for the remote_storage configuration. + /// + /// Example: `remote_storage = { bucket_name = "aws-storage-bucket-name", bucket_region = "us-east-2" }` + config_toml_str: String, + /// remote prefix to time travel recover. For safety reasons, we require it to contain + /// a timeline or tenant ID in the prefix. + prefix: String, + /// Timestamp to travel to. Given in format like `2024-01-20T10:45:45Z`. Assumes UTC and second accuracy. + travel_to: String, + /// Timestamp of the start of the operation, must be after any changes we want to roll back and after. + /// You can use a few seconds before invoking the command. Same format as `travel_to`. + done_if_after: Option, +} + #[derive(Parser)] struct AnalyzeLayerMapCmd { /// Pageserver data path @@ -78,6 +112,14 @@ struct AnalyzeLayerMapCmd { #[tokio::main] async fn main() -> anyhow::Result<()> { + logging::init( + LogFormat::Plain, + TracingErrorLayerEnablement::EnableWithRustLogFilter, + logging::Output::Stdout, + )?; + + logging::replace_panic_hook_with_tracing_panic_hook().forget(); + let cli = CliOpts::parse(); match cli.command { @@ -105,6 +147,42 @@ async fn main() -> anyhow::Result<()> { print_layerfile(&cmd.path).await?; } } + Commands::TimeTravelRemotePrefix(cmd) => { + let timestamp = humantime::parse_rfc3339(&cmd.travel_to) + .map_err(|_e| anyhow::anyhow!("Invalid time for travel_to: '{}'", cmd.travel_to))?; + + let done_if_after = if let Some(done_if_after) = &cmd.done_if_after { + humantime::parse_rfc3339(done_if_after).map_err(|_e| { + anyhow::anyhow!("Invalid time for done_if_after: '{}'", done_if_after) + })? 
+ } else { + const SAFETY_MARGIN: Duration = Duration::from_secs(3); + tokio::time::sleep(SAFETY_MARGIN).await; + // Convert to string representation and back to get rid of sub-second values + let done_if_after = SystemTime::now(); + tokio::time::sleep(SAFETY_MARGIN).await; + done_if_after + }; + + let timestamp = strip_subsecond(timestamp); + let done_if_after = strip_subsecond(done_if_after); + + let Some(prefix) = validate_prefix(&cmd.prefix) else { + println!("specified prefix '{}' failed validation", cmd.prefix); + return Ok(()); + }; + let toml_document = toml_edit::Document::from_str(&cmd.config_toml_str)?; + let toml_item = toml_document + .get("remote_storage") + .expect("need remote_storage"); + let config = RemoteStorageConfig::from_toml(toml_item)?.expect("incomplete config"); + let storage = remote_storage::GenericRemoteStorage::from_config(&config); + let cancel = CancellationToken::new(); + storage + .unwrap() + .time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel) + .await?; + } }; Ok(()) } @@ -185,3 +263,89 @@ fn handle_metadata( Ok(()) } + +/// Ensures that the given S3 prefix is sufficiently constrained. +/// The command is very risky already and we don't want to expose something +/// that allows usually unintentional and quite catastrophic time travel of +/// an entire bucket, which would be a major catastrophy and away +/// by only one character change (similar to "rm -r /home /username/foobar"). +fn validate_prefix(prefix: &str) -> Option { + if prefix.is_empty() { + // Empty prefix means we want to specify the *whole* bucket + return None; + } + let components = prefix.split('/').collect::>(); + let (last, components) = { + let last = components.last()?; + if last.is_empty() { + ( + components.iter().nth_back(1)?, + &components[..(components.len() - 1)], + ) + } else { + (last, &components[..]) + } + }; + 'valid: { + if let Ok(_timeline_id) = TimelineId::from_str(last) { + // Ends in either a tenant or timeline ID + break 'valid; + } + if *last == "timelines" { + if let Some(before_last) = components.iter().nth_back(1) { + if let Ok(_tenant_id) = TenantShardId::from_str(before_last) { + // Has a valid tenant id + break 'valid; + } + } + } + + return None; + } + RemotePath::from_string(prefix).ok() +} + +fn strip_subsecond(timestamp: SystemTime) -> SystemTime { + let ts_str = humantime::format_rfc3339_seconds(timestamp).to_string(); + humantime::parse_rfc3339(&ts_str).expect("can't parse just created timestamp") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_validate_prefix() { + assert_eq!(validate_prefix(""), None); + assert_eq!(validate_prefix("/"), None); + #[track_caller] + fn assert_valid(prefix: &str) { + let remote_path = RemotePath::from_string(prefix).unwrap(); + assert_eq!(validate_prefix(prefix), Some(remote_path)); + } + assert_valid("wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/"); + // Path is not relative but absolute + assert_eq!( + validate_prefix( + "/wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/" + ), + None + ); + assert_valid("wal/3aa8fcc61f6d357410b7de754b1d9001/"); + // Partial tenant IDs should be invalid, S3 will match all tenants with the specific ID prefix + assert_eq!(validate_prefix("wal/3aa8fcc61f6d357410b7d"), None); + assert_eq!(validate_prefix("wal"), None); + assert_eq!(validate_prefix("/wal/"), None); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001"); + // Partial tenant ID + assert_eq!( + 
validate_prefix("pageserver/v1/tenants/3aa8fcc61f6d357410b"), + None + ); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines"); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001-0004/timelines"); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/"); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/641e5342083b2235ee3deb8066819683"); + assert_eq!(validate_prefix("pageserver/v1/tenants/"), None); + } +} From c0ff4f18dcb60d2b8035a8d83b693e5e81ceaeff Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 10 Apr 2024 09:23:59 +0100 Subject: [PATCH 0548/1571] proxy: hyper1 for only proxy (#7073) ## Problem hyper1 offers control over the HTTP connection that hyper0_14 does not. We're blocked on switching all services to hyper1 because of how we use tonic, but no reason we can't switch proxy over. ## Summary of changes 1. hyper0.14 -> hyper1 1. self managed server 2. Remove the `WithConnectionGuard` wrapper from `protocol2` 2. Remove TLS listener as it's no longer necessary 3. include first session ID in connection startup logs --- Cargo.lock | 214 +++++++++++++---- Cargo.toml | 3 +- proxy/Cargo.toml | 4 + proxy/src/protocol2.rs | 105 +-------- proxy/src/serverless.rs | 315 ++++++++++++++------------ proxy/src/serverless/http_util.rs | 92 ++++++++ proxy/src/serverless/sql_over_http.rs | 44 ++-- proxy/src/serverless/tls_listener.rs | 123 ---------- workspace_hack/Cargo.toml | 3 +- 9 files changed, 458 insertions(+), 445 deletions(-) create mode 100644 proxy/src/serverless/http_util.rs delete mode 100644 proxy/src/serverless/tls_listener.rs diff --git a/Cargo.lock b/Cargo.lock index 4c2bcf250e..bdf2b08c5c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -270,6 +270,12 @@ dependencies = [ "critical-section", ] +[[package]] +name = "atomic-take" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8ab6b55fe97976e46f91ddbed8d147d966475dc29b2032757ba47e02376fbc3" + [[package]] name = "autocfg" version = "1.1.0" @@ -298,7 +304,7 @@ dependencies = [ "fastrand 2.0.0", "hex", "http 0.2.9", - "hyper", + "hyper 0.14.26", "ring 0.17.6", "time", "tokio", @@ -335,7 +341,7 @@ dependencies = [ "bytes", "fastrand 2.0.0", "http 0.2.9", - "http-body", + "http-body 0.4.5", "percent-encoding", "pin-project-lite", "tracing", @@ -386,7 +392,7 @@ dependencies = [ "aws-types", "bytes", "http 0.2.9", - "http-body", + "http-body 0.4.5", "once_cell", "percent-encoding", "regex-lite", @@ -514,7 +520,7 @@ dependencies = [ "crc32fast", "hex", "http 0.2.9", - "http-body", + "http-body 0.4.5", "md-5", "pin-project-lite", "sha1", @@ -546,7 +552,7 @@ dependencies = [ "bytes-utils", "futures-core", "http 0.2.9", - "http-body", + "http-body 0.4.5", "once_cell", "percent-encoding", "pin-project-lite", @@ -585,10 +591,10 @@ dependencies = [ "aws-smithy-types", "bytes", "fastrand 2.0.0", - "h2", + "h2 0.3.26", "http 0.2.9", - "http-body", - "hyper", + "http-body 0.4.5", + "hyper 0.14.26", "hyper-rustls", "once_cell", "pin-project-lite", @@ -626,7 +632,7 @@ dependencies = [ "bytes-utils", "futures-core", "http 0.2.9", - "http-body", + "http-body 0.4.5", "itoa", "num-integer", "pin-project-lite", @@ -675,8 +681,8 @@ dependencies = [ "bytes", "futures-util", "http 0.2.9", - "http-body", - "hyper", + "http-body 0.4.5", + "hyper 0.14.26", "itoa", "matchit", "memchr", @@ -691,7 +697,7 @@ dependencies = [ "sha1", "sync_wrapper", "tokio", - "tokio-tungstenite", + "tokio-tungstenite 0.20.0", 
"tower", "tower-layer", "tower-service", @@ -707,7 +713,7 @@ dependencies = [ "bytes", "futures-util", "http 0.2.9", - "http-body", + "http-body 0.4.5", "mime", "rustversion", "tower-layer", @@ -1196,7 +1202,7 @@ dependencies = [ "compute_api", "flate2", "futures", - "hyper", + "hyper 0.14.26", "nix 0.27.1", "notify", "num_cpus", @@ -1313,7 +1319,7 @@ dependencies = [ "git-version", "hex", "humantime", - "hyper", + "hyper 0.14.26", "nix 0.27.1", "once_cell", "pageserver_api", @@ -2199,6 +2205,25 @@ dependencies = [ "tracing", ] +[[package]] +name = "h2" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "816ec7294445779408f36fe57bc5b7fc1cf59664059096c65f905c1c61f58069" +dependencies = [ + "bytes", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http 1.1.0", + "indexmap 2.0.1", + "slab", + "tokio", + "tokio-util", + "tracing", +] + [[package]] name = "half" version = "1.8.2" @@ -2370,6 +2395,29 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "http-body" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cac85db508abc24a2e48553ba12a996e87244a0395ce011e62b37158745d643" +dependencies = [ + "bytes", + "http 1.1.0", +] + +[[package]] +name = "http-body-util" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41cb79eb393015dadd30fc252023adb0b2400a0caee0fa2a077e6e21a551e840" +dependencies = [ + "bytes", + "futures-util", + "http 1.1.0", + "http-body 1.0.0", + "pin-project-lite", +] + [[package]] name = "http-types" version = "2.12.0" @@ -2428,9 +2476,9 @@ dependencies = [ "futures-channel", "futures-core", "futures-util", - "h2", + "h2 0.3.26", "http 0.2.9", - "http-body", + "http-body 0.4.5", "httparse", "httpdate", "itoa", @@ -2442,6 +2490,26 @@ dependencies = [ "want", ] +[[package]] +name = "hyper" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "186548d73ac615b32a73aafe38fb4f56c0d340e110e5a200bcadbaf2e199263a" +dependencies = [ + "bytes", + "futures-channel", + "futures-util", + "h2 0.4.4", + "http 1.1.0", + "http-body 1.0.0", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "smallvec", + "tokio", +] + [[package]] name = "hyper-rustls" version = "0.24.0" @@ -2449,7 +2517,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0646026eb1b3eea4cd9ba47912ea5ce9cc07713d105b1a14698f4e6433d348b7" dependencies = [ "http 0.2.9", - "hyper", + "hyper 0.14.26", "log", "rustls 0.21.9", "rustls-native-certs 0.6.2", @@ -2463,7 +2531,7 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1" dependencies = [ - "hyper", + "hyper 0.14.26", "pin-project-lite", "tokio", "tokio-io-timeout", @@ -2476,7 +2544,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905" dependencies = [ "bytes", - "hyper", + "hyper 0.14.26", "native-tls", "tokio", "tokio-native-tls", @@ -2484,15 +2552,33 @@ dependencies = [ [[package]] name = "hyper-tungstenite" -version = "0.11.1" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7cc7dcb1ab67cd336f468a12491765672e61a3b6b148634dbfe2fe8acd3fe7d9" +checksum = "7a343d17fe7885302ed7252767dc7bb83609a874b6ff581142241ec4b73957ad" dependencies = [ - "hyper", + "http-body-util", + "hyper 1.2.0", 
+ "hyper-util", "pin-project-lite", "tokio", - "tokio-tungstenite", - "tungstenite", + "tokio-tungstenite 0.21.0", + "tungstenite 0.21.0", +] + +[[package]] +name = "hyper-util" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca38ef113da30126bbff9cd1705f9273e15d45498615d138b0c20279ac7a76aa" +dependencies = [ + "bytes", + "futures-util", + "http 1.1.0", + "http-body 1.0.0", + "hyper 1.2.0", + "pin-project-lite", + "socket2 0.5.5", + "tokio", ] [[package]] @@ -3523,7 +3609,7 @@ dependencies = [ "hex-literal", "humantime", "humantime-serde", - "hyper", + "hyper 0.14.26", "itertools", "leaky-bucket", "md5", @@ -4202,6 +4288,7 @@ dependencies = [ "anyhow", "async-compression", "async-trait", + "atomic-take", "aws-config", "aws-sdk-iam", "aws-sigv4", @@ -4225,9 +4312,12 @@ dependencies = [ "hmac", "hostname", "http 1.1.0", + "http-body-util", "humantime", - "hyper", + "hyper 0.14.26", + "hyper 1.2.0", "hyper-tungstenite", + "hyper-util", "ipnet", "itertools", "lasso", @@ -4560,7 +4650,7 @@ dependencies = [ "futures-util", "http-types", "humantime", - "hyper", + "hyper 0.14.26", "itertools", "metrics", "once_cell", @@ -4590,10 +4680,10 @@ dependencies = [ "encoding_rs", "futures-core", "futures-util", - "h2", + "h2 0.3.26", "http 0.2.9", - "http-body", - "hyper", + "http-body 0.4.5", + "hyper 0.14.26", "hyper-rustls", "hyper-tls", "ipnet", @@ -4651,7 +4741,7 @@ dependencies = [ "futures", "getrandom 0.2.11", "http 0.2.9", - "hyper", + "hyper 0.14.26", "parking_lot 0.11.2", "reqwest", "reqwest-middleware", @@ -4738,7 +4828,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "496c1d3718081c45ba9c31fbfc07417900aa96f4070ff90dc29961836b7a9945" dependencies = [ "http 0.2.9", - "hyper", + "hyper 0.14.26", "lazy_static", "percent-encoding", "regex", @@ -5043,7 +5133,7 @@ dependencies = [ "git-version", "hex", "humantime", - "hyper", + "hyper 0.14.26", "metrics", "once_cell", "parking_lot 0.12.1", @@ -5528,9 +5618,9 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.11.0" +version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9" +checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" [[package]] name = "smol_str" @@ -5622,7 +5712,7 @@ dependencies = [ "futures-util", "git-version", "humantime", - "hyper", + "hyper 0.14.26", "metrics", "once_cell", "parking_lot 0.12.1", @@ -5653,7 +5743,7 @@ dependencies = [ "git-version", "hex", "humantime", - "hyper", + "hyper 0.14.26", "itertools", "lasso", "measured", @@ -5682,7 +5772,7 @@ dependencies = [ "anyhow", "clap", "comfy-table", - "hyper", + "hyper 0.14.26", "pageserver_api", "pageserver_client", "reqwest", @@ -6165,7 +6255,19 @@ dependencies = [ "futures-util", "log", "tokio", - "tungstenite", + "tungstenite 0.20.1", +] + +[[package]] +name = "tokio-tungstenite" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c83b561d025642014097b66e6c1bb422783339e0909e4429cde4749d1990bc38" +dependencies = [ + "futures-util", + "log", + "tokio", + "tungstenite 0.21.0", ] [[package]] @@ -6232,10 +6334,10 @@ dependencies = [ "bytes", "futures-core", "futures-util", - "h2", + "h2 0.3.26", "http 0.2.9", - "http-body", - "hyper", + "http-body 0.4.5", + "hyper 0.14.26", "hyper-timeout", "percent-encoding", "pin-project", @@ -6421,7 +6523,7 @@ dependencies = [ name = "tracing-utils" version = "0.1.0" dependencies = 
[ - "hyper", + "hyper 0.14.26", "opentelemetry", "opentelemetry-otlp", "opentelemetry-semantic-conventions", @@ -6458,6 +6560,25 @@ dependencies = [ "utf-8", ] +[[package]] +name = "tungstenite" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ef1a641ea34f399a848dea702823bbecfb4c486f911735368f1f137cb8257e1" +dependencies = [ + "byteorder", + "bytes", + "data-encoding", + "http 1.1.0", + "httparse", + "log", + "rand 0.8.5", + "sha1", + "thiserror", + "url", + "utf-8", +] + [[package]] name = "twox-hash" version = "1.6.3" @@ -6623,7 +6744,7 @@ dependencies = [ "hex", "hex-literal", "humantime", - "hyper", + "hyper 0.14.26", "jsonwebtoken", "leaky-bucket", "metrics", @@ -7214,7 +7335,7 @@ dependencies = [ "hashbrown 0.14.0", "hex", "hmac", - "hyper", + "hyper 0.14.26", "indexmap 1.9.3", "itertools", "libc", @@ -7252,7 +7373,6 @@ dependencies = [ "tower", "tracing", "tracing-core", - "tungstenite", "url", "uuid", "zeroize", diff --git a/Cargo.toml b/Cargo.toml index 5db6b7016a..feea17ab05 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -44,6 +44,7 @@ license = "Apache-2.0" anyhow = { version = "1.0", features = ["backtrace"] } arc-swap = "1.6" async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] } +atomic-take = "1.1.0" azure_core = "0.18" azure_identity = "0.18" azure_storage = "0.18" @@ -97,7 +98,7 @@ http-types = { version = "2", default-features = false } humantime = "2.1" humantime-serde = "1.1.1" hyper = "0.14" -hyper-tungstenite = "0.11" +hyper-tungstenite = "0.13.0" inotify = "0.10.2" ipnet = "2.9.0" itertools = "0.10" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index b327890be2..12bd67ea36 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -12,6 +12,7 @@ testing = [] anyhow.workspace = true async-compression.workspace = true async-trait.workspace = true +atomic-take.workspace = true aws-config.workspace = true aws-sdk-iam.workspace = true aws-sigv4.workspace = true @@ -36,6 +37,9 @@ http.workspace = true humantime.workspace = true hyper-tungstenite.workspace = true hyper.workspace = true +hyper1 = { package = "hyper", version = "1.2", features = ["server"] } +hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] } +http-body-util = { version = "0.1" } ipnet.workspace = true itertools.workspace = true lasso = { workspace = true, features = ["multi-threaded"] } diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index 700c8c8681..70f9b4bfab 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -5,19 +5,13 @@ use std::{ io, net::SocketAddr, pin::{pin, Pin}, - sync::Mutex, task::{ready, Context, Poll}, }; use bytes::{Buf, BytesMut}; -use hyper::server::accept::Accept; -use hyper::server::conn::{AddrIncoming, AddrStream}; -use metrics::IntCounterPairGuard; +use hyper::server::conn::AddrIncoming; use pin_project_lite::pin_project; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf}; -use uuid::Uuid; - -use crate::metrics::NUM_CLIENT_CONNECTION_GAUGE; pub struct ProxyProtocolAccept { pub incoming: AddrIncoming, @@ -331,103 +325,6 @@ impl AsyncRead for WithClientIp { } } -impl Accept for ProxyProtocolAccept { - type Conn = WithConnectionGuard>; - - type Error = io::Error; - - fn poll_accept( - mut self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll>> { - let conn = ready!(Pin::new(&mut self.incoming).poll_accept(cx)?); - - let conn_id = uuid::Uuid::new_v4(); - let span = tracing::info_span!("http_conn", ?conn_id); - { - let _enter = span.enter(); 
- tracing::info!("accepted new TCP connection"); - } - - let Some(conn) = conn else { - return Poll::Ready(None); - }; - - Poll::Ready(Some(Ok(WithConnectionGuard { - inner: WithClientIp::new(conn), - connection_id: Uuid::new_v4(), - gauge: Mutex::new(Some( - NUM_CLIENT_CONNECTION_GAUGE - .with_label_values(&[self.protocol]) - .guard(), - )), - span, - }))) - } -} - -pin_project! { - pub struct WithConnectionGuard { - #[pin] - pub inner: T, - pub connection_id: Uuid, - pub gauge: Mutex>, - pub span: tracing::Span, - } - - impl PinnedDrop for WithConnectionGuard { - fn drop(this: Pin<&mut Self>) { - let _enter = this.span.enter(); - tracing::info!("HTTP connection closed") - } - } -} - -impl AsyncWrite for WithConnectionGuard { - #[inline] - fn poll_write( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - buf: &[u8], - ) -> Poll> { - self.project().inner.poll_write(cx, buf) - } - - #[inline] - fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - self.project().inner.poll_flush(cx) - } - - #[inline] - fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - self.project().inner.poll_shutdown(cx) - } - - #[inline] - fn poll_write_vectored( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - bufs: &[io::IoSlice<'_>], - ) -> Poll> { - self.project().inner.poll_write_vectored(cx, bufs) - } - - #[inline] - fn is_write_vectored(&self) -> bool { - self.inner.is_write_vectored() - } -} - -impl AsyncRead for WithConnectionGuard { - fn poll_read( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - buf: &mut ReadBuf<'_>, - ) -> Poll> { - self.project().inner.poll_read(cx, buf) - } -} - #[cfg(test)] mod tests { use std::pin::pin; diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index a2010fd613..f275caa7eb 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -4,42 +4,48 @@ mod backend; mod conn_pool; +mod http_util; mod json; mod sql_over_http; -pub mod tls_listener; mod websocket; +use atomic_take::AtomicTake; +use bytes::Bytes; pub use conn_pool::GlobalConnPoolOptions; -use anyhow::bail; -use hyper::StatusCode; -use metrics::IntCounterPairGuard; +use anyhow::Context; +use futures::future::{select, Either}; +use futures::TryFutureExt; +use http::{Method, Response, StatusCode}; +use http_body_util::Full; +use hyper1::body::Incoming; +use hyper_util::rt::TokioExecutor; +use hyper_util::server::conn::auto::Builder; use rand::rngs::StdRng; use rand::SeedableRng; pub use reqwest_middleware::{ClientWithMiddleware, Error}; pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; +use tokio::time::timeout; +use tokio_rustls::TlsAcceptor; use tokio_util::task::TaskTracker; -use tracing::instrument::Instrumented; use crate::cancellation::CancellationHandlerMain; use crate::config::ProxyConfig; use crate::context::RequestMonitoring; -use crate::protocol2::{ProxyProtocolAccept, WithClientIp, WithConnectionGuard}; +use crate::metrics::{NUM_CLIENT_CONNECTION_GAUGE, TLS_HANDSHAKE_FAILURES}; +use crate::protocol2::WithClientIp; +use crate::proxy::run_until_cancelled; use crate::rate_limiter::EndpointRateLimiter; use crate::serverless::backend::PoolingBackend; -use hyper::{ - server::conn::{AddrIncoming, AddrStream}, - Body, Method, Request, Response, -}; +use crate::serverless::http_util::{api_error_into_response, json_response}; -use std::net::IpAddr; +use std::net::{IpAddr, SocketAddr}; +use std::pin::pin; use std::sync::Arc; -use std::task::Poll; -use tls_listener::TlsListener; -use tokio::net::TcpListener; -use 
tokio_util::sync::{CancellationToken, DropGuard}; +use tokio::net::{TcpListener, TcpStream}; +use tokio_util::sync::CancellationToken; use tracing::{error, info, warn, Instrument}; -use utils::http::{error::ApiError, json::json_response}; +use utils::http::error::ApiError; pub const SERVERLESS_DRIVER_SNI: &str = "api"; @@ -91,161 +97,174 @@ pub async fn task_main( tls_server_config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()]; let tls_acceptor: tokio_rustls::TlsAcceptor = Arc::new(tls_server_config).into(); - let mut addr_incoming = AddrIncoming::from_listener(ws_listener)?; - let _ = addr_incoming.set_nodelay(true); - let addr_incoming = ProxyProtocolAccept { - incoming: addr_incoming, - protocol: "http", - }; + let connections = tokio_util::task::task_tracker::TaskTracker::new(); + connections.close(); // allows `connections.wait to complete` - let ws_connections = tokio_util::task::task_tracker::TaskTracker::new(); - ws_connections.close(); // allows `ws_connections.wait to complete` + let server = Builder::new(hyper_util::rt::TokioExecutor::new()); - let tls_listener = TlsListener::new(tls_acceptor, addr_incoming, config.handshake_timeout); + while let Some(res) = run_until_cancelled(ws_listener.accept(), &cancellation_token).await { + let (conn, peer_addr) = res.context("could not accept TCP stream")?; + if let Err(e) = conn.set_nodelay(true) { + tracing::error!("could not set nodelay: {e}"); + continue; + } + let conn_id = uuid::Uuid::new_v4(); + let http_conn_span = tracing::info_span!("http_conn", ?conn_id); - let make_svc = hyper::service::make_service_fn( - |stream: &tokio_rustls::server::TlsStream< - WithConnectionGuard>, - >| { - let (conn, _) = stream.get_ref(); + connections.spawn( + connection_handler( + config, + backend.clone(), + connections.clone(), + cancellation_handler.clone(), + endpoint_rate_limiter.clone(), + cancellation_token.clone(), + server.clone(), + tls_acceptor.clone(), + conn, + peer_addr, + ) + .instrument(http_conn_span), + ); + } - // this is jank. should dissapear with hyper 1.0 migration. - let gauge = conn - .gauge - .lock() - .expect("lock should not be poisoned") - .take() - .expect("gauge should be set on connection start"); - - // Cancel all current inflight HTTP requests if the HTTP connection is closed. - let http_cancellation_token = CancellationToken::new(); - let cancel_connection = http_cancellation_token.clone().drop_guard(); - - let span = conn.span.clone(); - let client_addr = conn.inner.client_addr(); - let remote_addr = conn.inner.inner.remote_addr(); - let backend = backend.clone(); - let ws_connections = ws_connections.clone(); - let endpoint_rate_limiter = endpoint_rate_limiter.clone(); - let cancellation_handler = cancellation_handler.clone(); - async move { - let peer_addr = match client_addr { - Some(addr) => addr, - None if config.require_client_ip => bail!("missing required client ip"), - None => remote_addr, - }; - Ok(MetricService::new( - hyper::service::service_fn(move |req: Request| { - let backend = backend.clone(); - let ws_connections2 = ws_connections.clone(); - let endpoint_rate_limiter = endpoint_rate_limiter.clone(); - let cancellation_handler = cancellation_handler.clone(); - let http_cancellation_token = http_cancellation_token.child_token(); - - // `request_handler` is not cancel safe. It expects to be cancelled only at specific times. - // By spawning the future, we ensure it never gets cancelled until it decides to. 
- ws_connections.spawn( - async move { - // Cancel the current inflight HTTP request if the requets stream is closed. - // This is slightly different to `_cancel_connection` in that - // h2 can cancel individual requests with a `RST_STREAM`. - let _cancel_session = http_cancellation_token.clone().drop_guard(); - - let res = request_handler( - req, - config, - backend, - ws_connections2, - cancellation_handler, - peer_addr.ip(), - endpoint_rate_limiter, - http_cancellation_token, - ) - .await - .map_or_else(|e| e.into_response(), |r| r); - - _cancel_session.disarm(); - - res - } - .in_current_span(), - ) - }), - gauge, - cancel_connection, - span, - )) - } - }, - ); - - hyper::Server::builder(tls_listener) - .serve(make_svc) - .with_graceful_shutdown(cancellation_token.cancelled()) - .await?; - - // await websocket connections - ws_connections.wait().await; + connections.wait().await; Ok(()) } -struct MetricService { - inner: S, - _gauge: IntCounterPairGuard, - _cancel: DropGuard, - span: tracing::Span, -} +/// Handles the TCP lifecycle. +/// +/// 1. Parses PROXY protocol V2 +/// 2. Handles TLS handshake +/// 3. Handles HTTP connection +/// 1. With graceful shutdowns +/// 2. With graceful request cancellation with connection failure +/// 3. With websocket upgrade support. +#[allow(clippy::too_many_arguments)] +async fn connection_handler( + config: &'static ProxyConfig, + backend: Arc, + connections: TaskTracker, + cancellation_handler: Arc, + endpoint_rate_limiter: Arc, + cancellation_token: CancellationToken, + server: Builder, + tls_acceptor: TlsAcceptor, + conn: TcpStream, + peer_addr: SocketAddr, +) { + let session_id = uuid::Uuid::new_v4(); -impl MetricService { - fn new( - inner: S, - _gauge: IntCounterPairGuard, - _cancel: DropGuard, - span: tracing::Span, - ) -> MetricService { - MetricService { - inner, - _gauge, - _cancel, - span, + let _gauge = NUM_CLIENT_CONNECTION_GAUGE + .with_label_values(&["http"]) + .guard(); + + // handle PROXY protocol + let mut conn = WithClientIp::new(conn); + let peer = match conn.wait_for_addr().await { + Ok(peer) => peer, + Err(e) => { + tracing::error!(?session_id, %peer_addr, "failed to accept TCP connection: invalid PROXY protocol V2 header: {e:#}"); + return; } - } -} + }; -impl hyper::service::Service> for MetricService -where - S: hyper::service::Service>, -{ - type Response = S::Response; - type Error = S::Error; - type Future = Instrumented; + let peer_addr = peer.unwrap_or(peer_addr).ip(); + info!(?session_id, %peer_addr, "accepted new TCP connection"); - fn poll_ready(&mut self, cx: &mut std::task::Context<'_>) -> Poll> { - self.inner.poll_ready(cx) - } + // try upgrade to TLS, but with a timeout. + let conn = match timeout(config.handshake_timeout, tls_acceptor.accept(conn)).await { + Ok(Ok(conn)) => { + info!(?session_id, %peer_addr, "accepted new TLS connection"); + conn + } + // The handshake failed + Ok(Err(e)) => { + TLS_HANDSHAKE_FAILURES.inc(); + warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}"); + return; + } + // The handshake timed out + Err(e) => { + TLS_HANDSHAKE_FAILURES.inc(); + warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}"); + return; + } + }; - fn call(&mut self, req: Request) -> Self::Future { - self.span - .in_scope(|| self.inner.call(req)) - .instrument(self.span.clone()) + let session_id = AtomicTake::new(session_id); + + // Cancel all current inflight HTTP requests if the HTTP connection is closed. 
+ let http_cancellation_token = CancellationToken::new(); + let _cancel_connection = http_cancellation_token.clone().drop_guard(); + + let conn = server.serve_connection_with_upgrades( + hyper_util::rt::TokioIo::new(conn), + hyper1::service::service_fn(move |req: hyper1::Request| { + // First HTTP request shares the same session ID + let session_id = session_id.take().unwrap_or_else(uuid::Uuid::new_v4); + + // Cancel the current inflight HTTP request if the requets stream is closed. + // This is slightly different to `_cancel_connection` in that + // h2 can cancel individual requests with a `RST_STREAM`. + let http_request_token = http_cancellation_token.child_token(); + let cancel_request = http_request_token.clone().drop_guard(); + + // `request_handler` is not cancel safe. It expects to be cancelled only at specific times. + // By spawning the future, we ensure it never gets cancelled until it decides to. + let handler = connections.spawn( + request_handler( + req, + config, + backend.clone(), + connections.clone(), + cancellation_handler.clone(), + session_id, + peer_addr, + endpoint_rate_limiter.clone(), + http_request_token, + ) + .in_current_span() + .map_ok_or_else(api_error_into_response, |r| r), + ); + + async move { + let res = handler.await; + cancel_request.disarm(); + res + } + }), + ); + + // On cancellation, trigger the HTTP connection handler to shut down. + let res = match select(pin!(cancellation_token.cancelled()), pin!(conn)).await { + Either::Left((_cancelled, mut conn)) => { + conn.as_mut().graceful_shutdown(); + conn.await + } + Either::Right((res, _)) => res, + }; + + match res { + Ok(()) => tracing::info!(%peer_addr, "HTTP connection closed"), + Err(e) => tracing::warn!(%peer_addr, "HTTP connection error {e}"), } } #[allow(clippy::too_many_arguments)] async fn request_handler( - mut request: Request, + mut request: hyper1::Request, config: &'static ProxyConfig, backend: Arc, ws_connections: TaskTracker, cancellation_handler: Arc, + session_id: uuid::Uuid, peer_addr: IpAddr, endpoint_rate_limiter: Arc, // used to cancel in-flight HTTP requests. not used to cancel websockets http_cancellation_token: CancellationToken, -) -> Result, ApiError> { - let session_id = uuid::Uuid::new_v4(); - +) -> Result>, ApiError> { let host = request .headers() .get("host") @@ -282,14 +301,14 @@ async fn request_handler( // Return the response so the spawned future can continue. 
Ok(response) - } else if request.uri().path() == "/sql" && request.method() == Method::POST { + } else if request.uri().path() == "/sql" && *request.method() == Method::POST { let ctx = RequestMonitoring::new(session_id, peer_addr, "http", &config.region); let span = ctx.span.clone(); sql_over_http::handle(config, ctx, request, backend, http_cancellation_token) .instrument(span) .await - } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS { + } else if request.uri().path() == "/sql" && *request.method() == Method::OPTIONS { Response::builder() .header("Allow", "OPTIONS, POST") .header("Access-Control-Allow-Origin", "*") @@ -299,7 +318,7 @@ async fn request_handler( ) .header("Access-Control-Max-Age", "86400" /* 24 hours */) .status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code - .body(Body::empty()) + .body(Full::new(Bytes::new())) .map_err(|e| ApiError::InternalServerError(e.into())) } else { json_response(StatusCode::BAD_REQUEST, "query is not supported") diff --git a/proxy/src/serverless/http_util.rs b/proxy/src/serverless/http_util.rs new file mode 100644 index 0000000000..ab9127b13e --- /dev/null +++ b/proxy/src/serverless/http_util.rs @@ -0,0 +1,92 @@ +//! Things stolen from `libs/utils/src/http` to add hyper 1.0 compatibility +//! Will merge back in at some point in the future. + +use bytes::Bytes; + +use anyhow::Context; +use http::{Response, StatusCode}; +use http_body_util::Full; + +use serde::Serialize; +use utils::http::error::ApiError; + +/// Like [`ApiError::into_response`] +pub fn api_error_into_response(this: ApiError) -> Response> { + match this { + ApiError::BadRequest(err) => HttpErrorBody::response_from_msg_and_status( + format!("{err:#?}"), // use debug printing so that we give the cause + StatusCode::BAD_REQUEST, + ), + ApiError::Forbidden(_) => { + HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::FORBIDDEN) + } + ApiError::Unauthorized(_) => { + HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::UNAUTHORIZED) + } + ApiError::NotFound(_) => { + HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::NOT_FOUND) + } + ApiError::Conflict(_) => { + HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::CONFLICT) + } + ApiError::PreconditionFailed(_) => HttpErrorBody::response_from_msg_and_status( + this.to_string(), + StatusCode::PRECONDITION_FAILED, + ), + ApiError::ShuttingDown => HttpErrorBody::response_from_msg_and_status( + "Shutting down".to_string(), + StatusCode::SERVICE_UNAVAILABLE, + ), + ApiError::ResourceUnavailable(err) => HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::SERVICE_UNAVAILABLE, + ), + ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::REQUEST_TIMEOUT, + ), + ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::INTERNAL_SERVER_ERROR, + ), + } +} + +/// Same as [`utils::http::error::HttpErrorBody`] +#[derive(Serialize)] +struct HttpErrorBody { + pub msg: String, +} + +impl HttpErrorBody { + /// Same as [`utils::http::error::HttpErrorBody::response_from_msg_and_status`] + fn response_from_msg_and_status(msg: String, status: StatusCode) -> Response> { + HttpErrorBody { msg }.to_response(status) + } + + /// Same as [`utils::http::error::HttpErrorBody::to_response`] + fn to_response(&self, status: StatusCode) -> 
Response> { + Response::builder() + .status(status) + .header(http::header::CONTENT_TYPE, "application/json") + // we do not have nested maps with non string keys so serialization shouldn't fail + .body(Full::new(Bytes::from(serde_json::to_string(self).unwrap()))) + .unwrap() + } +} + +/// Same as [`utils::http::json::json_response`] +pub fn json_response( + status: StatusCode, + data: T, +) -> Result>, ApiError> { + let json = serde_json::to_string(&data) + .context("Failed to serialize JSON response") + .map_err(ApiError::InternalServerError)?; + let response = Response::builder() + .status(status) + .header(http::header::CONTENT_TYPE, "application/json") + .body(Full::new(Bytes::from(json))) + .map_err(|e| ApiError::InternalServerError(e.into()))?; + Ok(response) +} diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 00dffd5784..7f7f93988c 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -1,18 +1,22 @@ use std::pin::pin; use std::sync::Arc; +use bytes::Bytes; use futures::future::select; use futures::future::try_join; use futures::future::Either; use futures::StreamExt; use futures::TryFutureExt; -use hyper::body::HttpBody; -use hyper::header; -use hyper::http::HeaderName; -use hyper::http::HeaderValue; -use hyper::Response; -use hyper::StatusCode; -use hyper::{Body, HeaderMap, Request}; +use http_body_util::BodyExt; +use http_body_util::Full; +use hyper1::body::Body; +use hyper1::body::Incoming; +use hyper1::header; +use hyper1::http::HeaderName; +use hyper1::http::HeaderValue; +use hyper1::Response; +use hyper1::StatusCode; +use hyper1::{HeaderMap, Request}; use serde_json::json; use serde_json::Value; use tokio::time; @@ -29,7 +33,6 @@ use tracing::error; use tracing::info; use url::Url; use utils::http::error::ApiError; -use utils::http::json::json_response; use crate::auth::backend::ComputeUserInfo; use crate::auth::endpoint_sni; @@ -52,6 +55,7 @@ use crate::RoleName; use super::backend::PoolingBackend; use super::conn_pool::Client; use super::conn_pool::ConnInfo; +use super::http_util::json_response; use super::json::json_to_pg_text; use super::json::pg_text_row_to_json; use super::json::JsonConversionError; @@ -218,10 +222,10 @@ fn get_conn_info( pub async fn handle( config: &'static ProxyConfig, mut ctx: RequestMonitoring, - request: Request, + request: Request, backend: Arc, cancel: CancellationToken, -) -> Result, ApiError> { +) -> Result>, ApiError> { let result = handle_inner(cancel, config, &mut ctx, request, backend).await; let mut response = match result { @@ -332,10 +336,9 @@ pub async fn handle( } }; - response.headers_mut().insert( - "Access-Control-Allow-Origin", - hyper::http::HeaderValue::from_static("*"), - ); + response + .headers_mut() + .insert("Access-Control-Allow-Origin", HeaderValue::from_static("*")); Ok(response) } @@ -396,7 +399,7 @@ impl UserFacingError for SqlOverHttpError { #[derive(Debug, thiserror::Error)] pub enum ReadPayloadError { #[error("could not read the HTTP request body: {0}")] - Read(#[from] hyper::Error), + Read(#[from] hyper1::Error), #[error("could not parse the HTTP request body: {0}")] Parse(#[from] serde_json::Error), } @@ -437,7 +440,7 @@ struct HttpHeaders { } impl HttpHeaders { - fn try_parse(headers: &hyper::http::HeaderMap) -> Result { + fn try_parse(headers: &hyper1::http::HeaderMap) -> Result { // Determine the output options. Default behaviour is 'false'. Anything that is not // strictly 'true' assumed to be false. 
let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE); @@ -488,9 +491,9 @@ async fn handle_inner( cancel: CancellationToken, config: &'static ProxyConfig, ctx: &mut RequestMonitoring, - request: Request, + request: Request, backend: Arc, -) -> Result, SqlOverHttpError> { +) -> Result>, SqlOverHttpError> { let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE .with_label_values(&[ctx.protocol]) .guard(); @@ -528,7 +531,7 @@ async fn handle_inner( } let fetch_and_process_request = async { - let body = hyper::body::to_bytes(request.into_body()).await?; + let body = request.into_body().collect().await?.to_bytes(); info!(length = body.len(), "request payload read"); let payload: Payload = serde_json::from_slice(&body)?; Ok::(payload) // Adjust error type accordingly @@ -596,7 +599,7 @@ async fn handle_inner( let body = serde_json::to_string(&result).expect("json serialization should not fail"); let len = body.len(); let response = response - .body(Body::from(body)) + .body(Full::new(Bytes::from(body))) // only fails if invalid status code or invalid header/values are given. // these are not user configurable so it cannot fail dynamically .expect("building response payload should not fail"); @@ -639,6 +642,7 @@ impl QueryData { } // The query was cancelled. Either::Right((_cancelled, query)) => { + tracing::info!("cancelling query"); if let Err(err) = cancel_token.cancel_query(NoTls).await { tracing::error!(?err, "could not cancel query"); } diff --git a/proxy/src/serverless/tls_listener.rs b/proxy/src/serverless/tls_listener.rs deleted file mode 100644 index 33f194dd59..0000000000 --- a/proxy/src/serverless/tls_listener.rs +++ /dev/null @@ -1,123 +0,0 @@ -use std::{ - convert::Infallible, - pin::Pin, - task::{Context, Poll}, - time::Duration, -}; - -use hyper::server::{accept::Accept, conn::AddrStream}; -use pin_project_lite::pin_project; -use tokio::{ - io::{AsyncRead, AsyncWrite}, - task::JoinSet, - time::timeout, -}; -use tokio_rustls::{server::TlsStream, TlsAcceptor}; -use tracing::{info, warn, Instrument}; - -use crate::{ - metrics::TLS_HANDSHAKE_FAILURES, - protocol2::{WithClientIp, WithConnectionGuard}, -}; - -pin_project! { - /// Wraps a `Stream` of connections (such as a TCP listener) so that each connection is itself - /// encrypted using TLS. - pub(crate) struct TlsListener { - #[pin] - listener: A, - tls: TlsAcceptor, - waiting: JoinSet>>, - timeout: Duration, - } -} - -impl TlsListener { - /// Create a `TlsListener` with default options. 
- pub(crate) fn new(tls: TlsAcceptor, listener: A, timeout: Duration) -> Self { - TlsListener { - listener, - tls, - waiting: JoinSet::new(), - timeout, - } - } -} - -impl Accept for TlsListener -where - A: Accept>>, - A::Error: std::error::Error, - A::Conn: AsyncRead + AsyncWrite + Unpin + Send + 'static, -{ - type Conn = TlsStream; - - type Error = Infallible; - - fn poll_accept( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll>> { - let mut this = self.project(); - - loop { - match this.listener.as_mut().poll_accept(cx) { - Poll::Pending => break, - Poll::Ready(Some(Ok(mut conn))) => { - let t = *this.timeout; - let tls = this.tls.clone(); - let span = conn.span.clone(); - this.waiting.spawn(async move { - let peer_addr = match conn.inner.wait_for_addr().await { - Ok(Some(addr)) => addr, - Err(e) => { - tracing::error!("failed to accept TCP connection: invalid PROXY protocol V2 header: {e:#}"); - return None; - } - Ok(None) => conn.inner.inner.remote_addr() - }; - - let accept = tls.accept(conn); - match timeout(t, accept).await { - Ok(Ok(conn)) => { - info!(%peer_addr, "accepted new TLS connection"); - Some(conn) - }, - // The handshake failed, try getting another connection from the queue - Ok(Err(e)) => { - TLS_HANDSHAKE_FAILURES.inc(); - warn!(%peer_addr, "failed to accept TLS connection: {e:?}"); - None - } - // The handshake timed out, try getting another connection from the queue - Err(_) => { - TLS_HANDSHAKE_FAILURES.inc(); - warn!(%peer_addr, "failed to accept TLS connection: timeout"); - None - } - } - }.instrument(span)); - } - Poll::Ready(Some(Err(e))) => { - tracing::error!("error accepting TCP connection: {e}"); - continue; - } - Poll::Ready(None) => return Poll::Ready(None), - } - } - - loop { - return match this.waiting.poll_join_next(cx) { - Poll::Ready(Some(Ok(Some(conn)))) => Poll::Ready(Some(Ok(conn))), - // The handshake failed to complete, try getting another connection from the queue - Poll::Ready(Some(Ok(None))) => continue, - // The handshake panicked or was cancelled. 
ignore and get another connection - Poll::Ready(Some(Err(e))) => { - tracing::warn!("handshake aborted: {e}"); - continue; - } - _ => Poll::Pending, - }; - } - } -} diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index bcbd4daa7e..d6e2cc2996 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -63,7 +63,7 @@ scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } serde_json = { version = "1", features = ["raw_value"] } sha2 = { version = "0.10", features = ["asm"] } -smallvec = { version = "1", default-features = false, features = ["write"] } +smallvec = { version = "1", default-features = false, features = ["const_new", "write"] } subtle = { version = "2" } time = { version = "0.3", features = ["local-offset", "macros", "serde-well-known"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] } @@ -75,7 +75,6 @@ tonic = { version = "0.9", features = ["tls-roots"] } tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "log", "timeout", "util"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } -tungstenite = { version = "0.20" } url = { version = "2", features = ["serde"] } uuid = { version = "1", features = ["serde", "v4", "v7"] } zeroize = { version = "1", features = ["derive"] } From 5efe95a008bb6a19ec9676a0c7b1a5516f85e4c1 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Wed, 10 Apr 2024 10:30:09 +0200 Subject: [PATCH 0549/1571] proxy: fix credentials cache lookup (#7349) ## Problem Incorrect processing of `-pooler` connections. ## Summary of changes Fix TODO: add e2e tests for caching --- proxy/src/cache/endpoints.rs | 5 ++--- proxy/src/console/provider/neon.rs | 32 ++++++++++++++++++------------ 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index 9bc019c2d8..31e3ef6891 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -21,7 +21,7 @@ use crate::{ metrics::REDIS_BROKEN_MESSAGES, rate_limiter::GlobalRateLimiter, redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider, - EndpointId, Normalize, + EndpointId, }; #[derive(Deserialize, Debug, Clone)] @@ -72,9 +72,8 @@ impl EndpointsCache { !rejected } fn should_reject(&self, endpoint: &EndpointId) -> bool { - let endpoint = endpoint.normalize(); if endpoint.is_endpoint() { - !self.endpoints.contains(&EndpointIdInt::from(&endpoint)) + !self.endpoints.contains(&EndpointIdInt::from(endpoint)) } else if endpoint.is_branch() { !self .branches diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 3a0e5609d8..68b91447f9 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -59,7 +59,7 @@ impl Api { if !self .caches .endpoints_cache - .is_valid(ctx, &user_info.endpoint) + .is_valid(ctx, &user_info.endpoint.normalize()) .await { info!("endpoint is not valid, skipping the request"); @@ -186,23 +186,27 @@ impl super::Api for Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { - let ep = &user_info.endpoint; + let normalized_ep = &user_info.endpoint.normalize(); let user = &user_info.user; - if let Some(role_secret) = self.caches.project_info.get_role_secret(ep, user) { + if let Some(role_secret) = self + .caches + .project_info + 
.get_role_secret(normalized_ep, user) + { return Ok(role_secret); } let auth_info = self.do_get_auth_info(ctx, user_info).await?; if let Some(project_id) = auth_info.project_id { - let ep_int = ep.normalize().into(); + let normalized_ep_int = normalized_ep.into(); self.caches.project_info.insert_role_secret( project_id, - ep_int, + normalized_ep_int, user.into(), auth_info.secret.clone(), ); self.caches.project_info.insert_allowed_ips( project_id, - ep_int, + normalized_ep_int, Arc::new(auth_info.allowed_ips), ); ctx.set_project_id(project_id); @@ -216,8 +220,8 @@ impl super::Api for Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { - let ep = &user_info.endpoint; - if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(ep) { + let normalized_ep = &user_info.endpoint.normalize(); + if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(normalized_ep) { ALLOWED_IPS_BY_CACHE_OUTCOME .with_label_values(&["hit"]) .inc(); @@ -230,16 +234,18 @@ impl super::Api for Api { let allowed_ips = Arc::new(auth_info.allowed_ips); let user = &user_info.user; if let Some(project_id) = auth_info.project_id { - let ep_int = ep.normalize().into(); + let normalized_ep_int = normalized_ep.into(); self.caches.project_info.insert_role_secret( project_id, - ep_int, + normalized_ep_int, user.into(), auth_info.secret.clone(), ); - self.caches - .project_info - .insert_allowed_ips(project_id, ep_int, allowed_ips.clone()); + self.caches.project_info.insert_allowed_ips( + project_id, + normalized_ep_int, + allowed_ips.clone(), + ); ctx.set_project_id(project_id); } Ok(( From 0bb04ebe19c1dd024c7762926ecce166f4259d82 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Wed, 10 Apr 2024 12:12:55 +0200 Subject: [PATCH 0550/1571] Revert "Proxy read ids from redis (#7205)" (#7350) This reverts commit dbac2d2c473f3648251f0a64e36d066f444dfe00. ## Problem Proxy pods fails to install in k8s clusters, cplane release blocking. ## Summary of changes Revert --- proxy/src/auth/backend.rs | 4 +- proxy/src/bin/proxy.rs | 15 +- proxy/src/cache.rs | 1 - proxy/src/cache/endpoints.rs | 190 ------------------ proxy/src/config.rs | 69 ------- proxy/src/console/provider.rs | 22 +- proxy/src/console/provider/neon.rs | 46 ++--- proxy/src/context.rs | 15 +- proxy/src/intern.rs | 15 -- proxy/src/lib.rs | 37 ---- proxy/src/metrics.rs | 12 -- proxy/src/proxy.rs | 4 +- proxy/src/rate_limiter.rs | 2 +- proxy/src/rate_limiter/limiter.rs | 10 +- proxy/src/redis/cancellation_publisher.rs | 6 +- .../regress/test_proxy_rate_limiter.py | 84 ++++++++ 16 files changed, 124 insertions(+), 408 deletions(-) delete mode 100644 proxy/src/cache/endpoints.rs create mode 100644 test_runner/regress/test_proxy_rate_limiter.py diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 71e9da18bc..e421798067 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -27,7 +27,7 @@ use crate::{ }, stream, url, }; -use crate::{scram, EndpointCacheKey, EndpointId, Normalize, RoleName}; +use crate::{scram, EndpointCacheKey, EndpointId, RoleName}; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, warn}; @@ -186,7 +186,7 @@ impl AuthenticationConfig { is_cleartext: bool, ) -> auth::Result { // we have validated the endpoint exists, so let's intern it. 
- let endpoint_int = EndpointIdInt::from(endpoint.normalize()); + let endpoint_int = EndpointIdInt::from(endpoint); // only count the full hash count if password hack or websocket flow. // in other words, if proxy needs to run the hashing diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 9302b31d5c..56a3ef79cd 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -189,9 +189,7 @@ struct ProxyCliArgs { /// cache for `project_info` (use `size=0` to disable) #[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)] project_info_cache: String, - /// cache for all valid endpoints - #[clap(long, default_value = config::EndpointCacheConfig::CACHE_DEFAULT_OPTIONS)] - endpoint_cache_config: String, + #[clap(flatten)] parquet_upload: ParquetUploadArgs, @@ -403,7 +401,6 @@ async fn main() -> anyhow::Result<()> { if let auth::BackendType::Console(api, _) = &config.auth_backend { if let proxy::console::provider::ConsoleBackend::Console(api) = &**api { - maintenance_tasks.spawn(api.locks.garbage_collect_worker()); if let Some(redis_notifications_client) = redis_notifications_client { let cache = api.caches.project_info.clone(); maintenance_tasks.spawn(notifications::task_main( @@ -413,9 +410,6 @@ async fn main() -> anyhow::Result<()> { args.region.clone(), )); maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); - let cache = api.caches.endpoints_cache.clone(); - let con = redis_notifications_client.clone(); - maintenance_tasks.spawn(async move { cache.do_read(con).await }); } } } @@ -495,18 +489,14 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; let project_info_cache_config: ProjectInfoCacheOptions = args.project_info_cache.parse()?; - let endpoint_cache_config: config::EndpointCacheConfig = - args.endpoint_cache_config.parse()?; info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); info!( "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" ); - info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); let caches = Box::leak(Box::new(console::caches::ApiCaches::new( wake_compute_cache_config, project_info_cache_config, - endpoint_cache_config, ))); let config::WakeComputeLockOptions { @@ -517,9 +507,10 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { } = args.wake_compute_lock.parse()?; info!(permits, shards, ?epoch, "Using NodeLocks (wake_compute)"); let locks = Box::leak(Box::new( - console::locks::ApiLocks::new("wake_compute_lock", permits, shards, timeout, epoch) + console::locks::ApiLocks::new("wake_compute_lock", permits, shards, timeout) .unwrap(), )); + tokio::spawn(locks.garbage_collect_worker(epoch)); let url = args.auth_endpoint.parse()?; let endpoint = http::Endpoint::new(url, http::new_client(rate_limiter_config)); diff --git a/proxy/src/cache.rs b/proxy/src/cache.rs index d1d4087241..fc5f416395 100644 --- a/proxy/src/cache.rs +++ b/proxy/src/cache.rs @@ -1,5 +1,4 @@ pub mod common; -pub mod endpoints; pub mod project_info; mod timed_lru; diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs deleted file mode 100644 index 31e3ef6891..0000000000 --- a/proxy/src/cache/endpoints.rs +++ /dev/null @@ -1,190 +0,0 @@ -use std::{ - convert::Infallible, - sync::{ - atomic::{AtomicBool, Ordering}, - Arc, - }, -}; - -use dashmap::DashSet; -use redis::{ - 
streams::{StreamReadOptions, StreamReadReply}, - AsyncCommands, FromRedisValue, Value, -}; -use serde::Deserialize; -use tokio::sync::Mutex; - -use crate::{ - config::EndpointCacheConfig, - context::RequestMonitoring, - intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}, - metrics::REDIS_BROKEN_MESSAGES, - rate_limiter::GlobalRateLimiter, - redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider, - EndpointId, -}; - -#[derive(Deserialize, Debug, Clone)] -#[serde(rename_all(deserialize = "snake_case"))] -pub enum ControlPlaneEventKey { - EndpointCreated, - BranchCreated, - ProjectCreated, -} - -pub struct EndpointsCache { - config: EndpointCacheConfig, - endpoints: DashSet, - branches: DashSet, - projects: DashSet, - ready: AtomicBool, - limiter: Arc>, -} - -impl EndpointsCache { - pub fn new(config: EndpointCacheConfig) -> Self { - Self { - limiter: Arc::new(Mutex::new(GlobalRateLimiter::new( - config.limiter_info.clone(), - ))), - config, - endpoints: DashSet::new(), - branches: DashSet::new(), - projects: DashSet::new(), - ready: AtomicBool::new(false), - } - } - pub async fn is_valid(&self, ctx: &mut RequestMonitoring, endpoint: &EndpointId) -> bool { - if !self.ready.load(Ordering::Acquire) { - return true; - } - // If cache is disabled, just collect the metrics and return. - if self.config.disable_cache { - ctx.set_rejected(self.should_reject(endpoint)); - return true; - } - // If the limiter allows, we don't need to check the cache. - if self.limiter.lock().await.check() { - return true; - } - let rejected = self.should_reject(endpoint); - ctx.set_rejected(rejected); - !rejected - } - fn should_reject(&self, endpoint: &EndpointId) -> bool { - if endpoint.is_endpoint() { - !self.endpoints.contains(&EndpointIdInt::from(endpoint)) - } else if endpoint.is_branch() { - !self - .branches - .contains(&BranchIdInt::from(&endpoint.as_branch())) - } else { - !self - .projects - .contains(&ProjectIdInt::from(&endpoint.as_project())) - } - } - fn insert_event(&self, key: ControlPlaneEventKey, value: String) { - // Do not do normalization here, we expect the events to be normalized. 
- match key { - ControlPlaneEventKey::EndpointCreated => { - self.endpoints.insert(EndpointIdInt::from(&value.into())); - } - ControlPlaneEventKey::BranchCreated => { - self.branches.insert(BranchIdInt::from(&value.into())); - } - ControlPlaneEventKey::ProjectCreated => { - self.projects.insert(ProjectIdInt::from(&value.into())); - } - } - } - pub async fn do_read( - &self, - mut con: ConnectionWithCredentialsProvider, - ) -> anyhow::Result { - let mut last_id = "0-0".to_string(); - loop { - self.ready.store(false, Ordering::Release); - if let Err(e) = con.connect().await { - tracing::error!("error connecting to redis: {:?}", e); - continue; - } - if let Err(e) = self.read_from_stream(&mut con, &mut last_id).await { - tracing::error!("error reading from redis: {:?}", e); - } - } - } - async fn read_from_stream( - &self, - con: &mut ConnectionWithCredentialsProvider, - last_id: &mut String, - ) -> anyhow::Result<()> { - tracing::info!("reading endpoints/branches/projects from redis"); - self.batch_read( - con, - StreamReadOptions::default().count(self.config.initial_batch_size), - last_id, - true, - ) - .await?; - tracing::info!("ready to filter user requests"); - self.ready.store(true, Ordering::Release); - self.batch_read( - con, - StreamReadOptions::default() - .count(self.config.initial_batch_size) - .block(self.config.xread_timeout.as_millis() as usize), - last_id, - false, - ) - .await - } - fn parse_key_value(key: &str, value: &Value) -> anyhow::Result<(ControlPlaneEventKey, String)> { - Ok((serde_json::from_str(key)?, String::from_redis_value(value)?)) - } - async fn batch_read( - &self, - conn: &mut ConnectionWithCredentialsProvider, - opts: StreamReadOptions, - last_id: &mut String, - return_when_finish: bool, - ) -> anyhow::Result<()> { - let mut total: usize = 0; - loop { - let mut res: StreamReadReply = conn - .xread_options(&[&self.config.stream_name], &[last_id.as_str()], &opts) - .await?; - if res.keys.len() != 1 { - anyhow::bail!("Cannot read from redis stream {}", self.config.stream_name); - } - - let res = res.keys.pop().expect("Checked length above"); - - if return_when_finish && res.ids.len() <= self.config.default_batch_size { - break; - } - for x in res.ids { - total += 1; - for (k, v) in x.map { - let (key, value) = match Self::parse_key_value(&k, &v) { - Ok(x) => x, - Err(e) => { - REDIS_BROKEN_MESSAGES - .with_label_values(&[&self.config.stream_name]) - .inc(); - tracing::error!("error parsing key-value {k}-{v:?}: {e:?}"); - continue; - } - }; - self.insert_event(key, value); - } - if total.is_power_of_two() { - tracing::debug!("endpoints read {}", total); - } - *last_id = x.id; - } - } - tracing::info!("read {} endpoints/branches/projects from redis", total); - Ok(()) - } -} diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 3bdfb3cfad..fc490c7348 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -313,75 +313,6 @@ impl CertResolver { } } -#[derive(Debug)] -pub struct EndpointCacheConfig { - /// Batch size to receive all endpoints on the startup. - pub initial_batch_size: usize, - /// Batch size to receive endpoints. - pub default_batch_size: usize, - /// Timeouts for the stream read operation. - pub xread_timeout: Duration, - /// Stream name to read from. - pub stream_name: String, - /// Limiter info (to distinguish when to enable cache). - pub limiter_info: Vec, - /// Disable cache. - /// If true, cache is ignored, but reports all statistics. 
- pub disable_cache: bool, -} - -impl EndpointCacheConfig { - /// Default options for [`crate::console::provider::NodeInfoCache`]. - /// Notice that by default the limiter is empty, which means that cache is disabled. - pub const CACHE_DEFAULT_OPTIONS: &'static str = - "initial_batch_size=1000,default_batch_size=10,xread_timeout=5m,stream_name=controlPlane,disable_cache=true,limiter_info=1000@1s"; - - /// Parse cache options passed via cmdline. - /// Example: [`Self::CACHE_DEFAULT_OPTIONS`]. - fn parse(options: &str) -> anyhow::Result { - let mut initial_batch_size = None; - let mut default_batch_size = None; - let mut xread_timeout = None; - let mut stream_name = None; - let mut limiter_info = vec![]; - let mut disable_cache = false; - - for option in options.split(',') { - let (key, value) = option - .split_once('=') - .with_context(|| format!("bad key-value pair: {option}"))?; - - match key { - "initial_batch_size" => initial_batch_size = Some(value.parse()?), - "default_batch_size" => default_batch_size = Some(value.parse()?), - "xread_timeout" => xread_timeout = Some(humantime::parse_duration(value)?), - "stream_name" => stream_name = Some(value.to_string()), - "limiter_info" => limiter_info.push(RateBucketInfo::from_str(value)?), - "disable_cache" => disable_cache = value.parse()?, - unknown => bail!("unknown key: {unknown}"), - } - } - RateBucketInfo::validate(&mut limiter_info)?; - - Ok(Self { - initial_batch_size: initial_batch_size.context("missing `initial_batch_size`")?, - default_batch_size: default_batch_size.context("missing `default_batch_size`")?, - xread_timeout: xread_timeout.context("missing `xread_timeout`")?, - stream_name: stream_name.context("missing `stream_name`")?, - disable_cache, - limiter_info, - }) - } -} - -impl FromStr for EndpointCacheConfig { - type Err = anyhow::Error; - - fn from_str(options: &str) -> Result { - let error = || format!("failed to parse endpoint cache options '{options}'"); - Self::parse(options).with_context(error) - } -} #[derive(Debug)] pub struct MetricBackupCollectionConfig { pub interval: Duration, diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index ee2bc866ab..f7d621fb12 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -8,15 +8,15 @@ use crate::{ backend::{ComputeCredentialKeys, ComputeUserInfo}, IpPattern, }, - cache::{endpoints::EndpointsCache, project_info::ProjectInfoCacheImpl, Cached, TimedLru}, + cache::{project_info::ProjectInfoCacheImpl, Cached, TimedLru}, compute, - config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}, + config::{CacheOptions, ProjectInfoCacheOptions}, context::RequestMonitoring, intern::ProjectIdInt, scram, EndpointCacheKey, }; use dashmap::DashMap; -use std::{convert::Infallible, sync::Arc, time::Duration}; +use std::{sync::Arc, time::Duration}; use tokio::sync::{OwnedSemaphorePermit, Semaphore}; use tokio::time::Instant; use tracing::info; @@ -416,15 +416,12 @@ pub struct ApiCaches { pub node_info: NodeInfoCache, /// Cache which stores project_id -> endpoint_ids mapping. pub project_info: Arc, - /// List of all valid endpoints. 
- pub endpoints_cache: Arc, } impl ApiCaches { pub fn new( wake_compute_cache_config: CacheOptions, project_info_cache_config: ProjectInfoCacheOptions, - endpoint_cache_config: EndpointCacheConfig, ) -> Self { Self { node_info: NodeInfoCache::new( @@ -434,7 +431,6 @@ impl ApiCaches { true, ), project_info: Arc::new(ProjectInfoCacheImpl::new(project_info_cache_config)), - endpoints_cache: Arc::new(EndpointsCache::new(endpoint_cache_config)), } } } @@ -445,7 +441,6 @@ pub struct ApiLocks { node_locks: DashMap>, permits: usize, timeout: Duration, - epoch: std::time::Duration, registered: prometheus::IntCounter, unregistered: prometheus::IntCounter, reclamation_lag: prometheus::Histogram, @@ -458,7 +453,6 @@ impl ApiLocks { permits: usize, shards: usize, timeout: Duration, - epoch: std::time::Duration, ) -> prometheus::Result { let registered = prometheus::IntCounter::with_opts( prometheus::Opts::new( @@ -503,7 +497,6 @@ impl ApiLocks { node_locks: DashMap::with_shard_amount(shards), permits, timeout, - epoch, lock_acquire_lag, registered, unregistered, @@ -543,9 +536,12 @@ impl ApiLocks { }) } - pub async fn garbage_collect_worker(&self) -> anyhow::Result { - let mut interval = - tokio::time::interval(self.epoch / (self.node_locks.shards().len()) as u32); + pub async fn garbage_collect_worker(&self, epoch: std::time::Duration) { + if self.permits == 0 { + return; + } + + let mut interval = tokio::time::interval(epoch / (self.node_locks.shards().len()) as u32); loop { for (i, shard) in self.node_locks.shards().iter().enumerate() { interval.tick().await; diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 68b91447f9..1a3e2ca795 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -8,7 +8,6 @@ use super::{ }; use crate::{ auth::backend::ComputeUserInfo, compute, console::messages::ColdStartInfo, http, scram, - Normalize, }; use crate::{ cache::Cached, @@ -24,7 +23,7 @@ use tracing::{error, info, info_span, warn, Instrument}; pub struct Api { endpoint: http::Endpoint, pub caches: &'static ApiCaches, - pub locks: &'static ApiLocks, + locks: &'static ApiLocks, jwt: String, } @@ -56,15 +55,6 @@ impl Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { - if !self - .caches - .endpoints_cache - .is_valid(ctx, &user_info.endpoint.normalize()) - .await - { - info!("endpoint is not valid, skipping the request"); - return Ok(AuthInfo::default()); - } let request_id = ctx.session_id.to_string(); let application_name = ctx.console_application_name(); async { @@ -91,9 +81,7 @@ impl Api { Ok(body) => body, // Error 404 is special: it's ok not to have a secret. 
Err(e) => match e.http_status_code() { - Some(http::StatusCode::NOT_FOUND) => { - return Ok(AuthInfo::default()); - } + Some(http::StatusCode::NOT_FOUND) => return Ok(AuthInfo::default()), _otherwise => return Err(e.into()), }, }; @@ -186,27 +174,23 @@ impl super::Api for Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { - let normalized_ep = &user_info.endpoint.normalize(); + let ep = &user_info.endpoint; let user = &user_info.user; - if let Some(role_secret) = self - .caches - .project_info - .get_role_secret(normalized_ep, user) - { + if let Some(role_secret) = self.caches.project_info.get_role_secret(ep, user) { return Ok(role_secret); } let auth_info = self.do_get_auth_info(ctx, user_info).await?; if let Some(project_id) = auth_info.project_id { - let normalized_ep_int = normalized_ep.into(); + let ep_int = ep.into(); self.caches.project_info.insert_role_secret( project_id, - normalized_ep_int, + ep_int, user.into(), auth_info.secret.clone(), ); self.caches.project_info.insert_allowed_ips( project_id, - normalized_ep_int, + ep_int, Arc::new(auth_info.allowed_ips), ); ctx.set_project_id(project_id); @@ -220,8 +204,8 @@ impl super::Api for Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { - let normalized_ep = &user_info.endpoint.normalize(); - if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(normalized_ep) { + let ep = &user_info.endpoint; + if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(ep) { ALLOWED_IPS_BY_CACHE_OUTCOME .with_label_values(&["hit"]) .inc(); @@ -234,18 +218,16 @@ impl super::Api for Api { let allowed_ips = Arc::new(auth_info.allowed_ips); let user = &user_info.user; if let Some(project_id) = auth_info.project_id { - let normalized_ep_int = normalized_ep.into(); + let ep_int = ep.into(); self.caches.project_info.insert_role_secret( project_id, - normalized_ep_int, + ep_int, user.into(), auth_info.secret.clone(), ); - self.caches.project_info.insert_allowed_ips( - project_id, - normalized_ep_int, - allowed_ips.clone(), - ); + self.caches + .project_info + .insert_allowed_ips(project_id, ep_int, allowed_ips.clone()); ctx.set_project_id(project_id); } Ok(( diff --git a/proxy/src/context.rs b/proxy/src/context.rs index 85544f1d65..fec95f4722 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -12,9 +12,7 @@ use crate::{ console::messages::{ColdStartInfo, MetricsAuxInfo}, error::ErrorKind, intern::{BranchIdInt, ProjectIdInt}, - metrics::{ - bool_to_str, LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND, NUM_INVALID_ENDPOINTS, - }, + metrics::{LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND}, DbName, EndpointId, RoleName, }; @@ -52,8 +50,6 @@ pub struct RequestMonitoring { // This sender is here to keep the request monitoring channel open while requests are taking place. sender: Option>, pub latency_timer: LatencyTimer, - // Whether proxy decided that it's not a valid endpoint end rejected it before going to cplane. 
- rejected: bool, } #[derive(Clone, Debug)] @@ -97,7 +93,6 @@ impl RequestMonitoring { error_kind: None, auth_method: None, success: false, - rejected: false, cold_start_info: ColdStartInfo::Unknown, sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()), @@ -118,10 +113,6 @@ impl RequestMonitoring { ) } - pub fn set_rejected(&mut self, rejected: bool) { - self.rejected = rejected; - } - pub fn set_cold_start_info(&mut self, info: ColdStartInfo) { self.cold_start_info = info; self.latency_timer.cold_start_info(info); @@ -187,10 +178,6 @@ impl RequestMonitoring { impl Drop for RequestMonitoring { fn drop(&mut self) { - let outcome = if self.success { "success" } else { "failure" }; - NUM_INVALID_ENDPOINTS - .with_label_values(&[self.protocol, bool_to_str(self.rejected), outcome]) - .inc(); if let Some(tx) = self.sender.take() { let _: Result<(), _> = tx.send(RequestData::from(&*self)); } diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs index e38135dd22..a6519bdff9 100644 --- a/proxy/src/intern.rs +++ b/proxy/src/intern.rs @@ -160,11 +160,6 @@ impl From<&EndpointId> for EndpointIdInt { EndpointIdTag::get_interner().get_or_intern(value) } } -impl From for EndpointIdInt { - fn from(value: EndpointId) -> Self { - EndpointIdTag::get_interner().get_or_intern(&value) - } -} #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub struct BranchIdTag; @@ -180,11 +175,6 @@ impl From<&BranchId> for BranchIdInt { BranchIdTag::get_interner().get_or_intern(value) } } -impl From for BranchIdInt { - fn from(value: BranchId) -> Self { - BranchIdTag::get_interner().get_or_intern(&value) - } -} #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub struct ProjectIdTag; @@ -200,11 +190,6 @@ impl From<&ProjectId> for ProjectIdInt { ProjectIdTag::get_interner().get_or_intern(value) } } -impl From for ProjectIdInt { - fn from(value: ProjectId) -> Self { - ProjectIdTag::get_interner().get_or_intern(&value) - } -} #[cfg(test)] mod tests { diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index 3f6d985fe8..da7c7f3ed2 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -127,24 +127,6 @@ macro_rules! smol_str_wrapper { }; } -const POOLER_SUFFIX: &str = "-pooler"; - -pub trait Normalize { - fn normalize(&self) -> Self; -} - -impl + From> Normalize for S { - fn normalize(&self) -> Self { - if self.as_ref().ends_with(POOLER_SUFFIX) { - let mut s = self.as_ref().to_string(); - s.truncate(s.len() - POOLER_SUFFIX.len()); - s.into() - } else { - self.clone() - } - } -} - // 90% of role name strings are 20 characters or less. smol_str_wrapper!(RoleName); // 50% of endpoint strings are 23 characters or less. @@ -158,22 +140,3 @@ smol_str_wrapper!(ProjectId); smol_str_wrapper!(EndpointCacheKey); smol_str_wrapper!(DbName); - -// Endpoints are a bit tricky. Rare they might be branches or projects. 
-impl EndpointId { - pub fn is_endpoint(&self) -> bool { - self.0.starts_with("ep-") - } - pub fn is_branch(&self) -> bool { - self.0.starts_with("br-") - } - pub fn is_project(&self) -> bool { - !self.is_endpoint() && !self.is_branch() - } - pub fn as_branch(&self) -> BranchId { - BranchId(self.0.clone()) - } - pub fn as_project(&self) -> ProjectId { - ProjectId(self.0.clone()) - } -} diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index f299313e0a..59ee899c08 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -169,18 +169,6 @@ pub static NUM_CANCELLATION_REQUESTS: Lazy = Lazy::new(|| { .unwrap() }); -pub static NUM_INVALID_ENDPOINTS: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_invalid_endpoints_total", - "Number of invalid endpoints (per protocol, per rejected).", - // http/ws/tcp, true/false, success/failure - // TODO(anna): the last dimension is just a proxy to what we actually want to measure. - // We need to measure whether the endpoint was found by cplane or not. - &["protocol", "rejected", "outcome"], - ) - .unwrap() -}); - pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT: &str = "from_client"; pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS: &str = "from_redis"; diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 166e761a4e..6051c0a812 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -20,7 +20,7 @@ use crate::{ proxy::handshake::{handshake, HandshakeData}, rate_limiter::EndpointRateLimiter, stream::{PqStream, Stream}, - EndpointCacheKey, Normalize, + EndpointCacheKey, }; use futures::TryFutureExt; use itertools::Itertools; @@ -280,7 +280,7 @@ pub async fn handle_client( // check rate limit if let Some(ep) = user_info.get_endpoint() { - if !endpoint_rate_limiter.check(ep.normalize(), 1) { + if !endpoint_rate_limiter.check(ep, 1) { return stream .throw_error(auth::AuthError::too_many_connections()) .await?; diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs index a3b83e5e50..13dffffca0 100644 --- a/proxy/src/rate_limiter.rs +++ b/proxy/src/rate_limiter.rs @@ -4,4 +4,4 @@ mod limiter; pub use aimd::Aimd; pub use limit_algorithm::{AimdConfig, Fixed, RateLimitAlgorithm, RateLimiterConfig}; pub use limiter::Limiter; -pub use limiter::{AuthRateLimiter, EndpointRateLimiter, GlobalRateLimiter, RateBucketInfo}; +pub use limiter::{AuthRateLimiter, EndpointRateLimiter, RateBucketInfo, RedisRateLimiter}; diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index 0503deb311..f590896dd9 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -24,13 +24,13 @@ use super::{ RateLimiterConfig, }; -pub struct GlobalRateLimiter { +pub struct RedisRateLimiter { data: Vec, - info: Vec, + info: &'static [RateBucketInfo], } -impl GlobalRateLimiter { - pub fn new(info: Vec) -> Self { +impl RedisRateLimiter { + pub fn new(info: &'static [RateBucketInfo]) -> Self { Self { data: vec![ RateBucket { @@ -50,7 +50,7 @@ impl GlobalRateLimiter { let should_allow_request = self .data .iter_mut() - .zip(&self.info) + .zip(self.info) .all(|(bucket, info)| bucket.should_allow_request(info, now, 1)); if should_allow_request { diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs index 7baf104374..422789813c 100644 --- a/proxy/src/redis/cancellation_publisher.rs +++ b/proxy/src/redis/cancellation_publisher.rs @@ -5,7 +5,7 @@ use redis::AsyncCommands; use tokio::sync::Mutex; use uuid::Uuid; -use 
crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo}; +use crate::rate_limiter::{RateBucketInfo, RedisRateLimiter}; use super::{ connection_with_credentials_provider::ConnectionWithCredentialsProvider, @@ -80,7 +80,7 @@ impl CancellationPublisher for Arc> { pub struct RedisPublisherClient { client: ConnectionWithCredentialsProvider, region_id: String, - limiter: GlobalRateLimiter, + limiter: RedisRateLimiter, } impl RedisPublisherClient { @@ -92,7 +92,7 @@ impl RedisPublisherClient { Ok(Self { client, region_id, - limiter: GlobalRateLimiter::new(info.into()), + limiter: RedisRateLimiter::new(info), }) } diff --git a/test_runner/regress/test_proxy_rate_limiter.py b/test_runner/regress/test_proxy_rate_limiter.py new file mode 100644 index 0000000000..f39f0cad07 --- /dev/null +++ b/test_runner/regress/test_proxy_rate_limiter.py @@ -0,0 +1,84 @@ +import asyncio +import time +from pathlib import Path +from typing import Iterator + +import pytest +from fixtures.neon_fixtures import ( + PSQL, + NeonProxy, +) +from fixtures.port_distributor import PortDistributor +from pytest_httpserver import HTTPServer +from werkzeug.wrappers.response import Response + + +def waiting_handler(status_code: int) -> Response: + # wait more than timeout to make sure that both (two) connections are open. + # It would be better to use a barrier here, but I don't know how to do that together with pytest-httpserver. + time.sleep(2) + return Response(status=status_code) + + +@pytest.fixture(scope="function") +def proxy_with_rate_limit( + port_distributor: PortDistributor, + neon_binpath: Path, + httpserver_listen_address, + test_output_dir: Path, +) -> Iterator[NeonProxy]: + """Neon proxy that routes directly to vanilla postgres.""" + + proxy_port = port_distributor.get_port() + mgmt_port = port_distributor.get_port() + http_port = port_distributor.get_port() + external_http_port = port_distributor.get_port() + (host, port) = httpserver_listen_address + endpoint = f"http://{host}:{port}/billing/api/v1/usage_events" + + with NeonProxy( + neon_binpath=neon_binpath, + test_output_dir=test_output_dir, + proxy_port=proxy_port, + http_port=http_port, + mgmt_port=mgmt_port, + external_http_port=external_http_port, + auth_backend=NeonProxy.Console(endpoint, fixed_rate_limit=5), + ) as proxy: + proxy.start() + yield proxy + + +@pytest.mark.asyncio +async def test_proxy_rate_limit( + httpserver: HTTPServer, + proxy_with_rate_limit: NeonProxy, +): + uri = "/billing/api/v1/usage_events/proxy_get_role_secret" + # mock control plane service + httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( + lambda _: Response(status=200) + ) + httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( + lambda _: waiting_handler(429) + ) + httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( + lambda _: waiting_handler(500) + ) + + psql = PSQL(host=proxy_with_rate_limit.host, port=proxy_with_rate_limit.proxy_port) + f = await psql.run("select 42;") + await proxy_with_rate_limit.find_auth_link(uri, f) + # Limit should be 2. + + # Run two queries in parallel. + f1, f2 = await asyncio.gather(psql.run("select 42;"), psql.run("select 42;")) + await proxy_with_rate_limit.find_auth_link(uri, f1) + await proxy_with_rate_limit.find_auth_link(uri, f2) + + # Now limit should be 0. + f = await psql.run("select 42;") + await proxy_with_rate_limit.find_auth_link(uri, f) + + # There last query shouldn't reach the http-server. 
+ assert httpserver.assertions == [] From f86845f64b9576d05b06de9c33dec3c6be19c47c Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Wed, 10 Apr 2024 06:13:48 -0700 Subject: [PATCH 0551/1571] compute_ctl: Auto-set dynamic_shared_memory_type (#7348) Part of neondatabase/cloud#12047. The basic idea is that for our VMs, we want to enable swap and disable Linux memory overcommit. Alongside these, we should set postgres' dynamic_shared_memory_type to mmap, but we want to avoid setting it to mmap if swap is not enabled. Implementing this in the control plane would be fiddly, but it's relatively straightforward to add to compute_ctl. --- compute_tools/src/config.rs | 25 +++++++++++++++++++++++-- compute_tools/src/pg_helpers.rs | 2 +- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index f1fd8637f5..89c866b20c 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -6,8 +6,8 @@ use std::path::Path; use anyhow::Result; use crate::pg_helpers::escape_conf_value; -use crate::pg_helpers::PgOptionsSerialize; -use compute_api::spec::{ComputeMode, ComputeSpec}; +use crate::pg_helpers::{GenericOptionExt, PgOptionsSerialize}; +use compute_api::spec::{ComputeMode, ComputeSpec, GenericOption}; /// Check that `line` is inside a text file and put it there if it is not. /// Create file if it doesn't exist. @@ -92,6 +92,27 @@ pub fn write_postgres_conf( } } + if cfg!(target_os = "linux") { + // Check /proc/sys/vm/overcommit_memory -- if it equals 2 (i.e. linux memory overcommit is + // disabled), then the control plane has enabled swap and we should set + // dynamic_shared_memory_type = 'mmap'. + // + // This is (maybe?) temporary - for more, see https://github.com/neondatabase/cloud/issues/12047. + let overcommit_memory_contents = std::fs::read_to_string("/proc/sys/vm/overcommit_memory") + // ignore any errors - they may be expected to occur under certain situations (e.g. when + // not running in Linux). + .unwrap_or_else(|_| String::new()); + if overcommit_memory_contents.trim() == "2" { + let opt = GenericOption { + name: "dynamic_shared_memory_type".to_owned(), + value: Some("mmap".to_owned()), + vartype: "enum".to_owned(), + }; + + write!(file, "{}", opt.to_pg_setting())?; + } + } + // If there are any extra options in the 'settings' field, append those if spec.cluster.settings.is_some() { writeln!(file, "# Managed by compute_ctl: begin")?; diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 5deb50d6b7..fa0822748b 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -44,7 +44,7 @@ pub fn escape_conf_value(s: &str) -> String { format!("'{}'", res) } -trait GenericOptionExt { +pub trait GenericOptionExt { fn to_pg_option(&self) -> String; fn to_pg_setting(&self) -> String; } From d47e4a2a4148ff0b6467d5bda504401b90bb00da Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 11 Apr 2024 07:47:45 +0300 Subject: [PATCH 0552/1571] Remember last written LSN when it is first requested (#7343) ## Problem See https://neondb.slack.com/archives/C03QLRH7PPD/p1712529369520409 In case of statements CREATE TABLE AS SELECT... or INSERT FROM SELECT... we are fetching data from source table and storing it in destination table. It cause problems with prefetch last-written-lsn is known for the pages of source table (which for example happens after compute restart). 
In this case we get get global value of last-written-lsn which is changed frequently as far as we are writing pages of destination table. As a result request-isn for the prefetch and request-let when this page is actually needed are different and we got exported prefetch request. So it actually disarms prefetch. ## Summary of changes Proposed simple patch stores last-written LSN for the page when it is not found. So next time we will request last-written LSN for this page, we will get the same value (certainly if the page was not changed). ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index a7b4c66156..d9149dc59a 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit a7b4c66156bce00afa60e5592d4284ba9e40b4cf +Subproject commit d9149dc59abcbeeb26293707509aef51752db28f diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 64b8c7bccc..85d809c124 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 64b8c7bccc6b77e04795e2d4cf6ad82dc8d987ed +Subproject commit 85d809c124a898847a97d66a211f7d5ef4f8e0cb diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 3946b2e2ea..261497dd63 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 3946b2e2ea71d07af092099cb5bcae76a69b90d6 +Subproject commit 261497dd63ace434045058b1453bcbaaa83f23e5 diff --git a/vendor/revisions.json b/vendor/revisions.json index 75dc095168..dfc0aa04c3 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "postgres-v16": "3946b2e2ea71d07af092099cb5bcae76a69b90d6", - "postgres-v15": "64b8c7bccc6b77e04795e2d4cf6ad82dc8d987ed", - "postgres-v14": "a7b4c66156bce00afa60e5592d4284ba9e40b4cf" + "postgres-v16": "261497dd63ace434045058b1453bcbaaa83f23e5", + "postgres-v15": "85d809c124a898847a97d66a211f7d5ef4f8e0cb", + "postgres-v14": "d9149dc59abcbeeb26293707509aef51752db28f" } From db72543f4d4d3300d48375db177c8ee598ed4049 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 11 Apr 2024 12:31:27 +0200 Subject: [PATCH 0553/1571] Reenable test_forward_compatibility (#7358) It was disabled due to https://github.com/neondatabase/neon/pull/6530 breaking forward compatiblity. 
Now that we have deployed it to production, we can reenable the test --- test_runner/regress/test_compatibility.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 208263a22a..ddad98a5fa 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -192,9 +192,6 @@ def test_backward_compatibility( assert not breaking_changes_allowed, "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage" -# Forward compatibility is broken due to https://github.com/neondatabase/neon/pull/6530 -# The test is disabled until the next release deployment -@pytest.mark.xfail @check_ondisk_data_compatibility_if_enabled @pytest.mark.xdist_group("compatibility") @pytest.mark.order(after="test_create_snapshot") From 1628b5b145b335e4a26fcdb1ccdf4263ab8745cf Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 11 Apr 2024 17:14:09 +0300 Subject: [PATCH 0554/1571] compute hook: use shared client with explicit timeout (#7359) ## Problem We are seeing some mysterious long waits when sending requests. ## Summary of changes - To eliminate risk that we are incurring some unreasonable overheads from setup, e.g. DNS, use a single Client (internally a pool) instead of repeatedly constructing a fresh one. - To make it clearer where a timeout is occurring, apply a 10 second timeout to requests as we send them. --- storage_controller/src/compute_hook.rs | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index eb0c4472e4..1ed8998713 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -17,6 +17,8 @@ use crate::service::Config; const SLOWDOWN_DELAY: Duration = Duration::from_secs(5); +const NOTIFY_REQUEST_TIMEOUT: Duration = Duration::from_secs(10); + pub(crate) const API_CONCURRENCY: usize = 32; struct UnshardedComputeHookTenant { @@ -242,6 +244,10 @@ pub(super) struct ComputeHook { // This lock is only used in testing enviroments, to serialize calls into neon_lock neon_local_lock: tokio::sync::Mutex<()>, + + // We share a client across all notifications to enable connection re-use etc when + // sending large numbers of notifications + client: reqwest::Client, } impl ComputeHook { @@ -251,12 +257,18 @@ impl ComputeHook { .clone() .map(|jwt| format!("Bearer {}", jwt)); + let client = reqwest::ClientBuilder::new() + .timeout(NOTIFY_REQUEST_TIMEOUT) + .build() + .expect("Failed to construct HTTP client"); + Self { state: Default::default(), config, authorization_header, neon_local_lock: Default::default(), api_concurrency: tokio::sync::Semaphore::new(API_CONCURRENCY), + client, } } @@ -310,12 +322,11 @@ impl ComputeHook { async fn do_notify_iteration( &self, - client: &reqwest::Client, url: &String, reconfigure_request: &ComputeHookNotifyRequest, cancel: &CancellationToken, ) -> Result<(), NotifyError> { - let req = client.request(Method::PUT, url); + let req = self.client.request(Method::PUT, url); let req = if let Some(value) = &self.authorization_header { req.header(reqwest::header::AUTHORIZATION, value) } else { @@ -381,8 +392,6 @@ impl ComputeHook { reconfigure_request: &ComputeHookNotifyRequest, cancel: &CancellationToken, ) -> Result<(), NotifyError> { - let client = reqwest::Client::new(); - // We hold these semaphore units across all retries, rather than only across 
each // HTTP request: this is to preserve fairness and avoid a situation where a retry might // time out waiting for a semaphore. @@ -394,7 +403,7 @@ impl ComputeHook { .map_err(|_| NotifyError::ShuttingDown)?; backoff::retry( - || self.do_notify_iteration(&client, url, reconfigure_request, cancel), + || self.do_notify_iteration(url, reconfigure_request, cancel), |e| { matches!( e, From 99a56b56064264fd73a7dc3ce5606469725cc4cb Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 11 Apr 2024 15:23:08 +0100 Subject: [PATCH 0555/1571] CI(build-build-tools-image): Do not cancel concurrent workflows (#7226) ## Problem `build-build-tools-image` workflow is designed to be run only in one example per the whole repository. Currently, the job gets cancelled if a newer one is scheduled, here's an example: https://github.com/neondatabase/neon/actions/runs/8419610607 ## Summary of changes - Explicitly set `cancel-in-progress: false` for all jobs that aren't supposed to be cancelled --- .github/workflows/approved-for-ci-run.yml | 1 + .github/workflows/build-build-tools-image.yml | 1 + .github/workflows/pin-build-tools-image.yml | 1 + 3 files changed, 3 insertions(+) diff --git a/.github/workflows/approved-for-ci-run.yml b/.github/workflows/approved-for-ci-run.yml index 69c48d86b9..ab616d17e2 100644 --- a/.github/workflows/approved-for-ci-run.yml +++ b/.github/workflows/approved-for-ci-run.yml @@ -18,6 +18,7 @@ on: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number }} + cancel-in-progress: false env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index 251423e701..c527cef1ac 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -21,6 +21,7 @@ defaults: concurrency: group: build-build-tools-image-${{ inputs.image-tag }} + cancel-in-progress: false # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. permissions: {} diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml index c941692066..d495a158e8 100644 --- a/.github/workflows/pin-build-tools-image.yml +++ b/.github/workflows/pin-build-tools-image.yml @@ -20,6 +20,7 @@ defaults: concurrency: group: pin-build-tools-image-${{ inputs.from-tag }} + cancel-in-progress: false permissions: {} From 5299f917d6d2be5d87b56d236342d48682a5c9f4 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 11 Apr 2024 17:26:01 +0100 Subject: [PATCH 0556/1571] proxy: replace prometheus with measured (#6717) ## Problem My benchmarks show that prometheus is not very good. https://github.com/conradludgate/measured We're already using it in storage_controller and it seems to be working well. ## Summary of changes Replace prometheus with my new measured crate in proxy only. Apologies for the large diff. I tried to keep it as minimal as I could. The label types add a bit of boiler plate (but reduce the chance we mistype the labels), and some of our custom metrics like CounterPair and HLL needed to be rewritten. 
 Cargo.lock | 13 +-
 Cargo.toml | 4 +-
 libs/metrics/src/hll.rs | 395 ++++------------
 libs/metrics/src/lib.rs | 172 ++++++-
 proxy/Cargo.toml | 1 +
 proxy/src/auth/backend.rs | 10 +-
 proxy/src/auth/credentials.rs | 21 +-
 proxy/src/bin/pg_sni_router.rs | 7 +-
 proxy/src/bin/proxy.rs | 40 +-
 proxy/src/cancellation.rs | 34 +-
 proxy/src/compute.rs | 9 +-
 proxy/src/console/messages.rs | 5 +-
 proxy/src/console/provider.rs | 63 +--
 proxy/src/console/provider/neon.rs | 32 +-
 proxy/src/context.rs | 24 +-
 proxy/src/context/parquet.rs | 2 +-
 proxy/src/error.rs | 9 +-
 proxy/src/http.rs | 21 +-
 proxy/src/http/health_server.rs | 89 +++-
 proxy/src/jemalloc.rs | 178 +++----
 proxy/src/metrics.rs | 658 +++++++++++++++-----------
 proxy/src/proxy.rs | 30 +-
 proxy/src/proxy/connect_compute.rs | 8 +-
 proxy/src/proxy/passthrough.rs | 16 +-
 proxy/src/proxy/wake_compute.rs | 31 +-
 proxy/src/rate_limiter/limiter.rs | 30 +-
 proxy/src/redis/notifications.rs | 10 +-
 proxy/src/serverless.rs | 28 +-
 proxy/src/serverless/conn_pool.rs | 51 +-
 proxy/src/serverless/sql_over_http.rs | 27 +-
 proxy/src/serverless/websocket.rs | 9 +-
 proxy/src/stream.rs | 4 +-
 32 files changed, 1127 insertions(+), 904 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index bdf2b08c5c..6faf4b72f0 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2932,9 +2932,9 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"

 [[package]]
 name = "measured"
-version = "0.0.20"
+version = "0.0.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3cbf033874bea03565f2449572c8640ca37ec26300455faf36001f24755da452"
+checksum = "652bc741286361c06de8cb4d89b21a6437f120c508c51713663589eeb9928ac5"
 dependencies = [
  "bytes",
  "crossbeam-utils",
@@ -2950,9 +2950,9 @@ dependencies = [

 [[package]]
 name = "measured-derive"
-version = "0.0.20"
+version = "0.0.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "be9e29b682b38f8af2a89f960455054ab1a9f5a06822f6f3500637ad9fa57def"
+checksum = "6ea497f33e1e856a376c32ad916f69a0bd3c597db1f912a399f842b01a4a685d"
 dependencies = [
  "heck 0.5.0",
  "proc-macro2",
@@ -2962,9 +2962,9 @@ dependencies = [

 [[package]]
 name = "measured-process"
-version = "0.0.20"
+version = "0.0.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a20849acdd04c5d6a88f565559044546904648a1842a2937cfff0b48b4ca7ef2"
+checksum = "b364ccb66937a814b6b2ad751d1a2f7a9d5a78c761144036825fb36bb0771000"
 dependencies = [
  "libc",
  "measured",
@@ -4322,6 +4322,7 @@ dependencies = [
  "itertools",
  "lasso",
  "md5",
+ "measured",
  "metrics",
  "native-tls",
  "once_cell",
diff --git a/Cargo.toml b/Cargo.toml
index feea17ab05..8310d2d522 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -107,8 +107,8 @@ lasso = "0.7"
 leaky-bucket = "1.0.1"
 libc = "0.2"
 md5 = "0.7.0"
-measured = { version = "0.0.20", features=["lasso"] }
-measured-process = { version = "0.0.20" }
+measured = { version = "0.0.21", features=["lasso"] }
+measured-process = { version = "0.0.21" }
 memoffset = "0.8"
 native-tls = "0.2"
 nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
diff --git a/libs/metrics/src/hll.rs b/libs/metrics/src/hll.rs
index dfb4461ce9..f53511ab5c 100644
--- a/libs/metrics/src/hll.rs
+++ b/libs/metrics/src/hll.rs
@@ -7,14 +7,19 @@
 //! use significantly less memory than this, but can only approximate the cardinality.

 use std::{
-    collections::HashMap,
-    hash::{BuildHasher, BuildHasherDefault, Hash, Hasher},
-    sync::{atomic::AtomicU8, Arc, RwLock},
+    hash::{BuildHasher, BuildHasherDefault, Hash},
+    sync::atomic::AtomicU8,
 };

-use prometheus::{
-    core::{self, Describer},
-    proto, Opts,
+use measured::{
+    label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor},
+    metric::{
+        group::{Encoding, MetricValue},
+        name::MetricNameEncoder,
+        Metric, MetricType, MetricVec,
+    },
+    text::TextEncoder,
+    LabelGroup,
 };

 use twox_hash::xxh3;
@@ -93,203 +98,25 @@ macro_rules! register_hll {
 /// ```
 ///
 /// See for estimates on alpha
-#[derive(Clone)]
-pub struct HyperLogLogVec<const N: usize> {
-    core: Arc<HyperLogLogVecCore<N>>,
+pub type HyperLogLogVec<const N: usize, L> = MetricVec<HyperLogLogState<N>, L>;
+pub type HyperLogLog<const N: usize> = Metric<HyperLogLogState<N>>;
+
+pub struct HyperLogLogState<const N: usize> {
+    shards: [AtomicU8; N],
 }
-
-struct HyperLogLogVecCore<const N: usize> {
-    pub children: RwLock, BuildHasherDefault>>,
-    pub desc: core::Desc,
-    pub opts: Opts,
-}
-
-impl<const N: usize> core::Collector for HyperLogLogVec<N> {
-    fn desc(&self) -> Vec<&core::Desc> {
-        vec![&self.core.desc]
-    }
-
-    fn collect(&self) -> Vec<proto::MetricFamily> {
-        let mut m = proto::MetricFamily::default();
-        m.set_name(self.core.desc.fq_name.clone());
-        m.set_help(self.core.desc.help.clone());
-        m.set_field_type(proto::MetricType::GAUGE);
-
-        let mut metrics = Vec::new();
-        for child in self.core.children.read().unwrap().values() {
-            child.core.collect_into(&mut metrics);
-        }
-        m.set_metric(metrics);
-
-        vec![m]
+impl<const N: usize> Default for HyperLogLogState<N> {
+    fn default() -> Self {
+        #[allow(clippy::declare_interior_mutable_const)]
+        const ZERO: AtomicU8 = AtomicU8::new(0);
+        Self { shards: [ZERO; N] }
     }
 }

-impl<const N: usize> HyperLogLogVec<N> {
-    /// Create a new [`HyperLogLogVec`] based on the provided
-    /// [`Opts`] and partitioned by the given label names. At least one label name must be
-    /// provided.
-    pub fn new(opts: Opts, label_names: &[&str]) -> prometheus::Result<Self> {
-        assert!(N.is_power_of_two());
-        let variable_names = label_names.iter().map(|s| (*s).to_owned()).collect();
-        let opts = opts.variable_labels(variable_names);
-
-        let desc = opts.describe()?;
-        let v = HyperLogLogVecCore {
-            children: RwLock::new(HashMap::default()),
-            desc,
-            opts,
-        };
-
-        Ok(Self { core: Arc::new(v) })
-    }
-
-    /// `get_metric_with_label_values` returns the [`HyperLogLog